Index: user/attilio/vmobj-rwlock/contrib/llvm/tools/clang/lib/Driver/Tools.cpp
===================================================================
--- user/attilio/vmobj-rwlock/contrib/llvm/tools/clang/lib/Driver/Tools.cpp	(revision 247191)
+++ user/attilio/vmobj-rwlock/contrib/llvm/tools/clang/lib/Driver/Tools.cpp	(revision 247192)
@@ -1,6410 +1,6434 @@
 //===--- Tools.cpp - Tools Implementations --------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
 
 #include "Tools.h"
 
 #include "clang/Driver/Action.h"
 #include "clang/Driver/Arg.h"
 #include "clang/Driver/ArgList.h"
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/DriverDiagnostic.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Job.h"
 #include "clang/Driver/Option.h"
 #include "clang/Driver/Options.h"
 #include "clang/Driver/ToolChain.h"
 #include "clang/Driver/Util.h"
 #include "clang/Basic/ObjCRuntime.h"
 
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/Host.h"
 #include "llvm/Support/Process.h"
 #include "llvm/Support/ErrorHandling.h"
 
 #include "InputInfo.h"
 #include "SanitizerArgs.h"
 #include "ToolChains.h"
 
 using namespace clang::driver;
 using namespace clang::driver::tools;
 using namespace clang;
 
 /// CheckPreprocessingOptions - Perform some validation of preprocessing
 /// arguments that is shared with gcc.
 static void CheckPreprocessingOptions(const Driver &D, const ArgList &Args) {
   if (Arg *A = Args.getLastArg(options::OPT_C, options::OPT_CC))
     if (!Args.hasArg(options::OPT_E) && !D.CCCIsCPP)
       D.Diag(diag::err_drv_argument_only_allowed_with)
         << A->getAsString(Args) << "-E";
 }
 
 /// CheckCodeGenerationOptions - Perform some validation of code generation
 /// arguments that is shared with gcc.
 static void CheckCodeGenerationOptions(const Driver &D, const ArgList &Args) {
   // In gcc, only ARM checks this, but it seems reasonable to check universally.
   if (Args.hasArg(options::OPT_static))
     if (const Arg *A = Args.getLastArg(options::OPT_dynamic,
                                        options::OPT_mdynamic_no_pic))
       D.Diag(diag::err_drv_argument_not_allowed_with)
         << A->getAsString(Args) << "-static";
 }
 
 // Append Target to Res, quoted for use as a target name in a GNU Make
 // dependency file. Only '$', '#', ' ' and '\t' receive special treatment:
 //   - '$' is doubled,
 //   - '#' is preceded by a backslash,
 //   - ' ' and '\t' are preceded by a backslash, and every backslash that
 //     directly precedes them in Target is doubled.
 static void QuoteTarget(StringRef Target,
                         SmallVectorImpl<char> &Res) {
   const unsigned Length = Target.size();
   for (unsigned Pos = 0; Pos != Length; ++Pos) {
     const char Ch = Target[Pos];
 
     if (Ch == ' ' || Ch == '\t') {
       // Emit one extra backslash for each backslash in the run that ends
       // right before this character...
       for (unsigned Back = Pos; Back != 0 && Target[Back - 1] == '\\'; --Back)
         Res.push_back('\\');
       // ...and one more to escape the space/tab itself.
       Res.push_back('\\');
     } else if (Ch == '$') {
       Res.push_back('$');
     } else if (Ch == '#') {
       Res.push_back('\\');
     }
 
     // The character itself is always copied through.
     Res.push_back(Ch);
   }
 }
 
 static void addDirectoryList(const ArgList &Args,
                              ArgStringList &CmdArgs,
                              const char *ArgName,
                              const char *EnvVar) {
   const char *DirList = ::getenv(EnvVar);
   bool CombinedArg = false;
 
   if (!DirList)
     return; // Nothing to do.
 
   StringRef Name(ArgName);
   if (Name.equals("-I") || Name.equals("-L"))
     CombinedArg = true;
 
   StringRef Dirs(DirList);
   if (Dirs.empty()) // Empty string should not add '.'.
     return;
 
   StringRef::size_type Delim;
   while ((Delim = Dirs.find(llvm::sys::PathSeparator)) != StringRef::npos) {
     if (Delim == 0) { // Leading colon.
       if (CombinedArg) {
         CmdArgs.push_back(Args.MakeArgString(std::string(ArgName) + "."));
       } else {
         CmdArgs.push_back(ArgName);
         CmdArgs.push_back(".");
       }
     } else {
       if (CombinedArg) {
         CmdArgs.push_back(Args.MakeArgString(std::string(ArgName) + Dirs.substr(0, Delim)));
       } else {
         CmdArgs.push_back(ArgName);
         CmdArgs.push_back(Args.MakeArgString(Dirs.substr(0, Delim)));
       }
     }
     Dirs = Dirs.substr(Delim + 1);
   }
 
   if (Dirs.empty()) { // Trailing colon.
     if (CombinedArg) {
       CmdArgs.push_back(Args.MakeArgString(std::string(ArgName) + "."));
     } else {
       CmdArgs.push_back(ArgName);
       CmdArgs.push_back(".");
     }
   } else { // Add the last path.
     if (CombinedArg) {
       CmdArgs.push_back(Args.MakeArgString(std::string(ArgName) + Dirs));
     } else {
       CmdArgs.push_back(ArgName);
       CmdArgs.push_back(Args.MakeArgString(Dirs));
     }
   }
 }
 
 static void AddLinkerInputs(const ToolChain &TC,
                             const InputInfoList &Inputs, const ArgList &Args,
                             ArgStringList &CmdArgs) {
   const Driver &D = TC.getDriver();
 
   // Add extra linker input arguments which are not treated as inputs
   // (constructed via -Xarch_).
   Args.AddAllArgValues(CmdArgs, options::OPT_Zlinker_input);
 
   for (InputInfoList::const_iterator
          it = Inputs.begin(), ie = Inputs.end(); it != ie; ++it) {
     const InputInfo &II = *it;
 
     if (!TC.HasNativeLLVMSupport()) {
       // Don't try to pass LLVM inputs unless we have native support.
       if (II.getType() == types::TY_LLVM_IR ||
           II.getType() == types::TY_LTO_IR ||
           II.getType() == types::TY_LLVM_BC ||
           II.getType() == types::TY_LTO_BC)
         D.Diag(diag::err_drv_no_linker_llvm_support)
           << TC.getTripleString();
     }
 
     // Add filenames immediately.
     if (II.isFilename()) {
       CmdArgs.push_back(II.getFilename());
       continue;
     }
 
     // Otherwise, this is a linker input argument.
     const Arg &A = II.getInputArg();
 
     // Handle reserved library options.
     if (A.getOption().matches(options::OPT_Z_reserved_lib_stdcxx)) {
       TC.AddCXXStdlibLibArgs(Args, CmdArgs);
     } else if (A.getOption().matches(options::OPT_Z_reserved_lib_cckext)) {
       TC.AddCCKextLibArgs(Args, CmdArgs);
     } else
       A.renderAsInput(Args, CmdArgs);
   }
 
   // LIBRARY_PATH - included following the user specified library paths.
   addDirectoryList(Args, CmdArgs, "-L", "LIBRARY_PATH");
 }
 
 /// \brief Report whether Objective-C automated reference counting is on,
 /// i.e. whether -fobjc-arc wins over -fno-objc-arc (off when neither is
 /// given).
 static bool isObjCAutoRefCount(const ArgList &Args) {
   const bool ARCEnabled = Args.hasFlag(options::OPT_fobjc_arc,
                                        options::OPT_fno_objc_arc,
                                        /*Default=*/false);
   return ARCEnabled;
 }
 
 /// \brief Report whether the ObjC runtime is being linked: always with ARC,
 /// otherwise only when -fobjc-link-runtime was given.
 static bool isObjCRuntimeLinked(const ArgList &Args) {
   if (!isObjCAutoRefCount(Args))
     return Args.hasArg(options::OPT_fobjc_link_runtime);
 
   // ARC implies the runtime; claim any explicit -fobjc-link-runtime as well.
   Args.ClaimAllArgs(options::OPT_fobjc_link_runtime);
   return true;
 }
 
 // Append the profiling runtime archive to CmdArgs when any of
 // -fprofile-arcs, -fprofile-generate, -fcreate-profile or -coverage was
 // given. The Triple parameter is not consulted.
 static void addProfileRT(const ToolChain &TC, const ArgList &Args,
                          ArgStringList &CmdArgs,
                          llvm::Triple Triple) {
   const bool WantsProfileRT = Args.hasArg(options::OPT_fprofile_arcs) ||
                               Args.hasArg(options::OPT_fprofile_generate) ||
                               Args.hasArg(options::OPT_fcreate_profile) ||
                               Args.hasArg(options::OPT_coverage);
   if (!WantsProfileRT)
     return;
 
   // GCC links libgcov.a by adding -L<inst>/gcc/lib/gcc/<triple>/<ver> -lgcov
   // to the link line. That does not work here because, unlike gcov, there is
   // also a libprofile_rt.so. The -l:libprofile_rt.a syntax used previously is
   // not supported by old linkers, so the archive is named by its full path.
   std::string ProfileRT(TC.getDriver().Dir);
   ProfileRT += "/../lib/libprofile_rt.a";
 
   CmdArgs.push_back(Args.MakeArgString(ProfileRT));
 }
 
 // Report whether option O may be forwarded to gcc: it must carry none of the
 // NoForward, DriverOption and LinkerInput flags.
 static bool forwardToGCC(const Option &O) {
   const bool Blocked = O.hasFlag(options::NoForward) ||
                        O.hasFlag(options::DriverOption) ||
                        O.hasFlag(options::LinkerInput);
   return !Blocked;
 }
 
 /// AddPreprocessingOptions - Append the preprocessor related arguments of
 /// this invocation to CmdArgs: comment retention (-C/-CC), dependency file
 /// generation (the -M family), -include handling with PCH/PTH substitution,
 /// macro and include path options, the sysroot, the module cache path, and
 /// include directories taken from the environment and the toolchain.
 ///
 /// \param C - Compilation used to register failure result files and to query
 /// the sysroot.
 /// \param D - Driver used for diagnostics and for the CCCUsePCH setting.
 /// \param Args - The argument list being translated.
 /// \param CmdArgs - Output list the rendered arguments are appended to.
 /// \param Output - Output description; consulted for the dependency file.
 /// \param Inputs - Inputs of the job; Inputs[0] determines the default
 /// dependency target and whether C++ stdlib includes are added.
 void Clang::AddPreprocessingOptions(Compilation &C,
                                     const Driver &D,
                                     const ArgList &Args,
                                     ArgStringList &CmdArgs,
                                     const InputInfo &Output,
                                     const InputInfoList &Inputs) const {
   // After the condition below, A is the dependency option that matched, or
   // null when none of -M, -MM, -MD, -MMD was given (the last assignment in
   // the short-circuited chain is the one that sticks).
   Arg *A;
 
   CheckPreprocessingOptions(D, Args);
 
   Args.AddLastArg(CmdArgs, options::OPT_C);
   Args.AddLastArg(CmdArgs, options::OPT_CC);
 
   // Handle dependency file generation.
   if ((A = Args.getLastArg(options::OPT_M, options::OPT_MM)) ||
       (A = Args.getLastArg(options::OPT_MD)) ||
       (A = Args.getLastArg(options::OPT_MMD))) {
     // Determine the output location: an explicit -MF, else the job output if
     // it is a dependency file, else stdout ("-") for -M/-MM, else a name
     // derived from the inputs.
     const char *DepFile;
     if (Arg *MF = Args.getLastArg(options::OPT_MF)) {
       DepFile = MF->getValue();
       C.addFailureResultFile(DepFile);
     } else if (Output.getType() == types::TY_Dependencies) {
       DepFile = Output.getFilename();
     } else if (A->getOption().matches(options::OPT_M) ||
                A->getOption().matches(options::OPT_MM)) {
       DepFile = "-";
     } else {
       DepFile = darwin::CC1::getDependencyFileName(Args, Inputs);
       C.addFailureResultFile(DepFile);
     }
     CmdArgs.push_back("-dependency-file");
     CmdArgs.push_back(DepFile);
 
     // Add a default target if one wasn't specified.
     if (!Args.hasArg(options::OPT_MT) && !Args.hasArg(options::OPT_MQ)) {
       const char *DepTarget;
 
       // If user provided -o, that is the dependency target, except
       // when we are only generating a dependency file.
       Arg *OutputOpt = Args.getLastArg(options::OPT_o);
       if (OutputOpt && Output.getType() != types::TY_Dependencies) {
         DepTarget = OutputOpt->getValue();
       } else {
         // Otherwise derive from the base input.
         //
         // FIXME: This should use the computed output file location.
         SmallString<128> P(Inputs[0].getBaseInput());
         llvm::sys::path::replace_extension(P, "o");
         DepTarget = Args.MakeArgString(llvm::sys::path::filename(P));
       }
 
       // The default target is always passed in Make-quoted form.
       CmdArgs.push_back("-MT");
       SmallString<128> Quoted;
       QuoteTarget(DepTarget, Quoted);
       CmdArgs.push_back(Args.MakeArgString(Quoted));
     }
 
     // -M and -MD (unlike -MM and -MMD) also request system header
     // dependencies.
     if (A->getOption().matches(options::OPT_M) ||
         A->getOption().matches(options::OPT_MD))
       CmdArgs.push_back("-sys-header-deps");
   }
 
   // -MG is diagnosed unless -M or -MM is the active dependency option; the
   // flag is forwarded either way.
   if (Args.hasArg(options::OPT_MG)) {
     if (!A || A->getOption().matches(options::OPT_MD) ||
               A->getOption().matches(options::OPT_MMD))
       D.Diag(diag::err_drv_mg_requires_m_or_mm);
     CmdArgs.push_back("-MG");
   }
 
   Args.AddLastArg(CmdArgs, options::OPT_MP);
 
   // Convert all -MQ <target> args to -MT <quoted target>
   for (arg_iterator it = Args.filtered_begin(options::OPT_MT,
                                              options::OPT_MQ),
          ie = Args.filtered_end(); it != ie; ++it) {
     const Arg *A = *it;
     A->claim();
 
     if (A->getOption().matches(options::OPT_MQ)) {
       CmdArgs.push_back("-MT");
       SmallString<128> Quoted;
       QuoteTarget(A->getValue(), Quoted);
       CmdArgs.push_back(Args.MakeArgString(Quoted));
 
     // -MT flag - no change
     } else {
       A->render(Args, CmdArgs);
     }
   }
 
   // Add -i* options, and automatically translate to
   // -include-pch/-include-pth for transparent PCH support. It's
   // wonky, but we include looking for .gch so we can support seamless
   // replacement into a build system already set up to be generating
   // .gch files.
   bool RenderedImplicitInclude = false;
   for (arg_iterator it = Args.filtered_begin(options::OPT_clang_i_Group),
          ie = Args.filtered_end(); it != ie; ++it) {
     const Arg *A = it;
 
     if (A->getOption().matches(options::OPT_include)) {
       // Only the first -include on the command line may be replaced by a
       // precompiled file.
       bool IsFirstImplicitInclude = !RenderedImplicitInclude;
       RenderedImplicitInclude = true;
 
       // Use PCH if the user requested it.
       bool UsePCH = D.CCCUsePCH;
 
       bool FoundPTH = false;
       bool FoundPCH = false;
       llvm::sys::Path P(A->getValue());
       bool Exists;
       // Probe <file>.pch, then <file>.pth, then <file>.gch; P keeps the
       // suffix of the first candidate that exists.
       // NOTE(review): exists() appears to return a falsy value on success and
       // to report the result through Exists - confirm against
       // llvm/Support/FileSystem.h.
       if (UsePCH) {
         P.appendSuffix("pch");
         if (!llvm::sys::fs::exists(P.str(), Exists) && Exists)
           FoundPCH = true;
         else
           P.eraseSuffix();
       }
 
       if (!FoundPCH) {
         P.appendSuffix("pth");
         if (!llvm::sys::fs::exists(P.str(), Exists) && Exists)
           FoundPTH = true;
         else
           P.eraseSuffix();
       }
 
       if (!FoundPCH && !FoundPTH) {
         P.appendSuffix("gch");
         if (!llvm::sys::fs::exists(P.str(), Exists) && Exists) {
           // A .gch file counts as a PCH or as a PTH depending on UsePCH.
           FoundPCH = UsePCH;
           FoundPTH = !UsePCH;
         }
         else
           P.eraseSuffix();
       }
 
       if (FoundPCH || FoundPTH) {
         if (IsFirstImplicitInclude) {
           A->claim();
           // NOTE(review): the flag is chosen from UsePCH rather than from
           // FoundPCH, so a '.pth' hit while UsePCH is set is passed as
           // -include-pch - confirm this is intended.
           if (UsePCH)
             CmdArgs.push_back("-include-pch");
           else
             CmdArgs.push_back("-include-pth");
           CmdArgs.push_back(Args.MakeArgString(P.str()));
           continue;
         } else {
           // Ignore the PCH if not first on command line and emit warning.
           D.Diag(diag::warn_drv_pch_not_first_include)
               << P.str() << A->getAsString(Args);
         }
       }
     }
 
     // Not translated, render as usual.
     A->claim();
     A->render(Args, CmdArgs);
   }
 
   Args.AddAllArgs(CmdArgs, options::OPT_D, options::OPT_U);
   Args.AddAllArgs(CmdArgs, options::OPT_I_Group, options::OPT_F,
                   options::OPT_index_header_map);
 
   // Add -Wp, and -Xassembler if using the preprocessor.
 
   // FIXME: There is a very unfortunate problem here, some troubled
   // souls abuse -Wp, to pass preprocessor options in gcc syntax. To
   // really support that we would have to parse and then translate
   // those options. :(
   Args.AddAllArgValues(CmdArgs, options::OPT_Wp_COMMA,
                        options::OPT_Xpreprocessor);
 
   // -I- is a deprecated GCC feature, reject it.
   if (Arg *A = Args.getLastArg(options::OPT_I_))
     D.Diag(diag::err_drv_I_dash_not_supported) << A->getAsString(Args);
 
   // If we have a --sysroot, and don't have an explicit -isysroot flag, add an
   // -isysroot to the CC1 invocation.
   StringRef sysroot = C.getSysRoot();
   if (sysroot != "") {
     if (!Args.hasArg(options::OPT_isysroot)) {
       CmdArgs.push_back("-isysroot");
       CmdArgs.push_back(C.getArgs().MakeArgString(sysroot));
     }
   }
   
   // If a module path was provided, pass it along. Otherwise, use a temporary
   // directory.
   if (Arg *A = Args.getLastArg(options::OPT_fmodule_cache_path)) {
     A->claim();
     A->render(Args, CmdArgs);
   } else {
     SmallString<128> DefaultModuleCache;
     llvm::sys::path::system_temp_directory(/*erasedOnReboot=*/false, 
                                            DefaultModuleCache);
     llvm::sys::path::append(DefaultModuleCache, "clang-module-cache");
     CmdArgs.push_back("-fmodule-cache-path");
     CmdArgs.push_back(Args.MakeArgString(DefaultModuleCache));
   }
   
   // Parse additional include paths from environment variables.
   // FIXME: We should probably sink the logic for handling these from the
   // frontend into the driver. It will allow deleting 4 otherwise unused flags.
   // CPATH - included following the user specified includes (but prior to
   // builtin and standard includes).
   addDirectoryList(Args, CmdArgs, "-I", "CPATH");
   // C_INCLUDE_PATH - system includes enabled when compiling C.
   addDirectoryList(Args, CmdArgs, "-c-isystem", "C_INCLUDE_PATH");
   // CPLUS_INCLUDE_PATH - system includes enabled when compiling C++.
   addDirectoryList(Args, CmdArgs, "-cxx-isystem", "CPLUS_INCLUDE_PATH");
   // OBJC_INCLUDE_PATH - system includes enabled when compiling ObjC.
   addDirectoryList(Args, CmdArgs, "-objc-isystem", "OBJC_INCLUDE_PATH");
   // OBJCPLUS_INCLUDE_PATH - system includes enabled when compiling ObjC++.
   addDirectoryList(Args, CmdArgs, "-objcxx-isystem", "OBJCPLUS_INCLUDE_PATH");
 
   // Add C++ include arguments, if needed.
   if (types::isCXX(Inputs[0].getType()))
     getToolChain().AddClangCXXStdlibIncludeArgs(Args, CmdArgs);
 
   // Add system include arguments.
   getToolChain().AddClangSystemIncludeArgs(Args, CmdArgs);
 }
 
 /// getLLVMArchSuffixForARM - Map an ARM CPU name to the LLVM architecture
 /// suffix used for it (e.g. "cortex-a8" -> "v7"). Unknown CPUs map to the
 /// empty string.
 //
 // FIXME: This is redundant with -mcpu, why does LLVM use this.
 // FIXME: tblgen this, or kill it!
 static const char *getLLVMArchSuffixForARM(StringRef CPU) {
   const char *Suffix = llvm::StringSwitch<const char *>(CPU)
     // v4t
     .Cases("arm7tdmi", "arm7tdmi-s", "arm710t", "arm720t", "v4t")
     .Cases("arm9", "arm9tdmi", "arm920", "arm920t", "v4t")
     .Cases("arm922t", "arm940t", "ep9312", "v4t")
     // v5
     .Cases("arm10tdmi", "arm1020t", "v5")
     // v5e
     .Cases("arm9e", "arm926ej-s", "arm946e-s", "arm966e-s", "v5e")
     .Cases("arm968e-s", "arm10e", "arm1020e", "arm1022e", "v5e")
     .Cases("xscale", "iwmmxt", "v5e")
     // v6 family
     .Cases("arm1136j-s", "arm1136jf-s", "arm1176jz-s", "arm1176jzf-s", "v6")
     .Cases("mpcorenovfp", "mpcore", "v6")
     .Cases("arm1156t2-s", "arm1156t2f-s", "v6t2")
     .Case("cortex-m0", "v6m")
     // v7 family
     .Cases("cortex-a8", "cortex-a9", "cortex-a15", "v7")
     .Case("cortex-a9-mp", "v7f")
     .Cases("cortex-m3", "cortex-m4", "v7m")
     .Case("swift", "v7s")
     .Default("");
   return Suffix;
 }
 
 /// getARMTargetCPU - Compute the (LLVM) name of the ARM CPU being targeted.
 /// An explicit -mcpu= is returned as is (with "native" replaced by the host
 /// CPU name). Otherwise the architecture from -march=, or from the triple
 /// when -march= is absent, is mapped to a base CPU for that architecture;
 /// unknown architectures yield "arm7tdmi".
 //
 // FIXME: tblgen this.
 static std::string getARMTargetCPU(const ArgList &Args,
                                    const llvm::Triple &Triple) {
   // FIXME: Warn on inconsistent use of -mcpu and -march.
 
   // An explicit -mcpu= always wins.
   if (Arg *CPUArg = Args.getLastArg(options::OPT_mcpu_EQ)) {
     StringRef Requested = CPUArg->getValue();
     if (Requested != "native")
       return Requested;
     // -mcpu=native: ask the host.
     return llvm::sys::getHostCPUName();
   }
 
   // No -mcpu=: start from -march= if present, else from the triple.
   StringRef ArchName;
   Arg *ArchArg = Args.getLastArg(options::OPT_march_EQ);
   if (ArchArg)
     ArchName = ArchArg->getValue();
   else
     ArchName = Triple.getArchName();
 
   // -march=native: translate the host CPU into an architecture name so the
   // table below can pick the base CPU for it. HostArch owns the storage that
   // ArchName then refers to, so it has to outlive the lookup.
   std::string HostArch;
   if (ArchName == "native") {
     std::string HostCPU = llvm::sys::getHostCPUName();
     if (HostCPU != "generic") {
       HostArch = std::string("arm") + getLLVMArchSuffixForARM(HostCPU);
       ArchName = HostArch;
     }
   }
 
   const char *BaseCPU = llvm::StringSwitch<const char *>(ArchName)
     .Cases("armv2", "armv2a", "arm2")
     .Case("armv3", "arm6")
     .Case("armv3m", "arm7m")
     .Cases("armv4", "armv4t", "arm7tdmi")
     .Cases("armv5", "armv5t", "arm10tdmi")
     .Cases("armv5e", "armv5te", "arm1022e")
     .Case("armv5tej", "arm926ej-s")
     .Cases("armv6", "armv6k", "arm1136jf-s")
     .Case("armv6j", "arm1136j-s")
     .Cases("armv6z", "armv6zk", "arm1176jzf-s")
     .Case("armv6t2", "arm1156t2-s")
     .Cases("armv6m", "armv6-m", "cortex-m0")
     .Cases("armv7", "armv7a", "armv7-a", "cortex-a8")
     .Cases("armv7f", "armv7-f", "cortex-a9-mp")
     .Cases("armv7s", "armv7-s", "swift")
     .Cases("armv7r", "armv7-r", "cortex-r4")
     .Cases("armv7m", "armv7-m", "cortex-m3")
     .Case("ep9312", "ep9312")
     .Case("iwmmxt", "iwmmxt")
     .Case("xscale", "xscale")
     // Nothing matched: fall back to the most basic CPU LLVM supports.
     .Default("arm7tdmi");
   return BaseCPU;
 }
 
 // Report whether plain 'char' defaults to signed for Triple. That is the
 // case everywhere except on ARM, PPC and PPC64 when the OS is not Darwin.
 //
 // FIXME: Move to target hook.
 static bool isSignedCharDefault(const llvm::Triple &Triple) {
   const llvm::Triple::ArchType Arch = Triple.getArch();
   const bool MayBeUnsigned = Arch == llvm::Triple::arm ||
                              Arch == llvm::Triple::ppc ||
                              Arch == llvm::Triple::ppc64;
   if (!MayBeUnsigned)
     return true;
 
   // On these architectures only Darwin keeps 'char' signed.
   return Triple.isOSDarwin();
 }
 
 // Handle -mfpu=.
 //
 // FIXME: Centralize feature selection, defaulting shouldn't be also in the
 // frontend target.
 static void addFPUArgs(const Driver &D, const Arg *A, const ArgList &Args,
                        ArgStringList &CmdArgs) {
   StringRef FPU = A->getValue();
 
   // Set the target features based on the FPU.
   if (FPU == "fpa" || FPU == "fpe2" || FPU == "fpe3" || FPU == "maverick") {
     // Disable any default FPU support.
     CmdArgs.push_back("-target-feature");
     CmdArgs.push_back("-vfp2");
     CmdArgs.push_back("-target-feature");
     CmdArgs.push_back("-vfp3");
     CmdArgs.push_back("-target-feature");
     CmdArgs.push_back("-neon");
   } else if (FPU == "vfp3-d16" || FPU == "vfpv3-d16") {
     CmdArgs.push_back("-target-feature");
     CmdArgs.push_back("+vfp3");
     CmdArgs.push_back("-target-feature");
     CmdArgs.push_back("+d16");
     CmdArgs.push_back("-target-feature");
     CmdArgs.push_back("-neon");
   } else if (FPU == "vfp") {
     CmdArgs.push_back("-target-feature");
     CmdArgs.push_back("+vfp2");
     CmdArgs.push_back("-target-feature");
     CmdArgs.push_back("-neon");
   } else if (FPU == "vfp3" || FPU == "vfpv3") {
     CmdArgs.push_back("-target-feature");
     CmdArgs.push_back("+vfp3");
     CmdArgs.push_back("-target-feature");
     CmdArgs.push_back("-neon");
   } else if (FPU == "neon") {
     CmdArgs.push_back("-target-feature");
     CmdArgs.push_back("+neon");
   } else
     D.Diag(diag::err_drv_clang_unsupported) << A->getAsString(Args);
 }
 
 // Handle -mfpmath=.
 static void addFPMathArgs(const Driver &D, const Arg *A, const ArgList &Args,
                           ArgStringList &CmdArgs, StringRef CPU) {
   StringRef FPMath = A->getValue();
   
   // Set the target features based on the FPMath.
   if (FPMath == "neon") {
     CmdArgs.push_back("-target-feature");
     CmdArgs.push_back("+neonfp");
     
     if (CPU != "cortex-a8" && CPU != "cortex-a9" && CPU != "cortex-a9-mp" &&
         CPU != "cortex-a15")
       D.Diag(diag::err_drv_invalid_feature) << "-mfpmath=neon" << CPU;
     
   } else if (FPMath == "vfp" || FPMath == "vfp2" || FPMath == "vfp3" ||
              FPMath == "vfp4") {
     CmdArgs.push_back("-target-feature");
     CmdArgs.push_back("-neonfp");
 
     // FIXME: Add warnings when disabling a feature not present for a given CPU.    
   } else
     D.Diag(diag::err_drv_clang_unsupported) << A->getAsString(Args);
 }
 
 // Pick the ARM float ABI used when the command line does not select one.
 // The choice depends on the OS and, for other OSes, on the environment of
 // Triple; a warning is emitted when the result is only a guess.
 static StringRef getDefaultARMFloatABI(const Driver &D,
                                        const ArgList &Args,
                                        const llvm::Triple &Triple) {
   switch (Triple.getOS()) {
   case llvm::Triple::Darwin:
   case llvm::Triple::MacOSX:
   case llvm::Triple::IOS: {
     // Darwin defaults to "softfp" for v6 and v7.
     //
     // FIXME: Factor out an ARM class so we can cache the arch somewhere.
     const std::string Arch =
       getLLVMArchSuffixForARM(getARMTargetCPU(Args, Triple));
     const StringRef ArchRef(Arch);
     if (ArchRef.startswith("v6") || ArchRef.startswith("v7"))
       return "softfp";
     return "soft";
   }
 
   case llvm::Triple::FreeBSD:
     // FreeBSD defaults to soft float.
     return "soft";
 
   default:
     break;
   }
 
   switch (Triple.getEnvironment()) {
   case llvm::Triple::GNUEABIHF:
     return "hard";
 
   case llvm::Triple::GNUEABI:
     return "softfp";
 
   case llvm::Triple::EABI:
     // EABI is always AAPCS, and if it was not marked 'hard', it's softfp.
     return "softfp";
 
   case llvm::Triple::Android: {
     const std::string Arch =
       getLLVMArchSuffixForARM(getARMTargetCPU(Args, Triple));
     if (StringRef(Arch).startswith("v7"))
       return "softfp";
     return "soft";
   }
 
   default:
     break;
   }
 
   // Assume "soft", but warn the user we are guessing.
   D.Diag(diag::warn_drv_assuming_mfloat_abi_is) << "soft";
   return "soft";
 }
 
 // Select the ARM float ABI ("soft", "softfp" or "hard") from the last of
 // -msoft-float, -mhard-float and -mfloat-abi=. An invalid -mfloat-abi= value
 // is diagnosed and replaced by "soft". Without any of these options the
 // platform default is used.
 static StringRef getARMFloatABI(const Driver &D,
                                 const ArgList &Args,
                                 const llvm::Triple &Triple) {
   StringRef FloatABI;
 
   Arg *ABIArg = Args.getLastArg(options::OPT_msoft_float,
                                 options::OPT_mhard_float,
                                 options::OPT_mfloat_abi_EQ);
   if (ABIArg) {
     if (ABIArg->getOption().matches(options::OPT_msoft_float)) {
       FloatABI = "soft";
     } else if (ABIArg->getOption().matches(options::OPT_mhard_float)) {
       FloatABI = "hard";
     } else {
       // -mfloat-abi=<value>: accept only the three known spellings.
       FloatABI = ABIArg->getValue();
       const bool Known = FloatABI == "soft" || FloatABI == "softfp" ||
                          FloatABI == "hard";
       if (!Known) {
         D.Diag(diag::err_drv_invalid_mfloat_abi)
           << ABIArg->getAsString(Args);
         FloatABI = "soft";
       }
     }
   }
 
   // If unspecified, choose the default based on the platform.
   if (FloatABI.empty())
     FloatABI = getDefaultARMFloatABI(D, Args, Triple);
 
   return FloatABI;
 }
 
 
 void Clang::AddARMTargetArgs(const ArgList &Args,
                              ArgStringList &CmdArgs,
                              bool KernelOrKext) const {
   const Driver &D = getToolChain().getDriver();
   // Get the effective triple, which takes into account the deployment target.
   std::string TripleStr = getToolChain().ComputeEffectiveClangTriple(Args);
   llvm::Triple Triple(TripleStr);
   std::string CPUName = getARMTargetCPU(Args, Triple);
 
   // Select the ABI to use.
   //
   // FIXME: Support -meabi.
   const char *ABIName = 0;
   if (Arg *A = Args.getLastArg(options::OPT_mabi_EQ)) {
     ABIName = A->getValue();
   } else if (Triple.isOSDarwin()) {
     // The backend is hardwired to assume AAPCS for M-class processors, ensure
     // the frontend matches that.
     if (StringRef(CPUName).startswith("cortex-m")) {
       ABIName = "aapcs";
     } else {
       ABIName = "apcs-gnu";
     }
   } else {
     // Select the default based on the platform.
     switch(Triple.getEnvironment()) {
     case llvm::Triple::Android:
     case llvm::Triple::GNUEABI:
     case llvm::Triple::GNUEABIHF:
       ABIName = "aapcs-linux";
       break;
     case llvm::Triple::EABI:
       ABIName = "aapcs";
       break;
     default:
       ABIName = "apcs-gnu";
     }
   }
   CmdArgs.push_back("-target-abi");
   CmdArgs.push_back(ABIName);
 
   // Set the CPU based on -march= and -mcpu=.
   CmdArgs.push_back("-target-cpu");
   CmdArgs.push_back(Args.MakeArgString(CPUName));
 
   // Determine floating point ABI from the options & target defaults.
   StringRef FloatABI = getARMFloatABI(D, Args, Triple);
   if (FloatABI == "soft") {
     // Floating point operations and argument passing are soft.
     //
     // FIXME: This changes CPP defines, we need -target-soft-float.
     CmdArgs.push_back("-msoft-float");
     CmdArgs.push_back("-mfloat-abi");
     CmdArgs.push_back("soft");
   } else if (FloatABI == "softfp") {
     // Floating point operations are hard, but argument passing is soft.
     CmdArgs.push_back("-mfloat-abi");
     CmdArgs.push_back("soft");
   } else {
     // Floating point operations and argument passing are hard.
     assert(FloatABI == "hard" && "Invalid float abi!");
     CmdArgs.push_back("-mfloat-abi");
     CmdArgs.push_back("hard");
   }
 
   // Set appropriate target features for floating point mode.
   //
   // FIXME: Note, this is a hack, the LLVM backend doesn't actually use these
   // yet (it uses the -mfloat-abi and -msoft-float options above), and it is
   // stripped out by the ARM target.
 
   // Use software floating point operations?
   if (FloatABI == "soft") {
     CmdArgs.push_back("-target-feature");
     CmdArgs.push_back("+soft-float");
   }
 
   // Use software floating point argument passing?
   if (FloatABI != "hard") {
     CmdArgs.push_back("-target-feature");
     CmdArgs.push_back("+soft-float-abi");
   }
 
   // Honor -mfpu=.
   if (const Arg *A = Args.getLastArg(options::OPT_mfpu_EQ))
     addFPUArgs(D, A, Args, CmdArgs);
 
   // Honor -mfpmath=.
   if (const Arg *A = Args.getLastArg(options::OPT_mfpmath_EQ))
     addFPMathArgs(D, A, Args, CmdArgs, getARMTargetCPU(Args, Triple));
 
   // Setting -msoft-float effectively disables NEON because of the GCC
   // implementation, although the same isn't true of VFP or VFP3.
   if (FloatABI == "soft") {
     CmdArgs.push_back("-target-feature");
     CmdArgs.push_back("-neon");
   }
 
   // Kernel code has more strict alignment requirements.
   if (KernelOrKext) {
     if (Triple.getOS() != llvm::Triple::IOS || Triple.isOSVersionLT(6)) {
       CmdArgs.push_back("-backend-option");
       CmdArgs.push_back("-arm-long-calls");
     }
 
     CmdArgs.push_back("-backend-option");
     CmdArgs.push_back("-arm-strict-align");
 
     // The kext linker doesn't know how to deal with movw/movt.
     CmdArgs.push_back("-backend-option");
     CmdArgs.push_back("-arm-darwin-use-movt=0");
   }
 
   // Setting -mno-global-merge disables the codegen global merge pass. Setting 
   // -mglobal-merge has no effect as the pass is enabled by default.
   if (Arg *A = Args.getLastArg(options::OPT_mglobal_merge,
                                options::OPT_mno_global_merge)) {
     if (A->getOption().matches(options::OPT_mno_global_merge))
       CmdArgs.push_back("-mno-global-merge");
   }
 
   if (Args.hasArg(options::OPT_mno_implicit_float))
     CmdArgs.push_back("-no-implicit-float");
 }
 
 // Translate a MIPS CPU alias option (-mips32, -mips32r2, -mips64,
 // -mips64r2) into the CPU name it stands for. Any other option is a
 // programming error.
 static StringRef getMipsCPUFromAlias(const Arg &A) {
   const Option &Opt = A.getOption();
 
   if (Opt.matches(options::OPT_mips32))
     return "mips32";
   else if (Opt.matches(options::OPT_mips32r2))
     return "mips32r2";
   else if (Opt.matches(options::OPT_mips64))
     return "mips64";
   else if (Opt.matches(options::OPT_mips64r2))
     return "mips64r2";
 
   llvm_unreachable("Unexpected option");
   return "";
 }
 
 // Get CPU and ABI names. They are not independent
 // so we have to calculate them together.
 static void getMipsCPUAndABI(const ArgList &Args,
                              const ToolChain &TC,
                              StringRef &CPUName,
                              StringRef &ABIName) {
   const char *DefMips32CPU = "mips32";
   const char *DefMips64CPU = "mips64";
 
   if (Arg *A = Args.getLastArg(options::OPT_march_EQ,
                                options::OPT_mcpu_EQ,
                                options::OPT_mips_CPUs_Group)) {
     if (A->getOption().matches(options::OPT_mips_CPUs_Group))
       CPUName = getMipsCPUFromAlias(*A);
     else
       CPUName = A->getValue();
   }
 
   if (Arg *A = Args.getLastArg(options::OPT_mabi_EQ))
     ABIName = A->getValue();
 
   // Setup default CPU and ABI names.
   if (CPUName.empty() && ABIName.empty()) {
     switch (TC.getTriple().getArch()) {
     default:
       llvm_unreachable("Unexpected triple arch name");
     case llvm::Triple::mips:
     case llvm::Triple::mipsel:
       CPUName = DefMips32CPU;
       break;
     case llvm::Triple::mips64:
     case llvm::Triple::mips64el:
       CPUName = DefMips64CPU;
       break;
     }
   }
 
   if (!ABIName.empty()) {
     // Deduce CPU name from ABI name.
     CPUName = llvm::StringSwitch<const char *>(ABIName)
       .Cases("o32", "eabi", DefMips32CPU)
       .Cases("n32", "n64", DefMips64CPU)
       .Default("");
   }
   else if (!CPUName.empty()) {
     // Deduce ABI name from CPU name.
     ABIName = llvm::StringSwitch<const char *>(CPUName)
       .Cases("mips32", "mips32r2", "o32")
       .Cases("mips64", "mips64r2", "n64")
       .Default("");
   }
 
   // FIXME: Warn on inconsistent cpu and abi usage.
 }
 
 // Select the MIPS float ABI ("soft", "single" or "hard") from the last of
 // -msoft-float, -mhard-float and -mfloat-abi=. An invalid -mfloat-abi= value
 // is diagnosed and replaced by "hard"; "hard" is also the default when none
 // of the options is given.
 static StringRef getMipsFloatABI(const Driver &D, const ArgList &Args) {
   StringRef FloatABI;
 
   Arg *ABIArg = Args.getLastArg(options::OPT_msoft_float,
                                 options::OPT_mhard_float,
                                 options::OPT_mfloat_abi_EQ);
   if (ABIArg) {
     if (ABIArg->getOption().matches(options::OPT_msoft_float)) {
       FloatABI = "soft";
     } else if (ABIArg->getOption().matches(options::OPT_mhard_float)) {
       FloatABI = "hard";
     } else {
       // -mfloat-abi=<value>: accept only the three known spellings.
       FloatABI = ABIArg->getValue();
       const bool Known = FloatABI == "soft" || FloatABI == "single" ||
                          FloatABI == "hard";
       if (!Known) {
         D.Diag(diag::err_drv_invalid_mfloat_abi) << ABIArg->getAsString(Args);
         FloatABI = "hard";
       }
     }
   }
 
   // If unspecified, assume "hard", because it's the default value used by
   // gcc. Once specific target MIPS processors are recognized, the default
   // can be selected more precisely.
   if (FloatABI.empty())
     FloatABI = "hard";
 
   return FloatABI;
 }
 
 static void AddTargetFeature(const ArgList &Args,
                              ArgStringList &CmdArgs,
                              OptSpecifier OnOpt,
                              OptSpecifier OffOpt,
                              StringRef FeatureName) {
   if (Arg *A = Args.getLastArg(OnOpt, OffOpt)) {
     CmdArgs.push_back("-target-feature");
     if (A->getOption().matches(OnOpt))
       CmdArgs.push_back(Args.MakeArgString("+" + FeatureName));
     else
       CmdArgs.push_back(Args.MakeArgString("-" + FeatureName));
   }
 }
 
 void Clang::AddMIPSTargetArgs(const ArgList &Args,
                              ArgStringList &CmdArgs) const {
   const Driver &D = getToolChain().getDriver();
   StringRef CPUName;
   StringRef ABIName;
   getMipsCPUAndABI(Args, getToolChain(), CPUName, ABIName);
 
   CmdArgs.push_back("-target-cpu");
   CmdArgs.push_back(CPUName.data());
 
   CmdArgs.push_back("-target-abi");
   CmdArgs.push_back(ABIName.data());
 
   StringRef FloatABI = getMipsFloatABI(D, Args);
 
   if (FloatABI == "soft") {
     // Floating point operations and argument passing are soft.
     CmdArgs.push_back("-msoft-float");
     CmdArgs.push_back("-mfloat-abi");
     CmdArgs.push_back("soft");
 
     // FIXME: Note, this is a hack. We need to pass the selected float
     // mode to the MipsTargetInfoBase to define appropriate macros there.
     // Now it is the only method.
     CmdArgs.push_back("-target-feature");
     CmdArgs.push_back("+soft-float");
   }
   else if (FloatABI == "single") {
     // Restrict the use of hardware floating-point
     // instructions to 32-bit operations.
     CmdArgs.push_back("-target-feature");
     CmdArgs.push_back("+single-float");
   }
   else {
     // Floating point operations and argument passing are hard.
     assert(FloatABI == "hard" && "Invalid float abi!");
     CmdArgs.push_back("-mfloat-abi");
     CmdArgs.push_back("hard");
   }
 
   AddTargetFeature(Args, CmdArgs,
                    options::OPT_mips16, options::OPT_mno_mips16,
                    "mips16");
   AddTargetFeature(Args, CmdArgs,
                    options::OPT_mdsp, options::OPT_mno_dsp,
                    "dsp");
   AddTargetFeature(Args, CmdArgs,
                    options::OPT_mdspr2, options::OPT_mno_dspr2,
                    "dspr2");
 
   if (Arg *A = Args.getLastArg(options::OPT_G)) {
     StringRef v = A->getValue();
     CmdArgs.push_back("-mllvm");
     CmdArgs.push_back(Args.MakeArgString("-mips-ssection-threshold=" + v));
     A->claim();
   }
 }
 
 /// getPPCTargetCPU - Get the (LLVM) name of the PowerPC cpu we are targeting.
 static std::string getPPCTargetCPU(const ArgList &Args) {
   if (Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) {
     StringRef CPUName = A->getValue();
 
     if (CPUName == "native") {
       std::string CPU = llvm::sys::getHostCPUName();
       if (!CPU.empty() && CPU != "generic")
         return CPU;
       else
         return "";
     }
 
     return llvm::StringSwitch<const char *>(CPUName)
       .Case("common", "generic")
       .Case("440", "440")
       .Case("440fp", "440")
       .Case("450", "450")
       .Case("601", "601")
       .Case("602", "602")
       .Case("603", "603")
       .Case("603e", "603e")
       .Case("603ev", "603ev")
       .Case("604", "604")
       .Case("604e", "604e")
       .Case("620", "620")
       .Case("G3", "g3")
       .Case("7400", "7400")
       .Case("G4", "g4")
       .Case("7450", "7450")
       .Case("G4+", "g4+")
       .Case("750", "750")
       .Case("970", "970")
       .Case("G5", "g5")
       .Case("a2", "a2")
       .Case("e500mc", "e500mc")
       .Case("e5500", "e5500")
       .Case("power6", "pwr6")
       .Case("power7", "pwr7")
       .Case("powerpc", "ppc")
       .Case("powerpc64", "ppc64")
       .Default("");
   }
 
   return "";
 }
 
 void Clang::AddPPCTargetArgs(const ArgList &Args,
                              ArgStringList &CmdArgs) const {
   std::string TargetCPUName = getPPCTargetCPU(Args);
 
   // LLVM may default to generating code for the native CPU,
   // but, like gcc, we default to a more generic option for
   // each architecture. (except on Darwin)
   llvm::Triple Triple = getToolChain().getTriple();
   if (TargetCPUName.empty() && !Triple.isOSDarwin()) {
     if (Triple.getArch() == llvm::Triple::ppc64)
       TargetCPUName = "ppc64";
     else
       TargetCPUName = "ppc";
   }
 
   if (!TargetCPUName.empty()) {
     CmdArgs.push_back("-target-cpu");
     CmdArgs.push_back(Args.MakeArgString(TargetCPUName.c_str()));
   }
 }
 
 void Clang::AddSparcTargetArgs(const ArgList &Args,
                              ArgStringList &CmdArgs) const {
   const Driver &D = getToolChain().getDriver();
 
   if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) {
     CmdArgs.push_back("-target-cpu");
     CmdArgs.push_back(A->getValue());
   }
 
   // Select the float ABI as determined by -msoft-float, -mhard-float, and
   StringRef FloatABI;
   if (Arg *A = Args.getLastArg(options::OPT_msoft_float,
                                options::OPT_mhard_float)) {
     if (A->getOption().matches(options::OPT_msoft_float))
       FloatABI = "soft";
     else if (A->getOption().matches(options::OPT_mhard_float))
       FloatABI = "hard";
   }
 
   // If unspecified, choose the default based on the platform.
   if (FloatABI.empty()) {
     switch (getToolChain().getTriple().getOS()) {
     default:
       // Assume "soft", but warn the user we are guessing.
       FloatABI = "soft";
       D.Diag(diag::warn_drv_assuming_mfloat_abi_is) << "soft";
       break;
     }
   }
 
   if (FloatABI == "soft") {
     // Floating point operations and argument passing are soft.
     //
     // FIXME: This changes CPP defines, we need -target-soft-float.
     CmdArgs.push_back("-msoft-float");
     CmdArgs.push_back("-target-feature");
     CmdArgs.push_back("+soft-float");
   } else {
     assert(FloatABI == "hard" && "Invalid float abi!");
     CmdArgs.push_back("-mhard-float");
   }
 }
 
+/// Choose the CPU name for an x86 or x86-64 target.
+///
+/// An explicit -march=<cpu> other than "native" is returned verbatim. For
+/// -march=native the detected host CPU is used unless detection yields an
+/// empty name or "generic". Otherwise a per-OS default is selected.
+/// Returns 0 when no CPU was chosen and the triple is neither x86 nor
+/// x86-64.
+static const char *getX86TargetCPU(const ArgList &Args,
+                                   const llvm::Triple &Triple) {
+  if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) {
+    const char *Requested = A->getValue();
+    if (StringRef(Requested) != "native")
+      return Requested;
+
+    // FIXME: Reject attempts to use -march=native unless the target matches
+    // the host.
+    //
+    // FIXME: We should also incorporate the detected target features for use
+    // with -march=native.
+    std::string HostCPU = llvm::sys::getHostCPUName();
+    if (!HostCPU.empty() && HostCPU != "generic")
+      return Args.MakeArgString(HostCPU);
+  }
+
+  // No CPU was given (or host detection failed): fall back to defaults.
+  const llvm::Triple::ArchType Arch = Triple.getArch();
+  const bool Is64Bit = Arch == llvm::Triple::x86_64;
+
+  // This routine is only handling x86 targets.
+  if (!Is64Bit && Arch != llvm::Triple::x86)
+    return 0;
+
+  // FIXME: Need target hooks.
+  if (Triple.isOSDarwin())
+    return Is64Bit ? "core2" : "yonah";
+
+  // Everything else goes to x86-64 in 64-bit mode.
+  if (Is64Bit)
+    return "x86-64";
+
+  // 32-bit defaults, keyed on the OS component of the triple.
+  const StringRef OSName = Triple.getOSName();
+  if (OSName.startswith("haiku"))
+    return "i586";
+  if (OSName.startswith("openbsd") ||
+      OSName.startswith("freebsd") ||
+      OSName.startswith("netbsd"))
+    return "i486";
+  if (OSName.startswith("bitrig"))
+    return "i686";
+
+  // All x86 devices running Android have core2 as their common
+  // denominator. This makes a better choice than pentium4.
+  if (Triple.getEnvironment() == llvm::Triple::Android)
+    return "core2";
+
+  // Fallback to p4.
+  return "pentium4";
+}
+
 /// Append the x86-specific arguments to CmdArgs: red-zone and
 /// implicit-float controls, the target CPU, and one "-target-feature" per
 /// option in the x86 feature group that is not directly overridden by a
 /// later option for the same feature.
 void Clang::AddX86TargetArgs(const ArgList &Args,
                              ArgStringList &CmdArgs) const {
-  const bool isAndroid =
-    getToolChain().getTriple().getEnvironment() == llvm::Triple::Android;
   // The red zone is disabled by -mno-red-zone, -mkernel or -fapple-kext.
   if (!Args.hasFlag(options::OPT_mred_zone,
                     options::OPT_mno_red_zone,
                     true) ||
       Args.hasArg(options::OPT_mkernel) ||
       Args.hasArg(options::OPT_fapple_kext))
     CmdArgs.push_back("-disable-red-zone");
 
   // -msoft-float maps to -no-implicit-float here.
   if (Args.hasFlag(options::OPT_msoft_float,
                    options::OPT_mno_soft_float,
                    false))
     CmdArgs.push_back("-no-implicit-float");
 
-  const char *CPUName = 0;
-  if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) {
-    if (StringRef(A->getValue()) == "native") {
-      // FIXME: Reject attempts to use -march=native unless the target matches
-      // the host.
-      //
-      // FIXME: We should also incorporate the detected target features for use
-      // with -native.
-      std::string CPU = llvm::sys::getHostCPUName();
-      if (!CPU.empty() && CPU != "generic")
-        CPUName = Args.MakeArgString(CPU);
-    } else
-      CPUName = A->getValue();
-  }
-
-  // Select the default CPU if none was given (or detection failed).
-  if (!CPUName) {
-    // FIXME: Need target hooks.
-    if (getToolChain().getTriple().isOSDarwin()) {
-      if (getToolChain().getArch() == llvm::Triple::x86_64)
-        CPUName = "core2";
-      else if (getToolChain().getArch() == llvm::Triple::x86)
-        CPUName = "yonah";
-    } else if (getToolChain().getOS().startswith("haiku"))  {
-      if (getToolChain().getArch() == llvm::Triple::x86_64)
-        CPUName = "x86-64";
-      else if (getToolChain().getArch() == llvm::Triple::x86)
-        CPUName = "i586";
-    } else if (getToolChain().getOS().startswith("openbsd"))  {
-      if (getToolChain().getArch() == llvm::Triple::x86_64)
-        CPUName = "x86-64";
-      else if (getToolChain().getArch() == llvm::Triple::x86)
-        CPUName = "i486";
-    } else if (getToolChain().getOS().startswith("bitrig"))  {
-      if (getToolChain().getArch() == llvm::Triple::x86_64)
-        CPUName = "x86-64";
-      else if (getToolChain().getArch() == llvm::Triple::x86)
-        CPUName = "i686";
-    } else if (getToolChain().getOS().startswith("freebsd"))  {
-      if (getToolChain().getArch() == llvm::Triple::x86_64)
-        CPUName = "x86-64";
-      else if (getToolChain().getArch() == llvm::Triple::x86)
-        CPUName = "i486";
-    } else if (getToolChain().getOS().startswith("netbsd"))  {
-      if (getToolChain().getArch() == llvm::Triple::x86_64)
-        CPUName = "x86-64";
-      else if (getToolChain().getArch() == llvm::Triple::x86)
-        CPUName = "i486";
-    } else {
-      if (getToolChain().getArch() == llvm::Triple::x86_64)
-        CPUName = "x86-64";
-      else if (getToolChain().getArch() == llvm::Triple::x86)
-        // All x86 devices running Android have core2 as their common
-        // denominator. This makes a better choice than pentium4.
-        CPUName = isAndroid ? "core2" : "pentium4";
-    }
-  }
-
-  if (CPUName) {
+  if (const char *CPUName = getX86TargetCPU(Args, getToolChain().getTriple())) {
     CmdArgs.push_back("-target-cpu");
     CmdArgs.push_back(CPUName);
   }
 
   // The required algorithm here is slightly strange: the options are applied
   // in order (so -mno-sse -msse2 disables SSE3), but any option that gets
   // directly overridden later is ignored (so "-mno-sse -msse2 -mno-sse2 -msse"
   // is equivalent to "-mno-sse2 -msse"). The -cc1 handling deals with the
   // former correctly, but not the latter; handle directly-overridden
   // attributes here.
   llvm::StringMap<unsigned> PrevFeature;
   std::vector<const char*> Features;
   for (arg_iterator it = Args.filtered_begin(options::OPT_m_x86_Features_Group),
          ie = Args.filtered_end(); it != ie; ++it) {
     StringRef Name = (*it)->getOption().getName();
     (*it)->claim();
 
     // Skip over "-m".
     assert(Name.startswith("m") && "Invalid feature name.");
     Name = Name.substr(1);
 
     bool IsNegative = Name.startswith("no-");
     if (IsNegative)
       Name = Name.substr(3);
 
     // PrevFeature stores (index into Features) + 1 for the most recent entry
     // of each feature name; 0 means the feature has not been seen yet. An
     // earlier entry for the same feature is nulled out so only the latest
     // one is emitted below.
     unsigned& Prev = PrevFeature[Name];
     if (Prev)
       Features[Prev - 1] = 0;
     Prev = Features.size() + 1;
     Features.push_back(Args.MakeArgString((IsNegative ? "-" : "+") + Name));
   }
   // Emit the surviving entries in their original command-line order.
   for (unsigned i = 0; i < Features.size(); i++) {
     if (Features[i]) {
       CmdArgs.push_back("-target-feature");
       CmdArgs.push_back(Features[i]);
     }
   }
 }
 
 // Return the last argument that selects a Hexagon architecture, or NULL if
 // there is none. An argument qualifies if it is -march=, -mcpu=, or an
 // OPT_m_Joined argument whose value starts with "v". Every qualifying
 // argument is claimed, not just the last one.
 static Arg* getLastHexagonArchArg (const ArgList &Args)
 {
   Arg *Last = NULL;
 
   for (ArgList::const_iterator it = Args.begin(), ie = Args.end();
        it != ie; ++it) {
     Arg *Cur = *it;
 
     bool Selects = Cur->getOption().matches(options::OPT_march_EQ) ||
                    Cur->getOption().matches(options::OPT_mcpu_EQ);
     if (!Selects && Cur->getOption().matches(options::OPT_m_Joined)) {
       StringRef Value = Cur->getValue(0);
       Selects = Value.startswith("v");
     }
     if (!Selects)
       continue;
 
     Cur->claim();
     Last = Cur;
   }
   return Last;
 }
 
 // Return the Hexagon CPU version string requested on the command line.
 // Falls back to the default "v4" when no architecture argument is present
 // or when its value is empty.
 static StringRef getHexagonTargetCPU(const ArgList &Args)
 {
   Arg *ArchArg = getLastHexagonArchArg (Args);
   if (!ArchArg)
     return "v4";
 
   llvm::StringRef Version = ArchArg->getValue();
   if (Version == "")
     return "v4";
   return Version;
 }
 
 /// Append the Hexagon-specific arguments to CmdArgs: the target CPU
 /// ("hexagon" followed by the selected version), -fno-signed-char,
 /// -nobuiltininc, the optional qdsp6 compatibility flag, the small-data
 /// threshold, short enums (unless -fno-short-enums is given), and a few
 /// backend options passed through -mllvm.
 void Clang::AddHexagonTargetArgs(const ArgList &Args,
                                  ArgStringList &CmdArgs) const {
   CmdArgs.push_back("-target-cpu");
   CmdArgs.push_back(Args.MakeArgString("hexagon" + getHexagonTargetCPU(Args)));
   CmdArgs.push_back("-fno-signed-char");
   CmdArgs.push_back("-nobuiltininc");
 
   if (Args.hasArg(options::OPT_mqdsp6_compat))
     CmdArgs.push_back("-mqdsp6-compat");
 
   // The last of -G<n> and -msmall-data-threshold=<n> sets the threshold.
   if (Arg *A = Args.getLastArg(options::OPT_G,
                                options::OPT_msmall_data_threshold_EQ)) {
     std::string SmallDataThreshold="-small-data-threshold=";
     SmallDataThreshold += A->getValue();
     CmdArgs.push_back ("-mllvm");
     CmdArgs.push_back(Args.MakeArgString(SmallDataThreshold));
     A->claim();
   }
 
   if (!Args.hasArg(options::OPT_fno_short_enums))
     CmdArgs.push_back("-fshort-enums");
   if (Args.getLastArg(options::OPT_mieee_rnd_near)) {
     CmdArgs.push_back ("-mllvm");
     CmdArgs.push_back ("-enable-hexagon-ieee-rnd-near");
   }
   // Always passed, independent of any user option.
   CmdArgs.push_back ("-mllvm");
   CmdArgs.push_back ("-machine-sink-split=0");
 }
 
 // Decide whether Objective-C exceptions use the zero-cost exception
 // tables. They do when the runtime is non-fragile; otherwise only on
 // Darwin, for x86_64 and ARM, when the Mac OS X version is not below 10.5.
 // NOTE(review): the previous comment said "Snow Leopard and later", but the
 // version check is against 10.5 - confirm which one is intended.
 static bool
 shouldUseExceptionTablesForObjCExceptions(const ObjCRuntime &runtime,
                                           const llvm::Triple &Triple) {
   if (runtime.isNonFragile())
     return true;
 
   // The fragile-runtime case applies to Darwin only.
   if (!Triple.isOSDarwin())
     return false;
 
   if (Triple.isMacOSXVersionLT(10,5))
     return false;
 
   const llvm::Triple::ArchType Arch = Triple.getArch();
   return Arch == llvm::Triple::x86_64 || Arch == llvm::Triple::arm;
 }
 
 /// addExceptionArgs - Adds exception related arguments to the driver command
 /// arguments. There's a master flag, -fexceptions and also language specific
 /// flags to enable/disable C++ and Objective-C exceptions.
 /// This makes it possible to for example disable C++ exceptions but enable
 /// Objective-C exceptions.
 static void addExceptionArgs(const ArgList &Args, types::ID InputType,
                              const llvm::Triple &Triple,
                              bool KernelOrKext,
                              const ObjCRuntime &objcRuntime,
                              ArgStringList &CmdArgs) {
   if (KernelOrKext) {
     // -mkernel and -fapple-kext imply no exceptions, so claim exception related
     // arguments now to avoid warnings about unused arguments.
     Args.ClaimAllArgs(options::OPT_fexceptions);
     Args.ClaimAllArgs(options::OPT_fno_exceptions);
     Args.ClaimAllArgs(options::OPT_fobjc_exceptions);
     Args.ClaimAllArgs(options::OPT_fno_objc_exceptions);
     Args.ClaimAllArgs(options::OPT_fcxx_exceptions);
     Args.ClaimAllArgs(options::OPT_fno_cxx_exceptions);
     return;
   }
 
   // Exceptions are enabled by default.
   bool ExceptionsEnabled = true;
 
   // This keeps track of whether exceptions were explicitly turned on or off.
   bool DidHaveExplicitExceptionFlag = false;
 
   if (Arg *A = Args.getLastArg(options::OPT_fexceptions,
                                options::OPT_fno_exceptions)) {
     if (A->getOption().matches(options::OPT_fexceptions))
       ExceptionsEnabled = true;
     else
       ExceptionsEnabled = false;
 
     DidHaveExplicitExceptionFlag = true;
   }
 
   bool ShouldUseExceptionTables = false;
 
   // Exception tables and cleanups can be enabled with -fexceptions even if the
   // language itself doesn't support exceptions.
   if (ExceptionsEnabled && DidHaveExplicitExceptionFlag)
     ShouldUseExceptionTables = true;
 
   // Obj-C exceptions are enabled by default, regardless of -fexceptions. This
   // is not necessarily sensible, but follows GCC.
   if (types::isObjC(InputType) &&
       Args.hasFlag(options::OPT_fobjc_exceptions,
                    options::OPT_fno_objc_exceptions,
                    true)) {
     CmdArgs.push_back("-fobjc-exceptions");
 
     ShouldUseExceptionTables |=
       shouldUseExceptionTablesForObjCExceptions(objcRuntime, Triple);
   }
 
   if (types::isCXX(InputType)) {
     bool CXXExceptionsEnabled = ExceptionsEnabled;
 
     if (Arg *A = Args.getLastArg(options::OPT_fcxx_exceptions,
                                  options::OPT_fno_cxx_exceptions,
                                  options::OPT_fexceptions,
                                  options::OPT_fno_exceptions)) {
       if (A->getOption().matches(options::OPT_fcxx_exceptions))
         CXXExceptionsEnabled = true;
       else if (A->getOption().matches(options::OPT_fno_cxx_exceptions))
         CXXExceptionsEnabled = false;
     }
 
     if (CXXExceptionsEnabled) {
       CmdArgs.push_back("-fcxx-exceptions");
 
       ShouldUseExceptionTables = true;
     }
   }
 
   if (ShouldUseExceptionTables)
     CmdArgs.push_back("-fexceptions");
 }
 
 // Return true when CFI directives should not be emitted in assembly.
 // -fdwarf2-cfi-asm / -fno-dwarf2-cfi-asm decide explicitly; without them
 // CFI is on, except on Darwin where it follows whether the integrated
 // assembler is in use.
 static bool ShouldDisableCFI(const ArgList &Args,
                              const ToolChain &TC) {
   // The native darwin assembler doesn't support cfi directives, so
   // we disable them if we think the .s file will be passed to it.
   const bool EmitCFIByDefault =
     !TC.getTriple().isOSDarwin() ||
     Args.hasFlag(options::OPT_integrated_as,
                  options::OPT_no_integrated_as,
                  TC.IsIntegratedAssemblerDefault());
 
   const bool EmitCFI = Args.hasFlag(options::OPT_fdwarf2_cfi_asm,
                                     options::OPT_fno_dwarf2_cfi_asm,
                                     EmitCFIByDefault);
   return !EmitCFI;
 }
 
 // Return true when the DWARF directory should not be emitted in assembly.
 // -fdwarf-directory-asm / -fno-dwarf-directory-asm decide explicitly;
 // otherwise the setting follows whether the integrated assembler is used.
 static bool ShouldDisableDwarfDirectory(const ArgList &Args,
                                         const ToolChain &TC) {
   const bool IntegratedAs =
     Args.hasFlag(options::OPT_integrated_as,
                  options::OPT_no_integrated_as,
                  TC.IsIntegratedAssemblerDefault());
 
   return !Args.hasFlag(options::OPT_fdwarf_directory_asm,
                        options::OPT_fno_dwarf_directory_asm,
                        IntegratedAs);
 }
 
 /// \brief Check whether the given input tree contains any compilation actions.
 static bool ContainsCompileAction(const Action *A) {
   if (isa<CompileJobAction>(A))
     return true;
 
   for (Action::const_iterator it = A->begin(), ie = A->end(); it != ie; ++it)
     if (ContainsCompileAction(*it))
       return true;
 
   return false;
 }
 
 /// \brief Check if -relax-all should be passed to the internal assembler.
 /// -mrelax-all / -mno-relax-all decide explicitly. Otherwise it is enabled
 /// when no optimization level other than -O0 was requested and at least one
 /// of the compilation's actions contains a compile step (i.e. the source is
 /// not plain assembler).
 static bool UseRelaxAll(Compilation &C, const ArgList &Args) {
   Arg *OptLevel = Args.getLastArg(options::OPT_O_Group);
   const bool Unoptimized =
     !OptLevel || OptLevel->getOption().matches(options::OPT_O0);
 
   bool RelaxDefault = false;
   if (Unoptimized) {
     const ActionList &Actions = C.getActions();
     for (ActionList::const_iterator I = Actions.begin(), E = Actions.end();
          I != E && !RelaxDefault; ++I)
       RelaxDefault = ContainsCompileAction(*I);
   }
 
   return Args.hasFlag(options::OPT_mrelax_all, options::OPT_mno_relax_all,
                       RelaxDefault);
 }
 
 /// Build the set of enabled sanitizers from the command line.
 ///
 /// Arguments are processed in command-line order: each recognized option
 /// first adds its sanitizer bits to Kind and then removes the bits it
 /// disables, so later options override earlier ones. Every recognized
 /// argument is claimed. Deprecated spellings (-faddress-sanitizer and
 /// friends) additionally produce a warning naming the replacement.
 /// If more than one of the asan/tsan/ubsan runtimes ends up being needed,
 /// an error naming two of the conflicting arguments is emitted.
 SanitizerArgs::SanitizerArgs(const Driver &D, const ArgList &Args) {
   Kind = 0;
 
   // Last argument that requested each runtime, used for the conflict
   // diagnostic below. Initialized to null so they are never read
   // uninitialized, even if the bookkeeping invariants change.
   const Arg *AsanArg = 0, *TsanArg = 0, *UbsanArg = 0;
   for (ArgList::const_iterator I = Args.begin(), E = Args.end(); I != E; ++I) {
     unsigned Add = 0, Remove = 0;
     const char *DeprecatedReplacement = 0;
     if ((*I)->getOption().matches(options::OPT_faddress_sanitizer)) {
       Add = Address;
       DeprecatedReplacement = "-fsanitize=address";
     } else if ((*I)->getOption().matches(options::OPT_fno_address_sanitizer)) {
       Remove = Address;
       DeprecatedReplacement = "-fno-sanitize=address";
     } else if ((*I)->getOption().matches(options::OPT_fthread_sanitizer)) {
       Add = Thread;
       DeprecatedReplacement = "-fsanitize=thread";
     } else if ((*I)->getOption().matches(options::OPT_fno_thread_sanitizer)) {
       Remove = Thread;
       DeprecatedReplacement = "-fno-sanitize=thread";
     } else if ((*I)->getOption().matches(options::OPT_fcatch_undefined_behavior)) {
       Add = Undefined;
       DeprecatedReplacement = "-fsanitize=undefined";
     } else if ((*I)->getOption().matches(options::OPT_fsanitize_EQ)) {
       Add = parse(D, *I);
     } else if ((*I)->getOption().matches(options::OPT_fno_sanitize_EQ)) {
       Remove = parse(D, *I);
     } else {
       // Not a sanitizer option.
       continue;
     }
 
     (*I)->claim();
 
     Kind |= Add;
     Kind &= ~Remove;
 
     if (Add & NeedsAsanRt) AsanArg = *I;
     if (Add & NeedsTsanRt) TsanArg = *I;
     if (Add & NeedsUbsanRt) UbsanArg = *I;
 
     // If this is a deprecated synonym, produce a warning directing users
     // towards the new spelling.
     if (DeprecatedReplacement)
       D.Diag(diag::warn_drv_deprecated_arg)
         << (*I)->getAsString(Args) << DeprecatedReplacement;
   }
 
   // Only one runtime library can be used at once.
   // FIXME: Allow Ubsan to be combined with the other two.
   bool NeedsAsan = needsAsanRt();
   bool NeedsTsan = needsTsanRt();
   bool NeedsUbsan = needsUbsanRt();
   if (NeedsAsan + NeedsTsan + NeedsUbsan > 1)
     // Report the first conflicting pair: asan (else tsan) against ubsan
     // (else tsan).
     D.Diag(diag::err_drv_argument_not_allowed_with)
       << describeSanitizeArg(Args, NeedsAsan ? AsanArg : TsanArg,
                              NeedsAsan ? NeedsAsanRt : NeedsTsanRt)
       << describeSanitizeArg(Args, NeedsUbsan ? UbsanArg : TsanArg,
                              NeedsUbsan ? NeedsUbsanRt : NeedsTsanRt);
 }
 
 /// If AddressSanitizer is enabled, add appropriate linker flags (Linux).
 /// This needs to be called before we add the C run-time (malloc, etc).
 static void addAsanRTLinux(const ToolChain &TC, const ArgList &Args,
                            ArgStringList &CmdArgs) {
   if(TC.getTriple().getEnvironment() == llvm::Triple::Android) {
     if (!Args.hasArg(options::OPT_shared)) {
       if (!Args.hasArg(options::OPT_pie))
         TC.getDriver().Diag(diag::err_drv_asan_android_requires_pie);
     }
 
     SmallString<128> LibAsan(TC.getDriver().ResourceDir);
     llvm::sys::path::append(LibAsan, "lib", "linux",
         (Twine("libclang_rt.asan-") +
             TC.getArchName() + "-android.so"));
     CmdArgs.push_back(Args.MakeArgString(LibAsan));
   } else {
     if (!Args.hasArg(options::OPT_shared)) {
       // LibAsan is "libclang_rt.asan-<ArchName>.a" in the Linux library
       // resource directory.
       SmallString<128> LibAsan(TC.getDriver().ResourceDir);
       llvm::sys::path::append(LibAsan, "lib", "linux",
                               (Twine("libclang_rt.asan-") +
                                TC.getArchName() + ".a"));
       CmdArgs.push_back(Args.MakeArgString(LibAsan));
       CmdArgs.push_back("-lpthread");
       CmdArgs.push_back("-ldl");
       CmdArgs.push_back("-export-dynamic");
     }
   }
 }
 
 /// If ThreadSanitizer is enabled, add appropriate linker flags (Linux).
 /// This needs to be called before we add the C run-time (malloc, etc).
 static void addTsanRTLinux(const ToolChain &TC, const ArgList &Args,
                            ArgStringList &CmdArgs) {
   if (!Args.hasArg(options::OPT_shared)) {
     // LibTsan is "libclang_rt.tsan-<ArchName>.a" in the Linux library
     // resource directory.
     SmallString<128> LibTsan(TC.getDriver().ResourceDir);
     llvm::sys::path::append(LibTsan, "lib", "linux",
                             (Twine("libclang_rt.tsan-") +
                              TC.getArchName() + ".a"));
     CmdArgs.push_back(Args.MakeArgString(LibTsan));
     CmdArgs.push_back("-lpthread");
     CmdArgs.push_back("-ldl");
     CmdArgs.push_back("-export-dynamic");
   }
 }
 
 /// If UndefinedBehaviorSanitizer is enabled, add appropriate linker flags
 /// (Linux).
 static void addUbsanRTLinux(const ToolChain &TC, const ArgList &Args,
                             ArgStringList &CmdArgs) {
   if (!Args.hasArg(options::OPT_shared)) {
     // LibUbsan is "libclang_rt.ubsan-<ArchName>.a" in the Linux library
     // resource directory.
     SmallString<128> LibUbsan(TC.getDriver().ResourceDir);
     llvm::sys::path::append(LibUbsan, "lib", "linux",
                             (Twine("libclang_rt.ubsan-") +
                              TC.getArchName() + ".a"));
     CmdArgs.push_back(Args.MakeArgString(LibUbsan));
     CmdArgs.push_back("-lpthread");
   }
 }
 
 // Decide whether a frame pointer should be used. An explicit
 // -fno-omit-frame-pointer / -fomit-frame-pointer wins (last one counts).
 // Otherwise the frame pointer is kept, except on Linux x86 and x86_64 when
 // an optimization level other than -O0 was requested.
 static bool shouldUseFramePointer(const ArgList &Args,
                                   const llvm::Triple &Triple) {
   if (Arg *Explicit = Args.getLastArg(options::OPT_fno_omit_frame_pointer,
                                       options::OPT_fomit_frame_pointer))
     return Explicit->getOption().matches(options::OPT_fno_omit_frame_pointer);
 
   const bool IsX86 = Triple.getArch() == llvm::Triple::x86_64 ||
                      Triple.getArch() == llvm::Triple::x86;
   if (!IsX86 || Triple.getOS() != llvm::Triple::Linux)
     return true;
 
   // Don't use a frame pointer on linux x86 and x86_64 if optimizing.
   Arg *OptLevel = Args.getLastArg(options::OPT_O_Group);
   return !OptLevel || OptLevel->getOption().matches(options::OPT_O0);
 }
 
 void Clang::ConstructJob(Compilation &C, const JobAction &JA,
                          const InputInfo &Output,
                          const InputInfoList &Inputs,
                          const ArgList &Args,
                          const char *LinkingOutput) const {
   bool KernelOrKext = Args.hasArg(options::OPT_mkernel,
                                   options::OPT_fapple_kext);
   const Driver &D = getToolChain().getDriver();
   ArgStringList CmdArgs;
 
   assert(Inputs.size() == 1 && "Unable to handle multiple inputs.");
 
   // Invoke ourselves in -cc1 mode.
   //
   // FIXME: Implement custom jobs for internal actions.
   CmdArgs.push_back("-cc1");
 
   // Add the "effective" target triple.
   CmdArgs.push_back("-triple");
   std::string TripleStr = getToolChain().ComputeEffectiveClangTriple(Args);
   CmdArgs.push_back(Args.MakeArgString(TripleStr));
 
   // Select the appropriate action.
   RewriteKind rewriteKind = RK_None;
   
   if (isa<AnalyzeJobAction>(JA)) {
     assert(JA.getType() == types::TY_Plist && "Invalid output type.");
     CmdArgs.push_back("-analyze");
   } else if (isa<MigrateJobAction>(JA)) {
     CmdArgs.push_back("-migrate");
   } else if (isa<PreprocessJobAction>(JA)) {
     if (Output.getType() == types::TY_Dependencies)
       CmdArgs.push_back("-Eonly");
     else
       CmdArgs.push_back("-E");
   } else if (isa<AssembleJobAction>(JA)) {
     CmdArgs.push_back("-emit-obj");
 
     if (UseRelaxAll(C, Args))
       CmdArgs.push_back("-mrelax-all");
 
     // When using an integrated assembler, translate -Wa, and -Xassembler
     // options.
     for (arg_iterator it = Args.filtered_begin(options::OPT_Wa_COMMA,
                                                options::OPT_Xassembler),
            ie = Args.filtered_end(); it != ie; ++it) {
       const Arg *A = *it;
       A->claim();
 
       for (unsigned i = 0, e = A->getNumValues(); i != e; ++i) {
         StringRef Value = A->getValue(i);
 
         if (Value == "-force_cpusubtype_ALL") {
           // Do nothing, this is the default and we don't support anything else.
         } else if (Value == "-L") {
           CmdArgs.push_back("-msave-temp-labels");
         } else if (Value == "--fatal-warnings") {
           CmdArgs.push_back("-mllvm");
           CmdArgs.push_back("-fatal-assembler-warnings");
         } else if (Value == "--noexecstack") {
           CmdArgs.push_back("-mnoexecstack");
         } else {
           D.Diag(diag::err_drv_unsupported_option_argument)
             << A->getOption().getName() << Value;
         }
       }
     }
 
     // Also ignore explicit -force_cpusubtype_ALL option.
     (void) Args.hasArg(options::OPT_force__cpusubtype__ALL);
   } else if (isa<PrecompileJobAction>(JA)) {
     // Use PCH if the user requested it.
     bool UsePCH = D.CCCUsePCH;
 
     if (JA.getType() == types::TY_Nothing)
       CmdArgs.push_back("-fsyntax-only");
     else if (UsePCH)
       CmdArgs.push_back("-emit-pch");
     else
       CmdArgs.push_back("-emit-pth");
   } else {
     assert(isa<CompileJobAction>(JA) && "Invalid action for clang tool.");
 
     if (JA.getType() == types::TY_Nothing) {
       CmdArgs.push_back("-fsyntax-only");
     } else if (JA.getType() == types::TY_LLVM_IR ||
                JA.getType() == types::TY_LTO_IR) {
       CmdArgs.push_back("-emit-llvm");
     } else if (JA.getType() == types::TY_LLVM_BC ||
                JA.getType() == types::TY_LTO_BC) {
       CmdArgs.push_back("-emit-llvm-bc");
     } else if (JA.getType() == types::TY_PP_Asm) {
       CmdArgs.push_back("-S");
     } else if (JA.getType() == types::TY_AST) {
       CmdArgs.push_back("-emit-pch");
     } else if (JA.getType() == types::TY_RewrittenObjC) {
       CmdArgs.push_back("-rewrite-objc");
       rewriteKind = RK_NonFragile;
     } else if (JA.getType() == types::TY_RewrittenLegacyObjC) {
       CmdArgs.push_back("-rewrite-objc");
       rewriteKind = RK_Fragile;
     } else {
       assert(JA.getType() == types::TY_PP_Asm &&
              "Unexpected output type!");
     }
   }
 
   // The make clang go fast button.
   CmdArgs.push_back("-disable-free");
 
   // Disable the verification pass in -asserts builds.
 #ifdef NDEBUG
   CmdArgs.push_back("-disable-llvm-verifier");
 #endif
 
   // Set the main file name, so that debug info works even with
   // -save-temps.
   CmdArgs.push_back("-main-file-name");
   CmdArgs.push_back(darwin::CC1::getBaseInputName(Args, Inputs));
 
   // Some flags which affect the language (via preprocessor
   // defines). See darwin::CC1::AddCPPArgs.
   if (Args.hasArg(options::OPT_static))
     CmdArgs.push_back("-static-define");
 
   if (isa<AnalyzeJobAction>(JA)) {
     // Enable region store model by default.
     CmdArgs.push_back("-analyzer-store=region");
 
     // Treat blocks as analysis entry points.
     CmdArgs.push_back("-analyzer-opt-analyze-nested-blocks");
 
     CmdArgs.push_back("-analyzer-eagerly-assume");
 
     // Add default argument set.
     if (!Args.hasArg(options::OPT__analyzer_no_default_checks)) {
       CmdArgs.push_back("-analyzer-checker=core");
 
       if (getToolChain().getTriple().getOS() != llvm::Triple::Win32)
         CmdArgs.push_back("-analyzer-checker=unix");
 
       if (getToolChain().getTriple().getVendor() == llvm::Triple::Apple)
         CmdArgs.push_back("-analyzer-checker=osx");
       
       CmdArgs.push_back("-analyzer-checker=deadcode");
       
       // Enable the following experimental checkers for testing. 
       CmdArgs.push_back("-analyzer-checker=security.insecureAPI.UncheckedReturn");
       CmdArgs.push_back("-analyzer-checker=security.insecureAPI.getpw");
       CmdArgs.push_back("-analyzer-checker=security.insecureAPI.gets");
       CmdArgs.push_back("-analyzer-checker=security.insecureAPI.mktemp");      
       CmdArgs.push_back("-analyzer-checker=security.insecureAPI.mkstemp");
       CmdArgs.push_back("-analyzer-checker=security.insecureAPI.vfork");
     }
 
     // Set the output format. The default is plist, for (lame) historical
     // reasons.
     CmdArgs.push_back("-analyzer-output");
     if (Arg *A = Args.getLastArg(options::OPT__analyzer_output))
       CmdArgs.push_back(A->getValue());
     else
       CmdArgs.push_back("plist");
 
     // Disable the presentation of standard compiler warnings when
     // using --analyze.  We only want to show static analyzer diagnostics
     // or frontend errors.
     CmdArgs.push_back("-w");
 
     // Add -Xanalyzer arguments when running as analyzer.
     Args.AddAllArgValues(CmdArgs, options::OPT_Xanalyzer);
   }
 
   CheckCodeGenerationOptions(D, Args);
 
   // For the PIC and PIE flag options, this logic is different from the legacy
   // logic in very old versions of GCC, as that logic was just a bug no one had
   // ever fixed. This logic is both more rational and consistent with GCC's new
   // logic now that the bugs are fixed. The last argument relating to either
   // PIC or PIE wins, and no other argument is used. If the last argument is
   // any flavor of the '-fno-...' arguments, both PIC and PIE are disabled. Any
   // PIE option implicitly enables PIC at the same level.
   bool PIE = false;
   bool PIC = getToolChain().isPICDefault();
   bool IsPICLevelTwo = PIC;
   if (Arg *A = Args.getLastArg(options::OPT_fPIC, options::OPT_fno_PIC,
                                options::OPT_fpic, options::OPT_fno_pic,
                                options::OPT_fPIE, options::OPT_fno_PIE,
                                options::OPT_fpie, options::OPT_fno_pie)) {
     Option O = A->getOption();
     if (O.matches(options::OPT_fPIC) || O.matches(options::OPT_fpic) ||
         O.matches(options::OPT_fPIE) || O.matches(options::OPT_fpie)) {
       PIE = O.matches(options::OPT_fPIE) || O.matches(options::OPT_fpie);
       PIC = PIE || O.matches(options::OPT_fPIC) || O.matches(options::OPT_fpic);
       IsPICLevelTwo = O.matches(options::OPT_fPIE) ||
                       O.matches(options::OPT_fPIC);
     } else {
       PIE = PIC = false;
     }
   }
   // Check whether the tool chain trumps the PIC-ness decision. If the PIC-ness
   // is forced, then neither PIC nor PIE flags will have any effect.
   if (getToolChain().isPICDefaultForced()) {
     PIE = false;
     PIC = getToolChain().isPICDefault();
     IsPICLevelTwo = PIC;
   }
 
   // Introduce a Darwin-specific hack. If the default is PIC but the flags
   // specified while enabling PIC enabled level 1 PIC, just force it back to
   // level 2 PIC instead. This matches the behavior of Darwin GCC (based on my
   // informal testing).
   if (PIC && getToolChain().getTriple().isOSDarwin())
     IsPICLevelTwo |= getToolChain().isPICDefault();
 
   // Note that these flags are trump-cards. Regardless of the order w.r.t. the
   // PIC or PIE options above, if these show up, PIC is disabled.
   llvm::Triple Triple(TripleStr);
   if ((Args.hasArg(options::OPT_mkernel) ||
        Args.hasArg(options::OPT_fapple_kext)) &&
       (Triple.getOS() != llvm::Triple::IOS ||
        Triple.isOSVersionLT(6)))
     PIC = PIE = false;
   if (Args.hasArg(options::OPT_static))
     PIC = PIE = false;
 
   if (Arg *A = Args.getLastArg(options::OPT_mdynamic_no_pic)) {
     // This is a very special mode. It trumps the other modes, almost no one
     // uses it, and it isn't even valid on any OS but Darwin.
     if (!getToolChain().getTriple().isOSDarwin())
       D.Diag(diag::err_drv_unsupported_opt_for_target)
         << A->getSpelling() << getToolChain().getTriple().str();
 
     // FIXME: Warn when this flag trumps some other PIC or PIE flag.
 
     CmdArgs.push_back("-mrelocation-model");
     CmdArgs.push_back("dynamic-no-pic");
 
     // Only a forced PIC mode can cause the actual compile to have PIC defines
     // etc., no flags are sufficient. This behavior was selected to closely
     // match that of llvm-gcc and Apple GCC before that.
     if (getToolChain().isPICDefault() && getToolChain().isPICDefaultForced()) {
       CmdArgs.push_back("-pic-level");
       CmdArgs.push_back("2");
     }
   } else {
     // Currently, LLVM only knows about PIC vs. static; the PIE differences are
     // handled in Clang's IRGen by the -pie-level flag.
     CmdArgs.push_back("-mrelocation-model");
     CmdArgs.push_back(PIC ? "pic" : "static");
 
     if (PIC) {
       CmdArgs.push_back("-pic-level");
       CmdArgs.push_back(IsPICLevelTwo ? "2" : "1");
       if (PIE) {
         CmdArgs.push_back("-pie-level");
         CmdArgs.push_back(IsPICLevelTwo ? "2" : "1");
       }
     }
   }
 
   if (!Args.hasFlag(options::OPT_fmerge_all_constants,
                     options::OPT_fno_merge_all_constants))
     CmdArgs.push_back("-fno-merge-all-constants");
 
   // LLVM Code Generator Options.
 
   if (Arg *A = Args.getLastArg(options::OPT_mregparm_EQ)) {
     CmdArgs.push_back("-mregparm");
     CmdArgs.push_back(A->getValue());
   }
 
   if (Args.hasFlag(options::OPT_mrtd, options::OPT_mno_rtd, false))
     CmdArgs.push_back("-mrtd");
 
   if (shouldUseFramePointer(Args, getToolChain().getTriple()))
     CmdArgs.push_back("-mdisable-fp-elim");
   if (!Args.hasFlag(options::OPT_fzero_initialized_in_bss,
                     options::OPT_fno_zero_initialized_in_bss))
     CmdArgs.push_back("-mno-zero-initialized-in-bss");
   if (!Args.hasFlag(options::OPT_fstrict_aliasing,
                     options::OPT_fno_strict_aliasing,
                     getToolChain().IsStrictAliasingDefault()))
     CmdArgs.push_back("-relaxed-aliasing");
   if (Args.hasFlag(options::OPT_fstrict_enums, options::OPT_fno_strict_enums,
                    false))
     CmdArgs.push_back("-fstrict-enums");
   if (!Args.hasFlag(options::OPT_foptimize_sibling_calls,
                     options::OPT_fno_optimize_sibling_calls))
     CmdArgs.push_back("-mdisable-tail-calls");
 
   // Handle various floating point optimization flags, mapping them to the
   // appropriate LLVM code generation flags. The pattern for all of these is to
   // default off the codegen optimizations, and if any flag enables them and no
   // flag disables them after the flag enabling them, enable the codegen
   // optimization. This is complicated by several "umbrella" flags.
   if (Arg *A = Args.getLastArg(options::OPT_ffast_math,
                                options::OPT_fno_fast_math,
                                options::OPT_ffinite_math_only,
                                options::OPT_fno_finite_math_only,
                                options::OPT_fhonor_infinities,
                                options::OPT_fno_honor_infinities))
     if (A->getOption().getID() != options::OPT_fno_fast_math &&
         A->getOption().getID() != options::OPT_fno_finite_math_only &&
         A->getOption().getID() != options::OPT_fhonor_infinities)
       CmdArgs.push_back("-menable-no-infs");
   if (Arg *A = Args.getLastArg(options::OPT_ffast_math,
                                options::OPT_fno_fast_math,
                                options::OPT_ffinite_math_only,
                                options::OPT_fno_finite_math_only,
                                options::OPT_fhonor_nans,
                                options::OPT_fno_honor_nans))
     if (A->getOption().getID() != options::OPT_fno_fast_math &&
         A->getOption().getID() != options::OPT_fno_finite_math_only &&
         A->getOption().getID() != options::OPT_fhonor_nans)
       CmdArgs.push_back("-menable-no-nans");
 
   // -fmath-errno is the default on some platforms, e.g. BSD-derived OSes.
   bool MathErrno = getToolChain().IsMathErrnoDefault();
   if (Arg *A = Args.getLastArg(options::OPT_ffast_math,
                                options::OPT_fno_fast_math,
                                options::OPT_fmath_errno,
                                options::OPT_fno_math_errno))
     MathErrno = A->getOption().getID() == options::OPT_fmath_errno;
   if (MathErrno)
     CmdArgs.push_back("-fmath-errno");
 
   // There are several flags which require disabling very specific
   // optimizations. Any of these being disabled forces us to turn off the
   // entire set of LLVM optimizations, so collect them through all the flag
   // madness.
   bool AssociativeMath = false;
   if (Arg *A = Args.getLastArg(options::OPT_ffast_math,
                                options::OPT_fno_fast_math,
                                options::OPT_funsafe_math_optimizations,
                                options::OPT_fno_unsafe_math_optimizations,
                                options::OPT_fassociative_math,
                                options::OPT_fno_associative_math))
     if (A->getOption().getID() != options::OPT_fno_fast_math &&
         A->getOption().getID() != options::OPT_fno_unsafe_math_optimizations &&
         A->getOption().getID() != options::OPT_fno_associative_math)
       AssociativeMath = true;
   bool ReciprocalMath = false;
   if (Arg *A = Args.getLastArg(options::OPT_ffast_math,
                                options::OPT_fno_fast_math,
                                options::OPT_funsafe_math_optimizations,
                                options::OPT_fno_unsafe_math_optimizations,
                                options::OPT_freciprocal_math,
                                options::OPT_fno_reciprocal_math))
     if (A->getOption().getID() != options::OPT_fno_fast_math &&
         A->getOption().getID() != options::OPT_fno_unsafe_math_optimizations &&
         A->getOption().getID() != options::OPT_fno_reciprocal_math)
       ReciprocalMath = true;
   bool SignedZeros = true;
   if (Arg *A = Args.getLastArg(options::OPT_ffast_math,
                                options::OPT_fno_fast_math,
                                options::OPT_funsafe_math_optimizations,
                                options::OPT_fno_unsafe_math_optimizations,
                                options::OPT_fsigned_zeros,
                                options::OPT_fno_signed_zeros))
     if (A->getOption().getID() != options::OPT_fno_fast_math &&
         A->getOption().getID() != options::OPT_fno_unsafe_math_optimizations &&
         A->getOption().getID() != options::OPT_fsigned_zeros)
       SignedZeros = false;
   bool TrappingMath = true;
   if (Arg *A = Args.getLastArg(options::OPT_ffast_math,
                                options::OPT_fno_fast_math,
                                options::OPT_funsafe_math_optimizations,
                                options::OPT_fno_unsafe_math_optimizations,
                                options::OPT_ftrapping_math,
                                options::OPT_fno_trapping_math))
     if (A->getOption().getID() != options::OPT_fno_fast_math &&
         A->getOption().getID() != options::OPT_fno_unsafe_math_optimizations &&
         A->getOption().getID() != options::OPT_ftrapping_math)
       TrappingMath = false;
   if (!MathErrno && AssociativeMath && ReciprocalMath && !SignedZeros &&
       !TrappingMath)
     CmdArgs.push_back("-menable-unsafe-fp-math");
 
 
   // Validate and pass through the -ffp-contract option.
   if (Arg *A = Args.getLastArg(options::OPT_ffast_math,
                                options::OPT_fno_fast_math,
                                options::OPT_ffp_contract)) {
     if (A->getOption().getID() == options::OPT_ffp_contract) {
       StringRef Val = A->getValue();
       if (Val == "fast" || Val == "on" || Val == "off") {
         CmdArgs.push_back(Args.MakeArgString("-ffp-contract=" + Val));
       } else {
         D.Diag(diag::err_drv_unsupported_option_argument)
           << A->getOption().getName() << Val;
       }
     } else if (A->getOption().getID() == options::OPT_ffast_math) {
       // If fast-math is set then set the fp-contract mode to fast.
       CmdArgs.push_back(Args.MakeArgString("-ffp-contract=fast"));
     }
   }
 
   // We separately look for the '-ffast-math' and '-ffinite-math-only' flags,
   // and if we find them, tell the frontend to provide the appropriate
   // preprocessor macros. This is distinct from enabling any optimizations as
   // these options induce language changes which must survive serialization
   // and deserialization, etc.
   if (Arg *A = Args.getLastArg(options::OPT_ffast_math, options::OPT_fno_fast_math))
     if (A->getOption().matches(options::OPT_ffast_math))
       CmdArgs.push_back("-ffast-math");
   if (Arg *A = Args.getLastArg(options::OPT_ffinite_math_only, options::OPT_fno_fast_math))
     if (A->getOption().matches(options::OPT_ffinite_math_only))
       CmdArgs.push_back("-ffinite-math-only");
 
   // Decide whether to use verbose asm. Verbose assembly is the default on
   // toolchains which have the integrated assembler on by default.
   bool IsVerboseAsmDefault = getToolChain().IsIntegratedAssemblerDefault();
   if (Args.hasFlag(options::OPT_fverbose_asm, options::OPT_fno_verbose_asm,
                    IsVerboseAsmDefault) ||
       Args.hasArg(options::OPT_dA))
     CmdArgs.push_back("-masm-verbose");
 
   if (Args.hasArg(options::OPT_fdebug_pass_structure)) {
     CmdArgs.push_back("-mdebug-pass");
     CmdArgs.push_back("Structure");
   }
   if (Args.hasArg(options::OPT_fdebug_pass_arguments)) {
     CmdArgs.push_back("-mdebug-pass");
     CmdArgs.push_back("Arguments");
   }
 
   // Enable -mconstructor-aliases except on darwin, where we have to
   // work around a linker bug;  see <rdar://problem/7651567>.
   if (!getToolChain().getTriple().isOSDarwin())
     CmdArgs.push_back("-mconstructor-aliases");
 
   // Darwin's kernel doesn't support guard variables; just die if we
   // try to use them.
   if (KernelOrKext && getToolChain().getTriple().isOSDarwin())
     CmdArgs.push_back("-fforbid-guard-variables");
 
   if (Args.hasArg(options::OPT_mms_bitfields)) {
     CmdArgs.push_back("-mms-bitfields");
   }
 
   // This is a coarse approximation of what llvm-gcc actually does, both
   // -fasynchronous-unwind-tables and -fnon-call-exceptions interact in more
   // complicated ways.
   bool AsynchronousUnwindTables =
     Args.hasFlag(options::OPT_fasynchronous_unwind_tables,
                  options::OPT_fno_asynchronous_unwind_tables,
                  getToolChain().IsUnwindTablesDefault() &&
                  !KernelOrKext);
   if (Args.hasFlag(options::OPT_funwind_tables, options::OPT_fno_unwind_tables,
                    AsynchronousUnwindTables))
     CmdArgs.push_back("-munwind-tables");
 
   getToolChain().addClangTargetOptions(CmdArgs);
 
   if (Arg *A = Args.getLastArg(options::OPT_flimited_precision_EQ)) {
     CmdArgs.push_back("-mlimit-float-precision");
     CmdArgs.push_back(A->getValue());
   }
 
   // FIXME: Handle -mtune=.
   (void) Args.hasArg(options::OPT_mtune_EQ);
 
   if (Arg *A = Args.getLastArg(options::OPT_mcmodel_EQ)) {
     CmdArgs.push_back("-mcode-model");
     CmdArgs.push_back(A->getValue());
   }
 
   // Add target specific cpu and features flags.
   switch(getToolChain().getTriple().getArch()) {
   default:
     break;
 
   case llvm::Triple::arm:
   case llvm::Triple::thumb:
     AddARMTargetArgs(Args, CmdArgs, KernelOrKext);
     break;
 
   case llvm::Triple::mips:
   case llvm::Triple::mipsel:
   case llvm::Triple::mips64:
   case llvm::Triple::mips64el:
     AddMIPSTargetArgs(Args, CmdArgs);
     break;
 
   case llvm::Triple::ppc:
   case llvm::Triple::ppc64:
     AddPPCTargetArgs(Args, CmdArgs);
     break;
 
   case llvm::Triple::sparc:
     AddSparcTargetArgs(Args, CmdArgs);
     break;
 
   case llvm::Triple::x86:
   case llvm::Triple::x86_64:
     AddX86TargetArgs(Args, CmdArgs);
     break;
 
   case llvm::Triple::hexagon:
     AddHexagonTargetArgs(Args, CmdArgs);
     break;
   }
 
 
 
   // Pass the linker version in use.
   if (Arg *A = Args.getLastArg(options::OPT_mlinker_version_EQ)) {
     CmdArgs.push_back("-target-linker-version");
     CmdArgs.push_back(A->getValue());
   }
 
   // -mno-omit-leaf-frame-pointer is the default on Darwin.
   if (Args.hasFlag(options::OPT_momit_leaf_frame_pointer,
                    options::OPT_mno_omit_leaf_frame_pointer,
                    !getToolChain().getTriple().isOSDarwin()))
     CmdArgs.push_back("-momit-leaf-frame-pointer");
 
   // Explicitly error on some things we know we don't support and can't just
   // ignore.
   types::ID InputType = Inputs[0].getType();
   if (!Args.hasArg(options::OPT_fallow_unsupported)) {
     Arg *Unsupported;
     if (types::isCXX(InputType) &&
         getToolChain().getTriple().isOSDarwin() &&
         getToolChain().getTriple().getArch() == llvm::Triple::x86) {
       if ((Unsupported = Args.getLastArg(options::OPT_fapple_kext)) ||
           (Unsupported = Args.getLastArg(options::OPT_mkernel)))
         D.Diag(diag::err_drv_clang_unsupported_opt_cxx_darwin_i386)
           << Unsupported->getOption().getName();
     }
   }
 
   Args.AddAllArgs(CmdArgs, options::OPT_v);
   Args.AddLastArg(CmdArgs, options::OPT_H);
   if (D.CCPrintHeaders && !D.CCGenDiagnostics) {
     CmdArgs.push_back("-header-include-file");
     CmdArgs.push_back(D.CCPrintHeadersFilename ?
                       D.CCPrintHeadersFilename : "-");
   }
   Args.AddLastArg(CmdArgs, options::OPT_P);
   Args.AddLastArg(CmdArgs, options::OPT_print_ivar_layout);
 
   if (D.CCLogDiagnostics && !D.CCGenDiagnostics) {
     CmdArgs.push_back("-diagnostic-log-file");
     CmdArgs.push_back(D.CCLogDiagnosticsFilename ?
                       D.CCLogDiagnosticsFilename : "-");
   }
 
   // Use the last option from "-g" group. "-gline-tables-only" is
   // preserved, all other debug options are substituted with "-g".
   Args.ClaimAllArgs(options::OPT_g_Group);
   if (Arg *A = Args.getLastArg(options::OPT_g_Group)) {
     if (A->getOption().matches(options::OPT_gline_tables_only)) {
       CmdArgs.push_back("-gline-tables-only");
     } else if (!A->getOption().matches(options::OPT_g0) &&
                !A->getOption().matches(options::OPT_ggdb0)) {
       CmdArgs.push_back("-g");
     }
   }
 
   // We ignore flags -gstrict-dwarf and -grecord-gcc-switches for now.
   Args.ClaimAllArgs(options::OPT_g_flags_Group);
   if (Args.hasArg(options::OPT_gcolumn_info))
     CmdArgs.push_back("-dwarf-column-info");
 
   Args.AddAllArgs(CmdArgs, options::OPT_ffunction_sections);
   Args.AddAllArgs(CmdArgs, options::OPT_fdata_sections);
 
   Args.AddAllArgs(CmdArgs, options::OPT_finstrument_functions);
 
   if (Args.hasArg(options::OPT_ftest_coverage) ||
       Args.hasArg(options::OPT_coverage))
     CmdArgs.push_back("-femit-coverage-notes");
   if (Args.hasArg(options::OPT_fprofile_arcs) ||
       Args.hasArg(options::OPT_coverage))
     CmdArgs.push_back("-femit-coverage-data");
 
   if (C.getArgs().hasArg(options::OPT_c) ||
       C.getArgs().hasArg(options::OPT_S)) {
     if (Output.isFilename()) {
       CmdArgs.push_back("-coverage-file");
       SmallString<128> absFilename(Output.getFilename());
       llvm::sys::fs::make_absolute(absFilename);
       CmdArgs.push_back(Args.MakeArgString(absFilename));
     }
   }
 
   // Pass options for controlling the default header search paths.
   if (Args.hasArg(options::OPT_nostdinc)) {
     CmdArgs.push_back("-nostdsysteminc");
     CmdArgs.push_back("-nobuiltininc");
   } else {
     if (Args.hasArg(options::OPT_nostdlibinc))
         CmdArgs.push_back("-nostdsysteminc");
     Args.AddLastArg(CmdArgs, options::OPT_nostdincxx);
     Args.AddLastArg(CmdArgs, options::OPT_nobuiltininc);
   }
 
   // Pass the path to compiler resource files.
   CmdArgs.push_back("-resource-dir");
   CmdArgs.push_back(D.ResourceDir.c_str());
 
   Args.AddLastArg(CmdArgs, options::OPT_working_directory);
 
   bool ARCMTEnabled = false;
   if (!Args.hasArg(options::OPT_fno_objc_arc)) {
     if (const Arg *A = Args.getLastArg(options::OPT_ccc_arcmt_check,
                                        options::OPT_ccc_arcmt_modify,
                                        options::OPT_ccc_arcmt_migrate)) {
       ARCMTEnabled = true;
       switch (A->getOption().getID()) {
       default:
         llvm_unreachable("missed a case");
       case options::OPT_ccc_arcmt_check:
         CmdArgs.push_back("-arcmt-check");
         break;
       case options::OPT_ccc_arcmt_modify:
         CmdArgs.push_back("-arcmt-modify");
         break;
       case options::OPT_ccc_arcmt_migrate:
         CmdArgs.push_back("-arcmt-migrate");
         CmdArgs.push_back("-mt-migrate-directory");
         CmdArgs.push_back(A->getValue());
 
         Args.AddLastArg(CmdArgs, options::OPT_arcmt_migrate_report_output);
         Args.AddLastArg(CmdArgs, options::OPT_arcmt_migrate_emit_arc_errors);
         break;
       }
     }
   }
 
   if (const Arg *A = Args.getLastArg(options::OPT_ccc_objcmt_migrate)) {
     if (ARCMTEnabled) {
       D.Diag(diag::err_drv_argument_not_allowed_with)
         << A->getAsString(Args) << "-ccc-arcmt-migrate";
     }
     CmdArgs.push_back("-mt-migrate-directory");
     CmdArgs.push_back(A->getValue());
 
     if (!Args.hasArg(options::OPT_objcmt_migrate_literals,
                      options::OPT_objcmt_migrate_subscripting)) {
       // None specified, means enable them all.
       CmdArgs.push_back("-objcmt-migrate-literals");
       CmdArgs.push_back("-objcmt-migrate-subscripting");
     } else {
       Args.AddLastArg(CmdArgs, options::OPT_objcmt_migrate_literals);
       Args.AddLastArg(CmdArgs, options::OPT_objcmt_migrate_subscripting);
     }
   }
 
   // Add preprocessing options like -I, -D, etc. if we are using the
   // preprocessor.
   //
   // FIXME: Support -fpreprocessed
   if (types::getPreprocessedType(InputType) != types::TY_INVALID)
     AddPreprocessingOptions(C, D, Args, CmdArgs, Output, Inputs);
 
   // Don't warn about "clang -c -DPIC -fPIC test.i" because libtool.m4 assumes
   // that "The compiler can only warn and ignore the option if not recognized".
   // When building with ccache, it will pass -D options to clang even on
   // preprocessed inputs and configure concludes that -fPIC is not supported.
   Args.ClaimAllArgs(options::OPT_D);
 
   // Manually translate -O to -O2 and -O4 to -O3; let clang reject
   // others.
   if (Arg *A = Args.getLastArg(options::OPT_O_Group)) {
     if (A->getOption().matches(options::OPT_O4))
       CmdArgs.push_back("-O3");
     else if (A->getOption().matches(options::OPT_O) &&
              A->getValue()[0] == '\0')
       CmdArgs.push_back("-O2");
     else
       A->render(Args, CmdArgs);
   }
 
   Args.AddAllArgs(CmdArgs, options::OPT_W_Group);
   if (Args.hasFlag(options::OPT_pedantic, options::OPT_no_pedantic, false))
     CmdArgs.push_back("-pedantic");
   Args.AddLastArg(CmdArgs, options::OPT_pedantic_errors);
   Args.AddLastArg(CmdArgs, options::OPT_w);
 
   // Handle -{std, ansi, trigraphs} -- take the last of -{std, ansi}
   // (-ansi is equivalent to -std=c89).
   //
   // If a std is supplied, only add -trigraphs if it follows the
   // option.
   if (Arg *Std = Args.getLastArg(options::OPT_std_EQ, options::OPT_ansi)) {
     if (Std->getOption().matches(options::OPT_ansi))
       if (types::isCXX(InputType))
         CmdArgs.push_back("-std=c++98");
       else
         CmdArgs.push_back("-std=c89");
     else
       Std->render(Args, CmdArgs);
 
     if (Arg *A = Args.getLastArg(options::OPT_std_EQ, options::OPT_ansi,
                                  options::OPT_trigraphs))
       if (A != Std)
         A->render(Args, CmdArgs);
   } else {
     // Honor -std-default.
     //
     // FIXME: Clang doesn't correctly handle -std= when the input language
     // doesn't match. For the time being just ignore this for C++ inputs;
     // eventually we want to do all the standard defaulting here instead of
     // splitting it between the driver and clang -cc1.
     if (!types::isCXX(InputType))
       Args.AddAllArgsTranslated(CmdArgs, options::OPT_std_default_EQ,
                                 "-std=", /*Joined=*/true);
     else if (getToolChain().getTriple().getOS() == llvm::Triple::Win32)
       CmdArgs.push_back("-std=c++11");
 
     Args.AddLastArg(CmdArgs, options::OPT_trigraphs);
   }
 
   // Map the bizarre '-Wwrite-strings' flag to a more sensible
   // '-fconst-strings'; this better indicates its actual behavior.
   if (Args.hasFlag(options::OPT_Wwrite_strings, options::OPT_Wno_write_strings,
                    false)) {
     // For perfect compatibility with GCC, we do this even in the presence of
     // '-w'. This flag names something other than a warning for GCC.
     CmdArgs.push_back("-fconst-strings");
   }
 
   // GCC provides a macro definition '__DEPRECATED' when -Wdeprecated is active
   // during C++ compilation, which it is by default. GCC keeps this define even
   // in the presence of '-w', match this behavior bug-for-bug.
   if (types::isCXX(InputType) &&
       Args.hasFlag(options::OPT_Wdeprecated, options::OPT_Wno_deprecated,
                    true)) {
     CmdArgs.push_back("-fdeprecated-macro");
   }
 
   // Translate GCC's misnomer '-fasm' arguments to '-fgnu-keywords'.
   if (Arg *Asm = Args.getLastArg(options::OPT_fasm, options::OPT_fno_asm)) {
     if (Asm->getOption().matches(options::OPT_fasm))
       CmdArgs.push_back("-fgnu-keywords");
     else
       CmdArgs.push_back("-fno-gnu-keywords");
   }
 
   if (ShouldDisableCFI(Args, getToolChain()))
     CmdArgs.push_back("-fno-dwarf2-cfi-asm");
 
   if (ShouldDisableDwarfDirectory(Args, getToolChain()))
     CmdArgs.push_back("-fno-dwarf-directory-asm");
 
   if (const char *pwd = ::getenv("PWD")) {
     // GCC also verifies that stat(pwd) and stat(".") have the same inode
     // number. Not doing those because stats are slow, but we could.
     if (llvm::sys::path::is_absolute(pwd)) {
       std::string CompDir = pwd;
       CmdArgs.push_back("-fdebug-compilation-dir");
       CmdArgs.push_back(Args.MakeArgString(CompDir));
     }
   }
 
   if (Arg *A = Args.getLastArg(options::OPT_ftemplate_depth_,
                                options::OPT_ftemplate_depth_EQ)) {
     CmdArgs.push_back("-ftemplate-depth");
     CmdArgs.push_back(A->getValue());
   }
 
   if (Arg *A = Args.getLastArg(options::OPT_fconstexpr_depth_EQ)) {
     CmdArgs.push_back("-fconstexpr-depth");
     CmdArgs.push_back(A->getValue());
   }
 
   if (Arg *A = Args.getLastArg(options::OPT_Wlarge_by_value_copy_EQ,
                                options::OPT_Wlarge_by_value_copy_def)) {
     if (A->getNumValues()) {
       StringRef bytes = A->getValue();
       CmdArgs.push_back(Args.MakeArgString("-Wlarge-by-value-copy=" + bytes));
     } else
       CmdArgs.push_back("-Wlarge-by-value-copy=64"); // default value
   }
 
   if (Arg *A = Args.getLastArg(options::OPT_fbounds_checking,
                                options::OPT_fbounds_checking_EQ)) {
     if (A->getNumValues()) {
       StringRef val = A->getValue();
       CmdArgs.push_back(Args.MakeArgString("-fbounds-checking=" + val));
     } else
       CmdArgs.push_back("-fbounds-checking=1");
   }
 
   if (Args.hasArg(options::OPT_relocatable_pch))
     CmdArgs.push_back("-relocatable-pch");
 
   if (Arg *A = Args.getLastArg(options::OPT_fconstant_string_class_EQ)) {
     CmdArgs.push_back("-fconstant-string-class");
     CmdArgs.push_back(A->getValue());
   }
 
   if (Arg *A = Args.getLastArg(options::OPT_ftabstop_EQ)) {
     CmdArgs.push_back("-ftabstop");
     CmdArgs.push_back(A->getValue());
   }
 
   CmdArgs.push_back("-ferror-limit");
   if (Arg *A = Args.getLastArg(options::OPT_ferror_limit_EQ))
     CmdArgs.push_back(A->getValue());
   else
     CmdArgs.push_back("19");
 
   if (Arg *A = Args.getLastArg(options::OPT_fmacro_backtrace_limit_EQ)) {
     CmdArgs.push_back("-fmacro-backtrace-limit");
     CmdArgs.push_back(A->getValue());
   }
 
   if (Arg *A = Args.getLastArg(options::OPT_ftemplate_backtrace_limit_EQ)) {
     CmdArgs.push_back("-ftemplate-backtrace-limit");
     CmdArgs.push_back(A->getValue());
   }
 
   if (Arg *A = Args.getLastArg(options::OPT_fconstexpr_backtrace_limit_EQ)) {
     CmdArgs.push_back("-fconstexpr-backtrace-limit");
     CmdArgs.push_back(A->getValue());
   }
 
   // Pass -fmessage-length=.
   CmdArgs.push_back("-fmessage-length");
   if (Arg *A = Args.getLastArg(options::OPT_fmessage_length_EQ)) {
     CmdArgs.push_back(A->getValue());
   } else {
     // If -fmessage-length=N was not specified, determine whether this is a
     // terminal and, if so, implicitly define -fmessage-length appropriately.
     unsigned N = llvm::sys::Process::StandardErrColumns();
     CmdArgs.push_back(Args.MakeArgString(Twine(N)));
   }
 
   if (const Arg *A = Args.getLastArg(options::OPT_fvisibility_EQ)) {
     CmdArgs.push_back("-fvisibility");
     CmdArgs.push_back(A->getValue());
   }
 
   Args.AddLastArg(CmdArgs, options::OPT_fvisibility_inlines_hidden);
 
   Args.AddLastArg(CmdArgs, options::OPT_ftlsmodel_EQ);
 
   // -fhosted is default.
   if (Args.hasFlag(options::OPT_ffreestanding, options::OPT_fhosted, false) ||
       KernelOrKext)
     CmdArgs.push_back("-ffreestanding");
 
   // Forward -f (flag) options which we can pass directly.
   Args.AddLastArg(CmdArgs, options::OPT_femit_all_decls);
   Args.AddLastArg(CmdArgs, options::OPT_fformat_extensions);
   Args.AddLastArg(CmdArgs, options::OPT_fheinous_gnu_extensions);
   Args.AddLastArg(CmdArgs, options::OPT_flimit_debug_info);
   Args.AddLastArg(CmdArgs, options::OPT_fno_limit_debug_info);
   Args.AddLastArg(CmdArgs, options::OPT_fno_operator_names);
   Args.AddLastArg(CmdArgs, options::OPT_faltivec);
   Args.AddLastArg(CmdArgs, options::OPT_fdiagnostics_show_template_tree);
   Args.AddLastArg(CmdArgs, options::OPT_fno_elide_type);
 
   SanitizerArgs Sanitize(D, Args);
   Sanitize.addArgs(Args, CmdArgs);
 
   // Report an error for -faltivec on anything other than PowerPC.
   if (const Arg *A = Args.getLastArg(options::OPT_faltivec))
     if (!(getToolChain().getTriple().getArch() == llvm::Triple::ppc ||
           getToolChain().getTriple().getArch() == llvm::Triple::ppc64))
       D.Diag(diag::err_drv_argument_only_allowed_with)
         << A->getAsString(Args) << "ppc/ppc64";
 
   if (getToolChain().SupportsProfiling())
     Args.AddLastArg(CmdArgs, options::OPT_pg);
 
   // -flax-vector-conversions is default.
   if (!Args.hasFlag(options::OPT_flax_vector_conversions,
                     options::OPT_fno_lax_vector_conversions))
     CmdArgs.push_back("-fno-lax-vector-conversions");
 
   if (Args.getLastArg(options::OPT_fapple_kext))
     CmdArgs.push_back("-fapple-kext");
 
   if (Args.hasFlag(options::OPT_frewrite_includes,
                    options::OPT_fno_rewrite_includes, false))
     CmdArgs.push_back("-frewrite-includes");
 
   Args.AddLastArg(CmdArgs, options::OPT_fobjc_sender_dependent_dispatch);
   Args.AddLastArg(CmdArgs, options::OPT_fdiagnostics_print_source_range_info);
   Args.AddLastArg(CmdArgs, options::OPT_fdiagnostics_parseable_fixits);
   Args.AddLastArg(CmdArgs, options::OPT_ftime_report);
   Args.AddLastArg(CmdArgs, options::OPT_ftrapv);
 
   if (Arg *A = Args.getLastArg(options::OPT_ftrapv_handler_EQ)) {
     CmdArgs.push_back("-ftrapv-handler");
     CmdArgs.push_back(A->getValue());
   }
 
   Args.AddLastArg(CmdArgs, options::OPT_ftrap_function_EQ);
 
   // -fno-strict-overflow implies -fwrapv if it isn't disabled, but
   // -fstrict-overflow won't turn off an explicitly enabled -fwrapv.
   if (Arg *A = Args.getLastArg(options::OPT_fwrapv,
                                options::OPT_fno_wrapv)) {
     if (A->getOption().matches(options::OPT_fwrapv))
       CmdArgs.push_back("-fwrapv");
   } else if (Arg *A = Args.getLastArg(options::OPT_fstrict_overflow,
                                       options::OPT_fno_strict_overflow)) {
     if (A->getOption().matches(options::OPT_fno_strict_overflow))
       CmdArgs.push_back("-fwrapv");
   }
   Args.AddLastArg(CmdArgs, options::OPT_fwritable_strings);
   Args.AddLastArg(CmdArgs, options::OPT_funroll_loops);
 
   Args.AddLastArg(CmdArgs, options::OPT_pthread);
 
 
   // -stack-protector=0 is default.
   unsigned StackProtectorLevel = 0;
   if (Arg *A = Args.getLastArg(options::OPT_fno_stack_protector,
                                options::OPT_fstack_protector_all,
                                options::OPT_fstack_protector)) {
     if (A->getOption().matches(options::OPT_fstack_protector))
       StackProtectorLevel = 1;
     else if (A->getOption().matches(options::OPT_fstack_protector_all))
       StackProtectorLevel = 2;
   } else {
     StackProtectorLevel =
       getToolChain().GetDefaultStackProtectorLevel(KernelOrKext);
   }
   if (StackProtectorLevel) {
     CmdArgs.push_back("-stack-protector");
     CmdArgs.push_back(Args.MakeArgString(Twine(StackProtectorLevel)));
   }
 
   // --param ssp-buffer-size=
   for (arg_iterator it = Args.filtered_begin(options::OPT__param),
        ie = Args.filtered_end(); it != ie; ++it) {
     StringRef Str((*it)->getValue());
     if (Str.startswith("ssp-buffer-size=")) {
       if (StackProtectorLevel) {
         CmdArgs.push_back("-stack-protector-buffer-size");
         // FIXME: Verify the argument is a valid integer.
         CmdArgs.push_back(Args.MakeArgString(Str.drop_front(16)));
       }
       (*it)->claim();
     }
   }
 
   // Translate -mstackrealign
   if (Args.hasFlag(options::OPT_mstackrealign, options::OPT_mno_stackrealign,
                    false)) {
     CmdArgs.push_back("-backend-option");
     CmdArgs.push_back("-force-align-stack");
   }
   if (!Args.hasFlag(options::OPT_mno_stackrealign, options::OPT_mstackrealign,
                    false)) {
     CmdArgs.push_back(Args.MakeArgString("-mstackrealign"));
   }
 
   if (Args.hasArg(options::OPT_mstack_alignment)) {
     StringRef alignment = Args.getLastArgValue(options::OPT_mstack_alignment);
     CmdArgs.push_back(Args.MakeArgString("-mstack-alignment=" + alignment));
   }
   if (Args.hasArg(options::OPT_mstrict_align)) {
     CmdArgs.push_back("-backend-option");
     CmdArgs.push_back("-arm-strict-align");
   }
 
   // Forward -f options with positive and negative forms; we translate
   // these by hand.
 
   if (Args.hasArg(options::OPT_mkernel)) {
     if (!Args.hasArg(options::OPT_fapple_kext) && types::isCXX(InputType))
       CmdArgs.push_back("-fapple-kext");
     if (!Args.hasArg(options::OPT_fbuiltin))
       CmdArgs.push_back("-fno-builtin");
     Args.ClaimAllArgs(options::OPT_fno_builtin);
   }
   // -fbuiltin is default.
   else if (!Args.hasFlag(options::OPT_fbuiltin, options::OPT_fno_builtin))
     CmdArgs.push_back("-fno-builtin");
 
   if (!Args.hasFlag(options::OPT_fassume_sane_operator_new,
                     options::OPT_fno_assume_sane_operator_new))
     CmdArgs.push_back("-fno-assume-sane-operator-new");
 
   // -fblocks=0 is default.
   if (Args.hasFlag(options::OPT_fblocks, options::OPT_fno_blocks,
                    getToolChain().IsBlocksDefault()) ||
         (Args.hasArg(options::OPT_fgnu_runtime) &&
          Args.hasArg(options::OPT_fobjc_nonfragile_abi) &&
          !Args.hasArg(options::OPT_fno_blocks))) {
     CmdArgs.push_back("-fblocks");
 
     if (!Args.hasArg(options::OPT_fgnu_runtime) && 
         !getToolChain().hasBlocksRuntime())
       CmdArgs.push_back("-fblocks-runtime-optional");
   }
 
   // -fmodules enables modules (off by default). However, for C++/Objective-C++,
   // users must also pass -fcxx-modules. The latter flag will disappear once the
   // modules implementation is solid for C++/Objective-C++ programs as well.
   if (Args.hasFlag(options::OPT_fmodules, options::OPT_fno_modules, false)) {
     bool AllowedInCXX = Args.hasFlag(options::OPT_fcxx_modules, 
                                      options::OPT_fno_cxx_modules, 
                                      false);
     if (AllowedInCXX || !types::isCXX(InputType))
       CmdArgs.push_back("-fmodules");
   }
 
   // -faccess-control is default.
   if (Args.hasFlag(options::OPT_fno_access_control,
                    options::OPT_faccess_control,
                    false))
     CmdArgs.push_back("-fno-access-control");
 
   // -felide-constructors is the default.
   if (Args.hasFlag(options::OPT_fno_elide_constructors,
                    options::OPT_felide_constructors,
                    false))
     CmdArgs.push_back("-fno-elide-constructors");
 
   // -frtti is default.
   if (!Args.hasFlag(options::OPT_frtti, options::OPT_fno_rtti) ||
       KernelOrKext) {
     CmdArgs.push_back("-fno-rtti");
 
     // -fno-rtti cannot usefully be combined with -fsanitize=vptr.
     if (Sanitize.sanitizesVptr()) {
       std::string NoRttiArg =
         Args.getLastArg(options::OPT_mkernel,
                         options::OPT_fapple_kext,
                         options::OPT_fno_rtti)->getAsString(Args);
       D.Diag(diag::err_drv_argument_not_allowed_with)
         << "-fsanitize=vptr" << NoRttiArg;
     }
   }
 
   // -fshort-enums=0 is default for all architectures except Hexagon.
   if (Args.hasFlag(options::OPT_fshort_enums,
                    options::OPT_fno_short_enums,
                    getToolChain().getTriple().getArch() ==
                    llvm::Triple::hexagon))
     CmdArgs.push_back("-fshort-enums");
 
   // -fsigned-char is default.
   if (!Args.hasFlag(options::OPT_fsigned_char, options::OPT_funsigned_char,
                     isSignedCharDefault(getToolChain().getTriple())))
     CmdArgs.push_back("-fno-signed-char");
 
   // -fthreadsafe-statics is default.
   if (!Args.hasFlag(options::OPT_fthreadsafe_statics,
                     options::OPT_fno_threadsafe_statics))
     CmdArgs.push_back("-fno-threadsafe-statics");
 
   // -fuse-cxa-atexit is default.
   if (!Args.hasFlag(options::OPT_fuse_cxa_atexit,
                     options::OPT_fno_use_cxa_atexit,
                    getToolChain().getTriple().getOS() != llvm::Triple::Cygwin &&
                   getToolChain().getTriple().getOS() != llvm::Triple::MinGW32 &&
               getToolChain().getTriple().getArch() != llvm::Triple::hexagon) ||
       KernelOrKext)
     CmdArgs.push_back("-fno-use-cxa-atexit");
 
   // -fms-extensions=0 is default.
   if (Args.hasFlag(options::OPT_fms_extensions, options::OPT_fno_ms_extensions,
                    getToolChain().getTriple().getOS() == llvm::Triple::Win32))
     CmdArgs.push_back("-fms-extensions");
 
   // -fms-inline-asm.
   if (Args.hasArg(options::OPT_fenable_experimental_ms_inline_asm))
     CmdArgs.push_back("-fenable-experimental-ms-inline-asm");
 
   // -fms-compatibility=0 is default.
   if (Args.hasFlag(options::OPT_fms_compatibility, 
                    options::OPT_fno_ms_compatibility,
                    (getToolChain().getTriple().getOS() == llvm::Triple::Win32 &&
                     Args.hasFlag(options::OPT_fms_extensions, 
                                  options::OPT_fno_ms_extensions,
                                  true))))
     CmdArgs.push_back("-fms-compatibility");
 
   // -fmsc-version=1300 is default.
   if (Args.hasFlag(options::OPT_fms_extensions, options::OPT_fno_ms_extensions,
                    getToolChain().getTriple().getOS() == llvm::Triple::Win32) ||
       Args.hasArg(options::OPT_fmsc_version)) {
     StringRef msc_ver = Args.getLastArgValue(options::OPT_fmsc_version);
     if (msc_ver.empty())
       CmdArgs.push_back("-fmsc-version=1300");
     else
       CmdArgs.push_back(Args.MakeArgString("-fmsc-version=" + msc_ver));
   }
 
 
   // -fborland-extensions=0 is default.
   if (Args.hasFlag(options::OPT_fborland_extensions,
                    options::OPT_fno_borland_extensions, false))
     CmdArgs.push_back("-fborland-extensions");
 
   // -fno-delayed-template-parsing is default, except for Windows where MSVC STL
   // needs it.
   if (Args.hasFlag(options::OPT_fdelayed_template_parsing,
                    options::OPT_fno_delayed_template_parsing,
                    getToolChain().getTriple().getOS() == llvm::Triple::Win32))
     CmdArgs.push_back("-fdelayed-template-parsing");
 
   // -fgnu-keywords default varies depending on language; only pass if
   // specified.
   if (Arg *A = Args.getLastArg(options::OPT_fgnu_keywords,
                                options::OPT_fno_gnu_keywords))
     A->render(Args, CmdArgs);
 
   if (Args.hasFlag(options::OPT_fgnu89_inline,
                    options::OPT_fno_gnu89_inline,
                    false))
     CmdArgs.push_back("-fgnu89-inline");
 
   if (Args.hasArg(options::OPT_fno_inline))
     CmdArgs.push_back("-fno-inline");
 
   if (Args.hasArg(options::OPT_fno_inline_functions))
     CmdArgs.push_back("-fno-inline-functions");
 
   ObjCRuntime objcRuntime = AddObjCRuntimeArgs(Args, CmdArgs, rewriteKind);
 
   // -fobjc-dispatch-method is only relevant with the nonfragile-abi, and
   // legacy is the default.
   if (objcRuntime.isNonFragile()) {
     if (!Args.hasFlag(options::OPT_fobjc_legacy_dispatch,
                       options::OPT_fno_objc_legacy_dispatch,
                       objcRuntime.isLegacyDispatchDefaultForArch(
                         getToolChain().getTriple().getArch()))) {
       if (getToolChain().UseObjCMixedDispatch())
         CmdArgs.push_back("-fobjc-dispatch-method=mixed");
       else
         CmdArgs.push_back("-fobjc-dispatch-method=non-legacy");
     }
   }
 
   // -fobjc-default-synthesize-properties=1 is default. This only has an effect
   // if the nonfragile objc abi is used.
   if (getToolChain().IsObjCDefaultSynthPropertiesDefault()) {
     CmdArgs.push_back("-fobjc-default-synthesize-properties");
   }
 
   // -fencode-extended-block-signature=1 is default.
   if (getToolChain().IsEncodeExtendedBlockSignatureDefault()) {
     CmdArgs.push_back("-fencode-extended-block-signature");
   }
   
   // Allow -fno-objc-arr to trump -fobjc-arr/-fobjc-arc.
   // NOTE: This logic is duplicated in ToolChains.cpp.
   bool ARC = isObjCAutoRefCount(Args);
   if (ARC) {
     getToolChain().CheckObjCARC();
 
     CmdArgs.push_back("-fobjc-arc");
 
     // FIXME: It seems like this entire block, and several around it should be
     // wrapped in isObjC, but for now we just use it here as this is where it
     // was being used previously.
     if (types::isCXX(InputType) && types::isObjC(InputType)) {
       if (getToolChain().GetCXXStdlibType(Args) == ToolChain::CST_Libcxx)
         CmdArgs.push_back("-fobjc-arc-cxxlib=libc++");
       else
         CmdArgs.push_back("-fobjc-arc-cxxlib=libstdc++");
     }
 
     // Allow the user to enable full exceptions code emission.
     // This defaults to off for Objective-C and on for Objective-C++.
     if (Args.hasFlag(options::OPT_fobjc_arc_exceptions,
                      options::OPT_fno_objc_arc_exceptions,
                      /*default*/ types::isCXX(InputType)))
       CmdArgs.push_back("-fobjc-arc-exceptions");
   }
 
   // -fobjc-infer-related-result-type is the default, except in the Objective-C
   // rewriter.
   if (rewriteKind != RK_None)
     CmdArgs.push_back("-fno-objc-infer-related-result-type");
 
   // Handle -fobjc-gc and -fobjc-gc-only. They are exclusive, and -fobjc-gc-only
   // takes precedence.
   const Arg *GCArg = Args.getLastArg(options::OPT_fobjc_gc_only);
   if (!GCArg)
     GCArg = Args.getLastArg(options::OPT_fobjc_gc);
   if (GCArg) {
     if (ARC) {
       D.Diag(diag::err_drv_objc_gc_arr)
         << GCArg->getAsString(Args);
     } else if (getToolChain().SupportsObjCGC()) {
       GCArg->render(Args, CmdArgs);
     } else {
       // FIXME: We should move this to a hard error.
       D.Diag(diag::warn_drv_objc_gc_unsupported)
         << GCArg->getAsString(Args);
     }
   }
 
   // Add exception args.
   addExceptionArgs(Args, InputType, getToolChain().getTriple(),
                    KernelOrKext, objcRuntime, CmdArgs);
 
   if (getToolChain().UseSjLjExceptions())
     CmdArgs.push_back("-fsjlj-exceptions");
 
   // C++ "sane" operator new.
   if (!Args.hasFlag(options::OPT_fassume_sane_operator_new,
                     options::OPT_fno_assume_sane_operator_new))
     CmdArgs.push_back("-fno-assume-sane-operator-new");
 
   // -fconstant-cfstrings is default, and may be subject to argument translation
   // on Darwin.
   if (!Args.hasFlag(options::OPT_fconstant_cfstrings,
                     options::OPT_fno_constant_cfstrings) ||
       !Args.hasFlag(options::OPT_mconstant_cfstrings,
                     options::OPT_mno_constant_cfstrings))
     CmdArgs.push_back("-fno-constant-cfstrings");
 
   // -fshort-wchar default varies depending on platform; only
   // pass if specified.
   if (Arg *A = Args.getLastArg(options::OPT_fshort_wchar))
     A->render(Args, CmdArgs);
 
   // -fno-pascal-strings is default, only pass non-default. If the tool chain
   // happened to translate to -mpascal-strings, we want to back translate here.
   //
   // FIXME: This is gross; that translation should be pulled from the
   // tool chain.
   if (Args.hasFlag(options::OPT_fpascal_strings,
                    options::OPT_fno_pascal_strings,
                    false) ||
       Args.hasFlag(options::OPT_mpascal_strings,
                    options::OPT_mno_pascal_strings,
                    false))
     CmdArgs.push_back("-fpascal-strings");
 
   // Honor -fpack-struct= and -fpack-struct, if given. Note that
   // -fno-pack-struct doesn't apply to -fpack-struct=.
   if (Arg *A = Args.getLastArg(options::OPT_fpack_struct_EQ)) {
     std::string PackStructStr = "-fpack-struct=";
     PackStructStr += A->getValue();
     CmdArgs.push_back(Args.MakeArgString(PackStructStr));
   } else if (Args.hasFlag(options::OPT_fpack_struct,
                           options::OPT_fno_pack_struct, false)) {
     CmdArgs.push_back("-fpack-struct=1");
   }
 
   if (Args.hasArg(options::OPT_mkernel) ||
       Args.hasArg(options::OPT_fapple_kext)) {
     if (!Args.hasArg(options::OPT_fcommon))
       CmdArgs.push_back("-fno-common");
     Args.ClaimAllArgs(options::OPT_fno_common);
   }
 
   // -fcommon is default, only pass non-default.
   else if (!Args.hasFlag(options::OPT_fcommon, options::OPT_fno_common))
     CmdArgs.push_back("-fno-common");
 
   // -fsigned-bitfields is default, and clang doesn't yet support
   // -funsigned-bitfields.
   if (!Args.hasFlag(options::OPT_fsigned_bitfields,
                     options::OPT_funsigned_bitfields))
     D.Diag(diag::warn_drv_clang_unsupported)
       << Args.getLastArg(options::OPT_funsigned_bitfields)->getAsString(Args);
 
   // -ffor-scope is default, and clang doesn't support -fno-for-scope.
   if (!Args.hasFlag(options::OPT_ffor_scope,
                     options::OPT_fno_for_scope))
     D.Diag(diag::err_drv_clang_unsupported)
       << Args.getLastArg(options::OPT_fno_for_scope)->getAsString(Args);
 
   // -fcaret-diagnostics is default.
   if (!Args.hasFlag(options::OPT_fcaret_diagnostics,
                     options::OPT_fno_caret_diagnostics, true))
     CmdArgs.push_back("-fno-caret-diagnostics");
 
   // -fdiagnostics-fixit-info is default, only pass non-default.
   if (!Args.hasFlag(options::OPT_fdiagnostics_fixit_info,
                     options::OPT_fno_diagnostics_fixit_info))
     CmdArgs.push_back("-fno-diagnostics-fixit-info");
 
   // Enable -fdiagnostics-show-option by default.
   if (Args.hasFlag(options::OPT_fdiagnostics_show_option,
                    options::OPT_fno_diagnostics_show_option))
     CmdArgs.push_back("-fdiagnostics-show-option");
 
   if (const Arg *A =
         Args.getLastArg(options::OPT_fdiagnostics_show_category_EQ)) {
     CmdArgs.push_back("-fdiagnostics-show-category");
     CmdArgs.push_back(A->getValue());
   }
 
   if (const Arg *A =
         Args.getLastArg(options::OPT_fdiagnostics_format_EQ)) {
     CmdArgs.push_back("-fdiagnostics-format");
     CmdArgs.push_back(A->getValue());
   }
 
   if (Arg *A = Args.getLastArg(
       options::OPT_fdiagnostics_show_note_include_stack,
       options::OPT_fno_diagnostics_show_note_include_stack)) {
     if (A->getOption().matches(
         options::OPT_fdiagnostics_show_note_include_stack))
       CmdArgs.push_back("-fdiagnostics-show-note-include-stack");
     else
       CmdArgs.push_back("-fno-diagnostics-show-note-include-stack");
   }
 
   // Color diagnostics are the default, unless the terminal doesn't support
   // them.
   if (Args.hasFlag(options::OPT_fcolor_diagnostics,
                    options::OPT_fno_color_diagnostics,
                    llvm::sys::Process::StandardErrHasColors()))
     CmdArgs.push_back("-fcolor-diagnostics");
 
   if (!Args.hasFlag(options::OPT_fshow_source_location,
                     options::OPT_fno_show_source_location))
     CmdArgs.push_back("-fno-show-source-location");
 
   if (!Args.hasFlag(options::OPT_fshow_column,
                     options::OPT_fno_show_column,
                     true))
     CmdArgs.push_back("-fno-show-column");
 
   if (!Args.hasFlag(options::OPT_fspell_checking,
                     options::OPT_fno_spell_checking))
     CmdArgs.push_back("-fno-spell-checking");
 
 
   // Silently ignore -fasm-blocks for now.
   (void) Args.hasFlag(options::OPT_fasm_blocks, options::OPT_fno_asm_blocks,
                       false);
 
   if (Arg *A = Args.getLastArg(options::OPT_fshow_overloads_EQ))
     A->render(Args, CmdArgs);
 
   // -fdollars-in-identifiers default varies depending on platform and
   // language; only pass if specified.
   if (Arg *A = Args.getLastArg(options::OPT_fdollars_in_identifiers,
                                options::OPT_fno_dollars_in_identifiers)) {
     if (A->getOption().matches(options::OPT_fdollars_in_identifiers))
       CmdArgs.push_back("-fdollars-in-identifiers");
     else
       CmdArgs.push_back("-fno-dollars-in-identifiers");
   }
 
   // -funit-at-a-time is default, and we don't support -fno-unit-at-a-time for
   // practical purposes.
   if (Arg *A = Args.getLastArg(options::OPT_funit_at_a_time,
                                options::OPT_fno_unit_at_a_time)) {
     if (A->getOption().matches(options::OPT_fno_unit_at_a_time))
       D.Diag(diag::warn_drv_clang_unsupported) << A->getAsString(Args);
   }
 
   if (Args.hasFlag(options::OPT_fapple_pragma_pack,
                    options::OPT_fno_apple_pragma_pack, false))
     CmdArgs.push_back("-fapple-pragma-pack");
 
   // Default to -fno-builtin-str{cat,cpy} on Darwin for ARM.
   //
   // FIXME: This is disabled until clang -cc1 supports -fno-builtin-foo. PR4941.
 #if 0
   if (getToolChain().getTriple().isOSDarwin() &&
       (getToolChain().getTriple().getArch() == llvm::Triple::arm ||
        getToolChain().getTriple().getArch() == llvm::Triple::thumb)) {
     if (!Args.hasArg(options::OPT_fbuiltin_strcat))
       CmdArgs.push_back("-fno-builtin-strcat");
     if (!Args.hasArg(options::OPT_fbuiltin_strcpy))
       CmdArgs.push_back("-fno-builtin-strcpy");
   }
 #endif
 
   // Only allow -traditional or -traditional-cpp in preprocessing modes.
   if (Arg *A = Args.getLastArg(options::OPT_traditional,
                                options::OPT_traditional_cpp)) {
     if (isa<PreprocessJobAction>(JA))
       CmdArgs.push_back("-traditional-cpp");
     else
       D.Diag(diag::err_drv_clang_unsupported) << A->getAsString(Args);
   }
 
   Args.AddLastArg(CmdArgs, options::OPT_dM);
   Args.AddLastArg(CmdArgs, options::OPT_dD);
   
   // Handle serialized diagnostics.
   if (Arg *A = Args.getLastArg(options::OPT__serialize_diags)) {
     CmdArgs.push_back("-serialize-diagnostic-file");
     CmdArgs.push_back(Args.MakeArgString(A->getValue()));
   }
 
   if (Args.hasArg(options::OPT_fretain_comments_from_system_headers))
     CmdArgs.push_back("-fretain-comments-from-system-headers");
 
   // Forward -Xclang arguments to -cc1, and -mllvm arguments to the LLVM option
   // parser.
   Args.AddAllArgValues(CmdArgs, options::OPT_Xclang);
   for (arg_iterator it = Args.filtered_begin(options::OPT_mllvm),
          ie = Args.filtered_end(); it != ie; ++it) {
     (*it)->claim();
 
     // We translate this by hand to the -cc1 argument, since nightly test uses
     // it and developers have been trained to spell it with -mllvm.
     if (StringRef((*it)->getValue(0)) == "-disable-llvm-optzns")
       CmdArgs.push_back("-disable-llvm-optzns");
     else
       (*it)->render(Args, CmdArgs);
   }
 
   if (Output.getType() == types::TY_Dependencies) {
     // Handled with other dependency code.
   } else if (Output.isFilename()) {
     CmdArgs.push_back("-o");
     CmdArgs.push_back(Output.getFilename());
   } else {
     assert(Output.isNothing() && "Invalid output.");
   }
 
   for (InputInfoList::const_iterator
          it = Inputs.begin(), ie = Inputs.end(); it != ie; ++it) {
     const InputInfo &II = *it;
     CmdArgs.push_back("-x");
     if (Args.hasArg(options::OPT_rewrite_objc))
       CmdArgs.push_back(types::getTypeName(types::TY_PP_ObjCXX));
     else
       CmdArgs.push_back(types::getTypeName(II.getType()));
     if (II.isFilename())
       CmdArgs.push_back(II.getFilename());
     else
       II.getInputArg().renderAsInput(Args, CmdArgs);
   }
 
   Args.AddAllArgs(CmdArgs, options::OPT_undef);
 
   const char *Exec = getToolChain().getDriver().getClangProgramPath();
 
   // Optionally embed the -cc1 level arguments into the debug info, for build
   // analysis.
   if (getToolChain().UseDwarfDebugFlags()) {
     ArgStringList OriginalArgs;
     for (ArgList::const_iterator it = Args.begin(),
            ie = Args.end(); it != ie; ++it)
       (*it)->render(Args, OriginalArgs);
 
     SmallString<256> Flags;
     Flags += Exec;
     for (unsigned i = 0, e = OriginalArgs.size(); i != e; ++i) {
       Flags += " ";
       Flags += OriginalArgs[i];
     }
     CmdArgs.push_back("-dwarf-debug-flags");
     CmdArgs.push_back(Args.MakeArgString(Flags.str()));
   }
 
   C.addCommand(new Command(JA, *this, Exec, CmdArgs));
 
   if (Arg *A = Args.getLastArg(options::OPT_pg))
     if (Args.hasArg(options::OPT_fomit_frame_pointer))
       D.Diag(diag::err_drv_argument_not_allowed_with)
         << "-fomit-frame-pointer" << A->getAsString(Args);
 
   // Claim some arguments which clang supports automatically.
 
   // -fpch-preprocess is used with gcc to add a special marker in the output to
   // include the PCH file. Clang's PTH solution is completely transparent, so we
   // do not need to deal with it at all.
   Args.ClaimAllArgs(options::OPT_fpch_preprocess);
 
   // Claim some arguments which clang doesn't support, but we don't
   // care to warn the user about.
   Args.ClaimAllArgs(options::OPT_clang_ignored_f_Group);
   Args.ClaimAllArgs(options::OPT_clang_ignored_m_Group);
 
   // Disable warnings for clang -E -use-gold-plugin -emit-llvm foo.c
   Args.ClaimAllArgs(options::OPT_use_gold_plugin);
   Args.ClaimAllArgs(options::OPT_emit_llvm);
 }
 
 void ClangAs::AddARMTargetArgs(const ArgList &Args,
                                ArgStringList &CmdArgs) const {
   const Driver &D = getToolChain().getDriver();
   llvm::Triple Triple = getToolChain().getTriple();
 
   // Set the CPU based on -march= and -mcpu=.
   CmdArgs.push_back("-target-cpu");
   CmdArgs.push_back(Args.MakeArgString(getARMTargetCPU(Args, Triple)));
 
   // Honor -mfpu=.
   if (const Arg *A = Args.getLastArg(options::OPT_mfpu_EQ))
     addFPUArgs(D, A, Args, CmdArgs);
 
   // Honor -mfpmath=.
   if (const Arg *A = Args.getLastArg(options::OPT_mfpmath_EQ))
     addFPMathArgs(D, A, Args, CmdArgs, getARMTargetCPU(Args, Triple));
 }
 
+/// Append the x86-specific -cc1as arguments to \p CmdArgs.  A "-target-cpu"
+/// pair is emitted only when getX86TargetCPU returns a non-null CPU name.
+void ClangAs::AddX86TargetArgs(const ArgList &Args,
+                               ArgStringList &CmdArgs) const {
+  // Set the CPU based on -march=.
+  const char *CPUName = getX86TargetCPU(Args, getToolChain().getTriple());
+  if (!CPUName)
+    return;
+
+  CmdArgs.push_back("-target-cpu");
+  CmdArgs.push_back(CPUName);
+}
+
 /// Add options related to the Objective-C runtime/ABI.
 ///
 /// An explicit -fobjc-runtime= argument is rendered to \p cmdArgs as written.
 /// Otherwise a "-fobjc-runtime=<runtime>" argument is synthesized from
 /// -fnext-runtime / -fgnu-runtime, the requested ABI version, the rewriter
 /// mode and the toolchain defaults.
 ///
 /// Returns the ObjCRuntime that was selected.
 ObjCRuntime Clang::AddObjCRuntimeArgs(const ArgList &args,
                                       ArgStringList &cmdArgs,
                                       RewriteKind rewriteKind) const {
   // Look for the controlling runtime option.
   Arg *runtimeOpt = args.getLastArg(options::OPT_fnext_runtime,
                                     options::OPT_fgnu_runtime,
                                     options::OPT_fobjc_runtime_EQ);
 
   // Just forward -fobjc-runtime= to the frontend.  This supersedes
   // options about fragility.
   const bool isExplicitRuntime =
     runtimeOpt &&
     runtimeOpt->getOption().matches(options::OPT_fobjc_runtime_EQ);
   if (isExplicitRuntime) {
     ObjCRuntime parsed;
     StringRef spelling = runtimeOpt->getValue();
     // A true result from tryParse() is diagnosed as an unknown runtime; the
     // argument is still rendered and the runtime object returned as-is.
     if (parsed.tryParse(spelling))
       getToolChain().getDriver().Diag(diag::err_drv_unknown_objc_runtime)
         << spelling;
 
     runtimeOpt->render(args, cmdArgs);
     return parsed;
   }
 
   // Otherwise, we'll need the ABI "version".  Version numbers are
   // slightly confusing for historical reasons:
   //   1 - Traditional "fragile" ABI
   //   2 - Non-fragile ABI, version 1
   //   3 - Non-fragile ABI, version 2
   unsigned abiVersion = 1;
   if (Arg *versionOpt = args.getLastArg(options::OPT_fobjc_abi_version_EQ)) {
     // -fobjc-abi-version= sets the version directly.  "1" matches the
     // initial value, so only "2", "3" and unsupported spellings need action.
     StringRef text = versionOpt->getValue();
     if (text == "3")
       abiVersion = 3;
     else if (text == "2")
       abiVersion = 2;
     else if (text != "1")
       getToolChain().getDriver().Diag(diag::err_drv_clang_unsupported)
         << text;
   } else {
     // Otherwise, determine if we are using the non-fragile ABI.  The
     // toolchain default is only consulted when no rewriter is active.
     const bool nonFragileByDefault =
       rewriteKind == RK_NonFragile ||
       (rewriteKind == RK_None &&
        getToolChain().IsObjCNonFragileABIDefault());
 
     if (args.hasFlag(options::OPT_fobjc_nonfragile_abi,
                      options::OPT_fno_objc_nonfragile_abi,
                      nonFragileByDefault)) {
       // Determine the non-fragile ABI version to use.
 #ifdef DISABLE_DEFAULT_NONFRAGILEABI_TWO
       unsigned nonFragileVersion = 1;
 #else
       unsigned nonFragileVersion = 2;
 #endif
 
       if (Arg *nonFragileOpt = args.getLastArg(
             options::OPT_fobjc_nonfragile_abi_version_EQ)) {
         StringRef text = nonFragileOpt->getValue();
         if (text == "1")
           nonFragileVersion = 1;
         else if (text == "2")
           nonFragileVersion = 2;
         else
           getToolChain().getDriver().Diag(diag::err_drv_clang_unsupported)
             << text;
       }
 
       abiVersion = 1 + nonFragileVersion;
     }
     // When the non-fragile ABI is not selected, abiVersion keeps its
     // initial value of 1 (fragile).
   }
 
   // We don't actually care about the ABI version other than whether
   // it's non-fragile.
   const bool nonFragile = abiVersion != 1;
 
   ObjCRuntime chosen;
   if (runtimeOpt) {
     if (runtimeOpt->getOption().matches(options::OPT_fnext_runtime)) {
       // -fnext-runtime: on Darwin, use the default behavior for the
       // toolchain; otherwise, build for a generic macosx port.
       if (getToolChain().getTriple().isOSDarwin())
         chosen = getToolChain().getDefaultObjCRuntime(nonFragile);
       else
         chosen = ObjCRuntime(ObjCRuntime::MacOSX, VersionTuple());
     } else {
       // -fgnu-runtime
       assert(runtimeOpt->getOption().matches(options::OPT_fgnu_runtime));
       // Legacy behaviour is to target the gnustep runtime if we are in
       // non-fragile mode or the GCC runtime in fragile mode.
       if (nonFragile)
         chosen = ObjCRuntime(ObjCRuntime::GNUstep, VersionTuple(1,6));
       else
         chosen = ObjCRuntime(ObjCRuntime::GCC, VersionTuple());
     }
   } else {
     // If we have no runtime argument, ask the toolchain for its default
     // runtime.  However, the rewriter only really supports the Mac runtime,
     // so assume that.
     switch (rewriteKind) {
     case RK_None:
       chosen = getToolChain().getDefaultObjCRuntime(nonFragile);
       break;
     case RK_Fragile:
       chosen = ObjCRuntime(ObjCRuntime::FragileMacOSX, VersionTuple());
       break;
     case RK_NonFragile:
       chosen = ObjCRuntime(ObjCRuntime::MacOSX, VersionTuple());
       break;
     }
   }
 
   cmdArgs.push_back(
     args.MakeArgString("-fobjc-runtime=" + chosen.getAsString()));
   return chosen;
 }
 
 /// Build the command that re-invokes clang in -cc1as mode to assemble the
 /// single input in \p Inputs into the object file named by \p Output, and
 /// add that command to \p C.
 void ClangAs::ConstructJob(Compilation &C, const JobAction &JA,
                            const InputInfo &Output,
                            const InputInfoList &Inputs,
                            const ArgList &Args,
                            const char *LinkingOutput) const {
   assert(Inputs.size() == 1 && "Unexpected number of inputs.");
   const InputInfo &Input = Inputs[0];
 
   // The clang executable is both the program to run and the first word of
   // the optional -dwarf-debug-flags string below.
   const char *Exec = getToolChain().getDriver().getClangProgramPath();
 
   ArgStringList CmdArgs;
 
   // Don't warn about "clang -w -c foo.s", "clang -emit-llvm -c foo.s" or
   // "clang -use-gold-plugin -c foo.s".
   Args.ClaimAllArgs(options::OPT_w);
   Args.ClaimAllArgs(options::OPT_emit_llvm);
   Args.ClaimAllArgs(options::OPT_use_gold_plugin);
 
   // Invoke ourselves in -cc1as mode.
   //
   // FIXME: Implement custom jobs for internal actions.
   CmdArgs.push_back("-cc1as");
 
   // Add the "effective" target triple.
   const std::string EffectiveTriple =
     getToolChain().ComputeEffectiveClangTriple(Args, Input.getType());
   CmdArgs.push_back("-triple");
   CmdArgs.push_back(Args.MakeArgString(EffectiveTriple));
 
   // Set the output mode, we currently only expect to be used as a real
   // assembler.
   CmdArgs.push_back("-filetype");
   CmdArgs.push_back("obj");
 
   if (UseRelaxAll(C, Args))
     CmdArgs.push_back("-relax-all");
 
   // Add target specific cpu and features flags.
   switch(getToolChain().getTriple().getArch()) {
   default:
     break;
 
   case llvm::Triple::arm:
   case llvm::Triple::thumb:
     AddARMTargetArgs(Args, CmdArgs);
     break;
+
+  case llvm::Triple::x86:
+  case llvm::Triple::x86_64:
+    AddX86TargetArgs(Args, CmdArgs);
+    break;
   }
 
   // Ignore explicit -force_cpusubtype_ALL option.
   (void) Args.hasArg(options::OPT_force__cpusubtype__ALL);
 
   // Determine the original source input by following the first input of
   // each action until an input action is reached.
   const Action *Root = &JA;
   for (; Root->getKind() != Action::InputClass; Root = Root->getInputs()[0])
     assert(!Root->getInputs().empty() && "unexpected root action!");
 
   // Forward -g, assuming we are dealing with an actual assembly file.
   const types::ID RootType = Root->getType();
   if (RootType == types::TY_Asm || RootType == types::TY_PP_Asm) {
     Args.ClaimAllArgs(options::OPT_g_Group);
     Arg *DebugArg = Args.getLastArg(options::OPT_g_Group);
     // The last -g option wins; -g0 means no debug info.
     if (DebugArg && !DebugArg->getOption().matches(options::OPT_g0))
       CmdArgs.push_back("-g");
   }
 
   // Optionally embed the -cc1as level arguments into the debug info, for build
   // analysis.
   if (getToolChain().UseDwarfDebugFlags()) {
     ArgStringList RenderedArgs;
     for (ArgList::const_iterator AI = Args.begin(), AE = Args.end();
          AI != AE; ++AI)
       (*AI)->render(Args, RenderedArgs);
 
     // Join the program path and every rendered argument with single spaces.
     SmallString<256> Joined;
     Joined += Exec;
     for (ArgStringList::const_iterator SI = RenderedArgs.begin(),
            SE = RenderedArgs.end(); SI != SE; ++SI) {
       Joined += " ";
       Joined += *SI;
     }
     CmdArgs.push_back("-dwarf-debug-flags");
     CmdArgs.push_back(Args.MakeArgString(Joined.str()));
   }
 
   // FIXME: Add -static support, once we have it.
 
   // Pass through user-supplied assembler arguments, then -mllvm options.
   Args.AddAllArgValues(CmdArgs, options::OPT_Wa_COMMA,
                        options::OPT_Xassembler);
   Args.AddAllArgs(CmdArgs, options::OPT_mllvm);
 
   // Output file first, then the input file as the final argument.
   assert(Output.isFilename() && "Unexpected lipo output.");
   CmdArgs.push_back("-o");
   CmdArgs.push_back(Output.getFilename());
 
   assert(Input.isFilename() && "Invalid input.");
   CmdArgs.push_back(Input.getFilename());
 
   C.addCommand(new Command(JA, *this, Exec, CmdArgs));
 }
 
 /// ConstructJob - Build the command line for a job that is delegated to a
 /// generic gcc driver and add the resulting command to the compilation.
 ///
 /// Driver arguments accepted by forwardToGCC() are claimed and re-rendered,
 /// tool specific flags are supplied by RenderExtraToolArgs(), and the inputs
 /// are appended last. The executable is the driver's custom generic gcc name
 /// when one is set, otherwise "g++" in C++ driver mode and "gcc" otherwise.
 ///
 /// \param C - The compilation the new command is added to.
 /// \param JA - The action this job implements.
 /// \param Output - Where the result goes; either a filename or nothing.
 /// \param Inputs - The inputs to hand to gcc.
 /// \param Args - The driver arguments.
 /// \param LinkingOutput - Unused here.
 void gcc::Common::ConstructJob(Compilation &C, const JobAction &JA,
                                const InputInfo &Output,
                                const InputInfoList &Inputs,
                                const ArgList &Args,
                                const char *LinkingOutput) const {
   const Driver &D = getToolChain().getDriver();
   ArgStringList CmdArgs;
 
   for (ArgList::const_iterator
          it = Args.begin(), ie = Args.end(); it != ie; ++it) {
     Arg *A = *it;
     if (forwardToGCC(A->getOption())) {
       // Don't forward any -g arguments to assembly steps.
       if (isa<AssembleJobAction>(JA) &&
           A->getOption().matches(options::OPT_g_Group))
         continue;
 
       // It is unfortunate that we have to claim here, as this means
       // we will basically never report anything interesting for
       // platforms using a generic gcc, even if we are just using gcc
       // to get to the assembler.
       A->claim();
       A->render(Args, CmdArgs);
     }
   }
 
   RenderExtraToolArgs(JA, CmdArgs);
 
   // If using a driver driver, force the arch.
   llvm::Triple::ArchType Arch = getToolChain().getArch();
   if (getToolChain().getTriple().isOSDarwin()) {
     CmdArgs.push_back("-arch");
 
     // FIXME: Remove these special cases.
     if (Arch == llvm::Triple::ppc)
       CmdArgs.push_back("ppc");
     else if (Arch == llvm::Triple::ppc64)
       CmdArgs.push_back("ppc64");
     else
       CmdArgs.push_back(Args.MakeArgString(getToolChain().getArchName()));
   }
 
   // Try to force gcc to match the tool chain we want, if we recognize
   // the arch.
   //
   // FIXME: The triple class should directly provide the information we want
   // here.
   if (Arch == llvm::Triple::x86 || Arch == llvm::Triple::ppc)
     CmdArgs.push_back("-m32");
   // 64-bit PowerPC is the counterpart of the 32-bit ppc case above; the
   // previous code tested x86_64 twice and so never passed -m64 for ppc64.
   else if (Arch == llvm::Triple::x86_64 || Arch == llvm::Triple::ppc64)
     CmdArgs.push_back("-m64");
 
   if (Output.isFilename()) {
     CmdArgs.push_back("-o");
     CmdArgs.push_back(Output.getFilename());
   } else {
     assert(Output.isNothing() && "Unexpected output");
     CmdArgs.push_back("-fsyntax-only");
   }
 
   Args.AddAllArgValues(CmdArgs, options::OPT_Wa_COMMA,
                        options::OPT_Xassembler);
 
   // Only pass -x if gcc will understand it; otherwise hope gcc
   // understands the suffix correctly. The main use case this would go
   // wrong in is for linker inputs if they happened to have an odd
   // suffix; really the only way to get this to happen is a command
   // like '-x foobar a.c' which will treat a.c like a linker input.
   //
   // FIXME: For the linker case specifically, can we safely convert
   // inputs into '-Wl,' options?
   for (InputInfoList::const_iterator
          it = Inputs.begin(), ie = Inputs.end(); it != ie; ++it) {
     const InputInfo &II = *it;
 
     // Don't try to pass LLVM or AST inputs to a generic gcc.
     if (II.getType() == types::TY_LLVM_IR || II.getType() == types::TY_LTO_IR ||
         II.getType() == types::TY_LLVM_BC || II.getType() == types::TY_LTO_BC)
       D.Diag(diag::err_drv_no_linker_llvm_support)
         << getToolChain().getTripleString();
     else if (II.getType() == types::TY_AST)
       D.Diag(diag::err_drv_no_ast_support)
         << getToolChain().getTripleString();
 
     if (types::canTypeBeUserSpecified(II.getType())) {
       CmdArgs.push_back("-x");
       CmdArgs.push_back(types::getTypeName(II.getType()));
     }
 
     if (II.isFilename())
       CmdArgs.push_back(II.getFilename());
     else {
       const Arg &A = II.getInputArg();
 
       // Reverse translate some rewritten options.
       if (A.getOption().matches(options::OPT_Z_reserved_lib_stdcxx)) {
         CmdArgs.push_back("-lstdc++");
         continue;
       }
 
       // Don't render as input, we need gcc to do the translations.
       A.render(Args, CmdArgs);
     }
   }
 
   // Keep the custom name in a local string so the pointer taken from it
   // stays valid until GetProgramPath() has been called below.
   const std::string customGCCName = D.getCCCGenericGCCName();
   const char *GCCName;
   if (!customGCCName.empty())
     GCCName = customGCCName.c_str();
   else if (D.CCCIsCXX) {
     GCCName = "g++";
   } else
     GCCName = "gcc";
 
   const char *Exec =
     Args.MakeArgString(getToolChain().GetProgramPath(GCCName));
   C.addCommand(new Command(JA, *this, Exec, CmdArgs));
 }
 
 /// RenderExtraToolArgs - Append the tool specific flag for the gcc preprocess
 /// tool, which is "-E". \p JA is not consulted.
 void gcc::Preprocess::RenderExtraToolArgs(const JobAction &JA,
                                           ArgStringList &CmdArgs) const {
   CmdArgs.push_back("-E");
 }
 
 /// RenderExtraToolArgs - The gcc precompile tool adds no extra flags; both
 /// parameters are left untouched.
 void gcc::Precompile::RenderExtraToolArgs(const JobAction &JA,
                                           ArgStringList &CmdArgs) const {
   // The type is good enough.
 }
 
 void gcc::Compile::RenderExtraToolArgs(const JobAction &JA,
                                        ArgStringList &CmdArgs) const {
   const Driver &D = getToolChain().getDriver();
 
   // If -flto, etc. are present then make sure not to force assembly output.
   if (JA.getType() == types::TY_LLVM_IR || JA.getType() == types::TY_LTO_IR ||
       JA.getType() == types::TY_LLVM_BC || JA.getType() == types::TY_LTO_BC)
     CmdArgs.push_back("-c");
   else {
     if (JA.getType() != types::TY_PP_Asm)
       D.Diag(diag::err_drv_invalid_gcc_output_type)
         << getTypeName(JA.getType());
 
     CmdArgs.push_back("-S");
   }
 }
 
 /// RenderExtraToolArgs - Append the tool specific flag for the gcc assemble
 /// tool, which is "-c". \p JA is not consulted.
 void gcc::Assemble::RenderExtraToolArgs(const JobAction &JA,
                                         ArgStringList &CmdArgs) const {
   CmdArgs.push_back("-c");
 }
 
 /// RenderExtraToolArgs - The gcc link tool adds no extra flags; both
 /// parameters are left untouched.
 void gcc::Link::RenderExtraToolArgs(const JobAction &JA,
                                     ArgStringList &CmdArgs) const {
   // The types are (hopefully) good enough.
 }
 
 // Hexagon tools start.
 /// RenderExtraToolArgs - The Hexagon assemble tool adds no extra flags; the
 /// body is intentionally empty and both parameters are left untouched.
 void hexagon::Assemble::RenderExtraToolArgs(const JobAction &JA,
                                         ArgStringList &CmdArgs) const {
 
 }
 /// ConstructJob - Build the "hexagon-as" command line for an assemble job and
 /// add it to the compilation.
 ///
 /// The command consists of "-march=<cpu>", any flags added by
 /// RenderExtraToolArgs(), the output selection, and then every input.
 void hexagon::Assemble::ConstructJob(Compilation &C, const JobAction &JA,
                                const InputInfo &Output,
                                const InputInfoList &Inputs,
                                const ArgList &Args,
                                const char *LinkingOutput) const {
   const Driver &D = getToolChain().getDriver();
   ArgStringList CmdArgs;
 
   // Select the target CPU.
   std::string MarchString("-march=");
   MarchString += getHexagonTargetCPU(Args);
   CmdArgs.push_back(Args.MakeArgString(MarchString));
 
   RenderExtraToolArgs(JA, CmdArgs);
 
   // Either name the output file or ask for a syntax check only.
   if (!Output.isFilename()) {
     assert(Output.isNothing() && "Unexpected output");
     CmdArgs.push_back("-fsyntax-only");
   } else {
     CmdArgs.push_back("-o");
     CmdArgs.push_back(Output.getFilename());
   }
 
   // Append the inputs in order. No "-x <type>" is emitted here, so the tool
   // has to recognize each input on its own.
   for (unsigned i = 0, e = Inputs.size(); i != e; ++i) {
     const InputInfo &II = Inputs[i];
     const types::ID InputType = II.getType();
 
     // LLVM IR/bitcode and AST inputs cannot be handed to this tool; report
     // them, but keep going so that the command line is still formed.
     const bool IsLLVMInput =
       InputType == types::TY_LLVM_IR || InputType == types::TY_LTO_IR ||
       InputType == types::TY_LLVM_BC || InputType == types::TY_LTO_BC;
     if (IsLLVMInput)
       D.Diag(clang::diag::err_drv_no_linker_llvm_support)
         << getToolChain().getTripleString();
     else if (InputType == types::TY_AST)
       D.Diag(clang::diag::err_drv_no_ast_support)
         << getToolChain().getTripleString();
 
     if (!II.isFilename()) {
       // The input is itself a driver argument; re-render it as written
       // rather than treating it as a file name.
       II.getInputArg().render(Args, CmdArgs);
       continue;
     }
     CmdArgs.push_back(II.getFilename());
   }
 
   const char *Exec =
     Args.MakeArgString(getToolChain().GetProgramPath("hexagon-as"));
   C.addCommand(new Command(JA, *this, Exec, CmdArgs));
 }
 /// RenderExtraToolArgs - The Hexagon link tool adds no extra flags; both
 /// parameters are left untouched.
 void hexagon::Link::RenderExtraToolArgs(const JobAction &JA,
                                     ArgStringList &CmdArgs) const {
   // The types are (hopefully) good enough.
 }
 
 /// ConstructJob - Build the "hexagon-gcc"/"hexagon-g++" command line for a
 /// link job and add it to the compilation.
 ///
 /// Driver arguments accepted by forwardToGCC() are claimed and re-rendered,
 /// followed by flags from RenderExtraToolArgs(), the architecture selection,
 /// "-mqdsp6-compat", the output file (if any) and finally every input.
 void hexagon::Link::ConstructJob(Compilation &C, const JobAction &JA,
                                const InputInfo &Output,
                                const InputInfoList &Inputs,
                                const ArgList &Args,
                                const char *LinkingOutput) const {
   const Driver &D = getToolChain().getDriver();
   ArgStringList CmdArgs;
 
   // Forward every driver argument that gcc is expected to understand.
   const bool IsAssembleJob = isa<AssembleJobAction>(JA);
   for (ArgList::const_iterator
          it = Args.begin(), ie = Args.end(); it != ie; ++it) {
     Arg *A = *it;
     if (!forwardToGCC(A->getOption()))
       continue;
 
     // Don't forward any -g arguments to assembly steps.
     if (IsAssembleJob && A->getOption().matches(options::OPT_g_Group))
       continue;
 
     // It is unfortunate that we have to claim here, as this means
     // we will basically never report anything interesting for
     // platforms using a generic gcc, even if we are just using gcc
     // to get to the assembler.
     A->claim();
     A->render(Args, CmdArgs);
   }
 
   RenderExtraToolArgs(JA, CmdArgs);
 
   // Add the arch information: re-render the user's argument when it is
   // already spelled in the joined -m form, otherwise synthesize "-m<cpu>".
   Arg *ArchArg = getLastHexagonArchArg(Args);
   if (ArchArg && ArchArg->getOption().matches(options::OPT_m_Joined))
     ArchArg->render(Args, CmdArgs);
   else
     CmdArgs.push_back (Args.MakeArgString("-m" + getHexagonTargetCPU(Args)));
 
   CmdArgs.push_back("-mqdsp6-compat");
 
   const char *GCCName =
     C.getDriver().CCCIsCXX ? "hexagon-g++" : "hexagon-gcc";
   const char *Exec =
     Args.MakeArgString(getToolChain().GetProgramPath(GCCName));
 
   if (Output.isFilename()) {
     CmdArgs.push_back("-o");
     CmdArgs.push_back(Output.getFilename());
   }
 
   // Append the inputs in order.
   for (unsigned i = 0, e = Inputs.size(); i != e; ++i) {
     const InputInfo &II = Inputs[i];
     const types::ID InputType = II.getType();
 
     // Don't try to pass LLVM or AST inputs to a generic gcc; report them,
     // but keep going so that the command line is still formed.
     const bool IsLLVMInput =
       InputType == types::TY_LLVM_IR || InputType == types::TY_LTO_IR ||
       InputType == types::TY_LLVM_BC || InputType == types::TY_LTO_BC;
     if (IsLLVMInput)
       D.Diag(clang::diag::err_drv_no_linker_llvm_support)
         << getToolChain().getTripleString();
     else if (InputType == types::TY_AST)
       D.Diag(clang::diag::err_drv_no_ast_support)
         << getToolChain().getTripleString();
 
     if (!II.isFilename()) {
       // Don't render as input, we need gcc to do the translations.
       II.getInputArg().render(Args, CmdArgs);
       continue;
     }
     CmdArgs.push_back(II.getFilename());
   }
 
   C.addCommand(new Command(JA, *this, Exec, CmdArgs));
 }
 // Hexagon tools end.
 
 /// getArchTypeForDarwinArchName - Map a Darwin architecture name to the
 /// corresponding llvm::Triple architecture.
 ///
 /// \param Str - The architecture name to look up; it is matched exactly
 /// against the spellings listed below.
 /// \return The matching architecture, or llvm::Triple::UnknownArch when the
 /// name is not in the table.
 llvm::Triple::ArchType darwin::getArchTypeForDarwinArchName(StringRef Str) {
   // See arch(3) and llvm-gcc's driver-driver.c. We don't implement support for
   // archs which Darwin doesn't use.
 
   // The matching this routine does is fairly pointless, since it is neither the
   // complete architecture list, nor a reasonable subset. The problem is that
   // historically the driver driver accepts this and also ties its -march=
   // handling to the architecture name, so we need to be careful before removing
   // support for it.
 
   // This code must be kept in sync with Clang's Darwin specific argument
   // translation.
 
   return llvm::StringSwitch<llvm::Triple::ArchType>(Str)
     // Every listed PowerPC variant maps to the 32-bit ppc architecture.
     .Cases("ppc", "ppc601", "ppc603", "ppc604", "ppc604e", llvm::Triple::ppc)
     .Cases("ppc750", "ppc7400", "ppc7450", "ppc970", llvm::Triple::ppc)
     .Case("ppc64", llvm::Triple::ppc64)
     // Every listed 32-bit Intel variant maps to x86.
     .Cases("i386", "i486", "i486SX", "i586", "i686", llvm::Triple::x86)
     .Cases("pentium", "pentpro", "pentIIm3", "pentIIm5", "pentium4",
            llvm::Triple::x86)
     .Case("x86_64", llvm::Triple::x86_64)
     // This is derived from the driver driver.
     .Cases("arm", "armv4t", "armv5", "armv6", llvm::Triple::arm)
     .Cases("armv7", "armv7f", "armv7k", "armv7s", "xscale", llvm::Triple::arm)
     .Case("r600", llvm::Triple::r600)
     .Case("nvptx", llvm::Triple::nvptx)
     .Case("nvptx64", llvm::Triple::nvptx64)
     .Case("amdil", llvm::Triple::amdil)
     .Case("spir", llvm::Triple::spir)
     .Default(llvm::Triple::UnknownArch);
 }
 
 /// getCC1Name - Return the name of the cc1 executable that handles inputs of
 /// the given type.
 ///
 /// \return "cc1" for C and assembly, "cc1obj" for Objective-C, "cc1plus" for
 /// C++ and "cc1objplus" for Objective-C++. Any other type is a programming
 /// error.
 const char *darwin::CC1::getCC1Name(types::ID Type) const {
   switch (Type) {
   // C, including headers and preprocessed forms, and assembly.
   case types::TY_Asm:
   case types::TY_C:
   case types::TY_CHeader:
   case types::TY_PP_C:
   case types::TY_PP_CHeader:
     return "cc1";
 
   // Objective-C.
   case types::TY_ObjC:
   case types::TY_ObjCHeader:
   case types::TY_PP_ObjC:
   case types::TY_PP_ObjC_Alias:
   case types::TY_PP_ObjCHeader:
     return "cc1obj";
 
   // C++.
   case types::TY_CXX:
   case types::TY_CXXHeader:
   case types::TY_PP_CXX:
   case types::TY_PP_CXXHeader:
     return "cc1plus";
 
   // Objective-C++.
   case types::TY_ObjCXX:
   case types::TY_ObjCXXHeader:
   case types::TY_PP_ObjCXX:
   case types::TY_PP_ObjCXX_Alias:
   case types::TY_PP_ObjCXXHeader:
     return "cc1objplus";
 
   default:
     llvm_unreachable("Unexpected type for Darwin CC1 tool.");
   }
 }
 
 /// anchor - Empty out-of-line member definition.
 // NOTE(review): presumably this is the usual LLVM "anchor" method that pins
 // the class's vtable to this file; confirm against the class declaration.
 void darwin::CC1::anchor() {}
 
 const char *darwin::CC1::getBaseInputName(const ArgList &Args,
                                           const InputInfoList &Inputs) {
   return Args.MakeArgString(
     llvm::sys::path::filename(Inputs[0].getBaseInput()));
 }
 
 /// getBaseInputStem - Return the base input name with everything from its
 /// last '.' onwards removed. A name without any '.' is returned unchanged.
 const char *darwin::CC1::getBaseInputStem(const ArgList &Args,
                                           const InputInfoList &Inputs) {
   const char *BaseName = getBaseInputName(Args, Inputs);
 
   const char *LastDot = strrchr(BaseName, '.');
   if (!LastDot)
     return BaseName;
 
   // Keep only the characters that precede the final dot.
   return Args.MakeArgString(std::string(BaseName, LastDot - BaseName));
 }
 
 /// getDependencyFileName - Compute the name of the dependency (".d") file.
 ///
 /// When -o is given, the name is the -o value with the extension of its last
 /// path component replaced by ".d"; otherwise it is the stem of the base
 /// input followed by ".d". The result is allocated through \p Args.
 const char *
 darwin::CC1::getDependencyFileName(const ArgList &Args,
                                    const InputInfoList &Inputs) {
   // FIXME: Think about this more.
   std::string Res;
 
   if (Arg *OutputOpt = Args.getLastArg(options::OPT_o)) {
     std::string Str(OutputOpt->getValue());
 
     // Only a dot in the last path component introduces an extension. A dot
     // that belongs to a directory (e.g. "./build/foo" or "../out/foo") must
     // not truncate the path, otherwise the result degenerates to ".d".
     std::string::size_type Dot = Str.rfind('.');
     const std::string::size_type Slash = Str.rfind('/');
     if (Dot != std::string::npos && Slash != std::string::npos && Dot < Slash)
       Dot = std::string::npos;
 
     // substr(0, npos) yields the whole string when there is no extension.
     Res = Str.substr(0, Dot);
   } else {
     Res = darwin::CC1::getBaseInputStem(Args, Inputs);
   }
   return Args.MakeArgString(Res + ".d");
 }
 
 /// RemoveCC1UnsupportedArgs - Strip, in place, the options listed below from
 /// an already rendered cc1 command line.
 ///
 /// Removed are "-fmodule-cache-path" together with its value, a fixed set of
 /// -f/-fno- options, a fixed set of -m options and a fixed set of -W/-Wno-
 /// warning names. All other arguments are kept in their original order.
 ///
 /// \param CmdArgs - The command line to filter; modified in place.
 void darwin::CC1::RemoveCC1UnsupportedArgs(ArgStringList &CmdArgs) const {
   for (ArgStringList::iterator it = CmdArgs.begin(), ie = CmdArgs.end();
        it != ie;) {
 
     StringRef Option = *it;
     bool RemoveOption = false;
 
     // Erase both -fmodule-cache-path and its argument. The range is clamped
     // to the end of the list: the old "it+2 != ie" guard kept the pair when it
     // was last on the command line and erased past the end when the option
     // was the final argument with no value.
     if (Option.equals("-fmodule-cache-path")) {
       ArgStringList::iterator RangeEnd = (ie - it >= 2) ? it + 2 : ie;
       it = CmdArgs.erase(it, RangeEnd);
       ie = CmdArgs.end();
       continue;
     }
 
     // The three categories are mutually exclusive and are decided on the
     // option as originally spelled, so the prefix stripping done for one
     // category can never make the option match another one.
     if (Option.startswith("-f")) {
       // Remove unsupported -f options.
       // Remove -f/-fno- to reduce the number of cases.
       if (Option.startswith("-fno-"))
         Option = Option.substr(5);
       else
         Option = Option.substr(2);
       RemoveOption = llvm::StringSwitch<bool>(Option)
         .Case("altivec", true)
         .Case("modules", true)
         .Case("diagnostics-show-note-include-stack", true)
         .Default(false);
     } else if (Option.startswith("-m")) {
       // Handle machine specific options.
       RemoveOption = llvm::StringSwitch<bool>(Option)
         .Case("-mthumb", true)
         .Case("-mno-thumb", true)
         .Case("-mno-fused-madd", true)
         .Case("-mlong-branch", true)
         .Case("-mlongcall", true)
         .Case("-mcpu=G4", true)
         .Case("-mcpu=G5", true)
         .Default(false);
     } else if (Option.startswith("-W")) {
       // Handle warning options.
       // Remove -W/-Wno- to reduce the number of cases.
       if (Option.startswith("-Wno-"))
         Option = Option.substr(5);
       else
         Option = Option.substr(2);
 
       RemoveOption = llvm::StringSwitch<bool>(Option)
         .Case("address-of-temporary", true)
         .Case("ambiguous-member-template", true)
         .Case("analyzer-incompatible-plugin", true)
         .Case("array-bounds", true)
         .Case("array-bounds-pointer-arithmetic", true)
         .Case("bind-to-temporary-copy", true)
         .Case("bitwise-op-parentheses", true)
         .Case("bool-conversions", true)
         .Case("builtin-macro-redefined", true)
         .Case("c++-hex-floats", true)
         .Case("c++0x-compat", true)
         .Case("c++0x-extensions", true)
         .Case("c++0x-narrowing", true)
         .Case("c++11-compat", true)
         .Case("c++11-extensions", true)
         .Case("c++11-narrowing", true)
         .Case("conditional-uninitialized", true)
         .Case("constant-conversion", true)
         .Case("conversion-null", true)
         .Case("CFString-literal", true)
         .Case("constant-logical-operand", true)
         .Case("custom-atomic-properties", true)
         .Case("default-arg-special-member", true)
         .Case("delegating-ctor-cycles", true)
         .Case("delete-non-virtual-dtor", true)
         .Case("deprecated-implementations", true)
         .Case("deprecated-writable-strings", true)
         .Case("distributed-object-modifiers", true)
         .Case("duplicate-method-arg", true)
         .Case("dynamic-class-memaccess", true)
         .Case("enum-compare", true)
         .Case("enum-conversion", true)
         .Case("exit-time-destructors", true)
         .Case("gnu", true)
         .Case("gnu-designator", true)
         .Case("header-hygiene", true)
         .Case("idiomatic-parentheses", true)
         .Case("ignored-qualifiers", true)
         .Case("implicit-atomic-properties", true)
         .Case("incompatible-pointer-types", true)
         .Case("incomplete-implementation", true)
         .Case("int-conversion", true)
         .Case("initializer-overrides", true)
         .Case("invalid-noreturn", true)
         .Case("invalid-token-paste", true)
         .Case("language-extension-token", true)
         .Case("literal-conversion", true)
         .Case("literal-range", true)
         .Case("local-type-template-args", true)
         .Case("logical-op-parentheses", true)
         .Case("method-signatures", true)
         .Case("microsoft", true)
         .Case("mismatched-tags", true)
         .Case("missing-method-return-type", true)
         .Case("non-pod-varargs", true)
         .Case("nonfragile-abi2", true)
         .Case("null-arithmetic", true)
         .Case("null-dereference", true)
         .Case("out-of-line-declaration", true)
         .Case("overriding-method-mismatch", true)
         .Case("readonly-setter-attrs", true)
         .Case("return-stack-address", true)
         .Case("self-assign", true)
         .Case("semicolon-before-method-body", true)
         .Case("sentinel", true)
         .Case("shift-overflow", true)
         .Case("shift-sign-overflow", true)
         .Case("sign-conversion", true)
         .Case("sizeof-array-argument", true)
         .Case("sizeof-pointer-memaccess", true)
         .Case("string-compare", true)
         .Case("super-class-method-mismatch", true)
         .Case("tautological-compare", true)
         .Case("typedef-redefinition", true)
         .Case("typename-missing", true)
         .Case("undefined-reinterpret-cast", true)
         .Case("unknown-warning-option", true)
         .Case("unnamed-type-template-args", true)
         .Case("unneeded-internal-declaration", true)
         .Case("unneeded-member-function", true)
         .Case("unused-comparison", true)
         .Case("unused-exception-parameter", true)
         .Case("unused-member-function", true)
         .Case("unused-result", true)
         .Case("vector-conversions", true)
         .Case("vla", true)
         .Case("used-but-marked-unused", true)
         .Case("weak-vtables", true)
         .Default(false);
     }
 
     // erase() invalidates the cached end iterator, so refresh it; only step
     // forward when nothing was removed.
     if (RemoveOption) {
       it = CmdArgs.erase(it);
       ie = CmdArgs.end();
     } else {
       ++it;
     }
   }
 }
 
 void darwin::CC1::AddCC1Args(const ArgList &Args,
                              ArgStringList &CmdArgs) const {
   const Driver &D = getToolChain().getDriver();
 
   CheckCodeGenerationOptions(D, Args);
 
   // Derived from cc1 spec.
   if ((!Args.hasArg(options::OPT_mkernel) ||
        (getDarwinToolChain().isTargetIPhoneOS() &&
         !getDarwinToolChain().isIPhoneOSVersionLT(6, 0))) &&
       !Args.hasArg(options::OPT_static) &&
       !Args.hasArg(options::OPT_mdynamic_no_pic))
     CmdArgs.push_back("-fPIC");
 
   if (getToolChain().getTriple().getArch() == llvm::Triple::arm ||
       getToolChain().getTriple().getArch() == llvm::Triple::thumb) {
     if (!Args.hasArg(options::OPT_fbuiltin_strcat))
       CmdArgs.push_back("-fno-builtin-strcat");
     if (!Args.hasArg(options::OPT_fbuiltin_strcpy))
       CmdArgs.push_back("-fno-builtin-strcpy");
   }
 
   if (Args.hasArg(options::OPT_g_Flag) &&
       !Args.hasArg(options::OPT_fno_eliminate_unused_debug_symbols))
     CmdArgs.push_back("-feliminate-unused-debug-symbols");
 }
 
 /// AddCC1OptionsArgs - Append the arguments derived from the gcc
 /// "cc1_options" spec.
 ///
 /// The statements below run in a fixed sequence and each one appends to
 /// \p CmdArgs, so their order determines the order of the resulting command
 /// line.
 ///
 /// \param Args - The driver arguments.
 /// \param CmdArgs - The command line being built; appended to.
 /// \param Inputs - The job inputs; only the first one is consulted directly
 /// in this function (for the C++ check at the end).
 /// \param OutputArgs - Pre-rendered output arguments, spliced in after the
 /// -f, -undef and -Qn handling.
 void darwin::CC1::AddCC1OptionsArgs(const ArgList &Args, ArgStringList &CmdArgs,
                                     const InputInfoList &Inputs,
                                     const ArgStringList &OutputArgs) const {
   const Driver &D = getToolChain().getDriver();
 
   // Derived from cc1_options spec.
   // Any of -fast, -fastf or -fastcp adds -O3.
   if (Args.hasArg(options::OPT_fast) ||
       Args.hasArg(options::OPT_fastf) ||
       Args.hasArg(options::OPT_fastcp))
     CmdArgs.push_back("-O3");
 
   // -pg together with -fomit-frame-pointer is diagnosed as an error.
   if (Arg *A = Args.getLastArg(options::OPT_pg))
     if (Args.hasArg(options::OPT_fomit_frame_pointer))
       D.Diag(diag::err_drv_argument_not_allowed_with)
         << A->getAsString(Args) << "-fomit-frame-pointer";
 
   AddCC1Args(Args, CmdArgs);
 
   // -quiet is added unless -Q was given.
   if (!Args.hasArg(options::OPT_Q))
     CmdArgs.push_back("-quiet");
 
   CmdArgs.push_back("-dumpbase");
   CmdArgs.push_back(darwin::CC1::getBaseInputName(Args, Inputs));
 
   Args.AddAllArgs(CmdArgs, options::OPT_d_Group);
 
   Args.AddAllArgs(CmdArgs, options::OPT_m_Group);
   Args.AddAllArgs(CmdArgs, options::OPT_a_Group);
 
   // FIXME: The goal is to use the user provided -o if that is our
   // final output, otherwise to drive from the original input
   // name. Find a clean way to go about this.
   if ((Args.hasArg(options::OPT_c) || Args.hasArg(options::OPT_S)) &&
       Args.hasArg(options::OPT_o)) {
     Arg *OutputOpt = Args.getLastArg(options::OPT_o);
     CmdArgs.push_back("-auxbase-strip");
     CmdArgs.push_back(OutputOpt->getValue());
   } else {
     CmdArgs.push_back("-auxbase");
     CmdArgs.push_back(darwin::CC1::getBaseInputStem(Args, Inputs));
   }
 
   Args.AddAllArgs(CmdArgs, options::OPT_g_Group);
 
   Args.AddAllArgs(CmdArgs, options::OPT_O);
   // FIXME: -Wall is getting some special treatment. Investigate.
   Args.AddAllArgs(CmdArgs, options::OPT_W_Group, options::OPT_pedantic_Group);
   Args.AddLastArg(CmdArgs, options::OPT_w);
   Args.AddAllArgs(CmdArgs, options::OPT_std_EQ, options::OPT_ansi,
                   options::OPT_trigraphs);
   if (!Args.getLastArg(options::OPT_std_EQ, options::OPT_ansi)) {
     // Honor -std-default.
     Args.AddAllArgsTranslated(CmdArgs, options::OPT_std_default_EQ,
                               "-std=", /*Joined=*/true);
   }
 
   // The driver's -v becomes -version here; -pg becomes -p, but only when the
   // tool chain reports profiling support.
   if (Args.hasArg(options::OPT_v))
     CmdArgs.push_back("-version");
   if (Args.hasArg(options::OPT_pg) &&
       getToolChain().SupportsProfiling())
     CmdArgs.push_back("-p");
   Args.AddLastArg(CmdArgs, options::OPT_p);
 
   // The driver treats -fsyntax-only specially.
   if (getToolChain().getTriple().getArch() == llvm::Triple::arm ||
       getToolChain().getTriple().getArch() == llvm::Triple::thumb) {
     // Removes -fbuiltin-str{cat,cpy}; these aren't recognized by cc1 but are
     // used to inhibit the default -fno-builtin-str{cat,cpy}.
     //
     // FIXME: Should we grow a better way to deal with "removing" args?
     for (arg_iterator it = Args.filtered_begin(options::OPT_f_Group,
                                                options::OPT_fsyntax_only),
            ie = Args.filtered_end(); it != ie; ++it) {
       if (!(*it)->getOption().matches(options::OPT_fbuiltin_strcat) &&
           !(*it)->getOption().matches(options::OPT_fbuiltin_strcpy)) {
         (*it)->claim();
         (*it)->render(Args, CmdArgs);
       }
     }
   } else
     Args.AddAllArgs(CmdArgs, options::OPT_f_Group, options::OPT_fsyntax_only);
 
   // Claim Clang only -f options, they aren't worth warning about.
   Args.ClaimAllArgs(options::OPT_f_clang_Group);
 
   Args.AddAllArgs(CmdArgs, options::OPT_undef);
   if (Args.hasArg(options::OPT_Qn))
     CmdArgs.push_back("-fno-ident");
 
   // FIXME: This isn't correct.
   //Args.AddLastArg(CmdArgs, options::OPT__help)
   //Args.AddLastArg(CmdArgs, options::OPT__targetHelp)
 
   CmdArgs.append(OutputArgs.begin(), OutputArgs.end());
 
   // FIXME: Still don't get what is happening here. Investigate.
   Args.AddAllArgs(CmdArgs, options::OPT__param);
 
   // Either mudflap option adds -fno-builtin and -fno-merge-constants.
   if (Args.hasArg(options::OPT_fmudflap) ||
       Args.hasArg(options::OPT_fmudflapth)) {
     CmdArgs.push_back("-fno-builtin");
     CmdArgs.push_back("-fno-merge-constants");
   }
 
   // -coverage expands to the two underlying profiling options.
   if (Args.hasArg(options::OPT_coverage)) {
     CmdArgs.push_back("-fprofile-arcs");
     CmdArgs.push_back("-ftest-coverage");
   }
 
   // When the first input is C++, define __private_extern__ as extern.
   if (types::isCXX(Inputs[0].getType()))
     CmdArgs.push_back("-D__private_extern__=extern");
 }
 
 /// AddCPPOptionsArgs - Append the arguments derived from the gcc
 /// "cpp_options" spec.
 ///
 /// The statements below run in a fixed sequence and each one appends to
 /// \p CmdArgs, so their order determines the order of the resulting command
 /// line.
 ///
 /// \param Args - The driver arguments.
 /// \param CmdArgs - The command line being built; appended to.
 /// \param Inputs - The job inputs, passed on to AddCPPUniqueOptionsArgs().
 /// \param OutputArgs - Pre-rendered output arguments, spliced in right after
 /// the cpp_unique_options arguments.
 void darwin::CC1::AddCPPOptionsArgs(const ArgList &Args, ArgStringList &CmdArgs,
                                     const InputInfoList &Inputs,
                                     const ArgStringList &OutputArgs) const {
   // Derived from cpp_options
   AddCPPUniqueOptionsArgs(Args, CmdArgs, Inputs);
 
   CmdArgs.append(OutputArgs.begin(), OutputArgs.end());
 
   AddCC1Args(Args, CmdArgs);
 
   // NOTE: The code below has some commonality with cpp_options, but
   // in classic gcc style ends up sending things in different
   // orders. This may be a good merge candidate once we drop pedantic
   // compatibility.
 
   Args.AddAllArgs(CmdArgs, options::OPT_m_Group);
   Args.AddAllArgs(CmdArgs, options::OPT_std_EQ, options::OPT_ansi,
                   options::OPT_trigraphs);
   if (!Args.getLastArg(options::OPT_std_EQ, options::OPT_ansi)) {
     // Honor -std-default.
     Args.AddAllArgsTranslated(CmdArgs, options::OPT_std_default_EQ,
                               "-std=", /*Joined=*/true);
   }
   Args.AddAllArgs(CmdArgs, options::OPT_W_Group, options::OPT_pedantic_Group);
   Args.AddLastArg(CmdArgs, options::OPT_w);
 
   // The driver treats -fsyntax-only specially.
   Args.AddAllArgs(CmdArgs, options::OPT_f_Group, options::OPT_fsyntax_only);
 
   // Claim Clang only -f options, they aren't worth warning about.
   Args.ClaimAllArgs(options::OPT_f_clang_Group);
 
   // -fworking-directory is added when some -g option is present, -g0 is not,
   // and -fno-working-directory was not given.
   if (Args.hasArg(options::OPT_g_Group) && !Args.hasArg(options::OPT_g0) &&
       !Args.hasArg(options::OPT_fno_working_directory))
     CmdArgs.push_back("-fworking-directory");
 
   Args.AddAllArgs(CmdArgs, options::OPT_O);
   Args.AddAllArgs(CmdArgs, options::OPT_undef);
   // -save-temps adds -fpch-preprocess.
   if (Args.hasArg(options::OPT_save_temps))
     CmdArgs.push_back("-fpch-preprocess");
 }
 
 /// AddCPPUniqueOptionsArgs - Append the arguments derived from the gcc
 /// "cpp_unique_options" spec, including the input file names.
 ///
 /// The statements below run in a fixed sequence and each one appends to
 /// \p CmdArgs, so their order determines the order of the resulting command
 /// line.
 ///
 /// \param Args - The driver arguments.
 /// \param CmdArgs - The command line being built; appended to.
 /// \param Inputs - The job inputs; the filename of every input is appended.
 void darwin::CC1::AddCPPUniqueOptionsArgs(const ArgList &Args,
                                           ArgStringList &CmdArgs,
                                           const InputInfoList &Inputs) const {
   const Driver &D = getToolChain().getDriver();
 
   CheckPreprocessingOptions(D, Args);
 
   // Derived from cpp_unique_options.
   // -{C,CC} only with -E is checked in CheckPreprocessingOptions().
   Args.AddLastArg(CmdArgs, options::OPT_C);
   Args.AddLastArg(CmdArgs, options::OPT_CC);
   if (!Args.hasArg(options::OPT_Q))
     CmdArgs.push_back("-quiet");
   Args.AddAllArgs(CmdArgs, options::OPT_nostdinc);
   Args.AddAllArgs(CmdArgs, options::OPT_nostdincxx);
   Args.AddLastArg(CmdArgs, options::OPT_v);
   Args.AddAllArgs(CmdArgs, options::OPT_I_Group, options::OPT_F);
   Args.AddLastArg(CmdArgs, options::OPT_P);
 
   // FIXME: Handle %I properly.
   if (getToolChain().getArch() == llvm::Triple::x86_64) {
     CmdArgs.push_back("-imultilib");
     CmdArgs.push_back("x86_64");
   }
 
   // -MD and -MMD are each followed by the computed dependency file name.
   if (Args.hasArg(options::OPT_MD)) {
     CmdArgs.push_back("-MD");
     CmdArgs.push_back(darwin::CC1::getDependencyFileName(Args, Inputs));
   }
 
   if (Args.hasArg(options::OPT_MMD)) {
     CmdArgs.push_back("-MMD");
     CmdArgs.push_back(darwin::CC1::getDependencyFileName(Args, Inputs));
   }
 
   Args.AddLastArg(CmdArgs, options::OPT_M);
   Args.AddLastArg(CmdArgs, options::OPT_MM);
   Args.AddAllArgs(CmdArgs, options::OPT_MF);
   Args.AddLastArg(CmdArgs, options::OPT_MG);
   Args.AddLastArg(CmdArgs, options::OPT_MP);
   Args.AddAllArgs(CmdArgs, options::OPT_MQ);
   Args.AddAllArgs(CmdArgs, options::OPT_MT);
   // With -MD/-MMD but neither -M nor -MM, the -o value (if any) is passed
   // along as an additional -MQ argument.
   if (!Args.hasArg(options::OPT_M) && !Args.hasArg(options::OPT_MM) &&
       (Args.hasArg(options::OPT_MD) || Args.hasArg(options::OPT_MMD))) {
     if (Arg *OutputOpt = Args.getLastArg(options::OPT_o)) {
       CmdArgs.push_back("-MQ");
       CmdArgs.push_back(OutputOpt->getValue());
     }
   }
 
   Args.AddLastArg(CmdArgs, options::OPT_remap);
   // -g3 adds -dD.
   if (Args.hasArg(options::OPT_g3))
     CmdArgs.push_back("-dD");
   Args.AddLastArg(CmdArgs, options::OPT_H);
 
   AddCPPArgs(Args, CmdArgs);
 
   Args.AddAllArgs(CmdArgs, options::OPT_D, options::OPT_U, options::OPT_A);
   Args.AddAllArgs(CmdArgs, options::OPT_i_Group);
 
   // Append the filename of every input, in order.
   for (InputInfoList::const_iterator
          it = Inputs.begin(), ie = Inputs.end(); it != ie; ++it) {
     const InputInfo &II = *it;
 
     CmdArgs.push_back(II.getFilename());
   }
 
   Args.AddAllArgValues(CmdArgs, options::OPT_Wp_COMMA,
                        options::OPT_Xpreprocessor);
 
   // The two mudflap checks are independent, so giving both options emits
   // both groups of arguments.
   if (Args.hasArg(options::OPT_fmudflap)) {
     CmdArgs.push_back("-D_MUDFLAP");
     CmdArgs.push_back("-include");
     CmdArgs.push_back("mf-runtime.h");
   }
 
   if (Args.hasArg(options::OPT_fmudflapth)) {
     CmdArgs.push_back("-D_MUDFLAP");
     CmdArgs.push_back("-D_MUDFLAPTH");
     CmdArgs.push_back("-include");
     CmdArgs.push_back("mf-runtime.h");
   }
 }
 
 /// AddCPPArgs - Append the predefined macros derived from the gcc "cpp" spec:
 /// exactly one of -D__STATIC__ / -D__DYNAMIC__, plus -D_REENTRANT when
 /// -pthread is given.
 void darwin::CC1::AddCPPArgs(const ArgList &Args,
                              ArgStringList &CmdArgs) const {
   // Derived from cpp spec.
   //
   // The gcc spec is broken here, it refers to dynamic but that has been
   // translated. Start by being bug compatible: -static alone decides which
   // macro is defined.
   CmdArgs.push_back(Args.hasArg(options::OPT_static) ? "-D__STATIC__"
                                                      : "-D__DYNAMIC__");
 
   if (Args.hasArg(options::OPT_pthread))
     CmdArgs.push_back("-D_REENTRANT");
 }
 
 /// ConstructJob - Build the cc1 command line that preprocesses ("-E") the
 /// single input into the file named by Output, and queue it on the
 /// compilation.
 void darwin::Preprocess::ConstructJob(Compilation &C, const JobAction &JA,
                                       const InputInfo &Output,
                                       const InputInfoList &Inputs,
                                       const ArgList &Args,
                                       const char *LinkingOutput) const {
   assert(Inputs.size() == 1 && "Unexpected number of inputs!");
   assert(Output.isFilename() && "Unexpected CC1 output.");
 
   ArgStringList CmdArgs;
   CmdArgs.push_back("-E");
 
   const bool Traditional = Args.hasArg(options::OPT_traditional) ||
                            Args.hasArg(options::OPT_traditional_cpp);
   if (Traditional)
     CmdArgs.push_back("-traditional-cpp");
 
   // "-o <file>" is kept in its own list because where it lands on the command
   // line depends on the mode chosen below.
   ArgStringList OutputArgs;
   OutputArgs.push_back("-o");
   OutputArgs.push_back(Output.getFilename());
 
   // With -E (or when the driver runs as a preprocessor) the output arguments
   // are handed to AddCPPOptionsArgs; otherwise that helper gets an empty list
   // and the output arguments are appended after whatever it added.
   const bool OutputViaCPPOptions =
     Args.hasArg(options::OPT_E) || getToolChain().getDriver().CCCIsCPP;
   if (!OutputViaCPPOptions) {
     AddCPPOptionsArgs(Args, CmdArgs, Inputs, ArgStringList());
     CmdArgs.append(OutputArgs.begin(), OutputArgs.end());
   } else {
     AddCPPOptionsArgs(Args, CmdArgs, Inputs, OutputArgs);
   }
 
   Args.AddAllArgs(CmdArgs, options::OPT_d_Group);
   RemoveCC1UnsupportedArgs(CmdArgs);
 
   // The cc1 binary is selected from the type of the (only) input.
   const InputInfo &Input = Inputs[0];
   const char *CC1Name = getCC1Name(Input.getType());
   const char *Exec =
     Args.MakeArgString(getToolChain().GetProgramPath(CC1Name));
   C.addCommand(new Command(JA, *this, Exec, CmdArgs));
 }
 
 /// ConstructJob - Build the cc1 command line that compiles the single input
 /// and queue it on the compilation.
 ///
 /// Diagnoses -traditional (reported as only allowed with "-E"), AST output
 /// and AST inputs (err_drv_no_ast_support), and job types other than LLVM
 /// IR/BC, preprocessed assembly or PCH (err_drv_invalid_gcc_output_type).
 /// For PCH output the command writes to a temporary ".s" file via "-o" and
 /// passes the real destination through "--output-pch=".
 void darwin::Compile::ConstructJob(Compilation &C, const JobAction &JA,
                                    const InputInfo &Output,
                                    const InputInfoList &Inputs,
                                    const ArgList &Args,
                                    const char *LinkingOutput) const {
   const Driver &D = getToolChain().getDriver();
   ArgStringList CmdArgs;
 
   assert(Inputs.size() == 1 && "Unexpected number of inputs!");
 
   // Silence warning about unused --serialize-diagnostics
   Args.ClaimAllArgs(options::OPT__serialize_diags);
 
   types::ID InputType = Inputs[0].getType();
   if (const Arg *A = Args.getLastArg(options::OPT_traditional))
     D.Diag(diag::err_drv_argument_only_allowed_with)
       << A->getAsString(Args) << "-E";
 
   // Select the emission mode from the job's result type.
   if (JA.getType() == types::TY_LLVM_IR ||
       JA.getType() == types::TY_LTO_IR)
     CmdArgs.push_back("-emit-llvm");
   else if (JA.getType() == types::TY_LLVM_BC ||
            JA.getType() == types::TY_LTO_BC)
     CmdArgs.push_back("-emit-llvm-bc");
   else if (Output.getType() == types::TY_AST)
     D.Diag(diag::err_drv_no_ast_support)
       << getToolChain().getTripleString();
   else if (JA.getType() != types::TY_PP_Asm &&
            JA.getType() != types::TY_PCH)
     D.Diag(diag::err_drv_invalid_gcc_output_type)
       << getTypeName(JA.getType());
 
   // "-o <file>" for everything except PCH, which is handled separately at
   // the end of this function.
   ArgStringList OutputArgs;
   if (Output.getType() != types::TY_PCH) {
     OutputArgs.push_back("-o");
     if (Output.isNothing())
       OutputArgs.push_back("/dev/null");
     else
       OutputArgs.push_back(Output.getFilename());
   }
 
   // There is no need for this level of compatibility, but it makes
   // diffing easier.
   bool OutputArgsEarly = (Args.hasArg(options::OPT_fsyntax_only) ||
                           Args.hasArg(options::OPT_S));
 
   if (types::getPreprocessedType(InputType) != types::TY_INVALID) {
     // The input still needs preprocessing.
     AddCPPUniqueOptionsArgs(Args, CmdArgs, Inputs);
   } else {
     CmdArgs.push_back("-fpreprocessed");
 
     for (InputInfoList::const_iterator
            it = Inputs.begin(), ie = Inputs.end(); it != ie; ++it) {
       const InputInfo &II = *it;
 
       // Reject AST inputs.  No command is queued in that case.
       if (II.getType() == types::TY_AST) {
         D.Diag(diag::err_drv_no_ast_support)
           << getToolChain().getTripleString();
         return;
       }
 
       CmdArgs.push_back(II.getFilename());
     }
   }
 
   // Both input kinds finish with the cc1 options.  The output arguments are
   // either handed to AddCC1OptionsArgs or appended after what it added.
   if (OutputArgsEarly) {
     AddCC1OptionsArgs(Args, CmdArgs, Inputs, OutputArgs);
   } else {
     AddCC1OptionsArgs(Args, CmdArgs, Inputs, ArgStringList());
     CmdArgs.append(OutputArgs.begin(), OutputArgs.end());
   }
 
   if (Output.getType() == types::TY_PCH) {
     assert(Output.isFilename() && "Invalid PCH output.");
 
     CmdArgs.push_back("-o");
     // NOTE: gcc uses a temp .s file for this, but there doesn't seem
     // to be a good reason.
     const char *TmpPath = C.getArgs().MakeArgString(
       D.GetTemporaryPath("cc", "s"));
     C.addTempFile(TmpPath);
     CmdArgs.push_back(TmpPath);
 
     // If we're emitting a pch file with the last 4 characters of ".pth"
     // and falling back to llvm-gcc we want to use ".gch" instead.
     //
     // Only a trailing ".pth" is rewritten: an occurrence elsewhere in the
     // path (e.g. in a directory name) must be left untouched.
     std::string OutputFile(Output.getFilename());
     const std::string PTHSuffix(".pth");
     if (OutputFile.size() >= PTHSuffix.size() &&
         OutputFile.compare(OutputFile.size() - PTHSuffix.size(),
                            PTHSuffix.size(), PTHSuffix) == 0)
       OutputFile.replace(OutputFile.size() - PTHSuffix.size(),
                          PTHSuffix.size(), ".gch");
     const char *Tmp = C.getArgs().MakeArgString("--output-pch="+OutputFile);
     CmdArgs.push_back(Tmp);
   }
 
   RemoveCC1UnsupportedArgs(CmdArgs);
 
   const char *CC1Name = getCC1Name(Inputs[0].getType());
   const char *Exec =
     Args.MakeArgString(getToolChain().GetProgramPath(CC1Name));
   C.addCommand(new Command(JA, *this, Exec, CmdArgs));
 }
 
 /// ConstructJob - Build the "as" command line that assembles the single
 /// input into the file named by Output, and queue it on the compilation.
 ///
 /// Debug flags (--gstabs / -g) are forwarded only when the action chain
 /// bottoms out in an assembly source (TY_Asm or TY_PP_Asm).
 void darwin::Assemble::ConstructJob(Compilation &C, const JobAction &JA,
                                     const InputInfo &Output,
                                     const InputInfoList &Inputs,
                                     const ArgList &Args,
                                     const char *LinkingOutput) const {
   ArgStringList CmdArgs;
 
   assert(Inputs.size() == 1 && "Unexpected number of inputs.");
   const InputInfo &Input = Inputs[0];
 
   // Determine the original source input by following the first input of each
   // action until an InputClass action is reached.
   const Action *SourceAction = &JA;
   while (SourceAction->getKind() != Action::InputClass) {
     assert(!SourceAction->getInputs().empty() && "unexpected root action!");
     SourceAction = SourceAction->getInputs()[0];
   }
 
   // Forward -g, assuming we are dealing with an actual assembly file.
   if (SourceAction->getType() == types::TY_Asm ||
       SourceAction->getType() == types::TY_PP_Asm) {
     if (Args.hasArg(options::OPT_gstabs))
       CmdArgs.push_back("--gstabs");
     else if (Args.hasArg(options::OPT_g_Group))
       CmdArgs.push_back("-g");
   }
 
   // Derived from asm spec.
   AddDarwinArch(Args, CmdArgs);
 
   const llvm::Triple::ArchType Arch = getToolChain().getTriple().getArch();
 
   // Use -force_cpusubtype_ALL on x86 by default.
   if (Arch == llvm::Triple::x86 ||
       Arch == llvm::Triple::x86_64 ||
       Args.hasArg(options::OPT_force__cpusubtype__ALL))
     CmdArgs.push_back("-force_cpusubtype_ALL");
 
   // Never on x86_64.  Otherwise pass -static for kernel/kext builds (unless
   // targeting iPhoneOS 6.0 or later) and whenever -static was given.
   if (Arch != llvm::Triple::x86_64 &&
       (((Args.hasArg(options::OPT_mkernel) ||
          Args.hasArg(options::OPT_fapple_kext)) &&
         (!getDarwinToolChain().isTargetIPhoneOS() ||
          getDarwinToolChain().isIPhoneOSVersionLT(6, 0))) ||
        Args.hasArg(options::OPT_static)))
     CmdArgs.push_back("-static");
 
   Args.AddAllArgValues(CmdArgs, options::OPT_Wa_COMMA,
                        options::OPT_Xassembler);
 
   // This is the assembler tool, so name it correctly in the diagnostic (the
   // message used to read "Unexpected lipo output.").
   assert(Output.isFilename() && "Unexpected assembler output.");
   CmdArgs.push_back("-o");
   CmdArgs.push_back(Output.getFilename());
 
   assert(Input.isFilename() && "Invalid input.");
   CmdArgs.push_back(Input.getFilename());
 
   // asm_final spec is empty.
 
   const char *Exec =
     Args.MakeArgString(getToolChain().GetProgramPath("as"));
   C.addCommand(new Command(JA, *this, Exec, CmdArgs));
 }
 
 // Intentionally empty out-of-line definition.
 // NOTE(review): presumably the usual LLVM "anchor" method that pins
 // DarwinTool's vtable to this translation unit -- confirm against the
 // declaration in Tools.h.
 void darwin::DarwinTool::anchor() {
 }
 
 void darwin::DarwinTool::AddDarwinArch(const ArgList &Args,
                                        ArgStringList &CmdArgs) const {
   StringRef ArchName = getDarwinToolChain().getDarwinArchName(Args);
 
   // Derived from darwin_arch spec.
   CmdArgs.push_back("-arch");
   CmdArgs.push_back(Args.MakeArgString(ArchName));
 
   // FIXME: Is this needed anymore?
   if (ArchName == "arm")
     CmdArgs.push_back("-force_cpusubtype_ALL");
 }
 
 bool darwin::Link::NeedsTempPath(const InputInfoList &Inputs) const {
   // We only need to generate a temp path for LTO if we aren't compiling object
   // files. When compiling source files, we run 'dsymutil' after linking. We
   // don't run 'dsymutil' when compiling object files.
   for (InputInfoList::const_iterator
          it = Inputs.begin(), ie = Inputs.end(); it != ie; ++it)
     if (it->getType() != types::TY_Object)
       return true;
 
   return false;
 }
 
 /// AddLinkArgs - Append the linker options derived from the driver arguments
 /// (modeled on gcc's "link" spec) to CmdArgs.
 ///
 /// In emission order this covers: -demangle and -object_path_lto (both gated
 /// on the linker version parsed from the OPT_mlinker_version_EQ argument),
 /// -static or -dynamic, the -arch pair together with the options specific to
 /// the non-dynamiclib or -dylib case, a block of options forwarded from Args,
 /// the deployment target, -pie/-no_pie, -syslibroot, and a second block of
 /// forwarded options.
 ///
 /// Inputs is consulted only (through NeedsTempPath) to decide whether an LTO
 /// temporary object path is requested; C registers that temporary file and
 /// supplies the sysroot.
 void darwin::Link::AddLinkArgs(Compilation &C,
                                const ArgList &Args,
                                ArgStringList &CmdArgs,
                                const InputInfoList &Inputs) const {
   const Driver &D = getToolChain().getDriver();
   const toolchains::Darwin &DarwinTC = getDarwinToolChain();
 
   // Linker version as {major, minor, micro}.  It stays all-zero when no
   // version argument was given, which disables the version-gated options
   // below.
   unsigned Version[3] = { 0, 0, 0 };
   if (Arg *A = Args.getLastArg(options::OPT_mlinker_version_EQ)) {
     bool HadExtra;
     if (!Driver::GetReleaseVersion(A->getValue(), Version[0],
                                    Version[1], Version[2], HadExtra) ||
         HadExtra)
       D.Diag(diag::err_drv_invalid_version_number)
         << A->getAsString(Args);
   }
 
   // Newer linkers support -demangle, pass it if supported and not disabled by
   // the user.
   if (Version[0] >= 100 && !Args.hasArg(options::OPT_Z_Xlinker__no_demangle)) {
     // Don't pass -demangle to ld_classic.
     //
     // FIXME: This is a temporary workaround, ld should be handling this.
     bool UsesLdClassic = (getToolChain().getArch() == llvm::Triple::x86 &&
                           Args.hasArg(options::OPT_static));
     // On x86, a "-kext" value passed through -Xlinker / -Wl, is also treated
     // as selecting ld_classic.
     if (getToolChain().getArch() == llvm::Triple::x86) {
       for (arg_iterator it = Args.filtered_begin(options::OPT_Xlinker,
                                                  options::OPT_Wl_COMMA),
              ie = Args.filtered_end(); it != ie; ++it) {
         const Arg *A = *it;
         for (unsigned i = 0, e = A->getNumValues(); i != e; ++i)
           if (StringRef(A->getValue(i)) == "-kext")
             UsesLdClassic = true;
       }
     }
     if (!UsesLdClassic)
       CmdArgs.push_back("-demangle");
   }
 
   // If we are using LTO, then automatically create a temporary file path for
   // the linker to use, so that its lifetime will extend past a possible
   // dsymutil step.
   if (Version[0] >= 116 && D.IsUsingLTO(Args) && NeedsTempPath(Inputs)) {
     const char *TmpPath = C.getArgs().MakeArgString(
       D.GetTemporaryPath("cc", types::getTypeTempSuffix(types::TY_Object)));
     C.addTempFile(TmpPath);
     CmdArgs.push_back("-object_path_lto");
     CmdArgs.push_back(TmpPath);
   }
 
   // Derived from the "link" spec.
   Args.AddAllArgs(CmdArgs, options::OPT_static);
   if (!Args.hasArg(options::OPT_static))
     CmdArgs.push_back("-dynamic");
   if (Args.hasArg(options::OPT_fgnu_runtime)) {
     // FIXME: gcc replaces -lobjc in forward args with -lobjc-gnu
     // here. How do we wish to handle such things?
   }
 
   if (!Args.hasArg(options::OPT_dynamiclib)) {
     // Not building a dynamic library: forward the bundle/client options and
     // diagnose options that are only allowed together with -dynamiclib.
     AddDarwinArch(Args, CmdArgs);
     // FIXME: Why do this only on this path?
     Args.AddLastArg(CmdArgs, options::OPT_force__cpusubtype__ALL);
 
     Args.AddLastArg(CmdArgs, options::OPT_bundle);
     Args.AddAllArgs(CmdArgs, options::OPT_bundle__loader);
     Args.AddAllArgs(CmdArgs, options::OPT_client__name);
 
     Arg *A;
     if ((A = Args.getLastArg(options::OPT_compatibility__version)) ||
         (A = Args.getLastArg(options::OPT_current__version)) ||
         (A = Args.getLastArg(options::OPT_install__name)))
       D.Diag(diag::err_drv_argument_only_allowed_with)
         << A->getAsString(Args) << "-dynamiclib";
 
     Args.AddLastArg(CmdArgs, options::OPT_force__flat__namespace);
     Args.AddLastArg(CmdArgs, options::OPT_keep__private__externs);
     Args.AddLastArg(CmdArgs, options::OPT_private__bundle);
   } else {
     // Building a dynamic library: diagnose options that are not allowed with
     // -dynamiclib and translate the version/install-name options to their
     // -dylib_* spellings.
     CmdArgs.push_back("-dylib");
 
     Arg *A;
     if ((A = Args.getLastArg(options::OPT_bundle)) ||
         (A = Args.getLastArg(options::OPT_bundle__loader)) ||
         (A = Args.getLastArg(options::OPT_client__name)) ||
         (A = Args.getLastArg(options::OPT_force__flat__namespace)) ||
         (A = Args.getLastArg(options::OPT_keep__private__externs)) ||
         (A = Args.getLastArg(options::OPT_private__bundle)))
       D.Diag(diag::err_drv_argument_not_allowed_with)
         << A->getAsString(Args) << "-dynamiclib";
 
     Args.AddAllArgsTranslated(CmdArgs, options::OPT_compatibility__version,
                               "-dylib_compatibility_version");
     Args.AddAllArgsTranslated(CmdArgs, options::OPT_current__version,
                               "-dylib_current_version");
 
     AddDarwinArch(Args, CmdArgs);
 
     Args.AddAllArgsTranslated(CmdArgs, options::OPT_install__name,
                               "-dylib_install_name");
   }
 
   // First block of options forwarded from Args.
   Args.AddLastArg(CmdArgs, options::OPT_all__load);
   Args.AddAllArgs(CmdArgs, options::OPT_allowable__client);
   Args.AddLastArg(CmdArgs, options::OPT_bind__at__load);
   if (DarwinTC.isTargetIPhoneOS())
     Args.AddLastArg(CmdArgs, options::OPT_arch__errors__fatal);
   Args.AddLastArg(CmdArgs, options::OPT_dead__strip);
   Args.AddLastArg(CmdArgs, options::OPT_no__dead__strip__inits__and__terms);
   Args.AddAllArgs(CmdArgs, options::OPT_dylib__file);
   Args.AddLastArg(CmdArgs, options::OPT_dynamic);
   Args.AddAllArgs(CmdArgs, options::OPT_exported__symbols__list);
   Args.AddLastArg(CmdArgs, options::OPT_flat__namespace);
   Args.AddAllArgs(CmdArgs, options::OPT_force__load);
   Args.AddAllArgs(CmdArgs, options::OPT_headerpad__max__install__names);
   Args.AddAllArgs(CmdArgs, options::OPT_image__base);
   Args.AddAllArgs(CmdArgs, options::OPT_init);
 
   // Add the deployment target.
   VersionTuple TargetVersion = DarwinTC.getTargetVersion();
 
   // If we had an explicit -mios-simulator-version-min argument, honor that,
   // otherwise use the traditional deployment targets. We can't just check the
   // is-sim attribute because existing code follows this path, and the linker
   // may not handle the argument.
   //
   // FIXME: We may be able to remove this, once we can verify no one depends on
   // it.
   if (Args.hasArg(options::OPT_mios_simulator_version_min_EQ))
     CmdArgs.push_back("-ios_simulator_version_min");
   else if (DarwinTC.isTargetIPhoneOS())
     CmdArgs.push_back("-iphoneos_version_min");
   else
     CmdArgs.push_back("-macosx_version_min");
   CmdArgs.push_back(Args.MakeArgString(TargetVersion.getAsString()));
 
   Args.AddLastArg(CmdArgs, options::OPT_nomultidefs);
   Args.AddLastArg(CmdArgs, options::OPT_multi__module);
   Args.AddLastArg(CmdArgs, options::OPT_single__module);
   Args.AddAllArgs(CmdArgs, options::OPT_multiply__defined);
   Args.AddAllArgs(CmdArgs, options::OPT_multiply__defined__unused);
 
   // The last of -fpie/-fPIE/-fno-pie/-fno-PIE decides between -pie and
   // -no_pie; neither is passed when none of them was given.
   if (const Arg *A = Args.getLastArg(options::OPT_fpie, options::OPT_fPIE,
                                      options::OPT_fno_pie,
                                      options::OPT_fno_PIE)) {
     if (A->getOption().matches(options::OPT_fpie) ||
         A->getOption().matches(options::OPT_fPIE))
       CmdArgs.push_back("-pie");
     else
       CmdArgs.push_back("-no_pie");
   }
 
   Args.AddLastArg(CmdArgs, options::OPT_prebind);
   Args.AddLastArg(CmdArgs, options::OPT_noprebind);
   Args.AddLastArg(CmdArgs, options::OPT_nofixprebinding);
   Args.AddLastArg(CmdArgs, options::OPT_prebind__all__twolevel__modules);
   Args.AddLastArg(CmdArgs, options::OPT_read__only__relocs);
   Args.AddAllArgs(CmdArgs, options::OPT_sectcreate);
   Args.AddAllArgs(CmdArgs, options::OPT_sectorder);
   Args.AddAllArgs(CmdArgs, options::OPT_seg1addr);
   Args.AddAllArgs(CmdArgs, options::OPT_segprot);
   Args.AddAllArgs(CmdArgs, options::OPT_segaddr);
   Args.AddAllArgs(CmdArgs, options::OPT_segs__read__only__addr);
   Args.AddAllArgs(CmdArgs, options::OPT_segs__read__write__addr);
   Args.AddAllArgs(CmdArgs, options::OPT_seg__addr__table);
   Args.AddAllArgs(CmdArgs, options::OPT_seg__addr__table__filename);
   Args.AddAllArgs(CmdArgs, options::OPT_sub__library);
   Args.AddAllArgs(CmdArgs, options::OPT_sub__umbrella);
 
   // Give --sysroot= preference, over the Apple specific behavior to also use
   // --isysroot as the syslibroot.
   StringRef sysroot = C.getSysRoot();
   if (sysroot != "") {
     CmdArgs.push_back("-syslibroot");
     CmdArgs.push_back(C.getArgs().MakeArgString(sysroot));
   } else if (const Arg *A = Args.getLastArg(options::OPT_isysroot)) {
     CmdArgs.push_back("-syslibroot");
     CmdArgs.push_back(A->getValue());
   }
 
   // Second block of options forwarded from Args.
   Args.AddLastArg(CmdArgs, options::OPT_twolevel__namespace);
   Args.AddLastArg(CmdArgs, options::OPT_twolevel__namespace__hints);
   Args.AddAllArgs(CmdArgs, options::OPT_umbrella);
   Args.AddAllArgs(CmdArgs, options::OPT_undefined);
   Args.AddAllArgs(CmdArgs, options::OPT_unexported__symbols__list);
   Args.AddAllArgs(CmdArgs, options::OPT_weak__reference__mismatches);
   Args.AddLastArg(CmdArgs, options::OPT_X_Flag);
   Args.AddAllArgs(CmdArgs, options::OPT_y);
   Args.AddLastArg(CmdArgs, options::OPT_w);
   Args.AddAllArgs(CmdArgs, options::OPT_pagezero__size);
   Args.AddAllArgs(CmdArgs, options::OPT_segs__read__);
   Args.AddLastArg(CmdArgs, options::OPT_seglinkedit);
   Args.AddLastArg(CmdArgs, options::OPT_noseglinkedit);
   Args.AddAllArgs(CmdArgs, options::OPT_sectalign);
   Args.AddAllArgs(CmdArgs, options::OPT_sectobjectsymbols);
   Args.AddAllArgs(CmdArgs, options::OPT_segcreate);
   Args.AddLastArg(CmdArgs, options::OPT_whyload);
   Args.AddLastArg(CmdArgs, options::OPT_whatsloaded);
   Args.AddAllArgs(CmdArgs, options::OPT_dylinker__install__name);
   Args.AddLastArg(CmdArgs, options::OPT_dylinker);
   Args.AddLastArg(CmdArgs, options::OPT_Mach);
 }
 
 /// ConstructJob - Build the "ld" command line that links Inputs into the
 /// image named by Output, and queue it on the compilation.
 ///
 /// When either ARC-migration option (OPT_ccc_arcmt_check or
 /// OPT_ccc_arcmt_migrate) is present, no link is performed: every argument is
 /// claimed and a "touch <output>" command is queued instead.
 ///
 /// When LinkingOutput is non-null, "-arch_multiple -final_output
 /// <LinkingOutput>" is added to the command line.
 void darwin::Link::ConstructJob(Compilation &C, const JobAction &JA,
                                 const InputInfo &Output,
                                 const InputInfoList &Inputs,
                                 const ArgList &Args,
                                 const char *LinkingOutput) const {
   assert(Output.getType() == types::TY_Image && "Invalid linker output type.");
 
   // The logic here is derived from gcc's behavior; most of which
   // comes from specs (starting with link_command). Consult gcc for
   // more information.
   ArgStringList CmdArgs;
 
   /// Hack(tm) to ignore linking errors when we are doing ARC migration.
   if (Args.hasArg(options::OPT_ccc_arcmt_check,
                   options::OPT_ccc_arcmt_migrate)) {
     // Claim everything so no "unused argument" diagnostics are produced for
     // options this shortcut never looks at.
     for (ArgList::const_iterator I = Args.begin(), E = Args.end(); I != E; ++I)
       (*I)->claim();
     const char *Exec =
       Args.MakeArgString(getToolChain().GetProgramPath("touch"));
     CmdArgs.push_back(Output.getFilename());
     C.addCommand(new Command(JA, *this, Exec, CmdArgs));
     return;
   }
 
   // I'm not sure why this particular decomposition exists in gcc, but
   // we follow suit for ease of comparison.
   AddLinkArgs(C, Args, CmdArgs, Inputs);
 
   Args.AddAllArgs(CmdArgs, options::OPT_d_Flag);
   Args.AddAllArgs(CmdArgs, options::OPT_s);
   Args.AddAllArgs(CmdArgs, options::OPT_t);
   Args.AddAllArgs(CmdArgs, options::OPT_Z_Flag);
   Args.AddAllArgs(CmdArgs, options::OPT_u_Group);
   Args.AddLastArg(CmdArgs, options::OPT_e);
   Args.AddAllArgs(CmdArgs, options::OPT_m_Separate);
   Args.AddAllArgs(CmdArgs, options::OPT_r);
 
   // Forward -ObjC when either -ObjC or -ObjC++ is used, to force loading
   // members of static archive libraries which implement Objective-C classes or
   // categories.
   if (Args.hasArg(options::OPT_ObjC) || Args.hasArg(options::OPT_ObjCXX))
     CmdArgs.push_back("-ObjC");
 
   CmdArgs.push_back("-o");
   CmdArgs.push_back(Output.getFilename());
 
   // Start files: which crt/dylib1/bundle1 object (if any) is linked depends
   // on the output kind, the target platform and the deployment version.
   if (!Args.hasArg(options::OPT_nostdlib) &&
       !Args.hasArg(options::OPT_nostartfiles)) {
     // Derived from startfile spec.
     if (Args.hasArg(options::OPT_dynamiclib)) {
       // Derived from darwin_dylib1 spec.
       if (getDarwinToolChain().isTargetIOSSimulator()) {
         // The simulator doesn't have a versioned crt1 file.
         CmdArgs.push_back("-ldylib1.o");
       } else if (getDarwinToolChain().isTargetIPhoneOS()) {
         if (getDarwinToolChain().isIPhoneOSVersionLT(3, 1))
           CmdArgs.push_back("-ldylib1.o");
       } else {
         if (getDarwinToolChain().isMacosxVersionLT(10, 5))
           CmdArgs.push_back("-ldylib1.o");
         else if (getDarwinToolChain().isMacosxVersionLT(10, 6))
           CmdArgs.push_back("-ldylib1.10.5.o");
       }
     } else {
       if (Args.hasArg(options::OPT_bundle)) {
         if (!Args.hasArg(options::OPT_static)) {
           // Derived from darwin_bundle1 spec.
           if (getDarwinToolChain().isTargetIOSSimulator()) {
             // The simulator doesn't have a versioned crt1 file.
             CmdArgs.push_back("-lbundle1.o");
           } else if (getDarwinToolChain().isTargetIPhoneOS()) {
             if (getDarwinToolChain().isIPhoneOSVersionLT(3, 1))
               CmdArgs.push_back("-lbundle1.o");
           } else {
             if (getDarwinToolChain().isMacosxVersionLT(10, 6))
               CmdArgs.push_back("-lbundle1.o");
           }
         }
       } else {
         if (Args.hasArg(options::OPT_pg) &&
             getToolChain().SupportsProfiling()) {
           if (Args.hasArg(options::OPT_static) ||
               Args.hasArg(options::OPT_object) ||
               Args.hasArg(options::OPT_preload)) {
             CmdArgs.push_back("-lgcrt0.o");
           } else {
             CmdArgs.push_back("-lgcrt1.o");
 
             // darwin_crt2 spec is empty.
           }
           // By default on OS X 10.8 and later, we don't link with a crt1.o
           // file and the linker knows to use _main as the entry point.  But,
           // when compiling with -pg, we need to link with the gcrt1.o file,
           // so pass the -no_new_main option to tell the linker to use the
           // "start" symbol as the entry point.
           if (getDarwinToolChain().isTargetMacOS() &&
               !getDarwinToolChain().isMacosxVersionLT(10, 8))
             CmdArgs.push_back("-no_new_main");
         } else {
           if (Args.hasArg(options::OPT_static) ||
               Args.hasArg(options::OPT_object) ||
               Args.hasArg(options::OPT_preload)) {
             CmdArgs.push_back("-lcrt0.o");
           } else {
             // Derived from darwin_crt1 spec.
             if (getDarwinToolChain().isTargetIOSSimulator()) {
               // The simulator doesn't have a versioned crt1 file.
               CmdArgs.push_back("-lcrt1.o");
             } else if (getDarwinToolChain().isTargetIPhoneOS()) {
               if (getDarwinToolChain().isIPhoneOSVersionLT(3, 1))
                 CmdArgs.push_back("-lcrt1.o");
               else if (getDarwinToolChain().isIPhoneOSVersionLT(6, 0))
                 CmdArgs.push_back("-lcrt1.3.1.o");
             } else {
               if (getDarwinToolChain().isMacosxVersionLT(10, 5))
                 CmdArgs.push_back("-lcrt1.o");
               else if (getDarwinToolChain().isMacosxVersionLT(10, 6))
                 CmdArgs.push_back("-lcrt1.10.5.o");
               else if (getDarwinToolChain().isMacosxVersionLT(10, 8))
                 CmdArgs.push_back("-lcrt1.10.6.o");
 
               // darwin_crt2 spec is empty.
             }
           }
         }
       }
     }
 
     // crt3.o is only added for -shared-libgcc on Mac OS X targets older than
     // 10.5.
     if (!getDarwinToolChain().isTargetIPhoneOS() &&
         Args.hasArg(options::OPT_shared_libgcc) &&
         getDarwinToolChain().isMacosxVersionLT(10, 5)) {
       const char *Str =
         Args.MakeArgString(getToolChain().GetFilePath("crt3.o"));
       CmdArgs.push_back(Str);
     }
   }
 
   Args.AddAllArgs(CmdArgs, options::OPT_L);
 
   SanitizerArgs Sanitize(getToolChain().getDriver(), Args);
   // If we're building a dynamic lib with -fsanitize=address, or
   // -fsanitize=undefined, unresolved symbols may appear. Mark all
   // of them as dynamic_lookup. Linking executables is handled in
   // lib/Driver/ToolChains.cpp.
   if (Sanitize.needsAsanRt() || Sanitize.needsUbsanRt()) {
     if (Args.hasArg(options::OPT_dynamiclib) ||
         Args.hasArg(options::OPT_bundle)) {
       CmdArgs.push_back("-undefined");
       CmdArgs.push_back("dynamic_lookup");
     }
   }
 
   if (Args.hasArg(options::OPT_fopenmp))
     // This is more complicated in gcc...
     CmdArgs.push_back("-lgomp");
 
   AddLinkerInputs(getToolChain(), Inputs, Args, CmdArgs);
     
   if (isObjCRuntimeLinked(Args) &&
       !Args.hasArg(options::OPT_nostdlib) &&
       !Args.hasArg(options::OPT_nodefaultlibs)) {
     // Avoid linking compatibility stubs on i386 mac.
     if (!getDarwinToolChain().isTargetMacOS() ||
         getDarwinToolChain().getArch() != llvm::Triple::x86) {
       // If we don't have ARC or subscripting runtime support, link in the
       // runtime stubs.  We have to do this *before* adding any of the normal
       // linker inputs so that its initializer gets run first.
       //
       // NOTE(review): AddLinkerInputs() has already been called a few lines
       // above, so the statement order here does not appear to match the
       // "before" requirement described in this comment -- confirm which of
       // the two is intended.
       ObjCRuntime runtime =
         getDarwinToolChain().getDefaultObjCRuntime(/*nonfragile*/ true);
       // We use arclite library for both ARC and subscripting support.
       if ((!runtime.hasNativeARC() && isObjCAutoRefCount(Args)) ||
           !runtime.hasSubscripting())
         getDarwinToolChain().AddLinkARCArgs(Args, CmdArgs);
     }
     CmdArgs.push_back("-framework");
     CmdArgs.push_back("Foundation");
     // Link libobjc.
     CmdArgs.push_back("-lobjc");
   }
 
   if (LinkingOutput) {
     CmdArgs.push_back("-arch_multiple");
     CmdArgs.push_back("-final_output");
     CmdArgs.push_back(LinkingOutput);
   }
 
   if (Args.hasArg(options::OPT_fnested_functions))
     CmdArgs.push_back("-allow_stack_execute");
 
   if (!Args.hasArg(options::OPT_nostdlib) &&
       !Args.hasArg(options::OPT_nodefaultlibs)) {
     if (getToolChain().getDriver().CCCIsCXX)
       getToolChain().AddCXXStdlibLibArgs(Args, CmdArgs);
 
     // link_ssp spec is empty.
 
     // Let the tool chain choose which runtime library to link.
     getDarwinToolChain().AddLinkRuntimeLibArgs(Args, CmdArgs);
   }
 
   // Kept (although empty) so the structure mirrors the start-file handling
   // above; note that the hasArg() calls in the condition are still evaluated.
   if (!Args.hasArg(options::OPT_nostdlib) &&
       !Args.hasArg(options::OPT_nostartfiles)) {
     // endfile_spec is empty.
   }
 
   Args.AddAllArgs(CmdArgs, options::OPT_T_Group);
   Args.AddAllArgs(CmdArgs, options::OPT_F);
 
   const char *Exec =
     Args.MakeArgString(getToolChain().GetProgramPath("ld"));
   C.addCommand(new Command(JA, *this, Exec, CmdArgs));
 }
 
 /// ConstructJob - Build the command "lipo -create -output <output>
 /// <inputs...>" and queue it on the compilation.  The output and every input
 /// are expected to be filenames.
 void darwin::Lipo::ConstructJob(Compilation &C, const JobAction &JA,
                                 const InputInfo &Output,
                                 const InputInfoList &Inputs,
                                 const ArgList &Args,
                                 const char *LinkingOutput) const {
   assert(Output.isFilename() && "Unexpected lipo output.");
 
   ArgStringList CmdArgs;
   CmdArgs.push_back("-create");
   CmdArgs.push_back("-output");
   CmdArgs.push_back(Output.getFilename());
 
   // Every input is passed through, in order.
   for (unsigned Idx = 0, NumInputs = Inputs.size(); Idx != NumInputs; ++Idx) {
     const InputInfo &Slice = Inputs[Idx];
     assert(Slice.isFilename() && "Unexpected lipo input.");
     CmdArgs.push_back(Slice.getFilename());
   }
 
   const char *Exec =
     Args.MakeArgString(getToolChain().GetProgramPath("lipo"));
   C.addCommand(new Command(JA, *this, Exec, CmdArgs));
 }
 
 /// ConstructJob - Build the command "dsymutil -o <output> <input>" for the
 /// single filename input and queue it on the compilation.
 void darwin::Dsymutil::ConstructJob(Compilation &C, const JobAction &JA,
                                     const InputInfo &Output,
                                     const InputInfoList &Inputs,
                                     const ArgList &Args,
                                     const char *LinkingOutput) const {
   assert(Inputs.size() == 1 && "Unable to handle multiple inputs.");
   const InputInfo &Linked = Inputs[0];
   assert(Linked.isFilename() && "Unexpected dsymutil input.");
 
   ArgStringList CmdArgs;
   CmdArgs.push_back("-o");
   CmdArgs.push_back(Output.getFilename());
   CmdArgs.push_back(Linked.getFilename());
 
   const char *Exec =
     Args.MakeArgString(getToolChain().GetProgramPath("dsymutil"));
   C.addCommand(new Command(JA, *this, Exec, CmdArgs));
 }
 
 /// ConstructJob - Build the command "dwarfdump --verify --debug-info
 /// --eh-frame --quiet <input>" for the single filename input and queue it on
 /// the compilation.
 void darwin::VerifyDebug::ConstructJob(Compilation &C, const JobAction &JA,
                                        const InputInfo &Output,
                                        const InputInfoList &Inputs,
                                        const ArgList &Args,
                                        const char *LinkingOutput) const {
   // Flags that are always passed, in this order.
   static const char *const FixedFlags[] = {
     "--verify", "--debug-info", "--eh-frame", "--quiet"
   };
   const unsigned NumFixedFlags = sizeof(FixedFlags) / sizeof(FixedFlags[0]);
 
   ArgStringList CmdArgs;
   for (unsigned Idx = 0; Idx != NumFixedFlags; ++Idx)
     CmdArgs.push_back(FixedFlags[Idx]);
 
   assert(Inputs.size() == 1 && "Unable to handle multiple inputs.");
   const InputInfo &DebugInput = Inputs[0];
   assert(DebugInput.isFilename() && "Unexpected verify input");
 
   // Grabbing the output of the earlier dsymutil run.
   CmdArgs.push_back(DebugInput.getFilename());
 
   const char *Exec =
     Args.MakeArgString(getToolChain().GetProgramPath("dwarfdump"));
   C.addCommand(new Command(JA, *this, Exec, CmdArgs));
 }
 
 /// ConstructJob - Build the command "as <-Wa,/-Xassembler values> -o <output>
 /// <inputs...>" and queue it on the compilation.
 void solaris::Assemble::ConstructJob(Compilation &C, const JobAction &JA,
                                       const InputInfo &Output,
                                       const InputInfoList &Inputs,
                                       const ArgList &Args,
                                       const char *LinkingOutput) const {
   ArgStringList CmdArgs;
 
   // User-supplied assembler options come first.
   Args.AddAllArgValues(CmdArgs, options::OPT_Wa_COMMA,
                        options::OPT_Xassembler);
 
   CmdArgs.push_back("-o");
   CmdArgs.push_back(Output.getFilename());
 
   // Then every input file, in order.
   for (unsigned Idx = 0, NumInputs = Inputs.size(); Idx != NumInputs; ++Idx)
     CmdArgs.push_back(Inputs[Idx].getFilename());
 
   const char *Exec =
     Args.MakeArgString(getToolChain().GetProgramPath("as"));
   C.addCommand(new Command(JA, *this, Exec, CmdArgs));
 }
 
 
 /// Construct the Solaris "ld" invocation for a link action and queue it on
 /// the compilation.
 ///
 /// Search paths are hard-coded to a GCC 4.5.2 installation below
 /// /usr/gcc/4.5 and to /usr/lib (with an amd64/ suffix for x86_64 targets).
 /// Only x86 and x86_64 targets are handled.
 void solaris::Link::ConstructJob(Compilation &C, const JobAction &JA,
                                   const InputInfo &Output,
                                   const InputInfoList &Inputs,
                                   const ArgList &Args,
                                   const char *LinkingOutput) const {
   // FIXME: Find a real GCC, don't hard-code versions here
   std::string GCCLibPath = "/usr/gcc/4.5/lib/gcc/";
   const llvm::Triple &T = getToolChain().getTriple();
   std::string LibPath = "/usr/lib/";
   llvm::Triple::ArchType Arch = T.getArch();
   switch (Arch) {
   case llvm::Triple::x86:
     GCCLibPath += ("i386-" + T.getVendorName() + "-" +
         T.getOSName()).str() + "/4.5.2/";
     break;
   case llvm::Triple::x86_64:
     // The 64-bit runtime lives in an amd64/ subdirectory of the i386 tree.
     GCCLibPath += ("i386-" + T.getVendorName() + "-" +
         T.getOSName()).str();
     GCCLibPath += "/4.5.2/amd64/";
     LibPath += "amd64/";
     break;
   default:
     // assert(0) disappears when assertions are compiled out, which would let
     // the job continue with incomplete library paths; llvm_unreachable is the
     // LLVM idiom for a path that must never be taken.
     llvm_unreachable("Unsupported architecture");
   }
 
   ArgStringList CmdArgs;
 
   // Demangle C++ names in errors
   CmdArgs.push_back("-C");
 
   // Executables that use the standard library enter at _start.
   if ((!Args.hasArg(options::OPT_nostdlib)) &&
       (!Args.hasArg(options::OPT_shared))) {
     CmdArgs.push_back("-e");
     CmdArgs.push_back("_start");
   }
 
   if (Args.hasArg(options::OPT_static)) {
     CmdArgs.push_back("-Bstatic");
     CmdArgs.push_back("-dn");
   } else {
     CmdArgs.push_back("-Bdynamic");
     if (Args.hasArg(options::OPT_shared)) {
       CmdArgs.push_back("-shared");
     } else {
       CmdArgs.push_back("--dynamic-linker");
       CmdArgs.push_back(Args.MakeArgString(LibPath + "ld.so.1"));
     }
   }
 
   if (Output.isFilename()) {
     CmdArgs.push_back("-o");
     CmdArgs.push_back(Output.getFilename());
   } else {
     assert(Output.isNothing() && "Invalid output.");
   }
 
   // Startup objects that precede the user's inputs.
   if (!Args.hasArg(options::OPT_nostdlib) &&
       !Args.hasArg(options::OPT_nostartfiles)) {
     if (!Args.hasArg(options::OPT_shared)) {
       CmdArgs.push_back(Args.MakeArgString(LibPath + "crt1.o"));
       CmdArgs.push_back(Args.MakeArgString(LibPath + "crti.o"));
       CmdArgs.push_back(Args.MakeArgString(LibPath + "values-Xa.o"));
       CmdArgs.push_back(Args.MakeArgString(GCCLibPath + "crtbegin.o"));
     } else {
       CmdArgs.push_back(Args.MakeArgString(LibPath + "crti.o"));
       CmdArgs.push_back(Args.MakeArgString(LibPath + "values-Xa.o"));
       CmdArgs.push_back(Args.MakeArgString(GCCLibPath + "crtbegin.o"));
     }
     if (getToolChain().getDriver().CCCIsCXX)
       CmdArgs.push_back(Args.MakeArgString(LibPath + "cxa_finalize.o"));
   }
 
   CmdArgs.push_back(Args.MakeArgString("-L" + GCCLibPath));
 
   Args.AddAllArgs(CmdArgs, options::OPT_L);
   Args.AddAllArgs(CmdArgs, options::OPT_T_Group);
   Args.AddAllArgs(CmdArgs, options::OPT_e);
   Args.AddAllArgs(CmdArgs, options::OPT_r);
 
   AddLinkerInputs(getToolChain(), Inputs, Args, CmdArgs);
 
   if (!Args.hasArg(options::OPT_nostdlib) &&
       !Args.hasArg(options::OPT_nodefaultlibs)) {
     if (getToolChain().getDriver().CCCIsCXX)
       getToolChain().AddCXXStdlibLibArgs(Args, CmdArgs);
     CmdArgs.push_back("-lgcc_s");
     if (!Args.hasArg(options::OPT_shared)) {
       CmdArgs.push_back("-lgcc");
       CmdArgs.push_back("-lc");
       CmdArgs.push_back("-lm");
     }
   }
 
   // Closing startup objects. crtn.o is the counterpart of crti.o above, so it
   // is emitted under the same -nostdlib/-nostartfiles guard rather than
   // unconditionally.
   if (!Args.hasArg(options::OPT_nostdlib) &&
       !Args.hasArg(options::OPT_nostartfiles)) {
     CmdArgs.push_back(Args.MakeArgString(GCCLibPath + "crtend.o"));
     CmdArgs.push_back(Args.MakeArgString(LibPath + "crtn.o"));
   }
 
   addProfileRT(getToolChain(), Args, CmdArgs, getToolChain().getTriple());
 
   const char *Exec =
     Args.MakeArgString(getToolChain().GetProgramPath("ld"));
   C.addCommand(new Command(JA, *this, Exec, CmdArgs));
 }
 
 /// Queue a "gas" command that assembles all inputs into Output.
 void auroraux::Assemble::ConstructJob(Compilation &C, const JobAction &JA,
                                       const InputInfo &Output,
                                       const InputInfoList &Inputs,
                                       const ArgList &Args,
                                       const char *LinkingOutput) const {
   ArgStringList AsArgs;
 
   // Pass through the values of -Wa, and -Xassembler.
   Args.AddAllArgValues(AsArgs, options::OPT_Wa_COMMA,
                        options::OPT_Xassembler);
 
   AsArgs.push_back("-o");
   AsArgs.push_back(Output.getFilename());
 
   // Every input file follows the output specification.
   for (unsigned Idx = 0, NumInputs = Inputs.size(); Idx != NumInputs; ++Idx)
     AsArgs.push_back(Inputs[Idx].getFilename());
 
   const char *Assembler =
     Args.MakeArgString(getToolChain().GetProgramPath("gas"));
   C.addCommand(new Command(JA, *this, Assembler, AsArgs));
 }
 
 /// Build the AuroraUX "ld" command line and queue it on the compilation.
 void auroraux::Link::ConstructJob(Compilation &C, const JobAction &JA,
                                   const InputInfo &Output,
                                   const InputInfoList &Inputs,
                                   const ArgList &Args,
                                   const char *LinkingOutput) const {
   const ToolChain &TC = getToolChain();
   const bool NoStdLib = Args.hasArg(options::OPT_nostdlib);
   ArgStringList LdArgs;
 
   // Executables that use the standard library enter at _start.
   if (!NoStdLib && !Args.hasArg(options::OPT_shared)) {
     LdArgs.push_back("-e");
     LdArgs.push_back("_start");
   }
 
   if (!Args.hasArg(options::OPT_static)) {
     // NOTE: --eh-frame-hdr is currently disabled for this target.
     LdArgs.push_back("-Bdynamic");
     if (!Args.hasArg(options::OPT_shared)) {
       LdArgs.push_back("--dynamic-linker");
       LdArgs.push_back("/lib/ld.so.1"); // 64Bit Path /lib/amd64/ld.so.1
     } else {
       LdArgs.push_back("-shared");
     }
   } else {
     LdArgs.push_back("-Bstatic");
     LdArgs.push_back("-dn");
   }
 
   if (!Output.isFilename()) {
     assert(Output.isNothing() && "Invalid output.");
   } else {
     LdArgs.push_back("-o");
     LdArgs.push_back(Output.getFilename());
   }
 
   // Startup objects: crt1.o and crtbegin.o are only used for executables,
   // crti.o and crtn.o are used for every kind of output.
   if (!NoStdLib && !Args.hasArg(options::OPT_nostartfiles)) {
     const bool IsExecutable = !Args.hasArg(options::OPT_shared);
     if (IsExecutable)
       LdArgs.push_back(Args.MakeArgString(TC.GetFilePath("crt1.o")));
     LdArgs.push_back(Args.MakeArgString(TC.GetFilePath("crti.o")));
     if (IsExecutable)
       LdArgs.push_back(Args.MakeArgString(TC.GetFilePath("crtbegin.o")));
     LdArgs.push_back(Args.MakeArgString(TC.GetFilePath("crtn.o")));
   }
 
   // Hard-coded location of the GCC 4.2.4 runtime for this triple.
   std::string GCCSearchDir = "-L/opt/gcc4/lib/gcc/";
   GCCSearchDir += TC.getTripleString();
   GCCSearchDir += "/4.2.4";
   LdArgs.push_back(Args.MakeArgString(GCCSearchDir));
 
   Args.AddAllArgs(LdArgs, options::OPT_L);
   Args.AddAllArgs(LdArgs, options::OPT_T_Group);
   Args.AddAllArgs(LdArgs, options::OPT_e);
 
   AddLinkerInputs(TC, Inputs, Args, LdArgs);
 
   if (!NoStdLib && !Args.hasArg(options::OPT_nodefaultlibs)) {
     // FIXME: For some reason GCC passes -lgcc before adding
     // the default system libraries. Just mimic this for now.
     LdArgs.push_back("-lgcc");
 
     if (Args.hasArg(options::OPT_pthread))
       LdArgs.push_back("-pthread");
     if (!Args.hasArg(options::OPT_shared))
       LdArgs.push_back("-lc");
     LdArgs.push_back("-lgcc");
   }
 
   // crtend.o closes what crtbegin.o opened, so it is executable-only too.
   if (!NoStdLib && !Args.hasArg(options::OPT_nostartfiles) &&
       !Args.hasArg(options::OPT_shared))
     LdArgs.push_back(Args.MakeArgString(TC.GetFilePath("crtend.o")));
 
   addProfileRT(TC, Args, LdArgs, TC.getTriple());
 
   const char *Linker = Args.MakeArgString(TC.GetProgramPath("ld"));
   C.addCommand(new Command(JA, *this, Linker, LdArgs));
 }
 
 /// Queue an "as" command that assembles all inputs into Output.
 void openbsd::Assemble::ConstructJob(Compilation &C, const JobAction &JA,
                                      const InputInfo &Output,
                                      const InputInfoList &Inputs,
                                      const ArgList &Args,
                                      const char *LinkingOutput) const {
   ArgStringList AsArgs;
 
   // Pass through the values of -Wa, and -Xassembler.
   Args.AddAllArgValues(AsArgs, options::OPT_Wa_COMMA,
                        options::OPT_Xassembler);
 
   AsArgs.push_back("-o");
   AsArgs.push_back(Output.getFilename());
 
   // Every input file follows the output specification.
   for (unsigned Idx = 0, NumInputs = Inputs.size(); Idx != NumInputs; ++Idx)
     AsArgs.push_back(Inputs[Idx].getFilename());
 
   const char *Assembler =
     Args.MakeArgString(getToolChain().GetProgramPath("as"));
   C.addCommand(new Command(JA, *this, Assembler, AsArgs));
 }
 
 /// Build the OpenBSD "ld" command line and queue it on the compilation.
 void openbsd::Link::ConstructJob(Compilation &C, const JobAction &JA,
                                  const InputInfo &Output,
                                  const InputInfoList &Inputs,
                                  const ArgList &Args,
                                  const char *LinkingOutput) const {
   const ToolChain &TC = getToolChain();
   const Driver &D = TC.getDriver();
   const bool NoStdLib = Args.hasArg(options::OPT_nostdlib);
   ArgStringList LdArgs;
 
   // Executables that use the standard library enter at __start.
   if (!NoStdLib && !Args.hasArg(options::OPT_shared)) {
     LdArgs.push_back("-e");
     LdArgs.push_back("__start");
   }
 
   if (!Args.hasArg(options::OPT_static)) {
     if (Args.hasArg(options::OPT_rdynamic))
       LdArgs.push_back("-export-dynamic");
     LdArgs.push_back("--eh-frame-hdr");
     LdArgs.push_back("-Bdynamic");
     if (!Args.hasArg(options::OPT_shared)) {
       LdArgs.push_back("-dynamic-linker");
       LdArgs.push_back("/usr/libexec/ld.so");
     } else {
       LdArgs.push_back("-shared");
     }
   } else {
     LdArgs.push_back("-Bstatic");
   }
 
   if (!Output.isFilename()) {
     assert(Output.isNothing() && "Invalid output.");
   } else {
     LdArgs.push_back("-o");
     LdArgs.push_back(Output.getFilename());
   }
 
   // Opening startup objects; -pg selects the profiling variant of crt0.
   if (!NoStdLib && !Args.hasArg(options::OPT_nostartfiles)) {
     if (Args.hasArg(options::OPT_shared)) {
       LdArgs.push_back(Args.MakeArgString(TC.GetFilePath("crtbeginS.o")));
     } else {
       const char *Crt0 = Args.hasArg(options::OPT_pg) ? "gcrt0.o" : "crt0.o";
       LdArgs.push_back(Args.MakeArgString(TC.GetFilePath(Crt0)));
       LdArgs.push_back(Args.MakeArgString(TC.GetFilePath("crtbegin.o")));
     }
   }
 
   // The GCC library directory spells the x86_64 architecture as "amd64".
   std::string TripleStr = TC.getTripleString();
   if (TripleStr.compare(0, 6, "x86_64") == 0)
     TripleStr.replace(0, 6, "amd64");
   LdArgs.push_back(Args.MakeArgString("-L/usr/lib/gcc-lib/" + TripleStr +
                                       "/4.2.1"));
 
   Args.AddAllArgs(LdArgs, options::OPT_L);
   Args.AddAllArgs(LdArgs, options::OPT_T_Group);
   Args.AddAllArgs(LdArgs, options::OPT_e);
 
   AddLinkerInputs(TC, Inputs, Args, LdArgs);
 
   if (!NoStdLib && !Args.hasArg(options::OPT_nodefaultlibs)) {
     if (D.CCCIsCXX) {
       TC.AddCXXStdlibLibArgs(Args, LdArgs);
       LdArgs.push_back(Args.hasArg(options::OPT_pg) ? "-lm_p" : "-lm");
     }
 
     // FIXME: For some reason GCC passes -lgcc before adding
     // the default system libraries. Just mimic this for now.
     LdArgs.push_back("-lgcc");
 
     if (Args.hasArg(options::OPT_pthread)) {
       // The profiling pthread library is only used for -pg executables.
       const bool UseProfiled = !Args.hasArg(options::OPT_shared) &&
                                Args.hasArg(options::OPT_pg);
       LdArgs.push_back(UseProfiled ? "-lpthread_p" : "-lpthread");
     }
 
     if (!Args.hasArg(options::OPT_shared))
       LdArgs.push_back(Args.hasArg(options::OPT_pg) ? "-lc_p" : "-lc");
 
     LdArgs.push_back("-lgcc");
   }
 
   // Closing startup object matching the crtbegin variant chosen above.
   if (!NoStdLib && !Args.hasArg(options::OPT_nostartfiles)) {
     const char *CrtEnd =
       Args.hasArg(options::OPT_shared) ? "crtendS.o" : "crtend.o";
     LdArgs.push_back(Args.MakeArgString(TC.GetFilePath(CrtEnd)));
   }
 
   const char *Linker = Args.MakeArgString(TC.GetProgramPath("ld"));
   C.addCommand(new Command(JA, *this, Linker, LdArgs));
 }
 
 /// Queue an "as" command that assembles all inputs into Output.
 void bitrig::Assemble::ConstructJob(Compilation &C, const JobAction &JA,
                                     const InputInfo &Output,
                                     const InputInfoList &Inputs,
                                     const ArgList &Args,
                                     const char *LinkingOutput) const {
   ArgStringList AsArgs;
 
   // Pass through the values of -Wa, and -Xassembler.
   Args.AddAllArgValues(AsArgs, options::OPT_Wa_COMMA,
                        options::OPT_Xassembler);
 
   AsArgs.push_back("-o");
   AsArgs.push_back(Output.getFilename());
 
   // Every input file follows the output specification.
   for (unsigned Idx = 0, NumInputs = Inputs.size(); Idx != NumInputs; ++Idx)
     AsArgs.push_back(Inputs[Idx].getFilename());
 
   const char *Assembler =
     Args.MakeArgString(getToolChain().GetProgramPath("as"));
   C.addCommand(new Command(JA, *this, Assembler, AsArgs));
 }
 
 /// Build the Bitrig "ld" command line and queue it on the compilation.
 ///
 /// Besides the usual startup objects and system libraries, the link pulls in
 /// an architecture-specific "-lclang_rt.<arch>" library; only arm, x86 and
 /// x86_64 targets are handled for that.
 void bitrig::Link::ConstructJob(Compilation &C, const JobAction &JA,
                                 const InputInfo &Output,
                                 const InputInfoList &Inputs,
                                 const ArgList &Args,
                                 const char *LinkingOutput) const {
   const Driver &D = getToolChain().getDriver();
   ArgStringList CmdArgs;
 
   // Executables that use the standard library enter at __start.
   if ((!Args.hasArg(options::OPT_nostdlib)) &&
       (!Args.hasArg(options::OPT_shared))) {
     CmdArgs.push_back("-e");
     CmdArgs.push_back("__start");
   }
 
   if (Args.hasArg(options::OPT_static)) {
     CmdArgs.push_back("-Bstatic");
   } else {
     if (Args.hasArg(options::OPT_rdynamic))
       CmdArgs.push_back("-export-dynamic");
     CmdArgs.push_back("--eh-frame-hdr");
     CmdArgs.push_back("-Bdynamic");
     if (Args.hasArg(options::OPT_shared)) {
       CmdArgs.push_back("-shared");
     } else {
       CmdArgs.push_back("-dynamic-linker");
       CmdArgs.push_back("/usr/libexec/ld.so");
     }
   }
 
   if (Output.isFilename()) {
     CmdArgs.push_back("-o");
     CmdArgs.push_back(Output.getFilename());
   } else {
     assert(Output.isNothing() && "Invalid output.");
   }
 
   // Opening startup objects; -pg selects the profiling variant of crt0.
   if (!Args.hasArg(options::OPT_nostdlib) &&
       !Args.hasArg(options::OPT_nostartfiles)) {
     if (!Args.hasArg(options::OPT_shared)) {
       if (Args.hasArg(options::OPT_pg))
         CmdArgs.push_back(Args.MakeArgString(
                                 getToolChain().GetFilePath("gcrt0.o")));
       else
         CmdArgs.push_back(Args.MakeArgString(
                                 getToolChain().GetFilePath("crt0.o")));
       CmdArgs.push_back(Args.MakeArgString(
                               getToolChain().GetFilePath("crtbegin.o")));
     } else {
       CmdArgs.push_back(Args.MakeArgString(
                               getToolChain().GetFilePath("crtbeginS.o")));
     }
   }
 
   Args.AddAllArgs(CmdArgs, options::OPT_L);
   Args.AddAllArgs(CmdArgs, options::OPT_T_Group);
   Args.AddAllArgs(CmdArgs, options::OPT_e);
 
   AddLinkerInputs(getToolChain(), Inputs, Args, CmdArgs);
 
   if (!Args.hasArg(options::OPT_nostdlib) &&
       !Args.hasArg(options::OPT_nodefaultlibs)) {
     if (D.CCCIsCXX) {
       getToolChain().AddCXXStdlibLibArgs(Args, CmdArgs);
       if (Args.hasArg(options::OPT_pg))
         CmdArgs.push_back("-lm_p");
       else
         CmdArgs.push_back("-lm");
     }
 
     if (Args.hasArg(options::OPT_pthread)) {
       // The profiling pthread library is only used for -pg executables.
       if (!Args.hasArg(options::OPT_shared) &&
           Args.hasArg(options::OPT_pg))
         CmdArgs.push_back("-lpthread_p");
       else
         CmdArgs.push_back("-lpthread");
     }
 
     if (!Args.hasArg(options::OPT_shared)) {
       if (Args.hasArg(options::OPT_pg))
         CmdArgs.push_back("-lc_p");
       else
         CmdArgs.push_back("-lc");
     }
 
     // Select the runtime library named after the target architecture.
     std::string myarch = "-lclang_rt.";
     const llvm::Triple &T = getToolChain().getTriple();
     llvm::Triple::ArchType Arch = T.getArch();
     switch (Arch) {
     case llvm::Triple::arm:
       myarch += ("arm");
       break;
     case llvm::Triple::x86:
       myarch += ("i386");
       break;
     case llvm::Triple::x86_64:
       myarch += ("amd64");
       break;
     default:
       // assert(0) disappears when assertions are compiled out, which would
       // hand a bare "-lclang_rt." to the linker; llvm_unreachable is the
       // LLVM idiom for a path that must never be taken.
       llvm_unreachable("Unsupported architecture");
     }
     CmdArgs.push_back(Args.MakeArgString(myarch));
   }
 
   // Closing startup object matching the crtbegin variant chosen above.
   if (!Args.hasArg(options::OPT_nostdlib) &&
       !Args.hasArg(options::OPT_nostartfiles)) {
     if (!Args.hasArg(options::OPT_shared))
       CmdArgs.push_back(Args.MakeArgString(
                               getToolChain().GetFilePath("crtend.o")));
     else
       CmdArgs.push_back(Args.MakeArgString(
                               getToolChain().GetFilePath("crtendS.o")));
   }
 
   const char *Exec =
     Args.MakeArgString(getToolChain().GetProgramPath("ld"));
   C.addCommand(new Command(JA, *this, Exec, CmdArgs));
 }
 
 /// Queue an "as" command for FreeBSD, adding the target-specific assembler
 /// flags that the selected architecture requires.
 void freebsd::Assemble::ConstructJob(Compilation &C, const JobAction &JA,
                                      const InputInfo &Output,
                                      const InputInfoList &Inputs,
                                      const ArgList &Args,
                                      const char *LinkingOutput) const {
   const ToolChain &TC = getToolChain();
   const llvm::Triple::ArchType Arch = TC.getArch();
   ArgStringList AsArgs;
 
   switch (Arch) {
   case llvm::Triple::x86:
     // When building 32-bit code on FreeBSD/amd64, we have to explicitly
     // instruct as in the base system to assemble 32-bit code.
     AsArgs.push_back("--32");
     break;
 
   case llvm::Triple::ppc:
     AsArgs.push_back("-a32");
     break;
 
   case llvm::Triple::mips:
   case llvm::Triple::mipsel:
   case llvm::Triple::mips64:
   case llvm::Triple::mips64el: {
     StringRef CPUName;
     StringRef ABIName;
     getMipsCPUAndABI(Args, TC, CPUName, ABIName);
 
     AsArgs.push_back("-march");
     AsArgs.push_back(CPUName.data());
 
     // Convert ABI name to the GNU tools acceptable variant.
     if (ABIName == "o32")
       ABIName = "32";
     else if (ABIName == "n64")
       ABIName = "64";
 
     AsArgs.push_back("-mabi");
     AsArgs.push_back(ABIName.data());
 
     // The non-"el" variants are assembled big-endian, the rest little-endian.
     const bool BigEndian = Arch == llvm::Triple::mips ||
                            Arch == llvm::Triple::mips64;
     AsArgs.push_back(BigEndian ? "-EB" : "-EL");
 
     // Only the last PIC/PIE-related flag on the command line counts.
     Arg *LastPICArg = Args.getLastArg(options::OPT_fPIC, options::OPT_fno_PIC,
                                       options::OPT_fpic, options::OPT_fno_pic,
                                       options::OPT_fPIE, options::OPT_fno_PIE,
                                       options::OPT_fpie, options::OPT_fno_pie);
     if (LastPICArg) {
       if (LastPICArg->getOption().matches(options::OPT_fPIC) ||
           LastPICArg->getOption().matches(options::OPT_fpic) ||
           LastPICArg->getOption().matches(options::OPT_fPIE) ||
           LastPICArg->getOption().matches(options::OPT_fpie))
         AsArgs.push_back("-KPIC");
     }
     break;
   }
 
   case llvm::Triple::arm:
   case llvm::Triple::thumb: {
     AsArgs.push_back("-mfpu=softvfp");
     // Every environment other than the EABI ones gets -matpcs.
     const llvm::Triple::EnvironmentType Env = TC.getTriple().getEnvironment();
     if (Env != llvm::Triple::GNUEABI && Env != llvm::Triple::EABI)
       AsArgs.push_back("-matpcs");
     break;
   }
 
   default:
     break;
   }
 
   // Pass through the values of -Wa, and -Xassembler.
   Args.AddAllArgValues(AsArgs, options::OPT_Wa_COMMA,
                        options::OPT_Xassembler);
 
   AsArgs.push_back("-o");
   AsArgs.push_back(Output.getFilename());
 
   for (unsigned Idx = 0, NumInputs = Inputs.size(); Idx != NumInputs; ++Idx)
     AsArgs.push_back(Inputs[Idx].getFilename());
 
   const char *Assembler = Args.MakeArgString(TC.GetProgramPath("as"));
   C.addCommand(new Command(JA, *this, Assembler, AsArgs));
 }
 
 /// Build the FreeBSD "ld" command line and queue it on the compilation.
 ///
 /// The emitted arguments are, in order: sysroot and output-kind flags, the
 /// emulation selection for x86/ppc, the output file, the opening startup
 /// objects, library search paths and pass-through options, the linker inputs,
 /// the default libraries, the closing startup objects and the profiling
 /// runtime.
 void freebsd::Link::ConstructJob(Compilation &C, const JobAction &JA,
                                  const InputInfo &Output,
                                  const InputInfoList &Inputs,
                                  const ArgList &Args,
                                  const char *LinkingOutput) const {
   // NOTE: this local is deliberately typed as the FreeBSD toolchain; it also
   // shares its name with the ToolChain class used further down.
   const toolchains::FreeBSD& ToolChain = 
     static_cast<const toolchains::FreeBSD&>(getToolChain());
   const Driver &D = ToolChain.getDriver();
   ArgStringList CmdArgs;
 
   // Silence warning for "clang -g foo.o -o foo"
   Args.ClaimAllArgs(options::OPT_g_Group);
   // and "clang -emit-llvm foo.o -o foo"
   Args.ClaimAllArgs(options::OPT_emit_llvm);
   // and for "clang -w foo.o -o foo". Other warning options are already
   // handled somewhere else.
   Args.ClaimAllArgs(options::OPT_w);
 
   if (!D.SysRoot.empty())
     CmdArgs.push_back(Args.MakeArgString("--sysroot=" + D.SysRoot));
 
   if (Args.hasArg(options::OPT_pie))
     CmdArgs.push_back("-pie");
 
   if (Args.hasArg(options::OPT_static)) {
     CmdArgs.push_back("-Bstatic");
   } else {
     if (Args.hasArg(options::OPT_rdynamic))
       CmdArgs.push_back("-export-dynamic");
     CmdArgs.push_back("--eh-frame-hdr");
     if (Args.hasArg(options::OPT_shared)) {
       CmdArgs.push_back("-Bshareable");
     } else {
       CmdArgs.push_back("-dynamic-linker");
       CmdArgs.push_back("/libexec/ld-elf.so.1");
     }
     // --hash-style=both is only requested for OS major version 9 and later,
     // and only on the architectures listed here.
     if (ToolChain.getTriple().getOSMajorVersion() >= 9) {
       llvm::Triple::ArchType Arch = ToolChain.getArch();
       if (Arch == llvm::Triple::arm || Arch == llvm::Triple::sparc ||
           Arch == llvm::Triple::x86 || Arch == llvm::Triple::x86_64) {
         CmdArgs.push_back("--hash-style=both");
       }
     }
     CmdArgs.push_back("--enable-new-dtags");
   }
 
   // When building 32-bit code on FreeBSD/amd64, we have to explicitly
   // instruct ld in the base system to link 32-bit code.
   if (ToolChain.getArch() == llvm::Triple::x86) {
     CmdArgs.push_back("-m");
     CmdArgs.push_back("elf_i386_fbsd");
   }
 
   // 32-bit PowerPC likewise gets an explicit emulation.
   if (ToolChain.getArch() == llvm::Triple::ppc) {
     CmdArgs.push_back("-m");
     CmdArgs.push_back("elf32ppc_fbsd");
   }
 
   if (Output.isFilename()) {
     CmdArgs.push_back("-o");
     CmdArgs.push_back(Output.getFilename());
   } else {
     assert(Output.isNothing() && "Invalid output.");
   }
 
   // Opening startup objects, skipped for -nostdlib and -nostartfiles.
   if (!Args.hasArg(options::OPT_nostdlib) &&
       !Args.hasArg(options::OPT_nostartfiles)) {
     // crt1 is only linked into executables: gcrt1.o with -pg, Scrt1.o with
     // -pie, crt1.o otherwise. It stays NULL for shared libraries.
     const char *crt1 = NULL;
     if (!Args.hasArg(options::OPT_shared)) {
       if (Args.hasArg(options::OPT_pg))
         crt1 = "gcrt1.o";
       else if (Args.hasArg(options::OPT_pie))
         crt1 = "Scrt1.o";
       else
         crt1 = "crt1.o";
     }
     if (crt1)
       CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath(crt1)));
 
     CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath("crti.o")));
 
     // -static takes precedence over -shared/-pie when choosing crtbegin.
     const char *crtbegin = NULL;
     if (Args.hasArg(options::OPT_static))
       crtbegin = "crtbeginT.o";
     else if (Args.hasArg(options::OPT_shared) || Args.hasArg(options::OPT_pie))
       crtbegin = "crtbeginS.o";
     else
       crtbegin = "crtbegin.o";
 
     CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath(crtbegin)));
   }
 
   // User -L options come first, followed by one -L per toolchain file path.
   Args.AddAllArgs(CmdArgs, options::OPT_L);
   const ToolChain::path_list Paths = ToolChain.getFilePaths();
   for (ToolChain::path_list::const_iterator i = Paths.begin(), e = Paths.end();
        i != e; ++i)
     CmdArgs.push_back(Args.MakeArgString(StringRef("-L") + *i));
   Args.AddAllArgs(CmdArgs, options::OPT_T_Group);
   Args.AddAllArgs(CmdArgs, options::OPT_e);
   Args.AddAllArgs(CmdArgs, options::OPT_s);
   Args.AddAllArgs(CmdArgs, options::OPT_t);
   Args.AddAllArgs(CmdArgs, options::OPT_Z_Flag);
   Args.AddAllArgs(CmdArgs, options::OPT_r);
 
   AddLinkerInputs(ToolChain, Inputs, Args, CmdArgs);
 
   // Default libraries, skipped for -nostdlib and -nodefaultlibs. With -pg the
   // "_p" variants of the libraries are used.
   if (!Args.hasArg(options::OPT_nostdlib) &&
       !Args.hasArg(options::OPT_nodefaultlibs)) {
     if (D.CCCIsCXX) {
       ToolChain.AddCXXStdlibLibArgs(Args, CmdArgs);
       if (Args.hasArg(options::OPT_pg))
         CmdArgs.push_back("-lm_p");
       else
         CmdArgs.push_back("-lm");
     }
     // FIXME: For some reason GCC passes -lgcc and -lgcc_s before adding
     // the default system libraries. Just mimic this for now.
     if (Args.hasArg(options::OPT_pg))
       CmdArgs.push_back("-lgcc_p");
     else
       CmdArgs.push_back("-lgcc");
     if (Args.hasArg(options::OPT_static)) {
       CmdArgs.push_back("-lgcc_eh");
     } else if (Args.hasArg(options::OPT_pg)) {
       CmdArgs.push_back("-lgcc_eh_p");
     } else {
       CmdArgs.push_back("--as-needed");
       CmdArgs.push_back("-lgcc_s");
       CmdArgs.push_back("--no-as-needed");
     }
 
     if (Args.hasArg(options::OPT_pthread)) {
       if (Args.hasArg(options::OPT_pg))
         CmdArgs.push_back("-lpthread_p");
       else
         CmdArgs.push_back("-lpthread");
     }
 
     // With -pg, a shared link still uses the regular libc; only executables
     // get libc_p.
     if (Args.hasArg(options::OPT_pg)) {
       if (Args.hasArg(options::OPT_shared))
         CmdArgs.push_back("-lc");
       else
         CmdArgs.push_back("-lc_p");
       CmdArgs.push_back("-lgcc_p");
     } else {
       CmdArgs.push_back("-lc");
       CmdArgs.push_back("-lgcc");
     }
 
     // Same libgcc EH/shared selection as before libc, repeated after it.
     if (Args.hasArg(options::OPT_static)) {
       CmdArgs.push_back("-lgcc_eh");
     } else if (Args.hasArg(options::OPT_pg)) {
       CmdArgs.push_back("-lgcc_eh_p");
     } else {
       CmdArgs.push_back("--as-needed");
       CmdArgs.push_back("-lgcc_s");
       CmdArgs.push_back("--no-as-needed");
     }
   }
 
   // Closing startup objects, skipped for -nostdlib and -nostartfiles.
   if (!Args.hasArg(options::OPT_nostdlib) &&
       !Args.hasArg(options::OPT_nostartfiles)) {
     if (Args.hasArg(options::OPT_shared) || Args.hasArg(options::OPT_pie))
       CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath("crtendS.o")));
     else
       CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath("crtend.o")));
     CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath("crtn.o")));
   }
 
   addProfileRT(ToolChain, Args, CmdArgs, ToolChain.getTriple());
 
   const char *Exec =
     Args.MakeArgString(ToolChain.GetProgramPath("ld"));
   C.addCommand(new Command(JA, *this, Exec, CmdArgs));
 }
 
 /// Queue an "as" command for NetBSD, adding the word-size or byte-order flag
 /// that the target architecture requires.
 void netbsd::Assemble::ConstructJob(Compilation &C, const JobAction &JA,
                                      const InputInfo &Output,
                                      const InputInfoList &Inputs,
                                      const ArgList &Args,
                                      const char *LinkingOutput) const {
   const ToolChain &TC = getToolChain();
   ArgStringList AsArgs;
 
   switch (TC.getArch()) {
   case llvm::Triple::x86:
     // When building 32-bit code on NetBSD/amd64, we have to explicitly
     // instruct as in the base system to assemble 32-bit code.
     AsArgs.push_back("--32");
     break;
   // Set byte order explicitly
   case llvm::Triple::mips:
     AsArgs.push_back("-EB");
     break;
   case llvm::Triple::mipsel:
     AsArgs.push_back("-EL");
     break;
   default:
     break;
   }
 
   // Pass through the values of -Wa, and -Xassembler.
   Args.AddAllArgValues(AsArgs, options::OPT_Wa_COMMA,
                        options::OPT_Xassembler);
 
   AsArgs.push_back("-o");
   AsArgs.push_back(Output.getFilename());
 
   for (unsigned Idx = 0, NumInputs = Inputs.size(); Idx != NumInputs; ++Idx)
     AsArgs.push_back(Inputs[Idx].getFilename());
 
   const char *Assembler = Args.MakeArgString((TC.GetProgramPath("as")));
   C.addCommand(new Command(JA, *this, Assembler, AsArgs));
 }
 
 /// Build the NetBSD "ld" command line and queue it on the compilation.
 void netbsd::Link::ConstructJob(Compilation &C, const JobAction &JA,
                                  const InputInfo &Output,
                                  const InputInfoList &Inputs,
                                  const ArgList &Args,
                                  const char *LinkingOutput) const {
   const ToolChain &TC = getToolChain();
   const Driver &D = TC.getDriver();
   const bool IsStatic = Args.hasArg(options::OPT_static);
   const bool NoStdLib = Args.hasArg(options::OPT_nostdlib);
   ArgStringList LdArgs;
 
   if (!D.SysRoot.empty())
     LdArgs.push_back(Args.MakeArgString("--sysroot=" + D.SysRoot));
 
   if (!IsStatic) {
     if (Args.hasArg(options::OPT_rdynamic))
       LdArgs.push_back("-export-dynamic");
     LdArgs.push_back("--eh-frame-hdr");
     if (!Args.hasArg(options::OPT_shared)) {
       LdArgs.push_back("-dynamic-linker");
       LdArgs.push_back("/libexec/ld.elf_so");
     } else {
       LdArgs.push_back("-Bshareable");
     }
   } else {
     LdArgs.push_back("-Bstatic");
   }
 
   // When building 32-bit code on NetBSD/amd64, we have to explicitly
   // instruct ld in the base system to link 32-bit code.
   if (TC.getArch() == llvm::Triple::x86) {
     LdArgs.push_back("-m");
     LdArgs.push_back("elf_i386");
   }
 
   if (!Output.isFilename()) {
     assert(Output.isNothing() && "Invalid output.");
   } else {
     LdArgs.push_back("-o");
     LdArgs.push_back(Output.getFilename());
   }
 
   // Opening startup objects: crt0.o is executable-only, crti.o is always
   // present, and the crtbegin variant depends on -shared.
   if (!NoStdLib && !Args.hasArg(options::OPT_nostartfiles)) {
     const bool IsShared = Args.hasArg(options::OPT_shared);
     if (!IsShared)
       LdArgs.push_back(Args.MakeArgString(TC.GetFilePath("crt0.o")));
     LdArgs.push_back(Args.MakeArgString(TC.GetFilePath("crti.o")));
     LdArgs.push_back(Args.MakeArgString(
                        TC.GetFilePath(IsShared ? "crtbeginS.o" : "crtbegin.o")));
   }
 
   Args.AddAllArgs(LdArgs, options::OPT_L);
   Args.AddAllArgs(LdArgs, options::OPT_T_Group);
   Args.AddAllArgs(LdArgs, options::OPT_e);
   Args.AddAllArgs(LdArgs, options::OPT_s);
   Args.AddAllArgs(LdArgs, options::OPT_t);
   Args.AddAllArgs(LdArgs, options::OPT_Z_Flag);
   Args.AddAllArgs(LdArgs, options::OPT_r);
 
   AddLinkerInputs(TC, Inputs, Args, LdArgs);
 
   if (!NoStdLib && !Args.hasArg(options::OPT_nodefaultlibs)) {
     if (D.CCCIsCXX) {
       TC.AddCXXStdlibLibArgs(Args, LdArgs);
       LdArgs.push_back("-lm");
     }
     // FIXME: For some reason GCC passes -lgcc and -lgcc_s before adding
     // the default system libraries. Just mimic this for now.
     if (!IsStatic) {
       LdArgs.push_back("--as-needed");
       LdArgs.push_back("-lgcc_s");
       LdArgs.push_back("--no-as-needed");
     } else {
       LdArgs.push_back("-lgcc_eh");
     }
     LdArgs.push_back("-lgcc");
 
     if (Args.hasArg(options::OPT_pthread))
       LdArgs.push_back("-lpthread");
     LdArgs.push_back("-lc");
 
     // The libgcc pieces are repeated after libc.
     LdArgs.push_back("-lgcc");
     if (!IsStatic) {
       LdArgs.push_back("--as-needed");
       LdArgs.push_back("-lgcc_s");
       LdArgs.push_back("--no-as-needed");
     } else {
       LdArgs.push_back("-lgcc_eh");
     }
   }
 
   // Closing startup objects matching the ones emitted before the inputs.
   if (!NoStdLib && !Args.hasArg(options::OPT_nostartfiles)) {
     const char *CrtEnd =
       Args.hasArg(options::OPT_shared) ? "crtendS.o" : "crtend.o";
     LdArgs.push_back(Args.MakeArgString(TC.GetFilePath(CrtEnd)));
     LdArgs.push_back(Args.MakeArgString(TC.GetFilePath("crtn.o")));
   }
 
   addProfileRT(TC, Args, LdArgs, TC.getTriple());
 
   const char *Linker = Args.MakeArgString(TC.GetProgramPath("ld"));
   C.addCommand(new Command(JA, *this, Linker, LdArgs));
 }
 
 /// Construct the external assembler ("as") job for a Linux toolchain.
 ///
 /// The command line is built in this order: architecture-specific selection
 /// flags, the values of any OPT_Wa_COMMA / OPT_Xassembler arguments, "-o"
 /// followed by the output filename, and finally every input filename.
 /// \p LinkingOutput is not used.
 void linuxtools::Assemble::ConstructJob(Compilation &C, const JobAction &JA,
                                         const InputInfo &Output,
                                         const InputInfoList &Inputs,
                                         const ArgList &Args,
                                         const char *LinkingOutput) const {
   const ToolChain &TC = getToolChain();
   const llvm::Triple::ArchType Arch = TC.getArch();
   ArgStringList AsArgs;
 
   // Add the flags that make sure we get the object format we want.  This is
   // incomplete: architectures without a case below get no extra flags.
   switch (Arch) {
   case llvm::Triple::x86:
     AsArgs.push_back("--32");
     break;
 
   case llvm::Triple::x86_64:
     AsArgs.push_back("--64");
     break;
 
   case llvm::Triple::ppc:
     AsArgs.push_back("-a32");
     AsArgs.push_back("-mppc");
     AsArgs.push_back("-many");
     break;
 
   case llvm::Triple::ppc64:
     AsArgs.push_back("-a64");
     AsArgs.push_back("-mppc64");
     AsArgs.push_back("-many");
     break;
 
   case llvm::Triple::arm: {
     // ARMv7-A style architecture names get a NEON FPU flag up front.
     const StringRef ArchName = TC.getArchName();
     if (ArchName == "armv7" || ArchName == "armv7a" || ArchName == "armv7-a")
       AsArgs.push_back("-mfpu=neon");
 
     const StringRef FloatABI =
       getARMFloatABI(TC.getDriver(), Args, TC.getTriple());
     AsArgs.push_back(Args.MakeArgString("-mfloat-abi=" + FloatABI));
 
     // Forward the last user-supplied arch / cpu / fpu selections; they are
     // emitted after the flags added above.
     Args.AddLastArg(AsArgs, options::OPT_march_EQ);
     Args.AddLastArg(AsArgs, options::OPT_mcpu_EQ);
     Args.AddLastArg(AsArgs, options::OPT_mfpu_EQ);
     break;
   }
 
   case llvm::Triple::mips:
   case llvm::Triple::mipsel:
   case llvm::Triple::mips64:
   case llvm::Triple::mips64el: {
     StringRef CPUName;
     StringRef ABIName;
     getMipsCPUAndABI(Args, TC, CPUName, ABIName);
 
     // NOTE(review): StringRef::data() is not guaranteed to be
     // NUL-terminated; this relies on getMipsCPUAndABI handing back views of
     // C strings -- confirm against its definition.
     AsArgs.push_back("-march");
     AsArgs.push_back(CPUName.data());
 
     // Translate the ABI name into the spelling the GNU tools accept.
     if (ABIName == "o32")
       ABIName = "32";
     else if (ABIName == "n64")
       ABIName = "64";
 
     AsArgs.push_back("-mabi");
     AsArgs.push_back(ABIName.data());
 
     // mips / mips64 are the big-endian variants; the "el" ones are little.
     const bool BigEndian =
       Arch == llvm::Triple::mips || Arch == llvm::Triple::mips64;
     AsArgs.push_back(BigEndian ? "-EB" : "-EL");
 
     // Only the last PIC/PIE-related flag on the command line counts.
     Arg *LastPICArg = Args.getLastArg(options::OPT_fPIC, options::OPT_fno_PIC,
                                       options::OPT_fpic, options::OPT_fno_pic,
                                       options::OPT_fPIE, options::OPT_fno_PIE,
                                       options::OPT_fpie, options::OPT_fno_pie);
     if (LastPICArg) {
       const bool EnablesPIC =
         LastPICArg->getOption().matches(options::OPT_fPIC) ||
         LastPICArg->getOption().matches(options::OPT_fpic) ||
         LastPICArg->getOption().matches(options::OPT_fPIE) ||
         LastPICArg->getOption().matches(options::OPT_fpie);
       if (EnablesPIC)
         AsArgs.push_back("-KPIC");
     }
     break;
   }
 
   default:
     break;
   }
 
   Args.AddAllArgValues(AsArgs, options::OPT_Wa_COMMA,
                        options::OPT_Xassembler);
 
   AsArgs.push_back("-o");
   AsArgs.push_back(Output.getFilename());
 
   for (InputInfoList::const_iterator In = Inputs.begin(), InEnd = Inputs.end();
        In != InEnd; ++In)
     AsArgs.push_back(In->getFilename());
 
   const char *AsPath = Args.MakeArgString(TC.GetProgramPath("as"));
   C.addCommand(new Command(JA, *this, AsPath, AsArgs));
 }
 
 /// Append the libgcc family of libraries to a linker command line.
 ///
 /// What is emitted depends on three things: whether the driver is in C++
 /// mode (D.CCCIsCXX), whether a static libgcc is wanted (Android
 /// environment, OPT_static or OPT_static_libgcc), and whether OPT_shared
 /// was given.
 static void AddLibgcc(llvm::Triple Triple, const Driver &D,
                       ArgStringList &CmdArgs, const ArgList &Args) {
   const bool IsCXX = D.CCCIsCXX;
   const bool IsAndroid = Triple.getEnvironment() == llvm::Triple::Android;
   // The Android environment forces the static form regardless of flags.
   const bool WantStatic = IsAndroid || Args.hasArg(options::OPT_static) ||
     Args.hasArg(options::OPT_static_libgcc);
 
   // In C mode -lgcc always leads.
   if (!IsCXX)
     CmdArgs.push_back("-lgcc");
 
   if (!WantStatic) {
     // Shared libgcc: unconditional in C++ mode, as-needed in C mode.
     if (IsCXX) {
       CmdArgs.push_back("-lgcc_s");
     } else {
       CmdArgs.push_back("--as-needed");
       CmdArgs.push_back("-lgcc_s");
       CmdArgs.push_back("--no-as-needed");
     }
   } else if (IsCXX) {
     CmdArgs.push_back("-lgcc");
   }
 
   // Static, non-Android links add -lgcc_eh and are done.
   if (WantStatic && !IsAndroid) {
     CmdArgs.push_back("-lgcc_eh");
     return;
   }
 
   if (!Args.hasArg(options::OPT_shared) && IsCXX)
     CmdArgs.push_back("-lgcc");
 }
 
 static bool hasMipsN32ABIArg(const ArgList &Args) {
   Arg *A = Args.getLastArg(options::OPT_mabi_EQ);
   return A && (A->getValue() == StringRef("n32"));
 }
 
 /// Construct the link job for a Linux toolchain.
 ///
 /// Arguments are appended in the order they appear below: general flags
 /// (--sysroot, -pie, -export-dynamic, -s, the toolchain's ExtraOpts,
 /// --eh-frame-hdr), the "-m" emulation, the link mode, the dynamic linker,
 /// the output, start files, -L search paths, optional gold plugin setup,
 /// the linker inputs, sanitizer / C++ / libgcc / libc libraries, end files
 /// and finally the profile runtime.  The job runs ToolChain.Linker.
 /// \p LinkingOutput is not used.
 void linuxtools::Link::ConstructJob(Compilation &C, const JobAction &JA,
                                     const InputInfo &Output,
                                     const InputInfoList &Inputs,
                                     const ArgList &Args,
                                     const char *LinkingOutput) const {
   // The Linux-specific toolchain type is needed for ExtraOpts and Linker.
   const toolchains::Linux& ToolChain =
     static_cast<const toolchains::Linux&>(getToolChain());
   const Driver &D = ToolChain.getDriver();
   const bool isAndroid =
     ToolChain.getTriple().getEnvironment() == llvm::Triple::Android;
 
   ArgStringList CmdArgs;
 
   // Silence warning for "clang -g foo.o -o foo"
   Args.ClaimAllArgs(options::OPT_g_Group);
   // and "clang -emit-llvm foo.o -o foo"
   Args.ClaimAllArgs(options::OPT_emit_llvm);
   // and for "clang -w foo.o -o foo". Other warning options are already
   // handled somewhere else.
   Args.ClaimAllArgs(options::OPT_w);
 
   if (!D.SysRoot.empty())
     CmdArgs.push_back(Args.MakeArgString("--sysroot=" + D.SysRoot));
 
   if (Args.hasArg(options::OPT_pie))
     CmdArgs.push_back("-pie");
 
   if (Args.hasArg(options::OPT_rdynamic))
     CmdArgs.push_back("-export-dynamic");
 
   if (Args.hasArg(options::OPT_s))
     CmdArgs.push_back("-s");
 
   // Options the toolchain itself wants on every link.
   for (std::vector<std::string>::const_iterator i = ToolChain.ExtraOpts.begin(),
          e = ToolChain.ExtraOpts.end();
        i != e; ++i)
     CmdArgs.push_back(i->c_str());
 
   if (!Args.hasArg(options::OPT_static)) {
     CmdArgs.push_back("--eh-frame-hdr");
   }
 
   // Select the linker emulation.  Any architecture without an explicit
   // branch below falls through to elf_x86_64.
   CmdArgs.push_back("-m");
   if (ToolChain.getArch() == llvm::Triple::x86)
     CmdArgs.push_back("elf_i386");
   else if (ToolChain.getArch() == llvm::Triple::arm
            ||  ToolChain.getArch() == llvm::Triple::thumb)
     CmdArgs.push_back("armelf_linux_eabi");
   else if (ToolChain.getArch() == llvm::Triple::ppc)
     CmdArgs.push_back("elf32ppclinux");
   else if (ToolChain.getArch() == llvm::Triple::ppc64)
     CmdArgs.push_back("elf64ppc");
   else if (ToolChain.getArch() == llvm::Triple::mips)
     CmdArgs.push_back("elf32btsmip");
   else if (ToolChain.getArch() == llvm::Triple::mipsel)
     CmdArgs.push_back("elf32ltsmip");
   else if (ToolChain.getArch() == llvm::Triple::mips64) {
     if (hasMipsN32ABIArg(Args))
       CmdArgs.push_back("elf32btsmipn32");
     else
       CmdArgs.push_back("elf64btsmip");
   }
   else if (ToolChain.getArch() == llvm::Triple::mips64el) {
     if (hasMipsN32ABIArg(Args))
       CmdArgs.push_back("elf32ltsmipn32");
     else
       CmdArgs.push_back("elf64ltsmip");
   }
   else
     CmdArgs.push_back("elf_x86_64");
 
   // Link mode.  OPT_static takes precedence over OPT_shared; ARM/Thumb spell
   // the static flag as -Bstatic.
   if (Args.hasArg(options::OPT_static)) {
     if (ToolChain.getArch() == llvm::Triple::arm
         || ToolChain.getArch() == llvm::Triple::thumb)
       CmdArgs.push_back("-Bstatic");
     else
       CmdArgs.push_back("-static");
   } else if (Args.hasArg(options::OPT_shared)) {
     CmdArgs.push_back("-shared");
     if (isAndroid) {
       CmdArgs.push_back("-Bsymbolic");
     }
   }
 
   // A dynamic linker is named for every ARM/Thumb link, and for other
   // architectures only when neither OPT_static nor OPT_shared was given.
   // Unlisted architectures get the x86-64 loader path.
   if (ToolChain.getArch() == llvm::Triple::arm ||
       ToolChain.getArch() == llvm::Triple::thumb ||
       (!Args.hasArg(options::OPT_static) &&
        !Args.hasArg(options::OPT_shared))) {
     CmdArgs.push_back("-dynamic-linker");
     if (isAndroid)
       CmdArgs.push_back("/system/bin/linker");
     else if (ToolChain.getArch() == llvm::Triple::x86)
       CmdArgs.push_back("/lib/ld-linux.so.2");
     else if (ToolChain.getArch() == llvm::Triple::arm ||
              ToolChain.getArch() == llvm::Triple::thumb) {
       if (ToolChain.getTriple().getEnvironment() == llvm::Triple::GNUEABIHF)
         CmdArgs.push_back("/lib/ld-linux-armhf.so.3");
       else
         CmdArgs.push_back("/lib/ld-linux.so.3");
     }
     else if (ToolChain.getArch() == llvm::Triple::mips ||
              ToolChain.getArch() == llvm::Triple::mipsel)
       CmdArgs.push_back("/lib/ld.so.1");
     else if (ToolChain.getArch() == llvm::Triple::mips64 ||
              ToolChain.getArch() == llvm::Triple::mips64el) {
       if (hasMipsN32ABIArg(Args))
         CmdArgs.push_back("/lib32/ld.so.1");
       else
         CmdArgs.push_back("/lib64/ld.so.1");
     }
     else if (ToolChain.getArch() == llvm::Triple::ppc)
       CmdArgs.push_back("/lib/ld.so.1");
     else if (ToolChain.getArch() == llvm::Triple::ppc64)
       CmdArgs.push_back("/lib64/ld64.so.1");
     else
       CmdArgs.push_back("/lib64/ld-linux-x86-64.so.2");
   }
 
   CmdArgs.push_back("-o");
   CmdArgs.push_back(Output.getFilename());
 
   // Start files.  Android skips crt1/crti and uses its own crtbegin_* names.
   if (!Args.hasArg(options::OPT_nostdlib) &&
       !Args.hasArg(options::OPT_nostartfiles)) {
     if (!isAndroid) {
       // Shared objects get no crt1 at all; PIE executables use Scrt1.o.
       const char *crt1 = NULL;
       if (!Args.hasArg(options::OPT_shared)){
         if (Args.hasArg(options::OPT_pie))
           crt1 = "Scrt1.o";
         else
           crt1 = "crt1.o";
       }
       if (crt1)
         CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath(crt1)));
 
       CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath("crti.o")));
     }
 
     // Precedence when several flags are present: static, shared, pie.
     const char *crtbegin;
     if (Args.hasArg(options::OPT_static))
       crtbegin = isAndroid ? "crtbegin_static.o" : "crtbeginT.o";
     else if (Args.hasArg(options::OPT_shared))
       crtbegin = isAndroid ? "crtbegin_so.o" : "crtbeginS.o";
     else if (Args.hasArg(options::OPT_pie))
       crtbegin = isAndroid ? "crtbegin_dynamic.o" : "crtbeginS.o";
     else
       crtbegin = isAndroid ? "crtbegin_dynamic.o" : "crtbegin.o";
     CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath(crtbegin)));
 
     // Add crtfastmath.o if available and fast math is enabled.
     ToolChain.AddFastMathRuntimeIfAvailable(Args, CmdArgs);
   }
 
   // User -L paths come first, then the toolchain's own file paths.
   Args.AddAllArgs(CmdArgs, options::OPT_L);
 
   const ToolChain::path_list Paths = ToolChain.getFilePaths();
 
   for (ToolChain::path_list::const_iterator i = Paths.begin(), e = Paths.end();
        i != e; ++i)
     CmdArgs.push_back(Args.MakeArgString(StringRef("-L") + *i));
 
   // Tell the linker to load the plugin. This has to come before AddLinkerInputs
   // as gold requires -plugin to come before any -plugin-opt that -Wl might
   // forward.
   if (D.IsUsingLTO(Args) || Args.hasArg(options::OPT_use_gold_plugin)) {
     CmdArgs.push_back("-plugin");
     // The plugin path is formed relative to the driver's directory.
     std::string Plugin = ToolChain.getDriver().Dir + "/../lib/LLVMgold.so";
     CmdArgs.push_back(Args.MakeArgString(Plugin));
+
+    // Try to pass driver level flags relevant to LTO code generation down to
+    // the plugin.
+
+    // Handle architecture-specific flags for selecting CPU variants.
+    if (ToolChain.getArch() == llvm::Triple::x86 ||
+        ToolChain.getArch() == llvm::Triple::x86_64)
+      CmdArgs.push_back(
+          Args.MakeArgString(Twine("-plugin-opt=mcpu=") +
+                             getX86TargetCPU(Args, ToolChain.getTriple())));
+    else if (ToolChain.getArch() == llvm::Triple::arm ||
+             ToolChain.getArch() == llvm::Triple::thumb)
+      CmdArgs.push_back(
+          Args.MakeArgString(Twine("-plugin-opt=mcpu=") +
+                             getARMTargetCPU(Args, ToolChain.getTriple())));
+
+    // FIXME: Factor out logic for MIPS, PPC, and other targets to support this
+    // as well.
   }
+
 
   if (Args.hasArg(options::OPT_Z_Xlinker__no_demangle))
     CmdArgs.push_back("--no-demangle");
 
   AddLinkerInputs(ToolChain, Inputs, Args, CmdArgs);
 
   SanitizerArgs Sanitize(D, Args);
 
   // Call this before we add the C++ ABI library.
   if (Sanitize.needsUbsanRt())
     addUbsanRTLinux(getToolChain(), Args, CmdArgs);
 
   if (D.CCCIsCXX &&
       !Args.hasArg(options::OPT_nostdlib) &&
       !Args.hasArg(options::OPT_nodefaultlibs)) {
     // With OPT_static_libstdcxx (and no OPT_static) only the C++ standard
     // library is bracketed by -Bstatic / -Bdynamic.
     bool OnlyLibstdcxxStatic = Args.hasArg(options::OPT_static_libstdcxx) &&
       !Args.hasArg(options::OPT_static);
     if (OnlyLibstdcxxStatic)
       CmdArgs.push_back("-Bstatic");
     ToolChain.AddCXXStdlibLibArgs(Args, CmdArgs);
     if (OnlyLibstdcxxStatic)
       CmdArgs.push_back("-Bdynamic");
     CmdArgs.push_back("-lm");
   }
 
   // Call this before we add the C run-time.
   if (Sanitize.needsAsanRt())
     addAsanRTLinux(getToolChain(), Args, CmdArgs);
   if (Sanitize.needsTsanRt())
     addTsanRTLinux(getToolChain(), Args, CmdArgs);
 
   if (!Args.hasArg(options::OPT_nostdlib)) {
     if (!Args.hasArg(options::OPT_nodefaultlibs)) {
       // With OPT_static the libraries are wrapped in --start-group /
       // --end-group; otherwise AddLibgcc is called a second time after -lc.
       if (Args.hasArg(options::OPT_static))
         CmdArgs.push_back("--start-group");
 
       AddLibgcc(ToolChain.getTriple(), D, CmdArgs, Args);
 
       if (Args.hasArg(options::OPT_pthread) ||
           Args.hasArg(options::OPT_pthreads))
         CmdArgs.push_back("-lpthread");
 
       CmdArgs.push_back("-lc");
 
       if (Args.hasArg(options::OPT_static))
         CmdArgs.push_back("--end-group");
       else
         AddLibgcc(ToolChain.getTriple(), D, CmdArgs, Args);
     }
 
     // End files.  Android has its own crtend_* names and no crtn.o.
     if (!Args.hasArg(options::OPT_nostartfiles)) {
       const char *crtend;
       if (Args.hasArg(options::OPT_shared))
         crtend = isAndroid ? "crtend_so.o" : "crtendS.o";
       else if (Args.hasArg(options::OPT_pie))
         crtend = isAndroid ? "crtend_android.o" : "crtendS.o";
       else
         crtend = isAndroid ? "crtend_android.o" : "crtend.o";
 
       CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath(crtend)));
       if (!isAndroid)
         CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath("crtn.o")));
     }
   }
 
   addProfileRT(getToolChain(), Args, CmdArgs, getToolChain().getTriple());
 
   // The executable is the toolchain's Linker member rather than a
   // GetProgramPath() lookup.
   C.addCommand(new Command(JA, *this, ToolChain.Linker.c_str(), CmdArgs));
 }
 
 /// Construct the external assembler ("as") job for Minix.
 ///
 /// The command line consists of the values of any OPT_Wa_COMMA /
 /// OPT_Xassembler arguments, "-o" plus the output filename, and then every
 /// input filename.  \p LinkingOutput is not used.
 void minix::Assemble::ConstructJob(Compilation &C, const JobAction &JA,
                                    const InputInfo &Output,
                                    const InputInfoList &Inputs,
                                    const ArgList &Args,
                                    const char *LinkingOutput) const {
   ArgStringList AsArgs;
 
   // User-supplied assembler arguments go first.
   Args.AddAllArgValues(AsArgs, options::OPT_Wa_COMMA,
                        options::OPT_Xassembler);
 
   AsArgs.push_back("-o");
   AsArgs.push_back(Output.getFilename());
 
   for (InputInfoList::const_iterator In = Inputs.begin(), InEnd = Inputs.end();
        In != InEnd; ++In)
     AsArgs.push_back(In->getFilename());
 
   const char *AsPath = Args.MakeArgString(getToolChain().GetProgramPath("as"));
   C.addCommand(new Command(JA, *this, AsPath, AsArgs));
 }
 
 /// Construct the "ld" link job for Minix.
 ///
 /// Order of the generated command line: output, start files, user -L /
 /// T-group / -e arguments, linker inputs, profile runtime, C++ libraries
 /// (C++ mode only), then the default libraries and crtend.o.
 /// \p LinkingOutput is not used.
 void minix::Link::ConstructJob(Compilation &C, const JobAction &JA,
                                const InputInfo &Output,
                                const InputInfoList &Inputs,
                                const ArgList &Args,
                                const char *LinkingOutput) const {
   const ToolChain &TC = getToolChain();
   const Driver &D = TC.getDriver();
   ArgStringList LdArgs;
 
   if (!Output.isFilename()) {
     assert(Output.isNothing() && "Invalid output.");
   } else {
     LdArgs.push_back("-o");
     LdArgs.push_back(Output.getFilename());
   }
 
   if (!Args.hasArg(options::OPT_nostdlib) &&
       !Args.hasArg(options::OPT_nostartfiles)) {
     // NOTE(review): crtn.o is emitted here together with the start files,
     // whereas the other Link tools in this file emit it after crtend --
     // confirm this is intended for Minix.
     static const char *const StartFiles[] = {
       "crt1.o", "crti.o", "crtbegin.o", "crtn.o"
     };
     const unsigned NumStartFiles = sizeof(StartFiles) / sizeof(StartFiles[0]);
     for (unsigned I = 0; I != NumStartFiles; ++I)
       LdArgs.push_back(Args.MakeArgString(TC.GetFilePath(StartFiles[I])));
   }
 
   Args.AddAllArgs(LdArgs, options::OPT_L);
   Args.AddAllArgs(LdArgs, options::OPT_T_Group);
   Args.AddAllArgs(LdArgs, options::OPT_e);
 
   AddLinkerInputs(TC, Inputs, Args, LdArgs);
 
   addProfileRT(TC, Args, LdArgs, TC.getTriple());
 
   // The C++ standard library and libm are only added in C++ mode.
   if (!Args.hasArg(options::OPT_nostdlib) &&
       !Args.hasArg(options::OPT_nodefaultlibs) &&
       D.CCCIsCXX) {
     TC.AddCXXStdlibLibArgs(Args, LdArgs);
     LdArgs.push_back("-lm");
   }
 
   // NOTE(review): the default libraries below are gated on
   // OPT_nostartfiles, not OPT_nodefaultlibs -- confirm this is intended.
   if (!Args.hasArg(options::OPT_nostdlib) &&
       !Args.hasArg(options::OPT_nostartfiles)) {
     if (Args.hasArg(options::OPT_pthread))
       LdArgs.push_back("-lpthread");
     LdArgs.push_back("-lc");
     LdArgs.push_back("-lCompilerRT-Generic");
     LdArgs.push_back("-L/usr/pkg/compiler-rt/lib");
     LdArgs.push_back(Args.MakeArgString(TC.GetFilePath("crtend.o")));
   }
 
   const char *LdPath = Args.MakeArgString(TC.GetProgramPath("ld"));
   C.addCommand(new Command(JA, *this, LdPath, LdArgs));
 }
 
 /// DragonFly Tools
 
 // For now, DragonFly Assemble does just about the same as for
 // FreeBSD, but this may change soon.
 /// Construct the external assembler ("as") job for DragonFly.
 ///
 /// The command line consists of "--32" on x86, the values of any
 /// OPT_Wa_COMMA / OPT_Xassembler arguments, "-o" plus the output filename,
 /// and then every input filename.  \p LinkingOutput is not used.
 void dragonfly::Assemble::ConstructJob(Compilation &C, const JobAction &JA,
                                        const InputInfo &Output,
                                        const InputInfoList &Inputs,
                                        const ArgList &Args,
                                        const char *LinkingOutput) const {
   const ToolChain &TC = getToolChain();
   ArgStringList AsArgs;
 
   // When building 32-bit code on DragonFly/pc64, "as" in the base system
   // has to be told explicitly to assemble 32-bit code.
   const bool Is32BitX86 = TC.getArch() == llvm::Triple::x86;
   if (Is32BitX86)
     AsArgs.push_back("--32");
 
   Args.AddAllArgValues(AsArgs, options::OPT_Wa_COMMA,
                        options::OPT_Xassembler);
 
   AsArgs.push_back("-o");
   AsArgs.push_back(Output.getFilename());
 
   for (InputInfoList::const_iterator In = Inputs.begin(), InEnd = Inputs.end();
        In != InEnd; ++In)
     AsArgs.push_back(In->getFilename());
 
   const char *AsPath = Args.MakeArgString(TC.GetProgramPath("as"));
   C.addCommand(new Command(JA, *this, AsPath, AsArgs));
 }
 
 /// Construct the "ld" link job for DragonFly.
 ///
 /// Order of the generated command line: --sysroot, link mode, 32-bit
 /// emulation (x86 only), output, start files, user -L / T-group / -e
 /// arguments, linker inputs, default libraries, end files, and the profile
 /// runtime.  \p LinkingOutput is not used.
 void dragonfly::Link::ConstructJob(Compilation &C, const JobAction &JA,
                                    const InputInfo &Output,
                                    const InputInfoList &Inputs,
                                    const ArgList &Args,
                                    const char *LinkingOutput) const {
   const ToolChain &TC = getToolChain();
   const Driver &D = TC.getDriver();
   ArgStringList LdArgs;
 
   if (!D.SysRoot.empty())
     LdArgs.push_back(Args.MakeArgString("--sysroot=" + D.SysRoot));
 
   // Link mode: static, shared object, or dynamically linked executable.
   if (Args.hasArg(options::OPT_static)) {
     LdArgs.push_back("-Bstatic");
   } else if (Args.hasArg(options::OPT_shared)) {
     LdArgs.push_back("-Bshareable");
   } else {
     LdArgs.push_back("-dynamic-linker");
     LdArgs.push_back("/usr/libexec/ld-elf.so.2");
   }
 
   // When building 32-bit code on DragonFly/pc64, ld in the base system has
   // to be told explicitly to link 32-bit code.
   if (TC.getArch() == llvm::Triple::x86) {
     LdArgs.push_back("-m");
     LdArgs.push_back("elf_i386");
   }
 
   if (!Output.isFilename()) {
     assert(Output.isNothing() && "Invalid output.");
   } else {
     LdArgs.push_back("-o");
     LdArgs.push_back(Output.getFilename());
   }
 
   // Start files: shared objects skip crt1.o and use crtbeginS.o.
   if (!Args.hasArg(options::OPT_nostdlib) &&
       !Args.hasArg(options::OPT_nostartfiles)) {
     const bool Shared = Args.hasArg(options::OPT_shared);
     if (!Shared)
       LdArgs.push_back(Args.MakeArgString(TC.GetFilePath("crt1.o")));
     LdArgs.push_back(Args.MakeArgString(TC.GetFilePath("crti.o")));
     LdArgs.push_back(Args.MakeArgString(
                        TC.GetFilePath(Shared ? "crtbeginS.o" : "crtbegin.o")));
   }
 
   Args.AddAllArgs(LdArgs, options::OPT_L);
   Args.AddAllArgs(LdArgs, options::OPT_T_Group);
   Args.AddAllArgs(LdArgs, options::OPT_e);
 
   AddLinkerInputs(TC, Inputs, Args, LdArgs);
 
   if (!Args.hasArg(options::OPT_nostdlib) &&
       !Args.hasArg(options::OPT_nodefaultlibs)) {
     // FIXME: GCC passes on -lgcc, -lgcc_pic and a whole lot of
     //         rpaths
     LdArgs.push_back("-L/usr/lib/gcc41");
 
     if (!Args.hasArg(options::OPT_static)) {
       // Each directory is given as both -rpath and -rpath-link.
       static const char *const RPathDirs[] = { "/usr/lib/gcc41", "/usr/lib" };
       const unsigned NumRPathDirs = sizeof(RPathDirs) / sizeof(RPathDirs[0]);
       for (unsigned I = 0; I != NumRPathDirs; ++I) {
         LdArgs.push_back("-rpath");
         LdArgs.push_back(RPathDirs[I]);
 
         LdArgs.push_back("-rpath-link");
         LdArgs.push_back(RPathDirs[I]);
       }
     }
 
     if (D.CCCIsCXX) {
       TC.AddCXXStdlibLibArgs(Args, LdArgs);
       LdArgs.push_back("-lm");
     }
 
     // libgcc is listed both before and after libc.
     LdArgs.push_back(Args.hasArg(options::OPT_shared) ? "-lgcc_pic"
                                                       : "-lgcc");
 
     if (Args.hasArg(options::OPT_pthread))
       LdArgs.push_back("-lpthread");
 
     if (!Args.hasArg(options::OPT_nolibc))
       LdArgs.push_back("-lc");
 
     LdArgs.push_back(Args.hasArg(options::OPT_shared) ? "-lgcc_pic"
                                                       : "-lgcc");
   }
 
   // End files mirror the start files: crtendS.o for shared objects.
   if (!Args.hasArg(options::OPT_nostdlib) &&
       !Args.hasArg(options::OPT_nostartfiles)) {
     const char *CrtEnd =
       Args.hasArg(options::OPT_shared) ? "crtendS.o" : "crtend.o";
     LdArgs.push_back(Args.MakeArgString(TC.GetFilePath(CrtEnd)));
     LdArgs.push_back(Args.MakeArgString(TC.GetFilePath("crtn.o")));
   }
 
   addProfileRT(TC, Args, LdArgs, TC.getTriple());
 
   const char *LdPath = Args.MakeArgString(TC.GetProgramPath("ld"));
   C.addCommand(new Command(JA, *this, LdPath, LdArgs));
 }
 
 /// Construct the "link.exe" job for the Visual Studio toolchain.
 ///
 /// Order of the generated command line: "-out:<file>" when there is an
 /// output file, "-defaultlib:libcmt" unless suppressed, "-nologo", the
 /// values of all OPT_l arguments, and every input that is a filename.
 /// \p LinkingOutput is not used.
 void visualstudio::Link::ConstructJob(Compilation &C, const JobAction &JA,
                                       const InputInfo &Output,
                                       const InputInfoList &Inputs,
                                       const ArgList &Args,
                                       const char *LinkingOutput) const {
   ArgStringList LinkArgs;
 
   if (!Output.isFilename()) {
     assert(Output.isNothing() && "Invalid output.");
   } else {
     const std::string OutFlag = std::string("-out:") + Output.getFilename();
     LinkArgs.push_back(Args.MakeArgString(OutFlag));
   }
 
   // The default C runtime is dropped by either OPT_nostdlib or
   // OPT_nostartfiles.
   if (!Args.hasArg(options::OPT_nostdlib) &&
       !Args.hasArg(options::OPT_nostartfiles))
     LinkArgs.push_back("-defaultlib:libcmt");
 
   LinkArgs.push_back("-nologo");
 
   Args.AddAllArgValues(LinkArgs, options::OPT_l);
 
   // Add filenames immediately; inputs that are not filenames are skipped.
   for (InputInfoList::const_iterator In = Inputs.begin(), InEnd = Inputs.end();
        In != InEnd; ++In) {
     if (!In->isFilename())
       continue;
     LinkArgs.push_back(In->getFilename());
   }
 
   const char *LinkPath =
     Args.MakeArgString(getToolChain().GetProgramPath("link.exe"));
   C.addCommand(new Command(JA, *this, LinkPath, LinkArgs));
 }
Index: user/attilio/vmobj-rwlock/contrib/llvm/tools/clang/lib/Driver/Tools.h
===================================================================
--- user/attilio/vmobj-rwlock/contrib/llvm/tools/clang/lib/Driver/Tools.h	(revision 247191)
+++ user/attilio/vmobj-rwlock/contrib/llvm/tools/clang/lib/Driver/Tools.h	(revision 247192)
@@ -1,646 +1,647 @@
 //===--- Tools.h - Tool Implementations -------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
 
 #ifndef CLANG_LIB_DRIVER_TOOLS_H_
 #define CLANG_LIB_DRIVER_TOOLS_H_
 
 #include "clang/Driver/Tool.h"
 #include "clang/Driver/Types.h"
 #include "clang/Driver/Util.h"
 
 #include "llvm/ADT/Triple.h"
 #include "llvm/Support/Compiler.h"
 
 namespace clang {
   class ObjCRuntime;
 
 namespace driver {
   class Driver;
 
 namespace toolchains {
   class Darwin;
 }
 
 namespace tools {
 
   /// \brief Clang compiler tool.
   ///
   /// Registers itself with the Tool base class under the name "clang" and
   /// the short name "clang frontend".  It reports good diagnostics, an
   /// integrated assembler and an integrated preprocessor (all three
   /// capability queries return true).
   class LLVM_LIBRARY_VISIBILITY Clang : public Tool {
     // Private helpers; they are only declared here and are defined outside
     // this header.
     void AddPreprocessingOptions(Compilation &C,
                                  const Driver &D,
                                  const ArgList &Args,
                                  ArgStringList &CmdArgs,
                                  const InputInfo &Output,
                                  const InputInfoList &Inputs) const;
 
     // One helper per target family.  Presumably each appends that target's
     // arguments to CmdArgs -- confirm against the definitions.
     void AddARMTargetArgs(const ArgList &Args, ArgStringList &CmdArgs,
                           bool KernelOrKext) const;
     void AddMIPSTargetArgs(const ArgList &Args, ArgStringList &CmdArgs) const;
     void AddPPCTargetArgs(const ArgList &Args, ArgStringList &CmdArgs) const;
     void AddSparcTargetArgs(const ArgList &Args, ArgStringList &CmdArgs) const;
     void AddX86TargetArgs(const ArgList &Args, ArgStringList &CmdArgs) const;
     void AddHexagonTargetArgs (const ArgList &Args, ArgStringList &CmdArgs) const;
 
     // Selector passed to AddObjCRuntimeArgs.
     enum RewriteKind { RK_None, RK_Fragile, RK_NonFragile };
 
     ObjCRuntime AddObjCRuntimeArgs(const ArgList &args, ArgStringList &cmdArgs,
                                    RewriteKind rewrite) const;
 
   public:
     Clang(const ToolChain &TC) : Tool("clang", "clang frontend", TC) {}
 
     virtual bool hasGoodDiagnostics() const { return true; }
     virtual bool hasIntegratedAssembler() const { return true; }
     virtual bool hasIntegratedCPP() const { return true; }
 
     /// Build the job for this tool; defined outside this header.
     virtual void ConstructJob(Compilation &C, const JobAction &JA,
                               const InputInfo &Output,
                               const InputInfoList &Inputs,
                               const ArgList &TCArgs,
                               const char *LinkingOutput) const;
   };
 
   /// \brief Clang integrated assembler tool.
   ///
   /// Registers itself with the Tool base class under the name "clang::as"
   /// and the short name "clang integrated assembler".  It reports good
   /// diagnostics, but neither an integrated assembler nor an integrated
   /// preprocessor.
   class LLVM_LIBRARY_VISIBILITY ClangAs : public Tool {
     // Per-target helpers, declared here and defined outside this header.
     void AddARMTargetArgs(const ArgList &Args, ArgStringList &CmdArgs) const;
+    void AddX86TargetArgs(const ArgList &Args, ArgStringList &CmdArgs) const;
   public:
     ClangAs(const ToolChain &TC) : Tool("clang::as",
                                         "clang integrated assembler", TC) {}
 
     virtual bool hasGoodDiagnostics() const { return true; }
     virtual bool hasIntegratedAssembler() const { return false; }
     virtual bool hasIntegratedCPP() const { return false; }
 
     /// Build the job for this tool; defined outside this header.
     virtual void ConstructJob(Compilation &C, const JobAction &JA,
                               const InputInfo &Output,
                               const InputInfoList &Inputs,
                               const ArgList &TCArgs,
                               const char *LinkingOutput) const;
   };
 
   /// gcc - Generic GCC tool implementations.
   ///
   /// Common is the abstract base; each concrete tool below supplies its own
   /// RenderExtraToolArgs and capability answers.
 namespace gcc {
   /// Abstract base of the gcc-driven tools.  It declares ConstructJob and
   /// leaves RenderExtraToolArgs pure virtual for the derived tools.
   class LLVM_LIBRARY_VISIBILITY Common : public Tool {
   public:
     Common(const char *Name, const char *ShortName,
            const ToolChain &TC) : Tool(Name, ShortName, TC) {}
 
     virtual void ConstructJob(Compilation &C, const JobAction &JA,
                               const InputInfo &Output,
                               const InputInfoList &Inputs,
                               const ArgList &TCArgs,
                               const char *LinkingOutput) const;
 
     /// RenderExtraToolArgs - Render any arguments necessary to force
     /// the particular tool mode.
     virtual void RenderExtraToolArgs(const JobAction &JA,
                                      ArgStringList &CmdArgs) const = 0;
   };
 
 
   /// "gcc::Preprocess" / "gcc preprocessor": good diagnostics, no
   /// integrated preprocessor.
   class LLVM_LIBRARY_VISIBILITY Preprocess : public Common {
   public:
     Preprocess(const ToolChain &TC) : Common("gcc::Preprocess",
                                              "gcc preprocessor", TC) {}
 
     virtual bool hasGoodDiagnostics() const { return true; }
     virtual bool hasIntegratedCPP() const { return false; }
 
     virtual void RenderExtraToolArgs(const JobAction &JA,
                                      ArgStringList &CmdArgs) const;
   };
 
   /// "gcc::Precompile" / "gcc precompile": good diagnostics, integrated
   /// preprocessor.
   class LLVM_LIBRARY_VISIBILITY Precompile : public Common  {
   public:
     Precompile(const ToolChain &TC) : Common("gcc::Precompile",
                                              "gcc precompile", TC) {}
 
     virtual bool hasGoodDiagnostics() const { return true; }
     virtual bool hasIntegratedCPP() const { return true; }
 
     virtual void RenderExtraToolArgs(const JobAction &JA,
                                      ArgStringList &CmdArgs) const;
   };
 
   /// "gcc::Compile" / "gcc frontend": good diagnostics, integrated
   /// preprocessor.
   class LLVM_LIBRARY_VISIBILITY Compile : public Common  {
   public:
     Compile(const ToolChain &TC) : Common("gcc::Compile",
                                           "gcc frontend", TC) {}
 
     virtual bool hasGoodDiagnostics() const { return true; }
     virtual bool hasIntegratedCPP() const { return true; }
 
     virtual void RenderExtraToolArgs(const JobAction &JA,
                                      ArgStringList &CmdArgs) const;
   };
 
   /// "gcc::Assemble" / "assembler (via gcc)": no integrated preprocessor.
   /// hasGoodDiagnostics is not overridden here.
   class LLVM_LIBRARY_VISIBILITY Assemble : public Common  {
   public:
     Assemble(const ToolChain &TC) : Common("gcc::Assemble",
                                            "assembler (via gcc)", TC) {}
 
     virtual bool hasIntegratedCPP() const { return false; }
 
     virtual void RenderExtraToolArgs(const JobAction &JA,
                                      ArgStringList &CmdArgs) const;
   };
 
   /// "gcc::Link" / "linker (via gcc)": no integrated preprocessor, and
   /// identifies itself as a link job.
   class LLVM_LIBRARY_VISIBILITY Link : public Common  {
   public:
     Link(const ToolChain &TC) : Common("gcc::Link",
                                        "linker (via gcc)", TC) {}
 
     virtual bool hasIntegratedCPP() const { return false; }
     virtual bool isLinkJob() const { return true; }
 
     virtual void RenderExtraToolArgs(const JobAction &JA,
                                      ArgStringList &CmdArgs) const;
   };
 } // end namespace gcc
 
 /// hexagon - Assembler and linker tools for the Hexagon toolchain.
 namespace hexagon {
   // For Hexagon, we do not need to instantiate tools for PreProcess, PreCompile and Compile.
   // We simply use "clang -cc1" for those actions.
 
   /// "hexagon::Assemble" / "hexagon-as": no integrated preprocessor.
   /// Derives directly from Tool, so RenderExtraToolArgs is introduced as a
   /// new virtual here rather than overriding gcc::Common's.
   class LLVM_LIBRARY_VISIBILITY Assemble : public Tool {
   public:
     Assemble(const ToolChain &TC) : Tool("hexagon::Assemble",
       "hexagon-as", TC) {}
 
     virtual bool hasIntegratedCPP() const { return false; }
 
     virtual void RenderExtraToolArgs(const JobAction &JA,
                                      ArgStringList &CmdArgs) const;
     virtual void ConstructJob(Compilation &C, const JobAction &JA,
                               const InputInfo &Output,
                               const InputInfoList &Inputs,
                               const ArgList &TCArgs,
                               const char *LinkingOutput) const;
   };
 
   /// "hexagon::Link" / "hexagon-ld": no integrated preprocessor, and
   /// identifies itself as a link job.
   class LLVM_LIBRARY_VISIBILITY Link : public Tool {
   public:
     Link(const ToolChain &TC) : Tool("hexagon::Link",
       "hexagon-ld", TC) {}
 
     virtual bool hasIntegratedCPP() const { return false; }
     virtual bool isLinkJob() const { return true; }
 
     virtual void RenderExtraToolArgs(const JobAction &JA,
                                      ArgStringList &CmdArgs) const;
     virtual void ConstructJob(Compilation &C, const JobAction &JA,
                               const InputInfo &Output,
                               const InputInfoList &Inputs,
                               const ArgList &TCArgs,
                               const char *LinkingOutput) const;
   };
 } // end namespace hexagon.
 
 
 namespace darwin {
   /// Declared here, defined outside this header.  Presumably maps a Darwin
   /// architecture name string to the matching llvm::Triple::ArchType --
   /// confirm against the definition.
   llvm::Triple::ArchType getArchTypeForDarwinArchName(StringRef Str);
 
   /// Common base of the Darwin tools.  It gives derived tools access to
   /// the toolchain typed as toolchains::Darwin and declares AddDarwinArch.
   class LLVM_LIBRARY_VISIBILITY DarwinTool : public Tool {
     // Out-of-line virtual; presumably the usual vtable-anchor idiom --
     // confirm against the definition.
     virtual void anchor();
   protected:
     void AddDarwinArch(const ArgList &Args, ArgStringList &CmdArgs) const;
 
     // NOTE(review): toolchains::Darwin is only forward-declared earlier in
     // this header, which is presumably why reinterpret_cast is used instead
     // of static_cast.  The cast assumes this tool is always constructed
     // with a Darwin toolchain -- confirm.
     const toolchains::Darwin &getDarwinToolChain() const {
       return reinterpret_cast<const toolchains::Darwin&>(getToolChain());
     }
 
   public:
     DarwinTool(const char *Name, const char *ShortName,
                const ToolChain &TC) : Tool(Name, ShortName, TC) {}
   };
 
   /// Shared base of the Darwin preprocess / compile tools.  It reports
   /// good diagnostics and an integrated preprocessor, and declares the
   /// argument-building helpers the derived tools use.
   class LLVM_LIBRARY_VISIBILITY CC1 : public DarwinTool  {
     virtual void anchor();
   public:
     // Static helpers that derive names from the arguments and inputs.
     // Declared here, defined outside this header.
     static const char *getBaseInputName(const ArgList &Args,
                                  const InputInfoList &Input);
     static const char *getBaseInputStem(const ArgList &Args,
                                  const InputInfoList &Input);
     static const char *getDependencyFileName(const ArgList &Args,
                                              const InputInfoList &Inputs);
 
   protected:
     const char *getCC1Name(types::ID Type) const;
 
     // Argument-building helpers for derived tools; definitions are not in
     // this header.
     void AddCC1Args(const ArgList &Args, ArgStringList &CmdArgs) const;
     void RemoveCC1UnsupportedArgs(ArgStringList &CmdArgs) const;
     void AddCC1OptionsArgs(const ArgList &Args, ArgStringList &CmdArgs,
                            const InputInfoList &Inputs,
                            const ArgStringList &OutputArgs) const;
     void AddCPPOptionsArgs(const ArgList &Args, ArgStringList &CmdArgs,
                            const InputInfoList &Inputs,
                            const ArgStringList &OutputArgs) const;
     void AddCPPUniqueOptionsArgs(const ArgList &Args,
                                  ArgStringList &CmdArgs,
                                  const InputInfoList &Inputs) const;
     void AddCPPArgs(const ArgList &Args, ArgStringList &CmdArgs) const;
 
   public:
     CC1(const char *Name, const char *ShortName,
         const ToolChain &TC) : DarwinTool(Name, ShortName, TC) {}
 
     virtual bool hasGoodDiagnostics() const { return true; }
     virtual bool hasIntegratedCPP() const { return true; }
   };
 
   class LLVM_LIBRARY_VISIBILITY Preprocess : public CC1  {
   public:
     Preprocess(const ToolChain &TC) : CC1("darwin::Preprocess",
                                           "gcc preprocessor", TC) {}
 
     virtual void ConstructJob(Compilation &C, const JobAction &JA,
                               const InputInfo &Output,
                               const InputInfoList &Inputs,
                               const ArgList &TCArgs,
                               const char *LinkingOutput) const;
   };
 
   class LLVM_LIBRARY_VISIBILITY Compile : public CC1  {
   public:
     Compile(const ToolChain &TC) : CC1("darwin::Compile", "gcc frontend", TC) {}
 
     virtual void ConstructJob(Compilation &C, const JobAction &JA,
                               const InputInfo &Output,
                               const InputInfoList &Inputs,
                               const ArgList &TCArgs,
                               const char *LinkingOutput) const;
   };
 
   class LLVM_LIBRARY_VISIBILITY Assemble : public DarwinTool  {
   public:
     Assemble(const ToolChain &TC) : DarwinTool("darwin::Assemble",
                                                "assembler", TC) {}
 
     virtual bool hasIntegratedCPP() const { return false; }
 
     virtual void ConstructJob(Compilation &C, const JobAction &JA,
                               const InputInfo &Output,
                               const InputInfoList &Inputs,
                               const ArgList &TCArgs,
                               const char *LinkingOutput) const;
   };
 
   class LLVM_LIBRARY_VISIBILITY Link : public DarwinTool  {
     bool NeedsTempPath(const InputInfoList &Inputs) const;
     void AddLinkArgs(Compilation &C, const ArgList &Args,
                      ArgStringList &CmdArgs, const InputInfoList &Inputs) const;
 
   public:
     Link(const ToolChain &TC) : DarwinTool("darwin::Link", "linker", TC) {}
 
     virtual bool hasIntegratedCPP() const { return false; }
     virtual bool isLinkJob() const { return true; }
 
     virtual void ConstructJob(Compilation &C, const JobAction &JA,
                               const InputInfo &Output,
                               const InputInfoList &Inputs,
                               const ArgList &TCArgs,
                               const char *LinkingOutput) const;
   };
 
   class LLVM_LIBRARY_VISIBILITY Lipo : public DarwinTool  {
   public:
     Lipo(const ToolChain &TC) : DarwinTool("darwin::Lipo", "lipo", TC) {}
 
     virtual bool hasIntegratedCPP() const { return false; }
 
     virtual void ConstructJob(Compilation &C, const JobAction &JA,
                               const InputInfo &Output,
                               const InputInfoList &Inputs,
                               const ArgList &TCArgs,
                               const char *LinkingOutput) const;
   };
 
   class LLVM_LIBRARY_VISIBILITY Dsymutil : public DarwinTool  {
   public:
     Dsymutil(const ToolChain &TC) : DarwinTool("darwin::Dsymutil",
                                                "dsymutil", TC) {}
 
     virtual bool hasIntegratedCPP() const { return false; }
 
     virtual void ConstructJob(Compilation &C, const JobAction &JA,
                               const InputInfo &Output,
                               const InputInfoList &Inputs,
                               const ArgList &TCArgs,
                               const char *LinkingOutput) const;
   };
 
   class LLVM_LIBRARY_VISIBILITY VerifyDebug : public DarwinTool  {
   public:
     VerifyDebug(const ToolChain &TC) : DarwinTool("darwin::VerifyDebug",
 						  "dwarfdump", TC) {}
 
     virtual bool hasIntegratedCPP() const { return false; }
 
     virtual void ConstructJob(Compilation &C, const JobAction &JA,
 			      const InputInfo &Output,
 			      const InputInfoList &Inputs,
 			      const ArgList &TCArgs,
 			      const char *LinkingOutput) const;
   };
 
 }
 
   /// openbsd -- Directly call GNU Binutils assembler and linker
 namespace openbsd {
   /// Tool constructed with the name "openbsd::Assemble" and the short name
   /// "assembler".
   class LLVM_LIBRARY_VISIBILITY Assemble : public Tool  {
   public:
     Assemble(const ToolChain &TC) : Tool("openbsd::Assemble", "assembler",
                                          TC) {}
 
     /// Always false for this tool.
     virtual bool hasIntegratedCPP() const { return false; }
 
     /// Declared here; the definition is not part of this header.
     virtual void ConstructJob(Compilation &C, const JobAction &JA,
                               const InputInfo &Output,
                               const InputInfoList &Inputs,
                               const ArgList &TCArgs,
                               const char *LinkingOutput) const;
   };
   /// Tool constructed with the name "openbsd::Link" and the short name
   /// "linker"; reports itself as a link job.
   class LLVM_LIBRARY_VISIBILITY Link : public Tool  {
   public:
     Link(const ToolChain &TC) : Tool("openbsd::Link", "linker", TC) {}
 
     /// Always false for this tool.
     virtual bool hasIntegratedCPP() const { return false; }
     /// Always true: this tool performs the link step.
     virtual bool isLinkJob() const { return true; }
 
     /// Declared here; the definition is not part of this header.
     virtual void ConstructJob(Compilation &C, const JobAction &JA,
                               const InputInfo &Output,
                               const InputInfoList &Inputs,
                               const ArgList &TCArgs,
                               const char *LinkingOutput) const;
   };
 } // end namespace openbsd
 
   /// bitrig -- Directly call GNU Binutils assembler and linker
 namespace bitrig {
   /// Tool constructed with the name "bitrig::Assemble" and the short name
   /// "assembler".
   class LLVM_LIBRARY_VISIBILITY Assemble : public Tool  {
   public:
     Assemble(const ToolChain &TC) : Tool("bitrig::Assemble", "assembler",
                                          TC) {}
 
     /// Always false for this tool.
     virtual bool hasIntegratedCPP() const { return false; }
 
     /// Declared here; the definition is not part of this header.
     virtual void ConstructJob(Compilation &C, const JobAction &JA,
                               const InputInfo &Output,
                               const InputInfoList &Inputs,
                               const ArgList &TCArgs,
                               const char *LinkingOutput) const;
   };
   /// Tool constructed with the name "bitrig::Link" and the short name
   /// "linker"; reports itself as a link job.
   class LLVM_LIBRARY_VISIBILITY Link : public Tool  {
   public:
     Link(const ToolChain &TC) : Tool("bitrig::Link", "linker", TC) {}
 
     /// Always false for this tool.
     virtual bool hasIntegratedCPP() const { return false; }
     /// Always true: this tool performs the link step.
     virtual bool isLinkJob() const { return true; }
 
     /// Declared here; the definition is not part of this header.
     virtual void ConstructJob(Compilation &C, const JobAction &JA,
                               const InputInfo &Output,
                               const InputInfoList &Inputs,
                               const ArgList &TCArgs,
                               const char *LinkingOutput) const;
   };
 } // end namespace bitrig
 
   /// freebsd -- Directly call GNU Binutils assembler and linker
 namespace freebsd {
   /// Tool constructed with the name "freebsd::Assemble" and the short name
   /// "assembler".
   class LLVM_LIBRARY_VISIBILITY Assemble : public Tool  {
   public:
     Assemble(const ToolChain &TC) : Tool("freebsd::Assemble", "assembler",
                                          TC) {}
 
     /// Always false for this tool.
     virtual bool hasIntegratedCPP() const { return false; }
 
     /// Declared here; the definition is not part of this header.
     virtual void ConstructJob(Compilation &C, const JobAction &JA,
                               const InputInfo &Output,
                               const InputInfoList &Inputs,
                               const ArgList &TCArgs,
                               const char *LinkingOutput) const;
   };
   /// Tool constructed with the name "freebsd::Link" and the short name
   /// "linker"; reports itself as a link job.
   class LLVM_LIBRARY_VISIBILITY Link : public Tool  {
   public:
     Link(const ToolChain &TC) : Tool("freebsd::Link", "linker", TC) {}
 
     /// Always false for this tool.
     virtual bool hasIntegratedCPP() const { return false; }
     /// Always true: this tool performs the link step.
     virtual bool isLinkJob() const { return true; }
 
     /// Declared here; the definition is not part of this header.
     virtual void ConstructJob(Compilation &C, const JobAction &JA,
                               const InputInfo &Output,
                               const InputInfoList &Inputs,
                               const ArgList &TCArgs,
                               const char *LinkingOutput) const;
   };
 } // end namespace freebsd
 
   /// netbsd -- Directly call GNU Binutils assembler and linker
 namespace netbsd {
   /// Tool constructed with the name "netbsd::Assemble" and the short name
   /// "assembler".
   class LLVM_LIBRARY_VISIBILITY Assemble : public Tool  {
 
   public:
     Assemble(const ToolChain &TC)
       : Tool("netbsd::Assemble", "assembler", TC) {}
 
     /// Always false for this tool.
     virtual bool hasIntegratedCPP() const { return false; }
 
     /// Declared here; the definition is not part of this header.
     virtual void ConstructJob(Compilation &C, const JobAction &JA,
                               const InputInfo &Output,
                               const InputInfoList &Inputs,
                               const ArgList &TCArgs,
                               const char *LinkingOutput) const;
   };
   /// Tool constructed with the name "netbsd::Link" and the short name
   /// "linker"; reports itself as a link job.
   class LLVM_LIBRARY_VISIBILITY Link : public Tool  {
 
   public:
     Link(const ToolChain &TC)
       : Tool("netbsd::Link", "linker", TC) {}
 
     /// Always false for this tool.
     virtual bool hasIntegratedCPP() const { return false; }
     /// Always true: this tool performs the link step.
     virtual bool isLinkJob() const { return true; }
 
     /// Declared here; the definition is not part of this header.
     virtual void ConstructJob(Compilation &C, const JobAction &JA,
                               const InputInfo &Output,
                               const InputInfoList &Inputs,
                               const ArgList &TCArgs,
                               const char *LinkingOutput) const;
   };
 } // end namespace netbsd
 
   /// linux -- Directly call GNU Binutils assembler and linker
   // Note that the namespace is spelled "linuxtools" while the tool names
   // passed to Tool below use the "linux::" prefix.
 namespace linuxtools {
   /// Tool constructed with the name "linux::Assemble" and the short name
   /// "assembler".
   class LLVM_LIBRARY_VISIBILITY Assemble : public Tool  {
   public:
     Assemble(const ToolChain &TC) : Tool("linux::Assemble", "assembler",
                                          TC) {}
 
     /// Always false for this tool.
     virtual bool hasIntegratedCPP() const { return false; }
 
     /// Declared here; the definition is not part of this header.
     virtual void ConstructJob(Compilation &C, const JobAction &JA,
                               const InputInfo &Output,
                               const InputInfoList &Inputs,
                               const ArgList &TCArgs,
                               const char *LinkingOutput) const;
   };
   /// Tool constructed with the name "linux::Link" and the short name
   /// "linker"; reports itself as a link job.
   class LLVM_LIBRARY_VISIBILITY Link : public Tool  {
   public:
     Link(const ToolChain &TC) : Tool("linux::Link", "linker", TC) {}
 
     /// Always false for this tool.
     virtual bool hasIntegratedCPP() const { return false; }
     /// Always true: this tool performs the link step.
     virtual bool isLinkJob() const { return true; }
 
     /// Declared here; the definition is not part of this header.
     virtual void ConstructJob(Compilation &C, const JobAction &JA,
                               const InputInfo &Output,
                               const InputInfoList &Inputs,
                               const ArgList &TCArgs,
                               const char *LinkingOutput) const;
   };
 } // end namespace linuxtools
   /// minix -- Directly call GNU Binutils assembler and linker
 namespace minix {
   /// Tool constructed with the name "minix::Assemble" and the short name
   /// "assembler".
   class LLVM_LIBRARY_VISIBILITY Assemble : public Tool  {
   public:
     Assemble(const ToolChain &TC) : Tool("minix::Assemble", "assembler",
                                          TC) {}
 
     /// Always false for this tool.
     virtual bool hasIntegratedCPP() const { return false; }
 
     /// Declared here; the definition is not part of this header.
     virtual void ConstructJob(Compilation &C, const JobAction &JA,
                               const InputInfo &Output,
                               const InputInfoList &Inputs,
                               const ArgList &TCArgs,
                               const char *LinkingOutput) const;
   };
   /// Tool constructed with the name "minix::Link" and the short name
   /// "linker"; reports itself as a link job.
   class LLVM_LIBRARY_VISIBILITY Link : public Tool  {
   public:
     Link(const ToolChain &TC) : Tool("minix::Link", "linker", TC) {}
 
     /// Always false for this tool.
     virtual bool hasIntegratedCPP() const { return false; }
     /// Always true: this tool performs the link step.
     virtual bool isLinkJob() const { return true; }
 
     /// Declared here; the definition is not part of this header.
     virtual void ConstructJob(Compilation &C, const JobAction &JA,
                               const InputInfo &Output,
                               const InputInfoList &Inputs,
                               const ArgList &TCArgs,
                               const char *LinkingOutput) const;
   };
 } // end namespace minix
 
   /// solaris -- Directly call Solaris assembler and linker
 namespace solaris {
   /// Tool constructed with the name "solaris::Assemble" and the short name
   /// "assembler".
   class LLVM_LIBRARY_VISIBILITY Assemble : public Tool  {
   public:
     Assemble(const ToolChain &TC) : Tool("solaris::Assemble", "assembler",
                                          TC) {}
 
     /// Always false for this tool.
     virtual bool hasIntegratedCPP() const { return false; }
 
     /// Declared here; the definition is not part of this header.
     virtual void ConstructJob(Compilation &C, const JobAction &JA,
                               const InputInfo &Output,
                               const InputInfoList &Inputs,
                               const ArgList &TCArgs,
                               const char *LinkingOutput) const;
   };
   /// Tool constructed with the name "solaris::Link" and the short name
   /// "linker"; reports itself as a link job.
   class LLVM_LIBRARY_VISIBILITY Link : public Tool  {
   public:
     Link(const ToolChain &TC) : Tool("solaris::Link", "linker", TC) {}
 
     /// Always false for this tool.
     virtual bool hasIntegratedCPP() const { return false; }
     /// Always true: this tool performs the link step.
     virtual bool isLinkJob() const { return true; }
 
     /// Declared here; the definition is not part of this header.
     virtual void ConstructJob(Compilation &C, const JobAction &JA,
                               const InputInfo &Output,
                               const InputInfoList &Inputs,
                               const ArgList &TCArgs,
                               const char *LinkingOutput) const;
   };
 } // end namespace solaris
 
   /// auroraux -- Directly call GNU Binutils assembler and linker
 namespace auroraux {
   /// Tool constructed with the name "auroraux::Assemble" and the short name
   /// "assembler".
   class LLVM_LIBRARY_VISIBILITY Assemble : public Tool  {
   public:
     Assemble(const ToolChain &TC) : Tool("auroraux::Assemble", "assembler",
                                          TC) {}
 
     /// Always false for this tool.
     virtual bool hasIntegratedCPP() const { return false; }
 
     /// Declared here; the definition is not part of this header.
     virtual void ConstructJob(Compilation &C, const JobAction &JA,
                               const InputInfo &Output,
                               const InputInfoList &Inputs,
                               const ArgList &TCArgs,
                               const char *LinkingOutput) const;
   };
   /// Tool constructed with the name "auroraux::Link" and the short name
   /// "linker"; reports itself as a link job.
   class LLVM_LIBRARY_VISIBILITY Link : public Tool  {
   public:
     Link(const ToolChain &TC) : Tool("auroraux::Link", "linker", TC) {}
 
     /// Always false for this tool.
     virtual bool hasIntegratedCPP() const { return false; }
     /// Always true: this tool performs the link step.
     virtual bool isLinkJob() const { return true; }
 
     /// Declared here; the definition is not part of this header.
     virtual void ConstructJob(Compilation &C, const JobAction &JA,
                               const InputInfo &Output,
                               const InputInfoList &Inputs,
                               const ArgList &TCArgs,
                               const char *LinkingOutput) const;
   };
 } // end namespace auroraux
 
   /// dragonfly -- Directly call GNU Binutils assembler and linker
 namespace dragonfly {
   /// Tool constructed with the name "dragonfly::Assemble" and the short name
   /// "assembler".
   class LLVM_LIBRARY_VISIBILITY Assemble : public Tool  {
   public:
     Assemble(const ToolChain &TC) : Tool("dragonfly::Assemble", "assembler",
                                          TC) {}
 
     /// Always false for this tool.
     virtual bool hasIntegratedCPP() const { return false; }
 
     /// Declared here; the definition is not part of this header.
     virtual void ConstructJob(Compilation &C, const JobAction &JA,
                               const InputInfo &Output,
                               const InputInfoList &Inputs,
                               const ArgList &TCArgs,
                               const char *LinkingOutput) const;
   };
   /// Tool constructed with the name "dragonfly::Link" and the short name
   /// "linker"; reports itself as a link job.
   class LLVM_LIBRARY_VISIBILITY Link : public Tool  {
   public:
     Link(const ToolChain &TC) : Tool("dragonfly::Link", "linker", TC) {}
 
     /// Always false for this tool.
     virtual bool hasIntegratedCPP() const { return false; }
     /// Always true: this tool performs the link step.
     virtual bool isLinkJob() const { return true; }
 
     /// Declared here; the definition is not part of this header.
     virtual void ConstructJob(Compilation &C, const JobAction &JA,
                               const InputInfo &Output,
                               const InputInfoList &Inputs,
                               const ArgList &TCArgs,
                               const char *LinkingOutput) const;
   };
 } // end namespace dragonfly
 
   /// Visual studio tools.
 namespace visualstudio {
   /// Tool constructed with the name "visualstudio::Link" and the short name
   /// "linker"; reports itself as a link job. No assembler tool is declared
   /// in this namespace.
   class LLVM_LIBRARY_VISIBILITY Link : public Tool  {
   public:
     Link(const ToolChain &TC) : Tool("visualstudio::Link", "linker", TC) {}
 
     /// Always false for this tool.
     virtual bool hasIntegratedCPP() const { return false; }
     /// Always true: this tool performs the link step.
     virtual bool isLinkJob() const { return true; }
 
     /// Declared here; the definition is not part of this header.
     virtual void ConstructJob(Compilation &C, const JobAction &JA,
                               const InputInfo &Output,
                               const InputInfoList &Inputs,
                               const ArgList &TCArgs,
                               const char *LinkingOutput) const;
   };
 } // end namespace visualstudio
 
 } // end namespace tools
 } // end namespace driver
 } // end namespace clang
 
 #endif // CLANG_LIB_DRIVER_TOOLS_H_
Index: user/attilio/vmobj-rwlock/contrib/llvm/tools/clang
===================================================================
--- user/attilio/vmobj-rwlock/contrib/llvm/tools/clang	(revision 247191)
+++ user/attilio/vmobj-rwlock/contrib/llvm/tools/clang	(revision 247192)

Property changes on: user/attilio/vmobj-rwlock/contrib/llvm/tools/clang
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head/contrib/llvm/tools/clang:r247016-247191
Index: user/attilio/vmobj-rwlock/contrib/llvm
===================================================================
--- user/attilio/vmobj-rwlock/contrib/llvm	(revision 247191)
+++ user/attilio/vmobj-rwlock/contrib/llvm	(revision 247192)

Property changes on: user/attilio/vmobj-rwlock/contrib/llvm
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head/contrib/llvm:r247016-247191
Index: user/attilio/vmobj-rwlock/etc/Makefile
===================================================================
--- user/attilio/vmobj-rwlock/etc/Makefile	(revision 247191)
+++ user/attilio/vmobj-rwlock/etc/Makefile	(revision 247192)
@@ -1,390 +1,390 @@
 #	from: @(#)Makefile	5.11 (Berkeley) 5/21/91
 # $FreeBSD$
 
 .include <bsd.own.mk>
 
 .if ${MK_SENDMAIL} != "no"
 SUBDIR=	sendmail
 .endif
 
 BIN1=	crontab \
 	devd.conf \
 	devfs.conf \
 	ddb.conf \
 	dhclient.conf \
 	disktab \
 	fbtab \
 	ftpusers \
 	gettytab \
 	group \
 	hosts \
 	hosts.allow \
 	hosts.equiv \
 	inetd.conf \
 	libalias.conf \
 	login.access \
 	login.conf \
 	mac.conf \
 	motd \
 	netconfig \
 	network.subr \
 	networks \
 	newsyslog.conf \
 	nsswitch.conf \
 	phones \
 	profile \
 	protocols \
 	rc \
 	rc.bsdextended \
 	rc.firewall \
 	rc.initdiskless \
 	rc.sendmail \
 	rc.shutdown \
 	rc.subr \
 	remote \
 	rpc \
 	services \
 	shells \
 	sysctl.conf \
 	syslog.conf \
 	termcap.small
 
 .if exists(${.CURDIR}/etc.${MACHINE}/ttys)
 BIN1+=	etc.${MACHINE}/ttys
 .elif exists(${.CURDIR}/etc.${MACHINE_ARCH}/ttys)
 BIN1+=	etc.${MACHINE_ARCH}/ttys
 .elif exists(${.CURDIR}/etc.${MACHINE_CPUARCH}/ttys)
 BIN1+=	etc.${MACHINE_CPUARCH}/ttys
 .else
 .error etc.MACHINE/ttys missing
 .endif
 
 OPENBSMDIR=			${.CURDIR}/../contrib/openbsm
 BSM_ETC_OPEN_FILES=		${OPENBSMDIR}/etc/audit_class \
 				${OPENBSMDIR}/etc/audit_event
 BSM_ETC_RESTRICTED_FILES=	${OPENBSMDIR}/etc/audit_control \
 				${OPENBSMDIR}/etc/audit_user
 BSM_ETC_EXEC_FILES=		${OPENBSMDIR}/etc/audit_warn
 BSM_ETC_DIR=			${DESTDIR}/etc/security
 
 # NB: keep these sorted by MK_* knobs
 
 .if ${MK_AMD} != "no"
 BIN1+= amd.map
 .endif
 
 .if ${MK_APM} != "no"
 BIN1+= apmd.conf
 .endif
 
 .if ${MK_BSNMP} != "no"
 BIN1+= snmpd.config
 .endif
 
 .if ${MK_FREEBSD_UPDATE} != "no"
 BIN1+= freebsd-update.conf
 .endif
 
 .if ${MK_LOCATE} != "no"
 BIN1+=	${.CURDIR}/../usr.bin/locate/locate/locate.rc
 .endif
 
 .if ${MK_LPR} != "no"
 BIN1+=	hosts.lpd printcap
 .endif
 
 .if ${MK_MAIL} != "no"
 BIN1+=	${.CURDIR}/../usr.bin/mail/misc/mail.rc
 .endif
 
 .if ${MK_NTP} != "no"
 BIN1+=	ntp.conf
 .endif
 
 .if ${MK_OPENSSH} != "no"
 SSH=	${.CURDIR}/../crypto/openssh/ssh_config \
 	${.CURDIR}/../crypto/openssh/sshd_config \
 	${.CURDIR}/../crypto/openssh/moduli
 .endif
 .if ${MK_OPENSSL} != "no"
 SSL=	${.CURDIR}/../crypto/openssl/apps/openssl.cnf
 .endif
 
 .if ${MK_NS_CACHING} != "no"
 BIN1+= nscd.conf
 .endif
 
 .if ${MK_PORTSNAP} != "no"
 BIN1+= portsnap.conf
 .endif
 
 .if ${MK_PF} != "no"
 BIN1+= pf.os
 .endif
 
 .if ${MK_TCSH} != "no"
 BIN1+= csh.cshrc csh.login csh.logout
 .endif
 
 .if ${MK_WIRELESS} != "no"
 BIN1+= regdomain.xml
 .endif
 
 # -rwxr-xr-x root:wheel, for the new cron root:wheel
 BIN2=	netstart pccard_ether rc.suspend rc.resume
 
 MTREE=	BSD.include.dist BSD.root.dist BSD.usr.dist BSD.var.dist
 .if ${MK_SENDMAIL} != "no"
 MTREE+=	BSD.sendmail.dist
 .endif
 .if ${MK_BIND} != "no"
 MTREE+=	BIND.chroot.dist
 .if ${MK_BIND_LIBS} != "no"
 MTREE+=	BIND.include.dist
 .endif
 .endif
 
 PPPCNF=	ppp.conf
 
 .if ${MK_SENDMAIL} == "no"
 ETCMAIL=mailer.conf aliases
 .else
 ETCMAIL=Makefile README mailer.conf access.sample virtusertable.sample \
 	mailertable.sample aliases
 .endif
 
 # Special top level files for FreeBSD
 FREEBSD=COPYRIGHT
 
 # Sanitize DESTDIR
 DESTDIR:=	${DESTDIR:C://*:/:g}
 
 # afterinstall: unless MK_MAN is "no", run "make makedb" in ../share/man.
 afterinstall:
 .if ${MK_MAN} != "no"
 	${_+_}cd ${.CURDIR}/../share/man; ${MAKE} makedb
 .endif
 
 # distribute: re-run this Makefile's "install" and then "distribution"
 # targets with DESTDIR pointed at ${DISTDIR}/${DISTRIBUTION}.
 distribute:
 	${_+_}cd ${.CURDIR} ; ${MAKE} install DESTDIR=${DISTDIR}/${DISTRIBUTION}
 	${_+_}cd ${.CURDIR} ; ${MAKE} distribution DESTDIR=${DISTDIR}/${DISTRIBUTION}
 
 .include <bsd.endian.mk>
 .if ${TARGET_ENDIANNESS} == "1234"
 CAP_MKDB_ENDIAN?= -l
 PWD_MKDB_ENDIAN?= -L
 .elif ${TARGET_ENDIANNESS} == "4321"
 CAP_MKDB_ENDIAN?= -b
 PWD_MKDB_ENDIAN?= -B
 .else
 CAP_MKDB_ENDIAN?=
 PWD_MKDB_ENDIAN?=
 .endif
 
 .if defined(NO_ROOT)
 METALOG.add?=	cat -l >> ${METALOG}
 .endif
 
 # distribution: install the default configuration files below ${DESTDIR}
 # (mostly ${DESTDIR}/etc, plus a few files under /root, /var and /boot).
 # Optional pieces are guarded by their MK_* knobs.
 distribution:
 # Refuse to run without a DESTDIR.
 .if !defined(DESTDIR)
 	@echo "set DESTDIR before running \"make ${.TARGET}\""
 	@false
 .endif
 # BIN1 files are installed mode 644, BIN2 files mode 755, and the sensitive
 # files (master.passwd, nsmb.conf, opieaccess) mode 600; login.conf is
 # compiled with cap_mkdb right after it has been installed.
 	cd ${.CURDIR}; \
 	    ${INSTALL} -o ${BINOWN} -g ${BINGRP} -m 644 \
 		${BIN1} ${DESTDIR}/etc; \
 	    cap_mkdb ${CAP_MKDB_ENDIAN} ${DESTDIR}/etc/login.conf; \
 	    ${INSTALL} -o ${BINOWN} -g ${BINGRP} -m 755 \
 		${BIN2} ${DESTDIR}/etc; \
 	    ${INSTALL} -o ${BINOWN} -g ${BINGRP} -m 600 \
 		master.passwd nsmb.conf opieaccess ${DESTDIR}/etc;
 # Without at(1), comment out the atrun line in the installed crontab.
 .if ${MK_AT} == "no"
 	sed -i "" -e 's;.*/usr/libexec/atrun;#&;' ${DESTDIR}/etc/crontab
 .endif
 # Without tcsh, replace /bin/csh by /bin/sh in the installed master.passwd.
 .if ${MK_TCSH} == "no"
 	sed -i "" -e 's;/bin/csh;/bin/sh;' ${DESTDIR}/etc/master.passwd
 .endif
 # Run pwd_mkdb on the installed master.passwd, writing into ${DESTDIR}/etc.
 	pwd_mkdb ${PWD_MKDB_ENDIAN} -i -p -d ${DESTDIR}/etc \
 	    ${DESTDIR}/etc/master.passwd
 # With NO_ROOT, feed metadata lines for the generated database files to
 # ${METALOG.add}.
 .if defined(NO_ROOT)
 	( \
 		echo "./etc/login.conf.db type=file mode=0644 uname=root gname=wheel"; \
 		echo "./etc/passwd type=file mode=0644 uname=root gname=wheel"; \
 		echo "./etc/pwd.db type=file mode=0644 uname=root gname=wheel"; \
 		echo "./etc/spwd.db type=file mode=0600 uname=root gname=wheel"; \
 	) | ${METALOG.add}
 .endif
 # Recurse into the subdirectories that install their own files.
 .if ${MK_ATF} != "no"
 	${_+_}cd ${.CURDIR}/atf; ${MAKE} install
 .endif
 .if ${MK_BLUETOOTH} != "no"
 	${_+_}cd ${.CURDIR}/bluetooth; ${MAKE} install
 .endif
 	${_+_}cd ${.CURDIR}/defaults; ${MAKE} install
 	${_+_}cd ${.CURDIR}/devd; ${MAKE} install
 	${_+_}cd ${.CURDIR}/gss; ${MAKE} install
 	${_+_}cd ${.CURDIR}/periodic; ${MAKE} install
 	${_+_}cd ${.CURDIR}/rc.d; ${MAKE} install
 	${_+_}cd ${.CURDIR}/../gnu/usr.bin/send-pr; ${MAKE} etc-gnats-freefall
 	${_+_}cd ${.CURDIR}/../share/termcap; ${MAKE} etc-termcap
 	${_+_}cd ${.CURDIR}/../usr.sbin/rmt; ${MAKE} etc-rmt
 	${_+_}cd ${.CURDIR}/pam.d; ${MAKE} install
 # OpenBSM audit files go to ${BSM_ETC_DIR} with modes 0444, 0600 and 0500.
 	cd ${.CURDIR}; ${INSTALL} -o ${BINOWN} -g ${BINGRP} -m 0444 \
 	    ${BSM_ETC_OPEN_FILES} ${BSM_ETC_DIR}
 	cd ${.CURDIR}; ${INSTALL} -o ${BINOWN} -g ${BINGRP} -m 0600 \
 	    ${BSM_ETC_RESTRICTED_FILES} ${BSM_ETC_DIR}
 	cd ${.CURDIR}; ${INSTALL} -o ${BINOWN} -g ${BINGRP} -m 0500 \
 	    ${BSM_ETC_EXEC_FILES} ${BSM_ETC_DIR}
 # Create the /etc/namedb symlink only when nothing exists at that path yet.
 .if ${MK_BIND_MTREE} != "no"
 	if [ ! -e ${DESTDIR}/etc/namedb ]; then \
 		ln -s ../var/named/etc/namedb ${DESTDIR}/etc/namedb; \
 	fi
 .endif
 .if ${MK_BIND_ETC} != "no"
 	${_+_}cd ${.CURDIR}/namedb; ${MAKE} install
 .endif
 .if ${MK_SENDMAIL} != "no"
 	${_+_}cd ${.CURDIR}/sendmail; ${MAKE} distribution
 .endif
 .if ${MK_OPENSSH} != "no"
 	cd ${.CURDIR}; ${INSTALL} -o ${BINOWN} -g ${BINGRP} -m 644 \
 	    ${SSH} ${DESTDIR}/etc/ssh
 .endif
 .if ${MK_OPENSSL} != "no"
 	cd ${.CURDIR}; ${INSTALL} -o ${BINOWN} -g ${BINGRP} -m 644 \
 	    ${SSL} ${DESTDIR}/etc/ssl
 .endif
 .if ${MK_KERBEROS} != "no"
 	cd ${.CURDIR}/root; \
 	    ${INSTALL} -o ${BINOWN} -g ${BINGRP} -m 644 \
 		dot.k5login ${DESTDIR}/root/.k5login;
 .endif
 # root's dot files: installed under ${DESTDIR}/root, then hard-linked into
 # ${DESTDIR}/ (the old link is removed first).
 	cd ${.CURDIR}/root; \
 	    ${INSTALL} -o ${BINOWN} -g ${BINGRP} -m 644 \
 		dot.profile ${DESTDIR}/root/.profile; \
 	    rm -f ${DESTDIR}/.profile; \
 	    ln ${DESTDIR}/root/.profile ${DESTDIR}/.profile
 .if ${MK_TCSH} != "no"
 	cd ${.CURDIR}/root; \
 	    ${INSTALL} -o ${BINOWN} -g ${BINGRP} -m 644 \
 		dot.cshrc ${DESTDIR}/root/.cshrc; \
 	    ${INSTALL} -o ${BINOWN} -g ${BINGRP} -m 644 \
 		dot.login ${DESTDIR}/root/.login; \
 	    rm -f ${DESTDIR}/.cshrc; \
 	    ln ${DESTDIR}/root/.cshrc ${DESTDIR}/.cshrc
 .endif
 	cd ${.CURDIR}/mtree; ${INSTALL} -o ${BINOWN} -g ${BINGRP} -m 444 \
 	    ${MTREE} ${DESTDIR}/etc/mtree
 .if ${MK_PPP} != "no"
 	cd ${.CURDIR}/ppp; ${INSTALL} -o ${BINOWN} -g ${BINGRP} -m 600 \
 	    ${PPPCNF} ${DESTDIR}/etc/ppp
 .endif
 # Mail files; /etc/aliases becomes a symlink to mail/aliases unless a regular
 # file named /etc/aliases is already there.
 .if ${MK_MAIL} != "no"
 	cd ${.CURDIR}/mail; ${INSTALL} -o ${BINOWN} -g ${BINGRP} -m 644 \
 	    ${ETCMAIL} ${DESTDIR}/etc/mail
 	if [ -d ${DESTDIR}/etc/mail -a -f ${DESTDIR}/etc/mail/aliases -a \
 	      ! -f ${DESTDIR}/etc/aliases ]; then \
 		ln -s mail/aliases ${DESTDIR}/etc/aliases; \
 	fi
 .endif
 # Installing /dev/null yields empty dumpdates and locate.database files with
 # the requested owner, group and mode.
 	${INSTALL} -o ${BINOWN} -g operator -m 664 /dev/null \
 	    ${DESTDIR}/etc/dumpdates
 	${INSTALL} -o nobody -g ${BINGRP} -m 644 /dev/null \
 	    ${DESTDIR}/var/db/locate.database
 	${INSTALL} -o ${BINOWN} -g ${BINGRP} -m 644 ${.CURDIR}/minfree \
 	    ${DESTDIR}/var/crash
 	cd ${.CURDIR}/..; ${INSTALL} -o ${BINOWN} -g ${BINGRP} -m 444 \
 		${FREEBSD} ${DESTDIR}/
 # Install GENERIC.hints as /boot/device.hints when the machine provides one.
 .if ${MK_BOOT} != "no"
 .if exists(${.CURDIR}/../sys/${MACHINE}/conf/GENERIC.hints)
 	${INSTALL} -o ${BINOWN} -g ${BINGRP} -m 444 \
 	    ${.CURDIR}/../sys/${MACHINE}/conf/GENERIC.hints \
 	    ${DESTDIR}/boot/device.hints
 .endif
 .endif
 # Without NIS, comment out the *_compat lines in nsswitch.conf and replace a
 # trailing "compat" by "files".
 .if ${MK_NIS} == "no"
 	sed -i "" -e 's/.*_compat:/# &/' -e 's/compat$$/files/' \
 		${DESTDIR}/etc/nsswitch.conf
 .endif
 
 MTREE_CMD?=	mtree
 
 MTREES=		mtree/BSD.root.dist		/		\
 		mtree/BSD.var.dist		/var		\
 		mtree/BSD.usr.dist		/usr		\
 		mtree/BSD.include.dist		/usr/include
 .if ${MK_BIND_LIBS} != "no"
 MTREES+=	mtree/BIND.include.dist		/usr/include
 .endif
 .if ${MK_BIND_MTREE} != "no"
 MTREES+=	mtree/BIND.chroot.dist		/var/named
 .endif
 .if ${MK_GROFF} != "no"
 MTREES+=	mtree/BSD.groff.dist		/usr
 .endif
 .if ${MK_SENDMAIL} != "no"
 MTREES+=	mtree/BSD.sendmail.dist		/
 .endif
 .for mtree in ${LOCAL_MTREE}
 MTREES+=	../${mtree}			/
 .endfor
 
 # distrib-dirs: walk ${MTREES} as (mtree spec, directory) pairs and run
 # ${MTREE_CMD} -deU on each pair below ${DESTDIR}, then create the symlinks
 # listed further down (man page directories, man.alias and nls.alias entries).
 distrib-dirs:
 	@set ${MTREES}; \
 	while test $$# -ge 2; do \
 		m=${.CURDIR}/$$1; \
 		shift; \
 		d=${DESTDIR}$$1; \
 		shift; \
 		${ECHO} ${MTREE_CMD} -deU ${MTREE_FOLLOWS_SYMLINKS} \
 		    -f $$m -p $$d; \
 		${MTREE_CMD} -deU ${MTREE_FOLLOWS_SYMLINKS} -f $$m -p $$d; \
 	done; true
 # With NO_ROOT the same pairs are also dumped (mtree -C) into the metalog,
 # with the leading "." of every path rewritten to ".${DISTBASE}<directory>".
 # NOTE(review): the loop below uses `test "$$d" == "/"`; "==" is not part of
 # POSIX test(1) ("=" is) -- confirm every shell used for this target accepts it.
 .if defined(NO_ROOT)
 	@set ${MTREES}; \
 	while test $$# -ge 2; do \
 		m=${.CURDIR}/$$1; \
 		shift; \
 		d=$$1; \
 		test "$$d" == "/" && d=""; \
 		d=${DISTBASE}$$d; \
 		shift; \
 		${ECHO} "${MTREE_CMD:N-W} -C -f $$m -K uname,gname | " \
 		    "sed s#^\.#.$$d# | ${METALOG.add}" ; \
 		${MTREE_CMD:N-W} -C -f $$m -K uname,gname | sed s#^\.#.$$d# | \
 		    ${METALOG.add} ; \
 	done; true
 .endif
-	${INSTALL_SYMLINK} usr/src/sys ${DESTDIR}/
+	${INSTALL_SYMLINK} usr/src/sys ${DESTDIR}/sys
 	cd ${DESTDIR}/usr/share/man; \
 	for mandir in man*; do \
 		${INSTALL_SYMLINK} ../$$mandir \
 		    ${DESTDIR}/usr/share/man/en.ISO8859-1/; \
 		${INSTALL_SYMLINK} ../$$mandir \
 		    ${DESTDIR}/usr/share/man/en.UTF-8/; \
 	done
 	cd ${DESTDIR}/usr/share/openssl/man; \
 	for mandir in man*; do \
 		${INSTALL_SYMLINK} ../$$mandir \
 		    ${DESTDIR}/usr/share/openssl/man/en.ISO8859-1/; \
 	done
 	set - `grep "^[a-zA-Z]" ${.CURDIR}/man.alias`; \
 	while [ $$# -gt 0 ] ; do \
 		${INSTALL_SYMLINK} "$$2" "${DESTDIR}/usr/share/man/$$1"; \
 		${INSTALL_SYMLINK} "$$2" \
 		    "${DESTDIR}/usr/share/openssl/man/$$1"; \
 		shift; shift; \
 	done
 	set - `grep "^[a-zA-Z]" ${.CURDIR}/nls.alias`; \
 	while [ $$# -gt 0 ] ; do \
 		${INSTALL_SYMLINK} "$$2" "${DESTDIR}/usr/share/nls/$$1"; \
 		shift; shift; \
 	done
 
 # etc-examples: install read-only (mode 444) copies of the BIN1/BIN2 files,
 # nsmb.conf and opieaccess into ${DESTDIR}/usr/share/examples/etc, then run
 # "make install" in defaults/ with DESTDIR set to ${DESTDIR}/usr/share/examples.
 etc-examples:
 	cd ${.CURDIR}; ${INSTALL} -o ${BINOWN} -g ${BINGRP} -m 444 \
 	    ${BIN1} ${BIN2} nsmb.conf opieaccess \
 	    ${DESTDIR}/usr/share/examples/etc
 	${_+_}cd ${.CURDIR}/defaults; ${MAKE} install \
 	    DESTDIR=${DESTDIR}/usr/share/examples
 
 .include <bsd.prog.mk>
Index: user/attilio/vmobj-rwlock/lib/libc/stdlib/bsearch.3
===================================================================
--- user/attilio/vmobj-rwlock/lib/libc/stdlib/bsearch.3	(revision 247191)
+++ user/attilio/vmobj-rwlock/lib/libc/stdlib/bsearch.3	(revision 247192)
@@ -1,89 +1,95 @@
 .\" Copyright (c) 1990, 1991, 1993, 1994
 .\"	The Regents of the University of California.  All rights reserved.
 .\"
 .\" This code is derived from software contributed to Berkeley by
 .\" the American National Standards Committee X3, on Information
 .\" Processing Systems.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice, this list of conditions and the following disclaimer.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\" 4. Neither the name of the University nor the names of its contributors
 .\"    may be used to endorse or promote products derived from this software
 .\"    without specific prior written permission.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 .\" ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
 .\"     @(#)bsearch.3	8.3 (Berkeley) 4/19/94
 .\" $FreeBSD$
 .\"
-.Dd April 19, 1994
+.Dd February 22, 2013
 .Dt BSEARCH 3
 .Os
 .Sh NAME
 .Nm bsearch
 .Nd binary search of a sorted table
 .Sh LIBRARY
 .Lb libc
 .Sh SYNOPSIS
 .In stdlib.h
 .Ft void *
 .Fn bsearch "const void *key" "const void *base" "size_t nmemb" "size_t size" "int (*compar) (const void *, const void *)"
 .Sh DESCRIPTION
 The
 .Fn bsearch
 function searches an array of
 .Fa nmemb
 objects, the initial member of which is
 pointed to by
 .Fa base ,
 for a member that matches the object pointed to by
 .Fa key .
 The size of each member of the array is specified by
 .Fa size .
 .Pp
 The contents of the array should be in ascending sorted order according
 to the comparison function referenced by
 .Fa compar .
 The
 .Fa compar
 routine
 is expected to have
 two arguments which point to the
 .Fa key
 object and to an array member, in that order, and should return an integer
 less than, equal to, or greater than zero if the
 .Fa key
 object is found, respectively, to be less than, to match, or be
 greater than the array member.
+See the
+.Fa int_compare
+sample function in
+.Xr qsort 3
+for a comparison function that is also compatible with
+.Fn bsearch .
 .Sh RETURN VALUES
 The
 .Fn bsearch
 function returns a pointer to a matching member of the array, or a null
 pointer if no match is found.
 If two members compare as equal, which member is matched is unspecified.
 .Sh SEE ALSO
 .Xr db 3 ,
 .Xr lsearch 3 ,
 .Xr qsort 3
 .\" .Xr tsearch 3
 .Sh STANDARDS
 The
 .Fn bsearch
 function conforms to
 .St -isoC .
Index: user/attilio/vmobj-rwlock/lib/libc
===================================================================
--- user/attilio/vmobj-rwlock/lib/libc	(revision 247191)
+++ user/attilio/vmobj-rwlock/lib/libc	(revision 247192)

Property changes on: user/attilio/vmobj-rwlock/lib/libc
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head/lib/libc:r247097-247191
Index: user/attilio/vmobj-rwlock/share/man/man9/VFS_SET.9
===================================================================
--- user/attilio/vmobj-rwlock/share/man/man9/VFS_SET.9	(revision 247191)
+++ user/attilio/vmobj-rwlock/share/man/man9/VFS_SET.9	(revision 247192)
@@ -1,108 +1,111 @@
 .\"
 .\" Copyright (C) 2001 Chad David <davidc@acns.ab.ca>. All rights reserved.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice(s), this list of conditions and the following disclaimer as
 .\"    the first lines of this file unmodified other than the possible
 .\"    addition of one or more copyright notices.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice(s), this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
 .\" EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 .\" WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 .\" DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
 .\" DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 .\" (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 .\" SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 .\" CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 .\" DAMAGE.
 .\"
 .\" $FreeBSD$
 .\"
-.Dd January 28, 2013
+.Dd February 21, 2013
 .Dt VFS_SET 9
 .Os
 .Sh NAME
 .Nm VFS_SET
 .Nd set up loadable file system
 .Vt vfsconf
 .Sh SYNOPSIS
 .In sys/param.h
 .In sys/kernel.h
 .In sys/module.h
 .In sys/mount.h
 .Ft void
 .Fn VFS_SET "struct vfsops *vfsops" "fsname" "int flags"
 .Sh DESCRIPTION
 .Fn VFS_SET
 creates a
 .Vt vfsconf
 structure for the loadable module with the given
 .Fa vfsops , fsname
 and
 .Fa flags ,
 and declares it by calling
 .Xr DECLARE_MODULE 9
 using
 .Fn vfs_modevent
 as the event handler.
 .Pp
 Possible values for the
 .Fa flags
 argument are:
 .Bl -hang -width ".Dv VFCF_DELEGADMIN"
 .It Dv VFCF_STATIC
 File system should be statically available in the kernel.
 .It Dv VFCF_NETWORK
 Network exportable file system.
 .It Dv VFCF_READONLY
 Does not support write operations.
 .It Dv VFCF_SYNTHETIC
 Pseudo file system, data does not represent on-disk files.
 .It Dv VFCF_LOOPBACK
 Loopback file system layer.
 .It Dv VFCF_UNICODE
 File names are stored as Unicode.
 .It Dv VFCF_JAIL
 Can be mounted from within a jail if
 .Va security.jail.mount_allowed
 sysctl is set to
 .Dv 1 .
 .It Dv VFCF_DELEGADMIN
 Supports delegated administration if
 .Va vfs.usermount
 sysctl is set to
 .Dv 1 .
+.It Dv VFCF_SBDRY
+While a thread is executing a VFS method, its suspension is deferred to the
+user boundary when a stop action arrives.
 .El
 .Sh PSEUDOCODE
 .Bd -literal
 /*
  * Fill in the fields for which we have special methods.
  * The others are initially null.  This tells vfs to change them to
  * pointers to vfs_std* functions during file system registration.
  */
 static struct vfsops myfs_vfsops = {
         .vfs_mount =    myfs_mount,
         .vfs_root =     myfs_root,
         .vfs_statfs =   myfs_statfs,
         .vfs_unmount =  myfs_unmount,
 };
 
 VFS_SET(myfs_vfsops, myfs, 0);
 .Ed
 .Sh SEE ALSO
 .Xr jail 2 ,
 .Xr jail 8 ,
 .Xr DECLARE_MODULE 9 ,
 .Xr vfsconf 9 ,
 .Xr vfs_modevent 9
 .Sh AUTHORS
 This manual page was written by
 .An Chad David Aq davidc@acns.ab.ca .
Index: user/attilio/vmobj-rwlock/sys/cam/ata/ata_pmp.c
===================================================================
--- user/attilio/vmobj-rwlock/sys/cam/ata/ata_pmp.c	(revision 247191)
+++ user/attilio/vmobj-rwlock/sys/cam/ata/ata_pmp.c	(revision 247192)
@@ -1,785 +1,787 @@
 /*-
  * Copyright (c) 2009 Alexander Motin <mav@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer,
  *    without modification, immediately at the beginning of the file.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 
 #ifdef _KERNEL
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/bio.h>
 #include <sys/sysctl.h>
 #include <sys/taskqueue.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/conf.h>
 #include <sys/devicestat.h>
 #include <sys/eventhandler.h>
 #include <sys/malloc.h>
 #include <sys/cons.h>
 #include <geom/geom_disk.h>
 #endif /* _KERNEL */
 
 #ifndef _KERNEL
 #include <stdio.h>
 #include <string.h>
 #endif /* _KERNEL */
 
 #include <cam/cam.h>
 #include <cam/cam_ccb.h>
 #include <cam/cam_periph.h>
 #include <cam/cam_xpt_periph.h>
 #include <cam/cam_xpt_internal.h>
 #include <cam/cam_sim.h>
 
 #include <cam/ata/ata_all.h>
 
 #ifdef _KERNEL
 
 /*
  * Steps of the probe/configuration state machine that pmpstart() issues
  * commands for and pmpdone() advances.  The declaration order is
  * significant: both functions use min(state, PMP_STATE_PRECONFIG) to rewind
  * a run that is already past PRECONFIG when a restart is requested.
  */
 typedef enum {
 	PMP_STATE_NORMAL,	/* idle; pmpdone() ends every run here */
 	PMP_STATE_PORTS,	/* read the fan-out port count */
 	PMP_STATE_PRECONFIG,	/* refresh host caps, write 0 to reg 0x60 */
 	PMP_STATE_RESET,	/* per-port write of register 2 */
 	PMP_STATE_CONNECT,	/* per-port write of register 2 (speed only) */
 	PMP_STATE_CHECK,	/* per-port read of register 0, with retries */
 	PMP_STATE_CLEAR,	/* per-port write of 0xFFFFFFFF to register 1 */
 	PMP_STATE_CONFIG,	/* final write to reg 0x60, then notify/rescan */
 	PMP_STATE_SCAN		/* not referenced by the code in this file view */
 } pmp_state;
 
 /* Bits kept in pmp_softc.flags. */
 typedef enum {
 	/* Set by pmpsysctlinit() once sysctl_ctx is initialized. */
 	PMP_FLAG_SCTX_INIT	= 0x200
 } pmp_flags;
 
 /* NOTE(review): PMP_CCB_PROBE is not used in the visible code -- confirm. */
 typedef enum {
 	PMP_CCB_PROBE		= 0x01,
 } pmp_ccb_state;
 
 /* Offsets into our private area for storing information */
 #define ccb_state	ppriv_field0
 #define ccb_bp		ppriv_ptr1
 
 /*
  * Per-instance state of one port multiplier peripheral.  Allocated zeroed
  * in pmpregister() and released in pmpcleanup().
  */
 struct pmp_softc {
 	SLIST_ENTRY(pmp_softc)	links;
 	pmp_state		state;		/* current state machine step */
 	pmp_flags		flags;		/* PMP_FLAG_* bits */
 	uint32_t		pm_pid;		/* ident_data word 0; matched against
 						 * known IDs for port-count quirks */
 	uint32_t		pm_prv;		/* ident_data word 1 */
 	int			pm_ports;	/* number of fan-out ports in use */
 	int			pm_step;	/* port index the per-port states
 						 * are currently working on */
 	int			pm_try;		/* CHECK-state retries for pm_step */
 	int			found;		/* bitmask: ports with a device */
 	int			reset;		/* bitmask: ports that were not in
 						 * 'found' when PRECONFIG completed */
 	int			frozen;		/* bitmask: ports frozen by
 						 * pmpfreeze(), thawed by pmprelease() */
 	int			restart;	/* set by pmpasync() when an event
 						 * arrives while a run is in progress */
 	int			events;		/* pending PMP_EV_* bits */
 #define PMP_EV_RESET	1
 #define PMP_EV_RESCAN	2
 	u_int			caps;		/* host SATA caps read in PRECONFIG */
 	struct task		sysctl_task;	/* runs pmpsysctlinit() */
 	struct sysctl_ctx_list	sysctl_ctx;
 	struct sysctl_oid	*sysctl_tree;	/* node created by pmpsysctlinit() */
 };
 
 static	periph_init_t	pmpinit;
 static	void		pmpasync(void *callback_arg, u_int32_t code,
 				struct cam_path *path, void *arg);
 static	void		pmpsysctlinit(void *context, int pending);
 static	periph_ctor_t	pmpregister;
 static	periph_dtor_t	pmpcleanup;
 static	periph_start_t	pmpstart;
 static	periph_oninv_t	pmponinvalidate;
 static	void		pmpdone(struct cam_periph *periph,
 			       union ccb *done_ccb);
 
 #ifndef PMP_DEFAULT_TIMEOUT
 #define PMP_DEFAULT_TIMEOUT 30	/* Timeout in seconds */
 #endif
 
 #ifndef	PMP_DEFAULT_RETRY
 #define	PMP_DEFAULT_RETRY	1
 #endif
 
 #ifndef	PMP_DEFAULT_HIDE_SPECIAL
 #define	PMP_DEFAULT_HIDE_SPECIAL	1
 #endif
 
 /*
  * Driver-wide knobs.  Each one is exported both as a read/write sysctl under
  * kern.cam.pmp and as a tunable of the same name.
  *   pmp_retry_count     - retry count handed to cam_fill_ataio()
  *   pmp_default_timeout - command timeout in seconds (multiplied by 1000
  *                         when passed to cam_fill_ataio())
  *   pmp_hide_special    - when non-zero, pmpdone() reduces the reported port
  *                         count for known multipliers with extra ports
  */
 static int pmp_retry_count = PMP_DEFAULT_RETRY;
 static int pmp_default_timeout = PMP_DEFAULT_TIMEOUT;
 static int pmp_hide_special = PMP_DEFAULT_HIDE_SPECIAL;
 
 /*
  * NOTE(review): the node description says "Direct Access Disk driver",
  * which looks copied from a disk driver rather than describing the port
  * multiplier driver -- confirm and correct separately if desired.
  */
 static SYSCTL_NODE(_kern_cam, OID_AUTO, pmp, CTLFLAG_RD, 0,
             "CAM Direct Access Disk driver");
 SYSCTL_INT(_kern_cam_pmp, OID_AUTO, retry_count, CTLFLAG_RW,
            &pmp_retry_count, 0, "Normal I/O retry count");
 TUNABLE_INT("kern.cam.pmp.retry_count", &pmp_retry_count);
 SYSCTL_INT(_kern_cam_pmp, OID_AUTO, default_timeout, CTLFLAG_RW,
            &pmp_default_timeout, 0, "Normal I/O timeout (in seconds)");
 TUNABLE_INT("kern.cam.pmp.default_timeout", &pmp_default_timeout);
 SYSCTL_INT(_kern_cam_pmp, OID_AUTO, hide_special, CTLFLAG_RW,
            &pmp_hide_special, 0, "Hide extra ports");
 TUNABLE_INT("kern.cam.pmp.hide_special", &pmp_hide_special);
 
 /*
  * Peripheral driver descriptor for "pmp": pmpinit() is its init routine and
  * the unit list starts out empty.  It is handed to PERIPHDRIVER_DECLARE()
  * right after this definition.
  */
 static struct periph_driver pmpdriver =
 {
 	pmpinit, "pmp",
 	TAILQ_HEAD_INITIALIZER(pmpdriver.units), /* generation */ 0,
 	CAM_PERIPH_DRV_EARLY
 };
 
 PERIPHDRIVER_DECLARE(pmp, pmpdriver);
 
 static MALLOC_DEFINE(M_ATPMP, "ata_pmp", "ata_pmp buffers");
 
 /*
  * Driver-wide init routine (referenced from pmpdriver).
  *
  * Registers pmpasync() as a global AC_FOUND_DEVICE callback.  A failure is
  * only reported on the console; nothing is returned to the caller.
  */
 static void
 pmpinit(void)
 {
 	cam_status status;
 
 	/*
 	 * Install a global async callback.  This callback will
 	 * receive async callbacks like "new device found".
 	 */
 	status = xpt_register_async(AC_FOUND_DEVICE, pmpasync, NULL, NULL);
 
 	if (status != CAM_REQ_CMP) {
 		printf("pmp: Failed to attach master async callback "
 		       "due to status 0x%x!\n", status);
 	}
 }
 
 /*
  * Freeze the device queues of the child ports selected by 'mask'.
  *
  * periph - the port multiplier peripheral; its softc tracks frozen ports.
  * mask   - bitmask of port indexes (bit i = port i, ports 0..14 only).
  *
  * Ports already recorded in softc->frozen are skipped, so a port is never
  * frozen twice by this function.  A port whose path cannot be created is
  * silently left unfrozen.
  */
 static void
 pmpfreeze(struct cam_periph *periph, int mask)
 {
 	struct pmp_softc *softc = (struct pmp_softc *)periph->softc;
 	struct cam_path *dpath;
 	int i;
 
 	/* Drop ports that are frozen already. */
 	mask &= ~softc->frozen;
 	for (i = 0; i < 15; i++) {
 		if ((mask & (1 << i)) == 0)
 			continue;
 		if (xpt_create_path(&dpath, periph,
 		    xpt_path_path_id(periph->path),
 		    i, 0) == CAM_REQ_CMP) {
 			softc->frozen |= (1 << i);
 			/* Paired with xpt_release_device() in pmprelease(). */
 			xpt_acquire_device(dpath->device);
 			cam_freeze_devq_arg(dpath,
 			    RELSIM_RELEASE_RUNLEVEL, CAM_RL_BUS + 1);
 			xpt_free_path(dpath);
 		}
 	}
 }
 
 /*
  * Undo pmpfreeze() for the child ports selected by 'mask'.
  *
  * periph - the port multiplier peripheral.
  * mask   - bitmask of port indexes (bit i = port i, ports 0..14 only);
  *          callers pass -1 to release everything that is frozen.
  *
  * Only ports currently recorded in softc->frozen are touched.  If the path
  * for a port cannot be created the port stays marked as frozen.
  */
 static void
 pmprelease(struct cam_periph *periph, int mask)
 {
 	struct pmp_softc *softc = (struct pmp_softc *)periph->softc;
 	struct cam_path *dpath;
 	int i;
 
 	/* Restrict the request to ports that are actually frozen. */
 	mask &= softc->frozen;
 	for (i = 0; i < 15; i++) {
 		if ((mask & (1 << i)) == 0)
 			continue;
 		if (xpt_create_path(&dpath, periph,
 		    xpt_path_path_id(periph->path),
 		    i, 0) == CAM_REQ_CMP) {
 			softc->frozen &= ~(1 << i);
 			cam_release_devq(dpath,
 			    RELSIM_RELEASE_RUNLEVEL, 0, CAM_RL_BUS + 1, FALSE);
 			/* Drops the reference taken in pmpfreeze(). */
 			xpt_release_device(dpath->device);
 			xpt_free_path(dpath);
 		}
 	}
 }
 
 /*
  * Invalidation callback passed to cam_periph_alloc() in pmpasync().
  *
  * Unregisters this instance's async callback, posts AC_LOST_DEVICE for
  * targets 0..14 on the multiplier's bus, and releases every port queue
  * that pmpfreeze() still holds frozen.
  */
 static void
 pmponinvalidate(struct cam_periph *periph)
 {
 	struct cam_path *dpath;
 	int i;
 
 	/*
 	 * De-register any async callbacks.
 	 */
 	xpt_register_async(0, pmpasync, periph, periph->path);
 
 	/* Tell the transport that every possible child port is gone. */
 	for (i = 0; i < 15; i++) {
 		if (xpt_create_path(&dpath, periph,
 		    xpt_path_path_id(periph->path),
 		    i, 0) == CAM_REQ_CMP) {
 			xpt_async(AC_LOST_DEVICE, dpath, NULL);
 			xpt_free_path(dpath);
 		}
 	}
 	/* -1 selects all ports; pmprelease() masks it with softc->frozen. */
 	pmprelease(periph, -1);
 	xpt_print(periph->path, "lost device\n");
 }
 
 /*
  * Destructor callback passed to cam_periph_alloc() in pmpasync().
  *
  * Frees the sysctl context (if pmpsysctlinit() created one) and the softc.
  * The periph lock is dropped for the duration of the teardown and taken
  * again before returning.
  * NOTE(review): presumably the unlock is needed because sysctl_ctx_free()
  * may sleep -- confirm.
  */
 static void
 pmpcleanup(struct cam_periph *periph)
 {
 	struct pmp_softc *softc;
 
 	softc = (struct pmp_softc *)periph->softc;
 
 	xpt_print(periph->path, "removing device entry\n");
 	cam_periph_unlock(periph);
 
 	/*
 	 * If we can't free the sysctl tree, oh well...
 	 */
 	if ((softc->flags & PMP_FLAG_SCTX_INIT) != 0
 	    && sysctl_ctx_free(&softc->sysctl_ctx) != 0) {
 		xpt_print(periph->path, "can't remove sysctl context\n");
 	}
 
 	/* M_DEVBUF matches the malloc type used in pmpregister(). */
 	free(softc, M_DEVBUF);
 	cam_periph_lock(periph);
 }
 
 /*
  * Async event callback.
  *
  * callback_arg - NULL for the global registration made in pmpinit();
  *                the owning cam_periph for the per-instance registration
  *                made in pmpregister().
  * code         - AC_* event code.
  * path         - path the event refers to.
  * arg          - event payload; a struct ccb_getdev for AC_FOUND_DEVICE.
  *
  * AC_FOUND_DEVICE attaches a new "pmp" peripheral to PROTO_SATAPM devices.
  * AC_SCSI_AEN / AC_SENT_BDR / AC_BUS_RESET record the event and start (or
  * flag a restart of) the configuration state machine.  Everything else is
  * passed to cam_periph_async().
  */
 static void
 pmpasync(void *callback_arg, u_int32_t code,
 	struct cam_path *path, void *arg)
 {
 	struct cam_periph *periph;
 	struct pmp_softc *softc;
 
 	periph = (struct cam_periph *)callback_arg;
 	switch (code) {
 	case AC_FOUND_DEVICE:
 	{
 		struct ccb_getdev *cgd;
 		cam_status status;
  
 		cgd = (struct ccb_getdev *)arg;
 		if (cgd == NULL)
 			break;
 
 		/* Only port multipliers are of interest to this driver. */
 		if (cgd->protocol != PROTO_SATAPM)
 			break;
 
 		/*
 		 * Allocate a peripheral instance for
 		 * this device and start the probe
 		 * process.
 		 */
 		status = cam_periph_alloc(pmpregister, pmponinvalidate,
 					  pmpcleanup, pmpstart,
 					  "pmp", CAM_PERIPH_BIO,
 					  cgd->ccb_h.path, pmpasync,
 					  AC_FOUND_DEVICE, cgd);
 
 		if (status != CAM_REQ_CMP
 		 && status != CAM_REQ_INPROG)
 			printf("pmpasync: Unable to attach to new device "
 				"due to status 0x%x\n", status);
 		break;
 	}
 	case AC_SCSI_AEN:
 	case AC_SENT_BDR:
 	case AC_BUS_RESET:
 		softc = (struct pmp_softc *)periph->softc;
 		cam_periph_async(periph, code, path, arg);
 		/* Remember what the next CONFIG step has to do. */
 		if (code == AC_SCSI_AEN)
 			softc->events |= PMP_EV_RESCAN;
 		else
 			softc->events |= PMP_EV_RESET;
 		/*
 		 * A notification that arrives while a run is already in
 		 * progress is only recorded in 'events'; it does not freeze
 		 * ports or request a restart.
 		 */
 		if (code == AC_SCSI_AEN && softc->state != PMP_STATE_NORMAL)
 			break;
 		xpt_hold_boot();
 		/* Stop I/O to the known children while reconfiguring. */
 		pmpfreeze(periph, softc->found);
 		if (code == AC_SENT_BDR || code == AC_BUS_RESET)
 			softc->found = 0; /* We have to reset everything. */
 		if (softc->state == PMP_STATE_NORMAL) {
 			/*
 			 * Idle: start a new run.  The reference taken here
 			 * is dropped at the end of pmpdone().
 			 */
 			softc->state = PMP_STATE_PRECONFIG;
 			cam_periph_acquire(periph);
 			xpt_schedule(periph, CAM_PRIORITY_DEV);
 		} else
 			/* Busy: pmpstart()/pmpdone() rewind to PRECONFIG. */
 			softc->restart = 1;
 		break;
 	default:
 		cam_periph_async(periph, code, path, arg);
 		break;
 	}
 }
 
 /*
  * Task handler (installed with TASK_INIT() in pmpregister()) that creates
  * the per-unit sysctl node under kern.cam.pmp.
  *
  * context - the struct cam_periph the node belongs to.
  * pending - unused.
  *
  * Holds a periph reference for the duration of the call; returns without
  * doing anything if the reference cannot be obtained.
  */
 static void
 pmpsysctlinit(void *context, int pending)
 {
 	struct cam_periph *periph;
 	struct pmp_softc *softc;
 	char tmpstr[80], tmpstr2[80];
 
 	periph = (struct cam_periph *)context;
 	if (cam_periph_acquire(periph) != CAM_REQ_CMP)
 		return;
 
 	softc = (struct pmp_softc *)periph->softc;
 	/* tmpstr is the node description, tmpstr2 the node name (unit no.). */
 	snprintf(tmpstr, sizeof(tmpstr), "CAM PMP unit %d", periph->unit_number);
 	snprintf(tmpstr2, sizeof(tmpstr2), "%d", periph->unit_number);
 
 	sysctl_ctx_init(&softc->sysctl_ctx);
 	/* Lets pmpcleanup() know there is a context to free. */
 	softc->flags |= PMP_FLAG_SCTX_INIT;
 	softc->sysctl_tree = SYSCTL_ADD_NODE(&softc->sysctl_ctx,
 		SYSCTL_STATIC_CHILDREN(_kern_cam_pmp), OID_AUTO, tmpstr2,
 		CTLFLAG_RD, 0, tmpstr);
 	if (softc->sysctl_tree == NULL) {
 		printf("pmpsysctlinit: unable to allocate sysctl tree\n");
 		cam_periph_release(periph);
 		return;
 	}
 
 	cam_periph_release(periph);
 }
 
 /*
  * Constructor callback passed to cam_periph_alloc() in pmpasync().
  *
  * periph - the peripheral being created.
  * arg    - struct ccb_getdev describing the device; must not be NULL.
  *
  * Allocates the softc, records the first two identify words, registers the
  * per-instance async callback and kicks off the probe in PMP_STATE_PORTS.
  *
  * Returns CAM_REQ_CMP on success, CAM_REQ_CMP_ERR if 'arg' is NULL or the
  * softc cannot be allocated.
  */
 static cam_status
 pmpregister(struct cam_periph *periph, void *arg)
 {
 	struct pmp_softc *softc;
 	struct ccb_getdev *cgd;
 
 	cgd = (struct ccb_getdev *)arg;
 	if (cgd == NULL) {
 		printf("pmpregister: no getdev CCB, can't register device\n");
 		return(CAM_REQ_CMP_ERR);
 	}
 
 	/* M_NOWAIT: the allocation may fail instead of sleeping. */
 	softc = (struct pmp_softc *)malloc(sizeof(*softc), M_DEVBUF,
 	    M_NOWAIT|M_ZERO);
 
 	if (softc == NULL) {
 		printf("pmpregister: Unable to probe new device. "
 		       "Unable to allocate softc\n");				
 		return(CAM_REQ_CMP_ERR);
 	}
 	periph->softc = softc;
 
 	/* The first two 32-bit words of the identify data. */
 	softc->pm_pid = ((uint32_t *)&cgd->ident_data)[0];
 	softc->pm_prv = ((uint32_t *)&cgd->ident_data)[1];
 	TASK_INIT(&softc->sysctl_task, 0, pmpsysctlinit, periph);
 
 	xpt_announce_periph(periph, NULL);
 
 	/*
 	 * Add async callbacks for bus reset and
 	 * bus device reset calls.  I don't bother
 	 * checking if this fails as, in most cases,
 	 * the system will function just fine without
 	 * them and the only alternative would be to
 	 * not attach the device on failure.
 	 */
 	xpt_register_async(AC_SENT_BDR | AC_BUS_RESET | AC_LOST_DEVICE |
 		AC_SCSI_AEN, pmpasync, periph, periph->path);
 
 	/*
 	 * Take an exclusive refcount on the periph while pmpstart is called
 	 * to finish the probe.  The reference will be dropped in pmpdone at
 	 * the end of probe.
 	 */
 	(void)cam_periph_acquire(periph);
 	/* Balanced by xpt_release_boot() at the end of pmpdone(). */
 	xpt_hold_boot();
 	softc->state = PMP_STATE_PORTS;
 	/* The first run always rescans the child ports. */
 	softc->events = PMP_EV_RESCAN;
 	xpt_schedule(periph, CAM_PRIORITY_DEV);
 
 	return(CAM_REQ_CMP);
 }
 
 /*
  * Start callback passed to cam_periph_alloc() in pmpasync().
  *
  * periph    - the port multiplier peripheral.
  * start_ccb - CCB to fill in and dispatch.
  *
  * Builds the ATA command that belongs to the current softc->state and hands
  * it to xpt_action().  Every command is a non-data command (CAM_DIR_NONE)
  * completing in pmpdone(), which decides the next state.  The calls below
  * are consistent with ata_pm_read_cmd(ccb, reg, port) and
  * ata_pm_write_cmd(ccb, reg, port, value); port 15 addresses the
  * multiplier itself -- NOTE(review): inferred from usage, confirm against
  * ata_all.h.
  */
 static void
 pmpstart(struct cam_periph *periph, union ccb *start_ccb)
 {
 	struct ccb_trans_settings cts;
 	struct ccb_ataio *ataio;
 	struct pmp_softc *softc;
 	struct cam_path *dpath;
 	int revision = 0;
 
 	softc = (struct pmp_softc *)periph->softc;
 	ataio = &start_ccb->ataio;
 
 	CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("pmpstart\n"));
 
 	/* An event arrived mid-run: rewind to PRECONFIG if already past it. */
 	if (softc->restart) {
 		softc->restart = 0;
 		softc->state = min(softc->state, PMP_STATE_PRECONFIG);
 	}
 	/* Fetch user wanted device speed. */
 	if (softc->state == PMP_STATE_RESET ||
 	    softc->state == PMP_STATE_CONNECT) {
 		if (xpt_create_path(&dpath, periph,
 		    xpt_path_path_id(periph->path),
 		    softc->pm_step, 0) == CAM_REQ_CMP) {
 			bzero(&cts, sizeof(cts));
 			xpt_setup_ccb(&cts.ccb_h, dpath, CAM_PRIORITY_NONE);
 			cts.ccb_h.func_code = XPT_GET_TRAN_SETTINGS;
 			cts.type = CTS_TYPE_USER_SETTINGS;
 			xpt_action((union ccb *)&cts);
 			/* 'revision' stays 0 when no user setting is valid. */
 			if (cts.xport_specific.sata.valid & CTS_SATA_VALID_REVISION)
 				revision = cts.xport_specific.sata.revision;
 			xpt_free_path(dpath);
 		}
 	}
 	switch (softc->state) {
 	case PMP_STATE_PORTS:
 		/* Read register 2 of port 15; pmpdone() stores it as pm_ports. */
 		cam_fill_ataio(ataio,
 		      pmp_retry_count,
 		      pmpdone,
 		      /*flags*/CAM_DIR_NONE,
 		      0,
 		      /*data_ptr*/NULL,
 		      /*dxfer_len*/0,
 		      pmp_default_timeout * 1000);
 		ata_pm_read_cmd(ataio, 2, 15);
 		break;
 	case PMP_STATE_PRECONFIG:
 		/* Get/update host SATA capabilities. */
 		bzero(&cts, sizeof(cts));
 		xpt_setup_ccb(&cts.ccb_h, periph->path, CAM_PRIORITY_NONE);
 		cts.ccb_h.func_code = XPT_GET_TRAN_SETTINGS;
 		cts.type = CTS_TYPE_CURRENT_SETTINGS;
 		xpt_action((union ccb *)&cts);
 		if (cts.xport_specific.sata.valid & CTS_SATA_VALID_CAPS)
 			softc->caps = cts.xport_specific.sata.caps;
 		/* Write 0 to register 0x60 of port 15. */
 		cam_fill_ataio(ataio,
 		      pmp_retry_count,
 		      pmpdone,
 		      /*flags*/CAM_DIR_NONE,
 		      0,
 		      /*data_ptr*/NULL,
 		      /*dxfer_len*/0,
 		      pmp_default_timeout * 1000);
 		ata_pm_write_cmd(ataio, 0x60, 15, 0x0);
 		break;
 	case PMP_STATE_RESET:
 		/*
 		 * Write register 2 of the current port: the speed limit goes
 		 * into bits 7:4, and the low bit is set only for ports that
 		 * are not in 'found'.
 		 */
 		cam_fill_ataio(ataio,
 		      pmp_retry_count,
 		      pmpdone,
 		      /*flags*/CAM_DIR_NONE,
 		      0,
 		      /*data_ptr*/NULL,
 		      /*dxfer_len*/0,
 		      pmp_default_timeout * 1000);
 		ata_pm_write_cmd(ataio, 2, softc->pm_step,
 		    (revision << 4) |
 		    ((softc->found & (1 << softc->pm_step)) ? 0 : 1));
 		break;
 	case PMP_STATE_CONNECT:
 		/* Same register as RESET, but with the low bit cleared. */
 		cam_fill_ataio(ataio,
 		      pmp_retry_count,
 		      pmpdone,
 		      /*flags*/CAM_DIR_NONE,
 		      0,
 		      /*data_ptr*/NULL,
 		      /*dxfer_len*/0,
 		      pmp_default_timeout * 1000);
 		ata_pm_write_cmd(ataio, 2, softc->pm_step,
 		    (revision << 4));
 		break;
 	case PMP_STATE_CHECK:
 		/* Read register 0 of the current port; decoded in pmpdone(). */
 		cam_fill_ataio(ataio,
 		      pmp_retry_count,
 		      pmpdone,
 		      /*flags*/CAM_DIR_NONE,
 		      0,
 		      /*data_ptr*/NULL,
 		      /*dxfer_len*/0,
 		      pmp_default_timeout * 1000);
 		ata_pm_read_cmd(ataio, 0, softc->pm_step);
 		break;
 	case PMP_STATE_CLEAR:
 		/*
 		 * NOTE(review): 'reset' is zeroed on every CLEAR step, before
 		 * pmpdone()'s CONFIG case tests it -- confirm this is intended.
 		 */
 		softc->reset = 0;
 		/* Write all ones to register 1 of the current port. */
 		cam_fill_ataio(ataio,
 		      pmp_retry_count,
 		      pmpdone,
 		      /*flags*/CAM_DIR_NONE,
 		      0,
 		      /*data_ptr*/NULL,
 		      /*dxfer_len*/0,
 		      pmp_default_timeout * 1000);
 		ata_pm_write_cmd(ataio, 1, softc->pm_step, 0xFFFFFFFF);
 		break;
 	case PMP_STATE_CONFIG:
 		/*
 		 * Write register 0x60 of port 15: 0x07, plus 0x08 when the
 		 * host reported CTS_SATA_CAPS_H_AN.
 		 */
 		cam_fill_ataio(ataio,
 		      pmp_retry_count,
 		      pmpdone,
 		      /*flags*/CAM_DIR_NONE,
 		      0,
 		      /*data_ptr*/NULL,
 		      /*dxfer_len*/0,
 		      pmp_default_timeout * 1000);
 		ata_pm_write_cmd(ataio, 0x60, 15, 0x07 |
 		    ((softc->caps & CTS_SATA_CAPS_H_AN) ? 0x08 : 0));
 		break;
 	default:
 		/*
 		 * NOTE(review): for any other state the CCB is dispatched
 		 * below without having been filled in here -- confirm this
 		 * path is unreachable.
 		 */
 		break;
 	}
 	xpt_action(start_ccb);
 }
 
 /*
  * Completion handler for every command issued by pmpstart().
  *
  * periph   - the port multiplier peripheral.
  * done_ccb - the completed CCB; released here on every path that does not
  *            return early for a retry (ERESTART).
  *
  * On success the handler consumes the result for the current softc->state,
  * selects the next state and reschedules the periph at the same priority.
  * When the run finishes (CONFIG done) or a command fails, control reaches
  * the 'done' label, which returns the state machine to PMP_STATE_NORMAL,
  * releases the boot hold and frozen ports, and drops the periph reference
  * taken when the run was started.
  */
 static void
 pmpdone(struct cam_periph *periph, union ccb *done_ccb)
 {
 	struct ccb_trans_settings cts;
 	struct pmp_softc *softc;
 	struct ccb_ataio *ataio;
 	struct cam_path *dpath;
 	u_int32_t  priority, res;
 	int i;
 
 	softc = (struct pmp_softc *)periph->softc;
 	ataio = &done_ccb->ataio;
 
 	CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("pmpdone\n"));
 
 	/* Saved now because done_ccb is released before rescheduling. */
 	priority = done_ccb->ccb_h.pinfo.priority;
 
 	if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
 		if (cam_periph_error(done_ccb, 0, 0, NULL) == ERESTART) {
 			/* The command is being retried; nothing to do yet. */
 			return;
 		} else if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) {
 			cam_release_devq(done_ccb->ccb_h.path,
 			    /*relsim_flags*/0,
 			    /*reduction*/0,
 			    /*timeout*/0,
 			    /*getcount_only*/0);
 		}
 		/* Unrecoverable failure aborts the whole run. */
 		goto done;
 	}
 
 	/* An event arrived mid-run: rewind to PRECONFIG if already past it. */
 	if (softc->restart) {
 		softc->restart = 0;
 		xpt_release_ccb(done_ccb);
 		softc->state = min(softc->state, PMP_STATE_PRECONFIG);
 		xpt_schedule(periph, priority);
 		return;
 	}
 
 	switch (softc->state) {
 	case PMP_STATE_PORTS:
 		/* Reassemble the 32-bit register value from the result. */
 		softc->pm_ports = (ataio->res.lba_high << 24) +
 		    (ataio->res.lba_mid << 16) +
 		    (ataio->res.lba_low << 8) +
 		    ataio->res.sector_count;
 		if (pmp_hide_special) {
 			/*
 			 * This PMP declares 6 ports, while only 5 of them
 			 * are real. Port 5 is a SEMB port, probing which
 			 * causes timeouts if external SEP is not connected
 			 * to PMP over I2C.
 			 */
-			if (softc->pm_pid == 0x37261095 && softc->pm_ports == 6)
+			if ((softc->pm_pid == 0x37261095 ||
+			     softc->pm_pid == 0x38261095) &&
+			    softc->pm_ports == 6)
 				softc->pm_ports = 5;
 
 			/*
 			 * This PMP declares 7 ports, while only 5 of them
 			 * are real. Port 5 is a fake "Config  Disk" with
 			 * 640 sectors size. Port 6 is a SEMB port.
 			 */
 			if (softc->pm_pid == 0x47261095 && softc->pm_ports == 7)
 				softc->pm_ports = 5;
 
 			/*
 			 * These PMPs have extra configuration port.
 			 */
 			if (softc->pm_pid == 0x57231095 ||
 			    softc->pm_pid == 0x57331095 ||
 			    softc->pm_pid == 0x57341095 ||
 			    softc->pm_pid == 0x57441095)
 				softc->pm_ports--;
 		}
 		printf("%s%d: %d fan-out ports\n",
 		    periph->periph_name, periph->unit_number,
 		    softc->pm_ports);
 		softc->state = PMP_STATE_PRECONFIG;
 		xpt_release_ccb(done_ccb);
 		xpt_schedule(periph, priority);
 		return;
 	case PMP_STATE_PRECONFIG:
 		softc->pm_step = 0;
 		softc->state = PMP_STATE_RESET;
 		/* Every port not known to hold a device counts as reset. */
 		softc->reset |= ~softc->found;
 		xpt_release_ccb(done_ccb);
 		xpt_schedule(periph, priority);
 		return;
 	case PMP_STATE_RESET:
 		/* Advance to the next port; move on after the last one. */
 		softc->pm_step++;
 		if (softc->pm_step >= softc->pm_ports) {
 			softc->pm_step = 0;
 			/*
 			 * Delay the next command by holding our own queue
 			 * until the timeout expires.
 			 * NOTE(review): timeout unit presumably ms -- confirm.
 			 */
 			cam_freeze_devq(periph->path);
 			cam_release_devq(periph->path,
 			    RELSIM_RELEASE_AFTER_TIMEOUT,
 			    /*reduction*/0,
 			    /*timeout*/5,
 			    /*getcount_only*/0);
 			softc->state = PMP_STATE_CONNECT;
 		}
 		xpt_release_ccb(done_ccb);
 		xpt_schedule(periph, priority);
 		return;
 	case PMP_STATE_CONNECT:
 		softc->pm_step++;
 		if (softc->pm_step >= softc->pm_ports) {
 			softc->pm_step = 0;
 			softc->pm_try = 0;
 			/* Same delay technique as in the RESET case. */
 			cam_freeze_devq(periph->path);
 			cam_release_devq(periph->path,
 			    RELSIM_RELEASE_AFTER_TIMEOUT,
 			    /*reduction*/0,
 			    /*timeout*/10,
 			    /*getcount_only*/0);
 			softc->state = PMP_STATE_CHECK;
 		}
 		xpt_release_ccb(done_ccb);
 		xpt_schedule(periph, priority);
 		return;
 	case PMP_STATE_CHECK:
 		res = (ataio->res.lba_high << 24) +
 		    (ataio->res.lba_mid << 16) +
 		    (ataio->res.lba_low << 8) +
 		    ataio->res.sector_count;
 		/*
 		 * Accept the port when the status reads 0x1?3 with a non-zero
 		 * speed nibble (bits 7:4), or when either bit of 0x600 is
 		 * set.  Anything else is retried up to 10 times before the
 		 * port is declared empty.
 		 */
 		if (((res & 0xf0f) == 0x103 && (res & 0x0f0) != 0) ||
 		    (res & 0x600) != 0) {
 			if (bootverbose) {
 				printf("%s%d: port %d status: %08x\n",
 				    periph->periph_name, periph->unit_number,
 				    softc->pm_step, res);
 			}
 			/* Report device speed if it is online. */
 			if ((res & 0xf0f) == 0x103 &&
 			    xpt_create_path(&dpath, periph,
 			    xpt_path_path_id(periph->path),
 			    softc->pm_step, 0) == CAM_REQ_CMP) {
 				bzero(&cts, sizeof(cts));
 				xpt_setup_ccb(&cts.ccb_h, dpath, CAM_PRIORITY_NONE);
 				cts.ccb_h.func_code = XPT_SET_TRAN_SETTINGS;
 				cts.type = CTS_TYPE_CURRENT_SETTINGS;
 				cts.xport_specific.sata.revision = (res & 0x0f0) >> 4;
 				cts.xport_specific.sata.valid = CTS_SATA_VALID_REVISION;
 				/* Pass down only these three host caps. */
 				cts.xport_specific.sata.caps = softc->caps &
 				    (CTS_SATA_CAPS_H_PMREQ |
 				     CTS_SATA_CAPS_H_DMAAA |
 				     CTS_SATA_CAPS_H_AN);
 				cts.xport_specific.sata.valid |= CTS_SATA_VALID_CAPS;
 				xpt_action((union ccb *)&cts);
 				xpt_free_path(dpath);
 			}
 			softc->found |= (1 << softc->pm_step);
 			softc->pm_step++;
 		} else {
 			if (softc->pm_try < 10) {
 				/* Wait a little, then re-read the same port. */
 				cam_freeze_devq(periph->path);
 				cam_release_devq(periph->path,
 				    RELSIM_RELEASE_AFTER_TIMEOUT,
 				    /*reduction*/0,
 				    /*timeout*/10,
 				    /*getcount_only*/0);
 				softc->pm_try++;
 			} else {
 				if (bootverbose) {
 					printf("%s%d: port %d status: %08x\n",
 					    periph->periph_name, periph->unit_number,
 					    softc->pm_step, res);
 				}
 				/* Give up on this port and report it lost. */
 				softc->found &= ~(1 << softc->pm_step);
 				/*
 				 * NOTE(review): this is the only path in the
 				 * file built from done_ccb->ccb_h.path_id
 				 * instead of xpt_path_path_id(periph->path);
 				 * presumably equivalent -- confirm.
 				 */
 				if (xpt_create_path(&dpath, periph,
 				    done_ccb->ccb_h.path_id,
 				    softc->pm_step, 0) == CAM_REQ_CMP) {
 					xpt_async(AC_LOST_DEVICE, dpath, NULL);
 					xpt_free_path(dpath);
 				}
 				/*
 				 * NOTE(review): pm_try is not reset when
 				 * moving to the next port, so later ports do
 				 * not get their own 10 retries -- confirm
 				 * this is intended.
 				 */
 				softc->pm_step++;
 			}
 		}
 		if (softc->pm_step >= softc->pm_ports) {
 			/* Extra settle delay if any found port was reset. */
 			if (softc->reset & softc->found) {
 				cam_freeze_devq(periph->path);
 				cam_release_devq(periph->path,
 				    RELSIM_RELEASE_AFTER_TIMEOUT,
 				    /*reduction*/0,
 				    /*timeout*/1000,
 				    /*getcount_only*/0);
 			}
 			softc->state = PMP_STATE_CLEAR;
 			softc->pm_step = 0;
 		}
 		xpt_release_ccb(done_ccb);
 		xpt_schedule(periph, priority);
 		return;
 	case PMP_STATE_CLEAR:
 		softc->pm_step++;
 		if (softc->pm_step >= softc->pm_ports) {
 			softc->state = PMP_STATE_CONFIG;
 			softc->pm_step = 0;
 		}
 		xpt_release_ccb(done_ccb);
 		xpt_schedule(periph, priority);
 		return;
 	case PMP_STATE_CONFIG:
 		/* Final step: notify and/or rescan every port with a device. */
 		for (i = 0; i < softc->pm_ports; i++) {
 			union ccb *ccb;
 
 			if ((softc->found & (1 << i)) == 0)
 				continue;
 			if (xpt_create_path(&dpath, periph,
 			    xpt_path_path_id(periph->path),
 			    i, 0) != CAM_REQ_CMP) {
 				printf("pmpdone: xpt_create_path failed\n");
 				continue;
 			}
 			/* If we did hard reset to this device, inform XPT. */
 			if ((softc->reset & softc->found & (1 << i)) != 0)
 				xpt_async(AC_SENT_BDR, dpath, NULL);
 			/* If rescan requested, scan this device. */
 			if (softc->events & PMP_EV_RESCAN) {
 				ccb = xpt_alloc_ccb_nowait();
 				if (ccb == NULL) {
 					/* Remaining ports are not rescanned. */
 					xpt_free_path(dpath);
 					goto done;
 				}
 				/*
 				 * dpath is not freed on this branch.
 				 * NOTE(review): presumably xpt_rescan()
 				 * takes over the CCB and its path -- confirm.
 				 */
 				xpt_setup_ccb(&ccb->ccb_h, dpath, CAM_PRIORITY_XPT);
 				xpt_rescan(ccb);
 			} else
 				xpt_free_path(dpath);
 		}
 		break;
 	default:
 		break;
 	}
 done:
 	/* End of run (or failure): return to idle and undo all holds. */
 	xpt_release_ccb(done_ccb);
 	softc->state = PMP_STATE_NORMAL;
 	softc->events = 0;
 	xpt_release_boot();
 	pmprelease(periph, -1);
 	cam_periph_release_locked(periph);
 }
 
 #endif /* _KERNEL */
Index: user/attilio/vmobj-rwlock/sys/cam/scsi/scsi_da.c
===================================================================
--- user/attilio/vmobj-rwlock/sys/cam/scsi/scsi_da.c	(revision 247191)
+++ user/attilio/vmobj-rwlock/sys/cam/scsi/scsi_da.c	(revision 247192)
@@ -1,2915 +1,2919 @@
 /*-
  * Implementation of SCSI Direct Access Peripheral driver for CAM.
  *
  * Copyright (c) 1997 Justin T. Gibbs.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions, and the following disclaimer,
  *    without modification, immediately at the beginning of the file.
  * 2. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 
 #ifdef _KERNEL
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/bio.h>
 #include <sys/sysctl.h>
 #include <sys/taskqueue.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/conf.h>
 #include <sys/devicestat.h>
 #include <sys/eventhandler.h>
 #include <sys/malloc.h>
 #include <sys/cons.h>
 #include <geom/geom.h>
 #include <geom/geom_disk.h>
 #endif /* _KERNEL */
 
 #ifndef _KERNEL
 #include <stdio.h>
 #include <string.h>
 #endif /* _KERNEL */
 
 #include <cam/cam.h>
 #include <cam/cam_ccb.h>
 #include <cam/cam_periph.h>
 #include <cam/cam_xpt_periph.h>
 #include <cam/cam_sim.h>
 
 #include <cam/scsi/scsi_message.h>
 
 #ifndef _KERNEL 
 #include <cam/scsi/scsi_da.h>
 #endif /* !_KERNEL */
 
 #ifdef _KERNEL
 /*
  * Driver state, stored in da_softc.state.  daschedule() refuses to
  * schedule any work unless the state is DA_STATE_NORMAL.
  */
 typedef enum {
 	DA_STATE_PROBE,
 	DA_STATE_PROBE2,
 	DA_STATE_NORMAL
 } da_state;
 
 /*
  * Per-device status bits, stored in da_softc.flags.  Each value is a
  * distinct bit so they can be OR'ed together.  Uses visible in this file:
  *
  *  DA_FLAG_PACK_INVALID   - dastrategy() and dadump() fail with ENXIO while
  *                           it is set, daclose() skips the cache flush, and
  *                           daopen() clears it before reprobing.
  *  DA_FLAG_PACK_REMOVABLE - daopen()/daclose() issue PREVENT/ALLOW through
  *                           daprevent() only when it is set (and the
  *                           DA_Q_NO_PREVENT quirk is not).
  *  DA_FLAG_SAW_MEDIA      - set by daopen() after a successful open.
  *  DA_FLAG_OPEN           - set by daopen(), cleared by daclose() and by a
  *                           failed daopen().
  */
 typedef enum {
 	DA_FLAG_PACK_INVALID	= 0x001,
 	DA_FLAG_NEW_PACK	= 0x002,
 	DA_FLAG_PACK_LOCKED	= 0x004,
 	DA_FLAG_PACK_REMOVABLE	= 0x008,
 	DA_FLAG_SAW_MEDIA	= 0x010,
 	DA_FLAG_NEED_OTAG	= 0x020,
 	DA_FLAG_WENT_IDLE	= 0x040,
 	DA_FLAG_RETRY_UA	= 0x080,
 	DA_FLAG_OPEN		= 0x100,
 	DA_FLAG_SCTX_INIT	= 0x200,
 	DA_FLAG_CAN_RC16	= 0x400,
 	DA_FLAG_PROBED		= 0x800		
 } da_flags;
 
 /*
  * Device quirk bits, stored in da_softc.quirks and assigned per device
  * through da_quirk_table.  Values are distinct bits and may be OR'ed.
  *
  *  DA_Q_NO_SYNC_CACHE - daclose() and dadump() do not send SYNCHRONIZE
  *                       CACHE to the device.
  *  DA_Q_NO_6_BYTE     - used for devices that the quirk table describes as
  *                       mishandling 6 byte reads/writes.
  *  DA_Q_NO_PREVENT    - daopen()/daclose() do not call daprevent().
  *  DA_Q_4K            - used for the Advanced Format (4k) drives listed in
  *                       the quirk table.
  */
 typedef enum {
 	DA_Q_NONE		= 0x00,
 	DA_Q_NO_SYNC_CACHE	= 0x01,
 	DA_Q_NO_6_BYTE		= 0x02,
 	DA_Q_NO_PREVENT		= 0x04,
 	DA_Q_4K			= 0x08
 } da_quirks;
 
 /*
  * Tag stored in a CCB's ccb_state private field (see the ccb_state macro
  * below) to record why the CCB was issued.  The low nibble
  * (DA_CCB_TYPE_MASK) holds one of the type values 0x01-0x07;
  * DA_CCB_RETRY_UA is a separate bit outside that mask.  dadump() tags its
  * CCBs with DA_CCB_DUMP.
  */
 typedef enum {
 	DA_CCB_PROBE		= 0x01,
 	DA_CCB_PROBE2		= 0x02,
 	DA_CCB_BUFFER_IO	= 0x03,
 	DA_CCB_WAITING		= 0x04,
 	DA_CCB_DUMP		= 0x05,
 	DA_CCB_DELETE		= 0x06,
 	DA_CCB_TUR		= 0x07,
 	DA_CCB_TYPE_MASK	= 0x0F,
 	DA_CCB_RETRY_UA		= 0x10
 } da_ccb_state;
 
 /*
  * Methods available for servicing BIO_DELETE requests, stored in
  * da_softc.delete_method.  The enumerators take consecutive values
  * starting at 0, and DA_DELETE_MAX aliases the last real method.
  */
 typedef enum {
 	DA_DELETE_NONE,
 	DA_DELETE_DISABLE,
 	DA_DELETE_ZERO,
 	DA_DELETE_WS10,
 	DA_DELETE_WS16,
 	DA_DELETE_UNMAP,
 	DA_DELETE_MAX = DA_DELETE_UNMAP
 } da_delete_methods;
 
 /*
  * Printable names for da_delete_methods.  The strings are listed in the
  * same order as the enumerators, so the table has DA_DELETE_MAX + 1
  * entries; keep the two in sync when adding a method.
  */
 static const char *da_delete_method_names[] =
     { "NONE", "DISABLE", "ZERO", "WS10", "WS16", "UNMAP" };
 
 /* Offsets into our private area for storing information */
 #define ccb_state	ppriv_field0
 #define ccb_bp		ppriv_ptr1
 
 /*
  * Geometry and size parameters recorded for the disk (embedded in
  * da_softc as 'params').  dadump() divides byte offsets and lengths by
  * secsize to obtain block addresses and counts.
  */
 struct disk_params {
 	u_int8_t  heads;
 	u_int32_t cylinders;
 	u_int8_t  secs_per_track;
 	u_int32_t secsize;	/* Number of bytes/sector */
 	u_int64_t sectors;	/* total number sectors */
 	u_int     stripesize;
 	u_int     stripeoffset;
 };
 
 #define UNMAP_MAX_RANGES	512
 
 /*
  * Per-device driver state, reached through periph->softc.  Only the
  * fields whose use is visible in the surrounding code are described here:
  *
  *  bio_queue        - pending non-delete bios, queued by dastrategy().
  *  delete_queue     - pending BIO_DELETE bios of non-zero length, queued
  *                     by dastrategy().
  *  state            - see da_state; daschedule() is a no-op unless it is
  *                     DA_STATE_NORMAL.
  *  flags, quirks    - see da_flags and da_quirks.
  *  minimum_cmd_size - handed to scsi_read_write() by dadump().
  *  delete_running   - while non-zero, daschedule() ignores delete_queue
  *                     when deciding whether there is work to schedule.
  *  tur              - when non-zero, daschedule() schedules the periph.
  *  params           - see struct disk_params.
  *  disk             - the disk this periph backs; daopen() checks its
  *                     d_sectorsize/d_mediasize and daclose() updates its
  *                     d_devstat flags.
  *  unmap_buf        - UNMAP_MAX_RANGES * 16 + 8 bytes; presumably one
  *                     16-byte descriptor per range plus an 8-byte header
  *                     (TODO confirm against the UNMAP builder).
  */
 struct da_softc {
 	struct	 bio_queue_head bio_queue;
 	struct	 bio_queue_head delete_queue;
 	struct	 bio_queue_head delete_run_queue;
 	SLIST_ENTRY(da_softc) links;
 	LIST_HEAD(, ccb_hdr) pending_ccbs;
 	da_state state;
 	da_flags flags;	
 	da_quirks quirks;
 	int	 minimum_cmd_size;
 	int	 error_inject;
 	int	 ordered_tag_count;
 	int	 outstanding_cmds;
 	int	 unmap_max_ranges;
 	int	 unmap_max_lba;
 	int	 delete_running;
 	int	 tur;
 	da_delete_methods	 delete_method;
 	struct	 disk_params params;
 	struct	 disk *disk;
 	union	 ccb saved_ccb;
 	struct task		sysctl_task;
 	struct sysctl_ctx_list	sysctl_ctx;
 	struct sysctl_oid	*sysctl_tree;
 	struct callout		sendordered_c;
 	uint64_t wwpn;
 	uint8_t	 unmap_buf[UNMAP_MAX_RANGES * 16 + 8];
 	struct scsi_read_capacity_data_long rcaplong;
 	struct callout		mediapoll_c;
 };
 
 /*
  * One row of da_quirk_table: an inquiry pattern (device type, media
  * type, vendor, product and revision strings, where the table uses '*'
  * and '?' as wildcards) and the da_quirks bits to apply to devices that
  * match it.
  */
 struct da_quirk_entry {
 	struct scsi_inquiry_pattern inq_pat;
 	da_quirks quirks;
 };
 
 static const char quantum[] = "QUANTUM";
 static const char microp[] = "MICROP";
 
 static struct da_quirk_entry da_quirk_table[] =
 {
 	/* SPI, FC devices */
 	{
 		/*
 		 * Fujitsu M2513A MO drives.
 		 * Tested devices: M2513A2 firmware versions 1200 & 1300.
 		 * (dip switch selects whether T_DIRECT or T_OPTICAL device)
 		 * Reported by: W.Scholten <whs@xs4all.nl>
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "FUJITSU", "M2513A", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/* See above. */
 		{T_OPTICAL, SIP_MEDIA_REMOVABLE, "FUJITSU", "M2513A", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * This particular Fujitsu drive doesn't like the
 		 * synchronize cache command.
 		 * Reported by: Tom Jackson <toj@gorilla.net>
 		 */
 		{T_DIRECT, SIP_MEDIA_FIXED, "FUJITSU", "M2954*", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * This drive doesn't like the synchronize cache command
 		 * either.  Reported by: Matthew Jacob <mjacob@feral.com>
 		 * in NetBSD PR kern/6027, August 24, 1998.
 		 */
 		{T_DIRECT, SIP_MEDIA_FIXED, microp, "2217*", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * This drive doesn't like the synchronize cache command
 		 * either.  Reported by: Hellmuth Michaelis (hm@kts.org)
 		 * (PR 8882).
 		 */
 		{T_DIRECT, SIP_MEDIA_FIXED, microp, "2112*", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * Doesn't like the synchronize cache command.
 		 * Reported by: Blaz Zupan <blaz@gold.amis.net>
 		 */
 		{T_DIRECT, SIP_MEDIA_FIXED, "NEC", "D3847*", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * Doesn't like the synchronize cache command.
 		 * Reported by: Blaz Zupan <blaz@gold.amis.net>
 		 */
 		{T_DIRECT, SIP_MEDIA_FIXED, quantum, "MAVERICK 540S", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * Doesn't like the synchronize cache command.
 		 */
 		{T_DIRECT, SIP_MEDIA_FIXED, quantum, "LPS525S", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * Doesn't like the synchronize cache command.
 		 * Reported by: walter@pelissero.de
 		 */
 		{T_DIRECT, SIP_MEDIA_FIXED, quantum, "LPS540S", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * Doesn't work correctly with 6 byte reads/writes.
 		 * Returns illegal request, and points to byte 9 of the
 		 * 6-byte CDB.
 		 * Reported by:  Adam McDougall <bsdx@spawnet.com>
 		 */
 		{T_DIRECT, SIP_MEDIA_FIXED, quantum, "VIKING 4*", "*"},
 		/*quirks*/ DA_Q_NO_6_BYTE
 	},
 	{
 		/* See above. */
 		{T_DIRECT, SIP_MEDIA_FIXED, quantum, "VIKING 2*", "*"},
 		/*quirks*/ DA_Q_NO_6_BYTE
 	},
 	{
 		/*
 		 * Doesn't like the synchronize cache command.
 		 * Reported by: walter@pelissero.de
 		 */
 		{T_DIRECT, SIP_MEDIA_FIXED, "CONNER", "CP3500*", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * The CISS RAID controllers do not support SYNC_CACHE
 		 */
 		{T_DIRECT, SIP_MEDIA_FIXED, "COMPAQ", "RAID*", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	/* USB mass storage devices supported by umass(4) */
 	{
 		/*
 		 * EXATELECOM (Sigmatel) i-Bead 100/105 USB Flash MP3 Player
 		 * PR: kern/51675
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "EXATEL", "i-BEAD10*", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * Power Quotient Int. (PQI) USB flash key
 		 * PR: kern/53067
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "Generic*", "USB Flash Disk*",
 		"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
  	{
  		/*
  		 * Creative Nomad MUVO mp3 player (USB)
  		 * PR: kern/53094
  		 */
  		{T_DIRECT, SIP_MEDIA_REMOVABLE, "CREATIVE", "NOMAD_MUVO", "*"},
  		/*quirks*/ DA_Q_NO_SYNC_CACHE|DA_Q_NO_PREVENT
  	},
 	{
 		/*
 		 * Jungsoft NEXDISK USB flash key
 		 * PR: kern/54737
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "JUNGSOFT", "NEXDISK*", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * FreeDik USB Mini Data Drive
 		 * PR: kern/54786
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "FreeDik*", "Mini Data Drive",
 		"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * Sigmatel USB Flash MP3 Player
 		 * PR: kern/57046
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "SigmaTel", "MSCN", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE|DA_Q_NO_PREVENT
 	},
 	{
 		/*
 		 * Neuros USB Digital Audio Computer
 		 * PR: kern/63645
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "NEUROS", "dig. audio comp.",
 		"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * SEAGRAND NP-900 MP3 Player
 		 * PR: kern/64563
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "SEAGRAND", "NP-900*", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE|DA_Q_NO_PREVENT
 	},
 	{
 		/*
 		 * iRiver iFP MP3 player (with UMS Firmware)
 		 * PR: kern/54881, i386/63941, kern/66124
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "iRiver", "iFP*", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE
  	},
 	{
 		/*
 		 * Frontier Labs NEX IA+ Digital Audio Player, rev 1.10/0.01
 		 * PR: kern/70158
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "FL" , "Nex*", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * ZICPlay USB MP3 Player with FM
 		 * PR: kern/75057
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "ACTIONS*" , "USB DISK*", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * TEAC USB floppy mechanisms
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "TEAC" , "FD-05*", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * Kingston DataTraveler II+ USB Pen-Drive.
 		 * Reported by: Pawel Jakub Dawidek <pjd@FreeBSD.org>
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "Kingston" , "DataTraveler II+",
 		"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * USB DISK Pro PMAP
 		 * Reported by: jhs
 		 * PR: usb/96381
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, " ", "USB DISK Pro", "PMAP"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * Motorola E398 Mobile Phone (TransFlash memory card).
 		 * Reported by: Wojciech A. Koszek <dunstan@FreeBSD.czest.pl>
 		 * PR: usb/89889
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "Motorola" , "Motorola Phone",
 		"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * Qware BeatZkey! Pro
 		 * PR: usb/79164
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "GENERIC", "USB DISK DEVICE",
 		"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * Time DPA20B 1GB MP3 Player
 		 * PR: usb/81846
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "USB2.0*", "(FS) FLASH DISK*",
 		"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * Samsung USB key 128Mb
 		 * PR: usb/90081
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "USB-DISK", "FreeDik-FlashUsb",
 		"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * Kingston DataTraveler 2.0 USB Flash memory.
 		 * PR: usb/89196
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "Kingston", "DataTraveler 2.0",
 		"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * Creative MUVO Slim mp3 player (USB)
 		 * PR: usb/86131
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "CREATIVE", "MuVo Slim",
 		"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE|DA_Q_NO_PREVENT
 		},
 	{
 		/*
 		 * United MP5512 Portable MP3 Player (2-in-1 USB DISK/MP3)
 		 * PR: usb/80487
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "Generic*", "MUSIC DISK",
 		"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * SanDisk Micro Cruzer 128MB
 		 * PR: usb/75970
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "SanDisk" , "Micro Cruzer",
 		"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * TOSHIBA TransMemory USB sticks
 		 * PR: kern/94660
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "TOSHIBA", "TransMemory",
 		"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * PNY USB Flash keys
 		 * PR: usb/75578, usb/72344, usb/65436 
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "*" , "USB DISK*",
 		"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * Genesys 6-in-1 Card Reader
 		 * PR: usb/94647
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "Generic*", "STORAGE DEVICE*",
 		"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * Rekam Digital CAMERA
 		 * PR: usb/98713
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "CAMERA*", "4MP-9J6*",
 		"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * iRiver H10 MP3 player
 		 * PR: usb/102547
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "iriver", "H10*",
 		"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * iRiver U10 MP3 player
 		 * PR: usb/92306
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "iriver", "U10*",
 		"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * X-Micro Flash Disk
 		 * PR: usb/96901
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "X-Micro", "Flash Disk",
 		"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * EasyMP3 EM732X USB 2.0 Flash MP3 Player
 		 * PR: usb/96546
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "EM732X", "MP3 Player*",
 		"1.00"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * Denver MP3 player
 		 * PR: usb/107101
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "DENVER", "MP3 PLAYER",
 		 "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * Philips USB Key Audio KEY013
 		 * PR: usb/68412
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "PHILIPS", "Key*", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE | DA_Q_NO_PREVENT
 	},
 	{
 		/*
 		 * JNC MP3 Player
 		 * PR: usb/94439
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "JNC*" , "MP3 Player*",
 		 "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * SAMSUNG MP0402H
 		 * PR: usb/108427
 		 */
 		{T_DIRECT, SIP_MEDIA_FIXED, "SAMSUNG", "MP0402H", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * I/O Magic USB flash - Giga Bank
 		 * PR: usb/108810
 		 */
 		{T_DIRECT, SIP_MEDIA_FIXED, "GS-Magic", "stor*", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * JoyFly 128mb USB Flash Drive
 		 * PR: 96133
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "USB 2.0", "Flash Disk*",
 		 "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * ChipsBnk usb stick
 		 * PR: 103702
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "ChipsBnk", "USB*",
 		 "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * Storcase (Kingston) InfoStation IFS FC2/SATA-R 201A
 		 * PR: 129858
 		 */
 		{T_DIRECT, SIP_MEDIA_FIXED, "IFS", "FC2/SATA-R*",
 		 "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * Samsung YP-U3 mp3-player
 		 * PR: 125398
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "Samsung", "YP-U3",
 		 "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "Netac", "OnlyDisk*",
 		 "2000"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * Sony Cyber-Shot DSC cameras
 		 * PR: usb/137035
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "Sony", "Sony DSC", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE | DA_Q_NO_PREVENT
 	},
+	{
+		/*
+		 * Kingston DataTraveler G3, revision 1.00 only.
+		 * DA_Q_NO_PREVENT: daopen()/daclose() do not call
+		 * daprevent() for this device.
+		 * No PR reference is recorded for this entry.
+		 */
+		{T_DIRECT, SIP_MEDIA_REMOVABLE, "Kingston", "DataTraveler G3",
+		 "1.00"}, /*quirks*/ DA_Q_NO_PREVENT
+	},
 	/* ATA/SATA devices over SAS/USB/... */
 	{
 		/* Hitachi Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "Hitachi", "H??????????E3*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Samsung Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "SAMSUNG HD155UI*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Samsung Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "SAMSUNG", "HD155UI*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Samsung Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "SAMSUNG HD204UI*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Samsung Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "SAMSUNG", "HD204UI*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Seagate Barracuda Green Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST????DL*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Seagate Barracuda Green Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ST????DL", "*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Seagate Barracuda Green Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST???DM*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Seagate Barracuda Green Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ST???DM*", "*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Seagate Barracuda Green Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST????DM*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Seagate Barracuda Green Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ST????DM", "*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Seagate Momentus Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9500423AS*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Seagate Momentus Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ST950042", "3AS*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Seagate Momentus Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9500424AS*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Seagate Momentus Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ST950042", "4AS*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Seagate Momentus Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9640423AS*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Seagate Momentus Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ST964042", "3AS*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Seagate Momentus Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9640424AS*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Seagate Momentus Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ST964042", "4AS*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Seagate Momentus Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9750420AS*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Seagate Momentus Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ST975042", "0AS*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Seagate Momentus Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9750422AS*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Seagate Momentus Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ST975042", "2AS*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Seagate Momentus Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9750423AS*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Seagate Momentus Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ST975042", "3AS*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Seagate Momentus Thin Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST???LT*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* Seagate Momentus Thin Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ST???LT*", "*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* WDC Caviar Green Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD????RS*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* WDC Caviar Green Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "??RS*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* WDC Caviar Green Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD????RX*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* WDC Caviar Green Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "??RX*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* WDC Caviar Green Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD??????RS*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* WDC Caviar Green Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "????RS*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* WDC Caviar Green Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD??????RX*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* WDC Caviar Green Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "????RX*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* WDC Scorpio Black Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD???PKT*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* WDC Scorpio Black Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "?PKT*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* WDC Scorpio Black Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD?????PKT*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* WDC Scorpio Black Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "???PKT*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* WDC Scorpio Blue Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD???PVT*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* WDC Scorpio Blue Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "?PVT*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* WDC Scorpio Blue Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD?????PVT*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/* WDC Scorpio Blue Advanced Format (4k) drives */
 		{ T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "???PVT*", "*" },
 		/*quirks*/DA_Q_4K
 	},
 	{
 		/*
 		 * Olympus FE-210 camera
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "OLYMPUS", "FE210*",
 		"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * LG UP3S MP3 player
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "LG", "UP3S",
 		"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * Laser MP3-2GA13 MP3 player
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "USB 2.0", "(HS) Flash Disk",
 		"*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 	{
 		/*
 		 * LaCie external 250GB Hard drive des by Porsche
 		 * Submitted by: Ben Stuyts <ben@altesco.nl>
 		 * PR: 121474
 		 */
 		{T_DIRECT, SIP_MEDIA_FIXED, "SAMSUNG", "HM250JI", "*"},
 		/*quirks*/ DA_Q_NO_SYNC_CACHE
 	},
 };
 
 static	disk_strategy_t	dastrategy;
 static	dumper_t	dadump;
 static	periph_init_t	dainit;
 static	void		daasync(void *callback_arg, u_int32_t code,
 				struct cam_path *path, void *arg);
 static	void		dasysctlinit(void *context, int pending);
 static	int		dacmdsizesysctl(SYSCTL_HANDLER_ARGS);
 static	int		dadeletemethodsysctl(SYSCTL_HANDLER_ARGS);
 static	int		dadeletemethodset(struct da_softc *softc,
 					  da_delete_methods delete_method);
 static	periph_ctor_t	daregister;
 static	periph_dtor_t	dacleanup;
 static	periph_start_t	dastart;
 static	periph_oninv_t	daoninvalidate;
 static	void		dadone(struct cam_periph *periph,
 			       union ccb *done_ccb);
 static  int		daerror(union ccb *ccb, u_int32_t cam_flags,
 				u_int32_t sense_flags);
 static void		daprevent(struct cam_periph *periph, int action);
 static void		dareprobe(struct cam_periph *periph);
 static void		dasetgeom(struct cam_periph *periph, uint32_t block_len,
 				  uint64_t maxsector,
 				  struct scsi_read_capacity_data_long *rcaplong,
 				  size_t rcap_size);
 static timeout_t	dasendorderedtag;
 static void		dashutdown(void *arg, int howto);
 static timeout_t	damediapoll;
 
 #ifndef	DA_DEFAULT_POLL_PERIOD
 #define	DA_DEFAULT_POLL_PERIOD	3
 #endif
 
 #ifndef DA_DEFAULT_TIMEOUT
 #define DA_DEFAULT_TIMEOUT 60	/* Timeout in seconds */
 #endif
 
 #ifndef	DA_DEFAULT_RETRY
 #define	DA_DEFAULT_RETRY	4
 #endif
 
 #ifndef	DA_DEFAULT_SEND_ORDERED
 #define	DA_DEFAULT_SEND_ORDERED	1
 #endif
 
 
 static int da_poll_period = DA_DEFAULT_POLL_PERIOD;
 static int da_retry_count = DA_DEFAULT_RETRY;
 static int da_default_timeout = DA_DEFAULT_TIMEOUT;
 static int da_send_ordered = DA_DEFAULT_SEND_ORDERED;
 
 static SYSCTL_NODE(_kern_cam, OID_AUTO, da, CTLFLAG_RD, 0,
             "CAM Direct Access Disk driver");
 SYSCTL_INT(_kern_cam_da, OID_AUTO, poll_period, CTLFLAG_RW,
            &da_poll_period, 0, "Media polling period in seconds");
 TUNABLE_INT("kern.cam.da.poll_period", &da_poll_period);
 SYSCTL_INT(_kern_cam_da, OID_AUTO, retry_count, CTLFLAG_RW,
            &da_retry_count, 0, "Normal I/O retry count");
 TUNABLE_INT("kern.cam.da.retry_count", &da_retry_count);
 SYSCTL_INT(_kern_cam_da, OID_AUTO, default_timeout, CTLFLAG_RW,
            &da_default_timeout, 0, "Normal I/O timeout (in seconds)");
 TUNABLE_INT("kern.cam.da.default_timeout", &da_default_timeout);
 SYSCTL_INT(_kern_cam_da, OID_AUTO, send_ordered, CTLFLAG_RW,
            &da_send_ordered, 0, "Send Ordered Tags");
 TUNABLE_INT("kern.cam.da.send_ordered", &da_send_ordered);
 
 /*
  * DA_ORDEREDTAG_INTERVAL determines how often, relative
  * to the default timeout, we check to see whether an ordered
  * tagged transaction is appropriate to prevent simple tag
  * starvation.  Since we'd like to ensure that there is at least
  * 1/2 of the timeout length left for a starved transaction to
  * complete after we've sent an ordered tag, we must poll at least
  * four times in every timeout period.  This takes care of the worst
  * case where a starved transaction starts during an interval that
  * meets the requirement "don't send an ordered tag" test so it takes
  * us two intervals to determine that a tag must be sent.
  */
 #ifndef DA_ORDEREDTAG_INTERVAL
 #define DA_ORDEREDTAG_INTERVAL 4
 #endif
 
 static struct periph_driver dadriver =
 {
 	dainit, "da",
 	TAILQ_HEAD_INITIALIZER(dadriver.units), /* generation */ 0
 };
 
 PERIPHDRIVER_DECLARE(da, dadriver);
 
 static MALLOC_DEFINE(M_SCSIDA, "scsi_da", "scsi_da buffers");
 
 /*
  * Disk open entry point.
  *
  * Takes a reference on the periph, reprobes the device and waits for the
  * media size to be refreshed, then (for removable media without the
  * DA_Q_NO_PREVENT quirk) asks the device to prevent media removal.
  *
  * Returns 0 on success, in which case the periph reference taken here is
  * kept and DA_FLAG_OPEN and DA_FLAG_SAW_MEDIA are set.  Returns ENXIO if
  * there is no periph, the reference cannot be acquired, the periph has
  * been invalidated or the reprobe left a zero sector/media size; or the
  * error from cam_periph_hold()/msleep().  On any failure the reference is
  * dropped and DA_FLAG_OPEN is left clear.
  */
 static int
 daopen(struct disk *dp)
 {
 	struct cam_periph *periph;
 	struct da_softc *softc;
 	int error;
 
 	periph = (struct cam_periph *)dp->d_drv1;
 	if (periph == NULL) {
 		return (ENXIO);
 	}
 
 	if (cam_periph_acquire(periph) != CAM_REQ_CMP) {
 		return (ENXIO);
 	}
 
 	cam_periph_lock(periph);
 	if ((error = cam_periph_hold(periph, PRIBIO|PCATCH)) != 0) {
 		cam_periph_unlock(periph);
 		cam_periph_release(periph);
 		return (error);
 	}
 
 	softc = (struct da_softc *)periph->softc;
 	softc->flags |= DA_FLAG_OPEN;
 
 	CAM_DEBUG(periph->path, CAM_DEBUG_TRACE | CAM_DEBUG_PERIPH,
 	    ("daopen\n"));
 
 	/*
 	 * The reprobe below re-reads the capacity, so drop any stale
 	 * "pack invalid" indication left over from an earlier failure.
 	 */
 	softc->flags &= ~DA_FLAG_PACK_INVALID;
 
 	dareprobe(periph);
 
 	/*
 	 * Wait for the disk size update.  The sleep releases and reacquires
 	 * periph->sim->mtx; presumably the probe completion path wakes this
 	 * channel once d_mediasize has been refreshed.
 	 */
 	error = msleep(&softc->disk->d_mediasize, periph->sim->mtx, PRIBIO,
 	    "dareprobe", 0);
 	if (error != 0)
 		xpt_print(periph->path, "unable to retrieve capacity data\n");
 
 	if (periph->flags & CAM_PERIPH_INVALID ||
 	    softc->disk->d_sectorsize == 0 ||
 	    softc->disk->d_mediasize == 0)
 		error = ENXIO;
 
 	if (error == 0 && (softc->flags & DA_FLAG_PACK_REMOVABLE) != 0 &&
 	    (softc->quirks & DA_Q_NO_PREVENT) == 0)
 		daprevent(periph, PR_PREVENT);
 
 	/*
 	 * Update the softc flags while the periph lock is still held, so
 	 * the change cannot race with other paths that modify them.
 	 */
 	if (error == 0)
 		softc->flags |= DA_FLAG_SAW_MEDIA;
 	else
 		softc->flags &= ~DA_FLAG_OPEN;
 
 	cam_periph_unhold(periph);
 	cam_periph_unlock(periph);
 
 	/* A failed open must not keep the reference acquired above. */
 	if (error != 0)
 		cam_periph_release(periph);
 
 	return (error);
 }
 
 /*
  * Disk close entry point.
  *
  * Flushes the device write cache (unless the device has the
  * DA_Q_NO_SYNC_CACHE quirk or the pack is invalid), re-allows media
  * removal on removable devices, clears DA_FLAG_OPEN and drops the periph
  * reference.  Always returns 0.
  */
 static int
 daclose(struct disk *dp)
 {
 	struct cam_periph *periph = (struct cam_periph *)dp->d_drv1;
 	struct da_softc *softc;
 	union ccb *sync_ccb;
 
 	if (periph == NULL)
 		return (0);
 
 	cam_periph_lock(periph);
 	if (cam_periph_hold(periph, PRIBIO) != 0) {
 		/* Could not get exclusive use; just drop the reference. */
 		cam_periph_unlock(periph);
 		cam_periph_release(periph);
 		return (0);
 	}
 
 	softc = (struct da_softc *)periph->softc;
 
 	CAM_DEBUG(periph->path, CAM_DEBUG_TRACE | CAM_DEBUG_PERIPH,
 	    ("daclose\n"));
 
 	/* Push cached writes out to the media before the last close. */
 	if (!(softc->quirks & DA_Q_NO_SYNC_CACHE) &&
 	    !(softc->flags & DA_FLAG_PACK_INVALID)) {
 		sync_ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
 
 		/* begin_lba 0 with lb_count 0 covers the whole disk. */
 		scsi_synchronize_cache(&sync_ccb->csio, /*retries*/1,
 		    /*cbfcnp*/dadone, MSG_SIMPLE_Q_TAG, /*begin_lba*/0,
 		    /*lb_count*/0, SSD_FULL_SIZE, 5 * 60 * 1000);
 
 		cam_periph_runccb(sync_ccb, daerror, /*cam_flags*/0,
 		    /*sense_flags*/SF_RETRY_UA | SF_QUIET_IR,
 		    softc->disk->d_devstat);
 		xpt_release_ccb(sync_ccb);
 	}
 
 	if (softc->flags & DA_FLAG_PACK_REMOVABLE) {
 		if (!(softc->quirks & DA_Q_NO_PREVENT))
 			daprevent(periph, PR_ALLOW);
 		/*
 		 * The media can be swapped once the device is closed, and
 		 * new media may have a different block size, so mark the
 		 * block size as unavailable.
 		 */
 		softc->disk->d_devstat->flags |= DEVSTAT_BS_UNAVAILABLE;
 	}
 
 	softc->flags &= ~DA_FLAG_OPEN;
 	cam_periph_unhold(periph);
 	cam_periph_unlock(periph);
 	cam_periph_release(periph);
 	return (0);
 }
 
 static void
 daschedule(struct cam_periph *periph)
 {
 	struct da_softc *softc = (struct da_softc *)periph->softc;
 	uint32_t prio;
 
 	if (softc->state != DA_STATE_NORMAL)
 		return;
 
 	/* Check if cam_periph_getccb() was called. */
 	prio = periph->immediate_priority;
 
 	/* Check if we have more work to do. */
 	if (bioq_first(&softc->bio_queue) ||
 	    (!softc->delete_running && bioq_first(&softc->delete_queue)) ||
 	    softc->tur) {
 		prio = CAM_PRIORITY_NORMAL;
 	}
 
 	/* Schedule CCB if any of above is true. */
 	if (prio != CAM_PRIORITY_NONE)
 		xpt_schedule(periph, prio);
 }
 
 /*
  * Disk strategy entry point: accept a bio and queue it for the device.
  *
  * The request is failed with ENXIO if there is no periph or the pack has
  * been marked invalid.  BIO_DELETE requests go on the delete queue (a
  * zero-length delete is completed immediately); everything else goes on
  * the regular bio queue.  The periph is then scheduled so the queued work
  * gets started.
  */
 static void
 dastrategy(struct bio *bp)
 {
 	struct cam_periph *periph = (struct cam_periph *)bp->bio_disk->d_drv1;
 	struct da_softc *softc;
 
 	if (periph == NULL) {
 		biofinish(bp, NULL, ENXIO);
 		return;
 	}
 	softc = (struct da_softc *)periph->softc;
 
 	cam_periph_lock(periph);
 
 	/* Refuse new I/O once the device has been made invalid. */
 	if ((softc->flags & DA_FLAG_PACK_INVALID) != 0) {
 		cam_periph_unlock(periph);
 		biofinish(bp, NULL, ENXIO);
 		return;
 	}
 
 	CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("dastrategy(%p)\n", bp));
 
 	/* Sort the request into the queue that matches its command. */
 	if (bp->bio_cmd != BIO_DELETE)
 		bioq_disksort(&softc->bio_queue, bp);
 	else if (bp->bio_bcount != 0)
 		bioq_disksort(&softc->delete_queue, bp);
 	else
 		biodone(bp);
 
 	/* Kick the scheduler so the queued work is picked up. */
 	daschedule(periph);
 	cam_periph_unlock(periph);
 }
 
 /*
  * Kernel dump entry point.
  *
  * arg is the struct disk being dumped to.  When length is non-zero,
  * 'length' bytes at 'virtual' are written to byte offset 'offset' on the
  * device; both offset and length are converted to blocks using the
  * recorded sector size.  When length is zero the call is treated as the
  * final flush and the device cache is synchronized (unless the device has
  * the DA_Q_NO_SYNC_CACHE quirk).  'physical' is not used.
  *
  * Commands are built in a CCB on the stack and run to completion with
  * xpt_polled_action().  Returns 0 on success, ENXIO if there is no periph
  * or the pack is invalid, or the error reported by cam_periph_error().
  */
 static int
 dadump(void *arg, void *virtual, vm_offset_t physical, off_t offset, size_t length)
 {
 	struct	    cam_periph *periph;
 	struct	    da_softc *softc;
 	u_int	    secsize;
 	struct	    ccb_scsiio csio;
 	struct	    disk *dp;
 	int	    error = 0;
 
 	dp = arg;
 	periph = dp->d_drv1;
 	if (periph == NULL)
 		return (ENXIO);
 	softc = (struct da_softc *)periph->softc;
 	cam_periph_lock(periph);
 	secsize = softc->params.secsize;
 	
 	if ((softc->flags & DA_FLAG_PACK_INVALID) != 0) {
 		cam_periph_unlock(periph);
 		return (ENXIO);
 	}
 
 	/* Data phase: write one chunk of the dump and return. */
 	if (length > 0) {
 		xpt_setup_ccb(&csio.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
 		csio.ccb_h.ccb_state = DA_CCB_DUMP;
 		scsi_read_write(&csio,
 				/*retries*/0,
 				dadone,
 				MSG_ORDERED_Q_TAG,
 				/*read*/FALSE,
 				/*byte2*/0,
 				/*minimum_cmd_size*/ softc->minimum_cmd_size,
 				offset / secsize,
 				length / secsize,
 				/*data_ptr*/(u_int8_t *) virtual,
 				/*dxfer_len*/length,
 				/*sense_len*/SSD_FULL_SIZE,
 				da_default_timeout * 1000);
 		xpt_polled_action((union ccb *)&csio);
 
 		/* No recovery and no retries are requested for dump I/O. */
 		error = cam_periph_error((union ccb *)&csio,
 		    0, SF_NO_RECOVERY | SF_NO_RETRY, NULL);
 		/* Release the device queue if the command left it frozen. */
 		if ((csio.ccb_h.status & CAM_DEV_QFRZN) != 0)
 			cam_release_devq(csio.ccb_h.path, /*relsim_flags*/0,
 			    /*reduction*/0, /*timeout*/0, /*getcount_only*/0);
 		if (error != 0)
 			printf("Aborting dump due to I/O error.\n");
 		cam_periph_unlock(periph);
 		return (error);
 	}
 		
 	/*
 	 * Sync the disk cache contents to the physical media.
 	 */
 	if ((softc->quirks & DA_Q_NO_SYNC_CACHE) == 0) {
 
 		xpt_setup_ccb(&csio.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
 		csio.ccb_h.ccb_state = DA_CCB_DUMP;
 		scsi_synchronize_cache(&csio,
 				       /*retries*/0,
 				       /*cbfcnp*/dadone,
 				       MSG_SIMPLE_Q_TAG,
 				       /*begin_lba*/0,/* Cover the whole disk */
 				       /*lb_count*/0,
 				       SSD_FULL_SIZE,
 				       5 * 60 * 1000);
 		xpt_polled_action((union ccb *)&csio);
 
 		error = cam_periph_error((union ccb *)&csio,
 		    0, SF_NO_RECOVERY | SF_NO_RETRY | SF_QUIET_IR, NULL);
 		/* Release the device queue if the command left it frozen. */
 		if ((csio.ccb_h.status & CAM_DEV_QFRZN) != 0)
 			cam_release_devq(csio.ccb_h.path, /*relsim_flags*/0,
 			    /*reduction*/0, /*timeout*/0, /*getcount_only*/0);
 		if (error != 0)
 			xpt_print(periph->path, "Synchronize cache failed\n");
 	}
 	cam_periph_unlock(periph);
 	return (error);
 }
 
 static int
 dagetattr(struct bio *bp)
 {
 	int ret;
 	struct cam_periph *periph;
 
 	periph = (struct cam_periph *)bp->bio_disk->d_drv1;
 	if (periph == NULL)
 		return (ENXIO);
 
 	cam_periph_lock(periph);
 	ret = xpt_getattr(bp->bio_data, bp->bio_length, bp->bio_attribute,
 	    periph->path);
 	cam_periph_unlock(periph);
 	if (ret == 0)
 		bp->bio_completed = bp->bio_length;
 	return ret;
 }
 
 /*
  * Driver initialisation: registers daasync() as a global AC_FOUND_DEVICE
  * callback and, when da_send_ordered is set, hooks dashutdown() onto the
  * shutdown_post_sync event.  Failures are only reported via printf().
  */
 static void
 dainit(void)
 {
 	cam_status status;
 
 	/*
 	 * The global async callback is how we hear about events such as
 	 * "new device found".
 	 */
 	status = xpt_register_async(AC_FOUND_DEVICE, daasync, NULL, NULL);
 	if (status != CAM_REQ_CMP) {
 		printf("da: Failed to attach master async callback "
 		       "due to status 0x%x!\n", status);
 		return;
 	}
 
 	if (!da_send_ordered)
 		return;
 
 	/* Register our shutdown event handler */
 	if (EVENTHANDLER_REGISTER(shutdown_post_sync, dashutdown,
 	    NULL, SHUTDOWN_PRI_DEFAULT) == NULL)
 		printf("dainit: shutdown event registration failed!\n");
 }
 
 /*
  * d_gone callback, invoked by GEOM once it has finished cleaning up its
  * resources for this disk.  Drops one reference on the periph stored in
  * the disk's d_drv1 field.
  */
 static void
 dadiskgonecb(struct disk *dp)
 {
 
 	cam_periph_release((struct cam_periph *)dp->d_drv1);
 }
 
 /*
  * Invalidation hook for the periph (passed to cam_periph_alloc() by
  * daasync()).  Stops async notifications, marks the pack invalid, fails
  * everything still sitting in the driver's bio queues with ENXIO and tells
  * GEOM the disk is gone.
  */
 static void
 daoninvalidate(struct cam_periph *periph)
 {
 	struct da_softc *sc = (struct da_softc *)periph->softc;
 
 	/* An event mask of 0 de-registers our per-periph async callback. */
 	xpt_register_async(0, daasync, periph, periph->path);
 
 	sc->flags |= DA_FLAG_PACK_INVALID;
 
 	/*
 	 * Fail all queued I/O (regular and delete requests) with ENXIO.
 	 * XXX Transactions already queued to the card are not aborted;
 	 *     that would need XPT_ABORT_CCB.
 	 */
 	bioq_flush(&sc->bio_queue, NULL, ENXIO);
 	bioq_flush(&sc->delete_queue, NULL, ENXIO);
 
 	/*
 	 * Notify GEOM; it calls us back when it has finished cleaning up
 	 * its resources.
 	 */
 	disk_gone(sc->disk);
 
 	xpt_print(periph->path, "lost device - %d outstanding, %d refs\n",
 	    sc->outstanding_cmds, periph->refcount);
 }
 
 /*
  * Destructor hook for the periph (passed to cam_periph_alloc() by
  * daasync()).  Tears down the sysctl context, both callouts and the disk,
  * then frees the softc.  The periph lock is released for the duration of
  * the teardown and re-taken before returning.
  */
 static void
 dacleanup(struct cam_periph *periph)
 {
 	struct da_softc *sc = (struct da_softc *)periph->softc;
 
 	xpt_print(periph->path, "removing device entry\n");
 	cam_periph_unlock(periph);
 
 	/*
 	 * Only free the sysctl context if dasysctlinit() got far enough to
 	 * create it; a failure to free it is reported but otherwise ignored.
 	 */
 	if ((sc->flags & DA_FLAG_SCTX_INIT) != 0) {
 		if (sysctl_ctx_free(&sc->sysctl_ctx) != 0)
 			xpt_print(periph->path,
 			    "can't remove sysctl context\n");
 	}
 
 	callout_drain(&sc->mediapoll_c);
 	disk_destroy(sc->disk);
 	callout_drain(&sc->sendordered_c);
 	free(sc, M_DEVBUF);
 
 	cam_periph_lock(periph);
 }
 
 /*
  * Async event callback.  dainit() registers it globally (with a NULL
  * callback_arg) for AC_FOUND_DEVICE; daregister() registers it per periph
  * (callback_arg is the periph) for the remaining events handled here.
  *
  *   callback_arg - the periph, or NULL for the global registration
  *   code         - the AC_* event being delivered
  *   path         - path the event refers to
  *   arg          - event specific payload (getdev CCB, buffer type, CCB, ...)
  *
  * Every event that leaves the switch via 'break' is forwarded exactly once
  * to cam_periph_async(); a processed AC_FOUND_DEVICE returns directly.
  */
 static void
 daasync(void *callback_arg, u_int32_t code,
 	struct cam_path *path, void *arg)
 {
 	struct cam_periph *periph;
 	struct da_softc *softc;
 
 	periph = (struct cam_periph *)callback_arg;
 	switch (code) {
 	case AC_FOUND_DEVICE:
 	{
 		struct ccb_getdev *cgd;
 		cam_status status;
 
 		cgd = (struct ccb_getdev *)arg;
 		if (cgd == NULL)
 			break;
 
 		if (cgd->protocol != PROTO_SCSI)
 			break;
 
 		/* Only direct-access, RBC and optical devices are ours. */
 		if (SID_TYPE(&cgd->inq_data) != T_DIRECT
 		    && SID_TYPE(&cgd->inq_data) != T_RBC
 		    && SID_TYPE(&cgd->inq_data) != T_OPTICAL)
 			break;
 
 		/*
 		 * Allocate a peripheral instance for
 		 * this device and start the probe
 		 * process.
 		 */
 		status = cam_periph_alloc(daregister, daoninvalidate,
 					  dacleanup, dastart,
 					  "da", CAM_PERIPH_BIO,
 					  cgd->ccb_h.path, daasync,
 					  AC_FOUND_DEVICE, cgd);
 
 		if (status != CAM_REQ_CMP
 		 && status != CAM_REQ_INPROG)
 			printf("daasync: Unable to attach to new device "
 				"due to status 0x%x\n", status);
 		return;
 	}
 	case AC_ADVINFO_CHANGED:
 	{
 		uintptr_t buftype;
 
 		/* Propagate physical path changes to GEOM consumers. */
 		buftype = (uintptr_t)arg;
 		if (buftype == CDAI_TYPE_PHYS_PATH) {
 			softc = periph->softc;
 			disk_attr_changed(softc->disk, "GEOM::physpath",
 					  M_NOWAIT);
 		}
 		break;
 	}
 	case AC_UNIT_ATTENTION:
 	{
 		union ccb *ccb;
 		int error_code, sense_key, asc, ascq;
 
 		softc = (struct da_softc *)periph->softc;
 		ccb = (union ccb *)arg;
 
 		/*
 		 * Handle all UNIT ATTENTIONs except our own,
 		 * as they will be handled by daerror().
 		 */
 		if (xpt_path_periph(ccb->ccb_h.path) != periph &&
 		    scsi_extract_sense_ccb(ccb,
 		     &error_code, &sense_key, &asc, &ascq)) {
 			if (asc == 0x2A && ascq == 0x09) {
 				xpt_print(ccb->ccb_h.path,
 				    "capacity data has changed\n");
 				dareprobe(periph);
 			} else if (asc == 0x28 && ascq == 0x00)
 				disk_media_changed(softc->disk, M_NOWAIT);
 		}
 		/*
 		 * The event is forwarded to cam_periph_async() by the
 		 * common tail below; do not forward it a second time here.
 		 */
 		break;
 	}
 	case AC_SCSI_AEN:
 		softc = (struct da_softc *)periph->softc;
 		/*
 		 * Request a TEST UNIT READY unless one is already pending;
 		 * the periph reference taken here is tied to softc->tur.
 		 */
 		if (!softc->tur) {
 			if (cam_periph_acquire(periph) == CAM_REQ_CMP) {
 				softc->tur = 1;
 				daschedule(periph);
 			}
 		}
 		/* FALLTHROUGH */
 	case AC_SENT_BDR:
 	case AC_BUS_RESET:
 	{
 		struct ccb_hdr *ccbh;
 
 		softc = (struct da_softc *)periph->softc;
 		/*
 		 * Don't fail on the expected unit attention
 		 * that will occur.
 		 */
 		softc->flags |= DA_FLAG_RETRY_UA;
 		LIST_FOREACH(ccbh, &softc->pending_ccbs, periph_links.le)
 			ccbh->ccb_state |= DA_CCB_RETRY_UA;
 		break;
 	}
 	default:
 		break;
 	}
 	cam_periph_async(periph, code, path, arg);
 }
 
 /*
  * Task handler (set up with TASK_INIT in daregister()) that creates the
  * per-unit sysctl tree under kern.cam.da and populates it with the
  * delete_method, minimum_cmd_size and error_inject knobs, plus a read-only
  * wwpn leaf when the transport is Fibre Channel and reports a valid WWPN.
  *
  * 'context' is the periph; 'pending' is not used.  The periph reference
  * held on behalf of this task is released on every exit path.
  */
 static void
 dasysctlinit(void *context, int pending)
 {
 	struct ccb_trans_settings cts;
 	struct ccb_trans_settings_fc *fc;
 	struct cam_periph *periph;
 	struct da_softc *sc;
 	char descr[80], nodename[80];
 
 	periph = (struct cam_periph *)context;
 
 	/* periph was held for us when this task was enqueued. */
 	if (periph->flags & CAM_PERIPH_INVALID)
 		goto done;
 
 	sc = (struct da_softc *)periph->softc;
 	snprintf(descr, sizeof(descr), "CAM DA unit %d", periph->unit_number);
 	snprintf(nodename, sizeof(nodename), "%d", periph->unit_number);
 
 	sysctl_ctx_init(&sc->sysctl_ctx);
 	sc->flags |= DA_FLAG_SCTX_INIT;
 	sc->sysctl_tree = SYSCTL_ADD_NODE(&sc->sysctl_ctx,
 	    SYSCTL_STATIC_CHILDREN(_kern_cam_da), OID_AUTO, nodename,
 	    CTLFLAG_RD, 0, descr);
 	if (sc->sysctl_tree == NULL) {
 		printf("dasysctlinit: unable to allocate sysctl tree\n");
 		goto done;
 	}
 
 	/*
 	 * Handler-backed knobs, so the user can change the values on the
 	 * fly.
 	 */
 	SYSCTL_ADD_PROC(&sc->sysctl_ctx, SYSCTL_CHILDREN(sc->sysctl_tree),
 	    OID_AUTO, "delete_method", CTLTYPE_STRING | CTLFLAG_RW,
 	    sc, 0, dadeletemethodsysctl, "A",
 	    "BIO_DELETE execution method");
 	SYSCTL_ADD_PROC(&sc->sysctl_ctx, SYSCTL_CHILDREN(sc->sysctl_tree),
 	    OID_AUTO, "minimum_cmd_size", CTLTYPE_INT | CTLFLAG_RW,
 	    &sc->minimum_cmd_size, 0, dacmdsizesysctl, "I",
 	    "Minimum CDB size");
 
 	SYSCTL_ADD_INT(&sc->sysctl_ctx, SYSCTL_CHILDREN(sc->sysctl_tree),
 	    OID_AUTO, "error_inject", CTLFLAG_RW, &sc->error_inject, 0,
 	    "error_inject leaf");
 
 	/*
 	 * Add some addressing info: query the current transport settings.
 	 */
 	memset(&cts, 0, sizeof (cts));
 	xpt_setup_ccb(&cts.ccb_h, periph->path, CAM_PRIORITY_NONE);
 	cts.ccb_h.func_code = XPT_GET_TRAN_SETTINGS;
 	cts.type = CTS_TYPE_CURRENT_SETTINGS;
 	cam_periph_lock(periph);
 	xpt_action((union ccb *)&cts);
 	cam_periph_unlock(periph);
 	if (cts.ccb_h.status != CAM_REQ_CMP)
 		goto done;
 
 	if (cts.protocol != PROTO_SCSI || cts.transport != XPORT_FC)
 		goto done;
 
 	fc = &cts.xport_specific.fc;
 	if (fc->valid & CTS_FC_VALID_WWPN) {
 		sc->wwpn = fc->wwpn;
 		SYSCTL_ADD_UQUAD(&sc->sysctl_ctx,
 		    SYSCTL_CHILDREN(sc->sysctl_tree),
 		    OID_AUTO, "wwpn", CTLFLAG_RD,
 		    &sc->wwpn, "World Wide Port Name");
 	}
 
 done:
 	cam_periph_release(periph);
 }
 
 static int
 dacmdsizesysctl(SYSCTL_HANDLER_ARGS)
 {
 	int error, value;
 
 	value = *(int *)arg1;
 
 	error = sysctl_handle_int(oidp, &value, 0, req);
 
 	if ((error != 0)
 	 || (req->newptr == NULL))
 		return (error);
 
 	/*
 	 * Acceptable values here are 6, 10, 12 or 16.
 	 */
 	if (value < 6)
 		value = 6;
 	else if ((value > 6)
 	      && (value <= 10))
 		value = 10;
 	else if ((value > 10)
 	      && (value <= 12))
 		value = 12;
 	else if (value > 12)
 		value = 16;
 
 	*(int *)arg1 = value;
 
 	return (0);
 }
 
 /*
  * Select the BIO_DELETE method for the device and keep the disk's
  * DISKFLAG_CANDELETE flag in step with it: the flag is set for any method
  * above DA_DELETE_DISABLE and cleared otherwise.
  *
  * Returns EINVAL when delete_method lies outside [0, DA_DELETE_MAX],
  * otherwise 0.
  */
 static int
 dadeletemethodset(struct da_softc *softc, da_delete_methods delete_method)
 {
 
 	if (delete_method < 0 || delete_method > DA_DELETE_MAX)
 		return (EINVAL);
 
 	softc->delete_method = delete_method;
 
 	if (softc->delete_method <= DA_DELETE_DISABLE)
 		softc->disk->d_flags &= ~DISKFLAG_CANDELETE;
 	else
 		softc->disk->d_flags |= DISKFLAG_CANDELETE;
 
 	return (0);
 }
 
 /*
  * Sysctl handler for the delete_method knob.  arg1 is the softc.
  *
  * On read, exports the name of the current delete method ("UNKNOWN" if the
  * stored value is out of range).  On write, looks the supplied string up in
  * da_delete_method_names[] and applies the match via dadeletemethodset().
  *
  * Returns the error from sysctl_handle_string(), the result of
  * dadeletemethodset(), or EINVAL when the name is not recognised.
  */
 static int
 dadeletemethodsysctl(SYSCTL_HANDLER_ARGS)
 {
 	char buf[16];
 	const char *p;
 	struct da_softc *softc;
 	int i, error, value;
 
 	softc = (struct da_softc *)arg1;
 
 	value = softc->delete_method;
 	if (value < 0 || value > DA_DELETE_MAX)
 		p = "UNKNOWN";
 	else
 		p = da_delete_method_names[value];
 	/*
 	 * strlcpy() always NUL-terminates, so buf is a valid C string for
 	 * sysctl_handle_string() and strcmp() whatever the name length.
 	 */
 	strlcpy(buf, p, sizeof(buf));
 	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	for (i = 0; i <= DA_DELETE_MAX; i++) {
 		if (strcmp(buf, da_delete_method_names[i]) == 0)
 			return (dadeletemethodset(softc, i));
 	}
 	return (EINVAL);
 }
 
 /*
  * Peripheral registration routine (handed to cam_periph_alloc() by
  * daasync()).  'arg' is the getdev CCB describing the new device.
  *
  * Allocates and initialises the softc, applies device and SIM quirks,
  * settles the minimum CDB size (default, tunable, then rounding), creates
  * and registers the GEOM disk, subscribes to async events, arms the
  * ordered-tag and media-poll callouts and schedules the capacity probe.
  *
  * Returns CAM_REQ_CMP on success, or CAM_REQ_CMP_ERR when 'arg' is NULL,
  * the softc cannot be allocated, or the periph reference for GEOM cannot
  * be acquired.
  */
 static cam_status
 daregister(struct cam_periph *periph, void *arg)
 {
 	struct da_softc *softc;
 	struct ccb_pathinq cpi;
 	struct ccb_getdev *cgd;
 	char tmpstr[80];
 	caddr_t match;
 
 	cgd = (struct ccb_getdev *)arg;
 	if (cgd == NULL) {
 		printf("daregister: no getdev CCB, can't register device\n");
 		return(CAM_REQ_CMP_ERR);
 	}
 
 	softc = (struct da_softc *)malloc(sizeof(*softc), M_DEVBUF,
 	    M_NOWAIT|M_ZERO);
 
 	if (softc == NULL) {
 		printf("daregister: Unable to probe new device. "
 		       "Unable to allocate softc\n");				
 		return(CAM_REQ_CMP_ERR);
 	}
 
 	LIST_INIT(&softc->pending_ccbs);
 	softc->state = DA_STATE_PROBE;
 	bioq_init(&softc->bio_queue);
 	bioq_init(&softc->delete_queue);
 	bioq_init(&softc->delete_run_queue);
 	if (SID_IS_REMOVABLE(&cgd->inq_data))
 		softc->flags |= DA_FLAG_PACK_REMOVABLE;
 	/* Default limits used by dastart() when batching UNMAP requests. */
 	softc->unmap_max_ranges = UNMAP_MAX_RANGES;
 	softc->unmap_max_lba = 1024*1024*2;
 
 	periph->softc = softc;
 
 	/*
 	 * See if this device has any quirks.
 	 */
 	match = cam_quirkmatch((caddr_t)&cgd->inq_data,
 			       (caddr_t)da_quirk_table,
 			       sizeof(da_quirk_table)/sizeof(*da_quirk_table),
 			       sizeof(*da_quirk_table), scsi_inquiry_match);
 
 	if (match != NULL)
 		softc->quirks = ((struct da_quirk_entry *)match)->quirks;
 	else
 		softc->quirks = DA_Q_NONE;
 
 	/* Check if the SIM does not want 6 byte commands */
 	bzero(&cpi, sizeof(cpi));
 	xpt_setup_ccb(&cpi.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
 	cpi.ccb_h.func_code = XPT_PATH_INQ;
 	xpt_action((union ccb *)&cpi);
 	if (cpi.ccb_h.status == CAM_REQ_CMP && (cpi.hba_misc & PIM_NO_6_BYTE))
 		softc->quirks |= DA_Q_NO_6_BYTE;
 
 	/* Prepare (but do not enqueue) the task that builds the sysctl tree. */
 	TASK_INIT(&softc->sysctl_task, 0, dasysctlinit, periph);
 
 	/*
 	 * Take an exclusive refcount on the periph while dastart is called
 	 * to finish the probe.  The reference will be dropped in dadone at
 	 * the end of probe.
 	 */
 	(void)cam_periph_hold(periph, PRIBIO);
 
 	/*
 	 * Schedule a periodic event to occasionally send an
 	 * ordered tag to a device.
 	 */
 	callout_init_mtx(&softc->sendordered_c, periph->sim->mtx, 0);
 	callout_reset(&softc->sendordered_c,
 	    (da_default_timeout * hz) / DA_ORDEREDTAG_INTERVAL,
 	    dasendorderedtag, softc);
 
 	/*
 	 * The SIM mutex is dropped from here until just after disk_create()
 	 * (or until the error return below); it is re-taken on both paths.
 	 * Presumably the caller holds it on entry -- TODO confirm.
 	 */
 	mtx_unlock(periph->sim->mtx);
 	/*
 	 * RBC devices don't have to support READ(6), only READ(10).
 	 */
 	if (softc->quirks & DA_Q_NO_6_BYTE || SID_TYPE(&cgd->inq_data) == T_RBC)
 		softc->minimum_cmd_size = 10;
 	else
 		softc->minimum_cmd_size = 6;
 
 	/*
 	 * Load the user's default, if any.
 	 */
 	snprintf(tmpstr, sizeof(tmpstr), "kern.cam.da.%d.minimum_cmd_size",
 		 periph->unit_number);
 	TUNABLE_INT_FETCH(tmpstr, &softc->minimum_cmd_size);
 
 	/*
 	 * 6, 10, 12 and 16 are the currently permissible values.
 	 * Anything else is rounded up to the next permitted size
 	 * (same rule as dacmdsizesysctl()).
 	 */
 	if (softc->minimum_cmd_size < 6)
 		softc->minimum_cmd_size = 6;
 	else if ((softc->minimum_cmd_size > 6)
 	      && (softc->minimum_cmd_size <= 10))
 		softc->minimum_cmd_size = 10;
 	else if ((softc->minimum_cmd_size > 10)
 	      && (softc->minimum_cmd_size <= 12))
 		softc->minimum_cmd_size = 12;
 	else if (softc->minimum_cmd_size > 12)
 		softc->minimum_cmd_size = 16;
 
 	/*
 	 * Predict whether device may support READ CAPACITY(16).
 	 * If so the probe starts directly in DA_STATE_PROBE2.
 	 */
 	if (SID_ANSI_REV(&cgd->inq_data) >= SCSI_REV_SPC3) {
 		softc->flags |= DA_FLAG_CAN_RC16;
 		softc->state = DA_STATE_PROBE2;
 	}
 
 	/*
 	 * Register this media as a disk.
 	 */
 	softc->disk = disk_alloc();
 	softc->disk->d_devstat = devstat_new_entry(periph->periph_name,
 			  periph->unit_number, 0,
 			  DEVSTAT_BS_UNAVAILABLE,
 			  SID_TYPE(&cgd->inq_data) |
 			  XPORT_DEVSTAT_TYPE(cpi.transport),
 			  DEVSTAT_PRIORITY_DISK);
 	softc->disk->d_open = daopen;
 	softc->disk->d_close = daclose;
 	softc->disk->d_strategy = dastrategy;
 	softc->disk->d_dump = dadump;
 	softc->disk->d_getattr = dagetattr;
 	softc->disk->d_gone = dadiskgonecb;
 	softc->disk->d_name = "da";
 	softc->disk->d_drv1 = periph;
 	/* Clamp the SIM's reported maximum I/O size into [DFLTPHYS, MAXPHYS]. */
 	if (cpi.maxio == 0)
 		softc->disk->d_maxsize = DFLTPHYS;	/* traditional default */
 	else if (cpi.maxio > MAXPHYS)
 		softc->disk->d_maxsize = MAXPHYS;	/* for safety */
 	else
 		softc->disk->d_maxsize = cpi.maxio;
 	softc->disk->d_unit = periph->unit_number;
 	softc->disk->d_flags = 0;
 	if ((softc->quirks & DA_Q_NO_SYNC_CACHE) == 0)
 		softc->disk->d_flags |= DISKFLAG_CANFLUSHCACHE;
 	/* Build the description as "<vendor> <product>" from the inquiry data. */
 	cam_strvis(softc->disk->d_descr, cgd->inq_data.vendor,
 	    sizeof(cgd->inq_data.vendor), sizeof(softc->disk->d_descr));
 	strlcat(softc->disk->d_descr, " ", sizeof(softc->disk->d_descr));
 	cam_strvis(&softc->disk->d_descr[strlen(softc->disk->d_descr)],
 	    cgd->inq_data.product, sizeof(cgd->inq_data.product),
 	    sizeof(softc->disk->d_descr) - strlen(softc->disk->d_descr));
 	softc->disk->d_hba_vendor = cpi.hba_vendor;
 	softc->disk->d_hba_device = cpi.hba_device;
 	softc->disk->d_hba_subvendor = cpi.hba_subvendor;
 	softc->disk->d_hba_subdevice = cpi.hba_subdevice;
 
 	/*
 	 * Acquire a reference to the periph before we register with GEOM.
 	 * We'll release this reference once GEOM calls us back (via
 	 * dadiskgonecb()) telling us that our provider has been freed.
 	 *
 	 * NOTE(review): the failure path below returns without undoing the
 	 * softc/disk allocations or stopping sendordered_c in this function;
 	 * confirm that the caller's error handling cleans these up.
 	 */
 	if (cam_periph_acquire(periph) != CAM_REQ_CMP) {
 		xpt_print(periph->path, "%s: lost periph during "
 			  "registration!\n", __func__);
 		mtx_lock(periph->sim->mtx);
 		return (CAM_REQ_CMP_ERR);
 	}
 
 	disk_create(softc->disk, DISK_VERSION);
 	mtx_lock(periph->sim->mtx);
 
 	/*
 	 * Add async callbacks for events of interest.
 	 * I don't bother checking if this fails as,
 	 * in most cases, the system will function just
 	 * fine without them and the only alternative
 	 * would be to not attach the device on failure.
 	 */
 	xpt_register_async(AC_SENT_BDR | AC_BUS_RESET | AC_LOST_DEVICE |
 	    AC_ADVINFO_CHANGED | AC_SCSI_AEN | AC_UNIT_ATTENTION,
 	    daasync, periph, periph->path);
 
 	/*
 	 * Emit an attribute changed notification just in case 
 	 * physical path information arrived before our async
 	 * event handler was registered, but after anyone attaching
 	 * to our disk device polled it.
 	 */
 	disk_attr_changed(softc->disk, "GEOM::physpath", M_NOWAIT);
 
 	/*
 	 * Schedule a periodic media polling events.
 	 * Polling is armed only for removable media whose inquiry flags do
 	 * not include SID_AEN, and only when da_poll_period is non-zero.
 	 */
 	callout_init_mtx(&softc->mediapoll_c, periph->sim->mtx, 0);
 	if ((softc->flags & DA_FLAG_PACK_REMOVABLE) &&
 	    (cgd->inq_flags & SID_AEN) == 0 &&
 	    da_poll_period != 0)
 		callout_reset(&softc->mediapoll_c, da_poll_period * hz,
 		    damediapoll, periph);
 
 	/* Kick off the probe: dastart() runs in the state chosen above. */
 	xpt_schedule(periph, CAM_PRIORITY_DEV);
 
 	return(CAM_REQ_CMP);
 }
 
 /*
  * Peripheral start routine (handed to cam_periph_alloc() by daasync()).
  * Fills in and issues 'start_ccb' according to softc->state:
  *
  *   DA_STATE_NORMAL - hand the CCB to an immediate-priority waiter, or
  *                     issue a batched BIO_DELETE (UNMAP or WRITE SAME), or
  *                     the next queued read/write/flush, or a TEST UNIT
  *                     READY when softc->tur is set and no bio is queued.
  *   DA_STATE_PROBE  - READ CAPACITY(10).
  *   DA_STATE_PROBE2 - READ CAPACITY(16).
  *
  * All commands complete through dadone().
  */
 static void
 dastart(struct cam_periph *periph, union ccb *start_ccb)
 {
 	struct da_softc *softc;
 
 	softc = (struct da_softc *)periph->softc;
 
 	CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("dastart\n"));
 
 	switch (softc->state) {
 	case DA_STATE_NORMAL:
 	{
 		struct bio *bp, *bp1;
 		uint8_t tag_code;
 
 		/* Execute immediate CCB if waiting. */
 		if (periph->immediate_priority <= periph->pinfo.priority) {
 			CAM_DEBUG(periph->path, CAM_DEBUG_SUBTRACE,
 					("queuing for immediate ccb\n"));
 			start_ccb->ccb_h.ccb_state = DA_CCB_WAITING;
 			SLIST_INSERT_HEAD(&periph->ccb_list, &start_ccb->ccb_h,
 					  periph_links.sle);
 			periph->immediate_priority = CAM_PRIORITY_NONE;
 			wakeup(&periph->ccb_list);
 			/* May have more work to do, so ensure we stay scheduled */
 			daschedule(periph);
 			break;
 		}
 
 		/*
 		 * Run BIO_DELETE if not running yet.
 		 * Only one delete command is in flight at a time; the first
 		 * bio (bp) rides on the CCB itself, any further bios merged
 		 * into the same command are parked on delete_run_queue.
 		 */
 		if (!softc->delete_running &&
 		    (bp = bioq_first(&softc->delete_queue)) != NULL) {
 		    uint64_t lba;
 		    u_int count;
 
 		    if (softc->delete_method == DA_DELETE_UNMAP) {
 			uint8_t *buf = softc->unmap_buf;
 			uint64_t lastlba = (uint64_t)-1;
 			uint32_t lastcount = 0;
 			int blocks = 0, off, ranges = 0;
 
 			softc->delete_running = 1;
 			bzero(softc->unmap_buf, sizeof(softc->unmap_buf));
 			bp1 = bp;
 			/*
 			 * Build the UNMAP parameter list.  Range descriptors
 			 * are 16 bytes each and start at byte 8 of the
 			 * buffer: an 8-byte LBA followed by a 4-byte count.
 			 */
 			do {
 				bioq_remove(&softc->delete_queue, bp1);
 				if (bp1 != bp)
 					bioq_insert_tail(&softc->delete_run_queue, bp1);
 				lba = bp1->bio_pblkno;
 				count = bp1->bio_bcount / softc->params.secsize;
 
 				/* Try to extend the previous range. */
 				if (lba == lastlba) {
 					lastcount += count;
 					off = (ranges - 1) * 16 + 8;
 					scsi_ulto4b(lastcount, &buf[off + 8]);
 				} else if (count > 0) {
 					off = ranges * 16 + 8;
 					scsi_u64to8b(lba, &buf[off + 0]);
 					scsi_ulto4b(count, &buf[off + 8]);
 					lastcount = count;
 					ranges++;
 				}
 				blocks += count;
 				lastlba = lba + count;
 				bp1 = bioq_first(&softc->delete_queue);
 				/*
 				 * Stop when the queue is empty, the range
 				 * limit is reached, or adding the next bio
 				 * would exceed unmap_max_lba blocks.
 				 */
 				if (bp1 == NULL ||
 				    ranges >= softc->unmap_max_ranges ||
 				    blocks + bp1->bio_bcount /
 				     softc->params.secsize > softc->unmap_max_lba)
 					break;
 			} while (1);
 			/* Header: data length at byte 0, descriptor length at byte 2. */
 			scsi_ulto2b(ranges * 16 + 6, &buf[0]);
 			scsi_ulto2b(ranges * 16, &buf[2]);
 
 			scsi_unmap(&start_ccb->csio,
 					/*retries*/da_retry_count,
 					/*cbfcnp*/dadone,
 					/*tag_action*/MSG_SIMPLE_Q_TAG,
 					/*byte2*/0,
 					/*data_ptr*/ buf,
 					/*dxfer_len*/ ranges * 16 + 8,
 					/*sense_len*/SSD_FULL_SIZE,
 					da_default_timeout * 1000);
 			start_ccb->ccb_h.ccb_state = DA_CCB_DELETE;
 			goto out;
 		    } else if (softc->delete_method == DA_DELETE_ZERO ||
 			       softc->delete_method == DA_DELETE_WS10 ||
 			       softc->delete_method == DA_DELETE_WS16) {
 			softc->delete_running = 1;
 			lba = bp->bio_pblkno;
 			count = 0;
 			bp1 = bp;
 			/*
 			 * Merge bios that are contiguous on disk into one
 			 * WRITE SAME, as long as the combined block count
 			 * stays within 0xffff.
 			 */
 			do {
 				bioq_remove(&softc->delete_queue, bp1);
 				if (bp1 != bp)
 					bioq_insert_tail(&softc->delete_run_queue, bp1);
 				count += bp1->bio_bcount / softc->params.secsize;
 				bp1 = bioq_first(&softc->delete_queue);
 				if (bp1 == NULL ||
 				    lba + count != bp1->bio_pblkno ||
 				    count + bp1->bio_bcount /
 				     softc->params.secsize > 0xffff)
 					break;
 			} while (1);
 
 			/*
 			 * DA_DELETE_ZERO writes zeroes without the UNMAP
 			 * bit; only DA_DELETE_WS16 uses the 16-byte CDB.
 			 */
 			scsi_write_same(&start_ccb->csio,
 					/*retries*/da_retry_count,
 					/*cbfcnp*/dadone,
 					/*tag_action*/MSG_SIMPLE_Q_TAG,
 					/*byte2*/softc->delete_method ==
 					    DA_DELETE_ZERO ? 0 : SWS_UNMAP,
 					softc->delete_method ==
 					    DA_DELETE_WS16 ? 16 : 10,
 					/*lba*/lba,
 					/*block_count*/count,
 					/*data_ptr*/ __DECONST(void *,
 					    zero_region),
 					/*dxfer_len*/ softc->params.secsize,
 					/*sense_len*/SSD_FULL_SIZE,
 					da_default_timeout * 1000);
 			start_ccb->ccb_h.ccb_state = DA_CCB_DELETE;
 			goto out;
 		    } else {
 			/*
 			 * No usable delete method: complete every queued
 			 * delete with error 0 and carry on with regular I/O.
 			 */
 			bioq_flush(&softc->delete_queue, NULL, 0);
 			/* FALLTHROUGH */
 		    }
 		}
 
 		/* Run regular command. */
 		bp = bioq_takefirst(&softc->bio_queue);
 		if (bp == NULL) {
 			/*
 			 * Nothing queued: either send the requested TEST
 			 * UNIT READY or hand the CCB back.
 			 */
 			if (softc->tur) {
 				softc->tur = 0;
 				scsi_test_unit_ready(&start_ccb->csio,
 				     /*retries*/ da_retry_count,
 				     dadone,
 				     MSG_SIMPLE_Q_TAG,
 				     SSD_FULL_SIZE,
 				     da_default_timeout * 1000);
 				start_ccb->ccb_h.ccb_bp = NULL;
 				start_ccb->ccb_h.ccb_state = DA_CCB_TUR;
 				xpt_action(start_ccb);
 			} else
 				xpt_release_ccb(start_ccb);
 			break;
 		}
 		/*
 		 * Real I/O is about to be issued, so the pending TEST UNIT
 		 * READY request is cancelled and a periph reference dropped
 		 * (presumably the one taken when tur was set -- TODO confirm).
 		 */
 		if (softc->tur) {
 			softc->tur = 0;
 			cam_periph_release_locked(periph);
 		}
 
 		/* Use an ordered tag when the bio asks for it or one is due. */
 		if ((bp->bio_flags & BIO_ORDERED) != 0 ||
 		    (softc->flags & DA_FLAG_NEED_OTAG) != 0) {
 			softc->flags &= ~DA_FLAG_NEED_OTAG;
 			softc->ordered_tag_count++;
 			tag_code = MSG_ORDERED_Q_TAG;
 		} else {
 			tag_code = MSG_SIMPLE_Q_TAG;
 		}
 
 		switch (bp->bio_cmd) {
 		case BIO_READ:
 		case BIO_WRITE:
 			scsi_read_write(&start_ccb->csio,
 					/*retries*/da_retry_count,
 					/*cbfcnp*/dadone,
 					/*tag_action*/tag_code,
 					/*read_op*/bp->bio_cmd
 						== BIO_READ,
 					/*byte2*/0,
 					softc->minimum_cmd_size,
 					/*lba*/bp->bio_pblkno,
 					/*block_count*/bp->bio_bcount /
 					softc->params.secsize,
 					/*data_ptr*/ bp->bio_data,
 					/*dxfer_len*/ bp->bio_bcount,
 					/*sense_len*/SSD_FULL_SIZE,
 					da_default_timeout * 1000);
 			break;
 		case BIO_FLUSH:
 			/*
 			 * BIO_FLUSH doesn't currently communicate
 			 * range data, so we synchronize the cache
 			 * over the whole disk.  We also force
 			 * ordered tag semantics the flush applies
 			 * to all previously queued I/O.
 			 */
 			scsi_synchronize_cache(&start_ccb->csio,
 					       /*retries*/1,
 					       /*cbfcnp*/dadone,
 					       MSG_ORDERED_Q_TAG,
 					       /*begin_lba*/0,
 					       /*lb_count*/0,
 					       SSD_FULL_SIZE,
 					       da_default_timeout*1000);
 			break;
 		}
 		start_ccb->ccb_h.ccb_state = DA_CCB_BUFFER_IO;
 
 out:
 		/*
 		 * Common issue path for delete and regular commands.
 		 * Block out any asyncronous callbacks
 		 * while we touch the pending ccb list.
 		 */
 		LIST_INSERT_HEAD(&softc->pending_ccbs,
 				 &start_ccb->ccb_h, periph_links.le);
 		softc->outstanding_cmds++;
 
 		/* We expect a unit attention from this device */
 		if ((softc->flags & DA_FLAG_RETRY_UA) != 0) {
 			start_ccb->ccb_h.ccb_state |= DA_CCB_RETRY_UA;
 			softc->flags &= ~DA_FLAG_RETRY_UA;
 		}
 
 		start_ccb->ccb_h.ccb_bp = bp;
 		xpt_action(start_ccb);
 
 		/* May have more work to do, so ensure we stay scheduled */
 		daschedule(periph);
 		break;
 	}
 	case DA_STATE_PROBE:
 	{
 		struct scsi_read_capacity_data *rcap;
 
 		rcap = (struct scsi_read_capacity_data *)
 		    malloc(sizeof(*rcap), M_SCSIDA, M_NOWAIT|M_ZERO);
 		if (rcap == NULL) {
 			printf("dastart: Couldn't malloc read_capacity data\n");
 			/*
 			 * NOTE(review): start_ccb is neither issued nor
 			 * released on this path -- confirm this is intended.
 			 */
 			/* da_free_periph??? */
 			break;
 		}
 		scsi_read_capacity(&start_ccb->csio,
 				   /*retries*/da_retry_count,
 				   dadone,
 				   MSG_SIMPLE_Q_TAG,
 				   rcap,
 				   SSD_FULL_SIZE,
 				   /*timeout*/5000);
 		start_ccb->ccb_h.ccb_bp = NULL;
 		start_ccb->ccb_h.ccb_state = DA_CCB_PROBE;
 		xpt_action(start_ccb);
 		break;
 	}
 	case DA_STATE_PROBE2:
 	{
 		struct scsi_read_capacity_data_long *rcaplong;
 
 		rcaplong = (struct scsi_read_capacity_data_long *)
 			malloc(sizeof(*rcaplong), M_SCSIDA, M_NOWAIT|M_ZERO);
 		if (rcaplong == NULL) {
 			printf("dastart: Couldn't malloc read_capacity data\n");
 			/*
 			 * NOTE(review): start_ccb is neither issued nor
 			 * released on this path -- confirm this is intended.
 			 */
 			/* da_free_periph??? */
 			break;
 		}
 		scsi_read_capacity_16(&start_ccb->csio,
 				      /*retries*/ da_retry_count,
 				      /*cbfcnp*/ dadone,
 				      /*tag_action*/ MSG_SIMPLE_Q_TAG,
 				      /*lba*/ 0,
 				      /*reladr*/ 0,
 				      /*pmi*/ 0,
 				      /*rcap_buf*/ (uint8_t *)rcaplong,
 				      /*rcap_buf_len*/ sizeof(*rcaplong),
 				      /*sense_len*/ SSD_FULL_SIZE,
 				      /*timeout*/ da_default_timeout * 1000);
 		start_ccb->ccb_h.ccb_bp = NULL;
 		start_ccb->ccb_h.ccb_state = DA_CCB_PROBE2;
 		xpt_action(start_ccb);	
 		break;
 	}
 	}
 }
 
 static int
 cmd6workaround(union ccb *ccb)
 {
 	struct scsi_rw_6 cmd6;
 	struct scsi_rw_10 *cmd10;
 	struct da_softc *softc;
 	u_int8_t *cdb;
 	struct bio *bp;
 	int frozen;
 
 	cdb = ccb->csio.cdb_io.cdb_bytes;
 	softc = (struct da_softc *)xpt_path_periph(ccb->ccb_h.path)->softc;
 
 	if (ccb->ccb_h.ccb_state == DA_CCB_DELETE) {
 		if (softc->delete_method == DA_DELETE_UNMAP) {
 			xpt_print(ccb->ccb_h.path, "UNMAP is not supported, "
 			    "switching to WRITE SAME(16) with UNMAP.\n");
 			dadeletemethodset(softc, DA_DELETE_WS16);
 		} else if (softc->delete_method == DA_DELETE_WS16) {
 			xpt_print(ccb->ccb_h.path,
 			    "WRITE SAME(16) with UNMAP is not supported, "
 			    "disabling BIO_DELETE.\n");
 			dadeletemethodset(softc, DA_DELETE_DISABLE);
 		} else if (softc->delete_method == DA_DELETE_WS10) {
 			xpt_print(ccb->ccb_h.path,
 			    "WRITE SAME(10) with UNMAP is not supported, "
 			    "disabling BIO_DELETE.\n");
 			dadeletemethodset(softc, DA_DELETE_DISABLE);
 		} else if (softc->delete_method == DA_DELETE_ZERO) {
 			xpt_print(ccb->ccb_h.path,
 			    "WRITE SAME(10) is not supported, "
 			    "disabling BIO_DELETE.\n");
 			dadeletemethodset(softc, DA_DELETE_DISABLE);
 		} else
 			dadeletemethodset(softc, DA_DELETE_DISABLE);
 		while ((bp = bioq_takefirst(&softc->delete_run_queue))
 		    != NULL)
 			bioq_disksort(&softc->delete_queue, bp);
 		bioq_insert_tail(&softc->delete_queue,
 		    (struct bio *)ccb->ccb_h.ccb_bp);
 		ccb->ccb_h.ccb_bp = NULL;
 		return (0);
 	}
 
 	/* Translation only possible if CDB is an array and cmd is R/W6 */
 	if ((ccb->ccb_h.flags & CAM_CDB_POINTER) != 0 ||
 	    (*cdb != READ_6 && *cdb != WRITE_6))
 		return 0;
 
 	xpt_print(ccb->ccb_h.path, "READ(6)/WRITE(6) not supported, "
 	    "increasing minimum_cmd_size to 10.\n");
  	softc->minimum_cmd_size = 10;
 
 	bcopy(cdb, &cmd6, sizeof(struct scsi_rw_6));
 	cmd10 = (struct scsi_rw_10 *)cdb;
 	cmd10->opcode = (cmd6.opcode == READ_6) ? READ_10 : WRITE_10;
 	cmd10->byte2 = 0;
 	scsi_ulto4b(scsi_3btoul(cmd6.addr), cmd10->addr);
 	cmd10->reserved = 0;
 	scsi_ulto2b(cmd6.length, cmd10->length);
 	cmd10->control = cmd6.control;
 	ccb->csio.cdb_len = sizeof(*cmd10);
 
 	/* Requeue request, unfreezing queue if necessary */
 	frozen = (ccb->ccb_h.status & CAM_DEV_QFRZN) != 0;
  	ccb->ccb_h.status = CAM_REQUEUE_REQ;
 	xpt_action(ccb);
 	if (frozen) {
 		cam_release_devq(ccb->ccb_h.path,
 				 /*relsim_flags*/0,
 				 /*reduction*/0,
 				 /*timeout*/0,
 				 /*getcount_only*/0);
 	}
 	return (ERESTART);
 }
 
 static void
 dadone(struct cam_periph *periph, union ccb *done_ccb)
 {
 	struct da_softc *softc;
 	struct ccb_scsiio *csio;
 	u_int32_t  priority;
 	da_ccb_state state;
 
 	softc = (struct da_softc *)periph->softc;
 	priority = done_ccb->ccb_h.pinfo.priority;
 
 	CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("dadone\n"));
 
 	csio = &done_ccb->csio;
 	state = csio->ccb_h.ccb_state & DA_CCB_TYPE_MASK;
 	switch (state) {
 	case DA_CCB_BUFFER_IO:
 	case DA_CCB_DELETE:
 	{
 		struct bio *bp, *bp1;
 
 		bp = (struct bio *)done_ccb->ccb_h.ccb_bp;
 		if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
 			int error;
 			int sf;
 
 			if ((csio->ccb_h.ccb_state & DA_CCB_RETRY_UA) != 0)
 				sf = SF_RETRY_UA;
 			else
 				sf = 0;
 
 			error = daerror(done_ccb, CAM_RETRY_SELTO, sf);
 			if (error == ERESTART) {
 				/*
 				 * A retry was scheuled, so
 				 * just return.
 				 */
 				return;
 			}
 			bp = (struct bio *)done_ccb->ccb_h.ccb_bp;
 			if (error != 0) {
 				int queued_error;
 
 				/*
 				 * return all queued I/O with EIO, so that
 				 * the client can retry these I/Os in the
 				 * proper order should it attempt to recover.
 				 */
 				queued_error = EIO;
 
 				if (error == ENXIO
 				 && (softc->flags & DA_FLAG_PACK_INVALID)== 0) {
 					/*
 					 * Catastrophic error.  Mark our pack as
 					 * invalid.
 					 */
 					/*
 					 * XXX See if this is really a media
 					 * XXX change first?
 					 */
 					xpt_print(periph->path,
 					    "Invalidating pack\n");
 					softc->flags |= DA_FLAG_PACK_INVALID;
 					queued_error = ENXIO;
 				}
 				bioq_flush(&softc->bio_queue, NULL,
 					   queued_error);
 				if (bp != NULL) {
 					bp->bio_error = error;
 					bp->bio_resid = bp->bio_bcount;
 					bp->bio_flags |= BIO_ERROR;
 				}
 			} else if (bp != NULL) {
 				bp->bio_resid = csio->resid;
 				bp->bio_error = 0;
 				if (bp->bio_resid != 0)
 					bp->bio_flags |= BIO_ERROR;
 			}
 			if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0)
 				cam_release_devq(done_ccb->ccb_h.path,
 						 /*relsim_flags*/0,
 						 /*reduction*/0,
 						 /*timeout*/0,
 						 /*getcount_only*/0);
 		} else if (bp != NULL) {
 			if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0)
 				panic("REQ_CMP with QFRZN");
 			bp->bio_resid = csio->resid;
 			if (csio->resid > 0)
 				bp->bio_flags |= BIO_ERROR;
 			if (softc->error_inject != 0) {
 				bp->bio_error = softc->error_inject;
 				bp->bio_resid = bp->bio_bcount;
 				bp->bio_flags |= BIO_ERROR;
 				softc->error_inject = 0;
 			}
 
 		}
 
 		/*
 		 * Block out any asyncronous callbacks
 		 * while we touch the pending ccb list.
 		 */
 		LIST_REMOVE(&done_ccb->ccb_h, periph_links.le);
 		softc->outstanding_cmds--;
 		if (softc->outstanding_cmds == 0)
 			softc->flags |= DA_FLAG_WENT_IDLE;
 
 		if ((softc->flags & DA_FLAG_PACK_INVALID) != 0) {
 			xpt_print(periph->path, "oustanding %d\n",
 				  softc->outstanding_cmds);
 		}
 
 		if (state == DA_CCB_DELETE) {
 			while ((bp1 = bioq_takefirst(&softc->delete_run_queue))
 			    != NULL) {
 				bp1->bio_resid = bp->bio_resid;
 				bp1->bio_error = bp->bio_error;
 				if (bp->bio_flags & BIO_ERROR)
 					bp1->bio_flags |= BIO_ERROR;
 				biodone(bp1);
 			}
 			softc->delete_running = 0;
 			if (bp != NULL)
 				biodone(bp);
 			daschedule(periph);
 		} else if (bp != NULL)
 			biodone(bp);
 		break;
 	}
 	case DA_CCB_PROBE:
 	case DA_CCB_PROBE2:
 	{
 		struct	   scsi_read_capacity_data *rdcap;
 		struct     scsi_read_capacity_data_long *rcaplong;
 		char	   announce_buf[80];
 
 		rdcap = NULL;
 		rcaplong = NULL;
 		if (state == DA_CCB_PROBE)
 			rdcap =(struct scsi_read_capacity_data *)csio->data_ptr;
 		else
 			rcaplong = (struct scsi_read_capacity_data_long *)
 				csio->data_ptr;
 
 		if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
 			struct disk_params *dp;
 			uint32_t block_size;
 			uint64_t maxsector;
 			u_int lbppbe;	/* LB per physical block exponent. */
 			u_int lalba;	/* Lowest aligned LBA. */
 
 			if (state == DA_CCB_PROBE) {
 				block_size = scsi_4btoul(rdcap->length);
 				maxsector = scsi_4btoul(rdcap->addr);
 				lbppbe = 0;
 				lalba = 0;
 
 				/*
 				 * According to SBC-2, if the standard 10
 				 * byte READ CAPACITY command returns 2^32,
 				 * we should issue the 16 byte version of
 				 * the command, since the device in question
 				 * has more sectors than can be represented
 				 * with the short version of the command.
 				 */
 				if (maxsector == 0xffffffff) {
 					softc->state = DA_STATE_PROBE2;
 					free(rdcap, M_SCSIDA);
 					xpt_release_ccb(done_ccb);
 					xpt_schedule(periph, priority);
 					return;
 				}
 			} else {
 				block_size = scsi_4btoul(rcaplong->length);
 				maxsector = scsi_8btou64(rcaplong->addr);
 				lbppbe = rcaplong->prot_lbppbe & SRC16_LBPPBE;
 				lalba = scsi_2btoul(rcaplong->lalba_lbp);
 			}
 
 			/*
 			 * Because GEOM code just will panic us if we
 			 * give them an 'illegal' value we'll avoid that
 			 * here.
 			 */
 			if (block_size == 0 && maxsector == 0) {
 				snprintf(announce_buf, sizeof(announce_buf),
 				        "0MB (no media?)");
 			} else if (block_size >= MAXPHYS || block_size == 0) {
 				xpt_print(periph->path,
 				    "unsupportable block size %ju\n",
 				    (uintmax_t) block_size);
 				announce_buf[0] = '\0';
 				cam_periph_invalidate(periph);
 			} else {
 				/*
 				 * We pass rcaplong into dasetgeom(),
 				 * because it will only use it if it is
 				 * non-NULL.
 				 */
 				dasetgeom(periph, block_size, maxsector,
 					  rcaplong, sizeof(*rcaplong));
 				if ((lalba & SRC16_LBPME_A)
 				 && softc->delete_method == DA_DELETE_NONE)
 					dadeletemethodset(softc, DA_DELETE_UNMAP);
 				dp = &softc->params;
 				snprintf(announce_buf, sizeof(announce_buf),
 				        "%juMB (%ju %u byte sectors: %dH %dS/T "
                                         "%dC)", (uintmax_t)
 	                                (((uintmax_t)dp->secsize *
 				        dp->sectors) / (1024*1024)),
 			                (uintmax_t)dp->sectors,
 				        dp->secsize, dp->heads,
                                         dp->secs_per_track, dp->cylinders);
 			}
 		} else {
 			int	error;
 
 			announce_buf[0] = '\0';
 
 			/*
 			 * Retry any UNIT ATTENTION type errors.  They
 			 * are expected at boot.
 			 */
 			error = daerror(done_ccb, CAM_RETRY_SELTO,
 					SF_RETRY_UA|SF_NO_PRINT);
 			if (error == ERESTART) {
 				/*
 				 * A retry was scheduled, so
 				 * just return.
 				 */
 				return;
 			} else if (error != 0) {
 				int asc, ascq;
 				int sense_key, error_code;
 				int have_sense;
 				cam_status status;
 				struct ccb_getdev cgd;
 
 				/* Don't wedge this device's queue */
 				status = done_ccb->ccb_h.status;
 				if ((status & CAM_DEV_QFRZN) != 0)
 					cam_release_devq(done_ccb->ccb_h.path,
 							 /*relsim_flags*/0,
 							 /*reduction*/0,
 							 /*timeout*/0,
 							 /*getcount_only*/0);
 
 
 				xpt_setup_ccb(&cgd.ccb_h, 
 					      done_ccb->ccb_h.path,
 					      CAM_PRIORITY_NORMAL);
 				cgd.ccb_h.func_code = XPT_GDEV_TYPE;
 				xpt_action((union ccb *)&cgd);
 
 				if (scsi_extract_sense_ccb(done_ccb,
 				    &error_code, &sense_key, &asc, &ascq))
 					have_sense = TRUE;
 				else
 					have_sense = FALSE;
 
 				/*
 				 * If we tried READ CAPACITY(16) and failed,
 				 * fallback to READ CAPACITY(10).
 				 */
 				if ((state == DA_CCB_PROBE2) &&
 				    (softc->flags & DA_FLAG_CAN_RC16) &&
 				    (((csio->ccb_h.status & CAM_STATUS_MASK) ==
 					CAM_REQ_INVALID) ||
 				     ((have_sense) &&
 				      (error_code == SSD_CURRENT_ERROR) &&
 				      (sense_key == SSD_KEY_ILLEGAL_REQUEST)))) {
 					softc->flags &= ~DA_FLAG_CAN_RC16;
 					softc->state = DA_STATE_PROBE;
 					free(rdcap, M_SCSIDA);
 					xpt_release_ccb(done_ccb);
 					xpt_schedule(periph, priority);
 					return;
 				} else
 				/*
 				 * Attach to anything that claims to be a
 				 * direct access or optical disk device,
 				 * as long as it doesn't return a "Logical
 				 * unit not supported" (0x25) error.
 				 */
 				if ((have_sense) && (asc != 0x25)
 				 && (error_code == SSD_CURRENT_ERROR)) {
 					const char *sense_key_desc;
 					const char *asc_desc;
 
 					scsi_sense_desc(sense_key, asc, ascq,
 							&cgd.inq_data,
 							&sense_key_desc,
 							&asc_desc);
 					snprintf(announce_buf,
 					    sizeof(announce_buf),
 						"Attempt to query device "
 						"size failed: %s, %s",
 						sense_key_desc,
 						asc_desc);
 				} else { 
 					if (have_sense)
 						scsi_sense_print(
 							&done_ccb->csio);
 					else {
 						xpt_print(periph->path,
 						    "got CAM status %#x\n",
 						    done_ccb->ccb_h.status);
 					}
 
 					xpt_print(periph->path, "fatal error, "
 					    "failed to attach to device\n");
 
 					/*
 					 * Free up resources.
 					 */
 					cam_periph_invalidate(periph);
 				} 
 			}
 		}
 		free(csio->data_ptr, M_SCSIDA);
 		if (announce_buf[0] != '\0' && ((softc->flags & DA_FLAG_PROBED) == 0)) {
 			/*
 			 * Create our sysctl variables, now that we know
 			 * we have successfully attached.
 			 */
 			/* increase the refcount */
 			if (cam_periph_acquire(periph) == CAM_REQ_CMP) {
 				taskqueue_enqueue(taskqueue_thread,
 						  &softc->sysctl_task);
 				xpt_announce_periph(periph, announce_buf);
 			} else {
 				xpt_print(periph->path, "fatal error, "
 				    "could not acquire reference count\n");
 			}
 		}
 		/*
 		 * Since our peripheral may be invalidated by an error
 		 * above or an external event, we must release our CCB
 		 * before releasing the probe lock on the peripheral.
 		 * The peripheral will only go away once the last lock
 		 * is removed, and we need it around for the CCB release
 		 * operation.
 		 */
 		xpt_release_ccb(done_ccb);
 		softc->state = DA_STATE_NORMAL;
 		daschedule(periph);
 		wakeup(&softc->disk->d_mediasize);
 		if ((softc->flags & DA_FLAG_PROBED) == 0) {
 			softc->flags |= DA_FLAG_PROBED;
 			cam_periph_unhold(periph);
 		} else
 			cam_periph_release_locked(periph);
 		return;
 	}
 	case DA_CCB_WAITING:
 	{
 		/* Caller will release the CCB */
 		wakeup(&done_ccb->ccb_h.cbfcnp);
 		return;
 	}
 	case DA_CCB_DUMP:
 		/* No-op.  We're polling */
 		return;
 	case DA_CCB_TUR:
 	{
 		if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
 
 			if (daerror(done_ccb, CAM_RETRY_SELTO,
 			    SF_RETRY_UA | SF_NO_RECOVERY | SF_NO_PRINT) ==
 			    ERESTART)
 				return;
 			if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0)
 				cam_release_devq(done_ccb->ccb_h.path,
 						 /*relsim_flags*/0,
 						 /*reduction*/0,
 						 /*timeout*/0,
 						 /*getcount_only*/0);
 		}
 		xpt_release_ccb(done_ccb);
 		cam_periph_release_locked(periph);
 		return;
 	}
 	default:
 		break;
 	}
 	xpt_release_ccb(done_ccb);
 }
 
 /*
  * Start a new READ CAPACITY probe on a disk that has already been probed.
  *
  * Nothing is done while DA_FLAG_PROBED is still clear, since that means a
  * probe is already in flight.  Otherwise a periph reference is taken, the
  * softc state is set to DA_STATE_PROBE2 when DA_FLAG_CAN_RC16 is set (and
  * to DA_STATE_PROBE when it is not), and the periph is scheduled at
  * CAM_PRIORITY_DEV.
  */
 static void
 dareprobe(struct cam_periph *periph)
 {
 	struct da_softc *sc;
 	cam_status acquired;
 
 	sc = (struct da_softc *)periph->softc;
 
 	/* A probe is still running; leave it alone. */
 	if (!(sc->flags & DA_FLAG_PROBED))
 		return;
 
 	/* The failure is only checked by the assertion below. */
 	acquired = cam_periph_acquire(periph);
 	KASSERT(acquired == CAM_REQ_CMP,
 	    ("dareprobe: cam_periph_acquire failed"));
 
 	/* Pick the probe state matching the READ CAPACITY variant in use. */
 	sc->state = (sc->flags & DA_FLAG_CAN_RC16) ?
 	    DA_STATE_PROBE2 : DA_STATE_PROBE;
 
 	xpt_schedule(periph, CAM_PRIORITY_DEV);
 }
 
 /*
  * Error handler for da CCBs; a front end to cam_periph_error().
  *
  * Before the generic handler runs, the completion is inspected for a few
  * conditions this driver reacts to itself:
  *  - CAM_REQ_INVALID status or ILLEGAL REQUEST sense: cmd6workaround() is
  *    called, which lets devices that do not support READ(6)/WRITE(6) be
  *    upgraded to 10 byte cdbs.  If it returns ERESTART, so do we.
  *  - UNIT ATTENTION with ASC/ASCQ 2A/09 (capacity data has changed): a
  *    reprobe is started and SF_NO_PRINT is added to sense_flags.
  *  - UNIT ATTENTION with ASC/ASCQ 28/00: disk_media_changed() is called.
  *  - NOT READY with ASC 3A while DA_FLAG_SAW_MEDIA is set: the flag is
  *    cleared and disk_media_gone() is called.
  *
  * Returns ERESTART as described above, otherwise the result of
  * cam_periph_error() called with SF_RETRY_UA forced on.
  */
 static int
 daerror(union ccb *ccb, u_int32_t cam_flags, u_int32_t sense_flags)
 {
 	struct cam_periph *periph;
 	struct da_softc *sc;
 	int err, error_code, sense_key, asc, ascq;
 
 	periph = xpt_path_periph(ccb->ccb_h.path);
 	sc = (struct da_softc *)periph->softc;
 	err = 0;
 
 	if ((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_INVALID) {
 		err = cmd6workaround(ccb);
 	} else if (scsi_extract_sense_ccb(ccb,
 	    &error_code, &sense_key, &asc, &ascq)) {
 		switch (sense_key) {
 		case SSD_KEY_ILLEGAL_REQUEST:
 			err = cmd6workaround(ccb);
 			break;
 		case SSD_KEY_UNIT_ATTENTION:
 			if (asc == 0x2A && ascq == 0x09) {
 				/*
 				 * CAPACITY DATA HAS CHANGED: query the
 				 * capacity again and notify upper layers.
 				 */
 				xpt_print(periph->path,
 				    "capacity data has changed\n");
 				dareprobe(periph);
 				sense_flags |= SF_NO_PRINT;
 			} else if (asc == 0x28 && ascq == 0x00) {
 				disk_media_changed(sc->disk, M_NOWAIT);
 			}
 			break;
 		case SSD_KEY_NOT_READY:
 			if (asc == 0x3a &&
 			    (sc->flags & DA_FLAG_SAW_MEDIA)) {
 				sc->flags &= ~DA_FLAG_SAW_MEDIA;
 				disk_media_gone(sc->disk, M_NOWAIT);
 			}
 			break;
 		default:
 			break;
 		}
 	}
 	if (err == ERESTART)
 		return (ERESTART);
 
 	/*
 	 * XXX
 	 * Until we have a better way of doing pack validation,
 	 * don't treat UAs as errors.
 	 */
 	sense_flags |= SF_RETRY_UA;
 	return (cam_periph_error(ccb, cam_flags, sense_flags,
 	    &sc->saved_ccb));
 }
 
 /*
  * Media poll callout handler; arg is the cam_periph being polled.
  *
  * When no poll is already flagged (softc->tur is zero) and no commands
  * are outstanding, a periph reference is taken; on success softc->tur is
  * set and the periph is scheduled.  The callout is then re-armed for
  * da_poll_period seconds unless da_poll_period is zero.
  */
 static void
 damediapoll(void *arg)
 {
 	struct cam_periph *periph;
 	struct da_softc *sc;
 
 	periph = arg;
 	sc = periph->softc;
 
 	/* The reference is only taken when the first two tests pass. */
 	if (!sc->tur && sc->outstanding_cmds == 0 &&
 	    cam_periph_acquire(periph) == CAM_REQ_CMP) {
 		sc->tur = 1;
 		daschedule(periph);
 	}
 
 	/* Polling disabled: do not queue ourselves again. */
 	if (da_poll_period == 0)
 		return;
 	callout_schedule(&sc->mediapoll_c, da_poll_period * hz);
 }
 
 /*
  * Send a PREVENT ALLOW MEDIUM REMOVAL command for the given action
  * (PR_PREVENT or PR_ALLOW) and track the result in DA_FLAG_PACK_LOCKED.
  *
  * No command is sent when the flag already matches the request: PR_ALLOW
  * while the flag is clear, or PR_PREVENT while it is set.  The flag is
  * only updated when the command completes without error.
  */
 static void
 daprevent(struct cam_periph *periph, int action)
 {
 	struct da_softc *sc;
 	union ccb *ccb;
 	int locked, error;
 
 	sc = (struct da_softc *)periph->softc;
 	locked = (sc->flags & DA_FLAG_PACK_LOCKED) != 0;
 
 	/* Already in the requested state. */
 	if ((action == PR_ALLOW && !locked) ||
 	    (action == PR_PREVENT && locked))
 		return;
 
 	ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
 
 	scsi_prevent(&ccb->csio,
 	    /*retries*/1,
 	    /*cbcfp*/dadone,
 	    MSG_SIMPLE_Q_TAG,
 	    action,
 	    SSD_FULL_SIZE,
 	    5000);
 
 	error = cam_periph_runccb(ccb, daerror, CAM_RETRY_SELTO,
 	    SF_RETRY_UA | SF_QUIET_IR, sc->disk->d_devstat);
 
 	if (error == 0) {
 		if (action == PR_ALLOW)
 			sc->flags &= ~DA_FLAG_PACK_LOCKED;
 		else
 			sc->flags |= DA_FLAG_PACK_LOCKED;
 	}
 
 	xpt_release_ccb(ccb);
 }
 
 /*
  * Record the size and geometry of the disk behind periph and publish
  * them to the disk layer.
  *
  * block_len	logical block size in bytes.
  * maxsector	address of the last logical block; the stored sector
  *		count is maxsector + 1.
  * rcaplong	READ CAPACITY(16) data, or NULL when it is not available.
  * rcap_len	number of bytes available at rcaplong.
  *
  * The function fills softc->params, asks the controller for a geometry
  * via XPT_CALC_GEOMETRY (falling back to 255 heads and 255 sectors per
  * track when that fails), stores changed READ CAPACITY(16) data in the
  * EDT via XPT_DEV_ADVINFO, copies the results into softc->disk and
  * finally calls disk_resize().
  */
 static void
 dasetgeom(struct cam_periph *periph, uint32_t block_len, uint64_t maxsector,
 	  struct scsi_read_capacity_data_long *rcaplong, size_t rcap_len)
 {
 	struct ccb_calc_geometry ccg;
 	struct da_softc *softc;
 	struct disk_params *dp;
 	u_int lbppbe, lalba;
 	int error;
 
 	softc = (struct da_softc *)periph->softc;
 
 	dp = &softc->params;
 	dp->secsize = block_len;
 	dp->sectors = maxsector + 1;
 	if (rcaplong != NULL) {
 		/* Physical block exponent and lowest aligned LBA. */
 		lbppbe = rcaplong->prot_lbppbe & SRC16_LBPPBE;
 		lalba = scsi_2btoul(rcaplong->lalba_lbp);
 		lalba &= SRC16_LALBA_A;
 	} else {
 		lbppbe = 0;
 		lalba = 0;
 	}
 
 	if (lbppbe > 0) {
 		/* The stripe is one physical block of 2^lbppbe sectors. */
 		dp->stripesize = block_len << lbppbe;
 		dp->stripeoffset = (dp->stripesize - block_len * lalba) %
 		    dp->stripesize;
 	} else if (softc->quirks & DA_Q_4K) {
 		dp->stripesize = 4096;
 		dp->stripeoffset = 0;
 	} else {
 		dp->stripesize = 0;
 		dp->stripeoffset = 0;
 	}
 	/*
 	 * Have the controller provide us with a geometry
 	 * for this disk.  The only time the geometry
 	 * matters is when we boot and the controller
 	 * is the only one knowledgeable enough to come
 	 * up with something that will make this a bootable
 	 * device.
 	 */
 	xpt_setup_ccb(&ccg.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
 	ccg.ccb_h.func_code = XPT_CALC_GEOMETRY;
 	ccg.block_size = dp->secsize;
 	ccg.volume_size = dp->sectors;
 	ccg.heads = 0;
 	ccg.secs_per_track = 0;
 	ccg.cylinders = 0;
 	xpt_action((union ccb*)&ccg);
 	if ((ccg.ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
 		/*
 		 * We don't know what went wrong here- but just pick
 		 * a geometry so we don't have nasty things like divide
 		 * by zero.
 		 */
 		dp->heads = 255;
 		dp->secs_per_track = 255;
 		dp->cylinders = dp->sectors / (255 * 255);
 		if (dp->cylinders == 0) {
 			dp->cylinders = 1;
 		}
 	} else {
 		dp->heads = ccg.heads;
 		dp->secs_per_track = ccg.secs_per_track;
 		dp->cylinders = ccg.cylinders;
 	}
 
 	/*
 	 * If the user supplied a read capacity buffer, and if it is
 	 * different than the previous buffer, update the data in the EDT.
 	 * If it's the same, we don't bother.  This avoids sending an
 	 * update every time someone opens this device.
 	 */
 	if ((rcaplong != NULL)
 	 && (bcmp(rcaplong, &softc->rcaplong,
 		  min(sizeof(softc->rcaplong), rcap_len)) != 0)) {
 		struct ccb_dev_advinfo cdai;
 
 		xpt_setup_ccb(&cdai.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
 		cdai.ccb_h.func_code = XPT_DEV_ADVINFO;
 		cdai.buftype = CDAI_TYPE_RCAPLONG;
 		/*
 		 * cdai lives on the stack and its flags field has not been
 		 * written yet, so assign it rather than OR into whatever
 		 * happens to be there.
 		 */
 		cdai.flags = CDAI_FLAG_STORE;
 		cdai.bufsiz = rcap_len;
 		cdai.buf = (uint8_t *)rcaplong;
 		xpt_action((union ccb *)&cdai);
 		if ((cdai.ccb_h.status & CAM_DEV_QFRZN) != 0)
 			cam_release_devq(cdai.ccb_h.path, 0, 0, 0, FALSE);
 		if (cdai.ccb_h.status != CAM_REQ_CMP) {
 			xpt_print(periph->path, "%s: failed to set read "
 				  "capacity advinfo\n", __func__);
 			/* Use cam_error_print() to decode the status */
 			cam_error_print((union ccb *)&cdai, CAM_ESF_CAM_STATUS,
 					CAM_EPF_ALL);
 		} else {
 			/* Remember what was stored for the next comparison. */
 			bcopy(rcaplong, &softc->rcaplong,
 			      min(sizeof(softc->rcaplong), rcap_len));
 		}
 	}
 
 	softc->disk->d_sectorsize = softc->params.secsize;
 	softc->disk->d_mediasize = softc->params.secsize * (off_t)softc->params.sectors;
 	softc->disk->d_stripesize = softc->params.stripesize;
 	softc->disk->d_stripeoffset = softc->params.stripeoffset;
 	/* XXX: these are not actually "firmware" values, so they may be wrong */
 	softc->disk->d_fwsectors = softc->params.secs_per_track;
 	softc->disk->d_fwheads = softc->params.heads;
 	softc->disk->d_devstat->block_size = softc->params.secsize;
 	softc->disk->d_devstat->flags &= ~DEVSTAT_BS_UNAVAILABLE;
 	/* Advertise delete support only when a delete method is selected. */
 	if (softc->delete_method > DA_DELETE_DISABLE)
 		softc->disk->d_flags |= DISKFLAG_CANDELETE;
 	else
 		softc->disk->d_flags &= ~DISKFLAG_CANDELETE;
 
 	error = disk_resize(softc->disk, M_NOWAIT);
 	if (error != 0)
 		xpt_print(periph->path, "disk_resize(9) failed, error = %d\n", error);
 }
 
 /*
  * Periodic callout handler; arg is the da_softc.
  *
  * When da_send_ordered is enabled: DA_FLAG_NEED_OTAG is set if no ordered
  * tag was counted since the last run and DA_FLAG_WENT_IDLE is clear,
  * DA_FLAG_WENT_IDLE is cleared while commands are outstanding, and the
  * ordered tag counter is reset.  The callout always re-arms itself.
  */
 static void
 dasendorderedtag(void *arg)
 {
 	struct da_softc *sc;
 
 	sc = arg;
 	if (da_send_ordered != 0) {
 		int went_idle;
 
 		went_idle = (sc->flags & DA_FLAG_WENT_IDLE) != 0;
 		if (sc->ordered_tag_count == 0 && !went_idle)
 			sc->flags |= DA_FLAG_NEED_OTAG;
 
 		if (sc->outstanding_cmds > 0)
 			sc->flags &= ~DA_FLAG_WENT_IDLE;
 
 		sc->ordered_tag_count = 0;
 	}
 
 	/* Re-arm for the next interval. */
 	callout_reset(&sc->sendordered_c,
 	    (da_default_timeout * hz) / DA_ORDEREDTAG_INTERVAL,
 	    dasendorderedtag, sc);
 }
 
 /*
  * Shutdown handler: walk every da peripheral and, for each one that is
  * still open and not quirked with DA_Q_NO_SYNC_CACHE, issue a polled
  * SYNCHRONIZE CACHE so cached data reaches the physical media.
  *
  * Each periph is locked while it is examined and synced.  A failed sync
  * is reported with xpt_print() and otherwise ignored.
  */
 static void
 dashutdown(void * arg, int howto)
 {
 	struct cam_periph *periph;
 	struct da_softc *sc;
 	int error;
 
 	TAILQ_FOREACH(periph, &dadriver.units, unit_links) {
 		union ccb ccb;
 
 		cam_periph_lock(periph);
 		sc = (struct da_softc *)periph->softc;
 
 		/* Only sync drives that are open and able to do it. */
 		if ((sc->flags & DA_FLAG_OPEN) != 0 &&
 		    (sc->quirks & DA_Q_NO_SYNC_CACHE) == 0) {
 			xpt_setup_ccb(&ccb.ccb_h, periph->path,
 			    CAM_PRIORITY_NORMAL);
 
 			ccb.ccb_h.ccb_state = DA_CCB_DUMP;
 			scsi_synchronize_cache(&ccb.csio,
 			    /*retries*/0,
 			    /*cbfcnp*/dadone,
 			    MSG_SIMPLE_Q_TAG,
 			    /*begin_lba*/0, /* whole disk */
 			    /*lb_count*/0,
 			    SSD_FULL_SIZE,
 			    60 * 60 * 1000);
 
 			xpt_polled_action(&ccb);
 
 			error = cam_periph_error(&ccb,
 			    0, SF_NO_RECOVERY | SF_NO_RETRY | SF_QUIET_IR,
 			    NULL);
 			/* Do not leave the device queue frozen behind us. */
 			if ((ccb.ccb_h.status & CAM_DEV_QFRZN) != 0)
 				cam_release_devq(ccb.ccb_h.path,
 				    /*relsim_flags*/0, /*reduction*/0,
 				    /*timeout*/0, /*getcount_only*/0);
 			if (error != 0)
 				xpt_print(periph->path,
 				    "Synchronize cache failed\n");
 		}
 		cam_periph_unlock(periph);
 	}
 }
 
 #else /* !_KERNEL */
 
 /*
  * XXX This is only left out of the kernel build to silence warnings.  If,
  * for some reason this function is used in the kernel, the ifdefs should
  * be moved so it is included both in the kernel and userland.
  */
 /*
  * Fill in csio as a FORMAT UNIT command.
  *
  * byte2 and ileave are stored in the CDB's byte2 and interleave fields.
  * The data direction is CAM_DIR_OUT when dxfer_len is non-zero and
  * CAM_DIR_NONE otherwise; the remaining arguments are passed straight to
  * cam_fill_csio().
  */
 void
 scsi_format_unit(struct ccb_scsiio *csio, u_int32_t retries,
 		 void (*cbfcnp)(struct cam_periph *, union ccb *),
 		 u_int8_t tag_action, u_int8_t byte2, u_int16_t ileave,
 		 u_int8_t *data_ptr, u_int32_t dxfer_len, u_int8_t sense_len,
 		 u_int32_t timeout)
 {
 	struct scsi_format_unit *cdb;
 
 	/* Build the CDB in place inside the CCB. */
 	cdb = (struct scsi_format_unit *)&csio->cdb_io.cdb_bytes;
 	scsi_ulto2b(ileave, cdb->interleave);
 	cdb->byte2 = byte2;
 	cdb->opcode = FORMAT_UNIT;
 
 	cam_fill_csio(csio, retries, cbfcnp,
 	    /*flags*/ (dxfer_len > 0) ? CAM_DIR_OUT : CAM_DIR_NONE,
 	    tag_action, data_ptr, dxfer_len, sense_len,
 	    sizeof(*cdb), timeout);
 }
 
 #endif /* _KERNEL */
Index: user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
===================================================================
--- user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	(revision 247191)
+++ user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	(revision 247192)
@@ -1,5156 +1,5160 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2011 by Delphix. All rights reserved.
  */
 
 /*
  * DVA-based Adjustable Replacement Cache
  *
  * While much of the theory of operation used here is
  * based on the self-tuning, low overhead replacement cache
  * presented by Megiddo and Modha at FAST 2003, there are some
  * significant differences:
  *
  * 1. The Megiddo and Modha model assumes any page is evictable.
  * Pages in its cache cannot be "locked" into memory.  This makes
  * the eviction algorithm simple: evict the last page in the list.
  * This also makes the performance characteristics easy to reason
  * about.  Our cache is not so simple.  At any given moment, some
  * subset of the blocks in the cache are un-evictable because we
  * have handed out a reference to them.  Blocks are only evictable
  * when there are no external references active.  This makes
  * eviction far more problematic:  we choose to evict the evictable
  * blocks that are the "lowest" in the list.
  *
  * There are times when it is not possible to evict the requested
  * space.  In these circumstances we are unable to adjust the cache
  * size.  To prevent the cache growing unbounded at these times we
  * implement a "cache throttle" that slows the flow of new data
  * into the cache until we can make space available.
  *
  * 2. The Megiddo and Modha model assumes a fixed cache size.
  * Pages are evicted when the cache is full and there is a cache
  * miss.  Our model has a variable sized cache.  It grows with
  * high use, but also tries to react to memory pressure from the
  * operating system: decreasing its size when system memory is
  * tight.
  *
  * 3. The Megiddo and Modha model assumes a fixed page size. All
  * elements of the cache are therefore exactly the same size.  So
  * when adjusting the cache size following a cache miss, it is simply
  * a matter of choosing a single page to evict.  In our model, we
  * have variable sized cache blocks (ranging from 512 bytes to
  * 128K bytes).  We therefore choose a set of blocks to evict to make
  * space for a cache miss that approximates as closely as possible
  * the space used by the new block.
  *
  * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
  * by N. Megiddo & D. Modha, FAST 2003
  */
 
 /*
  * The locking model:
  *
  * A new reference to a cache buffer can be obtained in two
  * ways: 1) via a hash table lookup using the DVA as a key,
  * or 2) via one of the ARC lists.  The arc_read() interface
  * uses method 1, while the internal arc algorithms for
  * adjusting the cache use method 2.  We therefore provide two
  * types of locks: 1) the hash table lock array, and 2) the
  * arc list locks.
  *
  * Buffers do not have their own mutexes, rather they rely on the
  * hash table mutexes for the bulk of their protection (i.e. most
  * fields in the arc_buf_hdr_t are protected by these mutexes).
  *
  * buf_hash_find() returns the appropriate mutex (held) when it
  * locates the requested buffer in the hash table.  It returns
  * NULL for the mutex if the buffer was not in the table.
  *
  * buf_hash_remove() expects the appropriate hash mutex to be
  * already held before it is invoked.
  *
  * Each arc state also has a mutex which is used to protect the
  * buffer list associated with the state.  When attempting to
  * obtain a hash table lock while holding an arc list lock you
  * must use: mutex_tryenter() to avoid deadlock.  Also note that
  * the active state mutex must be held before the ghost state mutex.
  *
  * Arc buffers may have an associated eviction callback function.
  * This function will be invoked prior to removing the buffer (e.g.
  * in arc_do_user_evicts()).  Note however that the data associated
  * with the buffer may be evicted prior to the callback.  The callback
  * must be made with *no locks held* (to prevent deadlock).  Additionally,
  * the users of callbacks must ensure that their private data is
  * protected from simultaneous callbacks from arc_buf_evict()
  * and arc_do_user_evicts().
  *
  * Note that the majority of the performance stats are manipulated
  * with atomic operations.
  *
  * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
  *
  *	- L2ARC buflist creation
  *	- L2ARC buflist eviction
  *	- L2ARC write completion, which walks L2ARC buflists
  *	- ARC header destruction, as it removes from L2ARC buflists
  *	- ARC header release, as it removes from L2ARC buflists
  */
 
 #include <sys/spa.h>
 #include <sys/zio.h>
 #include <sys/zfs_context.h>
 #include <sys/arc.h>
 #include <sys/refcount.h>
 #include <sys/vdev.h>
 #include <sys/vdev_impl.h>
 #ifdef _KERNEL
 #include <sys/dnlc.h>
 #endif
 #include <sys/callb.h>
 #include <sys/kstat.h>
 #include <zfs_fletcher.h>
 #include <sys/sdt.h>
 
 #include <vm/vm_pageout.h>
 
 /*
  * The two variables below are only compiled when `illumos' is defined
  * and `_KERNEL' is not.
  */
 #ifdef illumos
 #ifndef _KERNEL
 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
 boolean_t arc_watch = B_FALSE;
 /* NOTE(review): presumably a descriptor used with arc_watch -- confirm. */
 int arc_procfd;
 #endif
 #endif /* illumos */
 
 /*
  * Synchronization objects for the reclaim thread.
  * NOTE(review): arc_thread_exit is presumably the request/acknowledge
  * flag for stopping that thread -- its users are not visible here.
  */
 static kmutex_t		arc_reclaim_thr_lock;
 static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
 static uint8_t		arc_thread_exit;
 
 /* Write limit variables, defined outside this file. */
 extern int zfs_write_limit_shift;
 extern uint64_t zfs_write_limit_max;
 extern kmutex_t zfs_write_limit_lock;
 
 /* arc_reduce_dnlc_percent starts out at ARC_REDUCE_DNLC_PERCENT. */
 #define	ARC_REDUCE_DNLC_PERCENT	3
 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
 
 /* The two reclaim strategies. */
 typedef enum arc_reclaim_strategy {
 	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
 	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
 } arc_reclaim_strategy_t;
 
 /* number of seconds before growing cache again */
 static int		arc_grow_retry = 60;
 
 /* shift of arc_c for calculating both min and max arc_p */
 static int		arc_p_min_shift = 4;
 
 /* log2(fraction of arc to reclaim) */
 static int		arc_shrink_shift = 5;
 
 /*
  * minimum lifespan of a prefetch block in clock ticks
  * (initialized in arc_init())
  */
 static int		arc_min_prefetch_lifespan;
 
 /* NOTE(review): arc_dead's meaning is not visible here -- see its users. */
 static int arc_dead;
 /* Defined outside this file. */
 extern int zfs_prefetch_disable;
 
 /*
  * The arc has filled available memory and has now warmed up.
  */
 static boolean_t arc_warm;
 
 /*
  * These tunables are for performance analysis.
  *
  * NOTE(review): zfs_arc_grow_retry, zfs_arc_shrink_shift and
  * zfs_arc_p_min_shift start at 0; presumably 0 means "keep the static
  * arc_* default above" -- confirm in arc_init().
  */
 uint64_t zfs_arc_max;
 uint64_t zfs_arc_min;
 uint64_t zfs_arc_meta_limit = 0;
 int zfs_arc_grow_retry = 0;
 int zfs_arc_shrink_shift = 0;
 int zfs_arc_p_min_shift = 0;
 int zfs_disable_dup_eviction = 0;
 
 /*
  * Loader tunables for the three 64-bit limits.  Only arc_max and arc_min
  * also get a sysctl node here; both are declared CTLFLAG_RDTUN.
  */
 TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max);
 TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min);
 TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
 SYSCTL_DECL(_vfs_zfs);
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0,
     "Maximum ARC size");
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
     "Minimum ARC size");
 
 /*
  * Note that buffers can be in one of 6 states:
  *	ARC_anon	- anonymous (discussed below)
  *	ARC_mru		- recently used, currently cached
  *	ARC_mru_ghost	- recently used, no longer in cache
  *	ARC_mfu		- frequently used, currently cached
  *	ARC_mfu_ghost	- frequently used, no longer in cache
  *	ARC_l2c_only	- exists in L2ARC but not other states
  * When there are no active references to the buffer, they
  * are linked onto a list in one of these arc states.  These are
  * the only buffers that can be evicted or deleted.  Within each
  * state there are multiple lists, one for meta-data and one for
  * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
  * etc.) is tracked separately so that it can be managed more
  * explicitly: favored over data, limited explicitly.
  *
  * Anonymous buffers are buffers that are not associated with
  * a DVA.  These are buffers that hold dirty block copies
  * before they are written to stable storage.  By definition,
  * they are "ref'd" and are considered part of arc_mru
  * that cannot be freed.  Generally, they will acquire a DVA
  * as they are written and migrate onto the arc_mru list.
  *
  * The ARC_l2c_only state is for buffers that are in the second
  * level ARC but no longer in any of the ARC_m* lists.  The second
  * level ARC itself may also contain buffers that are in any of
  * the ARC_m* states - meaning that a buffer can exist in two
  * places.  The reason for the ARC_l2c_only state is to keep the
  * buffer header in the hash table, so that reads that hit the
  * second level ARC benefit from these fast lookups.
  */
 
 /*
  * A list mutex padded out to ARCS_LOCK_PAD (CACHE_LINE_SIZE) bytes.  The
  * padding is only present in kernel builds; it is sized as the pad
  * length minus sizeof (kmutex_t).
  */
 #define	ARCS_LOCK_PAD		CACHE_LINE_SIZE
 struct arcs_lock {
 	kmutex_t	arcs_lock;
 #ifdef _KERNEL
 	unsigned char	pad[(ARCS_LOCK_PAD - sizeof (kmutex_t))];
 #endif
 };
 
 /*
  * Number of data lists and metadata lists kept per arc state;
  * ARC_BUFC_NUMLISTS is their sum.
  *
  * must be power of two for mask use to work
  *
  */
 #define ARC_BUFC_NUMDATALISTS		16
 #define ARC_BUFC_NUMMETADATALISTS	16
 #define ARC_BUFC_NUMLISTS	(ARC_BUFC_NUMMETADATALISTS + ARC_BUFC_NUMDATALISTS)
 
 /*
  * One ARC state.  It holds ARC_BUFC_NUMLISTS lists of evictable buffers
  * and the same number of locks in a parallel, cache-line aligned array,
  * plus size accounting.
  */
 typedef struct arc_state {
 	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];	/* amount of evictable data */
 	uint64_t arcs_size;	/* total amount of data in this state */
 	list_t	arcs_lists[ARC_BUFC_NUMLISTS]; /* list of evictable buffers */
 	struct arcs_lock arcs_locks[ARC_BUFC_NUMLISTS] __aligned(CACHE_LINE_SIZE);
 } arc_state_t;
 
 /* Address of the kmutex_t stored at index i of state s's arcs_locks. */
 #define ARCS_LOCK(s, i)	(&((s)->arcs_locks[(i)].arcs_lock))
 
 /* The 6 states: one statically allocated arc_state_t object per state. */
 static arc_state_t ARC_anon;
 static arc_state_t ARC_mru;
 static arc_state_t ARC_mru_ghost;
 static arc_state_t ARC_mfu;
 static arc_state_t ARC_mfu_ghost;
 static arc_state_t ARC_l2c_only;
 
 /*
  * The ARC statistics, one kstat_named_t per counter.
  *
  * The arc_stats initializer that follows this type is positional, so a
  * field added, removed or moved here needs the same change there.
  */
 typedef struct arc_stats {
 	kstat_named_t arcstat_hits;
 	kstat_named_t arcstat_misses;
 	kstat_named_t arcstat_demand_data_hits;
 	kstat_named_t arcstat_demand_data_misses;
 	kstat_named_t arcstat_demand_metadata_hits;
 	kstat_named_t arcstat_demand_metadata_misses;
 	kstat_named_t arcstat_prefetch_data_hits;
 	kstat_named_t arcstat_prefetch_data_misses;
 	kstat_named_t arcstat_prefetch_metadata_hits;
 	kstat_named_t arcstat_prefetch_metadata_misses;
 	kstat_named_t arcstat_mru_hits;
 	kstat_named_t arcstat_mru_ghost_hits;
 	kstat_named_t arcstat_mfu_hits;
 	kstat_named_t arcstat_mfu_ghost_hits;
 	kstat_named_t arcstat_allocated;
 	kstat_named_t arcstat_deleted;
 	kstat_named_t arcstat_stolen;
 	kstat_named_t arcstat_recycle_miss;
 	kstat_named_t arcstat_mutex_miss;
 	kstat_named_t arcstat_evict_skip;
 	kstat_named_t arcstat_evict_l2_cached;
 	kstat_named_t arcstat_evict_l2_eligible;
 	kstat_named_t arcstat_evict_l2_ineligible;
 	kstat_named_t arcstat_hash_elements;
 	kstat_named_t arcstat_hash_elements_max;
 	kstat_named_t arcstat_hash_collisions;
 	kstat_named_t arcstat_hash_chains;
 	kstat_named_t arcstat_hash_chain_max;
 	kstat_named_t arcstat_p;
 	kstat_named_t arcstat_c;
 	kstat_named_t arcstat_c_min;
 	kstat_named_t arcstat_c_max;
 	kstat_named_t arcstat_size;
 	kstat_named_t arcstat_hdr_size;
 	kstat_named_t arcstat_data_size;
 	kstat_named_t arcstat_other_size;
 	kstat_named_t arcstat_l2_hits;
 	kstat_named_t arcstat_l2_misses;
 	kstat_named_t arcstat_l2_feeds;
 	kstat_named_t arcstat_l2_rw_clash;
 	kstat_named_t arcstat_l2_read_bytes;
 	kstat_named_t arcstat_l2_write_bytes;
 	kstat_named_t arcstat_l2_writes_sent;
 	kstat_named_t arcstat_l2_writes_done;
 	kstat_named_t arcstat_l2_writes_error;
 	kstat_named_t arcstat_l2_writes_hdr_miss;
 	kstat_named_t arcstat_l2_evict_lock_retry;
 	kstat_named_t arcstat_l2_evict_reading;
 	kstat_named_t arcstat_l2_free_on_write;
 	kstat_named_t arcstat_l2_abort_lowmem;
 	kstat_named_t arcstat_l2_cksum_bad;
 	kstat_named_t arcstat_l2_io_error;
 	kstat_named_t arcstat_l2_size;
 	kstat_named_t arcstat_l2_hdr_size;
 	kstat_named_t arcstat_l2_write_trylock_fail;
 	kstat_named_t arcstat_l2_write_passed_headroom;
 	kstat_named_t arcstat_l2_write_spa_mismatch;
 	kstat_named_t arcstat_l2_write_in_l2;
 	kstat_named_t arcstat_l2_write_hdr_io_in_progress;
 	kstat_named_t arcstat_l2_write_not_cacheable;
 	kstat_named_t arcstat_l2_write_full;
 	kstat_named_t arcstat_l2_write_buffer_iter;
 	kstat_named_t arcstat_l2_write_pios;
 	kstat_named_t arcstat_l2_write_buffer_bytes_scanned;
 	kstat_named_t arcstat_l2_write_buffer_list_iter;
 	kstat_named_t arcstat_l2_write_buffer_list_null_iter;
 	kstat_named_t arcstat_memory_throttle_count;
 	kstat_named_t arcstat_duplicate_buffers;
 	kstat_named_t arcstat_duplicate_buffers_size;
 	kstat_named_t arcstat_duplicate_reads;
 } arc_stats_t;
 
 /*
  * The single statistics instance.  Each entry supplies a counter's
  * exported name and its data type (always KSTAT_DATA_UINT64).
  *
  * The entries are positional and therefore follow the field order of
  * arc_stats_t.  The exported name is the field name without the
  * "arcstat_" prefix, with one exception: the field
  * arcstat_l2_write_hdr_io_in_progress is exported as
  * "l2_write_io_in_progress".
  */
 static arc_stats_t arc_stats = {
 	{ "hits",			KSTAT_DATA_UINT64 },
 	{ "misses",			KSTAT_DATA_UINT64 },
 	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
 	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
 	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
 	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
 	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
 	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
 	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
 	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
 	{ "mru_hits",			KSTAT_DATA_UINT64 },
 	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
 	{ "mfu_hits",			KSTAT_DATA_UINT64 },
 	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
 	{ "allocated",			KSTAT_DATA_UINT64 },
 	{ "deleted",			KSTAT_DATA_UINT64 },
 	{ "stolen",			KSTAT_DATA_UINT64 },
 	{ "recycle_miss",		KSTAT_DATA_UINT64 },
 	{ "mutex_miss",			KSTAT_DATA_UINT64 },
 	{ "evict_skip",			KSTAT_DATA_UINT64 },
 	{ "evict_l2_cached",		KSTAT_DATA_UINT64 },
 	{ "evict_l2_eligible",		KSTAT_DATA_UINT64 },
 	{ "evict_l2_ineligible",	KSTAT_DATA_UINT64 },
 	{ "hash_elements",		KSTAT_DATA_UINT64 },
 	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
 	{ "hash_collisions",		KSTAT_DATA_UINT64 },
 	{ "hash_chains",		KSTAT_DATA_UINT64 },
 	{ "hash_chain_max",		KSTAT_DATA_UINT64 },
 	{ "p",				KSTAT_DATA_UINT64 },
 	{ "c",				KSTAT_DATA_UINT64 },
 	{ "c_min",			KSTAT_DATA_UINT64 },
 	{ "c_max",			KSTAT_DATA_UINT64 },
 	{ "size",			KSTAT_DATA_UINT64 },
 	{ "hdr_size",			KSTAT_DATA_UINT64 },
 	{ "data_size",			KSTAT_DATA_UINT64 },
 	{ "other_size",			KSTAT_DATA_UINT64 },
 	{ "l2_hits",			KSTAT_DATA_UINT64 },
 	{ "l2_misses",			KSTAT_DATA_UINT64 },
 	{ "l2_feeds",			KSTAT_DATA_UINT64 },
 	{ "l2_rw_clash",		KSTAT_DATA_UINT64 },
 	{ "l2_read_bytes",		KSTAT_DATA_UINT64 },
 	{ "l2_write_bytes",		KSTAT_DATA_UINT64 },
 	{ "l2_writes_sent",		KSTAT_DATA_UINT64 },
 	{ "l2_writes_done",		KSTAT_DATA_UINT64 },
 	{ "l2_writes_error",		KSTAT_DATA_UINT64 },
 	{ "l2_writes_hdr_miss",		KSTAT_DATA_UINT64 },
 	{ "l2_evict_lock_retry",	KSTAT_DATA_UINT64 },
 	{ "l2_evict_reading",		KSTAT_DATA_UINT64 },
 	{ "l2_free_on_write",		KSTAT_DATA_UINT64 },
 	{ "l2_abort_lowmem",		KSTAT_DATA_UINT64 },
 	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
 	{ "l2_io_error",		KSTAT_DATA_UINT64 },
 	{ "l2_size",			KSTAT_DATA_UINT64 },
 	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
 	{ "l2_write_trylock_fail",	KSTAT_DATA_UINT64 },
 	{ "l2_write_passed_headroom",	KSTAT_DATA_UINT64 },
 	{ "l2_write_spa_mismatch",	KSTAT_DATA_UINT64 },
 	{ "l2_write_in_l2",		KSTAT_DATA_UINT64 },
 	{ "l2_write_io_in_progress",	KSTAT_DATA_UINT64 },
 	{ "l2_write_not_cacheable",	KSTAT_DATA_UINT64 },
 	{ "l2_write_full",		KSTAT_DATA_UINT64 },
 	{ "l2_write_buffer_iter",	KSTAT_DATA_UINT64 },
 	{ "l2_write_pios",		KSTAT_DATA_UINT64 },
 	{ "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 },
 	{ "l2_write_buffer_list_iter",	KSTAT_DATA_UINT64 },
 	{ "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 },
 	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
 	{ "duplicate_buffers",		KSTAT_DATA_UINT64 },
 	{ "duplicate_buffers_size",	KSTAT_DATA_UINT64 },
 	{ "duplicate_reads",		KSTAT_DATA_UINT64 }
 };
 
 /*
  * Accessors for the statistics kept in arc_stats.
  *
  * ARCSTAT() yields the 64-bit value of a statistic as an lvalue.
  * ARCSTAT_INCR() atomically adds 'val' to it (pass a negative value to
  * decrement), and ARCSTAT_BUMP()/ARCSTAT_BUMPDOWN() adjust it by one.
  *
  * The update macros expand to a single expression with no trailing
  * semicolon, so that "ARCSTAT_BUMP(x);" is exactly one statement and may be
  * used as the body of an unbraced if/else.
  */
 #define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)
 
 #define	ARCSTAT_INCR(stat, val) \
 	atomic_add_64(&arc_stats.stat.value.ui64, (val))
 
 #define	ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)
 #define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)
 
 /*
  * Raise a statistic to 'val' if 'val' is larger than its current value.
  * The compare-and-swap is retried for as long as 'val' still exceeds the
  * value just read and the swap reports a different value than the one we
  * read, i.e. another updater got in first.
  *
  * 'val' is evaluated more than once, so it must not have side effects.
  * The body is wrapped in do/while (0) so that an invocation followed by ';'
  * forms exactly one statement.
  */
 #define	ARCSTAT_MAX(stat, val) do {					\
 	uint64_t m;							\
 	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
 	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
 		continue;						\
 } while (0)
 
 /* Fold the current value of <stat> into the companion <stat>_max. */
 #define	ARCSTAT_MAXSTAT(stat) \
 	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
 
 /*
  * We define a macro to allow ARC hits/misses to be easily broken down by
  * two separate conditions, giving a total of four different subtypes for
  * each of hits and misses (so eight statistics total).
  */
 #define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
 	if (cond1) {							\
 		if (cond2) {						\
 			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
 		} else {						\
 			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
 		}							\
 	} else {							\
 		if (cond2) {						\
 			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
 		} else {						\
 			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
 		}							\
 	}
 
 /*
  * arc_ksp is the kstat handle for the ARC (presumably the one through which
  * arc_stats is exported; it is set up outside this section).
  *
  * The remaining pointers name the ARC states a header can be in.  Of these,
  * arc_mru_ghost, arc_mfu_ghost and arc_l2c_only are the ones GHOST_STATE()
  * treats as ghost states.
  */
 kstat_t			*arc_ksp;
 static arc_state_t	*arc_anon;
 static arc_state_t	*arc_mru;
 static arc_state_t	*arc_mru_ghost;
 static arc_state_t	*arc_mfu;
 static arc_state_t	*arc_mfu_ghost;
 static arc_state_t	*arc_l2c_only;
 
 /*
  * There are several ARC variables that are critical to export as kstats --
  * but we don't want to have to grovel around in the kstat whenever we wish to
  * manipulate them.  For these variables, we therefore define them to be in
  * terms of the statistic variable.  This assures that we are not introducing
  * the possibility of inconsistency by having shadow copies of the variables,
  * while still allowing the code to be readable.
  *
  * Each name expands, via ARCSTAT(), to the 64-bit value field of the
  * corresponding statistic, so it is an lvalue: it can be read, assigned, and
  * have its address taken (e.g. atomic_add_64(&arc_size, ...)).
  */
 #define	arc_size	ARCSTAT(arcstat_size)	/* actual total arc size */
 #define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
 #define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
 #define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
 #define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
 
 /*
  * Miscellaneous ARC accounting.
  *
  * arc_loaned_bytes is adjusted by arc_loan_buf(), arc_loan_inuse_buf() and
  * arc_return_buf().  arc_meta_used is adjusted by arc_space_consume() and
  * arc_space_return(); arc_meta_max records the largest arc_meta_used value
  * observed by arc_space_return().  arc_tempreserve is maintained outside
  * this section (NOTE(review): presumably by arc_tempreserve_space() --
  * confirm).
  *
  * arc_meta_used is exported read-only and arc_meta_limit read-write through
  * the sysctl nodes declared below.
  */
 static int		arc_no_grow;	/* Don't try to grow cache size */
 static uint64_t		arc_tempreserve;
 static uint64_t		arc_loaned_bytes;
 static uint64_t		arc_meta_used;
 static uint64_t		arc_meta_limit;
 static uint64_t		arc_meta_max = 0;
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_meta_used, CTLFLAG_RD, &arc_meta_used, 0,
     "ARC metadata used");
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_meta_limit, CTLFLAG_RW, &arc_meta_limit, 0,
     "ARC metadata limit");
 
 /* Forward declaration; the structure itself is defined with the L2ARC code. */
 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
 
 typedef struct arc_callback arc_callback_t;
 
 /*
  * One pending completion callback.  Entries are chained through acb_next;
  * struct arc_buf_hdr keeps the head of such a chain in b_acb.
  * NOTE(review): field roles below are inferred from their names and types --
  * confirm against the read path.
  */
 struct arc_callback {
 	void			*acb_private;	/* caller's argument (assumed) */
 	arc_done_func_t		*acb_done;	/* completion function */
 	arc_buf_t		*acb_buf;	/* buffer for this caller */
 	zio_t			*acb_zio_dummy;
 	arc_callback_t		*acb_next;	/* next callback in the chain */
 };
 
 typedef struct arc_write_callback arc_write_callback_t;
 
 /*
  * Callback state for a write: a private argument, "ready" and "done"
  * functions, and the buffer involved.
  * NOTE(review): inferred from names -- confirm against the write path.
  */
 struct arc_write_callback {
 	void		*awcb_private;
 	arc_done_func_t	*awcb_ready;
 	arc_done_func_t	*awcb_done;
 	arc_buf_t	*awcb_buf;
 };
 
 /*
  * Header describing one block known to the ARC.  A header is identified by
  * (b_spa, b_dva, b_birth): that triple is what buf_hash() hashes and what
  * BUF_EQUAL() compares.  The section comments below state which lock or
  * rule protects each group of fields.
  */
 struct arc_buf_hdr {
 	/* protected by hash lock */
 	dva_t			b_dva;
 	uint64_t		b_birth;
 	uint64_t		b_cksum0;
 
 	/* b_freeze_lock guards b_freeze_cksum and b_thawed */
 	kmutex_t		b_freeze_lock;
 	zio_cksum_t		*b_freeze_cksum;
 	void			*b_thawed;
 
 	arc_buf_hdr_t		*b_hash_next;	/* next header in hash chain */
 	arc_buf_t		*b_buf;		/* head of the buffer chain */
 	uint32_t		b_flags;	/* ARC_* flag bits */
 	uint32_t		b_datacnt;	/* number of buffers on b_buf */
 
 	arc_callback_t		*b_acb;
 	kcondvar_t		b_cv;
 
 	/* immutable */
 	arc_buf_contents_t	b_type;
 	uint64_t		b_size;
 	uint64_t		b_spa;
 
 	/* protected by arc state mutex */
 	arc_state_t		*b_state;
 	list_node_t		b_arc_node;
 
 	/* updated atomically */
 	clock_t			b_arc_access;
 
 	/* self protecting */
 	refcount_t		b_refcnt;
 
 	l2arc_buf_hdr_t		*b_l2hdr;	/* non-NULL when on an L2ARC dev */
 	list_node_t		b_l2node;
 };
 
 /*
  * Eviction bookkeeping (used outside this section) and forward declarations
  * for functions that are referenced before they are defined.
  */
 static arc_buf_t *arc_eviction_list;
 static kmutex_t arc_eviction_mtx;
 static arc_buf_hdr_t arc_eviction_hdr;
 static void arc_get_data_buf(arc_buf_t *buf);
 static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
 static int arc_evict_needed(arc_buf_contents_t type);
 static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
 #ifdef illumos
 static void arc_buf_watch(arc_buf_t *buf);
 #endif /* illumos */
 
 static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
 
 /*
  * True when 'state' is one of the ghost states: the MRU ghost list, the MFU
  * ghost list, or the L2ARC-only state.
  */
 #define	GHOST_STATE(state)	\
 	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
 	(state) == arc_l2c_only)
 
 /*
  * Private ARC flags.  These flags are private ARC only flags that will show up
  * in b_flags in the arc_buf_hdr_t.  Some flags are publicly declared, and can
  * be passed in as arc_flags in things like arc_read.  However, these flags
  * should never be passed and should only be set by ARC code.  When adding new
  * public flags, make sure not to smash the private ones.
  */
 
 #define	ARC_IN_HASH_TABLE	(1 << 9)	/* this buffer is hashed */
 #define	ARC_IO_IN_PROGRESS	(1 << 10)	/* I/O in progress for buf */
 #define	ARC_IO_ERROR		(1 << 11)	/* I/O failed for buf */
 #define	ARC_FREED_IN_READ	(1 << 12)	/* buf freed while in read */
 #define	ARC_BUF_AVAILABLE	(1 << 13)	/* block not in active use */
 #define	ARC_INDIRECT		(1 << 14)	/* this is an indirect block */
 #define	ARC_FREE_IN_PROGRESS	(1 << 15)	/* hdr about to be freed */
 #define	ARC_L2_WRITING		(1 << 16)	/* L2ARC write in progress */
 #define	ARC_L2_EVICTED		(1 << 17)	/* evicted during I/O */
 #define	ARC_L2_WRITE_HEAD	(1 << 18)	/* head of write list */
 
 /*
  * Flag tests on a header.  Each yields the masked flag bits (non-zero when
  * the flag is set) rather than a normalized 0/1 value.  HDR_L2_READING() is
  * the exception: it is true only when I/O is in progress and the header has
  * an L2ARC header attached.
  */
 #define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_IN_HASH_TABLE)
 #define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS)
 #define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_IO_ERROR)
 #define	HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_PREFETCH)
 #define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FREED_IN_READ)
 #define	HDR_BUF_AVAILABLE(hdr)	((hdr)->b_flags & ARC_BUF_AVAILABLE)
 #define	HDR_FREE_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
 #define	HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_L2CACHE)
 #define	HDR_L2_READING(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS &&	\
 				    (hdr)->b_l2hdr != NULL)
 #define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_L2_WRITING)
 #define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_L2_EVICTED)
 #define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_L2_WRITE_HEAD)
 
 /*
  * Other sizes
  *
  * Sizes of the ARC and L2ARC header structures, cast to a signed 64-bit
  * type so they can be negated directly when used as accounting deltas.
  */
 
 #define	HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
 #define	L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
 
 /*
  * Hash table routines
  */
 
 #define	HT_LOCK_PAD	CACHE_LINE_SIZE
 
 /*
  * One hash-chain lock.  In kernel builds the mutex is padded out to
  * HT_LOCK_PAD (CACHE_LINE_SIZE) bytes, presumably so that neighbouring locks
  * do not share a cache line.
  */
 struct ht_lock {
 	kmutex_t	ht_lock;
 #ifdef _KERNEL
 	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
 #endif
 };
 
 /*
  * The header hash table: ht_table has ht_mask + 1 chain heads (sized in
  * buf_init()), protected by a fixed array of BUF_LOCKS locks that chains
  * map onto by index (see BUF_HASH_LOCK_NTRY()).
  */
 #define	BUF_LOCKS 256
 typedef struct buf_hash_table {
 	uint64_t ht_mask;
 	arc_buf_hdr_t **ht_table;
 	struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE);
 } buf_hash_table_t;
 
 static buf_hash_table_t buf_hash_table;
 
 /*
  * Hash-table addressing helpers.
  *
  * BUF_HASH_INDEX() maps a block identity to its chain index.
  * BUF_HASH_LOCK_NTRY() selects the lock entry for a chain index by masking
  * it with BUF_LOCKS - 1, BUF_HASH_LOCK() yields a pointer to that entry's
  * mutex, and HDR_LOCK() is the mutex covering the chain a header hashes to.
  *
  * Macro arguments are parenthesized so that compound expressions (for
  * example "a | b" as an index, or "buf->b_hdr" as a header) expand with the
  * intended precedence.
  */
 #define	BUF_HASH_INDEX(spa, dva, birth) \
 	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
 #define	BUF_HASH_LOCK_NTRY(idx) \
 	(buf_hash_table.ht_locks[(idx) & (BUF_LOCKS-1)])
 #define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
 #define	HDR_LOCK(hdr) \
 	(BUF_HASH_LOCK(BUF_HASH_INDEX((hdr)->b_spa, &(hdr)->b_dva, \
 	(hdr)->b_birth)))
 
 /* CRC-64 lookup table; filled in by buf_init() and consumed by buf_hash(). */
 uint64_t zfs_crc64_table[256];
 
 /*
  * Level 2 ARC
  *
  * Compile-time defaults for the L2ARC tunables that are defined further
  * down (l2arc_write_max, l2arc_headroom, ...).
  */
 
 #define	L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
 #define	L2ARC_HEADROOM		2		/* num of writes */
 #define	L2ARC_FEED_SECS		1		/* caching interval secs */
 #define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */
 
 /* Shorthand for the L2ARC write counters kept in arc_stats. */
 #define	l2arc_writes_sent	ARCSTAT(arcstat_l2_writes_sent)
 #define	l2arc_writes_done	ARCSTAT(arcstat_l2_writes_done)
 
 /*
  * L2ARC Performance Tunables
  *
  * Each variable is initialized from its L2ARC_* default (or B_TRUE) and is
  * registered below as a read-write sysctl under the _vfs_zfs node.  The
  * boolean_t tunables are registered with SYSCTL_INT.
  */
 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* default max write size */
 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra write during warmup */
 uint64_t l2arc_headroom = L2ARC_HEADROOM;	/* number of dev writes */
 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval milliseconds */
 boolean_t l2arc_noprefetch = B_TRUE;		/* don't cache prefetch bufs */
 boolean_t l2arc_feed_again = B_TRUE;		/* turbo warmup */
 boolean_t l2arc_norw = B_TRUE;			/* no reads during writes */
 
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,
     &l2arc_write_max, 0, "max write size");
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,
     &l2arc_write_boost, 0, "extra write during warmup");
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,
     &l2arc_headroom, 0, "number of dev writes");
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,
     &l2arc_feed_secs, 0, "interval seconds");
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW,
     &l2arc_feed_min_ms, 0, "min interval milliseconds");
 
 SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,
     &l2arc_noprefetch, 0, "don't cache prefetch bufs");
 SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW,
     &l2arc_feed_again, 0, "turbo warmup");
 SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW,
     &l2arc_norw, 0, "no reads during writes");
 
 /*
  * Read-only sysctls exposing the size of each ARC state: arcs_size for the
  * whole state and, where present, the arcs_lsize[] entries for the metadata
  * and data portions.  The description strings name the state and portion
  * each node actually reports.
  */
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
     &ARC_anon.arcs_size, 0, "size of anonymous state");
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD,
     &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0,
     "size of metadata in anonymous state");
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD,
     &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0,
     "size of data in anonymous state");
 
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
     &ARC_mru.arcs_size, 0, "size of mru state");
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD,
     &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state");
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD,
     &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state");
 
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
     &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state");
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD,
     &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
     "size of metadata in mru ghost state");
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD,
     &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
     "size of data in mru ghost state");
 
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
     &ARC_mfu.arcs_size, 0, "size of mfu state");
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD,
     &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state");
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD,
     &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state");
 
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
     &ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state");
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD,
     &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
     "size of metadata in mfu ghost state");
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD,
     &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
     "size of data in mfu ghost state");
 
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
     &ARC_l2c_only.arcs_size, 0, "size of l2c_only state");
 
 /*
  * L2ARC Internals
  */
 
 /*
  * Per-device L2ARC state.  Devices are linked together through l2ad_node
  * (see l2arc_dev_list) and each device points at its own buffer list.
  */
 typedef struct l2arc_dev {
 	vdev_t			*l2ad_vdev;	/* vdev */
 	spa_t			*l2ad_spa;	/* spa */
 	uint64_t		l2ad_hand;	/* next write location */
 	uint64_t		l2ad_write;	/* desired write size, bytes */
 	uint64_t		l2ad_boost;	/* warmup write boost, bytes */
 	uint64_t		l2ad_start;	/* first addr on device */
 	uint64_t		l2ad_end;	/* last addr on device */
 	uint64_t		l2ad_evict;	/* last addr eviction reached */
 	boolean_t		l2ad_first;	/* first sweep through */
 	boolean_t		l2ad_writing;	/* currently writing */
 	list_t			*l2ad_buflist;	/* buffer list */
 	list_node_t		l2ad_node;	/* device list node */
 } l2arc_dev_t;
 
 /*
  * Global L2ARC state.  The upper-case list_t objects are the storage and the
  * lower-case pointers are how the rest of the code refers to them
  * (NOTE(review): the pointers are assigned outside this section -- confirm).
  */
 static list_t L2ARC_dev_list;			/* device list */
 static list_t *l2arc_dev_list;			/* device list pointer */
 static kmutex_t l2arc_dev_mtx;			/* device list mutex */
 static l2arc_dev_t *l2arc_dev_last;		/* last device used */
 static kmutex_t l2arc_buflist_mtx;		/* mutex for all buflists */
 static list_t L2ARC_free_on_write;		/* free after write buf list */
 static list_t *l2arc_free_on_write;		/* free after write list ptr */
 static kmutex_t l2arc_free_on_write_mtx;	/* mutex for list */
 static uint64_t l2arc_ndev;			/* number of devices */
 
 /* State carried with an L2ARC read so the original request is known. */
 typedef struct l2arc_read_callback {
 	arc_buf_t	*l2rcb_buf;		/* read buffer */
 	spa_t		*l2rcb_spa;		/* spa */
 	blkptr_t	l2rcb_bp;		/* original blkptr */
 	zbookmark_t	l2rcb_zb;		/* original bookmark */
 	int		l2rcb_flags;		/* original flags */
 } l2arc_read_callback_t;
 
 /* State carried with an L2ARC write. */
 typedef struct l2arc_write_callback {
 	l2arc_dev_t	*l2wcb_dev;		/* device info */
 	arc_buf_hdr_t	*l2wcb_head;		/* head of write buflist */
 } l2arc_write_callback_t;
 
 /*
  * L2ARC portion of a header (hung off arc_buf_hdr.b_l2hdr): which device
  * the block lives on and at what byte offset.
  */
 struct l2arc_buf_hdr {
 	/* protected by arc_buf_hdr  mutex */
 	l2arc_dev_t	*b_dev;			/* L2ARC device */
 	uint64_t	b_daddr;		/* disk address, offset byte */
 };
 
 /*
  * One entry on the free-on-write list: a data pointer, its size, and the
  * function (taking pointer and size) to be called to free it.
  */
 typedef struct l2arc_data_free {
 	/* protected by l2arc_free_on_write_mtx */
 	void		*l2df_data;
 	size_t		l2df_size;
 	void		(*l2df_func)(void *, size_t);
 	list_node_t	l2df_list_node;
 } l2arc_data_free_t;
 
 /*
  * Lock, condition variable and exit flag for the L2ARC feed thread
  * (NOTE(review): the thread itself is outside this section -- confirm usage).
  */
 static kmutex_t l2arc_feed_thr_lock;
 static kcondvar_t l2arc_feed_thr_cv;
 static uint8_t l2arc_thread_exit;
 
 /* Forward declarations for L2ARC helpers used before their definitions. */
 static void l2arc_read_done(zio_t *zio);
 static void l2arc_hdr_stat_add(void);
 static void l2arc_hdr_stat_remove(void);
 
 /*
  * Hash a block identity.
  *
  * Runs the bytes of *dva through the table-driven CRC-64 (starting from
  * all ones) and then XORs in (spa >> 8) and birth.  The assertion checks
  * that zfs_crc64_table has been initialized, which buf_init() does.
  *
  * The DVA is only read, so it is accessed through a const-qualified byte
  * pointer, and the byte index is a size_t to match sizeof.
  */
 static uint64_t
 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 {
 	const uint8_t *vdva = (const uint8_t *)dva;
 	uint64_t crc = -1ULL;
 	size_t i;
 
 	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
 
 	for (i = 0; i < sizeof (dva_t); i++)
 		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
 
 	crc ^= (spa>>8) ^ birth;
 
 	return (crc);
 }
 
 /*
  * BUF_EMPTY() is true when a header carries no identity: both DVA words and
  * the birth value are zero (the state buf_discard_identity() leaves behind).
  */
 #define	BUF_EMPTY(buf)						\
 	((buf)->b_dva.dva_word[0] == 0 &&			\
 	(buf)->b_dva.dva_word[1] == 0 &&			\
 	(buf)->b_birth == 0)
 
 /*
  * BUF_EQUAL() is true when header 'buf' has the identity (spa, dva, birth).
  * The whole expansion and every argument are parenthesized so the macro can
  * be negated or combined with other operators, and so compound argument
  * expressions compare as a unit.
  */
 #define	BUF_EQUAL(spa, dva, birth, buf)				\
 	(((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
 	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
 	((buf)->b_birth == (birth)) && ((buf)->b_spa == (spa)))
 
 /*
  * Strip a header of its identity by zeroing both DVA words, b_birth and
  * b_cksum0.  Afterwards BUF_EMPTY(hdr) holds.
  */
 static void
 buf_discard_identity(arc_buf_hdr_t *hdr)
 {
 	hdr->b_dva.dva_word[0] = hdr->b_dva.dva_word[1] = 0;
 	hdr->b_birth = hdr->b_cksum0 = 0;
 }
 
 /*
  * Look up the header with identity (spa, dva, birth).
  *
  * On a hit the header is returned with its hash-chain lock still held and
  * that lock stored in *lockp; the caller must drop it.  On a miss the lock
  * is released, *lockp is set to NULL and NULL is returned.
  */
 static arc_buf_hdr_t *
 buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
 {
 	uint64_t bucket = BUF_HASH_INDEX(spa, dva, birth);
 	kmutex_t *bucket_lock = BUF_HASH_LOCK(bucket);
 	arc_buf_hdr_t *hdr;
 
 	mutex_enter(bucket_lock);
 
 	/* Walk the chain until it ends or an identity match is found. */
 	hdr = buf_hash_table.ht_table[bucket];
 	while (hdr != NULL && !(BUF_EQUAL(spa, dva, birth, hdr)))
 		hdr = hdr->b_hash_next;
 
 	if (hdr == NULL) {
 		mutex_exit(bucket_lock);
 		*lockp = NULL;
 		return (NULL);
 	}
 
 	*lockp = bucket_lock;
 	return (hdr);
 }
 
 /*
  * Add a header to the hash table.
  *
  * If a header with the same identity is already present, that existing
  * header is returned and 'buf' is left out of the table; otherwise 'buf' is
  * linked at the head of its chain and NULL is returned.  In both cases the
  * chain's lock is stored in *lockp and is still held on return.
  */
 static arc_buf_hdr_t *
 buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
 {
 	uint64_t bucket = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
 	kmutex_t *bucket_lock = BUF_HASH_LOCK(bucket);
 	arc_buf_hdr_t *cur;
 	uint32_t chain_len = 0;
 
 	ASSERT(!HDR_IN_HASH_TABLE(buf));
 	*lockp = bucket_lock;
 	mutex_enter(bucket_lock);
 
 	/* Look for a duplicate, counting the chain length as we go. */
 	cur = buf_hash_table.ht_table[bucket];
 	while (cur != NULL) {
 		if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, cur))
 			return (cur);
 		cur = cur->b_hash_next;
 		chain_len++;
 	}
 
 	/* No duplicate: push the new header onto the front of the chain. */
 	buf->b_hash_next = buf_hash_table.ht_table[bucket];
 	buf_hash_table.ht_table[bucket] = buf;
 	buf->b_flags |= ARC_IN_HASH_TABLE;
 
 	/*
 	 * Hash table statistics: a non-empty chain means a collision, and a
 	 * chain that previously held exactly one entry has just become a
 	 * real chain.
 	 */
 	if (chain_len > 0) {
 		ARCSTAT_BUMP(arcstat_hash_collisions);
 		if (chain_len == 1) {
 			ARCSTAT_BUMP(arcstat_hash_chains);
 		}
 
 		ARCSTAT_MAX(arcstat_hash_chain_max, chain_len);
 	}
 
 	ARCSTAT_BUMP(arcstat_hash_elements);
 	ARCSTAT_MAXSTAT(arcstat_hash_elements);
 
 	return (NULL);
 }
 
 /*
  * Unlink a header from its hash chain.  The caller must hold the chain's
  * lock and the header must currently be in the table (both are asserted).
  */
 static void
 buf_hash_remove(arc_buf_hdr_t *buf)
 {
 	uint64_t bucket = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
 	arc_buf_hdr_t **linkp;
 	arc_buf_hdr_t *head;
 
 	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(bucket)));
 	ASSERT(HDR_IN_HASH_TABLE(buf));
 
 	/* Find the link (chain head or a b_hash_next) that points at buf. */
 	linkp = &buf_hash_table.ht_table[bucket];
 	while (*linkp != buf) {
 		ASSERT(*linkp != NULL);
 		linkp = &(*linkp)->b_hash_next;
 	}
 
 	*linkp = buf->b_hash_next;
 	buf->b_hash_next = NULL;
 	buf->b_flags &= ~ARC_IN_HASH_TABLE;
 
 	/* Hash table statistics. */
 	ARCSTAT_BUMPDOWN(arcstat_hash_elements);
 
 	/* A chain left with exactly one entry no longer counts as a chain. */
 	head = buf_hash_table.ht_table[bucket];
 	if (head != NULL && head->b_hash_next == NULL) {
 		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
 	}
 }
 
 /*
  * Global data structures and functions for the buf kmem cache.
  *
  * hdr_cache supplies arc_buf_hdr_t objects and buf_cache supplies arc_buf_t
  * objects; both are created in buf_init() and destroyed in buf_fini().
  */
 static kmem_cache_t *hdr_cache;
 static kmem_cache_t *buf_cache;
 
 static void
 buf_fini(void)
 {
 	int i;
 
 	kmem_free(buf_hash_table.ht_table,
 	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
 	for (i = 0; i < BUF_LOCKS; i++)
 		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
 	kmem_cache_destroy(hdr_cache);
 	kmem_cache_destroy(buf_cache);
 }
 
 /*
  * kmem cache constructor for arc_buf_hdr_t objects.
  *
  * Zeroes the header, initializes its refcount, condition variable and
  * freeze lock, and charges the header's size to ARC_SPACE_HDRS.  The
  * 'unused' and 'kmflag' arguments are ignored.  Always returns 0.
  */
 /* ARGSUSED */
 static int
 hdr_cons(void *vbuf, void *unused, int kmflag)
 {
 	arc_buf_hdr_t *hdr = vbuf;
 
 	bzero(hdr, sizeof (*hdr));
 	refcount_create(&hdr->b_refcnt);
 	cv_init(&hdr->b_cv, NULL, CV_DEFAULT, NULL);
 	mutex_init(&hdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
 	arc_space_consume(sizeof (*hdr), ARC_SPACE_HDRS);
 
 	return (0);
 }
 
 /*
  * kmem cache constructor for arc_buf_t objects: zero the buffer, set up its
  * eviction lock and charge its size to ARC_SPACE_HDRS.  The 'unused' and
  * 'kmflag' arguments are ignored.  Always returns 0.
  */
 /* ARGSUSED */
 static int
 buf_cons(void *vbuf, void *unused, int kmflag)
 {
 	arc_buf_t *abuf = vbuf;
 
 	bzero(abuf, sizeof (*abuf));
 	mutex_init(&abuf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
 	arc_space_consume(sizeof (*abuf), ARC_SPACE_HDRS);
 
 	return (0);
 }
 
 /*
  * kmem cache destructor for arc_buf_hdr_t objects.
  *
  * The header must already have had its identity discarded (asserted via
  * BUF_EMPTY).  Tears down what hdr_cons() set up and returns the header's
  * size to ARC_SPACE_HDRS.  The 'unused' argument is ignored.
  */
 /* ARGSUSED */
 static void
 hdr_dest(void *vbuf, void *unused)
 {
 	arc_buf_hdr_t *hdr = vbuf;
 
 	ASSERT(BUF_EMPTY(hdr));
 	refcount_destroy(&hdr->b_refcnt);
 	cv_destroy(&hdr->b_cv);
 	mutex_destroy(&hdr->b_freeze_lock);
 	arc_space_return(sizeof (*hdr), ARC_SPACE_HDRS);
 }
 
 /*
  * kmem cache destructor for arc_buf_t objects: destroy the eviction lock
  * and return the buffer's size to ARC_SPACE_HDRS.  'unused' is ignored.
  */
 /* ARGSUSED */
 static void
 buf_dest(void *vbuf, void *unused)
 {
 	arc_buf_t *abuf = vbuf;
 
 	mutex_destroy(&abuf->b_evict_lock);
 	arc_space_return(sizeof (*abuf), ARC_SPACE_HDRS);
 }
 
 /*
  * Reclaim callback for the header cache -- invoked when memory is low.
  * Wakes the ARC reclaim thread unless the ARC has already been torn down.
  */
 /* ARGSUSED */
 static void
 hdr_recl(void *unused)
 {
 	dprintf("hdr_recl called\n");
 
 	/*
 	 * umem calls the reclaim func when we destroy the buf cache,
 	 * which is after we do arc_fini(), so there may be nobody left
 	 * to signal.
 	 */
 	if (arc_dead)
 		return;
 
 	cv_signal(&arc_reclaim_thr_cv);
 }
 
 static void
 buf_init(void)
 {
 	uint64_t *ct;
 	uint64_t hsize = 1ULL << 12;
 	int i, j;
 
 	/*
 	 * The hash table is big enough to fill all of physical memory
 	 * with an average 64K block size.  The table will take up
 	 * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
 	 */
 	while (hsize * 65536 < (uint64_t)physmem * PAGESIZE)
 		hsize <<= 1;
 retry:
 	buf_hash_table.ht_mask = hsize - 1;
 	buf_hash_table.ht_table =
 	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
 	if (buf_hash_table.ht_table == NULL) {
 		ASSERT(hsize > (1ULL << 8));
 		hsize >>= 1;
 		goto retry;
 	}
 
 	hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
 	    0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
 	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
 	    0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
 
 	for (i = 0; i < 256; i++)
 		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
 			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
 
 	for (i = 0; i < BUF_LOCKS; i++) {
 		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
 		    NULL, MUTEX_DEFAULT, NULL);
 	}
 }
 
 /*
  * hz / 16 clock ticks.  Assuming hz is the number of ticks per second, this
  * is one sixteenth of a second, i.e. 62.5 ms.
  */
 #define	ARC_MINTIME	(hz>>4) /* 62 ms */
 
 /*
  * Debug check (ZFS_DEBUG_MODIFY only): recompute the buffer's checksum and
  * panic if it differs from the checksum saved when the buffer was frozen.
  * Nothing is checked when no checksum has been saved or when the header is
  * flagged with an I/O error.
  */
 static void
 arc_cksum_verify(arc_buf_t *buf)
 {
 	arc_buf_hdr_t *hdr;
 	zio_cksum_t actual;
 
 	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
 		return;
 
 	hdr = buf->b_hdr;
 	mutex_enter(&hdr->b_freeze_lock);
 	if (hdr->b_freeze_cksum != NULL && !(hdr->b_flags & ARC_IO_ERROR)) {
 		fletcher_2_native(buf->b_data, hdr->b_size, &actual);
 		if (!ZIO_CHECKSUM_EQUAL(*hdr->b_freeze_cksum, actual))
 			panic("buffer modified while frozen!");
 	}
 	mutex_exit(&hdr->b_freeze_lock);
 }
 
 /*
  * Recompute the buffer's checksum under the freeze lock and report whether
  * it matches the saved freeze checksum.  The saved checksum is dereferenced
  * unconditionally, so the caller must ensure one exists.
  */
 static int
 arc_cksum_equal(arc_buf_t *buf)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 	zio_cksum_t actual;
 	int matches;
 
 	mutex_enter(&hdr->b_freeze_lock);
 	fletcher_2_native(buf->b_data, hdr->b_size, &actual);
 	matches = ZIO_CHECKSUM_EQUAL(*hdr->b_freeze_cksum, actual);
 	mutex_exit(&hdr->b_freeze_lock);
 
 	return (matches);
 }
 
 /*
  * Save a checksum of the buffer's current contents in the header, unless
  * one is already saved.  Does nothing when ZFS_DEBUG_MODIFY is off and
  * 'force' is false.  On illumos the buffer is also passed to
  * arc_buf_watch() after a new checksum has been stored.
  */
 static void
 arc_cksum_compute(arc_buf_t *buf, boolean_t force)
 {
 	arc_buf_hdr_t *hdr;
 
 	if (!(force || (zfs_flags & ZFS_DEBUG_MODIFY)))
 		return;
 
 	hdr = buf->b_hdr;
 	mutex_enter(&hdr->b_freeze_lock);
 	if (hdr->b_freeze_cksum == NULL) {
 		hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
 		    KM_SLEEP);
 		fletcher_2_native(buf->b_data, hdr->b_size,
 		    hdr->b_freeze_cksum);
 		mutex_exit(&hdr->b_freeze_lock);
 #ifdef illumos
 		arc_buf_watch(buf);
 #endif /* illumos */
 		return;
 	}
 	mutex_exit(&hdr->b_freeze_lock);
 }
 
 #ifdef illumos
 #ifndef _KERNEL
 /* Control message written to arc_procfd: a command plus a watch request. */
 typedef struct procctl {
 	long cmd;
 	prwatch_t prwatch;
 } procctl_t;
 #endif
 
 /*
  * Userland only, and only when arc_watch is set: write a PCWATCH request
  * for the buffer's data address with zero size and zero flags (presumably
  * this removes a watchpoint set by arc_buf_watch()).  No-op in the kernel.
  */
 /* ARGSUSED */
 static void
 arc_buf_unwatch(arc_buf_t *buf)
 {
 #ifndef _KERNEL
 	procctl_t req;
 	int written;
 
 	if (!arc_watch)
 		return;
 
 	req.cmd = PCWATCH;
 	req.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
 	req.prwatch.pr_size = 0;
 	req.prwatch.pr_wflags = 0;
 	written = write(arc_procfd, &req, sizeof (req));
 	ASSERT3U(written, ==, sizeof (req));
 #endif
 }
 
 /*
  * Userland only, and only when arc_watch is set: write a PCWATCH request
  * covering the buffer's data (b_size bytes) with the WA_WRITE flag.
  * No-op in the kernel.
  */
 /* ARGSUSED */
 static void
 arc_buf_watch(arc_buf_t *buf)
 {
 #ifndef _KERNEL
 	procctl_t req;
 	int written;
 
 	if (!arc_watch)
 		return;
 
 	req.cmd = PCWATCH;
 	req.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
 	req.prwatch.pr_size = buf->b_hdr->b_size;
 	req.prwatch.pr_wflags = WA_WRITE;
 	written = write(arc_procfd, &req, sizeof (req));
 	ASSERT3U(written, ==, sizeof (req));
 #endif
 }
 #endif /* illumos */
 
 /*
  * Make a buffer modifiable again by discarding its saved freeze checksum.
  *
  * With ZFS_DEBUG_MODIFY set this first panics if the buffer is not
  * anonymous or has I/O in progress, verifies the existing checksum, and
  * replaces the one-byte b_thawed allocation with a fresh one.
  */
 void
 arc_buf_thaw(arc_buf_t *buf)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	if (zfs_flags & ZFS_DEBUG_MODIFY) {
 		if (hdr->b_state != arc_anon)
 			panic("modifying non-anon buffer!");
 		if (hdr->b_flags & ARC_IO_IN_PROGRESS)
 			panic("modifying buffer while i/o in progress!");
 		arc_cksum_verify(buf);
 	}
 
 	mutex_enter(&hdr->b_freeze_lock);
 
 	if (hdr->b_freeze_cksum != NULL) {
 		kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
 		hdr->b_freeze_cksum = NULL;
 	}
 
 	if (zfs_flags & ZFS_DEBUG_MODIFY) {
 		if (hdr->b_thawed)
 			kmem_free(hdr->b_thawed, 1);
 		hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
 	}
 
 	mutex_exit(&hdr->b_freeze_lock);
 
 #ifdef illumos
 	arc_buf_unwatch(buf);
 #endif /* illumos */
 }
 
 /*
  * Debug helper (ZFS_DEBUG_MODIFY only): under the header's hash lock, save
  * a checksum of the buffer so later modification can be detected.  A buffer
  * without a saved checksum is expected to be anonymous (asserted).
  */
 void
 arc_buf_freeze(arc_buf_t *buf)
 {
 	arc_buf_hdr_t *hdr;
 	kmutex_t *lock;
 
 	if (zfs_flags & ZFS_DEBUG_MODIFY) {
 		hdr = buf->b_hdr;
 		lock = HDR_LOCK(hdr);
 		mutex_enter(lock);
 
 		ASSERT(hdr->b_freeze_cksum != NULL ||
 		    hdr->b_state == arc_anon);
 		arc_cksum_compute(buf, B_FALSE);
 
 		mutex_exit(lock);
 	}
 }
 
 /*
  * Select the per-state sublist, and the lock guarding it, that header 'ab'
  * belongs on in 'state'.
  *
  * The header's hash picks a slot: metadata headers use the first
  * ARC_BUFC_NUMMETADATALISTS slots and all other headers use the
  * ARC_BUFC_NUMDATALISTS slots that follow.  Results are returned through
  * *list and *lock.
  */
 static void
 get_buf_info(arc_buf_hdr_t *ab, arc_state_t *state, list_t **list, kmutex_t **lock)
 {
 	uint64_t hashval = buf_hash(ab->b_spa, &ab->b_dva, ab->b_birth);
 	uint64_t slot;
 
 	if (ab->b_type != ARC_BUFC_METADATA) {
 		slot = hashval & (ARC_BUFC_NUMDATALISTS - 1);
 		slot += ARC_BUFC_NUMMETADATALISTS;
 	} else {
 		slot = hashval & (ARC_BUFC_NUMMETADATALISTS - 1);
 	}
 
 	*list = &state->arcs_lists[slot];
 	*lock = ARCS_LOCK(state, slot);
 }
 
 
 /*
  * Take a reference on a header on behalf of 'tag'.  The caller must hold
  * the header's hash lock.
  *
  * When this is the first reference on a non-anonymous header, the header
  * is taken off its state's list, the state's per-type lsize is reduced by
  * the header's footprint (b_size for ghost states, b_size * b_datacnt
  * otherwise) and any prefetch flag is cleared.
  */
 static void
 add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
 {
 	arc_state_t *state;
 	uint64_t *lsize;
 	uint64_t delta;
 	list_t *list;
 	kmutex_t *lock;
 
 	ASSERT(MUTEX_HELD(hash_lock));
 
 	/* Only the first reference on a non-anonymous header needs work. */
 	if (refcount_add(&ab->b_refcnt, tag) != 1 || ab->b_state == arc_anon)
 		return;
 
 	state = ab->b_state;
 	lsize = &state->arcs_lsize[ab->b_type];
 
 	get_buf_info(ab, state, &list, &lock);
 	ASSERT(!MUTEX_HELD(lock));
 	mutex_enter(lock);
 	ASSERT(list_link_active(&ab->b_arc_node));
 	list_remove(list, ab);
 	if (GHOST_STATE(state)) {
 		/* Ghost headers hold no data but still count b_size. */
 		ASSERT0(ab->b_datacnt);
 		ASSERT3P(ab->b_buf, ==, NULL);
 		delta = ab->b_size;
 	} else {
 		delta = ab->b_size * ab->b_datacnt;
 	}
 	ASSERT(delta > 0);
 	ASSERT3U(*lsize, >=, delta);
 	atomic_add_64(lsize, -delta);
 	mutex_exit(lock);
 
 	/* remove the prefetch flag if we get a reference */
 	if (HDR_PREFETCH(ab))
 		ab->b_flags &= ~ARC_PREFETCH;
 }
 
 /*
  * Drop the reference held by 'tag' and return the remaining count.
  *
  * The hash lock must be held unless the header is anonymous, and the
  * header must not be in a ghost state.  When the last reference on a
  * non-anonymous header goes away, the header is put at the head of its
  * state's list and the state's per-type lsize grows by b_size * b_datacnt.
  */
 static int
 remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
 {
 	arc_state_t *state = ab->b_state;
 	uint64_t *lsize;
 	list_t *list;
 	kmutex_t *lock;
 	int remaining;
 
 	ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
 	ASSERT(!GHOST_STATE(state));
 
 	remaining = refcount_remove(&ab->b_refcnt, tag);
 	if (remaining != 0 || state == arc_anon)
 		return (remaining);
 
 	/* Last reference gone: the header is listed on its state again. */
 	lsize = &state->arcs_lsize[ab->b_type];
 	get_buf_info(ab, state, &list, &lock);
 	ASSERT(!MUTEX_HELD(lock));
 	mutex_enter(lock);
 	ASSERT(!list_link_active(&ab->b_arc_node));
 	list_insert_head(list, ab);
 	ASSERT(ab->b_datacnt > 0);
 	atomic_add_64(lsize, ab->b_size * ab->b_datacnt);
 	mutex_exit(lock);
 
 	return (remaining);
 }
 
 /*
  * Move the supplied buffer to the indicated state.  The mutex
  * for the buffer must be held by the caller.
  *
  * If the header has no references it is also moved between the per-state
  * lists, adjusting each state's arcs_lsize[] for the header's type.  The
  * arcs_size of both states is adjusted in every case.  A header that
  * becomes anonymous is removed from the hash table, and the L2ARC header
  * statistics are updated when entering or leaving arc_l2c_only.
  */
 static void
 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
 {
 	arc_state_t *old_state = ab->b_state;
 	int64_t refcnt = refcount_count(&ab->b_refcnt);
 	uint64_t from_delta, to_delta;
 	list_t *list;
 	kmutex_t *lock;
 
 	ASSERT(MUTEX_HELD(hash_lock));
 	ASSERT(new_state != old_state);
 	ASSERT(refcnt == 0 || ab->b_datacnt > 0);
 	ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
 	ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
 
 	/* Default footprint: every data buffer attached to the header. */
 	from_delta = to_delta = ab->b_datacnt * ab->b_size;
 
 	/*
 	 * If this buffer is evictable, transfer it from the
 	 * old state list to the new state list.
 	 */
 	if (refcnt == 0) {
 		if (old_state != arc_anon) {
 			int use_mutex;
 			uint64_t *size = &old_state->arcs_lsize[ab->b_type];
 
 			get_buf_info(ab, old_state, &list, &lock);
 			/* Only take the list lock if we don't hold it yet. */
 			use_mutex = !MUTEX_HELD(lock);
 			if (use_mutex)
 				mutex_enter(lock);
 
 			ASSERT(list_link_active(&ab->b_arc_node));
 			list_remove(list, ab);
 
 			/*
 			 * If prefetching out of the ghost cache,
 			 * we will have a non-zero datacnt.
 			 */
 			if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
 				/* ghost elements have a ghost size */
 				ASSERT(ab->b_buf == NULL);
 				from_delta = ab->b_size;
 			}
 			ASSERT3U(*size, >=, from_delta);
 			atomic_add_64(size, -from_delta);
 
 			if (use_mutex)
 				mutex_exit(lock);
 		}
 		if (new_state != arc_anon) {
 			int use_mutex;
 			uint64_t *size = &new_state->arcs_lsize[ab->b_type];
 
 			get_buf_info(ab, new_state, &list, &lock);
 			/* Only take the list lock if we don't hold it yet. */
 			use_mutex = !MUTEX_HELD(lock);
 			if (use_mutex)
 				mutex_enter(lock);
 
 			list_insert_head(list, ab);
 
 			/* ghost elements have a ghost size */
 			if (GHOST_STATE(new_state)) {
 				ASSERT(ab->b_datacnt == 0);
 				ASSERT(ab->b_buf == NULL);
 				to_delta = ab->b_size;
 			}
 			atomic_add_64(size, to_delta);
 
 			if (use_mutex)
 				mutex_exit(lock);
 		}
 	}
 
 	ASSERT(!BUF_EMPTY(ab));
 	/* Anonymous headers are not kept in the hash table. */
 	if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
 		buf_hash_remove(ab);
 
 	/*
 	 * adjust state sizes
 	 *
 	 * Note that the ghost-size substitutions above only happen on the
 	 * refcnt == 0 path; otherwise both deltas are b_datacnt * b_size.
 	 */
 	if (to_delta)
 		atomic_add_64(&new_state->arcs_size, to_delta);
 	if (from_delta) {
 		ASSERT3U(old_state->arcs_size, >=, from_delta);
 		atomic_add_64(&old_state->arcs_size, -from_delta);
 	}
 	ab->b_state = new_state;
 
 	/* adjust l2arc hdr stats */
 	if (new_state == arc_l2c_only)
 		l2arc_hdr_stat_add();
 	else if (old_state == arc_l2c_only)
 		l2arc_hdr_stat_remove();
 }
 
 /*
  * Account for 'space' bytes of ARC overhead of the given type: add it to
  * the matching per-type size statistic, to arc_meta_used and to arc_size.
  */
 void
 arc_space_consume(uint64_t space, arc_space_type_t type)
 {
 	uint64_t *statp = NULL;
 
 	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
 
 	/* Pick the per-type statistic; unknown types touch none. */
 	switch (type) {
 	case ARC_SPACE_DATA:
 		statp = &ARCSTAT(arcstat_data_size);
 		break;
 	case ARC_SPACE_OTHER:
 		statp = &ARCSTAT(arcstat_other_size);
 		break;
 	case ARC_SPACE_HDRS:
 		statp = &ARCSTAT(arcstat_hdr_size);
 		break;
 	case ARC_SPACE_L2HDRS:
 		statp = &ARCSTAT(arcstat_l2_hdr_size);
 		break;
 	}
 	if (statp != NULL)
 		atomic_add_64(statp, space);
 
 	atomic_add_64(&arc_meta_used, space);
 	atomic_add_64(&arc_size, space);
 }
 
 /*
  * Reverse of arc_space_consume(): subtract 'space' bytes from the matching
  * per-type size statistic, from arc_meta_used and from arc_size.  Before
  * arc_meta_used is reduced, arc_meta_max is raised to it if it is larger.
  */
 void
 arc_space_return(uint64_t space, arc_space_type_t type)
 {
 	uint64_t *statp = NULL;
 
 	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
 
 	/* Pick the per-type statistic; unknown types touch none. */
 	switch (type) {
 	case ARC_SPACE_DATA:
 		statp = &ARCSTAT(arcstat_data_size);
 		break;
 	case ARC_SPACE_OTHER:
 		statp = &ARCSTAT(arcstat_other_size);
 		break;
 	case ARC_SPACE_HDRS:
 		statp = &ARCSTAT(arcstat_hdr_size);
 		break;
 	case ARC_SPACE_L2HDRS:
 		statp = &ARCSTAT(arcstat_l2_hdr_size);
 		break;
 	}
 	if (statp != NULL)
 		atomic_add_64(statp, -space);
 
 	ASSERT(arc_meta_used >= space);
 	/* Record the high-water mark before giving the space back. */
 	if (arc_meta_max < arc_meta_used)
 		arc_meta_max = arc_meta_used;
 	atomic_add_64(&arc_meta_used, -space);
 
 	ASSERT(arc_size >= space);
 	atomic_add_64(&arc_size, -space);
 }
 
 /*
  * Allocate a data buffer of 'size' bytes and charge it to arc_size.  If
  * eviction is needed for data buffers the reclaim thread is signalled
  * first; the allocation itself is not delayed.
  */
 void *
 arc_data_buf_alloc(uint64_t size)
 {
 	void *data;
 
 	if (arc_evict_needed(ARC_BUFC_DATA))
 		cv_signal(&arc_reclaim_thr_cv);
 
 	atomic_add_64(&arc_size, size);
 	data = zio_data_buf_alloc(size);
 
 	return (data);
 }
 
 /*
  * Free a data buffer of 'size' bytes and subtract that size from arc_size
  * (which is asserted to be at least 'size' beforehand).
  */
 void
 arc_data_buf_free(void *buf, uint64_t size)
 {
 	zio_data_buf_free(buf, size);
 	ASSERT(arc_size >= size);
 	atomic_add_64(&arc_size, -size);
 }
 
 /*
  * Create a new anonymous buffer of 'size' bytes and the given contents type
  * for pool 'spa'.
  *
  * A header and a buffer are taken from their kmem caches, the buffer gets
  * data storage via arc_get_data_buf(), and the header ends up in arc_anon
  * with one data buffer, no flags, and a single reference held by 'tag'.
  */
 arc_buf_t *
 arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
 {
 	arc_buf_hdr_t *new_hdr;
 	arc_buf_t *new_buf;
 
 	ASSERT3U(size, >, 0);
 
 	/* Set up the header's immutable fields and initial state. */
 	new_hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
 	ASSERT(BUF_EMPTY(new_hdr));
 	new_hdr->b_size = size;
 	new_hdr->b_type = type;
 	new_hdr->b_spa = spa_load_guid(spa);
 	new_hdr->b_state = arc_anon;
 	new_hdr->b_arc_access = 0;
 
 	/* Attach a single buffer and give it data storage. */
 	new_buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
 	new_buf->b_hdr = new_hdr;
 	new_buf->b_data = NULL;
 	new_buf->b_efunc = NULL;
 	new_buf->b_private = NULL;
 	new_buf->b_next = NULL;
 	new_hdr->b_buf = new_buf;
 	arc_get_data_buf(new_buf);
 
 	new_hdr->b_datacnt = 1;
 	new_hdr->b_flags = 0;
 	ASSERT(refcount_is_zero(&new_hdr->b_refcnt));
 	(void) refcount_add(&new_hdr->b_refcnt, tag);
 
 	return (new_buf);
 }
 
 /*
  * Reference tag that holds a buffer's reference while it is out on loan;
  * see arc_loan_buf(), arc_return_buf() and arc_loan_inuse_buf().
  */
 static char *arc_onloan_tag = "onloan";
 
 /*
  * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
  * flight data by arc_tempreserve_space() until they are "returned". Loaned
  * buffers must be returned to the arc before they can be used by the DMU or
  * freed.
  *
  * The buffer is a data buffer referenced by arc_onloan_tag, and its size is
  * added to arc_loaned_bytes.
  */
 arc_buf_t *
 arc_loan_buf(spa_t *spa, int size)
 {
 	arc_buf_t *loaned = arc_buf_alloc(spa, size, arc_onloan_tag,
 	    ARC_BUFC_DATA);
 
 	atomic_add_64(&arc_loaned_bytes, size);
 
 	return (loaned);
 }
 
 /*
  * Return a loaned arc buffer to the arc.
  *
  * The hold is moved from arc_onloan_tag to `tag' and the buffer's size is
  * subtracted from arc_loaned_bytes.  The buffer must still have data.
  */
 void
 arc_return_buf(arc_buf_t *buf, void *tag)
 {
 	arc_buf_hdr_t *hdr;
 
 	ASSERT(buf->b_data != NULL);
 	hdr = buf->b_hdr;
 
 	/* Take the new hold before dropping the loan hold. */
 	(void) refcount_add(&hdr->b_refcnt, tag);
 	(void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
 
 	atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
 }
 
 /*
  * Detach an arc_buf from a dbuf (tag) and put it out on loan: the hold is
  * moved from `tag' to arc_onloan_tag, the eviction callback and its
  * private pointer are cleared, and the buffer's size is added to
  * arc_loaned_bytes.
  */
 void
 arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	ASSERT(buf->b_data != NULL);
 
 	/* Take the loan hold before dropping the caller's hold. */
 	(void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
 	(void) refcount_remove(&hdr->b_refcnt, tag);
 
 	buf->b_private = NULL;
 	buf->b_efunc = NULL;
 
 	atomic_add_64(&arc_loaned_bytes, hdr->b_size);
 }
 
 /*
  * Create an additional arc_buf_t on the header that `from' belongs to and
  * fill it with a copy of from's data.
  *
  * The new buf is linked in at the head of the header's b_buf list and the
  * header's b_datacnt is incremented.  The header must not be in the
  * arc_anon state.  Returns the new buf.
  */
 static arc_buf_t *
 arc_buf_clone(arc_buf_t *from)
 {
 	arc_buf_t *buf;
 	arc_buf_hdr_t *hdr = from->b_hdr;
 	uint64_t size = hdr->b_size;
 
 	ASSERT(hdr->b_state != arc_anon);
 
 	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
 	buf->b_hdr = hdr;
 	buf->b_data = NULL;
 	buf->b_efunc = NULL;
 	buf->b_private = NULL;
 	/* push the clone onto the front of the header's buf list */
 	buf->b_next = hdr->b_buf;
 	hdr->b_buf = buf;
 	/* obtain data space for the clone, then duplicate the contents */
 	arc_get_data_buf(buf);
 	bcopy(from->b_data, buf->b_data, size);
 
 	/*
 	 * This buffer already exists in the arc so create a duplicate
 	 * copy for the caller.  If the buffer is associated with user data
 	 * then track the size and number of duplicates.  These stats will be
 	 * updated as duplicate buffers are created and destroyed.
 	 */
 	if (hdr->b_type == ARC_BUFC_DATA) {
 		ARCSTAT_BUMP(arcstat_duplicate_buffers);
 		ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
 	}
 	hdr->b_datacnt += 1;
 	return (buf);
 }
 
 /*
  * Add a reference, on behalf of `tag', to the header behind `buf' and
  * record the access as a cache hit.
  *
  * If the buf has already been evicted (b_data == NULL) nothing is done, so
  * callers have to check b_data afterwards to learn whether the reference
  * was actually taken.
  */
 void
 arc_buf_add_ref(arc_buf_t *buf, void* tag)
 {
 	arc_buf_hdr_t *hdr;
 	kmutex_t *hash_lock;
 
 	/*
 	 * Check to see if this buffer is evicted.  Callers
 	 * must verify b_data != NULL to know if the add_ref
 	 * was successful.
 	 */
 	mutex_enter(&buf->b_evict_lock);
 	if (buf->b_data == NULL) {
 		mutex_exit(&buf->b_evict_lock);
 		return;
 	}
 	/*
 	 * b_evict_lock stays held until the hash lock has been acquired and
 	 * b_hdr has been re-read under it.
 	 */
 	hash_lock = HDR_LOCK(buf->b_hdr);
 	mutex_enter(hash_lock);
 	hdr = buf->b_hdr;
 	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
 	mutex_exit(&buf->b_evict_lock);
 
 	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
 	add_reference(hdr, hash_lock, tag);
 	DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
 	arc_access(hdr, hash_lock);
 	mutex_exit(hash_lock);
 	/* hit statistics, split by demand/prefetch and data/metadata */
 	ARCSTAT_BUMP(arcstat_hits);
 	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
 	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
 	    data, metadata, hits);
 }
 
 /*
  * Free the arc data buffer.  If it is an l2arc write in progress,
  * the buffer is placed on l2arc_free_on_write to be freed later.
  *
  * `free_func' is the routine that releases the data; it is either called
  * right away or stored with the deferred entry.
  */
 static void
 arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 	l2arc_data_free_t *deferred;
 
 	/* No l2arc write in flight: release the data immediately. */
 	if (!HDR_L2_WRITING(hdr)) {
 		free_func(buf->b_data, hdr->b_size);
 		return;
 	}
 
 	/* Otherwise record what to free and queue it for later. */
 	deferred = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
 	deferred->l2df_data = buf->b_data;
 	deferred->l2df_size = hdr->b_size;
 	deferred->l2df_func = free_func;
 
 	mutex_enter(&l2arc_free_on_write_mtx);
 	list_insert_head(l2arc_free_on_write, deferred);
 	mutex_exit(&l2arc_free_on_write_mtx);
 
 	ARCSTAT_BUMP(arcstat_l2_free_on_write);
 }
 
 /*
  * Tear down `buf'.
  *
  * If the buf still has data, the data is released -- unless `recycle' is
  * set, in which case the data block itself is left alone (arc_evict()
  * passes recycle for the block it has "stolen" for reuse) -- and the
  * state size counters, the duplicate-buffer statistics and the header's
  * b_datacnt are updated.
  *
  * If `all' is set the buf is also unlinked from its header's b_buf list
  * and returned to buf_cache; otherwise the arc_buf_t itself is left for
  * the caller to deal with.
  */
 static void
 arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
 {
 	arc_buf_t **bufp;
 
 	/* free up data associated with the buf */
 	if (buf->b_data) {
 		arc_state_t *state = buf->b_hdr->b_state;
 		uint64_t size = buf->b_hdr->b_size;
 		arc_buf_contents_t type = buf->b_hdr->b_type;
 
 		arc_cksum_verify(buf);
 #ifdef illumos
 		arc_buf_unwatch(buf);
 #endif /* illumos */
 
 		if (!recycle) {
 			if (type == ARC_BUFC_METADATA) {
 				/* metadata space is returned via arc_space_return() */
 				arc_buf_data_free(buf, zio_buf_free);
 				arc_space_return(size, ARC_SPACE_DATA);
 			} else {
 				/* data space is subtracted from the counters directly */
 				ASSERT(type == ARC_BUFC_DATA);
 				arc_buf_data_free(buf, zio_data_buf_free);
 				ARCSTAT_INCR(arcstat_data_size, -size);
 				atomic_add_64(&arc_size, -size);
 			}
 		}
 		/*
 		 * A header that is linked on a state list also has its size
 		 * counted in that state's per-type lsize; undo that here.
 		 */
 		if (list_link_active(&buf->b_hdr->b_arc_node)) {
 			uint64_t *cnt = &state->arcs_lsize[type];
 
 			ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
 			ASSERT(state != arc_anon);
 
 			ASSERT3U(*cnt, >=, size);
 			atomic_add_64(cnt, -size);
 		}
 		ASSERT3U(state->arcs_size, >=, size);
 		atomic_add_64(&state->arcs_size, -size);
 		buf->b_data = NULL;
 
 		/*
 		 * If we're destroying a duplicate buffer make sure
 		 * that the appropriate statistics are updated.
 		 */
 		if (buf->b_hdr->b_datacnt > 1 &&
 		    buf->b_hdr->b_type == ARC_BUFC_DATA) {
 			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
 			ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
 		}
 		ASSERT(buf->b_hdr->b_datacnt > 0);
 		buf->b_hdr->b_datacnt -= 1;
 	}
 
 	/* only remove the buf if requested */
 	if (!all)
 		return;
 
 	/* remove the buf from the hdr list */
 	/* (the loop assumes buf is on the list; it does not check for NULL) */
 	for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
 		continue;
 	*bufp = buf->b_next;
 	buf->b_next = NULL;
 
 	ASSERT(buf->b_efunc == NULL);
 
 	/* clean up the buf */
 	buf->b_hdr = NULL;
 	kmem_cache_free(buf_cache, buf);
 }
 
 /*
  * Free a header.  The header must have no references, be in the arc_anon
  * state and have no I/O in progress.
  *
  * In order: the L2ARC header (if any) is detached and freed, the header's
  * identity is discarded, every buf on the header is destroyed (or handed
  * to the eviction list when it has an eviction callback), the freeze
  * checksum and "thawed" allocations are freed, and the header is returned
  * to hdr_cache.
  */
 static void
 arc_hdr_destroy(arc_buf_hdr_t *hdr)
 {
 	ASSERT(refcount_is_zero(&hdr->b_refcnt));
 	ASSERT3P(hdr->b_state, ==, arc_anon);
 	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
 	l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
 
 	if (l2hdr != NULL) {
 		/* remember whether the caller already holds the buflist lock */
 		boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
 		/*
 		 * To prevent arc_free() and l2arc_evict() from
 		 * attempting to free the same buffer at the same time,
 		 * a FREE_IN_PROGRESS flag is given to arc_free() to
 		 * give it priority.  l2arc_evict() can't destroy this
 		 * header while we are waiting on l2arc_buflist_mtx.
 		 *
 		 * The hdr may be removed from l2ad_buflist before we
 		 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
 		 */
 		if (!buflist_held) {
 			mutex_enter(&l2arc_buflist_mtx);
 			l2hdr = hdr->b_l2hdr;
 		}
 
 		if (l2hdr != NULL) {
 			list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
 			ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
 			kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
 			if (hdr->b_state == arc_l2c_only)
 				l2arc_hdr_stat_remove();
 			hdr->b_l2hdr = NULL;
 		}
 
 		/* only drop the lock if it was taken here */
 		if (!buflist_held)
 			mutex_exit(&l2arc_buflist_mtx);
 	}
 
 	if (!BUF_EMPTY(hdr)) {
 		ASSERT(!HDR_IN_HASH_TABLE(hdr));
 		buf_discard_identity(hdr);
 	}
 	while (hdr->b_buf) {
 		arc_buf_t *buf = hdr->b_buf;
 
 		if (buf->b_efunc) {
 			/*
 			 * The buf has an eviction callback: release its data
 			 * but keep the arc_buf_t, and move it onto
 			 * arc_eviction_list (see arc_do_user_evicts()).
 			 */
 			mutex_enter(&arc_eviction_mtx);
 			mutex_enter(&buf->b_evict_lock);
 			ASSERT(buf->b_hdr != NULL);
 			arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
 			hdr->b_buf = buf->b_next;
 			buf->b_hdr = &arc_eviction_hdr;
 			buf->b_next = arc_eviction_list;
 			arc_eviction_list = buf;
 			mutex_exit(&buf->b_evict_lock);
 			mutex_exit(&arc_eviction_mtx);
 		} else {
 			/* no callback: unlink and free the buf outright */
 			arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
 		}
 	}
 	if (hdr->b_freeze_cksum != NULL) {
 		kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
 		hdr->b_freeze_cksum = NULL;
 	}
 	if (hdr->b_thawed) {
 		kmem_free(hdr->b_thawed, 1);
 		hdr->b_thawed = NULL;
 	}
 
 	/* the header must be off every list before it goes back to the cache */
 	ASSERT(!list_link_active(&hdr->b_arc_node));
 	ASSERT3P(hdr->b_hash_next, ==, NULL);
 	ASSERT3P(hdr->b_acb, ==, NULL);
 	kmem_cache_free(hdr_cache, hdr);
 }
 
 /*
  * Drop the hold `tag' has on `buf' and free whatever can be freed.
  * The buf must still have data and must not have an eviction callback.
  *
  * Three cases, chosen by the header's state on entry:
  *  - not anonymous: under the hash lock, drop the reference; if the
  *    header has more than one data copy destroy this buf, otherwise keep
  *    it and flag the header ARC_BUF_AVAILABLE.
  *  - anonymous with I/O in progress: drop the reference under
  *    arc_eviction_mtx and destroy the header only if the I/O is no longer
  *    in progress by then.
  *  - anonymous otherwise: destroy just this buf if references remain,
  *    else destroy the whole header.
  */
 void
 arc_buf_free(arc_buf_t *buf, void *tag)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 	int hashed = hdr->b_state != arc_anon;
 
 	ASSERT(buf->b_efunc == NULL);
 	ASSERT(buf->b_data != NULL);
 
 	if (hashed) {
 		kmutex_t *hash_lock = HDR_LOCK(hdr);
 
 		mutex_enter(hash_lock);
 		/* re-read b_hdr now that the hash lock is held */
 		hdr = buf->b_hdr;
 		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
 
 		(void) remove_reference(hdr, hash_lock, tag);
 		if (hdr->b_datacnt > 1) {
 			arc_buf_destroy(buf, FALSE, TRUE);
 		} else {
 			ASSERT(buf == hdr->b_buf);
 			ASSERT(buf->b_efunc == NULL);
 			hdr->b_flags |= ARC_BUF_AVAILABLE;
 		}
 		mutex_exit(hash_lock);
 	} else if (HDR_IO_IN_PROGRESS(hdr)) {
 		int destroy_hdr;
 		/*
 		 * We are in the middle of an async write.  Don't destroy
 		 * this buffer unless the write completes before we finish
 		 * decrementing the reference count.
 		 */
 		mutex_enter(&arc_eviction_mtx);
 		(void) remove_reference(hdr, NULL, tag);
 		ASSERT(refcount_is_zero(&hdr->b_refcnt));
 		destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
 		mutex_exit(&arc_eviction_mtx);
 		if (destroy_hdr)
 			arc_hdr_destroy(hdr);
 	} else {
 		/* a positive return means other references remain on the header */
 		if (remove_reference(hdr, NULL, tag) > 0)
 			arc_buf_destroy(buf, FALSE, TRUE);
 		else
 			arc_hdr_destroy(hdr);
 	}
 }
 
 /*
  * Drop the reference `tag' holds on `buf'.
  *
  * Anonymous buffers are handed to arc_buf_free().  Otherwise the
  * reference is removed under the hash lock; a buf without an eviction
  * callback is then either destroyed (when the header has other data
  * copies) or left in place with the header flagged ARC_BUF_AVAILABLE.
  * A buf that has an eviction callback is left untouched.
  *
  * Returns non-zero if the buf had no eviction callback (b_efunc == NULL)
  * on entry, zero otherwise.
  */
 int
 arc_buf_remove_ref(arc_buf_t *buf, void* tag)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 	kmutex_t *hash_lock = HDR_LOCK(hdr);
 	/* sampled up front: buf may be freed before the function returns */
 	int no_callback = (buf->b_efunc == NULL);
 
 	if (hdr->b_state == arc_anon) {
 		ASSERT(hdr->b_datacnt == 1);
 		arc_buf_free(buf, tag);
 		return (no_callback);
 	}
 
 	mutex_enter(hash_lock);
 	/* re-read b_hdr now that the hash lock is held */
 	hdr = buf->b_hdr;
 	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
 	ASSERT(hdr->b_state != arc_anon);
 	ASSERT(buf->b_data != NULL);
 
 	(void) remove_reference(hdr, hash_lock, tag);
 	if (hdr->b_datacnt > 1) {
 		if (no_callback)
 			arc_buf_destroy(buf, FALSE, TRUE);
 	} else if (no_callback) {
 		ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
 		ASSERT(buf->b_efunc == NULL);
 		hdr->b_flags |= ARC_BUF_AVAILABLE;
 	}
 	ASSERT(no_callback || hdr->b_datacnt > 1 ||
 	    refcount_is_zero(&hdr->b_refcnt));
 	mutex_exit(hash_lock);
 	return (no_callback);
 }
 
 /* Return the size (b_size) recorded in the header that `buf' belongs to. */
 int
 arc_buf_size(arc_buf_t *buf)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	return (hdr->b_size);
 }
 
 /*
  * Called from the DMU to determine if the current buffer should be
  * evicted. In order to ensure proper locking, the eviction must be initiated
  * from the DMU. Return true if the buffer is associated with user data and
  * duplicate buffers still exist.
  *
  * Always returns B_FALSE when zfs_disable_dup_eviction is set.
  */
 boolean_t
 arc_buf_eviction_needed(arc_buf_t *buf)
 {
 	boolean_t evict_needed = B_FALSE;
 	arc_buf_hdr_t *hdr;
 
 	if (zfs_disable_dup_eviction)
 		return (B_FALSE);
 
 	mutex_enter(&buf->b_evict_lock);
 	hdr = buf->b_hdr;
 
 	if (hdr == NULL) {
 		/*
 		 * We are in arc_do_user_evicts(); let that function
 		 * perform the eviction.
 		 */
 		ASSERT(buf->b_data == NULL);
 	} else if (buf->b_data == NULL) {
 		/*
 		 * We have already been added to the arc eviction list;
 		 * recommend eviction.
 		 */
 		ASSERT3P(hdr, ==, &arc_eviction_hdr);
 		evict_needed = B_TRUE;
 	} else if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA) {
 		/* A live user-data buffer that still has duplicates. */
 		evict_needed = B_TRUE;
 	}
 
 	mutex_exit(&buf->b_evict_lock);
 
 	return (evict_needed);
 }
 
 /*
  * Evict buffers from list until we've removed the specified number of
  * bytes.  Move the removed buffers to the appropriate evict state.
  * If the recycle flag is set, then attempt to "recycle" a buffer:
  * - look for a buffer to evict that is `bytes' long.
  * - return the data block from this buffer rather than freeing it.
  * This flag is used by callers that are trying to make space for a
  * new buffer in a full arc cache.
  *
  * This function makes a "best effort".  It skips over any buffers
  * it can't get a hash_lock on, and so may not catch all candidates.
  * It may also return without evicting as much space as requested.
  *
  * Parameters:
  *	state	- arc_mru or arc_mfu; evicted headers are moved to the
  *		  matching ghost state.
  *	spa	- if non-zero, only headers whose b_spa matches are evicted.
  *	bytes	- amount to evict; a negative value disables the early
  *		  "enough evicted" exit so every sub-list is visited.
  *	recycle	- see above.
  *	type	- selects the metadata or the data sub-lists of the state.
  *
  * Returns the recycled ("stolen") data block, or NULL if none was taken.
  */
 static void *
 arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
     arc_buf_contents_t type)
 {
 	arc_state_t *evicted_state;
 	uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
 	int64_t bytes_remaining;
 	arc_buf_hdr_t *ab, *ab_prev = NULL;
 	list_t *evicted_list, *list, *evicted_list_start, *list_start;
 	kmutex_t *lock, *evicted_lock;
 	kmutex_t *hash_lock;
 	boolean_t have_lock;
 	void *stolen = NULL;
 	/* sub-list at which the next call for each type starts */
 	static int evict_metadata_offset, evict_data_offset;
 	int idx, offset, list_count, count;
 
 	ASSERT(state == arc_mru || state == arc_mfu);
 
 	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
 
 	/* metadata sub-lists come first in arcs_lists, data sub-lists after */
 	if (type == ARC_BUFC_METADATA) {
 		offset = 0;
 		list_count = ARC_BUFC_NUMMETADATALISTS;
 		list_start = &state->arcs_lists[0];
 		evicted_list_start = &evicted_state->arcs_lists[0];
 		idx = evict_metadata_offset;
 	} else {
 		offset = ARC_BUFC_NUMMETADATALISTS;
 		list_start = &state->arcs_lists[offset];
 		evicted_list_start = &evicted_state->arcs_lists[offset];
 		list_count = ARC_BUFC_NUMDATALISTS;
 		idx = evict_data_offset;
 	}
 	/*
 	 * bytes_remaining is reduced by every header visited below; while it
 	 * stays positive each processed header makes us move on to the next
 	 * sub-list.
 	 */
 	bytes_remaining = evicted_state->arcs_lsize[type];
 	count = 0;
 
 evict_start:
 	list = &list_start[idx];
 	evicted_list = &evicted_list_start[idx];
 	lock = ARCS_LOCK(state, (offset + idx));
 	evicted_lock = ARCS_LOCK(evicted_state, (offset + idx));
 
 	mutex_enter(lock);
 	mutex_enter(evicted_lock);
 
 	/* walk the sub-list from its tail towards its head */
 	for (ab = list_tail(list); ab; ab = ab_prev) {
 		ab_prev = list_prev(list, ab);
 		bytes_remaining -= (ab->b_size * ab->b_datacnt);
 		/* prefetch buffers have a minimum lifespan */
 		if (HDR_IO_IN_PROGRESS(ab) ||
 		    (spa && ab->b_spa != spa) ||
 		    (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
 		    ddi_get_lbolt() - ab->b_arc_access <
 		    arc_min_prefetch_lifespan)) {
 			skipped++;
 			continue;
 		}
 		/* "lookahead" for better eviction candidate */
 		if (recycle && ab->b_size != bytes &&
 		    ab_prev && ab_prev->b_size == bytes)
 			continue;
 		hash_lock = HDR_LOCK(ab);
 		/* the caller may already hold this hash lock */
 		have_lock = MUTEX_HELD(hash_lock);
 		if (have_lock || mutex_tryenter(hash_lock)) {
 			ASSERT0(refcount_count(&ab->b_refcnt));
 			ASSERT(ab->b_datacnt > 0);
 			while (ab->b_buf) {
 				arc_buf_t *buf = ab->b_buf;
 				if (!mutex_tryenter(&buf->b_evict_lock)) {
 					missed += 1;
 					break;
 				}
 				if (buf->b_data) {
 					bytes_evicted += ab->b_size;
 					/* only one block is ever stolen per call */
 					if (recycle && ab->b_type == type &&
 					    ab->b_size == bytes &&
 					    !HDR_L2_WRITING(ab)) {
 						stolen = buf->b_data;
 						recycle = FALSE;
 					}
 				}
 				if (buf->b_efunc) {
 					/*
 					 * The buf has an eviction callback:
 					 * keep the arc_buf_t and queue it on
 					 * arc_eviction_list.
 					 */
 					mutex_enter(&arc_eviction_mtx);
 					arc_buf_destroy(buf,
 					    buf->b_data == stolen, FALSE);
 					ab->b_buf = buf->b_next;
 					buf->b_hdr = &arc_eviction_hdr;
 					buf->b_next = arc_eviction_list;
 					arc_eviction_list = buf;
 					mutex_exit(&arc_eviction_mtx);
 					mutex_exit(&buf->b_evict_lock);
 				} else {
 					mutex_exit(&buf->b_evict_lock);
 					arc_buf_destroy(buf,
 					    buf->b_data == stolen, TRUE);
 				}
 			}
 
 			if (ab->b_l2hdr) {
 				ARCSTAT_INCR(arcstat_evict_l2_cached,
 				    ab->b_size);
 			} else {
 				if (l2arc_write_eligible(ab->b_spa, ab)) {
 					ARCSTAT_INCR(arcstat_evict_l2_eligible,
 					    ab->b_size);
 				} else {
 					ARCSTAT_INCR(
 					    arcstat_evict_l2_ineligible,
 					    ab->b_size);
 				}
 			}
 
 			/* every buf is gone: the header becomes a ghost */
 			if (ab->b_datacnt == 0) {
 				arc_change_state(evicted_state, ab, hash_lock);
 				ASSERT(HDR_IN_HASH_TABLE(ab));
 				ab->b_flags |= ARC_IN_HASH_TABLE;
 				ab->b_flags &= ~ARC_BUF_AVAILABLE;
 				DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
 			}
 			if (!have_lock)
 				mutex_exit(hash_lock);
 			if (bytes >= 0 && bytes_evicted >= bytes)
 				break;
 			if (bytes_remaining > 0) {
 				mutex_exit(evicted_lock);
 				mutex_exit(lock);
 				idx  = ((idx + 1) & (list_count - 1));
 				count++;
 				goto evict_start;
 			}
 		} else {
 			missed += 1;
 		}
 	}
 
 	mutex_exit(evicted_lock);
 	mutex_exit(lock);
 
 	idx  = ((idx + 1) & (list_count - 1));
 	count++;
 
 	/*
 	 * bytes_evicted is unsigned, so this comparison is done unsigned: a
 	 * negative `bytes' always compares greater and every sub-list is
 	 * visited.
 	 */
 	if (bytes_evicted < bytes) {
 		if (count < list_count)
 			goto evict_start;
 		else
 			dprintf("only evicted %lld bytes from %p",
 			    (longlong_t)bytes_evicted, (void *)state);
 	}
 	/* remember where to start next time */
 	if (type == ARC_BUFC_METADATA)
 		evict_metadata_offset = idx;
 	else
 		evict_data_offset = idx;
 
 	if (skipped)
 		ARCSTAT_INCR(arcstat_evict_skip, skipped);
 
 	if (missed)
 		ARCSTAT_INCR(arcstat_mutex_miss, missed);
 
 	/*
 	 * We have just evicted some data into the ghost state, make
 	 * sure we also adjust the ghost state size if necessary.
 	 */
 	if (arc_no_grow &&
 	    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
 		int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
 		    arc_mru_ghost->arcs_size - arc_c;
 
 		if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
 			int64_t todelete =
 			    MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
 			arc_evict_ghost(arc_mru_ghost, 0, todelete);
 		} else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
 			int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
 			    arc_mru_ghost->arcs_size +
 			    arc_mfu_ghost->arcs_size - arc_c);
 			arc_evict_ghost(arc_mfu_ghost, 0, todelete);
 		}
 	}
 	if (stolen)
 		ARCSTAT_BUMP(arcstat_stolen);
 
 	return (stolen);
 }
 
 /*
  * Remove buffers from list until we've removed the specified number of
  * bytes.  Destroy the buffers that are removed.
  *
  * `state' must be a ghost state.  If `spa' is non-zero only headers with
  * a matching b_spa are considered.  A negative `bytes' means "no limit";
  * in that case the function waits for a busy hash lock instead of
  * skipping the header.  The data sub-lists are walked first; the metadata
  * sub-lists follow if the limit has not been reached (or there is none).
  *
  * Headers that still have an L2ARC header are moved to arc_l2c_only
  * rather than destroyed.
  */
 static void
 arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
 {
 	arc_buf_hdr_t *ab, *ab_prev;
 	/* placeholder list element; its b_spa of 0 identifies it as a marker */
 	arc_buf_hdr_t marker = { 0 };
 	list_t *list, *list_start;
 	kmutex_t *hash_lock, *lock;
 	uint64_t bytes_deleted = 0;
 	uint64_t bufs_skipped = 0;
 	/* sub-list index at which the next call starts */
 	static int evict_offset;
 	int list_count, idx = evict_offset;
 	int offset, count = 0;
 
 	ASSERT(GHOST_STATE(state));
 
 	/*
 	 * data lists come after metadata lists
 	 */
 	list_start = &state->arcs_lists[ARC_BUFC_NUMMETADATALISTS];
 	list_count = ARC_BUFC_NUMDATALISTS;
 	offset = ARC_BUFC_NUMMETADATALISTS;
 
 evict_start:
 	list = &list_start[idx];
 	lock = ARCS_LOCK(state, idx + offset);
 
 	mutex_enter(lock);
 	/* walk the sub-list from its tail towards its head */
 	for (ab = list_tail(list); ab; ab = ab_prev) {
 		ab_prev = list_prev(list, ab);
 		if (spa && ab->b_spa != spa)
 			continue;
 
 		/* ignore markers */
 		if (ab->b_spa == 0)
 			continue;
 
 		hash_lock = HDR_LOCK(ab);
 		/* caller may be trying to modify this buffer, skip it */
 		if (MUTEX_HELD(hash_lock))
 			continue;
 		if (mutex_tryenter(hash_lock)) {
 			ASSERT(!HDR_IO_IN_PROGRESS(ab));
 			ASSERT(ab->b_buf == NULL);
 			ARCSTAT_BUMP(arcstat_deleted);
 			bytes_deleted += ab->b_size;
 
 			if (ab->b_l2hdr != NULL) {
 				/*
 				 * This buffer is cached on the 2nd Level ARC;
 				 * don't destroy the header.
 				 */
 				arc_change_state(arc_l2c_only, ab, hash_lock);
 				mutex_exit(hash_lock);
 			} else {
 				arc_change_state(arc_anon, ab, hash_lock);
 				mutex_exit(hash_lock);
 				arc_hdr_destroy(ab);
 			}
 
 			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
 			if (bytes >= 0 && bytes_deleted >= bytes)
 				break;
 		} else if (bytes < 0) {
 			/*
 			 * Insert a list marker and then wait for the
 			 * hash lock to become available. Once it's
 			 * available, restart from where we left off.
 			 */
 			list_insert_after(list, ab, &marker);
 			mutex_exit(lock);
 			/* the hash lock is taken only to wait for its holder */
 			mutex_enter(hash_lock);
 			mutex_exit(hash_lock);
 			mutex_enter(lock);
 			ab_prev = list_prev(list, &marker);
 			list_remove(list, &marker);
 		} else
 			bufs_skipped += 1;
 	}
 	mutex_exit(lock);
 	idx  = ((idx + 1) & (ARC_BUFC_NUMDATALISTS - 1));
 	count++;
 
 	if (count < list_count)
 		goto evict_start;
 
 	evict_offset = idx;
 	/*
 	 * If the list just processed lies beyond the first data sub-list and
 	 * more remains to be deleted, make a second pass over the metadata
 	 * sub-lists.
 	 */
 	if ((uintptr_t)list > (uintptr_t)&state->arcs_lists[ARC_BUFC_NUMMETADATALISTS] &&
 	    (bytes < 0 || bytes_deleted < bytes)) {
 		list_start = &state->arcs_lists[0];
 		list_count = ARC_BUFC_NUMMETADATALISTS;
 		offset = count = 0;
 		goto evict_start;
 	}
 
 	if (bufs_skipped) {
 		ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
 		/* headers are only skipped when a limit was given */
 		ASSERT(bytes >= 0);
 	}
 
 	if (bytes_deleted < bytes)
 		dprintf("only deleted %lld bytes from %p",
 		    (longlong_t)bytes_deleted, state);
 }
 
 /*
  * Bring the cache back towards its targets by evicting, in order, from
  * the MRU state, the MFU state, and finally the two ghost states.  Within
  * the MRU and MFU states data buffers are evicted before metadata buffers.
  */
 static void
 arc_adjust(void)
 {
 	int64_t excess, chunk;
 
 	/*
 	 * MRU: bounded both by how far arc_size is over arc_c and by how
 	 * far anon + MRU + metadata usage is over arc_p.
 	 */
 	excess = MIN((int64_t)(arc_size - arc_c),
 	    (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
 	    arc_p));
 
 	if (excess > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
 		chunk = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], excess);
 		(void) arc_evict(arc_mru, 0, chunk, FALSE, ARC_BUFC_DATA);
 		excess -= chunk;
 	}
 
 	if (excess > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
 		chunk = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], excess);
 		(void) arc_evict(arc_mru, 0, chunk, FALSE,
 		    ARC_BUFC_METADATA);
 	}
 
 	/*
 	 * MFU: bounded by how far arc_size is over arc_c.
 	 */
 	excess = arc_size - arc_c;
 
 	if (excess > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
 		chunk = MIN(excess, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
 		(void) arc_evict(arc_mfu, 0, chunk, FALSE, ARC_BUFC_DATA);
 		excess -= chunk;
 	}
 
 	if (excess > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
 		chunk = MIN(excess,
 		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
 		(void) arc_evict(arc_mfu, 0, chunk, FALSE,
 		    ARC_BUFC_METADATA);
 	}
 
 	/*
 	 * Ghost lists: MRU + MRU ghost, then both ghosts together, are each
 	 * compared against arc_c.
 	 */
 	excess = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
 
 	if (excess > 0 && arc_mru_ghost->arcs_size > 0) {
 		chunk = MIN(arc_mru_ghost->arcs_size, excess);
 		arc_evict_ghost(arc_mru_ghost, 0, chunk);
 	}
 
 	excess =
 	    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
 
 	if (excess > 0 && arc_mfu_ghost->arcs_size > 0) {
 		chunk = MIN(arc_mfu_ghost->arcs_size, excess);
 		arc_evict_ghost(arc_mfu_ghost, 0, chunk);
 	}
 }
 
 /*
  * Process the bufs queued on arc_eviction_list: for each one, clear its
  * b_hdr, invoke its eviction callback (if still set) and return the
  * arc_buf_t to buf_cache.  The whole procedure is repeated for as long as
  * new entries have appeared on arc_eviction_list in the meantime.
  */
 static void
 arc_do_user_evicts(void)
 {
 	/*
 	 * NOTE(review): this is function-static, i.e. shared by all callers;
 	 * presumably callers are serialized (arc_flush() takes
 	 * arc_reclaim_thr_lock around the call) -- confirm.
 	 */
 	static arc_buf_t *tmp_arc_eviction_list;
 
 	/*
 	 * Move list over to avoid LOR
 	 */
 restart:
 	mutex_enter(&arc_eviction_mtx);
 	tmp_arc_eviction_list = arc_eviction_list;
 	arc_eviction_list = NULL;
 	mutex_exit(&arc_eviction_mtx);
 
 	/* the private list is walked without holding arc_eviction_mtx */
 	while (tmp_arc_eviction_list != NULL) {
 		arc_buf_t *buf = tmp_arc_eviction_list;
 		tmp_arc_eviction_list = buf->b_next;
 		mutex_enter(&buf->b_evict_lock);
 		buf->b_hdr = NULL;
 		mutex_exit(&buf->b_evict_lock);
 
 		/* the callback is required to succeed */
 		if (buf->b_efunc != NULL)
 			VERIFY(buf->b_efunc(buf) == 0);
 
 		buf->b_efunc = NULL;
 		buf->b_private = NULL;
 		kmem_cache_free(buf_cache, buf);
 	}
 
 	/* pick up anything queued while the private list was processed */
 	if (arc_eviction_list != NULL)
 		goto restart;
 }
 
 /*
  * Flush all *evictable* data from the cache for the given spa.
  * NOTE: this will not touch "active" (i.e. referenced) data.
  *
  * With a NULL spa every pool is flushed and eviction of each list is
  * repeated until the list's size drops to zero; with a specific spa each
  * list gets a single eviction pass.
  */
 void
 arc_flush(spa_t *spa)
 {
 	static const arc_buf_contents_t types[] = {
 		ARC_BUFC_DATA, ARC_BUFC_METADATA
 	};
 	arc_state_t *states[2];
 	uint64_t guid;
 	int s, t;
 
 	guid = spa ? spa_load_guid(spa) : 0;
 
 	/* MRU first, then MFU; within each, data before metadata. */
 	states[0] = arc_mru;
 	states[1] = arc_mfu;
 
 	for (s = 0; s < 2; s++) {
 		for (t = 0; t < 2; t++) {
 			while (states[s]->arcs_lsize[types[t]]) {
 				(void) arc_evict(states[s], guid, -1, FALSE,
 				    types[t]);
 				if (spa)
 					break;
 			}
 		}
 	}
 
 	arc_evict_ghost(arc_mru_ghost, guid, -1);
 	arc_evict_ghost(arc_mfu_ghost, guid, -1);
 
 	/* Run the eviction callbacks queued by the passes above. */
 	mutex_enter(&arc_reclaim_thr_lock);
 	arc_do_user_evicts();
 	mutex_exit(&arc_reclaim_thr_lock);
 	ASSERT(spa || arc_eviction_list == NULL);
 }
 
 /*
  * Lower the cache targets: arc_c is reduced by arc_c >> arc_shrink_shift
  * (but never below arc_c_min) and arc_p by arc_p >> arc_shrink_shift.
  * If the cache is then larger than arc_c, arc_adjust() is run.
  */
 void
 arc_shrink(void)
 {
 	if (arc_c > arc_c_min) {
 		/* The step is the same for kernel and userland builds. */
 		uint64_t shrink_by = arc_c >> arc_shrink_shift;
 
 		if (arc_c > arc_c_min + shrink_by)
 			atomic_add_64(&arc_c, -shrink_by);
 		else
 			arc_c = arc_c_min;
 
 		atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
 
 		/* Keep the target no higher than the current size ... */
 		if (arc_c > arc_size)
 			arc_c = MAX(arc_size, arc_c_min);
 		/* ... and arc_p no higher than the target. */
 		if (arc_p > arc_c)
 			arc_p = (arc_c >> 1);
 
 		ASSERT(arc_c >= arc_c_min);
 		ASSERT((int64_t)arc_p >= 0);
 	}
 
 	if (arc_size > arc_c)
 		arc_adjust();
 }
 
 /*
  * Non-zero makes arc_reclaim_needed() report that a reclaim is needed
  * (kernel builds).  arc_reclaim_thread() clears it and does a wakeup() on
  * its address after each pass.
  */
 static int needfree = 0;
 
 /*
  * Decide whether the ARC should give memory back.
  *
  * Returns 1 if a reclaim is needed, 0 otherwise.  In kernel builds that
  * is the case when needfree is set, when vm_paging_needed() says so, or
  * when a platform-specific memory check trips (on non-sun kernels: more
  * than 3/4 of kmem_size() is in use).  In userland builds the function
  * returns 1 whenever spa_get_random(100) yields 0.
  */
 static int
 arc_reclaim_needed(void)
 {
 #if defined(_KERNEL) && defined(sun)
 	/* only the sun-specific checks below use this */
 	uint64_t extra;
 #endif
 
 #ifdef _KERNEL
 
 	if (needfree)
 		return (1);
 
 	/*
 	 * Cooperate with pagedaemon when it's time for it to scan
 	 * and reclaim some pages.
 	 */
 	if (vm_paging_needed())
 		return (1);
 
 #ifdef sun
 	/*
 	 * take 'desfree' extra pages, so we reclaim sooner, rather than later
 	 */
 	extra = desfree;
 
 	/*
 	 * check that we're out of range of the pageout scanner.  It starts to
 	 * schedule paging if freemem is less than lotsfree and needfree.
 	 * lotsfree is the high-water mark for pageout, and needfree is the
 	 * number of needed free pages.  We add extra pages here to make sure
 	 * the scanner doesn't start up while we're freeing memory.
 	 */
 	if (freemem < lotsfree + needfree + extra)
 		return (1);
 
 	/*
 	 * check to make sure that swapfs has enough space so that anon
 	 * reservations can still succeed. anon_resvmem() checks that the
 	 * availrmem is greater than swapfs_minfree, and the number of reserved
 	 * swap pages.  We also add a bit of extra here just to prevent
 	 * circumstances from getting really dire.
 	 */
 	if (availrmem < swapfs_minfree + swapfs_reserve + extra)
 		return (1);
 
 #if defined(__i386)
 	/*
 	 * If we're on an i386 platform, it's possible that we'll exhaust the
 	 * kernel heap space before we ever run out of available physical
 	 * memory.  Most checks of the size of the heap_area compare against
 	 * tune.t_minarmem, which is the minimum available real memory that we
 	 * can have in the system.  However, this is generally fixed at 25 pages
 	 * which is so low that it's useless.  In this comparison, we seek to
 	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
 	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
 	 * free)
 	 */
 	if (btop(vmem_size(heap_arena, VMEM_FREE)) <
 	    (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
 		return (1);
 #endif
 #else	/* !sun */
 	/* reclaim once more than 3/4 of the kmem size is in use */
 	if (kmem_used() > (kmem_size() * 3) / 4)
 		return (1);
 #endif	/* sun */
 
 #else
 	/* userland build: report a reclaim at random */
 	if (spa_get_random(100) == 0)
 		return (1);
 #endif
 	return (0);
 }
 
 /*
  * zio buffer kmem caches, declared extern here so that
  * arc_kmem_reap_now() can reap them.
  */
 extern kmem_cache_t	*zio_buf_cache[];
 extern kmem_cache_t	*zio_data_buf_cache[];
 
 /*
  * Give cached-but-unused memory back: optionally shrink the ARC (for the
  * ARC_RECLAIM_AGGR strategy) and then reap the zio buffer caches and the
  * ARC's own buf and hdr caches.
  */
 static void
 arc_kmem_reap_now(arc_reclaim_strategy_t strat)
 {
 	kmem_cache_t		*last_buf_cache = NULL;
 	kmem_cache_t		*last_data_cache = NULL;
 	size_t			slot;
 
 #ifdef _KERNEL
 	if (arc_meta_used >= arc_meta_limit) {
 		/*
 		 * We are exceeding our meta-data cache limit.
 		 * Purge some DNLC entries to release holds on meta-data.
 		 */
 		dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
 	}
 #if defined(__i386)
 	/*
 	 * Reclaim unused memory from all kmem caches.
 	 */
 	kmem_reap();
 #endif
 #endif
 
 	/*
 	 * An aggressive reclamation will shrink the cache size as well as
 	 * reap free buffers from the arc kmem caches.
 	 */
 	if (strat == ARC_RECLAIM_AGGR)
 		arc_shrink();
 
 	/*
 	 * Consecutive slots may refer to the same cache; reap a cache only
 	 * when it differs from the one in the previous slot.
 	 */
 	for (slot = 0; slot < (SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 	    slot++) {
 		if (zio_buf_cache[slot] != last_buf_cache) {
 			last_buf_cache = zio_buf_cache[slot];
 			kmem_cache_reap_now(last_buf_cache);
 		}
 		if (zio_data_buf_cache[slot] != last_data_cache) {
 			last_data_cache = zio_data_buf_cache[slot];
 			kmem_cache_reap_now(last_data_cache);
 		}
 	}
 
 	kmem_cache_reap_now(buf_cache);
 	kmem_cache_reap_now(hdr_cache);
 }
 
 /*
  * Body of the ARC reclaim thread.
  *
  * Runs until arc_thread_exit becomes non-zero.  Each pass -- triggered by
  * a signal on arc_reclaim_thr_cv or by the one-second timeout -- it
  * reaps/shrinks if arc_reclaim_needed(), re-enables cache growth once the
  * grow-retry delay has expired, runs arc_adjust(), processes pending user
  * evictions and wakes anyone sleeping on needfree.  On exit it clears
  * arc_thread_exit and broadcasts on arc_reclaim_thr_cv.
  */
 static void
 arc_reclaim_thread(void *dummy __unused)
 {
 	/* lbolt time after which growth may be re-enabled */
 	clock_t			growtime = 0;
 	arc_reclaim_strategy_t	last_reclaim = ARC_RECLAIM_CONS;
 	callb_cpr_t		cpr;
 
 	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
 
 	mutex_enter(&arc_reclaim_thr_lock);
 	while (arc_thread_exit == 0) {
 		if (arc_reclaim_needed()) {
 
 			if (arc_no_grow) {
 				/*
 				 * Growth is already disabled: alternate
 				 * between the two strategies.
 				 */
 				if (last_reclaim == ARC_RECLAIM_CONS) {
 					last_reclaim = ARC_RECLAIM_AGGR;
 				} else {
 					last_reclaim = ARC_RECLAIM_CONS;
 				}
 			} else {
 				/* first reclaim: stop growing, be aggressive */
 				arc_no_grow = TRUE;
 				last_reclaim = ARC_RECLAIM_AGGR;
 				membar_producer();
 			}
 
 			/* reset the growth delay for every reclaim */
 			growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
 
 			if (needfree && last_reclaim == ARC_RECLAIM_CONS) {
 				/*
 				 * If needfree is TRUE our vm_lowmem hook
 				 * was called and in that case we must free some
 				 * memory, so switch to aggressive mode.
 				 */
 				arc_no_grow = TRUE;
 				last_reclaim = ARC_RECLAIM_AGGR;
 			}
 			arc_kmem_reap_now(last_reclaim);
 			arc_warm = B_TRUE;
 
 		} else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
 			/* no pressure and the delay has passed: allow growth */
 			arc_no_grow = FALSE;
 		}
 
 		arc_adjust();
 
 		if (arc_eviction_list != NULL)
 			arc_do_user_evicts();
 
 #ifdef _KERNEL
 		if (needfree) {
 			needfree = 0;
 			wakeup(&needfree);
 		}
 #endif
 
 		/* block until needed, or one second, whichever is shorter */
 		CALLB_CPR_SAFE_BEGIN(&cpr);
 		(void) cv_timedwait(&arc_reclaim_thr_cv,
 		    &arc_reclaim_thr_lock, hz);
 		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
 	}
 
 	/* acknowledge the exit request to whoever is waiting on the cv */
 	arc_thread_exit = 0;
 	cv_broadcast(&arc_reclaim_thr_cv);
 	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_thr_lock */
 	thread_exit();
 }
 
 /*
  * Adapt arc info given the number of bytes we are trying to add and
  * the state that we are coming from.  This function is only called
  * when we are adding new content to the cache.
  *
  * `bytes' must be positive.  Nothing is done for the arc_l2c_only state.
  * arc_p is moved for hits in either ghost state; arc_c (and, for
  * anonymous buffers, arc_p) is grown only when no reclaim is needed,
  * growth is not disabled, and arc_c is still below arc_c_max.
  */
 static void
 arc_adapt(int bytes, arc_state_t *state)
 {
 	int mult;
 	/* lower bound kept for arc_p, and for the space left above it */
 	uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
 
 	if (state == arc_l2c_only)
 		return;
 
 	ASSERT(bytes > 0);
 	/*
 	 * Adapt the target size of the MRU list:
 	 *	- if we just hit in the MRU ghost list, then increase
 	 *	  the target size of the MRU list.
 	 *	- if we just hit in the MFU ghost list, then increase
 	 *	  the target size of the MFU list by decreasing the
 	 *	  target size of the MRU list.
 	 */
 	if (state == arc_mru_ghost) {
 		/* scale the step by the ratio of the ghost list sizes */
 		mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
 		    1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
 		mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
 
 		arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
 	} else if (state == arc_mfu_ghost) {
 		uint64_t delta;
 
 		mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
 		    1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
 		mult = MIN(mult, 10);
 
 		/* never subtract more than arc_p itself */
 		delta = MIN(bytes * mult, arc_p);
 		arc_p = MAX(arc_p_min, arc_p - delta);
 	}
 	ASSERT((int64_t)arc_p >= 0);
 
 	/* under memory pressure: kick the reclaim thread and do not grow */
 	if (arc_reclaim_needed()) {
 		cv_signal(&arc_reclaim_thr_cv);
 		return;
 	}
 
 	if (arc_no_grow)
 		return;
 
 	if (arc_c >= arc_c_max)
 		return;
 
 	/*
 	 * If we're within (2 * maxblocksize) bytes of the target
 	 * cache size, increment the target cache size
 	 */
 	if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
 		atomic_add_64(&arc_c, (int64_t)bytes);
 		if (arc_c > arc_c_max)
 			arc_c = arc_c_max;
 		else if (state == arc_anon)
 			atomic_add_64(&arc_p, (int64_t)bytes);
 		if (arc_p > arc_c)
 			arc_p = arc_c;
 	}
 	ASSERT((int64_t)arc_p >= 0);
 }
 
 /*
  * Check if the cache has reached its limits and eviction is required
  * prior to insert.
  *
  * Returns non-zero when eviction is needed for a buffer of the given
  * type, zero otherwise.
  */
 static int
 arc_evict_needed(arc_buf_contents_t type)
 {
 	/* Metadata has its own limit. */
 	if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
 		return (1);
 
 #if defined(sun) && defined(_KERNEL)
 	/*
 	 * If zio data pages are being allocated out of a separate heap segment,
 	 * then enforce that the size of available vmem for this area remains
 	 * above about 1/32nd free.
 	 */
 	if (type == ARC_BUFC_DATA && zio_arena != NULL &&
 	    vmem_size(zio_arena, VMEM_FREE) <
 	    (vmem_size(zio_arena, VMEM_ALLOC) >> 5))
 		return (1);
 #endif	/* sun && _KERNEL */
 
 	/* Memory pressure, or the cache being over its target size. */
 	return (arc_reclaim_needed() || arc_size > arc_c);
 }
 
 /*
  * The buffer, supplied as the first argument, needs a data block.
  * So, if we are at cache max, determine which cache should be victimized.
  * We have the following cases:
  *
  * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
  * In this situation if we're out of space, but the resident size of the MFU is
  * under the limit, victimize the MFU cache to satisfy this insertion request.
  *
  * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
  * Here, we've used up all of the available space for the MRU, so we need to
  * evict from our own cache instead.  Evict from the set of resident MRU
  * entries.
  *
  * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
  * c minus p represents the MFU space in the cache, since p is the size of the
  * cache that is dedicated to the MRU.  In this situation there's still space on
  * the MFU side, so the MRU side needs to be victimized.
  *
  * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
  * MFU's resident set is consuming more space than it has been allotted.  In
  * this situation, we must victimize our own cache, the MFU, for this insertion.
  *
  * On return buf->b_data has been assigned and, unless the header is in a
  * ghost state, the size counters of the header's state have been updated.
  */
 static void
 arc_get_data_buf(arc_buf_t *buf)
 {
 	arc_state_t		*state = buf->b_hdr->b_state;
 	uint64_t		size = buf->b_hdr->b_size;
 	arc_buf_contents_t	type = buf->b_hdr->b_type;
 
 	/* Give arc_adapt() a chance to adjust the cache targets first. */
 	arc_adapt(size, state);
 
 	/*
 	 * We have not yet reached cache maximum size,
 	 * just allocate a new buffer.
 	 */
 	if (!arc_evict_needed(type)) {
 		if (type == ARC_BUFC_METADATA) {
 			/* Metadata is accounted through arc_space_consume(). */
 			buf->b_data = zio_buf_alloc(size);
 			arc_space_consume(size, ARC_SPACE_DATA);
 		} else {
 			/* Data updates arcstat_data_size and arc_size directly. */
 			ASSERT(type == ARC_BUFC_DATA);
 			buf->b_data = zio_data_buf_alloc(size);
 			ARCSTAT_INCR(arcstat_data_size, size);
 			atomic_add_64(&arc_size, size);
 		}
 		goto out;
 	}
 
 	/*
 	 * If we are prefetching from the mfu ghost list, this buffer
 	 * will end up on the mru list; so steal space from there.
 	 */
 	if (state == arc_mfu_ghost)
 		state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
 	else if (state == arc_mru_ghost)
 		state = arc_mru;
 
 	/*
 	 * Pick the victim state (cases 1-4 above).  The other side is only
 	 * chosen when its evictable bytes of this type can cover the request.
 	 */
 	if (state == arc_mru || state == arc_anon) {
 		uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
 		state = (arc_mfu->arcs_lsize[type] >= size &&
 		    arc_p > mru_used) ? arc_mfu : arc_mru;
 	} else {
 		/* MFU cases */
 		uint64_t mfu_space = arc_c - arc_p;
 		state =  (arc_mru->arcs_lsize[type] >= size &&
 		    mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
 	}
 	/*
 	 * Ask arc_evict() for a data block taken from the victim state.  If it
 	 * returns NULL, fall back to a fresh allocation (same accounting as
 	 * above) and count a recycle miss.
 	 */
 	if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
 		if (type == ARC_BUFC_METADATA) {
 			buf->b_data = zio_buf_alloc(size);
 			arc_space_consume(size, ARC_SPACE_DATA);
 		} else {
 			ASSERT(type == ARC_BUFC_DATA);
 			buf->b_data = zio_data_buf_alloc(size);
 			ARCSTAT_INCR(arcstat_data_size, size);
 			atomic_add_64(&arc_size, size);
 		}
 		ARCSTAT_BUMP(arcstat_recycle_miss);
 	}
 	ASSERT(buf->b_data != NULL);
 out:
 	/*
 	 * Update the state size.  Note that ghost states have a
 	 * "ghost size" and so don't need to be updated.
 	 */
 	if (!GHOST_STATE(buf->b_hdr->b_state)) {
 		arc_buf_hdr_t *hdr = buf->b_hdr;
 
 		atomic_add_64(&hdr->b_state->arcs_size, size);
 		/* A header still linked on a state list also counts as evictable. */
 		if (list_link_active(&hdr->b_arc_node)) {
 			ASSERT(refcount_is_zero(&hdr->b_refcnt));
 			atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
 		}
 		/*
 		 * If we are growing the cache, and we are adding anonymous
 		 * data, and we have outgrown arc_p, update arc_p
 		 */
 		if (arc_size < arc_c && hdr->b_state == arc_anon &&
 		    arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
 			arc_p = MIN(arc_c, arc_p + size);
 	}
 	ARCSTAT_BUMP(arcstat_allocated);
 }
 
 /*
  * This routine is called whenever a buffer is accessed.
  * NOTE: the hash lock is dropped in this function.
  *
  * NOTE(review): no mutex_exit(hash_lock) is visible in this function, and
  * callers in this file (e.g. arc_read(), arc_write_done()) release the
  * hash lock themselves after calling arc_access() -- confirm whether the
  * note above is stale.
  *
  * The header's current state selects the transition: anon -> mru,
  * mru -> mfu (after ARC_MINTIME), ghost states back to mru/mfu, and
  * l2c_only -> mfu.  b_arc_access is refreshed on every taken path except
  * a non-prefetch MRU hit within ARC_MINTIME.
  */
 static void
 arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
 {
 	clock_t now;
 
 	ASSERT(MUTEX_HELD(hash_lock));
 
 	if (buf->b_state == arc_anon) {
 		/*
 		 * This buffer is not in the cache, and does not
 		 * appear in our "ghost" list.  Add the new buffer
 		 * to the MRU state.
 		 */
 
 		ASSERT(buf->b_arc_access == 0);
 		buf->b_arc_access = ddi_get_lbolt();
 		DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
 		arc_change_state(arc_mru, buf, hash_lock);
 
 	} else if (buf->b_state == arc_mru) {
 		now = ddi_get_lbolt();
 
 		/*
 		 * If this buffer is here because of a prefetch, then either:
 		 * - clear the flag if this is a "referencing" read
 		 *   (any subsequent access will bump this into the MFU state).
 		 * or
 		 * - move the buffer to the head of the list if this is
 		 *   another prefetch (to make it less likely to be evicted).
 		 */
 		if ((buf->b_flags & ARC_PREFETCH) != 0) {
 			if (refcount_count(&buf->b_refcnt) == 0) {
 				/*
 				 * NOTE(review): only list membership is
 				 * asserted here; no explicit move to the head
 				 * of the list is visible in this function.
 				 */
 				ASSERT(list_link_active(&buf->b_arc_node));
 			} else {
 				buf->b_flags &= ~ARC_PREFETCH;
 				ARCSTAT_BUMP(arcstat_mru_hits);
 			}
 			buf->b_arc_access = now;
 			return;
 		}
 
 		/*
 		 * This buffer has been "accessed" only once so far,
 		 * but it is still in the cache. Move it to the MFU
 		 * state.
 		 */
 		if (now > buf->b_arc_access + ARC_MINTIME) {
 			/*
 			 * More than 125ms have passed since we
 			 * instantiated this buffer.  Move it to the
 			 * most frequently used state.
 			 */
 			buf->b_arc_access = now;
 			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
 			arc_change_state(arc_mfu, buf, hash_lock);
 		}
 		ARCSTAT_BUMP(arcstat_mru_hits);
 	} else if (buf->b_state == arc_mru_ghost) {
 		arc_state_t	*new_state;
 		/*
 		 * This buffer has been "accessed" recently, but
 		 * was evicted from the cache.  Move it to the
 		 * MFU state.
 		 */
 
 		/* A prefetch hit goes back to MRU instead of MFU. */
 		if (buf->b_flags & ARC_PREFETCH) {
 			new_state = arc_mru;
 			if (refcount_count(&buf->b_refcnt) > 0)
 				buf->b_flags &= ~ARC_PREFETCH;
 			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
 		} else {
 			new_state = arc_mfu;
 			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
 		}
 
 		buf->b_arc_access = ddi_get_lbolt();
 		arc_change_state(new_state, buf, hash_lock);
 
 		ARCSTAT_BUMP(arcstat_mru_ghost_hits);
 	} else if (buf->b_state == arc_mfu) {
 		/*
 		 * This buffer has been accessed more than once and is
 		 * still in the cache.  Keep it in the MFU state.
 		 *
 		 * NOTE: an add_reference() that occurred when we did
 		 * the arc_read() will have kicked this off the list.
 		 * If it was a prefetch, we will explicitly move it to
 		 * the head of the list now.
 		 */
 		if ((buf->b_flags & ARC_PREFETCH) != 0) {
 			ASSERT(refcount_count(&buf->b_refcnt) == 0);
 			ASSERT(list_link_active(&buf->b_arc_node));
 		}
 		ARCSTAT_BUMP(arcstat_mfu_hits);
 		buf->b_arc_access = ddi_get_lbolt();
 	} else if (buf->b_state == arc_mfu_ghost) {
 		arc_state_t	*new_state = arc_mfu;
 		/*
 		 * This buffer has been accessed more than once but has
 		 * been evicted from the cache.  Move it back to the
 		 * MFU state.
 		 */
 
 		if (buf->b_flags & ARC_PREFETCH) {
 			/*
 			 * This is a prefetch access...
 			 * move this block back to the MRU state.
 			 */
 			ASSERT0(refcount_count(&buf->b_refcnt));
 			new_state = arc_mru;
 		}
 
 		buf->b_arc_access = ddi_get_lbolt();
 		/*
 		 * NOTE(review): the new_state__mfu probe fires even when
 		 * new_state was switched to arc_mru just above.
 		 */
 		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
 		arc_change_state(new_state, buf, hash_lock);
 
 		ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
 	} else if (buf->b_state == arc_l2c_only) {
 		/*
 		 * This buffer is on the 2nd Level ARC.
 		 */
 
 		buf->b_arc_access = ddi_get_lbolt();
 		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
 		arc_change_state(arc_mfu, buf, hash_lock);
 	} else {
 		ASSERT(!"invalid arc state");
 	}
 }
 
 /*
  * Generic arc_done_func_t for callers that want a private copy: when the
  * read succeeded (or no zio was involved) the buffer contents are copied
  * into `arg'; in every case the reference tagged with `arg' is dropped.
  */
 /* ARGSUSED */
 void
 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
 {
 	int read_failed = (zio != NULL && zio->io_error != 0);
 
 	if (!read_failed)
 		bcopy(buf->b_data, arg, buf->b_hdr->b_size);
 
 	VERIFY(arc_buf_remove_ref(buf, arg) == 1);
 }
 
 /*
  * Generic arc_done_func_t that hands the buffer back through `arg', which
  * is treated as an arc_buf_t **.  On a failed read the reference tagged
  * with `arg' is dropped and NULL is stored instead.
  */
 void
 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
 {
 	arc_buf_t **result = arg;
 
 	/* No zio, or a zio without error: pass the buffer through. */
 	if (zio == NULL || zio->io_error == 0) {
 		*result = buf;
 		ASSERT(buf->b_data);
 		return;
 	}
 
 	VERIFY(arc_buf_remove_ref(buf, arg) == 1);
 	*result = NULL;
 }
 
 /*
  * Completion callback for the read zio that arc_read() issues through
  * zio_read(); the arc_buf_t being filled arrives in zio->io_private.
  *
  * Byteswaps the data if the block pointer requires it, calls
  * arc_cksum_compute(), assigns a buffer to every queued callback that has
  * a done function (the first gets the original buffer, later ones get
  * clones), clears ARC_IO_IN_PROGRESS, broadcasts on hdr->b_cv, and finally
  * runs and frees the callbacks.  On an I/O error the header is flagged
  * ARC_IO_ERROR, moved to arc_anon and removed from the hash table; it is
  * destroyed at the end when nothing references it.
  */
 static void
 arc_read_done(zio_t *zio)
 {
 	arc_buf_hdr_t	*hdr, *found;
 	arc_buf_t	*buf;
 	arc_buf_t	*abuf;	/* buffer we're assigning to callback */
 	kmutex_t	*hash_lock;
 	arc_callback_t	*callback_list, *acb;
 	int		freeable = FALSE;
 
 	buf = zio->io_private;
 	hdr = buf->b_hdr;
 
 	/*
 	 * The hdr was inserted into hash-table and removed from lists
 	 * prior to starting I/O.  We should find this header, since
 	 * it's in the hash table, and it should be legit since it's
 	 * not possible to evict it during the I/O.  The only possible
 	 * reason for it not to be found is if we were freed during the
 	 * read.
 	 */
 	found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth,
 	    &hash_lock);
 
 	ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
 	    (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
 	    (found == hdr && HDR_L2_READING(hdr)));
 
 	hdr->b_flags &= ~ARC_L2_EVICTED;
 	/* With l2arc_noprefetch set, prefetched buffers lose ARC_L2CACHE. */
 	if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
 		hdr->b_flags &= ~ARC_L2CACHE;
 
 	/* byteswap if necessary */
 	callback_list = hdr->b_acb;
 	ASSERT(callback_list != NULL);
 	if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
 		dmu_object_byteswap_t bswap =
 		    DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
 		/* Blocks above level 0 are swapped as uint64 arrays. */
 		arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
 		    byteswap_uint64_array :
 		    dmu_ot_byteswap[bswap].ob_func;
 		func(buf->b_data, hdr->b_size);
 	}
 
 	arc_cksum_compute(buf, B_FALSE);
 #ifdef illumos
 	arc_buf_watch(buf);
 #endif /* illumos */
 
 	if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
 		/*
 		 * Only call arc_access on anonymous buffers.  This is because
 		 * if we've issued an I/O for an evicted buffer, we've already
 		 * called arc_access (to prevent any simultaneous readers from
 		 * getting confused).
 		 */
 		arc_access(hdr, hash_lock);
 	}
 
 	/* create copies of the data buffer for the callers */
 	abuf = buf;
 	for (acb = callback_list; acb; acb = acb->acb_next) {
 		if (acb->acb_done) {
 			/* abuf is NULL once the original has been handed out. */
 			if (abuf == NULL) {
 				ARCSTAT_BUMP(arcstat_duplicate_reads);
 				abuf = arc_buf_clone(buf);
 			}
 			acb->acb_buf = abuf;
 			abuf = NULL;
 		}
 	}
 	hdr->b_acb = NULL;
 	hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
 	ASSERT(!HDR_BUF_AVAILABLE(hdr));
 	/* No callback took the original buffer, so mark it available. */
 	if (abuf == buf) {
 		ASSERT(buf->b_efunc == NULL);
 		ASSERT(hdr->b_datacnt == 1);
 		hdr->b_flags |= ARC_BUF_AVAILABLE;
 	}
 
 	ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
 
 	if (zio->io_error != 0) {
 		hdr->b_flags |= ARC_IO_ERROR;
 		if (hdr->b_state != arc_anon)
 			arc_change_state(arc_anon, hdr, hash_lock);
 		if (HDR_IN_HASH_TABLE(hdr))
 			buf_hash_remove(hdr);
 		freeable = refcount_is_zero(&hdr->b_refcnt);
 	}
 
 	/*
 	 * Broadcast before we drop the hash_lock to avoid the possibility
 	 * that the hdr (and hence the cv) might be freed before we get to
 	 * the cv_broadcast().
 	 */
 	cv_broadcast(&hdr->b_cv);
 
 	if (hash_lock) {
 		mutex_exit(hash_lock);
 	} else {
 		/*
 		 * This block was freed while we waited for the read to
 		 * complete.  It has been removed from the hash table and
 		 * moved to the anonymous state (so that it won't show up
 		 * in the cache).
 		 */
 		ASSERT3P(hdr->b_state, ==, arc_anon);
 		freeable = refcount_is_zero(&hdr->b_refcnt);
 	}
 
 	/* execute each callback and free its structure */
 	while ((acb = callback_list) != NULL) {
 		if (acb->acb_done)
 			acb->acb_done(zio, acb->acb_buf, acb->acb_private);
 
 		/* Propagate the read's error to the waiter's dummy zio. */
 		if (acb->acb_zio_dummy != NULL) {
 			acb->acb_zio_dummy->io_error = zio->io_error;
 			zio_nowait(acb->acb_zio_dummy);
 		}
 
 		callback_list = acb->acb_next;
 		kmem_free(acb, sizeof (arc_callback_t));
 	}
 
 	if (freeable)
 		arc_hdr_destroy(hdr);
 }
 
 /*
  * "Read" the block at the specified DVA (in bp) via the
  * cache.  If the block is found in the cache, invoke the provided
  * callback immediately and return.  Note that the `zio' parameter
  * in the callback will be NULL in this case, since no IO was
  * required.  If the block is not in the cache pass the read request
  * on to the spa with a substitute callback function, so that the
  * requested block will be added to the cache.
  *
  * If a read request arrives for a block that has a read in-progress,
  * either wait for the in-progress read to complete (and return the
  * results); or, if this is a read with a "done" func, add a record
  * to the read to invoke the "done" func when the read completes,
  * and return; or just return.
  *
  * arc_read_done() will invoke all the requested "done" functions
  * for readers of this block.
  *
  * *arc_flags is read for ARC_WAIT/ARC_NOWAIT/ARC_PREFETCH/ARC_L2CACHE and
  * gets ARC_CACHED or'ed in on a cache hit.  Returns 0, except for an
  * ARC_WAIT read that goes through zio_read(), where the zio_wait() result
  * is returned.
  */
 int
 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
     void *private, int priority, int zio_flags, uint32_t *arc_flags,
     const zbookmark_t *zb)
 {
 	arc_buf_hdr_t *hdr;
-	arc_buf_t *buf;
+	arc_buf_t *buf = NULL;
 	kmutex_t *hash_lock;
 	zio_t *rzio;
 	uint64_t guid = spa_load_guid(spa);
 
 	/* Restart point after waiting on an in-flight read or losing a race. */
 top:
 	hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
 	    &hash_lock);
 	/* Cache hit: the header exists and has at least one data buffer. */
 	if (hdr && hdr->b_datacnt > 0) {
 
 		*arc_flags |= ARC_CACHED;
 
 		if (HDR_IO_IN_PROGRESS(hdr)) {
 
 			/* Synchronous caller: wait for the I/O, then retry. */
 			if (*arc_flags & ARC_WAIT) {
 				cv_wait(&hdr->b_cv, hash_lock);
 				mutex_exit(hash_lock);
 				goto top;
 			}
 			ASSERT(*arc_flags & ARC_NOWAIT);
 
 			/* Queue a callback on the in-flight read. */
 			if (done) {
 				arc_callback_t	*acb = NULL;
 
 				acb = kmem_zalloc(sizeof (arc_callback_t),
 				    KM_SLEEP);
 				acb->acb_done = done;
 				acb->acb_private = private;
 				if (pio != NULL)
 					acb->acb_zio_dummy = zio_null(pio,
 					    spa, NULL, NULL, NULL, zio_flags);
 
 				ASSERT(acb->acb_done != NULL);
 				acb->acb_next = hdr->b_acb;
 				hdr->b_acb = acb;
 				add_reference(hdr, hash_lock, private);
 				mutex_exit(hash_lock);
 				return (0);
 			}
 			mutex_exit(hash_lock);
 			return (0);
 		}
 
 		ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
 
 		if (done) {
 			add_reference(hdr, hash_lock, private);
 			/*
 			 * If this block is already in use, create a new
 			 * copy of the data so that we will be guaranteed
 			 * that arc_release() will always succeed.
 			 */
 			buf = hdr->b_buf;
 			ASSERT(buf);
 			ASSERT(buf->b_data);
 			if (HDR_BUF_AVAILABLE(hdr)) {
 				ASSERT(buf->b_efunc == NULL);
 				hdr->b_flags &= ~ARC_BUF_AVAILABLE;
 			} else {
 				buf = arc_buf_clone(buf);
 			}
 
 		} else if (*arc_flags & ARC_PREFETCH &&
 		    refcount_count(&hdr->b_refcnt) == 0) {
 			hdr->b_flags |= ARC_PREFETCH;
 		}
 		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
 		arc_access(hdr, hash_lock);
 		if (*arc_flags & ARC_L2CACHE)
 			hdr->b_flags |= ARC_L2CACHE;
 		mutex_exit(hash_lock);
 		ARCSTAT_BUMP(arcstat_hits);
 		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
 		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
 		    data, metadata, hits);
 
 		/* No I/O was needed, so the callback gets a NULL zio. */
 		if (done)
 			done(NULL, buf, private);
 	} else {
 		/* Miss: no header at all, or a header without data. */
 		uint64_t size = BP_GET_LSIZE(bp);
 		arc_callback_t	*acb;
 		vdev_t *vd = NULL;
-		uint64_t addr;
+		uint64_t addr = 0;
 		boolean_t devw = B_FALSE;
 
 		if (hdr == NULL) {
 			/* this block is not in the cache */
 			arc_buf_hdr_t	*exists;
 			arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
 			buf = arc_buf_alloc(spa, size, private, type);
 			hdr = buf->b_hdr;
 			hdr->b_dva = *BP_IDENTITY(bp);
 			hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
 			hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
 			exists = buf_hash_insert(hdr, &hash_lock);
 			if (exists) {
 				/* somebody beat us to the hash insert */
 				mutex_exit(hash_lock);
 				buf_discard_identity(hdr);
 				(void) arc_buf_remove_ref(buf, private);
 				goto top; /* restart the IO request */
 			}
 			/* if this is a prefetch, we don't have a reference */
 			if (*arc_flags & ARC_PREFETCH) {
 				(void) remove_reference(hdr, hash_lock,
 				    private);
 				hdr->b_flags |= ARC_PREFETCH;
 			}
 			if (*arc_flags & ARC_L2CACHE)
 				hdr->b_flags |= ARC_L2CACHE;
 			if (BP_GET_LEVEL(bp) > 0)
 				hdr->b_flags |= ARC_INDIRECT;
 		} else {
 			/* this block is in the ghost cache */
 			ASSERT(GHOST_STATE(hdr->b_state));
 			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
 			ASSERT0(refcount_count(&hdr->b_refcnt));
 			ASSERT(hdr->b_buf == NULL);
 
 			/* if this is a prefetch, we don't have a reference */
 			if (*arc_flags & ARC_PREFETCH)
 				hdr->b_flags |= ARC_PREFETCH;
 			else
 				add_reference(hdr, hash_lock, private);
 			if (*arc_flags & ARC_L2CACHE)
 				hdr->b_flags |= ARC_L2CACHE;
 			/* Attach a fresh arc_buf_t and data block to the hdr. */
 			buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
 			buf->b_hdr = hdr;
 			buf->b_data = NULL;
 			buf->b_efunc = NULL;
 			buf->b_private = NULL;
 			buf->b_next = NULL;
 			hdr->b_buf = buf;
 			ASSERT(hdr->b_datacnt == 0);
 			hdr->b_datacnt = 1;
 			arc_get_data_buf(buf);
 			arc_access(hdr, hash_lock);
 		}
 
 		ASSERT(!GHOST_STATE(hdr->b_state));
 
 		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
 		acb->acb_done = done;
 		acb->acb_private = private;
 
 		ASSERT(hdr->b_acb == NULL);
 		hdr->b_acb = acb;
 		hdr->b_flags |= ARC_IO_IN_PROGRESS;
 
 		/*
 		 * Snapshot the L2ARC device, write state and device address
 		 * while the hash lock is still held.
 		 */
 		if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
 		    (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
 			devw = hdr->b_l2hdr->b_dev->l2ad_writing;
 			addr = hdr->b_l2hdr->b_daddr;
 			/*
 			 * Lock out device removal.
 			 */
 			if (vdev_is_dead(vd) ||
 			    !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
 				vd = NULL;
 		}
 
 		mutex_exit(hash_lock);
 
 		ASSERT3U(hdr->b_size, ==, size);
 		DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
 		    uint64_t, size, zbookmark_t *, zb);
 		ARCSTAT_BUMP(arcstat_misses);
 		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
 		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
 		    data, metadata, misses);
 #ifdef _KERNEL
 		curthread->td_ru.ru_inblock++;
 #endif
 
 		if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
 			/*
 			 * Read from the L2ARC if the following are true:
 			 * 1. The L2ARC vdev was previously cached.
 			 * 2. This buffer still has L2ARC metadata.
 			 * 3. This buffer isn't currently writing to the L2ARC.
 			 * 4. The L2ARC entry wasn't evicted, which may
 			 *    also have invalidated the vdev.
 			 * 5. This isn't prefetch and l2arc_noprefetch is set.
 			 */
 			if (hdr->b_l2hdr != NULL &&
 			    !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
 			    !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
 				l2arc_read_callback_t *cb;
 
 				DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
 				ARCSTAT_BUMP(arcstat_l2_hits);
 
 				cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
 				    KM_SLEEP);
 				cb->l2rcb_buf = buf;
 				cb->l2rcb_spa = spa;
 				cb->l2rcb_bp = *bp;
 				/* zb is dereferenced here, so it must not be NULL. */
 				cb->l2rcb_zb = *zb;
 				cb->l2rcb_flags = zio_flags;
 
+				ASSERT(addr >= VDEV_LABEL_START_SIZE &&
+				    addr + size < vd->vdev_psize -
+				    VDEV_LABEL_END_SIZE);
+
 				/*
 				 * l2arc read.  The SCL_L2ARC lock will be
 				 * released by l2arc_read_done().
 				 */
 				rzio = zio_read_phys(pio, vd, addr, size,
 				    buf->b_data, ZIO_CHECKSUM_OFF,
 				    l2arc_read_done, cb, priority, zio_flags |
 				    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
 				    ZIO_FLAG_DONT_PROPAGATE |
 				    ZIO_FLAG_DONT_RETRY, B_FALSE);
 				DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
 				    zio_t *, rzio);
 				ARCSTAT_INCR(arcstat_l2_read_bytes, size);
 
 				if (*arc_flags & ARC_NOWAIT) {
 					zio_nowait(rzio);
 					return (0);
 				}
 
 				ASSERT(*arc_flags & ARC_WAIT);
 				if (zio_wait(rzio) == 0)
 					return (0);
 
 				/* l2arc read error; goto zio_read() */
 			} else {
 				DTRACE_PROBE1(l2arc__miss,
 				    arc_buf_hdr_t *, hdr);
 				ARCSTAT_BUMP(arcstat_l2_misses);
 				if (HDR_L2_WRITING(hdr))
 					ARCSTAT_BUMP(arcstat_l2_rw_clash);
 				spa_config_exit(spa, SCL_L2ARC, vd);
 			}
 		} else {
 			/* Not reading from L2ARC: drop the config lock if held. */
 			if (vd != NULL)
 				spa_config_exit(spa, SCL_L2ARC, vd);
 			if (l2arc_ndev != 0) {
 				DTRACE_PROBE1(l2arc__miss,
 				    arc_buf_hdr_t *, hdr);
 				ARCSTAT_BUMP(arcstat_l2_misses);
 			}
 		}
 
 		/* Regular pool read; arc_read_done() completes the buffer. */
 		rzio = zio_read(pio, spa, bp, buf->b_data, size,
 		    arc_read_done, buf, priority, zio_flags, zb);
 
 		if (*arc_flags & ARC_WAIT)
 			return (zio_wait(rzio));
 
 		ASSERT(*arc_flags & ARC_NOWAIT);
 		zio_nowait(rzio);
 	}
 	return (0);
 }
 
 /*
  * Register an eviction callback (func) and its argument (private) on a
  * buffer.  As asserted below, the buffer must belong to a non-anonymous
  * header, must not already carry a callback, must not be marked
  * available, and must be referenced unless func is NULL.
  */
 void
 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
 {
 	ASSERT(buf->b_hdr != NULL);
 	ASSERT(buf->b_hdr->b_state != arc_anon);
 	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
 	ASSERT(buf->b_efunc == NULL);
 	ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
 
 	buf->b_efunc = func;
 	buf->b_private = private;
 }
 
 /*
  * This is used by the DMU to let the ARC know that a buffer is
  * being evicted, so the ARC should clean up.  If this arc buf
  * is not yet in the evicted state, it will be put there.
  *
  * Returns 0 when the buffer no longer has a header (it is being handled
  * by arc_do_user_evicts()); otherwise returns 1 after the buffer's
  * eviction callback has been invoked.
  */
 int
 arc_buf_evict(arc_buf_t *buf)
 {
 	arc_buf_hdr_t *hdr;
 	kmutex_t *hash_lock;
 	arc_buf_t **bufp;
 	list_t *list, *evicted_list;
 	kmutex_t *lock, *evicted_lock;
 
 	mutex_enter(&buf->b_evict_lock);
 	hdr = buf->b_hdr;
 	if (hdr == NULL) {
 		/*
 		 * We are in arc_do_user_evicts().
 		 */
 		ASSERT(buf->b_data == NULL);
 		mutex_exit(&buf->b_evict_lock);
 		return (0);
 	} else if (buf->b_data == NULL) {
 		arc_buf_t copy = *buf; /* structure assignment */
 		/*
 		 * We are on the eviction list; process this buffer now
 		 * but let arc_do_user_evicts() do the reaping.
 		 */
 		buf->b_efunc = NULL;
 		mutex_exit(&buf->b_evict_lock);
 		/* Run the callback on the copy, outside b_evict_lock. */
 		VERIFY(copy.b_efunc(&copy) == 0);
 		return (1);
 	}
 	hash_lock = HDR_LOCK(hdr);
 	mutex_enter(hash_lock);
 	/* Re-read the header now that its hash lock is held. */
 	hdr = buf->b_hdr;
 	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
 
 	ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
 	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
 
 	/*
 	 * Pull this buffer off of the hdr
 	 */
 	bufp = &hdr->b_buf;
 	while (*bufp != buf)
 		bufp = &(*bufp)->b_next;
 	*bufp = buf->b_next;
 
 	ASSERT(buf->b_data != NULL);
 	arc_buf_destroy(buf, FALSE, FALSE);
 
 	/* Last data buffer gone: move the header to the matching ghost state. */
 	if (hdr->b_datacnt == 0) {
 		arc_state_t *old_state = hdr->b_state;
 		arc_state_t *evicted_state;
 
 		ASSERT(hdr->b_buf == NULL);
 		ASSERT(refcount_is_zero(&hdr->b_refcnt));
 
 		evicted_state =
 		    (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
 
 		/* Hold both list locks across the state change. */
 		get_buf_info(hdr, old_state, &list, &lock);
 		get_buf_info(hdr, evicted_state, &evicted_list, &evicted_lock);
 		mutex_enter(lock);
 		mutex_enter(evicted_lock);
 
 		arc_change_state(evicted_state, hdr, hash_lock);
 		ASSERT(HDR_IN_HASH_TABLE(hdr));
 		/*
 		 * NOTE(review): setting ARC_IN_HASH_TABLE looks redundant
 		 * after the assertion above, unless HDR_IN_HASH_TABLE()
 		 * tests something other than this flag -- confirm.
 		 */
 		hdr->b_flags |= ARC_IN_HASH_TABLE;
 		hdr->b_flags &= ~ARC_BUF_AVAILABLE;
 
 		mutex_exit(evicted_lock);
 		mutex_exit(lock);
 	}
 	mutex_exit(hash_lock);
 	mutex_exit(&buf->b_evict_lock);
 
 	/* With all locks dropped, run the callback and free the arc_buf_t. */
 	VERIFY(buf->b_efunc(buf) == 0);
 	buf->b_efunc = NULL;
 	buf->b_private = NULL;
 	buf->b_hdr = NULL;
 	buf->b_next = NULL;
 	kmem_cache_free(buf_cache, buf);
 	return (1);
 }
 
 /*
  * Release this buffer from the cache.  This must be done
  * after a read and prior to modifying the buffer contents.
  * If the buffer has more than one reference, we must make
  * a new hdr for the buffer.
  *
  * `tag' is the reference holder: it is removed from the old header and
  * added to the new one when a new header is created.  In both branches
  * the buffer ends up on an anonymous header and its eviction callback
  * is cleared.
  */
 void
 arc_release(arc_buf_t *buf, void *tag)
 {
 	arc_buf_hdr_t *hdr;
 	kmutex_t *hash_lock = NULL;
 	l2arc_buf_hdr_t *l2hdr;
 	uint64_t buf_size;
 
 	/*
 	 * It would be nice to assert that if it's DMU metadata (level >
 	 * 0 || it's the dnode file), then it must be syncing context.
 	 * But we don't know that information at this level.
 	 */
 
 	mutex_enter(&buf->b_evict_lock);
 	hdr = buf->b_hdr;
 
 	/* this buffer is not on any list */
 	ASSERT(refcount_count(&hdr->b_refcnt) > 0);
 
 	if (hdr->b_state == arc_anon) {
 		/* this buffer is already released */
 		ASSERT(buf->b_efunc == NULL);
 	} else {
 		/* Take the hash lock, then re-read the header under it. */
 		hash_lock = HDR_LOCK(hdr);
 		mutex_enter(hash_lock);
 		hdr = buf->b_hdr;
 		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
 	}
 
 	/*
 	 * Detach any L2ARC header now, taking l2arc_buflist_mtx; it is
 	 * unlinked from the device list and freed at the end of the function.
 	 */
 	l2hdr = hdr->b_l2hdr;
 	if (l2hdr) {
 		mutex_enter(&l2arc_buflist_mtx);
 		hdr->b_l2hdr = NULL;
-		buf_size = hdr->b_size;
 	}
+	buf_size = hdr->b_size;
 
 	/*
 	 * Do we have more than one buf?
 	 */
 	if (hdr->b_datacnt > 1) {
 		arc_buf_hdr_t *nhdr;
 		arc_buf_t **bufp;
 		uint64_t blksz = hdr->b_size;
 		uint64_t spa = hdr->b_spa;
 		arc_buf_contents_t type = hdr->b_type;
 		uint32_t flags = hdr->b_flags;
 
 		ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
 		/*
 		 * Pull the data off of this hdr and attach it to
 		 * a new anonymous hdr.
 		 */
 		(void) remove_reference(hdr, hash_lock, tag);
 		bufp = &hdr->b_buf;
 		while (*bufp != buf)
 			bufp = &(*bufp)->b_next;
 		*bufp = buf->b_next;
 		buf->b_next = NULL;
 
 		/* The old state no longer accounts for this buffer's bytes. */
 		ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
 		atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
 		if (refcount_is_zero(&hdr->b_refcnt)) {
 			uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
 			ASSERT3U(*size, >=, hdr->b_size);
 			atomic_add_64(size, -hdr->b_size);
 		}
 
 		/*
 		 * We're releasing a duplicate user data buffer, update
 		 * our statistics accordingly.
 		 */
 		if (hdr->b_type == ARC_BUFC_DATA) {
 			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
 			ARCSTAT_INCR(arcstat_duplicate_buffers_size,
 			    -hdr->b_size);
 		}
 		hdr->b_datacnt -= 1;
 		arc_cksum_verify(buf);
 #ifdef illumos
 		arc_buf_unwatch(buf);
 #endif /* illumos */
 
 		mutex_exit(hash_lock);
 
 		/* Build a fresh anonymous header that owns only this buf. */
 		nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
 		nhdr->b_size = blksz;
 		nhdr->b_spa = spa;
 		nhdr->b_type = type;
 		nhdr->b_buf = buf;
 		nhdr->b_state = arc_anon;
 		nhdr->b_arc_access = 0;
 		/* Of the old flags only ARC_L2_WRITING is carried over. */
 		nhdr->b_flags = flags & ARC_L2_WRITING;
 		nhdr->b_l2hdr = NULL;
 		nhdr->b_datacnt = 1;
 		nhdr->b_freeze_cksum = NULL;
 		(void) refcount_add(&nhdr->b_refcnt, tag);
 		buf->b_hdr = nhdr;
 		mutex_exit(&buf->b_evict_lock);
 		atomic_add_64(&arc_anon->arcs_size, blksz);
 	} else {
 		/* Sole buffer: turn the existing header anonymous in place. */
 		mutex_exit(&buf->b_evict_lock);
 		ASSERT(refcount_count(&hdr->b_refcnt) == 1);
 		ASSERT(!list_link_active(&hdr->b_arc_node));
 		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
 		if (hdr->b_state != arc_anon)
 			arc_change_state(arc_anon, hdr, hash_lock);
 		hdr->b_arc_access = 0;
 		if (hash_lock)
 			mutex_exit(hash_lock);
 
 		buf_discard_identity(hdr);
 		arc_buf_thaw(buf);
 	}
 	buf->b_efunc = NULL;
 	buf->b_private = NULL;
 
 	/* Finish the L2ARC detach started above and drop the buflist lock. */
 	if (l2hdr) {
 		list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
 		kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
 		ARCSTAT_INCR(arcstat_l2_size, -buf_size);
 		mutex_exit(&l2arc_buflist_mtx);
 	}
 }
 
 /*
  * Report whether the buffer has been released: it still has data and its
  * header is in the anonymous state.  Returns 1 if so, 0 otherwise.
  */
 int
 arc_released(arc_buf_t *buf)
 {
 	int is_released = 0;
 
 	mutex_enter(&buf->b_evict_lock);
 	if (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon)
 		is_released = 1;
 	mutex_exit(&buf->b_evict_lock);
 
 	return (is_released);
 }
 
 /*
  * Report whether an eviction callback is registered on the buffer.
  * Returns 1 if b_efunc is set, 0 otherwise.
  */
 int
 arc_has_callback(arc_buf_t *buf)
 {
 	int has_efunc = 0;
 
 	mutex_enter(&buf->b_evict_lock);
 	if (buf->b_efunc != NULL)
 		has_efunc = 1;
 	mutex_exit(&buf->b_evict_lock);
 
 	return (has_efunc);
 }
 
 #ifdef ZFS_DEBUG
 /*
  * Debug helper: return the reference count of the buffer's header,
  * sampled while holding the buffer's evict lock.
  */
 int
 arc_referenced(arc_buf_t *buf)
 {
 	int holds;
 
 	mutex_enter(&buf->b_evict_lock);
 	holds = refcount_count(&buf->b_hdr->b_refcnt);
 	mutex_exit(&buf->b_evict_lock);
 
 	return (holds);
 }
 #endif
 
 /*
  * "Ready" callback for the zio created by arc_write(): forwards to the
  * caller's awcb_ready callback, recomputes the buffer checksum and marks
  * the header as having I/O in progress.
  */
 static void
 arc_write_ready(zio_t *zio)
 {
 	arc_write_callback_t *awcb = zio->io_private;
 	arc_buf_t *buf = awcb->awcb_buf;
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
 	awcb->awcb_ready(zio, buf, awcb->awcb_private);
 
 	/*
 	 * I/O already in progress means this is a re-write attempt: discard
 	 * the frozen checksum so it is computed afresh below.  Accounting
 	 * for the re-write is left to the callback.
 	 */
 	if (HDR_IO_IN_PROGRESS(hdr)) {
 		mutex_enter(&hdr->b_freeze_lock);
 		if (hdr->b_freeze_cksum != NULL) {
 			kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
 			hdr->b_freeze_cksum = NULL;
 		}
 		mutex_exit(&hdr->b_freeze_lock);
 	}
 
 	arc_cksum_compute(buf, B_FALSE);
 	hdr->b_flags |= ARC_IO_IN_PROGRESS;
 }
 
 /*
  * Completion callback for the zio created by arc_write().  On success the
  * block identity (dva, birth, first checksum word) is recorded in the
  * header.  Unless the header is empty it is inserted into the hash table
  * and ARC_IO_IN_PROGRESS is cleared; finally the caller's awcb_done
  * callback is invoked and the arc_write_callback_t is freed.
  */
 static void
 arc_write_done(zio_t *zio)
 {
 	arc_write_callback_t *callback = zio->io_private;
 	arc_buf_t *buf = callback->awcb_buf;
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	ASSERT(hdr->b_acb == NULL);
 
 	if (zio->io_error == 0) {
 		hdr->b_dva = *BP_IDENTITY(zio->io_bp);
 		hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
 		hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
 	} else {
 		ASSERT(BUF_EMPTY(hdr));
 	}
 
 	/*
 	 * If the block to be written was all-zero, we may have
 	 * compressed it away.  In this case no write was performed
 	 * so there will be no dva/birth/checksum.  The buffer must
 	 * therefore remain anonymous (and uncached).
 	 */
 	if (!BUF_EMPTY(hdr)) {
 		arc_buf_hdr_t *exists;
 		kmutex_t *hash_lock;
 
 		ASSERT(zio->io_error == 0);
 
 		arc_cksum_verify(buf);
 
 		/* A non-NULL result means this identity is already hashed. */
 		exists = buf_hash_insert(hdr, &hash_lock);
 		if (exists) {
 			/*
 			 * This can only happen if we overwrite for
 			 * sync-to-convergence, because we remove
 			 * buffers from the hash table when we arc_free().
 			 */
 			if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
 				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
 					panic("bad overwrite, hdr=%p exists=%p",
 					    (void *)hdr, (void *)exists);
 				ASSERT(refcount_is_zero(&exists->b_refcnt));
 				/* Destroy the stale header, then insert ours. */
 				arc_change_state(arc_anon, exists, hash_lock);
 				mutex_exit(hash_lock);
 				arc_hdr_destroy(exists);
 				exists = buf_hash_insert(hdr, &hash_lock);
 				ASSERT3P(exists, ==, NULL);
 			} else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
 				/* nopwrite */
 				ASSERT(zio->io_prop.zp_nopwrite);
 				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
 					panic("bad nopwrite, hdr=%p exists=%p",
 					    (void *)hdr, (void *)exists);
 			} else {
 				/* Dedup */
 				ASSERT(hdr->b_datacnt == 1);
 				ASSERT(hdr->b_state == arc_anon);
 				ASSERT(BP_GET_DEDUP(zio->io_bp));
 				ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
 			}
 		}
 		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
 		/* if it's not anon, we are doing a scrub */
 		if (!exists && hdr->b_state == arc_anon)
 			arc_access(hdr, hash_lock);
 		mutex_exit(hash_lock);
 	} else {
 		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
 	}
 
 	ASSERT(!refcount_is_zero(&hdr->b_refcnt));
 	callback->awcb_done(zio, buf, callback->awcb_private);
 
 	kmem_free(callback, sizeof (arc_write_callback_t));
 }
 
 /*
  * Create (but do not issue) a write zio for the buffer.  The `ready' and
  * `done' callbacks, both required, are stored together with `private' in
  * an arc_write_callback_t that is passed to zio_write() as the private
  * data of arc_write_ready()/arc_write_done().  When `l2arc' is set the
  * header is flagged ARC_L2CACHE.  Returns the new zio.
  */
 zio_t *
 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
     blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
     arc_done_func_t *ready, arc_done_func_t *done, void *private,
     int priority, int zio_flags, const zbookmark_t *zb)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 	arc_write_callback_t *awcb;
 
 	ASSERT(ready != NULL);
 	ASSERT(done != NULL);
 	ASSERT(!HDR_IO_ERROR(hdr));
 	ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
 	ASSERT(hdr->b_acb == NULL);
 
 	if (l2arc)
 		hdr->b_flags |= ARC_L2CACHE;
 
 	/* Bundle the caller's callbacks for the zio pipeline. */
 	awcb = kmem_zalloc(sizeof (*awcb), KM_SLEEP);
 	awcb->awcb_buf = buf;
 	awcb->awcb_private = private;
 	awcb->awcb_ready = ready;
 	awcb->awcb_done = done;
 
 	return (zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
 	    arc_write_ready, arc_write_done, awcb, priority, zio_flags, zb));
 }
 
 /*
  * Decide whether a write reservation must be throttled because memory is
  * tight.  Returns 0 to let the reservation proceed, or ERESTART/EAGAIN to
  * refuse it.  Outside the kernel this always returns 0.
  */
 static int
 arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
 {
 #ifdef _KERNEL
 	static uint64_t pageout_load = 0;
 	static uint64_t newest_txg = 0;
 	uint64_t free_bytes =
 	    ptoa((uintmax_t)cnt.v_free_count + cnt.v_cache_count);
 
 #ifdef sun
 #if defined(__i386)
 	free_bytes = MIN(free_bytes, vmem_size(heap_arena, VMEM_FREE));
 #endif
 #endif	/* sun */
 	/* Plenty of memory: nothing to throttle. */
 	if (free_bytes >= zfs_write_limit_max)
 		return (0);
 
 	/* The pageout load is tracked per txg; reset it on a newer one. */
 	if (txg > newest_txg) {
 		newest_txg = txg;
 		pageout_load = 0;
 	}
 
 	/*
 	 * In pageout memory is already tight and the arc is already going
 	 * to be evicting, so keep page writes flowing as fast as possible.
 	 */
 	if (curproc == pageproc) {
 		if (pageout_load > free_bytes / 4)
 			return (ERESTART);
 		/* reserve is inflated, so only an eighth of it is charged */
 		pageout_load += reserve / 8;
 		return (0);
 	}
 
 	if (pageout_load > 0 && arc_reclaim_needed()) {
 		/* memory is low, delay before restarting */
 		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
 		return (EAGAIN);
 	}
 	pageout_load = 0;
 
 	/* Evictable ARC memory above arc_c_min counts as available too. */
 	if (arc_size > arc_c_min) {
 		uint64_t evictable =
 		    arc_mru->arcs_lsize[ARC_BUFC_DATA] +
 		    arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
 		    arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
 		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
 		free_bytes += MIN(evictable, arc_size - arc_c_min);
 	}
 
 	if (inflight_data > free_bytes / 4) {
 		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
 		return (ERESTART);
 	}
 #endif
 	return (0);
 }
 
 /*
  * Give back `reserve' bytes previously added to arc_tempreserve (see
  * arc_tempreserve_space()).  The counter must not go negative.
  */
 void
 arc_tempreserve_clear(uint64_t reserve)
 {
 	atomic_add_64(&arc_tempreserve, -reserve);
 	ASSERT((int64_t)arc_tempreserve >= 0);
 }
 
 /*
  * Try to reserve 'reserve' worth of ARC space for a write in 'txg'.
  *
  * Returns 0 and adds 'reserve' to arc_tempreserve on success.  Returns
  * ENOMEM if the request is larger than arc_c, ERESTART if too much dirty
  * data is cached (or, under ZFS_DEBUG, at random), or whatever non-zero
  * value arc_memory_throttle() reports.
  */
 int
 arc_tempreserve_space(uint64_t reserve, uint64_t txg)
 {
 	uint64_t anon_size;
 	int error;
 
 #ifdef ZFS_DEBUG
 	/*
 	 * Inject an occasional spurious failure; callers are expected
 	 * to cope with it.
 	 */
 	if (spa_get_random(10000) == 0) {
 		dprintf("forcing random failure\n");
 		return (ERESTART);
 	}
 #endif
 	/*
 	 * A request above a quarter of arc_c grows the target (capped at
 	 * arc_c_max) unless growth is disabled; a request that still
 	 * exceeds arc_c cannot be satisfied.
 	 */
 	if (reserve > arc_c/4) {
 		if (!arc_no_grow)
 			arc_c = MIN(arc_c_max, reserve * 4);
 	}
 	if (reserve > arc_c)
 		return (ENOMEM);
 
 	/*
 	 * Loaned buffers are left out of the in-flight dirty total so that
 	 * long network delays cannot block transactions that are ready to
 	 * be assigned to a txg.
 	 */
 	anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
 
 	/*
 	 * Writes almost always need further allocations (compression,
 	 * encryption, etc.), therefore check that enough memory is
 	 * available for them.
 	 */
 	error = arc_memory_throttle(reserve, anon_size, txg);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Throttle once dirty data takes up too much of the cache: the aim
 	 * is to keep it less than half full of dirty blocks so sync times
 	 * stay bounded.  Two concurrent requests may both pass this check
 	 * when one should have failed; that is tolerated.
 	 */
 	if (reserve + arc_tempreserve + anon_size <= arc_c / 2 ||
 	    anon_size <= arc_c / 4) {
 		atomic_add_64(&arc_tempreserve, reserve);
 		return (0);
 	}
 
 	dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
 	    "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
 	    arc_tempreserve>>10,
 	    arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
 	    arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
 	    reserve>>10, arc_c>>10);
 	return (ERESTART);
 }
 
 /* Taken first in arc_lowmem() so that only one caller runs at a time. */
 static kmutex_t arc_lowmem_lock;
 #ifdef _KERNEL
 /*
  * Tag returned by EVENTHANDLER_REGISTER() in arc_init() and handed back to
  * EVENTHANDLER_DEREGISTER() in arc_fini(); NULL while nothing is registered.
  */
 static eventhandler_tag arc_event_lowmem = NULL;
 
 /*
  * vm_lowmem event handler: flag that memory is needed and wake the ARC
  * reclaim thread.  Neither argument is used.
  */
 static void
 arc_lowmem(void *arg __unused, int howto __unused)
 {
 
 	/* arc_lowmem_lock lets a single caller through at a time. */
 	mutex_enter(&arc_lowmem_lock);
 
 	/* Post the request under the reclaim thread's lock and signal it. */
 	mutex_enter(&arc_reclaim_thr_lock);
 	needfree = 1;
 	cv_signal(&arc_reclaim_thr_cv);
 
 	/*
 	 * Only the pageout process sleeps until needfree is cleared.
 	 * Arbitrary threads must not block here: they can arrive from the
 	 * ARC itself while holding ARC locks, which would risk a deadlock
 	 * with the reclaim thread.
 	 */
 	while (curproc == pageproc && needfree)
 		msleep(&needfree, &arc_reclaim_thr_lock, 0, "zfs:lowmem", 0);
 
 	mutex_exit(&arc_reclaim_thr_lock);
 	mutex_exit(&arc_lowmem_lock);
 }
 #endif
 
 /*
  * Bring up the ARC: initialize the reclaim and lowmem synchronization
  * objects, size the cache (arc_c, arc_c_min, arc_c_max, arc_p,
  * arc_meta_limit) from kmem_size() and the zfs_arc_* tunables, set up the
  * per-state locks and lists, create the "arcstats" kstat, start the
  * reclaim thread and, in the kernel, register the vm_lowmem handler and
  * print sizing notices.
  */
 void
 arc_init(void)
 {
 	int i, prefetch_tunable_set = 0;
 
 	mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
 	mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	/* Convert seconds to clock ticks */
 	arc_min_prefetch_lifespan = 1 * hz;
 
 	/* Start out with 1/8 of all memory */
 	arc_c = kmem_size() / 8;
 
 #ifdef sun
 #ifdef _KERNEL
 	/*
 	 * On architectures where the physical memory can be larger
 	 * than the addressable space (intel in 32-bit mode), we may
 	 * need to limit the cache to 1/8 of VM size.
 	 */
 	arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
 #endif
 #endif	/* sun */
 	/* set min cache to 1/32 of all memory, or 16MB, whichever is more */
 	arc_c_min = MAX(arc_c / 4, 64<<18);
 	/*
 	 * set max to all but 1GB of memory, or 5/8 of all memory (arc_c is
 	 * 1/8 at this point), whichever is more
 	 */
 	if (arc_c * 8 >= 1<<30)
 		arc_c_max = (arc_c * 8) - (1<<30);
 	else
 		arc_c_max = arc_c_min;
 	arc_c_max = MAX(arc_c * 5, arc_c_max);
 
 #ifdef _KERNEL
 	/*
 	 * Allow the tunables to override our calculations if they are
 	 * reasonable (ie. over 16MB)
 	 */
 	if (zfs_arc_max > 64<<18 && zfs_arc_max < kmem_size())
 		arc_c_max = zfs_arc_max;
 	if (zfs_arc_min > 64<<18 && zfs_arc_min <= arc_c_max)
 		arc_c_min = zfs_arc_min;
 #endif
 
 	/* The target size starts at the maximum; arc_p at half of it. */
 	arc_c = arc_c_max;
 	arc_p = (arc_c >> 1);
 
 	/* limit meta-data to 1/4 of the arc capacity */
 	arc_meta_limit = arc_c_max / 4;
 
 	/* Allow the tunable to override if it is reasonable */
 	if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
 		arc_meta_limit = zfs_arc_meta_limit;
 
 	/* Unless zfs_arc_min was set, keep the minimum at half the meta limit. */
 	if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
 		arc_c_min = arc_meta_limit / 2;
 
 	if (zfs_arc_grow_retry > 0)
 		arc_grow_retry = zfs_arc_grow_retry;
 
 	if (zfs_arc_shrink_shift > 0)
 		arc_shrink_shift = zfs_arc_shrink_shift;
 
 	if (zfs_arc_p_min_shift > 0)
 		arc_p_min_shift = zfs_arc_p_min_shift;
 
 	/* if kmem_flags are set, lets try to use less memory */
 	if (kmem_debugging())
 		arc_c = arc_c / 2;
 	if (arc_c < arc_c_min)
 		arc_c = arc_c_min;
 
 	/* Publish the effective limits back through the tunables. */
 	zfs_arc_min = arc_c_min;
 	zfs_arc_max = arc_c_max;
 
 	arc_anon = &ARC_anon;
 	arc_mru = &ARC_mru;
 	arc_mru_ghost = &ARC_mru_ghost;
 	arc_mfu = &ARC_mfu;
 	arc_mfu_ghost = &ARC_mfu_ghost;
 	arc_l2c_only = &ARC_l2c_only;
 	arc_size = 0;
 
 	/*
 	 * Every state gets its per-list locks; every state except arc_anon
 	 * also gets its lists.  Each list is created exactly once, matching
 	 * the single list_destroy() per list in arc_fini().
 	 */
 	for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
 		mutex_init(&arc_anon->arcs_locks[i].arcs_lock,
 		    NULL, MUTEX_DEFAULT, NULL);
 		mutex_init(&arc_mru->arcs_locks[i].arcs_lock,
 		    NULL, MUTEX_DEFAULT, NULL);
 		mutex_init(&arc_mru_ghost->arcs_locks[i].arcs_lock,
 		    NULL, MUTEX_DEFAULT, NULL);
 		mutex_init(&arc_mfu->arcs_locks[i].arcs_lock,
 		    NULL, MUTEX_DEFAULT, NULL);
 		mutex_init(&arc_mfu_ghost->arcs_locks[i].arcs_lock,
 		    NULL, MUTEX_DEFAULT, NULL);
 		mutex_init(&arc_l2c_only->arcs_locks[i].arcs_lock,
 		    NULL, MUTEX_DEFAULT, NULL);
 
 		list_create(&arc_mru->arcs_lists[i],
 		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
 		list_create(&arc_mru_ghost->arcs_lists[i],
 		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
 		list_create(&arc_mfu->arcs_lists[i],
 		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
 		list_create(&arc_mfu_ghost->arcs_lists[i],
 		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
 		list_create(&arc_l2c_only->arcs_lists[i],
 		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
 	}
 
 	buf_init();
 
 	arc_thread_exit = 0;
 	arc_eviction_list = NULL;
 	mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
 	bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
 
 	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
 	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
 
 	/* kstat creation may fail; the ARC runs without statistics then. */
 	if (arc_ksp != NULL) {
 		arc_ksp->ks_data = &arc_stats;
 		kstat_install(arc_ksp);
 	}
 
 	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
 	    TS_RUN, minclsyspri);
 
 #ifdef _KERNEL
 	arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
 	    EVENTHANDLER_PRI_FIRST);
 #endif
 
 	arc_dead = FALSE;
 	arc_warm = B_FALSE;
 
 	/*
 	 * Derive the write limit from physical memory unless it was set
 	 * explicitly, in which case the shift is no longer meaningful.
 	 */
 	if (zfs_write_limit_max == 0)
 		zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
 	else
 		zfs_write_limit_shift = 0;
 	mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
 
 #ifdef _KERNEL
 	/* Remember whether the administrator chose a prefetch setting. */
 	if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable))
 		prefetch_tunable_set = 1;
 
 #ifdef __i386__
 	if (prefetch_tunable_set == 0) {
 		printf("ZFS NOTICE: Prefetch is disabled by default on i386 "
 		    "-- to enable,\n");
 		printf("            add \"vfs.zfs.prefetch_disable=0\" "
 		    "to /boot/loader.conf.\n");
 		zfs_prefetch_disable = 1;
 	}
 #else
 	if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) &&
 	    prefetch_tunable_set == 0) {
 		printf("ZFS NOTICE: Prefetch is disabled by default if less "
 		    "than 4GB of RAM is present;\n"
 		    "            to enable, add \"vfs.zfs.prefetch_disable=0\" "
 		    "to /boot/loader.conf.\n");
 		zfs_prefetch_disable = 1;
 	}
 #endif
 	/* Warn about ZFS memory and address space requirements. */
 	if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) {
 		printf("ZFS WARNING: Recommended minimum RAM size is 512MB; "
 		    "expect unstable behavior.\n");
 	}
 	if (kmem_size() < 512 * (1 << 20)) {
 		printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; "
 		    "expect unstable behavior.\n");
 		printf("             Consider tuning vm.kmem_size and "
 		    "vm.kmem_size_max\n");
 		printf("             in /boot/loader.conf.\n");
 	}
 #endif
 }
 
 /*
  * Tear down the ARC: unhook the vm_lowmem handler, stop the reclaim
  * thread, flush the cache, remove the kstat and destroy the locks, lists
  * and buffer hash set up by arc_init().
  */
 void
 arc_fini(void)
 {
 	int i;
 
 #ifdef _KERNEL
 	/*
 	 * Deregister the vm_lowmem handler before anything it relies on is
 	 * torn down: arc_lowmem() takes arc_lowmem_lock and
 	 * arc_reclaim_thr_lock and signals arc_reclaim_thr_cv, all of which
 	 * are destroyed below.  Doing this while the reclaim thread is still
 	 * running also lets a handler that is waiting for needfree to clear
 	 * finish normally.
 	 */
 	if (arc_event_lowmem != NULL)
 		EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
 #endif
 
 	/* Ask the reclaim thread to exit and wait until it acknowledges. */
 	mutex_enter(&arc_reclaim_thr_lock);
 	arc_thread_exit = 1;
 	cv_signal(&arc_reclaim_thr_cv);
 	while (arc_thread_exit != 0)
 		cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
 	mutex_exit(&arc_reclaim_thr_lock);
 
 	arc_flush(NULL);
 
 	arc_dead = TRUE;
 
 	if (arc_ksp != NULL) {
 		kstat_delete(arc_ksp);
 		arc_ksp = NULL;
 	}
 
 	mutex_destroy(&arc_eviction_mtx);
 	mutex_destroy(&arc_reclaim_thr_lock);
 	cv_destroy(&arc_reclaim_thr_cv);
 
 	/* Mirror of the setup loop in arc_init(); arc_anon has no lists. */
 	for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
 		list_destroy(&arc_mru->arcs_lists[i]);
 		list_destroy(&arc_mru_ghost->arcs_lists[i]);
 		list_destroy(&arc_mfu->arcs_lists[i]);
 		list_destroy(&arc_mfu_ghost->arcs_lists[i]);
 		list_destroy(&arc_l2c_only->arcs_lists[i]);
 
 		mutex_destroy(&arc_anon->arcs_locks[i].arcs_lock);
 		mutex_destroy(&arc_mru->arcs_locks[i].arcs_lock);
 		mutex_destroy(&arc_mru_ghost->arcs_locks[i].arcs_lock);
 		mutex_destroy(&arc_mfu->arcs_locks[i].arcs_lock);
 		mutex_destroy(&arc_mfu_ghost->arcs_locks[i].arcs_lock);
 		mutex_destroy(&arc_l2c_only->arcs_locks[i].arcs_lock);
 	}
 
 	mutex_destroy(&zfs_write_limit_lock);
 
 	buf_fini();
 
 	ASSERT(arc_loaned_bytes == 0);
 
 	mutex_destroy(&arc_lowmem_lock);
 }
 
 /*
  * Level 2 ARC
  *
  * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
  * It uses dedicated storage devices to hold cached data, which are populated
  * using large infrequent writes.  The main role of this cache is to boost
  * the performance of random read workloads.  The intended L2ARC devices
  * include short-stroked disks, solid state disks, and other media with
  * substantially faster read latency than disk.
  *
  *                 +-----------------------+
  *                 |         ARC           |
  *                 +-----------------------+
  *                    |         ^     ^
  *                    |         |     |
  *      l2arc_feed_thread()    arc_read()
  *                    |         |     |
  *                    |  l2arc read   |
  *                    V         |     |
  *               +---------------+    |
  *               |     L2ARC     |    |
  *               +---------------+    |
  *                   |    ^           |
  *          l2arc_write() |           |
  *                   |    |           |
  *                   V    |           |
  *                 +-------+      +-------+
  *                 | vdev  |      | vdev  |
  *                 | cache |      | cache |
  *                 +-------+      +-------+
  *                 +=========+     .-----.
  *                 :  L2ARC  :    |-_____-|
  *                 : devices :    | Disks |
  *                 +=========+    `-_____-'
  *
  * Read requests are satisfied from the following sources, in order:
  *
  *	1) ARC
  *	2) vdev cache of L2ARC devices
  *	3) L2ARC devices
  *	4) vdev cache of disks
  *	5) disks
  *
  * Some L2ARC device types exhibit extremely slow write performance.
  * To accommodate for this there are some significant differences between
  * the L2ARC and traditional cache design:
  *
  * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
  * the ARC behave as usual, freeing buffers and placing headers on ghost
  * lists.  The ARC does not send buffers to the L2ARC during eviction as
  * this would add inflated write latencies for all ARC memory pressure.
  *
  * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
  * It does this by periodically scanning buffers from the eviction-end of
  * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
  * not already there.  It scans until a headroom of buffers is satisfied,
  * which itself is a buffer for ARC eviction.  The thread that does this is
  * l2arc_feed_thread(), illustrated below; example sizes are included to
  * provide a better sense of ratio than this diagram:
  *
  *	       head -->                        tail
  *	        +---------------------+----------+
  *	ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
  *	        +---------------------+----------+   |   o L2ARC eligible
  *	ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
  *	        +---------------------+----------+   |
  *	             15.9 Gbytes      ^ 32 Mbytes    |
  *	                           headroom          |
  *	                                      l2arc_feed_thread()
  *	                                             |
  *	                 l2arc write hand <--[oooo]--'
  *	                         |           8 Mbyte
  *	                         |          write max
  *	                         V
  *		  +==============================+
  *	L2ARC dev |####|#|###|###|    |####| ... |
  *	          +==============================+
  *	                     32 Gbytes
  *
  * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
  * evicted, then the L2ARC has cached a buffer much sooner than it probably
  * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
  * safe to say that this is an uncommon case, since buffers at the end of
  * the ARC lists have moved there due to inactivity.
  *
  * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
  * then the L2ARC simply misses copying some buffers.  This serves as a
  * pressure valve to prevent heavy read workloads from both stalling the ARC
  * with waits and clogging the L2ARC with writes.  This also helps prevent
  * the potential for the L2ARC to churn if it attempts to cache content too
  * quickly, such as during backups of the entire pool.
  *
  * 5. After system boot and before the ARC has filled main memory, there are
  * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
  * lists can remain mostly static.  Instead of searching from the tail of
  * these lists as pictured, the l2arc_feed_thread() will search from the list
  * heads for eligible buffers, greatly increasing its chance of finding them.
  *
  * The L2ARC device write speed is also boosted during this time so that
  * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
  * there are no L2ARC reads, and no fear of degrading read performance
  * through increased writes.
  *
  * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
  * the vdev queue can aggregate them into larger and fewer writes.  Each
  * device is written to in a rotor fashion, sweeping writes through
  * available space then repeating.
  *
  * 7. The L2ARC does not store dirty content.  It never needs to flush
  * write buffers back to disk based storage.
  *
  * 8. If an ARC buffer is written (and dirtied) which also exists in the
  * L2ARC, the now stale L2ARC buffer is immediately dropped.
  *
  * The performance of the L2ARC can be tweaked by a number of tunables, which
  * may be necessary for different workloads:
  *
  *	l2arc_write_max		max write bytes per interval
  *	l2arc_write_boost	extra write bytes during device warmup
  *	l2arc_noprefetch	skip caching prefetched buffers
  *	l2arc_headroom		number of max device writes to precache
  *	l2arc_feed_secs		seconds between L2ARC writing
  *
  * Tunables may be removed or added as future performance improvements are
  * integrated, and also may become zpool properties.
  *
  * There are three key functions that control how the L2ARC warms up:
  *
  *	l2arc_write_eligible()	check if a buffer is eligible to cache
  *	l2arc_write_size()	calculate how much to write
  *	l2arc_write_interval()	calculate sleep delay between writes
  *
  * These three functions determine what to write, how much, and how quickly
  * to send writes.
  */
 
 /*
  * Report whether header 'ab' may be written to the L2ARC on behalf of the
  * pool identified by 'spa_guid'.  Returns B_TRUE if so; otherwise bumps
  * the statistic for the first disqualifying condition and returns B_FALSE.
  */
 static boolean_t
 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
 {
 	boolean_t eligible = B_FALSE;
 
 	/*
 	 * The checks run in this order and stop at the first hit.  A buffer
 	 * is rejected when it:
 	 * 1. belongs to a different spa.
 	 * 2. is already cached on the L2ARC.
 	 * 3. has an I/O in progress (it may be an incomplete read).
 	 * 4. is flagged not eligible (zfs property).
 	 */
 	if (ab->b_spa != spa_guid) {
 		ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch);
 	} else if (ab->b_l2hdr != NULL) {
 		ARCSTAT_BUMP(arcstat_l2_write_in_l2);
 	} else if (HDR_IO_IN_PROGRESS(ab)) {
 		ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress);
 	} else if (!HDR_L2CACHE(ab)) {
 		ARCSTAT_BUMP(arcstat_l2_write_not_cacheable);
 	} else {
 		eligible = B_TRUE;
 	}
 
 	return (eligible);
 }
 
 /*
  * Return how much to write to 'dev' in one pass: the device's l2ad_write
  * value, plus its l2ad_boost value for as long as arc_warm is B_FALSE.
  */
 static uint64_t
 l2arc_write_size(l2arc_dev_t *dev)
 {
 	uint64_t base = dev->l2ad_write;
 
 	if (arc_warm != B_FALSE)
 		return (base);
 
 	return (base + dev->l2ad_boost);
 }
 
 /*
  * Compute the lbolt time of the next L2ARC feed.
  *
  *	began	lbolt value when the current feed started
  *	wanted	amount the feed intended to write
  *	wrote	amount the feed actually wrote
  *
  * Returns a time that is never earlier than the current lbolt.
  */
 static clock_t
 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
 {
 	clock_t interval, now, soonest;
 
 	/*
 	 * The default is the idle interval.  When feeding again is enabled
 	 * and the last pass wrote more than half of what it wanted, the ARC
 	 * lists are busy, so come back much sooner.
 	 */
 	interval = hz * l2arc_feed_secs;
 	if (l2arc_feed_again && wrote > (wanted / 2))
 		interval = (hz * l2arc_feed_min_ms) / 1000;
 
 	/*
 	 * Measure the interval from whichever of "now" and "began" gives the
 	 * earlier deadline, but never schedule in the past.
 	 */
 	now = ddi_get_lbolt();
 	soonest = MIN(now + interval, began + interval);
 
 	return (MAX(now, soonest));
 }
 
 /*
  * Statistics bookkeeping: add HDR_SIZE + L2HDR_SIZE to arcstat_l2_hdr_size
  * and subtract HDR_SIZE from arcstat_hdr_size.  l2arc_hdr_stat_remove()
  * applies the opposite adjustment.
  */
 static void
 l2arc_hdr_stat_add(void)
 {
 	ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
 	ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
 }
 
 /*
  * Statistics bookkeeping: subtract HDR_SIZE + L2HDR_SIZE from
  * arcstat_l2_hdr_size and add HDR_SIZE to arcstat_hdr_size.  This undoes
  * the adjustment made by l2arc_hdr_stat_add().
  */
 static void
 l2arc_hdr_stat_remove(void)
 {
 	ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
 	ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
 }
 
 /*
  * Cycle through L2ARC devices.  This is how L2ARC load balances.
  * If a device is returned, this also returns holding the spa config lock.
  *
  * Returns the device following l2arc_dev_last whose vdev is not dead, or
  * NULL when there are no devices or every device's vdev is dead.  Except
  * when there are no devices at all, l2arc_dev_last is updated to the
  * value returned (including NULL).
  */
 static l2arc_dev_t *
 l2arc_dev_get_next(void)
 {
 	l2arc_dev_t *first, *next = NULL;
 
 	/*
 	 * Lock out the removal of spas (spa_namespace_lock), then removal
 	 * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
 	 * both locks will be dropped and a spa config lock held instead.
 	 */
 	mutex_enter(&spa_namespace_lock);
 	mutex_enter(&l2arc_dev_mtx);
 
 	/* if there are no vdevs, there is nothing to do */
 	if (l2arc_ndev == 0)
 		goto out;
 
 	/* 'first' records the first candidate so a full lap can be detected. */
 	first = NULL;
 	next = l2arc_dev_last;
 	do {
 		/* loop around the list looking for a non-faulted vdev */
 		if (next == NULL) {
 			next = list_head(l2arc_dev_list);
 		} else {
 			next = list_next(l2arc_dev_list, next);
 			if (next == NULL)
 				next = list_head(l2arc_dev_list);
 		}
 
 		/* if we have come back to the start, bail out */
 		if (first == NULL)
 			first = next;
 		else if (next == first)
 			break;
 
 	} while (vdev_is_dead(next->l2ad_vdev));
 
 	/* if we were unable to find any usable vdevs, return NULL */
 	if (vdev_is_dead(next->l2ad_vdev))
 		next = NULL;
 
 	l2arc_dev_last = next;
 
 out:
 	mutex_exit(&l2arc_dev_mtx);
 
 	/*
 	 * Grab the config lock to prevent the 'next' device from being
 	 * removed while we are writing to it.
 	 */
 	/* The device pointer itself is passed as the lock tag. */
 	if (next != NULL)
 		spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
 	mutex_exit(&spa_namespace_lock);
 
 	return (next);
 }
 
 /*
  * Release every buffer queued on the l2arc_free_on_write list.  Each entry
  * carries its own free routine, data pointer and size; the list is drained
  * from the tail towards the head while l2arc_free_on_write_mtx is held.
  */
 static void
 l2arc_do_free_on_write()
 {
 	list_t *pending;
 	l2arc_data_free_t *entry, *earlier;
 
 	mutex_enter(&l2arc_free_on_write_mtx);
 	pending = l2arc_free_on_write;
 
 	entry = list_tail(pending);
 	while (entry != NULL) {
 		/* Fetch the predecessor before this entry is unlinked. */
 		earlier = list_prev(pending, entry);
 
 		ASSERT(entry->l2df_data != NULL);
 		ASSERT(entry->l2df_func != NULL);
 
 		/* Free the payload, then the bookkeeping entry itself. */
 		entry->l2df_func(entry->l2df_data, entry->l2df_size);
 		list_remove(pending, entry);
 		kmem_free(entry, sizeof (l2arc_data_free_t));
 
 		entry = earlier;
 	}
 
 	mutex_exit(&l2arc_free_on_write_mtx);
 }
 
 /*
  * A write to a cache device has completed.  Update all headers to allow
  * reads from these buffers to begin.
  *
  * 'zio' is the completed write; its io_private is the
  * l2arc_write_callback_t naming the device and the dummy head header that
  * marks where this write's buffers begin on the device buflist.  The
  * callback structure and the head header are both freed here.
  */
 static void
 l2arc_write_done(zio_t *zio)
 {
 	l2arc_write_callback_t *cb;
 	l2arc_dev_t *dev;
 	list_t *buflist;
 	arc_buf_hdr_t *head, *ab, *ab_prev;
 	l2arc_buf_hdr_t *abl2;
 	kmutex_t *hash_lock;
 
 	cb = zio->io_private;
 	ASSERT(cb != NULL);
 	dev = cb->l2wcb_dev;
 	ASSERT(dev != NULL);
 	head = cb->l2wcb_head;
 	ASSERT(head != NULL);
 	buflist = dev->l2ad_buflist;
 	ASSERT(buflist != NULL);
 	DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
 	    l2arc_write_callback_t *, cb);
 
 	if (zio->io_error != 0)
 		ARCSTAT_BUMP(arcstat_l2_writes_error);
 
 	mutex_enter(&l2arc_buflist_mtx);
 
 	/*
 	 * All writes completed, or an error was hit.
 	 */
 	/* Walk from the entry before 'head' towards the front of the list. */
 	for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
 		/* Saved first: 'ab' may be removed from the list below. */
 		ab_prev = list_prev(buflist, ab);
 
 		hash_lock = HDR_LOCK(ab);
 		if (!mutex_tryenter(hash_lock)) {
 			/*
 			 * This buffer misses out.  It may be in a stage
 			 * of eviction.  Its ARC_L2_WRITING flag will be
 			 * left set, denying reads to this buffer.
 			 */
 			ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
 			continue;
 		}
 
 		if (zio->io_error != 0) {
 			/*
 			 * Error - drop L2ARC entry.
 			 */
 			list_remove(buflist, ab);
 			abl2 = ab->b_l2hdr;
 			ab->b_l2hdr = NULL;
 			kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
 			ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
 		}
 
 		/*
 		 * Allow ARC to begin reads to this L2ARC entry.
 		 */
 		ab->b_flags &= ~ARC_L2_WRITING;
 
 		mutex_exit(hash_lock);
 	}
 
 	/* Retire the dummy head while the buflist is still locked. */
 	atomic_inc_64(&l2arc_writes_done);
 	list_remove(buflist, head);
 	kmem_cache_free(hdr_cache, head);
 	mutex_exit(&l2arc_buflist_mtx);
 
 	l2arc_do_free_on_write();
 
 	kmem_free(cb, sizeof (l2arc_write_callback_t));
 }
 
 /*
  * A read to a cache device completed.  Validate buffer contents before
  * handing over to the regular ARC routines.
  *
  * 'zio' is the completed L2ARC read; its io_private is the
  * l2arc_read_callback_t describing the buffer and the original block
  * pointer.  The callback structure is freed here on every path.
  */
 static void
 l2arc_read_done(zio_t *zio)
 {
 	l2arc_read_callback_t *cb;
 	arc_buf_hdr_t *hdr;
 	arc_buf_t *buf;
 	kmutex_t *hash_lock;
 	int equal;
 
 	ASSERT(zio->io_vd != NULL);
 	ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
 
 	/* Drop the SCL_L2ARC config lock, tagged with the zio's vdev. */
 	spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
 
 	cb = zio->io_private;
 	ASSERT(cb != NULL);
 	buf = cb->l2rcb_buf;
 	ASSERT(buf != NULL);
 
 	hash_lock = HDR_LOCK(buf->b_hdr);
 	mutex_enter(hash_lock);
 	/* Re-read the header under the lock and check the lock still matches. */
 	hdr = buf->b_hdr;
 	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
 
 	/*
 	 * Check this survived the L2ARC journey.
 	 */
 	equal = arc_cksum_equal(buf);
 	if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
 		mutex_exit(hash_lock);
 		/* Make the zio look like a regular ARC read before handing it on. */
 		zio->io_private = buf;
 		zio->io_bp_copy = cb->l2rcb_bp;	/* XXX fix in L2ARC 2.0	*/
 		zio->io_bp = &zio->io_bp_copy;	/* XXX fix in L2ARC 2.0	*/
 		arc_read_done(zio);
 	} else {
 		mutex_exit(hash_lock);
 		/*
 		 * Buffer didn't survive caching.  Increment stats and
 		 * reissue to the original storage device.
 		 */
 		if (zio->io_error != 0) {
 			ARCSTAT_BUMP(arcstat_l2_io_error);
 		} else {
 			/* No I/O error was reported, so flag the failure as EIO. */
 			zio->io_error = EIO;
 		}
 		if (!equal)
 			ARCSTAT_BUMP(arcstat_l2_cksum_bad);
 
 		/*
 		 * If there's no waiter, issue an async i/o to the primary
 		 * storage now.  If there *is* a waiter, the caller must
 		 * issue the i/o in a context where it's OK to block.
 		 */
 		if (zio->io_waiter == NULL) {
 			zio_t *pio = zio_unique_parent(zio);
 
 			ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
 
 			zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
 			    buf->b_data, zio->io_size, arc_read_done, buf,
 			    zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
 		}
 	}
 
 	kmem_free(cb, sizeof (l2arc_read_callback_t));
 }
 
 /*
  * This is the list priority from which the L2ARC will search for pages to
  * cache.  This is used within loops (0..3) to cycle through lists in the
  * desired order.  This order can have a significant effect on cache
  * performance.
  *
  * Currently the metadata lists are hit first, MFU then MRU, followed by
  * the data lists.  This function returns a locked list, and also returns
  * the lock pointer.
  *
  *	list_num	position in the search order; must be in the range
  *			[0, 2 * ARC_BUFC_NUMLISTS)
  *	lock		out: set to the mutex guarding the returned list;
  *			the mutex is held on return and the caller must
  *			release it
  */
 static list_t *
 l2arc_list_locked(int list_num, kmutex_t **lock)
 {
-	list_t *list;
+	list_t *list = NULL;
 	int idx;
 
 	ASSERT(list_num >= 0 && list_num < 2 * ARC_BUFC_NUMLISTS);
 
 	/*
 	 * Map list_num onto a state (MFU or MRU) and an index into that
 	 * state's arcs_lists[].  The offsets below presumably rely on the
 	 * metadata lists preceding the data lists in arcs_lists[] and on
 	 * ARC_BUFC_NUMLISTS being their combined count -- TODO confirm
 	 * against the definitions.
 	 */
 	if (list_num < ARC_BUFC_NUMMETADATALISTS) {
 		idx = list_num;
 		list = &arc_mfu->arcs_lists[idx];
 		*lock = ARCS_LOCK(arc_mfu, idx);
 	} else if (list_num < ARC_BUFC_NUMMETADATALISTS * 2) {
 		idx = list_num - ARC_BUFC_NUMMETADATALISTS;
 		list = &arc_mru->arcs_lists[idx];
 		*lock = ARCS_LOCK(arc_mru, idx);
 	} else if (list_num < (ARC_BUFC_NUMMETADATALISTS * 2 +
 		ARC_BUFC_NUMDATALISTS)) {
 		idx = list_num - ARC_BUFC_NUMMETADATALISTS;
 		list = &arc_mfu->arcs_lists[idx];
 		*lock = ARCS_LOCK(arc_mfu, idx);
 	} else {
 		idx = list_num - ARC_BUFC_NUMLISTS;
 		list = &arc_mru->arcs_lists[idx];
 		*lock = ARCS_LOCK(arc_mru, idx);
 	}
 
 	/* The caller must not already hold the lock it is about to be given. */
 	ASSERT(!(MUTEX_HELD(*lock)));
 	mutex_enter(*lock);
 	return (list);
 }
 
 /*
  * Evict buffers from the device write hand to the distance specified in
  * bytes.  This distance may span populated buffers, it may span nothing.
  * This is clearing a region on the L2ARC device ready for writing.
  * If the 'all' boolean is set, every buffer is evicted.
  *
  *	dev		cache device whose buflist is trimmed
  *	distance	how far ahead of the write hand to clear
  *	all		evict everything, ignoring the address window
  *
  * On completion the device's l2ad_evict is set to the target address and
  * the vdev space accounting is reduced by the distance l2ad_evict moved.
  */
 static void
 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
 {
 	list_t *buflist;
 	l2arc_buf_hdr_t *abl2;
 	arc_buf_hdr_t *ab, *ab_prev;
 	kmutex_t *hash_lock;
 	uint64_t taddr;
 
 	buflist = dev->l2ad_buflist;
 
 	if (buflist == NULL)
 		return;
 
 	if (!all && dev->l2ad_first) {
 		/*
 		 * This is the first sweep through the device.  There is
 		 * nothing to evict.
 		 */
 		return;
 	}
 
 	if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
 		/*
 		 * When nearing the end of the device, evict to the end
 		 * before the device write hand jumps to the start.
 		 */
 		taddr = dev->l2ad_end;
 	} else {
 		taddr = dev->l2ad_hand + distance;
 	}
 	DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
 	    uint64_t, taddr, boolean_t, all);
 
 top:
 	/* The scan restarts from the list tail after every lock miss. */
 	mutex_enter(&l2arc_buflist_mtx);
 	for (ab = list_tail(buflist); ab; ab = ab_prev) {
 		/* Saved first: 'ab' may be removed or destroyed below. */
 		ab_prev = list_prev(buflist, ab);
 
 		hash_lock = HDR_LOCK(ab);
 		if (!mutex_tryenter(hash_lock)) {
 			/*
 			 * Missed the hash lock.  Retry.
 			 */
 			ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
 			mutex_exit(&l2arc_buflist_mtx);
 			/*
 			 * Block on the hash lock without holding the buflist
 			 * lock, release it at once, then start over.
 			 */
 			mutex_enter(hash_lock);
 			mutex_exit(hash_lock);
 			goto top;
 		}
 
 		if (HDR_L2_WRITE_HEAD(ab)) {
 			/*
 			 * We hit a write head node.  Leave it for
 			 * l2arc_write_done().
 			 */
 			list_remove(buflist, ab);
 			mutex_exit(hash_lock);
 			continue;
 		}
 
 		if (!all && ab->b_l2hdr != NULL &&
 		    (ab->b_l2hdr->b_daddr > taddr ||
 		    ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
 			/*
 			 * We've evicted to the target address,
 			 * or the end of the device.
 			 */
 			mutex_exit(hash_lock);
 			break;
 		}
 
 		if (HDR_FREE_IN_PROGRESS(ab)) {
 			/*
 			 * Already on the path to destruction.
 			 */
 			mutex_exit(hash_lock);
 			continue;
 		}
 
 		if (ab->b_state == arc_l2c_only) {
 			ASSERT(!HDR_L2_READING(ab));
 			/*
 			 * This doesn't exist in the ARC.  Destroy.
 			 * arc_hdr_destroy() will call list_remove()
 			 * and decrement arcstat_l2_size.
 			 */
 			arc_change_state(arc_anon, ab, hash_lock);
 			arc_hdr_destroy(ab);
 		} else {
 			/*
 			 * Invalidate issued or about to be issued
 			 * reads, since we may be about to write
 			 * over this location.
 			 */
 			if (HDR_L2_READING(ab)) {
 				ARCSTAT_BUMP(arcstat_l2_evict_reading);
 				ab->b_flags |= ARC_L2_EVICTED;
 			}
 
 			/*
 			 * Tell ARC this no longer exists in L2ARC.
 			 */
 			if (ab->b_l2hdr != NULL) {
 				abl2 = ab->b_l2hdr;
 				ab->b_l2hdr = NULL;
 				kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
 				ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
 			}
 			list_remove(buflist, ab);
 
 			/*
 			 * This may have been leftover after a
 			 * failed write.
 			 */
 			ab->b_flags &= ~ARC_L2_WRITING;
 		}
 		mutex_exit(hash_lock);
 	}
 	mutex_exit(&l2arc_buflist_mtx);
 
 	/* Give back the space between the old evict mark and the target. */
 	vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
 	dev->l2ad_evict = taddr;
 }
 
 /*
  * Find and write ARC buffers to the L2ARC device.
  *
  * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
  * for reading until they have completed writing.
  *
  *	spa		pool whose buffers are eligible (matched by load guid)
  *	dev		cache device to write to
  *	target_sz	upper bound on how much to write in this pass
  *
  * Returns the amount written, accumulated from the device-aligned size of
  * each buffer, or 0 if no eligible buffer was found.  The function waits
  * for the issued writes to complete before returning.
  */
 static uint64_t
 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 {
 	arc_buf_hdr_t *ab, *ab_prev, *head;
 	l2arc_buf_hdr_t *hdrl2;
 	list_t *list;
 	uint64_t passed_sz, write_sz, buf_sz, headroom;
 	void *buf_data;
 	kmutex_t *hash_lock, *list_lock;
 	boolean_t have_lock, full;
 	l2arc_write_callback_t *cb;
 	zio_t *pio, *wzio;
 	uint64_t guid = spa_load_guid(spa);
 	int try;
 
 	ASSERT(dev->l2ad_vdev != NULL);
 
 	pio = NULL;
 	write_sz = 0;
 	full = B_FALSE;
 	/*
 	 * The dummy head is allocated up front but only linked into the
 	 * device buflist once the first eligible buffer is found.
 	 */
 	head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
 	head->b_flags |= ARC_L2_WRITE_HEAD;
 
 	ARCSTAT_BUMP(arcstat_l2_write_buffer_iter);
 	/*
 	 * Copy buffers for L2ARC writing.
 	 */
 	mutex_enter(&l2arc_buflist_mtx);
 	for (try = 0; try < 2 * ARC_BUFC_NUMLISTS; try++) {
 		/* Returns with list_lock held; released after the inner loop. */
 		list = l2arc_list_locked(try, &list_lock);
 		passed_sz = 0;
 		ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter);
 
 		/*
 		 * L2ARC fast warmup.
 		 *
 		 * Until the ARC is warm and starts to evict, read from the
 		 * head of the ARC lists rather than the tail.
 		 */
 		headroom = target_sz * l2arc_headroom;
 		if (arc_warm == B_FALSE)
 			ab = list_head(list);
 		else
 			ab = list_tail(list);
 		if (ab == NULL)
 			ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter);
 
 		for (; ab; ab = ab_prev) {
 			/* 'ab_prev' is the next header in scan direction. */
 			if (arc_warm == B_FALSE)
 				ab_prev = list_next(list, ab);
 			else
 				ab_prev = list_prev(list, ab);
 			ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, ab->b_size);
 
 			hash_lock = HDR_LOCK(ab);
 			/*
 			 * NOTE(review): have_lock only decides whether the
 			 * trylock is attempted; every mutex_exit(hash_lock)
 			 * below is unconditional.
 			 */
 			have_lock = MUTEX_HELD(hash_lock);
 			if (!have_lock && !mutex_tryenter(hash_lock)) {
 				ARCSTAT_BUMP(arcstat_l2_write_trylock_fail);
 				/*
 				 * Skip this buffer rather than waiting.
 				 */
 				continue;
 			}
 
 			passed_sz += ab->b_size;
 			if (passed_sz > headroom) {
 				/*
 				 * Searched too far.
 				 */
 				mutex_exit(hash_lock);
 				ARCSTAT_BUMP(arcstat_l2_write_passed_headroom);
 				break;
 			}
 
 			if (!l2arc_write_eligible(guid, ab)) {
 				mutex_exit(hash_lock);
 				continue;
 			}
 
 			/* This check uses b_size, before any device alignment. */
 			if ((write_sz + ab->b_size) > target_sz) {
 				full = B_TRUE;
 				mutex_exit(hash_lock);
 				ARCSTAT_BUMP(arcstat_l2_write_full);
 				break;
 			}
 
 			if (pio == NULL) {
 				/*
 				 * Insert a dummy header on the buflist so
 				 * l2arc_write_done() can find where the
 				 * write buffers begin without searching.
 				 */
 				list_insert_head(dev->l2ad_buflist, head);
 
 				cb = kmem_alloc(
 				    sizeof (l2arc_write_callback_t), KM_SLEEP);
 				cb->l2wcb_dev = dev;
 				cb->l2wcb_head = head;
 				pio = zio_root(spa, l2arc_write_done, cb,
 				    ZIO_FLAG_CANFAIL);
 				ARCSTAT_BUMP(arcstat_l2_write_pios);
 			}
 
 			/*
 			 * Create and add a new L2ARC header.
 			 */
 			hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
 			hdrl2->b_dev = dev;
 			hdrl2->b_daddr = dev->l2ad_hand;
 
 			ab->b_flags |= ARC_L2_WRITING;
 			ab->b_l2hdr = hdrl2;
 			list_insert_head(dev->l2ad_buflist, ab);
 			/* Capture data pointer and size while the hash lock is held. */
 			buf_data = ab->b_buf->b_data;
 			buf_sz = ab->b_size;
 
 			/*
 			 * Compute and store the buffer cksum before
 			 * writing.  On debug the cksum is verified first.
 			 */
 			arc_cksum_verify(ab->b_buf);
 			arc_cksum_compute(ab->b_buf, B_TRUE);
 
 			mutex_exit(hash_lock);
 
 			wzio = zio_write_phys(pio, dev->l2ad_vdev,
 			    dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
 			    NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
 			    ZIO_FLAG_CANFAIL, B_FALSE);
 
 			DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
 			    zio_t *, wzio);
 			(void) zio_nowait(wzio);
 
 			/*
 			 * Keep the clock hand suitably device-aligned.
 			 */
 			buf_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
 
 			/* Both totals advance by the aligned size. */
 			write_sz += buf_sz;
 			dev->l2ad_hand += buf_sz;
 		}
 
 		mutex_exit(list_lock);
 
 		if (full == B_TRUE)
 			break;
 	}
 	mutex_exit(&l2arc_buflist_mtx);
 
 	if (pio == NULL) {
 		/* Nothing was eligible: the head was never linked in. */
 		ASSERT0(write_sz);
 		kmem_cache_free(hdr_cache, head);
 		return (0);
 	}
 
 	ASSERT3U(write_sz, <=, target_sz);
 	ARCSTAT_BUMP(arcstat_l2_writes_sent);
 	ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz);
 	ARCSTAT_INCR(arcstat_l2_size, write_sz);
 	vdev_space_update(dev->l2ad_vdev, write_sz, 0, 0);
 
 	/*
 	 * Bump device hand to the device start if it is approaching the end.
 	 * l2arc_evict() will already have evicted ahead for this case.
 	 */
 	if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
 		vdev_space_update(dev->l2ad_vdev,
 		    dev->l2ad_end - dev->l2ad_hand, 0, 0);
 		dev->l2ad_hand = dev->l2ad_start;
 		dev->l2ad_evict = dev->l2ad_start;
 		dev->l2ad_first = B_FALSE;
 	}
 
 	/* l2ad_writing is set only for the duration of the wait. */
 	dev->l2ad_writing = B_TRUE;
 	(void) zio_wait(pio);
 	dev->l2ad_writing = B_FALSE;
 
 	return (write_sz);
 }
 
 /*
  * This thread feeds the L2ARC at regular intervals.  This is the beating
  * heart of the L2ARC.
  *
  * The loop runs with l2arc_feed_thr_lock held, sleeping on
  * l2arc_feed_thr_cv between passes, until l2arc_thread_exit becomes
  * non-zero.  On the way out it clears l2arc_thread_exit and broadcasts on
  * the cv, which is the condition l2arc_stop() waits for.
  */
 static void
 l2arc_feed_thread(void *dummy __unused)
 {
 	callb_cpr_t cpr;
 	l2arc_dev_t *dev;
 	spa_t *spa;
 	uint64_t size, wrote;
 	clock_t begin, next = ddi_get_lbolt();
 
 	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
 
 	mutex_enter(&l2arc_feed_thr_lock);
 
 	while (l2arc_thread_exit == 0) {
 		/* Sleep until the deadline computed by the previous pass. */
 		CALLB_CPR_SAFE_BEGIN(&cpr);
 		(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
 		    next - ddi_get_lbolt());
 		CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
 		/* Default deadline; the paths below may override it. */
 		next = ddi_get_lbolt() + hz;
 
 		/*
 		 * Quick check for L2ARC devices.
 		 */
 		mutex_enter(&l2arc_dev_mtx);
 		if (l2arc_ndev == 0) {
 			mutex_exit(&l2arc_dev_mtx);
 			continue;
 		}
 		mutex_exit(&l2arc_dev_mtx);
 		/* Start-of-pass time, handed to l2arc_write_interval() below. */
 		begin = ddi_get_lbolt();
 
 		/*
 		 * This selects the next l2arc device to write to, and in
 		 * doing so the next spa to feed from: dev->l2ad_spa.   This
 		 * will return NULL if there are now no l2arc devices or if
 		 * they are all faulted.
 		 *
 		 * If a device is returned, its spa's config lock is also
 		 * held to prevent device removal.  l2arc_dev_get_next()
 		 * will grab and release l2arc_dev_mtx.
 		 */
 		if ((dev = l2arc_dev_get_next()) == NULL)
 			continue;
 
 		spa = dev->l2ad_spa;
 		ASSERT(spa != NULL);
 
 		/*
 		 * If the pool is read-only then force the feed thread to
 		 * sleep a little longer.
 		 */
 		if (!spa_writeable(spa)) {
 			next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
 			spa_config_exit(spa, SCL_L2ARC, dev);
 			continue;
 		}
 
 		/*
 		 * Avoid contributing to memory pressure.
 		 */
 		if (arc_reclaim_needed()) {
 			ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
 			spa_config_exit(spa, SCL_L2ARC, dev);
 			continue;
 		}
 
 		ARCSTAT_BUMP(arcstat_l2_feeds);
 
 		/* Size of this pass; used for both eviction and writing. */
 		size = l2arc_write_size(dev);
 
 		/*
 		 * Evict L2ARC buffers that will be overwritten.
 		 */
 		l2arc_evict(dev, size, B_FALSE);
 
 		/*
 		 * Write ARC buffers.
 		 */
 		wrote = l2arc_write_buffers(spa, dev, size);
 
 		/*
 		 * Calculate interval between writes.
 		 */
 		next = l2arc_write_interval(begin, size, wrote);
 		spa_config_exit(spa, SCL_L2ARC, dev);
 	}
 
 	/* Acknowledge the exit request to the waiter on the cv. */
 	l2arc_thread_exit = 0;
 	cv_broadcast(&l2arc_feed_thr_cv);
 	CALLB_CPR_EXIT(&cpr);		/* drops l2arc_feed_thr_lock */
 	thread_exit();
 }
 
 /*
  * Report whether the given vdev is currently registered on the global
  * L2ARC device list.  The list is walked under l2arc_dev_mtx.
  */
 boolean_t
 l2arc_vdev_present(vdev_t *vd)
 {
 	l2arc_dev_t *cur;
 	boolean_t found = B_FALSE;
 
 	mutex_enter(&l2arc_dev_mtx);
 	cur = list_head(l2arc_dev_list);
 	while (cur != NULL) {
 		if (cur->l2ad_vdev == vd) {
 			found = B_TRUE;
 			break;
 		}
 		cur = list_next(l2arc_dev_list, cur);
 	}
 	mutex_exit(&l2arc_dev_mtx);
 
 	return (found);
 }
 
 /*
  * Register a vdev with the L2ARC.  The caller (the spa) has already
  * validated and opened the vdev, and it must not be registered yet.
  */
 void
 l2arc_add_vdev(spa_t *spa, vdev_t *vd)
 {
 	l2arc_dev_t *newdev;
 
 	ASSERT(!l2arc_vdev_present(vd));
 
 	/* Build a zero-filled device entry. */
 	newdev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
 
 	/* Owner, backing vdev and the current write tunables. */
 	newdev->l2ad_spa = spa;
 	newdev->l2ad_vdev = vd;
 	newdev->l2ad_write = l2arc_write_max;
 	newdev->l2ad_boost = l2arc_write_boost;
 
 	/* Device address range; write and evict hands begin at the start. */
 	newdev->l2ad_start = VDEV_LABEL_START_SIZE;
 	newdev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
 	newdev->l2ad_evict = newdev->l2ad_start;
 	newdev->l2ad_hand = newdev->l2ad_start;
 	newdev->l2ad_writing = B_FALSE;
 	newdev->l2ad_first = B_TRUE;
 	ASSERT3U(newdev->l2ad_write, >, 0);
 
 	/*
 	 * Per-device list of every ARC buffer header that is still valid
 	 * on this device, linked through b_l2node.
 	 */
 	newdev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
 	list_create(newdev->l2ad_buflist, sizeof (arc_buf_hdr_t),
 	    offsetof(arc_buf_hdr_t, b_l2node));
 
 	vdev_space_update(vd, 0, 0, newdev->l2ad_end - newdev->l2ad_hand);
 
 	/* Publish the device on the global list and count it. */
 	mutex_enter(&l2arc_dev_mtx);
 	list_insert_head(l2arc_dev_list, newdev);
 	atomic_inc_64(&l2arc_ndev);
 	mutex_exit(&l2arc_dev_mtx);
 }
 
 /*
  * Unregister a vdev from the L2ARC: unlink its device entry from the
  * global list, flush everything cached on it, and free the entry.
  */
 void
 l2arc_remove_vdev(vdev_t *vd)
 {
 	l2arc_dev_t *cur, *victim = NULL;
 
 	/* Locate the device entry that wraps this vdev. */
 	mutex_enter(&l2arc_dev_mtx);
 	for (cur = list_head(l2arc_dev_list); cur != NULL;
 	    cur = list_next(l2arc_dev_list, cur)) {
 		if (cur->l2ad_vdev == vd) {
 			victim = cur;
 			break;
 		}
 	}
 	ASSERT(victim != NULL);
 
 	/* Unlink it from the global list while still holding the mutex. */
 	list_remove(l2arc_dev_list, victim);
 	l2arc_dev_last = NULL;		/* may have been invalidated */
 	atomic_dec_64(&l2arc_ndev);
 	mutex_exit(&l2arc_dev_mtx);
 
 	/*
 	 * L2ARC device flush: drop all buflist entries and ARC references,
 	 * then release the list and the device entry itself.
 	 */
 	l2arc_evict(victim, 0, B_TRUE);
 	list_destroy(victim->l2ad_buflist);
 	kmem_free(victim->l2ad_buflist, sizeof (list_t));
 	kmem_free(victim, sizeof (l2arc_dev_t));
 }
 
 /*
  * Set up L2ARC global state: the global lists, the locks and condition
  * variable, and the counters and feed-thread exit flag.
  */
 void
 l2arc_init(void)
 {
 	/* Global lists. */
 	l2arc_dev_list = &L2ARC_dev_list;
 	list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
 	    offsetof(l2arc_dev_t, l2ad_node));
 	l2arc_free_on_write = &L2ARC_free_on_write;
 	list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
 	    offsetof(l2arc_data_free_t, l2df_list_node));
 
 	/* Locks and the feed thread's condition variable. */
 	mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
 
 	/* Counters and state flags start from zero. */
 	l2arc_writes_done = 0;
 	l2arc_writes_sent = 0;
 	l2arc_ndev = 0;
 	l2arc_thread_exit = 0;
 }
 
 /*
  * Tear down the global state created by l2arc_init().
  *
  * Reached via spa_fini() -> dmu_fini(), so every l2arc device is assumed
  * to have been removed already, together with its pool.
  */
 void
 l2arc_fini(void)
 {
 	/* Process whatever is still queued on the free-on-write list. */
 	l2arc_do_free_on_write();
 
 	/* Global lists. */
 	list_destroy(l2arc_free_on_write);
 	list_destroy(l2arc_dev_list);
 
 	/* Condition variable and locks. */
 	cv_destroy(&l2arc_feed_thr_cv);
 	mutex_destroy(&l2arc_free_on_write_mtx);
 	mutex_destroy(&l2arc_buflist_mtx);
 	mutex_destroy(&l2arc_dev_mtx);
 	mutex_destroy(&l2arc_feed_thr_lock);
 }
 
 /*
  * Launch the L2ARC feed thread.  Nothing is started unless the global spa
  * mode includes FWRITE.
  */
 void
 l2arc_start(void)
 {
 	if (spa_mode_global & FWRITE) {
 		(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0,
 		    &p0, TS_RUN, minclsyspri);
 	}
 }
 
 /*
  * Ask the L2ARC feed thread to exit and wait until it acknowledges by
  * clearing l2arc_thread_exit.  A no-op unless the global spa mode
  * includes FWRITE (the same condition under which the thread is started).
  */
 void
 l2arc_stop(void)
 {
 	if (spa_mode_global & FWRITE) {
 		mutex_enter(&l2arc_feed_thr_lock);
 		/* kick thread out of startup */
 		cv_signal(&l2arc_feed_thr_cv);
 		l2arc_thread_exit = 1;
 		while (l2arc_thread_exit != 0) {
 			cv_wait(&l2arc_feed_thr_cv,
 			    &l2arc_feed_thr_lock);
 		}
 		mutex_exit(&l2arc_feed_thr_lock);
 	}
 }
Index: user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
===================================================================
--- user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c	(revision 247191)
+++ user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c	(revision 247192)
@@ -1,1849 +1,1848 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/dmu.h>
 #include <sys/dmu_impl.h>
 #include <sys/dmu_tx.h>
 #include <sys/dbuf.h>
 #include <sys/dnode.h>
 #include <sys/zfs_context.h>
 #include <sys/dmu_objset.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_synctask.h>
 #include <sys/dsl_prop.h>
 #include <sys/dmu_zfetch.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zap.h>
 #include <sys/zio_checksum.h>
 #include <sys/zio_compress.h>
 #include <sys/sa.h>
 #ifdef _KERNEL
 #include <sys/zfs_znode.h>
 #endif
 
 /*
  * Enable/disable nopwrite feature.
  *
  * Defaults to 1 (enabled).  The value is exported as the
  * "vfs.zfs.nopwrite_enabled" tunable and as a sysctl node under vfs.zfs
  * declared with CTLFLAG_RDTUN.
  */
 int zfs_nopwrite_enabled = 1;
 SYSCTL_DECL(_vfs_zfs);
 TUNABLE_INT("vfs.zfs.nopwrite_enabled", &zfs_nopwrite_enabled);
 SYSCTL_INT(_vfs_zfs, OID_AUTO, nopwrite_enabled, CTLFLAG_RDTUN,
     &zfs_nopwrite_enabled, 0, "Enable nopwrite feature");
 
 /*
  * Per-object-type description table with DMU_OT_NUMTYPES entries.  Each
  * entry gives a DMU_BSWAP_* byteswap class, a boolean flag, and a
  * human-readable type name.
  *
  * NOTE(review): the boolean is presumably the "is metadata" flag and the
  * rows presumably follow the dmu_object_type_t enum order -- confirm
  * against the dmu_object_type_info_t declaration before adding rows.
  */
 const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
 	{	DMU_BSWAP_UINT8,	TRUE,	"unallocated"		},
 	{	DMU_BSWAP_ZAP,		TRUE,	"object directory"	},
 	{	DMU_BSWAP_UINT64,	TRUE,	"object array"		},
 	{	DMU_BSWAP_UINT8,	TRUE,	"packed nvlist"		},
 	{	DMU_BSWAP_UINT64,	TRUE,	"packed nvlist size"	},
 	{	DMU_BSWAP_UINT64,	TRUE,	"bpobj"			},
 	{	DMU_BSWAP_UINT64,	TRUE,	"bpobj header"		},
 	{	DMU_BSWAP_UINT64,	TRUE,	"SPA space map header"	},
 	{	DMU_BSWAP_UINT64,	TRUE,	"SPA space map"		},
 	{	DMU_BSWAP_UINT64,	TRUE,	"ZIL intent log"	},
 	{	DMU_BSWAP_DNODE,	TRUE,	"DMU dnode"		},
 	{	DMU_BSWAP_OBJSET,	TRUE,	"DMU objset"		},
 	{	DMU_BSWAP_UINT64,	TRUE,	"DSL directory"		},
 	{	DMU_BSWAP_ZAP,		TRUE,	"DSL directory child map"},
 	{	DMU_BSWAP_ZAP,		TRUE,	"DSL dataset snap map"	},
 	{	DMU_BSWAP_ZAP,		TRUE,	"DSL props"		},
 	{	DMU_BSWAP_UINT64,	TRUE,	"DSL dataset"		},
 	{	DMU_BSWAP_ZNODE,	TRUE,	"ZFS znode"		},
 	{	DMU_BSWAP_OLDACL,	TRUE,	"ZFS V0 ACL"		},
 	{	DMU_BSWAP_UINT8,	FALSE,	"ZFS plain file"	},
 	{	DMU_BSWAP_ZAP,		TRUE,	"ZFS directory"		},
 	{	DMU_BSWAP_ZAP,		TRUE,	"ZFS master node"	},
 	{	DMU_BSWAP_ZAP,		TRUE,	"ZFS delete queue"	},
 	{	DMU_BSWAP_UINT8,	FALSE,	"zvol object"		},
 	{	DMU_BSWAP_ZAP,		TRUE,	"zvol prop"		},
 	{	DMU_BSWAP_UINT8,	FALSE,	"other uint8[]"		},
 	{	DMU_BSWAP_UINT64,	FALSE,	"other uint64[]"	},
 	{	DMU_BSWAP_ZAP,		TRUE,	"other ZAP"		},
 	{	DMU_BSWAP_ZAP,		TRUE,	"persistent error log"	},
 	{	DMU_BSWAP_UINT8,	TRUE,	"SPA history"		},
 	{	DMU_BSWAP_UINT64,	TRUE,	"SPA history offsets"	},
 	{	DMU_BSWAP_ZAP,		TRUE,	"Pool properties"	},
 	{	DMU_BSWAP_ZAP,		TRUE,	"DSL permissions"	},
 	{	DMU_BSWAP_ACL,		TRUE,	"ZFS ACL"		},
 	{	DMU_BSWAP_UINT8,	TRUE,	"ZFS SYSACL"		},
 	{	DMU_BSWAP_UINT8,	TRUE,	"FUID table"		},
 	{	DMU_BSWAP_UINT64,	TRUE,	"FUID table size"	},
 	{	DMU_BSWAP_ZAP,		TRUE,	"DSL dataset next clones"},
 	{	DMU_BSWAP_ZAP,		TRUE,	"scan work queue"	},
 	{	DMU_BSWAP_ZAP,		TRUE,	"ZFS user/group used"	},
 	{	DMU_BSWAP_ZAP,		TRUE,	"ZFS user/group quota"	},
 	{	DMU_BSWAP_ZAP,		TRUE,	"snapshot refcount tags"},
 	{	DMU_BSWAP_ZAP,		TRUE,	"DDT ZAP algorithm"	},
 	{	DMU_BSWAP_ZAP,		TRUE,	"DDT statistics"	},
 	{	DMU_BSWAP_UINT8,	TRUE,	"System attributes"	},
 	{	DMU_BSWAP_ZAP,		TRUE,	"SA master node"	},
 	{	DMU_BSWAP_ZAP,		TRUE,	"SA attr registration"	},
 	{	DMU_BSWAP_ZAP,		TRUE,	"SA attr layouts"	},
 	{	DMU_BSWAP_ZAP,		TRUE,	"scan translations"	},
 	{	DMU_BSWAP_UINT8,	FALSE,	"deduplicated block"	},
 	{	DMU_BSWAP_ZAP,		TRUE,	"DSL deadlist map"	},
 	{	DMU_BSWAP_UINT64,	TRUE,	"DSL deadlist map hdr"	},
 	{	DMU_BSWAP_ZAP,		TRUE,	"DSL dir clones"	},
 	{	DMU_BSWAP_UINT64,	TRUE,	"bpobj subobj"		}
 };
 
 /*
  * Byteswap routines, one entry per byteswap class (DMU_BSWAP_NUMFUNCS in
  * total).  Each entry pairs the swap function with a short name.
  *
  * NOTE(review): rows presumably follow the DMU_BSWAP_* enum order used by
  * dmu_ot[] -- confirm against the enum before reordering.
  */
 const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
 	{	byteswap_uint8_array,	"uint8"		},
 	{	byteswap_uint16_array,	"uint16"	},
 	{	byteswap_uint32_array,	"uint32"	},
 	{	byteswap_uint64_array,	"uint64"	},
 	{	zap_byteswap,		"zap"		},
 	{	dnode_buf_byteswap,	"dnode"		},
 	{	dmu_objset_byteswap,	"objset"	},
 	{	zfs_znode_byteswap,	"znode"		},
 	{	zfs_oldacl_byteswap,	"oldacl"	},
 	{	zfs_acl_byteswap,	"acl"		}
 };
 
 /*
  * Hold and read the dbuf that covers `offset' within `object'.
  *
  * On success returns 0 and stores the buffer, held with `tag', in *dbp.
  * On failure returns the error from dnode_hold() or dbuf_read(), or EIO
  * when no dbuf could be held; *dbp is set to NULL when the failure occurs
  * after the dnode was held.  DMU_READ_NO_PREFETCH in `flags' suppresses
  * prefetch on the read.
  */
 int
 dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
     void *tag, dmu_buf_t **dbp, int flags)
 {
 	dnode_t *dn;
 	dmu_buf_impl_t *db;
 	uint64_t blkid;
 	int db_flags;
 	int err;
 
 	db_flags = (flags & DMU_READ_NO_PREFETCH) ?
 	    (DB_RF_CANFAIL | DB_RF_NOPREFETCH) : DB_RF_CANFAIL;
 
 	if ((err = dnode_hold(os, object, FTAG, &dn)) != 0)
 		return (err);
 
 	/* Look up and hold the dbuf under the dnode's struct lock. */
 	blkid = dbuf_whichblock(dn, offset);
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	db = dbuf_hold(dn, blkid, tag);
 	rw_exit(&dn->dn_struct_rwlock);
 
 	if (db != NULL) {
 		err = dbuf_read(db, NULL, db_flags);
 		if (err != 0) {
 			/* The read failed: give the hold back. */
 			dbuf_rele(db, tag);
 			db = NULL;
 		}
 	} else {
 		err = EIO;
 	}
 
 	dnode_rele(dn, FTAG);
 	*dbp = &db->db; /* NULL db plus first field offset is NULL */
 	return (err);
 }
 
 /*
  * Return the largest bonus buffer length, DN_MAX_BONUSLEN.
  */
 int
 dmu_bonus_max(void)
 {
 	int max_bonuslen = DN_MAX_BONUSLEN;
 
 	return (max_bonuslen);
 }
 
 /*
  * Set the bonus length of the dnode behind the bonus buffer `db_fake'.
  *
  * Returns EINVAL when `db_fake' is not the dnode's bonus buffer or when
  * `newsize' is negative or larger than the buffer's db_size; otherwise
  * applies the new length in `tx' and returns 0.
  */
 int
 dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	dnode_t *dn;
 	int error = 0;
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 
 	if (dn->dn_bonus != db ||
 	    newsize < 0 || newsize > db_fake->db_size)
 		error = EINVAL;
 	else
 		dnode_setbonuslen(dn, newsize, tx);
 
 	DB_DNODE_EXIT(db);
 	return (error);
 }
 
 /*
  * Set the bonus type of the dnode behind the bonus buffer `db_fake'.
  *
  * Returns EINVAL when `type' fails DMU_OT_IS_VALID() or when `db_fake' is
  * not the dnode's bonus buffer; otherwise applies the type in `tx' and
  * returns 0.
  */
 int
 dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	dnode_t *dn;
 	int error = 0;
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 
 	if (!DMU_OT_IS_VALID(type) || dn->dn_bonus != db)
 		error = EINVAL;
 	else
 		dnode_setbonus_type(dn, type, tx);
 
 	DB_DNODE_EXIT(db);
 	return (error);
 }
 
 /*
  * Return the bonus type recorded in the dnode behind `db_fake'.  The
  * field is read between DB_DNODE_ENTER() and DB_DNODE_EXIT().
  */
 dmu_object_type_t
 dmu_get_bonustype(dmu_buf_t *db_fake)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	dmu_object_type_t bonustype;
 	dnode_t *dn;
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	bonustype = dn->dn_bonustype;
 	DB_DNODE_EXIT(db);
 
 	return (bonustype);
 }
 
 /*
  * Remove the spill block of `object' in transaction `tx'.
  *
  * Returns 0 on success, or the error from dnode_hold() when the object's
  * dnode cannot be held; in that case nothing is modified.
  */
 int
 dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	int error;
 
 	error = dnode_hold(os, object, FTAG, &dn);
 	if (error != 0) {
 		/* dn was not set up by a failed hold; do not touch it. */
 		return (error);
 	}
 
 	dbuf_rm_spill(dn, tx);
 	/* The dnode-level spill removal runs under the struct lock as writer. */
 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 	dnode_rm_spill(dn, tx);
 	rw_exit(&dn->dn_struct_rwlock);
 	dnode_rele(dn, FTAG);
 	return (0);
 }
 
 /*
  * returns ENOENT, EIO, or 0.
  *
  * Hold the bonus buffer of `object' with `tag' and return it in *dbp,
  * creating the bonus dbuf first if the dnode does not have one yet.  The
  * only error path visible here is a failing dnode_hold(); the read of the
  * bonus buffer is VERIFY'd to succeed.
  */
 int
 dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
 {
 	dnode_t *dn;
 	dmu_buf_impl_t *db;
 	int error;
 
 	error = dnode_hold(os, object, FTAG, &dn);
 	if (error)
 		return (error);
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	if (dn->dn_bonus == NULL) {
 		/*
 		 * Re-take the lock as writer to create the bonus dbuf.
 		 * The lock is dropped in between, so re-check dn_bonus.
 		 */
 		rw_exit(&dn->dn_struct_rwlock);
 		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 		if (dn->dn_bonus == NULL)
 			dbuf_create_bonus(dn);
 	}
 	db = dn->dn_bonus;
 
 	/* as long as the bonus buf is held, the dnode will be held */
 	if (refcount_add(&db->db_holds, tag) == 1) {
 		VERIFY(dnode_add_ref(dn, db));
 		(void) atomic_inc_32_nv(&dn->dn_dbufs_count);
 	}
 
 	/*
 	 * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's
 	 * hold and incrementing the dbuf count to ensure that dnode_move() sees
 	 * a dnode hold for every dbuf.
 	 */
 	rw_exit(&dn->dn_struct_rwlock);
 
 	/* Drop the FTAG hold taken by dnode_hold() above. */
 	dnode_rele(dn, FTAG);
 
 	VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH));
 
 	*dbp = &db->db;
 	return (0);
 }
 
 /*
  * returns ENOENT, EIO, or 0.
  *
  * This interface will allocate a blank spill dbuf when a spill blk
  * doesn't already exist on the dnode.
  *
  * if you only want to find an already existing spill db, then
  * dmu_spill_hold_existing() should be used.
  *
  * `flags' are DB_RF_* flags passed on to dbuf_read().  When
  * DB_RF_HAVESTRUCT is set, the caller already holds dn_struct_rwlock and
  * it is not taken here.  On success the spill buffer, held with `tag', is
  * stored in *dbp.  If no dbuf can be held, *dbp is set to NULL and EIO is
  * returned.  If the read fails, the hold is released and the read error
  * is returned.
  */
 int
 dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp)
 {
 	dmu_buf_impl_t *db = NULL;
 	int err;
 
 	if ((flags & DB_RF_HAVESTRUCT) == 0)
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 
 	db = dbuf_hold(dn, DMU_SPILL_BLKID, tag);
 
 	if ((flags & DB_RF_HAVESTRUCT) == 0)
 		rw_exit(&dn->dn_struct_rwlock);
 
 	/*
 	 * dbuf_hold() can return NULL (dmu_buf_hold() handles that case
 	 * too).  Fail with EIO instead of relying on an ASSERT that is
 	 * compiled out of non-debug builds.
 	 */
 	if (db == NULL) {
 		*dbp = NULL;
 		return (EIO);
 	}
 
 	err = dbuf_read(db, NULL, flags);
 	if (err == 0)
 		*dbp = &db->db;
 	else
 		dbuf_rele(db, tag);
 	return (err);
 }
 
 /*
  * Hold the spill block of the dnode behind the bonus buffer `bonus', but
  * only if it already exists.
  *
  * Returns EINVAL when the pool's spa version is below SPA_VERSION_SA,
  * ENOENT when the dnode has no spill block, and otherwise the result of
  * dmu_spill_hold_by_dnode().
  */
 int
 dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
 	dnode_t *dn;
 	int err;
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 
 	if (spa_version(dn->dn_objset->os_spa) >= SPA_VERSION_SA) {
 		/* Keep the struct lock across the check and the hold. */
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 
 		if (dn->dn_have_spill) {
 			err = dmu_spill_hold_by_dnode(dn,
 			    DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp);
 		} else {
 			err = ENOENT;
 		}
 
 		rw_exit(&dn->dn_struct_rwlock);
 	} else {
 		err = EINVAL;
 	}
 
 	DB_DNODE_EXIT(db);
 	return (err);
 }
 
 /*
  * Hold the spill block of the dnode behind the bonus buffer `bonus'.
  * Returns whatever dmu_spill_hold_by_dnode() returns when called with
  * DB_RF_CANFAIL.
  */
 int
 dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
 {
 	dmu_buf_impl_t *bonus_db = (dmu_buf_impl_t *)bonus;
 	dnode_t *dn;
 	int result;
 
 	DB_DNODE_ENTER(bonus_db);
 	dn = DB_DNODE(bonus_db);
 	result = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp);
 	DB_DNODE_EXIT(bonus_db);
 
 	return (result);
 }
 
 /*
  * Note: longer-term, we should modify all of the dmu_buf_*() interfaces
  * to take a held dnode rather than <os, object> -- the lookup is wasteful,
  * and can induce severe lock contention when writing to several files
  * whose dnodes are in the same block.
  *
  * Hold every dbuf that covers [offset, offset + length) of `dn' with
  * `tag'.  When `read' is non-zero the buffers are also read in, and the
  * function waits until none of them is in DB_READ or DB_FILL state.
  *
  * On success returns 0, stores the number of buffers in *numbufsp and a
  * kmem-allocated array of them in *dbpp.  On failure the holds and the
  * array are released through dmu_buf_rele_array() and an error is
  * returned (EIO when a dbuf cannot be held, ends up DB_UNCACHED, or the
  * access runs past a dnode whose dn_datablkshift is zero).
  */
 static int
 dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
     int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
 {
 	dsl_pool_t *dp = NULL;
 	dmu_buf_t **dbp;
 	uint64_t blkid, nblks, i;
 	uint32_t dbuf_flags;
 	int err;
 	zio_t *zio;
 	hrtime_t start;
 
 	ASSERT(length <= DMU_MAX_ACCESS);
 
 	dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT;
 	if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz)
 		dbuf_flags |= DB_RF_NOPREFETCH;
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	if (dn->dn_datablkshift) {
 		/* Count the blocks spanned by the range. */
 		int blkshift = dn->dn_datablkshift;
 		nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) -
 		    P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift;
 	} else {
 		/* No block shift: the range must fit in dn_datablksz. */
 		if (offset + length > dn->dn_datablksz) {
 			zfs_panic_recover("zfs: accessing past end of object "
 			    "%llx/%llx (size=%u access=%llu+%llu)",
 			    (longlong_t)dn->dn_objset->
 			    os_dsl_dataset->ds_object,
 			    (longlong_t)dn->dn_object, dn->dn_datablksz,
 			    (longlong_t)offset, (longlong_t)length);
 			rw_exit(&dn->dn_struct_rwlock);
 			return (EIO);
 		}
 		nblks = 1;
 	}
 	dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
 
 	if (dn->dn_objset->os_dsl_dataset)
 		dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool;
-	if (dp && dsl_pool_sync_context(dp))
-		start = gethrtime();
+	start = gethrtime();
 	zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 	blkid = dbuf_whichblock(dn, offset);
 	for (i = 0; i < nblks; i++) {
 		dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag);
 		if (db == NULL) {
 			rw_exit(&dn->dn_struct_rwlock);
 			dmu_buf_rele_array(dbp, nblks, tag);
 			zio_nowait(zio);
 			return (EIO);
 		}
 		/* initiate async i/o */
 		if (read)
 			(void) dbuf_read(db, zio, dbuf_flags);
 #ifdef _KERNEL
 		else
 			curthread->td_ru.ru_oublock++;
 #endif
 		dbp[i] = &db->db;
 	}
 	rw_exit(&dn->dn_struct_rwlock);
 
 	/* wait for async i/o */
 	err = zio_wait(zio);
 	/* track read overhead when we are in sync context */
 	if (dp && dsl_pool_sync_context(dp))
 		dp->dp_read_overhead += gethrtime() - start;
 	if (err) {
 		dmu_buf_rele_array(dbp, nblks, tag);
 		return (err);
 	}
 
 	/* wait for other io to complete */
 	if (read) {
 		for (i = 0; i < nblks; i++) {
 			dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
 			mutex_enter(&db->db_mtx);
 			while (db->db_state == DB_READ ||
 			    db->db_state == DB_FILL)
 				cv_wait(&db->db_changed, &db->db_mtx);
 			if (db->db_state == DB_UNCACHED)
 				err = EIO;
 			mutex_exit(&db->db_mtx);
 			if (err) {
 				dmu_buf_rele_array(dbp, nblks, tag);
 				return (err);
 			}
 		}
 	}
 
 	*numbufsp = nblks;
 	*dbpp = dbp;
 	return (0);
 }
 
 /*
  * <os, object> front end for dmu_buf_hold_array_by_dnode(): holds the
  * dnode for the duration of the call and requests DMU_READ_PREFETCH.
  * Returns the dnode_hold() error, or the result of the by-dnode call.
  */
 static int
 dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
     uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
 {
 	dnode_t *dn;
 	int err;
 
 	if ((err = dnode_hold(os, object, FTAG, &dn)) != 0)
 		return (err);
 
 	err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
 	    numbufsp, dbpp, DMU_READ_PREFETCH);
 	dnode_rele(dn, FTAG);
 
 	return (err);
 }
 
 /*
  * Bonus-buffer front end for dmu_buf_hold_array_by_dnode(): resolves the
  * dnode behind `db_fake' and requests DMU_READ_PREFETCH.
  */
 int
 dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset,
     uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
 {
 	dmu_buf_impl_t *bonus_db = (dmu_buf_impl_t *)db_fake;
 	dnode_t *dn;
 	int result;
 
 	DB_DNODE_ENTER(bonus_db);
 	dn = DB_DNODE(bonus_db);
 	result = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
 	    numbufsp, dbpp, DMU_READ_PREFETCH);
 	DB_DNODE_EXIT(bonus_db);
 
 	return (result);
 }
 
 /*
  * Release every non-NULL buffer in a `numbufs'-entry array with `tag',
  * then free the array itself.  Does nothing when `numbufs' is zero.
  */
 void
 dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
 {
 	dmu_buf_impl_t **held = (dmu_buf_impl_t **)dbp_fake;
 	int idx;
 
 	if (numbufs == 0)
 		return;
 
 	idx = 0;
 	while (idx < numbufs) {
 		if (held[idx] != NULL)
 			dbuf_rele(held[idx], tag);
 		idx++;
 	}
 
 	kmem_free(held, sizeof (dmu_buf_t *) * numbufs);
 }
 
 /*
  * Issue prefetches for `object'.  A `len' of zero asks for the block of
  * the meta dnode that contains the object's dnode; otherwise the data
  * blocks covering [offset, offset + len) are prefetched.  Does nothing
  * when zfs_prefetch_disable is set.
  */
 void
 dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
 {
 	dnode_t *dn;
 	uint64_t blkid;
 	int nblks, i, err;
 
 	if (zfs_prefetch_disable)
 		return;
 
 	if (len == 0) {  /* they're interested in the bonus buffer */
 		dn = DMU_META_DNODE(os);
 
 		if (object != 0 && object < DN_MAX_OBJECT) {
 			rw_enter(&dn->dn_struct_rwlock, RW_READER);
 			blkid = dbuf_whichblock(dn,
 			    object * sizeof (dnode_phys_t));
 			dbuf_prefetch(dn, blkid);
 			rw_exit(&dn->dn_struct_rwlock);
 		}
 		return;
 	}
 
 	/*
 	 * XXX - Note, if the dnode for the requested object is not
 	 * already cached, we will do a *synchronous* read in the
 	 * dnode_hold() call.  The same is true for any indirects.
 	 */
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err != 0)
 		return;
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	if (dn->dn_datablkshift == 0) {
 		/* Single block: prefetch it only if offset falls inside. */
 		nblks = (offset < dn->dn_datablksz);
 	} else {
 		int blkshift = dn->dn_datablkshift;
 		nblks = (P2ROUNDUP(offset+len, 1<<blkshift) -
 		    P2ALIGN(offset, 1<<blkshift)) >> blkshift;
 	}
 
 	if (nblks != 0) {
 		blkid = dbuf_whichblock(dn, offset);
 		for (i = 0; i < nblks; i++, blkid++)
 			dbuf_prefetch(dn, blkid);
 	}
 
 	rw_exit(&dn->dn_struct_rwlock);
 
 	dnode_rele(dn, FTAG);
 }
 
 /*
  * Get the next "chunk" of file data to free.  We traverse the file from
  * the end so that the file gets shorter over time (if we crash in the
  * middle, this will leave us in a better state).  We find allocated file
  * data by simply searching the allocated level 1 indirects.
  *
  * On entry *start is the end of the range that remains to be freed and
  * `limit' is its lowest offset.  On return *start has been moved down to
  * the start of the chunk to free next (never below `limit').  Returns 0,
  * or the error from dnode_next_offset(); ESRCH from that call is treated
  * as "nothing more allocated" and moves *start to `limit'.
  */
 static int
 get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t limit)
 {
 	uint64_t len = *start - limit;
 	uint64_t blkcnt = 0;
 	/* Upper bound on the number of L1 indirects visited per chunk. */
 	uint64_t maxblks = DMU_MAX_ACCESS / (1ULL << (dn->dn_indblkshift + 1));
 	/* Bytes of file data mapped by one level 1 indirect block. */
 	uint64_t iblkrange =
 	    dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT);
 
 	ASSERT(limit <= *start);
 
 	/* Small enough to free in one go: take the whole remainder. */
 	if (len <= iblkrange * maxblks) {
 		*start = limit;
 		return (0);
 	}
 	ASSERT(ISP2(iblkrange));
 
 	while (*start > limit && blkcnt < maxblks) {
 		int err;
 
 		/* find next allocated L1 indirect */
 		err = dnode_next_offset(dn,
 		    DNODE_FIND_BACKWARDS, start, 2, 1, 0);
 
 		/* if there are no more, then we are done */
 		if (err == ESRCH) {
 			*start = limit;
 			return (0);
 		} else if (err) {
 			return (err);
 		}
 		blkcnt += 1;
 
 		/* reset offset to end of "next" block back */
 		*start = P2ALIGN(*start, iblkrange);
 		if (*start <= limit)
 			*start = limit;
 		else
 			*start -= 1;
 	}
 	return (0);
 }
 
 /*
  * Free [offset, offset + length) of `dn' in chunks, working backwards
  * from the end of the range, with one assigned transaction per chunk.
  *
  * A `length' of DMU_OBJECT_END means "truncate from offset".  When
  * `free_dnode' is set, the dnode itself is freed in the transaction whose
  * chunk starts at offset 0.  Returns 0, or the first error from
  * get_next_chunk() or dmu_tx_assign().
  */
 static int
 dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
     uint64_t length, boolean_t free_dnode)
 {
 	dmu_tx_t *tx;
 	uint64_t object_size, start, end, len;
 	boolean_t trunc = (length == DMU_OBJECT_END);
 	int align, err;
 
 	/* align is 1 when dn_datablkshift is zero. */
 	align = 1 << dn->dn_datablkshift;
 	ASSERT(align > 0);
 	object_size = align == 1 ? dn->dn_datablksz :
 	    (dn->dn_maxblkid + 1) << dn->dn_datablkshift;
 
 	/* Clamp the range to the object size. */
 	end = offset + length;
 	if (trunc || end > object_size)
 		end = object_size;
 	if (end <= offset)
 		return (0);
 	length = end - offset;
 
 	while (length) {
 		start = end;
 		/* assert(offset <= start) */
 		err = get_next_chunk(dn, &start, offset);
 		if (err)
 			return (err);
 		len = trunc ? DMU_OBJECT_END : end - start;
 
 		tx = dmu_tx_create(os);
 		dmu_tx_hold_free(tx, dn->dn_object, start, len);
 		err = dmu_tx_assign(tx, TXG_WAIT);
 		if (err) {
 			dmu_tx_abort(tx);
 			return (err);
 		}
 
 		dnode_free_range(dn, start, trunc ? -1 : len, tx);
 
 		if (start == 0 && free_dnode) {
 			ASSERT(trunc);
 			dnode_free(dn, tx);
 		}
 
 		length -= end - start;
 
 		dmu_tx_commit(tx);
 		/* The next chunk ends where this one began. */
 		end = start;
 	}
 	return (0);
 }
 
 /*
  * Free [offset, offset + length) of `object' without freeing its dnode.
  * Returns the dnode_hold() error, or the result of
  * dmu_free_long_range_impl().
  */
 int
 dmu_free_long_range(objset_t *os, uint64_t object,
     uint64_t offset, uint64_t length)
 {
 	dnode_t *dn;
 	int err;
 
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err == 0) {
 		err = dmu_free_long_range_impl(os, dn, offset, length,
 		    FALSE);
 		dnode_rele(dn, FTAG);
 	}
 	return (err);
 }
 
 /*
  * Free all data of `object' and the object itself.
  *
  * A dnode with a single level is freed in one transaction here; anything
  * deeper goes through dmu_free_long_range_impl() with free_dnode set.
  * Returns 0 or the first error encountered.
  */
 int
 dmu_free_object(objset_t *os, uint64_t object)
 {
 	dnode_t *dn;
 	dmu_tx_t *tx;
 	int err;
 
 	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED,
 	    FTAG, &dn);
 	if (err != 0)
 		return (err);
 
 	if (dn->dn_nlevels != 1) {
 		err = dmu_free_long_range_impl(os, dn, 0, DMU_OBJECT_END, TRUE);
 	} else {
 		tx = dmu_tx_create(os);
 		dmu_tx_hold_bonus(tx, object);
 		dmu_tx_hold_free(tx, dn->dn_object, 0, DMU_OBJECT_END);
 		err = dmu_tx_assign(tx, TXG_WAIT);
 		if (err != 0) {
 			dmu_tx_abort(tx);
 		} else {
 			dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
 			dnode_free(dn, tx);
 			dmu_tx_commit(tx);
 		}
 	}
 
 	dnode_rele(dn, FTAG);
 	return (err);
 }
 
 /*
  * Free `size' bytes of `object' starting at `offset' in transaction `tx'.
  * A `size' of -1ULL is accepted by the assertion below in place of a
  * bounded size.  Returns the dnode_hold() error, or 0.
  */
 int
 dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
     uint64_t size, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	int err;
 
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err == 0) {
 		ASSERT(offset < UINT64_MAX);
 		ASSERT(size == -1ULL || size <= UINT64_MAX - offset);
 		dnode_free_range(dn, offset, size, tx);
 		dnode_rele(dn, FTAG);
 	}
 	return (err);
 }
 
 /*
  * Copy `size' bytes of `object', starting at `offset', into `buf'.
  * `flags' are DMU_READ_* flags handed to the buffer-array hold.  Returns
  * 0, or the error from dnode_hold() or from holding the buffers.
  */
 int
 dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
     void *buf, uint32_t flags)
 {
 	dnode_t *dn;
 	dmu_buf_t **dbp;
 	int numbufs, err;
 
 	if ((err = dnode_hold(os, object, FTAG, &dn)) != 0)
 		return (err);
 
 	/*
 	 * Objects with a single (possibly odd-sized) block hold no data
 	 * beyond that block: zero-fill whatever part of the request lies
 	 * past it and shrink the read accordingly.  A tail block
 	 * optimization, should one ever exist, would need handling here.
 	 */
 	if (dn->dn_maxblkid == 0) {
 		int newsz;
 
 		if (offset > dn->dn_datablksz)
 			newsz = 0;
 		else
 			newsz = MIN(size, dn->dn_datablksz - offset);
 		bzero((char *)buf + newsz, size - newsz);
 		size = newsz;
 	}
 
 	while (size > 0) {
 		uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
 		int idx;
 
 		/*
 		 * Hold the whole slice at once rather than one block at a
 		 * time so the reads can proceed in parallel.
 		 */
 		err = dmu_buf_hold_array_by_dnode(dn, offset, mylen,
 		    TRUE, FTAG, &numbufs, &dbp, flags);
 		if (err != 0)
 			break;
 
 		for (idx = 0; idx < numbufs; idx++) {
 			dmu_buf_t *db = dbp[idx];
 			int skip;
 			int chunk;
 
 			ASSERT(size > 0);
 
 			/* Position within this buffer, and bytes to take. */
 			skip = offset - db->db_offset;
 			chunk = (int)MIN(db->db_size - skip, size);
 
 			bcopy((char *)db->db_data + skip, buf, chunk);
 
 			buf = (char *)buf + chunk;
 			offset += chunk;
 			size -= chunk;
 		}
 		dmu_buf_rele_array(dbp, numbufs, FTAG);
 	}
 	dnode_rele(dn, FTAG);
 	return (err);
 }
 
 /*
  * Copy `size' bytes from `buf' into `object' at `offset' in transaction
  * `tx'.  A zero `size' is a no-op.  Holding the buffers is VERIFY'd to
  * succeed.
  */
 void
 dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
     const void *buf, dmu_tx_t *tx)
 {
 	dmu_buf_t **dbp;
 	int numbufs, idx;
 
 	if (size == 0)
 		return;
 
 	VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
 	    FALSE, FTAG, &numbufs, &dbp));
 
 	for (idx = 0; idx < numbufs; idx++) {
 		dmu_buf_t *db = dbp[idx];
 		int skip;
 		int chunk;
 		boolean_t whole_buf;
 
 		ASSERT(size > 0);
 
 		/* Position within this buffer, and bytes to store. */
 		skip = offset - db->db_offset;
 		chunk = (int)MIN(db->db_size - skip, size);
 		whole_buf = (chunk == db->db_size);
 
 		/* Only the first and last buffers may be partial. */
 		ASSERT(idx == 0 || idx == numbufs-1 || whole_buf);
 
 		/* A fully overwritten buffer is filled, not dirtied. */
 		if (whole_buf)
 			dmu_buf_will_fill(db, tx);
 		else
 			dmu_buf_will_dirty(db, tx);
 
 		bcopy(buf, (char *)db->db_data + skip, chunk);
 
 		if (whole_buf)
 			dmu_buf_fill_done(db, tx);
 
 		buf = (char *)buf + chunk;
 		offset += chunk;
 		size -= chunk;
 	}
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 }
 
 /*
  * Hold every buffer covering [offset, offset + size) of `object' and mark
  * each one with dmu_buf_will_not_fill() in transaction `tx'.  A zero
  * `size' is a no-op.
  */
 void
 dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
     dmu_tx_t *tx)
 {
 	dmu_buf_t **dbp;
 	int numbufs, idx;
 
 	if (size == 0)
 		return;
 
 	VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
 	    FALSE, FTAG, &numbufs, &dbp));
 
 	idx = 0;
 	while (idx < numbufs) {
 		dmu_buf_will_not_fill(dbp[idx], tx);
 		idx++;
 	}
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 }
 
 /*
  * DMU support for xuio
  *
  * xuio_ksp is the kstat handle for the xuio statistics.  It is NULL until
  * xuio_stat_init() creates it and is reset to NULL by xuio_stat_fini().
  */
 kstat_t *xuio_ksp = NULL;
 
 /*
  * Prepare `xuio' to carry `nblk' loaned buffers: allocate the iovec array
  * and the private bookkeeping (buffer array plus a saved pointer to the
  * iovec array), and add `nblk' to the matching on-loan counter.  Always
  * returns 0.
  */
 int
 dmu_xuio_init(xuio_t *xuio, int nblk)
 {
 	uio_t *uio = &xuio->xu_uio;
 	dmu_xuio_t *state;
 
 	uio->uio_iovcnt = nblk;
 	uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP);
 
 	state = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP);
 	state->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP);
 	state->iovp = uio->uio_iov;
 	state->cnt = nblk;
 	XUIO_XUZC_PRIV(xuio) = state;
 
 	if (XUIO_XUZC_RW(xuio) != UIO_READ)
 		XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk);
 	else
 		XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk);
 
 	return (0);
 }
 
 /*
  * Undo dmu_xuio_init(): free the saved iovec array, the buffer array and
  * the private state, then subtract the block count from the matching
  * on-loan counter.
  */
 void
 dmu_xuio_fini(xuio_t *xuio)
 {
 	dmu_xuio_t *state = XUIO_XUZC_PRIV(xuio);
 	int nblk = state->cnt;
 
 	kmem_free(state->iovp, nblk * sizeof (iovec_t));
 	kmem_free(state->bufs, nblk * sizeof (arc_buf_t *));
 	kmem_free(state, sizeof (dmu_xuio_t));
 
 	if (XUIO_XUZC_RW(xuio) != UIO_READ)
 		XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk);
 	else
 		XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk);
 }
 
 /*
  * Record { off, n, abuf } in the next free slot: the iovec at index
  * priv->next is pointed `off' bytes into abuf's data with length `n',
  * abuf is stored in priv->bufs at the same index, and priv->next is
  * advanced by one.  Always returns 0.
  */
 int
 dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n)
 {
 	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
 	uio_t *uio = &xuio->xu_uio;
 	struct iovec *slot;
 	int idx;
 
 	/* Claim the next slot. */
 	idx = priv->next++;
 
 	ASSERT(idx < priv->cnt);
 	ASSERT(off + n <= arc_buf_size(abuf));
 
 	slot = uio->uio_iov + idx;
 	slot->iov_base = (char *)abuf->b_data + off;
 	slot->iov_len = n;
 	priv->bufs[idx] = abuf;
 
 	return (0);
 }
 
 /*
  * Return the slot count recorded in the xuio's private state.
  */
 int
 dmu_xuio_cnt(xuio_t *xuio)
 {
 	dmu_xuio_t *state = XUIO_XUZC_PRIV(xuio);
 	int nslots = state->cnt;
 
 	return (nslots);
 }
 
 /*
  * Return the arc buffer stored in slot `i' of the xuio's private state.
  * `i' must be below the slot count.
  */
 arc_buf_t *
 dmu_xuio_arcbuf(xuio_t *xuio, int i)
 {
 	dmu_xuio_t *state = XUIO_XUZC_PRIV(xuio);
 	arc_buf_t *abuf;
 
 	ASSERT(i < state->cnt);
 	abuf = state->bufs[i];
 	return (abuf);
 }
 
 /*
  * Forget the arc buffer stored in slot `i' of the xuio's private state by
  * setting the slot to NULL.  `i' must be below the slot count.
  */
 void
 dmu_xuio_clear(xuio_t *xuio, int i)
 {
 	dmu_xuio_t *state = XUIO_XUZC_PRIV(xuio);
 	arc_buf_t **slot;
 
 	ASSERT(i < state->cnt);
 	slot = &state->bufs[i];
 	*slot = NULL;
 }
 
 /*
  * Create and install the "xuio_stats" kstat backed by xuio_stats.  If the
  * kstat cannot be created, xuio_ksp stays NULL and nothing is installed.
  */
 static void
 xuio_stat_init(void)
 {
 	xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc",
 	    KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t),
 	    KSTAT_FLAG_VIRTUAL);
 	if (xuio_ksp == NULL)
 		return;
 
 	xuio_ksp->ks_data = &xuio_stats;
 	kstat_install(xuio_ksp);
 }
 
 /*
  * Remove the "xuio_stats" kstat if xuio_stat_init() managed to create it.
  */
 static void
 xuio_stat_fini(void)
 {
 	if (xuio_ksp == NULL)
 		return;
 
 	kstat_delete(xuio_ksp);
 	xuio_ksp = NULL;
 }
 
 /*
  * Count one write buffer whose contents had to be copied.
  */
 void
 xuio_stat_wbuf_copied(void)
 {
 	XUIOSTAT_BUMP(xuiostat_wbuf_copied);
 }
 
 /*
  * Count one write buffer that was handed over without a copy.
  */
 void
 xuio_stat_wbuf_nocopy(void)
 {
 	XUIOSTAT_BUMP(xuiostat_wbuf_nocopy);
 }
 
 #ifdef _KERNEL
 /*
  * Read up to size bytes of the given object, starting at the uio's
  * current offset, into the uio.  For an extended (UIO_XUIO) uio the
  * backing arc buffers are loaned to the caller instead of being copied.
  * Returns 0 or the first error encountered.
  */
 int
 dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
 {
 	xuio_t *xuio = NULL;
 	dmu_buf_t **bufv;
 	int nbufs, idx, err;
 
 	/*
 	 * Hold the whole range up front rather than one block at a
 	 * time, so that the reads can proceed in parallel.
 	 */
 	err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, TRUE, FTAG,
 	    &nbufs, &bufv);
 	if (err != 0)
 		return (err);
 
 #ifdef UIO_XUIO
 	if (uio->uio_extflg == UIO_XUIO)
 		xuio = (xuio_t *)uio;
 #endif
 
 	for (idx = 0; idx < nbufs; idx++) {
 		dmu_buf_t *db = bufv[idx];
 		int bufoff, tocpy;
 
 		ASSERT(size > 0);
 
 		/* Portion of this block that falls inside the request. */
 		bufoff = uio->uio_loffset - db->db_offset;
 		tocpy = (int)MIN(db->db_size - bufoff, size);
 
 		if (xuio == NULL) {
 			err = uiomove((char *)db->db_data + bufoff, tocpy,
 			    UIO_READ, uio);
 		} else {
 			dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
 			arc_buf_t *dbuf_abuf = dbi->db_buf;
 			arc_buf_t *abuf = dbuf_loan_arcbuf(dbi);
 
 			err = dmu_xuio_add(xuio, abuf, bufoff, tocpy);
 			if (err == 0) {
 				/* No uiomove() here, so advance by hand. */
 				uio->uio_loffset += tocpy;
 				uio->uio_resid -= tocpy;
 			}
 
 			/* A different buffer means the loan made a copy. */
 			if (abuf != dbuf_abuf) {
 				XUIOSTAT_BUMP(xuiostat_rbuf_copied);
 			} else {
 				XUIOSTAT_BUMP(xuiostat_rbuf_nocopy);
 			}
 		}
 		if (err != 0)
 			break;
 
 		size -= tocpy;
 	}
 	dmu_buf_rele_array(bufv, nbufs, FTAG);
 
 	return (err);
 }
 
 /*
  * Copy size bytes from the uio into the object backed by dn, starting
  * at the uio's current offset, as part of transaction tx.  Shared
  * implementation of dmu_write_uio() and dmu_write_uio_dbuf().
  * Returns 0, or the error from holding the buffers or from uiomove().
  */
 static int
 dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx)
 {
 	dmu_buf_t **dbp;
 	int numbufs;
 	int err = 0;
 	int i;
 
 	err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
 	    FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH);
 	if (err)
 		return (err);
 
 	for (i = 0; i < numbufs; i++) {
 		int tocpy;
 		int bufoff;
 		dmu_buf_t *db = dbp[i];
 
 		ASSERT(size > 0);
 
 		/* Portion of this block covered by the write. */
 		bufoff = uio->uio_loffset - db->db_offset;
 		tocpy = (int)MIN(db->db_size - bufoff, size);
 
 		/* Only the first and last blocks may be partially written. */
 		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
 
 		/*
 		 * A block that is overwritten completely is marked with
 		 * will_fill; a partial overwrite is marked with will_dirty.
 		 */
 		if (tocpy == db->db_size)
 			dmu_buf_will_fill(db, tx);
 		else
 			dmu_buf_will_dirty(db, tx);
 
 		/*
 		 * XXX uiomove could block forever (eg. nfs-backed
 		 * pages).  There needs to be a uiolockdown() function
 		 * to lock the pages in memory, so that uiomove won't
 		 * block.
 		 */
 		err = uiomove((char *)db->db_data + bufoff, tocpy,
 		    UIO_WRITE, uio);
 
 		/* Pair every will_fill with fill_done, even on error. */
 		if (tocpy == db->db_size)
 			dmu_buf_fill_done(db, tx);
 
 		if (err)
 			break;
 
 		size -= tocpy;
 	}
 
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 	return (err);
 }
 
 /*
  * Write size bytes from the uio into the object that the held dbuf zdb
  * belongs to.  A zero-length write succeeds without doing anything.
  */
 int
 dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size,
     dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
 	int err = 0;
 
 	if (size != 0) {
 		dnode_t *dn;
 
 		/* The dnode is only referenced inside ENTER/EXIT. */
 		DB_DNODE_ENTER(db);
 		dn = DB_DNODE(db);
 		err = dmu_write_uio_dnode(dn, uio, size, tx);
 		DB_DNODE_EXIT(db);
 	}
 
 	return (err);
 }
 
 /*
  * Write size bytes from the uio into the given object of objset os.
  * A zero-length write succeeds without doing anything.  Returns 0, the
  * dnode_hold() error, or the error from the copy.
  */
 int
 dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size,
     dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	int err;
 
 	if (size == 0)
 		return (0);
 
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err == 0) {
 		err = dmu_write_uio_dnode(dn, uio, size, tx);
 		dnode_rele(dn, FTAG);
 	}
 
 	return (err);
 }
 
 #ifdef sun
 /*
  * Copy size bytes from the page list starting at pp into the object,
  * beginning at byte offset `offset', as part of transaction tx.  The
  * page list is walked via p_next, one page per PAGESIZE bytes copied.
  * A zero-length write succeeds without doing anything.
  */
 int
 dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
     page_t *pp, dmu_tx_t *tx)
 {
 	dmu_buf_t **dbp;
 	int numbufs, i;
 	int err;
 
 	if (size == 0)
 		return (0);
 
 	err = dmu_buf_hold_array(os, object, offset, size,
 	    FALSE, FTAG, &numbufs, &dbp);
 	if (err)
 		return (err);
 
 	for (i = 0; i < numbufs; i++) {
 		int tocpy, copied, thiscpy;
 		int bufoff;
 		dmu_buf_t *db = dbp[i];
 		caddr_t va;
 
 		ASSERT(size > 0);
 		ASSERT3U(db->db_size, >=, PAGESIZE);
 
 		/* Portion of this block covered by the write. */
 		bufoff = offset - db->db_offset;
 		tocpy = (int)MIN(db->db_size - bufoff, size);
 
 		/* Only the first and last blocks may be partially written. */
 		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
 
 		if (tocpy == db->db_size)
 			dmu_buf_will_fill(db, tx);
 		else
 			dmu_buf_will_dirty(db, tx);
 
 		/* Map each page in turn and copy it into the block. */
 		for (copied = 0; copied < tocpy; copied += PAGESIZE) {
 			ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff);
 			thiscpy = MIN(PAGESIZE, tocpy - copied);
 			va = zfs_map_page(pp, S_READ);
 			bcopy(va, (char *)db->db_data + bufoff, thiscpy);
 			zfs_unmap_page(pp, va);
 			pp = pp->p_next;
 			bufoff += PAGESIZE;
 		}
 
 		if (tocpy == db->db_size)
 			dmu_buf_fill_done(db, tx);
 
 		offset += tocpy;
 		size -= tocpy;
 	}
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 	/* err is still 0 here: nothing in the loop assigns it. */
 	return (err);
 }
 #endif	/* sun */
 #endif
 
 /*
  * Hand out a loaned arc buffer of the requested size, allocated from the
  * pool that the given dbuf handle belongs to.
  */
 arc_buf_t *
 dmu_request_arcbuf(dmu_buf_t *handle, int size)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
 	arc_buf_t *loaned;
 	spa_t *spa;
 
 	DB_GET_SPA(&spa, db);
 	loaned = arc_loan_buf(spa, size);
 
 	return (loaned);
 }
 
 /*
  * Free a loaned arc buffer.
  *
  * The loan is first returned to the arc under this function's tag, and
  * that reference is then dropped; the VERIFY insists that
  * arc_buf_remove_ref() reports 1 for it.
  */
 void
 dmu_return_arcbuf(arc_buf_t *buf)
 {
 	arc_return_buf(buf, FTAG);
 	VERIFY(arc_buf_remove_ref(buf, FTAG) == 1);
 }
 
 /*
  * When possible directly assign passed loaned arc buffer to a dbuf.
  * If this is not possible copy the contents of passed arc buf via
  * dmu_write().
  *
  * handle is any held dbuf of the target object; offset is the byte
  * offset within that object at which buf's contents belong.  In both
  * cases the caller no longer owns buf on return: it is either attached
  * to the dbuf or given back with dmu_return_arcbuf().
  */
 void
 dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
     dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle;
 	dnode_t *dn;
 	dmu_buf_impl_t *db;
 	uint32_t blksz = (uint32_t)arc_buf_size(buf);
 	uint64_t blkid;
 
 	/* Find and hold the dbuf that covers the target offset. */
 	DB_DNODE_ENTER(dbuf);
 	dn = DB_DNODE(dbuf);
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	blkid = dbuf_whichblock(dn, offset);
 	VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL);
 	rw_exit(&dn->dn_struct_rwlock);
 	DB_DNODE_EXIT(dbuf);
 
 	/*
 	 * Direct assignment is only possible when the buffer covers the
 	 * block exactly: same starting offset and same size.
 	 */
 	if (offset == db->db.db_offset && blksz == db->db.db_size) {
 		dbuf_assign_arcbuf(db, buf, tx);
 		dbuf_rele(db, FTAG);
 	} else {
 		objset_t *os;
 		uint64_t object;
 
 		/* Capture what dmu_write() needs while the dnode is pinned. */
 		DB_DNODE_ENTER(dbuf);
 		dn = DB_DNODE(dbuf);
 		os = dn->dn_objset;
 		object = dn->dn_object;
 		DB_DNODE_EXIT(dbuf);
 
 		dbuf_rele(db, FTAG);
 		dmu_write(os, object, offset, blksz, buf->b_data, tx);
 		dmu_return_arcbuf(buf);
 		XUIOSTAT_BUMP(xuiostat_wbuf_copied);
 	}
 }
 
 /*
  * Per-write context passed to the dmu_sync() zio callbacks.  It is
  * allocated by dmu_sync() or dmu_sync_late_arrival() and freed by the
  * corresponding done callback.
  */
 typedef struct {
 	/* dirty record being synced; NULL on the late-arrival path */
 	dbuf_dirty_record_t	*dsa_dr;
 	/* caller's completion callback, invoked with dsa_zgd and the error */
 	dmu_sync_cb_t		*dsa_done;
 	/* caller's zgd, handed back to dsa_done */
 	zgd_t			*dsa_zgd;
 	/* tx committed on late-arrival completion; NULL in dmu_sync() */
 	dmu_tx_t		*dsa_tx;
 } dmu_sync_arg_t;
 
 /*
  * "Ready" callback for a dmu_sync() write.  On success, fix up the block
  * pointer: a hole keeps its logical size (a block of zeros may compress
  * to a hole, but replay still needs to know the block size), and any
  * other block gets a fill count of 1.
  */
 /* ARGSUSED */
 static void
 dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
 {
 	dmu_sync_arg_t *dsa = varg;
 	dmu_buf_t *db = dsa->dsa_zgd->zgd_db;
 	blkptr_t *bp = zio->io_bp;
 
 	if (zio->io_error != 0)
 		return;
 
 	if (!BP_IS_HOLE(bp)) {
 		ASSERT(BP_GET_LEVEL(bp) == 0);
 		bp->blk_fill = 1;
 		return;
 	}
 
 	BP_SET_LSIZE(bp, db->db_size);
 }
 
 /*
  * "Ready" callback for the late-arrival write: adapt the plain zio
  * callback signature to dmu_sync_ready(), which takes the private
  * argument explicitly and no arc buffer.
  */
 static void
 dmu_sync_late_arrival_ready(zio_t *zio)
 {
 	void *varg = zio->io_private;
 
 	dmu_sync_ready(zio, NULL, varg);
 }
 
 /*
  * "Done" callback for a dmu_sync() write.  Under db_mtx, move the dirty
  * record out of DR_IN_DMU_SYNC: to DR_OVERRIDDEN (recording the written
  * block pointer and copies) on success, or back to DR_NOT_OVERRIDDEN on
  * failure.  Then wake waiters on db_changed, call the caller's done
  * callback with the zio error, and free the argument block.
  */
 /* ARGSUSED */
 static void
 dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
 {
 	dmu_sync_arg_t *dsa = varg;
 	dbuf_dirty_record_t *dr = dsa->dsa_dr;
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 
 	mutex_enter(&db->db_mtx);
 	ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
 	if (zio->io_error == 0) {
 		dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE);
 		if (dr->dt.dl.dr_nopwrite) {
 			/* These locals are only consumed by the ASSERTs. */
 			blkptr_t *bp = zio->io_bp;
 			blkptr_t *bp_orig = &zio->io_bp_orig;
 			uint8_t chksum = BP_GET_CHECKSUM(bp_orig);
 
 			ASSERT(BP_EQUAL(bp, bp_orig));
 			ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF);
 			ASSERT(zio_checksum_table[chksum].ci_dedup);
 		}
 		dr->dt.dl.dr_overridden_by = *zio->io_bp;
 		dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
 		dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
 		/* A hole is recorded as an all-zero block pointer. */
 		if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by))
 			BP_ZERO(&dr->dt.dl.dr_overridden_by);
 	} else {
 		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
 	}
 	cv_broadcast(&db->db_changed);
 	mutex_exit(&db->db_mtx);
 
 	dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
 
 	kmem_free(dsa, sizeof (*dsa));
 }
 
 /*
  * "Done" callback for the late-arrival write.  Releases the freshly
  * allocated block where appropriate, commits the tx that was opened for
  * the write, notifies the caller and frees the argument block.
  */
 static void
 dmu_sync_late_arrival_done(zio_t *zio)
 {
 	dmu_sync_arg_t *dsa = zio->io_private;
 	blkptr_t *bp_orig = &zio->io_bp_orig;
 	blkptr_t *bp = zio->io_bp;
 
 	if (zio->io_error == 0 && !BP_IS_HOLE(bp)) {
 		/*
 		 * A nopwrite (ZIO_FLAG_NOPWRITE) allocated no new block,
 		 * so there is nothing to undo.  In every other case the
 		 * block allocated in this txg is freed again.
 		 */
 		if (!(zio->io_flags & ZIO_FLAG_NOPWRITE)) {
 			ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
 			ASSERT(zio->io_bp->blk_birth == zio->io_txg);
 			ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
 			zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
 		} else {
 			ASSERT(BP_EQUAL(bp, bp_orig));
 		}
 	}
 
 	dmu_tx_commit(dsa->dsa_tx);
 
 	dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
 
 	kmem_free(dsa, sizeof (*dsa));
 }
 
 /*
  * Issue a dmu_sync() write under a newly assigned tx, for callers whose
  * txg can no longer use the dirty-record path.  Returns EIO if the tx
  * cannot be assigned, otherwise 0 once the write has been issued as a
  * child of pio; completion is reported through `done'.
  */
 static int
 dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
     zio_prop_t *zp, zbookmark_t *zb)
 {
 	dmu_sync_arg_t *dsa;
 	dmu_tx_t *tx;
 	zio_t *zio;
 
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
 	if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
 		dmu_tx_abort(tx);
 		/* EIO makes zl_get_data fall back to txg_wait_synced(). */
 		return (EIO);
 	}
 
 	/* Freed by dmu_sync_late_arrival_done(). */
 	dsa = kmem_alloc(sizeof (*dsa), KM_SLEEP);
 	dsa->dsa_tx = tx;
 	dsa->dsa_zgd = zgd;
 	dsa->dsa_done = done;
 	dsa->dsa_dr = NULL;
 
 	zio = zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
 	    zgd->zgd_db->db_data, zgd->zgd_db->db_size, zp,
 	    dmu_sync_late_arrival_ready, dmu_sync_late_arrival_done, dsa,
 	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb);
 	zio_nowait(zio);
 
 	return (0);
 }
 
 /*
  * Intent log support: sync the block associated with db to disk.
  * N.B. and XXX: the caller is responsible for making sure that the
  * data isn't changing while dmu_sync() is writing it.
  *
  * Arguments:
  *
  *	pio:	parent zio for the write; must not be NULL.
  *	txg:	transaction group in which the block was dirtied; non-zero.
  *	done:	callback invoked with zgd and the zio error on completion.
  *	zgd:	supplies the dbuf (zgd_db) and the blkptr to fill (zgd_bp).
  *
  * Return values:
  *
  *	EEXIST: this txg has already been synced, so there's nothing to do.
  *		The caller should not log the write.
  *
  *	ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
  *		The caller should not log the write.
  *
  *	EALREADY: this block is already in the process of being synced.
  *		The caller should track its progress (somehow).
  *
  *	EIO: could not do the I/O.
  *		The caller should do a txg_wait_synced().
  *
  *	0: the I/O has been initiated.
  *		The caller should log this blkptr in the done callback.
  *		It is possible that the I/O will fail, in which case
  *		the error will be reported to the done callback and
  *		propagated to pio from zio_done().
  */
 int
 dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
 {
 	blkptr_t *bp = zgd->zgd_bp;
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db;
 	objset_t *os = db->db_objset;
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 	dbuf_dirty_record_t *dr;
 	dmu_sync_arg_t *dsa;
 	zbookmark_t zb;
 	zio_prop_t zp;
 	dnode_t *dn;
 
 	ASSERT(pio != NULL);
 	ASSERT(txg != 0);
 
 	SET_BOOKMARK(&zb, ds->ds_object,
 	    db->db.db_object, db->db_level, db->db_blkid);
 
 	/* Work out the write properties while the dnode is pinned. */
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp);
 	DB_DNODE_EXIT(db);
 
 	/*
 	 * If we're frozen (running ziltest), we always need to generate a bp.
 	 */
 	if (txg > spa_freeze_txg(os->os_spa))
 		return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
 
 	/*
 	 * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf()
 	 * and us.  If we determine that this txg is not yet syncing,
 	 * but it begins to sync a moment later, that's OK because the
 	 * sync thread will block in dbuf_sync_leaf() until we drop db_mtx.
 	 */
 	mutex_enter(&db->db_mtx);
 
 	if (txg <= spa_last_synced_txg(os->os_spa)) {
 		/*
 		 * This txg has already synced.  There's nothing to do.
 		 */
 		mutex_exit(&db->db_mtx);
 		return (EEXIST);
 	}
 
 	if (txg <= spa_syncing_txg(os->os_spa)) {
 		/*
 		 * This txg is currently syncing, so we can't mess with
 		 * the dirty record anymore; just write a new log block.
 		 */
 		mutex_exit(&db->db_mtx);
 		return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
 	}
 
 	/* Look for the dirty record that belongs to this txg. */
 	dr = db->db_last_dirty;
 	while (dr && dr->dr_txg != txg)
 		dr = dr->dr_next;
 
 	if (dr == NULL) {
 		/*
 		 * There's no dr for this dbuf, so it must have been freed.
 		 * There's no need to log writes to freed blocks, so we're done.
 		 */
 		mutex_exit(&db->db_mtx);
 		return (ENOENT);
 	}
 
 	ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg);
 
 	/*
 	 * Assume the on-disk data is X, the current syncing data is Y,
 	 * and the current in-memory data is Z (currently in dmu_sync).
 	 * X and Z are identical but Y has been modified. Normally,
 	 * when X and Z are the same we will perform a nopwrite but if Y
 	 * is different we must disable nopwrite since the resulting write
 	 * of Y to disk can free the block containing X. If we allowed a
 	 * nopwrite to occur the block pointing to Z would reference a freed
 	 * block. Since this is a rare case we simplify this by disabling
 	 * nopwrite if the current dmu_sync-ing dbuf has been modified in
 	 * a previous transaction.
 	 */
 	if (dr->dr_next)
 		zp.zp_nopwrite = B_FALSE;
 
 	ASSERT(dr->dr_txg == txg);
 	if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
 	    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
 		/*
 		 * We have already issued a sync write for this buffer,
 		 * or this buffer has already been synced.  It could not
 		 * have been dirtied since, or we would have cleared the state.
 		 */
 		mutex_exit(&db->db_mtx);
 		return (EALREADY);
 	}
 
 	/* Claim the record before dropping db_mtx. */
 	ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
 	dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
 	mutex_exit(&db->db_mtx);
 
 	/* Freed by dmu_sync_done(). */
 	dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
 	dsa->dsa_dr = dr;
 	dsa->dsa_done = done;
 	dsa->dsa_zgd = zgd;
 	dsa->dsa_tx = NULL;
 
 	zio_nowait(arc_write(pio, os->os_spa, txg,
 	    bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), &zp,
 	    dmu_sync_ready, dmu_sync_done, dsa,
 	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
 
 	return (0);
 }
 
 /*
  * Change the data block size (size) and indirect block shift (ibs) of an
  * object.  Returns the dnode_hold() error if the object cannot be held,
  * otherwise the result of dnode_set_blksz().
  */
 int
 dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
 	dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	int err;
 
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err == 0) {
 		err = dnode_set_blksz(dn, size, ibs, tx);
 		dnode_rele(dn, FTAG);
 	}
 
 	return (err);
 }
 
 /*
  * Set the checksum function of an object and mark its dnode dirty in tx.
  *
  * The function has no way to report failure, so a failed dnode_hold()
  * is treated as fatal instead of being silently ignored.
  */
 void
 dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
 	dmu_tx_t *tx)
 {
 	dnode_t *dn;
 
 	/*
 	 * Previously the return value was discarded and dn was used
 	 * regardless; stop here rather than dereference a pointer that
 	 * a failed hold may not have set.
 	 */
 	VERIFY(dnode_hold(os, object, FTAG, &dn) == 0);
 	ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS);
 	dn->dn_checksum = checksum;
 	dnode_setdirty(dn, tx);
 	dnode_rele(dn, FTAG);
 }
 
 /*
  * Set the compression function of an object and mark its dnode dirty
  * in tx.
  *
  * The function has no way to report failure, so a failed dnode_hold()
  * is treated as fatal instead of being silently ignored.
  */
 void
 dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
 	dmu_tx_t *tx)
 {
 	dnode_t *dn;
 
 	/*
 	 * Previously the return value was discarded and dn was used
 	 * regardless; stop here rather than dereference a pointer that
 	 * a failed hold may not have set.
 	 */
 	VERIFY(dnode_hold(os, object, FTAG, &dn) == 0);
 	ASSERT(compress < ZIO_COMPRESS_FUNCTIONS);
 	dn->dn_compress = compress;
 	dnode_setdirty(dn, tx);
 	dnode_rele(dn, FTAG);
 }
 
 /*
  * When non-zero, dmu_write_policy() selects ZIO_COMPRESS_EMPTY instead
  * of ZIO_COMPRESS_LZJB for metadata.  Exposed as the loader tunable and
  * read-write sysctl vfs.zfs.mdcomp_disable.
  */
 int zfs_mdcomp_disable = 0;
 TUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable);
 SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RW,
     &zfs_mdcomp_disable, 0, "Disable metadata compression");
 
 /*
  * Fill in *zp with the write properties (checksum, compression, type,
  * level, copies, dedup and nopwrite settings) to use for a block.
  *
  *	os:	objset supplying the dataset-wide defaults.
  *	dn:	dnode being written; NULL is treated as DMU_OT_OBJSET.
  *	level:	block level; anything above 0 is treated as metadata.
  *	wp:	WP_* flags (WP_SPILL, WP_NOFILL, WP_DMU_SYNC are tested).
  *	zp:	output.
  */
 void
 dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
 {
 	dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET;
 	boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) ||
 	    (wp & WP_SPILL));
 	enum zio_checksum checksum = os->os_checksum;
 	enum zio_compress compress = os->os_compress;
 	enum zio_checksum dedup_checksum = os->os_dedup_checksum;
 	boolean_t dedup = B_FALSE;
 	boolean_t nopwrite = B_FALSE;
 	boolean_t dedup_verify = os->os_dedup_verify;
 	int copies = os->os_copies;
 
 	/*
 	 * We maintain different write policies for each of the following
 	 * types of data:
 	 *	 1. metadata
 	 *	 2. preallocated blocks (i.e. level-0 blocks of a dump device)
 	 *	 3. all other level 0 blocks
 	 */
 	if (ismd) {
 		/*
 		 * XXX -- we should design a compression algorithm
 		 * that specializes in arrays of bps.
 		 */
 		compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY :
 		    ZIO_COMPRESS_LZJB;
 
 		/*
 		 * Metadata always gets checksummed.  If the data
 		 * checksum is multi-bit correctable, and it's not a
 		 * ZBT-style checksum, then it's suitable for metadata
 		 * as well.  Otherwise, the metadata checksum defaults
 		 * to fletcher4.
 		 */
 		if (zio_checksum_table[checksum].ci_correctable < 1 ||
 		    zio_checksum_table[checksum].ci_eck)
 			checksum = ZIO_CHECKSUM_FLETCHER_4;
 	} else if (wp & WP_NOFILL) {
 		ASSERT(level == 0);
 
 		/*
 		 * If we're writing preallocated blocks, we aren't actually
 		 * writing them so don't set any policy properties.  These
 		 * blocks are currently only used by an external subsystem
 		 * outside of zfs (i.e. dump) and not written by the zio
 		 * pipeline.
 		 */
 		compress = ZIO_COMPRESS_OFF;
 		checksum = ZIO_CHECKSUM_OFF;
 	} else {
 		/* Per-object settings are combined with the dataset's. */
 		compress = zio_compress_select(dn->dn_compress, compress);
 
 		checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ?
 		    zio_checksum_select(dn->dn_checksum, checksum) :
 		    dedup_checksum;
 
 		/*
 		 * Determine dedup setting.  If we are in dmu_sync(),
 		 * we won't actually dedup now because that's all
 		 * done in syncing context; but we do want to use the
 		 * dedup checksum.  If the checksum is not strong
 		 * enough to ensure unique signatures, force
 		 * dedup_verify.
 		 */
 		if (dedup_checksum != ZIO_CHECKSUM_OFF) {
 			dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE;
 			if (!zio_checksum_table[checksum].ci_dedup)
 				dedup_verify = B_TRUE;
 		}
 
 		/*
 		 * Enable nopwrite if we have a cryptographically secure
 		 * checksum that has no known collisions (i.e. SHA-256)
 		 * and compression is enabled.  We don't enable nopwrite if
 		 * dedup is enabled as the two features are mutually exclusive.
 		 */
 		nopwrite = (!dedup && zio_checksum_table[checksum].ci_dedup &&
 		    compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
 	}
 
 	zp->zp_checksum = checksum;
 	zp->zp_compress = compress;
 	zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
 	zp->zp_level = level;
 	/* Metadata gets one extra copy, capped at the pool's maximum. */
 	zp->zp_copies = MIN(copies + ismd, spa_max_replication(os->os_spa));
 	zp->zp_dedup = dedup;
 	zp->zp_dedup_verify = dedup && dedup_verify;
 	zp->zp_nopwrite = nopwrite;
 }
 
 /*
  * Advance *off to the next hole (hole != 0) or the next data (hole == 0)
  * in the object.  If the dnode is dirty in any open txg, wait for the
  * pool to sync first so that the on-disk block pointers are current.
  */
 int
 dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
 {
 	boolean_t dirty = B_FALSE;
 	dnode_t *dn;
 	int i, err;
 
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err != 0)
 		return (err);
 
 	/* Is the dnode on a dirty list for any txg? */
 	for (i = 0; i < TXG_SIZE; i++) {
 		if (list_link_active(&dn->dn_dirty_link[i])) {
 			dirty = B_TRUE;
 			break;
 		}
 	}
 
 	if (dirty) {
 		/*
 		 * Push pending changes out before walking the block
 		 * pointers; the hold is dropped across the wait and
 		 * re-taken afterwards.
 		 */
 		dnode_rele(dn, FTAG);
 		txg_wait_synced(dmu_objset_pool(os), 0);
 		err = dnode_hold(os, object, FTAG, &dn);
 		if (err != 0)
 			return (err);
 	}
 
 	err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0);
 	dnode_rele(dn, FTAG);
 
 	return (err);
 }
 
 /*
  * Fill in *doi with a snapshot of the dnode's properties, taken while
  * holding dn_struct_rwlock (as reader) and dn_mtx.
  */
 void
 dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
 {
 	dnode_phys_t *phys;
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	mutex_enter(&dn->dn_mtx);
 
 	phys = dn->dn_phys;
 
 	/* Type information. */
 	doi->doi_type = dn->dn_type;
 	doi->doi_bonus_type = dn->dn_bonustype;
 	doi->doi_bonus_size = dn->dn_bonuslen;
 
 	/* Block geometry. */
 	doi->doi_data_block_size = dn->dn_datablksz;
 	if (dn->dn_indblkshift)
 		doi->doi_metadata_block_size = 1ULL << dn->dn_indblkshift;
 	else
 		doi->doi_metadata_block_size = 0;
 	doi->doi_indirection = dn->dn_nlevels;
 
 	/* Write policy. */
 	doi->doi_checksum = dn->dn_checksum;
 	doi->doi_compress = dn->dn_compress;
 
 	/* Space usage, in 512-byte units rounded to nearest. */
 	doi->doi_physical_blocks_512 = (DN_USED_BYTES(phys) + 256) >> 9;
 	doi->doi_max_offset = (phys->dn_maxblkid + 1) * dn->dn_datablksz;
 
 	/* Sum the fill counts of the dnode's top-level block pointers. */
 	doi->doi_fill_count = 0;
 	for (int i = 0; i < phys->dn_nblkptr; i++)
 		doi->doi_fill_count += phys->dn_blkptr[i].blk_fill;
 
 	mutex_exit(&dn->dn_mtx);
 	rw_exit(&dn->dn_struct_rwlock);
 }
 
 /*
  * Look up a DMU object and, when doi is non-NULL, describe it in *doi.
  * With a NULL doi the call only reports whether the object can be held:
  * 0 if so, otherwise the dnode_hold() error.
  */
 int
 dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
 {
 	dnode_t *dn;
 	int err;
 
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err != 0)
 		return (err);
 
 	if (doi != NULL)
 		dmu_object_info_from_dnode(dn, doi);
 	dnode_rele(dn, FTAG);
 
 	return (0);
 }
 
 /*
  * Variant of dmu_object_info() for callers that already hold a dbuf of
  * the object: the dnode is reached through the dbuf, so no object
  * lookup is needed.
  */
 void
 dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	dnode_t *dn;
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	dmu_object_info_from_dnode(dn, doi);
 	DB_DNODE_EXIT(db);
 }
 
 /*
  * Cheapest variant, for callers that only want sizes (written with
  * zfs_getattr() in mind): report the object's data block size and the
  * number of 512-byte blocks it occupies.
  */
 void
 dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize,
     u_longlong_t *nblk512)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	dnode_t *dn;
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 
 	/* Round to the nearest block, then add 1 for the dnode itself. */
 	*nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >>
 	    SPA_MINBLOCKSHIFT) + 1;
 	*blksize = dn->dn_datablksz;
 
 	DB_DNODE_EXIT(db);
 }
 
 /*
  * Byte-swap, in place, every 64-bit word of the size-byte buffer vbuf.
  * size must be a multiple of 8.
  */
 void
 byteswap_uint64_array(void *vbuf, size_t size)
 {
 	uint64_t *buf = vbuf;
 	size_t count = size >> 3;
 	size_t i;	/* same type as count: no signed/unsigned compare */
 
 	ASSERT((size & 7) == 0);
 
 	for (i = 0; i < count; i++)
 		buf[i] = BSWAP_64(buf[i]);
 }
 
 /*
  * Byte-swap, in place, every 32-bit word of the size-byte buffer vbuf.
  * size must be a multiple of 4.
  */
 void
 byteswap_uint32_array(void *vbuf, size_t size)
 {
 	uint32_t *buf = vbuf;
 	size_t count = size >> 2;
 	size_t i;	/* same type as count: no signed/unsigned compare */
 
 	ASSERT((size & 3) == 0);
 
 	for (i = 0; i < count; i++)
 		buf[i] = BSWAP_32(buf[i]);
 }
 
 /*
  * Byte-swap, in place, every 16-bit word of the size-byte buffer vbuf.
  * size must be a multiple of 2.
  */
 void
 byteswap_uint16_array(void *vbuf, size_t size)
 {
 	uint16_t *buf = vbuf;
 	size_t count = size >> 1;
 	size_t i;	/* same type as count: no signed/unsigned compare */
 
 	ASSERT((size & 1) == 0);
 
 	for (i = 0; i < count; i++)
 		buf[i] = BSWAP_16(buf[i]);
 }
 
 /*
  * Byte arrays have no byte order, so this is intentionally a no-op; it
  * has the same signature as the wider byteswap_*_array() routines.
  */
 /* ARGSUSED */
 void
 byteswap_uint8_array(void *vbuf, size_t size)
 {
 }
 
 /*
  * Initialize the DMU and the subsystems it sits on.  dmu_fini() undoes
  * these steps in exactly the reverse order, so keep the two in sync.
  */
 void
 dmu_init(void)
 {
 	zfs_dbgmsg_init();
 	sa_cache_init();
 	xuio_stat_init();
 	dmu_objset_init();
 	dnode_init();
 	dbuf_init();
 	zfetch_init();
 	l2arc_init();
 	arc_init();
 }
 
 /*
  * Tear down everything set up by dmu_init(), in the reverse of its
  * initialization order.
  */
 void
 dmu_fini(void)
 {
 	arc_fini();
 	l2arc_fini();
 	zfetch_fini();
 	dbuf_fini();
 	dnode_fini();
 	dmu_objset_fini();
 	xuio_stat_fini();
 	sa_cache_fini();
 	zfs_dbgmsg_fini();
 }
Index: user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
===================================================================
--- user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c	(revision 247191)
+++ user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c	(revision 247192)
@@ -1,1786 +1,1787 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
 
 #include <sys/cred.h>
 #include <sys/zfs_context.h>
 #include <sys/dmu_objset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_synctask.h>
 #include <sys/dsl_deleg.h>
 #include <sys/dnode.h>
 #include <sys/dbuf.h>
 #include <sys/zvol.h>
 #include <sys/dmu_tx.h>
 #include <sys/zap.h>
 #include <sys/zil.h>
 #include <sys/dmu_impl.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/sa.h>
 #include <sys/zfs_onexit.h>
 
 /*
  * Needed to close a window in dnode_move() that allows the objset to be freed
  * before it can be safely accessed.
  *
  * Initialized by dmu_objset_init() and destroyed by dmu_objset_fini().
  */
 krwlock_t os_lock;
 
 /*
  * One-time objset layer setup: initialize the global os_lock.
  */
 void
 dmu_objset_init(void)
 {
 	rw_init(&os_lock, NULL, RW_DEFAULT, NULL);
 }
 
 /*
  * Objset layer teardown: destroy the global os_lock.
  */
 void
 dmu_objset_fini(void)
 {
 	rw_destroy(&os_lock);
 }
 
 /* Return the spa (os_spa) recorded in the objset. */
 spa_t *
 dmu_objset_spa(objset_t *os)
 {
 	return (os->os_spa);
 }
 
 /* Return the objset's intent log handle (os_zil). */
 zilog_t *
 dmu_objset_zil(objset_t *os)
 {
 	return (os->os_zil);
 }
 
 /*
  * Return the DSL pool for an objset.  It is taken from the dataset's
  * directory when both exist, and from the spa otherwise.
  */
 dsl_pool_t *
 dmu_objset_pool(objset_t *os)
 {
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 
 	if (ds == NULL || ds->ds_dir == NULL)
 		return (spa_get_dsl(os->os_spa));
 
 	return (ds->ds_dir->dd_pool);
 }
 
 /* Return the dataset backing the objset; may be NULL (see open_impl). */
 dsl_dataset_t *
 dmu_objset_ds(objset_t *os)
 {
 	return (os->os_dsl_dataset);
 }
 
 /* Return the objset type stored in its physical header (os_phys). */
 dmu_objset_type_t
 dmu_objset_type(objset_t *os)
 {
 	return (os->os_phys->os_type);
 }
 
 /*
  * Write the name of the objset's dataset into buf.
  * NOTE(review): buf is presumably expected to hold a maximum-length
  * dataset name; confirm against dsl_dataset_name().
  */
 void
 dmu_objset_name(objset_t *os, char *buf)
 {
 	dsl_dataset_name(os->os_dsl_dataset, buf);
 }
 
 /*
  * Return the object number of the objset's dataset, or 0 when the
  * objset has no dataset.
  */
 uint64_t
 dmu_objset_id(objset_t *os)
 {
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 
 	if (ds == NULL)
 		return (0);
 
 	return (ds->ds_object);
 }
 
 /* Return the cached value of the "sync" property (os_sync). */
 uint64_t
 dmu_objset_syncprop(objset_t *os)
 {
 	return (os->os_sync);
 }
 
 /* Return the cached value of the "logbias" property (os_logbias). */
 uint64_t
 dmu_objset_logbias(objset_t *os)
 {
 	return (os->os_logbias);
 }
 
 /*
  * Property callback for "checksum": cache the checksum function to use
  * for this objset, resolving the value against the "on" default.
  */
 static void
 checksum_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *os;
 
 	/* By this point the value must no longer be "inherit". */
 	ASSERT(newval != ZIO_CHECKSUM_INHERIT);
 
 	os = arg;
 	os->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE);
 }
 
 /*
  * Property callback for "compression": cache the compression function
  * to use for this objset, resolving the value against the "on" default.
  */
 static void
 compression_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *os;
 
 	/* Inheritance and range checks are expected to be done already. */
 	ASSERT(newval != ZIO_COMPRESS_INHERIT);
 
 	os = arg;
 	os->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE);
 }
 
 /*
  * Property callback for "copies": cache the number of copies to keep of
  * this objset's blocks.
  */
 static void
 copies_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *os = (objset_t *)arg;
 
 	/*
 	 * The value is expected to be validated already: at least one
 	 * copy, and no more than the pool can replicate.
 	 */
 	ASSERT(newval > 0);
 	ASSERT(newval <= spa_max_replication(os->os_spa));
 
 	os->os_copies = newval;
 }
 
 /*
  * Property callback for "dedup": resolve the property to a checksum and
  * split it into the checksum function proper and the verify flag.
  */
 static void
 dedup_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *os = (objset_t *)arg;
 	enum zio_checksum checksum;
 
 	/* By this point the value must no longer be "inherit". */
 	ASSERT(newval != ZIO_CHECKSUM_INHERIT);
 
 	checksum = zio_checksum_dedup_select(os->os_spa, newval,
 	    ZIO_CHECKSUM_OFF);
 
 	os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY);
 	os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK;
 }
 
 /*
  * Property callback for "primarycache": cache the new setting in the
  * objset.
  */
 static void
 primary_cache_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *os;
 
 	/* Only the three known cache modes are expected here. */
 	ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
 	    newval == ZFS_CACHE_METADATA);
 
 	os = arg;
 	os->os_primary_cache = newval;
 }
 
 /*
  * Property callback for "secondarycache": cache the new setting in the
  * objset.
  */
 static void
 secondary_cache_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *os;
 
 	/* Only the three known cache modes are expected here. */
 	ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
 	    newval == ZFS_CACHE_METADATA);
 
 	os = arg;
 	os->os_secondary_cache = newval;
 }
 
 /*
  * Property callback for "sync": cache the new setting and, if the
  * objset has an intent log, pass it on to the log as well.
  */
 static void
 sync_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *os = (objset_t *)arg;
 
 	/* Only the three known sync modes are expected here. */
 	ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS ||
 	    newval == ZFS_SYNC_DISABLED);
 
 	os->os_sync = newval;
 	if (os->os_zil == NULL)
 		return;
 
 	zil_set_sync(os->os_zil, newval);
 }
 
 /*
  * Property callback for "logbias": cache the new setting and, if the
  * objset has an intent log, pass it on to the log as well.
  */
 static void
 logbias_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *os = (objset_t *)arg;
 
 	ASSERT(newval == ZFS_LOGBIAS_LATENCY ||
 	    newval == ZFS_LOGBIAS_THROUGHPUT);
 
 	os->os_logbias = newval;
 	if (os->os_zil == NULL)
 		return;
 
 	zil_set_logbias(os->os_zil, newval);
 }
 
 /*
  * Byte-swap an on-disk objset header in place.  Two layouts exist: the
  * old, shorter one and the full objset_phys_t; only the latter carries
  * the userused/groupused dnodes.
  */
 void
 dmu_objset_byteswap(void *buf, size_t size)
 {
 	objset_phys_t *osp = buf;
 
 	ASSERT(size == OBJSET_OLD_PHYS_SIZE || size == sizeof (objset_phys_t));
 
 	/* Fields common to both layouts. */
 	dnode_byteswap(&osp->os_meta_dnode);
 	byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t));
 	osp->os_type = BSWAP_64(osp->os_type);
 	osp->os_flags = BSWAP_64(osp->os_flags);
 
 	if (size != sizeof (objset_phys_t))
 		return;
 
 	/* Fields present only in the full-size layout. */
 	dnode_byteswap(&osp->os_userused_dnode);
 	dnode_byteswap(&osp->os_groupused_dnode);
 }
 
 /*
  * Build the in-core objset for the block pointer bp and return it in
  * *osp.
  *
  *	spa:	pool the objset lives in.
  *	ds:	owning dataset, or NULL for the meta-objset.  When non-NULL
  *		the caller must hold ds->ds_opening_lock.
  *	bp:	root block pointer; a hole means a new, empty objset.
  *	osp:	output; only written on success.
  *
  * Returns 0 on success.  On failure the partially built objset is freed
  * and an error is returned (ECKSUM from the read is reported as EIO).
  */
 int
 dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
     objset_t **osp)
 {
 	objset_t *os;
 	int i, err;
 
 	ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock));
 
 	os = kmem_zalloc(sizeof (objset_t), KM_SLEEP);
 	os->os_dsl_dataset = ds;
 	os->os_spa = spa;
 	os->os_rootbp = bp;
 	if (!BP_IS_HOLE(os->os_rootbp)) {
 		/* Existing objset: read its header block through the arc. */
 		uint32_t aflags = ARC_WAIT;
 		zbookmark_t zb;
 		SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
 		    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 
 		if (DMU_OS_IS_L2CACHEABLE(os))
 			aflags |= ARC_L2CACHE;
 
 		dprintf_bp(os->os_rootbp, "reading %s", "");
 		err = arc_read(NULL, spa, os->os_rootbp,
 		    arc_getbuf_func, &os->os_phys_buf,
 		    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
 		if (err) {
 			kmem_free(os, sizeof (objset_t));
 			/* convert checksum errors into IO errors */
 			if (err == ECKSUM)
 				err = EIO;
 			return (err);
 		}
 
 		/* Increase the blocksize if we are permitted. */
 		if (spa_version(spa) >= SPA_VERSION_USERSPACE &&
 		    arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) {
 			/*
 			 * Copy the old, shorter header into a zeroed
 			 * full-size buffer and switch over to it.
 			 */
 			arc_buf_t *buf = arc_buf_alloc(spa,
 			    sizeof (objset_phys_t), &os->os_phys_buf,
 			    ARC_BUFC_METADATA);
 			bzero(buf->b_data, sizeof (objset_phys_t));
 			bcopy(os->os_phys_buf->b_data, buf->b_data,
 			    arc_buf_size(os->os_phys_buf));
 			(void) arc_buf_remove_ref(os->os_phys_buf,
 			    &os->os_phys_buf);
 			os->os_phys_buf = buf;
 		}
 
 		os->os_phys = os->os_phys_buf->b_data;
 		os->os_flags = os->os_phys->os_flags;
 	} else {
 		/* New objset: start from a zeroed header of the right size. */
 		int size = spa_version(spa) >= SPA_VERSION_USERSPACE ?
 		    sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE;
 		os->os_phys_buf = arc_buf_alloc(spa, size,
 		    &os->os_phys_buf, ARC_BUFC_METADATA);
 		os->os_phys = os->os_phys_buf->b_data;
 		bzero(os->os_phys, size);
 	}
 
 	/*
 	 * Note: the changed_cb will be called once before the register
 	 * func returns, thus changing the checksum/compression from the
 	 * default (fletcher2/off).  Snapshots don't need to know about
 	 * checksum/compression/copies.
 	 */
 	if (ds) {
 		/* Registration stops at the first error. */
 		err = dsl_prop_register(ds, "primarycache",
 		    primary_cache_changed_cb, os);
 		if (err == 0)
 			err = dsl_prop_register(ds, "secondarycache",
 			    secondary_cache_changed_cb, os);
 		if (!dsl_dataset_is_snapshot(ds)) {
 			if (err == 0)
 				err = dsl_prop_register(ds, "checksum",
 				    checksum_changed_cb, os);
 			if (err == 0)
 				err = dsl_prop_register(ds, "compression",
 				    compression_changed_cb, os);
 			if (err == 0)
 				err = dsl_prop_register(ds, "copies",
 				    copies_changed_cb, os);
 			if (err == 0)
 				err = dsl_prop_register(ds, "dedup",
 				    dedup_changed_cb, os);
 			if (err == 0)
 				err = dsl_prop_register(ds, "logbias",
 				    logbias_changed_cb, os);
 			if (err == 0)
 				err = dsl_prop_register(ds, "sync",
 				    sync_changed_cb, os);
 		}
 		if (err) {
 			/*
 			 * NOTE(review): callbacks registered before the
 			 * failing one are not unregistered here even though
 			 * os is freed -- confirm that is handled elsewhere.
 			 */
 			VERIFY(arc_buf_remove_ref(os->os_phys_buf,
 			    &os->os_phys_buf) == 1);
 			kmem_free(os, sizeof (objset_t));
 			return (err);
 		}
 	} else if (ds == NULL) {
 		/* It's the meta-objset. */
 		os->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
 		os->os_compress = ZIO_COMPRESS_LZJB;
 		os->os_copies = spa_max_replication(spa);
 		os->os_dedup_checksum = ZIO_CHECKSUM_OFF;
 		os->os_dedup_verify = 0;
 		os->os_logbias = 0;
 		os->os_sync = 0;
 		os->os_primary_cache = ZFS_CACHE_ALL;
 		os->os_secondary_cache = ZFS_CACHE_ALL;
 	}
 
 	/* Snapshots keep the zeroed in-core ZIL header from kmem_zalloc(). */
 	if (ds == NULL || !dsl_dataset_is_snapshot(ds))
 		os->os_zil_header = os->os_phys->os_zil_header;
 	os->os_zil = zil_alloc(os, &os->os_zil_header);
 
 	for (i = 0; i < TXG_SIZE; i++) {
 		list_create(&os->os_dirty_dnodes[i], sizeof (dnode_t),
 		    offsetof(dnode_t, dn_dirty_link[i]));
 		list_create(&os->os_free_dnodes[i], sizeof (dnode_t),
 		    offsetof(dnode_t, dn_dirty_link[i]));
 	}
 	list_create(&os->os_dnodes, sizeof (dnode_t),
 	    offsetof(dnode_t, dn_link));
 	list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t),
 	    offsetof(dmu_buf_impl_t, db_link));
 
 	mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	DMU_META_DNODE(os) = dnode_special_open(os,
 	    &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT,
 	    &os->os_meta_dnode);
 	/* The accounting dnodes exist only in a full-size header. */
 	if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) {
 		DMU_USERUSED_DNODE(os) = dnode_special_open(os,
 		    &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT,
 		    &os->os_userused_dnode);
 		DMU_GROUPUSED_DNODE(os) = dnode_special_open(os,
 		    &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT,
 		    &os->os_groupused_dnode);
 	}
 
 	/*
 	 * We should be the only thread trying to do this because we
 	 * have ds_opening_lock
 	 */
 	if (ds) {
 		mutex_enter(&ds->ds_lock);
 		ASSERT(ds->ds_objset == NULL);
 		ds->ds_objset = os;
 		mutex_exit(&ds->ds_lock);
 	}
 
 	*osp = os;
 	return (0);
 }
 
 /*
  * Return (via *osp) the objset attached to ds, opening it with
  * dmu_objset_open_impl() when ds->ds_objset has not been set yet.
  * ds_opening_lock is held across both the lookup and the open.
  * Returns 0, or the error from dmu_objset_open_impl().
  */
 int
 dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp)
 {
 	int error = 0;
 
 	mutex_enter(&ds->ds_opening_lock);
 	if ((*osp = ds->ds_objset) == NULL) {
 		error = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
 		    ds, dsl_dataset_get_blkptr(ds), osp);
 	}
 	mutex_exit(&ds->ds_opening_lock);
 
 	return (error);
 }
 
 /*
  * Hold the dataset called 'name' with 'tag' and hand back its objset in
  * *osp.  If the objset cannot be obtained, the dataset hold is dropped
  * again before the error is returned.  Called from zpl.
  */
 int
 dmu_objset_hold(const char *name, void *tag, objset_t **osp)
 {
 	dsl_dataset_t *dataset;
 	int error;
 
 	if ((error = dsl_dataset_hold(name, tag, &dataset)) != 0)
 		return (error);
 
 	if ((error = dmu_objset_from_ds(dataset, osp)) != 0)
 		dsl_dataset_rele(dataset, tag);
 
 	return (error);
 }
 
 /*
  * Own the dataset called 'name' with 'tag' and hand back its objset in
  * *osp.  Called from zpl.
  *
  * Fails with EINVAL when 'type' is not DMU_OST_ANY and does not match
  * the objset's on-disk type, and with EROFS when a writable open of a
  * snapshot is requested.  On every failure path the ownership taken
  * here is dropped again.
  */
 int
 dmu_objset_own(const char *name, dmu_objset_type_t type,
     boolean_t readonly, void *tag, objset_t **osp)
 {
 	dsl_dataset_t *dataset;
 	int error;
 
 	error = dsl_dataset_own(name, B_FALSE, tag, &dataset);
 	if (error != 0)
 		return (error);
 
 	error = dmu_objset_from_ds(dataset, osp);
 	if (error != 0) {
 		dsl_dataset_disown(dataset, tag);
 		return (error);
 	}
 
 	if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
 		dmu_objset_disown(*osp, tag);
 		return (EINVAL);
 	}
 
 	if (!readonly && dsl_dataset_is_snapshot(dataset)) {
 		dmu_objset_disown(*osp, tag);
 		return (EROFS);
 	}
 
 	return (0);
 }
 
 /*
  * Drop the 'tag' hold on the dataset backing this objset.
  */
 void
 dmu_objset_rele(objset_t *os, void *tag)
 {
 	dsl_dataset_t *dataset = os->os_dsl_dataset;
 
 	dsl_dataset_rele(dataset, tag);
 }
 
 /*
  * Give up the 'tag' ownership of the dataset backing this objset.
  */
 void
 dmu_objset_disown(objset_t *os, void *tag)
 {
 	dsl_dataset_t *dataset = os->os_dsl_dataset;
 
 	dsl_dataset_disown(dataset, tag);
 }
 
 /*
  * Evict the dbufs of every dnode on os->os_dnodes.
  *
  * os_lock is dropped around each dnode_evict_dbufs() call; a hold on
  * the next dnode to visit is taken before the lock is dropped.
  * Returns nonzero if, after the pass, the head of os_dnodes is
  * something other than the meta-dnode.
  */
 int
 dmu_objset_evict_dbufs(objset_t *os)
 {
 	dnode_t *dn;
 
 	mutex_enter(&os->os_lock);
 
 	/* process the mdn last, since the other dnodes have holds on it */
 	list_remove(&os->os_dnodes, DMU_META_DNODE(os));
 	list_insert_tail(&os->os_dnodes, DMU_META_DNODE(os));
 
 	/*
 	 * Find the first dnode with holds.  We have to do this dance
 	 * because dnode_add_ref() only works if you already have a
 	 * hold.  If there are no holds then it has no dbufs so OK to
 	 * skip.
 	 */
 	for (dn = list_head(&os->os_dnodes);
 	    dn && !dnode_add_ref(dn, FTAG);
 	    dn = list_next(&os->os_dnodes, dn))
 		continue;
 
 	while (dn) {
 		dnode_t *next_dn = dn;
 
 		/* Pick (and hold) the successor while os_lock is still held. */
 		do {
 			next_dn = list_next(&os->os_dnodes, next_dn);
 		} while (next_dn && !dnode_add_ref(next_dn, FTAG));
 
 		/* Evict without os_lock, then drop our FTAG hold on dn. */
 		mutex_exit(&os->os_lock);
 		dnode_evict_dbufs(dn);
 		dnode_rele(dn, FTAG);
 		mutex_enter(&os->os_lock);
 		dn = next_dn;
 	}
 	/* Sample the list head under the lock for the return value. */
 	dn = list_head(&os->os_dnodes);
 	mutex_exit(&os->os_lock);
 	return (dn != DMU_META_DNODE(os));
 }
 
 /*
  * Tear down an in-core objset and free it.
  *
  * Unregisters the property callbacks (when the objset has a dataset),
  * tears down SA state if present, evicts dbufs, closes the special
  * dnodes, frees the ZIL, drops the reference on os_phys_buf, destroys
  * the objset's mutexes and finally frees the objset_t itself.
  * The objset is asserted to have no dirty state in any txg slot.
  */
 void
 dmu_objset_evict(objset_t *os)
 {
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 
 	for (int t = 0; t < TXG_SIZE; t++)
 		ASSERT(!dmu_objset_is_dirty(os, t));
 
 	if (ds) {
 		/* These six are not registered for snapshots. */
 		if (!dsl_dataset_is_snapshot(ds)) {
 			VERIFY(0 == dsl_prop_unregister(ds, "checksum",
 			    checksum_changed_cb, os));
 			VERIFY(0 == dsl_prop_unregister(ds, "compression",
 			    compression_changed_cb, os));
 			VERIFY(0 == dsl_prop_unregister(ds, "copies",
 			    copies_changed_cb, os));
 			VERIFY(0 == dsl_prop_unregister(ds, "dedup",
 			    dedup_changed_cb, os));
 			VERIFY(0 == dsl_prop_unregister(ds, "logbias",
 			    logbias_changed_cb, os));
 			VERIFY(0 == dsl_prop_unregister(ds, "sync",
 			    sync_changed_cb, os));
 		}
 		VERIFY(0 == dsl_prop_unregister(ds, "primarycache",
 		    primary_cache_changed_cb, os));
 		VERIFY(0 == dsl_prop_unregister(ds, "secondarycache",
 		    secondary_cache_changed_cb, os));
 	}
 
 	if (os->os_sa)
 		sa_tear_down(os);
 
 	/*
 	 * We should need only a single pass over the dnode list, since
 	 * nothing can be added to the list at this point.
 	 */
 	(void) dmu_objset_evict_dbufs(os);
 
 	dnode_special_close(&os->os_meta_dnode);
 	/* The user/group accounting dnodes are opened (and closed) as a pair. */
 	if (DMU_USERUSED_DNODE(os)) {
 		dnode_special_close(&os->os_userused_dnode);
 		dnode_special_close(&os->os_groupused_dnode);
 	}
 	zil_free(os->os_zil);
 
 	ASSERT3P(list_head(&os->os_dnodes), ==, NULL);
 
 	VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf) == 1);
 
 	/*
 	 * This is a barrier to prevent the objset from going away in
 	 * dnode_move() until we can safely ensure that the objset is still in
 	 * use. We consider the objset valid before the barrier and invalid
 	 * after the barrier.
 	 */
 	rw_enter(&os_lock, RW_READER);
 	rw_exit(&os_lock);
 
 	mutex_destroy(&os->os_lock);
 	mutex_destroy(&os->os_obj_lock);
 	mutex_destroy(&os->os_user_ptr_lock);
 	kmem_free(os, sizeof (objset_t));
 }
 
 /*
  * Return dsl_dir_snap_cmtime() for the dsl_dir of this objset's dataset.
  */
 timestruc_t
 dmu_objset_snap_cmtime(objset_t *os)
 {
 	dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;
 
 	return (dsl_dir_snap_cmtime(dd));
 }
 
 /*
  * Initialize a new objset of the given type in syncing context and
  * return it.  Called from dsl for the meta-objset (ds == NULL), and
  * with a dataset for ordinary objsets.
  *
  * The objset is obtained via dmu_objset_from_ds() when ds is given,
  * otherwise opened directly from bp.  Its meta-dnode is allocated, the
  * on-disk type is recorded, and the dataset is marked dirty in tx.
  */
 objset_t *
 dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
     dmu_objset_type_t type, dmu_tx_t *tx)
 {
 	objset_t *os;
 	dnode_t *mdn;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	if (ds != NULL)
 		VERIFY(0 == dmu_objset_from_ds(ds, &os));
 	else
 		VERIFY(0 == dmu_objset_open_impl(spa, NULL, bp, &os));
 
 	mdn = DMU_META_DNODE(os);
 
 	dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT,
 	    DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx);
 
 	/*
 	 * We don't want to have to increase the meta-dnode's nlevels
 	 * later, because then we could do it in quiescing context while
 	 * we are also accessing it in open context.
 	 *
 	 * This precaution is not necessary for the MOS (ds == NULL),
 	 * because the MOS is only updated in syncing context.
 	 * This is most fortunate: the MOS is the only objset that
 	 * needs to be synced multiple times as spa_sync() iterates
 	 * to convergence, so minimizing its dn_nlevels matters.
 	 */
 	if (ds != NULL) {
 		int levels = 1;
 
 		/*
 		 * Determine the number of levels necessary for the meta-dnode
 		 * to contain DN_MAX_OBJECT dnodes.
 		 */
 		while ((uint64_t)mdn->dn_nblkptr << (mdn->dn_datablkshift +
 		    (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) <
 		    DN_MAX_OBJECT * sizeof (dnode_phys_t))
 			levels++;
 
 		mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] =
 		    mdn->dn_nlevels = levels;
 	}
 
 	ASSERT(type != DMU_OST_NONE);
 	ASSERT(type != DMU_OST_ANY);
 	ASSERT(type < DMU_OST_NUMTYPES);
 	os->os_phys->os_type = type;
 	/* A brand-new objset has nothing left to account for retroactively. */
 	if (dmu_objset_userused_enabled(os)) {
 		os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
 		os->os_flags = os->os_phys->os_flags;
 	}
 
 	/* NOTE(review): ds may be NULL here (MOS case); presumably the callee copes - confirm. */
 	dsl_dataset_dirty(ds, tx);
 
 	return (os);
 }
 
 /*
  * Arguments handed from dmu_objset_create() / dmu_objset_clone() to the
  * dmu_objset_create_check() and dmu_objset_create_sync() sync-task
  * callbacks.
  */
 struct oscarg {
 	/* optional callback run on the new objset by create_sync */
 	void (*userfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx);
 	void *userarg;			/* opaque argument for userfunc */
 	dsl_dataset_t *clone_origin;	/* snapshot to clone, or NULL */
 	const char *lastname;		/* last component of the new name */
 	dmu_objset_type_t type;		/* objset type; not set for clones */
 	uint64_t flags;			/* for dsl_dataset_create_sync() */
 	cred_t *cr;			/* credentials, taken from CRED() */
 };
 
 /*
  * Sync-task check function for objset creation and cloning.
  *
  * arg1 is the parent dsl_dir_t, arg2 the struct oscarg.  Returns EEXIST
  * when a child of that name is already present, the lookup error when
  * the lookup fails for any reason other than ENOENT, EXDEV for a clone
  * origin in another pool, EINVAL for a clone origin that is not a
  * snapshot, and 0 otherwise.
  */
 /*ARGSUSED*/
 static int
 dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dir_t *parent = arg1;
 	struct oscarg *oa = arg2;
 	dsl_dataset_t *origin = oa->clone_origin;
 	uint64_t existing;
 	int error;
 
 	error = zap_lookup(parent->dd_pool->dp_meta_objset,
 	    parent->dd_phys->dd_child_dir_zapobj,
 	    oa->lastname, sizeof (uint64_t), 1, &existing);
 	if (error == 0)
 		return (EEXIST);
 	if (error != ENOENT)
 		return (error);
 
 	/* Nothing further to validate for a plain create. */
 	if (origin == NULL)
 		return (0);
 
 	/* You can't clone across pools. */
 	if (origin->ds_dir->dd_pool != parent->dd_pool)
 		return (EXDEV);
 
 	/* You can only clone snapshots, not the head datasets. */
 	if (!dsl_dataset_is_snapshot(origin))
 		return (EINVAL);
 
 	return (0);
 }
 
 /*
  * Sync-task sync function for objset creation and cloning.
  *
  * arg1 is the parent dsl_dir_t, arg2 the struct oscarg.  Creates the
  * dataset; for a non-clone it also initializes the new objset and runs
  * the caller's userfunc (if any) on it.  The creation is recorded in
  * the pool history.
  */
 static void
 dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dir_t *dd = arg1;
 	spa_t *spa = dd->dd_pool->dp_spa;
 	struct oscarg *oa = arg2;
 	uint64_t obj;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	obj = dsl_dataset_create_sync(dd, oa->lastname,
 	    oa->clone_origin, oa->flags, oa->cr, tx);
 
 	/* Only a fresh (non-clone) dataset needs its objset initialized. */
 	if (oa->clone_origin == NULL) {
 		dsl_pool_t *dp = dd->dd_pool;
 		dsl_dataset_t *ds;
 		blkptr_t *bp;
 		objset_t *os;
 
 		VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
 		bp = dsl_dataset_get_blkptr(ds);
 		ASSERT(BP_IS_HOLE(bp));
 
 		os = dmu_objset_create_impl(spa, ds, bp, oa->type, tx);
 
 		if (oa->userfunc)
 			oa->userfunc(os, oa->userarg, oa->cr, tx);
 		dsl_dataset_rele(ds, FTAG);
 	}
 
 	spa_history_log_internal(LOG_DS_CREATE, spa, tx, "dataset = %llu", obj);
 }
 
 /*
  * Create a new objset called 'name' of the given type.
  *
  * 'func', when non-NULL, is invoked with 'arg' on the new objset from
  * the sync task.  Returns EEXIST when 'name' resolves completely to an
  * existing dsl_dir (no trailing component), the dsl_dir_open() error
  * when the parent cannot be opened, or the sync-task result.
  */
 int
 dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
     void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg)
 {
 	struct oscarg oa = { 0 };
 	dsl_dir_t *parent;
 	const char *lastname;
 	int error;
 
 	ASSERT(strchr(name, '@') == NULL);
 
 	error = dsl_dir_open(name, FTAG, &parent, &lastname);
 	if (error != 0)
 		return (error);
 
 	if (lastname != NULL) {
 		oa.userfunc = func;
 		oa.userarg = arg;
 		oa.lastname = lastname;
 		oa.type = type;
 		oa.flags = flags;
 		oa.cr = CRED();
 
 		error = dsl_sync_task_do(parent->dd_pool,
 		    dmu_objset_create_check, dmu_objset_create_sync,
 		    parent, &oa, 5);
 	} else {
 		/* The whole name already names an existing dsl_dir. */
 		error = EEXIST;
 	}
 
 	dsl_dir_close(parent, FTAG);
 	return (error);
 }
 
 /*
  * Create 'name' as a clone of 'clone_origin'.
  *
  * Returns EEXIST when 'name' resolves completely to an existing
  * dsl_dir, the dsl_dir_open() error when the parent cannot be opened,
  * or the sync-task result.
  */
 int
 dmu_objset_clone(const char *name, dsl_dataset_t *clone_origin, uint64_t flags)
 {
 	struct oscarg oa = { 0 };
 	dsl_dir_t *parent;
 	const char *lastname;
 	int error;
 
 	ASSERT(strchr(name, '@') == NULL);
 
 	error = dsl_dir_open(name, FTAG, &parent, &lastname);
 	if (error != 0)
 		return (error);
 
 	if (lastname != NULL) {
 		oa.lastname = lastname;
 		oa.clone_origin = clone_origin;
 		oa.flags = flags;
 		oa.cr = CRED();
 
 		error = dsl_sync_task_do(parent->dd_pool,
 		    dmu_objset_create_check, dmu_objset_create_sync,
 		    parent, &oa, 5);
 	} else {
 		/* The whole name already names an existing dsl_dir. */
 		error = EEXIST;
 	}
 
 	dsl_dir_close(parent, FTAG);
 	return (error);
 }
 
 /*
  * Take ownership of the dataset called 'name' and destroy it, passing
  * 'defer' through to dsl_dataset_destroy().  Returns the error from
  * whichever of the two steps failed, or 0.
  */
 int
 dmu_objset_destroy(const char *name, boolean_t defer)
 {
 	dsl_dataset_t *dataset;
 	int rc;
 
 	rc = dsl_dataset_own(name, B_TRUE, FTAG, &dataset);
 	if (rc != 0)
 		return (rc);
 
 	/* dsl_dataset_destroy() closes the ds. */
 	return (dsl_dataset_destroy(dataset, FTAG, defer));
 }
 
 /*
  * State shared between dmu_objset_snapshot(), dmu_objset_snapshot_one()
  * and the snapshot_check() / snapshot_sync() sync-task callbacks.
  */
 struct snaparg {
 	dsl_sync_task_group_t *dstg;	/* group the per-dataset tasks join */
 	char *snapname;			/* name of the snapshot to create */
 	char *htag;			/* hold tag (temporary snapshots) */
 	char failed[MAXPATHLEN];	/* dataset name reported on failure */
 	boolean_t recursive;		/* snapshot descendants as well */
 	boolean_t needsuspend;		/* suspend the ZIL before snapshotting */
 	boolean_t temporary;		/* hold, then defer-destroy the snapshot */
 	nvlist_t *props;		/* properties to set, or NULL */
 	struct dsl_ds_holdarg *ha;	/* only needed in the temporary case */
 	dsl_dataset_t *newds;		/* new snapshot (set in temporary case) */
 };
 
 /*
  * Sync-task check function for one snapshot.
  *
  * arg1 is the objset to snapshot, arg2 the shared struct snaparg.
  * Returns the dsl_dataset_snapshot_check() error, ENOTSUP when a
  * temporary snapshot is requested on a pool older than
  * SPA_VERSION_USERREFS, E2BIG when the hold tag is too long, else 0.
  * For temporary snapshots it also allocates and fills sn->ha.
  */
 static int
 snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	objset_t *os = arg1;
 	struct snaparg *sn = arg2;
 	int error;
 
 	/* The props have already been checked by zfs_check_userprops(). */
 
 	error = dsl_dataset_snapshot_check(os->os_dsl_dataset,
 	    sn->snapname, tx);
 	if (error)
 		return (error);
 
 	if (sn->temporary) {
 		/*
 		 * Ideally we would just call
 		 * dsl_dataset_user_hold_check() and
 		 * dsl_dataset_destroy_check() here.  However the
 		 * dataset we want to hold and destroy is the snapshot
 		 * that we just confirmed we can create, but it won't
 		 * exist until after these checks are run.  Do any
 		 * checks we can here and if more checks are added to
 		 * those routines in the future, similar checks may be
 		 * necessary here.
 		 */
 		if (spa_version(os->os_spa) < SPA_VERSION_USERREFS)
 			return (ENOTSUP);
 		/*
 		 * Not checking number of tags because the tag will be
 		 * unique, as it will be the only tag.
 		 */
 		if (strlen(sn->htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN)
 			return (E2BIG);
 
 		/*
 		 * NOTE(review): sn->ha is overwritten unconditionally; if this
 		 * check can run more than once for the same snaparg, the
 		 * earlier allocation looks like it would be lost - confirm.
 		 */
 		sn->ha = kmem_alloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
 		sn->ha->temphold = B_TRUE;
 		sn->ha->htag = sn->htag;
 	}
 	return (error);
 }
 
 /*
  * Sync-task sync function for one snapshot.
  *
  * arg1 is the objset to snapshot, arg2 the shared struct snaparg.
  * Creates the snapshot, applies sn->props to it when given, and for a
  * temporary snapshot places the user hold, frees sn->ha, records the
  * new snapshot in sn->newds and marks it for deferred destroy.
  */
 static void
 snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	objset_t *os = arg1;
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 	struct snaparg *sn = arg2;
 
 	dsl_dataset_snapshot_sync(ds, sn->snapname, tx);
 
 	/* From here on ds->ds_prev is used as the snapshot just created. */
 	if (sn->props) {
 		dsl_props_arg_t pa;
 		pa.pa_props = sn->props;
 		pa.pa_source = ZPROP_SRC_LOCAL;
 		dsl_props_set_sync(ds->ds_prev, &pa, tx);
 	}
 
 	if (sn->temporary) {
 		struct dsl_ds_destroyarg da;
 
 		dsl_dataset_user_hold_sync(ds->ds_prev, sn->ha, tx);
 		/* sn->ha was allocated by snapshot_check(). */
 		kmem_free(sn->ha, sizeof (struct dsl_ds_holdarg));
 		sn->ha = NULL;
 		sn->newds = ds->ds_prev;
 
 		da.ds = ds->ds_prev;
 		da.defer = B_TRUE;
 		dsl_dataset_destroy_sync(&da, FTAG, tx);
 	}
 }
 
 /*
  * Queue a snapshot sync task for the dataset called 'name'.
  *
  * 'arg' is the shared struct snaparg.  On success the objset stays held
  * (tag: the snaparg pointer) and, when sn->needsuspend is set, its ZIL
  * stays suspended; neither is undone in this function on success.
  * Returns 0 when the task was queued or the dataset was deliberately
  * skipped, otherwise an error.
  */
 static int
 dmu_objset_snapshot_one(const char *name, void *arg)
 {
 	struct snaparg *sn = arg;
 	objset_t *os;
 	int err;
 	char *cp;
 
 	/*
 	 * If the objset starts with a '%', then ignore it unless it was
 	 * explicitly named (ie, not recursive).  These hidden datasets
 	 * are always inconsistent, and by not opening them here, we can
 	 * avoid a race with dsl_dir_destroy_check().
 	 */
 	cp = strrchr(name, '/');
 	if (cp && cp[1] == '%' && sn->recursive)
 		return (0);
 
 	/* Remember the dataset being worked on for failure reporting. */
 	(void) strcpy(sn->failed, name);
 
 	/*
 	 * Check permissions if we are doing a recursive snapshot.  The
 	 * permission checks for the starting dataset have already been
 	 * performed in zfs_secpolicy_snapshot()
 	 */
 	if (sn->recursive && (err = zfs_secpolicy_snapshot_perms(name, CRED())))
 		return (err);
 
 	err = dmu_objset_hold(name, sn, &os);
 	if (err != 0)
 		return (err);
 
 	/*
 	 * If the objset is in an inconsistent state (eg, in the process
 	 * of being destroyed), don't snapshot it.  As with %hidden
 	 * datasets, we return EBUSY if this name was explicitly
 	 * requested (ie, not recursive), and otherwise ignore it.
 	 */
 	if (os->os_dsl_dataset->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) {
 		dmu_objset_rele(os, sn);
 		return (sn->recursive ? 0 : EBUSY);
 	}
 
 	if (sn->needsuspend) {
 		err = zil_suspend(dmu_objset_zil(os));
 		if (err) {
 			dmu_objset_rele(os, sn);
 			return (err);
 		}
 	}
 	dsl_sync_task_create(sn->dstg, snapshot_check, snapshot_sync,
 	    os, sn, 3);
 
 	return (0);
 }
 
 /*
  * Snapshot 'fsname' (and, when 'recursive', its descendants) as
  * '@snapname'.
  *
  * 'props' is applied to each new snapshot when non-NULL.  For a
  * 'temporary' snapshot, 'tag' is the hold tag and 'cleanup_fd' must be
  * a valid fd (EINVAL otherwise).  On failure the name of a failing
  * dataset is copied back into 'fsname'.  Returns 0 or an error.
  */
 int
 dmu_objset_snapshot(char *fsname, char *snapname, char *tag,
     nvlist_t *props, boolean_t recursive, boolean_t temporary, int cleanup_fd)
 {
 	dsl_sync_task_t *dst;
 	struct snaparg sn;
 	spa_t *spa;
 	minor_t minor;
 	int err;
 
 	(void) strcpy(sn.failed, fsname);
 
 	err = spa_open(fsname, &spa, FTAG);
 	if (err)
 		return (err);
 
 	/* 'minor' is only initialized (and only used) in the temporary case. */
 	if (temporary) {
 		if (cleanup_fd < 0) {
 			spa_close(spa, FTAG);
 			return (EINVAL);
 		}
 		if ((err = zfs_onexit_fd_hold(cleanup_fd, &minor)) != 0) {
 			spa_close(spa, FTAG);
 			return (err);
 		}
 	}
 
 	sn.dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
 	sn.snapname = snapname;
 	sn.htag = tag;
 	sn.props = props;
 	sn.recursive = recursive;
 	sn.needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
 	sn.temporary = temporary;
 	sn.ha = NULL;
 	sn.newds = NULL;
 
 	/* Queue one sync task per dataset to snapshot. */
 	if (recursive) {
 		err = dmu_objset_find(fsname,
 		    dmu_objset_snapshot_one, &sn, DS_FIND_CHILDREN);
 	} else {
 		err = dmu_objset_snapshot_one(fsname, &sn);
 	}
 
 	if (err == 0)
 		err = dsl_sync_task_group_wait(sn.dstg);
 
 	/*
 	 * Walk every queued task, whether or not the group ran, to undo
 	 * what dmu_objset_snapshot_one() left in place for it.
 	 */
 	for (dst = list_head(&sn.dstg->dstg_tasks); dst;
 	    dst = list_next(&sn.dstg->dstg_tasks, dst)) {
 		objset_t *os = dst->dst_arg1;
 		dsl_dataset_t *ds = os->os_dsl_dataset;
 		if (dst->dst_err) {
 			dsl_dataset_name(ds, sn.failed);
 		} else if (temporary) {
 			dsl_register_onexit_hold_cleanup(sn.newds, tag, minor);
 		}
 		if (sn.needsuspend)
 			zil_resume(dmu_objset_zil(os));
 #ifdef __FreeBSD__
 #ifdef _KERNEL
 		/* Build "<objset name>@<snapname>" and create zvol minors for it. */
 		if (dst->dst_err == 0 && dmu_objset_type(os) == DMU_OST_ZVOL) {
 			char name[MAXNAMELEN];
 
 			dmu_objset_name(os, name);
 			strlcat(name, "@", sizeof(name));
 			strlcat(name, snapname, sizeof(name));
 			zvol_create_minors(name);
 		}
 #endif
 #endif
 		dmu_objset_rele(os, &sn);
 	}
 
 	if (err)
 		(void) strcpy(fsname, sn.failed);
 	if (temporary)
 		zfs_onexit_fd_rele(cleanup_fd);
 	dsl_sync_task_group_destroy(sn.dstg);
 	spa_close(spa, FTAG);
 	return (err);
 }
 
 /*
  * Sync every dnode on 'list', emptying the list as it goes.
  *
  * When 'newlist' is non-NULL each dnode is additionally held (tag:
  * newlist) and appended to it before being synced.
  */
 static void
 dmu_objset_sync_dnodes(list_t *list, list_t *newlist, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 
 	while (dn = list_head(list)) {
 		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
 		ASSERT(dn->dn_dbuf->db_data_pending);
 		/*
 		 * Initialize dn_zio outside dnode_sync() because the
 		 * meta-dnode needs to set it outside dnode_sync().
 		 */
 		dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio;
 		ASSERT(dn->dn_zio);
 
 		ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS);
 		list_remove(list, dn);
 
 		if (newlist) {
 			(void) dnode_add_ref(dn, newlist);
 			list_insert_tail(newlist, dn);
 		}
 
 		dnode_sync(dn, tx);
 	}
 }
 
 /*
  * "Ready" callback for the objset root block write.  'arg' is the
  * objset.  Recomputes the root block pointer's fill count from the
  * meta-dnode's block pointers.
  */
 /* ARGSUSED */
 static void
 dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
 {
 	objset_t *os = arg;
 	blkptr_t *bp = zio->io_bp;
 	dnode_phys_t *mdnp = &os->os_phys->os_meta_dnode;
 	int slot;
 
 	ASSERT(bp == os->os_rootbp);
 	ASSERT(BP_GET_TYPE(bp) == DMU_OT_OBJSET);
 	ASSERT(BP_GET_LEVEL(bp) == 0);
 
 	/*
 	 * The root block's fill count is the number of objects allocated
 	 * in the object set.  The "special" objects stored directly in
 	 * the objset_phys_t (the meta dnode and the user/group accounting
 	 * objects) are not counted.
 	 */
 	bp->blk_fill = 0;
 	for (slot = 0; slot < mdnp->dn_nblkptr; slot++) {
 		bp->blk_fill += mdnp->dn_blkptr[slot].blk_fill;
 	}
 }
 
 /*
  * "Done" callback for the objset root block write.  'arg' is the
  * objset.  Unless the write was an in-place rewrite, the old root
  * block is killed and the new one is recorded as born in the syncing
  * tx saved in os_synctx.
  */
 /* ARGSUSED */
 static void
 dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg)
 {
 	objset_t *os = arg;
 	blkptr_t *bp = zio->io_bp;
 	blkptr_t *bp_orig = &zio->io_bp_orig;
 	dsl_dataset_t *dataset;
 	dmu_tx_t *synctx;
 
 	if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
 		/* A rewrite must leave the block pointer untouched. */
 		ASSERT(BP_EQUAL(bp, bp_orig));
 		return;
 	}
 
 	dataset = os->os_dsl_dataset;
 	synctx = os->os_synctx;
 
 	(void) dsl_dataset_block_kill(dataset, bp_orig, synctx, B_TRUE);
 	dsl_dataset_block_born(dataset, bp, synctx);
 }
 
 /*
  * Write out an objset for the syncing tx.  Called from dsl.
  *
  * Creates the root block write as a child of 'pio', syncs the special
  * dnodes and then the free and dirty dnode lists for this txg, issues
  * the meta-dnode's pending level-0 writes, syncs the ZIL and finally
  * issues the root block write itself.
  */
 void
 dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
 {
 	int txgoff;
 	zbookmark_t zb;
 	zio_prop_t zp;
 	zio_t *zio;
 	list_t *list;
 	list_t *newlist = NULL;
 	dbuf_dirty_record_t *dr;
 
 	dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg);
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	/* XXX the write_done callback should really give us the tx... */
 	os->os_synctx = tx;
 
 	if (os->os_dsl_dataset == NULL) {
 		/*
 		 * This is the MOS.  If we have upgraded,
 		 * spa_max_replication() could change, so reset
 		 * os_copies here.
 		 */
 		os->os_copies = spa_max_replication(os->os_spa);
 	}
 
 	/*
 	 * Create the root block IO
 	 */
 	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
 	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
 	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 	arc_release(os->os_phys_buf, &os->os_phys_buf);
 
 	dmu_write_policy(os, NULL, 0, 0, &zp);
 
 	zio = arc_write(pio, os->os_spa, tx->tx_txg,
 	    os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os), &zp,
 	    dmu_objset_write_ready, dmu_objset_write_done, os,
 	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
 
 	/*
 	 * Sync special dnodes - the parent IO for the sync is the root block
 	 */
 	DMU_META_DNODE(os)->dn_zio = zio;
 	dnode_sync(DMU_META_DNODE(os), tx);
 
 	/* Push the in-core flags into the on-disk objset header. */
 	os->os_phys->os_flags = os->os_flags;
 
 	if (DMU_USERUSED_DNODE(os) &&
 	    DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) {
 		DMU_USERUSED_DNODE(os)->dn_zio = zio;
 		dnode_sync(DMU_USERUSED_DNODE(os), tx);
 		DMU_GROUPUSED_DNODE(os)->dn_zio = zio;
 		dnode_sync(DMU_GROUPUSED_DNODE(os), tx);
 	}
 
 	txgoff = tx->tx_txg & TXG_MASK;
 
 	/* With accounting enabled, synced dnodes are collected on newlist. */
 	if (dmu_objset_userused_enabled(os)) {
 		newlist = &os->os_synced_dnodes;
 		/*
 		 * We must create the list here because it uses the
 		 * dn_dirty_link[] of this txg.
 		 */
 		list_create(newlist, sizeof (dnode_t),
 		    offsetof(dnode_t, dn_dirty_link[txgoff]));
 	}
 
 	dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], newlist, tx);
 	dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], newlist, tx);
 
 	/* Issue the meta-dnode's pending level-0 writes for this txg. */
 	list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
 	while (dr = list_head(list)) {
 		ASSERT(dr->dr_dbuf->db_level == 0);
 		list_remove(list, dr);
 		if (dr->dr_zio)
 			zio_nowait(dr->dr_zio);
 	}
 	/*
 	 * Free intent log blocks up to this tx.
 	 */
 	zil_sync(os->os_zil, tx);
 	os->os_phys->os_zil_header = os->os_zil_header;
 	zio_nowait(zio);
 }
 
 /*
  * Report whether the objset has any dnode on its dirty or free list
  * for the txg slot that 'txg' maps to.
  */
 boolean_t
 dmu_objset_is_dirty(objset_t *os, uint64_t txg)
 {
 	uint64_t slot = txg & TXG_MASK;
 
 	if (!list_is_empty(&os->os_dirty_dnodes[slot]))
 		return (B_TRUE);
 
 	return (!list_is_empty(&os->os_free_dnodes[slot]));
 }
 
 /*
  * Per-objset-type space accounting callbacks, indexed by
  * dmu_objset_type_t and filled in by dmu_objset_register_type().
  * A NULL entry means no callback is registered for that type.
  */
 static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES];
 
 /*
  * Install 'cb' as the space accounting callback for objsets of type
  * 'ost', replacing whatever was registered before.
  */
 void
 dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb)
 {
 	objset_used_cb_t **entry = &used_cbs[ost];
 
 	*entry = cb;
 }
 
 /*
  * User/group space accounting is enabled for an objset when the pool
  * version supports it, a callback is registered for the objset's
  * type, and the objset has a userused dnode.
  */
 boolean_t
 dmu_objset_userused_enabled(objset_t *os)
 {
 	if (spa_version(os->os_spa) < SPA_VERSION_USERSPACE)
 		return (B_FALSE);
 
 	if (used_cbs[os->os_phys->os_type] == NULL)
 		return (B_FALSE);
 
 	if (DMU_USERUSED_DNODE(os) == NULL)
 		return (B_FALSE);
 
 	return (B_TRUE);
 }
 
 /*
  * Charge (or, with 'subtract', refund) DNODE_SIZE + 'used' bytes to
  * 'user' and 'group' in the userused/groupused objects.  Nothing is
  * done unless 'flags' has DNODE_FLAG_USERUSED_ACCOUNTED set.
  */
 static void
 do_userquota_update(objset_t *os, uint64_t used, uint64_t flags,
     uint64_t user, uint64_t group, boolean_t subtract, dmu_tx_t *tx)
 {
 	int64_t delta;
 
 	if (!(flags & DNODE_FLAG_USERUSED_ACCOUNTED))
 		return;
 
 	delta = DNODE_SIZE + used;
 	if (subtract)
 		delta = -delta;
 
 	VERIFY3U(0, ==, zap_increment_int(os, DMU_USERUSED_OBJECT,
 	    user, delta, tx));
 	VERIFY3U(0, ==, zap_increment_int(os, DMU_GROUPUSED_OBJECT,
 	    group, delta, tx));
 }
 
 /*
  * Apply user/group space accounting for every dnode on
  * os->os_synced_dnodes, emptying the list.
  *
  * For each dnode the old charge is subtracted (if DN_ID_OLD_EXIST) and
  * the new one added (if DN_ID_NEW_EXIST); the dnode's "new" ids then
  * become its "old" ids, and the hold taken with the list as tag is
  * released.
  */
 void
 dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	list_t *list = &os->os_synced_dnodes;
 
 	ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os));
 
 	while (dn = list_head(list)) {
 		int flags;
 		ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object));
 		ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE ||
 		    dn->dn_phys->dn_flags &
 		    DNODE_FLAG_USERUSED_ACCOUNTED);
 
 		/* Allocate the user/groupused objects if necessary. */
 		if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
 			VERIFY(0 == zap_create_claim(os,
 			    DMU_USERUSED_OBJECT,
 			    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
 			VERIFY(0 == zap_create_claim(os,
 			    DMU_GROUPUSED_OBJECT,
 			    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
 		}
 
 		/*
 		 * We intentionally modify the zap object even if the
 		 * net delta is zero.  Otherwise
 		 * the block of the zap obj could be shared between
 		 * datasets but need to be different between them after
 		 * a bprewrite.
 		 */
 
 		flags = dn->dn_id_flags;
 		ASSERT(flags);
 		if (flags & DN_ID_OLD_EXIST)  {
 			do_userquota_update(os, dn->dn_oldused, dn->dn_oldflags,
 			    dn->dn_olduid, dn->dn_oldgid, B_TRUE, tx);
 		}
 		if (flags & DN_ID_NEW_EXIST) {
 			do_userquota_update(os, DN_USED_BYTES(dn->dn_phys),
 			    dn->dn_phys->dn_flags,  dn->dn_newuid,
 			    dn->dn_newgid, B_FALSE, tx);
 		}
 
 		/* Roll the "new" ids over to "old" for the next sync. */
 		mutex_enter(&dn->dn_mtx);
 		dn->dn_oldused = 0;
 		dn->dn_oldflags = 0;
 		if (dn->dn_id_flags & DN_ID_NEW_EXIST) {
 			dn->dn_olduid = dn->dn_newuid;
 			dn->dn_oldgid = dn->dn_newgid;
 			dn->dn_id_flags |= DN_ID_OLD_EXIST;
 			if (dn->dn_bonuslen == 0)
 				dn->dn_id_flags |= DN_ID_CHKED_SPILL;
 			else
 				dn->dn_id_flags |= DN_ID_CHKED_BONUS;
 		}
 		dn->dn_id_flags &= ~(DN_ID_NEW_EXIST);
 		mutex_exit(&dn->dn_mtx);
 
 		list_remove(list, dn);
 		/* Drops the hold taken with the list as tag. */
 		dnode_rele(dn, list);
 	}
 }
 
 /*
  * Returns a pointer to the data from which the uid/gid can be found.
  *
  * If the dbuf is not dirty its current data is returned.  Otherwise the
  * dirty record for the syncing transaction group is looked up; if none
  * can be found then NULL is returned.  In the NULL case it is assumed
  * the uid/gid aren't changing.
  */
 static void *
 dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
 	dbuf_dirty_record_t *dr, **drp;
 	void *data;
 
 	if (db->db_dirtycnt == 0)
 		return (db->db.db_data);  /* Nothing is changing */
 
 	/* Walk the dirty records looking for the one belonging to tx. */
 	for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
 		if (dr->dr_txg == tx->tx_txg)
 			break;
 
 	if (dr == NULL) {
 		data = NULL;
 	} else {
 		dnode_t *dn;
 
 		DB_DNODE_ENTER(dr->dr_dbuf);
 		dn = DB_DNODE(dr->dr_dbuf);
 
 		/*
 		 * For a spill block of a dnode without a bonus area the
 		 * record's data is reached through b_data; otherwise
 		 * dr_data itself is returned.
 		 */
 		if (dn->dn_bonuslen == 0 &&
 		    dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID)
 			data = dr->dt.dl.dr_data->b_data;
 		else
 			data = dr->dt.dl.dr_data;
 
 		DB_DNODE_EXIT(dr->dr_dbuf);
 	}
 
 	return (data);
 }
 
 /*
  * Determine the uid/gid of a dnode for space accounting and record
  * them in the dnode.
  *
  * With 'before' set the ids are stored in dn_olduid/dn_oldgid,
  * otherwise in dn_newuid/dn_newgid.  The ids are extracted by the
  * callback registered for the objset's type, from the bonus area or,
  * for a DMU_OT_SA dnode without a bonus area, from the spill block.
  * Does nothing when accounting is not enabled for the objset.
  */
 void
 dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
 {
 	objset_t *os = dn->dn_objset;
 	void *data = NULL;
 	dmu_buf_impl_t *db = NULL;
-	uint64_t *user, *group;
+	uint64_t *user = NULL;
+	uint64_t *group = NULL;
 	int flags = dn->dn_id_flags;
 	int error;
 	boolean_t have_spill = B_FALSE;
 
 	if (!dmu_objset_userused_enabled(dn->dn_objset))
 		return;
 
 	/* The "before" ids only need to be captured once. */
 	if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST|
 	    DN_ID_CHKED_SPILL)))
 		return;
 
 	if (before && dn->dn_bonuslen != 0)
 		data = DN_BONUS(dn->dn_phys);
 	else if (!before && dn->dn_bonuslen != 0) {
 		if (dn->dn_bonus) {
 			db = dn->dn_bonus;
 			mutex_enter(&db->db_mtx);
 			data = dmu_objset_userquota_find_data(db, tx);
 		} else {
 			data = DN_BONUS(dn->dn_phys);
 		}
 	} else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) {
 			int rf = 0;
 
 			if (RW_WRITE_HELD(&dn->dn_struct_rwlock))
 				rf |= DB_RF_HAVESTRUCT;
 			error = dmu_spill_hold_by_dnode(dn,
 			    rf | DB_RF_MUST_SUCCEED,
 			    FTAG, (dmu_buf_t **)&db);
 			ASSERT(error == 0);
 			mutex_enter(&db->db_mtx);
 			data = (before) ? db->db.db_data :
 			    dmu_objset_userquota_find_data(db, tx);
 			have_spill = B_TRUE;
 	} else {
 		/* Nothing to read ids from; just note the bonus was checked. */
 		mutex_enter(&dn->dn_mtx);
 		dn->dn_id_flags |= DN_ID_CHKED_BONUS;
 		mutex_exit(&dn->dn_mtx);
 		return;
 	}
 
 	/*
 	 * When !before and no data was found, user and group stay NULL
 	 * and are passed to the callback that way.
 	 */
 	if (before) {
 		ASSERT(data);
 		user = &dn->dn_olduid;
 		group = &dn->dn_oldgid;
 	} else if (data) {
 		user = &dn->dn_newuid;
 		group = &dn->dn_newgid;
 	}
 
 	/*
 	 * Must always call the callback in case the object
 	 * type has changed and that type isn't an object type to track
 	 */
 	error = used_cbs[os->os_phys->os_type](dn->dn_bonustype, data,
 	    user, group);
 
 	/*
 	 * Preserve existing uid/gid when the callback can't determine
 	 * what the new uid/gid are and the callback returned EEXIST.
 	 * The EEXIST error tells us to just use the existing uid/gid.
 	 * If we don't know what the old values are then just assign
 	 * them to 0, since that is a new file being created.
 	 */
 	if (!before && data == NULL && error == EEXIST) {
 		if (flags & DN_ID_OLD_EXIST) {
 			dn->dn_newuid = dn->dn_olduid;
 			dn->dn_newgid = dn->dn_oldgid;
 		} else {
 			dn->dn_newuid = 0;
 			dn->dn_newgid = 0;
 		}
 		error = 0;
 	}
 
 	if (db)
 		mutex_exit(&db->db_mtx);
 
 	/* Record which ids are now valid and which source was checked. */
 	mutex_enter(&dn->dn_mtx);
 	if (error == 0 && before)
 		dn->dn_id_flags |= DN_ID_OLD_EXIST;
 	if (error == 0 && !before)
 		dn->dn_id_flags |= DN_ID_NEW_EXIST;
 
 	if (have_spill) {
 		dn->dn_id_flags |= DN_ID_CHKED_SPILL;
 	} else {
 		dn->dn_id_flags |= DN_ID_CHKED_BONUS;
 	}
 	mutex_exit(&dn->dn_mtx);
 	/* Only the spill path took an FTAG hold on db. */
 	if (have_spill)
 		dmu_buf_rele((dmu_buf_t *)db, FTAG);
 }
 
 /*
  * Report whether the on-disk objset flags carry
  * OBJSET_FLAG_USERACCOUNTING_COMPLETE.
  */
 boolean_t
 dmu_objset_userspace_present(objset_t *os)
 {
 	objset_phys_t *phys = os->os_phys;
 
 	return (phys->os_flags & OBJSET_FLAG_USERACCOUNTING_COMPLETE);
 }
 
 /*
  * Bring user/group space accounting up to date for an existing objset.
  *
  * Returns 0 immediately when accounting is already complete, ENOTSUP
  * when accounting is not enabled for this objset, EINVAL for a
  * snapshot, and EINTR when interrupted by a signal.  Otherwise every
  * object is dirtied so that it is accounted when synced, the
  * completion flag is set in os_flags, and the function waits for the
  * pool to sync before returning 0.  Failures on individual objects are
  * skipped.
  */
 int
 dmu_objset_userspace_upgrade(objset_t *os)
 {
 	uint64_t obj;
 	int err = 0;
 
 	if (dmu_objset_userspace_present(os))
 		return (0);
 	if (!dmu_objset_userused_enabled(os))
 		return (ENOTSUP);
 	if (dmu_objset_is_snapshot(os))
 		return (EINVAL);
 
 	/*
 	 * We simply need to mark every object dirty, so that it will be
 	 * synced out and now accounted.  If this is called
 	 * concurrently, or if we already did some work before crashing,
 	 * that's fine, since we track each object's accounted state
 	 * independently.
 	 */
 
 	for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) {
 		dmu_tx_t *tx;
 		dmu_buf_t *db;
 		int objerr;
 
 		if (issig(JUSTLOOKING) && issig(FORREAL))
 			return (EINTR);
 
 		objerr = dmu_bonus_hold(os, obj, FTAG, &db);
 		if (objerr)
 			continue;
 		tx = dmu_tx_create(os);
 		dmu_tx_hold_bonus(tx, obj);
 		objerr = dmu_tx_assign(tx, TXG_WAIT);
 		if (objerr) {
 			/*
 			 * Drop the bonus buffer hold taken above before
 			 * moving on, otherwise it is never released.
 			 */
 			dmu_buf_rele(db, FTAG);
 			dmu_tx_abort(tx);
 			continue;
 		}
 		dmu_buf_will_dirty(db, tx);
 		dmu_buf_rele(db, FTAG);
 		dmu_tx_commit(tx);
 	}
 
 	os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
 	txg_wait_synced(dmu_objset_pool(os), 0);
 	return (0);
 }
 
 /*
  * Fetch space and object usage for the objset's dataset; all four
  * output pointers are passed straight to dsl_dataset_space().
  */
 void
 dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
     uint64_t *usedobjsp, uint64_t *availobjsp)
 {
 	dsl_dataset_t *dataset = os->os_dsl_dataset;
 
 	dsl_dataset_space(dataset, refdbytesp, availbytesp,
 	    usedobjsp, availobjsp);
 }
 
 /*
  * Return dsl_dataset_fsid_guid() for the objset's dataset.
  */
 uint64_t
 dmu_objset_fsid_guid(objset_t *os)
 {
 	dsl_dataset_t *dataset = os->os_dsl_dataset;
 
 	return (dsl_dataset_fsid_guid(dataset));
 }
 
 /*
  * Fill in 'stat': the type always comes from the on-disk objset
  * header, the rest from the dataset when the objset has one.
  */
 void
 dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat)
 {
 	dsl_dataset_t *dataset = os->os_dsl_dataset;
 
 	stat->dds_type = os->os_phys->os_type;
 
 	if (dataset != NULL)
 		dsl_dataset_fast_stat(dataset, stat);
 }
 
 /*
  * Add the objset's statistics to 'nv': the dataset's stats when there
  * is a dataset, followed by the objset type and whether user
  * accounting is present.
  */
 void
 dmu_objset_stats(objset_t *os, nvlist_t *nv)
 {
 	dsl_dataset_t *dataset = os->os_dsl_dataset;
 	uint64_t ostype;
 
 	ASSERT(os->os_dsl_dataset ||
 	    os->os_phys->os_type == DMU_OST_META);
 
 	if (dataset != NULL)
 		dsl_dataset_stats(dataset, nv);
 
 	ostype = os->os_phys->os_type;
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE, ostype);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING,
 	    dmu_objset_userspace_present(os));
 }
 
 /*
  * Report whether the objset's dataset is a snapshot.  An objset
  * without a dataset is never a snapshot.
  */
 int
 dmu_objset_is_snapshot(objset_t *os)
 {
 	if (os->os_dsl_dataset == NULL)
 		return (B_FALSE);
 
 	return (dsl_dataset_is_snapshot(os->os_dsl_dataset));
 }
 
 /*
  * Look up snapshot 'name' in the dataset's snapshot-names ZAP with
  * MT_FIRST matching, returning the stored name in 'real' (at most
  * 'maxlen' bytes) and the conflict indication in '*conflict'.
  * Returns ENOENT when the dataset has no snapshot-names ZAP, else the
  * zap_lookup_norm() result.
  */
 int
 dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen,
     boolean_t *conflict)
 {
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 	uint64_t snapzap = ds->ds_phys->ds_snapnames_zapobj;
 	uint64_t unused;
 	objset_t *mos;
 
 	if (snapzap == 0)
 		return (ENOENT);
 
 	mos = ds->ds_dir->dd_pool->dp_meta_objset;
 
 	return (zap_lookup_norm(mos, snapzap, name, 8, 1, &unused,
 	    MT_FIRST, real, maxlen, conflict));
 }
 
 /*
  * Return the snapshot at serialized cursor position *offp and advance
  * *offp past it.
  *
  * The name is copied into 'name' ('namelen' bytes available); the
  * entry's first integer goes to *idp and its normalization-conflict
  * flag to *case_conflict when those pointers are non-NULL.  Returns
  * ENOENT when there is no snapshot-names ZAP or no further entry, and
  * ENAMETOOLONG when the name does not fit.
  */
 int
 dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
     uint64_t *idp, uint64_t *offp, boolean_t *case_conflict)
 {
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 	uint64_t snapzap = ds->ds_phys->ds_snapnames_zapobj;
 	zap_cursor_t zc;
 	zap_attribute_t za;
 	int error = 0;
 
 	if (snapzap == 0)
 		return (ENOENT);
 
 	zap_cursor_init_serialized(&zc,
 	    ds->ds_dir->dd_pool->dp_meta_objset, snapzap, *offp);
 
 	if (zap_cursor_retrieve(&zc, &za) != 0) {
 		error = ENOENT;
 	} else if (strlen(za.za_name) + 1 > namelen) {
 		error = ENAMETOOLONG;
 	} else {
 		(void) strcpy(name, za.za_name);
 		if (idp != NULL)
 			*idp = za.za_first_integer;
 		if (case_conflict != NULL)
 			*case_conflict = za.za_normalization_conflict;
 		zap_cursor_advance(&zc);
 		*offp = zap_cursor_serialize(&zc);
 	}
 
 	/* The cursor is torn down on every path past its initialization. */
 	zap_cursor_fini(&zc);
 
 	return (error);
 }
 
 /*
  * Return the child directory at serialized cursor position *offp and
  * advance *offp past it.
  *
  * The name is copied into 'name' ('namelen' bytes available) and the
  * entry's first integer goes to *idp when idp is non-NULL.  Returns
  * ENOENT when the objset is not the head dataset of its dsl_dir or
  * when there is no further entry, and ENAMETOOLONG when the name does
  * not fit.
  */
 int
 dmu_dir_list_next(objset_t *os, int namelen, char *name,
     uint64_t *idp, uint64_t *offp)
 {
 	dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;
 	zap_cursor_t zc;
 	zap_attribute_t za;
 	int error = 0;
 
 	/* there is no next dir on a snapshot! */
 	if (os->os_dsl_dataset->ds_object !=
 	    dd->dd_phys->dd_head_dataset_obj)
 		return (ENOENT);
 
 	zap_cursor_init_serialized(&zc, dd->dd_pool->dp_meta_objset,
 	    dd->dd_phys->dd_child_dir_zapobj, *offp);
 
 	if (zap_cursor_retrieve(&zc, &za) != 0) {
 		error = ENOENT;
 	} else if (strlen(za.za_name) + 1 > namelen) {
 		error = ENAMETOOLONG;
 	} else {
 		(void) strcpy(name, za.za_name);
 		if (idp != NULL)
 			*idp = za.za_first_integer;
 		zap_cursor_advance(&zc);
 		*offp = zap_cursor_serialize(&zc);
 	}
 
 	/* The cursor is torn down on every path past its initialization. */
 	zap_cursor_fini(&zc);
 
 	return (error);
 }
 
 /*
  * Closure handed to findfunc() by dmu_objset_find(): the caller's
  * name-only callback together with its opaque argument.
  */
 struct findarg {
 	int (*func)(const char *, void *);	/* caller's callback */
 	void *arg;				/* passed through to func */
 };
 
 /*
  * Adapter with the dmu_objset_find_spa() callback signature: unpacks the
  * struct findarg in 'arg' and invokes the wrapped callback with only the
  * dataset name.  'spa' and 'dsobj' are ignored.  Returns whatever the
  * wrapped callback returns.
  */
 /* ARGSUSED */
 static int
 findfunc(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
 {
 	struct findarg *fa = arg;
 	return (fa->func(dsname, fa->arg));
 }
 
 /*
  * Find all objsets under name, and for each, call 'func(child_name, arg)'.
  * Perhaps change all callers to use dmu_objset_find_spa()?
  */
 int
 dmu_objset_find(const char *name, int func(const char *, void *), void *arg,
     int flags)
 {
 	struct findarg fa;
 	fa.func = func;
 	fa.arg = arg;
 	return (dmu_objset_find_spa(NULL, name, findfunc, &fa, flags));
 }
 
 /*
  * Find all objsets under name, call func on each
  *
  * Visiting order for a given directory: child directories first (only with
  * DS_FIND_CHILDREN, by recursing into this function), then the snapshots
  * of the head dataset (only with DS_FIND_SNAPSHOTS), and finally the head
  * dataset itself.  The walk stops at, and returns, the first non-zero
  * value produced by 'func' or by a lookup; otherwise the result of calling
  * 'func' on the dataset itself is returned.
  *
  * If 'name' is NULL the name of 'spa' is used.  Directories whose name
  * begins with '$' are skipped and reported as success.
  */
 int
 dmu_objset_find_spa(spa_t *spa, const char *name,
     int func(spa_t *, uint64_t, const char *, void *), void *arg, int flags)
 {
 	dsl_dir_t *dd;
 	dsl_pool_t *dp;
 	dsl_dataset_t *ds;
 	zap_cursor_t zc;
 	zap_attribute_t *attr;
 	char *child;
 	uint64_t thisobj;
 	int err;
 
 	if (name == NULL)
 		name = spa_name(spa);
 	err = dsl_dir_open_spa(spa, name, FTAG, &dd, NULL);
 	if (err)
 		return (err);
 
 	/* Don't visit hidden ($MOS & $ORIGIN) objsets. */
 	if (dd->dd_myname[0] == '$') {
 		dsl_dir_close(dd, FTAG);
 		return (0);
 	}
 
 	thisobj = dd->dd_phys->dd_head_dataset_obj;
 	/* The attribute buffer is allocated rather than placed on the stack. */
 	attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
 	dp = dd->dd_pool;
 
 	/*
 	 * Iterate over all children.
 	 */
 	if (flags & DS_FIND_CHILDREN) {
 		for (zap_cursor_init(&zc, dp->dp_meta_objset,
 		    dd->dd_phys->dd_child_dir_zapobj);
 		    zap_cursor_retrieve(&zc, attr) == 0;
 		    (void) zap_cursor_advance(&zc)) {
 			ASSERT(attr->za_integer_length == sizeof (uint64_t));
 			ASSERT(attr->za_num_integers == 1);
 
 			/* Recurse using the full "parent/child" name. */
 			child = kmem_asprintf("%s/%s", name, attr->za_name);
 			err = dmu_objset_find_spa(spa, child, func, arg, flags);
 			strfree(child);
 			if (err)
 				break;
 		}
 		zap_cursor_fini(&zc);
 
 		if (err) {
 			dsl_dir_close(dd, FTAG);
 			kmem_free(attr, sizeof (zap_attribute_t));
 			return (err);
 		}
 	}
 
 	/*
 	 * Iterate over all snapshots.
 	 */
 	if (flags & DS_FIND_SNAPSHOTS) {
 		/*
 		 * The config lock is taken as reader around the hold unless
 		 * we are already in sync context.
 		 */
 		if (!dsl_pool_sync_context(dp))
 			rw_enter(&dp->dp_config_rwlock, RW_READER);
 		err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
 		if (!dsl_pool_sync_context(dp))
 			rw_exit(&dp->dp_config_rwlock);
 
 		if (err == 0) {
 			/*
 			 * Only the snapshot-name ZAP object number is needed,
 			 * so the dataset hold is dropped before iterating.
 			 */
 			uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
 			dsl_dataset_rele(ds, FTAG);
 
 			for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
 			    zap_cursor_retrieve(&zc, attr) == 0;
 			    (void) zap_cursor_advance(&zc)) {
 				ASSERT(attr->za_integer_length ==
 				    sizeof (uint64_t));
 				ASSERT(attr->za_num_integers == 1);
 
 				child = kmem_asprintf("%s@%s",
 				    name, attr->za_name);
 				err = func(spa, attr->za_first_integer,
 				    child, arg);
 				strfree(child);
 				if (err)
 					break;
 			}
 			zap_cursor_fini(&zc);
 		}
 	}
 
 	dsl_dir_close(dd, FTAG);
 	kmem_free(attr, sizeof (zap_attribute_t));
 
 	if (err)
 		return (err);
 
 	/*
 	 * Apply to self if appropriate.
 	 */
 	err = func(spa, thisobj, name, arg);
 	return (err);
 }
 
 /*
  * Issue a speculative, non-blocking read of the root block of the dataset
  * called 'name'.  The signature matches the dmu_objset_find() callback
  * type; 'arg' is unused.
  *
  * Nothing is read when the dataset cannot be held, when its root block
  * pointer is a hole, or when ds_objset is already set.  The function
  * always returns 0 and ignores the arc_read() result.
  */
 /* ARGSUSED */
 int
 dmu_objset_prefetch(const char *name, void *arg)
 {
 	dsl_dataset_t *ds;
 
 	if (dsl_dataset_hold(name, FTAG, &ds))
 		return (0);
 
 	if (!BP_IS_HOLE(&ds->ds_phys->ds_bp)) {
 		/* ds_objset is examined under ds_opening_lock. */
 		mutex_enter(&ds->ds_opening_lock);
 		if (ds->ds_objset == NULL) {
 			uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
 			zbookmark_t zb;
 
 			SET_BOOKMARK(&zb, ds->ds_object, ZB_ROOT_OBJECT,
 			    ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 
 			/* No done callback: the read is fire-and-forget. */
 			(void) arc_read(NULL, dsl_dataset_get_spa(ds),
 			    &ds->ds_phys->ds_bp, NULL, NULL,
 			    ZIO_PRIORITY_ASYNC_READ,
 			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
 			    &aflags, &zb);
 		}
 		mutex_exit(&ds->ds_opening_lock);
 	}
 
 	dsl_dataset_rele(ds, FTAG);
 	return (0);
 }
 
 /*
  * Store 'user_ptr' as the objset's user pointer.  The caller must hold
  * os_user_ptr_lock (asserted).
  */
 void
 dmu_objset_set_user(objset_t *os, void *user_ptr)
 {
 	ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
 	os->os_user_ptr = user_ptr;
 }
 
 /*
  * Return the objset's user pointer.  The caller must hold
  * os_user_ptr_lock (asserted).
  */
 void *
 dmu_objset_get_user(objset_t *os)
 {
 	ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
 	return (os->os_user_ptr);
 }
Index: user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
===================================================================
--- user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c	(revision 247191)
+++ user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c	(revision 247192)
@@ -1,4364 +1,4361 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012 by Delphix. All rights reserved.
  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
  * All rights reserved.
  * Portions Copyright (c) 2011 Martin Matuska <mm@FreeBSD.org>
  */
 
 #include <sys/dmu_objset.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_synctask.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dmu_impl.h>
 #include <sys/dmu_tx.h>
 #include <sys/arc.h>
 #include <sys/zio.h>
 #include <sys/zap.h>
 #include <sys/zfeature.h>
 #include <sys/unique.h>
 #include <sys/zfs_context.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/spa.h>
 #include <sys/zfs_znode.h>
 #include <sys/zfs_onexit.h>
 #include <sys/zvol.h>
 #include <sys/dsl_scan.h>
 #include <sys/dsl_deadlist.h>
 
 /*
  * Sentinel owner: a dataset whose ds_owner equals this pointer is treated
  * as destroyed (see DSL_DATASET_IS_DESTROYED below).  Only the pointer
  * identity is compared, never the string contents.
  */
 static char *dsl_reaper = "the grim reaper";
 
 /* Forward declarations of sync-task callbacks defined later in the file. */
 static dsl_checkfunc_t dsl_dataset_destroy_begin_check;
 static dsl_syncfunc_t dsl_dataset_destroy_begin_sync;
 static dsl_syncfunc_t dsl_dataset_set_reservation_sync;
 
 /* Swap two uint64_t lvalues; each argument is evaluated more than once. */
 #define	SWITCH64(x, y) \
 	{ \
 		uint64_t __tmp = (x); \
 		(x) = (y); \
 		(y) = __tmp; \
 	}
 
 #define	DS_REF_MAX	(1ULL << 62)
 
 #define	DSL_DEADLIST_BLOCKSIZE	SPA_MAXBLOCKSIZE
 
 /* True once the dataset's owner has been set to the dsl_reaper sentinel. */
 #define	DSL_DATASET_IS_DESTROYED(ds)	((ds)->ds_owner == dsl_reaper)
 
 
 /*
  * Figure out how much of this delta should be propagated to the dsl_dir
  * layer.  If there's a refreservation, that space has already been
  * partially accounted for in our ancestors.
  *
  * With no reservation the whole delta is passed through.  Otherwise the
  * result is the change in MAX(unique bytes, reservation) caused by
  * applying 'delta' to the dataset's unique bytes.
  */
 static int64_t
 parent_delta(dsl_dataset_t *ds, int64_t delta)
 {
 	uint64_t before, after;
 	uint64_t unique, reserved;
 
 	reserved = ds->ds_reserved;
 	if (reserved == 0)
 		return (delta);
 
 	unique = ds->ds_phys->ds_unique_bytes;
 	before = MAX(unique, reserved);
 	after = MAX(unique + delta, reserved);
 
 	/* The charged amount can never exceed the raw delta in magnitude. */
 	ASSERT3U(ABS((int64_t)(after - before)), <=, ABS(delta));
 	return (after - before);
 }
 
 /*
  * Account for a newly written block 'bp' in dataset 'ds' during syncing
  * context.  Holes are ignored.  When 'ds' is NULL the space is charged to
  * the pool's MOS accounting instead.
  *
  * For a real dataset the referenced, compressed, uncompressed and unique
  * byte counters are increased, and the owning dsl_dir is charged the
  * portion computed by parent_delta(); the remainder is transferred from
  * DD_USED_REFRSRV to DD_USED_HEAD.
  */
 void
 dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
 	int compressed = BP_GET_PSIZE(bp);
 	int uncompressed = BP_GET_UCSIZE(bp);
 	int64_t delta;
 
 	dprintf_bp(bp, "ds=%p", ds);
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	/* It could have been compressed away to nothing */
 	if (BP_IS_HOLE(bp))
 		return;
 	ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
 	ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp)));
 	if (ds == NULL) {
 		dsl_pool_mos_diduse_space(tx->tx_pool,
 		    used, compressed, uncompressed);
 		return;
 	}
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 
 	/* Lock order: the directory's dd_lock is taken before ds_lock. */
 	mutex_enter(&ds->ds_dir->dd_lock);
 	mutex_enter(&ds->ds_lock);
 	/* delta must be computed before ds_unique_bytes is updated. */
 	delta = parent_delta(ds, used);
 	ds->ds_phys->ds_referenced_bytes += used;
 	ds->ds_phys->ds_compressed_bytes += compressed;
 	ds->ds_phys->ds_uncompressed_bytes += uncompressed;
 	ds->ds_phys->ds_unique_bytes += used;
 	mutex_exit(&ds->ds_lock);
 	dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
 	    compressed, uncompressed, tx);
 	dsl_dir_transfer_space(ds->ds_dir, used - delta,
 	    DD_USED_REFRSRV, DD_USED_HEAD, tx);
 	mutex_exit(&ds->ds_dir->dd_lock);
 }
 
 /*
  * Account for block 'bp' no longer being referenced by dataset 'ds', in
  * syncing context.  Returns the block's dsize as reported by
  * bp_get_dsize_sync(), or 0 for a hole.
  *
  * - ds == NULL: the block is freed and the MOS accounting is reduced.
  * - Block born after the previous snapshot: it is freed immediately and
  *   the dataset's unique bytes and the dsl_dir accounting are reduced.
  * - Otherwise the block goes on the dataset's deadlist; with 'async' set
  *   it is queued on ds_pending_deadlist instead of being inserted
  *   directly.
  *
  * In the dataset cases the referenced, compressed and uncompressed byte
  * counters are decreased before returning.
  */
 int
 dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
     boolean_t async)
 {
 	if (BP_IS_HOLE(bp))
 		return (0);
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(bp->blk_birth <= tx->tx_txg);
 
 	int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
 	int compressed = BP_GET_PSIZE(bp);
 	int uncompressed = BP_GET_UCSIZE(bp);
 
 	ASSERT(used > 0);
 	if (ds == NULL) {
 		dsl_free(tx->tx_pool, tx->tx_txg, bp);
 		dsl_pool_mos_diduse_space(tx->tx_pool,
 		    -used, -compressed, -uncompressed);
 		return (used);
 	}
 	ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);
 
 	ASSERT(!dsl_dataset_is_snapshot(ds));
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 
 	if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
 		int64_t delta;
 
 		dprintf_bp(bp, "freeing ds=%llu", ds->ds_object);
 		dsl_free(tx->tx_pool, tx->tx_txg, bp);
 
 		/* Lock order: dd_lock before ds_lock. */
 		mutex_enter(&ds->ds_dir->dd_lock);
 		mutex_enter(&ds->ds_lock);
 		ASSERT(ds->ds_phys->ds_unique_bytes >= used ||
 		    !DS_UNIQUE_IS_ACCURATE(ds));
 		/* delta must be computed before ds_unique_bytes changes. */
 		delta = parent_delta(ds, -used);
 		ds->ds_phys->ds_unique_bytes -= used;
 		mutex_exit(&ds->ds_lock);
 		dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
 		    delta, -compressed, -uncompressed, tx);
 		dsl_dir_transfer_space(ds->ds_dir, -used - delta,
 		    DD_USED_REFRSRV, DD_USED_HEAD, tx);
 		mutex_exit(&ds->ds_dir->dd_lock);
 	} else {
 		dprintf_bp(bp, "putting on dead list: %s", "");
 		if (async) {
 			/*
 			 * We are here as part of zio's write done callback,
 			 * which means we're a zio interrupt thread.  We can't
 			 * call dsl_deadlist_insert() now because it may block
 			 * waiting for I/O.  Instead, put bp on the deferred
 			 * queue and let dsl_pool_sync() finish the job.
 			 */
 			bplist_append(&ds->ds_pending_deadlist, bp);
 		} else {
 			dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
 		}
 		ASSERT3U(ds->ds_prev->ds_object, ==,
 		    ds->ds_phys->ds_prev_snap_obj);
 		ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
 		/* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
 		if (ds->ds_prev->ds_phys->ds_next_snap_obj ==
 		    ds->ds_object && bp->blk_birth >
 		    ds->ds_prev->ds_phys->ds_prev_snap_txg) {
 			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
 			mutex_enter(&ds->ds_prev->ds_lock);
 			ds->ds_prev->ds_phys->ds_unique_bytes += used;
 			mutex_exit(&ds->ds_prev->ds_lock);
 		}
 		/* Space born after the origin moves from HEAD to SNAP. */
 		if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
 			dsl_dir_transfer_space(ds->ds_dir, used,
 			    DD_USED_HEAD, DD_USED_SNAP, tx);
 		}
 	}
 	mutex_enter(&ds->ds_lock);
 	ASSERT3U(ds->ds_phys->ds_referenced_bytes, >=, used);
 	ds->ds_phys->ds_referenced_bytes -= used;
 	ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed);
 	ds->ds_phys->ds_compressed_bytes -= compressed;
 	ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
 	ds->ds_phys->ds_uncompressed_bytes -= uncompressed;
 	mutex_exit(&ds->ds_lock);
 
 	return (used);
 }
 
 /*
  * Return the txg to use as "most recent snapshot" for 'ds': the larger of
  * the on-disk ds_prev_snap_txg and, if it is newer than the pool's last
  * synced txg, the in-core ds_trysnap_txg.  Returns 0 for a NULL dataset.
  */
 uint64_t
 dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
 {
 	uint64_t trysnap = 0;
 
 	if (ds == NULL)
 		return (0);
 	/*
 	 * The snapshot creation could fail, but that would cause an
 	 * incorrect FALSE return, which would only result in an
 	 * overestimation of the amount of space that an operation would
 	 * consume, which is OK.
 	 *
 	 * There's also a small window where we could miss a pending
 	 * snapshot, because we could set the sync task in the quiescing
 	 * phase.  So this should only be used as a guess.
 	 */
 	if (ds->ds_trysnap_txg >
 	    spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa))
 		trysnap = ds->ds_trysnap_txg;
 	return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap));
 }
 
 /*
  * Report whether a block born in txg 'blk_birth' could be freed outright
  * when removed from 'ds', i.e. whether it was born after the txg returned
  * by dsl_dataset_prev_snap_txg().  When the answer is B_TRUE a DDT
  * prefetch for 'bp' is issued as a side effect.
  */
 boolean_t
 dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
     uint64_t blk_birth)
 {
 	if (blk_birth > dsl_dataset_prev_snap_txg(ds)) {
 		ddt_prefetch(dsl_dataset_get_spa(ds), bp);
 		return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Tear down and free the in-core dsl_dataset_t 'dsv'.
  *
  * Registered as the dbuf user-eviction callback by dsl_dataset_get_ref(),
  * and also called directly by dsl_dataset_disown() with db == NULL for a
  * dataset that no longer has a dbuf.  In the latter case the deadlist is
  * expected to be closed already (asserted) and is not closed again.
  *
  * Releases the reference on ds_prev and the dsl_dir, evicts the objset if
  * one is attached, and destroys every lock and condition variable that
  * dsl_dataset_get_ref() initialized, including ds_sendstream_lock.
  */
 /* ARGSUSED */
 static void
 dsl_dataset_evict(dmu_buf_t *db, void *dsv)
 {
 	dsl_dataset_t *ds = dsv;
 
 	ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds));
 
 	unique_remove(ds->ds_fsid_guid);
 
 	if (ds->ds_objset != NULL)
 		dmu_objset_evict(ds->ds_objset);
 
 	if (ds->ds_prev) {
 		dsl_dataset_drop_ref(ds->ds_prev, ds);
 		ds->ds_prev = NULL;
 	}
 
 	bplist_destroy(&ds->ds_pending_deadlist);
 	if (db != NULL) {
 		dsl_deadlist_close(&ds->ds_deadlist);
 	} else {
 		ASSERT(ds->ds_deadlist.dl_dbuf == NULL);
 		ASSERT(!ds->ds_deadlist.dl_oldfmt);
 	}
 	if (ds->ds_dir)
 		dsl_dir_close(ds->ds_dir, ds);
 
 	ASSERT(!list_link_active(&ds->ds_synced_link));
 
 	/* A lock still owned by this thread is dropped before destruction. */
 	if (mutex_owned(&ds->ds_lock))
 		mutex_exit(&ds->ds_lock);
 	mutex_destroy(&ds->ds_lock);
 	mutex_destroy(&ds->ds_recvlock);
 	if (mutex_owned(&ds->ds_opening_lock))
 		mutex_exit(&ds->ds_opening_lock);
 	mutex_destroy(&ds->ds_opening_lock);
 	/* Initialized alongside the other locks in dsl_dataset_get_ref(). */
 	mutex_destroy(&ds->ds_sendstream_lock);
 	rw_destroy(&ds->ds_rwlock);
 	cv_destroy(&ds->ds_exclusive_cv);
 
 	kmem_free(ds, sizeof (dsl_dataset_t));
 }
 
 /*
  * Fill in ds->ds_snapname if it is still empty by searching the head
  * dataset's snapshot-name ZAP for an entry whose value is ds->ds_object.
  *
  * Returns 0 without doing any lookup when the name is already set or when
  * ds_next_snap_obj is 0; otherwise returns the result of the bonus hold
  * or of zap_value_search().
  */
 static int
 dsl_dataset_get_snapname(dsl_dataset_t *ds)
 {
 	dsl_dataset_phys_t *headphys;
 	int err;
 	dmu_buf_t *headdbuf;
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	objset_t *mos = dp->dp_meta_objset;
 
 	if (ds->ds_snapname[0])
 		return (0);
 	if (ds->ds_phys->ds_next_snap_obj == 0)
 		return (0);
 
 	/* Read the head dataset's phys directly through its bonus buffer. */
 	err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj,
 	    FTAG, &headdbuf);
 	if (err)
 		return (err);
 	headphys = headdbuf->db_data;
 	err = zap_value_search(dp->dp_meta_objset,
 	    headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname);
 	dmu_buf_rele(headdbuf, FTAG);
 	return (err);
 }
 
 /*
  * Look up snapshot 'name' in the snapshot-name ZAP of 'ds' and store its
  * object number in *value.
  *
  * Datasets flagged DS_FLAG_CI_DATASET are searched with MT_FIRST, all
  * others with MT_EXACT.  If the MT_FIRST lookup reports ENOTSUP the
  * lookup is retried with plain zap_lookup().  Returns the ZAP error code.
  */
 static int
 dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
 {
 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 	uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
 	matchtype_t mt = (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) ?
 	    MT_FIRST : MT_EXACT;
 	int err;
 
 	err = zap_lookup_norm(mos, snapobj, name, 8, 1,
 	    value, mt, NULL, 0, NULL);
 	if (mt == MT_FIRST && err == ENOTSUP)
 		err = zap_lookup(mos, snapobj, name, 8, 1, value);
 
 	return (err);
 }
 
 /*
  * Remove snapshot 'name' from the snapshot-name ZAP of 'ds' in 'tx',
  * after updating the directory's snapshot change time.
  *
  * Datasets flagged DS_FLAG_CI_DATASET are matched with MT_FIRST, all
  * others with MT_EXACT.  If the MT_FIRST removal reports ENOTSUP it is
  * retried with plain zap_remove().  Returns the ZAP error code.
  */
 static int
 dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx)
 {
 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 	uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
 	matchtype_t mt;
 	int err;
 
 	dsl_dir_snap_cmtime_update(ds->ds_dir);
 
 	mt = (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) ?
 	    MT_FIRST : MT_EXACT;
 
 	err = zap_remove_norm(mos, snapobj, name, mt, tx);
 	if (mt == MT_FIRST && err == ENOTSUP)
 		err = zap_remove(mos, snapobj, name, tx);
 
 	return (err);
 }
 
 /*
  * Obtain the in-core dsl_dataset_t for dataset object 'dsobj', taking a
  * hold on its bonus dbuf with 'tag'.  On success *dsp is set and 0 is
  * returned; on any failure the dbuf hold is released before returning.
  *
  * The caller must hold dp_config_rwlock or be in sync context (asserted).
  *
  * If the dbuf has no user yet, a new dsl_dataset_t is built and offered
  * as the dbuf user; if another thread registered one first, ours is torn
  * down and the winner is used.
  *
  * Returns EINVAL if 'dsobj' is not a DMU_OT_DSL_DATASET object, ENOENT if
  * the dataset is marked destroyed and we are not in sync context, or the
  * error from any of the lookups performed while constructing it.
  */
 static int
 dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
     dsl_dataset_t **dsp)
 {
 	objset_t *mos = dp->dp_meta_objset;
 	dmu_buf_t *dbuf;
 	dsl_dataset_t *ds;
 	int err;
 	dmu_object_info_t doi;
 
 	ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
 	    dsl_pool_sync_context(dp));
 
 	err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
 	if (err)
 		return (err);
 
 	/* Make sure dsobj has the correct object type. */
 	dmu_object_info_from_db(dbuf, &doi);
 	if (doi.doi_type != DMU_OT_DSL_DATASET) {
 		/* Drop the hold taken above so it is not leaked. */
 		dmu_buf_rele(dbuf, tag);
 		return (EINVAL);
 	}
 
 	ds = dmu_buf_get_user(dbuf);
 	if (ds == NULL) {
-		dsl_dataset_t *winner;
+		dsl_dataset_t *winner = NULL;
 
 		ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
 		ds->ds_dbuf = dbuf;
 		ds->ds_object = dsobj;
 		ds->ds_phys = dbuf->db_data;
 
 		mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
 		mutex_init(&ds->ds_recvlock, NULL, MUTEX_DEFAULT, NULL);
 		mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
 		mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL);
 
 		rw_init(&ds->ds_rwlock, 0, 0, 0);
 		cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL);
 
 		bplist_create(&ds->ds_pending_deadlist);
 		dsl_deadlist_open(&ds->ds_deadlist,
 		    mos, ds->ds_phys->ds_deadlist_obj);
 
 		list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t),
 		    offsetof(dmu_sendarg_t, dsa_link));
 
 		if (err == 0) {
 			err = dsl_dir_open_obj(dp,
 			    ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
 		}
 		if (err) {
 			/* No dsl_dir was opened; undo only the local setup. */
 			mutex_destroy(&ds->ds_lock);
 			mutex_destroy(&ds->ds_recvlock);
 			mutex_destroy(&ds->ds_opening_lock);
 			mutex_destroy(&ds->ds_sendstream_lock);
 			rw_destroy(&ds->ds_rwlock);
 			cv_destroy(&ds->ds_exclusive_cv);
 			bplist_destroy(&ds->ds_pending_deadlist);
 			dsl_deadlist_close(&ds->ds_deadlist);
 			kmem_free(ds, sizeof (dsl_dataset_t));
 			dmu_buf_rele(dbuf, tag);
 			return (err);
 		}
 
 		if (!dsl_dataset_is_snapshot(ds)) {
 			ds->ds_snapname[0] = '\0';
 			if (ds->ds_phys->ds_prev_snap_obj) {
 				/* The new dataset itself is the hold tag. */
 				err = dsl_dataset_get_ref(dp,
 				    ds->ds_phys->ds_prev_snap_obj,
 				    ds, &ds->ds_prev);
 			}
 		} else {
 			if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
 				err = dsl_dataset_get_snapname(ds);
 			if (err == 0 && ds->ds_phys->ds_userrefs_obj != 0) {
 				err = zap_count(
 				    ds->ds_dir->dd_pool->dp_meta_objset,
 				    ds->ds_phys->ds_userrefs_obj,
 				    &ds->ds_userrefs);
 			}
 		}
 
 		if (err == 0 && !dsl_dataset_is_snapshot(ds)) {
 			/*
 			 * In sync context, we're called with either no lock
 			 * or with the write lock.  If we're not syncing,
 			 * we're always called with the read lock held.
 			 */
 			boolean_t need_lock =
 			    !RW_WRITE_HELD(&dp->dp_config_rwlock) &&
 			    dsl_pool_sync_context(dp);
 
 			if (need_lock)
 				rw_enter(&dp->dp_config_rwlock, RW_READER);
 
 			err = dsl_prop_get_ds(ds,
 			    "refreservation", sizeof (uint64_t), 1,
 			    &ds->ds_reserved, NULL);
 			if (err == 0) {
 				err = dsl_prop_get_ds(ds,
 				    "refquota", sizeof (uint64_t), 1,
 				    &ds->ds_quota, NULL);
 			}
 
 			if (need_lock)
 				rw_exit(&dp->dp_config_rwlock);
 		} else {
 			ds->ds_reserved = ds->ds_quota = 0;
 		}
 
-		if (err == 0) {
-			winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys,
-			    dsl_dataset_evict);
-		}
-		if (err || winner) {
+		if (err != 0 || (winner = dmu_buf_set_user_ie(dbuf, ds,
+		    &ds->ds_phys, dsl_dataset_evict)) != NULL) {
 			/*
 			 * Either construction failed or another thread
 			 * registered its dataset first: discard ours.
 			 */
 			bplist_destroy(&ds->ds_pending_deadlist);
 			dsl_deadlist_close(&ds->ds_deadlist);
 			if (ds->ds_prev)
 				dsl_dataset_drop_ref(ds->ds_prev, ds);
 			dsl_dir_close(ds->ds_dir, ds);
 			mutex_destroy(&ds->ds_lock);
 			mutex_destroy(&ds->ds_recvlock);
 			mutex_destroy(&ds->ds_opening_lock);
 			mutex_destroy(&ds->ds_sendstream_lock);
 			rw_destroy(&ds->ds_rwlock);
 			cv_destroy(&ds->ds_exclusive_cv);
 			kmem_free(ds, sizeof (dsl_dataset_t));
 			if (err) {
 				dmu_buf_rele(dbuf, tag);
 				return (err);
 			}
 			ds = winner;
 		} else {
 			ds->ds_fsid_guid =
 			    unique_insert(ds->ds_phys->ds_fsid_guid);
 		}
 	}
 	ASSERT3P(ds->ds_dbuf, ==, dbuf);
 	ASSERT3P(ds->ds_phys, ==, dbuf->db_data);
 	ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 ||
 	    spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
 	    dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
 	mutex_enter(&ds->ds_lock);
 	if (!dsl_pool_sync_context(dp) && DSL_DATASET_IS_DESTROYED(ds)) {
 		mutex_exit(&ds->ds_lock);
 		dmu_buf_rele(ds->ds_dbuf, tag);
 		return (ENOENT);
 	}
 	mutex_exit(&ds->ds_lock);
 	*dsp = ds;
 	return (0);
 }
 
 /*
  * Turn a reference obtained from dsl_dataset_get_ref() into a hold by
  * taking ds_rwlock as READER.  In sync context no lock is taken.
  *
  * Returns 0 on success.  If the dataset turns out to be destroyed while
  * waiting, the reference is dropped and ENOENT is returned.  The caller's
  * dp_config_rwlock is held again (as READER) on every return path, though
  * it may have been dropped and retaken in between.
  */
 static int
 dsl_dataset_hold_ref(dsl_dataset_t *ds, void *tag)
 {
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 
 	/*
 	 * In syncing context we don't want the rwlock lock: there
 	 * may be an existing writer waiting for sync phase to
 	 * finish.  We don't need to worry about such writers, since
 	 * sync phase is single-threaded, so the writer can't be
 	 * doing anything while we are active.
 	 */
 	if (dsl_pool_sync_context(dp)) {
 		ASSERT(!DSL_DATASET_IS_DESTROYED(ds));
 		return (0);
 	}
 
 	/*
 	 * Normal users will hold the ds_rwlock as a READER until they
 	 * are finished (i.e., call dsl_dataset_rele()).  "Owners" will
 	 * drop their READER lock after they set the ds_owner field.
 	 *
 	 * If the dataset is being destroyed, the destroy thread will
 	 * obtain a WRITER lock for exclusive access after it's done its
 	 * open-context work and then change the ds_owner to
 	 * dsl_reaper once destruction is assured.  So threads
 	 * may block here temporarily, until the "destructability" of
 	 * the dataset is determined.
 	 */
 	ASSERT(!RW_WRITE_HELD(&dp->dp_config_rwlock));
 	mutex_enter(&ds->ds_lock);
 	while (!rw_tryenter(&ds->ds_rwlock, RW_READER)) {
 		/* Let go of the config lock while sleeping on the cv. */
 		rw_exit(&dp->dp_config_rwlock);
 		cv_wait(&ds->ds_exclusive_cv, &ds->ds_lock);
 		if (DSL_DATASET_IS_DESTROYED(ds)) {
 			mutex_exit(&ds->ds_lock);
 			dsl_dataset_drop_ref(ds, tag);
 			rw_enter(&dp->dp_config_rwlock, RW_READER);
 			return (ENOENT);
 		}
 		/*
 		 * The dp_config_rwlock lives above the ds_lock. And
 		 * we need to check DSL_DATASET_IS_DESTROYED() while
 		 * holding the ds_lock, so we have to drop and reacquire
 		 * the ds_lock here.
 		 */
 		mutex_exit(&ds->ds_lock);
 		rw_enter(&dp->dp_config_rwlock, RW_READER);
 		mutex_enter(&ds->ds_lock);
 	}
 	mutex_exit(&ds->ds_lock);
 	return (0);
 }
 
 /*
  * Hold the dataset with object number 'dsobj': take a reference with
  * dsl_dataset_get_ref() and, if that succeeds, convert it into a hold with
  * dsl_dataset_hold_ref().  Returns 0 or the error from either step.
  */
 int
 dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
     dsl_dataset_t **dsp)
 {
 	int err;
 
 	err = dsl_dataset_get_ref(dp, dsobj, tag, dsp);
 	if (err == 0)
 		err = dsl_dataset_hold_ref(*dsp, tag);
 
 	return (err);
 }
 
 /*
  * Hold the dataset with object number 'dsobj' and then try to become its
  * owner with 'tag'.  Returns the hold error if the hold fails.  If
  * ownership cannot be taken the hold is released, *dsp is set to NULL and
  * EBUSY is returned.
  */
 int
 dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, boolean_t inconsistentok,
     void *tag, dsl_dataset_t **dsp)
 {
 	int err;
 
 	err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp);
 	if (err == 0 && !dsl_dataset_tryown(*dsp, inconsistentok, tag)) {
 		dsl_dataset_rele(*dsp, tag);
 		*dsp = NULL;
 		err = EBUSY;
 	}
 
 	return (err);
 }
 
 /*
  * Hold the dataset called 'name', which may be "fs" or "fs@snap".
  *
  * The containing dsl_dir is opened, dp_config_rwlock is taken as READER,
  * and the head dataset is held.  If the name has a snapshot component the
  * snapshot is looked up in the head's snapshot-name ZAP, the head hold is
  * released, and the snapshot is held instead.
  *
  * Returns 0 with *dsp set on success; ENOENT if the directory has no head
  * dataset or the trailing component does not start with '@'; otherwise
  * the error from the failing lookup or hold.
  */
 int
 dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp)
 {
 	dsl_dir_t *dd;
 	dsl_pool_t *dp;
 	const char *snapname;
 	uint64_t obj;
 	int err = 0;
 
 	err = dsl_dir_open_spa(NULL, name, FTAG, &dd, &snapname);
 	if (err)
 		return (err);
 
 	dp = dd->dd_pool;
 	obj = dd->dd_phys->dd_head_dataset_obj;
 	rw_enter(&dp->dp_config_rwlock, RW_READER);
 	if (obj)
 		err = dsl_dataset_get_ref(dp, obj, tag, dsp);
 	else
 		err = ENOENT;
 	if (err)
 		goto out;
 
 	err = dsl_dataset_hold_ref(*dsp, tag);
 
 	/* we may be looking for a snapshot */
 	if (err == 0 && snapname != NULL) {
 		dsl_dataset_t *ds = NULL;
 
 		/* Skip the '@'; anything else is not a snapshot name. */
 		if (*snapname++ != '@') {
 			dsl_dataset_rele(*dsp, tag);
 			err = ENOENT;
 			goto out;
 		}
 
 		dprintf("looking for snapshot '%s'\n", snapname);
 		err = dsl_dataset_snap_lookup(*dsp, snapname, &obj);
 		if (err == 0)
 			err = dsl_dataset_get_ref(dp, obj, tag, &ds);
 		/* The head was only needed for the lookup. */
 		dsl_dataset_rele(*dsp, tag);
 
 		ASSERT3U((err == 0), ==, (ds != NULL));
 
 		if (ds) {
 			/* Cache the name we already know, if not yet set. */
 			mutex_enter(&ds->ds_lock);
 			if (ds->ds_snapname[0] == 0)
 				(void) strlcpy(ds->ds_snapname, snapname,
 				    sizeof (ds->ds_snapname));
 			mutex_exit(&ds->ds_lock);
 			err = dsl_dataset_hold_ref(ds, tag);
 			*dsp = err ? NULL : ds;
 		}
 	}
 out:
 	rw_exit(&dp->dp_config_rwlock);
 	dsl_dir_close(dd, FTAG);
 	return (err);
 }
 
 /*
  * Hold the dataset called 'name' and then try to become its owner with
  * 'tag'.  Returns the hold error if the hold fails.  If ownership cannot
  * be taken the hold is released, *dsp is set to NULL (matching
  * dsl_dataset_own_obj(), so the caller is never left with a pointer to a
  * released dataset) and EBUSY is returned.
  */
 int
 dsl_dataset_own(const char *name, boolean_t inconsistentok,
     void *tag, dsl_dataset_t **dsp)
 {
 	int err = dsl_dataset_hold(name, tag, dsp);
 	if (err)
 		return (err);
 	if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) {
 		dsl_dataset_rele(*dsp, tag);
 		*dsp = NULL;
 		return (EBUSY);
 	}
 	return (0);
 }
 
 /*
  * Write the full name of 'ds' into 'name': the dsl_dir name, followed by
  * "@snapname" when the dataset has a snapshot name.  A NULL dataset is
  * named "mos".  No length is passed in, so the caller must supply a
  * buffer large enough for the result (see dsl_dataset_namelen()).
  */
 void
 dsl_dataset_name(dsl_dataset_t *ds, char *name)
 {
 	if (ds == NULL) {
 		(void) strcpy(name, "mos");
 	} else {
 		dsl_dir_name(ds->ds_dir, name);
 		VERIFY(0 == dsl_dataset_get_snapname(ds));
 		if (ds->ds_snapname[0]) {
 			(void) strcat(name, "@");
 			/*
 			 * We use a "recursive" mutex so that we
 			 * can call dprintf_ds() with ds_lock held.
 			 */
 			if (!MUTEX_HELD(&ds->ds_lock)) {
 				mutex_enter(&ds->ds_lock);
 				(void) strcat(name, ds->ds_snapname);
 				mutex_exit(&ds->ds_lock);
 			} else {
 				(void) strcat(name, ds->ds_snapname);
 			}
 		}
 	}
 }
 
 /*
  * Return the length, excluding the terminating NUL, of the name that
  * dsl_dataset_name() would produce for 'ds'.  ds_lock is taken around the
  * snapshot-name read unless the caller already holds it.
  */
 static int
 dsl_dataset_namelen(dsl_dataset_t *ds)
 {
 	int result;
 
 	if (ds == NULL) {
 		result = 3;	/* "mos" */
 	} else {
 		result = dsl_dir_namelen(ds->ds_dir);
 		VERIFY(0 == dsl_dataset_get_snapname(ds));
 		if (ds->ds_snapname[0]) {
 			++result;	/* adding one for the @-sign */
 			if (!MUTEX_HELD(&ds->ds_lock)) {
 				mutex_enter(&ds->ds_lock);
 				result += strlen(ds->ds_snapname);
 				mutex_exit(&ds->ds_lock);
 			} else {
 				result += strlen(ds->ds_snapname);
 			}
 		}
 	}
 
 	return (result);
 }
 
 /*
  * Drop a reference taken with dsl_dataset_get_ref() by releasing the hold
  * on the dataset's bonus dbuf for 'tag'.  ds_rwlock is not touched.
  */
 void
 dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag)
 {
 	dmu_buf_rele(ds->ds_dbuf, tag);
 }
 
 /*
  * Release a hold: drop ds_rwlock (which is only taken outside sync
  * context, mirroring dsl_dataset_hold_ref()) and then drop the reference.
  */
 void
 dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
 {
 	if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) {
 		rw_exit(&ds->ds_rwlock);
 	}
 	dsl_dataset_drop_ref(ds, tag);
 }
 
 /*
  * Give up ownership of 'ds'.  The caller must either be the owner
  * ('tag') of a dataset that still has a dbuf, or be disowning a destroyed
  * dataset whose dbuf is already gone (asserted).
  *
  * Clears ds_owner and, if ds_rwlock is held as WRITER, drops it and wakes
  * the threads waiting on ds_exclusive_cv.  Finally the reference is
  * dropped, or, when there is no dbuf, the structure is evicted directly.
  */
 void
 dsl_dataset_disown(dsl_dataset_t *ds, void *tag)
 {
 	ASSERT((ds->ds_owner == tag && ds->ds_dbuf) ||
 	    (DSL_DATASET_IS_DESTROYED(ds) && ds->ds_dbuf == NULL));
 
 	mutex_enter(&ds->ds_lock);
 	ds->ds_owner = NULL;
 	if (RW_WRITE_HELD(&ds->ds_rwlock)) {
 		rw_exit(&ds->ds_rwlock);
 		cv_broadcast(&ds->ds_exclusive_cv);
 	}
 	mutex_exit(&ds->ds_lock);
 	if (ds->ds_dbuf)
 		dsl_dataset_drop_ref(ds, tag);
 	else
 		dsl_dataset_evict(NULL, ds);
 }
 
 /*
  * Try to make 'tag' the owner of a held dataset.  Succeeds only when the
  * dataset has no owner and is either consistent or 'inconsistentok' is
  * set.  On success the READER hold on ds_rwlock is dropped (outside sync
  * context) and TRUE is returned; otherwise nothing changes and FALSE is
  * returned.
  */
 boolean_t
 dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *tag)
 {
 	boolean_t gotit = FALSE;
 
 	mutex_enter(&ds->ds_lock);
 	if (ds->ds_owner == NULL &&
 	    (!DS_IS_INCONSISTENT(ds) || inconsistentok)) {
 		ds->ds_owner = tag;
 		if (!dsl_pool_sync_context(ds->ds_dir->dd_pool))
 			rw_exit(&ds->ds_rwlock);
 		gotit = TRUE;
 	}
 	mutex_exit(&ds->ds_lock);
 	return (gotit);
 }
 
 /*
  * Take ds_rwlock as WRITER on behalf of the dataset's owner, unless it is
  * already write-held.  'owner' must be the current ds_owner (asserted).
  */
 void
 dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner)
 {
 	ASSERT3P(owner, ==, ds->ds_owner);
 	if (!RW_WRITE_HELD(&ds->ds_rwlock))
 		rw_enter(&ds->ds_rwlock, RW_WRITER);
 }
 
 /*
  * Create the head dataset object for directory 'dd' in syncing context
  * and return its object number.  'dd' must not have a head dataset yet
  * (asserted).
  *
  * A new DMU_OT_DSL_DATASET object is allocated in the MOS and its phys is
  * initialized with fresh guids, a new snapshot-name ZAP and the creation
  * time and txg.  If 'origin' is NULL the pool's dp_origin_snap is used as
  * origin when there is one.  With an origin, the new dataset inherits its
  * space counters, block pointer and flags, the origin's child count is
  * incremented, the deadlist is cloned from the origin's head dataset, and
  * the clone is recorded in the origin's clone ZAPs when the pool version
  * supports them.  Without an origin a fresh deadlist is allocated.
  */
 uint64_t
 dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
     uint64_t flags, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = dd->dd_pool;
 	dmu_buf_t *dbuf;
 	dsl_dataset_phys_t *dsphys;
 	uint64_t dsobj;
 	objset_t *mos = dp->dp_meta_objset;
 
 	if (origin == NULL)
 		origin = dp->dp_origin_snap;
 
 	ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);
 	ASSERT(origin == NULL || origin->ds_phys->ds_num_children > 0);
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);
 
 	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
 	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
 	VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
 	dmu_buf_will_dirty(dbuf, tx);
 	dsphys = dbuf->db_data;
 	bzero(dsphys, sizeof (dsl_dataset_phys_t));
 	dsphys->ds_dir_obj = dd->dd_object;
 	dsphys->ds_flags = flags;
 	dsphys->ds_fsid_guid = unique_create();
 	/* Keep drawing random bytes until the guid is non-zero. */
 	do {
 		(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
 		    sizeof (dsphys->ds_guid));
 	} while (dsphys->ds_guid == 0);
 	dsphys->ds_snapnames_zapobj =
 	    zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP,
 	    DMU_OT_NONE, 0, tx);
 	dsphys->ds_creation_time = gethrestime_sec();
 	dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg;
 
 	if (origin == NULL) {
 		dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx);
 	} else {
 		dsl_dataset_t *ohds;
 
 		/* Start out as an exact copy of the origin snapshot. */
 		dsphys->ds_prev_snap_obj = origin->ds_object;
 		dsphys->ds_prev_snap_txg =
 		    origin->ds_phys->ds_creation_txg;
 		dsphys->ds_referenced_bytes =
 		    origin->ds_phys->ds_referenced_bytes;
 		dsphys->ds_compressed_bytes =
 		    origin->ds_phys->ds_compressed_bytes;
 		dsphys->ds_uncompressed_bytes =
 		    origin->ds_phys->ds_uncompressed_bytes;
 		dsphys->ds_bp = origin->ds_phys->ds_bp;
 		dsphys->ds_flags |= origin->ds_phys->ds_flags;
 
 		dmu_buf_will_dirty(origin->ds_dbuf, tx);
 		origin->ds_phys->ds_num_children++;
 
 		/* The deadlist is cloned from the origin's head dataset. */
 		VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
 		    origin->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ohds));
 		dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist,
 		    dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx);
 		dsl_dataset_rele(ohds, FTAG);
 
 		if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
 			if (origin->ds_phys->ds_next_clones_obj == 0) {
 				origin->ds_phys->ds_next_clones_obj =
 				    zap_create(mos,
 				    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
 			}
 			VERIFY(0 == zap_add_int(mos,
 			    origin->ds_phys->ds_next_clones_obj,
 			    dsobj, tx));
 		}
 
 		dmu_buf_will_dirty(dd->dd_dbuf, tx);
 		dd->dd_phys->dd_origin_obj = origin->ds_object;
 		if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
 			if (origin->ds_dir->dd_phys->dd_clones == 0) {
 				dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
 				origin->ds_dir->dd_phys->dd_clones =
 				    zap_create(mos,
 				    DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
 			}
 			VERIFY3U(0, ==, zap_add_int(mos,
 			    origin->ds_dir->dd_phys->dd_clones, dsobj, tx));
 		}
 	}
 
 	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
 		dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
 
 	dmu_buf_rele(dbuf, FTAG);
 
 	/* Publish the new dataset as the directory's head. */
 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
 	dd->dd_phys->dd_head_dataset_obj = dsobj;
 
 	return (dsobj);
 }
 
 /*
  * Create a new dsl_dir called 'lastname' under 'pdd' together with its
  * head dataset, and return the dataset's object number.  'lastname' must
  * not begin with '@' (asserted).  Create-time delegated permissions are
  * applied to the new directory on behalf of 'cr'.
  *
  * When 'origin' is non-NULL, the in-core ZIL header of the new objset is
  * zeroed and the dataset dirtied.
  */
 uint64_t
 dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
     dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = pdd->dd_pool;
 	uint64_t dsobj, ddobj;
 	dsl_dir_t *dd;
 
 	ASSERT(lastname[0] != '@');
 
 	ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
 	VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd));
 
 	dsobj = dsl_dataset_create_sync_dd(dd, origin, flags, tx);
 
 	dsl_deleg_set_create_perms(dd, tx, cr);
 
 	dsl_dir_close(dd, FTAG);
 
 	/*
 	 * If we are creating a clone, make sure we zero out any stale
 	 * data from the origin snapshots zil header.
 	 */
 	if (origin != NULL) {
 		dsl_dataset_t *ds;
 		objset_t *os;
 
 		VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
 		VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os));
 		bzero(&os->os_zil_header, sizeof (os->os_zil_header));
 		dsl_dataset_dirty(ds, tx);
 		dsl_dataset_rele(ds, FTAG);
 	}
 
 	return (dsobj);
 }
 
 #ifdef __FreeBSD__
 /* FreeBSD ioctl compat begin */
 /*
  * Argument block for dsl_check_snap_cb(): the nvlist being filled in and
  * the snapshot name appended to every visited dataset name.
  */
 struct destroyarg {
 	nvlist_t *nvl;		/* receives one boolean entry per name */
 	const char *snapname;	/* snapshot component, without the '@' */
 };
 
 /*
  * dmu_objset_find() callback: add "<name>@<snapname>" to the nvlist in
  * the struct destroyarg passed as 'arg'.  Always returns 0; a failure to
  * add the entry trips the VERIFY.
  */
 static int
 dsl_check_snap_cb(const char *name, void *arg)
 {
 	struct destroyarg *da = arg;
 	char *dsname;
 
 	dsname = kmem_asprintf("%s@%s", name, da->snapname);
 	VERIFY(nvlist_add_boolean(da->nvl, dsname) == 0);
 	/*
 	 * The nvlist keeps its own copy of the name, so release the
 	 * formatted string here; it was previously leaked on every call.
 	 */
 	kmem_free(dsname, strlen(dsname) + 1);
 
 	return (0);
 }
 
 /*
  * Populate 'snaps' with a boolean entry "<dataset>@<snapname>" for
  * 'fsname' and every dataset below it, by walking them with
  * dmu_objset_find(DS_FIND_CHILDREN) and dsl_check_snap_cb().  No check is
  * made that the named snapshots exist.  Returns the result of the walk.
  */
 int
 dmu_get_recursive_snaps_nvl(const char *fsname, const char *snapname,
     nvlist_t *snaps)
 {
 	struct destroyarg *da;
 	int err;
 
 	da = kmem_zalloc(sizeof (struct destroyarg), KM_SLEEP);
 	da->nvl = snaps;
 	da->snapname = snapname;
 	err = dmu_objset_find(fsname, dsl_check_snap_cb, da,
 	    DS_FIND_CHILDREN);
 	kmem_free(da, sizeof (struct destroyarg));
 
 	return (err);
 }
 /* FreeBSD ioctl compat end */
 #endif /* __FreeBSD__ */
 
 /*
  * The snapshots must all be in the same pool.
  *
  * Destroy every snapshot named in 'snaps' as one sync task group.  The
  * pool is opened from the first entry's name.  Each snapshot is owned
  * exclusively and queued for destruction; names that do not exist
  * (ENOENT) are skipped.  Any other error while taking ownership stops the
  * loop, copies the offending name into 'failed' and prevents the group
  * from being executed.
  *
  * After the group has run (or been skipped), every queued dataset is
  * disowned; if a task recorded an error, that dataset's name is written
  * to 'failed'.  Returns 0 for an empty list, otherwise the first error.
  * 'failed' is written with strcpy(), so the caller must size it for a
  * full dataset name.
  */
 int
 dmu_snapshots_destroy_nvl(nvlist_t *snaps, boolean_t defer, char *failed)
 {
 	int err;
 	dsl_sync_task_t *dst;
 	spa_t *spa;
 	nvpair_t *pair;
 	dsl_sync_task_group_t *dstg;
 
 	pair = nvlist_next_nvpair(snaps, NULL);
 	if (pair == NULL)
 		return (0);
 
 	err = spa_open(nvpair_name(pair), &spa, FTAG);
 	if (err)
 		return (err);
 	dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
 
 	for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
 	    pair = nvlist_next_nvpair(snaps, pair)) {
 		dsl_dataset_t *ds;
 
 		/* The task group doubles as the ownership tag. */
 		err = dsl_dataset_own(nvpair_name(pair), B_TRUE, dstg, &ds);
 		if (err == 0) {
 			struct dsl_ds_destroyarg *dsda;
 
 			dsl_dataset_make_exclusive(ds, dstg);
 			dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg),
 			    KM_SLEEP);
 			dsda->ds = ds;
 			dsda->defer = defer;
 			dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
 			    dsl_dataset_destroy_sync, dsda, dstg, 0);
 		} else if (err == ENOENT) {
 			/* Already gone: not an error. */
 			err = 0;
 		} else {
 			(void) strcpy(failed, nvpair_name(pair));
 			break;
 		}
 	}
 
 	if (err == 0)
 		err = dsl_sync_task_group_wait(dstg);
 
 	/* Clean up every queued task, whether or not the group ran. */
 	for (dst = list_head(&dstg->dstg_tasks); dst;
 	    dst = list_next(&dstg->dstg_tasks, dst)) {
 		struct dsl_ds_destroyarg *dsda = dst->dst_arg1;
 		dsl_dataset_t *ds = dsda->ds;
 
 		/*
 		 * Return the file system name that triggered the error
 		 */
 		if (dst->dst_err) {
 			dsl_dataset_name(ds, failed);
 		}
 		ASSERT3P(dsda->rm_origin, ==, NULL);
 		dsl_dataset_disown(ds, dstg);
 		kmem_free(dsda, sizeof (struct dsl_ds_destroyarg));
 	}
 
 	dsl_sync_task_group_destroy(dstg);
 	spa_close(spa, FTAG);
 	return (err);
 
 }
 
 /*
  * Report, under ds_lock, whether 'ds' has exactly two children, no user
  * references, and is marked for deferred destroy.
  */
 static boolean_t
 dsl_dataset_might_destroy_origin(dsl_dataset_t *ds)
 {
 	boolean_t destroyable;
 
 	mutex_enter(&ds->ds_lock);
 	destroyable = (ds->ds_phys->ds_num_children == 2 &&
 	    ds->ds_userrefs == 0 && DS_IS_DEFER_DESTROY(ds)) ?
 	    B_TRUE : B_FALSE;
 	mutex_exit(&ds->ds_lock);
 
 	return (destroyable);
 }
 
 /*
  * Open-context preparation for removing a clone's origin together with
  * the clone.  The origin is picked up when all of these hold:
  *	1) the clone's origin has no other children
  *	2) the clone's origin has no user references
  *	3) the clone's origin has been marked for deferred destruction
  * In that case the origin is unmounted (kernel only), owned with 'tag',
  * recorded in dsda->rm_origin and made exclusive.
  *
  * Returns 0 on success or when nothing needs doing, else the error from
  * zfs_unmount_snap() or dsl_dataset_own().
  */
 static int
 dsl_dataset_origin_rm_prep(struct dsl_ds_destroyarg *dsda, void *tag)
 {
 	dsl_dataset_t *origin = dsda->ds->ds_prev;
 	char *oname;
 	int onamelen;
 	int err;
 
 	if (!dsl_dataset_might_destroy_origin(origin))
 		return (0);
 
 	/* Build the origin's full name in a temporary buffer. */
 	onamelen = dsl_dataset_namelen(origin) + 1;
 	oname = kmem_alloc(onamelen, KM_SLEEP);
 	dsl_dataset_name(origin, oname);
 #ifdef _KERNEL
 	err = zfs_unmount_snap(oname, NULL);
 	if (err != 0) {
 		kmem_free(oname, onamelen);
 		return (err);
 	}
 #endif
 	err = dsl_dataset_own(oname, B_TRUE, tag, &origin);
 	kmem_free(oname, onamelen);
 	if (err != 0)
 		return (err);
 
 	dsda->rm_origin = origin;
 	dsl_dataset_make_exclusive(origin, tag);
 	return (0);
 }
 
 /*
  * Destroy the dataset 'ds' from open context.
  *
  * ds must be opened as OWNER.  On return (whether successful or not),
  * ds will be closed and caller can no longer dereference it.
  *
  * tag   - the owner tag; used to make ds exclusive and to disown it
  * defer - request a deferred destroy; only accepted for snapshots,
  *         any other dataset yields EINVAL
  *
  * Returns 0 on success or an errno value.
  */
 int
 dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer)
 {
 	int err;
 	dsl_sync_task_group_t *dstg;
 	objset_t *os;
 	dsl_dir_t *dd;
 	uint64_t obj;
 	struct dsl_ds_destroyarg dsda = { 0 };
 	dsl_dataset_t dummy_ds = { 0 };
 
 	dsda.ds = ds;
 
 	if (dsl_dataset_is_snapshot(ds)) {
 		/* Destroying a snapshot is simpler */
 		dsl_dataset_make_exclusive(ds, tag);
 
 		dsda.defer = defer;
 		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
 		    dsl_dataset_destroy_check, dsl_dataset_destroy_sync,
 		    &dsda, tag, 0);
 		ASSERT3P(dsda.rm_origin, ==, NULL);
 		goto out;
 	} else if (defer) {
 		err = EINVAL;
 		goto out;
 	}
 
 	/*
 	 * dummy_ds carries just the dir and object number; it is the
 	 * argument given to the dsl_dir_destroy task further down.
 	 */
 	dd = ds->ds_dir;
 	dummy_ds.ds_dir = dd;
 	dummy_ds.ds_object = ds->ds_object;
 
 	/* Without async destroy, free the objects here in open context. */
 	if (!spa_feature_is_enabled(dsl_dataset_get_spa(ds),
 	    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
 		/*
 		 * Check for errors and mark this ds as inconsistent, in
 		 * case we crash while freeing the objects.
 		 */
 		err = dsl_sync_task_do(dd->dd_pool,
 		    dsl_dataset_destroy_begin_check,
 		    dsl_dataset_destroy_begin_sync, ds, NULL, 0);
 		if (err)
 			goto out;
 
 		err = dmu_objset_from_ds(ds, &os);
 		if (err)
 			goto out;
 
 		/*
 		 * Remove all objects while in the open context so that
 		 * there is less work to do in the syncing context.
 		 * The walk starts at object 0 and ends on the first
 		 * non-zero return from dmu_object_next().
 		 */
 		for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE,
 		    ds->ds_phys->ds_prev_snap_txg)) {
 			/*
 			 * Ignore errors, if there is not enough disk space
 			 * we will deal with it in dsl_dataset_destroy_sync().
 			 */
 			(void) dmu_free_object(os, obj);
 		}
 		/*
 		 * ESRCH is the only loop exit treated as normal
 		 * (presumably "no more objects" -- confirm).
 		 */
 		if (err != ESRCH)
 			goto out;
 
 		/*
 		 * Sync out all in-flight IO.
 		 */
 		txg_wait_synced(dd->dd_pool, 0);
 
 		/*
 		 * If we managed to free all the objects in open
 		 * context, the user space accounting should be zero.
 		 */
 		if (ds->ds_phys->ds_bp.blk_fill == 0 &&
 		    dmu_objset_userused_enabled(os)) {
 			uint64_t count;
 
 			ASSERT(zap_count(os, DMU_USERUSED_OBJECT,
 			    &count) != 0 || count == 0);
 			ASSERT(zap_count(os, DMU_GROUPUSED_OBJECT,
 			    &count) != 0 || count == 0);
 		}
 	}
 
 	/*
 	 * Open the dir again under the config lock so that this function
 	 * has its own FTAG reference on dd.
 	 */
 	rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
 	err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd);
 	rw_exit(&dd->dd_pool->dp_config_rwlock);
 
 	if (err)
 		goto out;
 
 	/*
 	 * Blow away the dsl_dir + head dataset.
 	 */
 	dsl_dataset_make_exclusive(ds, tag);
 	/*
 	 * If we're removing a clone, we might also need to remove its
 	 * origin.
 	 */
 	do {
 		/* The check function sets need_prep to request a retry. */
 		dsda.need_prep = B_FALSE;
 		if (dsl_dir_is_clone(dd)) {
 			err = dsl_dataset_origin_rm_prep(&dsda, tag);
 			if (err) {
 				dsl_dir_close(dd, FTAG);
 				goto out;
 			}
 		}
 
 		/* One group: destroy the dataset, then its dsl_dir. */
 		dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool);
 		dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
 		    dsl_dataset_destroy_sync, &dsda, tag, 0);
 		dsl_sync_task_create(dstg, dsl_dir_destroy_check,
 		    dsl_dir_destroy_sync, &dummy_ds, FTAG, 0);
 		err = dsl_sync_task_group_wait(dstg);
 		dsl_sync_task_group_destroy(dstg);
 
 		/*
 		 * We could be racing against 'zfs release' or 'zfs destroy -d'
 		 * on the origin snap, in which case we can get EBUSY if we
 		 * needed to destroy the origin snap but were not ready to
 		 * do so.
 		 */
 		if (dsda.need_prep) {
 			ASSERT(err == EBUSY);
 			ASSERT(dsl_dir_is_clone(dd));
 			ASSERT(dsda.rm_origin == NULL);
 		}
 	} while (dsda.need_prep);
 
 	/* Drop the ownership taken by dsl_dataset_origin_rm_prep(). */
 	if (dsda.rm_origin != NULL)
 		dsl_dataset_disown(dsda.rm_origin, tag);
 
 	/* if it is successful, dsl_dir_destroy_sync will close the dd */
 	if (err)
 		dsl_dir_close(dd, FTAG);
 out:
 	dsl_dataset_disown(ds, tag);
 	return (err);
 }
 
 /*
  * Return the address of the block pointer stored in ds's ds_phys.
  */
 blkptr_t *
 dsl_dataset_get_blkptr(dsl_dataset_t *ds)
 {
 	dsl_dataset_phys_t *phys = ds->ds_phys;
 
 	return (&phys->ds_bp);
 }
 
 /*
  * Store *bp as the dataset's block pointer.  Must be called with a
  * syncing tx.  A NULL 'ds' stands for the meta-objset, whose block
  * pointer lives in the pool's dp_meta_rootbp.
  */
 void
 dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
 {
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	if (ds != NULL) {
 		dmu_buf_will_dirty(ds->ds_dbuf, tx);
 		ds->ds_phys->ds_bp = *bp;
 		return;
 	}
 
 	/* It's the meta-objset: set dp_meta_rootbp */
 	tx->tx_pool->dp_meta_rootbp = *bp;
 }
 
 /*
  * Return the spa_t of the pool that contains 'ds'.
  */
 spa_t *
 dsl_dataset_get_spa(dsl_dataset_t *ds)
 {
 	dsl_pool_t *pool = ds->ds_dir->dd_pool;
 
 	return (pool->dp_spa);
 }
 
 /*
  * Put 'ds' on the pool's dirty-dataset list for tx's txg.  A NULL 'ds'
  * (the meta-objset) is ignored.  Panics if 'ds' has a next snapshot
  * object, i.e. is a snapshot.
  */
 void
 dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
 	dsl_pool_t *pool;
 
 	/* this is the meta-objset */
 	if (ds == NULL)
 		return;
 
 	ASSERT(ds->ds_objset != NULL);
 
 	if (ds->ds_phys->ds_next_snap_obj != 0)
 		panic("dirtying snapshot!");
 
 	pool = ds->ds_dir->dd_pool;
 	if (txg_list_add(&pool->dp_dirty_datasets, ds, tx->tx_txg) != 0)
 		return;
 
 	/* up the hold count until we can be written out */
 	dmu_buf_add_ref(ds->ds_dbuf, ds);
 }
 
 /*
  * Return B_TRUE if 'ds' is a member of the pool's dirty-dataset list
  * for any of the TXG_SIZE txg slots, B_FALSE otherwise.
  */
 boolean_t
 dsl_dataset_is_dirty(dsl_dataset_t *ds)
 {
 	int slot = 0;
 
 	while (slot < TXG_SIZE) {
 		if (txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets,
 		    ds, slot))
 			return (B_TRUE);
 		slot++;
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Recompute ds_unique_bytes for a head dataset.
  *
  * Unique space is the space currently referenced minus whatever part of
  * the most recent snapshot is still in use by this file system.  That
  * part is the snapshot's referenced space less the space freed since the
  * snapshot was taken, which is what the head's deadlist accounts for.
  *
  * On pools at or beyond SPA_VERSION_UNIQUE_ACCURATE the dataset is also
  * flagged DS_FLAG_UNIQUE_ACCURATE.
  */
 static void
 dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
 {
 	uint64_t dlused, dl_comp_unused, dl_uncomp_unused;
 	uint64_t mrs_used = 0;
 
 	ASSERT(!dsl_dataset_is_snapshot(ds));
 
 	/* With no previous snapshot nothing is shared. */
 	if (ds->ds_phys->ds_prev_snap_obj != 0)
 		mrs_used = ds->ds_prev->ds_phys->ds_referenced_bytes;
 
 	dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dl_comp_unused,
 	    &dl_uncomp_unused);
 
 	ASSERT3U(dlused, <=, mrs_used);
 	ds->ds_phys->ds_unique_bytes =
 	    ds->ds_phys->ds_referenced_bytes - (mrs_used - dlused);
 
 	if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
 	    SPA_VERSION_UNIQUE_ACCURATE)
 		ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
 }
 
 /*
  * Context passed to kill_blkptr() while traversing a dataset that is
  * being destroyed.
  */
 struct killarg {
 	dsl_dataset_t *ds;	/* dataset whose blocks are being killed */
 	dmu_tx_t *tx;		/* transaction the frees are charged to */
 };
 
 /*
  * Traversal callback that releases one block of a dataset being
  * destroyed.  'arg' is a struct killarg.  A NULL bp is skipped.
  * Intent-log blocks are freed directly; every other block goes through
  * dsl_dataset_block_kill().  Always returns 0.
  */
 /* ARGSUSED */
 static int
 kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
     const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	struct killarg *ka = arg;
 	dmu_tx_t *tx = ka->tx;
 
 	if (bp == NULL)
 		return (0);
 
 	if (zb->zb_level != ZB_ZIL_LEVEL) {
 		ASSERT(zilog == NULL);
 		ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg);
 		(void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
 		return (0);
 	}
 
 	/*
 	 * It's a block in the intent log.  It has no
 	 * accounting, so just free it.
 	 */
 	ASSERT(zilog != NULL);
 	dsl_free(tx->tx_pool, tx->tx_txg, bp);
 
 	return (0);
 }
 
 /*
  * Check function paired with dsl_dataset_destroy_begin_sync().
  * arg1 is the dataset.  Returns EBUSY if the dataset has snapshots of
  * its own, EEXIST if its dir has child dirs, a zap_count() error, or 0.
  */
 /* ARGSUSED */
 static int
 dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
 	dsl_dataset_t *prev = ds->ds_prev;
 	uint64_t nchildren;
 	int error;
 
 	/*
 	 * Can't delete a head dataset if there are snapshots of it.
 	 * (Except if the only snapshots are from the branch we cloned
 	 * from.)
 	 */
 	if (prev != NULL && prev->ds_phys->ds_next_snap_obj == ds->ds_object)
 		return (EBUSY);
 
 	/*
 	 * This is really a dsl_dir thing, but check it here so that
 	 * we'll be less likely to leave this dataset inconsistent &
 	 * nearly destroyed.
 	 */
 	error = zap_count(ds->ds_dir->dd_pool->dp_meta_objset,
 	    ds->ds_dir->dd_phys->dd_child_dir_zapobj, &nchildren);
 	if (error != 0)
 		return (error);
 
 	return (nchildren == 0 ? 0 : EEXIST);
 }
 
 /*
  * Sync function for the first stage of a synchronous destroy: set
  * DS_FLAG_INCONSISTENT on the dataset (arg1) and log the event.
  */
 /* ARGSUSED */
 static void
 dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
 	spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
 
 	/* Mark it as inconsistent on-disk, in case we crash */
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 	ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
 
 	spa_history_log_internal(LOG_DS_DESTROY_BEGIN, spa, tx,
 	    "dataset = %llu", ds->ds_object);
 }
 
 /*
  * Destroy-check helper for a clone: decide what happens to its origin.
  *
  * If the origin qualifies for removal but no open-context setup was done
  * (dsda->rm_origin is NULL), set dsda->need_prep and return EBUSY.  If
  * setup was done, return the result of dsl_dataset_destroy_check() on the
  * origin.  If the origin does not qualify, release any setup already
  * done and return 0.
  */
 static int
 dsl_dataset_origin_check(struct dsl_ds_destroyarg *dsda, void *tag,
     dmu_tx_t *tx)
 {
 	struct dsl_ds_destroyarg origin_arg = {0};
 	dsl_dataset_t *origin = dsda->ds->ds_prev;
 
 	if (!dsl_dataset_might_destroy_origin(origin)) {
 		/*
 		 * If we're not going to remove the origin after all,
 		 * undo the open context setup.
 		 */
 		if (dsda->rm_origin != NULL) {
 			dsl_dataset_disown(dsda->rm_origin, tag);
 			dsda->rm_origin = NULL;
 		}
 		return (0);
 	}
 
 	/*
 	 * If we're not prepared to remove the origin, don't remove
 	 * the clone either.
 	 */
 	if (dsda->rm_origin == NULL) {
 		dsda->need_prep = B_TRUE;
 		return (EBUSY);
 	}
 
 	origin_arg.ds = origin;
 	origin_arg.is_origin_rm = B_TRUE;
 	return (dsl_dataset_destroy_check(&origin_arg, tag, tx));
 }
 
 /*
  * Check function for destroying the dataset in the dsl_ds_destroyarg
  * passed as arg1; arg2 is the tag handed on to the origin check.
  *
  * Returns ENOTSUP, EBUSY, EAGAIN, EEXIST, the result of
  * dsl_dataset_origin_check() for a clone, or 0.
  *
  * If you add new checks here, you may need to add
  * additional checks to the "temporary" case in
  * snapshot_check() in dmu_objset.c.
  */
 /* ARGSUSED */
 int
 dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	struct dsl_ds_destroyarg *dsda = arg1;
 	dsl_dataset_t *ds = dsda->ds;
 	uint64_t max_children;
 	int is_branch_point;
 
 	/* we have an owner hold, so no one else can destroy us */
 	ASSERT(!DSL_DATASET_IS_DESTROYED(ds));
 
 	/*
 	 * Only allow deferred destroy on pools that support it.
 	 * NOTE: deferred destroy is only supported on snapshots.
 	 */
 	if (dsda->defer) {
 		if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
 		    SPA_VERSION_USERREFS)
 			return (ENOTSUP);
 		ASSERT(dsl_dataset_is_snapshot(ds));
 		return (0);
 	}
 
 	/*
 	 * Can't delete a head dataset if there are snapshots of it.
 	 * (Except if the only snapshots are from the branch we cloned
 	 * from.)
 	 */
 	if (ds->ds_prev != NULL &&
 	    ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
 		return (EBUSY);
 
 	/*
 	 * If we made changes this txg, traverse_dsl_dataset won't find
 	 * them.  Try again.
 	 */
 	if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
 		return (EAGAIN);
 
 	if (!dsl_dataset_is_snapshot(ds)) {
 		if (dsl_dir_is_clone(ds->ds_dir))
 			return (dsl_dataset_origin_check(dsda, arg2, tx));
 		/* XXX we should do some i/o error checking... */
 		return (0);
 	}
 
 	/*
 	 * If this snapshot has an elevated user reference count,
 	 * we can't destroy it yet.
 	 */
 	if (ds->ds_userrefs > 0 && !dsda->releasing)
 		return (EBUSY);
 
 	/*
 	 * Can't delete a branch point. However, if we're destroying
 	 * a clone and removing its origin due to it having a user
 	 * hold count of 0 and having been marked for deferred destroy,
 	 * it's OK for the origin to have a single clone.
 	 */
 	max_children = dsda->is_origin_rm ? 2 : 1;
 	mutex_enter(&ds->ds_lock);
 	is_branch_point = (ds->ds_phys->ds_num_children > max_children);
 	mutex_exit(&ds->ds_lock);
 	if (is_branch_point)
 		return (EEXIST);
 
 	/* XXX we should do some i/o error checking... */
 	return (0);
 }
 
 /*
  * Rendezvous state shared by dsl_dataset_drain_refs() (the waiter) and
  * dsl_dataset_refs_gone() (the signaller).
  */
 struct refsarg {
 	kmutex_t lock;		/* protects 'gone' and pairs with 'cv' */
 	boolean_t gone;		/* set to TRUE by dsl_dataset_refs_gone() */
 	kcondvar_t cv;		/* signalled once 'gone' has been set */
 };
 
 /*
  * Callback installed by dsl_dataset_drain_refs(): mark the struct
  * refsarg passed as 'argv' as gone and wake the waiter.
  */
 /* ARGSUSED */
 static void
 dsl_dataset_refs_gone(dmu_buf_t *db, void *argv)
 {
 	struct refsarg *ra = argv;
 	kmutex_t *lockp = &ra->lock;
 
 	mutex_enter(lockp);
 	ra->gone = TRUE;
 	cv_signal(&ra->cv);
 	mutex_exit(lockp);
 }
 
 /*
  * Release this thread's hold ('tag') on ds->ds_dbuf and block until
  * dsl_dataset_refs_gone() has been invoked, then clear ds_dbuf and
  * ds_phys.
  */
 static void
 dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag)
 {
 	struct refsarg arg;
 
 	bzero(&arg, sizeof(arg));
 	mutex_init(&arg.lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&arg.cv, NULL, CV_DEFAULT, NULL);
 	arg.gone = FALSE;
 	/*
 	 * Register 'arg' and dsl_dataset_refs_gone() with the dbuf
 	 * (presumably replacing the current dbuf user so the callback
 	 * fires on eviction -- confirm against dmu_buf_update_user()).
 	 */
 	(void) dmu_buf_update_user(ds->ds_dbuf, ds, &arg, &ds->ds_phys,
 	    dsl_dataset_refs_gone);
 	dmu_buf_rele(ds->ds_dbuf, tag);
 	/* Sleep until the callback has set arg.gone. */
 	mutex_enter(&arg.lock);
 	while (!arg.gone)
 		cv_wait(&arg.cv, &arg.lock);
 	ASSERT(arg.gone);
 	mutex_exit(&arg.lock);
 	ds->ds_dbuf = NULL;
 	ds->ds_phys = NULL;
 	mutex_destroy(&arg.lock);
 	cv_destroy(&arg.cv);
 }
 
 /*
  * Remove the entry 'obj' from ds's ds_next_clones_obj ZAP in the MOS.
  * A missing entry (ENOENT) is tolerated; any other failure trips
  * VERIFY0.  ds must have at least two children.
  */
 static void
 remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx)
 {
 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 	uint64_t count;
 	int err;
 
 	ASSERT(ds->ds_phys->ds_num_children >= 2);
 	err = zap_remove_int(mos, ds->ds_phys->ds_next_clones_obj, obj, tx);
 	/*
 	 * The err should not be ENOENT, but a bug in a previous version
 	 * of the code could cause upgrade_clones_cb() to not set
 	 * ds_next_snap_obj when it should, leading to a missing entry.
 	 * If we knew that the pool was created after
 	 * SPA_VERSION_NEXT_CLONES, we could assert that it isn't
 	 * ENOENT.  However, at least we can check that we don't have
 	 * too many entries in the next_clones_obj even after failing to
 	 * remove this one.
 	 */
 	if (err != ENOENT) {
 		VERIFY0(err);
 	}
 	/* 'count' is only written and read inside these assertions. */
 	ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj,
 	    &count));
 	ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2);
 }
 
 /*
  * Remove the key 'mintxg' from the deadlist of every clone listed in the
  * dd_clones ZAP of ds's dir whose dd_origin_txg is greater than mintxg,
  * and recurse into each such clone.
  */
 static void
 dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx)
 {
 	uint64_t clones_zap = ds->ds_dir->dd_phys->dd_clones;
 	zap_cursor_t zc;
 	zap_attribute_t za;
 
 	/*
 	 * If it is the old version, dd_clones doesn't exist so we can't
 	 * find the clones, but deadlist_remove_key() is a no-op so it
 	 * doesn't matter.
 	 */
 	if (clones_zap == 0)
 		return;
 
 	zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset, clones_zap);
 	while (zap_cursor_retrieve(&zc, &za) == 0) {
 		dsl_dataset_t *clone;
 
 		VERIFY3U(0, ==, dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
 		    za.za_first_integer, FTAG, &clone));
 		if (clone->ds_dir->dd_origin_txg > mintxg) {
 			dsl_deadlist_remove_key(&clone->ds_deadlist,
 			    mintxg, tx);
 			dsl_dataset_remove_clones_key(clone, mintxg, tx);
 		}
 		dsl_dataset_rele(clone, FTAG);
 
 		zap_cursor_advance(&zc);
 	}
 	zap_cursor_fini(&zc);
 }
 
 /*
  * State threaded through process_old_cb() by process_old_deadlist().
  */
 struct process_old_arg {
 	dsl_dataset_t *ds;		/* snapshot being destroyed */
 	dsl_dataset_t *ds_prev;		/* its previous snapshot; may be NULL */
 	boolean_t after_branch_point;	/* copied from the caller's flag */
 	zio_t *pio;			/* root zio the frees are issued under */
 	uint64_t used, comp, uncomp;	/* running totals of freed space */
 };
 
 /*
  * bpobj_iterate() callback used by process_old_deadlist(); 'arg' is a
  * struct process_old_arg.
  *
  * A block born after the destroyed snapshot's previous snapshot is
  * freed and added to the running totals.  Any older block is inserted
  * into the destroyed snapshot's deadlist and, when it was born after
  * ds_prev's own previous snapshot and we are not past a branch point,
  * credited to ds_prev's unique bytes.  Always returns 0.
  */
 static int
 process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	struct process_old_arg *poa = arg;
 	dsl_dataset_t *ds = poa->ds;
 	dsl_dataset_t *prev = poa->ds_prev;
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 
 	if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
 		poa->used += bp_get_dsize_sync(dp->dp_spa, bp);
 		poa->comp += BP_GET_PSIZE(bp);
 		poa->uncomp += BP_GET_UCSIZE(bp);
 		dsl_free_sync(poa->pio, dp, tx->tx_txg, bp);
 		return (0);
 	}
 
 	dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
 	if (prev && !poa->after_branch_point &&
 	    bp->blk_birth > prev->ds_phys->ds_prev_snap_txg) {
 		prev->ds_phys->ds_unique_bytes +=
 		    bp_get_dsize_sync(dp->dp_spa, bp);
 	}
 
 	return (0);
 }
 
 /*
  * Old-format deadlist handling for destroying snapshot 'ds': run
  * process_old_cb() over ds_next's deadlist bpobj, subtract the freed
  * space from the dir's DD_USED_SNAP accounting, then exchange the
  * deadlist objects of ds and ds_next.
  *
  * Both deadlists must be in the old format (asserted).
  */
 static void
 process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev,
     dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx)
 {
 	struct process_old_arg poa = { 0 };
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	objset_t *mos = dp->dp_meta_objset;
 
 	ASSERT(ds->ds_deadlist.dl_oldfmt);
 	ASSERT(ds_next->ds_deadlist.dl_oldfmt);
 
 	poa.ds = ds;
 	poa.ds_prev = ds_prev;
 	poa.after_branch_point = after_branch_point;
 	/* Frees issued by the callback are children of this root zio. */
 	poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 	VERIFY3U(0, ==, bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj,
 	    process_old_cb, &poa, tx));
 	VERIFY0(zio_wait(poa.pio));
 	ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes);
 
 	/* change snapused */
 	dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
 	    -poa.used, -poa.comp, -poa.uncomp, tx);
 
 	/*
 	 * swap next's deadlist to our deadlist: close both, exchange the
 	 * on-disk object numbers, then reopen each on its new object.
 	 */
 	dsl_deadlist_close(&ds->ds_deadlist);
 	dsl_deadlist_close(&ds_next->ds_deadlist);
 	SWITCH64(ds_next->ds_phys->ds_deadlist_obj,
 	    ds->ds_phys->ds_deadlist_obj);
 	dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
 	dsl_deadlist_open(&ds_next->ds_deadlist, mos,
 	    ds_next->ds_phys->ds_deadlist_obj);
 }
 
 /*
  * Synchronous destroy path: traverse 'ds' (TRAVERSE_POST) from its
  * previous-snapshot txg onwards and hand every block to kill_blkptr().
  * Returns the result of traverse_dataset().
  */
 static int
 old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
 	struct killarg ka;
 	int err;
 
 	ka.tx = tx;
 	ka.ds = ds;
 
 	/*
 	 * Free everything that we point to (that's born after
 	 * the previous snapshot, if we are a clone)
 	 *
 	 * NB: this should be very quick, because we already
 	 * freed all the objects in open context.
 	 */
 	err = traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
 	    TRAVERSE_POST, kill_blkptr, &ka);
 	ASSERT0(err);
 	ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0);
 
 	return (err);
 }
 
 /*
  * Sync-context half of dataset destruction.
  *
  * arg1 - struct dsl_ds_destroyarg: the dataset (ds), whether the destroy
  *        is deferred (defer) and, for a clone, an origin snapshot that
  *        is to be destroyed right after it (rm_origin)
  * tag  - passed to dsl_dataset_drain_refs() and to the recursive call
  *        made for rm_origin
  *
  * A deferred destroy of a dataset that still has user references or
  * more than one child only sets DS_FLAG_DEFER_DESTROY.  Otherwise the
  * dataset is unlinked from its neighbouring snapshots, its space
  * accounting is handed over, its auxiliary MOS objects are destroyed
  * and finally the dataset object itself is freed.
  */
 void
 dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
 {
 	struct dsl_ds_destroyarg *dsda = arg1;
 	dsl_dataset_t *ds = dsda->ds;
 	int err;
 	int after_branch_point = FALSE;
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	objset_t *mos = dp->dp_meta_objset;
 	dsl_dataset_t *ds_prev = NULL;
 	boolean_t wont_destroy;
 	uint64_t obj;
 
 	wont_destroy = (dsda->defer &&
 	    (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1));
 
 	ASSERT(ds->ds_owner || wont_destroy);
 	ASSERT(dsda->defer || ds->ds_phys->ds_num_children <= 1);
 	ASSERT(ds->ds_prev == NULL ||
 	    ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);
 	ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
 
 	/* Deferred destroy that cannot happen yet: just set the flag. */
 	if (wont_destroy) {
 		ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
 		dmu_buf_will_dirty(ds->ds_dbuf, tx);
 		ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY;
 		return;
 	}
 
 	/* signal any waiters that this dataset is going away */
 	mutex_enter(&ds->ds_lock);
 	ds->ds_owner = dsl_reaper;
 	cv_broadcast(&ds->ds_exclusive_cv);
 	mutex_exit(&ds->ds_lock);
 
 	/* Remove our reservation */
 	if (ds->ds_reserved != 0) {
 		dsl_prop_setarg_t psa;
 		uint64_t value = 0;
 
 		dsl_prop_setarg_init_uint64(&psa, "refreservation",
 		    (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
 		    &value);
 		psa.psa_effective_value = 0;	/* predict default value */
 
 		dsl_dataset_set_reservation_sync(ds, &psa, tx);
 		ASSERT0(ds->ds_reserved);
 	}
 
 	ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
 
 	dsl_scan_ds_destroyed(ds, tx);
 
 	/* Remember the object number; ds's fields are torn down below. */
 	obj = ds->ds_object;
 
 	/*
 	 * Detach from the previous snapshot.  after_branch_point is set
 	 * when the previous snapshot's next-snapshot link does not point
 	 * at this dataset.
 	 */
 	if (ds->ds_phys->ds_prev_snap_obj != 0) {
 		if (ds->ds_prev) {
 			ds_prev = ds->ds_prev;
 		} else {
 			/* Temporary FTAG hold; released near the end. */
 			VERIFY(0 == dsl_dataset_hold_obj(dp,
 			    ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev));
 		}
 		after_branch_point =
 		    (ds_prev->ds_phys->ds_next_snap_obj != obj);
 
 		dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
 		if (after_branch_point &&
 		    ds_prev->ds_phys->ds_next_clones_obj != 0) {
 			/* Replace our next_clones entry with our successor. */
 			remove_from_next_clones(ds_prev, obj, tx);
 			if (ds->ds_phys->ds_next_snap_obj != 0) {
 				VERIFY(0 == zap_add_int(mos,
 				    ds_prev->ds_phys->ds_next_clones_obj,
 				    ds->ds_phys->ds_next_snap_obj, tx));
 			}
 		}
 		if (after_branch_point &&
 		    ds->ds_phys->ds_next_snap_obj == 0) {
 			/* This clone is toast. */
 			ASSERT(ds_prev->ds_phys->ds_num_children > 1);
 			ds_prev->ds_phys->ds_num_children--;
 
 			/*
 			 * If the clone's origin has no other clones, no
 			 * user holds, and has been marked for deferred
 			 * deletion, then we should have done the necessary
 			 * destroy setup for it.
 			 */
 			if (ds_prev->ds_phys->ds_num_children == 1 &&
 			    ds_prev->ds_userrefs == 0 &&
 			    DS_IS_DEFER_DESTROY(ds_prev)) {
 				ASSERT3P(dsda->rm_origin, !=, NULL);
 			} else {
 				ASSERT3P(dsda->rm_origin, ==, NULL);
 			}
 		} else if (!after_branch_point) {
 			/* Splice ourselves out of the snapshot chain. */
 			ds_prev->ds_phys->ds_next_snap_obj =
 			    ds->ds_phys->ds_next_snap_obj;
 		}
 	}
 
 	if (dsl_dataset_is_snapshot(ds)) {
 		dsl_dataset_t *ds_next;
 		uint64_t old_unique;
 		uint64_t used = 0, comp = 0, uncomp = 0;
 
 		VERIFY(0 == dsl_dataset_hold_obj(dp,
 		    ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next));
 		ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);
 
 		old_unique = ds_next->ds_phys->ds_unique_bytes;
 
 		/* Point the next dataset back past us. */
 		dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
 		ds_next->ds_phys->ds_prev_snap_obj =
 		    ds->ds_phys->ds_prev_snap_obj;
 		ds_next->ds_phys->ds_prev_snap_txg =
 		    ds->ds_phys->ds_prev_snap_txg;
 		ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
 		    ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);
 
 
 		if (ds_next->ds_deadlist.dl_oldfmt) {
 			process_old_deadlist(ds, ds_prev, ds_next,
 			    after_branch_point, tx);
 		} else {
 			/* Adjust prev's unique space. */
 			if (ds_prev && !after_branch_point) {
 				dsl_deadlist_space_range(&ds_next->ds_deadlist,
 				    ds_prev->ds_phys->ds_prev_snap_txg,
 				    ds->ds_phys->ds_prev_snap_txg,
 				    &used, &comp, &uncomp);
 				ds_prev->ds_phys->ds_unique_bytes += used;
 			}
 
 			/* Adjust snapused. */
 			dsl_deadlist_space_range(&ds_next->ds_deadlist,
 			    ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
 			    &used, &comp, &uncomp);
 			dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
 			    -used, -comp, -uncomp, tx);
 
 			/* Move blocks to be freed to pool's free list. */
 			dsl_deadlist_move_bpobj(&ds_next->ds_deadlist,
 			    &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg,
 			    tx);
 			dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
 			    DD_USED_HEAD, used, comp, uncomp, tx);
 
 			/* Merge our deadlist into next's and free it. */
 			dsl_deadlist_merge(&ds_next->ds_deadlist,
 			    ds->ds_phys->ds_deadlist_obj, tx);
 		}
 		dsl_deadlist_close(&ds->ds_deadlist);
 		dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
 
 		/* Collapse range in clone heads */
 		dsl_dataset_remove_clones_key(ds,
 		    ds->ds_phys->ds_creation_txg, tx);
 
 		if (dsl_dataset_is_snapshot(ds_next)) {
 			dsl_dataset_t *ds_nextnext;
 
 			/*
 			 * Update next's unique to include blocks which
 			 * were previously shared by only this snapshot
 			 * and it.  Those blocks will be born after the
 			 * prev snap and before this snap, and will have
 			 * died after the next snap and before the one
 			 * after that (ie. be on the snap after next's
 			 * deadlist).
 			 */
 			VERIFY(0 == dsl_dataset_hold_obj(dp,
 			    ds_next->ds_phys->ds_next_snap_obj,
 			    FTAG, &ds_nextnext));
 			dsl_deadlist_space_range(&ds_nextnext->ds_deadlist,
 			    ds->ds_phys->ds_prev_snap_txg,
 			    ds->ds_phys->ds_creation_txg,
 			    &used, &comp, &uncomp);
 			ds_next->ds_phys->ds_unique_bytes += used;
 			dsl_dataset_rele(ds_nextnext, FTAG);
 			ASSERT3P(ds_next->ds_prev, ==, NULL);
 
 			/* Collapse range in this head. */
 			dsl_dataset_t *hds;
 			VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
 			    ds->ds_dir->dd_phys->dd_head_dataset_obj,
 			    FTAG, &hds));
 			dsl_deadlist_remove_key(&hds->ds_deadlist,
 			    ds->ds_phys->ds_creation_txg, tx);
 			dsl_dataset_rele(hds, FTAG);
 
 		} else {
 			/*
 			 * ds_next is not a snapshot: re-point its cached
 			 * ds_prev reference from us to our predecessor.
 			 */
 			ASSERT3P(ds_next->ds_prev, ==, ds);
 			dsl_dataset_drop_ref(ds_next->ds_prev, ds_next);
 			ds_next->ds_prev = NULL;
 			if (ds_prev) {
 				VERIFY(0 == dsl_dataset_get_ref(dp,
 				    ds->ds_phys->ds_prev_snap_obj,
 				    ds_next, &ds_next->ds_prev));
 			}
 
 			dsl_dataset_recalc_head_uniq(ds_next);
 
 			/*
 			 * Reduce the amount of our unconsumed refreservation
 			 * being charged to our parent by the amount of
 			 * new unique data we have gained.
 			 */
 			if (old_unique < ds_next->ds_reserved) {
 				int64_t mrsdelta;
 				uint64_t new_unique =
 				    ds_next->ds_phys->ds_unique_bytes;
 
 				ASSERT(old_unique <= new_unique);
 				mrsdelta = MIN(new_unique - old_unique,
 				    ds_next->ds_reserved - old_unique);
 				dsl_dir_diduse_space(ds->ds_dir,
 				    DD_USED_REFRSRV, -mrsdelta, 0, 0, tx);
 			}
 		}
 		dsl_dataset_rele(ds_next, FTAG);
 	} else {
 		zfeature_info_t *async_destroy =
 		    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY];
 		objset_t *os;
 
 		/*
 		 * There's no next snapshot, so this is a head dataset.
 		 * Destroy the deadlist.  Unless it's a clone, the
 		 * deadlist should be empty.  (If it's a clone, it's
 		 * safe to ignore the deadlist contents.)
 		 */
 		dsl_deadlist_close(&ds->ds_deadlist);
 		dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
 		ds->ds_phys->ds_deadlist_obj = 0;
 
 		VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os));
 
 		if (!spa_feature_is_enabled(dp->dp_spa, async_destroy)) {
 			err = old_synchronous_dataset_destroy(ds, tx);
 		} else {
 			/*
 			 * Move the bptree into the pool's list of trees to
 			 * clean up and update space accounting information.
 			 */
 			uint64_t used, comp, uncomp;
 
 			zil_destroy_sync(dmu_objset_zil(os), tx);
 
 			/* First use of the feature: create the pool bptree. */
 			if (!spa_feature_is_active(dp->dp_spa, async_destroy)) {
 				spa_feature_incr(dp->dp_spa, async_destroy, tx);
 				dp->dp_bptree_obj = bptree_alloc(mos, tx);
 				VERIFY(zap_add(mos,
 				    DMU_POOL_DIRECTORY_OBJECT,
 				    DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
 				    &dp->dp_bptree_obj, tx) == 0);
 			}
 
 			used = ds->ds_dir->dd_phys->dd_used_bytes;
 			comp = ds->ds_dir->dd_phys->dd_compressed_bytes;
 			uncomp = ds->ds_dir->dd_phys->dd_uncompressed_bytes;
 
 			ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
 			    ds->ds_phys->ds_unique_bytes == used);
 
 			/* Transfer the space from our dir to dp_free_dir. */
 			bptree_add(mos, dp->dp_bptree_obj,
 			    &ds->ds_phys->ds_bp, ds->ds_phys->ds_prev_snap_txg,
 			    used, comp, uncomp, tx);
 			dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
 			    -used, -comp, -uncomp, tx);
 			dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
 			    used, comp, uncomp, tx);
 		}
 
 		if (ds->ds_prev != NULL) {
 			if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
 				VERIFY3U(0, ==, zap_remove_int(mos,
 				    ds->ds_prev->ds_dir->dd_phys->dd_clones,
 				    ds->ds_object, tx));
 			}
 			dsl_dataset_rele(ds->ds_prev, ds);
 			/*
 			 * ds_prev aliased ds->ds_prev here, so clear it too
 			 * and skip the FTAG release further down.
 			 */
 			ds->ds_prev = ds_prev = NULL;
 		}
 	}
 
 	/*
 	 * This must be done after the dsl_traverse(), because it will
 	 * re-open the objset.
 	 */
 	if (ds->ds_objset) {
 		dmu_objset_evict(ds->ds_objset);
 		ds->ds_objset = NULL;
 	}
 
 	if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) {
 		/* Erase the link in the dir */
 		dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
 		ds->ds_dir->dd_phys->dd_head_dataset_obj = 0;
 		ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0);
 		err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx);
 		ASSERT(err == 0);
 	} else {
 		/* remove from snapshot namespace */
 		dsl_dataset_t *ds_head;
 		ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0);
 		VERIFY(0 == dsl_dataset_hold_obj(dp,
 		    ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head));
 		VERIFY(0 == dsl_dataset_get_snapname(ds));
 #ifdef ZFS_DEBUG
 		{
 			uint64_t val;
 
 			err = dsl_dataset_snap_lookup(ds_head,
 			    ds->ds_snapname, &val);
 			ASSERT0(err);
 			ASSERT3U(val, ==, obj);
 		}
 #endif
 		err = dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx);
 		ASSERT(err == 0);
 		dsl_dataset_rele(ds_head, FTAG);
 	}
 
 	/* Release ds_prev only if it was held with FTAG above. */
 	if (ds_prev && ds->ds_prev != ds_prev)
 		dsl_dataset_rele(ds_prev, FTAG);
 
 	spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
 	spa_history_log_internal(LOG_DS_DESTROY, dp->dp_spa, tx,
 	    "dataset = %llu", ds->ds_object);
 
 	/* Destroy the auxiliary MOS objects that hang off the dataset. */
 	if (ds->ds_phys->ds_next_clones_obj != 0) {
 		uint64_t count;
 		ASSERT(0 == zap_count(mos,
 		    ds->ds_phys->ds_next_clones_obj, &count) && count == 0);
 		VERIFY(0 == dmu_object_free(mos,
 		    ds->ds_phys->ds_next_clones_obj, tx));
 	}
 	if (ds->ds_phys->ds_props_obj != 0)
 		VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx));
 	if (ds->ds_phys->ds_userrefs_obj != 0)
 		VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx));
 	dsl_dir_close(ds->ds_dir, ds);
 	ds->ds_dir = NULL;
 	/* ds_dbuf and ds_phys are cleared by the drain; use 'obj' after. */
 	dsl_dataset_drain_refs(ds, tag);
 	VERIFY(0 == dmu_object_free(mos, obj, tx));
 
 	if (dsda->rm_origin) {
 		/*
 		 * Remove the origin of the clone we just destroyed.
 		 */
 		struct dsl_ds_destroyarg ndsda = {0};
 
 		ndsda.ds = dsda->rm_origin;
 		dsl_dataset_destroy_sync(&ndsda, tag, tx);
 	}
 }
 
 /*
  * In syncing context, verify that snapshotting 'ds' leaves enough room
  * for the part of its unique data that is covered by its reservation,
  * and account for that space in this sync group.
  *
  * Returns 0 outside syncing context or when space suffices, else ENOSPC.
  */
 static int
 dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
 	uint64_t needed;
 
 	if (!dmu_tx_is_syncing(tx))
 		return (0);
 
 	/*
 	 * If there's an fs-only reservation, any blocks that might become
 	 * owned by the snapshot dataset must be accommodated by space
 	 * outside of the reservation.
 	 */
 	ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds));
 	needed = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
 	if (needed > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
 		return (ENOSPC);
 
 	/*
 	 * Propagate any reserved space for this snapshot to other
 	 * snapshot checks in this sync group.
 	 */
 	if (needed != 0)
 		dsl_dir_willuse_space(ds->ds_dir, needed, tx);
 
 	return (0);
 }
 
 /*
  * Check function for taking a snapshot of the dataset arg1 under the
  * snapshot name arg2.
  *
  * Returns EAGAIN if a snapshot already exists for this txg, EEXIST if
  * the name is taken, ENAMETOOLONG if the full name would not fit in
  * MAXNAMELEN, a lookup or space-reservation error, or 0.  On success
  * ds_trysnap_txg is set to the tx's txg.
  */
 int
 dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
 	const char *snapname = arg2;
 	uint64_t existing;
 	int error;
 
 	/*
 	 * We don't allow multiple snapshots of the same txg.  If there
 	 * is already one, try again.
 	 */
 	if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg)
 		return (EAGAIN);
 
 	/*
 	 * Check for a conflicting snapshot name.
 	 */
 	error = dsl_dataset_snap_lookup(ds, snapname, &existing);
 	switch (error) {
 	case ENOENT:
 		break;
 	case 0:
 		return (EEXIST);
 	default:
 		return (error);
 	}
 
 	/*
 	 * Check that the dataset's name is not too long.  Name consists
 	 * of the dataset's length + 1 for the @-sign + snapshot name's length
 	 */
 	if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN)
 		return (ENAMETOOLONG);
 
 	error = dsl_dataset_snapshot_reserve_space(ds, tx);
 	if (error != 0)
 		return (error);
 
 	ds->ds_trysnap_txg = tx->tx_txg;
 	return (0);
 }
 
 /*
  * Sync half of the snapshot sync task.
  *
  * arg1 is the dataset being snapshotted and arg2 is the new snapshot's
  * name.  A new dataset object is allocated in the MOS and filled in from
  * the head's current on-disk state, linked in between the head and its
  * previous snapshot, and entered in the head's snapnames ZAP.  The head's
  * deadlist, previous-snapshot pointers and unique-bytes count are reset
  * to reflect the new snapshot.  Must be called with dp_config_rwlock held
  * as writer (asserted below).
  */
 void
 dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
 	const char *snapname = arg2;
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	dmu_buf_t *dbuf;
 	dsl_dataset_phys_t *dsphys;
 	uint64_t dsobj, crtxg;
 	objset_t *mos = dp->dp_meta_objset;
 	int err;
 
 	ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
 
 	/*
 	 * The origin's ds_creation_txg has to be < TXG_INITIAL
 	 */
 	if (strcmp(snapname, ORIGIN_DIR_NAME) == 0)
 		crtxg = 1;
 	else
 		crtxg = tx->tx_txg;
 
 	/* Allocate the snapshot's dataset object and initialize its bonus. */
 	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
 	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
 	VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
 	dmu_buf_will_dirty(dbuf, tx);
 	dsphys = dbuf->db_data;
 	bzero(dsphys, sizeof (dsl_dataset_phys_t));
 	dsphys->ds_dir_obj = ds->ds_dir->dd_object;
 	dsphys->ds_fsid_guid = unique_create();
 	/* Keep drawing random guids until we get a non-zero one. */
 	do {
 		(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
 		    sizeof (dsphys->ds_guid));
 	} while (dsphys->ds_guid == 0);
 	/* The snapshot slots in between the head and its old prev snap. */
 	dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj;
 	dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg;
 	dsphys->ds_next_snap_obj = ds->ds_object;
 	dsphys->ds_num_children = 1;
 	dsphys->ds_creation_time = gethrestime_sec();
 	dsphys->ds_creation_txg = crtxg;
 	/* The snapshot takes over the head's deadlist, sizes, flags and bp. */
 	dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj;
 	dsphys->ds_referenced_bytes = ds->ds_phys->ds_referenced_bytes;
 	dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes;
 	dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes;
 	dsphys->ds_flags = ds->ds_phys->ds_flags;
 	dsphys->ds_bp = ds->ds_phys->ds_bp;
 	dmu_buf_rele(dbuf, FTAG);
 
 	ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0);
 	if (ds->ds_prev) {
 		uint64_t next_clones_obj =
 		    ds->ds_prev->ds_phys->ds_next_clones_obj;
 		ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj ==
 		    ds->ds_object ||
 		    ds->ds_prev->ds_phys->ds_num_children > 1);
 		if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
 			/* prev pointed at this head; repoint it at the snap */
 			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
 			ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
 			    ds->ds_prev->ds_phys->ds_creation_txg);
 			ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj;
 		} else if (next_clones_obj != 0) {
 			/*
 			 * Otherwise swap this head's entry in prev's
 			 * next_clones ZAP for the new snapshot.
 			 * NOTE(review): dsphys is read here after
 			 * dmu_buf_rele(dbuf); the field holds ds->ds_object
 			 * (assigned above).
 			 */
 			remove_from_next_clones(ds->ds_prev,
 			    dsphys->ds_next_snap_obj, tx);
 			VERIFY3U(0, ==, zap_add_int(mos,
 			    next_clones_obj, dsobj, tx));
 		}
 	}
 
 	/*
 	 * If we have a reference-reservation on this dataset, we will
 	 * need to increase the amount of refreservation being charged
 	 * since our unique space is going to zero.
 	 */
 	if (ds->ds_reserved) {
 		int64_t delta;
 		ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
 		delta = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
 		dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV,
 		    delta, 0, 0, tx);
 	}
 
 	/*
 	 * Give the head a fresh deadlist cloned from the old one (which now
 	 * belongs to the snapshot) and add a key for the old prev-snap txg.
 	 */
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 	zfs_dbgmsg("taking snapshot %s@%s/%llu; newkey=%llu",
 	    ds->ds_dir->dd_myname, snapname, dsobj,
 	    ds->ds_phys->ds_prev_snap_txg);
 	ds->ds_phys->ds_deadlist_obj = dsl_deadlist_clone(&ds->ds_deadlist,
 	    UINT64_MAX, ds->ds_phys->ds_prev_snap_obj, tx);
 	dsl_deadlist_close(&ds->ds_deadlist);
 	dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
 	dsl_deadlist_add_key(&ds->ds_deadlist,
 	    ds->ds_phys->ds_prev_snap_txg, tx);
 
 	/* The new snapshot becomes the head's previous snapshot. */
 	ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg);
 	ds->ds_phys->ds_prev_snap_obj = dsobj;
 	ds->ds_phys->ds_prev_snap_txg = crtxg;
 	ds->ds_phys->ds_unique_bytes = 0;
 	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
 		ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
 
 	/* Publish the name -> object mapping in the head's snapnames ZAP. */
 	err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
 	    snapname, 8, 1, &dsobj, tx);
 	ASSERT(err == 0);
 
 	/* Swap the in-core ds_prev reference over to the new snapshot. */
 	if (ds->ds_prev)
 		dsl_dataset_drop_ref(ds->ds_prev, ds);
 	VERIFY(0 == dsl_dataset_get_ref(dp,
 	    ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
 
 	dsl_scan_ds_snapshotted(ds, tx);
 
 	dsl_dir_snap_cmtime_update(ds->ds_dir);
 
 	spa_history_log_internal(LOG_DS_SNAPSHOT, dp->dp_spa, tx,
 	    "dataset = %llu", dsobj);
 }
 
 /*
  * Write out a dataset in syncing context.
  *
  * The dataset must have an open objset and must not be a snapshot
  * (ds_next_snap_obj == 0); both are asserted.  The in-core ds_fsid_guid
  * is copied to the on-disk phys and the objset is then synced under zio.
  */
 void
 dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
 {
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(ds->ds_objset != NULL);
 	ASSERT(ds->ds_phys->ds_next_snap_obj == 0);
 
 	/*
 	 * in case we had to change ds_fsid_guid when we opened it,
 	 * sync it out now.
 	 */
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 	ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid;
 
 	dmu_objset_sync(ds->ds_objset, zio, tx);
 }
 
 /*
  * Add the "clones" property for snapshot ds to nv.
  *
  * The property value is an nvlist with one boolean entry per clone,
  * named after the clone's dsl_dir.  Nothing is added to nv when
  * ds_next_clones_obj does not hold exactly ds_num_children - 1 entries.
  * dp_config_rwlock is taken as reader for the duration of the call.
  */
 static void
 get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv)
 {
 	uint64_t count = 0;
 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 	zap_cursor_t zc;
 	zap_attribute_t za;
 	nvlist_t *propval;
 	nvlist_t *val;
 
 	rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
 	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 	VERIFY(nvlist_alloc(&val, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 
 	/*
 	 * There may be missing entries in ds_next_clones_obj
 	 * due to a bug in a previous version of the code.
 	 * Only trust it if it has the right number of entries.
 	 *
 	 * The count must be fetched with VERIFY, not ASSERT: ASSERT
 	 * compiles to nothing in non-DEBUG builds, which would skip the
 	 * zap_count() call entirely and leave count at 0.
 	 */
 	if (ds->ds_phys->ds_next_clones_obj != 0) {
 		VERIFY3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj,
 		    &count));
 	}
 	if (count != ds->ds_phys->ds_num_children - 1) {
 		goto fail;
 	}
 	for (zap_cursor_init(&zc, mos, ds->ds_phys->ds_next_clones_obj);
 	    zap_cursor_retrieve(&zc, &za) == 0;
 	    zap_cursor_advance(&zc)) {
 		dsl_dataset_t *clone;
 		char buf[ZFS_MAXNAMELEN];
 		/*
 		 * Even though we hold the dp_config_rwlock, the dataset
 		 * may fail to open, returning ENOENT.  If there is a
 		 * thread concurrently attempting to destroy this
 		 * dataset, it will have the ds_rwlock held for
 		 * RW_WRITER.  Our call to dsl_dataset_hold_obj() ->
 		 * dsl_dataset_hold_ref() will fail its
 		 * rw_tryenter(&ds->ds_rwlock, RW_READER), drop the
 		 * dp_config_rwlock, and wait for the destroy progress
 		 * and signal ds_exclusive_cv.  If the destroy was
 		 * successful, we will see that
 		 * DSL_DATASET_IS_DESTROYED(), and return ENOENT.
 		 */
 		if (dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
 		    za.za_first_integer, FTAG, &clone) != 0)
 			continue;
 		dsl_dir_name(clone->ds_dir, buf);
 		VERIFY(nvlist_add_boolean(val, buf) == 0);
 		dsl_dataset_rele(clone, FTAG);
 	}
 	zap_cursor_fini(&zc);
 	VERIFY(nvlist_add_nvlist(propval, ZPROP_VALUE, val) == 0);
 	VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES),
 	    propval) == 0);
 fail:
 	/* Both lists are freed on every path, success or fail. */
 	nvlist_free(val);
 	nvlist_free(propval);
 	rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
 }
 
 /*
  * Fill nv with the statistics properties of dataset ds.
  *
  * The dsl_dir's stats are added first, then the per-dataset values.
  * "written" is only reported when there is a previous snapshot and the
  * space computation against it succeeds.  For snapshots, "used" and
  * "compressratio" are overridden with the snapshot's own numbers and the
  * clones list is appended.
  */
 void
 dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
 {
 	uint64_t refd, avail, uobjs, aobjs, ratio;
 
 	dsl_dir_stats(ds->ds_dir, nv);
 
 	dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd);
 
 	dsl_prop_nvlist_add_uint64(nv,
 	    ZFS_PROP_CREATION, ds->ds_phys->ds_creation_time);
 	dsl_prop_nvlist_add_uint64(nv,
 	    ZFS_PROP_CREATETXG, ds->ds_phys->ds_creation_txg);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA, ds->ds_quota);
 	dsl_prop_nvlist_add_uint64(nv,
 	    ZFS_PROP_REFRESERVATION, ds->ds_reserved);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID, ds->ds_phys->ds_guid);
 	dsl_prop_nvlist_add_uint64(nv,
 	    ZFS_PROP_UNIQUE, ds->ds_phys->ds_unique_bytes);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID, ds->ds_object);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS, ds->ds_userrefs);
 	dsl_prop_nvlist_add_uint64(nv,
 	    ZFS_PROP_DEFER_DESTROY, DS_IS_DEFER_DESTROY(ds) ? 1 : 0);
 
 	if (ds->ds_phys->ds_prev_snap_obj != 0) {
 		dsl_pool_t *dp = ds->ds_dir->dd_pool;
 		dsl_dataset_t *prev;
 		uint64_t written, comp, uncomp;
 		int err;
 
 		/* The config lock is only needed to take the hold. */
 		rw_enter(&dp->dp_config_rwlock, RW_READER);
 		err = dsl_dataset_hold_obj(dp,
 		    ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);
 		rw_exit(&dp->dp_config_rwlock);
 
 		if (err == 0) {
 			err = dsl_dataset_space_written(prev, ds,
 			    &written, &comp, &uncomp);
 			dsl_dataset_rele(prev, FTAG);
 		}
 		if (err == 0) {
 			dsl_prop_nvlist_add_uint64(nv,
 			    ZFS_PROP_WRITTEN, written);
 		}
 	}
 
 	/* Ratio is in percent; report 100 when nothing is compressed. */
 	if (ds->ds_phys->ds_compressed_bytes == 0) {
 		ratio = 100;
 	} else {
 		ratio = ds->ds_phys->ds_uncompressed_bytes * 100 /
 		    ds->ds_phys->ds_compressed_bytes;
 	}
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, ratio);
 
 	if (ds->ds_phys->ds_next_snap_obj != 0) {
 		/*
 		 * Snapshots report their own unique space and compression
 		 * ratio in place of the values the dsl_dir supplied.
 		 */
 		dsl_prop_nvlist_add_uint64(nv,
 		    ZFS_PROP_USED, ds->ds_phys->ds_unique_bytes);
 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio);
 
 		get_clones_stat(ds, nv);
 	}
 }
 
 /*
  * Fill in the cheap-to-compute statistics for ds: creation txg,
  * inconsistent flag, guid, snapshot-ness, clone count and the name of
  * the clone origin (empty string when the dataset's dir is not a clone).
  */
 void
 dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
 {
 	stat->dds_creation_txg = ds->ds_phys->ds_creation_txg;
 	stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT;
 	stat->dds_guid = ds->ds_phys->ds_guid;
 
 	if (ds->ds_phys->ds_next_snap_obj == 0) {
 		stat->dds_is_snapshot = B_FALSE;
 		stat->dds_num_clones = 0;
 	} else {
 		stat->dds_is_snapshot = B_TRUE;
 		stat->dds_num_clones = ds->ds_phys->ds_num_children - 1;
 	}
 
 	/* The clone origin is recorded on the dsl_dir, not the dataset. */
 	rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
 	if (!dsl_dir_is_clone(ds->ds_dir)) {
 		stat->dds_origin[0] = '\0';
 	} else {
 		dsl_dataset_t *origin;
 
 		VERIFY(0 == dsl_dataset_get_ref(ds->ds_dir->dd_pool,
 		    ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin));
 		dsl_dataset_name(origin, stat->dds_origin);
 		dsl_dataset_drop_ref(origin, FTAG);
 	}
 	rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
 }
 
 /*
  * Accessor for the dataset's in-core ds_fsid_guid.
  */
 uint64_t
 dsl_dataset_fsid_guid(dsl_dataset_t *ds)
 {
 	uint64_t fsid_guid = ds->ds_fsid_guid;
 
 	return (fsid_guid);
 }
 
 /*
  * Report space and object usage for ds through the four out-pointers:
  * referenced bytes, available bytes, used objects and available objects.
  *
  * Available bytes start from what the dsl_dir has available, are
  * increased by any refreservation not yet consumed by unique data, and
  * are finally capped by whatever remains of the refquota (if one is set).
  */
 void
 dsl_dataset_space(dsl_dataset_t *ds,
     uint64_t *refdbytesp, uint64_t *availbytesp,
     uint64_t *usedobjsp, uint64_t *availobjsp)
 {
 	*refdbytesp = ds->ds_phys->ds_referenced_bytes;
 
 	*availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
 	/* Unconsumed refreservation is still ours to use. */
 	if (ds->ds_phys->ds_unique_bytes < ds->ds_reserved)
 		*availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes;
 
 	if (ds->ds_quota != 0) {
 		/*
 		 * A refquota limits available space to what is left of it,
 		 * or to nothing at all once it has been reached.
 		 */
 		if (*refdbytesp >= ds->ds_quota) {
 			*availbytesp = 0;
 		} else {
 			*availbytesp = MIN(*availbytesp,
 			    ds->ds_quota - *refdbytesp);
 		}
 	}
 
 	*usedobjsp = ds->ds_phys->ds_bp.blk_fill;
 	*availobjsp = DN_MAX_OBJECT - *usedobjsp;
 }
 
 /*
  * Report whether ds has been modified since its most recent snapshot.
  *
  * Returns B_FALSE when there is no previous snapshot or when the
  * dataset's block pointer was not born after that snapshot's creation
  * txg.  Otherwise the meta dnodes of the two objsets are compared;
  * failing to open either objset is reported as modified.  The caller
  * must hold dp_config_rwlock or be in syncing context (asserted).
  */
 boolean_t
 dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds)
 {
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	objset_t *os, *os_prev;
 
 	ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
 	    dsl_pool_sync_context(dp));
 
 	if (ds->ds_prev == NULL)
 		return (B_FALSE);
 
 	if (ds->ds_phys->ds_bp.blk_birth <=
 	    ds->ds_prev->ds_phys->ds_creation_txg)
 		return (B_FALSE);
 
 	/*
 	 * The root block is newer than the snapshot, but the only
 	 * difference may be the ZIL, which gets reset in the head.  That
 	 * alone does not count as a modification, so compare the meta
 	 * dnodes instead.
 	 */
 	if (dmu_objset_from_ds(ds, &os) != 0 ||
 	    dmu_objset_from_ds(ds->ds_prev, &os_prev) != 0)
 		return (B_TRUE);
 
 	return (bcmp(&os->os_phys->os_meta_dnode,
 	    &os_prev->os_phys->os_meta_dnode,
 	    sizeof (os->os_phys->os_meta_dnode)) != 0);
 }
 
 /*
  * Check half of the snapshot rename sync task.
  *
  * arg1 is the snapshot being renamed, arg2 the new snapshot name (the
  * part after the '@').  Returns EEXIST if the head dataset already has a
  * snapshot by that name, ENAMETOOLONG if the resulting full name would
  * not fit in MAXNAMELEN, any other lookup/hold error as-is, else 0.
  */
 /* ARGSUSED */
 static int
 dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
 	char *newsnapname = arg2;
 	dsl_dir_t *dd = ds->ds_dir;
 	dsl_dataset_t *head;
 	uint64_t obj;
 	int error;
 
 	error = dsl_dataset_hold_obj(dd->dd_pool,
 	    dd->dd_phys->dd_head_dataset_obj, FTAG, &head);
 	if (error != 0)
 		return (error);
 
 	/* The new name must not be in use; "not found" is the good case. */
 	error = dsl_dataset_snap_lookup(head, newsnapname, &obj);
 	dsl_dataset_rele(head, FTAG);
 
 	switch (error) {
 	case 0:
 		error = EEXIST;
 		break;
 	case ENOENT:
 		error = 0;
 		break;
 	default:
 		break;
 	}
 
 	/*
 	 * Dir name, '@' and new snapshot name together must fit.  This
 	 * check takes precedence over whatever the lookup reported.
 	 */
 	if (dsl_dir_namelen(ds->ds_dir) + 1 + strlen(newsnapname) >= MAXNAMELEN)
 		error = ENAMETOOLONG;
 
 	return (error);
 }
 
 /*
  * Sync half of the snapshot rename sync task.
  *
  * arg1 is the snapshot being renamed and arg2 the new snapshot name (the
  * part after the '@').  The old entry is removed from the head dataset's
  * snapnames ZAP, the in-core ds_snapname is replaced under ds_lock, and
  * an entry for the new name is added.  In the kernel, zvol minors are
  * renamed from the old full name to the new one.
  */
 static void
 dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	char oldname[MAXPATHLEN], newname[MAXPATHLEN];
 	dsl_dataset_t *ds = arg1;
 	const char *newsnapname = arg2;
 	dsl_dir_t *dd = ds->ds_dir;
 	objset_t *mos = dd->dd_pool->dp_meta_objset;
 	dsl_dataset_t *hds;
 	int err;
 
 	/* Only snapshots can be renamed through this path. */
 	ASSERT(ds->ds_phys->ds_next_snap_obj != 0);
 
 	/* The snapnames ZAP lives on the dir's head dataset. */
 	VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool,
 	    dd->dd_phys->dd_head_dataset_obj, FTAG, &hds));
 
 	/* Make sure ds_snapname is populated, then drop the old entry. */
 	VERIFY(0 == dsl_dataset_get_snapname(ds));
 	err = dsl_dataset_snap_remove(hds, ds->ds_snapname, tx);
 	ASSERT0(err);
 	/* Capture the full old name before ds_snapname is overwritten. */
 	dsl_dataset_name(ds, oldname);
 	mutex_enter(&ds->ds_lock);
 	(void) strcpy(ds->ds_snapname, newsnapname);
 	mutex_exit(&ds->ds_lock);
 	err = zap_add(mos, hds->ds_phys->ds_snapnames_zapobj,
 	    ds->ds_snapname, 8, 1, &ds->ds_object, tx);
 	ASSERT0(err);
 	dsl_dataset_name(ds, newname);
 #ifdef _KERNEL
 	zvol_rename_minors(oldname, newname);
 #endif
 
 	spa_history_log_internal(LOG_DS_RENAME, dd->dd_pool->dp_spa, tx,
 	    "dataset = %llu", ds->ds_object);
 	dsl_dataset_rele(hds, FTAG);
 }
 
 /*
  * State shared between dsl_recursive_rename() and its per-filesystem
  * callback dsl_snapshot_rename_one().
  */
 struct renamesnaparg {
 	/* task group collecting one rename task per snapshot; also hold tag */
 	dsl_sync_task_group_t *dstg;
 	/* name of the snapshot last attempted, or of one whose task failed */
 	char failed[MAXPATHLEN];
 	/* old snapshot name (text after the '@') */
 	char *oldsnap;
 	/* new snapshot name (text after the '@') */
 	char *newsnap;
 	/* starts as ENOENT; zeroed once any rename task has been queued */
 	int error;
 };
 
 /*
  * dmu_objset_find() callback for recursive snapshot rename.
  *
  * name is a filesystem name; arg is the struct renamesnaparg.  If
  * "name@oldsnap" exists and the caller may rename it, a rename sync task
  * is queued on the task group and ra->error is cleared.  A missing
  * snapshot (ENOENT) is not an error: the filesystem is just skipped.
  * ra->failed always records the snapshot name that was attempted.
  */
 static int
 dsl_snapshot_rename_one(const char *name, void *arg)
 {
 	struct renamesnaparg *ra = arg;
 	dsl_dataset_t *ds = NULL;
 	char *fullname;
 	int error;
 
 	fullname = kmem_asprintf("%s@%s", name, ra->oldsnap);
 	(void) strlcpy(ra->failed, fullname, sizeof (ra->failed));
 
 	/*
 	 * In a recursive snapshot rename the parent does not change, so
 	 * the same name serves as both the "from" and the "to" argument.
 	 */
 	error = zfs_secpolicy_rename_perms(fullname, fullname, CRED());
 	if (error == 0) {
 #ifdef _KERNEL
 		/*
 		 * Any filesystem that is being renamed has to be
 		 * unmounted first.
 		 */
 		(void) zfs_unmount_snap(fullname, NULL);
 #endif
 		error = dsl_dataset_hold(fullname, ra->dstg, &ds);
 	}
 	strfree(fullname);
 
 	if (error == ENOENT)
 		return (0);
 	if (error != 0)
 		return (error);
 
 	dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check,
 	    dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0);
 
 	/* Once one rename has been queued, the overall ENOENT is lifted. */
 	ra->error = 0;
 
 	return (0);
 }
 
 /*
  * Rename snapshot "fs@old" to "fs@new" in fs and all of its descendants.
  *
  * oldname and newname are full snapshot names; both are expected to
  * contain an '@' (dsl_dataset_rename() only calls this after checking).
  * One rename task per matching snapshot is queued on a sync task group,
  * which is then executed as a unit.  ENOENT is returned if no filesystem
  * had a snapshot named after oldname's snapshot part.  On error, oldname
  * is overwritten with the name recorded in ra->failed.
  */
 static int
 dsl_recursive_rename(char *oldname, const char *newname)
 {
 	int err;
 	struct renamesnaparg *ra;
 	dsl_sync_task_t *dst;
 	spa_t *spa;
 	char *cp, *fsname = spa_strdup(oldname);
 	/*
 	 * Length (with NUL) of the untruncated copy; this is the size
 	 * later handed to kmem_free() for fsname.
 	 */
 	int len = strlen(oldname) + 1;
 
 	/* truncate the snapshot name to get the fsname */
 	cp = strchr(fsname, '@');
 	*cp = '\0';
 
 	err = spa_open(fsname, &spa, FTAG);
 	if (err) {
 		kmem_free(fsname, len);
 		return (err);
 	}
 	ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP);
 	ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
 
 	ra->oldsnap = strchr(oldname, '@') + 1;
 	ra->newsnap = strchr(newname, '@') + 1;
 	*ra->failed = '\0';
 	/* Stays ENOENT unless the callback queues at least one rename. */
 	ra->error = ENOENT;
 
 	err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra,
 	    DS_FIND_CHILDREN);
 	kmem_free(fsname, len);
 	if (err == 0)
 		err = ra->error;
 
 	if (err == 0)
 		err = dsl_sync_task_group_wait(ra->dstg);
 
 	/*
 	 * Release every dataset the callback held (tagged with the task
 	 * group), and note the new name of any snapshot whose task failed.
 	 */
 	for (dst = list_head(&ra->dstg->dstg_tasks); dst;
 	    dst = list_next(&ra->dstg->dstg_tasks, dst)) {
 		dsl_dataset_t *ds = dst->dst_arg1;
 		if (dst->dst_err) {
 			dsl_dir_name(ds->ds_dir, ra->failed);
 			(void) strlcat(ra->failed, "@", sizeof (ra->failed));
 			(void) strlcat(ra->failed, ra->newsnap,
 			    sizeof (ra->failed));
 		}
 		dsl_dataset_rele(ds, ra->dstg);
 	}
 
 	/*
 	 * Hand the failing name back through oldname.
 	 * NOTE(review): the copy is bounded by sizeof (ra->failed), so
 	 * this assumes oldname's buffer is at least MAXPATHLEN bytes --
 	 * confirm against callers.
 	 */
 	if (err)
 		(void) strlcpy(oldname, ra->failed, sizeof (ra->failed));
 
 	dsl_sync_task_group_destroy(ra->dstg);
 	kmem_free(ra, sizeof (struct renamesnaparg));
 	spa_close(spa, FTAG);
 	return (err);
 }
 
 /*
  * dmu_objset_find() callback used when a rename makes names longer.
  *
  * arg points at an int holding the growth in name length.  Returns
  * ENAMETOOLONG if oldname grown by that amount would reach MAXNAMELEN,
  * otherwise 0.
  */
 static int
 dsl_valid_rename(const char *oldname, void *arg)
 {
 	int growth = *(int *)arg;
 
 	return ((strlen(oldname) + growth >= MAXNAMELEN) ? ENAMETOOLONG : 0);
 }
 
 /*
  * Rename a filesystem/volume or a snapshot.
  *
  * If oldname resolves entirely to a dsl_dir, this is a dir rename: when
  * the name grows, all children and snapshots are first checked for
  * length.  If oldname ends in "@snap", newname must name a snapshot of
  * the same filesystem (EINVAL without '@', EXDEV for a different
  * filesystem); the rename is done recursively when flags contains
  * ZFS_RENAME_RECURSIVE.  Any other unresolved tail yields ENOENT.
  */
 #pragma weak dmu_objset_rename = dsl_dataset_rename
 int
 dsl_dataset_rename(char *oldname, const char *newname, int flags)
 {
 	dsl_dir_t *dd;
 	dsl_dataset_t *ds;
 	const char *tail;
 	const char *newsnap;
 	int is_snapname;
 	int err;
 
 	err = dsl_dir_open(oldname, FTAG, &dd, &tail);
 	if (err != 0)
 		return (err);
 
 	if (tail == NULL) {
 		int delta = strlen(newname) - strlen(oldname);
 
 		/* a longer name requires re-validating all child names */
 		if (delta > 0) {
 			err = dmu_objset_find(oldname, dsl_valid_rename,
 			    &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
 		}
 		if (err == 0)
 			err = dsl_dir_rename(dd, newname, flags);
 
 		dsl_dir_close(dd, FTAG);
 		return (err);
 	}
 
 	/* The dir is no longer needed whichever way this goes. */
 	is_snapname = (tail[0] == '@');
 	dsl_dir_close(dd, FTAG);
 	if (!is_snapname) {
 		/* the name ended in a nonexistent component */
 		return (ENOENT);
 	}
 
 	/* the new name has to be a snapshot of the very same filesystem */
 	newsnap = strchr(newname, '@');
 	if (newsnap == NULL)
 		return (EINVAL);
 	newsnap++;
 	if (strncmp(oldname, newname, newsnap - newname) != 0)
 		return (EXDEV);
 
 	if (flags & ZFS_RENAME_RECURSIVE)
 		return (dsl_recursive_rename(oldname, newname));
 
 	err = dsl_dataset_hold(oldname, FTAG, &ds);
 	if (err != 0)
 		return (err);
 
 	err = dsl_sync_task_do(ds->ds_dir->dd_pool,
 	    dsl_dataset_snapshot_rename_check,
 	    dsl_dataset_snapshot_rename_sync, ds, (char *)newsnap, 1);
 
 	dsl_dataset_rele(ds, FTAG);
 
 	return (err);
 }
 
 /*
  * One element of a snapshot list built by snaplist_make().
  */
 struct promotenode {
 	list_node_t link;	/* linkage on the owning list_t */
 	dsl_dataset_t *ds;	/* dataset held or owned with snaplist_tag */
 };
 
 /*
  * Argument block shared by dsl_dataset_promote() and its check/sync
  * functions.
  */
 struct promotearg {
 	/*
 	 * shared_snaps: the clone's origin and the snapshots before it
 	 *	(these are moved to the clone by the sync function).
 	 * origin_snaps: the origin's head dataset and its snapshots back
 	 *	to, but not including, the clone's origin.
 	 * clone_snaps: the clone itself and its snapshots back to its
 	 *	origin.
 	 */
 	list_t shared_snaps, origin_snaps, clone_snaps;
 	/* origin of the origin's dir, or NULL if it has none */
 	dsl_dataset_t *origin_origin;
 	/*
 	 * Filled in by the check function: space to transfer (used, comp,
 	 * uncomp), the origin's new unique bytes, and the snapshot-used
 	 * space of clone and origin after the promotion.
 	 */
 	uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap;
 	/* name of a snapshot whose name lookup in the clone failed */
 	char *err_ds;
 };
 
 /* Forward declarations; snaplist_space() is defined later in this file. */
 static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
 static boolean_t snaplist_unstable(list_t *l);
 
 /*
  * Check half of the promote sync task.
  *
  * arg1 is the clone's head dataset and arg2 the struct promotearg set up
  * by dsl_dataset_promote().  Outside of syncing context only the "is a
  * clone" test is made.  In syncing context this also verifies that none
  * of the snapshots to be moved collides with a snapshot name of the
  * clone, that the space can be transferred, and fills in pa->unique,
  * pa->used/comp/uncomp, pa->cloneusedsnap and pa->originusedsnap for the
  * sync function.  When a snapshot name lookup fails (including the
  * EEXIST conflict case), pa->err_ds is set to that snapshot's name.
  */
 static int
 dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *hds = arg1;
 	struct promotearg *pa = arg2;
 	/* head of shared_snaps is the clone's origin snapshot */
 	struct promotenode *snap = list_head(&pa->shared_snaps);
 	dsl_dataset_t *origin_ds = snap->ds;
 	int err;
 	uint64_t unused;
 
 	/* Check that it is a real clone */
 	if (!dsl_dir_is_clone(hds->ds_dir))
 		return (EINVAL);
 
 	/* Since this is so expensive, don't do the preliminary check */
 	if (!dmu_tx_is_syncing(tx))
 		return (0);
 
 	if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE)
 		return (EXDEV);
 
 	/* compute origin's new unique space */
 	snap = list_tail(&pa->clone_snaps);
 	ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
 	dsl_deadlist_space_range(&snap->ds->ds_deadlist,
 	    origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
 	    &pa->unique, &unused, &unused);
 
 	/*
 	 * Walk the snapshots that we are moving
 	 *
 	 * Compute space to transfer.  Consider the incremental changes
 	 * to used for each snapshot:
 	 * (my used) = (prev's used) + (blocks born) - (blocks killed)
 	 * So each snapshot gave birth to:
 	 * (blocks born) = (my used) - (prev's used) + (blocks killed)
 	 * So a sequence would look like:
 	 * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0)
 	 * Which simplifies to:
 	 * uN + kN + kN-1 + ... + k1 + k0
 	 * Note however, if we stop before we reach the ORIGIN we get:
 	 * uN + kN + kN-1 + ... + kM - uM-1
 	 */
 	pa->used = origin_ds->ds_phys->ds_referenced_bytes;
 	pa->comp = origin_ds->ds_phys->ds_compressed_bytes;
 	pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes;
 	for (snap = list_head(&pa->shared_snaps); snap;
 	    snap = list_next(&pa->shared_snaps, snap)) {
 		uint64_t val, dlused, dlcomp, dluncomp;
 		dsl_dataset_t *ds = snap->ds;
 
 		/* Check that the snapshot name does not conflict */
 		VERIFY(0 == dsl_dataset_get_snapname(ds));
 		err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val);
 		if (err == 0) {
 			err = EEXIST;
 			goto out;
 		}
 		if (err != ENOENT)
 			goto out;
 
 		/* The very first snapshot does not have a deadlist */
 		if (ds->ds_phys->ds_prev_snap_obj == 0)
 			continue;
 
 		/* add this snapshot's killed blocks (the "k" terms above) */
 		dsl_deadlist_space(&ds->ds_deadlist,
 		    &dlused, &dlcomp, &dluncomp);
 		pa->used += dlused;
 		pa->comp += dlcomp;
 		pa->uncomp += dluncomp;
 	}
 
 	/*
 	 * If we are a clone of a clone then we never reached ORIGIN,
 	 * so we need to subtract out the clone origin's used space.
 	 */
 	if (pa->origin_origin) {
 		pa->used -= pa->origin_origin->ds_phys->ds_referenced_bytes;
 		pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes;
 		pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes;
 	}
 
 	/* Check that there is enough space here */
 	err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir,
 	    pa->used);
 	if (err)
 		return (err);
 
 	/*
 	 * Compute the amounts of space that will be used by snapshots
 	 * after the promotion (for both origin and clone).  For each,
 	 * it is the amount of space that will be on all of their
 	 * deadlists (that was not born before their new origin).
 	 */
 	if (hds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
 		uint64_t space;
 
 		/*
 		 * Note, typically this will not be a clone of a clone,
 		 * so dd_origin_txg will be < TXG_INITIAL, so
 		 * these snaplist_space() -> dsl_deadlist_space_range()
 		 * calls will be fast because they do not have to
 		 * iterate over all bps.
 		 */
 		snap = list_head(&pa->origin_snaps);
 		err = snaplist_space(&pa->shared_snaps,
 		    snap->ds->ds_dir->dd_origin_txg, &pa->cloneusedsnap);
 		if (err)
 			return (err);
 
 		err = snaplist_space(&pa->clone_snaps,
 		    snap->ds->ds_dir->dd_origin_txg, &space);
 		if (err)
 			return (err);
 		pa->cloneusedsnap += space;
 	}
 	if (origin_ds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
 		err = snaplist_space(&pa->origin_snaps,
 		    origin_ds->ds_phys->ds_creation_txg, &pa->originusedsnap);
 		if (err)
 			return (err);
 	}
 
 	return (0);
 out:
 	/* only reached from the name-lookup loop; snap is the offender */
 	pa->err_ds =  snap->ds->ds_snapname;
 	return (err);
 }
 
 /*
  * Sync half of the promote sync task.
  *
  * arg1 is the clone's head dataset and arg2 the struct promotearg that
  * the check function filled in.  The clone and the origin's head swap
  * origin relationships, every snapshot on pa->shared_snaps is moved from
  * the origin's dsl_dir into the clone's dsl_dir (name entry, containing
  * dir and clone references), and space accounting is transferred between
  * the two dirs.
  */
 static void
 dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *hds = arg1;
 	struct promotearg *pa = arg2;
 	struct promotenode *snap = list_head(&pa->shared_snaps);
 	dsl_dataset_t *origin_ds = snap->ds;
 	dsl_dataset_t *origin_head;
 	dsl_dir_t *dd = hds->ds_dir;
 	dsl_pool_t *dp = hds->ds_dir->dd_pool;
 	dsl_dir_t *odd = NULL;
 	uint64_t oldnext_obj;
 	int64_t delta;
 
 	ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE));
 
 	/* head of origin_snaps is the origin dir's head dataset */
 	snap = list_head(&pa->origin_snaps);
 	origin_head = snap->ds;
 
 	/*
 	 * We need to explicitly open odd, since origin_ds's dd will be
 	 * changing.
 	 */
 	VERIFY(0 == dsl_dir_open_obj(dp, origin_ds->ds_dir->dd_object,
 	    NULL, FTAG, &odd));
 
 	/* change origin's next snap */
 	dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
 	oldnext_obj = origin_ds->ds_phys->ds_next_snap_obj;
 	snap = list_tail(&pa->clone_snaps);
 	ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
 	origin_ds->ds_phys->ds_next_snap_obj = snap->ds->ds_object;
 
 	/* change the origin's next clone */
 	if (origin_ds->ds_phys->ds_next_clones_obj) {
 		remove_from_next_clones(origin_ds, snap->ds->ds_object, tx);
 		VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
 		    origin_ds->ds_phys->ds_next_clones_obj,
 		    oldnext_obj, tx));
 	}
 
 	/* change origin */
 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
 	ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object);
 	dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj;
 	dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg;
 	dmu_buf_will_dirty(odd->dd_dbuf, tx);
 	odd->dd_phys->dd_origin_obj = origin_ds->ds_object;
 	origin_head->ds_dir->dd_origin_txg =
 	    origin_ds->ds_phys->ds_creation_txg;
 
 	/* change dd_clone entries */
 	if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
 		/*
 		 * NOTE(review): pa->origin_origin is dereferenced without a
 		 * NULL check here, while dsl_dataset_promote() leaves it
 		 * NULL when the origin's dir has no origin.  Presumably
 		 * that cannot happen at this pool version -- confirm.
 		 */
 		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
 		    odd->dd_phys->dd_clones, hds->ds_object, tx));
 		VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
 		    pa->origin_origin->ds_dir->dd_phys->dd_clones,
 		    hds->ds_object, tx));
 
 		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
 		    pa->origin_origin->ds_dir->dd_phys->dd_clones,
 		    origin_head->ds_object, tx));
 		/* create the clone dir's dd_clones ZAP on first use */
 		if (dd->dd_phys->dd_clones == 0) {
 			dd->dd_phys->dd_clones = zap_create(dp->dp_meta_objset,
 			    DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
 		}
 		VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
 		    dd->dd_phys->dd_clones, origin_head->ds_object, tx));
 
 	}
 
 	/* move snapshots to this dir */
 	for (snap = list_head(&pa->shared_snaps); snap;
 	    snap = list_next(&pa->shared_snaps, snap)) {
 		dsl_dataset_t *ds = snap->ds;
 
 		/* unregister props as dsl_dir is changing */
 		if (ds->ds_objset) {
 			dmu_objset_evict(ds->ds_objset);
 			ds->ds_objset = NULL;
 		}
 		/* move snap name entry */
 		VERIFY(0 == dsl_dataset_get_snapname(ds));
 		VERIFY(0 == dsl_dataset_snap_remove(origin_head,
 		    ds->ds_snapname, tx));
 		VERIFY(0 == zap_add(dp->dp_meta_objset,
 		    hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
 		    8, 1, &ds->ds_object, tx));
 
 		/* change containing dsl_dir */
 		dmu_buf_will_dirty(ds->ds_dbuf, tx);
 		ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object);
 		ds->ds_phys->ds_dir_obj = dd->dd_object;
 		ASSERT3P(ds->ds_dir, ==, odd);
 		dsl_dir_close(ds->ds_dir, ds);
 		VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object,
 		    NULL, ds, &ds->ds_dir));
 
 		/* move any clone references */
 		if (ds->ds_phys->ds_next_clones_obj &&
 		    spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
 			zap_cursor_t zc;
 			zap_attribute_t za;
 
 			for (zap_cursor_init(&zc, dp->dp_meta_objset,
 			    ds->ds_phys->ds_next_clones_obj);
 			    zap_cursor_retrieve(&zc, &za) == 0;
 			    zap_cursor_advance(&zc)) {
 				dsl_dataset_t *cnds;
 				uint64_t o;
 
 				if (za.za_first_integer == oldnext_obj) {
 					/*
 					 * We've already moved the
 					 * origin's reference.
 					 */
 					continue;
 				}
 
 				VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
 				    za.za_first_integer, FTAG, &cnds));
 				o = cnds->ds_dir->dd_phys->dd_head_dataset_obj;
 
 				VERIFY3U(zap_remove_int(dp->dp_meta_objset,
 				    odd->dd_phys->dd_clones, o, tx), ==, 0);
 				VERIFY3U(zap_add_int(dp->dp_meta_objset,
 				    dd->dd_phys->dd_clones, o, tx), ==, 0);
 				dsl_dataset_rele(cnds, FTAG);
 			}
 			zap_cursor_fini(&zc);
 		}
 
 		ASSERT0(dsl_prop_numcb(ds));
 	}
 
 	/*
 	 * Change space accounting.
 	 * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either
 	 * both be valid, or both be 0 (resulting in delta == 0).  This
 	 * is true for each of {clone,origin} independently.
 	 */
 
 	/* the clone's dir gains snapshot space and the transferred bytes */
 	delta = pa->cloneusedsnap -
 	    dd->dd_phys->dd_used_breakdown[DD_USED_SNAP];
 	ASSERT3S(delta, >=, 0);
 	ASSERT3U(pa->used, >=, delta);
 	dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx);
 	dsl_dir_diduse_space(dd, DD_USED_HEAD,
 	    pa->used - delta, pa->comp, pa->uncomp, tx);
 
 	/* the origin's dir gives up the same amounts */
 	delta = pa->originusedsnap -
 	    odd->dd_phys->dd_used_breakdown[DD_USED_SNAP];
 	ASSERT3S(delta, <=, 0);
 	ASSERT3U(pa->used, >=, -delta);
 	dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx);
 	dsl_dir_diduse_space(odd, DD_USED_HEAD,
 	    -pa->used - delta, -pa->comp, -pa->uncomp, tx);
 
 	origin_ds->ds_phys->ds_unique_bytes = pa->unique;
 
 	/* log history record */
 	spa_history_log_internal(LOG_DS_PROMOTE, dd->dd_pool->dp_spa, tx,
 	    "dataset = %llu", hds->ds_object);
 
 	dsl_dir_close(odd, FTAG);
 }
 
 /* Tag used for every hold/ownership taken on behalf of a snaplist. */
 static char *snaplist_tag = "snaplist";
 /*
  * Build, in l, the list of datasets found by following prev-snapshot
  * links from last_obj (inclusive) back to first_obj (exclusive).  The
  * list comes out newest first, so last_obj is the list_head().  Passing
  * first_obj == 0 means "walk all the way back to this dataset's origin".
  *
  * When own is set each dataset is owned and made exclusive, otherwise it
  * is merely held; snaplist_tag is the tag in both cases.  On error the
  * partially built list is left in l for the caller to tear down with
  * snaplist_destroy().
  */
 static int
 snaplist_make(dsl_pool_t *dp, boolean_t own,
     uint64_t first_obj, uint64_t last_obj, list_t *l)
 {
 	uint64_t cur_obj;
 
 	ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock));
 
 	list_create(l, sizeof (struct promotenode),
 	    offsetof(struct promotenode, link));
 
 	for (cur_obj = last_obj; cur_obj != first_obj; ) {
 		dsl_dataset_t *ds;
 		struct promotenode *node;
 		int err;
 
 		if (!own) {
 			err = dsl_dataset_hold_obj(dp, cur_obj,
 			    snaplist_tag, &ds);
 		} else {
 			err = dsl_dataset_own_obj(dp, cur_obj,
 			    0, snaplist_tag, &ds);
 			if (err == 0)
 				dsl_dataset_make_exclusive(ds, snaplist_tag);
 		}
 
 		if (err == ENOENT) {
 			/*
 			 * The snapshot was destroyed under us; re-read the
 			 * prev pointer of the last dataset we did get.
 			 */
 			struct promotenode *last = list_tail(l);
 			ASSERT(cur_obj != last->ds->ds_phys->ds_prev_snap_obj);
 			cur_obj = last->ds->ds_phys->ds_prev_snap_obj;
 			continue;
 		}
 		if (err != 0)
 			return (err);
 
 		/* first_obj == 0: stop at the origin of this dataset's dir */
 		if (first_obj == 0)
 			first_obj = ds->ds_dir->dd_phys->dd_origin_obj;
 
 		node = kmem_alloc(sizeof (struct promotenode), KM_SLEEP);
 		node->ds = ds;
 		list_insert_tail(l, node);
 
 		cur_obj = ds->ds_phys->ds_prev_snap_obj;
 	}
 
 	return (0);
 }
 
 /*
  * Sum, over every dataset on list l, the "used" space on its deadlist
  * within the txg range [mintxg, UINT64_MAX] as reported by
  * dsl_deadlist_space_range().  The total is stored in *spacep.
  * Always returns 0.
  */
 static int
 snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep)
 {
 	struct promotenode *node;
 
 	*spacep = 0;
 
 	node = list_head(l);
 	while (node != NULL) {
 		uint64_t dlused, dlcomp, dluncomp;
 
 		dsl_deadlist_space_range(&node->ds->ds_deadlist,
 		    mintxg, UINT64_MAX, &dlused, &dlcomp, &dluncomp);
 		*spacep += dlused;
 
 		node = list_next(l, node);
 	}
 
 	return (0);
 }
 
 /*
  * Tear down a list built by snaplist_make(): every dataset is disowned
  * (own set) or released (own clear) with snaplist_tag, its node freed,
  * and finally the list itself destroyed.  A NULL list, or one whose
  * list_head is not active (never created), is silently ignored.
  */
 static void
 snaplist_destroy(list_t *l, boolean_t own)
 {
 	struct promotenode *node;
 
 	if (l == NULL || !list_link_active(&l->list_head))
 		return;
 
 	/* Drain from the tail until the list is empty. */
 	for (;;) {
 		node = list_tail(l);
 		if (node == NULL)
 			break;
 
 		list_remove(l, node);
 		if (!own)
 			dsl_dataset_rele(node->ds, snaplist_tag);
 		else
 			dsl_dataset_disown(node->ds, snaplist_tag);
 		kmem_free(node, sizeof (struct promotenode));
 	}
 
 	list_destroy(l);
 }
 
 /*
  * Promote a clone.  Nomenclature note:
  * "clone" or "cds": the original clone which is being promoted
  * "origin" or "ods": the snapshot which is originally clone's origin
  * "origin head" or "ohds": the dataset which is the head
  * (filesystem/volume) for the origin
  * "origin origin": the origin of the origin's filesystem (typically
  * NULL, indicating that the clone is not a clone of a clone).
  *
  * name is the clone to promote.  Returns 0 or an errno; EINVAL if name
  * is a snapshot or its dir has no origin.  If the sync task fails and
  * recorded an offending snapshot, its name is copied into conflsnap
  * (when conflsnap is non-NULL).
  */
 int
 dsl_dataset_promote(const char *name, char *conflsnap)
 {
 	dsl_dataset_t *ds;
 	dsl_dir_t *dd;
 	dsl_pool_t *dp;
 	dmu_object_info_t doi;
 	/* zeroed so snaplist_destroy() can recognize lists never created */
 	struct promotearg pa = { 0 };
 	struct promotenode *snap;
 	int err;
 
 	err = dsl_dataset_hold(name, FTAG, &ds);
 	if (err)
 		return (err);
 	dd = ds->ds_dir;
 	dp = dd->dd_pool;
 
 	/* doi is used below to size the sync task */
 	err = dmu_object_info(dp->dp_meta_objset,
 	    ds->ds_phys->ds_snapnames_zapobj, &doi);
 	if (err) {
 		dsl_dataset_rele(ds, FTAG);
 		return (err);
 	}
 
 	if (dsl_dataset_is_snapshot(ds) || dd->dd_phys->dd_origin_obj == 0) {
 		dsl_dataset_rele(ds, FTAG);
 		return (EINVAL);
 	}
 
 	/*
 	 * We are going to inherit all the snapshots taken before our
 	 * origin (i.e., our new origin will be our parent's origin).
 	 * Take ownership of them so that we can rename them into our
 	 * namespace.
 	 */
 	rw_enter(&dp->dp_config_rwlock, RW_READER);
 
 	/* origin and everything before it: owned, since they will move */
 	err = snaplist_make(dp, B_TRUE, 0, dd->dd_phys->dd_origin_obj,
 	    &pa.shared_snaps);
 	if (err != 0)
 		goto out;
 
 	/* the clone and its own snapshots back to the origin: held only */
 	err = snaplist_make(dp, B_FALSE, 0, ds->ds_object, &pa.clone_snaps);
 	if (err != 0)
 		goto out;
 
 	/* origin's head and its snapshots newer than the origin: held only */
 	snap = list_head(&pa.shared_snaps);
 	ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj);
 	err = snaplist_make(dp, B_FALSE, dd->dd_phys->dd_origin_obj,
 	    snap->ds->ds_dir->dd_phys->dd_head_dataset_obj, &pa.origin_snaps);
 	if (err != 0)
 		goto out;
 
 	/* if the origin's dir is itself a clone, hold its origin too */
 	if (snap->ds->ds_dir->dd_phys->dd_origin_obj != 0) {
 		err = dsl_dataset_hold_obj(dp,
 		    snap->ds->ds_dir->dd_phys->dd_origin_obj,
 		    FTAG, &pa.origin_origin);
 		if (err != 0)
 			goto out;
 	}
 
 out:
 	rw_exit(&dp->dp_config_rwlock);
 
 	/*
 	 * Size the sync task from the snapnames zapobj (2 + 2 * its
 	 * count of physical 512-byte blocks), since we will be moving
 	 * a bunch of snapnames to the promoted ds, and dirtying their
 	 * bonus buffers.
 	 */
 	if (err == 0) {
 		err = dsl_sync_task_do(dp, dsl_dataset_promote_check,
 		    dsl_dataset_promote_sync, ds, &pa,
 		    2 + 2 * doi.doi_physical_blocks_512);
 		/* err_ds points into a dataset still held via the lists */
 		if (err && pa.err_ds && conflsnap)
 			(void) strncpy(conflsnap, pa.err_ds, MAXNAMELEN);
 	}
 
 	/* safe on every path: lists never created are skipped */
 	snaplist_destroy(&pa.shared_snaps, B_TRUE);
 	snaplist_destroy(&pa.clone_snaps, B_FALSE);
 	snaplist_destroy(&pa.origin_snaps, B_FALSE);
 	if (pa.origin_origin)
 		dsl_dataset_rele(pa.origin_origin, FTAG);
 	dsl_dataset_rele(ds, FTAG);
 	return (err);
 }
 
 /*
  * Argument block for the clone swap sync task (check and sync halves).
  */
 struct cloneswaparg {
 	dsl_dataset_t *cds; /* clone dataset */
 	dsl_dataset_t *ohds; /* origin's head dataset */
 	boolean_t force; /* skip the "ohds unmodified" requirement */
 	int64_t unused_refres_delta; /* change in unconsumed refreservation */
 };
 
 /*
  * Check half of the clone swap sync task.
  *
  * arg1 is the struct cloneswaparg.  Verifies that the clone (cds) and
  * the origin's head (ohds) are in a state where they can be swapped and
  * records in csa->unused_refres_delta how the amount of unconsumed
  * refreservation would change.  Returns EINVAL for a structural
  * mismatch, ETXTBSY if ohds was modified and force is not set, ENOSPC or
  * EDQUOT if the swap would not fit, else 0.
  */
 /* ARGSUSED */
 static int
 dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	struct cloneswaparg *csa = arg1;
 	int64_t charged_now, charged_after;
 
 	/* neither of the two may be a snapshot */
 	if (dsl_dataset_is_snapshot(csa->cds))
 		return (EINVAL);
 	if (dsl_dataset_is_snapshot(csa->ohds))
 		return (EINVAL);
 
 	/* both must hang directly off the same branch point */
 	if (csa->ohds->ds_prev != csa->cds->ds_prev)
 		return (EINVAL);
 
 	/*
 	 * Unless the two are unrelated, cds has to be the clone, i.e.
 	 * the branch point's next snapshot must be ohds.
 	 */
 	if (csa->cds->ds_prev != NULL &&
 	    csa->cds->ds_prev != csa->cds->ds_dir->dd_pool->dp_origin_snap &&
 	    csa->cds->ds_prev->ds_phys->ds_next_snap_obj !=
 	    csa->ohds->ds_object)
 		return (EINVAL);
 
 	/* the clone's dir must be a direct child of the origin's dir */
 	if (csa->cds->ds_dir->dd_parent != csa->ohds->ds_dir)
 		return (EINVAL);
 
 	/* without 'force', a modified ohds blocks the swap */
 	if (!csa->force && dsl_dataset_modified_since_lastsnap(csa->ohds))
 		return (ETXTBSY);
 
 	/*
 	 * Work out how the unconsumed part of ohds's refreservation
 	 * changes when its unique bytes are replaced by the clone's.
 	 */
 	charged_now = (int64_t)MIN(csa->ohds->ds_reserved,
 	    csa->ohds->ds_phys->ds_unique_bytes);
 	charged_after = (int64_t)MIN(csa->ohds->ds_reserved,
 	    csa->cds->ds_phys->ds_unique_bytes);
 	csa->unused_refres_delta = charged_now - charged_after;
 
 	if (csa->unused_refres_delta > 0 &&
 	    csa->unused_refres_delta >
 	    dsl_dir_space_available(csa->ohds->ds_dir, NULL, 0, TRUE))
 		return (ENOSPC);
 
 	/* the clone's unique data must fit under ohds's refquota */
 	if (csa->ohds->ds_quota != 0 &&
 	    csa->cds->ds_phys->ds_unique_bytes > csa->ohds->ds_quota)
 		return (EDQUOT);
 
 	return (0);
 }
 
 /*
  * Sync half of the clone-swap sync task.  arg1 is the struct
  * cloneswaparg; arg2 is not used.  Exchanges the block pointers, the
  * ds_*_bytes accounting and the deadlists of csa->cds and csa->ohds,
  * and adjusts the owning dsl_dirs' space accounting to match.
  */
 /* ARGSUSED */
 static void
 dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	struct cloneswaparg *csa = arg1;
 	dsl_pool_t *dp = csa->cds->ds_dir->dd_pool;
 
 	ASSERT(csa->cds->ds_reserved == 0);
 	ASSERT(csa->ohds->ds_quota == 0 ||
 	    csa->cds->ds_phys->ds_unique_bytes <= csa->ohds->ds_quota);
 
 	dmu_buf_will_dirty(csa->cds->ds_dbuf, tx);
 	dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx);
 
 	/* Drop any cached objsets; the block pointers are about to change. */
 	if (csa->cds->ds_objset != NULL) {
 		dmu_objset_evict(csa->cds->ds_objset);
 		csa->cds->ds_objset = NULL;
 	}
 
 	if (csa->ohds->ds_objset != NULL) {
 		dmu_objset_evict(csa->ohds->ds_objset);
 		csa->ohds->ds_objset = NULL;
 	}
 
 	/*
 	 * Reset origin's unique bytes, if it exists.
 	 */
 	if (csa->cds->ds_prev) {
 		dsl_dataset_t *origin = csa->cds->ds_prev;
 		uint64_t comp, uncomp;
 
 		/* comp and uncomp are required outputs but are discarded */
 		dmu_buf_will_dirty(origin->ds_dbuf, tx);
 		dsl_deadlist_space_range(&csa->cds->ds_deadlist,
 		    origin->ds_phys->ds_prev_snap_txg, UINT64_MAX,
 		    &origin->ds_phys->ds_unique_bytes, &comp, &uncomp);
 	}
 
 	/* swap blkptrs */
 	{
 		blkptr_t tmp;
 		tmp = csa->ohds->ds_phys->ds_bp;
 		csa->ohds->ds_phys->ds_bp = csa->cds->ds_phys->ds_bp;
 		csa->cds->ds_phys->ds_bp = tmp;
 	}
 
 	/* set dd_*_bytes */
 	{
 		int64_t dused, dcomp, duncomp;
 		uint64_t cdl_used, cdl_comp, cdl_uncomp;
 		uint64_t odl_used, odl_comp, odl_uncomp;
 
 		ASSERT3U(csa->cds->ds_dir->dd_phys->
 		    dd_used_breakdown[DD_USED_SNAP], ==, 0);
 
 		dsl_deadlist_space(&csa->cds->ds_deadlist,
 		    &cdl_used, &cdl_comp, &cdl_uncomp);
 		dsl_deadlist_space(&csa->ohds->ds_deadlist,
 		    &odl_used, &odl_comp, &odl_uncomp);
 
 		/*
 		 * Deltas are (clone referenced + clone deadlist) minus
 		 * (origin head referenced + origin head deadlist).
 		 */
 		dused = csa->cds->ds_phys->ds_referenced_bytes + cdl_used -
 		    (csa->ohds->ds_phys->ds_referenced_bytes + odl_used);
 		dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp -
 		    (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp);
 		duncomp = csa->cds->ds_phys->ds_uncompressed_bytes +
 		    cdl_uncomp -
 		    (csa->ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp);
 
 		/* charge the delta to the origin's dir, credit the clone's */
 		dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_HEAD,
 		    dused, dcomp, duncomp, tx);
 		dsl_dir_diduse_space(csa->cds->ds_dir, DD_USED_HEAD,
 		    -dused, -dcomp, -duncomp, tx);
 
 		/*
 		 * The difference in the space used by snapshots is the
 		 * difference in snapshot space due to the head's
 		 * deadlist (since that's the only thing that's
 		 * changing that affects the snapused).
 		 */
 		dsl_deadlist_space_range(&csa->cds->ds_deadlist,
 		    csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX,
 		    &cdl_used, &cdl_comp, &cdl_uncomp);
 		dsl_deadlist_space_range(&csa->ohds->ds_deadlist,
 		    csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX,
 		    &odl_used, &odl_comp, &odl_uncomp);
 		dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used,
 		    DD_USED_HEAD, DD_USED_SNAP, tx);
 	}
 
 	/* swap ds_*_bytes */
 	SWITCH64(csa->ohds->ds_phys->ds_referenced_bytes,
 	    csa->cds->ds_phys->ds_referenced_bytes);
 	SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes,
 	    csa->cds->ds_phys->ds_compressed_bytes);
 	SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes,
 	    csa->cds->ds_phys->ds_uncompressed_bytes);
 	SWITCH64(csa->ohds->ds_phys->ds_unique_bytes,
 	    csa->cds->ds_phys->ds_unique_bytes);
 
 	/* apply any parent delta for change in unconsumed refreservation */
 	dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_REFRSRV,
 	    csa->unused_refres_delta, 0, 0, tx);
 
 	/*
 	 * Swap deadlists.
 	 */
 	/* close both, exchange the on-disk object numbers, then reopen */
 	dsl_deadlist_close(&csa->cds->ds_deadlist);
 	dsl_deadlist_close(&csa->ohds->ds_deadlist);
 	SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj,
 	    csa->cds->ds_phys->ds_deadlist_obj);
 	dsl_deadlist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset,
 	    csa->cds->ds_phys->ds_deadlist_obj);
 	dsl_deadlist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset,
 	    csa->ohds->ds_phys->ds_deadlist_obj);
 
 	dsl_scan_ds_clone_swapped(csa->ohds, csa->cds, tx);
 }
 
 /*
  * Swap 'clone' with its origin head datasets.  Used at the end of "zfs
  * recv" into an existing fs to swizzle the file system to the new
  * version, and by "zfs rollback".  Can also be used to swap two
  * independent head datasets if neither has any snapshots.
  *
  * Both datasets must be owned by the caller (asserted).  'force' is
  * passed through to the check function.  Returns the result of the
  * sync task.  Note that the ds_rwlock write locks taken here are not
  * released by this function.
  */
 int
 dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
     boolean_t force)
 {
 	struct cloneswaparg csa;
 	int error;
 
 	ASSERT(clone->ds_owner);
 	ASSERT(origin_head->ds_owner);
 retry:
 	/*
 	 * Need exclusive access for the swap. If we're swapping these
 	 * datasets back after an error, we already hold the locks.
 	 */
 	if (!RW_WRITE_HELD(&clone->ds_rwlock))
 		rw_enter(&clone->ds_rwlock, RW_WRITER);
 	/*
 	 * If the second lock cannot be taken without blocking, drop the
 	 * first, block on the second, then try the first again; start over
 	 * if that fails, so we never block on one lock while holding the
 	 * other.
 	 */
 	if (!RW_WRITE_HELD(&origin_head->ds_rwlock) &&
 	    !rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) {
 		rw_exit(&clone->ds_rwlock);
 		rw_enter(&origin_head->ds_rwlock, RW_WRITER);
 		if (!rw_tryenter(&clone->ds_rwlock, RW_WRITER)) {
 			rw_exit(&origin_head->ds_rwlock);
 			goto retry;
 		}
 	}
 	/* csa.unused_refres_delta is filled in by the check function */
 	csa.cds = clone;
 	csa.ohds = origin_head;
 	csa.force = force;
 	error = dsl_sync_task_do(clone->ds_dir->dd_pool,
 	    dsl_dataset_clone_swap_check,
 	    dsl_dataset_clone_swap_sync, &csa, NULL, 9);
 	return (error);
 }
 
 /*
  * Look up the dataset whose object number is 'obj' in the pool named
  * 'pname' and write its name into 'buf'.  Returns 0 on success, or the
  * error reported by spa_open() or dsl_dataset_hold_obj().
  */
 int
 dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
 {
 	spa_t *spa;
 	dsl_pool_t *pool;
 	dsl_dataset_t *dataset;
 	int err;
 
 	err = spa_open(pname, &spa, FTAG);
 	if (err != 0)
 		return (err);
 
 	pool = spa_get_dsl(spa);
 
 	/* the hold-by-object lookup is done under the config lock */
 	rw_enter(&pool->dp_config_rwlock, RW_READER);
 	err = dsl_dataset_hold_obj(pool, obj, FTAG, &dataset);
 	if (err == 0) {
 		dsl_dataset_name(dataset, buf);
 		dsl_dataset_rele(dataset, FTAG);
 	}
 	rw_exit(&pool->dp_config_rwlock);
 
 	spa_close(spa, FTAG);
 	return (err);
 }
 
 /*
  * Decide whether a request for 'asize' more bytes (with 'inflight'
  * bytes already pending) fits within ds's quota.
  *
  * On return *ref_rsrv holds the part of asize that will be drawn from
  * unconsumed refreservation, and *used has been reduced by the
  * unconsumed refreservation, if there is any.  The quota itself is only
  * examined when check_quota is set and a quota is configured.
  *
  * Returns 0, ERESTART (caller may try again) or EDQUOT.
  */
 int
 dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
     uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv)
 {
 	int rv = 0;
 
 	ASSERT3S(asize, >, 0);
 
 	/* until shown otherwise, nothing comes out of the refreservation */
 	*ref_rsrv = 0;
 
 	mutex_enter(&ds->ds_lock);
 
 	/* space adjustment for reserved-but-not-yet-consumed bytes */
 	if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) {
 		ASSERT3U(*used, >=,
 		    ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
 		*used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
 		*ref_rsrv =
 		    asize - MIN(asize, parent_delta(ds, asize + inflight));
 	}
 
 	/*
 	 * When the estimate is over quota the caller gets to retry,
 	 * unless the on-disk usage alone is already over quota and
 	 * nothing is in flight that might free space.
 	 */
 	if (check_quota && ds->ds_quota != 0 &&
 	    ds->ds_phys->ds_referenced_bytes + inflight >= ds->ds_quota) {
 		if (inflight == 0 &&
 		    ds->ds_phys->ds_referenced_bytes >= ds->ds_quota)
 			rv = EDQUOT;
 		else
 			rv = ERESTART;
 	}
 
 	mutex_exit(&ds->ds_lock);
 	return (rv);
 }
 
 /*
  * Check half of the "refquota" sync task.  arg1 is the dataset, arg2
  * the dsl_prop_setarg_t.  Returns ENOTSUP on pools older than
  * SPA_VERSION_REFQUOTA, any error from dsl_prop_predict_sync(), and
  * ENOSPC when a non-zero new value is below the referenced bytes or the
  * refreservation.
  */
 /* ARGSUSED */
 static int
 dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
 	dsl_prop_setarg_t *psa = arg2;
 	int error;
 
 	if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA)
 		return (ENOTSUP);
 
 	error = dsl_prop_predict_sync(ds->ds_dir, psa);
 	if (error != 0)
 		return (error);
 
 	/* an effective value of zero is always accepted */
 	if (psa->psa_effective_value != 0 &&
 	    (psa->psa_effective_value < ds->ds_phys->ds_referenced_bytes ||
 	    psa->psa_effective_value < ds->ds_reserved))
 		return (ENOSPC);
 
 	return (0);
 }
 
 /* Called by the quota and reservation sync functions below. */
 extern void dsl_prop_set_sync(void *, void *, dmu_tx_t *);
 
 /*
  * Sync half of the "refquota" sync task.  Stores the property via
  * dsl_prop_set_sync() and, if the effective value differs from the
  * current ds_quota, dirties the dataset buffer and updates ds_quota.
  */
 void
 dsl_dataset_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
 	dsl_prop_setarg_t *psa = arg2;
 	uint64_t newquota;
 
 	/* capture the value before handing psa to dsl_prop_set_sync() */
 	newquota = psa->psa_effective_value;
 
 	dsl_prop_set_sync(ds, psa, tx);
 	DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa);
 
 	if (ds->ds_quota == newquota)
 		return;
 
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 	ds->ds_quota = newquota;
 }
 
 /*
  * Set the "refquota" property of the dataset named 'dsname' to 'quota'
  * by running the quota check/sync pair as a sync task.  Returns 0 or an
  * error from dsl_dataset_hold() or the sync task.
  */
 int
 dsl_dataset_set_quota(const char *dsname, zprop_source_t source, uint64_t quota)
 {
 	dsl_prop_setarg_t setarg;
 	dsl_dataset_t *ds;
 	dsl_pool_t *dp;
 	int error;
 
 	dsl_prop_setarg_init_uint64(&setarg, "refquota", source, &quota);
 
 	error = dsl_dataset_hold(dsname, FTAG, &ds);
 	if (error != 0)
 		return (error);
 	dp = ds->ds_dir->dd_pool;
 
 	/*
 	 * If someone removes a file, then tries to set the quota, we
 	 * want to make sure the file freeing takes effect.
 	 */
 	txg_wait_open(dp, 0);
 
 	error = dsl_sync_task_do(dp, dsl_dataset_set_quota_check,
 	    dsl_dataset_set_quota_sync, ds, &setarg, 0);
 
 	dsl_dataset_rele(ds, FTAG);
 	return (error);
 }
 
 /*
  * Check half of the "refreservation" sync task.  arg1 is the dataset,
  * arg2 the dsl_prop_setarg_t.  Returns ENOTSUP on pools older than
  * SPA_VERSION_REFRESERVATION, EINVAL for snapshots, any error from
  * dsl_prop_predict_sync(), and ENOSPC when (in syncing context) the
  * increase does not fit in the available space or exceeds the quota.
  */
 static int
 dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
 	dsl_prop_setarg_t *psa = arg2;
 	uint64_t newresv, unique, delta;
 	int error;
 
 	if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
 	    SPA_VERSION_REFRESERVATION)
 		return (ENOTSUP);
 
 	if (dsl_dataset_is_snapshot(ds))
 		return (EINVAL);
 
 	error = dsl_prop_predict_sync(ds->ds_dir, psa);
 	if (error != 0)
 		return (error);
 
 	newresv = psa->psa_effective_value;
 
 	/*
 	 * If we are doing the preliminary check in open context, the
 	 * space estimates may be inaccurate.
 	 */
 	if (!dmu_tx_is_syncing(tx))
 		return (0);
 
 	mutex_enter(&ds->ds_lock);
 	if (!DS_UNIQUE_IS_ACCURATE(ds))
 		dsl_dataset_recalc_head_uniq(ds);
 	unique = ds->ds_phys->ds_unique_bytes;
 	mutex_exit(&ds->ds_lock);
 
 	/* only an increase in charged space needs checking */
 	if (MAX(unique, newresv) <= MAX(unique, ds->ds_reserved))
 		return (0);
 
 	delta = MAX(unique, newresv) - MAX(unique, ds->ds_reserved);
 	if (delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
 		return (ENOSPC);
 	if (ds->ds_quota > 0 && newresv > ds->ds_quota)
 		return (ENOSPC);
 
 	return (0);
 }
 
 /*
  * Sync half of the "refreservation" sync task.  Stores the property,
  * updates ds_reserved, and charges the dsl_dir with the change in
  * unconsumed reservation (the part of the reservation above the
  * dataset's unique bytes).
  */
 static void
 dsl_dataset_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
 	dsl_prop_setarg_t *psa = arg2;
 	uint64_t newresv = psa->psa_effective_value;
 	uint64_t unique;
 	int64_t new_unused, old_unused;
 
 	dsl_prop_set_sync(ds, psa, tx);
 	DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa);
 
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 
 	/* dd_lock is taken first and held across the space update */
 	mutex_enter(&ds->ds_dir->dd_lock);
 	mutex_enter(&ds->ds_lock);
 	ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
 	unique = ds->ds_phys->ds_unique_bytes;
 	new_unused = MAX(0, (int64_t)(newresv - unique));
 	old_unused = MAX(0, (int64_t)(ds->ds_reserved - unique));
 	ds->ds_reserved = newresv;
 	mutex_exit(&ds->ds_lock);
 
 	dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV,
 	    new_unused - old_unused, 0, 0, tx);
 	mutex_exit(&ds->ds_dir->dd_lock);
 }
 
 /*
  * Set the "refreservation" property of the dataset named 'dsname' by
  * running the reservation check/sync pair as a sync task.  Returns 0 or
  * an error from dsl_dataset_hold() or the sync task.
  */
 int
 dsl_dataset_set_reservation(const char *dsname, zprop_source_t source,
     uint64_t reservation)
 {
 	dsl_prop_setarg_t setarg;
 	dsl_dataset_t *ds;
 	int error;
 
 	dsl_prop_setarg_init_uint64(&setarg, "refreservation", source,
 	    &reservation);
 
 	error = dsl_dataset_hold(dsname, FTAG, &ds);
 	if (error != 0)
 		return (error);
 
 	error = dsl_sync_task_do(ds->ds_dir->dd_pool,
 	    dsl_dataset_set_reservation_check,
 	    dsl_dataset_set_reservation_sync, ds, &setarg, 0);
 
 	dsl_dataset_rele(ds, FTAG);
 	return (error);
 }
 
 /*
  * Context for the on-exit callback that releases a temporary user hold.
  * Allocated in dsl_register_onexit_hold_cleanup() and freed by
  * dsl_dataset_user_release_onexit().
  */
 typedef struct zfs_hold_cleanup_arg {
 	dsl_pool_t *dp;		/* pool containing the held dataset */
 	uint64_t dsobj;		/* object number of the held dataset */
 	char htag[MAXNAMELEN];	/* private copy of the hold tag */
 } zfs_hold_cleanup_arg_t;
 
 /*
  * On-exit callback: release the temporary hold described by 'arg' (a
  * zfs_hold_cleanup_arg_t), retrying as needed, then free 'arg'.  The
  * result of the release is deliberately ignored.
  */
 static void
 dsl_dataset_user_release_onexit(void *arg)
 {
 	zfs_hold_cleanup_arg_t *cleanup = arg;
 
 	(void) dsl_dataset_user_release_tmp(cleanup->dp, cleanup->dsobj,
 	    cleanup->htag, B_TRUE);
 
 	kmem_free(cleanup, sizeof (*cleanup));
 }
 
 /*
  * Arrange for the user hold 'htag' on 'ds' to be released through the
  * on-exit callback registered against 'minor'.  The pool, dataset
  * object number and a copy of the tag are saved for the callback.
  */
 void
 dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag,
     minor_t minor)
 {
 	zfs_hold_cleanup_arg_t *ca = kmem_alloc(sizeof (*ca), KM_SLEEP);
 
 	ca->dsobj = ds->ds_object;
 	ca->dp = ds->ds_dir->dd_pool;
 	(void) strlcpy(ca->htag, htag, sizeof (ca->htag));
 
 	/* registration is not allowed to fail */
 	VERIFY3U(0, ==, zfs_onexit_add_cb(minor,
 	    dsl_dataset_user_release_onexit, ca, NULL));
 }
 
 /*
  * Check half of the user-hold sync task.  arg1 is the dataset, arg2 the
  * struct dsl_ds_holdarg carrying the tag.
  *
  * Returns ENOTSUP on pools older than SPA_VERSION_USERREFS, EINVAL if
  * the dataset is not a snapshot, EEXIST if the tag is already present
  * in the dataset's userrefs object, E2BIG if a temporary hold's tag is
  * too long once MAX_TAG_PREFIX_LEN is added, or another zap_lookup()
  * error.
  *
  * If you add new checks here, you may need to add
  * additional checks to the "temporary" case in
  * snapshot_check() in dmu_objset.c.
  */
 static int
 dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
 	struct dsl_ds_holdarg *ha = arg2;
 	char *htag = ha->htag;
 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 	uint64_t tmp;
 	int error = 0;
 
 	if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS)
 		return (ENOTSUP);
 
 	if (!dsl_dataset_is_snapshot(ds))
 		return (EINVAL);
 
 	/* tags must be unique */
 	mutex_enter(&ds->ds_lock);
 	if (ds->ds_phys->ds_userrefs_obj) {
 		/*
 		 * The looked-up value is not needed, but zap_lookup()
 		 * stores it through its last argument, so give it a
 		 * scratch word rather than the transaction handle.
 		 */
 		error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj, htag,
 		    8, 1, &tmp);
 		if (error == 0)
 			error = EEXIST;
 		else if (error == ENOENT)
 			error = 0;
 	}
 	mutex_exit(&ds->ds_lock);
 
 	if (error == 0 && ha->temphold &&
 	    strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN)
 		error = E2BIG;
 
 	return (error);
 }
 
 /*
  * Sync half of the user-hold sync task.  Creates the dataset's userrefs
  * ZAP object if it does not exist yet, bumps ds_userrefs, records the
  * tag with the current time, registers the hold with the pool when it
  * is temporary, and logs the operation to the pool history.
  */
 void
 dsl_dataset_user_hold_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
 	struct dsl_ds_holdarg *ha = arg2;
 	char *htag = ha->htag;
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	objset_t *mos = dp->dp_meta_objset;
 	uint64_t now = gethrestime_sec();
 	uint64_t zapobj;
 
 	mutex_enter(&ds->ds_lock);
 	zapobj = ds->ds_phys->ds_userrefs_obj;
 	if (zapobj == 0) {
 		/* first user hold on this dataset: create the userrefs zap */
 		dmu_buf_will_dirty(ds->ds_dbuf, tx);
 		zapobj = zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx);
 		ds->ds_phys->ds_userrefs_obj = zapobj;
 	}
 	ds->ds_userrefs++;
 	mutex_exit(&ds->ds_lock);
 
 	/* tag -> time the hold was taken */
 	VERIFY(0 == zap_add(mos, zapobj, htag, 8, 1, &now, tx));
 
 	if (ha->temphold) {
 		VERIFY(0 == dsl_pool_user_hold(dp, ds->ds_object,
 		    htag, &now, tx));
 	}
 
 	spa_history_log_internal(LOG_DS_USER_HOLD,
 	    dp->dp_spa, tx, "<%s> temp = %d dataset = %llu", htag,
 	    (int)ha->temphold, ds->ds_object);
 }
 
 /*
  * Queue a user-hold sync task for the snapshot dsname@ha->snapname.
  * Also used as a dmu_objset_find() callback.  A missing snapshot is
  * ignored for recursive holds; for any other failure the dataset name
  * is saved in ha->failed and the error is returned.
  */
 static int
 dsl_dataset_user_hold_one(const char *dsname, void *arg)
 {
 	struct dsl_ds_holdarg *ha = arg;
 	dsl_dataset_t *ds;
 	char *snapname;
 	int err;
 
 	/* kmem_asprintf() sizes the buffer for "dsname@snapname" + NUL */
 	snapname = kmem_asprintf("%s@%s", dsname, ha->snapname);
 	err = dsl_dataset_hold(snapname, ha->dstg, &ds);
 	strfree(snapname);
 
 	if (err == 0) {
 		ha->gotone = B_TRUE;
 		dsl_sync_task_create(ha->dstg, dsl_dataset_user_hold_check,
 		    dsl_dataset_user_hold_sync, ds, ha, 0);
 		return (0);
 	}
 
 	if (err == ENOENT && ha->recursive)
 		return (0);
 
 	(void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
 	return (err);
 }
 
 /*
  * Place the user hold 'htag' on an already-held dataset by running the
  * hold check/sync pair as a single sync task.  Returns that task's
  * result.
  */
 int
 dsl_dataset_user_hold_for_send(dsl_dataset_t *ds, char *htag,
     boolean_t temphold)
 {
 	struct dsl_ds_holdarg *holdarg;
 	int err;
 
 	/* zeroed, so every field other than the two set below is 0 */
 	holdarg = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
 	holdarg->temphold = temphold;
 	holdarg->htag = htag;
 
 	err = dsl_sync_task_do(ds->ds_dir->dd_pool,
 	    dsl_dataset_user_hold_check, dsl_dataset_user_hold_sync,
 	    ds, holdarg, 0);
 
 	kmem_free(holdarg, sizeof (struct dsl_ds_holdarg));
 	return (err);
 }
 
 /*
  * Place the user hold 'htag' on snapshot dsname@snapname, and on the
  * same-named snapshot of every child when 'recursive' is set.
  *
  * If cleanup_fd is not -1 the hold must be temporary (else EINVAL), and
  * each successful hold is registered for release through the on-exit
  * mechanism associated with that fd.
  *
  * On failure the name of a dataset that failed is copied back into the
  * caller's 'dsname' buffer.
  * NOTE(review): that copy is bounded by sizeof (ha->failed), so it
  * assumes the caller's buffer is at least that large - confirm against
  * callers.
  */
 int
 dsl_dataset_user_hold(char *dsname, char *snapname, char *htag,
     boolean_t recursive, boolean_t temphold, int cleanup_fd)
 {
 	struct dsl_ds_holdarg *ha;
 	dsl_sync_task_t *dst;
 	spa_t *spa;
 	int error;
 	minor_t minor = 0;
 
 	if (cleanup_fd != -1) {
 		/* Currently we only support cleanup-on-exit of tempholds. */
 		if (!temphold)
 			return (EINVAL);
 		error = zfs_onexit_fd_hold(cleanup_fd, &minor);
 		if (error)
 			return (error);
 	}
 
 	ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
 
 	/* default "failed" name in case nothing more specific is recorded */
 	(void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
 
 	error = spa_open(dsname, &spa, FTAG);
 	if (error) {
 		kmem_free(ha, sizeof (struct dsl_ds_holdarg));
 		if (cleanup_fd != -1)
 			zfs_onexit_fd_rele(cleanup_fd);
 		return (error);
 	}
 
 	ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
 	ha->htag = htag;
 	ha->snapname = snapname;
 	ha->recursive = recursive;
 	ha->temphold = temphold;
 
 	/* queue one sync task per snapshot, then run the whole group */
 	if (recursive) {
 		error = dmu_objset_find(dsname, dsl_dataset_user_hold_one,
 		    ha, DS_FIND_CHILDREN);
 	} else {
 		error = dsl_dataset_user_hold_one(dsname, ha);
 	}
 	if (error == 0)
 		error = dsl_sync_task_group_wait(ha->dstg);
 
 	/*
 	 * Walk every queued task, whether or not the group ran: note a
 	 * failing dataset, register on-exit cleanup where requested, and
 	 * drop the hold taken by dsl_dataset_user_hold_one().
 	 */
 	for (dst = list_head(&ha->dstg->dstg_tasks); dst;
 	    dst = list_next(&ha->dstg->dstg_tasks, dst)) {
 		dsl_dataset_t *ds = dst->dst_arg1;
 
 		if (dst->dst_err) {
 			/* report the file system name, without "@snap" */
 			dsl_dataset_name(ds, ha->failed);
 			*strchr(ha->failed, '@') = '\0';
 		} else if (error == 0 && minor != 0 && temphold) {
 			/*
 			 * If this hold is to be released upon process exit,
 			 * register that action now.
 			 */
 			dsl_register_onexit_hold_cleanup(ds, htag, minor);
 		}
 		dsl_dataset_rele(ds, ha->dstg);
 	}
 
 	/* a recursive hold that matched no snapshot at all is an error */
 	if (error == 0 && recursive && !ha->gotone)
 		error = ENOENT;
 
 	if (error)
 		(void) strlcpy(dsname, ha->failed, sizeof (ha->failed));
 
 	dsl_sync_task_group_destroy(ha->dstg);
 
 	kmem_free(ha, sizeof (struct dsl_ds_holdarg));
 	spa_close(spa, FTAG);
 	if (cleanup_fd != -1)
 		zfs_onexit_fd_rele(cleanup_fd);
 	return (error);
 }
 
 /*
  * Per-dataset argument for the user-release sync task.  Allocated in
  * dsl_dataset_user_release_one() and freed in dsl_dataset_user_release().
  */
 struct dsl_ds_releasearg {
 	dsl_dataset_t *ds;	/* snapshot whose hold is being released */
 	const char *htag;	/* tag of the hold to release */
 	boolean_t own;		/* do we own or just hold ds? */
 };
 
 /*
  * Determine whether releasing the hold 'htag' on 'ds' could lead to the
  * dataset being destroyed: it is the last user reference, the dataset
  * has a ds_num_children of 1, and it is marked for deferred destroy.
  * The answer is stored in *might_destroy.
  *
  * Returns 0 if the tag exists, ESRCH if it does not, or another
  * zap_lookup() error.
  */
 static int
 dsl_dataset_release_might_destroy(dsl_dataset_t *ds, const char *htag,
     boolean_t *might_destroy)
 {
 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 	uint64_t userrefs_obj;
 	uint64_t value;
 	int err;
 
 	*might_destroy = B_FALSE;
 
 	mutex_enter(&ds->ds_lock);
 	userrefs_obj = ds->ds_phys->ds_userrefs_obj;
 	if (userrefs_obj == 0) {
 		/* no userrefs object, so the tag can't possibly exist */
 		err = ESRCH;
 	} else {
 		/* make sure the tag exists */
 		err = zap_lookup(mos, userrefs_obj, htag, 8, 1, &value);
 		if (err == ENOENT) {
 			err = ESRCH;
 		} else if (err == 0 && ds->ds_userrefs == 1 &&
 		    ds->ds_phys->ds_num_children == 1 &&
 		    DS_IS_DEFER_DESTROY(ds)) {
 			*might_destroy = B_TRUE;
 		}
 	}
 	mutex_exit(&ds->ds_lock);
 
 	return (err);
 }
 
 /*
  * Check half of the user-release sync task.  arg1 is the struct
  * dsl_ds_releasearg; 'tag' is passed on to the destroy check.
  *
  * Returns ENOTSUP on pools older than SPA_VERSION_USERREFS, any error
  * from dsl_dataset_release_might_destroy(), EBUSY if (in syncing
  * context) the release might destroy a dataset we do not own, and
  * otherwise the result of dsl_dataset_destroy_check() when a destroy
  * might follow.
  */
 static int
 dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx)
 {
 	struct dsl_ds_releasearg *ra = arg1;
 	dsl_dataset_t *ds = ra->ds;
 	struct dsl_ds_destroyarg dsda = {0};
 	boolean_t might_destroy;
 	int error;
 
 	if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS)
 		return (ENOTSUP);
 
 	error = dsl_dataset_release_might_destroy(ds, ra->htag, &might_destroy);
 	if (error != 0)
 		return (error);
 
 	if (!might_destroy)
 		return (0);
 
 	/*
 	 * If we're not prepared to remove the snapshot,
 	 * we can't allow the release to happen right now.
 	 */
 	if (dmu_tx_is_syncing(tx) && !ra->own)
 		return (EBUSY);
 
 	dsda.ds = ds;
 	dsda.releasing = B_TRUE;
 	return (dsl_dataset_destroy_check(&dsda, tag, tx));
 }
 
 /*
  * Sync half of the user-release sync task.  Drops one user reference,
  * removes the tag from the pool's hold bookkeeping (ENOENT from that
  * call is tolerated) and from the dataset's userrefs ZAP object, logs
  * the release, and destroys the dataset if this was the last reference
  * on a snapshot marked for deferred destroy.
  */
 static void
 dsl_dataset_user_release_sync(void *arg1, void *tag, dmu_tx_t *tx)
 {
 	struct dsl_ds_releasearg *ra = arg1;
 	dsl_dataset_t *ds = ra->ds;
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	objset_t *mos = dp->dp_meta_objset;
 	uint64_t zapobj;
 	uint64_t dsobj = ds->ds_object;
 	uint64_t refs;
 	int error;
 
 	/* snapshot the remaining count under the lock for the log message */
 	mutex_enter(&ds->ds_lock);
 	ds->ds_userrefs--;
 	refs = ds->ds_userrefs;
 	mutex_exit(&ds->ds_lock);
 	error = dsl_pool_user_release(dp, ds->ds_object, ra->htag, tx);
 	VERIFY(error == 0 || error == ENOENT);
 	zapobj = ds->ds_phys->ds_userrefs_obj;
 	VERIFY(0 == zap_remove(mos, zapobj, ra->htag, tx));
 
 	spa_history_log_internal(LOG_DS_USER_RELEASE,
 	    dp->dp_spa, tx, "<%s> %lld dataset = %llu",
 	    ra->htag, (longlong_t)refs, dsobj);
 
 	/* same condition as might_destroy, now that the count has dropped */
 	if (ds->ds_userrefs == 0 && ds->ds_phys->ds_num_children == 1 &&
 	    DS_IS_DEFER_DESTROY(ds)) {
 		struct dsl_ds_destroyarg dsda = {0};
 
 		ASSERT(ra->own);
 		dsda.ds = ds;
 		dsda.releasing = B_TRUE;
 		/* We already did the destroy_check */
 		dsl_dataset_destroy_sync(&dsda, tag, tx);
 	}
 }
 
 /*
  * Queue a user-release sync task for the snapshot dsname@ha->snapname.
  * Also used as a dmu_objset_find() callback.  A missing snapshot is
  * ignored for recursive releases.  When the release might destroy the
  * snapshot, it is unmounted (kernel only) and the hold is upgraded to
  * exclusive ownership first; EBUSY is returned if ownership cannot be
  * obtained.  On success the dataset stays held (or owned) under
  * ha->dstg and is released by dsl_dataset_user_release().
  */
 static int
 dsl_dataset_user_release_one(const char *dsname, void *arg)
 {
 	struct dsl_ds_holdarg *ha = arg;
 	struct dsl_ds_releasearg *ra;
 	dsl_dataset_t *ds;
 	int error;
 	void *dtag = ha->dstg;
 	char *name;
 	boolean_t own = B_FALSE;
 	boolean_t might_destroy;
 
 	/* alloc a buffer to hold dsname@snapname, plus the terminating NUL */
 	name = kmem_asprintf("%s@%s", dsname, ha->snapname);
 	error = dsl_dataset_hold(name, dtag, &ds);
 	strfree(name);
 	if (error == ENOENT && ha->recursive)
 		return (0);
 	/* recorded unconditionally; only reported if an error follows */
 	(void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
 	if (error)
 		return (error);
 
 	ha->gotone = B_TRUE;
 
 	ASSERT(dsl_dataset_is_snapshot(ds));
 
 	error = dsl_dataset_release_might_destroy(ds, ha->htag, &might_destroy);
 	if (error) {
 		dsl_dataset_rele(ds, dtag);
 		return (error);
 	}
 
 	if (might_destroy) {
 #ifdef _KERNEL
 		name = kmem_asprintf("%s@%s", dsname, ha->snapname);
 		error = zfs_unmount_snap(name, NULL);
 		strfree(name);
 		if (error) {
 			dsl_dataset_rele(ds, dtag);
 			return (error);
 		}
 #endif
 		if (!dsl_dataset_tryown(ds, B_TRUE, dtag)) {
 			dsl_dataset_rele(ds, dtag);
 			return (EBUSY);
 		} else {
 			own = B_TRUE;
 			dsl_dataset_make_exclusive(ds, dtag);
 		}
 	}
 
 	/* freed by dsl_dataset_user_release() once the task group is done */
 	ra = kmem_alloc(sizeof (struct dsl_ds_releasearg), KM_SLEEP);
 	ra->ds = ds;
 	ra->htag = ha->htag;
 	ra->own = own;
 	dsl_sync_task_create(ha->dstg, dsl_dataset_user_release_check,
 	    dsl_dataset_user_release_sync, ra, dtag, 0);
 
 	return (0);
 }
 
 /*
  * Release the user hold 'htag' from snapshot dsname@snapname, and from
  * the same-named snapshot of every child when 'recursive' is set.  The
  * whole operation is retried for as long as it fails with EBUSY.
  *
  * On failure (other than EBUSY) the name of a dataset that failed is
  * copied back into the caller's 'dsname' buffer.
  * NOTE(review): that copy is bounded by sizeof (ha->failed), so it
  * assumes the caller's buffer is at least that large - confirm against
  * callers.
  */
 int
 dsl_dataset_user_release(char *dsname, char *snapname, char *htag,
     boolean_t recursive)
 {
 	struct dsl_ds_holdarg *ha;
 	dsl_sync_task_t *dst;
 	spa_t *spa;
 	int error;
 
 top:
 	ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
 
 	(void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
 
 	error = spa_open(dsname, &spa, FTAG);
 	if (error) {
 		kmem_free(ha, sizeof (struct dsl_ds_holdarg));
 		return (error);
 	}
 
 	ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
 	ha->htag = htag;
 	ha->snapname = snapname;
 	ha->recursive = recursive;
 	/* queue one sync task per snapshot, then run the whole group */
 	if (recursive) {
 		error = dmu_objset_find(dsname, dsl_dataset_user_release_one,
 		    ha, DS_FIND_CHILDREN);
 	} else {
 		error = dsl_dataset_user_release_one(dsname, ha);
 	}
 	if (error == 0)
 		error = dsl_sync_task_group_wait(ha->dstg);
 
 	/*
 	 * Walk every queued task, whether or not the group ran: note a
 	 * failing dataset, drop the ownership or hold taken by
 	 * dsl_dataset_user_release_one(), and free its argument.
 	 */
 	for (dst = list_head(&ha->dstg->dstg_tasks); dst;
 	    dst = list_next(&ha->dstg->dstg_tasks, dst)) {
 		struct dsl_ds_releasearg *ra = dst->dst_arg1;
 		dsl_dataset_t *ds = ra->ds;
 
 		if (dst->dst_err)
 			dsl_dataset_name(ds, ha->failed);
 
 		if (ra->own)
 			dsl_dataset_disown(ds, ha->dstg);
 		else
 			dsl_dataset_rele(ds, ha->dstg);
 
 		kmem_free(ra, sizeof (struct dsl_ds_releasearg));
 	}
 
 	/* a recursive release that matched no snapshot at all is an error */
 	if (error == 0 && recursive && !ha->gotone)
 		error = ENOENT;
 
 	/* dsname is left intact on EBUSY because it is reused by the retry */
 	if (error && error != EBUSY)
 		(void) strlcpy(dsname, ha->failed, sizeof (ha->failed));
 
 	dsl_sync_task_group_destroy(ha->dstg);
 	kmem_free(ha, sizeof (struct dsl_ds_holdarg));
 	spa_close(spa, FTAG);
 
 	/*
 	 * We can get EBUSY if we were racing with deferred destroy and
 	 * dsl_dataset_user_release_check() hadn't done the necessary
 	 * open context setup.  We can also get EBUSY if we're racing
 	 * with destroy and that thread is the ds_owner.  Either way
 	 * the busy condition should be transient, and we should retry
 	 * the release operation.
 	 */
 	if (error == EBUSY)
 		goto top;
 
 	return (error);
 }
 
 /*
  * Release the temporary user hold 'htag' from the snapshot whose object
  * number is 'dsobj'.  Called at spa_load time (with retry == B_FALSE)
  * to release a stale hold, and by the onexit code (with
  * retry == B_TRUE).  Returns the error from the hold lookup or from
  * dsl_dataset_user_release().
  */
 int
 dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, char *htag,
     boolean_t retry)
 {
 	int error;
 
 	for (;;) {
 		dsl_dataset_t *ds;
 		char *fullname;
 		char *snapname;
 		int buflen;
 
 		rw_enter(&dp->dp_config_rwlock, RW_READER);
 		error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
 		rw_exit(&dp->dp_config_rwlock);
 		if (error != 0)
 			break;
 
 		/* fetch the current "fs@snap" name, then drop the hold */
 		buflen = dsl_dataset_namelen(ds) + 1;
 		fullname = kmem_alloc(buflen, KM_SLEEP);
 		dsl_dataset_name(ds, fullname);
 		dsl_dataset_rele(ds, FTAG);
 
 		/* split the name in place at the '@' */
 		snapname = strchr(fullname, '@');
 		*snapname++ = '\0';
 
 		error = dsl_dataset_user_release(fullname, snapname, htag,
 		    B_FALSE);
 		kmem_free(fullname, buflen);
 
 		/*
 		 * The object can't have been destroyed because of the user
 		 * hold, but it might have been renamed in the meantime,
 		 * which shows up as ENOENT.  Look the name up again if the
 		 * caller asked for retries.
 		 *
 		 * Using the dsobj all the way through would avoid ENOENT
 		 * entirely, but we might need to unmount the snapshot and
 		 * there's currently no way to look up a vfsp using a ZFS
 		 * object id.
 		 */
 		if (error != ENOENT || !retry)
 			break;
 	}
 
 	return (error);
 }
 
 /*
  * Allocate an nvlist in *nvp and fill it with one uint64 entry per user
  * hold on the dataset named 'dsname', keyed by hold tag.  The list is
  * left empty when the dataset has no userrefs object.  Returns 0, or
  * the error from dsl_dataset_hold() (in which case *nvp is untouched).
  */
 int
 dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp)
 {
 	dsl_dataset_t *ds;
 	zap_attribute_t *za;
 	zap_cursor_t zc;
 	int error;
 
 	error = dsl_dataset_hold(dsname, FTAG, &ds);
 	if (error != 0)
 		return (error);
 
 	VERIFY(0 == nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP));
 
 	if (ds->ds_phys->ds_userrefs_obj != 0) {
 		za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
 
 		/* copy every entry of the userrefs zap into the nvlist */
 		zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset,
 		    ds->ds_phys->ds_userrefs_obj);
 		while (zap_cursor_retrieve(&zc, za) == 0) {
 			VERIFY(0 == nvlist_add_uint64(*nvp, za->za_name,
 			    za->za_first_integer));
 			zap_cursor_advance(&zc);
 		}
 		zap_cursor_fini(&zc);
 
 		kmem_free(za, sizeof (zap_attribute_t));
 	}
 
 	dsl_dataset_rele(ds, FTAG);
 	return (0);
 }
 
 /*
  * dmu_objset_find() callback that destroys the named dataset if it is
  * flagged inconsistent.  It always returns 0, so the walk carries on to
  * the remaining datasets even when one of them cannot be owned or
  * destroyed.
  */
 /* ARGSUSED */
 int
 dsl_destroy_inconsistent(const char *dsname, void *arg)
 {
 	dsl_dataset_t *ds;
 
 	/* a dataset we cannot own is simply skipped */
 	if (dsl_dataset_own(dsname, B_TRUE, FTAG, &ds) != 0)
 		return (0);
 
 	if (DS_IS_INCONSISTENT(ds))
 		(void) dsl_dataset_destroy(ds, FTAG, B_FALSE);
 	else
 		dsl_dataset_disown(ds, FTAG);
 
 	return (0);
 }
 
 /*
  * Return (in *usedp) the amount of space written in new that is not
  * present in oldsnap.  New may be a snapshot or the head.  Old must be
  * a snapshot before new, in new's filesystem (or its origin).  If not then
  * fail and return EINVAL.
  *
  * The compressed and uncompressed equivalents are returned in *compp and
  * *uncompp.  An error from dsl_dataset_hold_obj() is returned as is.
  *
  * The written space is calculated by considering two components:  First, we
  * ignore any freed space, and calculate the written as new's used space
  * minus old's used space.  Next, we add in the amount of space that was freed
  * between the two snapshots, thus reducing new's used space relative to old's.
  * Specifically, this is the space that was born before old->ds_creation_txg,
  * and freed before new (ie. on new's deadlist or a previous deadlist).
  *
  * space freed                         [---------------------]
  * snapshots                       ---O-------O--------O-------O------
  *                                         oldsnap            new
  */
 int
 dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
 {
 	int err = 0;
 	uint64_t snapobj;
 	dsl_pool_t *dp = new->ds_dir->dd_pool;
 
 	/* first component: new's space minus oldsnap's space */
 	*usedp = 0;
 	*usedp += new->ds_phys->ds_referenced_bytes;
 	*usedp -= oldsnap->ds_phys->ds_referenced_bytes;
 
 	*compp = 0;
 	*compp += new->ds_phys->ds_compressed_bytes;
 	*compp -= oldsnap->ds_phys->ds_compressed_bytes;
 
 	*uncompp = 0;
 	*uncompp += new->ds_phys->ds_uncompressed_bytes;
 	*uncompp -= oldsnap->ds_phys->ds_uncompressed_bytes;
 
 	/* second component: walk back from new to oldsnap, adding deadlists */
 	rw_enter(&dp->dp_config_rwlock, RW_READER);
 	snapobj = new->ds_object;
 	while (snapobj != oldsnap->ds_object) {
 		dsl_dataset_t *snap;
 		uint64_t used, comp, uncomp;
 
 		/* 'new' is already held by the caller; hold anything else */
 		if (snapobj == new->ds_object) {
 			snap = new;
 		} else {
 			err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
 			if (err != 0)
 				break;
 		}
 
 		if (snap->ds_phys->ds_prev_snap_txg ==
 		    oldsnap->ds_phys->ds_creation_txg) {
 			/*
 			 * The blocks in the deadlist can not be born after
 			 * ds_prev_snap_txg, so get the whole deadlist space,
 			 * which is more efficient (especially for old-format
 			 * deadlists).  Unfortunately the deadlist code
 			 * doesn't have enough information to make this
 			 * optimization itself.
 			 */
 			dsl_deadlist_space(&snap->ds_deadlist,
 			    &used, &comp, &uncomp);
 		} else {
 			dsl_deadlist_space_range(&snap->ds_deadlist,
 			    0, oldsnap->ds_phys->ds_creation_txg,
 			    &used, &comp, &uncomp);
 		}
 		*usedp += used;
 		*compp += comp;
 		*uncompp += uncomp;
 
 		/*
 		 * If we get to the beginning of the chain of snapshots
 		 * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap
 		 * was not a snapshot of/before new.
 		 */
 		snapobj = snap->ds_phys->ds_prev_snap_obj;
 		/* only release holds taken inside this loop */
 		if (snap != new)
 			dsl_dataset_rele(snap, FTAG);
 		if (snapobj == 0) {
 			err = EINVAL;
 			break;
 		}
 
 	}
 	rw_exit(&dp->dp_config_rwlock);
 	return (err);
 }
 
 /*
  * Return (in *usedp) the amount of space that will be reclaimed if
  * firstsnap, lastsnap, and all snapshots in between are deleted.  The
  * compressed and uncompressed equivalents are returned in *compp and
  * *uncompp.  Returns EINVAL if the two snapshots are not in the same
  * dsl_dir or firstsnap was created after lastsnap; an error from
  * dsl_dataset_hold_obj() is returned as is.
  *
  * blocks that would be freed            [---------------------------]
  * snapshots                       ---O-------O--------O-------O--------O
  *                                        firstsnap        lastsnap
  *
  * The blocks in question were born after the snap before firstsnap
  * (birth > firstsnap->prev_snap_txg) and died before the snap after
  * lastsnap (ie, they are on lastsnap->ds_next->ds_deadlist or an
  * earlier deadlist).  So we visit the relevant deadlists, from the snap
  * after lastsnap backward to the snap after firstsnap, and total the
  * space on each that was born after the snap before firstsnap.
  */
 int
 dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
     dsl_dataset_t *lastsnap,
     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
 {
 	dsl_pool_t *dp = firstsnap->ds_dir->dd_pool;
 	uint64_t snapobj;
 	int err = 0;
 
 	ASSERT(dsl_dataset_is_snapshot(firstsnap));
 	ASSERT(dsl_dataset_is_snapshot(lastsnap));
 
 	/* both snapshots must share a dsl_dir ... */
 	if (firstsnap->ds_dir != lastsnap->ds_dir)
 		return (EINVAL);
 	/* ... and firstsnap must not be newer than lastsnap */
 	if (firstsnap->ds_phys->ds_creation_txg >
 	    lastsnap->ds_phys->ds_creation_txg)
 		return (EINVAL);
 
 	*uncompp = 0;
 	*compp = 0;
 	*usedp = 0;
 
 	rw_enter(&dp->dp_config_rwlock, RW_READER);
 	snapobj = lastsnap->ds_phys->ds_next_snap_obj;
 	while (err == 0 && snapobj != firstsnap->ds_object) {
 		dsl_dataset_t *ds;
 		uint64_t used, comp, uncomp;
 
 		err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds);
 		if (err != 0)
 			continue;	/* loop condition ends the walk */
 
 		dsl_deadlist_space_range(&ds->ds_deadlist,
 		    firstsnap->ds_phys->ds_prev_snap_txg, UINT64_MAX,
 		    &used, &comp, &uncomp);
 		*usedp += used;
 		*compp += comp;
 		*uncompp += uncomp;
 
 		/* step back to the previous snapshot */
 		snapobj = ds->ds_phys->ds_prev_snap_obj;
 		ASSERT3U(snapobj, !=, 0);
 		dsl_dataset_rele(ds, FTAG);
 	}
 	rw_exit(&dp->dp_config_rwlock);
 
 	return (err);
 }
Index: user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
===================================================================
--- user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c	(revision 247191)
+++ user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c	(revision 247192)
@@ -1,1750 +1,1751 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/dsl_scan.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_synctask.h>
 #include <sys/dnode.h>
 #include <sys/dmu_tx.h>
 #include <sys/dmu_objset.h>
 #include <sys/arc.h>
 #include <sys/zap.h>
 #include <sys/zio.h>
 #include <sys/zfs_context.h>
 #include <sys/fs/zfs.h>
 #include <sys/zfs_znode.h>
 #include <sys/spa_impl.h>
 #include <sys/vdev_impl.h>
 #include <sys/zil_impl.h>
 #include <sys/zio_checksum.h>
 #include <sys/ddt.h>
 #include <sys/sa.h>
 #include <sys/sa_impl.h>
 #include <sys/zfeature.h>
 #ifdef _KERNEL
 #include <sys/zfs_vfsops.h>
 #endif
 
 /*
  * Signature shared by the per-block scan callbacks.  Pointers to functions
  * of this type are stored in scan_funcs[] and invoked with the pool, the
  * block pointer being visited and that block's bookmark.
  */
 typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);
 
 /* Forward declarations of file-local callbacks and helpers. */
 static scan_cb_t dsl_scan_defrag_cb;
 static scan_cb_t dsl_scan_scrub_cb;
 static scan_cb_t dsl_scan_remove_cb;
 static dsl_syncfunc_t dsl_scan_cancel_sync;
 static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx);
 
 /*
  * Scan tunables.  Every variable in this group is also registered with
  * TUNABLE_INT() and a SYSCTL_*() macro further down in this file.
  */
 unsigned int zfs_top_maxinflight = 32;	/* maximum I/Os per top-level */
 unsigned int zfs_resilver_delay = 2;	/* number of ticks to delay resilver */
 unsigned int zfs_scrub_delay = 4;	/* number of ticks to delay scrub */
 unsigned int zfs_scan_idle = 50;	/* idle window in clock ticks */
 
 unsigned int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */
 unsigned int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
 unsigned int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver
 						 per txg */
 boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
 boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetching */
 
 /*
  * Register the scan tunables under the _vfs_zfs sysctl node.  Each one
  * gets a TUNABLE_INT() entry named "vfs.zfs.<name>" and a CTLFLAG_RW
  * sysctl with a short description.
  *
  * NOTE(review): zfs_no_scrub_io and zfs_no_scrub_prefetch are declared
  * boolean_t but registered through TUNABLE_INT()/SYSCTL_INT(); this
  * presumably relies on boolean_t being int-sized -- confirm.
  */
 SYSCTL_DECL(_vfs_zfs);
 TUNABLE_INT("vfs.zfs.top_maxinflight", &zfs_top_maxinflight);
 SYSCTL_UINT(_vfs_zfs, OID_AUTO, top_maxinflight, CTLFLAG_RW,
     &zfs_top_maxinflight, 0, "Maximum I/Os per top-level vdev");
 TUNABLE_INT("vfs.zfs.resilver_delay", &zfs_resilver_delay);
 SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_delay, CTLFLAG_RW,
     &zfs_resilver_delay, 0, "Number of ticks to delay resilver");
 TUNABLE_INT("vfs.zfs.scrub_delay", &zfs_scrub_delay);
 SYSCTL_UINT(_vfs_zfs, OID_AUTO, scrub_delay, CTLFLAG_RW,
     &zfs_scrub_delay, 0, "Number of ticks to delay scrub");
 TUNABLE_INT("vfs.zfs.scan_idle", &zfs_scan_idle);
 SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_idle, CTLFLAG_RW,
     &zfs_scan_idle, 0, "Idle scan window in clock ticks");
 TUNABLE_INT("vfs.zfs.scan_min_time_ms", &zfs_scan_min_time_ms);
 SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_min_time_ms, CTLFLAG_RW,
     &zfs_scan_min_time_ms, 0, "Min millisecs to scrub per txg");
 TUNABLE_INT("vfs.zfs.free_min_time_ms", &zfs_free_min_time_ms);
 SYSCTL_UINT(_vfs_zfs, OID_AUTO, free_min_time_ms, CTLFLAG_RW,
     &zfs_free_min_time_ms, 0, "Min millisecs to free per txg");
 TUNABLE_INT("vfs.zfs.resilver_min_time_ms", &zfs_resilver_min_time_ms);
 SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_min_time_ms, CTLFLAG_RW,
     &zfs_resilver_min_time_ms, 0, "Min millisecs to resilver per txg");
 TUNABLE_INT("vfs.zfs.no_scrub_io", &zfs_no_scrub_io);
 SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_io, CTLFLAG_RW,
     &zfs_no_scrub_io, 0, "Disable scrub I/O");
 TUNABLE_INT("vfs.zfs.no_scrub_prefetch", &zfs_no_scrub_prefetch);
 SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_prefetch, CTLFLAG_RW,
     &zfs_no_scrub_prefetch, 0, "Disable scrub prefetching");
 
 /*
  * DDT class limit copied into scn_phys.scn_ddt_class_max when a scrub or
  * resilver is set up (see dsl_scan_setup_sync()).
  */
 enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
 
 /* True when the scan's function is a scrub or a resilver. */
 #define	DSL_SCAN_IS_SCRUB_RESILVER(scn) \
 	((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \
 	(scn)->scn_phys.scn_func == POOL_SCAN_RESILVER)
 
 /* Defined elsewhere; used by dsl_scan_check_pause() as a time limit. */
 extern int zfs_txg_timeout;
 
 /*
  * Per-function block callbacks, indexed by scn_phys.scn_func.  Slot 0 has
  * no callback; scrub and resilver share dsl_scan_scrub_cb().
  */
 /* the order has to match pool_scan_type */
 static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
 	NULL,
 	dsl_scan_scrub_cb,	/* POOL_SCAN_SCRUB */
 	dsl_scan_scrub_cb,	/* POOL_SCAN_RESILVER */
 };
 
 /*
  * Allocate the pool's in-core scan state (dp->dp_scan) and load whatever
  * scan state is stored in the pool directory object.
  *
  * dp:  pool being opened.
  * txg: txg recorded in scn_restart_txg when the scan has to be restarted.
  *
  * Returns 0 on success, including the case where no scan state is stored
  * (ENOENT from the DMU_POOL_SCAN lookup); any other lookup error is
  * returned as-is.  The allocated dsl_scan_t stays attached to the pool in
  * every case and is released by dsl_scan_fini().
  */
 int
 dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
 {
 	int err;
 	dsl_scan_t *scn;
 	spa_t *spa = dp->dp_spa;
 	uint64_t f;
 
 	scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP);
 	scn->scn_dp = dp;
 
 	/* The presence of "scrub_func" indicates old-style scrub state. */
 	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    "scrub_func", sizeof (uint64_t), 1, &f);
 	if (err == 0) {
 		/*
 		 * There was an old-style scrub in progress.  Restart a
 		 * new-style scrub from the beginning.
 		 */
 		scn->scn_restart_txg = txg;
 		/* uint64_t is not always unsigned long long; cast for %llu. */
 		zfs_dbgmsg("old-style scrub was in progress; "
 		    "restarting new-style scrub in txg %llu",
 		    (u_longlong_t)scn->scn_restart_txg);
 
 		/*
 		 * Load the queue obj from the old location so that it
 		 * can be freed by dsl_scan_done().
 		 */
 		(void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 		    "scrub_queue", sizeof (uint64_t), 1,
 		    &scn->scn_phys.scn_queue_obj);
 	} else {
 		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
 		    &scn->scn_phys);
 		if (err == ENOENT)
 			return (0);
 		else if (err)
 			return (err);
 
 		if (scn->scn_phys.scn_state == DSS_SCANNING &&
 		    spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) {
 			/*
 			 * A new-type scrub was in progress on an old
 			 * pool, and the pool was accessed by old
 			 * software.  Restart from the beginning, since
 			 * the old software may have changed the pool in
 			 * the meantime.
 			 */
 			scn->scn_restart_txg = txg;
 			zfs_dbgmsg("new-style scrub was modified "
 			    "by old software; restarting in txg %llu",
 			    (u_longlong_t)scn->scn_restart_txg);
 		}
 	}
 
 	spa_scan_stat_init(spa);
 	return (0);
 }
 
 /*
  * Release the in-core scan state attached to the pool, if there is any,
  * and clear the pool's pointer to it.
  */
 void
 dsl_scan_fini(dsl_pool_t *dp)
 {
 	if (dp->dp_scan == NULL)
 		return;
 
 	kmem_free(dp->dp_scan, sizeof (dsl_scan_t));
 	dp->dp_scan = NULL;
 }
 
 /*
  * Pre-flight check for starting a scan: fails with EBUSY while a scan is
  * already in the DSS_SCANNING state, otherwise returns 0.  arg1 is the
  * dsl_scan_t; arg2 and tx are unused.  (Presumably run as the check half
  * of a sync task, the way dsl_scan_cancel_check() is -- confirm at the
  * call site.)
  */
 /* ARGSUSED */
 static int
 dsl_scan_setup_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_scan_t *scan = arg1;
 
 	return (scan->scn_phys.scn_state == DSS_SCANNING ? EBUSY : 0);
 }
 
 /*
  * Start a new scan: reset the persistent scan state, create the scan
  * queue object, write the state out and log the event.
  *
  * arg1: the pool's dsl_scan_t.
  * arg2: pointer to the pool_scan_func_t to run; must lie strictly between
  *       POOL_SCAN_NONE and POOL_SCAN_FUNCS.
  * tx:   transaction the on-disk changes are made in; its txg becomes the
  *       scan's initial scn_max_txg.
  */
 /* ARGSUSED */
 static void
 dsl_scan_setup_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_scan_t *scn = arg1;
 	pool_scan_func_t *funcp = arg2;
 	dmu_object_type_t ot = 0;
 	dsl_pool_t *dp = scn->scn_dp;
 	spa_t *spa = dp->dp_spa;
 
 	ASSERT(scn->scn_phys.scn_state != DSS_SCANNING);
 	ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS);
 	bzero(&scn->scn_phys, sizeof (scn->scn_phys));
 	scn->scn_phys.scn_func = *funcp;
 	scn->scn_phys.scn_state = DSS_SCANNING;
 	scn->scn_phys.scn_min_txg = 0;
 	scn->scn_phys.scn_max_txg = tx->tx_txg;
 	scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */
 	scn->scn_phys.scn_start_time = gethrestime_sec();
 	scn->scn_phys.scn_errors = 0;
 	scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc;
 	scn->scn_restart_txg = 0;
 	spa_scan_stat_init(spa);
 
 	if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
 		scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max;
 
 		/* rewrite all disk labels */
 		vdev_config_dirty(spa->spa_root_vdev);
 
 		/*
 		 * vdev_resilver_needed() is handed pointers to the scan's
 		 * txg range; its result selects which start event is posted.
 		 */
 		if (vdev_resilver_needed(spa->spa_root_vdev,
 		    &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) {
 			spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START);
 		} else {
 			spa_event_notify(spa, NULL, ESC_ZFS_SCRUB_START);
 		}
 
 		spa->spa_scrub_started = B_TRUE;
 		/*
 		 * If this is an incremental scrub, limit the DDT scrub phase
 		 * to just the auto-ditto class (for correctness); the rest
 		 * of the scrub should go faster using top-down pruning.
 		 */
 		if (scn->scn_phys.scn_min_txg > TXG_INITIAL)
 			scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO;
 
 	}
 
 	/* back to the generic stuff */
 
 	/* Allocate the block statistics on first use; always start zeroed. */
 	if (dp->dp_blkstats == NULL) {
 		dp->dp_blkstats =
 		    kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
 	}
 	bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
 
 	/*
 	 * Pools older than SPA_VERSION_DSL_SCRUB get a DMU_OT_ZAP_OTHER
 	 * queue object instead of DMU_OT_SCAN_QUEUE.
 	 */
 	if (spa_version(spa) < SPA_VERSION_DSL_SCRUB)
 		ot = DMU_OT_ZAP_OTHER;
 
 	scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset,
 	    ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx);
 
 	dsl_scan_sync_state(scn, tx);
 
 	/* uint64_t is not always unsigned long long; cast for %llu. */
 	spa_history_log_internal(LOG_POOL_SCAN, spa, tx,
 	    "func=%u mintxg=%llu maxtxg=%llu",
 	    *funcp, (u_longlong_t)scn->scn_phys.scn_min_txg,
 	    (u_longlong_t)scn->scn_phys.scn_max_txg);
 }
 
 /*
  * Tear down the current scan, either because it finished or because it
  * is being cancelled.
  *
  * scn:      the pool's scan state.
  * complete: B_TRUE if the scan ran to completion (state becomes
  *           DSS_FINISHED), B_FALSE otherwise (state becomes DSS_CANCELED).
  * tx:       transaction the on-disk cleanup is done in.
  *
  * Old-style scrub entries and the scan queue object are always removed;
  * everything after that only happens if the scan was in DSS_SCANNING.
  * The caller is responsible for writing the updated scn_phys out (this
  * function does not call dsl_scan_sync_state()).
  */
 /* ARGSUSED */
 static void
 dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
 {
 	/* NULL-terminated list of pool-directory entries left by old scrubs */
 	static const char *old_names[] = {
 		"scrub_bookmark",
 		"scrub_ddt_bookmark",
 		"scrub_ddt_class_max",
 		"scrub_queue",
 		"scrub_min_txg",
 		"scrub_max_txg",
 		"scrub_func",
 		"scrub_errors",
 		NULL
 	};
 
 	dsl_pool_t *dp = scn->scn_dp;
 	spa_t *spa = dp->dp_spa;
 	int i;
 
 	/* Remove any remnants of an old-style scrub. */
 	for (i = 0; old_names[i]; i++) {
 		(void) zap_remove(dp->dp_meta_objset,
 		    DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx);
 	}
 
 	/* Free the work queue object, if one exists. */
 	if (scn->scn_phys.scn_queue_obj != 0) {
 		VERIFY(0 == dmu_object_free(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, tx));
 		scn->scn_phys.scn_queue_obj = 0;
 	}
 
 	/*
 	 * If we were "restarted" from a stopped state, don't bother
 	 * with anything else.
 	 */
 	if (scn->scn_phys.scn_state != DSS_SCANNING)
 		return;
 
 	if (complete)
 		scn->scn_phys.scn_state = DSS_FINISHED;
 	else
 		scn->scn_phys.scn_state = DSS_CANCELED;
 
 	spa_history_log_internal(LOG_POOL_SCAN_DONE, spa, tx,
 	    "complete=%u", complete);
 
 	if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
 		/* Wait until no scrub I/O is in flight any more. */
 		mutex_enter(&spa->spa_scrub_lock);
 		while (spa->spa_scrub_inflight > 0) {
 			cv_wait(&spa->spa_scrub_io_cv,
 			    &spa->spa_scrub_lock);
 		}
 		mutex_exit(&spa->spa_scrub_lock);
 		spa->spa_scrub_started = B_FALSE;
 		spa->spa_scrub_active = B_FALSE;
 
 		/*
 		 * If the scrub/resilver completed, update all DTLs to
 		 * reflect this.  Whether it succeeded or not, vacate
 		 * all temporary scrub DTLs.
 		 */
 		vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
 		    complete ? scn->scn_phys.scn_max_txg : 0, B_TRUE);
 		if (complete) {
 			/* A nonzero scn_min_txg is reported as a resilver. */
 			spa_event_notify(spa, NULL, scn->scn_phys.scn_min_txg ?
 			    ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH);
 		}
 		spa_errlog_rotate(spa);
 
 		/*
 		 * We may have finished replacing a device.
 		 * Let the async thread assess this and handle the detach.
 		 */
 		spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
 	}
 
 	scn->scn_phys.scn_end_time = gethrestime_sec();
 }
 
 /*
  * Check half of the cancel sync task (see dsl_scan_cancel()): there is
  * nothing to cancel, and ENOENT is returned, unless a scan is currently
  * in the DSS_SCANNING state.  arg1 is the dsl_scan_t; arg2 and tx are
  * unused.
  */
 /* ARGSUSED */
 static int
 dsl_scan_cancel_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_scan_t *scan = arg1;
 
 	return (scan->scn_phys.scn_state == DSS_SCANNING ? 0 : ENOENT);
 }
 
 /*
  * Sync half of the cancel sync task (see dsl_scan_cancel()): finish the
  * scan as "not complete" and write the resulting state out.  arg1 is the
  * dsl_scan_t; arg2 is unused.
  */
 /* ARGSUSED */
 static void
 dsl_scan_cancel_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_scan_t *scan = arg1;
 
 	/* B_FALSE: the scan did not run to completion. */
 	dsl_scan_done(scan, B_FALSE, tx);
 	dsl_scan_sync_state(scan, tx);
 }
 
 /*
  * Cancel the scan running on pool dp by dispatching the cancel
  * check/sync pair as a sync task.  Returns whatever dsl_sync_task_do()
  * returns; dsl_scan_cancel_check() yields ENOENT when no scan is in
  * progress.
  */
 int
 dsl_scan_cancel(dsl_pool_t *dp)
 {
 	/* Handed over as arg2; neither callback looks at it. */
 	boolean_t complete = B_FALSE;
 
 	return (dsl_sync_task_do(dp, dsl_scan_cancel_check,
 	    dsl_scan_cancel_sync, dp->dp_scan, &complete, 3));
 }
 
 /*
  * Prototypes for the traversal routines that are referenced before their
  * definitions: dsl_scan_recurse() calls both of them, and they in turn
  * lead back into dsl_scan_recurse().
  */
 static void dsl_scan_visitbp(blkptr_t *bp,
     const zbookmark_t *zb, dnode_phys_t *dnp, arc_buf_t *pbuf,
     dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype,
     dmu_tx_t *tx);
 static void dsl_scan_visitdnode(dsl_scan_t *, dsl_dataset_t *ds,
     dmu_objset_type_t ostype,
     dnode_phys_t *dnp, arc_buf_t *buf, uint64_t object, dmu_tx_t *tx);
 
 /*
  * Free block bp in transaction group txg; simply forwards to zio_free()
  * on the pool's spa.
  */
 void
 dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp)
 {
 	spa_t *spa = dp->dp_spa;
 
 	zio_free(spa, txg, bp);
 }
 
 /*
  * Free block bpp in txg as a child of pio, without waiting for the
  * resulting zio.  The free inherits pio's io_flags.  Asserts that the
  * caller is in the pool's sync context.
  */
 void
 dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
 {
 	zio_t *fio;
 
 	ASSERT(dsl_pool_sync_context(dp));
 
 	fio = zio_free_sync(pio, dp->dp_spa, txg, bpp, BP_GET_PSIZE(bpp),
 	    pio->io_flags);
 	zio_nowait(fio);
 }
 
 /*
  * Upper txg bound for scanning dataset ds: the scan's scn_max_txg,
  * additionally capped at the creation txg when ds is a snapshot.
  */
 static uint64_t
 dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
 {
 	dsl_scan_t *scn = ds->ds_dir->dd_pool->dp_scan;
 	uint64_t scan_max = scn->scn_phys.scn_max_txg;
 
 	if (!dsl_dataset_is_snapshot(ds))
 		return (scan_max);
 
 	return (MIN(scan_max, ds->ds_phys->ds_creation_txg));
 }
 
 /*
  * Write the persistent scan state (scn->scn_phys) to the DMU_POOL_SCAN
  * entry of the pool directory object, as SCAN_PHYS_NUMINTS 64-bit
  * integers, in transaction tx.  A failing update trips the VERIFY.
  */
 static void
 dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx)
 {
 	VERIFY(0 == zap_update(scn->scn_dp->dp_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
 	    &scn->scn_phys, tx));
 }
 
 /*
  * Decide whether the traversal should stop for this txg.
  *
  * scn: scan state; scn_pausing is set and, when zb is given,
  *      scn_phys.scn_bookmark is updated if the scan pauses here.
  * zb:  bookmark of the block about to be visited, or NULL.
  *
  * Returns B_TRUE if the scan is (now or already) pausing, else B_FALSE.
  */
 static boolean_t
 dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_t *zb)
 {
 	uint64_t elapsed_ns;
 	unsigned int min_ms;
 	boolean_t out_of_time;
 
 	/* we never skip user/group accounting objects */
 	if (zb != NULL && (int64_t)zb->zb_object < 0)
 		return (B_FALSE);
 
 	if (scn->scn_pausing)
 		return (B_TRUE); /* we're already pausing */
 
 	if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark))
 		return (B_FALSE); /* we're resuming */
 
 	/* We only know how to resume from level-0 blocks. */
 	if (zb != NULL && zb->zb_level != 0)
 		return (B_FALSE);
 
 	/* A resilver has its own minimum run time per txg. */
 	if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER)
 		min_ms = zfs_resilver_min_time_ms;
 	else
 		min_ms = zfs_scan_min_time_ms;
 	elapsed_ns = gethrtime() - scn->scn_sync_start_time;
 
 	/*
 	 * Pause when the txg timeout has been exceeded, when the minimum
 	 * time is used up and a txg sync is waiting, or when the pool is
 	 * shutting down.  The tests run in that order and stop at the
 	 * first one that holds.
 	 */
 	if (elapsed_ns / NANOSEC > zfs_txg_timeout)
 		out_of_time = B_TRUE;
 	else if (elapsed_ns / MICROSEC > min_ms &&
 	    txg_sync_waiting(scn->scn_dp))
 		out_of_time = B_TRUE;
 	else if (spa_shutting_down(scn->scn_dp->dp_spa))
 		out_of_time = B_TRUE;
 	else
 		out_of_time = B_FALSE;
 
 	if (!out_of_time)
 		return (B_FALSE);
 
 	if (zb != NULL) {
 		dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n",
 		    (longlong_t)zb->zb_objset,
 		    (longlong_t)zb->zb_object,
 		    (longlong_t)zb->zb_level,
 		    (longlong_t)zb->zb_blkid);
 		/* Remember where to pick up again. */
 		scn->scn_phys.scn_bookmark = *zb;
 	}
 	dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n",
 	    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
 	    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
 	    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
 	    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
 	scn->scn_pausing = B_TRUE;
 
 	return (B_TRUE);
 }
 
 /*
  * Argument bundle that dsl_scan_zil() hands to zil_parse(); it reaches
  * dsl_scan_zil_block() and dsl_scan_zil_record() as their "arg".
  */
 typedef struct zil_scan_arg {
 	dsl_pool_t	*zsa_dp;	/* pool being scanned */
 	zil_header_t	*zsa_zh;	/* header of the ZIL being parsed */
 } zil_scan_arg_t;
 
 /*
  * zil_parse() block callback: run the current scan function on ZIL log
  * block bp unless the block can be skipped.  arg is a zil_scan_arg_t.
  * Always returns 0; a failing scan callback trips the VERIFY.
  */
 /* ARGSUSED */
 static int
 dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
 {
 	zil_scan_arg_t *zsa = arg;
 	dsl_pool_t *dp = zsa->zsa_dp;
 	dsl_scan_t *scn = dp->dp_scan;
 	uint64_t objset;
 	zbookmark_t zb;
 
 	/* Born at or before the current minimum txg: nothing to do. */
 	if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
 		return (0);
 
 	/*
 	 * One block ("stubby") can be allocated a long time ago; we
 	 * want to visit that one because it has been allocated
 	 * (on-disk) even if it hasn't been claimed (even though for
 	 * scrub there's nothing to do to it).
 	 */
 	if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa))
 		return (0);
 
 	/* The objset id is taken from the ZIL header's log checksum. */
 	objset = zsa->zsa_zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET];
 	SET_BOOKMARK(&zb, objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
 	    bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
 
 	VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
 	return (0);
 }
 
 /*
  * zil_parse() record callback: for TX_WRITE records, run the current
  * scan function on the block the record points at, unless it can be
  * skipped.  Every other record type is ignored.  arg is a zil_scan_arg_t.
  * Always returns 0; a failing scan callback trips the VERIFY.
  */
 /* ARGSUSED */
 static int
 dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
 {
 	zil_scan_arg_t *zsa = arg;
 	zil_header_t *zh;
 	dsl_pool_t *dp;
 	dsl_scan_t *scn;
 	lr_write_t *lr;
 	blkptr_t *bp;
 	zbookmark_t zb;
 
 	if (lrc->lrc_txtype != TX_WRITE)
 		return (0);
 
 	dp = zsa->zsa_dp;
 	scn = dp->dp_scan;
 	zh = zsa->zsa_zh;
 	lr = (lr_write_t *)lrc;
 	bp = &lr->lr_blkptr;
 
 	/* Born at or before the current minimum txg: nothing to do. */
 	if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
 		return (0);
 
 	/*
 	 * birth can be < claim_txg if this record's txg is
 	 * already txg sync'ed (but this log block contains
 	 * other records that are not synced)
 	 */
 	if (claim_txg == 0 || bp->blk_birth < claim_txg)
 		return (0);
 
 	SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
 	    lr->lr_foid, ZB_ZIL_LEVEL,
 	    lr->lr_offset / BP_GET_LSIZE(bp));
 
 	VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
 	return (0);
 }
 
 /*
  * Walk the intent log described by header zh, feeding its blocks and
  * records to dsl_scan_zil_block() and dsl_scan_zil_record().
  */
 static void
 dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh)
 {
 	zil_scan_arg_t cb_arg = { dp, zh };
 	uint64_t claimed_in = zh->zh_claim_txg;
 	zilog_t *log;
 
 	/*
 	 * We only want to visit blocks that have been claimed but not yet
 	 * replayed (or, in read-only mode, blocks that *would* be claimed).
 	 */
 	if (claimed_in == 0 && spa_writeable(dp->dp_spa))
 		return;
 
 	/* A temporary zilog exists only for the duration of the parse. */
 	log = zil_alloc(dp->dp_meta_objset, zh);
 	(void) zil_parse(log, dsl_scan_zil_block, dsl_scan_zil_record,
 	    &cb_arg, claimed_in);
 	zil_free(log);
 }
 
 /*
  * Issue a no-wait ARC prefetch read for block bp, identified by
  * objset/object/blkid, as a child of scn->scn_zio_root.  Nothing is
  * issued when prefetching is disabled, for holes, for blocks born at or
  * before scn_min_txg, or for level-0 blocks that are not dnode blocks.
  * The buf argument is unused.
  */
 /* ARGSUSED */
 static void
 dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp,
     uint64_t objset, uint64_t object, uint64_t blkid)
 {
 	uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
 	zbookmark_t child_zb;
 
 	if (zfs_no_scrub_prefetch)
 		return;
 
 	if (BP_IS_HOLE(bp))
 		return;
 	if (bp->blk_birth <= scn->scn_phys.scn_min_txg)
 		return;
 	if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)
 		return;
 
 	SET_BOOKMARK(&child_zb, objset, object, BP_GET_LEVEL(bp), blkid);
 
 	/* No done callback; the result of the read is not looked at. */
 	(void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp,
 	    NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD, &aflags, &child_zb);
 }
 
 /*
  * While resuming from a saved bookmark, decide whether the block at zb
  * was already handled in an earlier txg.
  *
  * Returns B_TRUE if the block (and everything below it) lies before the
  * saved bookmark and can be skipped, B_FALSE otherwise.  Once the resume
  * point is reached or passed, the saved bookmark is zeroed.
  */
 static boolean_t
 dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
     const zbookmark_t *zb)
 {
 	zbookmark_t *resume = &scn->scn_phys.scn_bookmark;
 
 	/*
 	 * Nothing to resume from, or a user/group accounting object
 	 * (obj<0), which we never skip over.
 	 */
 	if (ZB_IS_ZERO(resume) || (int64_t)zb->zb_object < 0)
 		return (B_FALSE);
 
 	/*
 	 * If we already visited this bp & everything below (in
 	 * a prior txg sync), don't bother doing it again.
 	 */
 	if (zbookmark_is_before(dnp, zb, resume))
 		return (B_TRUE);
 
 	/*
 	 * If we found the block we're trying to resume from, or
 	 * we went past it to a different object, zero it out to
 	 * indicate that it's OK to start checking for pausing
 	 * again.
 	 */
 	if (bcmp(zb, resume, sizeof (*zb)) == 0 ||
 	    zb->zb_object > resume->zb_object) {
 		dprintf("resuming at %llx/%llx/%llx/%llx\n",
 		    (longlong_t)zb->zb_objset,
 		    (longlong_t)zb->zb_object,
 		    (longlong_t)zb->zb_level,
 		    (longlong_t)zb->zb_blkid);
 		bzero(resume, sizeof (*zb));
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Read the block that bp points to, if it is of a kind that has children
  * (indirect, dnode or objset block) or is a DMU_OT_USERGROUP_USED block,
  * and visit whatever it references.  Level-0 blocks of any other type are
  * not read here and *bufp is left untouched.
  *
  * Every read failure is counted in scn_phys.scn_errors.
  *
  * Return nonzero on i/o error.
  * Return new buf to write out in *bufp.
  */
 static int
 dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
     dnode_phys_t *dnp, const blkptr_t *bp,
     const zbookmark_t *zb, dmu_tx_t *tx, arc_buf_t **bufp)
 {
 	dsl_pool_t *dp = scn->scn_dp;
 	int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
 	int err;
 
 	if (BP_GET_LEVEL(bp) > 0) {
 		/* Indirect block: an array of child block pointers. */
 		uint32_t flags = ARC_WAIT;
 		int i;
 		blkptr_t *cbp;
 		/* number of block pointers held by this indirect block */
 		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
 
 		err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, bufp,
 		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
 		if (err) {
 			scn->scn_phys.scn_errors++;
 			return (err);
 		}
 		/* First pass: start prefetches for all children. */
 		for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) {
 			dsl_scan_prefetch(scn, *bufp, cbp, zb->zb_objset,
 			    zb->zb_object, zb->zb_blkid * epb + i);
 		}
 		/* Second pass: visit each child one level further down. */
 		for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) {
 			zbookmark_t czb;
 
 			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
 			    zb->zb_level - 1,
 			    zb->zb_blkid * epb + i);
 			dsl_scan_visitbp(cbp, &czb, dnp,
 			    *bufp, ds, scn, ostype, tx);
 		}
 	} else if (BP_GET_TYPE(bp) == DMU_OT_USERGROUP_USED) {
 		/* Only read the block; nothing below it is visited here. */
 		uint32_t flags = ARC_WAIT;
 
 		err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, bufp,
 		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
 		if (err) {
 			scn->scn_phys.scn_errors++;
 			return (err);
 		}
 	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
 		/* Level-0 dnode block: an array of dnode_phys_t. */
 		uint32_t flags = ARC_WAIT;
 		dnode_phys_t *cdnp;
 		int i, j;
 		/* number of dnodes held by this block */
 		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
 
 		err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, bufp,
 		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
 		if (err) {
 			scn->scn_phys.scn_errors++;
 			return (err);
 		}
 		/* First pass: prefetch every block pointer of every dnode. */
 		for (i = 0, cdnp = (*bufp)->b_data; i < epb; i++, cdnp++) {
 			for (j = 0; j < cdnp->dn_nblkptr; j++) {
 				blkptr_t *cbp = &cdnp->dn_blkptr[j];
 				dsl_scan_prefetch(scn, *bufp, cbp,
 				    zb->zb_objset, zb->zb_blkid * epb + i, j);
 			}
 		}
 		/* Second pass: visit each dnode; its object number follows
 		 * from its position within the dnode object. */
 		for (i = 0, cdnp = (*bufp)->b_data; i < epb; i++, cdnp++) {
 			dsl_scan_visitdnode(scn, ds, ostype,
 			    cdnp, *bufp, zb->zb_blkid * epb + i, tx);
 		}
 
 	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
 		/* Objset block: visit the dnodes embedded in it. */
 		uint32_t flags = ARC_WAIT;
 		objset_phys_t *osp;
 
 		err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, bufp,
 		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
 		if (err) {
 			scn->scn_phys.scn_errors++;
 			return (err);
 		}
 
 		osp = (*bufp)->b_data;
 
 		/* From here on the objset's own type is passed down. */
 		dsl_scan_visitdnode(scn, ds, osp->os_type,
 		    &osp->os_meta_dnode, *bufp, DMU_META_DNODE_OBJECT, tx);
 
 		if (OBJSET_BUF_HAS_USERUSED(*bufp)) {
 			/*
 			 * We also always visit user/group accounting
 			 * objects, and never skip them, even if we are
 			 * pausing.  This is necessary so that the space
 			 * deltas from this txg get integrated.
 			 */
 			dsl_scan_visitdnode(scn, ds, osp->os_type,
 			    &osp->os_groupused_dnode, *bufp,
 			    DMU_GROUPUSED_OBJECT, tx);
 			dsl_scan_visitdnode(scn, ds, osp->os_type,
 			    &osp->os_userused_dnode, *bufp,
 			    DMU_USERUSED_OBJECT, tx);
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Visit everything that hangs off dnode dnp, which is object number
  * "object" in dataset ds (ds may be NULL, in which case objset 0 is
  * used in the bookmarks): each of its dn_nblkptr top-level block
  * pointers and, if the dnode is flagged as having one, its spill block.
  */
 static void
 dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds,
     dmu_objset_type_t ostype, dnode_phys_t *dnp, arc_buf_t *buf,
     uint64_t object, dmu_tx_t *tx)
 {
 	uint64_t objset = (ds != NULL) ? ds->ds_object : 0;
 	zbookmark_t czb;
 	int idx;
 
 	/* The dnode's own block pointers sit at its top level. */
 	for (idx = 0; idx < dnp->dn_nblkptr; idx++) {
 		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, idx);
 		dsl_scan_visitbp(&dnp->dn_blkptr[idx],
 		    &czb, dnp, buf, ds, scn, ostype, tx);
 	}
 
 	if (!(dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
 		return;
 
 	/* The spill block is bookmarked at level 0, DMU_SPILL_BLKID. */
 	SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
 	dsl_scan_visitbp(&dnp->dn_spill,
 	    &czb, dnp, buf, ds, scn, ostype, tx);
 }
 
 /*
  * Visit a single block pointer: apply the pause/resume checks, descend
  * into the block through dsl_scan_recurse(), then run the scan callback
  * for the current scan function on the block itself.
  *
  * bp:     block pointer to visit.
  * zb:     bookmark identifying bp.
  * dnp:    dnode the block belongs to (NULL when visiting a root bp).
  * pbuf:   buffer holding the parent block; only used for debug output.
  * ds:     dataset being traversed, or NULL.
  * ostype: objset type, handed on to dsl_scan_recurse().
  *
  * The arguments are in this order because mdb can only print the
  * first 5; we want them to be useful.
  */
 static void
 dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb,
     dnode_phys_t *dnp, arc_buf_t *pbuf,
     dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype,
     dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = scn->scn_dp;
 	arc_buf_t *buf = NULL;
 	/* private copy of the block pointer, used for the read */
 	blkptr_t bp_toread = *bp;
 
 	/* ASSERT(pbuf == NULL || arc_released(pbuf)); */
 
 	if (dsl_scan_check_pause(scn, zb))
 		return;
 
 	if (dsl_scan_check_resume(scn, dnp, zb))
 		return;
 
 	/* A birth txg of 0 means there is nothing to visit. */
 	if (bp->blk_birth == 0)
 		return;
 
 	scn->scn_visited_this_txg++;
 
 	dprintf_bp(bp,
 	    "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx buf=%p bp=%p",
 	    ds, ds ? ds->ds_object : 0,
 	    zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
 	    pbuf, bp);
 
 	/* Born at or before the current minimum txg: not of interest. */
 	if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
 		return;
 
 	if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx,
 	    &buf) != 0)
 		return;
 
 	/*
 	 * If dsl_scan_ddt() has already visited this block, it will have
 	 * already done any translations or scrubbing, so don't call the
 	 * callback again.
 	 */
 	if (ddt_class_contains(dp->dp_spa,
 	    scn->scn_phys.scn_ddt_class_max, bp)) {
 		ASSERT(buf == NULL);
 		return;
 	}
 
 	/*
 	 * If this block is from the future (after cur_max_txg), then we
 	 * are doing this on behalf of a deleted snapshot, and we will
 	 * revisit the future block on the next pass of this dataset.
 	 * Don't scan it now unless we need to because something
 	 * under it was modified.
 	 */
 	if (bp->blk_birth <= scn->scn_phys.scn_cur_max_txg) {
 		scan_funcs[scn->scn_phys.scn_func](dp, bp, zb);
 	}
 	/* Drop the reference on the buffer dsl_scan_recurse() returned. */
 	if (buf)
 		(void) arc_buf_remove_ref(buf, &buf);
 }
 
 /*
  * Start a traversal at root block pointer bp.  The root bookmark carries
  * ds's object number, or DMU_META_OBJSET when ds is NULL.
  */
 static void
 dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp,
     dmu_tx_t *tx)
 {
 	uint64_t objset;
 	zbookmark_t zb;
 
 	if (ds != NULL)
 		objset = ds->ds_object;
 	else
 		objset = DMU_META_OBJSET;
 
 	SET_BOOKMARK(&zb, objset,
 	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 
 	/* There is no parent dnode or parent buffer at the root. */
 	dsl_scan_visitbp(bp, &zb, NULL, NULL, ds, scn, DMU_OST_NONE, tx);
 
 	dprintf_ds(ds, "finished scan%s", "");
 }
 
 /*
  * Adjust the scan state for the destruction of dataset ds, so that
  * neither the scan bookmark nor the scan queue keeps referring to it.
  * Does nothing unless a scan is in the DSS_SCANNING state.
  *
  * Three cases are distinguished: ds is the dataset currently being
  * traversed, ds is waiting in the scan queue, or the scan does not
  * reference ds at all.
  */
 void
 dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	dsl_scan_t *scn = dp->dp_scan;
 	uint64_t mintxg;
 
 	if (scn->scn_phys.scn_state != DSS_SCANNING)
 		return;
 
 	if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
 		if (dsl_dataset_is_snapshot(ds)) {
 			/* Note, scn_cur_{min,max}_txg stays the same. */
 			scn->scn_phys.scn_bookmark.zb_objset =
 			    ds->ds_phys->ds_next_snap_obj;
 			zfs_dbgmsg("destroying ds %llu; currently traversing; "
 			    "reset zb_objset to %llu",
 			    (u_longlong_t)ds->ds_object,
 			    (u_longlong_t)ds->ds_phys->ds_next_snap_obj);
 			/* Request another pass over the dataset. */
 			scn->scn_phys.scn_flags |= DSF_VISIT_DS_AGAIN;
 		} else {
 			/* Not a snapshot: mark the bookmark as destroyed. */
 			SET_BOOKMARK(&scn->scn_phys.scn_bookmark,
 			    ZB_DESTROYED_OBJSET, 0, 0, 0);
 			zfs_dbgmsg("destroying ds %llu; currently traversing; "
 			    "reset bookmark to -1,0,0,0",
 			    (u_longlong_t)ds->ds_object);
 		}
 	} else if (zap_lookup_int_key(dp->dp_meta_objset,
 	    scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
 		/* ds is queued: take it out of the scan queue. */
 		ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
 		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
 		if (dsl_dataset_is_snapshot(ds)) {
 			/*
 			 * We keep the same mintxg; it could be >
 			 * ds_creation_txg if the previous snapshot was
 			 * deleted too.
 			 */
 			VERIFY(zap_add_int_key(dp->dp_meta_objset,
 			    scn->scn_phys.scn_queue_obj,
 			    ds->ds_phys->ds_next_snap_obj, mintxg, tx) == 0);
 			zfs_dbgmsg("destroying ds %llu; in queue; "
 			    "replacing with %llu",
 			    (u_longlong_t)ds->ds_object,
 			    (u_longlong_t)ds->ds_phys->ds_next_snap_obj);
 		} else {
 			zfs_dbgmsg("destroying ds %llu; in queue; removing",
 			    (u_longlong_t)ds->ds_object);
 		}
 	} else {
 		zfs_dbgmsg("destroying ds %llu; ignoring",
 		    (u_longlong_t)ds->ds_object);
 	}
 
 	/*
 	 * dsl_scan_sync() should be called after this, and should sync
 	 * out our changed state, but just to be safe, do it here.
 	 */
 	dsl_scan_sync_state(scn, tx);
 }
 
 /*
  * Adjust the scan state after dataset ds has been snapshotted: wherever
  * the scan referred to ds (bookmark objset or queue entry), make it refer
  * to ds's previous snapshot (ds_prev_snap_obj) instead, then write the
  * scan state out.  Does nothing unless a scan is in the DSS_SCANNING
  * state.
  */
 void
 dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	dsl_scan_t *scn = dp->dp_scan;
 	uint64_t mintxg;
 
 	if (scn->scn_phys.scn_state != DSS_SCANNING)
 		return;
 
 	ASSERT(ds->ds_phys->ds_prev_snap_obj != 0);
 
 	if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
 		/* ds is being traversed right now: redirect the bookmark. */
 		scn->scn_phys.scn_bookmark.zb_objset =
 		    ds->ds_phys->ds_prev_snap_obj;
 		zfs_dbgmsg("snapshotting ds %llu; currently traversing; "
 		    "reset zb_objset to %llu",
 		    (u_longlong_t)ds->ds_object,
 		    (u_longlong_t)ds->ds_phys->ds_prev_snap_obj);
 	} else if (zap_lookup_int_key(dp->dp_meta_objset,
 	    scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
 		/* ds is queued: swap the entry, keeping its mintxg. */
 		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
 		VERIFY(zap_add_int_key(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj,
 		    ds->ds_phys->ds_prev_snap_obj, mintxg, tx) == 0);
 		zfs_dbgmsg("snapshotting ds %llu; in queue; "
 		    "replacing with %llu",
 		    (u_longlong_t)ds->ds_object,
 		    (u_longlong_t)ds->ds_phys->ds_prev_snap_obj);
 	}
 	dsl_scan_sync_state(scn, tx);
 }
 
 /*
  * Adjust the scan state after datasets ds1 and ds2 have been swapped:
  * references to one of them in the scan bookmark and in the scan queue
  * are exchanged for references to the other, then the scan state is
  * written out.  Does nothing unless a scan is in the DSS_SCANNING state.
  */
 void
 dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = ds1->ds_dir->dd_pool;
 	dsl_scan_t *scn = dp->dp_scan;
 	uint64_t mintxg;
 
 	if (scn->scn_phys.scn_state != DSS_SCANNING)
 		return;
 
 	/* Bookmark: if one of the two is being traversed, switch over. */
 	if (scn->scn_phys.scn_bookmark.zb_objset == ds1->ds_object) {
 		scn->scn_phys.scn_bookmark.zb_objset = ds2->ds_object;
 		zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
 		    "reset zb_objset to %llu",
 		    (u_longlong_t)ds1->ds_object,
 		    (u_longlong_t)ds2->ds_object);
 	} else if (scn->scn_phys.scn_bookmark.zb_objset == ds2->ds_object) {
 		scn->scn_phys.scn_bookmark.zb_objset = ds1->ds_object;
 		zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
 		    "reset zb_objset to %llu",
 		    (u_longlong_t)ds2->ds_object,
 		    (u_longlong_t)ds1->ds_object);
 	}
 
 	/* Queue: exchange whichever of the two entries is present. */
 	if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
 	    ds1->ds_object, &mintxg) == 0) {
 		int err;
 
 		ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg);
 		ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg);
 		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, ds1->ds_object, tx));
 		err = zap_add_int_key(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg, tx);
 		VERIFY(err == 0 || err == EEXIST);
 		if (err == EEXIST) {
 			/* Both were there to begin with */
 			VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
 			    scn->scn_phys.scn_queue_obj,
 			    ds1->ds_object, mintxg, tx));
 		}
 		zfs_dbgmsg("clone_swap ds %llu; in queue; "
 		    "replacing with %llu",
 		    (u_longlong_t)ds1->ds_object,
 		    (u_longlong_t)ds2->ds_object);
 	} else if (zap_lookup_int_key(dp->dp_meta_objset,
 	    scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg) == 0) {
 		/* Only ds2 was queued: replace it with ds1. */
 		ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg);
 		ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg);
 		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, ds2->ds_object, tx));
 		VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg, tx));
 		zfs_dbgmsg("clone_swap ds %llu; in queue; "
 		    "replacing with %llu",
 		    (u_longlong_t)ds2->ds_object,
 		    (u_longlong_t)ds1->ds_object);
 	}
 
 	dsl_scan_sync_state(scn, tx);
 }
 
 /* Argument passed to enqueue_clones_cb(). */
 struct enqueue_clones_arg {
 	dmu_tx_t *tx;		/* transaction for the scan queue update */
 	uint64_t originobj;	/* object number of the origin dataset */
 };
 
 /*
  * Per-dataset callback that adds clones of eca->originobj to the scan
  * queue.  arg is a struct enqueue_clones_arg.
  *
  * If the dataset's dsl_dir has eca->originobj as its origin, the chain of
  * previous snapshots is followed back to the dataset whose previous
  * snapshot is the origin, and that dataset is added to the scan queue
  * with its ds_prev_snap_txg as the value.  Other datasets are ignored.
  *
  * Returns 0, or the error from a failed dsl_dataset_hold_obj().
  */
 /* ARGSUSED */
 static int
 enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
 {
 	struct enqueue_clones_arg *eca = arg;
 	dsl_dataset_t *ds;
 	int err;
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 	dsl_scan_t *scn = dp->dp_scan;
 
 	err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
 	if (err)
 		return (err);
 
 	if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) {
 		while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) {
 			dsl_dataset_t *prev;
 			err = dsl_dataset_hold_obj(dp,
 			    ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);
 
 			/* The current hold is dropped even if the new one failed. */
 			dsl_dataset_rele(ds, FTAG);
 			if (err)
 				return (err);
 			ds = prev;
 		}
 		VERIFY(zap_add_int_key(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, ds->ds_object,
 		    ds->ds_phys->ds_prev_snap_txg, eca->tx) == 0);
 	}
 	dsl_dataset_rele(ds, FTAG);
 	return (0);
 }
 
 /*
  * Visit one dataset (object number dsobj) during a scan pass:
  * optionally traverse the head dataset's ZIL, walk the dataset's root
  * block pointer, and then, unless the scan is pausing or the dataset
  * has to be visited again, add its next snapshot and its clones to the
  * scan queue ZAP.  The hold taken on the dataset is released on every
  * exit path through the "out" label.
  */
 static void
 dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = scn->scn_dp;
 	dsl_dataset_t *ds;
 	objset_t *os;
 
 	VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
 
 	if (dmu_objset_from_ds(ds, &os))
 		goto out;
 
 	/*
 	 * Only the ZIL in the head (non-snapshot) is valid.  Even though
 	 * snapshots can have ZIL block pointers (which may be the same
 	 * BP as in the head), they must be ignored.  So we traverse the
 	 * ZIL here, rather than in scan_recurse(), because the regular
 	 * snapshot block-sharing rules don't apply to it.
 	 */
 	if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !dsl_dataset_is_snapshot(ds))
 		dsl_scan_zil(dp, &os->os_zil_header);
 
 	/*
 	 * Iterate over the bps in this ds.
 	 */
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 	dsl_scan_visit_rootbp(scn, ds, &ds->ds_phys->ds_bp, tx);
 
 	/* The name buffer exists only to feed the debug message below. */
 	char *dsname = kmem_alloc(ZFS_MAXNAMELEN, KM_SLEEP);
 	dsl_dataset_name(ds, dsname);
 	zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; "
 	    "pausing=%u",
 	    (longlong_t)dsobj, dsname,
 	    (longlong_t)scn->scn_phys.scn_cur_min_txg,
 	    (longlong_t)scn->scn_phys.scn_cur_max_txg,
 	    (int)scn->scn_pausing);
 	kmem_free(dsname, ZFS_MAXNAMELEN);
 
 	if (scn->scn_pausing)
 		goto out;
 
 	/*
 	 * We've finished this pass over this dataset.
 	 */
 
 	/*
 	 * If we did not completely visit this dataset, do another pass.
 	 */
 	if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) {
 		zfs_dbgmsg("incomplete pass; visiting again");
 		scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN;
 		VERIFY(zap_add_int_key(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, ds->ds_object,
 		    scn->scn_phys.scn_cur_max_txg, tx) == 0);
 		goto out;
 	}
 
 	/*
 	 * Add descendent datasets to work queue.
 	 */
 	if (ds->ds_phys->ds_next_snap_obj != 0) {
 		VERIFY(zap_add_int_key(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, ds->ds_phys->ds_next_snap_obj,
 		    ds->ds_phys->ds_creation_txg, tx) == 0);
 	}
 	if (ds->ds_phys->ds_num_children > 1) {
 		boolean_t usenext = B_FALSE;
 		if (ds->ds_phys->ds_next_clones_obj != 0) {
 			uint64_t count;
 			/*
 			 * A bug in a previous version of the code could
 			 * cause upgrade_clones_cb() to not set
 			 * ds_next_snap_obj when it should, leading to a
 			 * missing entry.  Therefore we can only use the
 			 * next_clones_obj when its count is correct.
 			 */
 			int err = zap_count(dp->dp_meta_objset,
 			    ds->ds_phys->ds_next_clones_obj, &count);
 			if (err == 0 &&
 			    count == ds->ds_phys->ds_num_children - 1)
 				usenext = B_TRUE;
 		}
 
 		if (usenext) {
 			VERIFY(zap_join_key(dp->dp_meta_objset,
 			    ds->ds_phys->ds_next_clones_obj,
 			    scn->scn_phys.scn_queue_obj,
 			    ds->ds_phys->ds_creation_txg, tx) == 0);
 		} else {
 			/*
 			 * next_clones_obj is missing or its count is not
 			 * trusted: find the clones by iterating datasets
 			 * with enqueue_clones_cb().  The return value is
 			 * explicitly discarded.
 			 */
 			struct enqueue_clones_arg eca;
 			eca.tx = tx;
 			eca.originobj = ds->ds_object;
 
 			(void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa,
 			    NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN);
 		}
 	}
 
 out:
 	dsl_dataset_rele(ds, FTAG);
 }
 
 /*
  * Dataset-iteration callback (passed to dmu_objset_find_spa() by
  * dsl_scan_visit()); "arg" is the dmu_tx_t to use.
  *
  * Follow ds_prev_snap_obj back to the dataset that has no previous
  * snapshot and add it to the scan queue ZAP.  If, along the way, a
  * previous snapshot's ds_next_snap_obj does not point back at the
  * current dataset, the dataset is treated as a clone and nothing is
  * queued.
  *
  * Returns 0, or the error from dsl_dataset_hold_obj() if a hold fails.
  */
 /* ARGSUSED */
 static int
 enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
 {
 	dmu_tx_t *tx = arg;
 	dsl_dataset_t *ds;
 	int err;
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 	dsl_scan_t *scn = dp->dp_scan;
 
 	err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
 	if (err)
 		return (err);
 
 	while (ds->ds_phys->ds_prev_snap_obj != 0) {
 		dsl_dataset_t *prev;
 		err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
 		    FTAG, &prev);
 		if (err) {
 			dsl_dataset_rele(ds, FTAG);
 			return (err);
 		}
 
 		/*
 		 * If this is a clone, we don't need to worry about it for now.
 		 */
 		if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
 			dsl_dataset_rele(ds, FTAG);
 			dsl_dataset_rele(prev, FTAG);
 			return (0);
 		}
 		/* Step back one snapshot, keeping exactly one hold. */
 		dsl_dataset_rele(ds, FTAG);
 		ds = prev;
 	}
 
 	VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
 	    ds->ds_object, ds->ds_phys->ds_prev_snap_txg, tx) == 0);
 	dsl_dataset_rele(ds, FTAG);
 	return (0);
 }
 
 /*
  * Scrub/dedup interaction.
  *
  * If there are N references to a deduped block, we don't want to scrub it
  * N times -- ideally, we should scrub it exactly once.
  *
  * We leverage the fact that the dde's replication class (enum ddt_class)
  * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest
  * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order.
  *
  * To prevent excess scrubbing, the scrub begins by walking the DDT
  * to find all blocks with refcnt > 1, and scrubs each of these once.
  * Since there are two replication classes which contain blocks with
  * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first.
  * Finally the top-down scrub begins, only visiting blocks with refcnt == 1.
  *
  * There would be nothing more to say if a block's refcnt couldn't change
  * during a scrub, but of course it can so we must account for changes
  * in a block's replication class.
  *
  * Here's an example of what can occur:
  *
  * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1
  * when visited during the top-down scrub phase, it will be scrubbed twice.
  * This negates our scrub optimization, but is otherwise harmless.
  *
  * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1
  * on each visit during the top-down scrub phase, it will never be scrubbed.
  * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's
  * reference class transitions to a higher level (i.e. DDT_CLASS_UNIQUE to
  * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1
  * while a scrub is in progress, it scrubs the block right then.
  */
 static void
 dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
 {
 	/* The walk position lives in scn_phys, so ddt_walk() updates it. */
 	ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark;
 	ddt_entry_t dde = { 0 };
 	int error;
 	/* number of entries handed to dsl_scan_ddt_entry(); debug only */
 	uint64_t n = 0;
 
 	while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) {
 		ddt_t *ddt;
 
 		/* Stop once the bookmark's class exceeds scn_ddt_class_max. */
 		if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max)
 			break;
 		dprintf("visiting ddb=%llu/%llu/%llu/%llx\n",
 		    (longlong_t)ddb->ddb_class,
 		    (longlong_t)ddb->ddb_type,
 		    (longlong_t)ddb->ddb_checksum,
 		    (longlong_t)ddb->ddb_cursor);
 
 		/* There should be no pending changes to the dedup table */
 		ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum];
 		ASSERT(avl_first(&ddt->ddt_tree) == NULL);
 
 		dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx);
 		n++;
 
 		if (dsl_scan_check_pause(scn, NULL))
 			break;
 	}
 
 	zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; pausing=%u",
 	    (longlong_t)n, (int)scn->scn_phys.scn_ddt_class_max,
 	    (int)scn->scn_pausing);
 
 	/*
 	 * The loop ends either via break (error == 0) or because
 	 * ddt_walk() returned non-zero; the only non-zero value expected
 	 * is ENOENT, and in that case the bookmark must already be past
 	 * scn_ddt_class_max.
 	 */
 	ASSERT(error == 0 || error == ENOENT);
 	ASSERT(error != ENOENT ||
 	    ddb->ddb_class > scn->scn_phys.scn_ddt_class_max);
 }
 
 /*
  * Run the current scan function on each physical variant of one DDT
  * entry.  Does nothing unless the scan state is DSS_SCANNING.
  * Variants whose ddp_phys_birth is 0 or greater than scn_cur_max_txg
  * are skipped.  The bookmark passed to the scan function is all
  * zeros.  "tx" is not used in this function.
  */
 /* ARGSUSED */
 void
 dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
     ddt_entry_t *dde, dmu_tx_t *tx)
 {
 	const ddt_key_t *ddk = &dde->dde_key;
 	ddt_phys_t *ddp = dde->dde_phys;
 	blkptr_t bp;
 	zbookmark_t zb = { 0 };
 
 	if (scn->scn_phys.scn_state != DSS_SCANNING)
 		return;
 
 	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
 		if (ddp->ddp_phys_birth == 0 ||
 		    ddp->ddp_phys_birth > scn->scn_phys.scn_cur_max_txg)
 			continue;
 		/* Build a block pointer for this variant from key + phys. */
 		ddt_bp_create(checksum, ddk, ddp, &bp);
 
 		scn->scn_visited_this_txg++;
 		scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb);
 	}
 }
 
 /*
  * Do the scanning work for one sync pass: first continue the DDT walk
  * (while the DDT bookmark's class is within scn_ddt_class_max), then
  * either start from the MOS or resume the dataset recorded in the scan
  * bookmark, and finally drain the scan queue ZAP one dataset at a
  * time.  Returns early whenever scn_pausing is set.
  */
 static void
 dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = scn->scn_dp;
 	zap_cursor_t zc;
 	zap_attribute_t za;
 
 	if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
 	    scn->scn_phys.scn_ddt_class_max) {
 		scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
 		scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
 		dsl_scan_ddt(scn, tx);
 		if (scn->scn_pausing)
 			return;
 	}
 
 	if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) {
 		/* First do the MOS & ORIGIN */
 
 		scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
 		scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
 		dsl_scan_visit_rootbp(scn, NULL,
 		    &dp->dp_meta_rootbp, tx);
 		spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
 		if (scn->scn_pausing)
 			return;
 
 		/*
 		 * Pools older than SPA_VERSION_DSL_SCRUB seed the queue
 		 * by iterating datasets with enqueue_cb(); newer pools
 		 * start from the origin snapshot.
 		 */
 		if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
 			VERIFY(0 == dmu_objset_find_spa(dp->dp_spa,
 			    NULL, enqueue_cb, tx, DS_FIND_CHILDREN));
 		} else {
 			dsl_scan_visitds(scn,
 			    dp->dp_origin_snap->ds_object, tx);
 		}
 		ASSERT(!scn->scn_pausing);
 	} else if (scn->scn_phys.scn_bookmark.zb_objset !=
 	    ZB_DESTROYED_OBJSET) {
 		/*
 		 * If we were paused, continue from here.  Note if the
 		 * ds we were paused on was deleted, the zb_objset may
 		 * be -1, so we will skip this and find a new objset
 		 * below.
 		 */
 		dsl_scan_visitds(scn, scn->scn_phys.scn_bookmark.zb_objset, tx);
 		if (scn->scn_pausing)
 			return;
 	}
 
 	/*
 	 * In case we were paused right at the end of the ds, zero the
 	 * bookmark so we don't think that we're still trying to resume.
 	 */
 	bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_t));
 
 	/*
 	 * keep pulling things out of the zap-object-as-queue
 	 *
 	 * The loop condition uses the comma operator: the cursor is
 	 * re-initialized on every iteration and the loop continues while
 	 * an entry can be retrieved.  Each retrieved entry is removed
 	 * from the ZAP before the dataset is visited, and the cursor is
 	 * finished at the end of every iteration (and once more after the
 	 * final, failed retrieve).
 	 */
 	while (zap_cursor_init(&zc, dp->dp_meta_objset,
 	    scn->scn_phys.scn_queue_obj),
 	    zap_cursor_retrieve(&zc, &za) == 0) {
 		dsl_dataset_t *ds;
 		uint64_t dsobj;
 
 		dsobj = strtonum(za.za_name, NULL);
 		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, dsobj, tx));
 
 		/* Set up min/max txg */
 		VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
 		if (za.za_first_integer != 0) {
 			scn->scn_phys.scn_cur_min_txg =
 			    MAX(scn->scn_phys.scn_min_txg,
 			    za.za_first_integer);
 		} else {
 			scn->scn_phys.scn_cur_min_txg =
 			    MAX(scn->scn_phys.scn_min_txg,
 			    ds->ds_phys->ds_prev_snap_txg);
 		}
 		scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds);
 		dsl_dataset_rele(ds, FTAG);
 
 		dsl_scan_visitds(scn, dsobj, tx);
 		zap_cursor_fini(&zc);
 		if (scn->scn_pausing)
 			return;
 	}
 	zap_cursor_fini(&zc);
 }
 
 /*
  * Decide whether block freeing should stop for this sync.  Returns
  * true when any of the following holds:
  *  - the time since scn_sync_start_time, divided by NANOSEC, exceeds
  *    zfs_txg_timeout;
  *  - that time, divided by MICROSEC, exceeds zfs_free_min_time_ms and
  *    txg_sync_waiting() reports true for the pool;
  *  - the spa is shutting down.
  */
 static boolean_t
 dsl_scan_free_should_pause(dsl_scan_t *scn)
 {
 	uint64_t elapsed_nanosecs;
 
 	elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
 	return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
 	    (elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms &&
 	    txg_sync_waiting(scn->scn_dp)) ||
 	    spa_shutting_down(scn->scn_dp->dp_spa));
 }
 
 /*
  * Per-block callback used by dsl_scan_sync() with bpobj_iterate() and
  * bptree_iterate(); "arg" is the dsl_scan_t.
  *
  * Returns ERESTART, without freeing bp, when
  * dsl_scan_free_should_pause() says to stop.  That check is made for
  * every block when scn_is_bptree is false, and only for level-0
  * non-objset blocks when it is true.  Otherwise the block is freed
  * with a zio issued via zio_nowait() using scn_zio_root, its space is
  * subtracted from dp_free_dir, scn_visited_this_txg is incremented,
  * and 0 is returned.
  */
 static int
 dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	dsl_scan_t *scn = arg;
 
 	if (!scn->scn_is_bptree ||
 	    (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) {
 		if (dsl_scan_free_should_pause(scn))
 			return (ERESTART);
 	}
 
 	zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
 	    dmu_tx_get_txg(tx), bp, BP_GET_PSIZE(bp), 0));
 	/* Negative deltas: the freed block no longer counts as used. */
 	dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
 	    -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp),
 	    -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
 	scn->scn_visited_this_txg++;
 	return (0);
 }
 
 /*
  * Report whether dsl_scan_sync() has work to do.  Returns B_FALSE
  * while the spa is loading (spa_load_state != SPA_LOAD_NONE) or
  * shutting down.  Otherwise returns B_TRUE if a scan is in the
  * DSS_SCANNING state or the async-destroy feature is active, and
  * failing that, whether dp_free_bpobj reports a non-zero "used" value
  * (checked only for pools at SPA_VERSION_DEADLISTS or later).
  */
 boolean_t
 dsl_scan_active(dsl_scan_t *scn)
 {
 	spa_t *spa = scn->scn_dp->dp_spa;
 	uint64_t used = 0, comp, uncomp;
 
 	if (spa->spa_load_state != SPA_LOAD_NONE)
 		return (B_FALSE);
 	if (spa_shutting_down(spa))
 		return (B_FALSE);
 
 	if (scn->scn_phys.scn_state == DSS_SCANNING)
 		return (B_TRUE);
 
 	if (spa_feature_is_active(spa,
 	    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
 		return (B_TRUE);
 	}
 	/* "used" stays 0 if the pool version is too old to check. */
 	if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
 		(void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
 		    &used, &comp, &uncomp);
 	}
 	return (used != 0);
 }
 
 /*
  * Per-txg entry point for scan and free work.  In order:
  *  1. restart the scan if scn_restart_txg is set and has been reached;
  *  2. return unless dsl_scan_active() is true and the spa sync pass is
  *     not greater than 1;
  *  3. on pools at SPA_VERSION_DEADLISTS or later, free blocks from
  *     dp_free_bpobj and, if that completed and the async-destroy
  *     feature is active, from the bptree; return if freeing stopped
  *     early;
  *  4. if the scan state is DSS_SCANNING, run dsl_scan_visit(), wait
  *     for the issued I/O, mark the scan done unless it is pausing, and
  *     persist state with dsl_scan_sync_state().
  */
 void
 dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
 {
 	dsl_scan_t *scn = dp->dp_scan;
 	spa_t *spa = dp->dp_spa;
 	int err;
 
 	/*
 	 * Check for scn_restart_txg before checking spa_load_state, so
 	 * that we can restart an old-style scan while the pool is being
 	 * imported (see dsl_scan_init).
 	 */
 	if (scn->scn_restart_txg != 0 &&
 	    scn->scn_restart_txg <= tx->tx_txg) {
 		pool_scan_func_t func = POOL_SCAN_SCRUB;
 		dsl_scan_done(scn, B_FALSE, tx);
 		if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
 			func = POOL_SCAN_RESILVER;
 		zfs_dbgmsg("restarting scan func=%u txg=%llu",
 		    func, tx->tx_txg);
 		dsl_scan_setup_sync(scn, &func, tx);
 	}
 
 	if (!dsl_scan_active(scn) ||
 	    spa_sync_pass(dp->dp_spa) > 1)
 		return;
 
 	scn->scn_visited_this_txg = 0;
 	scn->scn_pausing = B_FALSE;
 	scn->scn_sync_start_time = gethrtime();
 	spa->spa_scrub_active = B_TRUE;
 
 	/*
 	 * First process the free list.  If we pause the free, don't do
 	 * any scanning.  This ensures that there is no free list when
 	 * we are scanning, so the scan code doesn't have to worry about
 	 * traversing it.
 	 */
 	if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
 		scn->scn_is_bptree = B_FALSE;
 		scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
 		    NULL, ZIO_FLAG_MUSTSUCCEED);
 		err = bpobj_iterate(&dp->dp_free_bpobj,
 		    dsl_scan_free_block_cb, scn, tx);
 		VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
 
 		if (err == 0 && spa_feature_is_active(spa,
 		    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
 			scn->scn_is_bptree = B_TRUE;
 			scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
 			    NULL, ZIO_FLAG_MUSTSUCCEED);
 			err = bptree_iterate(dp->dp_meta_objset,
 			    dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb,
 			    scn, tx);
 			VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
 			/*
 			 * The bptree was not fully processed (for example
 			 * the callback returned ERESTART); leave the
 			 * feature active and stop for this sync.
 			 */
 			if (err != 0)
 				return;
 
 			/* disable async destroy feature */
 			spa_feature_decr(spa,
 			    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY], tx);
 			ASSERT(!spa_feature_is_active(spa,
 			    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]));
 			VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
 			    DMU_POOL_DIRECTORY_OBJECT,
 			    DMU_POOL_BPTREE_OBJ, tx));
 			VERIFY3U(0, ==, bptree_free(dp->dp_meta_objset,
 			    dp->dp_bptree_obj, tx));
 			dp->dp_bptree_obj = 0;
 		}
 		if (scn->scn_visited_this_txg) {
 			zfs_dbgmsg("freed %llu blocks in %llums from "
 			    "free_bpobj/bptree txg %llu",
 			    (longlong_t)scn->scn_visited_this_txg,
 			    (longlong_t)
 			    (gethrtime() - scn->scn_sync_start_time) / MICROSEC,
 			    (longlong_t)tx->tx_txg);
 			/* Reset so the scan below counts only its own blocks. */
 			scn->scn_visited_this_txg = 0;
 			/*
 			 * Re-sync the ddt so that we can further modify
 			 * it when doing bprewrite.
 			 */
 			ddt_sync(spa, tx->tx_txg);
 		}
 		if (err == ERESTART)
 			return;
 	}
 
 	if (scn->scn_phys.scn_state != DSS_SCANNING)
 		return;
 
 	if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
 	    scn->scn_phys.scn_ddt_class_max) {
 		zfs_dbgmsg("doing scan sync txg %llu; "
 		    "ddt bm=%llu/%llu/%llu/%llx",
 		    (longlong_t)tx->tx_txg,
 		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
 		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
 		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
 		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
 		/* While still in the DDT phase the bookmark must be zero. */
 		ASSERT(scn->scn_phys.scn_bookmark.zb_objset == 0);
 		ASSERT(scn->scn_phys.scn_bookmark.zb_object == 0);
 		ASSERT(scn->scn_phys.scn_bookmark.zb_level == 0);
 		ASSERT(scn->scn_phys.scn_bookmark.zb_blkid == 0);
 	} else {
 		zfs_dbgmsg("doing scan sync txg %llu; bm=%llu/%llu/%llu/%llu",
 		    (longlong_t)tx->tx_txg,
 		    (longlong_t)scn->scn_phys.scn_bookmark.zb_objset,
 		    (longlong_t)scn->scn_phys.scn_bookmark.zb_object,
 		    (longlong_t)scn->scn_phys.scn_bookmark.zb_level,
 		    (longlong_t)scn->scn_phys.scn_bookmark.zb_blkid);
 	}
 
 	/*
 	 * Unlike the free path above, the scan root zio is created with
 	 * ZIO_FLAG_CANFAIL and the result of zio_wait() is ignored.
 	 */
 	scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
 	    NULL, ZIO_FLAG_CANFAIL);
 	dsl_scan_visit(scn, tx);
 	(void) zio_wait(scn->scn_zio_root);
 	scn->scn_zio_root = NULL;
 
 	zfs_dbgmsg("visited %llu blocks in %llums",
 	    (longlong_t)scn->scn_visited_this_txg,
 	    (longlong_t)(gethrtime() - scn->scn_sync_start_time) / MICROSEC);
 
 	if (!scn->scn_pausing) {
 		/* finished with scan. */
 		zfs_dbgmsg("finished scan txg %llu", (longlong_t)tx->tx_txg);
 		dsl_scan_done(scn, B_TRUE, tx);
 	}
 
 	/* Wait until every outstanding scrub/resilver read has completed. */
 	if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
 		mutex_enter(&spa->spa_scrub_lock);
 		while (spa->spa_scrub_inflight > 0) {
 			cv_wait(&spa->spa_scrub_io_cv,
 			    &spa->spa_scrub_lock);
 		}
 		mutex_exit(&spa->spa_scrub_lock);
 	}
 
 	dsl_scan_sync_state(scn, tx);
 }
 
 /*
  * This will start a new scan, or restart an existing one.
  *
  * The request is recorded by setting scn_restart_txg; dsl_scan_sync()
  * acts on it once that txg is reached.  If txg is 0, a transaction is
  * created and assigned solely to obtain a txg number to use, and is
  * then committed.
  */
 void
 dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg)
 {
 	if (txg == 0) {
 		dmu_tx_t *tx;
 		tx = dmu_tx_create_dd(dp->dp_mos_dir);
 		VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));
 
 		txg = dmu_tx_get_txg(tx);
 		dp->dp_scan->scn_restart_txg = txg;
 		dmu_tx_commit(tx);
 	} else {
 		dp->dp_scan->scn_restart_txg = txg;
 	}
 	zfs_dbgmsg("restarting resilver txg=%llu", txg);
 }
 
 /*
  * Returns true when the pool's scan is in the DSS_SCANNING state and
  * its scan function is POOL_SCAN_RESILVER.
  */
 boolean_t
 dsl_scan_resilvering(dsl_pool_t *dp)
 {
 	return (dp->dp_scan->scn_phys.scn_state == DSS_SCANNING &&
 	    dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER);
 }
 
 /*
  * scrub consumers
  */
 
 /*
  * Accumulate statistics for bp into zab (no-op if zab is NULL).
  *
  * Each block is added to four zab_type[level][type] slots:
  * (its level, DMU_OT_TOTAL), (its level, its type),
  * (DN_MAX_LEVELS, DMU_OT_TOTAL) and (DN_MAX_LEVELS, its type).
  * A type value with the DMU_OT_NEWTYPE bit set is recorded under
  * DMU_OT_OTHER.  Besides counts and sizes, the number of ditto copies
  * that share a vdev is tallied for blocks with 2 or 3 DVAs.
  */
 static void
 count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
 {
 	int i;
 
 	/*
 	 * If we resume after a reboot, zab will be NULL; don't record
 	 * incomplete stats in that case.
 	 */
 	if (zab == NULL)
 		return;
 
 	for (i = 0; i < 4; i++) {
 		/* i = 0,1 use the real level; i = 1,3 use the real type. */
 		int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
 		int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
 		if (t & DMU_OT_NEWTYPE)
 			t = DMU_OT_OTHER;
 		zfs_blkstat_t *zb = &zab->zab_type[l][t];
 		int equal;
 
 		zb->zb_count++;
 		zb->zb_asize += BP_GET_ASIZE(bp);
 		zb->zb_lsize += BP_GET_LSIZE(bp);
 		zb->zb_psize += BP_GET_PSIZE(bp);
 		zb->zb_gangs += BP_COUNT_GANG(bp);
 
 		switch (BP_GET_NDVAS(bp)) {
 		case 2:
 			if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
 			    DVA_GET_VDEV(&bp->blk_dva[1]))
 				zb->zb_ditto_2_of_2_samevdev++;
 			break;
 		case 3:
 			/*
 			 * Count matching vdev pairs among the three DVAs:
 			 * one matching pair means two copies share a vdev,
 			 * three matching pairs means all copies do.
 			 */
 			equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
 			    DVA_GET_VDEV(&bp->blk_dva[1])) +
 			    (DVA_GET_VDEV(&bp->blk_dva[0]) ==
 			    DVA_GET_VDEV(&bp->blk_dva[2])) +
 			    (DVA_GET_VDEV(&bp->blk_dva[1]) ==
 			    DVA_GET_VDEV(&bp->blk_dva[2]));
 			if (equal == 1)
 				zb->zb_ditto_2_of_3_samevdev++;
 			else if (equal == 3)
 				zb->zb_ditto_3_of_3_samevdev++;
 			break;
 		}
 	}
 }
 
 /*
  * Completion callback for the reads issued by dsl_scan_scrub_cb().
  * Frees the zio's data buffer, then, under spa_scrub_lock, decrements
  * spa_scrub_inflight and broadcasts spa_scrub_io_cv.  A failed zio
  * increments scn_errors, except when the error is ECKSUM and the zio
  * was flagged ZIO_FLAG_SPECULATIVE.
  */
 static void
 dsl_scan_scrub_done(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 
 	zio_data_buf_free(zio->io_data, zio->io_size);
 
 	mutex_enter(&spa->spa_scrub_lock);
 	spa->spa_scrub_inflight--;
 	cv_broadcast(&spa->spa_scrub_io_cv);
 
 	if (zio->io_error && (zio->io_error != ECKSUM ||
 	    !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) {
 		spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors++;
 	}
 	mutex_exit(&spa->spa_scrub_lock);
 }
 
 /*
  * Scan function for scrub and resilver, called with each block pointer
  * the scan visits.  Blocks whose physical birth txg is not strictly
  * between scn_min_txg and scn_max_txg are ignored.  For a scrub every
  * remaining block is read; for a resilver a block is read only if one
  * of its DVAs is a gang block or vdev_dtl_contains() reports its birth
  * txg in DTL_PARTIAL of the DVA's top-level vdev.  No read is issued
  * when zfs_no_scrub_io is set.  Reads are issued with zio_nowait() and
  * completed by dsl_scan_scrub_done().  Always returns 0.
  */
 static int
 dsl_scan_scrub_cb(dsl_pool_t *dp,
     const blkptr_t *bp, const zbookmark_t *zb)
 {
 	dsl_scan_t *scn = dp->dp_scan;
 	size_t size = BP_GET_PSIZE(bp);
 	spa_t *spa = dp->dp_spa;
 	uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
 	boolean_t needs_io;
 	int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
 	int zio_priority;
 	unsigned int scan_delay = 0;
 
 	if (phys_birth <= scn->scn_phys.scn_min_txg ||
 	    phys_birth >= scn->scn_phys.scn_max_txg)
 		return (0);
 
 	count_block(dp->dp_blkstats, bp);
 
 	ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn));
 	if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) {
 		zio_flags |= ZIO_FLAG_SCRUB;
 		zio_priority = ZIO_PRIORITY_SCRUB;
 		needs_io = B_TRUE;
 		scan_delay = zfs_scrub_delay;
-	} else if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) {
+	} else {
+		ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER);
 		zio_flags |= ZIO_FLAG_RESILVER;
 		zio_priority = ZIO_PRIORITY_RESILVER;
 		needs_io = B_FALSE;
 		scan_delay = zfs_resilver_delay;
 	}
 
 	/* If it's an intent log block, failure is expected. */
 	if (zb->zb_level == ZB_ZIL_LEVEL)
 		zio_flags |= ZIO_FLAG_SPECULATIVE;
 
 	for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
 		vdev_t *vd = vdev_lookup_top(spa,
 		    DVA_GET_VDEV(&bp->blk_dva[d]));
 
 		/*
 		 * Keep track of how much data we've examined so that
 		 * zpool(1M) status can make useful progress reports.
 		 */
 		scn->scn_phys.scn_examined += DVA_GET_ASIZE(&bp->blk_dva[d]);
 		spa->spa_scan_pass_exam += DVA_GET_ASIZE(&bp->blk_dva[d]);
 
 		/* if it's a resilver, this may not be in the target range */
 		if (!needs_io) {
 			if (DVA_GET_GANG(&bp->blk_dva[d])) {
 				/*
 				 * Gang members may be spread across multiple
 				 * vdevs, so the best estimate we have is the
 				 * scrub range, which has already been checked.
 				 * XXX -- it would be better to change our
 				 * allocation policy to ensure that all
 				 * gang members reside on the same vdev.
 				 */
 				needs_io = B_TRUE;
 			} else {
 				needs_io = vdev_dtl_contains(vd, DTL_PARTIAL,
 				    phys_birth, 1);
 			}
 		}
 	}
 
 	if (needs_io && !zfs_no_scrub_io) {
 		vdev_t *rvd = spa->spa_root_vdev;
 		/* Cap on concurrent scan reads, scaled by top-level vdevs. */
 		uint64_t maxinflight = rvd->vdev_children *
 		    MAX(zfs_top_maxinflight, 1);
 		void *data = zio_data_buf_alloc(size);
 
 		/* Block until an in-flight slot is free, then claim it. */
 		mutex_enter(&spa->spa_scrub_lock);
 		while (spa->spa_scrub_inflight >= maxinflight)
 			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
 		spa->spa_scrub_inflight++;
 		mutex_exit(&spa->spa_scrub_lock);
 
 		/*
 		 * If we're seeing recent (zfs_scan_idle) "important" I/Os
 		 * then throttle our workload to limit the impact of a scan.
 		 */
 		if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle)
 			delay(MAX((int)scan_delay, 0));
 
 		zio_nowait(zio_read(NULL, spa, bp, data, size,
 		    dsl_scan_scrub_done, NULL, zio_priority,
 		    zio_flags, zb));
 	}
 
 	/* do not relocate this block */
 	return (0);
 }
 
 /*
  * Start a scan of type "func" on the pool.  The vdevs are reopened
  * first (see the comment below); the scan itself is then set up
  * through dsl_sync_task_do() using dsl_scan_setup_check() and
  * dsl_scan_setup_sync().  Returns the result of dsl_sync_task_do().
  */
 int
 dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
 {
 	spa_t *spa = dp->dp_spa;
 
 	/*
 	 * Purge all vdev caches and probe all devices.  We do this here
 	 * rather than in sync context because this requires a writer lock
 	 * on the spa_config lock, which we can't do from sync context.  The
 	 * spa_scrub_reopen flag indicates that vdev_open() should not
 	 * attempt to start another scrub.
 	 */
 	spa_vdev_state_enter(spa, SCL_NONE);
 	spa->spa_scrub_reopen = B_TRUE;
 	vdev_reopen(spa->spa_root_vdev);
 	spa->spa_scrub_reopen = B_FALSE;
 	(void) spa_vdev_state_exit(spa, NULL, 0);
 
 	return (dsl_sync_task_do(dp, dsl_scan_setup_check,
 	    dsl_scan_setup_sync, dp->dp_scan, &func, 0));
 }
Index: user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c
===================================================================
--- user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c	(revision 247191)
+++ user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c	(revision 247192)
@@ -1,124 +1,127 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*
  * We keep our own copy of this algorithm for 3 main reasons:
  *	1. If we didn't, anyone modifying common/os/compress.c would
  *         directly break our on disk format
  *	2. Our version of lzjb does not have a number of checks that the
  *         common/os version needs and uses
  *	3. We initialize the lempel to ensure deterministic results,
  *	   so that identical blocks can always be deduplicated.
  * In particular, we are adding the "feature" that compress() can
  * take a destination buffer size and returns the compressed length, or the
  * source length if compression would overflow the destination buffer.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/types.h>
+#include <sys/param.h>
 
 #define	MATCH_BITS	6
 #define	MATCH_MIN	3
 #define	MATCH_MAX	((1 << MATCH_BITS) + (MATCH_MIN - 1))
 #define	OFFSET_MASK	((1 << (16 - MATCH_BITS)) - 1)
 #define	LEMPEL_SIZE	1024
 
 /*
  * Compress s_len bytes at s_start into d_start (capacity d_len).
  * Returns the number of bytes written, or s_len when the destination
  * is judged too small at the start of a new copymap group.  "n" is
  * unused.
  *
  * Output layout, as produced by the loop below: a copymap byte is
  * emitted before every group of NBBY items; each item is either a
  * literal byte or, when its copymap bit is set, a two-byte pair that
  * packs (match length - MATCH_MIN) and the backward offset.
  */
 /*ARGSUSED*/
 size_t
 lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
 {
 	uchar_t *src = s_start;
 	uchar_t *dst = d_start;
-	uchar_t *cpy, *copymap;
+	uchar_t *cpy;
+	uchar_t *copymap = NULL;
 	int copymask = 1 << (NBBY - 1);
 	int mlen, offset, hash;
 	uint16_t *hp;
 	uint16_t lempel[LEMPEL_SIZE] = { 0 };
 
 	while (src < (uchar_t *)s_start + s_len) {
 		/*
 		 * copymask starts at the top bit, so the first iteration
 		 * always opens a new copymap byte before any item is
 		 * written.
 		 */
 		if ((copymask <<= 1) == (1 << NBBY)) {
 			if (dst >= (uchar_t *)d_start + d_len - 1 - 2 * NBBY)
 				return (s_len);
 			copymask = 1;
 			copymap = dst;
 			*dst++ = 0;
 		}
 		/* Within MATCH_MAX of the end of input: emit a literal. */
 		if (src > (uchar_t *)s_start + s_len - MATCH_MAX) {
 			*dst++ = *src++;
 			continue;
 		}
 		hash = (src[0] << 16) + (src[1] << 8) + src[2];
 		hash += hash >> 9;
 		hash += hash >> 5;
 		/*
 		 * Each lempel[] slot holds the low 16 bits of the last
 		 * source address that hashed to it; the candidate match
 		 * offset is derived from it and masked to OFFSET_MASK.
 		 */
 		hp = &lempel[hash & (LEMPEL_SIZE - 1)];
 		offset = (intptr_t)(src - *hp) & OFFSET_MASK;
 		*hp = (uint16_t)(uintptr_t)src;
 		cpy = src - offset;
 		if (cpy >= (uchar_t *)s_start && cpy != src &&
 		    src[0] == cpy[0] && src[1] == cpy[1] && src[2] == cpy[2]) {
 			*copymap |= copymask;
 			for (mlen = MATCH_MIN; mlen < MATCH_MAX; mlen++)
 				if (src[mlen] != cpy[mlen])
 					break;
 			*dst++ = ((mlen - MATCH_MIN) << (NBBY - MATCH_BITS)) |
 			    (offset >> NBBY);
 			*dst++ = (uchar_t)offset;
 			src += mlen;
 		} else {
 			*dst++ = *src++;
 		}
 	}
 	return (dst - (uchar_t *)d_start);
 }
 
 /*
  * Decompress from s_start into d_start until d_len bytes have been
  * produced.  Returns 0 on success, or -1 if a match refers to data
  * before d_start.  "n" is unused.
  *
  * NOTE(review): s_len is never consulted in this function, so reads
  * from src are not bounds-checked here.
  */
 /*ARGSUSED*/
 int
 lzjb_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
 {
 	uchar_t *src = s_start;
 	uchar_t *dst = d_start;
 	uchar_t *d_end = (uchar_t *)d_start + d_len;
-	uchar_t *cpy, copymap;
+	uchar_t *cpy;
+	uchar_t copymap = 0;
 	int copymask = 1 << (NBBY - 1);
 
 	while (dst < d_end) {
 		/* Every NBBY items, fetch the next copymap byte. */
 		if ((copymask <<= 1) == (1 << NBBY)) {
 			copymask = 1;
 			copymap = *src++;
 		}
 		if (copymap & copymask) {
 			/* Two-byte match: length in the high bits, offset below. */
 			int mlen = (src[0] >> (NBBY - MATCH_BITS)) + MATCH_MIN;
 			int offset = ((src[0] << NBBY) | src[1]) & OFFSET_MASK;
 			src += 2;
 			if ((cpy = dst - offset) < (uchar_t *)d_start)
 				return (-1);
 			/* The copy is truncated if it would run past d_end. */
 			while (--mlen >= 0 && dst < d_end)
 				*dst++ = *cpy++;
 		} else {
 			*dst++ = *src++;
 		}
 	}
 	return (0);
 }
Index: user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c
===================================================================
--- user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c	(revision 247191)
+++ user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c	(revision 247192)
@@ -1,223 +1,223 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/refcount.h>
 
 #ifdef	ZFS_DEBUG
 
 #ifdef _KERNEL
 int reference_tracking_enable = FALSE; /* runs out of memory too easily */
 #else
 int reference_tracking_enable = TRUE;
 #endif
 int reference_history = 4; /* tunable */
 
 static kmem_cache_t *reference_cache;
 static kmem_cache_t *reference_history_cache;
 
 /*
  * Create the two kmem caches used by reference tracking: one for
  * reference_t records and one for uint64_t-sized objects stored in
  * ref_removed.
  */
 void
 refcount_sysinit(void)
 {
 	reference_cache = kmem_cache_create("reference_cache",
 	    sizeof (reference_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 
 	reference_history_cache = kmem_cache_create("reference_history_cache",
 	    sizeof (uint64_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 }
 
 /*
  * Destroy the kmem caches created by refcount_sysinit().
  */
 void
 refcount_fini(void)
 {
 	kmem_cache_destroy(reference_cache);
 	kmem_cache_destroy(reference_history_cache);
 }
 
 /*
  * Initialize rc: its mutex, the list of current references, the list
  * of removed references, and both counters (set to 0).
  */
 void
 refcount_create(refcount_t *rc)
 {
 	mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL);
 	list_create(&rc->rc_list, sizeof (reference_t),
 	    offsetof(reference_t, ref_link));
 	list_create(&rc->rc_removed, sizeof (reference_t),
 	    offsetof(reference_t, ref_link));
 	rc->rc_count = 0;
 	rc->rc_removed_count = 0;
 }
 
 /*
  * Tear down rc, which is expected (asserted) to hold exactly "number"
  * references.  Every reference_t still on rc_list and rc_removed is
  * freed, along with the ref_removed object of each removed entry, and
  * then both lists and the mutex are destroyed.
  */
 void
 refcount_destroy_many(refcount_t *rc, uint64_t number)
 {
 	reference_t *ref;
 
 	ASSERT(rc->rc_count == number);
 	/* Assignment in the condition is intentional: loop until empty. */
 	while (ref = list_head(&rc->rc_list)) {
 		list_remove(&rc->rc_list, ref);
 		kmem_cache_free(reference_cache, ref);
 	}
 	list_destroy(&rc->rc_list);
 
 	while (ref = list_head(&rc->rc_removed)) {
 		list_remove(&rc->rc_removed, ref);
 		kmem_cache_free(reference_history_cache, ref->ref_removed);
 		kmem_cache_free(reference_cache, ref);
 	}
 	list_destroy(&rc->rc_removed);
 	mutex_destroy(&rc->rc_mtx);
 }
 
 /*
  * Tear down rc, which is expected to hold no references; equivalent to
  * refcount_destroy_many(rc, 0).
  */
 void
 refcount_destroy(refcount_t *rc)
 {
 	refcount_destroy_many(rc, 0);
 }
 
 /*
  * Returns non-zero if rc currently holds no references.  rc_count is
  * read without taking rc_mtx.
  */
 int
 refcount_is_zero(refcount_t *rc)
 {
 	ASSERT(rc->rc_count >= 0);
 	return (rc->rc_count == 0);
 }
 
 /*
  * Returns the current value of rc_count.  The value is read without
  * taking rc_mtx.
  */
 int64_t
 refcount_count(refcount_t *rc)
 {
 	ASSERT(rc->rc_count >= 0);
 	return (rc->rc_count);
 }
 
 /*
  * Add "number" references to rc on behalf of "holder" and return the
  * new count.  When reference_tracking_enable is set, a reference_t
  * recording the holder and number is allocated (before the mutex is
  * taken) and inserted at the head of rc_list.
  */
 int64_t
 refcount_add_many(refcount_t *rc, uint64_t number, void *holder)
 {
-	reference_t *ref;
+	reference_t *ref = NULL;
 	int64_t count;
 
 	if (reference_tracking_enable) {
 		ref = kmem_cache_alloc(reference_cache, KM_SLEEP);
 		ref->ref_holder = holder;
 		ref->ref_number = number;
 	}
 	mutex_enter(&rc->rc_mtx);
 	ASSERT(rc->rc_count >= 0);
 	if (reference_tracking_enable)
 		list_insert_head(&rc->rc_list, ref);
 	rc->rc_count += number;
 	count = rc->rc_count;
 	mutex_exit(&rc->rc_mtx);
 
 	return (count);
 }
 
 /*
  * Add a single reference held by "holder"; returns the new count.
  */
 int64_t
 refcount_add(refcount_t *rc, void *holder)
 {
 	return (refcount_add_many(rc, 1, holder));
 }
 
 /*
  * Remove "number" references held by "holder" from rc and return the
  * new count.
  *
  * Without reference tracking only the counter is adjusted.  With
  * tracking, rc_list is searched for an entry whose holder and number
  * both match; if none is found the function panics.  The matching
  * entry is either freed or, when reference_history is positive, moved
  * to the head of rc_removed, with the tail entry freed once
  * rc_removed_count reaches reference_history.
  */
 int64_t
 refcount_remove_many(refcount_t *rc, uint64_t number, void *holder)
 {
 	reference_t *ref;
 	int64_t count;
 
 	mutex_enter(&rc->rc_mtx);
 	ASSERT(rc->rc_count >= number);
 
 	if (!reference_tracking_enable) {
 		rc->rc_count -= number;
 		count = rc->rc_count;
 		mutex_exit(&rc->rc_mtx);
 		return (count);
 	}
 
 	for (ref = list_head(&rc->rc_list); ref;
 	    ref = list_next(&rc->rc_list, ref)) {
 		if (ref->ref_holder == holder && ref->ref_number == number) {
 			list_remove(&rc->rc_list, ref);
 			if (reference_history > 0) {
 				ref->ref_removed =
 				    kmem_cache_alloc(reference_history_cache,
 				    KM_SLEEP);
 				list_insert_head(&rc->rc_removed, ref);
 				rc->rc_removed_count++;
 				/* Trim the history from the tail end. */
 				if (rc->rc_removed_count >= reference_history) {
 					ref = list_tail(&rc->rc_removed);
 					list_remove(&rc->rc_removed, ref);
 					kmem_cache_free(reference_history_cache,
 					    ref->ref_removed);
 					kmem_cache_free(reference_cache, ref);
 					rc->rc_removed_count--;
 				}
 			} else {
 				kmem_cache_free(reference_cache, ref);
 			}
 			rc->rc_count -= number;
 			count = rc->rc_count;
 			mutex_exit(&rc->rc_mtx);
 			return (count);
 		}
 	}
 	/* No matching hold was found. */
 	panic("No such hold %p on refcount %llx", holder,
 	    (u_longlong_t)(uintptr_t)rc);
 	return (-1);
 }
 
 /*
  * Remove a single reference held by "holder"; returns the new count.
  */
 int64_t
 refcount_remove(refcount_t *rc, void *holder)
 {
 	return (refcount_remove_many(rc, 1, holder));
 }
 
 /*
  * Move every reference (counts, current entries and removed-history
  * entries) from src to dst, leaving src with zero counts and empty
  * lists.  The entries are staged on local lists so that the two
  * mutexes are taken one after the other and never held together.
  */
 void
 refcount_transfer(refcount_t *dst, refcount_t *src)
 {
 	int64_t count, removed_count;
 	list_t list, removed;
 
 	list_create(&list, sizeof (reference_t),
 	    offsetof(reference_t, ref_link));
 	list_create(&removed, sizeof (reference_t),
 	    offsetof(reference_t, ref_link));
 
 	/* Detach everything from src under its own lock. */
 	mutex_enter(&src->rc_mtx);
 	count = src->rc_count;
 	removed_count = src->rc_removed_count;
 	src->rc_count = 0;
 	src->rc_removed_count = 0;
 	list_move_tail(&list, &src->rc_list);
 	list_move_tail(&removed, &src->rc_removed);
 	mutex_exit(&src->rc_mtx);
 
 	/* Attach it to dst under dst's lock. */
 	mutex_enter(&dst->rc_mtx);
 	dst->rc_count += count;
 	dst->rc_removed_count += removed_count;
 	list_move_tail(&dst->rc_list, &list);
 	list_move_tail(&dst->rc_removed, &removed);
 	mutex_exit(&dst->rc_mtx);
 
 	list_destroy(&list);
 	list_destroy(&removed);
 }
 
 #endif	/* ZFS_DEBUG */
Index: user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c
===================================================================
--- user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c	(revision 247191)
+++ user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c	(revision 247192)
@@ -1,2002 +1,2004 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  * Portions Copyright 2011 iXsystems, Inc
  * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sysmacros.h>
 #include <sys/dmu.h>
 #include <sys/dmu_impl.h>
 #include <sys/dmu_objset.h>
 #include <sys/dbuf.h>
 #include <sys/dnode.h>
 #include <sys/zap.h>
 #include <sys/sa.h>
 #include <sys/sunddi.h>
 #include <sys/sa_impl.h>
 #include <sys/dnode.h>
 #include <sys/errno.h>
 #include <sys/zfs_context.h>
 
 /*
  * ZFS System attributes:
  *
  * A generic mechanism to allow for arbitrary attributes
  * to be stored in a dnode.  The data will be stored in the bonus buffer of
  * the dnode and if necessary a special "spill" block will be used to handle
  * overflow situations.  The spill block will be sized to fit the data
  * from 512 - 128K.  When a spill block is used the BP (blkptr_t) for the
  * spill block is stored at the end of the current bonus buffer.  Any
  * attributes that would be in the way of the blkptr_t will be relocated
  * into the spill block.
  *
  * Attribute registration:
  *
  * Stored persistently on a per dataset basis
  * a mapping between attribute "string" names and their actual attribute
  * numeric values, length, and byteswap function.  The names are only used
  * during registration.  All  attributes are known by their unique attribute
  * id value.  If an attribute can have a variable size then the value
  * 0 will be used to indicate this.
  *
  * Attribute Layout:
  *
  * Attribute layouts are a way to compactly store multiple attributes, but
  * without taking the overhead associated with managing each attribute
  * individually.  Since you will typically have the same set of attributes
  * stored in the same order a single table will be used to represent that
  * layout.  The ZPL for example will usually have only about 10 different
  * layouts (regular files, device files, symlinks,
  * regular files + scanstamp, files/dir with extended attributes, and then
  * you have the possibility of all of those minus ACL, because it would
  * be kicked out into the spill block)
  *
  * Layouts are simply an array of the attributes and their
  * ordering i.e. [0, 1, 4, 5, 2]
  *
  * Each distinct layout is given a unique layout number and that is what's
  * stored in the header at the beginning of the SA data buffer.
  *
  * A layout only covers a single dbuf (bonus or spill).  If a set of
  * attributes is split up between the bonus buffer and a spill buffer then
  * two different layouts will be used.  This allows us to byteswap the
  * spill without looking at the bonus buffer and keeps the on disk format of
  * the bonus and spill buffer the same.
  *
  * Adding a single attribute will cause the entire set of attributes to
  * be rewritten and could result in a new layout number being constructed
  * as part of the rewrite if no such layout exists for the new set of
  * attributes.  The new attribute will be appended to the end of the already
  * existing attributes.
  *
  * Both the attribute registration and attribute layout information are
  * stored in normal ZAP attributes.  There should be a small number of
  * known layouts and the set of attributes is assumed to typically be quite
  * small.
  *
  * The registered attributes and layout "table" information is maintained
  * in core and a special "sa_os_t" is attached to the objset_t.
  *
  * A special interface is provided to allow for quickly applying
  * a large set of attributes at once.  sa_replace_all_by_template() is
  * used to set an array of attributes.  This is used by the ZPL when
  * creating a brand new file.  The template that is passed into the function
  * specifies the attribute, size for variable length attributes, location of
  * data and special "data locator" function if the data isn't in a contiguous
  * location.
  *
  * Byteswap implications:
  * Since the SA attributes are not entirely self describing we can't do
  * the normal byteswap processing.  The special ZAP layout attribute and
  * attribute registration attributes define the byteswap function and the
  * size of the attributes, unless it is variable sized.
  * The normal ZFS byteswapping infrastructure assumes you don't need
  * to read any objects in order to do the necessary byteswapping.  Whereas
  * SA attributes can only be properly byteswapped if the dataset is opened
  * and the layout/attribute ZAP attributes are available.  Because of this
  * the SA attributes will be byteswapped when they are first accessed by
  * the SA code that will read the SA data.
  */
 
 /*
  * Callback type handed to sa_attr_iter().  sa_build_idx_tab() is one
  * implementation; there the unnamed sa_attr_type_t is the attribute id and
  * the unnamed boolean_t says whether the attribute is variable length.
  */
 typedef void (sa_iterfunc_t)(void *hdr, void *addr, sa_attr_type_t,
     uint16_t length, int length_idx, boolean_t, void *userp);
 
 /* Forward declarations of file-local helpers. */
 static int sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype);
 static void sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab);
 static void *sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype,
     void *data);
 static void sa_idx_tab_rele(objset_t *os, void *arg);
 static void sa_copy_data(sa_data_locator_t *func, void *start, void *target,
     int buflen);
 static int sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
     sa_data_op_t action, sa_data_locator_t *locator, void *datastart,
     uint16_t buflen, dmu_tx_t *tx);
 
 /*
  * Table of byteswap routines available to SA attributes.
  * NOTE(review): presumably indexed by an attribute's sa_byteswap value
  * (SA_UINT64_ARRAY, ..., SA_UINT8_ARRAY, ACL) -- confirm against the enum
  * declaration before reordering entries.
  */
 arc_byteswap_func_t *sa_bswap_table[] = {
 	byteswap_uint64_array,
 	byteswap_uint32_array,
 	byteswap_uint16_array,
 	byteswap_uint8_array,
 	zfs_acl_byteswap,
 };
 
 /*
  * Copy l bytes of attribute data from s to t.
  *
  * When no data locator function f is supplied, lengths of exactly 8 and 16
  * bytes are copied with direct uint64_t stores and every other length goes
  * through bcopy().  When f is non-NULL the copy is delegated to
  * sa_copy_data().
  *
  * The arguments are expanded more than once and without parentheses, so
  * callers must pass expressions that are free of side effects.
  */
 #define	SA_COPY_DATA(f, s, t, l) \
 	{ \
 		if (f == NULL) { \
 			if (l == 8) { \
 				*(uint64_t *)t = *(uint64_t *)s; \
 			} else if (l == 16) { \
 				*(uint64_t *)t = *(uint64_t *)s; \
 				*(uint64_t *)((uintptr_t)t + 8) = \
 				    *(uint64_t *)((uintptr_t)s + 8); \
 			} else { \
 				bcopy(s, t, l); \
 			} \
 		} else \
 			sa_copy_data(f, s, t, l); \
 	}
 
 /*
  * This table is fixed and cannot be changed.  Its purpose is to
  * allow the SA code to work with both old/new ZPL file systems.
  * It contains the list of legacy attributes.  These attributes aren't
  * stored in the "attribute" registry zap objects, since older ZPL file systems
  * won't have the registry.  Only objsets of type ZFS_TYPE_FILESYSTEM will
  * use this static table.
  */
 /*
  * Each entry is { name, length in bytes, byteswap type, attribute number }.
  * The attribute numbers run 0..15 and equal the entry's index; the table
  * setup code indexes the attribute table by that same position.
  * NOTE(review): field order is taken from how the initializers read;
  * confirm against the sa_attr_reg_t declaration.
  */
 sa_attr_reg_t sa_legacy_attrs[] = {
 	{"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0},
 	{"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1},
 	{"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2},
 	{"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3},
 	{"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4},
 	{"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5},
 	{"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6},
 	{"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7},
 	{"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8},
 	{"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9},
 	{"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10},
 	{"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11},
 	{"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12},
 	{"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13},
 	{"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14},
 	{"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15},
 };
 
 /*
  * ZPL legacy layout
  * This is only used for objects of type DMU_OT_ZNODE
  */
 /* The sixteen legacy attributes, in attribute-number order. */
 sa_attr_type_t sa_legacy_zpl_layout[] = {
     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
 };
 
 /*
  * Special dummy layout used for buffers with no attributes.
  * sa_setup() registers it as layout number 1 with an attribute count of 0.
  */
 
 sa_attr_type_t sa_dummy_zpl_layout[] = { 0 };
 
 /* Number of entries in sa_legacy_attrs[] and sa_legacy_zpl_layout[]. */
 static int sa_legacy_attr_count = 16;
 /* kmem cache of sa_handle_t; created by sa_cache_init(). */
 static kmem_cache_t *sa_cache = NULL;
 
 /*ARGSUSED*/
 static int
 sa_cache_constructor(void *buf, void *unused, int kmflag)
 {
 	sa_handle_t *hdl = buf;
 
 	hdl->sa_bonus_tab = NULL;
 	hdl->sa_spill_tab = NULL;
 	hdl->sa_os = NULL;
 	hdl->sa_userp = NULL;
 	hdl->sa_bonus = NULL;
 	hdl->sa_spill = NULL;
 	mutex_init(&hdl->sa_lock, NULL, MUTEX_DEFAULT, NULL);
 	return (0);
 }
 
 /*ARGSUSED*/
 static void
 sa_cache_destructor(void *buf, void *unused)
 {
 	sa_handle_t *hdl = buf;
 	mutex_destroy(&hdl->sa_lock);
 }
 
 /*
  * Create the global kmem cache used for sa_handle_t allocations, wiring in
  * sa_cache_constructor() and sa_cache_destructor().
  */
 void
 sa_cache_init(void)
 {
 	sa_cache = kmem_cache_create("sa_cache",
 	    sizeof (sa_handle_t), 0, sa_cache_constructor,
 	    sa_cache_destructor, NULL, NULL, NULL, 0);
 }
 
 void
 sa_cache_fini(void)
 {
 	if (sa_cache)
 		kmem_cache_destroy(sa_cache);
 }
 
 /*
  * AVL comparator ordering sa_lot_t nodes by layout number.
  * Returns -1, 0 or 1.
  */
 static int
 layout_num_compare(const void *arg1, const void *arg2)
 {
 	const sa_lot_t *lhs = arg1;
 	const sa_lot_t *rhs = arg2;
 
 	if (lhs->lot_num < rhs->lot_num)
 		return (-1);
 
 	return ((lhs->lot_num > rhs->lot_num) ? 1 : 0);
 }
 
 /*
  * AVL comparator ordering sa_lot_t nodes by layout hash, with the instance
  * number breaking ties between layouts that share a hash.
  * Returns -1, 0 or 1.
  */
 static int
 layout_hash_compare(const void *arg1, const void *arg2)
 {
 	const sa_lot_t *lhs = arg1;
 	const sa_lot_t *rhs = arg2;
 
 	/* Primary key: the hash. */
 	if (lhs->lot_hash != rhs->lot_hash)
 		return ((lhs->lot_hash > rhs->lot_hash) ? 1 : -1);
 
 	/* Secondary key: the instance within that hash. */
 	if (lhs->lot_instance != rhs->lot_instance)
 		return ((lhs->lot_instance > rhs->lot_instance) ? 1 : -1);
 
 	return (0);
 }
 
 /*
  * Compare a layout table entry against an attribute array.
  *
  * Note the inverted convention: the result is 0 when the layout holds
  * exactly the same attributes in the same order, and 1 when the counts or
  * any element differ.
  */
 boolean_t
 sa_layout_equal(sa_lot_t *tbf, sa_attr_type_t *attrs, int count)
 {
 	int pos = 0;
 
 	if (tbf->lot_attr_count != count)
 		return (1);
 
 	while (pos != count) {
 		if (tbf->lot_attrs[pos] != attrs[pos])
 			return (1);
 		pos++;
 	}
 
 	return (0);
 }
 
 /* Per-attribute hash term: a CRC64 table entry picked by the low 8 bits. */
 #define	SA_ATTR_HASH(attr) (zfs_crc64_table[(-1ULL ^ attr) & 0xFF])
 
 /*
  * Hash an attribute list by XOR-ing the per-attribute terms into an
  * all-ones seed.
  */
 static uint64_t
 sa_layout_info_hash(sa_attr_type_t *attrs, int attr_count)
 {
 	uint64_t hash = -1ULL;
 	int pos = 0;
 
 	while (pos != attr_count) {
 		hash ^= SA_ATTR_HASH(attrs[pos]);
 		pos++;
 	}
 
 	return (hash);
 }
 
 /*
  * Make sure the handle has its spill buffer held and indexed.
  * Returns 0 when the spill buffer is (or already was) available, otherwise
  * the error from dmu_spill_hold_existing().
  */
 static int
 sa_get_spill(sa_handle_t *hdl)
 {
 	int rc;
 
 	/* Already held: nothing to do. */
 	if (hdl->sa_spill != NULL)
 		return (0);
 
 	rc = dmu_spill_hold_existing(hdl->sa_bonus, NULL, &hdl->sa_spill);
 	if (rc == 0)
 		VERIFY(0 == sa_build_index(hdl, SA_SPILL));
 
 	return (rc);
 }
 
 /*
  * Main attribute lookup/update function
  * returns 0 for success or non zero for failures
  *
  * Operates on bulk array, first failure will abort further processing
  */
 int
 sa_attr_op(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count,
     sa_data_op_t data_op, dmu_tx_t *tx)
 {
 	sa_os_t *sa = hdl->sa_os->os_sa;
 	int i;
 	int error = 0;
 	/* Buffers already dirtied during this call (SA_BONUS / SA_SPILL bits). */
 	sa_buf_type_t buftypes;
 
 	buftypes = 0;
 
 	ASSERT(count > 0);
 	for (i = 0; i != count; i++) {
 		ASSERT(bulk[i].sa_attr <= hdl->sa_os->os_sa->sa_num_attrs);
 
 		/* sa_addr stays NULL unless the attribute is located below. */
 		bulk[i].sa_addr = NULL;
 		/* First check the bonus buffer */
 
 		if (hdl->sa_bonus_tab && TOC_ATTR_PRESENT(
 		    hdl->sa_bonus_tab->sa_idx_tab[bulk[i].sa_attr])) {
 			SA_ATTR_INFO(sa, hdl->sa_bonus_tab,
 			    SA_GET_HDR(hdl, SA_BONUS),
 			    bulk[i].sa_attr, bulk[i], SA_BONUS, hdl);
 			/* With a tx, dirty the bonus buffer once per call. */
 			if (tx && !(buftypes & SA_BONUS)) {
 				dmu_buf_will_dirty(hdl->sa_bonus, tx);
 				buftypes |= SA_BONUS;
 			}
 		}
 		/* Not in the bonus buffer: try the spill buffer, if one exists. */
 		if (bulk[i].sa_addr == NULL &&
 		    ((error = sa_get_spill(hdl)) == 0)) {
 			if (TOC_ATTR_PRESENT(
 			    hdl->sa_spill_tab->sa_idx_tab[bulk[i].sa_attr])) {
 				SA_ATTR_INFO(sa, hdl->sa_spill_tab,
 				    SA_GET_HDR(hdl, SA_SPILL),
 				    bulk[i].sa_attr, bulk[i], SA_SPILL, hdl);
 				/*
 				 * The spill buffer is dirtied here only when
 				 * the stored size equals the requested length.
 				 */
 				if (tx && !(buftypes & SA_SPILL) &&
 				    bulk[i].sa_size == bulk[i].sa_length) {
 					dmu_buf_will_dirty(hdl->sa_spill, tx);
 					buftypes |= SA_SPILL;
 				}
 			}
 		}
 		/*
 		 * ENOENT from sa_get_spill() is tolerated; any other error
 		 * aborts the bulk operation, with ECKSUM reported as EIO.
 		 */
 		if (error && error != ENOENT) {
 			return ((error == ECKSUM) ? EIO : error);
 		}
 
 		switch (data_op) {
 		case SA_LOOKUP:
 			if (bulk[i].sa_addr == NULL)
 				return (ENOENT);
 			/* A NULL sa_data skips the copy; sa_addr is still set. */
 			if (bulk[i].sa_data) {
 				SA_COPY_DATA(bulk[i].sa_data_func,
 				    bulk[i].sa_addr, bulk[i].sa_data,
 				    bulk[i].sa_size);
 			}
 			continue;
 
 		case SA_UPDATE:
 			/* existing rewrite of attr */
 			if (bulk[i].sa_addr &&
 			    bulk[i].sa_size == bulk[i].sa_length) {
 				SA_COPY_DATA(bulk[i].sa_data_func,
 				    bulk[i].sa_data, bulk[i].sa_addr,
 				    bulk[i].sa_length);
 				continue;
 			} else if (bulk[i].sa_addr) { /* attr size change */
 				error = sa_modify_attrs(hdl, bulk[i].sa_attr,
 				    SA_REPLACE, bulk[i].sa_data_func,
 				    bulk[i].sa_data, bulk[i].sa_length, tx);
 			} else { /* adding new attribute */
 				error = sa_modify_attrs(hdl, bulk[i].sa_attr,
 				    SA_ADD, bulk[i].sa_data_func,
 				    bulk[i].sa_data, bulk[i].sa_length, tx);
 			}
 			if (error)
 				return (error);
 			break;
 		}
 	}
 	return (error);
 }
 
 /*
  * Allocate a layout table entry for the given attribute list and insert it
  * into both the layout-number and layout-hash AVL trees.
  *
  * os          objset whose SA state (os->os_sa) receives the entry
  * attrs       attribute ids of the layout; copied into the new entry
  * attr_count  number of entries in attrs
  * lot_num     layout number to assign
  * hash        precomputed hash of attrs
  * zapadd      when true, also record the layout in the layout ZAP object,
  *             creating that object first if it does not exist yet
  * tx          transaction used for the ZAP create/update
  *
  * The caller must hold sa->sa_lock.  Returns the new entry.
  */
 static sa_lot_t *
 sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count,
     uint64_t lot_num, uint64_t hash, boolean_t zapadd, dmu_tx_t *tx)
 {
 	sa_os_t *sa = os->os_sa;
 	sa_lot_t *tb, *findtb;
 	int i;
 	avl_index_t loc;
 
 	ASSERT(MUTEX_HELD(&sa->sa_lock));
 	tb = kmem_zalloc(sizeof (sa_lot_t), KM_SLEEP);
 	tb->lot_attr_count = attr_count;
 	tb->lot_attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count,
 	    KM_SLEEP);
 	bcopy(attrs, tb->lot_attrs, sizeof (sa_attr_type_t) * attr_count);
 	tb->lot_num = lot_num;
 	tb->lot_hash = hash;
 	tb->lot_instance = 0;
 
 	if (zapadd) {
 		/* ZAP entry name: the layout number printed in decimal. */
 		char attr_name[8];
 
 		if (sa->sa_layout_attr_obj == 0) {
 			sa->sa_layout_attr_obj = zap_create_link(os,
 			    DMU_OT_SA_ATTR_LAYOUTS,
 			    sa->sa_master_obj, SA_LAYOUTS, tx);
 		}
 
 		(void) snprintf(attr_name, sizeof (attr_name),
 		    "%d", (int)lot_num);
 		/* The value is the attribute list as 2-byte integers. */
 		VERIFY(0 == zap_update(os, os->os_sa->sa_layout_attr_obj,
 		    attr_name, 2, attr_count, attrs, tx));
 	}
 
 	list_create(&tb->lot_idx_tab, sizeof (sa_idx_tab_t),
 	    offsetof(sa_idx_tab_t, sa_next));
 
 	/* Count attributes whose registered length is 0 (variable sized). */
 	for (i = 0; i != attr_count; i++) {
 		if (sa->sa_attr_table[tb->lot_attrs[i]].sa_length == 0)
 			tb->lot_var_sizes++;
 	}
 
 	avl_add(&sa->sa_layout_num_tree, tb);
 
 	/* verify we don't have a hash collision */
 	if ((findtb = avl_find(&sa->sa_layout_hash_tree, tb, &loc)) != NULL) {
 		/*
 		 * Walk the entries sharing this hash and bump lot_instance
 		 * until it names an instance that is not yet in use.
 		 */
 		for (; findtb && findtb->lot_hash == hash;
 		    findtb = AVL_NEXT(&sa->sa_layout_hash_tree, findtb)) {
 			if (findtb->lot_instance != tb->lot_instance)
 				break;
 			tb->lot_instance++;
 		}
 	}
 	avl_add(&sa->sa_layout_hash_tree, tb);
 	return (tb);
 }
 
 /*
  * Look up the layout whose hash and attribute list match the arguments,
  * creating (and persisting) a new layout when none matches.  The result is
  * stored through *lot.  Runs entirely under sa->sa_lock.
  */
 static void
 sa_find_layout(objset_t *os, uint64_t hash, sa_attr_type_t *attrs,
     int count, dmu_tx_t *tx, sa_lot_t **lot)
 {
 	sa_os_t *sa = os->os_sa;
 	sa_lot_t key;
 	sa_lot_t *match;
 	avl_index_t where;
 
 	mutex_enter(&sa->sa_lock);
 
 	/* Start at instance 0 of this hash and scan its collision chain. */
 	key.lot_hash = hash;
 	key.lot_instance = 0;
 	for (match = avl_find(&sa->sa_layout_hash_tree, &key, &where);
 	    match != NULL && match->lot_hash == hash;
 	    match = AVL_NEXT(&sa->sa_layout_hash_tree, match)) {
 		if (sa_layout_equal(match, attrs, count) == 0)
 			break;
 	}
 
 	/*
 	 * The scan stops on a usable node only via the break above; running
 	 * off the chain leaves NULL or a node carrying a different hash.
 	 */
 	if (match == NULL || match->lot_hash != hash) {
 		match = sa_add_layout_entry(os, attrs, count,
 		    avl_numnodes(&sa->sa_layout_num_tree), hash, B_TRUE, tx);
 	}
 
 	mutex_exit(&sa->sa_lock);
 	*lot = match;
 }
 
 /*
  * Set the spill buffer's block size to hold `size' bytes, rounded up to a
  * multiple of SPA_MINBLOCKSIZE (a size of 0 selects SPA_MINBLOCKSIZE).
  * Returns EFBIG when size exceeds SPA_MAXBLOCKSIZE, otherwise the result
  * of dbuf_spill_set_blksz().
  */
 static int
 sa_resize_spill(sa_handle_t *hdl, uint32_t size, dmu_tx_t *tx)
 {
 	uint32_t blocksize;
 	int error;
 
 	if (size > SPA_MAXBLOCKSIZE) {
 		ASSERT(0);
 		return (EFBIG);
 	}
 
 	if (size == 0)
 		blocksize = SPA_MINBLOCKSIZE;
 	else
 		blocksize = P2ROUNDUP_TYPED(size, SPA_MINBLOCKSIZE, uint32_t);
 
 	error = dbuf_spill_set_blksz(hdl->sa_spill, blocksize, tx);
 	ASSERT(error == 0);
 
 	return (error);
 }
 
 /*
  * Copy buflen bytes of attribute data into target.
  *
  * Without a locator function the data at datastart is copied in one
  * bcopy().  With one, the locator is called repeatedly; each call yields
  * the address and length of the next piece, and the pieces are laid down
  * back to back in target until buflen bytes have been copied.
  */
 static void
 sa_copy_data(sa_data_locator_t *func, void *datastart, void *target, int buflen)
 {
 	boolean_t first = B_TRUE;
 	caddr_t dst = (caddr_t)target;
 	void *chunk;
 	uint32_t chunk_len;
 	int copied;
 
 	if (func == NULL) {
 		bcopy(datastart, target, buflen);
 		return;
 	}
 
 	for (copied = 0; copied < buflen; copied += chunk_len) {
 		/* `first' is true only on the initial locator call. */
 		func(&chunk, &chunk_len, buflen, first, datastart);
 		bcopy(chunk, dst, chunk_len);
 		dst += chunk_len;
 		first = B_FALSE;
 	}
 }
 
 /*
  * Determine several different sizes
  * first the sa header size
  * the number of bytes to be stored
  * if spill would occur the index in the attribute array is returned
  *
  * the boolean will_spill will be set when spilling is necessary.  It
  * is only set when the buftype is SA_BONUS
  */
 /*
  * sa          SA state of the objset (force-spill flag, registered lengths)
  * attr_desc   attributes to be laid out, in order
  * attr_count  number of entries in attr_desc
  * db          buffer being sized (the bonus or the spill dbuf)
  * buftype     SA_BONUS or SA_SPILL
  * index       out: index of the first attribute that would spill, or -1
  * total       out: number of bytes needed for all attributes, with each
  *             attribute starting on an 8-byte boundary
  * will_spill  out: written only when buftype is SA_BONUS
  *
  * Returns the header size rounded up to a multiple of 8 (0 in the
  * force-spill case).
  */
 static int
 sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count,
     dmu_buf_t *db, sa_buf_type_t buftype, int *index, int *total,
     boolean_t *will_spill)
 {
 	int var_size = 0;
 	int i;
 	int j = -1;
 	int full_space;
 	int hdrsize;
 	boolean_t done = B_FALSE;
 
 	/* Forced spill: nothing is placed in the bonus buffer at all. */
 	if (buftype == SA_BONUS && sa->sa_force_spill) {
 		*total = 0;
 		*index = 0;
 		*will_spill = B_TRUE;
 		return (0);
 	}
 
 	*index = -1;
 	*total = 0;
 
 	if (buftype == SA_BONUS)
 		*will_spill = B_FALSE;
 
 	/* Legacy DMU_OT_ZNODE bonus buffers carry no SA header. */
 	hdrsize = (SA_BONUSTYPE_FROM_DB(db) == DMU_OT_ZNODE) ? 0 :
 	    sizeof (sa_hdr_phys_t);
 
 	full_space = (buftype == SA_BONUS) ? DN_MAX_BONUSLEN : db->db_size;
 	ASSERT(IS_P2ALIGNED(full_space, 8));
 
 	for (i = 0; i != attr_count; i++) {
 		boolean_t is_var_sz;
 
 		*total = P2ROUNDUP(*total, 8);
 		*total += attr_desc[i].sa_length;
 		/* Past the spill point only the running total is kept up. */
 		if (done)
 			goto next;
 
 		is_var_sz = (SA_REGISTERED_LEN(sa, attr_desc[i].sa_attr) == 0);
 		if (is_var_sz) {
 			var_size++;
 		}
 
 		if (is_var_sz && var_size > 1) {
 			if (P2ROUNDUP(hdrsize + sizeof (uint16_t), 8) +
 			    *total < full_space) {
 				/*
 				 * Account for header space used by array of
 				 * optional sizes of variable-length attributes.
 				 * Record the index in case this increase needs
 				 * to be reversed due to spill-over.
 				 */
 				hdrsize += sizeof (uint16_t);
 				j = i;
 			} else {
 				done = B_TRUE;
 				*index = i;
 				if (buftype == SA_BONUS)
 					*will_spill = B_TRUE;
 				continue;
 			}
 		}
 
 		/*
 		 * find index of where spill *could* occur.
 		 * Then continue to count of remainder attribute
 		 * space.  The sum is used later for sizing bonus
 		 * and spill buffer.
 		 */
 		if (buftype == SA_BONUS && *index == -1 &&
 		    (*total + P2ROUNDUP(hdrsize, 8)) >
 		    (full_space - sizeof (blkptr_t))) {
 			*index = i;
 			done = B_TRUE;
 		}
 
 next:
 		if ((*total + P2ROUNDUP(hdrsize, 8)) > full_space &&
 		    buftype == SA_BONUS)
 			*will_spill = B_TRUE;
 	}
 
 	/*
 	 * j holds the index of the last variable-sized attribute for
 	 * which hdrsize was increased.  Reverse the increase if that
 	 * attribute will be relocated to the spill block.
 	 *
 	 * *will_spill is only maintained for SA_BONUS.  When sizing a spill
 	 * buffer the caller may pass a flag that was never initialized, so
 	 * it must not be read in that case.
 	 */
 	if (buftype == SA_BONUS && *will_spill && j == *index)
 		hdrsize -= sizeof (uint16_t);
 
 	hdrsize = P2ROUNDUP(hdrsize, 8);
 	return (hdrsize);
 }
 
 /* Bytes a buffer must hold: attribute data plus its header. */
 #define	BUF_SPACE_NEEDED(total, header) (total + header)
 
 /*
  * Find layout that corresponds to ordering of attributes
  * If not found a new layout number is created and added to
  * persistent layout tables.
  */
 static int
 sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
     dmu_tx_t *tx)
 {
 	sa_os_t *sa = hdl->sa_os->os_sa;
 	uint64_t hash;
 	sa_buf_type_t buftype;
 	sa_hdr_phys_t *sahdr;
 	void *data_start;
 	int buf_space;
 	sa_attr_type_t *attrs, *attrs_start;
 	int i, lot_count;
-	int hdrsize, spillhdrsize;
+	int hdrsize;
+	int spillhdrsize = 0;
 	int used;
 	dmu_object_type_t bonustype;
 	sa_lot_t *lot;
 	int len_idx;
 	int spill_used;
 	boolean_t spilling;
 
 	dmu_buf_will_dirty(hdl->sa_bonus, tx);
 	bonustype = SA_BONUSTYPE_FROM_DB(hdl->sa_bonus);
 
 	/* first determine bonus header size and sum of all attributes */
 	hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus,
 	    SA_BONUS, &i, &used, &spilling);
 
 	if (used > SPA_MAXBLOCKSIZE)
 		return (EFBIG);
 
 	/*
 	 * When spilling, the bonus length is capped so that room for a
 	 * blkptr_t remains at the end of the bonus buffer.
 	 */
 	VERIFY(0 == dmu_set_bonus(hdl->sa_bonus, spilling ?
 	    MIN(DN_MAX_BONUSLEN - sizeof (blkptr_t), used + hdrsize) :
 	    used + hdrsize, tx));
 
 	ASSERT((bonustype == DMU_OT_ZNODE && spilling == 0) ||
 	    bonustype == DMU_OT_SA);
 
 	/* setup and size spill buffer when needed */
 	if (spilling) {
 		/* Receives sa_find_sizes()'s will_spill output; unused. */
 		boolean_t dummy;
 
 		if (hdl->sa_spill == NULL) {
 			VERIFY(dmu_spill_hold_by_bonus(hdl->sa_bonus, NULL,
 			    &hdl->sa_spill) == 0);
 		}
 		dmu_buf_will_dirty(hdl->sa_spill, tx);
 
 		/* Size the spill buffer from attribute i (the spill point) on. */
 		spillhdrsize = sa_find_sizes(sa, &attr_desc[i],
 		    attr_count - i, hdl->sa_spill, SA_SPILL, &i,
 		    &spill_used, &dummy);
 
 		if (spill_used > SPA_MAXBLOCKSIZE)
 			return (EFBIG);
 
 		buf_space = hdl->sa_spill->db_size - spillhdrsize;
 		if (BUF_SPACE_NEEDED(spill_used, spillhdrsize) >
 		    hdl->sa_spill->db_size)
 			VERIFY(0 == sa_resize_spill(hdl,
 			    BUF_SPACE_NEEDED(spill_used, spillhdrsize), tx));
 	}
 
 	/* setup starting pointers to lay down data */
 	data_start = (void *)((uintptr_t)hdl->sa_bonus->db_data + hdrsize);
 	sahdr = (sa_hdr_phys_t *)hdl->sa_bonus->db_data;
 	buftype = SA_BONUS;
 
 	/* Space available for attribute data in the bonus buffer. */
 	if (spilling)
 		buf_space = (sa->sa_force_spill) ?
 		    0 : SA_BLKPTR_SPACE - hdrsize;
 	else
 		buf_space = hdl->sa_bonus->db_size - hdrsize;
 
 	attrs_start = attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count,
 	    KM_SLEEP);
 	lot_count = 0;
 
 	/*
 	 * Lay the attributes down in order, tracking the layout hash and the
 	 * number of attributes (lot_count) placed in the current buffer.
 	 */
 	for (i = 0, len_idx = 0, hash = -1ULL; i != attr_count; i++) {
 		uint16_t length;
 
 		ASSERT(IS_P2ALIGNED(data_start, 8));
 		ASSERT(IS_P2ALIGNED(buf_space, 8));
 		attrs[i] = attr_desc[i].sa_attr;
 		/* A registered length of 0 marks a variable-length attribute. */
 		length = SA_REGISTERED_LEN(sa, attrs[i]);
 		if (length == 0)
 			length = attr_desc[i].sa_length;
 		else
 			VERIFY(length == attr_desc[i].sa_length);
 
 		if (buf_space < length) {  /* switch to spill buffer */
 			VERIFY(spilling);
 			VERIFY(bonustype == DMU_OT_SA);
 			/* Close out the bonus buffer's layout first. */
 			if (buftype == SA_BONUS && !sa->sa_force_spill) {
 				sa_find_layout(hdl->sa_os, hash, attrs_start,
 				    lot_count, tx, &lot);
 				SA_SET_HDR(sahdr, lot->lot_num, hdrsize);
 			}
 
 			/* Restart the per-buffer state for the spill buffer. */
 			buftype = SA_SPILL;
 			hash = -1ULL;
 			len_idx = 0;
 
 			sahdr = (sa_hdr_phys_t *)hdl->sa_spill->db_data;
 			sahdr->sa_magic = SA_MAGIC;
 			data_start = (void *)((uintptr_t)sahdr +
 			    spillhdrsize);
 			attrs_start = &attrs[i];
 			buf_space = hdl->sa_spill->db_size - spillhdrsize;
 			lot_count = 0;
 		}
 		hash ^= SA_ATTR_HASH(attrs[i]);
 		attr_desc[i].sa_addr = data_start;
 		attr_desc[i].sa_size = length;
 		SA_COPY_DATA(attr_desc[i].sa_data_func, attr_desc[i].sa_data,
 		    data_start, length);
 		/* Variable-length attributes record their size in the header. */
 		if (sa->sa_attr_table[attrs[i]].sa_length == 0) {
 			sahdr->sa_lengths[len_idx++] = length;
 		}
 		VERIFY((uintptr_t)data_start % 8 == 0);
 		data_start = (void *)P2ROUNDUP(((uintptr_t)data_start +
 		    length), 8);
 		buf_space -= P2ROUNDUP(length, 8);
 		lot_count++;
 	}
 
 	/* Layout for whichever buffer was being filled when the loop ended. */
 	sa_find_layout(hdl->sa_os, hash, attrs_start, lot_count, tx, &lot);
 
 	/*
 	 * Verify that old znodes always have layout number 0.
 	 * Must be DMU_OT_SA for arbitrary layouts
 	 */
 	VERIFY((bonustype == DMU_OT_ZNODE && lot->lot_num == 0) ||
 	    (bonustype == DMU_OT_SA && lot->lot_num > 1));
 
 	if (bonustype == DMU_OT_SA) {
 		SA_SET_HDR(sahdr, lot->lot_num,
 		    buftype == SA_BONUS ? hdrsize : spillhdrsize);
 	}
 
 	kmem_free(attrs, sizeof (sa_attr_type_t) * attr_count);
 	/* Drop the stale index tables and rebuild them for the new data. */
 	if (hdl->sa_bonus_tab) {
 		sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab);
 		hdl->sa_bonus_tab = NULL;
 	}
 	if (!sa->sa_force_spill)
 		VERIFY(0 == sa_build_index(hdl, SA_BONUS));
 	if (hdl->sa_spill) {
 		sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab);
 		if (!spilling) {
 			/*
 			 * remove spill block that is no longer needed.
 			 */
 			dmu_buf_rele(hdl->sa_spill, NULL);
 			hdl->sa_spill = NULL;
 			hdl->sa_spill_tab = NULL;
 			VERIFY(0 == dmu_rm_spill(hdl->sa_os,
 			    sa_handle_object(hdl), tx));
 		} else {
 			VERIFY(0 == sa_build_index(hdl, SA_SPILL));
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Release the in-core attribute table: every attribute name string, then
  * the table itself.  Does nothing when no table is present; afterwards
  * sa->sa_attr_table is NULL.
  */
 static void
 sa_free_attr_table(sa_os_t *sa)
 {
 	int idx = 0;
 
 	if (sa->sa_attr_table == NULL)
 		return;
 
 	while (idx != sa->sa_num_attrs) {
 		/* Names were allocated with room for the terminating NUL. */
 		if (sa->sa_attr_table[idx].sa_name) {
 			kmem_free(sa->sa_attr_table[idx].sa_name,
 			    strlen(sa->sa_attr_table[idx].sa_name) + 1);
 		}
 		idx++;
 	}
 
 	kmem_free(sa->sa_attr_table,
 	    sizeof (sa_attr_table_t) * sa->sa_num_attrs);
 	sa->sa_attr_table = NULL;
 }
 
 /*
  * Build the in-core attribute table (sa->sa_attr_table) and the caller's
  * translation table (sa->sa_user_table) for an objset.
  *
  * os         objset being set up; os->os_sa must already be allocated
  * reg_attrs  attributes the caller wants to use
  * count      number of entries in reg_attrs
  *
  * Returns 0 on success.  On failure both tables are freed and a non-zero
  * errno is returned (EINVAL when no more specific error is known).
  */
 static int
 sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count)
 {
 	sa_os_t *sa = os->os_sa;
 	uint64_t sa_attr_count = 0;
-	uint64_t sa_reg_count;
+	uint64_t sa_reg_count = 0;
 	int error = 0;
 	uint64_t attr_value;
 	sa_attr_table_t *tb;
 	zap_cursor_t zc;
 	zap_attribute_t za;
 	int registered_count = 0;
 	int i;
 	dmu_objset_type_t ostype = dmu_objset_type(os);
 
 	/* sa_user_table[i] will hold the attribute number of reg_attrs[i]. */
 	sa->sa_user_table =
 	    kmem_zalloc(count * sizeof (sa_attr_type_t), KM_SLEEP);
 	sa->sa_user_table_sz = count * sizeof (sa_attr_type_t);
 
 	if (sa->sa_reg_attr_obj != 0) {
 		error = zap_count(os, sa->sa_reg_attr_obj,
 		    &sa_attr_count);
 
 		/*
 		 * Make sure we retrieved a count and that it isn't zero
 		 */
 		if (error || (error == 0 && sa_attr_count == 0)) {
 			if (error == 0)
 				error = EINVAL;
 			goto bail;
 		}
 		sa_reg_count = sa_attr_count;
 	}
 
 	/* With no registry, a ZFS objset starts from the legacy attributes. */
 	if (ostype == DMU_OST_ZFS && sa_attr_count == 0)
 		sa_attr_count += sa_legacy_attr_count;
 
 	/* Allocate attribute numbers for attributes that aren't registered */
 	for (i = 0; i != count; i++) {
 		boolean_t found = B_FALSE;
 		int j;
 
 		/* Legacy names map to their fixed legacy attribute numbers. */
 		if (ostype == DMU_OST_ZFS) {
 			for (j = 0; j != sa_legacy_attr_count; j++) {
 				if (strcmp(reg_attrs[i].sa_name,
 				    sa_legacy_attrs[j].sa_name) == 0) {
 					sa->sa_user_table[i] =
 					    sa_legacy_attrs[j].sa_attr;
 					found = B_TRUE;
 				}
 			}
 		}
 		if (found)
 			continue;
 
 		if (sa->sa_reg_attr_obj)
 			error = zap_lookup(os, sa->sa_reg_attr_obj,
 			    reg_attrs[i].sa_name, 8, 1, &attr_value);
 		else
 			error = ENOENT;
 		switch (error) {
 		case ENOENT:
 			/* Not registered yet: hand out the next free number. */
 			sa->sa_user_table[i] = (sa_attr_type_t)sa_attr_count;
 			sa_attr_count++;
 			break;
 		case 0:
 			sa->sa_user_table[i] = ATTR_NUM(attr_value);
 			break;
 		default:
 			goto bail;
 		}
 	}
 
 	sa->sa_num_attrs = sa_attr_count;
 	tb = sa->sa_attr_table =
 	    kmem_zalloc(sizeof (sa_attr_table_t) * sa_attr_count, KM_SLEEP);
 
 	/*
 	 * Attribute table is constructed from requested attribute list,
 	 * previously foreign registered attributes, and also the legacy
 	 * ZPL set of attributes.
 	 */
 
 	if (sa->sa_reg_attr_obj) {
 		/* Each registry entry packs number, length and byteswap type. */
 		for (zap_cursor_init(&zc, os, sa->sa_reg_attr_obj);
 		    (error = zap_cursor_retrieve(&zc, &za)) == 0;
 		    zap_cursor_advance(&zc)) {
 			uint64_t value;
 			value  = za.za_first_integer;
 
 			registered_count++;
 			tb[ATTR_NUM(value)].sa_attr = ATTR_NUM(value);
 			tb[ATTR_NUM(value)].sa_length = ATTR_LENGTH(value);
 			tb[ATTR_NUM(value)].sa_byteswap = ATTR_BSWAP(value);
 			tb[ATTR_NUM(value)].sa_registered = B_TRUE;
 
 			if (tb[ATTR_NUM(value)].sa_name) {
 				continue;
 			}
 			tb[ATTR_NUM(value)].sa_name =
 			    kmem_zalloc(strlen(za.za_name) +1, KM_SLEEP);
 			(void) strlcpy(tb[ATTR_NUM(value)].sa_name, za.za_name,
 			    strlen(za.za_name) +1);
 		}
 		zap_cursor_fini(&zc);
 		/*
 		 * Make sure we processed the correct number of registered
 		 * attributes
 		 */
 		if (registered_count != sa_reg_count) {
 			ASSERT(error != 0);
 			goto bail;
 		}
 
 	}
 
 	/* Fill in any legacy slot the registry did not already name. */
 	if (ostype == DMU_OST_ZFS) {
 		for (i = 0; i != sa_legacy_attr_count; i++) {
 			if (tb[i].sa_name)
 				continue;
 			tb[i].sa_attr = sa_legacy_attrs[i].sa_attr;
 			tb[i].sa_length = sa_legacy_attrs[i].sa_length;
 			tb[i].sa_byteswap = sa_legacy_attrs[i].sa_byteswap;
 			tb[i].sa_registered = B_FALSE;
 			tb[i].sa_name =
 			    kmem_zalloc(strlen(sa_legacy_attrs[i].sa_name) +1,
 			    KM_SLEEP);
 			(void) strlcpy(tb[i].sa_name,
 			    sa_legacy_attrs[i].sa_name,
 			    strlen(sa_legacy_attrs[i].sa_name) + 1);
 		}
 	}
 
 	/* Finally add requested attributes that are still unnamed. */
 	for (i = 0; i != count; i++) {
 		sa_attr_type_t attr_id;
 
 		attr_id = sa->sa_user_table[i];
 		if (tb[attr_id].sa_name)
 			continue;
 
 		tb[attr_id].sa_length = reg_attrs[i].sa_length;
 		tb[attr_id].sa_byteswap = reg_attrs[i].sa_byteswap;
 		tb[attr_id].sa_attr = attr_id;
 		tb[attr_id].sa_name =
 		    kmem_zalloc(strlen(reg_attrs[i].sa_name) + 1, KM_SLEEP);
 		(void) strlcpy(tb[attr_id].sa_name, reg_attrs[i].sa_name,
 		    strlen(reg_attrs[i].sa_name) + 1);
 	}
 
 	/* Set when the table holds attributes the registry does not. */
 	sa->sa_need_attr_registration =
 	    (sa_attr_count != registered_count);
 
 	return (0);
 bail:
 	kmem_free(sa->sa_user_table, count * sizeof (sa_attr_type_t));
 	sa->sa_user_table = NULL;
 	sa_free_attr_table(sa);
 	return ((error != 0) ? error : EINVAL);
 }
 
 /*
  * Set up (or look up) the SA state of an objset.
  *
  * os          objset to set up
  * sa_obj      master SA ZAP object; 0 when the objset has none
  * reg_attrs   attributes the caller wants to use
  * count       number of entries in reg_attrs
  * user_table  out: table translating reg_attrs positions to attribute
  *             numbers
  *
  * When os->os_sa already exists its user table is returned and nothing is
  * rebuilt.  Returns 0 on success or an errno, with ECKSUM reported as EIO.
  */
 int
 sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count,
     sa_attr_type_t **user_table)
 {
 	zap_cursor_t zc;
 	zap_attribute_t za;
 	sa_os_t *sa;
 	dmu_objset_type_t ostype = dmu_objset_type(os);
 	sa_attr_type_t *tb;
 	int error;
 
 	mutex_enter(&os->os_lock);
 	if (os->os_sa) {
 		/*
 		 * Already set up.  Taking sa_lock before reading the table
 		 * waits out a setup that is still in progress below.
 		 */
 		mutex_enter(&os->os_sa->sa_lock);
 		mutex_exit(&os->os_lock);
 		tb = os->os_sa->sa_user_table;
 		mutex_exit(&os->os_sa->sa_lock);
 		*user_table = tb;
 		return (0);
 	}
 
 	sa = kmem_zalloc(sizeof (sa_os_t), KM_SLEEP);
 	mutex_init(&sa->sa_lock, NULL, MUTEX_DEFAULT, NULL);
 	sa->sa_master_obj = sa_obj;
 
 	/* Publish the new state, holding sa_lock until it is fully built. */
 	os->os_sa = sa;
 	mutex_enter(&sa->sa_lock);
 	mutex_exit(&os->os_lock);
 	avl_create(&sa->sa_layout_num_tree, layout_num_compare,
 	    sizeof (sa_lot_t), offsetof(sa_lot_t, lot_num_node));
 	avl_create(&sa->sa_layout_hash_tree, layout_hash_compare,
 	    sizeof (sa_lot_t), offsetof(sa_lot_t, lot_hash_node));
 
 	/* Either ZAP object may legitimately be absent (ENOENT). */
 	if (sa_obj) {
 		error = zap_lookup(os, sa_obj, SA_LAYOUTS,
 		    8, 1, &sa->sa_layout_attr_obj);
 		if (error != 0 && error != ENOENT)
 			goto fail;
 		error = zap_lookup(os, sa_obj, SA_REGISTRY,
 		    8, 1, &sa->sa_reg_attr_obj);
 		if (error != 0 && error != ENOENT)
 			goto fail;
 	}
 
 	if ((error = sa_attr_table_setup(os, reg_attrs, count)) != 0)
 		goto fail;
 
 	if (sa->sa_layout_attr_obj != 0) {
 		uint64_t layout_count;
 
 		error = zap_count(os, sa->sa_layout_attr_obj,
 		    &layout_count);
 
 		/*
 		 * Layout number count should be > 0
 		 */
 		if (error || (error == 0 && layout_count == 0)) {
 			if (error == 0)
 				error = EINVAL;
 			goto fail;
 		}
 
 		/*
 		 * Each ZAP entry is one layout: the name is the layout number
 		 * in decimal, the value the attribute list as 2-byte integers.
 		 */
 		for (zap_cursor_init(&zc, os, sa->sa_layout_attr_obj);
 		    (error = zap_cursor_retrieve(&zc, &za)) == 0;
 		    zap_cursor_advance(&zc)) {
 			sa_attr_type_t *lot_attrs;
 			uint64_t lot_num;
 
 			lot_attrs = kmem_zalloc(sizeof (sa_attr_type_t) *
 			    za.za_num_integers, KM_SLEEP);
 
 			if ((error = (zap_lookup(os, sa->sa_layout_attr_obj,
 			    za.za_name, 2, za.za_num_integers,
 			    lot_attrs))) != 0) {
 				kmem_free(lot_attrs, sizeof (sa_attr_type_t) *
 				    za.za_num_integers);
 				break;
 			}
 			VERIFY(ddi_strtoull(za.za_name, NULL, 10,
 			    (unsigned long long *)&lot_num) == 0);
 
 			/* The entry keeps its own copy of lot_attrs. */
 			(void) sa_add_layout_entry(os, lot_attrs,
 			    za.za_num_integers, lot_num,
 			    sa_layout_info_hash(lot_attrs,
 			    za.za_num_integers), B_FALSE, NULL);
 			kmem_free(lot_attrs, sizeof (sa_attr_type_t) *
 			    za.za_num_integers);
 		}
 		zap_cursor_fini(&zc);
 
 		/*
 		 * Make sure layout count matches number of entries added
 		 * to AVL tree
 		 */
 		if (avl_numnodes(&sa->sa_layout_num_tree) != layout_count) {
 			ASSERT(error != 0);
 			goto fail;
 		}
 	}
 
 	/* Add special layout number for old ZNODES */
 	if (ostype == DMU_OST_ZFS) {
 		(void) sa_add_layout_entry(os, sa_legacy_zpl_layout,
 		    sa_legacy_attr_count, 0,
 		    sa_layout_info_hash(sa_legacy_zpl_layout,
 		    sa_legacy_attr_count), B_FALSE, NULL);
 
 		/* Layout 1: the dummy layout, with no attributes and hash 0. */
 		(void) sa_add_layout_entry(os, sa_dummy_zpl_layout, 0, 1,
 		    0, B_FALSE, NULL);
 	}
 	*user_table = os->os_sa->sa_user_table;
 	mutex_exit(&sa->sa_lock);
 	return (0);
 fail:
 	/*
 	 * NOTE(review): this path frees sa without calling avl_destroy() or
 	 * mutex_destroy(), and without freeing layout entries that
 	 * sa_add_layout_entry() may already have inserted -- looks like a
 	 * leak on partial failure; confirm.
 	 */
 	os->os_sa = NULL;
 	sa_free_attr_table(sa);
 	if (sa->sa_user_table)
 		kmem_free(sa->sa_user_table, sa->sa_user_table_sz);
 	mutex_exit(&sa->sa_lock);
 	kmem_free(sa, sizeof (sa_os_t));
 	return ((error == ECKSUM) ? EIO : error);
 }
 
 /*
  * Free all SA state attached to an objset: the user table, the attribute
  * table, every index table hanging off each layout, the layouts
  * themselves, both layout AVL trees, and finally the sa_os_t.
  * os->os_sa is NULL on return.
  */
 void
 sa_tear_down(objset_t *os)
 {
 	sa_os_t *sa = os->os_sa;
 	sa_lot_t *layout;
 	void *cookie = NULL;
 
 	kmem_free(sa->sa_user_table, sa->sa_user_table_sz);
 
 	/* Release the attribute table. */
 	sa_free_attr_table(sa);
 
 	/*
 	 * First pass (hash tree): only the index tables are released here;
 	 * the layout nodes themselves are freed in the second pass below.
 	 */
 	while ((layout = avl_destroy_nodes(&sa->sa_layout_hash_tree,
 	    &cookie)) != NULL) {
 		sa_idx_tab_t *tab;
 
 		while ((tab = list_head(&layout->lot_idx_tab)) != NULL) {
 			ASSERT(refcount_count(&tab->sa_refcount));
 			sa_idx_tab_rele(os, tab);
 		}
 	}
 
 	/* Second pass (number tree): free each layout and its attr array. */
 	cookie = NULL;
 	while ((layout = avl_destroy_nodes(&sa->sa_layout_num_tree,
 	    &cookie)) != NULL) {
 		size_t attrs_sz =
 		    sizeof (sa_attr_type_t) * layout->lot_attr_count;
 
 		kmem_free(layout->lot_attrs, attrs_sz);
 		kmem_free(layout, sizeof (sa_lot_t));
 	}
 
 	avl_destroy(&sa->sa_layout_hash_tree);
 	avl_destroy(&sa->sa_layout_num_tree);
 
 	kmem_free(sa, sizeof (sa_os_t));
 	os->os_sa = NULL;
 }
 
 /*
  * Attribute-iterator callback that fills in one slot of an index table.
  * userp is the sa_idx_tab_t being built.  The slot for `attr' records the
  * attribute's byte offset from the start of the header together with
  * length_idx; variable-length attributes also record their length.
  */
 void
 sa_build_idx_tab(void *hdr, void *attr_addr, sa_attr_type_t attr,
     uint16_t length, int length_idx, boolean_t var_length, void *userp)
 {
 	sa_idx_tab_t *idx_tab = userp;
 	uint32_t hdr_off = (uint32_t)((uintptr_t)attr_addr - (uintptr_t)hdr);
 
 	if (var_length) {
 		ASSERT(idx_tab->sa_variable_lengths);
 		idx_tab->sa_variable_lengths[length_idx] = length;
 	}
 
 	TOC_ATTR_ENCODE(idx_tab->sa_idx_tab[attr], length_idx, hdr_off);
 }
 
 /*
  * Walk the attributes described by a layout, in layout order, and call
  * `func' once for each of them.
  *
  * hdr   - start of the SA header / attribute buffer being walked.
  * type  - bonus type of the buffer; selects how the data start is found.
  * tab   - layout to use, or NULL to look it up in sa_layout_num_tree from
  *         the layout number encoded in the header.
  * userp - opaque argument handed through to `func'.
  */
 static void
 sa_attr_iter(objset_t *os, sa_hdr_phys_t *hdr, dmu_object_type_t type,
     sa_iterfunc_t func, sa_lot_t *tab, void *userp)
 {
 	void *data_start;
 	sa_lot_t *tb = tab;
 	sa_lot_t search;
 	avl_index_t loc;
 	sa_os_t *sa = os->os_sa;
 	int i;
 	uint16_t *length_start = NULL;
 	uint8_t length_idx = 0;
 
 	if (tab == NULL) {
 		search.lot_num = SA_LAYOUT_NUM(hdr, type);
 		tb = avl_find(&sa->sa_layout_num_tree, &search, &loc);
 		ASSERT(tb);
 	}
 
 	/*
 	 * For SA bonus types the attribute data follows the header's
 	 * length array (lot_var_sizes entries), rounded up to 8 bytes.
 	 * For any other type the data starts at hdr itself and
 	 * length_start stays NULL.
 	 */
 	if (IS_SA_BONUSTYPE(type)) {
 		data_start = (void *)P2ROUNDUP(((uintptr_t)hdr +
 		    offsetof(sa_hdr_phys_t, sa_lengths) +
 		    (sizeof (uint16_t) * tb->lot_var_sizes)), 8);
 		length_start = hdr->sa_lengths;
 	} else {
 		data_start = hdr;
 	}
 
 	for (i = 0; i != tb->lot_attr_count; i++) {
 		int attr_length, reg_length;
 		uint8_t idx_len;
 
 		/*
 		 * A non-zero registered length is used as the attribute's
 		 * size; zero means the size is taken from the next unused
 		 * slot of the header's length array.
 		 * NOTE(review): length_start is NULL for non-SA bonus types,
 		 * so this relies on those layouts containing only attributes
 		 * with a registered length -- confirm.
 		 */
 		reg_length = sa->sa_attr_table[tb->lot_attrs[i]].sa_length;
 		if (reg_length) {
 			attr_length = reg_length;
 			idx_len = 0;
 		} else {
 			attr_length = length_start[length_idx];
 			idx_len = length_idx++;
 		}
 
 		func(hdr, data_start, tb->lot_attrs[i], attr_length,
 		    idx_len, reg_length == 0 ? B_TRUE : B_FALSE, userp);
 
 		/* Each attribute starts on an 8-byte boundary. */
 		data_start = (void *)P2ROUNDUP(((uintptr_t)data_start +
 		    attr_length), 8);
 	}
 }
 
 /*
  * Attribute-iterator callback: byteswap a single attribute in place using
  * the swap routine selected by the attribute's sa_byteswap index.
  * userp is the owning sa_handle_t.
  */
 /*ARGSUSED*/
 void
 sa_byteswap_cb(void *hdr, void *attr_addr, sa_attr_type_t attr,
     uint16_t length, int length_idx, boolean_t variable_length, void *userp)
 {
 	sa_handle_t *handle = userp;
 	sa_attr_table_t *attr_tbl = handle->sa_os->os_sa->sa_attr_table;
 
 	sa_bswap_table[attr_tbl[attr].sa_byteswap](attr_addr, length);
 }
 
 /*
  * Byteswap the SA header of the given buffer (bonus or spill) and then
  * every attribute it contains, in place.  Does nothing if the header
  * magic already reads as SA_MAGIC.  The caller must hold sa->sa_lock.
  */
 void
 sa_byteswap(sa_handle_t *hdl, sa_buf_type_t buftype)
 {
 	sa_hdr_phys_t *sa_hdr_phys = SA_GET_HDR(hdl, buftype);
 	dmu_buf_impl_t *db;
 	sa_os_t *sa = hdl->sa_os->os_sa;
 	int num_lengths = 1;
 	int i;
 
 	ASSERT(MUTEX_HELD(&sa->sa_lock));
 	if (sa_hdr_phys->sa_magic == SA_MAGIC)
 		return;
 
 	db = SA_GET_DB(hdl, buftype);
 
 	/*
 	 * The spill buffer is released and thawed before being modified,
 	 * and frozen again once the swap is complete (see end of function).
 	 */
 	if (buftype == SA_SPILL) {
 		arc_release(db->db_buf, NULL);
 		arc_buf_thaw(db->db_buf);
 	}
 
 	/*
 	 * The fixed header fields must be swapped first: SA_HDR_SIZE() below
 	 * is evaluated on the already-swapped header.
 	 */
 	sa_hdr_phys->sa_magic = BSWAP_32(sa_hdr_phys->sa_magic);
 	sa_hdr_phys->sa_layout_info = BSWAP_16(sa_hdr_phys->sa_layout_info);
 
 	/*
 	 * Determine the number of variable lengths in the header.
 	 * The standard 8 byte header has one for free and a
 	 * 16 byte header would have 4 + 1;
 	 */
 	if (SA_HDR_SIZE(sa_hdr_phys) > 8)
 		num_lengths += (SA_HDR_SIZE(sa_hdr_phys) - 8) >> 1;
 	for (i = 0; i != num_lengths; i++)
 		sa_hdr_phys->sa_lengths[i] =
 		    BSWAP_16(sa_hdr_phys->sa_lengths[i]);
 
 	/* Swap each attribute; the layout is looked up from the header. */
 	sa_attr_iter(hdl->sa_os, sa_hdr_phys, DMU_OT_SA,
 	    sa_byteswap_cb, NULL, hdl);
 
 	if (buftype == SA_SPILL)
 		arc_buf_freeze(((dmu_buf_impl_t *)hdl->sa_spill)->db_buf);
 }
 
 /*
  * Attach an index table to the handle for the given buffer (bonus or
  * spill), byteswapping the buffer first if its header magic is stored in
  * the opposite byte order.  Takes and drops sa->sa_lock.  Always returns 0.
  */
 static int
 sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype)
 {
 	dmu_buf_impl_t *db = SA_GET_DB(hdl, buftype);
 	dmu_object_type_t bonustype = SA_BONUSTYPE_FROM_DB(db);
 	sa_os_t *sa = hdl->sa_os->os_sa;
 	sa_hdr_phys_t *sa_hdr_phys = SA_GET_HDR(hdl, buftype);
 	sa_idx_tab_t *idx_tab;
 
 	mutex_enter(&sa->sa_lock);
 
 	/*
 	 * Old-style znodes carry no SA magic, so the byteswap check only
 	 * applies to SA bonus types.  A zero magic is left alone.
 	 */
 	if (IS_SA_BONUSTYPE(bonustype)) {
 		if (sa_hdr_phys->sa_magic != SA_MAGIC &&
 		    sa_hdr_phys->sa_magic != 0) {
 			VERIFY(BSWAP_32(sa_hdr_phys->sa_magic) == SA_MAGIC);
 			sa_byteswap(hdl, buftype);
 		}
 	}
 
 	idx_tab = sa_find_idx_tab(hdl->sa_os, bonustype, sa_hdr_phys);
 
 	if (buftype != SA_BONUS)
 		hdl->sa_spill_tab = idx_tab;
 	else
 		hdl->sa_bonus_tab = idx_tab;
 
 	mutex_exit(&sa->sa_lock);
 
 	return (0);
 }
 
 /*
  * Eviction callback registered for SA dbufs.  Reaching it is treated as
  * fatal: it panics, reporting the dbuf address.
  */
 /*ARGSUSED*/
 void
 sa_evict(dmu_buf_t *db, void *sap)
 {
 	void *evicted = (void *)db;
 
 	panic("evicting sa dbuf %p\n", evicted);
 }
 
 /*
  * Drop one hold on an index table (arg is an sa_idx_tab_t *, may be NULL).
  * When the last hold goes away the table is unlinked from its layout and
  * freed.  Takes and drops sa->sa_lock.
  */
 static void
 sa_idx_tab_rele(objset_t *os, void *arg)
 {
 	sa_idx_tab_t *tab = arg;
 	sa_os_t *sa_state = os->os_sa;
 
 	if (tab == NULL)
 		return;
 
 	mutex_enter(&sa_state->sa_lock);
 
 	if (refcount_remove(&tab->sa_refcount, NULL) != 0) {
 		/* Still referenced elsewhere. */
 		mutex_exit(&sa_state->sa_lock);
 		return;
 	}
 
 	list_remove(&tab->sa_layout->lot_idx_tab, tab);
 
 	if (tab->sa_variable_lengths) {
 		kmem_free(tab->sa_variable_lengths,
 		    sizeof (uint16_t) * tab->sa_layout->lot_var_sizes);
 	}
 
 	refcount_destroy(&tab->sa_refcount);
 	kmem_free(tab->sa_idx_tab,
 	    sizeof (uint32_t) * sa_state->sa_num_attrs);
 	kmem_free(tab, sizeof (sa_idx_tab_t));
 
 	mutex_exit(&sa_state->sa_lock);
 }
 
 /*
  * Take one additional hold on an index table.
  * The caller must already hold the objset's sa_lock.
  */
 static void
 sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab)
 {
 	sa_os_t *sa;
 
 	sa = os->os_sa;
 	ASSERT(MUTEX_HELD(&sa->sa_lock));
 
 	(void) refcount_add(&idx_tab->sa_refcount, NULL);
 }
 
 /*
  * Tear down a handle: clear the bonus dbuf's user pointer, drop the
  * bonus/spill index-table holds and dbuf holds, then return the handle
  * to sa_cache.
  */
 void
 sa_handle_destroy(sa_handle_t *hdl)
 {
 	mutex_enter(&hdl->sa_lock);
 
 	(void) dmu_buf_update_user((dmu_buf_t *)hdl->sa_bonus, hdl,
 	    NULL, NULL, NULL);
 
 	/* Release the index tables, bonus first, then spill. */
 	if (hdl->sa_bonus_tab != NULL) {
 		sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab);
 		hdl->sa_bonus_tab = NULL;
 	}
 
 	if (hdl->sa_spill_tab != NULL) {
 		sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab);
 		hdl->sa_spill_tab = NULL;
 	}
 
 	/* Release the dbufs; a spill buffer is only present sometimes. */
 	dmu_buf_rele(hdl->sa_bonus, NULL);
 	if (hdl->sa_spill != NULL)
 		dmu_buf_rele((dmu_buf_t *)hdl->sa_spill, NULL);
 
 	mutex_exit(&hdl->sa_lock);
 
 	kmem_cache_free(sa_cache, hdl);
 }
 
 /*
  * Return an SA handle for an already-held bonus dbuf.
  *
  * os       - objset the dbuf belongs to.
  * db       - bonus dbuf; stored in the handle as sa_bonus.
  * userp    - opaque pointer stored in a newly created handle.
  * hdl_type - SA_HDL_SHARED looks for (and publishes) the handle as the
  *            dbuf's user; any other type always creates a new handle.
  * handlepp - receives the handle.
  *
  * Returns the result of sa_build_index() for a newly created handle,
  * otherwise 0.
  */
 int
 sa_handle_get_from_db(objset_t *os, dmu_buf_t *db, void *userp,
     sa_handle_type_t hdl_type, sa_handle_t **handlepp)
 {
 	int error = 0;
 	sa_handle_t *handle;
 #ifdef ZFS_DEBUG
 	/* Only needed for the sanity check, so only declared for it. */
 	dmu_object_info_t doi;
 
 	dmu_object_info_from_db(db, &doi);
 	ASSERT(doi.doi_bonus_type == DMU_OT_SA ||
 	    doi.doi_bonus_type == DMU_OT_ZNODE);
 #endif
 	/* find handle, if it exists */
 	/* if one doesn't exist then create a new one, and initialize it */
 
 	handle = (hdl_type == SA_HDL_SHARED) ? dmu_buf_get_user(db) : NULL;
 	if (handle == NULL) {
 		sa_handle_t *newhandle;
 		handle = kmem_cache_alloc(sa_cache, KM_SLEEP);
 		handle->sa_userp = userp;
 		handle->sa_bonus = db;
 		handle->sa_os = os;
 		handle->sa_spill = NULL;
 
 		error = sa_build_index(handle, SA_BONUS);
 
 		/*
 		 * For shared handles, try to install ours as the dbuf user.
 		 * A non-NULL return means a handle was already installed;
 		 * ours is discarded and the installed one is used instead.
 		 */
 		newhandle = (hdl_type == SA_HDL_SHARED) ?
 		    dmu_buf_set_user_ie(db, handle,
 		    NULL, sa_evict) : NULL;
 
 		if (newhandle != NULL) {
 			kmem_cache_free(sa_cache, handle);
 			handle = newhandle;
 		}
 	}
 	*handlepp = handle;
 
 	return (error);
 }
 
 /*
  * Hold the bonus buffer of object `objid' and return an SA handle for it.
  * Returns the dmu_bonus_hold() error if the hold fails, otherwise the
  * result of sa_handle_get_from_db().
  */
 int
 sa_handle_get(objset_t *objset, uint64_t objid, void *userp,
     sa_handle_type_t hdl_type, sa_handle_t **handlepp)
 {
 	dmu_buf_t *db;
 	int error;
 
 	error = dmu_bonus_hold(objset, objid, NULL, &db);
 	if (error != 0)
 		return (error);
 
 	error = sa_handle_get_from_db(objset, db, userp, hdl_type, handlepp);
 	return (error);
 }
 
 /*
  * Hold the bonus buffer of object `obj_num'; thin wrapper around
  * dmu_bonus_hold() whose result is returned unchanged.
  */
 int
 sa_buf_hold(objset_t *objset, uint64_t obj_num, void *tag, dmu_buf_t **db)
 {
 	int error;
 
 	error = dmu_bonus_hold(objset, obj_num, tag, db);
 	return (error);
 }
 
 /*
  * Release a buffer hold; thin wrapper around dmu_buf_rele() with the
  * same db/tag arguments.
  */
 void
 sa_buf_rele(dmu_buf_t *db, void *tag)
 {
 	dmu_buf_rele(db, tag);
 }
 
 /*
  * Look up `count' attributes described by `bulk' via sa_attr_op().
  * The caller must hold hdl->sa_lock.
  */
 int
 sa_lookup_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count)
 {
 	int error;
 
 	ASSERT(hdl);
 	ASSERT(MUTEX_HELD(&hdl->sa_lock));
 
 	error = sa_attr_op(hdl, bulk, count, SA_LOOKUP, NULL);
 	return (error);
 }
 
 /*
  * Look up a single attribute into the caller's buffer `buf' of size
  * `buflen'.  Takes and drops hdl->sa_lock around the lookup.
  */
 int
 sa_lookup(sa_handle_t *hdl, sa_attr_type_t attr, void *buf, uint32_t buflen)
 {
 	sa_bulk_attr_t bulk;
 	int error;
 
 	/* Describe the one attribute wanted; no data-locator callback. */
 	bulk.sa_data_func = NULL;
 	bulk.sa_attr = attr;
 	bulk.sa_length = buflen;
 	bulk.sa_data = buf;
 
 	ASSERT(hdl);
 
 	mutex_enter(&hdl->sa_lock);
 	error = sa_lookup_impl(hdl, &bulk, 1);
 	mutex_exit(&hdl->sa_lock);
 
 	return (error);
 }
 
 #ifdef _KERNEL
 /*
  * Look up a single attribute and copy it out through `uio'.  At most
  * MIN(attribute size, uio->uio_resid) bytes are moved.  Takes and drops
  * hdl->sa_lock.
  */
 int
 sa_lookup_uio(sa_handle_t *hdl, sa_attr_type_t attr, uio_t *uio)
 {
 	sa_bulk_attr_t bulk;
 	int error;
 
 	/* No destination buffer: only the location and size are wanted. */
 	bulk.sa_data_func = NULL;
 	bulk.sa_data = NULL;
 	bulk.sa_attr = attr;
 
 	ASSERT(hdl);
 
 	mutex_enter(&hdl->sa_lock);
 	error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL);
 	if (error == 0) {
 		error = uiomove((void *)bulk.sa_addr, MIN(bulk.sa_size,
 		    uio->uio_resid), UIO_READ, uio);
 	}
 	mutex_exit(&hdl->sa_lock);
 
 	return (error);
 }
 #endif
 
 /*
  * Find or create the index table matching the header at `data'.
  *
  * The layout is looked up by the number encoded in the header.  An
  * existing table on that layout is reused when its recorded variable
  * lengths equal the header's; otherwise a new table is built with
  * sa_attr_iter()/sa_build_idx_tab() and appended to the layout's list.
  *
  * Returns the table (as void *) with one hold taken for the caller.
  * sa_idx_tab_hold() asserts that the objset's sa_lock is held, so the
  * caller must hold it.
  */
 void *
 sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype, void *data)
 {
 	sa_idx_tab_t *idx_tab;
 	sa_hdr_phys_t *hdr = (sa_hdr_phys_t *)data;
 	sa_os_t *sa = os->os_sa;
 	sa_lot_t *tb, search;
 	avl_index_t loc;
 
 	/*
 	 * Determine layout number.  If SA node and header == 0 then
 	 * force the index table to the dummy "1" empty layout.
 	 *
 	 * The layout number would only be zero for a newly created file
 	 * that has not added any attributes yet, or with crypto enabled which
 	 * doesn't write any attributes to the bonus buffer.
 	 */
 
 	search.lot_num = SA_LAYOUT_NUM(hdr, bonustype);
 
 	tb = avl_find(&sa->sa_layout_num_tree, &search, &loc);
 
 	/* Verify header size is consistent with layout information */
 	ASSERT(tb);
 	ASSERT(IS_SA_BONUSTYPE(bonustype) &&
 	    SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb) || !IS_SA_BONUSTYPE(bonustype) ||
 	    (IS_SA_BONUSTYPE(bonustype) && hdr->sa_layout_info == 0));
 
 	/*
 	 * See if any of the already existing TOC entries can be reused?
 	 */
 
 	for (idx_tab = list_head(&tb->lot_idx_tab); idx_tab;
 	    idx_tab = list_next(&tb->lot_idx_tab, idx_tab)) {
 		boolean_t valid_idx = B_TRUE;
 		int i;
 
 		/*
 		 * Lengths are only compared when the layout has variable
 		 * sized attributes and the table recorded them; in every
 		 * other case the first table on the list is taken as a match.
 		 */
 		if (tb->lot_var_sizes != 0 &&
 		    idx_tab->sa_variable_lengths != NULL) {
 			for (i = 0; i != tb->lot_var_sizes; i++) {
 				if (hdr->sa_lengths[i] !=
 				    idx_tab->sa_variable_lengths[i]) {
 					valid_idx = B_FALSE;
 					break;
 				}
 			}
 		}
 		if (valid_idx) {
 			sa_idx_tab_hold(os, idx_tab);
 			return (idx_tab);
 		}
 	}
 
 	/* No such luck, create a new entry */
 	idx_tab = kmem_zalloc(sizeof (sa_idx_tab_t), KM_SLEEP);
 	idx_tab->sa_idx_tab =
 	    kmem_zalloc(sizeof (uint32_t) * sa->sa_num_attrs, KM_SLEEP);
 	idx_tab->sa_layout = tb;
 	refcount_create(&idx_tab->sa_refcount);
 	/* Left NULL (from kmem_zalloc) when the layout has no var sizes. */
 	if (tb->lot_var_sizes)
 		idx_tab->sa_variable_lengths = kmem_alloc(sizeof (uint16_t) *
 		    tb->lot_var_sizes, KM_SLEEP);
 
 	sa_attr_iter(os, hdr, bonustype, sa_build_idx_tab,
 	    tb, idx_tab);
 	sa_idx_tab_hold(os, idx_tab);   /* one hold for consumer */
 	sa_idx_tab_hold(os, idx_tab);	/* one for layout */
 	list_insert_tail(&tb->lot_idx_tab, idx_tab);
 	return (idx_tab);
 }
 
 /*
  * Default data locator: reports the whole attribute as one contiguous
  * chunk, i.e. *dataptr = userdata and *len = total_len.  Only expected
  * to be called with `start' set.
  */
 void
 sa_default_locator(void **dataptr, uint32_t *len, uint32_t total_len,
     boolean_t start, void *userdata)
 {
 	ASSERT(start);
 
 	*len = total_len;
 	*dataptr = userdata;
 }
 
 /*
  * Write every not-yet-registered attribute of the in-core attribute
  * table to the on-disk registry ZAP, creating (and linking under the
  * master object as SA_REGISTRY) the registry object first if it does
  * not exist.  Does nothing when no registration is pending or when no
  * master object has been set.  Takes and drops sa->sa_lock.
  */
 static void
 sa_attr_register_sync(sa_handle_t *hdl, dmu_tx_t *tx)
 {
 	uint64_t attr_value = 0;
 	sa_os_t *sa = hdl->sa_os->os_sa;
 	sa_attr_table_t *tb = sa->sa_attr_table;
 	int i;
 
 	mutex_enter(&sa->sa_lock);
 
 	if (!sa->sa_need_attr_registration || sa->sa_master_obj == 0) {
 		mutex_exit(&sa->sa_lock);
 		return;
 	}
 
 	if (sa->sa_reg_attr_obj == 0) {
 		sa->sa_reg_attr_obj = zap_create_link(hdl->sa_os,
 		    DMU_OT_SA_ATTR_REGISTRATION,
 		    sa->sa_master_obj, SA_REGISTRY, tx);
 	}
 	for (i = 0; i != sa->sa_num_attrs; i++) {
 		if (sa->sa_attr_table[i].sa_registered)
 			continue;
 		/*
 		 * The registry value packs attribute number, length and
 		 * byteswap index into one 64-bit word keyed by the name.
 		 * attr_value is reused across iterations.
 		 */
 		ATTR_ENCODE(attr_value, tb[i].sa_attr, tb[i].sa_length,
 		    tb[i].sa_byteswap);
 		VERIFY(0 == zap_update(hdl->sa_os, sa->sa_reg_attr_obj,
 		    tb[i].sa_name, 8, 1, &attr_value, tx));
 		tb[i].sa_registered = B_TRUE;
 	}
 	sa->sa_need_attr_registration = B_FALSE;
 	mutex_exit(&sa->sa_lock);
 }
 
 /*
  * Replace all attributes with the attributes specified in the template.
  * If the dnode had a spill buffer then those attributes will also be
  * replaced, possibly with just an empty spill block.
  *
  * This interface is intended to be used only for bulk adding of
  * attributes for a new file.  It is also used by the ZPL when
  * converting an old-format znode to native SA support.
  *
  * Any pending attribute registration is synced out before the layouts
  * are rebuilt.
  */
 int
 sa_replace_all_by_template_locked(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc,
     int attr_count, dmu_tx_t *tx)
 {
 	sa_os_t *sa_state = hdl->sa_os->os_sa;
 	int error;
 
 	if (sa_state->sa_need_attr_registration) {
 		sa_attr_register_sync(hdl, tx);
 	}
 
 	error = sa_build_layouts(hdl, attr_desc, attr_count, tx);
 	return (error);
 }
 
 /*
  * Locking wrapper: runs sa_replace_all_by_template_locked() with
  * hdl->sa_lock held and returns its result.
  */
 int
 sa_replace_all_by_template(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc,
     int attr_count, dmu_tx_t *tx)
 {
 	int rc;
 
 	mutex_enter(&hdl->sa_lock);
 	rc = sa_replace_all_by_template_locked(hdl, attr_desc, attr_count,
 	    tx);
 	mutex_exit(&hdl->sa_lock);
 
 	return (rc);
 }
 
 /*
  * Add/remove a single attribute or replace a variable-sized attribute value
  * with a value of a different size, and then rewrite the entire set
  * of attributes.
  * Same-length attribute value replacement (including fixed-length attributes)
  * is handled more efficiently by the upper layers.
  *
  * The current bonus (and spill, if any) contents are copied aside, a bulk
  * descriptor is built that points every surviving attribute at that copy
  * and the new/replaced attribute at the caller's data, and
  * sa_build_layouts() is then run over the descriptor.
  *
  * The caller must hold hdl->sa_lock.  Returns the sa_get_spill() error
  * (other than ENOENT) or the result of sa_build_layouts().
  */
 static int
 sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
     sa_data_op_t action, sa_data_locator_t *locator, void *datastart,
     uint16_t buflen, dmu_tx_t *tx)
 {
 	sa_os_t *sa = hdl->sa_os->os_sa;
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
 	dnode_t *dn;
 	sa_bulk_attr_t *attr_desc;
 	void *old_data[2];
 	int bonus_attr_count = 0;
-	int bonus_data_size, spill_data_size;
+	int bonus_data_size = 0;
+	int spill_data_size = 0;
 	int spill_attr_count = 0;
 	int error;
 	uint16_t length;
 	int i, j, k, length_idx;
 	sa_hdr_phys_t *hdr;
 	sa_idx_tab_t *idx_tab;
 	int attr_count;
 	int count;
 
 	ASSERT(MUTEX_HELD(&hdl->sa_lock));
 
 	/* First make a copy of the old data */
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	if (dn->dn_bonuslen != 0) {
 		bonus_data_size = hdl->sa_bonus->db_size;
 		old_data[0] = kmem_alloc(bonus_data_size, KM_SLEEP);
 		bcopy(hdl->sa_bonus->db_data, old_data[0],
 		    hdl->sa_bonus->db_size);
 		bonus_attr_count = hdl->sa_bonus_tab->sa_layout->lot_attr_count;
 	} else {
 		old_data[0] = NULL;
 	}
 	DB_DNODE_EXIT(db);
 
 	/* Bring spill buffer online if it isn't currently */
 
 	if ((error = sa_get_spill(hdl)) == 0) {
 		spill_data_size = hdl->sa_spill->db_size;
 		old_data[1] = kmem_alloc(spill_data_size, KM_SLEEP);
 		bcopy(hdl->sa_spill->db_data, old_data[1],
 		    hdl->sa_spill->db_size);
 		spill_attr_count =
 		    hdl->sa_spill_tab->sa_layout->lot_attr_count;
 	} else if (error && error != ENOENT) {
 		/* Real failure: undo the bonus copy and give up. */
 		if (old_data[0])
 			kmem_free(old_data[0], bonus_data_size);
 		return (error);
 	} else {
 		/* ENOENT: there is simply no spill buffer. */
 		old_data[1] = NULL;
 	}
 
 	/* build descriptor of all attributes */
 
 	attr_count = bonus_attr_count + spill_attr_count;
 	if (action == SA_ADD)
 		attr_count++;
 	else if (action == SA_REMOVE)
 		attr_count--;
 
 	attr_desc = kmem_zalloc(sizeof (sa_bulk_attr_t) * attr_count, KM_SLEEP);
 
 	/*
 	 * loop through bonus and spill buffer if it exists, and
 	 * build up new attr_descriptor to reset the attributes
 	 *
 	 * k selects the buffer (0 = bonus, 1 = spill) and indexes old_data[];
 	 * j is the next free slot in attr_desc.
 	 */
 	k = j = 0;
 	count = bonus_attr_count;
 	hdr = SA_GET_HDR(hdl, SA_BONUS);
 	idx_tab = SA_IDX_TAB_GET(hdl, SA_BONUS);
 	for (; k != 2; k++) {
 		/* iterate over each attribute in layout */
 		for (i = 0, length_idx = 0; i != count; i++) {
 			sa_attr_type_t attr;
 
 			attr = idx_tab->sa_layout->lot_attrs[i];
 			if (attr == newattr) {
 				/* duplicate attributes are not allowed */
 				ASSERT(action == SA_REPLACE ||
 				    action == SA_REMOVE);
 				/* must be variable-sized to be replaced here */
 				if (action == SA_REPLACE) {
 					ASSERT(SA_REGISTERED_LEN(sa, attr) == 0);
 					SA_ADD_BULK_ATTR(attr_desc, j, attr,
 					    locator, datastart, buflen);
 				}
 			} else {
 				length = SA_REGISTERED_LEN(sa, attr);
 				if (length == 0) {
 					length = hdr->sa_lengths[length_idx];
 				}
 
 				/*
 				 * Unchanged attribute: point at its bytes in
 				 * the saved copy, using the offset recorded
 				 * in the index table.
 				 */
 				SA_ADD_BULK_ATTR(attr_desc, j, attr,
 				    NULL, (void *)
 				    (TOC_OFF(idx_tab->sa_idx_tab[attr]) +
 				    (uintptr_t)old_data[k]), length);
 			}
 			/* Variable-length attrs each consume one length slot. */
 			if (SA_REGISTERED_LEN(sa, attr) == 0)
 				length_idx++;
 		}
 		if (k == 0 && hdl->sa_spill) {
 			hdr = SA_GET_HDR(hdl, SA_SPILL);
 			idx_tab = SA_IDX_TAB_GET(hdl, SA_SPILL);
 			count = spill_attr_count;
 		} else {
 			break;
 		}
 	}
 	if (action == SA_ADD) {
 		/*
 		 * NOTE(review): `length' is computed here but the descriptor
 		 * below is filled in with buflen, so the computed value is
 		 * unused -- confirm whether buflen is intended for
 		 * fixed-length attributes as well.
 		 */
 		length = SA_REGISTERED_LEN(sa, newattr);
 		if (length == 0) {
 			length = buflen;
 		}
 		SA_ADD_BULK_ATTR(attr_desc, j, newattr, locator,
 		    datastart, buflen);
 	}
 	ASSERT3U(j, ==, attr_count);
 
 	error = sa_build_layouts(hdl, attr_desc, attr_count, tx);
 
 	if (old_data[0])
 		kmem_free(old_data[0], bonus_data_size);
 	if (old_data[1])
 		kmem_free(old_data[1], spill_data_size);
 	kmem_free(attr_desc, sizeof (sa_bulk_attr_t) * attr_count);
 
 	return (error);
 }
 
 /*
  * Common update path: sync out any pending attribute registration,
  * apply the bulk update, and on success invoke the registered update
  * callback when the bonus buffer is not an SA bonus type.
  * The caller must hold hdl->sa_lock.
  */
 static int
 sa_bulk_update_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count,
     dmu_tx_t *tx)
 {
 	sa_os_t *sa = hdl->sa_os->os_sa;
 	dmu_object_type_t bonustype =
 	    SA_BONUSTYPE_FROM_DB(SA_GET_DB(hdl, SA_BONUS));
 	int error;
 
 	ASSERT(hdl);
 	ASSERT(MUTEX_HELD(&hdl->sa_lock));
 
 	/* sync out registration table if necessary */
 	if (sa->sa_need_attr_registration) {
 		sa_attr_register_sync(hdl, tx);
 	}
 
 	error = sa_attr_op(hdl, bulk, count, SA_UPDATE, tx);
 	if (error != 0)
 		return (error);
 
 	if (!IS_SA_BONUSTYPE(bonustype) && sa->sa_update_cb)
 		sa->sa_update_cb(hdl, tx);
 
 	return (0);
 }
 
 /*
  * Update an existing attribute or add a new one, taking the value from
  * `buf' (`buflen' bytes).  Takes and drops hdl->sa_lock.
  */
 int
 sa_update(sa_handle_t *hdl, sa_attr_type_t type,
     void *buf, uint32_t buflen, dmu_tx_t *tx)
 {
 	sa_bulk_attr_t bulk;
 	int rc;
 
 	bulk.sa_data = buf;
 	bulk.sa_length = buflen;
 	bulk.sa_attr = type;
 	bulk.sa_data_func = NULL;
 
 	mutex_enter(&hdl->sa_lock);
 	rc = sa_bulk_update_impl(hdl, &bulk, 1, tx);
 	mutex_exit(&hdl->sa_lock);
 
 	return (rc);
 }
 
 /*
  * Update or add a single attribute whose data is supplied through the
  * `locator' callback; `userdata' is stored as the descriptor's data
  * pointer.  Takes and drops hdl->sa_lock.
  */
 int
 sa_update_from_cb(sa_handle_t *hdl, sa_attr_type_t attr,
     uint32_t buflen, sa_data_locator_t *locator, void *userdata, dmu_tx_t *tx)
 {
 	sa_bulk_attr_t bulk;
 	int rc;
 
 	bulk.sa_data_func = locator;
 	bulk.sa_data = userdata;
 	bulk.sa_length = buflen;
 	bulk.sa_attr = attr;
 
 	mutex_enter(&hdl->sa_lock);
 	rc = sa_bulk_update_impl(hdl, &bulk, 1, tx);
 	mutex_exit(&hdl->sa_lock);
 
 	return (rc);
 }
 
 /*
  * Return the size of an attribute through *size.
  * On lookup failure the error is returned and *size is left untouched.
  * Takes and drops hdl->sa_lock.
  */
 
 int
 sa_size(sa_handle_t *hdl, sa_attr_type_t attr, int *size)
 {
 	sa_bulk_attr_t bulk;
 	int error;
 
 	bulk.sa_data_func = NULL;
 	bulk.sa_attr = attr;
 	bulk.sa_data = NULL;
 
 	ASSERT(hdl);
 
 	mutex_enter(&hdl->sa_lock);
 	error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL);
 	if (error == 0)
 		*size = bulk.sa_size;
 	mutex_exit(&hdl->sa_lock);
 
 	return (error);
 }
 
 /*
  * Look up `count' attributes; the caller must already hold hdl->sa_lock.
  */
 int
 sa_bulk_lookup_locked(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count)
 {
 	int rc;
 
 	ASSERT(hdl);
 	ASSERT(MUTEX_HELD(&hdl->sa_lock));
 
 	rc = sa_lookup_impl(hdl, attrs, count);
 	return (rc);
 }
 
 /*
  * Look up `count' attributes, taking and dropping hdl->sa_lock around
  * the operation.
  */
 int
 sa_bulk_lookup(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count)
 {
 	int rc;
 
 	ASSERT(hdl);
 
 	mutex_enter(&hdl->sa_lock);
 	rc = sa_bulk_lookup_locked(hdl, attrs, count);
 	mutex_exit(&hdl->sa_lock);
 
 	return (rc);
 }
 
 /*
  * Update `count' attributes in transaction `tx', taking and dropping
  * hdl->sa_lock around the operation.
  */
 int
 sa_bulk_update(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count, dmu_tx_t *tx)
 {
 	int rc;
 
 	ASSERT(hdl);
 
 	mutex_enter(&hdl->sa_lock);
 	rc = sa_bulk_update_impl(hdl, attrs, count, tx);
 	mutex_exit(&hdl->sa_lock);
 
 	return (rc);
 }
 
 /*
  * Remove attribute `attr' by running sa_modify_attrs() with SA_REMOVE
  * under hdl->sa_lock.
  */
 int
 sa_remove(sa_handle_t *hdl, sa_attr_type_t attr, dmu_tx_t *tx)
 {
 	int rc;
 
 	mutex_enter(&hdl->sa_lock);
 	rc = sa_modify_attrs(hdl, attr, SA_REMOVE, NULL, NULL, 0, tx);
 	mutex_exit(&hdl->sa_lock);
 
 	return (rc);
 }
 
 /*
  * Fill in *doi with the object info of the handle's bonus dbuf.
  */
 void
 sa_object_info(sa_handle_t *hdl, dmu_object_info_t *doi)
 {
 	dmu_buf_t *bonus_db = (dmu_buf_t *)hdl->sa_bonus;
 
 	dmu_object_info_from_db(bonus_db, doi);
 }
 
 /*
  * Report the block size and block count of the object behind the
  * handle's bonus dbuf.
  */
 void
 sa_object_size(sa_handle_t *hdl, uint32_t *blksize, u_longlong_t *nblocks)
 {
 	dmu_buf_t *bonus_db = (dmu_buf_t *)hdl->sa_bonus;
 
 	dmu_object_size_from_db(bonus_db, blksize, nblocks);
 }
 
 /*
  * Swap the dbuf user of newhdl's bonus buffer from oldhdl to newhdl
  * (with sa_evict as the eviction callback), then detach oldhdl from
  * its bonus buffer.
  */
 void
 sa_update_user(sa_handle_t *newhdl, sa_handle_t *oldhdl)
 {
 	dmu_buf_t *bonus_db = (dmu_buf_t *)newhdl->sa_bonus;
 
 	(void) dmu_buf_update_user(bonus_db, oldhdl, newhdl, NULL, sa_evict);
 	oldhdl->sa_bonus = NULL;
 }
 
 /*
  * Store the caller's opaque pointer in the handle (read back with
  * sa_get_userdata()).  No locking is done here.
  */
 void
 sa_set_userp(sa_handle_t *hdl, void *ptr)
 {
 	hdl->sa_userp = ptr;
 }
 
 /*
  * Return the handle's bonus dbuf.
  */
 dmu_buf_t *
 sa_get_db(sa_handle_t *hdl)
 {
 	dmu_buf_t *bonus_db = (dmu_buf_t *)hdl->sa_bonus;
 
 	return (bonus_db);
 }
 
 /*
  * Return the opaque user pointer stored in the handle.
  */
 void *
 sa_get_userdata(sa_handle_t *hdl)
 {
 	return (hdl->sa_userp);
 }
 
 /*
  * Install `func' as the objset's SA update callback.
  * The caller must hold os->os_sa->sa_lock.
  */
 void
 sa_register_update_callback_locked(objset_t *os, sa_update_cb_t *func)
 {
 	ASSERT(MUTEX_HELD(&os->os_sa->sa_lock));
 	os->os_sa->sa_update_cb = func;
 }
 
 /*
  * Install `func' as the objset's SA update callback, taking and
  * dropping the objset's sa_lock around the store.
  */
 void
 sa_register_update_callback(objset_t *os, sa_update_cb_t *func)
 {
 	sa_os_t *sa_state = os->os_sa;
 
 	mutex_enter(&sa_state->sa_lock);
 	sa_register_update_callback_locked(os, func);
 	mutex_exit(&sa_state->sa_lock);
 }
 
 /*
  * Return the object number recorded in the handle's bonus dbuf.
  */
 uint64_t
 sa_handle_object(sa_handle_t *hdl)
 {
 	uint64_t object = hdl->sa_bonus->db_object;
 
 	return (object);
 }
 
 /*
  * Returns true when the objset has NO SA state attached
  * (os->os_sa == NULL).
  * NOTE(review): given the function name this looks inverted -- confirm
  * against the callers before relying on it or changing it.
  */
 boolean_t
 sa_enabled(objset_t *os)
 {
 	return (os->os_sa == NULL);
 }
 
 /*
  * Record `sa_object' as the objset's SA master object.
  * Returns 1 without changing anything if a master object is already
  * set, 0 once the new value has been stored.
  */
 int
 sa_set_sa_object(objset_t *os, uint64_t sa_object)
 {
 	sa_os_t *sa_state = os->os_sa;
 
 	if (sa_state->sa_master_obj != 0)
 		return (1);
 
 	sa_state->sa_master_obj = sa_object;
 	return (0);
 }
 
 int
 sa_hdrsize(void *arg)
 {
 	sa_hdr_phys_t *hdr = arg;
 
 	return (SA_HDR_SIZE(hdr));
 }
 
 /*
  * Acquire the handle's sa_lock on behalf of the caller; paired with
  * sa_handle_unlock().
  */
 void
 sa_handle_lock(sa_handle_t *hdl)
 {
 	ASSERT(hdl);
 	mutex_enter(&hdl->sa_lock);
 }
 
 /*
  * Release the handle's sa_lock taken by sa_handle_lock().
  */
 void
 sa_handle_unlock(sa_handle_t *hdl)
 {
 	ASSERT(hdl);
 	mutex_exit(&hdl->sa_lock);
 }
Index: user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
===================================================================
--- user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c	(revision 247191)
+++ user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c	(revision 247192)
@@ -1,6645 +1,6646 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 /*
  * This file contains all the routines used when modifying on-disk SPA state.
  * This includes opening, importing, destroying, exporting a pool, and syncing a
  * pool.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/fm/fs/zfs.h>
 #include <sys/spa_impl.h>
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
 #include <sys/dmu.h>
 #include <sys/dmu_tx.h>
 #include <sys/zap.h>
 #include <sys/zil.h>
 #include <sys/ddt.h>
 #include <sys/vdev_impl.h>
 #include <sys/metaslab.h>
 #include <sys/metaslab_impl.h>
 #include <sys/uberblock_impl.h>
 #include <sys/txg.h>
 #include <sys/avl.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dmu_objset.h>
 #include <sys/unique.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_synctask.h>
 #include <sys/fs/zfs.h>
 #include <sys/arc.h>
 #include <sys/callb.h>
 #include <sys/spa_boot.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/dsl_scan.h>
 #include <sys/zfeature.h>
 #include <sys/zvol.h>
 #include <sys/trim_map.h>
 
 #ifdef	_KERNEL
 #include <sys/callb.h>
 #include <sys/cpupart.h>
 #include <sys/zone.h>
 #endif	/* _KERNEL */
 
 #include "zfs_prop.h"
 #include "zfs_comutil.h"
 
 /* Check hostid on import? */
 static int check_hostid = 1;
 
 SYSCTL_DECL(_vfs_zfs);
 TUNABLE_INT("vfs.zfs.check_hostid", &check_hostid);
 SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RW, &check_hostid, 0,
     "Check hostid on import?");
 
 typedef enum zti_modes {
 	zti_mode_fixed,			/* value is # of threads (min 1) */
 	zti_mode_online_percent,	/* value is % of online CPUs */
 	zti_mode_batch,			/* cpu-intensive; value is ignored */
 	zti_mode_null,			/* don't create a taskq */
 	zti_nmodes
 } zti_modes_t;
 
 #define	ZTI_FIX(n)	{ zti_mode_fixed, (n) }
 #define	ZTI_PCT(n)	{ zti_mode_online_percent, (n) }
 #define	ZTI_BATCH	{ zti_mode_batch, 0 }
 #define	ZTI_NULL	{ zti_mode_null, 0 }
 
 #define	ZTI_ONE		ZTI_FIX(1)
 
 typedef struct zio_taskq_info {
 	enum zti_modes zti_mode;
 	uint_t zti_value;
 } zio_taskq_info_t;
 
 static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
 	"issue", "issue_high", "intr", "intr_high"
 };
 
 /*
  * Define the taskq threads for the following I/O types:
  * 	NULL, READ, WRITE, FREE, CLAIM, and IOCTL
  *
  * One row per I/O type (presumably in the order listed above -- confirm
  * against the zio type enum); one column per taskq type, in the same
  * order as the names in zio_taskq_types[].
  */
 const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
 	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
 	{ ZTI_FIX(8),	ZTI_NULL,	ZTI_BATCH,	ZTI_NULL },
 	{ ZTI_BATCH,	ZTI_FIX(5),	ZTI_FIX(8),	ZTI_FIX(5) },
 	{ ZTI_FIX(100),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
 };
 
 static dsl_syncfunc_t spa_sync_version;
 static dsl_syncfunc_t spa_sync_props;
 static dsl_checkfunc_t spa_change_guid_check;
 static dsl_syncfunc_t spa_change_guid_sync;
 static boolean_t spa_has_active_shared_spare(spa_t *spa);
 static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
     spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
     char **ereport);
 static void spa_vdev_resilver_done(spa_t *spa);
 
 uint_t		zio_taskq_batch_pct = 100;	/* 1 thread per cpu in pset */
 #ifdef PSRSET_BIND
 id_t		zio_taskq_psrset_bind = PS_NONE;
 #endif
 #ifdef SYSDC
 boolean_t	zio_taskq_sysdc = B_TRUE;	/* use SDC scheduling class */
 #endif
 uint_t		zio_taskq_basedc = 80;		/* base duty cycle */
 
 boolean_t	spa_create_process = B_TRUE;	/* no process ==> no sysdc */
 extern int	zfs_sync_pass_deferred_free;
 
 /*
  * This (illegal) pool name is used when temporarily importing a spa_t in order
  * to get the vdev stats associated with the imported devices.
  */
 #define	TRYIMPORT_NAME	"$import"
 
 /*
  * ==========================================================================
  * SPA properties routines
  * ==========================================================================
  */
 
 /*
  * Add one property to `nvl' as a nested nvlist holding its source
  * (ZPROP_SOURCE) and value (ZPROP_VALUE), keyed by the property name.
  * The value is the string `strval' when it is non-NULL, otherwise the
  * integer `intval'.
  */
 static void
 spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
     uint64_t intval, zprop_source_t src)
 {
 	nvlist_t *propval;
 	const char *propname = zpool_prop_to_name(prop);
 
 	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);
 
 	if (strval == NULL) {
 		VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);
 	} else {
 		VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
 	}
 
 	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
 
 	/* The temporary nested list is no longer needed once added. */
 	nvlist_free(propval);
 }
 
 /*
  * Get property values from the spa configuration.
  *
  * Adds to *nvp the properties that are derived from in-core state
  * rather than stored in the MOS: name, size, allocated, free, expandsz,
  * readonly, capacity, dedupratio, health and version (only when the
  * root vdev exists), freeing (only when the DSL pool exists), guid,
  * and, when set, comment, altroot and cachefile.
  * The caller must hold spa->spa_props_lock.
  */
 static void
 spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	dsl_pool_t *pool = spa->spa_dsl_pool;
 	uint64_t size;
 	uint64_t alloc;
 	uint64_t space;
 	uint64_t cap, version;
 	zprop_source_t src = ZPROP_SRC_NONE;
 	spa_config_dirent_t *dp;
 
 	ASSERT(MUTEX_HELD(&spa->spa_props_lock));
 
 	if (rvd != NULL) {
 		alloc = metaslab_class_get_alloc(spa_normal_class(spa));
 		size = metaslab_class_get_space(spa_normal_class(spa));
 		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
 		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
 		spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
 		spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
 		    size - alloc, src);
 
 		/*
 		 * expandsz: sum over the top-level vdevs of the difference
 		 * between their maximum and current allocatable size.
 		 */
 		space = 0;
 		for (int c = 0; c < rvd->vdev_children; c++) {
 			vdev_t *tvd = rvd->vdev_child[c];
 			space += tvd->vdev_max_asize - tvd->vdev_asize;
 		}
 		spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, space,
 		    src);
 
 		spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
 		    (spa_mode(spa) == FREAD), src);
 
 		/* Capacity is a whole percentage; guard the empty pool. */
 		cap = (size == 0) ? 0 : (alloc * 100 / size);
 		spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
 
 		spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
 		    ddt_get_pool_dedup_ratio(spa), src);
 
 		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
 		    rvd->vdev_state, src);
 
 		/*
 		 * Note that src is changed here and the new value is also
 		 * what the freeing and guid properties below are added with.
 		 */
 		version = spa_version(spa);
 		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
 			src = ZPROP_SRC_DEFAULT;
 		else
 			src = ZPROP_SRC_LOCAL;
 		spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
 	}
 
 	if (pool != NULL) {
 		dsl_dir_t *freedir = pool->dp_free_dir;
 
 		/*
 		 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS,
 		 * when opening pools before this version freedir will be NULL.
 		 */
 		if (freedir != NULL) {
 			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
 			    freedir->dd_phys->dd_used_bytes, src);
 		} else {
 			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
 			    NULL, 0, src);
 		}
 	}
 
 	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
 
 	if (spa->spa_comment != NULL) {
 		spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
 		    0, ZPROP_SRC_LOCAL);
 	}
 
 	if (spa->spa_root != NULL)
 		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
 		    0, ZPROP_SRC_LOCAL);
 
 	/*
 	 * cachefile is reported from the first config-list entry only:
 	 * "none" for a NULL path, the path itself when it differs from
 	 * spa_config_path, and nothing when it equals spa_config_path.
 	 */
 	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
 		if (dp->scd_path == NULL) {
 			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
 			    "none", 0, ZPROP_SRC_LOCAL);
 		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
 			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
 			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
 		}
 	}
 }
 
 /*
  * Get zpool property values.
  *
  * Allocates *nvp and fills it first with the properties derived from
  * the spa configuration, then with those stored in the MOS pool
  * property object (if there is one).  Takes and drops
  * spa->spa_props_lock.
  *
  * Returns 0 on success.  If the final error is anything other than
  * ENOENT, the nvlist is freed, *nvp is set to NULL and the error is
  * returned.
  */
 int
 spa_prop_get(spa_t *spa, nvlist_t **nvp)
 {
 	objset_t *mos = spa->spa_meta_objset;
 	zap_cursor_t zc;
 	zap_attribute_t za;
 	int err;
 
 	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 
 	mutex_enter(&spa->spa_props_lock);
 
 	/*
 	 * Get properties from the spa config.
 	 */
 	spa_prop_get_config(spa, nvp);
 
 	/* If no pool property object, no more prop to get. */
 	if (mos == NULL || spa->spa_pool_props_object == 0) {
 		mutex_exit(&spa->spa_props_lock);
 		return (0);
 	}
 
 	/*
 	 * Get properties from the MOS pool property object.
 	 *
 	 * NOTE(review): the error `break's inside the switch below leave
 	 * only the switch, not this loop; err is then overwritten by the
 	 * next zap_cursor_retrieve().  A per-property failure therefore
 	 * appears to skip that property rather than abort the walk --
 	 * confirm that this is intended.
 	 */
 	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
 	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
 	    zap_cursor_advance(&zc)) {
 		uint64_t intval = 0;
 		char *strval = NULL;
 		zprop_source_t src = ZPROP_SRC_DEFAULT;
 		zpool_prop_t prop;
 
 		/* Entries whose name is not a known pool property are skipped. */
 		if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
 			continue;
 
 		switch (za.za_integer_length) {
 		case 8:
 			/* integer property */
 			if (za.za_first_integer !=
 			    zpool_prop_default_numeric(prop))
 				src = ZPROP_SRC_LOCAL;
 
 			/*
 			 * bootfs is stored as a dataset object number but
 			 * reported as the dataset's name.
 			 */
 			if (prop == ZPOOL_PROP_BOOTFS) {
 				dsl_pool_t *dp;
 				dsl_dataset_t *ds = NULL;
 
 				dp = spa_get_dsl(spa);
 				rw_enter(&dp->dp_config_rwlock, RW_READER);
 				if (err = dsl_dataset_hold_obj(dp,
 				    za.za_first_integer, FTAG, &ds)) {
 					rw_exit(&dp->dp_config_rwlock);
 					break;
 				}
 
 				strval = kmem_alloc(
 				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
 				    KM_SLEEP);
 				dsl_dataset_name(ds, strval);
 				dsl_dataset_rele(ds, FTAG);
 				rw_exit(&dp->dp_config_rwlock);
 			} else {
 				strval = NULL;
 				intval = za.za_first_integer;
 			}
 
 			spa_prop_add_list(*nvp, prop, strval, intval, src);
 
 			/* Free size must match the allocation size above. */
 			if (strval != NULL)
 				kmem_free(strval,
 				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);
 
 			break;
 
 		case 1:
 			/* string property */
 			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
 			err = zap_lookup(mos, spa->spa_pool_props_object,
 			    za.za_name, 1, za.za_num_integers, strval);
 			if (err) {
 				kmem_free(strval, za.za_num_integers);
 				break;
 			}
 			spa_prop_add_list(*nvp, prop, strval, 0, src);
 			kmem_free(strval, za.za_num_integers);
 			break;
 
 		default:
 			break;
 		}
 	}
 	zap_cursor_fini(&zc);
 	mutex_exit(&spa->spa_props_lock);
 	/* NOTE(review): no goto in this function targets this label. */
 out:
 	if (err && err != ENOENT) {
 		nvlist_free(*nvp);
 		*nvp = NULL;
 		return (err);
 	}
 
 	return (0);
 }
 
 /*
  * Validate the given pool properties nvlist and modify the list
  * for the property values to be set.
  *
  * Each pair in 'props' is checked according to its property; the walk
  * stops at the first failure and that errno is returned.  If validation
  * succeeds and a "bootfs" entry was seen, the string entry is removed from
  * 'props' and replaced by a uint64 entry holding the object number that
  * was resolved for it.
  *
  * Returns 0 when every property is acceptable, otherwise an errno value:
  * EINVAL, ENOTSUP, E2BIG, an error passed through from the nvpair/dmu/dsl
  * lookups, or EIO for the suspended-pool failmode special case below.
  */
 static int
 spa_prop_validate(spa_t *spa, nvlist_t *props)
 {
 	nvpair_t *elem;
 	int error = 0, reset_bootfs = 0;
-	uint64_t objnum;
+	uint64_t objnum = 0;
 	boolean_t has_feature = B_FALSE;
 
 	elem = NULL;
 	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
 		uint64_t intval;
 		char *strval, *slash, *check, *fname;
 		const char *propname = nvpair_name(elem);
 		zpool_prop_t prop = zpool_name_to_prop(propname);
 
 		switch (prop) {
 		case ZPROP_INVAL:
 			/* Not a regular property: must be a feature name. */
 			if (!zpool_prop_feature(propname)) {
 				error = EINVAL;
 				break;
 			}
 
 			/*
 			 * Sanitize the input.
 			 */
 			if (nvpair_type(elem) != DATA_TYPE_UINT64) {
 				error = EINVAL;
 				break;
 			}
 
 			if (nvpair_value_uint64(elem, &intval) != 0) {
 				error = EINVAL;
 				break;
 			}
 
 			if (intval != 0) {
 				error = EINVAL;
 				break;
 			}
 
 			/* The feature name is the part after the '@'. */
 			fname = strchr(propname, '@') + 1;
 			if (zfeature_lookup_name(fname, NULL) != 0) {
 				error = EINVAL;
 				break;
 			}
 
 			has_feature = B_TRUE;
 			break;
 
 		case ZPOOL_PROP_VERSION:
 			/*
 			 * The version may not go backwards, may not exceed
 			 * SPA_VERSION_BEFORE_FEATURES, and may not be combined
 			 * with a feature property seen earlier in the list.
 			 */
 			error = nvpair_value_uint64(elem, &intval);
 			if (!error &&
 			    (intval < spa_version(spa) ||
 			    intval > SPA_VERSION_BEFORE_FEATURES ||
 			    has_feature))
 				error = EINVAL;
 			break;
 
 		case ZPOOL_PROP_DELEGATION:
 		case ZPOOL_PROP_AUTOREPLACE:
 		case ZPOOL_PROP_LISTSNAPS:
 		case ZPOOL_PROP_AUTOEXPAND:
 			/* Boolean properties: only 0 and 1 are accepted. */
 			error = nvpair_value_uint64(elem, &intval);
 			if (!error && intval > 1)
 				error = EINVAL;
 			break;
 
 		case ZPOOL_PROP_BOOTFS:
 			/*
 			 * If the pool version is less than SPA_VERSION_BOOTFS,
 			 * or the pool is still being created (version == 0),
 			 * the bootfs property cannot be set.
 			 */
 			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
 				error = ENOTSUP;
 				break;
 			}
 
 			/*
 			 * Make sure the vdev config is bootable
 			 */
 			if (!vdev_is_bootable(spa->spa_root_vdev)) {
 				error = ENOTSUP;
 				break;
 			}
 
 			reset_bootfs = 1;
 
 			error = nvpair_value_string(elem, &strval);
 
 			if (!error) {
 				objset_t *os;
 				uint64_t compress;
 
 				/* An empty name resets bootfs to its default. */
 				if (strval == NULL || strval[0] == '\0') {
 					objnum = zpool_prop_default_numeric(
 					    ZPOOL_PROP_BOOTFS);
 					break;
 				}
 
 				error = dmu_objset_hold(strval, FTAG, &os);
 				if (error != 0)
 					break;
 
 				/* Must be ZPL and not gzip compressed. */
 
 				if (dmu_objset_type(os) != DMU_OST_ZFS) {
 					error = ENOTSUP;
 				} else if ((error = dsl_prop_get_integer(strval,
 				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
 				    &compress, NULL)) == 0 &&
 				    !BOOTFS_COMPRESS_VALID(compress)) {
 					error = ENOTSUP;
 				} else {
 					objnum = dmu_objset_id(os);
 				}
 				dmu_objset_rele(os, FTAG);
 			}
 			break;
 
 		case ZPOOL_PROP_FAILUREMODE:
 			error = nvpair_value_uint64(elem, &intval);
 			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
 			    intval > ZIO_FAILURE_MODE_PANIC))
 				error = EINVAL;
 
 			/*
 			 * This is a special case which only occurs when
 			 * the pool has completely failed. This allows
 			 * the user to change the in-core failmode property
 			 * without syncing it out to disk (I/Os might
 			 * currently be blocked). We do this by returning
 			 * EIO to the caller (spa_prop_set) to trick it
 			 * into thinking we encountered a property validation
 			 * error.
 			 */
 			if (!error && spa_suspended(spa)) {
 				spa->spa_failmode = intval;
 				error = EIO;
 			}
 			break;
 
 		case ZPOOL_PROP_CACHEFILE:
 			if ((error = nvpair_value_string(elem, &strval)) != 0)
 				break;
 
 			/* "" and "none" are always acceptable. */
 			if (strval[0] == '\0')
 				break;
 
 			if (strcmp(strval, "none") == 0)
 				break;
 
 			/* Anything else must be an absolute path ... */
 			if (strval[0] != '/') {
 				error = EINVAL;
 				break;
 			}
 
 			slash = strrchr(strval, '/');
 			ASSERT(slash != NULL);
 
 			/* ... whose last component is not empty, "." or "..". */
 			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
 			    strcmp(slash, "/..") == 0)
 				error = EINVAL;
 			break;
 
 		case ZPOOL_PROP_COMMENT:
 			if ((error = nvpair_value_string(elem, &strval)) != 0)
 				break;
 			for (check = strval; *check != '\0'; check++) {
 				/*
 				 * The kernel doesn't have an easy isprint()
 				 * check.  For this kernel check, we merely
 				 * check ASCII apart from DEL.  Fix this if
 				 * there is an easy-to-use kernel isprint().
 				 *
 				 * The byte is compared as unsigned so that
 				 * values >= 0x80 are rejected even where plain
 				 * char is signed, and the cursor is advanced
 				 * only by the for statement so that every
 				 * byte is examined and the terminating NUL
 				 * can never be stepped over.
 				 */
 				if ((unsigned char)*check >= 0x7f) {
 					error = EINVAL;
 					break;
 				}
 			}
 			if (strlen(strval) > ZPROP_MAX_COMMENT)
 				error = E2BIG;
 			break;
 
 		case ZPOOL_PROP_DEDUPDITTO:
 			if (spa_version(spa) < SPA_VERSION_DEDUP)
 				error = ENOTSUP;
 			else
 				error = nvpair_value_uint64(elem, &intval);
 			if (error == 0 &&
 			    intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
 				error = EINVAL;
 			break;
 		}
 
 		if (error)
 			break;
 	}
 
 	/* Swap the bootfs dataset name for the resolved object number. */
 	if (!error && reset_bootfs) {
 		error = nvlist_remove(props,
 		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);
 
 		if (!error) {
 			error = nvlist_add_uint64(props,
 			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
 		}
 	}
 
 	return (error);
 }
 
 /*
  * If 'nvp' carries a cachefile property, push a matching entry onto the
  * head of the pool's config dirent list: an empty value selects the
  * default spa_config_path, "none" is recorded as a NULL path, and any
  * other value is copied as given.  When 'need_sync' is set, an async
  * config update is requested afterwards.  Nothing happens if the
  * property is absent.
  */
 void
 spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
 {
 	const char *propname = zpool_prop_to_name(ZPOOL_PROP_CACHEFILE);
 	spa_config_dirent_t *dirent;
 	char *path;
 
 	if (nvlist_lookup_string(nvp, propname, &path) != 0)
 		return;
 
 	dirent = kmem_alloc(sizeof (spa_config_dirent_t), KM_SLEEP);
 
 	if (path[0] == '\0') {
 		dirent->scd_path = spa_strdup(spa_config_path);
 	} else if (strcmp(path, "none") != 0) {
 		dirent->scd_path = spa_strdup(path);
 	} else {
 		dirent->scd_path = NULL;
 	}
 
 	list_insert_head(&spa->spa_config_list, dirent);
 
 	if (!need_sync)
 		return;
 
 	spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 }
 
 /*
  * Validate and apply the pool properties in 'nvp'.
  *
  * cachefile, altroot and readonly entries are skipped here.  A version
  * entry, or a feature entry (which implies SPA_VERSION_FEATURES), is
  * applied with its own spa_sync_version task unless the pool is already
  * at that version.  The first entry of any other kind ends the scan; it,
  * or a feature entry seen before it, causes one spa_sync_props task to be
  * run over the whole list.
  *
  * Returns 0 on success or the first error from validation or a sync task.
  */
 int
 spa_prop_set(spa_t *spa, nvlist_t *nvp)
 {
 	nvpair_t *pair = NULL;
 	boolean_t sync_props = B_FALSE;
 	int err;
 
 	err = spa_prop_validate(spa, nvp);
 	if (err != 0)
 		return (err);
 
 	while ((pair = nvlist_next_nvpair(nvp, pair)) != NULL) {
 		zpool_prop_t prop = zpool_name_to_prop(nvpair_name(pair));
 		uint64_t version;
 
 		if (prop == ZPOOL_PROP_CACHEFILE ||
 		    prop == ZPOOL_PROP_ALTROOT ||
 		    prop == ZPOOL_PROP_READONLY)
 			continue;
 
 		if (prop != ZPOOL_PROP_VERSION && prop != ZPROP_INVAL) {
 			/* An ordinary property: stop scanning and sync. */
 			sync_props = B_TRUE;
 			break;
 		}
 
 		if (prop == ZPROP_INVAL) {
 			ASSERT(zpool_prop_feature(nvpair_name(pair)));
 			version = SPA_VERSION_FEATURES;
 			sync_props = B_TRUE;
 		} else {
 			VERIFY(nvpair_value_uint64(pair, &version) == 0);
 		}
 
 		/* Nothing to do when the pool is already at this version. */
 		if (version == spa_version(spa))
 			continue;
 
 		/*
 		 * Besides the pool directory object, the sync task may
 		 * create the pool properties object, the features-for-read
 		 * and features-for-write objects, and the feature
 		 * descriptions object.
 		 */
 		err = dsl_sync_task_do(spa_get_dsl(spa), NULL,
 		    spa_sync_version, spa, &version, 6);
 		if (err)
 			return (err);
 	}
 
 	if (!sync_props)
 		return (0);
 
 	return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
 	    spa, nvp, 6));
 }
 
 /*
  * Clear the pool's bootfs property when it currently refers to 'dsobj'.
  * Nothing is done if bootfs names a different object or the pool has no
  * property object.
  */
 void
 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
 {
 	const char *bootfs_name;
 
 	if (spa->spa_bootfs != dsobj || spa->spa_pool_props_object == 0)
 		return;
 
 	bootfs_name = zpool_prop_to_name(ZPOOL_PROP_BOOTFS);
 	VERIFY(zap_remove(spa->spa_meta_objset, spa->spa_pool_props_object,
 	    bootfs_name, tx) == 0);
 	spa->spa_bootfs = 0;
 }
 
 /*
  * Check half of the guid-change sync task: arg1 is the spa_t, arg2 points
  * at the proposed new guid.  Returns ENXIO unless the root vdev's state,
  * sampled under SCL_STATE, is VDEV_STATE_HEALTHY; otherwise returns 0.
  */
 /*ARGSUSED*/
 static int
 spa_change_guid_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	spa_t *pool = arg1;
 	uint64_t *candidate = arg2;
 	vdev_t *root = pool->spa_root_vdev;
 	uint64_t root_state;
 
 	spa_config_enter(pool, SCL_STATE, FTAG, RW_READER);
 	root_state = root->vdev_state;
 	spa_config_exit(pool, SCL_STATE, FTAG);
 
 	if (root_state == VDEV_STATE_HEALTHY) {
 		/* The new guid is expected to differ from the current one. */
 		ASSERT3U(spa_guid(pool), !=, *candidate);
 		return (0);
 	}
 
 	return (ENXIO);
 }
 
 /*
  * Sync half of the guid-change sync task: arg1 is the spa_t, arg2 points
  * at the new guid.  Under SCL_STATE the root vdev's guid is replaced, its
  * guid sum is adjusted by the difference, and the root vdev is marked
  * config-dirty; the change is then written to the pool history.
  */
 static void
 spa_change_guid_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	spa_t *pool = arg1;
 	uint64_t *guidp = arg2;
 	vdev_t *root = pool->spa_root_vdev;
 	uint64_t prev_guid = spa_guid(pool);
 
 	spa_config_enter(pool, SCL_STATE, FTAG, RW_READER);
 	root->vdev_guid = *guidp;
 	root->vdev_guid_sum += (*guidp - prev_guid);
 	vdev_config_dirty(root);
 	spa_config_exit(pool, SCL_STATE, FTAG);
 
 #ifdef __FreeBSD__
 	/*
 	 * TODO: until the recent illumos logging changes are merged,
 	 *       record the reguid as a pool property change.
 	 */
 	spa_history_log_internal(LOG_POOL_PROPSET, pool, tx,
 	    "guid change old=%llu new=%llu", prev_guid, *guidp);
 #else
 	spa_history_log_internal(pool, "guid change", tx, "old=%lld new=%lld",
 	    prev_guid, *guidp);
 #endif
 }
 
 /*
  * Give the pool a new GUID, so that a pool built from a clone of our own
  * vdevs can later be re-imported.  The root vdev's guid and the pool guid
  * are changed and all vdevs are marked dirty.  Every vdev must be online
  * while this happens, since an absent vdev would otherwise be orphaned
  * from the pool.  On success the config is synced and a sysevent is
  * posted for any watchers.
  *
  * Returns 0 on success or the error from the sync task.
  */
 int
 spa_change_guid(spa_t *spa)
 {
 	uint64_t newguid;
 	int err;
 
 	mutex_enter(&spa_namespace_lock);
 
 	newguid = spa_generate_guid(NULL);
 	err = dsl_sync_task_do(spa_get_dsl(spa), spa_change_guid_check,
 	    spa_change_guid_sync, spa, &newguid, 5);
 	if (err != 0) {
 		mutex_exit(&spa_namespace_lock);
 		return (err);
 	}
 
 	spa_config_sync(spa, B_FALSE, B_TRUE);
 	spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID);
 
 	mutex_exit(&spa_namespace_lock);
 
 	return (0);
 }
 
 /*
  * ==========================================================================
  * SPA state manipulation (open/create/destroy/import/export)
  * ==========================================================================
  */
 
 /*
  * AVL comparator for spa_error_entry_t nodes, ordering them by the raw
  * bytes of their bookmark.  Returns -1, 0 or 1.
  *
  * memcmp() is used rather than bcmp(): bcmp() only promises a zero or
  * non-zero result, so testing the sign of its return value does not give
  * a usable ordering.
  */
 static int
 spa_error_entry_compare(const void *a, const void *b)
 {
 	const spa_error_entry_t *sa = (const spa_error_entry_t *)a;
 	const spa_error_entry_t *sb = (const spa_error_entry_t *)b;
 	int ret;
 
 	ret = memcmp(&sa->se_bookmark, &sb->se_bookmark,
 	    sizeof (zbookmark_t));
 
 	if (ret < 0)
 		return (-1);
 	else if (ret > 0)
 		return (1);
 	else
 		return (0);
 }
 
 /*
  * Hand the caller byte copies of the pool's current "last" and "scrub"
  * error trees and re-create both in-pool trees.  The caller must hold
  * spa_errlist_lock.
  */
 void
 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
 {
 	avl_tree_t *cur_last = &spa->spa_errlist_last;
 	avl_tree_t *cur_scrub = &spa->spa_errlist_scrub;
 
 	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
 
 	/* Snapshot the tree headers for the caller ... */
 	bcopy(cur_last, last, sizeof (avl_tree_t));
 	bcopy(cur_scrub, scrub, sizeof (avl_tree_t));
 
 	/* ... then re-create both in-pool trees. */
 	avl_create(cur_scrub, spa_error_entry_compare,
 	    sizeof (spa_error_entry_t),
 	    offsetof(spa_error_entry_t, se_avl));
 	avl_create(cur_last, spa_error_entry_compare,
 	    sizeof (spa_error_entry_t),
 	    offsetof(spa_error_entry_t, se_avl));
 }
 
 /*
  * Create one zio taskq named 'name' for the pool, sized according to
  * 'mode':
  *   zti_mode_null            no taskq; NULL is returned
  *   zti_mode_fixed           'value' is used as given (at least 1)
  *   zti_mode_batch           'value' is replaced by zio_taskq_batch_pct and
  *                            TASKQ_THREADS_CPU_PCT is set
  *   zti_mode_online_percent  'value' is used with TASKQ_THREADS_CPU_PCT
  * Any other mode panics.
  */
 static taskq_t *
 spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode,
     uint_t value)
 {
 	boolean_t is_batch = B_FALSE;
 	uint_t tq_flags = TASKQ_PREPOPULATE;
 
 	if (mode == zti_mode_null) {
 		/* no taskq needed */
 		return (NULL);
 	} else if (mode == zti_mode_fixed) {
 		ASSERT3U(value, >=, 1);
 		value = MAX(value, 1);
 	} else if (mode == zti_mode_batch) {
 		is_batch = B_TRUE;
 		tq_flags |= TASKQ_THREADS_CPU_PCT;
 		value = zio_taskq_batch_pct;
 	} else if (mode == zti_mode_online_percent) {
 		tq_flags |= TASKQ_THREADS_CPU_PCT;
 	} else {
 		panic("unrecognized mode for %s taskq (%u:%u) in "
 		    "spa_activate()",
 		    name, mode, value);
 	}
 
 #ifdef SYSDC
 	/* A pool with its own process may use the SDC variant. */
 	if (zio_taskq_sysdc && spa->spa_proc != &p0) {
 		if (is_batch)
 			tq_flags |= TASKQ_DC_BATCH;
 
 		return (taskq_create_sysdc(name, value, 50, INT_MAX,
 		    spa->spa_proc, zio_taskq_basedc, tq_flags));
 	}
 #endif
 	return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX,
 	    spa->spa_proc, tq_flags));
 }
 
 /*
  * Populate spa_zio_taskq[][] with one taskq per (zio type, taskq type)
  * pair, named "<zio type>_<taskq type>" and configured from the matching
  * zio_taskqs[][] entry.
  */
 static void
 spa_create_zio_taskqs(spa_t *spa)
 {
 	for (int type = 0; type < ZIO_TYPES; type++) {
 		for (int qtype = 0; qtype < ZIO_TASKQ_TYPES; qtype++) {
 			const zio_taskq_info_t *info = &zio_taskqs[type][qtype];
 			char tqname[32];
 
 			(void) snprintf(tqname, sizeof (tqname), "%s_%s",
 			    zio_type_name[type], zio_taskq_types[qtype]);
 
 			spa->spa_zio_taskq[type][qtype] = spa_taskq_create(spa,
 			    tqname, info->zti_mode, info->zti_value);
 		}
 	}
 }
 
 #ifdef _KERNEL
 #ifdef SPA_PROCESS
 static void
 spa_thread(void *arg)
 {
 	callb_cpr_t cprinfo;
 
 	spa_t *spa = arg;
 	user_t *pu = PTOU(curproc);
 
 	CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
 	    spa->spa_name);
 
 	ASSERT(curproc != &p0);
 	(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
 	    "zpool-%s", spa->spa_name);
 	(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));
 
 #ifdef PSRSET_BIND
 	/* bind this thread to the requested psrset */
 	if (zio_taskq_psrset_bind != PS_NONE) {
 		pool_lock();
 		mutex_enter(&cpu_lock);
 		mutex_enter(&pidlock);
 		mutex_enter(&curproc->p_lock);
 
 		if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
 		    0, NULL, NULL) == 0)  {
 			curthread->t_bind_pset = zio_taskq_psrset_bind;
 		} else {
 			cmn_err(CE_WARN,
 			    "Couldn't bind process for zfs pool \"%s\" to "
 			    "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
 		}
 
 		mutex_exit(&curproc->p_lock);
 		mutex_exit(&pidlock);
 		mutex_exit(&cpu_lock);
 		pool_unlock();
 	}
 #endif
 
 #ifdef SYSDC
 	if (zio_taskq_sysdc) {
 		sysdc_thread_enter(curthread, 100, 0);
 	}
 #endif
 
 	spa->spa_proc = curproc;
 	spa->spa_did = curthread->t_did;
 
 	spa_create_zio_taskqs(spa);
 
 	mutex_enter(&spa->spa_proc_lock);
 	ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);
 
 	spa->spa_proc_state = SPA_PROC_ACTIVE;
 	cv_broadcast(&spa->spa_proc_cv);
 
 	CALLB_CPR_SAFE_BEGIN(&cprinfo);
 	while (spa->spa_proc_state == SPA_PROC_ACTIVE)
 		cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
 	CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);
 
 	ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
 	spa->spa_proc_state = SPA_PROC_GONE;
 	spa->spa_proc = &p0;
 	cv_broadcast(&spa->spa_proc_cv);
 	CALLB_CPR_EXIT(&cprinfo);	/* drops spa_proc_lock */
 
 	mutex_enter(&curproc->p_lock);
 	lwp_exit();
 }
 #endif	/* SPA_PROCESS */
 #endif
 
 /*
  * Activate an uninitialized pool.
  *
  * Marks the pool POOL_STATE_ACTIVE with the given open 'mode', creates the
  * normal and log metaslab classes, optionally starts a covering process
  * (SPA_PROCESS builds only), creates the zio taskqs if no process did so,
  * starts the TRIM thread, and initializes the dirty-vdev lists, the vdev
  * txg list and the two error-log AVL trees.  spa_deactivate() undoes this.
  */
 static void
 spa_activate(spa_t *spa, int mode)
 {
 	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
 
 	spa->spa_state = POOL_STATE_ACTIVE;
 	spa->spa_mode = mode;
 
 	spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
 	spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);
 
 	/* Try to create a covering process */
 	mutex_enter(&spa->spa_proc_lock);
 	ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
 	ASSERT(spa->spa_proc == &p0);
 	spa->spa_did = 0;
 
 #ifdef SPA_PROCESS
 	/* Only create a process if we're going to be around a while. */
 	if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
 		if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
 		    NULL, 0) == 0) {
 			/* Wait for spa_thread() to report SPA_PROC_ACTIVE. */
 			spa->spa_proc_state = SPA_PROC_CREATED;
 			while (spa->spa_proc_state == SPA_PROC_CREATED) {
 				cv_wait(&spa->spa_proc_cv,
 				    &spa->spa_proc_lock);
 			}
 			ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
 			ASSERT(spa->spa_proc != &p0);
 			ASSERT(spa->spa_did != 0);
 		} else {
 #ifdef _KERNEL
 			cmn_err(CE_WARN,
 			    "Couldn't create process for zfs pool \"%s\"\n",
 			    spa->spa_name);
 #endif
 		}
 	}
 #endif	/* SPA_PROCESS */
 	mutex_exit(&spa->spa_proc_lock);
 
 	/* If we didn't create a process, we need to create our taskqs. */
 	/*
 	 * NOTE(review): this ASSERT only holds when no covering process was
 	 * created; the SPA_PROCESS path above asserts spa_proc != &p0 on
 	 * success, so the two appear mutually exclusive -- confirm that
 	 * SPA_PROCESS is never defined together with this assertion enabled.
 	 */
 	ASSERT(spa->spa_proc == &p0);
 	if (spa->spa_proc == &p0) {
 		spa_create_zio_taskqs(spa);
 	}
 
 	/*
 	 * Start TRIM thread.
 	 */
 	trim_thread_create(spa);
 
 	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
 	    offsetof(vdev_t, vdev_config_dirty_node));
 	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
 	    offsetof(vdev_t, vdev_state_dirty_node));
 
 	txg_list_create(&spa->spa_vdev_txg_list,
 	    offsetof(struct vdev, vdev_txg_node));
 
 	/* Both error logs are ordered by spa_error_entry_compare(). */
 	avl_create(&spa->spa_errlist_scrub,
 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
 	    offsetof(spa_error_entry_t, se_avl));
 	avl_create(&spa->spa_errlist_last,
 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
 	    offsetof(spa_error_entry_t, se_avl));
 }
 
 /*
  * Opposite of spa_activate().
  *
  * The pool must already be quiesced: syncing stopped and no dsl pool,
  * root vdev or async zio root remaining (see the assertions).  This stops
  * the TRIM thread, destroys the lists, taskqs, metaslab classes and
  * error-log trees that spa_activate() set up, returns the pool to
  * POOL_STATE_UNINITIALIZED, and shuts down the covering process if one
  * exists.
  */
 static void
 spa_deactivate(spa_t *spa)
 {
 	ASSERT(spa->spa_sync_on == B_FALSE);
 	ASSERT(spa->spa_dsl_pool == NULL);
 	ASSERT(spa->spa_root_vdev == NULL);
 	ASSERT(spa->spa_async_zio_root == NULL);
 	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
 
 	/*
 	 * Stop TRIM thread in case spa_unload() wasn't called directly
 	 * before spa_deactivate().
 	 */
 	trim_thread_destroy(spa);
 
 	txg_list_destroy(&spa->spa_vdev_txg_list);
 
 	list_destroy(&spa->spa_config_dirty_list);
 	list_destroy(&spa->spa_state_dirty_list);
 
 	/* Destroy every zio taskq; slots may be NULL (zti_mode_null). */
 	for (int t = 0; t < ZIO_TYPES; t++) {
 		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
 			if (spa->spa_zio_taskq[t][q] != NULL)
 				taskq_destroy(spa->spa_zio_taskq[t][q]);
 			spa->spa_zio_taskq[t][q] = NULL;
 		}
 	}
 
 	metaslab_class_destroy(spa->spa_normal_class);
 	spa->spa_normal_class = NULL;
 
 	metaslab_class_destroy(spa->spa_log_class);
 	spa->spa_log_class = NULL;
 
 	/*
 	 * If this was part of an import or the open otherwise failed, we may
 	 * still have errors left in the queues.  Empty them just in case.
 	 */
 	spa_errlog_drain(spa);
 
 	avl_destroy(&spa->spa_errlist_scrub);
 	avl_destroy(&spa->spa_errlist_last);
 
 	spa->spa_state = POOL_STATE_UNINITIALIZED;
 
 	/*
 	 * If a covering process is running, ask it to exit and wait until
 	 * spa_thread() reports SPA_PROC_GONE.
 	 */
 	mutex_enter(&spa->spa_proc_lock);
 	if (spa->spa_proc_state != SPA_PROC_NONE) {
 		ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
 		spa->spa_proc_state = SPA_PROC_DEACTIVATE;
 		cv_broadcast(&spa->spa_proc_cv);
 		while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
 			ASSERT(spa->spa_proc != &p0);
 			cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
 		}
 		ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
 		spa->spa_proc_state = SPA_PROC_NONE;
 	}
 	ASSERT(spa->spa_proc == &p0);
 	mutex_exit(&spa->spa_proc_lock);
 
 #ifdef SPA_PROCESS
 	/*
 	 * We want to make sure spa_thread() has actually exited the ZFS
 	 * module, so that the module can't be unloaded out from underneath
 	 * it.
 	 */
 	if (spa->spa_did != 0) {
 		thread_join(spa->spa_did);
 		spa->spa_did = 0;
 	}
 #endif	/* SPA_PROCESS */
 }
 
 /*
  * Verify a pool configuration and build the matching vdev tree.  Every
  * vdev is created by vdev_alloc(), which performs all vdev validation;
  * interior vdevs then have each entry of their ZPOOL_CONFIG_CHILDREN array
  * parsed recursively.  The resulting vdevs are left closed, ready for a
  * subsequent open, create or import.
  *
  * On success 0 is returned and *vdp refers to the new (sub)tree.  If the
  * children array is present but unreadable, or a child fails to parse, the
  * vdev allocated here is freed, *vdp is set to NULL and an error is
  * returned.
  */
 static int
 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
     uint_t id, int atype)
 {
 	nvlist_t **kids;
 	uint_t nkids;
 	int err;
 
 	err = vdev_alloc(spa, vdp, nv, parent, id, atype);
 	if (err != 0)
 		return (err);
 
 	/* Leaf vdevs have no children to descend into. */
 	if ((*vdp)->vdev_ops->vdev_op_leaf)
 		return (0);
 
 	err = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
 	    &kids, &nkids);
 	switch (err) {
 	case 0:
 		break;
 	case ENOENT:
 		/* An interior vdev without a children array is accepted. */
 		return (0);
 	default:
 		vdev_free(*vdp);
 		*vdp = NULL;
 		return (EINVAL);
 	}
 
 	for (int c = 0; c < nkids; c++) {
 		vdev_t *kid;
 
 		err = spa_config_parse(spa, &kid, kids[c], *vdp, c, atype);
 		if (err != 0) {
 			vdev_free(*vdp);
 			*vdp = NULL;
 			return (err);
 		}
 	}
 
 	ASSERT(*vdp != NULL);
 
 	return (0);
 }
 
 /*
  * Opposite of spa_load().
  *
  * With spa_namespace_lock held by the caller, this stops the TRIM thread,
  * suspends async tasks, stops txg syncing, waits for outstanding async
  * I/O, closes the deferred bpobj and the dsl pool, unloads the DDT, and
  * then -- holding all config locks as writer -- drops the l2cache, frees
  * the root vdev tree and the spare and l2cache aux vdevs together with
  * their stashed configs, and releases the pool comment.
  */
 static void
 spa_unload(spa_t *spa)
 {
 	int i;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	/*
 	 * Stop TRIM thread.
 	 */
 	trim_thread_destroy(spa);
 
 	/*
 	 * Stop async tasks.
 	 */
 	spa_async_suspend(spa);
 
 	/*
 	 * Stop syncing.
 	 */
 	if (spa->spa_sync_on) {
 		txg_sync_stop(spa->spa_dsl_pool);
 		spa->spa_sync_on = B_FALSE;
 	}
 
 	/*
 	 * Wait for any outstanding async I/O to complete.
 	 */
 	if (spa->spa_async_zio_root != NULL) {
 		(void) zio_wait(spa->spa_async_zio_root);
 		spa->spa_async_zio_root = NULL;
 	}
 
 	bpobj_close(&spa->spa_deferred_bpobj);
 
 	/*
 	 * Close the dsl pool.
 	 */
 	if (spa->spa_dsl_pool) {
 		dsl_pool_close(spa->spa_dsl_pool);
 		spa->spa_dsl_pool = NULL;
 		spa->spa_meta_objset = NULL;
 	}
 
 	ddt_unload(spa);
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 
 	/*
 	 * Drop and purge level 2 cache
 	 */
 	spa_l2cache_drop(spa);
 
 	/*
 	 * Close all vdevs.
 	 */
 	if (spa->spa_root_vdev)
 		vdev_free(spa->spa_root_vdev);
 	/* Freeing the root vdev is expected to clear spa_root_vdev. */
 	ASSERT(spa->spa_root_vdev == NULL);
 
 	/* Free the spare vdevs, their array and their stashed config. */
 	for (i = 0; i < spa->spa_spares.sav_count; i++)
 		vdev_free(spa->spa_spares.sav_vdevs[i]);
 	if (spa->spa_spares.sav_vdevs) {
 		kmem_free(spa->spa_spares.sav_vdevs,
 		    spa->spa_spares.sav_count * sizeof (void *));
 		spa->spa_spares.sav_vdevs = NULL;
 	}
 	if (spa->spa_spares.sav_config) {
 		nvlist_free(spa->spa_spares.sav_config);
 		spa->spa_spares.sav_config = NULL;
 	}
 	spa->spa_spares.sav_count = 0;
 
 	/* Likewise for the l2cache vdevs, clearing their stats first. */
 	for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
 		vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
 		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
 	}
 	if (spa->spa_l2cache.sav_vdevs) {
 		kmem_free(spa->spa_l2cache.sav_vdevs,
 		    spa->spa_l2cache.sav_count * sizeof (void *));
 		spa->spa_l2cache.sav_vdevs = NULL;
 	}
 	if (spa->spa_l2cache.sav_config) {
 		nvlist_free(spa->spa_l2cache.sav_config);
 		spa->spa_l2cache.sav_config = NULL;
 	}
 	spa->spa_l2cache.sav_count = 0;
 
 	/* Reset the suspend count raised by spa_async_suspend() above. */
 	spa->spa_async_suspended = 0;
 
 	if (spa->spa_comment != NULL) {
 		spa_strfree(spa->spa_comment);
 		spa->spa_comment = NULL;
 	}
 
 	spa_config_exit(spa, SCL_ALL, FTAG);
 }
 
 /*
  * Load (or re-load) the current list of vdevs describing the active spares for
  * this pool.  When this is called, we have some form of basic information in
  * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
  * then re-generate a more complete list including status information.
  *
  * The caller must hold all config locks as writer.
  */
 static void
 spa_load_spares(spa_t *spa)
 {
 	nvlist_t **spares;
 	uint_t nspares;
 	int i;
 	vdev_t *vd, *tvd;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	/*
 	 * First, close and free any existing spare vdevs.
 	 */
 	for (i = 0; i < spa->spa_spares.sav_count; i++) {
 		vd = spa->spa_spares.sav_vdevs[i];
 
 		/*
 		 * Undo what the load loop below did for the matching in-pool
 		 * vdev (compare the spa_spare_add() call there).
 		 */
 		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
 		    B_FALSE)) != NULL && tvd->vdev_isspare)
 			spa_spare_remove(tvd);
 		vdev_close(vd);
 		vdev_free(vd);
 	}
 
 	if (spa->spa_spares.sav_vdevs)
 		kmem_free(spa->spa_spares.sav_vdevs,
 		    spa->spa_spares.sav_count * sizeof (void *));
 
 	if (spa->spa_spares.sav_config == NULL)
 		nspares = 0;
 	else
 		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
 		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
 
 	spa->spa_spares.sav_count = (int)nspares;
 	spa->spa_spares.sav_vdevs = NULL;
 
 	if (nspares == 0)
 		return;
 
 	/*
 	 * Construct the array of vdevs, opening them to get status in the
 	 * process.   For each spare, there are potentially two different
 	 * vdev_t structures associated with it: one in the list of spares
 	 * (used only for basic validation purposes) and one in the active vdev
 	 * configuration (if it's spared in).  During this phase we open and
 	 * validate each vdev on the spare list.  If the vdev also exists in the
 	 * active configuration, then we also mark this vdev as an active spare.
 	 */
 	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
 	    KM_SLEEP);
 	for (i = 0; i < spa->spa_spares.sav_count; i++) {
 		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
 		    VDEV_ALLOC_SPARE) == 0);
 		ASSERT(vd != NULL);
 
 		spa->spa_spares.sav_vdevs[i] = vd;
 
 		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
 		    B_FALSE)) != NULL) {
 			if (!tvd->vdev_isspare)
 				spa_spare_add(tvd);
 
 			/*
 			 * We only mark the spare active if we were successfully
 			 * able to load the vdev.  Otherwise, importing a pool
 			 * with a bad active spare would result in strange
 			 * behavior, because multiple pools would think the
 			 * spare is actively in use.
 			 *
 			 * There is a vulnerability here to an equally bizarre
 			 * circumstance, where a dead active spare is later
 			 * brought back to life (onlined or otherwise).  Given
 			 * the rarity of this scenario, and the extra complexity
 			 * it adds, we ignore the possibility.
 			 */
 			if (!vdev_is_dead(tvd))
 				spa_spare_activate(tvd);
 		}
 
 		/* An aux vdev is its own top-level vdev. */
 		vd->vdev_top = vd;
 		vd->vdev_aux = &spa->spa_spares;
 
 		/* A spare that cannot be opened stays listed but is not added. */
 		if (vdev_open(vd) != 0)
 			continue;
 
 		if (vdev_validate_aux(vd) == 0)
 			spa_spare_add(vd);
 	}
 
 	/*
 	 * Recompute the stashed list of spares, with status information
 	 * this time.
 	 */
 	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
 	    DATA_TYPE_NVLIST_ARRAY) == 0);
 
 	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
 	    KM_SLEEP);
 	for (i = 0; i < spa->spa_spares.sav_count; i++)
 		spares[i] = vdev_config_generate(spa,
 		    spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
 	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
 	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
 	/* The generated nvlists and the temporary array are freed again. */
 	for (i = 0; i < spa->spa_spares.sav_count; i++)
 		nvlist_free(spares[i]);
 	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
 }
 
 /*
  * Load (or re-load) the current list of vdevs describing the active l2cache for
  * this pool.  When this is called, we have some form of basic information in
  * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
  * then re-generate a more complete list including status information.
  * Devices which are already active have their details maintained, and are
  * not re-opened.
  *
  * The caller must hold all config locks as writer.
  */
 static void
 spa_load_l2cache(spa_t *spa)
 {
 	nvlist_t **l2cache;
 	uint_t nl2cache;
 	int i, j, oldnvdevs;
 	uint64_t guid;
 	vdev_t *vd, **oldvdevs, **newvdevs;
 	spa_aux_vdev_t *sav = &spa->spa_l2cache;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	if (sav->sav_config != NULL) {
 		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
 		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
 		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
 	} else {
 		nl2cache = 0;
+		newvdevs = NULL;
 	}
 
 	/* Detach the old array; survivors are moved into newvdevs below. */
 	oldvdevs = sav->sav_vdevs;
 	oldnvdevs = sav->sav_count;
 	sav->sav_vdevs = NULL;
 	sav->sav_count = 0;
 
 	/*
 	 * Process new nvlist of vdevs.
 	 */
 	for (i = 0; i < nl2cache; i++) {
 		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
 		    &guid) == 0);
 
 		newvdevs[i] = NULL;
 		for (j = 0; j < oldnvdevs; j++) {
 			vd = oldvdevs[j];
 			if (vd != NULL && guid == vd->vdev_guid) {
 				/*
 				 * Retain previous vdev for add/remove ops.
 				 */
 				newvdevs[i] = vd;
 				oldvdevs[j] = NULL;
 				break;
 			}
 		}
 
 		if (newvdevs[i] == NULL) {
 			/*
 			 * Create new vdev
 			 */
 			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
 			    VDEV_ALLOC_L2CACHE) == 0);
 			ASSERT(vd != NULL);
 			newvdevs[i] = vd;
 
 			/*
 			 * Commit this vdev as an l2cache device,
 			 * even if it fails to open.
 			 */
 			spa_l2cache_add(vd);
 
 			vd->vdev_top = vd;
 			vd->vdev_aux = sav;
 
 			spa_l2cache_activate(vd);
 
 			if (vdev_open(vd) != 0)
 				continue;
 
 			(void) vdev_validate_aux(vd);
 
 			if (!vdev_is_dead(vd))
 				l2arc_add_vdev(spa, vd);
 		}
 	}
 
 	/*
 	 * Purge vdevs that were dropped
 	 */
 	for (i = 0; i < oldnvdevs; i++) {
 		uint64_t pool;
 
 		/* Entries retained above were set to NULL in oldvdevs. */
 		vd = oldvdevs[i];
 		if (vd != NULL) {
 			ASSERT(vd->vdev_isl2cache);
 
 			if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
 			    pool != 0ULL && l2arc_vdev_present(vd))
 				l2arc_remove_vdev(vd);
 			vdev_clear_stats(vd);
 			vdev_free(vd);
 		}
 	}
 
 	if (oldvdevs)
 		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));
 
 	/*
 	 * Without a config there is nothing to publish; sav_count is still 0
 	 * here, so the cleanup at 'out' touches neither l2cache nor newvdevs.
 	 */
 	if (sav->sav_config == NULL)
 		goto out;
 
 	sav->sav_vdevs = newvdevs;
 	sav->sav_count = (int)nl2cache;
 
 	/*
 	 * Recompute the stashed list of l2cache devices, with status
 	 * information this time.
 	 */
 	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
 	    DATA_TYPE_NVLIST_ARRAY) == 0);
 
 	/* From here on 'l2cache' is our own temporary array. */
 	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
 	for (i = 0; i < sav->sav_count; i++)
 		l2cache[i] = vdev_config_generate(spa,
 		    sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
 	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
 	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
 out:
 	for (i = 0; i < sav->sav_count; i++)
 		nvlist_free(l2cache[i]);
 	if (sav->sav_count)
 		kmem_free(l2cache, sav->sav_count * sizeof (void *));
 }
 
 /*
  * Read the packed nvlist stored in MOS object 'obj' and unpack it into
  * *value.  The packed size is taken from the first 64-bit word of the
  * object's bonus buffer.  *value is NULL on entry to the read, and the
  * result of dmu_read() or nvlist_unpack() is returned (0 on success).
  */
 static int
 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
 {
 	dmu_buf_t *bonus;
 	size_t packed_len;
 	char *buf;
 	int err;
 
 	*value = NULL;
 
 	/* The bonus buffer is only needed long enough to read the size. */
 	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &bonus));
 	packed_len = *(uint64_t *)bonus->db_data;
 	dmu_buf_rele(bonus, FTAG);
 
 	buf = kmem_alloc(packed_len, KM_SLEEP);
 
 	err = dmu_read(spa->spa_meta_objset, obj, 0, packed_len, buf,
 	    DMU_READ_PREFETCH);
 	if (err == 0)
 		err = nvlist_unpack(buf, packed_len, value, 0);
 
 	kmem_free(buf, packed_len);
 
 	return (err);
 }
 
 /*
  * Walk the vdev tree rooted at 'vd', children first.  For every leaf that
  * vdev_is_dead() reports as dead, notify the autoreplace code and post an
  * ESC_ZFS_VDEV_CHECK sysevent so the removal can be acted upon.
  */
 static void
 spa_check_removed(vdev_t *vd)
 {
 	int child;
 
 	for (child = 0; child < vd->vdev_children; child++)
 		spa_check_removed(vd->vdev_child[child]);
 
 	/* Only dead leaf vdevs are reported. */
 	if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_dead(vd))
 		return;
 
 	zfs_post_autoreplace(vd->vdev_spa, vd);
 	spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
 }
 
 /*
  * Validate the current config against the MOS config.
  *
  * 'config' is the configuration read from the MOS; its vdev tree is parsed
  * into a temporary root vdev (mrvd) and compared, top-level vdev by
  * top-level vdev, with the pool's in-core root vdev (rvd):
  *
  *  - For a normal import (ZFS_IMPORT_MISSING_LOG not set), a description of
  *    every log device that is missing in-core but present in the MOS config
  *    is added to spa_load_info under ZPOOL_CONFIG_MISSING_DEVICES.
  *  - When ZFS_IMPORT_MISSING_LOG is set, a missing in-core log vdev is
  *    swapped for the vdev parsed from the MOS config.
  *  - For log vdevs that are present, the log state is loaded from the MOS
  *    copy.
  *
  * Returns B_TRUE when, after this reconciliation, the root vdev's guid sum
  * equals the guid sum recorded in the uberblock, and B_FALSE otherwise.
  */
 static boolean_t
 spa_config_valid(spa_t *spa, nvlist_t *config)
 {
 	vdev_t *mrvd, *rvd = spa->spa_root_vdev;
 	nvlist_t *nv;
 
 	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0);
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
 
 	ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children);
 
 	/*
 	 * If we're doing a normal import, then build up any additional
 	 * diagnostic information about missing devices in this config.
 	 * We'll pass this up to the user for further processing.
 	 */
 	if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
 		nvlist_t **child, *missing;
 		uint64_t idx = 0;
 		/*
 		 * Remember the allocation size so that the matching
 		 * kmem_free() is guaranteed to be passed the same value.
 		 */
 		size_t child_size = rvd->vdev_children * sizeof (nvlist_t *);
 
 		child = kmem_alloc(child_size, KM_SLEEP);
 		VERIFY(nvlist_alloc(&missing, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 
 		for (int c = 0; c < rvd->vdev_children; c++) {
 			vdev_t *tvd = rvd->vdev_child[c];
 			vdev_t *mtvd  = mrvd->vdev_child[c];
 
 			if (tvd->vdev_ops == &vdev_missing_ops &&
 			    mtvd->vdev_ops != &vdev_missing_ops &&
 			    mtvd->vdev_islog)
 				child[idx++] = vdev_config_generate(spa, mtvd,
 				    B_FALSE, 0);
 		}
 
 		if (idx) {
 			VERIFY(nvlist_add_nvlist_array(missing,
 			    ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
 			VERIFY(nvlist_add_nvlist(spa->spa_load_info,
 			    ZPOOL_CONFIG_MISSING_DEVICES, missing) == 0);
 
 			/* The generated child configs are freed here. */
 			for (uint64_t i = 0; i < idx; i++)
 				nvlist_free(child[i]);
 		}
 		nvlist_free(missing);
 		kmem_free(child, child_size);
 	}
 
 	/*
 	 * Compare the root vdev tree with the information we have
 	 * from the MOS config (mrvd). Check each top-level vdev
 	 * with the corresponding MOS config top-level (mtvd).
 	 */
 	for (int c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *tvd = rvd->vdev_child[c];
 		vdev_t *mtvd  = mrvd->vdev_child[c];
 
 		/*
 		 * Resolve any "missing" vdevs in the current configuration.
 		 * If we find that the MOS config has more accurate information
 		 * about the top-level vdev then use that vdev instead.
 		 */
 		if (tvd->vdev_ops == &vdev_missing_ops &&
 		    mtvd->vdev_ops != &vdev_missing_ops) {
 
 			if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG))
 				continue;
 
 			/*
 			 * Device specific actions.
 			 */
 			if (mtvd->vdev_islog) {
 				spa_set_log_state(spa, SPA_LOG_CLEAR);
 			} else {
 				/*
 				 * XXX - once we have 'readonly' pool
 				 * support we should be able to handle
 				 * missing data devices by transitioning
 				 * the pool to readonly.
 				 */
 				continue;
 			}
 
 			/*
 			 * Swap the missing vdev with the data we were
 			 * able to obtain from the MOS config.
 			 */
 			vdev_remove_child(rvd, tvd);
 			vdev_remove_child(mrvd, mtvd);
 
 			vdev_add_child(rvd, mtvd);
 			vdev_add_child(mrvd, tvd);
 
 			/* vdev_load() is called with the config lock dropped. */
 			spa_config_exit(spa, SCL_ALL, FTAG);
 			vdev_load(mtvd);
 			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 
 			vdev_reopen(rvd);
 		} else if (mtvd->vdev_islog) {
 			/*
 			 * Load the slog device's state from the MOS config
 			 * since it's possible that the label does not
 			 * contain the most up-to-date information.
 			 */
 			vdev_load_log_state(tvd, mtvd);
 			vdev_reopen(tvd);
 		}
 	}
 	vdev_free(mrvd);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	/*
 	 * Ensure we were able to validate the config.
 	 */
 	return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum);
 }
 
 /*
  * Check for missing log devices.
  *
  * The check is only performed while the pool's log state is SPA_LOG_MISSING
  * or SPA_LOG_UNKNOWN.  In that case every dataset is walked with
  * zil_check_log_chain(); if the walk reports a failure the log state is set
  * to SPA_LOG_MISSING and 1 is returned.  In every other case 0 is returned.
  */
 static int
 spa_check_logs(spa_t *spa)
 {
 	switch (spa->spa_log_state) {
 	case SPA_LOG_MISSING:
 		/* need to recheck in case slog has been restored */
 	case SPA_LOG_UNKNOWN:
 		break;
 	default:
 		/* Any other state: nothing to check. */
 		return (0);
 	}
 
 	if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL,
 	    DS_FIND_CHILDREN) == 0)
 		return (0);
 
 	spa_set_log_state(spa, SPA_LOG_MISSING);
 	return (1);
 }
 
 /*
  * Passivate the metaslab group of every top-level vdev that is a log
  * device.  The caller must hold SCL_ALLOC as writer (asserted).
  *
  * Returns B_TRUE if at least one log vdev was passivated; B_FALSE if
  * spa_has_slogs() reports no slogs or no top-level vdev is marked as a log.
  */
 static boolean_t
 spa_passivate_log(spa_t *spa)
 {
 	vdev_t *root = spa->spa_root_vdev;
 	boolean_t passivated = B_FALSE;
 
 	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
 
 	if (spa_has_slogs(spa)) {
 		for (int i = 0; i < root->vdev_children; i++) {
 			vdev_t *top = root->vdev_child[i];
 
 			if (!top->vdev_islog)
 				continue;
 
 			metaslab_group_passivate(top->vdev_mg);
 			passivated = B_TRUE;
 		}
 	}
 
 	return (passivated);
 }
 
 /*
  * Activate the metaslab group of every top-level vdev that is a log
  * device.  The caller must hold SCL_ALLOC as writer (asserted).
  */
 static void
 spa_activate_log(spa_t *spa)
 {
 	vdev_t *root = spa->spa_root_vdev;
 
 	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
 
 	for (int i = 0; i < root->vdev_children; i++) {
 		vdev_t *top = root->vdev_child[i];
 
 		if (!top->vdev_islog)
 			continue;
 
 		metaslab_group_activate(top->vdev_mg);
 	}
 }
 
 /*
  * Run zil_vdev_offline() over every dataset in the pool.  On success, wait
  * for the current txg to sync.  Returns 0 on success or the error reported
  * by dmu_objset_find().
  */
 int
 spa_offline_log(spa_t *spa)
 {
 	int error;
 
 	error = dmu_objset_find(spa_name(spa), zil_vdev_offline, NULL,
 	    DS_FIND_CHILDREN);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * We successfully offlined the log device, sync out the
 	 * current txg so that the "stubby" block can be removed
 	 * by zil_sync().
 	 */
 	txg_wait_synced(spa->spa_dsl_pool, 0);
 
 	return (0);
 }
 
 /*
  * Apply spa_check_removed() to each of the sav_count vdevs held in the
  * auxiliary vdev list 'sav'.
  */
 static void
 spa_aux_check_removed(spa_aux_vdev_t *sav)
 {
 	int idx = 0;
 
 	while (idx < sav->sav_count) {
 		spa_check_removed(sav->sav_vdevs[idx]);
 		idx++;
 	}
 }
 
 /*
  * I/O completion hook: if the zio finished without error, raise the pool's
  * spa_claim_max_txg to the birth txg of the zio's block pointer when that
  * birth txg is larger.  The update is serialized with spa_props_lock.
  */
 void
 spa_claim_notify(zio_t *zio)
 {
 	spa_t *spa;
 
 	/* Failed I/Os do not contribute to the maximum. */
 	if (zio->io_error != 0)
 		return;
 
 	spa = zio->io_spa;
 
 	mutex_enter(&spa->spa_props_lock);	/* any mutex will do */
 	if (zio->io_bp->blk_birth > spa->spa_claim_max_txg) {
 		spa->spa_claim_max_txg = zio->io_bp->blk_birth;
 	}
 	mutex_exit(&spa->spa_props_lock);
 }
 
 /*
  * Error counters accumulated while verifying a pool at load time.  The
  * counters are bumped atomically by spa_load_verify_done() for each read
  * that completes with an error.
  */
 typedef struct spa_load_error {
 	/* failed reads of indirect/metadata blocks (intent log excluded) */
 	uint64_t	sle_meta_count;
 	/* all other failed reads */
 	uint64_t	sle_data_count;
 } spa_load_error_t;
 
 /*
  * Completion callback for the reads issued by spa_load_verify_cb().
  *
  * On error, one of the counters in the spa_load_error_t hung off
  * zio->io_private is incremented: the data counter for intent-log blocks
  * and for level-0 blocks of non-metadata types, the metadata counter for
  * everything else.  The data buffer of the zio is always released.
  */
 static void
 spa_load_verify_done(zio_t *zio)
 {
 	spa_load_error_t *counts = zio->io_private;
 
 	if (zio->io_error != 0) {
 		blkptr_t *bp = zio->io_bp;
 		dmu_object_type_t ot = BP_GET_TYPE(bp);
 
 		if (ot == DMU_OT_INTENT_LOG ||
 		    (BP_GET_LEVEL(bp) == 0 && !DMU_OT_IS_METADATA(ot))) {
 			atomic_add_64(&counts->sle_data_count, 1);
 		} else {
 			atomic_add_64(&counts->sle_meta_count, 1);
 		}
 	}
 
 	zio_data_buf_free(zio->io_data, zio->io_size);
 }
 
 /*
  * Traversal callback used by spa_load_verify().  For every non-NULL block
  * pointer, a buffer of the block's physical size is allocated and an
  * asynchronous raw, speculative scrub read is issued as a child of the root
  * zio passed in 'arg'.  spa_load_verify_done() runs when the read completes.
  * Always returns 0.
  */
 /*ARGSUSED*/
 static int
 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
     const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	zio_t *parent;
 	size_t psize;
 	void *buf;
 
 	if (bp == NULL)
 		return (0);
 
 	parent = arg;
 	psize = BP_GET_PSIZE(bp);
 	buf = zio_data_buf_alloc(psize);
 
 	/* The error counters travel in the parent's io_private. */
 	zio_nowait(zio_read(parent, spa, bp, buf, psize,
 	    spa_load_verify_done, parent->io_private, ZIO_PRIORITY_SCRUB,
 	    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
 	    ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
 
 	return (0);
 }
 
 /*
  * Verify the pool by traversing it from spa_verify_min_txg and reading
  * every block, counting metadata and data read errors.
  *
  * Returns 0 immediately if the rewind policy requests ZPOOL_NEVER_REWIND.
  * Otherwise returns 0 when the traversal succeeded and both error counts
  * are within the limits of the rewind policy; in that case the load txg,
  * its timestamp and related values are recorded in the spa and in
  * spa_load_info.  On failure spa_load_max_txg is set to the uberblock's txg
  * and ENXIO is returned if the traversal failed with ENXIO, EIO otherwise.
  */
 static int
 spa_load_verify(spa_t *spa)
 {
 	spa_load_error_t sle = { 0 };
 	zpool_rewind_policy_t policy;
 	zio_t *rio;
 	int64_t loss;
 	int error;
 
 	zpool_get_rewind_policy(spa->spa_config, &policy);
 	if (policy.zrp_request & ZPOOL_NEVER_REWIND)
 		return (0);
 
 	/* All verification reads are issued as children of this root zio. */
 	rio = zio_root(spa, NULL, &sle,
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
 
 	error = traverse_pool(spa, spa->spa_verify_min_txg,
 	    TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio);
 
 	(void) zio_wait(rio);
 
 	spa->spa_load_meta_errors = sle.sle_meta_count;
 	spa->spa_load_data_errors = sle.sle_data_count;
 
 	if (error != 0 || sle.sle_meta_count > policy.zrp_maxmeta ||
 	    sle.sle_data_count > policy.zrp_maxdata) {
 		spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
 
 		/*
 		 * A traversal error of ENXIO is passed through; any other
 		 * traversal error, or too many read errors, becomes EIO.
 		 */
 		return (error == ENXIO ? ENXIO : EIO);
 	}
 
 	spa->spa_load_txg = spa->spa_uberblock.ub_txg;
 	spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
 
 	loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
 	VERIFY(nvlist_add_uint64(spa->spa_load_info,
 	    ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
 	VERIFY(nvlist_add_int64(spa->spa_load_info,
 	    ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
 	VERIFY(nvlist_add_uint64(spa->spa_load_info,
 	    ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
 
 	return (0);
 }
 
 /*
  * Look up the 64-bit value of pool property 'prop' in the pool props
  * object and store it in *val.  The result of the lookup is deliberately
  * ignored.
  */
 static void
 spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
 {
 	const char *propname = zpool_prop_to_name(prop);
 
 	(void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
 	    propname, sizeof (uint64_t), 1, val);
 }
 
 /*
  * Look up the 64-bit entry 'name' in the pool directory object and store
  * it in *val.  Returns the result of zap_lookup().
  */
 static int
 spa_dir_prop(spa_t *spa, const char *name, uint64_t *val)
 {
 	int err;
 
 	err = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    name, sizeof (uint64_t), 1, val);
 
 	return (err);
 }
 
 /*
  * Convenience for error paths: mark 'vdev' as VDEV_STATE_CANT_OPEN with the
  * auxiliary reason 'aux', then hand 'err' back so the caller can write
  * "return (spa_vdev_err(...))".
  */
 static int
 spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
 {
 	vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
 
 	return (err);
 }
 
 /*
  * Fix up the config after a partly-completed split, driven by the
  * ZPOOL_CONFIG_SPLIT nvlist.  Both the splitting pool and the split-off
  * pool carry that entry in their config, but only the splitting pool's
  * copy contains the list of guids of the vdevs being split off.
  *
  * With that list in hand we either rejoin all the disks to the pool or
  * finish the split.  To attempt the rejoin, every listed disk that can be
  * found is marked online again and the root vdev is reopened.  If every
  * such disk then reports VDEV_AUX_SPLIT_POOL (i.e. it was successfully
  * split off), or if some listed disk could not be found at all, we call
  * vdev_split() on each disk and complete the split.
  *
  * Otherwise the config is left alone, with all the vdevs in place in the
  * original pool.
  */
 static void
 spa_try_repair(spa_t *spa, nvlist_t *config)
 {
 	nvlist_t *split;
 	uint64_t *guids;
 	uint_t nguids, n;
 	uint_t nsplit = 0;
 	vdev_t **vds;
 	boolean_t reopen = B_TRUE;
 	boolean_t finish_split;
 
 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &split) != 0)
 		return;
 
 	/* check that the config is complete */
 	if (nvlist_lookup_uint64_array(split, ZPOOL_CONFIG_SPLIT_LIST,
 	    &guids, &nguids) != 0)
 		return;
 
 	vds = kmem_zalloc(nguids * sizeof (vdev_t *), KM_SLEEP);
 
 	/* attempt to online all the vdevs & validate */
 	for (n = 0; n < nguids; n++) {
 		if (guids[n] == 0)	/* vdev is hole */
 			continue;
 
 		vds[n] = spa_lookup_by_guid(spa, guids[n], B_FALSE);
 		if (vds[n] != NULL) {
 			/* attempt to re-online it */
 			vds[n]->vdev_offline = B_FALSE;
 		} else {
 			/*
 			 * Don't bother attempting to reopen the disks;
 			 * just do the split.
 			 */
 			reopen = B_FALSE;
 		}
 	}
 
 	if (reopen) {
 		vdev_reopen(spa->spa_root_vdev);
 
 		/* check each device to see what state it's in */
 		for (n = 0; n < nguids; n++) {
 			if (vds[n] != NULL &&
 			    vds[n]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
 				break;
 			nsplit++;
 		}
 		finish_split = (nsplit == nguids);
 	} else {
 		finish_split = B_TRUE;
 	}
 
 	/*
 	 * If every disk has been moved to the new pool, or if we never
 	 * even attempted to look at them, then we split them off for
 	 * good.
 	 */
 	if (finish_split) {
 		for (n = 0; n < nguids; n++) {
 			if (vds[n] != NULL)
 				vdev_split(vds[n]);
 		}
 		vdev_reopen(spa->spa_root_vdev);
 	}
 
 	kmem_free(vds, nguids * sizeof (vdev_t *));
 }
 
 /*
  * Front end for spa_load_impl().
  *
  * Pulls the pool guid, comment, version and config txg out of
  * spa->spa_config, refuses an import/tryimport of a pool whose guid already
  * exists (EEXIST), and otherwise resets spa_load_info and runs
  * spa_load_impl().  Afterwards the minimum reference count and the final
  * load state are recorded; on failure an ereport is posted unless the
  * error is EBADF.
  *
  * Returns 0 on success, EINVAL if the config has no pool guid, EEXIST as
  * described above, or the error from spa_load_impl().
  */
 static int
 spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
     boolean_t mosconfig)
 {
 	nvlist_t *config = spa->spa_config;
 	nvlist_t *split;
 	char *ereport = FM_EREPORT_ZFS_POOL;
 	char *comment;
 	uint64_t pool_guid;
 	boolean_t importing;
 	int error;
 
 	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
 	    &pool_guid) != 0)
 		return (EINVAL);
 
 	ASSERT(spa->spa_comment == NULL);
 	if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
 		spa->spa_comment = spa_strdup(comment);
 
 	/*
 	 * Versioning wasn't explicitly added to the label until later, so if
 	 * it's not present treat it as the initial version.
 	 */
 	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
 	    &spa->spa_ubsync.ub_version) != 0)
 		spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
 
 	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
 	    &spa->spa_config_txg);
 
 	importing = (state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT);
 
 	if (importing && spa_guid_exists(pool_guid, 0)) {
 		error = EEXIST;
 	} else {
 		spa->spa_config_guid = pool_guid;
 
 		/* Keep a private copy of any in-progress split description. */
 		if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT,
 		    &split) == 0) {
 			VERIFY(nvlist_dup(split, &spa->spa_config_splitting,
 			    KM_SLEEP) == 0);
 		}
 
 		/* Start every load attempt with fresh load info. */
 		nvlist_free(spa->spa_load_info);
 		spa->spa_load_info = fnvlist_alloc();
 
 		gethrestime(&spa->spa_loaded_ts);
 		error = spa_load_impl(spa, pool_guid, config, state, type,
 		    mosconfig, &ereport);
 	}
 
 	spa->spa_minref = refcount_count(&spa->spa_refcount);
 
 	if (error != 0 && error != EEXIST) {
 		spa->spa_loaded_ts.tv_sec = 0;
 		spa->spa_loaded_ts.tv_nsec = 0;
 	}
 	if (error != 0 && error != EBADF)
 		zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
 
 	/* The load state is only finalized after the ereport is posted. */
 	if (error != 0)
 		spa->spa_load_state = SPA_LOAD_ERROR;
 	else
 		spa->spa_load_state = SPA_LOAD_NONE;
 	spa->spa_ena = 0;
 
 	return (error);
 }
 
 /*
  * Load an existing storage pool, using the pool's builtin spa_config as a
  * source of configuration information.
  *
  *	spa		pool being loaded; spa_namespace_lock must be held
  *	pool_guid	guid of the pool, as found in 'config'
  *	config		configuration to load from
  *	state		the kind of load being performed (open, import,
  *			tryimport, recover, ...)
  *	type		whether an existing pool is imported or a new pool is
  *			assembled from split-off vdevs
  *	mosconfig	B_TRUE if 'config' came from the MOS and is trusted;
  *			when B_FALSE the pool is opened read-only, the MOS
  *			config is read, and spa_load() is re-entered with it
  *	ereport		in/out: class of the ereport the caller should post
  *			on failure; changed here for log replay failures
  *
  * Returns 0 on success or an errno value.  Most failures also mark the
  * root vdev as VDEV_STATE_CANT_OPEN via spa_vdev_err().
  */
 static int
 spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
     spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
     char **ereport)
 {
 	int error = 0;
 	nvlist_t *nvroot = NULL;
 	nvlist_t *label;
 	vdev_t *rvd;
 	uberblock_t *ub = &spa->spa_uberblock;
 	uint64_t children, config_cache_txg = spa->spa_config_txg;
 	int orig_mode = spa->spa_mode;
 	int parse;
 	uint64_t obj;
 	boolean_t missing_feat_write = B_FALSE;
 
 	/*
 	 * If this is an untrusted config, access the pool in read-only mode.
 	 * This prevents things like resilvering recently removed devices.
 	 */
 	if (!mosconfig)
 		spa->spa_mode = FREAD;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	spa->spa_load_state = state;
 
 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot))
 		return (EINVAL);
 
 	parse = (type == SPA_IMPORT_EXISTING ?
 	    VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
 
 	/*
 	 * Create "The Godfather" zio to hold all async IOs
 	 */
 	spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
 
 	/*
 	 * Parse the configuration into a vdev tree.  We explicitly set the
 	 * value that will be returned by spa_version() since parsing the
 	 * configuration requires knowing the version number.
 	 */
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	if (error != 0)
 		return (error);
 
 	ASSERT(spa->spa_root_vdev == rvd);
 
 	if (type != SPA_IMPORT_ASSEMBLE) {
 		ASSERT(spa_guid(spa) == pool_guid);
 	}
 
 	/*
 	 * Try to open all vdevs, loading each label in the process.
 	 */
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	error = vdev_open(rvd);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * We need to validate the vdev labels against the configuration that
 	 * we have in hand, which is dependent on the setting of mosconfig. If
 	 * mosconfig is true then we're validating the vdev labels based on
 	 * that config.  Otherwise, we're validating against the cached config
 	 * (zpool.cache) that was read when we loaded the zfs module, and then
 	 * later we will recursively call spa_load() and validate against
 	 * the vdev config.
 	 *
 	 * If we're assembling a new pool that's been split off from an
 	 * existing pool, the labels haven't yet been updated so we skip
 	 * validation for now.
 	 */
 	if (type != SPA_IMPORT_ASSEMBLE) {
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		error = vdev_validate(rvd, mosconfig);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 
 		if (error != 0)
 			return (error);
 
 		if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
 			return (ENXIO);
 	}
 
 	/*
 	 * Find the best uberblock.
 	 */
 	vdev_uberblock_load(rvd, ub, &label);
 
 	/*
 	 * If we weren't able to find a single valid uberblock, return failure.
 	 */
 	if (ub->ub_txg == 0) {
 		nvlist_free(label);
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
 	}
 
 	/*
 	 * If the pool has an unsupported version we can't open it.
 	 */
 	if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) {
 		nvlist_free(label);
 		return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
 	}
 
 	if (ub->ub_version >= SPA_VERSION_FEATURES) {
 		nvlist_t *features;
 
 		/*
 		 * If we weren't able to find what's necessary for reading the
 		 * MOS in the label, return failure.
 		 */
 		if (label == NULL || nvlist_lookup_nvlist(label,
 		    ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) {
 			nvlist_free(label);
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
 			    ENXIO));
 		}
 
 		/*
 		 * Update our in-core representation with the definitive values
 		 * from the label.
 		 */
 		nvlist_free(spa->spa_label_features);
 		VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0);
 	}
 
 	nvlist_free(label);
 
 	/*
 	 * Look through entries in the label nvlist's features_for_read. If
 	 * there is a feature listed there which we don't understand then we
 	 * cannot open a pool.
 	 */
 	if (ub->ub_version >= SPA_VERSION_FEATURES) {
 		nvlist_t *unsup_feat;
 
 		VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) ==
 		    0);
 
 		for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features,
 		    NULL); nvp != NULL;
 		    nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
 			if (!zfeature_is_supported(nvpair_name(nvp))) {
 				VERIFY(nvlist_add_string(unsup_feat,
 				    nvpair_name(nvp), "") == 0);
 			}
 		}
 
 		if (!nvlist_empty(unsup_feat)) {
 			VERIFY(nvlist_add_nvlist(spa->spa_load_info,
 			    ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0);
 			nvlist_free(unsup_feat);
 			return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
 			    ENOTSUP));
 		}
 
 		nvlist_free(unsup_feat);
 	}
 
 	/*
 	 * If the vdev guid sum doesn't match the uberblock, we have an
 	 * incomplete configuration.  We first check to see if the pool
 	 * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN).
 	 * If it is, defer the vdev_guid_sum check till later so we
 	 * can handle missing vdevs.
 	 */
 	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
 	    &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE &&
 	    rvd->vdev_guid_sum != ub->ub_guid_sum)
 		return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
 
 	if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		spa_try_repair(spa, config);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 		nvlist_free(spa->spa_config_splitting);
 		spa->spa_config_splitting = NULL;
 	}
 
 	/*
 	 * Initialize internal SPA structures.
 	 */
 	spa->spa_state = POOL_STATE_ACTIVE;
 	spa->spa_ubsync = spa->spa_uberblock;
 	spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
 	    TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
 	spa->spa_first_txg = spa->spa_last_ubsync_txg ?
 	    spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
 	spa->spa_claim_max_txg = spa->spa_first_txg;
 	spa->spa_prev_software_version = ub->ub_software_version;
 
 	error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
 	if (error)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
 
 	if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	if (spa_version(spa) >= SPA_VERSION_FEATURES) {
 		boolean_t missing_feat_read = B_FALSE;
 		nvlist_t *unsup_feat, *enabled_feat;
 
 		if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ,
 		    &spa->spa_feat_for_read_obj) != 0) {
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 		}
 
 		if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE,
 		    &spa->spa_feat_for_write_obj) != 0) {
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 		}
 
 		if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS,
 		    &spa->spa_feat_desc_obj) != 0) {
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 		}
 
 		enabled_feat = fnvlist_alloc();
 		unsup_feat = fnvlist_alloc();
 
 		if (!feature_is_supported(spa->spa_meta_objset,
 		    spa->spa_feat_for_read_obj, spa->spa_feat_desc_obj,
 		    unsup_feat, enabled_feat))
 			missing_feat_read = B_TRUE;
 
 		if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) {
 			if (!feature_is_supported(spa->spa_meta_objset,
 			    spa->spa_feat_for_write_obj, spa->spa_feat_desc_obj,
 			    unsup_feat, enabled_feat)) {
 				missing_feat_write = B_TRUE;
 			}
 		}
 
 		fnvlist_add_nvlist(spa->spa_load_info,
 		    ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat);
 
 		if (!nvlist_empty(unsup_feat)) {
 			fnvlist_add_nvlist(spa->spa_load_info,
 			    ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
 		}
 
 		fnvlist_free(enabled_feat);
 		fnvlist_free(unsup_feat);
 
 		if (!missing_feat_read) {
 			fnvlist_add_boolean(spa->spa_load_info,
 			    ZPOOL_CONFIG_CAN_RDONLY);
 		}
 
 		/*
 		 * If the state is SPA_LOAD_TRYIMPORT, our objective is
 		 * twofold: to determine whether the pool is available for
 		 * import in read-write mode and (if it is not) whether the
 		 * pool is available for import in read-only mode. If the pool
 		 * is available for import in read-write mode, it is displayed
 		 * as available in userland; if it is not available for import
 		 * in read-only mode, it is displayed as unavailable in
 		 * userland. If the pool is available for import in read-only
 		 * mode but not read-write mode, it is displayed as unavailable
 		 * in userland with a special note that the pool is actually
 		 * available for open in read-only mode.
 		 *
 		 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are
 		 * missing a feature for write, we must first determine whether
 		 * the pool can be opened read-only before returning to
 		 * userland in order to know whether to display the
 		 * abovementioned note.
 		 */
 		if (missing_feat_read || (missing_feat_write &&
 		    spa_writeable(spa))) {
 			return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
 			    ENOTSUP));
 		}
 	}
 
 	spa->spa_is_initializing = B_TRUE;
 	error = dsl_pool_open(spa->spa_dsl_pool);
 	spa->spa_is_initializing = B_FALSE;
 	if (error != 0)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	if (!mosconfig) {
 		uint64_t hostid;
 		nvlist_t *policy = NULL, *nvconfig;
 
 		if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 		if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig,
 		    ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
 			char *hostname;
 			unsigned long myhostid = 0;
 
 			VERIFY(nvlist_lookup_string(nvconfig,
 			    ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
 
 #ifdef	_KERNEL
 			myhostid = zone_get_hostid(NULL);
 #else	/* _KERNEL */
 			/*
 			 * We're emulating the system's hostid in userland, so
 			 * we can't use zone_get_hostid().
 			 */
 			(void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
 #endif	/* _KERNEL */
 			if (check_hostid && hostid != 0 && myhostid != 0 &&
 			    hostid != myhostid) {
 				/*
 				 * 'hostname' points into storage owned by
 				 * nvconfig, so the message must be emitted
 				 * before nvconfig is freed.
 				 */
 				cmn_err(CE_WARN, "pool '%s' could not be "
 				    "loaded as it was last accessed by "
 				    "another system (host: %s hostid: 0x%lx). "
 				    "See: http://illumos.org/msg/ZFS-8000-EY",
 				    spa_name(spa), hostname,
 				    (unsigned long)hostid);
 				nvlist_free(nvconfig);
 				return (EBADF);
 			}
 		}
 		if (nvlist_lookup_nvlist(spa->spa_config,
 		    ZPOOL_REWIND_POLICY, &policy) == 0)
 			VERIFY(nvlist_add_nvlist(nvconfig,
 			    ZPOOL_REWIND_POLICY, policy) == 0);
 
 		/*
 		 * Install the MOS config, tear the pool down, and load it
 		 * again in its original mode with the now-trusted config.
 		 */
 		spa_config_set(spa, nvconfig);
 		spa_unload(spa);
 		spa_deactivate(spa);
 		spa_activate(spa, orig_mode);
 
 		return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE));
 	}
 
 	if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
 	if (error != 0)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	/*
 	 * Load the bit that tells us to use the new accounting function
 	 * (raid-z deflation).  If we have an older pool, this will not
 	 * be present.
 	 */
 	error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
 	    &spa->spa_creation_version);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	/*
 	 * Load the persistent error log.  If we have an older pool, this will
 	 * not be present.
 	 */
 	error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
 	    &spa->spa_errlog_scrub);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	/*
 	 * Load the history object.  If we have an older pool, this
 	 * will not be present.
 	 */
 	error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	/*
 	 * If we're assembling the pool from the split-off vdevs of
 	 * an existing pool, we don't want to attach the spares & cache
 	 * devices.
 	 */
 
 	/*
 	 * Load any hot spares for this pool.
 	 */
 	error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
 		ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
 		if (load_nvlist(spa, spa->spa_spares.sav_object,
 		    &spa->spa_spares.sav_config) != 0)
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		spa_load_spares(spa);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 	} else if (error == 0) {
 		spa->spa_spares.sav_sync = B_TRUE;
 	}
 
 	/*
 	 * Load any level 2 ARC devices for this pool.
 	 */
 	error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
 	    &spa->spa_l2cache.sav_object);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
 		ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
 		if (load_nvlist(spa, spa->spa_l2cache.sav_object,
 		    &spa->spa_l2cache.sav_config) != 0)
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		spa_load_l2cache(spa);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 	} else if (error == 0) {
 		spa->spa_l2cache.sav_sync = B_TRUE;
 	}
 
 	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
 
 	error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object);
 	if (error && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	if (error == 0) {
 		uint64_t autoreplace;
 
 		spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
 		spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
 		spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
 		spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
 		spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
 		spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO,
 		    &spa->spa_dedup_ditto);
 
 		spa->spa_autoreplace = (autoreplace != 0);
 	}
 
 	/*
 	 * If the 'autoreplace' property is set, then post a resource notifying
 	 * the ZFS DE that it should not issue any faults for unopenable
 	 * devices.  We also iterate over the vdevs, and post a sysevent for any
 	 * unopenable vdevs so that the normal autoreplace handler can take
 	 * over.
 	 */
 	if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) {
 		spa_check_removed(spa->spa_root_vdev);
 		/*
 		 * For the import case, this is done in spa_import(), because
 		 * at this point we're using the spare definitions from
 		 * the MOS config, not necessarily from the userland config.
 		 */
 		if (state != SPA_LOAD_IMPORT) {
 			spa_aux_check_removed(&spa->spa_spares);
 			spa_aux_check_removed(&spa->spa_l2cache);
 		}
 	}
 
 	/*
 	 * Load the vdev state for all toplevel vdevs.
 	 */
 	vdev_load(rvd);
 
 	/*
 	 * Propagate the leaf DTLs we just loaded all the way up the tree.
 	 */
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	/*
 	 * Load the DDTs (dedup tables).
 	 */
 	error = ddt_load(spa);
 	if (error != 0)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	spa_update_dspace(spa);
 
 	/*
 	 * Validate the config, using the MOS config to fill in any
 	 * information which might be missing.  If we fail to validate
 	 * the config then declare the pool unfit for use. If we're
 	 * assembling a pool from a split, the log is not transferred
 	 * over.
 	 */
 	if (type != SPA_IMPORT_ASSEMBLE) {
 		nvlist_t *nvconfig;
 
 		if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 		if (!spa_config_valid(spa, nvconfig)) {
 			nvlist_free(nvconfig);
 			return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
 			    ENXIO));
 		}
 		nvlist_free(nvconfig);
 
 		/*
 		 * Now that we've validated the config, check the state of the
 		 * root vdev.  If it can't be opened, it indicates one or
 		 * more toplevel vdevs are faulted.
 		 */
 		if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
 			return (ENXIO);
 
 		if (spa_check_logs(spa)) {
 			*ereport = FM_EREPORT_ZFS_LOG_REPLAY;
 			return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
 		}
 	}
 
 	if (missing_feat_write) {
 		ASSERT(state == SPA_LOAD_TRYIMPORT);
 
 		/*
 		 * At this point, we know that we can open the pool in
 		 * read-only mode but not read-write mode. We now have enough
 		 * information and can return to userland.
 		 */
 		return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP));
 	}
 
 	/*
 	 * We've successfully opened the pool, verify that we're ready
 	 * to start pushing transactions.
 	 */
 	if (state != SPA_LOAD_TRYIMPORT) {
 		error = spa_load_verify(spa);
 		if (error != 0)
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
 			    error));
 	}
 
 	if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
 	    spa->spa_load_max_txg == UINT64_MAX)) {
 		dmu_tx_t *tx;
 		boolean_t need_update = B_FALSE;
 
 		ASSERT(state != SPA_LOAD_TRYIMPORT);
 
 		/*
 		 * Claim log blocks that haven't been committed yet.
 		 * This must all happen in a single txg.
 		 * Note: spa_claim_max_txg is updated by spa_claim_notify(),
 		 * invoked from zil_claim_log_block()'s i/o done callback.
 		 * Price of rollback is that we abandon the log.
 		 */
 		spa->spa_claiming = B_TRUE;
 
 		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
 		    spa_first_txg(spa));
 		(void) dmu_objset_find(spa_name(spa),
 		    zil_claim, tx, DS_FIND_CHILDREN);
 		dmu_tx_commit(tx);
 
 		spa->spa_claiming = B_FALSE;
 
 		spa_set_log_state(spa, SPA_LOG_GOOD);
 		spa->spa_sync_on = B_TRUE;
 		txg_sync_start(spa->spa_dsl_pool);
 
 		/*
 		 * Wait for all claims to sync.  We sync up to the highest
 		 * claimed log block birth time so that claimed log blocks
 		 * don't appear to be from the future.  spa_claim_max_txg
 		 * will have been set for us by either zil_check_log_chain()
 		 * (invoked from spa_check_logs()) or zil_claim() above.
 		 */
 		txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
 
 		/*
 		 * If the config cache is stale, or we have uninitialized
 		 * metaslabs (see spa_vdev_add()), then update the config.
 		 *
 		 * If this is a verbatim import, trust the current
 		 * in-core spa_config and update the disk labels.
 		 */
 		if (config_cache_txg != spa->spa_config_txg ||
 		    state == SPA_LOAD_IMPORT ||
 		    state == SPA_LOAD_RECOVER ||
 		    (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
 			need_update = B_TRUE;
 
 		for (int c = 0; c < rvd->vdev_children; c++)
 			if (rvd->vdev_child[c]->vdev_ms_array == 0)
 				need_update = B_TRUE;
 
 		/*
 		 * Update the config cache asynchronously in case we're the
 		 * root pool, in which case the config cache isn't writable yet.
 		 */
 		if (need_update)
 			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 
 		/*
 		 * Check all DTLs to see if anything needs resilvering.
 		 */
 		if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
 		    vdev_resilver_needed(rvd, NULL, NULL))
 			spa_async_request(spa, SPA_ASYNC_RESILVER);
 
 		/*
 		 * Delete any inconsistent datasets.
 		 */
 		(void) dmu_objset_find(spa_name(spa),
 		    dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
 
 		/*
 		 * Clean up any stale temporary dataset userrefs.
 		 */
 		dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
 	}
 
 	return (0);
 }
 
 /*
  * Tear the pool down and attempt the load once more with the maximum
  * load txg lowered by one.  The open mode that was in effect before the
  * teardown is reused when the pool is reactivated.
  *
  * Returns whatever spa_load() returns for the new attempt.
  */
 static int
 spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
 {
 	int saved_mode = spa->spa_mode;
 	int err;
 
 	/* Discard everything the previous attempt set up. */
 	spa_unload(spa);
 	spa_deactivate(spa);
 
 	/* Step back one txg for this attempt. */
 	spa->spa_load_max_txg -= 1;
 
 	spa_activate(spa, saved_mode);
 	spa_async_suspend(spa);
 
 	err = spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig);
 	return (err);
 }
 
 /*
  * If spa_load() fails this function will try loading prior txg's. If
  * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool
  * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this
  * function will not rewind the pool and will return the same error as
  * spa_load().
  *
  * Arguments:
  *	spa		pool being loaded
  *	state		load state handed through to spa_load()
  *	mosconfig	handed through to spa_load() unchanged
  *	max_request	upper txg bound for the first load attempt (ignored
  *			when recovering a pool that already has spa_load_txg
  *			set)
  *	rewind_flags	ZPOOL_NEVER_REWIND suppresses all retries;
  *			ZPOOL_EXTREME_REWIND lowers the rewind floor to
  *			TXG_INITIAL
  *
  * Returns 0 if the first load succeeds.  Otherwise returns the result of
  * the last rewind attempt when 'state' is SPA_LOAD_RECOVER, or the error
  * from the first load attempt in every other case.
  */
 static int
 spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
     uint64_t max_request, int rewind_flags)
 {
 	nvlist_t *loadinfo = NULL;
 	nvlist_t *config = NULL;
 	int load_error, rewind_error;
 	uint64_t safe_rewind_txg;
 	uint64_t min_txg;
 
 	if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
 		spa->spa_load_max_txg = spa->spa_load_txg;
 		spa_set_log_state(spa, SPA_LOG_CLEAR);
 	} else {
 		spa->spa_load_max_txg = max_request;
 	}
 
 	load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,
 	    mosconfig);
 	if (load_error == 0)
 		return (0);
 
 	/*
 	 * Snapshot the config as it stands after the failed load; it is
 	 * either installed with spa_config_set() or freed further down.
 	 */
 	if (spa->spa_root_vdev != NULL)
 		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
 
 	spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
 	spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
 
 	if (rewind_flags & ZPOOL_NEVER_REWIND) {
 		nvlist_free(config);
 		return (load_error);
 	}
 
 	if (state == SPA_LOAD_RECOVER) {
 		/* Price of rolling back is discarding txgs, including log */
 		spa_set_log_state(spa, SPA_LOG_CLEAR);
 	} else {
 		/*
 		 * If we aren't rolling back save the load info from our first
 		 * import attempt so that we can restore it after attempting
 		 * to rewind.
 		 */
 		loadinfo = spa->spa_load_info;
 		spa->spa_load_info = fnvlist_alloc();
 	}
 
 	spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
 	safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
 	min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
 	    TXG_INITIAL : safe_rewind_txg;
 
 	/*
 	 * Continue as long as we're finding errors, we're still within
 	 * the acceptable rewind range, and we're still finding uberblocks
 	 */
 	while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
 	    spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
 		if (spa->spa_load_max_txg < safe_rewind_txg)
 			spa->spa_extreme_rewind = B_TRUE;
 		rewind_error = spa_load_retry(spa, state, mosconfig);
 	}
 
 	spa->spa_extreme_rewind = B_FALSE;
 	spa->spa_load_max_txg = UINT64_MAX;
 
 	/*
 	 * Install the saved config unless a recovery rewind succeeded.  In
 	 * that remaining case the saved config is not passed to
 	 * spa_config_set(), so it has to be released here or it is leaked.
 	 */
 	if (config && (rewind_error || state != SPA_LOAD_RECOVER))
 		spa_config_set(spa, config);
 	else
 		nvlist_free(config);
 
 	if (state == SPA_LOAD_RECOVER) {
 		ASSERT3P(loadinfo, ==, NULL);
 		return (rewind_error);
 	} else {
 		/* Store the rewind info as part of the initial load info */
 		fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
 		    spa->spa_load_info);
 
 		/* Restore the initial load info */
 		fnvlist_free(spa->spa_load_info);
 		spa->spa_load_info = loadinfo;
 
 		return (load_error);
 	}
 }
 
 /*
  * Pool Open/Import
  *
  * The import case is identical to an open except that the configuration is sent
  * down from userland, instead of grabbed from the configuration cache.  For the
  * case of an open, the pool configuration will exist in the
  * POOL_STATE_UNINITIALIZED state.
  *
  * The stats information (gen/count/ustats) is used to gather vdev statistics at
  * the same time open the pool, without having to keep around the spa_t in some
  * ambiguous state.
  *
  * Arguments:
  *	pool		name of the pool to open
  *	spapp		out: the opened spa_t, or NULL on failure
  *	tag		reference tag handed to spa_open_ref()
  *	nvpolicy	optional rewind policy; when NULL the policy is read
  *			from the pool's stored spa_config
  *	config		optional out: generated pool config.  May be NULL
  *			(spa_open() passes NULL), so every use is guarded.
  *
  * Returns 0 on success, ENOENT if the pool is not in the namespace or the
  * load reports EBADF, and otherwise the error from spa_load_best().
  */
 static int
 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
     nvlist_t **config)
 {
 	spa_t *spa;
 	spa_load_state_t state = SPA_LOAD_OPEN;
 	int error;
 	int locked = B_FALSE;
 	int firstopen = B_FALSE;
 
 	*spapp = NULL;
 
 	/*
 	 * As disgusting as this is, we need to support recursive calls to this
 	 * function because dsl_dir_open() is called during spa_load(), and ends
 	 * up calling spa_open() again.  The real fix is to figure out how to
 	 * avoid dsl_dir_open() calling this in the first place.
 	 */
 	if (mutex_owner(&spa_namespace_lock) != curthread) {
 		mutex_enter(&spa_namespace_lock);
 		locked = B_TRUE;
 	}
 
 	if ((spa = spa_lookup(pool)) == NULL) {
 		if (locked)
 			mutex_exit(&spa_namespace_lock);
 		return (ENOENT);
 	}
 
 	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
 		zpool_rewind_policy_t policy;
 
 		firstopen = B_TRUE;
 
 		zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config,
 		    &policy);
 		if (policy.zrp_request & ZPOOL_DO_REWIND)
 			state = SPA_LOAD_RECOVER;
 
 		spa_activate(spa, spa_mode_global);
 
 		if (state != SPA_LOAD_RECOVER)
 			spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
 
 		error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg,
 		    policy.zrp_request);
 
 		if (error == EBADF) {
 			/*
 			 * If vdev_validate() returns failure (indicated by
 			 * EBADF), it indicates that one of the vdevs indicates
 			 * that the pool has been exported or destroyed.  If
 			 * this is the case, the config cache is out of sync and
 			 * we should remove the pool from the namespace.
 			 */
 			spa_unload(spa);
 			spa_deactivate(spa);
 			spa_config_sync(spa, B_TRUE, B_TRUE);
 			spa_remove(spa);
 			if (locked)
 				mutex_exit(&spa_namespace_lock);
 			return (ENOENT);
 		}
 
 		if (error) {
 			/*
 			 * We can't open the pool, but we still have useful
 			 * information: the state of each vdev after the
 			 * attempted vdev_open().  Return this to the user.
 			 */
 			if (config != NULL && spa->spa_config) {
 				VERIFY(nvlist_dup(spa->spa_config, config,
 				    KM_SLEEP) == 0);
 				VERIFY(nvlist_add_nvlist(*config,
 				    ZPOOL_CONFIG_LOAD_INFO,
 				    spa->spa_load_info) == 0);
 			}
 			spa_unload(spa);
 			spa_deactivate(spa);
 			spa->spa_last_open_failed = error;
 			if (locked)
 				mutex_exit(&spa_namespace_lock);
 			*spapp = NULL;
 			return (error);
 		}
 	}
 
 	spa_open_ref(spa, tag);
 
 	if (config != NULL)
 		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
 
 	/*
 	 * If we've recovered the pool, pass back any information we
 	 * gathered while doing the load.  The caller may not have asked
 	 * for a config at all, in which case there is nowhere to put it
 	 * and '*config' must not be dereferenced.
 	 */
 	if (state == SPA_LOAD_RECOVER && config != NULL) {
 		VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
 		    spa->spa_load_info) == 0);
 	}
 
 	/*
 	 * Only the outermost (non-recursive) call took the namespace lock,
 	 * so only that call resets the load bookkeeping and drops the lock.
 	 */
 	if (locked) {
 		spa->spa_last_open_failed = 0;
 		spa->spa_last_ubsync_txg = 0;
 		spa->spa_load_txg = 0;
 		mutex_exit(&spa_namespace_lock);
 #ifdef __FreeBSD__
 #ifdef _KERNEL
 		if (firstopen)
 			zvol_create_minors(pool);
 #endif
 #endif
 	}
 
 	*spapp = spa;
 
 	return (0);
 }
 
 /*
  * Open a pool using the caller-supplied rewind policy, optionally handing
  * the pool configuration back through 'config'.
  */
 int
 spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy,
     nvlist_t **config)
 {
 	int err;
 
 	err = spa_open_common(name, spapp, tag, policy, config);
 	return (err);
 }
 
 /*
  * Open a pool by name with no explicit rewind policy and without
  * requesting a copy of its configuration.
  */
 int
 spa_open(const char *name, spa_t **spapp, void *tag)
 {
 	int err;
 
 	err = spa_open_common(name, spapp, tag, NULL, NULL);
 	return (err);
 }
 
 /*
  * Find the named pool and bump its inject reference count, which keeps it
  * from being exported or destroyed.  The lookup and the increment both
  * happen under spa_namespace_lock.
  *
  * Returns the pool, or NULL if no pool with that name exists.
  */
 spa_t *
 spa_inject_addref(char *name)
 {
 	spa_t *spa;
 
 	mutex_enter(&spa_namespace_lock);
 	spa = spa_lookup(name);
 	if (spa != NULL)
 		spa->spa_inject_ref++;
 	mutex_exit(&spa_namespace_lock);
 
 	return (spa);
 }
 
 /*
  * Drop one inject reference on the pool, under spa_namespace_lock.
  */
 void
 spa_inject_delref(spa_t *spa)
 {
 	mutex_enter(&spa_namespace_lock);
 	spa->spa_inject_ref -= 1;
 	mutex_exit(&spa_namespace_lock);
 }
 
 /*
  * Attach the pool's hot-spare device list to the vdev tree inside
  * 'config'.  The caller must hold SCL_CONFIG as reader.
  */
 static void
 spa_add_spares(spa_t *spa, nvlist_t *config)
 {
 	nvlist_t *vdev_tree;
 	nvlist_t **spare_list;
 	uint_t spare_cnt, idx;
 
 	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
 
 	/* A pool without spares has nothing to report. */
 	if (spa->spa_spares.sav_count == 0)
 		return;
 
 	VERIFY(nvlist_lookup_nvlist(config,
 	    ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) == 0);
 	VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
 	    ZPOOL_CONFIG_SPARES, &spare_list, &spare_cnt) == 0);
 	if (spare_cnt == 0)
 		return;
 
 	/*
 	 * Store the list in the vdev tree, then fetch it back from there;
 	 * presumably so the edits below land in the entries held by
 	 * 'config' rather than in the pool's own sav_config.
 	 */
 	VERIFY(nvlist_add_nvlist_array(vdev_tree,
 	    ZPOOL_CONFIG_SPARES, spare_list, spare_cnt) == 0);
 	VERIFY(nvlist_lookup_nvlist_array(vdev_tree,
 	    ZPOOL_CONFIG_SPARES, &spare_list, &spare_cnt) == 0);
 
 	/*
 	 * Any spare that has since been pressed into service as an active
 	 * spare gets its reported status adjusted accordingly.
 	 */
 	for (idx = 0; idx < spare_cnt; idx++) {
 		uint64_t spare_guid;
 		uint64_t owner_pool;
 		vdev_stat_t *stats;
 		uint_t stat_cnt;
 
 		VERIFY(nvlist_lookup_uint64(spare_list[idx],
 		    ZPOOL_CONFIG_GUID, &spare_guid) == 0);
 
 		if (!spa_spare_exists(spare_guid, &owner_pool, NULL) ||
 		    owner_pool == 0ULL)
 			continue;
 
 		VERIFY(nvlist_lookup_uint64_array(spare_list[idx],
 		    ZPOOL_CONFIG_VDEV_STATS,
 		    (uint64_t **)&stats, &stat_cnt) == 0);
 		stats->vs_state = VDEV_STATE_CANT_OPEN;
 		stats->vs_aux = VDEV_AUX_SPARED;
 	}
 }
 
 /*
  * Attach the pool's level 2 cache device list to the vdev tree inside
  * 'config' and refresh the vdev stats of every entry.  The caller must
  * hold SCL_CONFIG as reader.
  */
 static void
 spa_add_l2cache(spa_t *spa, nvlist_t *config)
 {
 	spa_aux_vdev_t *sav = &spa->spa_l2cache;
 	nvlist_t *vdev_tree;
 	nvlist_t **cache_list;
 	uint_t cache_cnt, idx, slot;
 
 	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
 
 	/* A pool without cache devices has nothing to report. */
 	if (sav->sav_count == 0)
 		return;
 
 	VERIFY(nvlist_lookup_nvlist(config,
 	    ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) == 0);
 	VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
 	    ZPOOL_CONFIG_L2CACHE, &cache_list, &cache_cnt) == 0);
 	if (cache_cnt == 0)
 		return;
 
 	/* Store the list in the vdev tree and fetch it back from there. */
 	VERIFY(nvlist_add_nvlist_array(vdev_tree,
 	    ZPOOL_CONFIG_L2CACHE, cache_list, cache_cnt) == 0);
 	VERIFY(nvlist_lookup_nvlist_array(vdev_tree,
 	    ZPOOL_CONFIG_L2CACHE, &cache_list, &cache_cnt) == 0);
 
 	/*
 	 * Bring the stats of each level 2 cache device up to date.
 	 */
 	for (idx = 0; idx < cache_cnt; idx++) {
 		uint64_t dev_guid;
 		vdev_t *match;
 		vdev_stat_t *stats;
 		uint_t stat_cnt;
 
 		VERIFY(nvlist_lookup_uint64(cache_list[idx],
 		    ZPOOL_CONFIG_GUID, &dev_guid) == 0);
 
 		/* Find the in-core vdev carrying the same guid. */
 		for (slot = 0, match = NULL;
 		    match == NULL && slot < sav->sav_count; slot++) {
 			if (sav->sav_vdevs[slot]->vdev_guid == dev_guid)
 				match = sav->sav_vdevs[slot];
 		}
 		ASSERT(match != NULL);
 
 		VERIFY(nvlist_lookup_uint64_array(cache_list[idx],
 		    ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&stats, &stat_cnt)
 		    == 0);
 		vdev_get_stats(match, stats);
 	}
 }
 
 /*
  * Collect every entry of the pool's features-for-read and
  * features-for-write ZAP objects into a single nvlist and store it in
  * 'config' under ZPOOL_CONFIG_FEATURE_STATS.  An object number of zero
  * is skipped.  The caller must hold SCL_CONFIG as reader.
  */
 static void
 spa_add_feature_stats(spa_t *spa, nvlist_t *config)
 {
 	nvlist_t *feat_nvl;
 	uint64_t feat_zaps[2];
 	zap_cursor_t cursor;
 	zap_attribute_t attr;
 
 	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
 	VERIFY(nvlist_alloc(&feat_nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 
 	/* Read-features first, then write-features. */
 	feat_zaps[0] = spa->spa_feat_for_read_obj;
 	feat_zaps[1] = spa->spa_feat_for_write_obj;
 
 	for (int k = 0; k < 2; k++) {
 		if (feat_zaps[k] == 0)
 			continue;
 
 		zap_cursor_init(&cursor, spa->spa_meta_objset, feat_zaps[k]);
 		while (zap_cursor_retrieve(&cursor, &attr) == 0) {
 			ASSERT(attr.za_integer_length == sizeof (uint64_t) &&
 			    attr.za_num_integers == 1);
 			VERIFY3U(0, ==, nvlist_add_uint64(feat_nvl,
 			    attr.za_name, attr.za_first_integer));
 			zap_cursor_advance(&cursor);
 		}
 		zap_cursor_fini(&cursor);
 	}
 
 	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
 	    feat_nvl) == 0);
 	nvlist_free(feat_nvl);
 }
 
 /*
  * Open the named pool and return its configuration, augmented with load
  * time, error-log size, suspension state, spares, l2cache devices and
  * feature stats.  When 'altroot' is non-NULL it is filled in as well,
  * even if the pool could not be opened.
  *
  * Returns the result of spa_open_common(); '*config' is NULL unless
  * that call produced a configuration.
  */
 int
 spa_get_stats(const char *name, nvlist_t **config,
     char *altroot, size_t buflen)
 {
 	spa_t *spa;
 	int error;
 
 	*config = NULL;
 	error = spa_open_common(name, &spa, FTAG, NULL, config);
 
 	if (spa != NULL) {
 		/*
 		 * The spares or l2cache devices can still change in a
 		 * window here, which would leave the config
 		 * self-inconsistent.
 		 */
 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 
 		if (*config != NULL) {
 			uint64_t loaded_at[2];
 
 			loaded_at[0] = spa->spa_loaded_ts.tv_sec;
 			loaded_at[1] = spa->spa_loaded_ts.tv_nsec;
 			VERIFY(nvlist_add_uint64_array(*config,
 			    ZPOOL_CONFIG_LOADED_TIME, loaded_at, 2) == 0);
 
 			VERIFY(nvlist_add_uint64(*config,
 			    ZPOOL_CONFIG_ERRCOUNT,
 			    spa_get_errlog_size(spa)) == 0);
 
 			if (spa_suspended(spa)) {
 				VERIFY(nvlist_add_uint64(*config,
 				    ZPOOL_CONFIG_SUSPENDED,
 				    spa->spa_failmode) == 0);
 			}
 
 			spa_add_spares(spa, *config);
 			spa_add_l2cache(spa, *config);
 			spa_add_feature_stats(spa, *config);
 		}
 	}
 
 	/*
 	 * The alternate root is wanted even for faulted pools, so when the
 	 * open failed we cheat and go to spa_lookup() directly.
 	 */
 	if (altroot != NULL) {
 		if (spa != NULL) {
 			spa_altroot(spa, altroot, buflen);
 		} else {
 			spa_t *faulted;
 
 			mutex_enter(&spa_namespace_lock);
 			faulted = spa_lookup(name);
 			if (faulted != NULL)
 				spa_altroot(faulted, altroot, buflen);
 			else
 				altroot[0] = '\0';
 			mutex_exit(&spa_namespace_lock);
 		}
 	}
 
 	/* Undo the config lock and the open reference taken above. */
 	if (spa != NULL) {
 		spa_config_exit(spa, SCL_CONFIG, FTAG);
 		spa_close(spa, FTAG);
 	}
 
 	return (error);
 }
 
 /*
  * Validate that the auxiliary device array is well formed.  We must have an
  * array of nvlists, each which describes a valid leaf vdev.  If this is an
  * import (mode is VDEV_ALLOC_SPARE or VDEV_ALLOC_L2CACHE), then we allow
  * corrupted devices to be specified, as long as they are well-formed:
  * open and label-init failures are ignored for those two modes.
  *
  * 'config' names the nvlist array to look up in 'nvroot' and 'version' is
  * the minimum pool version that supports this device type.  The caller
  * must hold SCL_ALL as writer.
  *
  * Returns 0 if the array is absent or every device passes, EINVAL for an
  * empty array or a non-leaf vdev, ENOTSUP if the pool version is too old,
  * ENOTBLK (kernel only) for a non-disk l2cache device, or the error from
  * spa_config_parse(), vdev_open() or vdev_label_init().
  */
 static int
 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
     spa_aux_vdev_t *sav, const char *config, uint64_t version,
     vdev_labeltype_t label)
 {
 	nvlist_t **dev;
 	uint_t i, ndev;
 	vdev_t *vd;
 	int error;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	/*
 	 * It's acceptable to have no devs specified.
 	 */
 	if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
 		return (0);
 
 	if (ndev == 0)
 		return (EINVAL);
 
 	/*
 	 * Make sure the pool is formatted with a version that supports this
 	 * device type.
 	 */
 	if (spa_version(spa) < version)
 		return (ENOTSUP);
 
 	/*
 	 * Set the pending device list so we correctly handle device in-use
 	 * checking.
 	 */
 	sav->sav_pending = dev;
 	sav->sav_npending = ndev;
 
 	for (i = 0; i < ndev; i++) {
 		if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
 		    mode)) != 0)
 			goto out;
 
 		if (!vd->vdev_ops->vdev_op_leaf) {
 			vdev_free(vd);
 			error = EINVAL;
 			goto out;
 		}
 
 		/*
 		 * The L2ARC currently only supports disk devices in
 		 * kernel context.  For user-level testing, we allow it.
 		 */
 #ifdef _KERNEL
 		if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
 		    strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
 			error = ENOTBLK;
 			vdev_free(vd);
 			goto out;
 		}
 #endif
 		vd->vdev_top = vd;
 
 		/* Record the device's guid in its nvlist once it is labeled. */
 		if ((error = vdev_open(vd)) == 0 &&
 		    (error = vdev_label_init(vd, crtxg, label)) == 0) {
 			VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
 			    vd->vdev_guid) == 0);
 		}
 
 		/* The vdev was only needed for validation. */
 		vdev_free(vd);
 
 		/*
 		 * Open/label failures are fatal unless we were called in
 		 * spare or l2cache mode, where the error is discarded.
 		 */
 		if (error &&
 		    (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
 			goto out;
 		else
 			error = 0;
 	}
 
 out:
 	/* Always clear the pending list, on success and failure alike. */
 	sav->sav_pending = NULL;
 	sav->sav_npending = 0;
 	return (error);
 }
 
 /*
  * Validate the spare devices in 'nvroot' and then, if those pass, the
  * level 2 cache devices.  Returns the first non-zero error, or 0.  The
  * caller must hold SCL_ALL as writer.
  */
 static int
 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
 {
 	int error;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
 	    &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
 	    VDEV_LABEL_SPARE);
 
 	/* The l2cache list is only examined when the spares passed. */
 	if (error == 0) {
 		error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
 		    &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE,
 		    SPA_VERSION_L2CACHE, VDEV_LABEL_L2CACHE);
 	}
 
 	return (error);
 }
 
 /*
  * Record 'ndevs' auxiliary devices under the key 'config' in
  * sav->sav_config.  When no config exists yet one is created holding just
  * the new devices; otherwise the new devices are appended to the ones
  * already stored there.
  */
 static void
 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
     const char *config)
 {
 	nvlist_t **cur_devs;
 	nvlist_t **merged;
 	uint_t cur_cnt;
 	uint_t total;
 	int i;
 
 	if (sav->sav_config == NULL) {
 		/*
 		 * First devices of this kind: start a fresh list.
 		 */
 		VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
 		    KM_SLEEP) == 0);
 		VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
 		    devs, ndevs) == 0);
 		return;
 	}
 
 	/*
 	 * Build the new list by concatenating the new devices onto the
 	 * current list.
 	 */
 	VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
 	    &cur_devs, &cur_cnt) == 0);
 
 	total = ndevs + cur_cnt;
 	merged = kmem_alloc(sizeof (void *) * total, KM_SLEEP);
 
 	for (i = 0; i < cur_cnt; i++) {
 		VERIFY(nvlist_dup(cur_devs[i], &merged[i],
 		    KM_SLEEP) == 0);
 	}
 	for (i = 0; i < ndevs; i++) {
 		VERIFY(nvlist_dup(devs[i], &merged[cur_cnt + i],
 		    KM_SLEEP) == 0);
 	}
 
 	/* Swap the stored array for the merged one. */
 	VERIFY(nvlist_remove(sav->sav_config, config,
 	    DATA_TYPE_NVLIST_ARRAY) == 0);
 	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
 	    config, merged, total) == 0);
 
 	/* Release the temporary duplicates and the array holding them. */
 	for (i = 0; i < total; i++)
 		nvlist_free(merged[i]);
 	kmem_free(merged, total * sizeof (void *));
 }
 
 /*
  * Stop and drop level 2 ARC devices: every cache vdev of this pool that
  * is registered with a non-zero pool guid and currently present in the
  * L2ARC is removed from it.
  */
 void
 spa_l2cache_drop(spa_t *spa)
 {
 	spa_aux_vdev_t *sav = &spa->spa_l2cache;
 	int idx;
 
 	for (idx = 0; idx < sav->sav_count; idx++) {
 		vdev_t *cache_vd = sav->sav_vdevs[idx];
 		uint64_t owner_pool;
 
 		ASSERT(cache_vd != NULL);
 
 		if (!spa_l2cache_exists(cache_vd->vdev_guid, &owner_pool))
 			continue;
 		if (owner_pool == 0ULL)
 			continue;
 		if (!l2arc_vdev_present(cache_vd))
 			continue;
 
 		l2arc_remove_vdev(cache_vd);
 	}
 }
 
 /*
  * Pool Creation
  *
  * Create a new pool named 'pool' from the vdev tree described by 'nvroot'.
  *
  * Arguments:
  *	pool		name of the pool to create
  *	nvroot		vdev tree config; may also carry spare and l2cache
  *			device arrays
  *	props		optional pool properties (may be NULL)
  *	history_str	optional command string to record in the pool
  *			history (may be NULL)
  *	zplprops	handed to dsl_pool_create()
  *
  * spa_namespace_lock is held from entry until return.
  *
  * Returns 0 on success, EEXIST if a pool of that name already exists,
  * EINVAL if 'nvroot' has no allocatable devices, or the error from
  * property validation, config parsing, vdev creation or aux-device
  * validation.  On any failure the partially built spa_t is removed from
  * the namespace again.
  */
 int
 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
     const char *history_str, nvlist_t *zplprops)
 {
 	spa_t *spa;
 	char *altroot = NULL;
 	vdev_t *rvd;
 	dsl_pool_t *dp;
 	dmu_tx_t *tx;
 	int error = 0;
 	uint64_t txg = TXG_INITIAL;
 	nvlist_t **spares, **l2cache;
 	uint_t nspares, nl2cache;
 	uint64_t version, obj;
 	boolean_t has_features;
 
 	/*
 	 * If this pool already exists, return failure.
 	 */
 	mutex_enter(&spa_namespace_lock);
 	if (spa_lookup(pool) != NULL) {
 		mutex_exit(&spa_namespace_lock);
 		return (EEXIST);
 	}
 
 	/*
 	 * Allocate a new spa_t structure.
 	 */
 	(void) nvlist_lookup_string(props,
 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
 	spa = spa_add(pool, NULL, altroot);
 	spa_activate(spa, spa_mode_global);
 
 	if (props && (error = spa_prop_validate(spa, props))) {
 		spa_deactivate(spa);
 		spa_remove(spa);
 		mutex_exit(&spa_namespace_lock);
 		return (error);
 	}
 
 	/* Note whether any of the supplied properties is a feature. */
 	has_features = B_FALSE;
 	for (nvpair_t *elem = nvlist_next_nvpair(props, NULL);
 	    elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
 		if (zpool_prop_feature(nvpair_name(elem)))
 			has_features = B_TRUE;
 	}
 
 	/*
 	 * Use SPA_VERSION when a feature property was given or when no
 	 * explicit version property is present; otherwise the version
 	 * property's value is used.
 	 */
 	if (has_features || nvlist_lookup_uint64(props,
 	    zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) {
 		version = SPA_VERSION;
 	}
 	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
 
 	spa->spa_first_txg = txg;
 	spa->spa_uberblock.ub_txg = txg - 1;
 	spa->spa_uberblock.ub_version = version;
 	spa->spa_ubsync = spa->spa_uberblock;
 
 	/*
 	 * Create "The Godfather" zio to hold all async IOs
 	 */
 	spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
 
 	/*
 	 * Create the root vdev.
 	 */
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 
 	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
 
 	ASSERT(error != 0 || rvd != NULL);
 	ASSERT(error != 0 || spa->spa_root_vdev == rvd);
 
 	if (error == 0 && !zfs_allocatable_devs(nvroot))
 		error = EINVAL;
 
 	/*
 	 * Create the vdevs and validate the aux devices; only when both
 	 * succeed are the top-level vdevs sized and expanded.
 	 */
 	if (error == 0 &&
 	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
 	    (error = spa_validate_aux(spa, nvroot, txg,
 	    VDEV_ALLOC_ADD)) == 0) {
 		for (int c = 0; c < rvd->vdev_children; c++) {
 			vdev_metaslab_set_size(rvd->vdev_child[c]);
 			vdev_expand(rvd->vdev_child[c], txg);
 		}
 	}
 
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	/* Any failure so far: undo everything and drop the new spa_t. */
 	if (error != 0) {
 		spa_unload(spa);
 		spa_deactivate(spa);
 		spa_remove(spa);
 		mutex_exit(&spa_namespace_lock);
 		return (error);
 	}
 
 	/*
 	 * Get the list of spares, if specified.
 	 */
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
 	    &spares, &nspares) == 0) {
 		VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
 		    KM_SLEEP) == 0);
 		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
 		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		spa_load_spares(spa);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 		spa->spa_spares.sav_sync = B_TRUE;
 	}
 
 	/*
 	 * Get the list of level 2 cache devices, if specified.
 	 */
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
 	    &l2cache, &nl2cache) == 0) {
 		VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
 		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
 		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
 		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		spa_load_l2cache(spa);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 		spa->spa_l2cache.sav_sync = B_TRUE;
 	}
 
 	/* spa_is_initializing is set only for the DSL pool creation. */
 	spa->spa_is_initializing = B_TRUE;
 	spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
 	spa->spa_meta_objset = dp->dp_meta_objset;
 	spa->spa_is_initializing = B_FALSE;
 
 	/*
 	 * Create DDTs (dedup tables).
 	 */
 	ddt_create(spa);
 
 	spa_update_dspace(spa);
 
 	/*
 	 * Everything from here to dmu_tx_commit() below is done as part
 	 * of the initial txg.
 	 */
 	tx = dmu_tx_create_assigned(dp, txg);
 
 	/*
 	 * Create the pool config object.
 	 */
 	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
 	    DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
 	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
 
 	if (zap_add(spa->spa_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
 	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
 		cmn_err(CE_PANIC, "failed to add pool config");
 	}
 
 	if (spa_version(spa) >= SPA_VERSION_FEATURES)
 		spa_feature_create_zap_objects(spa, tx);
 
 	/* Record the version the pool was created with. */
 	if (zap_add(spa->spa_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
 	    sizeof (uint64_t), 1, &version, tx) != 0) {
 		cmn_err(CE_PANIC, "failed to add pool version");
 	}
 
 	/* Newly created pools with the right version are always deflated. */
 	if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
 		spa->spa_deflate = TRUE;
 		if (zap_add(spa->spa_meta_objset,
 		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
 		    sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
 			cmn_err(CE_PANIC, "failed to add deflate");
 		}
 	}
 
 	/*
 	 * Create the deferred-free bpobj.  Turn off compression
 	 * because sync-to-convergence takes longer if the blocksize
 	 * keeps changing.
 	 */
 	obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
 	dmu_object_set_compress(spa->spa_meta_objset, obj,
 	    ZIO_COMPRESS_OFF, tx);
 	if (zap_add(spa->spa_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
 	    sizeof (uint64_t), 1, &obj, tx) != 0) {
 		cmn_err(CE_PANIC, "failed to add bpobj");
 	}
 	VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
 	    spa->spa_meta_objset, obj));
 
 	/*
 	 * Create the pool's history object.
 	 */
 	if (version >= SPA_VERSION_ZPOOL_HISTORY)
 		spa_history_create_obj(spa, tx);
 
 	/*
 	 * Set pool properties.
 	 */
 	spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
 	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
 	spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
 	spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
 
 	/* Caller-supplied properties are applied after the defaults above. */
 	if (props != NULL) {
 		spa_configfile_set(spa, props, B_FALSE);
 		spa_sync_props(spa, props, tx);
 	}
 
 	dmu_tx_commit(tx);
 
 	spa->spa_sync_on = B_TRUE;
 	txg_sync_start(spa->spa_dsl_pool);
 
 	/*
 	 * We explicitly wait for the first transaction to complete so that our
 	 * bean counters are appropriately updated.
 	 */
 	txg_wait_synced(spa->spa_dsl_pool, txg);
 
 	spa_config_sync(spa, B_FALSE, B_TRUE);
 
 	/* The command string is logged only if the version has history. */
 	if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL)
 		(void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE);
 	spa_history_log_version(spa, LOG_POOL_CREATE);
 
 	/* Remember the reference count at the end of creation. */
 	spa->spa_minref = refcount_count(&spa->spa_refcount);
 
 	mutex_exit(&spa_namespace_lock);
 
 	return (0);
 }
 
 #ifdef _KERNEL
 #if defined(sun)
 /*
  * Get the root pool information from the root disk, then import the root pool
  * during the system boot up time.
  */
 extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
 
 /*
  * Read the label of the boot device and turn it into a pool configuration
  * whose vdev tree is a root vdev with the label's vdev tree as its only
  * child.  The guid stored in the label is returned through 'guid'.
  *
  * Returns the configuration, or NULL if the label cannot be read.
  */
 static nvlist_t *
 spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid)
 {
 	nvlist_t *label_cfg;
 	nvlist_t *top_vdev;
 	nvlist_t *root_vdev;
 	uint64_t pool_guid;
 
 	if (vdev_disk_read_rootlabel(devpath, devid, &label_cfg) != 0)
 		return (NULL);
 
 	/*
 	 * Pull out what is needed from the label: the top-level vdev that
 	 * goes into the child array, the pool guid, and the label's guid.
 	 */
 	VERIFY(nvlist_lookup_nvlist(label_cfg, ZPOOL_CONFIG_VDEV_TREE,
 	    &top_vdev) == 0);
 	VERIFY(nvlist_lookup_uint64(label_cfg, ZPOOL_CONFIG_POOL_GUID,
 	    &pool_guid) == 0);
 	VERIFY(nvlist_lookup_uint64(label_cfg, ZPOOL_CONFIG_GUID,
 	    guid) == 0);
 
 	/*
 	 * Wrap the top-level vdev in a freshly built root vdev.
 	 */
 	VERIFY(nvlist_alloc(&root_vdev, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 	VERIFY(nvlist_add_string(root_vdev, ZPOOL_CONFIG_TYPE,
 	    VDEV_TYPE_ROOT) == 0);
 	VERIFY(nvlist_add_uint64(root_vdev, ZPOOL_CONFIG_ID, 0ULL) == 0);
 	VERIFY(nvlist_add_uint64(root_vdev, ZPOOL_CONFIG_GUID,
 	    pool_guid) == 0);
 	VERIFY(nvlist_add_nvlist_array(root_vdev, ZPOOL_CONFIG_CHILDREN,
 	    &top_vdev, 1) == 0);
 
 	/*
 	 * Swap the configuration's existing vdev_tree for the new root
 	 * vdev (remove the old, add the new).
 	 */
 	VERIFY(nvlist_add_nvlist(label_cfg, ZPOOL_CONFIG_VDEV_TREE,
 	    root_vdev) == 0);
 	nvlist_free(root_vdev);
 
 	return (label_cfg);
 }
 
 /*
  * Search the vdev tree below 'vd' for a leaf whose on-disk label has a
  * more recent txg than '*txg'.  Each time such a leaf is found, '*txg'
  * and '*avd' are updated to it, so on return they describe the device
  * with the "best" configuration seen.  Leaves whose label cannot be read
  * are ignored.
  */
 static void
 spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
 {
 	nvlist_t *lbl;
 	uint64_t lbl_txg;
 	int child;
 
 	/* Visit the children first. */
 	for (child = 0; child < vd->vdev_children; child++)
 		spa_alt_rootvdev(vd->vdev_child[child], avd, txg);
 
 	/* Only leaf vdevs carry a label worth reading. */
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return;
 
 	if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
 	    &lbl) != 0)
 		return;
 
 	VERIFY(nvlist_lookup_uint64(lbl, ZPOOL_CONFIG_POOL_TXG,
 	    &lbl_txg) == 0);
 
 	/*
 	 * Is this a better boot device than the best one so far?
 	 */
 	if (lbl_txg > *txg) {
 		*txg = lbl_txg;
 		*avd = vd;
 	}
 
 	nvlist_free(lbl);
 }
 
 /*
  * Import a root pool.
  *
  * For x86. devpath_list will consist of devid and/or physpath name of
  * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
  * The GRUB "findroot" command will return the vdev we should boot.
  *
  * For Sparc, devpath_list consists the physpath name of the booting device
  * no matter the rootpool is a single device pool or a mirrored pool.
  * e.g.
  *	"/pci@1f,0/ide@d/disk@0,0:a"
  *
  * Returns 0 on success, EIO if the pool label cannot be read, ENOENT if
  * the boot vdev is not found in the parsed tree, EINVAL if another device
  * has a newer label or the boot device is currently spared, or the error
  * from spa_config_parse().
  */
 int
 spa_import_rootpool(char *devpath, char *devid)
 {
 	spa_t *spa;
 	vdev_t *rvd, *bvd, *avd = NULL;
 	nvlist_t *config, *nvtop;
 	uint64_t guid, txg;
 	char *pname;
 	int error;
 
 	/*
 	 * Read the label from the boot device and generate a configuration.
 	 */
 	config = spa_generate_rootconf(devpath, devid, &guid);
 #if defined(_OBP) && defined(_KERNEL)
 	if (config == NULL) {
 		if (strstr(devpath, "/iscsi/ssd") != NULL) {
 			/* iscsi boot */
 			get_iscsi_bootpath_phy(devpath);
 			config = spa_generate_rootconf(devpath, devid, &guid);
 		}
 	}
 #endif
 	if (config == NULL) {
 		cmn_err(CE_NOTE, "Cannot read the pool label from '%s'",
 		    devpath);
 		return (EIO);
 	}
 
 	VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
 	    &pname) == 0);
 	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
 
 	mutex_enter(&spa_namespace_lock);
 	if ((spa = spa_lookup(pname)) != NULL) {
 		/*
 		 * Remove the existing root pool from the namespace so that we
 		 * can replace it with the correct config we just read in.
 		 */
 		spa_remove(spa);
 	}
 
 	spa = spa_add(pname, config, NULL);
 	spa->spa_is_root = B_TRUE;
 	spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
 
 	/*
 	 * Build up a vdev tree based on the boot device's label config.
 	 */
 	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
 	    &nvtop) == 0);
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
 	    VDEV_ALLOC_ROOTPOOL);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 	if (error) {
 		mutex_exit(&spa_namespace_lock);
 		nvlist_free(config);
 		cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
 		    pname);
 		return (error);
 	}
 
 	/*
 	 * Get the boot vdev.
 	 */
 	if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
 		cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu",
 		    (u_longlong_t)guid);
 		error = ENOENT;
 		goto out;
 	}
 
 	/*
 	 * Determine if there is a better boot device.
 	 */
 	avd = bvd;
 	spa_alt_rootvdev(rvd, &avd, &txg);
 	if (avd != bvd) {
 		cmn_err(CE_NOTE, "The boot device is 'degraded'. Please "
 		    "try booting from '%s'", avd->vdev_path);
 		error = EINVAL;
 		goto out;
 	}
 
 	/*
 	 * If the boot device is part of a spare vdev then ensure that
 	 * we're booting off the active spare.
 	 */
 	if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
 	    !bvd->vdev_isspare) {
 		cmn_err(CE_NOTE, "The boot device is currently spared. Please "
 		    "try booting from '%s'",
 		    bvd->vdev_parent->
 		    vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path);
 		error = EINVAL;
 		goto out;
 	}
 
 	error = 0;
 	spa_history_log_version(spa, LOG_POOL_IMPORT);
 out:
 	/*
 	 * The parsed vdev tree was only needed for the checks above; it is
 	 * freed on both the success and the failure path.
 	 */
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	vdev_free(rvd);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 	mutex_exit(&spa_namespace_lock);
 
 	nvlist_free(config);
 	return (error);
 }
 
 #else
 
 extern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs,
     uint64_t *count);
 
 /*
  * Build a pool configuration for the root pool 'name' from the vdev
  * labels returned by vdev_geom_read_pool_label().
  *
  * The label with the highest pool txg is used as the basis of the pool
  * config.  Its vdev tree is replaced by a root vdev whose children are
  * the vdev trees of the labels that were read; slots listed in the hole
  * array become VDEV_TYPE_HOLE entries and any slot still empty becomes a
  * VDEV_TYPE_MISSING entry.
  *
  * Returns the new configuration (owned by the caller), or NULL if the
  * labels cannot be read or none of them carries a usable (non-zero) txg.
  */
 static nvlist_t *
 spa_generate_rootconf(const char *name)
 {
 	nvlist_t **configs, **tops;
 	nvlist_t *config;
 	nvlist_t *best_cfg, *nvtop, *nvroot;
 	uint64_t *holes;
 	uint64_t best_txg;
 	uint64_t nchildren;
 	uint64_t pgid;
 	uint64_t count;
 	uint64_t i;
 	uint_t   nholes;
 
 	if (vdev_geom_read_pool_label(name, &configs, &count) != 0)
 		return (NULL);
 
 	ASSERT3U(count, !=, 0);
 	best_cfg = NULL;
 	best_txg = 0;
 	for (i = 0; i < count; i++) {
 		uint64_t txg;
 
 		/*
 		 * Entries of the array may be NULL (the loop filling
 		 * 'tops' below allows for that too), so skip them here
 		 * instead of tripping the VERIFY.
 		 */
 		if (configs[i] == NULL)
 			continue;
 		VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG,
 		    &txg) == 0);
 		if (txg > best_txg) {
 			best_txg = txg;
 			best_cfg = configs[i];
 		}
 	}
 
 	/*
 	 * Without a label to base the pool config on there is nothing
 	 * to generate; release what was read and report failure rather
 	 * than continuing with an unset 'best_cfg'.
 	 */
 	if (best_cfg == NULL) {
 		for (i = 0; i < count; i++)
 			nvlist_free(configs[i]);
 		kmem_free(configs, count * sizeof(void *));
 		return (NULL);
 	}
 
 	/*
 	 * Multi-vdev root pool configuration discovery is not supported yet.
 	 */
 	nchildren = 1;
 	nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren);
 	holes = NULL;
 	nholes = 0;
 	nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY,
 	    &holes, &nholes);
 
 	/* One slot per top-level vdev, filled from the labels we have. */
 	tops = kmem_zalloc(nchildren * sizeof(void *), KM_SLEEP);
 	for (i = 0; i < nchildren; i++) {
 		if (i >= count)
 			break;
 		if (configs[i] == NULL)
 			continue;
 		VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE,
 		    &nvtop) == 0);
 		nvlist_dup(nvtop, &tops[i], KM_SLEEP);
 	}
 
 	/* Slots named in the hole array become hole vdevs. */
 	for (i = 0; holes != NULL && i < nholes; i++) {
 		/*
 		 * holes[i] is what indexes 'tops', so that is the value
 		 * which must stay within the array.
 		 */
 		if (holes[i] >= nchildren)
 			continue;
 		if (tops[holes[i]] != NULL)
 			continue;
 		nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME, KM_SLEEP);
 		VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE,
 		    VDEV_TYPE_HOLE) == 0);
 		VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID,
 		    holes[i]) == 0);
 		VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID,
 		    0) == 0);
 	}
 
 	/* Whatever is still empty is reported as a missing vdev. */
 	for (i = 0; i < nchildren; i++) {
 		if (tops[i] != NULL)
 			continue;
 		nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP);
 		VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE,
 		    VDEV_TYPE_MISSING) == 0);
 		VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID,
 		    i) == 0);
 		VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID,
 		    0) == 0);
 	}
 
 	/*
 	 * Create pool config based on the best vdev config.
 	 */
 	nvlist_dup(best_cfg, &config, KM_SLEEP);
 
 	/*
 	 * Put this pool's top-level vdevs into a root vdev.
 	 */
 	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
 	    &pgid) == 0);
 	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 	VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
 	    VDEV_TYPE_ROOT) == 0);
 	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
 	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
 	VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
 	    tops, nchildren) == 0);
 
 	/*
 	 * Replace the existing vdev_tree with the new root vdev in
 	 * this pool's configuration (remove the old, add the new).
 	 */
 	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
 
 	/*
 	 * Drop vdev config elements that should not be present at pool level.
 	 */
 	nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64);
 	nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64);
 
 	/* Release the labels and the temporary child array. */
 	for (i = 0; i < count; i++)
 		nvlist_free(configs[i]);
 	kmem_free(configs, count * sizeof(void *));
 	for (i = 0; i < nchildren; i++)
 		nvlist_free(tops[i]);
 	kmem_free(tops, nchildren * sizeof(void *));
 	nvlist_free(nvroot);
 	return (config);
 }
 
 /*
  * Import the root pool 'name'.
  *
  * A pool configuration is generated from the on-disk label by
  * spa_generate_rootconf().  If that succeeds, any pool already registered
  * under the same name is replaced by a fresh spa_t built from the label
  * config; if it fails, an already-registered pool of that name is used
  * instead.  The vdev tree of the chosen config is then parsed with
  * VDEV_ALLOC_ROOTPOOL and the parsed tree is freed again right away.
  *
  * Returns 0 on success, EIO when neither a label config nor a registered
  * pool could be found, or the error returned by spa_config_parse().
  */
 int
 spa_import_rootpool(const char *name)
 {
 	spa_t *spa;
 	vdev_t *rvd;
 	nvlist_t *config, *nvtop;
 	uint64_t txg;
 	char *pname;
 	int error;
 
 	/*
 	 * Read the label from the boot device and generate a configuration.
 	 */
 	config = spa_generate_rootconf(name);
 
 	mutex_enter(&spa_namespace_lock);
 	if (config != NULL) {
 		/* The label must describe the pool we were asked for. */
 		VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
 		    &pname) == 0 && strcmp(name, pname) == 0);
 		VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg)
 		    == 0);
 
 		if ((spa = spa_lookup(pname)) != NULL) {
 			/*
 			 * Remove the existing root pool from the namespace so
 			 * that we can replace it with the correct config
 			 * we just read in.
 			 */
 			spa_remove(spa);
 		}
 		spa = spa_add(pname, config, NULL);
 
 		/*
 		 * Set spa_ubsync.ub_version as it can be used in vdev_alloc()
 		 * via spa_version().
 		 */
 		if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
 		    &spa->spa_ubsync.ub_version) != 0)
 			spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
 	} else if ((spa = spa_lookup(name)) == NULL) {
 		/* Do not leave the namespace lock held on this early return. */
 		mutex_exit(&spa_namespace_lock);
 		cmn_err(CE_NOTE, "Cannot find the pool label for '%s'",
 		    name);
 		return (EIO);
 	} else {
 		/* No label config: fall back to the registered pool's config. */
 		VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0);
 	}
 	spa->spa_is_root = B_TRUE;
 	spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
 
 	/*
 	 * Build up a vdev tree based on the boot device's label config.
 	 */
 	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
 	    &nvtop) == 0);
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
 	    VDEV_ALLOC_ROOTPOOL);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 	if (error) {
 		mutex_exit(&spa_namespace_lock);
 		nvlist_free(config);
 		/*
 		 * Report 'name' rather than 'pname': pname is only assigned
 		 * when the config came from the label, and in that case it
 		 * has been verified to be equal to name.
 		 */
 		cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
 		    name);
 		return (error);
 	}
 
 	/* The parsed tree is not kept; release it under the config lock. */
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	vdev_free(rvd);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 	mutex_exit(&spa_namespace_lock);
 
 	nvlist_free(config);
 	return (0);
 }
 
 #endif	/* sun */
 #endif
 
 /*
  * Import a non-root pool into the system.
  *
  * 'pool' is the name the pool is registered under, 'config' is the
  * caller-supplied configuration, 'props' is a property list that may be
  * NULL (see the NULL checks below), and 'flags' is stored in
  * spa_import_flags.
  *
  * Returns 0 on success, EEXIST if a pool named 'pool' is already in the
  * namespace, or the error produced by spa_load_best(), spa_validate_aux()
  * or spa_prop_set().  In the latter cases the spa is unloaded, deactivated
  * and removed from the namespace again before returning.
  */
 int
 spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
 {
 	spa_t *spa;
 	char *altroot = NULL;
 	spa_load_state_t state = SPA_LOAD_IMPORT;
 	zpool_rewind_policy_t policy;
 	uint64_t mode = spa_mode_global;
 	uint64_t readonly = B_FALSE;
 	int error;
 	nvlist_t *nvroot;
 	nvlist_t **spares, **l2cache;
 	uint_t nspares, nl2cache;
 
 	/*
 	 * If a pool with this name exists, return failure.
 	 */
 	mutex_enter(&spa_namespace_lock);
 	if (spa_lookup(pool) != NULL) {
 		mutex_exit(&spa_namespace_lock);
 		return (EEXIST);
 	}
 
 	/*
 	 * Create and initialize the spa structure.
 	 */
 	/* Lookup failures are ignored; altroot/readonly keep their defaults. */
 	(void) nvlist_lookup_string(props,
 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
 	(void) nvlist_lookup_uint64(props,
 	    zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
 	if (readonly)
 		mode = FREAD;
 	spa = spa_add(pool, config, altroot);
 	spa->spa_import_flags = flags;
 
 	/*
 	 * Verbatim import - Take a pool and insert it into the namespace
 	 * as if it had been loaded at boot.
 	 */
 	if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
 		if (props != NULL)
 			spa_configfile_set(spa, props, B_FALSE);
 
 		spa_config_sync(spa, B_FALSE, B_TRUE);
 
 		mutex_exit(&spa_namespace_lock);
 		spa_history_log_version(spa, LOG_POOL_IMPORT);
 
 		/* The pool is neither activated nor loaded on this path. */
 		return (0);
 	}
 
 	spa_activate(spa, mode);
 
 	/*
 	 * Don't start async tasks until we know everything is healthy.
 	 */
 	spa_async_suspend(spa);
 
 	zpool_get_rewind_policy(config, &policy);
 	if (policy.zrp_request & ZPOOL_DO_REWIND)
 		state = SPA_LOAD_RECOVER;
 
 	/*
 	 * Pass off the heavy lifting to spa_load().  Pass TRUE for mosconfig
 	 * because the user-supplied config is actually the one to trust when
 	 * doing an import.
 	 */
 	if (state != SPA_LOAD_RECOVER)
 		spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
 
 	error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg,
 	    policy.zrp_request);
 
 	/*
 	 * Propagate anything learned while loading the pool and pass it
 	 * back to caller (i.e. rewind info, missing devices, etc).
 	 */
 	/* This is done whether or not the load succeeded. */
 	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
 	    spa->spa_load_info) == 0);
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	/*
 	 * Toss any existing sparelist, as it doesn't have any validity
 	 * anymore, and conflicts with spa_has_spare().
 	 */
 	if (spa->spa_spares.sav_config) {
 		nvlist_free(spa->spa_spares.sav_config);
 		spa->spa_spares.sav_config = NULL;
 		spa_load_spares(spa);
 	}
 	if (spa->spa_l2cache.sav_config) {
 		nvlist_free(spa->spa_l2cache.sav_config);
 		spa->spa_l2cache.sav_config = NULL;
 		spa_load_l2cache(spa);
 	}
 
 	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
 	    &nvroot) == 0);
 	/* Validation only runs while no earlier step has failed. */
 	if (error == 0)
 		error = spa_validate_aux(spa, nvroot, -1ULL,
 		    VDEV_ALLOC_SPARE);
 	if (error == 0)
 		error = spa_validate_aux(spa, nvroot, -1ULL,
 		    VDEV_ALLOC_L2CACHE);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	if (props != NULL)
 		spa_configfile_set(spa, props, B_FALSE);
 
 	/*
 	 * Bail out on any error so far; otherwise apply the properties (only
 	 * when props were given and the pool is writeable) and bail out if
 	 * that fails.
 	 */
 	if (error != 0 || (props && spa_writeable(spa) &&
 	    (error = spa_prop_set(spa, props)))) {
 		spa_unload(spa);
 		spa_deactivate(spa);
 		spa_remove(spa);
 		mutex_exit(&spa_namespace_lock);
 		return (error);
 	}
 
 	spa_async_resume(spa);
 
 	/*
 	 * Override any spares and level 2 cache devices as specified by
 	 * the user, as these may have correct device names/devids, etc.
 	 */
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
 	    &spares, &nspares) == 0) {
 		/* Reuse the existing sav_config if there is one. */
 		if (spa->spa_spares.sav_config)
 			VERIFY(nvlist_remove(spa->spa_spares.sav_config,
 			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
 		else
 			VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
 			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
 		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
 		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		spa_load_spares(spa);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 		spa->spa_spares.sav_sync = B_TRUE;
 	}
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
 	    &l2cache, &nl2cache) == 0) {
 		/* Same procedure as for the spares above. */
 		if (spa->spa_l2cache.sav_config)
 			VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
 			    ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
 		else
 			VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
 			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
 		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
 		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		spa_load_l2cache(spa);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 		spa->spa_l2cache.sav_sync = B_TRUE;
 	}
 
 	/*
 	 * Check for any removed devices.
 	 */
 	if (spa->spa_autoreplace) {
 		spa_aux_check_removed(&spa->spa_spares);
 		spa_aux_check_removed(&spa->spa_l2cache);
 	}
 
 	if (spa_writeable(spa)) {
 		/*
 		 * Update the config cache to include the newly-imported pool.
 		 */
 		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
 	}
 
 	/*
 	 * It's possible that the pool was expanded while it was exported.
 	 * We kick off an async task to handle this for us.
 	 */
 	spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
 
 	mutex_exit(&spa_namespace_lock);
 	spa_history_log_version(spa, LOG_POOL_IMPORT);
 
 #ifdef __FreeBSD__
 #ifdef _KERNEL
 	/* FreeBSD kernel only; called after the namespace lock is dropped. */
 	zvol_create_minors(pool);
 #endif
 #endif
 	return (0);
 }
 
 nvlist_t *
 spa_tryimport(nvlist_t *tryconfig)
 {
 	nvlist_t *config = NULL;
 	char *poolname;
 	spa_t *spa;
 	uint64_t state;
 	int error;
 
 	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
 		return (NULL);
 
 	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
 		return (NULL);
 
 	/*
 	 * Create and initialize the spa structure.
 	 */
 	mutex_enter(&spa_namespace_lock);
 	spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
 	spa_activate(spa, FREAD);
 
 	/*
 	 * Pass off the heavy lifting to spa_load().
 	 * Pass TRUE for mosconfig because the user-supplied config
 	 * is actually the one to trust when doing an import.
 	 */
 	error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE);
 
 	/*
 	 * If 'tryconfig' was at least parsable, return the current config.
 	 */
 	if (spa->spa_root_vdev != NULL) {
 		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
 		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
 		    poolname) == 0);
 		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
 		    state) == 0);
 		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
 		    spa->spa_uberblock.ub_timestamp) == 0);
 		VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
 		    spa->spa_load_info) == 0);
 
 		/*
 		 * If the bootfs property exists on this pool then we
 		 * copy it out so that external consumers can tell which
 		 * pools are bootable.
 		 */
 		if ((!error || error == EEXIST) && spa->spa_bootfs) {
 			char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 
 			/*
 			 * We have to play games with the name since the
 			 * pool was opened as TRYIMPORT_NAME.
 			 */
 			if (dsl_dsobj_to_dsname(spa_name(spa),
 			    spa->spa_bootfs, tmpname) == 0) {
 				char *cp;
 				char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 
 				cp = strchr(tmpname, '/');
 				if (cp == NULL) {
 					(void) strlcpy(dsname, tmpname,
 					    MAXPATHLEN);
 				} else {
 					(void) snprintf(dsname, MAXPATHLEN,
 					    "%s/%s", poolname, ++cp);
 				}
 				VERIFY(nvlist_add_string(config,
 				    ZPOOL_CONFIG_BOOTFS, dsname) == 0);
 				kmem_free(dsname, MAXPATHLEN);
 			}
 			kmem_free(tmpname, MAXPATHLEN);
 		}
 
 		/*
 		 * Add the list of hot spares and level 2 cache devices.
 		 */
 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 		spa_add_spares(spa, config);
 		spa_add_l2cache(spa, config);
 		spa_config_exit(spa, SCL_CONFIG, FTAG);
 	}
 
 	spa_unload(spa);
 	spa_deactivate(spa);
 	spa_remove(spa);
 	mutex_exit(&spa_namespace_lock);
 
 	return (config);
 }
 
 /*
  * Pool export/destroy
  *
  * The act of destroying or exporting a pool is very simple.  We make sure there
  * is no more pending I/O and any references to the pool are gone.  Then, we
  * update the pool state and sync all the labels to disk, removing the
  * configuration from the cache afterwards. If the 'hardforce' flag is set, then
  * we don't sync the labels or remove the configuration cache.
  *
  * 'new_state' selects the operation: the callers in this file pass
  * POOL_STATE_DESTROYED, POOL_STATE_EXPORTED or POOL_STATE_UNINITIALIZED
  * (reset).  With POOL_STATE_UNINITIALIZED the pool is unloaded but stays in
  * the namespace.  If 'oldconfig' is non-NULL it is first set to NULL and,
  * when the pool has a config, receives a copy of it.
  *
  * Returns 0 on success, EROFS if the global mode lacks FWRITE, ENOENT if the
  * pool is not found, EBUSY if there are active references, and EXDEV if the
  * pool has an active shared spare and 'force' is not set.
  */
 static int
 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
     boolean_t force, boolean_t hardforce)
 {
 	spa_t *spa;
 
 	if (oldconfig)
 		*oldconfig = NULL;
 
 	if (!(spa_mode_global & FWRITE))
 		return (EROFS);
 
 	mutex_enter(&spa_namespace_lock);
 	if ((spa = spa_lookup(pool)) == NULL) {
 		mutex_exit(&spa_namespace_lock);
 		return (ENOENT);
 	}
 
 	/*
 	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
 	 * reacquire the namespace lock, and see if we can export.
 	 */
 	spa_open_ref(spa, FTAG);
 	mutex_exit(&spa_namespace_lock);
 	spa_async_suspend(spa);
 	mutex_enter(&spa_namespace_lock);
 	spa_close(spa, FTAG);
 
 	/*
 	 * The pool will be in core if it's openable,
 	 * in which case we can modify its state.
 	 */
 	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
 		/*
 		 * Objsets may be open only because they're dirty, so we
 		 * have to force it to sync before checking spa_refcnt.
 		 */
 		txg_wait_synced(spa->spa_dsl_pool, 0);
 
 		/*
 		 * A pool cannot be exported or destroyed if there are active
 		 * references.  If we are resetting a pool, allow references by
 		 * fault injection handlers.
 		 */
 		if (!spa_refcount_zero(spa) ||
 		    (spa->spa_inject_ref != 0 &&
 		    new_state != POOL_STATE_UNINITIALIZED)) {
 			/* Undo the spa_async_suspend() above before failing. */
 			spa_async_resume(spa);
 			mutex_exit(&spa_namespace_lock);
 			return (EBUSY);
 		}
 
 		/*
 		 * A pool cannot be exported if it has an active shared spare.
 		 * This is to prevent other pools stealing the active spare
 		 * from an exported pool. At user's own will, such pool can
 		 * be forcedly exported.
 		 */
 		if (!force && new_state == POOL_STATE_EXPORTED &&
 		    spa_has_active_shared_spare(spa)) {
 			spa_async_resume(spa);
 			mutex_exit(&spa_namespace_lock);
 			return (EXDEV);
 		}
 
 		/*
 		 * We want this to be reflected on every label,
 		 * so mark them all dirty.  spa_unload() will do the
 		 * final sync that pushes these changes out.
 		 */
 		if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
 			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 			spa->spa_state = new_state;
 			spa->spa_final_txg = spa_last_synced_txg(spa) +
 			    TXG_DEFER_SIZE + 1;
 			vdev_config_dirty(spa->spa_root_vdev);
 			spa_config_exit(spa, SCL_ALL, FTAG);
 		}
 	}
 
 	/*
 	 * NOTE(review): this event is posted unconditionally, i.e. for export
 	 * and reset as well as for destroy -- confirm that is intended.
 	 */
 	spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);
 
 	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
 		spa_unload(spa);
 		spa_deactivate(spa);
 	}
 
 	/* Hand a copy of the config to the caller if one was requested. */
 	if (oldconfig && spa->spa_config)
 		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
 
 	/* A reset (POOL_STATE_UNINITIALIZED) keeps the spa in the namespace. */
 	if (new_state != POOL_STATE_UNINITIALIZED) {
 		if (!hardforce)
 			spa_config_sync(spa, B_TRUE, B_TRUE);
 		spa_remove(spa);
 	}
 	mutex_exit(&spa_namespace_lock);
 
 	return (0);
 }
 
 /*
  * Destroy a storage pool.
  */
 int
 spa_destroy(char *pool)
 {
 	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
 	    B_FALSE, B_FALSE));
 }
 
 /*
  * Export the storage pool named 'pool'.  This is spa_export_common() with
  * POOL_STATE_EXPORTED as the target state; 'oldconfig', 'force' and
  * 'hardforce' are forwarded unchanged and the return value is passed
  * through.
  */
 int
 spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
     boolean_t hardforce)
 {
 	int error;
 
 	error = spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
 	    force, hardforce);
 	return (error);
 }
 
 /*
  * Similar to spa_export(), this unloads the spa_t without actually removing it
  * from the namespace in any way.
  */
 int
 spa_reset(char *pool)
 {
 	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
 	    B_FALSE, B_FALSE));
 }
 
 /*
  * ==========================================================================
  * Device manipulation
  * ==========================================================================
  */
 
 /*
  * Add a device to a storage pool.
  *
  * 'nvroot' describes the vdevs to add; it may carry top-level children,
  * a ZPOOL_CONFIG_SPARES array and/or a ZPOOL_CONFIG_L2CACHE array.
  *
  * Returns 0 on success.  On failure the value returned by spa_vdev_exit()
  * is passed back, called with: the spa_config_parse() error, EINVAL when
  * nvroot contains no children, spares or l2cache devices, the
  * vdev_create() error, or the spa_validate_aux() error.
  */
 int
 spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
 {
 	uint64_t txg, id;
 	int error;
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *vd, *tvd;
 	nvlist_t **spares, **l2cache;
 	uint_t nspares, nl2cache;
 
 	ASSERT(spa_writeable(spa));
 
 	txg = spa_vdev_enter(spa);
 
 	if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
 	    VDEV_ALLOC_ADD)) != 0)
 		return (spa_vdev_exit(spa, NULL, txg, error));
 
 	spa->spa_pending_vdev = vd;	/* spa_vdev_exit() will clear this */
 
 	/* A missing array simply means there are none of that kind. */
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
 	    &nspares) != 0)
 		nspares = 0;
 
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
 	    &nl2cache) != 0)
 		nl2cache = 0;
 
 	/* Nothing at all to add. */
 	if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
 		return (spa_vdev_exit(spa, vd, txg, EINVAL));
 
 	if (vd->vdev_children != 0 &&
 	    (error = vdev_create(vd, txg, B_FALSE)) != 0)
 		return (spa_vdev_exit(spa, vd, txg, error));
 
 	/*
 	 * We must validate the spares and l2cache devices after checking the
 	 * children.  Otherwise, vdev_inuse() will blindly overwrite the spare.
 	 */
 	if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
 		return (spa_vdev_exit(spa, vd, txg, error));
 
 	/*
 	 * Transfer each new top-level vdev from vd to rvd.
 	 */
 	for (int c = 0; c < vd->vdev_children; c++) {
 
 		/*
 		 * Set the vdev id to the first hole, if one exists.
 		 */
 		for (id = 0; id < rvd->vdev_children; id++) {
 			if (rvd->vdev_child[id]->vdev_ishole) {
 				vdev_free(rvd->vdev_child[id]);
 				break;
 			}
 		}
 		/*
 		 * If no hole was found the loop above ran to completion and
 		 * id == rvd->vdev_children.
 		 */
 		tvd = vd->vdev_child[c];
 		vdev_remove_child(vd, tvd);
 		tvd->vdev_id = id;
 		vdev_add_child(rvd, tvd);
 		vdev_config_dirty(tvd);
 	}
 
 	if (nspares != 0) {
 		spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
 		    ZPOOL_CONFIG_SPARES);
 		spa_load_spares(spa);
 		spa->spa_spares.sav_sync = B_TRUE;
 	}
 
 	if (nl2cache != 0) {
 		spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
 		    ZPOOL_CONFIG_L2CACHE);
 		spa_load_l2cache(spa);
 		spa->spa_l2cache.sav_sync = B_TRUE;
 	}
 
 	/*
 	 * We have to be careful when adding new vdevs to an existing pool.
 	 * If other threads start allocating from these vdevs before we
 	 * sync the config cache, and we lose power, then upon reboot we may
 	 * fail to open the pool because there are DVAs that the config cache
 	 * can't translate.  Therefore, we first add the vdevs without
 	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
 	 * and then let spa_config_update() initialize the new metaslabs.
 	 *
 	 * spa_load() checks for added-but-not-initialized vdevs, so that
 	 * if we lose power at any point in this sequence, the remaining
 	 * steps will be completed the next time we load the pool.
 	 */
 	(void) spa_vdev_exit(spa, vd, txg, 0);
 
 	/* spa_config_update() is called with the namespace lock held. */
 	mutex_enter(&spa_namespace_lock);
 	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
 	mutex_exit(&spa_namespace_lock);
 
 	return (0);
 }
 
 /*
  * Attach a device to a mirror.  The arguments are the path to any device
  * in the mirror, and the nvroot for the new device.  If the path specifies
  * a device that is not mirrored, we automatically insert the mirror vdev.
  *
  * If 'replacing' is specified, the new device is intended to replace the
  * existing device; in this case the two devices are made into their own
  * mirror using the 'replacing' vdev, which is functionally identical to
  * the mirror vdev (it actually reuses all the same ops) but has a few
  * extra rules: you can't attach to it after it's been created, and upon
  * completion of resilvering, the first disk (the one being replaced)
  * is automatically detached.
  *
  * In this version the existing device is identified by 'guid'.
  *
  * Returns 0 on success.  On failure the value returned by spa_vdev_exit()
  * is passed back, called with one of:
  *	ENODEV		no vdev with 'guid' was found
  *	ENOTSUP		the existing vdev is not a leaf, or the requested
  *			combination of parent/spare/log is not allowed
  *	EINVAL		nvroot could not be parsed, does not contain exactly
  *			one child, or that child is not a leaf
  *	EOVERFLOW	the new device is too small
  *	EDOM		the new device needs a larger ashift than the
  *			top-level vdev
  * or with the error from vdev_create().
  */
 int
 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
 {
 	uint64_t txg, dtl_max_txg;
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
 	vdev_ops_t *pvops;
 	char *oldvdpath, *newvdpath;
 	int newvd_isspare;
 	int error;
 
 	ASSERT(spa_writeable(spa));
 
 	txg = spa_vdev_enter(spa);
 
 	oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
 
 	if (oldvd == NULL)
 		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
 
 	if (!oldvd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 
 	pvd = oldvd->vdev_parent;
 
 	/* Any parse failure is reported as EINVAL, whatever 'error' holds. */
 	if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
 	    VDEV_ALLOC_ATTACH)) != 0)
 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
 
 	if (newrootvd->vdev_children != 1)
 		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
 
 	newvd = newrootvd->vdev_child[0];
 
 	if (!newvd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
 
 	if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
 		return (spa_vdev_exit(spa, newrootvd, txg, error));
 
 	/*
 	 * Spares can't replace logs
 	 */
 	if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
 		return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 
 	/* Decide which kind of parent vdev (pvops) the pair must live under. */
 	if (!replacing) {
 		/*
 		 * For attach, the only allowable parent is a mirror or the root
 		 * vdev.
 		 */
 		if (pvd->vdev_ops != &vdev_mirror_ops &&
 		    pvd->vdev_ops != &vdev_root_ops)
 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 
 		pvops = &vdev_mirror_ops;
 	} else {
 		/*
 		 * Active hot spares can only be replaced by inactive hot
 		 * spares.
 		 */
 		if (pvd->vdev_ops == &vdev_spare_ops &&
 		    oldvd->vdev_isspare &&
 		    !spa_has_spare(spa, newvd->vdev_guid))
 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 
 		/*
 		 * If the source is a hot spare, and the parent isn't already a
 		 * spare, then we want to create a new hot spare.  Otherwise, we
 		 * want to create a replacing vdev.  The user is not allowed to
 		 * attach to a spared vdev child unless the 'isspare' state is
 		 * the same (spare replaces spare, non-spare replaces
 		 * non-spare).
 		 */
 		if (pvd->vdev_ops == &vdev_replacing_ops &&
 		    spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 		} else if (pvd->vdev_ops == &vdev_spare_ops &&
 		    newvd->vdev_isspare != oldvd->vdev_isspare) {
 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 		}
 
 		if (newvd->vdev_isspare)
 			pvops = &vdev_spare_ops;
 		else
 			pvops = &vdev_replacing_ops;
 	}
 
 	/*
 	 * Make sure the new device is big enough.
 	 */
 	if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
 		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
 
 	/*
 	 * The new device cannot have a higher alignment requirement
 	 * than the top-level vdev.
 	 */
 	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
 		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
 
 	/*
 	 * If this is an in-place replacement, update oldvd's path and devid
 	 * to make it distinguishable from newvd, and unopenable from now on.
 	 */
 	if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
 		spa_strfree(oldvd->vdev_path);
 		/* +5: room for the "/old" suffix and the terminating NUL. */
 		oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
 		    KM_SLEEP);
 		(void) sprintf(oldvd->vdev_path, "%s/%s",
 		    newvd->vdev_path, "old");
 		if (oldvd->vdev_devid != NULL) {
 			spa_strfree(oldvd->vdev_devid);
 			oldvd->vdev_devid = NULL;
 		}
 	}
 
 	/* mark the device being resilvered */
 	newvd->vdev_resilvering = B_TRUE;
 
 	/*
 	 * If the parent is not a mirror, or if we're replacing, insert the new
 	 * mirror/replacing/spare vdev above oldvd.
 	 */
 	if (pvd->vdev_ops != pvops)
 		pvd = vdev_add_parent(oldvd, pvops);
 
 	ASSERT(pvd->vdev_top->vdev_parent == rvd);
 	ASSERT(pvd->vdev_ops == pvops);
 	ASSERT(oldvd->vdev_parent == pvd);
 
 	/*
 	 * Extract the new device from its root and add it to pvd.
 	 */
 	vdev_remove_child(newrootvd, newvd);
 	newvd->vdev_id = pvd->vdev_children;
 	newvd->vdev_crtxg = oldvd->vdev_crtxg;
 	vdev_add_child(pvd, newvd);
 
 	tvd = newvd->vdev_top;
 	ASSERT(pvd->vdev_top == tvd);
 	ASSERT(tvd->vdev_parent == rvd);
 
 	vdev_config_dirty(tvd);
 
 	/*
 	 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
 	 * for any dmu_sync-ed blocks.  It will propagate upward when
 	 * spa_vdev_exit() calls vdev_dtl_reassess().
 	 */
 	dtl_max_txg = txg + TXG_CONCURRENT_STATES;
 
 	vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
 	    dtl_max_txg - TXG_INITIAL);
 
 	if (newvd->vdev_isspare) {
 		spa_spare_activate(newvd);
 		spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE);
 	}
 
 	/*
 	 * Take private copies of what the history record below needs; it is
 	 * written after spa_vdev_exit().
 	 */
 	oldvdpath = spa_strdup(oldvd->vdev_path);
 	newvdpath = spa_strdup(newvd->vdev_path);
 	newvd_isspare = newvd->vdev_isspare;
 
 	/*
 	 * Mark newvd's DTL dirty in this txg.
 	 */
 	vdev_dirty(tvd, VDD_DTL, newvd, txg);
 
 	/*
 	 * Restart the resilver
 	 */
 	dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
 
 	/*
 	 * Commit the config
 	 */
 	(void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
 
 	/* Logged as "spare in", "replace" or "attach" depending on the mode. */
 	spa_history_log_internal(LOG_POOL_VDEV_ATTACH, spa, NULL,
 	    "%s vdev=%s %s vdev=%s",
 	    replacing && newvd_isspare ? "spare in" :
 	    replacing ? "replace" : "attach", newvdpath,
 	    replacing ? "for" : "to", oldvdpath);
 
 	spa_strfree(oldvdpath);
 	spa_strfree(newvdpath);
 
 	if (spa->spa_bootfs)
 		spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH);
 
 	return (0);
 }
 
 /*
  * Detach a device from a mirror or replacing vdev.
  * If 'replace_done' is specified, only detach if the parent
  * is a replacing vdev.
  *
  * 'guid' identifies the leaf vdev to detach.  'pguid', when non-zero, must
  * match the GUID of that vdev's current parent.
  *
  * Returns the value of the final spa_vdev_exit() call on success.  Earlier
  * failures return spa_vdev_exit() called with ENODEV (no such vdev),
  * ENOTSUP (not a leaf, or the parent type does not support detach) or
  * EBUSY (parent changed, or the device holds the only valid copy of some
  * data).
  */
 int
 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
 {
 	uint64_t txg;
 	int error;
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *vd, *pvd, *cvd, *tvd;
 	boolean_t unspare = B_FALSE;
-	uint64_t unspare_guid;
+	uint64_t unspare_guid = 0;
 	char *vdpath;
 
 	ASSERT(spa_writeable(spa));
 
 	txg = spa_vdev_enter(spa);
 
 	vd = spa_lookup_by_guid(spa, guid, B_FALSE);
 
 	if (vd == NULL)
 		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 
 	pvd = vd->vdev_parent;
 
 	/*
 	 * If the parent/child relationship is not as expected, don't do it.
 	 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
 	 * vdev that's replacing B with C.  The user's intent in replacing
 	 * is to go from M(A,B) to M(A,C).  If the user decides to cancel
 	 * the replace by detaching C, the expected behavior is to end up
 	 * M(A,B).  But suppose that right after deciding to detach C,
 	 * the replacement of B completes.  We would have M(A,C), and then
 	 * ask to detach C, which would leave us with just A -- not what
 	 * the user wanted.  To prevent this, we make sure that the
 	 * parent/child relationship hasn't changed -- in this example,
 	 * that C's parent is still the replacing vdev R.
 	 */
 	if (pvd->vdev_guid != pguid && pguid != 0)
 		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
 
 	/*
 	 * Only 'replacing' or 'spare' vdevs can be replaced.
 	 */
 	if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
 	    pvd->vdev_ops != &vdev_spare_ops)
 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 
 	ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
 	    spa_version(spa) >= SPA_VERSION_SPARES);
 
 	/*
 	 * Only mirror, replacing, and spare vdevs support detach.
 	 */
 	if (pvd->vdev_ops != &vdev_replacing_ops &&
 	    pvd->vdev_ops != &vdev_mirror_ops &&
 	    pvd->vdev_ops != &vdev_spare_ops)
 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 
 	/*
 	 * If this device has the only valid copy of some data,
 	 * we cannot safely detach it.
 	 */
 	if (vdev_dtl_required(vd))
 		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
 
 	ASSERT(pvd->vdev_children >= 2);
 
 	/*
 	 * If we are detaching the second disk from a replacing vdev, then
 	 * check to see if we changed the original vdev's path to have "/old"
 	 * at the end in spa_vdev_attach().  If so, undo that change now.
 	 */
 	if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
 	    vd->vdev_path != NULL) {
 		size_t len = strlen(vd->vdev_path);
 
 		for (int c = 0; c < pvd->vdev_children; c++) {
 			cvd = pvd->vdev_child[c];
 
 			if (cvd == vd || cvd->vdev_path == NULL)
 				continue;
 
 			/* Sibling path is exactly vd's path followed by "/old". */
 			if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
 			    strcmp(cvd->vdev_path + len, "/old") == 0) {
 				spa_strfree(cvd->vdev_path);
 				cvd->vdev_path = spa_strdup(vd->vdev_path);
 				break;
 			}
 		}
 	}
 
 	/*
 	 * If we are detaching the original disk from a spare, then it implies
 	 * that the spare should become a real disk, and be removed from the
 	 * active spare list for the pool.
 	 */
 	if (pvd->vdev_ops == &vdev_spare_ops &&
 	    vd->vdev_id == 0 &&
 	    pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare)
 		unspare = B_TRUE;
 
 	/*
 	 * Erase the disk labels so the disk can be used for other things.
 	 * This must be done after all other error cases are handled,
 	 * but before we disembowel vd (so we can still do I/O to it).
 	 * But if we can't do it, don't treat the error as fatal --
 	 * it may be that the unwritability of the disk is the reason
 	 * it's being detached!
 	 */
 	/* 'error' is not inspected here; spa_vdev_exit() below reassigns it. */
 	error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
 
 	/*
 	 * Remove vd from its parent and compact the parent's children.
 	 */
 	vdev_remove_child(pvd, vd);
 	vdev_compact_children(pvd);
 
 	/*
 	 * Remember one of the remaining children so we can get tvd below.
 	 */
 	cvd = pvd->vdev_child[pvd->vdev_children - 1];
 
 	/*
 	 * If we need to remove the remaining child from the list of hot spares,
 	 * do it now, marking the vdev as no longer a spare in the process.
 	 * We must do this before vdev_remove_parent(), because that can
 	 * change the GUID if it creates a new toplevel GUID.  For a similar
 	 * reason, we must remove the spare now, in the same txg as the detach;
 	 * otherwise someone could attach a new sibling, change the GUID, and
 	 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
 	 */
 	if (unspare) {
 		ASSERT(cvd->vdev_isspare);
 		spa_spare_remove(cvd);
 		unspare_guid = cvd->vdev_guid;
 		(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
 		cvd->vdev_unspare = B_TRUE;
 	}
 
 	/*
 	 * If the parent mirror/replacing vdev only has one child,
 	 * the parent is no longer needed.  Remove it from the tree.
 	 */
 	if (pvd->vdev_children == 1) {
 		if (pvd->vdev_ops == &vdev_spare_ops)
 			cvd->vdev_unspare = B_FALSE;
 		vdev_remove_parent(cvd);
 		cvd->vdev_resilvering = B_FALSE;
 	}
 
 
 	/*
 	 * We don't set tvd until now because the parent we just removed
 	 * may have been the previous top-level vdev.
 	 */
 	tvd = cvd->vdev_top;
 	ASSERT(tvd->vdev_parent == rvd);
 
 	/*
 	 * Reevaluate the parent vdev state.
 	 */
 	vdev_propagate_state(cvd);
 
 	/*
 	 * If the 'autoexpand' property is set on the pool then automatically
 	 * try to expand the size of the pool. For example if the device we
 	 * just detached was smaller than the others, it may be possible to
 	 * add metaslabs (i.e. grow the pool). We need to reopen the vdev
 	 * first so that we can obtain the updated sizes of the leaf vdevs.
 	 */
 	if (spa->spa_autoexpand) {
 		vdev_reopen(tvd);
 		vdev_expand(tvd, txg);
 	}
 
 	vdev_config_dirty(tvd);
 
 	/*
 	 * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
 	 * vd->vdev_detached is set and free vd's DTL object in syncing context.
 	 * But first make sure we're not on any *other* txg's DTL list, to
 	 * prevent vd from being accessed after it's freed.
 	 */
 	/* Copy the path now; it is logged after spa_vdev_exit() below. */
 	vdpath = spa_strdup(vd->vdev_path);
 	for (int t = 0; t < TXG_SIZE; t++)
 		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
 	vd->vdev_detached = B_TRUE;
 	vdev_dirty(tvd, VDD_DTL, vd, txg);
 
 	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
 
 	/* hang on to the spa before we release the lock */
 	spa_open_ref(spa, FTAG);
 
 	error = spa_vdev_exit(spa, vd, txg, 0);
 
 	spa_history_log_internal(LOG_POOL_VDEV_DETACH, spa, NULL,
 	    "vdev=%s", vdpath);
 	spa_strfree(vdpath);
 
 	/*
 	 * If this was the removal of the original device in a hot spare vdev,
 	 * then we want to go through and remove the device from the hot spare
 	 * list of every other pool.
 	 */
 	if (unspare) {
 		spa_t *altspa = NULL;
 
 		mutex_enter(&spa_namespace_lock);
 		while ((altspa = spa_next(altspa)) != NULL) {
 			if (altspa->spa_state != POOL_STATE_ACTIVE ||
 			    altspa == spa)
 				continue;
 
 			/*
 			 * Hold altspa while the namespace lock is dropped
 			 * around the spa_vdev_remove() call.
 			 */
 			spa_open_ref(altspa, FTAG);
 			mutex_exit(&spa_namespace_lock);
 			(void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
 			mutex_enter(&spa_namespace_lock);
 			spa_close(altspa, FTAG);
 		}
 		mutex_exit(&spa_namespace_lock);
 
 		/* search the rest of the vdevs for spares to remove */
 		spa_vdev_resilver_done(spa);
 	}
 
 	/* all done with the spa; OK to release */
 	mutex_enter(&spa_namespace_lock);
 	spa_close(spa, FTAG);
 	mutex_exit(&spa_namespace_lock);
 
 	return (error);
 }
 
 /*
  * Split a set of devices from their mirrors, and create a new pool from them.
  *
  * 'config' must contain a ZPOOL_CONFIG_VDEV_TREE nvlist with a
  * ZPOOL_CONFIG_CHILDREN array.  Each non-hole entry of that array names, by
  * guid, the leaf that is to be split off; the array length must match the
  * root vdev's child count (not counting a trailing run of log/hole vdevs).
  * 'newname' is the name of the pool to create, 'props' (applied when
  * non-NULL) its properties, and 'exp' asks for the new pool to be exported
  * once the split is done.
  *
  * Returns 0 on success, otherwise an errno value: EEXIST if 'newname' is
  * already known, EINVAL if the config is malformed or names an unsuitable
  * device, ENODEV if a guid cannot be found in 'spa', EBUSY if a device's
  * DTL is still required, or the error produced by the log offline, pool
  * load, property set, tx assign or export steps.
  */
 int
 spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
     nvlist_t *props, boolean_t exp)
 {
 	int error = 0;
 	uint64_t txg, *glist;
 	spa_t *newspa;
 	uint_t c, children, lastlog;
 	nvlist_t **child, *nvl, *tmp;
 	dmu_tx_t *tx;
 	char *altroot = NULL;
 	vdev_t *rvd, **vml = NULL;			/* vdev modify list */
 	boolean_t activate_slog;
 
 	ASSERT(spa_writeable(spa));
 
 	txg = spa_vdev_enter(spa);
 
 	/* clear the log and flush everything up to now */
 	activate_slog = spa_passivate_log(spa);
 	(void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
 	error = spa_offline_log(spa);
 	txg = spa_vdev_config_enter(spa);
 
 	/* undo the passivation regardless of whether the offline succeeded */
 	if (activate_slog)
 		spa_activate_log(spa);
 
 	if (error != 0)
 		return (spa_vdev_exit(spa, NULL, txg, error));
 
 	/* check new spa name before going any further */
 	if (spa_lookup(newname) != NULL)
 		return (spa_vdev_exit(spa, NULL, txg, EEXIST));
 
 	/*
 	 * scan through all the children to ensure they're all mirrors
 	 */
 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
 	    nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
 	    &children) != 0)
 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
 
 	/*
 	 * first, check to ensure we've got the right child count
 	 *
 	 * lastlog records where the current run of log/hole vdevs began and
 	 * is reset to 0 whenever a regular top-level vdev is seen, so after
 	 * the loop a non-zero value marks the start of a trailing run.
 	 */
 	rvd = spa->spa_root_vdev;
 	lastlog = 0;
 	for (c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *vd = rvd->vdev_child[c];
 
 		/* don't count the holes & logs as children */
 		if (vd->vdev_islog || vd->vdev_ishole) {
 			if (lastlog == 0)
 				lastlog = c;
 			continue;
 		}
 
 		lastlog = 0;
 	}
 	if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
 
 	/* next, ensure no spare or cache devices are part of the split */
 	if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
 	    nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
 
 	/* zero-filled, so entries skipped below stay NULL / 0 */
 	vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
 	glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);
 
 	/* then, loop over each vdev and validate it */
 	for (c = 0; c < children; c++) {
 		uint64_t is_hole = 0;
 
 		(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
 		    &is_hole);
 
 		/*
 		 * A hole entry is only acceptable where the pool itself has
 		 * a hole or a log in the same slot.
 		 */
 		if (is_hole != 0) {
 			if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
 			    spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
 				continue;
 			} else {
 				error = EINVAL;
 				break;
 			}
 		}
 
 		/* which disk is going to be split? */
 		if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
 		    &glist[c]) != 0) {
 			error = EINVAL;
 			break;
 		}
 
 		/* look it up in the spa */
 		vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
 		if (vml[c] == NULL) {
 			error = ENODEV;
 			break;
 		}
 
 		/* make sure there's nothing stopping the split */
 		if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
 		    vml[c]->vdev_islog ||
 		    vml[c]->vdev_ishole ||
 		    vml[c]->vdev_isspare ||
 		    vml[c]->vdev_isl2cache ||
 		    !vdev_writeable(vml[c]) ||
 		    vml[c]->vdev_children != 0 ||
 		    vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
 		    c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
 			error = EINVAL;
 			break;
 		}
 
 		if (vdev_dtl_required(vml[c])) {
 			error = EBUSY;
 			break;
 		}
 
 		/* we need certain info from the top level */
 		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
 		    vml[c]->vdev_top->vdev_ms_array) == 0);
 		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
 		    vml[c]->vdev_top->vdev_ms_shift) == 0);
 		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
 		    vml[c]->vdev_top->vdev_asize) == 0);
 		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
 		    vml[c]->vdev_top->vdev_ashift) == 0);
 	}
 
 	/* validation failed: nothing has been modified yet, just clean up */
 	if (error != 0) {
 		kmem_free(vml, children * sizeof (vdev_t *));
 		kmem_free(glist, children * sizeof (uint64_t));
 		return (spa_vdev_exit(spa, NULL, txg, error));
 	}
 
 	/* stop writers from using the disks */
 	for (c = 0; c < children; c++) {
 		if (vml[c] != NULL)
 			vml[c]->vdev_offline = B_TRUE;
 	}
 	vdev_reopen(spa->spa_root_vdev);
 
 	/*
 	 * Temporarily record the splitting vdevs in the spa config.  This
 	 * will disappear once the config is regenerated.
 	 */
 	VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 	VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
 	    glist, children) == 0);
 	kmem_free(glist, children * sizeof (uint64_t));
 
 	mutex_enter(&spa->spa_props_lock);
 	VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT,
 	    nvl) == 0);
 	mutex_exit(&spa->spa_props_lock);
 	spa->spa_config_splitting = nvl;
 	vdev_config_dirty(spa->spa_root_vdev);
 
 	/* configure and create the new pool */
 	VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0);
 	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
 	    exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0);
 	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
 	    spa_version(spa)) == 0);
 	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
 	    spa->spa_config_txg) == 0);
 	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
 	    spa_generate_guid(NULL)) == 0);
 	/* altroot stays NULL when the lookup fails */
 	(void) nvlist_lookup_string(props,
 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
 
 	/* add the new pool to the namespace */
 	newspa = spa_add(newname, config, altroot);
 	newspa->spa_config_txg = spa->spa_config_txg;
 	spa_set_log_state(newspa, SPA_LOG_CLEAR);
 
 	/* release the spa config lock, retaining the namespace lock */
 	spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
 
 	if (zio_injection_enabled)
 		zio_handle_panic_injection(spa, FTAG, 1);
 
 	spa_activate(newspa, spa_mode_global);
 	spa_async_suspend(newspa);
 
 #ifndef sun
 	/* mark that we are creating new spa by splitting */
 	newspa->spa_splitting_newspa = B_TRUE;
 #endif
 	/* create the new pool from the disks of the original pool */
 	error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE);
 #ifndef sun
 	newspa->spa_splitting_newspa = B_FALSE;
 #endif
 	if (error)
 		goto out;
 
 	/* if that worked, generate a real config for the new pool */
 	if (newspa->spa_root_vdev != NULL) {
 		VERIFY(nvlist_alloc(&newspa->spa_config_splitting,
 		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
 		VERIFY(nvlist_add_uint64(newspa->spa_config_splitting,
 		    ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0);
 		spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
 		    B_TRUE));
 	}
 
 	/* set the props */
 	if (props != NULL) {
 		spa_configfile_set(newspa, props, B_FALSE);
 		error = spa_prop_set(newspa, props);
 		if (error)
 			goto out;
 	}
 
 	/* flush everything */
 	txg = spa_vdev_config_enter(newspa);
 	vdev_config_dirty(newspa->spa_root_vdev);
 	(void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);
 
 	if (zio_injection_enabled)
 		zio_handle_panic_injection(spa, FTAG, 2);
 
 	spa_async_resume(newspa);
 
 	/*
 	 * finally, update the original pool's config
 	 *
 	 * If the tx cannot be assigned it is aborted; the vdevs are still
 	 * split and freed below, only the per-vdev history records and the
 	 * commit are skipped.
 	 */
 	txg = spa_vdev_config_enter(spa);
 	tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error != 0)
 		dmu_tx_abort(tx);
 	for (c = 0; c < children; c++) {
 		if (vml[c] != NULL) {
 			vdev_split(vml[c]);
 			if (error == 0)
 				spa_history_log_internal(LOG_POOL_VDEV_DETACH,
 				    spa, tx, "vdev=%s",
 				    vml[c]->vdev_path);
 			vdev_free(vml[c]);
 		}
 	}
 	vdev_config_dirty(spa->spa_root_vdev);
 	spa->spa_config_splitting = NULL;
 	nvlist_free(nvl);
 	if (error == 0)
 		dmu_tx_commit(tx);
 	(void) spa_vdev_exit(spa, NULL, txg, 0);
 
 	if (zio_injection_enabled)
 		zio_handle_panic_injection(spa, FTAG, 3);
 
 	/* split is complete; log a history record */
 	spa_history_log_internal(LOG_POOL_SPLIT, newspa, NULL,
 	    "split new pool %s from pool %s", newname, spa_name(spa));
 
 	kmem_free(vml, children * sizeof (vdev_t *));
 
 	/* if we're not going to mount the filesystems in userland, export */
 	if (exp)
 		error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
 		    B_FALSE, B_FALSE);
 
 	return (error);
 
 out:
 	/* failure after the new pool was added: tear it down again */
 	spa_unload(newspa);
 	spa_deactivate(newspa);
 	spa_remove(newspa);
 
 	txg = spa_vdev_config_enter(spa);
 
 	/* re-online all offlined disks */
 	for (c = 0; c < children; c++) {
 		if (vml[c] != NULL)
 			vml[c]->vdev_offline = B_FALSE;
 	}
 	vdev_reopen(spa->spa_root_vdev);
 
 	/* drop the temporary split record set up above */
 	nvlist_free(spa->spa_config_splitting);
 	spa->spa_config_splitting = NULL;
 	(void) spa_vdev_exit(spa, NULL, txg, error);
 
 	kmem_free(vml, children * sizeof (vdev_t *));
 	return (error);
 }
 
 /*
  * Walk the first 'count' entries of 'nvpp' and hand back the first nvlist
  * whose ZPOOL_CONFIG_GUID equals 'target_guid'.  Returns NULL when no entry
  * matches.  Every entry examined must carry a guid (enforced by VERIFY).
  */
 static nvlist_t *
 spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
 {
 	nvlist_t *match = NULL;
 
 	/* stop scanning as soon as a match has been recorded */
 	for (int idx = 0; match == NULL && idx < count; idx++) {
 		uint64_t cur_guid;
 
 		VERIFY(nvlist_lookup_uint64(nvpp[idx], ZPOOL_CONFIG_GUID,
 		    &cur_guid) == 0);
 
 		if (cur_guid == target_guid)
 			match = nvpp[idx];
 	}
 
 	return (match);
 }
 
 /*
  * Replace the nvlist array stored under 'name' in 'config' with a copy of
  * 'dev' (which has 'count' entries) that omits 'dev_to_remove'.  The
  * surviving entries are duplicated, installed, and the temporary duplicates
  * are then released.
  */
 static void
 spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
 	nvlist_t *dev_to_remove)
 {
 	nvlist_t **kept = NULL;
 	int src, dst;
 
 	/* no scratch array is needed when the list becomes empty */
 	if (count > 1)
 		kept = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP);
 
 	dst = 0;
 	for (src = 0; src < count; src++) {
 		if (dev[src] != dev_to_remove) {
 			VERIFY(nvlist_dup(dev[src], &kept[dst], KM_SLEEP) == 0);
 			dst++;
 		}
 	}
 
 	VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0);
 	VERIFY(nvlist_add_nvlist_array(config, name, kept, count - 1) == 0);
 
 	/* the array was installed above; free our duplicates */
 	for (src = 0; src < count - 1; src++)
 		nvlist_free(kept[src]);
 
 	if (count > 1)
 		kmem_free(kept, (count - 1) * sizeof (void *));
 }
 
 /*
  * Evacuate the top-level vdev 'vd' ahead of its removal.  Only log devices
  * are supported; anything else yields ENOTSUP.  Returns 0 once the vdev has
  * been flagged as removing and dirtied, or the error from offlining the log.
  */
 static int
 spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
 {
 	uint64_t txg;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
 	ASSERT(vd == vd->vdev_top);
 
 	/*
 	 * The config lock is not held as writer here because evacuation
 	 * needs to do I/O; the spa_namespace_lock stays held throughout.
 	 * After a successful evacuation no blocks should remain allocated
 	 * on the device.
 	 */
 	if (!vd->vdev_islog)
 		return (ENOTSUP);
 
 	/* an empty log has nothing to evacuate */
 	if (vd->vdev_stat.vs_alloc != 0) {
 		int error = spa_offline_log(spa);
 
 		if (error)
 			return (error);
 	}
 
 	/*
 	 * Evacuation worked.  Get rid of whatever MOS metadata is still
 	 * tied to this vdev and wait for that change to sync out.
 	 */
 	ASSERT0(vd->vdev_stat.vs_alloc);
 	txg = spa_vdev_config_enter(spa);
 	vd->vdev_removing = B_TRUE;
 	vdev_dirty(vd, 0, NULL, txg);
 	vdev_config_dirty(vd);
 	spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
 
 	return (0);
 }
 
 /*
  * Finish removing the top-level vdev 'vd' by taking it out of the vdev
  * namespace.  Nothing is done if the vdev still has space allocated.
  */
 static void
 spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd)
 {
 	vdev_t *root = spa->spa_root_vdev;
 	/* captured up front: vd is freed before these are consulted */
 	uint64_t slot = vd->vdev_id;
 	boolean_t is_tail = (slot == (root->vdev_children - 1));
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 	ASSERT(vd == vd->vdev_top);
 
 	/*
 	 * Devices that are not empty are left alone.
 	 */
 	if (vd->vdev_stat.vs_alloc != 0)
 		return;
 
 	(void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
 
 	/* unhook the vdev from the dirty lists before freeing it */
 	if (list_link_active(&vd->vdev_state_dirty_node))
 		vdev_state_clean(vd);
 	if (list_link_active(&vd->vdev_config_dirty_node))
 		vdev_config_clean(vd);
 
 	vdev_free(vd);
 
 	if (!is_tail) {
 		/* keep the slot occupied with a hole vdev */
 		vdev_t *hole = vdev_alloc_common(spa, slot, 0, &vdev_hole_ops);
 
 		vdev_add_child(root, hole);
 	} else {
 		vdev_compact_children(root);
 	}
 	vdev_config_dirty(root);
 
 	/*
 	 * Re-evaluate the root vdev's health.
 	 */
 	vdev_reopen(root);
 }
 
 /*
  * Remove a device from the pool -
  *
  * Removing a device from the vdev namespace requires several steps
  * and can take a significant amount of time.  As a result we use
  * the spa_vdev_config_[enter/exit] functions which allow us to
  * grab and release the spa_config_lock while still holding the namespace
  * lock.  During each step the configuration is synced out.
  */
 
 /*
  * Remove a device from the pool.  Currently, this supports removing only hot
  * spares, slogs, and level 2 ARC devices.
  *
  * 'guid' identifies the device.  'unspare' allows a hot spare to be removed
  * from the spare list even though a vdev with that guid is present in the
  * pool.
  *
  * If the caller already holds spa_namespace_lock, spa_vdev_enter() and
  * spa_vdev_exit() are not called here and the error is returned directly.
  *
  * Returns 0 on success, EBUSY for a spare that is in use (and !unspare),
  * ENOTSUP for a regular vdev, ENOENT if nothing matches 'guid', or the
  * error from the log evacuation / spa_vdev_exit().
  */
 int
 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
 {
 	vdev_t *vd;
 	metaslab_group_t *mg;
 	nvlist_t **spares, **l2cache, *nv;
 	uint64_t txg = 0;
 	uint_t nspares, nl2cache;
 	int error = 0;
 	/* remember whether the caller came in with the namespace lock held */
 	boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
 
 	ASSERT(spa_writeable(spa));
 
 	if (!locked)
 		txg = spa_vdev_enter(spa);
 
 	vd = spa_lookup_by_guid(spa, guid, B_FALSE);
 
 	if (spa->spa_spares.sav_vdevs != NULL &&
 	    nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
 	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 &&
 	    (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) {
 		/*
 		 * Only remove the hot spare if it's not currently in use
 		 * in this pool.
 		 */
 		if (vd == NULL || unspare) {
 			spa_vdev_remove_aux(spa->spa_spares.sav_config,
 			    ZPOOL_CONFIG_SPARES, spares, nspares, nv);
 			spa_load_spares(spa);
 			spa->spa_spares.sav_sync = B_TRUE;
 		} else {
 			error = EBUSY;
 		}
 	} else if (spa->spa_l2cache.sav_vdevs != NULL &&
 	    nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
 	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
 	    (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
 		/*
 		 * Cache devices can always be removed.
 		 */
 		spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
 		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
 		spa_load_l2cache(spa);
 		spa->spa_l2cache.sav_sync = B_TRUE;
 	} else if (vd != NULL && vd->vdev_islog) {
 		/* log removal is only expected from callers without the lock */
 		ASSERT(!locked);
 		ASSERT(vd == vd->vdev_top);
 
 		/*
 		 * XXX - Once we have bp-rewrite this should
 		 * become the common case.
 		 */
 
 		mg = vd->vdev_mg;
 
 		/*
 		 * Stop allocating from this vdev.
 		 */
 		metaslab_group_passivate(mg);
 
 		/*
 		 * Wait for the youngest allocations and frees to sync,
 		 * and then wait for the deferral of those frees to finish.
 		 */
 		spa_vdev_config_exit(spa, NULL,
 		    txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
 
 		/*
 		 * Attempt to evacuate the vdev.
 		 */
 		error = spa_vdev_remove_evacuate(spa, vd);
 
 		txg = spa_vdev_config_enter(spa);
 
 		/*
 		 * If we couldn't evacuate the vdev, unwind.
 		 */
 		if (error) {
 			metaslab_group_activate(mg);
 			return (spa_vdev_exit(spa, NULL, txg, error));
 		}
 
 		/*
 		 * Clean up the vdev namespace.
 		 */
 		spa_vdev_remove_from_namespace(spa, vd);
 
 	} else if (vd != NULL) {
 		/*
 		 * Normal vdevs cannot be removed (yet).
 		 */
 		error = ENOTSUP;
 	} else {
 		/*
 		 * There is no vdev of any kind with the specified guid.
 		 */
 		error = ENOENT;
 	}
 
 	/* only leave the vdev-change context if we entered it above */
 	if (!locked)
 		return (spa_vdev_exit(spa, NULL, txg, error));
 
 	return (error);
 }
 
 /*
  * Search the tree rooted at 'vd', children first, for a device that can now
  * be detached: one that is done being replaced, or one belonging to a spare
  * vdev marked 'unspare' that is currently spared.  Returns that device, or
  * NULL if there is none.
  */
 static vdev_t *
 spa_vdev_resilver_done_hunt(vdev_t *vd)
 {
 	vdev_t *fresh, *stale;
 	vdev_t *head, *tail;
 
 	for (int c = 0; c < vd->vdev_children; c++) {
 		stale = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
 		if (stale != NULL)
 			return (stale);
 	}
 
 	/*
 	 * Completed replacement?  The first child is always treated as the
 	 * oldest vdev and the last child as the newest (spa_vdev_attach()
 	 * shows how that comes about).  When the newest vdev is faulted it
 	 * is not removed automatically after a resilver; that is fine since
 	 * the admin has to decide which disk should be kept.
 	 */
 	if (vd->vdev_ops == &vdev_replacing_ops) {
 		ASSERT(vd->vdev_children > 1);
 
 		stale = vd->vdev_child[0];
 		fresh = vd->vdev_child[vd->vdev_children - 1];
 
 		if (vdev_dtl_empty(fresh, DTL_MISSING) &&
 		    vdev_dtl_empty(fresh, DTL_OUTAGE) &&
 		    !vdev_dtl_required(stale))
 			return (stale);
 	}
 
 	/* everything below applies to spare vdevs only */
 	if (vd->vdev_ops != &vdev_spare_ops)
 		return (NULL);
 
 	/*
 	 * Completed resilver with the 'unspare' flag set?
 	 */
 	head = vd->vdev_child[0];
 	tail = vd->vdev_child[vd->vdev_children - 1];
 	stale = NULL;
 
 	if (tail->vdev_unspare) {
 		stale = head;
 		fresh = tail;
 	} else if (head->vdev_unspare) {
 		stale = tail;
 		fresh = head;
 	}
 
 	if (stale != NULL &&
 	    vdev_dtl_empty(fresh, DTL_MISSING) &&
 	    vdev_dtl_empty(fresh, DTL_OUTAGE) &&
 	    !vdev_dtl_required(stale))
 		return (stale);
 
 	/*
 	 * With more than two spares attached to a disk, spares that are
 	 * not required are freed up now so other pools can use them.  This
 	 * stops once only a single disk+spare pair is left.
 	 */
 	if (vd->vdev_children > 2) {
 		vdev_t *second = vd->vdev_child[1];
 
 		if (second->vdev_isspare && tail->vdev_isspare &&
 		    vdev_dtl_empty(tail, DTL_MISSING) &&
 		    vdev_dtl_empty(tail, DTL_OUTAGE) &&
 		    !vdev_dtl_required(second))
 			return (second);
 	}
 
 	return (NULL);
 }
 
 /*
  * Repeatedly locate a device that spa_vdev_resilver_done_hunt() reports as
  * detachable and detach it.  Stops when nothing is left or as soon as a
  * detach fails.
  */
 static void
 spa_vdev_resilver_done(spa_t *spa)
 {
 	vdev_t *victim, *parent, *grandparent;
 	uint64_t victim_guid, spare_guid, parent_guid, grand_guid;
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 
 	for (;;) {
 		victim = spa_vdev_resilver_done_hunt(spa->spa_root_vdev);
 		if (victim == NULL)
 			break;
 
 		/* collect the guids while the config lock is still held */
 		parent = victim->vdev_parent;
 		grandparent = parent->vdev_parent;
 		victim_guid = victim->vdev_guid;
 		parent_guid = parent->vdev_guid;
 		grand_guid = grandparent->vdev_guid;
 		spare_guid = 0;
 
 		/*
 		 * When a hot spared device has just finished being replaced,
 		 * the original hot spare (the grandparent's second child)
 		 * has to be detached too.
 		 */
 		if (grandparent->vdev_ops == &vdev_spare_ops &&
 		    parent->vdev_id == 0 &&
 		    grandparent->vdev_children == 2) {
 			ASSERT(parent->vdev_ops == &vdev_replacing_ops);
 			spare_guid = grandparent->vdev_child[1]->vdev_guid;
 		}
 
 		/* the config lock is dropped around the detach calls */
 		spa_config_exit(spa, SCL_ALL, FTAG);
 		if (spa_vdev_detach(spa, victim_guid, parent_guid,
 		    B_TRUE) != 0)
 			return;
 		if (spare_guid != 0 &&
 		    spa_vdev_detach(spa, spare_guid, grand_guid, B_TRUE) != 0)
 			return;
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	}
 
 	spa_config_exit(spa, SCL_ALL, FTAG);
 }
 
 /*
  * Store a new path ('ispath' true) or FRU ('ispath' false) string on the
  * leaf vdev identified by 'guid'.  Returns ENOENT if the guid is unknown,
  * ENOTSUP if the vdev is not a leaf, otherwise the result of
  * spa_vdev_state_exit().  The vdev is only passed to the exit call when
  * the stored value actually changed.
  */
 int
 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
     boolean_t ispath)
 {
 	vdev_t *vd;
 	boolean_t changed;
 
 	ASSERT(spa_writeable(spa));
 
 	spa_vdev_state_enter(spa, SCL_ALL);
 
 	vd = spa_lookup_by_guid(spa, guid, B_TRUE);
 	if (vd == NULL)
 		return (spa_vdev_state_exit(spa, NULL, ENOENT));
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
 
 	if (ispath) {
 		changed = (strcmp(value, vd->vdev_path) != 0) ?
 		    B_TRUE : B_FALSE;
 		if (changed) {
 			spa_strfree(vd->vdev_path);
 			vd->vdev_path = spa_strdup(value);
 		}
 	} else {
 		/* unlike the path, the FRU may not have been set yet */
 		changed = (vd->vdev_fru == NULL ||
 		    strcmp(value, vd->vdev_fru) != 0) ? B_TRUE : B_FALSE;
 		if (changed) {
 			if (vd->vdev_fru != NULL)
 				spa_strfree(vd->vdev_fru);
 			vd->vdev_fru = spa_strdup(value);
 		}
 	}
 
 	return (spa_vdev_state_exit(spa, changed ? vd : NULL, 0));
 }
 
 /*
  * Set the stored path of the vdev 'guid' to 'newpath'; thin wrapper around
  * spa_vdev_set_common().
  */
 int
 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
 {
 	int rc;
 
 	rc = spa_vdev_set_common(spa, guid, newpath, B_TRUE);
 	return (rc);
 }
 
 /*
  * Set the stored FRU of the vdev 'guid' to 'newfru'; thin wrapper around
  * spa_vdev_set_common().
  */
 int
 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
 {
 	int rc;
 
 	rc = spa_vdev_set_common(spa, guid, newfru, B_FALSE);
 	return (rc);
 }
 
 /*
  * ==========================================================================
  * SPA Scanning
  * ==========================================================================
  */
 
 /*
  * Cancel the pool's current scan.  Returns EBUSY while a resilver is in
  * progress, otherwise whatever dsl_scan_cancel() returns.
  */
 int
 spa_scan_stop(spa_t *spa)
 {
 	int rc = EBUSY;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
 
 	if (!dsl_scan_resilvering(spa->spa_dsl_pool))
 		rc = dsl_scan_cancel(spa->spa_dsl_pool);
 
 	return (rc);
 }
 
 /*
  * Start the scan 'func' on the pool.  Returns ENOTSUP for POOL_SCAN_NONE or
  * an out-of-range function, 0 when a requested resilver turns out to be
  * unnecessary, otherwise the result of dsl_scan().
  */
 int
 spa_scan(spa_t *spa, pool_scan_func_t func)
 {
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
 
 	if (func == POOL_SCAN_NONE || func >= POOL_SCAN_FUNCS)
 		return (ENOTSUP);
 
 	if (func != POOL_SCAN_RESILVER ||
 	    vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
 		return (dsl_scan(spa->spa_dsl_pool, func));
 
 	/*
 	 * A resilver was asked for, but no writeable leaf device has a
 	 * DTL, so there is no work; just queue the resilver-done task.
 	 */
 	spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
 	return (0);
 }
 
 /*
  * ==========================================================================
  * SPA async task processing
  * ==========================================================================
  */
 
 /*
  * Recursively walk 'vd' and its children, moving every vdev that has
  * vdev_remove_wanted set into the REMOVED state.
  */
 static void
 spa_async_remove(spa_t *spa, vdev_t *vd)
 {
 	int child;
 
 	if (vd->vdev_remove_wanted) {
 		vd->vdev_remove_wanted = B_FALSE;
 		vd->vdev_delayed_close = B_FALSE;
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
 
 		/*
 		 * Only the error counters are reset.  A full vdev_clear()
 		 * would also discard degraded/faulted state and try to
 		 * reopen the device, which is wasted effort here.
 		 */
 		vd->vdev_stat.vs_read_errors = 0;
 		vd->vdev_stat.vs_write_errors = 0;
 		vd->vdev_stat.vs_checksum_errors = 0;
 
 		vdev_state_dirty(vd->vdev_top);
 	}
 
 	child = 0;
 	while (child < vd->vdev_children) {
 		spa_async_remove(spa, vd->vdev_child[child]);
 		child++;
 	}
 }
 
 /*
  * Recursively walk 'vd' and its children, reopening every vdev that has
  * vdev_probe_wanted set.
  */
 static void
 spa_async_probe(spa_t *spa, vdev_t *vd)
 {
 	int child;
 
 	if (vd->vdev_probe_wanted) {
 		vd->vdev_probe_wanted = B_FALSE;
 		/* the probe itself is carried out by vdev_open() */
 		vdev_reopen(vd);
 	}
 
 	child = 0;
 	while (child < vd->vdev_children) {
 		spa_async_probe(spa, vd->vdev_child[child]);
 		child++;
 	}
 }
 
 /*
  * When the pool has autoexpand enabled, walk the tree under 'vd' and post
  * an ESC_ZFS_VDEV_AUTOEXPAND sysevent for every leaf vdev that has a
  * physical path.
  */
 static void
 spa_async_autoexpand(spa_t *spa, vdev_t *vd)
 {
 	sysevent_id_t eid;
 	nvlist_t *attr;
 	char *devpath;
 	int child;
 
 	if (!spa->spa_autoexpand)
 		return;
 
 	for (child = 0; child < vd->vdev_children; child++)
 		spa_async_autoexpand(spa, vd->vdev_child[child]);
 
 	if (vd->vdev_ops->vdev_op_leaf && vd->vdev_physpath != NULL) {
 		devpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
 		(void) snprintf(devpath, MAXPATHLEN, "/devices%s",
 		    vd->vdev_physpath);
 
 		VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 		VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, devpath) == 0);
 
 		(void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
 		    ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP);
 
 		nvlist_free(attr);
 		kmem_free(devpath, MAXPATHLEN);
 	}
 }
 
 /*
  * Body of the per-pool async thread ('arg' is the spa_t).  It takes a
  * snapshot of the pending task bits, clears them, and then services each
  * requested task in turn.  Before exiting it clears spa_async_thread and
  * broadcasts spa_async_cv so that waiters (see spa_async_suspend()) can
  * proceed.
  */
 static void
 spa_async_thread(void *arg)
 {
 	spa_t *spa = arg;
 	int tasks;
 
 	ASSERT(spa->spa_sync_on);
 
 	/* claim all currently pending tasks; later requests are left queued */
 	mutex_enter(&spa->spa_async_lock);
 	tasks = spa->spa_async_tasks;
 	spa->spa_async_tasks = 0;
 	mutex_exit(&spa->spa_async_lock);
 
 	/*
 	 * See if the config needs to be updated.
 	 */
 	if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
 		uint64_t old_space, new_space;
 
 		mutex_enter(&spa_namespace_lock);
 		old_space = metaslab_class_get_space(spa_normal_class(spa));
 		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
 		new_space = metaslab_class_get_space(spa_normal_class(spa));
 		mutex_exit(&spa_namespace_lock);
 
 		/*
 		 * If the pool grew as a result of the config update,
 		 * then log an internal history event.
 		 */
 		if (new_space != old_space) {
 			spa_history_log_internal(LOG_POOL_VDEV_ONLINE,
 			    spa, NULL,
 			    "pool '%s' size: %llu(+%llu)",
 			    spa_name(spa), new_space, new_space - old_space);
 		}
 	}
 
 	/*
 	 * See if any devices need to be marked REMOVED.
 	 */
 	if (tasks & SPA_ASYNC_REMOVE) {
 		spa_vdev_state_enter(spa, SCL_NONE);
 		spa_async_remove(spa, spa->spa_root_vdev);
 		/* cache devices and spares live outside the root vdev tree */
 		for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
 			spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
 		for (int i = 0; i < spa->spa_spares.sav_count; i++)
 			spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
 		(void) spa_vdev_state_exit(spa, NULL, 0);
 	}
 
 	/*
 	 * Post autoexpand events, but not while the pool is suspended.
 	 */
 	if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 		spa_async_autoexpand(spa, spa->spa_root_vdev);
 		spa_config_exit(spa, SCL_CONFIG, FTAG);
 	}
 
 	/*
 	 * See if any devices need to be probed.
 	 */
 	if (tasks & SPA_ASYNC_PROBE) {
 		spa_vdev_state_enter(spa, SCL_NONE);
 		spa_async_probe(spa, spa->spa_root_vdev);
 		(void) spa_vdev_state_exit(spa, NULL, 0);
 	}
 
 	/*
 	 * If any devices are done replacing, detach them.
 	 */
 	if (tasks & SPA_ASYNC_RESILVER_DONE)
 		spa_vdev_resilver_done(spa);
 
 	/*
 	 * Kick off a resilver.
 	 */
 	if (tasks & SPA_ASYNC_RESILVER)
 		dsl_resilver_restart(spa->spa_dsl_pool, 0);
 
 	/*
 	 * Let the world know that we're done.
 	 */
 	mutex_enter(&spa->spa_async_lock);
 	spa->spa_async_thread = NULL;
 	cv_broadcast(&spa->spa_async_cv);
 	mutex_exit(&spa->spa_async_lock);
 	thread_exit();
 }
 
 /*
  * Bump the pool's async suspend count and wait until no async thread is
  * registered in spa_async_thread.
  */
 void
 spa_async_suspend(spa_t *spa)
 {
 	mutex_enter(&spa->spa_async_lock);
 	spa->spa_async_suspended += 1;
 	for (;;) {
 		if (spa->spa_async_thread == NULL)
 			break;
 		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
 	}
 	mutex_exit(&spa->spa_async_lock);
 }
 
 /*
  * Drop one reference from the pool's async suspend count.  The count must
  * be non-zero on entry.
  */
 void
 spa_async_resume(spa_t *spa)
 {
 	mutex_enter(&spa->spa_async_lock);
 	ASSERT(spa->spa_async_suspended != 0);
 	spa->spa_async_suspended -= 1;
 	mutex_exit(&spa->spa_async_lock);
 }
 
 /*
  * Create the pool's async thread if there are tasks pending, async
  * processing is not suspended, no thread is registered yet, and a root
  * directory exists that is not read-only.
  */
 static void
 spa_async_dispatch(spa_t *spa)
 {
 	mutex_enter(&spa->spa_async_lock);
 	if (spa->spa_async_tasks) {
 		if (!spa->spa_async_suspended &&
 		    spa->spa_async_thread == NULL) {
 			if (rootdir != NULL && !vn_is_readonly(rootdir)) {
 				spa->spa_async_thread = thread_create(NULL, 0,
 				    spa_async_thread, spa, 0, &p0, TS_RUN,
 				    maxclsyspri);
 			}
 		}
 	}
 	mutex_exit(&spa->spa_async_lock);
 }
 
 /*
  * Record the async task bit(s) 'task' as pending for the pool.  This only
  * sets the bits; no thread is started here.
  */
 void
 spa_async_request(spa_t *spa, int task)
 {
 	zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
 
 	mutex_enter(&spa->spa_async_lock);
 	spa->spa_async_tasks = spa->spa_async_tasks | task;
 	mutex_exit(&spa->spa_async_lock);
 }
 
 /*
  * ==========================================================================
  * SPA syncing routines
  * ==========================================================================
  */
 
 /*
  * Callback adapter: 'arg' is a bpobj_t onto which 'bp' is enqueued in 'tx'.
  * Always returns 0.
  */
 static int
 bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	bpobj_t *target = arg;
 
 	bpobj_enqueue(target, bp, tx);
 
 	return (0);
 }
 
 /*
  * Callback adapter: 'arg' is a parent zio under which a free of 'bp' is
  * issued without waiting, using the txg of 'tx'.  Always returns 0.
  */
 static int
 spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	zio_t *parent = arg;
 	zio_t *free_zio;
 
 	free_zio = zio_free_sync(parent, parent->io_spa, dmu_tx_get_txg(tx),
 	    bp, BP_GET_PSIZE(bp), parent->io_flags);
 	zio_nowait(free_zio);
 
 	return (0);
 }
 
 /*
  * Pack 'nv' in XDR encoding and write it to MOS object 'obj' in 'tx'.  The
  * packed size is stored in the object's bonus buffer.
  */
 static void
 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
 {
 	dmu_buf_t *bonus;
 	char *buf = NULL;
 	size_t packed_len = 0;
 	size_t buf_len;
 
 	VERIFY(nvlist_size(nv, &packed_len, NV_ENCODE_XDR) == 0);
 
 	/*
 	 * The buffer is rounded up to whole SPA_CONFIG_BLOCKSIZE blocks so
 	 * that only full blocks are written.  That sidesteps the
 	 * dbuf_will_dirty() path and the pre-read of data nobody needs.
 	 */
 	buf_len = P2ROUNDUP((uint64_t)packed_len, SPA_CONFIG_BLOCKSIZE);
 	buf = kmem_alloc(buf_len, KM_SLEEP);
 
 	VERIFY(nvlist_pack(nv, &buf, &packed_len, NV_ENCODE_XDR,
 	    KM_SLEEP) == 0);
 	/* zero the padding past the packed data */
 	bzero(buf + packed_len, buf_len - packed_len);
 
 	dmu_write(spa->spa_meta_objset, obj, 0, buf_len, buf, tx);
 
 	kmem_free(buf, buf_len);
 
 	/* record the real (unpadded) length in the bonus buffer */
 	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &bonus));
 	dmu_buf_will_dirty(bonus, tx);
 	*(uint64_t *)bonus->db_data = packed_len;
 	dmu_buf_rele(bonus, FTAG);
 }
 
 /*
  * Write the auxiliary device list 'sav' to the MOS if it is flagged for
  * syncing.  'config' is the nvlist key the device array is stored under and
  * 'entry' the pool directory entry naming the backing object, which is
  * allocated on first use.
  */
 static void
 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
     const char *config, const char *entry)
 {
 	nvlist_t *nvroot;
 	nvlist_t **devs;
 	int idx;
 
 	if (!sav->sav_sync)
 		return;
 
 	/*
 	 * Refresh the MOS nvlist that describes the available devices.
 	 * By this point spa_validate_aux() has checked that the nvlist is
 	 * valid and that the vdevs carry appropriate labels.
 	 */
 	if (sav->sav_object == 0) {
 		sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
 		    DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
 		    sizeof (uint64_t), tx);
 		VERIFY(zap_update(spa->spa_meta_objset,
 		    DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
 		    &sav->sav_object, tx) == 0);
 	}
 
 	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 	if (sav->sav_count != 0) {
 		devs = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
 		for (idx = 0; idx < sav->sav_count; idx++) {
 			devs[idx] = vdev_config_generate(spa,
 			    sav->sav_vdevs[idx], B_FALSE,
 			    VDEV_CONFIG_L2CACHE);
 		}
 		VERIFY(nvlist_add_nvlist_array(nvroot, config, devs,
 		    sav->sav_count) == 0);
 		/* the generated configs are no longer needed once added */
 		for (idx = 0; idx < sav->sav_count; idx++)
 			nvlist_free(devs[idx]);
 		kmem_free(devs, sav->sav_count * sizeof (void *));
 	} else {
 		VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
 	}
 
 	spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
 	nvlist_free(nvroot);
 
 	sav->sav_sync = B_FALSE;
 }
 
 /*
  * If any vdev config is dirty, generate a fresh pool config for the txg of
  * 'tx', remember it in spa_config_syncing (releasing any previous one) and
  * write it to the pool's config object.
  */
 static void
 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
 {
 	nvlist_t *fresh;
 
 	if (list_is_empty(&spa->spa_config_dirty_list))
 		return;
 
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 
 	fresh = spa_config_generate(spa, spa->spa_root_vdev,
 	    dmu_tx_get_txg(tx), B_FALSE);
 
 	/*
 	 * During a spa version upgrade the config object has to pick up
 	 * the version that is being upgraded to.
 	 */
 	if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) {
 		fnvlist_add_uint64(fresh, ZPOOL_CONFIG_VERSION,
 		    spa->spa_uberblock.ub_version);
 	}
 
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
 	if (spa->spa_config_syncing)
 		nvlist_free(spa->spa_config_syncing);
 	spa->spa_config_syncing = fresh;
 
 	spa_sync_nvlist(spa, spa->spa_config_object, fresh, tx);
 }
 
 /*
  * Set the pool's uberblock version.  'arg1' is the spa_t and 'arg2' points
  * at the new uint64_t version, which must lie between the pool's current
  * version and SPA_VERSION.  The root vdev's config is dirtied afterwards.
  */
 static void
 spa_sync_version(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	spa_t *spa = arg1;
 	uint64_t new_version = *(uint64_t *)arg2;
 
 	/*
 	 * Pool creation sets the version through a special-cased path,
 	 * so this must not be the initial txg.
 	 */
 	ASSERT(tx->tx_txg != TXG_INITIAL);
 
 	ASSERT(new_version <= SPA_VERSION);
 	ASSERT(new_version >= spa_version(spa));
 
 	spa->spa_uberblock.ub_version = new_version;
 	vdev_config_dirty(spa->spa_root_vdev);
 }
 
 /*
  * Set zpool properties.
  *
  * 'arg1' is the spa_t, 'arg2' the nvlist of properties to apply and 'tx'
  * the transaction they are applied in.  Feature properties are enabled,
  * the comment is stored on the spa, and all remaining persistent
  * properties are written to the pool-props ZAP object (created on first
  * use); selected values are also cached in the spa_t.  Unless this is the
  * initial txg, a history record is logged for every pair.
  */
 static void
 spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	spa_t *spa = arg1;
 	objset_t *mos = spa->spa_meta_objset;
 	nvlist_t *nvp = arg2;
 	nvpair_t *elem = NULL;
 
 	mutex_enter(&spa->spa_props_lock);
 
 	while ((elem = nvlist_next_nvpair(nvp, elem))) {
 		/*
 		 * intval is only assigned for uint64-valued pairs, yet it is
 		 * always passed to the history record at the bottom of the
 		 * loop.  Start every pair from 0 so that string-valued and
 		 * feature pairs log a defined value instead of an
 		 * uninitialized one.
 		 */
 		uint64_t intval = 0;
 		char *strval, *fname;
 		zpool_prop_t prop;
 		const char *propname;
 		zprop_type_t proptype;
 		zfeature_info_t *feature;
 
 		switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
 		case ZPROP_INVAL:
 			/*
 			 * We checked this earlier in spa_prop_validate().
 			 */
 			ASSERT(zpool_prop_feature(nvpair_name(elem)));
 
 			/* the feature name follows the '@' */
 			fname = strchr(nvpair_name(elem), '@') + 1;
 			VERIFY3U(0, ==, zfeature_lookup_name(fname, &feature));
 
 			spa_feature_enable(spa, feature, tx);
 			break;
 
 		case ZPOOL_PROP_VERSION:
 			VERIFY(nvpair_value_uint64(elem, &intval) == 0);
 			/*
 			 * The version is synced separately before other
 			 * properties and should be correct by now.
 			 */
 			ASSERT3U(spa_version(spa), >=, intval);
 			break;
 
 		case ZPOOL_PROP_ALTROOT:
 			/*
 			 * 'altroot' is a non-persistent property. It should
 			 * have been set temporarily at creation or import time.
 			 */
 			ASSERT(spa->spa_root != NULL);
 			break;
 
 		case ZPOOL_PROP_READONLY:
 		case ZPOOL_PROP_CACHEFILE:
 			/*
 			 * 'readonly' and 'cachefile' are also non-persistent
 			 * properties.
 			 */
 			break;
 		case ZPOOL_PROP_COMMENT:
 			VERIFY(nvpair_value_string(elem, &strval) == 0);
 			if (spa->spa_comment != NULL)
 				spa_strfree(spa->spa_comment);
 			spa->spa_comment = spa_strdup(strval);
 			/*
 			 * We need to dirty the configuration on all the vdevs
 			 * so that their labels get updated.  It's unnecessary
 			 * to do this for pool creation since the vdev's
 			 * configuration has already been dirtied.
 			 */
 			if (tx->tx_txg != TXG_INITIAL)
 				vdev_config_dirty(spa->spa_root_vdev);
 			break;
 		default:
 			/*
 			 * Set pool property values in the poolprops mos object.
 			 */
 			if (spa->spa_pool_props_object == 0) {
 				spa->spa_pool_props_object =
 				    zap_create_link(mos, DMU_OT_POOL_PROPS,
 				    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
 				    tx);
 			}
 
 			/* normalize the property name */
 			propname = zpool_prop_to_name(prop);
 			proptype = zpool_prop_get_type(prop);
 
 			if (nvpair_type(elem) == DATA_TYPE_STRING) {
 				ASSERT(proptype == PROP_TYPE_STRING);
 				VERIFY(nvpair_value_string(elem, &strval) == 0);
 				VERIFY(zap_update(mos,
 				    spa->spa_pool_props_object, propname,
 				    1, strlen(strval) + 1, strval, tx) == 0);
 
 			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
 				VERIFY(nvpair_value_uint64(elem, &intval) == 0);
 
 				/* an index value must map to a known string */
 				if (proptype == PROP_TYPE_INDEX) {
 					const char *unused;
 					VERIFY(zpool_prop_index_to_string(
 					    prop, intval, &unused) == 0);
 				}
 				VERIFY(zap_update(mos,
 				    spa->spa_pool_props_object, propname,
 				    8, 1, &intval, tx) == 0);
 			} else {
 				ASSERT(0); /* not allowed */
 			}
 
 			/* mirror selected values into the in-core spa */
 			switch (prop) {
 			case ZPOOL_PROP_DELEGATION:
 				spa->spa_delegation = intval;
 				break;
 			case ZPOOL_PROP_BOOTFS:
 				spa->spa_bootfs = intval;
 				break;
 			case ZPOOL_PROP_FAILUREMODE:
 				spa->spa_failmode = intval;
 				break;
 			case ZPOOL_PROP_AUTOEXPAND:
 				spa->spa_autoexpand = intval;
 				if (tx->tx_txg != TXG_INITIAL)
 					spa_async_request(spa,
 					    SPA_ASYNC_AUTOEXPAND);
 				break;
 			case ZPOOL_PROP_DEDUPDITTO:
 				spa->spa_dedup_ditto = intval;
 				break;
 			default:
 				break;
 			}
 		}
 
 		/*
 		 * log internal history if this is not a zpool create
 		 *
 		 * The cast makes the argument type match the %lld
 		 * conversion.
 		 */
 		if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY &&
 		    tx->tx_txg != TXG_INITIAL) {
 			spa_history_log_internal(LOG_POOL_PROPSET,
 			    spa, tx, "%s %lld %s",
 			    nvpair_name(elem), (long long)intval,
 			    spa_name(spa));
 		}
 	}
 
 	mutex_exit(&spa->spa_props_lock);
 }
 
 /*
  * Perform one-time upgrade on-disk changes.  spa_version() does not
  * reflect the new version this txg, so there must be no changes this
  * txg to anything that the upgrade code depends on after it executes.
  * Therefore this must be called after dsl_pool_sync() does the sync
  * tasks.
  */
 static void
 spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 
 	ASSERT(spa->spa_sync_pass == 1);
 
 	if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
 	    spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
 		dsl_pool_create_origin(dp, tx);
 
 		/* Keeping the origin open increases spa_minref */
 		spa->spa_minref += 3;
 	}
 
 	if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
 	    spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
 		dsl_pool_upgrade_clones(dp, tx);
 	}
 
 	if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
 	    spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
 		dsl_pool_upgrade_dir_clones(dp, tx);
 
 		/* Keeping the freedir open increases spa_minref */
 		spa->spa_minref += 3;
 	}
 
 	if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
 	    spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
 		spa_feature_create_zap_objects(spa, tx);
 	}
 }
 
 /*
  * Sync the specified transaction group.  New blocks may be dirtied as
  * part of the process, so we iterate until it converges.
  *
  * Outline, in the order performed below:
  *   1. Take SCL_CONFIG as reader for the duration of the sync and turn
  *      pending vdev state changes into config changes.
  *   2. Create the tx assigned to 'txg', handle the one-time
  *      SPA_VERSION_RAIDZ_DEFLATE upgrade, and push the deferred frees if
  *      this txg has work or someone is waiting on it.
  *   3. Run sync passes until the MOS is no longer dirty in this txg.
  *   4. Write the vdev config (and uberblock) with vdev_config_sync(),
  *      suspending and retrying until it succeeds, then commit the tx.
  *   5. Clean the dirty config list, publish the new config, record the
  *      synced uberblock, and dispatch any requested async tasks.
  *
  *	spa - pool to sync; VERIFY'd to be writeable
  *	txg - transaction group being synced
  */
 void
 spa_sync(spa_t *spa, uint64_t txg)
 {
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 	objset_t *mos = spa->spa_meta_objset;
 	bpobj_t *defer_bpo = &spa->spa_deferred_bpobj;
 	bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *vd;
 	dmu_tx_t *tx;
 	int error;
 
 	VERIFY(spa_writeable(spa));
 
 	/*
 	 * Lock out configuration changes.
 	 */
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 
 	spa->spa_syncing_txg = txg;
 	spa->spa_sync_pass = 0;
 
 	/*
 	 * If there are any pending vdev state changes, convert them
 	 * into config changes that go out with this transaction group.
 	 */
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 	while (list_head(&spa->spa_state_dirty_list) != NULL) {
 		/*
 		 * We need the write lock here because, for aux vdevs,
 		 * calling vdev_config_dirty() modifies sav_config.
 		 * This is ugly and will become unnecessary when we
 		 * eliminate the aux vdev wart by integrating all vdevs
 		 * into the root vdev tree.
 		 */
 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
 		while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
 			vdev_state_clean(vd);
 			vdev_config_dirty(vd);
 		}
 		/*
 		 * Go back to reader; the outer loop re-checks the list in
 		 * case it was repopulated while the locks were dropped.
 		 */
 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
 	}
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
 	tx = dmu_tx_create_assigned(dp, txg);
 
 	/*
 	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
 	 * set spa_deflate if we have no raid-z vdevs.
 	 */
 	if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
 	    spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
 		int i;
 
 		/* stop at the first child whose ratio is not the minimum */
 		for (i = 0; i < rvd->vdev_children; i++) {
 			vd = rvd->vdev_child[i];
 			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
 				break;
 		}
 		/* the loop ran to completion: every child qualified */
 		if (i == rvd->vdev_children) {
 			spa->spa_deflate = TRUE;
 			VERIFY(0 == zap_add(spa->spa_meta_objset,
 			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
 			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
 		}
 	}
 
 	/*
 	 * If anything has changed in this txg, or if someone is waiting
 	 * for this txg to sync (eg, spa_vdev_remove()), push the
 	 * deferred frees from the previous txg.  If not, leave them
 	 * alone so that we don't generate work on an otherwise idle
 	 * system.
 	 */
 	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
 	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
 	    !txg_list_empty(&dp->dp_sync_tasks, txg) ||
 	    ((dsl_scan_active(dp->dp_scan) ||
 	    txg_sync_waiting(dp)) && !spa_shutting_down(spa))) {
 		zio_t *zio = zio_root(spa, NULL, NULL, 0);
 		VERIFY3U(bpobj_iterate(defer_bpo,
 		    spa_free_sync_cb, zio, tx), ==, 0);
 		VERIFY0(zio_wait(zio));
 	}
 
 	/*
 	 * Iterate to convergence.
 	 */
 	do {
 		int pass = ++spa->spa_sync_pass;
 
 		spa_sync_config_object(spa, tx);
 		spa_sync_aux_dev(spa, &spa->spa_spares, tx,
 		    ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
 		spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
 		    ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
 		spa_errlog_sync(spa, txg);
 		dsl_pool_sync(dp, txg);
 
 		/*
 		 * In early passes this txg's frees are issued right away;
 		 * from pass zfs_sync_pass_deferred_free onward they are
 		 * queued on the deferred bpobj instead.
 		 */
 		if (pass < zfs_sync_pass_deferred_free) {
 			zio_t *zio = zio_root(spa, NULL, NULL, 0);
 			bplist_iterate(free_bpl, spa_free_sync_cb,
 			    zio, tx);
 			VERIFY(zio_wait(zio) == 0);
 		} else {
 			bplist_iterate(free_bpl, bpobj_enqueue_cb,
 			    defer_bpo, tx);
 		}
 
 		ddt_sync(spa, txg);
 		dsl_scan_sync(dp, tx);
 
 		/* sync every vdev queued on this txg's vdev list */
 		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
 			vdev_sync(vd, txg);
 
 		/* one-time upgrade work runs in the first pass only */
 		if (pass == 1)
 			spa_sync_upgrades(spa, tx);
 
 	} while (dmu_objset_is_dirty(mos, txg));
 
 	/*
 	 * Rewrite the vdev configuration (which includes the uberblock)
 	 * to commit the transaction group.
 	 *
 	 * If there are no dirty vdevs, we sync the uberblock to a few
 	 * random top-level vdevs that are known to be visible in the
 	 * config cache (see spa_vdev_add() for a complete description).
 	 * If there *are* dirty vdevs, sync the uberblock to all vdevs.
 	 */
 	for (;;) {
 		/*
 		 * We hold SCL_STATE to prevent vdev open/close/etc.
 		 * while we're attempting to write the vdev labels.
 		 */
 		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 
 		if (list_is_empty(&spa->spa_config_dirty_list)) {
 			vdev_t *svd[SPA_DVAS_PER_BP];
 			int svdcount = 0;
 			int children = rvd->vdev_children;
 			int c0 = spa_get_random(children);
 
 			/*
 			 * Starting at a random child, collect up to
 			 * SPA_DVAS_PER_BP top-level vdevs, skipping log
 			 * devices and those with no vdev_ms_array.
 			 */
 			for (int c = 0; c < children; c++) {
 				vd = rvd->vdev_child[(c0 + c) % children];
 				if (vd->vdev_ms_array == 0 || vd->vdev_islog)
 					continue;
 				svd[svdcount++] = vd;
 				if (svdcount == SPA_DVAS_PER_BP)
 					break;
 			}
 			/* on failure, try once more with the last flag set */
 			error = vdev_config_sync(svd, svdcount, txg, B_FALSE);
 			if (error != 0)
 				error = vdev_config_sync(svd, svdcount, txg,
 				    B_TRUE);
 		} else {
 			error = vdev_config_sync(rvd->vdev_child,
 			    rvd->vdev_children, txg, B_FALSE);
 			if (error != 0)
 				error = vdev_config_sync(rvd->vdev_child,
 				    rvd->vdev_children, txg, B_TRUE);
 		}
 
 		if (error == 0)
 			spa->spa_last_synced_guid = rvd->vdev_guid;
 
 		spa_config_exit(spa, SCL_STATE, FTAG);
 
 		if (error == 0)
 			break;
 		/* the write failed: suspend, wait for resume, and retry */
 		zio_suspend(spa, NULL);
 		zio_resume_wait(spa);
 	}
 	dmu_tx_commit(tx);
 
 	/*
 	 * Clear the dirty config list.
 	 */
 	while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
 		vdev_config_clean(vd);
 
 	/*
 	 * Now that the new config has synced transactionally,
 	 * let it become visible to the config cache.
 	 */
 	if (spa->spa_config_syncing != NULL) {
 		spa_config_set(spa, spa->spa_config_syncing);
 		spa->spa_config_txg = txg;
 		spa->spa_config_syncing = NULL;
 	}
 
 	/* the in-core uberblock is now the last synced uberblock */
 	spa->spa_ubsync = spa->spa_uberblock;
 
 	dsl_pool_sync_done(dp, txg);
 
 	/*
 	 * Update usable space statistics.
 	 */
 	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
 		vdev_sync_done(vd, txg);
 
 	spa_update_dspace(spa);
 
 	/*
 	 * It had better be the case that we didn't dirty anything
 	 * since vdev_config_sync().
 	 */
 	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
 	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
 	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
 
 	spa->spa_sync_pass = 0;
 
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 	spa_handle_ignored_writes(spa);
 
 	/*
 	 * If any async tasks have been requested, kick them off.
 	 */
 	spa_async_dispatch(spa);
 }
 
 /*
  * Wait for a sync on every pool that is active, writeable and not
  * suspended.  The namespace lock is not held across the wait: each pool
  * is pinned with a reference, the lock is dropped for the duration of
  * txg_wait_synced(), and then retaken before moving to the next pool.
  */
 void
 spa_sync_allpools(void)
 {
 	spa_t *pool;
 
 	mutex_enter(&spa_namespace_lock);
 	for (pool = spa_next(NULL); pool != NULL; pool = spa_next(pool)) {
 		if (spa_state(pool) == POOL_STATE_ACTIVE &&
 		    spa_writeable(pool) && !spa_suspended(pool)) {
 			spa_open_ref(pool, FTAG);
 			mutex_exit(&spa_namespace_lock);
 			txg_wait_synced(spa_get_dsl(pool), 0);
 			mutex_enter(&spa_namespace_lock);
 			spa_close(pool, FTAG);
 		}
 	}
 	mutex_exit(&spa_namespace_lock);
 }
 
 /*
  * ==========================================================================
  * Miscellaneous routines
  * ==========================================================================
  */
 
 /*
  * Tear down and remove every pool known to the system.
  */
 void
 spa_evict_all(void)
 {
 	spa_t *victim;
 
 	/*
 	 * Discard all cached state.  Every pool should already be closed,
 	 * so nothing left in the AVL tree should still be referenced.
 	 */
 	mutex_enter(&spa_namespace_lock);
 	for (;;) {
 		victim = spa_next(NULL);
 		if (victim == NULL)
 			break;
 
 		/*
 		 * Quiesce async tasks first.  The async thread may have to
 		 * detach a replaced device, and that path takes
 		 * spa_namespace_lock, so the lock is released around the
 		 * suspend while a reference keeps the pool in place.
 		 */
 		spa_open_ref(victim, FTAG);
 		mutex_exit(&spa_namespace_lock);
 		spa_async_suspend(victim);
 		mutex_enter(&spa_namespace_lock);
 		spa_close(victim, FTAG);
 
 		if (victim->spa_state != POOL_STATE_UNINITIALIZED) {
 			spa_unload(victim);
 			spa_deactivate(victim);
 		}
 		spa_remove(victim);
 	}
 	mutex_exit(&spa_namespace_lock);
 }
 
 /*
  * Find the vdev with the given guid.  The root vdev tree is searched
  * first; when 'aux' is set and nothing was found there, the l2cache
  * devices and then the spares are checked as well.  Returns NULL if no
  * vdev matches.
  */
 vdev_t *
 spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
 {
 	vdev_t *found;
 	int idx;
 
 	found = vdev_lookup_by_guid(spa->spa_root_vdev, guid);
 	if (found != NULL || !aux)
 		return (found);
 
 	for (idx = 0; idx < spa->spa_l2cache.sav_count; idx++) {
 		if (spa->spa_l2cache.sav_vdevs[idx]->vdev_guid == guid)
 			return (spa->spa_l2cache.sav_vdevs[idx]);
 	}
 
 	for (idx = 0; idx < spa->spa_spares.sav_count; idx++) {
 		if (spa->spa_spares.sav_vdevs[idx]->vdev_guid == guid)
 			return (spa->spa_spares.sav_vdevs[idx]);
 	}
 
 	return (NULL);
 }
 
 /*
  * Raise the pool's version to 'version' and wait for the change to sync.
  *
  *	spa     - pool to upgrade; asserted to be writeable
  *	version - new version; asserted to be no lower than the version in
  *		  the in-core uberblock
  *
  * The in-core uberblock version is updated and the root vdev's config is
  * dirtied while SCL_ALL is held as writer.  After the lock is dropped the
  * function blocks in txg_wait_synced().
  */
 void
 spa_upgrade(spa_t *spa, uint64_t version)
 {
 	ASSERT(spa_writeable(spa));
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 
 	/*
 	 * This should only be called for a non-faulted pool, and since a
 	 * future version would result in an unopenable pool, this shouldn't be
 	 * possible.
 	 */
 	ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION);
 	ASSERT(version >= spa->spa_uberblock.ub_version);
 
 	spa->spa_uberblock.ub_version = version;
 	/* dirty the root vdev's config so the new version gets written out */
 	vdev_config_dirty(spa->spa_root_vdev);
 
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	txg_wait_synced(spa_get_dsl(spa), 0);
 }
 
 /*
  * Report whether 'guid' names one of this pool's spares, looking at both
  * the spare vdevs in sav_vdevs and the pending entries in sav_pending.
  */
 boolean_t
 spa_has_spare(spa_t *spa, uint64_t guid)
 {
 	spa_aux_vdev_t *spares = &spa->spa_spares;
 	uint64_t pending_guid;
 	int n;
 
 	/* spares that already have a vdev */
 	for (n = 0; n < spares->sav_count; n++) {
 		if (spares->sav_vdevs[n]->vdev_guid == guid)
 			return (B_TRUE);
 	}
 
 	/* pending entries are nvlists; compare their ZPOOL_CONFIG_GUID */
 	for (n = 0; n < spares->sav_npending; n++) {
 		if (nvlist_lookup_uint64(spares->sav_pending[n],
 		    ZPOOL_CONFIG_GUID, &pending_guid) != 0)
 			continue;
 		if (pending_guid == guid)
 			return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Determine whether any of this pool's spares is an active shared spare.
  * Note: an active spare has a reference count of 2 (once as a spare and
  * once as a replace); the check below looks for a count above that on a
  * spare whose recorded pool is this one.
  */
 static boolean_t
 spa_has_active_shared_spare(spa_t *spa)
 {
 	spa_aux_vdev_t *spares = &spa->spa_spares;
 	uint64_t owner;
 	int holds;
 	int idx;
 
 	for (idx = 0; idx < spares->sav_count; idx++) {
 		if (!spa_spare_exists(spares->sav_vdevs[idx]->vdev_guid,
 		    &owner, &holds))
 			continue;
 		if (owner == 0ULL || owner != spa_guid(spa))
 			continue;
 		if (holds > 2)
 			return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Post a sysevent corresponding to the given event.  The 'name' must be one of
  * the event definitions in sys/sysevent/eventdefs.h.  The payload will be
  * filled in from the spa and (optionally) the vdev.  This doesn't do anything
  * in the userland libzpool, as we don't want consumers to misinterpret ztest
  * or zdb as real changes.
  *
  *	spa  - pool the event is about; supplies the pool name and guid
  *	vd   - optional vdev (may be NULL); supplies the vdev guid and, when
  *	       vdev_path is set, the vdev path
  *	name - event name passed to sysevent_alloc()
  *
  * Any failure while building or attaching the attribute list abandons the
  * event silently; nothing is reported to the caller.
  */
 void
 spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
 {
 #ifdef _KERNEL
 	sysevent_t		*ev;
 	sysevent_attr_list_t	*attr = NULL;
 	sysevent_value_t	value;
 	sysevent_id_t		eid;
 
 	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
 	    SE_SLEEP);
 
 	/* pool name */
 	value.value_type = SE_DATA_TYPE_STRING;
 	value.value.sv_string = spa_name(spa);
 	if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
 		goto done;
 
 	/* pool guid */
 	value.value_type = SE_DATA_TYPE_UINT64;
 	value.value.sv_uint64 = spa_guid(spa);
 	if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
 		goto done;
 
 	/* vdev guid and path, only when a vdev was supplied */
 	if (vd) {
 		value.value_type = SE_DATA_TYPE_UINT64;
 		value.value.sv_uint64 = vd->vdev_guid;
 		if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
 		    SE_SLEEP) != 0)
 			goto done;
 
 		if (vd->vdev_path) {
 			value.value_type = SE_DATA_TYPE_STRING;
 			value.value.sv_string = vd->vdev_path;
 			if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
 			    &value, SE_SLEEP) != 0)
 				goto done;
 		}
 	}
 
 	if (sysevent_attach_attributes(ev, attr) != 0)
 		goto done;
 	/*
 	 * Clear attr after a successful attach so the cleanup below does not
 	 * free the list separately (presumably the event now owns it).
 	 */
 	attr = NULL;
 
 	(void) log_sysevent(ev, SE_SLEEP, &eid);
 
 done:
 	/* reached on success and on failure alike */
 	if (attr)
 		sysevent_free_attr(attr);
 	sysevent_free(ev);
 #endif
 }
Index: user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
===================================================================
--- user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c	(revision 247191)
+++ user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c	(revision 247192)
@@ -1,2172 +1,2173 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/vdev_impl.h>
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
 #include <sys/fs/zfs.h>
 #include <sys/fm/fs/zfs.h>
 
 /*
  * Virtual device vector for RAID-Z.
  *
  * This vdev supports single, double, and triple parity. For single parity,
  * we use a simple XOR of all the data columns. For double or triple parity,
  * we use a special case of Reed-Solomon coding. This extends the
  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
  * former is also based. The latter is designed to provide higher performance
  * for writes.
  *
  * Note that the Plank paper claimed to support arbitrary N+M, but was then
  * amended six years later identifying a critical flaw that invalidates its
  * claims. Nevertheless, the technique can be adapted to work for up to
  * triple parity. For additional parity, the amendment "Note: Correction to
  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
  * is viable, but the additional complexity means that write performance will
  * suffer.
  *
  * All of the methods above operate on a Galois field, defined over the
  * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements
  * can be expressed with a single byte. Briefly, the operations on the
  * field are defined as follows:
  *
  *   o addition (+) is represented by a bitwise XOR
  *   o subtraction (-) is therefore identical to addition: A + B = A - B
  *   o multiplication of A by 2 is defined by the following bitwise expression:
  *	(A * 2)_7 = A_6
  *	(A * 2)_6 = A_5
  *	(A * 2)_5 = A_4
  *	(A * 2)_4 = A_3 + A_7
  *	(A * 2)_3 = A_2 + A_7
  *	(A * 2)_2 = A_1 + A_7
  *	(A * 2)_1 = A_0
  *	(A * 2)_0 = A_7
  *
  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
  * As an aside, this multiplication is derived from the error correcting
  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
  *
  * Observe that any number in the field (except for 0) can be expressed as a
  * power of 2 -- a generator for the field. We store a table of the powers of
  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
  * than field addition). The inverse of a field element A (A^-1) is therefore
  * A ^ (255 - 1) = A^254.
  *
  * The up-to-three parity columns, P, Q, R over several data columns,
  * D_0, ... D_n-1, can be expressed by field operations:
  *
  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
  *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
  *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
  *
  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
  * independent coefficients. (There are no additional coefficients that have
  * this property which is why the uncorrected Plank method breaks down.)
  *
  * See the reconstruction code below for how P, Q and R can be used individually
  * or in concert to recover missing data columns.
  */
 
 /*
  * State for one column of a raidz_map_t: which child device and offset the
  * column maps to, the buffer used for the I/O, and what happened to it.
  */
 typedef struct raidz_col {
 	uint64_t rc_devidx;		/* child device index for I/O */
 	uint64_t rc_offset;		/* device offset */
 	uint64_t rc_size;		/* I/O size */
 	void *rc_data;			/* I/O data */
 	void *rc_gdata;			/* used to store the "good" version */
 	int rc_error;			/* I/O error for this device */
 	uint8_t rc_tried;		/* Did we attempt this I/O column? */
 	uint8_t rc_skipped;		/* Did we skip this I/O column? */
 } raidz_col_t;
 
 /*
  * Describes how one zio is laid out across the raidz children.  Columns
  * [0, rm_firstdatacol) are parity; columns [rm_firstdatacol, rm_cols) are
  * data.
  *
  * rm_col is declared with one element but the map is allocated (and freed)
  * with offsetof(raidz_map_t, rm_col[rm_scols]), so it really holds rm_scols
  * entries.
  *
  * The map is freed only once rm_freed is set and rm_reports has dropped to
  * zero (see vdev_raidz_map_free_vsd() and vdev_raidz_cksum_free()).
  */
 typedef struct raidz_map {
 	uint64_t rm_cols;		/* Regular column count */
 	uint64_t rm_scols;		/* Count including skipped columns */
 	uint64_t rm_bigcols;		/* Number of oversized columns */
 	uint64_t rm_asize;		/* Actual total I/O size */
 	uint64_t rm_missingdata;	/* Count of missing data devices */
 	uint64_t rm_missingparity;	/* Count of missing parity devices */
 	uint64_t rm_firstdatacol;	/* First data column/parity count */
 	uint64_t rm_nskip;		/* Skipped sectors for padding */
 	uint64_t rm_skipstart;	/* Column index of padding start */
 	void *rm_datacopy;		/* rm_asize-buffer of copied data */
 	uintptr_t rm_reports;		/* # of referencing checksum reports */
 	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
 	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
 } raidz_map_t;
 
 #define	VDEV_RAIDZ_P		0
 #define	VDEV_RAIDZ_Q		1
 #define	VDEV_RAIDZ_R		2
 
 #define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
 #define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
 
 /*
  * We provide a mechanism to perform the field multiplication operation on a
  * 64-bit value all at once rather than a byte at a time. This works by
  * creating a mask from the top bit in each byte and using that to
  * conditionally apply the XOR of 0x1d.
  */
 #define	VDEV_RAIDZ_64MUL_2(x, mask) \
 { \
 	(mask) = (x) & 0x8080808080808080ULL; \
 	(mask) = ((mask) << 1) - ((mask) >> 7); \
 	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
 	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
 }
 
 #define	VDEV_RAIDZ_64MUL_4(x, mask) \
 { \
 	VDEV_RAIDZ_64MUL_2((x), mask); \
 	VDEV_RAIDZ_64MUL_2((x), mask); \
 }
 
 /*
  * Force reconstruction to use the general purpose method.
  *
  * This has no explicit initializer, so as a file-scope int it starts out 0.
  * NOTE(review): the code that reads this flag is not in this part of the
  * file; presumably any non-zero value enables the override -- confirm at
  * the point of use.
  */
 int vdev_raidz_default_to_general;
 
 /*
  * These two tables represent powers and logs of 2 in the Galois field defined
  * above. These values were computed by repeatedly multiplying by 2 as above.
  *
  * vdev_raidz_pow2[i] is 2 raised to the power i in the field.  The final
  * entry (index 255) is 0x01 again, the same as index 0, so index 255 is a
  * valid lookup that wraps around to 2^0.
  */
 static const uint8_t vdev_raidz_pow2[256] = {
 	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
 	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
 	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
 	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
 	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
 	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
 	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
 	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
 	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
 	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
 	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
 	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
 	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
 	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
 	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
 	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
 	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
 	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
 	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
 	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
 	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
 	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
 	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
 	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
 	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
 	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
 	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
 	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
 	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
 	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
 	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
 	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
 };
 /*
  * vdev_raidz_log2[a] is the exponent e for which vdev_raidz_pow2[e] == a.
  * Zero has no logarithm, so the entry at index 0 is only a placeholder;
  * vdev_raidz_exp2() returns early for a == 0 rather than consulting it.
  * The only element whose logarithm is 0 is a == 1.
  */
 static const uint8_t vdev_raidz_log2[256] = {
 	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
 	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
 	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
 	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
 	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
 	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
 	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
 	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
 	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
 	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
 	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
 	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
 	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
 	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
 	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
 	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
 	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
 	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
 	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
 	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
 	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
 	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
 	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
 	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
 	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
 	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
 	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
 	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
 	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
 	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
 	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
 	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
 };
 
 /*
  * Forward declaration: vdev_raidz_cksum_finish() below calls this ahead of
  * the function's definition.
  */
 static void vdev_raidz_generate_parity(raidz_map_t *rm);
 
 /*
  * Compute a * 2^exp in the field by adding exponents: the log of 'a' is
  * added to 'exp' and the sum is looked up in the power table.  A zero
  * input always yields zero.
  */
 static uint8_t
 vdev_raidz_exp2(uint_t a, int exp)
 {
 	int power;
 
 	if (a == 0)
 		return (0);
 
 	ASSERT(exp >= 0);
 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
 
 	/* fold a sum past the end of the table back by one full cycle */
 	power = exp + vdev_raidz_log2[a];
 	return (vdev_raidz_pow2[power > 255 ? power - 255 : power]);
 }
 
 /*
  * Release a raidz map: the buffers attached to its parity columns, the
  * copied data buffer if one was made, and finally the map itself.
  */
 static void
 vdev_raidz_map_free(raidz_map_t *rm)
 {
 	int col;
 
 	/* parity columns: free the I/O buffer and any "good" buffer */
 	for (col = 0; col < rm->rm_firstdatacol; col++) {
 		void *parity = rm->rm_col[col].rc_data;
 		void *good = rm->rm_col[col].rc_gdata;
 
 		if (parity != NULL)
 			zio_buf_free(parity, rm->rm_col[col].rc_size);
 
 		if (good != NULL)
 			zio_buf_free(good, rm->rm_col[col].rc_size);
 	}
 
 	/* the data copy is as large as all data columns taken together */
 	if (rm->rm_datacopy != NULL) {
 		size_t copied = 0;
 
 		for (col = rm->rm_firstdatacol; col < rm->rm_cols; col++)
 			copied += rm->rm_col[col].rc_size;
 
 		zio_buf_free(rm->rm_datacopy, copied);
 	}
 
 	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
 }
 
 /*
  * Mark the zio's raidz map as freed.  The map itself is released here
  * only when no checksum reports still refer to it; otherwise it is left
  * for vdev_raidz_cksum_free() to release.
  */
 static void
 vdev_raidz_map_free_vsd(zio_t *zio)
 {
 	raidz_map_t *rm = zio->io_vsd;
 
 	ASSERT0(rm->rm_freed);
 	rm->rm_freed = 1;
 
 	if (rm->rm_reports != 0)
 		return;
 
 	vdev_raidz_map_free(rm);
 }
 
 /*
  * Drop one checksum report's reference on the raidz map passed in 'arg'.
  * The map is released once the last report is gone and the map has
  * already been marked freed.  The second argument is not used.
  */
 /*ARGSUSED*/
 static void
 vdev_raidz_cksum_free(void *arg, size_t ignored)
 {
 	raidz_map_t *rm = arg;
 
 	ASSERT3U(rm->rm_reports, >, 0);
 
 	rm->rm_reports--;
 	if (rm->rm_reports != 0 || rm->rm_freed == 0)
 		return;
 
 	vdev_raidz_map_free(rm);
 }
 
 /*
  * Finish a checksum report (installed as zcr_finish by
  * vdev_raidz_cksum_report()).
  *
  *	zcr       - the report; zcr_cbdata holds the raidz map and zcr_cbinfo
  *		    the index of the column being reported on
  *	good_data - the correct contents of the data columns laid out back to
  *		    back, or NULL if no good data is available
  *
  * For a data column the "good" buffer is simply the matching slice of
  * good_data.  For a parity column the expected parity is regenerated from
  * good_data the first time it is needed and kept in rc_gdata for later
  * calls.  The report is then completed with the good and bad buffers.
  */
 static void
 vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
 {
 	raidz_map_t *rm = zcr->zcr_cbdata;
 	size_t c = zcr->zcr_cbinfo;
 	size_t x;
 
 	const char *good = NULL;
 	const char *bad = rm->rm_col[c].rc_data;
 
 	/* nothing to compare against: finish the report without buffers */
 	if (good_data == NULL) {
 		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
 		return;
 	}
 
 	if (c < rm->rm_firstdatacol) {
 		/*
 		 * The first time through, calculate the parity blocks for
 		 * the good data (this relies on the fact that the good
 		 * data never changes for a given logical ZIO)
 		 */
 		if (rm->rm_col[0].rc_gdata == NULL) {
 			char *bad_parity[VDEV_RAIDZ_MAXPARITY];
 			char *buf;
 
 			/*
 			 * Set up the rm_col[]s to generate the parity for
 			 * good_data, first saving the parity bufs and
 			 * replacing them with buffers to hold the result.
 			 */
 			for (x = 0; x < rm->rm_firstdatacol; x++) {
 				bad_parity[x] = rm->rm_col[x].rc_data;
 				rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
 				    zio_buf_alloc(rm->rm_col[x].rc_size);
 			}
 
 			/* fill in the data columns from good_data */
 			buf = (char *)good_data;
 			for (; x < rm->rm_cols; x++) {
 				rm->rm_col[x].rc_data = buf;
 				buf += rm->rm_col[x].rc_size;
 			}
 
 			/*
 			 * Construct the parity from the good data.
 			 */
 			vdev_raidz_generate_parity(rm);
 
 			/* restore everything back to its original state */
 			for (x = 0; x < rm->rm_firstdatacol; x++)
 				rm->rm_col[x].rc_data = bad_parity[x];
 
 			/* data columns go back to pointing into rm_datacopy */
 			buf = rm->rm_datacopy;
 			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
 				rm->rm_col[x].rc_data = buf;
 				buf += rm->rm_col[x].rc_size;
 			}
 		}
 
 		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
 		good = rm->rm_col[c].rc_gdata;
 	} else {
 		/* adjust good_data to point at the start of our column */
 		good = good_data;
 
 		for (x = rm->rm_firstdatacol; x < c; x++)
 			good += rm->rm_col[x].rc_size;
 	}
 
 	/* we drop the ereport if it ends up that the data was good */
 	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
 }
 
 /*
  * Reached indirectly from zfs_ereport_start_checksum(), which is called
  * below when a read fails completely.  Its job is to preserve everything
  * that was read from disk so that vdev_raidz_cksum_finish() can later
  * compare it with the good data.
  *
  * 'arg' carries the index of the column being reported on.
  */
 static void
 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
 {
 	raidz_map_t *rm = zio->io_vsd;
 	size_t size = 0;
 	size_t idx;
 	caddr_t buf;
 
 	/* hook the report up to this map and take a reference on it */
 	zcr->zcr_cbdata = rm;
 	zcr->zcr_cbinfo = (size_t)(uintptr_t)arg;
 	zcr->zcr_finish = vdev_raidz_cksum_finish;
 	zcr->zcr_free = vdev_raidz_cksum_free;
 
 	rm->rm_reports++;
 	ASSERT3U(rm->rm_reports, >, 0);
 
 	/* an earlier report already saved the data */
 	if (rm->rm_datacopy != NULL)
 		return;
 
 	/*
 	 * This is the first report against this raidz_map_t, so the data
 	 * must be copied aside: nothing guarantees that the zio's buffer
 	 * will not be reused for something else.
 	 *
 	 * The parity already lives in separate buffers and does not need
 	 * to be copied.
 	 */
 	for (idx = rm->rm_firstdatacol; idx < rm->rm_cols; idx++)
 		size += rm->rm_col[idx].rc_size;
 
 	rm->rm_datacopy = zio_buf_alloc(size);
 	buf = rm->rm_datacopy;
 
 	/* copy each data column and repoint it at its private copy */
 	for (idx = rm->rm_firstdatacol; idx < rm->rm_cols; idx++) {
 		raidz_col_t *col = &rm->rm_col[idx];
 
 		bcopy(col->rc_data, buf, col->rc_size);
 		col->rc_data = buf;
 		buf += col->rc_size;
 	}
 	ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
 }
 
 /*
  * Callbacks for the raidz map stored in zio->io_vsd, given positionally.
  * NOTE(review): the member names of zio_vsd_ops_t are not visible here;
  * judging by the functions supplied, the first slot is the free hook and
  * the second the checksum-report hook -- confirm against the type's
  * declaration.
  */
 static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
 	vdev_raidz_map_free_vsd,	/* marks the map freed, may release it */
 	vdev_raidz_cksum_report		/* sets up a checksum report */
 };
 
 /*
  * Build the RAID-Z column map describing how a zio is laid out across the
  * children, and attach it to the zio.
  *
  *	zio		I/O being mapped; io_offset, io_size, io_type and
  *			io_data are read, io_vsd and io_vsd_ops are written.
  *	unit_shift	shift used to convert byte offsets/sizes to sectors.
  *	dcols		total number of columns, parity included.
  *	nparity		number of parity columns.
  *
  * Returns the newly allocated map (the same pointer left in zio->io_vsd).
  */
 static raidz_map_t *
 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
     uint64_t nparity)
 {
 	raidz_map_t *rm;
 	/*
 	 * b: starting sector of the I/O, s: size of the I/O in sectors,
 	 * f: column the I/O starts in, o: byte offset of the starting row
 	 * within each child.
 	 */
 	uint64_t b = zio->io_offset >> unit_shift;
 	uint64_t s = zio->io_size >> unit_shift;
 	uint64_t f = b % dcols;
 	uint64_t o = (b / dcols) << unit_shift;
 	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
 
 	/*
 	 * q: number of sectors every data column receives,
 	 * r: data sectors left over after q full rows,
 	 * bc: number of "big" columns that carry one extra sector (the r
 	 *     leftover data columns plus the parity columns), 0 if none,
 	 * tot: total sectors written, data plus parity.
 	 */
 	q = s / (dcols - nparity);
 	r = s - q * (dcols - nparity);
 	bc = (r == 0 ? 0 : r + nparity);
 	tot = s + nparity * (q + (r == 0 ? 0 : 1));
 
 	/*
 	 * acols is the number of columns that hold data or parity; scols
 	 * additionally counts columns that exist in the map only so that
 	 * skipped (padding) sectors can be addressed.
 	 */
 	if (q == 0) {
 		acols = bc;
 		scols = MIN(dcols, roundup(bc, nparity + 1));
 	} else {
 		acols = dcols;
 		scols = dcols;
 	}
 
 	ASSERT3U(acols, <=, scols);
 
 	/* The map ends in a variable-length array of scols columns. */
 	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
 
 	rm->rm_cols = acols;
 	rm->rm_scols = scols;
 	rm->rm_bigcols = bc;
 	rm->rm_skipstart = bc;
 	rm->rm_missingdata = 0;
 	rm->rm_missingparity = 0;
 	rm->rm_firstdatacol = nparity;
 	rm->rm_datacopy = NULL;
 	rm->rm_reports = 0;
 	rm->rm_freed = 0;
 	rm->rm_ecksuminjected = 0;
 
 	asize = 0;
 
 	for (c = 0; c < scols; c++) {
 		/*
 		 * Columns are assigned to consecutive children starting at
 		 * f; once we run past the last child we wrap to child 0 on
 		 * the next row (one sector further into each device).
 		 */
 		col = f + c;
 		coff = o;
 		if (col >= dcols) {
 			col -= dcols;
 			coff += 1ULL << unit_shift;
 		}
 		rm->rm_col[c].rc_devidx = col;
 		rm->rm_col[c].rc_offset = coff;
 		rm->rm_col[c].rc_data = NULL;
 		rm->rm_col[c].rc_gdata = NULL;
 		rm->rm_col[c].rc_error = 0;
 		rm->rm_col[c].rc_tried = 0;
 		rm->rm_col[c].rc_skipped = 0;
 
 		/* Skip-only columns are empty; big columns get q + 1. */
 		if (c >= acols)
 			rm->rm_col[c].rc_size = 0;
 		else if (c < bc)
 			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
 		else
 			rm->rm_col[c].rc_size = q << unit_shift;
 
 		asize += rm->rm_col[c].rc_size;
 	}
 
 	/*
 	 * The allocated size is rounded up to a multiple of (nparity + 1)
 	 * sectors; rm_nskip is the number of padding sectors that adds.
 	 */
 	ASSERT3U(asize, ==, tot << unit_shift);
 	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
 	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
 	ASSERT3U(rm->rm_nskip, <=, nparity);
 
 	if (zio->io_type != ZIO_TYPE_FREE) {
 		/* Parity columns get their own freshly allocated buffers. */
 		for (c = 0; c < rm->rm_firstdatacol; c++) {
 			rm->rm_col[c].rc_data =
 			    zio_buf_alloc(rm->rm_col[c].rc_size);
 		}
 
 		/*
 		 * Data columns are carved out of the zio's own buffer: the
 		 * first data column starts at io_data and each subsequent
 		 * one starts where the previous column ends.
 		 */
 		rm->rm_col[c].rc_data = zio->io_data;
 
 		for (c = c + 1; c < acols; c++) {
 			rm->rm_col[c].rc_data =
 			    (char *)rm->rm_col[c - 1].rc_data +
 			    rm->rm_col[c - 1].rc_size;
 		}
 	}
 
 	/*
 	 * If all data stored spans all columns, there's a danger that parity
 	 * will always be on the same device and, since parity isn't read
 	 * during normal operation, that that device's I/O bandwidth won't be
 	 * used effectively. We therefore switch the parity every 1MB.
 	 *
 	 * ... at least that was, ostensibly, the theory. As a practical
 	 * matter unless we juggle the parity between all devices evenly, we
 	 * won't see any benefit. Further, occasional writes that aren't a
 	 * multiple of the LCM of the number of children and the minimum
 	 * stripe width are sufficient to avoid pessimal behavior.
 	 * Unfortunately, this decision created an implicit on-disk format
 	 * requirement that we need to support for all eternity, but only
 	 * for single-parity RAID-Z.
 	 *
 	 * If we intend to skip a sector in the zeroth column for padding
 	 * we must make sure to note this swap. We will never intend to
 	 * skip the first column since at least one data and one parity
 	 * column must appear in each row.
 	 */
 	ASSERT(rm->rm_cols >= 2);
 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
 
 	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
 		/* Swap the device index and offset of columns 0 and 1. */
 		devidx = rm->rm_col[0].rc_devidx;
 		o = rm->rm_col[0].rc_offset;
 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
 		rm->rm_col[1].rc_devidx = devidx;
 		rm->rm_col[1].rc_offset = o;
 
 		if (rm->rm_skipstart == 0)
 			rm->rm_skipstart = 1;
 	}
 
 	zio->io_vsd = rm;
 	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
 	return (rm);
 }
 
 /*
  * Compute single (P) parity: the P column becomes the XOR of all data
  * columns, processed 64 bits at a time.  The first data column initializes
  * P; later columns, which may be shorter, are folded in with XOR.
  */
 static void
 vdev_raidz_generate_parity_p(raidz_map_t *rm)
 {
 	uint64_t *parity, *data, pwords, cwords, w;
 	int col;
 
 	pwords = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (data[0]);
 
 	for (col = rm->rm_firstdatacol; col < rm->rm_cols; col++) {
 		data = rm->rm_col[col].rc_data;
 		parity = rm->rm_col[VDEV_RAIDZ_P].rc_data;
 		cwords = rm->rm_col[col].rc_size / sizeof (data[0]);
 
 		if (col != rm->rm_firstdatacol) {
 			/* Fold a subsequent data column into P. */
 			ASSERT(cwords <= pwords);
 			for (w = 0; w < cwords; w++)
 				parity[w] ^= data[w];
 			continue;
 		}
 
 		/* The first data column seeds P with a straight copy. */
 		ASSERT(cwords == pwords);
 		for (w = 0; w < cwords; w++)
 			parity[w] = data[w];
 	}
 }
 
 /*
  * Compute double (P and Q) parity over the data columns, 64 bits at a time.
  * P accumulates the XOR of the data columns.  Q is built column by column:
  * the running Q value is passed through VDEV_RAIDZ_64MUL_2 and the next
  * data word is XORed in.  (The macro is defined outside this block;
  * presumably it multiplies each byte of the word by 2 in the Galois field --
  * confirm against its definition.)
  */
 static void
 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
 {
 	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
 	int c;
 
 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
 
 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 		/* Restart at the beginning of each buffer for every column. */
 		src = rm->rm_col[c].rc_data;
 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
 
 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
 
 		if (c == rm->rm_firstdatacol) {
 			/*
 			 * The first data column initializes P and Q.  Its
 			 * size may be 0 when a caller has temporarily zeroed
 			 * rc_size (as vdev_raidz_reconstruct_pq() does); the
 			 * second loop then zero-fills both parity columns.
 			 */
 			ASSERT(ccnt == pcnt || ccnt == 0);
 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
 				*p = *src;
 				*q = *src;
 			}
 			for (; i < pcnt; i++, src++, p++, q++) {
 				*p = 0;
 				*q = 0;
 			}
 		} else {
 			ASSERT(ccnt <= pcnt);
 
 			/*
 			 * Apply the algorithm described above by multiplying
 			 * the previous result and adding in the new value.
 			 */
 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
 				*p ^= *src;
 
 				VDEV_RAIDZ_64MUL_2(*q, mask);
 				*q ^= *src;
 			}
 
 			/*
 			 * Treat short columns as though they are full of 0s.
 			 * Note that there's therefore nothing needed for P.
 			 */
 			for (; i < pcnt; i++, q++) {
 				VDEV_RAIDZ_64MUL_2(*q, mask);
 			}
 		}
 	}
 }
 
 /*
  * Compute triple (P, Q and R) parity over the data columns, 64 bits at a
  * time.  P accumulates the XOR of the data columns.  Q and R are built
  * column by column: the running value is passed through VDEV_RAIDZ_64MUL_2
  * (for Q) or VDEV_RAIDZ_64MUL_4 (for R) and the next data word is XORed in.
  * (Both macros are defined outside this block; presumably they multiply
  * each byte by 2 and 4 respectively in the Galois field -- confirm against
  * their definitions.)
  */
 static void
 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
 {
 	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
 	int c;
 
 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
 	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
 
 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 		/* Restart at the beginning of each buffer for every column. */
 		src = rm->rm_col[c].rc_data;
 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
 		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
 
 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
 
 		if (c == rm->rm_firstdatacol) {
 			/*
 			 * The first data column initializes all three parity
 			 * columns; if it is empty (ccnt == 0) the second loop
 			 * zero-fills them instead.
 			 */
 			ASSERT(ccnt == pcnt || ccnt == 0);
 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
 				*p = *src;
 				*q = *src;
 				*r = *src;
 			}
 			for (; i < pcnt; i++, src++, p++, q++, r++) {
 				*p = 0;
 				*q = 0;
 				*r = 0;
 			}
 		} else {
 			ASSERT(ccnt <= pcnt);
 
 			/*
 			 * Apply the algorithm described above by multiplying
 			 * the previous result and adding in the new value.
 			 */
 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
 				*p ^= *src;
 
 				VDEV_RAIDZ_64MUL_2(*q, mask);
 				*q ^= *src;
 
 				VDEV_RAIDZ_64MUL_4(*r, mask);
 				*r ^= *src;
 			}
 
 			/*
 			 * Treat short columns as though they are full of 0s.
 			 * Note that there's therefore nothing needed for P.
 			 */
 			for (; i < pcnt; i++, q++, r++) {
 				VDEV_RAIDZ_64MUL_2(*q, mask);
 				VDEV_RAIDZ_64MUL_4(*r, mask);
 			}
 		}
 	}
 }
 
 /*
  * Fill in the parity columns of the map.  The number of parity columns
  * (rm_firstdatacol) selects the P, PQ or PQR generator; any other value is
  * an invalid configuration and panics.
  */
 static void
 vdev_raidz_generate_parity(raidz_map_t *rm)
 {
 	if (rm->rm_firstdatacol == 1)
 		vdev_raidz_generate_parity_p(rm);
 	else if (rm->rm_firstdatacol == 2)
 		vdev_raidz_generate_parity_pq(rm);
 	else if (rm->rm_firstdatacol == 3)
 		vdev_raidz_generate_parity_pqr(rm);
 	else
 		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
 }
 
 /*
  * Rebuild one data column (tgts[0]) from the P parity column: start from a
  * copy of P and XOR in every other data column.  Returns a bitmask with
  * only the VDEV_RAIDZ_P bit set, identifying the parity that was used.
  */
 static int
 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
 {
 	uint64_t *target, *from, xwords, cwords, nwords, w;
 	int bad = tgts[0];
 	int col;
 
 	ASSERT(ntgts == 1);
 	ASSERT(bad >= rm->rm_firstdatacol);
 	ASSERT(bad < rm->rm_cols);
 
 	xwords = rm->rm_col[bad].rc_size / sizeof (from[0]);
 	ASSERT(xwords <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (from[0]));
 	ASSERT(xwords > 0);
 
 	/* Seed the target column with the parity data. */
 	target = rm->rm_col[bad].rc_data;
 	from = rm->rm_col[VDEV_RAIDZ_P].rc_data;
 	for (w = 0; w < xwords; w++)
 		target[w] = from[w];
 
 	/* Cancel out the contribution of every surviving data column. */
 	for (col = rm->rm_firstdatacol; col < rm->rm_cols; col++) {
 		if (col == bad)
 			continue;
 
 		from = rm->rm_col[col].rc_data;
 		cwords = rm->rm_col[col].rc_size / sizeof (from[0]);
 		nwords = MIN(cwords, xwords);
 
 		for (w = 0; w < nwords; w++)
 			target[w] ^= from[w];
 	}
 
 	return (1 << VDEV_RAIDZ_P);
 }
 
 /*
  * Rebuild one data column (tgts[0]) using only the Q parity column.
  * Returns a bitmask with only the VDEV_RAIDZ_Q bit set, identifying the
  * parity that was used.
  */
 static int
 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
 {
 	uint64_t *dst, *src, xcount, ccount, count, mask, i;
 	uint8_t *b;
 	int x = tgts[0];
 	int c, j, exp;
 
 	ASSERT(ntgts == 1);
 
 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
 
 	/*
 	 * First pass: run the Q generation recurrence over the data columns
 	 * with the target column treated as empty (ccount = 0), accumulating
 	 * the result in the target column's own buffer.
 	 */
 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 		src = rm->rm_col[c].rc_data;
 		dst = rm->rm_col[x].rc_data;
 
 		if (c == x)
 			ccount = 0;
 		else
 			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
 
 		count = MIN(ccount, xcount);
 
 		if (c == rm->rm_firstdatacol) {
 			/* The first column initializes the accumulator. */
 			for (i = 0; i < count; i++, dst++, src++) {
 				*dst = *src;
 			}
 			for (; i < xcount; i++, dst++) {
 				*dst = 0;
 			}
 
 		} else {
 			for (i = 0; i < count; i++, dst++, src++) {
 				VDEV_RAIDZ_64MUL_2(*dst, mask);
 				*dst ^= *src;
 			}
 
 			/* Columns shorter than the target count as zeros. */
 			for (; i < xcount; i++, dst++) {
 				VDEV_RAIDZ_64MUL_2(*dst, mask);
 			}
 		}
 	}
 
 	/*
 	 * Second pass: XOR in the real Q, then scale every byte of the
 	 * result via vdev_raidz_exp2() with an exponent derived from the
 	 * target's distance from the last column.  (vdev_raidz_exp2 is
 	 * defined outside this block; presumably it multiplies by 2^exp in
 	 * the Galois field, undoing the target column's coefficient in Q --
 	 * confirm against its definition.)
 	 */
 	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
 	dst = rm->rm_col[x].rc_data;
 	exp = 255 - (rm->rm_cols - 1 - x);
 
 	for (i = 0; i < xcount; i++, dst++, src++) {
 		*dst ^= *src;
 		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
 			*b = vdev_raidz_exp2(*b, exp);
 		}
 	}
 
 	return (1 << VDEV_RAIDZ_Q);
 }
 
 /*
  * Rebuild two data columns (tgts[0] < tgts[1]) using both the P and Q
  * parity columns.  The map's P/Q buffers and the two target sizes are
  * temporarily replaced while partial parity is computed, and are restored
  * before returning.  Returns a bitmask with the VDEV_RAIDZ_P and
  * VDEV_RAIDZ_Q bits set, identifying the parity that was used.
  */
 static int
 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
 {
 	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
 	void *pdata, *qdata;
 	uint64_t xsize, ysize, i;
 	int x = tgts[0];
 	int y = tgts[1];
 
 	ASSERT(ntgts == 2);
 	ASSERT(x < y);
 	ASSERT(x >= rm->rm_firstdatacol);
 	ASSERT(y < rm->rm_cols);
 
 	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
 
 	/*
 	 * Move the parity data aside -- we're going to compute parity as
 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
 	 * reuse the parity generation mechanism without trashing the actual
 	 * parity so we make those columns appear to be full of zeros by
 	 * setting their lengths to zero.
 	 */
 	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
 	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
 	xsize = rm->rm_col[x].rc_size;
 	ysize = rm->rm_col[y].rc_size;
 
 	/* Scratch buffers that will receive Pxy and Qxy. */
 	rm->rm_col[VDEV_RAIDZ_P].rc_data =
 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
 	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
 	rm->rm_col[x].rc_size = 0;
 	rm->rm_col[y].rc_size = 0;
 
 	vdev_raidz_generate_parity_pq(rm);
 
 	/* Put the real column sizes back now that Pxy/Qxy are computed. */
 	rm->rm_col[x].rc_size = xsize;
 	rm->rm_col[y].rc_size = ysize;
 
 	/* From here on everything is processed one byte at a time. */
 	p = pdata;
 	q = qdata;
 	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
 	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
 	xd = rm->rm_col[x].rc_data;
 	yd = rm->rm_col[y].rc_data;
 
 	/*
 	 * We now have:
 	 *	Pxy = P + D_x + D_y
 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
 	 *
 	 * We can then solve for D_x:
 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
 	 * where
 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
 	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
 	 *
 	 * With D_x in hand, we can easily solve for D_y:
 	 *	D_y = P + Pxy + D_x
 	 */
 
 	a = vdev_raidz_pow2[255 + x - y];
 	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
 	tmp = 255 - vdev_raidz_log2[a ^ 1];
 
 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
 
 	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
 		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
 		    vdev_raidz_exp2(*q ^ *qxy, bexp);
 
 		/* Column y may be shorter than column x. */
 		if (i < ysize)
 			*yd = *p ^ *pxy ^ *xd;
 	}
 
 	/* Release the scratch Pxy/Qxy buffers. */
 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
 	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
 
 	/*
 	 * Restore the saved parity data.
 	 */
 	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
 	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
 
 	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
 }
 
 /* BEGIN CSTYLED */
 /*
  * In the general case of reconstruction, we must solve the system of linear
  * equations defined by the coefficients used to generate parity as well as
  * the contents of the data and parity disks. This can be expressed with
  * vectors for the original data (D) and the actual data (d) and parity (p)
  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
  *
  *            __   __                     __     __
  *            |     |         __     __   |  p_0  |
  *            |  V  |         |  D_0  |   | p_m-1 |
  *            |     |    x    |   :   | = |  d_0  |
  *            |  I  |         | D_n-1 |   |   :   |
  *            |     |         ~~     ~~   | d_n-1 |
  *            ~~   ~~                     ~~     ~~
  *
  * I is simply a square identity matrix of size n, and V is a vandermonde
  * matrix defined by the coefficients we chose for the various parity columns
  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
  * computation as well as linear separability.
  *
  *      __               __               __     __
  *      |   1   ..  1 1 1 |               |  p_0  |
  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
  *      |   :       : : : |   |   :   |   |  d_2  |
  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
  *      |   0   ..  0 0 1 |               | d_n-1 |
  *      ~~               ~~               ~~     ~~
  *
  * Note that I, V, d, and p are known. To compute D, we must invert the
  * matrix and use the known data and parity values to reconstruct the unknown
  * data values. We begin by removing the rows in V|I and d|p that correspond
  * to failed or missing columns; we then make V|I square (n x n) and d|p
  * sized n by removing rows corresponding to unused parity from the bottom up
  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
  * using Gauss-Jordan elimination. In the example below we use m=3 parity
  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
  *           __                               __
  *           |  1   1   1   1   1   1   1   1  |
  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
  *           |  19 205 116  29  64  16  4   1  |      / /
  *           |  1   0   0   0   0   0   0   0  |     / /
  *           |  0   1   0   0   0   0   0   0  | <--' /
  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
  *           |  0   0   0   1   0   0   0   0  |
  *           |  0   0   0   0   1   0   0   0  |
  *           |  0   0   0   0   0   1   0   0  |
  *           |  0   0   0   0   0   0   1   0  |
  *           |  0   0   0   0   0   0   0   1  |
  *           ~~                               ~~
  *           __                               __
  *           |  1   1   1   1   1   1   1   1  |
  *           | 128  64  32  16  8   4   2   1  |
  *           |  19 205 116  29  64  16  4   1  |
  *           |  1   0   0   0   0   0   0   0  |
  *           |  0   1   0   0   0   0   0   0  |
  *  (V|I)' = |  0   0   1   0   0   0   0   0  |
  *           |  0   0   0   1   0   0   0   0  |
  *           |  0   0   0   0   1   0   0   0  |
  *           |  0   0   0   0   0   1   0   0  |
  *           |  0   0   0   0   0   0   1   0  |
  *           |  0   0   0   0   0   0   0   1  |
  *           ~~                               ~~
  *
  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
  * matrix is not singular.
  * __                                                                 __
  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
  * ~~                                                                 ~~
  * __                                                                 __
  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
  * ~~                                                                 ~~
  * __                                                                 __
  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
  * ~~                                                                 ~~
  * __                                                                 __
  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
  * ~~                                                                 ~~
  * __                                                                 __
  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
  * ~~                                                                 ~~
  * __                                                                 __
  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
  * ~~                                                                 ~~
  *                   __                               __
  *                   |  0   0   1   0   0   0   0   0  |
  *                   | 167 100  5   41 159 169 217 208 |
  *                   | 166 100  4   40 158 168 216 209 |
  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
  *                   |  0   0   0   0   1   0   0   0  |
  *                   |  0   0   0   0   0   1   0   0  |
  *                   |  0   0   0   0   0   0   1   0  |
  *                   |  0   0   0   0   0   0   0   1  |
  *                   ~~                               ~~
  *
  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
  * of the missing data.
  *
  * As is apparent from the example above, the only non-trivial rows in the
  * inverse matrix correspond to the data disks that we're trying to
  * reconstruct. Indeed, those are the only rows we need as the others would
  * only be useful for reconstructing data known or assumed to be valid. For
  * that reason, we only build the coefficients in the rows that correspond to
  * targeted columns.
  */
 /* END CSTYLED */
 
 /*
  * Populate the rows of the dispersal matrix that correspond to the parity
  * columns listed in map[0 .. nmap - 1].  Row i receives n entries taken from
  * vdev_raidz_pow2[], with the table index starting at map[i] * n and
  * stepping down by map[i] per entry, kept within 0..255 by wrapping.
  */
 static void
 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
     uint8_t **rows)
 {
 	int r, col;
 	int step, power;
 
 	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
 
 	for (r = 0; r < nmap; r++) {
 		/* The parity index doubles as the per-entry exponent step. */
 		step = map[r];
 		ASSERT3S(0, <=, step);
 		ASSERT3S(step, <=, 2);
 
 		power = step * n;
 		if (power > 255)
 			power -= 255;
 		ASSERT(power <= 255);
 
 		for (col = 0; col < n; col++) {
 			power -= step;
 			if (power < 0)
 				power += 255;
 			rows[r][col] = vdev_raidz_pow2[power];
 		}
 	}
 }
 
 /*
  * Compute, by Gauss-Jordan elimination, the rows of the inverse matrix that
  * correspond to the missing data columns.
  *
  *	n		number of data columns (row width).
  *	nmissing	number of missing data columns / rows of interest.
  *	missing		data-relative indices of the missing columns.
  *	rows		in: matrix rows from vdev_raidz_matrix_init();
  *			reduced in place (asserted below to end as identity
  *			rows).
  *	invrows		out: the corresponding rows of the inverse.
  *	used		columns used for reconstruction: the first nmissing
  *			entries are parity columns, the rest data columns.
  */
 static void
 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
 {
 	int i, j, ii, jj;
 	uint8_t log;
 
 	/*
 	 * Assert that the first nmissing entries from the array of used
 	 * columns correspond to parity columns and that subsequent entries
 	 * correspond to data columns.
 	 */
 	for (i = 0; i < nmissing; i++) {
 		ASSERT3S(used[i], <, rm->rm_firstdatacol);
 	}
 	for (; i < n; i++) {
 		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
 	}
 
 	/*
 	 * First initialize the storage where we'll compute the inverse rows.
 	 */
 	for (i = 0; i < nmissing; i++) {
 		for (j = 0; j < n; j++) {
 			invrows[i][j] = (i == j) ? 1 : 0;
 		}
 	}
 
 	/*
 	 * Subtract all trivial rows from the rows of consequence.
 	 */
 	for (i = 0; i < nmissing; i++) {
 		for (j = nmissing; j < n; j++) {
 			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
 			/* jj is the data-relative index of used column j. */
 			jj = used[j] - rm->rm_firstdatacol;
 			ASSERT3S(jj, <, n);
 			invrows[i][j] = rows[i][jj];
 			rows[i][jj] = 0;
 		}
 	}
 
 	/*
 	 * For each of the rows of interest, we must normalize it and subtract
 	 * a multiple of it from the other rows.
 	 */
 	for (i = 0; i < nmissing; i++) {
 		for (j = 0; j < missing[i]; j++) {
 			ASSERT0(rows[i][j]);
 		}
 		ASSERT3U(rows[i][missing[i]], !=, 0);
 
 		/*
 		 * Compute the inverse of the first element and multiply each
 		 * element in the row by that value.
 		 */
 		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
 
 		for (j = 0; j < n; j++) {
 			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
 			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
 		}
 
 		/*
 		 * Eliminate the pivot column from every other row of
 		 * interest by adding (XOR) a scaled copy of row i.
 		 */
 		for (ii = 0; ii < nmissing; ii++) {
 			if (i == ii)
 				continue;
 
 			ASSERT3U(rows[ii][missing[i]], !=, 0);
 
 			log = vdev_raidz_log2[rows[ii][missing[i]]];
 
 			for (j = 0; j < n; j++) {
 				rows[ii][j] ^=
 				    vdev_raidz_exp2(rows[i][j], log);
 				invrows[ii][j] ^=
 				    vdev_raidz_exp2(invrows[i][j], log);
 			}
 		}
 	}
 
 	/*
 	 * Verify that the data that is left in the rows are properly part of
 	 * an identity matrix.
 	 */
 	for (i = 0; i < nmissing; i++) {
 		for (j = 0; j < n; j++) {
 			if (j == missing[i]) {
 				ASSERT3U(rows[i][j], ==, 1);
 			} else {
 				ASSERT0(rows[i][j]);
 			}
 		}
 	}
 }
 
 /*
  * Regenerate the missing data columns by multiplying the inverse rows by
  * the surviving columns.
  *
  *	n		number of data columns (row width).
  *	nmissing	number of missing data columns.
  *	missing		data-relative indices of the missing columns.
  *	invrows		inverse rows from vdev_raidz_matrix_invert().
  *	used		map column indices supplying the input, in the same
  *			order as the entries of each inverse row.
  *
  * For every used column i, each source byte is multiplied by invrows[cc][i]
  * and either stored (i == 0) or XORed (i > 0) into missing column cc.  The
  * multiplication is done by adding logarithms and looking the sum up in
  * vdev_raidz_pow2[].
  */
 static void
 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
     int *missing, uint8_t **invrows, const uint8_t *used)
 {
 	int i, j, x, cc, c;
 	uint8_t *src;
 	uint64_t ccount;
 	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
 	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
-	uint8_t log, val;
+	uint8_t log = 0;
+	uint8_t val;
 	int ll;
 	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
 	uint8_t *p, *pp;
 	size_t psize;
 
 	/* One allocation holds the nmissing x n table of logarithms. */
 	psize = sizeof (invlog[0][0]) * n * nmissing;
 	p = kmem_alloc(psize, KM_SLEEP);
 
 	for (pp = p, i = 0; i < nmissing; i++) {
 		invlog[i] = pp;
 		pp += n;
 	}
 
 	/* Precompute the logarithm of every inverse-matrix coefficient. */
 	for (i = 0; i < nmissing; i++) {
 		for (j = 0; j < n; j++) {
 			ASSERT3U(invrows[i][j], !=, 0);
 			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
 		}
 	}
 
 	for (i = 0; i < n; i++) {
 		c = used[i];
 		ASSERT3U(c, <, rm->rm_cols);
 
 		src = rm->rm_col[c].rc_data;
 		ccount = rm->rm_col[c].rc_size;
 		/* Look up the destination buffer and size of each target. */
 		for (j = 0; j < nmissing; j++) {
 			cc = missing[j] + rm->rm_firstdatacol;
 			ASSERT3U(cc, >=, rm->rm_firstdatacol);
 			ASSERT3U(cc, <, rm->rm_cols);
 			ASSERT3U(cc, !=, c);
 
 			dst[j] = rm->rm_col[cc].rc_data;
 			dcount[j] = rm->rm_col[cc].rc_size;
 		}
 
 		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
 
 		for (x = 0; x < ccount; x++, src++) {
 			/*
 			 * log is only consumed below when *src != 0, in
 			 * which case it has just been assigned here.
 			 */
 			if (*src != 0)
 				log = vdev_raidz_log2[*src];
 
 			for (cc = 0; cc < nmissing; cc++) {
 				/* Skip targets shorter than this offset. */
 				if (x >= dcount[cc])
 					continue;
 
 				if (*src == 0) {
 					val = 0;
 				} else {
 					/* Add the logs, wrapping at 255. */
 					if ((ll = log + invlog[cc][i]) >= 255)
 						ll -= 255;
 					val = vdev_raidz_pow2[ll];
 				}
 
 				/* The first used column initializes dst. */
 				if (i == 0)
 					dst[cc][x] = val;
 				else
 					dst[cc][x] ^= val;
 			}
 		}
 	}
 
 	kmem_free(p, psize);
 }
 
 /*
  * General-case reconstruction of the targeted columns by matrix inversion.
  *
  *	tgts	sorted list of targeted (bad) column indices, parity and
  *		data alike.
  *	ntgts	number of entries in tgts.
  *
  * Returns a bitmask in which bit c is set for each parity column c that was
  * used to perform the reconstruction.
  */
 static int
 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
 {
 	int n, i, c, t, tt;
 	int nmissing_rows;
 	int missing_rows[VDEV_RAIDZ_MAXPARITY];
 	int parity_map[VDEV_RAIDZ_MAXPARITY];
 
 	uint8_t *p, *pp;
 	size_t psize;
 
 	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
 	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
 	uint8_t *used;
 
 	int code = 0;
 
 
 	/* n is the number of data columns in the map. */
 	n = rm->rm_cols - rm->rm_firstdatacol;
 
 	/*
 	 * Figure out which data columns are missing.
 	 */
 	nmissing_rows = 0;
 	for (t = 0; t < ntgts; t++) {
 		if (tgts[t] >= rm->rm_firstdatacol) {
 			missing_rows[nmissing_rows++] =
 			    tgts[t] - rm->rm_firstdatacol;
 		}
 	}
 
 	/*
 	 * Figure out which parity columns to use to help generate the missing
 	 * data columns.
 	 */
 	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
 		ASSERT(tt < ntgts);
 		ASSERT(c < rm->rm_firstdatacol);
 
 		/*
 		 * Skip any targeted parity columns.
 		 */
 		if (c == tgts[tt]) {
 			tt++;
 			continue;
 		}
 
 		/* Record parity column c as used, both in code and the map. */
 		code |= 1 << c;
 
 		parity_map[i] = c;
 		i++;
 	}
 
 	ASSERT(code != 0);
 	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
 
 	/*
 	 * A single allocation is carved into the rows and invrows arrays
 	 * (nmissing_rows rows of n bytes each) followed by the n-entry used
 	 * array.
 	 */
 	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
 	    nmissing_rows * n + sizeof (used[0]) * n;
 	p = kmem_alloc(psize, KM_SLEEP);
 
 	for (pp = p, i = 0; i < nmissing_rows; i++) {
 		rows[i] = pp;
 		pp += n;
 		invrows[i] = pp;
 		pp += n;
 	}
 	used = pp;
 
 	/* used[] lists the chosen parity columns first ... */
 	for (i = 0; i < nmissing_rows; i++) {
 		used[i] = parity_map[i];
 	}
 
 	/* ... followed by every data column that is not missing. */
 	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 		if (tt < nmissing_rows &&
 		    c == missing_rows[tt] + rm->rm_firstdatacol) {
 			tt++;
 			continue;
 		}
 
 		ASSERT3S(i, <, n);
 		used[i] = c;
 		i++;
 	}
 
 	/*
 	 * Initialize the interesting rows of the matrix.
 	 */
 	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
 
 	/*
 	 * Invert the matrix.
 	 */
 	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
 	    invrows, used);
 
 	/*
 	 * Reconstruct the missing data using the generated matrix.
 	 */
 	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
 	    invrows, used);
 
 	kmem_free(p, psize);
 
 	return (code);
 }
 
 /*
  * Reconstruct the columns listed in t[0 .. nt - 1] together with any column
  * whose rc_error is non-zero.  An optimized P, Q or PQ routine is used when
  * the set of bad data columns and valid parity columns allows it (and
  * vdev_raidz_default_to_general is not set); otherwise the general
  * matrix-based routine is used.
  *
  * Returns a bitmask of the parity columns that were used.
  */
 static int
 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
 {
 	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
 	int ntgts;
 	int i, c;
 	int code;
 	int nbadparity, nbaddata;
 	int parity_valid[VDEV_RAIDZ_MAXPARITY];
 
 	/*
 	 * The tgts list must already be sorted.
 	 */
 	for (i = 1; i < nt; i++) {
 		ASSERT(t[i] > t[i - 1]);
 	}
 
 	/*
 	 * Start by assuming every column is bad and count down as good
 	 * columns are found; tgts[] collects the bad ones in column order.
 	 */
 	nbadparity = rm->rm_firstdatacol;
 	nbaddata = rm->rm_cols - nbadparity;
 	ntgts = 0;
 	for (i = 0, c = 0; c < rm->rm_cols; c++) {
 		if (c < rm->rm_firstdatacol)
 			parity_valid[c] = B_FALSE;
 
 		if (i < nt && c == t[i]) {
 			tgts[ntgts++] = c;
 			i++;
 		} else if (rm->rm_col[c].rc_error != 0) {
 			tgts[ntgts++] = c;
 		} else if (c >= rm->rm_firstdatacol) {
 			nbaddata--;
 		} else {
 			parity_valid[c] = B_TRUE;
 			nbadparity--;
 		}
 	}
 
 	ASSERT(ntgts >= nt);
 	ASSERT(nbaddata >= 0);
 	ASSERT(nbaddata + nbadparity == ntgts);
 
 	/*
 	 * Parity columns come first in tgts[], so the bad data columns
 	 * start right after the nbadparity bad parity entries.
 	 */
 	dt = &tgts[nbadparity];
 
 	/*
 	 * See if we can use any of our optimized reconstruction routines.
 	 */
 	if (!vdev_raidz_default_to_general) {
 		switch (nbaddata) {
 		case 1:
 			if (parity_valid[VDEV_RAIDZ_P])
 				return (vdev_raidz_reconstruct_p(rm, dt, 1));
 
 			ASSERT(rm->rm_firstdatacol > 1);
 
 			if (parity_valid[VDEV_RAIDZ_Q])
 				return (vdev_raidz_reconstruct_q(rm, dt, 1));
 
 			ASSERT(rm->rm_firstdatacol > 2);
 			break;
 
 		case 2:
 			ASSERT(rm->rm_firstdatacol > 1);
 
 			if (parity_valid[VDEV_RAIDZ_P] &&
 			    parity_valid[VDEV_RAIDZ_Q])
 				return (vdev_raidz_reconstruct_pq(rm, dt, 2));
 
 			ASSERT(rm->rm_firstdatacol > 2);
 
 			break;
 		}
 	}
 
 	/* No optimized routine applied; fall back to the general case. */
 	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
 	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
 	ASSERT(code > 0);
 	return (code);
 }
 
 /*
  * Open a RAID-Z vdev by opening all of its children.
  *
  * For every child that opened successfully, *asize and *max_asize are
  * lowered to the child's value when that is smaller (the "- 1 ... + 1" form
  * makes an initial 0 compare as the largest value) and *ashift is raised to
  * the child's value when that is larger.  Both sizes are then multiplied by
  * the number of children.
  *
  * Returns EINVAL for an invalid parity/child count, the last child open
  * error when more children failed than there are parity columns, and 0
  * otherwise.
  */
 static int
 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
     uint64_t *ashift)
 {
 	uint64_t nparity = vd->vdev_nparity;
 	vdev_t *child;
 	int failed = 0;
 	int last_err = 0;
 	int i;
 
 	ASSERT(nparity > 0);
 
 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
 	    vd->vdev_children < nparity + 1) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
 		return (EINVAL);
 	}
 
 	vdev_open_children(vd);
 
 	for (i = 0; i < vd->vdev_children; i++) {
 		child = vd->vdev_child[i];
 
 		if (child->vdev_open_error == 0) {
 			*asize = MIN(*asize - 1, child->vdev_asize - 1) + 1;
 			*max_asize =
 			    MIN(*max_asize - 1, child->vdev_max_asize - 1) + 1;
 			*ashift = MAX(*ashift, child->vdev_ashift);
 		} else {
 			/* Remember the failure; sizes are left untouched. */
 			last_err = child->vdev_open_error;
 			failed++;
 		}
 	}
 
 	*asize *= vd->vdev_children;
 	*max_asize *= vd->vdev_children;
 
 	if (failed <= nparity)
 		return (0);
 
 	/* More children failed than parity can cover. */
 	vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
 	return (last_err);
 }
 
 /*
  * Close a RAID-Z vdev by closing each of its children in index order.
  */
 static void
 vdev_raidz_close(vdev_t *vd)
 {
 	int child = 0;
 
 	while (child < vd->vdev_children) {
 		vdev_close(vd->vdev_child[child]);
 		child++;
 	}
 }
 
 /*
  * Convert a physical size in bytes into the size allocated on a RAID-Z
  * vdev: the data sectors, plus nparity parity sectors for each row of data
  * sectors, rounded up to a multiple of (nparity + 1) sectors.
  */
 static uint64_t
 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
 {
 	uint64_t shift = vd->vdev_top->vdev_ashift;
 	uint64_t nchildren = vd->vdev_children;
 	uint64_t nparity = vd->vdev_nparity;
 	uint64_t ndata = nchildren - nparity;
 	uint64_t sectors, rows;
 
 	/* Data sectors, rounding psize up to a whole sector. */
 	sectors = ((psize - 1) >> shift) + 1;
 
 	/* Each (possibly partial) row of data carries nparity sectors. */
 	rows = (sectors + ndata - 1) / ndata;
 	sectors += nparity * rows;
 
 	return (roundup(sectors, nparity + 1) << shift);
 }
 
 /*
  * Record the outcome of a child I/O in the column descriptor carried in
  * io_private: store the child's error, and flag the column as tried and
  * not skipped.
  */
 static void
 vdev_raidz_child_done(zio_t *zio)
 {
 	raidz_col_t *column = zio->io_private;
 
 	column->rc_error = zio->io_error;
 	column->rc_tried = 1;
 	column->rc_skipped = 0;
 }
 
 /*
  * Start a RAID-Z I/O: build the column map for the zio and issue child
  * I/Os according to the I/O type.
  *
  *	FREE	one child I/O per column.
  *	WRITE	parity is generated, then one child I/O per column, plus
  *		optional no-data I/Os covering the skipped sectors.
  *	READ	data columns are read; parity columns are read only when
  *		data is already known to be missing or for scrub/resilver.
  *
  * Always returns ZIO_PIPELINE_CONTINUE.
  */
 static int
 vdev_raidz_io_start(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 	vdev_t *tvd = vd->vdev_top;
 	vdev_t *cvd;
 	raidz_map_t *rm;
 	raidz_col_t *rc;
 	int c, i;
 
 	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
 	    vd->vdev_nparity);
 
 	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
 
 	if (zio->io_type == ZIO_TYPE_FREE) {
 		for (c = 0; c < rm->rm_cols; c++) {
 			rc = &rm->rm_col[c];
 			cvd = vd->vdev_child[rc->rc_devidx];
 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
 			    rc->rc_offset, rc->rc_data, rc->rc_size,
 			    zio->io_type, zio->io_priority, 0,
 			    vdev_raidz_child_done, rc));
 		}
 		return (ZIO_PIPELINE_CONTINUE);
 	}
 
 	if (zio->io_type == ZIO_TYPE_WRITE) {
 		/* Parity must be computed before the columns are written. */
 		vdev_raidz_generate_parity(rm);
 
 		for (c = 0; c < rm->rm_cols; c++) {
 			rc = &rm->rm_col[c];
 			cvd = vd->vdev_child[rc->rc_devidx];
 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
 			    rc->rc_offset, rc->rc_data, rc->rc_size,
 			    zio->io_type, zio->io_priority, 0,
 			    vdev_raidz_child_done, rc));
 		}
 
 		/*
 		 * Generate optional I/Os for any skipped sectors to improve
 		 * aggregation contiguity.
 		 */
 		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
 			ASSERT(c <= rm->rm_scols);
 			/* Wrap around to the first column. */
 			if (c == rm->rm_scols)
 				c = 0;
 			rc = &rm->rm_col[c];
 			cvd = vd->vdev_child[rc->rc_devidx];
 			/* One sector, located just past the column's data. */
 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
 			    rc->rc_offset + rc->rc_size, NULL,
 			    1 << tvd->vdev_ashift,
 			    zio->io_type, zio->io_priority,
 			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
 		}
 
 		return (ZIO_PIPELINE_CONTINUE);
 	}
 
 	ASSERT(zio->io_type == ZIO_TYPE_READ);
 
 	/*
 	 * Iterate over the columns in reverse order so that we hit the parity
 	 * last -- any errors along the way will force us to read the parity.
 	 */
 	for (c = rm->rm_cols - 1; c >= 0; c--) {
 		rc = &rm->rm_col[c];
 		cvd = vd->vdev_child[rc->rc_devidx];
 		/* Child cannot be read at all: mark the column missing. */
 		if (!vdev_readable(cvd)) {
 			if (c >= rm->rm_firstdatacol)
 				rm->rm_missingdata++;
 			else
 				rm->rm_missingparity++;
 			rc->rc_error = ENXIO;
 			rc->rc_tried = 1;	/* don't even try */
 			rc->rc_skipped = 1;
 			continue;
 		}
 		/*
 		 * The child's DTL_MISSING log covers this txg: count the
 		 * column as missing with ESTALE.  Unlike the case above,
 		 * rc_tried is left unset.
 		 */
 		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
 			if (c >= rm->rm_firstdatacol)
 				rm->rm_missingdata++;
 			else
 				rm->rm_missingparity++;
 			rc->rc_error = ESTALE;
 			rc->rc_skipped = 1;
 			continue;
 		}
 		/*
 		 * Always read data columns; read parity only if some data is
 		 * already known to be missing or this is a scrub/resilver.
 		 */
 		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
 		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
 			    rc->rc_offset, rc->rc_data, rc->rc_size,
 			    zio->io_type, zio->io_priority, 0,
 			    vdev_raidz_child_done, rc));
 		}
 	}
 
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
 
 /*
  * Account for and report a checksum error on the child vdev that backs the
  * given RAID-Z column.  Nothing is counted or reported for speculative I/O.
  */
 static void
 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
 {
 	vdev_t *child = zio->io_vd->vdev_child[rc->rc_devidx];
 	raidz_map_t *map;
 	zio_bad_cksum_t info;
 
 	if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
 		return;
 
 	map = zio->io_vsd;
 
 	/* Bump the child's checksum error counter under its stat lock. */
 	mutex_enter(&child->vdev_stat_lock);
 	child->vdev_stat.vs_checksum_errors++;
 	mutex_exit(&child->vdev_stat_lock);
 
 	info.zbc_injected = map->rm_ecksuminjected;
 	info.zbc_has_cksum = 0;
 
 	zfs_ereport_post_checksum(zio->io_spa, child, zio, rc->rc_offset,
 	    rc->rc_size, rc->rc_data, bad_data, &info);
 }
 
 /*
  * Verify the checksum of the zio.  If verification fails and the failure was
  * injected, remember that on the raidz map so that any ereports generated
  * later can note it.  Returns the result of zio_checksum_error().
  */
 static int
 raidz_checksum_verify(zio_t *zio)
 {
 	raidz_map_t *map = zio->io_vsd;
 	zio_bad_cksum_t info;
 	int error;
 
 	error = zio_checksum_error(zio, &info);
 	if (error == 0)
 		return (error);
 
 	if (info.zbc_injected != 0)
 		map->rm_ecksuminjected = 1;
 
 	return (error);
 }
 
 /*
  * Regenerate the parity from the data columns.  For every parity column that
  * was tried and read without error, compare the regenerated parity with what
  * was read; on a mismatch fire off a checksum error and mark the column
  * ECKSUM.  Returns the number of such failures.
  */
 static int
 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
 {
 	void *saved[VDEV_RAIDZ_MAXPARITY];
 	raidz_col_t *col;
 	int failures = 0;
 	int p;
 
 	/* Keep a copy of each parity column that was read successfully. */
 	for (p = 0; p < rm->rm_firstdatacol; p++) {
 		col = &rm->rm_col[p];
 		if (col->rc_tried && col->rc_error == 0) {
 			saved[p] = zio_buf_alloc(col->rc_size);
 			bcopy(col->rc_data, saved[p], col->rc_size);
 		}
 	}
 
 	vdev_raidz_generate_parity(rm);
 
 	/* Compare what was read against what was just generated. */
 	for (p = 0; p < rm->rm_firstdatacol; p++) {
 		col = &rm->rm_col[p];
 		if (col->rc_tried && col->rc_error == 0) {
 			if (bcmp(saved[p], col->rc_data,
 			    col->rc_size) != 0) {
 				raidz_checksum_error(zio, col, saved[p]);
 				col->rc_error = ECKSUM;
 				failures++;
 			}
 			zio_buf_free(saved[p], col->rc_size);
 		}
 	}
 
 	return (failures);
 }
 
 /*
  * Keep statistics on all the ways that we used parity to correct data.
  * Counters are indexed by the code returned from vdev_raidz_reconstruct().
  */
 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
 
 /*
  * Fold the per-column errors of a raidz map into a single error by feeding
  * them, one column at a time and starting from 0, through zio_worst_error().
  */
 static int
 vdev_raidz_worst_error(raidz_map_t *rm)
 {
 	int worst = 0;
 	int c = 0;
 
 	while (c < rm->rm_cols) {
 		worst = zio_worst_error(worst, rm->rm_col[c].rc_error);
 		c++;
 	}
 
 	return (worst);
 }
 
 /*
  * Iterate over all combinations of bad data and attempt a reconstruction.
  * Note that the algorithm below is non-optimal because it doesn't take into
  * account how reconstruction is actually performed. For example, with
  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
  * is targeted as invalid as if columns 1 and 4 are targeted since in both
  * cases we'd only use parity information in column 0.
  *
  * Returns the code from vdev_raidz_reconstruct() for the first combination
  * whose reconstruction passes checksum verification, or 0 if none does.  On
  * success the targeted columns are marked ECKSUM (and reported, if they had
  * been tried).
  */
 static int
 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
 {
 	raidz_map_t *rm = zio->io_vsd;
 	raidz_col_t *rc;
 	void *orig[VDEV_RAIDZ_MAXPARITY];
 	/*
 	 * tgts[] is a view into tstore[] offset by one, so that tgts[-1] and
 	 * tgts[n] are valid sentinel slots on either side of the n targets.
 	 */
 	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
 	int *tgts = &tstore[1];
 	int current, next, i, c, n;
 	int code, ret = 0;
 
 	ASSERT(total_errors < rm->rm_firstdatacol);
 
 	/*
 	 * This simplifies one edge condition.
 	 */
 	tgts[-1] = -1;
 
 	/* Try 1 target, then 2, ... up to the number of unused parity columns. */
 	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
 		/*
 		 * Initialize the targets array by finding the first n columns
 		 * that contain no error.
 		 *
 		 * If there were no data errors, we need to ensure that we're
 		 * always explicitly attempting to reconstruct at least one
 		 * data column. To do this, we simply push the highest target
 		 * up into the data columns.
 		 */
 		for (c = 0, i = 0; i < n; i++) {
 			if (i == n - 1 && data_errors == 0 &&
 			    c < rm->rm_firstdatacol) {
 				c = rm->rm_firstdatacol;
 			}
 
 			while (rm->rm_col[c].rc_error != 0) {
 				c++;
 				ASSERT3S(c, <, rm->rm_cols);
 			}
 
 			tgts[i] = c++;
 		}
 
 		/*
 		 * Setting tgts[n] simplifies the other edge condition.
 		 */
 		tgts[n] = rm->rm_cols;
 
 		/*
 		 * These buffers were allocated in previous iterations.
 		 */
 		for (i = 0; i < n - 1; i++) {
 			ASSERT(orig[i] != NULL);
 		}
 
 		/* One new save buffer per outer iteration, sized like column 0. */
 		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
 
 		current = 0;
 		next = tgts[current];
 
 		while (current != n) {
 			tgts[current] = next;
 			current = 0;
 
 			/*
 			 * Save off the original data that we're going to
 			 * attempt to reconstruct.
 			 */
 			for (i = 0; i < n; i++) {
 				ASSERT(orig[i] != NULL);
 				c = tgts[i];
 				ASSERT3S(c, >=, 0);
 				ASSERT3S(c, <, rm->rm_cols);
 				rc = &rm->rm_col[c];
 				bcopy(rc->rc_data, orig[i], rc->rc_size);
 			}
 
 			/*
 			 * Attempt a reconstruction and exit the outer loop on
 			 * success.
 			 */
 			code = vdev_raidz_reconstruct(rm, tgts, n);
 			if (raidz_checksum_verify(zio) == 0) {
 				atomic_inc_64(&raidz_corrected[code]);
 
 				for (i = 0; i < n; i++) {
 					c = tgts[i];
 					rc = &rm->rm_col[c];
 					ASSERT(rc->rc_error == 0);
 					if (rc->rc_tried)
 						raidz_checksum_error(zio, rc,
 						    orig[i]);
 					rc->rc_error = ECKSUM;
 				}
 
 				ret = code;
 				goto done;
 			}
 
 			/*
 			 * Restore the original data.
 			 */
 			for (i = 0; i < n; i++) {
 				c = tgts[i];
 				rc = &rm->rm_col[c];
 				bcopy(orig[i], rc->rc_data, rc->rc_size);
 			}
 
 			/*
 			 * Advance to the next combination of targets; when
 			 * this loop ends with current == n, all combinations
 			 * of n targets have been tried.
 			 */
 			do {
 				/*
 				 * Find the next valid column after the current
 				 * position.
 				 */
 				for (next = tgts[current] + 1;
 				    next < rm->rm_cols &&
 				    rm->rm_col[next].rc_error != 0; next++)
 					continue;
 
 				ASSERT(next <= tgts[current + 1]);
 
 				/*
 				 * If that spot is available, we're done here.
 				 */
 				if (next != tgts[current + 1])
 					break;
 
 				/*
 				 * Otherwise, find the next valid column after
 				 * the previous position.
 				 */
 				for (c = tgts[current - 1] + 1;
 				    rm->rm_col[c].rc_error != 0; c++)
 					continue;
 
 				tgts[current] = c;
 				current++;
 
 			} while (current != n);
 		}
 	}
 	/*
 	 * The outer loop ran to completion, so n is one more than the number
 	 * of buffers that were allocated; adjust it before freeing.
 	 */
 	n--;
 done:
 	/* On the goto path exactly n buffers (orig[0..n-1]) are allocated. */
 	for (i = 0; i < n; i++) {
 		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
 	}
 
 	return (ret);
 }
 
 /*
  * Completion handler for a RAID-Z zio.
  *
  * Per-column errors are tallied first.  A write fails only when more columns
  * failed than there are parity columns; a free needs no further work.  A read
  * goes through the phases described in the body and, if it ends without
  * error on a writeable pool, may issue repair writes to the columns that
  * returned errors.
  */
 static void
 vdev_raidz_io_done(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 	vdev_t *cvd;
 	raidz_map_t *rm = zio->io_vsd;
 	raidz_col_t *rc;
 	int unexpected_errors = 0;
 	int parity_errors = 0;
 	int parity_untried = 0;
 	int data_errors = 0;
 	int total_errors = 0;
 	int n, c;
 	int tgts[VDEV_RAIDZ_MAXPARITY];
 	int code;
 
 	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
 
 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
 
 	/*
 	 * Classify every column: errors are split into parity and data errors,
 	 * errors on columns that were not deliberately skipped count as
 	 * unexpected, and error-free parity columns that were never read are
 	 * counted as untried.
 	 */
 	for (c = 0; c < rm->rm_cols; c++) {
 		rc = &rm->rm_col[c];
 
 		if (rc->rc_error) {
 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
 
 			if (c < rm->rm_firstdatacol)
 				parity_errors++;
 			else
 				data_errors++;
 
 			if (!rc->rc_skipped)
 				unexpected_errors++;
 
 			total_errors++;
 		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
 			parity_untried++;
 		}
 	}
 
 	if (zio->io_type == ZIO_TYPE_WRITE) {
 		/*
 		 * XXX -- for now, treat partial writes as a success.
 		 * (If we couldn't write enough columns to reconstruct
 		 * the data, the I/O failed.  Otherwise, good enough.)
 		 *
 		 * Now that we support write reallocation, it would be better
 		 * to treat partial failure as real failure unless there are
 		 * no non-degraded top-level vdevs left, and not update DTLs
 		 * if we intend to reallocate.
 		 */
 		/* XXPOLICY */
 		if (total_errors > rm->rm_firstdatacol)
 			zio->io_error = vdev_raidz_worst_error(rm);
 
 		return;
 	} else if (zio->io_type == ZIO_TYPE_FREE) {
 		return;
 	}
 
 	ASSERT(zio->io_type == ZIO_TYPE_READ);
 	/*
 	 * There are three potential phases for a read:
 	 *	1. produce valid data from the columns read
 	 *	2. read all disks and try again
 	 *	3. perform combinatorial reconstruction
 	 *
 	 * Each phase is progressively both more expensive and less likely to
 	 * occur. If we encounter more errors than we can repair or all phases
 	 * fail, we have no choice but to return an error.
 	 */
 
 	/*
 	 * If the number of errors we saw was correctable -- less than or equal
 	 * to the number of parity disks read -- attempt to produce data that
 	 * has a valid checksum. Naturally, this case applies in the absence of
 	 * any errors.
 	 */
 	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
 		if (data_errors == 0) {
 			if (raidz_checksum_verify(zio) == 0) {
 				/*
 				 * If we read parity information (unnecessarily
 				 * as it happens since no reconstruction was
 				 * needed) regenerate and verify the parity.
 				 * We also regenerate parity when resilvering
 				 * so we can write it out to the failed device
 				 * later.
 				 */
 				if (parity_errors + parity_untried <
 				    rm->rm_firstdatacol ||
 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
 					n = raidz_parity_verify(zio, rm);
 					unexpected_errors += n;
 					ASSERT(parity_errors + n <=
 					    rm->rm_firstdatacol);
 				}
 				goto done;
 			}
 		} else {
 			/*
 			 * We either attempt to read all the parity columns or
 			 * none of them. If we didn't try to read parity, we
 			 * wouldn't be here in the correctable case. There must
 			 * also have been fewer parity errors than parity
 			 * columns or, again, we wouldn't be in this code path.
 			 */
 			ASSERT(parity_untried == 0);
 			ASSERT(parity_errors < rm->rm_firstdatacol);
 
 			/*
 			 * Identify the data columns that reported an error.
 			 */
 			n = 0;
 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 				rc = &rm->rm_col[c];
 				if (rc->rc_error != 0) {
 					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
 					tgts[n++] = c;
 				}
 			}
 
 			ASSERT(rm->rm_firstdatacol >= n);
 
 			code = vdev_raidz_reconstruct(rm, tgts, n);
 
 			if (raidz_checksum_verify(zio) == 0) {
 				atomic_inc_64(&raidz_corrected[code]);
 
 				/*
 				 * If we read more parity disks than were used
 				 * for reconstruction, confirm that the other
 				 * parity disks produced correct data. This
 				 * routine is suboptimal in that it regenerates
 				 * the parity that we already used in addition
 				 * to the parity that we're attempting to
 				 * verify, but this should be a relatively
 				 * uncommon case, and can be optimized if it
 				 * becomes a problem. Note that we regenerate
 				 * parity when resilvering so we can write it
 				 * out to failed devices later.
 				 */
 				if (parity_errors < rm->rm_firstdatacol - n ||
 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
 					n = raidz_parity_verify(zio, rm);
 					unexpected_errors += n;
 					ASSERT(parity_errors + n <=
 					    rm->rm_firstdatacol);
 				}
 
 				goto done;
 			}
 		}
 	}
 
 	/*
 	 * This isn't a typical situation -- either we got a read error or
 	 * a child silently returned bad data. Read every block so we can
 	 * try again with as much data and parity as we can track down. If
 	 * we've already been through once before, all children will be marked
 	 * as tried so we'll proceed to combinatorial reconstruction.
 	 */
 	unexpected_errors = 1;
 	rm->rm_missingdata = 0;
 	rm->rm_missingparity = 0;
 
 	/*
 	 * Find the first untried column.  If there is one, redo the vdev I/O
 	 * stage, issue reads for it and for every later untried column, and
 	 * return; this function runs again once those reads complete.
 	 */
 	for (c = 0; c < rm->rm_cols; c++) {
 		if (rm->rm_col[c].rc_tried)
 			continue;
 
 		zio_vdev_io_redone(zio);
 		do {
 			rc = &rm->rm_col[c];
 			if (rc->rc_tried)
 				continue;
 			zio_nowait(zio_vdev_child_io(zio, NULL,
 			    vd->vdev_child[rc->rc_devidx],
 			    rc->rc_offset, rc->rc_data, rc->rc_size,
 			    zio->io_type, zio->io_priority, 0,
 			    vdev_raidz_child_done, rc));
 		} while (++c < rm->rm_cols);
 
 		return;
 	}
 
 	/*
 	 * At this point we've attempted to reconstruct the data given the
 	 * errors we detected, and we've attempted to read all columns. There
 	 * must, therefore, be one or more additional problems -- silent errors
 	 * resulting in invalid data rather than explicit I/O errors resulting
 	 * in absent data. We check if there is enough additional data to
 	 * possibly reconstruct the data and then perform combinatorial
 	 * reconstruction over all possible combinations. If that fails,
 	 * we're cooked.
 	 */
 	if (total_errors > rm->rm_firstdatacol) {
 		zio->io_error = vdev_raidz_worst_error(rm);
 
 	} else if (total_errors < rm->rm_firstdatacol &&
 	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
 		/*
 		 * If we didn't use all the available parity for the
 		 * combinatorial reconstruction, verify that the remaining
 		 * parity is correct.
 		 */
 		if (code != (1 << rm->rm_firstdatacol) - 1)
 			(void) raidz_parity_verify(zio, rm);
 	} else {
 		/*
 		 * We're here because either:
 		 *
 		 *	total_errors == rm_first_datacol, or
 		 *	vdev_raidz_combrec() failed
 		 *
 		 * In either case, there is enough bad data to prevent
 		 * reconstruction.
 		 *
 		 * Start checksum ereports for all children which haven't
 		 * failed, and the IO wasn't speculative.
 		 */
 		zio->io_error = ECKSUM;
 
 		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
 			for (c = 0; c < rm->rm_cols; c++) {
 				rc = &rm->rm_col[c];
 				if (rc->rc_error == 0) {
 					zio_bad_cksum_t zbc;
 					zbc.zbc_has_cksum = 0;
 					zbc.zbc_injected =
 					    rm->rm_ecksuminjected;
 
 					/*
 					 * The column index is handed over as
 					 * a pointer-sized opaque value.
 					 */
 					zfs_ereport_start_checksum(
 					    zio->io_spa,
 					    vd->vdev_child[rc->rc_devidx],
 					    zio, rc->rc_offset, rc->rc_size,
 					    (void *)(uintptr_t)c, &zbc);
 				}
 			}
 		}
 	}
 
 done:
 	zio_checksum_verified(zio);
 
 	/*
 	 * Repair only when the read ended without error, the pool is
 	 * writeable, and either something unexpected happened or this is a
 	 * resilver.
 	 */
 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
 	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
 		/*
 		 * Use the good data we have in hand to repair damaged children.
 		 */
 		for (c = 0; c < rm->rm_cols; c++) {
 			rc = &rm->rm_col[c];
 			cvd = vd->vdev_child[rc->rc_devidx];
 
 			if (rc->rc_error == 0)
 				continue;
 
 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
 			    rc->rc_offset, rc->rc_data, rc->rc_size,
 			    ZIO_TYPE_WRITE, zio->io_priority,
 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
 		}
 	}
 }
 
 /*
  * Set the state of a RAID-Z vdev from the number of faulted and degraded
  * children: more faulted children than parity means the vdev cannot be
  * opened; otherwise any faulted or degraded child makes it degraded; with
  * neither it is healthy.
  */
 static void
 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
 {
 	if (faulted > vd->vdev_nparity) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_NO_REPLICAS);
 		return;
 	}
 
 	if (degraded + faulted == 0) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
 		return;
 	}
 
 	vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
 }
 
 /*
  * Operations vector for the RAID-Z vdev type.  The initializer is positional:
  * open, close, asize, io_start, io_done and state_change handlers, followed
  * by two entries that are left NULL, the type name and the leaf flag.
  */
 vdev_ops_t vdev_raidz_ops = {
 	vdev_raidz_open,
 	vdev_raidz_close,
 	vdev_raidz_asize,
 	vdev_raidz_io_start,
 	vdev_raidz_io_done,
 	vdev_raidz_state_change,
 	NULL,
 	NULL,
 	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
 	B_FALSE			/* not a leaf vdev */
 };
Index: user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c
===================================================================
--- user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c	(revision 247191)
+++ user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c	(revision 247192)
@@ -1,872 +1,872 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*
  * The 512-byte leaf is broken into 32 16-byte chunks.
  * Chunk number n means l_chunk[n], even though the header precedes it.
  * The names are stored null-terminated.
  */
 
 #include <sys/zio.h>
 #include <sys/spa.h>
 #include <sys/dmu.h>
 #include <sys/zfs_context.h>
 #include <sys/fs/zfs.h>
 #include <sys/zap.h>
 #include <sys/zap_impl.h>
 #include <sys/zap_leaf.h>
 #include <sys/arc.h>
 
 static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry);
 
 #define	CHAIN_END 0xffff /* end of the chunk chain */
 
 /* half the (current) minimum block size */
 #define	MAX_ARRAY_BYTES (8<<10)
 
 /*
  * Index into the leaf's hash table for hash value h: h is shifted right by
  * (64 - ZAP_LEAF_HASH_SHIFT(l) - lh_prefix_len) bits and the result is masked
  * with (ZAP_LEAF_HASH_NUMENTRIES(l) - 1).
  */
 #define	LEAF_HASH(l, h) \
 	((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \
 	((h) >> (64 - ZAP_LEAF_HASH_SHIFT(l)-(l)->l_phys->l_hdr.lh_prefix_len)))
 
 /* Pointer to the l_hash[] slot that LEAF_HASH() selects for h. */
 #define	LEAF_HASH_ENTPTR(l, h) (&(l)->l_phys->l_hash[LEAF_HASH(l, h)])
 
 
 /*
  * Fill the n bytes starting at a with the value c (converted to char).
  */
 static void
 zap_memset(void *a, int c, size_t n)
 {
 	char *dst = (char *)a;
 	size_t i;
 
 	for (i = 0; i < n; i++)
 		dst[i] = c;
 }
 
 /*
  * Store value at addr as an unsigned integer that is len bytes wide
  * (1, 2, 4 or 8).  Any other width trips an assertion and stores nothing.
  */
 static void
 stv(int len, void *addr, uint64_t value)
 {
 	if (len == 1) {
 		*(uint8_t *)addr = value;
 	} else if (len == 2) {
 		*(uint16_t *)addr = value;
 	} else if (len == 4) {
 		*(uint32_t *)addr = value;
 	} else if (len == 8) {
 		*(uint64_t *)addr = value;
 	} else {
 		ASSERT(!"bad int len");
 	}
 }
 
 /*
  * Load the unsigned integer that is len bytes wide (1, 2, 4 or 8) from addr
  * and return it widened to 64 bits.  Any other width trips an assertion and
  * yields a recognizable poison value.
  */
 static uint64_t
 ldv(int len, const void *addr)
 {
 	if (len == 1)
 		return (*(const uint8_t *)addr);
 	if (len == 2)
 		return (*(const uint16_t *)addr);
 	if (len == 4)
 		return (*(const uint32_t *)addr);
 	if (len == 8)
 		return (*(const uint64_t *)addr);
 
 	ASSERT(!"bad int len");
 	return (0xFEEDFACEDEADBEEFULL);
 }
 
 /*
  * Byteswap a ZAP leaf block in place.
  *
  * buf is the leaf block and size its size in bytes.  A temporary zap_leaf_t
  * is set up on the stack (l_bs derived from size via highbit(), l_phys
  * pointing at buf) purely so that the ZAP_LEAF_* macros can be applied to
  * buf.  The header fields, every hash table entry and every chunk are
  * swapped; chunks are handled according to their type.
  */
 void
 zap_leaf_byteswap(zap_leaf_phys_t *buf, int size)
 {
 	int i;
 	zap_leaf_t l;
 	l.l_bs = highbit(size)-1;
 	l.l_phys = buf;
 
 	buf->l_hdr.lh_block_type = 	BSWAP_64(buf->l_hdr.lh_block_type);
 	buf->l_hdr.lh_prefix = 		BSWAP_64(buf->l_hdr.lh_prefix);
 	buf->l_hdr.lh_magic = 		BSWAP_32(buf->l_hdr.lh_magic);
 	buf->l_hdr.lh_nfree = 		BSWAP_16(buf->l_hdr.lh_nfree);
 	buf->l_hdr.lh_nentries = 	BSWAP_16(buf->l_hdr.lh_nentries);
 	buf->l_hdr.lh_prefix_len = 	BSWAP_16(buf->l_hdr.lh_prefix_len);
 	buf->l_hdr.lh_freelist = 	BSWAP_16(buf->l_hdr.lh_freelist);
 
 	for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++)
 		buf->l_hash[i] = BSWAP_16(buf->l_hash[i]);
 
 	for (i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) {
 		zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i);
 		struct zap_leaf_entry *le;
 
 		/* The chunk type is read through the l_free view of the chunk. */
 		switch (lc->l_free.lf_type) {
 		case ZAP_CHUNK_ENTRY:
 			le = &lc->l_entry;
 
 			le->le_type =		BSWAP_8(le->le_type);
 			le->le_value_intlen =	BSWAP_8(le->le_value_intlen);
 			le->le_next =		BSWAP_16(le->le_next);
 			le->le_name_chunk =	BSWAP_16(le->le_name_chunk);
 			le->le_name_numints =	BSWAP_16(le->le_name_numints);
 			le->le_value_chunk =	BSWAP_16(le->le_value_chunk);
 			le->le_value_numints =	BSWAP_16(le->le_value_numints);
 			le->le_cd =		BSWAP_32(le->le_cd);
 			le->le_hash =		BSWAP_64(le->le_hash);
 			break;
 		case ZAP_CHUNK_FREE:
 			lc->l_free.lf_type =	BSWAP_8(lc->l_free.lf_type);
 			lc->l_free.lf_next =	BSWAP_16(lc->l_free.lf_next);
 			break;
 		case ZAP_CHUNK_ARRAY:
 			lc->l_array.la_type =	BSWAP_8(lc->l_array.la_type);
 			lc->l_array.la_next =	BSWAP_16(lc->l_array.la_next);
 			/* la_array doesn't need swapping */
 			break;
 		default:
 			ASSERT(!"bad leaf type");
 		}
 	}
 }
 
 /*
  * Initialize an empty leaf block: derive l_bs from the dbuf size, clear the
  * header, fill the hash table with CHAIN_END bytes, put every chunk on the
  * free list and stamp the header.  When sort is set, the leaf is flagged as
  * keeping its entries sorted by cd.
  */
 void
 zap_leaf_init(zap_leaf_t *l, boolean_t sort)
 {
 	struct zap_leaf_header *hdr;
 	int c;
 
 	l->l_bs = highbit(l->l_dbuf->db_size)-1;
 
 	hdr = &l->l_phys->l_hdr;
 	zap_memset(hdr, 0, sizeof (struct zap_leaf_header));
 	zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l));
 
 	/*
 	 * Chain the chunks together in index order.  lh_freelist is left at
 	 * the 0 written by the clearing above, so chunk 0 heads the list.
 	 */
 	for (c = 0; c < ZAP_LEAF_NUMCHUNKS(l); c++) {
 		ZAP_LEAF_CHUNK(l, c).l_free.lf_type = ZAP_CHUNK_FREE;
 		ZAP_LEAF_CHUNK(l, c).l_free.lf_next = c+1;
 	}
 	ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)-1).l_free.lf_next = CHAIN_END;
 
 	hdr->lh_block_type = ZBT_LEAF;
 	hdr->lh_magic = ZAP_LEAF_MAGIC;
 	hdr->lh_nfree = ZAP_LEAF_NUMCHUNKS(l);
 	if (sort)
 		hdr->lh_flags |= ZLF_ENTRIES_CDSORTED;
 }
 
 /*
  * Routines which manipulate leaf chunks (l_chunk[]).
  */
 
 /*
  * Take the chunk at the head of the leaf's free list and return its index.
  * The leaf is asserted to have at least one free chunk.
  */
 static uint16_t
 zap_leaf_chunk_alloc(zap_leaf_t *l)
 {
 	struct zap_leaf_header *hdr = &l->l_phys->l_hdr;
 	int head;
 
 	ASSERT(hdr->lh_nfree > 0);
 
 	head = hdr->lh_freelist;
 	ASSERT3U(head, <, ZAP_LEAF_NUMCHUNKS(l));
 	ASSERT3U(ZAP_LEAF_CHUNK(l, head).l_free.lf_type, ==, ZAP_CHUNK_FREE);
 
 	/* Unlink the head; its successor becomes the new head. */
 	hdr->lh_nfree--;
 	hdr->lh_freelist = ZAP_LEAF_CHUNK(l, head).l_free.lf_next;
 
 	return (head);
 }
 
 /*
  * Return a chunk to the leaf: mark it free, zero its padding and push it
  * onto the head of the free list.
  */
 static void
 zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk)
 {
 	struct zap_leaf_header *hdr = &l->l_phys->l_hdr;
 	struct zap_leaf_free *freed = &ZAP_LEAF_CHUNK(l, chunk).l_free;
 
 	ASSERT3U(hdr->lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l));
 	ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
 	ASSERT(freed->lf_type != ZAP_CHUNK_FREE);
 
 	freed->lf_type = ZAP_CHUNK_FREE;
 	freed->lf_next = hdr->lh_freelist;
 	bzero(freed->lf_pad, sizeof (freed->lf_pad)); /* help it to compress */
 
 	hdr->lh_freelist = chunk;
 	hdr->lh_nfree++;
 }
 
 /*
  * Routines which manipulate leaf arrays (zap_leaf_array type chunks).
  */
 
 /*
  * Store num_integers integers, each integer_size bytes wide, from buf into a
  * newly allocated chain of array chunks and return the index of the first
  * chunk.  Each integer is written most-significant byte first.  When
  * num_integers is 0 no chunk is allocated and CHAIN_END is returned.
  */
 static uint16_t
 zap_leaf_array_create(zap_leaf_t *l, const char *buf,
     int integer_size, int num_integers)
 {
 	uint16_t chunk_head;
 	/* Where to store the index of the next chunk that gets allocated. */
 	uint16_t *chunkp = &chunk_head;
 	/* Number of bytes of the current integer already emitted. */
 	int byten = 0;
-	uint64_t value;
+	uint64_t value = 0;
 	/* Shift that brings the integer's most significant byte down to bits 0-7. */
 	int shift = (integer_size-1)*8;
 	int len = num_integers;
 
 	ASSERT3U(num_integers * integer_size, <, MAX_ARRAY_BYTES);
 
 	while (len > 0) {
 		uint16_t chunk = zap_leaf_chunk_alloc(l);
 		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
 		int i;
 
 		la->la_type = ZAP_CHUNK_ARRAY;
 		for (i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) {
 			/* Load the next integer when starting a fresh one. */
 			if (byten == 0)
 				value = ldv(integer_size, buf);
 			la->la_array[i] = value >> shift;
 			value <<= 8;
 			if (++byten == integer_size) {
 				byten = 0;
 				buf += integer_size;
 				if (--len == 0)
 					break;
 			}
 		}
 
 		/* Link this chunk in and make its la_next the next link slot. */
 		*chunkp = chunk;
 		chunkp = &la->la_next;
 	}
 	*chunkp = CHAIN_END;
 
 	return (chunk_head);
 }
 
 /*
  * Free every chunk of the array chain whose head index is stored at chunkp,
  * and reset *chunkp to CHAIN_END.
  */
 static void
 zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp)
 {
 	uint16_t cur, next;
 
 	cur = *chunkp;
 	*chunkp = CHAIN_END;
 
 	for (; cur != CHAIN_END; cur = next) {
 		/* Fetch the link before the chunk is handed back. */
 		next = ZAP_LEAF_CHUNK(l, cur).l_array.la_next;
 		ASSERT3U(ZAP_LEAF_CHUNK(l, cur).l_array.la_type, ==,
 		    ZAP_CHUNK_ARRAY);
 		zap_leaf_chunk_free(l, cur);
 	}
 }
 
 /*
  * Read integers out of the array chain that starts at chunk into buf.
  *
  * The stored array holds array_len integers of array_int_len bytes each;
  * buf has room for buf_len integers of buf_int_len bytes each.  At most
  * MIN(array_len, buf_len) integers are produced (but see the 1-byte fast
  * path below, which copies whole chunks).  Stored integers are assembled
  * most-significant byte first and written to buf with stv().
  *
  * array_len and buf_len are in integers, not bytes.
  */
 static void
 zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk,
     int array_int_len, int array_len, int buf_int_len, uint64_t buf_len,
     void *buf)
 {
 	int len = MIN(array_len, buf_len);
 	int byten = 0;
 	uint64_t value = 0;
 	char *p = buf;
 
 	ASSERT3U(array_int_len, <=, buf_int_len);
 
 	/* Fast path for one 8-byte integer */
 	if (array_int_len == 8 && buf_int_len == 8 && len == 1) {
 		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
 		uint8_t *ip = la->la_array;
 		uint64_t *buf64 = buf;
 
 		*buf64 = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 |
 		    (uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 |
 		    (uint64_t)ip[4] << 24 | (uint64_t)ip[5] << 16 |
 		    (uint64_t)ip[6] << 8 | (uint64_t)ip[7];
 		return;
 	}
 
 	/*
 	 * Fast path for an array of 1-byte integers (eg. the entry name).
 	 * Every chunk in the chain is copied in full, so this path is only
 	 * taken when buf has more than ZAP_LEAF_ARRAY_BYTES of slack beyond
 	 * array_len.
 	 */
 	if (array_int_len == 1 && buf_int_len == 1 &&
 	    buf_len > array_len + ZAP_LEAF_ARRAY_BYTES) {
 		while (chunk != CHAIN_END) {
 			struct zap_leaf_array *la =
 			    &ZAP_LEAF_CHUNK(l, chunk).l_array;
 			bcopy(la->la_array, p, ZAP_LEAF_ARRAY_BYTES);
 			p += ZAP_LEAF_ARRAY_BYTES;
 			chunk = la->la_next;
 		}
 		return;
 	}
 
 	/*
 	 * General path: accumulate bytes into value; every array_int_len
 	 * bytes, store one integer into buf.  byten carries over from one
 	 * chunk to the next; value is not reset between integers.
 	 */
 	while (len > 0) {
 		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
 		int i;
 
 		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
 		for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) {
 			value = (value << 8) | la->la_array[i];
 			byten++;
 			if (byten == array_int_len) {
 				stv(buf_int_len, p, value);
 				byten = 0;
 				len--;
 				if (len == 0)
 					return;
 				p += buf_int_len;
 			}
 		}
 		chunk = la->la_next;
 	}
 }
 
 /*
  * Compare the key stored in the array chain starting at chunk, which is
  * array_numints integers long, with the key described by zn.
  *
  * uint64-keyed zaps read the stored key out and compare it bytewise with
  * zn_key_orig.  For MT_FIRST the stored name is read out and passed to
  * zap_match().  Otherwise the stored bytes are compared chunk by chunk
  * against zn_key_orig without copying them.
  */
 static boolean_t
 zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn,
     int chunk, int array_numints)
 {
 	boolean_t matched;
 	int done;
 
 	if (zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY) {
 		uint64_t *stored;
 		size_t nbytes = array_numints * sizeof (*stored);
 
 		ASSERT(zn->zn_key_intlen == sizeof (*stored));
 		stored = kmem_alloc(nbytes, KM_SLEEP);
 
 		zap_leaf_array_read(l, chunk, sizeof (*stored), array_numints,
 		    sizeof (*stored), array_numints, stored);
 		matched = bcmp(stored, zn->zn_key_orig, nbytes) == 0;
 		kmem_free(stored, nbytes);
 		return (matched);
 	}
 
 	ASSERT(zn->zn_key_intlen == 1);
 	if (zn->zn_matchtype == MT_FIRST) {
 		char *name = kmem_alloc(array_numints, KM_SLEEP);
 
 		zap_leaf_array_read(l, chunk, sizeof (char), array_numints,
 		    sizeof (char), array_numints, name);
 		matched = zap_match(zn, name);
 		kmem_free(name, array_numints);
 		return (matched);
 	}
 
 	/*
 	 * Fast path for exact matching.
 	 * Check the lengths first so that the comparison below never reads
 	 * past the end of the zn_key_orig array.
 	 */
 	if (array_numints != zn->zn_key_orig_numints)
 		return (B_FALSE);
 
 	for (done = 0; done < array_numints; ) {
 		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
 		int n = MIN(array_numints - done, ZAP_LEAF_ARRAY_BYTES);
 
 		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
 		if (bcmp(la->la_array,
 		    (char *)zn->zn_key_orig + done, n) != 0)
 			break;
 		chunk = la->la_next;
 		done += n;
 	}
 	return (done == array_numints);
 }
 
 /*
  * Routines which manipulate leaf entries.
  */
 
 /*
  * Look up the entry for zn in leaf l.  On a match the handle zeh is filled in
  * and 0 is returned; otherwise ENOENT is returned.  For MT_BEST, a failed
  * first pass switches zn->zn_matchtype to MT_FIRST and searches again.
  */
 int
 zap_leaf_lookup(zap_leaf_t *l, zap_name_t *zn, zap_entry_handle_t *zeh)
 {
 	struct zap_leaf_entry *le;
 	uint16_t *chunkp;
 
 	ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
 
 	for (;;) {
 		/* Walk the hash chain selected by the name's hash. */
 		for (chunkp = LEAF_HASH_ENTPTR(l, zn->zn_hash);
 		    *chunkp != CHAIN_END; chunkp = &le->le_next) {
 			uint16_t chunk = *chunkp;
 			le = ZAP_LEAF_ENTRY(l, chunk);
 
 			ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
 			ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
 
 			if (le->le_hash != zn->zn_hash)
 				continue;
 
 			/*
 			 * NB: the entry chain is always sorted by cd on
 			 * normalized zap objects, so this will find the
 			 * lowest-cd match for MT_FIRST.
 			 */
 			ASSERT(zn->zn_matchtype == MT_EXACT ||
 			    (l->l_phys->l_hdr.lh_flags &
 			    ZLF_ENTRIES_CDSORTED));
 			if (!zap_leaf_array_match(l, zn, le->le_name_chunk,
 			    le->le_name_numints))
 				continue;
 
 			zeh->zeh_leaf = l;
 			zeh->zeh_chunkp = chunkp;
 			zeh->zeh_hash = le->le_hash;
 			zeh->zeh_cd = le->le_cd;
 			zeh->zeh_integer_size = le->le_value_intlen;
 			zeh->zeh_num_integers = le->le_value_numints;
 			return (0);
 		}
 
 		/*
 		 * NB: we could of course do this in one pass, but that would
 		 * be a pain.  We'll see if MT_BEST is even used much.
 		 */
 		if (zn->zn_matchtype != MT_BEST)
 			break;
 		zn->zn_matchtype = MT_FIRST;
 	}
 
 	return (ENOENT);
 }
 
 /*
  * Return (h1,cd1 >= h2,cd2): TRUE when the (hash, cd) pair (h1, cd1) orders
  * at or after (h2, cd2), comparing the hash first and the cd second.
  * Every argument is parenthesized in the expansion so that compound
  * expressions group as the caller intended.  Note that h1 and h2 may be
  * evaluated twice.
  */
 #define	HCD_GTEQ(h1, cd1, h2, cd2) \
 	(((h1) > (h2)) ? TRUE : \
 	((((h1) == (h2)) && ((cd1) >= (cd2))) ? TRUE : FALSE))
 
 /*
  * Find the entry in leaf l with the smallest (hash, cd) pair that is greater
  * than or equal to (h, cd), and describe it in zeh.  Returns 0 if such an
  * entry was found and ENOENT otherwise.
  *
  * The handle's chunk pointer is aimed at zeh_fakechunk, a copy of the chunk
  * index kept inside the handle, rather than at the link in the leaf.
  */
 int
 zap_leaf_lookup_closest(zap_leaf_t *l,
     uint64_t h, uint32_t cd, zap_entry_handle_t *zeh)
 {
 	uint16_t chunk;
 	/* Best candidate so far; all-ones means "nothing found yet". */
 	uint64_t besth = -1ULL;
 	uint32_t bestcd = -1U;
 	/* Last hash bucket worth scanning; shrinks as candidates are found. */
 	uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES(l)-1;
 	uint16_t lh;
 	struct zap_leaf_entry *le;
 
 	ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
 
 	/* Scan from the bucket h hashes to, up to the best candidate's bucket. */
 	for (lh = LEAF_HASH(l, h); lh <= bestlh; lh++) {
 		for (chunk = l->l_phys->l_hash[lh];
 		    chunk != CHAIN_END; chunk = le->le_next) {
 			le = ZAP_LEAF_ENTRY(l, chunk);
 
 			ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
 			ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
 
 			/* Candidate: at or after the target, and not after best. */
 			if (HCD_GTEQ(le->le_hash, le->le_cd, h, cd) &&
 			    HCD_GTEQ(besth, bestcd, le->le_hash, le->le_cd)) {
 				ASSERT3U(bestlh, >=, lh);
 				bestlh = lh;
 				besth = le->le_hash;
 				bestcd = le->le_cd;
 
 				zeh->zeh_num_integers = le->le_value_numints;
 				zeh->zeh_integer_size = le->le_value_intlen;
 				zeh->zeh_cd = le->le_cd;
 				zeh->zeh_hash = le->le_hash;
 				zeh->zeh_fakechunk = chunk;
 				zeh->zeh_chunkp = &zeh->zeh_fakechunk;
 				zeh->zeh_leaf = l;
 			}
 		}
 	}
 
 	/* bestcd still holding its initial value means no candidate was seen. */
 	return (bestcd == -1U ? ENOENT : 0);
 }
 
 int
 zap_entry_read(const zap_entry_handle_t *zeh,
     uint8_t integer_size, uint64_t num_integers, void *buf)
 {
 	struct zap_leaf_entry *le =
 	    ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
 	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
 
 	if (le->le_value_intlen > integer_size)
 		return (EINVAL);
 
 	zap_leaf_array_read(zeh->zeh_leaf, le->le_value_chunk,
 	    le->le_value_intlen, le->le_value_numints,
 	    integer_size, num_integers, buf);
 
 	if (zeh->zeh_num_integers > num_integers)
 		return (EOVERFLOW);
 	return (0);
 
 }
 
 /*
  * Read the name (key) of the entry described by zeh into buf, which is
  * buflen bytes long.  Keys are read as 8-byte integers for uint64-keyed zaps
  * and as single bytes otherwise.  Returns EOVERFLOW if le_name_numints
  * exceeds buflen, 0 otherwise; buf is filled in either case.
  */
 int
 zap_entry_read_name(zap_t *zap, const zap_entry_handle_t *zeh, uint16_t buflen,
     char *buf)
 {
 	zap_leaf_t *leaf = zeh->zeh_leaf;
 	struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(leaf, *zeh->zeh_chunkp);
 	int intlen;
 
 	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
 
 	intlen = (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) ? 8 : 1;
 	zap_leaf_array_read(leaf, le->le_name_chunk, intlen,
 	    le->le_name_numints, intlen, buflen / intlen, buf);
 
 	return (le->le_name_numints > buflen ? EOVERFLOW : 0);
 }
 
 /*
  * Replace the value of the entry described by zeh with num_integers integers
  * of integer_size bytes each taken from buf.  Returns EAGAIN, leaving the
  * entry untouched, if the leaf does not have enough free chunks for the
  * change; 0 on success.
  */
 int
 zap_entry_update(zap_entry_handle_t *zeh,
 	uint8_t integer_size, uint64_t num_integers, const void *buf)
 {
 	zap_leaf_t *leaf = zeh->zeh_leaf;
 	struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(leaf, *zeh->zeh_chunkp);
 	int extra;
 
 	/* Chunks the new value needs beyond those the old value occupies. */
 	extra = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) -
 	    ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * le->le_value_intlen);
 
 	if (extra > (int)leaf->l_phys->l_hdr.lh_nfree)
 		return (EAGAIN);
 
 	zap_leaf_array_free(leaf, &le->le_value_chunk);
 	le->le_value_chunk =
 	    zap_leaf_array_create(leaf, buf, integer_size, num_integers);
 	le->le_value_intlen = integer_size;
 	le->le_value_numints = num_integers;
 
 	return (0);
 }
 
 /*
  * Remove the entry described by zeh from its leaf: free its name and value
  * arrays, unlink it from its chain, free the entry chunk itself and
  * decrement the leaf's entry count.  The handle must point at a real link
  * in the leaf, not at its own zeh_fakechunk.
  */
 void
 zap_entry_remove(zap_entry_handle_t *zeh)
 {
 	zap_leaf_t *leaf = zeh->zeh_leaf;
 	struct zap_leaf_entry *le;
 	uint16_t victim;
 
 	ASSERT3P(zeh->zeh_chunkp, !=, &zeh->zeh_fakechunk);
 
 	victim = *zeh->zeh_chunkp;
 	le = ZAP_LEAF_ENTRY(leaf, victim);
 	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
 
 	zap_leaf_array_free(leaf, &le->le_name_chunk);
 	zap_leaf_array_free(leaf, &le->le_value_chunk);
 
 	/* Splice the entry out of its chain before giving the chunk back. */
 	*zeh->zeh_chunkp = le->le_next;
 	zap_leaf_chunk_free(leaf, victim);
 
 	leaf->l_phys->l_hdr.lh_nentries--;
 }
 
 int
 zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd,
     uint8_t integer_size, uint64_t num_integers, const void *buf,
     zap_entry_handle_t *zeh)
 {
 	uint16_t chunk;
 	uint16_t *chunkp;
 	struct zap_leaf_entry *le;
 	uint64_t valuelen;
 	int numchunks;
 	uint64_t h = zn->zn_hash;
 
 	valuelen = integer_size * num_integers;
 
 	numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints *
 	    zn->zn_key_intlen) + ZAP_LEAF_ARRAY_NCHUNKS(valuelen);
 	if (numchunks > ZAP_LEAF_NUMCHUNKS(l))
 		return (E2BIG);
 
 	if (cd == ZAP_NEED_CD) {
 		/* find the lowest unused cd */
 		if (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) {
 			cd = 0;
 
 			for (chunk = *LEAF_HASH_ENTPTR(l, h);
 			    chunk != CHAIN_END; chunk = le->le_next) {
 				le = ZAP_LEAF_ENTRY(l, chunk);
 				if (le->le_cd > cd)
 					break;
 				if (le->le_hash == h) {
 					ASSERT3U(cd, ==, le->le_cd);
 					cd++;
 				}
 			}
 		} else {
 			/* old unsorted format; do it the O(n^2) way */
 			for (cd = 0; ; cd++) {
 				for (chunk = *LEAF_HASH_ENTPTR(l, h);
 				    chunk != CHAIN_END; chunk = le->le_next) {
 					le = ZAP_LEAF_ENTRY(l, chunk);
 					if (le->le_hash == h &&
 					    le->le_cd == cd) {
 						break;
 					}
 				}
 				/* If this cd is not in use, we are good. */
 				if (chunk == CHAIN_END)
 					break;
 			}
 		}
 		/*
 		 * We would run out of space in a block before we could
 		 * store enough entries to run out of CD values.
 		 */
 		ASSERT3U(cd, <, zap_maxcd(zn->zn_zap));
 	}
 
 	if (l->l_phys->l_hdr.lh_nfree < numchunks)
 		return (EAGAIN);
 
 	/* make the entry */
 	chunk = zap_leaf_chunk_alloc(l);
 	le = ZAP_LEAF_ENTRY(l, chunk);
 	le->le_type = ZAP_CHUNK_ENTRY;
 	le->le_name_chunk = zap_leaf_array_create(l, zn->zn_key_orig,
 	    zn->zn_key_intlen, zn->zn_key_orig_numints);
 	le->le_name_numints = zn->zn_key_orig_numints;
 	le->le_value_chunk =
 	    zap_leaf_array_create(l, buf, integer_size, num_integers);
 	le->le_value_numints = num_integers;
 	le->le_value_intlen = integer_size;
 	le->le_hash = h;
 	le->le_cd = cd;
 
 	/* link it into the hash chain */
 	/* XXX if we did the search above, we could just use that */
 	chunkp = zap_leaf_rehash_entry(l, chunk);
 
 	l->l_phys->l_hdr.lh_nentries++;
 
 	zeh->zeh_leaf = l;
 	zeh->zeh_num_integers = num_integers;
 	zeh->zeh_integer_size = le->le_value_intlen;
 	zeh->zeh_cd = le->le_cd;
 	zeh->zeh_hash = le->le_hash;
 	zeh->zeh_chunkp = chunkp;
 
 	return (0);
 }
 
 /*
  * Determine if there is another entry with the same normalized form.
  * For performance purposes, either zn or name must be provided (the
  * other can be NULL).  Note, there usually won't be any hash
  * conflicts, in which case we don't need the concatenated/normalized
  * form of the name.  But all callers have one of these on hand anyway,
  * so might as well take advantage.  A cleaner but slower interface
  * would accept neither argument, and compute the normalized name as
  * needed (using zap_name_alloc(zap_entry_read_name(zeh))).
  */
 boolean_t
 zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn,
     const char *name, zap_t *zap)
 {
 	uint64_t chunk;
 	struct zap_leaf_entry *le;
 	boolean_t allocdzn = B_FALSE;
 
 	if (zap->zap_normflags == 0)
 		return (B_FALSE);
 
 	for (chunk = *LEAF_HASH_ENTPTR(zeh->zeh_leaf, zeh->zeh_hash);
 	    chunk != CHAIN_END; chunk = le->le_next) {
 		le = ZAP_LEAF_ENTRY(zeh->zeh_leaf, chunk);
 		if (le->le_hash != zeh->zeh_hash)
 			continue;
 		if (le->le_cd == zeh->zeh_cd)
 			continue;
 
 		if (zn == NULL) {
 			zn = zap_name_alloc(zap, name, MT_FIRST);
 			allocdzn = B_TRUE;
 		}
 		if (zap_leaf_array_match(zeh->zeh_leaf, zn,
 		    le->le_name_chunk, le->le_name_numints)) {
 			if (allocdzn)
 				zap_name_free(zn);
 			return (B_TRUE);
 		}
 	}
 	if (allocdzn)
 		zap_name_free(zn);
 	return (B_FALSE);
 }
 
 /*
  * Routines for transferring entries between leafs.
  */
 
 static uint16_t *
 zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry)
 {
 	struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry);
 	struct zap_leaf_entry *le2;
 	uint16_t *chunkp;
 
 	/*
 	 * keep the entry chain sorted by cd
 	 * NB: this will not cause problems for unsorted leafs, though
 	 * it is unnecessary there.
 	 */
 	for (chunkp = LEAF_HASH_ENTPTR(l, le->le_hash);
 	    *chunkp != CHAIN_END; chunkp = &le2->le_next) {
 		le2 = ZAP_LEAF_ENTRY(l, *chunkp);
 		if (le2->le_cd > le->le_cd)
 			break;
 	}
 
 	le->le_next = *chunkp;
 	*chunkp = entry;
 	return (chunkp);
 }
 
 static uint16_t
 zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl)
 {
 	uint16_t new_chunk;
 	uint16_t *nchunkp = &new_chunk;
 
 	while (chunk != CHAIN_END) {
 		uint16_t nchunk = zap_leaf_chunk_alloc(nl);
 		struct zap_leaf_array *nla =
 		    &ZAP_LEAF_CHUNK(nl, nchunk).l_array;
 		struct zap_leaf_array *la =
 		    &ZAP_LEAF_CHUNK(l, chunk).l_array;
 		int nextchunk = la->la_next;
 
 		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
 		ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(l));
 
 		*nla = *la; /* structure assignment */
 
 		zap_leaf_chunk_free(l, chunk);
 		chunk = nextchunk;
 		*nchunkp = nchunk;
 		nchunkp = &nla->la_next;
 	}
 	*nchunkp = CHAIN_END;
 	return (new_chunk);
 }
 
 static void
 zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl)
 {
 	struct zap_leaf_entry *le, *nle;
 	uint16_t chunk;
 
 	le = ZAP_LEAF_ENTRY(l, entry);
 	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
 
 	chunk = zap_leaf_chunk_alloc(nl);
 	nle = ZAP_LEAF_ENTRY(nl, chunk);
 	*nle = *le; /* structure assignment */
 
 	(void) zap_leaf_rehash_entry(nl, chunk);
 
 	nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl);
 	nle->le_value_chunk =
 	    zap_leaf_transfer_array(l, le->le_value_chunk, nl);
 
 	zap_leaf_chunk_free(l, entry);
 
 	l->l_phys->l_hdr.lh_nentries--;
 	nl->l_phys->l_hdr.lh_nentries++;
 }
 
 /*
  * Transfer the entries whose hash prefix ends in 1 to the new leaf.
  */
 void
 zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort)
 {
 	int i;
 	int bit = 64 - 1 - l->l_phys->l_hdr.lh_prefix_len;
 
 	/* set new prefix and prefix_len */
 	l->l_phys->l_hdr.lh_prefix <<= 1;
 	l->l_phys->l_hdr.lh_prefix_len++;
 	nl->l_phys->l_hdr.lh_prefix = l->l_phys->l_hdr.lh_prefix | 1;
 	nl->l_phys->l_hdr.lh_prefix_len = l->l_phys->l_hdr.lh_prefix_len;
 
 	/* break existing hash chains */
 	zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l));
 
 	if (sort)
 		l->l_phys->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
 
 	/*
 	 * Transfer entries whose hash bit 'bit' is set to nl; rehash
 	 * the remaining entries
 	 *
 	 * NB: We could find entries via the hashtable instead. That
 	 * would be O(hashents+numents) rather than O(numblks+numents),
 	 * but this accesses memory more sequentially, and when we're
 	 * called, the block is usually pretty full.
 	 */
 	for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
 		struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i);
 		if (le->le_type != ZAP_CHUNK_ENTRY)
 			continue;
 
 		if (le->le_hash & (1ULL << bit))
 			zap_leaf_transfer_entry(l, i, nl);
 		else
 			(void) zap_leaf_rehash_entry(l, i);
 	}
 }
 
 void
 zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs)
 {
 	int i, n;
 
 	n = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift -
 	    l->l_phys->l_hdr.lh_prefix_len;
 	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
 	zs->zs_leafs_with_2n_pointers[n]++;
 
 
 	n = l->l_phys->l_hdr.lh_nentries/5;
 	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
 	zs->zs_blocks_with_n5_entries[n]++;
 
 	n = ((1<<FZAP_BLOCK_SHIFT(zap)) -
 	    l->l_phys->l_hdr.lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 /
 	    (1<<FZAP_BLOCK_SHIFT(zap));
 	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
 	zs->zs_blocks_n_tenths_full[n]++;
 
 	for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) {
 		int nentries = 0;
 		int chunk = l->l_phys->l_hash[i];
 
 		while (chunk != CHAIN_END) {
 			struct zap_leaf_entry *le =
 			    ZAP_LEAF_ENTRY(l, chunk);
 
 			n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_numints) +
 			    ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints *
 			    le->le_value_intlen);
 			n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
 			zs->zs_entries_using_n_chunks[n]++;
 
 			chunk = le->le_next;
 			nentries++;
 		}
 
 		n = nentries;
 		n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
 		zs->zs_buckets_with_n_entries[n]++;
 	}
 }
Index: user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c
===================================================================
--- user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c	(revision 247191)
+++ user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c	(revision 247192)
@@ -1,199 +1,199 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/vfs.h>
 #include <sys/fs/zfs.h>
 #include <sys/zfs_znode.h>
 #include <sys/zfs_sa.h>
 #include <sys/zfs_acl.h>
 
 void
 zfs_oldace_byteswap(ace_t *ace, int ace_cnt)
 {
 	int i;
 
 	for (i = 0; i != ace_cnt; i++, ace++) {
 		ace->a_who = BSWAP_32(ace->a_who);
 		ace->a_access_mask = BSWAP_32(ace->a_access_mask);
 		ace->a_flags = BSWAP_16(ace->a_flags);
 		ace->a_type = BSWAP_16(ace->a_type);
 	}
 }
 
 /*
  * swap ace_t and ace_oject_t
  */
 void
 zfs_ace_byteswap(void *buf, size_t size, boolean_t zfs_layout)
 {
 	caddr_t end;
 	caddr_t ptr;
-	zfs_ace_t *zacep;
+	zfs_ace_t *zacep = NULL;
 	ace_t *acep;
 	uint16_t entry_type;
 	size_t entry_size;
 	int ace_type;
 
 	end = (caddr_t)buf + size;
 	ptr = buf;
 
 	while (ptr < end) {
 		if (zfs_layout) {
 			/*
 			 * Avoid overrun.  Embedded aces can have one
 			 * of several sizes.  We don't know exactly
 			 * how many our present, only the size of the
 			 * buffer containing them.  That size may be
 			 * larger than needed to hold the aces
 			 * present.  As long as we do not do any
 			 * swapping beyond the end of our block we are
 			 * okay.  It it safe to swap any non-ace data
 			 * within the block since it is just zeros.
 			 */
 			if (ptr + sizeof (zfs_ace_hdr_t) > end) {
 				break;
 			}
 			zacep = (zfs_ace_t *)ptr;
 			zacep->z_hdr.z_access_mask =
 			    BSWAP_32(zacep->z_hdr.z_access_mask);
 			zacep->z_hdr.z_flags = BSWAP_16(zacep->z_hdr.z_flags);
 			ace_type = zacep->z_hdr.z_type =
 			    BSWAP_16(zacep->z_hdr.z_type);
 			entry_type = zacep->z_hdr.z_flags & ACE_TYPE_FLAGS;
 		} else {
 			/* Overrun avoidance */
 			if (ptr + sizeof (ace_t) > end) {
 				break;
 			}
 			acep = (ace_t *)ptr;
 			acep->a_access_mask = BSWAP_32(acep->a_access_mask);
 			acep->a_flags = BSWAP_16(acep->a_flags);
 			ace_type = acep->a_type = BSWAP_16(acep->a_type);
 			acep->a_who = BSWAP_32(acep->a_who);
 			entry_type = acep->a_flags & ACE_TYPE_FLAGS;
 		}
 		switch (entry_type) {
 		case ACE_OWNER:
 		case ACE_EVERYONE:
 		case (ACE_IDENTIFIER_GROUP | ACE_GROUP):
 			entry_size = zfs_layout ?
 			    sizeof (zfs_ace_hdr_t) : sizeof (ace_t);
 			break;
 		case ACE_IDENTIFIER_GROUP:
 		default:
 			/* Overrun avoidance */
 			if (zfs_layout) {
 				if (ptr + sizeof (zfs_ace_t) <= end) {
 					zacep->z_fuid = BSWAP_64(zacep->z_fuid);
 				} else {
 					entry_size = sizeof (zfs_ace_t);
 					break;
 				}
 			}
 			switch (ace_type) {
 			case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
 			case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
 			case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
 			case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
 				entry_size = zfs_layout ?
 				    sizeof (zfs_object_ace_t) :
 				    sizeof (ace_object_t);
 				break;
 			default:
 				entry_size = zfs_layout ? sizeof (zfs_ace_t) :
 				    sizeof (ace_t);
 				break;
 			}
 		}
 		ptr = ptr + entry_size;
 	}
 }
 
 /* ARGSUSED */
 void
 zfs_oldacl_byteswap(void *buf, size_t size)
 {
 	int cnt;
 
 	/*
 	 * Arggh, since we don't know how many ACEs are in
 	 * the array, we have to swap the entire block
 	 */
 
 	cnt = size / sizeof (ace_t);
 
 	zfs_oldace_byteswap((ace_t *)buf, cnt);
 }
 
 /* ARGSUSED */
 void
 zfs_acl_byteswap(void *buf, size_t size)
 {
 	zfs_ace_byteswap(buf, size, B_TRUE);
 }
 
 void
 zfs_znode_byteswap(void *buf, size_t size)
 {
 	znode_phys_t *zp = buf;
 
 	ASSERT(size >= sizeof (znode_phys_t));
 
 	zp->zp_crtime[0] = BSWAP_64(zp->zp_crtime[0]);
 	zp->zp_crtime[1] = BSWAP_64(zp->zp_crtime[1]);
 	zp->zp_atime[0] = BSWAP_64(zp->zp_atime[0]);
 	zp->zp_atime[1] = BSWAP_64(zp->zp_atime[1]);
 	zp->zp_mtime[0] = BSWAP_64(zp->zp_mtime[0]);
 	zp->zp_mtime[1] = BSWAP_64(zp->zp_mtime[1]);
 	zp->zp_ctime[0] = BSWAP_64(zp->zp_ctime[0]);
 	zp->zp_ctime[1] = BSWAP_64(zp->zp_ctime[1]);
 	zp->zp_gen = BSWAP_64(zp->zp_gen);
 	zp->zp_mode = BSWAP_64(zp->zp_mode);
 	zp->zp_size = BSWAP_64(zp->zp_size);
 	zp->zp_parent = BSWAP_64(zp->zp_parent);
 	zp->zp_links = BSWAP_64(zp->zp_links);
 	zp->zp_xattr = BSWAP_64(zp->zp_xattr);
 	zp->zp_rdev = BSWAP_64(zp->zp_rdev);
 	zp->zp_flags = BSWAP_64(zp->zp_flags);
 	zp->zp_uid = BSWAP_64(zp->zp_uid);
 	zp->zp_gid = BSWAP_64(zp->zp_gid);
 	zp->zp_zap = BSWAP_64(zp->zp_zap);
 	zp->zp_pad[0] = BSWAP_64(zp->zp_pad[0]);
 	zp->zp_pad[1] = BSWAP_64(zp->zp_pad[1]);
 	zp->zp_pad[2] = BSWAP_64(zp->zp_pad[2]);
 
 	zp->zp_acl.z_acl_extern_obj = BSWAP_64(zp->zp_acl.z_acl_extern_obj);
 	zp->zp_acl.z_acl_size = BSWAP_32(zp->zp_acl.z_acl_size);
 	zp->zp_acl.z_acl_version = BSWAP_16(zp->zp_acl.z_acl_version);
 	zp->zp_acl.z_acl_count = BSWAP_16(zp->zp_acl.z_acl_count);
 	if (zp->zp_acl.z_acl_version == ZFS_ACL_VERSION) {
 		zfs_acl_byteswap((void *)&zp->zp_acl.z_ace_data[0],
 		    ZFS_ACE_SPACE);
 	} else {
 		zfs_oldace_byteswap((ace_t *)&zp->zp_acl.z_ace_data[0],
 		    ACE_SLOT_CNT);
 	}
 }
Index: user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c
===================================================================
--- user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c	(revision 247191)
+++ user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c	(revision 247192)
@@ -1,764 +1,767 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/dmu.h>
 #include <sys/avl.h>
 #include <sys/zap.h>
 #include <sys/refcount.h>
 #include <sys/nvpair.h>
 #ifdef _KERNEL
 #include <sys/kidmap.h>
 #include <sys/sid.h>
 #include <sys/zfs_vfsops.h>
 #include <sys/zfs_znode.h>
 #endif
 #include <sys/zfs_fuid.h>
 
 /*
  * FUID Domain table(s).
  *
  * The FUID table is stored as a packed nvlist of an array
  * of nvlists which contain an index, domain string and offset
  *
  * During file system initialization the nvlist(s) are read and
  * two AVL trees are created.  One tree is keyed by the index number
  * and the other by the domain string.  Nodes are never removed from
  * trees, but new entries may be added.  If a new entry is added then
  * the zfsvfs->z_fuid_dirty flag is set to true and the caller will then
  * be responsible for calling zfs_fuid_sync() to sync the changes to disk.
  *
  */
 
 #define	FUID_IDX	"fuid_idx"
 #define	FUID_DOMAIN	"fuid_domain"
 #define	FUID_OFFSET	"fuid_offset"
 #define	FUID_NVP_ARRAY	"fuid_nvlist"
 
 typedef struct fuid_domain {
 	avl_node_t	f_domnode;
 	avl_node_t	f_idxnode;
 	ksiddomain_t	*f_ksid;
 	uint64_t	f_idx;
 } fuid_domain_t;
 
 static char *nulldomain = "";
 
 /*
  * Compare two indexes.
  */
 static int
 idx_compare(const void *arg1, const void *arg2)
 {
 	const fuid_domain_t *node1 = arg1;
 	const fuid_domain_t *node2 = arg2;
 
 	if (node1->f_idx < node2->f_idx)
 		return (-1);
 	else if (node1->f_idx > node2->f_idx)
 		return (1);
 	return (0);
 }
 
 /*
  * Compare two domain strings.
  */
 static int
 domain_compare(const void *arg1, const void *arg2)
 {
 	const fuid_domain_t *node1 = arg1;
 	const fuid_domain_t *node2 = arg2;
 	int val;
 
 	val = strcmp(node1->f_ksid->kd_name, node2->f_ksid->kd_name);
 	if (val == 0)
 		return (0);
 	return (val > 0 ? 1 : -1);
 }
 
 void
 zfs_fuid_avl_tree_create(avl_tree_t *idx_tree, avl_tree_t *domain_tree)
 {
 	avl_create(idx_tree, idx_compare,
 	    sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_idxnode));
 	avl_create(domain_tree, domain_compare,
 	    sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_domnode));
 }
 
 /*
  * load initial fuid domain and idx trees.  This function is used by
  * both the kernel and zdb.
  */
 uint64_t
 zfs_fuid_table_load(objset_t *os, uint64_t fuid_obj, avl_tree_t *idx_tree,
     avl_tree_t *domain_tree)
 {
 	dmu_buf_t *db;
 	uint64_t fuid_size;
 
 	ASSERT(fuid_obj != 0);
 	VERIFY(0 == dmu_bonus_hold(os, fuid_obj,
 	    FTAG, &db));
 	fuid_size = *(uint64_t *)db->db_data;
 	dmu_buf_rele(db, FTAG);
 
 	if (fuid_size)  {
 		nvlist_t **fuidnvp;
 		nvlist_t *nvp = NULL;
 		uint_t count;
 		char *packed;
 		int i;
 
 		packed = kmem_alloc(fuid_size, KM_SLEEP);
 		VERIFY(dmu_read(os, fuid_obj, 0,
 		    fuid_size, packed, DMU_READ_PREFETCH) == 0);
 		VERIFY(nvlist_unpack(packed, fuid_size,
 		    &nvp, 0) == 0);
 		VERIFY(nvlist_lookup_nvlist_array(nvp, FUID_NVP_ARRAY,
 		    &fuidnvp, &count) == 0);
 
 		for (i = 0; i != count; i++) {
 			fuid_domain_t *domnode;
 			char *domain;
 			uint64_t idx;
 
 			VERIFY(nvlist_lookup_string(fuidnvp[i], FUID_DOMAIN,
 			    &domain) == 0);
 			VERIFY(nvlist_lookup_uint64(fuidnvp[i], FUID_IDX,
 			    &idx) == 0);
 
 			domnode = kmem_alloc(sizeof (fuid_domain_t), KM_SLEEP);
 
 			domnode->f_idx = idx;
 			domnode->f_ksid = ksid_lookupdomain(domain);
 			avl_add(idx_tree, domnode);
 			avl_add(domain_tree, domnode);
 		}
 		nvlist_free(nvp);
 		kmem_free(packed, fuid_size);
 	}
 	return (fuid_size);
 }
 
 void
 zfs_fuid_table_destroy(avl_tree_t *idx_tree, avl_tree_t *domain_tree)
 {
 	fuid_domain_t *domnode;
 	void *cookie;
 
 	cookie = NULL;
 	while (domnode = avl_destroy_nodes(domain_tree, &cookie))
 		ksiddomain_rele(domnode->f_ksid);
 
 	avl_destroy(domain_tree);
 	cookie = NULL;
 	while (domnode = avl_destroy_nodes(idx_tree, &cookie))
 		kmem_free(domnode, sizeof (fuid_domain_t));
 	avl_destroy(idx_tree);
 }
 
 char *
 zfs_fuid_idx_domain(avl_tree_t *idx_tree, uint32_t idx)
 {
 	fuid_domain_t searchnode, *findnode;
 	avl_index_t loc;
 
 	searchnode.f_idx = idx;
 
 	findnode = avl_find(idx_tree, &searchnode, &loc);
 
 	return (findnode ? findnode->f_ksid->kd_name : nulldomain);
 }
 
 #ifdef _KERNEL
 /*
  * Load the fuid table(s) into memory.
  */
 static void
 zfs_fuid_init(zfsvfs_t *zfsvfs)
 {
 	rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER);
 
 	if (zfsvfs->z_fuid_loaded) {
 		rw_exit(&zfsvfs->z_fuid_lock);
 		return;
 	}
 
 	zfs_fuid_avl_tree_create(&zfsvfs->z_fuid_idx, &zfsvfs->z_fuid_domain);
 
 	(void) zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ,
 	    ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj);
 	if (zfsvfs->z_fuid_obj != 0) {
 		zfsvfs->z_fuid_size = zfs_fuid_table_load(zfsvfs->z_os,
 		    zfsvfs->z_fuid_obj, &zfsvfs->z_fuid_idx,
 		    &zfsvfs->z_fuid_domain);
 	}
 
 	zfsvfs->z_fuid_loaded = B_TRUE;
 	rw_exit(&zfsvfs->z_fuid_lock);
 }
 
 /*
  * sync out AVL trees to persistent storage.
  */
 void
 zfs_fuid_sync(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
 {
 	nvlist_t *nvp;
 	nvlist_t **fuids;
 	size_t nvsize = 0;
 	char *packed;
 	dmu_buf_t *db;
 	fuid_domain_t *domnode;
 	int numnodes;
 	int i;
 
 	if (!zfsvfs->z_fuid_dirty) {
 		return;
 	}
 
 	rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER);
 
 	/*
 	 * First see if table needs to be created?
 	 */
 	if (zfsvfs->z_fuid_obj == 0) {
 		zfsvfs->z_fuid_obj = dmu_object_alloc(zfsvfs->z_os,
 		    DMU_OT_FUID, 1 << 14, DMU_OT_FUID_SIZE,
 		    sizeof (uint64_t), tx);
 		VERIFY(zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
 		    ZFS_FUID_TABLES, sizeof (uint64_t), 1,
 		    &zfsvfs->z_fuid_obj, tx) == 0);
 	}
 
 	VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 
 	numnodes = avl_numnodes(&zfsvfs->z_fuid_idx);
 	fuids = kmem_alloc(numnodes * sizeof (void *), KM_SLEEP);
 	for (i = 0, domnode = avl_first(&zfsvfs->z_fuid_domain); domnode; i++,
 	    domnode = AVL_NEXT(&zfsvfs->z_fuid_domain, domnode)) {
 		VERIFY(nvlist_alloc(&fuids[i], NV_UNIQUE_NAME, KM_SLEEP) == 0);
 		VERIFY(nvlist_add_uint64(fuids[i], FUID_IDX,
 		    domnode->f_idx) == 0);
 		VERIFY(nvlist_add_uint64(fuids[i], FUID_OFFSET, 0) == 0);
 		VERIFY(nvlist_add_string(fuids[i], FUID_DOMAIN,
 		    domnode->f_ksid->kd_name) == 0);
 	}
 	VERIFY(nvlist_add_nvlist_array(nvp, FUID_NVP_ARRAY,
 	    fuids, numnodes) == 0);
 	for (i = 0; i != numnodes; i++)
 		nvlist_free(fuids[i]);
 	kmem_free(fuids, numnodes * sizeof (void *));
 	VERIFY(nvlist_size(nvp, &nvsize, NV_ENCODE_XDR) == 0);
 	packed = kmem_alloc(nvsize, KM_SLEEP);
 	VERIFY(nvlist_pack(nvp, &packed, &nvsize,
 	    NV_ENCODE_XDR, KM_SLEEP) == 0);
 	nvlist_free(nvp);
 	zfsvfs->z_fuid_size = nvsize;
 	dmu_write(zfsvfs->z_os, zfsvfs->z_fuid_obj, 0,
 	    zfsvfs->z_fuid_size, packed, tx);
 	kmem_free(packed, zfsvfs->z_fuid_size);
 	VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, zfsvfs->z_fuid_obj,
 	    FTAG, &db));
 	dmu_buf_will_dirty(db, tx);
 	*(uint64_t *)db->db_data = zfsvfs->z_fuid_size;
 	dmu_buf_rele(db, FTAG);
 
 	zfsvfs->z_fuid_dirty = B_FALSE;
 	rw_exit(&zfsvfs->z_fuid_lock);
 }
 
 /*
  * Query domain table for a given domain.
  *
  * If domain isn't found and addok is set, it is added to AVL trees and
  * the zfsvfs->z_fuid_dirty flag will be set to TRUE.  It will then be
  * necessary for the caller or another thread to detect the dirty table
  * and sync out the changes.
  */
 int
 zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain,
     char **retdomain, boolean_t addok)
 {
 	fuid_domain_t searchnode, *findnode;
 	avl_index_t loc;
 	krw_t rw = RW_READER;
 
 	/*
 	 * If the dummy "nobody" domain then return an index of 0
 	 * to cause the created FUID to be a standard POSIX id
 	 * for the user nobody.
 	 */
 	if (domain[0] == '\0') {
 		if (retdomain)
 			*retdomain = nulldomain;
 		return (0);
 	}
 
 	searchnode.f_ksid = ksid_lookupdomain(domain);
 	if (retdomain)
 		*retdomain = searchnode.f_ksid->kd_name;
 	if (!zfsvfs->z_fuid_loaded)
 		zfs_fuid_init(zfsvfs);
 
 retry:
 	rw_enter(&zfsvfs->z_fuid_lock, rw);
 	findnode = avl_find(&zfsvfs->z_fuid_domain, &searchnode, &loc);
 
 	if (findnode) {
 		rw_exit(&zfsvfs->z_fuid_lock);
 		ksiddomain_rele(searchnode.f_ksid);
 		return (findnode->f_idx);
 	} else if (addok) {
 		fuid_domain_t *domnode;
 		uint64_t retidx;
 
 		if (rw == RW_READER && !rw_tryupgrade(&zfsvfs->z_fuid_lock)) {
 			rw_exit(&zfsvfs->z_fuid_lock);
 			rw = RW_WRITER;
 			goto retry;
 		}
 
 		domnode = kmem_alloc(sizeof (fuid_domain_t), KM_SLEEP);
 		domnode->f_ksid = searchnode.f_ksid;
 
 		retidx = domnode->f_idx = avl_numnodes(&zfsvfs->z_fuid_idx) + 1;
 
 		avl_add(&zfsvfs->z_fuid_domain, domnode);
 		avl_add(&zfsvfs->z_fuid_idx, domnode);
 		zfsvfs->z_fuid_dirty = B_TRUE;
 		rw_exit(&zfsvfs->z_fuid_lock);
 		return (retidx);
 	} else {
 		rw_exit(&zfsvfs->z_fuid_lock);
 		return (-1);
 	}
 }
 
 /*
  * Query domain table by index, returning domain string
  *
  * Returns a pointer from an avl node of the domain string.
  *
  */
 const char *
 zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx)
 {
 	char *domain;
 
 	if (idx == 0 || !zfsvfs->z_use_fuids)
 		return (NULL);
 
 	if (!zfsvfs->z_fuid_loaded)
 		zfs_fuid_init(zfsvfs);
 
 	rw_enter(&zfsvfs->z_fuid_lock, RW_READER);
 
 	if (zfsvfs->z_fuid_obj || zfsvfs->z_fuid_dirty)
 		domain = zfs_fuid_idx_domain(&zfsvfs->z_fuid_idx, idx);
 	else
 		domain = nulldomain;
 	rw_exit(&zfsvfs->z_fuid_lock);
 
 	ASSERT(domain);
 	return (domain);
 }
 
 void
 zfs_fuid_map_ids(znode_t *zp, cred_t *cr, uid_t *uidp, uid_t *gidp)
 {
 	*uidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER);
 	*gidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_gid, cr, ZFS_GROUP);
 }
 
 uid_t
 zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64_t fuid,
     cred_t *cr, zfs_fuid_type_t type)
 {
 	uint32_t index = FUID_INDEX(fuid);
 	const char *domain;
 	uid_t id;
 
 	if (index == 0)
 		return (fuid);
 
 	domain = zfs_fuid_find_by_idx(zfsvfs, index);
 	ASSERT(domain != NULL);
 
 #ifdef sun
 	if (type == ZFS_OWNER || type == ZFS_ACE_USER) {
 		(void) kidmap_getuidbysid(crgetzone(cr), domain,
 		    FUID_RID(fuid), &id);
 	} else {
 		(void) kidmap_getgidbysid(crgetzone(cr), domain,
 		    FUID_RID(fuid), &id);
 	}
 #else	/* !sun */
 	id = UID_NOBODY;
 #endif	/* !sun */
 	return (id);
 }
 
 /*
  * Add a FUID node to the list of fuid's being created for this
  * ACL
  *
  * If ACL has multiple domains, then keep only one copy of each unique
  * domain.
  */
 void
 zfs_fuid_node_add(zfs_fuid_info_t **fuidpp, const char *domain, uint32_t rid,
     uint64_t idx, uint64_t id, zfs_fuid_type_t type)
 {
 	zfs_fuid_t *fuid;
 	zfs_fuid_domain_t *fuid_domain;
 	zfs_fuid_info_t *fuidp;
 	uint64_t fuididx;
 	boolean_t found = B_FALSE;
 
 	if (*fuidpp == NULL)
 		*fuidpp = zfs_fuid_info_alloc();
 
 	fuidp = *fuidpp;
 	/*
 	 * First find fuid domain index in linked list
 	 *
 	 * If one isn't found then create an entry.
 	 */
 
 	for (fuididx = 1, fuid_domain = list_head(&fuidp->z_domains);
 	    fuid_domain; fuid_domain = list_next(&fuidp->z_domains,
 	    fuid_domain), fuididx++) {
 		if (idx == fuid_domain->z_domidx) {
 			found = B_TRUE;
 			break;
 		}
 	}
 
 	if (!found) {
 		fuid_domain = kmem_alloc(sizeof (zfs_fuid_domain_t), KM_SLEEP);
 		fuid_domain->z_domain = domain;
 		fuid_domain->z_domidx = idx;
 		list_insert_tail(&fuidp->z_domains, fuid_domain);
 		fuidp->z_domain_str_sz += strlen(domain) + 1;
 		fuidp->z_domain_cnt++;
 	}
 
 	if (type == ZFS_ACE_USER || type == ZFS_ACE_GROUP) {
 
 		/*
 		 * Now allocate fuid entry and add it on the end of the list
 		 */
 
 		fuid = kmem_alloc(sizeof (zfs_fuid_t), KM_SLEEP);
 		fuid->z_id = id;
 		fuid->z_domidx = idx;
 		fuid->z_logfuid = FUID_ENCODE(fuididx, rid);
 
 		list_insert_tail(&fuidp->z_fuids, fuid);
 		fuidp->z_fuid_cnt++;
 	} else {
 		if (type == ZFS_OWNER)
 			fuidp->z_fuid_owner = FUID_ENCODE(fuididx, rid);
 		else
 			fuidp->z_fuid_group = FUID_ENCODE(fuididx, rid);
 	}
 }
 
 /*
  * Create a file system FUID, based on information in the users cred
  *
  * If cred contains KSID_OWNER then it should be used to determine
  * the uid otherwise cred's uid will be used. By default cred's gid
  * is used unless it's an ephemeral ID in which case KSID_GROUP will
  * be used if it exists.
  */
 uint64_t
 zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type,
     cred_t *cr, zfs_fuid_info_t **fuidp)
 {
 	uint64_t	idx;
 	ksid_t		*ksid;
 	uint32_t	rid;
 	char 		*kdomain;
 	const char	*domain;
 	uid_t		id;
 
 	VERIFY(type == ZFS_OWNER || type == ZFS_GROUP);
 
 	ksid = crgetsid(cr, (type == ZFS_OWNER) ? KSID_OWNER : KSID_GROUP);
 
 	if (!zfsvfs->z_use_fuids || (ksid == NULL)) {
 		id = (type == ZFS_OWNER) ? crgetuid(cr) : crgetgid(cr);
 
 		if (IS_EPHEMERAL(id))
 			return ((type == ZFS_OWNER) ? UID_NOBODY : GID_NOBODY);
 
 		return ((uint64_t)id);
 	}
 
 	/*
 	 * ksid is present and FUID is supported
 	 */
 	id = (type == ZFS_OWNER) ? ksid_getid(ksid) : crgetgid(cr);
 
 	if (!IS_EPHEMERAL(id))
 		return ((uint64_t)id);
 
 	if (type == ZFS_GROUP)
 		id = ksid_getid(ksid);
 
 	rid = ksid_getrid(ksid);
 	domain = ksid_getdomain(ksid);
 
 	idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, B_TRUE);
 
 	zfs_fuid_node_add(fuidp, kdomain, rid, idx, id, type);
 
 	return (FUID_ENCODE(idx, rid));
 }
 
 /*
  * Create a file system FUID for an ACL ace
  * or a chown/chgrp of the file.
  * This is similar to zfs_fuid_create_cred, except that
  * we can't find the domain + rid information in the
  * cred.  Instead we have to query Winchester for the
  * domain and rid.
  *
  * During replay operations the domain+rid information is
  * found in the zfs_fuid_info_t that the replay code has
  * attached to the zfsvfs of the file system.
  */
 uint64_t
 zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr,
     zfs_fuid_type_t type, zfs_fuid_info_t **fuidpp)
 {
 	const char *domain;
 	char *kdomain;
 	uint32_t fuid_idx = FUID_INDEX(id);
 	uint32_t rid;
 	idmap_stat status;
-	uint64_t idx;
+	uint64_t idx = 0;
 	zfs_fuid_t *zfuid = NULL;
-	zfs_fuid_info_t *fuidp;
+	zfs_fuid_info_t *fuidp = NULL;
 
 	/*
 	 * If POSIX ID, or entry is already a FUID then
 	 * just return the id
 	 *
 	 * We may also be handed an already FUID'ized id via
 	 * chmod.
 	 */
 
 	if (!zfsvfs->z_use_fuids || !IS_EPHEMERAL(id) || fuid_idx != 0)
 		return (id);
 
 	if (zfsvfs->z_replay) {
 		fuidp = zfsvfs->z_fuid_replay;
 
 		/*
 		 * If we are passed an ephemeral id, but no
 		 * fuid_info was logged then return NOBODY.
 		 * This is most likely a result of idmap service
 		 * not being available.
 		 */
 		if (fuidp == NULL)
 			return (UID_NOBODY);
 
+		VERIFY3U(type, >=, ZFS_OWNER);
+		VERIFY3U(type, <=, ZFS_ACE_GROUP);
+
 		switch (type) {
 		case ZFS_ACE_USER:
 		case ZFS_ACE_GROUP:
 			zfuid = list_head(&fuidp->z_fuids);
 			rid = FUID_RID(zfuid->z_logfuid);
 			idx = FUID_INDEX(zfuid->z_logfuid);
 			break;
 		case ZFS_OWNER:
 			rid = FUID_RID(fuidp->z_fuid_owner);
 			idx = FUID_INDEX(fuidp->z_fuid_owner);
 			break;
 		case ZFS_GROUP:
 			rid = FUID_RID(fuidp->z_fuid_group);
 			idx = FUID_INDEX(fuidp->z_fuid_group);
 			break;
 		};
-		domain = fuidp->z_domain_table[idx -1];
+		domain = fuidp->z_domain_table[idx - 1];
 	} else {
 		if (type == ZFS_OWNER || type == ZFS_ACE_USER)
 			status = kidmap_getsidbyuid(crgetzone(cr), id,
 			    &domain, &rid);
 		else
 			status = kidmap_getsidbygid(crgetzone(cr), id,
 			    &domain, &rid);
 
 		if (status != 0) {
 			/*
 			 * When returning nobody we will need to
 			 * make a dummy fuid table entry for logging
 			 * purposes.
 			 */
 			rid = UID_NOBODY;
 			domain = nulldomain;
 		}
 	}
 
 	idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, B_TRUE);
 
 	if (!zfsvfs->z_replay)
 		zfs_fuid_node_add(fuidpp, kdomain,
 		    rid, idx, id, type);
 	else if (zfuid != NULL) {
 		list_remove(&fuidp->z_fuids, zfuid);
 		kmem_free(zfuid, sizeof (zfs_fuid_t));
 	}
 	return (FUID_ENCODE(idx, rid));
 }
 
 void
 zfs_fuid_destroy(zfsvfs_t *zfsvfs)
 {
 	rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER);
 	if (!zfsvfs->z_fuid_loaded) {
 		rw_exit(&zfsvfs->z_fuid_lock);
 		return;
 	}
 	zfs_fuid_table_destroy(&zfsvfs->z_fuid_idx, &zfsvfs->z_fuid_domain);
 	rw_exit(&zfsvfs->z_fuid_lock);
 }
 
 /*
  * Allocate zfs_fuid_info for tracking FUIDs created during
  * zfs_mknode, VOP_SETATTR() or VOP_SETSECATTR()
  */
 zfs_fuid_info_t *
 zfs_fuid_info_alloc(void)
 {
 	zfs_fuid_info_t *fuidp;
 
 	fuidp = kmem_zalloc(sizeof (zfs_fuid_info_t), KM_SLEEP);
 	list_create(&fuidp->z_domains, sizeof (zfs_fuid_domain_t),
 	    offsetof(zfs_fuid_domain_t, z_next));
 	list_create(&fuidp->z_fuids, sizeof (zfs_fuid_t),
 	    offsetof(zfs_fuid_t, z_next));
 	return (fuidp);
 }
 
 /*
  * Release all memory associated with zfs_fuid_info_t
  */
 void
 zfs_fuid_info_free(zfs_fuid_info_t *fuidp)
 {
 	zfs_fuid_t *zfuid;
 	zfs_fuid_domain_t *zdomain;
 
 	while ((zfuid = list_head(&fuidp->z_fuids)) != NULL) {
 		list_remove(&fuidp->z_fuids, zfuid);
 		kmem_free(zfuid, sizeof (zfs_fuid_t));
 	}
 
 	if (fuidp->z_domain_table != NULL)
 		kmem_free(fuidp->z_domain_table,
 		    (sizeof (char **)) * fuidp->z_domain_cnt);
 
 	while ((zdomain = list_head(&fuidp->z_domains)) != NULL) {
 		list_remove(&fuidp->z_domains, zdomain);
 		kmem_free(zdomain, sizeof (zfs_fuid_domain_t));
 	}
 
 	kmem_free(fuidp, sizeof (zfs_fuid_info_t));
 }
 
 /*
  * Check to see if id is a groupmember.  If cred
  * has ksid info then sidlist is checked first
  * and if still not found then POSIX groups are checked
  *
  * Will use a straight FUID compare when possible.
  */
 boolean_t
 zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr)
 {
 #ifdef sun
 	ksid_t		*ksid = crgetsid(cr, KSID_GROUP);
 	ksidlist_t	*ksidlist = crgetsidlist(cr);
 #endif	/* !sun */
 	uid_t		gid;
 
 #ifdef sun
 	if (ksid && ksidlist) {
 		int 		i;
 		ksid_t		*ksid_groups;
 		uint32_t	idx = FUID_INDEX(id);
 		uint32_t	rid = FUID_RID(id);
 
 		ksid_groups = ksidlist->ksl_sids;
 
 		for (i = 0; i != ksidlist->ksl_nsid; i++) {
 			if (idx == 0) {
 				if (id != IDMAP_WK_CREATOR_GROUP_GID &&
 				    id == ksid_groups[i].ks_id) {
 					return (B_TRUE);
 				}
 			} else {
 				const char *domain;
 
 				domain = zfs_fuid_find_by_idx(zfsvfs, idx);
 				ASSERT(domain != NULL);
 
 				if (strcmp(domain,
 				    IDMAP_WK_CREATOR_SID_AUTHORITY) == 0)
 					return (B_FALSE);
 
 				if ((strcmp(domain,
 				    ksid_groups[i].ks_domain->kd_name) == 0) &&
 				    rid == ksid_groups[i].ks_rid)
 					return (B_TRUE);
 			}
 		}
 	}
 #endif	/* !sun */
 
 	/*
 	 * Not found in ksidlist, check posix groups
 	 */
 	gid = zfs_fuid_map_id(zfsvfs, id, cr, ZFS_GROUP);
 	return (groupmember(gid, cr));
 }
 
 void
 zfs_fuid_txhold(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
 {
 	if (zfsvfs->z_fuid_obj == 0) {
 		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
 		    FUID_SIZE_ESTIMATE(zfsvfs));
 		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
 	} else {
 		dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
 		dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
 		    FUID_SIZE_ESTIMATE(zfsvfs));
 	}
 }
 #endif
Index: user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
===================================================================
--- user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c	(revision 247191)
+++ user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c	(revision 247192)
@@ -1,681 +1,680 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sysmacros.h>
 #include <sys/cmn_err.h>
 #include <sys/kmem.h>
 #include <sys/file.h>
 #include <sys/vfs.h>
 #include <sys/zfs_znode.h>
 #include <sys/zfs_dir.h>
 #include <sys/zil.h>
 #include <sys/zil_impl.h>
 #include <sys/byteorder.h>
 #include <sys/policy.h>
 #include <sys/stat.h>
 #include <sys/acl.h>
 #include <sys/dmu.h>
 #include <sys/spa.h>
 #include <sys/zfs_fuid.h>
 #include <sys/dsl_dataset.h>
 
 /*
  * These zfs_log_* functions must be called within a dmu tx, in one
  * of 2 contexts depending on zilog->z_replay:
  *
  * Non replay mode
  * ---------------
  * We need to record the transaction so that if it is committed to
  * the Intent Log then it can be replayed.  An intent log transaction
  * structure (itx_t) is allocated and all the information necessary to
  * possibly replay the transaction is saved in it. The itx is then assigned
  * a sequence number and inserted in the in-memory list anchored in the zilog.
  *
  * Replay mode
  * -----------
  * We need to mark the intent log record as replayed in the log header.
  * This is done in the same transaction as the replay so that they
  * commit atomically.
  */
 
 /*
  * Pick the intent-log transaction type for a create operation.
  *
  * type  - kind of object being created (file, directory, xattr directory)
  * vsecp - ACL supplied with the create, or NULL
  * vap   - attributes; AT_XVATTR in va_mask selects the "_ATTR" variants
  *
  * Returns one of the TX_* create/mkdir types.  Supplying both an ACL and
  * extended attributes panics unless TODO is defined.  An unknown "type"
  * trips ASSERT(0) and yields TX_MAX_TYPE.
  */
 int
 zfs_log_create_txtype(zil_create_t type, vsecattr_t *vsecp, vattr_t *vap)
 {
 	int has_xvattr = (vap->va_mask & AT_XVATTR);
 
 	switch (type) {
 	case Z_FILE:
 		if (vsecp == NULL)
 			return (has_xvattr ? TX_CREATE_ATTR : TX_CREATE);
 		if (has_xvattr) {
 #ifdef TODO
 			return (TX_CREATE_ACL_ATTR);
 #else
 			panic("%s:%u: unsupported condition", __func__, __LINE__);
 #endif
 		}
 		return (TX_CREATE_ACL);
 		/*NOTREACHED*/
 	case Z_DIR:
 		if (vsecp == NULL)
 			return (has_xvattr ? TX_MKDIR_ATTR : TX_MKDIR);
 		if (has_xvattr) {
 #ifdef TODO
 			return (TX_MKDIR_ACL_ATTR);
 #else
 			panic("%s:%u: unsupported condition", __func__, __LINE__);
 #endif
 		}
 		return (TX_MKDIR_ACL);
 	case Z_XATTRDIR:
 		return (TX_MKXATTR);
 	}
 	ASSERT(0);
 	return (TX_MAX_TYPE);
 }
 
 /*
  * build up the log data necessary for logging xvattr_t
  * First lr_attr_t is initialized.  following the lr_attr_t
  * is the mapsize and attribute bitmap copied from the xvattr_t.
  * Following the bitmap and bitmapsize two 64 bit words are reserved
  * for the create time which may be set.  Following the create time
  * records a single 64 bit integer which has the bits to set on
  * replay for the xvattr.
  */
 static void
 zfs_log_xvattr(lr_attr_t *lrattr, xvattr_t *xvap)
 {
 	uint32_t	*bitmap;
 	uint64_t	*attrs;
 	uint64_t	*crtime;
 	xoptattr_t	*xoap;
 	void		*scanstamp;
 	int		i;
 
 	/*
 	 * Serialize the optional attributes of "xvap" into the log record
 	 * area starting at "lrattr".  The layout written below is:
 	 *   lr_attr_masksize, xva_mapsize 32-bit words of request bitmap,
 	 *   one 64-bit word of XAT0_* flag bits, two 64-bit words of create
 	 *   time, then the AV scanstamp bytes.
 	 * The caller must have sized the record accordingly.
 	 */
 	xoap = xva_getxoptattr(xvap);
 	ASSERT(xoap);
 
 	/* Copy the request bitmap; "bitmap" ends up just past its last word. */
 	lrattr->lr_attr_masksize = xvap->xva_mapsize;
 	bitmap = &lrattr->lr_attr_bitmap;
 	for (i = 0; i != xvap->xva_mapsize; i++, bitmap++) {
 		*bitmap = xvap->xva_reqattrmap[i];
 	}
 
 	/* Now pack the attributes up in a single uint64_t */
 	attrs = (uint64_t *)bitmap;
 	crtime = attrs + 1;
 	scanstamp = (caddr_t)(crtime + 2);
 	*attrs = 0;
 
 	/*
 	 * For every requested boolean attribute, set the matching XAT0_*
 	 * bit only when the attribute's value is non-zero.
 	 */
 	if (XVA_ISSET_REQ(xvap, XAT_READONLY))
 		*attrs |= (xoap->xoa_readonly == 0) ? 0 :
 		    XAT0_READONLY;
 	if (XVA_ISSET_REQ(xvap, XAT_HIDDEN))
 		*attrs |= (xoap->xoa_hidden == 0) ? 0 :
 		    XAT0_HIDDEN;
 	if (XVA_ISSET_REQ(xvap, XAT_SYSTEM))
 		*attrs |= (xoap->xoa_system == 0) ? 0 :
 		    XAT0_SYSTEM;
 	if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE))
 		*attrs |= (xoap->xoa_archive == 0) ? 0 :
 		    XAT0_ARCHIVE;
 	if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE))
 		*attrs |= (xoap->xoa_immutable == 0) ? 0 :
 		    XAT0_IMMUTABLE;
 	if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK))
 		*attrs |= (xoap->xoa_nounlink == 0) ? 0 :
 		    XAT0_NOUNLINK;
 	if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY))
 		*attrs |= (xoap->xoa_appendonly == 0) ? 0 :
 		    XAT0_APPENDONLY;
 	/*
 	 * The opaque attribute has its own bit.  Logging it as
 	 * XAT0_APPENDONLY (as this code used to) loses the opaque flag and
 	 * can record a bogus append-only bit instead.
 	 */
 	if (XVA_ISSET_REQ(xvap, XAT_OPAQUE))
 		*attrs |= (xoap->xoa_opaque == 0) ? 0 :
 		    XAT0_OPAQUE;
 	if (XVA_ISSET_REQ(xvap, XAT_NODUMP))
 		*attrs |= (xoap->xoa_nodump == 0) ? 0 :
 		    XAT0_NODUMP;
 	if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED))
 		*attrs |= (xoap->xoa_av_quarantined == 0) ? 0 :
 		    XAT0_AV_QUARANTINED;
 	if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED))
 		*attrs |= (xoap->xoa_av_modified == 0) ? 0 :
 		    XAT0_AV_MODIFIED;
 	/* Non-boolean attributes go into their dedicated slots. */
 	if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
 		ZFS_TIME_ENCODE(&xoap->xoa_createtime, crtime);
 	if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
 		bcopy(xoap->xoa_av_scanstamp, scanstamp, AV_SCANSTAMP_SZ);
 	if (XVA_ISSET_REQ(xvap, XAT_REPARSE))
 		*attrs |= (xoap->xoa_reparse == 0) ? 0 :
 		    XAT0_REPARSE;
 	if (XVA_ISSET_REQ(xvap, XAT_OFFLINE))
 		*attrs |= (xoap->xoa_offline == 0) ? 0 :
 		    XAT0_OFFLINE;
 	if (XVA_ISSET_REQ(xvap, XAT_SPARSE))
 		*attrs |= (xoap->xoa_sparse == 0) ? 0 :
 		    XAT0_SPARSE;
 }
 
 /*
  * Write the z_logfuid of every entry on fuidp->z_fuids, in list order, as
  * consecutive 64-bit words beginning at "start".
  *
  * Returns the address just past the last word written ("start" itself when
  * the list is empty).
  */
 static void *
 zfs_log_fuid_ids(zfs_fuid_info_t *fuidp, void *start)
 {
 	uint64_t *out = start;
 	zfs_fuid_t *cur = list_head(&fuidp->z_fuids);
 
 	/* First copy in the ACE FUIDs */
 	while (cur != NULL) {
 		*out = cur->z_logfuid;
 		out++;
 		cur = list_next(&fuidp->z_fuids, cur);
 	}
 	return (out);
 }
 
 
 /*
  * Append every domain string on fuidp->z_domains, each including its
  * terminating NUL, back to back starting at "start".
  *
  * Nothing is written when fuidp->z_domain_str_sz is zero.  Returns the
  * address just past the last byte written.
  */
 static void *
 zfs_log_fuid_domains(zfs_fuid_info_t *fuidp, void *start)
 {
 	zfs_fuid_domain_t *dom;
 	caddr_t out = start;
 	size_t nbytes;
 
 	if (fuidp->z_domain_str_sz == 0)
 		return (start);
 
 	/* now copy in the domain info */
 	for (dom = list_head(&fuidp->z_domains); dom != NULL;
 	    dom = list_next(&fuidp->z_domains, dom)) {
 		nbytes = strlen(dom->z_domain) + 1;
 		bcopy((void *)dom->z_domain, out, nbytes);
 		out += nbytes;
 	}
 	return (out);
 }
 
 /*
  * zfs_log_create() is used to handle TX_CREATE, TX_CREATE_ATTR, TX_MKDIR,
  * TX_MKDIR_ATTR and TX_MKXATTR
  * transactions.
  *
  * TX_CREATE and TX_MKDIR are standard creates, but they may have FUID
  * domain information appended prior to the name.  In this case the
  * uid/gid in the log record will be a log centric FUID.
  *
  * TX_CREATE_ACL_ATTR and TX_MKDIR_ACL_ATTR handle special creates that
  * may contain attributes, ACL and optional fuid information.
  *
  * TX_CREATE_ACL and TX_MKDIR_ACL handle special creates that specify
  * an ACL and normal users/groups in the ACEs.
  *
  * There may be an optional xvattr attribute information similar
  * to zfs_log_setattr.
  *
  * Also, after the file name "domain" strings may be appended.
  */
 void
 zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
     znode_t *dzp, znode_t *zp, char *name, vsecattr_t *vsecp,
     zfs_fuid_info_t *fuidp, vattr_t *vap)
 {
 	itx_t *itx;
 	lr_create_t *lr;
 	lr_acl_create_t *lracl;
-	size_t aclsize;
+	size_t aclsize = (vsecp != NULL) ? vsecp->vsa_aclentsz : 0;
 	size_t xvatsize = 0;
 	size_t txsize;
 	xvattr_t *xvap = (xvattr_t *)vap;
 	void *end;
 	size_t lrsize;
 	size_t namesize = strlen(name) + 1;
 	size_t fuidsz = 0;
 
 	/* Nothing is logged while the intent log itself is being replayed. */
 	if (zil_replaying(zilog, tx))
 		return;
 
 	/*
 	 * If we have FUIDs present then add in space for
 	 * domains and ACE fuid's if any.
 	 */
 	if (fuidp) {
 		fuidsz += fuidp->z_domain_str_sz;
 		fuidsz += fuidp->z_fuid_cnt * sizeof (uint64_t);
 	}
 
 	if (vap->va_mask & AT_XVATTR)
 		xvatsize = ZIL_XVAT_SIZE(xvap->xva_mapsize);
 
 	/*
 	 * The non-ACL transaction types use the plain lr_create_t header;
 	 * every other type uses lr_acl_create_t and reserves room for the
 	 * ACEs.
 	 */
 	if ((int)txtype == TX_CREATE_ATTR || (int)txtype == TX_MKDIR_ATTR ||
 	    (int)txtype == TX_CREATE || (int)txtype == TX_MKDIR ||
 	    (int)txtype == TX_MKXATTR) {
 		txsize = sizeof (*lr) + namesize + fuidsz + xvatsize;
 		lrsize = sizeof (*lr);
 	} else {
-		aclsize = (vsecp) ? vsecp->vsa_aclentsz : 0;
 		txsize =
 		    sizeof (lr_acl_create_t) + namesize + fuidsz +
 		    ZIL_ACE_LENGTH(aclsize) + xvatsize;
 		lrsize = sizeof (lr_acl_create_t);
 	}
 
 	itx = zil_itx_create(txtype, txsize);
 
 	lr = (lr_create_t *)&itx->itx_lr;
 	lr->lr_doid = dzp->z_id;
 	lr->lr_foid = zp->z_id;
 	lr->lr_mode = zp->z_mode;
 	/*
 	 * Ephemeral ids are replaced by the FUIDs carried in fuidp.
 	 * NOTE(review): fuidp is dereferenced here without a NULL check;
 	 * presumably callers always pass it when an id is ephemeral - confirm.
 	 */
 	if (!IS_EPHEMERAL(zp->z_uid)) {
 		lr->lr_uid = (uint64_t)zp->z_uid;
 	} else {
 		lr->lr_uid = fuidp->z_fuid_owner;
 	}
 	if (!IS_EPHEMERAL(zp->z_gid)) {
 		lr->lr_gid = (uint64_t)zp->z_gid;
 	} else {
 		lr->lr_gid = fuidp->z_fuid_group;
 	}
 	/* Failures of these two lookups are deliberately ignored. */
 	(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zp->z_zfsvfs), &lr->lr_gen,
 	    sizeof (uint64_t));
 	(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs),
 	    lr->lr_crtime, sizeof (uint64_t) * 2);
 
 	/* A failed rdev lookup is logged as rdev 0. */
 	if (sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(zp->z_zfsvfs), &lr->lr_rdev,
 	    sizeof (lr->lr_rdev)) != 0)
 		lr->lr_rdev = 0;
 
 	/*
 	 * Fill in xvattr info if any
 	 */
 	if (vap->va_mask & AT_XVATTR) {
 		zfs_log_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), xvap);
 		end = (caddr_t)lr + lrsize + xvatsize;
 	} else {
 		end = (caddr_t)lr + lrsize;
 	}
 
 	/* Now fill in any ACL info */
 
 	if (vsecp) {
 		lracl = (lr_acl_create_t *)&itx->itx_lr;
 		lracl->lr_aclcnt = vsecp->vsa_aclcnt;
 		lracl->lr_acl_bytes = aclsize;
 		lracl->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0;
 		lracl->lr_fuidcnt  = fuidp ? fuidp->z_fuid_cnt : 0;
 		/*
 		 * NOTE(review): zfs_log_acl() tests vsa_mask for
 		 * VSA_ACE_ACLFLAGS, while this tests vsa_aclflags itself -
 		 * confirm which field is intended.
 		 */
 		if (vsecp->vsa_aclflags & VSA_ACE_ACLFLAGS)
 			lracl->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags;
 		else
 			lracl->lr_acl_flags = 0;
 
 		/* Copy aclsize bytes of ACEs, advance by ZIL_ACE_LENGTH(aclsize). */
 		bcopy(vsecp->vsa_aclentp, end, aclsize);
 		end = (caddr_t)end + ZIL_ACE_LENGTH(aclsize);
 	}
 
 	/* drop in FUID info */
 	if (fuidp) {
 		end = zfs_log_fuid_ids(fuidp, end);
 		end = zfs_log_fuid_domains(fuidp, end);
 	}
 	/*
 	 * Now place file name in log record
 	 */
 	bcopy(name, end, namesize);
 
 	zil_itx_assign(zilog, itx, tx);
 }
 
 /*
  * zfs_log_remove() handles both TX_REMOVE and TX_RMDIR transactions.
  */
 /*
  * Log the removal of "name" from directory "dzp".  The record is an
  * lr_remove_t immediately followed by the NUL-terminated name; "foid" is
  * stored in the itx as its object id.  Does nothing during log replay.
  */
 void
 zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
 	znode_t *dzp, char *name, uint64_t foid)
 {
 	size_t name_bytes = strlen(name) + 1;	/* includes the NUL */
 	lr_remove_t *rec;
 	itx_t *entry;
 
 	if (zil_replaying(zilog, tx))
 		return;
 
 	entry = zil_itx_create(txtype, sizeof (lr_remove_t) + name_bytes);
 	entry->itx_oid = foid;
 
 	rec = (lr_remove_t *)&entry->itx_lr;
 	rec->lr_doid = dzp->z_id;
 	bcopy(name, (char *)&rec[1], name_bytes);
 
 	zil_itx_assign(zilog, entry, tx);
 }
 
 /*
  * zfs_log_link() handles TX_LINK transactions.
  */
 /*
  * Log the creation of link "name" in directory "dzp" that refers to "zp".
  * The record is an lr_link_t immediately followed by the NUL-terminated
  * name.  Does nothing during log replay.
  */
 void
 zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
 	znode_t *dzp, znode_t *zp, char *name)
 {
 	size_t name_bytes = strlen(name) + 1;	/* includes the NUL */
 	lr_link_t *rec;
 	itx_t *entry;
 
 	if (zil_replaying(zilog, tx))
 		return;
 
 	entry = zil_itx_create(txtype, sizeof (lr_link_t) + name_bytes);
 
 	rec = (lr_link_t *)&entry->itx_lr;
 	rec->lr_link_obj = zp->z_id;
 	rec->lr_doid = dzp->z_id;
 	bcopy(name, (char *)&rec[1], name_bytes);
 
 	zil_itx_assign(zilog, entry, tx);
 }
 
 /*
  * zfs_log_symlink() handles TX_SYMLINK transactions.
  */
 /*
  * Log the creation of symlink "name" in directory "dzp" with target "link".
  * The record is an lr_create_t followed by the NUL-terminated name and then
  * the NUL-terminated link target.  Does nothing during log replay.
  */
 void
 zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
     znode_t *dzp, znode_t *zp, char *name, char *link)
 {
 	size_t name_bytes = strlen(name) + 1;
 	size_t link_bytes = strlen(link) + 1;
 	lr_create_t *rec;
 	itx_t *entry;
 	char *payload;
 
 	if (zil_replaying(zilog, tx))
 		return;
 
 	entry = zil_itx_create(txtype,
 	    sizeof (lr_create_t) + name_bytes + link_bytes);
 	rec = (lr_create_t *)&entry->itx_lr;
 
 	/* Identity and ownership of the new object. */
 	rec->lr_doid = dzp->z_id;
 	rec->lr_foid = zp->z_id;
 	rec->lr_uid = zp->z_uid;
 	rec->lr_gid = zp->z_gid;
 	rec->lr_mode = zp->z_mode;
 
 	/* Generation and creation time; lookup failures are ignored. */
 	(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zp->z_zfsvfs), &rec->lr_gen,
 	    sizeof (uint64_t));
 	(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs),
 	    rec->lr_crtime, sizeof (uint64_t) * 2);
 
 	/* Name first, link target right behind it. */
 	payload = (char *)(rec + 1);
 	bcopy(name, payload, name_bytes);
 	bcopy(link, payload + name_bytes, link_bytes);
 
 	zil_itx_assign(zilog, entry, tx);
 }
 
 /*
  * zfs_log_rename() handles TX_RENAME transactions.
  */
 /*
  * Log the rename of "sname" in directory "sdzp" to "dname" in directory
  * "tdzp".  The record is an lr_rename_t followed by the NUL-terminated
  * source name and then the NUL-terminated destination name; the id of the
  * renamed object "szp" is stored in the itx.  Does nothing during replay.
  */
 void
 zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
 	znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp)
 {
 	size_t src_bytes = strlen(sname) + 1;
 	size_t dst_bytes = strlen(dname) + 1;
 	lr_rename_t *rec;
 	itx_t *entry;
 	char *payload;
 
 	if (zil_replaying(zilog, tx))
 		return;
 
 	entry = zil_itx_create(txtype,
 	    sizeof (lr_rename_t) + src_bytes + dst_bytes);
 	entry->itx_oid = szp->z_id;
 
 	rec = (lr_rename_t *)&entry->itx_lr;
 	rec->lr_sdoid = sdzp->z_id;
 	rec->lr_tdoid = tdzp->z_id;
 
 	payload = (char *)(rec + 1);
 	bcopy(sname, payload, src_bytes);
 	bcopy(dname, payload + src_bytes, dst_bytes);
 
 	zil_itx_assign(zilog, entry, tx);
 }
 
 /*
  * zfs_log_write() handles TX_WRITE transactions.
  */
 /*
  * Size threshold (bytes) used by zfs_log_write(): a write larger than this
  * may be logged as WR_INDIRECT.  Ignored (treated as 0) when the log bias
  * is ZFS_LOGBIAS_THROUGHPUT.
  */
 ssize_t zfs_immediate_write_sz = 32768;
 
 /*
  * Log a write of "resid" bytes at offset "off" of "zp", splitting it into
  * several itxs if necessary.  Does nothing during replay or when the file
  * has been unlinked.  "ioflag" is checked for FSYNC/FDSYNC.
  */
 void
 zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
 	znode_t *zp, offset_t off, ssize_t resid, int ioflag)
 {
 	itx_wr_state_t write_state;
 	boolean_t slogging;
 	uintptr_t fsync_cnt;
 	ssize_t immediate_write_sz;
 
 	if (zil_replaying(zilog, tx) || zp->z_unlinked)
 		return;
 
 	immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
 	    ? 0 : zfs_immediate_write_sz;
 
 	/*
 	 * Choose how the data is logged:
 	 *  - WR_INDIRECT: larger than the threshold, fits in one file block,
 	 *    and not "slogging" (pool has slogs and bias is latency);
 	 *  - WR_COPIED: otherwise, if the write is synchronous;
 	 *  - WR_NEED_COPY: everything else.
 	 */
 	slogging = spa_has_slogs(zilog->zl_spa) &&
 	    (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
 	if (resid > immediate_write_sz && !slogging && resid <= zp->z_blksz)
 		write_state = WR_INDIRECT;
 	else if (ioflag & (FSYNC | FDSYNC))
 		write_state = WR_COPIED;
 	else
 		write_state = WR_NEED_COPY;
 
 	/*
 	 * Consume one unit of the per-thread fsync counter, if non-zero.
 	 * fsync_cnt keeps the value from before the decrement and is
 	 * consulted below when deciding itx_sync.
 	 */
 	if ((fsync_cnt = (uintptr_t)tsd_get(zfs_fsyncer_key)) != 0) {
 		(void) tsd_set(zfs_fsyncer_key, (void *)(fsync_cnt - 1));
 	}
 
 	while (resid) {
 		itx_t *itx;
 		lr_write_t *lr;
 		ssize_t len;
 
 		/*
 		 * If the write would overflow the largest block then split it.
 		 */
 		if (write_state != WR_INDIRECT && resid > ZIL_MAX_LOG_DATA)
 			len = SPA_MAXBLOCKSIZE >> 1;
 		else
 			len = resid;
 
 		/* Only WR_COPIED records carry the data inline. */
 		itx = zil_itx_create(txtype, sizeof (*lr) +
 		    (write_state == WR_COPIED ? len : 0));
 		lr = (lr_write_t *)&itx->itx_lr;
 		/*
 		 * For WR_COPIED, read the data into the space after the
 		 * header.  If that read fails, replace the itx by a
 		 * header-only one and fall back to WR_NEED_COPY; the fallback
 		 * also applies to all remaining chunks of this write.
 		 */
 		if (write_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os,
 		    zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
 			zil_itx_destroy(itx);
 			itx = zil_itx_create(txtype, sizeof (*lr));
 			lr = (lr_write_t *)&itx->itx_lr;
 			write_state = WR_NEED_COPY;
 		}
 
 		itx->itx_wr_state = write_state;
 		if (write_state == WR_NEED_COPY)
 			itx->itx_sod += len;
 		lr->lr_foid = zp->z_id;
 		lr->lr_offset = off;
 		lr->lr_length = len;
 		lr->lr_blkoff = 0;
 		BP_ZERO(&lr->lr_blkptr);
 
 		itx->itx_private = zp->z_zfsvfs;
 
 		/*
 		 * Mark the itx asynchronous only when the write is not
 		 * synchronous, the znode has no sync count, and this thread's
 		 * fsync counter was zero.
 		 */
 		if (!(ioflag & (FSYNC | FDSYNC)) && (zp->z_sync_cnt == 0) &&
 		    (fsync_cnt == 0))
 			itx->itx_sync = B_FALSE;
 
 		zil_itx_assign(zilog, itx, tx);
 
 		off += len;
 		resid -= len;
 	}
 }
 
 /*
  * zfs_log_truncate() handles TX_TRUNCATE transactions.
  */
 /*
  * Log a truncate/free of "len" bytes at offset "off" of "zp" as a
  * fixed-size lr_truncate_t record.  Does nothing during replay or when
  * the file has been unlinked.  The itx is synchronous exactly when the
  * znode's z_sync_cnt is non-zero.
  */
 void
 zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
 	znode_t *zp, uint64_t off, uint64_t len)
 {
 	lr_truncate_t *rec;
 	itx_t *entry;
 
 	if (zil_replaying(zilog, tx))
 		return;
 	if (zp->z_unlinked)
 		return;
 
 	entry = zil_itx_create(txtype, sizeof (lr_truncate_t));
 
 	rec = (lr_truncate_t *)&entry->itx_lr;
 	rec->lr_foid = zp->z_id;
 	rec->lr_offset = off;
 	rec->lr_length = len;
 
 	entry->itx_sync = (zp->z_sync_cnt != 0);
 	zil_itx_assign(zilog, entry, tx);
 }
 
 /*
  * zfs_log_setattr() handles TX_SETATTR transactions.
  */
 /*
  * Log an attribute change on "zp".
  *
  * vap          - the attribute values to record
  * mask_applied - stored in lr_mask; also decides whether an ephemeral
  *                uid/gid is replaced by the FUID from fuidp
  * fuidp        - optional FUID info; its domain strings are appended
  *
  * Record layout: lr_setattr_t, then (if AT_XVATTR is set) the xvattr
  * block, then the FUID domain strings.  Does nothing during replay or
  * when the file has been unlinked.
  */
 void
 zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
 	znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp)
 {
 	itx_t		*itx;
 	lr_setattr_t	*lr;
 	xvattr_t	*xvap = (xvattr_t *)vap;
 	size_t		recsize = sizeof (lr_setattr_t);
 	void		*start;
 
 	if (zil_replaying(zilog, tx) || zp->z_unlinked)
 		return;
 
 	/*
 	 * If XVATTR set, then log record size needs to allow
 	 * for lr_attr_t + xvattr mask, mapsize and create time
 	 * plus actual attribute values
 	 */
 	if (vap->va_mask & AT_XVATTR)
 		recsize = sizeof (*lr) + ZIL_XVAT_SIZE(xvap->xva_mapsize);
 
 	if (fuidp)
 		recsize += fuidp->z_domain_str_sz;
 
 	itx = zil_itx_create(txtype, recsize);
 	lr = (lr_setattr_t *)&itx->itx_lr;
 	lr->lr_foid = zp->z_id;
 	lr->lr_mask = (uint64_t)mask_applied;
 	lr->lr_mode = (uint64_t)vap->va_mode;
 	/*
 	 * NOTE(review): fuidp is dereferenced without a NULL check when an
 	 * applied uid/gid is ephemeral; presumably callers always supply it
 	 * in that case - confirm.
 	 */
 	if ((mask_applied & AT_UID) && IS_EPHEMERAL(vap->va_uid))
 		lr->lr_uid = fuidp->z_fuid_owner;
 	else
 		lr->lr_uid = (uint64_t)vap->va_uid;
 
 	if ((mask_applied & AT_GID) && IS_EPHEMERAL(vap->va_gid))
 		lr->lr_gid = fuidp->z_fuid_group;
 	else
 		lr->lr_gid = (uint64_t)vap->va_gid;
 
 	lr->lr_size = (uint64_t)vap->va_size;
 	ZFS_TIME_ENCODE(&vap->va_atime, lr->lr_atime);
 	ZFS_TIME_ENCODE(&vap->va_mtime, lr->lr_mtime);
 	/* "start" tracks the next free byte after the fixed header. */
 	start = (lr_setattr_t *)(lr + 1);
 	if (vap->va_mask & AT_XVATTR) {
 		zfs_log_xvattr((lr_attr_t *)start, xvap);
 		start = (caddr_t)start + ZIL_XVAT_SIZE(xvap->xva_mapsize);
 	}
 
 	/*
 	 * Now stick on domain information if any on end
 	 */
 
 	if (fuidp)
 		(void) zfs_log_fuid_domains(fuidp, start);
 
 	/* Synchronous exactly when the znode's z_sync_cnt is non-zero. */
 	itx->itx_sync = (zp->z_sync_cnt != 0);
 	zil_itx_assign(zilog, itx, tx);
 }
 
 /*
  * zfs_log_acl() handles TX_ACL transactions.
  */
 /*
  * Log an ACL change on "zp".
  *
  * Filesystems with z_version below ZPL_VERSION_FUID get a TX_ACL_V0
  * record (lr_acl_v0_t followed by the raw ACEs).  All others get a TX_ACL
  * record: lr_acl_t, the ACEs padded to ZIL_ACE_LENGTH(), the FUIDs and
  * then the FUID domain strings.  Does nothing during replay or when the
  * file has been unlinked.
  */
 void
 zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
     vsecattr_t *vsecp, zfs_fuid_info_t *fuidp)
 {
 	itx_t *itx;
 	lr_acl_v0_t *lrv0;
 	lr_acl_t *lr;
 	int txtype;
 	int lrsize;
 	size_t txsize;
 	size_t aclbytes = vsecp->vsa_aclentsz;
 
 	if (zil_replaying(zilog, tx) || zp->z_unlinked)
 		return;
 
 	txtype = (zp->z_zfsvfs->z_version < ZPL_VERSION_FUID) ?
 	    TX_ACL_V0 : TX_ACL;
 
 	if (txtype == TX_ACL)
 		lrsize = sizeof (*lr);
 	else
 		lrsize = sizeof (*lrv0);
 
 	/*
 	 * NOTE(review): the FUID terms are added for TX_ACL_V0 as well, even
 	 * though the V0 branch below never writes FUID data - presumably
 	 * fuidp is NULL/empty for old-version filesystems; confirm.
 	 */
 	txsize = lrsize +
 	    ((txtype == TX_ACL) ? ZIL_ACE_LENGTH(aclbytes) : aclbytes) +
 	    (fuidp ? fuidp->z_domain_str_sz : 0) +
 	    sizeof (uint64_t) * (fuidp ? fuidp->z_fuid_cnt : 0);
 
 	itx = zil_itx_create(txtype, txsize);
 
 	lr = (lr_acl_t *)&itx->itx_lr;
 	lr->lr_foid = zp->z_id;
 	/* These fields are only written for the TX_ACL format. */
 	if (txtype == TX_ACL) {
 		lr->lr_acl_bytes = aclbytes;
 		lr->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0;
 		lr->lr_fuidcnt = fuidp ? fuidp->z_fuid_cnt : 0;
 		if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS)
 			lr->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags;
 		else
 			lr->lr_acl_flags = 0;
 	}
 	/*
 	 * NOTE(review): lr_foid and lr_aclcnt are stored through the lr_acl_t
 	 * view for both formats; this assumes both structs place them at the
 	 * same offsets - confirm against the struct definitions.
 	 */
 	lr->lr_aclcnt = (uint64_t)vsecp->vsa_aclcnt;
 
 	if (txtype == TX_ACL_V0) {
 		lrv0 = (lr_acl_v0_t *)lr;
 		bcopy(vsecp->vsa_aclentp, (ace_t *)(lrv0 + 1), aclbytes);
 	} else {
 		void *start = (ace_t *)(lr + 1);
 
 		bcopy(vsecp->vsa_aclentp, start, aclbytes);
 
 		/* Skip the padded ACE area before appending FUID data. */
 		start = (caddr_t)start + ZIL_ACE_LENGTH(aclbytes);
 
 		if (fuidp) {
 			start = zfs_log_fuid_ids(fuidp, start);
 			(void) zfs_log_fuid_domains(fuidp, start);
 		}
 	}
 
 	/* Synchronous exactly when the znode's z_sync_cnt is non-zero. */
 	itx->itx_sync = (zp->z_sync_cnt != 0);
 	zil_itx_assign(zilog, itx, tx);
 }
Index: user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c
===================================================================
--- user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c	(revision 247191)
+++ user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c	(revision 247192)
@@ -1,605 +1,605 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 /*
  * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 /*
  * This file contains the code to implement file range locking in
  * ZFS, although there isn't much specific to ZFS (all that comes to mind is
  * support for growing the blocksize).
  *
  * Interface
  * ---------
  * Defined in zfs_rlock.h but essentially:
  *	rl = zfs_range_lock(zp, off, len, lock_type);
  *	zfs_range_unlock(rl);
  *	zfs_range_reduce(rl, off, len);
  *
  * AVL tree
  * --------
  * An AVL tree is used to maintain the state of the existing ranges
  * that are locked for exclusive (writer) or shared (reader) use.
  * The starting range offset is used for searching and sorting the tree.
  *
  * Common case
  * -----------
  * The (hopefully) usual case is of no overlaps or contention for
  * locks. On entry to zfs_range_lock() a rl_t is allocated; the tree
  * searched that finds no overlap, and *this* rl_t is placed in the tree.
  *
  * Overlaps/Reference counting/Proxy locks
  * ---------------------------------------
  * The avl code only allows one node at a particular offset. Also it's very
  * inefficient to search through all previous entries looking for overlaps
  * (because the very 1st in the ordered list might be at offset 0 but
  * cover the whole file).
  * So this implementation uses reference counts and proxy range locks.
  * Firstly, only reader locks use reference counts and proxy locks,
  * because writer locks are exclusive.
  * When a reader lock overlaps with another then a proxy lock is created
  * for that range and replaces the original lock. If the overlap
  * is exact then the reference count of the proxy is simply incremented.
  * Otherwise, the proxy lock is split into smaller lock ranges and
  * new proxy locks created for non overlapping ranges.
  * The reference counts are adjusted accordingly.
  * Meanwhile, the original lock is kept around (this is the caller's handle)
  * and its offset and length are used when releasing the lock.
  *
  * Thread coordination
  * -------------------
  * In order to make wakeups efficient and to ensure multiple continuous
  * readers on a range don't starve a writer for the same range lock,
  * two condition variables are allocated in each rl_t.
  * If a writer (or reader) can't get a range it initialises the writer
  * (or reader) cv; sets a flag saying there's a writer (or reader) waiting;
  * and waits on that cv. When a thread unlocks that range it wakes up all
  * writers then all readers before destroying the lock.
  *
  * Append mode writes
  * ------------------
  * Append mode writes need to lock a range at the end of a file.
  * The offset of the end of the file is determined under the
  * range locking mutex, and the lock type converted from RL_APPEND to
  * RL_WRITER and the range locked.
  *
  * Grow block handling
  * -------------------
  * ZFS supports multiple block sizes currently up to 128K. The smallest
  * block size is used for the file which is grown as needed. During this
  * growth all other writers and readers must be excluded.
  * So if the block size needs to be grown then the whole file is
  * exclusively locked, then later the caller will reduce the lock
  * range to just the range to be written using zfs_range_reduce().
  */
 
 #include <sys/zfs_rlock.h>
 
 /*
  * Check if a write lock can be grabbed, or wait and recheck until available.
  */
 /*
  * Insert "new" into zp's range tree as a writer lock, sleeping and
  * retrying for as long as any existing lock overlaps the requested range.
  * It sleeps with cv_wait() on zp->z_range_lock, so the caller must hold
  * that mutex.  On return new->r_type is RL_WRITER, and r_off/r_len may
  * have been changed from the requested values (append or whole-file lock).
  */
 static void
 zfs_range_lock_writer(znode_t *zp, rl_t *new)
 {
 	avl_tree_t *tree = &zp->z_range_avl;
 	rl_t *rl;
 	avl_index_t where;
 	uint64_t end_size;
 	/* Requested range, restored after every wait before retrying. */
 	uint64_t off = new->r_off;
 	uint64_t len = new->r_len;
 
 	for (;;) {
 		/*
 		 * Range locking is also used by zvol and uses a
 		 * dummied up znode. However, for zvol, we don't need to
 		 * append or grow blocksize, and besides we don't have
 		 * a "sa" data or z_zfsvfs - so skip that processing.
 		 *
 		 * Yes, this is ugly, and would be solved by not handling
 		 * grow or append in range lock code. If that was done then
 		 * we could make the range locking code generically available
 		 * to other non-zfs consumers.
 		 */
 		if (zp->z_vnode) { /* caller is ZPL */
 			/*
 			 * If in append mode pick up the current end of file.
 			 * This is done under z_range_lock to avoid races.
 			 */
 			if (new->r_type == RL_APPEND)
 				new->r_off = zp->z_size;
 
 			/*
 			 * If we need to grow the block size then grab the whole
 			 * file range. This is also done under z_range_lock to
 			 * avoid races.
 			 */
 			end_size = MAX(zp->z_size, new->r_off + len);
 			if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
 			    zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) {
 				new->r_off = 0;
 				new->r_len = UINT64_MAX;
 			}
 		}
 
 		/*
 		 * First check for the usual case of no locks
 		 */
 		if (avl_numnodes(tree) == 0) {
 			new->r_type = RL_WRITER; /* convert to writer */
 			avl_add(tree, new);
 			return;
 		}
 
 		/*
 		 * Look for any locks in the range.
 		 */
 		rl = avl_find(tree, new, &where);
 		if (rl)
 			goto wait; /* already locked at same offset */
 
 		/* Does the next lock start before our range ends? */
 		rl = (rl_t *)avl_nearest(tree, where, AVL_AFTER);
 		if (rl && (rl->r_off < new->r_off + new->r_len))
 			goto wait;
 
 		/* Does the previous lock extend past our start offset? */
 		rl = (rl_t *)avl_nearest(tree, where, AVL_BEFORE);
 		if (rl && rl->r_off + rl->r_len > new->r_off)
 			goto wait;
 
 		new->r_type = RL_WRITER; /* convert possible RL_APPEND */
 		avl_insert(tree, new, where);
 		return;
 wait:
 		/*
 		 * "rl" is the conflicting lock.  Its writer cv is initialised
 		 * on first use, then we sleep on it until woken.
 		 */
 		if (!rl->r_write_wanted) {
 			cv_init(&rl->r_wr_cv, NULL, CV_DEFAULT, NULL);
 			rl->r_write_wanted = B_TRUE;
 		}
 		cv_wait(&rl->r_wr_cv, &zp->z_range_lock);
 
 		/* reset to original */
 		new->r_off = off;
 		new->r_len = len;
 	}
 }
 
 /*
  * If this is an original (non-proxy) lock then replace it by
  * a proxy and return the proxy.
  */
 /*
  * Return a proxy lock standing in for "rl" in "tree".
  *
  * A lock that is already a proxy is returned unchanged.  Otherwise "rl"
  * is taken out of the tree with its count zeroed, and a newly allocated
  * reader proxy covering the same range (count 1) is added and returned.
  */
 static rl_t *
 zfs_range_proxify(avl_tree_t *tree, rl_t *rl)
 {
 	rl_t *px;
 
 	if (rl->r_proxy)
 		return (rl); /* already a proxy */
 
 	ASSERT3U(rl->r_cnt, ==, 1);
 	ASSERT(rl->r_write_wanted == B_FALSE);
 	ASSERT(rl->r_read_wanted == B_FALSE);
 
 	/* Detach the original; its range is now represented by the proxy. */
 	avl_remove(tree, rl);
 	rl->r_cnt = 0;
 
 	/* create a proxy range lock */
 	px = kmem_alloc(sizeof (rl_t), KM_SLEEP);
 	px->r_type = RL_READER;
 	px->r_proxy = B_TRUE;
 	px->r_off = rl->r_off;
 	px->r_len = rl->r_len;
 	px->r_cnt = 1;
 	px->r_read_wanted = B_FALSE;
 	px->r_write_wanted = B_FALSE;
 	avl_add(tree, px);
 
 	return (px);
 }
 
 /*
  * Split the range lock at the supplied offset
  * returning the *front* proxy.
  */
 /*
  * "off" must lie strictly inside rl's range (asserted).  The part from
  * "off" to the end of the range becomes a new "rear" proxy carrying rl's
  * reference count; the part before "off" is the (possibly newly created)
  * "front" proxy, which is returned.
  */
 static rl_t *
 zfs_range_split(avl_tree_t *tree, rl_t *rl, uint64_t off)
 {
 	rl_t *front, *rear;
 
 	ASSERT3U(rl->r_len, >, 1);
 	ASSERT3U(off, >, rl->r_off);
 	ASSERT3U(off, <, rl->r_off + rl->r_len);
 	ASSERT(rl->r_write_wanted == B_FALSE);
 	ASSERT(rl->r_read_wanted == B_FALSE);
 
 	/* create the rear proxy range lock */
 	rear = kmem_alloc(sizeof (rl_t), KM_SLEEP);
 	rear->r_off = off;
 	rear->r_len = rl->r_off + rl->r_len - off;
 	/* Taken before proxify, which zeroes r_cnt of a non-proxy rl. */
 	rear->r_cnt = rl->r_cnt;
 	rear->r_type = RL_READER;
 	rear->r_proxy = B_TRUE;
 	rear->r_write_wanted = B_FALSE;
 	rear->r_read_wanted = B_FALSE;
 
 	/*
 	 * front is either rl itself (already a proxy) or a new proxy with
 	 * the same offset, so shortening it relative to rl->r_off is valid
 	 * in both cases.
 	 */
 	front = zfs_range_proxify(tree, rl);
 	front->r_len = off - rl->r_off;
 
 	avl_insert_here(tree, rear, front, AVL_AFTER);
 	return (front);
 }
 
 /*
  * Create and add a new proxy range lock for the supplied range.
  */
 /*
  * Allocate a reader proxy lock for [off, off + len) with a reference count
  * of one and add it to "tree".  "len" must be non-zero (asserted).
  */
 static void
 zfs_range_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len)
 {
 	rl_t *px;
 
 	ASSERT(len);
 
 	px = kmem_alloc(sizeof (rl_t), KM_SLEEP);
 	px->r_type = RL_READER;
 	px->r_proxy = B_TRUE;
 	px->r_off = off;
 	px->r_len = len;
 	px->r_cnt = 1;
 	px->r_read_wanted = B_FALSE;
 	px->r_write_wanted = B_FALSE;
 
 	avl_add(tree, px);
 }
 
 /*
  * Add the reader lock "new" to "tree".
  *
  * If the range overlaps nothing, "new" itself is inserted at "where".
  * Otherwise new->r_cnt is set to 0 and the range is represented by proxy
  * locks: overlapped entries are turned into proxies (and split where the
  * new range starts or ends inside them) with their counts incremented, and
  * new proxies are created for any uncovered gaps.
  *
  * "prev" and "where" come from the caller's avl_find()/avl_nearest()
  * lookup for "new".
  */
 static void
 zfs_range_add_reader(avl_tree_t *tree, rl_t *new, rl_t *prev, avl_index_t where)
 {
 	rl_t *next;
 	uint64_t off = new->r_off;
 	uint64_t len = new->r_len;
 
 	/*
 	 * prev arrives either:
 	 * - pointing to an entry at the same offset
 	 * - pointing to the entry with the closest previous offset whose
 	 *   range may overlap with the new range
 	 * - null, if there were no ranges starting before the new one
 	 */
 	if (prev) {
 		if (prev->r_off + prev->r_len <= off) {
 			/* prev ends before the new range starts: no overlap */
 			prev = NULL;
 		} else if (prev->r_off != off) {
 			/*
 			 * convert to proxy if needed then
 			 * split this entry and bump ref count
 			 */
 			prev = zfs_range_split(tree, prev, off);
 			prev = AVL_NEXT(tree, prev); /* move to rear range */
 		}
 	}
 	ASSERT((prev == NULL) || (prev->r_off == off));
 
 	/* "next" is the first existing entry at or after "off". */
 	if (prev)
 		next = prev;
 	else
 		next = (rl_t *)avl_nearest(tree, where, AVL_AFTER);
 
 	if (next == NULL || off + len <= next->r_off) {
 		/* no overlaps, use the original new rl_t in the tree */
 		avl_insert(tree, new, where);
 		return;
 	}
 
 	if (off < next->r_off) {
 		/* Add a proxy for initial range before the overlap */
 		zfs_range_new_proxy(tree, off, next->r_off - off);
 	}
 
 	new->r_cnt = 0; /* will use proxies in tree */
 	/*
 	 * We now search forward through the ranges, until we go past the end
 	 * of the new range. For each entry we make it a proxy if it
 	 * isn't already, then bump its reference count. If there's any
 	 * gaps between the ranges then we create a new proxy range.
 	 */
 	for (prev = NULL; next; prev = next, next = AVL_NEXT(tree, next)) {
 		if (off + len <= next->r_off)
 			break;
 		if (prev && prev->r_off + prev->r_len < next->r_off) {
 			/* there's a gap */
 			ASSERT3U(next->r_off, >, prev->r_off + prev->r_len);
 			zfs_range_new_proxy(tree, prev->r_off + prev->r_len,
 			    next->r_off - (prev->r_off + prev->r_len));
 		}
 		if (off + len == next->r_off + next->r_len) {
 			/* exact overlap with end */
 			next = zfs_range_proxify(tree, next);
 			next->r_cnt++;
 			return;
 		}
 		if (off + len < next->r_off + next->r_len) {
 			/* new range ends in the middle of this block */
 			next = zfs_range_split(tree, next, off + len);
 			next->r_cnt++;
 			return;
 		}
 		ASSERT3U(off + len, >, next->r_off + next->r_len);
 		next = zfs_range_proxify(tree, next);
 		next->r_cnt++;
 	}
 
 	/*
 	 * Add the remaining end range.
 	 *
 	 * prev is non-NULL here: the overlap check above guarantees the
 	 * first loop iteration does not break, so the loop either returned
 	 * or advanced prev at least once.
 	 */
 	zfs_range_new_proxy(tree, prev->r_off + prev->r_len,
 	    (off + len) - (prev->r_off + prev->r_len));
 }
 
 /*
  * Check if a reader lock can be grabbed, or wait and recheck until available.
  */
 /*
  * Acquire the reader lock "new" on zp.  While any overlapping entry is a
  * writer, or has a writer waiting on it, sleep on that entry's reader cv
  * and start over.  Once the range is clear of writers, hand over to
  * zfs_range_add_reader().  It sleeps with cv_wait() on zp->z_range_lock,
  * so the caller must hold that mutex.
  */
 static void
 zfs_range_lock_reader(znode_t *zp, rl_t *new)
 {
 	avl_tree_t *tree = &zp->z_range_avl;
 	rl_t *prev, *next;
 	avl_index_t where;
 	uint64_t off = new->r_off;
 	uint64_t len = new->r_len;
 
 	/*
 	 * Look for any writer locks in the range.
 	 */
 retry:
 	/* Exact-offset match, or else the closest entry starting before. */
 	prev = avl_find(tree, new, &where);
 	if (prev == NULL)
 		prev = (rl_t *)avl_nearest(tree, where, AVL_BEFORE);
 
 	/*
 	 * Check the previous range for a writer lock overlap.
 	 */
 	if (prev && (off < prev->r_off + prev->r_len)) {
 		if ((prev->r_type == RL_WRITER) || (prev->r_write_wanted)) {
 			/* The reader cv is initialised on first use. */
 			if (!prev->r_read_wanted) {
 				cv_init(&prev->r_rd_cv, NULL, CV_DEFAULT, NULL);
 				prev->r_read_wanted = B_TRUE;
 			}
 			cv_wait(&prev->r_rd_cv, &zp->z_range_lock);
 			goto retry;
 		}
 		/* New range ends inside prev: nothing further to check. */
 		if (off + len < prev->r_off + prev->r_len)
 			goto got_lock;
 	}
 
 	/*
 	 * Search through the following ranges to see if any
 	 * write lock overlaps.
 	 */
 	if (prev)
 		next = AVL_NEXT(tree, prev);
 	else
 		next = (rl_t *)avl_nearest(tree, where, AVL_AFTER);
 	for (; next; next = AVL_NEXT(tree, next)) {
 		/* Entries from here on start at or after our end. */
 		if (off + len <= next->r_off)
 			goto got_lock;
 		if ((next->r_type == RL_WRITER) || (next->r_write_wanted)) {
 			if (!next->r_read_wanted) {
 				cv_init(&next->r_rd_cv, NULL, CV_DEFAULT, NULL);
 				next->r_read_wanted = B_TRUE;
 			}
 			cv_wait(&next->r_rd_cv, &zp->z_range_lock);
 			goto retry;
 		}
 		if (off + len <= next->r_off + next->r_len)
 			goto got_lock;
 	}
 
 got_lock:
 	/*
 	 * Add the read lock, which may involve splitting existing
 	 * locks and bumping ref counts (r_cnt).
 	 */
 	zfs_range_add_reader(tree, new, prev, where);
 }
 
 /*
  * Take a range lock on (off, len) of the given znode: shared for
  * RL_READER, exclusive for RL_WRITER or RL_APPEND.
  *
  * The returned rl_t is the handle the caller later hands to
  * zfs_range_unlock(), or to zfs_range_reduce() when the entire file
  * was locked as RL_WRITER.
  */
 rl_t *
 zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type)
 {
 	rl_t *rl;
 
 	ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND);
 
 	/* Clamp the length if off + len would wrap around. */
 	if (len + off < off)
 		len = UINT64_MAX - off;
 
 	rl = kmem_alloc(sizeof (rl_t), KM_SLEEP);
 	rl->r_zp = zp;
 	rl->r_off = off;
 	rl->r_len = len;
 	rl->r_type = type;
 	rl->r_cnt = 1;		/* assume it's going to be in the tree */
 	rl->r_proxy = B_FALSE;
 	rl->r_read_wanted = B_FALSE;
 	rl->r_write_wanted = B_FALSE;
 
 	mutex_enter(&zp->z_range_lock);
 	if (type != RL_READER) {
 		/* RL_WRITER or RL_APPEND */
 		zfs_range_lock_writer(zp, rl);
 	} else if (avl_numnodes(&zp->z_range_avl) == 0) {
 		/* The usual case: no locks at all, so just insert. */
 		avl_add(&zp->z_range_avl, rl);
 	} else {
 		zfs_range_lock_reader(zp, rl);
 	}
 	mutex_exit(&zp->z_range_lock);
 
 	return (rl);
 }
 
 /*
  * Unlock a reader lock
  *
  * zp     - znode whose range tree holds the lock (or its proxies).
  * remove - the reader lock being released; it is freed before returning.
  *
  * Called from zfs_range_unlock() with zp->z_range_lock held.  Waiters
  * recorded on any entry taken out of the tree are woken with
  * cv_broadcast() and the corresponding cv is destroyed.
  */
 static void
 zfs_range_unlock_reader(znode_t *zp, rl_t *remove)
 {
 	avl_tree_t *tree = &zp->z_range_avl;
-	rl_t *rl, *next;
+	rl_t *rl, *next = NULL;
 	uint64_t len;
 
 	/*
 	 * The common case is when the remove entry is in the tree
 	 * (cnt == 1) meaning there's been no other reader locks overlapping
 	 * with this one. Otherwise the remove entry will have been
 	 * removed from the tree and replaced by proxies (one or
 	 * more ranges mapping to the entire range).
 	 */
 	if (remove->r_cnt == 1) {
 		avl_remove(tree, remove);
 		if (remove->r_write_wanted) {
 			cv_broadcast(&remove->r_wr_cv);
 			cv_destroy(&remove->r_wr_cv);
 		}
 		if (remove->r_read_wanted) {
 			cv_broadcast(&remove->r_rd_cv);
 			cv_destroy(&remove->r_rd_cv);
 		}
 	} else {
 		ASSERT0(remove->r_cnt);
 		ASSERT0(remove->r_write_wanted);
 		ASSERT0(remove->r_read_wanted);
 		/*
 		 * Find start proxy representing this reader lock,
 		 * then decrement ref count on all proxies
 		 * that make up this range, freeing them as needed.
 		 */
 		rl = avl_find(tree, remove, NULL);
 		ASSERT(rl);
 		ASSERT(rl->r_cnt);
 		ASSERT(rl->r_type == RL_READER);
 		/*
 		 * len counts down the bytes of the range still to be
 		 * released.  next is only fetched while bytes remain, and
 		 * it is fetched before rl can be freed below; once len
 		 * reaches 0 the loop ends without touching rl again.
 		 */
 		for (len = remove->r_len; len != 0; rl = next) {
 			len -= rl->r_len;
 			if (len) {
 				next = AVL_NEXT(tree, rl);
 				ASSERT(next);
 				ASSERT(rl->r_off + rl->r_len == next->r_off);
 				ASSERT(next->r_cnt);
 				ASSERT(next->r_type == RL_READER);
 			}
 			rl->r_cnt--;
 			/* Last reference: drop the proxy and wake waiters. */
 			if (rl->r_cnt == 0) {
 				avl_remove(tree, rl);
 				if (rl->r_write_wanted) {
 					cv_broadcast(&rl->r_wr_cv);
 					cv_destroy(&rl->r_wr_cv);
 				}
 				if (rl->r_read_wanted) {
 					cv_broadcast(&rl->r_rd_cv);
 					cv_destroy(&rl->r_rd_cv);
 				}
 				kmem_free(rl, sizeof (rl_t));
 			}
 		}
 	}
 	kmem_free(remove, sizeof (rl_t));
 }
 
 /*
  * Release a range lock obtained from zfs_range_lock() and free its
  * rl_t.  Waiters recorded on the lock are woken.
  */
 void
 zfs_range_unlock(rl_t *rl)
 {
 	znode_t *zp = rl->r_zp;
 
 	ASSERT(rl->r_type == RL_WRITER || rl->r_type == RL_READER);
 	ASSERT(rl->r_cnt == 1 || rl->r_cnt == 0);
 	ASSERT(!rl->r_proxy);
 
 	mutex_enter(&zp->z_range_lock);
 
 	if (rl->r_type != RL_WRITER) {
 		/*
 		 * A reader lock may be shared; zfs_range_unlock_reader()
 		 * releases it and frees the rl_t.
 		 */
 		zfs_range_unlock_reader(zp, rl);
 		mutex_exit(&zp->z_range_lock);
 		return;
 	}
 
 	/* Writer locks are never shared or split: just take it out. */
 	avl_remove(&zp->z_range_avl, rl);
 	mutex_exit(&zp->z_range_lock);
 
 	if (rl->r_write_wanted) {
 		cv_broadcast(&rl->r_wr_cv);
 		cv_destroy(&rl->r_wr_cv);
 	}
 	if (rl->r_read_wanted) {
 		cv_broadcast(&rl->r_rd_cv);
 		cv_destroy(&rl->r_rd_cv);
 	}
 	kmem_free(rl, sizeof (rl_t));
 }
 
 /*
  * Reduce range locked as RL_WRITER from whole file to specified range.
  * Asserts the whole file is exclusively locked and so there's only one
  * entry in the tree.
  *
  * rl  - the whole-file writer lock (r_off == 0, r_len == UINT64_MAX).
  * off - new start offset of the locked range.
  * len - new length of the locked range.
  *
  * The new range is stored under z_range_lock; any threads recorded as
  * waiting on this lock are then woken with cv_broadcast().
  */
 void
 zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len)
 {
 	znode_t *zp = rl->r_zp;
 
 	/* Ensure there are no other locks */
 	ASSERT(avl_numnodes(&zp->z_range_avl) == 1);
 	ASSERT(rl->r_off == 0);
 	ASSERT(rl->r_type == RL_WRITER);
 	ASSERT(!rl->r_proxy);
 	ASSERT3U(rl->r_len, ==, UINT64_MAX);
 	ASSERT3U(rl->r_cnt, ==, 1);
 
 	/* Shrink the range in place; the entry stays in the tree. */
 	mutex_enter(&zp->z_range_lock);
 	rl->r_off = off;
 	rl->r_len = len;
 	mutex_exit(&zp->z_range_lock);
 	/* Wake waiters; the cvs are not destroyed here. */
 	if (rl->r_write_wanted)
 		cv_broadcast(&rl->r_wr_cv);
 	if (rl->r_read_wanted)
 		cv_broadcast(&rl->r_rd_cv);
 }
 
 /*
  * AVL comparator for range locks: entries are ordered purely by the
  * start offset (r_off) of their range.  Returns -1, 0 or 1.
  */
 int
 zfs_range_compare(const void *arg1, const void *arg2)
 {
 	const rl_t *lhs = arg1;
 	const rl_t *rhs = arg2;
 
 	if (lhs->r_off == rhs->r_off)
 		return (0);
 
 	return (lhs->r_off < rhs->r_off ? -1 : 1);
 }
Index: user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
===================================================================
--- user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c	(revision 247191)
+++ user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c	(revision 247192)
@@ -1,2489 +1,2496 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
  * All rights reserved.
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/sysmacros.h>
 #include <sys/kmem.h>
 #include <sys/acl.h>
 #include <sys/vnode.h>
 #include <sys/vfs.h>
 #include <sys/mntent.h>
 #include <sys/mount.h>
 #include <sys/cmn_err.h>
 #include <sys/zfs_znode.h>
 #include <sys/zfs_dir.h>
 #include <sys/zil.h>
 #include <sys/fs/zfs.h>
 #include <sys/dmu.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_deleg.h>
 #include <sys/spa.h>
 #include <sys/zap.h>
 #include <sys/sa.h>
 #include <sys/sa_impl.h>
 #include <sys/varargs.h>
 #include <sys/policy.h>
 #include <sys/atomic.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zfs_ctldir.h>
 #include <sys/zfs_fuid.h>
 #include <sys/sunddi.h>
 #include <sys/dnlc.h>
 #include <sys/dmu_objset.h>
 #include <sys/spa_boot.h>
 #include <sys/jail.h>
 #include "zfs_comutil.h"
 
 /* Mutex named "zfs_debug", initialized at boot through MTX_SYSINIT. */
 struct mtx zfs_debug_mtx;
 MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);
 
 /* Root of the vfs.zfs sysctl tree. */
 SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system");
 
 /* vfs.zfs.super_owner (read-write). */
 int zfs_super_owner;
 SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0,
     "File system owner can perform privileged operation on his file systems");
 
 /* vfs.zfs.debug: available both as a tunable and a read-write sysctl. */
 int zfs_debug_level;
 TUNABLE_INT("vfs.zfs.debug", &zfs_debug_level);
 SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RW, &zfs_debug_level, 0,
     "Debug level");
 
 /* vfs.zfs.version.*: read-only copies of the compiled-in version numbers. */
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions");
 static int zfs_version_acl = ZFS_ACL_VERSION;
 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0,
     "ZFS_ACL_VERSION");
 static int zfs_version_spa = SPA_VERSION;
 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0,
     "SPA_VERSION");
 static int zfs_version_zpl = ZPL_VERSION;
 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0,
     "ZPL_VERSION");
 
 /* Forward declarations for the VFS entry points and local helpers. */
 static int zfs_mount(vfs_t *vfsp);
 static int zfs_umount(vfs_t *vfsp, int fflag);
 static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp);
 static int zfs_statfs(vfs_t *vfsp, struct statfs *statp);
 static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp);
 static int zfs_sync(vfs_t *vfsp, int waitfor);
 static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
     struct ucred **credanonp, int *numsecflavors, int **secflavors);
 static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp);
 static void zfs_objset_close(zfsvfs_t *zfsvfs);
 static void zfs_freevfs(vfs_t *vfsp);
 
 /* VFS operations vector for ZFS. */
 static struct vfsops zfs_vfsops = {
 	.vfs_mount =		zfs_mount,
 	.vfs_unmount =		zfs_umount,
 	.vfs_root =		zfs_root,
 	.vfs_statfs =		zfs_statfs,
 	.vfs_vget =		zfs_vget,
 	.vfs_sync =		zfs_sync,
 	.vfs_checkexp =		zfs_checkexp,
 	.vfs_fhtovp =		zfs_fhtovp,
 };
 
 /* Register the vector as file system "zfs" with the given VFCF flags. */
 VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN);
 
 /*
  * We need to keep a count of active fs's.
  * This is necessary to prevent our module
  * from being unloaded after a umount -f
  */
 static uint32_t	zfs_active_fs_count = 0;
 
 /*
  * VFS sync entry point.  With a non-NULL vfsp only that file system is
  * synced (standard vnode sync followed by a ZIL commit); with a NULL
  * vfsp every pool is synced.  Returns 0, or the error from
  * vfs_stdsync().
  */
 /*ARGSUSED*/
 static int
 zfs_sync(vfs_t *vfsp, int waitfor)
 {
 	zfsvfs_t *zfsvfs;
 	dsl_pool_t *pool;
 	int err;
 
 	/*
 	 * Data integrity is job one.  We don't want a compromised kernel
 	 * writing to the storage pool, so we never sync during panic.
 	 */
 	if (panicstr)
 		return (0);
 
 	if (vfsp == NULL) {
 		/*
 		 * Sync all ZFS filesystems.  This is what happens when you
 		 * run sync(1M).  Unlike other filesystems, ZFS honors the
 		 * request by waiting for all pools to commit all dirty data.
 		 */
 		spa_sync_allpools();
 		return (0);
 	}
 
 	/*
 	 * Sync a specific filesystem.
 	 */
 	zfsvfs = vfsp->vfs_data;
 
 	err = vfs_stdsync(vfsp, waitfor);
 	if (err != 0)
 		return (err);
 
 	ZFS_ENTER(zfsvfs);
 	pool = dmu_objset_pool(zfsvfs->z_os);
 
 	/*
 	 * If the system is shutting down, then skip any
 	 * filesystems which may exist on a suspended pool.
 	 */
 	if (sys_shutdown && spa_suspended(pool->dp_spa)) {
 		ZFS_EXIT(zfsvfs);
 		return (0);
 	}
 
 	if (zfsvfs->z_log != NULL)
 		zil_commit(zfsvfs->z_log, 0);
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 #ifndef __FreeBSD__
 /*
  * Pick a (zfs_major, zfs_minor) device number that is not currently
  * mounted and store it in *dev.  Compiled only when __FreeBSD__ is not
  * defined.
  *
  * Returns 0 on success, or -1 when the minors of the current major are
  * used up and getudev() cannot supply a new major number.
  */
 static int
 zfs_create_unique_device(dev_t *dev)
 {
 	major_t new_major;
 
 	do {
 		ASSERT3U(zfs_minor, <=, MAXMIN32);
 		/* Remember where the scan began so a full wrap is detected. */
 		minor_t start = zfs_minor;
 		do {
 			mutex_enter(&zfs_dev_mtx);
 			if (zfs_minor >= MAXMIN32) {
 				/*
 				 * If we're still using the real major
 				 * keep out of /dev/zfs and /dev/zvol minor
 				 * number space.  If we're using a getudev()'ed
 				 * major number, we can use all of its minors.
 				 */
 				if (zfs_major == ddi_name_to_major(ZFS_DRIVER))
 					zfs_minor = ZFS_MIN_MINOR;
 				else
 					zfs_minor = 0;
 			} else {
 				zfs_minor++;
 			}
 			*dev = makedevice(zfs_major, zfs_minor);
 			mutex_exit(&zfs_dev_mtx);
 		} while (vfs_devismounted(*dev) && zfs_minor != start);
 		/* Back at the starting minor: the scan found nothing free. */
 		if (zfs_minor == start) {
 			/*
 			 * We are using all ~262,000 minor numbers for the
 			 * current major number.  Create a new major number.
 			 */
 			if ((new_major = getudev()) == (major_t)-1) {
 				cmn_err(CE_WARN,
 				    "zfs_mount: Can't get unique major "
 				    "device number.");
 				return (-1);
 			}
 			mutex_enter(&zfs_dev_mtx);
 			zfs_major = new_major;
 			zfs_minor = 0;
 
 			mutex_exit(&zfs_dev_mtx);
 		} else {
 			break;
 		}
 		/* CONSTANTCONDITION */
 	} while (1);
 
 	return (0);
 }
 #endif	/* !__FreeBSD__ */
 
 static void
 atime_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	if (newval == TRUE) {
 		zfsvfs->z_atime = TRUE;
 		zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME;
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
 	} else {
 		zfsvfs->z_atime = FALSE;
 		zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME;
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
 	}
 }
 
 static void
 xattr_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	if (newval == TRUE) {
 		/* XXX locking on vfs_flag? */
 #ifdef TODO
 		zfsvfs->z_vfs->vfs_flag |= VFS_XATTR;
 #endif
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0);
 	} else {
 		/* XXX locking on vfs_flag? */
 #ifdef TODO
 		zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR;
 #endif
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0);
 	}
 }
 
 static void
 blksz_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	if (newval < SPA_MINBLOCKSIZE ||
 	    newval > SPA_MAXBLOCKSIZE || !ISP2(newval))
 		newval = SPA_MAXBLOCKSIZE;
 
 	zfsvfs->z_max_blksz = newval;
 	zfsvfs->z_vfs->mnt_stat.f_iosize = newval;
 }
 
 static void
 readonly_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	if (newval) {
 		/* XXX locking on vfs_flag? */
 		zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
 	} else {
 		/* XXX locking on vfs_flag? */
 		zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
 	}
 }
 
 static void
 setuid_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	if (newval == FALSE) {
 		zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
 	} else {
 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
 	}
 }
 
 static void
 exec_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	if (newval == FALSE) {
 		zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
 	} else {
 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
 	}
 }
 
 /*
  * The nbmand mount option can be changed at mount time.
  * We can't allow it to be toggled on live file systems or incorrect
  * behavior may be seen from cifs clients
  *
  * This property isn't registered via dsl_prop_register(), but this callback
  * will be called when a file system is first mounted
  */
 static void
 nbmand_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 	if (newval == FALSE) {
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
 	} else {
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
 	}
 }
 
 static void
 snapdir_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	zfsvfs->z_show_ctldir = newval;
 }
 
 static void
 vscan_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	zfsvfs->z_vscan = newval;
 }
 
 static void
 acl_mode_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	zfsvfs->z_acl_mode = newval;
 }
 
 static void
 acl_inherit_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	zfsvfs->z_acl_inherit = newval;
 }
 
 /*
  * Register the dsl property callbacks for the file system mounted on
  * vfsp, then re-apply any mount options that were explicitly given so
  * that temporary mount-time overrides survive the registration.
  *
  * Returns 0 on success, EOPNOTSUPP when the objset is a snapshot, or
  * the error from dsl_prop_get_integer() / dsl_prop_register().  When a
  * registration fails, every callback is unregistered again before the
  * error is returned.
  *
  * NOTE(review): devices/do_devices are declared below but are not
  * referenced anywhere in this function.
  */
 static int
 zfs_register_callbacks(vfs_t *vfsp)
 {
 	struct dsl_dataset *ds = NULL;
 	objset_t *os = NULL;
 	zfsvfs_t *zfsvfs = NULL;
 	uint64_t nbmand;
-	int readonly, do_readonly = B_FALSE;
-	int setuid, do_setuid = B_FALSE;
-	int exec, do_exec = B_FALSE;
-	int xattr, do_xattr = B_FALSE;
-	int atime, do_atime = B_FALSE;
+	boolean_t readonly = B_FALSE;
+	boolean_t do_readonly = B_FALSE;
+	boolean_t setuid = B_FALSE;
+	boolean_t do_setuid = B_FALSE;
+	boolean_t exec = B_FALSE;
+	boolean_t do_exec = B_FALSE;
+	boolean_t devices = B_FALSE;
+	boolean_t do_devices = B_FALSE;
+	boolean_t xattr = B_FALSE;
+	boolean_t do_xattr = B_FALSE;
+	boolean_t atime = B_FALSE;
+	boolean_t do_atime = B_FALSE;
 	int error = 0;
 
 	ASSERT(vfsp);
 	zfsvfs = vfsp->vfs_data;
 	ASSERT(zfsvfs);
 	os = zfsvfs->z_os;
 
 	/*
 	 * This function can be called for a snapshot when we update snapshot's
 	 * mount point, which isn't really supported.
 	 */
 	if (dmu_objset_is_snapshot(os))
 		return (EOPNOTSUPP);
 
 	/*
 	 * The act of registering our callbacks will destroy any mount
 	 * options we may have.  In order to enable temporary overrides
 	 * of mount options, we stash away the current values and
 	 * restore them after we register the callbacks.
 	 */
 	/* Each do_* flag records that the matching option was given. */
 	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
 	    !spa_writeable(dmu_objset_spa(os))) {
 		readonly = B_TRUE;
 		do_readonly = B_TRUE;
 	} else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
 		readonly = B_FALSE;
 		do_readonly = B_TRUE;
 	}
 	if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
 		setuid = B_FALSE;
 		do_setuid = B_TRUE;
 	} else {
 		if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
 			setuid = B_FALSE;
 			do_setuid = B_TRUE;
 		} else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
 			setuid = B_TRUE;
 			do_setuid = B_TRUE;
 		}
 	}
 	if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
 		exec = B_FALSE;
 		do_exec = B_TRUE;
 	} else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
 		exec = B_TRUE;
 		do_exec = B_TRUE;
 	}
 	if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
 		xattr = B_FALSE;
 		do_xattr = B_TRUE;
 	} else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
 		xattr = B_TRUE;
 		do_xattr = B_TRUE;
 	}
 	if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
 		atime = B_FALSE;
 		do_atime = B_TRUE;
 	} else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
 		atime = B_TRUE;
 		do_atime = B_TRUE;
 	}
 
 	/*
 	 * nbmand is a special property.  It can only be changed at
 	 * mount time.
 	 *
 	 * This is weird, but it is documented to only be changeable
 	 * at mount time.
 	 */
 	if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
 		nbmand = B_FALSE;
 	} else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
 		nbmand = B_TRUE;
 	} else {
 		char osname[MAXNAMELEN];
 
 		/* No mount option given: read the stored property value. */
 		dmu_objset_name(os, osname);
 		/* The assignment inside the condition is intentional. */
 		if (error = dsl_prop_get_integer(osname, "nbmand", &nbmand,
 		    NULL)) {
 			return (error);
 		}
 	}
 
 	/*
 	 * Register property callbacks.
 	 *
 	 * It would probably be fine to just check for i/o error from
 	 * the first prop_register(), but I guess I like to go
 	 * overboard...
 	 */
 	/* After the first failure no further registrations are attempted. */
 	ds = dmu_objset_ds(os);
 	error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    "xattr", xattr_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    "recordsize", blksz_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    "readonly", readonly_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    "setuid", setuid_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    "exec", exec_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    "snapdir", snapdir_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    "aclmode", acl_mode_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    "aclinherit", acl_inherit_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    "vscan", vscan_changed_cb, zfsvfs);
 	if (error)
 		goto unregister;
 
 	/*
 	 * Invoke our callbacks to restore temporary mount options.
 	 */
 	if (do_readonly)
 		readonly_changed_cb(zfsvfs, readonly);
 	if (do_setuid)
 		setuid_changed_cb(zfsvfs, setuid);
 	if (do_exec)
 		exec_changed_cb(zfsvfs, exec);
 	if (do_xattr)
 		xattr_changed_cb(zfsvfs, xattr);
 	if (do_atime)
 		atime_changed_cb(zfsvfs, atime);
 
 	/* nbmand has no registered callback, so it is always applied here. */
 	nbmand_changed_cb(zfsvfs, nbmand);
 
 	return (0);
 
 unregister:
 	/*
 	 * We may attempt to unregister some callbacks that are not
 	 * registered, but this is OK; it will simply return ENOMSG,
 	 * which we will ignore.
 	 */
 	(void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs);
 	(void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs);
 	(void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs);
 	(void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs);
 	(void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs);
 	(void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs);
 	(void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs);
 	(void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs);
 	(void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb,
 	    zfsvfs);
 	(void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zfsvfs);
 	return (error);
 
 }
 
 /*
  * Extract the owning uid and gid from an object's bonus data.
  *
  * Returns ENOENT for bonus types other than DMU_OT_ZNODE / DMU_OT_SA,
  * EEXIST when data is NULL (telling the dmu to keep using the same
  * ids), and 0 once *userp and *groupp have been filled in.
  */
 static int
 zfs_space_delta_cb(dmu_object_type_t bonustype, void *data,
     uint64_t *userp, uint64_t *groupp)
 {
 	sa_hdr_phys_t hdr;
 	boolean_t byteswapped = B_FALSE;
 	uintptr_t attrs;
 	int hdrsize;
 
 	/*
 	 * Is it a valid type of object to track?
 	 */
 	if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA)
 		return (ENOENT);
 
 	/*
 	 * A NULL data pointer means the ids are assumed unchanged;
 	 * EEXIST lets the dmu know to use the same ids.
 	 */
 	if (data == NULL)
 		return (EEXIST);
 
 	if (bonustype == DMU_OT_ZNODE) {
 		znode_phys_t *znp = data;
 
 		*userp = znp->zp_uid;
 		*groupp = znp->zp_gid;
 		return (0);
 	}
 
 	ASSERT(bonustype == DMU_OT_SA);
 
 	/* Work on a private copy of the SA header. */
 	hdr = *(sa_hdr_phys_t *)data;
 
 	if (hdr.sa_magic == 0) {
 		/*
 		 * This should only happen for newly created
 		 * files that haven't had the znode data filled
 		 * in yet.
 		 */
 		*userp = 0;
 		*groupp = 0;
 		return (0);
 	}
 
 	if (hdr.sa_magic == BSWAP_32(SA_MAGIC)) {
 		/* Header is in the opposite byte order. */
 		hdr.sa_magic = SA_MAGIC;
 		hdr.sa_layout_info = BSWAP_16(hdr.sa_layout_info);
 		byteswapped = B_TRUE;
 	} else {
 		VERIFY3U(hdr.sa_magic, ==, SA_MAGIC);
 	}
 
 	hdrsize = sa_hdrsize(&hdr);
 	VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t));
 
 	/* The ids live at fixed offsets just past the header. */
 	attrs = (uintptr_t)data + hdrsize;
 	*userp = *((uint64_t *)(attrs + SA_UID_OFFSET));
 	*groupp = *((uint64_t *)(attrs + SA_GID_OFFSET));
 	if (byteswapped) {
 		*userp = BSWAP_64(*userp);
 		*groupp = BSWAP_64(*groupp);
 	}
 	return (0);
 }
 
 /*
  * Split a FUID given as a numeric string into its domain name (copied
  * into domainbuf, or an empty string when the index has no domain) and
  * its RID (stored in *ridp).
  */
 static void
 fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr,
     char *domainbuf, int buflen, uid_t *ridp)
 {
 	uint64_t fuid = strtonum(fuidstr, NULL);
 	const char *domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid));
 
 	if (domain == NULL)
 		domainbuf[0] = '\0';
 	else
 		(void) strlcpy(domainbuf, domain, buflen);
 
 	*ridp = FUID_RID(fuid);
 }
 
 /*
  * Map a userquota property type to the object number that stores it.
  * Returns 0 for a type that is not one of the four handled here.
  */
 static uint64_t
 zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type)
 {
 	if (type == ZFS_PROP_USERUSED)
 		return (DMU_USERUSED_OBJECT);
 	if (type == ZFS_PROP_GROUPUSED)
 		return (DMU_GROUPUSED_OBJECT);
 	if (type == ZFS_PROP_USERQUOTA)
 		return (zfsvfs->z_userquota_obj);
 	if (type == ZFS_PROP_GROUPQUOTA)
 		return (zfsvfs->z_groupquota_obj);
 
 	return (0);
 }
 
 /*
  * Copy as many zfs_useracct_t records of the given type as fit into
  * vbuf, starting from the serialized zap cursor in *cookiep.
  *
  * On return *bufsizep holds the number of bytes written and *cookiep
  * the cursor position to resume from.  Returns ENOTSUP when the objset
  * has no userspace accounting, otherwise 0 or a zap cursor error
  * (running off the end, ENOENT, is treated as success).
  */
 int
 zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
     uint64_t *cookiep, void *vbuf, uint64_t *bufsizep)
 {
 	zfs_useracct_t *slot = vbuf;
 	zap_attribute_t attr;
 	zap_cursor_t cursor;
 	uint64_t obj;
 	int error;
 
 	if (!dmu_objset_userspace_present(zfsvfs->z_os))
 		return (ENOTSUP);
 
 	obj = zfs_userquota_prop_to_obj(zfsvfs, type);
 	if (obj == 0) {
 		*bufsizep = 0;
 		return (0);
 	}
 
 	zap_cursor_init_serialized(&cursor, zfsvfs->z_os, obj, *cookiep);
 	while ((error = zap_cursor_retrieve(&cursor, &attr)) == 0) {
 		/* Stop once another record would not fit in the buffer. */
 		if ((uintptr_t)slot - (uintptr_t)vbuf +
 		    sizeof (zfs_useracct_t) > *bufsizep)
 			break;
 
 		fuidstr_to_sid(zfsvfs, attr.za_name,
 		    slot->zu_domain, sizeof (slot->zu_domain), &slot->zu_rid);
 		slot->zu_space = attr.za_first_integer;
 		slot++;
 
 		zap_cursor_advance(&cursor);
 	}
 	/* Reaching the end of the zap object is not an error. */
 	if (error == ENOENT)
 		error = 0;
 
 	ASSERT3U((uintptr_t)slot - (uintptr_t)vbuf, <=, *bufsizep);
 	*bufsizep = (uintptr_t)slot - (uintptr_t)vbuf;
 	*cookiep = zap_cursor_serialize(&cursor);
 	zap_cursor_fini(&cursor);
 	return (error);
 }
 
 /*
  * Format the FUID for (domain, rid) as a hexadecimal string in buf.
  * buf must be big enough (eg, 32 bytes).
  *
  * An empty or NULL domain uses domain index 0.  Returns ENOENT when
  * the domain lookup yields -1, otherwise 0.
  */
 static int
 id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid,
     char *buf, boolean_t addok)
 {
 	int domainid = 0;
 	uint64_t fuid;
 
 	if (domain != NULL && domain[0] != '\0') {
 		domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok);
 		if (domainid == -1)
 			return (ENOENT);
 	}
 
 	fuid = FUID_ENCODE(domainid, rid);
 	(void) sprintf(buf, "%llx", (longlong_t)fuid);
 
 	return (0);
 }
 
 /*
  * Look up a single userspace accounting/quota value for (domain, rid).
  *
  * *valp is zeroed first and stays 0 when the type has no backing
  * object or no entry exists.  Returns ENOTSUP when the objset has no
  * userspace accounting, otherwise 0 or a lookup error.
  */
 int
 zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
     const char *domain, uint64_t rid, uint64_t *valp)
 {
 	char key[32];
 	uint64_t obj;
 	int err;
 
 	*valp = 0;
 
 	if (!dmu_objset_userspace_present(zfsvfs->z_os))
 		return (ENOTSUP);
 
 	obj = zfs_userquota_prop_to_obj(zfsvfs, type);
 	if (obj == 0)
 		return (0);
 
 	err = id_to_fuidstr(zfsvfs, domain, rid, key, B_FALSE);
 	if (err)
 		return (err);
 
 	/* A missing entry is reported as success. */
 	err = zap_lookup(zfsvfs->z_os, obj, key, 8, 1, valp);
 	return (err == ENOENT ? 0 : err);
 }
 
 /*
  * Set (or, when quota is 0, remove) the user or group quota for the
  * identity given by (domain, rid).
  *
  * type must be ZFS_PROP_USERQUOTA or ZFS_PROP_GROUPQUOTA, otherwise
  * EINVAL is returned.  Returns ENOTSUP when z_version is older than
  * ZPL_VERSION_USERSPACE, or an error from id_to_fuidstr(),
  * dmu_tx_assign() or the zap update.
  */
 int
 zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
     const char *domain, uint64_t rid, uint64_t quota)
 {
 	char buf[32];
 	int err;
 	dmu_tx_t *tx;
 	uint64_t *objp;
 	boolean_t fuid_dirtied;
 
 	if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA)
 		return (EINVAL);
 
 	if (zfsvfs->z_version < ZPL_VERSION_USERSPACE)
 		return (ENOTSUP);
 
 	/* objp points at the zfsvfs field holding the quota object number. */
 	objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj :
 	    &zfsvfs->z_groupquota_obj;
 
 	/* addok is B_TRUE here, unlike the lookup in zfs_userspace_one(). */
 	err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE);
 	if (err)
 		return (err);
 	fuid_dirtied = zfsvfs->z_fuid_dirty;
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL);
 	/* No quota object yet: also hold the master node entry for it. */
 	if (*objp == 0) {
 		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
 		    zfs_userquota_prop_prefixes[type]);
 	}
 	if (fuid_dirtied)
 		zfs_fuid_txhold(zfsvfs, tx);
 	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err) {
 		dmu_tx_abort(tx);
 		return (err);
 	}
 
 	/* Create the quota object on first use, under z_lock. */
 	mutex_enter(&zfsvfs->z_lock);
 	if (*objp == 0) {
 		*objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA,
 		    DMU_OT_NONE, 0, tx);
 		VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
 		    zfs_userquota_prop_prefixes[type], 8, 1, objp, tx));
 	}
 	mutex_exit(&zfsvfs->z_lock);
 
 	if (quota == 0) {
 		/* Removing an entry that does not exist is not an error. */
 		err = zap_remove(zfsvfs->z_os, *objp, buf, tx);
 		if (err == ENOENT)
 			err = 0;
 	} else {
 		err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, &quota, tx);
 	}
 	ASSERT(err == 0);
 	if (fuid_dirtied)
 		zfs_fuid_sync(zfsvfs, tx);
 	dmu_tx_commit(tx);
 	return (err);
 }
 
 /*
  * Report whether the user (or group, when isgroup is set) identified
  * by fuid has used at least as much space as its quota allows.
  *
  * Returns B_FALSE when there is no quota object, during replay, or
  * when either the quota or the usage lookup fails.
  */
 boolean_t
 zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid)
 {
 	uint64_t quotaobj, usedobj;
 	uint64_t quota, used;
 	char key[32];
 
 	if (isgroup) {
 		usedobj = DMU_GROUPUSED_OBJECT;
 		quotaobj = zfsvfs->z_groupquota_obj;
 	} else {
 		usedobj = DMU_USERUSED_OBJECT;
 		quotaobj = zfsvfs->z_userquota_obj;
 	}
 
 	if (quotaobj == 0 || zfsvfs->z_replay)
 		return (B_FALSE);
 
 	(void) sprintf(key, "%llx", (longlong_t)fuid);
 
 	if (zap_lookup(zfsvfs->z_os, quotaobj, key, 8, 1, &quota) != 0)
 		return (B_FALSE);
 	if (zap_lookup(zfsvfs->z_os, usedobj, key, 8, 1, &used) != 0)
 		return (B_FALSE);
 
 	return (used >= quota);
 }
 
 /*
  * Report whether the owner of zp (its group when isgroup is set, else
  * its user) is over quota.  Returns B_FALSE without a lookup when no
  * quota object exists or during replay.
  */
 boolean_t
 zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup)
 {
 	uint64_t quotaobj;
 
 	if (isgroup)
 		quotaobj = zfsvfs->z_groupquota_obj;
 	else
 		quotaobj = zfsvfs->z_userquota_obj;
 
 	if (quotaobj == 0 || zfsvfs->z_replay)
 		return (B_FALSE);
 
 	return (zfs_fuid_overquota(zfsvfs, isgroup,
 	    isgroup ? zp->z_gid : zp->z_uid));
 }
 
 /*
  * Allocate a zfsvfs_t for the objset named osname and fill it in from
  * the objset's ZPL properties and master node entries.
  *
  * On success 0 is returned and *zfvp points to the new zfsvfs, which
  * owns the objset.  On any failure the objset is disowned (if it was
  * owned), the zfsvfs is freed, *zfvp is set to NULL and the error is
  * returned.
  */
 int
 zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
 {
 	objset_t *os;
 	zfsvfs_t *zfsvfs;
 	uint64_t zval;
 	int i, error;
 	uint64_t sa_obj;
 
 	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
 
 	/*
 	 * We claim to always be readonly so we can open snapshots;
 	 * other ZPL code will prevent us from writing to snapshots.
 	 */
 	error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os);
 	if (error) {
 		/* Nothing is owned yet, so only the allocation is undone. */
 		*zfvp = NULL;
 		kmem_free(zfsvfs, sizeof (zfsvfs_t));
 		return (error);
 	}
 
 	/*
 	 * Initialize the zfs-specific filesystem structure.
 	 * Should probably make this a kmem cache, shuffle fields,
 	 * and just bzero up to z_hold_mtx[].
 	 */
 	zfsvfs->z_vfs = NULL;
 	zfsvfs->z_parent = zfsvfs;
 	zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
 	zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
 	zfsvfs->z_os = os;
 
 	error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
 	if (error) {
 		goto out;
 	} else if (zfsvfs->z_version >
 	    zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
 		(void) printf("Can't mount a version %lld file system "
 		    "on a version %lld pool\n. Pool must be upgraded to mount "
 		    "this file system.", (u_longlong_t)zfsvfs->z_version,
 		    (u_longlong_t)spa_version(dmu_objset_spa(os)));
 		error = ENOTSUP;
 		goto out;
 	}
 	if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0)
 		goto out;
 	zfsvfs->z_norm = (int)zval;
 
 	if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0)
 		goto out;
 	zfsvfs->z_utf8 = (zval != 0);
 
 	if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0)
 		goto out;
 	zfsvfs->z_case = (uint_t)zval;
 
 	/*
 	 * Fold case on file systems that are always or sometimes case
 	 * insensitive.
 	 */
 	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
 	    zfsvfs->z_case == ZFS_CASE_MIXED)
 		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
 
 	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
 	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
 
 	if (zfsvfs->z_use_sa) {
 		/* should either have both of these objects or none */
 		error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
 		    &sa_obj);
 		/*
 		 * Use the common error path: returning directly here
 		 * would leak zfsvfs and leave the objset owned.
 		 */
 		if (error)
 			goto out;
 	} else {
 		/*
 		 * Pre SA versions file systems should never touch
 		 * either the attribute registration or layout objects.
 		 */
 		sa_obj = 0;
 	}
 
 	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
 	    &zfsvfs->z_attr_table);
 	if (error)
 		goto out;
 
 	if (zfsvfs->z_version >= ZPL_VERSION_SA)
 		sa_register_update_callback(os, zfs_sa_upgrade);
 
 	/* The root object and the unlinked set are mandatory. */
 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
 	    &zfsvfs->z_root);
 	if (error)
 		goto out;
 	ASSERT(zfsvfs->z_root != 0);
 
 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
 	    &zfsvfs->z_unlinkedobj);
 	if (error)
 		goto out;
 
 	/* The remaining master node entries are optional (ENOENT is OK). */
 	error = zap_lookup(os, MASTER_NODE_OBJ,
 	    zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
 	    8, 1, &zfsvfs->z_userquota_obj);
 	if (error && error != ENOENT)
 		goto out;
 
 	error = zap_lookup(os, MASTER_NODE_OBJ,
 	    zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
 	    8, 1, &zfsvfs->z_groupquota_obj);
 	if (error && error != ENOENT)
 		goto out;
 
 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
 	    &zfsvfs->z_fuid_obj);
 	if (error && error != ENOENT)
 		goto out;
 
 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
 	    &zfsvfs->z_shares_dir);
 	if (error && error != ENOENT)
 		goto out;
 
 	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
 	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
 	    offsetof(znode_t, z_link_node));
 	rrw_init(&zfsvfs->z_teardown_lock);
 	rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
 	rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
 		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
 
 	*zfvp = zfsvfs;
 	return (0);
 
 out:
 	/* Common failure path: give up the objset and free the zfsvfs. */
 	dmu_objset_disown(os, zfsvfs);
 	*zfvp = NULL;
 	kmem_free(zfsvfs, sizeof (zfsvfs_t));
 	return (error);
 }
 
 /*
  * Bring a zfsvfs into service: register the property callbacks, point the
  * objset's user pointer at this zfsvfs and open the intent log.
  *
  * When 'mounting' is set, the unlinked set is drained (unless the mount is
  * read-only) and, if the pool is writeable, the intent log is either
  * replayed or destroyed depending on zil_replay_disable.
  *
  * Returns the error from zfs_register_callbacks() if it fails, otherwise 0.
  */
 static int
 zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
 {
 	int error;
 
 	error = zfs_register_callbacks(zfsvfs->z_vfs);
 	if (error)
 		return (error);
 
 	/*
 	 * Set the objset user_ptr to track its zfsvfs.
 	 */
 	mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
 	dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
 	mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
 
 	zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
 
 	/*
 	 * If we are not mounting (ie: online recv), then we don't
 	 * have to worry about replaying the log as we blocked all
 	 * operations out since we closed the ZIL.
 	 */
 	if (mounting) {
 		boolean_t readonly;
 
 		/*
 		 * During replay we remove the read only flag to
 		 * allow replays to succeed.  'readonly' keeps the raw
 		 * VFS_RDONLY bit (not a 0/1 value) so that it can simply
 		 * be OR-ed back into vfs_flag once replay is finished.
 		 */
 		readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
 		if (readonly != 0)
 			zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
 		else
 			zfs_unlinked_drain(zfsvfs);
 
 		/*
 		 * Parse and replay the intent log.
 		 *
 		 * Because of ziltest, this must be done after
 		 * zfs_unlinked_drain().  (Further note: ziltest
 		 * doesn't use readonly mounts, where
 		 * zfs_unlinked_drain() isn't called.)  This is because
 		 * ziltest causes spa_sync() to think it's committed,
 		 * but actually it is not, so the intent log contains
 		 * many txg's worth of changes.
 		 *
 		 * In particular, if object N is in the unlinked set in
 		 * the last txg to actually sync, then it could be
 		 * actually freed in a later txg and then reallocated
 		 * in a yet later txg.  This would write a "create
 		 * object N" record to the intent log.  Normally, this
 		 * would be fine because the spa_sync() would have
 		 * written out the fact that object N is free, before
 		 * we could write the "create object N" intent log
 		 * record.
 		 *
 		 * But when we are in ziltest mode, we advance the "open
 		 * txg" without actually spa_sync()-ing the changes to
 		 * disk.  So we would see that object N is still
 		 * allocated and in the unlinked set, and there is an
 		 * intent log record saying to allocate it.
 		 */
 		if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
 			if (zil_replay_disable) {
 				/* Replay is disabled: destroy the log instead. */
 				zil_destroy(zfsvfs->z_log, B_FALSE);
 			} else {
 				/* z_replay is set only for the duration of the replay. */
 				zfsvfs->z_replay = B_TRUE;
 				zil_replay(zfsvfs->z_os, zfsvfs,
 				    zfs_replay_vector);
 				zfsvfs->z_replay = B_FALSE;
 			}
 		}
 		zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
 	}
 
 	return (0);
 }
 
 extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */
 
 /*
  * Destroy the locks and the znode list embedded in a zfsvfs_t, then free
  * the structure itself.
  */
 void
 zfsvfs_free(zfsvfs_t *zfsvfs)
 {
 	int slot = 0;
 
 	/*
 	 * Barrier against zfs_znode_move(): the filesystem must not go
 	 * away until we can safely ensure that it is not unmounted.
 	 * Acquiring and immediately dropping zfsvfs_lock is what forms
 	 * the barrier; the filesystem is considered valid before this
 	 * point and invalid after it.
 	 */
 	rw_enter(&zfsvfs_lock, RW_READER);
 	rw_exit(&zfsvfs_lock);
 
 	zfs_fuid_destroy(zfsvfs);
 
 	/* Tear down the synchronization objects and the znode list. */
 	mutex_destroy(&zfsvfs->z_znodes_lock);
 	mutex_destroy(&zfsvfs->z_lock);
 	list_destroy(&zfsvfs->z_all_znodes);
 	rrw_destroy(&zfsvfs->z_teardown_lock);
 	rw_destroy(&zfsvfs->z_teardown_inactive_lock);
 	rw_destroy(&zfsvfs->z_fuid_lock);
 
 	/* One hold mutex per slot of z_hold_mtx[]. */
 	while (slot != ZFS_OBJ_MTX_SZ) {
 		mutex_destroy(&zfsvfs->z_hold_mtx[slot]);
 		slot++;
 	}
 
 	kmem_free(zfsvfs, sizeof (zfsvfs_t));
 }
 
 /*
  * Recompute z_use_fuids and z_use_sa from the filesystem version and
  * objset.  If a vfs is attached, the FUID-dependent VFS features are set
  * or cleared to match z_use_fuids.
  */
 static void
 zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
 {
 	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
 
 	if (zfsvfs->z_vfs != NULL && zfsvfs->z_use_fuids) {
 		/* FUIDs in use: advertise the dependent features. */
 		vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
 		vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
 		vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
 		vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
 		vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
 		vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
 	} else if (zfsvfs->z_vfs != NULL) {
 		/* No FUIDs: withdraw the same set of features. */
 		vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
 		vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
 		vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
 		vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
 		vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
 		vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
 	}
 
 	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
 }
 
 /*
  * Mount the dataset 'osname' on 'vfsp'.
  *
  * Creates the zfsvfs, fills in the mount's block sizes, flags, fsid and
  * feature bits, then either configures a snapshot directly or runs
  * zfsvfs_setup() for a regular filesystem.  On success an extra reference
  * on the root vnode has been taken and zfs_active_fs_count incremented;
  * on failure the objset is disowned and the zfsvfs freed.
  *
  * Returns 0 on success or an errno value.
  */
 static int
 zfs_domount(vfs_t *vfsp, char *osname)
 {
 	uint64_t recordsize, fsid_guid;
 	int error = 0;
 	zfsvfs_t *zfsvfs;
 	vnode_t *vp;
 
 	ASSERT(vfsp);
 	ASSERT(osname);
 
 	error = zfsvfs_create(osname, &zfsvfs);
 	if (error)
 		return (error);
 	zfsvfs->z_vfs = vfsp;
 
 	error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
 	    NULL);
 	if (error)
 		goto out;
 	zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE;
 	zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize;
 
 	vfsp->vfs_data = zfsvfs;
 	vfsp->mnt_flag |= MNT_LOCAL;
 	vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
 	vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES;
 	vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED;
 
 	/*
 	 * The fsid is 64 bits, composed of an 8-bit fs type, which
 	 * separates our fsid from any other filesystem types, and a
 	 * 56-bit objset unique ID.  The objset unique ID is unique to
 	 * all objsets open on this system, provided by unique_create().
 	 * The 8-bit fs type must be put in the low bits of fsid[1]
 	 * because that's where other Solaris filesystems put it.
 	 */
 	fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
 	ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
 	vfsp->vfs_fsid.val[0] = fsid_guid;
 	/* Parenthesized to make the existing '&'-before-'|' grouping explicit. */
 	vfsp->vfs_fsid.val[1] = ((fsid_guid >> 32) << 8) |
 	    (vfsp->mnt_vfc->vfc_typenum & 0xFF);
 
 	/*
 	 * Set features for file system.
 	 */
 	zfs_set_fuid_feature(zfsvfs);
 	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
 		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
 		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
 		vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE);
 	} else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
 		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
 		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
 	}
 	vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED);
 
 	if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
 		uint64_t pval;
 
 		/*
 		 * Snapshots get no registered callbacks; apply the
 		 * property values by calling the handlers directly.
 		 */
 		atime_changed_cb(zfsvfs, B_FALSE);
 		readonly_changed_cb(zfsvfs, B_TRUE);
 		error = dsl_prop_get_integer(osname, "xattr", &pval, NULL);
 		if (error)
 			goto out;
 		xattr_changed_cb(zfsvfs, pval);
 		zfsvfs->z_issnap = B_TRUE;
 		zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;
 
 		mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
 		dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
 		mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
 	} else {
 		/*
 		 * If setup fails, bail out before taking the root vnode
 		 * reference or calling zfsctl_create(); otherwise both
 		 * would be left pointing at a zfsvfs that the error path
 		 * below frees.
 		 */
 		error = zfsvfs_setup(zfsvfs, B_TRUE);
 		if (error)
 			goto out;
 	}
 
 	vfs_mountedfrom(vfsp, osname);
 	/* Grab extra reference. */
 	VERIFY(VFS_ROOT(vfsp, LK_EXCLUSIVE, &vp) == 0);
 	VOP_UNLOCK(vp, 0);
 
 	if (!zfsvfs->z_issnap)
 		zfsctl_create(zfsvfs);
 out:
 	if (error) {
 		dmu_objset_disown(zfsvfs->z_os, zfsvfs);
 		zfsvfs_free(zfsvfs);
 	} else {
 		atomic_add_32(&zfs_active_fs_count, 1);
 	}
 
 	return (error);
 }
 
 /*
  * Unregister every dataset property callback for this zfsvfs.  Nothing
  * is unregistered when the objset is a snapshot.  Each unregistration is
  * wrapped in VERIFY(), so a failure is fatal.
  */
 void
 zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
 {
 	objset_t *os = zfsvfs->z_os;
 	struct dsl_dataset *ds;
 
 	if (dmu_objset_is_snapshot(os))
 		return;
 
 	ds = dmu_objset_ds(os);
 
 	VERIFY(dsl_prop_unregister(ds, "atime",
 	    atime_changed_cb, zfsvfs) == 0);
 	VERIFY(dsl_prop_unregister(ds, "xattr",
 	    xattr_changed_cb, zfsvfs) == 0);
 	VERIFY(dsl_prop_unregister(ds, "recordsize",
 	    blksz_changed_cb, zfsvfs) == 0);
 	VERIFY(dsl_prop_unregister(ds, "readonly",
 	    readonly_changed_cb, zfsvfs) == 0);
 	VERIFY(dsl_prop_unregister(ds, "setuid",
 	    setuid_changed_cb, zfsvfs) == 0);
 	VERIFY(dsl_prop_unregister(ds, "exec",
 	    exec_changed_cb, zfsvfs) == 0);
 	VERIFY(dsl_prop_unregister(ds, "snapdir",
 	    snapdir_changed_cb, zfsvfs) == 0);
 	VERIFY(dsl_prop_unregister(ds, "aclmode",
 	    acl_mode_changed_cb, zfsvfs) == 0);
 	VERIFY(dsl_prop_unregister(ds, "aclinherit",
 	    acl_inherit_changed_cb, zfsvfs) == 0);
 	VERIFY(dsl_prop_unregister(ds, "vscan",
 	    vscan_changed_cb, zfsvfs) == 0);
 }
 
 #ifdef SECLABEL
 /*
  * Convert a decimal digit string to a uint64_t integer.
  *
  * On success returns 0 and stores the value in *objnum; an empty string
  * converts to 0.  Returns EINVAL, leaving *objnum untouched, if the
  * string contains a non-digit character or if the value does not fit in
  * 64 bits (rather than silently wrapping around).
  */
 static int
 str_to_uint64(char *str, uint64_t *objnum)
 {
 	const uint64_t limit = ~(uint64_t)0;
 	uint64_t num = 0;
 
 	while (*str) {
 		uint64_t digit;
 
 		if (*str < '0' || *str > '9')
 			return (EINVAL);
 		digit = (uint64_t)(*str++ - '0');
 
 		/* num * 10 + digit would exceed the 64-bit range. */
 		if (num > (limit - digit) / 10)
 			return (EINVAL);
 
 		num = num * 10 + digit;
 	}
 
 	*objnum = num;
 	return (0);
 }
 
 /*
  * The boot loader passes the boot path in the form
  * "rootpool-name/root-filesystem-object-number".  Translate that string
  * into a dataset name, "rootpool-name/root-filesystem-name", in 'outpath'.
  *
  * Returns EINVAL for an empty path or one starting with '/', the result
  * of dsl_dsobj_to_dsname() when an object number had to be resolved, and
  * 0 otherwise.
  */
 static int
 zfs_parse_bootfs(char *bpath, char *outpath)
 {
 	char *sep;
 	uint64_t dsobj;
 	int rc;
 
 	if (bpath[0] == '\0' || bpath[0] == '/')
 		return (EINVAL);
 
 	/* Until proven otherwise the result is the path exactly as given. */
 	(void) strcpy(outpath, bpath);
 
 	/* Without a '/', the path is only a pool name: nothing to resolve. */
 	sep = strchr(bpath, '/');
 	if (sep == NULL)
 		return (0);
 
 	/* A non-numeric tail means the path already names a dataset. */
 	if (str_to_uint64(sep + 1, &dsobj) != 0)
 		return (0);
 
 	/*
 	 * Temporarily terminate bpath at the separator so that it holds
 	 * only the pool name during the lookup, then put the '/' back.
 	 */
 	*sep = '\0';
 	rc = dsl_dsobj_to_dsname(bpath, dsobj, outpath);
 	*sep = '/';
 
 	return (rc);
 }
 
 /*
  * zfs_check_global_label:
  *	Decide whether the hex label string 'hexsl' is acceptable for a
  *	dataset that is being mounted into the global zone proper.
  *
  *	The default label and admin_high are always accepted.  admin_low
  *	is accepted only if the dataset's readonly property can be read
  *	and is non-zero.  Everything else yields EACCES.
  */
 int
 zfs_check_global_label(const char *dsname, const char *hexsl)
 {
 	uint64_t rdonly;
 
 	if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0 ||
 	    strcasecmp(hexsl, ADMIN_HIGH) == 0)
 		return (0);
 
 	if (strcasecmp(hexsl, ADMIN_LOW) != 0)
 		return (EACCES);
 
 	/* admin_low: the dataset must be readonly */
 	if (dsl_prop_get_integer(dsname,
 	    zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL) != 0)
 		return (EACCES);
 
 	if (rdonly != 0)
 		return (0);
 	return (EACCES);
 }
 
 /*
  * zfs_mount_label_policy:
  *	Determine whether the mount is allowed according to the MAC check,
  *	by comparing (where appropriate) the label of the dataset against
  *	the label of the zone being mounted into.  If the dataset has
  *	no label, create one.
  *
  *	As a side effect, the mount is switched to read-only (MNTOPT_RO)
  *	when the zone label dominates, but does not equal, the dataset
  *	label.
  *
  *	Returns:
  *		 0 :	access allowed
  *		>0 :	error code, such as EACCES
  */
 static int
 zfs_mount_label_policy(vfs_t *vfsp, char *osname)
 {
 	int		error, retv;
 	zone_t		*mntzone = NULL;
 	ts_label_t	*mnt_tsl;
 	bslabel_t	*mnt_sl;
 	bslabel_t	ds_sl;
 	char		ds_hexsl[MAXNAMELEN];
 
 	retv = EACCES;				/* assume the worst */
 
 	/*
 	 * Start by getting the dataset label if it exists.
 	 */
 	error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
 	    1, sizeof (ds_hexsl), &ds_hexsl, NULL);
 	if (error)
 		return (EACCES);
 
 	/*
 	 * If labeling is NOT enabled, then disallow the mount of datasets
 	 * which have a non-default label already.  No other label checks
 	 * are needed.
 	 */
 	if (!is_system_labeled()) {
 		if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
 			return (0);
 		return (EACCES);
 	}
 
 	/*
 	 * Get the label of the mountpoint.  If mounting into the global
 	 * zone (i.e. mountpoint is not within an active zone and the
 	 * zoned property is off), the label must be default or
 	 * admin_low/admin_high only; no other checks are needed.
 	 */
 	mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
 	if (mntzone->zone_id == GLOBAL_ZONEID) {
 		uint64_t zoned;
 
 		/* Every path out of this branch returns, so drop the hold now. */
 		zone_rele(mntzone);
 
 		if (dsl_prop_get_integer(osname,
 		    zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL))
 			return (EACCES);
 		if (!zoned)
 			return (zfs_check_global_label(osname, ds_hexsl));
 		else
 			/*
 			 * This is the case of a zone dataset being mounted
 			 * initially, before the zone has been fully created;
 			 * allow this mount into global zone.
 			 */
 			return (0);
 	}
 
 	/*
 	 * Non-global zone: hold the zone's label for the comparison below;
 	 * it is released, together with the zone, just before returning.
 	 */
 	mnt_tsl = mntzone->zone_slabel;
 	ASSERT(mnt_tsl != NULL);
 	label_hold(mnt_tsl);
 	mnt_sl = label2bslabel(mnt_tsl);
 
 	if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) {
 		/*
 		 * The dataset doesn't have a real label, so fabricate one.
 		 */
 		char *str = NULL;
 
 		/* Access is granted only if the new label could be stored. */
 		if (l_to_str_internal(mnt_sl, &str) == 0 &&
 		    dsl_prop_set(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
 		    ZPROP_SRC_LOCAL, 1, strlen(str) + 1, str) == 0)
 			retv = 0;
 		if (str != NULL)
 			kmem_free(str, strlen(str) + 1);
 	} else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) {
 		/*
 		 * Now compare labels to complete the MAC check.  If the
 		 * labels are equal then allow access.  If the mountpoint
 		 * label dominates the dataset label, allow readonly access.
 		 * Otherwise, access is denied.
 		 */
 		if (blequal(mnt_sl, &ds_sl))
 			retv = 0;
 		else if (bldominates(mnt_sl, &ds_sl)) {
 			vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
 			retv = 0;
 		}
 	}
 	/* An unparsable dataset label falls through with retv == EACCES. */
 
 	label_rele(mnt_tsl);
 	zone_rele(mntzone);
 	return (retv);
 }
 #endif	/* SECLABEL */
 
 #ifdef OPENSOLARIS_MOUNTROOT
 /*
  * Root filesystem entry point (compiled only with OPENSOLARIS_MOUNTROOT).
  * The action depends on 'why':
  *
  *	ROOT_INIT	import the root pool, translate the "zfs-bootfs"
  *			boot property into a dataset name, mount it on
  *			'vfsp' and record its root vnode in rootvp.  A
  *			second ROOT_INIT call returns EBUSY.
  *	ROOT_REMOUNT	call readonly_changed_cb() with B_FALSE, flag the
  *			vfs as a remount and re-register the property
  *			callbacks.
  *	ROOT_UNMOUNT	unregister the property callbacks and sync.
  *
  * Any other value returns ENOTSUP.
  */
 static int
 zfs_mountroot(vfs_t *vfsp, enum whymountroot why)
 {
 	int error = 0;
 	static int zfsrootdone = 0;
 	zfsvfs_t *zfsvfs = NULL;
 	znode_t *zp = NULL;
 	vnode_t *vp = NULL;
 	char *zfs_bootfs;
 	char *zfs_devid;
 
 	ASSERT(vfsp);
 
 	/*
 	 * The filesystem that we mount as root is defined in the
 	 * boot property "zfs-bootfs" with a format of
 	 * "poolname/root-dataset-objnum".
 	 */
 	if (why == ROOT_INIT) {
 		if (zfsrootdone++)
 			return (EBUSY);
 		/*
 		 * the process of doing a spa_load will require the
 		 * clock to be set before we could (for example) do
 		 * something better by looking at the timestamp on
 		 * an uberblock, so just set it to -1.
 		 */
 		clkset(-1);
 
 		if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) {
 			cmn_err(CE_NOTE, "spa_get_bootfs: can not get "
 			    "bootfs name");
 			return (EINVAL);
 		}
 		/* zfs_devid may be NULL; it is freed only when it was returned. */
 		zfs_devid = spa_get_bootprop("diskdevid");
 		error = spa_import_rootpool(rootfs.bo_name, zfs_devid);
 		if (zfs_devid)
 			spa_free_bootprop(zfs_devid);
 		if (error) {
 			spa_free_bootprop(zfs_bootfs);
 			cmn_err(CE_NOTE, "spa_import_rootpool: error %d",
 			    error);
 			return (error);
 		}
 		/* Resolve the boot path into rootfs.bo_name. */
 		if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) {
 			spa_free_bootprop(zfs_bootfs);
 			cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d",
 			    error);
 			return (error);
 		}
 
 		spa_free_bootprop(zfs_bootfs);
 
 		/* From here on, every exit goes through 'out' to unlock vfsp. */
 		if (error = vfs_lock(vfsp))
 			return (error);
 
 		if (error = zfs_domount(vfsp, rootfs.bo_name)) {
 			cmn_err(CE_NOTE, "zfs_domount: error %d", error);
 			goto out;
 		}
 
 		zfsvfs = (zfsvfs_t *)vfsp->vfs_data;
 		ASSERT(zfsvfs);
 		if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) {
 			cmn_err(CE_NOTE, "zfs_zget: error %d", error);
 			goto out;
 		}
 
 		/* Mark the vnode as the root and publish it as rootvp. */
 		vp = ZTOV(zp);
 		mutex_enter(&vp->v_lock);
 		vp->v_flag |= VROOT;
 		mutex_exit(&vp->v_lock);
 		rootvp = vp;
 
 		/*
 		 * Leave rootvp held.  The root file system is never unmounted.
 		 */
 
 		vfs_add((struct vnode *)0, vfsp,
 		    (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0);
 out:
 		vfs_unlock(vfsp);
 		return (error);
 	} else if (why == ROOT_REMOUNT) {
 		readonly_changed_cb(vfsp->vfs_data, B_FALSE);
 		vfsp->vfs_flag |= VFS_REMOUNT;
 
 		/* refresh mount options */
 		zfs_unregister_callbacks(vfsp->vfs_data);
 		return (zfs_register_callbacks(vfsp));
 
 	} else if (why == ROOT_UNMOUNT) {
 		zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data);
 		(void) zfs_sync(vfsp, 0, 0);
 		return (0);
 	}
 
 	/*
 	 * if "why" is equal to anything else other than ROOT_INIT,
 	 * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it.
 	 */
 	return (ENOTSUP);
 }
 #endif	/* OPENSOLARIS_MOUNTROOT */
 
 /*
  * Copy the pool component of the dataset name 'osname' -- everything
  * before the first '/', or the whole name when there is no '/' -- into
  * 'poolname' as a NUL-terminated string.
  *
  * Returns ENAMETOOLONG, without writing to 'poolname', if the component
  * is MAXNAMELEN characters or longer; otherwise returns 0.
  */
 static int
 getpoolname(const char *osname, char *poolname)
 {
 	const char *slash = strchr(osname, '/');
 	size_t len;
 
 	if (slash != NULL)
 		len = (size_t)(slash - osname);
 	else
 		len = strlen(osname);
 
 	if (len >= MAXNAMELEN)
 		return (ENAMETOOLONG);
 
 	/* The first 'len' bytes contain no NUL, so terminate explicitly. */
 	(void) strncpy(poolname, osname, len);
 	poolname[len] = '\0';
 	return (0);
 }
 
 /*
  * VFS mount entry point.
  *
  * Takes the dataset name from the "from" mount option, checks that the
  * caller may mount it (jail permission, mount privilege or delegated
  * "mount" permission plus ownership of / write access to the mount
  * point, and zone visibility), and then either refreshes the property
  * callbacks for a remount or mounts the dataset through zfs_domount().
  * For an initial root mount the root pool is imported first.
  *
  * Returns 0 on success or an errno value.
  */
 /*ARGSUSED*/
 static int
 zfs_mount(vfs_t *vfsp)
 {
 	kthread_t	*td = curthread;
 	vnode_t		*mvp = vfsp->mnt_vnodecovered;
 	cred_t		*cr = td->td_ucred;
 	char		*osname;
 	int		error = 0;
 	int		canwrite;
 
 	if (!prison_allow(td->td_ucred, PR_ALLOW_MOUNT_ZFS))
 		return (EPERM);
 
 	if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL))
 		return (EINVAL);
 
 	/*
 	 * If full-owner-access is enabled and delegated administration is
 	 * turned on, we must set nosuid.
 	 */
 	if (zfs_super_owner &&
 	    dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) {
 		secpolicy_fs_mount_clearopts(cr, vfsp);
 	}
 
 	/*
 	 * Check for mount privilege?
 	 *
 	 * If we don't have privilege then see if
 	 * we have local permission to allow it
 	 */
 	error = secpolicy_fs_mount(cr, mvp, vfsp);
 	if (error) {
 		/*
 		 * Every 'goto out' inside this branch returns the error
 		 * from secpolicy_fs_mount(), which is still in 'error'.
 		 */
 		if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0)
 			goto out;
 
 		if (!(vfsp->vfs_flag & MS_REMOUNT)) {
 			vattr_t		vattr;
 
 			/*
 			 * Make sure user is the owner of the mount point
 			 * or has sufficient privileges.
 			 */
 
 			vattr.va_mask = AT_UID;
 
 			vn_lock(mvp, LK_SHARED | LK_RETRY);
 			if (VOP_GETATTR(mvp, &vattr, cr)) {
 				VOP_UNLOCK(mvp, 0);
 				goto out;
 			}
 
 			if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 &&
 			    VOP_ACCESS(mvp, VWRITE, cr, td) != 0) {
 				VOP_UNLOCK(mvp, 0);
 				goto out;
 			}
 			VOP_UNLOCK(mvp, 0);
 		}
 
 		secpolicy_fs_mount_clearopts(cr, vfsp);
 	}
 
 	/*
 	 * Refuse to mount a filesystem if we are in a local zone and the
 	 * dataset is not visible.
 	 */
 	if (!INGLOBALZONE(curthread) &&
 	    (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
 		error = EPERM;
 		goto out;
 	}
 
 #ifdef SECLABEL
 	error = zfs_mount_label_policy(vfsp, osname);
 	if (error)
 		goto out;
 #endif
 
 	vfsp->vfs_flag |= MNT_NFS4ACLS;
 
 	/*
 	 * When doing a remount, we simply refresh our temporary properties
 	 * according to those options set in the current VFS options.
 	 */
 	if (vfsp->vfs_flag & MS_REMOUNT) {
 		/* refresh mount options */
 		zfs_unregister_callbacks(vfsp->vfs_data);
 		error = zfs_register_callbacks(vfsp);
 		goto out;
 	}
 
 	/* Initial root mount: try hard to import the requested root pool. */
 	if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 &&
 	    (vfsp->vfs_flag & MNT_UPDATE) == 0) {
 		char pname[MAXNAMELEN];
 
 		error = getpoolname(osname, pname);
 		if (error == 0)
 			error = spa_import_rootpool(pname);
 		if (error)
 			goto out;
 	}
 	/* The actual mount runs with Giant dropped. */
 	DROP_GIANT();
 	error = zfs_domount(vfsp, osname);
 	PICKUP_GIANT();
 
 #ifdef sun
 	/*
 	 * Add an extra VFS_HOLD on our parent vfs so that it can't
 	 * disappear due to a forced unmount.
 	 */
 	if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap)
 		VFS_HOLD(mvp->v_vfsp);
 #endif	/* sun */
 
 out:
 	return (error);
 }
 
 /*
  * VFS statfs entry point: fill 'statp' with the space and object usage
  * of the mounted dataset.  Returns 0 on the path shown here.
  *
  * NOTE(review): ZFS_ENTER() is a macro and presumably returns early with
  * its own error when the filesystem is unmounted -- confirm.
  */
 static int
 zfs_statfs(vfs_t *vfsp, struct statfs *statp)
 {
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 	uint64_t used_bytes, free_bytes, used_objs, free_objs;
 
 	statp->f_version = STATFS_VERSION;
 
 	ZFS_ENTER(zfsvfs);
 
 	dmu_objset_space(zfsvfs->z_os, &used_bytes, &free_bytes,
 	    &used_objs, &free_objs);
 
 	/*
 	 * A storage pool uses several block sizes.  The smallest
 	 * supported block size is reported as f_bsize, and the value
 	 * cached in the mount's statfs is reported as the preferred
 	 * I/O size.
 	 */
 	statp->f_bsize = SPA_MINBLOCKSIZE;
 	statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize;
 
 	/*
 	 * Block counts are all expressed in units of the smallest
 	 * block size.  There is no root reservation, so the blocks
 	 * available to unprivileged users equal the free blocks.
 	 */
 	statp->f_blocks = (used_bytes + free_bytes) >> SPA_MINBLOCKSHIFT;
 	statp->f_bfree = free_bytes / statp->f_bsize;
 	statp->f_bavail = statp->f_bfree;
 
 	/*
 	 * ZFS doesn't preallocate files, so there is no fixed inode
 	 * count to report.  Free files are the smaller of the number of
 	 * available objects and the number of free blocks (each object
 	 * takes at least a block); total files are that plus the
 	 * objects already in use.
 	 */
 	statp->f_ffree = MIN(free_objs, statp->f_bfree);
 	statp->f_files = statp->f_ffree + used_objs;
 
 	/* Identify the filesystem type and where it is mounted. */
 	(void) strlcpy(statp->f_fstypename, "zfs",
 	    sizeof(statp->f_fstypename));
 	(void) strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
 	    sizeof(statp->f_mntfromname));
 	(void) strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
 	    sizeof(statp->f_mntonname));
 
 	statp->f_namemax = ZFS_MAXNAMELEN;
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /*
  * Lock the vnode 'vp' with the given lock flags and return the result of
  * vn_lock().  'vp' must not be NULL.
  */
 int
 zfs_vnode_lock(vnode_t *vp, int flags)
 {
 	ASSERT(vp != NULL);
 
 	return (vn_lock(vp, flags));
 }
 
 /*
  * VFS root entry point: look up the root znode, lock its vnode with
  * 'flags' and return it through 'vpp' with VV_ROOT set.  On any failure
  * *vpp is set to NULL and the error is returned.
  */
 static int
 zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
 {
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 	znode_t *rootzp;
 	int rc;
 
 	ZFS_ENTER_NOERROR(zfsvfs);
 
 	rc = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
 	if (rc == 0)
 		*vpp = ZTOV(rootzp);
 
 	ZFS_EXIT(zfsvfs);
 
 	/* The vnode lock is taken only after leaving the filesystem. */
 	if (rc == 0)
 		rc = zfs_vnode_lock(*vpp, flags);
 
 	if (rc != 0) {
 		*vpp = NULL;
 		return (rc);
 	}
 
 	(*vpp)->v_vflag |= VV_ROOT;
 	return (0);
 }
 
 /*
  * Teardown the zfsvfs::z_os.
  *
  * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
  * and 'z_teardown_inactive_lock' held -- except on the EIO path below,
  * where both locks are dropped before returning.
  *
  * Returns EIO if we are not unmounting and the filesystem was already
  * unmounted or has no objset; otherwise returns 0.
  */
 static int
 zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
 {
 	znode_t	*zp;
 
 	rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
 
 	if (!unmounting) {
 		/*
 		 * We purge the parent filesystem's vfsp as the parent
 		 * filesystem and all of its snapshots have their vnode's
 		 * v_vfsp set to the parent's filesystem's vfsp.  Note,
 		 * 'z_parent' is self referential for non-snapshots.
 		 */
 		(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
 #ifdef FREEBSD_NAMECACHE
 		cache_purgevfs(zfsvfs->z_parent->z_vfs);
 #endif
 	}
 
 	/*
 	 * Close the zil. NB: Can't close the zil while zfs_inactive
 	 * threads are blocked as zil_close can call zfs_inactive.
 	 */
 	if (zfsvfs->z_log) {
 		zil_close(zfsvfs->z_log);
 		zfsvfs->z_log = NULL;
 	}
 
 	/* Taken only after the zil is closed; see the note above. */
 	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);
 
 	/*
 	 * If we are not unmounting (ie: online recv) and someone already
 	 * unmounted this file system while we were doing the switcheroo,
 	 * or a reopen of z_os failed then just bail out now.
 	 */
 	if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
 		rw_exit(&zfsvfs->z_teardown_inactive_lock);
 		rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
 		return (EIO);
 	}
 
 	/*
 	 * At this point there are no vops active, and any new vops will
 	 * fail with EIO since we have z_teardown_lock for writer (only
 	 * relevant for forced unmount).
 	 *
 	 * Release all holds on dbufs.
 	 */
 	mutex_enter(&zfsvfs->z_znodes_lock);
 	for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
 	    zp = list_next(&zfsvfs->z_all_znodes, zp))
 		if (zp->z_sa_hdl) {
 			ASSERT(ZTOV(zp)->v_count >= 0);
 			zfs_znode_dmu_fini(zp);
 		}
 	mutex_exit(&zfsvfs->z_znodes_lock);
 
 	/*
 	 * If we are unmounting, set the unmounted flag and let new vops
 	 * unblock.  zfs_inactive will have the unmounted behavior, and all
 	 * other vops will fail with EIO.
 	 */
 	if (unmounting) {
 		zfsvfs->z_unmounted = B_TRUE;
 		rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
 		rw_exit(&zfsvfs->z_teardown_inactive_lock);
 	}
 
 	/*
 	 * z_os will be NULL if there was an error in attempting to reopen
 	 * zfsvfs, so just return as the properties had already been
 	 * unregistered and cached data had been evicted before.
 	 */
 	if (zfsvfs->z_os == NULL)
 		return (0);
 
 	/*
 	 * Unregister properties.
 	 */
 	zfs_unregister_callbacks(zfsvfs);
 
 	/*
 	 * Evict cached data
 	 */
 	/* A dirty dataset on a writable mount is synced out first. */
 	if (dsl_dataset_is_dirty(dmu_objset_ds(zfsvfs->z_os)) &&
 	    !(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY))
 		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
 	(void) dmu_objset_evict_dbufs(zfsvfs->z_os);
 
 	return (0);
 }
 
 /*
  * VFS unmount entry point.
  *
  * After the permission check this unmounts any snapshots mounted under
  * the control directory, flushes the filesystem's vnodes, tears down the
  * zfsvfs, disowns the objset and frees the vfs.  With MS_FORCE in 'fflag'
  * the filesystem is marked unmounted before the flush and vnodes are
  * flushed with FORCECLOSE; without it a busy filesystem yields EBUSY.
  *
  * Returns 0 on success or an errno value.
  */
 /*ARGSUSED*/
 static int
 zfs_umount(vfs_t *vfsp, int fflag)
 {
 	kthread_t *td = curthread;
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 	objset_t *os;
 	cred_t *cr = td->td_ucred;
 	int ret;
 
 	/* Lacking the privilege, fall back to the delegated "mount" permission. */
 	ret = secpolicy_fs_unmount(cr, vfsp);
 	if (ret) {
 		if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource),
 		    ZFS_DELEG_PERM_MOUNT, cr))
 			return (ret);
 	}
 
 	/*
 	 * We purge the parent filesystem's vfsp as the parent filesystem
 	 * and all of its snapshots have their vnode's v_vfsp set to the
 	 * parent's filesystem's vfsp.  Note, 'z_parent' is self
 	 * referential for non-snapshots.
 	 */
 	(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
 
 	/*
 	 * Unmount any snapshots mounted under .zfs before unmounting the
 	 * dataset itself.
 	 */
 	if (zfsvfs->z_ctldir != NULL) {
 		if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
 			return (ret);
 		/* This non-forced flush is expected to report EBUSY. */
 		ret = vflush(vfsp, 0, 0, td);
 		ASSERT(ret == EBUSY);
 		if (!(fflag & MS_FORCE)) {
 			if (zfsvfs->z_ctldir->v_count > 1)
 				return (EBUSY);
 			ASSERT(zfsvfs->z_ctldir->v_count == 1);
 		}
 		zfsctl_destroy(zfsvfs);
 		ASSERT(zfsvfs->z_ctldir == NULL);
 	}
 
 	if (fflag & MS_FORCE) {
 		/*
 		 * Mark file system as unmounted before calling
 		 * vflush(FORCECLOSE). This way we ensure no future vnops
 		 * will be called and risk operating on DOOMED vnodes.
 		 */
 		rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
 		zfsvfs->z_unmounted = B_TRUE;
 		rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
 	}
 
 	/*
 	 * Flush all the files.
 	 */
 	ret = vflush(vfsp, 1, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
 	if (ret != 0) {
 		/*
 		 * The unmount is being abandoned: recreate the control
 		 * directory that was destroyed above.
 		 */
 		if (!zfsvfs->z_issnap) {
 			zfsctl_create(zfsvfs);
 			ASSERT(zfsvfs->z_ctldir != NULL);
 		}
 		return (ret);
 	}
 
 	if (!(fflag & MS_FORCE)) {
 		/*
 		 * Check the number of active vnodes in the file system.
 		 * Our count is maintained in the vfs structure, but the
 		 * number is off by 1 to indicate a hold on the vfs
 		 * structure itself.
 		 *
 		 * The '.zfs' directory maintains a reference of its
 		 * own, and any active references underneath are
 		 * reflected in the vnode count.
 		 */
 		if (zfsvfs->z_ctldir == NULL) {
 			if (vfsp->vfs_count > 1)
 				return (EBUSY);
 		} else {
 			if (vfsp->vfs_count > 2 ||
 			    zfsvfs->z_ctldir->v_count > 1)
 				return (EBUSY);
 		}
 	}
 
 	VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
 	os = zfsvfs->z_os;
 
 	/*
 	 * z_os will be NULL if there was an error in
 	 * attempting to reopen zfsvfs.
 	 */
 	if (os != NULL) {
 		/*
 		 * Unset the objset user_ptr.
 		 */
 		mutex_enter(&os->os_user_ptr_lock);
 		dmu_objset_set_user(os, NULL);
 		mutex_exit(&os->os_user_ptr_lock);
 
 		/*
 		 * Finally release the objset
 		 */
 		dmu_objset_disown(os, zfsvfs);
 	}
 
 	/*
 	 * We can now safely destroy the '.zfs' directory node.
 	 */
 	if (zfsvfs->z_ctldir != NULL)
 		zfsctl_destroy(zfsvfs);
 	if (zfsvfs->z_issnap) {
 		vnode_t *svp = vfsp->mnt_vnodecovered;
 
 		/*
 		 * NOTE(review): presumably drops an extra hold on the
 		 * covered vnode taken when the snapshot was mounted --
 		 * confirm against the snapshot mount path.
 		 */
 		if (svp->v_count >= 2)
 			VN_RELE(svp);
 	}
 	zfs_freevfs(vfsp);
 
 	return (0);
 }
 
 /*
  * VFS vget entry point: return, locked with 'flags', the vnode for the
  * object number 'ino'.
  *
  * Returns EOPNOTSUPP for the control-directory entries, EINVAL for an
  * unlinked znode, or the error from zfs_zget() / zfs_vnode_lock().  On
  * failure after the filesystem has been entered, *vpp is set to NULL.
  */
 static int
 zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
 {
 	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
 	znode_t		*zp;
 	int		rc;
 
 	/*
 	 * Virtual entries such as .zfs/ and .zfs/snapshot/ cannot be
 	 * handled by zfs_zget().  Returning EOPNOTSUPP makes NFS fall
 	 * back to LOOKUP instead of using VGET.
 	 */
 	if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR)
 		return (EOPNOTSUPP);
 	if (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir)
 		return (EOPNOTSUPP);
 
 	ZFS_ENTER(zfsvfs);
 	rc = zfs_zget(zfsvfs, ino, &zp);
 	if (rc == 0) {
 		if (zp->z_unlinked) {
 			/* Unlinked objects are not handed out. */
 			VN_RELE(ZTOV(zp));
 			rc = EINVAL;
 		} else {
 			*vpp = ZTOV(zp);
 		}
 	}
 	ZFS_EXIT(zfsvfs);
 
 	if (rc == 0)
 		rc = zfs_vnode_lock(*vpp, flags);
 
 	if (rc == 0)
 		(*vpp)->v_hash = ino;
 	else
 		*vpp = NULL;
 
 	return (rc);
 }
 
 /*
  * VFS checkexp entry point: run the standard export check against the
  * parent filesystem's vfs and return its result.
  */
 static int
 zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
     struct ucred **credanonp, int *numsecflavors, int **secflavors)
 {
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 	vfs_t *export_vfsp;
 
 	/*
 	 * For a regular file system z_parent->z_vfs is vfsp itself.  For
 	 * a snapshot it is the parent file system's vfs, which is the one
 	 * we have to check because only it has mnt_export configured.
 	 */
 	export_vfsp = zfsvfs->z_parent->z_vfs;
 
 	return (vfs_stdcheckexp(export_vfsp, nam, extflagsp, credanonp,
 	    numsecflavors, secflavors));
 }
 
 CTASSERT(SHORT_FID_LEN <= sizeof(struct fid));
 CTASSERT(LONG_FID_LEN <= sizeof(struct fid));
 
 /*
  * VFS fhtovp entry point: translate the file handle 'fidp' into a locked
  * vnode returned through 'vpp'.
  *
  * Short and long fids are accepted.  A long fid presented to a
  * non-snapshot filesystem also carries an objset id, which is used to
  * switch to that objset's zfsvfs first.  Handles that refer to the
  * control directory tree are resolved through z_ctldir.
  *
  * Returns EINVAL for a malformed or stale handle (unknown objset, bad
  * fid length, unlinked znode or generation mismatch), or the error from
  * zfs_zget() / zfs_vnode_lock().  *vpp is NULL on failure.
  */
 static int
 zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
 {
 	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
 	znode_t		*zp;
 	uint64_t	object = 0;
 	uint64_t	fid_gen = 0;
 	uint64_t	gen_mask;
 	uint64_t	zp_gen;
 	int 		i, err;
 
 	*vpp = NULL;
 
 	ZFS_ENTER(zfsvfs);
 
 	/*
 	 * On FreeBSD we can get snapshot's mount point or its parent file
 	 * system mount point depending if snapshot is already mounted or not.
 	 */
 	if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) {
 		zfid_long_t	*zlfid = (zfid_long_t *)fidp;
 		uint64_t	objsetid = 0;
 		uint64_t	setgen = 0;
 
 		/* The ids are stored as byte arrays, least significant byte first. */
 		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
 			objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
 
 		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
 			setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
 
 		ZFS_EXIT(zfsvfs);
 
 		/* From here on 'zfsvfs' refers to the looked-up objset. */
 		err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
 		if (err)
 			return (EINVAL);
 		ZFS_ENTER(zfsvfs);
 	}
 
 	if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
 		zfid_short_t	*zfid = (zfid_short_t *)fidp;
 
 		for (i = 0; i < sizeof (zfid->zf_object); i++)
 			object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
 
 		for (i = 0; i < sizeof (zfid->zf_gen); i++)
 			fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
 	} else {
 		ZFS_EXIT(zfsvfs);
 		return (EINVAL);
 	}
 
 	/*
 	 * A zero fid_gen means we are in .zfs or the .zfs/snapshot
 	 * directory tree. If the object == zfsvfs->z_shares_dir, then
 	 * we are in the .zfs/shares directory tree.
 	 */
 	if ((fid_gen == 0 &&
 	     (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) ||
 	    (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) {
 		*vpp = zfsvfs->z_ctldir;
 		ASSERT(*vpp != NULL);
 		if (object == ZFSCTL_INO_SNAPDIR) {
 			VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL,
 			    0, NULL, NULL, NULL, NULL, NULL) == 0);
 		} else if (object == zfsvfs->z_shares_dir) {
 			VERIFY(zfsctl_root_lookup(*vpp, "shares", vpp, NULL,
 			    0, NULL, NULL, NULL, NULL, NULL) == 0);
 		} else {
 			VN_HOLD(*vpp);
 		}
 		ZFS_EXIT(zfsvfs);
 		err = zfs_vnode_lock(*vpp, flags);
 		if (err != 0)
 			*vpp = NULL;
 		return (err);
 	}
 
 	/*
 	 * 'i' still holds sizeof (zfid->zf_gen) from the last loop above,
 	 * so the mask keeps exactly as many low-order bytes of the znode's
 	 * generation as the fid carries.
 	 */
 	gen_mask = -1ULL >> (64 - 8 * i);
 
 	dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
 	if (err = zfs_zget(zfsvfs, object, &zp)) {
 		ZFS_EXIT(zfsvfs);
 		return (err);
 	}
 	(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
 	    sizeof (uint64_t));
 	zp_gen = zp_gen & gen_mask;
 	/* A masked generation of 0 is compared as 1. */
 	if (zp_gen == 0)
 		zp_gen = 1;
 	if (zp->z_unlinked || zp_gen != fid_gen) {
 		dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
 		VN_RELE(ZTOV(zp));
 		ZFS_EXIT(zfsvfs);
 		return (EINVAL);
 	}
 
 	*vpp = ZTOV(zp);
 	ZFS_EXIT(zfsvfs);
 	err = zfs_vnode_lock(*vpp, flags | LK_RETRY);
 	if (err == 0)
 		vnode_create_vobject(*vpp, zp->z_size, curthread);
 	else
 		*vpp = NULL;
 	return (err);
 }
 
 /*
  * Block out VOPs and close zfsvfs_t::z_os
  *
  * On success (return value 0) the objset has been disowned and the
  * caller holds both 'z_teardown_lock' and 'z_teardown_inactive_lock'
  * for writing.  If the teardown fails, its error is returned and the
  * objset is left alone.
  */
 int
 zfs_suspend_fs(zfsvfs_t *zfsvfs)
 {
 	int rc = zfsvfs_teardown(zfsvfs, B_FALSE);
 
 	if (rc == 0)
 		dmu_objset_disown(zfsvfs->z_os, zfsvfs);
 
 	return (rc);
 }
 
 /*
  * Reopen zfsvfs_t::z_os and release VOPs.
  *
  * The caller must hold both 'z_teardown_lock' and
  * 'z_teardown_inactive_lock' as writer (asserted below).  Both locks are
  * dropped before returning, whether or not the reopen succeeded.
  *
  *	IN:	zfsvfs	- file system to resume.
  *		osname	- name of the objset to take ownership of again.
  *
  *	RETURN:	0 if success
  *		error code if failure; in that case a forced unmount of
  *		the file system is attempted as well.
  */
 int
 zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname)
 {
 	int err;
 
 	ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock));
 	ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
 
 	err = dmu_objset_own(osname, DMU_OST_ZFS, B_FALSE, zfsvfs,
 	    &zfsvfs->z_os);
 	if (err) {
 		/* Ownership could not be re-acquired; leave no objset set. */
 		zfsvfs->z_os = NULL;
 	} else {
 		znode_t *zp;
 		uint64_t sa_obj = 0;
 
 		/*
 		 * Make sure version hasn't changed
 		 */
 
 		err = zfs_get_zplprop(zfsvfs->z_os, ZFS_PROP_VERSION,
 		    &zfsvfs->z_version);
 
 		if (err)
 			goto bail;
 
 		err = zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ,
 		    ZFS_SA_ATTRS, 8, 1, &sa_obj);
 
 		/*
 		 * A failed lookup is only fatal from ZPL_VERSION_SA on;
 		 * otherwise sa_obj keeps its initial value of 0.
 		 */
 		if (err && zfsvfs->z_version >= ZPL_VERSION_SA)
 			goto bail;
 
 		if ((err = sa_setup(zfsvfs->z_os, sa_obj,
 		    zfs_attr_table,  ZPL_END, &zfsvfs->z_attr_table)) != 0)
 			goto bail;
 
 		if (zfsvfs->z_version >= ZPL_VERSION_SA)
 			sa_register_update_callback(zfsvfs->z_os,
 			    zfs_sa_upgrade);
 
 		VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
 
 		zfs_set_fuid_feature(zfsvfs);
 
 		/*
 		 * Attempt to re-establish all the active znodes with
 		 * their dbufs.  If a zfs_rezget() fails, then we'll let
 		 * any potential callers discover that via ZFS_ENTER_VERIFY_VP
 		 * when they try to use their znode.
 		 */
 		mutex_enter(&zfsvfs->z_znodes_lock);
 		for (zp = list_head(&zfsvfs->z_all_znodes); zp;
 		    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
 			(void) zfs_rezget(zp);
 		}
 		mutex_exit(&zfsvfs->z_znodes_lock);
 	}
 
 	/* Common exit: reached on success and on every failure above. */
 bail:
 	/* release the VOPs */
 	rw_exit(&zfsvfs->z_teardown_inactive_lock);
 	rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
 
 	if (err) {
 		/*
 		 * Since we couldn't reopen zfsvfs::z_os, or
 		 * setup the sa framework force unmount this file system.
 		 */
 		if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0)
 			(void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread);
 	}
 	return (err);
 }
 
 /*
  * Free the zfsvfs_t hanging off 'vfsp' and decrement
  * zfs_active_fs_count by one.
  */
 static void
 zfs_freevfs(vfs_t *vfsp)
 {
 	zfsvfs_t *fs;
 
 	fs = vfsp->vfs_data;
 
 #ifdef sun
 	/*
 	 * A snapshot carries an additional VFS_HOLD on its parent, taken in
 	 * zfs_mount(); drop it now.  The zfs_mountroot() path takes no such
 	 * hold, so rootvfs is left alone.
 	 */
 	if (vfsp != rootvfs) {
 		if (fs->z_issnap) {
 			VFS_RELE(fs->z_parent->z_vfs);
 		}
 	}
 #endif	/* sun */
 
 	zfsvfs_free(fs);
 
 	atomic_add_32(&zfs_active_fs_count, -1);
 }
 
 #ifdef __i386__
 /*
  * Value of desiredvnodes saved by zfs_vnodes_adjust() and written back
  * by zfs_vnodes_adjust_back().  Only present on i386.
  */
 static int desiredvnodes_backup;
 #endif
 
 /*
  * On i386, remember the current desiredvnodes and, if it still equals the
  * value computed below, lower it to three quarters of that value.
  * On every other architecture this function does nothing.
  */
 static void
 zfs_vnodes_adjust(void)
 {
 #ifdef __i386__
 	int autotuned;
 
 	/* Saved so that zfs_vnodes_adjust_back() can restore it. */
 	desiredvnodes_backup = desiredvnodes;
 
 	/*
 	 * Recompute the value the same way vntblinit() does.  A match with
 	 * desiredvnodes means the administrator has not tuned it, so it can
 	 * be scaled down.
 	 */
 	autotuned = min(maxproc + cnt.v_page_count / 4, 2 *
 	    vm_kmem_size / (5 * (sizeof(struct vm_object) +
 	    sizeof(struct vnode))));
 	if (autotuned == desiredvnodes)
 		desiredvnodes = (3 * autotuned) / 4;
 #endif
 }
 
 /*
  * On i386, restore desiredvnodes to the value that zfs_vnodes_adjust()
  * stored in desiredvnodes_backup.  A no-op on other architectures.
  */
 static void
 zfs_vnodes_adjust_back(void)
 {
 
 #ifdef __i386__
 	desiredvnodes = desiredvnodes_backup;
 #endif
 }
 
 /*
  * ZFS file system layer initialization: print the ZPL version string,
  * initialize the .zfs control directory and znode layers, adjust the
  * vnode limit and register zfs_space_delta_cb for DMU_OST_ZFS objsets.
  */
 void
 zfs_init(void)
 {
 
 	printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n");
 
 	/*
 	 * Initialize .zfs directory structures
 	 */
 	zfsctl_init();
 
 	/*
 	 * Initialize znode cache, vnode ops, etc...
 	 */
 	zfs_znode_init();
 
 	/*
 	 * Reduce number of vnodes. Originally number of vnodes is calculated
 	 * with UFS inode in mind. We reduce it here, because it's too big for
 	 * ZFS/i386.
 	 */
 	zfs_vnodes_adjust();
 
 	dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb);
 }
 
 /*
  * Counterpart of zfs_init(): shut down the .zfs control directory and
  * znode layers and put the vnode limit back via zfs_vnodes_adjust_back().
  */
 void
 zfs_fini(void)
 {
 	zfsctl_fini();
 	zfs_znode_fini();
 	zfs_vnodes_adjust_back();
 }
 
 /*
  * Report whether zfs_active_fs_count is non-zero.
  *
  *	RETURN:	1 if the count is non-zero
  *		0 otherwise
  */
 int
 zfs_busy(void)
 {
 	if (zfs_active_fs_count == 0)
 		return (0);
 
 	return (1);
 }
 
 /*
  * Raise the ZPL version of a file system to 'newvers'.
  *
  *	IN:	zfsvfs	- file system to upgrade.
  *		newvers	- ZPL version to switch to.
  *
  *	RETURN:	0 if success
  *		EINVAL if newvers is outside
  *		[ZPL_VERSION_INITIAL, ZPL_VERSION] or below the current
  *		z_version
  *		ENOTSUP if zfs_spa_version_map(newvers) is greater than
  *		the pool's spa_version()
  *		error code from dmu_tx_assign() or zap_update() otherwise
  *
  * On success the new version is stored in the master node and in
  * zfsvfs->z_version, and zfs_set_fuid_feature() is re-run.
  */
 int
 zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
 {
 	int error;
 	objset_t *os = zfsvfs->z_os;
 	dmu_tx_t *tx;
 
 	if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
 		return (EINVAL);
 
 	/* Downgrades are refused. */
 	if (newvers < zfsvfs->z_version)
 		return (EINVAL);
 
 	if (zfs_spa_version_map(newvers) >
 	    spa_version(dmu_objset_spa(zfsvfs->z_os)))
 		return (ENOTSUP);
 
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
 	/*
 	 * Moving to ZPL_VERSION_SA or later on a file system that does not
 	 * use SA yet also adds ZFS_SA_ATTRS to the master node and creates
 	 * a new object (see below), so hold those too.
 	 */
 	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
 		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
 		    ZFS_SA_ATTRS);
 		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
 	}
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		dmu_tx_abort(tx);
 		return (error);
 	}
 
 	error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
 	    8, 1, &newvers, tx);
 
 	if (error) {
 		/* The tx was assigned, so it is committed, not aborted. */
 		dmu_tx_commit(tx);
 		return (error);
 	}
 
 	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
 		uint64_t sa_obj;
 
 		ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
 		    SPA_VERSION_SA);
 		/* Create the SA master node and record it in the master node. */
 		sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
 		    DMU_OT_NONE, 0, tx);
 
 		error = zap_add(os, MASTER_NODE_OBJ,
 		    ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
 		ASSERT0(error);
 
 		VERIFY(0 == sa_set_sa_object(os, sa_obj));
 		sa_register_update_callback(os, zfs_sa_upgrade);
 	}
 
 	spa_history_log_internal(LOG_DS_UPGRADE,
 	    dmu_objset_spa(os), tx, "oldver=%llu newver=%llu dataset = %llu",
 	    zfsvfs->z_version, newvers, dmu_objset_id(os));
 
 	dmu_tx_commit(tx);
 
 	/* Update the in-core version only after the tx has been committed. */
 	zfsvfs->z_version = newvers;
 
 	zfs_set_fuid_feature(zfsvfs);
 
 	return (0);
 }
 
 /*
  * Fetch a property kept in the master node of an objset.
  *
  *	IN:	os	- objset to query; may be NULL, in which case only
  *			  the built-in defaults are consulted.
  *		prop	- property to read.
  *
  *	OUT:	value	- stored value, or the default when none is stored.
  *
  *	RETURN:	0 if success
  *		ENOENT if nothing is stored and the property has no default
  *		any other error code from zap_lookup()
  */
 int
 zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
 {
 	const char *key;
 	int rc;
 
 	/* The version property is stored under its own, different name. */
 	key = (prop == ZFS_PROP_VERSION) ?
 	    ZPL_VERSION_STR : zfs_prop_to_name(prop);
 
 	/* Without an objset behave as if the lookup had found nothing. */
 	rc = (os != NULL) ?
 	    zap_lookup(os, MASTER_NODE_OBJ, key, 8, 1, value) : ENOENT;
 	if (rc != ENOENT)
 		return (rc);
 
 	/* Nothing stored: fall back to the default, if there is one. */
 	switch (prop) {
 	case ZFS_PROP_VERSION:
 		*value = ZPL_VERSION;
 		break;
 	case ZFS_PROP_NORMALIZE:
 	case ZFS_PROP_UTF8ONLY:
 		*value = 0;
 		break;
 	case ZFS_PROP_CASE:
 		*value = ZFS_CASE_SENSITIVE;
 		break;
 	default:
 		return (rc);
 	}
 
 	return (0);
 }
 
 #ifdef _KERNEL
 /*
  * Walk the mount list under mountlist_mtx and rewrite f_mntfromname of
  * every mount whose name either equals 'oldname' or starts with 'oldname'
  * immediately followed by '/' or '@', replacing that leading 'oldname'
  * with 'newname'.
  */
 void
 zfsvfs_update_fromname(const char *oldname, const char *newname)
 {
 	char renamed[MAXPATHLEN];
 	struct mount *mp;
 	char *from;
 	size_t prefixlen;
 
 	prefixlen = strlen(oldname);
 
 	mtx_lock(&mountlist_mtx);
 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 		from = mp->mnt_stat.f_mntfromname;
 		if (strcmp(from, oldname) == 0) {
 			/* Exact match: replace the whole name. */
 			(void)strlcpy(from, newname,
 			    sizeof(mp->mnt_stat.f_mntfromname));
 		} else if (strncmp(from, oldname, prefixlen) == 0 &&
 		    (from[prefixlen] == '/' || from[prefixlen] == '@')) {
 			/*
 			 * Prefix match: build the new name in a scratch
 			 * buffer first, since the tail being kept lives in
 			 * the very buffer that is about to be overwritten.
 			 */
 			(void)snprintf(renamed, sizeof(renamed), "%s%s",
 			    newname, from + prefixlen);
 			(void)strlcpy(from, renamed,
 			    sizeof(mp->mnt_stat.f_mntfromname));
 		}
 	}
 	mtx_unlock(&mountlist_mtx);
 }
 #endif
Index: user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
===================================================================
--- user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c	(revision 247191)
+++ user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c	(revision 247192)
@@ -1,6893 +1,6894 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 /* Portions Copyright 2007 Jeremy Teo */
 /* Portions Copyright 2010 Robert Milkowski */
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/time.h>
 #include <sys/systm.h>
 #include <sys/sysmacros.h>
 #include <sys/resource.h>
 #include <sys/vfs.h>
 #include <sys/vm.h>
 #include <sys/vnode.h>
 #include <sys/file.h>
 #include <sys/stat.h>
 #include <sys/kmem.h>
 #include <sys/taskq.h>
 #include <sys/uio.h>
 #include <sys/atomic.h>
 #include <sys/namei.h>
 #include <sys/mman.h>
 #include <sys/cmn_err.h>
 #include <sys/errno.h>
 #include <sys/unistd.h>
 #include <sys/zfs_dir.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/fs/zfs.h>
 #include <sys/dmu.h>
 #include <sys/dmu_objset.h>
 #include <sys/spa.h>
 #include <sys/txg.h>
 #include <sys/dbuf.h>
 #include <sys/zap.h>
 #include <sys/sa.h>
 #include <sys/dirent.h>
 #include <sys/policy.h>
 #include <sys/sunddi.h>
 #include <sys/filio.h>
 #include <sys/sid.h>
 #include <sys/zfs_ctldir.h>
 #include <sys/zfs_fuid.h>
 #include <sys/zfs_sa.h>
 #include <sys/dnlc.h>
 #include <sys/zfs_rlock.h>
 #include <sys/extdirent.h>
 #include <sys/kidmap.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/sf_buf.h>
 #include <sys/sched.h>
 #include <sys/acl.h>
 #include <vm/vm_param.h>
 #include <vm/vm_pageout.h>
 
 /*
  * Programming rules.
  *
  * Each vnode op performs some logical unit of work.  To do this, the ZPL must
  * properly lock its in-core state, create a DMU transaction, do the work,
  * record this work in the intent log (ZIL), commit the DMU transaction,
  * and wait for the intent log to commit if it is a synchronous operation.
  * Moreover, the vnode ops must work in both normal and log replay context.
  * The ordering of events is important to avoid deadlocks and references
  * to freed memory.  The example below illustrates the following Big Rules:
  *
  *  (1) A check must be made in each zfs thread for a mounted file system.
  *	This is done avoiding races using ZFS_ENTER(zfsvfs).
  *      A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
  *      must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
  *      can return EIO from the calling function.
  *
  *  (2)	VN_RELE() should always be the last thing except for zil_commit()
  *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
  *	First, if it's the last reference, the vnode/znode
  *	can be freed, so the zp may point to freed memory.  Second, the last
  *	reference will call zfs_zinactive(), which may induce a lot of work --
  *	pushing cached pages (which acquires range locks) and syncing out
  *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
  *	which could deadlock the system if you were already holding one.
  *	If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
  *
  *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
  *	as they can span dmu_tx_assign() calls.
  *
  *  (4)	Always pass TXG_NOWAIT as the second argument to dmu_tx_assign().
  *	This is critical because we don't want to block while holding locks.
  *	Note, in particular, that if a lock is sometimes acquired before
  *	the tx assigns, and sometimes after (e.g. z_lock), then failing to
  *	use a non-blocking assign can deadlock the system.  The scenario:
  *
  *	Thread A has grabbed a lock before calling dmu_tx_assign().
  *	Thread B is in an already-assigned tx, and blocks for this lock.
  *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
  *	forever, because the previous txg can't quiesce until B's tx commits.
  *
  *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
  *	then drop all locks, call dmu_tx_wait(), and try again.
  *
  *  (5)	If the operation succeeded, generate the intent log entry for it
  *	before dropping locks.  This ensures that the ordering of events
  *	in the intent log matches the order in which they actually occurred.
  *      During ZIL replay the zfs_log_* functions will update the sequence
  *	number to indicate the zil transaction has replayed.
  *
  *  (6)	At the end of each vnode op, the DMU tx must always commit,
  *	regardless of whether there were any errors.
  *
  *  (7)	After dropping all locks, invoke zil_commit(zilog, foid)
  *	to ensure that synchronous semantics are provided when necessary.
  *
  * In general, this is how things should be ordered in each vnode op:
  *
  *	ZFS_ENTER(zfsvfs);		// exit if unmounted
  * top:
  *	zfs_dirent_lock(&dl, ...)	// lock directory entry (may VN_HOLD())
  *	rw_enter(...);			// grab any other locks you need
  *	tx = dmu_tx_create(...);	// get DMU tx
  *	dmu_tx_hold_*();		// hold each object you might modify
  *	error = dmu_tx_assign(tx, TXG_NOWAIT);	// try to assign
  *	if (error) {
  *		rw_exit(...);		// drop locks
  *		zfs_dirent_unlock(dl);	// unlock directory entry
  *		VN_RELE(...);		// release held vnodes
  *		if (error == ERESTART) {
  *			dmu_tx_wait(tx);
  *			dmu_tx_abort(tx);
  *			goto top;
  *		}
  *		dmu_tx_abort(tx);	// abort DMU tx
  *		ZFS_EXIT(zfsvfs);	// finished in zfs
  *		return (error);		// really out of space
  *	}
  *	error = do_real_work();		// do whatever this VOP does
  *	if (error == 0)
  *		zfs_log_*(...);		// on success, make ZIL entry
  *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
  *	rw_exit(...);			// drop locks
  *	zfs_dirent_unlock(dl);		// unlock directory entry
  *	VN_RELE(...);			// release held vnodes
  *	zil_commit(zilog, foid);	// synchronous when necessary
  *	ZFS_EXIT(zfsvfs);		// finished in zfs
  *	return (error);			// done, report error
  */
 
 /*
  * Open a ZFS vnode.
  *
  *	RETURN:	0 if success
  *		EPERM if an append-only file is opened for writing
  *		without FAPPEND
  *		EACCES if fs_vscan() rejects the file
  *
  * A successful open with FSYNC or FDSYNC bumps the znode's z_sync_cnt.
  */
 /* ARGSUSED */
 static int
 zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
 {
 	znode_t	*zp = VTOZ(*vpp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	int error = 0;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
 	    ((flag & FAPPEND) == 0)) {
 		error = EPERM;
 	} else if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
 	    ZTOV(zp)->v_type == VREG &&
 	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0 &&
 	    fs_vscan(*vpp, cr, 0) != 0) {
 		/* The scan is the last test so it only runs when eligible. */
 		error = EACCES;
 	} else if (flag & (FSYNC | FDSYNC)) {
 		/* Track the number of synchronous opens on this znode. */
 		atomic_inc_32(&zp->z_sync_cnt);
 	}
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Close a ZFS vnode.  Always returns 0 once the file system has been
  * entered.
  */
 /* ARGSUSED */
 static int
 zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
     caller_context_t *ct)
 {
 	znode_t	*zp = VTOZ(vp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 
 	/*
 	 * Drop whatever locks and shares the calling process still has
 	 * on the vnode.
 	 */
 	cleanlocks(vp, ddi_get_pid(), 0);
 	cleanshares(vp, ddi_get_pid());
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	/* Undo the synchronous-open accounting when count is 1. */
 	if (count == 1 && (flag & (FSYNC | FDSYNC)) != 0)
 		atomic_dec_32(&zp->z_sync_cnt);
 
 	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan) {
 		if (ZTOV(zp)->v_type == VREG &&
 		    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
 			VERIFY(fs_vscan(vp, cr, 1) == 0);
 		}
 	}
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /*
  * Lseek helper that locates the next hole (cmd == _FIO_SEEK_HOLE) or the
  * next data (cmd == _FIO_SEEK_DATA).  "off" carries the starting offset
  * in and the resulting offset out.
  *
  *	RETURN:	0 if success
  *		ENXIO if the starting offset is at or beyond the file size,
  *		or if data was requested and none follows
  *		error code from dmu_offset_next() otherwise
  */
 static int
 zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
 {
 	znode_t	*zp = VTOZ(vp);
 	uint64_t next = (uint64_t)*off;
 	uint64_t size = zp->z_size;
 	boolean_t want_hole;
 	int error;
 
 	if (next >= size)
 		return (ENXIO);
 
 	want_hole = (cmd == _FIO_SEEK_HOLE) ? B_TRUE : B_FALSE;
 
 	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, want_hole,
 	    &next);
 
 	/* Ran off the end of the file? */
 	if (error == ESRCH || next > size) {
 		if (!want_hole)
 			return (ENXIO);
 		/* The end of the file counts as a virtual hole. */
 		*off = size;
 		return (0);
 	}
 
 	/* Never move the caller's offset backwards. */
 	if (next >= *off)
 		*off = next;
 	return (error);
 }
 
 /*
  * Handle the ioctls understood by ZFS vnodes.
  *
  * _FIOFFS, _FIOGDIO and _FIOSDIO are accepted and do nothing.
  * _FIO_SEEK_DATA and _FIO_SEEK_HOLE hand the offset found at 'data' to
  * zfs_holey() and, on success, store the resulting offset back there.
  * Every other command yields ENOTTY.
  */
 /* ARGSUSED */
 static int
 zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
     int *rvalp, caller_context_t *ct)
 {
 	offset_t off;
 	int error;
 	zfsvfs_t *zfsvfs;
 	znode_t *zp;
 
 	switch (com) {
 	case _FIOFFS:
 		return (0);
 
 		/*
 		 * The following two ioctls are used by bfu.  Faking out,
 		 * necessary to avoid bfu errors.
 		 */
 	case _FIOGDIO:
 	case _FIOSDIO:
 		return (0);
 
 	case _FIO_SEEK_DATA:
 	case _FIO_SEEK_HOLE:
 #ifdef sun
 		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
 			return (EFAULT);
 #else
 		/* Outside of 'sun' builds 'data' is dereferenced directly. */
 		off = *(offset_t *)data;
 #endif
 		zp = VTOZ(vp);
 		zfsvfs = zp->z_zfsvfs;
 		ZFS_ENTER(zfsvfs);
 		ZFS_VERIFY_ZP(zp);
 
 		/* offset parameter is in/out */
 		error = zfs_holey(vp, com, &off);
 		ZFS_EXIT(zfsvfs);
 		if (error)
 			return (error);
 #ifdef sun
 		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
 			return (EFAULT);
 #else
 		*(offset_t *)data = off;
 #endif
 		return (0);
 	}
 	return (ENOTTY);
 }
 
 /*
  * Find or allocate the page of vp's VM object that covers offset 'start'
  * and prepare it for an update of the byte range [off, off + nbytes)
  * within that page.
  *
  * The object must be write-locked by the caller (asserted).  A valid page
  * that is VPO_BUSY is waited for and the lookup is retried.  If no valid
  * page is resident, an allocation with VM_ALLOC_IFCACHED is attempted.
  *
  *	RETURN:	the page, with one paging-in-progress reference added to
  *		the object and vm_page_io_start() called on it; release
  *		with page_unbusy()
  *		NULL if no page was found or allocated
  */
 static vm_page_t
 page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
 {
 	vm_object_t obj;
 	vm_page_t pp;
 
 	obj = vp->v_object;
 	zfs_vmobject_assert_wlocked(obj);
 
 	for (;;) {
 		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
 		    pp->valid) {
 			if ((pp->oflags & VPO_BUSY) != 0) {
 				/*
 				 * Reference the page before unlocking and
 				 * sleeping so that the page daemon is less
 				 * likely to reclaim it.
 				 */
 				vm_page_reference(pp);
 				vm_page_sleep(pp, "zfsmwb");
 				continue;
 			}
 		} else {
 			pp = vm_page_alloc(obj, OFF_TO_IDX(start),
 			    VM_ALLOC_SYSTEM | VM_ALLOC_IFCACHED |
 			    VM_ALLOC_NOBUSY);
 		}
 
 		if (pp != NULL) {
 			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
 			vm_object_pip_add(obj, 1);
 			vm_page_io_start(pp);
 			/* Revoke write access and clear the dirty bits. */
 			pmap_remove_write(pp);
 			vm_page_clear_dirty(pp, off, nbytes);
 		}
 		break;
 	}
 	return (pp);
 }
 
 /*
  * Undo page_busy(): call vm_page_io_finish() on the page and drop the
  * paging-in-progress reference from its object.
  */
 static void
 page_unbusy(vm_page_t pp)
 {
 
 	vm_page_io_finish(pp);
 	vm_object_pip_subtract(pp->object, 1);
 }
 
 /*
  * Look up the page of vp's VM object that covers offset 'start' and, if
  * it is resident and valid, take a hold on it.
  *
  * The object must be write-locked by the caller (asserted).  A valid page
  * that is VPO_BUSY is waited for and the lookup is retried.
  *
  *	RETURN:	the held page; release with page_unhold()
  *		NULL if there is no valid page at that offset
  */
 static vm_page_t
 page_hold(vnode_t *vp, int64_t start)
 {
 	vm_object_t obj;
 	vm_page_t pp;
 
 	obj = vp->v_object;
 	zfs_vmobject_assert_wlocked(obj);
 
 	for (;;) {
 		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
 		    pp->valid) {
 			if ((pp->oflags & VPO_BUSY) != 0) {
 				/*
 				 * Reference the page before unlocking and
 				 * sleeping so that the page daemon is less
 				 * likely to reclaim it.
 				 */
 				vm_page_reference(pp);
 				vm_page_sleep(pp, "zfsmwb");
 				continue;
 			}
 
 			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
 			/* The hold count is changed under the page lock. */
 			vm_page_lock(pp);
 			vm_page_hold(pp);
 			vm_page_unlock(pp);
 
 		} else
 			pp = NULL;
 		break;
 	}
 	return (pp);
 }
 
 /*
  * Drop the hold taken by page_hold(), under the page lock.
  */
 static void
 page_unhold(vm_page_t pp)
 {
 
 	vm_page_lock(pp);
 	vm_page_unhold(pp);
 	vm_page_unlock(pp);
 }
 
 /*
  * Map 'pp' through an sf_buf.
  *
  *	OUT:	sfp	- the sf_buf backing the mapping; hand it to
  *			  zfs_unmap_page() when done.
  *
  *	RETURN:	kernel virtual address of the mapping
  */
 static caddr_t
 zfs_map_page(vm_page_t pp, struct sf_buf **sfp)
 {
 	struct sf_buf *sf;
 
 	sf = sf_buf_alloc(pp, 0);
 	*sfp = sf;
 
 	return ((caddr_t)sf_buf_kva(sf));
 }
 
 /*
  * Release an sf_buf obtained from zfs_map_page().
  */
 static void
 zfs_unmap_page(struct sf_buf *sf)
 {
 
 	sf_buf_free(sf);
 }
 
 /*
  * When a file is memory mapped, we must keep the IO data synchronized
  * between the DMU cache and the memory mapped pages.  What this means:
  *
  * On Write:	If we find a memory mapped page, we write to *both*
  *		the page and the dmu buffer.
  *
  *	IN:	vp	- vnode whose VM object is updated.
  *		start	- file offset of the first byte written.
  *		len	- number of bytes written.
  *		os, oid	- objset and object that back the file.
  *		segflg	- uio segment flag of the write; UIO_NOCOPY selects
  *			  the putpages case, where the pages themselves are
  *			  the source of the data.
  *		tx	- transaction used for dmu_write() in that case.
  */
 static void
 update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
     int segflg, dmu_tx_t *tx)
 {
 	vm_object_t obj;
 	struct sf_buf *sf;
 	caddr_t va;
 	int off;
 
 	ASSERT(vp->v_mount != NULL);
 	obj = vp->v_object;
 	ASSERT(obj != NULL);
 
 	/* Only the first page can start at a non-zero offset. */
 	off = start & PAGEOFFSET;
 	zfs_vmobject_wlock(obj);
 	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
 		vm_page_t pp;
 		int nbytes = imin(PAGESIZE - off, len);
 
 		if (segflg == UIO_NOCOPY) {
 			pp = vm_page_lookup(obj, OFF_TO_IDX(start));
 			KASSERT(pp != NULL,
 			    ("zfs update_pages: NULL page in putpages case"));
 			KASSERT(off == 0,
 			    ("zfs update_pages: unaligned data in putpages case"));
 			KASSERT(pp->valid == VM_PAGE_BITS_ALL,
 			    ("zfs update_pages: invalid page in putpages case"));
 			KASSERT(pp->busy > 0,
 			    ("zfs update_pages: unbusy page in putpages case"));
 			KASSERT(!pmap_page_is_write_mapped(pp),
 			    ("zfs update_pages: writable page in putpages case"));
 			/* The object lock is dropped around the DMU call. */
 			zfs_vmobject_wunlock(obj);
 
 			/* Push the page contents into the DMU. */
 			va = zfs_map_page(pp, &sf);
 			(void) dmu_write(os, oid, start, nbytes, va, tx);
 			zfs_unmap_page(sf);
 
 			zfs_vmobject_wlock(obj);
 			vm_page_undirty(pp);
 		} else if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
 			zfs_vmobject_wunlock(obj);
 
 			/* Copy the written range from the DMU into the page. */
 			va = zfs_map_page(pp, &sf);
 			(void) dmu_read(os, oid, start+off, nbytes,
 			    va+off, DMU_READ_PREFETCH);
 			zfs_unmap_page(sf);
 
 			zfs_vmobject_wlock(obj);
 			page_unbusy(pp);
 		}
 		len -= nbytes;
 		off = 0;
 	}
 	if (segflg != UIO_NOCOPY)
 		vm_object_pip_wakeupn(obj, 0);
 	zfs_vmobject_wunlock(obj);
 }
 
 /*
  * Read with UIO_NOCOPY flag means that sendfile(2) requests
  * ZFS to populate a range of page cache pages with data.
  *
  * NOTE: this function could be optimized to pre-allocate
  * all pages in advance, drain VPO_BUSY on all of them,
  * map them into contiguous KVA region and populate them
  * in one single dmu_read() call.
  *
  *	IN:	vp	- vnode of file to be read from.
  *		nbytes	- number of bytes to populate.
  *		uio	- supplies the page-aligned starting offset
  *			  (asserted); its resid and offset are advanced
  *			  for every page processed.
  *
  *	RETURN:	0 if success
  *		error code from dmu_read() if failure
  */
 static int
 mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio)
 {
 	znode_t *zp = VTOZ(vp);
 	objset_t *os = zp->z_zfsvfs->z_os;
 	struct sf_buf *sf;
 	vm_object_t obj;
 	vm_page_t pp;
 	int64_t start;
 	caddr_t va;
 	int len = nbytes;
 	int error = 0;
 
 	ASSERT(uio->uio_segflg == UIO_NOCOPY);
 	ASSERT(vp->v_mount != NULL);
 	obj = vp->v_object;
 	ASSERT(obj != NULL);
 	ASSERT((uio->uio_loffset & PAGEOFFSET) == 0);
 
 	zfs_vmobject_wlock(obj);
 	for (start = uio->uio_loffset; len > 0; start += PAGESIZE) {
 		int bytes = MIN(PAGESIZE, len);
 
 		pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_NOBUSY |
 		    VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_IGN_SBUSY);
 		/* Only pages without any valid contents are filled in. */
 		if (pp->valid == 0) {
 			vm_page_io_start(pp);
 			/* The object lock is dropped around the DMU call. */
 			zfs_vmobject_wunlock(obj);
 			va = zfs_map_page(pp, &sf);
 			error = dmu_read(os, zp->z_id, start, bytes, va,
 			    DMU_READ_PREFETCH);
 			/* Zero the tail of a partially filled page. */
 			if (bytes != PAGESIZE && error == 0)
 				bzero(va + bytes, PAGESIZE - bytes);
 			zfs_unmap_page(sf);
 			zfs_vmobject_wlock(obj);
 			vm_page_io_finish(pp);
 			vm_page_lock(pp);
 			if (error) {
 				vm_page_free(pp);
 			} else {
 				pp->valid = VM_PAGE_BITS_ALL;
 				vm_page_activate(pp);
 			}
 			vm_page_unlock(pp);
 		}
 		if (error)
 			break;
 		uio->uio_resid -= bytes;
 		uio->uio_offset += bytes;
 		len -= bytes;
 	}
 	zfs_vmobject_wunlock(obj);
 	return (error);
 }
 
 /*
  * When a file is memory mapped, we must keep the IO data synchronized
  * between the DMU cache and the memory mapped pages.  What this means:
  *
  * On Read:	We "read" preferentially from memory mapped pages,
  *		else we default from the dmu buffer.
  *
  * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
  *	the file is memory mapped.
  *
  *	IN:	vp	- vnode of file to be read from.
  *		nbytes	- number of bytes to read.
  *		uio	- supplies the starting offset and receives the data.
  *
  *	RETURN:	0 if success
  *		error code from uiomove() or dmu_read_uio() if failure
  */
 static int
 mappedread(vnode_t *vp, int nbytes, uio_t *uio)
 {
 	znode_t *zp = VTOZ(vp);
 	objset_t *os = zp->z_zfsvfs->z_os;
 	vm_object_t obj;
 	int64_t start;
 	int len = nbytes;
 	int off;
 	int error = 0;
 
 	ASSERT(vp->v_mount != NULL);
 	obj = vp->v_object;
 	ASSERT(obj != NULL);
 
 	start = uio->uio_loffset;
 	/* Only the first page can start at a non-zero offset. */
 	off = start & PAGEOFFSET;
 	zfs_vmobject_wlock(obj);
 	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
 		vm_page_t pp;
 		uint64_t bytes = MIN(PAGESIZE - off, len);
 
 		if ((pp = page_hold(vp, start)) != NULL) {
 			struct sf_buf *sf;
 			caddr_t va;
 
 			/* A valid page is resident: copy straight from it. */
 			zfs_vmobject_wunlock(obj);
 			va = zfs_map_page(pp, &sf);
 			error = uiomove(va + off, bytes, UIO_READ, uio);
 			zfs_unmap_page(sf);
 			zfs_vmobject_wlock(obj);
 			page_unhold(pp);
 		} else {
 			/* No valid page: read this piece through the DMU. */
 			zfs_vmobject_wunlock(obj);
 			error = dmu_read_uio(os, zp->z_id, uio, bytes);
 			zfs_vmobject_wlock(obj);
 		}
 		len -= bytes;
 		off = 0;
 		if (error)
 			break;
 	}
 	zfs_vmobject_wunlock(obj);
 	return (error);
 }
 
 /*
  * Chunk size used by zfs_read(): a single pass of its read loop never
  * crosses a multiple of this value.
  */
 offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
 
 /*
  * Read bytes from specified file into supplied buffer.
  *
  *	IN:	vp	- vnode of file to be read from.
  *		uio	- structure supplying read location, range info,
  *			  and return buffer.
  *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
  *		cr	- credentials of caller.
  *		ct	- caller context
  *
  *	OUT:	uio	- updated offset and range, buffer filled.
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * Side Effects:
  *	vp - atime updated if byte count > 0
  */
 /* ARGSUSED */
 static int
 zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	objset_t	*os;
 	ssize_t		n, nbytes;
-	int		error;
+	int		error = 0;
 	rl_t		*rl;
 	xuio_t		*xuio = NULL;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 	os = zfsvfs->z_os;
 
 	/* Quarantined files cannot be read. */
 	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
 		ZFS_EXIT(zfsvfs);
 		return (EACCES);
 	}
 
 	/*
 	 * Validate file offset
 	 */
 	if (uio->uio_loffset < (offset_t)0) {
 		ZFS_EXIT(zfsvfs);
 		return (EINVAL);
 	}
 
 	/*
 	 * Fasttrack empty reads
 	 */
 	if (uio->uio_resid == 0) {
 		ZFS_EXIT(zfsvfs);
 		return (0);
 	}
 
 	/*
 	 * Check for mandatory locks
 	 */
 	if (MANDMODE(zp->z_mode)) {
 		if (error = chklock(vp, FREAD,
 		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 	}
 
 	/*
 	 * If we're in FRSYNC mode, sync out this znode before reading it.
 	 */
 	if (zfsvfs->z_log &&
 	    (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
 		zil_commit(zfsvfs->z_log, zp->z_id);
 
 	/*
 	 * Lock the range against changes.
 	 */
 	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
 
 	/*
 	 * If we are reading past end-of-file we can skip
 	 * to the end; but we might still need to set atime.
 	 */
 	if (uio->uio_loffset >= zp->z_size) {
 		error = 0;
 		goto out;
 	}
 
 	ASSERT(uio->uio_loffset < zp->z_size);
 	/* Never read beyond the current end of file. */
 	n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
 
 #ifdef sun
 	if ((uio->uio_extflg == UIO_XUIO) &&
 	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
 		int nblk;
 		int blksz = zp->z_blksz;
 		uint64_t offset = uio->uio_loffset;
 
 		xuio = (xuio_t *)uio;
 		if ((ISP2(blksz))) {
 			nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
 			    blksz)) / blksz;
 		} else {
 			ASSERT(offset + n <= blksz);
 			nblk = 1;
 		}
 		(void) dmu_xuio_init(xuio, nblk);
 
 		if (vn_has_cached_data(vp)) {
 			/*
 			 * For simplicity, we always allocate a full buffer
 			 * even if we only expect to read a portion of a block.
 			 */
 			while (--nblk >= 0) {
 				(void) dmu_xuio_add(xuio,
 				    dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
 				    blksz), 0, blksz);
 			}
 		}
 	}
 #endif	/* sun */
 
 	/*
 	 * Read in pieces, each of which ends no later than the next
 	 * multiple of zfs_read_chunk_size.
 	 */
 	while (n > 0) {
 		nbytes = MIN(n, zfs_read_chunk_size -
 		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
 
 #ifdef __FreeBSD__
 		/* UIO_NOCOPY: populate page cache pages, see mappedread_sf(). */
 		if (uio->uio_segflg == UIO_NOCOPY)
 			error = mappedread_sf(vp, nbytes, uio);
 		else
 #endif /* __FreeBSD__ */
 		if (vn_has_cached_data(vp))
 			error = mappedread(vp, nbytes, uio);
 		else
 			error = dmu_read_uio(os, zp->z_id, uio, nbytes);
 		if (error) {
 			/* convert checksum errors into IO errors */
 			if (error == ECKSUM)
 				error = EIO;
 			break;
 		}
 
 		n -= nbytes;
 	}
 	/* Reached with the range lock held, on success and on failure. */
 out:
 	zfs_range_unlock(rl);
 
 	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Write the bytes to a file.
  *
  *	IN:	vp	- vnode of file to be written to.
  *		uio	- structure supplying write location, range info,
  *			  and data buffer.
  *		ioflag	- FAPPEND flag set if in append mode.
  *		cr	- credentials of caller.
  *		ct	- caller context (NFS/CIFS fem monitor only)
  *
  *	OUT:	uio	- updated offset and range.
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * Timestamps:
  *	vp - ctime|mtime updated if byte count > 0
  */
 
 /* ARGSUSED */
 static int
 zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
 {
 	znode_t		*zp = VTOZ(vp);
 	rlim64_t	limit = MAXOFFSET_T;
 	ssize_t		start_resid = uio->uio_resid;
 	ssize_t		tx_bytes;
 	uint64_t	end_size;
 	dmu_tx_t	*tx;
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	zilog_t		*zilog;
 	offset_t	woff;
 	ssize_t		n, nbytes;
 	rl_t		*rl;
 	int		max_blksz = zfsvfs->z_max_blksz;
-	int		error;
+	int		error = 0;
 	arc_buf_t	*abuf;
-	iovec_t		*aiov;
+	iovec_t		*aiov = NULL;
 	xuio_t		*xuio = NULL;
 	int		i_iov = 0;
 	int		iovcnt = uio->uio_iovcnt;
 	iovec_t		*iovp = uio->uio_iov;
 	int		write_eof;
 	int		count = 0;
 	sa_bulk_attr_t	bulk[4];
 	uint64_t	mtime[2], ctime[2];
 
 	/*
 	 * Fasttrack empty write
 	 */
 	n = start_resid;
 	if (n == 0)
 		return (0);
 
 	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
 		limit = MAXOFFSET_T;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
 	    &zp->z_size, 8);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
 	    &zp->z_pflags, 8);
 
 	/*
 	 * If immutable or not appending then return EPERM
 	 */
 	if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
 	    ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
 	    (uio->uio_loffset < zp->z_size))) {
 		ZFS_EXIT(zfsvfs);
 		return (EPERM);
 	}
 
 	zilog = zfsvfs->z_log;
 
 	/*
 	 * Validate file offset
 	 */
 	woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
 	if (woff < 0) {
 		ZFS_EXIT(zfsvfs);
 		return (EINVAL);
 	}
 
 	/*
 	 * Check for mandatory locks before calling zfs_range_lock()
 	 * in order to prevent a deadlock with locks set via fcntl().
 	 */
 	if (MANDMODE((mode_t)zp->z_mode) &&
 	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 #ifdef sun
 	/*
 	 * Pre-fault the pages to ensure slow (eg NFS) pages
 	 * don't hold up txg.
 	 * Skip this if uio contains loaned arc_buf.
 	 */
 	if ((uio->uio_extflg == UIO_XUIO) &&
 	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
 		xuio = (xuio_t *)uio;
 	else
 		uio_prefaultpages(MIN(n, max_blksz), uio);
 #endif	/* sun */
 
 	/*
 	 * If in append mode, set the io offset pointer to eof.
 	 */
 	if (ioflag & FAPPEND) {
 		/*
 		 * Obtain an appending range lock to guarantee file append
 		 * semantics.  We reset the write offset once we have the lock.
 		 */
 		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
 		woff = rl->r_off;
 		if (rl->r_len == UINT64_MAX) {
 			/*
 			 * We overlocked the file because this write will cause
 			 * the file block size to increase.
 			 * Note that zp_size cannot change with this lock held.
 			 */
 			woff = zp->z_size;
 		}
 		uio->uio_loffset = woff;
 	} else {
 		/*
 		 * Note that if the file block size will change as a result of
 		 * this write, then this range lock will lock the entire file
 		 * so that we can re-write the block safely.
 		 */
 		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
 	}
 
 	if (vn_rlimit_fsize(vp, uio, uio->uio_td)) {
 		zfs_range_unlock(rl);
 		ZFS_EXIT(zfsvfs);
 		return (EFBIG);
 	}
 
 	if (woff >= limit) {
 		zfs_range_unlock(rl);
 		ZFS_EXIT(zfsvfs);
 		return (EFBIG);
 	}
 
 	if ((woff + n) > limit || woff > (limit - n))
 		n = limit - woff;
 
 	/* Will this write extend the file length? */
 	write_eof = (woff + n > zp->z_size);
 
 	end_size = MAX(zp->z_size, woff + n);
 
 	/*
 	 * Write the file in reasonable size chunks.  Each chunk is written
 	 * in a separate transaction; this keeps the intent log records small
 	 * and allows us to do more fine-grained space accounting.
 	 */
 	while (n > 0) {
 		abuf = NULL;
 		woff = uio->uio_loffset;
 again:
 		if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
 		    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
 			if (abuf != NULL)
 				dmu_return_arcbuf(abuf);
 			error = EDQUOT;
 			break;
 		}
 
 		if (xuio && abuf == NULL) {
 			ASSERT(i_iov < iovcnt);
 			aiov = &iovp[i_iov];
 			abuf = dmu_xuio_arcbuf(xuio, i_iov);
 			dmu_xuio_clear(xuio, i_iov);
 			DTRACE_PROBE3(zfs_cp_write, int, i_iov,
 			    iovec_t *, aiov, arc_buf_t *, abuf);
 			ASSERT((aiov->iov_base == abuf->b_data) ||
 			    ((char *)aiov->iov_base - (char *)abuf->b_data +
 			    aiov->iov_len == arc_buf_size(abuf)));
 			i_iov++;
 		} else if (abuf == NULL && n >= max_blksz &&
 		    woff >= zp->z_size &&
 		    P2PHASE(woff, max_blksz) == 0 &&
 		    zp->z_blksz == max_blksz) {
 			/*
 			 * This write covers a full block.  "Borrow" a buffer
 			 * from the dmu so that we can fill it before we enter
 			 * a transaction.  This avoids the possibility of
 			 * holding up the transaction if the data copy hangs
 			 * up on a pagefault (e.g., from an NFS server mapping).
 			 */
 			size_t cbytes;
 
 			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
 			    max_blksz);
 			ASSERT(abuf != NULL);
 			ASSERT(arc_buf_size(abuf) == max_blksz);
 			if (error = uiocopy(abuf->b_data, max_blksz,
 			    UIO_WRITE, uio, &cbytes)) {
 				dmu_return_arcbuf(abuf);
 				break;
 			}
 			ASSERT(cbytes == max_blksz);
 		}
 
 		/*
 		 * Start a transaction.
 		 */
 		tx = dmu_tx_create(zfsvfs->z_os);
 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
 		zfs_sa_upgrade_txholds(tx, zp);
 		error = dmu_tx_assign(tx, TXG_NOWAIT);
 		if (error) {
 			if (error == ERESTART) {
 				dmu_tx_wait(tx);
 				dmu_tx_abort(tx);
 				goto again;
 			}
 			dmu_tx_abort(tx);
 			if (abuf != NULL)
 				dmu_return_arcbuf(abuf);
 			break;
 		}
 
 		/*
 		 * If zfs_range_lock() over-locked we grow the blocksize
 		 * and then reduce the lock range.  This will only happen
 		 * on the first iteration since zfs_range_reduce() will
 		 * shrink down r_len to the appropriate size.
 		 */
 		if (rl->r_len == UINT64_MAX) {
 			uint64_t new_blksz;
 
 			if (zp->z_blksz > max_blksz) {
 				ASSERT(!ISP2(zp->z_blksz));
 				new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
 			} else {
 				new_blksz = MIN(end_size, max_blksz);
 			}
 			zfs_grow_blocksize(zp, new_blksz, tx);
 			zfs_range_reduce(rl, woff, n);
 		}
 
 		/*
 		 * XXX - should we really limit each write to z_max_blksz?
 		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
 		 */
 		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
 
 		if (woff + nbytes > zp->z_size)
 			vnode_pager_setsize(vp, woff + nbytes);
 
 		if (abuf == NULL) {
 			tx_bytes = uio->uio_resid;
 			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
 			    uio, nbytes, tx);
 			tx_bytes -= uio->uio_resid;
 		} else {
 			tx_bytes = nbytes;
 			ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
 			/*
 			 * If this is not a full block write, but we are
 			 * extending the file past EOF and this data starts
 			 * block-aligned, use assign_arcbuf().  Otherwise,
 			 * write via dmu_write().
 			 */
 			if (tx_bytes < max_blksz && (!write_eof ||
 			    aiov->iov_base != abuf->b_data)) {
 				ASSERT(xuio);
 				dmu_write(zfsvfs->z_os, zp->z_id, woff,
 				    aiov->iov_len, aiov->iov_base, tx);
 				dmu_return_arcbuf(abuf);
 				xuio_stat_wbuf_copied();
 			} else {
 				ASSERT(xuio || tx_bytes == max_blksz);
 				dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
 				    woff, abuf, tx);
 			}
 			ASSERT(tx_bytes <= uio->uio_resid);
 			uioskip(uio, tx_bytes);
 		}
 		if (tx_bytes && vn_has_cached_data(vp)) {
 			update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
 			    zp->z_id, uio->uio_segflg, tx);
 		}
 
 		/*
 		 * If we made no progress, we're done.  If we made even
 		 * partial progress, update the znode and ZIL accordingly.
 		 */
 		if (tx_bytes == 0) {
 			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
 			    (void *)&zp->z_size, sizeof (uint64_t), tx);
 			dmu_tx_commit(tx);
 			ASSERT(error != 0);
 			break;
 		}
 
 		/*
 		 * Clear Set-UID/Set-GID bits on successful write if not
 		 * privileged and at least one of the execute bits is set.
 		 *
 		 * It would be nice to do this after all writes have
 		 * been done, but that would still expose the ISUID/ISGID
 		 * to another app after the partial write is committed.
 		 *
 		 * Note: we don't call zfs_fuid_map_id() here because
 		 * user 0 is not an ephemeral uid.
 		 */
 		mutex_enter(&zp->z_acl_lock);
 		if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
 		    (S_IXUSR >> 6))) != 0 &&
 		    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
 		    secpolicy_vnode_setid_retain(vp, cr,
 		    (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
 			uint64_t newmode;
 			zp->z_mode &= ~(S_ISUID | S_ISGID);
 			newmode = zp->z_mode;
 			(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
 			    (void *)&newmode, sizeof (uint64_t), tx);
 		}
 		mutex_exit(&zp->z_acl_lock);
 
 		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
 		    B_TRUE);
 
 		/*
 		 * Update the file size (zp_size) if it has changed;
 		 * account for possible concurrent updates.
 		 */
 		while ((end_size = zp->z_size) < uio->uio_loffset) {
 			(void) atomic_cas_64(&zp->z_size, end_size,
 			    uio->uio_loffset);
 			ASSERT(error == 0);
 		}
 		/*
 		 * If we are replaying and eof is non zero then force
 		 * the file size to the specified eof. Note, there's no
 		 * concurrency during replay.
 		 */
 		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
 			zp->z_size = zfsvfs->z_replay_eof;
 
 		error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 
 		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
 		dmu_tx_commit(tx);
 
 		if (error != 0)
 			break;
 		ASSERT(tx_bytes == nbytes);
 		n -= nbytes;
 
 #ifdef sun
 		if (!xuio && n > 0)
 			uio_prefaultpages(MIN(n, max_blksz), uio);
 #endif	/* sun */
 	}
 
 	zfs_range_unlock(rl);
 
 	/*
 	 * If we're in replay mode, or we made no progress, return error.
 	 * Otherwise, it's at least a partial write, so it's successful.
 	 */
 	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	if (ioflag & (FSYNC | FDSYNC) ||
 	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, zp->z_id);
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /*
  * Tear down a zgd_t once its data has been fetched or synced.
  *
  * Drops the dbuf hold (if one was taken), releases the range lock,
  * gives up the vnode reference and frees the zgd itself.  When error
  * is zero and a block pointer was recorded, that block is handed to
  * zil_add_block() first.
  */
 void
 zfs_get_done(zgd_t *zgd, int error)
 {
 	znode_t *znode = zgd->zgd_private;
 	objset_t *objset = znode->z_zfsvfs->z_os;
 
 	if (zgd->zgd_db != NULL)
 		dmu_buf_rele(zgd->zgd_db, zgd);
 
 	zfs_range_unlock(zgd->zgd_rl);
 
 	/*
 	 * The txg is currently stopped from syncing, so the vnode
 	 * reference has to be dropped asynchronously.
 	 */
 	VN_RELE_ASYNC(ZTOV(znode),
 	    dsl_pool_vnrele_taskq(dmu_objset_pool(objset)));
 
 	if (error == 0 && zgd->zgd_bp != NULL)
 		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
 
 	kmem_free(zgd, sizeof (zgd_t));
 }
 
 #ifdef DEBUG
 /*
  * Debug-only fault injection switch.  When non-zero, the indirect-write
  * path of zfs_get_data() fails with EIO and resets this flag to zero.
  */
 static int zil_fault_io = 0;
 #endif
 
 /*
  * Get data to generate a TX_WRITE intent log record.
  *
  *	IN:	arg	- the zfsvfs_t the log record belongs to.
  *		lr	- write record; lr_foid, lr_offset and lr_length
  *			  name the object and the byte range wanted.
  *		buf	- destination for an immediate write, or NULL to
  *			  request an indirect write.
  *		zio	- zio passed on to dmu_sync() for indirect writes.
  *
  *	OUT:	buf	- filled in by dmu_read() (immediate write only).
  *		lr	- for indirect writes lr_blkptr may be overwritten
  *			  with the dbuf's block pointer, and lrc_txtype is
  *			  changed to TX_WRITE2 if dmu_sync() says EALREADY.
  *
  *	RETURN:	0 if success
  *		ENOENT if the object cannot be found, is unlinked, or the
  *		offset is at or past the current file size
  *		other error code from dmu_read(), dmu_buf_hold() or
  *		dmu_sync()
  *
  * On every path except a successful dmu_sync() the zgd is released here
  * through zfs_get_done(); on dmu_sync() success zfs_get_done() is left to
  * run as the dmu_sync() completion callback.
  */
 int
 zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
 {
 	zfsvfs_t *zfsvfs = arg;
 	objset_t *os = zfsvfs->z_os;
 	znode_t *zp;
 	uint64_t object = lr->lr_foid;
 	uint64_t offset = lr->lr_offset;
 	uint64_t size = lr->lr_length;
 	blkptr_t *bp = &lr->lr_blkptr;
 	dmu_buf_t *db;
 	zgd_t *zgd;
 	int error = 0;
 
 	ASSERT(zio != NULL);
 	ASSERT(size != 0);
 
 	/*
 	 * Nothing to do if the file has been removed
 	 */
 	if (zfs_zget(zfsvfs, object, &zp) != 0)
 		return (ENOENT);
 	if (zp->z_unlinked) {
 		/*
 		 * Release the vnode asynchronously as we currently have the
 		 * txg stopped from syncing.
 		 */
 		VN_RELE_ASYNC(ZTOV(zp),
 		    dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
 		return (ENOENT);
 	}
 
 	/* From here on the znode hold is owned by the zgd. */
 	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
 	zgd->zgd_zilog = zfsvfs->z_log;
 	zgd->zgd_private = zp;
 
 	/*
 	 * Write records come in two flavors: immediate and indirect.
 	 * For small writes it's cheaper to store the data with the
 	 * log record (immediate); for large writes it's cheaper to
 	 * sync the data and get a pointer to it (indirect) so that
 	 * we don't have to write the data twice.
 	 */
 	if (buf != NULL) { /* immediate write */
 		zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
 		/* test for truncation needs to be done while range locked */
 		if (offset >= zp->z_size) {
 			error = ENOENT;
 		} else {
 			error = dmu_read(os, object, offset, size, buf,
 			    DMU_READ_NO_PREFETCH);
 		}
 		ASSERT(error == 0 || error == ENOENT);
 	} else { /* indirect write */
 		/*
 		 * Have to lock the whole block to ensure when it's
 		 * written out and its checksum is being calculated
 		 * that no one can change the data. We need to re-check
 		 * blocksize after we get the lock in case it's changed!
 		 */
 		for (;;) {
 			uint64_t blkoff;
 			size = zp->z_blksz;
 			/*
 			 * Round offset down to the start of its block; for a
 			 * non-power-of-2 block size the lock starts at 0.
 			 */
 			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
 			offset -= blkoff;
 			zgd->zgd_rl = zfs_range_lock(zp, offset, size,
 			    RL_READER);
 			if (zp->z_blksz == size)
 				break;
 			/* block size changed: undo the rounding and retry */
 			offset += blkoff;
 			zfs_range_unlock(zgd->zgd_rl);
 		}
 		/* test for truncation needs to be done while range locked */
 		if (lr->lr_offset >= zp->z_size)
 			error = ENOENT;
 #ifdef DEBUG
 		/* one-shot fault injection, see zil_fault_io */
 		if (zil_fault_io) {
 			error = EIO;
 			zil_fault_io = 0;
 		}
 #endif
 		if (error == 0)
 			error = dmu_buf_hold(os, object, offset, zgd, &db,
 			    DMU_READ_NO_PREFETCH);
 
 		if (error == 0) {
 			blkptr_t *obp = dmu_buf_get_blkptr(db);
 			if (obp) {
 				ASSERT(BP_IS_HOLE(bp));
 				*bp = *obp;
 			}
 
 			zgd->zgd_db = db;
 			zgd->zgd_bp = bp;
 
 			ASSERT(db->db_offset == offset);
 			ASSERT(db->db_size == size);
 
 			error = dmu_sync(zio, lr->lr_common.lrc_txg,
 			    zfs_get_done, zgd);
 			ASSERT(error || lr->lr_length <= zp->z_blksz);
 
 			/*
 			 * On success, we need to wait for the write I/O
 			 * initiated by dmu_sync() to complete before we can
 			 * release this dbuf.  We will finish everything up
 			 * in the zfs_get_done() callback.
 			 */
 			if (error == 0)
 				return (0);
 
 			/*
 			 * EALREADY is not reported as a failure: the record
 			 * is retyped to TX_WRITE2 and 0 is returned.
 			 */
 			if (error == EALREADY) {
 				lr->lr_common.lrc_txtype = TX_WRITE2;
 				error = 0;
 			}
 		}
 	}
 
 	/* All remaining paths clean up the zgd synchronously. */
 	zfs_get_done(zgd, error);
 
 	return (error);
 }
 
 /*
  * Check whether the caller may access vp in the requested mode.
  *
  * When flag carries V_ACE_MASK the mode is passed to zfs_zaccess();
  * otherwise it is passed to zfs_zaccess_rwx().  Returns whatever the
  * selected checker returns.
  */
 /*ARGSUSED*/
 static int
 zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
     caller_context_t *ct)
 {
 	znode_t *znode = VTOZ(vp);
 	zfsvfs_t *zfsvfs = znode->z_zfsvfs;
 	int rc;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(znode);
 
 	rc = (flag & V_ACE_MASK) ?
 	    zfs_zaccess(znode, mode, flag, B_FALSE, cr) :
 	    zfs_zaccess_rwx(znode, mode, flag, cr);
 
 	ZFS_EXIT(zfsvfs);
 	return (rc);
 }
 
 /*
  * If *vpp is a device vnode, replace it with the vnode that specvp()
  * returns for it.  The hold on the original vnode is released either
  * way.  Returns ENOSYS (and leaves *vpp NULL) when specvp() yields no
  * vnode, otherwise 0.  Non-device vnodes are left untouched.
  */
 static int
 specvp_check(vnode_t **vpp, cred_t *cr)
 {
 	struct vnode *spec;
 
 	if (!IS_DEVVP(*vpp))
 		return (0);
 
 	spec = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
 	VN_RELE(*vpp);
 	*vpp = spec;
 
 	return ((spec == NULL) ? ENOSYS : 0);
 }
 
 
 /*
  * Lookup an entry in a directory, or an extended attribute directory.
  * If it exists, return a held vnode reference for it.
  *
  *	IN:	dvp	- vnode of directory to search.
  *		nm	- name of entry to lookup.
  *		cnp	- component name state; cn_flags is read (ISLASTCN,
  *			  ISDOTDOT, MAKEENTRY) and may gain SAVENAME, and
  *			  cn_lkflags is used to lock the result.
  *		nameiop	- operation the lookup is for (CREATE, RENAME and
  *			  DELETE get special treatment on the last component).
  *		cr	- credentials of caller.
  *		td	- thread pointer [UNUSED].
  *		flags	- LOOKUP_XATTR set if looking for an attribute;
  *			  FIGNORECASE disables the fast path.
  *
  *	OUT:	vpp	- vnode of located entry, NULL if not found.
  *
  *	RETURN:	0 if success
  *		EJUSTRETURN if the last component does not exist and
  *		nameiop is CREATE or RENAME
  *		error code if failure
  *
  * direntflags and realpnp are not parameters here; they are locals that
  * are always passed to zfs_dirlook() as NULL.
  *
  * Timestamps:
  *	NA
  */
 /* ARGSUSED */
 static int
 zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
     int nameiop, cred_t *cr, kthread_t *td, int flags)
 {
 	znode_t *zdp = VTOZ(dvp);
 	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
 	int	error = 0;
 	int *direntflags = NULL;
 	void *realpnp = NULL;
 
 	/*
 	 * fast path: taken before ZFS_ENTER() for plain lookups, i.e.
 	 * neither an xattr lookup nor a case-insensitive one.
 	 */
 	if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
 
 		if (dvp->v_type != VDIR) {
 			return (ENOTDIR);
 		} else if (zdp->z_sa_hdl == NULL) {
 			return (EIO);
 		}
 
 		if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
 			/* "" and "." both resolve to the directory itself */
 			error = zfs_fastaccesschk_execute(zdp, cr);
 			if (!error) {
 				*vpp = dvp;
 				VN_HOLD(*vpp);
 				return (0);
 			}
 			return (error);
 		} else {
 			vnode_t *tvp = dnlc_lookup(dvp, nm);
 
 			if (tvp) {
 				error = zfs_fastaccesschk_execute(zdp, cr);
 				if (error) {
 					VN_RELE(tvp);
 					return (error);
 				}
 				/* DNLC_NO_VNODE is a cached negative entry */
 				if (tvp == DNLC_NO_VNODE) {
 					VN_RELE(tvp);
 					return (ENOENT);
 				} else {
 					*vpp = tvp;
 					return (specvp_check(vpp, cr));
 				}
 			}
 		}
 	}
 
 	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zdp);
 
 	*vpp = NULL;
 
 	if (flags & LOOKUP_XATTR) {
 #ifdef TODO
 		/*
 		 * If the xattr property is off, refuse the lookup request.
 		 */
 		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
 			ZFS_EXIT(zfsvfs);
 			return (EINVAL);
 		}
 #endif
 
 		/*
 		 * We don't allow recursive attributes..
 		 * Maybe someday we will.
 		 */
 		if (zdp->z_pflags & ZFS_XATTR) {
 			ZFS_EXIT(zfsvfs);
 			return (EINVAL);
 		}
 
 		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 
 		/*
 		 * Do we have permission to get into attribute directory?
 		 */
 
 		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
 		    B_FALSE, cr)) {
 			VN_RELE(*vpp);
 			*vpp = NULL;
 		}
 
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	if (dvp->v_type != VDIR) {
 		ZFS_EXIT(zfsvfs);
 		return (ENOTDIR);
 	}
 
 	/*
 	 * Check accessibility of directory.
 	 */
 
 	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/* On utf8-only file systems reject names that fail validation. */
 	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
 	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		ZFS_EXIT(zfsvfs);
 		return (EILSEQ);
 	}
 
 	error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
 	if (error == 0)
 		error = specvp_check(vpp, cr);
 
 	/* Translate errors and add SAVENAME when needed. */
 	if (cnp->cn_flags & ISLASTCN) {
 		switch (nameiop) {
 		case CREATE:
 		case RENAME:
 			if (error == ENOENT) {
 				error = EJUSTRETURN;
 				cnp->cn_flags |= SAVENAME;
 				break;
 			}
 			/* FALLTHROUGH */
 		case DELETE:
 			if (error == 0)
 				cnp->cn_flags |= SAVENAME;
 			break;
 		}
 	}
 	/*
 	 * Lock the vnode being returned, except for "." where the result
 	 * is dvp itself.
 	 */
 	if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) {
 		int ltype = 0;
 
 		/*
 		 * For ".." the directory lock is dropped while the parent is
 		 * locked and re-taken afterwards with its previous lock type
 		 * (presumably to avoid a lock-order reversal -- confirm).
 		 */
 		if (cnp->cn_flags & ISDOTDOT) {
 			ltype = VOP_ISLOCKED(dvp);
 			VOP_UNLOCK(dvp, 0);
 		}
 		ZFS_EXIT(zfsvfs);
 		error = zfs_vnode_lock(*vpp, cnp->cn_lkflags);
 		if (cnp->cn_flags & ISDOTDOT)
 			vn_lock(dvp, ltype | LK_RETRY);
 		if (error != 0) {
 			VN_RELE(*vpp);
 			*vpp = NULL;
 			return (error);
 		}
 	} else {
 		ZFS_EXIT(zfsvfs);
 	}
 
 #ifdef FREEBSD_NAMECACHE
 	/*
 	 * Insert name into cache (as non-existent) if appropriate.
 	 */
 	if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) && nameiop != CREATE)
 		cache_enter(dvp, *vpp, cnp);
 	/*
 	 * Insert name into cache if appropriate.
 	 */
 	if (error == 0 && (cnp->cn_flags & MAKEENTRY)) {
 		if (!(cnp->cn_flags & ISLASTCN) ||
 		    (nameiop != DELETE && nameiop != RENAME)) {
 			cache_enter(dvp, *vpp, cnp);
 		}
 	}
 #endif
 
 	return (error);
 }
 
 /*
  * Attempt to create a new entry in a directory.  If the entry
  * already exists, truncate the file if permissible, else return
  * an error.  Return the vp of the created or trunc'd file.
  *
  *	IN:	dvp	- vnode of directory to put new file entry in.
  *		name	- name of new file entry.
  *		vap	- attributes of new file.
  *		excl	- flag indicating exclusive or non-exclusive mode.
  *		mode	- mode to open file with.
  *		cr	- credentials of caller.
  *		td	- thread pointer [UNUSED].
  *
  *	OUT:	vpp	- vnode of created or trunc'd entry.
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * The open flags (flag) and the ACL to be set (vsecp) are not parameters
  * of this function; they are locals fixed at 0 and NULL respectively.
  *
  * Timestamps:
  *	dvp - ctime|mtime updated if new entry created
  *	 vp - ctime|mtime always, atime if new
  */
 
 /* ARGSUSED */
 static int
 zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
     vnode_t **vpp, cred_t *cr, kthread_t *td)
 {
 	znode_t		*zp, *dzp = VTOZ(dvp);
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog;
 	objset_t	*os;
 	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	int		error;
 	ksid_t		*ksid;
 	uid_t		uid;
 	gid_t		gid = crgetgid(cr);
 	zfs_acl_ids_t   acl_ids;
 	boolean_t	fuid_dirtied;
 	/* B_TRUE while acl_ids holds state that must be freed */
 	boolean_t	have_acl = B_FALSE;
 	void		*vsecp = NULL;
 	int		flag = 0;
 
 	/*
 	 * If we have an ephemeral id, ACL, or XVATTR then
 	 * make sure file system is at proper version
 	 */
 
 	ksid = crgetsid(cr, KSID_OWNER);
 	if (ksid)
 		uid = ksid_getid(ksid);
 	else
 		uid = crgetuid(cr);
 
 	if (zfsvfs->z_use_fuids == B_FALSE &&
 	    (vsecp || (vap->va_mask & AT_XVATTR) ||
 	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
 		return (EINVAL);
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(dzp);
 	os = zfsvfs->z_os;
 	zilog = zfsvfs->z_log;
 
 	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
 	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		ZFS_EXIT(zfsvfs);
 		return (EILSEQ);
 	}
 
 	if (vap->va_mask & AT_XVATTR) {
 		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
 		    crgetuid(cr), cr, vap->va_type)) != 0) {
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 	}
 	/*
 	 * Retry point: re-entered when dmu_tx_assign() returns ERESTART.
 	 * acl_ids (tracked by have_acl) is kept across retries.
 	 */
 top:
 	*vpp = NULL;
 
 	if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
 		vap->va_mode &= ~S_ISVTX;
 
 	if (*name == '\0') {
 		/*
 		 * Null component name refers to the directory itself.
 		 */
 		VN_HOLD(dvp);
 		zp = dzp;
 		dl = NULL;
 		error = 0;
 	} else {
 		/* possible VN_HOLD(zp) */
 		int zflg = 0;
 
 		if (flag & FIGNORECASE)
 			zflg |= ZCILOOK;
 
 		error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
 		    NULL, NULL);
 		if (error) {
 			if (have_acl)
 				zfs_acl_ids_free(&acl_ids);
 			if (strcmp(name, "..") == 0)
 				error = EISDIR;
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 	}
 
 	if (zp == NULL) {
 		uint64_t txtype;
 
 		/*
 		 * Create a new file object and update the directory
 		 * to reference it.
 		 */
 		if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
 			if (have_acl)
 				zfs_acl_ids_free(&acl_ids);
 			goto out;
 		}
 
 		/*
 		 * We only support the creation of regular files in
 		 * extended attribute directories.
 		 */
 
 		if ((dzp->z_pflags & ZFS_XATTR) &&
 		    (vap->va_type != VREG)) {
 			if (have_acl)
 				zfs_acl_ids_free(&acl_ids);
 			error = EINVAL;
 			goto out;
 		}
 
 		/* Build the ACL ids only once, even across retries. */
 		if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
 		    cr, vsecp, &acl_ids)) != 0)
 			goto out;
 		have_acl = B_TRUE;
 
 		if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
 			zfs_acl_ids_free(&acl_ids);
 			error = EDQUOT;
 			goto out;
 		}
 
 		tx = dmu_tx_create(os);
 
 		dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 		    ZFS_SA_BASE_ATTR_SIZE);
 
 		fuid_dirtied = zfsvfs->z_fuid_dirty;
 		if (fuid_dirtied)
 			zfs_fuid_txhold(zfsvfs, tx);
 		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
 		dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
 		if (!zfsvfs->z_use_sa &&
 		    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 			    0, acl_ids.z_aclp->z_acl_bytes);
 		}
 		error = dmu_tx_assign(tx, TXG_NOWAIT);
 		if (error) {
 			zfs_dirent_unlock(dl);
 			if (error == ERESTART) {
 				/* wait for the txg, then start over */
 				dmu_tx_wait(tx);
 				dmu_tx_abort(tx);
 				goto top;
 			}
 			zfs_acl_ids_free(&acl_ids);
 			dmu_tx_abort(tx);
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 		zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
 
 		if (fuid_dirtied)
 			zfs_fuid_sync(zfsvfs, tx);
 
 		(void) zfs_link_create(dl, zp, tx, ZNEW);
 		txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
 		if (flag & FIGNORECASE)
 			txtype |= TX_CI;
 		zfs_log_create(zilog, tx, txtype, dzp, zp, name,
 		    vsecp, acl_ids.z_fuidp, vap);
 		zfs_acl_ids_free(&acl_ids);
 		dmu_tx_commit(tx);
 	} else {
 		int aflags = (flag & FAPPEND) ? V_APPEND : 0;
 
 		/* The entry exists, so any ACL ids built earlier are unused. */
 		if (have_acl)
 			zfs_acl_ids_free(&acl_ids);
 		have_acl = B_FALSE;
 
 		/*
 		 * A directory entry already exists for this name.
 		 */
 		/*
 		 * Can't truncate an existing file if in exclusive mode.
 		 */
 		if (excl == EXCL) {
 			error = EEXIST;
 			goto out;
 		}
 		/*
 		 * Can't open a directory for writing.
 		 */
 		if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
 			error = EISDIR;
 			goto out;
 		}
 		/*
 		 * Verify requested access to file.
 		 */
 		if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
 			goto out;
 		}
 
 		mutex_enter(&dzp->z_lock);
 		dzp->z_seq++;
 		mutex_exit(&dzp->z_lock);
 
 		/*
 		 * Truncate regular files if requested.
 		 */
 		if ((ZTOV(zp)->v_type == VREG) &&
 		    (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
 			/* we can't hold any locks when calling zfs_freesp() */
 			zfs_dirent_unlock(dl);
 			dl = NULL;
 			error = zfs_freesp(zp, 0, 0, mode, TRUE);
 			if (error == 0) {
 				/*
 				 * NOTE(review): `ct` is neither a parameter
 				 * nor a local of this function; presumably
 				 * vnevent_create() discards its arguments on
 				 * this platform -- confirm.
 				 */
 				vnevent_create(ZTOV(zp), ct);
 			}
 		}
 	}
 out:
 	if (dl)
 		zfs_dirent_unlock(dl);
 
 	if (error) {
 		if (zp)
 			VN_RELE(ZTOV(zp));
 	} else {
 		*vpp = ZTOV(zp);
 		error = specvp_check(vpp, cr);
 	}
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Remove an entry from a directory.
  *
  *	IN:	dvp	- vnode of directory to remove entry from.
  *		name	- name of entry to remove.
  *		cr	- credentials of caller.
  *		ct	- caller context
  *		flags	- case flags
  *
  *	RETURN:	0 if success
  *		EPERM if the entry is a directory (use rmdir instead)
  *		error code if failure, including a failure to load the
  *		entry's extended attribute directory
  *
  * Timestamps:
  *	dvp - ctime|mtime
  *	 vp - ctime (if nlink > 0)
  */
 
 /*
  * Zero value written over the SA_ZPL_XATTR attribute by zfs_remove() when
  * the znode is not in SA format and its xattr directory is being unlinked.
  */
 uint64_t null_xattr = 0;
 
 /*ARGSUSED*/
 static int
 zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
     int flags)
 {
 	znode_t		*zp, *dzp = VTOZ(dvp);
 	znode_t		*xzp;
 	vnode_t		*vp;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog;
 	uint64_t	acl_obj, xattr_obj;
 	uint64_t 	xattr_obj_unlinked = 0;
 	uint64_t	obj = 0;
 	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	boolean_t	may_delete_now, delete_now = FALSE;
 	boolean_t	unlinked, toobig = FALSE;
 	uint64_t	txtype;
 	pathname_t	*realnmp = NULL;
 	pathname_t	realnm;
 	int		error;
 	int		zflg = ZEXISTS;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(dzp);
 	zilog = zfsvfs->z_log;
 
 	if (flags & FIGNORECASE) {
 		zflg |= ZCILOOK;
 		pn_alloc(&realnm);
 		realnmp = &realnm;
 	}
 
 	/* Retry point: re-entered when dmu_tx_assign() returns ERESTART. */
 top:
 	xattr_obj = 0;
 	xzp = NULL;
 	/*
 	 * Attempt to lock directory; fail if entry doesn't exist.
 	 */
 	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
 	    NULL, realnmp)) {
 		if (realnmp)
 			pn_free(realnmp);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	vp = ZTOV(zp);
 
 	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
 		goto out;
 	}
 
 	/*
 	 * Need to use rmdir for removing directories.
 	 */
 	if (vp->v_type == VDIR) {
 		error = EPERM;
 		goto out;
 	}
 
 	vnevent_remove(vp, dvp, name, ct);
 
 	if (realnmp)
 		dnlc_remove(dvp, realnmp->pn_buf);
 	else
 		dnlc_remove(dvp, name);
 
 	VI_LOCK(vp);
 	may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp);
 	VI_UNLOCK(vp);
 
 	/*
 	 * We may delete the znode now, or we may put it in the unlinked set;
 	 * it depends on whether we're the last link, and on whether there are
 	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
 	 * allow for either case.
 	 */
 	obj = zp->z_id;
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	zfs_sa_upgrade_txholds(tx, zp);
 	zfs_sa_upgrade_txholds(tx, dzp);
 	if (may_delete_now) {
 		toobig =
 		    zp->z_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
 		/* if the file is too big, only hold_free a token amount */
 		dmu_tx_hold_free(tx, zp->z_id, 0,
 		    (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
 	}
 
 	/* are there any extended attributes? */
 	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
 	    &xattr_obj, sizeof (xattr_obj));
 	if (error == 0 && xattr_obj) {
 		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
 		ASSERT0(error);
 		if (error != 0) {
 			/*
 			 * The assertion above is not a substitute for error
 			 * handling: where it is compiled out, falling through
 			 * would dereference xzp without a valid znode.  The
 			 * tx has not been assigned yet, so abort it and leave
 			 * through the common cleanup with the error.
 			 */
 			xzp = NULL;
 			dmu_tx_abort(tx);
 			goto out;
 		}
 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
 		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
 	}
 
 	mutex_enter(&zp->z_lock);
 	if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
 		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
 	mutex_exit(&zp->z_lock);
 
 	/* charge as an update -- would be nice not to charge at all */
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 
 	error = dmu_tx_assign(tx, TXG_NOWAIT);
 	if (error) {
 		zfs_dirent_unlock(dl);
 		VN_RELE(vp);
 		if (xzp)
 			VN_RELE(ZTOV(xzp));
 		if (error == ERESTART) {
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
 		}
 		if (realnmp)
 			pn_free(realnmp);
 		dmu_tx_abort(tx);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/*
 	 * Remove the directory entry.
 	 */
 	error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
 
 	if (error) {
 		dmu_tx_commit(tx);
 		goto out;
 	}
 
 	if (unlinked) {
 
 		/*
 		 * Hold z_lock so that we can make sure that the ACL obj
 		 * hasn't changed.  Could have been deleted due to
 		 * zfs_sa_upgrade().
 		 *
 		 * z_lock stays held past this block; it is dropped in
 		 * whichever of the two branches below runs.
 		 */
 		mutex_enter(&zp->z_lock);
 		VI_LOCK(vp);
 		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
 		    &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
 		delete_now = may_delete_now && !toobig &&
 		    vp->v_count == 1 && !vn_has_cached_data(vp) &&
 		    xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) ==
 		    acl_obj;
 		VI_UNLOCK(vp);
 	}
 
 	if (delete_now) {
 #ifdef __FreeBSD__
 		panic("zfs_remove: delete_now branch taken");
 #endif
 		if (xattr_obj_unlinked) {
 			ASSERT3U(xzp->z_links, ==, 2);
 			mutex_enter(&xzp->z_lock);
 			xzp->z_unlinked = 1;
 			xzp->z_links = 0;
 			error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
 			    &xzp->z_links, sizeof (xzp->z_links), tx);
 			ASSERT3U(error,  ==,  0);
 			mutex_exit(&xzp->z_lock);
 			zfs_unlinked_add(xzp, tx);
 
 			if (zp->z_is_sa)
 				error = sa_remove(zp->z_sa_hdl,
 				    SA_ZPL_XATTR(zfsvfs), tx);
 			else
 				error = sa_update(zp->z_sa_hdl,
 				    SA_ZPL_XATTR(zfsvfs), &null_xattr,
 				    sizeof (uint64_t), tx);
 			ASSERT0(error);
 		}
 		VI_LOCK(vp);
 		vp->v_count--;
 		ASSERT0(vp->v_count);
 		VI_UNLOCK(vp);
 		mutex_exit(&zp->z_lock);
 		zfs_znode_delete(zp, tx);
 	} else if (unlinked) {
 		mutex_exit(&zp->z_lock);
 		zfs_unlinked_add(zp, tx);
 #ifdef __FreeBSD__
 		vp->v_vflag |= VV_NOSYNC;
 #endif
 	}
 
 	txtype = TX_REMOVE;
 	if (flags & FIGNORECASE)
 		txtype |= TX_CI;
 	zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
 
 	dmu_tx_commit(tx);
 	/*
 	 * Common exit: the directory-entry lock and the hold on vp taken by
 	 * zfs_dirent_lock() are still owned here.  The vnode hold is skipped
 	 * only when the delete_now branch already consumed it.
 	 */
 out:
 	if (realnmp)
 		pn_free(realnmp);
 
 	zfs_dirent_unlock(dl);
 
 	if (!delete_now)
 		VN_RELE(vp);
 	if (xzp)
 		VN_RELE(ZTOV(xzp));
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Make a new directory called dirname inside dvp and hand its vnode
  * back to the caller.
  *
  *	IN:	dvp	- vnode of the directory that receives the subdir.
  *		dirname	- name of the directory to create.
  *		vap	- attributes of the new directory (va_type is VDIR).
  *		cr	- credentials of caller.
  *		ct	- caller context [UNUSED].
  *		flags	- case flags (FIGNORECASE).
  *		vsecp	- ACL to be set.
  *
  *	OUT:	vpp	- vnode of the created directory.
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * Timestamps:
  *	dvp - ctime|mtime updated
  *	 vp - ctime|mtime|atime updated
  */
 /*ARGSUSED*/
 static int
 zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
     caller_context_t *ct, int flags, vsecattr_t *vsecp)
 {
 	znode_t		*dzp = VTOZ(dvp);
 	znode_t		*newzp;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	gid_t		gid = crgetgid(cr);
 	uid_t		uid;
 	ksid_t		*owner_sid;
 	zilog_t		*zilog;
 	zfs_dirlock_t	*dirlock;
 	zfs_acl_ids_t	acl_ids;
 	dmu_tx_t	*tx;
 	uint64_t	logtype;
 	boolean_t	fuids_dirty;
 	int		lockflags = ZNEW;
 	int		error;
 
 	ASSERT(vap->va_type == VDIR);
 
 	/*
 	 * An ephemeral id, an ACL or an XVATTR is only acceptable when
 	 * the file system uses FUIDs.
 	 */
 	owner_sid = crgetsid(cr, KSID_OWNER);
 	uid = (owner_sid != NULL) ? ksid_getid(owner_sid) : crgetuid(cr);
 
 	if (!zfsvfs->z_use_fuids &&
 	    (vsecp != NULL || (vap->va_mask & AT_XVATTR) != 0 ||
 	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
 		return (EINVAL);
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(dzp);
 	zilog = zfsvfs->z_log;
 
 	/* No subdirectories inside an extended attribute directory. */
 	if ((dzp->z_pflags & ZFS_XATTR) != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (EINVAL);
 	}
 
 	if (zfsvfs->z_utf8 && u8_validate(dirname,
 	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		ZFS_EXIT(zfsvfs);
 		return (EILSEQ);
 	}
 
 	if ((flags & FIGNORECASE) != 0)
 		lockflags |= ZCILOOK;
 
 	if ((vap->va_mask & AT_XVATTR) != 0 &&
 	    (error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
 	    crgetuid(cr), cr, vap->va_type)) != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, &acl_ids);
 	if (error != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/*
 	 * The name is looked up (and must not exist) before the access
 	 * check, so that the caller sees EEXIST rather than EACCES;
 	 * some applications depend on that.
 	 */
 retry:
 	*vpp = NULL;
 
 	error = zfs_dirent_lock(&dirlock, dzp, dirname, &newzp, lockflags,
 	    NULL, NULL);
 	if (error != 0) {
 		zfs_acl_ids_free(&acl_ids);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/* Permission to add a subdirectory, then quota. */
 	error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr);
 	if (error == 0 && zfs_acl_ids_overquota(zfsvfs, &acl_ids))
 		error = EDQUOT;
 	if (error != 0) {
 		zfs_acl_ids_free(&acl_ids);
 		zfs_dirent_unlock(dirlock);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/*
 	 * Set up the transaction that adds the new entry.
 	 */
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
 	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
 	fuids_dirty = zfsvfs->z_fuid_dirty;
 	if (fuids_dirty)
 		zfs_fuid_txhold(zfsvfs, tx);
 	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
 		    acl_ids.z_aclp->z_acl_bytes);
 	}
 
 	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 	    ZFS_SA_BASE_ATTR_SIZE);
 
 	error = dmu_tx_assign(tx, TXG_NOWAIT);
 	if (error != 0) {
 		zfs_dirent_unlock(dirlock);
 		if (error != ERESTART) {
 			zfs_acl_ids_free(&acl_ids);
 			dmu_tx_abort(tx);
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 		/* ERESTART: wait, then redo the lookup and the checks */
 		dmu_tx_wait(tx);
 		dmu_tx_abort(tx);
 		goto retry;
 	}
 
 	/*
 	 * Allocate the znode for the new directory.
 	 */
 	zfs_mknode(dzp, vap, tx, cr, 0, &newzp, &acl_ids);
 
 	if (fuids_dirty)
 		zfs_fuid_sync(zfsvfs, tx);
 
 	/*
 	 * Enter the new name into the parent directory.
 	 */
 	(void) zfs_link_create(dirlock, newzp, tx, ZNEW);
 
 	*vpp = ZTOV(newzp);
 
 	logtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
 	if ((flags & FIGNORECASE) != 0)
 		logtype |= TX_CI;
 	zfs_log_create(zilog, tx, logtype, dzp, newzp, dirname, vsecp,
 	    acl_ids.z_fuidp, vap);
 
 	zfs_acl_ids_free(&acl_ids);
 
 	dmu_tx_commit(tx);
 
 	zfs_dirent_unlock(dirlock);
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /*
  * Remove a directory subdir entry.  If the current working
  * directory is the same as the subdir to be removed, the
  * remove will fail.
  *
  *	IN:	dvp	- vnode of directory to remove from.
  *		name	- name of directory to be removed.
  *		cwd	- vnode of current working directory.
  *		cr	- credentials of caller.
  *		ct	- caller context
  *		flags	- case flags
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * Timestamps:
  *	dvp - ctime|mtime updated
  */
 /*ARGSUSED*/
 static int
 zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
     caller_context_t *ct, int flags)
 {
 	/*
 	 * Overview: lock the entry "name" in dvp, check that it may be
 	 * removed (delete access, is a directory, is not cwd), then unlink
 	 * it inside a DMU transaction and log the removal.
 	 */
 	znode_t		*dzp = VTOZ(dvp);
 	znode_t		*zp;
 	vnode_t		*vp;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog;
 	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	int		error;
 	int		zflg = ZEXISTS;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(dzp);
 	zilog = zfsvfs->z_log;
 
 	/* FIGNORECASE from the caller is passed on as the ZCILOOK flag. */
 	if (flags & FIGNORECASE)
 		zflg |= ZCILOOK;
 	/*
 	 * Retry point: the ERESTART branch below jumps back here after it
 	 * has released everything acquired since this label.
 	 */
 top:
 	zp = NULL;
 
 	/*
 	 * Attempt to lock directory; fail if entry doesn't exist.
 	 */
 	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
 	    NULL, NULL)) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	vp = ZTOV(zp);
 
 	/*
 	 * Validation failures go to "out", which drops the dirent lock and
 	 * the vnode reference; no rwlock or transaction is held yet.
 	 */
 	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
 		goto out;
 	}
 
 	/* Only directories may be removed through this entry point. */
 	if (vp->v_type != VDIR) {
 		error = ENOTDIR;
 		goto out;
 	}
 
 	/* Refuse to remove the caller's current working directory. */
 	if (vp == cwd) {
 		error = EINVAL;
 		goto out;
 	}
 
 	vnevent_rmdir(vp, dvp, name, ct);
 
 	/*
 	 * Grab a lock on the directory to make sure that no one is
 	 * trying to add (or lookup) entries while we are removing it.
 	 */
 	rw_enter(&zp->z_name_lock, RW_WRITER);
 
 	/*
 	 * Grab a lock on the parent pointer to make sure we play well
 	 * with the treewalk and directory rename code.
 	 */
 	rw_enter(&zp->z_parent_lock, RW_WRITER);
 
 	/*
 	 * Declare what the transaction will touch: the parent's ZAP entry
 	 * for "name", the victim's SA, and the unlinked-set ZAP object.
 	 */
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 	zfs_sa_upgrade_txholds(tx, zp);
 	zfs_sa_upgrade_txholds(tx, dzp);
 	error = dmu_tx_assign(tx, TXG_NOWAIT);
 	if (error) {
 		/*
 		 * Release both rwlocks, the dirent lock and the vnode
 		 * reference before either retrying or failing.
 		 */
 		rw_exit(&zp->z_parent_lock);
 		rw_exit(&zp->z_name_lock);
 		zfs_dirent_unlock(dl);
 		VN_RELE(vp);
 		if (error == ERESTART) {
 			/* Wait on the tx, discard it, and start over. */
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
 		}
 		dmu_tx_abort(tx);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 #ifdef FREEBSD_NAMECACHE
 	cache_purge(dvp);
 #endif
 
 	error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
 
 	/* Log the removal only if the link was actually destroyed. */
 	if (error == 0) {
 		uint64_t txtype = TX_RMDIR;
 		if (flags & FIGNORECASE)
 			txtype |= TX_CI;
 		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
 	}
 
 	/* The tx is committed whether or not zfs_link_destroy() succeeded. */
 	dmu_tx_commit(tx);
 
 	rw_exit(&zp->z_parent_lock);
 	rw_exit(&zp->z_name_lock);
 #ifdef FREEBSD_NAMECACHE
 	cache_purge(vp);
 #endif
 	/*
 	 * Common exit, reached by fall-through and by the validation
 	 * failures above.
 	 */
 out:
 	zfs_dirent_unlock(dl);
 
 	VN_RELE(vp);
 
 	/* Note: this also runs when we arrive here with error != 0. */
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Read as many directory entries as will fit into the provided
  * buffer from the given directory cursor position (specified in
  * the uio structure).
  *
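  *	IN:	vp	- vnode of directory to read.
  *		uio	- structure supplying read location, range info,
  *			  and return buffer.
  *		cr	- credentials of caller.
  *
  *	OUT:	uio	- updated offset and range, buffer filled.
  *		eofp	- set to true if end-of-file detected (may be NULL).
  *		ncookies - if non-NULL, set to the number of cookies returned.
  *		cookies	- if ncookies is non-NULL, set to a malloc'd (M_TEMP)
  *			  array holding the offset that follows each entry.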
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * Timestamps:
  *	vp - atime updated
  *
  * Note that the low 4 bits of the cookie returned by zap is always zero.
  * This allows us to use the low range for "special" directory entries:
  * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
  * we use the offset 2 for the '.zfs' directory.
  */
 /* ARGSUSED */
 static int
 zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies)
 {
 	/*
 	 * Overview: emit ".", ".." and (when zfs_show_ctldir() allows it)
 	 * the control directory as synthetic entries at offsets 0, 1 and 2,
 	 * then walk the directory's ZAP object with a cursor, converting
 	 * each entry to a dirent until the caller's buffer is full.
 	 */
 	znode_t		*zp = VTOZ(vp);
 	iovec_t		*iovp;
 	edirent_t	*eodp;
 	dirent64_t	*odp;
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	objset_t	*os;
 	caddr_t		outbuf;
 	size_t		bufsize;
 	zap_cursor_t	zc;
 	zap_attribute_t	zap;
 	uint_t		bytes_wanted;
 	uint64_t	offset; /* must be unsigned; checks for < 1 */
 	uint64_t	parent;
 	int		local_eof;
 	int		outcount;
 	int		error;
 	uint8_t		prefetch;
 	boolean_t	check_sysattrs;
 	uint8_t		type;
 	int		ncooks;
 	u_long		*cooks = NULL;
 	/*
 	 * flags is never modified below, so every (flags & V_RDDIR_*)
 	 * test in this function evaluates false.
 	 */
 	int		flags = 0;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	/* The parent object number is needed for the ".." entry. */
 	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
 	    &parent, sizeof (parent))) != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/*
 	 * If we are not given an eof variable,
 	 * use a local one.
 	 */
 	if (eofp == NULL)
 		eofp = &local_eof;
 
 	/*
 	 * Check for valid iov_len.
 	 */
 	if (uio->uio_iov->iov_len <= 0) {
 		ZFS_EXIT(zfsvfs);
 		return (EINVAL);
 	}
 
 	/*
 	 * Quit if directory has been removed (posix)
 	 */
 	if ((*eofp = zp->z_unlinked) != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (0);
 	}
 
 	error = 0;
 	os = zfsvfs->z_os;
 	offset = uio->uio_loffset;
 	prefetch = zp->z_zn_prefetch;
 
 	/*
 	 * Initialize the iterator cursor.
 	 */
 	if (offset <= 3) {
 		/*
 		 * Start iteration from the beginning of the directory.
 		 */
 		zap_cursor_init(&zc, os, zp->z_id);
 	} else {
 		/*
 		 * The offset is a serialized cursor.
 		 */
 		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
 	}
 
 	/*
 	 * Get space to change directory entries into fs independent format.
 	 * Entries are staged in a temporary buffer unless the destination
 	 * is a single system-space iovec, in which case they are written
 	 * directly into iov_base.
 	 */
 	iovp = uio->uio_iov;
 	bytes_wanted = iovp->iov_len;
 	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
 		bufsize = bytes_wanted;
 		outbuf = kmem_alloc(bufsize, KM_SLEEP);
 		odp = (struct dirent64 *)outbuf;
 	} else {
 		bufsize = bytes_wanted;
 		/*
 		 * No staging buffer here; outbuf is only read (uiomove,
 		 * kmem_free) on the other path.
 		 */
+		outbuf = NULL;
 		odp = (struct dirent64 *)iovp->iov_base;
 	}
 	eodp = (struct edirent *)odp;
 
 	if (ncookies != NULL) {
 		/*
 		 * Minimum entry size is dirent size and 1 byte for a file name.
 		 * Dividing uio_resid by that gives an upper bound on the
 		 * number of entries, which sizes the cookie array.
 		 */
 		ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
 		cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
 		*cookies = cooks;
 		*ncookies = ncooks;
 	}
 	/*
 	 * If this VFS supports the system attribute view interface; and
 	 * we're looking at an extended attribute directory; and we care
 	 * about normalization conflicts on this vfs; then we must check
 	 * for normalization conflicts with the sysattr name space.
 	 */
 #ifdef TODO
 	check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
 	    (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
 	    (flags & V_RDDIR_ENTFLAGS);
 #else
 	check_sysattrs = 0;
 #endif
 
 	/*
 	 * Transform to file-system independent format
 	 */
 	outcount = 0;
 	while (outcount < bytes_wanted) {
 		ino64_t objnum;
 		ushort_t reclen;
 		off64_t *next = NULL;
 
 		/*
 		 * Special case `.', `..', and `.zfs'.
 		 */
 		if (offset == 0) {
 			(void) strcpy(zap.za_name, ".");
 			zap.za_normalization_conflict = 0;
 			objnum = zp->z_id;
 			type = DT_DIR;
 		} else if (offset == 1) {
 			(void) strcpy(zap.za_name, "..");
 			zap.za_normalization_conflict = 0;
 			objnum = parent;
 			type = DT_DIR;
 		} else if (offset == 2 && zfs_show_ctldir(zp)) {
 			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
 			zap.za_normalization_conflict = 0;
 			objnum = ZFSCTL_INO_ROOT;
 			type = DT_DIR;
 		} else {
 			/*
 			 * Grab next entry.  ENOENT from the cursor means the
 			 * end of the directory: set *eofp and leave the loop
 			 * normally.  Any other error skips the copy-out.
 			 */
 			if (error = zap_cursor_retrieve(&zc, &zap)) {
 				if ((*eofp = (error == ENOENT)) != 0)
 					break;
 				else
 					goto update;
 			}
 
 			/*
 			 * A directory entry must be a single 8-byte integer;
 			 * anything else is reported and fails with ENXIO.
 			 */
 			if (zap.za_integer_length != 8 ||
 			    zap.za_num_integers != 1) {
 				cmn_err(CE_WARN, "zap_readdir: bad directory "
 				    "entry, obj = %lld, offset = %lld\n",
 				    (u_longlong_t)zp->z_id,
 				    (u_longlong_t)offset);
 				error = ENXIO;
 				goto update;
 			}
 
 			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
 			/*
 			 * MacOS X can extract the object type here such as:
 			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
 			 */
 			type = ZFS_DIRENT_TYPE(zap.za_first_integer);
 
 			if (check_sysattrs && !zap.za_normalization_conflict) {
 #ifdef TODO
 				zap.za_normalization_conflict =
 				    xattr_sysattr_casechk(zap.za_name);
 #else
 				panic("%s:%u: TODO", __func__, __LINE__);
 #endif
 			}
 		}
 
 		if (flags & V_RDDIR_ACCFILTER) {
 			/*
 			 * If we have no access at all, don't include
 			 * this entry in the returned information
 			 */
 			znode_t	*ezp;
 			if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
 				goto skip_entry;
 			if (!zfs_has_access(ezp, cr)) {
 				VN_RELE(ZTOV(ezp));
 				goto skip_entry;
 			}
 			VN_RELE(ZTOV(ezp));
 		}
 
 		if (flags & V_RDDIR_ENTFLAGS)
 			reclen = EDIRENT_RECLEN(strlen(zap.za_name));
 		else
 			reclen = DIRENT64_RECLEN(strlen(zap.za_name));
 
 		/*
 		 * Will this entry fit in the buffer?
 		 */
 		if (outcount + reclen > bufsize) {
 			/*
 			 * Did we manage to fit anything in the buffer?
 			 */
 			if (!outcount) {
 				error = EINVAL;
 				goto update;
 			}
 			break;
 		}
 		if (flags & V_RDDIR_ENTFLAGS) {
 			/*
 			 * Add extended flag entry:
 			 */
 			eodp->ed_ino = objnum;
 			eodp->ed_reclen = reclen;
 			/* NOTE: ed_off is the offset for the *next* entry */
 			next = &(eodp->ed_off);
 			eodp->ed_eflags = zap.za_normalization_conflict ?
 			    ED_CASE_CONFLICT : 0;
 			(void) strncpy(eodp->ed_name, zap.za_name,
 			    EDIRENT_NAMELEN(reclen));
 			eodp = (edirent_t *)((intptr_t)eodp + reclen);
 		} else {
 			/*
 			 * Add normal entry:
 			 */
 			odp->d_ino = objnum;
 			odp->d_reclen = reclen;
 			odp->d_namlen = strlen(zap.za_name);
 			(void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
 			odp->d_type = type;
 			odp = (dirent64_t *)((intptr_t)odp + reclen);
 		}
 		outcount += reclen;
 
 		ASSERT(outcount <= bufsize);
 
 		/* Prefetch znode */
 		if (prefetch)
 			dmu_prefetch(os, objnum, 0, 0);
 
 	skip_entry:
 		/*
 		 * Move to the next entry, fill in the previous offset.
 		 * Real ZAP entries advance the cursor and take its
 		 * serialized form as the new offset; the synthetic entries
 		 * simply count up by one.
 		 */
 		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
 			zap_cursor_advance(&zc);
 			offset = zap_cursor_serialize(&zc);
 		} else {
 			offset += 1;
 		}
 
 		/* Record the offset that follows the entry just handled. */
 		if (cooks != NULL) {
 			*cooks++ = offset;
 			ncooks--;
 			KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
 		}
 	}
 	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
 
 	/*
 	 * Subtract unused cookies: *ncookies was preset to the array
 	 * capacity and ncooks now holds the number of slots left over.
 	 */
 	if (ncookies != NULL)
 		*ncookies -= ncooks;
 
 	/*
 	 * In the in-place case the entries are already in the caller's
 	 * buffer, so only the iovec and residual need adjusting; otherwise
 	 * copy the staged entries out with uiomove().
 	 */
 	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
 		iovp->iov_base += outcount;
 		iovp->iov_len -= outcount;
 		uio->uio_resid -= outcount;
 	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
 		/*
 		 * Reset the pointer.
 		 */
 		offset = uio->uio_loffset;
 	}
 
 	/* Error exits inside the loop land here, bypassing the copy-out. */
 update:
 	zap_cursor_fini(&zc);
 	/* Free the staging buffer under the same condition it was allocated. */
 	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
 		kmem_free(outbuf, bufsize);
 
 	/* Running off the end of the directory is not an error. */
 	if (error == ENOENT)
 		error = 0;
 
 	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
 
 	uio->uio_loffset = offset;
 	ZFS_EXIT(zfsvfs);
 	/*
 	 * On failure release the cookie array and clear the out parameters.
 	 * NOTE(review): the allocation above is keyed on ncookies while this
 	 * test is keyed on cookies; presumably callers pass both or
 	 * neither -- confirm.
 	 */
 	if (error != 0 && cookies != NULL) {
 		free(*cookies, M_TEMP);
 		*cookies = NULL;
 		*ncookies = 0;
 	}
 	return (error);
 }
 
 /*
  * Value stored in the calling thread's zfs_fsyncer_key slot each time
  * zfs_fsync() runs.
  */
 ulong_t zfs_fsync_sync_cnt = 4;
 
 /*
  * Commit the intent log for the file behind vp, unless the dataset has
  * synchronous semantics disabled.  Always reports success itself; the
  * ZFS_ENTER/ZFS_VERIFY_ZP macros may return on their own.
  */
 static int
 zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
 {
 	znode_t		*zp;
 	zfsvfs_t	*zfsvfs;
 
 	zp = VTOZ(vp);
 	zfsvfs = zp->z_zfsvfs;
 
 	/* Tag this thread before deciding whether there is work to do. */
 	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
 
 	/* Nothing to commit when sync is disabled on this objset. */
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_DISABLED)
 		return (0);
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 	zil_commit(zfsvfs->z_log, zp->z_id);
 	ZFS_EXIT(zfsvfs);
 
 	return (0);
 }
 
 
 /*
  * Get the requested file attributes and place them in the provided
  * vattr structure.
  *
  *	IN:	vp	- vnode of file.
  *		vap	- va_mask identifies requested attributes.
  *			  If AT_XVATTR set, then optional attrs are requested
  *		flags	- ATTR_NOACLCHECK (CIFS server context)
  *		cr	- credentials of caller.
  *		ct	- caller context
  *
  *	OUT:	vap	- attribute values.
  *
  *	RETURN:	0 if success
  *		error code if failure
  */
 /* ARGSUSED */
 static int
 zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
     caller_context_t *ct)
 {
 	/*
 	 * Overview: look up the timestamps (and rdev for device nodes)
 	 * through the SA layer, optionally check ACE_READ_ATTRIBUTES, then
 	 * fill in *vap from the znode under z_lock.  Returns the error from
 	 * sa_bulk_lookup() or zfs_zaccess() if either fails, else 0.
 	 */
 	znode_t *zp = VTOZ(vp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	int	error = 0;
 	uint32_t blksize;
 	u_longlong_t nblocks;
 	uint64_t links;
 	uint64_t mtime[2], ctime[2], crtime[2], rdev;
 	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
 	xoptattr_t *xoap = NULL;
 	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
 	/* At most four attributes are queued into bulk[] below. */
 	sa_bulk_attr_t bulk[4];
 	int count = 0;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	/* va_uid is compared against the caller's uid further down. */
 	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
 
 	/* Queue mtime, ctime and crtime; rdev only for device nodes. */
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
 	if (vp->v_type == VBLK || vp->v_type == VCHR)
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
 		    &rdev, 8);
 
 	if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/*
 	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
 	 * Also, if we are the owner don't bother, since owner should
 	 * always be allowed to read basic attributes of file.
 	 */
 	if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
 	    (vap->va_uid != crgetuid(cr))) {
 		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
 		    skipaclchk, cr)) {
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 	}
 
 	/*
 	 * Return all attributes.  It's cheaper to provide the answer
 	 * than to determine whether we were asked the question.
 	 */
 
 	mutex_enter(&zp->z_lock);
 	vap->va_type = IFTOVT(zp->z_mode);
 	vap->va_mode = zp->z_mode & ~S_IFMT;
 #ifdef sun
 	vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
 #else
 	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
 #endif
 	vap->va_nodeid = zp->z_id;
 	/*
 	 * For a root vnode whose control directory is shown, report one
 	 * link more than the znode records.
 	 */
 	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
 		links = zp->z_links + 1;
 	else
 		links = zp->z_links;
 	vap->va_nlink = MIN(links, LINK_MAX);	/* nlink_t limit! */
 	vap->va_size = zp->z_size;
 #ifdef sun
 	vap->va_rdev = vp->v_rdev;
 #else
 	/*
 	 * rdev was only looked up for device nodes, so va_rdev is only
 	 * assigned for them on this path.
 	 */
 	if (vp->v_type == VBLK || vp->v_type == VCHR)
 		vap->va_rdev = zfs_cmpldev(rdev);
 #endif
 	vap->va_seq = zp->z_seq;
 	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */
 
 	/*
 	 * Add in any requested optional attributes and the create time.
 	 * Also set the corresponding bits in the returned attribute bitmap.
 	 * This is skipped entirely when xva_getxoptattr() returns NULL or
 	 * the filesystem does not use FUIDs.  Most attributes below are a
 	 * single bit of z_pflags reported as 0/1.
 	 */
 	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
 		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
 			xoap->xoa_archive =
 			    ((zp->z_pflags & ZFS_ARCHIVE) != 0);
 			XVA_SET_RTN(xvap, XAT_ARCHIVE);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
 			xoap->xoa_readonly =
 			    ((zp->z_pflags & ZFS_READONLY) != 0);
 			XVA_SET_RTN(xvap, XAT_READONLY);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
 			xoap->xoa_system =
 			    ((zp->z_pflags & ZFS_SYSTEM) != 0);
 			XVA_SET_RTN(xvap, XAT_SYSTEM);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
 			xoap->xoa_hidden =
 			    ((zp->z_pflags & ZFS_HIDDEN) != 0);
 			XVA_SET_RTN(xvap, XAT_HIDDEN);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
 			xoap->xoa_nounlink =
 			    ((zp->z_pflags & ZFS_NOUNLINK) != 0);
 			XVA_SET_RTN(xvap, XAT_NOUNLINK);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
 			xoap->xoa_immutable =
 			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
 			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
 			xoap->xoa_appendonly =
 			    ((zp->z_pflags & ZFS_APPENDONLY) != 0);
 			XVA_SET_RTN(xvap, XAT_APPENDONLY);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
 			xoap->xoa_nodump =
 			    ((zp->z_pflags & ZFS_NODUMP) != 0);
 			XVA_SET_RTN(xvap, XAT_NODUMP);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
 			xoap->xoa_opaque =
 			    ((zp->z_pflags & ZFS_OPAQUE) != 0);
 			XVA_SET_RTN(xvap, XAT_OPAQUE);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
 			xoap->xoa_av_quarantined =
 			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
 			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
 			xoap->xoa_av_modified =
 			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
 			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
 		}
 
 		/* The scanstamp is only fetched for regular files. */
 		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
 		    vp->v_type == VREG) {
 			zfs_sa_get_scanstamp(zp, xvap);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
 			uint64_t times[2];
 
 			/*
 			 * crtime is looked up again here; the result of
 			 * this sa_lookup() is deliberately ignored.
 			 */
 			(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
 			    times, sizeof (times));
 			ZFS_TIME_DECODE(&xoap->xoa_createtime, times);
 			XVA_SET_RTN(xvap, XAT_CREATETIME);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
 			xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
 			XVA_SET_RTN(xvap, XAT_REPARSE);
 		}
 		if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
 			xoap->xoa_generation = zp->z_gen;
 			XVA_SET_RTN(xvap, XAT_GEN);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
 			xoap->xoa_offline =
 			    ((zp->z_pflags & ZFS_OFFLINE) != 0);
 			XVA_SET_RTN(xvap, XAT_OFFLINE);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
 			xoap->xoa_sparse =
 			    ((zp->z_pflags & ZFS_SPARSE) != 0);
 			XVA_SET_RTN(xvap, XAT_SPARSE);
 		}
 	}
 
 	/*
 	 * atime comes from the in-core znode; the other three timestamps
 	 * come from the bulk lookup done at the top of the function.
 	 */
 	ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
 	ZFS_TIME_DECODE(&vap->va_mtime, mtime);
 	ZFS_TIME_DECODE(&vap->va_ctime, ctime);
 	ZFS_TIME_DECODE(&vap->va_birthtime, crtime);
 
 	mutex_exit(&zp->z_lock);
 
 	/* Block size and block count are queried after z_lock is dropped. */
 	sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
 	vap->va_blksize = blksize;
 	vap->va_bytes = nblocks << 9;	/* nblocks * 512 */
 
 	if (zp->z_blksz == 0) {
 		/*
 		 * Block size hasn't been set; suggest maximal I/O transfers.
 		 */
 		vap->va_blksize = zfsvfs->z_max_blksz;
 	}
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /*
  * Set the file attributes to the values contained in the
  * vattr structure.
  *
  *	IN:	vp	- vnode of file to be modified.
  *		vap	- new attribute values.
  *			  If AT_XVATTR set, then optional attrs are being set
  *		flags	- ATTR_UTIME set if non-default time values provided.
  *			- ATTR_NOACLCHECK (CIFS context only).
  *		cr	- credentials of caller.
  *		ct	- caller context
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * Timestamps:
  *	vp - ctime updated, mtime updated if size changed.
  */
 /* ARGSUSED */
 static int
 zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
 	caller_context_t *ct)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	zilog_t		*zilog;
 	dmu_tx_t	*tx;
 	vattr_t		oldva;
 	xvattr_t	tmpxvattr;
 	uint_t		mask = vap->va_mask;
-	uint_t		saved_mask;
+	uint_t		saved_mask = 0;
 	uint64_t	saved_mode;
 	int		trim_mask = 0;
 	uint64_t	new_mode;
 	uint64_t	new_uid, new_gid;
 	uint64_t	xattr_obj;
 	uint64_t	mtime[2], ctime[2];
 	znode_t		*attrzp;
 	int		need_policy = FALSE;
 	int		err, err2;
 	zfs_fuid_info_t *fuidp = NULL;
 	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
 	xoptattr_t	*xoap;
 	zfs_acl_t	*aclp;
 	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
 	boolean_t	fuid_dirtied = B_FALSE;
 	sa_bulk_attr_t	bulk[7], xattr_bulk[7];
 	int		count = 0, xattr_count = 0;
 
 	if (mask == 0)
 		return (0);
 
 	if (mask & AT_NOSET)
 		return (EINVAL);
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	zilog = zfsvfs->z_log;
 
 	/*
 	 * Make sure that if we have ephemeral uid/gid or xvattr specified
 	 * that file system is at proper version level
 	 */
 
 	if (zfsvfs->z_use_fuids == B_FALSE &&
 	    (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
 	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
 	    (mask & AT_XVATTR))) {
 		ZFS_EXIT(zfsvfs);
 		return (EINVAL);
 	}
 
 	if (mask & AT_SIZE && vp->v_type == VDIR) {
 		ZFS_EXIT(zfsvfs);
 		return (EISDIR);
 	}
 
 	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
 		ZFS_EXIT(zfsvfs);
 		return (EINVAL);
 	}
 
 	/*
 	 * If this is an xvattr_t, then get a pointer to the structure of
 	 * optional attributes.  If this is NULL, then we have a vattr_t.
 	 */
 	xoap = xva_getxoptattr(xvap);
 
 	xva_init(&tmpxvattr);
 
 	/*
 	 * Immutable files can only alter immutable bit and atime
 	 */
 	if ((zp->z_pflags & ZFS_IMMUTABLE) &&
 	    ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
 	    ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
 		ZFS_EXIT(zfsvfs);
 		return (EPERM);
 	}
 
 	if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
 		ZFS_EXIT(zfsvfs);
 		return (EPERM);
 	}
 
 	/*
 	 * Verify timestamps doesn't overflow 32 bits.
 	 * ZFS can handle large timestamps, but 32bit syscalls can't
 	 * handle times greater than 2039.  This check should be removed
 	 * once large timestamps are fully supported.
 	 */
 	if (mask & (AT_ATIME | AT_MTIME)) {
 		if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
 		    ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
 			ZFS_EXIT(zfsvfs);
 			return (EOVERFLOW);
 		}
 	}
 
 top:
 	attrzp = NULL;
 	aclp = NULL;
 
 	/* Can this be moved to before the top label? */
 	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
 		ZFS_EXIT(zfsvfs);
 		return (EROFS);
 	}
 
 	/*
 	 * First validate permissions
 	 */
 
 	if (mask & AT_SIZE) {
 		/*
 		 * XXX - Note, we are not providing any open
 		 * mode flags here (like FNDELAY), so we may
 		 * block if there are locks present... this
 		 * should be addressed in openat().
 		 */
 		/* XXX - would it be OK to generate a log record here? */
 		err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
 		if (err) {
 			ZFS_EXIT(zfsvfs);
 			return (err);
 		}
 	}
 
 	if (mask & (AT_ATIME|AT_MTIME) ||
 	    ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
 	    XVA_ISSET_REQ(xvap, XAT_READONLY) ||
 	    XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
 	    XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
 	    XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
 	    XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
 	    XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
 		need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
 		    skipaclchk, cr);
 	}
 
 	if (mask & (AT_UID|AT_GID)) {
 		int	idmask = (mask & (AT_UID|AT_GID));
 		int	take_owner;
 		int	take_group;
 
 		/*
 		 * NOTE: even if a new mode is being set,
 		 * we may clear S_ISUID/S_ISGID bits.
 		 */
 
 		if (!(mask & AT_MODE))
 			vap->va_mode = zp->z_mode;
 
 		/*
 		 * Take ownership or chgrp to group we are a member of
 		 */
 
 		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
 		take_group = (mask & AT_GID) &&
 		    zfs_groupmember(zfsvfs, vap->va_gid, cr);
 
 		/*
 		 * If both AT_UID and AT_GID are set then take_owner and
 		 * take_group must both be set in order to allow taking
 		 * ownership.
 		 *
 		 * Otherwise, send the check through secpolicy_vnode_setattr()
 		 *
 		 */
 
 		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
 		    ((idmask == AT_UID) && take_owner) ||
 		    ((idmask == AT_GID) && take_group)) {
 			if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
 			    skipaclchk, cr) == 0) {
 				/*
 				 * Remove setuid/setgid for non-privileged users
 				 */
 				secpolicy_setid_clear(vap, vp, cr);
 				trim_mask = (mask & (AT_UID|AT_GID));
 			} else {
 				need_policy =  TRUE;
 			}
 		} else {
 			need_policy =  TRUE;
 		}
 	}
 
 	mutex_enter(&zp->z_lock);
 	oldva.va_mode = zp->z_mode;
 	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
 	if (mask & AT_XVATTR) {
 		/*
 		 * Update xvattr mask to include only those attributes
 		 * that are actually changing.
 		 *
 		 * the bits will be restored prior to actually setting
 		 * the attributes so the caller thinks they were set.
 		 */
 		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
 			if (xoap->xoa_appendonly !=
 			    ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_APPENDONLY);
 				XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
 			if (xoap->xoa_nounlink !=
 			    ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_NOUNLINK);
 				XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
 			if (xoap->xoa_immutable !=
 			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
 				XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
 			if (xoap->xoa_nodump !=
 			    ((zp->z_pflags & ZFS_NODUMP) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_NODUMP);
 				XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
 			if (xoap->xoa_av_modified !=
 			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
 				XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
 			if ((vp->v_type != VREG &&
 			    xoap->xoa_av_quarantined) ||
 			    xoap->xoa_av_quarantined !=
 			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
 				XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
 			mutex_exit(&zp->z_lock);
 			ZFS_EXIT(zfsvfs);
 			return (EPERM);
 		}
 
 		if (need_policy == FALSE &&
 		    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
 		    XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
 			need_policy = TRUE;
 		}
 	}
 
 	mutex_exit(&zp->z_lock);
 
 	if (mask & AT_MODE) {
 		if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
 			err = secpolicy_setid_setsticky_clear(vp, vap,
 			    &oldva, cr);
 			if (err) {
 				ZFS_EXIT(zfsvfs);
 				return (err);
 			}
 			trim_mask |= AT_MODE;
 		} else {
 			need_policy = TRUE;
 		}
 	}
 
 	if (need_policy) {
 		/*
 		 * If trim_mask is set then take ownership
 		 * has been granted or write_acl is present and user
 		 * has the ability to modify mode.  In that case remove
 		 * UID|GID and or MODE from mask so that
 		 * secpolicy_vnode_setattr() doesn't revoke it.
 		 */
 
 		if (trim_mask) {
 			saved_mask = vap->va_mask;
 			vap->va_mask &= ~trim_mask;
 			if (trim_mask & AT_MODE) {
 				/*
 				 * Save the mode, as secpolicy_vnode_setattr()
 				 * will overwrite it with ova.va_mode.
 				 */
 				saved_mode = vap->va_mode;
 			}
 		}
 		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
 		    (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
 		if (err) {
 			ZFS_EXIT(zfsvfs);
 			return (err);
 		}
 
 		if (trim_mask) {
 			vap->va_mask |= saved_mask;
 			if (trim_mask & AT_MODE) {
 				/*
 				 * Recover the mode after
 				 * secpolicy_vnode_setattr().
 				 */
 				vap->va_mode = saved_mode;
 			}
 		}
 	}
 
 	/*
 	 * secpolicy_vnode_setattr, or take ownership may have
 	 * changed va_mask
 	 */
 	mask = vap->va_mask;
 
 	if ((mask & (AT_UID | AT_GID))) {
 		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
 		    &xattr_obj, sizeof (xattr_obj));
 
 		if (err == 0 && xattr_obj) {
 			err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
 			if (err)
 				goto out2;
 		}
 		if (mask & AT_UID) {
 			new_uid = zfs_fuid_create(zfsvfs,
 			    (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
 			if (new_uid != zp->z_uid &&
 			    zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
 				if (attrzp)
 					VN_RELE(ZTOV(attrzp));
 				err = EDQUOT;
 				goto out2;
 			}
 		}
 
 		if (mask & AT_GID) {
 			new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
 			    cr, ZFS_GROUP, &fuidp);
 			if (new_gid != zp->z_gid &&
 			    zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
 				if (attrzp)
 					VN_RELE(ZTOV(attrzp));
 				err = EDQUOT;
 				goto out2;
 			}
 		}
 	}
 	tx = dmu_tx_create(zfsvfs->z_os);
 
 	if (mask & AT_MODE) {
 		uint64_t pmode = zp->z_mode;
 		uint64_t acl_obj;
 		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
 
 		if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
 		    !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
 			err = EPERM;
 			goto out;
 		}
 
 		if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
 			goto out;
 
 		mutex_enter(&zp->z_lock);
 		if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
 			/*
 			 * Are we upgrading ACL from old V0 format
 			 * to V1 format?
 			 */
 			if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
 			    zfs_znode_acl_version(zp) ==
 			    ZFS_ACL_VERSION_INITIAL) {
 				dmu_tx_hold_free(tx, acl_obj, 0,
 				    DMU_OBJECT_END);
 				dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 				    0, aclp->z_acl_bytes);
 			} else {
 				dmu_tx_hold_write(tx, acl_obj, 0,
 				    aclp->z_acl_bytes);
 			}
 		} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 			    0, aclp->z_acl_bytes);
 		}
 		mutex_exit(&zp->z_lock);
 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
 	} else {
 		if ((mask & AT_XVATTR) &&
 		    XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
 			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
 		else
 			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	}
 
 	if (attrzp) {
 		dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
 	}
 
 	fuid_dirtied = zfsvfs->z_fuid_dirty;
 	if (fuid_dirtied)
 		zfs_fuid_txhold(zfsvfs, tx);
 
 	zfs_sa_upgrade_txholds(tx, zp);
 
 	err = dmu_tx_assign(tx, TXG_NOWAIT);
 	if (err) {
 		if (err == ERESTART)
 			dmu_tx_wait(tx);
 		goto out;
 	}
 
 	count = 0;
 	/*
 	 * Set each attribute requested.
 	 * We group settings according to the locks they need to acquire.
 	 *
 	 * Note: you cannot set ctime directly, although it will be
 	 * updated as a side-effect of calling this function.
 	 */
 
 
 	if (mask & (AT_UID|AT_GID|AT_MODE))
 		mutex_enter(&zp->z_acl_lock);
 	mutex_enter(&zp->z_lock);
 
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
 	    &zp->z_pflags, sizeof (zp->z_pflags));
 
 	if (attrzp) {
 		if (mask & (AT_UID|AT_GID|AT_MODE))
 			mutex_enter(&attrzp->z_acl_lock);
 		mutex_enter(&attrzp->z_lock);
 		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
 		    SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
 		    sizeof (attrzp->z_pflags));
 	}
 
 	if (mask & (AT_UID|AT_GID)) {
 
 		if (mask & AT_UID) {
 			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
 			    &new_uid, sizeof (new_uid));
 			zp->z_uid = new_uid;
 			if (attrzp) {
 				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
 				    SA_ZPL_UID(zfsvfs), NULL, &new_uid,
 				    sizeof (new_uid));
 				attrzp->z_uid = new_uid;
 			}
 		}
 
 		if (mask & AT_GID) {
 			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
 			    NULL, &new_gid, sizeof (new_gid));
 			zp->z_gid = new_gid;
 			if (attrzp) {
 				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
 				    SA_ZPL_GID(zfsvfs), NULL, &new_gid,
 				    sizeof (new_gid));
 				attrzp->z_gid = new_gid;
 			}
 		}
 		if (!(mask & AT_MODE)) {
 			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
 			    NULL, &new_mode, sizeof (new_mode));
 			new_mode = zp->z_mode;
 		}
 		err = zfs_acl_chown_setattr(zp);
 		ASSERT(err == 0);
 		if (attrzp) {
 			err = zfs_acl_chown_setattr(attrzp);
 			ASSERT(err == 0);
 		}
 	}
 
 	if (mask & AT_MODE) {
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
 		    &new_mode, sizeof (new_mode));
 		zp->z_mode = new_mode;
 		ASSERT3U((uintptr_t)aclp, !=, 0);
 		err = zfs_aclset_common(zp, aclp, cr, tx);
 		ASSERT0(err);
 		if (zp->z_acl_cached)
 			zfs_acl_free(zp->z_acl_cached);
 		zp->z_acl_cached = aclp;
 		aclp = NULL;
 	}
 
 
 	if (mask & AT_ATIME) {
 		ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
 		    &zp->z_atime, sizeof (zp->z_atime));
 	}
 
 	if (mask & AT_MTIME) {
 		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
 		    mtime, sizeof (mtime));
 	}
 
 	/* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
 	if (mask & AT_SIZE && !(mask & AT_MTIME)) {
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
 		    NULL, mtime, sizeof (mtime));
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
 		    &ctime, sizeof (ctime));
 		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
 		    B_TRUE);
 	} else if (mask != 0) {
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
 		    &ctime, sizeof (ctime));
 		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
 		    B_TRUE);
 		if (attrzp) {
 			SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
 			    SA_ZPL_CTIME(zfsvfs), NULL,
 			    &ctime, sizeof (ctime));
 			zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
 			    mtime, ctime, B_TRUE);
 		}
 	}
 	/*
 	 * Do this after setting timestamps to prevent timestamp
 	 * update from toggling bit
 	 */
 
 	if (xoap && (mask & AT_XVATTR)) {
 
 		/*
 		 * restore trimmed off masks
 		 * so that return masks can be set for caller.
 		 */
 
 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
 			XVA_SET_REQ(xvap, XAT_APPENDONLY);
 		}
 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
 			XVA_SET_REQ(xvap, XAT_NOUNLINK);
 		}
 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
 			XVA_SET_REQ(xvap, XAT_IMMUTABLE);
 		}
 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
 			XVA_SET_REQ(xvap, XAT_NODUMP);
 		}
 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
 			XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
 		}
 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
 			XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
 			ASSERT(vp->v_type == VREG);
 
 		zfs_xvattr_set(zp, xvap, tx);
 	}
 
 	if (fuid_dirtied)
 		zfs_fuid_sync(zfsvfs, tx);
 
 	if (mask != 0)
 		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
 
 	mutex_exit(&zp->z_lock);
 	if (mask & (AT_UID|AT_GID|AT_MODE))
 		mutex_exit(&zp->z_acl_lock);
 
 	if (attrzp) {
 		if (mask & (AT_UID|AT_GID|AT_MODE))
 			mutex_exit(&attrzp->z_acl_lock);
 		mutex_exit(&attrzp->z_lock);
 	}
 out:
 	if (err == 0 && attrzp) {
 		err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
 		    xattr_count, tx);
 		ASSERT(err2 == 0);
 	}
 
 	if (attrzp)
 		VN_RELE(ZTOV(attrzp));
 	if (aclp)
 		zfs_acl_free(aclp);
 
 	if (fuidp) {
 		zfs_fuid_info_free(fuidp);
 		fuidp = NULL;
 	}
 
 	if (err) {
 		dmu_tx_abort(tx);
 		if (err == ERESTART)
 			goto top;
 	} else {
 		err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 		dmu_tx_commit(tx);
 	}
 
 out2:
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	ZFS_EXIT(zfsvfs);
 	return (err);
 }
 
 /*
  * One entry in the singly linked list built by zfs_rename_lock() and
  * torn down by zfs_rename_unlock().  Each entry records one rwlock that
  * was acquired and, optionally, a znode whose vnode hold is released
  * when the entry is dropped (zl_znode may be NULL).
  */
 typedef struct zfs_zlock {
 	krwlock_t	*zl_rwlock;	/* lock we acquired */
 	znode_t		*zl_znode;	/* znode we held */
 	struct zfs_zlock *zl_next;	/* next in list */
 } zfs_zlock_t;
 
 /*
  * Undo zfs_rename_lock(): walk the list headed by *zlpp and, for each
  * entry, release the held vnode (if any), drop the rwlock, unlink the
  * entry from the list and free it.  On return *zlpp is NULL.
  */
 static void
 zfs_rename_unlock(zfs_zlock_t **zlpp)
 {
 	zfs_zlock_t *entry;
 
 	for (entry = *zlpp; entry != NULL; entry = *zlpp) {
 		if (entry->zl_znode != NULL)
 			VN_RELE(ZTOV(entry->zl_znode));
 		rw_exit(entry->zl_rwlock);
 		/* Advance the caller's head before the entry goes away. */
 		*zlpp = entry->zl_next;
 		kmem_free(entry, sizeof (*entry));
 	}
 }
 
 /*
  * Search back through the directory tree, using the ".." entries.
  * Lock each directory in the chain to prevent concurrent renames.
  * Fail any attempt to move a directory into one of its own descendants.
  * XXX - z_parent_lock can overlap with map or grow locks
  *
  *	IN:	szp	- znode whose z_parent_lock is write-locked on the
  *			  first pass; its id is what the walk must not reach.
  *		tdzp	- znode at which the upward walk starts.
  *		sdzp	- the walk stops once the current znode's id
  *			  equals sdzp's id.
  *
  *	IN/OUT:	zlpp	- head of the list of held locks; one entry is
  *			  pushed for every lock acquired.
  *
  *	RETURN:	0 if the walk reached the root or sdzp
  *		EINVAL if the walk reached szp
  *		error code from zfs_zget() if a parent could not be loaded
  *
  * On every return path, including the error returns, the locks taken so
  * far remain on *zlpp; the caller releases them with zfs_rename_unlock().
  */
 static int
 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
 {
 	zfs_zlock_t	*zl;
 	znode_t		*zp = tdzp;
 	uint64_t	rootid = zp->z_zfsvfs->z_root;
 	uint64_t	oidp = zp->z_id;
 	krwlock_t	*rwlp = &szp->z_parent_lock;
 	krw_t		rw = RW_WRITER;
 
 	/*
 	 * First pass write-locks szp and compares to zp->z_id.
 	 * Later passes read-lock zp and compare to zp->z_parent.
 	 */
 	do {
 		if (!rw_tryenter(rwlp, rw)) {
 			/*
 			 * Another thread is renaming in this path.
 			 * Note that if we are a WRITER, we don't have any
 			 * parent_locks held yet.
 			 */
 			if (rw == RW_READER && zp->z_id > szp->z_id) {
 				/*
 				 * Drop our locks and restart
 				 *
 				 * rw == RW_READER means at least one pass
 				 * completed, so zl is the most recently
 				 * pushed entry, i.e. the head of the list.
 				 * Every loop variable is reset to its
 				 * initial value before retrying.
 				 */
 				zfs_rename_unlock(&zl);
 				*zlpp = NULL;
 				zp = tdzp;
 				oidp = zp->z_id;
 				rwlp = &szp->z_parent_lock;
 				rw = RW_WRITER;
 				continue;
 			} else {
 				/*
 				 * Wait for other thread to drop its locks
 				 */
 				rw_enter(rwlp, rw);
 			}
 		}
 
 		/* Record the lock just taken at the head of the list. */
 		zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
 		zl->zl_rwlock = rwlp;
 		zl->zl_znode = NULL;
 		zl->zl_next = *zlpp;
 		*zlpp = zl;
 
 		if (oidp == szp->z_id)		/* We're a descendant of szp */
 			return (EINVAL);
 
 		if (oidp == rootid)		/* We've hit the top */
 			return (0);
 
 		if (rw == RW_READER) {		/* i.e. not the first pass */
 			int error = zfs_zget(zp->z_zfsvfs, oidp, &zp);
 			if (error)
 				return (error);
 			/* The hold from zfs_zget() is dropped at unlock. */
 			zl->zl_znode = zp;
 		}
 		/*
 		 * Fetch the parent object id of zp for the next pass.  The
 		 * return value of sa_lookup() is deliberately ignored here.
 		 */
 		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zp->z_zfsvfs),
 		    &oidp, sizeof (oidp));
 		rwlp = &zp->z_parent_lock;
 		rw = RW_READER;
 
 	} while (zp->z_id != sdzp->z_id);
 
 	return (0);
 }
 
 /*
  * Move an entry from the provided source directory to the target
  * directory.  Change the entry name as indicated.
  *
  *	IN:	sdvp	- Source directory containing the "old entry".
  *		snm	- Old entry name.
  *		tdvp	- Target directory to contain the "new entry".
  *		tnm	- New entry name.
  *		cr	- credentials of caller.
  *		ct	- caller context
  *		flags	- case flags
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * Timestamps:
  *	sdvp,tdvp - ctime|mtime updated
  *
  * Outline of the body:
  *	1. Validate the arguments (same vfs, valid UTF-8 target name,
  *	   both directories on the same side of the xattr boundary).
  *	2. Lock the source and target directory entries in a fixed order.
  *	3. Check access; for a directory source, take the parent-lock
  *	   chain with zfs_rename_lock().
  *	4. Assign a transaction; on ERESTART drop everything, wait and
  *	   retry from "top".
  *	5. Remove any existing target, create the new link, remove the
  *	   old link and log the rename.
  */
 /*ARGSUSED*/
 static int
 zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
     caller_context_t *ct, int flags)
 {
 	znode_t		*tdzp, *szp, *tzp;
 	znode_t		*sdzp = VTOZ(sdvp);
 	zfsvfs_t	*zfsvfs = sdzp->z_zfsvfs;
 	zilog_t		*zilog;
 	vnode_t		*realvp;
 	zfs_dirlock_t	*sdl, *tdl;
 	dmu_tx_t	*tx;
 	zfs_zlock_t	*zl;
 	int		cmp, serr, terr;
 	int		error = 0;
 	int		zflg = 0;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(sdzp);
 	zilog = zfsvfs->z_log;
 
 	/*
 	 * Make sure we have the real vp for the target directory.
 	 */
 	if (VOP_REALVP(tdvp, &realvp, ct) == 0)
 		tdvp = realvp;
 
 	if (tdvp->v_vfsp != sdvp->v_vfsp || zfsctl_is_node(tdvp)) {
 		ZFS_EXIT(zfsvfs);
 		return (EXDEV);
 	}
 
 	tdzp = VTOZ(tdvp);
 	ZFS_VERIFY_ZP(tdzp);
 	if (zfsvfs->z_utf8 && u8_validate(tnm,
 	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		ZFS_EXIT(zfsvfs);
 		return (EILSEQ);
 	}
 
 	if (flags & FIGNORECASE)
 		zflg |= ZCILOOK;
 
 top:
 	/* Restart point after a transaction assignment returns ERESTART. */
 	szp = NULL;
 	tzp = NULL;
 	zl = NULL;
 
 	/*
 	 * This is to prevent the creation of links into attribute space
 	 * by renaming a linked file into/outof an attribute directory.
 	 * See the comment in zfs_link() for why this is considered bad.
 	 */
 	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
 		ZFS_EXIT(zfsvfs);
 		return (EINVAL);
 	}
 
 	/*
 	 * Lock source and target directory entries.  To prevent deadlock,
 	 * a lock ordering must be defined.  We lock the directory with
 	 * the smallest object id first, or if it's a tie, the one with
 	 * the lexically first name.
 	 */
 	if (sdzp->z_id < tdzp->z_id) {
 		cmp = -1;
 	} else if (sdzp->z_id > tdzp->z_id) {
 		cmp = 1;
 	} else {
 		/*
 		 * First compare the two name arguments without
 		 * considering any case folding.
 		 */
 		int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
 
 		cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
 		ASSERT(error == 0 || !zfsvfs->z_utf8);
 		if (cmp == 0) {
 			/*
 			 * POSIX: "If the old argument and the new argument
 			 * both refer to links to the same existing file,
 			 * the rename() function shall return successfully
 			 * and perform no other action."
 			 */
 			ZFS_EXIT(zfsvfs);
 			return (0);
 		}
 		/*
 		 * If the file system is case-folding, then we may
 		 * have some more checking to do.  A case-folding file
 		 * system is either supporting mixed case sensitivity
 		 * access or is completely case-insensitive.  Note
 		 * that the file system is always case preserving.
 		 *
 		 * In mixed sensitivity mode case sensitive behavior
 		 * is the default.  FIGNORECASE must be used to
 		 * explicitly request case insensitive behavior.
 		 *
 		 * If the source and target names provided differ only
 		 * by case (e.g., a request to rename 'tim' to 'Tim'),
 		 * we will treat this as a special case in the
 		 * case-insensitive mode: as long as the source name
 		 * is an exact match, we will allow this to proceed as
 		 * a name-change request.
 		 */
 		if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
 		    (zfsvfs->z_case == ZFS_CASE_MIXED &&
 		    flags & FIGNORECASE)) &&
 		    u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
 		    &error) == 0) {
 			/*
 			 * case preserving rename request, require exact
 			 * name matches
 			 */
 			zflg |= ZCIEXACT;
 			zflg &= ~ZCILOOK;
 		}
 	}
 
 	/*
 	 * If the source and destination directories are the same, we should
 	 * grab the z_name_lock of that directory only once.
 	 */
 	if (sdzp == tdzp) {
 		zflg |= ZHAVELOCK;
 		rw_enter(&sdzp->z_name_lock, RW_READER);
 	}
 
 	/*
 	 * Take the two dirent locks in the order chosen above.  Whichever
 	 * entry is locked second is passed ZRENAMING.
 	 */
 	if (cmp < 0) {
 		serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
 		    ZEXISTS | zflg, NULL, NULL);
 		terr = zfs_dirent_lock(&tdl,
 		    tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
 	} else {
 		terr = zfs_dirent_lock(&tdl,
 		    tdzp, tnm, &tzp, zflg, NULL, NULL);
 		serr = zfs_dirent_lock(&sdl,
 		    sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
 		    NULL, NULL);
 	}
 
 	if (serr) {
 		/*
 		 * Source entry invalid or not there.
 		 */
 		if (!terr) {
 			zfs_dirent_unlock(tdl);
 			if (tzp)
 				VN_RELE(ZTOV(tzp));
 		}
 
 		if (sdzp == tdzp)
 			rw_exit(&sdzp->z_name_lock);
 
 		/*
 		 * FreeBSD: In OpenSolaris they only check if rename source is
 		 * ".." here, because "." is handled in their lookup. This is
 		 * not the case for FreeBSD, so we check for "." explicitly.
 		 */
 		if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0)
 			serr = EINVAL;
 		ZFS_EXIT(zfsvfs);
 		return (serr);
 	}
 	if (terr) {
 		/* Source locked fine but the target did not; undo the source. */
 		zfs_dirent_unlock(sdl);
 		VN_RELE(ZTOV(szp));
 
 		if (sdzp == tdzp)
 			rw_exit(&sdzp->z_name_lock);
 
 		if (strcmp(tnm, "..") == 0)
 			terr = EINVAL;
 		ZFS_EXIT(zfsvfs);
 		return (terr);
 	}
 
 	/*
 	 * Must have write access at the source to remove the old entry
 	 * and write access at the target to create the new entry.
 	 * Note that if target and source are the same, this can be
 	 * done in a single check.
 	 */
 
 	/* Assignment inside the condition is intentional. */
 	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
 		goto out;
 
 	if (ZTOV(szp)->v_type == VDIR) {
 		/*
 		 * Check to make sure rename is valid.
 		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
 		 *
 		 * On failure zl may still list locks already taken; the
 		 * "out" path releases them.
 		 */
 		if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
 			goto out;
 	}
 
 	/*
 	 * Does target exist?
 	 */
 	if (tzp) {
 		/*
 		 * Source and target must be the same type.
 		 */
 		if (ZTOV(szp)->v_type == VDIR) {
 			if (ZTOV(tzp)->v_type != VDIR) {
 				error = ENOTDIR;
 				goto out;
 			}
 		} else {
 			if (ZTOV(tzp)->v_type == VDIR) {
 				error = EISDIR;
 				goto out;
 			}
 		}
 		/*
 		 * POSIX dictates that when the source and target
 		 * entries refer to the same file object, rename
 		 * must do nothing and exit without error.
 		 */
 		if (szp->z_id == tzp->z_id) {
 			error = 0;
 			goto out;
 		}
 	}
 
 	vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
 	if (tzp)
 		vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct);
 
 	/*
 	 * notify the target directory if it is not the same
 	 * as source directory.
 	 */
 	if (tdvp != sdvp) {
 		vnevent_rename_dest_dir(tdvp, ct);
 	}
 
 	/*
 	 * Declare everything the transaction may touch: the SA of the
 	 * source entry and of both directories, both directory ZAPs, the
 	 * existing target (if any) and the unlinked-object ZAP.
 	 */
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
 	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
 	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
 	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
 	if (sdzp != tdzp) {
 		dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
 		zfs_sa_upgrade_txholds(tx, tdzp);
 	}
 	if (tzp) {
 		dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
 		zfs_sa_upgrade_txholds(tx, tzp);
 	}
 
 	zfs_sa_upgrade_txholds(tx, szp);
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 	error = dmu_tx_assign(tx, TXG_NOWAIT);
 	if (error) {
 		/*
 		 * Release every lock and hold taken since "top" before
 		 * either waiting and retrying (ERESTART) or failing.
 		 */
 		if (zl != NULL)
 			zfs_rename_unlock(&zl);
 		zfs_dirent_unlock(sdl);
 		zfs_dirent_unlock(tdl);
 
 		if (sdzp == tdzp)
 			rw_exit(&sdzp->z_name_lock);
 
 		VN_RELE(ZTOV(szp));
 		if (tzp)
 			VN_RELE(ZTOV(tzp));
 		if (error == ERESTART) {
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
 		}
 		dmu_tx_abort(tx);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	if (tzp)	/* Attempt to remove the existing target */
 		error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
 
 	if (error == 0) {
 		error = zfs_link_create(tdl, szp, tx, ZRENAMING);
 		if (error == 0) {
 			szp->z_pflags |= ZFS_AV_MODIFIED;
 
 			error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
 			    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
 			ASSERT0(error);
 
 			error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
 			if (error == 0) {
 				zfs_log_rename(zilog, tx, TX_RENAME |
 				    (flags & FIGNORECASE ? TX_CI : 0), sdzp,
 				    sdl->dl_name, tdzp, tdl->dl_name, szp);
 
 				/*
 				 * Update path information for the target vnode
 				 */
 				vn_renamepath(tdvp, ZTOV(szp), tnm,
 				    strlen(tnm));
 			} else {
 				/*
 				 * At this point, we have successfully created
 				 * the target name, but have failed to remove
 				 * the source name.  Since the create was done
 				 * with the ZRENAMING flag, there are
 				 * complications; for one, the link count is
 				 * wrong.  The easiest way to deal with this
 				 * is to remove the newly created target, and
 				 * return the original error.  This must
 				 * succeed; fortunately, it is very unlikely to
 				 * fail, since we just created it.
 				 */
 				VERIFY3U(zfs_link_destroy(tdl, szp, tx,
 				    ZRENAMING, NULL), ==, 0);
 			}
 		}
 #ifdef FREEBSD_NAMECACHE
 		if (error == 0) {
 			/* Purge name cache entries for every vnode involved. */
 			cache_purge(sdvp);
 			cache_purge(tdvp);
 			cache_purge(ZTOV(szp));
 			if (tzp)
 				cache_purge(ZTOV(tzp));
 		}
 #endif
 	}
 
 	/* The transaction is committed whether or not the steps succeeded. */
 	dmu_tx_commit(tx);
 out:
 	/* Common exit: drop all locks and holds taken since "top". */
 	if (zl != NULL)
 		zfs_rename_unlock(&zl);
 
 	zfs_dirent_unlock(sdl);
 	zfs_dirent_unlock(tdl);
 
 	if (sdzp == tdzp)
 		rw_exit(&sdzp->z_name_lock);
 
 
 	VN_RELE(ZTOV(szp));
 	if (tzp)
 		VN_RELE(ZTOV(tzp));
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	ZFS_EXIT(zfsvfs);
 
 	return (error);
 }
 
 /*
  * Insert the indicated symbolic reference entry into the directory.
  *
  *	IN:	dvp	- Directory to contain new symbolic link.
  *		name	- Name for new symlink entry.
  *		vap	- Attributes of new entry (va_type must be VLNK).
  *		link	- Target path stored in the new symlink.
  *		cr	- credentials of caller.
  *		td	- calling thread (not referenced in the body).
  *
  *	OUT:	vpp	- vnode of the newly created symlink.
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * Timestamps:
  *	dvp - ctime|mtime updated
  *
  * NOTE: "flags" is a local that is initialized to 0 and never assigned,
  * so the FIGNORECASE branches below are never taken in this version.
  */
 /*ARGSUSED*/
 static int
 zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
     cred_t *cr, kthread_t *td)
 {
 	znode_t		*zp, *dzp = VTOZ(dvp);
 	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog;
 	uint64_t	len = strlen(link);
 	int		error;
 	int		zflg = ZNEW;
 	zfs_acl_ids_t	acl_ids;
 	boolean_t	fuid_dirtied;
 	uint64_t	txtype = TX_SYMLINK;
 	int		flags = 0;
 
 	ASSERT(vap->va_type == VLNK);
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(dzp);
 	zilog = zfsvfs->z_log;
 
 	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
 	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		ZFS_EXIT(zfsvfs);
 		return (EILSEQ);
 	}
 	if (flags & FIGNORECASE)
 		zflg |= ZCILOOK;
 
 	if (len > MAXPATHLEN) {
 		ZFS_EXIT(zfsvfs);
 		return (ENAMETOOLONG);
 	}
 
 	/*
 	 * The ACL ids are created once, before "top", and survive a
 	 * transaction restart; every other exit path must free them.
 	 */
 	if ((error = zfs_acl_ids_create(dzp, 0,
 	    vap, cr, NULL, &acl_ids)) != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 top:
 	/*
 	 * Attempt to lock directory; fail if entry already exists.
 	 */
 	error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
 	if (error) {
 		zfs_acl_ids_free(&acl_ids);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/* Assignment inside the condition is intentional. */
 	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
 		zfs_acl_ids_free(&acl_ids);
 		zfs_dirent_unlock(dl);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
 		zfs_acl_ids_free(&acl_ids);
 		zfs_dirent_unlock(dl);
 		ZFS_EXIT(zfsvfs);
 		return (EDQUOT);
 	}
 	tx = dmu_tx_create(zfsvfs->z_os);
 	fuid_dirtied = zfsvfs->z_fuid_dirty;
 	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
 	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 	    ZFS_SA_BASE_ATTR_SIZE + len);
 	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
 	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
 		    acl_ids.z_aclp->z_acl_bytes);
 	}
 	if (fuid_dirtied)
 		zfs_fuid_txhold(zfsvfs, tx);
 	error = dmu_tx_assign(tx, TXG_NOWAIT);
 	if (error) {
 		zfs_dirent_unlock(dl);
 		if (error == ERESTART) {
 			/* acl_ids is kept for the retry. */
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
 		}
 		zfs_acl_ids_free(&acl_ids);
 		dmu_tx_abort(tx);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/*
 	 * Create a new object for the symlink.
 	 * for version 4 ZPL datasets the symlink will be an SA attribute
 	 */
 	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
 
 	if (fuid_dirtied)
 		zfs_fuid_sync(zfsvfs, tx);
 
 	/*
 	 * Store the link text.  Only the SA path assigns "error" here; in
 	 * the other branch it keeps the 0 left by dmu_tx_assign().
 	 */
 	mutex_enter(&zp->z_lock);
 	if (zp->z_is_sa)
 		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
 		    link, len, tx);
 	else
 		zfs_sa_symlink(zp, link, len, tx);
 	mutex_exit(&zp->z_lock);
 
 	zp->z_size = len;
 	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
 	    &zp->z_size, sizeof (zp->z_size), tx);
 	/*
 	 * Insert the new object into the directory.
 	 */
 	(void) zfs_link_create(dl, zp, tx, ZNEW);
 
 	if (flags & FIGNORECASE)
 		txtype |= TX_CI;
 	zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
 	*vpp = ZTOV(zp);
 
 	zfs_acl_ids_free(&acl_ids);
 
 	dmu_tx_commit(tx);
 
 	zfs_dirent_unlock(dl);
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Copy the path stored in the symbolic link vp into the buffer
  * described by the supplied uio structure.
  *
  *	IN:	vp	- vnode of symbolic link.
  *		uio	- structure to contain the link path.
  *		cr	- credentials of caller (not referenced here).
  *		ct	- caller context (not referenced here).
  *
  *	OUT:	uio	- structure containing the link path.
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * Timestamps:
  *	vp - atime updated
  */
 /* ARGSUSED */
 static int
 zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
 {
 	znode_t		*lnk = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = lnk->z_zfsvfs;
 	int		rc;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(lnk);
 
 	/* z_lock is held across the lookup, whichever branch is taken. */
 	mutex_enter(&lnk->z_lock);
 	rc = lnk->z_is_sa ?
 	    sa_lookup_uio(lnk->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), uio) :
 	    zfs_sa_readlink(lnk, uio);
 	mutex_exit(&lnk->z_lock);
 
 	/* Stamped regardless of whether the lookup succeeded. */
 	ZFS_ACCESSTIME_STAMP(zfsvfs, lnk);
 
 	ZFS_EXIT(zfsvfs);
 	return (rc);
 }
 
 /*
  * Insert a new entry into directory tdvp referencing svp.
  *
  *	IN:	tdvp	- Directory to contain new entry.
  *		svp	- vnode of new entry.
  *		name	- name of new entry.
  *		cr	- credentials of caller.
  *		ct	- caller context
  *		flags	- case flags (FIGNORECASE selects a
  *			  case-insensitive lookup and TX_CI logging).
  *
  *	RETURN:	0 if success
  *		EPERM if svp is a directory, lives under the shares
  *		    directory, or the caller fails the ownership check
  *		EXDEV if svp is on another vfs or is a zfsctl node
  *		EILSEQ if name is not valid UTF-8 on a utf8-only fs
  *		EINVAL if svp and tdvp differ in their ZFS_XATTR flag
  *		other error code if failure
  *
  * Timestamps:
  *	tdvp - ctime|mtime updated
  *	 svp - ctime updated
  */
 /* ARGSUSED */
 static int
 zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
     caller_context_t *ct, int flags)
 {
 	znode_t		*dzp = VTOZ(tdvp);
 	znode_t		*tzp, *szp;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog;
 	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	vnode_t		*realvp;
 	int		error;
 	int		zf = ZNEW;
 	uint64_t	parent;
 	uid_t		owner;
 
 	ASSERT(tdvp->v_type == VDIR);
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(dzp);
 	zilog = zfsvfs->z_log;
 
 	/* Use the real vnode for the source if VOP_REALVP() supplies one. */
 	if (VOP_REALVP(svp, &realvp, ct) == 0)
 		svp = realvp;
 
 	/*
 	 * POSIX dictates that we return EPERM here.
 	 * Better choices include ENOTSUP or EISDIR.
 	 */
 	if (svp->v_type == VDIR) {
 		ZFS_EXIT(zfsvfs);
 		return (EPERM);
 	}
 
 	if (svp->v_vfsp != tdvp->v_vfsp || zfsctl_is_node(svp)) {
 		ZFS_EXIT(zfsvfs);
 		return (EXDEV);
 	}
 
 	szp = VTOZ(svp);
 	ZFS_VERIFY_ZP(szp);
 
 	/* Prevent links to .zfs/shares files */
 
 	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
 	    &parent, sizeof (uint64_t))) != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 	if (parent == zfsvfs->z_shares_dir) {
 		ZFS_EXIT(zfsvfs);
 		return (EPERM);
 	}
 
 	if (zfsvfs->z_utf8 && u8_validate(name,
 	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		ZFS_EXIT(zfsvfs);
 		return (EILSEQ);
 	}
 	if (flags & FIGNORECASE)
 		zf |= ZCILOOK;
 
 	/*
 	 * We do not support links between attributes and non-attributes
 	 * because of the potential security risk of creating links
 	 * into "normal" file space in order to circumvent restrictions
 	 * imposed in attribute space.
 	 */
 	if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
 		ZFS_EXIT(zfsvfs);
 		return (EINVAL);
 	}
 
 
 	/*
 	 * A caller who is not the mapped owner of the source must pass
 	 * the secpolicy_basic_link() check.
 	 */
 	owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
 	if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (EPERM);
 	}
 
 	/* Assignment inside the condition is intentional. */
 	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 top:
 	/*
 	 * Attempt to lock directory; fail if entry already exists.
 	 */
 	error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
 	if (error) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
 	zfs_sa_upgrade_txholds(tx, szp);
 	zfs_sa_upgrade_txholds(tx, dzp);
 	error = dmu_tx_assign(tx, TXG_NOWAIT);
 	if (error) {
 		/* Drop the dirent lock, then either wait and retry or fail. */
 		zfs_dirent_unlock(dl);
 		if (error == ERESTART) {
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
 		}
 		dmu_tx_abort(tx);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	error = zfs_link_create(dl, szp, tx, 0);
 
 	if (error == 0) {
 		uint64_t txtype = TX_LINK;
 		if (flags & FIGNORECASE)
 			txtype |= TX_CI;
 		zfs_log_link(zilog, tx, txtype, dzp, szp, name);
 	}
 
 	/* The transaction is committed even if the link create failed. */
 	dmu_tx_commit(tx);
 
 	zfs_dirent_unlock(dl);
 
 	if (error == 0) {
 		vnevent_link(svp, ct);
 	}
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 #ifdef sun
 /*
  * zfs_null_putapage() is used when the file system has been force
  * unmounted. It just drops the pages.
  *
  *	IN:	pp	- page list handed to pvn_write_done() with
  *			  B_INVAL|B_FORCE|B_ERROR.
  *
  * vp, offp, lenp, flags and cr are not referenced; *offp and *lenp are
  * left untouched.
  *
  *	RETURN:	always 0
  */
 /* ARGSUSED */
 static int
 zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
 		size_t *lenp, int flags, cred_t *cr)
 {
 	pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR);
 	return (0);
 }
 
 /*
  * Push a page out to disk, klustering if possible.
  *
  *	IN:	vp	- file to push page to.
  *		pp	- page to push.
  *		flags	- additional flags.
  *		cr	- credentials of caller.
  *
  *	OUT:	offp	- start of range pushed (may be NULL).
  *		lenp	- len of range pushed (may be NULL).
  *
  *	RETURN:	0 if success
  *		EDQUOT if the file's owner or group is over quota
  *		error code if failure
  *
  * NOTE: callers must have locked the page to be pushed.  On
  * exit, the page (and all other pages in the kluster) must be
  * unlocked.
  */
 /* ARGSUSED */
 static int
 zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
 		size_t *lenp, int flags, cred_t *cr)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	dmu_tx_t	*tx;
 	u_offset_t	off, koff;
 	size_t		len, klen;
 	int		err;
 
 	off = pp->p_offset;
 	len = PAGESIZE;
 	/*
 	 * If our blocksize is bigger than the page size, try to kluster
 	 * multiple pages so that we write a full block (thus avoiding
 	 * a read-modify-write).
 	 */
 	if (off < zp->z_size && zp->z_blksz > PAGESIZE) {
 		klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
 		koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0;
 		ASSERT(koff <= zp->z_size);
 		if (koff + klen > zp->z_size)
 			klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE);
 		pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags);
 	}
 	ASSERT3U(btop(len), ==, btopr(len));
 
 	/*
 	 * Can't push pages past end-of-file.
 	 */
 	if (off >= zp->z_size) {
 		/* ignore all pages */
 		err = 0;
 		goto out;
 	} else if (off + len > zp->z_size) {
 		int npages = btopr(zp->z_size - off);
 		page_t *trunc;
 
 		page_list_break(&pp, &trunc, npages);
 		/* ignore pages past end of file */
 		if (trunc)
 			pvn_write_done(trunc, flags);
 		len = zp->z_size - off;
 	}
 
 	if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
 	    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
 		err = EDQUOT;
 		goto out;
 	}
 top:
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_write(tx, zp->z_id, off, len);
 
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	zfs_sa_upgrade_txholds(tx, zp);
 	err = dmu_tx_assign(tx, TXG_NOWAIT);
 	if (err != 0) {
 		if (err == ERESTART) {
 			/* Wait for the next txg and rebuild the transaction. */
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
 		}
 		dmu_tx_abort(tx);
 		goto out;
 	}
 
 	if (zp->z_blksz <= PAGESIZE) {
 		/* Single page: map it and write straight from the mapping. */
 		caddr_t va = zfs_map_page(pp, S_READ);
 		ASSERT3U(len, <=, PAGESIZE);
 		dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
 		zfs_unmap_page(pp, va);
 	} else {
 		err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx);
 	}
 
 	if (err == 0) {
 		uint64_t mtime[2], ctime[2];
 		sa_bulk_attr_t bulk[3];
 		int count = 0;
 
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
 		    &mtime, 16);
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
 		    &ctime, 16);
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
 		    &zp->z_pflags, 8);
 		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
 		    B_TRUE);
 		/*
 		 * Actually push the attributes collected above into the
 		 * SA; without this call the bulk array is built but the
 		 * new mtime/ctime/flags are never written in this tx.
 		 */
 		err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 		ASSERT0(err);
 		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
 	}
 	dmu_tx_commit(tx);
 
 out:
 	/* Release the pages, flagging them B_ERROR if the push failed. */
 	pvn_write_done(pp, (err ? B_ERROR : 0) | flags);
 	if (offp)
 		*offp = off;
 	if (lenp)
 		*lenp = len;
 
 	return (err);
 }
 
 /*
  * Copy the portion of the file indicated from pages into the file.
  * The pages are stored in a page list attached to the files vnode.
  *
  *	IN:	vp	- vnode of file to push page data to.
  *		off	- position in file to put data.
  *		len	- amount of data to write.
  *		flags	- flags to control the operation.
  *		cr	- credentials of caller.
  *		ct	- caller context.
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * Timestamps:
  *	vp - ctime|mtime updated
  */
 /*ARGSUSED*/
 static int
 zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
     caller_context_t *ct)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	page_t		*pp;
 	size_t		io_len;
 	u_offset_t	io_off;
 	uint_t		blksz;
 	rl_t		*rl;
 	int		error = 0;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	/*
 	 * Align this request to the file block size in case we kluster.
 	 * XXX - this can result in pretty aggressive locking, which can
 	 * impact simultaneous read/write access.  One option might be
 	 * to break up long requests (len == 0) into block-by-block
 	 * operations to get narrower locking.
 	 */
 	blksz = zp->z_blksz;
 	if (ISP2(blksz))
 		io_off = P2ALIGN_TYPED(off, blksz, u_offset_t);
 	else
 		io_off = 0;
 	if (len > 0 && ISP2(blksz))
 		io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t);
 	else
 		io_len = 0;
 
 	/*
 	 * io_len == 0 means either the caller passed len == 0 or the
 	 * block size is not a power of two; both push every dirty page
 	 * from io_off onward.
 	 */
 	if (io_len == 0) {
 		/*
 		 * Search the entire vp list for pages >= io_off.
 		 */
 		rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER);
 		error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr);
 		goto out;
 	}
 	rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER);
 
 	if (off > zp->z_size) {
 		/* past end of file */
 		zfs_range_unlock(rl);
 		ZFS_EXIT(zfsvfs);
 		return (0);
 	}
 
 	len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off);
 
 	/*
 	 * "off" is reused as the fixed start of the range while io_off
 	 * advances by io_len, which zfs_putapage() updates to the length
 	 * it actually pushed (or PAGESIZE when the page was skipped).
 	 */
 	for (off = io_off; io_off < off + len; io_off += io_len) {
 		if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
 			pp = page_lookup(vp, io_off,
 			    (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED);
 		} else {
 			pp = page_lookup_nowait(vp, io_off,
 			    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
 		}
 
 		if (pp != NULL && pvn_getdirty(pp, flags)) {
 			int err;
 
 			/*
 			 * Found a dirty page to push
 			 */
 			err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr);
 			/* Remember the last failure but keep going. */
 			if (err)
 				error = err;
 		} else {
 			io_len = PAGESIZE;
 		}
 	}
 out:
 	zfs_range_unlock(rl);
 	/* Synchronous requests and sync=always datasets commit the ZIL. */
 	if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zfsvfs->z_log, zp->z_id);
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 #endif	/* sun */
 
 /*
  * Recycle vp right away when its znode has no SA handle or has been
  * unlinked; otherwise, if the in-core atime is dirty, write it out in
  * its own transaction.  z_teardown_inactive_lock is held as reader for
  * the duration and dropped on every return path.
  *
  *	IN:	vp	- vnode to process.
  *		cr	- credentials of caller (not referenced here).
  *		ct	- caller context (not referenced here).
  */
 /*ARGSUSED*/
 void
 zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	dmu_tx_t	*tx;
 	int		unlinked;
 
 	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
 
 	if (zp->z_sa_hdl == NULL) {
 		/*
 		 * The fs has been unmounted, or we did a
 		 * suspend/resume and this file no longer exists.
 		 */
 		rw_exit(&zfsvfs->z_teardown_inactive_lock);
 		vrecycle(vp);
 		return;
 	}
 
 	/* Sample the unlinked state while holding z_lock. */
 	mutex_enter(&zp->z_lock);
 	unlinked = (zp->z_unlinked != 0);
 	mutex_exit(&zp->z_lock);
 
 	if (unlinked) {
 		/*
 		 * Fast path to recycle a vnode of a removed file.
 		 */
 		rw_exit(&zfsvfs->z_teardown_inactive_lock);
 		vrecycle(vp);
 		return;
 	}
 
 	/* Nothing to flush unless the atime is dirty and still linked. */
 	if (!zp->z_atime_dirty || zp->z_unlinked != 0) {
 		rw_exit(&zfsvfs->z_teardown_inactive_lock);
 		return;
 	}
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	zfs_sa_upgrade_txholds(tx, zp);
 	if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
 		dmu_tx_abort(tx);
 	} else {
 		mutex_enter(&zp->z_lock);
 		(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
 		    (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
 		zp->z_atime_dirty = 0;
 		mutex_exit(&zp->z_lock);
 		dmu_tx_commit(tx);
 	}
 
 	rw_exit(&zfsvfs->z_teardown_inactive_lock);
 }
 
 #ifdef sun
 /*
  * Validate the target offset of a seek.  Directories accept any
  * offset; for everything else the new offset must lie within
  * [0, MAXOFFSET_T].
  *
  *	IN:	vp	- vnode seeking within
  *		ooff	- old file offset (not referenced here)
  *		noffp	- pointer to new file offset
  *		ct	- caller context (not referenced here)
  *
  *	RETURN:	0 if success
  *		EINVAL if new offset invalid
  */
 /* ARGSUSED */
 static int
 zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp,
     caller_context_t *ct)
 {
 	if (vp->v_type == VDIR)
 		return (0);
 
 	if (*noffp < 0 || *noffp > MAXOFFSET_T)
 		return (EINVAL);
 
 	return (0);
 }
 
 /*
  * Screen record-locking requests before handing them to the generic
  * fs_frlock(): a file that is currently mapped and whose mode selects
  * mandatory locking is refused with EAGAIN.
  */
 static int
 zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
     flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	int		mapped_mandatory;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	/*
 	 * We are following the UFS semantics with respect to mapcnt
 	 * here: If we see that the file is mapped already, then we will
 	 * return an error, but we don't worry about races between this
 	 * function and zfs_map().
 	 */
 	mapped_mandatory = (zp->z_mapcnt > 0 && MANDMODE(zp->z_mode));
 	ZFS_EXIT(zfsvfs);
 
 	if (mapped_mandatory)
 		return (EAGAIN);
 
 	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
 }
 
 /*
  * If we can't find a page in the cache, we will create a new page
  * and fill it with file data.  For efficiency, we may try to fill
  * multiple pages at once (klustering) to fill up the supplied page
  * list.  Note that the pages to be filled are held with an exclusive
  * lock to prevent access by other threads while they are being filled.
  *
  *	IN:	vp	- vnode of file to read from.
  *		off	- file offset of the wanted page.
  *		seg	- segment passed through to page creation.
  *		addr	- address passed through to page creation.
  *		plsz	- size in bytes of the page list to fill.
  *		rw	- access type passed to pvn_plist_init().
  *
  *	OUT:	pl	- page list; *pl is set to NULL when no page had
  *			  to be created.
  *
  *	RETURN:	0 if success
  *		error code from dmu_read() if failure (ECKSUM is
  *		reported as EIO)
  */
 static int
 zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
     caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw)
 {
 	znode_t *zp = VTOZ(vp);
 	page_t *pp, *cur_pp;
 	objset_t *os = zp->z_zfsvfs->z_os;
 	u_offset_t io_off, total;
 	size_t io_len;
 	int err;
 
 	if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) {
 		/*
 		 * We only have a single page, don't bother klustering
 		 */
 		io_off = off;
 		io_len = PAGESIZE;
 		pp = page_create_va(vp, io_off, io_len,
 		    PG_EXCL | PG_WAIT, seg, addr);
 	} else {
 		/*
 		 * Try to find enough pages to fill the page list
 		 */
 		pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
 		    &io_len, off, plsz, 0);
 	}
 	if (pp == NULL) {
 		/*
 		 * The page already exists, nothing to do here.
 		 */
 		*pl = NULL;
 		return (0);
 	}
 
 	/*
 	 * Fill the pages in the kluster.
 	 *
 	 * Each page is mapped, filled with PAGESIZE bytes read at io_off
 	 * and unmapped again before moving on to p_next.
 	 */
 	cur_pp = pp;
 	for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
 		caddr_t va;
 
 		ASSERT3U(io_off, ==, cur_pp->p_offset);
 		va = zfs_map_page(cur_pp, S_WRITE);
 		err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
 		    DMU_READ_PREFETCH);
 		zfs_unmap_page(cur_pp, va);
 		if (err) {
 			/* On error, toss the entire kluster */
 			pvn_read_done(pp, B_ERROR);
 			/* convert checksum errors into IO errors */
 			if (err == ECKSUM)
 				err = EIO;
 			return (err);
 		}
 		cur_pp = cur_pp->p_next;
 	}
 
 	/*
 	 * Fill in the page list array from the kluster starting
 	 * from the desired offset `off'.
 	 * NOTE: the page list will always be null terminated.
 	 */
 	pvn_plist_init(pp, pl, plsz, off, io_len, rw);
 	ASSERT(pl == NULL || (*pl)->p_offset == off);
 
 	return (0);
 }
 
 /*
  * Return pointers to the pages for the file region [off, off + len]
  * in the pl array.  If plsz is greater than len, this function may
  * also return page pointers from after the specified region
  * (i.e. the region [off, off + plsz]).  These additional pages are
  * only returned if they are already in the cache, or were created as
  * part of a klustered read.
  *
  *	IN:	vp	- vnode of file to get data from.
  *		off	- position in file to get data from.
  *		len	- amount of data to retrieve.
  *		plsz	- length of provided page list.
  *		seg	- segment to obtain pages for.
  *		addr	- virtual address of fault.
  *		rw	- mode of created pages.
  *		cr	- credentials of caller.
  *		ct	- caller context.
  *
  *	OUT:	protp	- protection mode of created pages.
  *		pl	- list of pages created.
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * Timestamps:
  *	vp - atime updated
  */
 /* ARGSUSED */
 static int
 zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
 	page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
 	enum seg_rw rw, cred_t *cr, caller_context_t *ct)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	page_t		**pl0 = pl;	/* start of list, for error unwind */
 	int		err = 0;
 
 	/* we do our own caching, faultahead is unnecessary */
 	if (pl == NULL)
 		return (0);
 	else if (len > plsz)
 		len = plsz;
 	else
 		len = P2ROUNDUP(len, PAGESIZE);
 	ASSERT(plsz >= len);
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	/* Callers that pass a protp are always told PROT_ALL. */
 	if (protp)
 		*protp = PROT_ALL;
 
 	/*
 	 * Loop through the requested range [off, off + len) looking
 	 * for pages.  If we don't find a page, we will need to create
 	 * a new page and fill it with data from the file.
 	 */
 	while (len > 0) {
 		if (*pl = page_lookup(vp, off, SE_SHARED))
 			*(pl+1) = NULL;
 		else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw))
 			goto out;
 		/*
 		 * Step over every page the lookup or fill placed in the
 		 * list, advancing off/addr and shrinking len/plsz by one
 		 * page each.  len is only decremented while it is still
 		 * positive.
 		 */
 		while (*pl) {
 			ASSERT3U((*pl)->p_offset, ==, off);
 			off += PAGESIZE;
 			addr += PAGESIZE;
 			if (len > 0) {
 				ASSERT3U(len, >=, PAGESIZE);
 				len -= PAGESIZE;
 			}
 			ASSERT3U(plsz, >=, PAGESIZE);
 			plsz -= PAGESIZE;
 			pl++;
 		}
 	}
 
 	/*
 	 * Fill out the page array with any pages already in the cache.
 	 * Stops at the first offset whose non-blocking lookup returns NULL.
 	 */
 	while (plsz > 0 &&
 	    (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) {
 			off += PAGESIZE;
 			plsz -= PAGESIZE;
 	}
 out:
 	if (err) {
 		/*
 		 * Release any pages we have previously locked.
 		 */
 		while (pl > pl0)
 			page_unlock(*--pl);
 	} else {
 		ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
 	}
 
 	/* Terminate the list at the current position on both paths. */
 	*pl = NULL;
 
 	ZFS_EXIT(zfsvfs);
 	return (err);
 }
 
 /*
  * Establish a memory mapping for part of a file.  The call chain between
  * common code, ZFS and the VM system is:
  *
  *	mmap() in common code reaches smmap_common()
  *
  *	smmap_common() calls VOP_MAP(), which lands here for a ZFS vnode
  *
  *	zfs_map() calls as_map() with segvn_create() as the callback
  *
  *	segvn_create() builds the new segment and calls VOP_ADDMAP()
  *
  *	zfs_addmap() then updates z_mapcnt
  *
  * Every refusal below is checked in a fixed order and funnels through a
  * single exit so that ZFS_EXIT() is issued exactly once.
  */
 /*ARGSUSED*/
 static int
 zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
     size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
     caller_context_t *ct)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	segvn_crargs_t	crargs;
 	int		rc;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	/* Writable mappings are refused on immutable/read-only/append-only. */
 	if ((prot & PROT_WRITE) && (zp->z_pflags &
 	    (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
 		rc = EPERM;
 		goto done;
 	}
 
 	/* Quarantined files may not be mapped for read or execute. */
 	if ((prot & (PROT_READ | PROT_EXEC)) &&
 	    (zp->z_pflags & ZFS_AV_QUARANTINED)) {
 		rc = EACCES;
 		goto done;
 	}
 
 	if (vp->v_flag & VNOMAP) {
 		rc = ENOSYS;
 		goto done;
 	}
 
 	if (off < 0 || len > MAXOFFSET_T - off) {
 		rc = ENXIO;
 		goto done;
 	}
 
 	if (vp->v_type != VREG) {
 		rc = ENODEV;
 		goto done;
 	}
 
 	/*
 	 * A file with mandatory locking enabled that currently has
 	 * locks on it cannot be mapped.
 	 */
 	if (MANDMODE(zp->z_mode) && vn_has_flocks(vp)) {
 		rc = EAGAIN;
 		goto done;
 	}
 
 	as_rangelock(as);
 	rc = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
 	if (rc == 0) {
 		/* Describe the segment for segvn_create(). */
 		crargs.vp = vp;
 		crargs.offset = (u_offset_t)off;
 		crargs.type = flags & MAP_TYPE;
 		crargs.prot = prot;
 		crargs.maxprot = maxprot;
 		crargs.cred = cr;
 		crargs.amp = NULL;
 		crargs.flags = flags & ~MAP_TYPE;
 		crargs.szc = 0;
 		crargs.lgrp_mem_policy_flags = 0;
 
 		rc = as_map(as, *addrp, len, segvn_create, &crargs);
 	}
 	as_rangeunlock(as);
 
 done:
 	ZFS_EXIT(zfsvfs);
 	return (rc);
 }
 
 /*
  * Account for a new mapping of the file: the number of pages spanned by
  * len (as computed by btopr()) is atomically added to z_mapcnt.
  * Always returns 0.
  */
 /* ARGSUSED */
 static int
 zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
     size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
     caller_context_t *ct)
 {
 	znode_t *zp = VTOZ(vp);
 	uint64_t npages;
 
 	npages = btopr(len);
 	atomic_add_64(&zp->z_mapcnt, npages);
 
 	return (0);
 }
 
 /*
  * Dirty pages are pushed from zfs_delmap() to obtain a more accurate
  * mtime for the file.  There is no way to detect when mapped data was
  * actually modified, so heuristics are used.  With an explicit msync()
  * the mtime is marked when the last page is pushed.  Without msync(),
  * which is by far the most common case, the sequence is:
  *
  * 	open()
  * 	mmap()
  * 	<modify memory>
  * 	munmap()
  * 	close()
  * 	<time lapse>
  * 	putpage() via fsflush
  *
  * Waiting for fsflush would give a modification time at some arbitrary
  * point in the future.  To avoid that in the common case, pages are
  * flushed whenever a (MAP_SHARED, PROT_WRITE) mapping is torn down.
  *
  * The page count for len is also subtracted from z_mapcnt.
  * Always returns 0; the result of the asynchronous push is ignored.
  */
 /* ARGSUSED */
 static int
 zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
     size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
     caller_context_t *ct)
 {
 	znode_t *zp = VTOZ(vp);
 	uint64_t npages = btopr(len);
 
 	ASSERT3U(zp->z_mapcnt, >=, npages);
 	atomic_add_64(&zp->z_mapcnt, -npages);
 
 	/* Only shared, writable mappings can have dirtied the file. */
 	if (!(flags & MAP_SHARED) || !(prot & PROT_WRITE))
 		return (0);
 
 	if (vn_has_cached_data(vp))
 		(void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct);
 
 	return (0);
 }
 
 /*
  * Free or allocate space in a file.  Only the `F_FREESP' command is
  * accepted.  The name is somewhat misleading, because the command can
  * allocate space as well as free it.
  *
  *	IN:	vp	- vnode of file to free data in.
  *		cmd	- action to take (only F_FREESP supported).
  *		bfp	- section of file to free/alloc.
  *		flag	- current file open mode flags.
  *		offset	- current file offset.
  *		cr	- credentials of caller [UNUSED].
  *		ct	- caller context.
  *
  *	RETURN:	0 if success
  *		EINVAL for an unsupported cmd or a negative length
  *		error code from convoff() or zfs_freesp() otherwise
  *
  * Timestamps:
  *	vp - ctime|mtime updated
  */
 /* ARGSUSED */
 static int
 zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag,
     offset_t offset, cred_t *cr, caller_context_t *ct)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	uint64_t	start, length;
 	int		rc;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	if (cmd != F_FREESP) {
 		rc = EINVAL;
 		goto done;
 	}
 
 	rc = convoff(vp, bfp, 0, offset);
 	if (rc != 0)
 		goto done;
 
 	if (bfp->l_len < 0) {
 		rc = EINVAL;
 		goto done;
 	}
 
 	start = bfp->l_start;
 	length = bfp->l_len;	/* 0 means from start to end of file */
 
 	rc = zfs_freesp(zp, start, length, flag, TRUE);
 
 done:
 	ZFS_EXIT(zfsvfs);
 	return (rc);
 }
 #endif	/* sun */
 
 /*
  * zfs_fid() below writes a zfid_short or zfid_long through a pointer to the
  * caller's struct fid, so both layouts must fit inside struct fid.
  */
 CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
 CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
 
 /*
  * Build a file identifier for vp in *fidp.
  *
  * The object number and the (32-bit truncated) generation number are
  * stored one byte at a time, least significant byte first.  When the
  * filesystem's z_parent differs from the filesystem itself, the long
  * form is produced and additionally carries the objset id and a zeroed
  * objset generation.
  *
  *	RETURN:	0 if success
  *		error code from sa_lookup() if the generation cannot be read
  */
 /*ARGSUSED*/
 static int
 zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	uint64_t	object = zp->z_id;
 	uint64_t	gen64;
 	uint32_t	gen;
 	zfid_short_t	*zfid = (zfid_short_t *)fidp;
 	int		size, i, error;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
 	    &gen64, sizeof (uint64_t));
 	if (error != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/* Must have a non-zero generation number to distinguish from .zfs */
 	gen = (uint32_t)gen64;
 	if (gen == 0)
 		gen = 1;
 
 	size = (zfsvfs->z_parent == zfsvfs) ? SHORT_FID_LEN : LONG_FID_LEN;
 	fidp->fid_len = size;
 	zfid->zf_len = size;
 
 	for (i = 0; i < sizeof (zfid->zf_object); i++)
 		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
 
 	for (i = 0; i < sizeof (zfid->zf_gen); i++)
 		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
 
 	if (size == LONG_FID_LEN) {
 		zfid_long_t	*zlfid = (zfid_long_t *)fidp;
 		uint64_t	setid = dmu_objset_id(zfsvfs->z_os);
 
 		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
 			zlfid->zf_setid[i] = (uint8_t)(setid >> (8 * i));
 
 		/* XXX - this should be the generation number for the objset */
 		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
 			zlfid->zf_setgen[i] = 0;
 	}
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /*
  * Report a configurable pathname limit or capability for vp.
  *
  *	IN:	vp	- vnode being queried.
  *		cmd	- _PC_* selector.
  *		cr	- credentials of caller (not referenced here).
  *		ct	- caller context (not referenced here).
  *
  *	OUT:	valp	- value for the selector.
  *
  *	RETURN:	0 if success
  *		EOPNOTSUPP for a selector not handled below
  *		error code from zfs_dirent_lock() for _PC_XATTR_EXISTS
  *
  * Several selectors are only compiled in when `sun' is defined.
  */
 static int
 zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
     caller_context_t *ct)
 {
 	znode_t		*zp, *xzp;
 	zfsvfs_t	*zfsvfs;
 	zfs_dirlock_t	*dl;
 	int		error;
 
 	switch (cmd) {
 	case _PC_LINK_MAX:
 		*valp = INT_MAX;
 		return (0);
 
 	case _PC_FILESIZEBITS:
 		*valp = 64;
 		return (0);
 #ifdef sun
 	case _PC_XATTR_EXISTS:
 		/*
 		 * Look up the extended attribute directory; report 1 only
 		 * if it exists and is not empty.
 		 */
 		zp = VTOZ(vp);
 		zfsvfs = zp->z_zfsvfs;
 		ZFS_ENTER(zfsvfs);
 		ZFS_VERIFY_ZP(zp);
 		*valp = 0;
 		error = zfs_dirent_lock(&dl, zp, "", &xzp,
 		    ZXATTR | ZEXISTS | ZSHARED, NULL, NULL);
 		if (error == 0) {
 			zfs_dirent_unlock(dl);
 			if (!zfs_dirempty(xzp))
 				*valp = 1;
 			VN_RELE(ZTOV(xzp));
 		} else if (error == ENOENT) {
 			/*
 			 * If there aren't extended attributes, it's the
 			 * same as having zero of them.
 			 */
 			error = 0;
 		}
 		ZFS_EXIT(zfsvfs);
 		return (error);
 
 	case _PC_SATTR_ENABLED:
 	case _PC_SATTR_EXISTS:
 		/* Requires the VFS feature and a regular file or directory. */
 		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
 		    (vp->v_type == VREG || vp->v_type == VDIR);
 		return (0);
 
 	case _PC_ACCESS_FILTERING:
 		/* Requires the VFS feature and a directory. */
 		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
 		    vp->v_type == VDIR;
 		return (0);
 
 	case _PC_ACL_ENABLED:
 		*valp = _ACL_ACE_ENABLED;
 		return (0);
 #endif	/* sun */
 	case _PC_MIN_HOLE_SIZE:
 		*valp = (int)SPA_MINBLOCKSIZE;
 		return (0);
 #ifdef sun
 	case _PC_TIMESTAMP_RESOLUTION:
 		/* nanosecond timestamp resolution */
 		*valp = 1L;
 		return (0);
 #endif	/* sun */
 	case _PC_ACL_EXTENDED:
 		*valp = 0;
 		return (0);
 
 	case _PC_ACL_NFS4:
 		*valp = 1;
 		return (0);
 
 	case _PC_ACL_PATH_MAX:
 		*valp = ACL_MAX_ENTRIES;
 		return (0);
 
 	default:
 		return (EOPNOTSUPP);
 	}
 }
 
 /*
  * Fetch the ACL of vp into *vsecp by way of zfs_getacl().  The ACL
  * check is skipped when ATTR_NOACLCHECK is set in flag.  Returns the
  * result of zfs_getacl().
  */
 /*ARGSUSED*/
 static int
 zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
     caller_context_t *ct)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	boolean_t	skipaclchk = B_FALSE;
 	int		rc;
 
 	if (flag & ATTR_NOACLCHECK)
 		skipaclchk = B_TRUE;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	rc = zfs_getacl(zp, vsecp, skipaclchk, cr);
 
 	ZFS_EXIT(zfsvfs);
 	return (rc);
 }
 
 /*
  * Replace the ACL of vp with *vsecp by way of zfs_setacl().  The ACL
  * check is skipped when ATTR_NOACLCHECK is set in flag.  When the
  * objset is configured with ZFS_SYNC_ALWAYS the intent log is committed
  * before returning, whatever zfs_setacl() returned.  Returns the result
  * of zfs_setacl().
  */
 /*ARGSUSED*/
 int
 zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
     caller_context_t *ct)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	zilog_t		*zilog = zfsvfs->z_log;
 	boolean_t	skipaclchk = B_FALSE;
 	int		rc;
 
 	if (flag & ATTR_NOACLCHECK)
 		skipaclchk = B_TRUE;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	rc = zfs_setacl(zp, vsecp, skipaclchk, cr);
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	ZFS_EXIT(zfsvfs);
 	return (rc);
 }
 
 #ifdef sun
 /*
  * Tunable, both must be a power of 2.
  *
  * zcr_blksz_min: the smallest read we may consider to loan out an arcbuf
  * zcr_blksz_max: if set to less than the file block size, allow loaning out of
  *                an arcbuf for a partial block read
  *
  * zfs_reqzcbuf() clamps the file block size into [zcr_blksz_min,
  * zcr_blksz_max] on its UIO_READ path before comparing it with the
  * request size.
  */
 int zcr_blksz_min = (1 << 10);	/* 1K */
 int zcr_blksz_max = (1 << 17);	/* 128K */
 
 /*
  * Prepare a zero-copy request described by xuio.
  *
  *	IN:	vp	- vnode of the file being read or written.
  *		ioflag	- UIO_READ or UIO_WRITE.
  *		xuio	- extended uio; must be of type UIOTYPE_ZEROCOPY.
  *		cr	- credentials of caller (not referenced here).
  *		ct	- caller context (not referenced here).
  *
  *	RETURN:	0 if the request was accepted; the uio is then flagged
  *		  UIO_XUIO and the direction is recorded in the xuio
  *		EINVAL if the xuio type, direction, size or block size is
  *		  not acceptable
  *
  * For UIO_WRITE, arc buffers of the maximum block size are requested and
  * added to the xuio: an optional partial leading buffer (preamble), the
  * whole blocks, and an optional partial trailing buffer (postamble).
  * For UIO_READ only eligibility is checked; no buffers are requested here.
  */
 /*ARGSUSED*/
 static int
 zfs_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr,
     caller_context_t *ct)
 {
 	znode_t	*zp = VTOZ(vp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	int max_blksz = zfsvfs->z_max_blksz;
 	uio_t *uio = &xuio->xu_uio;
 	ssize_t size = uio->uio_resid;
 	offset_t offset = uio->uio_loffset;
 	int blksz;
 	int fullblk, i;
 	arc_buf_t *abuf;
 	ssize_t maxsize;
 	int preamble, postamble;
 
 	if (xuio->xu_type != UIOTYPE_ZEROCOPY)
 		return (EINVAL);
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 	switch (ioflag) {
 	case UIO_WRITE:
 		/*
 		 * Loan out an arc_buf for write if write size is bigger than
 		 * max_blksz, and the file's block size is also max_blksz.
 		 */
 		blksz = max_blksz;
 		if (size < blksz || zp->z_blksz != blksz) {
 			ZFS_EXIT(zfsvfs);
 			return (EINVAL);
 		}
 		/*
 		 * Caller requests buffers for write before knowing where the
 		 * write offset might be (e.g. NFS TCP write).
 		 */
 		if (offset == -1) {
 			preamble = 0;
 		} else {
 			/*
 			 * preamble becomes the number of bytes from offset
 			 * up to the next block boundary (0 if aligned).
 			 */
 			preamble = P2PHASE(offset, blksz);
 			if (preamble) {
 				preamble = blksz - preamble;
 				size -= preamble;
 			}
 		}
 
 		/* postamble is what remains after the last whole block. */
 		postamble = P2PHASE(size, blksz);
 		size -= postamble;
 
 		fullblk = size / blksz;
 		(void) dmu_xuio_init(xuio,
 		    (preamble != 0) + fullblk + (postamble != 0));
 		DTRACE_PROBE3(zfs_reqzcbuf_align, int, preamble,
 		    int, postamble, int,
 		    (preamble != 0) + fullblk + (postamble != 0));
 
 		/*
 		 * Have to fix iov base/len for partial buffers.  They
 		 * currently represent full arc_buf's.
 		 */
 		if (preamble) {
 			/* data begins in the middle of the arc_buf */
 			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
 			    blksz);
 			ASSERT(abuf);
 			(void) dmu_xuio_add(xuio, abuf,
 			    blksz - preamble, preamble);
 		}
 
 		for (i = 0; i < fullblk; i++) {
 			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
 			    blksz);
 			ASSERT(abuf);
 			(void) dmu_xuio_add(xuio, abuf, 0, blksz);
 		}
 
 		if (postamble) {
 			/* data ends in the middle of the arc_buf */
 			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
 			    blksz);
 			ASSERT(abuf);
 			(void) dmu_xuio_add(xuio, abuf, 0, postamble);
 		}
 		break;
 	case UIO_READ:
 		/*
 		 * Loan out an arc_buf for read if the read size is larger than
 		 * the current file block size.  Block alignment is not
 		 * considered.  Partial arc_buf will be loaned out for read.
 		 */
 		blksz = zp->z_blksz;
 		if (blksz < zcr_blksz_min)
 			blksz = zcr_blksz_min;
 		if (blksz > zcr_blksz_max)
 			blksz = zcr_blksz_max;
 		/* avoid potential complexity of dealing with it */
 		if (blksz > max_blksz) {
 			ZFS_EXIT(zfsvfs);
 			return (EINVAL);
 		}
 
 		/* Never consider more than what is left before z_size. */
 		maxsize = zp->z_size - uio->uio_loffset;
 		if (size > maxsize)
 			size = maxsize;
 
 		if (size < blksz || vn_has_cached_data(vp)) {
 			ZFS_EXIT(zfsvfs);
 			return (EINVAL);
 		}
 		break;
 	default:
 		ZFS_EXIT(zfsvfs);
 		return (EINVAL);
 	}
 
 	uio->uio_extflg = UIO_XUIO;
 	XUIO_XUZC_RW(xuio) = ioflag;
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /*
  * Give back the arc buffers still attached to a zero-copy xuio and
  * tear the xuio down.  Slots are visited from the highest index to the
  * lowest.  Always returns 0.
  */
 /*ARGSUSED*/
 static int
 zfs_retzcbuf(vnode_t *vp, xuio_t *xuio, cred_t *cr, caller_context_t *ct)
 {
 	int ioflag = XUIO_XUZC_RW(xuio);
 	int idx;
 
 	ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY);
 
 	for (idx = dmu_xuio_cnt(xuio) - 1; idx >= 0; idx--) {
 		arc_buf_t *abuf = dmu_xuio_arcbuf(xuio, idx);
 
 		/*
 		 * An empty slot must belong to a write buffer that
 		 * zfs_write() has already returned.
 		 */
 		if (abuf != NULL)
 			dmu_return_arcbuf(abuf);
 		ASSERT(abuf != NULL || ioflag == UIO_WRITE);
 	}
 
 	dmu_xuio_fini(xuio);
 	return (0);
 }
 
 /*
  * These two stubs are declared up front without a parameter list on
  * purpose: the compiler then treats them as "old style" declarations,
  * so storing them in the differently-typed slots of the operation
  * templates that follow does not produce type mismatch errors.
  */
 static int zfs_inval();
 static int zfs_isdir();
 
 /* Stub operation: unconditionally fails with EINVAL. */
 static int
 zfs_inval()
 {
 	int error = EINVAL;
 
 	return (error);
 }
 
 /* Stub operation: unconditionally fails with EISDIR. */
 static int
 zfs_isdir()
 {
 	int error = EISDIR;
 
 	return (error);
 }
 /*
  * Directory vnode operations template
  *
  * READ and WRITE are routed to zfs_isdir(), which returns EISDIR.
  * The table is terminated by a NULL/NULL pair.
  */
 vnodeops_t *zfs_dvnodeops;
 const fs_operation_def_t zfs_dvnodeops_template[] = {
 	VOPNAME_OPEN,		{ .vop_open = zfs_open },
 	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
 	VOPNAME_READ,		{ .error = zfs_isdir },
 	VOPNAME_WRITE,		{ .error = zfs_isdir },
 	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
 	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
 	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
 	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
 	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
 	VOPNAME_CREATE,		{ .vop_create = zfs_create },
 	VOPNAME_REMOVE,		{ .vop_remove = zfs_remove },
 	VOPNAME_LINK,		{ .vop_link = zfs_link },
 	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
 	VOPNAME_MKDIR,		{ .vop_mkdir = zfs_mkdir },
 	VOPNAME_RMDIR,		{ .vop_rmdir = zfs_rmdir },
 	VOPNAME_READDIR,	{ .vop_readdir = zfs_readdir },
 	VOPNAME_SYMLINK,	{ .vop_symlink = zfs_symlink },
 	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
 	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
 	VOPNAME_FID,		{ .vop_fid = zfs_fid },
 	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
 	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
 	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
 	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
 	VOPNAME_VNEVENT, 	{ .vop_vnevent = fs_vnevent_support },
 	NULL,			NULL
 };
 
 /*
  * Regular file vnode operations template
  *
  * In addition to the basic file operations this table wires up record
  * locking, space management, the page and mapping operations, and the
  * zero-copy buffer request/return pair.
  * The table is terminated by a NULL/NULL pair.
  */
 vnodeops_t *zfs_fvnodeops;
 const fs_operation_def_t zfs_fvnodeops_template[] = {
 	VOPNAME_OPEN,		{ .vop_open = zfs_open },
 	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
 	VOPNAME_READ,		{ .vop_read = zfs_read },
 	VOPNAME_WRITE,		{ .vop_write = zfs_write },
 	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
 	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
 	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
 	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
 	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
 	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
 	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
 	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
 	VOPNAME_FID,		{ .vop_fid = zfs_fid },
 	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
 	VOPNAME_FRLOCK,		{ .vop_frlock = zfs_frlock },
 	VOPNAME_SPACE,		{ .vop_space = zfs_space },
 	VOPNAME_GETPAGE,	{ .vop_getpage = zfs_getpage },
 	VOPNAME_PUTPAGE,	{ .vop_putpage = zfs_putpage },
 	VOPNAME_MAP,		{ .vop_map = zfs_map },
 	VOPNAME_ADDMAP,		{ .vop_addmap = zfs_addmap },
 	VOPNAME_DELMAP,		{ .vop_delmap = zfs_delmap },
 	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
 	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
 	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
 	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
 	VOPNAME_REQZCBUF, 	{ .vop_reqzcbuf = zfs_reqzcbuf },
 	VOPNAME_RETZCBUF, 	{ .vop_retzcbuf = zfs_retzcbuf },
 	NULL,			NULL
 };
 
 /*
  * Symbolic link vnode operations template
  *
  * Only attribute, access, rename, readlink, inactive, fid, pathconf and
  * vnevent operations are supplied for symbolic links.
  * The table is terminated by a NULL/NULL pair.
  */
 vnodeops_t *zfs_symvnodeops;
 const fs_operation_def_t zfs_symvnodeops_template[] = {
 	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
 	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
 	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
 	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
 	VOPNAME_READLINK,	{ .vop_readlink = zfs_readlink },
 	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
 	VOPNAME_FID,		{ .vop_fid = zfs_fid },
 	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
 	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
 	NULL,			NULL
 };
 
 /*
  * special share hidden files vnode operations template
  *
  * Supplies getattr, access, inactive, fid, pathconf, the security
  * attribute pair and vnevent; no read, write or setattr entry is listed.
  * The table is terminated by a NULL/NULL pair.
  */
 vnodeops_t *zfs_sharevnodeops;
 const fs_operation_def_t zfs_sharevnodeops_template[] = {
 	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
 	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
 	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
 	VOPNAME_FID,		{ .vop_fid = zfs_fid },
 	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
 	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
 	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
 	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
 	NULL,			NULL
 };
 
 /*
  * Extended attribute directory vnode operations template
  *	This template is identical to the directory vnodes
  *	operation template except for restricted operations:
  *		VOP_MKDIR()
  *		VOP_SYMLINK()
  *	Both restricted operations are routed to zfs_inval(), which
  *	returns EINVAL.  Unlike the directory template, this table
  *	also lists no READ or WRITE entry.
  * Note that there are other restrictions embedded in:
  *	zfs_create()	- restrict type to VREG
  *	zfs_link()	- no links into/out of attribute space
  *	zfs_rename()	- no moves into/out of attribute space
  */
 vnodeops_t *zfs_xdvnodeops;
 const fs_operation_def_t zfs_xdvnodeops_template[] = {
 	VOPNAME_OPEN,		{ .vop_open = zfs_open },
 	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
 	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
 	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
 	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
 	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
 	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
 	VOPNAME_CREATE,		{ .vop_create = zfs_create },
 	VOPNAME_REMOVE,		{ .vop_remove = zfs_remove },
 	VOPNAME_LINK,		{ .vop_link = zfs_link },
 	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
 	VOPNAME_MKDIR,		{ .error = zfs_inval },
 	VOPNAME_RMDIR,		{ .vop_rmdir = zfs_rmdir },
 	VOPNAME_READDIR,	{ .vop_readdir = zfs_readdir },
 	VOPNAME_SYMLINK,	{ .error = zfs_inval },
 	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
 	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
 	VOPNAME_FID,		{ .vop_fid = zfs_fid },
 	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
 	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
 	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
 	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
 	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
 	NULL,			NULL
 };
 
 /*
  * Error vnode operations template
  *
  * Only the inactive and pathconf operations are supplied.
  * The table is terminated by a NULL/NULL pair.
  */
 vnodeops_t *zfs_evnodeops;
 const fs_operation_def_t zfs_evnodeops_template[] = {
 	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
 	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
 	NULL,			NULL
 };
 #endif	/* sun */
 
 /*
  * Translate IO_* request flags into the F* file flags handed to
  * zfs_read() and zfs_write():
  *
  *	IO_APPEND	-> FAPPEND
  *	IO_NDELAY	-> FNONBLOCK
  *	IO_SYNC		-> FSYNC | FDSYNC | FRSYNC
  *
  * Bits other than these three are ignored.
  */
 static int
 ioflags(int ioflags)
 {
 	int flags;
 
 	flags = (ioflags & IO_APPEND) ? FAPPEND : 0;
 	flags |= (ioflags & IO_NDELAY) ? FNONBLOCK : 0;
 	flags |= (ioflags & IO_SYNC) ? (FSYNC | FDSYNC | FRSYNC) : 0;
 
 	return (flags);
 }
 
 /*
  * Fill the pages in m[] with file data read through the DMU.
  *
  *	IN:	vp	- vnode of the file.
  *		m	- array of pages supplied by the caller.
  *		count	- size of the request in bytes (converted to a
  *			  page count below).
  *		reqpage	- index in m[] of the page that must be filled.
  *
  *	RETURN:	zfs_vm_pagerret_ok if the requested page is valid
  *		zfs_vm_pagerret_bad if the requested page lies at or
  *		  beyond the size recorded in the VM object
  *		zfs_vm_pagerret_error if dmu_read() failed
  *
  * When more than one page is supplied and the file block size exceeds
  * PAGESIZE, the read window is widened to the block containing the
  * requested page (clipped to the pages in m[]); otherwise only the
  * requested page is read.  Pages outside the window are freed.
  */
 static int
 zfs_getpages(struct vnode *vp, vm_page_t *m, int count, int reqpage)
 {
 	znode_t *zp = VTOZ(vp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	objset_t *os = zp->z_zfsvfs->z_os;
 	vm_page_t mfirst, mlast, mreq;
 	vm_object_t object;
 	caddr_t va;
 	struct sf_buf *sf;
 	off_t startoff, endoff;
 	int i, error;
 	vm_pindex_t reqstart, reqend;
 	int pcount, lsize, reqsize, size;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	pcount = OFF_TO_IDX(round_page(count));
 	mreq = m[reqpage];
 	object = mreq->object;
 	error = 0;
 
 	KASSERT(vp->v_object == object, ("mismatching object"));
 
 	if (pcount > 1 && zp->z_blksz > PAGESIZE) {
 		/*
 		 * Compute [reqstart, reqstart + reqsize) as indices into
 		 * m[] covering the file block around the requested page,
 		 * clipped to the first and last supplied page.
 		 */
 		startoff = rounddown(IDX_TO_OFF(mreq->pindex), zp->z_blksz);
 		reqstart = OFF_TO_IDX(round_page(startoff));
 		if (reqstart < m[0]->pindex)
 			reqstart = 0;
 		else
 			reqstart = reqstart - m[0]->pindex;
 		endoff = roundup(IDX_TO_OFF(mreq->pindex) + PAGE_SIZE,
 		    zp->z_blksz);
 		reqend = OFF_TO_IDX(trunc_page(endoff)) - 1;
 		if (reqend > m[pcount - 1]->pindex)
 			reqend = m[pcount - 1]->pindex;
 		reqsize = reqend - m[reqstart]->pindex + 1;
 		KASSERT(reqstart <= reqpage && reqpage < reqstart + reqsize,
 		    ("reqpage beyond [reqstart, reqstart + reqsize[ bounds"));
 	} else {
 		reqstart = reqpage;
 		reqsize = 1;
 	}
 	mfirst = m[reqstart];
 	mlast = m[reqstart + reqsize - 1];
 
 	zfs_vmobject_wlock(object);
 
 	/* Free the supplied pages that fall outside the read window. */
 	for (i = 0; i < reqstart; i++) {
 		vm_page_lock(m[i]);
 		vm_page_free(m[i]);
 		vm_page_unlock(m[i]);
 	}
 	for (i = reqstart + reqsize; i < pcount; i++) {
 		vm_page_lock(m[i]);
 		vm_page_free(m[i]);
 		vm_page_unlock(m[i]);
 	}
 
 	/*
 	 * Single-page request whose page already has valid bits: zero
 	 * whatever is not valid and return without reading.
 	 */
 	if (mreq->valid && reqsize == 1) {
 		if (mreq->valid != VM_PAGE_BITS_ALL)
 			vm_page_zero_invalid(mreq, TRUE);
 		zfs_vmobject_wunlock(object);
 		ZFS_EXIT(zfsvfs);
 		return (zfs_vm_pagerret_ok);
 	}
 
 	PCPU_INC(cnt.v_vnodein);
 	PCPU_ADD(cnt.v_vnodepgsin, reqsize);
 
 	/*
 	 * The requested page starts at or past the object size: free the
 	 * other pages of the window and report a bad request.
 	 */
 	if (IDX_TO_OFF(mreq->pindex) >= object->un_pager.vnp.vnp_size) {
 		for (i = reqstart; i < reqstart + reqsize; i++) {
 			if (i != reqpage) {
 				vm_page_lock(m[i]);
 				vm_page_free(m[i]);
 				vm_page_unlock(m[i]);
 			}
 		}
 		zfs_vmobject_wunlock(object);
 		ZFS_EXIT(zfsvfs);
 		return (zfs_vm_pagerret_bad);
 	}
 
 	/* lsize: number of bytes to read into the last page of the window. */
 	lsize = PAGE_SIZE;
 	if (IDX_TO_OFF(mlast->pindex) + lsize > object->un_pager.vnp.vnp_size)
 		lsize = object->un_pager.vnp.vnp_size - IDX_TO_OFF(mlast->pindex);
 
 	/* The object lock is dropped for the duration of the reads. */
 	zfs_vmobject_wunlock(object);
 
 	for (i = reqstart; i < reqstart + reqsize; i++) {
 		size = PAGE_SIZE;
 		if (i == (reqstart + reqsize - 1))
 			size = lsize;
 		va = zfs_map_page(m[i], &sf);
 		error = dmu_read(os, zp->z_id, IDX_TO_OFF(m[i]->pindex),
 		    size, va, DMU_READ_PREFETCH);
 		/* Zero the tail of a short (last) page. */
 		if (size != PAGE_SIZE)
 			bzero(va + size, PAGE_SIZE - size);
 		zfs_unmap_page(sf);
 		if (error != 0)
 			break;
 	}
 
 	zfs_vmobject_wlock(object);
 
 	/*
 	 * Mark the window valid only if every read succeeded, and hand
 	 * every page except the requested one to
 	 * vm_page_readahead_finish().
 	 */
 	for (i = reqstart; i < reqstart + reqsize; i++) {
 		if (!error)
 			m[i]->valid = VM_PAGE_BITS_ALL;
 		KASSERT(m[i]->dirty == 0, ("zfs_getpages: page %p is dirty", m[i]));
 		if (i != reqpage)
 			vm_page_readahead_finish(m[i]);
 	}
 
 	zfs_vmobject_wunlock(object);
 
 	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
 	ZFS_EXIT(zfsvfs);
 	return (error ? zfs_vm_pagerret_error : zfs_vm_pagerret_ok);
 }
 
 static int
 zfs_freebsd_getpages(ap)
 	struct vop_getpages_args /* {
 		struct vnode *a_vp;
 		vm_page_t *a_m;
 		int a_count;
 		int a_reqpage;
 		vm_ooffset_t a_offset;
 	} */ *ap;
 {
 
 	return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_reqpage));
 }
 
 static int
 zfs_freebsd_bmap(ap)
 	struct vop_bmap_args /* {
 		struct vnode *a_vp;
 		daddr_t  a_bn;
 		struct bufobj **a_bop;
 		daddr_t *a_bnp;
 		int *a_runp;
 		int *a_runb;
 	} */ *ap;
 {
 
 	if (ap->a_bop != NULL)
 		*ap->a_bop = &ap->a_vp->v_bufobj;
 	if (ap->a_bnp != NULL)
 		*ap->a_bnp = ap->a_bn;
 	if (ap->a_runp != NULL)
 		*ap->a_runp = 0;
 	if (ap->a_runb != NULL)
 		*ap->a_runb = 0;
 
 	return (0);
 }
 
 /*
  * VOP_OPEN entry point: opens the vnode through zfs_open() and, on
  * success, creates the backing VM object sized to the znode's z_size.
  * Returns the result of zfs_open().
  */
 static int
 zfs_freebsd_open(ap)
 	struct vop_open_args /* {
 		struct vnode *a_vp;
 		int a_mode;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	vnode_t	*vp = ap->a_vp;
 	znode_t *zp = VTOZ(vp);
 	int rc;
 
 	rc = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL);
 	if (rc != 0)
 		return (rc);
 
 	vnode_create_vobject(vp, zp->z_size, ap->a_td);
 	return (rc);
 }
 
 /*
  * VOP_CLOSE entry point: forwards to zfs_close() with a count of 1, an
  * offset of 0 and no caller context.
  */
 static int
 zfs_freebsd_close(ap)
 	struct vop_close_args /* {
 		struct vnode *a_vp;
 		int  a_fflag;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	int rc;
 
 	rc = zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred, NULL);
 	return (rc);
 }
 
 static int
 zfs_freebsd_ioctl(ap)
 	struct vop_ioctl_args /* {
 		struct vnode *a_vp;
 		u_long a_command;
 		caddr_t a_data;
 		int a_fflag;
 		struct ucred *cred;
 		struct thread *td;
 	} */ *ap;
 {
 
 	return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
 	    ap->a_fflag, ap->a_cred, NULL, NULL));
 }
 
 /*
  * VOP_READ entry point: converts the IO_* flags with ioflags() and
  * forwards to zfs_read() with no caller context.
  */
 static int
 zfs_freebsd_read(ap)
 	struct vop_read_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		int a_ioflag;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	int fflags = ioflags(ap->a_ioflag);
 
 	return (zfs_read(ap->a_vp, ap->a_uio, fflags, ap->a_cred, NULL));
 }
 
 /*
  * VOP_WRITE entry point: converts the IO_* flags with ioflags() and
  * forwards to zfs_write() with no caller context.
  */
 static int
 zfs_freebsd_write(ap)
 	struct vop_write_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		int a_ioflag;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	int fflags = ioflags(ap->a_ioflag);
 
 	return (zfs_write(ap->a_vp, ap->a_uio, fflags, ap->a_cred, NULL));
 }
 
 /*
  * VOP_ACCESS entry point.  The requested access mode is checked in
  * three stages, stopping at the first failure:
  *
  *	1. VREAD, VWRITE, VEXEC and VAPPEND bits go to zfs_access().
  *	2. Any remaining bits (such as VADMIN) go to vaccess().
  *	3. A VEXEC request on a non-directory additionally requires at
  *	   least one execute bit in the file mode, else EACCES.
  */
 static int
 zfs_freebsd_access(ap)
 	struct vop_access_args /* {
 		struct vnode *a_vp;
 		accmode_t a_accmode;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	vnode_t *vp = ap->a_vp;
 	znode_t *zp = VTOZ(vp);
 	accmode_t accmode;
 	int error;
 
 	/* Stage 1: the bits ZFS itself understands. */
 	accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
 	if (accmode != 0) {
 		error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL);
 		if (error != 0)
 			return (error);
 	}
 
 	/* Stage 2: everything else is left to vaccess(). */
 	accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
 	if (accmode != 0) {
 		error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
 		    zp->z_gid, accmode, ap->a_cred, NULL);
 		if (error != 0)
 			return (error);
 	}
 
 	/* Stage 3: executing a non-directory needs some execute bit. */
 	if ((ap->a_accmode & VEXEC) == 0 || vp->v_type == VDIR)
 		return (0);
 	if ((zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0)
 		return (EACCES);
 
 	return (0);
 }
 
 /*
  * VOP_LOOKUP entry point.  The component name is copied into a local
  * NUL-terminated buffer of NAME_MAX + 1 bytes (the copy is bounded by
  * both cn_namelen + 1 and the buffer size) and then passed to
  * zfs_lookup().
  */
 static int
 zfs_freebsd_lookup(ap)
 	struct vop_lookup_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct componentname *cnp = ap->a_cnp;
 	char nm[NAME_MAX + 1];
 	int rc;
 
 	ASSERT(cnp->cn_namelen < sizeof(nm));
 	strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));
 
 	rc = zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
 	    cnp->cn_cred, cnp->cn_thread, 0);
 	return (rc);
 }
 
 /*
  * VOP_CREATE entry point.  Initializes the attribute mask of the
  * supplied vattr, restricts the creation mode to the ALLPERMS bits and
  * forwards to zfs_create() in non-exclusive mode.
  */
 static int
 zfs_freebsd_create(ap)
 	struct vop_create_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 	} */ *ap;
 {
 	struct componentname *cnp = ap->a_cnp;
 	vattr_t *vap = ap->a_vap;
 	int mode, rc;
 
 	ASSERT(cnp->cn_flags & SAVENAME);
 
 	vattr_init_mask(vap);
 	mode = vap->va_mode & ALLPERMS;
 
 	rc = zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
 	    ap->a_vpp, cnp->cn_cred, cnp->cn_thread);
 	return (rc);
 }
 
 static int
 zfs_freebsd_remove(ap)
 	struct vop_remove_args /* {
 		struct vnode *a_dvp;
 		struct vnode *a_vp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 
 	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
 
 	return (zfs_remove(ap->a_dvp, ap->a_cnp->cn_nameptr,
 	    ap->a_cnp->cn_cred, NULL, 0));
 }
 
 static int
 zfs_freebsd_mkdir(ap)
 	struct vop_mkdir_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 	} */ *ap;
 {
 	vattr_t *vap = ap->a_vap;
 
 	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
 
 	vattr_init_mask(vap);
 
 	return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
 	    ap->a_cnp->cn_cred, NULL, 0, NULL));
 }
 
 /*
  * VOP_RMDIR entry point: forwards the parent directory vnode, component
  * name and credentials to zfs_rmdir().  a_vp is not used.
  */
 static int
 zfs_freebsd_rmdir(ap)
 	struct vop_rmdir_args /* {
 		struct vnode *a_dvp;
 		struct vnode *a_vp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	int rc;
 
 	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
 
 	rc = zfs_rmdir(ap->a_dvp, ap->a_cnp->cn_nameptr, NULL,
 	    ap->a_cnp->cn_cred, NULL, 0);
 	return (rc);
 }
 
 /*
  * VOP_READDIR entry point: passes every argument straight through to
  * zfs_readdir().
  */
 static int
 zfs_freebsd_readdir(ap)
 	struct vop_readdir_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		struct ucred *a_cred;
 		int *a_eofflag;
 		int *a_ncookies;
 		u_long **a_cookies;
 	} */ *ap;
 {
 	int rc;
 
 	rc = zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
 	    ap->a_ncookies, ap->a_cookies);
 	return (rc);
 }
 
 /*
  * VOP_FSYNC entry point: runs vop_stdfsync() first (its return value is
  * not examined) and then returns the result of zfs_fsync(), which is
  * called with the credentials of the requesting thread.
  */
 static int
 zfs_freebsd_fsync(ap)
 	struct vop_fsync_args /* {
 		struct vnode *a_vp;
 		int a_waitfor;
 		struct thread *a_td;
 	} */ *ap;
 {
 	int rc;
 
 	vop_stdfsync(ap);
 
 	rc = zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL);
 	return (rc);
 }
 
 /*
  * VOP_GETATTR entry point.  The caller's vattr is wrapped in an xvattr
  * that additionally requests the immutable, append-only, no-unlink and
  * no-dump optional attributes.  After zfs_getattr() succeeds, each
  * optional attribute that was returned and is set is translated into
  * the matching chflags bit, and the result is stored in va_flags.
  *
  * Returns 0 on success or the error from zfs_getattr(), in which case
  * the caller's vattr is left untouched.
  */
 static int
 zfs_freebsd_getattr(ap)
 	struct vop_getattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	vattr_t *vap = ap->a_vap;
 	xvattr_t xvap;
 	u_long fflags;
 	int error;
 
 	xva_init(&xvap);
 	xvap.xva_vattr = *vap;
 	xvap.xva_vattr.va_mask |= AT_XVATTR;
 
 	/* Ask for the ZFS attributes that have a chflags counterpart. */
 	/* XXX: what about SF_SETTABLE?. */
 	XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
 	XVA_SET_REQ(&xvap, XAT_APPENDONLY);
 	XVA_SET_REQ(&xvap, XAT_NOUNLINK);
 	XVA_SET_REQ(&xvap, XAT_NODUMP);
 
 	error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL);
 	if (error != 0)
 		return (error);
 
 	/* Translate the returned ZFS attributes into chflags bits. */
 	fflags = 0;
 	if (XVA_ISSET_RTN(&xvap, XAT_IMMUTABLE) &&
 	    xvap.xva_xoptattrs.xoa_immutable != 0)
 		fflags |= SF_IMMUTABLE;
 	if (XVA_ISSET_RTN(&xvap, XAT_APPENDONLY) &&
 	    xvap.xva_xoptattrs.xoa_appendonly != 0)
 		fflags |= SF_APPEND;
 	if (XVA_ISSET_RTN(&xvap, XAT_NOUNLINK) &&
 	    xvap.xva_xoptattrs.xoa_nounlink != 0)
 		fflags |= SF_NOUNLINK;
 	if (XVA_ISSET_RTN(&xvap, XAT_NODUMP) &&
 	    xvap.xva_xoptattrs.xoa_nodump != 0)
 		fflags |= UF_NODUMP;
 
 	*vap = xvap.xva_vattr;
 	vap->va_flags = fflags;
 	return (0);
 }
 
 /*
  * VOP_SETATTR entry point.  Copies the caller's vattr into an xvattr_t and
  * passes it to zfs_setattr().  When va_flags is set (not VNOVAL), the
  * requested chflags bits are validated, permission-checked and translated
  * into the corresponding ZFS optional attributes first.
  *
  * Returns EOPNOTSUPP when the file system does not use FUIDs or when a flag
  * outside SF_IMMUTABLE/SF_APPEND/SF_NOUNLINK/UF_NODUMP is requested, EPERM or
  * the error from the privilege/access checks when the caller may not change
  * the flags, and otherwise the result of zfs_setattr().
  */
 static int
 zfs_freebsd_setattr(ap)
 	struct vop_setattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	vnode_t *vp = ap->a_vp;
 	vattr_t *vap = ap->a_vap;
 	cred_t *cred = ap->a_cred;
 	xvattr_t xvap;
 	u_long fflags;
 	uint64_t zflags;
 
 	vattr_init_mask(vap);
 	vap->va_mask &= ~AT_NOSET;
 
 	xva_init(&xvap);
 	xvap.xva_vattr = *vap;
 
 	/* Flags currently recorded on the znode; compared with the request below. */
 	zflags = VTOZ(vp)->z_pflags;
 
 	if (vap->va_flags != VNOVAL) {
 		zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs;
 		int error;
 
 		if (zfsvfs->z_use_fuids == B_FALSE)
 			return (EOPNOTSUPP);
 
 		/* Only these four chflags bits can be represented. */
 		fflags = vap->va_flags;
 		if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_NODUMP)) != 0)
 			return (EOPNOTSUPP);
 		/*
 		 * Unprivileged processes are not permitted to unset system
 		 * flags, or modify flags if any system flags are set.
 		 * Privileged non-jail processes may not modify system flags
 		 * if securelevel > 0 and any existing system flags are set.
 		 * Privileged jail processes behave like privileged non-jail
 		 * processes if the security.jail.chflags_allowed sysctl
 		 * is non-zero; otherwise, they behave like unprivileged
 		 * processes.
 		 */
 		if (secpolicy_fs_owner(vp->v_mount, cred) == 0 ||
 		    priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0) == 0) {
 			/* Privileged path: only securelevel can veto the change. */
 			if (zflags &
 			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
 				error = securelevel_gt(cred, 0);
 				if (error != 0)
 					return (error);
 			}
 		} else {
 			/*
 			 * Callers may only modify the file flags on objects they
 			 * have VADMIN rights for.
 			 */
 			if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0)
 				return (error);
 			/* Existing system flags block any change ... */
 			if (zflags &
 			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
 				return (EPERM);
 			}
 			/* ... and system flags may not be requested either. */
 			if (fflags &
 			    (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
 				return (EPERM);
 			}
 		}
 
 /*
  * Request an optional attribute update only when the requested chflags bit
  * differs from the bit currently stored in zflags.
  */
 #define	FLAG_CHANGE(fflag, zflag, xflag, xfield)	do {		\
 	if (((fflags & (fflag)) && !(zflags & (zflag))) ||		\
 	    ((zflags & (zflag)) && !(fflags & (fflag)))) {		\
 		XVA_SET_REQ(&xvap, (xflag));				\
 		(xfield) = ((fflags & (fflag)) != 0);			\
 	}								\
 } while (0)
 		/* Convert chflags into ZFS-type flags. */
 		/* XXX: what about SF_SETTABLE?. */
 		FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
 		    xvap.xva_xoptattrs.xoa_immutable);
 		FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
 		    xvap.xva_xoptattrs.xoa_appendonly);
 		FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
 		    xvap.xva_xoptattrs.xoa_nounlink);
 		FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
 		    xvap.xva_xoptattrs.xoa_nodump);
 #undef	FLAG_CHANGE
 	}
 	return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL));
 }
 
 static int
 zfs_freebsd_rename(ap)
 	struct vop_rename_args  /* {
 		struct vnode *a_fdvp;
 		struct vnode *a_fvp;
 		struct componentname *a_fcnp;
 		struct vnode *a_tdvp;
 		struct vnode *a_tvp;
 		struct componentname *a_tcnp;
 	} */ *ap;
 {
 	vnode_t *fdvp = ap->a_fdvp;
 	vnode_t *fvp = ap->a_fvp;
 	vnode_t *tdvp = ap->a_tdvp;
 	vnode_t *tvp = ap->a_tvp;
 	int error;
 
 	ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
 	ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
 
 	error = zfs_rename(fdvp, ap->a_fcnp->cn_nameptr, tdvp,
 	    ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred, NULL, 0);
 
 	if (tdvp == tvp)
 		VN_RELE(tdvp);
 	else
 		VN_URELE(tdvp);
 	if (tvp)
 		VN_URELE(tvp);
 	VN_RELE(fdvp);
 	VN_RELE(fvp);
 
 	return (error);
 }
 
 static int
 zfs_freebsd_symlink(ap)
 	struct vop_symlink_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 		char *a_target;
 	} */ *ap;
 {
 	struct componentname *cnp = ap->a_cnp;
 	vattr_t *vap = ap->a_vap;
 
 	ASSERT(cnp->cn_flags & SAVENAME);
 
 	vap->va_type = VLNK;	/* FreeBSD: Syscall only sets va_mode. */
 	vattr_init_mask(vap);
 
 	return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap,
 	    ap->a_target, cnp->cn_cred, cnp->cn_thread));
 }
 
 static int
 zfs_freebsd_readlink(ap)
 	struct vop_readlink_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 
 	return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL));
 }
 
 static int
 zfs_freebsd_link(ap)
 	struct vop_link_args /* {
 		struct vnode *a_tdvp;
 		struct vnode *a_vp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct componentname *cnp = ap->a_cnp;
 
 	ASSERT(cnp->cn_flags & SAVENAME);
 
 	return (zfs_link(ap->a_tdvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0));
 }
 
 /*
  * VOP_INACTIVE entry point: calls zfs_inactive() with the credentials of the
  * thread in the argument structure.  Always reports success.
  */
 static int
 zfs_freebsd_inactive(ap)
 	struct vop_inactive_args /* {
 		struct vnode *a_vp;
 		struct thread *a_td;
 	} */ *ap;
 {
 
 	zfs_inactive(ap->a_vp, ap->a_td->td_ucred, NULL);
 	return (0);
 }
 
 /*
  * VOP_RECLAIM entry point.  Destroys the vnode's VM object, then either frees
  * the znode (when it no longer has an SA handle) or runs zfs_zinactive() on
  * it, and finally detaches the znode from the vnode by clearing v_data.
  * Always returns 0.
  */
 static int
 zfs_freebsd_reclaim(ap)
 	struct vop_reclaim_args /* {
 		struct vnode *a_vp;
 		struct thread *a_td;
 	} */ *ap;
 {
 	vnode_t	*vp = ap->a_vp;
 	znode_t	*zp = VTOZ(vp);
 	zfsvfs_t *zfsvfs;
 
 	/* Check the znode before it is dereferenced, not after. */
 	ASSERT(zp != NULL);
 	zfsvfs = zp->z_zfsvfs;
 
 	/* Destroy the vm object and flush associated pages. */
 	vnode_destroy_vobject(vp);
 
 	/*
 	 * z_teardown_inactive_lock protects from a race with
 	 * zfs_znode_dmu_fini in zfsvfs_teardown during
 	 * force unmount.
 	 */
 	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
 	if (zp->z_sa_hdl == NULL)
 		zfs_znode_free(zp);
 	else
 		zfs_zinactive(zp);
 	rw_exit(&zfsvfs->z_teardown_inactive_lock);
 
 	vp->v_data = NULL;
 	return (0);
 }
 
 static int
 zfs_freebsd_fid(ap)
 	struct vop_fid_args /* {
 		struct vnode *a_vp;
 		struct fid *a_fid;
 	} */ *ap;
 {
 
 	return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
 }
 
 static int
 zfs_freebsd_pathconf(ap)
 	struct vop_pathconf_args /* {
 		struct vnode *a_vp;
 		int a_name;
 		register_t *a_retval;
 	} */ *ap;
 {
 	ulong_t val;
 	int error;
 
 	error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL);
 	if (error == 0)
 		*ap->a_retval = val;
 	else if (error == EOPNOTSUPP)
 		error = vop_stdpathconf(ap);
 	return (error);
 }
 
 /*
  * VOP_PATHCONF entry point for fifos.  The ACL and MAC related names are
  * answered by zfs_freebsd_pathconf(); every other name goes to the
  * fifo_specops pathconf handler.
  */
 static int
 zfs_freebsd_fifo_pathconf(ap)
 	struct vop_pathconf_args /* {
 		struct vnode *a_vp;
 		int a_name;
 		register_t *a_retval;
 	} */ *ap;
 {
 	int name = ap->a_name;
 
 	if (name == _PC_ACL_EXTENDED || name == _PC_ACL_NFS4 ||
 	    name == _PC_ACL_PATH_MAX || name == _PC_MAC_PRESENT)
 		return (zfs_freebsd_pathconf(ap));
 
 	return (fifo_specops.vop_pathconf(ap));
 }
 
 /*
  * Build the file name under which a FreeBSD extended attribute is stored in
  * the ZFS extended attribute directory.  The FreeBSD namespace selects the
  * prefix that is put in front of the attribute name:
  *
  *	NAMESPACE	PREFIX
  *	system		freebsd:system:
  *	user		(none, can be used to access ZFS fsattr(5) attributes
  *			created on Solaris)
  *
  * The composed name is written to attrname, a buffer of size bytes.
  * Returns 0 on success, EINVAL for a rejected name or namespace, and
  * ENAMETOOLONG when the composed name does not fit into the buffer.
  */
 static int
 zfs_create_attrname(int attrnamespace, const char *name, char *attrname,
     size_t size)
 {
 	const char *namespace, *prefix, *suffix;
 	int len;
 
 	/*
 	 * We don't allow '/' character in attribute name, nor attribute
 	 * names that start with "freebsd:" string.
 	 */
 	if (strchr(name, '/') != NULL || strncmp(name, "freebsd:", 8) == 0)
 		return (EINVAL);
 
 	bzero(attrname, size);
 
 	if (attrnamespace == EXTATTR_NAMESPACE_USER) {
 #if 0
 		prefix = "freebsd:";
 		namespace = EXTATTR_NAMESPACE_USER_STRING;
 		suffix = ":";
 #else
 		/*
 		 * This is the default namespace by which we can access all
 		 * attributes created on Solaris.
 		 */
 		prefix = namespace = suffix = "";
 #endif
 	} else if (attrnamespace == EXTATTR_NAMESPACE_SYSTEM) {
 		prefix = "freebsd:";
 		namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
 		suffix = ":";
 	} else {
 		/* The empty namespace and anything unrecognized. */
 		return (EINVAL);
 	}
 
 	len = snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
 	    name);
 	if (len >= size)
 		return (ENAMETOOLONG);
 	return (0);
 }
 
 /*
  * Vnode operation to retrieve a named extended attribute.
  *
  * The attribute is stored as a file inside the vnode's extended attribute
  * directory.  When a_size is non-NULL only the attribute's size is reported
  * (a_uio is then not consulted); otherwise, when a_uio is non-NULL, the
  * attribute data is read into it.  A missing attribute file is reported as
  * ENOATTR.
  */
 static int
 zfs_getextattr(struct vop_getextattr_args *ap)
 /*
 vop_getextattr {
 	IN struct vnode *a_vp;
 	IN int a_attrnamespace;
 	IN const char *a_name;
 	INOUT struct uio *a_uio;
 	OUT size_t *a_size;
 	IN struct ucred *a_cred;
 	IN struct thread *a_td;
 };
 */
 {
 	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
 	struct thread *td = ap->a_td;
 	struct nameidata nd;
 	char attrname[255];
 	struct vattr va;
 	vnode_t *xvp = NULL, *vp;
 	int error, flags;
 
 	/* The caller needs read access to the attribute namespace. */
 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
 	    ap->a_cred, ap->a_td, VREAD);
 	if (error != 0)
 		return (error);
 
 	/* Translate (namespace, name) into the on-disk attribute file name. */
 	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
 	    sizeof(attrname));
 	if (error != 0)
 		return (error);
 
 	ZFS_ENTER(zfsvfs);
 
 	/* Look up the extended attribute directory of the vnode. */
 	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
 	    LOOKUP_XATTR);
 	if (error != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/* Open the attribute file for reading, relative to that directory. */
 	flags = FREAD;
 	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
 	    xvp, td);
 	error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL);
 	vp = nd.ni_vp;
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (error != 0) {
 		ZFS_EXIT(zfsvfs);
 		if (error == ENOENT)
 			error = ENOATTR;
 		return (error);
 	}
 
 	/* Size query takes precedence over reading the data. */
 	if (ap->a_size != NULL) {
 		error = VOP_GETATTR(vp, &va, ap->a_cred);
 		if (error == 0)
 			*ap->a_size = (size_t)va.va_size;
 	} else if (ap->a_uio != NULL)
 		error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred);
 
 	VOP_UNLOCK(vp, 0);
 	vn_close(vp, flags, ap->a_cred, td);
 	ZFS_EXIT(zfsvfs);
 
 	return (error);
 }
 
 /*
  * Vnode operation to remove a named attribute.
  *
  * The attribute file is looked up inside the vnode's extended attribute
  * directory and removed with VOP_REMOVE().  A missing attribute file is
  * reported as ENOATTR; other errors are returned unchanged.
  */
 int
 zfs_deleteextattr(struct vop_deleteextattr_args *ap)
 /*
 vop_deleteextattr {
 	IN struct vnode *a_vp;
 	IN int a_attrnamespace;
 	IN const char *a_name;
 	IN struct ucred *a_cred;
 	IN struct thread *a_td;
 };
 */
 {
 	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
 	struct thread *td = ap->a_td;
 	struct nameidata nd;
 	char attrname[255];
 	vnode_t *xvp = NULL, *vp;
 	int error;
 
 	/* The caller needs write access to the attribute namespace. */
 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
 	    ap->a_cred, ap->a_td, VWRITE);
 	if (error != 0)
 		return (error);
 
 	/* Translate (namespace, name) into the on-disk attribute file name. */
 	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
 	    sizeof(attrname));
 	if (error != 0)
 		return (error);
 
 	ZFS_ENTER(zfsvfs);
 
 	/* Look up the extended attribute directory of the vnode. */
 	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
 	    LOOKUP_XATTR);
 	if (error != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/* Find the attribute file, locking both it and its parent. */
 	NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF,
 	    UIO_SYSSPACE, attrname, xvp, td);
 	error = namei(&nd);
 	vp = nd.ni_vp;
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (error != 0) {
 		ZFS_EXIT(zfsvfs);
 		if (error == ENOENT)
 			error = ENOATTR;
 		return (error);
 	}
 	error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
 
 	/* Drop parent and leaf; the leaf is only vrele'd if it is the parent. */
 	vput(nd.ni_dvp);
 	if (vp == nd.ni_dvp)
 		vrele(vp);
 	else
 		vput(vp);
 	ZFS_EXIT(zfsvfs);
 
 	return (error);
 }
 
 /*
  * Vnode operation to set a named attribute.
  */
 static int
 zfs_setextattr(struct vop_setextattr_args *ap)
 /*
 vop_setextattr {
 	IN struct vnode *a_vp;
 	IN int a_attrnamespace;
 	IN const char *a_name;
 	INOUT struct uio *a_uio;
 	IN struct ucred *a_cred;
 	IN struct thread *a_td;
 };
 */
 {
 	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
 	struct thread *td = ap->a_td;
 	struct nameidata nd;
 	char attrname[255];
 	struct vattr va;
 	vnode_t *xvp = NULL, *vp;
 	int error, flags;
 
 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
 	    ap->a_cred, ap->a_td, VWRITE);
 	if (error != 0)
 		return (error);
 
 	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
 	    sizeof(attrname));
 	if (error != 0)
 		return (error);
 
 	ZFS_ENTER(zfsvfs);
 
 	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
 	    LOOKUP_XATTR | CREATE_XATTR_DIR);
 	if (error != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	flags = FFLAGS(O_WRONLY | O_CREAT);
 	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
 	    xvp, td);
 	error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL);
 	vp = nd.ni_vp;
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (error != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	VATTR_NULL(&va);
 	va.va_size = 0;
 	error = VOP_SETATTR(vp, &va, ap->a_cred);
 	if (error == 0)
 		VOP_WRITE(vp, ap->a_uio, IO_UNIT | IO_SYNC, ap->a_cred);
 
 	VOP_UNLOCK(vp, 0);
 	vn_close(vp, flags, ap->a_cred, td);
 	ZFS_EXIT(zfsvfs);
 
 	return (error);
 }
 
 /*
  * Vnode operation to retrieve extended attributes on a vnode.
  *
  * Reads the vnode's extended attribute directory one dirent at a time and
  * selects the entries belonging to the requested namespace.  When a_size is
  * non-NULL only the total size of the list is accumulated; otherwise, when
  * a_uio is non-NULL, each name is copied out as a one-byte length followed
  * by the name with the namespace prefix stripped.  A missing attribute
  * directory (ENOATTR from the lookup) yields an empty list and success.
  */
 static int
 zfs_listextattr(struct vop_listextattr_args *ap)
 /*
 vop_listextattr {
 	IN struct vnode *a_vp;
 	IN int a_attrnamespace;
 	INOUT struct uio *a_uio;
 	OUT size_t *a_size;
 	IN struct ucred *a_cred;
 	IN struct thread *a_td;
 };
 */
 {
 	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
 	struct thread *td = ap->a_td;
 	struct nameidata nd;
 	char attrprefix[16];
 	u_char dirbuf[sizeof(struct dirent)];
 	struct dirent *dp;
 	struct iovec aiov;
 	struct uio auio, *uio = ap->a_uio;
 	size_t *sizep = ap->a_size;
 	size_t plen;
 	vnode_t *xvp = NULL, *vp;
 	int done, error, eof, pos;
 
 	/* The caller needs read access to the attribute namespace. */
 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
 	    ap->a_cred, ap->a_td, VREAD);
 	if (error != 0)
 		return (error);
 
 	/* An empty attribute name yields just the namespace prefix. */
 	error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
 	    sizeof(attrprefix));
 	if (error != 0)
 		return (error);
 	plen = strlen(attrprefix);
 
 	ZFS_ENTER(zfsvfs);
 
 	if (sizep != NULL)
 		*sizep = 0;
 
 	/* Look up the extended attribute directory of the vnode. */
 	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
 	    LOOKUP_XATTR);
 	if (error != 0) {
 		ZFS_EXIT(zfsvfs);
 		/*
 		 * ENOATTR means that the EA directory does not yet exist,
 		 * i.e. there are no extended attributes there.
 		 */
 		if (error == ENOATTR)
 			error = 0;
 		return (error);
 	}
 
 	/* Resolve "." relative to the directory to get it share-locked. */
 	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
 	    UIO_SYSSPACE, ".", xvp, td);
 	error = namei(&nd);
 	vp = nd.ni_vp;
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (error != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/* Kernel-space uio used to read directory entries into dirbuf. */
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_td = td;
 	auio.uio_rw = UIO_READ;
 	auio.uio_offset = 0;
 
 	do {
 		u_char nlen;
 
 		/* Reset the buffer; uio_offset carries on from the last read. */
 		aiov.iov_base = (void *)dirbuf;
 		aiov.iov_len = sizeof(dirbuf);
 		auio.uio_resid = sizeof(dirbuf);
 		error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
 		done = sizeof(dirbuf) - auio.uio_resid;
 		if (error != 0)
 			break;
 		for (pos = 0; pos < done;) {
 			dp = (struct dirent *)(dirbuf + pos);
 			pos += dp->d_reclen;
 			/*
 			 * XXX: Temporarily we also accept DT_UNKNOWN, as this
 			 * is what we get when attribute was created on Solaris.
 			 */
 			if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
 				continue;
 			/*
 			 * With an empty prefix, skip every "freebsd:" name;
 			 * otherwise keep only names carrying our prefix.
 			 */
 			if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0)
 				continue;
 			else if (strncmp(dp->d_name, attrprefix, plen) != 0)
 				continue;
 			/* Length of the name once the prefix is stripped. */
 			nlen = dp->d_namlen - plen;
 			if (sizep != NULL)
 				*sizep += 1 + nlen;
 			else if (uio != NULL) {
 				/*
 				 * Format of extattr name entry is one byte for
 				 * length and the rest for name.
 				 */
 				error = uiomove(&nlen, 1, uio->uio_rw, uio);
 				if (error == 0) {
 					error = uiomove(dp->d_name + plen, nlen,
 					    uio->uio_rw, uio);
 				}
 				if (error != 0)
 					break;
 			}
 		}
 	} while (!eof && error == 0);
 
 	vput(vp);
 	ZFS_EXIT(zfsvfs);
 
 	return (error);
 }
 
 int
 zfs_freebsd_getacl(ap)
 	struct vop_getacl_args /* {
 		struct vnode *vp;
 		acl_type_t type;
 		struct acl *aclp;
 		struct ucred *cred;
 		struct thread *td;
 	} */ *ap;
 {
 	int		error;
 	vsecattr_t      vsecattr;
 
 	if (ap->a_type != ACL_TYPE_NFS4)
 		return (EINVAL);
 
 	vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT;
 	if (error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL))
 		return (error);
 
 	error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt);
 	if (vsecattr.vsa_aclentp != NULL)
 		kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz);
 
 	return (error);
 }
 
 /*
  * VOP_SETACL entry point.  Only ACL_TYPE_NFS4 ACLs with an acceptable entry
  * count are taken (EINVAL otherwise; ENOSPC when a later chmod(2) could
  * overflow the ACL).  After acl_nfs4_check() passes, the ACL is converted
  * into a temporary ACE array, applied with zfs_setsecattr(), and the array
  * is freed.
  */
 int
 zfs_freebsd_setacl(ap)
 	struct vop_setacl_args /* {
 		struct vnode *vp;
 		acl_type_t type;
 		struct acl *aclp;
 		struct ucred *cred;
 		struct thread *td;
 	} */ *ap;
 {
 	vsecattr_t	vsecattr;
 	aclent_t	*aaclp;
 	int		aclbsize;	/* size of acl list in bytes */
 	int		error;
 
 	if (ap->a_type != ACL_TYPE_NFS4 ||
 	    ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
 		return (EINVAL);
 
 	/*
 	 * With NFSv4 ACLs, chmod(2) may need to add additional entries,
 	 * splitting every entry into two and appending "canonical six"
 	 * entries at the end.  Don't allow for setting an ACL that would
 	 * cause chmod(2) to run out of ACL entries.
 	 */
 	if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES)
 		return (ENOSPC);
 
 	error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR);
 	if (error != 0)
 		return (error);
 
 	/* Allocate one ace_t per entry and remember the buffer for freeing. */
 	aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t);
 	vsecattr.vsa_mask = VSA_ACE;
 	vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP);
 	vsecattr.vsa_aclentsz = aclbsize;
 	aaclp = vsecattr.vsa_aclentp;
 
 	aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp);
 	error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL);
 	kmem_free(aaclp, aclbsize);
 
 	return (error);
 }
 
 int
 zfs_freebsd_aclcheck(ap)
 	struct vop_aclcheck_args /* {
 		struct vnode *vp;
 		acl_type_t type;
 		struct acl *aclp;
 		struct ucred *cred;
 		struct thread *td;
 	} */ *ap;
 {
 
 	return (EOPNOTSUPP);
 }
 
 /*
  * Declarations of the three vnode operation vectors; their initialized
  * definitions follow below.
  */
 struct vop_vector zfs_vnodeops;
 struct vop_vector zfs_fifoops;
 struct vop_vector zfs_shareops;
 
 /*
  * Main ZFS vnode operations vector.  vop_default points at
  * default_vnodeops.  Note that vop_mknod shares zfs_freebsd_create with
  * vop_create, and that with FREEBSD_NAMECACHE defined lookups enter through
  * vfs_cache_lookup with zfs_freebsd_lookup installed as the cached-lookup
  * handler.
  */
 struct vop_vector zfs_vnodeops = {
 	.vop_default =		&default_vnodeops,
 	.vop_inactive =		zfs_freebsd_inactive,
 	.vop_reclaim =		zfs_freebsd_reclaim,
 	.vop_access =		zfs_freebsd_access,
 #ifdef FREEBSD_NAMECACHE
 	.vop_lookup =		vfs_cache_lookup,
 	.vop_cachedlookup =	zfs_freebsd_lookup,
 #else
 	.vop_lookup =		zfs_freebsd_lookup,
 #endif
 	.vop_getattr =		zfs_freebsd_getattr,
 	.vop_setattr =		zfs_freebsd_setattr,
 	.vop_create =		zfs_freebsd_create,
 	.vop_mknod =		zfs_freebsd_create,
 	.vop_mkdir =		zfs_freebsd_mkdir,
 	.vop_readdir =		zfs_freebsd_readdir,
 	.vop_fsync =		zfs_freebsd_fsync,
 	.vop_open =		zfs_freebsd_open,
 	.vop_close =		zfs_freebsd_close,
 	.vop_rmdir =		zfs_freebsd_rmdir,
 	.vop_ioctl =		zfs_freebsd_ioctl,
 	.vop_link =		zfs_freebsd_link,
 	.vop_symlink =		zfs_freebsd_symlink,
 	.vop_readlink =		zfs_freebsd_readlink,
 	.vop_read =		zfs_freebsd_read,
 	.vop_write =		zfs_freebsd_write,
 	.vop_remove =		zfs_freebsd_remove,
 	.vop_rename =		zfs_freebsd_rename,
 	.vop_pathconf =		zfs_freebsd_pathconf,
 	.vop_bmap =		zfs_freebsd_bmap,
 	.vop_fid =		zfs_freebsd_fid,
 	.vop_getextattr =	zfs_getextattr,
 	.vop_deleteextattr =	zfs_deleteextattr,
 	.vop_setextattr =	zfs_setextattr,
 	.vop_listextattr =	zfs_listextattr,
 	.vop_getacl =		zfs_freebsd_getacl,
 	.vop_setacl =		zfs_freebsd_setacl,
 	.vop_aclcheck =		zfs_freebsd_aclcheck,
 	.vop_getpages =		zfs_freebsd_getpages,
 };
 
 /*
  * Vnode operations vector for fifos on ZFS.  vop_default points at
  * fifo_specops; attribute, ACL, fid and lifecycle operations use the ZFS
  * handlers, pathconf uses the fifo-specific wrapper, and vop_read/vop_write
  * are set to VOP_PANIC.
  */
 struct vop_vector zfs_fifoops = {
 	.vop_default =		&fifo_specops,
 	.vop_fsync =		zfs_freebsd_fsync,
 	.vop_access =		zfs_freebsd_access,
 	.vop_getattr =		zfs_freebsd_getattr,
 	.vop_inactive =		zfs_freebsd_inactive,
 	.vop_read =		VOP_PANIC,
 	.vop_reclaim =		zfs_freebsd_reclaim,
 	.vop_setattr =		zfs_freebsd_setattr,
 	.vop_write =		VOP_PANIC,
 	.vop_pathconf = 	zfs_freebsd_fifo_pathconf,
 	.vop_fid =		zfs_freebsd_fid,
 	.vop_getacl =		zfs_freebsd_getacl,
 	.vop_setacl =		zfs_freebsd_setacl,
 	.vop_aclcheck =		zfs_freebsd_aclcheck,
 };
 
 /*
  * special share hidden files vnode operations template
  *
  * Only access, inactive, reclaim, fid and pathconf use ZFS handlers here;
  * vop_default points at default_vnodeops.
  */
 struct vop_vector zfs_shareops = {
 	.vop_default =		&default_vnodeops,
 	.vop_access =		zfs_freebsd_access,
 	.vop_inactive =		zfs_freebsd_inactive,
 	.vop_reclaim =		zfs_freebsd_reclaim,
 	.vop_fid =		zfs_freebsd_fid,
 	.vop_pathconf =		zfs_freebsd_pathconf,
 };
Index: user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris
===================================================================
--- user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris	(revision 247191)
+++ user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris	(revision 247192)

Property changes on: user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,2 ##
   Merged /head/sys/cddl/contrib/opensolaris:r247097-247191
   Merged /vendor-sys/illumos/dist:r246653
Index: user/attilio/vmobj-rwlock/sys/dev/ata/ata-all.c
===================================================================
--- user/attilio/vmobj-rwlock/sys/dev/ata/ata-all.c	(revision 247191)
+++ user/attilio/vmobj-rwlock/sys/dev/ata/ata-all.c	(revision 247192)
@@ -1,1995 +1,1996 @@
 /*-
  * Copyright (c) 1998 - 2008 Søren Schmidt <sos@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer,
  *    without modification, immediately at the beginning of the file.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ata.h"
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/ata.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/endian.h>
 #include <sys/ctype.h>
 #include <sys/conf.h>
 #include <sys/bus.h>
 #include <sys/bio.h>
 #include <sys/malloc.h>
 #include <sys/sysctl.h>
 #include <sys/sema.h>
 #include <sys/taskqueue.h>
 #include <vm/uma.h>
 #include <machine/stdarg.h>
 #include <machine/resource.h>
 #include <machine/bus.h>
 #include <sys/rman.h>
 #include <dev/ata/ata-all.h>
 #include <dev/pci/pcivar.h>
 #include <ata_if.h>
 
 #ifdef ATA_CAM
 #include <cam/cam.h>
 #include <cam/cam_ccb.h>
 #include <cam/cam_sim.h>
 #include <cam/cam_xpt_sim.h>
 #include <cam/cam_debug.h>
 #endif
 
 #ifndef ATA_CAM
 /*
  * device structure
  *
  * Character device switch for the "ata" device; it is only built when
  * ATA_CAM is not defined and provides nothing but an ioctl handler.
  */
 static  d_ioctl_t       ata_ioctl;
 static struct cdevsw ata_cdevsw = {
 	.d_version =    D_VERSION,
 	.d_flags =      D_NEEDGIANT, /* we need this as newbus isn't mpsafe */
 	.d_ioctl =      ata_ioctl,
 	.d_name =       "ata",
 };
 #endif
 
 /*
  * prototypes
  *
  * Which file-local helpers are declared depends on ATA_CAM: the legacy
  * (non-CAM) build declares the boot-attach, child and string helpers, the
  * CAM build declares the SIM action/poll callbacks and the periodic poll.
  */
 #ifndef ATA_CAM
 static void ata_boot_attach(void);
 static device_t ata_add_child(device_t, struct ata_device *, int);
 #else
 static void ataaction(struct cam_sim *sim, union ccb *ccb);
 static void atapoll(struct cam_sim *sim);
 #endif
 static void ata_conn_event(void *, int);
 #ifndef ATA_CAM
 static void bswap(int8_t *, int);
 static void btrim(int8_t *, int);
 static void bpack(int8_t *, int8_t *, int);
 #endif
 static void ata_interrupt_locked(void *data);
 #ifdef ATA_CAM
 static void ata_periodic_poll(void *data);
 #endif
 
 /*
  * global vars
  *
  * ata_wc, ata_setmax and ata_delayed_attach exist only in the non-CAM
  * build; ata_dma_check_80pin is shared by both builds.
  */
 MALLOC_DEFINE(M_ATA, "ata_generic", "ATA driver generic layer");
 int (*ata_raid_ioctl_func)(u_long cmd, caddr_t data) = NULL;
 #ifndef ATA_CAM
 struct intr_config_hook *ata_delayed_attach = NULL;
 #endif
 devclass_t ata_devclass;
 uma_zone_t ata_request_zone;
 uma_zone_t ata_composite_zone;
 #ifndef ATA_CAM
 int ata_wc = 1;
 int ata_setmax = 0;
 #endif
 int ata_dma_check_80pin = 1;
 
 /* local vars (non-CAM build only; both default to enabled) */
 #ifndef ATA_CAM
 static int ata_dma = 1;
 static int atapi_dma = 1;
 #endif
 
 /*
  * sysctl vars
  *
  * Everything lives under the hw.ata node.  Each knob is also registered as
  * a loader tunable of the same name.  ata_dma_check_80pin is declared
  * CTLFLAG_RW; the knobs that exist only in the non-CAM build are declared
  * CTLFLAG_RDTUN.
  */
 static SYSCTL_NODE(_hw, OID_AUTO, ata, CTLFLAG_RD, 0, "ATA driver parameters");
 #ifndef ATA_CAM
 TUNABLE_INT("hw.ata.ata_dma", &ata_dma);
 SYSCTL_INT(_hw_ata, OID_AUTO, ata_dma, CTLFLAG_RDTUN, &ata_dma, 0,
 	   "ATA disk DMA mode control");
 #endif
 TUNABLE_INT("hw.ata.ata_dma_check_80pin", &ata_dma_check_80pin);
 SYSCTL_INT(_hw_ata, OID_AUTO, ata_dma_check_80pin,
 	   CTLFLAG_RW, &ata_dma_check_80pin, 1,
 	   "Check for 80pin cable before setting ATA DMA mode");
 #ifndef ATA_CAM
 TUNABLE_INT("hw.ata.atapi_dma", &atapi_dma);
 SYSCTL_INT(_hw_ata, OID_AUTO, atapi_dma, CTLFLAG_RDTUN, &atapi_dma, 0,
 	   "ATAPI device DMA mode control");
 TUNABLE_INT("hw.ata.wc", &ata_wc);
 SYSCTL_INT(_hw_ata, OID_AUTO, wc, CTLFLAG_RDTUN, &ata_wc, 0,
 	   "ATA disk write caching");
 TUNABLE_INT("hw.ata.setmax", &ata_setmax);
 SYSCTL_INT(_hw_ata, OID_AUTO, setmax, CTLFLAG_RDTUN, &ata_setmax, 0,
 	   "ATA disk set max native address");
 #endif
 /* The CAM build advertises itself through the ata_cam feature. */
 #ifdef ATA_CAM
 FEATURE(ata_cam, "ATA devices are accessed through the cam(4) driver");
 #endif
 
 /*
  * newbus device interface related functions
  */
 /* Generic probe method: performs no checks and always reports success. */
 int
 ata_probe(device_t dev)
 {
 
     return (0);
 }
 
 /*
  * Attach an ATA channel: initialize the softc locks, queue and connection
  * task, allocate DMA resources when the channel provides an allocator, and
  * hook up the channel interrupt.  The non-CAM build additionally resets the
  * channel and identifies devices (unless attach is delayed); the CAM build
  * loads the per-device user settings from hints and registers a SIM, bus
  * and wildcard path with CAM.
  *
  * Returns 0 on success, EEXIST if the channel already has an IRQ resource,
  * ENXIO/ENOMEM or the bus_setup_intr() error on failure.  On every failure
  * after the IRQ was allocated the resource is released again and
  * ch->r_irq is cleared, matching what ata_detach() does.
  */
 int
 ata_attach(device_t dev)
 {
     struct ata_channel *ch = device_get_softc(dev);
     int error, rid;
 #ifdef ATA_CAM
     struct cam_devq *devq;
     const char *res;
     char buf[64];
     int i, mode;
 #endif
 
     /* check that we have a virgin channel to attach */
     if (ch->r_irq)
 	return EEXIST;
 
     /* initialize the softc basics */
     ch->dev = dev;
     ch->state = ATA_IDLE;
     bzero(&ch->state_mtx, sizeof(struct mtx));
     mtx_init(&ch->state_mtx, "ATA state lock", NULL, MTX_DEF);
     bzero(&ch->queue_mtx, sizeof(struct mtx));
     mtx_init(&ch->queue_mtx, "ATA queue lock", NULL, MTX_DEF);
     TAILQ_INIT(&ch->ata_queue);
     TASK_INIT(&ch->conntask, 0, ata_conn_event, dev);
 #ifdef ATA_CAM
 	/*
 	 * Load per-device user settings.  A "devN.<name>" hint takes
 	 * precedence over the channel-wide "<name>" hint.
 	 */
 	for (i = 0; i < 16; i++) {
 		ch->user[i].revision = 0;
 		snprintf(buf, sizeof(buf), "dev%d.sata_rev", i);
 		if (resource_int_value(device_get_name(dev),
 		    device_get_unit(dev), buf, &mode) != 0 &&
 		    resource_int_value(device_get_name(dev),
 		    device_get_unit(dev), "sata_rev", &mode) != 0)
 			mode = -1;
 		if (mode >= 0)
 			ch->user[i].revision = mode;
 		ch->user[i].mode = 0;
 		snprintf(buf, sizeof(buf), "dev%d.mode", i);
 		if (resource_string_value(device_get_name(dev),
 		    device_get_unit(dev), buf, &res) == 0)
 			mode = ata_str2mode(res);
 		else if (resource_string_value(device_get_name(dev),
 		    device_get_unit(dev), "mode", &res) == 0)
 			mode = ata_str2mode(res);
 		else
 			mode = -1;
 		if (mode >= 0)
 			ch->user[i].mode = mode;
 		if (ch->flags & ATA_SATA)
 			ch->user[i].bytecount = 8192;
 		else
 			ch->user[i].bytecount = MAXPHYS;
 		ch->user[i].caps = 0;
 		/* curr[] is snapshotted before the PM caps are added to user[]. */
 		ch->curr[i] = ch->user[i];
 		if (ch->pm_level > 0)
 			ch->user[i].caps |= CTS_SATA_CAPS_H_PMREQ;
 		if (ch->pm_level > 1)
 			ch->user[i].caps |= CTS_SATA_CAPS_D_PMREQ;
 	}
 	callout_init(&ch->poll_callout, 1);
 #endif
 
 #ifndef ATA_CAM
     /* reset the controller HW, the channel and device(s) */
     while (ATA_LOCKING(dev, ATA_LF_LOCK) != ch->unit)
 	pause("ataatch", 1);
     ATA_RESET(dev);
     ATA_LOCKING(dev, ATA_LF_UNLOCK);
 #endif
 
     /* allocate DMA resources if DMA HW present*/
     if (ch->dma.alloc)
 	ch->dma.alloc(dev);
 
     /* setup interrupt delivery */
     rid = ATA_IRQ_RID;
     ch->r_irq = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid,
 				       RF_SHAREABLE | RF_ACTIVE);
     if (!ch->r_irq) {
 	device_printf(dev, "unable to allocate interrupt\n");
 	return ENXIO;
     }
     if ((error = bus_setup_intr(dev, ch->r_irq, ATA_INTR_FLAGS, NULL,
 				ata_interrupt, ch, &ch->ih))) {
 	bus_release_resource(dev, SYS_RES_IRQ, rid, ch->r_irq);
 	/* do not leave a stale pointer to the released resource behind */
 	ch->r_irq = NULL;
 	device_printf(dev, "unable to setup interrupt\n");
 	return error;
     }
 
 #ifndef ATA_CAM
     /* probe and attach devices on this channel unless we are in early boot */
     if (!ata_delayed_attach)
 	ata_identify(dev);
     return (0);
 #else
 	if (ch->flags & ATA_PERIODIC_POLL)
 		callout_reset(&ch->poll_callout, hz, ata_periodic_poll, ch);
 	mtx_lock(&ch->state_mtx);
 	/* Create the device queue for our SIM. */
 	devq = cam_simq_alloc(1);
 	if (devq == NULL) {
 		device_printf(dev, "Unable to allocate simq\n");
 		error = ENOMEM;
 		goto err1;
 	}
 	/* Construct SIM entry */
 	ch->sim = cam_sim_alloc(ataaction, atapoll, "ata", ch,
 	    device_get_unit(dev), &ch->state_mtx, 1, 0, devq);
 	if (ch->sim == NULL) {
 		device_printf(dev, "unable to allocate sim\n");
 		cam_simq_free(devq);
 		error = ENOMEM;
 		goto err1;
 	}
 	if (xpt_bus_register(ch->sim, dev, 0) != CAM_SUCCESS) {
 		device_printf(dev, "unable to register xpt bus\n");
 		error = ENXIO;
 		goto err2;
 	}
 	if (xpt_create_path(&ch->path, /*periph*/NULL, cam_sim_path(ch->sim),
 	    CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD) != CAM_REQ_CMP) {
 		device_printf(dev, "unable to create path\n");
 		error = ENXIO;
 		goto err3;
 	}
 	mtx_unlock(&ch->state_mtx);
 	return (0);
 
 err3:
 	xpt_bus_deregister(cam_sim_path(ch->sim));
 err2:
 	cam_sim_free(ch->sim, /*free_devq*/TRUE);
 	ch->sim = NULL;
 err1:
 	/*
 	 * Drop the state lock before touching the interrupt: the handler
 	 * must be torn down before its IRQ resource is released, and that
 	 * must not be done while holding the lock.
 	 */
 	mtx_unlock(&ch->state_mtx);
 	if (ch->flags & ATA_PERIODIC_POLL)
 		callout_drain(&ch->poll_callout);
 	bus_teardown_intr(dev, ch->r_irq, ch->ih);
 	bus_release_resource(dev, SYS_RES_IRQ, rid, ch->r_irq);
 	ch->r_irq = NULL;
 	return (error);
 #endif
 }
 
 /*
  * Detach an ATA channel: stall the request queue, stop the periodic poll
  * (CAM build) or delete all child devices (non-CAM build), drain the
  * connection task, unregister from CAM, and finally release the interrupt,
  * DMA resources and the channel mutexes.
  *
  * Returns ENXIO if the channel has no IRQ resource, 0 otherwise.
  */
 int
 ata_detach(device_t dev)
 {
     struct ata_channel *ch = device_get_softc(dev);
 #ifndef ATA_CAM
     device_t *children;
     int nchildren, i;
 #endif
 
     /* check that we have a valid channel to detach */
     if (!ch->r_irq)
 	return ENXIO;
 
     /* grab the channel lock so no new requests gets launched */
     mtx_lock(&ch->state_mtx);
     ch->state |= ATA_STALL_QUEUE;
     mtx_unlock(&ch->state_mtx);
 #ifdef ATA_CAM
     /* the poll callout is drained with the state lock released */
     if (ch->flags & ATA_PERIODIC_POLL)
 	callout_drain(&ch->poll_callout);
 #endif
 
 #ifndef ATA_CAM
     /* detach & delete all children */
     if (!device_get_children(dev, &children, &nchildren)) {
 	for (i = 0; i < nchildren; i++)
 	    if (children[i])
 		device_delete_child(dev, children[i]);
 	free(children, M_TEMP);
     } 
 #endif
     /* wait for a pending connection event task to finish */
     taskqueue_drain(taskqueue_thread, &ch->conntask);
 
 #ifdef ATA_CAM
 	/* tear down the CAM path, bus and SIM under the state lock */
 	mtx_lock(&ch->state_mtx);
 	xpt_async(AC_LOST_DEVICE, ch->path, NULL);
 	xpt_free_path(ch->path);
 	xpt_bus_deregister(cam_sim_path(ch->sim));
 	cam_sim_free(ch->sim, /*free_devq*/TRUE);
 	ch->sim = NULL;
 	mtx_unlock(&ch->state_mtx);
 #endif
 
     /* release resources */
     bus_teardown_intr(dev, ch->r_irq, ch->ih);
     bus_release_resource(dev, SYS_RES_IRQ, ATA_IRQ_RID, ch->r_irq);
     ch->r_irq = NULL;
 
     /* free DMA resources if DMA HW present*/
     if (ch->dma.free)
 	ch->dma.free(dev);
 
     mtx_destroy(&ch->state_mtx);
     mtx_destroy(&ch->queue_mtx);
     return 0;
 }
 
 /*
  * Connection-change task handler.  `context` is the channel's device_t;
  * `dummy` is unused.
  *
  * With ATA_CAM: under ch->state_mtx, bail out if the SIM is already gone
  * (ch->sim == NULL), otherwise reinitialize the channel and ask CAM to
  * rescan the whole bus (wildcard target/LUN).  Every exit path releases
  * ch->state_mtx; previously the two failure returns after ata_reinit()
  * left it locked.
  *
  * Without ATA_CAM: simply reinitialize the channel.
  */
 static void
 ata_conn_event(void *context, int dummy)
 {
 	device_t dev = (device_t)context;
 #ifdef ATA_CAM
 	struct ata_channel *ch = device_get_softc(dev);
 	union ccb *ccb;
 
 	mtx_lock(&ch->state_mtx);
 	if (ch->sim == NULL) {
 		mtx_unlock(&ch->state_mtx);
 		return;
 	}
 	ata_reinit(dev);
 	if ((ccb = xpt_alloc_ccb_nowait()) == NULL) {
 		/* No CCB for the rescan; drop the lock before giving up. */
 		mtx_unlock(&ch->state_mtx);
 		return;
 	}
 	if (xpt_create_path(&ccb->ccb_h.path, NULL,
 	    cam_sim_path(ch->sim),
 	    CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD) != CAM_REQ_CMP) {
 		xpt_free_ccb(ccb);
 		/* Path creation failed; drop the lock before giving up. */
 		mtx_unlock(&ch->state_mtx);
 		return;
 	}
 	xpt_rescan(ccb);
 	mtx_unlock(&ch->state_mtx);
 #else
 	ata_reinit(dev);
 #endif
 }
 
 /*
  * Reset and reinitialize an ATA channel.
  *
  * Non-CAM build: takes the hardware lock via ATA_LOCKING, stalls the
  * queue, resets the hardware, re-probes each attached child (deleting
  * those whose ATA_REINIT fails), requeues the interrupted request if it
  * is still usable, and finally restarts the queue.  Returns ENXIO when
  * the channel is invalid or a reinit/detach is already in progress
  * (ATA_STALL_QUEUE set), otherwise 0.
  *
  * CAM build: freezes the SIM queue, terminates the running request with
  * ERESTART, resets the hardware, reports AC_BUS_RESET and releases the
  * SIM queue.  No lock is taken here; the callers visible in this file
  * (ata_conn_event, ata_resume) hold ch->state_mtx around the call.
  * Always returns 0.
  */
 int
 ata_reinit(device_t dev)
 {
     struct ata_channel *ch = device_get_softc(dev);
     struct ata_request *request;
 #ifndef ATA_CAM
     device_t *children;
     int nchildren, i;
 
     /* check that we have a valid channel to reinit */
     if (!ch || !ch->r_irq)
 	return ENXIO;
 
     if (bootverbose)
 	device_printf(dev, "reiniting channel ..\n");
 
     /* poll for locking the channel */
     while (ATA_LOCKING(dev, ATA_LF_LOCK) != ch->unit)
 	pause("atarini", 1);
 
     /* catch eventual request in ch->running */
     mtx_lock(&ch->state_mtx);
     if (ch->state & ATA_STALL_QUEUE) {
 	/* Recursive reinits and reinits during detach prohibited. */
 	mtx_unlock(&ch->state_mtx);
 	return (ENXIO);
     }
     /* stop the timeout callout of the request we are taking over */
     if ((request = ch->running))
 	callout_stop(&request->callout);
     ch->running = NULL;
 
     /* unconditionally grab the channel lock */
     ch->state |= ATA_STALL_QUEUE;
     mtx_unlock(&ch->state_mtx);
 
     /* reset the controller HW, the channel and device(s) */
     ATA_RESET(dev);
 
     /* reinit the children and delete any that fails */
     if (!device_get_children(dev, &children, &nchildren)) {
 	mtx_lock(&Giant);       /* newbus suckage it needs Giant */
 	for (i = 0; i < nchildren; i++) {
 	    /* did any children go missing ? */
 	    if (children[i] && device_is_attached(children[i]) &&
 		ATA_REINIT(children[i])) {
 		/*
 		 * if we had a running request and its device matches
 		 * this child we need to inform the request that the 
 		 * device is gone.
 		 */
 		if (request && request->dev == children[i]) {
 		    request->result = ENXIO;
 		    device_printf(request->dev, "FAILURE - device detached\n");
 
 		    /* if not timeout finish request here */
 		    if (!(request->flags & ATA_R_TIMEOUT))
 			    ata_finish(request);
 		    /* request is dealt with; do not requeue it below */
 		    request = NULL;
 		}
 		device_delete_child(dev, children[i]);
 	    }
 	}
 	free(children, M_TEMP);
 	mtx_unlock(&Giant);     /* newbus suckage dealt with, release Giant */
     }
 
     /* if we still have a good request put it on the queue again */
     if (request && !(request->flags & ATA_R_TIMEOUT)) {
 	device_printf(request->dev,
 		      "WARNING - %s requeued due to channel reset",
 		      ata_cmd2str(request));
 	/* the LBA is only printed for plain ATA (non-ATAPI, non-control) */
 	if (!(request->flags & (ATA_R_ATAPI | ATA_R_CONTROL)))
 	    printf(" LBA=%ju", request->u.ata.lba);
 	printf("\n");
 	request->flags |= ATA_R_REQUEUE;
 	ata_queue_request(request);
     }
 
     /* we're done release the channel for new work */
     mtx_lock(&ch->state_mtx);
     ch->state = ATA_IDLE;	/* this also clears ATA_STALL_QUEUE */
     mtx_unlock(&ch->state_mtx);
     ATA_LOCKING(dev, ATA_LF_UNLOCK);
 
     /* Add new children. */
 /*    ata_identify(dev); */
 
     if (bootverbose)
 	device_printf(dev, "reinit done ..\n");
 
     /* kick off requests on the queue */
     ata_start(dev);
 #else
 	/* hold off new CAM requests while the channel is being reset */
 	xpt_freeze_simq(ch->sim, 1);
 	if ((request = ch->running)) {
 		/* abort the in-flight request and hand it back as ERESTART */
 		ch->running = NULL;
 		if (ch->state == ATA_ACTIVE)
 		    ch->state = ATA_IDLE;
 		callout_stop(&request->callout);
 		if (ch->dma.unload)
 		    ch->dma.unload(request);
 		request->result = ERESTART;
 		ata_cam_end_transaction(dev, request);
 	}
 	/* reset the controller HW, the channel and device(s) */
 	ATA_RESET(dev);
 	/* Tell the XPT about the event */
 	xpt_async(AC_BUS_RESET, ch->path, NULL);
 	xpt_release_simq(ch->sim, TRUE);
 #endif
 	return(0);
 }
 
 /*
  * Suspend an ATA channel.
  *
  * Returns ENXIO when dev is NULL or has no softc, otherwise 0.
  *
  * CAM build: drains the periodic poll callout (if used), freezes the SIM
  * queue and sleeps on state_mtx until the channel state is ATA_IDLE.
  *
  * Non-CAM build: loops while the channel still has an IRQ (i.e. is not
  * detached) until it can move the channel from ATA_IDLE to ATA_ACTIVE,
  * then issues ATA_LOCKING(dev, ATA_LF_UNLOCK).
  */
 int
 ata_suspend(device_t dev)
 {
     struct ata_channel *ch;
 
     /* check for valid device */
     if (!dev || !(ch = device_get_softc(dev)))
 	return ENXIO;
 
 #ifdef ATA_CAM
 	if (ch->flags & ATA_PERIODIC_POLL)
 		callout_drain(&ch->poll_callout);
 	mtx_lock(&ch->state_mtx);
 	xpt_freeze_simq(ch->sim, 1);
 	/* re-check the state every hz/100 ticks until the channel is idle */
 	while (ch->state != ATA_IDLE)
 		msleep(ch, &ch->state_mtx, PRIBIO, "atasusp", hz/100);
 	mtx_unlock(&ch->state_mtx);
 #else
     /* wait for the channel to be IDLE or detached before suspending */
     while (ch->r_irq) {
 	mtx_lock(&ch->state_mtx);
 	if (ch->state == ATA_IDLE) {
 	    /* claim the channel so no new request is started on it */
 	    ch->state = ATA_ACTIVE;
 	    mtx_unlock(&ch->state_mtx);
 	    break;
 	}
 	mtx_unlock(&ch->state_mtx);
 	tsleep(ch, PRIBIO, "atasusp", hz/10);
     }
     ATA_LOCKING(dev, ATA_LF_UNLOCK);
 #endif
     return(0);
 }
 
 /*
  * Resume an ATA channel after suspend.
  *
  * Returns ENXIO when dev is NULL or has no softc; otherwise returns the
  * result of ata_reinit().
  *
  * CAM build: reinitializes the channel under state_mtx, releases the SIM
  * queue (ata_suspend() froze it) and re-arms the periodic poll callout
  * for channels flagged ATA_PERIODIC_POLL.
  *
  * Non-CAM build: reinitializes the channel and restarts the queue.
  */
 int
 ata_resume(device_t dev)
 {
     struct ata_channel *ch;
     int error;
 
     /* check for valid device */
     if (!dev || !(ch = device_get_softc(dev)))
 	return ENXIO;
 
 #ifdef ATA_CAM
 	mtx_lock(&ch->state_mtx);
 	error = ata_reinit(dev);
 	xpt_release_simq(ch->sim, TRUE);
 	mtx_unlock(&ch->state_mtx);
 	if (ch->flags & ATA_PERIODIC_POLL)
 		callout_reset(&ch->poll_callout, hz, ata_periodic_poll, ch);
 #else
     /* reinit the devices, we don't know what mode/state they are in */
     error = ata_reinit(dev);
     /* kick off requests on the queue */
     ata_start(dev);
 #endif
     return error;
 }
 
 /*
  * Channel interrupt entry point; `data` is the struct ata_channel.
  *
  * In the CAM build the real handler runs with ch->state_mtx held and is
  * bracketed by xpt_batch_start()/xpt_batch_done() on the channel's SIM.
  * In the non-CAM build this is a plain forwarder to
  * ata_interrupt_locked().
  */
 #ifdef ATA_CAM
 void
 ata_interrupt(void *data)
 {
     struct ata_channel *chan = (struct ata_channel *)data;
 
     mtx_lock(&chan->state_mtx);
     xpt_batch_start(chan->sim);
     ata_interrupt_locked(data);
     xpt_batch_done(chan->sim);
     mtx_unlock(&chan->state_mtx);
 }
 #else
 void
 ata_interrupt(void *data)
 {
     ata_interrupt_locked(data);
 }
 #endif
 
 /*
  * Core interrupt handler; `data` is the struct ata_channel.
  *
  * In the CAM build the caller (ata_interrupt) already holds
  * ch->state_mtx.  In the non-CAM build this function takes the lock
  * itself and drops it on every exit path.
  *
  * The do { } while (0) is only a breakable scope: each "not for us"
  * condition breaks out to the common exit at the bottom.
  */
 static void
 ata_interrupt_locked(void *data)
 {
     struct ata_channel *ch = (struct ata_channel *)data;
     struct ata_request *request;
 
 #ifndef ATA_CAM
     mtx_lock(&ch->state_mtx);
 #endif
     do {
 	/* ignore interrupt if it's not for us */
 	if (ch->hw.status && !ch->hw.status(ch->dev))
 	    break;
 
 	/* do we have a running request */
 	if (!(request = ch->running))
 	    break;
 
 	ATA_DEBUG_RQ(request, "interrupt");
 
 	/* safety check for the right state */
 	if (ch->state == ATA_IDLE) {
 	    device_printf(request->dev, "interrupt on idle channel ignored\n");
 	    break;
 	}
 
 	/*
 	 * we have the HW locks, so end the transaction for this request
 	 * if it finishes immediately otherwise wait for next interrupt
 	 */
 	if (ch->hw.end_transaction(request) == ATA_OP_FINISHED) {
 	    ch->running = NULL;
 	    if (ch->state == ATA_ACTIVE)
 		ch->state = ATA_IDLE;
 #ifdef ATA_CAM
 	    ata_cam_end_transaction(ch->dev, request);
 #else
 	    /* drop state_mtx and the HW lock before completing the request */
 	    mtx_unlock(&ch->state_mtx);
 	    ATA_LOCKING(ch->dev, ATA_LF_UNLOCK);
 	    ata_finish(request);
 #endif
 	    /* the non-CAM path already unlocked above, so skip the common exit */
 	    return;
 	}
     } while (0);
 #ifndef ATA_CAM
     mtx_unlock(&ch->state_mtx);
 #endif
 }
 
 #ifdef ATA_CAM
 /*
  * Callout handler used for channels that are polled periodically;
  * `data` is the struct ata_channel.  It re-arms its own callout with a
  * delay of hz and then runs the normal interrupt path on the channel.
  */
 static void
 ata_periodic_poll(void *data)
 {
     struct ata_channel *chan = (struct ata_channel *)data;
 
     /* re-arm first, then service the channel as if it had interrupted */
     callout_reset(&chan->poll_callout, hz, ata_periodic_poll, chan);
     ata_interrupt(chan);
 }
 #endif
 
 /*
  * Log that DMA is being capped at UDMA33 because `who` (a short label
  * naming the party that detected it) found a non-ATA66 cable.
  */
 void
 ata_print_cable(device_t dev, u_int8_t *who)
 {
     device_printf(dev, "DMA limited to UDMA33, %s found non-ATA66 cable\n",
 	who);
 }
 
 #ifndef ATA_CAM
 /*
  * Clamp `mode` to ATA_UDMA2 when the device does not report the cable ID
  * bit (ATA_CABLE_ID in param.hwres) and a mode above UDMA2 was requested.
  * When the ata_dma_check_80pin knob is off the check is skipped and
  * `mode` is returned untouched.
  */
 int
 ata_check_80pin(device_t dev, int mode)
 {
     struct ata_device *drive = device_get_softc(dev);
 
     if (ata_dma_check_80pin) {
         if (mode > ATA_UDMA2 && (drive->param.hwres & ATA_CABLE_ID) == 0) {
             ata_print_cable(dev, "device");
             return ATA_UDMA2;
         }
         return mode;
     }
 
     if (bootverbose)
         device_printf(dev, "Skipping 80pin cable check\n");
     return mode;
 }
 #endif
 
 #ifndef ATA_CAM
 /*
  * Negotiate and program the transfer mode for a device.
  *
  * Starting from atadev->mode, repeatedly: limit the mode to what the
  * device supports (capped at ATA_DMA_MAX), offer it to the controller
  * via ATA_SETMODE, and - unless the channel checks cables itself or is
  * SATA - apply the 80-pin cable check.  The loop ends once a round
  * leaves the offered mode unchanged.  The result is sent to the device
  * with SETFEATURES/SETXFER and stored in atadev->mode, even if that
  * command reports an error.
  */
 void
 ata_setmode(device_t dev)
 {
 	device_t bus = device_get_parent(dev);
 	struct ata_channel *ch = device_get_softc(bus);
 	struct ata_device *atadev = device_get_softc(dev);
 	int offered, accepted, failed;
 
 	accepted = atadev->mode;
 	for (;;) {
 		offered = ata_limit_mode(dev, accepted, ATA_DMA_MAX);
 		accepted = ATA_SETMODE(bus, atadev->unit, offered);
 		if ((ch->flags & (ATA_CHECKS_CABLE | ATA_SATA)) == 0)
 			accepted = ata_check_80pin(dev, accepted);
 		/* Settled once nobody lowered what we offered. */
 		if (accepted == offered)
 			break;
 	}
 	failed = ata_controlcmd(dev, ATA_SETFEATURES, ATA_SF_SETXFER, 0,
 	    accepted);
 	if (bootverbose)
 	        device_printf(dev, "%ssetting %s\n",
 		    failed ? "FAILURE " : "", ata_mode2str(accepted));
 	atadev->mode = accepted;
 }
 #endif
 
 /*
  * device related interfaces
  */
 #ifndef ATA_CAM
 /*
  * ioctl handler for the ATA control device.
  *
  * `data` is interpreted per command: as an int (channel number in, or
  * max-channel out) or as a struct ata_ioc_devices for IOCATADEVICES.
  * Channel numbers are validated against devclass_get_maxunit() and the
  * channel must be attached, else ENXIO is returned.
  *
  * Unknown commands are passed to ata_raid_ioctl_func when it is set;
  * otherwise the result is ENOTTY.
  */
 static int
 ata_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
 	  int32_t flag, struct thread *td)
 {
     device_t device, *children;
     struct ata_ioc_devices *devices = (struct ata_ioc_devices *)data;
     int *value = (int *)data;
     int i, nchildren, error = ENOTTY;
 
     switch (cmd) {
     case IOCATAGMAXCHANNEL:
 	/* In case we have channel 0..n this will return n+1. */
 	*value = devclass_get_maxunit(ata_devclass);
 	error = 0;
 	break;
 
     case IOCATAREINIT:
 	if (*value >= devclass_get_maxunit(ata_devclass) ||
 	    !(device = devclass_get_device(ata_devclass, *value)) ||
 	    !device_is_attached(device))
 	    return ENXIO;
 	error = ata_reinit(device);
 	break;
 
     case IOCATAATTACH:
 	if (*value >= devclass_get_maxunit(ata_devclass) ||
 	    !(device = devclass_get_device(ata_devclass, *value)) ||
 	    !device_is_attached(device))
 	    return ENXIO;
 	error = DEVICE_ATTACH(device);
 	break;
 
     case IOCATADETACH:
 	if (*value >= devclass_get_maxunit(ata_devclass) ||
 	    !(device = devclass_get_device(ata_devclass, *value)) ||
 	    !device_is_attached(device))
 	    return ENXIO;
 	error = DEVICE_DETACH(device);
 	break;
 
     case IOCATADEVICES:
 	if (devices->channel >= devclass_get_maxunit(ata_devclass) ||
 	    !(device = devclass_get_device(ata_devclass, devices->channel)) ||
 	    !device_is_attached(device))
 	    return ENXIO;
 	/* start from empty name/params for both slots (0: master, 1: slave) */
 	bzero(devices->name[0], 32);
 	bzero(&devices->params[0], sizeof(struct ata_params));
 	bzero(devices->name[1], 32);
 	bzero(&devices->params[1], sizeof(struct ata_params));
 	if (!device_get_children(device, &children, &nchildren)) {
 	    for (i = 0; i < nchildren; i++) {
 		if (children[i] && device_is_attached(children[i])) {
 		    struct ata_device *atadev = device_get_softc(children[i]);
 
 		    if (atadev->unit == ATA_MASTER) { /* XXX SOS PM */
 			/*
 			 * NOTE(review): strncpy() leaves no terminating NUL
 			 * if the name is 32 bytes or longer - confirm the
 			 * consumers tolerate that.
 			 */
 			strncpy(devices->name[0],
 				device_get_nameunit(children[i]), 32);
 			bcopy(&atadev->param, &devices->params[0],
 			      sizeof(struct ata_params));
 		    }
 		    if (atadev->unit == ATA_SLAVE) { /* XXX SOS PM */
 			strncpy(devices->name[1],
 				device_get_nameunit(children[i]), 32);
 			bcopy(&atadev->param, &devices->params[1],
 			      sizeof(struct ata_params));
 		    }
 		}
 	    }
 	    free(children, M_TEMP);
 	    error = 0;
 	}
 	else
 	    error = ENODEV;
 	break;
 
     default:
 	/* error stays ENOTTY when no RAID ioctl hook is registered */
 	if (ata_raid_ioctl_func)
 	    error = ata_raid_ioctl_func(cmd, data);
     }
     return error;
 }
 #endif
 
 #ifndef ATA_CAM
 /*
  * Per-device ioctl handler.
  *
  * `data` is interpreted per command: struct ata_ioc_request for
  * IOCATAREQUEST, struct ata_params for IOCATAGPARM, and int for the
  * mode/spindown get/set commands.  Returns 0 on success, ENOTTY for an
  * unknown command, and for IOCATAREQUEST additionally EFBIG (transfer
  * larger than the channel's max_iosize, or DFLTPHYS when that is 0),
  * ENOMEM, or a copyin/copyout error.
  *
  * IOCATAREQUEST stages the user data through a kernel bounce buffer.
  * That buffer is allocated zeroed: for ATA_CMD_READ all `count` bytes
  * are copied back to userland regardless of how much the device actually
  * transferred, so an uninitialized buffer could disclose stale kernel
  * heap contents.
  */
 int
 ata_device_ioctl(device_t dev, u_long cmd, caddr_t data)
 {
     struct ata_device *atadev = device_get_softc(dev);
     struct ata_channel *ch = device_get_softc(device_get_parent(dev));
     struct ata_ioc_request *ioc_request = (struct ata_ioc_request *)data;
     struct ata_params *params = (struct ata_params *)data;
     int *mode = (int *)data;
     struct ata_request *request;
     caddr_t buf;
     int error;
 
     switch (cmd) {
     case IOCATAREQUEST:
 	if (ioc_request->count >
 	    (ch->dma.max_iosize ? ch->dma.max_iosize : DFLTPHYS)) {
 		return (EFBIG);
 	}
 	/* M_ZERO: never hand uninitialized kernel memory to copyout() */
 	if (!(buf = malloc(ioc_request->count, M_ATA, M_NOWAIT | M_ZERO))) {
 	    return ENOMEM;
 	}
 	if (!(request = ata_alloc_request())) {
 	    free(buf, M_ATA);
 	    return  ENOMEM;
 	}
 	request->dev = atadev->dev;
 	if (ioc_request->flags & ATA_CMD_WRITE) {
 	    error = copyin(ioc_request->data, buf, ioc_request->count);
 	    if (error) {
 		free(buf, M_ATA);
 		ata_free_request(request);
 		return error;
 	    }
 	}
 	/* fill in the command: a 16-byte ATAPI packet or ATA registers */
 	if (ioc_request->flags & ATA_CMD_ATAPI) {
 	    request->flags = ATA_R_ATAPI;
 	    bcopy(ioc_request->u.atapi.ccb, request->u.atapi.ccb, 16);
 	}
 	else {
 	    request->u.ata.command = ioc_request->u.ata.command;
 	    request->u.ata.feature = ioc_request->u.ata.feature;
 	    request->u.ata.lba = ioc_request->u.ata.lba;
 	    request->u.ata.count = ioc_request->u.ata.count;
 	}
 	request->timeout = ioc_request->timeout;
 	request->data = buf;
 	request->bytecount = ioc_request->count;
 	request->transfersize = request->bytecount;
 	if (ioc_request->flags & ATA_CMD_CONTROL)
 	    request->flags |= ATA_R_CONTROL;
 	if (ioc_request->flags & ATA_CMD_READ)
 	    request->flags |= ATA_R_READ;
 	if (ioc_request->flags & ATA_CMD_WRITE)
 	    request->flags |= ATA_R_WRITE;
 	/* results are read back from `request` right after this call */
 	ata_queue_request(request);
 	if (request->flags & ATA_R_ATAPI) {
 	    bcopy(&request->u.atapi.sense, &ioc_request->u.atapi.sense,
 		  sizeof(struct atapi_sense));
 	}
 	else {
 	    ioc_request->u.ata.command = request->u.ata.command;
 	    ioc_request->u.ata.feature = request->u.ata.feature;
 	    ioc_request->u.ata.lba = request->u.ata.lba;
 	    ioc_request->u.ata.count = request->u.ata.count;
 	}
 	ioc_request->error = request->result;
 	if (ioc_request->flags & ATA_CMD_READ)
 	    error = copyout(buf, ioc_request->data, ioc_request->count);
 	else
 	    error = 0;
 	free(buf, M_ATA);
 	ata_free_request(request);
 	return error;
    
     case IOCATAGPARM:
 	/* refresh the identify data, then return a copy of it */
 	ata_getparam(atadev, 0);
 	bcopy(&atadev->param, params, sizeof(struct ata_params));
 	return 0;
 	
     case IOCATASMODE:
 	atadev->mode = *mode;
 	ata_setmode(dev);
 	return 0;
 
     case IOCATAGMODE:
 	/* low byte: transfer mode; next byte up: ATA_GETREV() result */
 	*mode = atadev->mode |
 	    (ATA_GETREV(device_get_parent(dev), atadev->unit) << 8);
 	return 0;
     case IOCATASSPINDOWN:
 	atadev->spindown = *mode;
 	return 0;
     case IOCATAGSPINDOWN:
 	*mode = atadev->spindown;
 	return 0;
     default:
 	return ENOTTY;
     }
 }
 #endif
 
 #ifndef ATA_CAM
 /*
  * Boot-time hook: run ata_identify() on every unit of ata_devclass that
  * has a softc, then disestablish and free the config intrhook that
  * invoked us, since it is only needed once.  Everything runs under
  * Giant.
  */
 static void
 ata_boot_attach(void)
 {
     struct ata_channel *chan;
     int unit;
 
     mtx_lock(&Giant);       /* newbus needs Giant */
 
     /* probe and attach devices on each known channel */
     for (unit = 0; unit < devclass_get_maxunit(ata_devclass); unit++) {
 	chan = devclass_get_softc(ata_devclass, unit);
 	if (chan == NULL)
 	    continue;
 	ata_identify(chan->dev);
     }
 
     /* one-shot: drop the hook that got us called */
     if (ata_delayed_attach != NULL) {
 	config_intrhook_disestablish(ata_delayed_attach);
 	free(ata_delayed_attach, M_TEMP);
 	ata_delayed_attach = NULL;
     }
 
     mtx_unlock(&Giant);     /* done with newbus, release Giant */
 }
 #endif
 
 /*
  * misc support functions
  */
 #ifndef ATA_CAM
 /*
  * Create a child device under `parent` for `atadev`.
  *
  * A negative `unit` adds the child with a NULL driver name; otherwise the
  * child is named "ad" with that unit number.  On success the child gets
  * `atadev` as its softc, is marked quiet, and the ata_device is
  * initialized with max_iosize DEV_BSIZE and mode ATA_PIO_MAX.
  *
  * Returns the new child, or NULL if device_add_child() failed.
  */
 static device_t
 ata_add_child(device_t parent, struct ata_device *atadev, int unit)
 {
     const char *drvname = (unit < 0) ? NULL : "ad";
     device_t kid;
 
     kid = device_add_child(parent, drvname, unit);
     if (kid == NULL)
 	return kid;
 
     device_set_softc(kid, atadev);
     device_quiet(kid);
     atadev->dev = kid;
     atadev->max_iosize = DEV_BSIZE;
     atadev->mode = ATA_PIO_MAX;
     return kid;
 }
 #endif
 
 #ifndef ATA_CAM
 /*
  * Fetch and normalize the IDENTIFY data of a device into atadev->param.
  *
  * The identify command (ATA or ATAPI) is picked from the channel's
  * ch->devices bitmap for this unit; ENXIO is returned when neither bit
  * is set.  The request is tried up to two times.
  *
  * On success the data is converted from little-endian 16-bit words, the
  * model/revision/serial strings are byte-swapped (except for a few
  * vendors), trimmed and packed.  When `init` is non-zero the device
  * description and the initial transfer mode are also set, the latter
  * overridable through the "devN.mode" / "mode" resource hints.
  *
  * Returns 0 on success, ENXIO when the data does not look valid (first
  * two model bytes not printable), ENOMEM when no request could be
  * allocated, or the request's result code.
  */
 int
 ata_getparam(struct ata_device *atadev, int init)
 {
     struct ata_channel *ch = device_get_softc(device_get_parent(atadev->dev));
     struct ata_request *request;
     const char *res;
     char buf[64];
     u_int8_t command = 0;
     int error = ENOMEM, retries = 2, mode = -1;
 
     /* if both bits are set for this unit the ATAPI command wins */
     if (ch->devices & (ATA_ATA_MASTER << atadev->unit))
 	command = ATA_ATA_IDENTIFY;
     if (ch->devices & (ATA_ATAPI_MASTER << atadev->unit))
 	command = ATA_ATAPI_IDENTIFY;
     if (!command)
 	return ENXIO;
 
     /* error starts as ENOMEM so a failed first allocation is reported */
     while (retries-- > 0 && error) {
 	if (!(request = ata_alloc_request()))
 	    break;
 	request->dev = atadev->dev;
 	request->timeout = 1;
 	request->retries = 0;
 	request->u.ata.command = command;
 	request->flags = (ATA_R_READ|ATA_R_AT_HEAD|ATA_R_DIRECT);
 	if (!bootverbose)
 	    request->flags |= ATA_R_QUIET;
 	/* the device data lands directly in atadev->param */
 	request->data = (void *)&atadev->param;
 	request->bytecount = sizeof(struct ata_params);
 	request->donecount = 0;
 	request->transfersize = DEV_BSIZE;
 	ata_queue_request(request);
 	error = request->result;
 	ata_free_request(request);
     }
 
     /* sanity check: at least one of the first two model bytes is printable */
     if (!error && (isprint(atadev->param.model[0]) ||
 		   isprint(atadev->param.model[1]))) {
 	struct ata_params *atacap = &atadev->param;
 	int16_t *ptr;
 
 	/* convert every 16-bit word of the structure to host byte order */
 	for (ptr = (int16_t *)atacap;
 	     ptr < (int16_t *)atacap + sizeof(struct ata_params)/2; ptr++) {
 	    *ptr = le16toh(*ptr);
 	}
 	/* models starting with these prefixes are exempt from the swap */
 	if (!(!strncmp(atacap->model, "FX", 2) ||
 	      !strncmp(atacap->model, "NEC", 3) ||
 	      !strncmp(atacap->model, "Pioneer", 7) ||
 	      !strncmp(atacap->model, "SHARP", 5))) {
 	    bswap(atacap->model, sizeof(atacap->model));
 	    bswap(atacap->revision, sizeof(atacap->revision));
 	    bswap(atacap->serial, sizeof(atacap->serial));
 	}
 	/* strip padding and collapse blank runs in the ID strings, in place */
 	btrim(atacap->model, sizeof(atacap->model));
 	bpack(atacap->model, atacap->model, sizeof(atacap->model));
 	btrim(atacap->revision, sizeof(atacap->revision));
 	bpack(atacap->revision, atacap->revision, sizeof(atacap->revision));
 	btrim(atacap->serial, sizeof(atacap->serial));
 	bpack(atacap->serial, atacap->serial, sizeof(atacap->serial));
 
 	if (bootverbose)
 	    printf("ata%d-%s: pio=%s wdma=%s udma=%s cable=%s wire\n",
 		   device_get_unit(ch->dev),
 		   ata_unit2str(atadev),
 		   ata_mode2str(ata_pmode(atacap)),
 		   ata_mode2str(ata_wmode(atacap)),
 		   ata_mode2str(ata_umode(atacap)),
 		   (atacap->hwres & ATA_CABLE_ID) ? "80":"40");
 
 	if (init) {
 	    char buffer[64];
 
 	    /* at most 40 + 1 + 8 characters plus NUL, so 64 bytes suffice */
 	    sprintf(buffer, "%.40s/%.8s", atacap->model, atacap->revision);
 	    device_set_desc_copy(atadev->dev, buffer);
 	    /* ATAPI device (config has the ATAPI bit and is not a CFA magic) */
 	    if ((atadev->param.config & ATA_PROTO_ATAPI) &&
 		(atadev->param.config != ATA_CFA_MAGIC1) &&
 		(atadev->param.config != ATA_CFA_MAGIC2)) {
 		if (atapi_dma &&
 		    (atadev->param.config & ATA_DRQ_MASK) != ATA_DRQ_INTR &&
 		    ata_umode(&atadev->param) >= ATA_UDMA2)
 		    atadev->mode = ATA_DMA_MAX;
 	    }
 	    else {
 		if (ata_dma &&
 		    (ata_umode(&atadev->param) > 0 ||
 		     ata_wmode(&atadev->param) > 0))
 		    atadev->mode = ATA_DMA_MAX;
 	    }
 	    /* a per-device "devN.mode" hint wins over the channel "mode" hint */
 	    snprintf(buf, sizeof(buf), "dev%d.mode", atadev->unit);
 	    if (resource_string_value(device_get_name(ch->dev),
 	        device_get_unit(ch->dev), buf, &res) == 0)
 		    mode = ata_str2mode(res);
 	    else if (resource_string_value(device_get_name(ch->dev),
 		device_get_unit(ch->dev), "mode", &res) == 0)
 		    mode = ata_str2mode(res);
 	    /* mode stays -1 when no hint exists or the hint did not parse */
 	    if (mode >= 0)
 		    atadev->mode = mode;
 	}
     }
     else {
 	/* the command succeeded but the data failed the sanity check */
 	if (!error)
 	    error = ENXIO;
     }
     return error;
 }
 #endif
 
 #ifndef ATA_CAM
 /*
  * Discover devices on a channel and attach drivers to them.
  *
  * ch->devices is a bitmap of detected ATA/ATAPI units.  Units that
  * already have a child device are masked out; for each remaining unit a
  * struct ata_device and a child are created and IDENTIFY data is
  * fetched with ata_getparam().  On a PATA channel with both master and
  * slave present, the master is identified last so that its cable
  * detection sees the slave first.  Children whose identify fails are
  * deleted again.  Finally the generic bus probe/attach is run.
  *
  * The whole operation runs under Giant, which is released on every
  * return path, including the out-of-memory one (which previously
  * returned with Giant still held).
  *
  * Returns 0, or ENOMEM if an ata_device could not be allocated.
  */
 int
 ata_identify(device_t dev)
 {
     struct ata_channel *ch = device_get_softc(dev);
     struct ata_device *atadev;
     device_t *children;
     device_t child, master = NULL;
     int nchildren, i, n = ch->devices;
 
     if (bootverbose)
 	device_printf(dev, "Identifying devices: %08x\n", ch->devices);
 
     mtx_lock(&Giant);
     /* Skip existing devices. */
     if (!device_get_children(dev, &children, &nchildren)) {
 	for (i = 0; i < nchildren; i++) {
 	    if (children[i] && (atadev = device_get_softc(children[i])))
 		n &= ~((ATA_ATA_MASTER | ATA_ATAPI_MASTER) << atadev->unit);
 	}
 	free(children, M_TEMP);
     }
     /* Create new devices. */
     if (bootverbose)
 	device_printf(dev, "New devices: %08x\n", n);
     if (n == 0) {
 	mtx_unlock(&Giant);
 	return (0);
     }
     for (i = 0; i < ATA_PM; ++i) {
 	if (n & (((ATA_ATA_MASTER | ATA_ATAPI_MASTER) << i))) {
 	    int unit = -1;
 
 	    if (!(atadev = malloc(sizeof(struct ata_device),
 				  M_ATA, M_NOWAIT | M_ZERO))) {
 		device_printf(dev, "out of memory\n");
 		/* Giant was taken above; do not leak it on this path. */
 		mtx_unlock(&Giant);
 		return ENOMEM;
 	    }
 	    atadev->unit = i;
 #ifdef ATA_STATIC_ID
 	    /* fixed unit numbers for ATA disks: two per channel */
 	    if (n & (ATA_ATA_MASTER << i))
 		unit = (device_get_unit(dev) << 1) + i;
 #endif
 	    if ((child = ata_add_child(dev, atadev, unit))) {
 		/*
 		 * PATA slave should be identified first, to allow
 		 * device cable detection on master to work properly.
 		 */
 		if (i == 0 && (n & ATA_PORTMULTIPLIER) == 0 &&
 			(n & ((ATA_ATA_MASTER | ATA_ATAPI_MASTER) << 1)) != 0) {
 		    master = child;
 		    continue;
 		}
 		if (ata_getparam(atadev, 1)) {
 		    device_delete_child(dev, child);
 		    free(atadev, M_ATA);
 		}
 	    }
 	    else
 		free(atadev, M_ATA);
 	}
     }
     /* now identify the master that was deferred above, if any */
     if (master) {
 	atadev = device_get_softc(master);
 	if (ata_getparam(atadev, 1)) {
 	    device_delete_child(dev, master);
 	    free(atadev, M_ATA);
 	}
     }
     bus_generic_probe(dev);
     bus_generic_attach(dev);
     mtx_unlock(&Giant);
     return 0;
 }
 #endif
 
 /*
  * Fill in the register slots that share an address with another
  * register by copying the resource and offset from the slot that is
  * already set up:
  *   ERROR <- FEATURE, IREASON <- COUNT, STATUS <- COMMAND,
  *   ALTSTAT <- CONTROL.
  */
 void
 ata_default_registers(device_t dev)
 {
     struct ata_channel *ch = device_get_softc(dev);
     /* { slot to fill, slot to copy from } */
     const int shared[][2] = {
 	{ ATA_ERROR,   ATA_FEATURE },
 	{ ATA_IREASON, ATA_COUNT },
 	{ ATA_STATUS,  ATA_COMMAND },
 	{ ATA_ALTSTAT, ATA_CONTROL },
     };
     unsigned int k;
 
     for (k = 0; k < sizeof(shared) / sizeof(shared[0]); k++) {
 	ch->r_io[shared[k][0]].res = ch->r_io[shared[k][1]].res;
 	ch->r_io[shared[k][0]].offset = ch->r_io[shared[k][1]].offset;
     }
 }
 
 /*
  * Rewrite a request's ATA command into its 48-bit variant when needed.
  *
  * ATA_R_48BIT is cleared first.  If the device advertises 48-bit
  * addressing and the request reaches ATA_MAX_28BIT_LBA or beyond, or
  * transfers more than 256 sectors, the read/write/flush/set-max command
  * is replaced by its 48-bit form.  On channels flagged ATA_NO_48BIT_DMA
  * the DMA commands fall back to PIO instead: the MUL48 form when
  * transfersize exceeds DEV_BSIZE, else the plain 48-bit form, and
  * ATA_R_DMA is cleared.
  *
  * If only the device capability holds (small request), just
  * FLUSHCACHE, READ_NATIVE_MAX_ADDRESS and SET_MAX_ADDRESS are
  * translated.
  *
  * Commands with no 48-bit counterpart are left untouched and
  * ATA_R_48BIT stays clear.
  */
 void
 ata_modify_if_48bit(struct ata_request *request)
 {
     struct ata_channel *ch = device_get_softc(request->parent);
     struct ata_device *atadev = device_get_softc(request->dev);
 
     request->flags &= ~ATA_R_48BIT;
 
     if (((request->u.ata.lba + request->u.ata.count) >= ATA_MAX_28BIT_LBA ||
 	 request->u.ata.count > 256) &&
 	atadev->param.support.command2 & ATA_SUPPORT_ADDRESS48) {
 
 	/* translate command into 48bit version */
 	switch (request->u.ata.command) {
 	case ATA_READ:
 	    request->u.ata.command = ATA_READ48;
 	    break;
 	case ATA_READ_MUL:
 	    request->u.ata.command = ATA_READ_MUL48;
 	    break;
 	case ATA_READ_DMA:
 	    if (ch->flags & ATA_NO_48BIT_DMA) {
 		if (request->transfersize > DEV_BSIZE)
 		    request->u.ata.command = ATA_READ_MUL48;
 		else
 		    request->u.ata.command = ATA_READ48;
 		request->flags &= ~ATA_R_DMA;
 	    }
 	    else
 		request->u.ata.command = ATA_READ_DMA48;
 	    break;
 	case ATA_READ_DMA_QUEUED:
 	    if (ch->flags & ATA_NO_48BIT_DMA) {
 		if (request->transfersize > DEV_BSIZE)
 		    request->u.ata.command = ATA_READ_MUL48;
 		else
 		    request->u.ata.command = ATA_READ48;
 		request->flags &= ~ATA_R_DMA;
 	    }
 	    else
 		request->u.ata.command = ATA_READ_DMA_QUEUED48;
 	    break;
 	case ATA_WRITE:
 	    request->u.ata.command = ATA_WRITE48;
 	    break;
 	case ATA_WRITE_MUL:
 	    request->u.ata.command = ATA_WRITE_MUL48;
 	    break;
 	case ATA_WRITE_DMA:
 	    if (ch->flags & ATA_NO_48BIT_DMA) {
 		if (request->transfersize > DEV_BSIZE)
 		    request->u.ata.command = ATA_WRITE_MUL48;
 		else
 		    request->u.ata.command = ATA_WRITE48;
 		request->flags &= ~ATA_R_DMA;
 	    }
 	    else
 		request->u.ata.command = ATA_WRITE_DMA48;
 	    break;
 	case ATA_WRITE_DMA_QUEUED:
 	    if (ch->flags & ATA_NO_48BIT_DMA) {
 		/*
 		 * Same PIO fallback as ATA_WRITE_DMA.  A stray extra
 		 * assignment of ATA_WRITE48 used to follow this if/else
 		 * and cancelled the ATA_WRITE_MUL48 selection.
 		 */
 		if (request->transfersize > DEV_BSIZE)
 		    request->u.ata.command = ATA_WRITE_MUL48;
 		else
 		    request->u.ata.command = ATA_WRITE48;
 		request->flags &= ~ATA_R_DMA;
 	    }
 	    else
 		request->u.ata.command = ATA_WRITE_DMA_QUEUED48;
 	    break;
 	case ATA_FLUSHCACHE:
 	    request->u.ata.command = ATA_FLUSHCACHE48;
 	    break;
 	case ATA_SET_MAX_ADDRESS:
 	    request->u.ata.command = ATA_SET_MAX_ADDRESS48;
 	    break;
 	default:
 	    /* no 48-bit counterpart: leave the request as it is */
 	    return;
 	}
 	request->flags |= ATA_R_48BIT;
     }
     else if (atadev->param.support.command2 & ATA_SUPPORT_ADDRESS48) {
 
 	/* translate command into 48bit version */
 	switch (request->u.ata.command) {
 	case ATA_FLUSHCACHE:
 	    request->u.ata.command = ATA_FLUSHCACHE48;
 	    break;
 	case ATA_READ_NATIVE_MAX_ADDRESS:
 	    request->u.ata.command = ATA_READ_NATIVE_MAX_ADDRESS48;
 	    break;
 	case ATA_SET_MAX_ADDRESS:
 	    request->u.ata.command = ATA_SET_MAX_ADDRESS48;
 	    break;
 	default:
 	    return;
 	}
 	request->flags |= ATA_R_48BIT;
     }
 }
 
 /*
  * Delay for `interval` (passed straight to DELAY()).
  *
  * The leading "1 ||" makes the condition always true, so DELAY() is
  * always used and the pause() branch is currently never taken; the rest
  * of the condition is kept for when sleeping becomes usable here.
  */
 void
 ata_udelay(int interval)
 {
     /* for now just use DELAY, the timer/sleep subsystems are not there yet */
     if (1 || interval < (1000000/hz) || ata_delayed_attach)
 	DELAY(interval);
     else
 	pause("ataslp", interval/(1000000/hz));
 }
 
 #ifndef ATA_CAM
 /*
  * Return a short name for the device's position on its channel:
  * "portN" when the channel has a port multiplier, otherwise "master" or
  * "slave".
  *
  * The result lives in a static buffer that is overwritten by the next
  * call.
  */
 const char *
 ata_unit2str(struct ata_device *atadev)
 {
     static char name[8];
     struct ata_channel *ch = device_get_softc(device_get_parent(atadev->dev));
 
     if ((ch->devices & ATA_PORTMULTIPLIER) == 0) {
 	if (atadev->unit == ATA_MASTER)
 	    sprintf(name, "%s", "master");
 	else
 	    sprintf(name, "%s", "slave");
     } else
 	sprintf(name, "port%d", atadev->unit);
     return name;
 }
 #endif
 
 /*
  * Map a transfer mode value to its printable name.
  *
  * Known modes (and -1, "UNSUPPORTED") come from a lookup table.  Any
  * other value is reported as "BIOSDMA" when it has a bit of
  * ATA_DMA_MASK set and "BIOSPIO" otherwise.  The returned string is a
  * constant.
  */
 const char *
 ata_mode2str(int mode)
 {
     static const struct {
 	int		value;
 	const char	*label;
     } known[] = {
 	{ -1,		"UNSUPPORTED" },
 	{ ATA_PIO0,	"PIO0" },
 	{ ATA_PIO1,	"PIO1" },
 	{ ATA_PIO2,	"PIO2" },
 	{ ATA_PIO3,	"PIO3" },
 	{ ATA_PIO4,	"PIO4" },
 	{ ATA_WDMA0,	"WDMA0" },
 	{ ATA_WDMA1,	"WDMA1" },
 	{ ATA_WDMA2,	"WDMA2" },
 	{ ATA_UDMA0,	"UDMA16" },
 	{ ATA_UDMA1,	"UDMA25" },
 	{ ATA_UDMA2,	"UDMA33" },
 	{ ATA_UDMA3,	"UDMA40" },
 	{ ATA_UDMA4,	"UDMA66" },
 	{ ATA_UDMA5,	"UDMA100" },
 	{ ATA_UDMA6,	"UDMA133" },
 	{ ATA_SA150,	"SATA150" },
 	{ ATA_SA300,	"SATA300" },
     };
     unsigned int k;
 
     for (k = 0; k < sizeof(known) / sizeof(known[0]); k++) {
 	if (known[k].value == mode)
 	    return known[k].label;
     }
 
     /* not a mode we know by name: classify it as DMA or PIO */
     return (mode & ATA_DMA_MASK) ? "BIOSDMA" : "BIOSPIO";
 }
 
 /*
  * Parse a transfer mode name (case-insensitive) into its ATA_* value.
  *
  * PIO0-4, WDMA0-2 and UDMA0-6 are accepted; UDMA modes may be given by
  * number ("UDMA4") or by speed ("UDMA66").  "UDMA40" is accepted as an
  * alias for UDMA3 in addition to "UDMA44", because that is the name
  * ata_mode2str() prints for ATA_UDMA3 - without it the driver's own
  * output could not be fed back in.
  *
  * Returns -1 if the string matches no known mode.
  */
 int
 ata_str2mode(const char *str)
 {
 	static const struct {
 		const char	*name;
 		int		mode;
 	} names[] = {
 		{ "PIO0",	ATA_PIO0 },
 		{ "PIO1",	ATA_PIO1 },
 		{ "PIO2",	ATA_PIO2 },
 		{ "PIO3",	ATA_PIO3 },
 		{ "PIO4",	ATA_PIO4 },
 		{ "WDMA0",	ATA_WDMA0 },
 		{ "WDMA1",	ATA_WDMA1 },
 		{ "WDMA2",	ATA_WDMA2 },
 		{ "UDMA0",	ATA_UDMA0 },
 		{ "UDMA16",	ATA_UDMA0 },
 		{ "UDMA1",	ATA_UDMA1 },
 		{ "UDMA25",	ATA_UDMA1 },
 		{ "UDMA2",	ATA_UDMA2 },
 		{ "UDMA33",	ATA_UDMA2 },
 		{ "UDMA3",	ATA_UDMA3 },
 		{ "UDMA44",	ATA_UDMA3 },
 		{ "UDMA40",	ATA_UDMA3 },	/* as printed by ata_mode2str() */
 		{ "UDMA4",	ATA_UDMA4 },
 		{ "UDMA66",	ATA_UDMA4 },
 		{ "UDMA5",	ATA_UDMA5 },
 		{ "UDMA100",	ATA_UDMA5 },
 		{ "UDMA6",	ATA_UDMA6 },
 		{ "UDMA133",	ATA_UDMA6 },
 	};
 	unsigned int k;
 
 	/* matches are exact (whole string), so table order is irrelevant */
 	for (k = 0; k < sizeof(names) / sizeof(names[0]); k++) {
 		if (!strcasecmp(str, names[k].name))
 			return (names[k].mode);
 	}
 	return (-1);
 }
 
 #ifndef ATA_CAM
 /*
  * Map a SATA revision code to a printable string: 0 gives the empty
  * string, 1-3 give the link speed, 0xff gives plain "SATA", and any
  * other value gives "???".
  */
 const char *
 ata_satarev2str(int rev)
 {
 	static const char *const speed[] = {
 		"",
 		"SATA 1.5Gb/s",
 		"SATA 3Gb/s",
 		"SATA 6Gb/s",
 	};
 
 	if (rev >= 0 && rev <= 3)
 		return speed[rev];
 	if (rev == 0xff)
 		return "SATA";
 	return "???";
 }
 #endif
 
 /*
  * Test whether unit `target` on the channel is flagged as an ATAPI
  * device in ch->devices.  Returns the masked bit (non-zero if set), not
  * a normalized 0/1.
  */
 int
 ata_atapi(device_t dev, int target)
 {
     struct ata_channel *ch = device_get_softc(dev);
     int atapi_bit = ATA_ATAPI_MASTER << target;
 
     return (ch->devices & atapi_bit);
 }
 
 #ifndef ATA_CAM
 /*
  * Derive the best PIO mode from IDENTIFY data.
  *
  * Checked in this order: the advanced PIO mode bits (only when the
  * ATA_FLAG_64_70 valid flag is set), then the multiword DMA mode bits
  * (each WDMA level maps to a PIO level), then the retired PIO mode
  * field.  Falls back to ATA_PIO0.
  */
 int
 ata_pmode(struct ata_params *ap)
 {
     if (ap->atavalid & ATA_FLAG_64_70) {
 	if (ap->apiomodes & 0x02)
 	    return ATA_PIO4;
 	if (ap->apiomodes & 0x01)
 	    return ATA_PIO3;
     }
 
     if (ap->mwdmamodes & 0x04)
 	return ATA_PIO4;
     if (ap->mwdmamodes & 0x02)
 	return ATA_PIO3;
     if (ap->mwdmamodes & 0x01)
 	return ATA_PIO2;
 
     switch (ap->retired_piomode & ATA_RETIRED_PIO_MASK) {
     case 0x200:
 	return ATA_PIO2;
     case 0x100:
 	return ATA_PIO1;
     default:
 	/* 0x000 and anything unrecognized both mean PIO0 */
 	return ATA_PIO0;
     }
 }
 #endif
 
 #ifndef ATA_CAM
 /*
  * Return the highest multiword DMA mode advertised in ap->mwdmamodes
  * (bit 2 = WDMA2, bit 1 = WDMA1, bit 0 = WDMA0), or -1 if none is set.
  */
 int
 ata_wmode(struct ata_params *ap)
 {
     static const int wdma[] = { ATA_WDMA0, ATA_WDMA1, ATA_WDMA2 };
     int bit;
 
     /* scan from the fastest mode downwards */
     for (bit = 2; bit >= 0; bit--) {
 	if (ap->mwdmamodes & (1 << bit))
 	    return wdma[bit];
     }
     return -1;
 }
 #endif
 
 #ifndef ATA_CAM
 /*
  * Return the highest Ultra DMA mode advertised in ap->udmamodes (bit N
  * = UDMA N, for N in 0..6), or -1 if the ATA_FLAG_88 valid flag is
  * clear or no mode bit is set.
  */
 int
 ata_umode(struct ata_params *ap)
 {
     static const int udma[] = {
 	ATA_UDMA0, ATA_UDMA1, ATA_UDMA2, ATA_UDMA3,
 	ATA_UDMA4, ATA_UDMA5, ATA_UDMA6,
     };
     int bit;
 
     if (!(ap->atavalid & ATA_FLAG_88))
 	return -1;
 
     /* scan from the fastest mode downwards */
     for (bit = 6; bit >= 0; bit--) {
 	if (ap->udmamodes & (1 << bit))
 	    return udma[bit];
     }
     return -1;
 }
 #endif
 
 #ifndef ATA_CAM
 /*
  * Limit a requested transfer mode to what the device supports.
  *
  * `maxmode`, when non-zero, is an upper bound applied first.  The result
  * is then capped by the device's best UDMA mode (for UDMA requests),
  * else its best WDMA mode (for WDMA-or-higher requests), else its best
  * PIO mode.
  */
 int
 ata_limit_mode(device_t dev, int mode, int maxmode)
 {
     struct ata_device *atadev = device_get_softc(dev);
     struct ata_params *cap = &atadev->param;
     int best;
 
     if (maxmode && mode > maxmode)
 	mode = maxmode;
 
     if (mode >= ATA_UDMA0) {
 	best = ata_umode(cap);
 	if (best > 0)
 	    return min(mode, best);
     }
 
     if (mode >= ATA_WDMA0) {
 	best = ata_wmode(cap);
 	if (best > 0)
 	    return min(mode, best);
     }
 
     best = ata_pmode(cap);
     if (mode > best)
 	return min(mode, best);
 
     return mode;
 }
 #endif
 
 #ifndef ATA_CAM
 /*
  * Apply ntohs() in place to each 16-bit word of buf[0..len), walking
  * from the end of the buffer towards the start.  Words are aligned to
  * the end of the buffer, so with an odd `len` the first byte is left
  * alone.
  */
 static void
 bswap(int8_t *buf, int len)
 {
     int off;
 
     for (off = len - 2; off >= 0; off -= 2) {
 	u_int16_t *word = (u_int16_t *)(buf + off);
 
 	*word = ntohs(*word);
     }
 }
 #endif
 
 #ifndef ATA_CAM
 /*
  * Normalize a fixed-width ID string in place: every NUL or '_' in
  * buf[0..len) becomes a blank, then the trailing run of blanks is
  * replaced by NUL bytes.
  */
 static void
 btrim(int8_t *buf, int len)
 {
     int i;
 
     for (i = 0; i < len; i++) {
 	if (buf[i] == 0 || buf[i] == '_')
 	    buf[i] = ' ';
     }
 
     /* back up over the trailing blanks, zeroing them as we go */
     i = len;
     while (i > 0 && buf[i - 1] == ' ')
 	buf[--i] = 0;
 }
 #endif
 
 #ifndef ATA_CAM
 /*
  * Copy src[0..len) to dst while squeezing blanks: a run of blanks at the
  * very start is dropped entirely, and every other run of blanks is
  * reduced to a single blank.  All other bytes (including NULs) are
  * copied as they are.  If the output is shorter than `len` it is
  * NUL-terminated.
  *
  * dst may be the same buffer as src: the write position never gets
  * ahead of the read position.
  */
 static void
 bpack(int8_t *src, int8_t *dst, int len)
 {
     int rd, wr = 0, in_run = 0;
 
     for (rd = 0; rd < len; rd++) {
 	int8_t ch = src[rd];
 
 	if (ch == ' ') {
 	    if (in_run)
 		continue;		/* 2nd+ blank of a run */
 	    in_run = 1;
 	    if (rd == 0)
 		continue;		/* leading blank */
 	} else
 	    in_run = 0;
 	dst[wr++] = ch;
     }
     if (wr < len)
 	dst[wr] = 0x00;
 }
 #endif
 
 #ifdef ATA_CAM
 /*
  * Turn a CAM CCB into an ata_request and start it on the channel.
  *
  * XPT_ATA_IO CCBs are translated field by field into an ATA register
  * command; any other function code is handled as an ATAPI packet taken
  * from the CCB's CDB.  If no request can be allocated the CCB is
  * completed with CAM_REQ_INVALID.
  *
  * The request becomes ch->running and the channel goes ATA_ACTIVE; if
  * the hardware reports the transaction finished right away it is
  * completed here via ata_cam_end_transaction().
  *
  * NOTE(review): ch->running/ch->state are written without taking a lock
  * here - presumably the caller holds ch->state_mtx; confirm against the
  * caller.
  */
 void
 ata_cam_begin_transaction(device_t dev, union ccb *ccb)
 {
 	struct ata_channel *ch = device_get_softc(dev);
 	struct ata_request *request;
 
 	if (!(request = ata_alloc_request())) {
 		device_printf(dev, "FAILURE - out of memory in start\n");
 		ccb->ccb_h.status = CAM_REQ_INVALID;
 		xpt_done(ccb);
 		return;
 	}
 	bzero(request, sizeof(*request));
 
 	/* setup request */
 	request->dev = NULL;
 	request->parent = dev;
 	request->unit = ccb->ccb_h.target_id;
 	if (ccb->ccb_h.func_code == XPT_ATA_IO) {
 		request->data = ccb->ataio.data_ptr;
 		request->bytecount = ccb->ataio.dxfer_len;
 		request->u.ata.command = ccb->ataio.cmd.command;
 		/* 16-bit feature/count: the *_exp byte is the high half */
 		request->u.ata.feature = ((uint16_t)ccb->ataio.cmd.features_exp << 8) |
 					  (uint16_t)ccb->ataio.cmd.features;
 		request->u.ata.count = ((uint16_t)ccb->ataio.cmd.sector_count_exp << 8) |
 					(uint16_t)ccb->ataio.cmd.sector_count;
 		if (ccb->ataio.cmd.flags & CAM_ATAIO_48BIT) {
 			/* 48-bit: LBA bits 24-47 come from the *_exp bytes */
 			request->flags |= ATA_R_48BIT;
 			request->u.ata.lba =
 				     ((uint64_t)ccb->ataio.cmd.lba_high_exp << 40) |
 				     ((uint64_t)ccb->ataio.cmd.lba_mid_exp << 32) |
 				     ((uint64_t)ccb->ataio.cmd.lba_low_exp << 24);
 		} else {
 			/* 28-bit: LBA bits 24-27 are the device low nibble */
 			request->u.ata.lba =
 				     ((uint64_t)(ccb->ataio.cmd.device & 0x0f) << 24);
 		}
 		/* LBA bits 0-23 are common to both addressing forms */
 		request->u.ata.lba |= ((uint64_t)ccb->ataio.cmd.lba_high << 16) |
 				      ((uint64_t)ccb->ataio.cmd.lba_mid << 8) |
 				       (uint64_t)ccb->ataio.cmd.lba_low;
 		if (ccb->ataio.cmd.flags & CAM_ATAIO_NEEDRESULT)
 			request->flags |= ATA_R_NEEDRESULT;
 		/* DMA only when there is a data phase and the CCB asks for it */
 		if ((ccb->ccb_h.flags & CAM_DIR_MASK) != CAM_DIR_NONE &&
 		    ccb->ataio.cmd.flags & CAM_ATAIO_DMA)
 			request->flags |= ATA_R_DMA;
 		if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_IN)
 			request->flags |= ATA_R_READ;
 		if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_OUT)
 			request->flags |= ATA_R_WRITE;
 		/*
 		 * READ/WRITE MULTIPLE transfer in chunks of the target's
 		 * current bytecount; everything else uses at most 512 bytes.
 		 */
 		if (ccb->ataio.cmd.command == ATA_READ_MUL ||
 		    ccb->ataio.cmd.command == ATA_READ_MUL48 ||
 		    ccb->ataio.cmd.command == ATA_WRITE_MUL ||
 		    ccb->ataio.cmd.command == ATA_WRITE_MUL48) {
 			request->transfersize = min(request->bytecount,
 			    ch->curr[ccb->ccb_h.target_id].bytecount);
 		} else
 			request->transfersize = min(request->bytecount, 512);
 	} else {
 		request->data = ccb->csio.data_ptr;
 		request->bytecount = ccb->csio.dxfer_len;
 		/* the CDB is stored inline or behind a pointer (CAM_CDB_POINTER) */
 		bcopy((ccb->ccb_h.flags & CAM_CDB_POINTER) ?
 		    ccb->csio.cdb_io.cdb_ptr : ccb->csio.cdb_io.cdb_bytes,
 		    request->u.atapi.ccb, ccb->csio.cdb_len);
 		request->flags |= ATA_R_ATAPI;
 		if (ch->curr[ccb->ccb_h.target_id].atapi == 16)
 			request->flags |= ATA_R_ATAPI16;
 		/* DMA only with a data phase and a DMA mode set on the target */
 		if ((ccb->ccb_h.flags & CAM_DIR_MASK) != CAM_DIR_NONE &&
 		    ch->curr[ccb->ccb_h.target_id].mode >= ATA_DMA)
 			request->flags |= ATA_R_DMA;
 		if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_IN)
 			request->flags |= ATA_R_READ;
 		if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_OUT)
 			request->flags |= ATA_R_WRITE;
 		request->transfersize = min(request->bytecount,
 		    ch->curr[ccb->ccb_h.target_id].bytecount);
 	}
 	request->retries = 0;
 	/* ccb timeout / 1000, rounded up (presumably ms to seconds) */
 	request->timeout = (ccb->ccb_h.timeout + 999) / 1000;
 	callout_init_mtx(&request->callout, &ch->state_mtx, CALLOUT_RETURNUNLOCKED);
 	request->ccb = ccb;
+	request->flags |= ATA_R_DATA_IN_CCB;
 
 	ch->running = request;
 	ch->state = ATA_ACTIVE;
 	if (ch->hw.begin_transaction(request) == ATA_OP_FINISHED) {
 	    /* completed synchronously: release the channel and finish now */
 	    ch->running = NULL;
 	    ch->state = ATA_IDLE;
 	    ata_cam_end_transaction(dev, request);
 	    return;
 	}
 }
 
 /*
  * Recycle a just-completed request to issue an ATAPI REQUEST SENSE for the
  * same CCB.  The sense bytes are read straight into ccb->csio.sense_data.
  * ch->requestsense is raised so that ata_cam_end_transaction() hands the
  * completion of this command to ata_cam_process_sense().
  */
 static void
 ata_cam_request_sense(device_t dev, struct ata_request *request)
 {
 	struct ata_channel *ch = device_get_softc(dev);
 	/* fetch the CCB before the request structure is wiped below */
 	union ccb *ccb = request->ccb;
 	struct ata_cam_device *cur = &ch->curr[ccb->ccb_h.target_id];
 
 	ch->requestsense = 1;
 
 	/* start over from an all-zero request */
 	bzero(request, sizeof(*request));
 
 	/* addressing and bookkeeping */
 	request->ccb = ccb;
 	request->dev = NULL;
 	request->parent = dev;
 	request->unit = ccb->ccb_h.target_id;
 	request->retries = 0;
 	request->timeout = (ccb->ccb_h.timeout + 999) / 1000;
 
 	/* data phase: the CCB's sense buffer */
 	request->data = (void *)&ccb->csio.sense_data;
 	request->bytecount = ccb->csio.sense_len;
 	request->transfersize = min(request->bytecount, cur->bytecount);
 
 	/* packet: REQUEST SENSE with the sense length in byte 4 */
 	request->u.atapi.ccb[0] = ATAPI_REQUEST_SENSE;
 	request->u.atapi.ccb[4] = ccb->csio.sense_len;
 
 	/* always an ATAPI read; packet size and DMA follow current settings */
 	request->flags |= ATA_R_ATAPI | ATA_R_READ;
 	if (cur->atapi == 16)
 		request->flags |= ATA_R_ATAPI16;
 	if (cur->mode >= ATA_DMA)
 		request->flags |= ATA_R_DMA;
 
 	callout_init_mtx(&request->callout, &ch->state_mtx, CALLOUT_RETURNUNLOCKED);
 
 	/* claim the channel and start the command */
 	ch->running = request;
 	ch->state = ATA_ACTIVE;
 	if (ch->hw.begin_transaction(request) != ATA_OP_FINISHED)
 		return;
 
 	/* completed synchronously */
 	ch->running = NULL;
 	ch->state = ATA_IDLE;
 	ata_cam_end_transaction(dev, request);
 }
 
 /*
  * Finish a CCB whose automatic REQUEST SENSE has completed.
  *
  * A clean sense fetch (no timeout, no error status, zero result) adds
  * CAM_AUTOSNS_VALID to the CCB status; anything else replaces the status
  * code with CAM_AUTOSENSE_FAIL.  The request is freed, the CCB is
  * completed, and a timeout additionally triggers ata_reinit().
  */
 static void
 ata_cam_process_sense(device_t dev, struct ata_request *request)
 {
 	struct ata_channel *ch = device_get_softc(dev);
 	union ccb *ccb = request->ccb;
 	/* sample the flag now; the request is freed before it is acted on */
 	int timedout = (request->flags & ATA_R_TIMEOUT) ? 1 : 0;
 	int sense_ok;
 
 	ch->requestsense = 0;
 
 	sense_ok = !timedout &&
 	    !(request->status & ATA_S_ERROR) &&
 	    request->result == 0;
 	if (!sense_ok) {
 		ccb->ccb_h.status &= ~CAM_STATUS_MASK;
 		ccb->ccb_h.status |= CAM_AUTOSENSE_FAIL;
 	} else {
 		ccb->ccb_h.status |= CAM_AUTOSNS_VALID;
 	}
 
 	ata_free_request(request);
 	xpt_done(ccb);
 
 	/* a timed out command requires error recovery on the channel */
 	if (timedout)
 		ata_reinit(dev);
 }
 
 /*
  * Completion path for requests started through CAM.
  *
  * Converts the outcome recorded in the request (timeout flag, status
  * register, result code) into a CAM status on the CCB, freezes the device
  * queue on any non-successful status, copies back the result registers
  * and residual count, and then either completes the CCB or, for a SCSI
  * status error with autosense enabled, reuses the request to fetch sense
  * data.  A timeout also freezes the SIM queue and triggers ata_reinit().
  * If a sense fetch is in flight, everything is delegated to
  * ata_cam_process_sense().
  */
 void
 ata_cam_end_transaction(device_t dev, struct ata_request *request)
 {
 	struct ata_channel *ch = device_get_softc(dev);
 	union ccb *ccb = request->ccb;
 	int is_ata_io, need_reinit;
 
 	/* this completion belongs to an automatic REQUEST SENSE */
 	if (ch->requestsense) {
 		ata_cam_process_sense(dev, request);
 		return;
 	}
 
 	is_ata_io = (ccb->ccb_h.func_code == XPT_ATA_IO);
 	need_reinit = 0;
 
 	/* derive the CAM status code from the request outcome */
 	ccb->ccb_h.status &= ~CAM_STATUS_MASK;
 	if (request->flags & ATA_R_TIMEOUT) {
 		xpt_freeze_simq(ch->sim, 1);
 		ccb->ccb_h.status &= ~CAM_STATUS_MASK;
 		ccb->ccb_h.status |= CAM_CMD_TIMEOUT | CAM_RELEASE_SIMQ;
 		need_reinit = 1;
 	} else if (request->status & ATA_S_ERROR) {
 		if (is_ata_io) {
 			ccb->ccb_h.status |= CAM_ATA_STATUS_ERROR;
 		} else {
 			ccb->ccb_h.status |= CAM_SCSI_STATUS_ERROR;
 			ccb->csio.scsi_status = SCSI_STATUS_CHECK_COND;
 		}
 	} else {
 		switch (request->result) {
 		case 0:
 			ccb->ccb_h.status |= CAM_REQ_CMP;
 			break;
 		case ERESTART:
 			ccb->ccb_h.status |= CAM_REQUEUE_REQ;
 			break;
 		default:
 			ccb->ccb_h.status |= CAM_REQ_CMP_ERR;
 			break;
 		}
 	}
 
 	/* freeze the device queue once for anything but success */
 	if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP &&
 	    (ccb->ccb_h.status & CAM_DEV_QFRZN) == 0) {
 		xpt_freeze_devq(ccb->ccb_h.path, 1);
 		ccb->ccb_h.status |= CAM_DEV_QFRZN;
 	}
 
 	/* hand back the result registers on error or when asked for */
 	if (is_ata_io &&
 	    ((request->status & ATA_S_ERROR) ||
 	    (ccb->ataio.cmd.flags & CAM_ATAIO_NEEDRESULT))) {
 		struct ata_res *res = &ccb->ataio.res;
 		u_int64_t lba = request->u.ata.lba;
 		u_int16_t count = request->u.ata.count;
 
 		res->status = request->status;
 		res->error = request->error;
 		res->lba_low = lba;
 		res->lba_mid = lba >> 8;
 		res->lba_high = lba >> 16;
 		res->device = lba >> 24;
 		res->lba_low_exp = lba >> 24;
 		res->lba_mid_exp = lba >> 32;
 		res->lba_high_exp = lba >> 40;
 		res->sector_count = count;
 		res->sector_count_exp = count >> 8;
 	}
 
 	/* residual = requested length minus what was actually moved */
 	if ((ccb->ccb_h.flags & CAM_DIR_MASK) != CAM_DIR_NONE) {
 		if (is_ata_io)
 			ccb->ataio.resid =
 			    ccb->ataio.dxfer_len - request->donecount;
 		else
 			ccb->csio.resid =
 			    ccb->csio.dxfer_len - request->donecount;
 	}
 
 	/* fetch sense data automatically, or finish the CCB right here */
 	if ((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_SCSI_STATUS_ERROR &&
 	    !(ccb->ccb_h.flags & CAM_DIS_AUTOSENSE)) {
 		ata_cam_request_sense(dev, request);
 	} else {
 		ata_free_request(request);
 		xpt_done(ccb);
 	}
 
 	/* Do error recovery if needed. */
 	if (need_reinit)
 		ata_reinit(dev);
 }
 
 /*
  * Validate the target and LUN addressed by a CCB.
  *
  * Targets 0 and 1 are accepted (only 0 when the channel has ATA_NO_SLAVE
  * set) and the LUN must be 0.  Returns 0 when the address is acceptable.
  * Otherwise the CCB is completed with CAM_TID_INVALID or CAM_LUN_INVALID
  * and -1 is returned, so the caller must not complete it again.
  */
 static int
 ata_check_ids(device_t dev, union ccb *ccb)
 {
 	struct ata_channel *ch = device_get_softc(dev);
 	int max_target = (ch->flags & ATA_NO_SLAVE) ? 0 : 1;
 
 	if (ccb->ccb_h.target_id > max_target)
 		ccb->ccb_h.status = CAM_TID_INVALID;
 	else if (ccb->ccb_h.target_lun != 0)
 		ccb->ccb_h.status = CAM_LUN_INVALID;
 	else
 		return (0);
 
 	/* rejected: finish the CCB with the status chosen above */
 	xpt_done(ccb);
 	return (-1);
 }
 
 /*
  * CAM SIM action entry point for an ATA channel.
  *
  * Dispatches on ccb->ccb_h.func_code.  Cases that leave the switch with
  * "break" have their CCB completed by the xpt_done() at the bottom.
  * Cases that "return" must not reach it: either ata_check_ids() already
  * completed the CCB with an error, or the CCB was handed to
  * ata_cam_begin_transaction(), which completes it later.
  */
 static void
 ataaction(struct cam_sim *sim, union ccb *ccb)
 {
 	device_t dev, parent;
 	struct ata_channel *ch;
 
 	CAM_DEBUG(ccb->ccb_h.path, CAM_DEBUG_TRACE, ("ataaction func_code=%x\n",
 	    ccb->ccb_h.func_code));
 
 	ch = (struct ata_channel *)cam_sim_softc(sim);
 	dev = ch->dev;
 	switch (ccb->ccb_h.func_code) {
 	/* Common cases first */
 	case XPT_ATA_IO:	/* Execute the requested I/O operation */
 	case XPT_SCSI_IO:
 		if (ata_check_ids(dev, ccb))
 			return;
 		/* no ATA or ATAPI device recorded for this target */
 		if ((ch->devices & ((ATA_ATA_MASTER | ATA_ATAPI_MASTER)
 		    << ccb->ccb_h.target_id)) == 0) {
 			ccb->ccb_h.status = CAM_SEL_TIMEOUT;
 			break;
 		}
 		/* only reported; the new request is still started below */
 		if (ch->running)
 			device_printf(dev, "already running!\n");
 		/*
 		 * A control request with the reset bit set is not sent to the
 		 * hardware; a result is synthesized instead.  lba_high/lba_mid
 		 * are 0/0 for an ATA device and 0xeb/0x14 otherwise (the same
 		 * values as ATAPI_MAGIC_MSB/ATAPI_MAGIC_LSB in ata-all.h).
 		 */
 		if (ccb->ccb_h.func_code == XPT_ATA_IO &&
 		    (ccb->ataio.cmd.flags & CAM_ATAIO_CONTROL) &&
 		    (ccb->ataio.cmd.control & ATA_A_RESET)) {
 			struct ata_res *res = &ccb->ataio.res;
 			
 			bzero(res, sizeof(*res));
 			if (ch->devices & (ATA_ATA_MASTER << ccb->ccb_h.target_id)) {
 				res->lba_high = 0;
 				res->lba_mid = 0;
 			} else {
 				res->lba_high = 0xeb;
 				res->lba_mid = 0x14;
 			}
 			ccb->ccb_h.status = CAM_REQ_CMP;
 			break;
 		}
 		/* the CCB is completed from the transaction path, not here */
 		ata_cam_begin_transaction(dev, ccb);
 		return;
 	case XPT_EN_LUN:		/* Enable LUN as a target */
 	case XPT_TARGET_IO:		/* Execute target I/O request */
 	case XPT_ACCEPT_TARGET_IO:	/* Accept Host Target Mode CDB */
 	case XPT_CONT_TARGET_IO:	/* Continue Host Target I/O Connection*/
 	case XPT_ABORT:			/* Abort the specified CCB */
 		/* XXX Implement */
 		ccb->ccb_h.status = CAM_REQ_INVALID;
 		break;
 	case XPT_SET_TRAN_SETTINGS:
 	{
 		struct	ccb_trans_settings *cts = &ccb->cts;
 		struct	ata_cam_device *d; 
 
 		if (ata_check_ids(dev, ccb))
 			return;
 		/* update either the current or the user settings of the target */
 		if (cts->type == CTS_TYPE_CURRENT_SETTINGS)
 			d = &ch->curr[ccb->ccb_h.target_id];
 		else
 			d = &ch->user[ccb->ccb_h.target_id];
 		/* only fields flagged in the matching "valid" mask are applied */
 		if (ch->flags & ATA_SATA) {
 			if (cts->xport_specific.sata.valid & CTS_SATA_VALID_REVISION)
 				d->revision = cts->xport_specific.sata.revision;
 			if (cts->xport_specific.sata.valid & CTS_SATA_VALID_MODE) {
 				/*
 				 * Current settings go through ATA_SETMODE() and
 				 * store whatever it returns; user settings are
 				 * recorded as given.
 				 */
 				if (cts->type == CTS_TYPE_CURRENT_SETTINGS) {
 					d->mode = ATA_SETMODE(ch->dev,
 					    ccb->ccb_h.target_id,
 					    cts->xport_specific.sata.mode);
 				} else
 					d->mode = cts->xport_specific.sata.mode;
 			}
 			/* SATA bytecount is capped at 8192 */
 			if (cts->xport_specific.sata.valid & CTS_SATA_VALID_BYTECOUNT)
 				d->bytecount = min(8192, cts->xport_specific.sata.bytecount);
 			if (cts->xport_specific.sata.valid & CTS_SATA_VALID_ATAPI)
 				d->atapi = cts->xport_specific.sata.atapi;
 			if (cts->xport_specific.sata.valid & CTS_SATA_VALID_CAPS)
 				d->caps = cts->xport_specific.sata.caps;
 		} else {
 			if (cts->xport_specific.ata.valid & CTS_ATA_VALID_MODE) {
 				if (cts->type == CTS_TYPE_CURRENT_SETTINGS) {
 					d->mode = ATA_SETMODE(ch->dev,
 					    ccb->ccb_h.target_id,
 					    cts->xport_specific.ata.mode);
 				} else
 					d->mode = cts->xport_specific.ata.mode;
 			}
 			/* NOTE(review): unlike the SATA branch, no upper bound here */
 			if (cts->xport_specific.ata.valid & CTS_ATA_VALID_BYTECOUNT)
 				d->bytecount = cts->xport_specific.ata.bytecount;
 			if (cts->xport_specific.ata.valid & CTS_ATA_VALID_ATAPI)
 				d->atapi = cts->xport_specific.ata.atapi;
 		}
 		ccb->ccb_h.status = CAM_REQ_CMP;
 		break;
 	}
 	case XPT_GET_TRAN_SETTINGS:
 	{
 		struct	ccb_trans_settings *cts = &ccb->cts;
 		struct  ata_cam_device *d;
 
 		if (ata_check_ids(dev, ccb))
 			return;
 		/* report either the current or the user settings of the target */
 		if (cts->type == CTS_TYPE_CURRENT_SETTINGS)
 			d = &ch->curr[ccb->ccb_h.target_id];
 		else
 			d = &ch->user[ccb->ccb_h.target_id];
 		cts->protocol = PROTO_UNSPECIFIED;
 		cts->protocol_version = PROTO_VERSION_UNSPECIFIED;
 		if (ch->flags & ATA_SATA) {
 			cts->transport = XPORT_SATA;
 			cts->transport_version = XPORT_VERSION_UNSPECIFIED;
 			cts->xport_specific.sata.valid = 0;
 			cts->xport_specific.sata.mode = d->mode;
 			cts->xport_specific.sata.valid |= CTS_SATA_VALID_MODE;
 			cts->xport_specific.sata.bytecount = d->bytecount;
 			cts->xport_specific.sata.valid |= CTS_SATA_VALID_BYTECOUNT;
 			if (cts->type == CTS_TYPE_CURRENT_SETTINGS) {
 				/*
 				 * The current revision is queried with
 				 * ATA_GETREV(); a value of 0xff is not
 				 * marked as valid.
 				 */
 				cts->xport_specific.sata.revision =
 				    ATA_GETREV(dev, ccb->ccb_h.target_id);
 				if (cts->xport_specific.sata.revision != 0xff) {
 					cts->xport_specific.sata.valid |=
 					    CTS_SATA_VALID_REVISION;
 				}
 				/*
 				 * Current caps: the CTS_SATA_CAPS_D bits of the
 				 * stored caps, plus H_PMREQ when pm_level is
 				 * non-zero, all masked by the user caps.
 				 */
 				cts->xport_specific.sata.caps =
 				    d->caps & CTS_SATA_CAPS_D;
 				if (ch->pm_level) {
 					cts->xport_specific.sata.caps |=
 					    CTS_SATA_CAPS_H_PMREQ;
 				}
 				cts->xport_specific.sata.caps &=
 				    ch->user[ccb->ccb_h.target_id].caps;
 				cts->xport_specific.sata.valid |=
 				    CTS_SATA_VALID_CAPS;
 			} else {
 				cts->xport_specific.sata.revision = d->revision;
 				cts->xport_specific.sata.valid |= CTS_SATA_VALID_REVISION;
 				cts->xport_specific.sata.caps = d->caps;
 				cts->xport_specific.sata.valid |= CTS_SATA_VALID_CAPS;
 			}
 			cts->xport_specific.sata.atapi = d->atapi;
 			cts->xport_specific.sata.valid |= CTS_SATA_VALID_ATAPI;
 		} else {
 			cts->transport = XPORT_ATA;
 			cts->transport_version = XPORT_VERSION_UNSPECIFIED;
 			cts->xport_specific.ata.valid = 0;
 			cts->xport_specific.ata.mode = d->mode;
 			cts->xport_specific.ata.valid |= CTS_ATA_VALID_MODE;
 			cts->xport_specific.ata.bytecount = d->bytecount;
 			cts->xport_specific.ata.valid |= CTS_ATA_VALID_BYTECOUNT;
 			cts->xport_specific.ata.atapi = d->atapi;
 			cts->xport_specific.ata.valid |= CTS_ATA_VALID_ATAPI;
 		}
 		ccb->ccb_h.status = CAM_REQ_CMP;
 		break;
 	}
 	case XPT_RESET_BUS:		/* Reset the specified SCSI bus */
 	case XPT_RESET_DEV:	/* Bus Device Reset the specified SCSI device */
 		/* both reset flavours reinitialize the whole channel */
 		ata_reinit(dev);
 		ccb->ccb_h.status = CAM_REQ_CMP;
 		break;
 	case XPT_TERM_IO:		/* Terminate the I/O process */
 		/* XXX Implement */
 		ccb->ccb_h.status = CAM_REQ_INVALID;
 		break;
 	case XPT_PATH_INQ:		/* Path routing inquiry */
 	{
 		struct ccb_pathinq *cpi = &ccb->cpi;
 
 		parent = device_get_parent(dev);
 		cpi->version_num = 1; /* XXX??? */
 		cpi->hba_inquiry = PI_SDTR_ABLE;
 		cpi->target_sprt = 0;
 		cpi->hba_misc = PIM_SEQSCAN;
 		cpi->hba_eng_cnt = 0;
 		/* highest target id: 0 without a slave, otherwise 1 */
 		if (ch->flags & ATA_NO_SLAVE)
 			cpi->max_target = 0;
 		else
 			cpi->max_target = 1;
 		cpi->max_lun = 0;
 		cpi->initiator_id = 0;
 		cpi->bus_id = cam_sim_bus(sim);
 		if (ch->flags & ATA_SATA)
 			cpi->base_transfer_speed = 150000;
 		else
 			cpi->base_transfer_speed = 3300;
 		strncpy(cpi->sim_vid, "FreeBSD", SIM_IDLEN);
 		strncpy(cpi->hba_vid, "ATA", HBA_IDLEN);
 		strncpy(cpi->dev_name, cam_sim_name(sim), DEV_IDLEN);
 		cpi->unit_number = cam_sim_unit(sim);
 		if (ch->flags & ATA_SATA)
 			cpi->transport = XPORT_SATA;
 		else
 			cpi->transport = XPORT_ATA;
 		cpi->transport_version = XPORT_VERSION_UNSPECIFIED;
 		cpi->protocol = PROTO_ATA;
 		cpi->protocol_version = PROTO_VERSION_UNSPECIFIED;
 		/* fall back to DFLTPHYS when the DMA code set no limit */
 		cpi->maxio = ch->dma.max_iosize ? ch->dma.max_iosize : DFLTPHYS;
 		/* PCI ids are filled in only when the grandparent is a pci bus */
 		if (device_get_devclass(device_get_parent(parent)) ==
 		    devclass_find("pci")) {
 			cpi->hba_vendor = pci_get_vendor(parent);
 			cpi->hba_device = pci_get_device(parent);
 			cpi->hba_subvendor = pci_get_subvendor(parent);
 			cpi->hba_subdevice = pci_get_subdevice(parent);
 		}
 		cpi->ccb_h.status = CAM_REQ_CMP;
 		break;
 	}
 	default:
 		ccb->ccb_h.status = CAM_REQ_INVALID;
 		break;
 	}
 	/* common completion for every case that left the switch via break */
 	xpt_done(ccb);
 }
 
 /*
  * CAM SIM poll entry point: run the interrupt handler for the channel
  * behind this SIM.
  */
 static void
 atapoll(struct cam_sim *sim)
 {
 	struct ata_channel *ch;
 
 	ch = (struct ata_channel *)cam_sim_softc(sim);
 	ata_interrupt_locked(ch);
 }
 #endif
 
 /*
  * module handling
  */
 /*
  * Event handler for the "ata" kernel module.
  *
  * MOD_LOAD:   without ATA_CAM, creates the "ata" control device and, when
  *             running during early boot (cold), registers a config
  *             interrupt hook that calls ata_boot_attach.  Returns EIO if
  *             the hook cannot be allocated, otherwise 0.
  * MOD_UNLOAD: without ATA_CAM, destroys the control device.  Returns 0.
  * Any other event returns EOPNOTSUPP.
  *
  * With ATA_CAM defined both events are no-ops that return 0.
  */
 static int
 ata_module_event_handler(module_t mod, int what, void *arg)
 {
 #ifndef ATA_CAM
     static struct cdev *atacdev;
 #endif
 
     switch (what) {
     case MOD_LOAD:
 #ifndef ATA_CAM
 	/* register controlling device */
 	atacdev = make_dev(&ata_cdevsw, 0, UID_ROOT, GID_OPERATOR, 0600, "ata");
 
 	if (cold) {
 	    /* register boot attach to be run when interrupts are enabled */
 	    if (!(ata_delayed_attach = (struct intr_config_hook *)
 				       malloc(sizeof(struct intr_config_hook),
 					      M_TEMP, M_NOWAIT | M_ZERO))) {
 		printf("ata: malloc of delayed attach hook failed\n");
 		return EIO;
 	    }
 	    ata_delayed_attach->ich_func = (void*)ata_boot_attach;
 	    if (config_intrhook_establish(ata_delayed_attach) != 0) {
 		printf("ata: config_intrhook_establish failed\n");
 		free(ata_delayed_attach, M_TEMP);
 		/* the hook is gone; do not keep a pointer to freed memory */
 		ata_delayed_attach = NULL;
 	    }
 	}
 #endif
 	return 0;
 
     case MOD_UNLOAD:
 #ifndef ATA_CAM
 	/* deregister controlling device */
 	destroy_dev(atacdev);
 #endif
 	return 0;
 
     default:
 	return EOPNOTSUPP;
     }
 }
 
 /*
  * Register the "ata" module (version 1) with ata_module_event_handler as
  * its event handler.  When built with ATA_CAM the module also declares a
  * dependency on version 1 of the "cam" module.
  */
 static moduledata_t ata_moduledata = { "ata", ata_module_event_handler, NULL };
 DECLARE_MODULE(ata, ata_moduledata, SI_SUB_CONFIGURE, SI_ORDER_SECOND);
 MODULE_VERSION(ata, 1);
 #ifdef ATA_CAM
 MODULE_DEPEND(ata, cam, 1, 1, 1);
 #endif
 
 /*
  * Create the UMA zones for struct ata_request and struct ata_composite.
  * Registered below to run at SI_SUB_DRIVERS / SI_ORDER_SECOND.
  */
 static void
 ata_init(void)
 {
     /* zone sized for struct ata_request */
     ata_request_zone = uma_zcreate("ata_request",
 	sizeof(struct ata_request),
 	NULL, NULL, NULL, NULL, 0, 0);
 
     /* zone sized for struct ata_composite */
     ata_composite_zone = uma_zcreate("ata_composite",
 	sizeof(struct ata_composite),
 	NULL, NULL, NULL, NULL, 0, 0);
 }
 SYSINIT(ata_register, SI_SUB_DRIVERS, SI_ORDER_SECOND, ata_init, NULL);
 
 /*
  * Counterpart of ata_init(): destroy both UMA zones, composite zone
  * first and request zone second.
  */
 static void
 ata_uninit(void)
 {
     /* tear down in the reverse order of creation */
     uma_zdestroy(ata_composite_zone);
     uma_zdestroy(ata_request_zone);
 }
 SYSUNINIT(ata_unregister, SI_SUB_DRIVERS, SI_ORDER_SECOND, ata_uninit, NULL);
Index: user/attilio/vmobj-rwlock/sys/dev/ata/ata-all.h
===================================================================
--- user/attilio/vmobj-rwlock/sys/dev/ata/ata-all.h	(revision 247191)
+++ user/attilio/vmobj-rwlock/sys/dev/ata/ata-all.h	(revision 247192)
@@ -1,776 +1,777 @@
 /*-
  * Copyright (c) 1998 - 2008 Søren Schmidt <sos@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer,
  *    without modification, immediately at the beginning of the file.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include "opt_ata.h"
 
 #if 0
 #define	ATA_LEGACY_SUPPORT		/* Enable obsolete features that break
 					 * some modern devices */
 #endif
 
 /* ATA register defines */
 #define ATA_DATA                        0       /* (RW) data */
 
 #define ATA_FEATURE                     1       /* (W) feature */
 #define         ATA_F_DMA               0x01    /* enable DMA */
 #define         ATA_F_OVL               0x02    /* enable overlap */
 
 #define ATA_COUNT                       2       /* (W) sector count */
 
 #define ATA_SECTOR                      3       /* (RW) sector # */
 #define ATA_CYL_LSB                     4       /* (RW) cylinder# LSB */
 #define ATA_CYL_MSB                     5       /* (RW) cylinder# MSB */
 #define ATA_DRIVE                       6       /* (W) Sector/Drive/Head */
 #define         ATA_D_LBA               0x40    /* use LBA addressing */
 #define         ATA_D_IBM               0xa0    /* 512 byte sectors, ECC */
 
 #define ATA_COMMAND                     7       /* (W) command */
 
 #define ATA_ERROR                       8       /* (R) error */
 #define         ATA_E_ILI               0x01    /* illegal length */
 #define         ATA_E_NM                0x02    /* no media */
 #define         ATA_E_ABORT             0x04    /* command aborted */
 #define         ATA_E_MCR               0x08    /* media change request */
 #define         ATA_E_IDNF              0x10    /* ID not found */
 #define         ATA_E_MC                0x20    /* media changed */
 #define         ATA_E_UNC               0x40    /* uncorrectable data */
 #define         ATA_E_ICRC              0x80    /* UDMA crc error */
 #define		ATA_E_ATAPI_SENSE_MASK	0xf0	/* ATAPI sense key mask */
 
 #define ATA_IREASON                     9       /* (R) interrupt reason */
 #define         ATA_I_CMD               0x01    /* cmd (1) | data (0) */
 #define         ATA_I_IN                0x02    /* read (1) | write (0) */
 #define         ATA_I_RELEASE           0x04    /* released bus (1) */
 #define         ATA_I_TAGMASK           0xf8    /* tag mask */
 
 #define ATA_STATUS                      10      /* (R) status */
 #define ATA_ALTSTAT                     11      /* (R) alternate status */
 #define         ATA_S_ERROR             0x01    /* error */
 #define         ATA_S_INDEX             0x02    /* index */
 #define         ATA_S_CORR              0x04    /* data corrected */
 #define         ATA_S_DRQ               0x08    /* data request */
 #define         ATA_S_DSC               0x10    /* drive seek completed */
 #define         ATA_S_SERVICE           0x10    /* drive needs service */
 #define         ATA_S_DWF               0x20    /* drive write fault */
 #define         ATA_S_DMA               0x20    /* DMA ready */
 #define         ATA_S_READY             0x40    /* drive ready */
 #define         ATA_S_BUSY              0x80    /* busy */
 
 #define ATA_CONTROL                     12      /* (W) control */
 
 #define ATA_CTLOFFSET                   0x206   /* control register offset */
 #define ATA_PCCARD_CTLOFFSET            0x0e    /* do for PCCARD devices */
 #define ATA_PC98_CTLOFFSET              0x10c   /* do for PC98 devices */
 #define         ATA_A_IDS               0x02    /* disable interrupts */
 #define         ATA_A_RESET             0x04    /* RESET controller */
 #ifdef	ATA_LEGACY_SUPPORT			
 #define         ATA_A_4BIT              0x08    /* 4 head bits: obsolete 1996 */
 #else
 #define         ATA_A_4BIT              0x00 
 #endif
 #define         ATA_A_HOB               0x80    /* High Order Byte enable */
 
 /* SATA register defines */
 #define ATA_SSTATUS                     13
 #define         ATA_SS_DET_MASK         0x0000000f
 #define         ATA_SS_DET_NO_DEVICE    0x00000000
 #define         ATA_SS_DET_DEV_PRESENT  0x00000001
 #define         ATA_SS_DET_PHY_ONLINE   0x00000003
 #define         ATA_SS_DET_PHY_OFFLINE  0x00000004
 
 #define         ATA_SS_SPD_MASK         0x000000f0
 #define         ATA_SS_SPD_NO_SPEED     0x00000000
 #define         ATA_SS_SPD_GEN1         0x00000010
 #define         ATA_SS_SPD_GEN2         0x00000020
 
 #define         ATA_SS_IPM_MASK         0x00000f00
 #define         ATA_SS_IPM_NO_DEVICE    0x00000000
 #define         ATA_SS_IPM_ACTIVE       0x00000100
 #define         ATA_SS_IPM_PARTIAL      0x00000200
 #define         ATA_SS_IPM_SLUMBER      0x00000600
 
 #define ATA_SERROR                      14
 #define         ATA_SE_DATA_CORRECTED   0x00000001
 #define         ATA_SE_COMM_CORRECTED   0x00000002
 #define         ATA_SE_DATA_ERR         0x00000100
 #define         ATA_SE_COMM_ERR         0x00000200
 #define         ATA_SE_PROT_ERR         0x00000400
 #define         ATA_SE_HOST_ERR         0x00000800
 #define         ATA_SE_PHY_CHANGED      0x00010000
 #define         ATA_SE_PHY_IERROR       0x00020000
 #define         ATA_SE_COMM_WAKE        0x00040000
 #define         ATA_SE_DECODE_ERR       0x00080000
 #define         ATA_SE_PARITY_ERR       0x00100000
 #define         ATA_SE_CRC_ERR          0x00200000
 #define         ATA_SE_HANDSHAKE_ERR    0x00400000
 #define         ATA_SE_LINKSEQ_ERR      0x00800000
 #define         ATA_SE_TRANSPORT_ERR    0x01000000
 #define         ATA_SE_UNKNOWN_FIS      0x02000000
 
 #define ATA_SCONTROL                    15
 #define         ATA_SC_DET_MASK         0x0000000f
 #define         ATA_SC_DET_IDLE         0x00000000
 #define         ATA_SC_DET_RESET        0x00000001
 #define         ATA_SC_DET_DISABLE      0x00000004
 
 #define         ATA_SC_SPD_MASK         0x000000f0
 #define         ATA_SC_SPD_NO_SPEED     0x00000000
 #define         ATA_SC_SPD_SPEED_GEN1   0x00000010
 #define         ATA_SC_SPD_SPEED_GEN2   0x00000020
 #define         ATA_SC_SPD_SPEED_GEN3   0x00000040
 
 #define         ATA_SC_IPM_MASK         0x00000f00
 #define         ATA_SC_IPM_NONE         0x00000000
 #define         ATA_SC_IPM_DIS_PARTIAL  0x00000100
 #define         ATA_SC_IPM_DIS_SLUMBER  0x00000200
 
 #define ATA_SACTIVE                     16
 
 /* SATA AHCI v1.0 register defines */
 #define ATA_AHCI_CAP                    0x00
 #define		ATA_AHCI_CAP_NPMASK	0x0000001f
 #define		ATA_AHCI_CAP_SXS	0x00000020
 #define		ATA_AHCI_CAP_EMS	0x00000040
 #define		ATA_AHCI_CAP_CCCS	0x00000080
 #define		ATA_AHCI_CAP_NCS	0x00001F00
 #define		ATA_AHCI_CAP_NCS_SHIFT	8
 #define		ATA_AHCI_CAP_PSC	0x00002000
 #define		ATA_AHCI_CAP_SSC	0x00004000
 #define		ATA_AHCI_CAP_PMD	0x00008000
 #define		ATA_AHCI_CAP_FBSS	0x00010000
 #define		ATA_AHCI_CAP_SPM	0x00020000
 #define		ATA_AHCI_CAP_SAM	0x00080000
 #define		ATA_AHCI_CAP_ISS	0x00F00000
 #define		ATA_AHCI_CAP_ISS_SHIFT	20
 #define		ATA_AHCI_CAP_SCLO	0x01000000
 #define		ATA_AHCI_CAP_SAL	0x02000000
 #define		ATA_AHCI_CAP_SALP	0x04000000
 #define		ATA_AHCI_CAP_SSS	0x08000000
 #define		ATA_AHCI_CAP_SMPS	0x10000000
 #define		ATA_AHCI_CAP_SSNTF	0x20000000
 #define		ATA_AHCI_CAP_SNCQ	0x40000000
 #define		ATA_AHCI_CAP_64BIT	0x80000000
 
 #define ATA_AHCI_GHC                    0x04
 #define         ATA_AHCI_GHC_AE         0x80000000
 #define         ATA_AHCI_GHC_IE         0x00000002
 #define         ATA_AHCI_GHC_HR         0x00000001
 
 #define ATA_AHCI_IS                     0x08
 #define ATA_AHCI_PI                     0x0c
 #define ATA_AHCI_VS                     0x10
 
 #define ATA_AHCI_OFFSET                 0x80
 
 #define ATA_AHCI_P_CLB                  0x100
 #define ATA_AHCI_P_CLBU                 0x104
 #define ATA_AHCI_P_FB                   0x108
 #define ATA_AHCI_P_FBU                  0x10c
 #define ATA_AHCI_P_IS                   0x110
 #define ATA_AHCI_P_IE                   0x114
 #define         ATA_AHCI_P_IX_DHR       0x00000001
 #define         ATA_AHCI_P_IX_PS        0x00000002
 #define         ATA_AHCI_P_IX_DS        0x00000004
 #define         ATA_AHCI_P_IX_SDB       0x00000008
 #define         ATA_AHCI_P_IX_UF        0x00000010
 #define         ATA_AHCI_P_IX_DP        0x00000020
 #define         ATA_AHCI_P_IX_PC        0x00000040
 #define         ATA_AHCI_P_IX_DI        0x00000080
 
 #define         ATA_AHCI_P_IX_PRC       0x00400000
 #define         ATA_AHCI_P_IX_IPM       0x00800000
 #define         ATA_AHCI_P_IX_OF        0x01000000
 #define         ATA_AHCI_P_IX_INF       0x04000000
 #define         ATA_AHCI_P_IX_IF        0x08000000
 #define         ATA_AHCI_P_IX_HBD       0x10000000
 #define         ATA_AHCI_P_IX_HBF       0x20000000
 #define         ATA_AHCI_P_IX_TFE       0x40000000
 #define         ATA_AHCI_P_IX_CPD       0x80000000
 
 #define ATA_AHCI_P_CMD                  0x118
 #define         ATA_AHCI_P_CMD_ST       0x00000001
 #define         ATA_AHCI_P_CMD_SUD      0x00000002
 #define         ATA_AHCI_P_CMD_POD      0x00000004
 #define         ATA_AHCI_P_CMD_CLO      0x00000008
 #define         ATA_AHCI_P_CMD_FRE      0x00000010
 #define         ATA_AHCI_P_CMD_CCS_MASK 0x00001f00
 #define         ATA_AHCI_P_CMD_ISS      0x00002000
 #define         ATA_AHCI_P_CMD_FR       0x00004000
 #define         ATA_AHCI_P_CMD_CR       0x00008000
 #define         ATA_AHCI_P_CMD_CPS      0x00010000
 #define         ATA_AHCI_P_CMD_PMA      0x00020000
 #define         ATA_AHCI_P_CMD_HPCP     0x00040000
 #define         ATA_AHCI_P_CMD_ISP      0x00080000
 #define         ATA_AHCI_P_CMD_CPD      0x00100000
 #define         ATA_AHCI_P_CMD_ATAPI    0x01000000
 #define         ATA_AHCI_P_CMD_DLAE     0x02000000
 #define         ATA_AHCI_P_CMD_ALPE     0x04000000
 #define         ATA_AHCI_P_CMD_ASP      0x08000000
 #define         ATA_AHCI_P_CMD_ICC_MASK 0xf0000000
 #define         ATA_AHCI_P_CMD_NOOP     0x00000000
 #define         ATA_AHCI_P_CMD_ACTIVE   0x10000000
 #define         ATA_AHCI_P_CMD_PARTIAL  0x20000000
 #define         ATA_AHCI_P_CMD_SLUMBER  0x60000000
 
 #define ATA_AHCI_P_TFD                  0x120
 #define ATA_AHCI_P_SIG                  0x124
 #define ATA_AHCI_P_SSTS                 0x128
 #define ATA_AHCI_P_SCTL                 0x12c
 #define ATA_AHCI_P_SERR                 0x130
 #define ATA_AHCI_P_SACT                 0x134
 #define ATA_AHCI_P_CI                   0x138
 #define ATA_AHCI_P_SNTF                 0x13C
 #define ATA_AHCI_P_FBS                  0x140
 
 /*
  * Sizes and offsets describing an AHCI work area.  The FB offset follows
  * 32 command list slots of ATA_AHCI_CL_SIZE bytes each (1024 bytes), and
  * the CT offset lies a further 4096 bytes on (5120).
  * NOTE(review): presumably CL/FB/CT stand for command list, received FIS
  * buffer and command table(s) -- confirm against the AHCI code.
  */
 #define ATA_AHCI_CL_SIZE                32
 #define ATA_AHCI_CL_OFFSET              0
 #define ATA_AHCI_FB_OFFSET              (ATA_AHCI_CL_SIZE * 32)
 #define ATA_AHCI_CT_OFFSET              (ATA_AHCI_FB_OFFSET + 4096)
 #define ATA_AHCI_CT_SIZE                (2176 + 128)
 
 /*
  * One AHCI PRD entry, laid out without padding (__packed): a 64-bit
  * field, a reserved 32-bit word and a 32-bit count/flags word.
  * NOTE(review): dba/dbc are presumably the data base address and data
  * byte count of the AHCI specification -- confirm.
  */
 struct ata_ahci_dma_prd {
     u_int64_t                   dba;
     u_int32_t                   reserved;
     u_int32_t                   dbc;            /* 0 based */
 /* low 22 bits of dbc hold the count; bit 31 is the IPC flag */
 #define ATA_AHCI_PRD_MASK       0x003fffff      /* max 4MB */
 #define ATA_AHCI_PRD_IPC        (1<<31)
 } __packed;
 
 /*
  * AHCI command table, laid out without padding (__packed): a 64-byte
  * cfis area, a 32-byte acmd area, 32 reserved bytes, and then an array
  * of ATA_AHCI_DMA_ENTRIES (129) PRD entries.
  * NOTE(review): cfis/acmd are presumably the command FIS and the ATAPI
  * command bytes -- confirm against the AHCI code.
  */
 struct ata_ahci_cmd_tab {
     u_int8_t                    cfis[64];
     u_int8_t                    acmd[32];
     u_int8_t                    reserved[32];
 #define ATA_AHCI_DMA_ENTRIES            129
     struct ata_ahci_dma_prd     prd_tab[ATA_AHCI_DMA_ENTRIES];
 } __packed;
 
 /*
  * One AHCI command list entry, laid out without padding (__packed).  Its
  * 16 bytes match ATA_AHCI_CL_SIZE / 2; NOTE(review): the remaining half
  * of each 32-byte slot is not described by this structure -- confirm how
  * the AHCI code strides through the list.
  */
 struct ata_ahci_cmd_list {
     u_int16_t                   cmd_flags;
 /* bit values for cmd_flags */
 #define ATA_AHCI_CMD_ATAPI		0x0020
 #define ATA_AHCI_CMD_WRITE		0x0040
 #define ATA_AHCI_CMD_PREFETCH		0x0080
 #define ATA_AHCI_CMD_RESET		0x0100
 #define ATA_AHCI_CMD_BIST		0x0200
 #define ATA_AHCI_CMD_CLR_BUSY		0x0400
 
     u_int16_t                   prd_length;     /* PRD entries */
     u_int32_t                   bytecount;
     u_int64_t                   cmd_table_phys; /* 128byte aligned */
 } __packed;
 
 
 /* DMA register defines */
 #define ATA_DMA_ENTRIES                 256
 #define ATA_DMA_EOT                     0x80000000
 
 #define ATA_BMCMD_PORT                  17
 #define         ATA_BMCMD_START_STOP    0x01
 #define         ATA_BMCMD_WRITE_READ    0x08
 
 #define ATA_BMDEVSPEC_0                 18
 #define ATA_BMSTAT_PORT                 19
 #define         ATA_BMSTAT_ACTIVE       0x01
 #define         ATA_BMSTAT_ERROR        0x02
 #define         ATA_BMSTAT_INTERRUPT    0x04
 #define         ATA_BMSTAT_MASK         0x07
 #define         ATA_BMSTAT_DMA_MASTER   0x20
 #define         ATA_BMSTAT_DMA_SLAVE    0x40
 #define         ATA_BMSTAT_DMA_SIMPLEX  0x80
 
 #define ATA_BMDEVSPEC_1                 20
 #define ATA_BMDTP_PORT                  21
 
 #define ATA_IDX_ADDR                    22
 #define ATA_IDX_DATA                    23
 #define ATA_MAX_RES                     24
 
 /* misc defines */
 #define ATA_PRIMARY                     0x1f0
 #define ATA_SECONDARY                   0x170
 #define ATA_PC98_BANK                   0x432
 #define ATA_IOSIZE                      0x08
 #define ATA_PC98_IOSIZE                 0x10
 #define ATA_CTLIOSIZE                   0x01
 #define ATA_BMIOSIZE                    0x08
 #define ATA_PC98_BANKIOSIZE             0x01
 #define ATA_IOADDR_RID                  0
 #define ATA_CTLADDR_RID                 1
 #define ATA_BMADDR_RID                  0x20
 #define ATA_PC98_CTLADDR_RID            8
 #define ATA_PC98_BANKADDR_RID           9
 #define ATA_IRQ_RID                     0
 #define ATA_DEV(unit)                   ((unit > 0) ? 0x10 : 0)
 #define ATA_CFA_MAGIC1                  0x844A
 #define ATA_CFA_MAGIC2                  0x848A
 #define ATA_CFA_MAGIC3                  0x8400
 #define ATAPI_MAGIC_LSB                 0x14
 #define ATAPI_MAGIC_MSB                 0xeb
 #define ATAPI_P_READ                    (ATA_S_DRQ | ATA_I_IN)
 #define ATAPI_P_WRITE                   (ATA_S_DRQ)
 #define ATAPI_P_CMDOUT                  (ATA_S_DRQ | ATA_I_CMD)
 #define ATAPI_P_DONEDRQ                 (ATA_S_DRQ | ATA_I_CMD | ATA_I_IN)
 #define ATAPI_P_DONE                    (ATA_I_CMD | ATA_I_IN)
 #define ATAPI_P_ABORT                   0
 #define ATA_INTR_FLAGS                  (INTR_MPSAFE|INTR_TYPE_BIO|INTR_ENTROPY)
 #define ATA_OP_CONTINUES                0
 #define ATA_OP_FINISHED                 1
 #define ATA_MAX_28BIT_LBA               268435455UL
 
 #ifndef	ATA_REQUEST_TIMEOUT
 #define	ATA_REQUEST_TIMEOUT		10
 #endif
 
 /*
  * structure used for composite atomic operations
  *
  * The rd_ and wr_ members are 32-bit masks, which is why at most
  * MAX_COMPOSITES (32) requests can be tracked in request[].
  * NOTE(review): presumably one bit per subdisk, indexed like request[]
  * -- confirm against the users of this structure.
  */
 #define MAX_COMPOSITES          32              /* u_int32_t bits */
 struct ata_composite {
     struct mtx          lock;                   /* control lock */
     u_int32_t           rd_needed;              /* needed read subdisks */
     u_int32_t           rd_done;                /* done read subdisks */
     u_int32_t           wr_needed;              /* needed write subdisks */
     u_int32_t           wr_depend;              /* write depends on subdisks */
     u_int32_t           wr_done;                /* done write subdisks */
     struct ata_request  *request[MAX_COMPOSITES];
     u_int32_t           residual;               /* bytes still to transfer */
     caddr_t             data_1;     
     caddr_t             data_2;     
 };
 
 /* structure used to queue an ATA/ATAPI request */
 struct ata_request {
     device_t                    dev;            /* device handle */
     device_t                    parent;         /* channel handle */
     int				unit;		/* physical unit */
     union {
 	struct {
 	    u_int8_t            command;        /* command reg */
 	    u_int16_t           feature;        /* feature reg */
 	    u_int16_t           count;          /* count reg */
 	    u_int64_t           lba;            /* lba reg */
 	} ata;
 	struct {
 	    u_int8_t            ccb[16];        /* ATAPI command block */
 	    struct atapi_sense  sense;          /* ATAPI request sense data */
 	    u_int8_t            saved_cmd;      /* ATAPI saved command */
 	} atapi;
     } u;
     u_int32_t                   bytecount;      /* bytes to transfer */
     u_int32_t                   transfersize;   /* bytes pr transfer */
     caddr_t                     data;           /* pointer to data buf */
     u_int32_t                   tag;            /* HW tag of this request */
     int                         flags;
 #define         ATA_R_CONTROL           0x00000001
 #define         ATA_R_READ              0x00000002
 #define         ATA_R_WRITE             0x00000004
 #define         ATA_R_ATAPI             0x00000008
 #define         ATA_R_DMA               0x00000010
 #define         ATA_R_QUIET             0x00000020
 #define         ATA_R_TIMEOUT           0x00000040
 #define         ATA_R_48BIT             0x00000080
 
 #define         ATA_R_ORDERED           0x00000100
 #define         ATA_R_AT_HEAD           0x00000200
 #define         ATA_R_REQUEUE           0x00000400
 #define         ATA_R_THREAD            0x00000800
 #define         ATA_R_DIRECT            0x00001000
 #define         ATA_R_NEEDRESULT        0x00002000
+#define         ATA_R_DATA_IN_CCB       0x00004000
 
 #define         ATA_R_ATAPI16           0x00010000
 #define         ATA_R_ATAPI_INTR        0x00020000
 
 #define         ATA_R_DEBUG             0x10000000
 #define         ATA_R_DANGER1           0x20000000
 #define         ATA_R_DANGER2           0x40000000
 
     struct ata_dmaslot          *dma;           /* DMA slot of this request */
     u_int8_t                    status;         /* ATA status */
     u_int8_t                    error;          /* ATA error */
     u_int32_t                   donecount;      /* bytes transferred */
     int                         result;         /* result error code */
     void                        (*callback)(struct ata_request *request);
     struct sema                 done;           /* request done sema */
     int                         retries;        /* retry count */
     int                         timeout;        /* timeout for this cmd */
     struct callout              callout;        /* callout management */
     struct task                 task;           /* task management */
     struct bio                  *bio;           /* bio for this request */
     int                         this;           /* this request ID */
     struct ata_composite        *composite;     /* for composite atomic ops */
     void                        *driver;        /* driver specific */
     TAILQ_ENTRY(ata_request)    chain;          /* list management */
 #ifdef ATA_CAM
     union ccb			*ccb;
 #endif
 };
 
 /* define this for debugging request processing */
 #if 0
 #define ATA_DEBUG_RQ(request, string) \
     { \
     if (request->flags & ATA_R_DEBUG) \
 	device_printf(request->parent, "req=%p %s " string "\n", \
 		      request, ata_cmd2str(request)); \
     }
 #else
 #define ATA_DEBUG_RQ(request, string)
 #endif
 
 
 /* structure describing an ATA/ATAPI device */
 struct ata_device {
     device_t                    dev;            /* device handle */
     int                         unit;           /* physical unit */
 #define         ATA_MASTER              0x00
 #define         ATA_SLAVE               0x01
 #define         ATA_PM                  0x0f
 
     struct ata_params           param;          /* ata param structure */
     int                         mode;           /* current transfermode */
     u_int32_t                   max_iosize;     /* max IO size */
     int				spindown;	/* idle spindown timeout */
     struct callout              spindown_timer;
     int                         spindown_state;
     int                         flags;
 #define         ATA_D_USE_CHS           0x0001
 #define         ATA_D_MEDIA_CHANGED     0x0002
 #define         ATA_D_ENC_PRESENT       0x0004
 };
 
 /*
  * structure for holding DMA Physical Region Descriptors (PRD) entries
  *
  * ata_dmasetprd() in ata-dma.c writes one entry per DMA segment, storing
  * both members with htole32() and OR-ing ATA_DMA_EOT into the count of the
  * final entry.
  */
 struct ata_dma_prdentry {
     u_int32_t addr;             /* segment bus address (ds_addr) */
     u_int32_t count;            /* segment length (ds_len), plus EOT flag on last entry */
 };  
 
 /*
  * structure used by the setprd function
  *
  * Passed as the callback argument of the data map load in ata_dmaload();
  * the setprd callback reports its outcome back through it.
  */
 struct ata_dmasetprd_args {
     void *dmatab;               /* in: table the callback fills with entries */
     int nsegs;                  /* out: segment count, set only when error is 0 */
     int error;                  /* out: error value handed to the callback */
 };
 
 /*
  * per-slot DMA state: a scatter/gather table with its own tag and map, plus
  * the tag and map used to load request data.  ata_dmaalloc() in ata-dma.c
  * builds the members and ata_dmafree() releases them; ata_dmaload() hands
  * slot[0] to a request through request->dma.
  */
 struct ata_dmaslot {
     u_int8_t                    status;         /* DMA status */
     bus_dma_tag_t               sg_tag;         /* SG list DMA tag */
     bus_dmamap_t                sg_map;         /* SG list DMA map */
     void                        *sg;            /* DMA transfer table */
     bus_addr_t                  sg_bus;         /* bus address of dmatab */
     bus_dma_tag_t               data_tag;       /* data DMA tag */
     bus_dmamap_t                data_map;       /* data DMA map */
 };
 
 /*
  * structure holding DMA related information
  *
  * ata_dmainit() in ata-dma.c gives a default to every limit left at 0
  * (alignment 2, boundary and segsize 65536, max_address
  * BUS_SPACE_MAXADDR_32BIT, dma_slots 1) and to every handler left NULL
  * among alloc, free, setprd, load and unload.
  */
 struct ata_dma {
     bus_dma_tag_t               dmatag;         /* parent DMA tag */
     bus_dma_tag_t               work_tag;       /* workspace DMA tag */
     bus_dmamap_t                work_map;       /* workspace DMA map */
     u_int8_t                    *work;          /* workspace */
     bus_addr_t                  work_bus;       /* bus address of workspace */
 
 #define ATA_DMA_SLOTS			1
     int				dma_slots;	/* DMA slots allocated */
     struct ata_dmaslot		slot[ATA_DMA_SLOTS];
     u_int32_t                   alignment;      /* DMA SG list alignment */
     u_int32_t                   boundary;       /* DMA SG list boundary */
     u_int32_t                   segsize;        /* DMA SG list segment size */
     u_int32_t                   max_iosize;     /* DMA data max IO size */
     u_int64_t                   max_address;    /* highest DMA'able address */
     int                         flags;
 #define ATA_DMA_ACTIVE                  0x01    /* DMA transfer in progress */
 
     /* handlers; the first five get generic defaults in ata_dmainit() */
     void (*alloc)(device_t dev);
     void (*free)(device_t dev);
     void (*setprd)(void *xsc, bus_dma_segment_t *segs, int nsegs, int error);
     int (*load)(struct ata_request *request, void *addr, int *nsegs);
     int (*unload)(struct ata_request *request);
     /* no default is installed by ata_dmainit() for the three below */
     int (*start)(struct ata_request *request);
     int (*stop)(struct ata_request *request);
     void (*reset)(device_t dev);
 };
 
 /* structure holding lowlevel functions */
 struct ata_lowlevel {
     u_int32_t (*softreset)(device_t dev, int pmport);
     int (*pm_read)(device_t dev, int port, int reg, u_int32_t *result);
     int (*pm_write)(device_t dev, int port, int reg, u_int32_t value);
     int (*status)(device_t dev);
     int (*begin_transaction)(struct ata_request *request);
     int (*end_transaction)(struct ata_request *request);
     int (*command)(struct ata_request *request);
     void (*tf_read)(struct ata_request *request);
     void (*tf_write)(struct ata_request *request);
 };
 
 /*
  * structure holding resources for an ATA channel
  *
  * The ATA_IDX_* macros pass the res/offset pair of a channel's r_io[] entry
  * to the bus read/write accessors.
  */
 struct ata_resource {
     struct resource             *res;           /* resource handed to the bus accessor */
     int                         offset;         /* offset handed to the bus accessor */
 };
 
 #ifdef ATA_CAM
 struct ata_cam_device {
 	u_int			revision;
 	int			mode;
 	u_int			bytecount;
 	u_int			atapi;
 	u_int			caps;
 };
 #endif
 
 /* structure describing an ATA channel */
 struct ata_channel {
     device_t                    dev;            /* device handle */
     int                         unit;           /* physical channel */
     int                         attached;       /* channel is attached */
     struct ata_resource         r_io[ATA_MAX_RES];/* I/O resources */
     struct resource             *r_irq;         /* interrupt of this channel */
     void                        *ih;            /* interrupt handle */
     struct ata_lowlevel         hw;             /* lowlevel HW functions */
     struct ata_dma              dma;            /* DMA data / functions */
     int                         flags;          /* channel flags */
 #define         ATA_NO_SLAVE            0x01
 #define         ATA_USE_16BIT           0x02
 #define         ATA_ATAPI_DMA_RO        0x04
 #define         ATA_NO_48BIT_DMA        0x08
 #define         ATA_ALWAYS_DMASTAT      0x10
 #define         ATA_CHECKS_CABLE	0x20
 #define         ATA_NO_ATAPI_DMA	0x40
 #define         ATA_SATA		0x80
 #define         ATA_DMA_BEFORE_CMD	0x100
 #define         ATA_KNOWN_PRESENCE	0x200
 #define         ATA_STATUS_IS_LONG	0x400
 #define         ATA_PERIODIC_POLL	0x800
 
     int				pm_level;	/* power management level */
     int                         devices;        /* what is present */
 #define         ATA_ATA_MASTER          0x00000001
 #define         ATA_ATA_SLAVE           0x00000002
 #define         ATA_PORTMULTIPLIER      0x00008000
 #define         ATA_ATAPI_MASTER        0x00010000
 #define         ATA_ATAPI_SLAVE         0x00020000
 
     struct mtx                  state_mtx;      /* state lock */
     int                         state;          /* ATA channel state */
 #define         ATA_IDLE                0x0000
 #define         ATA_ACTIVE              0x0001
 #define         ATA_STALL_QUEUE         0x0002
 
     struct mtx                  queue_mtx;      /* queue lock */
     TAILQ_HEAD(, ata_request)   ata_queue;      /* head of ATA queue */
     struct ata_request          *freezepoint;   /* composite freezepoint */
     struct ata_request          *running;       /* currently running request */
     struct task			conntask;	/* PHY events handling task */
 #ifdef ATA_CAM
 	struct cam_sim		*sim;
 	struct cam_path		*path;
 	struct ata_cam_device	user[16];       /* User-specified settings */
 	struct ata_cam_device	curr[16];       /* Current settings */
 	int			requestsense;	/* CCB waiting for SENSE. */
 #endif
 	struct callout		poll_callout;	/* Periodic status poll. */
 };
 
 /* disk bay/enclosure related */
 #define         ATA_LED_OFF             0x00
 #define         ATA_LED_RED             0x01
 #define         ATA_LED_GREEN           0x02
 #define         ATA_LED_ORANGE          0x03
 #define         ATA_LED_MASK            0x03
 
 /* externs */
 extern int (*ata_raid_ioctl_func)(u_long cmd, caddr_t data);
 extern struct intr_config_hook *ata_delayed_attach;
 extern devclass_t ata_devclass;
 extern int ata_wc;
 extern int ata_setmax;
 extern int ata_dma_check_80pin;
 
 /* public prototypes */
 /* ata-all.c: */
 int ata_probe(device_t dev);
 int ata_attach(device_t dev);
 int ata_detach(device_t dev);
 int ata_reinit(device_t dev);
 int ata_suspend(device_t dev);
 int ata_resume(device_t dev);
 void ata_interrupt(void *data);
 int ata_device_ioctl(device_t dev, u_long cmd, caddr_t data);
 int ata_getparam(struct ata_device *atadev, int init);
 int ata_identify(device_t dev);
 void ata_default_registers(device_t dev);
 void ata_modify_if_48bit(struct ata_request *request);
 void ata_udelay(int interval);
 const char *ata_unit2str(struct ata_device *atadev);
 const char *ata_mode2str(int mode);
 int ata_str2mode(const char *str);
 const char *ata_satarev2str(int rev);
 int ata_atapi(device_t dev, int target);
 int ata_pmode(struct ata_params *ap);
 int ata_wmode(struct ata_params *ap);
 int ata_umode(struct ata_params *ap);
 int ata_limit_mode(device_t dev, int mode, int maxmode);
 void ata_setmode(device_t dev);
 void ata_print_cable(device_t dev, u_int8_t *who);
 int ata_check_80pin(device_t dev, int mode);
 #ifdef ATA_CAM
 void ata_cam_begin_transaction(device_t dev, union ccb *ccb);
 void ata_cam_end_transaction(device_t dev, struct ata_request *request);
 #endif
 
 /* ata-queue.c: */
 int ata_controlcmd(device_t dev, u_int8_t command, u_int16_t feature, u_int64_t lba, u_int16_t count);
 int ata_atapicmd(device_t dev, u_int8_t *ccb, caddr_t data, int count, int flags, int timeout);
 void ata_queue_request(struct ata_request *request);
 void ata_start(device_t dev);
 void ata_finish(struct ata_request *request);
 void ata_timeout(struct ata_request *);
 void ata_catch_inflight(device_t dev);
 void ata_fail_requests(device_t dev);
 void ata_drop_requests(device_t dev);
 const char *ata_cmd2str(struct ata_request *request);
 
 /* ata-lowlevel.c: */
 void ata_generic_hw(device_t dev);
 int ata_begin_transaction(struct ata_request *);
 int ata_end_transaction(struct ata_request *);
 void ata_generic_reset(device_t dev);
 int ata_generic_command(struct ata_request *request);
 
 /* ata-dma.c: */
 void ata_dmainit(device_t);
 void ata_dmafini(device_t dev);
 
 /* ata-sata.c: */
 void ata_sata_phy_check_events(device_t dev, int port);
 int ata_sata_scr_read(struct ata_channel *ch, int port, int reg, uint32_t *val);
 int ata_sata_scr_write(struct ata_channel *ch, int port, int reg, uint32_t val);
 int ata_sata_phy_reset(device_t dev, int port, int quick);
 int ata_sata_setmode(device_t dev, int target, int mode);
 int ata_sata_getrev(device_t dev, int target);
 int ata_request2fis_h2d(struct ata_request *request, u_int8_t *fis);
 void ata_pm_identify(device_t dev);
 
 /*
  * macros for alloc/free of struct ata_request
  *
  * ata_alloc_request() returns a zeroed request from ata_request_zone without
  * sleeping (M_NOWAIT | M_ZERO).  ata_free_request() returns the request to the
  * zone unless ATA_R_DANGER2 is set in its flags.  The argument is
  * parenthesized so any pointer expression may be passed; the brace form is
  * kept as it was so existing call sites expand exactly as before.
  */
 extern uma_zone_t ata_request_zone;
 #define ata_alloc_request() uma_zalloc(ata_request_zone, M_NOWAIT | M_ZERO)
 #define ata_free_request(request) { \
 	if (!((request)->flags & ATA_R_DANGER2)) \
 	    uma_zfree(ata_request_zone, (request)); \
 	}
 
 /* macros for alloc/free of struct ata_composite */
 extern uma_zone_t ata_composite_zone;
 #define ata_alloc_composite() uma_zalloc(ata_composite_zone, M_NOWAIT | M_ZERO)
 #define ata_free_composite(composite) uma_zfree(ata_composite_zone, composite)
 
 MALLOC_DECLARE(M_ATA);
 
 /* misc newbus defines */
 #define GRANDPARENT(dev)        device_get_parent(device_get_parent(dev))
 
 /* macros to hide busspace ugliness */
 #define ATA_INB(res, offset) \
 	bus_read_1((res), (offset))
 
 #define ATA_INW(res, offset) \
 	bus_read_2((res), (offset))
 #define ATA_INW_STRM(res, offset) \
 	bus_read_stream_2((res), (offset))
 #define ATA_INL(res, offset) \
 	bus_read_4((res), (offset))
 #define ATA_INSW(res, offset, addr, count) \
 	bus_read_multi_2((res), (offset), (addr), (count))
 #define ATA_INSW_STRM(res, offset, addr, count) \
 	bus_read_multi_stream_2((res), (offset), (addr), (count))
 #define ATA_INSL(res, offset, addr, count) \
 	bus_read_multi_4((res), (offset), (addr), (count))
 #define ATA_INSL_STRM(res, offset, addr, count) \
 	bus_read_multi_stream_4((res), (offset), (addr), (count))
 #define ATA_OUTB(res, offset, value) \
 	bus_write_1((res), (offset), (value))
 #define ATA_OUTW(res, offset, value) \
 	bus_write_2((res), (offset), (value))
 #define ATA_OUTW_STRM(res, offset, value) \
 	bus_write_stream_2((res), (offset), (value))
 #define ATA_OUTL(res, offset, value) \
 	bus_write_4((res), (offset), (value))
 #define ATA_OUTSW(res, offset, addr, count) \
 	bus_write_multi_2((res), (offset), (addr), (count))
 #define ATA_OUTSW_STRM(res, offset, addr, count) \
 	bus_write_multi_stream_2((res), (offset), (addr), (count))
 #define ATA_OUTSL(res, offset, addr, count) \
 	bus_write_multi_4((res), (offset), (addr), (count))
 #define ATA_OUTSL_STRM(res, offset, addr, count) \
 	bus_write_multi_stream_4((res), (offset), (addr), (count))
 
 #define ATA_IDX_INB(ch, idx) \
 	ATA_INB(ch->r_io[idx].res, ch->r_io[idx].offset)
 
 #define ATA_IDX_INW(ch, idx) \
 	ATA_INW(ch->r_io[idx].res, ch->r_io[idx].offset)
 
 #define ATA_IDX_INW_STRM(ch, idx) \
 	ATA_INW_STRM(ch->r_io[idx].res, ch->r_io[idx].offset)
 
 #define ATA_IDX_INL(ch, idx) \
 	ATA_INL(ch->r_io[idx].res, ch->r_io[idx].offset)
 
 #define ATA_IDX_INSW(ch, idx, addr, count) \
 	ATA_INSW(ch->r_io[idx].res, ch->r_io[idx].offset, addr, count)
 
 #define ATA_IDX_INSW_STRM(ch, idx, addr, count) \
 	ATA_INSW_STRM(ch->r_io[idx].res, ch->r_io[idx].offset, addr, count)
 
 #define ATA_IDX_INSL(ch, idx, addr, count) \
 	ATA_INSL(ch->r_io[idx].res, ch->r_io[idx].offset, addr, count)
 
 #define ATA_IDX_INSL_STRM(ch, idx, addr, count) \
 	ATA_INSL_STRM(ch->r_io[idx].res, ch->r_io[idx].offset, addr, count)
 
 #define ATA_IDX_OUTB(ch, idx, value) \
 	ATA_OUTB(ch->r_io[idx].res, ch->r_io[idx].offset, value)
 
 #define ATA_IDX_OUTW(ch, idx, value) \
 	ATA_OUTW(ch->r_io[idx].res, ch->r_io[idx].offset, value)
 
 #define ATA_IDX_OUTW_STRM(ch, idx, value) \
 	ATA_OUTW_STRM(ch->r_io[idx].res, ch->r_io[idx].offset, value)
 
 #define ATA_IDX_OUTL(ch, idx, value) \
 	ATA_OUTL(ch->r_io[idx].res, ch->r_io[idx].offset, value)
 
 #define ATA_IDX_OUTSW(ch, idx, addr, count) \
 	ATA_OUTSW(ch->r_io[idx].res, ch->r_io[idx].offset, addr, count)
 
 #define ATA_IDX_OUTSW_STRM(ch, idx, addr, count) \
 	ATA_OUTSW_STRM(ch->r_io[idx].res, ch->r_io[idx].offset, addr, count)
 
 #define ATA_IDX_OUTSL(ch, idx, addr, count) \
 	ATA_OUTSL(ch->r_io[idx].res, ch->r_io[idx].offset, addr, count)
 
 #define ATA_IDX_OUTSL_STRM(ch, idx, addr, count) \
 	ATA_OUTSL_STRM(ch->r_io[idx].res, ch->r_io[idx].offset, addr, count)
Index: user/attilio/vmobj-rwlock/sys/dev/ata/ata-dma.c
===================================================================
--- user/attilio/vmobj-rwlock/sys/dev/ata/ata-dma.c	(revision 247191)
+++ user/attilio/vmobj-rwlock/sys/dev/ata/ata-dma.c	(revision 247192)
@@ -1,353 +1,353 @@
 /*-
  * Copyright (c) 1998 - 2008 Søren Schmidt <sos@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer,
  *    without modification, immediately at the beginning of the file.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/ata.h>
 #include <sys/kernel.h>
 #include <sys/endian.h>
 #include <sys/malloc.h> 
 #include <sys/lock.h>
 #include <sys/sema.h>
 #include <sys/taskqueue.h>
 #include <vm/uma.h>
 #include <sys/bus.h>
 #include <machine/bus.h>
 #include <sys/rman.h>
 #include <dev/ata/ata-all.h>
 
 /* prototypes */
 static void ata_dmasetupc_cb(void *xsc, bus_dma_segment_t *segs, int nsegs, int error);
 static void ata_dmaalloc(device_t dev);
 static void ata_dmafree(device_t dev);
 static void ata_dmasetprd(void *xsc, bus_dma_segment_t *segs, int nsegs, int error);
 static int ata_dmaload(struct ata_request *request, void *addr, int *nsegs);
 static int ata_dmaunload(struct ata_request *request);
 
 /* local vars */
 static MALLOC_DEFINE(M_ATADMA, "ata_dma", "ATA driver DMA");
 
 /* misc defines */
 #define MAXTABSZ        PAGE_SIZE
 /* parenthesized so the product stays a single operand wherever it expands */
 #define MAXWSPCSZ       (PAGE_SIZE * 2)
 
 /*
  * Result record filled in by ata_dmasetupc_cb() when a single-segment
  * mapping is loaded.
  */
 struct ata_dc_cb_args {
     bus_addr_t maddr;           /* bus address of segment 0; set only when error is 0 */
     int error;                  /* error value the load callback received */
 };
 
 /*
  * Prepare the DMA state of an ATA channel.
  *
  * Every handler or limit in ch->dma that is still NULL/0 receives its
  * generic default.  After that the parent DMA tag and the workspace tag are
  * created, and MAXWSPCSZ bytes of workspace are allocated and loaded; the
  * resulting bus address is kept in ch->dma.work_bus.  If any step fails a
  * warning is printed and ata_dmafini() is called.
  */
 void
 ata_dmainit(device_t dev)
 {
     struct ata_channel *ch = device_get_softc(dev);
     struct ata_dma *dma = &ch->dma;
     struct ata_dc_cb_args loaded;
     int failed;

     /* generic handlers for whatever was not supplied */
     if (!dma->alloc)
         dma->alloc = ata_dmaalloc;
     if (!dma->free)
         dma->free = ata_dmafree;
     if (!dma->setprd)
         dma->setprd = ata_dmasetprd;
     if (!dma->load)
         dma->load = ata_dmaload;
     if (!dma->unload)
         dma->unload = ata_dmaunload;

     /* generic limits for whatever was not supplied */
     if (!dma->alignment)
         dma->alignment = 2;
     if (!dma->boundary)
         dma->boundary = 65536;
     if (!dma->segsize)
         dma->segsize = 65536;
     if (!dma->max_iosize)
         dma->max_iosize = MIN((ATA_DMA_ENTRIES - 1) * PAGE_SIZE, MAXPHYS);
     if (!dma->max_address)
         dma->max_address = BUS_SPACE_MAXADDR_32BIT;
     if (!dma->dma_slots)
         dma->dma_slots = 1;

     /* parent tag; the workspace tag below is derived from it */
     failed = bus_dma_tag_create(bus_get_dma_tag(dev), dma->alignment, 0,
                                 dma->max_address, BUS_SPACE_MAXADDR,
                                 NULL, NULL, dma->max_iosize,
                                 ATA_DMA_ENTRIES, dma->segsize,
                                 0, NULL, NULL, &dma->dmatag);

     /* workspace tag */
     if (!failed)
         failed = bus_dma_tag_create(dma->dmatag, PAGE_SIZE, 64 * 1024,
                                     dma->max_address, BUS_SPACE_MAXADDR,
                                     NULL, NULL, MAXWSPCSZ, 1, MAXWSPCSZ,
                                     0, NULL, NULL, &dma->work_tag);

     /* workspace memory */
     if (!failed)
         failed = bus_dmamem_alloc(dma->work_tag, (void **)&dma->work,
                                   BUS_DMA_WAITOK | BUS_DMA_COHERENT,
                                   &dma->work_map);

     if (!failed) {
         /* loaded.error is only looked at when the load call itself returned 0 */
         if (bus_dmamap_load(dma->work_tag, dma->work_map, dma->work,
                             MAXWSPCSZ, ata_dmasetupc_cb, &loaded, 0) == 0 &&
             loaded.error == 0) {
             dma->work_bus = loaded.maddr;
             return;
         }
         /* load failed: hand the workspace memory back before the common path */
         bus_dmamem_free(dma->work_tag, dma->work, dma->work_map);
     }

     device_printf(dev, "WARNING - DMA initialization failed, disabling DMA\n");
     ata_dmafini(dev);
 }
 
 /*
  * Release what ata_dmainit() set up: unload and free the workspace when it
  * has a bus address, then destroy the workspace tag and the parent tag when
  * they exist.  Every handle is cleared once released.
  */
 void
 ata_dmafini(device_t dev)
 {
     struct ata_channel *ch = device_get_softc(dev);
     struct ata_dma *dma = &ch->dma;

     /* work_bus is only non-zero after a successful workspace load */
     if (dma->work_bus != 0) {
         bus_dmamap_unload(dma->work_tag, dma->work_map);
         bus_dmamem_free(dma->work_tag, dma->work, dma->work_map);
         dma->work_bus = 0;
         dma->work_map = NULL;
         dma->work = NULL;
     }

     if (dma->work_tag != NULL) {
         bus_dma_tag_destroy(dma->work_tag);
         dma->work_tag = NULL;
     }

     if (dma->dmatag != NULL) {
         bus_dma_tag_destroy(dma->dmatag);
         dma->dmatag = NULL;
     }
 }
 
 /*
  * Load callback for single-segment mappings: records the error value in the
  * struct ata_dc_cb_args passed as xsc and, when there was no error, the bus
  * address of the first segment.
  */
 static void
 ata_dmasetupc_cb(void *xsc, bus_dma_segment_t *segs, int nsegs, int error)
 {
     struct ata_dc_cb_args *result = (struct ata_dc_cb_args *)xsc;

     result->error = error;
     if (error == 0)
         result->maddr = segs[0].ds_addr;
 }
 
 /*
  * Build the DMA slots of a channel.
  *
  * The whole slot array is zeroed first.  For each of the ch->dma.dma_slots
  * slots the scatter/gather tag is created, the table is allocated and
  * loaded (its bus address goes to sg_bus), and then the data tag and data
  * map are created.  The first failing step is reported and everything is
  * released through ata_dmafree().
  */
 static void
 ata_dmaalloc(device_t dev)
 {
     struct ata_channel *ch = device_get_softc(dev);
     struct ata_dma *dma = &ch->dma;
     struct ata_dc_cb_args loaded;
     struct ata_dmaslot *slot;
     int n;

     /* alloc and setup needed dma slots */
     bzero(dma->slot, sizeof(struct ata_dmaslot) * ATA_DMA_SLOTS);

     for (n = 0; n < dma->dma_slots; n++) {
         slot = &dma->slot[n];

         /* tag for the scatter/gather table */
         if (bus_dma_tag_create(dma->dmatag, PAGE_SIZE, PAGE_SIZE,
                                dma->max_address, BUS_SPACE_MAXADDR,
                                NULL, NULL, PAGE_SIZE, 1, PAGE_SIZE,
                                0, NULL, NULL, &slot->sg_tag) != 0) {
             device_printf(ch->dev, "FAILURE - create sg_tag\n");
             goto error;
         }

         /* memory for the table */
         if (bus_dmamem_alloc(slot->sg_tag, (void **)&slot->sg,
                              BUS_DMA_WAITOK, &slot->sg_map) != 0) {
             device_printf(ch->dev, "FAILURE - alloc sg_map\n");
             goto error;
         }

         /* loaded.error is only looked at when the load call returned 0 */
         if (bus_dmamap_load(slot->sg_tag, slot->sg_map, slot->sg, MAXTABSZ,
                             ata_dmasetupc_cb, &loaded, 0) != 0 ||
             loaded.error != 0) {
             device_printf(ch->dev, "FAILURE - load sg\n");
             goto error;
         }
         slot->sg_bus = loaded.maddr;

         /* tag and map used for request data */
         if (bus_dma_tag_create(dma->dmatag,
                                dma->alignment, dma->boundary,
                                dma->max_address, BUS_SPACE_MAXADDR,
                                NULL, NULL, dma->max_iosize,
                                ATA_DMA_ENTRIES, dma->segsize,
                                BUS_DMA_ALLOCNOW, NULL, NULL,
                                &slot->data_tag) != 0) {
             device_printf(ch->dev, "FAILURE - create data_tag\n");
             goto error;
         }

         if (bus_dmamap_create(slot->data_tag, 0, &slot->data_map) != 0) {
             device_printf(ch->dev, "FAILURE - create data_map\n");
             goto error;
         }
     }

     return;

 error:
     device_printf(dev, "WARNING - DMA allocation failed, disabling DMA\n");
     ata_dmafree(dev);
 }
 
 /*
  * Release every DMA slot of a channel (all ATA_DMA_SLOTS entries, whether
  * or not ata_dmaalloc() got as far as setting them up).
  *
  * Each resource is released only if its handle is set, and the handle is
  * cleared afterwards, so the function can run on a partially built slot.
  */
 static void
 ata_dmafree(device_t dev)
 {
     struct ata_channel *ch = device_get_softc(dev);
     int i;

     /* free all dma slots */
     for (i = 0; i < ATA_DMA_SLOTS; i++) {
         struct ata_dmaslot *slot = &ch->dma.slot[i];

         if (slot->sg_bus) {
             bus_dmamap_unload(slot->sg_tag, slot->sg_map);
             slot->sg_bus = 0;
         }
         /*
          * Decide on the memory pointer, not on the map handle: the map
          * handle value says nothing reliable about whether the allocation
          * happened.  The map came from bus_dmamem_alloc(), so
          * bus_dmamem_free() is what releases it; it must not also be handed
          * to bus_dmamap_destroy().
          */
         if (slot->sg) {
             bus_dmamem_free(slot->sg_tag, slot->sg, slot->sg_map);
             slot->sg = NULL;
             slot->sg_map = NULL;
         }
         /* the data map was made by bus_dmamap_create(), so destroy applies */
         if (slot->data_map) {
             bus_dmamap_destroy(slot->data_tag, slot->data_map);
             slot->data_map = NULL;
         }
         if (slot->sg_tag) {
             bus_dma_tag_destroy(slot->sg_tag);
             slot->sg_tag = NULL;
         }
         if (slot->data_tag) {
             bus_dma_tag_destroy(slot->data_tag);
             slot->data_tag = NULL;
         }
     }
 }
 
 /*
  * Load callback that turns a DMA segment list into PRD entries.
  *
  * xsc points to a struct ata_dmasetprd_args.  The error value is always
  * stored in args->error; on error nothing else is touched.  Otherwise one
  * little-endian addr/count pair is written to args->dmatab per segment,
  * the last entry is flagged with ATA_DMA_EOT, and the segment count is
  * returned in args->nsegs.
  */
 static void
 ata_dmasetprd(void *xsc, bus_dma_segment_t *segs, int nsegs, int error)
 {
     struct ata_dmasetprd_args *args = xsc;
     struct ata_dma_prdentry *prd = args->dmatab;
     int i;

     if ((args->error = error))
         return;

     /* check the bound before the table is written rather than after */
     KASSERT(nsegs <= ATA_DMA_ENTRIES, ("too many DMA segment entries\n"));

     for (i = 0; i < nsegs; i++) {
         prd[i].addr = htole32(segs[i].ds_addr);
         prd[i].count = htole32(segs[i].ds_len);
     }

     /* an empty list has no last entry; never index prd[-1] */
     if (nsegs > 0)
         prd[nsegs - 1].count |= htole32(ATA_DMA_EOT);
     args->nsegs = nsegs;
 }
 
 /*
  * Map the data of a request for DMA and build its transfer table.
  *
  * request - request to map; request->dma must be NULL on entry and points
  *           to the channel's slot 0 afterwards
  * addr    - table for the setprd callback to fill; when NULL the slot's own
  *           sg table is used
  * entries - when non-NULL, receives the segment count reported by the
  *           setprd callback
  *
  * Returns 0 on success.  Every failure returns EIO; the error value of the
  * load itself is not passed on.
  */
 static int
 ata_dmaload(struct ata_request *request, void *addr, int *entries)
 {
     struct ata_channel *ch = device_get_softc(request->parent);
     struct ata_dmasetprd_args dspa;
     int error;
 
     ATA_DEBUG_RQ(request, "dmaload");
 
     /* sanity checks: slot must be free, length non-zero, aligned, in range */
     if (request->dma) {
 	device_printf(request->parent,
 		      "FAILURE - already active DMA on this device\n");
 	return EIO;
     }
     if (!request->bytecount) {
 	device_printf(request->parent,
 		      "FAILURE - zero length DMA transfer attempted\n");
 	return EIO;
     }
     /* mask test; presumably alignment is a power of two -- TODO confirm */
     if (request->bytecount & (ch->dma.alignment - 1)) {
 	device_printf(request->parent,
 		      "FAILURE - odd-sized DMA transfer attempt %d %% %d\n",
 		      request->bytecount, ch->dma.alignment);
 	return EIO;
     }
     if (request->bytecount > ch->dma.max_iosize) {
 	device_printf(request->parent,
 		      "FAILURE - oversized DMA transfer attempt %d > %d\n",
 		      request->bytecount, ch->dma.max_iosize);
 	return EIO;
     }
 
     /* set our slot. XXX SOS NCQ will change that */
     request->dma = &ch->dma.slot[0];
 
     /* table the setprd callback writes into */
     if (addr)
 	dspa.dmatab = addr;
     else
 	dspa.dmatab = request->dma->sg;
 
     /* with ATA_CAM the data may be described by the CCB instead of request->data */
 #ifdef ATA_CAM
-    if (request->ccb)
+    if (request->flags & ATA_R_DATA_IN_CCB)
         error = bus_dmamap_load_ccb(request->dma->data_tag,
 				request->dma->data_map, request->ccb,
 				ch->dma.setprd, &dspa, BUS_DMA_NOWAIT);
     else
 #endif
         error = bus_dmamap_load(request->dma->data_tag, request->dma->data_map,
 				request->data, request->bytecount,
 				ch->dma.setprd, &dspa, BUS_DMA_NOWAIT);
     /* dspa.error is only consulted when the load call itself returned 0 */
     if (error || (error = dspa.error)) {
 	device_printf(request->parent, "FAILURE - load data\n");
 	goto error;
     }
 
     if (entries)
 	*entries = dspa.nsegs;
 
     /* sync the table for writing and the data according to the direction */
     bus_dmamap_sync(request->dma->sg_tag, request->dma->sg_map,
 		    BUS_DMASYNC_PREWRITE);
     bus_dmamap_sync(request->dma->data_tag, request->dma->data_map,
 		    (request->flags & ATA_R_READ) ?
 		    BUS_DMASYNC_PREREAD : BUS_DMASYNC_PREWRITE);
     return 0;
 
 error:
     /* releases the slot (request->dma becomes NULL again) */
     ata_dmaunload(request);
     return EIO;
 }
 
 int
 ata_dmaunload(struct ata_request *request)
 {
     ATA_DEBUG_RQ(request, "dmaunload");
 
     if (request->dma) {
 	bus_dmamap_sync(request->dma->sg_tag, request->dma->sg_map,
 			BUS_DMASYNC_POSTWRITE);
 	bus_dmamap_sync(request->dma->data_tag, request->dma->data_map,
 			(request->flags & ATA_R_READ) ?
 			BUS_DMASYNC_POSTREAD : BUS_DMASYNC_POSTWRITE);
 
 	bus_dmamap_unload(request->dma->data_tag, request->dma->data_map);
         request->dma = NULL;
     }
     return 0;
 }
Index: user/attilio/vmobj-rwlock/sys/dev/ath/ath_hal/ar5416/ar5416_xmit.c
===================================================================
--- user/attilio/vmobj-rwlock/sys/dev/ath/ath_hal/ar5416/ar5416_xmit.c	(revision 247191)
+++ user/attilio/vmobj-rwlock/sys/dev/ath/ath_hal/ar5416/ar5416_xmit.c	(revision 247192)
@@ -1,1211 +1,1280 @@
 /*
  * Copyright (c) 2002-2009 Sam Leffler, Errno Consulting
  * Copyright (c) 2002-2008 Atheros Communications, Inc.
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
  * copyright notice and this permission notice appear in all copies.
  *
  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  *
  * $FreeBSD$
  */
 #include "opt_ah.h"
 
 #include "ah.h"
 #include "ah_desc.h"
 #include "ah_internal.h"
 
 #include "ar5416/ar5416.h"
 #include "ar5416/ar5416reg.h"
 #include "ar5416/ar5416phy.h"
 #include "ar5416/ar5416desc.h"
 
 /*
  * Stop transmit on the specified queue
  */
 HAL_BOOL
 ar5416StopTxDma(struct ath_hal *ah, u_int q)
 {
 #define	STOP_DMA_TIMEOUT	4000	/* us */
 #define	STOP_DMA_ITER		100	/* us */
 	u_int i;
 
 	HALASSERT(q < AH_PRIVATE(ah)->ah_caps.halTotalQueues);
 
 	HALASSERT(AH5212(ah)->ah_txq[q].tqi_type != HAL_TX_QUEUE_INACTIVE);
 
 	OS_REG_WRITE(ah, AR_Q_TXD, 1 << q);
 	for (i = STOP_DMA_TIMEOUT/STOP_DMA_ITER; i != 0; i--) {
 		if (ar5212NumTxPending(ah, q) == 0)
 			break;
 		OS_DELAY(STOP_DMA_ITER);
 	}
 #ifdef AH_DEBUG
 	if (i == 0) {
 		HALDEBUG(ah, HAL_DEBUG_ANY,
 		    "%s: queue %u DMA did not stop in 400 msec\n", __func__, q);
 		HALDEBUG(ah, HAL_DEBUG_ANY,
 		    "%s: QSTS 0x%x Q_TXE 0x%x Q_TXD 0x%x Q_CBR 0x%x\n", __func__,
 		    OS_REG_READ(ah, AR_QSTS(q)), OS_REG_READ(ah, AR_Q_TXE),
 		    OS_REG_READ(ah, AR_Q_TXD), OS_REG_READ(ah, AR_QCBRCFG(q)));
 		HALDEBUG(ah, HAL_DEBUG_ANY,
 		    "%s: Q_MISC 0x%x Q_RDYTIMECFG 0x%x Q_RDYTIMESHDN 0x%x\n",
 		    __func__, OS_REG_READ(ah, AR_QMISC(q)),
 		    OS_REG_READ(ah, AR_QRDYTIMECFG(q)),
 		    OS_REG_READ(ah, AR_Q_RDYTIMESHDN));
 	}
 #endif /* AH_DEBUG */
 
 	/* ar5416 and up can kill packets at the PCU level */
 	if (ar5212NumTxPending(ah, q)) {
 		uint32_t j;
 
 		HALDEBUG(ah, HAL_DEBUG_TXQUEUE,
 		    "%s: Num of pending TX Frames %d on Q %d\n",
 		    __func__, ar5212NumTxPending(ah, q), q);
 
 		/* Kill last PCU Tx Frame */
 		/* TODO - save off and restore current values of Q1/Q2? */
 		for (j = 0; j < 2; j++) {
 			uint32_t tsfLow = OS_REG_READ(ah, AR_TSF_L32);
 			OS_REG_WRITE(ah, AR_QUIET2,
 			    SM(10, AR_QUIET2_QUIET_DUR));
 			OS_REG_WRITE(ah, AR_QUIET_PERIOD, 100);
 			OS_REG_WRITE(ah, AR_NEXT_QUIET, tsfLow >> 10);
 			OS_REG_SET_BIT(ah, AR_TIMER_MODE, AR_TIMER_MODE_QUIET);
 
 			if ((OS_REG_READ(ah, AR_TSF_L32)>>10) == (tsfLow>>10))
 				break;
 
 			HALDEBUG(ah, HAL_DEBUG_ANY,
 			    "%s: TSF moved while trying to set quiet time "
 			    "TSF: 0x%08x\n", __func__, tsfLow);
 			HALASSERT(j < 1); /* TSF shouldn't count twice or reg access is taking forever */
 		}
 		
 		OS_REG_SET_BIT(ah, AR_DIAG_SW, AR_DIAG_CHAN_IDLE);
 		
 		/* Allow the quiet mechanism to do its work */
 		OS_DELAY(200);
 		OS_REG_CLR_BIT(ah, AR_TIMER_MODE, AR_TIMER_MODE_QUIET);
 
 		/* Verify the transmit q is empty */
 		for (i = STOP_DMA_TIMEOUT/STOP_DMA_ITER; i != 0; i--) {
 			if (ar5212NumTxPending(ah, q) == 0)
 				break;
 			OS_DELAY(STOP_DMA_ITER);
 		}
 		if (i == 0) {
 			HALDEBUG(ah, HAL_DEBUG_ANY,
 			    "%s: Failed to stop Tx DMA in %d msec after killing"
 			    " last frame\n", __func__, STOP_DMA_TIMEOUT / 1000);
 		}
 		OS_REG_CLR_BIT(ah, AR_DIAG_SW, AR_DIAG_CHAN_IDLE);
 	}
 
 	OS_REG_WRITE(ah, AR_Q_TXD, 0);
 	return (i != 0);
 #undef STOP_DMA_ITER
 #undef STOP_DMA_TIMEOUT
 }
 
 #define VALID_KEY_TYPES \
         ((1 << HAL_KEY_TYPE_CLEAR) | (1 << HAL_KEY_TYPE_WEP)|\
          (1 << HAL_KEY_TYPE_AES)   | (1 << HAL_KEY_TYPE_TKIP))
 #define isValidKeyType(_t)      ((1 << (_t)) & VALID_KEY_TYPES)
 
 #define set11nTries(_series, _index) \
         (SM((_series)[_index].Tries, AR_XmitDataTries##_index))
 
 #define set11nRate(_series, _index) \
         (SM((_series)[_index].Rate, AR_XmitRate##_index))
 
 #define set11nPktDurRTSCTS(_series, _index) \
         (SM((_series)[_index].PktDuration, AR_PacketDur##_index) |\
          ((_series)[_index].RateFlags & HAL_RATESERIES_RTS_CTS   ?\
          AR_RTSCTSQual##_index : 0))
 
 #define set11nRateFlags(_series, _index) \
         ((_series)[_index].RateFlags & HAL_RATESERIES_2040 ? AR_2040_##_index : 0) \
         |((_series)[_index].RateFlags & HAL_RATESERIES_HALFGI ? AR_GI##_index : 0) \
         |((_series)[_index].RateFlags & HAL_RATESERIES_STBC ? AR_STBC##_index : 0) \
         |SM((_series)[_index].ChSel, AR_ChainSel##_index)
 
 /*
  * Descriptor Access Functions
  */
 
 #define VALID_PKT_TYPES \
         ((1<<HAL_PKT_TYPE_NORMAL)|(1<<HAL_PKT_TYPE_ATIM)|\
          (1<<HAL_PKT_TYPE_PSPOLL)|(1<<HAL_PKT_TYPE_PROBE_RESP)|\
          (1<<HAL_PKT_TYPE_BEACON)|(1<<HAL_PKT_TYPE_AMPDU))
 #define isValidPktType(_t)      ((1<<(_t)) & VALID_PKT_TYPES)
 #define VALID_TX_RATES \
         ((1<<0x0b)|(1<<0x0f)|(1<<0x0a)|(1<<0x0e)|(1<<0x09)|(1<<0x0d)|\
          (1<<0x08)|(1<<0x0c)|(1<<0x1b)|(1<<0x1a)|(1<<0x1e)|(1<<0x19)|\
 	 (1<<0x1d)|(1<<0x18)|(1<<0x1c)|(1<<0x01)|(1<<0x02)|(1<<0x03)|\
 	 (1<<0x04)|(1<<0x05)|(1<<0x06)|(1<<0x07)|(1<<0x00))
 /* NB: accept HT rates */
 #define	isValidTxRate(_r)	((1<<((_r) & 0x7f)) & VALID_TX_RATES)
 
 HAL_BOOL
 ar5416SetupTxDesc(struct ath_hal *ah, struct ath_desc *ds,
 	u_int pktLen,
 	u_int hdrLen,
 	HAL_PKT_TYPE type,
 	u_int txPower,
 	u_int txRate0, u_int txTries0,
 	u_int keyIx,
 	u_int antMode,
 	u_int flags,
 	u_int rtsctsRate,
 	u_int rtsctsDuration,
 	u_int compicvLen,
 	u_int compivLen,
 	u_int comp)
 {
 #define	RTSCTS	(HAL_TXDESC_RTSENA|HAL_TXDESC_CTSENA)
 	struct ar5416_desc *ads = AR5416DESC(ds);
 	struct ath_hal_5416 *ahp = AH5416(ah);
 
 	(void) hdrLen;
 
 	HALASSERT(txTries0 != 0);
 	HALASSERT(isValidPktType(type));
 	HALASSERT(isValidTxRate(txRate0));
 	HALASSERT((flags & RTSCTS) != RTSCTS);
 	/* XXX validate antMode */
 
         txPower = (txPower + AH5212(ah)->ah_txPowerIndexOffset);
         if (txPower > 63)
 		txPower = 63;
 
 	ads->ds_ctl0 = (pktLen & AR_FrameLen)
 		     | (txPower << AR_XmitPower_S)
 		     | (flags & HAL_TXDESC_VEOL ? AR_VEOL : 0)
 		     | (flags & HAL_TXDESC_CLRDMASK ? AR_ClrDestMask : 0)
 		     | (flags & HAL_TXDESC_INTREQ ? AR_TxIntrReq : 0)
 		     ;
 	ads->ds_ctl1 = (type << AR_FrameType_S)
 		     | (flags & HAL_TXDESC_NOACK ? AR_NoAck : 0)
                      ;
 	ads->ds_ctl2 = SM(txTries0, AR_XmitDataTries0)
 		     | (flags & HAL_TXDESC_DURENA ? AR_DurUpdateEn : 0)
 		     ;
 	ads->ds_ctl3 = (txRate0 << AR_XmitRate0_S)
 		     ;
 	ads->ds_ctl4 = 0;
 	ads->ds_ctl5 = 0;
 	ads->ds_ctl6 = 0;
 	ads->ds_ctl7 = SM(ahp->ah_tx_chainmask, AR_ChainSel0) 
 		     | SM(ahp->ah_tx_chainmask, AR_ChainSel1)
 		     | SM(ahp->ah_tx_chainmask, AR_ChainSel2) 
 		     | SM(ahp->ah_tx_chainmask, AR_ChainSel3)
 		     ;
 	ads->ds_ctl8 = SM(0, AR_AntCtl0);
 	ads->ds_ctl9 = SM(0, AR_AntCtl1) | SM(txPower, AR_XmitPower1);
 	ads->ds_ctl10 = SM(0, AR_AntCtl2) | SM(txPower, AR_XmitPower2);
 	ads->ds_ctl11 = SM(0, AR_AntCtl3) | SM(txPower, AR_XmitPower3);
 
 	if (keyIx != HAL_TXKEYIX_INVALID) {
 		/* XXX validate key index */
 		ads->ds_ctl1 |= SM(keyIx, AR_DestIdx);
 		ads->ds_ctl0 |= AR_DestIdxValid;
 		ads->ds_ctl6 |= SM(ahp->ah_keytype[keyIx], AR_EncrType);
 	}
 	if (flags & RTSCTS) {
 		if (!isValidTxRate(rtsctsRate)) {
 			HALDEBUG(ah, HAL_DEBUG_ANY,
 			    "%s: invalid rts/cts rate 0x%x\n",
 			    __func__, rtsctsRate);
 			return AH_FALSE;
 		}
 		/* XXX validate rtsctsDuration */
 		ads->ds_ctl0 |= (flags & HAL_TXDESC_CTSENA ? AR_CTSEnable : 0)
 			     | (flags & HAL_TXDESC_RTSENA ? AR_RTSEnable : 0)
 			     ;
 		ads->ds_ctl7 |= (rtsctsRate << AR_RTSCTSRate_S);
 	}
 
 	/*
 	 * Set the TX antenna to 0 for Kite
 	 * To preserve existing behaviour, also set the TPC bits to 0;
 	 * when TPC is enabled these should be filled in appropriately.
 	 */
 	if (AR_SREV_KITE(ah)) {
 		ads->ds_ctl8 = SM(0, AR_AntCtl0);
 		ads->ds_ctl9 = SM(0, AR_AntCtl1) | SM(0, AR_XmitPower1);
 		ads->ds_ctl10 = SM(0, AR_AntCtl2) | SM(0, AR_XmitPower2);
 		ads->ds_ctl11 = SM(0, AR_AntCtl3) | SM(0, AR_XmitPower3);
 	}
 	return AH_TRUE;
 #undef RTSCTS
 }
 
 HAL_BOOL
 ar5416SetupXTxDesc(struct ath_hal *ah, struct ath_desc *ds,
 	u_int txRate1, u_int txTries1,
 	u_int txRate2, u_int txTries2,
 	u_int txRate3, u_int txTries3)
 {
 	struct ar5416_desc *ads = AR5416DESC(ds);
 
 	if (txTries1) {
 		HALASSERT(isValidTxRate(txRate1));
 		ads->ds_ctl2 |= SM(txTries1, AR_XmitDataTries1);
 		ads->ds_ctl3 |= (txRate1 << AR_XmitRate1_S);
 	}
 	if (txTries2) {
 		HALASSERT(isValidTxRate(txRate2));
 		ads->ds_ctl2 |= SM(txTries2, AR_XmitDataTries2);
 		ads->ds_ctl3 |= (txRate2 << AR_XmitRate2_S);
 	}
 	if (txTries3) {
 		HALASSERT(isValidTxRate(txRate3));
 		ads->ds_ctl2 |= SM(txTries3, AR_XmitDataTries3);
 		ads->ds_ctl3 |= (txRate3 << AR_XmitRate3_S);
 	}
 	return AH_TRUE;
 }
 
 HAL_BOOL
 ar5416FillTxDesc(struct ath_hal *ah, struct ath_desc *ds,
 	HAL_DMA_ADDR *bufAddrList, uint32_t *segLenList, u_int descId,
 	u_int qcuId, HAL_BOOL firstSeg, HAL_BOOL lastSeg,
 	const struct ath_desc *ds0)
 {
 	struct ar5416_desc *ads = AR5416DESC(ds);
 	uint32_t segLen = segLenList[0];
 
 	HALASSERT((segLen &~ AR_BufLen) == 0);
 
 	ds->ds_data = bufAddrList[0];
 
 	if (firstSeg) {
 		/*
 		 * First descriptor, don't clobber xmit control data
 		 * setup by ar5212SetupTxDesc.
 		 */
 		ads->ds_ctl1 |= segLen | (lastSeg ? 0 : AR_TxMore);
 	} else if (lastSeg) {		/* !firstSeg && lastSeg */
 		/*
 		 * Last descriptor in a multi-descriptor frame,
 		 * copy the multi-rate transmit parameters from
 		 * the first frame for processing on completion. 
 		 */
 		ads->ds_ctl1 = segLen;
 #ifdef AH_NEED_DESC_SWAP
 		ads->ds_ctl0 = __bswap32(AR5416DESC_CONST(ds0)->ds_ctl0)
 		    & AR_TxIntrReq;
 		ads->ds_ctl2 = __bswap32(AR5416DESC_CONST(ds0)->ds_ctl2);
 		ads->ds_ctl3 = __bswap32(AR5416DESC_CONST(ds0)->ds_ctl3);
 		/* ctl6 - we only need encrtype; the rest are blank */
 		ads->ds_ctl6 = __bswap32(AR5416DESC_CONST(ds0)->ds_ctl6 & AR_EncrType);
 #else
 		ads->ds_ctl0 = AR5416DESC_CONST(ds0)->ds_ctl0 & AR_TxIntrReq;
 		ads->ds_ctl2 = AR5416DESC_CONST(ds0)->ds_ctl2;
 		ads->ds_ctl3 = AR5416DESC_CONST(ds0)->ds_ctl3;
 		/* ctl6 - we only need encrtype; the rest are blank */
 		ads->ds_ctl6 = AR5416DESC_CONST(ds0)->ds_ctl6 & AR_EncrType;
 #endif
 	} else {			/* !firstSeg && !lastSeg */
 		/*
 		 * Intermediate descriptor in a multi-descriptor frame.
 		 */
 #ifdef AH_NEED_DESC_SWAP
 		ads->ds_ctl0 = __bswap32(AR5416DESC_CONST(ds0)->ds_ctl0)
 		    & AR_TxIntrReq;
 		ads->ds_ctl6 = __bswap32(AR5416DESC_CONST(ds0)->ds_ctl6 & AR_EncrType);
 #else
 		ads->ds_ctl0 = AR5416DESC_CONST(ds0)->ds_ctl0 & AR_TxIntrReq;
 		ads->ds_ctl6 = AR5416DESC_CONST(ds0)->ds_ctl6 & AR_EncrType;
 #endif
 		ads->ds_ctl1 = segLen | AR_TxMore;
 		ads->ds_ctl2 = 0;
 		ads->ds_ctl3 = 0;
 	}
 	/* XXX only on last descriptor? */
 	OS_MEMZERO(ads->u.tx.status, sizeof(ads->u.tx.status));
 	return AH_TRUE;
 }
 
 /*
  * NB: cipher is no longer used, it's calculated.
  */
 HAL_BOOL
 ar5416ChainTxDesc(struct ath_hal *ah, struct ath_desc *ds,
 	HAL_DMA_ADDR *bufAddrList,
 	uint32_t *segLenList,
 	u_int pktLen,
 	u_int hdrLen,
 	HAL_PKT_TYPE type,
 	u_int keyIx,
 	HAL_CIPHER cipher,
 	uint8_t delims,
 	HAL_BOOL firstSeg,
 	HAL_BOOL lastSeg,
 	HAL_BOOL lastAggr)
 {
 	struct ar5416_desc *ads = AR5416DESC(ds);
 	uint32_t *ds_txstatus = AR5416_DS_TXSTATUS(ah,ads);
 	struct ath_hal_5416 *ahp = AH5416(ah);
 	u_int segLen = segLenList[0];
 
 	int isaggr = 0;
 	uint32_t last_aggr = 0;
 	
 	(void) hdrLen;
 	(void) ah;
 
 	HALASSERT((segLen &~ AR_BufLen) == 0);
 	ds->ds_data = bufAddrList[0];
 
 	HALASSERT(isValidPktType(type));
 	if (type == HAL_PKT_TYPE_AMPDU) {
 		type = HAL_PKT_TYPE_NORMAL;
 		isaggr = 1;
 		if (lastAggr == AH_FALSE)
 			last_aggr = AR_MoreAggr;
 	}
 
 	/*
 	 * Since this function is called before any of the other
 	 * descriptor setup functions (at least in this particular
 	 * 802.11n aggregation implementation), always bzero() the
 	 * descriptor. Previously this would be done for all but
 	 * the first segment.
 	 * XXX TODO: figure out why; perhaps I'm using this slightly
 	 * XXX incorrectly.
 	 */
 	OS_MEMZERO(ds->ds_hw, AR5416_DESC_TX_CTL_SZ);
 
 	/*
 	 * Note: VEOL should only be for the last descriptor in the chain.
 	 */
 	ads->ds_ctl0 = (pktLen & AR_FrameLen);
 
 	/*
 	 * For aggregates:
 	 * + IsAggr must be set for all descriptors of all subframes of
 	 *   the aggregate
 	 * + MoreAggr must be set for all descriptors of all subframes
 	 *   of the aggregate EXCEPT the last subframe;
 	 * + MoreAggr must be _CLEAR_ for all descrpitors of the last
 	 *   subframe of the aggregate.
 	 */
 	ads->ds_ctl1 = (type << AR_FrameType_S)
 			| (isaggr ? (AR_IsAggr | last_aggr) : 0);
 
 	ads->ds_ctl2 = 0;
 	ads->ds_ctl3 = 0;
 	if (keyIx != HAL_TXKEYIX_INVALID) {
-		/* XXX validate key index */
+		/* XXX validate key index; ah_keytype[] needs a valid keyIx */
 		ads->ds_ctl1 |= SM(keyIx, AR_DestIdx);
 		ads->ds_ctl0 |= AR_DestIdxValid;
+		ads->ds_ctl6 |= SM(ahp->ah_keytype[keyIx], AR_EncrType);
 	}
 
-	ads->ds_ctl6 |= SM(ahp->ah_keytype[keyIx], AR_EncrType);
 	if (isaggr) {
 		ads->ds_ctl6 |= SM(delims, AR_PadDelim);
 	}
 
 	if (firstSeg) {
 		ads->ds_ctl1 |= segLen | (lastSeg ? 0 : AR_TxMore);
 	} else if (lastSeg) {           /* !firstSeg && lastSeg */
 		ads->ds_ctl0 = 0;
 		ads->ds_ctl1 |= segLen;
 	} else {                        /* !firstSeg && !lastSeg */
 		/*
 		 * Intermediate descriptor in a multi-descriptor frame.
 		 */
 		ads->ds_ctl0 = 0;
 		ads->ds_ctl1 |= segLen | AR_TxMore;
 	}
 	ds_txstatus[0] = ds_txstatus[1] = 0;
 	ds_txstatus[9] &= ~AR_TxDone;
 	
 	return AH_TRUE;
 }
 
 HAL_BOOL
 ar5416SetupFirstTxDesc(struct ath_hal *ah, struct ath_desc *ds,
 	u_int aggrLen, u_int flags, u_int txPower,
 	u_int txRate0, u_int txTries0, u_int antMode,
 	u_int rtsctsRate, u_int rtsctsDuration)
 {
 #define RTSCTS  (HAL_TXDESC_RTSENA|HAL_TXDESC_CTSENA)
 	struct ar5416_desc *ads = AR5416DESC(ds);
 	struct ath_hal_5212 *ahp = AH5212(ah);
 
 	HALASSERT(txTries0 != 0);
 	HALASSERT(isValidTxRate(txRate0));
 	HALASSERT((flags & RTSCTS) != RTSCTS);
 	/* XXX validate antMode */
 	
 	txPower = (txPower + ahp->ah_txPowerIndexOffset );
 	if(txPower > 63)  txPower=63;
 
 	ads->ds_ctl0 |= (txPower << AR_XmitPower_S)
 		| (flags & HAL_TXDESC_VEOL ? AR_VEOL : 0)
 		| (flags & HAL_TXDESC_CLRDMASK ? AR_ClrDestMask : 0)
 		| (flags & HAL_TXDESC_INTREQ ? AR_TxIntrReq : 0);
 	ads->ds_ctl1 |= (flags & HAL_TXDESC_NOACK ? AR_NoAck : 0);
 	ads->ds_ctl2 |= SM(txTries0, AR_XmitDataTries0);
 	ads->ds_ctl3 |= (txRate0 << AR_XmitRate0_S);
 	ads->ds_ctl7 = SM(AH5416(ah)->ah_tx_chainmask, AR_ChainSel0) 
 		| SM(AH5416(ah)->ah_tx_chainmask, AR_ChainSel1)
 		| SM(AH5416(ah)->ah_tx_chainmask, AR_ChainSel2) 
 		| SM(AH5416(ah)->ah_tx_chainmask, AR_ChainSel3);
 	
 	/* NB: no V1 WAR */
 	ads->ds_ctl8 = SM(0, AR_AntCtl0);
 	ads->ds_ctl9 = SM(0, AR_AntCtl1) | SM(txPower, AR_XmitPower1);
 	ads->ds_ctl10 = SM(0, AR_AntCtl2) | SM(txPower, AR_XmitPower2);
 	ads->ds_ctl11 = SM(0, AR_AntCtl3) | SM(txPower, AR_XmitPower3);
 
 	ads->ds_ctl6 &= ~(0xffff);
 	ads->ds_ctl6 |= SM(aggrLen, AR_AggrLen);
 
 	if (flags & RTSCTS) {
 		/* XXX validate rtsctsDuration */
 		ads->ds_ctl0 |= (flags & HAL_TXDESC_CTSENA ? AR_CTSEnable : 0)
 			| (flags & HAL_TXDESC_RTSENA ? AR_RTSEnable : 0);
 	}
 
 	/*
 	 * Set the TX antenna to 0 for Kite
 	 * To preserve existing behaviour, also set the TPC bits to 0;
 	 * when TPC is enabled these should be filled in appropriately.
 	 */
 	if (AR_SREV_KITE(ah)) {
 		ads->ds_ctl8 = SM(0, AR_AntCtl0);
 		ads->ds_ctl9 = SM(0, AR_AntCtl1) | SM(0, AR_XmitPower1);
 		ads->ds_ctl10 = SM(0, AR_AntCtl2) | SM(0, AR_XmitPower2);
 		ads->ds_ctl11 = SM(0, AR_AntCtl3) | SM(0, AR_XmitPower3);
 	}
 	
 	return AH_TRUE;
 #undef RTSCTS
 }
 
 HAL_BOOL
 ar5416SetupLastTxDesc(struct ath_hal *ah, struct ath_desc *ds,
 		const struct ath_desc *ds0)
 {
 	struct ar5416_desc *ads = AR5416DESC(ds);
 
 	ads->ds_ctl1 &= ~AR_MoreAggr;
 	ads->ds_ctl6 &= ~AR_PadDelim;
 
 	/* hack to copy rate info to last desc for later processing */
 #ifdef AH_NEED_DESC_SWAP
 	ads->ds_ctl2 = __bswap32(AR5416DESC_CONST(ds0)->ds_ctl2);
 	ads->ds_ctl3 = __bswap32(AR5416DESC_CONST(ds0)->ds_ctl3);
 #else
 	ads->ds_ctl2 = AR5416DESC_CONST(ds0)->ds_ctl2;
 	ads->ds_ctl3 = AR5416DESC_CONST(ds0)->ds_ctl3;
 #endif
 	return AH_TRUE;
 }
 
 #ifdef AH_NEED_DESC_SWAP
 /* Swap transmit descriptor */
 static __inline void
 ar5416SwapTxDesc(struct ath_desc *ds)
 {
 	ds->ds_data = __bswap32(ds->ds_data);
 	ds->ds_ctl0 = __bswap32(ds->ds_ctl0);
 	ds->ds_ctl1 = __bswap32(ds->ds_ctl1);
 	ds->ds_hw[0] = __bswap32(ds->ds_hw[0]);
 	ds->ds_hw[1] = __bswap32(ds->ds_hw[1]);
 	ds->ds_hw[2] = __bswap32(ds->ds_hw[2]);
 	ds->ds_hw[3] = __bswap32(ds->ds_hw[3]);
 }
 #endif
 
 /*
  * Processing of HW TX descriptor.
  */
 HAL_STATUS
 ar5416ProcTxDesc(struct ath_hal *ah,
 	struct ath_desc *ds, struct ath_tx_status *ts)
 {
 	struct ar5416_desc *ads = AR5416DESC(ds);
 	uint32_t *ds_txstatus = AR5416_DS_TXSTATUS(ah,ads);
 
 #ifdef AH_NEED_DESC_SWAP
 	if ((ds_txstatus[9] & __bswap32(AR_TxDone)) == 0)
 		return HAL_EINPROGRESS;
 	ar5416SwapTxDesc(ds);
 #else
 	if ((ds_txstatus[9] & AR_TxDone) == 0)
 		return HAL_EINPROGRESS;
 #endif
 
 	/* Update software copies of the HW status */
 	ts->ts_seqnum = MS(ds_txstatus[9], AR_SeqNum);
 	ts->ts_tstamp = AR_SendTimestamp(ds_txstatus);
 	ts->ts_tid = MS(ds_txstatus[9], AR_TxTid);
 
 	ts->ts_status = 0;
 	if (ds_txstatus[1] & AR_ExcessiveRetries)
 		ts->ts_status |= HAL_TXERR_XRETRY;
 	if (ds_txstatus[1] & AR_Filtered)
 		ts->ts_status |= HAL_TXERR_FILT;
 	if (ds_txstatus[1] & AR_FIFOUnderrun)
 		ts->ts_status |= HAL_TXERR_FIFO;
 	if (ds_txstatus[9] & AR_TxOpExceeded)
 		ts->ts_status |= HAL_TXERR_XTXOP;
 	if (ds_txstatus[1] & AR_TxTimerExpired)
 		ts->ts_status |= HAL_TXERR_TIMER_EXPIRED;
 
 	ts->ts_flags  = 0;
 	if (ds_txstatus[0] & AR_TxBaStatus) {
 		ts->ts_flags |= HAL_TX_BA;
 		ts->ts_ba_low = AR_BaBitmapLow(ds_txstatus);
 		ts->ts_ba_high = AR_BaBitmapHigh(ds_txstatus);
 	}
 	if (ds->ds_ctl1 & AR_IsAggr)
 		ts->ts_flags |= HAL_TX_AGGR;
 	if (ds_txstatus[1] & AR_DescCfgErr)
 		ts->ts_flags |= HAL_TX_DESC_CFG_ERR;
 	if (ds_txstatus[1] & AR_TxDataUnderrun)
 		ts->ts_flags |= HAL_TX_DATA_UNDERRUN;
 	if (ds_txstatus[1] & AR_TxDelimUnderrun)
 		ts->ts_flags |= HAL_TX_DELIM_UNDERRUN;
 
 	/*
 	 * Extract the transmit rate used and mark the rate as
 	 * ``alternate'' if it wasn't the series 0 rate.
 	 */
 	ts->ts_finaltsi =  MS(ds_txstatus[9], AR_FinalTxIdx);
 	switch (ts->ts_finaltsi) {
 	case 0:
 		ts->ts_rate = MS(ads->ds_ctl3, AR_XmitRate0);
 		break;
 	case 1:
 		ts->ts_rate = MS(ads->ds_ctl3, AR_XmitRate1);
 		break;
 	case 2:
 		ts->ts_rate = MS(ads->ds_ctl3, AR_XmitRate2);
 		break;
 	case 3:
 		ts->ts_rate = MS(ads->ds_ctl3, AR_XmitRate3);
 		break;
 	}
 
 	ts->ts_rssi = MS(ds_txstatus[5], AR_TxRSSICombined);
 	ts->ts_rssi_ctl[0] = MS(ds_txstatus[0], AR_TxRSSIAnt00);
 	ts->ts_rssi_ctl[1] = MS(ds_txstatus[0], AR_TxRSSIAnt01);
 	ts->ts_rssi_ctl[2] = MS(ds_txstatus[0], AR_TxRSSIAnt02);
 	ts->ts_rssi_ext[0] = MS(ds_txstatus[5], AR_TxRSSIAnt10);
 	ts->ts_rssi_ext[1] = MS(ds_txstatus[5], AR_TxRSSIAnt11);
 	ts->ts_rssi_ext[2] = MS(ds_txstatus[5], AR_TxRSSIAnt12);
 	ts->ts_evm0 = AR_TxEVM0(ds_txstatus);
 	ts->ts_evm1 = AR_TxEVM1(ds_txstatus);
 	ts->ts_evm2 = AR_TxEVM2(ds_txstatus);
 
 	ts->ts_shortretry = MS(ds_txstatus[1], AR_RTSFailCnt);
 	ts->ts_longretry = MS(ds_txstatus[1], AR_DataFailCnt);
 	/*
 	 * The retry count has the number of un-acked tries for the
 	 * final series used.  When doing multi-rate retry we must
 	 * fixup the retry count by adding in the try counts for
 	 * each series that was fully-processed.  Beware that this
 	 * takes values from the try counts in the final descriptor.
 	 * These are not required by the hardware.  We assume they
 	 * are placed there by the driver as otherwise we have no
 	 * access and the driver can't do the calculation because it
 	 * doesn't know the descriptor format.
 	 */
 	switch (ts->ts_finaltsi) {
 	case 3: ts->ts_longretry += MS(ads->ds_ctl2, AR_XmitDataTries2);
 	case 2: ts->ts_longretry += MS(ads->ds_ctl2, AR_XmitDataTries1);
 	case 1: ts->ts_longretry += MS(ads->ds_ctl2, AR_XmitDataTries0);
 	}
 
 	/*
 	 * These fields are not used. Zero these to preserve compatability
 	 * with existing drivers.
 	 */
 	ts->ts_virtcol = MS(ads->ds_ctl1, AR_VirtRetryCnt);
 	ts->ts_antenna = 0; /* We don't switch antennas on Owl*/
 
 	/* handle tx trigger level changes internally */
 	if ((ts->ts_status & HAL_TXERR_FIFO) ||
 	    (ts->ts_flags & (HAL_TX_DATA_UNDERRUN | HAL_TX_DELIM_UNDERRUN)))
 		ar5212UpdateTxTrigLevel(ah, AH_TRUE);
 
 	return HAL_OK;
 }
 
 HAL_BOOL
 ar5416SetGlobalTxTimeout(struct ath_hal *ah, u_int tu)
 {
 	struct ath_hal_5416 *ahp = AH5416(ah);
 
 	if (tu > 0xFFFF) {
 		HALDEBUG(ah, HAL_DEBUG_ANY, "%s: bad global tx timeout %u\n",
 		    __func__, tu);
 		/* restore default handling */
 		ahp->ah_globaltxtimeout = (u_int) -1;
 		return AH_FALSE;
 	}
 	OS_REG_RMW_FIELD(ah, AR_GTXTO, AR_GTXTO_TIMEOUT_LIMIT, tu);
 	ahp->ah_globaltxtimeout = tu;
 	return AH_TRUE;
 }
 
 u_int
 ar5416GetGlobalTxTimeout(struct ath_hal *ah)
 {
 	return MS(OS_REG_READ(ah, AR_GTXTO), AR_GTXTO_TIMEOUT_LIMIT);
 }
 
+#define	HT_RC_2_MCS(_rc)	((_rc) & 0x0f)
+static const u_int8_t baDurationDelta[] = {
+	24,	//  0: BPSK
+	12,	//  1: QPSK 1/2
+	12,	//  2: QPSK 3/4
+	4,	//  3: 16-QAM 1/2
+	4,	//  4: 16-QAM 3/4
+	4,	//  5: 64-QAM 2/3
+	4,	//  6: 64-QAM 3/4
+	4,	//  7: 64-QAM 5/6
+	24,	//  8: BPSK
+	12,	//  9: QPSK 1/2
+	12,	// 10: QPSK 3/4
+	4,	// 11: 16-QAM 1/2
+	4,	// 12: 16-QAM 3/4
+	4,	// 13: 64-QAM 2/3
+	4,	// 14: 64-QAM 3/4
+	4,	// 15: 64-QAM 5/6
+};
+
 void
 ar5416Set11nRateScenario(struct ath_hal *ah, struct ath_desc *ds,
         u_int durUpdateEn, u_int rtsctsRate,
 	HAL_11N_RATE_SERIES series[], u_int nseries, u_int flags)
 {
 	struct ar5416_desc *ads = AR5416DESC(ds);
 	uint32_t ds_ctl0;
 
 	HALASSERT(nseries == 4);
 	(void)nseries;
 
 	/*
 	 * XXX since the upper layers doesn't know the current chainmask
 	 * XXX setup, just override its decisions here.
 	 * XXX The upper layers need to be taught this!
 	 */
 	if (series[0].Tries != 0)
 		series[0].ChSel = AH5416(ah)->ah_tx_chainmask;
 	if (series[1].Tries != 0)
 		series[1].ChSel = AH5416(ah)->ah_tx_chainmask;
 	if (series[2].Tries != 0)
 		series[2].ChSel = AH5416(ah)->ah_tx_chainmask;
 	if (series[3].Tries != 0)
 		series[3].ChSel = AH5416(ah)->ah_tx_chainmask;
 
 	/*
 	 * Only one of RTS and CTS enable must be set.
 	 * If a frame has both set, just do RTS protection -
 	 * that's enough to satisfy legacy protection.
 	 */
 	if (flags & (HAL_TXDESC_RTSENA | HAL_TXDESC_CTSENA)) {
 		ds_ctl0 = ads->ds_ctl0;
 
 		if (flags & HAL_TXDESC_RTSENA) {
 			ds_ctl0 &= ~AR_CTSEnable;
 			ds_ctl0 |= AR_RTSEnable;
 		} else {
 			ds_ctl0 &= ~AR_RTSEnable;
 			ds_ctl0 |= AR_CTSEnable;
 		}
 
 		ads->ds_ctl0 = ds_ctl0;
 	} else {
 		ads->ds_ctl0 =
 		    (ads->ds_ctl0 & ~(AR_RTSEnable | AR_CTSEnable));
 	}
 
 	ads->ds_ctl2 = set11nTries(series, 0)
 		     | set11nTries(series, 1)
 		     | set11nTries(series, 2)
 		     | set11nTries(series, 3)
 		     | (durUpdateEn ? AR_DurUpdateEn : 0);
 
 	ads->ds_ctl3 = set11nRate(series, 0)
 		     | set11nRate(series, 1)
 		     | set11nRate(series, 2)
 		     | set11nRate(series, 3);
 
 	ads->ds_ctl4 = set11nPktDurRTSCTS(series, 0)
 		     | set11nPktDurRTSCTS(series, 1);
 
 	ads->ds_ctl5 = set11nPktDurRTSCTS(series, 2)
 		     | set11nPktDurRTSCTS(series, 3);
 
 	ads->ds_ctl7 = set11nRateFlags(series, 0)
 		     | set11nRateFlags(series, 1)
 		     | set11nRateFlags(series, 2)
 		     | set11nRateFlags(series, 3)
 		     | SM(rtsctsRate, AR_RTSCTSRate);
 }
 
+/*
+ * Note: this should be called before calling ar5416Set11nBurstDuration()
+ * (if it is indeed called) in order to ensure that the burst duration
+ * is correctly updated with the BA delta workaround.
+ */
 void
 ar5416Set11nAggrFirst(struct ath_hal *ah, struct ath_desc *ds, u_int aggrLen,
     u_int numDelims)
 {
 	struct ar5416_desc *ads = AR5416DESC(ds);
+	uint32_t flags;
+	uint32_t burstDur;
+	uint8_t rate;
 
 	ads->ds_ctl1 |= (AR_IsAggr | AR_MoreAggr);
 
 	ads->ds_ctl6 &= ~(AR_AggrLen | AR_PadDelim);
 	ads->ds_ctl6 |= SM(aggrLen, AR_AggrLen);
 	ads->ds_ctl6 |= SM(numDelims, AR_PadDelim);
+
+	if (! AR_SREV_MERLIN_10_OR_LATER(ah)) {
+		/*
+		 * XXX It'd be nice if I were passed in the rate scenario
+		 * at this point..
+		 */
+		rate = MS(ads->ds_ctl3, AR_XmitRate0);
+		flags = ads->ds_ctl0 & (AR_CTSEnable | AR_RTSEnable);
+		/*
+		 * WAR - MAC assumes normal ACK time instead of
+		 * block ACK while computing packet duration.
+		 * Add this delta to the burst duration in the descriptor.
+		 */
+		if (flags && (ads->ds_ctl1 & AR_IsAggr)) {
+			burstDur = baDurationDelta[HT_RC_2_MCS(rate)];
+			ads->ds_ctl2 &= ~(AR_BurstDur);
+			ads->ds_ctl2 |= SM(burstDur, AR_BurstDur);
+		}
+	}
 }
 
 void
 ar5416Set11nAggrMiddle(struct ath_hal *ah, struct ath_desc *ds, u_int numDelims)
 {
 	struct ar5416_desc *ads = AR5416DESC(ds);
 	uint32_t *ds_txstatus = AR5416_DS_TXSTATUS(ah,ads);
 
 	ads->ds_ctl1 |= (AR_IsAggr | AR_MoreAggr);
 
 	ads->ds_ctl6 &= ~AR_PadDelim;
 	ads->ds_ctl6 |= SM(numDelims, AR_PadDelim);
 	ads->ds_ctl6 &= ~AR_AggrLen;
 
 	/*
 	 * Clear the TxDone status here, may need to change
 	 * func name to reflect this
 	 */
 	ds_txstatus[9] &= ~AR_TxDone;
 }
 
 void
 ar5416Set11nAggrLast(struct ath_hal *ah, struct ath_desc *ds)
 {
 	struct ar5416_desc *ads = AR5416DESC(ds);
 
 	ads->ds_ctl1 |= AR_IsAggr;
 	ads->ds_ctl1 &= ~AR_MoreAggr;
 	ads->ds_ctl6 &= ~AR_PadDelim;
 }
 
 void
 ar5416Clr11nAggr(struct ath_hal *ah, struct ath_desc *ds)
 {
 	struct ar5416_desc *ads = AR5416DESC(ds);
 
 	ads->ds_ctl1 &= (~AR_IsAggr & ~AR_MoreAggr);
 	ads->ds_ctl6 &= ~AR_PadDelim;
 	ads->ds_ctl6 &= ~AR_AggrLen;
 }
 
+/*
+ * Program the burst duration, with the included BA delta if it's
+ * applicable.
+ */
 void
 ar5416Set11nBurstDuration(struct ath_hal *ah, struct ath_desc *ds,
                                                   u_int burstDuration)
 {
 	struct ar5416_desc *ads = AR5416DESC(ds);
+	uint32_t burstDur = 0;
+	uint8_t rate;
 
+	if (! AR_SREV_MERLIN_10_OR_LATER(ah)) {
+		/*
+		 * XXX It'd be nice if I were passed in the rate scenario
+		 * at this point; for now read series 0's rate from ds_ctl3.
+		 */
+		rate = MS(ads->ds_ctl3, AR_XmitRate0);
+		/*
+		 * WAR - MAC assumes normal ACK time instead of
+		 * block ACK while computing packet duration.
+		 * Add this delta to the burst duration in the descriptor.
+		 */
+		if (ads->ds_ctl1 & AR_IsAggr) {
+			burstDur = baDurationDelta[HT_RC_2_MCS(rate)];
+		}
+	}
+
 	ads->ds_ctl2 &= ~AR_BurstDur;
-	ads->ds_ctl2 |= SM(burstDuration, AR_BurstDur);
+	ads->ds_ctl2 |= SM(burstDur + burstDuration, AR_BurstDur);
 }
 
 /*
  * Retrieve the rate table from the given TX completion descriptor
  */
 HAL_BOOL
 ar5416GetTxCompletionRates(struct ath_hal *ah, const struct ath_desc *ds0, int *rates, int *tries)
 {
 	const struct ar5416_desc *ads = AR5416DESC_CONST(ds0);
 
 	rates[0] = MS(ads->ds_ctl3, AR_XmitRate0);
 	rates[1] = MS(ads->ds_ctl3, AR_XmitRate1);
 	rates[2] = MS(ads->ds_ctl3, AR_XmitRate2);
 	rates[3] = MS(ads->ds_ctl3, AR_XmitRate3);
 
 	tries[0] = MS(ads->ds_ctl2, AR_XmitDataTries0);
 	tries[1] = MS(ads->ds_ctl2, AR_XmitDataTries1);
 	tries[2] = MS(ads->ds_ctl2, AR_XmitDataTries2);
 	tries[3] = MS(ads->ds_ctl2, AR_XmitDataTries3);
 
 	return AH_TRUE;
 }
 
 
 /*
  * TX queue management routines - AR5416 and later chipsets
  */
 
 /*
  * Allocate and initialize a tx DCU/QCU combination.
  */
 int
 ar5416SetupTxQueue(struct ath_hal *ah, HAL_TX_QUEUE type,
 	const HAL_TXQ_INFO *qInfo)
 {
 	struct ath_hal_5212 *ahp = AH5212(ah);
 	HAL_TX_QUEUE_INFO *qi;
 	HAL_CAPABILITIES *pCap = &AH_PRIVATE(ah)->ah_caps;
 	int q, defqflags;
 
 	/* by default enable OK+ERR+DESC+URN interrupts */
 	defqflags = HAL_TXQ_TXOKINT_ENABLE
 		  | HAL_TXQ_TXERRINT_ENABLE
 		  | HAL_TXQ_TXDESCINT_ENABLE
 		  | HAL_TXQ_TXURNINT_ENABLE;
 	/* XXX move queue assignment to driver */
 	switch (type) {
 	case HAL_TX_QUEUE_BEACON:
 		q = pCap->halTotalQueues-1;	/* highest priority */
 		defqflags |= HAL_TXQ_DBA_GATED
 		       | HAL_TXQ_CBR_DIS_QEMPTY
 		       | HAL_TXQ_ARB_LOCKOUT_GLOBAL
 		       | HAL_TXQ_BACKOFF_DISABLE;
 		break;
 	case HAL_TX_QUEUE_CAB:
 		q = pCap->halTotalQueues-2;	/* next highest priority */
 		defqflags |= HAL_TXQ_DBA_GATED
 		       | HAL_TXQ_CBR_DIS_QEMPTY
 		       | HAL_TXQ_CBR_DIS_BEMPTY
 		       | HAL_TXQ_ARB_LOCKOUT_GLOBAL
 		       | HAL_TXQ_BACKOFF_DISABLE;
 		break;
 	case HAL_TX_QUEUE_PSPOLL:
 		q = 1;				/* lowest priority */
 		defqflags |= HAL_TXQ_DBA_GATED
 		       | HAL_TXQ_CBR_DIS_QEMPTY
 		       | HAL_TXQ_CBR_DIS_BEMPTY
 		       | HAL_TXQ_ARB_LOCKOUT_GLOBAL
 		       | HAL_TXQ_BACKOFF_DISABLE;
 		break;
 	case HAL_TX_QUEUE_UAPSD:
 		q = pCap->halTotalQueues-3;	/* nextest highest priority */
 		if (ahp->ah_txq[q].tqi_type != HAL_TX_QUEUE_INACTIVE) {
 			HALDEBUG(ah, HAL_DEBUG_ANY,
 			    "%s: no available UAPSD tx queue\n", __func__);
 			return -1;
 		}
 		break;
 	case HAL_TX_QUEUE_DATA:
 		for (q = 0; q < pCap->halTotalQueues; q++)
 			if (ahp->ah_txq[q].tqi_type == HAL_TX_QUEUE_INACTIVE)
 				break;
 		if (q == pCap->halTotalQueues) {
 			HALDEBUG(ah, HAL_DEBUG_ANY,
 			    "%s: no available tx queue\n", __func__);
 			return -1;
 		}
 		break;
 	default:
 		HALDEBUG(ah, HAL_DEBUG_ANY,
 		    "%s: bad tx queue type %u\n", __func__, type);
 		return -1;
 	}
 
 	HALDEBUG(ah, HAL_DEBUG_TXQUEUE, "%s: queue %u\n", __func__, q);
 
 	qi = &ahp->ah_txq[q];
 	if (qi->tqi_type != HAL_TX_QUEUE_INACTIVE) {
 		HALDEBUG(ah, HAL_DEBUG_ANY, "%s: tx queue %u already active\n",
 		    __func__, q);
 		return -1;
 	}
 	OS_MEMZERO(qi, sizeof(HAL_TX_QUEUE_INFO));
 	qi->tqi_type = type;
 	if (qInfo == AH_NULL) {
 		qi->tqi_qflags = defqflags;
 		qi->tqi_aifs = INIT_AIFS;
 		qi->tqi_cwmin = HAL_TXQ_USEDEFAULT;	/* NB: do at reset */
 		qi->tqi_cwmax = INIT_CWMAX;
 		qi->tqi_shretry = INIT_SH_RETRY;
 		qi->tqi_lgretry = INIT_LG_RETRY;
 		qi->tqi_physCompBuf = 0;
 	} else {
 		qi->tqi_physCompBuf = qInfo->tqi_compBuf;
 		(void) ar5212SetTxQueueProps(ah, q, qInfo);
 	}
 	/* NB: must be followed by ar5212ResetTxQueue */
 	return q;
 }
 
 /*
  * Update the h/w interrupt registers to reflect a tx q's configuration.
  */
 static void
 setTxQInterrupts(struct ath_hal *ah, HAL_TX_QUEUE_INFO *qi)
 {
 	struct ath_hal_5212 *ahp = AH5212(ah);
 
 	HALDEBUG(ah, HAL_DEBUG_TXQUEUE,
 	    "%s: tx ok 0x%x err 0x%x desc 0x%x eol 0x%x urn 0x%x\n", __func__,
 	    ahp->ah_txOkInterruptMask, ahp->ah_txErrInterruptMask,
 	    ahp->ah_txDescInterruptMask, ahp->ah_txEolInterruptMask,
 	    ahp->ah_txUrnInterruptMask);
 
 	OS_REG_WRITE(ah, AR_IMR_S0,
 		  SM(ahp->ah_txOkInterruptMask, AR_IMR_S0_QCU_TXOK)
 		| SM(ahp->ah_txDescInterruptMask, AR_IMR_S0_QCU_TXDESC)
 	);
 	OS_REG_WRITE(ah, AR_IMR_S1,
 		  SM(ahp->ah_txErrInterruptMask, AR_IMR_S1_QCU_TXERR)
 		| SM(ahp->ah_txEolInterruptMask, AR_IMR_S1_QCU_TXEOL)
 	);
 	OS_REG_RMW_FIELD(ah, AR_IMR_S2,
 		AR_IMR_S2_QCU_TXURN, ahp->ah_txUrnInterruptMask);
 }
 
 /*
  * Set the retry, aifs, cwmin/max, readyTime regs for specified queue
  * Assumes:
  *  phwChannel has been set to point to the current channel
  *
  * Programs the per-queue QCU/DCU registers for tx queue q from the
  * cached HAL_TX_QUEUE_INFO, then refreshes the cached per-category tx
  * interrupt masks and pushes them to the hardware through
  * setTxQInterrupts().
  *
  * Returns AH_FALSE when q is not below halTotalQueues.  Returns AH_TRUE
  * after programming the queue, and also (without writing any register)
  * when the queue is marked HAL_TX_QUEUE_INACTIVE.
  */
 /* NB: shifts left by 10, i.e. multiplies by 1024 */
 #define	TU_TO_USEC(_tu)		((_tu) << 10)
 HAL_BOOL
 ar5416ResetTxQueue(struct ath_hal *ah, u_int q)
 {
 	struct ath_hal_5212 *ahp = AH5212(ah);
 	HAL_CAPABILITIES *pCap = &AH_PRIVATE(ah)->ah_caps;
 	const struct ieee80211_channel *chan = AH_PRIVATE(ah)->ah_curchan;
 	HAL_TX_QUEUE_INFO *qi;
 	uint32_t cwMin, chanCwMin, qmisc, dmisc;
 
 	if (q >= pCap->halTotalQueues) {
 		HALDEBUG(ah, HAL_DEBUG_ANY, "%s: invalid queue num %u\n",
 		    __func__, q);
 		return AH_FALSE;
 	}
 	qi = &ahp->ah_txq[q];
 	if (qi->tqi_type == HAL_TX_QUEUE_INACTIVE) {
 		HALDEBUG(ah, HAL_DEBUG_TXQUEUE, "%s: inactive queue %u\n",
 		    __func__, q);
 		return AH_TRUE;		/* XXX??? */
 	}
 
 	HALDEBUG(ah, HAL_DEBUG_TXQUEUE, "%s: reset queue %u\n", __func__, q);
 
 	if (qi->tqi_cwmin == HAL_TXQ_USEDEFAULT) {
 		/*
 		 * Select cwmin according to channel type.
 		 * NB: chan can be NULL during attach
 		 */
 		if (chan && IEEE80211_IS_CHAN_B(chan))
 			chanCwMin = INIT_CWMIN_11B;
 		else
 			chanCwMin = INIT_CWMIN;
 		/* make sure that the CWmin is of the form (2^n - 1) */
 		for (cwMin = 1; cwMin < chanCwMin; cwMin = (cwMin << 1) | 1)
 			;
 	} else
 		cwMin = qi->tqi_cwmin;
 
 	/* set cwMin/Max and AIFS values */
 	OS_REG_WRITE(ah, AR_DLCL_IFS(q),
 		  SM(cwMin, AR_D_LCL_IFS_CWMIN)
 		| SM(qi->tqi_cwmax, AR_D_LCL_IFS_CWMAX)
 		| SM(qi->tqi_aifs, AR_D_LCL_IFS_AIFS));
 
 	/* Set retry limit values */
 	OS_REG_WRITE(ah, AR_DRETRY_LIMIT(q), 
 		   SM(INIT_SSH_RETRY, AR_D_RETRY_LIMIT_STA_SH)
 		 | SM(INIT_SLG_RETRY, AR_D_RETRY_LIMIT_STA_LG)
 		 | SM(qi->tqi_lgretry, AR_D_RETRY_LIMIT_FR_LG)
 		 | SM(qi->tqi_shretry, AR_D_RETRY_LIMIT_FR_SH)
 	);
 
 	/*
 	 * qmisc and dmisc are accumulated below and only written to
 	 * AR_QMISC(q)/AR_DMISC(q) once the type-dependent bits are in.
 	 */
 	/* NB: always enable early termination on the QCU */
 	qmisc = AR_Q_MISC_DCU_EARLY_TERM_REQ
 	      | SM(AR_Q_MISC_FSP_ASAP, AR_Q_MISC_FSP);
 
 	/* NB: always enable DCU to wait for next fragment from QCU */
 	dmisc = AR_D_MISC_FRAG_WAIT_EN;
 
 	/* Enable exponential backoff window */
 	dmisc |= AR_D_MISC_BKOFF_PERSISTENCE;
 
 	/* 
 	 * The chip reset default is to use a DCU backoff threshold of 0x2.
 	 * Restore this when programming the DCU MISC register.
 	 */
 	dmisc |= 0x2;
 
 	/* multiqueue support */
 	if (qi->tqi_cbrPeriod) {
 		OS_REG_WRITE(ah, AR_QCBRCFG(q), 
 			  SM(qi->tqi_cbrPeriod,AR_Q_CBRCFG_CBR_INTERVAL)
 			| SM(qi->tqi_cbrOverflowLimit, AR_Q_CBRCFG_CBR_OVF_THRESH));
 		qmisc = (qmisc &~ AR_Q_MISC_FSP) | AR_Q_MISC_FSP_CBR;
 		if (qi->tqi_cbrOverflowLimit)
 			qmisc |= AR_Q_MISC_CBR_EXP_CNTR_LIMIT;
 	}
 
 	/* NB: CAB queues get their ready time set in the type switch below */
 	if (qi->tqi_readyTime && (qi->tqi_type != HAL_TX_QUEUE_CAB)) {
 		OS_REG_WRITE(ah, AR_QRDYTIMECFG(q),
 			  SM(qi->tqi_readyTime, AR_Q_RDYTIMECFG_INT)
 			| AR_Q_RDYTIMECFG_ENA);
 	}
 	
 	/* burst duration; the enable bit is only set for a non-zero value */
 	OS_REG_WRITE(ah, AR_DCHNTIME(q),
 		  SM(qi->tqi_burstTime, AR_D_CHNTIME_DUR)
 		| (qi->tqi_burstTime ? AR_D_CHNTIME_EN : 0));
 
 	if (qi->tqi_readyTime &&
 	    (qi->tqi_qflags & HAL_TXQ_RDYTIME_EXP_POLICY_ENABLE))
 		qmisc |= AR_Q_MISC_RDYTIME_EXP_POLICY;
 	if (qi->tqi_qflags & HAL_TXQ_DBA_GATED)
 		qmisc = (qmisc &~ AR_Q_MISC_FSP) | AR_Q_MISC_FSP_DBA_GATED;
 	if (MS(qmisc, AR_Q_MISC_FSP) != AR_Q_MISC_FSP_ASAP) {
 		/*
 		 * These are meaningful only when not scheduled asap.
 		 */
 		if (qi->tqi_qflags & HAL_TXQ_CBR_DIS_BEMPTY)
 			qmisc |= AR_Q_MISC_CBR_INCR_DIS0;
 		else
 			qmisc &= ~AR_Q_MISC_CBR_INCR_DIS0;
 		if (qi->tqi_qflags & HAL_TXQ_CBR_DIS_QEMPTY)
 			qmisc |= AR_Q_MISC_CBR_INCR_DIS1;
 		else
 			qmisc &= ~AR_Q_MISC_CBR_INCR_DIS1;
 	}
 
 	/* translate the remaining queue flags into DCU misc bits */
 	if (qi->tqi_qflags & HAL_TXQ_BACKOFF_DISABLE)
 		dmisc |= AR_D_MISC_POST_FR_BKOFF_DIS;
 	if (qi->tqi_qflags & HAL_TXQ_FRAG_BURST_BACKOFF_ENABLE)
 		dmisc |= AR_D_MISC_FRAG_BKOFF_EN;
 	if (qi->tqi_qflags & HAL_TXQ_ARB_LOCKOUT_GLOBAL)
 		dmisc |= SM(AR_D_MISC_ARB_LOCKOUT_CNTRL_GLOBAL,
 			    AR_D_MISC_ARB_LOCKOUT_CNTRL);
 	else if (qi->tqi_qflags & HAL_TXQ_ARB_LOCKOUT_INTRA)
 		dmisc |= SM(AR_D_MISC_ARB_LOCKOUT_CNTRL_INTRA_FR,
 			    AR_D_MISC_ARB_LOCKOUT_CNTRL);
 	if (qi->tqi_qflags & HAL_TXQ_IGNORE_VIRTCOL)
 		dmisc |= SM(AR_D_MISC_VIR_COL_HANDLING_IGNORE,
 			    AR_D_MISC_VIR_COL_HANDLING);
 	if (qi->tqi_qflags & HAL_TXQ_SEQNUM_INC_DIS)
 		dmisc |= AR_D_MISC_SEQ_NUM_INCR_DIS;
 
 	/*
 	 * Fill in type-dependent bits.  Most of this can be
 	 * removed by specifying the queue parameters in the
 	 * driver; it's here for backwards compatibility.
 	 */
 	switch (qi->tqi_type) {
 	case HAL_TX_QUEUE_BEACON:		/* beacon frames */
 		qmisc |= AR_Q_MISC_FSP_DBA_GATED
 		      |  AR_Q_MISC_BEACON_USE
 		      |  AR_Q_MISC_CBR_INCR_DIS1;
 
 		dmisc |= SM(AR_D_MISC_ARB_LOCKOUT_CNTRL_GLOBAL,
 			    AR_D_MISC_ARB_LOCKOUT_CNTRL)
 		      |  AR_D_MISC_BEACON_USE
 		      |  AR_D_MISC_POST_FR_BKOFF_DIS;
 		break;
 	case HAL_TX_QUEUE_CAB:			/* CAB  frames */
 		/* 
 		 * No longer Enable AR_Q_MISC_RDYTIME_EXP_POLICY,
 		 * There is an issue with the CAB Queue
 		 * not properly refreshing the Tx descriptor if
 		 * the TXE clear setting is used.
 		 */
 		qmisc |= AR_Q_MISC_FSP_DBA_GATED
 		      |  AR_Q_MISC_CBR_INCR_DIS1
 		      |  AR_Q_MISC_CBR_INCR_DIS0;
 		HALDEBUG(ah, HAL_DEBUG_TXQUEUE, "%s: CAB: tqi_readyTime = %d\n",
 		    __func__, qi->tqi_readyTime);
 		if (qi->tqi_readyTime) {
 			HALDEBUG(ah, HAL_DEBUG_TXQUEUE,
 			    "%s: using tqi_readyTime\n", __func__);
 			OS_REG_WRITE(ah, AR_QRDYTIMECFG(q),
 			    SM(qi->tqi_readyTime, AR_Q_RDYTIMECFG_INT) |
 			    AR_Q_RDYTIMECFG_ENA);
 		} else {
 			int value;
 			/*
 			 * NB: don't set default ready time if driver
 			 * has explicitly specified something.  This is
 			 * here solely for backwards compatibility.
 			 */
 			/*
 			 * XXX for now, hard-code a CAB interval of 70%
 			 * XXX of the total beacon interval.
 			 *
 			 * XXX This keeps Merlin and later based MACs
 			 * XXX quite a bit happier (stops stuck beacons,
 			 * XXX which I gather is because of such a long
 			 * XXX cabq time.)
 			 */
 			value = (ahp->ah_beaconInterval * 70 / 100)
 				- (ah->ah_config.ah_sw_beacon_response_time
 				+ ah->ah_config.ah_dma_beacon_response_time)
 				- ah->ah_config.ah_additional_swba_backoff;
 			/*
 			 * XXX Ensure it isn't too low - nothing lower
 			 * XXX than 10 TU
 			 */
 			if (value < 10)
 				value = 10;
 			/*
 			 * NOTE(review): the message below labels the number
 			 * "uS", but it prints value before TU_TO_USEC() is
 			 * applied for the register write.
 			 */
 			HALDEBUG(ah, HAL_DEBUG_TXQUEUE,
 			    "%s: defaulting to rdytime = %d uS\n",
 			    __func__, value);
 			OS_REG_WRITE(ah, AR_QRDYTIMECFG(q),
 			    SM(TU_TO_USEC(value), AR_Q_RDYTIMECFG_INT) |
 			    AR_Q_RDYTIMECFG_ENA);
 		}
 		dmisc |= SM(AR_D_MISC_ARB_LOCKOUT_CNTRL_GLOBAL,
 			    AR_D_MISC_ARB_LOCKOUT_CNTRL);
 		break;
 	case HAL_TX_QUEUE_PSPOLL:
 		qmisc |= AR_Q_MISC_CBR_INCR_DIS1;
 		break;
 	case HAL_TX_QUEUE_UAPSD:
 		dmisc |= AR_D_MISC_POST_FR_BKOFF_DIS;
 		break;
 	default:			/* NB: silence compiler */
 		break;
 	}
 
 	/* commit the accumulated QCU/DCU misc settings */
 	OS_REG_WRITE(ah, AR_QMISC(q), qmisc);
 	OS_REG_WRITE(ah, AR_DMISC(q), dmisc);
 
 	/* Setup compression scratchpad buffer */
 	/* 
 	 * XXX: calling this asynchronously to queue operation can
 	 *      cause unexpected behavior!!!
 	 */
 	if (qi->tqi_physCompBuf) {
 		HALASSERT(qi->tqi_type == HAL_TX_QUEUE_DATA ||
 			  qi->tqi_type == HAL_TX_QUEUE_UAPSD);
 		OS_REG_WRITE(ah, AR_Q_CBBS, (80 + 2*q));
 		OS_REG_WRITE(ah, AR_Q_CBBA, qi->tqi_physCompBuf);
 		OS_REG_WRITE(ah, AR_Q_CBC,  HAL_COMP_BUF_MAX_SIZE/1024);
 		/* read-modify-write so only the compression enable is added */
 		OS_REG_WRITE(ah, AR_Q0_MISC + 4*q,
 			     OS_REG_READ(ah, AR_Q0_MISC + 4*q)
 			     | AR_Q_MISC_QCU_COMP_EN);
 	}
 	
 	/*
 	 * Always update the secondary interrupt mask registers - this
 	 * could be a new queue getting enabled in a running system or
 	 * hw getting re-initialized during a reset!
 	 *
 	 * Since we don't differentiate between tx interrupts corresponding
 	 * to individual queues - secondary tx mask regs are always unmasked;
 	 * tx interrupts are enabled/disabled for all queues collectively
 	 * using the primary mask reg
 	 */
 	/* each cached mask holds one bit per queue; set or clear bit q */
 	if (qi->tqi_qflags & HAL_TXQ_TXOKINT_ENABLE)
 		ahp->ah_txOkInterruptMask |= 1 << q;
 	else
 		ahp->ah_txOkInterruptMask &= ~(1 << q);
 	if (qi->tqi_qflags & HAL_TXQ_TXERRINT_ENABLE)
 		ahp->ah_txErrInterruptMask |= 1 << q;
 	else
 		ahp->ah_txErrInterruptMask &= ~(1 << q);
 	if (qi->tqi_qflags & HAL_TXQ_TXDESCINT_ENABLE)
 		ahp->ah_txDescInterruptMask |= 1 << q;
 	else
 		ahp->ah_txDescInterruptMask &= ~(1 << q);
 	if (qi->tqi_qflags & HAL_TXQ_TXEOLINT_ENABLE)
 		ahp->ah_txEolInterruptMask |= 1 << q;
 	else
 		ahp->ah_txEolInterruptMask &= ~(1 << q);
 	if (qi->tqi_qflags & HAL_TXQ_TXURNINT_ENABLE)
 		ahp->ah_txUrnInterruptMask |= 1 << q;
 	else
 		ahp->ah_txUrnInterruptMask &= ~(1 << q);
 	setTxQInterrupts(ah, qi);
 
 	return AH_TRUE;
 }
 #undef	TU_TO_USEC
Index: user/attilio/vmobj-rwlock/sys/dev/mxge/if_mxge.c
===================================================================
--- user/attilio/vmobj-rwlock/sys/dev/mxge/if_mxge.c	(revision 247191)
+++ user/attilio/vmobj-rwlock/sys/dev/mxge/if_mxge.c	(revision 247192)
@@ -1,5046 +1,5056 @@
 /******************************************************************************
 
-Copyright (c) 2006-2009, Myricom Inc.
+Copyright (c) 2006-2013, Myricom Inc.
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
 
  1. Redistributions of source code must retain the above copyright notice,
     this list of conditions and the following disclaimer.
 
  2. Neither the name of the Myricom Inc, nor the names of its
     contributors may be used to endorse or promote products derived from
     this software without specific prior written permission.
 
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
 
 ***************************************************************************/
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/linker.h>
 #include <sys/firmware.h>
 #include <sys/endian.h>
 #include <sys/sockio.h>
 #include <sys/mbuf.h>
 #include <sys/malloc.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/module.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <sys/sx.h>
 #include <sys/taskqueue.h>
 
 #include <net/if.h>
 #include <net/if_arp.h>
 #include <net/ethernet.h>
 #include <net/if_dl.h>
 #include <net/if_media.h>
 
 #include <net/bpf.h>
 
 #include <net/if_types.h>
 #include <net/if_vlan_var.h>
 #include <net/zlib.h>
 
 #include <netinet/in_systm.h>
 #include <netinet/in.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_lro.h>
 #include <netinet6/ip6_var.h>
 
 #include <machine/bus.h>
 #include <machine/in_cksum.h>
 #include <machine/resource.h>
 #include <sys/bus.h>
 #include <sys/rman.h>
 #include <sys/smp.h>
 
 #include <dev/pci/pcireg.h>
 #include <dev/pci/pcivar.h>
 #include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */
 
 #include <vm/vm.h>		/* for pmap_mapdev() */
 #include <vm/pmap.h>
 
 #if defined(__i386) || defined(__amd64)
 #include <machine/specialreg.h>
 #endif
 
 #include <dev/mxge/mxge_mcp.h>
 #include <dev/mxge/mcp_gen_header.h>
 /*#define MXGE_FAKE_IFP*/
 #include <dev/mxge/if_mxge_var.h>
 #ifdef IFNET_BUF_RING
 #include <sys/buf_ring.h>
 #endif
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 /* tunable params */
 /* when zero, mxge_enable_nvidia_ecrc() returns without doing anything */
 static int mxge_nvidia_ecrc_enable = 1;
 /*
  * non-zero skips the firmware probe in mxge_select_firmware():
  * 1 selects the aligned firmware, any other non-zero value the
  * unaligned one
  */
 static int mxge_force_firmware = 0;
 static int mxge_intr_coal_delay = 30;
 static int mxge_deassert_wait = 1;
 static int mxge_flow_control = 1;
 /* non-zero enables additional informational device_printf() output */
 static int mxge_verbose = 0;
 static int mxge_ticks;
 static int mxge_max_slices = 1;
 static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
 static int mxge_always_promisc = 0;
 static int mxge_initial_mtu = ETHERMTU_JUMBO;
 static int mxge_throttle = 0;
 /* firmware image names handed to firmware_get() through sc->fw_name */
 static char *mxge_fw_unaligned = "mxge_ethp_z8e";
 static char *mxge_fw_aligned = "mxge_eth_z8e";
 static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
 static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
 
 /* device interface entry points referenced by mxge_methods below */
 static int mxge_probe(device_t dev);
 static int mxge_attach(device_t dev);
 static int mxge_detach(device_t dev);
 static int mxge_shutdown(device_t dev);
 static void mxge_intr(void *arg);
 
 /* maps the device_probe/attach/detach/shutdown methods to this driver */
 static device_method_t mxge_methods[] =
 {
   /* Device interface */
   DEVMETHOD(device_probe, mxge_probe),
   DEVMETHOD(device_attach, mxge_attach),
   DEVMETHOD(device_detach, mxge_detach),
   DEVMETHOD(device_shutdown, mxge_shutdown),
 
   DEVMETHOD_END
 };
 
 /* driver name, method table and per-device softc size */
 static driver_t mxge_driver =
 {
   "mxge",
   mxge_methods,
   sizeof(mxge_softc_t),
 };
 
 static devclass_t mxge_devclass;
 
 /* Declare ourselves to be a child of the PCI bus.*/
 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
 /* the driver also depends on the firmware and zlib modules */
 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
 MODULE_DEPEND(mxge, zlib, 1, 1, 1);
 
 /* forward declarations for helpers used before their definitions */
 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
 static int mxge_close(mxge_softc_t *sc, int down);
 static int mxge_open(mxge_softc_t *sc);
 static void mxge_tick(void *arg);
 
 /*
  * Probe method: claim the device when it is a Myricom Z8E or Z8E_9 part
  * and set a description chosen from its PCI revision id.
  * Returns 0 when the device is claimed, ENXIO otherwise.
  */
 static int
 mxge_probe(device_t dev)
 {
 	int revision;
 
 	/* anything that is not one of our vendor/device pairs is rejected */
 	if (pci_get_vendor(dev) != MXGE_PCI_VENDOR_MYRICOM)
 		return ENXIO;
 	if (pci_get_device(dev) != MXGE_PCI_DEVICE_Z8E &&
 	    pci_get_device(dev) != MXGE_PCI_DEVICE_Z8E_9)
 		return ENXIO;
 
 	revision = pci_get_revid(dev);
 	if (revision == MXGE_PCI_REV_Z8E) {
 		device_set_desc(dev, "Myri10G-PCIE-8A");
 	} else if (revision == MXGE_PCI_REV_Z8ES) {
 		device_set_desc(dev, "Myri10G-PCIE-8B");
 	} else {
 		/* unknown revision: still claimed, but flagged in the log */
 		device_set_desc(dev, "Myri10G-PCIE-8??");
 		device_printf(dev, "Unrecognized rev %d NIC\n", revision);
 	}
 	return 0;
 }
 
 /*
  * On i386/amd64, switch the mapping of the NIC SRAM to write-combining
  * and record the outcome in sc->wc (1 on success, 0 on failure).
  * Compiles to an empty function on other architectures.
  */
 static void
 mxge_enable_wc(mxge_softc_t *sc)
 {
 #if defined(__i386) || defined(__amd64)
 	vm_offset_t sram_len;
 	int rc;
 
 	/* assume success; cleared again below if the change is refused */
 	sc->wc = 1;
 	sram_len = rman_get_size(sc->mem_res);
 	rc = pmap_change_attr((vm_offset_t) sc->sram, sram_len,
 	    PAT_WRITE_COMBINING);
 	if (rc == 0)
 		return;
 
 	device_printf(sc->dev, "pmap_change_attr failed, %d\n", rc);
 	sc->wc = 0;
 #endif
 }
 
 
 /*
  * bus_dmamap_load() callback: on success store the bus address of the
  * first segment in the bus_addr_t that arg points to.  On error the
  * destination is left untouched.
  */
 static void
 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
 			 int error)
 {
 	bus_addr_t *bus_addr_out = (bus_addr_t *) arg;
 
 	if (error != 0)
 		return;
 	*bus_addr_out = segs->ds_addr;
 }
 
 /*
  * Create a DMA tag, allocate "bytes" of zeroed coherent memory in a
  * single segment and load it, filling in dma->dmat, dma->map,
  * dma->addr and dma->bus_addr.
  *
  * Returns 0 on success or the error from the failing bus_dma call;
  * anything set up before the failure is released again.
  */
 static int
 mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
 		   bus_size_t alignment)
 {
 	device_t dev = sc->dev;
 	bus_size_t boundary = 4096;
 	bus_size_t maxsegsize = 4096;
 	int err;
 
 	/*
 	 * Page-aligned requests larger than a page get one unrestricted
 	 * segment; everything else must not cross a 4KB boundary.
 	 */
 	if (bytes > 4096 && alignment == 4096) {
 		boundary = 0;
 		maxsegsize = bytes;
 	}
 
 	/* step 1: the tag describing the allocation */
 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
 				 alignment,		/* alignment */
 				 boundary,		/* boundary */
 				 BUS_SPACE_MAXADDR,	/* low */
 				 BUS_SPACE_MAXADDR,	/* high */
 				 NULL, NULL,		/* filter */
 				 bytes,			/* maxsize */
 				 1,			/* num segs */
 				 maxsegsize,		/* maxsegsize */
 				 BUS_DMA_COHERENT,	/* flags */
 				 NULL, NULL,		/* lock */
 				 &dma->dmat);		/* tag */
 	if (err != 0) {
 		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
 		return err;
 	}
 
 	/* step 2: the memory and its map */
 	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
 			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
 				| BUS_DMA_ZERO),  &dma->map);
 	if (err != 0) {
 		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
 		(void)bus_dma_tag_destroy(dma->dmat);
 		return err;
 	}
 
 	/* step 3: load it; the callback records the bus address */
 	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
 			      mxge_dmamap_callback,
 			      (void *)&dma->bus_addr, 0);
 	if (err != 0) {
 		device_printf(dev, "couldn't load map (err = %d)\n", err);
 		bus_dmamem_free(dma->dmat, dma->addr, dma->map);
 		(void)bus_dma_tag_destroy(dma->dmat);
 		return err;
 	}
 
 	return 0;
 }
 
 
 /*
  * Release everything mxge_dma_alloc() set up, in reverse order:
  * unload the map, free the memory, then destroy the tag.
  */
 static void
 mxge_dma_free(mxge_dma_t *dma)
 {
 	/* the map must be unloaded before its memory goes away */
 	bus_dmamap_unload(dma->dmat, dma->map);
 
 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
 
 	/* the tag goes last; its return value is deliberately ignored */
 	(void) bus_dma_tag_destroy(dma->dmat);
 }
 
 /*
  * The eeprom strings on the lanaiX have the format
  * SN=x\0
  * MAC=x:x:x:x:x:x\0
  * PC=text\0
  *
  * mxge_parse_strings() walks the NUL-separated strings in
  * sc->eeprom_strings and fills in sc->mac_addr, sc->mac_addr_string,
  * sc->product_code_string and sc->serial_number_string.
  *
  * Returns 0 when a MAC= entry was parsed; otherwise logs a message
  * and returns ENXIO.
  */
 
 static int
 mxge_parse_strings(mxge_softc_t *sc)
 {
 /* NB: the macro ignores its argument and advances "ptr" directly */
 #define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)
 
 	char *ptr, *limit;
-	int i, found_mac;
+	int i, found_mac, found_sn2;
 
 	ptr = sc->eeprom_strings;
 	limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
 	found_mac = 0;
+	found_sn2 = 0;
 	while (ptr < limit && *ptr != '\0') {
 		if (memcmp(ptr, "MAC=", 4) == 0) {
 			/*
 			 * Advance by 1 here and by 3 at the top of each
 			 * iteration, so the first strtoul() starts just
 			 * past "MAC=" and each later one skips "xx:".
 			 * NOTE(review): mac_addr_string is therefore set
 			 * one byte into the "MAC=" tag, not at the first
 			 * hex digit - confirm this is what users expect.
 			 */
 			ptr += 1;
 			sc->mac_addr_string = ptr;
 			for (i = 0; i < 6; i++) {
 				ptr += 3;
 				if ((ptr + 2) > limit)
 					goto abort;
 				sc->mac_addr[i] = strtoul(ptr, NULL, 16);
 				found_mac = 1;
 			}
 		} else if (memcmp(ptr, "PC=", 3) == 0) {
 			ptr += 3;
 			strncpy(sc->product_code_string, ptr,
 				sizeof (sc->product_code_string) - 1);
-		} else if (memcmp(ptr, "SN=", 3) == 0) {
+		} else if (!found_sn2 && (memcmp(ptr, "SN=", 3) == 0)) {
 			ptr += 3;
 			strncpy(sc->serial_number_string, ptr,
 				sizeof (sc->serial_number_string) - 1);
+		} else if (memcmp(ptr, "SN2=", 4) == 0) {
+			/* SN2 takes precedence over SN */
+			ptr += 4;
+			found_sn2 = 1;
+			strncpy(sc->serial_number_string, ptr,
+				sizeof (sc->serial_number_string) - 1);
 		}
 		/* skip to the byte after the current string's terminator */
 		MXGE_NEXT_STRING(ptr);
 	}
 
 	if (found_mac)
 		return 0;
 
  abort:
 	device_printf(sc->dev, "failed to parse eeprom_strings\n");
 
 	return ENXIO;
 }
 
 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
 /*
  * Set bit 0x40 in the 32-bit word at offset 0x178 of the config space
  * of the NIC's grandparent device (per the message logged at the end,
  * this enables ECRC on an upstream Nvidia bridge).
  *
  * Returns without changing anything when mxge_nvidia_ecrc_enable is
  * zero, when the grandparent is missing or its vendor id is not 0x10de,
  * when no config-space base address is known for its device id, or when
  * the mapped config space does not look like the expected device.
  */
 static void
 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
 {
 	uint32_t val;
 	unsigned long base, off;
 	char *va, *cfgptr;
 	device_t pdev, mcp55;
 	uint16_t vendor_id, device_id, word;
 	uintptr_t bus, slot, func, ivend, idev;
 	uint32_t *ptr32;
 
 
 	if (!mxge_nvidia_ecrc_enable)
 		return;
 
 	/* the bridge of interest is two levels above the NIC */
 	pdev = device_get_parent(device_get_parent(sc->dev));
 	if (pdev == NULL) {
 		device_printf(sc->dev, "could not find parent?\n");
 		return;
 	}
 	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
 	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
 
 	if (vendor_id != 0x10de)
 		return;
 
 	base = 0;
 
 	if (device_id == 0x005d) {
 		/* ck804, base address is magic */
 		base = 0xe0000000UL;
 	} else if (device_id >= 0x0374 && device_id <= 0x378) {
 		/* mcp55, base address stored in chipset */
 		mcp55 = pci_find_bsf(0, 0, 0);
 		if (mcp55 &&
 		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
 		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
 			word = pci_read_config(mcp55, 0x90, 2);
 			base = ((unsigned long)word & 0x7ffeU) << 25;
 		}
 	}
 	/* base stays 0 for any other device id */
 	if (!base)
 		return;
 
 	/* XXXX
 	   Test below is commented because it is believed that doing
 	   config read/write beyond 0xff will access the config space
 	   for the next larger function.  Uncomment this and remove 
 	   the hacky pmap_mapdev() way of accessing config space when
 	   FreeBSD grows support for extended pcie config space access
 	*/
 #if 0	
 	/* See if we can, by some miracle, access the extended
 	   config space */
 	val = pci_read_config(pdev, 0x178, 4);
 	if (val != 0xffffffff) {
 		val |= 0x40;
 		pci_write_config(pdev, 0x178, val, 4);
 		return;
 	}
 #endif
 	/* Rather than using normal pci config space writes, we must
 	 * map the Nvidia config space ourselves.  This is because on
 	 * opteron/nvidia class machine the 0xe0000000 mapping is
 	 * handled by the nvidia chipset, that means the internal PCI
 	 * device (the on-chip northbridge), or the amd-8131 bridge
 	 * and things behind them are not visible by this method.
 	 */
 
 	/* NB: the BUS_READ_IVAR() return values are not checked */
 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
 		      PCI_IVAR_BUS, &bus);
 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
 		      PCI_IVAR_SLOT, &slot);
 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
 		      PCI_IVAR_FUNCTION, &func);
 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
 		      PCI_IVAR_VENDOR, &ivend);
 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
 		      PCI_IVAR_DEVICE, &idev);
 					
 	/* 1MB of config space per bus, 4KB per (slot, function) pair */
 	off =  base
 		+ 0x00100000UL * (unsigned long)bus
 		+ 0x00001000UL * (unsigned long)(func
 						 + 8 * slot);
 
 	/* map it into the kernel */
 	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
 	
 
 	if (va == NULL) {
 		device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
 		return;
 	}
 	/* get a pointer to the config space mapped into the kernel */
 	cfgptr = va + (off & PAGE_MASK);
 
 	/* make sure that we can really access it */
 	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
 	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
 	if (! (vendor_id == ivend && device_id == idev)) {
 		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
 			      vendor_id, device_id);
 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
 		return;
 	}
 
 	ptr32 = (uint32_t*)(cfgptr + 0x178);
 	val = *ptr32;
 
 	/* all-ones is treated as "register not reachable" */
 	if (val == 0xffffffff) {
 		device_printf(sc->dev, "extended mapping failed\n");
 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
 		return;
 	}
 	*ptr32 = val | 0x40;
 	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
 	if (mxge_verbose) 
 		device_printf(sc->dev,
 			      "Enabled ECRC on upstream Nvidia bridge "
 			      "at %d:%d:%d\n",
 			      (int)bus, (int)slot, (int)func);
 	return;
 }
 #else
 /* Stub for other architectures: only logs that the call was unexpected. */
 static void
 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
 {
 	device_printf(sc->dev,
 		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
 	return;
 }
 #endif
 
 
 static int
 mxge_dma_test(mxge_softc_t *sc, int test_type)
 {
 	mxge_cmd_t cmd;
 	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
 	int status;
 	uint32_t len;
 	char *test = " ";
 
 
 	/* Run a small DMA test.
 	 * The magic multipliers to the length tell the firmware
 	 * to do DMA read, write, or read+write tests.  The
 	 * results are returned in cmd.data0.  The upper 16
 	 * bits of the return is the number of transfers completed.
 	 * The lower 16 bits is the time in 0.5us ticks that the
 	 * transfers took to complete.
 	 */
 
 	len = sc->tx_boundary;
 
 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
 	cmd.data2 = len * 0x10000;
 	status = mxge_send_cmd(sc, test_type, &cmd);
 	if (status != 0) {
 		test = "read";
 		goto abort;
 	}
 	sc->read_dma = ((cmd.data0>>16) * len * 2) /
 		(cmd.data0 & 0xffff);
 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
 	cmd.data2 = len * 0x1;
 	status = mxge_send_cmd(sc, test_type, &cmd);
 	if (status != 0) {
 		test = "write";
 		goto abort;
 	}
 	sc->write_dma = ((cmd.data0>>16) * len * 2) /
 		(cmd.data0 & 0xffff);
 
 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
 	cmd.data2 = len * 0x10001;
 	status = mxge_send_cmd(sc, test_type, &cmd);
 	if (status != 0) {
 		test = "read/write";
 		goto abort;
 	}
 	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
 		(cmd.data0 & 0xffff);
 
 abort:
 	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
 		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
 			      test, status);
 
 	return status;
 }
 
 /*
  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
  * when the PCI-E Completion packets are aligned on an 8-byte
  * boundary.  Some PCI-E chip sets always align Completion packets; on
  * the ones that do not, the alignment can be enforced by enabling
  * ECRC generation (if supported).
  *
  * When PCI-E Completion packets are not aligned, it is actually more
  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
  *
  * If the driver can neither enable ECRC nor verify that it has
  * already been enabled, then it must use a firmware image which works
  * around unaligned completion packets (ethp_z8e.dat), and it should
  * also ensure that it never gives the device a Read-DMA which is
  * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
  * enabled, then the driver should use the aligned (eth_z8e.dat)
  * firmware image, and set tx_boundary to 4KB.
  */
 
 /*
  * Load the aligned firmware and decide whether it can be kept.
  *
  * Sets sc->tx_boundary to 4096, or to 2048 when the PCIe capability
  * register read below does not have the (5 << 12) bits set.
  *
  * Returns 0 when the aligned firmware should be kept.  Otherwise
  * returns the error from mxge_load_firmware() or from the unaligned
  * completion DMA test.
  */
 static int
 mxge_firmware_probe(mxge_softc_t *sc)
 {
 	device_t dev = sc->dev;
 	int reg, status;
 	uint16_t pectl;
 
 	sc->tx_boundary = 4096;
 	/*
 	 * Verify the max read request size was set to 4KB
 	 * before trying the test with 4KB.
 	 */
 	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
 		pectl = pci_read_config(dev, reg + 0x8, 2);
 		if ((pectl & (5 << 12)) != (5 << 12)) {
 			device_printf(dev, "Max Read Req. size != 4k (0x%x\n",
 				      pectl);
 			sc->tx_boundary = 2048;
 		}
 	}
 
 	/* 
 	 * load the optimized firmware (which assumes aligned PCIe
 	 * completions) in order to see if it works on this host.
 	 */
 	sc->fw_name = mxge_fw_aligned;
 	status = mxge_load_firmware(sc, 1);
 	if (status != 0) {
 		return status;
 	}
 
 	/* 
 	 * Enable ECRC if possible
 	 */
 	mxge_enable_nvidia_ecrc(sc);
 
 	/* 
 	 * Run a DMA test which watches for unaligned completions and
-	 * aborts on the first one seen.
+	 * aborts on the first one seen.  Not required on Z8ES or newer.
 	 */
-
+	if (pci_get_revid(sc->dev) >= MXGE_PCI_REV_Z8ES)
+		return 0;
 	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
 	if (status == 0)
 		return 0; /* keep the aligned firmware */
 
 	/* E2BIG is not logged here; other errors are */
 	if (status != E2BIG)
 		device_printf(dev, "DMA test failed: %d\n", status);
 	if (status == ENOSYS)
 		device_printf(dev, "Falling back to ethp! "
 			      "Please install up to date fw\n");
 	return status;
 }
 
 /*
  * Choose between the aligned and unaligned firmware images and load
  * the chosen one.
  *
  * A forced choice (sc->throttle if non-zero, else mxge_force_firmware)
  * wins: 1 means aligned, any other non-zero value unaligned.  Without
  * one, a known link width of x4 or less selects the aligned image;
  * otherwise mxge_firmware_probe() decides, and if it fails the
  * unaligned image is loaded.
  *
  * Returns 0 when the probe accepted the aligned firmware, otherwise
  * the result of mxge_load_firmware().
  */
 static int
 mxge_select_firmware(mxge_softc_t *sc)
 {
 	int use_aligned;
 	int forced = mxge_force_firmware;
 
 	if (sc->throttle)
 		forced = sc->throttle;
 
 	if (forced != 0) {
 		use_aligned = (forced == 1) ? 1 : 0;
 		if (mxge_verbose)
 			device_printf(sc->dev,
 				      "Assuming %s completions (forced)\n",
 				      use_aligned ? "aligned" : "unaligned");
 	} else if (sc->link_width != 0 && sc->link_width <= 4) {
 		/* if the PCIe link width is 4 or less, we can use the
 		   aligned firmware and skip any checks */
 		device_printf(sc->dev,
 			      "PCIe x%d Link, expect reduced performance\n",
 			      sc->link_width);
 		use_aligned = 1;
 	} else {
 		/* the probe leaves the aligned firmware loaded on success */
 		if (mxge_firmware_probe(sc) == 0)
 			return 0;
 		use_aligned = 0;
 	}
 
 	if (use_aligned) {
 		sc->fw_name = mxge_fw_aligned;
 		sc->tx_boundary = 4096;
 	} else {
 		sc->fw_name = mxge_fw_unaligned;
 		sc->tx_boundary = 2048;
 	}
 	return (mxge_load_firmware(sc, 0));
 }
 
 /*
  * Overlays a const and a non-const char pointer in the same storage.
  * NOTE(review): presumably a way to drop a const qualifier without a
  * cast; no use of it is visible nearby - confirm it is still needed.
  */
 union qualhack
 {
         const char *ro_char;
         char *rw_char;
 };
 
 /*
  * Check that a firmware header describes an ethernet MCP whose
  * major.minor version matches what this driver was built against.
  *
  * Side effects: stores the version string in sc->fw_version and the
  * parsed numbers in sc->fw_ver_major/minor/tiny.
  *
  * Returns 0 when acceptable, EIO for a wrong firmware type, EINVAL
  * for a version mismatch.
  */
 static int
 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
 {
 
 
 	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
 		device_printf(sc->dev, "Bad firmware type: 0x%x\n", 
 			      be32toh(hdr->mcp_type));
 		return EIO;
 	}
 
 	/*
 	 * save firmware version for sysctl.  strncpy() does not terminate
 	 * the destination when the source fills it, so copy one byte less
 	 * and terminate explicitly before the string is parsed or printed.
 	 */
 	strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version) - 1);
 	sc->fw_version[sizeof (sc->fw_version) - 1] = '\0';
 	if (mxge_verbose)
 		device_printf(sc->dev, "firmware id: %s\n", sc->fw_version);
 
 	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
 	       &sc->fw_ver_minor, &sc->fw_ver_tiny);
 
 	/* only major and minor must match; the tiny number is informational */
 	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
 	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
 		device_printf(sc->dev, "Found firmware version %s\n",
 			      sc->fw_version);
 		device_printf(sc->dev, "Driver needs %d.%d\n",
 			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
 		return EINVAL;
 	}
 	return 0;
 
 }
 
 /*
  * zlib allocation hook: allocate room for "items" objects of "size"
  * bytes each without sleeping.  Returns NULL when the allocation fails
  * or when items * size would not fit in a u_int.
  */
 static void *
 z_alloc(void *nil, u_int items, u_int size)
 {
         /* a wrapped product would silently yield an undersized buffer */
         if (size != 0 && items > ((u_int)-1) / size)
                 return NULL;
 
         return malloc(items * size, M_TEMP, M_NOWAIT);
 }
 
 /*
  * zlib deallocation hook: hand a z_alloc() buffer back to the M_TEMP
  * malloc type.  The opaque first argument is unused.
  */
 static void
 z_free(void *nil, void *ptr)
 {
         (void) nil;
         free(ptr, M_TEMP);
 }
 
 
 /*
  * Inflate the zlib-compressed firmware image named by sc->fw_name,
  * validate its header and copy the inflated image into NIC SRAM at
  * MXGE_FW_OFFSET.
  *
  * On success returns 0 and stores the inflated length in *limit.
  * On failure returns ENOENT (image not found), ENOMEM (no inflate
  * buffer), EIO (zlib error or malformed image) or the error from
  * mxge_validate_firmware(); *limit is left untouched.
  */
 static int
 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
 {
 	z_stream zs;
 	char *inflate_buffer;
 	const struct firmware *fw;
 	const mcp_gen_header_t *hdr;
 	unsigned hdr_offset;
 	int status;
 	int bad_image;
 	unsigned int i;
 	char dummy;
 	size_t fw_len;
 
 	fw = firmware_get(sc->fw_name);
 	if (fw == NULL) {
 		device_printf(sc->dev, "Could not find firmware image %s\n",
 			      sc->fw_name);
 		return ENOENT;
 	}
 
 
 
 	/* setup zlib and decompress f/w */
 	bzero(&zs, sizeof (zs));
 	zs.zalloc = z_alloc;
 	zs.zfree = z_free;
 	status = inflateInit(&zs);
 	if (status != Z_OK) {
 		status = EIO;
 		goto abort_with_fw;
 	}
 
 	/* the uncompressed size is stored as the firmware version,
 	   which would otherwise go unused */
 	fw_len = (size_t) fw->version; 
 	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
 	if (inflate_buffer == NULL) {
 		/*
 		 * status still holds Z_OK from inflateInit() here; set a
 		 * real error so the caller does not see success.
 		 */
 		status = ENOMEM;
 		goto abort_with_zs;
 	}
 	zs.avail_in = fw->datasize;
 	zs.next_in = __DECONST(char *, fw->data);
 	zs.avail_out = fw_len;
 	zs.next_out = inflate_buffer;
 	status = inflate(&zs, Z_FINISH);
 	if (status != Z_STREAM_END) {
 		device_printf(sc->dev, "zlib %d\n", status);
 		status = EIO;
 		goto abort_with_buffer;
 	}
 
 	/*
 	 * check id.  The image must be long enough to contain both the
 	 * header pointer and a header before either is read, and the
 	 * header itself must be 4-byte aligned and lie inside the image.
 	 */
 	hdr_offset = 0;
 	bad_image = (fw_len < MCP_HEADER_PTR_OFFSET + sizeof (uint32_t) ||
 		     fw_len < sizeof (*hdr));
 	if (!bad_image) {
 		hdr_offset = htobe32(*(const uint32_t *)
 				     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
 		bad_image = ((hdr_offset & 3) != 0 ||
 			     hdr_offset > fw_len - sizeof (*hdr));
 	}
 	if (bad_image) {
 		device_printf(sc->dev, "Bad firmware file\n");
 		status = EIO;
 		goto abort_with_buffer;
 	}
 	hdr = (const void*)(inflate_buffer + hdr_offset); 
 
 	status = mxge_validate_firmware(sc, hdr);
 	if (status != 0)
 		goto abort_with_buffer;
 
 	/* Copy the inflated firmware to NIC SRAM. */
 	for (i = 0; i < fw_len; i += 256) {
 		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
 			      inflate_buffer + i,
 			      min(256U, (unsigned)(fw_len - i)));
 		wmb();
 		/* read back from SRAM between chunks; the value is unused */
 		dummy = *sc->sram;
 		wmb();
 	}
 
 	*limit = fw_len;
 	status = 0;
 	/* the labels below also run on success, to release resources */
 abort_with_buffer:
 	free(inflate_buffer, M_TEMP);
 abort_with_zs:
 	inflateEnd(&zs);
 abort_with_fw:
 	firmware_put(fw, FIRMWARE_UNLOAD);
 	return status;
 }
 
 /*
  * Enable or disable periodic RDMAs from the host to make certain
  * chipsets resend dropped PCIe messages
  *
  * Builds a 64-byte request in an 8-byte aligned stack buffer, copies
  * it to sc->sram + MXGEFW_BOOT_DUMMY_RDMA and polls the confirmation
  * word in sc->cmd for 0xffffffff.  A missing confirmation is only
  * logged; nothing is returned to the caller.
  */
 
 static void
 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
 {
 	char buf_bytes[72];
 	volatile uint32_t *confirm;
 	volatile char *submit;
 	uint32_t *buf, dma_low, dma_high;
 	int i;
 
 	/*
 	 * Only six words are filled in below but 64 bytes are copied to
 	 * the NIC, so clear the buffer rather than send stack garbage.
 	 */
 	bzero(buf_bytes, sizeof (buf_bytes));
 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
 
 	/* clear confirmation addr */
 	confirm = (volatile uint32_t *)sc->cmd;
 	*confirm = 0;
 	wmb();
 
 	/* send an rdma command to the PCIe engine, and wait for the
 	   response in the confirmation address.  The firmware should
 	   write a -1 there to indicate it is alive and well
 	*/
 
 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
 	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
 	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
 	buf[2] = htobe32(0xffffffff);		/* confirm data */
 	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
 	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
 	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
 	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
 	buf[5] = htobe32(enable);			/* enable? */
 
 
 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
 
 	mxge_pio_copy(submit, buf, 64);
 	wmb();
 	DELAY(1000);
 	wmb();
 	/* poll for the confirmation, at most 20 more DELAY(1000) rounds */
 	i = 0;
 	while (*confirm != 0xffffffff && i < 20) {
 		DELAY(1000);
 		i++;
 	}
 	if (*confirm != 0xffffffff) {
 		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)\n", 
 			      (enable ? "enable" : "disable"), confirm, 
 			      *confirm);
 	}
 	return;
 }
 
 /*
  * Issue one firmware command and wait for its response.
  *
  * The three data words in *data, the command number and the bus
  * address of the response buffer (sc->cmd_dma) are converted to
  * big-endian in an 8-byte aligned stack buffer.  Under sc->cmd_mtx the
  * buffer is copied to sc->sram + MXGEFW_ETH_CMD and the response is
  * polled up to 20 times, with DELAY(1000) after each poll that finds
  * no answer yet.
  *
  * Returns 0 and stores the response word in data->data0 on success;
  * ENOSYS, E2BIG, EBUSY or ENXIO for the matching firmware result
  * codes (ENXIO also for any unrecognised result); EAGAIN when no
  * response arrived in time.
  */
 static int 
 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
 {
 	mcp_cmd_t *buf;
 	char buf_bytes[sizeof(*buf) + 8];
 	volatile mcp_cmd_response_t *response = sc->cmd;
 	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
 	uint32_t dma_low, dma_high;
 	int err, sleep_total = 0;
 
 	/* ensure buf is aligned to 8 bytes */
 	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
 
 	buf->data0 = htobe32(data->data0);
 	buf->data1 = htobe32(data->data1);
 	buf->data2 = htobe32(data->data2);
 	buf->cmd = htobe32(cmd);
 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
 
 	buf->response_addr.low = htobe32(dma_low);
 	buf->response_addr.high = htobe32(dma_high);
 	mtx_lock(&sc->cmd_mtx);
 	/* 0xffffffff marks "no answer yet"; set it before submitting */
 	response->result = 0xffffffff;
 	wmb();
 	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
 
 	/* wait up to 20ms */
 	err = EAGAIN;
 	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
 		bus_dmamap_sync(sc->cmd_dma.dmat, 
 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
 		wmb();
 		switch (be32toh(response->result)) {
 		case 0:
 			data->data0 = be32toh(response->data);
 			err = 0;
 			break;
 		case 0xffffffff:
 			/* still pending: err stays EAGAIN, so keep polling */
 			DELAY(1000);
 			break;
 		case MXGEFW_CMD_UNKNOWN:
 			err = ENOSYS;
 			break;
 		case MXGEFW_CMD_ERROR_UNALIGNED:
 			err = E2BIG;
 			break;
 		case MXGEFW_CMD_ERROR_BUSY:
 			err = EBUSY;
 			break;
 		case MXGEFW_CMD_ERROR_I2C_ABSENT:
 			err = ENXIO;
 			break;
 		default:
 			device_printf(sc->dev, 
 				      "mxge: command %d "
 				      "failed, result = %d\n",
 				      cmd, be32toh(response->result));
 			err = ENXIO;
 			break;
 		}
 		if (err != EAGAIN)
 			break;
 	}
 	if (err == EAGAIN)
 		device_printf(sc->dev, "mxge: command %d timed out, "
 			      "result = %d\n",
 			      cmd, be32toh(response->result));
 	mtx_unlock(&sc->cmd_mtx);
 	return err;
 }
 
 /*
  * Validate the firmware that is already running on the NIC so that it
  * can be used instead of a freshly loaded image.
  *
  * The header offset is read from MCP_HEADER_PTR_OFFSET in SRAM, the
  * header is copied into a temporary host buffer and passed to
  * mxge_validate_firmware().  Firmware versions 1.4.4 through 1.4.11
  * additionally get sc->adopted_rx_filter_bug set.
  *
  * Returns EIO for a bad header offset, ENOMEM if the temporary buffer
  * cannot be allocated, otherwise the result of mxge_validate_firmware().
  */
 static int
 mxge_adopt_running_firmware(mxge_softc_t *sc)
 {
 	const size_t hdr_len = sizeof (struct mcp_gen_header);
 	struct mcp_gen_header *fw_hdr;
 	size_t offset;
 	int rc;
 
 	/* locate the header of the running firmware */
 	offset = htobe32(*(volatile uint32_t *)
 			 (sc->sram + MCP_HEADER_PTR_OFFSET));
 
 	/* it must be 4-byte aligned and lie entirely inside SRAM */
 	if ((offset & 3) != 0 || offset + sizeof(*fw_hdr) > sc->sram_size) {
 		device_printf(sc->dev,
 			      "Running firmware has bad header offset (%d)\n",
 			      (int)offset);
 		return EIO;
 	}
 
 	/* pull the header out of SRAM into host memory for validation */
 	fw_hdr = malloc(hdr_len, M_DEVBUF, M_NOWAIT);
 	if (fw_hdr == NULL) {
 		device_printf(sc->dev, "could not malloc firmware hdr\n");
 		return ENOMEM;
 	}
 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
 				rman_get_bushandle(sc->mem_res),
 				offset, (char *)fw_hdr, hdr_len);
 	rc = mxge_validate_firmware(sc, fw_hdr);
 	free(fw_hdr, M_DEVBUF);
 
 	/* anything outside 1.4.4 .. 1.4.11 needs no workaround */
 	if (sc->fw_ver_major != 1 || sc->fw_ver_minor != 4 ||
 	    sc->fw_ver_tiny < 4 || sc->fw_ver_tiny > 11)
 		return rc;
 
 	/*
 	 * These versions filter broadcasts when adopted unless the NIC
 	 * is kept in ALLMULTI mode; remember that for the multicast code.
 	 */
 	sc->adopted_rx_filter_bug = 1;
 	device_printf(sc->dev, "Adopting fw %d.%d.%d: "
 		      "working around rx filter bug\n",
 		      sc->fw_ver_major, sc->fw_ver_minor,
 		      sc->fw_ver_tiny);
 
 	return rc;
 }
 
 
 /*
  * Load firmware into the NIC and hand control over to it.
  *
  * sc:    driver softc
  * adopt: when non-zero and loading fails, fall back to adopting the
  *        firmware that is already running on the NIC
  *
  * On a successful load a handoff command is written to
  * MXGEFW_BOOT_HANDOFF and the confirmation word (sc->cmd) is polled
  * until the firmware writes 0xffffffff there.
  *
  * Returns 0 on success (including a successful adoption, in which case
  * sc->fw_name is set to mxge_fw_unaligned and sc->tx_boundary to 2048),
  * the load/adopt error status, or ENXIO if the handoff is never
  * confirmed.
  */
 static int
 mxge_load_firmware(mxge_softc_t *sc, int adopt)
 {
 	volatile uint32_t *confirm;
 	volatile char *submit;
 	char buf_bytes[72];
 	uint32_t *buf, size, dma_low, dma_high;
 	int status, i;
 
 	/*
 	 * 64 bytes are copied to the NIC below but only seven words are
 	 * filled in; zero the buffer so no stale stack contents are
 	 * handed to the device.
 	 */
 	memset(buf_bytes, 0, sizeof (buf_bytes));
 
 	/* align the command to 8 bytes within the 72-byte scratch area */
 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
 
 	size = sc->sram_size;
 	status = mxge_load_firmware_helper(sc, &size);
 	if (status) {
 		if (!adopt)
 			return status;
 		/* Try to use the currently running firmware, if
 		   it is new enough */
 		status = mxge_adopt_running_firmware(sc);
 		if (status) {
 			device_printf(sc->dev,
 				      "failed to adopt running firmware\n");
 			return status;
 		}
 		device_printf(sc->dev,
 			      "Successfully adopted running firmware\n");
 		if (sc->tx_boundary == 4096) {
 			device_printf(sc->dev,
 				"Using firmware currently running on NIC"
 				 ".  For optimal\n");
 			device_printf(sc->dev,
 				 "performance consider loading optimized "
 				 "firmware\n");
 		}
 		sc->fw_name = mxge_fw_unaligned;
 		sc->tx_boundary = 2048;
 		return 0;
 	}
 	/* clear confirmation addr */
 	confirm = (volatile uint32_t *)sc->cmd;
 	*confirm = 0;
 	wmb();
 	/* send a reload command to the bootstrap MCP, and wait for the
 	   response in the confirmation address.  The firmware should
 	   write a -1 there to indicate it is alive and well
 	*/
 
 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
 
 	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
 	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
 	buf[2] = htobe32(0xffffffff);	/* confirm data */
 
 	/* FIX: All newest firmware should un-protect the bottom of
 	   the sram before handoff. However, the very first interfaces
 	   do not. Therefore the handoff copy must skip the first 8 bytes
 	*/
 					/* where the code starts*/
 	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
 	buf[4] = htobe32(size - 8); 	/* length of code */
 	buf[5] = htobe32(8);		/* where to copy to */
 	buf[6] = htobe32(0);		/* where to jump to */
 
 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
 	mxge_pio_copy(submit, buf, 64);
 	wmb();
 	DELAY(1000);
 	wmb();
 
 	/* poll up to 20 more times, DELAY(1000*10) apart */
 	i = 0;
 	while (*confirm != 0xffffffff && i < 20) {
 		DELAY(1000*10);
 		i++;
 		bus_dmamap_sync(sc->cmd_dma.dmat, 
 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
 	}
 	if (*confirm != 0xffffffff) {
 		/* terminate the line so later console output is not appended */
 		device_printf(sc->dev,"handoff failed (%p = 0x%x)\n",
 			confirm, *confirm);
 
 		return ENXIO;
 	}
 	return 0;
 }
 
 /*
  * Program the station address held in sc->mac_addr into the firmware
  * with MXGEFW_SET_MAC_ADDRESS.  The first four bytes are packed into
  * data0 and the last two into data1, most significant byte first.
  *
  * Returns the status of mxge_send_cmd().
  */
 static int
 mxge_update_mac_address(mxge_softc_t *sc)
 {
 	mxge_cmd_t cmd;
 	uint8_t *addr = sc->mac_addr;
 	int status;
 
 	/* do not hand unset argument words to the firmware */
 	memset(&cmd, 0, sizeof (cmd));
 
 	/*
 	 * Widen each byte before shifting: a uint8_t promotes to int, and
 	 * shifting a value >= 0x80 left by 24 would overflow a signed int.
 	 */
 	cmd.data0 = (((uint32_t)addr[0] << 24) | ((uint32_t)addr[1] << 16)
 		     | ((uint32_t)addr[2] << 8) | (uint32_t)addr[3]);
 
 	cmd.data1 = (((uint32_t)addr[4] << 8) | (uint32_t)addr[5]);
 
 	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
 	return status;
 }
 
 /*
  * Enable (pause != 0) or disable (pause == 0) flow control in the
  * firmware and record the new setting in sc->pause.
  *
  * Returns 0 on success, ENXIO if the firmware command fails (in which
  * case sc->pause is left unchanged).
  */
 static int
 mxge_change_pause(mxge_softc_t *sc, int pause)
 {
 	mxge_cmd_t cmd;
 	int status;
 
 	/*
 	 * The command takes no arguments, but mxge_send_cmd() still copies
 	 * data0..data2 to the NIC; zero them rather than sending stack
 	 * garbage.
 	 */
 	memset(&cmd, 0, sizeof (cmd));
 
 	if (pause)
 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
 				       &cmd);
 	else
 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
 				       &cmd);
 
 	if (status) {
 		device_printf(sc->dev, "Failed to set flow control mode\n");
 		return ENXIO;
 	}
 	sc->pause = pause;
 	return 0;
 }
 
 /*
  * Enable (promisc != 0) or disable (promisc == 0) promiscuous mode in
  * the firmware.  When the mxge_always_promisc tunable is set the mode
  * is forced on regardless of the argument.  A failure is only logged.
  */
 static void
 mxge_change_promisc(mxge_softc_t *sc, int promisc)
 {
 	mxge_cmd_t cmd;
 	int status;
 
 	/*
 	 * The command takes no arguments, but mxge_send_cmd() still copies
 	 * data0..data2 to the NIC; zero them rather than sending stack
 	 * garbage.
 	 */
 	memset(&cmd, 0, sizeof (cmd));
 
 	if (mxge_always_promisc)
 		promisc = 1;
 
 	if (promisc)
 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
 				       &cmd);
 	else
 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
 				       &cmd);
 
 	if (status) {
 		device_printf(sc->dev, "Failed to set promisc mode\n");
 	}
 }
 
 /*
  * Re-program the firmware's multicast filter from the interface's
  * multicast address list.
  *
  * Filtering is first switched off (MXGEFW_ENABLE_ALLMULTI).  Unless
  * the adopted firmware has the rx filter bug or the interface is in
  * IFF_ALLMULTI mode, all groups are then flushed, every AF_LINK
  * address on the list is joined, and filtering is switched back on
  * (MXGEFW_DISABLE_ALLMULTI).  Any failure is logged and leaves
  * filtering off.
  */
 static void
 mxge_set_multicast_list(mxge_softc_t *sc)
 {
 	mxge_cmd_t cmd;
 	struct ifmultiaddr *ifma;
 	struct ifnet *ifp = sc->ifp;
 	int err;
 
 	/* This firmware is known to not support multicast */
 	if (!sc->fw_multicast_support)
 		return;
 
 	/* mxge_send_cmd() sends data0..data2 to the NIC; start from zero */
 	memset(&cmd, 0, sizeof (cmd));
 
 	/* Disable multicast filtering while we play with the lists*/
 	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
 	if (err != 0) {
 		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
 		       " error status: %d\n", err);
 		return;
 	}
 
 	/* buggy adopted firmware must stay in ALLMULTI mode */
 	if (sc->adopted_rx_filter_bug)
 		return;
 
 	if (ifp->if_flags & IFF_ALLMULTI)
 		/* request to disable multicast filtering, so quit here */
 		return;
 
 	/* Flush all the filters */
 
 	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
 	if (err != 0) {
 		device_printf(sc->dev, 
 			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
 			      ", error status: %d\n", err);
 		return;
 	}
 
 	/* Walk the multicast list, and add each address */
 
 	if_maddr_rlock(ifp);
 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		if (ifma->ifma_addr->sa_family != AF_LINK)
 			continue;
 		/*
 		 * Only two bytes are copied into data1 below; clear the
 		 * words first so the remaining bytes do not carry over
 		 * from the previous command.
 		 */
 		cmd.data0 = 0;
 		cmd.data1 = 0;
 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
 		      &cmd.data0, 4);
 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
 		      &cmd.data1, 2);
 		cmd.data0 = htonl(cmd.data0);
 		cmd.data1 = htonl(cmd.data1);
 		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
 		if (err != 0) {
 			/* message used to end in a tab instead of a newline */
 			device_printf(sc->dev, "Failed "
 			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
 			       "%d\n", err);
 			/* abort, leaving multicast filtering off */
 			if_maddr_runlock(ifp);
 			return;
 		}
 	}
 	if_maddr_runlock(ifp);
 	/* Enable multicast filtering */
 	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
 	if (err != 0) {
 		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
 		       ", error status: %d\n", err);
 	}
 }
 
 /*
  * Compute the largest MTU the driver can offer.
  *
  * Returns MXGEFW_MAX_MTU - MXGEFW_PAD when a single MJUMPAGESIZE
  * buffer already exceeds the firmware limit, or when the firmware
  * accepts MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS; otherwise returns
  * MJUMPAGESIZE - MXGEFW_PAD.
  */
 static int
 mxge_max_mtu(mxge_softc_t *sc)
 {
 	mxge_cmd_t cmd;
 	int status;
 
 	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
 
 	/* try to set nbufs to see if it we can
 	   use virtually contiguous jumbos */
 	/* zero every argument word (data0 included), not just data0 */
 	memset(&cmd, 0, sizeof (cmd));
 	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
 			       &cmd);
 	if (status == 0)
 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
 
 	/* otherwise, we're limited to MJUMPAGESIZE */
 	return MJUMPAGESIZE - MXGEFW_PAD;
 }
 
 /*
  * Reset the firmware and re-establish the state shared between the
  * driver and the NIC.
  *
  * In order: sends MXGEFW_CMD_RESET, enables dummy RDMA, programs the
  * interrupt queue size, (when more than one slice is in use) queries
  * and enables the RSS queues, optionally gives the firmware each
  * slice's rx_done DMA address, fetches the SRAM offsets used for
  * interrupt coalescing / IRQ claim / IRQ deassert, runs the DMA
  * benchmark, zeroes the per-slice counters, and finally re-applies the
  * MAC address, promiscuous, pause, multicast and throttle settings.
  *
  * sc:               driver softc
  * interrupts_setup: when non-zero, issue MXGEFW_CMD_SET_INTRQ_DMA for
  *                   every slice
  *
  * Returns 0 on success; ENXIO if the reset command fails; a non-zero
  * status if one of the setup commands fails; otherwise the status of
  * mxge_update_mac_address().  Failures of the promisc, pause,
  * multicast and throttle steps do not affect the return value.
  */
 static int
 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
 {
 	struct mxge_slice_state *ss;
 	mxge_rx_done_t *rx_done;
 	volatile uint32_t *irq_claim;
 	mxge_cmd_t cmd;
 	int slice, status;
 
 	/* try to send a reset command to the card to see if it
 	   is alive */
 	memset(&cmd, 0, sizeof (cmd));
 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
 	if (status != 0) {
 		device_printf(sc->dev, "failed reset\n");
 		return ENXIO;
 	}
 
 	mxge_dummy_rdma(sc, 1);
 
 
 	/* set the intrq size */
 	/*
 	 * NOTE(review): this status is overwritten below when
 	 * sc->num_slices > 1, so a failure here is only noticed (via the
 	 * later "status |=" accumulation) in the single-slice case.
 	 */
 	cmd.data0 = sc->rx_ring_size;
 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
 
 	/* 
 	 * Even though we already know how many slices are supported
 	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
 	 * has magic side effects, and must be called after a reset.
 	 * It must be called prior to calling any RSS related cmds,
 	 * including assigning an interrupt queue for anything but
 	 * slice 0.  It must also be called *after*
 	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
 	 * the firmware to compute offsets.
 	 */
 	 
 	if (sc->num_slices > 1) {
 		/* ask the maximum number of slices it supports */
 		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
 					   &cmd);
 		if (status != 0) {
 			device_printf(sc->dev, 
 				      "failed to get number of slices\n");
 			return status;
 		}
 		/* 
 		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
 		 * to setting up the interrupt queue DMA
 		 */
 		cmd.data0 = sc->num_slices;
 		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
 #ifdef IFNET_BUF_RING
 		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
 #endif
 		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
 					   &cmd);
 		if (status != 0) {
 			device_printf(sc->dev,
 				      "failed to set number of slices\n");
 			return status;
 		}
 	}
 
 
 	if (interrupts_setup) {
 		/* Now exchange information about interrupts  */
 		for (slice = 0; slice < sc->num_slices; slice++) {
 			rx_done = &sc->ss[slice].rx_done;
 			memset(rx_done->entry, 0, sc->rx_ring_size);
 			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
 			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
 			cmd.data2 = slice;
 			status |= mxge_send_cmd(sc,
 						MXGEFW_CMD_SET_INTRQ_DMA,
 						&cmd);
 		}
 	}
 
 	/*
 	 * The next three commands each return an SRAM offset in
 	 * cmd.data0.  The pointers are computed immediately but are not
 	 * dereferenced until the accumulated status has been checked.
 	 * NOTE(review): status is an OR of errno values here, so on
 	 * failure it is only meaningful as "non-zero".
 	 */
 	status |= mxge_send_cmd(sc, 
 				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
 	
 
 	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
 
 	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
 	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
 
 
 	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET, 
 				&cmd);
 	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
 	if (status != 0) {
 		device_printf(sc->dev, "failed set interrupt parameters\n");
 		return status;
 	}
 	
 
 	/* push the configured coalescing delay to the NIC, big-endian */
 	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
 
 	
 	/* run a DMA benchmark */
 	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
 
 	for (slice = 0; slice < sc->num_slices; slice++) {
 		ss = &sc->ss[slice];
 
 		/* each slice's claim register is two words after the previous */
 		ss->irq_claim = irq_claim + (2 * slice);
 		/* reset mcp/driver shared state back to 0 */
 		ss->rx_done.idx = 0;
 		ss->rx_done.cnt = 0;
 		ss->tx.req = 0;
 		ss->tx.done = 0;
 		ss->tx.pkt_done = 0;
 		ss->tx.queue_active = 0;
 		ss->tx.activate = 0;
 		ss->tx.deactivate = 0;
 		ss->tx.wake = 0;
 		ss->tx.defrag = 0;
 		ss->tx.stall = 0;
 		ss->rx_big.cnt = 0;
 		ss->rx_small.cnt = 0;
 		ss->lc.lro_bad_csum = 0;
 		ss->lc.lro_queued = 0;
 		ss->lc.lro_flushed = 0;
 		if (ss->fw_stats != NULL) {
 			bzero(ss->fw_stats, sizeof *ss->fw_stats);
 		}
 	}
 	sc->rdma_tags_available = 15;
 	/* re-apply the host-side configuration to the freshly reset NIC */
 	status = mxge_update_mac_address(sc);
 	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
 	mxge_change_pause(sc, sc->pause);
 	mxge_set_multicast_list(sc);
 	if (sc->throttle) {
 		cmd.data0 = sc->throttle;
 		if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR,
 				  &cmd)) {
 			device_printf(sc->dev,
 				      "can't enable throttle\n");
 		}
 	}
 	return status;
 }
 
 /*
  * Sysctl handler for the "throttle" tunable.
  *
  * arg1 is the driver softc.  A value that differs from the current
  * one and lies within [MXGE_MIN_THROTTLE, MXGE_MAX_THROTTLE] is sent
  * to the firmware with MXGEFW_CMD_SET_THROTTLE_FACTOR under
  * sc->driver_mtx and, on success, stored in sc->throttle.
  *
  * Returns 0 on success or when the value is unchanged, EINVAL for an
  * out-of-range value, or the error from sysctl_handle_int() /
  * mxge_send_cmd().
  */
 static int
 mxge_change_throttle(SYSCTL_HANDLER_ARGS)
 {
 	mxge_cmd_t cmd;
 	mxge_softc_t *sc;
 	int err;
 	unsigned int throttle;
 
 	sc = arg1;
 	throttle = sc->throttle;
 	err = sysctl_handle_int(oidp, &throttle, arg2, req);
 	if (err != 0) {
 		return err;
 	}
 
 	if (throttle == sc->throttle)
 		return 0;
 
 	if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
 		return EINVAL;
 
 	/*
 	 * mxge_send_cmd() copies data0..data2 to the NIC; zero the
 	 * words this command does not use instead of sending stack
 	 * garbage.
 	 */
 	memset(&cmd, 0, sizeof (cmd));
 
 	mtx_lock(&sc->driver_mtx);
 	cmd.data0 = throttle;
 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
 	if (err == 0)
 		sc->throttle = throttle;
 	mtx_unlock(&sc->driver_mtx);
 	return err;
 }
 
 /*
  * Sysctl handler for the "intr_coal_delay" tunable.
  *
  * arg1 is the driver softc.  A new value in the range 1..1000000 that
  * differs from the current one is written (big-endian) through
  * sc->intr_coal_delay_ptr and cached in sc->intr_coal_delay, both
  * under sc->driver_mtx.
  *
  * Returns 0 on success or when the value is unchanged, EINVAL for an
  * out-of-range value, or the error from sysctl_handle_int().
  */
 static int
 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
 {
 	mxge_softc_t *sc = arg1;
 	unsigned int new_delay = sc->intr_coal_delay;
 	int rc;
 
 	rc = sysctl_handle_int(oidp, &new_delay, arg2, req);
 	if (rc != 0)
 		return rc;
 
 	/* nothing to do if the value did not change */
 	if (new_delay == sc->intr_coal_delay)
 		return 0;
 
 	if (new_delay == 0 || new_delay > 1000*1000)
 		return EINVAL;
 
 	mtx_lock(&sc->driver_mtx);
 	*sc->intr_coal_delay_ptr = htobe32(new_delay);
 	sc->intr_coal_delay = new_delay;
 	mtx_unlock(&sc->driver_mtx);
 
 	/* rc is necessarily 0 at this point */
 	return rc;
 }
 
 /*
  * Sysctl handler for the "flow_control_enabled" tunable.
  *
  * arg1 is the driver softc.  When the requested value differs from
  * sc->pause it is applied with mxge_change_pause() under
  * sc->driver_mtx.
  *
  * Returns 0 when the value is unchanged, the error from
  * sysctl_handle_int(), or the result of mxge_change_pause().
  */
 static int
 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
 {
 	mxge_softc_t *sc = arg1;
 	unsigned int want = sc->pause;
 	int rc;
 
 	rc = sysctl_handle_int(oidp, &want, arg2, req);
 	if (rc != 0)
 		return rc;
 
 	/* nothing to do if the setting did not change */
 	if (want == sc->pause)
 		return 0;
 
 	mtx_lock(&sc->driver_mtx);
 	rc = mxge_change_pause(sc, want);
 	mtx_unlock(&sc->driver_mtx);
 
 	return rc;
 }
 
 /*
  * Sysctl handler that exports a big-endian 32-bit value.
  *
  * arg1 points at the big-endian word.  It is byte-swapped to host
  * order and handed to sysctl_handle_int() as the constant value
  * (with a NULL data pointer).
  *
  * Returns EFAULT if arg1 is NULL, otherwise the result of
  * sysctl_handle_int().
  */
 static int
 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
 {
 	if (arg1 == NULL)
 		return EFAULT;
 
 	/* swap first, then report the value through arg2 with arg1 == NULL */
 	arg2 = be32toh(*(int *)arg1);
 	return sysctl_handle_int(oidp, NULL, arg2, req);
 }
 
 /*
  * Tear down the per-slice sysctl nodes created by mxge_add_sysctls().
  *
  * Frees each slice's sysctl context that has a tree attached, then
  * the context holding the "slice" parent node, clearing the tree
  * pointers as it goes.  Does nothing if the parent node was never
  * created.
  */
 static void
 mxge_rem_sysctls(mxge_softc_t *sc)
 {
 	struct mxge_slice_state *cur;
 	int i;
 
 	if (sc->slice_sysctl_tree != NULL) {
 		for (i = 0; i < sc->num_slices; i++) {
 			cur = &sc->ss[i];
 			if (cur != NULL && cur->sysctl_tree != NULL) {
 				sysctl_ctx_free(&cur->sysctl_ctx);
 				cur->sysctl_tree = NULL;
 			}
 		}
 		sysctl_ctx_free(&sc->slice_sysctl_ctx);
 		sc->slice_sysctl_tree = NULL;
 	}
 }
 
 /*
  * Register the driver's sysctl nodes.
  *
  * Device-wide nodes (informational values, tunables and the firmware
  * statistics block of slice 0) are attached to the device's own sysctl
  * context and tree.  Per-slice counters are attached under a "slice"
  * node with one child per slice index; the "slice" node and each child
  * use separate sysctl contexts (sc->slice_sysctl_ctx and
  * ss->sysctl_ctx) which mxge_rem_sysctls() frees.
  */
 static void
 mxge_add_sysctls(mxge_softc_t *sc)
 {
 	struct sysctl_ctx_list *ctx;
 	struct sysctl_oid_list *children;
 	mcp_irq_data_t *fw;
 	struct mxge_slice_state *ss;
 	int slice;
 	char slice_num[8];
 
 	ctx = device_get_sysctl_ctx(sc->dev);
 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
 	fw = sc->ss[0].fw_stats;
 
 	/* random information */
 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, 
 		       "firmware_version",
 		       CTLFLAG_RD, &sc->fw_version,
 		       0, "firmware version");
 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, 
 		       "serial_number",
 		       CTLFLAG_RD, &sc->serial_number_string,
 		       0, "serial number");
 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, 
 		       "product_code",
 		       CTLFLAG_RD, &sc->product_code_string,
 		       0, "product_code");
 	/* description used to be a copy of the tx_boundary one */
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, 
 		       "pcie_link_width",
 		       CTLFLAG_RD, &sc->link_width,
 		       0, "pcie_link_width");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, 
 		       "tx_boundary",
 		       CTLFLAG_RD, &sc->tx_boundary,
 		       0, "tx_boundary");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, 
 		       "write_combine",
 		       CTLFLAG_RD, &sc->wc,
 		       0, "write combining PIO?");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, 
 		       "read_dma_MBs",
 		       CTLFLAG_RD, &sc->read_dma,
 		       0, "DMA Read speed in MB/s");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, 
 		       "write_dma_MBs",
 		       CTLFLAG_RD, &sc->write_dma,
 		       0, "DMA Write speed in MB/s");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, 
 		       "read_write_dma_MBs",
 		       CTLFLAG_RD, &sc->read_write_dma,
 		       0, "DMA concurrent Read/Write speed in MB/s");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, 
 		       "watchdog_resets",
 		       CTLFLAG_RD, &sc->watchdog_resets,
 		       0, "Number of times NIC was reset");
 
 
 	/* performance related tunables */
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, 
 			"intr_coal_delay",
 			CTLTYPE_INT|CTLFLAG_RW, sc,
 			0, mxge_change_intr_coal, 
 			"I", "interrupt coalescing delay in usecs");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, 
 			"throttle",
 			CTLTYPE_INT|CTLFLAG_RW, sc,
 			0, mxge_change_throttle, 
 			"I", "transmit throttling");
 
 	/* description used to be a copy of the intr_coal_delay one */
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, 
 			"flow_control_enabled",
 			CTLTYPE_INT|CTLFLAG_RW, sc,
 			0, mxge_change_flow_control,
 			"I", "flow control enabled");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, 
 		       "deassert_wait",
 		       CTLFLAG_RW, &mxge_deassert_wait,
 		       0, "Wait for IRQ line to go low in ihandler");
 
 	/* stats block from firmware is in network byte order.  
 	   Need to swap it */
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, 
 			"link_up",
 			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
 			0, mxge_handle_be32,
 			"I", "link up");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, 
 			"rdma_tags_available",
 			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
 			0, mxge_handle_be32,
 			"I", "rdma_tags_available");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, 
 			"dropped_bad_crc32",
 			CTLTYPE_INT|CTLFLAG_RD, 
 			&fw->dropped_bad_crc32,
 			0, mxge_handle_be32,
 			"I", "dropped_bad_crc32");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, 
 			"dropped_bad_phy",
 			CTLTYPE_INT|CTLFLAG_RD, 
 			&fw->dropped_bad_phy,
 			0, mxge_handle_be32,
 			"I", "dropped_bad_phy");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, 
 			"dropped_link_error_or_filtered",
 			CTLTYPE_INT|CTLFLAG_RD, 
 			&fw->dropped_link_error_or_filtered,
 			0, mxge_handle_be32,
 			"I", "dropped_link_error_or_filtered");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, 
 			"dropped_link_overflow",
 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
 			0, mxge_handle_be32,
 			"I", "dropped_link_overflow");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, 
 			"dropped_multicast_filtered",
 			CTLTYPE_INT|CTLFLAG_RD, 
 			&fw->dropped_multicast_filtered,
 			0, mxge_handle_be32,
 			"I", "dropped_multicast_filtered");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, 
 			"dropped_no_big_buffer",
 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
 			0, mxge_handle_be32,
 			"I", "dropped_no_big_buffer");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, 
 			"dropped_no_small_buffer",
 			CTLTYPE_INT|CTLFLAG_RD, 
 			&fw->dropped_no_small_buffer,
 			0, mxge_handle_be32,
 			"I", "dropped_no_small_buffer");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, 
 			"dropped_overrun",
 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
 			0, mxge_handle_be32,
 			"I", "dropped_overrun");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, 
 			"dropped_pause",
 			CTLTYPE_INT|CTLFLAG_RD, 
 			&fw->dropped_pause,
 			0, mxge_handle_be32,
 			"I", "dropped_pause");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, 
 			"dropped_runt",
 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
 			0, mxge_handle_be32,
 			"I", "dropped_runt");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, 
 			"dropped_unicast_filtered",
 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
 			0, mxge_handle_be32,
 			"I", "dropped_unicast_filtered");
 
 	/* verbose printing? */
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, 
 		       "verbose",
 		       CTLFLAG_RW, &mxge_verbose,
 		       0, "verbose printing");
 
 	/* add counters exported for debugging from all slices */
 	sysctl_ctx_init(&sc->slice_sysctl_ctx);
 	sc->slice_sysctl_tree = 
 		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
 				"slice", CTLFLAG_RD, 0, "");
 
 	for (slice = 0; slice < sc->num_slices; slice++) {
 		ss = &sc->ss[slice];
 		sysctl_ctx_init(&ss->sysctl_ctx);
 		ctx = &ss->sysctl_ctx;
 		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
 		/* bounded formatting: the node name buffer is only 8 bytes */
 		snprintf(slice_num, sizeof(slice_num), "%d", slice);
 		ss->sysctl_tree = 
 			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
 					CTLFLAG_RD, 0, "");
 		children = SYSCTL_CHILDREN(ss->sysctl_tree);
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, 
 			       "rx_small_cnt",
 			       CTLFLAG_RD, &ss->rx_small.cnt,
 			       0, "rx_small_cnt");
 		/* description used to read "rx_small_cnt" */
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, 
 			       "rx_big_cnt",
 			       CTLFLAG_RD, &ss->rx_big.cnt,
 			       0, "rx_big_cnt");
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
 			       "lro_flushed", CTLFLAG_RD, &ss->lc.lro_flushed,
 			       0, "number of lro merge queues flushed");
 
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
 			       "lro_bad_csum", CTLFLAG_RD, &ss->lc.lro_bad_csum,
 			       0, "number of bad csums preventing LRO");
 
 		/* the two literals used to run together as "mergequeues" */
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
 			       "lro_queued", CTLFLAG_RD, &ss->lc.lro_queued,
 			       0, "number of frames appended to lro merge "
 			       "queues");
 
 #ifndef IFNET_BUF_RING
 		/* only transmit from slice 0 for now */
 		if (slice > 0)
 			continue;
 #endif
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, 
 			       "tx_req",
 			       CTLFLAG_RD, &ss->tx.req,
 			       0, "tx_req");
 
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, 
 			       "tx_done",
 			       CTLFLAG_RD, &ss->tx.done,
 			       0, "tx_done");
 		/* description used to read "tx_done" */
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, 
 			       "tx_pkt_done",
 			       CTLFLAG_RD, &ss->tx.pkt_done,
 			       0, "tx_pkt_done");
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, 
 			       "tx_stall",
 			       CTLFLAG_RD, &ss->tx.stall,
 			       0, "tx_stall");
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, 
 			       "tx_wake",
 			       CTLFLAG_RD, &ss->tx.wake,
 			       0, "tx_wake");
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, 
 			       "tx_defrag",
 			       CTLFLAG_RD, &ss->tx.defrag,
 			       0, "tx_defrag");
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, 
 			       "tx_queue_active",
 			       CTLFLAG_RD, &ss->tx.queue_active,
 			       0, "tx_queue_active");
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, 
 			       "tx_activate",
 			       CTLFLAG_RD, &ss->tx.activate,
 			       0, "tx_activate");
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, 
 			       "tx_deactivate",
 			       CTLFLAG_RD, &ss->tx.deactivate,
 			       0, "tx_deactivate");
 	}
 }
 
 /*
  * Copy an array of mcp_kreq_ether_send_t's to the mcp, last entry
  * first and one entry at a time, masking each ring index so that the
  * copy wraps around the end of the ring.  Entry 0 is deliberately NOT
  * copied here; the caller submits it itself.
  */
 
 static inline void 
 mxge_submit_req_backwards(mxge_tx_ring_t *tx, 
 			    mcp_kreq_ether_send_t *src, int cnt)
 {
 	int first_slot, n, slot;
 
 	first_slot = tx->req;
 
 	/* nothing besides entry 0 to copy */
 	if (cnt <= 1)
 		return;
 
 	for (n = cnt - 1; n >= 1; n--) {
 		slot = (first_slot + n) & tx->mask;
 		mxge_pio_copy(&tx->lanai[slot], &src[n], sizeof(*src));
 		/* push each entry out before writing the one in front of it */
 		wmb();
 	}
 }
 
 /*
  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
  * at most 32 bytes at a time, so as to avoid involving the software
  * pio handler in the nic.   We re-write the first segment's flags
  * to mark them valid only after writing the entire chain 
  *
  * tx:  transmit ring; tx->req is advanced by cnt on return
  * src: host copy of the requests; src->flags is temporarily cleared
  *      and restored before returning
  * cnt: number of requests in src
  */
 
 static inline void 
 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src, 
                   int cnt)
 {
         int idx, i;
         uint32_t *src_ints;
 	volatile uint32_t *dst_ints;
         mcp_kreq_ether_send_t *srcp;
 	volatile mcp_kreq_ether_send_t *dstp, *dst;
 	uint8_t last_flags;
         
         idx = tx->req & tx->mask;
 
 	/* stash the real flags of the first request and write it out
 	   with flags == 0 for now */
 	last_flags = src->flags;
 	src->flags = 0;
         wmb();
         dst = dstp = &tx->lanai[idx];
         srcp = src;
 
         if ((idx + cnt) < tx->mask) {
                 /* no ring wrap: copy the requests front to back, two
                    at a time */
                 for (i = 0; i < (cnt - 1); i += 2) {
                         mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
                         wmb(); /* force write every 32 bytes */
                         srcp += 2;
                         dstp += 2;
                 }
         } else {
                 /* submit all but the first request, and ensure 
                    that it is submitted below */
                 mxge_submit_req_backwards(tx, src, cnt);
                 i = 0;
         }
         if (i < cnt) {
                 /* submit the first request */
                 /* (after the pair loop above this is instead the
                    trailing request of an odd-sized chain, since
                    srcp/dstp have been advanced) */
                 mxge_pio_copy(dstp, srcp, sizeof(*src));
                 wmb(); /* barrier before setting valid flag */
         }
 
         /* re-write the last 32-bits with the valid flags */
         /* word 3 of the first request is copied again, now carrying
            the restored flags */
         src->flags = last_flags;
         src_ints = (uint32_t *)src;
         src_ints+=3;
         dst_ints = (volatile uint32_t *)dst;
         dst_ints+=3;
         *dst_ints =  *src_ints;
         tx->req += cnt;
         wmb();
 }
 
 /*
  * Locate the network and (for TSO) TCP headers of an outgoing frame.
  *
  * ss: slice state; ss->scratch is used as a bounce buffer when the
  *     headers are not contained in the first mbuf
  * m:  the outgoing mbuf chain, starting with the ethernet header
  * pi: filled in with ip_off, ip_hlen and either ip (IPv4, with ip6 set
  *     to NULL) or ip6 (IPv6; pi->ip is not touched on that path); tcp
  *     is filled in only when the mbuf carries CSUM_TSO
  *
  * Returns 0 on success.  Returns EINVAL for an ethertype other than
  * IPv4 (or IPv6 when compiled in), for an IPv6 packet whose last
  * header is neither TCP nor UDP, and for an IPv6 TSO packet whose
  * headers exceed ss->sc->max_tso6_hlen.
  */
 static int
 mxge_parse_tx(struct mxge_slice_state *ss, struct mbuf *m,
     struct mxge_pkt_info *pi)
 {
 	struct ether_vlan_header *eh;
 	uint16_t etype;
 	int tso = m->m_pkthdr.csum_flags & (CSUM_TSO);
 #if IFCAP_TSO6 && defined(INET6)
 	int nxt;
 #endif
 
 	/* a VLAN-encapsulated frame carries the real ethertype after the
 	   tag, and the IP header starts ETHER_VLAN_ENCAP_LEN later */
 	eh = mtod(m, struct ether_vlan_header *);
 	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
 		etype = ntohs(eh->evl_proto);
 		pi->ip_off = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
 	} else {
 		etype = ntohs(eh->evl_encap_proto);
 		pi->ip_off = ETHER_HDR_LEN;
 	}
 
 	switch (etype) {
 	case ETHERTYPE_IP:
 		/*
 		 * ensure ip header is in first mbuf, copy it to a
 		 * scratch buffer if not
 		 */
 		pi->ip = (struct ip *)(m->m_data + pi->ip_off);
 		pi->ip6 = NULL;
 		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip))) {
 			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip),
 			    ss->scratch);
 			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
 		}
 		pi->ip_hlen = pi->ip->ip_hl << 2;
 		/* without TSO the TCP header is not needed */
 		if (!tso)
 			return 0;
 
 		/* same again, this time making sure the TCP header is
 		   reachable through pi->ip */
 		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
 		    sizeof(struct tcphdr))) {
 			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
 			    sizeof(struct tcphdr), ss->scratch);
 			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
 		}
 		pi->tcp = (struct tcphdr *)((char *)pi->ip + pi->ip_hlen);
 		break;
 #if IFCAP_TSO6 && defined(INET6)
 	case ETHERTYPE_IPV6:
 		pi->ip6 = (struct ip6_hdr *)(m->m_data + pi->ip_off);
 		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip6))) {
 			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip6),
 			    ss->scratch);
 			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
 		}
 		/* ip_hlen becomes the distance from the start of the IPv6
 		   header to the offset returned by ip6_lasthdr() */
 		nxt = 0;
 		pi->ip_hlen = ip6_lasthdr(m, pi->ip_off, IPPROTO_IPV6, &nxt);
 		pi->ip_hlen -= pi->ip_off;
 		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
 			return EINVAL;
 
 		if (!tso)
 			return 0;
 
 		if (pi->ip_off + pi->ip_hlen > ss->sc->max_tso6_hlen)
 			return EINVAL;
 
 		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
 		    sizeof(struct tcphdr))) {
 			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
 			    sizeof(struct tcphdr), ss->scratch);
 			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
 		}
 		pi->tcp = (struct tcphdr *)((char *)pi->ip6 + pi->ip_hlen);
 		break;
 #endif
 	default:
 		return EINVAL;
 	}
 	return 0;
 }
 
 #if IFCAP_TSO4
 
 /*
  * Build and submit the send requests for a TSO packet.
  *
  * ss:             slice state owning the transmit ring
  * m:              the packet; m->m_pkthdr.tso_segsz supplies the mss
  * busdma_seg_cnt: number of entries in tx->seg_list describing m
  * pi:             header locations for m (ip_off, ip_hlen, tcp, and
  *                 ip or ip6)
  *
  * Each busdma segment is cut into requests at the end of the headers
  * and at every mss boundary of the payload.  On success the request
  * list is handed to mxge_submit_req().  If more than tx->max_desc
  * requests would be needed the DMA map is unloaded, the mbuf is freed
  * and ss->oerrors is incremented.
  */
 static void
 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
 	       int busdma_seg_cnt, struct mxge_pkt_info *pi)
 {
 	mxge_tx_ring_t *tx;
 	mcp_kreq_ether_send_t *req;
 	bus_dma_segment_t *seg;
 	uint32_t low, high_swapped;
 	int len, seglen, cum_len, cum_len_next;
 	int next_is_first, chop, cnt, rdma_count, small;
 	uint16_t pseudo_hdr_offset, cksum_offset, mss, sum;
 	uint8_t flags, flags_next;
 	static int once;
 
 	mss = m->m_pkthdr.tso_segsz;
 
 	/* negative cum_len signifies to the
 	 * send loop that we are still in the
 	 * header portion of the TSO packet.
 	 */
 
 	cksum_offset = pi->ip_off + pi->ip_hlen;
 	cum_len = -(cksum_offset + (pi->tcp->th_off << 2));
 
 	/* TSO implies checksum offload on this hardware */
 	if (__predict_false((m->m_pkthdr.csum_flags & (CSUM_TCP|CSUM_TCP_IPV6)) == 0)) {
 		/*
 		 * If packet has full TCP csum, replace it with pseudo hdr
 		 * sum that the NIC expects, otherwise the NIC will emit
 		 * packets with bad TCP checksums.
 		 */
 		/*
 		 * NOTE(review): when the preprocessor branch matching the
 		 * packet's address family is compiled out, "sum" is never
 		 * assigned but is still written into the packet by
 		 * m_copyback() below.
 		 */
 		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
 		if (pi->ip6) {
 #if (CSUM_TCP_IPV6 != 0) && defined(INET6)
 			m->m_pkthdr.csum_flags |= CSUM_TCP_IPV6;
 			sum = in6_cksum_pseudo(pi->ip6,
 			    m->m_pkthdr.len - cksum_offset,
 			    IPPROTO_TCP, 0);
 #endif
 		} else {
+#ifdef INET
 			m->m_pkthdr.csum_flags |= CSUM_TCP;
 			sum = in_pseudo(pi->ip->ip_src.s_addr,
 			    pi->ip->ip_dst.s_addr,
 			    htons(IPPROTO_TCP + (m->m_pkthdr.len -
 				    cksum_offset)));
+#endif
 		}
 		m_copyback(m, offsetof(struct tcphdr, th_sum) +
 		    cksum_offset, sizeof(sum), (caddr_t)&sum);
 	}
 	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
 
 	
 	/* for TSO, pseudo_hdr_offset holds mss.
 	 * The firmware figures out where to put
 	 * the checksum by parsing the header. */
 	pseudo_hdr_offset = htobe16(mss);
 
 	if (pi->ip6) {
 		/*
 		 * for IPv6 TSO, the "checksum offset" is re-purposed
 		 * to store the TCP header len
 		 */
 		cksum_offset = (pi->tcp->th_off << 2);
 	}
 
 	tx = &ss->tx;
 	req = tx->req_list;
 	seg = tx->seg_list;
 	cnt = 0;
 	rdma_count = 0;
 	/* "rdma_count" is the number of RDMAs belonging to the
 	 * current packet BEFORE the current send request. For
 	 * non-TSO packets, this is equal to "count".
 	 * For TSO packets, rdma_count needs to be reset
 	 * to 0 after a segment cut.
 	 *
 	 * The rdma_count field of the send request is
 	 * the number of RDMAs of the packet starting at
 	 * that request. For TSO send requests with one or more cuts
 	 * in the middle, this is the number of RDMAs starting
 	 * after the last cut in the request. All previous
 	 * segments before the last cut implicitly have 1 RDMA.
 	 *
 	 * Since the number of RDMAs is not known beforehand,
 	 * it must be filled-in retroactively - after each
 	 * segmentation cut or at the end of the entire packet.
 	 */
 
 	while (busdma_seg_cnt) {
 		/* Break the busdma segment up into pieces*/
 		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
 		high_swapped = 	htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
 		len = seg->ds_len;
 
 		while (len) {
 			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
 			seglen = len;
 			cum_len_next = cum_len + seglen;
 			/* retroactively update the request that started
 			   the current run of RDMAs */
 			(req-rdma_count)->rdma_count = rdma_count + 1;
 			if (__predict_true(cum_len >= 0)) {
 				/* payload */
 				chop = (cum_len_next > mss);
 				cum_len_next = cum_len_next % mss;
 				next_is_first = (cum_len_next == 0);
 				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
 				flags_next |= next_is_first *
 					MXGEFW_FLAGS_FIRST;
 				/* branch-free: rdma_count becomes -1 when
 				   either chop or next_is_first is set, then
 				   0 if it was a chop that does not end
 				   exactly on an mss boundary */
 				rdma_count |= -(chop | next_is_first);
 				rdma_count += chop & !next_is_first;
 			} else if (cum_len_next >= 0) {
 				/* header ends */
 				rdma_count = -1;
 				cum_len_next = 0;
 				seglen = -cum_len;
 				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
 				flags_next = MXGEFW_FLAGS_TSO_PLD |
 					MXGEFW_FLAGS_FIRST | 
 					(small * MXGEFW_FLAGS_SMALL);
 			    }
 			
 			req->addr_high = high_swapped;
 			req->addr_low = htobe32(low);
 			req->pseudo_hdr_offset = pseudo_hdr_offset;
 			req->pad = 0;
 			req->rdma_count = 1;
 			req->length = htobe16(seglen);
 			req->cksum_offset = cksum_offset;
 			req->flags = flags | ((cum_len & 1) *
 					      MXGEFW_FLAGS_ALIGN_ODD);
 			low += seglen;
 			len -= seglen;
 			cum_len = cum_len_next;
 			flags = flags_next;
 			req++;
 			cnt++;
 			rdma_count++;
 			/* for IPv4 the checksum offset is counted down
 			   across requests; for IPv6 it holds the TCP
 			   header length and stays fixed */
 			if (cksum_offset != 0 && !pi->ip6) {
 				if (__predict_false(cksum_offset > seglen))
 					cksum_offset -= seglen;
 				else
 					cksum_offset = 0;
 			}
 			if (__predict_false(cnt > tx->max_desc))
 				goto drop;
 		}
 		busdma_seg_cnt--;
 		seg++;
 	}
 	(req-rdma_count)->rdma_count = rdma_count;
 
 	/* walk backwards tagging requests with TSO_LAST, stopping after
 	   the first one that carries TSO_CHOP or FIRST */
 	do {
 		req--;
 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
 
 	/* flag the ring slot of the final request of this packet */
 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
 	mxge_submit_req(tx, tx->req_list, cnt);
 #ifdef IFNET_BUF_RING
 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
 		/* tell the NIC to start polling this slice */
 		*tx->send_go = 1;
 		tx->queue_active = 1;
 		tx->activate++;
 		wmb();
 	}
 #endif
 	return;
 
 drop:
 	/* too many descriptors: release the DMA map and the packet, and
 	   report the condition once */
 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
 	m_freem(m);
 	ss->oerrors++;
 	if (!once) {
 		printf("tx->max_desc exceeded via TSO!\n");
 		printf("mss = %d, %ld, %d!\n", mss,
 		       (long)seg - (long)tx->seg_list, tx->max_desc);
 		once = 1;
 	}
 	return;
 
 }
 
 #endif /* IFCAP_TSO4 */
 
 #ifdef MXGE_NEW_VLAN_API
 /* 
  * We reproduce the software vlan tag insertion from
  * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
  * vlan tag insertion. We need to advertise this in order to have the
  * vlan interface respect our csum offload flags.
  */
 static struct mbuf *
 mxge_vlan_tag_insert(struct mbuf *m)
 {
 	struct ether_vlan_header *vhdr;
 	char *base;
 
 	/* make room for ETHER_VLAN_ENCAP_LEN bytes in front of the frame */
 	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT);
 	if (__predict_false(m == NULL))
 		return (NULL);
 
 	/* the full 802.1Q header has to sit in the first mbuf */
 	if (m->m_len < sizeof(*vhdr)) {
 		m = m_pullup(m, sizeof(*vhdr));
 		if (__predict_false(m == NULL))
 			return (NULL);
 	}
 
 	/*
 	 * Slide the two Ethernet addresses down over the freshly
 	 * prepended bytes, then fill in the encapsulation protocol
 	 * and the tag; the original ethertype is left where it was.
 	 */
 	vhdr = mtod(m, struct ether_vlan_header *);
 	base = (char *)vhdr;
 	bcopy(base + ETHER_VLAN_ENCAP_LEN, base,
 	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
 	vhdr->evl_encap_proto = htons(ETHERTYPE_VLAN);
 	vhdr->evl_tag = htons(m->m_pkthdr.ether_vtag);
 
 	/* the tag now lives in the frame itself, not in the pkthdr */
 	m->m_flags &= ~M_VLANTAG;
 	return (m);
 }
 #endif /* MXGE_NEW_VLAN_API */
 
 /*
  * Map one outgoing mbuf chain for DMA and hand it to the NIC as a list
  * of send requests on the slice's transmit ring.
  *
  * ss: slice whose tx ring receives the frame.
  * m:  the frame; after this call the caller must not touch it again
  *     (it is either attached to tx->info[] for later completion or
  *     freed on the drop paths below).
  *
  * TSO frames are delegated to mxge_encap_tso() once mapped.  On any
  * failure the frame is dropped and ss->oerrors is incremented.
  */
 static void
 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
 {
 	struct mxge_pkt_info pi = {0,0,0,0};
 	mxge_softc_t *sc;
 	mcp_kreq_ether_send_t *req;
 	bus_dma_segment_t *seg;
 	struct mbuf *m_tmp;
 	struct ifnet *ifp;
 	mxge_tx_ring_t *tx;
 	int cnt, cum_len, err, i, idx, odd_flag;
 	uint16_t pseudo_hdr_offset;
         uint8_t flags, cksum_offset;
 
 
 	sc = ss->sc;
 	ifp = sc->ifp;
 	tx = &ss->tx;
 
 #ifdef MXGE_NEW_VLAN_API
 	/* insert the vlan tag in software; on failure the helper returns
 	   NULL, so there is no mbuf left for us to free */
 	if (m->m_flags & M_VLANTAG) {
 		m = mxge_vlan_tag_insert(m);
 		if (__predict_false(m == NULL))
 			goto drop_without_m;
 	}
 #endif
 	/* header offsets are only needed when some offload is requested */
 	if (m->m_pkthdr.csum_flags &
 	    (CSUM_TSO | CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
 		if (mxge_parse_tx(ss, m, &pi))
 			goto drop;
 	}
 
 	/* (try to) map the frame for DMA */
 	idx = tx->req & tx->mask;
 	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
 				      m, tx->seg_list, &cnt, 
 				      BUS_DMA_NOWAIT);
 	if (__predict_false(err == EFBIG)) {
 		/* Too many segments in the chain.  Try
 		   to defrag */
 		m_tmp = m_defrag(m, M_NOWAIT);
 		if (m_tmp == NULL) {
 			goto drop;
 		}
 		ss->tx.defrag++;
 		m = m_tmp;
 		err = bus_dmamap_load_mbuf_sg(tx->dmat, 
 					      tx->info[idx].map,
 					      m, tx->seg_list, &cnt, 
 					      BUS_DMA_NOWAIT);
 	}
 	if (__predict_false(err != 0)) {
 		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
 			      " packet len = %d\n", err, m->m_pkthdr.len);
 		goto drop;
 	}
 	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
 			BUS_DMASYNC_PREWRITE);
 	/* remember the mbuf on the first slot so mxge_tx_done can free it */
 	tx->info[idx].m = m;
 
 #if IFCAP_TSO4
 	/* TSO is different enough, we handle it in another routine */
 	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
 		mxge_encap_tso(ss, m, cnt, &pi);
 		return;
 	}
 #endif
 
 	req = tx->req_list;
 	cksum_offset = 0;
 	pseudo_hdr_offset = 0;
 	flags = MXGEFW_FLAGS_NO_TSO;
 
 	/* checksum offloading? */
 	if (m->m_pkthdr.csum_flags &
 	    (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
 		/* checksum starts right after the IP header located by
 		   mxge_parse_tx(); csum_data gives the offset of the
 		   checksum field relative to that point */
 		cksum_offset = pi.ip_off + pi.ip_hlen;
 		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
 		req->cksum_offset = cksum_offset;
 		flags |= MXGEFW_FLAGS_CKSUM;
 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
 	} else {
 		odd_flag = 0;
 	}
 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
 		flags |= MXGEFW_FLAGS_SMALL;
 
 	/* convert segments into a request list */
 	cum_len = 0;
 	seg = tx->seg_list;
 	/* only the first request carries FIRST; the loop ORs in the rest */
 	req->flags = MXGEFW_FLAGS_FIRST;
 	for (i = 0; i < cnt; i++) {
 		req->addr_low = 
 			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
 		req->addr_high = 
 			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
 		req->length = htobe16(seg->ds_len);
 		req->cksum_offset = cksum_offset;
 		/* the offset is relative to each request, so consume the
 		   bytes covered by this segment */
 		if (cksum_offset > seg->ds_len)
 			cksum_offset -= seg->ds_len;
 		else
 			cksum_offset = 0;
 		req->pseudo_hdr_offset = pseudo_hdr_offset;
 		req->pad = 0; /* complete solid 16-byte block */
 		req->rdma_count = 1;
 		/* odd_flag is set only when the segment starts at an odd
 		   byte offset and checksum offload is on */
 		req->flags |= flags | ((cum_len & 1) * odd_flag);
 		cum_len += seg->ds_len;
 		seg++;
 		req++;
 		/* pre-clear the next slot so the |= above (and the runt
 		   pad below) start from zero */
 		req->flags = 0;
 	}
 	req--;
 	/* pad runts to 60 bytes */
 	if (cum_len < 60) {
 		req++;
 		req->addr_low = 
 			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
 		req->addr_high = 
 			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
 		req->length = htobe16(60 - cum_len);
 		req->cksum_offset = 0;
 		req->pseudo_hdr_offset = pseudo_hdr_offset;
 		req->pad = 0; /* complete solid 16-byte block */
 		req->rdma_count = 1;
 		req->flags |= flags | ((cum_len & 1) * odd_flag);
 		cnt++;
 	}
 
 	/* the first request advertises the total number of requests */
 	tx->req_list[0].rdma_count = cnt;
 #if 0
 	/* print what the firmware will see */
 	for (i = 0; i < cnt; i++) {
 		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
 		    "cso:%d, flags:0x%x, rdma:%d\n",
 		    i, (int)ntohl(tx->req_list[i].addr_high),
 		    (int)ntohl(tx->req_list[i].addr_low),
 		    (int)ntohs(tx->req_list[i].length),
 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
 		    tx->req_list[i].rdma_count);
 	}
 	printf("--------------\n");
 #endif
 	/* flag the last slot of the packet; mxge_tx_done counts a packet
 	   as complete when it reaches a flagged slot */
 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
 	mxge_submit_req(tx, tx->req_list, cnt);
 #ifdef IFNET_BUF_RING
 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
 		/* tell the NIC to start polling this slice */
 		*tx->send_go = 1;
 		tx->queue_active = 1;
 		tx->activate++;
 		wmb();
 	}
 #endif
 	return;
 
 drop:
 	m_freem(m);
 drop_without_m:
 	ss->oerrors++;
 	return;
 }
 
 #ifdef IFNET_BUF_RING
 static void
 mxge_qflush(struct ifnet *ifp)
 {
 	mxge_softc_t *softc = ifp->if_softc;
 	mxge_tx_ring_t *ring;
 	struct mbuf *pending;
 	int n;
 
 	/* empty every slice's buf_ring while holding that ring's lock */
 	for (n = 0; n < softc->num_slices; n++) {
 		ring = &softc->ss[n].tx;
 		mtx_lock(&ring->mtx);
 		for (;;) {
 			pending = buf_ring_dequeue_sc(ring->br);
 			if (pending == NULL)
 				break;
 			m_freem(pending);
 		}
 		mtx_unlock(&ring->mtx);
 	}
 
 	/* then let the generic code flush the interface queue */
 	if_qflush(ifp);
 }
 
 static inline void
 mxge_start_locked(struct mxge_slice_state *ss)
 {
 	struct ifnet *ifp = ss->sc->ifp;
 	mxge_tx_ring_t *ring = &ss->tx;
 	struct mbuf *pkt;
 
 	/* keep feeding the NIC while more than max_desc slots are free */
 	for (;;) {
 		if (!((ring->mask - (ring->req - ring->done)) >
 		    ring->max_desc))
 			break;
 
 		pkt = drbr_dequeue(ifp, ring->br);
 		if (pkt == NULL)
 			return;
 
 		/* let BPF see it, then give it to the nic */
 		BPF_MTAP(ifp, pkt);
 		mxge_encap(ss, pkt);
 	}
 
 	/*
 	 * Ran out of transmit slots: mark the slice stalled, but only
 	 * if it is not already marked and work is still queued.
 	 */
 	if ((ss->if_drv_flags & IFF_DRV_OACTIVE) != 0)
 		return;
 	if (drbr_empty(ifp, ring->br))
 		return;
 	ss->if_drv_flags |= IFF_DRV_OACTIVE;
 	ring->stall++;
 }
 
 static int
 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
 {
 	struct ifnet *ifp = ss->sc->ifp;
 	mxge_tx_ring_t *ring = &ss->tx;
 	int rc;
 
 	/* not running, or stalled: just park the frame on the buf_ring */
 	if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
 	    IFF_DRV_RUNNING)
 		return (drbr_enqueue(ifp, ring->br, m));
 
 	if (!drbr_needs_enqueue(ifp, ring->br) &&
 	    ((ring->mask - (ring->req - ring->done)) > ring->max_desc)) {
 		/* nothing queued ahead of us and room on the ring:
 		   show the frame to BPF and send it directly */
 		BPF_MTAP(ifp, m);
 		mxge_encap(ss, m);
 	} else {
 		rc = drbr_enqueue(ifp, ring->br, m);
 		if (rc != 0)
 			return (rc);
 	}
 
 	/* drain whatever is waiting on the buf_ring */
 	if (!drbr_empty(ifp, ring->br))
 		mxge_start_locked(ss);
 	return (0);
 }
 
 static int
 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
 {
 	mxge_softc_t *softc = ifp->if_softc;
 	mxge_tx_ring_t *ring;
 	int which, rc;
 
 	/* pick the slice from the flow id; num_slices always power of 2 */
 	which = m->m_pkthdr.flowid;
 	which &= (softc->num_slices - 1);
 	ring = &softc->ss[which].tx;
 
 	/* lock is contended: queue the frame for the current holder */
 	if (!mtx_trylock(&ring->mtx))
 		return (drbr_enqueue(ifp, ring->br, m));
 
 	rc = mxge_transmit_locked(&softc->ss[which], m);
 	mtx_unlock(&ring->mtx);
 	return (rc);
 }
 
 #else
 
 static inline void
 mxge_start_locked(struct mxge_slice_state *ss)
 {
 	mxge_softc_t *softc = ss->sc;
 	struct ifnet *ifp = softc->ifp;
 	mxge_tx_ring_t *ring = &ss->tx;
 	struct mbuf *pkt;
 
 	/* keep feeding the NIC while more than max_desc slots are free */
 	for (;;) {
 		if (!((ring->mask - (ring->req - ring->done)) >
 		    ring->max_desc))
 			break;
 
 		IFQ_DRV_DEQUEUE(&ifp->if_snd, pkt);
 		if (pkt == NULL)
 			return;
 
 		/* let BPF see it, then give it to the nic */
 		BPF_MTAP(ifp, pkt);
 		mxge_encap(ss, pkt);
 	}
 
 	/* ran out of transmit slots: flag the interface once */
 	if ((softc->ifp->if_drv_flags & IFF_DRV_OACTIVE) != 0)
 		return;
 	softc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
 	ring->stall++;
 }
 #endif
 static void
 mxge_start(struct ifnet *ifp)
 {
 	mxge_softc_t *softc;
 	struct mxge_slice_state *slice0;
 
 	softc = ifp->if_softc;
 
 	/* only use the first slice for now */
 	slice0 = &softc->ss[0];
 
 	mtx_lock(&slice0->tx.mtx);
 	mxge_start_locked(slice0);
 	mtx_unlock(&slice0->tx.mtx);
 }
 
 /*
  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
  * at most 32 bytes at a time, so as to avoid involving the software
  * pio handler in the nic.   We re-write the first segment's low
  * DMA address to mark it valid only after we write the entire chunk
  * in a burst
  */
 /*
  * Push eight receive descriptors from host memory (src) to the NIC
  * (dst) as two 4-descriptor PIO bursts.
  *
  * dst: first of eight descriptor slots on the NIC.
  * src: first of eight host-side descriptors; src[0].addr_low is
  *      temporarily overwritten and restored before returning.
  */
 static inline void
 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
 		mcp_kreq_ether_recv_t *src)
 {
 	uint32_t low;
 
 	/* stash the real low address and copy a placeholder instead, so
 	   the first descriptor is not valid while the burst is in flight */
 	low = src->addr_low;
 	src->addr_low = 0xffffffff;
 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
 	wmb();
 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
 	wmb();
 	/* restore the host copy, then publish the real address to the
 	   NIC as the final write */
 	src->addr_low = low;
 	dst->addr_low = low;
 	wmb();
 }
 
 static int
 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
 {
 	mxge_rx_ring_t *ring = &ss->rx_small;
 	bus_dma_segment_t dseg;
 	struct mbuf *fresh;
 	int nsegs, rc;
 
 	fresh = m_gethdr(M_NOWAIT, MT_DATA);
 	if (fresh == NULL) {
 		ring->alloc_fail++;
 		rc = ENOBUFS;
 	} else {
 		fresh->m_len = MHLEN;
 		rc = bus_dmamap_load_mbuf_sg(ring->dmat, map, fresh,
 					     &dseg, &nsegs, BUS_DMA_NOWAIT);
 		if (rc == 0) {
 			/* record the new buffer and its bus address */
 			ring->info[idx].m = fresh;
 			ring->shadow[idx].addr_low =
 				htobe32(MXGE_LOWPART_TO_U32(dseg.ds_addr));
 			ring->shadow[idx].addr_high =
 				htobe32(MXGE_HIGHPART_TO_U32(dseg.ds_addr));
 		} else {
 			m_free(fresh);
 		}
 	}
 
 	/*
 	 * Descriptors go to the NIC in groups of eight; the group is
 	 * submitted when its last slot is reached, whether or not the
 	 * allocation above succeeded.
 	 */
 	if ((idx & 7) == 7)
 		mxge_submit_8rx(&ring->lanai[idx - 7],
 				&ring->shadow[idx - 7]);
 	return (rc);
 }
 
 static int
 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
 {
 	mxge_rx_ring_t *ring = &ss->rx_big;
 	bus_dma_segment_t dsegs[3];
 	struct mbuf *fresh;
 	int nsegs, rc, i;
 
 	fresh = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, ring->cl_size);
 	if (fresh == NULL) {
 		ring->alloc_fail++;
 		rc = ENOBUFS;
 	} else {
 		fresh->m_len = ring->mlen;
 		rc = bus_dmamap_load_mbuf_sg(ring->dmat, map, fresh,
 					     dsegs, &nsegs, BUS_DMA_NOWAIT);
 		if (rc != 0) {
 			m_free(fresh);
 		} else {
 			/* record the new buffer and its bus address */
 			ring->info[idx].m = fresh;
 			ring->shadow[idx].addr_low =
 				htobe32(MXGE_LOWPART_TO_U32(dsegs[0].ds_addr));
 			ring->shadow[idx].addr_high =
 				htobe32(MXGE_HIGHPART_TO_U32(dsegs[0].ds_addr));
 
 #if MXGE_VIRT_JUMBOS
 			/* remaining DMA segments fill the following slots */
 			for (i = 1; i < nsegs; i++) {
 				ring->shadow[idx + i].addr_low =
 					htobe32(MXGE_LOWPART_TO_U32(dsegs[i].ds_addr));
 				ring->shadow[idx + i].addr_high =
 					htobe32(MXGE_HIGHPART_TO_U32(dsegs[i].ds_addr));
 			}
 #endif
 		}
 	}
 
 	/*
 	 * Walk the nbufs slots this buffer occupies and submit a group
 	 * of eight descriptors whenever the last slot of a group is
 	 * passed; this happens on the failure paths as well.
 	 */
 	for (i = 0; i < ring->nbufs; i++, idx++) {
 		if ((idx & 7) == 7)
 			mxge_submit_8rx(&ring->lanai[idx - 7],
 					&ring->shadow[idx - 7]);
 	}
 	return (rc);
 }
 
 #ifdef INET6
 
 /*
  * Sum `len' bytes starting at `raw' as 16-bit words (in memory order)
  * and fold the result to 16 bits.  Words are consumed two bytes at a
  * time, so an odd `len' still reads a whole trailing word.  The sum is
  * not complemented.
  */
 static uint16_t
 mxge_csum_generic(uint16_t *raw, int len)
 {
 	uint32_t acc = 0;
 	int left;
 
 	for (left = len; left > 0; left -= 2)
 		acc += *raw++;
 
 	/* two folds are enough to absorb every carry */
 	acc = (acc & 0xffff) + (acc >> 16);
 	acc = (acc & 0xffff) + (acc >> 16);
 	return (uint16_t)acc;
 }
 
 /*
  * Validate the TCP/UDP checksum of a received IPv6 frame using the
  * firmware-supplied partial checksum.
  *
  * p:    start of the IPv6 header.
  * m:    the received mbuf (used for its packet length and to walk
  *       extension headers).
  * csum: partial checksum reported for the frame.
  *
  * Returns 1 when the payload is neither TCP nor UDP; otherwise the
  * complemented pseudo-header checksum, which callers compare to 0.
  */
 static inline uint16_t
 mxge_rx_csum6(void *p, struct mbuf *m, uint32_t csum)
 {
 	uint32_t partial;
 	int nxt, cksum_offset;
 	struct ip6_hdr *ip6 = p;
 	uint16_t c;
 
 	nxt = ip6->ip6_nxt;
 	cksum_offset = sizeof (*ip6) + ETHER_HDR_LEN;
 	/* extension headers present: let ip6_lasthdr find the final
 	   header and its offset */
 	if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP) {
 		cksum_offset = ip6_lasthdr(m, ETHER_HDR_LEN,
 					   IPPROTO_IPV6, &nxt);
 		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
 			return (1);
 	}
 
 	/*
 	 * IPv6 headers do not contain a checksum, and hence
 	 * do not checksum to zero, so they don't "fall out"
 	 * of the partial checksum calculation like IPv4
 	 * headers do.  We need to fix the partial checksum by
 	 * subtracting the checksum of the IPv6 header.
 	 */
 
 	partial = mxge_csum_generic((uint16_t *)ip6, cksum_offset -
 				    ETHER_HDR_LEN);
 	/* ones-complement subtraction: add the complement, then add back
 	   the carry out of 32 bits and fold to 16 bits */
 	csum += ~partial;
 	csum +=	 (csum < ~partial);
 	csum = (csum >> 16) + (csum & 0xFFFF);
 	csum = (csum >> 16) + (csum & 0xFFFF);
 	c = in6_cksum_pseudo(ip6, m->m_pkthdr.len - cksum_offset, nxt,
 			     csum);
-
-//	printf("%d %d %x %x %x %x %x\n", m->m_pkthdr.len, cksum_offset, c, csum, ocsum, partial, d);
 	c ^= 0xffff;
 	return (c);
 }
 #endif /* INET6 */
 /* 
  *  Myri10GE hardware checksums are not valid if the sender
  *  padded the frame with non-zero padding.  This is because
  *  the firmware just does a simple 16-bit 1s complement
  *  checksum across the entire frame, excluding the first 14
  *  bytes.  It is best simply to verify the checksum and
  *  tell the stack about it only if the checksum is good.
  */
 
 /*
  * Check the firmware-supplied checksum of a received frame.
  *
  * m:    received mbuf, starting at the Ethernet header.
  * csum: partial checksum reported for the frame.
  *
  * Returns 0 when the checksum verifies.  Returns 1 when the frame is
  * not IPv4/IPv6 TCP or UDP, or when receive checksum offload for that
  * family is disabled on the receiving interface; any other non-zero
  * value means the checksum did not verify.
  */
 static inline uint16_t
 mxge_rx_csum(struct mbuf *m, int csum)
 {
 	struct ether_header *eh;
 #ifdef INET
 	struct ip *ip;
 #endif
+#if defined(INET) || defined(INET6)
 	int cap = m->m_pkthdr.rcvif->if_capenable;
+#endif
 	uint16_t c, etype;
 
 
 	eh = mtod(m, struct ether_header *);
 	etype = ntohs(eh->ether_type);
 	switch (etype) {
 #ifdef INET
 	case ETHERTYPE_IP:
 		if ((cap & IFCAP_RXCSUM) == 0)
 			return (1);
 		ip = (struct ip *)(eh + 1);
 		if (ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP)
 			return (1);
 		/* add the pseudo header: addresses, protocol and the
 		   payload length (total length minus the IP header) */
 		c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
 			      htonl(ntohs(csum) + ntohs(ip->ip_len) -
 				    (ip->ip_hl << 2) + ip->ip_p));
 		c ^= 0xffff;
 		break;
 #endif
 #ifdef INET6
 	case ETHERTYPE_IPV6:
 		if ((cap & IFCAP_RXCSUM_IPV6) == 0)
 			return (1);
 		c = mxge_rx_csum6((eh + 1), m, csum);
 		break;
 #endif
 	default:
 		/* unknown ethertype: report "not verified" */
 		c = 1;
 	}
 	return (c);
 }
 
 /*
  * Strip the 802.1Q header from a received frame, record the tag on
  * the mbuf and correct the firmware checksum accordingly.
  *
  * m:    received mbuf beginning with an Ethernet VLAN header.
  * csum: in/out partial checksum, network byte order on entry and exit.
  */
 static void
 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
 {
 	struct ether_vlan_header *vhdr;
 	uint32_t encap, sum;
 
 	vhdr = mtod(m, struct ether_vlan_header *);
 
 	/*
 	 * The firmware summed everything after what it took to be the
 	 * end of the ethernet header, which includes the
 	 * ETHER_VLAN_ENCAP_LEN bytes found at ETHER_HDR_LEN.  Remove
 	 * them with a ones-complement subtraction done in host order.
 	 */
 	sum = ntohs(*csum);
 	encap = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
 	sum += ~encap;
 	sum += (sum < ~encap);
 	sum = (sum >> 16) + (sum & 0xFFFF);
 	sum = (sum >> 16) + (sum & 0xFFFF);
 
 	/* later consumers expect network byte order again */
 	*csum = htons(sum);
 
 	/* save the tag */
 #ifdef MXGE_NEW_VLAN_API
 	m->m_pkthdr.ether_vtag = ntohs(vhdr->evl_tag);
 #else
 	{
 		struct m_tag *tag;
 
 		tag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
 				  M_NOWAIT);
 		/* without a tag the header is left in place */
 		if (tag == NULL)
 			return;
 		VLAN_TAG_VALUE(tag) = ntohs(vhdr->evl_tag);
 		m_tag_prepend(m, tag);
 	}
 
 #endif
 	m->m_flags |= M_VLANTAG;
 
 	/*
 	 * Drop the 802.1q header: move the Ethernet addresses forward
 	 * over it and trim the front of the mbuf.  The encapsulated
 	 * ethertype already sits where it needs to be.
 	 */
 	bcopy((char *)vhdr, (char *)vhdr + ETHER_VLAN_ENCAP_LEN,
 	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
 	m_adj(m, ETHER_VLAN_ENCAP_LEN);
 }
 
 
 /*
  * Complete one frame received into the big-buffer ring: replace the
  * buffer, fix up the mbuf and pass it to LRO or up the stack.
  *
  * len/csum come from the receive-done entry; lro is non-zero when LRO
  * is enabled on the interface.
  */
 static inline void
 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len,
 		 uint32_t csum, int lro)
 {
 	mxge_softc_t *softc = ss->sc;
 	struct ifnet *ifp = softc->ifp;
 	mxge_rx_ring_t *ring = &ss->rx_big;
 	struct mbuf *pkt;
 	bus_dmamap_t used_map;
 	int slot;
 
 	slot = ring->cnt & ring->mask;
 	ring->cnt += ring->nbufs;
 
 	/* hold on to the filled mbuf, then try to put a new one in its
 	   slot */
 	pkt = ring->info[slot].m;
 	if (mxge_get_buf_big(ss, ring->extra_map, slot)) {
 		/* drop the frame -- the old mbuf is re-cycled */
 		ifp->if_ierrors++;
 		return;
 	}
 
 	/* unmap the received buffer */
 	used_map = ring->info[slot].map;
 	bus_dmamap_sync(ring->dmat, used_map, BUS_DMASYNC_POSTREAD);
 	bus_dmamap_unload(ring->dmat, used_map);
 
 	/* the spare map now backs the slot; the old map is the spare */
 	ring->info[slot].map = ring->extra_map;
 	ring->extra_map = used_map;
 
 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
 	 * aligned */
 	pkt->m_data += MXGEFW_PAD;
 
 	pkt->m_pkthdr.rcvif = ifp;
 	pkt->m_pkthdr.len = len;
 	pkt->m_len = pkt->m_pkthdr.len;
 	ss->ipackets++;
 
 	if (mtod(pkt, struct ether_header *)->ether_type ==
 	    htons(ETHERTYPE_VLAN))
 		mxge_vlan_tag_remove(pkt, &csum);
 
 	/* if the checksum is valid, mark it in the mbuf header */
 	if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
 	    (mxge_rx_csum(pkt, csum) == 0)) {
 		/* Tell the stack that the  checksum is good */
 		pkt->m_pkthdr.csum_data = 0xffff;
 		pkt->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
 			CSUM_DATA_VALID;
 
 #if defined(INET) || defined (INET6)
 		/*
 		 * NOTE(review): mxge_rx_done_small passes csum as the
 		 * last argument here, this path passes 0 -- confirm
 		 * which one tcp_lro_rx expects.
 		 */
 		if (lro && (tcp_lro_rx(&ss->lc, pkt, 0) == 0))
 			return;
 #endif
 	}
 
 	/* flowid only valid if RSS hashing is enabled */
 	if (softc->num_slices > 1) {
 		pkt->m_pkthdr.flowid = (ss - softc->ss);
 		pkt->m_flags |= M_FLOWID;
 	}
 
 	/* pass the frame up the stack */
 	(*ifp->if_input)(ifp, pkt);
 }
 
 /*
  * Complete one frame received into the small-buffer ring: replace the
  * buffer, fix up the mbuf and pass it to LRO or up the stack.
  *
  * len/csum come from the receive-done entry; lro is non-zero when LRO
  * is enabled on the interface.
  */
 static inline void
 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len,
 		   uint32_t csum, int lro)
 {
 	mxge_softc_t *softc = ss->sc;
 	struct ifnet *ifp = softc->ifp;
 	mxge_rx_ring_t *ring = &ss->rx_small;
 	struct mbuf *pkt;
 	bus_dmamap_t used_map;
 	int slot;
 
 	slot = ring->cnt & ring->mask;
 	ring->cnt++;
 
 	/* hold on to the filled mbuf, then try to put a new one in its
 	   slot */
 	pkt = ring->info[slot].m;
 	if (mxge_get_buf_small(ss, ring->extra_map, slot)) {
 		/* drop the frame -- the old mbuf is re-cycled */
 		ifp->if_ierrors++;
 		return;
 	}
 
 	/* unmap the received buffer */
 	used_map = ring->info[slot].map;
 	bus_dmamap_sync(ring->dmat, used_map, BUS_DMASYNC_POSTREAD);
 	bus_dmamap_unload(ring->dmat, used_map);
 
 	/* the spare map now backs the slot; the old map is the spare */
 	ring->info[slot].map = ring->extra_map;
 	ring->extra_map = used_map;
 
 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
 	 * aligned */
 	pkt->m_data += MXGEFW_PAD;
 
 	pkt->m_pkthdr.rcvif = ifp;
 	pkt->m_pkthdr.len = len;
 	pkt->m_len = pkt->m_pkthdr.len;
 	ss->ipackets++;
 
 	if (mtod(pkt, struct ether_header *)->ether_type ==
 	    htons(ETHERTYPE_VLAN))
 		mxge_vlan_tag_remove(pkt, &csum);
 
 	/* if the checksum is valid, mark it in the mbuf header */
 	if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
 	    (mxge_rx_csum(pkt, csum) == 0)) {
 		/* Tell the stack that the  checksum is good */
 		pkt->m_pkthdr.csum_data = 0xffff;
 		pkt->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
 			CSUM_DATA_VALID;
 
 #if defined(INET) || defined (INET6)
 		/*
 		 * NOTE(review): mxge_rx_done_big passes 0 as the last
 		 * argument here, this path passes csum -- confirm
 		 * which one tcp_lro_rx expects.
 		 */
 		if (lro && (tcp_lro_rx(&ss->lc, pkt, csum) == 0))
 			return;
 #endif
 	}
 
 	/* flowid only valid if RSS hashing is enabled */
 	if (softc->num_slices > 1) {
 		pkt->m_pkthdr.flowid = (ss - softc->ss);
 		pkt->m_flags |= M_FLOWID;
 	}
 
 	/* pass the frame up the stack */
 	(*ifp->if_input)(ifp, pkt);
 }
 
 /*
  * Drain the slice's receive-done ring, dispatching each completed
  * frame to the small or big buffer handler, then flush any active
  * LRO entries.
  */
 static inline void
 mxge_clean_rx_done(struct mxge_slice_state *ss)
 {
 	mxge_rx_done_t *done = &ss->rx_done;
 	uint16_t frame_len, frame_csum;
 	int handled, use_lro;
 
 	use_lro = ss->sc->ifp->if_capenable & IFCAP_LRO;
 
 	/* a non-zero length marks an entry the NIC has filled in */
 	for (handled = 0; done->entry[done->idx].length != 0; ) {
 		frame_len = ntohs(done->entry[done->idx].length);
 		done->entry[done->idx].length = 0;
 		frame_csum = done->entry[done->idx].checksum;
 
 		/* frames too long for a small buffer came from the big
 		   ring */
 		if (frame_len > (MHLEN - MXGEFW_PAD))
 			mxge_rx_done_big(ss, frame_len, frame_csum, use_lro);
 		else
 			mxge_rx_done_small(ss, frame_len, frame_csum,
 					   use_lro);
 
 		done->cnt++;
 		done->idx = done->cnt & done->mask;
 
 		/* limit potential for livelock */
 		if (__predict_false(++handled > done->mask / 2))
 			break;
 	}
 #if defined(INET)  || defined (INET6)
 	/* hand every pending LRO entry to the stack */
 	while (!SLIST_EMPTY(&ss->lc.lro_active)) {
 		struct lro_entry *active = SLIST_FIRST(&ss->lc.lro_active);
 
 		SLIST_REMOVE_HEAD(&ss->lc.lro_active, next);
 		tcp_lro_flush(&ss->lc, active);
 	}
 #endif
 }
 
 
 /*
  * Reclaim transmit slots up to the packet count reported by the NIC
  * (mcp_idx), free the completed mbufs, and restart or quiesce the
  * transmit queue as appropriate.
  */
 static inline void
 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
 {
 	struct ifnet *ifp;
 	mxge_tx_ring_t *ring;
 	struct mbuf *sent;
 	int slot;
 	int *drv_flags;
 
 	ring = &ss->tx;
 	ifp = ss->sc->ifp;
 
 	while (ring->pkt_done != mcp_idx) {
 		slot = ring->done & ring->mask;
 		ring->done++;
 
 		/* mbuf and DMA map only attached to the first
 		   segment per-mbuf */
 		sent = ring->info[slot].m;
 		if (sent != NULL) {
 			ss->obytes += sent->m_pkthdr.len;
 			if (sent->m_flags & M_MCAST)
 				ss->omcasts++;
 			ss->opackets++;
 			ring->info[slot].m = NULL;
 			bus_dmamap_unload(ring->dmat, ring->info[slot].map);
 			m_freem(sent);
 		}
 
 		/* a flagged slot is the last one of a packet */
 		if (ring->info[slot].flag) {
 			ring->info[slot].flag = 0;
 			ring->pkt_done++;
 		}
 	}
 
 	/* If we have space, clear IFF_OACTIVE to tell the stack that
 	   its OK to send packets */
 #ifdef IFNET_BUF_RING
 	drv_flags = &ss->if_drv_flags;
 #else
 	drv_flags = &ifp->if_drv_flags;
 #endif
 	mtx_lock(&ring->mtx);
 	if ((*drv_flags) & IFF_DRV_OACTIVE &&
 	    ring->req - ring->done < (ring->mask + 1)/4) {
 		*drv_flags &= ~IFF_DRV_OACTIVE;
 		ring->wake++;
 		mxge_start_locked(ss);
 	}
 #ifdef IFNET_BUF_RING
 	if ((ss->sc->num_slices > 1) && (ring->req == ring->done)) {
 		/* let the NIC stop polling this queue, since there
 		 * are no more transmits pending */
 		*ring->send_stop = 1;
 		ring->queue_active = 0;
 		ring->deactivate++;
 		wmb();
 	}
 #endif
 	mtx_unlock(&ring->mtx);
 
 }
 
 /*
  * Media lookup table for XFP modules, consumed by mxge_media_probe().
  * Each row holds an ifmedia subtype (0 where no subtype is given), the
  * value matched against the byte read from the module, and a
  * printable name.  Row 0 is compared for equality with the whole
  * byte; the remaining rows are tested as bit masks, in order.
  */
 static struct mxge_media_type mxge_xfp_media_types[] =
 {
 	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
 	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
 	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
 	{0,		(1 << 5),	"10GBASE-ER"},
 	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
 	{0,		(1 << 3),	"10GBASE-SW"},
 	{0,		(1 << 2),	"10GBASE-LW"},
 	{0,		(1 << 1),	"10GBASE-EW"},
 	{0,		(1 << 0),	"Reserved"}
 };
 /*
  * Media lookup table for SFP+ modules, same layout and matching rules
  * as the XFP table above: row 0 (value 0) is an exact match, the other
  * rows are bit masks.
  */
 static struct mxge_media_type mxge_sfp_media_types[] =
 {
 	{IFM_10G_TWINAX,      0,	"10GBASE-Twinax"},
 	{0,		(1 << 7),	"Reserved"},
 	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
 	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
 	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"},
 	{IFM_10G_TWINAX,(1 << 0),	"10GBASE-Twinax"}
 };
 
 /*
  * Register media_type as a full-duplex Ethernet media word, select it,
  * and remember it as the current media.
  */
 static void
 mxge_media_set(mxge_softc_t *sc, int media_type)
 {
 	const int word = IFM_ETHER | IFM_FDX | media_type;
 
 	ifmedia_add(&sc->media, word, 0, NULL);
 	ifmedia_set(&sc->media, word);
 	sc->current_media = media_type;
 
 	/* mirror the selected entry's media word into ifm_media */
 	sc->media.ifm_media = sc->media.ifm_cur->ifm_media;
 }
 
 /*
  * Reset the ifmedia list to "auto" and derive the connector type
  * (sc->connector) from the cached EEPROM product code string.
  *
  * Nothing beyond the reset happens when the product code is missing
  * or contains fewer than three dashes.
  */
 static void
 mxge_media_init(mxge_softc_t *sc)
 {
 	char *ptr;
 	char next;
 	int i;
 
 	ifmedia_removeall(&sc->media);
 	mxge_media_set(sc, IFM_AUTO);
 
 	/* 
 	 * parse the product code to determine the interface type
 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
 	 * after the 3rd dash in the driver's cached copy of the
 	 * EEPROM's product code string.
 	 */
 	ptr = sc->product_code_string;
 	if (ptr == NULL) {
 		device_printf(sc->dev, "Missing product code\n");
 		return;
 	}
 
 	/* each pass leaves ptr just past the dash it found */
 	for (i = 0; i < 3; i++, ptr++) {
 		ptr = strchr(ptr, '-');
 		if (ptr == NULL) {
 			device_printf(sc->dev,
 				      "only %d dashes in PC?!?\n", i);
 			return;
 		}
 	}
 
 	/*
 	 * Some codes carry one character between the dash and the type
 	 * letter (e.g. "-2S"), so the following character is examined
 	 * too -- but only when the string does not already end at ptr,
 	 * otherwise we would read past the terminating NUL.
 	 */
 	next = (*ptr != '\0') ? *(ptr + 1) : '\0';
 
 	if (*ptr == 'C' || next == 'C') {
 		/* -C is CX4 */
 		sc->connector = MXGE_CX4;
 		mxge_media_set(sc, IFM_10G_CX4);
 	} else if (*ptr == 'Q') {
 		/* -Q is Quad Ribbon Fiber */
 		sc->connector = MXGE_QRF;
 		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
 		/* FreeBSD has no media type for Quad ribbon fiber */
 	} else if (*ptr == 'R') {
 		/* -R is XFP */
 		sc->connector = MXGE_XFP;
 	} else if (*ptr == 'S' || next == 'S') {
 		/* -S or -2S is SFP+ */
 		sc->connector = MXGE_SFP;
 	} else {
 		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
 	}
 }
 
 /*
  * Determine the media type for a NIC.  Some XFPs will identify
  * themselves only when their link is up, so this is initiated via a
  * link up interrupt.  However, this can potentially take up to
  * several milliseconds, so it is run via the watchdog routine, rather
  * than in the interrupt handler itself. 
  */
 static void
 mxge_media_probe(mxge_softc_t *sc)
 {
 	struct mxge_media_type *table = NULL;
 	mxge_cmd_t cmd;
 	char *cage;
 	uint32_t reg;
 	int i, rc, waited_ms, nentries, hit;
 
 	sc->need_media_probe = 0;
 
 	/* pick the lookup table and the module register to read */
 	if (sc->connector == MXGE_XFP) {
 		/* -R is XFP */
 		table = mxge_xfp_media_types;
 		nentries = sizeof (mxge_xfp_media_types) /
 			sizeof (mxge_xfp_media_types[0]);
 		reg = MXGE_XFP_COMPLIANCE_BYTE;
 		cage = "XFP";
 	} else if (sc->connector == MXGE_SFP) {
 		/* -S or -2S is SFP+ */
 		table = mxge_sfp_media_types;
 		nentries = sizeof (mxge_sfp_media_types) /
 			sizeof (mxge_sfp_media_types[0]);
 		cage = "SFP+";
 		reg = 3;
 	} else {
 		/* nothing to do; media type cannot change */
 		return;
 	}
 
 	/*
 	 * The NIC has a module cage, so ask the firmware to read the
 	 * module's 10GbE compliance register over I2C.  Only one byte
 	 * is requested, which may still take over a millisecond.
 	 */
 	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
 	cmd.data1 = reg;
 	rc = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
 	if (rc != MXGEFW_CMD_OK) {
 		if (rc == MXGEFW_CMD_ERROR_I2C_FAILURE)
 			device_printf(sc->dev, "failed to read XFP\n");
 		if (rc == MXGEFW_CMD_ERROR_I2C_ABSENT)
 			device_printf(sc->dev,
 				      "Type R/S with no XFP!?!?\n");
 		return;
 	}
 
 	/* now we wait for the data to be cached: retry once per
 	   millisecond, at most 50 times, while the firmware is busy */
 	cmd.data0 = reg;
 	rc = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
 	waited_ms = 0;
 	while ((rc == EBUSY) && (waited_ms < 50)) {
 		DELAY(1000);
 		cmd.data0 = reg;
 		rc = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
 		waited_ms++;
 	}
 	if (rc != MXGEFW_CMD_OK) {
 		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
 			      cage, rc, waited_ms);
 		return;
 	}
 
 	/* entry 0 must match the byte exactly; the others are bit masks
 	   and the first one that matches wins */
 	hit = -1;
 	if (cmd.data0 == table[0].bitmask) {
 		hit = 0;
 	} else {
 		for (i = 1; i < nentries; i++) {
 			if (cmd.data0 & table[i].bitmask) {
 				hit = i;
 				break;
 			}
 		}
 	}
 
 	if (hit < 0) {
 		if (mxge_verbose)
 			device_printf(sc->dev, "%s media 0x%x unknown\n",
 				      cage, cmd.data0);
 		return;
 	}
 
 	if (mxge_verbose)
 		device_printf(sc->dev, "%s:%s\n", cage, table[hit].name);
 
 	/* rebuild the media list only when the media actually changed */
 	if (sc->current_media != table[hit].flag) {
 		mxge_media_init(sc);
 		mxge_media_set(sc, table[hit].flag);
 	}
 }
 
 /*
  * Interrupt handler for one slice.
  *
  * arg is the slice state.  The handler processes transmit completions
  * and received frames until the firmware stats block no longer reports
  * work, updates link state from the stats block (first slice only),
  * and finally writes the irq claim registers.
  */
 static void
 mxge_intr(void *arg)
 {
 	struct mxge_slice_state *ss = arg;
 	mxge_softc_t *sc = ss->sc;
 	mcp_irq_data_t *stats = ss->fw_stats;
 	mxge_tx_ring_t *tx = &ss->tx;
 	mxge_rx_done_t *rx_done = &ss->rx_done;
 	uint32_t send_done_count;
 	uint8_t valid;
 
 
 #ifndef IFNET_BUF_RING
 	/* an interrupt on a non-zero slice is implicitly valid
 	   since MSI-X irqs are not shared */
 	if (ss != sc->ss) {
 		mxge_clean_rx_done(ss);
 		*ss->irq_claim = be32toh(3);
 		return;
 	}
 #endif
 
 	/* make sure the DMA has finished */
 	if (!stats->valid) {
 		return;
 	}
 	/* keep a copy; stats->valid is cleared below but bit 0 is still
 	   needed when claiming the interrupt */
 	valid = stats->valid;
 
 	if (sc->legacy_irq) {
 		/* lower legacy IRQ  */
 		*sc->irq_deassert = 0;
 		if (!mxge_deassert_wait)
 			/* don't wait for conf. that irq is low */
 			stats->valid = 0;
 	} else {
 		stats->valid = 0;
 	}
 
 	/* loop while waiting for legacy irq deassertion */
 	do {
 		/* check for transmit completes and receives */
 		send_done_count = be32toh(stats->send_done_count);
 		while ((send_done_count != tx->pkt_done) ||
 		       (rx_done->entry[rx_done->idx].length != 0)) {
 			if (send_done_count != tx->pkt_done)
 				mxge_tx_done(ss, (int)send_done_count);
 			mxge_clean_rx_done(ss);
 			/* re-read: more sends may have completed while
 			   we were working */
 			send_done_count = be32toh(stats->send_done_count);
 		}
 		if (sc->legacy_irq && mxge_deassert_wait)
 			wmb();
 	} while (*((volatile uint8_t *) &stats->valid));
 
 	/* fw link & error stats meaningful only on the first slice */
 	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
 		if (sc->link_state != stats->link_up) {
 			sc->link_state = stats->link_up;
 			if (sc->link_state) {
 				if_link_state_change(sc->ifp, LINK_STATE_UP);
 				if_initbaudrate(sc->ifp, IF_Gbps(10));
 				if (mxge_verbose)
 					device_printf(sc->dev, "link up\n");
 			} else {
 				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
 				sc->ifp->if_baudrate = 0;
 				if (mxge_verbose)
 					device_printf(sc->dev, "link down\n");
 			}
 			/* picked up later by the media probe code */
 			sc->need_media_probe = 1;
 		}
 		if (sc->rdma_tags_available !=
 		    be32toh(stats->rdma_tags_available)) {
 			sc->rdma_tags_available = 
 				be32toh(stats->rdma_tags_available);
 			device_printf(sc->dev, "RDMA timed out! %d tags "
 				      "left\n", sc->rdma_tags_available);
 		}
 
 		/* the firmware counted link drops since the last update */
 		if (stats->link_down) {
 			sc->down_cnt += stats->link_down;
 			sc->link_state = 0;
 			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
 		}
 	}
 
 	/* check to see if we have rx token to pass back */
 	if (valid & 0x1)
 	    *ss->irq_claim = be32toh(3);
 	/* the second claim word is written unconditionally */
 	*(ss->irq_claim + 1) = be32toh(3);
 }
 
 static void
 mxge_init(void *arg)
 {
 	mxge_softc_t *softc;
 	struct ifnet *ifnet;
 
 	softc = arg;
 	ifnet = softc->ifp;
 
 	/* open the device under the driver lock unless it already runs;
 	   the result of mxge_open() is deliberately ignored */
 	mtx_lock(&softc->driver_mtx);
 	if (!(ifnet->if_drv_flags & IFF_DRV_RUNNING))
 		(void) mxge_open(softc);
 	mtx_unlock(&softc->driver_mtx);
 }
 
 
 
 /*
  * Release every mbuf still attached to the slice's receive and
  * transmit rings, unloading its DMA map first.
  */
 static void
 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
 {
 	int slot;
 
 #if defined(INET) || defined(INET6)
 	tcp_lro_free(&ss->lc);
 #endif
 	/* big receive buffers */
 	for (slot = 0; slot <= ss->rx_big.mask; slot++) {
 		if (ss->rx_big.info[slot].m != NULL) {
 			bus_dmamap_unload(ss->rx_big.dmat,
 					  ss->rx_big.info[slot].map);
 			m_freem(ss->rx_big.info[slot].m);
 			ss->rx_big.info[slot].m = NULL;
 		}
 	}
 
 	/* small receive buffers */
 	for (slot = 0; slot <= ss->rx_small.mask; slot++) {
 		if (ss->rx_small.info[slot].m != NULL) {
 			bus_dmamap_unload(ss->rx_small.dmat,
 					  ss->rx_small.info[slot].map);
 			m_freem(ss->rx_small.info[slot].m);
 			ss->rx_small.info[slot].m = NULL;
 		}
 	}
 
 	/* transmit ring used only on the first slice */
 	if (ss->tx.info == NULL)
 		return;
 
 	/* pending transmits: clear the end-of-packet flag on every slot */
 	for (slot = 0; slot <= ss->tx.mask; slot++) {
 		ss->tx.info[slot].flag = 0;
 		if (ss->tx.info[slot].m != NULL) {
 			bus_dmamap_unload(ss->tx.dmat,
 					  ss->tx.info[slot].map);
 			m_freem(ss->tx.info[slot].m);
 			ss->tx.info[slot].m = NULL;
 		}
 	}
 }
 
 /*
  * Drop the mbufs held by every slice of the adapter.
  */
 static void
 mxge_free_mbufs(mxge_softc_t *sc)
 {
 	int s = 0;

 	while (s < sc->num_slices) {
 		mxge_free_slice_mbufs(&sc->ss[s]);
 		s++;
 	}
 }
 
 /*
  * Tear down the ring bookkeeping of one slice: the rx completion DMA
  * area, the tx request/segment scratch buffers, the rx shadow rings,
  * and for each of the tx / rx_small / rx_big rings the per-slot DMA
  * maps, the DMA tag and the host info array.  Each pointer is reset
  * to NULL once released so the routine may be called on a partially
  * constructed slice.
  */
 static void
 mxge_free_slice_rings(struct mxge_slice_state *ss)
 {
 	int slot;

 	if (ss->rx_done.entry != NULL) {
 		mxge_dma_free(&ss->rx_done.dma);
 		ss->rx_done.entry = NULL;
 	}

 	if (ss->tx.req_bytes != NULL) {
 		free(ss->tx.req_bytes, M_DEVBUF);
 		ss->tx.req_bytes = NULL;
 	}

 	if (ss->tx.seg_list != NULL) {
 		free(ss->tx.seg_list, M_DEVBUF);
 		ss->tx.seg_list = NULL;
 	}

 	if (ss->rx_small.shadow != NULL) {
 		free(ss->rx_small.shadow, M_DEVBUF);
 		ss->rx_small.shadow = NULL;
 	}

 	if (ss->rx_big.shadow != NULL) {
 		free(ss->rx_big.shadow, M_DEVBUF);
 		ss->rx_big.shadow = NULL;
 	}

 	/* transmit ring: per-slot maps, then the tag, then the array */
 	if (ss->tx.info != NULL) {
 		if (ss->tx.dmat != NULL) {
 			for (slot = 0; slot <= ss->tx.mask; slot++)
 				bus_dmamap_destroy(ss->tx.dmat,
 						   ss->tx.info[slot].map);
 			bus_dma_tag_destroy(ss->tx.dmat);
 		}
 		free(ss->tx.info, M_DEVBUF);
 		ss->tx.info = NULL;
 	}

 	/* small receive ring, including its spare map */
 	if (ss->rx_small.info != NULL) {
 		if (ss->rx_small.dmat != NULL) {
 			for (slot = 0; slot <= ss->rx_small.mask; slot++)
 				bus_dmamap_destroy(ss->rx_small.dmat,
 						   ss->rx_small.info[slot].map);
 			bus_dmamap_destroy(ss->rx_small.dmat,
 					   ss->rx_small.extra_map);
 			bus_dma_tag_destroy(ss->rx_small.dmat);
 		}
 		free(ss->rx_small.info, M_DEVBUF);
 		ss->rx_small.info = NULL;
 	}

 	/* big receive ring, including its spare map */
 	if (ss->rx_big.info != NULL) {
 		if (ss->rx_big.dmat != NULL) {
 			for (slot = 0; slot <= ss->rx_big.mask; slot++)
 				bus_dmamap_destroy(ss->rx_big.dmat,
 						   ss->rx_big.info[slot].map);
 			bus_dmamap_destroy(ss->rx_big.dmat,
 					   ss->rx_big.extra_map);
 			bus_dma_tag_destroy(ss->rx_big.dmat);
 		}
 		free(ss->rx_big.info, M_DEVBUF);
 		ss->rx_big.info = NULL;
 	}
 }
 
 /*
  * Release the ring resources of every slice of the adapter.
  */
 static void
 mxge_free_rings(mxge_softc_t *sc)
 {
 	int s = 0;

 	while (s < sc->num_slices) {
 		mxge_free_slice_rings(&sc->ss[s]);
 		s++;
 	}
 }
 
 /*
  * Allocate the host-side ring state for one slice.
  *
  * Receive side (every slice): shadow rings, host info rings, one DMA
  * tag each for small and big buffers, and a DMA map per ring slot plus
  * one spare ("extra") map per ring.
  *
  * Transmit side (only the first slice unless IFNET_BUF_RING is
  * defined): request copy block, busdma segment list, host info ring,
  * DMA tag and a DMA map per slot.
  *
  * ss:              slice to populate
  * rx_ring_entries: number of slots in each receive ring
  * tx_ring_entries: number of slots in the transmit ring
  *
  * Returns 0 on success or an errno value.  Nothing is released here on
  * failure; mxge_alloc_rings() calls mxge_free_rings() when this
  * routine reports an error.
  */
 static int
 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
 		       int tx_ring_entries)
 {
 	mxge_softc_t *sc = ss->sc;
 	size_t bytes;
 	int err, i;

 	err = ENOMEM;

 	/* allocate per-slice receive resources */

 	/* masks are (entries - 1); the loops below rely on that */
 	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
 	ss->rx_done.mask = (2 * rx_ring_entries) - 1;

 	/*
 	 * allocate the rx shadow rings
 	 * NOTE(review): malloc(9) with M_WAITOK is documented not to
 	 * return NULL, so the NULL checks in this function appear to be
 	 * purely defensive -- confirm against malloc(9).
 	 */
 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
 	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
 	if (ss->rx_small.shadow == NULL)
 		return err;

 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
 	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
 	if (ss->rx_big.shadow == NULL)
 		return err;

 	/* allocate the rx host info rings */
 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
 	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
 	if (ss->rx_small.info == NULL)
 		return err;

 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
 	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
 	if (ss->rx_big.info == NULL)
 		return err;

 	/* allocate the rx busdma resources */
 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
 				 1,			/* alignment */
 				 4096,			/* boundary */
 				 BUS_SPACE_MAXADDR,	/* low */
 				 BUS_SPACE_MAXADDR,	/* high */
 				 NULL, NULL,		/* filter */
 				 MHLEN,			/* maxsize */
 				 1,			/* num segs */
 				 MHLEN,			/* maxsegsize */
 				 BUS_DMA_ALLOCNOW,	/* flags */
 				 NULL, NULL,		/* lock */
 				 &ss->rx_small.dmat);	/* tag */
 	if (err != 0) {
 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
 			      err);
 		return err;
 	}

 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
 				 1,			/* alignment */
 #if MXGE_VIRT_JUMBOS
 				 4096,			/* boundary */
 #else
 				 0,			/* boundary */
 #endif
 				 BUS_SPACE_MAXADDR,	/* low */
 				 BUS_SPACE_MAXADDR,	/* high */
 				 NULL, NULL,		/* filter */
 				 3*4096,		/* maxsize */
 #if MXGE_VIRT_JUMBOS
 				 3,			/* num segs */
 				 4096,			/* maxsegsize*/
 #else
 				 1,			/* num segs */
 				 MJUM9BYTES,		/* maxsegsize*/
 #endif
 				 BUS_DMA_ALLOCNOW,	/* flags */
 				 NULL, NULL,		/* lock */
 				 &ss->rx_big.dmat);	/* tag */
 	if (err != 0) {
 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
 			      err);
 		return err;
 	}
 	/* one map per small-rx slot, plus a spare */
 	for (i = 0; i <= ss->rx_small.mask; i++) {
 		err = bus_dmamap_create(ss->rx_small.dmat, 0,
 					&ss->rx_small.info[i].map);
 		if (err != 0) {
 			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
 				      err);
 			return err;
 		}
 	}
 	err = bus_dmamap_create(ss->rx_small.dmat, 0,
 				&ss->rx_small.extra_map);
 	if (err != 0) {
 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
 			      err);
 		return err;
 	}

 	/* one map per big-rx slot, plus a spare */
 	for (i = 0; i <= ss->rx_big.mask; i++) {
 		err = bus_dmamap_create(ss->rx_big.dmat, 0,
 					&ss->rx_big.info[i].map);
 		if (err != 0) {
 			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
 				      err);
 			return err;
 		}
 	}
 	err = bus_dmamap_create(ss->rx_big.dmat, 0,
 				&ss->rx_big.extra_map);
 	if (err != 0) {
 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
 			      err);
 		return err;
 	}

 	/* now allocate TX resources */

 #ifndef IFNET_BUF_RING
 	/* only use a single TX ring for now */
 	if (ss != ss->sc->ss)
 		return 0;
 #endif

 	ss->tx.mask = tx_ring_entries - 1;
 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);


 	/*
 	 * allocate the tx request copy block
 	 *
 	 * "err" is 0 at this point (the last bus_dmamap_create()
 	 * succeeded), so the allocation-failure exits below must return
 	 * ENOMEM explicitly rather than "err".
 	 */
 	bytes = 8 +
 		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
 	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
 	if (ss->tx.req_bytes == NULL)
 		return ENOMEM;
 	/* ensure req_list entries are aligned to 8 bytes */
 	ss->tx.req_list = (mcp_kreq_ether_send_t *)
 		((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);

 	/* allocate the tx busdma segment list */
 	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
 	ss->tx.seg_list = (bus_dma_segment_t *)
 		malloc(bytes, M_DEVBUF, M_WAITOK);
 	if (ss->tx.seg_list == NULL)
 		return ENOMEM;

 	/* allocate the tx host info ring */
 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
 	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
 	if (ss->tx.info == NULL)
 		return ENOMEM;

 	/* allocate the tx busdma resources */
 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
 				 1,			/* alignment */
 				 sc->tx_boundary,	/* boundary */
 				 BUS_SPACE_MAXADDR,	/* low */
 				 BUS_SPACE_MAXADDR,	/* high */
 				 NULL, NULL,		/* filter */
 				 65536 + 256,		/* maxsize */
 				 ss->tx.max_desc - 2,	/* num segs */
 				 sc->tx_boundary,	/* maxsegsz */
 				 BUS_DMA_ALLOCNOW,	/* flags */
 				 NULL, NULL,		/* lock */
 				 &ss->tx.dmat);		/* tag */

 	if (err != 0) {
 		device_printf(sc->dev, "Err %d allocating tx dmat\n",
 			      err);
 		return err;
 	}

 	/* now use these tags to setup dmamaps for each slot
 	   in the ring */
 	for (i = 0; i <= ss->tx.mask; i++) {
 		err = bus_dmamap_create(ss->tx.dmat, 0,
 					&ss->tx.info[i].map);
 		if (err != 0) {
 			device_printf(sc->dev, "Err %d  tx dmamap\n",
 				      err);
 			return err;
 		}
 	}
 	return 0;

 }
 
 /*
  * Size and allocate the rings of every slice.
  *
  * The transmit ring size is obtained from the firmware; the receive
  * ring size comes from sc->rx_ring_size.  The interface send queue
  * length is set to one less than the number of transmit entries.
  *
  * Returns 0 on success.  On any failure every slice's rings are
  * released through mxge_free_rings() and the error is returned.
  */
 static int
 mxge_alloc_rings(mxge_softc_t *sc)
 {
 	mxge_cmd_t cmd;
 	int tx_ring_size;
 	int tx_ring_entries, rx_ring_entries;
 	int err, slice;

 	/* get ring sizes */
 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
 	if (err != 0) {
 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
 		goto abort;
 	}
 	/* read the reply only once the command is known to have succeeded */
 	tx_ring_size = cmd.data0;

 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
 	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
 	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
 	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
 	IFQ_SET_READY(&sc->ifp->if_snd);

 	for (slice = 0; slice < sc->num_slices; slice++) {
 		err = mxge_alloc_slice_rings(&sc->ss[slice],
 					     rx_ring_entries,
 					     tx_ring_entries);
 		if (err != 0)
 			goto abort;
 	}
 	return 0;

 abort:
 	mxge_free_rings(sc);
 	return err;

 }
 
 
 /*
  * Pick the receive "big buffer" geometry for a given MTU.
  *
  * mtu:          interface MTU
  * big_buf_size: out - buffer size to report to the firmware
  * cl_size:      out - mbuf cluster size to allocate
  * nbufs:        out - number of big buffers used per frame
  *
  * The frame size considered is the MTU plus the Ethernet header, a
  * VLAN encapsulation and MXGEFW_PAD.
  */
 static void
 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
 {
 	int frame = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;

 	if (frame < MCLBYTES) {
 		/* easy, everything fits in a single buffer */
 		*big_buf_size = MCLBYTES;
 		*cl_size = MCLBYTES;
 		*nbufs = 1;
 	} else if (frame < MJUMPAGESIZE) {
 		/* still easy, everything still fits in a single buffer */
 		*big_buf_size = MJUMPAGESIZE;
 		*cl_size = MJUMPAGESIZE;
 		*nbufs = 1;
 	} else {
 #if MXGE_VIRT_JUMBOS
 		/* now we need to use virtually contiguous buffers */
 		*cl_size = MJUM9BYTES;
 		*big_buf_size = 4096;
 		*nbufs = mtu / 4096 + 1;
 		/* needs to be a power of two, so round up */
 		if (*nbufs == 3)
 			*nbufs = 4;
 #else
 		*cl_size = MJUM9BYTES;
 		*big_buf_size = MJUM9BYTES;
 		*nbufs = 1;
 #endif
 	}
 }
 
 static int
 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
 {
 	mxge_softc_t *sc;
 	mxge_cmd_t cmd;
 	bus_dmamap_t map;
 	int err, i, slice;
 
 
 	sc = ss->sc;
 	slice = ss - sc->ss;
 
 #if defined(INET) || defined(INET6)
 	(void)tcp_lro_init(&ss->lc);
 #endif
 	ss->lc.ifp = sc->ifp;
 	
 	/* get the lanai pointers to the send and receive rings */
 
 	err = 0;
 #ifndef IFNET_BUF_RING
 	/* We currently only send from the first slice */
 	if (slice == 0) {
 #endif
 		cmd.data0 = slice;
 		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
 		ss->tx.lanai = 
 			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
 		ss->tx.send_go = (volatile uint32_t *)
 			(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
 		ss->tx.send_stop = (volatile uint32_t *)
 		(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
 #ifndef IFNET_BUF_RING
 	}
 #endif
 	cmd.data0 = slice;
 	err |= mxge_send_cmd(sc, 
 			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
 	ss->rx_small.lanai = 
 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
 	cmd.data0 = slice;
 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
 	ss->rx_big.lanai = 
 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
 
 	if (err != 0) {
 		device_printf(sc->dev, 
 			      "failed to get ring sizes or locations\n");
 		return EIO;
 	}
 
 	/* stock receive rings */
 	for (i = 0; i <= ss->rx_small.mask; i++) {
 		map = ss->rx_small.info[i].map;
 		err = mxge_get_buf_small(ss, map, i);
 		if (err) {
 			device_printf(sc->dev, "alloced %d/%d smalls\n",
 				      i, ss->rx_small.mask + 1);
 			return ENOMEM;
 		}
 	}
 	for (i = 0; i <= ss->rx_big.mask; i++) {
 		ss->rx_big.shadow[i].addr_low = 0xffffffff;
 		ss->rx_big.shadow[i].addr_high = 0xffffffff;
 	}
 	ss->rx_big.nbufs = nbufs;
 	ss->rx_big.cl_size = cl_size;
 	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
 		ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
 	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
 		map = ss->rx_big.info[i].map;
 		err = mxge_get_buf_big(ss, map, i);
 		if (err) {
 			device_printf(sc->dev, "alloced %d/%d bigs\n",
 				      i, ss->rx_big.mask + 1);
 			return ENOMEM;
 		}
 	}
 	return 0;
 }
 
 /*
  * Bring the adapter up.
  *
  * Sequence: copy the (possibly overridden) MAC address, reset the
  * NIC, set up the RSS indirection table when more than one slice is
  * in use, tell the firmware the buffer geometry and MTU, hand it the
  * stats DMA block(s), open each slice, then issue ETHERNET_UP and mark
  * the interface running.
  *
  * Returns 0 on success or an errno value.  Failures after the buffer
  * parameters have been sent go through "abort", which frees any
  * receive/transmit mbufs already attached to the slices; earlier
  * failures return directly.
  */
 static int 
 mxge_open(mxge_softc_t *sc)
 {
 	mxge_cmd_t cmd;
 	int err, big_bytes, nbufs, slice, cl_size, i;
 	bus_addr_t bus;
 	volatile uint8_t *itable;
 	struct mxge_slice_state *ss;

 	/* Copy the MAC address in case it was overridden */
 	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);

 	err = mxge_reset(sc, 1);
 	if (err != 0) {
 		device_printf(sc->dev, "failed to reset\n");
 		return EIO;
 	}

 	if (sc->num_slices > 1) {
 		/* setup the indirection table */
 		cmd.data0 = sc->num_slices;
 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
 				    &cmd);

 		/* both statuses are OR-ed together; only zero/non-zero
 		   is meaningful in the combined value */
 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
 				     &cmd);
 		if (err != 0) {
 			device_printf(sc->dev,
 				      "failed to setup rss tables\n");
 			return err;
 		}

 		/* just enable an identity mapping */
 		itable = sc->sram + cmd.data0;
 		for (i = 0; i < sc->num_slices; i++)
 			itable[i] = (uint8_t)i;

 		cmd.data0 = 1;
 		cmd.data1 = mxge_rss_hash_type;
 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
 		if (err != 0) {
 			device_printf(sc->dev, "failed to enable slices\n");
 			return err;
 		}
 	}


 	/* derive big-buffer size, cluster size and buffers-per-frame
 	   from the current MTU */
 	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);

 	cmd.data0 = nbufs;
 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
 			    &cmd);
 	/* error is only meaningful if we're trying to set
 	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
 	if (err && nbufs > 1) {
 		device_printf(sc->dev,
 			      "Failed to set alway-use-n to %d\n",
 			      nbufs);
 		return EIO;
 	}
 	/* Give the firmware the mtu and the big and small buffer
 	   sizes.  The firmware wants the big buf size to be a power
 	   of two. Luckily, FreeBSD's clusters are powers of two */
 	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
 	cmd.data0 = MHLEN - MXGEFW_PAD;
 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
 			     &cmd);
 	cmd.data0 = big_bytes;
 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);

 	if (err != 0) {
 		device_printf(sc->dev, "failed to setup params\n");
 		goto abort;
 	}

 	/* Now give him the pointer to the stats block */
 	/* (every slice with IFNET_BUF_RING, otherwise only slice 0) */
 	for (slice = 0; 
 #ifdef IFNET_BUF_RING
 	     slice < sc->num_slices;
 #else
 	     slice < 1;
 #endif
 	     slice++) {
 		ss = &sc->ss[slice];
 		cmd.data0 =
 			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
 		cmd.data1 =
 			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
 		/* data2: block size in the low bits, slice index from
 		   bit 16 upward */
 		cmd.data2 = sizeof(struct mcp_irq_data);
 		cmd.data2 |= (slice << 16);
 		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
 	}

 	if (err != 0) {
 		/* V2 was refused: retry with the obsolete command, which
 		   is pointed at the send_done_count member of slice 0's
 		   stats block */
 		bus = sc->ss->fw_stats_dma.bus_addr;
 		bus += offsetof(struct mcp_irq_data, send_done_count);
 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
 		err = mxge_send_cmd(sc,
 				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
 				    &cmd);
 		/* Firmware cannot support multicast without STATS_DMA_V2 */
 		sc->fw_multicast_support = 0;
 	} else {
 		sc->fw_multicast_support = 1;
 	}

 	if (err != 0) {
 		device_printf(sc->dev, "failed to setup params\n");
 		goto abort;
 	}

 	/* fetch ring offsets and stock the receive rings per slice */
 	for (slice = 0; slice < sc->num_slices; slice++) {
 		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
 		if (err != 0) {
 			device_printf(sc->dev, "couldn't open slice %d\n",
 				      slice);
 			goto abort;
 		}
 	}

 	/* Finally, start the firmware running */
 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
 	if (err) {
 		device_printf(sc->dev, "Couldn't bring up link\n");
 		goto abort;
 	}
 #ifdef IFNET_BUF_RING
 	/* per-slice copies of the running/oactive flags */
 	for (slice = 0; slice < sc->num_slices; slice++) {
 		ss = &sc->ss[slice];
 		ss->if_drv_flags |= IFF_DRV_RUNNING;
 		ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
 	}
 #endif
 	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
 	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;

 	return 0;


 abort:
 	/* release whatever buffers the slices managed to acquire */
 	mxge_free_mbufs(sc);

 	return err;
 }
 
 /*
  * Take the adapter out of service.
  *
  * Clears the running flag(s), and -- unless "down" is non-zero, i.e.
  * the caller says the NIC is already down -- asks the firmware to
  * bring the link down and waits briefly for sc->down_cnt to change.
  * All ring mbufs are then released.  Always returns 0.
  */
 static int
 mxge_close(mxge_softc_t *sc, int down)
 {
 	mxge_cmd_t cmd;
 	int err, old_down_cnt;
 #ifdef IFNET_BUF_RING
 	int slice;

 	for (slice = 0; slice < sc->num_slices; slice++)
 		sc->ss[slice].if_drv_flags &= ~IFF_DRV_RUNNING;
 #endif
 	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;

 	if (down == 0) {
 		old_down_cnt = sc->down_cnt;
 		wmb();
 		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
 		if (err != 0)
 			device_printf(sc->dev,
 				      "Couldn't bring down link\n");

 		/* wait for down irq */
 		if (sc->down_cnt == old_down_cnt)
 			DELAY(10 * sc->intr_coal_delay);
 		wmb();

 		/* still unchanged after the delay: report it */
 		if (sc->down_cnt == old_down_cnt)
 			device_printf(sc->dev, "never got down irq\n");
 	}
 	mxge_free_mbufs(sc);

 	return 0;
 }
 
 /*
  * Apply the driver's PCI configuration-space settings: record the
  * PCIe link width, program (or restore) the saved device-control
  * value, and enable bus mastering and memory-space access.
  */
 static void
 mxge_setup_cfg_space(mxge_softc_t *sc)
 {
 	device_t dev = sc->dev;
 	int cap;
 	uint16_t command, link, devctl;

 	/* find the PCIe link width and set max read request to 4KB*/
 	if (pci_find_cap(dev, PCIY_EXPRESS, &cap) == 0) {
 		link = pci_read_config(dev, cap + 0x12, 2);
 		sc->link_width = (link >> 4) & 0x3f;

 		if (sc->pectl != 0) {
 			/* restore saved pectl after watchdog reset */
 			pci_write_config(dev, cap + 0x8, sc->pectl, 2);
 		} else {
 			/* first time: replace bits 14:12 with the value 5
 			   and remember the result */
 			devctl = pci_read_config(dev, cap + 0x8, 2);
 			devctl = (devctl & ~0x7000) | (5 << 12);
 			pci_write_config(dev, cap + 0x8, devctl, 2);
 			sc->pectl = devctl;
 		}
 	}

 	/* Enable DMA and Memory space access */
 	pci_enable_busmaster(dev);
 	command = pci_read_config(dev, PCIR_COMMAND, 2);
 	command |= PCIM_CMD_MEMEN;
 	pci_write_config(dev, PCIR_COMMAND, command, 2);
 }
 
 /*
  * Read the NIC's reboot status word through the vendor-specific PCI
  * capability.
  *
  * Returns the 32-bit value read back, or (uint32_t)-1 when the
  * vendor-specific capability cannot be located.
  */
 static uint32_t
 mxge_read_reboot(mxge_softc_t *sc)
 {
 	device_t dev = sc->dev;
 	/*
 	 * Capability offset.  Declared int to match how
 	 * mxge_setup_cfg_space() calls pci_find_cap().
 	 */
 	int vs;

 	/* find the vendor specific offset */
 	if (pci_find_cap(dev, PCIY_VENDOR, &vs) != 0) {
 		device_printf(sc->dev,
 			      "could not find vendor specific offset\n");
 		return (uint32_t)-1;
 	}
 	/* enable read32 mode */
 	pci_write_config(dev, vs + 0x10, 0x3, 1);
 	/* tell NIC which register to read */
 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
 	return (pci_read_config(dev, vs + 0x14, 4));
 }
 
 /*
  * Recover the NIC after the watchdog fired.
  *
  * If the PCI command register shows bus mastering disabled, the NIC is
  * taken to have rebooted: the interface is quiesced (if it was
  * running), PCI config space is restored, the firmware is reloaded and
  * the interface reopened.  Otherwise nothing is reset.  On success the
  * periodic tick callout is re-armed; on failure it is left stopped.
  *
  * Called with sc->driver_mtx held (see mxge_watchdog_task()).
  */
 static void
 mxge_watchdog_reset(mxge_softc_t *sc)
 {
 	struct pci_devinfo *dinfo;
 	struct mxge_slice_state *ss;
 	int err, running, s, num_tx_slices = 1;
 	uint32_t reboot;
 	uint16_t cmd;

 	err = ENXIO;

 	device_printf(sc->dev, "Watchdog reset!\n");

 	/*
 	 * check to see if the NIC rebooted.  If it did, then all of
 	 * PCI config space has been reset, and things like the
 	 * busmaster bit will be zero.  If this is the case, then we
 	 * must restore PCI config space before the NIC can be used
 	 * again
 	 */
 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
 	if (cmd == 0xffff) {
 		/*
 		 * maybe the watchdog caught the NIC rebooting; wait
 		 * up to 100ms for it to finish.  If it does not come
 		 * back, then give up
 		 */
 		DELAY(1000*100);
 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
 		if (cmd == 0xffff) {
 			device_printf(sc->dev, "NIC disappeared!\n");
 		}
 		/*
 		 * NOTE(review): execution continues with cmd == 0xffff,
 		 * which has the busmaster bit set and therefore takes
 		 * the "did not reboot" branch below -- confirm this is
 		 * the intended handling of a vanished device.
 		 */
 	}
 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
 		/* print the reboot status */
 		reboot = mxge_read_reboot(sc);
 		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
 			      reboot);
 		running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
 		if (running) {

 			/*
 			 * quiesce NIC so that TX routines will not try to
 			 * xmit after restoration of BAR
 			 */

 			/* Mark the link as down */
 			if (sc->link_state) {
 				sc->link_state = 0;
 				if_link_state_change(sc->ifp,
 						     LINK_STATE_DOWN);
 			}
 #ifdef IFNET_BUF_RING
 			num_tx_slices = sc->num_slices;
 #endif
 			/* grab all TX locks to ensure no tx  */
 			/* (they stay held until the matching unlock loop
 			   after the reopen attempt below) */
 			for (s = 0; s < num_tx_slices; s++) {
 				ss = &sc->ss[s];
 				mtx_lock(&ss->tx.mtx);
 			}
 			/* down=1: skip the firmware ETHERNET_DOWN command */
 			mxge_close(sc, 1);
 		}
 		/* restore PCI configuration space */
 		dinfo = device_get_ivars(sc->dev);
 		pci_cfg_restore(sc->dev, dinfo);

 		/* and redo any changes we made to our config space */
 		mxge_setup_cfg_space(sc);

 		/* reload f/w */
 		err = mxge_load_firmware(sc, 0);
 		if (err) {
 			device_printf(sc->dev,
 				      "Unable to re-load f/w\n");
 		}
 		if (running) {
 			/* only reopen if the firmware load succeeded */
 			if (!err)
 				err = mxge_open(sc);
 			/* release all TX locks */
 			for (s = 0; s < num_tx_slices; s++) {
 				ss = &sc->ss[s];
 #ifdef IFNET_BUF_RING
 				mxge_start_locked(ss);
 #endif
 				mtx_unlock(&ss->tx.mtx);
 			}
 		}
 		sc->watchdog_resets++;
 	} else {
 		device_printf(sc->dev,
 			      "NIC did not reboot, not resetting\n");
 		err = 0;
 	}
 	if (err) {
 		device_printf(sc->dev, "watchdog reset failed\n");
 	} else {
 		/* dying == 2 is set by mxge_tick() when it finds bus
 		   mastering disabled while idle; clear it on recovery */
 		if (sc->dying == 2)
 			sc->dying = 0;
 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
 	}
 }
 
 static void
 mxge_watchdog_task(void *arg, int pending)
 {
 	mxge_softc_t *sc = arg;
 
 
 	mtx_lock(&sc->driver_mtx);
 	mxge_watchdog_reset(sc);
 	mtx_unlock(&sc->driver_mtx);
 }
 
 /*
  * Log the transmit ring state of a slice suspected of being stuck.
  * The "tx" argument is overwritten with the ring of "slice"; the
  * firmware send-done counter printed is the one of the first slice.
  */
 static void
 mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
 {
 	device_t dev = sc->dev;

 	tx = &sc->ss[slice].tx;
 	device_printf(dev, "slice %d struck? ring state:\n", slice);
 	device_printf(dev, "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
 		      tx->req, tx->done, tx->queue_active);
 	device_printf(dev, "tx.activate=%d tx.deactivate=%d\n",
 		      tx->activate, tx->deactivate);
 	device_printf(dev, "pkt_done=%d fw=%d\n", tx->pkt_done,
 		      be32toh(sc->ss[0].fw_stats->send_done_count));
 }
 
 /*
  * Periodic transmit watchdog.
  *
  * For each transmit ring, compares the current request/done counters
  * with the snapshot taken on the previous call.  A ring that had work
  * outstanding at the snapshot, still has work outstanding, and has
  * made no progress is considered stuck -- unless the firmware's
  * dropped-pause counter moved, in which case flow control is blamed
  * and only a message is printed.
  *
  * Returns ENXIO after scheduling the watchdog task for a stuck ring,
  * 0 otherwise.
  */
 static int
 mxge_watchdog(mxge_softc_t *sc)
 {
 	mxge_tx_ring_t *tx;
 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
 	int i, err = 0;
 	int nrings, stalled;

 #ifdef IFNET_BUF_RING
 	nrings = sc->num_slices;
 #else
 	nrings = 1;
 #endif

 	/* see if we have outstanding transmits, which
 	   have been pending for more than mxge_ticks */
 	for (i = 0; i < nrings; i++) {
 		tx = &sc->ss[i].tx;
 		stalled = tx->req != tx->done &&
 		    tx->watchdog_req != tx->watchdog_done &&
 		    tx->done == tx->watchdog_done;
 		if (stalled) {
 			/* check for pause blocking before resetting */
 			if (tx->watchdog_rx_pause != rx_pause) {
 				device_printf(sc->dev, "Flow control blocking "
 					      "xmits, check link partner\n");
 			} else {
 				mxge_warn_stuck(sc, tx, i);
 				taskqueue_enqueue(sc->tq, &sc->watchdog_task);
 				return (ENXIO);
 			}
 		}

 		/* snapshot for the next invocation */
 		tx->watchdog_req = tx->req;
 		tx->watchdog_done = tx->done;
 		tx->watchdog_rx_pause = rx_pause;
 	}

 	if (sc->need_media_probe)
 		mxge_media_probe(sc);
 	return (err);
 }
 
 /*
  * Sum the per-slice counters into the ifnet statistics.
  *
  * Returns the number of packets (in + out) counted since the previous
  * call, computed against the ifnet totals before they are overwritten.
  */
 static u_long
 mxge_update_stats(mxge_softc_t *sc)
 {
 	struct ifnet *ifp = sc->ifp;
 	struct mxge_slice_state *ss;
 	u_long delta;
 	u_long rx_total = 0;
 	u_long tx_total = 0;
 	u_long err_total = 0;
 #ifdef IFNET_BUF_RING
 	u_long byte_total = 0;
 	u_long mcast_total = 0;
 	u_long drop_total = 0;
 #endif
 	int s;

 	for (s = 0; s < sc->num_slices; s++) {
 		ss = &sc->ss[s];
 		rx_total += ss->ipackets;
 		tx_total += ss->opackets;
 		err_total += ss->oerrors;
 #ifdef IFNET_BUF_RING
 		byte_total += ss->obytes;
 		mcast_total += ss->omcasts;
 		drop_total += ss->tx.br->br_drops;
 #endif
 	}

 	/* the delta must be taken before the totals are stored */
 	delta = (rx_total - ifp->if_ipackets);
 	delta += (tx_total - ifp->if_opackets);

 	ifp->if_ipackets = rx_total;
 	ifp->if_opackets = tx_total;
 #ifdef IFNET_BUF_RING
 	ifp->if_obytes = byte_total;
 	ifp->if_omcasts = mcast_total;
 	ifp->if_snd.ifq_drops = drop_total;
 #endif
 	ifp->if_oerrors = err_total;
 	return delta;
 }
 
 /*
  * Periodic callout.
  *
  * While the interface is running, folds the slice counters into the
  * ifnet statistics and runs the transmit watchdog on every fourth
  * invocation.  When no packets moved since the last tick, checks that
  * bus mastering is still enabled (scheduling the watchdog task if it
  * is not) and stretches the next interval by four.  The callout is
  * re-armed only when no error was detected.
  */
 static void
 mxge_tick(void *arg)
 {
 	mxge_softc_t *sc = arg;
 	u_long moved = 0;
 	int err = 0;
 	int interval = mxge_ticks;
 	uint16_t pci_cmd;

 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING) {
 		/* aggregate stats from different slices */
 		moved = mxge_update_stats(sc);
 		if (sc->watchdog_countdown == 0) {
 			err = mxge_watchdog(sc);
 			sc->watchdog_countdown = 4;
 		}
 		sc->watchdog_countdown--;
 	}

 	if (moved == 0) {
 		/* ensure NIC did not suffer h/w fault while idle */
 		pci_cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
 		if (!(pci_cmd & PCIM_CMD_BUSMASTEREN)) {
 			sc->dying = 2;
 			taskqueue_enqueue(sc->tq, &sc->watchdog_task);
 			err = ENXIO;
 		}
 		/* look less often if NIC is idle */
 		interval *= 4;
 	}

 	if (err != 0)
 		return;
 	callout_reset(&sc->co_hdl, interval, mxge_tick, sc);
 }
 
 /*
  * Media-change handler: every request is rejected with EINVAL.
  */
 static int
 mxge_media_change(struct ifnet *ifp)
 {
 	(void)ifp;
 	return (EINVAL);
 }
 
 /*
  * Change the interface MTU.
  *
  * The resulting frame length (MTU plus Ethernet header and VLAN
  * encapsulation) must lie between 60 and sc->max_mtu, else EINVAL is
  * returned.  A running interface is closed and reopened with the new
  * MTU; if that reopen fails, the previous MTU is put back and the
  * interface is reopened once more on a best-effort basis.
  *
  * Returns 0 or the error from the first reopen attempt.
  */
 static int
 mxge_change_mtu(mxge_softc_t *sc, int mtu)
 {
 	struct ifnet *ifp = sc->ifp;
 	int frame_len, saved_mtu;
 	int err = 0;

 	frame_len = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
 	if (frame_len < 60 || (frame_len > sc->max_mtu))
 		return EINVAL;

 	mtx_lock(&sc->driver_mtx);
 	saved_mtu = ifp->if_mtu;
 	ifp->if_mtu = mtu;
 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) != 0) {
 		mxge_close(sc, 0);
 		err = mxge_open(sc);
 		if (err != 0) {
 			/* roll back; the status of this reopen is ignored */
 			ifp->if_mtu = saved_mtu;
 			mxge_close(sc, 0);
 			(void) mxge_open(sc);
 		}
 	}
 	mtx_unlock(&sc->driver_mtx);
 	return err;
 }
 
 /*
  * Media-status handler: reports a valid, full-duplex Ethernet medium
  * of the currently recorded type, flagged active while the link is
  * up.  Does nothing when the ifnet has no softc attached.
  */
 static void
 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
 {
 	mxge_softc_t *sc = ifp->if_softc;

 	if (sc != NULL) {
 		ifmr->ifm_status = IFM_AVALID;
 		if (sc->link_state)
 			ifmr->ifm_status |= IFM_ACTIVE;
 		ifmr->ifm_active = IFM_ETHER | IFM_FDX;
 		ifmr->ifm_active |= sc->current_media;
 	}
 }
 
 /*
  * Interface ioctl handler.
  *
  * ifp:     interface the request is for
  * command: SIOC* request code
  * data:    request payload (a struct ifreq for the cases handled here)
  *
  * Handles address, MTU, flag, multicast, capability and media
  * requests; anything else yields ENOTTY.  Returns 0 or an errno
  * value.
  */
 static int
 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
 {
 	mxge_softc_t *sc = ifp->if_softc;
 	struct ifreq *ifr = (struct ifreq *)data;
 	int err, mask;

 	err = 0;
 	switch (command) {
 	case SIOCSIFADDR:
 	case SIOCGIFADDR:
 		err = ether_ioctl(ifp, command, data);
 		break;

 	case SIOCSIFMTU:
 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
 		break;

 	case SIOCSIFFLAGS:
 		mtx_lock(&sc->driver_mtx);
 		if (sc->dying) {
 			mtx_unlock(&sc->driver_mtx);
 			return EINVAL;
 		}
 		if (ifp->if_flags & IFF_UP) {
 			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
 				err = mxge_open(sc);
 			} else {
 				/* take care of promisc and allmulti
 				   flag changes */
 				mxge_change_promisc(sc,
 						    ifp->if_flags & IFF_PROMISC);
 				mxge_set_multicast_list(sc);
 			}
 		} else {
 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
 				mxge_close(sc, 0);
 			}
 		}
 		mtx_unlock(&sc->driver_mtx);
 		break;

 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		mtx_lock(&sc->driver_mtx);
 		mxge_set_multicast_list(sc);
 		mtx_unlock(&sc->driver_mtx);
 		break;

 	case SIOCSIFCAP:
 		mtx_lock(&sc->driver_mtx);
 		/* bits that differ between the request and current state */
 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
 		/*
 		 * NOTE(review): because of the "else if", a request that
 		 * toggles both TXCSUM and RXCSUM at once only applies the
 		 * TXCSUM change (same for the IPv6 pair below).
 		 */
 		if (mask & IFCAP_TXCSUM) {
 			if (IFCAP_TXCSUM & ifp->if_capenable) {
 				/* TSO4 depends on tx checksum offload */
 				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP);
 			} else {
 				ifp->if_capenable |= IFCAP_TXCSUM;
 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
 			}
 		} else if (mask & IFCAP_RXCSUM) {
 			if (IFCAP_RXCSUM & ifp->if_capenable) {
 				ifp->if_capenable &= ~IFCAP_RXCSUM;
 			} else {
 				ifp->if_capenable |= IFCAP_RXCSUM;
 			}
 		}
 		if (mask & IFCAP_TSO4) {
 			if (IFCAP_TSO4 & ifp->if_capenable) {
 				ifp->if_capenable &= ~IFCAP_TSO4;
 			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
 				ifp->if_capenable |= IFCAP_TSO4;
 				ifp->if_hwassist |= CSUM_TSO;
 			} else {
 				printf("mxge requires tx checksum offload"
 				       " be enabled to use TSO\n");
 				err = EINVAL;
 			}
 		}
 #if IFCAP_TSO6
 		if (mask & IFCAP_TXCSUM_IPV6) {
 			if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
 				ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6
 						       | IFCAP_TSO6);
 				/*
 				 * Clear exactly the flags the enable path
 				 * below sets; the IPv4 CSUM_UDP flag must
 				 * not be touched here.
 				 */
 				ifp->if_hwassist &= ~(CSUM_TCP_IPV6
 						      | CSUM_UDP_IPV6);
 			} else {
 				ifp->if_capenable |= IFCAP_TXCSUM_IPV6;
 				ifp->if_hwassist |= (CSUM_TCP_IPV6
 						     | CSUM_UDP_IPV6);
 			}
 		} else if (mask & IFCAP_RXCSUM_IPV6) {
 			if (IFCAP_RXCSUM_IPV6 & ifp->if_capenable) {
 				ifp->if_capenable &= ~IFCAP_RXCSUM_IPV6;
 			} else {
 				ifp->if_capenable |= IFCAP_RXCSUM_IPV6;
 			}
 		}
 		if (mask & IFCAP_TSO6) {
 			if (IFCAP_TSO6 & ifp->if_capenable) {
 				ifp->if_capenable &= ~IFCAP_TSO6;
 			} else if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
 				ifp->if_capenable |= IFCAP_TSO6;
 				ifp->if_hwassist |= CSUM_TSO;
 			} else {
 				printf("mxge requires tx checksum offload"
 				       " be enabled to use TSO\n");
 				err = EINVAL;
 			}
 		}
 #endif /*IFCAP_TSO6 */

 		if (mask & IFCAP_LRO)
 			ifp->if_capenable ^= IFCAP_LRO;
 		if (mask & IFCAP_VLAN_HWTAGGING)
 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
 		if (mask & IFCAP_VLAN_HWTSO)
 			ifp->if_capenable ^= IFCAP_VLAN_HWTSO;

 		/* VLAN TSO needs both the capability and HW tagging */
 		if (!(ifp->if_capabilities & IFCAP_VLAN_HWTSO) ||
 		    !(ifp->if_capenable & IFCAP_VLAN_HWTAGGING))
 			ifp->if_capenable &= ~IFCAP_VLAN_HWTSO;

 		mtx_unlock(&sc->driver_mtx);
 		VLAN_CAPABILITIES(ifp);

 		break;

 	case SIOCGIFMEDIA:
 		/* refresh the media information before reporting it */
 		mtx_lock(&sc->driver_mtx);
 		mxge_media_probe(sc);
 		mtx_unlock(&sc->driver_mtx);
 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
 				    &sc->media, command);
 		break;

 	default:
 		err = ENOTTY;
 	}
 	return err;
 }
 
 /*
  * Load the hw.mxge.* tunables into the driver-wide variables, coerce
  * out-of-range values to defaults, and copy the flow-control and
  * throttle settings into the softc.
  */
 static void
 mxge_fetch_tunables(mxge_softc_t *sc)
 {

 	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled", &mxge_flow_control);
 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay", &mxge_intr_coal_delay);
 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
 			  &mxge_nvidia_ecrc_enable);
 	TUNABLE_INT_FETCH("hw.mxge.force_firmware", &mxge_force_firmware);
 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait", &mxge_deassert_wait);
 	TUNABLE_INT_FETCH("hw.mxge.verbose", &mxge_verbose);
 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
 	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
 	/* both spellings are fetched; the second is fetched last */
 	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
 	TUNABLE_INT_FETCH("hw.mxge.rss_hashtype", &mxge_rss_hash_type);
 	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
 	TUNABLE_INT_FETCH("hw.mxge.throttle", &mxge_throttle);

 	/* a verbose boot forces verbose driver output */
 	if (bootverbose)
 		mxge_verbose = 1;

 	if (mxge_intr_coal_delay > 10 * 1000 || mxge_intr_coal_delay < 0)
 		mxge_intr_coal_delay = 30;

 	if (!mxge_ticks)
 		mxge_ticks = hz / 2;

 	sc->pause = mxge_flow_control;

 	if (!(mxge_rss_hash_type >= MXGEFW_RSS_HASH_TYPE_IPV4 &&
 	    mxge_rss_hash_type <= MXGEFW_RSS_HASH_TYPE_MAX))
 		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;

 	if (mxge_initial_mtu < ETHER_MIN_LEN ||
 	    mxge_initial_mtu > ETHERMTU_JUMBO)
 		mxge_initial_mtu = ETHERMTU_JUMBO;

 	/* a zero throttle is passed through unclamped */
 	if (mxge_throttle != 0 && mxge_throttle > MXGE_MAX_THROTTLE)
 		mxge_throttle = MXGE_MAX_THROTTLE;
 	if (mxge_throttle != 0 && mxge_throttle < MXGE_MIN_THROTTLE)
 		mxge_throttle = MXGE_MIN_THROTTLE;
 	sc->throttle = mxge_throttle;
 }
 
 
 /*
  * Release the per-slice state array: each slice's firmware stats DMA
  * block (with its tx buf_ring, when IFNET_BUF_RING is defined, and tx
  * mutex) and rx completion queue, then the array itself.  Safe to
  * call when the array was never allocated.
  */
 static void
 mxge_free_slices(mxge_softc_t *sc)
 {
 	struct mxge_slice_state *ss;
 	int s;

 	if (sc->ss == NULL)
 		return;

 	for (s = 0; s < sc->num_slices; s++) {
 		ss = &sc->ss[s];

 		/* the tx mutex and buf_ring are only torn down for slices
 		   that still have a stats block */
 		if (ss->fw_stats != NULL) {
 			mxge_dma_free(&ss->fw_stats_dma);
 			ss->fw_stats = NULL;
 #ifdef IFNET_BUF_RING
 			if (ss->tx.br != NULL) {
 				drbr_free(ss->tx.br, M_DEVBUF);
 				ss->tx.br = NULL;
 			}
 #endif
 			mtx_destroy(&ss->tx.mtx);
 		}

 		if (ss->rx_done.entry != NULL) {
 			mxge_dma_free(&ss->rx_done.dma);
 			ss->rx_done.entry = NULL;
 		}
 	}

 	free(sc->ss, M_DEVBUF);
 	sc->ss = NULL;
 }
 
 /*
  * Allocate the per-slice state array and, for each slice, its rx
  * completion queue.  The firmware stats block, tx mutex and (with
  * IFNET_BUF_RING) tx buf_ring are set up for every slice when
  * IFNET_BUF_RING is defined and for the first slice only otherwise.
  *
  * Returns 0 on success, the firmware command error if the rx ring
  * size cannot be queried, or ENOMEM after undoing any partial
  * allocation.
  */
 static int
 mxge_alloc_slices(mxge_softc_t *sc)
 {
 	mxge_cmd_t cmd;
 	struct mxge_slice_state *ss;
 	size_t bytes;
 	int err, s, max_intr_slots;

 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
 	if (err != 0) {
 		device_printf(sc->dev, "Cannot determine rx ring size\n");
 		return err;
 	}
 	sc->rx_ring_size = cmd.data0;
 	/* completion queue holds twice as many slots as an rx ring */
 	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));

 	bytes = sizeof (*sc->ss) * sc->num_slices;
 	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
 	if (sc->ss == NULL)
 		return (ENOMEM);

 	for (s = 0; s < sc->num_slices; s++) {
 		ss = &sc->ss[s];
 		ss->sc = sc;

 		/* allocate per-slice rx interrupt queues */
 		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
 		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
 		if (err != 0)
 			goto abort;
 		ss->rx_done.entry = ss->rx_done.dma.addr;
 		bzero(ss->rx_done.entry, bytes);

 		/*
 		 * allocate the per-slice firmware stats; stats
 		 * (including tx) are used only on the first
 		 * slice for now
 		 */
 #ifndef IFNET_BUF_RING
 		if (s > 0)
 			continue;
 #endif

 		bytes = sizeof (*ss->fw_stats);
 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma, bytes, 64);
 		if (err != 0)
 			goto abort;
 		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;

 		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
 			 "%s:tx(%d)", device_get_nameunit(sc->dev), s);
 		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
 #ifdef IFNET_BUF_RING
 		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
 					   &ss->tx.mtx);
 #endif
 	}

 	return (0);

 abort:
 	mxge_free_slices(sc);
 	return (ENOMEM);
 }
 
 /*
  * Decide how many slices (sc->num_slices) the device will use.
  *
  * The count starts at one and is only raised when multiple slices are
  * allowed by mxge_max_slices, the system has at least two CPUs, at
  * least two MSI-X messages are available, and the slice-aware (RSS)
  * firmware loads and answers the queries below.  The final count is
  * limited by the firmware's maximum, the MSI-X message count and either
  * mp_ncpus (mxge_max_slices == -1) or mxge_max_slices, and is then
  * reduced to a power of two.
  *
  * If a firmware command fails after the RSS firmware was loaded, the
  * previous firmware name is restored and that firmware is reloaded.
  */
 static void
 mxge_slice_probe(mxge_softc_t *sc)
 {
 	mxge_cmd_t cmd;
 	char *prev_fw;
 	int intr_slots, nvec, rc;
 
 	sc->num_slices = 1;
 
 	/* multiple slices disabled, or not an SMP system */
 	if (mp_ncpus < 2 || mxge_max_slices == 0 || mxge_max_slices == 1)
 		return;
 
 	/* at least two MSI-X messages are required */
 	nvec = pci_msix_count(sc->dev);
 	if (nvec < 2)
 		return;
 
 	/* load the slice-aware firmware to see what it supports */
 	prev_fw = sc->fw_name;
 	sc->fw_name = (prev_fw == mxge_fw_aligned) ?
 	    mxge_fw_rss_aligned : mxge_fw_rss_unaligned;
 	rc = mxge_load_firmware(sc, 0);
 	if (rc != 0) {
 		/*
 		 * NOTE(review): sc->fw_name still names the RSS firmware
 		 * on this path (it is only restored at restore_fw below);
 		 * confirm that this is intended.
 		 */
 		device_printf(sc->dev, "Falling back to a single slice\n");
 		return;
 	}
 
 	/* send a reset command to check that the card is alive */
 	memset(&cmd, 0, sizeof (cmd));
 	rc = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
 	if (rc != 0) {
 		device_printf(sc->dev, "failed reset\n");
 		goto restore_fw;
 	}
 
 	/* the interrupt queue size is derived from the rx ring size */
 	rc = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
 	if (rc != 0) {
 		device_printf(sc->dev, "Cannot determine rx ring size\n");
 		goto restore_fw;
 	}
 	intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
 
 	cmd.data0 = intr_slots * sizeof (struct mcp_slot);
 	rc = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
 	if (rc != 0) {
 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
 		goto restore_fw;
 	}
 
 	/* firmware's limit on the number of slices */
 	rc = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
 	if (rc != 0) {
 		device_printf(sc->dev,
 			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
 		goto restore_fw;
 	}
 	sc->num_slices = cmd.data0;
 	if (sc->num_slices > nvec)
 		sc->num_slices = nvec;
 
 	/* -1 caps at the CPU count, any other value caps at the tunable */
 	if (mxge_max_slices == -1) {
 		if (sc->num_slices > mp_ncpus)
 			sc->num_slices = mp_ncpus;
 	} else if (sc->num_slices > mxge_max_slices) {
 		sc->num_slices = mxge_max_slices;
 	}
 
 	/* step down until the count is a power of two */
 	while ((sc->num_slices & (sc->num_slices - 1)) != 0)
 		sc->num_slices--;
 
 	if (mxge_verbose)
 		device_printf(sc->dev, "using %d slices\n",
 			      sc->num_slices);
 
 	return;
 
 restore_fw:
 	sc->fw_name = prev_fw;
 	(void) mxge_load_firmware(sc, 0);
 }
 
 /*
  * Allocate and wire up one MSI-X interrupt per slice.
  *
  * Maps the MSI-X table BAR (PCIR_BAR(2)), allocates sc->num_slices
  * MSI-X messages, allocates an IRQ resource for each message (rid is
  * the slice index + 1) and installs mxge_intr() as the handler with
  * &sc->ss[i] as its argument.
  *
  * Returns 0 on success.  On failure everything acquired so far is
  * released and an errno value is returned: ENXIO when a bus resource
  * cannot be allocated, ENOSPC when fewer messages than slices were
  * granted, ENOMEM when a bookkeeping array cannot be allocated, or the
  * error reported by pci_alloc_msix() / bus_setup_intr().
  */
 static int
 mxge_add_msix_irqs(mxge_softc_t *sc)
 {
 	size_t bytes;
 	int count, err, i, rid;
 
 	rid = PCIR_BAR(2);
 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
 						    &rid, RF_ACTIVE);
 
 	if (sc->msix_table_res == NULL) {
 		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
 		return ENXIO;
 	}
 
 	count = sc->num_slices;
 	err = pci_alloc_msix(sc->dev, &count);
 	if (err != 0) {
 		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d, "
 			      "err = %d\n", sc->num_slices, err);
 		goto abort_with_msix_table;
 	}
 	if (count < sc->num_slices) {
 		/* one message per slice is needed; fewer were granted */
 		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
 			      sc->num_slices, count);
 		device_printf(sc->dev,
 			      "Try setting hw.mxge.max_slices to %d\n",
 			      count);
 		err = ENOSPC;
 		goto abort_with_msix;
 	}
 	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
 	sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
 	if (sc->msix_irq_res == NULL) {
 		err = ENOMEM;
 		goto abort_with_msix;
 	}
 
 	for (i = 0; i < sc->num_slices; i++) {
 		rid = i + 1;
 		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
 							  SYS_RES_IRQ,
 							  &rid, RF_ACTIVE);
 		if (sc->msix_irq_res[i] == NULL) {
 			device_printf(sc->dev, "couldn't allocate IRQ res"
 				      " for message %d\n", i);
 			err = ENXIO;
 			goto abort_with_res;
 		}
 	}
 
 	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
 	sc->msix_ih =  malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
 	if (sc->msix_ih == NULL) {
 		/* M_NOWAIT may fail; the array is dereferenced below */
 		err = ENOMEM;
 		goto abort_with_res;
 	}
 
 	for (i = 0; i < sc->num_slices; i++) {
 		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i], 
 				     INTR_TYPE_NET | INTR_MPSAFE,
 #if __FreeBSD_version > 700030
 				     NULL,
 #endif
 				     mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
 		if (err != 0) {
 			device_printf(sc->dev, "couldn't setup intr for "
 				      "message %d\n", i);
 			goto abort_with_intr;
 		}
 		bus_describe_intr(sc->dev, sc->msix_irq_res[i],
 				  sc->msix_ih[i], "s%d", i);
 	}
 
 	if (mxge_verbose) {
 		device_printf(sc->dev, "using %d msix IRQs:",
 			      sc->num_slices);
 		for (i = 0; i < sc->num_slices; i++)
 			printf(" %ld",  rman_get_start(sc->msix_irq_res[i]));
 		printf("\n");
 	}
 	return (0);
 
 	/*
 	 * Error unwinding: each label releases one stage and falls through
 	 * to the labels for the stages acquired before it.
 	 */
 abort_with_intr:
 	/* the array was zeroed, so only installed handlers are non-NULL */
 	for (i = 0; i < sc->num_slices; i++) {
 		if (sc->msix_ih[i] != NULL) {
 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
 					  sc->msix_ih[i]);
 			sc->msix_ih[i] = NULL;
 		}
 	}
 	free(sc->msix_ih, M_DEVBUF);
 	sc->msix_ih = NULL;
 
 
 abort_with_res:
 	for (i = 0; i < sc->num_slices; i++) {
 		rid = i + 1;
 		if (sc->msix_irq_res[i] != NULL)
 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
 					     sc->msix_irq_res[i]);
 		sc->msix_irq_res[i] = NULL;
 	}
 	free(sc->msix_irq_res, M_DEVBUF);
 	sc->msix_irq_res = NULL;
 
 
 abort_with_msix:
 	pci_release_msi(sc->dev);
 
 abort_with_msix_table:
 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
 			     sc->msix_table_res);
 
 	return err;
 }
 
 /*
  * Set up the single interrupt used when only one slice is active.
  *
  * MSI is used (IRQ rid 1) when the device reports exactly one MSI
  * message and pci_alloc_msi() succeeds; otherwise the legacy INTx line
  * (rid 0) is used and sc->legacy_irq is set.  mxge_intr() is installed
  * with &sc->ss[0] as its argument.
  *
  * Returns 0 on success, ENXIO if the IRQ resource cannot be allocated,
  * or the bus_setup_intr() error.  On any failure the IRQ resource and
  * the MSI message (if one was allocated) are released.
  */
 static int
 mxge_add_single_irq(mxge_softc_t *sc)
 {
 	int count, err, rid;
 
 	count = pci_msi_count(sc->dev);
 	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
 		rid = 1;
 	} else {
 		rid = 0;
 		sc->legacy_irq = 1;
 	}
 	sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
 					 1, RF_SHAREABLE | RF_ACTIVE);
 	if (sc->irq_res == NULL) {
 		device_printf(sc->dev, "could not alloc interrupt\n");
 		/* do not leak the MSI message allocated above */
 		if (!sc->legacy_irq)
 			pci_release_msi(sc->dev);
 		return ENXIO;
 	}
 	if (mxge_verbose)
 		device_printf(sc->dev, "using %s irq %ld\n",
 			      sc->legacy_irq ? "INTx" : "MSI",
 			      rman_get_start(sc->irq_res));
 	err = bus_setup_intr(sc->dev, sc->irq_res, 
 			     INTR_TYPE_NET | INTR_MPSAFE,
 #if __FreeBSD_version > 700030
 			     NULL,
 #endif
 			     mxge_intr, &sc->ss[0], &sc->ih);
 	if (err != 0) {
 		/* undo the resource and MSI allocations made above */
 		bus_release_resource(sc->dev, SYS_RES_IRQ,
 				     sc->legacy_irq ? 0 : 1, sc->irq_res);
 		if (!sc->legacy_irq)
 			pci_release_msi(sc->dev);
 	}
 	return err;
 }
 
 /*
  * Undo mxge_add_msix_irqs(): tear down every installed per-slice
  * handler, release the per-slice IRQ resources (rid is slice index + 1),
  * free both bookkeeping arrays, release the MSI-X table BAR and finally
  * give the MSI-X messages back.
  */
 static void
 mxge_rem_msix_irqs(mxge_softc_t *sc)
 {
 	int slice;
 
 	/* first pass: remove the handlers that were installed */
 	for (slice = 0; slice < sc->num_slices; slice++) {
 		if (sc->msix_ih[slice] == NULL)
 			continue;
 		bus_teardown_intr(sc->dev, sc->msix_irq_res[slice],
 				  sc->msix_ih[slice]);
 		sc->msix_ih[slice] = NULL;
 	}
 	free(sc->msix_ih, M_DEVBUF);
 
 	/* second pass: release the IRQ resources themselves */
 	for (slice = 0; slice < sc->num_slices; slice++) {
 		if (sc->msix_irq_res[slice] != NULL)
 			bus_release_resource(sc->dev, SYS_RES_IRQ, slice + 1,
 					     sc->msix_irq_res[slice]);
 		sc->msix_irq_res[slice] = NULL;
 	}
 	free(sc->msix_irq_res, M_DEVBUF);
 
 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
 			     sc->msix_table_res);
 
 	pci_release_msi(sc->dev);
 }
 
 /*
  * Undo mxge_add_single_irq(): remove the handler, release the IRQ
  * resource (rid 0 for legacy INTx, rid 1 for MSI) and, in the MSI case,
  * release the MSI message.
  */
 static void
 mxge_rem_single_irq(mxge_softc_t *sc)
 {
 	int irq_rid;
 
 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
 
 	irq_rid = sc->legacy_irq ? 0 : 1;
 	bus_release_resource(sc->dev, SYS_RES_IRQ, irq_rid, sc->irq_res);
 
 	if (sc->legacy_irq)
 		return;
 	pci_release_msi(sc->dev);
 }
 
 /*
  * Remove the interrupt setup: the MSI-X variant when more than one
  * slice is in use, the single-interrupt variant otherwise.
  */
 static void
 mxge_rem_irq(mxge_softc_t *sc)
 {
 	if (sc->num_slices <= 1) {
 		mxge_rem_single_irq(sc);
 		return;
 	}
 	mxge_rem_msix_irqs(sc);
 }
 
 /*
  * Install the interrupt setup: one MSI-X interrupt per slice when more
  * than one slice is in use, a single MSI/INTx interrupt otherwise.
  * Returns the error code of whichever helper was used.
  */
 static int
 mxge_add_irq(mxge_softc_t *sc)
 {
 	int err;
 
 	err = (sc->num_slices > 1) ?
 	    mxge_add_msix_irqs(sc) : mxge_add_single_irq(sc);
 
 #if 0
 	/* disabled: remove and immediately re-add the MSI-X interrupts */
 	if (err == 0 && sc->num_slices > 1) {
 		mxge_rem_msix_irqs(sc);
 		err = mxge_add_msix_irqs(sc);
 	}
 #endif
 	return err;
 }
 
 
 /*
  * Device attach method.
  *
  * Sets up, in order: the watchdog task and its taskqueue, the parent
  * DMA tag, the ifnet, the command and driver mutexes and the tick
  * callout, the memory BAR mapping, a copy of the EEPROM strings, the
  * command / zero-pad / DMA-benchmark buffers, the firmware, the slices,
  * the rings and the interrupt(s).  It then fills in the ifnet
  * capabilities and callbacks, attaches the Ethernet interface, and
  * starts the taskqueue thread and the periodic tick.
  *
  * Returns 0 on success.  On failure the abort_with_* labels at the end
  * release what had been set up so far, in reverse order, and the error
  * code is returned.
  */
 static int 
 mxge_attach(device_t dev)
 {
 	mxge_cmd_t cmd;
 	mxge_softc_t *sc = device_get_softc(dev);
 	struct ifnet *ifp;
 	int err, rid;
 
 	sc->dev = dev;
 	mxge_fetch_tunables(sc);
 
 	TASK_INIT(&sc->watchdog_task, 1, mxge_watchdog_task, sc);
 	sc->tq = taskqueue_create("mxge_taskq", M_WAITOK,
 				  taskqueue_thread_enqueue, &sc->tq);
 	if (sc->tq == NULL) {
 		err = ENOMEM;
 		goto abort_with_nothing;
 	}
 
 	/* parent tag from which the driver's other DMA tags are derived */
 	err = bus_dma_tag_create(bus_get_dma_tag(dev),	/* parent */
 				 1,			/* alignment */
 				 0,			/* boundary */
 				 BUS_SPACE_MAXADDR,	/* low */
 				 BUS_SPACE_MAXADDR,	/* high */
 				 NULL, NULL,		/* filter */
 				 65536 + 256,		/* maxsize */
 				 MXGE_MAX_SEND_DESC, 	/* num segs */
 				 65536,			/* maxsegsize */
 				 0,			/* flags */
 				 NULL, NULL,		/* lock */
 				 &sc->parent_dmat);	/* tag */
 
 	if (err != 0) {
 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
 			      err);
 		goto abort_with_tq;
 	}
 
 	ifp = sc->ifp = if_alloc(IFT_ETHER);
 	if (ifp == NULL) {
 		device_printf(dev, "can not if_alloc()\n");
 		err = ENOSPC;
 		goto abort_with_parent_dmat;
 	}
 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
 
 	/* mutex names are "<nameunit>:cmd" and "<nameunit>:drv" */
 	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
 		 device_get_nameunit(dev));
 	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
 	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
 		 "%s:drv", device_get_nameunit(dev));
 	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
 		 MTX_NETWORK_LOCK, MTX_DEF);
 
 	/* the tick callout is tied to the driver mutex */
 	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
 
 	mxge_setup_cfg_space(sc);
 	
 	/* Map the board into the kernel */
 	rid = PCIR_BARS;
 	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
 					 ~0, 1, RF_ACTIVE);
 	if (sc->mem_res == NULL) {
 		device_printf(dev, "could not map memory\n");
 		err = ENXIO;
 		goto abort_with_lock;
 	}
 	sc->sram = rman_get_virtual(sc->mem_res);
 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
 	/* the mapped region must be at least as large as the SRAM size */
 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
 		device_printf(dev, "impossible memory region size %ld\n",
 			      rman_get_size(sc->mem_res));
 		err = ENXIO;
 		goto abort_with_mem_res;
 	}
 
 	/* make NULL terminated copy of the EEPROM strings section of
 	   lanai SRAM */
 	/*
 	 * The buffer is zeroed and only MXGE_EEPROM_STRINGS_SIZE - 2 bytes
 	 * are read, so the last two bytes always remain zero.
 	 */
 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
 				rman_get_bushandle(sc->mem_res),
 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
 				sc->eeprom_strings, 
 				MXGE_EEPROM_STRINGS_SIZE - 2);
 	err = mxge_parse_strings(sc);
 	if (err != 0)
 		goto abort_with_mem_res;
 
 	/* Enable write combining for efficient use of PCIe bus */
 	mxge_enable_wc(sc);
 
 	/* Allocate the out of band dma memory */
 	err = mxge_dma_alloc(sc, &sc->cmd_dma, 
 			     sizeof (mxge_cmd_t), 64);
 	if (err != 0) 
 		goto abort_with_mem_res;
 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
 	if (err != 0) 
 		goto abort_with_cmd_dma;
 
 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
 	if (err != 0)
 		goto abort_with_zeropad_dma;
 
 	/* select & load the firmware */
 	err = mxge_select_firmware(sc);
 	if (err != 0)
 		goto abort_with_dmabench;
 	sc->intr_coal_delay = mxge_intr_coal_delay;
 
 	/* choose sc->num_slices, then allocate that many slices */
 	mxge_slice_probe(sc);
 	err = mxge_alloc_slices(sc);
 	if (err != 0)
 		goto abort_with_dmabench;
 
 	err = mxge_reset(sc, 0);
 	if (err != 0)
 		goto abort_with_slices;
 
 	err = mxge_alloc_rings(sc);
 	if (err != 0) {
 		device_printf(sc->dev, "failed to allocate rings\n");
 		goto abort_with_slices;
 	}
 
 	err = mxge_add_irq(sc);
 	if (err != 0) {
 		device_printf(sc->dev, "failed to add irq\n");
 		goto abort_with_rings;
 	}
 
 	/* nothing below jumps to the abort labels */
 	if_initbaudrate(ifp, IF_Gbps(10));
 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
 		IFCAP_VLAN_MTU | IFCAP_LINKSTATE | IFCAP_TXCSUM_IPV6 |
 		IFCAP_RXCSUM_IPV6;
 #if defined(INET) || defined(INET6)
 	ifp->if_capabilities |= IFCAP_LRO;
 #endif
 
 #ifdef MXGE_NEW_VLAN_API
 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
 
 	/* Only FW 1.4.32 and newer can do TSO over vlans */
 	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
 	    sc->fw_ver_tiny >= 32)
 		ifp->if_capabilities |= IFCAP_VLAN_HWTSO;
 #endif
 	sc->max_mtu = mxge_max_mtu(sc);
 	if (sc->max_mtu >= 9000)
 		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
 	else
 		device_printf(dev, "MTU limited to %d.  Install "
 			      "latest firmware for 9000 byte jumbo support\n",
 			      sc->max_mtu - ETHER_HDR_LEN);
 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
 	ifp->if_hwassist |= CSUM_TCP_IPV6 | CSUM_UDP_IPV6;
 	/* check to see if f/w supports TSO for IPv6 */
 	if (!mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_TSO6_HDR_SIZE, &cmd)) {
 		/*
 		 * CSUM_TCP_IPV6 is a constant; if_mxge_var.h defines it
 		 * as 0 when the kernel headers lack it.
 		 */
 		if (CSUM_TCP_IPV6)
 			ifp->if_capabilities |= IFCAP_TSO6;
 		/* clamp the header size to the per-slice scratch buffer */
 		sc->max_tso6_hlen = min(cmd.data0,
 					sizeof (sc->ss[0].scratch));
 	}
 	/* enable every capability, except LRO when lro_cnt is 0 */
 	ifp->if_capenable = ifp->if_capabilities;
 	if (sc->lro_cnt == 0)
 		ifp->if_capenable &= ~IFCAP_LRO;
         ifp->if_init = mxge_init;
         ifp->if_softc = sc;
         ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
         ifp->if_ioctl = mxge_ioctl;
         ifp->if_start = mxge_start;
 	/* Initialise the ifmedia structure */
 	ifmedia_init(&sc->media, 0, mxge_media_change, 
 		     mxge_media_status);
 	mxge_media_init(sc);
 	mxge_media_probe(sc);
 	sc->dying = 0;
 	ether_ifattach(ifp, sc->mac_addr);
 	/* ether_ifattach sets mtu to ETHERMTU */
 	if (mxge_initial_mtu != ETHERMTU)
 		mxge_change_mtu(sc, mxge_initial_mtu);
 
 	mxge_add_sysctls(sc);
 #ifdef IFNET_BUF_RING
 	ifp->if_transmit = mxge_transmit;
 	ifp->if_qflush = mxge_qflush;
 #endif
 	taskqueue_start_threads(&sc->tq, 1, PI_NET, "%s taskq",
 				device_get_nameunit(sc->dev));
 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
 	return 0;
 
 	/*
 	 * Error unwinding: each label undoes one setup stage and falls
 	 * through to the labels for the stages that preceded it.
 	 */
 abort_with_rings:
 	mxge_free_rings(sc);
 abort_with_slices:
 	mxge_free_slices(sc);
 abort_with_dmabench:
 	mxge_dma_free(&sc->dmabench_dma);
 abort_with_zeropad_dma:
 	mxge_dma_free(&sc->zeropad_dma);
 abort_with_cmd_dma:
 	mxge_dma_free(&sc->cmd_dma);
 abort_with_mem_res:
 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
 abort_with_lock:
 	/*
 	 * NOTE(review): bus mastering is presumably enabled by
 	 * mxge_setup_cfg_space() (not shown here) -- confirm.
 	 */
 	pci_disable_busmaster(dev);
 	mtx_destroy(&sc->cmd_mtx);
 	mtx_destroy(&sc->driver_mtx);
 	if_free(ifp);
 abort_with_parent_dmat:
 	bus_dma_tag_destroy(sc->parent_dmat);
 abort_with_tq:
 	if (sc->tq != NULL) {
 		taskqueue_drain(sc->tq, &sc->watchdog_task);
 		taskqueue_free(sc->tq);
 		sc->tq = NULL;
 	}
 abort_with_nothing:
 	return err;
 }
 
 /*
  * Device detach method.
  *
  * Refuses with EBUSY while mxge_vlans_active() reports vlans on the
  * interface.  Otherwise it marks the device as dying, closes the
  * interface if it is running, detaches the ifnet, drains and frees the
  * taskqueue, drains the tick callout, and then releases the resources
  * that mxge_attach() set up (interrupts, rings, slices, DMA buffers,
  * memory BAR, mutexes, ifnet and parent DMA tag).  Returns 0 once the
  * teardown has completed.
  */
 static int
 mxge_detach(device_t dev)
 {
 	mxge_softc_t *sc = device_get_softc(dev);
 
 	if (mxge_vlans_active(sc)) {
 		device_printf(sc->dev,
 			      "Detach vlans before removing module\n");
 		return EBUSY;
 	}
 	/* set the dying flag and close the interface under the driver lock */
 	mtx_lock(&sc->driver_mtx);
 	sc->dying = 1;
 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
 		mxge_close(sc, 0);
 	mtx_unlock(&sc->driver_mtx);
 	ether_ifdetach(sc->ifp);
 	/* wait for any queued watchdog task before freeing the taskqueue */
 	if (sc->tq != NULL) {
 		taskqueue_drain(sc->tq, &sc->watchdog_task);
 		taskqueue_free(sc->tq);
 		sc->tq = NULL;
 	}
 	callout_drain(&sc->co_hdl);
 	ifmedia_removeall(&sc->media);
 	mxge_dummy_rdma(sc, 0);
 	mxge_rem_sysctls(sc);
 	/* from here on the calls match mxge_attach()'s abort_with_* unwind */
 	mxge_rem_irq(sc);
 	mxge_free_rings(sc);
 	mxge_free_slices(sc);
 	mxge_dma_free(&sc->dmabench_dma);
 	mxge_dma_free(&sc->zeropad_dma);
 	mxge_dma_free(&sc->cmd_dma);
 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
 	pci_disable_busmaster(dev);
 	mtx_destroy(&sc->cmd_mtx);
 	mtx_destroy(&sc->driver_mtx);
 	if_free(sc->ifp);
 	bus_dma_tag_destroy(sc->parent_dmat);
 	return 0;
 }
 
 /*
  * Device shutdown method: performs no work and always reports success.
  */
 static int
 mxge_shutdown(device_t dev)
 {
 	return (0);
 }
 
 /*
   This file uses Myri10GE driver indentation.
 
   Local Variables:
   c-file-style:"linux"
   tab-width:8
   End:
 */
Index: user/attilio/vmobj-rwlock/sys/dev/mxge/if_mxge_var.h
===================================================================
--- user/attilio/vmobj-rwlock/sys/dev/mxge/if_mxge_var.h	(revision 247191)
+++ user/attilio/vmobj-rwlock/sys/dev/mxge/if_mxge_var.h	(revision 247192)
@@ -1,382 +1,382 @@
 /*******************************************************************************
 
-Copyright (c) 2006-2009, Myricom Inc.
+Copyright (c) 2006-2013, Myricom Inc.
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
 
  1. Redistributions of source code must retain the above copyright notice,
     this list of conditions and the following disclaimer.
 
  2. Neither the name of the Myricom Inc, nor the names of its
     contributors may be used to endorse or promote products derived from
     this software without specific prior written permission.
 
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
 
 $FreeBSD$
 
 ***************************************************************************/
 
 #define MXGE_ETH_STOPPED 0
 #define MXGE_ETH_STOPPING 1
 #define MXGE_ETH_STARTING 2
 #define MXGE_ETH_RUNNING 3
 #define MXGE_ETH_OPEN_FAILED 4
 
 /*
  * MXGE_FW_OFFSET is fully parenthesized so that it keeps its value when
  * used inside a larger expression (e.g. as a divisor or multiplicand).
  */
 #define MXGE_FW_OFFSET (1024*1024)
 #define MXGE_EEPROM_STRINGS_SIZE 256
 #define MXGE_MAX_SEND_DESC 128
 
 /*
  * Feature selection keyed on __FreeBSD_version:
  *  - MXGE_VIRT_JUMBOS is 1 for versions strictly between 800000 and
  *    800005, or below 700111, and 0 otherwise.
  *  - IFNET_BUF_RING is defined for versions above 800082.
  */
 #if ((__FreeBSD_version > 800000 && __FreeBSD_version < 800005) \
      || __FreeBSD_version < 700111)
 #define MXGE_VIRT_JUMBOS 1
 #else
 #define MXGE_VIRT_JUMBOS 0
 #endif
 
 #if (__FreeBSD_version > 800082)
 #define IFNET_BUF_RING 1
 #endif
 
 /*
  * Compatibility shims for __FreeBSD_version below 1000020: replace the
  * IF_Kbps/IF_Mbps/IF_Gbps helpers with uintmax_t versions and provide
  * if_initbaudrate(), which simply stores the rate in ifp->if_baudrate.
  */
 #if (__FreeBSD_version < 1000020)
 #undef IF_Kbps
 #undef IF_Mbps
 #undef IF_Gbps
 #define	IF_Kbps(x)	((uintmax_t)(x) * 1000)	/* kilobits/sec. */
 #define	IF_Mbps(x)	(IF_Kbps((x) * 1000))	/* megabits/sec. */
 #define	IF_Gbps(x)	(IF_Mbps((x) * 1000))	/* gigabits/sec. */
 static __inline void
 if_initbaudrate(struct ifnet *ifp, uintmax_t baud)
 {
 	ifp->if_baudrate = baud;
 }
 #endif
 /*
  * mxge_vlans_active(sc) is non-zero/non-NULL while vlans are configured
  * on the interface.  It reads if_nvlans when the kernel headers do not
  * provide VLAN_CAPABILITIES (which is then defined as a no-op) and
  * if_vlantrunk otherwise.  The expansions are parenthesized so the
  * macro behaves as a single operand in any expression.
  */
 #ifndef VLAN_CAPABILITIES
 #define VLAN_CAPABILITIES(ifp)
 #define mxge_vlans_active(sc) ((sc)->ifp->if_nvlans)
 #else
 #define mxge_vlans_active(sc) ((sc)->ifp->if_vlantrunk)
 #endif
 
 /*
  * MXGE_NEW_VLAN_API is defined when the kernel headers do not provide
  * VLAN_TAG_VALUE; mxge_attach() only advertises the VLAN hardware
  * tagging/checksum capabilities when it is defined.
  */
 #ifndef VLAN_TAG_VALUE
 #define MXGE_NEW_VLAN_API
 #endif
 
 /*
  * Fallback definitions: any capability (IFCAP_*) or checksum-offload
  * (CSUM_*) flag that the kernel headers do not define is defined as 0
  * here, so the driver can still name it in flag expressions.
  */
 #ifndef IFCAP_LRO
 #define IFCAP_LRO 0
 #endif
 
 #ifndef IFCAP_TSO
 #define IFCAP_TSO 0
 #endif
  
 #ifndef IFCAP_TSO4
 #define IFCAP_TSO4 0
 #endif
 
 #ifndef IFCAP_TSO6
 #define IFCAP_TSO6 0
 #endif
 
 #ifndef IFCAP_TXCSUM_IPV6
 #define IFCAP_TXCSUM_IPV6 0
 #endif
 
 #ifndef IFCAP_RXCSUM_IPV6
 #define IFCAP_RXCSUM_IPV6 0
 #endif
 
 #ifndef CSUM_TSO
 #define CSUM_TSO 0
 #endif
 
 #ifndef CSUM_TCP_IPV6
 #define CSUM_TCP_IPV6 0
 #endif
 
 #ifndef CSUM_UDP_IPV6
 #define CSUM_UDP_IPV6 0
 #endif
 
 #ifndef CSUM_DELAY_DATA_IPV6
 #define CSUM_DELAY_DATA_IPV6 0
 #endif
 
 /*
  * Bookkeeping for one DMA allocation; the driver passes these to
  * mxge_dma_alloc() and mxge_dma_free().
  */
 typedef struct {
 	void *addr;		/* host pointer the driver reads/writes */
 	bus_addr_t bus_addr;
 	bus_dma_tag_t dmat;
 	bus_dmamap_t map;
 } mxge_dma_t;
 
 
 /*
  * Per-slice rx interrupt queue.  mxge_alloc_slices() allocates `dma'
  * and points `entry' at dma.addr.
  */
 typedef struct {
 	mcp_slot_t *entry;
 	mxge_dma_t dma;
 	int cnt;
 	int idx;
 	int mask;
 } mxge_rx_done_t;
 
 /*
  * Argument/result words for mxge_send_cmd().  Callers fill in data0
  * (and presumably data1/data2 -- confirm against mxge_send_cmd) before
  * the call and read the firmware's answer back from data0.
  */
 typedef struct
 {
   uint32_t data0;
   uint32_t data1;
   uint32_t data2;
 } mxge_cmd_t;
 
 struct mxge_rx_buffer_state {
 	struct mbuf *m;
 	bus_dmamap_t map;
 };
 
 struct mxge_tx_buffer_state {
 	struct mbuf *m;
 	bus_dmamap_t map;
 	int flag;
 };
 
 typedef struct
 {
 	volatile mcp_kreq_ether_recv_t *lanai;	/* lanai ptr for recv ring */
 	mcp_kreq_ether_recv_t *shadow;	/* host shadow of recv ring */
 	struct mxge_rx_buffer_state *info;
 	bus_dma_tag_t dmat;
 	bus_dmamap_t extra_map;
 	int cnt;
 	int nbufs;
 	int cl_size;
 	int alloc_fail;
 	int mask;			/* number of rx slots -1 */
 	int mlen;
 } mxge_rx_ring_t;
 
 /*
  * Per-slice transmit ring state.  The mutex (named via mtx_name) and,
  * when IFNET_BUF_RING is defined, the buf_ring are created in
  * mxge_alloc_slices().
  */
 typedef struct
 {
 	struct mtx mtx;
 #ifdef IFNET_BUF_RING
 	struct buf_ring *br;
 #endif
 	volatile mcp_kreq_ether_send_t *lanai;	/* lanai ptr for sendq	*/
 	volatile uint32_t *send_go;		/* doorbell for sendq */
 	volatile uint32_t *send_stop;		/* doorbell for sendq */
 	mcp_kreq_ether_send_t *req_list;	/* host shadow of sendq */
 	char *req_bytes;
 	bus_dma_segment_t *seg_list;
 	struct mxge_tx_buffer_state *info;
 	bus_dma_tag_t dmat;
 	int req;			/* transmits submitted	*/
 	int mask;			/* number of transmit slots -1 */
 	int done;			/* transmits completed	*/
 	int pkt_done;			/* packets completed */
 	int max_desc;			/* max descriptors per xmit */
 	int queue_active;		/* fw currently polling this queue*/
 	int activate;
 	int deactivate;
 	int stall;			/* #times hw queue exhausted */
 	int wake;			/* #times irq re-enabled xmit */
 	int watchdog_req;		/* cache of req */
 	int watchdog_done;		/* cache of done */
 	int watchdog_rx_pause;		/* cache of pause rq recvd */
 	int defrag;
 	char mtx_name[16];		/* storage for the mutex name */
 } mxge_tx_ring_t;
 
 struct mxge_softc;
 typedef struct mxge_softc mxge_softc_t;
 
 /*
  * State for one slice.  The softc holds an array of sc->num_slices of
  * these in sc->ss, and a pointer to one is the argument given to
  * mxge_intr() when the slice's interrupt handler is installed.
  */
 struct mxge_slice_state {
 	mxge_softc_t *sc;		/* back pointer to owning softc */
 	mxge_tx_ring_t tx;		/* transmit ring 	*/
 	mxge_rx_ring_t rx_small;
 	mxge_rx_ring_t rx_big;
 	mxge_rx_done_t rx_done;
 	mcp_irq_data_t *fw_stats;	/* points at fw_stats_dma.addr */
 	volatile uint32_t *irq_claim;
 	u_long ipackets;
 	u_long opackets;
 	u_long obytes;
 	u_long omcasts;
 	u_long oerrors;
 	int if_drv_flags;
 	struct lro_ctrl lc;
 	mxge_dma_t fw_stats_dma;
 	struct sysctl_oid *sysctl_tree;
 	struct sysctl_ctx_list sysctl_ctx;
 	char scratch[256];
 };
 
 /*
  * Per-device driver state; the attach/detach methods obtain it with
  * device_get_softc().
  */
 struct mxge_softc {
 	struct ifnet* ifp;
 	struct mxge_slice_state *ss;	/* array of num_slices slices */
 	int tx_boundary;		/* boundary transmits cannot cross*/
 	int lro_cnt;
 	bus_dma_tag_t	parent_dmat;
 	volatile uint8_t *sram;
 	int sram_size;
 	volatile uint32_t *irq_deassert;
 	mcp_cmd_response_t *cmd;
 	mxge_dma_t cmd_dma;
 	mxge_dma_t zeropad_dma;
 	struct pci_dev *pdev;
 	int legacy_irq;			/* set when INTx is used, not MSI */
 	int link_state;
 	unsigned int rdma_tags_available;
 	int intr_coal_delay;
 	volatile uint32_t *intr_coal_delay_ptr;
 	int wc;
 	struct mtx cmd_mtx;
 	struct mtx driver_mtx;
 	int wake_queue;
 	int stop_queue;
 	int down_cnt;
 	int watchdog_resets;
 	int watchdog_countdown;
 	int pause;
 	struct resource *mem_res;
 	struct resource *irq_res;
 	struct resource **msix_irq_res;	/* one IRQ resource per slice */
 	struct resource *msix_table_res;
 	struct resource *msix_pba_res;
 	void *ih; 
 	void **msix_ih;			/* one handler cookie per slice */
 	char *fw_name;
 	char eeprom_strings[MXGE_EEPROM_STRINGS_SIZE];
 	char fw_version[128];
 	int fw_ver_major;
 	int fw_ver_minor;
 	int fw_ver_tiny;
 	int adopted_rx_filter_bug;
 	device_t dev;
 	struct ifmedia media;
 	int read_dma;
 	int write_dma;
 	int read_write_dma;
 	int fw_multicast_support;
 	int link_width;
 	int max_mtu;
 	int throttle;
 	int tx_defrag;
 	int media_flags;
 	int need_media_probe;
 	int num_slices;
 	int rx_ring_size;
 	int dying;
 	int connector;
 	int current_media;
 	int max_tso6_hlen;
 	mxge_dma_t dmabench_dma;
 	struct callout co_hdl;
 	struct taskqueue *tq;
 	struct task watchdog_task;
 	struct sysctl_oid *slice_sysctl_tree;
 	struct sysctl_ctx_list slice_sysctl_ctx;
 	char *mac_addr_string;
 	uint8_t	mac_addr[6];		/* eeprom mac address */
 	uint16_t pectl;			/* save PCIe CTL state */
 	char product_code_string[64];
 	char serial_number_string[64];
 	char cmd_mtx_name[16];
 	char driver_mtx_name[16];
 };
 
 #define MXGE_PCI_VENDOR_MYRICOM 	0x14c1
 #define MXGE_PCI_DEVICE_Z8E 	0x0008
 #define MXGE_PCI_DEVICE_Z8E_9 	0x0009
 #define MXGE_PCI_REV_Z8E	0
 #define MXGE_PCI_REV_Z8ES	1
 #define MXGE_XFP_COMPLIANCE_BYTE	131
 #define MXGE_SFP_COMPLIANCE_BYTE	  3
 #define MXGE_MIN_THROTTLE	416
 #define MXGE_MAX_THROTTLE	4096
 
 /* Types of connectors on NICs supported by this driver */
 #define MXGE_CX4 0
 #define MXGE_XFP 1
 #define MXGE_SFP 2
 #define MXGE_QRF 3
 
 /*
  * Split a value into 32-bit halves.  MXGE_HIGHPART_TO_U32 yields bits
  * 63..32 when X is 8 bytes wide and 0 otherwise; MXGE_LOWPART_TO_U32
  * yields the low 32 bits.  The conditional is wrapped in parentheses so
  * that operators applied to the macro's result are not absorbed into
  * its false branch.
  */
 #define MXGE_HIGHPART_TO_U32(X) \
 ((sizeof (X) == 8) ? ((uint32_t)((uint64_t)(X) >> 32)) : (0))
 #define MXGE_LOWPART_TO_U32(X) ((uint32_t)(X))
 
 struct mxge_media_type
 {
 	int flag;
 	uint8_t bitmask;
 	char *name;
 };
 
 struct mxge_pkt_info {
 	int ip_off;
 	int ip_hlen;
 	struct ip *ip;
 	struct ip6_hdr *ip6;
 	struct tcphdr *tcp;
 };
 
 
 /* implement our own memory barriers, since bus_space_barrier
    cannot handle write-combining regions */
 
 #if __FreeBSD_version < 800053
 
 #if defined (__GNUC__)
   #if #cpu(i386) || defined __i386 || defined i386 || defined __i386__ || #cpu(x86_64) || defined __x86_64__
     #define wmb()  __asm__ __volatile__ ("sfence;": : :"memory")
   #elif #cpu(sparc64) || defined sparc64 || defined __sparcv9 
     #define wmb()  __asm__ __volatile__ ("membar #MemIssue": : :"memory")
   #elif #cpu(sparc) || defined sparc || defined __sparc__
     #define wmb()  __asm__ __volatile__ ("stbar;": : :"memory")
   #else
     #define wmb() 	/* XXX just to make this compile */
   #endif
 #else
   #error "unknown compiler"
 #endif
 
 #endif
 
 /*
  * Copy `size' bytes from from_v to to_v one uintptr_t word at a time,
  * accessing both buffers through volatile pointers.  Only whole words
  * are copied: a trailing size % sizeof (uintptr_t) bytes, if any, are
  * not transferred.
  */
 static inline void
 mxge_pio_copy(volatile void *to_v, void *from_v, size_t size)
 {
   volatile uintptr_t *dst = (volatile uintptr_t *) to_v;
   volatile uintptr_t *src = (volatile uintptr_t *) from_v;
   size_t words;
 
   for (words = size / sizeof (uintptr_t); words != 0; words--)
 	  *dst++ = *src++;
 }
 
 void mxge_lro_flush(struct mxge_slice_state *ss, struct lro_entry *lro);
 int mxge_lro_rx(struct mxge_slice_state *ss, struct mbuf *m_head,
 		uint32_t csum);
 		
 
 
 /*
   This file uses Myri10GE driver indentation.
 
   Local Variables:
   c-file-style:"linux"
   tab-width:8
   End:
 */
Index: user/attilio/vmobj-rwlock/sys/powerpc/include/vmparam.h
===================================================================
--- user/attilio/vmobj-rwlock/sys/powerpc/include/vmparam.h	(revision 247191)
+++ user/attilio/vmobj-rwlock/sys/powerpc/include/vmparam.h	(revision 247192)
@@ -1,208 +1,203 @@
 /*-
  * Copyright (C) 1995, 1996 Wolfgang Solfrank.
  * Copyright (C) 1995, 1996 TooLs GmbH.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by TooLs GmbH.
  * 4. The name of TooLs GmbH may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  *	$NetBSD: vmparam.h,v 1.11 2000/02/11 19:25:16 thorpej Exp $
  * $FreeBSD$
  */
 
 #ifndef _MACHINE_VMPARAM_H_
 #define	_MACHINE_VMPARAM_H_
 
 #define	USRSTACK	SHAREDPAGE
 
 #ifndef	MAXTSIZ
 #define	MAXTSIZ		(64*1024*1024)		/* max text size */
 #endif
 
 #ifndef	DFLDSIZ
 #define	DFLDSIZ		(128*1024*1024)		/* default data size */
 #endif
 
 #ifndef	MAXDSIZ
 #define	MAXDSIZ		(1*1024*1024*1024)	/* max data size */
 #endif
 
 #ifndef	DFLSSIZ
 #define	DFLSSIZ		(8*1024*1024)		/* default stack size */
 #endif
 
 #ifndef	MAXSSIZ
 #define	MAXSSIZ		(64*1024*1024)		/* max stack size */
 #endif
 
 #ifdef AIM
 #define	VM_MAXUSER_ADDRESS32	((vm_offset_t)0xfffff000)
 #else
 #define	VM_MAXUSER_ADDRESS32	((vm_offset_t)0x7ffff000)
 #endif
 
 /*
  * Would like to have MAX addresses = 0, but this doesn't (currently) work
  */
 #if !defined(LOCORE)
 #ifdef __powerpc64__
 #define	VM_MIN_ADDRESS		(0x0000000000000000UL)
 #define	VM_MAXUSER_ADDRESS	(0xfffffffffffff000UL)
 #define	VM_MAX_ADDRESS		(0xffffffffffffffffUL)
 #else
 #define	VM_MIN_ADDRESS		((vm_offset_t)0)
 #define	VM_MAXUSER_ADDRESS	VM_MAXUSER_ADDRESS32
 #define	VM_MAX_ADDRESS		((vm_offset_t)0xffffffff)
 #endif
 #define	SHAREDPAGE		(VM_MAXUSER_ADDRESS - PAGE_SIZE)
 #else /* LOCORE */
 #if !defined(__powerpc64__) && defined(BOOKE)
 #define	VM_MIN_ADDRESS		0
 #define	VM_MAXUSER_ADDRESS	0x7ffff000
 #endif
 #endif /* LOCORE */
 
 #define	FREEBSD32_SHAREDPAGE	(VM_MAXUSER_ADDRESS32 - PAGE_SIZE)
 #define	FREEBSD32_USRSTACK	FREEBSD32_SHAREDPAGE
 
 #ifdef AIM
 #define	KERNBASE		0x00100000UL	/* start of kernel virtual */
 
 #ifdef __powerpc64__
 #define	VM_MIN_KERNEL_ADDRESS		0xc000000000000000UL
 #define	VM_MAX_KERNEL_ADDRESS		0xc0000001c7ffffffUL
 #define	VM_MAX_SAFE_KERNEL_ADDRESS	VM_MAX_KERNEL_ADDRESS
 #else
 #define	VM_MIN_KERNEL_ADDRESS	((vm_offset_t)KERNEL_SR << ADDR_SR_SHFT)
 #define	VM_MAX_SAFE_KERNEL_ADDRESS (VM_MIN_KERNEL_ADDRESS + 2*SEGMENT_LENGTH -1)
 #define	VM_MAX_KERNEL_ADDRESS	(VM_MIN_KERNEL_ADDRESS + 3*SEGMENT_LENGTH - 1)
 #endif
 
 /*
  * Use the direct-mapped BAT registers for UMA small allocs. This
  * takes pressure off the small amount of available KVA.
  */
 #define UMA_MD_SMALL_ALLOC
 
 #else /* Book-E */
 
 /*
  * Kernel CCSRBAR location. We make this the reset location.
  */
 #define	CCSRBAR_VA		0xfef00000
 #define	CCSRBAR_SIZE		0x00100000
 
 #define	KERNBASE		0xc0000000	/* start of kernel virtual */
 
 #define	VM_MIN_KERNEL_ADDRESS	KERNBASE
 #define	VM_MAX_KERNEL_ADDRESS	0xf8000000
 
 #endif /* AIM/E500 */
 
-/* XXX max. amount of KVM to be used by buffers. */
-#ifndef VM_MAX_KERNEL_BUF
-#define	VM_MAX_KERNEL_BUF	(SEGMENT_LENGTH * 7 / 10)
-#endif
-
 #if !defined(LOCORE)
 struct pmap_physseg {
 	struct pv_entry *pvent;
 	char *attrs;
 };
 #endif
 
 #define	VM_PHYSSEG_MAX		16	/* 1? */
 
 /*
  * The physical address space is densely populated on 32-bit systems,
  * but may not be on 64-bit ones.
  */
 #ifdef __powerpc64__
 #define	VM_PHYSSEG_SPARSE
 #else
 #define	VM_PHYSSEG_DENSE
 #endif
 
 /*
  * Create three free page pools: VM_FREEPOOL_DEFAULT is the default pool
  * from which physical pages are allocated and VM_FREEPOOL_DIRECT is
  * the pool from which physical pages for small UMA objects are
  * allocated.
  */
 #define	VM_NFREEPOOL		3
 #define	VM_FREEPOOL_CACHE	2
 #define	VM_FREEPOOL_DEFAULT	0
 #define	VM_FREEPOOL_DIRECT	1
 
 /*
  * Create one free page list.
  */
 #define	VM_NFREELIST		1
 #define	VM_FREELIST_DEFAULT	0
 
 /*
  * The largest allocation size is 4MB.
  */
 #define	VM_NFREEORDER		11
 
 /*
  * Only one memory domain.
  */
 #ifndef VM_NDOMAIN
 #define	VM_NDOMAIN		1
 #endif
 
 /*
  * Disable superpage reservations.
  */
 #ifndef	VM_NRESERVLEVEL
 #define	VM_NRESERVLEVEL		0
 #endif
 
 #ifndef VM_INITIAL_PAGEIN
 #define	VM_INITIAL_PAGEIN	16
 #endif
 
 #ifndef SGROWSIZ
 #define	SGROWSIZ	(128UL*1024)		/* amount to grow stack */
 #endif
 
 #ifndef VM_KMEM_SIZE
 #define	VM_KMEM_SIZE		(12 * 1024 * 1024)
 #endif
 
 #ifdef __powerpc64__
 #ifndef VM_KMEM_SIZE_SCALE
 #define VM_KMEM_SIZE_SCALE      (3)
 #endif
 
 #ifndef VM_KMEM_SIZE_MAX
 #define VM_KMEM_SIZE_MAX        0x1c0000000  /* 7 GB */
 #endif
 #endif
 
 #define	ZERO_REGION_SIZE	(64 * 1024)	/* 64KB */
 
 #endif /* _MACHINE_VMPARAM_H_ */
Index: user/attilio/vmobj-rwlock/sys
===================================================================
--- user/attilio/vmobj-rwlock/sys	(revision 247191)
+++ user/attilio/vmobj-rwlock/sys	(revision 247192)

Property changes on: user/attilio/vmobj-rwlock/sys
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head/sys:r247139-247191
Index: user/attilio/vmobj-rwlock/tools/regression/bin/sh/builtins/read6.0
===================================================================
--- user/attilio/vmobj-rwlock/tools/regression/bin/sh/builtins/read6.0	(nonexistent)
+++ user/attilio/vmobj-rwlock/tools/regression/bin/sh/builtins/read6.0	(revision 247192)
@@ -0,0 +1,5 @@
+# $FreeBSD$
+
+: | read x
+r=$?
+[ "$r" = 1 ]

Property changes on: user/attilio/vmobj-rwlock/tools/regression/bin/sh/builtins/read6.0
___________________________________________________________________
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Index: user/attilio/vmobj-rwlock/usr.sbin/bhyve/mem.c
===================================================================
--- user/attilio/vmobj-rwlock/usr.sbin/bhyve/mem.c	(revision 247191)
+++ user/attilio/vmobj-rwlock/usr.sbin/bhyve/mem.c	(revision 247192)
@@ -1,218 +1,234 @@
 /*-
  * Copyright (c) 2012 NetApp, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 /*
  * Memory ranges are represented with an RB tree. On insertion, the range
  * is checked for overlaps. On lookup, the key has the same base and limit
  * so it can be searched within the range.
  *
  * It is assumed that all setup of ranges takes place in single-threaded
  * mode before vCPUs have been started. As such, no locks are used on the
  * RB tree. If this is no longer the case, then a r/w lock could be used,
  * with readers on the lookup and a writer if the tree needs to be changed
  * (and per vCPU caches flushed)
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/types.h>
 #include <sys/tree.h>
 #include <sys/errno.h>
 #include <machine/vmm.h>
 
 #include <stdio.h>
 #include <stdlib.h>
 #include <assert.h>
 
 #include "mem.h"
 
 struct mmio_rb_range {
 	RB_ENTRY(mmio_rb_range)	mr_link;	/* RB tree links */
 	struct mem_range	mr_param;
 	uint64_t                mr_base;
 	uint64_t                mr_end;
 };
 
 struct mmio_rb_tree;
 RB_PROTOTYPE(mmio_rb_tree, mmio_rb_range, mr_link, mmio_rb_range_compare);
 
-RB_HEAD(mmio_rb_tree, mmio_rb_range) mmio_rbroot;
+RB_HEAD(mmio_rb_tree, mmio_rb_range) mmio_rb_root, mmio_rb_fallback;
 
 /*
  * Per-vCPU cache. Since most accesses from a vCPU will be to
  * consecutive addresses in a range, it makes sense to cache the
  * result of a lookup.
  */
 static struct mmio_rb_range	*mmio_hint[VM_MAXCPU];
 
 static int
 mmio_rb_range_compare(struct mmio_rb_range *a, struct mmio_rb_range *b)
 {
 	if (a->mr_end < b->mr_base)
 		return (-1);
 	else if (a->mr_base > b->mr_end)
 		return (1);
 	return (0);
 }
 
 static int
-mmio_rb_lookup(uint64_t addr, struct mmio_rb_range **entry)
+mmio_rb_lookup(struct mmio_rb_tree *rbt, uint64_t addr,
+    struct mmio_rb_range **entry)
 {
 	struct mmio_rb_range find, *res;
 
 	find.mr_base = find.mr_end = addr;
 
-	res = RB_FIND(mmio_rb_tree, &mmio_rbroot, &find);
+	res = RB_FIND(mmio_rb_tree, rbt, &find);
 
 	if (res != NULL) {
 		*entry = res;
 		return (0);
 	}
 	
 	return (ENOENT);
 }
 
 static int
-mmio_rb_add(struct mmio_rb_range *new)
+mmio_rb_add(struct mmio_rb_tree *rbt, struct mmio_rb_range *new)
 {
 	struct mmio_rb_range *overlap;
 
-	overlap = RB_INSERT(mmio_rb_tree, &mmio_rbroot, new);
+	overlap = RB_INSERT(mmio_rb_tree, rbt, new);
 
 	if (overlap != NULL) {
 #ifdef RB_DEBUG
 		printf("overlap detected: new %lx:%lx, tree %lx:%lx\n",
 		       new->mr_base, new->mr_end,
 		       overlap->mr_base, overlap->mr_end);
 #endif
 
 		return (EEXIST);
 	}
 
 	return (0);
 }
 
 #if 0
 static void
-mmio_rb_dump(void)
+mmio_rb_dump(struct mmio_rb_tree *rbt)
 {
 	struct mmio_rb_range *np;
 
-	RB_FOREACH(np, mmio_rb_tree, &mmio_rbroot) {
+	RB_FOREACH(np, mmio_rb_tree, rbt) {
 		printf(" %lx:%lx, %s\n", np->mr_base, np->mr_end,
 		       np->mr_param.name);
 	}
 }
 #endif
 
 RB_GENERATE(mmio_rb_tree, mmio_rb_range, mr_link, mmio_rb_range_compare);
 
 static int
 mem_read(void *ctx, int vcpu, uint64_t gpa, uint64_t *rval, int size, void *arg)
 {
 	int error;
 	struct mem_range *mr = arg;
 
 	error = (*mr->handler)(ctx, vcpu, MEM_F_READ, gpa, size,
 			       rval, mr->arg1, mr->arg2);
 	return (error);
 }
 
 static int
 mem_write(void *ctx, int vcpu, uint64_t gpa, uint64_t wval, int size, void *arg)
 {
 	int error;
 	struct mem_range *mr = arg;
 
 	error = (*mr->handler)(ctx, vcpu, MEM_F_WRITE, gpa, size,
 			       &wval, mr->arg1, mr->arg2);
 	return (error);
 }
 
 int
 emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie)
 {
 	struct mmio_rb_range *entry;
 	int err;
 
 	/*
 	 * First check the per-vCPU cache
 	 */
 	if (mmio_hint[vcpu] &&
 	    paddr >= mmio_hint[vcpu]->mr_base &&
 	    paddr <= mmio_hint[vcpu]->mr_end) {
 		entry = mmio_hint[vcpu];
 	} else
 		entry = NULL;
 
 	if (entry == NULL) {
-		if (mmio_rb_lookup(paddr, &entry))
+		if (!mmio_rb_lookup(&mmio_rb_root, paddr, &entry)) {
+			/* Update the per-vCPU cache */
+			mmio_hint[vcpu] = entry;			
+		} else if (mmio_rb_lookup(&mmio_rb_fallback, paddr, &entry)) {
 			return (ESRCH);
-
-		/* Update the per-vCPU cache */
-		mmio_hint[vcpu] = entry;
+		}
 	}
 
-	assert(entry != NULL && entry == mmio_hint[vcpu]);
-
+	assert(entry != NULL);
 	err = vmm_emulate_instruction(ctx, vcpu, paddr, vie,
 				      mem_read, mem_write, &entry->mr_param);
 	return (err);
 }
 
-int
-register_mem(struct mem_range *memp)
+static int
+register_mem_int(struct mmio_rb_tree *rbt, struct mem_range *memp)
 {
 	struct mmio_rb_range *mrp;
 	int		err;
 
 	err = 0;
 
 	mrp = malloc(sizeof(struct mmio_rb_range));
 
 	if (mrp != NULL) {
 		mrp->mr_param = *memp;
 		mrp->mr_base = memp->base;
 		mrp->mr_end = memp->base + memp->size - 1;
 
-		err = mmio_rb_add(mrp);
+		err = mmio_rb_add(rbt, mrp);
 		if (err)
 			free(mrp);
 	} else
 		err = ENOMEM;
 
 	return (err);
 }
 
+int
+register_mem(struct mem_range *memp)
+{
+
+	return (register_mem_int(&mmio_rb_root, memp));
+}
+
+int
+register_mem_fallback(struct mem_range *memp)
+{
+
+	return (register_mem_int(&mmio_rb_fallback, memp));
+}
+
 void
 init_mem(void)
 {
 
-	RB_INIT(&mmio_rbroot);
+	RB_INIT(&mmio_rb_root);
+	RB_INIT(&mmio_rb_fallback);
 }
Index: user/attilio/vmobj-rwlock/usr.sbin/bhyve/mem.h
===================================================================
--- user/attilio/vmobj-rwlock/usr.sbin/bhyve/mem.h	(revision 247191)
+++ user/attilio/vmobj-rwlock/usr.sbin/bhyve/mem.h	(revision 247192)
@@ -1,57 +1,58 @@
 /*-
  * Copyright (c) 2012 NetApp, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _MEM_H_
 #define	_MEM_H_
 
 #include <sys/linker_set.h>
 
 struct vmctx;
 
 typedef int (*mem_func_t)(struct vmctx *ctx, int vcpu, int dir, uint64_t addr,
 			  int size, uint64_t *val, void *arg1, long arg2);
 
 struct mem_range {
 	const char 	*name;
 	int		flags;
 	mem_func_t	handler;
 	void		*arg1;
 	long		arg2;
 	uint64_t  	base;
 	uint64_t  	size;
 };
 #define	MEM_F_READ		0x1
 #define	MEM_F_WRITE		0x2
 #define	MEM_F_RW		0x3
 
 void	init_mem(void);
 int     emulate_mem(struct vmctx *, int vcpu, uint64_t paddr, struct vie *vie);
 		    
 int	register_mem(struct mem_range *memp);
+int	register_mem_fallback(struct mem_range *memp);
 
 #endif	/* _MEM_H_ */
Index: user/attilio/vmobj-rwlock/usr.sbin/bhyve/pci_emul.c
===================================================================
--- user/attilio/vmobj-rwlock/usr.sbin/bhyve/pci_emul.c	(revision 247191)
+++ user/attilio/vmobj-rwlock/usr.sbin/bhyve/pci_emul.c	(revision 247192)
@@ -1,1374 +1,1405 @@
 /*-
  * Copyright (c) 2011 NetApp, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/linker_set.h>
 
 #include <ctype.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <strings.h>
 #include <assert.h>
 
 #include <machine/vmm.h>
 #include <vmmapi.h>
 
 #include "bhyverun.h"
 #include "inout.h"
 #include "mem.h"
 #include "mptbl.h"
 #include "pci_emul.h"
 #include "ioapic.h"
 
 #define CONF1_ADDR_PORT    0x0cf8
 #define CONF1_DATA_PORT    0x0cfc
 
 #define	CFGWRITE(pi,off,val,b)						\
 do {									\
 	if ((b) == 1) {							\
 		pci_set_cfgdata8((pi),(off),(val));			\
 	} else if ((b) == 2) {						\
 		pci_set_cfgdata16((pi),(off),(val));			\
 	} else {							\
 		pci_set_cfgdata32((pi),(off),(val));			\
 	}								\
 } while (0)
 
 #define MAXSLOTS	(PCI_SLOTMAX + 1)
 #define	MAXFUNCS	(PCI_FUNCMAX + 1)
 
 static struct slotinfo {
 	char	*si_name;
 	char	*si_param;
 	struct pci_devinst *si_devi;
 	int	si_legacy;
 } pci_slotinfo[MAXSLOTS][MAXFUNCS];
 
 /*
  * Used to keep track of legacy interrupt owners/requestors
  */
 #define NLIRQ		16
 
 static struct lirqinfo {
 	int	li_generic;
 	int	li_acount;
 	struct pci_devinst *li_owner;	/* XXX should be a list */
 } lirq[NLIRQ];
 
 SET_DECLARE(pci_devemu_set, struct pci_devemu);
 
 static uint64_t pci_emul_iobase;
 static uint64_t pci_emul_membase32;
 static uint64_t pci_emul_membase64;
 
 #define	PCI_EMUL_IOBASE		0x2000
 #define	PCI_EMUL_IOLIMIT	0x10000
 
 #define	PCI_EMUL_MEMBASE32	(lomem_sz)
 #define	PCI_EMUL_MEMLIMIT32	0xE0000000		/* 3.5GB */
 
 #define	PCI_EMUL_MEMBASE64	0xD000000000UL
 #define	PCI_EMUL_MEMLIMIT64	0xFD00000000UL
 
 static int pci_emul_devices;
 
 /*
  * I/O access
  */
 
 /*
  * Slot options are in the form:
  *
  *  <slot>[:<func>],<emul>[,<config>]
  *
  *  slot is 0..31
  *  func is 0..7
  *  emul is a string describing the type of PCI device e.g. virtio-net
  *  config is an optional string, depending on the device, that can be
  *  used for configuration.
  *   Examples are:
  *     1,virtio-net,tap0
  *     3:0,dummy
  */
 static void
 pci_parse_slot_usage(char *aopt)
 {
 	printf("Invalid PCI slot info field \"%s\"\n", aopt);
 	free(aopt);
 }
 
 void
 pci_parse_slot(char *opt, int legacy)
 {
 	char *slot, *func, *emul, *config;
 	char *str, *cpy;
 	int snum, fnum;
 
 	str = cpy = strdup(opt);
 
 	config = NULL;
 
 	if (strchr(str, ':') != NULL) {
 		slot = strsep(&str, ":");
 		func = strsep(&str, ",");
 	} else {
 		slot = strsep(&str, ",");
 		func = NULL;
 	}
 
 	emul = strsep(&str, ",");
 	if (str != NULL) {
 		config = strsep(&str, ",");
 	}
 
 	if (emul == NULL) {
 		pci_parse_slot_usage(cpy);
 		return;
 	}
 
 	snum = atoi(slot);
 	fnum = func ? atoi(func) : 0;
 	if (snum < 0 || snum >= MAXSLOTS || fnum < 0 || fnum >= MAXFUNCS) {
 		pci_parse_slot_usage(cpy);
 	} else {
 		pci_slotinfo[snum][fnum].si_name = emul;
 		pci_slotinfo[snum][fnum].si_param = config;
 		pci_slotinfo[snum][fnum].si_legacy = legacy;
 	}
 }
 
 static int
 pci_valid_pba_offset(struct pci_devinst *pi, uint64_t offset)
 {
 
 	if (offset < pi->pi_msix.pba_offset)
 		return (0);
 
 	if (offset >= pi->pi_msix.pba_offset + pi->pi_msix.pba_size) {
 		return (0);
 	}
 
 	return (1);
 }
 
 int
 pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size,
 		     uint64_t value)
 {
 	int msix_entry_offset;
 	int tab_index;
 	char *dest;
 
 	/* support only 4 or 8 byte writes */
 	if (size != 4 && size != 8)
 		return (-1);
 
 	/*
 	 * Return if table index is beyond what device supports
 	 */
 	tab_index = offset / MSIX_TABLE_ENTRY_SIZE;
 	if (tab_index >= pi->pi_msix.table_count)
 		return (-1);
 
 	msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
 
 	/* support only aligned writes */
 	if ((msix_entry_offset % size) != 0)
 		return (-1);
 
 	dest = (char *)(pi->pi_msix.table + tab_index);
 	dest += msix_entry_offset;
 
 	if (size == 4)
 		*((uint32_t *)dest) = value;
 	else
 		*((uint64_t *)dest) = value;
 
 	return (0);
 }
 
 uint64_t
 pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size)
 {
 	char *dest;
 	int msix_entry_offset;
 	int tab_index;
 	uint64_t retval = ~0;
 
 	/* support only 4 or 8 byte reads */
 	if (size != 4 && size != 8)
 		return (retval);
 
 	msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
 
 	/* support only aligned reads */
 	if ((msix_entry_offset % size) != 0) {
 		return (retval);
 	}
 
 	tab_index = offset / MSIX_TABLE_ENTRY_SIZE;
 
 	if (tab_index < pi->pi_msix.table_count) {
 		/* valid MSI-X Table access */
 		dest = (char *)(pi->pi_msix.table + tab_index);
 		dest += msix_entry_offset;
 
 		if (size == 4)
 			retval = *((uint32_t *)dest);
 		else
 			retval = *((uint64_t *)dest);
 	} else if (pci_valid_pba_offset(pi, offset)) {
 		/* return 0 for PBA access */
 		retval = 0;
 	}
 
 	return (retval);
 }
 
 int
 pci_msix_table_bar(struct pci_devinst *pi)
 {
 
 	if (pi->pi_msix.table != NULL)
 		return (pi->pi_msix.table_bar);
 	else
 		return (-1);
 }
 
 int
 pci_msix_pba_bar(struct pci_devinst *pi)
 {
 
 	if (pi->pi_msix.table != NULL)
 		return (pi->pi_msix.pba_bar);
 	else
 		return (-1);
 }
 
 static int
 pci_emul_io_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
 		    uint32_t *eax, void *arg)
 {
 	struct pci_devinst *pdi = arg;
 	struct pci_devemu *pe = pdi->pi_d;
 	uint64_t offset;
 	int i;
 
 	for (i = 0; i <= PCI_BARMAX; i++) {
 		if (pdi->pi_bar[i].type == PCIBAR_IO &&
 		    port >= pdi->pi_bar[i].addr &&
 		    port + bytes <= pdi->pi_bar[i].addr + pdi->pi_bar[i].size) {
 			offset = port - pdi->pi_bar[i].addr;
 			if (in)
 				*eax = (*pe->pe_barread)(ctx, vcpu, pdi, i,
 							 offset, bytes);
 			else
 				(*pe->pe_barwrite)(ctx, vcpu, pdi, i, offset,
 						   bytes, *eax);
 			return (0);
 		}
 	}
 	return (-1);
 }
 
 static int
 pci_emul_mem_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr,
 		     int size, uint64_t *val, void *arg1, long arg2)
 {
 	struct pci_devinst *pdi = arg1;
 	struct pci_devemu *pe = pdi->pi_d;
 	uint64_t offset;
 	int bidx = (int) arg2;
 
 	assert(bidx <= PCI_BARMAX);
 	assert(pdi->pi_bar[bidx].type == PCIBAR_MEM32 ||
 	       pdi->pi_bar[bidx].type == PCIBAR_MEM64);
 	assert(addr >= pdi->pi_bar[bidx].addr &&
 	       addr + size <= pdi->pi_bar[bidx].addr + pdi->pi_bar[bidx].size);
 
 	offset = addr - pdi->pi_bar[bidx].addr;
 
 	if (dir == MEM_F_WRITE)
 		(*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset, size, *val);
 	else
 		*val = (*pe->pe_barread)(ctx, vcpu, pdi, bidx, offset, size);
 
 	return (0);
 }
 
 
 static int
 pci_emul_alloc_resource(uint64_t *baseptr, uint64_t limit, uint64_t size,
 			uint64_t *addr)
 {
 	uint64_t base;
 
 	assert((size & (size - 1)) == 0);	/* must be a power of 2 */
 
 	base = roundup2(*baseptr, size);
 
 	if (base + size <= limit) {
 		*addr = base;
 		*baseptr = base + size;
 		return (0);
 	} else
 		return (-1);
 }
 
 int
 pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, enum pcibar_type type,
 		   uint64_t size)
 {
 
 	return (pci_emul_alloc_pbar(pdi, idx, 0, type, size));
 }
 
 int
 pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx, uint64_t hostbase,
 		    enum pcibar_type type, uint64_t size)
 {
 	int i, error;
 	uint64_t *baseptr, limit, addr, mask, lobits, bar;
 	struct inout_port iop;
 	struct mem_range memp;
 
 	assert(idx >= 0 && idx <= PCI_BARMAX);
 
 	if ((size & (size - 1)) != 0)
 		size = 1UL << flsl(size);	/* round up to a power of 2 */
 
 	switch (type) {
 	case PCIBAR_NONE:
 		baseptr = NULL;
 		addr = mask = lobits = 0;
 		break;
 	case PCIBAR_IO:
 		if (hostbase &&
 		    pci_slotinfo[pdi->pi_slot][pdi->pi_func].si_legacy) {
 			assert(hostbase < PCI_EMUL_IOBASE);
 			baseptr = &hostbase;
 		} else {
 			baseptr = &pci_emul_iobase;
 		}
 		limit = PCI_EMUL_IOLIMIT;
 		mask = PCIM_BAR_IO_BASE;
 		lobits = PCIM_BAR_IO_SPACE;
 		break;
 	case PCIBAR_MEM64:
 		/*
 		 * XXX
 		 * Some drivers do not work well if the 64-bit BAR is allocated
 		 * above 4GB. Allow for this by allocating small requests under
 		 * 4GB unless then allocation size is larger than some arbitrary
 		 * number (32MB currently).
 		 */
 		if (size > 32 * 1024 * 1024) {
 			/*
 			 * XXX special case for device requiring peer-peer DMA
 			 */
 			if (size == 0x100000000UL)
 				baseptr = &hostbase;
 			else
 				baseptr = &pci_emul_membase64;
 			limit = PCI_EMUL_MEMLIMIT64;
 			mask = PCIM_BAR_MEM_BASE;
 			lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 |
 				 PCIM_BAR_MEM_PREFETCH;
 			break;
 		} else {
 			baseptr = &pci_emul_membase32;
 			limit = PCI_EMUL_MEMLIMIT32;
 			mask = PCIM_BAR_MEM_BASE;
 			lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64;
 		}
 		break;
 	case PCIBAR_MEM32:
 		baseptr = &pci_emul_membase32;
 		limit = PCI_EMUL_MEMLIMIT32;
 		mask = PCIM_BAR_MEM_BASE;
 		lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32;
 		break;
 	default:
 		printf("pci_emul_alloc_base: invalid bar type %d\n", type);
 		assert(0);
 	}
 
 	if (baseptr != NULL) {
 		error = pci_emul_alloc_resource(baseptr, limit, size, &addr);
 		if (error != 0)
 			return (error);
 	}
 
 	pdi->pi_bar[idx].type = type;
 	pdi->pi_bar[idx].addr = addr;
 	pdi->pi_bar[idx].size = size;
 
 	/* Initialize the BAR register in config space */
 	bar = (addr & mask) | lobits;
 	pci_set_cfgdata32(pdi, PCIR_BAR(idx), bar);
 
 	if (type == PCIBAR_MEM64) {
 		assert(idx + 1 <= PCI_BARMAX);
 		pdi->pi_bar[idx + 1].type = PCIBAR_MEMHI64;
 		pci_set_cfgdata32(pdi, PCIR_BAR(idx + 1), bar >> 32);
 	}
 	
 	/* add a handler to intercept accesses to the I/O bar */
 	if (type == PCIBAR_IO) {
 		iop.name = pdi->pi_name;
 		iop.flags = IOPORT_F_INOUT;
 		iop.handler = pci_emul_io_handler;
 		iop.arg = pdi;
 
 		for (i = 0; i < size; i++) {
 			iop.port = addr + i;
 			register_inout(&iop);
 		}
 	} else if (type == PCIBAR_MEM32 || type == PCIBAR_MEM64) {
 		/* add memory bar intercept handler */
 		memp.name = pdi->pi_name;
 		memp.flags = MEM_F_RW;
 		memp.base = addr;
 		memp.size = size;
 		memp.handler = pci_emul_mem_handler;
 		memp.arg1 = pdi;
 		memp.arg2 = idx;
 
 		error = register_mem(&memp);
 		assert(error == 0);
 	}
 
 	return (0);
 }
 
 #define	CAP_START_OFFSET	0x40
 static int
 pci_emul_add_capability(struct pci_devinst *pi, u_char *capdata, int caplen)
 {
 	int i, capoff, capid, reallen;
 	uint16_t sts;
 
 	static u_char endofcap[4] = {
 		PCIY_RESERVED, 0, 0, 0
 	};
 
 	assert(caplen > 0 && capdata[0] != PCIY_RESERVED);
 
 	reallen = roundup2(caplen, 4);		/* dword aligned */
 
 	sts = pci_get_cfgdata16(pi, PCIR_STATUS);
 	if ((sts & PCIM_STATUS_CAPPRESENT) == 0) {
 		capoff = CAP_START_OFFSET;
 		pci_set_cfgdata8(pi, PCIR_CAP_PTR, capoff);
 		pci_set_cfgdata16(pi, PCIR_STATUS, sts|PCIM_STATUS_CAPPRESENT);
 	} else {
 		capoff = pci_get_cfgdata8(pi, PCIR_CAP_PTR);
 		while (1) {
 			assert((capoff & 0x3) == 0);
 			capid = pci_get_cfgdata8(pi, capoff);
 			if (capid == PCIY_RESERVED)
 				break;
 			capoff = pci_get_cfgdata8(pi, capoff + 1);
 		}
 	}
 
 	/* Check if we have enough space */
 	if (capoff + reallen + sizeof(endofcap) > PCI_REGMAX + 1)
 		return (-1);
 
 	/* Copy the capability */
 	for (i = 0; i < caplen; i++)
 		pci_set_cfgdata8(pi, capoff + i, capdata[i]);
 
 	/* Set the next capability pointer */
 	pci_set_cfgdata8(pi, capoff + 1, capoff + reallen);
 
 	/* Copy of the reserved capability which serves as the end marker */
 	for (i = 0; i < sizeof(endofcap); i++)
 		pci_set_cfgdata8(pi, capoff + reallen + i, endofcap[i]);
 
 	return (0);
 }
 
 static struct pci_devemu *
 pci_emul_finddev(char *name)
 {
 	struct pci_devemu **pdpp, *pdp;
 
 	SET_FOREACH(pdpp, pci_devemu_set) {
 		pdp = *pdpp;
 		if (!strcmp(pdp->pe_emu, name)) {
 			return (pdp);
 		}
 	}
 
 	return (NULL);
 }
 
 static void
 pci_emul_init(struct vmctx *ctx, struct pci_devemu *pde, int slot, int func,
 	      char *params)
 {
 	struct pci_devinst *pdi;
 	pdi = malloc(sizeof(struct pci_devinst));
 	bzero(pdi, sizeof(*pdi));
 
 	pdi->pi_vmctx = ctx;
 	pdi->pi_bus = 0;
 	pdi->pi_slot = slot;
 	pdi->pi_func = func;
 	pdi->pi_d = pde;
 	snprintf(pdi->pi_name, PI_NAMESZ, "%s-pci-%d", pde->pe_emu, slot);
 
 	/* Disable legacy interrupts */
 	pci_set_cfgdata8(pdi, PCIR_INTLINE, 255);
 	pci_set_cfgdata8(pdi, PCIR_INTPIN, 0);
 
 	pci_set_cfgdata8(pdi, PCIR_COMMAND,
 		    PCIM_CMD_PORTEN | PCIM_CMD_MEMEN | PCIM_CMD_BUSMASTEREN);
 
 	if ((*pde->pe_init)(ctx, pdi, params) != 0) {
 		free(pdi);
 	} else {
 		pci_emul_devices++;
 		pci_slotinfo[slot][func].si_devi = pdi;
 	}	
 }
 
 void
 pci_populate_msicap(struct msicap *msicap, int msgnum, int nextptr)
 {
 	int mmc;
 
 	CTASSERT(sizeof(struct msicap) == 14);
 
 	/* Number of msi messages must be a power of 2 between 1 and 32 */
 	assert((msgnum & (msgnum - 1)) == 0 && msgnum >= 1 && msgnum <= 32);
 	mmc = ffs(msgnum) - 1;
 
 	bzero(msicap, sizeof(struct msicap));
 	msicap->capid = PCIY_MSI;
 	msicap->nextptr = nextptr;
 	msicap->msgctrl = PCIM_MSICTRL_64BIT | (mmc << 1);
 }
 
 int
 pci_emul_add_msicap(struct pci_devinst *pi, int msgnum)
 {
 	struct msicap msicap;
 
 	pci_populate_msicap(&msicap, msgnum, 0);
 
 	return (pci_emul_add_capability(pi, (u_char *)&msicap, sizeof(msicap)));
 }
 
 static void
 pci_populate_msixcap(struct msixcap *msixcap, int msgnum, int barnum,
 		     uint32_t msix_tab_size, int nextptr)
 {
 	CTASSERT(sizeof(struct msixcap) == 12);
 
 	assert(msix_tab_size % 4096 == 0);
 
 	bzero(msixcap, sizeof(struct msixcap));
 	msixcap->capid = PCIY_MSIX;
 	msixcap->nextptr = nextptr;
 
 	/*
 	 * Message Control Register, all fields set to
 	 * zero except for the Table Size.
 	 * Note: Table size N is encoded as N-1
 	 */
 	msixcap->msgctrl = msgnum - 1;
 
 	/*
 	 * MSI-X BAR setup:
 	 * - MSI-X table start at offset 0
 	 * - PBA table starts at a 4K aligned offset after the MSI-X table
 	 */
 	msixcap->table_info = barnum & PCIM_MSIX_BIR_MASK;
 	msixcap->pba_info = msix_tab_size | (barnum & PCIM_MSIX_BIR_MASK);
 }
 
 static void
 pci_msix_table_init(struct pci_devinst *pi, int table_entries)
 {
 	int i, table_size;
 
 	assert(table_entries > 0);
 	assert(table_entries <= MAX_MSIX_TABLE_ENTRIES);
 
 	table_size = table_entries * MSIX_TABLE_ENTRY_SIZE;
 	pi->pi_msix.table = malloc(table_size);
 	bzero(pi->pi_msix.table, table_size);
 
 	/* set mask bit of vector control register */
 	for (i = 0; i < table_entries; i++)
 		pi->pi_msix.table[i].vector_control |= PCIM_MSIX_VCTRL_MASK;
 }
 
 int
 pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum)
 {
 	uint16_t pba_index;
 	uint32_t tab_size;
 	struct msixcap msixcap;
 
 	assert(msgnum >= 1 && msgnum <= MAX_MSIX_TABLE_ENTRIES);
 	assert(barnum >= 0 && barnum <= PCIR_MAX_BAR_0);
 	
 	tab_size = msgnum * MSIX_TABLE_ENTRY_SIZE;
 
 	/* Align table size to nearest 4K */
 	tab_size = roundup2(tab_size, 4096);
 
 	pi->pi_msix.table_bar = barnum;
 	pi->pi_msix.pba_bar   = barnum;
 	pi->pi_msix.table_offset = 0;
 	pi->pi_msix.table_count = msgnum;
 	pi->pi_msix.pba_offset = tab_size;
 
 	/* calculate the MMIO size required for MSI-X PBA */
 	pba_index = (msgnum - 1) / (PBA_TABLE_ENTRY_SIZE * 8);
 	pi->pi_msix.pba_size = (pba_index + 1) * PBA_TABLE_ENTRY_SIZE;
 
 	pci_msix_table_init(pi, msgnum);
 
 	pci_populate_msixcap(&msixcap, msgnum, barnum, tab_size, 0);
 
 	/* allocate memory for MSI-X Table and PBA */
 	pci_emul_alloc_bar(pi, barnum, PCIBAR_MEM32,
 				tab_size + pi->pi_msix.pba_size);
 
 	return (pci_emul_add_capability(pi, (u_char *)&msixcap,
 					sizeof(msixcap)));
 }
 
 void
 msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
 		 int bytes, uint32_t val)
 {
 	uint16_t msgctrl, rwmask;
 	int off, table_bar;
 	
 	off = offset - capoff;
 	table_bar = pi->pi_msix.table_bar;
 	/* Message Control Register */
 	if (off == 2 && bytes == 2) {
 		rwmask = PCIM_MSIXCTRL_MSIX_ENABLE | PCIM_MSIXCTRL_FUNCTION_MASK;
 		msgctrl = pci_get_cfgdata16(pi, offset);
 		msgctrl &= ~rwmask;
 		msgctrl |= val & rwmask;
 		val = msgctrl;
 
 		pi->pi_msix.enabled = val & PCIM_MSIXCTRL_MSIX_ENABLE;
 		pi->pi_msix.function_mask = val & PCIM_MSIXCTRL_FUNCTION_MASK;
 	} 
 	
 	CFGWRITE(pi, offset, val, bytes);
 }
 
 void
 msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
 		int bytes, uint32_t val)
 {
 	uint16_t msgctrl, rwmask, msgdata, mme;
 	uint32_t addrlo;
 
 	/*
 	 * If guest is writing to the message control register make sure
 	 * we do not overwrite read-only fields.
 	 */
 	if ((offset - capoff) == 2 && bytes == 2) {
 		rwmask = PCIM_MSICTRL_MME_MASK | PCIM_MSICTRL_MSI_ENABLE;
 		msgctrl = pci_get_cfgdata16(pi, offset);
 		msgctrl &= ~rwmask;
 		msgctrl |= val & rwmask;
 		val = msgctrl;
 
 		addrlo = pci_get_cfgdata32(pi, capoff + 4);
 		if (msgctrl & PCIM_MSICTRL_64BIT)
 			msgdata = pci_get_cfgdata16(pi, capoff + 12);
 		else
 			msgdata = pci_get_cfgdata16(pi, capoff + 8);
 
 		/*
 		 * XXX check delivery mode, destination mode etc
 		 */
 		mme = msgctrl & PCIM_MSICTRL_MME_MASK;
 		pi->pi_msi.enabled = msgctrl & PCIM_MSICTRL_MSI_ENABLE ? 1 : 0;
 		if (pi->pi_msi.enabled) {
 			pi->pi_msi.cpu = (addrlo >> 12) & 0xff;
 			pi->pi_msi.vector = msgdata & 0xff;
 			pi->pi_msi.msgnum = 1 << (mme >> 4);
 		} else {
 			pi->pi_msi.cpu = 0;
 			pi->pi_msi.vector = 0;
 			pi->pi_msi.msgnum = 0;
 		}
 	}
 
 	CFGWRITE(pi, offset, val, bytes);
 }
 
 void
 pciecap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
 		 int bytes, uint32_t val)
 {
 
 	/* XXX don't write to the readonly parts */
 	CFGWRITE(pi, offset, val, bytes);
 }
 
 #define	PCIECAP_VERSION	0x2
 int
 pci_emul_add_pciecap(struct pci_devinst *pi, int type)
 {
 	int err;
 	struct pciecap pciecap;
 
 	CTASSERT(sizeof(struct pciecap) == 60);
 
 	if (type != PCIEM_TYPE_ROOT_PORT)
 		return (-1);
 
 	bzero(&pciecap, sizeof(pciecap));
 
 	pciecap.capid = PCIY_EXPRESS;
 	pciecap.pcie_capabilities = PCIECAP_VERSION | PCIEM_TYPE_ROOT_PORT;
 	pciecap.link_capabilities = 0x411;	/* gen1, x1 */
 	pciecap.link_status = 0x11;		/* gen1, x1 */
 
 	err = pci_emul_add_capability(pi, (u_char *)&pciecap, sizeof(pciecap));
 	return (err);
 }
 
 /*
  * This function assumes that 'coff' is in the capabilities region of the
  * config space.
  */
 static void
 pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes, uint32_t val)
 {
 	int capid;
 	uint8_t capoff, nextoff;
 
 	/* Do not allow un-aligned writes */
 	if ((offset & (bytes - 1)) != 0)
 		return;
 
 	/* Find the capability that we want to update */
 	capoff = CAP_START_OFFSET;
 	while (1) {
 		capid = pci_get_cfgdata8(pi, capoff);
 		if (capid == PCIY_RESERVED)
 			break;
 
 		nextoff = pci_get_cfgdata8(pi, capoff + 1);
 		if (offset >= capoff && offset < nextoff)
 			break;
 
 		capoff = nextoff;
 	}
 	assert(offset >= capoff);
 
 	/*
 	 * Capability ID and Next Capability Pointer are readonly
 	 */
 	if (offset == capoff || offset == capoff + 1)
 		return;
 
 	switch (capid) {
 	case PCIY_MSI:
 		msicap_cfgwrite(pi, capoff, offset, bytes, val);
 		break;
 	case PCIY_MSIX:
 		msixcap_cfgwrite(pi, capoff, offset, bytes, val);
 		break;
 	case PCIY_EXPRESS:
 		pciecap_cfgwrite(pi, capoff, offset, bytes, val);
 		break;
 	default:
 		break;
 	}
 }
 
 static int
 pci_emul_iscap(struct pci_devinst *pi, int offset)
 {
 	int found;
 	uint16_t sts;
 	uint8_t capid, lastoff;
 
 	found = 0;
 	sts = pci_get_cfgdata16(pi, PCIR_STATUS);
 	if ((sts & PCIM_STATUS_CAPPRESENT) != 0) {
 		lastoff = pci_get_cfgdata8(pi, PCIR_CAP_PTR);
 		while (1) {
 			assert((lastoff & 0x3) == 0);
 			capid = pci_get_cfgdata8(pi, lastoff);
 			if (capid == PCIY_RESERVED)
 				break;
 			lastoff = pci_get_cfgdata8(pi, lastoff + 1);
 		}
 		if (offset >= CAP_START_OFFSET && offset <= lastoff)
 			found = 1;
 	}
 	return (found);
 }
 
+static int
+pci_emul_fallback_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr,
+			  int size, uint64_t *val, void *arg1, long arg2)
+{
+	/*
+	 * Ignore writes; return 0xff's for reads. The mem read code
+	 * will take care of truncating to the correct size.
+	 */
+	if (dir == MEM_F_READ) {
+		*val = 0xffffffffffffffff;
+	}
+
+	return (0);
+}
+
 void
 init_pci(struct vmctx *ctx)
 {
+	struct mem_range memp;
 	struct pci_devemu *pde;
 	struct slotinfo *si;
 	int slot, func;
+	int error;
 
 	/* Reset the allocation cursors for I/O, 32-bit and 64-bit BAR space. */
 	pci_emul_iobase = PCI_EMUL_IOBASE;
 	pci_emul_membase32 = PCI_EMUL_MEMBASE32;
 	pci_emul_membase64 = PCI_EMUL_MEMBASE64;
 
 	/*
 	 * Initialize every slot/function that has a device name configured.
 	 * A name for which pci_emul_finddev() finds no emulation is skipped
 	 * without any diagnostic.
 	 */
 	for (slot = 0; slot < MAXSLOTS; slot++) {
 		for (func = 0; func < MAXFUNCS; func++) {
 			si = &pci_slotinfo[slot][func];
 			if (si->si_name != NULL) {
 				pde = pci_emul_finddev(si->si_name);
 				if (pde != NULL) {
 					pci_emul_init(ctx, pde, slot, func,
 						      si->si_param);
 				}
 			}
 		}
 	}
 
 	/*
 	 * Allow ISA IRQs 5,10,11,12, and 15 to be available for
 	 * generic use
 	 */
 	lirq[5].li_generic = 1;
 	lirq[10].li_generic = 1;
 	lirq[11].li_generic = 1;
 	lirq[12].li_generic = 1;
 	lirq[15].li_generic = 1;
+
+	/*
+	 * Setup the PCI hole to return 0xff's when accessed in a region
+	 * with no devices
+	 *
+	 * The fallback range starts at lomem_sz and ends at the 4GB
+	 * boundary. A registration failure is only caught by the assert
+	 * that follows.
+	 */
+	memset(&memp, 0, sizeof(struct mem_range));
+	memp.name = "PCI hole";
+	memp.flags = MEM_F_RW;
+	memp.base = lomem_sz;
+	memp.size = (4ULL * 1024 * 1024 * 1024) - lomem_sz;
+	memp.handler = pci_emul_fallback_handler;
+
+	error = register_mem_fallback(&memp);
+	assert(error == 0);
 }
 
 /*
  * Return the 'enabled' field of the device's MSI state.
  */
 int
 pci_msi_enabled(struct pci_devinst *pi)
 {
 	int enabled;
 
 	enabled = pi->pi_msi.enabled;
 	return (enabled);
 }
 
 /*
  * Return the device's MSI message count, or 0 while MSI is not enabled.
  */
 int
 pci_msi_msgnum(struct pci_devinst *pi)
 {
 
 	if (!pi->pi_msi.enabled)
 		return (0);
 	return (pi->pi_msi.msgnum);
 }
 
 /*
  * Return 1 if MSI-X is enabled for the device and MSI is not; 0 otherwise.
  * MSI-X is therefore reported as disabled whenever MSI is enabled.
  */
 int
 pci_msix_enabled(struct pci_devinst *pi)
 {
 
 	if (!pi->pi_msix.enabled)
 		return (0);
 	if (pi->pi_msi.enabled)
 		return (0);
 	return (1);
 }
 
 /*
  * Deliver the MSI-X interrupt described by table entry 'index' through
  * vm_lapic_irq(). Nothing is delivered when MSI-X is not enabled, when the
  * function mask is set, when 'index' is at or beyond the table count, or
  * when the entry's vector-control mask bit is set.
  */
 void
 pci_generate_msix(struct pci_devinst *pi, int index)
 {
 	struct msix_table_entry *entry;
 
 	if (!pci_msix_enabled(pi) || pi->pi_msix.function_mask ||
 	    index >= pi->pi_msix.table_count)
 		return;
 
 	entry = &pi->pi_msix.table[index];
 	if ((entry->vector_control & PCIM_MSIX_VCTRL_MASK) != 0)
 		return;
 
 	/* XXX Set PBA bit if interrupt is disabled */
 	vm_lapic_irq(pi->pi_vmctx, (entry->addr >> 12) & 0xff,
 		     entry->msg_data & 0xff);
 }
 
 /*
  * Deliver MSI message 'msg' through vm_lapic_irq(), using the stored cpu
  * and 'vector + msg'. Nothing is delivered when MSI is not enabled or when
  * 'msg' is not below the device's message count.
  */
 void
 pci_generate_msi(struct pci_devinst *pi, int msg)
 {
 
 	if (!pci_msi_enabled(pi) || msg >= pci_msi_msgnum(pi))
 		return;
 
 	vm_lapic_irq(pi->pi_vmctx, pi->pi_msi.cpu, pi->pi_msi.vector + msg);
 }
 
 int
 pci_is_legacy(struct pci_devinst *pi)
 {
 
 	return (pci_slotinfo[pi->pi_slot][pi->pi_func].si_legacy);
 }
 
 static int
 pci_lintr_alloc(struct pci_devinst *pi, int vec)
 {
 	int i;
 
 	assert(vec < NLIRQ);
 
 	if (vec == -1) {
 		for (i = 0; i < NLIRQ; i++) {
 			if (lirq[i].li_generic &&
 			    lirq[i].li_owner == NULL) {
 				vec = i;
 				break;
 			}
 		}
 	} else {
 		if (lirq[vec].li_owner != NULL) {
 			vec = -1;
 		}
 	}
 	assert(vec != -1);
 
 	lirq[vec].li_owner = pi;
 	pi->pi_lintr_pin = vec;
 
 	return (vec);
 }
 
 /*
  * Allocate a legacy interrupt line for 'pi' (see pci_lintr_alloc() for the
  * meaning of 'vec') and publish it in the PCIR_INTLINE config register;
  * PCIR_INTPIN is set to 1. Always returns 0.
  */
 int
 pci_lintr_request(struct pci_devinst *pi, int vec)
 {
 	int irq;
 
 	irq = pci_lintr_alloc(pi, vec);
 	pci_set_cfgdata8(pi, PCIR_INTLINE, irq);
 	pci_set_cfgdata8(pi, PCIR_INTPIN, 1);
 
 	return (0);
 }
 
 /*
  * Assert the ioapic pin recorded in pi_lintr_pin for this device. The pin
  * must be non-zero.
  */
 void
 pci_lintr_assert(struct pci_devinst *pi)
 {
 
 	assert(pi->pi_lintr_pin != 0);
 
 	ioapic_assert_pin(pi->pi_vmctx, pi->pi_lintr_pin);
 }
 
 /*
  * Deassert the ioapic pin recorded in pi_lintr_pin for this device. The
  * pin must be non-zero.
  */
 void
 pci_lintr_deassert(struct pci_devinst *pi)
 {
 
 	assert(pi->pi_lintr_pin != 0);
 
 	ioapic_deassert_pin(pi->pi_vmctx, pi->pi_lintr_pin);
 }
 
 /*
  * Tell whether 'slot' holds a multi-function device: return 1 when more
  * than one of its functions has a device instance attached, 0 otherwise.
  */
 static int
 pci_emul_is_mfdev(int slot)
 {
 	int func, present;
 
 	present = 0;
 	for (func = 0; func < MAXFUNCS; func++)
 		if (pci_slotinfo[slot][func].si_devi != NULL)
 			present++;
 
 	return (present > 1);
 }
 
 /*
  * Make the PCIM_MFDEV bit in a config read result reflect whether a
  * multi-function device is being emulated in pci 'slot'. '*rv' is only
  * modified when the access [off, off + bytes) covers PCIR_HDRTYPE and
  * 'bytes' is 1, 2 or 4.
  */
 static void
 pci_emul_hdrtype_fixup(int slot, int off, int bytes, uint32_t *rv)
 {
 	uint32_t mfbit;
 
 	/* Accesses that do not include the header type byte are left alone. */
 	if (off > PCIR_HDRTYPE || off + bytes <= PCIR_HDRTYPE)
 		return;
 
 	/* Position of the multi-function bit within the returned value. */
 	switch (bytes) {
 	case 1:
 	case 2:
 		mfbit = PCIM_MFDEV;
 		break;
 	case 4:
 		mfbit = PCIM_MFDEV << 16;
 		break;
 	default:
 		return;
 	}
 
 	if (pci_emul_is_mfdev(slot))
 		*rv |= mfbit;
 	else
 		*rv &= ~mfbit;
 }
 
 static int cfgbus, cfgslot, cfgfunc, cfgoff;
 
 /*
  * I/O port handler for writes to CONF1_ADDR_PORT. A 4-byte write is split
  * into bus, slot, function and register offset, which are remembered in
  * the cfg* variables above. Any other access size is rejected with -1.
  */
 static int
 pci_emul_cfgaddr(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
 		 uint32_t *eax, void *arg)
 {
 	uint32_t addr;
 
 	assert(!in);
 
 	if (bytes != 4)
 		return (-1);
 
 	addr = *eax;
 	cfgbus = (addr >> 16) & PCI_BUSMAX;
 	cfgslot = (addr >> 11) & PCI_SLOTMAX;
 	cfgfunc = (addr >> 8) & PCI_FUNCMAX;
 	cfgoff = addr & PCI_REGMAX;
 
 	return (0);
 }
 INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_OUT, pci_emul_cfgaddr);
 
 /*
  * I/O port handler for the four config data ports (CONF1_DATA_PORT+0..3).
  * Performs a config-space read ('in' non-zero) or write of 'bytes' bytes
  * on the bus/slot/function/offset remembered in cfgbus, cfgslot, cfgfunc
  * and cfgoff. Every path returns 0.
  */
 static int
 pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
 		 uint32_t *eax, void *arg)
 {
 	struct pci_devinst *pi;
 	struct pci_devemu *pe;
 	int coff, idx, needcfg;
 	uint64_t mask, bar;
 
 	assert(bytes == 1 || bytes == 2 || bytes == 4);
 	
 	/* Only bus 0 is looked up; any other bus behaves as "no device". */
 	if (cfgbus == 0)
 		pi = pci_slotinfo[cfgslot][cfgfunc].si_devi;
 	else
 		pi = NULL;
 
 	/* Data ports +1..+3 address the bytes that follow cfgoff. */
 	coff = cfgoff + (port - CONF1_DATA_PORT);
 
 #if 0
 	printf("pcicfg-%s from 0x%0x of %d bytes (%d/%d/%d)\n\r",
 		in ? "read" : "write", coff, bytes, cfgbus, cfgslot, cfgfunc);
 #endif
 
 	/*
 	 * Just return if there is no device at this cfgslot:cfgfunc or
 	 * if the guest is doing an un-aligned access
 	 */
 	if (pi == NULL || (coff & (bytes - 1)) != 0) {
 		if (in)
 			*eax = 0xffffffff;
 		return (0);
 	}
 
 	pe = pi->pi_d;
 
 	/*
 	 * Config read
 	 */
 	if (in) {
 		/* Let the device emulation override the default handler */
 		if (pe->pe_cfgread != NULL) {
 			needcfg = pe->pe_cfgread(ctx, vcpu, pi,
 						    coff, bytes, eax);
 		} else {
 			needcfg = 1;
 		}
 
 		/*
 		 * A non-zero result from pe_cfgread (or no pe_cfgread at
 		 * all) means the value comes from the stored config bytes.
 		 */
 		if (needcfg) {
 			if (bytes == 1)
 				*eax = pci_get_cfgdata8(pi, coff);
 			else if (bytes == 2)
 				*eax = pci_get_cfgdata16(pi, coff);
 			else
 				*eax = pci_get_cfgdata32(pi, coff);
 		}
 
 		pci_emul_hdrtype_fixup(cfgslot, coff, bytes, eax);
 	} else {
 		/* Let the device emulation override the default handler */
 		/* A return of 0 from pe_cfgwrite ends the write here. */
 		if (pe->pe_cfgwrite != NULL &&
 		    (*pe->pe_cfgwrite)(ctx, vcpu, pi, coff, bytes, *eax) == 0)
 			return (0);
 
 		/*
 		 * Special handling for write to BAR registers
 		 */
 		if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) {
 			/*
 			 * Ignore writes to BAR registers that are not
 			 * 4-byte aligned.
 			 */
 			if (bytes != 4 || (coff & 0x3) != 0)
 				return (0);
 			idx = (coff - PCIR_BAR(0)) / 4;
 			/*
 			 * Keep only the address bits above the BAR size and
 			 * re-apply the type bits for this kind of BAR.
 			 */
 			switch (pi->pi_bar[idx].type) {
 			case PCIBAR_NONE:
 				bar = 0;
 				break;
 			case PCIBAR_IO:
 				mask = ~(pi->pi_bar[idx].size - 1);
 				mask &= PCIM_BAR_IO_BASE;
 				bar = (*eax & mask) | PCIM_BAR_IO_SPACE;
 				break;
 			case PCIBAR_MEM32:
 				mask = ~(pi->pi_bar[idx].size - 1);
 				mask &= PCIM_BAR_MEM_BASE;
 				bar = *eax & mask;
 				bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32;
 				break;
 			case PCIBAR_MEM64:
 				mask = ~(pi->pi_bar[idx].size - 1);
 				mask &= PCIM_BAR_MEM_BASE;
 				bar = *eax & mask;
 				bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 |
 				       PCIM_BAR_MEM_PREFETCH;
 				break;
 			case PCIBAR_MEMHI64:
 				/*
 				 * The size is taken from the preceding BAR
 				 * entry (presumably the low half of the same
 				 * 64-bit BAR); the written value is treated
 				 * as bits 63:32 while masking.
 				 */
 				mask = ~(pi->pi_bar[idx - 1].size - 1);
 				mask &= PCIM_BAR_MEM_BASE;
 				bar = ((uint64_t)*eax << 32) & mask;
 				bar = bar >> 32;
 				break;
 			default:
 				assert(0);
 			}
 			pci_set_cfgdata32(pi, coff, bar);
 
 		} else if (pci_emul_iscap(pi, coff)) {
 			pci_emul_capwrite(pi, coff, bytes, *eax);
 		} else {
 			CFGWRITE(pi, coff, *eax, bytes);
 		}
 	}
 
 	return (0);
 }
 
 INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+0, IOPORT_F_INOUT, pci_emul_cfgdata);
 INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+1, IOPORT_F_INOUT, pci_emul_cfgdata);
 INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+2, IOPORT_F_INOUT, pci_emul_cfgdata);
 INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+3, IOPORT_F_INOUT, pci_emul_cfgdata);
 
 /*
  * Handler for the PCI IRQ routing I/O ports (0xC00 and 0xC01). Both ports
  * are registered for output only, and every write is accepted and
  * discarded.
  */
 static int
 pci_irq_port_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
 		     uint32_t *eax, void *arg)
 {
 
 	assert(!in);
 
 	return (0);
 }
 INOUT_PORT(pci_irq, 0xC00, IOPORT_F_OUT, pci_irq_port_handler);
 INOUT_PORT(pci_irq, 0xC01, IOPORT_F_OUT, pci_irq_port_handler);
 
 #define PCI_EMUL_TEST
 #ifdef PCI_EMUL_TEST
 /*
  * Dummy test device: sizes and per-instance backing store for its two
  * register areas.
  */
 #define DIOSZ	20	/* bytes of I/O register storage */
 #define DMEMSZ	4096	/* bytes of memory register storage */
 struct pci_emul_dsoftc {
 	uint8_t	ioregs[DIOSZ];		/* accessed with baridx 0 */
 	uint8_t	memregs[DMEMSZ];	/* accessed with baridx 1 */
 };
 
 #define	PCI_EMUL_MSI_MSGS	 4
 #define	PCI_EMUL_MSIX_MSGS	16
 
 /*
  * Initialize an instance of the dummy test device: allocate its zeroed
  * soft state, fill in the device/vendor/class config registers, add an
  * MSI capability with PCI_EMUL_MSI_MSGS messages, and allocate an I/O BAR
  * (BAR 0, DIOSZ bytes) and a 32-bit memory BAR (BAR 1, DMEMSZ bytes).
  *
  * Returns 0 on success and -1 if the soft state cannot be allocated.
  * Failures of the capability/BAR setup trip an assertion. 'ctx' and
  * 'opts' are unused.
  */
 static int
 pci_emul_dinit(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
 {
 	int error;
 	struct pci_emul_dsoftc *sc;
 
 	/*
 	 * calloc() hands back zeroed storage in one step. Bail out rather
 	 * than write through a NULL pointer when the allocation fails.
 	 * NOTE(review): assumes the caller treats a non-zero return as an
 	 * initialization failure - confirm against pci_emul_init().
 	 */
 	sc = calloc(1, sizeof(struct pci_emul_dsoftc));
 	if (sc == NULL)
 		return (-1);
 
 	pi->pi_arg = sc;
 
 	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0001);
 	pci_set_cfgdata16(pi, PCIR_VENDOR, 0x10DD);
 	pci_set_cfgdata8(pi, PCIR_CLASS, 0x02);
 
 	error = pci_emul_add_msicap(pi, PCI_EMUL_MSI_MSGS);
 	assert(error == 0);
 
 	error = pci_emul_alloc_bar(pi, 0, PCIBAR_IO, DIOSZ);
 	assert(error == 0);
 
 	error = pci_emul_alloc_bar(pi, 1, PCIBAR_MEM32, DMEMSZ);
 	assert(error == 0);
 
 	return (0);
 }
 
 /*
  * BAR write handler of the dummy test device. Writes with baridx 0 land in
  * 'ioregs' (sizes 1, 2 and 4) and writes with baridx 1 land in 'memregs'
  * (sizes 1, 2, 4 and 8); accesses that run past the end of the area are
  * reported and dropped. Two I/O writes additionally raise MSIs: a 4-byte
  * write at offset 4 while MSI is enabled, and any write of 0xabcdef.
  */
 static void
 pci_emul_diow(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
 	      uint64_t offset, int size, uint64_t value)
 {
 	struct pci_emul_dsoftc *sc = pi->pi_arg;
 	int msg;
 
 	if (baridx == 0) {
 		if (offset + size > DIOSZ) {
 			printf("diow: iow too large, offset %ld size %d\n",
 			       offset, size);
 			return;
 		}
 
 		switch (size) {
 		case 1:
 			sc->ioregs[offset] = value & 0xff;
 			break;
 		case 2:
 			*(uint16_t *)&sc->ioregs[offset] = value & 0xffff;
 			break;
 		case 4:
 			*(uint32_t *)&sc->ioregs[offset] = value;
 			break;
 		default:
 			printf("diow: iow unknown size %d\n", size);
 			break;
 		}
 
 		/*
 		 * A 4-byte write to offset 4 fires the single MSI selected
 		 * by the value written.
 		 */
 		if (offset == 4 && size == 4 && pci_msi_enabled(pi))
 			pci_generate_msi(pi, value % pci_msi_msgnum(pi));
 
 		/* The magic value fires every MSI message in turn. */
 		if (value == 0xabcdef) {
 			for (msg = 0; msg < pci_msi_msgnum(pi); msg++)
 				pci_generate_msi(pi, msg);
 		}
 	} else if (baridx == 1) {
 		if (offset + size > DMEMSZ) {
 			printf("diow: memw too large, offset %ld size %d\n",
 			       offset, size);
 			return;
 		}
 
 		switch (size) {
 		case 1:
 			sc->memregs[offset] = value;
 			break;
 		case 2:
 			*(uint16_t *)&sc->memregs[offset] = value;
 			break;
 		case 4:
 			*(uint32_t *)&sc->memregs[offset] = value;
 			break;
 		case 8:
 			*(uint64_t *)&sc->memregs[offset] = value;
 			break;
 		default:
 			printf("diow: memw unknown size %d\n", size);
 			break;
 		}
 
 		/*
 		 * magic interrupt ??
 		 */
 	} else if (baridx > 1) {
 		printf("diow: unknown bar idx %d\n", baridx);
 	}
 }
 
 /*
  * BAR read handler of the dummy test device. Reads with baridx 0 come from
  * 'ioregs' (sizes 1, 2 and 4) and reads with baridx 1 come from 'memregs'
  * (sizes 1, 2, 4 and 8). Returns the value read, or 0 when the access runs
  * past the end of the area, the size is not supported, or the BAR index is
  * not one of the two above (the first three cases are also reported on
  * stdout).
  */
 static uint64_t
 pci_emul_dior(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
 	      uint64_t offset, int size)
 {
 	struct pci_emul_dsoftc *sc = pi->pi_arg;
 	uint64_t value;
 
 	/*
 	 * 64 bits wide so that 8-byte reads are not truncated, and preset
 	 * so that the paths which read nothing return a defined value.
 	 */
 	value = 0;
 
 	if (baridx == 0) {
 		if (offset + size > DIOSZ) {
 			printf("dior: ior too large, offset %ld size %d\n",
 			       offset, size);
 			return (0);
 		}
 
 		if (size == 1) {
 			value = sc->ioregs[offset];
 		} else if (size == 2) {
 			value = *(uint16_t *) &sc->ioregs[offset];
 		} else if (size == 4) {
 			value = *(uint32_t *) &sc->ioregs[offset];
 		} else {
 			printf("dior: ior unknown size %d\n", size);
 		}
 	}
 
 	if (baridx == 1) {
 		if (offset + size > DMEMSZ) {
 			printf("dior: memr too large, offset %ld size %d\n",
 			       offset, size);
 			return (0);
 		}
 
 		if (size == 1) {
 			value = sc->memregs[offset];
 		} else if (size == 2) {
 			value = *(uint16_t *) &sc->memregs[offset];
 		} else if (size == 4) {
 			value = *(uint32_t *) &sc->memregs[offset];
 		} else if (size == 8) {
 			value = *(uint64_t *) &sc->memregs[offset];
 		} else {
 			printf("dior: memr unknown size %d\n", size);
 		}
 	}
 
 	if (baridx > 1) {
 		printf("dior: unknown bar idx %d\n", baridx);
 		return (0);
 	}
 
 	return (value);
 }
 
 /*
  * Device emulation descriptor for the dummy test device, registered with
  * PCI_EMUL_SET(). Only the init and BAR read/write hooks are provided.
  * NOTE(review): presumably looked up by its "dummy" name through
  * pci_emul_finddev() - confirm.
  */
 struct pci_devemu pci_dummy = {
 	.pe_emu = "dummy",
 	.pe_init = pci_emul_dinit,
 	.pe_barwrite = pci_emul_diow,
 	.pe_barread = pci_emul_dior
 };
 PCI_EMUL_SET(pci_dummy);
 
 #endif /* PCI_EMUL_TEST */
Index: user/attilio/vmobj-rwlock/usr.sbin/bhyve
===================================================================
--- user/attilio/vmobj-rwlock/usr.sbin/bhyve	(revision 247191)
+++ user/attilio/vmobj-rwlock/usr.sbin/bhyve	(revision 247192)

Property changes on: user/attilio/vmobj-rwlock/usr.sbin/bhyve
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head/usr.sbin/bhyve:r247016-247191
Index: user/attilio/vmobj-rwlock/usr.sbin/extattr/rmextattr.c
===================================================================
--- user/attilio/vmobj-rwlock/usr.sbin/extattr/rmextattr.c	(revision 247191)
+++ user/attilio/vmobj-rwlock/usr.sbin/extattr/rmextattr.c	(revision 247192)
@@ -1,289 +1,290 @@
 /*-
  * Copyright (c) 2002, 2003 Networks Associates Technology, Inc.
  * Copyright (c) 2002 Poul-Henning Kamp.
  * Copyright (c) 1999, 2000, 2001, 2002 Robert N. M. Watson
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Poul-Henning
  * Kamp and Network Associates Laboratories, the Security Research Division
  * of Network Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035
  * ("CBOSS"), as part of the DARPA CHATS research program
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The names of the authors may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/types.h>
 #include <sys/uio.h>
 #include <sys/extattr.h>
 
 #include <libgen.h>
 #include <libutil.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 #include <vis.h>
 #include <err.h>
 #include <errno.h>
 
 static enum { EADUNNO, EAGET, EASET, EARM, EALS } what = EADUNNO;
 
 /*
  * Print the usage message that matches the operation selected in 'what'
  * (or the generic one when no operation is known) to stderr, then exit
  * with status -1.
  */
 static void __dead2
 usage(void)
 {
 	const char *synopsis, *operands;
 
 	switch (what) {
 	case EAGET:
 		synopsis = "usage: getextattr [-fhqsx] attrnamespace";
 		operands = " attrname filename ...\n";
 		break;
 	case EASET:
 		synopsis = "usage: setextattr [-fhnq] attrnamespace";
 		operands = " attrname attrvalue filename ...\n";
 		break;
 	case EARM:
 		synopsis = "usage: rmextattr [-fhq] attrnamespace";
 		operands = " attrname filename ...\n";
 		break;
 	case EALS:
 		synopsis = "usage: lsextattr [-fhq] attrnamespace";
 		operands = " filename ...\n";
 		break;
 	case EADUNNO:
 	default:
 		synopsis = "usage: (getextattr|lsextattr|rmextattr";
 		operands = "|setextattr)\n";
 		break;
 	}
 
 	fprintf(stderr, "%s", synopsis);
 	fprintf(stderr, "%s", operands);
 	exit(-1);
 }
 
 /*
  * Make sure '*buf' can hold at least 'newlen' bytes. When the recorded
  * size '*oldlen' is already sufficient nothing happens; otherwise the old
  * buffer is freed (its contents are not preserved), a new one is
  * allocated and '*oldlen' is updated. Exits through err() when the
  * allocation fails.
  */
 static void
 mkbuf(char **buf, int *oldlen, int newlen)
 {
 	char *fresh;
 
 	if (newlen <= *oldlen)
 		return;
 
 	free(*buf);
 	fresh = malloc(newlen);
 	if (fresh == NULL)
 		err(1, "malloc");
 
 	*buf = fresh;
 	*oldlen = newlen;
 }
 
 /*
  * Common entry point of getextattr, setextattr, rmextattr and lsextattr.
  * The operation is chosen from the name the program was invoked under;
  * after option parsing the arguments are the attribute namespace, the
  * attribute name (not for lsextattr), the attribute value (setextattr
  * only) and then the files to operate on.
  *
  * Returns 0 when every file was processed, or 1 at the first failing file
  * unless -f was given, in which case failures are skipped.
  */
 int
 main(int argc, char *argv[])
 {
 	char	*buf, *visbuf, *p;
 
 	const char *options, *attrname;
 	int	 buflen, visbuflen, ch, error, i, arg_counter, attrnamespace,
 		 minargc;
 
 	int	flag_force = 0;
 	int	flag_nofollow = 0;
 	int	flag_null = 0;
 	int	flag_quiet = 0;
 	int	flag_string = 0;
 	int	flag_hex = 0;
 
 	visbuflen = buflen = 0;
 	visbuf = buf = NULL;
 
 	/* The invocation name selects the operation and its option set. */
 	p = basename(argv[0]);
 	if (p == NULL)
 		p = argv[0];
 	if (!strcmp(p, "getextattr")) {
 		what = EAGET;
 		options = "fhqsx";
 		minargc = 3;
 	} else if (!strcmp(p, "setextattr")) {
 		what = EASET;
 		options = "fhnq";
 		minargc = 4;
 	} else if (!strcmp(p, "rmextattr")) {
 		what = EARM;
 		options = "fhq";
 		minargc = 3;
 	} else if (!strcmp(p, "lsextattr")) {
 		what = EALS;
 		options = "fhq";
 		minargc = 2;
 	} else {
 		usage();
 	}
 
 	while ((ch = getopt(argc, argv, options)) != -1) {
 		switch (ch) {
 		case 'f':
 			flag_force = 1;
 			break;
 		case 'h':
 			flag_nofollow = 1;
 			break;
 		case 'n':
 			flag_null = 1;
 			break;
 		case 'q':
 			flag_quiet = 1;
 			break;
 		case 's':
 			flag_string = 1;
 			break;
 		case 'x':
 			flag_hex = 1;
 			break;
 		case '?':
 		default:
 			usage();
 		}
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	if (argc < minargc)
 		usage();
 
 	error = extattr_string_to_namespace(argv[0], &attrnamespace);
 	if (error)
 		err(-1, "%s", argv[0]);
 	argc--; argv++;
 
 	if (what != EALS) {
 		attrname = argv[0];
 		argc--; argv++;
 	} else
 		attrname = NULL;
 
 	/* For setextattr, 'buf' holds a copy of the value to store. */
 	if (what == EASET) {
 		mkbuf(&buf, &buflen, strlen(argv[0]) + 1);
 		strcpy(buf, argv[0]);
 		argc--; argv++;
 	}
 
 	/*
 	 * Process each remaining file. A case that succeeds 'continue's to
 	 * the next file; a 'break' out of the switch means failure and falls
 	 * through to the error handling at the bottom of the loop.
 	 */
 	for (arg_counter = 0; arg_counter < argc; arg_counter++) {
 		switch (what) {
 		case EARM:
 			if (flag_nofollow)
 				error = extattr_delete_link(argv[arg_counter],
 				    attrnamespace, attrname);
 			else
 				error = extattr_delete_file(argv[arg_counter],
 				    attrnamespace, attrname);
 			if (error >= 0)
 				continue;
 			break;
 		case EASET:
 			/* -n stores the terminating NUL along with the value. */
 			if (flag_nofollow)
 				error = extattr_set_link(argv[arg_counter],
 				    attrnamespace, attrname, buf,
 				    strlen(buf) + flag_null);
 			else
 				error = extattr_set_file(argv[arg_counter],
 				    attrnamespace, attrname, buf,
 				    strlen(buf) + flag_null);
 			if (error >= 0)
 				continue;
 			break;
 		case EALS:
 			/* First call sizes the list, second call fetches it. */
 			if (flag_nofollow)
 				error = extattr_list_link(argv[arg_counter],
 				    attrnamespace, NULL, 0);
 			else
 				error = extattr_list_file(argv[arg_counter],
 				    attrnamespace, NULL, 0);
 			if (error < 0)
 				break;
 			mkbuf(&buf, &buflen, error);
 			if (flag_nofollow)
 				error = extattr_list_link(argv[arg_counter],
 				    attrnamespace, buf, buflen);
 			else
 				error = extattr_list_file(argv[arg_counter],
 				    attrnamespace, buf, buflen);
 			if (error < 0)
 				break;
 			if (!flag_quiet)
 				printf("%s\t", argv[arg_counter]);
 			for (i = 0; i < error; i += ch + 1) {
 			    /* The attribute name length is unsigned. */
 			    ch = (unsigned char)buf[i];
 			    printf("%s%*.*s", i ? "\t" : "",
 				ch, ch, buf + i + 1);
 			}
-			printf("\n");
+			if (!flag_quiet || error > 0)
+				printf("\n");
 			continue;
 		case EAGET:
 			/* First call sizes the value, second call fetches it. */
 			if (flag_nofollow)
 				error = extattr_get_link(argv[arg_counter],
 				    attrnamespace, attrname, NULL, 0);
 			else
 				error = extattr_get_file(argv[arg_counter],
 				    attrnamespace, attrname, NULL, 0);
 			if (error < 0)
 				break;
 			mkbuf(&buf, &buflen, error);
 			if (flag_nofollow)
 				error = extattr_get_link(argv[arg_counter],
 				    attrnamespace, attrname, buf, buflen);
 			else
 				error = extattr_get_file(argv[arg_counter],
 				    attrnamespace, attrname, buf, buflen);
 			if (error < 0)
 				break;
 			if (!flag_quiet)
 				printf("%s\t", argv[arg_counter]);
 			if (flag_string) {
 				mkbuf(&visbuf, &visbuflen, error * 4 + 1);
 				strvisx(visbuf, buf, error,
 				    VIS_SAFE | VIS_WHITE);
 				printf("\"%s\"\n", visbuf);
 				continue;
 			} else if (flag_hex) {
 				/*
 				 * Print each byte as unsigned: a plain char
 				 * holding a value >= 0x80 may be negative
 				 * and would be sign-extended when promoted
 				 * to int, printing as ffffffXX.
 				 */
 				for (i = 0; i < error; i++)
 					printf("%s%02x", i ? " " : "",
 					    (unsigned char)buf[i]);
 				printf("\n");
 				continue;
 			} else {
 				fwrite(buf, error, 1, stdout);
 				printf("\n");
 				continue;
 			}
 		default:
 			break;
 		}
 		/* Only reached when the operation on this file failed. */
 		if (!flag_quiet) 
 			warn("%s: failed", argv[arg_counter]);
 		if (flag_force)
 			continue;
 		return(1);
 	}
 	return (0);
 }
Index: user/attilio/vmobj-rwlock
===================================================================
--- user/attilio/vmobj-rwlock	(revision 247191)
+++ user/attilio/vmobj-rwlock	(revision 247192)

Property changes on: user/attilio/vmobj-rwlock
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head:r247139-247191