Index: user/ae/inet6/contrib/llvm/patches/patch-r271432-clang-r205331-debug-info-crash.diff
===================================================================
--- user/ae/inet6/contrib/llvm/patches/patch-r271432-clang-r205331-debug-info-crash.diff	(nonexistent)
+++ user/ae/inet6/contrib/llvm/patches/patch-r271432-clang-r205331-debug-info-crash.diff	(revision 271453)
@@ -0,0 +1,46 @@
+commit 96365aef99ec463375dfdaf6eb260823e0477b6a
+Author: Adrian Prantl <aprantl@apple.com>
+Date:   Tue Apr 1 17:52:06 2014 +0000
+
+    Debug info: fix a crash when emitting IndirectFieldDecls, which were
+    previously not handled at all.
+    rdar://problem/16348575
+    
+    git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@205331 91177308-0d34-0410-b5e6-96231b3b80d8
+
+diff --git a/lib/CodeGen/CGDebugInfo.cpp b/lib/CodeGen/CGDebugInfo.cpp
+index 82db942..2556cf9 100644
+--- tools/clang/lib/CodeGen/CGDebugInfo.cpp
++++ tools/clang/lib/CodeGen/CGDebugInfo.cpp
+@@ -1252,7 +1252,7 @@ CollectTemplateParams(const TemplateParameterList *TPList,
+         V = CGM.GetAddrOfFunction(FD);
+       // Member data pointers have special handling too to compute the fixed
+       // offset within the object.
+-      if (isa<FieldDecl>(D)) {
++      if (isa<FieldDecl>(D) || isa<IndirectFieldDecl>(D)) {
+         // These five lines (& possibly the above member function pointer
+         // handling) might be able to be refactored to use similar code in
+         // CodeGenModule::getMemberPointerConstant
+diff --git a/test/CodeGenCXX/debug-info-indirect-field-decl.cpp b/test/CodeGenCXX/debug-info-indirect-field-decl.cpp
+new file mode 100644
+index 0000000..131ceba
+--- /dev/null
++++ tools/clang/test/CodeGenCXX/debug-info-indirect-field-decl.cpp
+@@ -0,0 +1,17 @@
++// RUN: %clang_cc1 -emit-llvm -g -triple x86_64-apple-darwin %s -o - | FileCheck %s
++//
++// Test that indirect field decls are handled gracefully.
++// rdar://problem/16348575
++//
++template <class T, int T::*ptr> class Foo {  };
++
++struct Bar {
++  int i1;
++  // CHECK: [ DW_TAG_member ] [line [[@LINE+1]], size 32, align 32, offset 32] [from _ZTSN3BarUt_E]
++  union {
++    // CHECK: [ DW_TAG_member ] [i2] [line [[@LINE+1]], size 32, align 32, offset 0] [from int]
++    int i2;
++  };
++};
++
++Foo<Bar, &Bar::i2> the_foo;
Index: user/ae/inet6/contrib/llvm/tools/clang/lib/CodeGen/CGDebugInfo.cpp
===================================================================
--- user/ae/inet6/contrib/llvm/tools/clang/lib/CodeGen/CGDebugInfo.cpp	(revision 271452)
+++ user/ae/inet6/contrib/llvm/tools/clang/lib/CodeGen/CGDebugInfo.cpp	(revision 271453)
@@ -1,3294 +1,3294 @@
 //===--- CGDebugInfo.cpp - Emit Debug Information for a Module ------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
 //
 // This coordinates the debug information generation while generating code.
 //
 //===----------------------------------------------------------------------===//
 
 #include "CGDebugInfo.h"
 #include "CGBlocks.h"
 #include "CGCXXABI.h"
 #include "CGObjCRuntime.h"
 #include "CodeGenFunction.h"
 #include "CodeGenModule.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/DeclFriend.h"
 #include "clang/AST/DeclObjC.h"
 #include "clang/AST/DeclTemplate.h"
 #include "clang/AST/Expr.h"
 #include "clang/AST/RecordLayout.h"
 #include "clang/Basic/FileManager.h"
 #include "clang/Basic/SourceManager.h"
 #include "clang/Basic/Version.h"
 #include "clang/Frontend/CodeGenOptions.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Support/Dwarf.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
 using namespace clang;
 using namespace clang::CodeGen;
 
 /// Construct the debug info emitter for a module: remember the module, read
 /// the requested debug info kind from the codegen options, initialize the
 /// builder from the LLVM module and create the compile unit.
 CGDebugInfo::CGDebugInfo(CodeGenModule &CGM)
     : CGM(CGM),
       DebugKind(CGM.getCodeGenOpts().getDebugInfo()),
       DBuilder(CGM.getModule()) {
   // The compile unit is created eagerly, as part of construction.
   CreateCompileUnit();
 }
 
 /// Destructor. In asserts-enabled builds, checks that the lexical block stack
 /// is empty by the time the emitter is destroyed.
 CGDebugInfo::~CGDebugInfo() {
   assert(LexicalBlockStack.empty() &&
          "Region stack mismatch, stack not empty!");
 }
 
 
 /// Save the debug info's current location, then clear both it and the
 /// builder's current debug location. Does nothing beyond member
 /// initialization when the function has no debug info.
 NoLocation::NoLocation(CodeGenFunction &CGF, CGBuilderTy &B)
   : DI(CGF.getDebugInfo()), Builder(B) {
   if (!DI)
     return;
 
   SavedLoc = DI->getLocation();
   DI->CurLoc = SourceLocation();
   Builder.SetCurrentDebugLocation(llvm::DebugLoc());
 }
 
 /// Restore the location saved by the constructor. The builder is expected to
 /// still carry an unknown debug location at this point.
 NoLocation::~NoLocation() {
   if (!DI)
     return;
 
   assert(Builder.getCurrentDebugLocation().isUnknown());
   DI->CurLoc = SavedLoc;
 }
 
 /// Save the debug info's current location, then clear both it and the
 /// builder's current debug location. Does nothing beyond member
 /// initialization when the function has no debug info.
 ArtificialLocation::ArtificialLocation(CodeGenFunction &CGF, CGBuilderTy &B)
   : DI(CGF.getDebugInfo()), Builder(B) {
   if (!DI)
     return;
 
   SavedLoc = DI->getLocation();
   DI->CurLoc = SourceLocation();
   Builder.SetCurrentDebugLocation(llvm::DebugLoc());
 }
 
 void ArtificialLocation::Emit() {
   if (DI) {
     // Sync the Builder.
     DI->EmitLocation(Builder, SavedLoc);
     DI->CurLoc = SourceLocation();
     // Construct a location that has a valid scope, but no line info.
     assert(!DI->LexicalBlockStack.empty());
     llvm::DIDescriptor Scope(DI->LexicalBlockStack.back());
     Builder.SetCurrentDebugLocation(llvm::DebugLoc::get(0, 0, Scope));
   }
 }
 
 /// Restore the location saved by the constructor. The builder's current
 /// debug location is expected to have line 0 at this point.
 ArtificialLocation::~ArtificialLocation() {
   if (!DI)
     return;
 
   assert(Builder.getCurrentDebugLocation().getLine() == 0);
   DI->CurLoc = SavedLoc;
 }
 
 /// setLocation - Update the current source location. Invalid locations are
 /// ignored. When a lexical scope is open and the presumed file name differs
 /// from that of the previous location, the innermost scope on the stack is
 /// replaced by a lexical-block-file scope for the new file.
 void CGDebugInfo::setLocation(SourceLocation Loc) {
   // An invalid location leaves everything untouched.
   if (Loc.isInvalid())
     return;
 
   SourceManager &SM = CGM.getContext().getSourceManager();
   CurLoc = SM.getExpansionLoc(Loc);
 
   // Without an open lexical scope there is nothing to re-parent.
   if (LexicalBlockStack.empty())
     return;
 
   PresumedLoc CurPLoc = SM.getPresumedLoc(CurLoc);
   PresumedLoc PrevPLoc = SM.getPresumedLoc(PrevLoc);
   if (CurPLoc.isInvalid() || PrevPLoc.isInvalid())
     return;
 
   // Same file as before: the current scope is still the right one.
   if (strcmp(PrevPLoc.getFilename(), CurPLoc.getFilename()) == 0)
     return;
 
   // Pick the scope the new lexical-block-file node hangs off: the parent of
   // an existing block-file node, or the block/subprogram itself.
   llvm::MDNode *Top = LexicalBlockStack.back();
   llvm::DIScope TopScope = llvm::DIScope(Top);
   llvm::DIDescriptor Parent;
   if (TopScope.isLexicalBlockFile())
     Parent = llvm::DILexicalBlockFile(Top).getScope();
   else if (TopScope.isLexicalBlock() || TopScope.isSubprogram())
     Parent = TopScope;
   else
     return;
 
   llvm::DIDescriptor FileScope =
     DBuilder.createLexicalBlockFile(Parent, getOrCreateFile(CurLoc));
   llvm::MDNode *N = FileScope;
   LexicalBlockStack.pop_back();
   LexicalBlockStack.push_back(N);
 }
 
 /// getContextDescriptor - Return the debug info scope for the given
 /// declaration context. Falls back to the compile unit when there is no
 /// context or when none of the handled cases applies.
 llvm::DIScope CGDebugInfo::getContextDescriptor(const Decl *Context) {
   if (!Context)
     return TheCU;
 
   // Reuse a scope already recorded for this declaration.
   llvm::DenseMap<const Decl *, llvm::WeakVH>::iterator
     Found = RegionMap.find(Context);
   if (Found != RegionMap.end()) {
     llvm::Value *Cached = Found->second;
     return llvm::DIScope(dyn_cast_or_null<llvm::MDNode>(Cached));
   }
 
   // Namespaces get their own scope.
   if (const NamespaceDecl *NS = dyn_cast<NamespaceDecl>(Context))
     return getOrCreateNameSpace(NS);
 
   // Non-dependent records are described by their type.
   const RecordDecl *Record = dyn_cast<RecordDecl>(Context);
   if (Record && !Record->isDependentType())
     return getOrCreateType(CGM.getContext().getTypeDeclType(Record),
                            getOrCreateMainFile());
 
   return TheCU;
 }
 
 /// getFunctionName - Return the name to use for the given FunctionDecl.
 /// A function with an identifier and no template specialization info simply
 /// yields the identifier's name; otherwise the name is printed on demand
 /// (with template arguments, if any) and interned on the side.
 StringRef CGDebugInfo::getFunctionName(const FunctionDecl *FD) {
   assert (FD && "Invalid FunctionDecl!");
   FunctionTemplateSpecializationInfo *SpecInfo
     = FD->getTemplateSpecializationInfo();
   IdentifierInfo *Ident = FD->getIdentifier();
 
   // Fast path: a plain identifier can be returned directly.
   if (Ident && !SpecInfo)
     return Ident->getName();
 
   // Otherwise build a human readable name for debug info.
   SmallString<128> Buffer;
   llvm::raw_svector_ostream OS(Buffer);
   FD->printName(OS);
 
   // Append the template specialization arguments, when present.
   if (SpecInfo) {
     const TemplateArgumentList *ArgList = SpecInfo->TemplateArguments;
     PrintingPolicy Policy(CGM.getLangOpts());
     TemplateSpecializationType::PrintTemplateArgumentList(OS, ArgList->data(),
                                                           ArgList->size(),
                                                           Policy);
   }
 
   // Keep a copy on the side and hand out a reference to it.
   return internString(OS.str());
 }
 
 /// getObjCMethodName - Build the bracketed debug name of an Objective-C
 /// method ("-[...]" for instance methods, "+[...]" otherwise) and return an
 /// interned copy of it.
 StringRef CGDebugInfo::getObjCMethodName(const ObjCMethodDecl *OMD) {
   SmallString<256> Buffer;
   llvm::raw_svector_ostream OS(Buffer);
 
   const char Prefix = OMD->isInstanceMethod() ? '-' : '+';
   OS << Prefix << '[';
 
   // The part before the selector depends on the kind of enclosing context.
   const DeclContext *Parent = OMD->getDeclContext();
   if (const ObjCImplementationDecl *Impl =
       dyn_cast<const ObjCImplementationDecl>(Parent)) {
     OS << Impl->getName();
   } else if (const ObjCInterfaceDecl *Iface =
              dyn_cast<const ObjCInterfaceDecl>(Parent)) {
     OS << Iface->getName();
   } else if (const ObjCCategoryImplDecl *CatImpl =
              dyn_cast<const ObjCCategoryImplDecl>(Parent)) {
     // The first identifier is looked up through the NamedDecl base, the
     // parenthesized one through ObjCCategoryImplDecl itself.
     const NamedDecl *AsNamed = CatImpl;
     OS << AsNamed->getIdentifier()->getNameStart() << '(';
     OS << CatImpl->getIdentifier()->getNameStart() << ')';
   } else if (isa<ObjCProtocolDecl>(Parent)) {
     // The class type can be extracted from the self pointer.
     if (ImplicitParamDecl *Self = OMD->getSelfDecl()) {
       QualType ClassTy =
         cast<ObjCObjectPointerType>(Self->getType())->getPointeeType();
       ClassTy.print(OS, PrintingPolicy(LangOptions()));
     }
   }
 
   OS << ' ' << OMD->getSelector().getAsString() << ']';
   return internString(OS.str());
 }
 
 /// getSelectorName - Return an interned copy of the selector's string form,
 /// for use in debug info.
 StringRef CGDebugInfo::getSelectorName(Selector S) {
   std::string SelectorText = S.getAsString();
   return internString(SelectorText);
 }
 
 /// getClassName - Return the record's name. For class template
 /// specializations the printed template argument list is appended and the
 /// result is interned.
 StringRef
 CGDebugInfo::getClassName(const RecordDecl *RD) {
   const ClassTemplateSpecializationDecl *Spec
     = dyn_cast<ClassTemplateSpecializationDecl>(RD);
   // Anything that is not a specialization just uses its plain name.
   if (!Spec)
     return RD->getName();
 
   // Prefer the arguments as written in the source, when available.
   const TemplateArgument *Args;
   unsigned NumArgs;
   if (TypeSourceInfo *Written = Spec->getTypeAsWritten()) {
     const TemplateSpecializationType *WrittenTy =
       cast<TemplateSpecializationType>(Written->getType());
     Args = WrittenTy->getArgs();
     NumArgs = WrittenTy->getNumArgs();
   } else {
     const TemplateArgumentList &Deduced = Spec->getTemplateArgs();
     Args = Deduced.data();
     NumArgs = Deduced.size();
   }
 
   SmallString<128> ArgText;
   PrintingPolicy Policy(CGM.getLangOpts());
   {
     // Scoped so the stream is destroyed before ArgText is read below.
     llvm::raw_svector_ostream OS(ArgText);
     TemplateSpecializationType::PrintTemplateArgumentList(OS, Args, NumArgs,
                                                           Policy);
   }
 
   // Keep a copy of "<name><args>" on the side and return a reference to it.
   StringRef BaseName = RD->getIdentifier()->getName();
   return internString(BaseName, ArgText);
 }
 
 /// getOrCreateFile - Return the file descriptor for the given location.
 /// Invalid locations, invalid presumed locations and empty file names all
 /// map to the compile unit's file. Other results are cached by the presumed
 /// file name pointer.
 llvm::DIFile CGDebugInfo::getOrCreateFile(SourceLocation Loc) {
   // An invalid location falls back to the main input file.
   if (!Loc.isValid())
     return DBuilder.createFile(TheCU.getFilename(), TheCU.getDirectory());
 
   PresumedLoc PLoc = CGM.getContext().getSourceManager().getPresumedLoc(Loc);
 
   // So does a location without a usable presumed file name.
   if (PLoc.isInvalid() || StringRef(PLoc.getFilename()).empty())
     return DBuilder.createFile(TheCU.getFilename(), TheCU.getDirectory());
 
   // Look the file name up in the cache.
   const char *FileName = PLoc.getFilename();
   llvm::DenseMap<const char *, llvm::WeakVH>::iterator Cached =
     DIFileCache.find(FileName);
   if (Cached != DIFileCache.end()) {
     // Only trust the entry if the node it refers to still exists.
     if (llvm::Value *Node = Cached->second)
       return llvm::DIFile(cast<llvm::MDNode>(Node));
   }
 
   // Not cached (or stale): create the descriptor and remember it.
   llvm::DIFile File = DBuilder.createFile(FileName, getCurrentDirname());
   DIFileCache[FileName] = File;
   return File;
 }
 
 /// getOrCreateMainFile - Return the file descriptor built from the compile
 /// unit's file name and directory.
 llvm::DIFile CGDebugInfo::getOrCreateMainFile() {
   StringRef MainName = TheCU.getFilename();
   StringRef MainDir = TheCU.getDirectory();
   return DBuilder.createFile(MainName, MainDir);
 }
 
 /// getLineNumber - Return the presumed line number of the location, using
 /// the current location when the given one is invalid. Returns 0 when no
 /// valid location or presumed location is available.
 unsigned CGDebugInfo::getLineNumber(SourceLocation Loc) {
   // Fall back to the current location if the requested one is invalid.
   SourceLocation Effective = Loc.isValid() ? Loc : CurLoc;
   if (Effective.isInvalid())
     return 0;
 
   SourceManager &SM = CGM.getContext().getSourceManager();
   PresumedLoc PLoc = SM.getPresumedLoc(Effective);
   if (PLoc.isInvalid())
     return 0;
   return PLoc.getLine();
 }
 
 /// getColumnNumber - Return the presumed column number of the location,
 /// using the current location when the given one is invalid. Returns 0 when
 /// column info is disabled (and not forced) or no valid location exists.
 unsigned CGDebugInfo::getColumnNumber(SourceLocation Loc, bool Force) {
   // Column information may be switched off unless explicitly forced.
   const bool WantColumns = Force || CGM.getCodeGenOpts().DebugColumnInfo;
   if (!WantColumns)
     return 0;
 
   // Fall back to the current location if the requested one is invalid.
   SourceLocation Effective = Loc.isValid() ? Loc : CurLoc;
   if (Effective.isInvalid())
     return 0;
 
   SourceManager &SM = CGM.getContext().getSourceManager();
   PresumedLoc PLoc = SM.getPresumedLoc(Effective);
   if (PLoc.isInvalid())
     return 0;
   return PLoc.getColumn();
 }
 
 /// getCurrentDirname - Return the compilation directory for debug info: the
 /// DebugCompilationDir codegen option when it is set, otherwise the current
 /// working directory, which is queried once and then cached in CWDName.
 StringRef CGDebugInfo::getCurrentDirname() {
   // An explicitly configured compilation directory always wins.
   if (!CGM.getCodeGenOpts().DebugCompilationDir.empty())
     return CGM.getCodeGenOpts().DebugCompilationDir;
 
   // Query the working directory only while nothing has been cached yet.
   if (CWDName.empty()) {
     SmallString<256> WorkingDir;
     llvm::sys::fs::current_path(WorkingDir);
     CWDName = internString(WorkingDir);
   }
   return CWDName;
 }
 
 /// CreateCompileUnit - Create new compile unit.
 void CGDebugInfo::CreateCompileUnit() {
 
   // Get absolute path name.
   SourceManager &SM = CGM.getContext().getSourceManager();
   std::string MainFileName = CGM.getCodeGenOpts().MainFileName;
   if (MainFileName.empty())
     MainFileName = "<unknown>";
 
   // The main file name provided via the "-main-file-name" option contains just
   // the file name itself with no path information. This file name may have had
   // a relative path, so we look into the actual file entry for the main
   // file to determine the real absolute path for the file.
   std::string MainFileDir;
   if (const FileEntry *MainFile = SM.getFileEntryForID(SM.getMainFileID())) {
     MainFileDir = MainFile->getDir()->getName();
     if (MainFileDir != ".") {
       llvm::SmallString<1024> MainFileDirSS(MainFileDir);
       llvm::sys::path::append(MainFileDirSS, MainFileName);
       MainFileName = MainFileDirSS.str();
     }
   }
 
   // Save filename string.
   StringRef Filename = internString(MainFileName);
 
   // Save split dwarf file string.
   std::string SplitDwarfFile = CGM.getCodeGenOpts().SplitDwarfFile;
   StringRef SplitDwarfFilename = internString(SplitDwarfFile);
 
   unsigned LangTag;
   const LangOptions &LO = CGM.getLangOpts();
   if (LO.CPlusPlus) {
     if (LO.ObjC1)
       LangTag = llvm::dwarf::DW_LANG_ObjC_plus_plus;
     else
       LangTag = llvm::dwarf::DW_LANG_C_plus_plus;
   } else if (LO.ObjC1) {
     LangTag = llvm::dwarf::DW_LANG_ObjC;
   } else if (LO.C99) {
     LangTag = llvm::dwarf::DW_LANG_C99;
   } else {
     LangTag = llvm::dwarf::DW_LANG_C89;
   }
 
   std::string Producer = getClangFullVersion();
 
   // Figure out which version of the ObjC runtime we have.
   unsigned RuntimeVers = 0;
   if (LO.ObjC1)
     RuntimeVers = LO.ObjCRuntime.isNonFragile() ? 2 : 1;
 
   // Create new compile unit.
   // FIXME - Eliminate TheCU.
   TheCU = DBuilder.createCompileUnit(LangTag, Filename, getCurrentDirname(),
                                      Producer, LO.Optimize,
                                      CGM.getCodeGenOpts().DwarfDebugFlags,
                                      RuntimeVers, SplitDwarfFilename);
 }
 
 /// CreateType - Get the Basic type from the cache or create a new
 /// one if necessary.
 ///
 /// Several kinds are handled specially and return early (null pointer, void,
 /// the Objective-C builtin types and the OpenCL types); the ObjC and OpenCL
 /// struct types are cached in members. All remaining kinds only select a
 /// DWARF base type encoding and fall through to the common
 /// createBasicType call at the end.
 llvm::DIType CGDebugInfo::CreateType(const BuiltinType *BT) {
   unsigned Encoding = 0;
   StringRef BTName;
   switch (BT->getKind()) {
   // Expand BuiltinTypes.def so that only placeholder types produce case
   // labels; together with Dependent they are not expected to reach here.
 #define BUILTIN_TYPE(Id, SingletonId)
 #define PLACEHOLDER_TYPE(Id, SingletonId) \
   case BuiltinType::Id:
 #include "clang/AST/BuiltinTypes.def"
   case BuiltinType::Dependent:
     llvm_unreachable("Unexpected builtin type");
   case BuiltinType::NullPtr:
     return DBuilder.createNullPtrType();
   case BuiltinType::Void:
     // Void is represented by an empty DIType.
     return llvm::DIType();
   case BuiltinType::ObjCClass:
     // Forward declaration of "objc_class", created once and cached.
     if (ClassTy)
       return ClassTy;
     ClassTy = DBuilder.createForwardDecl(llvm::dwarf::DW_TAG_structure_type,
                                          "objc_class", TheCU,
                                          getOrCreateMainFile(), 0);
     return ClassTy;
   case BuiltinType::ObjCId: {
     // typedef struct objc_class *Class;
     // typedef struct objc_object {
     //  Class isa;
     // } *id;
 
     if (ObjTy)
       return ObjTy;
 
     // The "isa" member needs the objc_class forward declaration.
     if (!ClassTy)
       ClassTy = DBuilder.createForwardDecl(llvm::dwarf::DW_TAG_structure_type,
                                            "objc_class", TheCU,
                                            getOrCreateMainFile(), 0);
 
     unsigned Size = CGM.getContext().getTypeSize(CGM.getContext().VoidPtrTy);
 
     llvm::DIType ISATy = DBuilder.createPointerType(ClassTy, Size);
 
     // Create the struct without members first, then attach the single "isa"
     // member, whose scope is the struct itself.
     ObjTy =
         DBuilder.createStructType(TheCU, "objc_object", getOrCreateMainFile(),
                                   0, 0, 0, 0, llvm::DIType(), llvm::DIArray());
 
     ObjTy.setTypeArray(DBuilder.getOrCreateArray(&*DBuilder.createMemberType(
         ObjTy, "isa", getOrCreateMainFile(), 0, Size, 0, 0, 0, ISATy)));
     return ObjTy;
   }
   case BuiltinType::ObjCSel: {
     // Forward declaration of "objc_selector", created once and cached.
     if (SelTy)
       return SelTy;
     SelTy =
       DBuilder.createForwardDecl(llvm::dwarf::DW_TAG_structure_type,
                                  "objc_selector", TheCU, getOrCreateMainFile(),
                                  0);
     return SelTy;
   }
 
   // OpenCL image and event types go through getOrCreateStructPtrType, each
   // with its own cache member.
   case BuiltinType::OCLImage1d:
     return getOrCreateStructPtrType("opencl_image1d_t",
                                     OCLImage1dDITy);
   case BuiltinType::OCLImage1dArray:
     return getOrCreateStructPtrType("opencl_image1d_array_t",
                                     OCLImage1dArrayDITy);
   case BuiltinType::OCLImage1dBuffer:
     return getOrCreateStructPtrType("opencl_image1d_buffer_t",
                                     OCLImage1dBufferDITy);
   case BuiltinType::OCLImage2d:
     return getOrCreateStructPtrType("opencl_image2d_t",
                                     OCLImage2dDITy);
   case BuiltinType::OCLImage2dArray:
     return getOrCreateStructPtrType("opencl_image2d_array_t",
                                     OCLImage2dArrayDITy);
   case BuiltinType::OCLImage3d:
     return getOrCreateStructPtrType("opencl_image3d_t",
                                     OCLImage3dDITy);
   case BuiltinType::OCLSampler:
     // The sampler is emitted as an unsigned basic type.
     return DBuilder.createBasicType("opencl_sampler_t",
                                     CGM.getContext().getTypeSize(BT),
                                     CGM.getContext().getTypeAlign(BT),
                                     llvm::dwarf::DW_ATE_unsigned);
   case BuiltinType::OCLEvent:
     return getOrCreateStructPtrType("opencl_event_t",
                                     OCLEventDITy);
 
   // Everything below only picks the DWARF encoding for the basic type.
   case BuiltinType::UChar:
   case BuiltinType::Char_U: Encoding = llvm::dwarf::DW_ATE_unsigned_char; break;
   case BuiltinType::Char_S:
   case BuiltinType::SChar: Encoding = llvm::dwarf::DW_ATE_signed_char; break;
   case BuiltinType::Char16:
   case BuiltinType::Char32: Encoding = llvm::dwarf::DW_ATE_UTF; break;
   case BuiltinType::UShort:
   case BuiltinType::UInt:
   case BuiltinType::UInt128:
   case BuiltinType::ULong:
   case BuiltinType::WChar_U:
   case BuiltinType::ULongLong: Encoding = llvm::dwarf::DW_ATE_unsigned; break;
   case BuiltinType::Short:
   case BuiltinType::Int:
   case BuiltinType::Int128:
   case BuiltinType::Long:
   case BuiltinType::WChar_S:
   case BuiltinType::LongLong:  Encoding = llvm::dwarf::DW_ATE_signed; break;
   case BuiltinType::Bool:      Encoding = llvm::dwarf::DW_ATE_boolean; break;
   case BuiltinType::Half:
   case BuiltinType::Float:
   case BuiltinType::LongDouble:
   case BuiltinType::Double:    Encoding = llvm::dwarf::DW_ATE_float; break;
   }
 
   // The long integer kinds get fixed spellings; every other kind uses the
   // name reported by the builtin type itself.
   switch (BT->getKind()) {
   case BuiltinType::Long:      BTName = "long int"; break;
   case BuiltinType::LongLong:  BTName = "long long int"; break;
   case BuiltinType::ULong:     BTName = "long unsigned int"; break;
   case BuiltinType::ULongLong: BTName = "long long unsigned int"; break;
   default:
     BTName = BT->getName(CGM.getLangOpts());
     break;
   }
   // Bit size, align and offset of the type.
   uint64_t Size = CGM.getContext().getTypeSize(BT);
   uint64_t Align = CGM.getContext().getTypeAlign(BT);
   llvm::DIType DbgTy =
     DBuilder.createBasicType(BTName, Size, Align, Encoding);
   return DbgTy;
 }
 
 /// CreateType - Create a basic type named "complex" for a complex type.
 /// Complex integer types use DW_ATE_lo_user as encoding, all others
 /// DW_ATE_complex_float.
 llvm::DIType CGDebugInfo::CreateType(const ComplexType *Ty) {
   unsigned Encoding = Ty->isComplexIntegerType()
                           ? llvm::dwarf::DW_ATE_lo_user
                           : llvm::dwarf::DW_ATE_complex_float;
 
   // Bit size and alignment of the type.
   uint64_t SizeInBits = CGM.getContext().getTypeSize(Ty);
   uint64_t AlignInBits = CGM.getContext().getTypeAlign(Ty);
   return DBuilder.createBasicType("complex", SizeInBits, AlignInBits,
                                   Encoding);
 }
 
 /// CreateQualifiedType - Create the debug info for a const/volatile/restrict
 /// qualified type. One derived type is emitted per call for the first
 /// qualifier found (const, then volatile, then restrict); the remaining
 /// qualifiers are handled through getOrCreateType on the rest of the type.
 llvm::DIType CGDebugInfo::CreateQualifiedType(QualType Ty, llvm::DIFile Unit) {
   QualifierCollector Quals;
   const Type *Unqualified = Quals.strip(Ty);
 
   // These qualifiers are ignored for now.
   Quals.removeObjCGCAttr();
   Quals.removeAddressSpace();
   Quals.removeObjCLifetime();
 
   // Peel off exactly one qualifier and remember the matching DWARF tag.
   unsigned Tag;
   if (Quals.hasConst()) {
     Quals.removeConst();
     Tag = llvm::dwarf::DW_TAG_const_type;
   } else if (Quals.hasVolatile()) {
     Quals.removeVolatile();
     Tag = llvm::dwarf::DW_TAG_volatile_type;
   } else if (Quals.hasRestrict()) {
     Quals.removeRestrict();
     Tag = llvm::dwarf::DW_TAG_restrict_type;
   } else {
     // Nothing left to describe: emit the unqualified type itself.
     assert(Quals.empty() && "Unknown type qualifier for debug info");
     return getOrCreateType(QualType(Unqualified, 0), Unit);
   }
 
   // Describe the type with the remaining qualifiers re-applied.
   llvm::DIType FromTy =
     getOrCreateType(Quals.apply(CGM.getContext(), Unqualified), Unit);
 
   // No need to fill in the Name, Line, Size, Alignment, Offset in case of
   // CVR derived types.
   return DBuilder.createQualifiedType(Tag, FromTy);
 }
 
 /// CreateType - Create the debug info for an Objective-C object pointer.
 /// Qualified 'id' types are emitted as plain 'id'; everything else becomes a
 /// pointer type to the pointee.
 llvm::DIType CGDebugInfo::CreateType(const ObjCObjectPointerType *Ty,
                                      llvm::DIFile Unit) {
   // The frontend treats 'id' as a typedef to an ObjCObjectType, whereas
   // 'id<protocol>' is treated as an ObjCPointerType. For the debug info,
   // 'id' is emitted in both cases.
   if (Ty->isObjCQualifiedIdType())
     return getOrCreateType(CGM.getContext().getObjCIdType(), Unit);
 
   return CreatePointerLikeType(llvm::dwarf::DW_TAG_pointer_type, Ty,
                                Ty->getPointeeType(), Unit);
 }
 
 /// CreateType - Create the debug info for a pointer type as a
 /// DW_TAG_pointer_type to its pointee.
 llvm::DIType CGDebugInfo::CreateType(const PointerType *Ty,
                                      llvm::DIFile Unit) {
   QualType Pointee = Ty->getPointeeType();
   return CreatePointerLikeType(llvm::dwarf::DW_TAG_pointer_type, Ty, Pointee,
                                Unit);
 }
 
 /// In C++ mode, types have linkage, so we can rely on the ODR and
 /// on their mangled names, if they're external.
 ///
 /// Returns the mangled RTTI name of the tag type, or an empty string when
 /// the compile unit's language is not C++, the declaration is not externally
 /// visible, or the target uses the Microsoft C++ ABI.
 static SmallString<256>
 getUniqueTagTypeName(const TagType *Ty, CodeGenModule &CGM,
                      llvm::DICompileUnit TheCU) {
   SmallString<256> UniqueName;
   const TagDecl *TD = Ty->getDecl();
 
   // FIXME: ODR should apply to ObjC++ exactly the same way it does to C++.
   // For now, only apply ODR with C++.
   // Microsoft Mangler does not have support for mangleCXXRTTIName yet.
   if (TheCU.getLanguage() != llvm::dwarf::DW_LANG_C_plus_plus ||
       !TD->isExternallyVisible() ||
       CGM.getTarget().getCXXABI().isMicrosoft())
     return UniqueName;
 
   // TODO: This is using the RTTI name. Is there a better way to get
   // a unique string for a type?
   llvm::raw_svector_ostream Out(UniqueName);
   CGM.getCXXABI().getMangleContext().mangleCXXRTTIName(QualType(Ty, 0), Out);
   Out.flush();
   return UniqueName;
 }
 
 // Returns the already known type for the record, if any; otherwise creates
 // a forward declaration for the RecordDecl in the given context.
 llvm::DICompositeType
 CGDebugInfo::getOrCreateRecordFwdDecl(const RecordType *Ty,
                                       llvm::DIDescriptor Ctx) {
   const RecordDecl *RD = Ty->getDecl();
 
   // Reuse an existing type for this record.
   if (llvm::DIType Known = getTypeOrNull(CGM.getContext().getRecordType(RD)))
     return llvm::DICompositeType(Known);
 
   llvm::DIFile DefUnit = getOrCreateFile(RD->getLocation());
   unsigned Line = getLineNumber(RD->getLocation());
   StringRef RDName = getClassName(RD);
 
   // Pick the DWARF tag matching the kind of record.
   unsigned Tag = 0;
   if (RD->isStruct() || RD->isInterface()) {
     Tag = llvm::dwarf::DW_TAG_structure_type;
   } else if (RD->isUnion()) {
     Tag = llvm::dwarf::DW_TAG_union_type;
   } else {
     assert(RD->isClass());
     Tag = llvm::dwarf::DW_TAG_class_type;
   }
 
   // Create the forward declaration, identified by the unique type name.
   SmallString<256> UniqueName = getUniqueTagTypeName(Ty, CGM, TheCU);
   return DBuilder.createForwardDecl(Tag, RDName, Ctx, DefUnit, Line, 0, 0, 0,
                                     UniqueName);
 }
 
 /// CreatePointerLikeType - Create a reference type (for the two reference
 /// tags) or a pointer type to PointeeTy. Pointer types carry the target's
 /// pointer width for the pointee's address space and the alignment of Ty.
 llvm::DIType CGDebugInfo::CreatePointerLikeType(unsigned Tag,
                                                 const Type *Ty,
                                                 QualType PointeeTy,
                                                 llvm::DIFile Unit) {
   const bool IsReference =
       Tag == llvm::dwarf::DW_TAG_reference_type ||
       Tag == llvm::dwarf::DW_TAG_rvalue_reference_type;
   if (IsReference)
     return DBuilder.createReferenceType(Tag, getOrCreateType(PointeeTy, Unit));
 
   // Bit size, align and offset of the type.
   // Size is always the size of a pointer. We can't use getTypeSize here
   // because that does not return the correct value for references.
   unsigned AddrSpace = CGM.getContext().getTargetAddressSpace(PointeeTy);
   uint64_t SizeInBits = CGM.getTarget().getPointerWidth(AddrSpace);
   uint64_t AlignInBits = CGM.getContext().getTypeAlign(Ty);
 
   return DBuilder.createPointerType(getOrCreateType(PointeeTy, Unit),
                                     SizeInBits, AlignInBits);
 }
 
 /// getOrCreateStructPtrType - Return a pointer to a forward-declared struct
 /// with the given name. The result is stored in Cache and returned from
 /// there on later calls.
 llvm::DIType CGDebugInfo::getOrCreateStructPtrType(StringRef Name,
                                                    llvm::DIType &Cache) {
   if (!Cache) {
     // First request: forward-declare the struct ...
     Cache = DBuilder.createForwardDecl(llvm::dwarf::DW_TAG_structure_type,
                                        Name, TheCU, getOrCreateMainFile(), 0);
     // ... and wrap it in a pointer that is as wide as 'void *'.
     unsigned PtrBits =
       CGM.getContext().getTypeSize(CGM.getContext().VoidPtrTy);
     Cache = DBuilder.createPointerType(Cache, PtrBits);
   }
   return Cache;
 }
 
 /// CreateType - Create (once) the generic debug info for block pointers: a
 /// pointer to a "__block_literal_generic" struct whose last member points to
 /// a "__block_descriptor" struct. The result is cached in
 /// BlockLiteralGeneric and returned for every block pointer type.
 llvm::DIType CGDebugInfo::CreateType(const BlockPointerType *Ty,
                                      llvm::DIFile Unit) {
   if (BlockLiteralGeneric)
     return BlockLiteralGeneric;
 
   SmallVector<llvm::Value *, 8> EltTys;
   llvm::DIType FieldTy;
   QualType FType;
   uint64_t FieldSize, FieldOffset;
   unsigned FieldAlign;
   llvm::DIArray Elements;
   llvm::DIType EltTy, DescTy;
 
   // Members of the descriptor struct: two unsigned longs. FieldOffset is
   // handed to CreateMemberType by pointer and is used as the struct size
   // below.
   FieldOffset = 0;
   FType = CGM.getContext().UnsignedLongTy;
   EltTys.push_back(CreateMemberType(Unit, FType, "reserved", &FieldOffset));
   EltTys.push_back(CreateMemberType(Unit, FType, "Size", &FieldOffset));
 
   Elements = DBuilder.getOrCreateArray(EltTys);
   EltTys.clear();
 
   unsigned Flags = llvm::DIDescriptor::FlagAppleBlock;
   unsigned LineNo = getLineNumber(CurLoc);
 
   EltTy = DBuilder.createStructType(Unit, "__block_descriptor",
                                     Unit, LineNo, FieldOffset, 0,
                                     Flags, llvm::DIType(), Elements);
 
   // Bit size, align and offset of the type.
   uint64_t Size = CGM.getContext().getTypeSize(Ty);
 
   // Pointer to the descriptor struct, sized like the block pointer type.
   DescTy = DBuilder.createPointerType(EltTy, Size);
 
   // Members of the block literal struct; offsets restart at zero.
   FieldOffset = 0;
   FType = CGM.getContext().getPointerType(CGM.getContext().VoidTy);
   EltTys.push_back(CreateMemberType(Unit, FType, "__isa", &FieldOffset));
   FType = CGM.getContext().IntTy;
   EltTys.push_back(CreateMemberType(Unit, FType, "__flags", &FieldOffset));
   EltTys.push_back(CreateMemberType(Unit, FType, "__reserved", &FieldOffset));
   FType = CGM.getContext().getPointerType(CGM.getContext().VoidTy);
   EltTys.push_back(CreateMemberType(Unit, FType, "__FuncPtr", &FieldOffset));
 
   // The "__descriptor" member is built by hand because its type is the
   // descriptor pointer created above; its size and alignment are taken from
   // the block pointer type.
   FType = CGM.getContext().getPointerType(CGM.getContext().VoidTy);
   FieldTy = DescTy;
   FieldSize = CGM.getContext().getTypeSize(Ty);
   FieldAlign = CGM.getContext().getTypeAlign(Ty);
   FieldTy = DBuilder.createMemberType(Unit, "__descriptor", Unit,
                                       LineNo, FieldSize, FieldAlign,
                                       FieldOffset, 0, FieldTy);
   EltTys.push_back(FieldTy);
 
   // Account for the hand-built member before sizing the struct.
   FieldOffset += FieldSize;
   Elements = DBuilder.getOrCreateArray(EltTys);
 
   EltTy = DBuilder.createStructType(Unit, "__block_literal_generic",
                                     Unit, LineNo, FieldOffset, 0,
                                     Flags, llvm::DIType(), Elements);
 
   BlockLiteralGeneric = DBuilder.createPointerType(EltTy, Size);
   return BlockLiteralGeneric;
 }
 
 /// CreateType - Create the debug info for a typedef. Returns an empty DIType
 /// when no debug info is produced for the underlying type.
 llvm::DIType CGDebugInfo::CreateType(const TypedefType *Ty, llvm::DIFile Unit) {
   const TypedefNameDecl *TyDecl = Ty->getDecl();
 
   // Typedefs are derived from some other type.  For a typedef of a typedef
   // this emits the whole chain.
   llvm::DIType Underlying =
     getOrCreateType(TyDecl->getUnderlyingType(), Unit);
   if (!Underlying)
     return llvm::DIType();
 
   // No size information is set, but the declaration line and the enclosing
   // context are recorded.
   unsigned Line = getLineNumber(TyDecl->getLocation());
   llvm::DIDescriptor TypedefContext =
     getContextDescriptor(cast<Decl>(TyDecl->getDeclContext()));
 
   return DBuilder.createTypedef(Underlying, TyDecl->getName(), Unit, Line,
                                 TypedefContext);
 }
 
 /// CreateType - Create a subroutine type. The element list starts with the
 /// result type, followed by the argument types; functions without a
 /// prototype and variadic functions get an unspecified-parameter entry.
 llvm::DIType CGDebugInfo::CreateType(const FunctionType *Ty,
                                      llvm::DIFile Unit) {
   SmallVector<llvm::Value *, 16> Elements;
 
   // The result type always comes first.
   Elements.push_back(getOrCreateType(Ty->getResultType(), Unit));
 
   // Set up remainder of arguments if there is a prototype.
   // FIXME: IF NOT, HOW IS THIS REPRESENTED?  llvm-gcc doesn't represent '...'!
   if (isa<FunctionNoProtoType>(Ty)) {
     Elements.push_back(DBuilder.createUnspecifiedParameter());
   } else if (const FunctionProtoType *Proto = dyn_cast<FunctionProtoType>(Ty)) {
     const unsigned NumArgs = Proto->getNumArgs();
     for (unsigned ArgNo = 0; ArgNo != NumArgs; ++ArgNo)
       Elements.push_back(getOrCreateType(Proto->getArgType(ArgNo), Unit));
     if (Proto->isVariadic())
       Elements.push_back(DBuilder.createUnspecifiedParameter());
   }
 
   return DBuilder.createSubroutineType(Unit,
                                        DBuilder.getOrCreateArray(Elements));
 }
 
 
 /// createFieldType - Create the member descriptor for a field of the given
 /// name and type inside 'scope'. A non-zero sizeInBitsOverride replaces the
 /// size derived from the type; incomplete array types keep size and
 /// alignment 0. Private and protected access is recorded in the flags.
 llvm::DIType CGDebugInfo::createFieldType(StringRef name,
                                           QualType type,
                                           uint64_t sizeInBitsOverride,
                                           SourceLocation loc,
                                           AccessSpecifier AS,
                                           uint64_t offsetInBits,
                                           llvm::DIFile tunit,
                                           llvm::DIScope scope) {
   llvm::DIType fieldTy = getOrCreateType(type, tunit);
 
   // Where the field was declared.
   llvm::DIFile declFile = getOrCreateFile(loc);
   unsigned declLine = getLineNumber(loc);
 
   // Size and alignment come from the type, unless it is an incomplete array.
   uint64_t sizeInBits = 0;
   unsigned alignInBits = 0;
   if (!type->isIncompleteArrayType()) {
     llvm::tie(sizeInBits, alignInBits) = CGM.getContext().getTypeInfo(type);
     if (sizeInBitsOverride)
       sizeInBits = sizeInBitsOverride;
   }
 
   // Translate the access specifier; anything else leaves the flags at 0.
   unsigned flags = 0;
   switch (AS) {
   case clang::AS_private:
     flags |= llvm::DIDescriptor::FlagPrivate;
     break;
   case clang::AS_protected:
     flags |= llvm::DIDescriptor::FlagProtected;
     break;
   default:
     break;
   }
 
   return DBuilder.createMemberType(scope, name, declFile, declLine, sizeInBits,
                                    alignInBits, offsetInBits, flags, fieldTy);
 }
 
 /// CollectRecordLambdaFields - Helper for CollectRecordFields.
 void CGDebugInfo::
 CollectRecordLambdaFields(const CXXRecordDecl *CXXDecl,
                           SmallVectorImpl<llvm::Value *> &elements,
                           llvm::DIType RecordTy) {
   // For C++11 Lambdas a Field will be the same as a Capture, but the Capture
   // has the name and the location of the variable so we should iterate over
   // both concurrently.
   const ASTRecordLayout &layout = CGM.getContext().getASTRecordLayout(CXXDecl);
   RecordDecl::field_iterator Field = CXXDecl->field_begin();
   unsigned fieldno = 0;
   for (CXXRecordDecl::capture_const_iterator I = CXXDecl->captures_begin(),
          E = CXXDecl->captures_end(); I != E; ++I, ++Field, ++fieldno) {
     const LambdaExpr::Capture C = *I;
     if (C.capturesVariable()) {
       VarDecl *V = C.getCapturedVar();
       llvm::DIFile VUnit = getOrCreateFile(C.getLocation());
       StringRef VName = V->getName();
       uint64_t SizeInBitsOverride = 0;
       if (Field->isBitField()) {
         SizeInBitsOverride = Field->getBitWidthValue(CGM.getContext());
         assert(SizeInBitsOverride && "found named 0-width bitfield");
       }
       llvm::DIType fieldType
         = createFieldType(VName, Field->getType(), SizeInBitsOverride,
                           C.getLocation(), Field->getAccess(),
                           layout.getFieldOffset(fieldno), VUnit, RecordTy);
       elements.push_back(fieldType);
     } else {
       // TODO: Need to handle 'this' in some way by probably renaming the
       // this of the lambda class and having a field member of 'this' or
       // by using AT_object_pointer for the function and having that be
       // used as 'this' for semantic references.
       assert(C.capturesThis() && "Field that isn't captured and isn't this?");
       FieldDecl *f = *Field;
       llvm::DIFile VUnit = getOrCreateFile(f->getLocation());
       QualType type = f->getType();
       llvm::DIType fieldType
         = createFieldType("this", type, 0, f->getLocation(), f->getAccess(),
                           layout.getFieldOffset(fieldno), VUnit, RecordTy);
 
       elements.push_back(fieldType);
     }
   }
 }
 
 /// CreateRecordStaticField - Helper for CollectRecordFields. Builds the
 /// static-member descriptor for \p Var inside \p RecordTy, attaching the
 /// initializer as a constant when it evaluates to an integer or floating
 /// point value, and records the result in StaticDataMemberCache.
 llvm::DIDerivedType
 CGDebugInfo::CreateRecordStaticField(const VarDecl *Var,
                                      llvm::DIType RecordTy) {
   llvm::DIFile VUnit = getOrCreateFile(Var->getLocation());
   llvm::DIType VTy = getOrCreateType(Var->getType(), VUnit);
   unsigned LineNumber = getLineNumber(Var->getLocation());
 
   // Only integer and floating point initializers become constants; anything
   // else leaves the member without a value.
   llvm::Constant *InitVal = NULL;
   if (Var->getInit()) {
     if (const APValue *Evaluated = Var->evaluateValue()) {
       if (Evaluated->isInt())
         InitVal = llvm::ConstantInt::get(CGM.getLLVMContext(),
                                          Evaluated->getInt());
       else if (Evaluated->isFloat())
         InitVal = llvm::ConstantFP::get(CGM.getLLVMContext(),
                                         Evaluated->getFloat());
     }
   }
 
   unsigned Flags = 0;
   switch (Var->getAccess()) {
   case clang::AS_private:
     Flags |= llvm::DIDescriptor::FlagPrivate;
     break;
   case clang::AS_protected:
     Flags |= llvm::DIDescriptor::FlagProtected;
     break;
   default:
     break;
   }
 
   llvm::DIDerivedType StaticMember = DBuilder.createStaticMemberType(
       RecordTy, Var->getName(), VUnit, LineNumber, VTy, Flags, InitVal);
   // Remember the declaration so later lookups can reuse it.
   StaticDataMemberCache[Var->getCanonicalDecl()] = llvm::WeakVH(StaticMember);
   return StaticMember;
 }
 
 /// CollectRecordNormalField - Helper for CollectRecordFields.
 void CGDebugInfo::
 CollectRecordNormalField(const FieldDecl *field, uint64_t OffsetInBits,
                          llvm::DIFile tunit,
                          SmallVectorImpl<llvm::Value *> &elements,
                          llvm::DIType RecordTy) {
   StringRef name = field->getName();
   QualType type = field->getType();
 
   // Ignore unnamed fields unless they're anonymous structs/unions.
   if (name.empty() && !type->isRecordType())
     return;
 
   uint64_t SizeInBitsOverride = 0;
   if (field->isBitField()) {
     SizeInBitsOverride = field->getBitWidthValue(CGM.getContext());
     assert(SizeInBitsOverride && "found named 0-width bitfield");
   }
 
   llvm::DIType fieldType
     = createFieldType(name, type, SizeInBitsOverride,
                       field->getLocation(), field->getAccess(),
                       OffsetInBits, tunit, RecordTy);
 
   elements.push_back(fieldType);
 }
 
 /// CollectRecordFields - A helper function to collect debug info for
 /// record fields. This is used while creating debug info entry for a Record.
 void CGDebugInfo::CollectRecordFields(const RecordDecl *record,
                                       llvm::DIFile tunit,
                                       SmallVectorImpl<llvm::Value *> &elements,
                                       llvm::DICompositeType RecordTy) {
   const CXXRecordDecl *CXXDecl = dyn_cast<CXXRecordDecl>(record);
 
   if (CXXDecl && CXXDecl->isLambda())
     CollectRecordLambdaFields(CXXDecl, elements, RecordTy);
   else {
     const ASTRecordLayout &layout = CGM.getContext().getASTRecordLayout(record);
 
     // Field number for non-static fields.
     unsigned fieldNo = 0;
 
     // Static and non-static members should appear in the same order as
     // the corresponding declarations in the source program.
     for (RecordDecl::decl_iterator I = record->decls_begin(),
            E = record->decls_end(); I != E; ++I)
       if (const VarDecl *V = dyn_cast<VarDecl>(*I)) {
         // Reuse the existing static member declaration if one exists
         llvm::DenseMap<const Decl *, llvm::WeakVH>::iterator MI =
             StaticDataMemberCache.find(V->getCanonicalDecl());
         if (MI != StaticDataMemberCache.end()) {
           assert(MI->second &&
                  "Static data member declaration should still exist");
           elements.push_back(
               llvm::DIDerivedType(cast<llvm::MDNode>(MI->second)));
         } else
           elements.push_back(CreateRecordStaticField(V, RecordTy));
       } else if (FieldDecl *field = dyn_cast<FieldDecl>(*I)) {
         CollectRecordNormalField(field, layout.getFieldOffset(fieldNo),
                                  tunit, elements, RecordTy);
 
         // Bump field number for next field.
         ++fieldNo;
       }
   }
 }
 
 /// getOrCreateMethodType - A CXXMethodDecl's own FunctionType does not
 /// mention the implicit "this" pointer. For instance methods this returns a
 /// method type that includes it; static methods use the plain function
 /// type.
 llvm::DICompositeType
 CGDebugInfo::getOrCreateMethodType(const CXXMethodDecl *Method,
                                    llvm::DIFile Unit) {
   const FunctionProtoType *Proto =
     Method->getType()->getAs<FunctionProtoType>();
   if (!Method->isStatic())
     return getOrCreateInstanceMethodType(
         Method->getThisType(CGM.getContext()), Proto, Unit);
   return llvm::DICompositeType(getOrCreateType(QualType(Proto, 0), Unit));
 }
 
 /// getOrCreateInstanceMethodType - Build a subroutine type for \p Func with
 /// an object-pointer entry for \p ThisPtr inserted right after the return
 /// type. The (non-object-pointer) debug type for \p ThisPtr is also stored
 /// in TypeCache.
 llvm::DICompositeType CGDebugInfo::getOrCreateInstanceMethodType(
     QualType ThisPtr, const FunctionProtoType *Func, llvm::DIFile Unit) {
   // Start from the element list of the plain function type.
   llvm::DIArray Args = llvm::DICompositeType(
       getOrCreateType(QualType(Func, 0), Unit)).getTypeArray();
   assert (Args.getNumElements() && "Invalid number of arguments!");
 
   SmallVector<llvm::Value *, 16> Elts;
 
   // The return type comes first. For 'void' functions it is NULL.
   Elts.push_back(Args.getElement(0));
 
   // Then "this", which is always the first argument.
   llvm::DIType ThisPtrType;
   const CXXRecordDecl *RD = ThisPtr->getPointeeCXXRecordDecl();
   if (!isa<ClassTemplateSpecializationDecl>(RD)) {
     ThisPtrType = getOrCreateType(ThisPtr, Unit);
   } else {
     // For class template specializations the pointer type is created
     // directly instead of going through getOrCreateType on the pointer.
     const PointerType *ThisPtrTy = cast<PointerType>(ThisPtr);
     QualType PointeeTy = ThisPtrTy->getPointeeType();
     unsigned AS = CGM.getContext().getTargetAddressSpace(PointeeTy);
     uint64_t Size = CGM.getTarget().getPointerWidth(AS);
     uint64_t Align = CGM.getContext().getTypeAlign(ThisPtrTy);
     llvm::DIType PointeeType = getOrCreateType(PointeeTy, Unit);
     ThisPtrType = DBuilder.createPointerType(PointeeType, Size, Align);
   }
   TypeCache[ThisPtr.getAsOpaquePtr()] = ThisPtrType;
   // TODO: This and the artificial type below are misleading, the
   // types aren't artificial the argument is, but the current
   // metadata doesn't represent that.
   Elts.push_back(DBuilder.createObjectPointerType(ThisPtrType));
 
   // Finally the remaining arguments, unchanged.
   for (unsigned Idx = 1, NumArgs = Args.getNumElements(); Idx != NumArgs;
        ++Idx)
     Elts.push_back(Args.getElement(Idx));
 
   return DBuilder.createSubroutineType(Unit, DBuilder.getOrCreateArray(Elts));
 }
 
 /// isFunctionLocalClass - Return true if CXXRecordDecl is defined
 /// inside a function.
 static bool isFunctionLocalClass(const CXXRecordDecl *RD) {
   if (const CXXRecordDecl *NRD = dyn_cast<CXXRecordDecl>(RD->getDeclContext()))
     return isFunctionLocalClass(NRD);
   if (isa<FunctionDecl>(RD->getDeclContext()))
     return true;
   return false;
 }
 
 /// CreateCXXMemberFunction - Create the DISubprogram declaration for the
 /// member function \p Method of the record described by \p RecordTy, and
 /// remember it in SPCache under the method's canonical declaration.
 llvm::DISubprogram
 CGDebugInfo::CreateCXXMemberFunction(const CXXMethodDecl *Method,
                                      llvm::DIFile Unit,
                                      llvm::DIType RecordTy) {
   StringRef MethodName = getFunctionName(Method);
   llvm::DICompositeType MethodTy = getOrCreateMethodType(Method, Unit);
 
   // Since a single ctor/dtor corresponds to multiple functions, it doesn't
   // make sense to give a single ctor/dtor a linkage name. Methods of
   // function-local classes get no linkage name either.
   const bool IsCtorOrDtor =
     isa<CXXConstructorDecl>(Method) || isa<CXXDestructorDecl>(Method);
   StringRef MethodLinkageName;
   if (!(IsCtorOrDtor || isFunctionLocalClass(Method->getParent())))
     MethodLinkageName = CGM.getMangledName(Method);
 
   // Implicit methods have no source location of their own.
   const bool IsImplicit = Method->isImplicit();
   llvm::DIFile MethodDefUnit;
   unsigned MethodLine = 0;
   if (!IsImplicit) {
     MethodDefUnit = getOrCreateFile(Method->getLocation());
     MethodLine = getLineNumber(Method->getLocation());
   }
 
   // Virtual method info.
   llvm::DIType ContainingType;
   unsigned Virtuality = 0;
   unsigned VIndex = 0;
   if (Method->isVirtual()) {
     Virtuality = Method->isPure() ? llvm::dwarf::DW_VIRTUALITY_pure_virtual
                                   : llvm::dwarf::DW_VIRTUALITY_virtual;
 
     // It doesn't make sense to give a virtual destructor a vtable index,
     // since a single destructor has two entries in the vtable.
     // FIXME: Add proper support for debug info for virtual calls in
     // the Microsoft ABI, where we may use multiple vptrs to make a vftable
     // lookup if we have multiple or virtual inheritance.
     const bool HasVTableIndex = !isa<CXXDestructorDecl>(Method) &&
                                 !CGM.getTarget().getCXXABI().isMicrosoft();
     if (HasVTableIndex)
       VIndex = CGM.getItaniumVTableContext().getMethodVTableIndex(Method);
     ContainingType = RecordTy;
   }
 
   unsigned Flags = 0;
   if (IsImplicit)
     Flags |= llvm::DIDescriptor::FlagArtificial;
 
   switch (Method->getAccess()) {
   case clang::AS_private:
     Flags |= llvm::DIDescriptor::FlagPrivate;
     break;
   case clang::AS_protected:
     Flags |= llvm::DIDescriptor::FlagProtected;
     break;
   default:
     break;
   }
 
   // Only constructors and conversion functions can be 'explicit'.
   bool IsExplicit = false;
   if (const CXXConstructorDecl *Ctor = dyn_cast<CXXConstructorDecl>(Method))
     IsExplicit = Ctor->isExplicit();
   else if (const CXXConversionDecl *Conv =
                dyn_cast<CXXConversionDecl>(Method))
     IsExplicit = Conv->isExplicit();
   if (IsExplicit)
     Flags |= llvm::DIDescriptor::FlagExplicit;
 
   if (Method->hasPrototype())
     Flags |= llvm::DIDescriptor::FlagPrototyped;
 
   llvm::DIArray TParamsArray = CollectFunctionTemplateParams(Method, Unit);
   llvm::DISubprogram Decl =
     DBuilder.createMethod(RecordTy, MethodName, MethodLinkageName,
                           MethodDefUnit, MethodLine,
                           MethodTy, /*isLocalToUnit=*/false,
                           /* isDefinition=*/ false,
                           Virtuality, VIndex, ContainingType,
                           Flags, CGM.getLangOpts().Optimize, NULL,
                           TParamsArray);
 
   SPCache[Method->getCanonicalDecl()] = llvm::WeakVH(Decl);
   return Decl;
 }
 
 /// CollectCXXMemberFunctions - A helper function to collect debug info for
 /// C++ member functions. This is used while creating debug info entry for
 /// a Record.
 void CGDebugInfo::
 CollectCXXMemberFunctions(const CXXRecordDecl *RD, llvm::DIFile Unit,
                           SmallVectorImpl<llvm::Value *> &EltTys,
                           llvm::DIType RecordTy) {
 
   // Since we want more than just the individual member decls if we
   // have templated functions iterate over every declaration to gather
   // the functions.
   for(DeclContext::decl_iterator I = RD->decls_begin(),
         E = RD->decls_end(); I != E; ++I) {
     if (const CXXMethodDecl *Method = dyn_cast<CXXMethodDecl>(*I)) {
       // Reuse the existing member function declaration if it exists.
       // It may be associated with the declaration of the type & should be
       // reused as we're building the definition.
       //
       // This situation can arise in the vtable-based debug info reduction where
       // implicit members are emitted in a non-vtable TU.
       llvm::DenseMap<const FunctionDecl *, llvm::WeakVH>::iterator MI =
           SPCache.find(Method->getCanonicalDecl());
       if (MI == SPCache.end()) {
         // If the member is implicit, lazily create it when we see the
         // definition, not before. (an ODR-used implicit default ctor that's
         // never actually code generated should not produce debug info)
         if (!Method->isImplicit())
           EltTys.push_back(CreateCXXMemberFunction(Method, Unit, RecordTy));
       } else
         EltTys.push_back(MI->second);
     } else if (const FunctionTemplateDecl *FTD =
                    dyn_cast<FunctionTemplateDecl>(*I)) {
       // Add any template specializations that have already been seen. Like
       // implicit member functions, these may have been added to a declaration
       // in the case of vtable-based debug info reduction.
       for (FunctionTemplateDecl::spec_iterator SI = FTD->spec_begin(),
                                                SE = FTD->spec_end();
            SI != SE; ++SI) {
         llvm::DenseMap<const FunctionDecl *, llvm::WeakVH>::iterator MI =
             SPCache.find(cast<CXXMethodDecl>(*SI)->getCanonicalDecl());
         if (MI != SPCache.end())
           EltTys.push_back(MI->second);
       }
     }
   }
 }
 
 /// CollectCXXBases - A helper function to collect debug info for
 /// C++ base classes. This is used while creating debug info entry for
 /// a Record.
 void CGDebugInfo::
 CollectCXXBases(const CXXRecordDecl *RD, llvm::DIFile Unit,
                 SmallVectorImpl<llvm::Value *> &EltTys,
                 llvm::DIType RecordTy) {
 
   const ASTRecordLayout &RL = CGM.getContext().getASTRecordLayout(RD);
   for (CXXRecordDecl::base_class_const_iterator BI = RD->bases_begin(),
          BE = RD->bases_end(); BI != BE; ++BI) {
     unsigned BFlags = 0;
     uint64_t BaseOffset;
 
     const CXXRecordDecl *Base =
       cast<CXXRecordDecl>(BI->getType()->getAs<RecordType>()->getDecl());
 
     if (BI->isVirtual()) {
       // virtual base offset offset is -ve. The code generator emits dwarf
       // expression where it expects +ve number.
       BaseOffset =
         0 - CGM.getItaniumVTableContext()
                .getVirtualBaseOffsetOffset(RD, Base).getQuantity();
       BFlags = llvm::DIDescriptor::FlagVirtual;
     } else
       BaseOffset = CGM.getContext().toBits(RL.getBaseClassOffset(Base));
     // FIXME: Inconsistent units for BaseOffset. It is in bytes when
     // BI->isVirtual() and bits when not.
 
     AccessSpecifier Access = BI->getAccessSpecifier();
     if (Access == clang::AS_private)
       BFlags |= llvm::DIDescriptor::FlagPrivate;
     else if (Access == clang::AS_protected)
       BFlags |= llvm::DIDescriptor::FlagProtected;
 
     llvm::DIType DTy =
       DBuilder.createInheritance(RecordTy,
                                  getOrCreateType(BI->getType(), Unit),
                                  BaseOffset, BFlags);
     EltTys.push_back(DTy);
   }
 }
 
 /// CollectTemplateParams - A helper function to collect template parameters.
 ///
 /// Walks the template arguments in \p TAList and produces one debug-info
 /// descriptor per argument, returned as a DIArray.
 ///
 /// \param TPList  Optional parameter list. When non-null, the name of the
 ///                i-th parameter is used for the i-th argument; when null
 ///                (as in the recursive call for packs) names are empty.
 /// \param TAList  The template arguments to describe.
 /// \param Unit    File used when creating the argument types.
 llvm::DIArray CGDebugInfo::
 CollectTemplateParams(const TemplateParameterList *TPList,
                       ArrayRef<TemplateArgument> TAList,
                       llvm::DIFile Unit) {
   SmallVector<llvm::Value *, 16> TemplateParams;
   for (unsigned i = 0, e = TAList.size(); i != e; ++i) {
     const TemplateArgument &TA = TAList[i];
     StringRef Name;
     if (TPList)
       Name = TPList->getParam(i)->getName();
     switch (TA.getKind()) {
     // Type argument: a template type parameter referring to the type.
     case TemplateArgument::Type: {
       llvm::DIType TTy = getOrCreateType(TA.getAsType(), Unit);
       llvm::DITemplateTypeParameter TTP =
           DBuilder.createTemplateTypeParameter(TheCU, Name, TTy);
       TemplateParams.push_back(TTP);
     } break;
     // Integral argument: a value parameter carrying the constant integer.
     case TemplateArgument::Integral: {
       llvm::DIType TTy = getOrCreateType(TA.getIntegralType(), Unit);
       llvm::DITemplateValueParameter TVP =
           DBuilder.createTemplateValueParameter(
               TheCU, Name, TTy,
               llvm::ConstantInt::get(CGM.getLLVMContext(), TA.getAsIntegral()));
       TemplateParams.push_back(TVP);
     } break;
     // Declaration argument: a pointer (or member pointer, for C++ instance
     // members) to the referenced declaration.
     case TemplateArgument::Declaration: {
       const ValueDecl *D = TA.getAsDecl();
       bool InstanceMember = D->isCXXInstanceMember();
       QualType T = InstanceMember
                        ? CGM.getContext().getMemberPointerType(
                              D->getType(), cast<RecordDecl>(D->getDeclContext())
                                                ->getTypeForDecl())
                        : CGM.getContext().getPointerType(D->getType());
       llvm::DIType TTy = getOrCreateType(T, Unit);
       llvm::Value *V = 0;
       // Variable pointer template parameters have a value that is the address
       // of the variable.
       if (const VarDecl *VD = dyn_cast<VarDecl>(D))
         V = CGM.GetAddrOfGlobalVar(VD);
       // Member function pointers have special support for building them, though
       // this is currently unsupported in LLVM CodeGen.
       if (InstanceMember) {
         if (const CXXMethodDecl *method = dyn_cast<CXXMethodDecl>(D))
           V = CGM.getCXXABI().EmitMemberPointer(method);
       } else if (const FunctionDecl *FD = dyn_cast<FunctionDecl>(D))
         V = CGM.GetAddrOfFunction(FD);
       // Member data pointers have special handling too to compute the fixed
       // offset within the object.
-      if (isa<FieldDecl>(D)) {
+      if (isa<FieldDecl>(D) || isa<IndirectFieldDecl>(D)) {
         // These five lines (& possibly the above member function pointer
         // handling) might be able to be refactored to use similar code in
         // CodeGenModule::getMemberPointerConstant
         uint64_t fieldOffset = CGM.getContext().getFieldOffset(D);
         CharUnits chars =
             CGM.getContext().toCharUnitsFromBits((int64_t) fieldOffset);
         V = CGM.getCXXABI().EmitMemberDataPointer(
             cast<MemberPointerType>(T.getTypePtr()), chars);
       }
       // NOTE(review): V is dereferenced unconditionally here; it stays null
       // if none of the declaration kinds handled above matched - confirm
       // that every possible declaration argument is covered.
       llvm::DITemplateValueParameter TVP =
           DBuilder.createTemplateValueParameter(TheCU, Name, TTy,
                                                 V->stripPointerCasts());
       TemplateParams.push_back(TVP);
     } break;
     // Null pointer argument: a value parameter holding the null value of
     // the pointer type.
     case TemplateArgument::NullPtr: {
       QualType T = TA.getNullPtrType();
       llvm::DIType TTy = getOrCreateType(T, Unit);
       llvm::Value *V = 0;
       // Special case member data pointer null values since they're actually -1
       // instead of zero.
       if (const MemberPointerType *MPT =
               dyn_cast<MemberPointerType>(T.getTypePtr()))
         // But treat member function pointers as simple zero integers because
         // it's easier than having a special case in LLVM's CodeGen. If LLVM
         // CodeGen grows handling for values of non-null member function
         // pointers then perhaps we could remove this special case and rely on
         // EmitNullMemberPointer for member function pointers.
         if (MPT->isMemberDataPointer())
           V = CGM.getCXXABI().EmitNullMemberPointer(MPT);
       // Everything else is described by an 8-bit zero constant.
       if (!V)
         V = llvm::ConstantInt::get(CGM.Int8Ty, 0);
       llvm::DITemplateValueParameter TVP =
           DBuilder.createTemplateValueParameter(TheCU, Name, TTy, V);
       TemplateParams.push_back(TVP);
     } break;
     // Template template argument: described by the qualified name of the
     // template, with no type attached.
     case TemplateArgument::Template: {
       llvm::DITemplateValueParameter TVP =
           DBuilder.createTemplateTemplateParameter(
               TheCU, Name, llvm::DIType(),
               TA.getAsTemplate().getAsTemplateDecl()
                   ->getQualifiedNameAsString());
       TemplateParams.push_back(TVP);
     } break;
     // Argument pack: the pack elements are collected recursively, without
     // a parameter list and therefore without names.
     case TemplateArgument::Pack: {
       llvm::DITemplateValueParameter TVP =
           DBuilder.createTemplateParameterPack(
               TheCU, Name, llvm::DIType(),
               CollectTemplateParams(NULL, TA.getPackAsArray(), Unit));
       TemplateParams.push_back(TVP);
     } break;
     // Expression argument: emitted as a constant of the expression's type.
     case TemplateArgument::Expression: {
       const Expr *E = TA.getAsExpr();
       QualType T = E->getType();
       llvm::Value *V = CGM.EmitConstantExpr(E, T);
       assert(V && "Expression in template argument isn't constant");
       llvm::DIType TTy = getOrCreateType(T, Unit);
       llvm::DITemplateValueParameter TVP =
           DBuilder.createTemplateValueParameter(TheCU, Name, TTy,
                                                 V->stripPointerCasts());
       TemplateParams.push_back(TVP);
     } break;
     // And the following should never occur:
     case TemplateArgument::TemplateExpansion:
     case TemplateArgument::Null:
       llvm_unreachable(
           "These argument types shouldn't exist in concrete types");
     }
   }
   return DBuilder.getOrCreateArray(TemplateParams);
 }
 
 /// CollectFunctionTemplateParams - Collect debug info for the template
 /// parameters of \p FD. Returns an empty array unless \p FD is a function
 /// template specialization.
 llvm::DIArray CGDebugInfo::
 CollectFunctionTemplateParams(const FunctionDecl *FD, llvm::DIFile Unit) {
   const bool IsSpecialization =
     FD->getTemplatedKind() == FunctionDecl::TK_FunctionTemplateSpecialization;
   if (!IsSpecialization)
     return llvm::DIArray();
 
   // Parameter names come from the specialized template, the values from the
   // specialization's own arguments.
   const TemplateParameterList *Params =
     FD->getTemplateSpecializationInfo()->getTemplate()
     ->getTemplateParameters();
   return CollectTemplateParams(
       Params, FD->getTemplateSpecializationArgs()->asArray(), Unit);
 }
 
 /// CollectCXXTemplateParams - Collect debug info for the template
 /// parameters of the class template specialization \p TSpecial.
 llvm::DIArray CGDebugInfo::
 CollectCXXTemplateParams(const ClassTemplateSpecializationDecl *TSpecial,
                          llvm::DIFile Unit) {
   llvm::PointerUnion<ClassTemplateDecl *,
                      ClassTemplatePartialSpecializationDecl *>
     Origin = TSpecial->getSpecializedTemplateOrPartial();
 
   // The parameter list comes from whichever entity was specialized: the
   // primary template or a partial specialization.
   TemplateParameterList *Params;
   if (Origin.is<ClassTemplateDecl *>())
     Params = Origin.get<ClassTemplateDecl *>()->getTemplateParameters();
   else
     Params = Origin.get<ClassTemplatePartialSpecializationDecl *>()
                  ->getTemplateParameters();
 
   const TemplateArgumentList &Args = TSpecial->getTemplateInstantiationArgs();
   return CollectTemplateParams(Params, Args.asArray(), Unit);
 }
 
 /// getOrCreateVTablePtrType - Return the debug info descriptor for the
 /// vtable pointer type, creating and caching it in VTablePtrType on first
 /// use.
 llvm::DIType CGDebugInfo::getOrCreateVTablePtrType(llvm::DIFile Unit) {
   if (!VTablePtrType.isValid()) {
     ASTContext &Ctx = CGM.getContext();
 
     // Subroutine type whose only element is 'int'.
     llvm::Value *IntElt = getOrCreateType(Ctx.IntTy, Unit);
     llvm::DIArray FnElements = DBuilder.getOrCreateArray(IntElt);
     llvm::DIType FnTy = DBuilder.createSubroutineType(Unit, FnElements);
 
     // "__vtbl_ptr_type" points at that function type; the vtable pointer
     // itself points at "__vtbl_ptr_type". Both are void*-sized.
     unsigned PtrBits = Ctx.getTypeSize(Ctx.VoidPtrTy);
     llvm::DIType VtblEntryPtr = DBuilder.createPointerType(FnTy, PtrBits, 0,
                                                            "__vtbl_ptr_type");
     VTablePtrType = DBuilder.createPointerType(VtblEntryPtr, PtrBits);
   }
   return VTablePtrType;
 }
 
 /// getVTableName - Get the vtable pointer member name for class \p RD:
 /// "_vptr$" followed by the class name.
 StringRef CGDebugInfo::getVTableName(const CXXRecordDecl *RD) {
   // The gdb compatible name is interned on the side and a reference to the
   // interned copy is returned.
   const std::string ClassName = RD->getNameAsString();
   return internString("_vptr$", ClassName);
 }
 
 
 /// CollectVTableInfo - If the C++ class has vtable info then insert appropriate
 /// debug info entry in EltTys vector.
 void CGDebugInfo::
 CollectVTableInfo(const CXXRecordDecl *RD, llvm::DIFile Unit,
                   SmallVectorImpl<llvm::Value *> &EltTys) {
   const ASTRecordLayout &RL = CGM.getContext().getASTRecordLayout(RD);
 
   // If there is a primary base then it will hold vtable info.
   if (RL.getPrimaryBase())
     return;
 
   // If this class is not dynamic then there is not any vtable info to collect.
   if (!RD->isDynamicClass())
     return;
 
   unsigned Size = CGM.getContext().getTypeSize(CGM.getContext().VoidPtrTy);
   llvm::DIType VPTR
     = DBuilder.createMemberType(Unit, getVTableName(RD), Unit,
                                 0, Size, 0, 0,
                                 llvm::DIDescriptor::FlagArtificial,
                                 getOrCreateVTablePtrType(Unit));
   EltTys.push_back(VPTR);
 }
 
 /// getOrCreateRecordType - Emit standalone debug info for the record type
 /// \p RTy, using the file of \p Loc as the unit.
 llvm::DIType CGDebugInfo::getOrCreateRecordType(QualType RTy,
                                                 SourceLocation Loc) {
   assert(DebugKind >= CodeGenOptions::LimitedDebugInfo);
   llvm::DIFile Unit = getOrCreateFile(Loc);
   return getOrCreateType(RTy, Unit);
 }
 
 /// getOrCreateInterfaceType - Emit standalone debug info for the
 /// Objective-C interface type \p D and add it to RetainedTypes.
 llvm::DIType CGDebugInfo::getOrCreateInterfaceType(QualType D,
                                                    SourceLocation Loc) {
   assert(DebugKind >= CodeGenOptions::LimitedDebugInfo);
   llvm::DIFile Unit = getOrCreateFile(Loc);
   llvm::DIType InterfaceTy = getOrCreateType(D, Unit);
   RetainedTypes.push_back(D.getAsOpaquePtr());
   return InterfaceTy;
 }
 
 /// completeType - Forward \p RD to completeRequiredType, except when
 /// compiling C++ with debug info at or below the limited level.
 void CGDebugInfo::completeType(const RecordDecl *RD) {
   if (DebugKind <= CodeGenOptions::LimitedDebugInfo &&
       CGM.getLangOpts().CPlusPlus)
     return;
   completeRequiredType(RD);
 }
 
 void CGDebugInfo::completeRequiredType(const RecordDecl *RD) {
   if (const CXXRecordDecl *CXXDecl = dyn_cast<CXXRecordDecl>(RD))
     if (CXXDecl->isDynamicClass())
       return;
 
   QualType Ty = CGM.getContext().getRecordType(RD);
   llvm::DIType T = getTypeOrNull(Ty);
   if (T && T.isForwardDecl())
     completeClassData(RD);
 }
 
 void CGDebugInfo::completeClassData(const RecordDecl *RD) {
   if (DebugKind <= CodeGenOptions::DebugLineTablesOnly)
     return;
   QualType Ty = CGM.getContext().getRecordType(RD);
   void* TyPtr = Ty.getAsOpaquePtr();
   if (CompletedTypeCache.count(TyPtr))
     return;
   llvm::DIType Res = CreateTypeDefinition(Ty->castAs<RecordType>());
   assert(!Res.isForwardDecl());
   CompletedTypeCache[TyPtr] = Res;
   TypeCache[TyPtr] = Res;
 }
 
 /// CreateType - get structure or union type. Returns the already emitted
 /// type when there is one, a forward declaration when only a declaration is
 /// wanted, and the full definition otherwise.
 llvm::DIType CGDebugInfo::CreateType(const RecordType *Ty) {
   RecordDecl *RD = Ty->getDecl();
   const CXXRecordDecl *CXXDecl = dyn_cast<CXXRecordDecl>(RD);
 
   // If we've already emitted the type, just use that, even if it's only a
   // declaration. The completeType, completeRequiredType, and
   // completeClassData callbacks will handle promoting the declaration to a
   // definition.
   llvm::DICompositeType T(getTypeOrNull(QualType(Ty, 0)));
   if (!T) {
     // Outside -flimit-debug-info the definition is always emitted.
     if (DebugKind > CodeGenOptions::LimitedDebugInfo)
       return CreateTypeDefinition(Ty);
 
     // Under -flimit-debug-info, always emit declarations for types that
     // aren't required to be complete. If the type is later found to be
     // required to be complete this declaration will be upgraded to a
     // definition by `completeRequiredType`.
     const bool NotRequired =
       !RD->isCompleteDefinitionRequired() && CGM.getLangOpts().CPlusPlus;
     if (!NotRequired) {
       // If the class is dynamic, only emit a declaration. A definition will
       // be emitted whenever the vtable is emitted; this is handled by
       // `completeClassData`.
       const bool DynamicWithDefinition =
         CXXDecl && CXXDecl->hasDefinition() && CXXDecl->isDynamicClass();
       if (!DynamicWithDefinition)
         return CreateTypeDefinition(Ty);
     }
   }
 
   llvm::DIDescriptor FDContext =
     getContextDescriptor(cast<Decl>(RD->getDeclContext()));
   if (!T)
     T = getOrCreateRecordFwdDecl(Ty, FDContext);
   return T;
 }
 
 /// CreateTypeDefinition - Build the debug info for the record type \p Ty,
 /// filling in its bases, vtable pointer, fields and member functions.
 /// Returns the limited type unchanged if it is (still) only a forward
 /// declaration.
 llvm::DIType CGDebugInfo::CreateTypeDefinition(const RecordType *Ty) {
   RecordDecl *RD = Ty->getDecl();
 
   // Get overall information about the record type for the debug info.
   llvm::DIFile DefUnit = getOrCreateFile(RD->getLocation());
 
   // Records and classes and unions can all be recursive.  To handle them, we
   // first generate a debug descriptor for the struct as a forward declaration.
   // Then (if it is a definition) we go through and get debug info for all of
   // its members.  Finally, we create a descriptor for the complete type (which
   // may refer to the forward decl if the struct is recursive) and replace all
   // uses of the forward declaration with the final definition.
 
   llvm::DICompositeType FwdDecl(getOrCreateLimitedType(Ty, DefUnit));
   assert(FwdDecl.isCompositeType() &&
          "The debug type of a RecordType should be a llvm::DICompositeType");
 
   // Nothing more to do if only a forward declaration could be produced.
   if (FwdDecl.isForwardDecl())
     return FwdDecl;
 
   if (const CXXRecordDecl *CXXDecl = dyn_cast<CXXRecordDecl>(RD))
     CollectContainingType(CXXDecl, FwdDecl);
 
   // Push the struct on region stack.
   LexicalBlockStack.push_back(&*FwdDecl);
   RegionMap[Ty->getDecl()] = llvm::WeakVH(FwdDecl);
 
   // Add this to the completed-type cache while we're completing it recursively.
   CompletedTypeCache[QualType(Ty, 0).getAsOpaquePtr()] = FwdDecl;
 
   // Convert all the elements.
   SmallVector<llvm::Value *, 16> EltTys;
   // what about nested types?
 
   // Note: The split of CXXDecl information here is intentional, the
   // gdb tests will depend on a certain ordering at printout. The debug
   // information offsets are still correct if we merge them all together
   // though.
   const CXXRecordDecl *CXXDecl = dyn_cast<CXXRecordDecl>(RD);
   if (CXXDecl) {
     CollectCXXBases(CXXDecl, DefUnit, EltTys, FwdDecl);
     CollectVTableInfo(CXXDecl, DefUnit, EltTys);
   }
 
   // Collect data fields (including static variables and any initializers).
   CollectRecordFields(RD, DefUnit, EltTys, FwdDecl);
   if (CXXDecl)
     CollectCXXMemberFunctions(CXXDecl, DefUnit, EltTys, FwdDecl);
 
   // Pop the struct off the region stack and drop its region entry now that
   // all members have been collected.
   LexicalBlockStack.pop_back();
   RegionMap.erase(Ty->getDecl());
 
   // Attach the collected members to the type.
   llvm::DIArray Elements = DBuilder.getOrCreateArray(EltTys);
   FwdDecl.setTypeArray(Elements);
 
   // Re-register the record's region entry after the type array is set.
   RegionMap[Ty->getDecl()] = llvm::WeakVH(FwdDecl);
   return FwdDecl;
 }
 
 /// CreateType - Describe an Objective-C object type by describing its base
 /// type.
 llvm::DIType CGDebugInfo::CreateType(const ObjCObjectType *Ty,
                                      llvm::DIFile Unit) {
   // Protocol qualifiers are ignored; only the base type is emitted.
   QualType BaseTy = Ty->getBaseType();
   return getOrCreateType(BaseTy, Unit);
 }
 
 
 /// \return true if Getter has the default name for the property PD.
 static bool hasDefaultGetterName(const ObjCPropertyDecl *PD,
                                  const ObjCMethodDecl *Getter) {
   assert(PD);
   if (!Getter)
     return true;
 
   assert(Getter->getDeclName().isObjCZeroArgSelector());
   return PD->getName() ==
     Getter->getDeclName().getObjCSelector().getNameForSlot(0);
 }
 
 /// \return true if Setter has the default name for the property PD.
 static bool hasDefaultSetterName(const ObjCPropertyDecl *PD,
                                  const ObjCMethodDecl *Setter) {
   assert(PD);
   if (!Setter)
     return true;
 
   assert(Setter->getDeclName().isObjCOneArgSelector());
   return SelectorTable::constructSetterName(PD->getName()) ==
     Setter->getDeclName().getObjCSelector().getNameForSlot(0);
 }
 
 /// CreateType - get objective-c interface type.
 ///
 /// Emits a struct-type descriptor for the interface \p Ty containing an
 /// inheritance entry for the super class (if any), one entry per declared
 /// property, and one entry per named ivar.  When the interface has no
 /// definition, a forward declaration is returned instead.
 ///
 /// \param Ty   the interface type to describe.
 /// \param Unit the file used as scope for the struct and for the types of
 ///             the super class and ivars.
 /// \return the struct descriptor, a forward declaration, or a null DIType
 ///         when the interface has no declaration or when the type of the
 ///         super class or of an ivar cannot be described.
 llvm::DIType CGDebugInfo::CreateType(const ObjCInterfaceType *Ty,
                                      llvm::DIFile Unit) {
   ObjCInterfaceDecl *ID = Ty->getDecl();
   if (!ID)
     return llvm::DIType();
 
   // Get overall information about the record type for the debug info.
   llvm::DIFile DefUnit = getOrCreateFile(ID->getLocation());
   unsigned Line = getLineNumber(ID->getLocation());
   unsigned RuntimeLang = TheCU.getLanguage();
 
   // If this is just a forward declaration return a special forward-declaration
   // debug type since we won't be able to lay out the entire type.
   ObjCInterfaceDecl *Def = ID->getDefinition();
   if (!Def) {
     llvm::DIType FwdDecl =
       DBuilder.createForwardDecl(llvm::dwarf::DW_TAG_structure_type,
                                  ID->getName(), TheCU, DefUnit, Line,
                                  RuntimeLang);
     return FwdDecl;
   }
 
   // From here on work with the defining declaration.
   ID = Def;
 
   // Bit size, align and offset of the type.
   uint64_t Size = CGM.getContext().getTypeSize(Ty);
   uint64_t Align = CGM.getContext().getTypeAlign(Ty);
 
   // Mark the class as complete only when its implementation is visible.
   unsigned Flags = 0;
   if (ID->getImplementation())
     Flags |= llvm::DIDescriptor::FlagObjcClassComplete;
 
   llvm::DICompositeType RealDecl =
     DBuilder.createStructType(Unit, ID->getName(), DefUnit,
                               Line, Size, Align, Flags,
                               llvm::DIType(), llvm::DIArray(), RuntimeLang);
 
   // Otherwise, insert it into the CompletedTypeCache so that recursive uses
   // will find it and we're emitting the complete type.
   QualType QualTy = QualType(Ty, 0);
   CompletedTypeCache[QualTy.getAsOpaquePtr()] = RealDecl;
 
   // Push the struct on region stack.  Every exit path below must pop it
   // again so that the stack stays balanced.
   LexicalBlockStack.push_back(static_cast<llvm::MDNode*>(RealDecl));
   RegionMap[Ty->getDecl()] = llvm::WeakVH(RealDecl);
 
   // Convert all the elements.
   SmallVector<llvm::Value *, 16> EltTys;
 
   ObjCInterfaceDecl *SClass = ID->getSuperClass();
   if (SClass) {
     llvm::DIType SClassTy =
       getOrCreateType(CGM.getContext().getObjCInterfaceType(SClass), Unit);
     if (!SClassTy.isValid()) {
       // Bail out, but do not leave our entry behind on the region stack.
       // NOTE(review): the CompletedTypeCache entry made above is left in
       // place on this path, as before - confirm whether it should be erased.
       LexicalBlockStack.pop_back();
       return llvm::DIType();
     }
 
     llvm::DIType InhTag =
       DBuilder.createInheritance(RealDecl, SClassTy, 0, 0);
     EltTys.push_back(InhTag);
   }
 
   // Create entries for all of the properties.
   for (ObjCContainerDecl::prop_iterator I = ID->prop_begin(),
          E = ID->prop_end(); I != E; ++I) {
     const ObjCPropertyDecl *PD = *I;
     SourceLocation Loc = PD->getLocation();
     llvm::DIFile PUnit = getOrCreateFile(Loc);
     unsigned PLine = getLineNumber(Loc);
     ObjCMethodDecl *Getter = PD->getGetterMethodDecl();
     ObjCMethodDecl *Setter = PD->getSetterMethodDecl();
     // Accessor names are only recorded when they differ from the defaults.
     llvm::MDNode *PropertyNode =
       DBuilder.createObjCProperty(PD->getName(),
                                   PUnit, PLine,
                                   hasDefaultGetterName(PD, Getter) ? "" :
                                   getSelectorName(PD->getGetterName()),
                                   hasDefaultSetterName(PD, Setter) ? "" :
                                   getSelectorName(PD->getSetterName()),
                                   PD->getPropertyAttributes(),
                                   getOrCreateType(PD->getType(), PUnit));
     EltTys.push_back(PropertyNode);
   }
 
   const ASTRecordLayout &RL = CGM.getContext().getASTObjCInterfaceLayout(ID);
   // FieldNo counts every ivar, including the unnamed ones skipped below, so
   // that it stays in step with the layout's field indices.
   unsigned FieldNo = 0;
   for (ObjCIvarDecl *Field = ID->all_declared_ivar_begin(); Field;
        Field = Field->getNextIvar(), ++FieldNo) {
     llvm::DIType FieldTy = getOrCreateType(Field->getType(), Unit);
     if (!FieldTy.isValid()) {
       // Bail out, but do not leave our entry behind on the region stack.
       LexicalBlockStack.pop_back();
       return llvm::DIType();
     }
 
     StringRef FieldName = Field->getName();
 
     // Ignore unnamed fields.
     if (FieldName.empty())
       continue;
 
     // Get the location for the field.
     llvm::DIFile FieldDefUnit = getOrCreateFile(Field->getLocation());
     unsigned FieldLine = getLineNumber(Field->getLocation());
     QualType FType = Field->getType();
     uint64_t FieldSize = 0;
     unsigned FieldAlign = 0;
 
     // Incomplete arrays keep a size and alignment of zero.
     if (!FType->isIncompleteArrayType()) {
 
       // Bit size, align and offset of the type.
       FieldSize = Field->isBitField()
                       ? Field->getBitWidthValue(CGM.getContext())
                       : CGM.getContext().getTypeSize(FType);
       FieldAlign = CGM.getContext().getTypeAlign(FType);
     }
 
     uint64_t FieldOffset;
     if (CGM.getLangOpts().ObjCRuntime.isNonFragile()) {
       // We don't know the runtime offset of an ivar if we're using the
       // non-fragile ABI.  For bitfields, use the bit offset into the first
       // byte of storage of the bitfield.  For other fields, use zero.
       if (Field->isBitField()) {
         FieldOffset = CGM.getObjCRuntime().ComputeBitfieldBitOffset(
             CGM, ID, Field);
         FieldOffset %= CGM.getContext().getCharWidth();
       } else {
         FieldOffset = 0;
       }
     } else {
       FieldOffset = RL.getFieldOffset(FieldNo);
     }
 
     // Translate the ivar's access control into descriptor flags.
     unsigned Flags = 0;
     if (Field->getAccessControl() == ObjCIvarDecl::Protected)
       Flags = llvm::DIDescriptor::FlagProtected;
     else if (Field->getAccessControl() == ObjCIvarDecl::Private)
       Flags = llvm::DIDescriptor::FlagPrivate;
 
     // If the implementation backs a property with this ivar, describe that
     // property so it can be attached to the ivar entry.
     llvm::MDNode *PropertyNode = NULL;
     if (ObjCImplementationDecl *ImpD = ID->getImplementation()) {
       if (ObjCPropertyImplDecl *PImpD =
           ImpD->FindPropertyImplIvarDecl(Field->getIdentifier())) {
         if (ObjCPropertyDecl *PD = PImpD->getPropertyDecl()) {
           SourceLocation Loc = PD->getLocation();
           llvm::DIFile PUnit = getOrCreateFile(Loc);
           unsigned PLine = getLineNumber(Loc);
           ObjCMethodDecl *Getter = PD->getGetterMethodDecl();
           ObjCMethodDecl *Setter = PD->getSetterMethodDecl();
           PropertyNode =
             DBuilder.createObjCProperty(PD->getName(),
                                         PUnit, PLine,
                                         hasDefaultGetterName(PD, Getter) ? "" :
                                         getSelectorName(PD->getGetterName()),
                                         hasDefaultSetterName(PD, Setter) ? "" :
                                         getSelectorName(PD->getSetterName()),
                                         PD->getPropertyAttributes(),
                                         getOrCreateType(PD->getType(), PUnit));
         }
       }
     }
     FieldTy = DBuilder.createObjCIVar(FieldName, FieldDefUnit,
                                       FieldLine, FieldSize, FieldAlign,
                                       FieldOffset, Flags, FieldTy,
                                       PropertyNode);
     EltTys.push_back(FieldTy);
   }
 
   llvm::DIArray Elements = DBuilder.getOrCreateArray(EltTys);
   RealDecl.setTypeArray(Elements);
 
   // If the implementation is not yet set, we do not want to mark it
   // as complete. An implementation may declare additional
   // private ivars that we would miss otherwise.
   if (ID->getImplementation() == 0)
     CompletedTypeCache.erase(QualTy.getAsOpaquePtr());
 
   LexicalBlockStack.pop_back();
   return RealDecl;
 }
 
 /// CreateType - Describe a vector type as a vector of its element type with
 /// a single subrange starting at 0.
 llvm::DIType CGDebugInfo::CreateType(const VectorType *Ty, llvm::DIFile Unit) {
   llvm::DIType ElementTy = getOrCreateType(Ty->getElementType(), Unit);
 
   // A vector whose number of elements is not known is treated as an
   // unbounded array, which is expressed with a count of -1.
   int64_t NumElts = Ty->getNumElements();
   int64_t Count = (NumElts == 0) ? -1 : NumElts;
 
   llvm::Value *Subscript = DBuilder.getOrCreateSubrange(0, Count);
   llvm::DIArray SubscriptArray = DBuilder.getOrCreateArray(Subscript);
 
   ASTContext &Ctx = CGM.getContext();
   uint64_t Size = Ctx.getTypeSize(Ty);
   uint64_t Align = Ctx.getTypeAlign(Ty);
 
   return DBuilder.createVectorType(Size, Align, ElementTy, SubscriptArray);
 }
 
 /// CreateType - Describe an array type.  Directly nested array types are
 /// flattened into one descriptor with one subrange per dimension; the
 /// element type is whatever remains once no further array type is found.
 llvm::DIType CGDebugInfo::CreateType(const ArrayType *Ty,
                                      llvm::DIFile Unit) {
   ASTContext &Ctx = CGM.getContext();
 
   // Size and alignment stay zero wherever they cannot be computed.
   uint64_t Size = 0;
   uint64_t Align = 0;
 
   // FIXME: make getTypeAlign() aware of VLAs and incomplete array types
   if (const VariableArrayType *VAT = dyn_cast<VariableArrayType>(Ty)) {
     // VLA: no size; alignment of the base element type.
     Align = Ctx.getTypeAlign(Ctx.getBaseElementType(VAT));
   } else if (Ty->isIncompleteArrayType()) {
     // Incomplete array: no size; alignment only for a complete element type.
     if (!Ty->getElementType()->isIncompleteType())
       Align = Ctx.getTypeAlign(Ty->getElementType());
   } else if (!Ty->isIncompleteType()) {
     // Size and align of the whole array, not the element type.
     Size = Ctx.getTypeSize(Ty);
     Align = Ctx.getTypeAlign(Ty);
   }
 
   // Add the dimensions of the array.  FIXME: This loses CV qualifiers from
   // interior arrays, do we care?  Why aren't nested arrays represented the
   // obvious/recursive way?
   SmallVector<llvm::Value *, 8> Subscripts;
   QualType EltTy(Ty, 0);
   for (const ArrayType *Dim = Ty; Dim; Dim = dyn_cast<ArrayType>(EltTy)) {
     // A known element count is used as-is; otherwise the count is -1, which
     // marks an unbounded array.  Using -1 (and not 0) for "unknown" keeps a
     // genuine zero-element array representable, e.g.
     //
     //   struct foo {
     //     int x[0];
     //   };
     int64_t Count = -1;
     if (const ConstantArrayType *CAT = dyn_cast<ConstantArrayType>(Dim))
       Count = CAT->getSize().getZExtValue();
 
     // FIXME: Verify this is right for VLAs.
     Subscripts.push_back(DBuilder.getOrCreateSubrange(0, Count));
     EltTy = Dim->getElementType();
   }
 
   llvm::DIArray SubscriptArray = DBuilder.getOrCreateArray(Subscripts);
   llvm::DIType ElementTy = getOrCreateType(EltTy, Unit);
 
   return DBuilder.createArrayType(Size, Align, ElementTy, SubscriptArray);
 }
 
 /// CreateType - Describe an lvalue reference as a pointer-like type tagged
 /// DW_TAG_reference_type.
 llvm::DIType CGDebugInfo::CreateType(const LValueReferenceType *Ty,
                                      llvm::DIFile Unit) {
   QualType Referent = Ty->getPointeeType();
   return CreatePointerLikeType(llvm::dwarf::DW_TAG_reference_type, Ty,
                                Referent, Unit);
 }
 
 /// CreateType - Describe an rvalue reference as a pointer-like type tagged
 /// DW_TAG_rvalue_reference_type.
 llvm::DIType CGDebugInfo::CreateType(const RValueReferenceType *Ty,
                                      llvm::DIFile Unit) {
   QualType Referent = Ty->getPointeeType();
   return CreatePointerLikeType(llvm::dwarf::DW_TAG_rvalue_reference_type, Ty,
                                Referent, Unit);
 }
 
 /// CreateType - Describe a pointer to member.  Data members are described
 /// through the pointee type; member functions through an instance-method
 /// type whose 'this' pointer carries the CVR qualifiers of the pointee.
 llvm::DIType CGDebugInfo::CreateType(const MemberPointerType *Ty,
                                      llvm::DIFile U) {
   llvm::DIType ClassType = getOrCreateType(QualType(Ty->getClass(), 0), U);
   QualType Pointee = Ty->getPointeeType();
 
   llvm::DIType MemberTy;
   if (Pointee->isFunctionType()) {
     // Pointer to the class type, qualified like the member function.
     QualType ThisPtr = CGM.getContext().getPointerType(
         QualType(Ty->getClass(), Pointee.getCVRQualifiers()));
     MemberTy = getOrCreateInstanceMethodType(
         ThisPtr, Pointee->getAs<FunctionProtoType>(), U);
   } else {
     MemberTy = getOrCreateType(Pointee, U);
   }
 
   return DBuilder.createMemberPointerType(MemberTy, ClassType);
 }
 
 /// CreateType - Describe an atomic type as its underlying value type.
 llvm::DIType CGDebugInfo::CreateType(const AtomicType *Ty,
                                      llvm::DIFile U) {
   // The atomic wrapper itself is not represented.
   // FIXME: What is the correct representation?
   QualType ValueTy = Ty->getValueType();
   return getOrCreateType(ValueTy, U);
 }
 
 /// CreateEnumType - Describe an enumeration type.  An enum without a
 /// definition yields a forward declaration; otherwise an enumeration type
 /// listing every enumerator is created.
 llvm::DIType CGDebugInfo::CreateEnumType(const EnumType *Ty) {
   const EnumDecl *ED = Ty->getDecl();
   ASTContext &Ctx = CGM.getContext();
 
   // Size and alignment are only available for a complete type.
   uint64_t Size = 0;
   uint64_t Align = 0;
   const Type *DeclTy = ED->getTypeForDecl();
   if (!DeclTy->isIncompleteType()) {
     Size = Ctx.getTypeSize(DeclTy);
     Align = Ctx.getTypeAlign(DeclTy);
   }
 
   SmallString<256> FullName = getUniqueTagTypeName(Ty, CGM, TheCU);
 
   // Only a declaration is available: emit an appropriately marked forward
   // declaration and stop here.
   if (!ED->getDefinition()) {
     llvm::DIDescriptor EDContext =
       getContextDescriptor(cast<Decl>(ED->getDeclContext()));
     llvm::DIFile FwdUnit = getOrCreateFile(ED->getLocation());
     unsigned FwdLine = getLineNumber(ED->getLocation());
     StringRef EDName = ED->getName();
     return DBuilder.createForwardDecl(llvm::dwarf::DW_TAG_enumeration_type,
                                       EDName, EDContext, FwdUnit, FwdLine, 0,
                                       Size, Align, FullName);
   }
 
   // From here on work with the defining declaration.
   SmallVector<llvm::Value *, 16> Enumerators;
   ED = ED->getDefinition();
 
   // One DIEnumerator per enumerator, using the sign-extended init value.
   EnumDecl::enumerator_iterator EnumEnd = ED->enumerator_end();
   for (EnumDecl::enumerator_iterator Enum = ED->enumerator_begin();
        Enum != EnumEnd; ++Enum) {
     Enumerators.push_back(
       DBuilder.createEnumerator(Enum->getName(),
                                 Enum->getInitVal().getSExtValue()));
   }
 
   // Build the composite type for the enum itself.
   llvm::DIArray EltArray = DBuilder.getOrCreateArray(Enumerators);
 
   llvm::DIFile DefUnit = getOrCreateFile(ED->getLocation());
   unsigned Line = getLineNumber(ED->getLocation());
   llvm::DIDescriptor EnumContext =
     getContextDescriptor(cast<Decl>(ED->getDeclContext()));
 
   // The underlying integer type is only recorded for fixed enums.
   llvm::DIType ClassTy;
   if (ED->isFixed())
     ClassTy = getOrCreateType(ED->getIntegerType(), DefUnit);
 
   return DBuilder.createEnumerationType(EnumContext, ED->getName(), DefUnit,
                                         Line, Size, Align, EltArray,
                                         ClassTy, FullName);
 }
 
 /// UnwrapTypeForDebugInfo - Strip the sugar type nodes handled below from
 /// \p T, one layer per iteration, accumulating the local qualifiers of every
 /// layer that is visited.  The first type of any other class is returned
 /// with the accumulated qualifiers applied; an 'auto' type without a deduced
 /// type is returned as it stands.
 static QualType UnwrapTypeForDebugInfo(QualType T, const ASTContext &C) {
   Qualifiers Quals;
   for (;;) {
     Qualifiers InnerQuals = T.getLocalQualifiers();
     // Qualifiers::operator+() doesn't like it if you add a Qualifier
     // that is already there, so the common part is split off and added
     // separately from the remainder.
     Quals += Qualifiers::removeCommonQualifiers(Quals, InnerQuals);
     Quals += InnerQuals;
 
     QualType LastT = T;
     switch (T->getTypeClass()) {
     case Type::TemplateSpecialization:
       T = cast<TemplateSpecializationType>(T)->desugar();
       break;
     case Type::TypeOfExpr:
       T = cast<TypeOfExprType>(T)->getUnderlyingExpr()->getType();
       break;
     case Type::TypeOf:
       T = cast<TypeOfType>(T)->getUnderlyingType();
       break;
     case Type::Decltype:
       T = cast<DecltypeType>(T)->getUnderlyingType();
       break;
     case Type::UnaryTransform:
       T = cast<UnaryTransformType>(T)->getUnderlyingType();
       break;
     case Type::Attributed:
       T = cast<AttributedType>(T)->getEquivalentType();
       break;
     case Type::Elaborated:
       T = cast<ElaboratedType>(T)->getNamedType();
       break;
     case Type::Paren:
       T = cast<ParenType>(T)->getInnerType();
       break;
     case Type::SubstTemplateTypeParm:
       T = cast<SubstTemplateTypeParmType>(T)->getReplacementType();
       break;
     case Type::Auto: {
       QualType DT = cast<AutoType>(T)->getDeducedType();
       // Nothing was deduced: hand the type back unchanged.
       if (DT.isNull())
         return T;
       T = DT;
       break;
     }
     default:
       // Not a wrapper handled here: unwrapping is finished.
       return C.getQualifiedType(T.getTypePtr(), Quals);
     }
 
     assert(T != LastT && "Type unwrapping failed to unwrap!");
     (void)LastT;
   }
 }
 
 /// getTypeOrNull - Look up the debug type previously recorded for \p Ty.
 /// Objective-C interface types are answered from the interface cache, all
 /// other types from TypeCache.  A null DIType is returned when there is no
 /// usable entry.
 llvm::DIType CGDebugInfo::getTypeOrNull(QualType Ty) {
 
   // Unwrap the type as needed for debug information.
   Ty = UnwrapTypeForDebugInfo(Ty, CGM.getContext());
 
   // Interface types never consult TypeCache.
   if (Ty->getTypeClass() == Type::ObjCInterface) {
     llvm::Value *Cached = getCachedInterfaceTypeOrNull(Ty);
     return Cached ? llvm::DIType(cast<llvm::MDNode>(Cached)) : llvm::DIType();
   }
 
   llvm::DenseMap<void *, llvm::WeakVH>::iterator Entry =
     TypeCache.find(Ty.getAsOpaquePtr());
   if (Entry == TypeCache.end())
     return llvm::DIType();
 
   // Verify that the debug info still exists.
   llvm::Value *Node = Entry->second;
   if (!Node)
     return llvm::DIType();
 
   return llvm::DIType(cast<llvm::MDNode>(Node));
 }
 
 /// getCompletedTypeOrNull - Look up \p Ty in CompletedTypeCache, falling
 /// back to the Objective-C interface cache when it has no entry there.
 /// Returns a null DIType if neither lookup yields a node.
 llvm::DIType CGDebugInfo::getCompletedTypeOrNull(QualType Ty) {
 
   // Unwrap the type as needed for debug information.
   Ty = UnwrapTypeForDebugInfo(Ty, CGM.getContext());
 
   llvm::DenseMap<void *, llvm::WeakVH>::iterator Entry =
     CompletedTypeCache.find(Ty.getAsOpaquePtr());
 
   llvm::Value *Node = 0;
   if (Entry == CompletedTypeCache.end())
     Node = getCachedInterfaceTypeOrNull(Ty);
   else
     Node = Entry->second;
 
   // A cached handle may have gone null; cast_or_null copes with that.
   return llvm::DIType(cast_or_null<llvm::MDNode>(Node));
 }
 
 /// getCachedInterfaceTypeOrNull - Return the node stored for \p Ty in the
 /// interface cache, provided the checksum stored with it still matches the
 /// current checksum of the interface declaration.  Returns null otherwise.
 llvm::Value *CGDebugInfo::getCachedInterfaceTypeOrNull(QualType Ty) {
   llvm::DenseMap<void *, std::pair<llvm::WeakVH, unsigned > >
     ::iterator Entry = ObjCInterfaceCache.find(Ty.getAsOpaquePtr());
 
   // Never cached.
   if (Entry == ObjCInterfaceCache.end())
     return 0;
 
   // No interface declaration behind this type.
   ObjCInterfaceDecl *Decl = getObjCInterfaceDecl(Ty);
   if (!Decl)
     return 0;
 
   // The interface changed since the entry was stored.
   if (Checksum(Decl) != Entry->second.second)
     return 0;
 
   // Return cached forward declaration.
   return Entry->second.first;
 }
 
 /// getOrCreateType - Get the type from the cache or create a new
 /// one if necessary.
 ///
 /// \param Ty   the type to describe; a null type yields a null DIType.
 /// \param Unit the file passed on to CreateTypeNode and used for the forward
 ///             declaration created for Objective-C interface types.
 /// \return the completed type from the cache if there is one; otherwise the
 ///         newly created node, or - for Objective-C interface types - the
 ///         forward declaration kept in ObjCInterfaceCache.
 llvm::DIType CGDebugInfo::getOrCreateType(QualType Ty, llvm::DIFile Unit) {
   if (Ty.isNull())
     return llvm::DIType();
 
   // Unwrap the type as needed for debug information.
   Ty = UnwrapTypeForDebugInfo(Ty, CGM.getContext());
 
   // A type that has already been completed is returned right away.
   if (llvm::DIType T = getCompletedTypeOrNull(Ty))
     return T;
 
   // Otherwise create the type.
   llvm::DIType Res = CreateTypeNode(Ty, Unit);
   void* TyPtr = Ty.getAsOpaquePtr();
 
   // And update the type cache.
   TypeCache[TyPtr] = Res;
 
   // FIXME: this getTypeOrNull call seems silly when we just inserted the type
   // into the cache - but getTypeOrNull has a special case for cached interface
   // types. We should probably just pull that out as a special case for the
   // "else" block below & skip the otherwise needless lookup.
   llvm::DIType TC = getTypeOrNull(Ty);
   if (TC && TC.isForwardDecl())
     // Remember the forward declaration so it can be replaced later.
     ReplaceMap.push_back(std::make_pair(TyPtr, static_cast<llvm::Value*>(TC)));
   else if (ObjCInterfaceDecl* Decl = getObjCInterfaceDecl(Ty)) {
     // Interface types may have elements added to them by a
     // subsequent implementation or extension, so we keep them in
     // the ObjCInterfaceCache together with a checksum. Instead of
     // the (possibly) incomplete interface type, we return a forward
     // declaration that gets RAUW'd in CGDebugInfo::finalize().
     std::pair<llvm::WeakVH, unsigned> &V = ObjCInterfaceCache[TyPtr];
     // A forward declaration already stored for this type is reused.
     if (V.first)
       return llvm::DIType(cast<llvm::MDNode>(V.first));
     TC = DBuilder.createForwardDecl(llvm::dwarf::DW_TAG_structure_type,
                                     Decl->getName(), TheCU, Unit,
                                     getLineNumber(Decl->getLocation()),
                                     TheCU.getLanguage());
     // Store the forward declaration in the cache.
     V.first = TC;
     V.second = Checksum(Decl);
 
     // Register the type for replacement in finalize().
     ReplaceMap.push_back(std::make_pair(TyPtr, static_cast<llvm::Value*>(TC)));
 
     return TC;
   }
 
   // Only nodes that are not forward declarations count as completed.
   if (!Res.isForwardDecl())
     CompletedTypeCache[TyPtr] = Res;
 
   return Res;
 }
 
 /// Checksum - Compute the value used to detect that an interface changed
 /// after it was cached.  As implemented here it is the number of ivars
 /// reachable from all_declared_ivar_begin().
 unsigned CGDebugInfo::Checksum(const ObjCInterfaceDecl *ID) {
   // The assumption is that the number of ivars can only increase
   // monotonically, so their current count is sufficient as a checksum.
   unsigned Sum = 0;
   const ObjCIvarDecl *Ivar = ID->all_declared_ivar_begin();
   while (Ivar != 0) {
     ++Sum;
     Ivar = Ivar->getNextIvar();
   }
 
   return Sum;
 }
 
 /// getObjCInterfaceDecl - Return the interface declaration behind \p Ty if
 /// it is an Objective-C interface type, looking through Objective-C object
 /// pointers.  Returns null for every other type class.
 ObjCInterfaceDecl *CGDebugInfo::getObjCInterfaceDecl(QualType Ty) {
   // Object pointer: examine what it points to.
   if (const ObjCObjectPointerType *Ptr = dyn_cast<ObjCObjectPointerType>(Ty))
     return getObjCInterfaceDecl(Ptr->getPointeeType());
 
   if (const ObjCInterfaceType *Iface = dyn_cast<ObjCInterfaceType>(Ty))
     return Iface->getDecl();
 
   return 0;
 }
 
 /// CreateTypeNode - Create a new debug type node.
 ///
 /// Dispatches on the type class of \p Ty to the matching CreateType /
 /// CreateEnumType overload.  Locally qualified types are handed to
 /// CreateQualifiedType first.
 ///
 /// \param Ty   the type to describe.
 /// \param Unit the file passed on to the overloads that take one.
 /// \return the created node, or a null DIType after reporting an error for
 ///         a type class that is not supported yet.
 llvm::DIType CGDebugInfo::CreateTypeNode(QualType Ty, llvm::DIFile Unit) {
   // Handle qualifiers, which recursively handles what they refer to.
   if (Ty.hasLocalQualifiers())
     return CreateQualifiedType(Ty, Unit);
 
   // Name of the unsupported construct, set by cases that fall out of the
   // switch instead of returning.
   const char *Diag = 0;
 
   // Work out details of type.
   switch (Ty->getTypeClass()) {
   // Only DEPENDENT_TYPE expands to something: the include below produces one
   // case label per dependent type class, all sharing the llvm_unreachable.
 #define TYPE(Class, Base)
 #define ABSTRACT_TYPE(Class, Base)
 #define NON_CANONICAL_TYPE(Class, Base)
 #define DEPENDENT_TYPE(Class, Base) case Type::Class:
 #include "clang/AST/TypeNodes.def"
     llvm_unreachable("Dependent types cannot show up in debug information");
 
   case Type::ExtVector:
   case Type::Vector:
     return CreateType(cast<VectorType>(Ty), Unit);
   case Type::ObjCObjectPointer:
     return CreateType(cast<ObjCObjectPointerType>(Ty), Unit);
   case Type::ObjCObject:
     return CreateType(cast<ObjCObjectType>(Ty), Unit);
   case Type::ObjCInterface:
     return CreateType(cast<ObjCInterfaceType>(Ty), Unit);
   case Type::Builtin:
     return CreateType(cast<BuiltinType>(Ty));
   case Type::Complex:
     return CreateType(cast<ComplexType>(Ty));
   case Type::Pointer:
     return CreateType(cast<PointerType>(Ty), Unit);
   case Type::Decayed:
     // Decayed types are just pointers in LLVM and DWARF.
     return CreateType(
         cast<PointerType>(cast<DecayedType>(Ty)->getDecayedType()), Unit);
   case Type::BlockPointer:
     return CreateType(cast<BlockPointerType>(Ty), Unit);
   case Type::Typedef:
     return CreateType(cast<TypedefType>(Ty), Unit);
   case Type::Record:
     return CreateType(cast<RecordType>(Ty));
   case Type::Enum:
     return CreateEnumType(cast<EnumType>(Ty));
   case Type::FunctionProto:
   case Type::FunctionNoProto:
     return CreateType(cast<FunctionType>(Ty), Unit);
   case Type::ConstantArray:
   case Type::VariableArray:
   case Type::IncompleteArray:
     return CreateType(cast<ArrayType>(Ty), Unit);
 
   case Type::LValueReference:
     return CreateType(cast<LValueReferenceType>(Ty), Unit);
   case Type::RValueReference:
     return CreateType(cast<RValueReferenceType>(Ty), Unit);
 
   case Type::MemberPointer:
     return CreateType(cast<MemberPointerType>(Ty), Unit);
 
   case Type::Atomic:
     return CreateType(cast<AtomicType>(Ty), Unit);
 
   // These type classes are not expected here; reaching one is treated as a
   // programming error.
   case Type::Attributed:
   case Type::TemplateSpecialization:
   case Type::Elaborated:
   case Type::Paren:
   case Type::SubstTemplateTypeParm:
   case Type::TypeOfExpr:
   case Type::TypeOf:
   case Type::Decltype:
   case Type::UnaryTransform:
   case Type::PackExpansion:
     llvm_unreachable("type should have been unwrapped!");
   case Type::Auto:
     // Reported as unsupported below.
     Diag = "auto";
     break;
   }
 
   // Only cases that set Diag leave the switch; report the unsupported type.
   assert(Diag && "Fall through without a diagnostic?");
   unsigned DiagID = CGM.getDiags().getCustomDiagID(DiagnosticsEngine::Error,
                                "debug information for %0 is not yet supported");
   CGM.getDiags().Report(DiagID)
     << Diag;
   return llvm::DIType();
 }
 
 /// getOrCreateLimitedType - Return the cached debug type for the record
 /// \p Ty unless it is missing or merely a forward declaration; in that case
 /// create a limited type via CreateLimitedType and cache it.
 llvm::DIType CGDebugInfo::getOrCreateLimitedType(const RecordType *Ty,
                                                  llvm::DIFile Unit) {
   QualType QTy(Ty, 0);
   void *Key = QTy.getAsOpaquePtr();
 
   llvm::DICompositeType T(getTypeOrNull(QTy));
 
   // A cached forward declaration does not stop here: it may have been
   // stored at a time when a non-forward declaration could have been built,
   // so a new attempt is made below.
   if (T && !T.isForwardDecl())
     return T;
 
   // Otherwise create the type.
   llvm::DICompositeType Res = CreateLimitedType(Ty);
 
   // Carry the members of the declaration over to the new node.
   // CreateType(const RecordType*) will overwrite this with the members in
   // the correct order if the full type is needed.
   Res.setTypeArray(T.getTypeArray());
 
   // Register the previously cached forward declaration for replacement.
   if (T && T.isForwardDecl()) {
     llvm::Value *Stale = static_cast<llvm::Value *>(T);
     ReplaceMap.push_back(std::make_pair(Key, Stale));
   }
 
   // And update the type cache.
   TypeCache[Key] = Res;
   return Res;
 }
 
 // CreateLimitedType - Create the union/class/struct node for the record
 // \p Ty without any members (template parameters are attached for class
 // template specializations).  Records lacking a complete definition yield a
 // forward declaration instead.
 // TODO: Currently used for context chains when limiting debug info.
 llvm::DICompositeType CGDebugInfo::CreateLimitedType(const RecordType *Ty) {
   RecordDecl *RD = Ty->getDecl();
   ASTContext &Ctx = CGM.getContext();
 
   // Get overall information about the record type for the debug info.
   llvm::DIFile DefUnit = getOrCreateFile(RD->getLocation());
   unsigned Line = getLineNumber(RD->getLocation());
   StringRef RDName = getClassName(RD);
 
   llvm::DIDescriptor RDContext =
       getContextDescriptor(cast<Decl>(RD->getDeclContext()));
 
   // If we ended up creating the type during the context chain construction,
   // just return that.
   // FIXME: this could be dealt with better if the type was recorded as
   // completed before we started this (see the CompletedTypeCache usage in
   // CGDebugInfo::CreateTypeDefinition(const RecordType*) - that would need to
   // be pushed to before context creation, but after it was known to be
   // destined for completion (might still have an issue if this caller only
   // required a declaration but the context construction ended up creating a
   // definition)
   llvm::DICompositeType T(getTypeOrNull(Ctx.getRecordType(RD)));
   if (T) {
     if (!T.isForwardDecl() || !RD->getDefinition())
       return T;
   }
 
   // If this is just a forward or incomplete declaration, construct an
   // appropriately marked node and just return it.
   const RecordDecl *D = RD->getDefinition();
   if (!D || !D->isCompleteDefinition())
     return getOrCreateRecordFwdDecl(Ty, RDContext);
 
   uint64_t Size = Ctx.getTypeSize(Ty);
   uint64_t Align = Ctx.getTypeAlign(Ty);
   llvm::DICompositeType RealDecl;
 
   SmallString<256> FullName = getUniqueTagTypeName(Ty, CGM, TheCU);
 
   // Pick the node kind from the tag kind of the record.
   if (RD->isUnion()) {
     RealDecl = DBuilder.createUnionType(RDContext, RDName, DefUnit, Line,
                                         Size, Align, 0, llvm::DIArray(), 0,
                                         FullName);
   } else if (RD->isClass()) {
     // FIXME: This could be a struct type giving a default visibility different
     // than C++ class type, but needs llvm metadata changes first.
     RealDecl = DBuilder.createClassType(RDContext, RDName, DefUnit, Line,
                                         Size, Align, 0, 0, llvm::DIType(),
                                         llvm::DIArray(), llvm::DIType(),
                                         llvm::DIArray(), FullName);
   } else {
     RealDecl = DBuilder.createStructType(RDContext, RDName, DefUnit, Line,
                                          Size, Align, 0, llvm::DIType(),
                                          llvm::DIArray(), 0, llvm::DIType(),
                                          FullName);
   }
 
   // Make the new node reachable through the region map and the type cache.
   RegionMap[Ty->getDecl()] = llvm::WeakVH(RealDecl);
   TypeCache[QualType(Ty, 0).getAsOpaquePtr()] = RealDecl;
 
   // Class template specializations additionally carry their template
   // parameters; the member list stays empty here.
   if (const ClassTemplateSpecializationDecl *TSpecial =
           dyn_cast<ClassTemplateSpecializationDecl>(RD))
     RealDecl.setTypeArray(llvm::DIArray(),
                           CollectCXXTemplateParams(TSpecial, DefUnit));
   return RealDecl;
 }
 
 /// CollectContainingType - Set the containing type of \p RealDecl, i.e. the
 /// type holding the vtable for \p RD: the root of its chain of non-virtual
 /// primary bases, or the class itself when it is dynamic without a primary
 /// base.  For any other class a null containing type is set.
 void CGDebugInfo::CollectContainingType(const CXXRecordDecl *RD,
                                         llvm::DICompositeType RealDecl) {
   // A class's primary base or the class itself contains the vtable.
   llvm::DICompositeType ContainingType;
   const ASTRecordLayout &RL = CGM.getContext().getASTRecordLayout(RD);
   if (const CXXRecordDecl *PBase = RL.getPrimaryBase()) {
     // Seek non virtual primary base root: keep climbing while the current
     // base itself has a non-virtual primary base.
     for (;;) {
       const ASTRecordLayout &BRL = CGM.getContext().getASTRecordLayout(PBase);
       const CXXRecordDecl *PBT = BRL.getPrimaryBase();
       if (!PBT || BRL.isPrimaryBaseVirtual())
         break;
       PBase = PBT;
     }
     ContainingType = llvm::DICompositeType(
         getOrCreateType(QualType(PBase->getTypeForDecl(), 0),
                         getOrCreateFile(RD->getLocation())));
   } else if (RD->isDynamicClass()) {
     ContainingType = RealDecl;
   }
 
   RealDecl.setContainingType(ContainingType);
 }
 
 /// CreateMemberType - Create a member named \p Name of type \p FType placed
 /// at the current value of \p *Offset, then advance \p *Offset by the size
 /// of \p FType.
 llvm::DIType CGDebugInfo::CreateMemberType(llvm::DIFile Unit, QualType FType,
                                            StringRef Name,
                                            uint64_t *Offset) {
   llvm::DIType FieldTy = CGDebugInfo::getOrCreateType(FType, Unit);
 
   ASTContext &Ctx = CGM.getContext();
   uint64_t FieldSize = Ctx.getTypeSize(FType);
   unsigned FieldAlign = Ctx.getTypeAlign(FType);
 
   // The member starts where the previous one ended.
   const uint64_t FieldOffset = *Offset;
   llvm::DIType Ty = DBuilder.createMemberType(Unit, Name, Unit, 0,
                                               FieldSize, FieldAlign,
                                               FieldOffset, 0, FieldTy);
 
   // Advance the running offset past this member.
   *Offset = FieldOffset + FieldSize;
   return Ty;
 }
 
 /// getDeclarationOrDefinition - Return a descriptor for \p D: the debug type
 /// for type declarations, otherwise whatever DeclCache holds for the
 /// canonical declaration (a null descriptor if it holds nothing).
 llvm::DIDescriptor CGDebugInfo::getDeclarationOrDefinition(const Decl *D) {
   // We only need a declaration (not a definition) of the type - so use whatever
   // we would otherwise do to get a type for a pointee. (forward declarations in
   // limited debug info, full definitions (if the type definition is available)
   // in unlimited debug info)
   if (const TypeDecl *TD = dyn_cast<TypeDecl>(D)) {
     QualType DeclTy = CGM.getContext().getTypeDeclType(TD);
     return getOrCreateType(DeclTy, getOrCreateFile(TD->getLocation()));
   }
 
   // Otherwise fall back to a fairly rudimentary cache of existing declarations.
   // This doesn't handle providing declarations (for functions or variables) for
   // entities without definitions in this TU, nor when the definition proceeds
   // the call to this function.
   // FIXME: This should be split out into more specific maps with support for
   // emitting forward declarations and merging definitions with declarations,
   // the same way as we do for types.
   llvm::DenseMap<const Decl *, llvm::WeakVH>::iterator Entry =
       DeclCache.find(D->getCanonicalDecl());
   if (Entry != DeclCache.end()) {
     llvm::Value *Node = Entry->second;
     return llvm::DIDescriptor(dyn_cast_or_null<llvm::MDNode>(Node));
   }
 
   return llvm::DIDescriptor();
 }
 
 /// getFunctionDeclaration - Return the debug info descriptor of the
 /// declaration that belongs to the given function definition. An empty
 /// descriptor is returned when D is null, is not a function, when only line
 /// tables are being emitted, or when no declaration descriptor is found.
 llvm::DISubprogram CGDebugInfo::getFunctionDeclaration(const Decl *D) {
   if (!D || DebugKind == CodeGenOptions::DebugLineTablesOnly)
     return llvm::DISubprogram();
 
   const FunctionDecl *FD = dyn_cast<FunctionDecl>(D);
   if (!FD)
     return llvm::DISubprogram();
 
   typedef llvm::DenseMap<const FunctionDecl *, llvm::WeakVH>::iterator
       SPCacheIter;
 
   // Setup context.
   llvm::DIScope Context =
       getContextDescriptor(cast<Decl>(D->getDeclContext()));
 
   SPCacheIter Hit = SPCache.find(FD->getCanonicalDecl());
   if (Hit == SPCache.end()) {
     // Nothing cached for the canonical declaration. For a C++ method, create
     // the member function declaration now and attach it to its class.
     if (const CXXMethodDecl *MD =
             dyn_cast<CXXMethodDecl>(FD->getCanonicalDecl())) {
       llvm::DICompositeType Parent(Context);
       llvm::DISubprogram MethodSP =
           CreateCXXMemberFunction(MD, getOrCreateFile(MD->getLocation()),
                                   Parent);
       Parent.addMember(MethodSP);
       return MethodSP;
     }
   } else {
     // A cached subprogram is only usable when it is a declaration.
     llvm::Value *Node = Hit->second;
     llvm::DISubprogram CachedSP(dyn_cast_or_null<llvm::MDNode>(Node));
     if (CachedSP.isSubprogram() && !CachedSP.isDefinition())
       return CachedSP;
   }
 
   // Otherwise look through every redeclaration for a cached declaration.
   for (FunctionDecl::redecl_iterator R = FD->redecls_begin(),
                                      REnd = FD->redecls_end();
        R != REnd; ++R) {
     const FunctionDecl *Redecl = *R;
     SPCacheIter RedeclHit = SPCache.find(Redecl->getCanonicalDecl());
     if (RedeclHit == SPCache.end())
       continue;
     llvm::Value *Node = RedeclHit->second;
     llvm::DISubprogram CachedSP(dyn_cast_or_null<llvm::MDNode>(Node));
     if (CachedSP.isSubprogram() && !CachedSP.isDefinition())
       return CachedSP;
   }
   return llvm::DISubprogram();
 }
 
 // getOrCreateFunctionType - Construct the DIType describing the signature of
 // D. C++ methods get their implicit "this" parameter (through
 // getOrCreateMethodType), Objective-C methods get "self" and "_cmd", and
 // variadic functions get a trailing unspecified parameter.
 llvm::DICompositeType CGDebugInfo::getOrCreateFunctionType(const Decl *D,
                                                            QualType FnType,
                                                            llvm::DIFile F) {
   if (!D || DebugKind == CodeGenOptions::DebugLineTablesOnly) {
     // Create fake but valid subroutine type. Otherwise
     // llvm::DISubprogram::Verify() would return false, and
     // subprogram DIE will miss DW_AT_decl_file and
     // DW_AT_decl_line fields.
     llvm::DIArray NoTypes = DBuilder.getOrCreateArray(None);
     return DBuilder.createSubroutineType(F, NoTypes);
   }
 
   if (const CXXMethodDecl *Method = dyn_cast<CXXMethodDecl>(D))
     return getOrCreateMethodType(Method, F);
 
   if (const ObjCMethodDecl *OMethod = dyn_cast<ObjCMethodDecl>(D)) {
     ASTContext &Ctx = CGM.getContext();
     SmallVector<llvm::Value *, 16> Signature;
 
     // The return type always comes first. For 'void' functions it is NULL.
     QualType RetTy = OMethod->getResultType();
 
     // The instancetype keyword is replaced with a pointer to the actual
     // interface type.
     if (RetTy == Ctx.getObjCInstanceType()) {
       QualType IfaceTy(OMethod->getClassInterface()->getTypeForDecl(), 0);
       RetTy = Ctx.getPointerType(IfaceTy);
     }
     Signature.push_back(getOrCreateType(RetTy, F));
 
     // First argument: the "self" pointer.
     QualType SelfQualTy = OMethod->getSelfDecl()->getType();
     llvm::DIType SelfDITy = getOrCreateType(SelfQualTy, F);
     Signature.push_back(CreateSelfType(SelfQualTy, SelfDITy));
 
     // Second argument: the "_cmd" pointer, marked artificial.
     llvm::DIType CmdDITy =
         getOrCreateType(OMethod->getCmdDecl()->getType(), F);
     Signature.push_back(DBuilder.createArtificialType(CmdDITy));
 
     // Followed by the declared parameters.
     for (ObjCMethodDecl::param_const_iterator Param = OMethod->param_begin(),
                                               ParamEnd = OMethod->param_end();
          Param != ParamEnd; ++Param)
       Signature.push_back(getOrCreateType((*Param)->getType(), F));
 
     return DBuilder.createSubroutineType(F,
                                          DBuilder.getOrCreateArray(Signature));
   }
 
   // Variadic function: return type, the prototype's arguments (if any), then
   // an unspecified-parameter marker.
   const FunctionDecl *FD = dyn_cast<FunctionDecl>(D);
   if (FD && FD->isVariadic()) {
     SmallVector<llvm::Value *, 16> Signature;
     Signature.push_back(getOrCreateType(FD->getResultType(), F));
     if (const FunctionProtoType *Proto = dyn_cast<FunctionProtoType>(FnType)) {
       const unsigned NumArgs = Proto->getNumArgs();
       for (unsigned Idx = 0; Idx != NumArgs; ++Idx)
         Signature.push_back(getOrCreateType(Proto->getArgType(Idx), F));
     }
     Signature.push_back(DBuilder.createUnspecifiedParameter());
     return DBuilder.createSubroutineType(F,
                                          DBuilder.getOrCreateArray(Signature));
   }
 
   // Everything else is described directly by its function type.
   return llvm::DICompositeType(getOrCreateType(FnType, F));
 }
 
 /// EmitFunctionStart - Constructs the debug code for entering a function.
 ///
 /// Records the current depth of LexicalBlockStack, then either reuses a
 /// subprogram definition already cached in SPCache or creates a new
 /// subprogram with DBuilder.createFunction(), and pushes the result as the
 /// innermost scope on LexicalBlockStack.
 ///
 /// GD is the function being entered (its declaration may be null), FnType is
 /// forwarded to getOrCreateFunctionType(), and Fn supplies the fallback name
 /// and the internal-linkage flag. Builder is not referenced in this body.
 void CGDebugInfo::EmitFunctionStart(GlobalDecl GD, QualType FnType,
                                     llvm::Function *Fn,
                                     CGBuilderTy &Builder) {
 
   StringRef Name;
   StringRef LinkageName;
 
   // Remember how deep the scope stack is on entry; EmitFunctionEnd() pops
   // back down to this depth.
   FnBeginRegionCount.push_back(LexicalBlockStack.size());
 
   const Decl *D = GD.getDecl();
   // Function may lack declaration in source code if it is created by Clang
   // CodeGen (examples: _GLOBAL__I_a, __cxx_global_array_dtor, thunk).
   bool HasDecl = (D != 0);
   // Use the location of the declaration.
   // Without a declaration Loc stays default-constructed.
   SourceLocation Loc;
   if (HasDecl)
     Loc = D->getLocation();
 
   unsigned Flags = 0;
   llvm::DIFile Unit = getOrCreateFile(Loc);
   // The file is the default context; it is only replaced below for functions
   // that live in a namespace or a record.
   llvm::DIDescriptor FDContext(Unit);
   llvm::DIArray TParamsArray;
   if (!HasDecl) {
     // Use llvm function name.
     LinkageName = Fn->getName();
   } else if (const FunctionDecl *FD = dyn_cast<FunctionDecl>(D)) {
     // If there is a DISubprogram for this function available then use it.
     llvm::DenseMap<const FunctionDecl *, llvm::WeakVH>::iterator
       FI = SPCache.find(FD->getCanonicalDecl());
     if (FI != SPCache.end()) {
       llvm::Value *V = FI->second;
       llvm::DIDescriptor SP(dyn_cast_or_null<llvm::MDNode>(V));
       // Only a cached *definition* is reused; it becomes the current scope
       // and the function returns without creating a new subprogram.
       if (SP.isSubprogram() && llvm::DISubprogram(SP).isDefinition()) {
         llvm::MDNode *SPN = SP;
         LexicalBlockStack.push_back(SPN);
         RegionMap[D] = llvm::WeakVH(SP);
         return;
       }
     }
     Name = getFunctionName(FD);
     // Use mangled name as linkage name for C/C++ functions.
     if (FD->hasPrototype()) {
       LinkageName = CGM.getMangledName(GD);
       Flags |= llvm::DIDescriptor::FlagPrototyped;
     }
     // No need to replicate the linkage name if it isn't different from the
     // subprogram name, no need to have it at all unless coverage is enabled or
     // debug is set to more than just line tables.
     if (LinkageName == Name ||
         (!CGM.getCodeGenOpts().EmitGcovArcs &&
          !CGM.getCodeGenOpts().EmitGcovNotes &&
          DebugKind <= CodeGenOptions::DebugLineTablesOnly))
       LinkageName = StringRef();
 
     // Namespace/record contexts and template parameters are only emitted at
     // LimitedDebugInfo or above.
     if (DebugKind >= CodeGenOptions::LimitedDebugInfo) {
       if (const NamespaceDecl *NSDecl =
               dyn_cast_or_null<NamespaceDecl>(FD->getDeclContext()))
         FDContext = getOrCreateNameSpace(NSDecl);
       else if (const RecordDecl *RDecl =
                    dyn_cast_or_null<RecordDecl>(FD->getDeclContext()))
         FDContext = getContextDescriptor(cast<Decl>(RDecl));
 
       // Collect template parameters.
       TParamsArray = CollectFunctionTemplateParams(FD, Unit);
     }
   } else if (const ObjCMethodDecl *OMD = dyn_cast<ObjCMethodDecl>(D)) {
     Name = getObjCMethodName(OMD);
     Flags |= llvm::DIDescriptor::FlagPrototyped;
   } else {
     // Use llvm function name.
     Name = Fn->getName();
     Flags |= llvm::DIDescriptor::FlagPrototyped;
   }
   // Drop a leading '\01' character from the name, if present.
   if (!Name.empty() && Name[0] == '\01')
     Name = Name.substr(1);
 
   unsigned LineNo = getLineNumber(Loc);
   // Functions without a declaration, or with an implicit one, are flagged
   // artificial.
   if (!HasDecl || D->isImplicit())
     Flags |= llvm::DIDescriptor::FlagArtificial;
 
   // Create the subprogram definition. LineNo comes from the declaration's
   // location, while the line derived from CurLoc is passed separately.
   llvm::DISubprogram SP =
       DBuilder.createFunction(FDContext, Name, LinkageName, Unit, LineNo,
                               getOrCreateFunctionType(D, FnType, Unit),
                               Fn->hasInternalLinkage(), true /*definition*/,
                               getLineNumber(CurLoc), Flags,
                               CGM.getLangOpts().Optimize, Fn, TParamsArray,
                               getFunctionDeclaration(D));
   // Make the new definition discoverable through DeclCache (insert() leaves
   // an existing entry for the same canonical declaration untouched).
   if (HasDecl)
     DeclCache.insert(std::make_pair(D->getCanonicalDecl(), llvm::WeakVH(SP)));
 
   // Push function on region stack.
   llvm::MDNode *SPN = SP;
   LexicalBlockStack.push_back(SPN);
   if (HasDecl)
     RegionMap[D] = llvm::WeakVH(SP);
 }
 
 /// EmitLocation - Make Loc the current location and, unless nothing has
 /// changed since the last call, set the builder's debug location to the
 /// line/column of that location inside the innermost lexical scope. Nothing
 /// is emitted when the resulting current location is invalid or is a macro
 /// location.
 void CGDebugInfo::EmitLocation(CGBuilderTy &Builder, SourceLocation Loc,
                                bool ForceColumnInfo) {
   // Update our current location
   setLocation(Loc);
 
   if (CurLoc.isInvalid() || CurLoc.isMacroID())
     return;
 
   // Don't bother if things are the same as last time.
   SourceManager &SM = CGM.getContext().getSourceManager();
   const bool SameAsBefore =
       CurLoc == PrevLoc ||
       SM.getExpansionLoc(CurLoc) == SM.getExpansionLoc(PrevLoc);
   if (SameAsBefore) {
     // New Builder may not be in sync with CGDebugInfo, so the update is only
     // skipped when the builder already carries a known location in the
     // current scope.
     llvm::DebugLoc Active = Builder.getCurrentDebugLocation();
     if (!Active.isUnknown() &&
         Active.getScope(CGM.getLLVMContext()) == LexicalBlockStack.back())
       return;
   }
 
   // Update last state.
   PrevLoc = CurLoc;
 
   llvm::MDNode *Scope = LexicalBlockStack.back();
   const unsigned Line = getLineNumber(CurLoc);
   const unsigned Column = getColumnNumber(CurLoc, ForceColumnInfo);
   Builder.SetCurrentDebugLocation(llvm::DebugLoc::get(Line, Column, Scope));
 }
 
 /// CreateLexicalBlock - Create a lexical block node nested in the innermost
 /// open scope (or in an empty descriptor when no scope is open) and push it
 /// on the stack. The block's file, line and column are taken from CurLoc; the
 /// Loc argument is not consulted in this body.
 void CGDebugInfo::CreateLexicalBlock(SourceLocation Loc) {
   llvm::DIDescriptor Parent;
   if (!LexicalBlockStack.empty())
     Parent = llvm::DIDescriptor(LexicalBlockStack.back());
 
   llvm::DIDescriptor Block =
     DBuilder.createLexicalBlock(Parent, getOrCreateFile(CurLoc),
                                 getLineNumber(CurLoc),
                                 getColumnNumber(CurLoc));
 
   // The new block becomes the innermost scope.
   llvm::MDNode *BlockNode = Block;
   LexicalBlockStack.push_back(BlockNode);
 }
 
 /// EmitLexicalBlockStart - Constructs the debug code for entering a
 /// declarative region - beginning of a DW_TAG_lexical_block. Opens a new
 /// lexical block at Loc and points the builder's debug location into it.
 void CGDebugInfo::EmitLexicalBlockStart(CGBuilderTy &Builder,
                                         SourceLocation Loc) {
   // Make Loc the current location before the block is created.
   setLocation(Loc);
 
   // Push a fresh lexical block; it is now the top of the stack.
   CreateLexicalBlock(Loc);
   llvm::MDNode *NewScope = LexicalBlockStack.back();
 
   // Emit a line table change for the current location inside the new scope.
   const unsigned Line = getLineNumber(Loc);
   const unsigned Column = getColumnNumber(Loc);
   Builder.SetCurrentDebugLocation(llvm::DebugLoc::get(Line, Column, NewScope));
 }
 
 /// EmitLexicalBlockEnd - Constructs the debug code for exiting a declarative
 /// region - end of a DW_TAG_lexical_block. Emits a location for Loc while the
 /// block is still the innermost scope, then pops the block off the stack.
 void CGDebugInfo::EmitLexicalBlockEnd(CGBuilderTy &Builder,
                                       SourceLocation Loc) {
   assert(!LexicalBlockStack.empty() && "Region stack mismatch, stack empty!");
 
   // Give the end of the block its own entry in the line table; this has to
   // happen before the scope is removed.
   EmitLocation(Builder, Loc);
 
   // Leave the block.
   LexicalBlockStack.pop_back();
 }
 
 /// EmitFunctionEnd - Constructs the debug code for exiting a function by
 /// closing lexical blocks until the stack is back at the depth recorded in
 /// FnBeginRegionCount, then dropping that record.
 void CGDebugInfo::EmitFunctionEnd(CGBuilderTy &Builder) {
   assert(!LexicalBlockStack.empty() && "Region stack mismatch, stack empty!");
   const unsigned DepthAtEntry = FnBeginRegionCount.back();
   assert(DepthAtEntry <= LexicalBlockStack.size() && "Region stack mismatch");
 
   // Pop all regions for this function.
   while (DepthAtEntry != LexicalBlockStack.size())
     EmitLexicalBlockEnd(Builder, CurLoc);
 
   FnBeginRegionCount.pop_back();
 }
 
 // EmitTypeForVarWithBlocksAttr - Build up structure info for the byref.
 // See BuildByRefType.
 //
 // The returned struct type lists, in order: __isa, __forwarding, __flags,
 // __size, optionally __copy_helper/__destroy_helper, optionally
 // __byref_variable_layout, optional unnamed padding, and finally a member
 // named after VD. The offset of that last member is stored in *XOffset.
 llvm::DIType CGDebugInfo::EmitTypeForVarWithBlocksAttr(const VarDecl *VD,
                                                        uint64_t *XOffset) {
 
   SmallVector<llvm::Value *, 5> EltTys;
   QualType FType;
   uint64_t FieldSize, FieldOffset;
   unsigned FieldAlign;
 
   llvm::DIFile Unit = getOrCreateFile(VD->getLocation());
   QualType Type = VD->getType();
 
   // Each CreateMemberType() call places a member at FieldOffset and advances
   // FieldOffset past it, so the order of the calls below fixes the layout.
   FieldOffset = 0;
   FType = CGM.getContext().getPointerType(CGM.getContext().VoidTy);
   EltTys.push_back(CreateMemberType(Unit, FType, "__isa", &FieldOffset));
   EltTys.push_back(CreateMemberType(Unit, FType, "__forwarding", &FieldOffset));
   FType = CGM.getContext().IntTy;
   EltTys.push_back(CreateMemberType(Unit, FType, "__flags", &FieldOffset));
   EltTys.push_back(CreateMemberType(Unit, FType, "__size", &FieldOffset));
 
   // The two helper pointers are only present when the context reports that
   // the variable requires copying.
   bool HasCopyAndDispose = CGM.getContext().BlockRequiresCopying(Type, VD);
   if (HasCopyAndDispose) {
     FType = CGM.getContext().getPointerType(CGM.getContext().VoidTy);
     EltTys.push_back(CreateMemberType(Unit, FType, "__copy_helper",
                                       &FieldOffset));
     EltTys.push_back(CreateMemberType(Unit, FType, "__destroy_helper",
                                       &FieldOffset));
   }
   // The layout pointer is only present when getByrefLifetime() succeeds and
   // reports an extended layout.
   bool HasByrefExtendedLayout;
   Qualifiers::ObjCLifetime Lifetime;
   if (CGM.getContext().getByrefLifetime(Type,
                                         Lifetime, HasByrefExtendedLayout)
       && HasByrefExtendedLayout) {
     FType = CGM.getContext().getPointerType(CGM.getContext().VoidTy);
     EltTys.push_back(CreateMemberType(Unit, FType,
                                       "__byref_variable_layout",
                                       &FieldOffset));
   }
 
   // If the variable is more strictly aligned than a pointer, insert an
   // unnamed char-array member so the variable lands on an aligned offset.
   CharUnits Align = CGM.getContext().getDeclAlign(VD);
   if (Align > CGM.getContext().toCharUnitsFromBits(
         CGM.getTarget().getPointerAlign(0))) {
     CharUnits FieldOffsetInBytes
       = CGM.getContext().toCharUnitsFromBits(FieldOffset);
     CharUnits AlignedOffsetInBytes
       = FieldOffsetInBytes.RoundUpToAlignment(Align);
     CharUnits NumPaddingBytes
       = AlignedOffsetInBytes - FieldOffsetInBytes;
 
     if (NumPaddingBytes.isPositive()) {
       llvm::APInt pad(32, NumPaddingBytes.getQuantity());
       FType = CGM.getContext().getConstantArrayType(CGM.getContext().CharTy,
                                                     pad, ArrayType::Normal, 0);
       EltTys.push_back(CreateMemberType(Unit, FType, "", &FieldOffset));
     }
   }
 
   // The variable itself comes last, using the declaration's alignment rather
   // than the type's.
   FType = Type;
   llvm::DIType FieldTy = CGDebugInfo::getOrCreateType(FType, Unit);
   FieldSize = CGM.getContext().getTypeSize(FType);
   FieldAlign = CGM.getContext().toBits(Align);
 
   // Report where the variable lives inside the struct.
   *XOffset = FieldOffset;
   FieldTy = DBuilder.createMemberType(Unit, VD->getName(), Unit,
                                       0, FieldSize, FieldAlign,
                                       FieldOffset, 0, FieldTy);
   EltTys.push_back(FieldTy);
   // After this, FieldOffset is the total size passed to createStructType().
   FieldOffset += FieldSize;
 
   llvm::DIArray Elements = DBuilder.getOrCreateArray(EltTys);
 
   unsigned Flags = llvm::DIDescriptor::FlagBlockByrefStruct;
 
   // The struct is created with an empty name.
   return DBuilder.createStructType(Unit, "", Unit, 0, FieldOffset, 0, Flags,
                                    llvm::DIType(), Elements);
 }
 
 /// EmitDeclare - Emit local variable declaration debug info.
 ///
 /// Creates a variable descriptor for VD in the innermost lexical scope and
 /// inserts an llvm.dbg.declare for Storage at the end of the builder's
 /// current block. Three shapes are handled:
 ///  - a named variable with the blocks attribute gets a complex address
 ///    expression that follows the __forwarding pointer of its byref struct;
 ///  - an unnamed variable whose type is an anonymous union gets one
 ///    declaration per union field, all sharing Storage;
 ///  - everything else gets a single plain local variable.
 ///
 /// Tag is the DWARF tag for the variable and ArgNo its argument number
 /// (callers pass 0 for automatic variables). Nothing is emitted when no debug
 /// type can be produced for the variable.
 void CGDebugInfo::EmitDeclare(const VarDecl *VD, unsigned Tag,
                               llvm::Value *Storage,
                               unsigned ArgNo, CGBuilderTy &Builder) {
   assert(DebugKind >= CodeGenOptions::LimitedDebugInfo);
   assert(!LexicalBlockStack.empty() && "Region stack mismatch, stack empty!");
 
   // A variable counts as unwritten when it, or the declaration it lives in,
   // is implicit; such variables get no file, line or column.
   bool Unwritten =
       VD->isImplicit() || (isa<Decl>(VD->getDeclContext()) &&
                            cast<Decl>(VD->getDeclContext())->isImplicit());
   llvm::DIFile Unit;
   if (!Unwritten)
     Unit = getOrCreateFile(VD->getLocation());
   llvm::DIType Ty;
   uint64_t XOffset = 0;
   if (VD->hasAttr<BlocksAttr>())
     Ty = EmitTypeForVarWithBlocksAttr(VD, &XOffset);
   else
     Ty = getOrCreateType(VD->getType(), Unit);
 
   // If there is no debug info for this type then do not emit debug info
   // for this variable.
   if (!Ty)
     return;
 
   // Get location information.
   unsigned Line = 0;
   unsigned Column = 0;
   if (!Unwritten) {
     Line = getLineNumber(VD->getLocation());
     Column = getColumnNumber(VD->getLocation());
   }
   unsigned Flags = 0;
   if (VD->isImplicit())
     Flags |= llvm::DIDescriptor::FlagArtificial;
   // If this is the first argument and it is implicit then
   // give it an object pointer flag.
   // FIXME: There has to be a better way to do this, but for static
   // functions there won't be an implicit param at arg1 and
   // otherwise it is 'self' or 'this'.
   if (isa<ImplicitParamDecl>(VD) && ArgNo == 1)
     Flags |= llvm::DIDescriptor::FlagObjectPointer;
   // A pointer-typed, non-byval IR argument that backs a variable whose source
   // type is not a pointer is marked as an indirect variable.
   if (llvm::Argument *Arg = dyn_cast<llvm::Argument>(Storage))
     if (Arg->getType()->isPointerTy() && !Arg->hasByValAttr() &&
         !VD->getType()->isPointerType())
       Flags |= llvm::DIDescriptor::FlagIndirectVariable;
 
   llvm::MDNode *Scope = LexicalBlockStack.back();
 
   StringRef Name = VD->getName();
   if (!Name.empty()) {
     if (VD->hasAttr<BlocksAttr>()) {
       SmallVector<llvm::Value *, 9> addr;
       llvm::Type *Int64Ty = CGM.Int64Ty;
       addr.push_back(llvm::ConstantInt::get(Int64Ty, llvm::DIBuilder::OpPlus));
       // offset of __forwarding field
       // (declared here, at its first meaningful value, rather than being
       // seeded with a placeholder that is never read)
       CharUnits offset = CGM.getContext().toCharUnitsFromBits(
         CGM.getTarget().getPointerWidth(0));
       addr.push_back(llvm::ConstantInt::get(Int64Ty, offset.getQuantity()));
       addr.push_back(llvm::ConstantInt::get(Int64Ty, llvm::DIBuilder::OpDeref));
       addr.push_back(llvm::ConstantInt::get(Int64Ty, llvm::DIBuilder::OpPlus));
       // offset of x field
       offset = CGM.getContext().toCharUnitsFromBits(XOffset);
       addr.push_back(llvm::ConstantInt::get(Int64Ty, offset.getQuantity()));
 
       // Create the descriptor for the variable.
       llvm::DIVariable D =
         DBuilder.createComplexVariable(Tag,
                                        llvm::DIDescriptor(Scope),
                                        VD->getName(), Unit, Line, Ty,
                                        addr, ArgNo);
 
       // Insert an llvm.dbg.declare into the current block.
       llvm::Instruction *Call =
         DBuilder.insertDeclare(Storage, D, Builder.GetInsertBlock());
       Call->setDebugLoc(llvm::DebugLoc::get(Line, Column, Scope));
       return;
     } else if (isa<VariableArrayType>(VD->getType()))
       Flags |= llvm::DIDescriptor::FlagIndirectVariable;
   } else if (const RecordType *RT = dyn_cast<RecordType>(VD->getType())) {
     // If VD is an anonymous union then Storage represents value for
     // all union fields.
     const RecordDecl *RD = cast<RecordDecl>(RT->getDecl());
     if (RD->isUnion() && RD->isAnonymousStructOrUnion()) {
       for (RecordDecl::field_iterator I = RD->field_begin(),
              E = RD->field_end();
            I != E; ++I) {
         FieldDecl *Field = *I;
         llvm::DIType FieldTy = getOrCreateType(Field->getType(), Unit);
         StringRef FieldName = Field->getName();
 
         // Ignore unnamed fields. Do not ignore unnamed records.
         if (FieldName.empty() && !isa<RecordType>(Field->getType()))
           continue;
 
         // Use VarDecl's Tag, Scope and Line number.
         llvm::DIVariable D =
           DBuilder.createLocalVariable(Tag, llvm::DIDescriptor(Scope),
                                        FieldName, Unit, Line, FieldTy,
                                        CGM.getLangOpts().Optimize, Flags,
                                        ArgNo);
 
         // Insert an llvm.dbg.declare into the current block.
         llvm::Instruction *Call =
           DBuilder.insertDeclare(Storage, D, Builder.GetInsertBlock());
         Call->setDebugLoc(llvm::DebugLoc::get(Line, Column, Scope));
       }
       return;
     }
   }
 
   // Create the descriptor for the variable.
   llvm::DIVariable D =
     DBuilder.createLocalVariable(Tag, llvm::DIDescriptor(Scope),
                                  Name, Unit, Line, Ty,
                                  CGM.getLangOpts().Optimize, Flags, ArgNo);
 
   // Insert an llvm.dbg.declare into the current block.
   llvm::Instruction *Call =
     DBuilder.insertDeclare(Storage, D, Builder.GetInsertBlock());
   Call->setDebugLoc(llvm::DebugLoc::get(Line, Column, Scope));
 }
 
 /// EmitDeclareOfAutoVariable - Emit an llvm.dbg.declare for the automatic
 /// variable VD stored in Storage, by forwarding to EmitDeclare() with the
 /// DW_TAG_auto_variable tag.
 void CGDebugInfo::EmitDeclareOfAutoVariable(const VarDecl *VD,
                                             llvm::Value *Storage,
                                             CGBuilderTy &Builder) {
   assert(DebugKind >= CodeGenOptions::LimitedDebugInfo);
 
   // Automatic variables are passed with an argument number of 0.
   const unsigned NoArgNo = 0;
   EmitDeclare(VD, llvm::dwarf::DW_TAG_auto_variable, Storage, NoArgNo,
               Builder);
 }
 
 /// Wrap the type of a self pointer in an object-pointer type. The completed
 /// type for QualTy is looked up with getTypeOrNull() and preferred when
 /// present; otherwise the caller-supplied Ty is used as is and a debug
 /// message is printed. The fallback is not expected to be taken, since
 /// creating a type for the implicit self argument implies that the interface
 /// definition and the ivar declarations in the implementation have already
 /// been parsed.
 llvm::DIType CGDebugInfo::CreateSelfType(const QualType &QualTy,
                                          llvm::DIType Ty) {
   llvm::DIType Completed = getTypeOrNull(QualTy);
   if (!Completed) {
     DEBUG(llvm::dbgs() << "No cached type for self.");
   } else {
     Ty = Completed;
   }
   return DBuilder.createObjectPointerType(Ty);
 }
 
 /// EmitDeclareOfBlockDeclRefVariable - Emit an llvm.dbg.declare for a
 /// variable VD that is reached through a block's capture structure.
 ///
 /// Storage is the value the declare is attached to, and blockInfo supplies
 /// the capture structure's layout and VD's index within it. The variable is
 /// described with a complex address expression built from that layout.
 /// Nothing is emitted when the builder has no insertion block.
 void CGDebugInfo::EmitDeclareOfBlockDeclRefVariable(const VarDecl *VD,
                                                     llvm::Value *Storage,
                                                     CGBuilderTy &Builder,
                                                  const CGBlockInfo &blockInfo) {
   assert(DebugKind >= CodeGenOptions::LimitedDebugInfo);
   assert(!LexicalBlockStack.empty() && "Region stack mismatch, stack empty!");
 
   if (Builder.GetInsertBlock() == 0)
     return;
 
   // Variables carrying the blocks attribute are treated as by-reference
   // captures and described through their byref struct.
   bool isByRef = VD->hasAttr<BlocksAttr>();
 
   uint64_t XOffset = 0;
   llvm::DIFile Unit = getOrCreateFile(VD->getLocation());
   llvm::DIType Ty;
   if (isByRef)
     Ty = EmitTypeForVarWithBlocksAttr(VD, &XOffset);
   else
     Ty = getOrCreateType(VD->getType(), Unit);
 
   // Self is passed along as an implicit non-arg variable in a
   // block. Mark it as the object pointer.
   if (isa<ImplicitParamDecl>(VD) && VD->getName() == "self")
     Ty = CreateSelfType(VD->getType(), Ty);
 
   // Get location information.
   unsigned Line = getLineNumber(VD->getLocation());
   unsigned Column = getColumnNumber(VD->getLocation());
 
   const llvm::DataLayout &target = CGM.getDataLayout();
 
   // Offset of VD's capture slot inside the block's structure type.
   CharUnits offset = CharUnits::fromQuantity(
     target.getStructLayout(blockInfo.StructureType)
           ->getElementOffset(blockInfo.getCapture(VD).getIndex()));
 
   // Address expression: an extra leading deref when Storage is an alloca,
   // then "plus <capture offset>".
   SmallVector<llvm::Value *, 9> addr;
   llvm::Type *Int64Ty = CGM.Int64Ty;
   if (isa<llvm::AllocaInst>(Storage))
     addr.push_back(llvm::ConstantInt::get(Int64Ty, llvm::DIBuilder::OpDeref));
   addr.push_back(llvm::ConstantInt::get(Int64Ty, llvm::DIBuilder::OpPlus));
   addr.push_back(llvm::ConstantInt::get(Int64Ty, offset.getQuantity()));
   // For by-reference captures continue with: deref, plus <offset of
   // __forwarding>, deref, plus <offset of the variable in the byref struct>.
   if (isByRef) {
     addr.push_back(llvm::ConstantInt::get(Int64Ty, llvm::DIBuilder::OpDeref));
     addr.push_back(llvm::ConstantInt::get(Int64Ty, llvm::DIBuilder::OpPlus));
     // offset of __forwarding field
     offset = CGM.getContext()
                 .toCharUnitsFromBits(target.getPointerSizeInBits(0));
     addr.push_back(llvm::ConstantInt::get(Int64Ty, offset.getQuantity()));
     addr.push_back(llvm::ConstantInt::get(Int64Ty, llvm::DIBuilder::OpDeref));
     addr.push_back(llvm::ConstantInt::get(Int64Ty, llvm::DIBuilder::OpPlus));
     // offset of x field
     offset = CGM.getContext().toCharUnitsFromBits(XOffset);
     addr.push_back(llvm::ConstantInt::get(Int64Ty, offset.getQuantity()));
   }
 
   // Create the descriptor for the variable.
   llvm::DIVariable D =
     DBuilder.createComplexVariable(llvm::dwarf::DW_TAG_auto_variable,
                                    llvm::DIDescriptor(LexicalBlockStack.back()),
                                    VD->getName(), Unit, Line, Ty, addr);
 
   // Insert an llvm.dbg.declare into the current block.
   // The builder's insert point, not its insert block, is passed here.
   llvm::Instruction *Call =
     DBuilder.insertDeclare(Storage, D, Builder.GetInsertPoint());
   Call->setDebugLoc(llvm::DebugLoc::get(Line, Column,
                                         LexicalBlockStack.back()));
 }
 
 /// EmitDeclareOfArgVariable - Emit call to llvm.dbg.declare for an argument
 /// variable declaration, by forwarding VD, its storage AI and its argument
 /// number ArgNo to EmitDeclare() with the DW_TAG_arg_variable tag.
 void CGDebugInfo::EmitDeclareOfArgVariable(const VarDecl *VD, llvm::Value *AI,
                                            unsigned ArgNo,
                                            CGBuilderTy &Builder) {
   assert(DebugKind >= CodeGenOptions::LimitedDebugInfo);
 
   const unsigned ArgTag = llvm::dwarf::DW_TAG_arg_variable;
   EmitDeclare(VD, ArgTag, AI, ArgNo, Builder);
 }
 
 namespace {
   /// One captured entity of a block literal, paired with its position in the
   /// block's layout.
   struct BlockLayoutChunk {
     /// Offset of the capture within the block, in bits.
     uint64_t OffsetInBits;
     /// The capture itself.
     const BlockDecl::Capture *Capture;
   };
 
   /// Orders chunks by ascending bit offset.
   bool operator<(const BlockLayoutChunk &LHS, const BlockLayoutChunk &RHS) {
     return RHS.OffsetInBits > LHS.OffsetInBits;
   }
 }
 
 /// EmitDeclareOfBlockLiteralArgVariable - Describe the block-literal
 /// argument of a block function.
 ///
 /// Builds a struct type named "__block_literal_<N>" whose members are the
 /// fixed block header fields followed by the block's non-constant captures
 /// in offset order, wraps it in a pointer type, and declares Arg as an
 /// artificial argument variable of that type. When LocalAddr is non-null an
 /// llvm.dbg.value for it is emitted as well.
 void CGDebugInfo::EmitDeclareOfBlockLiteralArgVariable(const CGBlockInfo &block,
                                                        llvm::Value *Arg,
                                                        llvm::Value *LocalAddr,
                                                        CGBuilderTy &Builder) {
   assert(DebugKind >= CodeGenOptions::LimitedDebugInfo);
   ASTContext &C = CGM.getContext();
   const BlockDecl *blockDecl = block.getBlockDecl();
 
   // Collect some general information about the block's location.
   SourceLocation loc = blockDecl->getCaretLocation();
   llvm::DIFile tunit = getOrCreateFile(loc);
   unsigned line = getLineNumber(loc);
   unsigned column = getColumnNumber(loc);
 
   // Build the debug-info type for the block literal.
   // NOTE(review): the result of this call is discarded; presumably it is
   // made for the side effect of creating the enclosing context - confirm.
   getContextDescriptor(cast<Decl>(blockDecl->getDeclContext()));
 
   const llvm::StructLayout *blockLayout =
     CGM.getDataLayout().getStructLayout(block.StructureType);
 
   // The first five elements of the block structure are the fixed header
   // fields; their offsets are read from the structure's layout.
   SmallVector<llvm::Value*, 16> fields;
   fields.push_back(createFieldType("__isa", C.VoidPtrTy, 0, loc, AS_public,
                                    blockLayout->getElementOffsetInBits(0),
                                    tunit, tunit));
   fields.push_back(createFieldType("__flags", C.IntTy, 0, loc, AS_public,
                                    blockLayout->getElementOffsetInBits(1),
                                    tunit, tunit));
   fields.push_back(createFieldType("__reserved", C.IntTy, 0, loc, AS_public,
                                    blockLayout->getElementOffsetInBits(2),
                                    tunit, tunit));
   fields.push_back(createFieldType("__FuncPtr", C.VoidPtrTy, 0, loc, AS_public,
                                    blockLayout->getElementOffsetInBits(3),
                                    tunit, tunit));
   // The descriptor's pointee type depends on whether the block needs
   // copy/dispose helpers.
   fields.push_back(createFieldType("__descriptor",
                                    C.getPointerType(block.NeedsCopyDispose ?
                                         C.getBlockDescriptorExtendedType() :
                                         C.getBlockDescriptorType()),
                                    0, loc, AS_public,
                                    blockLayout->getElementOffsetInBits(4),
                                    tunit, tunit));
 
   // We want to sort the captures by offset, not because DWARF
   // requires this, but because we're paranoid about debuggers.
   SmallVector<BlockLayoutChunk, 8> chunks;
 
   // 'this' capture.
   // It is recorded with a null Capture pointer, which the loop further down
   // uses to recognize it.
   if (blockDecl->capturesCXXThis()) {
     BlockLayoutChunk chunk;
     chunk.OffsetInBits =
       blockLayout->getElementOffsetInBits(block.CXXThisIndex);
     chunk.Capture = 0;
     chunks.push_back(chunk);
   }
 
   // Variable captures.
   for (BlockDecl::capture_const_iterator
          i = blockDecl->capture_begin(), e = blockDecl->capture_end();
        i != e; ++i) {
     const BlockDecl::Capture &capture = *i;
     const VarDecl *variable = capture.getVariable();
     const CGBlockInfo::Capture &captureInfo = block.getCapture(variable);
 
     // Ignore constant captures.
     if (captureInfo.isConstant())
       continue;
 
     BlockLayoutChunk chunk;
     chunk.OffsetInBits =
       blockLayout->getElementOffsetInBits(captureInfo.getIndex());
     chunk.Capture = &capture;
     chunks.push_back(chunk);
   }
 
   // Sort by offset.
   llvm::array_pod_sort(chunks.begin(), chunks.end());
 
   // Turn each chunk into a member of the block-literal struct.
   for (SmallVectorImpl<BlockLayoutChunk>::iterator
          i = chunks.begin(), e = chunks.end(); i != e; ++i) {
     uint64_t offsetInBits = i->OffsetInBits;
     const BlockDecl::Capture *capture = i->Capture;
 
     // If we have a null capture, this must be the C++ 'this' capture.
     if (!capture) {
       const CXXMethodDecl *method =
         cast<CXXMethodDecl>(blockDecl->getNonClosureContext());
       QualType type = method->getThisType(C);
 
       fields.push_back(createFieldType("this", type, 0, loc, AS_public,
                                        offsetInBits, tunit, tunit));
       continue;
     }
 
     const VarDecl *variable = capture->getVariable();
     StringRef name = variable->getName();
 
     llvm::DIType fieldType;
     if (capture->isByRef()) {
       // By-reference captures become a pointer-sized member that points at
       // the variable's byref struct type.
       std::pair<uint64_t,unsigned> ptrInfo = C.getTypeInfo(C.VoidPtrTy);
 
       // FIXME: this creates a second copy of this type!
       // xoffset is written by the call below but not read afterwards.
       uint64_t xoffset;
       fieldType = EmitTypeForVarWithBlocksAttr(variable, &xoffset);
       fieldType = DBuilder.createPointerType(fieldType, ptrInfo.first);
       fieldType = DBuilder.createMemberType(tunit, name, tunit, line,
                                             ptrInfo.first, ptrInfo.second,
                                             offsetInBits, 0, fieldType);
     } else {
       // Other captures become a member of the variable's own type.
       fieldType = createFieldType(name, variable->getType(), 0,
                                   loc, AS_public, offsetInBits, tunit, tunit);
     }
     fields.push_back(fieldType);
   }
 
   // Name the struct "__block_literal_<count>", where the count comes from
   // CGM.getUniqueBlockCount().
   SmallString<36> typeName;
   llvm::raw_svector_ostream(typeName)
     << "__block_literal_" << CGM.getUniqueBlockCount();
 
   llvm::DIArray fieldsArray = DBuilder.getOrCreateArray(fields);
 
   // The argument's type is a pointer to the struct just described.
   llvm::DIType type =
     DBuilder.createStructType(tunit, typeName.str(), tunit, line,
                               CGM.getContext().toBits(block.BlockSize),
                               CGM.getContext().toBits(block.BlockAlign),
                               0, llvm::DIType(), fieldsArray);
   type = DBuilder.createPointerType(type, CGM.PointerWidthInBits);
 
   // Get overall information about the block.
   unsigned flags = llvm::DIDescriptor::FlagArtificial;
   llvm::MDNode *scope = LexicalBlockStack.back();
 
   // Create the descriptor for the parameter.
   // The argument number passed is the IR argument's index plus one.
   llvm::DIVariable debugVar =
     DBuilder.createLocalVariable(llvm::dwarf::DW_TAG_arg_variable,
                                  llvm::DIDescriptor(scope),
                                  Arg->getName(), tunit, line, type,
                                  CGM.getLangOpts().Optimize, flags,
                                  cast<llvm::Argument>(Arg)->getArgNo() + 1);
 
   if (LocalAddr) {
     // Insert an llvm.dbg.value into the current block.
     llvm::Instruction *DbgVal =
       DBuilder.insertDbgValueIntrinsic(LocalAddr, 0, debugVar,
                                        Builder.GetInsertBlock());
     DbgVal->setDebugLoc(llvm::DebugLoc::get(line, column, scope));
   }
 
   // Insert an llvm.dbg.declare into the current block.
   llvm::Instruction *DbgDecl =
     DBuilder.insertDeclare(Arg, debugVar, Builder.GetInsertBlock());
   DbgDecl->setDebugLoc(llvm::DebugLoc::get(line, column, scope));
 }
 
 /// If D is an out-of-class definition of a static data member of a class, find
 /// its corresponding in-class declaration.
 ///
 /// \param D the variable declaration to look up.
 /// \returns an empty DIDerivedType when D is not a static data member;
 /// otherwise the descriptor found in StaticDataMemberCache, or a newly created
 /// one that has been added as a member of the enclosing composite type.
 llvm::DIDerivedType
 CGDebugInfo::getOrCreateStaticDataMemberDeclarationOrNull(const VarDecl *D) {
   // Anything that is not a static data member has no in-class declaration.
   if (!D->isStaticDataMember())
     return llvm::DIDerivedType();
   // The cache is keyed by the canonical declaration.
   llvm::DenseMap<const Decl *, llvm::WeakVH>::iterator MI =
       StaticDataMemberCache.find(D->getCanonicalDecl());
   if (MI != StaticDataMemberCache.end()) {
     // The cache holds weak handles; a cached entry is expected to still be
     // alive when it is looked up.
     assert(MI->second && "Static data member declaration should still exist");
     return llvm::DIDerivedType(cast<llvm::MDNode>(MI->second));
   }
 
   // If the member wasn't found in the cache, lazily construct and add it to the
   // type (used when a limited form of the type is emitted).
   // NOTE(review): this function does not insert T into StaticDataMemberCache
   // itself; presumably CreateRecordStaticField does -- confirm.
   llvm::DICompositeType Ctxt(
       getContextDescriptor(cast<Decl>(D->getDeclContext())));
   llvm::DIDerivedType T = CreateRecordStaticField(D, Ctxt);
   Ctxt.addMember(T);
   return T;
 }
 
 /// EmitGlobalVariable - Emit information about a global variable.
 ///
 /// \param Var the LLVM global that holds the variable.
 /// \param D   the declaration the global was generated from.
 ///
 /// Creates a static-variable descriptor for D and records it in DeclCache
 /// under D's canonical declaration.
 void CGDebugInfo::EmitGlobalVariable(llvm::GlobalVariable *Var,
                                      const VarDecl *D) {
   assert(DebugKind >= CodeGenOptions::LimitedDebugInfo);
   // Create global variable debug descriptor.
   llvm::DIFile Unit = getOrCreateFile(D->getLocation());
   unsigned LineNo = getLineNumber(D->getLocation());
 
   setLocation(D->getLocation());
 
   QualType T = D->getType();
   if (T->isIncompleteArrayType()) {
 
     // CodeGen turns int[] into int[1] so we'll do the same here.
     llvm::APInt ConstVal(32, 1);
     QualType ET = CGM.getContext().getAsArrayType(T)->getElementType();
 
     T = CGM.getContext().getConstantArrayType(ET, ConstVal,
                                               ArrayType::Normal, 0);
   }
   StringRef DeclName = D->getName();
   // A linkage name is recorded only for variables whose context is neither a
   // function nor an Objective-C method, and only when it differs from the
   // plain declaration name.
   StringRef LinkageName;
   if (D->getDeclContext() && !isa<FunctionDecl>(D->getDeclContext())
       && !isa<ObjCMethodDecl>(D->getDeclContext()))
     LinkageName = Var->getName();
   if (LinkageName == DeclName)
     LinkageName = StringRef();
   llvm::DIDescriptor DContext =
     getContextDescriptor(dyn_cast<Decl>(D->getDeclContext()));
   // The last argument links an out-of-class static data member definition to
   // its in-class declaration; it is an empty descriptor for other variables.
   llvm::DIGlobalVariable GV = DBuilder.createStaticVariable(
       DContext, DeclName, LinkageName, Unit, LineNo, getOrCreateType(T, Unit),
       Var->hasInternalLinkage(), Var,
       getOrCreateStaticDataMemberDeclarationOrNull(D));
   DeclCache.insert(std::make_pair(D->getCanonicalDecl(), llvm::WeakVH(GV)));
 }
 
 /// EmitGlobalVariable - Emit information about an objective-c interface.
 ///
 /// \param Var the LLVM global associated with the interface.
 /// \param ID  the Objective-C interface declaration; its name, location and
 ///            interface type are used for the descriptor.
 void CGDebugInfo::EmitGlobalVariable(llvm::GlobalVariable *Var,
                                      ObjCInterfaceDecl *ID) {
   assert(DebugKind >= CodeGenOptions::LimitedDebugInfo);
   // Create global variable debug descriptor.
   llvm::DIFile Unit = getOrCreateFile(ID->getLocation());
   unsigned LineNo = getLineNumber(ID->getLocation());
 
   StringRef Name = ID->getName();
 
   QualType T = CGM.getContext().getObjCInterfaceType(ID);
   // NOTE(review): T was just obtained from getObjCInterfaceType, so this
   // incomplete-array branch looks unreachable here -- confirm before relying
   // on it.
   if (T->isIncompleteArrayType()) {
 
     // CodeGen turns int[] into int[1] so we'll do the same here.
     llvm::APInt ConstVal(32, 1);
     QualType ET = CGM.getContext().getAsArrayType(T)->getElementType();
 
     T = CGM.getContext().getConstantArrayType(ET, ConstVal,
                                            ArrayType::Normal, 0);
   }
 
   // Unlike the VarDecl overload, the resulting descriptor is not recorded in
   // DeclCache.
   DBuilder.createGlobalVariable(Name, Unit, LineNo,
                                 getOrCreateType(T, Unit),
                                 Var->hasInternalLinkage(), Var);
 }
 
 /// EmitGlobalVariable - Emit global variable's debug info.
 ///
 /// \param VD   the value declaration to describe.
 /// \param Init the constant passed to the descriptor as the variable's value.
 ///
 /// Nothing is emitted when the resulting debug type is an enumeration type.
 void CGDebugInfo::EmitGlobalVariable(const ValueDecl *VD,
                                      llvm::Constant *Init) {
   assert(DebugKind >= CodeGenOptions::LimitedDebugInfo);
   // Create the descriptor for the variable.
   llvm::DIFile Unit = getOrCreateFile(VD->getLocation());
   StringRef Name = VD->getName();
   llvm::DIType Ty = getOrCreateType(VD->getType(), Unit);
   // For an enumerator, use the type of the enclosing enum declaration.
   if (const EnumConstantDecl *ECD = dyn_cast<EnumConstantDecl>(VD)) {
     const EnumDecl *ED = cast<EnumDecl>(ECD->getDeclContext());
     assert(isa<EnumType>(ED->getTypeForDecl()) && "Enum without EnumType?");
     Ty = getOrCreateType(QualType(ED->getTypeForDecl(), 0), Unit);
   }
   // Do not use DIGlobalVariable for enums.
   if (Ty.getTag() == llvm::dwarf::DW_TAG_enumeration_type)
     return;
   // NOTE(review): cast<VarDecl> assumes every VD that gets past the enum
   // check above is a VarDecl -- confirm against callers.
   llvm::DIGlobalVariable GV = DBuilder.createStaticVariable(
       Unit, Name, Name, Unit, getLineNumber(VD->getLocation()), Ty, true, Init,
       getOrCreateStaticDataMemberDeclarationOrNull(cast<VarDecl>(VD)));
   DeclCache.insert(std::make_pair(VD->getCanonicalDecl(), llvm::WeakVH(GV)));
 }
 
 /// Return the scope to attach a declaration to: the innermost entry of
 /// LexicalBlockStack when that stack is non-empty, otherwise the context
 /// descriptor computed for \p D.
 llvm::DIScope CGDebugInfo::getCurrentContextDescriptor(const Decl *D) {
   if (!LexicalBlockStack.empty())
     return llvm::DIScope(LexicalBlockStack.back());
   return getContextDescriptor(D);
 }
 
 /// Emit an imported-module entry for a using-directive, scoped to the current
 /// context and pointing at the nominated namespace. Does nothing when the
 /// debug info level is below LimitedDebugInfo.
 void CGDebugInfo::EmitUsingDirective(const UsingDirectiveDecl &UD) {
   if (CGM.getCodeGenOpts().getDebugInfo() < CodeGenOptions::LimitedDebugInfo)
     return;
   DBuilder.createImportedModule(
       getCurrentContextDescriptor(cast<Decl>(UD.getDeclContext())),
       getOrCreateNameSpace(UD.getNominatedNamespace()),
       getLineNumber(UD.getLocation()));
 }
 
 /// Emit an imported-declaration entry for a using-declaration. Only the first
 /// shadow declaration is described, and only if a descriptor can be obtained
 /// for its underlying declaration. Does nothing when the debug info level is
 /// below LimitedDebugInfo.
 void CGDebugInfo::EmitUsingDecl(const UsingDecl &UD) {
   if (CGM.getCodeGenOpts().getDebugInfo() < CodeGenOptions::LimitedDebugInfo)
     return;
   assert(UD.shadow_size() &&
          "We shouldn't be codegening an invalid UsingDecl containing no decls");
   // Emitting one decl is sufficient - debuggers can detect that this is an
   // overloaded name & provide lookup for all the overloads.
   const UsingShadowDecl &USD = **UD.shadow_begin();
   // Skip the entry when no descriptor is available for the target.
   if (llvm::DIDescriptor Target =
           getDeclarationOrDefinition(USD.getUnderlyingDecl()))
     DBuilder.createImportedDeclaration(
         getCurrentContextDescriptor(cast<Decl>(USD.getDeclContext())), Target,
         getLineNumber(USD.getLocation()));
 }
 
 /// Emit (or fetch from NamespaceAliasCache) the imported-module entry that
 /// describes a namespace alias.
 ///
 /// \returns a null DIImportedEntity when the debug info level is below
 /// LimitedDebugInfo; otherwise the cached or newly created entry for \p NA.
 llvm::DIImportedEntity
 CGDebugInfo::EmitNamespaceAlias(const NamespaceAliasDecl &NA) {
   if (CGM.getCodeGenOpts().getDebugInfo() < CodeGenOptions::LimitedDebugInfo)
     return llvm::DIImportedEntity(0);
   // The cache slot is taken by reference so the result can be stored into it
   // below.
   llvm::WeakVH &VH = NamespaceAliasCache[&NA];
   if (VH)
     return llvm::DIImportedEntity(cast<llvm::MDNode>(VH));
   llvm::DIImportedEntity R(0);
   // An alias of another alias refers to that alias's own entry (emitted
   // recursively); an alias of a real namespace refers to the namespace
   // descriptor.
   if (const NamespaceAliasDecl *Underlying =
           dyn_cast<NamespaceAliasDecl>(NA.getAliasedNamespace()))
     // This could cache & dedup here rather than relying on metadata deduping.
     R = DBuilder.createImportedModule(
         getCurrentContextDescriptor(cast<Decl>(NA.getDeclContext())),
         EmitNamespaceAlias(*Underlying), getLineNumber(NA.getLocation()),
         NA.getName());
   else
     R = DBuilder.createImportedModule(
         getCurrentContextDescriptor(cast<Decl>(NA.getDeclContext())),
         getOrCreateNameSpace(cast<NamespaceDecl>(NA.getAliasedNamespace())),
         getLineNumber(NA.getLocation()), NA.getName());
   VH = R;
   return R;
 }
 
 /// getOrCreateNameSpace - Return namespace descriptor for the given
 /// namespace decl.
 ///
 /// Descriptors are cached in NameSpaceCache under the canonical declaration,
 /// so all redeclarations of a namespace share one descriptor.
 llvm::DINameSpace
 CGDebugInfo::getOrCreateNameSpace(const NamespaceDecl *NSDecl) {
   NSDecl = NSDecl->getCanonicalDecl();
   llvm::DenseMap<const NamespaceDecl *, llvm::WeakVH>::iterator I =
     NameSpaceCache.find(NSDecl);
   if (I != NameSpaceCache.end())
     return llvm::DINameSpace(cast<llvm::MDNode>(I->second));
 
   // Not cached yet: build the descriptor from the canonical declaration's
   // location and enclosing context, then remember it.
   unsigned LineNo = getLineNumber(NSDecl->getLocation());
   llvm::DIFile FileD = getOrCreateFile(NSDecl->getLocation());
   llvm::DIDescriptor Context =
     getContextDescriptor(dyn_cast<Decl>(NSDecl->getDeclContext()));
   llvm::DINameSpace NS =
     DBuilder.createNameSpace(Context, NSDecl->getName(), FileD, LineNo);
   NameSpaceCache[NSDecl] = llvm::WeakVH(NS);
   return NS;
 }
 
 /// Finish debug info emission: replace recorded forward declarations with the
 /// types now present in TypeCache, register the retained types with the
 /// builder, and finalize the builder.
 void CGDebugInfo::finalize() {
   // Pass 1: for each ReplaceMap entry, look up the same key in TypeCache and,
   // if the recorded node is still a forward declaration, redirect its uses to
   // the cached type.
   for (std::vector<std::pair<void *, llvm::WeakVH> >::const_iterator VI
          = ReplaceMap.begin(), VE = ReplaceMap.end(); VI != VE; ++VI) {
     llvm::DIType Ty, RepTy;
     // Verify that the debug info still exists.
     if (llvm::Value *V = VI->second)
       Ty = llvm::DIType(cast<llvm::MDNode>(V));
 
     llvm::DenseMap<void *, llvm::WeakVH>::iterator it =
       TypeCache.find(VI->first);
     if (it != TypeCache.end()) {
       // Verify that the debug info still exists.
       if (llvm::Value *V = it->second)
         RepTy = llvm::DIType(cast<llvm::MDNode>(V));
     }
 
     // Replace only when both the old node and its replacement are alive.
     if (Ty && Ty.isForwardDecl() && RepTy)
       Ty.replaceAllUsesWith(RepTy);
   }
 
   // We keep our own list of retained types, because we need to look
   // up the final type in the type cache.
   // NOTE(review): TypeCache[*RI] is passed straight to cast<>, which assumes
   // every retained key still has a live TypeCache entry -- confirm.
   for (std::vector<void *>::const_iterator RI = RetainedTypes.begin(),
          RE = RetainedTypes.end(); RI != RE; ++RI)
     DBuilder.retainType(llvm::DIType(cast<llvm::MDNode>(TypeCache[*RI])));
 
   DBuilder.finalize();
 }
Index: user/ae/inet6/contrib/llvm/tools/clang
===================================================================
--- user/ae/inet6/contrib/llvm/tools/clang	(revision 271452)
+++ user/ae/inet6/contrib/llvm/tools/clang	(revision 271453)

Property changes on: user/ae/inet6/contrib/llvm/tools/clang
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head/contrib/llvm/tools/clang:r271428-271452
Index: user/ae/inet6/contrib/llvm
===================================================================
--- user/ae/inet6/contrib/llvm	(revision 271452)
+++ user/ae/inet6/contrib/llvm	(revision 271453)

Property changes on: user/ae/inet6/contrib/llvm
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head/contrib/llvm:r271428-271452
Index: user/ae/inet6/etc/motd
===================================================================
--- user/ae/inet6/etc/motd	(revision 271452)
+++ user/ae/inet6/etc/motd	(revision 271453)
@@ -1,25 +1,21 @@
 FreeBSD ?.?.?  (UNKNOWN)
 
-Welcome to FreeBSD!
+Welcome to FreeBSD!  Handy technical support resources:
 
-Before seeking technical support, please use the following resources:
+Security advisories and errata: https://www.FreeBSD.org/releases/
+Handbook:     https://www.FreeBSD.org/handbook/
+FAQ:          https://www.FreeBSD.org/faq/
+Mailing list: https://lists.FreeBSD.org/mailman/listinfo/freebsd-questions/
+Forums:       https://forums.FreeBSD.org/
 
-o  Security advisories and updated errata information for all releases are
-   at http://www.FreeBSD.org/releases/ - always consult the ERRATA section
-   for your release first as it's updated frequently.
+Documents installed with the system are in the /usr/local/share/doc/freebsd/
+directory, or can be installed later with:  pkg install en-freebsd-doc
+For other languages, replace "en" with a language code like de or fr.
 
-o  The Handbook and FAQ documents are at http://www.FreeBSD.org/ and,
-   along with the mailing lists, can be searched by going to
-   http://www.FreeBSD.org/search/.  If the doc package has been installed
-   (or fetched via pkg install lang-freebsd-doc, where lang is the
-   2-letter language code, e.g. en), they are also available formatted
-   in /usr/local/share/doc/freebsd.
+Show the version of FreeBSD installed:  uname -a
+Please include that output and any error messages when posting questions.
 
-If you still have a question or problem, please take the output of
-`uname -a', along with any relevant error messages, and email it
-as a question to the questions@FreeBSD.org mailing list.  If you are
-unfamiliar with FreeBSD's directory layout, please refer to the hier(7)
-manual page.  If you are not familiar with manual pages, type `man man'.
+Introduction to manual pages:  man man
+FreeBSD directory layout:      man hier
 
 Edit /etc/motd to change this login announcement.
-
Index: user/ae/inet6/etc
===================================================================
--- user/ae/inet6/etc	(revision 271452)
+++ user/ae/inet6/etc	(revision 271453)

Property changes on: user/ae/inet6/etc
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head/etc:r271428-271452
Index: user/ae/inet6/share/examples/bhyve/vmrun.sh
===================================================================
--- user/ae/inet6/share/examples/bhyve/vmrun.sh	(revision 271452)
+++ user/ae/inet6/share/examples/bhyve/vmrun.sh	(revision 271453)
@@ -1,254 +1,254 @@
 #!/bin/sh
 #
 # Copyright (c) 2013 NetApp, Inc.
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
 # are met:
 # 1. Redistributions of source code must retain the above copyright
 #    notice, this list of conditions and the following disclaimer.
 # 2. Redistributions in binary form must reproduce the above copyright
 #    notice, this list of conditions and the following disclaimer in the
 #    documentation and/or other materials provided with the distribution.
 #
 # THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 # ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 # SUCH DAMAGE.
 #
 # $FreeBSD$
 #
 
 # Paths of the external tools this script drives.
 LOADER=/usr/sbin/bhyveload
 BHYVECTL=/usr/sbin/bhyvectl
 FBSDRUN=/usr/sbin/bhyve
 
 # Values used when the corresponding command-line option is not given.
 DEFAULT_MEMSIZE=512M
 DEFAULT_CPUS=2
 DEFAULT_TAPDEV=tap0
 DEFAULT_CONSOLE=stdio
 
 # Default disk image and installation ISO, relative to the current directory.
 DEFAULT_VIRTIO_DISK="./diskdev"
 DEFAULT_ISOFILE="./release.iso"
 
 # Print the command-line synopsis and option summary, then exit with status 1.
 # Invoked for unrecognized options (including -h, which has no dedicated
 # case), for a wrong number of positional arguments, and when the script is
 # not run as root.
 usage() {
 	echo "Usage: vmrun.sh [-ahi] [-c <CPUs>] [-C <console>] [-d <disk file>]"
 	echo "                [-e <name=value>] [-g <gdbport> ] [-H <directory>]"
 	echo "                [-I <location of installation iso>] [-m <memsize>]"
 	echo "                [-t <tapdev>] <vmname>"
 	echo ""
 	echo "       -h: display this help message"
 	echo "       -a: force memory mapped local APIC access"
 	echo "       -c: number of virtual cpus (default is ${DEFAULT_CPUS})"
 	echo "       -C: console device (default is ${DEFAULT_CONSOLE})"
 	echo "       -d: virtio diskdev file (default is ${DEFAULT_VIRTIO_DISK})"
 	echo "       -e: set FreeBSD loader environment variable"
 	echo "       -g: listen for connection from kgdb at <gdbport>"
 	echo "       -H: host filesystem to export to the loader"
 	echo "       -i: force boot of the Installation CDROM image"
 	echo "       -I: Installation CDROM image location (default is ${DEFAULT_ISOFILE})"
 	echo "       -m: memory size (default is ${DEFAULT_MEMSIZE})"
 	echo "       -t: tap device for virtio-net (default is $DEFAULT_TAPDEV)"
 	echo ""
 	echo "       This script needs to be executed with superuser privileges"
 	echo ""
 	exit 1
 }
 
 # Refuse to run unless the effective user id is 0; usage() exits the script.
 if [ `id -u` -ne 0 ]; then
 	usage
 fi
 
 # The vmm kernel module must already be loaded; the script does not load it.
 kldstat -n vmm > /dev/null 2>&1 
 if [ $? -ne 0 ]; then
 	echo "vmm.ko is not loaded!"
 	exit 1
 fi
 
 # Option state, initialized to defaults before parsing.
 force_install=0
 isofile=${DEFAULT_ISOFILE}
 memsize=${DEFAULT_MEMSIZE}
 console=${DEFAULT_CONSOLE}
 cpus=${DEFAULT_CPUS}
 # -t and -d may be repeated; each occurrence is stored in a numbered
 # variable (tap_dev0, tap_dev1, ... / disk_dev0, disk_dev1, ...) and counted.
 tap_total=0
 disk_total=0
 apic_opt=""
 gdbport=0
 loader_opt=""
 
 # Parse command-line options. "h" is accepted by getopts but has no case of
 # its own, so it falls through to the default branch and prints usage.
 while getopts ac:C:d:e:g:hH:iI:m:t: c ; do
 	case $c in
 	a)
 		apic_opt="-a"
 		;;
 	c)
 		cpus=${OPTARG}
 		;;
 	C)
 		console=${OPTARG}
 		;;
 	d)
 		eval "disk_dev${disk_total}=\"${OPTARG}\""
 		disk_total=$(($disk_total + 1))
 		;;
 	e)
 		# Accumulate loader environment settings; passed to the loader later.
 		loader_opt="${loader_opt} -e ${OPTARG}"
 		;;
 	g)	
 		gdbport=${OPTARG}
 		;;
 	H)
 		# Store the resolved path as printed by realpath.
 		host_base=`realpath ${OPTARG}`
 		;;
 	i)
 		force_install=1
 		;;
 	I)
 		isofile=${OPTARG}
 		;;
 	m)
 		memsize=${OPTARG}
 		;;
 	t)
 		eval "tap_dev${tap_total}=\"${OPTARG}\""
 		tap_total=$(($tap_total + 1))
 		;;
 	*)
 		usage
 		;;
 	esac
 done
 
 # Fall back to a single default tap device / disk image when none was given.
 if [ $tap_total -eq 0 ] ; then
     tap_total=1
     tap_dev0="${DEFAULT_TAPDEV}"
 fi
 if [ $disk_total -eq 0 ] ; then
     disk_total=1
     disk_dev0="${DEFAULT_VIRTIO_DISK}"
 
 fi
 
 # Drop the parsed options; exactly one positional argument (the VM name) must
 # remain.
 shift $((${OPTIND} - 1))
 
 if [ $# -ne 1 ]; then
 	usage
 fi
 
 vmname="$1"
 # -H, when given, is forwarded to the loader as "-h <dir>".
 if [ -n "${host_base}" ]; then
 	loader_opt="${loader_opt} -h ${host_base}"
 fi
 
 # make_and_check_diskdev <path>
 # Ensure the disk image at <path> exists and is usable: create it as an 8G
 # file with truncate(1) if it is not a regular file, then exit the script
 # with status 1 if it is not readable or not writable.
 make_and_check_diskdev()
 {
     local virtio_diskdev="$1"
     # Create the virtio diskdev file if needed
     if [ ! -f ${virtio_diskdev} ]; then
 	    echo "virtio disk device file \"${virtio_diskdev}\" does not exist."
 	    echo "Creating it ..."
 	    truncate -s 8G ${virtio_diskdev} > /dev/null
     fi
 
     # The exit status of truncate is not checked; a failed creation is caught
     # by the readability test below.
     if [ ! -r ${virtio_diskdev} ]; then
 	    echo "virtio disk device file \"${virtio_diskdev}\" is not readable"
 	    exit 1
     fi
 
     if [ ! -w ${virtio_diskdev} ]; then
 	    echo "virtio disk device file \"${virtio_diskdev}\" is not writable"
 	    exit 1
     fi
 }
 
 echo "Launching virtual machine \"$vmname\" ..."
 
 # The first disk given (or the default) is the one probed for a bootable
 # system and used as the boot disk.
 virtio_diskdev="$disk_dev0"
 
 # Tear down any existing VM instance of the same name; errors are ignored.
 ${BHYVECTL} --vm=${vmname} --destroy > /dev/null 2>&1
 
 # Main loop: run the loader, then bhyve. The loop repeats while bhyve exits
 # with status 0 and ends when either the loader or bhyve returns non-zero.
 while [ 1 ]; do
 
 	# An install is considered necessary when file(1) reports neither a
 	# boot sector nor a UFS filesystem on the boot disk.
 	file -s ${virtio_diskdev} | grep "boot sector" > /dev/null
 	rc=$?
 	if [ $rc -ne 0 ]; then
 		file -s ${virtio_diskdev} | grep ": Unix Fast File sys" > /dev/null
 		rc=$?
 	fi
 	if [ $rc -ne 0 ]; then
 		need_install=1
 	else
 		need_install=0
 	fi
 
 	# When installing, boot from the ISO and attach it to PCI slot 31;
 	# otherwise boot from the disk with no extra device.
 	if [ $force_install -eq 1 -o $need_install -eq 1 ]; then
 		if [ ! -r ${isofile} ]; then
 			echo -n "Installation CDROM image \"${isofile}\" "
 			echo    "is not readable"
 			exit 1
 		fi
 		BOOTDISK=${isofile}
-		installer_opt="-s 31:0,virtio-blk,${BOOTDISK}"
+		installer_opt="-s 31:0,ahci-cd,${BOOTDISK}"
 	else
 		BOOTDISK=${virtio_diskdev}
 		installer_opt=""
 	fi
 
 	${LOADER} -c ${console} -m ${memsize} -d ${BOOTDISK} ${loader_opt} \
 		${vmname}
 	if [ $? -ne 0 ]; then
 		break
 	fi
 
 	#
 	# Build up args for additional tap and disk devices now.
 	#
 	nextslot=2  # slot 0 is hostbridge, slot 1 is lpc
 	devargs=""  # accumulate disk/tap args here
 	i=0
 	while [ $i -lt $tap_total ] ; do
 	    eval "tapname=\$tap_dev${i}"
 	    devargs="$devargs -s $nextslot:0,virtio-net,${tapname} "
 	    nextslot=$(($nextslot + 1))
 	    i=$(($i + 1))
 	done
 
 	# Disks follow the tap devices in slot order; each image is created
 	# or validated just before it is added.
 	i=0
 	while [ $i -lt $disk_total ] ; do
 	    eval "disk=\$disk_dev${i}"
 	    make_and_check_diskdev "${disk}"
 	    devargs="$devargs -s $nextslot:0,virtio-blk,${disk} "
 	    nextslot=$(($nextslot + 1))
 	    i=$(($i + 1))
 	done
 
 	${FBSDRUN} -c ${cpus} -m ${memsize} ${apic_opt} -A -H -P	\
 		-g ${gdbport}						\
 		-s 0:0,hostbridge					\
 		-s 1:0,lpc						\
 		${devargs}						\
 		-l com1,${console}					\
 		${installer_opt}					\
 		${vmname}
 
 	# bhyve returns the following status codes:
 	#  0 - VM has been reset
 	#  1 - VM has been powered off
 	#  2 - VM has been halted
 	#  3 - VM generated a triple fault
 	#  all other non-zero status codes are errors
 	#
 	if [ $? -ne 0 ]; then
 		break
 	fi
 done
 
 # Reached after the loop ends for any reason; the script's exit status is
 # always 99 from here.
 exit 99
Index: user/ae/inet6/share/man/man4/cxgbe.4
===================================================================
--- user/ae/inet6/share/man/man4/cxgbe.4	(revision 271452)
+++ user/ae/inet6/share/man/man4/cxgbe.4	(revision 271453)
@@ -1,317 +1,328 @@
 .\" Copyright (c) 2011-2014, Chelsio Inc
 .\" All rights reserved.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions are met:
 .\"
 .\" 1. Redistributions of source code must retain the above copyright notice,
 .\"    this list of conditions and the following disclaimer.
 .\"
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\"
 .\" 3. Neither the name of the Chelsio Inc nor the names of its
 .\"    contributors may be used to endorse or promote products derived from
 .\"    this software without specific prior written permission.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 .\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 .\" ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 .\" LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 .\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 .\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 .\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 .\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 .\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 .\" POSSIBILITY OF SUCH DAMAGE.
 .\"
 .\" * Other names and brands may be claimed as the property of others.
 .\"
 .\" $FreeBSD$
 .\"
 .Dd March 20, 2014
 .Dt CXGBE 4
 .Os
 .Sh NAME
 .Nm cxgbe
 .Nd "Chelsio T4 and T5 based 40Gb, 10Gb, and 1Gb Ethernet adapter driver"
 .Sh SYNOPSIS
 To compile this driver into the kernel,
 place the following lines in your
 kernel configuration file:
 .Bd -ragged -offset indent
 .Cd "device cxgbe"
 .Ed
 .Pp
 To load the driver as a
 module at boot time, place the following lines in
 .Xr loader.conf 5 :
 .Bd -literal -offset indent
 t4fw_cfg_load="YES"
 t5fw_cfg_load="YES"
 if_cxgbe_load="YES"
 .Ed
 .Sh DESCRIPTION
 The
 .Nm
 driver provides support for PCI Express Ethernet adapters based on
 the Chelsio Terminator 4 and Terminator 5 ASICs (T4 and T5).
 The driver supports Jumbo Frames, Transmit/Receive checksum offload,
 TCP segmentation offload (TSO), Large Receive Offload (LRO), VLAN
 tag insertion/extraction, VLAN checksum offload, VLAN TSO, and
 Receive Side Steering (RSS).
 For further hardware information and questions related to hardware
 requirements, see
 .Pa http://www.chelsio.com/ .
 .Pp
 Note that ports of T5 cards are named cxl and attach to a t5nex parent device
 (in contrast to ports named cxgbe that attach to a t4nex parent for a T4 card).
 Loader tunables with the hw.cxgbe prefix apply to both T4 and T5 cards.
 The sysctl MIBs are at dev.t5nex and dev.cxl for T5 cards and at dev.t4nex and
 dev.cxgbe for T4 cards.
 .Pp
 For more information on configuring this device, see
 .Xr ifconfig 8 .
 .Sh HARDWARE
 The
 .Nm
 driver supports 40Gb, 10Gb and 1Gb Ethernet adapters based on the T5 ASIC
 (ports will be named cxl):
 .Pp
 .Bl -bullet -compact
 .It
 Chelsio T580-CR
 .It
 Chelsio T580-LP-CR
 .It
 Chelsio T580-LP-SO-CR
 .It
 Chelsio T560-CR
 .It
 Chelsio T540-CR
 .It
 Chelsio T540-LP-CR
 .It
 Chelsio T522-CR
 .It
 Chelsio T520-LL-CR
 .It
 Chelsio T520-CR
 .It
 Chelsio T520-SO
 .It
 Chelsio T520-BT
 .It
 Chelsio T504-BT
 .El
 .Pp
 The
 .Nm
 driver supports 10Gb and 1Gb Ethernet adapters based on the T4 ASIC:
 .Pp
 .Bl -bullet -compact
 .It
 Chelsio T420-CR
 .It
 Chelsio T422-CR
 .It
 Chelsio T440-CR
 .It
 Chelsio T420-BCH
 .It
 Chelsio T440-BCH
 .It
 Chelsio T440-CH
 .It
 Chelsio T420-SO
 .It
 Chelsio T420-CX
 .It
 Chelsio T420-BT
 .It
 Chelsio T404-BT
 .El
 .Sh LOADER TUNABLES
 Tunables can be set at the
 .Xr loader 8
 prompt before booting the kernel or stored in
 .Xr loader.conf 5 .
 .Bl -tag -width indent
 .It Va hw.cxgbe.ntxq10g
 The number of tx queues to use for a 10Gb or 40Gb port.
 The default is 16 or the number
 of CPU cores in the system, whichever is less.
 .It Va hw.cxgbe.nrxq10g
 The number of rx queues to use for a 10Gb or 40Gb port.
 The default is 8 or the number
 of CPU cores in the system, whichever is less.
 .It Va hw.cxgbe.ntxq1g
 The number of tx queues to use for a 1Gb port.
 The default is 4 or the number
 of CPU cores in the system, whichever is less.
 .It Va hw.cxgbe.nrxq1g
 The number of rx queues to use for a 1Gb port.
 The default is 2 or the number
 of CPU cores in the system, whichever is less.
 .It Va hw.cxgbe.nofldtxq10g
 The number of TOE tx queues to use for a 10Gb or 40Gb port.
 The default is 8 or the
 number of CPU cores in the system, whichever is less.
 .It Va hw.cxgbe.nofldrxq10g
 The number of TOE rx queues to use for a 10Gb or 40Gb port.
 The default is 2 or the
 number of CPU cores in the system, whichever is less.
 .It Va hw.cxgbe.nofldtxq1g
 The number of TOE tx queues to use for a 1Gb port.
 The default is 2 or the
 number of CPU cores in the system, whichever is less.
 .It Va hw.cxgbe.nofldrxq1g
 The number of TOE rx queues to use for a 1Gb port.
 The default is 1.
 .It Va hw.cxgbe.holdoff_timer_idx_10G
 .It Va hw.cxgbe.holdoff_timer_idx_1G
 The timer index value to use to delay interrupts.
 The holdoff timer list has the values 1, 5, 10, 50, 100, and 200
 by default (all values are in microseconds) and the index selects a
 value from this list.
 The default value is 1 which means the timer value is 5us.
 Different interfaces can be assigned different values at any time via the
 dev.cxgbe.X.holdoff_tmr_idx or dev.cxl.X.holdoff_tmr_idx sysctl.
 .It Va hw.cxgbe.holdoff_pktc_idx_10G
 .It Va hw.cxgbe.holdoff_pktc_idx_1G
 The packet-count index value to use to delay interrupts.
 The packet-count list has the values 1, 8, 16, and 32 by default
 and the index selects a value from this list.
 The default value is -1 which means packet counting is disabled and interrupts
 are generated based solely on the holdoff timer value.
 Different interfaces can be assigned different values via the
 dev.cxgbe.X.holdoff_pktc_idx or dev.cxl.X.holdoff_pktc_idx sysctl.
 This sysctl works only when the interface has never been marked up (as done by
 ifconfig up).
 .It Va hw.cxgbe.qsize_txq
 The size, in number of entries, of the descriptor ring used for a tx
 queue.
 A buf_ring of the same size is also allocated for additional
 software queuing.
 See
 .Xr ifnet 9 .
 The default value is 1024.
 Different interfaces can be assigned different values via the
 dev.cxgbe.X.qsize_txq sysctl or dev.cxl.X.qsize_txq sysctl.
 This sysctl works only when the interface has never been marked up (as done by
 ifconfig up).
 .It Va hw.cxgbe.qsize_rxq
 The size, in number of entries, of the descriptor ring used for an
 rx queue.
 The default value is 1024.
 Different interfaces can be assigned different values via the
 dev.cxgbe.X.qsize_rxq or dev.cxl.X.qsize_rxq sysctl.
 This sysctl works only when the interface has never been marked up (as done by
 ifconfig up).
 .It Va hw.cxgbe.interrupt_types
 The interrupt types that the driver is allowed to use.
 Bit 0 represents INTx (line interrupts), bit 1 MSI, bit 2 MSI-X.
 The default is 7 (all allowed).
 The driver will select the best possible type out of the allowed types by
 itself.
 .It Va hw.cxgbe.fw_install
 0 prohibits the driver from installing a firmware on the card.
 1 allows the driver to install a new firmware if internal driver
 heuristics indicate that the new firmware is preferable to the one
 already on the card.
 2 instructs the driver to always install the new firmware on the card as
 long as it is compatible with the driver and is a different version than
 the one already on the card.
 The default is 1.
 .It Va hw.cxgbe.fl_pktshift
 The number of bytes of padding inserted before the beginning of an Ethernet
 frame in the receive buffer.
 The default value of 2 ensures that the Ethernet payload (usually the IP header)
 is at a 4 byte aligned address.
 0-7 are all valid values.
 .It Va hw.cxgbe.fl_pad
 A non-zero value ensures that writes from the hardware to a receive buffer are
 padded up to the specified boundary.
 The default is -1 which lets the driver pick a pad boundary.
 0 disables trailer padding completely.
 .It Va hw.cxgbe.cong_drop
 Controls the hardware response to congestion.
 -1 disables congestion feedback and is not recommended.
 0 instructs the hardware to backpressure its pipeline on congestion.
-This usually results in the port emitting pause frames.
+This usually results in the port emitting PAUSE frames.
 1 instructs the hardware to drop frames destined for congested queues.
+.It Va hw.cxgbe.pause_settings
+PAUSE frame settings.
+Bit 0 is rx_pause, bit 1 is tx_pause.
+rx_pause = 1 instructs the hardware to heed incoming PAUSE frames, 0 instructs
+it to ignore them.
+tx_pause = 1 allows the hardware to emit PAUSE frames when its receive FIFO
+reaches a high threshold, 0 prohibits the hardware from emitting PAUSE frames.
+The default is 3 (both rx_pause and tx_pause = 1).
+This tunable establishes the default PAUSE settings for all ports.
+Settings can be displayed and controlled on a per-port basis via the
+dev.cxgbe.X.pause_settings (dev.cxl.X.pause_settings for T5 cards) sysctl.
 .It Va hw.cxgbe.buffer_packing
 Allow the hardware to deliver multiple frames in the same receive buffer
 opportunistically.
 The default is -1 which lets the driver decide.
 0 or 1 explicitly disable or enable this feature.
 .It Va hw.cxgbe.allow_mbufs_in_cluster
 1 allows the driver to lay down one or more mbufs within the receive buffer
 opportunistically.  This is the default.
 0 prohibits the driver from doing so.
 .It Va hw.cxgbe.largest_rx_cluster
 .It Va hw.cxgbe.safest_rx_cluster
 Sizes of rx clusters.  Each of these must be set to one of the sizes available
 (usually 2048, 4096, 9216, and 16384) and largest_rx_cluster must be greater
 than or equal to safest_rx_cluster.
 The defaults are 16384 and 4096 respectively.
 The driver will never attempt to allocate a receive buffer larger than
 largest_rx_cluster and will fall back to allocating buffers of
 safest_rx_cluster size if an allocation larger than safest_rx_cluster fails.
 Note that largest_rx_cluster merely establishes a ceiling -- the driver is
 allowed to allocate buffers of smaller sizes.
 .It Va hw.cxgbe.config_file
 Select a pre-packaged device configuration file.
 A configuration file contains a recipe for partitioning and configuring the
 hardware resources on the card.
 This tunable is for specialized applications only and should not be used in
 normal operation.
 The configuration profile currently in use is available in the dev.t4nex.X.cf
 and dev.t4nex.X.cfcsum (dev.t5nex for T5 cards) sysctls.
 .It Va hw.cxgbe.linkcaps_allowed
 .It Va hw.cxgbe.niccaps_allowed
 .It Va hw.cxgbe.toecaps_allowed
 .It Va hw.cxgbe.rdmacaps_allowed
 .It Va hw.cxgbe.iscsicaps_allowed
 .It Va hw.cxgbe.fcoecaps_allowed
 Disallowing capabilities provides a hint to the driver and firmware to not
 reserve hardware resources for that feature.
 Each of these is a bit field with a bit for each sub-capability within the
 capability.
 This tunable is for specialized applications only and should not be used in
 normal operation.
 The capabilities for which hardware resources have been reserved are listed in
 dev.t4nex.X.*caps or dev.t5nex.X.*caps sysctls.
 .El
 .Sh SUPPORT
 For general information and support,
 go to the Chelsio support website at:
 .Pa http://www.chelsio.com/ .
 .Pp
 If an issue is identified with this driver with a supported adapter,
 email all the specific information related to the issue to
 .Aq Mt support@chelsio.com .
 .Sh SEE ALSO
 .Xr altq 4 ,
 .Xr arp 4 ,
 .Xr cxgb 4 ,
 .Xr netintro 4 ,
 .Xr ng_ether 4 ,
 .Xr ifconfig 8
 .Sh HISTORY
 The
 .Nm
 device driver first appeared in
 .Fx 9.0 .
 Support for T5 cards first appeared in
 .Fx 9.2
 and
 .Fx 10.0 .
 .Sh AUTHORS
 .An -nosplit
 The
 .Nm
 driver was written by
 .An Navdeep Parhar Aq Mt np@FreeBSD.org .
Index: user/ae/inet6/share/man/man4
===================================================================
--- user/ae/inet6/share/man/man4	(revision 271452)
+++ user/ae/inet6/share/man/man4	(revision 271453)

Property changes on: user/ae/inet6/share/man/man4
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head/share/man/man4:r271428-271452
Index: user/ae/inet6/share/man/man9/ifnet.9
===================================================================
--- user/ae/inet6/share/man/man9/ifnet.9	(revision 271452)
+++ user/ae/inet6/share/man/man9/ifnet.9	(revision 271453)
@@ -1,1517 +1,1529 @@
 .\" -*- Nroff -*-
 .\" Copyright 1996, 1997 Massachusetts Institute of Technology
 .\"
 .\" Permission to use, copy, modify, and distribute this software and
 .\" its documentation for any purpose and without fee is hereby
 .\" granted, provided that both the above copyright notice and this
 .\" permission notice appear in all copies, that both the above
 .\" copyright notice and this permission notice appear in all
 .\" supporting documentation, and that the name of M.I.T. not be used
 .\" in advertising or publicity pertaining to distribution of the
 .\" software without specific, written prior permission.  M.I.T. makes
 .\" no representations about the suitability of this software for any
 .\" purpose.  It is provided "as is" without express or implied
 .\" warranty.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''.  M.I.T. DISCLAIMS
 .\" ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE,
 .\" INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 .\" MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT
 .\" SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 .\" SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 .\" LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 .\" USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 .\" ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 .\" OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 .\" OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
 .\" $FreeBSD$
 .\"
 .Dd July 29, 2014
 .Dt IFNET 9
 .Os
 .Sh NAME
 .Nm ifnet ,
 .Nm ifaddr ,
 .Nm ifqueue ,
 .Nm if_data
 .Nd kernel interfaces for manipulating network interfaces
 .Sh SYNOPSIS
 .In sys/param.h
 .In sys/time.h
 .In sys/socket.h
 .In net/if.h
 .In net/if_var.h
 .In net/if_types.h
 .\"
 .Ss "Interface Manipulation Functions"
 .Ft "struct ifnet *"
 .Fn if_alloc "u_char type"
 .Ft void
 .Fn if_attach "struct ifnet *ifp"
 .Ft void
 .Fn if_detach "struct ifnet *ifp"
 .Ft void
 .Fn if_free "struct ifnet *ifp"
 .Ft void
 .Fn if_free_type "struct ifnet *ifp" "u_char type"
 .Ft void
 .Fn if_down "struct ifnet *ifp"
 .Ft int
 .Fn ifioctl "struct socket *so" "u_long cmd" "caddr_t data" "struct thread *td"
 .Ft int
 .Fn ifpromisc "struct ifnet *ifp" "int pswitch"
 .Ft int
 .Fn if_allmulti "struct ifnet *ifp" "int amswitch"
 .Ft "struct ifnet *"
 .Fn ifunit "const char *name"
 .Ft "struct ifnet *"
 .Fn ifunit_ref "const char *name"
 .Ft void
 .Fn if_up "struct ifnet *ifp"
 .\"
 .Ss "Interface Address Functions"
 .Ft "struct ifaddr *"
 .Fn ifaddr_byindex "u_short idx"
 .Ft "struct ifaddr *"
 .Fn ifa_ifwithaddr "struct sockaddr *addr"
 .Ft "struct ifaddr *"
-.Fn ifa_ifwithdstaddr "struct sockaddr *addr"
+.Fn ifa_ifwithdstaddr "struct sockaddr *addr" "int fib"
 .Ft "struct ifaddr *"
-.Fn ifa_ifwithnet "struct sockaddr *addr" "int ignore_ptp"
+.Fn ifa_ifwithnet "struct sockaddr *addr" "int ignore_ptp" "int fib"
 .Ft "struct ifaddr *"
 .Fn ifaof_ifpforaddr "struct sockaddr *addr" "struct ifnet *ifp"
 .Ft void
 .Fn ifa_ref "struct ifaddr *ifa"
 .Ft void
 .Fn ifa_free "struct ifaddr *ifa"
 .\"
 .Ss "Interface Multicast Address Functions"
 .Ft int
 .Fn if_addmulti "struct ifnet *ifp" "struct sockaddr *sa" "struct ifmultiaddr **ifmap"
 .Ft int
 .Fn if_delmulti "struct ifnet *ifp" "struct sockaddr *sa"
 .Ft "struct ifmultiaddr *"
 .Fn if_findmulti "struct ifnet *ifp" "struct sockaddr *sa"
 .Ss "Output queue macros"
 .Fn IF_DEQUEUE "struct ifqueue *ifq" "struct mbuf *m"
 .\"
 .Ss "struct ifnet Member Functions"
 .Ft void
 .Fn \*(lp*if_input\*(rp "struct ifnet *ifp" "struct mbuf *m"
 .Ft int
 .Fo \*(lp*if_output\*(rp
 .Fa "struct ifnet *ifp" "struct mbuf *m"
 .Fa "const struct sockaddr *dst" "struct route *ro"
 .Fc
 .Ft void
 .Fn \*(lp*if_start\*(rp "struct ifnet *ifp"
 .Ft int
 .Fn \*(lp*if_transmit\*(rp "struct ifnet *ifp" "struct mbuf *m"
 .Ft void
 .Fn \*(lp*if_qflush\*(rp "struct ifnet *ifp"
 .Ft int
 .Fn \*(lp*if_ioctl\*(rp "struct ifnet *ifp" "u_long cmd" "caddr_t data"
 .Ft void
 .Fn \*(lp*if_init\*(rp "void *if_softc"
 .Ft int
 .Fo \*(lp*if_resolvemulti\*(rp
 .Fa "struct ifnet *ifp" "struct sockaddr **retsa" "struct sockaddr *addr"
 .Fc
 .Ss "struct ifaddr member function"
 .Ft void
 .Fo \*(lp*ifa_rtrequest\*(rp
 .Fa "int cmd" "struct rtentry *rt" "struct rt_addrinfo *info"
 .Fc
 .\"
 .Ss "Global Variables"
 .Vt extern struct ifnethead ifnet ;
 .\" extern struct ifindex_entry *ifindex_table ;
 .Vt extern int if_index ;
 .Vt extern int ifqmaxlen ;
 .Sh DATA STRUCTURES
 The kernel mechanisms for handling network interfaces reside primarily
 in the
 .Vt ifnet , if_data , ifaddr ,
 and
 .Vt ifmultiaddr
 structures in
 .In net/if.h
 and
 .In net/if_var.h
 and the functions named above and defined in
 .Pa /sys/net/if.c .
 Those interfaces which are intended to be used by user programs
 are defined in
 .In net/if.h ;
 these include the interface flags, the
 .Vt if_data
 structure, and the structures defining the appearance of
 interface-related messages on the
 .Xr route 4
 routing socket and in
 .Xr sysctl 3 .
 The header file
 .In net/if_var.h
 defines the kernel-internal interfaces, including the
 .Vt ifnet , ifaddr ,
 and
 .Vt ifmultiaddr
 structures and the functions which manipulate them.
 (A few user programs will need
 .In net/if_var.h
 because it is the prerequisite of some other header file like
 .In netinet/if_ether.h .
 Most references to those two files in particular can be replaced by
 .In net/ethernet.h . )
 .Pp
 The system keeps a linked list of interfaces using the
 .Li TAILQ
 macros defined in
 .Xr queue 3 ;
 this list is headed by a
 .Vt "struct ifnethead"
 called
 .Va ifnet .
 The elements of this list are of type
 .Vt "struct ifnet" ,
 and most kernel routines which manipulate interfaces as such accept or
 return pointers to these structures.
 Each interface structure
 contains an
 .Vt if_data
 structure used for statistics and information.
 Each interface also has a
 .Li TAILQ
 of interface addresses, described by
 .Vt ifaddr
 structures.
 An
 .Dv AF_LINK
 address
 (see
 .Xr link_addr 3 )
 describing the link layer implemented by the interface (if any)
 is accessed by the
 .Fn ifaddr_byindex
 function or
 .Va if_addr
 structure.
 (Some trivial interfaces do not provide any link layer addresses;
 this structure, while still present, serves only to identify the
 interface name and index.)
 .Pp
 Finally, those interfaces supporting reception of multicast datagrams
 have a
 .Li TAILQ
 of multicast group memberships, described by
 .Vt ifmultiaddr
 structures.
 These memberships are reference-counted.
 .Pp
 Interfaces are also associated with an output queue, defined as a
 .Vt "struct ifqueue" ;
 this structure is used to hold packets while the interface is in the
 process of sending another.
 .Pp
 .Ss The Vt ifnet Ss structure
 The fields of
 .Vt "struct ifnet"
 are as follows:
 .Bl -tag -width ".Va if_capabilities" -offset indent
 .It Va if_softc
 .Pq Vt "void *"
 A pointer to the driver's private state block.
 (Initialized by driver.)
 .It Va if_l2com
 .Pq Vt "void *"
 A pointer to the common data for the interface's layer 2 protocol.
 (Initialized by
 .Fn if_alloc . )
 .It Va if_vnet
 .Pq Vt "struct vnet *"
 A pointer to the virtual network stack instance.
 (Initialized by
 .Fn if_attach . )
 .It Va if_home_vnet
 .Pq Vt "struct vnet *"
 A pointer to the parent virtual network stack, where this
 .Vt "struct ifnet"
 originates from.
 (Initialized by
 .Fn if_attach . )
 .It Va if_link
 .Pq Fn TAILQ_ENTRY ifnet
 .Xr queue 3
 macro glue.
 .It Va if_xname
 .Pq Vt "char *"
 The name of the interface,
 (e.g.,
 .Dq Li fxp0
 or
 .Dq Li lo0 ) .
 (Initialized by driver
 (usually via
 .Fn if_initname ) . )
 .It Va if_dname
 .Pq Vt "const char *"
 The name of the driver.
 (Initialized by driver
 (usually via
 .Fn if_initname ) . )
 .It Va if_dunit
 .Pq Vt int
 A unique number assigned to each interface managed by a particular
 driver.
 Drivers may choose to set this to
 .Dv IF_DUNIT_NONE
 if a unit number is not associated with the device.
 (Initialized by driver
 (usually via
 .Fn if_initname ) . )
 .It Va if_refcount
 .Pq Vt u_int
 The reference count.
 (Initialized by
 .Fn if_alloc . )
 .It Va if_addrhead
 .Pq Vt "struct ifaddrhead"
 The head of the
 .Xr queue 3
 .Li TAILQ
 containing the list of addresses assigned to this interface.
 .It Va if_pcount
 .Pq Vt int
 A count of promiscuous listeners on this interface, used to
 reference-count the
 .Dv IFF_PROMISC
 flag.
 .It Va if_carp
 .Pq Vt "struct carp_if *"
 A pointer to the CARP interface structure,
 .Xr carp 4 .
 (Initialized by the driver-specific
 .Fn if_ioctl
 routine.)
 .It Va if_bpf
 .Pq Vt "struct bpf_if *"
 Opaque per-interface data for the packet filter,
 .Xr bpf 4 .
 (Initialized by
 .Fn bpf_attach . )
 .It Va if_index
 .Pq Vt u_short
 A unique number assigned to each interface in sequence as it is
 attached.
 This number can be used in a
 .Vt "struct sockaddr_dl"
 to refer to a particular interface by index
 (see
 .Xr link_addr 3 ) .
 (Initialized by
 .Fn if_alloc . )
 .It Va if_vlantrunk
 .Pq Vt struct ifvlantrunk *
 A pointer to the 802.1Q trunk structure,
 .Xr vlan 4 .
 (Initialized by the driver-specific
 .Fn if_ioctl
 routine.)
 .It Va if_flags
 .Pq Vt int
 Flags describing operational parameters of this interface (see below).
 (Manipulated by generic code.)
 .It Va if_drv_flags
 .Pq Vt int
 Flags describing operational status of this interface (see below).
 (Manipulated by driver.)
 .It Va if_capabilities
 .Pq Vt int
 Flags describing the capabilities the interface supports (see below).
 .It Va if_capenable
 .Pq Vt int
 Flags describing the enabled capabilities of the interface (see below).
 .It Va if_linkmib
 .Pq Vt "void *"
 A pointer to an interface-specific MIB structure exported by
 .Xr ifmib 4 .
 (Initialized by driver.)
 .It Va if_linkmiblen
 .Pq Vt size_t
 The size of said structure.
 (Initialized by driver.)
 .It Va if_data
 .Pq Vt "struct if_data"
 More statistics and information; see
 .Sx "The if_data structure" ,
 below.
 (Initialized by driver, manipulated by both driver and generic
 code.)
 .It Va if_multiaddrs
 .Pq Vt struct ifmultihead
 The head of the
 .Xr queue 3
 .Li TAILQ
 containing the list of multicast addresses assigned to this interface.
 .It Va if_amcount
 .Pq Vt int
 A count of multicast requests on this interface, used to
 reference-count the
 .Dv IFF_ALLMULTI
 flag.
 .It Va if_addr
 .Pq Vt "struct ifaddr *"
 A pointer to the link-level interface address.
 (Initialized by
 .Fn if_alloc . )
 .\" .It Va if_llsoftc
 .\" .Pq Vt "void *"
 .\" The purpose of the field is unclear.
 .It Va if_snd
 .Pq Vt "struct ifaltq"
 The output queue.
 (Manipulated by driver.)
 .It Va if_broadcastaddr
 .Pq Vt "const u_int8_t *"
 A link-level broadcast bytestring for protocols with variable address
 length.
 .It Va if_bridge
 .Pq Vt "void *"
 A pointer to the bridge interface structure,
 .Xr if_bridge 4 .
 (Initialized by the driver-specific
 .Fn if_ioctl
 routine.)
 .It Va if_label
 .Pq Vt "struct label *"
 A pointer to the MAC Framework label structure,
 .Xr mac 4 .
 (Initialized by
 .Fn if_alloc . )
 .It Va if_afdata
 .Pq Vt "void *"
 An address family dependent data region.
 .It Va if_afdata_initialized
 .Pq Vt int
 Used to track the current state of address family initialization.
 .It Va if_afdata_lock
 .Pq Vt "struct rwlock"
 An
 .Xr rwlock 9
 lock used to protect
 .Va if_afdata
 internals.
 .It Va if_linktask
 .Pq Vt "struct task"
 A
 .Xr taskqueue 9
 task scheduled for link state change events of the interface.
 .It Va if_addr_lock
 .Pq Vt "struct rwlock"
 An
 .Xr rwlock 9
 lock used to protect interface-related address lists.
 .It Va if_clones
 .Pq Fn LIST_ENTRY ifnet
 .Xr queue 3
 macro glue for the list of clonable network interfaces.
 .It Va if_groups
 .Pq Fn TAILQ_HEAD ", ifg_list"
 The head of the
 .Xr queue 3
 .Li TAILQ
 containing the list of groups per interface.
 .It Va if_pf_kif
 .Pq Vt "void *"
 A pointer to the structure used for interface abstraction by
 .Xr pf 4 .
 .It Va if_lagg
 .Pq Vt "void *"
 A pointer to the
 .Xr lagg 4
 interface structure.
 .It Va if_alloctype
 .Pq Vt u_char
 The type of the interface as it was at the time of its allocation.
 It is used to cache the type passed to
 .Fn if_alloc ,
 but unlike
 .Va if_type ,
 it would not be changed by drivers.
 .El
 .Pp
 References to
 .Vt ifnet
 structures are gained by calling the
 .Fn if_ref
 function and released by calling the
 .Fn if_rele
 function.
 They are used to allow kernel code walking global interface lists
 to release the
 .Vt ifnet
 lock yet keep the
 .Vt ifnet
 structure stable.
 .Pp
 There are in addition a number of function pointers which the driver
 must initialize to complete its interface with the generic interface
 layer:
 .Bl -ohang -offset indent
 .It Fn if_input
 Pass a packet to an appropriate upper layer as determined
 from the link-layer header of the packet.
 This routine is to be called from an interrupt handler or
 used to emulate reception of a packet on this interface.
 A single function implementing
 .Fn if_input
 can be shared among multiple drivers utilizing the same link-layer
 framing, e.g., Ethernet.
 .It Fn if_output
 Output a packet on interface
 .Fa ifp ,
 or queue it on the output queue if the interface is already active.
 .It Fn if_transmit
 Transmit a packet on an interface or queue it if the interface is
 in use.
 This function will return
 .Dv ENOBUFS
 if the device's software and hardware queues are both full.
 This function must be installed after
 .Fn if_attach
 to override the default implementation.
 This function is exposed in order to allow drivers to manage their own queues
 and to reduce the latency caused by a frequently gratuitous enqueue / dequeue
 pair to ifq.
 The suggested internal software queueing mechanism is buf_ring.
 .It Fn if_qflush
 Free mbufs in internally managed queues when the interface is marked down.
 This function must be installed after
 .Fn if_attach
 to override the default implementation.
 This function is exposed in order to allow drivers to manage their own queues
 and to reduce the latency caused by a frequently gratuitous enqueue / dequeue
 pair to ifq.
 The suggested internal software queueing mechanism is buf_ring.
 .It Fn if_start
 Start queued output on an interface.
 This function is exposed in
 order to provide for some interface classes to share a
 .Fn if_output
 among all drivers.
 .Fn if_start
 may only be called when the
 .Dv IFF_DRV_OACTIVE
 flag is not set.
 (Thus,
 .Dv IFF_DRV_OACTIVE
 does not literally mean that output is active, but rather that the
 device's internal output queue is full.)
 Please note that this function will soon be deprecated.
 .It Fn if_ioctl
 Process interface-related
 .Xr ioctl 2
 requests
 (defined in
 .In sys/sockio.h ) .
 Preliminary processing is done by the generic routine
 .Fn ifioctl
 to check for appropriate privileges, locate the interface being
 manipulated, and perform certain generic operations like twiddling
 flags and flushing queues.
 See the description of
 .Fn ifioctl
 below for more information.
 .It Fn if_init
 Initialize and bring up the hardware,
 e.g., reset the chip and enable the receiver unit.
 Should mark the interface running,
 but not active
 .Dv ( IFF_DRV_RUNNING , ~IFF_DRV_OACTIVE ) .
 .It Fn if_resolvemulti
 Check the requested multicast group membership,
 .Fa addr ,
 for validity, and if necessary compute a link-layer group which
 corresponds to that address which is returned in
 .Fa *retsa .
 Returns zero on success, or an error code on failure.
 .El
 .Ss "Interface Flags"
 Interface flags are used for a number of different purposes.
 Some
 flags simply indicate information about the type of interface and its
 capabilities; others are dynamically manipulated to reflect the
 current state of the interface.
 Flags of the former kind are marked
 .Aq S
 in this table; the latter are marked
 .Aq D .
 Flags which begin with
 .Dq IFF_DRV_
 are stored in
 .Va if_drv_flags ;
 all other flags are stored in
 .Va if_flags .
 .Pp
 The macro
 .Dv IFF_CANTCHANGE
 defines the bits which cannot be set by a user program using the
 .Dv SIOCSIFFLAGS
 command to
 .Xr ioctl 2 ;
 these are indicated by an asterisk
 .Pq Ql *
 in the following listing.
 .Pp
 .Bl -tag -width ".Dv IFF_POINTOPOINT" -offset indent -compact
 .It Dv IFF_UP
 .Aq D
 The interface has been configured up by the user-level code.
 .It Dv IFF_BROADCAST
 .Aq S*
 The interface supports broadcast.
 .It Dv IFF_DEBUG
 .Aq D
 Used to enable/disable driver debugging code.
 .It Dv IFF_LOOPBACK
 .Aq S
 The interface is a loopback device.
 .It Dv IFF_POINTOPOINT
 .Aq S*
 The interface is point-to-point;
 .Dq broadcast
 address is actually the address of the other end.
 .It Dv IFF_DRV_RUNNING
 .Aq D*
 The interface has been configured and dynamic resources were
 successfully allocated.
 Probably only useful internal to the
 interface.
 .It Dv IFF_NOARP
 .Aq D
 Disable network address resolution on this interface.
 .It Dv IFF_PROMISC
 .Aq D*
 This interface is in promiscuous mode.
 .It Dv IFF_PPROMISC
 .Aq D
 This interface is in the permanently promiscuous mode (implies
 .Dv IFF_PROMISC ) .
 .It Dv IFF_ALLMULTI
 .Aq D*
 This interface is in all-multicasts mode (used by multicast routers).
 .It Dv IFF_DRV_OACTIVE
 .Aq D*
 The interface's hardware output queue (if any) is full; output packets
 are to be queued.
 .It Dv IFF_SIMPLEX
 .Aq S*
 The interface cannot hear its own transmissions.
 .It Dv IFF_LINK0
 .It Dv IFF_LINK1
 .It Dv IFF_LINK2
 .Aq D
 Control flags for the link layer.
 (Currently abused to select among
 multiple physical layers on some devices.)
 .It Dv IFF_MULTICAST
 .Aq S*
 This interface supports multicast.
 .It Dv IFF_CANTCONFIG
 .Aq S*
 The interface is not configurable in a meaningful way.
 Primarily useful for
 .Dv IFT_USB
 interfaces registered at the interface list.
 .It Dv IFF_MONITOR
 .Aq D
 This interface blocks transmission of packets and discards incoming
 packets after BPF processing.
 Used to monitor network traffic but not interact
 with the network in question.
 .It Dv IFF_STATICARP
 .Aq D
 Used to enable/disable ARP requests on this interface.
 .It Dv IFF_DYING
 .Aq D*
 Set when the
 .Vt ifnet
 structure of this interface is being released and still has
 .Va if_refcount
 references.
 .It Dv IFF_RENAMING
 .Aq D*
 Set when this interface is being renamed.
 .El
 .Ss "Interface Capabilities Flags"
 Interface capabilities are specialized features an interface may
 or may not support.
 These capabilities are very hardware-specific
 and, when enabled,
 allow specific network processing to be offloaded to the interface
 or offer a particular feature for use by other kernel parts.
 .Pp
 It should be stressed that a capability can be completely
 uncontrolled (i.e., stay always enabled with no way to disable it)
 or allow limited control over itself (e.g., depend on another
 capability's state).
 Such peculiarities are determined solely by the hardware and driver
 of a particular interface.
 Only the driver possesses
 the knowledge on whether and how the interface capabilities
 can be controlled.
 Consequently, capabilities flags in
 .Va if_capenable
 should never be modified directly by kernel code other than
 the interface driver.
 The command
 .Dv SIOCSIFCAP
 to
 .Fn ifioctl
 is the dedicated means to attempt altering
 .Va if_capenable
 on an interface.
 Userland code shall use
 .Xr ioctl 2 .
 .Pp
 The following capabilities are currently supported by the system:
 .Bl -tag -width ".Dv IFCAP_POLLING_NOCOUNT" -offset indent
 .It Dv IFCAP_RXCSUM
 This interface can do checksum validation on receiving data.
 Some interfaces do not have sufficient buffer storage to store frames
 above a certain MTU-size completely.
 The driver for the interface might disable hardware checksum validation
 if the MTU is set above the hardcoded limit.
 .It Dv IFCAP_TXCSUM
 This interface can do checksum calculation on transmitting data.
 .It Dv IFCAP_HWCSUM
 A shorthand for
 .Pq Dv IFCAP_RXCSUM | IFCAP_TXCSUM .
 .It Dv IFCAP_NETCONS
 This interface can be a network console.
 .It Dv IFCAP_VLAN_MTU
 The
 .Xr vlan 4
 driver can operate over this interface in software tagging mode
 without having to decrease MTU on
 .Xr vlan 4
 interfaces below 1500 bytes.
 This implies the ability of this interface to cope with frames somewhat
 longer than permitted by the Ethernet specification.
 .It Dv IFCAP_VLAN_HWTAGGING
 This interface can do VLAN tagging on output and
 demultiplex frames by their VLAN tag on input.
 .It Dv IFCAP_JUMBO_MTU
 This Ethernet interface can transmit and receive frames up to
 9000 bytes long.
 .It Dv IFCAP_POLLING
 This interface supports
 .Xr polling 4 .
 See below for details.
 .It Dv IFCAP_VLAN_HWCSUM
 This interface can do checksum calculation on both transmitting
 and receiving data on
 .Xr vlan 4
 interfaces (implies
 .Dv IFCAP_HWCSUM ) .
 .It Dv IFCAP_TSO4
 This Ethernet interface supports TCP4 Segmentation offloading.
 .It Dv IFCAP_TSO6
 This Ethernet interface supports TCP6 Segmentation offloading.
 .It Dv IFCAP_TSO
 A shorthand for
 .Pq Dv IFCAP_TSO4 | IFCAP_TSO6 .
 .It Dv IFCAP_TOE4
 This Ethernet interface supports TCP offloading.
 .It Dv IFCAP_TOE6
 This Ethernet interface supports TCP6 offloading.
 .It Dv IFCAP_TOE
 A shorthand for
 .Pq Dv IFCAP_TOE4 | IFCAP_TOE6 .
 .It Dv IFCAP_WOL_UCAST
 This Ethernet interface supports waking up on any Unicast packet.
 .It Dv IFCAP_WOL_MCAST
 This Ethernet interface supports waking up on any Multicast packet.
 .It Dv IFCAP_WOL_MAGIC
 This Ethernet interface supports waking up on any Magic packet such
 as those sent by
 .Xr wake 8 .
 .It Dv IFCAP_WOL
 A shorthand for
 .Pq Dv IFCAP_WOL_UCAST | IFCAP_WOL_MCAST | IFCAP_WOL_MAGIC .
 .It Dv IFCAP_TOE4
 This Ethernet interface supports TCP4 Offload Engine.
 .It Dv IFCAP_TOE6
 This Ethernet interface supports TCP6 Offload Engine.
 .It Dv IFCAP_TOE
 A shorthand for
 .Pq Dv IFCAP_TOE4 | IFCAP_TOE6 .
 .It Dv IFCAP_VLAN_HWFILTER
 This interface supports frame filtering in hardware on
 .Xr vlan 4
 interfaces.
 .It Dv IFCAP_POLLING_NOCOUNT
 The return value for the number of processed packets should be
 skipped for this interface.
 .It Dv IFCAP_VLAN_HWTSO
 This interface supports TCP Segmentation offloading on
 .Xr vlan 4
 interfaces (implies
 .Dv IFCAP_TSO ) .
 .It Dv IFCAP_LINKSTATE
 This Ethernet interface supports dynamic link state changes.
 .El
 .Pp
 The ability of advanced network interfaces to offload certain
 computational tasks from the host CPU to the board is limited
 mostly to TCP/IP.
 Therefore a separate field associated with an interface
 (see
 .Va ifnet.if_data.ifi_hwassist
 below)
 keeps a detailed description of its enabled capabilities
 specific to TCP/IP processing.
 The TCP/IP module consults the field to see which tasks
 can be done on an
 .Em outgoing
 packet by the interface.
 The flags defined for that field are a superset of those for
 .Va mbuf.m_pkthdr.csum_flags ,
 namely:
 .Bl -tag -width ".Dv CSUM_FRAGMENT" -offset indent
 .It Dv CSUM_IP
 The interface will compute IP checksums.
 .It Dv CSUM_TCP
 The interface will compute TCP checksums.
 .It Dv CSUM_UDP
 The interface will compute UDP checksums.
 .It Dv CSUM_IP_FRAGS
 The interface can compute a TCP or UDP checksum for a packet
 fragmented by the host CPU.
 Makes sense only along with
 .Dv CSUM_TCP
 or
 .Dv CSUM_UDP .
 .It Dv CSUM_FRAGMENT
 The interface will do the fragmentation of IP packets if necessary.
 The host CPU does not need to care about MTU on this interface
 as long as a packet to transmit through it is an IP one and it
 does not exceed the size of the hardware buffer.
 .El
 .Pp
 An interface notifies the TCP/IP module about the tasks
 the former has performed on an
 .Em incoming
 packet by setting the corresponding flags in the field
 .Va mbuf.m_pkthdr.csum_flags
 of the
 .Vt mbuf chain
 containing the packet.
 See
 .Xr mbuf 9
 for details.
 .Pp
 The capability of a network interface to operate in
 .Xr polling 4
 mode involves several flags in different
 global variables and per-interface fields.
 The capability flag
 .Dv IFCAP_POLLING
 set in the interface's
 .Va if_capabilities
 indicates support for
 .Xr polling 4
 on the particular interface.
 If set in
 .Va if_capabilities ,
 the same flag can be marked or cleared in the interface's
 .Va if_capenable
 within
 .Fn ifioctl ,
 thus initiating a switch of the interface to
 .Xr polling 4
 mode or interrupt
 mode, respectively.
 The actual mode change is managed by the driver-specific
 .Fn if_ioctl
 routine.
 The
 .Xr polling 4
 handler returns the number of packets processed.
 .Ss The Vt if_data Ss Structure
 The
 .Vt if_data
 structure contains statistics and identifying information used
 by management programs, and which is exported to user programs by way
 of the
 .Xr ifmib 4
 branch of the
 .Xr sysctl 3
 MIB.
 The following elements of the
 .Vt if_data
 structure are initialized by the interface and are not expected to change
 significantly over the course of normal operation:
 .Bl -tag -width ".Va ifi_lastchange" -offset indent
 .It Va ifi_type
 .Pq Vt u_char
 The type of the interface, as defined in
 .In net/if_types.h
 and described below in the
 .Sx "Interface Types"
 section.
 .It Va ifi_physical
 .Pq Vt u_char
 Intended to represent a selection of physical layers on devices which
 support more than one; never implemented.
 .It Va ifi_addrlen
 .Pq Vt u_char
 Length of a link-layer address on this device, or zero if there are
 none.
 Used to initialize the address length field in
 .Vt sockaddr_dl
 structures referring to this interface.
 .It Va ifi_hdrlen
 .Pq Vt u_char
 Maximum length of any link-layer header which might be prepended by
 the driver to a packet before transmission.
 The generic code computes
 the maximum over all interfaces and uses that value to influence the
 placement of data in
 .Vt mbuf Ns s
 to attempt to ensure that there is always
 sufficient space to prepend a link-layer header without allocating an
 additional
 .Vt mbuf .
 .It Va ifi_datalen
 .Pq Vt u_char
 Length of the
 .Vt if_data
 structure.
 Allows some stabilization of the routing socket ABI in the face of
 increases in the length of
 .Vt struct if_data .
 .It Va ifi_mtu
 .Pq Vt u_long
 The maximum transmission unit of the medium, exclusive of any
 link-layer overhead.
 .It Va ifi_metric
 .Pq Vt u_long
 A dimensionless metric interpreted by a user-mode routing process.
 .It Va ifi_baudrate
 .Pq Vt u_long
 The line rate of the interface, in bits per second.
 .It Va ifi_hwassist
 .Pq Vt u_long
 A detailed interpretation of the capabilities
 to offload computational tasks for
 .Em outgoing
 packets.
 The interface driver must keep this field in accord with
 the current value of
 .Va if_capenable .
 .It Va ifi_epoch
 .Pq Vt time_t
 The system uptime when the interface was attached or the statistics
 below were reset.
 This is intended to be used to set the SNMP variable
 .Va ifCounterDiscontinuityTime .
 It may also be used to determine if two successive queries for an
 interface of the same index have returned results for the same
 interface.
 .El
 .Pp
 The structure additionally contains generic statistics applicable to a
 variety of different interface types (except as noted, all members are
 of type
 .Vt u_long ) :
 .Bl -tag -width ".Va ifi_lastchange" -offset indent
 .It Va ifi_link_state
 .Pq Vt u_char
 The current link state of Ethernet interfaces.
 See the
 .Sx Interface Link States
 section for possible values.
 .It Va ifi_ipackets
 Number of packets received.
 .It Va ifi_ierrors
 Number of receive errors detected (e.g., FCS errors, DMA overruns,
 etc.).
 More detailed breakdowns can often be had by way of a
 link-specific MIB.
 .It Va ifi_opackets
 Number of packets transmitted.
 .It Va ifi_oerrors
 Number of output errors detected (e.g., late collisions, DMA overruns,
 etc.).
 More detailed breakdowns can often be had by way of a
 link-specific MIB.
 .It Va ifi_collisions
 Total number of collisions detected on output for CSMA interfaces.
 (This member is sometimes [ab]used by other types of interfaces for
 other output error counts.)
 .It Va ifi_ibytes
 Total traffic received, in bytes.
 .It Va ifi_obytes
 Total traffic transmitted, in bytes.
 .It Va ifi_imcasts
 Number of packets received which were sent by link-layer multicast.
 .It Va ifi_omcasts
 Number of packets sent by link-layer multicast.
 .It Va ifi_iqdrops
 Number of packets dropped on input.
 Rarely implemented.
 .It Va ifi_noproto
 Number of packets received for unknown network-layer protocol.
 .It Va ifi_lastchange
 .Pq Vt "struct timeval"
 The time of the last administrative change to the interface (as required
 for
 .Tn SNMP ) .
 .El
 .Ss Interface Types
 The header file
 .In net/if_types.h
 defines symbolic constants for a number of different types of
 interfaces.
 The most common are:
 .Pp
 .Bl -tag -offset indent -width ".Dv IFT_PROPVIRTUAL" -compact
 .It Dv IFT_OTHER
 none of the following
 .It Dv IFT_ETHER
 Ethernet
 .It Dv IFT_ISO88023
 ISO 8802-3 CSMA/CD
 .It Dv IFT_ISO88024
 ISO 8802-4 Token Bus
 .It Dv IFT_ISO88025
 ISO 8802-5 Token Ring
 .It Dv IFT_ISO88026
 ISO 8802-6 DQDB MAN
 .It Dv IFT_FDDI
 FDDI
 .It Dv IFT_PPP
 Internet Point-to-Point Protocol
 .Pq Xr ppp 8
 .It Dv IFT_LOOP
 The loopback
 .Pq Xr lo 4
 interface
 .It Dv IFT_SLIP
 Serial Line IP
 .It Dv IFT_PARA
 Parallel-port IP
 .Pq Dq Tn PLIP
 .It Dv IFT_ATM
 Asynchronous Transfer Mode
 .It Dv IFT_USB
 USB Interface
 .El
 .Ss Interface Link States
 The following link states are currently defined:
 .Pp
 .Bl -tag -offset indent -width ".Dv LINK_STATE_UNKNOWN" -compact
 .It Dv LINK_STATE_UNKNOWN
 The link is in an invalid or unknown state.
 .It Dv LINK_STATE_DOWN
 The link is down.
 .It Dv LINK_STATE_UP
 The link is up.
 .El
 .Ss The Vt ifaddr Ss Structure
 Every interface is associated with a list
 (or, rather, a
 .Li TAILQ )
 of addresses, rooted at the interface structure's
 .Va if_addrhead
 member.
 The first element in this list is always an
 .Dv AF_LINK
 address representing the interface itself; multi-access network
 drivers should complete this structure by filling in their link-layer
 addresses after calling
 .Fn if_attach .
 Other members of the structure represent network-layer addresses which
 have been configured by means of the
 .Dv SIOCAIFADDR
 command to
 .Xr ioctl 2 ,
 called on a socket of the appropriate protocol family.
 The elements of this list consist of
 .Vt ifaddr
 structures.
 Most protocols will declare their own protocol-specific
 interface address structures, but all begin with a
 .Vt "struct ifaddr"
 which provides the most-commonly-needed functionality across all
 protocols.
 Interface addresses are reference-counted.
 .Pp
 The members of
 .Vt "struct ifaddr"
 are as follows:
 .Bl -tag -width ".Va ifa_rtrequest" -offset indent
 .It Va ifa_addr
 .Pq Vt "struct sockaddr *"
 The local address of the interface.
 .It Va ifa_dstaddr
 .Pq Vt "struct sockaddr *"
 The remote address of point-to-point interfaces, and the broadcast
 address of broadcast interfaces.
 .Va ( ifa_broadaddr
 is a macro for
 .Va ifa_dstaddr . )
 .It Va ifa_netmask
 .Pq Vt "struct sockaddr *"
 The network mask for multi-access interfaces, and the confusion
 generator for point-to-point interfaces.
 .It Va ifa_ifp
 .Pq Vt "struct ifnet *"
 A link back to the interface structure.
 .It Va ifa_link
 .Pq Fn TAILQ_ENTRY ifaddr
 .Xr queue 3
 glue for list of addresses on each interface.
 .It Va ifa_rtrequest
 See below.
 .It Va ifa_flags
 .Pq Vt u_short
 Some of the flags which would be used for a route representing this
 address in the route table.
 .It Va ifa_refcnt
 .Pq Vt short
 The reference count.
 .El
 .Pp
 References to
 .Vt ifaddr
 structures are gained by calling the
 .Fn ifa_ref
 function and released by calling the
 .Fn ifa_free
 function.
 .Pp
 .Fn ifa_rtrequest
 is a pointer to a function which receives callouts from the routing
 code
 .Pq Fn rtrequest
 to perform link-layer-specific actions upon requests to add,
 or delete routes.
 The
 .Fa cmd
 argument indicates the request in question:
 .Dv RTM_ADD ,
 or
 .Dv RTM_DELETE .
 The
 .Fa rt
 argument is the route in question; the
 .Fa info
 argument contains the specific destination being manipulated.
 .Sh FUNCTIONS
 The functions provided by the generic interface code can be divided
 into two groups: those which manipulate interfaces, and those which
 manipulate interface addresses.
 In addition to these functions, there
 may also be link-layer support routines which are used by a number of
 drivers implementing a specific link layer over different hardware;
 see the documentation for that link layer for more details.
 .Ss The Vt ifmultiaddr Ss Structure
 Every multicast-capable interface is associated with a list of
 multicast group memberships, which indicate at a low level which
 link-layer multicast addresses (if any) should be accepted, and at a
 high level, in which network-layer multicast groups a user process has
 expressed interest.
 .Pp
 The elements of the structure are as follows:
 .Bl -tag -width ".Va ifma_refcount" -offset indent
 .It Va ifma_link
 .Pq Fn LIST_ENTRY ifmultiaddr
 .Xr queue 3
 macro glue.
 .It Va ifma_addr
 .Pq Vt "struct sockaddr *"
 A pointer to the address which this record represents.
 The
 memberships for various address families are stored in arbitrary
 order.
 .It Va ifma_lladdr
 .Pq Vt "struct sockaddr *"
 A pointer to the link-layer multicast address, if any, to which the
 network-layer multicast address in
 .Va ifma_addr
 is mapped, else a null pointer.
 If this element is non-nil, this
 membership also holds an invisible reference to another membership for
 that link-layer address.
 .It Va ifma_refcount
 .Pq Vt u_int
 A reference count of requests for this particular membership.
 .El
 .Ss Interface Manipulation Functions
 .Bl -ohang -offset indent
 .It Fn if_alloc
 Allocate and initialize
 .Vt "struct ifnet" .
 Initialization includes the allocation of an interface index and may
 include the allocation of a
 .Fa type
 specific structure in
 .Va if_l2com .
 .It Fn if_attach
 Link the specified interface
 .Fa ifp
 into the list of network interfaces.
 Also initialize the list of
 addresses on that interface, and create a link-layer
 .Vt ifaddr
 structure to be the first element in that list.
 (A pointer to
 this address structure is saved in the
 .Vt ifnet
 structure and shall be accessed by the
 .Fn ifaddr_byindex
 function.)
 The
 .Fa ifp
 must have been allocated by
 .Fn if_alloc .
 .It Fn if_detach
 Shut down and unlink the specified
 .Fa ifp
 from the interface list.
 .It Fn if_free
 Free the given
 .Fa ifp
 back to the system.
 The interface must have been previously detached if it was ever attached.
 .It Fn if_free_type
 Identical to
 .Fn if_free
 except that the given
 .Fa type
 is used to free
 .Va if_l2com
 instead of the type in
 .Va if_type .
 This is intended for use with drivers that change their interface type.
 .It Fn if_down
 Mark the interface
 .Fa ifp
 as down (i.e.,
 .Dv IFF_UP
 is not set),
 flush its output queue, notify protocols of the transition,
 and generate a message from the
 .Xr route 4
 routing socket.
 .It Fn if_up
 Mark the interface
 .Fa ifp
 as up, notify protocols of the transition,
 and generate a message from the
 .Xr route 4
 routing socket.
 .It Fn ifpromisc
 Add or remove a promiscuous reference to
 .Fa ifp .
 If
 .Fa pswitch
 is true, add a reference;
 if it is false, remove a reference.
 On reference count transitions
 from zero to one and one to zero, set the
 .Dv IFF_PROMISC
 flag appropriately and call
 .Fn if_ioctl
 to set up the interface in the desired mode.
 .It Fn if_allmulti
 As
 .Fn ifpromisc ,
 but for the all-multicasts
 .Pq Dv IFF_ALLMULTI
 flag instead of the promiscuous flag.
 .It Fn ifunit
 Return an
 .Vt ifnet
 pointer for the interface named
 .Fa name .
 .It Fn ifunit_ref
 Return a reference-counted (via
 .Fn ifa_ref )
 .Vt ifnet
 pointer for the interface named
 .Fa name .
 This is the preferred function over
 .Fn ifunit .
 The caller is responsible for releasing the reference with
 .Fn if_rele
 when it is finished with the ifnet.
 .It Fn ifioctl
 Process the ioctl request
 .Fa cmd ,
 issued on socket
 .Fa so
 by thread
 .Fa td ,
 with data parameter
 .Fa data .
 This is the main routine for handling all interface configuration
 requests from user mode.
 It is ordinarily only called from the socket-layer
 .Xr ioctl 2
 handler, and only for commands with class
 .Sq Li i .
 Any unrecognized commands will be passed down to socket
 .Fa so Ns 's
 protocol for
 further interpretation.
 The following commands are handled by
 .Fn ifioctl :
 .Pp
 .Bl -tag -width ".Dv SIOCGIFNETMASK" -offset indent -compact
 .It Dv SIOCGIFCONF
 Get interface configuration.
 (No call-down to driver.)
 .Pp
 .It Dv SIOCSIFNAME
 Set the interface name.
 .Dv RTM_IFANNOUNCE
 departure and arrival messages are sent so that
 routing code that relies on the interface name will update its interface
 list.
 Caller must have appropriate privilege.
 (No call-down to driver.)
 .It Dv SIOCGIFCAP
 .It Dv SIOCGIFFIB
 .It Dv SIOCGIFFLAGS
 .It Dv SIOCGIFMETRIC
 .It Dv SIOCGIFMTU
 .It Dv SIOCGIFPHYS
 Get interface capabilities, FIB, flags, metric, MTU, medium selection.
 (No call-down to driver.)
 .Pp
 .It Dv SIOCSIFCAP
 Enable or disable interface capabilities.
 Caller must have appropriate privilege.
 Before a call to the driver-specific
 .Fn if_ioctl
 routine, the requested mask for enabled capabilities is checked
 against the mask of capabilities supported by the interface,
 .Va if_capabilities .
 Requesting to enable an unsupported capability is invalid.
 The rest is supposed to be done by the driver,
 which includes updating
 .Va if_capenable
 and
 .Va if_data.ifi_hwassist
 appropriately.
 .Pp
 .It Dv SIOCSIFFIB
 Sets interface FIB.
 Caller must have appropriate privilege.
 FIB values start at 0 and values greater or equals than
 .Va net.fibs
 are considered invalid.
 .It Dv SIOCSIFFLAGS
 Change interface flags.
 Caller must have appropriate privilege.
 If a change to the
 .Dv IFF_UP
 flag is requested,
 .Fn if_up
 or
 .Fn if_down
 is called as appropriate.
 Flags listed in
 .Dv IFF_CANTCHANGE
 are masked off, and the field
 .Va if_flags
 in the interface structure is updated.
 Finally, the driver
 .Fn if_ioctl
 routine is called to perform any setup
 requested.
 .Pp
 .It Dv SIOCSIFMETRIC
 .It Dv SIOCSIFPHYS
 Change interface metric or medium.
 Caller must have appropriate privilege.
 .Pp
 .It Dv SIOCSIFMTU
 Change interface MTU.
 Caller must have appropriate privilege.
 MTU
 values less than 72 or greater than 65535 are considered invalid.
 The driver
 .Fn if_ioctl
 routine is called to implement the change; it is responsible for any
 additional sanity checking and for actually modifying the MTU in the
 interface structure.
 .Pp
 .It Dv SIOCADDMULTI
 .It Dv SIOCDELMULTI
 Add or delete permanent multicast group memberships on the interface.
 Caller must have appropriate privilege.
 The
 .Fn if_addmulti
 or
 .Fn if_delmulti
 function is called to perform the operation; qq.v.
 .Pp
 .It Dv SIOCAIFADDR
 .It Dv SIOCDIFADDR
 The socket's protocol control routine is called to implement the
 requested action.
 .El
 .El
 .Pp
 .Fn if_down ,
 .Fn ifioctl ,
 .Fn ifpromisc ,
 and
 .Fn if_up
 must be called at
 .Fn splnet
 or higher.
 .Ss "Interface Address Functions"
 Several functions exist to look up an interface address structure
 given an address.
 .Fn ifa_ifwithaddr
 returns an interface address with either a local address or a
 broadcast address precisely matching the parameter
 .Fa addr .
 .Fn ifa_ifwithdstaddr
 returns an interface address for a point-to-point interface whose
 remote
 .Pq Dq destination
 address is
-.Fa addr .
+.Fa addr
+and whose FIB is
+.Fa fib .
+If
+.Fa fib
+is
+.Dv RT_ALL_FIBS ,
+then the first interface address matching
+.Fa addr
+will be returned.
 .Pp
 .Fn ifa_ifwithnet
 returns the most specific interface address which matches the
 specified address,
 .Fa addr ,
 subject to its configured netmask, or a point-to-point interface
 address whose remote address is
 .Fa addr
 if one is found.
 If
 .Fa ignore_ptp
-is true, skip point-to-point interface addresses.
+is true, skip point-to-point interface addresses.  The
+.Fa fib
+parameter is handled the same way as by
+.Fn ifa_ifwithdstaddr .
 .Pp
 .Fn ifaof_ifpforaddr
 returns the most specific address configured on interface
 .Fa ifp
 which matches address
 .Fa addr ,
 subject to its configured netmask.
 If the interface is
 point-to-point, only an interface address whose remote address is
 precisely
 .Fa addr
 will be returned.
 .Pp
 .Fn ifaddr_byindex
 returns the link-level address of the interface with the given index
 .Fa idx .
 .Pp
 All of these functions return a null pointer if no such address can be
 found.
 .Ss "Interface Multicast Address Functions"
 The
 .Fn if_addmulti ,
 .Fn if_delmulti ,
 and
 .Fn if_findmulti
 functions provide support for requesting and relinquishing multicast
 group memberships, and for querying an interface's membership list,
 respectively.
 The
 .Fn if_addmulti
 function takes a pointer to an interface,
 .Fa ifp ,
 and a generic address,
 .Fa sa .
 It also takes a pointer to a
 .Vt "struct ifmultiaddr *"
 which is filled in on successful return with the address of the
 group membership control block.
 The
 .Fn if_addmulti
 function performs the following four-step process:
 .Bl -enum -offset indent
 .It
 Call the interface's
 .Fn if_resolvemulti
 entry point to determine the link-layer address, if any, corresponding
 to this membership request, and also to give the link layer an
 opportunity to veto this membership request should it so desire.
 .It
 Check the interface's group membership list for a pre-existing
 membership for this group.
 If one is not found, allocate a new one;
 if one is, increment its reference count.
 .It
 If the
 .Fn if_resolvemulti
 routine returned a link-layer address corresponding to the group,
 repeat the previous step for that address as well.
 .It
 If the interface's multicast address filter needs to be changed
 because a new membership was added, call the interface's
 .Fn if_ioctl
 routine
 (with a
 .Fa cmd
 argument of
 .Dv SIOCADDMULTI )
 to request that it do so.
 .El
 .Pp
 The
 .Fn if_delmulti
 function, given an interface
 .Fa ifp
 and an address,
 .Fa sa ,
 reverses this process.
 Both functions return zero on success, or a
 standard error number on failure.
 .Pp
 The
 .Fn if_findmulti
 function examines the membership list of interface
 .Fa ifp
 for an address matching
 .Fa sa ,
 and returns a pointer to that
 .Vt "struct ifmultiaddr"
 if one is found, else it returns a null pointer.
 .Sh SEE ALSO
 .Xr ioctl 2 ,
 .Xr link_addr 3 ,
 .Xr queue 3 ,
 .Xr sysctl 3 ,
 .Xr bpf 4 ,
 .Xr ifmib 4 ,
 .Xr lo 4 ,
 .Xr netintro 4 ,
 .Xr polling 4 ,
 .Xr config 8 ,
 .Xr ppp 8 ,
 .Xr mbuf 9 ,
 .Xr rtentry 9
 .Rs
 .%A Gary R. Wright
 .%A W. Richard Stevens
 .%B TCP/IP Illustrated
 .%V Vol. 2
 .%O Addison-Wesley, ISBN 0-201-63354-X
 .Re
 .Sh AUTHORS
 This manual page was written by
 .An Garrett A. Wollman .
Index: user/ae/inet6/share/vt/keymaps/Makefile
===================================================================
--- user/ae/inet6/share/vt/keymaps/Makefile	(revision 271452)
+++ user/ae/inet6/share/vt/keymaps/Makefile	(revision 271453)
@@ -1,91 +1,93 @@
 # $FreeBSD$
 
 FILES=	INDEX.keymaps \
 	am.kbd \
 	be.acc.kbd \
 	be.kbd \
 	bg.bds.kbd \
 	bg.phonetic.kbd \
 	br.kbd \
 	br.noacc.kbd \
 	by.kbd \
 	ca.kbd \
 	ca-fr.kbd \
 	centraleuropean.kbd \
 	centraleuropean.qwerty.kbd \
 	ch-fr.acc.kbd \
 	ch-fr.kbd \
 	ch.acc.kbd \
 	ch.kbd \
 	ch.macbook.acc.kbd \
 	colemak.acc.kbd \
 	cz.kbd \
 	de.acc.kbd \
 	de.noacc.kbd \
 	de.kbd \
 	dk.acc.kbd \
 	dk.kbd \
 	dk.macbook.kbd \
 	ee.kbd \
 	es.acc.kbd \
 	es.dvorak.kbd \
 	es.kbd \
 	fi.kbd \
+	fr.acc.kbd \
 	fr.dvorak.acc.kbd \
 	fr.dvorak.kbd \
+	fr.kbd \
 	fr.macbook.kbd \
 	gr.101.acc.kbd \
 	gr.elot.acc.kbd \
 	gr.kbd \
 	hr.kbd \
 	hu.101.kbd \
 	hu.102.kbd \
 	il.kbd \
 	is.acc.kbd \
 	is.kbd \
 	it.kbd \
 	jp.capsctrl.kbd \
 	jp.kbd \
 	jp.pc98.iso.kbd \
 	jp.pc98.kbd \
 	kz.io.kbd \
 	kz.kst.kbd \
 	latinamerican.acc.kbd \
 	latinamerican.kbd \
 	lt.kbd \
 	nl.kbd \
 	no.dvorak.kbd \
 	no.kbd \
 	nordic.asus-eee.kbd \
 	pl.dvorak.kbd \
 	pl.kbd \
 	pt.acc.kbd \
 	pt.kbd \
 	ru.kbd \
 	ru.shift.kbd \
 	ru.win.kbd \
 	se.kbd \
 	si.kbd \
 	sk.kbd \
 	tr.kbd \
 	ua.kbd \
 	ua.shift.alt.kbd \
 	uk.capsctrl.kbd \
 	uk.dvorak.kbd \
 	uk.kbd \
 	us.acc.kbd \
 	us.ctrl.kbd \
 	us.dvorak.kbd \
 	us.dvorakl.kbd \
 	us.dvorakp.kbd \
 	us.dvorakr.kbd \
 	us.dvorakx.kbd \
 	us.emacs.kbd \
 	us.kbd \
 	us.unix.kbd \
 
 FILESDIR= ${SHAREDIR}/vt/keymaps
 
 NO_OBJ=
 
 .include <bsd.prog.mk>
Index: user/ae/inet6/share
===================================================================
--- user/ae/inet6/share	(revision 271452)
+++ user/ae/inet6/share	(revision 271453)

Property changes on: user/ae/inet6/share
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head/share:r271428-271452
Index: user/ae/inet6/sys/amd64/include/vmm.h
===================================================================
--- user/ae/inet6/sys/amd64/include/vmm.h	(revision 271452)
+++ user/ae/inet6/sys/amd64/include/vmm.h	(revision 271453)
@@ -1,616 +1,617 @@
 /*-
  * Copyright (c) 2011 NetApp, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _VMM_H_
 #define	_VMM_H_
 
 #include <x86/segments.h>
 
 enum vm_suspend_how {
 	VM_SUSPEND_NONE,
 	VM_SUSPEND_RESET,
 	VM_SUSPEND_POWEROFF,
 	VM_SUSPEND_HALT,
 	VM_SUSPEND_TRIPLEFAULT,
 	VM_SUSPEND_LAST
 };
 
 /*
  * Identifiers for architecturally defined registers.
  */
 enum vm_reg_name {
 	VM_REG_GUEST_RAX,
 	VM_REG_GUEST_RBX,
 	VM_REG_GUEST_RCX,
 	VM_REG_GUEST_RDX,
 	VM_REG_GUEST_RSI,
 	VM_REG_GUEST_RDI,
 	VM_REG_GUEST_RBP,
 	VM_REG_GUEST_R8,
 	VM_REG_GUEST_R9,
 	VM_REG_GUEST_R10,
 	VM_REG_GUEST_R11,
 	VM_REG_GUEST_R12,
 	VM_REG_GUEST_R13,
 	VM_REG_GUEST_R14,
 	VM_REG_GUEST_R15,
 	VM_REG_GUEST_CR0,
 	VM_REG_GUEST_CR3,
 	VM_REG_GUEST_CR4,
 	VM_REG_GUEST_DR7,
 	VM_REG_GUEST_RSP,
 	VM_REG_GUEST_RIP,
 	VM_REG_GUEST_RFLAGS,
 	VM_REG_GUEST_ES,
 	VM_REG_GUEST_CS,
 	VM_REG_GUEST_SS,
 	VM_REG_GUEST_DS,
 	VM_REG_GUEST_FS,
 	VM_REG_GUEST_GS,
 	VM_REG_GUEST_LDTR,
 	VM_REG_GUEST_TR,
 	VM_REG_GUEST_IDTR,
 	VM_REG_GUEST_GDTR,
 	VM_REG_GUEST_EFER,
 	VM_REG_GUEST_CR2,
 	VM_REG_GUEST_PDPTE0,
 	VM_REG_GUEST_PDPTE1,
 	VM_REG_GUEST_PDPTE2,
 	VM_REG_GUEST_PDPTE3,
+	VM_REG_GUEST_INTR_SHADOW,
 	VM_REG_LAST
 };
 
 enum x2apic_state {
 	X2APIC_DISABLED,
 	X2APIC_ENABLED,
 	X2APIC_STATE_LAST
 };
 
 #define	VM_INTINFO_VECTOR(info)	((info) & 0xff)
 #define	VM_INTINFO_DEL_ERRCODE	0x800
 #define	VM_INTINFO_RSVD		0x7ffff000
 #define	VM_INTINFO_VALID	0x80000000
 #define	VM_INTINFO_TYPE		0x700
 #define	VM_INTINFO_HWINTR	(0 << 8)
 #define	VM_INTINFO_NMI		(2 << 8)
 #define	VM_INTINFO_HWEXCEPTION	(3 << 8)
 #define	VM_INTINFO_SWINTR	(4 << 8)
 
 #ifdef _KERNEL
 
 #define	VM_MAX_NAMELEN	32
 
 struct vm;
 struct vm_exception;
 struct vm_memory_segment;
 struct seg_desc;
 struct vm_exit;
 struct vm_run;
 struct vhpet;
 struct vioapic;
 struct vlapic;
 struct vmspace;
 struct vm_object;
 struct vm_guest_paging;
 struct pmap;
 
 typedef int	(*vmm_init_func_t)(int ipinum);
 typedef int	(*vmm_cleanup_func_t)(void);
 typedef void	(*vmm_resume_func_t)(void);
 typedef void *	(*vmi_init_func_t)(struct vm *vm, struct pmap *pmap);
 typedef int	(*vmi_run_func_t)(void *vmi, int vcpu, register_t rip,
 				  struct pmap *pmap, void *rendezvous_cookie,
 				  void *suspend_cookie);
 typedef void	(*vmi_cleanup_func_t)(void *vmi);
 typedef int	(*vmi_get_register_t)(void *vmi, int vcpu, int num,
 				      uint64_t *retval);
 typedef int	(*vmi_set_register_t)(void *vmi, int vcpu, int num,
 				      uint64_t val);
 typedef int	(*vmi_get_desc_t)(void *vmi, int vcpu, int num,
 				  struct seg_desc *desc);
 typedef int	(*vmi_set_desc_t)(void *vmi, int vcpu, int num,
 				  struct seg_desc *desc);
 typedef int	(*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval);
 typedef int	(*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val);
 typedef struct vmspace * (*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max);
 typedef void	(*vmi_vmspace_free)(struct vmspace *vmspace);
 typedef struct vlapic * (*vmi_vlapic_init)(void *vmi, int vcpu);
 typedef void	(*vmi_vlapic_cleanup)(void *vmi, struct vlapic *vlapic);
 
 struct vmm_ops {
 	vmm_init_func_t		init;		/* module wide initialization */
 	vmm_cleanup_func_t	cleanup;
 	vmm_resume_func_t	resume;
 
 	vmi_init_func_t		vminit;		/* vm-specific initialization */
 	vmi_run_func_t		vmrun;
 	vmi_cleanup_func_t	vmcleanup;
 	vmi_get_register_t	vmgetreg;
 	vmi_set_register_t	vmsetreg;
 	vmi_get_desc_t		vmgetdesc;
 	vmi_set_desc_t		vmsetdesc;
 	vmi_get_cap_t		vmgetcap;
 	vmi_set_cap_t		vmsetcap;
 	vmi_vmspace_alloc	vmspace_alloc;
 	vmi_vmspace_free	vmspace_free;
 	vmi_vlapic_init		vlapic_init;
 	vmi_vlapic_cleanup	vlapic_cleanup;
 };
 
 extern struct vmm_ops vmm_ops_intel;
 extern struct vmm_ops vmm_ops_amd;
 
 int vm_create(const char *name, struct vm **retvm);
 void vm_destroy(struct vm *vm);
 int vm_reinit(struct vm *vm);
 const char *vm_name(struct vm *vm);
 int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len);
 int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
 int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len);
 void *vm_gpa_hold(struct vm *, vm_paddr_t gpa, size_t len, int prot,
 		  void **cookie);
 void vm_gpa_release(void *cookie);
 int vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
 	      struct vm_memory_segment *seg);
 int vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
 		  vm_offset_t *offset, struct vm_object **object);
 boolean_t vm_mem_allocated(struct vm *vm, vm_paddr_t gpa);
 int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval);
 int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val);
 int vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
 		    struct seg_desc *ret_desc);
 int vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
 		    struct seg_desc *desc);
 int vm_run(struct vm *vm, struct vm_run *vmrun);
 int vm_suspend(struct vm *vm, enum vm_suspend_how how);
 int vm_inject_nmi(struct vm *vm, int vcpu);
 int vm_nmi_pending(struct vm *vm, int vcpuid);
 void vm_nmi_clear(struct vm *vm, int vcpuid);
 int vm_inject_extint(struct vm *vm, int vcpu);
 int vm_extint_pending(struct vm *vm, int vcpuid);
 void vm_extint_clear(struct vm *vm, int vcpuid);
 uint64_t *vm_guest_msrs(struct vm *vm, int cpu);
 struct vlapic *vm_lapic(struct vm *vm, int cpu);
 struct vioapic *vm_ioapic(struct vm *vm);
 struct vhpet *vm_hpet(struct vm *vm);
 int vm_get_capability(struct vm *vm, int vcpu, int type, int *val);
 int vm_set_capability(struct vm *vm, int vcpu, int type, int val);
 int vm_get_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state *state);
 int vm_set_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state state);
 int vm_apicid2vcpuid(struct vm *vm, int apicid);
 int vm_activate_cpu(struct vm *vm, int vcpu);
 cpuset_t vm_active_cpus(struct vm *vm);
 cpuset_t vm_suspended_cpus(struct vm *vm);
 struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid);
 void vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip);
 void vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip);
 void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip);
 
 /*
  * Rendezvous all vcpus specified in 'dest' and execute 'func(arg)'.
  * The rendezvous 'func(arg)' is not allowed to do anything that will
  * cause the thread to be put to sleep.
  *
  * If the rendezvous is being initiated from a vcpu context then the
  * 'vcpuid' must refer to that vcpu, otherwise it should be set to -1.
  *
  * The caller cannot hold any locks when initiating the rendezvous.
  *
  * The implementation of this API may cause vcpus other than those specified
  * by 'dest' to be stalled. The caller should not rely on any vcpus making
  * forward progress when the rendezvous is in progress.
  */
 typedef void (*vm_rendezvous_func_t)(struct vm *vm, int vcpuid, void *arg);
 void vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
     vm_rendezvous_func_t func, void *arg);
 
 static __inline int
 vcpu_rendezvous_pending(void *rendezvous_cookie)
 {
 
 	return (*(uintptr_t *)rendezvous_cookie != 0);
 }
 
 static __inline int
 vcpu_suspended(void *suspend_cookie)
 {
 
 	return (*(int *)suspend_cookie);
 }
 
 /*
  * Return 1 if device indicated by bus/slot/func is supposed to be a
  * pci passthrough device.
  *
  * Return 0 otherwise.
  */
 int vmm_is_pptdev(int bus, int slot, int func);
 
 void *vm_iommu_domain(struct vm *vm);
 
 enum vcpu_state {
 	VCPU_IDLE,
 	VCPU_FROZEN,
 	VCPU_RUNNING,
 	VCPU_SLEEPING,
 };
 
 int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state,
     bool from_idle);
 enum vcpu_state vcpu_get_state(struct vm *vm, int vcpu, int *hostcpu);
 
 static int __inline
 vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu)
 {
 	return (vcpu_get_state(vm, vcpu, hostcpu) == VCPU_RUNNING);
 }
 
 #ifdef _SYS_PROC_H_
 static int __inline
 vcpu_should_yield(struct vm *vm, int vcpu)
 {
 	return (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED));
 }
 #endif
 
 void *vcpu_stats(struct vm *vm, int vcpu);
 void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr);
 struct vmspace *vm_get_vmspace(struct vm *vm);
 int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func);
 int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func);
 struct vatpic *vm_atpic(struct vm *vm);
 struct vatpit *vm_atpit(struct vm *vm);
 
 /*
  * Inject exception 'vme' into the guest vcpu. This function returns 0 on
  * success and non-zero on failure.
  *
  * Wrapper functions like 'vm_inject_gp()' should be preferred to calling
  * this function directly because they enforce the trap-like or fault-like
  * behavior of an exception.
  *
  * This function should only be called in the context of the thread that is
  * executing this vcpu.
  */
 int vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *vme);
 
 /*
  * This function is called after a VM-exit that occurred during exception or
  * interrupt delivery through the IDT. The format of 'intinfo' is described
  * in Figure 15-1, "EXITINTINFO for All Intercepts", APM, Vol 2.
  *
  * If a VM-exit handler completes the event delivery successfully then it
  * should call vm_exit_intinfo() to extinguish the pending event. For e.g.,
  * if the task switch emulation is triggered via a task gate then it should
  * call this function with 'intinfo=0' to indicate that the external event
  * is not pending anymore.
  *
  * Return value is 0 on success and non-zero on failure.
  */
 int vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t intinfo);
 
 /*
  * This function is called before every VM-entry to retrieve a pending
  * event that should be injected into the guest. This function combines
  * nested events into a double or triple fault.
  *
  * Returns 0 if there are no events that need to be injected into the guest
  * and non-zero otherwise.
  */
 int vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *info);
 
 int vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2);
 
 enum vm_reg_name vm_segment_name(int seg_encoding);
 
 struct vm_copyinfo {
 	uint64_t	gpa;
 	size_t		len;
 	void		*hva;
 	void		*cookie;
 };
 
 /*
  * Set up 'copyinfo[]' to copy to/from guest linear address space starting
  * at 'gla' and 'len' bytes long. The 'prot' should be set to PROT_READ for
  * a copyin or PROT_WRITE for a copyout. 
  *
  * Returns 0 on success.
  * Returns 1 if an exception was injected into the guest.
  * Returns -1 otherwise.
  *
  * The 'copyinfo[]' can be passed to 'vm_copyin()' or 'vm_copyout()' only if
  * the return value is 0. The 'copyinfo[]' resources should be freed by calling
  * 'vm_copy_teardown()' after the copy is done.
  */
 int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
     uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
     int num_copyinfo);
 void vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
     int num_copyinfo);
 void vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
     void *kaddr, size_t len);
 void vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
     struct vm_copyinfo *copyinfo, size_t len);
 #endif	/* KERNEL */
 
 #define	VM_MAXCPU	16			/* maximum virtual cpus */
 
 /*
  * Identifiers for optional vmm capabilities
  */
 enum vm_cap_type {
 	VM_CAP_HALT_EXIT,
 	VM_CAP_MTRAP_EXIT,
 	VM_CAP_PAUSE_EXIT,
 	VM_CAP_UNRESTRICTED_GUEST,
 	VM_CAP_ENABLE_INVPCID,
 	VM_CAP_MAX
 };
 
 enum vm_intr_trigger {
 	EDGE_TRIGGER,
 	LEVEL_TRIGGER
 };
 	
 /*
  * The 'access' field has the format specified in Table 21-2 of the Intel
  * Architecture Manual vol 3b.
  *
  * XXX The contents of the 'access' field are architecturally defined except
  * bit 16 - Segment Unusable.
  */
 struct seg_desc {
 	uint64_t	base;
 	uint32_t	limit;
 	uint32_t	access;
 };
 #define	SEG_DESC_TYPE(access)		((access) & 0x001f)
 #define	SEG_DESC_DPL(access)		(((access) >> 5) & 0x3)
 #define	SEG_DESC_PRESENT(access)	(((access) & 0x0080) ? 1 : 0)
 #define	SEG_DESC_DEF32(access)		(((access) & 0x4000) ? 1 : 0)
 #define	SEG_DESC_GRANULARITY(access)	(((access) & 0x8000) ? 1 : 0)
 #define	SEG_DESC_UNUSABLE(access)	(((access) & 0x10000) ? 1 : 0)
 
 enum vm_cpu_mode {
 	CPU_MODE_REAL,
 	CPU_MODE_PROTECTED,
 	CPU_MODE_COMPATIBILITY,		/* IA-32E mode (CS.L = 0) */
 	CPU_MODE_64BIT,			/* IA-32E mode (CS.L = 1) */
 };
 
 enum vm_paging_mode {
 	PAGING_MODE_FLAT,
 	PAGING_MODE_32,
 	PAGING_MODE_PAE,
 	PAGING_MODE_64,
 };
 
 struct vm_guest_paging {
 	uint64_t	cr3;
 	int		cpl;
 	enum vm_cpu_mode cpu_mode;
 	enum vm_paging_mode paging_mode;
 };
 
 /*
  * The data structures 'vie' and 'vie_op' are meant to be opaque to the
  * consumers of instruction decoding. The only reason why their contents
  * need to be exposed is because they are part of the 'vm_exit' structure.
  */
 struct vie_op {
 	uint8_t		op_byte;	/* actual opcode byte */
 	uint8_t		op_type;	/* type of operation (e.g. MOV) */
 	uint16_t	op_flags;
 };
 
 #define	VIE_INST_SIZE	15
 struct vie {
 	uint8_t		inst[VIE_INST_SIZE];	/* instruction bytes */
 	uint8_t		num_valid;		/* size of the instruction */
 	uint8_t		num_processed;
 
 	uint8_t		addrsize:4, opsize:4;	/* address and operand sizes */
 	uint8_t		rex_w:1,		/* REX prefix */
 			rex_r:1,
 			rex_x:1,
 			rex_b:1,
 			rex_present:1,
 			opsize_override:1,	/* Operand size override */
 			addrsize_override:1;	/* Address size override */
 
 	uint8_t		mod:2,			/* ModRM byte */
 			reg:4,
 			rm:4;
 
 	uint8_t		ss:2,			/* SIB byte */
 			index:4,
 			base:4;
 
 	uint8_t		disp_bytes;
 	uint8_t		imm_bytes;
 
 	uint8_t		scale;
 	int		base_register;		/* VM_REG_GUEST_xyz */
 	int		index_register;		/* VM_REG_GUEST_xyz */
 
 	int64_t		displacement;		/* optional addr displacement */
 	int64_t		immediate;		/* optional immediate operand */
 
 	uint8_t		decoded;	/* set to 1 if successfully decoded */
 
 	struct vie_op	op;			/* opcode description */
 };
 
 enum vm_exitcode {
 	VM_EXITCODE_INOUT,
 	VM_EXITCODE_VMX,
 	VM_EXITCODE_BOGUS,
 	VM_EXITCODE_RDMSR,
 	VM_EXITCODE_WRMSR,
 	VM_EXITCODE_HLT,
 	VM_EXITCODE_MTRAP,
 	VM_EXITCODE_PAUSE,
 	VM_EXITCODE_PAGING,
 	VM_EXITCODE_INST_EMUL,
 	VM_EXITCODE_SPINUP_AP,
 	VM_EXITCODE_DEPRECATED1,	/* used to be SPINDOWN_CPU */
 	VM_EXITCODE_RENDEZVOUS,
 	VM_EXITCODE_IOAPIC_EOI,
 	VM_EXITCODE_SUSPENDED,
 	VM_EXITCODE_INOUT_STR,
 	VM_EXITCODE_TASK_SWITCH,
 	VM_EXITCODE_MAX
 };
 
 struct vm_inout {
 	uint16_t	bytes:3;	/* 1 or 2 or 4 */
 	uint16_t	in:1;
 	uint16_t	string:1;
 	uint16_t	rep:1;
 	uint16_t	port;
 	uint32_t	eax;		/* valid for out */
 };
 
 struct vm_inout_str {
 	struct vm_inout	inout;		/* must be the first element */
 	struct vm_guest_paging paging;
 	uint64_t	rflags;
 	uint64_t	cr0;
 	uint64_t	index;
 	uint64_t	count;		/* rep=1 (%rcx), rep=0 (1) */
 	int		addrsize;
 	enum vm_reg_name seg_name;
 	struct seg_desc seg_desc;
 };
 
 enum task_switch_reason {
 	TSR_CALL,
 	TSR_IRET,
 	TSR_JMP,
 	TSR_IDT_GATE,	/* task gate in IDT */
 };
 
 struct vm_task_switch {
 	uint16_t	tsssel;		/* new TSS selector */
 	int		ext;		/* task switch due to external event */
 	uint32_t	errcode;
 	int		errcode_valid;	/* push 'errcode' on the new stack */
 	enum task_switch_reason reason;
 	struct vm_guest_paging paging;
 };
 
 struct vm_exit {
 	enum vm_exitcode	exitcode;
 	int			inst_length;	/* 0 means unknown */
 	uint64_t		rip;
 	union {
 		struct vm_inout	inout;
 		struct vm_inout_str inout_str;
 		struct {
 			uint64_t	gpa;
 			int		fault_type;
 		} paging;
 		struct {
 			uint64_t	gpa;
 			uint64_t	gla;
 			int		cs_d;		/* CS.D */
 			struct vm_guest_paging paging;
 			struct vie	vie;
 		} inst_emul;
 		/*
 		 * VMX specific payload. Used when there is no "better"
 		 * exitcode to represent the VM-exit.
 		 */
 		struct {
 			int		status;		/* vmx inst status */
 			/*
 			 * 'exit_reason' and 'exit_qualification' are valid
 			 * only if 'status' is zero.
 			 */
 			uint32_t	exit_reason;
 			uint64_t	exit_qualification;
 			/*
 			 * 'inst_error' and 'inst_type' are valid
 			 * only if 'status' is non-zero.
 			 */
 			int		inst_type;
 			int		inst_error;
 		} vmx;
 		struct {
 			uint32_t	code;		/* ecx value */
 			uint64_t	wval;
 		} msr;
 		struct {
 			int		vcpu;
 			uint64_t	rip;
 		} spinup_ap;
 		struct {
 			uint64_t	rflags;
 		} hlt;
 		struct {
 			int		vector;
 		} ioapic_eoi;
 		struct {
 			enum vm_suspend_how how;
 		} suspended;
 		struct vm_task_switch task_switch;
 	} u;
 };
 
 /* APIs to inject faults into the guest */
 void vm_inject_fault(void *vm, int vcpuid, int vector, int errcode_valid,
     int errcode);
 
 static __inline void
 vm_inject_ud(void *vm, int vcpuid)
 {
 	vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0);
 }
 
 static __inline void
 vm_inject_gp(void *vm, int vcpuid)
 {
 	vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0);
 }
 
 static __inline void
 vm_inject_ac(void *vm, int vcpuid, int errcode)
 {
 	vm_inject_fault(vm, vcpuid, IDT_AC, 1, errcode);
 }
 
 static __inline void
 vm_inject_ss(void *vm, int vcpuid, int errcode)
 {
 	vm_inject_fault(vm, vcpuid, IDT_SS, 1, errcode);
 }
 
 void vm_inject_pf(void *vm, int vcpuid, int error_code, uint64_t cr2);
 
 #endif	/* _VMM_H_ */

Property changes on: user/ae/inet6/sys/amd64/include/vmm.h
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head/sys/amd64/include/vmm.h:r271428-271452
Index: user/ae/inet6/sys/amd64/vmm/intel/vmx.c
===================================================================
--- user/ae/inet6/sys/amd64/vmm/intel/vmx.c	(revision 271452)
+++ user/ae/inet6/sys/amd64/vmm/intel/vmx.c	(revision 271453)
@@ -1,3308 +1,3354 @@
 /*-
  * Copyright (c) 2011 NetApp, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/smp.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/pcpu.h>
 #include <sys/proc.h>
 #include <sys/sysctl.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 #include <machine/psl.h>
 #include <machine/cpufunc.h>
 #include <machine/md_var.h>
 #include <machine/segments.h>
 #include <machine/smp.h>
 #include <machine/specialreg.h>
 #include <machine/vmparam.h>
 
 #include <machine/vmm.h>
 #include <machine/vmm_dev.h>
 #include <machine/vmm_instruction_emul.h>
 #include "vmm_host.h"
 #include "vmm_ioport.h"
 #include "vmm_ipi.h"
 #include "vmm_msr.h"
 #include "vmm_ktr.h"
 #include "vmm_stat.h"
 #include "vatpic.h"
 #include "vlapic.h"
 #include "vlapic_priv.h"
 
 #include "vmx_msr.h"
 #include "ept.h"
 #include "vmx_cpufunc.h"
 #include "vmx.h"
 #include "x86.h"
 #include "vmx_controls.h"
 
 #define	PINBASED_CTLS_ONE_SETTING					\
 	(PINBASED_EXTINT_EXITING	|				\
 	 PINBASED_NMI_EXITING		|				\
 	 PINBASED_VIRTUAL_NMI)
 #define	PINBASED_CTLS_ZERO_SETTING	0
 
 #define PROCBASED_CTLS_WINDOW_SETTING					\
 	(PROCBASED_INT_WINDOW_EXITING	|				\
 	 PROCBASED_NMI_WINDOW_EXITING)
 
 #define	PROCBASED_CTLS_ONE_SETTING 					\
 	(PROCBASED_SECONDARY_CONTROLS	|				\
 	 PROCBASED_IO_EXITING		|				\
 	 PROCBASED_MSR_BITMAPS		|				\
 	 PROCBASED_CTLS_WINDOW_SETTING	|				\
 	 PROCBASED_CR8_LOAD_EXITING	|				\
 	 PROCBASED_CR8_STORE_EXITING)
 #define	PROCBASED_CTLS_ZERO_SETTING	\
 	(PROCBASED_CR3_LOAD_EXITING |	\
 	PROCBASED_CR3_STORE_EXITING |	\
 	PROCBASED_IO_BITMAPS)
 
 #define	PROCBASED_CTLS2_ONE_SETTING	PROCBASED2_ENABLE_EPT
 #define	PROCBASED_CTLS2_ZERO_SETTING	0
 
 #define VM_EXIT_CTLS_ONE_SETTING_NO_PAT					\
 	(VM_EXIT_HOST_LMA			|			\
 	VM_EXIT_SAVE_EFER			|			\
 	VM_EXIT_LOAD_EFER)
 
 #define	VM_EXIT_CTLS_ONE_SETTING					\
 	(VM_EXIT_CTLS_ONE_SETTING_NO_PAT       	|			\
 	VM_EXIT_ACKNOWLEDGE_INTERRUPT		|			\
 	VM_EXIT_SAVE_PAT			|			\
 	VM_EXIT_LOAD_PAT)
 #define	VM_EXIT_CTLS_ZERO_SETTING	VM_EXIT_SAVE_DEBUG_CONTROLS
 
 #define	VM_ENTRY_CTLS_ONE_SETTING_NO_PAT	VM_ENTRY_LOAD_EFER
 
 #define	VM_ENTRY_CTLS_ONE_SETTING					\
 	(VM_ENTRY_CTLS_ONE_SETTING_NO_PAT     	|			\
 	VM_ENTRY_LOAD_PAT)
 #define	VM_ENTRY_CTLS_ZERO_SETTING					\
 	(VM_ENTRY_LOAD_DEBUG_CONTROLS		|			\
 	VM_ENTRY_INTO_SMM			|			\
 	VM_ENTRY_DEACTIVATE_DUAL_MONITOR)
 
 #define	guest_msr_rw(vmx, msr) \
 	msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW)
 
 #define	guest_msr_ro(vmx, msr) \
     msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_READ)
 
 #define	HANDLED		1
 #define	UNHANDLED	0
 
 static MALLOC_DEFINE(M_VMX, "vmx", "vmx");
 static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic");
 
 SYSCTL_DECL(_hw_vmm);
 SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL);
 
 int vmxon_enabled[MAXCPU];
 static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);
 
 static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2;
 static uint32_t exit_ctls, entry_ctls;
 
 static uint64_t cr0_ones_mask, cr0_zeros_mask;
 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD,
 	     &cr0_ones_mask, 0, NULL);
 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD,
 	     &cr0_zeros_mask, 0, NULL);
 
 static uint64_t cr4_ones_mask, cr4_zeros_mask;
 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD,
 	     &cr4_ones_mask, 0, NULL);
 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD,
 	     &cr4_zeros_mask, 0, NULL);
 
 static int vmx_initialized;
 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD,
 	   &vmx_initialized, 0, "Intel VMX initialized");
 
 /*
  * Optional capabilities
  */
 static SYSCTL_NODE(_hw_vmm_vmx, OID_AUTO, cap, CTLFLAG_RW, NULL, NULL);
 
 /*
  * Set to 1 by vmx_init() when the VM-exit/VM-entry controls that save and
  * load the PAT MSR are available, and to 0 when it had to fall back to the
  * control settings without the PAT bits.
  */
 static int vmx_patmsr;
 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, patmsr, CTLFLAG_RD, &vmx_patmsr, 0,
     "PAT MSR saved and restored in VMCS");
 
 static int cap_halt_exit;
 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, halt_exit, CTLFLAG_RD, &cap_halt_exit, 0,
     "HLT triggers a VM-exit");
 
 static int cap_pause_exit;
 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, pause_exit, CTLFLAG_RD, &cap_pause_exit,
     0, "PAUSE triggers a VM-exit");
 
 static int cap_unrestricted_guest;
 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, unrestricted_guest, CTLFLAG_RD,
     &cap_unrestricted_guest, 0, "Unrestricted guests");
 
 static int cap_monitor_trap;
 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, monitor_trap, CTLFLAG_RD,
     &cap_monitor_trap, 0, "Monitor trap flag");
 
 static int cap_invpcid;
 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, invpcid, CTLFLAG_RD, &cap_invpcid,
     0, "Guests are allowed to use INVPCID");
 
 static int virtual_interrupt_delivery;
 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD,
     &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support");
 
 static int posted_interrupts;
 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, posted_interrupts, CTLFLAG_RD,
     &posted_interrupts, 0, "APICv posted interrupt support");
 
 static int pirvec;
 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupt_vector, CTLFLAG_RD,
     &pirvec, 0, "APICv posted interrupt vector");
 
 static struct unrhdr *vpid_unr;
 static u_int vpid_alloc_failed;
 SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD,
 	    &vpid_alloc_failed, 0, NULL);
 
 /*
  * Use the last page below 4GB as the APIC access address. This address is
  * occupied by the boot firmware so it is guaranteed that it will not conflict
  * with a page in system memory.
  */
 #define	APIC_ACCESS_ADDRESS	0xFFFFF000
 
 static int vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc);
 static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval);
 static void vmx_inject_pir(struct vlapic *vlapic);
 
 #ifdef KTR
 /*
  * Translate a VM-exit reason code into a short name for trace messages.
  *
  * A code with no entry in the table is formatted as a decimal number into
  * a function-static buffer, so that particular result is overwritten by the
  * next call that formats an unlisted code.
  */
 static const char *
 exit_reason_to_str(int reason)
 {
 	static const struct {
 		int		code;
 		const char	*name;
 	} names[] = {
 		{ EXIT_REASON_EXCEPTION,	"exception" },
 		{ EXIT_REASON_EXT_INTR,		"extint" },
 		{ EXIT_REASON_TRIPLE_FAULT,	"triplefault" },
 		{ EXIT_REASON_INIT,		"init" },
 		{ EXIT_REASON_SIPI,		"sipi" },
 		{ EXIT_REASON_IO_SMI,		"iosmi" },
 		{ EXIT_REASON_SMI,		"smi" },
 		{ EXIT_REASON_INTR_WINDOW,	"intrwindow" },
 		{ EXIT_REASON_NMI_WINDOW,	"nmiwindow" },
 		{ EXIT_REASON_TASK_SWITCH,	"taskswitch" },
 		{ EXIT_REASON_CPUID,		"cpuid" },
 		{ EXIT_REASON_GETSEC,		"getsec" },
 		{ EXIT_REASON_HLT,		"hlt" },
 		{ EXIT_REASON_INVD,		"invd" },
 		{ EXIT_REASON_INVLPG,		"invlpg" },
 		{ EXIT_REASON_RDPMC,		"rdpmc" },
 		{ EXIT_REASON_RDTSC,		"rdtsc" },
 		{ EXIT_REASON_RSM,		"rsm" },
 		{ EXIT_REASON_VMCALL,		"vmcall" },
 		{ EXIT_REASON_VMCLEAR,		"vmclear" },
 		{ EXIT_REASON_VMLAUNCH,		"vmlaunch" },
 		{ EXIT_REASON_VMPTRLD,		"vmptrld" },
 		{ EXIT_REASON_VMPTRST,		"vmptrst" },
 		{ EXIT_REASON_VMREAD,		"vmread" },
 		{ EXIT_REASON_VMRESUME,		"vmresume" },
 		{ EXIT_REASON_VMWRITE,		"vmwrite" },
 		{ EXIT_REASON_VMXOFF,		"vmxoff" },
 		{ EXIT_REASON_VMXON,		"vmxon" },
 		{ EXIT_REASON_CR_ACCESS,	"craccess" },
 		{ EXIT_REASON_DR_ACCESS,	"draccess" },
 		{ EXIT_REASON_INOUT,		"inout" },
 		{ EXIT_REASON_RDMSR,		"rdmsr" },
 		{ EXIT_REASON_WRMSR,		"wrmsr" },
 		{ EXIT_REASON_INVAL_VMCS,	"invalvmcs" },
 		{ EXIT_REASON_INVAL_MSR,	"invalmsr" },
 		{ EXIT_REASON_MWAIT,		"mwait" },
 		{ EXIT_REASON_MTF,		"mtf" },
 		{ EXIT_REASON_MONITOR,		"monitor" },
 		{ EXIT_REASON_PAUSE,		"pause" },
 		{ EXIT_REASON_MCE,		"mce" },
 		{ EXIT_REASON_TPR,		"tpr" },
 		{ EXIT_REASON_APIC_ACCESS,	"apic-access" },
 		{ EXIT_REASON_GDTR_IDTR,	"gdtridtr" },
 		{ EXIT_REASON_LDTR_TR,		"ldtrtr" },
 		{ EXIT_REASON_EPT_FAULT,	"eptfault" },
 		{ EXIT_REASON_EPT_MISCONFIG,	"eptmisconfig" },
 		{ EXIT_REASON_INVEPT,		"invept" },
 		{ EXIT_REASON_RDTSCP,		"rdtscp" },
 		{ EXIT_REASON_VMX_PREEMPT,	"vmxpreempt" },
 		{ EXIT_REASON_INVVPID,		"invvpid" },
 		{ EXIT_REASON_WBINVD,		"wbinvd" },
 		{ EXIT_REASON_XSETBV,		"xsetbv" },
 		{ EXIT_REASON_APIC_WRITE,	"apic-write" },
 	};
 	static char reasonbuf[32];
 	size_t idx;
 
 	for (idx = 0; idx < sizeof(names) / sizeof(names[0]); idx++) {
 		if (names[idx].code == reason)
 			return (names[idx].name);
 	}
 
 	/* Unlisted reason: report the raw number instead. */
 	snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason);
 	return (reasonbuf);
 }
 #endif	/* KTR */
 
 static int
 vmx_allow_x2apic_msrs(struct vmx *vmx)
 {
 	int i, error;
 
 	error = 0;
 
 	/*
 	 * Allow readonly access to the following x2APIC MSRs from the guest.
 	 */
 	error += guest_msr_ro(vmx, MSR_APIC_ID);
 	error += guest_msr_ro(vmx, MSR_APIC_VERSION);
 	error += guest_msr_ro(vmx, MSR_APIC_LDR);
 	error += guest_msr_ro(vmx, MSR_APIC_SVR);
 
 	for (i = 0; i < 8; i++)
 		error += guest_msr_ro(vmx, MSR_APIC_ISR0 + i);
 
 	for (i = 0; i < 8; i++)
 		error += guest_msr_ro(vmx, MSR_APIC_TMR0 + i);
 	
 	for (i = 0; i < 8; i++)
 		error += guest_msr_ro(vmx, MSR_APIC_IRR0 + i);
 
 	error += guest_msr_ro(vmx, MSR_APIC_ESR);
 	error += guest_msr_ro(vmx, MSR_APIC_LVT_TIMER);
 	error += guest_msr_ro(vmx, MSR_APIC_LVT_THERMAL);
 	error += guest_msr_ro(vmx, MSR_APIC_LVT_PCINT);
 	error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT0);
 	error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT1);
 	error += guest_msr_ro(vmx, MSR_APIC_LVT_ERROR);
 	error += guest_msr_ro(vmx, MSR_APIC_ICR_TIMER);
 	error += guest_msr_ro(vmx, MSR_APIC_DCR_TIMER);
 	error += guest_msr_ro(vmx, MSR_APIC_ICR);
 
 	/*
 	 * Allow TPR, EOI and SELF_IPI MSRs to be read and written by the guest.
 	 *
 	 * These registers get special treatment described in the section
 	 * "Virtualizing MSR-Based APIC Accesses".
 	 */
 	error += guest_msr_rw(vmx, MSR_APIC_TPR);
 	error += guest_msr_rw(vmx, MSR_APIC_EOI);
 	error += guest_msr_rw(vmx, MSR_APIC_SELF_IPI);
 
 	return (error);
 }
 
 /*
  * Return 'cr0' with every bit in cr0_ones_mask forced on and every bit in
  * cr0_zeros_mask forced off.
  */
 u_long
 vmx_fix_cr0(u_long cr0)
 {
 	uint64_t fixed;
 
 	fixed = cr0 | cr0_ones_mask;
 	fixed &= ~cr0_zeros_mask;
 	return (fixed);
 }
 
 /*
  * Return 'cr4' with every bit in cr4_ones_mask forced on and every bit in
  * cr4_zeros_mask forced off.
  */
 u_long
 vmx_fix_cr4(u_long cr4)
 {
 	uint64_t fixed;
 
 	fixed = cr4 | cr4_ones_mask;
 	fixed &= ~cr4_zeros_mask;
 	return (fixed);
 }
 
 /*
  * Release a VPID previously handed out by vpid_alloc().
  *
  * Panics if 'vpid' is outside [0, 0xffff].
  */
 static void
 vpid_free(int vpid)
 {
 	if (vpid < 0 || vpid > 0xffff)
 		panic("vpid_free: invalid vpid %d", vpid);
 
 	/*
 	 * VPIDs in [0,VM_MAXCPU] never come from the unit number allocator,
 	 * so there is nothing to give back for them.
 	 */
 	if (vpid <= VM_MAXCPU)
 		return;
 
 	free_unr(vpid_unr, vpid);
 }
 
 /*
  * Fill vpid[0..num-1] with the VPIDs to be used by the vcpus of one VM.
  *
  * Panics unless 0 < num <= VM_MAXCPU.
  */
 static void
 vpid_alloc(uint16_t *vpid, int num)
 {
 	int idx, nalloc, unit;
 
 	if (num <= 0 || num > VM_MAXCPU)
 		panic("invalid number of vpids requested: %d", num);
 
 	/*
 	 * Without the "enable vpid" execution control every vcpu is
 	 * required to use VPID 0.
 	 */
 	if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) {
 		for (idx = 0; idx < num; idx++)
 			vpid[idx] = 0;
 		return;
 	}
 
 	/*
 	 * Try to get a unique VPID for each vcpu from the unit number
 	 * allocator, stopping at the first failure.
 	 */
 	for (nalloc = 0; nalloc < num; nalloc++) {
 		unit = alloc_unr(vpid_unr);
 		if (unit == -1)
 			break;
 		vpid[nalloc] = unit;
 	}
 
 	if (nalloc == num)
 		return;
 
 	atomic_add_int(&vpid_alloc_failed, 1);
 
 	/*
 	 * The unit number allocator ran out of unique VPIDs, so fall back
 	 * to the [1,VM_MAXCPU] range.
 	 *
 	 * These VPIDs are not unique across VMs but this does not affect
 	 * correctness because the combined mappings are also tagged with
 	 * the EP4TA which is unique for each VM.
 	 *
 	 * It is still sub-optimal because the invvpid will invalidate
 	 * combined mappings for a particular VPID across all EP4TAs.
 	 */
 	for (idx = nalloc - 1; idx >= 0; idx--)
 		vpid_free(vpid[idx]);	/* undo the partial allocation */
 
 	for (idx = 0; idx < num; idx++)
 		vpid[idx] = idx + 1;
 }
 
 /*
  * Create the unit number allocator that hands out VPIDs in the range
  * [VM_MAXCPU + 1, 0xffff] and store it in 'vpid_unr'.
  */
 static void
 vpid_init(void)
 {
 	/*
 	 * VPID 0 is required when the "enable VPID" execution control is
 	 * disabled.
 	 *
 	 * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the
 	 * unit number allocator does not have sufficient unique VPIDs to
 	 * satisfy the allocation.
 	 *
 	 * The remaining VPIDs are managed by the unit number allocator.
 	 */
 	vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL);
 }
 
 /*
  * Copy the template list of guest MSR save entries into 'g_area' and store
  * the number of entries in '*g_count'.
  *
  * Panics if the template has more than GUEST_MSR_MAX_ENTRIES entries.
  */
 static void
 msr_save_area_init(struct msr_entry *g_area, int *g_count)
 {
 	static struct msr_entry guest_msrs[] = {
 		{ MSR_KGSBASE, 0, 0 },
 	};
 	int nmsrs;
 
 	nmsrs = sizeof(guest_msrs) / sizeof(guest_msrs[0]);
 	if (nmsrs > GUEST_MSR_MAX_ENTRIES)
 		panic("guest msr save area overrun");
 
 	bcopy(guest_msrs, g_area, sizeof(guest_msrs));
 	*g_count = nmsrs;
 }
 
 /*
  * Per-cpu teardown routine, run on every cpu by vmx_cleanup() through
  * smp_rendezvous(): leave VMX operation if this cpu entered it, then clear
  * CR4.VMXE.
  */
 static void
 vmx_disable(void *arg __unused)
 {
 	struct invvpid_desc all_vpids = { 0 };
 	struct invept_desc all_epts = { 0 };
 
 	if (vmxon_enabled[curcpu]) {
 		/*
 		 * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b.
 		 *
 		 * Neither VMXON nor VMXOFF is required to invalidate any
 		 * TLB caching structures, so flush them explicitly. This
 		 * prevents potential retention of cached information in
 		 * the TLB between distinct VMX episodes.
 		 */
 		invvpid(INVVPID_TYPE_ALL_CONTEXTS, all_vpids);
 		invept(INVEPT_TYPE_ALL_CONTEXTS, all_epts);
 		vmxoff();
 	}
 
 	load_cr4(rcr4() & ~CR4_VMXE);
 }
 
 /*
  * Undo vmx_init(): release the posted-interrupt vector and the VPID
  * allocator if they were set up, then run vmx_disable() on every cpu.
  *
  * Always returns 0.
  */
 static int
 vmx_cleanup(void)
 {
 	
 	/* 'pirvec' stays 0 unless vmx_init() obtained a vector. */
 	if (pirvec != 0)
 		vmm_ipi_free(pirvec);
 
 	if (vpid_unr != NULL) {
 		delete_unrhdr(vpid_unr);
 		vpid_unr = NULL;
 	}
 
 	smp_rendezvous(NULL, vmx_disable, NULL, NULL);
 
 	return (0);
 }
 
 /*
  * Per-cpu setup routine, run on every cpu by vmx_init() through
  * smp_rendezvous(): enable VMX in the feature control MSR if needed, set
  * CR4.VMXE and execute VMXON on this cpu's VMXON region.
  *
  * vmxon_enabled[curcpu] is set only when VMXON reports success.
  */
 static void
 vmx_enable(void *arg __unused)
 {
 	uint64_t fctl;
 	int lock_clear, vmxen_clear;
 
 	fctl = rdmsr(MSR_IA32_FEATURE_CONTROL);
 	lock_clear = (fctl & IA32_FEATURE_CONTROL_LOCK) == 0;
 	vmxen_clear = (fctl & IA32_FEATURE_CONTROL_VMX_EN) == 0;
 	if (lock_clear || vmxen_clear) {
 		/* Turn on the VMX enable bit and lock the MSR. */
 		fctl |= IA32_FEATURE_CONTROL_VMX_EN | IA32_FEATURE_CONTROL_LOCK;
 		wrmsr(MSR_IA32_FEATURE_CONTROL, fctl);
 	}
 
 	load_cr4(rcr4() | CR4_VMXE);
 
 	/* The VMXON region starts with the VMX revision identifier. */
 	*(uint32_t *)vmxon_region[curcpu] = vmx_revision();
 	if (vmxon(vmxon_region[curcpu]) == 0)
 		vmxon_enabled[curcpu] = 1;
 }
 
 /*
  * Re-execute VMXON on the current cpu if vmx_enable() had previously
  * succeeded on it. The result of VMXON is not checked.
  */
 static void
 vmx_restore(void)
 {
 
 	if (!vmxon_enabled[curcpu])
 		return;
 
 	vmxon(vmxon_region[curcpu]);
 }
 
 static int
 vmx_init(int ipinum)
 {
 	int error, use_tpr_shadow;
 	uint64_t basic, fixed0, fixed1, feature_control;
 	uint32_t tmp, procbased2_vid_bits;
 
 	/* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */
 	if (!(cpu_feature2 & CPUID2_VMX)) {
 		printf("vmx_init: processor does not support VMX operation\n");
 		return (ENXIO);
 	}
 
 	/*
 	 * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits
 	 * are set (bits 0 and 2 respectively).
 	 */
 	feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
 	if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 1 &&
 	    (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
 		printf("vmx_init: VMX operation disabled by BIOS\n");
 		return (ENXIO);
 	}
 
 	/*
 	 * Verify capabilities MSR_VMX_BASIC:
 	 * - bit 54 indicates support for INS/OUTS decoding
 	 */
 	basic = rdmsr(MSR_VMX_BASIC);
 	if ((basic & (1UL << 54)) == 0) {
 		printf("vmx_init: processor does not support desired basic "
 		    "capabilities\n");
 		return (EINVAL);
 	}
 
 	/* Check support for primary processor-based VM-execution controls */
 	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
 			       MSR_VMX_TRUE_PROCBASED_CTLS,
 			       PROCBASED_CTLS_ONE_SETTING,
 			       PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls);
 	if (error) {
 		printf("vmx_init: processor does not support desired primary "
 		       "processor-based controls\n");
 		return (error);
 	}
 
 	/* Clear the processor-based ctl bits that are set on demand */
 	procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING;
 
 	/* Check support for secondary processor-based VM-execution controls */
 	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
 			       MSR_VMX_PROCBASED_CTLS2,
 			       PROCBASED_CTLS2_ONE_SETTING,
 			       PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2);
 	if (error) {
 		printf("vmx_init: processor does not support desired secondary "
 		       "processor-based controls\n");
 		return (error);
 	}
 
 	/* Check support for VPID */
 	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
 			       PROCBASED2_ENABLE_VPID, 0, &tmp);
 	if (error == 0)
 		procbased_ctls2 |= PROCBASED2_ENABLE_VPID;
 
 	/* Check support for pin-based VM-execution controls */
 	error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
 			       MSR_VMX_TRUE_PINBASED_CTLS,
 			       PINBASED_CTLS_ONE_SETTING,
 			       PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls);
 	if (error) {
 		printf("vmx_init: processor does not support desired "
 		       "pin-based controls\n");
 		return (error);
 	}
 
 	/* Check support for VM-exit controls */
 	vmx_patmsr = 1;
 	error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
 			       VM_EXIT_CTLS_ONE_SETTING,
 			       VM_EXIT_CTLS_ZERO_SETTING,
 			       &exit_ctls);
 	if (error) {
 		/* Try again without the PAT MSR bits */
 		error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS,
 				       MSR_VMX_TRUE_EXIT_CTLS,
 				       VM_EXIT_CTLS_ONE_SETTING_NO_PAT,
 				       VM_EXIT_CTLS_ZERO_SETTING,
 				       &exit_ctls);
 		if (error) {
 			printf("vmx_init: processor does not support desired "
 			       "exit controls\n");
 			return (error);
 		} else {
 			if (bootverbose)
 				printf("vmm: PAT MSR access not supported\n");
 			guest_msr_valid(MSR_PAT);
 			vmx_patmsr = 0;
 		}
 	}
 
 	/* Check support for VM-entry controls */
 	if (vmx_patmsr) {
 		error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
 				       MSR_VMX_TRUE_ENTRY_CTLS,
 				       VM_ENTRY_CTLS_ONE_SETTING,
 				       VM_ENTRY_CTLS_ZERO_SETTING,
 				       &entry_ctls);
 	} else {
 		error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
 				       MSR_VMX_TRUE_ENTRY_CTLS,
 				       VM_ENTRY_CTLS_ONE_SETTING_NO_PAT,
 				       VM_ENTRY_CTLS_ZERO_SETTING,
 				       &entry_ctls);
 	}
 
 	if (error) {
 		printf("vmx_init: processor does not support desired "
 		       "entry controls\n");
 		       return (error);
 	}
 
 	/*
 	 * Check support for optional features by testing them
 	 * as individual bits
 	 */
 	cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
 					MSR_VMX_TRUE_PROCBASED_CTLS,
 					PROCBASED_HLT_EXITING, 0,
 					&tmp) == 0);
 
 	cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
 					MSR_VMX_PROCBASED_CTLS,
 					PROCBASED_MTF, 0,
 					&tmp) == 0);
 
 	cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
 					 MSR_VMX_TRUE_PROCBASED_CTLS,
 					 PROCBASED_PAUSE_EXITING, 0,
 					 &tmp) == 0);
 
 	cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
 					MSR_VMX_PROCBASED_CTLS2,
 					PROCBASED2_UNRESTRICTED_GUEST, 0,
 				        &tmp) == 0);
 
 	cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
 	    MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0,
 	    &tmp) == 0);
 
 	/*
 	 * Check support for virtual interrupt delivery.
 	 */
 	procbased2_vid_bits = (PROCBASED2_VIRTUALIZE_APIC_ACCESSES |
 	    PROCBASED2_VIRTUALIZE_X2APIC_MODE |
 	    PROCBASED2_APIC_REGISTER_VIRTUALIZATION |
 	    PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY);
 
 	use_tpr_shadow = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
 	    MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0,
 	    &tmp) == 0);
 
 	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
 	    procbased2_vid_bits, 0, &tmp);
 	if (error == 0 && use_tpr_shadow) {
 		virtual_interrupt_delivery = 1;
 		TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid",
 		    &virtual_interrupt_delivery);
 	}
 
 	if (virtual_interrupt_delivery) {
 		procbased_ctls |= PROCBASED_USE_TPR_SHADOW;
 		procbased_ctls2 |= procbased2_vid_bits;
 		procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE;
 
 		/*
 		 * No need to emulate accesses to %CR8 if virtual
 		 * interrupt delivery is enabled.
 		 */
 		procbased_ctls &= ~PROCBASED_CR8_LOAD_EXITING;
 		procbased_ctls &= ~PROCBASED_CR8_STORE_EXITING;
 
 		/*
 		 * Check for Posted Interrupts only if Virtual Interrupt
 		 * Delivery is enabled.
 		 */
 		error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
 		    MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0,
 		    &tmp);
 		if (error == 0) {
 			pirvec = vmm_ipi_alloc();
 			if (pirvec == 0) {
 				if (bootverbose) {
 					printf("vmx_init: unable to allocate "
 					    "posted interrupt vector\n");
 				}
 			} else {
 				posted_interrupts = 1;
 				TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_pir",
 				    &posted_interrupts);
 			}
 		}
 	}
 
 	if (posted_interrupts)
 		    pinbased_ctls |= PINBASED_POSTED_INTERRUPT;
 
 	/* Initialize EPT */
 	error = ept_init(ipinum);
 	if (error) {
 		printf("vmx_init: ept initialization failed (%d)\n", error);
 		return (error);
 	}
 
 	/*
 	 * Stash the cr0 and cr4 bits that must be fixed to 0 or 1
 	 */
 	fixed0 = rdmsr(MSR_VMX_CR0_FIXED0);
 	fixed1 = rdmsr(MSR_VMX_CR0_FIXED1);
 	cr0_ones_mask = fixed0 & fixed1;
 	cr0_zeros_mask = ~fixed0 & ~fixed1;
 
 	/*
 	 * CR0_PE and CR0_PG can be set to zero in VMX non-root operation
 	 * if unrestricted guest execution is allowed.
 	 */
 	if (cap_unrestricted_guest)
 		cr0_ones_mask &= ~(CR0_PG | CR0_PE);
 
 	/*
 	 * Do not allow the guest to set CR0_NW or CR0_CD.
 	 */
 	cr0_zeros_mask |= (CR0_NW | CR0_CD);
 
 	fixed0 = rdmsr(MSR_VMX_CR4_FIXED0);
 	fixed1 = rdmsr(MSR_VMX_CR4_FIXED1);
 	cr4_ones_mask = fixed0 & fixed1;
 	cr4_zeros_mask = ~fixed0 & ~fixed1;
 
 	vpid_init();
 
 	/* enable VMX operation */
 	smp_rendezvous(NULL, vmx_enable, NULL, NULL);
 
 	vmx_initialized = 1;
 
 	return (0);
 }
 
 /*
  * Invoke the host interrupt handler installed in the IDT for 'vector' by
  * passing its entry point to vmx_call_isr().
  *
  * With INVARIANTS the vector must be in [32, 255] and its gate descriptor
  * must be a present kernel interrupt gate that does not use an IST stack.
  */
 static void
 vmx_trigger_hostintr(int vector)
 {
 	struct gate_descriptor *gd;
 	uintptr_t func;
 
 	KASSERT(vector >= 32 && vector <= 255, ("vmx_trigger_hostintr: "
 	    "invalid vector %d", vector));
 
 	gd = &idt[vector];
 
 	KASSERT(gd->gd_p == 1, ("gate descriptor for vector %d not present",
 	    vector));
 	KASSERT(gd->gd_type == SDT_SYSIGT, ("gate descriptor for vector %d "
 	    "has invalid type %d", vector, gd->gd_type));
 	KASSERT(gd->gd_dpl == SEL_KPL, ("gate descriptor for vector %d "
 	    "has invalid dpl %d", vector, gd->gd_dpl));
 	KASSERT(gd->gd_selector == GSEL(GCODE_SEL, SEL_KPL), ("gate descriptor "
 	    "for vector %d has invalid selector %d", vector, gd->gd_selector));
 	KASSERT(gd->gd_ist == 0, ("gate descriptor for vector %d has invalid "
 	    "IST %d", vector, gd->gd_ist));
 
 	/* Reassemble the handler address from the two offset fields. */
 	func = ((long)gd->gd_hioffset << 16 | gd->gd_looffset);
 	vmx_call_isr(func);
 }
 
 /*
  * Program the guest/host mask and the read shadow for CR0 (which == 0) or
  * CR4 (which == 4) in 'vmcs'. The mask covers every bit that is fixed to
  * one or to zero; the read shadow is set to 'initial'.
  *
  * Panics for any other 'which'. Returns 0 on success or the error from
  * the first vmcs_setreg() call that failed.
  */
 static int
 vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial)
 {
 	uint64_t fixed_bits;
 	int mask_field, shadow_field, rc;
 
 	switch (which) {
 	case 0:
 		mask_field = VMCS_CR0_MASK;
 		shadow_field = VMCS_CR0_SHADOW;
 		fixed_bits = cr0_ones_mask | cr0_zeros_mask;
 		break;
 	case 4:
 		mask_field = VMCS_CR4_MASK;
 		shadow_field = VMCS_CR4_SHADOW;
 		fixed_bits = cr4_ones_mask | cr4_zeros_mask;
 		break;
 	default:
 		panic("vmx_setup_cr_shadow: unknown cr%d", which);
 	}
 
 	rc = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_field), fixed_bits);
 	if (rc == 0)
 		rc = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_field), initial);
 
 	return (rc);
 }
 #define	vmx_setup_cr0_shadow(vmcs,init)	vmx_setup_cr_shadow(0, (vmcs), (init))
 #define	vmx_setup_cr4_shadow(vmcs,init)	vmx_setup_cr_shadow(4, (vmcs), (init))
 
 /*
  * Allocate and initialize the VMX state for a new virtual machine.
  *
  * Sets up the EPT pointer from 'pmap', the guest MSR bitmap and VPIDs, and
  * then initializes one VMCS per possible vcpu (VM_MAXCPU of them) with the
  * control settings computed by vmx_init().
  *
  * Returns the newly allocated 'struct vmx'. Most failures are treated as
  * fatal (panic or KASSERT) rather than reported to the caller.
  */
 static void *
 vmx_vminit(struct vm *vm, pmap_t pmap)
 {
 	uint16_t vpid[VM_MAXCPU];
 	int i, error, guest_msr_count;
 	struct vmx *vmx;
 	struct vmcs *vmcs;
 
 	vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO);
 	if ((uintptr_t)vmx & PAGE_MASK) {
 		panic("malloc of struct vmx not aligned on %d byte boundary",
 		      PAGE_SIZE);
 	}
 	vmx->vm = vm;
 
 	vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4));
 
 	/*
 	 * Clean up EPTP-tagged guest physical and combined mappings
 	 *
 	 * VMX transitions are not required to invalidate any guest physical
 	 * mappings. So, it may be possible for stale guest physical mappings
 	 * to be present in the processor TLBs.
 	 *
 	 * Combined mappings for this EP4TA are also invalidated for all VPIDs.
 	 */
 	ept_invalidate_mappings(vmx->eptp);
 
 	msr_bitmap_initialize(vmx->msr_bitmap);
 
 	/*
 	 * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE.
 	 * The guest FSBASE and GSBASE are saved and restored during
 	 * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are
 	 * always restored from the vmcs host state area on vm-exit.
 	 *
 	 * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in
 	 * how they are saved/restored so can be directly accessed by the
 	 * guest.
 	 *
 	 * Guest KGSBASE is saved and restored in the guest MSR save area.
 	 * Host KGSBASE is restored before returning to userland from the pcb.
 	 * There will be a window of time when we are executing in the host
 	 * kernel context with a value of KGSBASE from the guest. This is ok
 	 * because the value of KGSBASE is inconsequential in kernel context.
 	 *
 	 * MSR_EFER is saved and restored in the guest VMCS area on a
 	 * VM exit and entry respectively. It is also restored from the
 	 * host VMCS area on a VM exit.
 	 *
 	 * The TSC MSR is exposed read-only. Writes are disallowed as that
 	 * will impact the host TSC.
 	 * XXX Writes would be implemented with a wrmsr trap, and
 	 * then modifying the TSC offset in the VMCS.
 	 */
 	if (guest_msr_rw(vmx, MSR_GSBASE) ||
 	    guest_msr_rw(vmx, MSR_FSBASE) ||
 	    guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) ||
 	    guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) ||
 	    guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) ||
 	    guest_msr_rw(vmx, MSR_KGSBASE) ||
 	    guest_msr_rw(vmx, MSR_EFER) ||
 	    guest_msr_ro(vmx, MSR_TSC))
 		panic("vmx_vminit: error setting guest msr access");
 
 	/*
 	 * MSR_PAT is saved and restored in the guest VMCS area on a VM exit
 	 * and entry respectively. It is also restored from the host VMCS
 	 * area on a VM exit. However, if running on a system with no
 	 * MSR_PAT save/restore support, leave access disabled so accesses
 	 * will be trapped.
 	 */
 	if (vmx_patmsr && guest_msr_rw(vmx, MSR_PAT))
 		panic("vmx_vminit: error setting guest pat msr access");
 
 	vpid_alloc(vpid, VM_MAXCPU);
 
 	if (virtual_interrupt_delivery) {
 		error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE,
 		    APIC_ACCESS_ADDRESS);
 		/* XXX this should really return an error to the caller */
 		KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error));
 	}
 
 	for (i = 0; i < VM_MAXCPU; i++) {
 		vmcs = &vmx->vmcs[i];
 		vmcs->identifier = vmx_revision();
 		error = vmclear(vmcs);
 		if (error != 0) {
 			panic("vmx_vminit: vmclear error %d on vcpu %d\n",
 			      error, i);
 		}
 
 		error = vmcs_init(vmcs);
 		KASSERT(error == 0, ("vmcs_init error %d", error));
 
 		/*
 		 * Customize the VMCS between VMPTRLD and VMCLEAR. The
 		 * vmwrite() results are summed and checked once at the end.
 		 */
 		VMPTRLD(vmcs);
 		error = 0;
 		error += vmwrite(VMCS_HOST_RSP, (u_long)&vmx->ctx[i]);
 		error += vmwrite(VMCS_EPTP, vmx->eptp);
 		error += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls);
 		error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls);
 		error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2);
 		error += vmwrite(VMCS_EXIT_CTLS, exit_ctls);
 		error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls);
 		error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap));
 		error += vmwrite(VMCS_VPID, vpid[i]);
 		if (virtual_interrupt_delivery) {
 			error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS);
 			error += vmwrite(VMCS_VIRTUAL_APIC,
 			    vtophys(&vmx->apic_page[i]));
 			error += vmwrite(VMCS_EOI_EXIT0, 0);
 			error += vmwrite(VMCS_EOI_EXIT1, 0);
 			error += vmwrite(VMCS_EOI_EXIT2, 0);
 			error += vmwrite(VMCS_EOI_EXIT3, 0);
 		}
 		if (posted_interrupts) {
 			error += vmwrite(VMCS_PIR_VECTOR, pirvec);
 			error += vmwrite(VMCS_PIR_DESC,
 			    vtophys(&vmx->pir_desc[i]));
 		}
 		VMCLEAR(vmcs);
 		KASSERT(error == 0, ("vmx_vminit: error customizing the vmcs"));
 
 		/* Remember the per-vcpu copies of the execution controls. */
 		vmx->cap[i].set = 0;
 		vmx->cap[i].proc_ctls = procbased_ctls;
 		vmx->cap[i].proc_ctls2 = procbased_ctls2;
 
 		/* NOCPU: this vcpu has not run on any host cpu yet. */
 		vmx->state[i].lastcpu = NOCPU;
 		vmx->state[i].vpid = vpid[i];
 
 		msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count);
 
 		error = vmcs_set_msr_save(vmcs, vtophys(vmx->guest_msrs[i]),
 		    guest_msr_count);
 		if (error != 0)
 			panic("vmcs_set_msr_save error %d", error);
 
 		/*
 		 * Set up the CR0/4 shadows, and init the read shadow
 		 * to the power-on register value from the Intel Sys Arch.
 		 *  CR0 - 0x60000010
 		 *  CR4 - 0
 		 */
 		error = vmx_setup_cr0_shadow(vmcs, 0x60000010);
 		if (error != 0)
 			panic("vmx_setup_cr0_shadow %d", error);
 
 		error = vmx_setup_cr4_shadow(vmcs, 0);
 		if (error != 0)
 			panic("vmx_setup_cr4_shadow %d", error);
 
 		vmx->ctx[i].pmap = pmap;
 	}
 
 	return (vmx);
 }
 
 /*
  * Handle a CPUID VM-exit by passing the guest's %rax, %rbx, %rcx and %rdx
  * save slots to x86_emulate_cpuid(), which reads and updates them in
  * place through 32-bit pointers.
  *
  * Returns whatever x86_emulate_cpuid() returns.
  */
 static int
 vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx)
 {
 
 	return (x86_emulate_cpuid(vm, vcpu,
 	    (uint32_t *)(&vmxctx->guest_rax),
 	    (uint32_t *)(&vmxctx->guest_rbx),
 	    (uint32_t *)(&vmxctx->guest_rcx),
 	    (uint32_t *)(&vmxctx->guest_rdx)));
 }
 
 /*
  * Emit a trace record with the guest %rip at which the vcpu is about to
  * resume. Compiles to nothing unless KTR is defined.
  */
 static __inline void
 vmx_run_trace(struct vmx *vmx, int vcpu)
 {
 #ifdef KTR
 	VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %#lx", vmcs_guest_rip());
 #endif
 }
 
 /*
  * Emit a trace record for a VM-exit: whether it was handled, the name of
  * 'exit_reason' and the guest 'rip'. Compiles to nothing unless KTR is
  * defined.
  */
 static __inline void
 vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason,
 	       int handled)
 {
 #ifdef KTR
 	VCPU_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx",
 		 handled ? "handled" : "unhandled",
 		 exit_reason_to_str(exit_reason), rip);
 #endif
 }
 
 /*
  * Emit a trace record for an "astpending" VM-exit at guest 'rip'. Compiles
  * to nothing unless KTR is defined.
  */
 static __inline void
 vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip)
 {
 #ifdef KTR
 	VCPU_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip);
 #endif
 }
 
 static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved");
 static VMM_STAT_INTEL(VCPU_INVVPID_DONE, "Number of vpid invalidations done");
 
 /*
  * Invalidate guest mappings identified by its vpid from the TLB.
  */
 /*
  * Invalidate TLB mappings tagged with the vpid of 'vcpu'.
  *
  * Nothing is done for vpid 0. If the vcpu is not running ('running' is
  * zero) the invalidation is deferred by resetting 'lastcpu'; otherwise it
  * is performed now unless a pending invept makes it redundant.
  */
 static __inline void
 vmx_invvpid(struct vmx *vmx, int vcpu, pmap_t pmap, int running)
 {
 	struct invvpid_desc desc;
 	struct vmxstate *st;
 
 	st = &vmx->state[vcpu];
 	if (st->vpid == 0)
 		return;
 
 	if (!running) {
 		/*
 		 * Mark 'lastcpu' as an invalid host cpu so that
 		 * vmx_set_pcpu_defaults() invalidates the TLB entries
 		 * tagged with this vcpu's vpid the next time it runs.
 		 */
 		st->lastcpu = NOCPU;
 		return;
 	}
 
 	KASSERT(curthread->td_critnest > 0, ("%s: vcpu %d running outside "
 	    "critical section", __func__, vcpu));
 
 	if (pmap->pm_eptgen != vmx->eptgen[curcpu]) {
 		/*
 		 * An invept is going to be performed before entering the
 		 * guest, and it invalidates combined mappings tagged with
 		 * 'vmx->eptp' for all vpids, so the invvpid can be skipped.
 		 */
 		vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1);
 		return;
 	}
 
 	/*
 	 * Invalidate all mappings tagged with 'vpid'
 	 *
 	 * The vcpu last ran on a different host cpu and nothing records
 	 * whether it invalidated the mappings associated with its 'vpid'
 	 * during that run, so the mappings for 'vpid' on 'curcpu' must be
 	 * assumed stale.
 	 *
 	 * This penalty is only incurred when the scheduler moves the thread
 	 * associated with this vcpu between host cpus.
 	 *
 	 * Note also that this will invalidate mappings tagged with 'vpid'
 	 * for "all" EP4TAs.
 	 */
 	desc.vpid = st->vpid;
 	desc.linear_addr = 0;
 	desc._res1 = 0;
 	desc._res2 = 0;
 	invvpid(INVVPID_TYPE_SINGLE_CONTEXT, desc);
 	vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_DONE, 1);
 }
 
 /*
  * Called before running 'vcpu' on the current host cpu.  If the vcpu last
  * ran on this same cpu nothing is done; otherwise the migration is counted,
  * the host TR/GDTR/GS base fields of the VMCS are rewritten and the vcpu's
  * vpid-tagged TLB entries are invalidated.
  */
 static void
 vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap)
 {
 	struct vmxstate *state;
 
 	state = &vmx->state[vcpu];
 	if (state->lastcpu != curcpu) {
 		state->lastcpu = curcpu;
 
 		vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);
 
 		/* Host-state fields that are refreshed after a migration. */
 		vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
 		vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
 		vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());
 
 		vmx_invvpid(vmx, vcpu, pmap, 1);
 	}
 }
 
 /*
  * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set.
  * The compile-time assertion below fails the build if
  * PROCBASED_CTLS_ONE_SETTING ever stops including that bit.
  */
 CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0);
 
 /*
  * Turn on interrupt-window exiting for 'vcpu'.  The saved copy of the
  * primary processor-based controls in vmx->cap[] is updated and written to
  * the VMCS; nothing happens if the bit is already set in that copy.
  */
 static __inline void
 vmx_set_int_window_exiting(struct vmx *vmx, int vcpu)
 {
 
 	/* Already enabled: avoid the redundant VMCS write. */
 	if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0)
 		return;
 
 	vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING;
 	vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
 	VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting");
 }
 
 /*
  * Turn off interrupt-window exiting for 'vcpu'.  The control must currently
  * be set in the saved copy of the primary processor-based controls; this is
  * asserted before the bit is cleared and the VMCS is updated.
  */
 static __inline void
 vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu)
 {
 
 	KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0,
 	    ("intr_window_exiting not set: %#x", vmx->cap[vcpu].proc_ctls));
 
 	/* Update the saved copy first, then push it to the VMCS. */
 	vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING;
 	vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
 
 	VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting");
 }
 
 /*
  * Turn on NMI-window exiting for 'vcpu'.  The saved copy of the primary
  * processor-based controls in vmx->cap[] is updated and written to the
  * VMCS; nothing happens if the bit is already set in that copy.
  */
 static __inline void
 vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu)
 {
 
 	/* Already enabled: avoid the redundant VMCS write. */
 	if ((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0)
 		return;
 
 	vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING;
 	vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
 	VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting");
 }
 
 /*
  * Turn off NMI-window exiting for 'vcpu'.  The control must currently be
  * set in the saved copy of the primary processor-based controls; this is
  * asserted before the bit is cleared and the VMCS is updated.
  */
 static __inline void
 vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu)
 {
 
 	KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0,
 	    ("nmi_window_exiting not set %#x", vmx->cap[vcpu].proc_ctls));
 
 	/* Update the saved copy first, then push it to the VMCS. */
 	vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING;
 	vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
 
 	VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting");
 }
 
 /*
  * Masks applied to the guest interruptibility-state field:
  * - NMI_BLOCKING: bits that must be clear for an NMI to be injected.
  * - HWINTR_BLOCKING: bits that must be clear for a maskable hardware
  *   interrupt to be injected.
  */
 #define	NMI_BLOCKING	(VMCS_INTERRUPTIBILITY_NMI_BLOCKING |		\
 			 VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING)
 #define	HWINTR_BLOCKING	(VMCS_INTERRUPTIBILITY_STI_BLOCKING |		\
 			 VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING)
 
 /*
  * Inject a virtual NMI into 'vcpu' on the next VM-entry and clear the
  * pending NMI request.
  *
  * The caller is expected to have verified that NMI delivery is not blocked
  * and that no other event is already queued in the VM-entry
  * interruption-information field; both conditions are asserted here.
  */
 static void
 vmx_inject_nmi(struct vmx *vmx, int vcpu)
 {
 	uint32_t entry_info, intr_state;
 
 	intr_state = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
 	KASSERT((intr_state & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest "
 	    "interruptibility-state %#x", intr_state));
 
 	entry_info = vmcs_read(VMCS_ENTRY_INTR_INFO);
 	KASSERT((entry_info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid "
 	    "VM-entry interruption information %#x", entry_info));
 
 	/*
 	 * Queue the virtual NMI.  The vector has to be the NMI IDT entry,
 	 * otherwise the VMCS entry check fails.
 	 */
 	entry_info = IDT_NMI | VMCS_INTR_T_NMI | VMCS_INTR_VALID;
 	vmcs_write(VMCS_ENTRY_INTR_INFO, entry_info);
 
 	VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI");
 
 	/* The request has been consumed. */
 	vm_nmi_clear(vmx->vm, vcpu);
 }
 
 /*
  * Decide which event, if any, to inject into 'vcpu' on the next VM-entry.
  *
  * Candidates are considered in this order:
  * 1. an event reported by vm_entry_intinfo(),
  * 2. a pending NMI,
  * 3. a maskable interrupt from the legacy PIC (ExtINT) or the local APIC.
  *
  * When an NMI or a maskable interrupt is pending but cannot be injected
  * right away, the matching "window exiting" control is enabled so that the
  * guest exits as soon as injection becomes possible.
  *
  * With virtual interrupt delivery and no ExtINT pending, local APIC
  * interrupts are handed to vmx_inject_pir() instead of being injected here.
  */
 static void
 vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic)
 {
 	int vector, need_nmi_exiting, extint_pending;
 	uint64_t rflags, entryinfo;
 	uint32_t gi, info;
 
 	if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) {
 		KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry "
 		    "intinfo is not valid: %#lx", __func__, entryinfo));
 
 		info = vmcs_read(VMCS_ENTRY_INTR_INFO);
 		KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject "
 		     "pending exception: %#lx/%#x", __func__, entryinfo, info));
 
 		/* Low 32 bits: interruption info; high 32 bits: error code. */
 		info = entryinfo;
 		vector = info & 0xff;
 		if (vector == IDT_BP || vector == IDT_OF) {
 			/*
 			 * VT-x requires #BP and #OF to be injected as software
 			 * exceptions.
 			 */
 			info &= ~VMCS_INTR_T_MASK;
 			info |= VMCS_INTR_T_SWEXCEPTION;
 		}
 
 		if (info & VMCS_INTR_DEL_ERRCODE)
 			vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32);
 
 		vmcs_write(VMCS_ENTRY_INTR_INFO, info);
 	}
 
 	if (vm_nmi_pending(vmx->vm, vcpu)) {
 		/*
 		 * If there are no conditions blocking NMI injection then
 		 * inject it directly here otherwise enable "NMI window
 		 * exiting" to inject it as soon as we can.
 		 *
 		 * We also check for STI_BLOCKING because some implementations
 		 * don't allow NMI injection in this case. If we are running
 		 * on a processor that doesn't have this restriction it will
 		 * immediately exit and the NMI will be injected in the
 		 * "NMI window exiting" handler.
 		 */
 		need_nmi_exiting = 1;
 		gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
 		if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) {
 			info = vmcs_read(VMCS_ENTRY_INTR_INFO);
 			if ((info & VMCS_INTR_VALID) == 0) {
 				vmx_inject_nmi(vmx, vcpu);
 				need_nmi_exiting = 0;
 			} else {
 				VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI "
 				    "due to VM-entry intr info %#x", info);
 			}
 		} else {
 			VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI due to "
 			    "Guest Interruptibility-state %#x", gi);
 		}
 
 		if (need_nmi_exiting)
 			vmx_set_nmi_window_exiting(vmx, vcpu);
 	}
 
 	extint_pending = vm_extint_pending(vmx->vm, vcpu);
 
 	/* No ExtINT: let virtual interrupt delivery handle APIC vectors. */
 	if (!extint_pending && virtual_interrupt_delivery) {
 		vmx_inject_pir(vlapic);
 		return;
 	}
 
 	/*
 	 * If interrupt-window exiting is already in effect then don't bother
 	 * checking for pending interrupts. This is just an optimization and
 	 * not needed for correctness.
 	 */
 	if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0) {
 		VCPU_CTR0(vmx->vm, vcpu, "Skip interrupt injection due to "
 		    "pending int_window_exiting");
 		return;
 	}
 
 	if (!extint_pending) {
 		/* Ask the local apic for a vector to inject */
 		if (!vlapic_pending_intr(vlapic, &vector))
 			return;
 
 		/*
 		 * From the Intel SDM, Volume 3, Section "Maskable
 		 * Hardware Interrupts":
 		 * - maskable interrupt vectors [16,255] can be delivered
 		 *   through the local APIC.
 		 */
 		KASSERT(vector >= 16 && vector <= 255,
 		    ("invalid vector %d from local APIC", vector));
 	} else {
 		/* Ask the legacy pic for a vector to inject */
 		vatpic_pending_intr(vmx->vm, &vector);
 
 		/*
 		 * From the Intel SDM, Volume 3, Section "Maskable
 		 * Hardware Interrupts":
 		 * - maskable interrupt vectors [0,255] can be delivered
 		 *   through the INTR pin.
 		 */
 		KASSERT(vector >= 0 && vector <= 255,
 		    ("invalid vector %d from INTR", vector));
 	}
 
 	/* Check RFLAGS.IF and the interruptibility state of the guest */
 	rflags = vmcs_read(VMCS_GUEST_RFLAGS);
 	if ((rflags & PSL_I) == 0) {
 		VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
 		    "rflags %#lx", vector, rflags);
 		goto cantinject;
 	}
 
 	gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
 	if (gi & HWINTR_BLOCKING) {
 		VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
 		    "Guest Interruptibility-state %#x", vector, gi);
 		goto cantinject;
 	}
 
 	info = vmcs_read(VMCS_ENTRY_INTR_INFO);
 	if (info & VMCS_INTR_VALID) {
 		/*
 		 * This is expected and could happen for multiple reasons:
 		 * - A vectoring VM-entry was aborted due to astpending
 		 * - A VM-exit happened during event injection.
 		 * - An exception was injected above.
 		 * - An NMI was injected above or after "NMI window exiting"
 		 */
 		VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
 		    "VM-entry intr info %#x", vector, info);
 		goto cantinject;
 	}
 
 	/* Inject the interrupt */
 	info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID;
 	info |= vector;
 	vmcs_write(VMCS_ENTRY_INTR_INFO, info);
 
 	if (!extint_pending) {
 		/* Update the Local APIC ISR */
 		vlapic_intr_accepted(vlapic, vector);
 	} else {
 		vm_extint_clear(vmx->vm, vcpu);
 		vatpic_intr_accepted(vmx->vm, vector);
 
 		/*
 		 * After we accepted the current ExtINT the PIC may
 		 * have posted another one.  If that is the case, set
 		 * the Interrupt Window Exiting execution control so
 		 * we can inject that one too.
 		 *
 		 * Also, interrupt window exiting allows us to inject any
 		 * pending APIC vector that was preempted by the ExtINT
 		 * as soon as possible. This applies both for the software
 		 * emulated vlapic and the hardware assisted virtual APIC.
 		 */
 		vmx_set_int_window_exiting(vmx, vcpu);
 	}
 
 	VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector);
 
 	return;
 
 cantinject:
 	/*
 	 * Set the Interrupt Window Exiting execution control so we can inject
 	 * the interrupt as soon as blocking condition goes away.
 	 */
 	vmx_set_int_window_exiting(vmx, vcpu);
 }
 
 /*
  * When the Virtual NMIs execution control is '1' the logical processor
  * tracks virtual-NMI blocking in the Guest Interruptibility-state field of
  * the VMCS, and an IRET executed in VMX non-root operation removes that
  * blocking.
  *
  * The unblocking happens even when the IRET itself faults.  In that case
  * the hypervisor has to put virtual-NMI blocking back before resuming the
  * guest, which is what this function does.
  */
 static void
 vmx_restore_nmi_blocking(struct vmx *vmx, int vcpuid)
 {
 	uint32_t intr_state;
 
 	VCPU_CTR0(vmx->vm, vcpuid, "Restore Virtual-NMI blocking");
 
 	/* Read-modify-write of the interruptibility state: set the bit. */
 	intr_state = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
 	vmcs_write(VMCS_GUEST_INTERRUPTIBILITY,
 	    intr_state | VMCS_INTERRUPTIBILITY_NMI_BLOCKING);
 }
 
 /*
  * Remove virtual-NMI blocking from the Guest Interruptibility-state field
  * of the current VMCS.
  */
 static void
 vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid)
 {
 	uint32_t intr_state;
 
 	VCPU_CTR0(vmx->vm, vcpuid, "Clear Virtual-NMI blocking");
 
 	/* Read-modify-write of the interruptibility state: clear the bit. */
 	intr_state = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
 	vmcs_write(VMCS_GUEST_INTERRUPTIBILITY,
 	    intr_state & ~VMCS_INTERRUPTIBILITY_NMI_BLOCKING);
 }
 
 /*
  * Sanity check: assert that virtual-NMI blocking is currently set in the
  * Guest Interruptibility-state field.  The 'vmx' and 'vcpuid' arguments are
  * not consulted.
  */
 static void
 vmx_assert_nmi_blocking(struct vmx *vmx, int vcpuid)
 {
 	uint32_t intr_state;
 
 	intr_state = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
 	KASSERT(intr_state & VMCS_INTERRUPTIBILITY_NMI_BLOCKING,
 	    ("NMI blocking is not in effect %#x", intr_state));
 }
 
 /*
  * Emulate XSETBV for 'vcpu'.
  *
  * The requested value is taken from the guest's %edx:%eax and the register
  * index from %rcx.  Invalid requests result in #GP or #UD being injected
  * into the guest; a valid request is loaded into xcr0.  Every path returns
  * HANDLED.  The 'vmexit' argument is not used.
  */
 static int
 vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
 {
 	const struct xsave_limits *xlim;
 	struct vmxctx *ctx;
 	uint64_t newxcr0;
 
 	ctx = &vmx->ctx[vcpu];
 	xlim = vmm_get_xsave_limits();
 
 	/*
 	 * The processor itself raises #GP when xsetbv is executed with
 	 * CPL != 0, so that fault does not need to be emulated here.
 	 */
 
 	/* xcr0 is the only extended control register that is supported. */
 	if (ctx->guest_rcx != 0) {
 		vm_inject_gp(vmx->vm, vcpu);
 		return (HANDLED);
 	}
 
 	/* xcr0 is only handled if both host and guest have XSAVE enabled. */
 	if (!xlim->xsave_enabled ||
 	    (vmcs_read(VMCS_GUEST_CR4) & CR4_XSAVE) == 0) {
 		vm_inject_ud(vmx->vm, vcpu);
 		return (HANDLED);
 	}
 
 	newxcr0 = (ctx->guest_rdx << 32) | (ctx->guest_rax & 0xffffffff);
 
 	/* Bits outside the allowed set may not be requested. */
 	if ((newxcr0 & ~xlim->xcr0_allowed) != 0) {
 		vm_inject_gp(vmx->vm, vcpu);
 		return (HANDLED);
 	}
 
 	/* The x87 state bit is mandatory. */
 	if ((newxcr0 & XFEATURE_ENABLED_X87) == 0) {
 		vm_inject_gp(vmx->vm, vcpu);
 		return (HANDLED);
 	}
 
 	/* AVX (YMM_Hi128) requires SSE. */
 	if ((newxcr0 & XFEATURE_ENABLED_AVX) != 0 &&
 	    (newxcr0 & XFEATURE_AVX) != XFEATURE_AVX) {
 		vm_inject_gp(vmx->vm, vcpu);
 		return (HANDLED);
 	}
 
 	/*
 	 * AVX512 requires base AVX (YMM_Hi128) as well as OpMask,
 	 * ZMM_Hi256, and Hi16_ZMM.
 	 */
 	if ((newxcr0 & XFEATURE_AVX512) != 0 &&
 	    (newxcr0 & (XFEATURE_AVX512 | XFEATURE_AVX)) !=
 	    (XFEATURE_AVX512 | XFEATURE_AVX)) {
 		vm_inject_gp(vmx->vm, vcpu);
 		return (HANDLED);
 	}
 
 	/*
 	 * Intel MPX: the two bound register state flags must be either both
 	 * set or both clear.
 	 */
 	if (((newxcr0 & XFEATURE_ENABLED_BNDREGS) != 0) !=
 	    ((newxcr0 & XFEATURE_ENABLED_BNDCSR) != 0)) {
 		vm_inject_gp(vmx->vm, vcpu);
 		return (HANDLED);
 	}
 
 	/*
 	 * This code runs "inside" vmrun() with the guest's FPU state loaded,
 	 * so writing xcr0 directly changes the guest's xcr0 and not the
 	 * host's.
 	 */
 	load_xcr(0, newxcr0);
 	return (HANDLED);
 }
 
 /*
  * Return the value of the guest general purpose register selected by
  * 'ident' (0-15).  %rsp (ident 4) is read from the VMCS; all other
  * registers come from the saved context of 'vcpu'.  Any other 'ident'
  * panics.
  */
 static uint64_t
 vmx_get_guest_reg(struct vmx *vmx, int vcpu, int ident)
 {
 	const struct vmxctx *ctx;
 	uint64_t val;
 
 	ctx = &vmx->ctx[vcpu];
 
 	switch (ident) {
 	case 0:
 		val = ctx->guest_rax;
 		break;
 	case 1:
 		val = ctx->guest_rcx;
 		break;
 	case 2:
 		val = ctx->guest_rdx;
 		break;
 	case 3:
 		val = ctx->guest_rbx;
 		break;
 	case 4:
 		/* %rsp lives in the VMCS rather than in the saved context. */
 		val = vmcs_read(VMCS_GUEST_RSP);
 		break;
 	case 5:
 		val = ctx->guest_rbp;
 		break;
 	case 6:
 		val = ctx->guest_rsi;
 		break;
 	case 7:
 		val = ctx->guest_rdi;
 		break;
 	case 8:
 		val = ctx->guest_r8;
 		break;
 	case 9:
 		val = ctx->guest_r9;
 		break;
 	case 10:
 		val = ctx->guest_r10;
 		break;
 	case 11:
 		val = ctx->guest_r11;
 		break;
 	case 12:
 		val = ctx->guest_r12;
 		break;
 	case 13:
 		val = ctx->guest_r13;
 		break;
 	case 14:
 		val = ctx->guest_r14;
 		break;
 	case 15:
 		val = ctx->guest_r15;
 		break;
 	default:
 		panic("invalid vmx register %d", ident);
 	}
 
 	return (val);
 }
 
 /*
  * Store 'regval' into the guest general purpose register selected by
  * 'ident' (0-15).  %rsp (ident 4) is written to the VMCS; all other
  * registers go to the saved context of 'vcpu'.  Any other 'ident' panics.
  */
 static void
 vmx_set_guest_reg(struct vmx *vmx, int vcpu, int ident, uint64_t regval)
 {
 	struct vmxctx *ctx;
 
 	ctx = &vmx->ctx[vcpu];
 
 	switch (ident) {
 	case 0:
 		ctx->guest_rax = regval;
 		return;
 	case 1:
 		ctx->guest_rcx = regval;
 		return;
 	case 2:
 		ctx->guest_rdx = regval;
 		return;
 	case 3:
 		ctx->guest_rbx = regval;
 		return;
 	case 4:
 		/* %rsp lives in the VMCS rather than in the saved context. */
 		vmcs_write(VMCS_GUEST_RSP, regval);
 		return;
 	case 5:
 		ctx->guest_rbp = regval;
 		return;
 	case 6:
 		ctx->guest_rsi = regval;
 		return;
 	case 7:
 		ctx->guest_rdi = regval;
 		return;
 	case 8:
 		ctx->guest_r8 = regval;
 		return;
 	case 9:
 		ctx->guest_r9 = regval;
 		return;
 	case 10:
 		ctx->guest_r10 = regval;
 		return;
 	case 11:
 		ctx->guest_r11 = regval;
 		return;
 	case 12:
 		ctx->guest_r12 = regval;
 		return;
 	case 13:
 		ctx->guest_r13 = regval;
 		return;
 	case 14:
 		ctx->guest_r14 = regval;
 		return;
 	case 15:
 		ctx->guest_r15 = regval;
 		return;
 	default:
 		panic("invalid vmx register %d", ident);
 	}
 }
 
 /*
  * Emulate a guest access to %cr0 described by 'exitqual'.
  *
  * Only "mov to %cr0" is handled; anything else returns UNHANDLED.  The
  * value written by the guest is stored in the CR0 read shadow, while the
  * real guest CR0 additionally has 'cr0_ones_mask' forced on and
  * 'cr0_zeros_mask' forced off.
  */
 static int
 vmx_emulate_cr0_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
 {
 	uint64_t efer, entry_ctls, guest_cr0, newval;
 
 	/* We only handle mov to %cr0 at this time */
 	if ((exitqual & 0xf0) != 0x00)
 		return (UNHANDLED);
 
 	/* Bits 11:8 of the qualification select the source register. */
 	newval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf);
 
 	vmcs_write(VMCS_CR0_SHADOW, newval);
 
 	guest_cr0 = (newval | cr0_ones_mask) & ~cr0_zeros_mask;
 	vmcs_write(VMCS_GUEST_CR0, guest_cr0);
 
 	if ((newval & CR0_PG) == 0)
 		return (HANDLED);
 
 	/*
 	 * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and the
 	 * "IA-32e mode guest" bit in the VM-entry controls must be equal.
 	 */
 	efer = vmcs_read(VMCS_GUEST_IA32_EFER);
 	if ((efer & EFER_LME) == 0)
 		return (HANDLED);
 
 	efer |= EFER_LMA;
 	vmcs_write(VMCS_GUEST_IA32_EFER, efer);
 
 	entry_ctls = vmcs_read(VMCS_ENTRY_CTLS);
 	entry_ctls |= VM_ENTRY_GUEST_LMA;
 	vmcs_write(VMCS_ENTRY_CTLS, entry_ctls);
 
 	return (HANDLED);
 }
 
 /*
  * Emulate a guest access to %cr4 described by 'exitqual'.
  *
  * Only "mov to %cr4" is handled; anything else returns UNHANDLED.  The
  * value written by the guest is stored in the CR4 read shadow, while the
  * real guest CR4 additionally has 'cr4_ones_mask' forced on and
  * 'cr4_zeros_mask' forced off.
  */
 static int
 vmx_emulate_cr4_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
 {
 	uint64_t guest_cr4, newval;
 
 	/* We only handle mov to %cr4 at this time */
 	if ((exitqual & 0xf0) != 0x00)
 		return (UNHANDLED);
 
 	/* Bits 11:8 of the qualification select the source register. */
 	newval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf);
 
 	vmcs_write(VMCS_CR4_SHADOW, newval);
 
 	guest_cr4 = (newval | cr4_ones_mask) & ~cr4_zeros_mask;
 	vmcs_write(VMCS_GUEST_CR4, guest_cr4);
 
 	return (HANDLED);
 }
 
 /*
  * Emulate a guest access to %cr8 described by 'exitqual'.
  *
  * Only register moves to or from %cr8 are handled; anything else returns
  * UNHANDLED.  The value is exchanged with the vcpu's virtual local APIC
  * through vlapic_get_cr8()/vlapic_set_cr8().
  */
 static int
 vmx_emulate_cr8_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
 {
 	struct vlapic *vlapic;
 	int gpr;
 
 	/* We only handle mov %cr8 to/from a register at this time. */
 	if ((exitqual & 0xe0) != 0x00)
 		return (UNHANDLED);
 
 	vlapic = vm_lapic(vmx->vm, vcpu);
 	gpr = (exitqual >> 8) & 0xf;
 
 	if ((exitqual & 0x10) == 0) {
 		/* mov to %cr8: hand the register value to the vlapic. */
 		vlapic_set_cr8(vlapic, vmx_get_guest_reg(vmx, vcpu, gpr));
 	} else {
 		/* mov from %cr8: fetch the value from the vlapic. */
 		vmx_set_guest_reg(vmx, vcpu, gpr, vlapic_get_cr8(vlapic));
 	}
 
 	return (HANDLED);
 }
 
 /*
  * Return the guest's current privilege level.
  *
  * From section "Guest Register State" in the Intel SDM: CPL = SS.DPL.  The
  * DPL is taken from bits 6:5 of the SS access rights field.
  */
 static int
 vmx_cpl(void)
 {
 	uint32_t ss_access;
 
 	ss_access = vmcs_read(VMCS_GUEST_SS_ACCESS_RIGHTS);
 	ss_access >>= 5;
 	return (ss_access & 0x3);
 }
 
 /*
  * Derive the guest's cpu mode from the current VMCS:
  * - EFER.LMA set: 64-bit mode if CS.L is set, compatibility mode otherwise;
  * - EFER.LMA clear: protected mode if CR0.PE is set, real mode otherwise.
  */
 static enum vm_cpu_mode
 vmx_cpu_mode(void)
 {
 	uint32_t cs_access;
 
 	if ((vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA) == 0) {
 		if (vmcs_read(VMCS_GUEST_CR0) & CR0_PE)
 			return (CPU_MODE_PROTECTED);
 		return (CPU_MODE_REAL);
 	}
 
 	cs_access = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS);
 	if ((cs_access & 0x2000) == 0)
 		return (CPU_MODE_COMPATIBILITY);
 
 	return (CPU_MODE_64BIT);		/* CS.L = 1 */
 }
 
 /*
  * Derive the guest's paging mode from the current VMCS:
  * - CR0.PG clear: flat (no paging);
  * - CR4.PAE clear: 32-bit paging;
  * - EFER.LME set: 64-bit paging, otherwise PAE paging.
  */
 static enum vm_paging_mode
 vmx_paging_mode(void)
 {
 
 	if ((vmcs_read(VMCS_GUEST_CR0) & CR0_PG) == 0)
 		return (PAGING_MODE_FLAT);
 
 	if ((vmcs_read(VMCS_GUEST_CR4) & CR4_PAE) == 0)
 		return (PAGING_MODE_32);
 
 	if ((vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LME) == 0)
 		return (PAGING_MODE_PAE);
 
 	return (PAGING_MODE_64);
 }
 
 static uint64_t
 inout_str_index(struct vmx *vmx, int vcpuid, int in)
 {
 	uint64_t val;
 	int error;
 	enum vm_reg_name reg;
 
 	reg = in ? VM_REG_GUEST_RDI : VM_REG_GUEST_RSI;
 	error = vmx_getreg(vmx, vcpuid, reg, &val);
 	KASSERT(error == 0, ("%s: vmx_getreg error %d", __func__, error));
 	return (val);
 }
 
 static uint64_t
 inout_str_count(struct vmx *vmx, int vcpuid, int rep)
 {
 	uint64_t val;
 	int error;
 
 	if (rep) {
 		error = vmx_getreg(vmx, vcpuid, VM_REG_GUEST_RCX, &val);
 		KASSERT(!error, ("%s: vmx_getreg error %d", __func__, error));
 	} else {
 		val = 1;
 	}
 	return (val);
 }
 
 /*
  * Decode the address size, in bytes, of a string I/O instruction from
  * bits 9:7 of 'inst_info'.  Encodings 0, 1 and 2 map to 2, 4 and 8 bytes;
  * any other encoding panics.
  */
 static int
 inout_str_addrsize(uint32_t inst_info)
 {
 	uint32_t encoding;
 
 	encoding = (inst_info >> 7) & 0x7;
 	if (encoding > 2)
 		panic("%s: invalid size encoding %d", __func__, encoding);
 
 	/* 0 => 2 (16 bit), 1 => 4 (32 bit), 2 => 8 (64 bit) */
 	return (2 << encoding);
 }
 
 /*
  * Fill in the segment name and descriptor used by a string I/O
  * instruction.  For "in" the segment is always %es; for "out" it is
  * decoded from bits 17:15 of 'inst_info'.
  */
 static void
 inout_str_seginfo(struct vmx *vmx, int vcpuid, uint32_t inst_info, int in,
     struct vm_inout_str *vis)
 {
 	int error;
 
 	if (!in)
 		vis->seg_name = vm_segment_name((inst_info >> 15) & 0x7);
 	else
 		vis->seg_name = VM_REG_GUEST_ES;
 
 	error = vmx_getdesc(vmx, vcpuid, vis->seg_name, &vis->seg_desc);
 	KASSERT(error == 0, ("%s: vmx_getdesc error %d", __func__, error));
 
 	/* XXX modify svm.c to update bit 16 of seg_desc.access (unusable) */
 }
 
 /*
  * Capture the guest's paging state in 'paging': %cr3, the current privilege
  * level, the cpu mode and the paging mode, each obtained from the matching
  * helper.
  */
 static void
 vmx_paging_info(struct vm_guest_paging *paging)
 {
 	paging->cr3 = vmcs_guest_cr3();
 	paging->cpl = vmx_cpl();
 	paging->cpu_mode = vmx_cpu_mode();
 	paging->paging_mode = vmx_paging_mode();
 }
 
 /*
  * Prepare 'vmexit' as an instruction emulation exit for the access at
  * guest physical address 'gpa' / guest linear address 'gla'.
  *
  * The guest paging state is recorded, and 'cs_d' is taken from the CS
  * access rights in protected and compatibility mode; it is 0 in every
  * other cpu mode.
  */
 static void
 vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla)
 {
 	struct vm_guest_paging *pg;
 	uint32_t cs_access;
 
 	vmexit->exitcode = VM_EXITCODE_INST_EMUL;
 	vmexit->u.inst_emul.gpa = gpa;
 	vmexit->u.inst_emul.gla = gla;
 
 	pg = &vmexit->u.inst_emul.paging;
 	vmx_paging_info(pg);
 
 	if (pg->cpu_mode == CPU_MODE_PROTECTED ||
 	    pg->cpu_mode == CPU_MODE_COMPATIBILITY) {
 		cs_access = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS);
 		vmexit->u.inst_emul.cs_d = SEG_DESC_DEF32(cs_access);
 	} else {
 		vmexit->u.inst_emul.cs_d = 0;
 	}
 }
 
 /*
  * Translate an EPT violation qualification into a VM_PROT_* fault type.
  * A data write takes precedence over an instruction fetch; anything else
  * is reported as a read.
  */
 static int
 ept_fault_type(uint64_t ept_qual)
 {
 
 	if (ept_qual & EPT_VIOLATION_DATA_WRITE)
 		return (VM_PROT_WRITE);
 
 	if (ept_qual & EPT_VIOLATION_INST_FETCH)
 		return (VM_PROT_EXECUTE);
 
 	return (VM_PROT_READ);
 }
 
 /*
  * Decide whether an EPT violation with qualification 'ept_qual' is a
  * candidate for instruction emulation.  It must be a data read or write
  * (not an instruction fetch) whose guest-physical address is the
  * translation of a valid guest-linear address.
  */
 static boolean_t
 ept_emulation_fault(uint64_t ept_qual)
 {
 
 	/* EPT fault on an instruction fetch doesn't make sense here */
 	if ((ept_qual & EPT_VIOLATION_INST_FETCH) != 0)
 		return (FALSE);
 
 	/* EPT fault must be a read fault or a write fault */
 	if ((ept_qual & EPT_VIOLATION_DATA_READ) == 0 &&
 	    (ept_qual & EPT_VIOLATION_DATA_WRITE) == 0)
 		return (FALSE);
 
 	/*
 	 * The EPT violation must have been caused by accessing a
 	 * guest-physical address that is a translation of a guest-linear
 	 * address.
 	 */
 	if ((ept_qual & EPT_VIOLATION_GLA_VALID) != 0 &&
 	    (ept_qual & EPT_VIOLATION_XLAT_VALID) != 0)
 		return (TRUE);
 
 	return (FALSE);
 }
 
 /*
  * Return 1 if the "virtualize APIC accesses" control is set in the saved
  * secondary processor-based controls of 'vcpuid', 0 otherwise.
  */
 static __inline int
 apic_access_virtualization(struct vmx *vmx, int vcpuid)
 {
 	uint32_t ctls2;
 
 	ctls2 = vmx->cap[vcpuid].proc_ctls2;
 	if (ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES)
 		return (1);
 	return (0);
 }
 
 /*
  * Return 1 if the "virtualize x2APIC mode" control is set in the saved
  * secondary processor-based controls of 'vcpuid', 0 otherwise.
  */
 static __inline int
 x2apic_virtualization(struct vmx *vmx, int vcpuid)
 {
 	uint32_t ctls2;
 
 	ctls2 = vmx->cap[vcpuid].proc_ctls2;
 	if (ctls2 & PROCBASED2_VIRTUALIZE_X2APIC_MODE)
 		return (1);
 	return (0);
 }
 
 /*
  * Handle an APIC-write VM-exit by dispatching to the vlapic write handler
  * for the register offset encoded in 'qual'.
  *
  * Returns HANDLED when the write was fully processed and UNHANDLED when
  * the offset is not recognized, the ICR handler reports an error or asks
  * for a return to userland, or the exit is unexpected.
  */
 static int
 vmx_handle_apic_write(struct vmx *vmx, int vcpuid, struct vlapic *vlapic,
     uint64_t qual)
 {
 	uint32_t *regs, vec;
 	int err, off;
 	bool to_user;
 
 	off = APIC_WRITE_OFFSET(qual);
 
 	if (!apic_access_virtualization(vmx, vcpuid)) {
 		/*
 		 * In general there should not be any APIC write VM-exits
 		 * unless APIC-access virtualization is enabled.
 		 *
 		 * The one exception is self-IPI virtualization, which can
 		 * legitimately trigger an APIC-write VM-exit.
 		 */
 		if (!x2apic_virtualization(vmx, vcpuid) ||
 		    off != APIC_OFFSET_SELF_IPI)
 			return (UNHANDLED);
 
 		regs = (uint32_t *)(vlapic->apic_page);
 		vec = regs[APIC_OFFSET_SELF_IPI / 4];
 		vlapic_self_ipi_handler(vlapic, vec);
 		return (HANDLED);
 	}
 
 	switch (off) {
 	case APIC_OFFSET_ID:
 		vlapic_id_write_handler(vlapic);
 		break;
 	case APIC_OFFSET_LDR:
 		vlapic_ldr_write_handler(vlapic);
 		break;
 	case APIC_OFFSET_DFR:
 		vlapic_dfr_write_handler(vlapic);
 		break;
 	case APIC_OFFSET_SVR:
 		vlapic_svr_write_handler(vlapic);
 		break;
 	case APIC_OFFSET_ESR:
 		vlapic_esr_write_handler(vlapic);
 		break;
 	case APIC_OFFSET_ICR_LOW:
 		to_user = false;
 		err = vlapic_icrlo_write_handler(vlapic, &to_user);
 		if (err != 0 || to_user)
 			return (UNHANDLED);
 		break;
 	case APIC_OFFSET_CMCI_LVT:
 	case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
 		vlapic_lvt_write_handler(vlapic, off);
 		break;
 	case APIC_OFFSET_TIMER_ICR:
 		vlapic_icrtmr_write_handler(vlapic);
 		break;
 	case APIC_OFFSET_TIMER_DCR:
 		vlapic_dcr_write_handler(vlapic);
 		break;
 	default:
 		return (UNHANDLED);
 	}
 
 	return (HANDLED);
 }
 
 /*
  * Return true if APIC-access virtualization is enabled for 'vcpuid' and
  * 'gpa' falls inside the page that starts at DEFAULT_APIC_BASE.
  */
 static bool
 apic_access_fault(struct vmx *vmx, int vcpuid, uint64_t gpa)
 {
 
 	if (!apic_access_virtualization(vmx, vcpuid))
 		return (false);
 
 	if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE)
 		return (false);
 
 	return (true);
 }
 
 /*
  * Handle an APIC-access VM-exit.
  *
  * If the access type and register offset are among the expected ones,
  * 'vmexit' is converted into an instruction emulation exit for the
  * corresponding address in the APIC page.  The function always returns
  * UNHANDLED; see the comment at the end of the body.
  */
 static int
 vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
 {
 	uint64_t qual;
 	int emulate, off, type;
 
 	if (!apic_access_virtualization(vmx, vcpuid))
 		return (UNHANDLED);
 
 	qual = vmexit->u.vmx.exit_qualification;
 	type = APIC_ACCESS_TYPE(qual);
 	off = APIC_ACCESS_OFFSET(qual);
 
 	emulate = 0;
 	switch (type) {
 	case 0:
 		/*
 		 * Read data access to the following registers is expected.
 		 */
 		switch (off) {
 		case APIC_OFFSET_APR:
 		case APIC_OFFSET_PPR:
 		case APIC_OFFSET_RRR:
 		case APIC_OFFSET_CMCI_LVT:
 		case APIC_OFFSET_TIMER_CCR:
 			emulate = 1;
 			break;
 		default:
 			break;
 		}
 		break;
 	case 1:
 		/*
 		 * Write data access to the following registers is expected.
 		 */
 		switch (off) {
 		case APIC_OFFSET_VER:
 		case APIC_OFFSET_APR:
 		case APIC_OFFSET_PPR:
 		case APIC_OFFSET_RRR:
 		case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
 		case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
 		case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
 		case APIC_OFFSET_CMCI_LVT:
 		case APIC_OFFSET_TIMER_CCR:
 			emulate = 1;
 			break;
 		default:
 			break;
 		}
 		break;
 	default:
 		/* Other access types are never emulated. */
 		break;
 	}
 
 	if (emulate) {
 		vmexit_inst_emul(vmexit, DEFAULT_APIC_BASE + off,
 		    VIE_INVALID_GLA);
 	}
 
 	/*
 	 * This handler returns UNHANDLED whether or not the APIC-access is
 	 * allowed:
 	 * - an allowed access is handled by emulating the instruction that
 	 *   caused the VM-exit (outside the critical section)
 	 * - a disallowed access will be converted to an exitcode of
 	 *   VM_EXITCODE_VMX and will be dealt with in userland.
 	 */
 	return (UNHANDLED);
 }
 
 /*
  * Decode the source of a task switch from bits 31:30 of the exit
  * qualification 'qual': 0 is CALL, 1 is IRET, 2 is JMP and 3 is a task
  * gate in the IDT.
  */
 static enum task_switch_reason
 vmx_task_switch_reason(uint64_t qual)
 {
 	int source;
 
 	source = (qual >> 30) & 0x3;
 
 	if (source == 0)
 		return (TSR_CALL);
 	if (source == 1)
 		return (TSR_IRET);
 	if (source == 2)
 		return (TSR_JMP);
 	if (source == 3)
 		return (TSR_IDT_GATE);
 
 	panic("%s: invalid reason %d", __func__, source);
 }
 
 static int
 vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
 {
 	int error, handled, in;
 	struct vmxctx *vmxctx;
 	struct vlapic *vlapic;
 	struct vm_inout_str *vis;
 	struct vm_task_switch *ts;
 	uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info;
 	uint32_t intr_type, reason;
 	uint64_t exitintinfo, qual, gpa;
 	bool retu;
 
 	CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0);
 	CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0);
 
 	handled = UNHANDLED;
 	vmxctx = &vmx->ctx[vcpu];
 
 	qual = vmexit->u.vmx.exit_qualification;
 	reason = vmexit->u.vmx.exit_reason;
 	vmexit->exitcode = VM_EXITCODE_BOGUS;
 
 	vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1);
 
 	/*
 	 * VM exits that can be triggered during event delivery need to
 	 * be handled specially by re-injecting the event if the IDT
 	 * vectoring information field's valid bit is set.
 	 *
 	 * See "Information for VM Exits During Event Delivery" in Intel SDM
 	 * for details.
 	 */
 	idtvec_info = vmcs_idt_vectoring_info();
 	if (idtvec_info & VMCS_IDT_VEC_VALID) {
 		idtvec_info &= ~(1 << 12); /* clear undefined bit */
 		exitintinfo = idtvec_info;
 		if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
 			idtvec_err = vmcs_idt_vectoring_err();
 			exitintinfo |= (uint64_t)idtvec_err << 32;
 		}
 		error = vm_exit_intinfo(vmx->vm, vcpu, exitintinfo);
 		KASSERT(error == 0, ("%s: vm_set_intinfo error %d",
 		    __func__, error));
 
 		/*
 		 * If 'virtual NMIs' are being used and the VM-exit
 		 * happened while injecting an NMI during the previous
 		 * VM-entry, then clear "blocking by NMI" in the
 		 * Guest Interruptibility-State so the NMI can be
 		 * reinjected on the subsequent VM-entry.
 		 *
 		 * However, if the NMI was being delivered through a task
 		 * gate, then the new task must start execution with NMIs
 		 * blocked so don't clear NMI blocking in this case.
 		 */
 		intr_type = idtvec_info & VMCS_INTR_T_MASK;
 		if (intr_type == VMCS_INTR_T_NMI) {
 			if (reason != EXIT_REASON_TASK_SWITCH)
 				vmx_clear_nmi_blocking(vmx, vcpu);
 			else
 				vmx_assert_nmi_blocking(vmx, vcpu);
 		}
 
 		/*
 		 * Update VM-entry instruction length if the event being
 		 * delivered was a software interrupt or software exception.
 		 */
 		if (intr_type == VMCS_INTR_T_SWINTR ||
 		    intr_type == VMCS_INTR_T_PRIV_SWEXCEPTION ||
 		    intr_type == VMCS_INTR_T_SWEXCEPTION) {
 			vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
 		}
 	}
 
 	switch (reason) {
 	case EXIT_REASON_TASK_SWITCH:
 		ts = &vmexit->u.task_switch;
 		ts->tsssel = qual & 0xffff;
 		ts->reason = vmx_task_switch_reason(qual);
 		ts->ext = 0;
 		ts->errcode_valid = 0;
 		vmx_paging_info(&ts->paging);
 		/*
 		 * If the task switch was due to a CALL, JMP, IRET, software
 		 * interrupt (INT n) or software exception (INT3, INTO),
 		 * then the saved %rip references the instruction that caused
 		 * the task switch. The instruction length field in the VMCS
 		 * is valid in this case.
 		 *
 		 * In all other cases (e.g., NMI, hardware exception) the
 		 * saved %rip is one that would have been saved in the old TSS
 		 * had the task switch completed normally so the instruction
 		 * length field is not needed in this case and is explicitly
 		 * set to 0.
 		 */
 		if (ts->reason == TSR_IDT_GATE) {
 			KASSERT(idtvec_info & VMCS_IDT_VEC_VALID,
 			    ("invalid idtvec_info %#x for IDT task switch",
 			    idtvec_info));
 			intr_type = idtvec_info & VMCS_INTR_T_MASK;
 			if (intr_type != VMCS_INTR_T_SWINTR &&
 			    intr_type != VMCS_INTR_T_SWEXCEPTION &&
 			    intr_type != VMCS_INTR_T_PRIV_SWEXCEPTION) {
 				/* Task switch triggered by external event */
 				ts->ext = 1;
 				vmexit->inst_length = 0;
 				if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
 					ts->errcode_valid = 1;
 					ts->errcode = vmcs_idt_vectoring_err();
 				}
 			}
 		}
 		vmexit->exitcode = VM_EXITCODE_TASK_SWITCH;
 		VCPU_CTR4(vmx->vm, vcpu, "task switch reason %d, tss 0x%04x, "
 		    "%s errcode 0x%016lx", ts->reason, ts->tsssel,
 		    ts->ext ? "external" : "internal",
 		    ((uint64_t)ts->errcode << 32) | ts->errcode_valid);
 		break;
 	case EXIT_REASON_CR_ACCESS:
 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1);
 		switch (qual & 0xf) {
 		case 0:
 			handled = vmx_emulate_cr0_access(vmx, vcpu, qual);
 			break;
 		case 4:
 			handled = vmx_emulate_cr4_access(vmx, vcpu, qual);
 			break;
 		case 8:
 			handled = vmx_emulate_cr8_access(vmx, vcpu, qual);
 			break;
 		}
 		break;
 	case EXIT_REASON_RDMSR:
 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1);
 		retu = false;
 		ecx = vmxctx->guest_rcx;
 		VCPU_CTR1(vmx->vm, vcpu, "rdmsr 0x%08x", ecx);
 		error = emulate_rdmsr(vmx->vm, vcpu, ecx, &retu);
 		if (error) {
 			vmexit->exitcode = VM_EXITCODE_RDMSR;
 			vmexit->u.msr.code = ecx;
 		} else if (!retu) {
 			handled = HANDLED;
 		} else {
 			/* Return to userspace with a valid exitcode */
 			KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
 			    ("emulate_wrmsr retu with bogus exitcode"));
 		}
 		break;
 	case EXIT_REASON_WRMSR:
 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1);
 		retu = false;
 		eax = vmxctx->guest_rax;
 		ecx = vmxctx->guest_rcx;
 		edx = vmxctx->guest_rdx;
 		VCPU_CTR2(vmx->vm, vcpu, "wrmsr 0x%08x value 0x%016lx",
 		    ecx, (uint64_t)edx << 32 | eax);
 		error = emulate_wrmsr(vmx->vm, vcpu, ecx,
 		    (uint64_t)edx << 32 | eax, &retu);
 		if (error) {
 			vmexit->exitcode = VM_EXITCODE_WRMSR;
 			vmexit->u.msr.code = ecx;
 			vmexit->u.msr.wval = (uint64_t)edx << 32 | eax;
 		} else if (!retu) {
 			handled = HANDLED;
 		} else {
 			/* Return to userspace with a valid exitcode */
 			KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
 			    ("emulate_wrmsr retu with bogus exitcode"));
 		}
 		break;
 	case EXIT_REASON_HLT:
 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1);
 		vmexit->exitcode = VM_EXITCODE_HLT;
 		vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS);
 		break;
 	case EXIT_REASON_MTF:
 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1);
 		vmexit->exitcode = VM_EXITCODE_MTRAP;
 		break;
 	case EXIT_REASON_PAUSE:
 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1);
 		vmexit->exitcode = VM_EXITCODE_PAUSE;
 		break;
 	case EXIT_REASON_INTR_WINDOW:
 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1);
 		vmx_clear_int_window_exiting(vmx, vcpu);
 		return (1);
 	case EXIT_REASON_EXT_INTR:
 		/*
 		 * External interrupts serve only to cause VM exits and allow
 		 * the host interrupt handler to run.
 		 *
 		 * If this external interrupt triggers a virtual interrupt
 		 * to a VM, then that state will be recorded by the
 		 * host interrupt handler in the VM's softc. We will inject
 		 * this virtual interrupt during the subsequent VM enter.
 		 */
 		intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
 
 		/*
 		 * XXX: Ignore this exit if VMCS_INTR_VALID is not set.
 		 * This appears to be a bug in VMware Fusion?
 		 */
 		if (!(intr_info & VMCS_INTR_VALID))
 			return (1);
 		KASSERT((intr_info & VMCS_INTR_VALID) != 0 &&
 		    (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR,
 		    ("VM exit interruption info invalid: %#x", intr_info));
 		vmx_trigger_hostintr(intr_info & 0xff);
 
 		/*
 		 * This is special. We want to treat this as an 'handled'
 		 * VM-exit but not increment the instruction pointer.
 		 */
 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1);
 		return (1);
 	case EXIT_REASON_NMI_WINDOW:
 		/* Exit to allow the pending virtual NMI to be injected */
 		if (vm_nmi_pending(vmx->vm, vcpu))
 			vmx_inject_nmi(vmx, vcpu);
 		vmx_clear_nmi_window_exiting(vmx, vcpu);
 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1);
 		return (1);
 	case EXIT_REASON_INOUT:
 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1);
 		vmexit->exitcode = VM_EXITCODE_INOUT;
 		vmexit->u.inout.bytes = (qual & 0x7) + 1;
 		vmexit->u.inout.in = in = (qual & 0x8) ? 1 : 0;
 		vmexit->u.inout.string = (qual & 0x10) ? 1 : 0;
 		vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0;
 		vmexit->u.inout.port = (uint16_t)(qual >> 16);
 		vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax);
 		if (vmexit->u.inout.string) {
 			inst_info = vmcs_read(VMCS_EXIT_INSTRUCTION_INFO);
 			vmexit->exitcode = VM_EXITCODE_INOUT_STR;
 			vis = &vmexit->u.inout_str;
 			vmx_paging_info(&vis->paging);
 			vis->rflags = vmcs_read(VMCS_GUEST_RFLAGS);
 			vis->cr0 = vmcs_read(VMCS_GUEST_CR0);
 			vis->index = inout_str_index(vmx, vcpu, in);
 			vis->count = inout_str_count(vmx, vcpu, vis->inout.rep);
 			vis->addrsize = inout_str_addrsize(inst_info);
 			inout_str_seginfo(vmx, vcpu, inst_info, in, vis);
 		}
 		break;
 	case EXIT_REASON_CPUID:
 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1);
 		handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx);
 		break;
 	case EXIT_REASON_EXCEPTION:
 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXCEPTION, 1);
 		intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
 		KASSERT((intr_info & VMCS_INTR_VALID) != 0,
 		    ("VM exit interruption info invalid: %#x", intr_info));
 
 		/*
 		 * If Virtual NMIs control is 1 and the VM-exit is due to a
 		 * fault encountered during the execution of IRET then we must
 		 * restore the state of "virtual-NMI blocking" before resuming
 		 * the guest.
 		 *
 		 * See "Resuming Guest Software after Handling an Exception".
 		 * See "Information for VM Exits Due to Vectored Events".
 		 */
 		if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
 		    (intr_info & 0xff) != IDT_DF &&
 		    (intr_info & EXIT_QUAL_NMIUDTI) != 0)
 			vmx_restore_nmi_blocking(vmx, vcpu);
 
 		/*
 		 * The NMI has already been handled in vmx_exit_handle_nmi().
 		 */
 		if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI)
 			return (1);
 		break;
 	case EXIT_REASON_EPT_FAULT:
 		/*
 		 * If 'gpa' lies within the address space allocated to
 		 * memory then this must be a nested page fault otherwise
 		 * this must be an instruction that accesses MMIO space.
 		 */
 		gpa = vmcs_gpa();
 		if (vm_mem_allocated(vmx->vm, gpa) ||
 		    apic_access_fault(vmx, vcpu, gpa)) {
 			vmexit->exitcode = VM_EXITCODE_PAGING;
 			vmexit->u.paging.gpa = gpa;
 			vmexit->u.paging.fault_type = ept_fault_type(qual);
 			vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1);
 		} else if (ept_emulation_fault(qual)) {
 			vmexit_inst_emul(vmexit, gpa, vmcs_gla());
 			vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INST_EMUL, 1);
 		}
 		/*
 		 * If Virtual NMIs control is 1 and the VM-exit is due to an
 		 * EPT fault during the execution of IRET then we must restore
 		 * the state of "virtual-NMI blocking" before resuming.
 		 *
 		 * See description of "NMI unblocking due to IRET" in
 		 * "Exit Qualification for EPT Violations".
 		 */
 		if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
 		    (qual & EXIT_QUAL_NMIUDTI) != 0)
 			vmx_restore_nmi_blocking(vmx, vcpu);
 		break;
 	case EXIT_REASON_VIRTUALIZED_EOI:
 		vmexit->exitcode = VM_EXITCODE_IOAPIC_EOI;
 		vmexit->u.ioapic_eoi.vector = qual & 0xFF;
 		vmexit->inst_length = 0;	/* trap-like */
 		break;
 	case EXIT_REASON_APIC_ACCESS:
 		handled = vmx_handle_apic_access(vmx, vcpu, vmexit);
 		break;
 	case EXIT_REASON_APIC_WRITE:
 		/*
 		 * APIC-write VM exit is trap-like so the %rip is already
 		 * pointing to the next instruction.
 		 */
 		vmexit->inst_length = 0;
 		vlapic = vm_lapic(vmx->vm, vcpu);
 		handled = vmx_handle_apic_write(vmx, vcpu, vlapic, qual);
 		break;
 	case EXIT_REASON_XSETBV:
 		handled = vmx_emulate_xsetbv(vmx, vcpu, vmexit);
 		break;
 	default:
 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1);
 		break;
 	}
 
 	if (handled) {
 		/*
 		 * It is possible that control is returned to userland
 		 * even though we were able to handle the VM exit in the
 		 * kernel.
 		 *
 		 * In such a case we want to make sure that the userland
 		 * restarts guest execution at the instruction *after*
 		 * the one we just processed. Therefore we update the
 		 * guest rip in the VMCS and in 'vmexit'.
 		 */
 		vmexit->rip += vmexit->inst_length;
 		vmexit->inst_length = 0;
 		vmcs_write(VMCS_GUEST_RIP, vmexit->rip);
 	} else {
 		if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
 			/*
 			 * If this VM exit was not claimed by anybody then
 			 * treat it as a generic VMX exit.
 			 */
 			vmexit->exitcode = VM_EXITCODE_VMX;
 			vmexit->u.vmx.status = VM_SUCCESS;
 			vmexit->u.vmx.inst_type = 0;
 			vmexit->u.vmx.inst_error = 0;
 		} else {
 			/*
 			 * The exitcode and collateral have been populated.
 			 * The VM exit will be processed further in userland.
 			 */
 		}
 	}
 	return (handled);
 }
 
 static __inline void
 vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit)
 {
 
 	KASSERT(vmxctx->inst_fail_status != VM_SUCCESS,
 	    ("vmx_exit_inst_error: invalid inst_fail_status %d",
 	    vmxctx->inst_fail_status));
 
 	vmexit->inst_length = 0;
 	vmexit->exitcode = VM_EXITCODE_VMX;
 	vmexit->u.vmx.status = vmxctx->inst_fail_status;
 	vmexit->u.vmx.inst_error = vmcs_instruction_error();
 	vmexit->u.vmx.exit_reason = ~0;
 	vmexit->u.vmx.exit_qualification = ~0;
 
 	switch (rc) {
 	case VMX_VMRESUME_ERROR:
 	case VMX_VMLAUNCH_ERROR:
 	case VMX_INVEPT_ERROR:
 		vmexit->u.vmx.inst_type = rc;
 		break;
 	default:
 		panic("vm_exit_inst_error: vmx_enter_guest returned %d", rc);
 	}
 }
 
 /*
  * If the NMI-exiting VM execution control is set to '1' then an NMI in
  * non-root operation causes a VM-exit. NMI blocking is in effect so it is
  * sufficient to simply vector to the NMI handler via a software interrupt.
  * However, this must be done before maskable interrupts are enabled
  * otherwise the "iret" issued by an interrupt handler will incorrectly
  * clear NMI blocking.
  */
 static __inline void
 vmx_exit_handle_nmi(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
 {
 	uint32_t intr_info;
 
 	KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled"));
 
 	if (vmexit->u.vmx.exit_reason != EXIT_REASON_EXCEPTION)
 		return;
 
 	intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
 	KASSERT((intr_info & VMCS_INTR_VALID) != 0,
 	    ("VM exit interruption info invalid: %#x", intr_info));
 
 	if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) {
 		KASSERT((intr_info & 0xff) == IDT_NMI, ("VM exit due "
 		    "to NMI has invalid vector: %#x", intr_info));
 		VCPU_CTR0(vmx->vm, vcpuid, "Vectoring to NMI handler");
 		__asm __volatile("int $2");
 	}
 }
 
 static int
 vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,
     void *rendezvous_cookie, void *suspend_cookie)
 {
 	int rc, handled, launched;
 	struct vmx *vmx;
 	struct vm *vm;
 	struct vmxctx *vmxctx;
 	struct vmcs *vmcs;
 	struct vm_exit *vmexit;
 	struct vlapic *vlapic;
 	uint64_t rip;
 	uint32_t exit_reason;
 
 	vmx = arg;
 	vm = vmx->vm;
 	vmcs = &vmx->vmcs[vcpu];
 	vmxctx = &vmx->ctx[vcpu];
 	vlapic = vm_lapic(vm, vcpu);
 	vmexit = vm_exitinfo(vm, vcpu);
 	launched = 0;
 
 	KASSERT(vmxctx->pmap == pmap,
 	    ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap));
 
 	VMPTRLD(vmcs);
 
 	/*
 	 * XXX
 	 * We do this every time because we may setup the virtual machine
 	 * from a different process than the one that actually runs it.
 	 *
 	 * If the life of a virtual machine was spent entirely in the context
 	 * of a single process we could do this once in vmx_vminit().
 	 */
 	vmcs_write(VMCS_HOST_CR3, rcr3());
 
 	vmcs_write(VMCS_GUEST_RIP, startrip);
 	vmx_set_pcpu_defaults(vmx, vcpu, pmap);
 	do {
 		handled = UNHANDLED;
 
 		/*
 		 * Interrupts are disabled from this point on until the
 		 * guest starts executing. This is done for the following
 		 * reasons:
 		 *
 		 * If an AST is asserted on this thread after the check below,
 		 * then the IPI_AST notification will not be lost, because it
 		 * will cause a VM exit due to external interrupt as soon as
 		 * the guest state is loaded.
 		 *
 		 * A posted interrupt after 'vmx_inject_interrupts()' will
 		 * not be "lost" because it will be held pending in the host
 		 * APIC because interrupts are disabled. The pending interrupt
 		 * will be recognized as soon as the guest state is loaded.
 		 *
 		 * The same reasoning applies to the IPI generated by
 		 * pmap_invalidate_ept().
 		 */
 		disable_intr();
 		vmx_inject_interrupts(vmx, vcpu, vlapic);
 
 		/*
 		 * Check for vcpu suspension after injecting events because
 		 * vmx_inject_interrupts() can suspend the vcpu due to a
 		 * triple fault.
 		 */
 		if (vcpu_suspended(suspend_cookie)) {
 			enable_intr();
 			vm_exit_suspended(vmx->vm, vcpu, vmcs_guest_rip());
 			break;
 		}
 
 		if (vcpu_rendezvous_pending(rendezvous_cookie)) {
 			enable_intr();
 			vm_exit_rendezvous(vmx->vm, vcpu, vmcs_guest_rip());
 			break;
 		}
 
 		if (vcpu_should_yield(vm, vcpu)) {
 			enable_intr();
 			vm_exit_astpending(vmx->vm, vcpu, vmcs_guest_rip());
 			vmx_astpending_trace(vmx, vcpu, vmexit->rip);
 			handled = HANDLED;
 			break;
 		}
 
 		vmx_run_trace(vmx, vcpu);
 		rc = vmx_enter_guest(vmxctx, vmx, launched);
 
 		/* Collect some information for VM exit processing */
 		vmexit->rip = rip = vmcs_guest_rip();
 		vmexit->inst_length = vmexit_instruction_length();
 		vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason();
 		vmexit->u.vmx.exit_qualification = vmcs_exit_qualification();
 
 		if (rc == VMX_GUEST_VMEXIT) {
 			vmx_exit_handle_nmi(vmx, vcpu, vmexit);
 			enable_intr();
 			handled = vmx_exit_process(vmx, vcpu, vmexit);
 		} else {
 			enable_intr();
 			vmx_exit_inst_error(vmxctx, rc, vmexit);
 		}
 		launched = 1;
 		vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled);
 	} while (handled);
 
 	/*
 	 * If a VM exit has been handled then the exitcode must be BOGUS
 	 * If a VM exit is not handled then the exitcode must not be BOGUS
 	 */
 	if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) ||
 	    (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) {
 		panic("Mismatch between handled (%d) and exitcode (%d)",
 		      handled, vmexit->exitcode);
 	}
 
 	if (!handled)
 		vmm_stat_incr(vm, vcpu, VMEXIT_USERSPACE, 1);
 
 	VCPU_CTR1(vm, vcpu, "returning from vmx_run: exitcode %d",
 	    vmexit->exitcode);
 
 	VMCLEAR(vmcs);
 	return (0);
 }
 
 static void
 vmx_vmcleanup(void *arg)
 {
 	int i;
 	struct vmx *vmx = arg;
 
 	if (apic_access_virtualization(vmx, 0))
 		vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE);
 
 	for (i = 0; i < VM_MAXCPU; i++)
 		vpid_free(vmx->state[i].vpid);
 
 	free(vmx, M_VMX);
 
 	return;
 }
 
 static register_t *
 vmxctx_regptr(struct vmxctx *vmxctx, int reg)
 {
 
 	switch (reg) {
 	case VM_REG_GUEST_RAX:
 		return (&vmxctx->guest_rax);
 	case VM_REG_GUEST_RBX:
 		return (&vmxctx->guest_rbx);
 	case VM_REG_GUEST_RCX:
 		return (&vmxctx->guest_rcx);
 	case VM_REG_GUEST_RDX:
 		return (&vmxctx->guest_rdx);
 	case VM_REG_GUEST_RSI:
 		return (&vmxctx->guest_rsi);
 	case VM_REG_GUEST_RDI:
 		return (&vmxctx->guest_rdi);
 	case VM_REG_GUEST_RBP:
 		return (&vmxctx->guest_rbp);
 	case VM_REG_GUEST_R8:
 		return (&vmxctx->guest_r8);
 	case VM_REG_GUEST_R9:
 		return (&vmxctx->guest_r9);
 	case VM_REG_GUEST_R10:
 		return (&vmxctx->guest_r10);
 	case VM_REG_GUEST_R11:
 		return (&vmxctx->guest_r11);
 	case VM_REG_GUEST_R12:
 		return (&vmxctx->guest_r12);
 	case VM_REG_GUEST_R13:
 		return (&vmxctx->guest_r13);
 	case VM_REG_GUEST_R14:
 		return (&vmxctx->guest_r14);
 	case VM_REG_GUEST_R15:
 		return (&vmxctx->guest_r15);
 	case VM_REG_GUEST_CR2:
 		return (&vmxctx->guest_cr2);
 	default:
 		break;
 	}
 	return (NULL);
 }
 
 static int
 vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval)
 {
 	register_t *regp;
 
 	if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
 		*retval = *regp;
 		return (0);
 	} else
 		return (EINVAL);
 }
 
 static int
 vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val)
 {
 	register_t *regp;
 
 	if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
 		*regp = val;
 		return (0);
 	} else
 		return (EINVAL);
 }
 
 static int
+vmx_get_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t *retval)
+{
+	uint64_t gi;
+	int error;
+
+	error = vmcs_getreg(&vmx->vmcs[vcpu], running, 
+	    VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY), &gi);
+	*retval = (gi & HWINTR_BLOCKING) ? 1 : 0;
+	return (error);
+}
+
+static int
+vmx_modify_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t val)
+{
+	struct vmcs *vmcs;
+	uint64_t gi;
+	int error, ident;
+
+	/*
+	 * Forcing the vcpu into an interrupt shadow is not supported.
+	 */
+	if (val) {
+		error = EINVAL;
+		goto done;
+	}
+
+	vmcs = &vmx->vmcs[vcpu];
+	ident = VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY);
+	error = vmcs_getreg(vmcs, running, ident, &gi);
+	if (error == 0) {
+		gi &= ~HWINTR_BLOCKING;
+		error = vmcs_setreg(vmcs, running, ident, gi);
+	}
+done:
+	VCPU_CTR2(vmx->vm, vcpu, "Setting intr_shadow to %#lx %s", val,
+	    error ? "failed" : "succeeded");
+	return (error);
+}
+
+static int
 vmx_shadow_reg(int reg)
 {
 	int shreg;
 
 	shreg = -1;
 
 	switch (reg) {
 	case VM_REG_GUEST_CR0:
 		shreg = VMCS_CR0_SHADOW;
                 break;
         case VM_REG_GUEST_CR4:
 		shreg = VMCS_CR4_SHADOW;
 		break;
 	default:
 		break;
 	}
 
 	return (shreg);
 }
 
 static int
 vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval)
 {
 	int running, hostcpu;
 	struct vmx *vmx = arg;
 
 	running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
 	if (running && hostcpu != curcpu)
 		panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu);
 
+	if (reg == VM_REG_GUEST_INTR_SHADOW)
+		return (vmx_get_intr_shadow(vmx, vcpu, running, retval));
+
 	if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0)
 		return (0);
 
 	return (vmcs_getreg(&vmx->vmcs[vcpu], running, reg, retval));
 }
 
 static int
 vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
 {
 	int error, hostcpu, running, shadow;
 	uint64_t ctls;
 	pmap_t pmap;
 	struct vmx *vmx = arg;
 
 	running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
 	if (running && hostcpu != curcpu)
 		panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu);
+
+	if (reg == VM_REG_GUEST_INTR_SHADOW)
+		return (vmx_modify_intr_shadow(vmx, vcpu, running, val));
 
 	if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0)
 		return (0);
 
 	error = vmcs_setreg(&vmx->vmcs[vcpu], running, reg, val);
 
 	if (error == 0) {
 		/*
 		 * If the "load EFER" VM-entry control is 1 then the
 		 * value of EFER.LMA must be identical to "IA-32e mode guest"
 		 * bit in the VM-entry control.
 		 */
 		if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 &&
 		    (reg == VM_REG_GUEST_EFER)) {
 			vmcs_getreg(&vmx->vmcs[vcpu], running,
 				    VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls);
 			if (val & EFER_LMA)
 				ctls |= VM_ENTRY_GUEST_LMA;
 			else
 				ctls &= ~VM_ENTRY_GUEST_LMA;
 			vmcs_setreg(&vmx->vmcs[vcpu], running,
 				    VMCS_IDENT(VMCS_ENTRY_CTLS), ctls);
 		}
 
 		shadow = vmx_shadow_reg(reg);
 		if (shadow > 0) {
 			/*
 			 * Store the unmodified value in the shadow
 			 */			
 			error = vmcs_setreg(&vmx->vmcs[vcpu], running,
 				    VMCS_IDENT(shadow), val);
 		}
 
 		if (reg == VM_REG_GUEST_CR3) {
 			/*
 			 * Invalidate the guest vcpu's TLB mappings to emulate
 			 * the behavior of updating %cr3.
 			 *
 			 * XXX the processor retains global mappings when %cr3
 			 * is updated but vmx_invvpid() does not.
 			 */
 			pmap = vmx->ctx[vcpu].pmap;
 			vmx_invvpid(vmx, vcpu, pmap, running);
 		}
 	}
 
 	return (error);
 }
 
 static int
 vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
 {
 	int hostcpu, running;
 	struct vmx *vmx = arg;
 
 	running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
 	if (running && hostcpu != curcpu)
 		panic("vmx_getdesc: %s%d is running", vm_name(vmx->vm), vcpu);
 
 	return (vmcs_getdesc(&vmx->vmcs[vcpu], running, reg, desc));
 }
 
 static int
 vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
 {
 	int hostcpu, running;
 	struct vmx *vmx = arg;
 
 	running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
 	if (running && hostcpu != curcpu)
 		panic("vmx_setdesc: %s%d is running", vm_name(vmx->vm), vcpu);
 
 	return (vmcs_setdesc(&vmx->vmcs[vcpu], running, reg, desc));
 }
 
 static int
 vmx_getcap(void *arg, int vcpu, int type, int *retval)
 {
 	struct vmx *vmx = arg;
 	int vcap;
 	int ret;
 
 	ret = ENOENT;
 
 	vcap = vmx->cap[vcpu].set;
 
 	switch (type) {
 	case VM_CAP_HALT_EXIT:
 		if (cap_halt_exit)
 			ret = 0;
 		break;
 	case VM_CAP_PAUSE_EXIT:
 		if (cap_pause_exit)
 			ret = 0;
 		break;
 	case VM_CAP_MTRAP_EXIT:
 		if (cap_monitor_trap)
 			ret = 0;
 		break;
 	case VM_CAP_UNRESTRICTED_GUEST:
 		if (cap_unrestricted_guest)
 			ret = 0;
 		break;
 	case VM_CAP_ENABLE_INVPCID:
 		if (cap_invpcid)
 			ret = 0;
 		break;
 	default:
 		break;
 	}
 
 	if (ret == 0)
 		*retval = (vcap & (1 << type)) ? 1 : 0;
 
 	return (ret);
 }
 
 static int
 vmx_setcap(void *arg, int vcpu, int type, int val)
 {
 	struct vmx *vmx = arg;
 	struct vmcs *vmcs = &vmx->vmcs[vcpu];
 	uint32_t baseval;
 	uint32_t *pptr;
 	int error;
 	int flag;
 	int reg;
 	int retval;
 
 	retval = ENOENT;
 	pptr = NULL;
 
 	switch (type) {
 	case VM_CAP_HALT_EXIT:
 		if (cap_halt_exit) {
 			retval = 0;
 			pptr = &vmx->cap[vcpu].proc_ctls;
 			baseval = *pptr;
 			flag = PROCBASED_HLT_EXITING;
 			reg = VMCS_PRI_PROC_BASED_CTLS;
 		}
 		break;
 	case VM_CAP_MTRAP_EXIT:
 		if (cap_monitor_trap) {
 			retval = 0;
 			pptr = &vmx->cap[vcpu].proc_ctls;
 			baseval = *pptr;
 			flag = PROCBASED_MTF;
 			reg = VMCS_PRI_PROC_BASED_CTLS;
 		}
 		break;
 	case VM_CAP_PAUSE_EXIT:
 		if (cap_pause_exit) {
 			retval = 0;
 			pptr = &vmx->cap[vcpu].proc_ctls;
 			baseval = *pptr;
 			flag = PROCBASED_PAUSE_EXITING;
 			reg = VMCS_PRI_PROC_BASED_CTLS;
 		}
 		break;
 	case VM_CAP_UNRESTRICTED_GUEST:
 		if (cap_unrestricted_guest) {
 			retval = 0;
 			pptr = &vmx->cap[vcpu].proc_ctls2;
 			baseval = *pptr;
 			flag = PROCBASED2_UNRESTRICTED_GUEST;
 			reg = VMCS_SEC_PROC_BASED_CTLS;
 		}
 		break;
 	case VM_CAP_ENABLE_INVPCID:
 		if (cap_invpcid) {
 			retval = 0;
 			pptr = &vmx->cap[vcpu].proc_ctls2;
 			baseval = *pptr;
 			flag = PROCBASED2_ENABLE_INVPCID;
 			reg = VMCS_SEC_PROC_BASED_CTLS;
 		}
 		break;
 	default:
 		break;
 	}
 
 	if (retval == 0) {
 		if (val) {
 			baseval |= flag;
 		} else {
 			baseval &= ~flag;
 		}
 		VMPTRLD(vmcs);
 		error = vmwrite(reg, baseval);
 		VMCLEAR(vmcs);
 
 		if (error) {
 			retval = error;
 		} else {
 			/*
 			 * Update optional stored flags, and record
 			 * setting
 			 */
 			if (pptr != NULL) {
 				*pptr = baseval;
 			}
 
 			if (val) {
 				vmx->cap[vcpu].set |= (1 << type);
 			} else {
 				vmx->cap[vcpu].set &= ~(1 << type);
 			}
 		}
 	}
 
         return (retval);
 }
 
 struct vlapic_vtx {
 	struct vlapic	vlapic;
 	struct pir_desc	*pir_desc;
 	struct vmx	*vmx;
 };
 
 #define	VMX_CTR_PIR(vm, vcpuid, pir_desc, notify, vector, level, msg)	\
 do {									\
 	VCPU_CTR2(vm, vcpuid, msg " assert %s-triggered vector %d",	\
 	    level ? "level" : "edge", vector);				\
 	VCPU_CTR1(vm, vcpuid, msg " pir0 0x%016lx", pir_desc->pir[0]);	\
 	VCPU_CTR1(vm, vcpuid, msg " pir1 0x%016lx", pir_desc->pir[1]);	\
 	VCPU_CTR1(vm, vcpuid, msg " pir2 0x%016lx", pir_desc->pir[2]);	\
 	VCPU_CTR1(vm, vcpuid, msg " pir3 0x%016lx", pir_desc->pir[3]);	\
 	VCPU_CTR1(vm, vcpuid, msg " notify: %s", notify ? "yes" : "no");\
 } while (0)
 
 /*
  * vlapic->ops handlers that utilize the APICv hardware assist described in
  * Chapter 29 of the Intel SDM.
  */
 static int
 vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level)
 {
 	struct vlapic_vtx *vlapic_vtx;
 	struct pir_desc *pir_desc;
 	uint64_t mask;
 	int idx, notify;
 
 	vlapic_vtx = (struct vlapic_vtx *)vlapic;
 	pir_desc = vlapic_vtx->pir_desc;
 
 	/*
 	 * Keep track of interrupt requests in the PIR descriptor. This is
 	 * because the virtual APIC page pointed to by the VMCS cannot be
 	 * modified if the vcpu is running.
 	 */
 	idx = vector / 64;
 	mask = 1UL << (vector % 64);
 	atomic_set_long(&pir_desc->pir[idx], mask);
 	notify = atomic_cmpset_long(&pir_desc->pending, 0, 1);
 
 	VMX_CTR_PIR(vlapic->vm, vlapic->vcpuid, pir_desc, notify, vector,
 	    level, "vmx_set_intr_ready");
 	return (notify);
 }
 
 static int
 vmx_pending_intr(struct vlapic *vlapic, int *vecptr)
 {
 	struct vlapic_vtx *vlapic_vtx;
 	struct pir_desc *pir_desc;
 	struct LAPIC *lapic;
 	uint64_t pending, pirval;
 	uint32_t ppr, vpr;
 	int i;
 
 	/*
 	 * This function is only expected to be called from the 'HLT' exit
 	 * handler which does not care about the vector that is pending.
 	 */
 	KASSERT(vecptr == NULL, ("vmx_pending_intr: vecptr must be NULL"));
 
 	vlapic_vtx = (struct vlapic_vtx *)vlapic;
 	pir_desc = vlapic_vtx->pir_desc;
 
 	pending = atomic_load_acq_long(&pir_desc->pending);
 	if (!pending)
 		return (0);	/* common case */
 
 	/*
 	 * If there is an interrupt pending then it will be recognized only
 	 * if its priority is greater than the processor priority.
 	 *
 	 * Special case: if the processor priority is zero then any pending
 	 * interrupt will be recognized.
 	 */
 	lapic = vlapic->apic_page;
 	ppr = lapic->ppr & 0xf0;
 	if (ppr == 0)
 		return (1);
 
 	VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "HLT with non-zero PPR %d",
 	    lapic->ppr);
 
 	for (i = 3; i >= 0; i--) {
 		pirval = pir_desc->pir[i];
 		if (pirval != 0) {
 			vpr = (i * 64 + flsl(pirval) - 1) & 0xf0;
 			return (vpr > ppr);
 		}
 	}
 	return (0);
 }
 
 static void
 vmx_intr_accepted(struct vlapic *vlapic, int vector)
 {
 
 	panic("vmx_intr_accepted: not expected to be called");
 }
 
 static void
 vmx_set_tmr(struct vlapic *vlapic, int vector, bool level)
 {
 	struct vlapic_vtx *vlapic_vtx;
 	struct vmx *vmx;
 	struct vmcs *vmcs;
 	uint64_t mask, val;
 
 	KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector));
 	KASSERT(!vcpu_is_running(vlapic->vm, vlapic->vcpuid, NULL),
 	    ("vmx_set_tmr: vcpu cannot be running"));
 
 	vlapic_vtx = (struct vlapic_vtx *)vlapic;
 	vmx = vlapic_vtx->vmx;
 	vmcs = &vmx->vmcs[vlapic->vcpuid];
 	mask = 1UL << (vector % 64);
 
 	VMPTRLD(vmcs);
 	val = vmcs_read(VMCS_EOI_EXIT(vector));
 	if (level)
 		val |= mask;
 	else
 		val &= ~mask;
 	vmcs_write(VMCS_EOI_EXIT(vector), val);
 	VMCLEAR(vmcs);
 }
 
 static void
 vmx_enable_x2apic_mode(struct vlapic *vlapic)
 {
 	struct vmx *vmx;
 	struct vmcs *vmcs;
 	uint32_t proc_ctls2;
 	int vcpuid, error;
 
 	vcpuid = vlapic->vcpuid;
 	vmx = ((struct vlapic_vtx *)vlapic)->vmx;
 	vmcs = &vmx->vmcs[vcpuid];
 
 	proc_ctls2 = vmx->cap[vcpuid].proc_ctls2;
 	KASSERT((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) != 0,
 	    ("%s: invalid proc_ctls2 %#x", __func__, proc_ctls2));
 
 	proc_ctls2 &= ~PROCBASED2_VIRTUALIZE_APIC_ACCESSES;
 	proc_ctls2 |= PROCBASED2_VIRTUALIZE_X2APIC_MODE;
 	vmx->cap[vcpuid].proc_ctls2 = proc_ctls2;
 
 	VMPTRLD(vmcs);
 	vmcs_write(VMCS_SEC_PROC_BASED_CTLS, proc_ctls2);
 	VMCLEAR(vmcs);
 
 	if (vlapic->vcpuid == 0) {
 		/*
 		 * The nested page table mappings are shared by all vcpus
 		 * so unmap the APIC access page just once.
 		 */
 		error = vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE);
 		KASSERT(error == 0, ("%s: vm_unmap_mmio error %d",
 		    __func__, error));
 
 		/*
 		 * The MSR bitmap is shared by all vcpus so modify it only
 		 * once in the context of vcpu 0.
 		 */
 		error = vmx_allow_x2apic_msrs(vmx);
 		KASSERT(error == 0, ("%s: vmx_allow_x2apic_msrs error %d",
 		    __func__, error));
 	}
 }
 
 static void
 vmx_post_intr(struct vlapic *vlapic, int hostcpu)
 {
 
 	ipi_cpu(hostcpu, pirvec);
 }
 
 /*
  * Transfer the pending interrupts in the PIR descriptor to the IRR
  * in the virtual APIC page.
  */
 static void
 vmx_inject_pir(struct vlapic *vlapic)
 {
 	struct vlapic_vtx *vlapic_vtx;
 	struct pir_desc *pir_desc;
 	struct LAPIC *lapic;
 	uint64_t val, pirval;
 	int rvi, pirbase = -1;
 	uint16_t intr_status_old, intr_status_new;
 
 	vlapic_vtx = (struct vlapic_vtx *)vlapic;
 	pir_desc = vlapic_vtx->pir_desc;
 	if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) {
 		VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: "
 		    "no posted interrupt pending");
 		return;
 	}
 
 	pirval = 0;
 	pirbase = -1;
 	lapic = vlapic->apic_page;
 
 	val = atomic_readandclear_long(&pir_desc->pir[0]);
 	if (val != 0) {
 		lapic->irr0 |= val;
 		lapic->irr1 |= val >> 32;
 		pirbase = 0;
 		pirval = val;
 	}
 
 	val = atomic_readandclear_long(&pir_desc->pir[1]);
 	if (val != 0) {
 		lapic->irr2 |= val;
 		lapic->irr3 |= val >> 32;
 		pirbase = 64;
 		pirval = val;
 	}
 
 	val = atomic_readandclear_long(&pir_desc->pir[2]);
 	if (val != 0) {
 		lapic->irr4 |= val;
 		lapic->irr5 |= val >> 32;
 		pirbase = 128;
 		pirval = val;
 	}
 
 	val = atomic_readandclear_long(&pir_desc->pir[3]);
 	if (val != 0) {
 		lapic->irr6 |= val;
 		lapic->irr7 |= val >> 32;
 		pirbase = 192;
 		pirval = val;
 	}
 
 	VLAPIC_CTR_IRR(vlapic, "vmx_inject_pir");
 
 	/*
 	 * Update RVI so the processor can evaluate pending virtual
 	 * interrupts on VM-entry.
 	 *
 	 * It is possible for pirval to be 0 here, even though the
 	 * pending bit has been set. The scenario is:
 	 * CPU-Y is sending a posted interrupt to CPU-X, which
 	 * is running a guest and processing posted interrupts in h/w.
 	 * CPU-X will eventually exit and the state seen in s/w is
 	 * the pending bit set, but no PIR bits set.
 	 *
 	 *      CPU-X                      CPU-Y
 	 *   (vm running)                (host running)
 	 *   rx posted interrupt
 	 *   CLEAR pending bit
 	 *				 SET PIR bit
 	 *   READ/CLEAR PIR bits
 	 *				 SET pending bit
 	 *   (vm exit)
 	 *   pending bit set, PIR 0
 	 */
 	if (pirval != 0) {
 		rvi = pirbase + flsl(pirval) - 1;
 		intr_status_old = vmcs_read(VMCS_GUEST_INTR_STATUS);
 		intr_status_new = (intr_status_old & 0xFF00) | rvi;
 		if (intr_status_new > intr_status_old) {
 			vmcs_write(VMCS_GUEST_INTR_STATUS, intr_status_new);
 			VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: "
 			    "guest_intr_status changed from 0x%04x to 0x%04x",
 			    intr_status_old, intr_status_new);
 		}
 	}
 }
 
 static struct vlapic *
 vmx_vlapic_init(void *arg, int vcpuid)
 {
 	struct vmx *vmx;
 	struct vlapic *vlapic;
 	struct vlapic_vtx *vlapic_vtx;
 	
 	vmx = arg;
 
 	vlapic = malloc(sizeof(struct vlapic_vtx), M_VLAPIC, M_WAITOK | M_ZERO);
 	vlapic->vm = vmx->vm;
 	vlapic->vcpuid = vcpuid;
 	vlapic->apic_page = (struct LAPIC *)&vmx->apic_page[vcpuid];
 
 	vlapic_vtx = (struct vlapic_vtx *)vlapic;
 	vlapic_vtx->pir_desc = &vmx->pir_desc[vcpuid];
 	vlapic_vtx->vmx = vmx;
 
 	if (virtual_interrupt_delivery) {
 		vlapic->ops.set_intr_ready = vmx_set_intr_ready;
 		vlapic->ops.pending_intr = vmx_pending_intr;
 		vlapic->ops.intr_accepted = vmx_intr_accepted;
 		vlapic->ops.set_tmr = vmx_set_tmr;
 		vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode;
 	}
 
 	if (posted_interrupts)
 		vlapic->ops.post_intr = vmx_post_intr;
 
 	vlapic_init(vlapic);
 
 	return (vlapic);
 }
 
 static void
 vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic)
 {
 
 	vlapic_cleanup(vlapic);
 	free(vlapic, M_VLAPIC);
 }
 
 struct vmm_ops vmm_ops_intel = {
 	vmx_init,
 	vmx_cleanup,
 	vmx_restore,
 	vmx_vminit,
 	vmx_run,
 	vmx_vmcleanup,
 	vmx_getreg,
 	vmx_setreg,
 	vmx_getdesc,
 	vmx_setdesc,
 	vmx_getcap,
 	vmx_setcap,
 	ept_vmspace_alloc,
 	ept_vmspace_free,
 	vmx_vlapic_init,
 	vmx_vlapic_cleanup,
 };
Index: user/ae/inet6/sys/amd64/vmm/vmm.c
===================================================================
--- user/ae/inet6/sys/amd64/vmm/vmm.c	(revision 271452)
+++ user/ae/inet6/sys/amd64/vmm/vmm.c	(revision 271453)
@@ -1,2310 +1,2326 @@
 /*-
  * Copyright (c) 2011 NetApp, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/sysctl.h>
 #include <sys/malloc.h>
 #include <sys/pcpu.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/systm.h>
 
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_param.h>
 
 #include <machine/cpu.h>
 #include <machine/vm.h>
 #include <machine/pcb.h>
 #include <machine/smp.h>
 #include <x86/psl.h>
 #include <x86/apicreg.h>
 #include <machine/vmparam.h>
 
 #include <machine/vmm.h>
 #include <machine/vmm_dev.h>
 #include <machine/vmm_instruction_emul.h>
 
 #include "vmm_ioport.h"
 #include "vmm_ktr.h"
 #include "vmm_host.h"
 #include "vmm_mem.h"
 #include "vmm_util.h"
 #include "vatpic.h"
 #include "vatpit.h"
 #include "vhpet.h"
 #include "vioapic.h"
 #include "vlapic.h"
 #include "vmm_msr.h"
 #include "vmm_ipi.h"
 #include "vmm_stat.h"
 #include "vmm_lapic.h"
 
 #include "io/ppt.h"
 #include "io/iommu.h"
 
 struct vlapic;
 
 /*
  * Initialization:
  * (a) allocated when vcpu is created
  * (i) initialized when vcpu is created and when it is reinitialized
  * (o) initialized the first time the vcpu is created
  * (x) initialized before use
  */
 struct vcpu {
 	struct mtx 	mtx;		/* (o) protects 'state' and 'hostcpu' */
 	enum vcpu_state	state;		/* (o) vcpu state */
 	int		hostcpu;	/* (o) vcpu's host cpu */
 	struct vlapic	*vlapic;	/* (i) APIC device model */
 	enum x2apic_state x2apic_state;	/* (i) APIC mode */
 	uint64_t	exitintinfo;	/* (i) events pending at VM exit */
 	int		nmi_pending;	/* (i) NMI pending */
 	int		extint_pending;	/* (i) INTR pending */
 	struct vm_exception exception;	/* (x) exception collateral */
 	int	exception_pending;	/* (i) exception pending */
 	struct savefpu	*guestfpu;	/* (a,i) guest fpu state */
 	uint64_t	guest_xcr0;	/* (i) guest %xcr0 register */
 	void		*stats;		/* (a,i) statistics */
 	uint64_t guest_msrs[VMM_MSR_NUM]; /* (i) emulated MSRs */
 	struct vm_exit	exitinfo;	/* (x) exit reason and collateral */
 };
 
 #define	vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
 #define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
 #define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
 #define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
 #define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)
 
 struct mem_seg {
 	vm_paddr_t	gpa;
 	size_t		len;
 	boolean_t	wired;
 	vm_object_t	object;
 };
 #define	VM_MAX_MEMORY_SEGMENTS	2
 
 /*
  * Initialization:
  * (o) initialized the first time the VM is created
  * (i) initialized when VM is created and when it is reinitialized
  * (x) initialized before use
  */
 struct vm {
 	void		*cookie;		/* (i) cpu-specific data */
 	void		*iommu;			/* (x) iommu-specific data */
 	struct vhpet	*vhpet;			/* (i) virtual HPET */
 	struct vioapic	*vioapic;		/* (i) virtual ioapic */
 	struct vatpic	*vatpic;		/* (i) virtual atpic */
 	struct vatpit	*vatpit;		/* (i) virtual atpit */
 	volatile cpuset_t active_cpus;		/* (i) active vcpus */
 	int		suspend;		/* (i) stop VM execution */
 	volatile cpuset_t suspended_cpus; 	/* (i) suspended vcpus */
 	volatile cpuset_t halted_cpus;		/* (x) cpus in a hard halt */
 	cpuset_t	rendezvous_req_cpus;	/* (x) rendezvous requested */
 	cpuset_t	rendezvous_done_cpus;	/* (x) rendezvous finished */
 	void		*rendezvous_arg;	/* (x) rendezvous func/arg */
 	vm_rendezvous_func_t rendezvous_func;
 	struct mtx	rendezvous_mtx;		/* (o) rendezvous lock */
 	int		num_mem_segs;		/* (o) guest memory segments */
 	struct mem_seg	mem_segs[VM_MAX_MEMORY_SEGMENTS];
 	struct vmspace	*vmspace;		/* (o) guest's address space */
 	char		name[VM_MAX_NAMELEN];	/* (o) virtual machine name */
 	struct vcpu	vcpu[VM_MAXCPU];	/* (i) guest vcpus */
 };
 
 static int vmm_initialized;
 
 static struct vmm_ops *ops;
 #define	VMM_INIT(num)	(ops != NULL ? (*ops->init)(num) : 0)
 #define	VMM_CLEANUP()	(ops != NULL ? (*ops->cleanup)() : 0)
 #define	VMM_RESUME()	(ops != NULL ? (*ops->resume)() : 0)
 
 #define	VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL)
 #define	VMRUN(vmi, vcpu, rip, pmap, rptr, sptr) \
 	(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, rptr, sptr) : ENXIO)
 #define	VMCLEANUP(vmi)	(ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
 #define	VMSPACE_ALLOC(min, max) \
 	(ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
 #define	VMSPACE_FREE(vmspace) \
 	(ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)
 #define	VMGETREG(vmi, vcpu, num, retval)		\
 	(ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
 #define	VMSETREG(vmi, vcpu, num, val)		\
 	(ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
 #define	VMGETDESC(vmi, vcpu, num, desc)		\
 	(ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
 #define	VMSETDESC(vmi, vcpu, num, desc)		\
 	(ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
 #define	VMGETCAP(vmi, vcpu, num, retval)	\
 	(ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
 #define	VMSETCAP(vmi, vcpu, num, val)		\
 	(ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)
 #define	VLAPIC_INIT(vmi, vcpu)			\
 	(ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL)
 #define	VLAPIC_CLEANUP(vmi, vlapic)		\
 	(ops != NULL ? (*ops->vlapic_cleanup)(vmi, vlapic) : NULL)
 
 #define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
 #define	fpu_stop_emulating()	clts()
 
 static MALLOC_DEFINE(M_VM, "vm", "vm");
 CTASSERT(VMM_MSR_NUM <= 64);	/* msr_mask can keep track of up to 64 msrs */
 
 /* statistics */
 static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
 
 SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);
 
 /*
  * Halt the guest if all vcpus are executing a HLT instruction with
  * interrupts disabled.
  */
 static int halt_detection_enabled = 1;
 SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN,
     &halt_detection_enabled, 0,
     "Halt VM if all vcpus execute HLT with interrupts disabled");
 
 static int vmm_ipinum;
 SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
     "IPI vector used for vcpu notifications");
 
 static void
 vcpu_cleanup(struct vm *vm, int i, bool destroy)
 {
 	struct vcpu *vcpu = &vm->vcpu[i];
 
 	VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
 	if (destroy) {
 		vmm_stat_free(vcpu->stats);	
 		fpu_save_area_free(vcpu->guestfpu);
 	}
 }
 
 static void
 vcpu_init(struct vm *vm, int vcpu_id, bool create)
 {
 	struct vcpu *vcpu;
 
 	KASSERT(vcpu_id >= 0 && vcpu_id < VM_MAXCPU,
 	    ("vcpu_init: invalid vcpu %d", vcpu_id));
 	  
 	vcpu = &vm->vcpu[vcpu_id];
 
 	if (create) {
 		KASSERT(!vcpu_lock_initialized(vcpu), ("vcpu %d already "
 		    "initialized", vcpu_id));
 		vcpu_lock_init(vcpu);
 		vcpu->state = VCPU_IDLE;
 		vcpu->hostcpu = NOCPU;
 		vcpu->guestfpu = fpu_save_area_alloc();
 		vcpu->stats = vmm_stat_alloc();
 	}
 
 	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
 	vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
 	vcpu->exitintinfo = 0;
 	vcpu->nmi_pending = 0;
 	vcpu->extint_pending = 0;
 	vcpu->exception_pending = 0;
 	vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
 	fpu_save_area_reset(vcpu->guestfpu);
 	vmm_stat_init(vcpu->stats);
 	guest_msrs_init(vm, vcpu_id);
 }
 
 struct vm_exit *
 vm_exitinfo(struct vm *vm, int cpuid)
 {
 	struct vcpu *vcpu;
 
 	if (cpuid < 0 || cpuid >= VM_MAXCPU)
 		panic("vm_exitinfo: invalid cpuid %d", cpuid);
 
 	vcpu = &vm->vcpu[cpuid];
 
 	return (&vcpu->exitinfo);
 }
 
 static void
 vmm_resume(void)
 {
 	VMM_RESUME();
 }
 
 static int
 vmm_init(void)
 {
 	int error;
 
 	vmm_host_state_init();
 
 	vmm_ipinum = vmm_ipi_alloc();
 	if (vmm_ipinum == 0)
 		vmm_ipinum = IPI_AST;
 
 	error = vmm_mem_init();
 	if (error)
 		return (error);
 	
 	if (vmm_is_intel())
 		ops = &vmm_ops_intel;
 	else if (vmm_is_amd())
 		ops = &vmm_ops_amd;
 	else
 		return (ENXIO);
 
 	vmm_msr_init();
 	vmm_resume_p = vmm_resume;
 
 	return (VMM_INIT(vmm_ipinum));
 }
 
 static int
 vmm_handler(module_t mod, int what, void *arg)
 {
 	int error;
 
 	switch (what) {
 	case MOD_LOAD:
 		vmmdev_init();
 		if (ppt_avail_devices() > 0)
 			iommu_init();
 		error = vmm_init();
 		if (error == 0)
 			vmm_initialized = 1;
 		break;
 	case MOD_UNLOAD:
 		error = vmmdev_cleanup();
 		if (error == 0) {
 			vmm_resume_p = NULL;
 			iommu_cleanup();
 			if (vmm_ipinum != IPI_AST)
 				vmm_ipi_free(vmm_ipinum);
 			error = VMM_CLEANUP();
 			/*
 			 * Something bad happened - prevent new
 			 * VMs from being created
 			 */
 			if (error)
 				vmm_initialized = 0;
 		}
 		break;
 	default:
 		error = 0;
 		break;
 	}
 	return (error);
 }
 
 static moduledata_t vmm_kmod = {
 	"vmm",
 	vmm_handler,
 	NULL
 };
 
 /*
  * vmm initialization has the following dependencies:
  *
  * - iommu initialization must happen after the pci passthru driver has had
  *   a chance to attach to any passthru devices (after SI_SUB_CONFIGURE).
  *
  * - VT-x initialization requires smp_rendezvous() and therefore must happen
  *   after SMP is fully functional (after SI_SUB_SMP).
  */
 DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
 MODULE_VERSION(vmm, 1);
 
 static void
 vm_init(struct vm *vm, bool create)
 {
 	int i;
 
 	vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace));
 	vm->iommu = NULL;
 	vm->vioapic = vioapic_init(vm);
 	vm->vhpet = vhpet_init(vm);
 	vm->vatpic = vatpic_init(vm);
 	vm->vatpit = vatpit_init(vm);
 
 	CPU_ZERO(&vm->active_cpus);
 
 	vm->suspend = 0;
 	CPU_ZERO(&vm->suspended_cpus);
 
 	for (i = 0; i < VM_MAXCPU; i++)
 		vcpu_init(vm, i, create);
 }
 
 int
 vm_create(const char *name, struct vm **retvm)
 {
 	struct vm *vm;
 	struct vmspace *vmspace;
 
 	/*
 	 * If vmm.ko could not be successfully initialized then don't attempt
 	 * to create the virtual machine.
 	 */
 	if (!vmm_initialized)
 		return (ENXIO);
 
 	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
 		return (EINVAL);
 
 	vmspace = VMSPACE_ALLOC(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);
 	if (vmspace == NULL)
 		return (ENOMEM);
 
 	vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
 	strcpy(vm->name, name);
 	vm->num_mem_segs = 0;
 	vm->vmspace = vmspace;
 	mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF);
 
 	vm_init(vm, true);
 
 	*retvm = vm;
 	return (0);
 }
 
 static void
 vm_free_mem_seg(struct vm *vm, struct mem_seg *seg)
 {
 
 	if (seg->object != NULL)
 		vmm_mem_free(vm->vmspace, seg->gpa, seg->len);
 
 	bzero(seg, sizeof(*seg));
 }
 
 static void
 vm_cleanup(struct vm *vm, bool destroy)
 {
 	int i;
 
 	ppt_unassign_all(vm);
 
 	if (vm->iommu != NULL)
 		iommu_destroy_domain(vm->iommu);
 
 	vatpit_cleanup(vm->vatpit);
 	vhpet_cleanup(vm->vhpet);
 	vatpic_cleanup(vm->vatpic);
 	vioapic_cleanup(vm->vioapic);
 
 	for (i = 0; i < VM_MAXCPU; i++)
 		vcpu_cleanup(vm, i, destroy);
 
 	VMCLEANUP(vm->cookie);
 
 	if (destroy) {
 		for (i = 0; i < vm->num_mem_segs; i++)
 			vm_free_mem_seg(vm, &vm->mem_segs[i]);
 
 		vm->num_mem_segs = 0;
 
 		VMSPACE_FREE(vm->vmspace);
 		vm->vmspace = NULL;
 	}
 }
 
 void
 vm_destroy(struct vm *vm)
 {
 	vm_cleanup(vm, true);
 	free(vm, M_VM);
 }
 
 int
 vm_reinit(struct vm *vm)
 {
 	int error;
 
 	/*
 	 * A virtual machine can be reset only if all vcpus are suspended.
 	 */
 	if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
 		vm_cleanup(vm, false);
 		vm_init(vm, false);
 		error = 0;
 	} else {
 		error = EBUSY;
 	}
 
 	return (error);
 }
 
 const char *
 vm_name(struct vm *vm)
 {
 	return (vm->name);
 }
 
 int
 vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
 {
 	vm_object_t obj;
 
 	if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
 		return (ENOMEM);
 	else
 		return (0);
 }
 
 int
 vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
 {
 
 	vmm_mmio_free(vm->vmspace, gpa, len);
 	return (0);
 }
 
 boolean_t
 vm_mem_allocated(struct vm *vm, vm_paddr_t gpa)
 {
 	int i;
 	vm_paddr_t gpabase, gpalimit;
 
 	for (i = 0; i < vm->num_mem_segs; i++) {
 		gpabase = vm->mem_segs[i].gpa;
 		gpalimit = gpabase + vm->mem_segs[i].len;
 		if (gpa >= gpabase && gpa < gpalimit)
 			return (TRUE);		/* 'gpa' is regular memory */
 	}
 
 	if (ppt_is_mmio(vm, gpa))
 		return (TRUE);			/* 'gpa' is pci passthru mmio */
 
 	return (FALSE);
 }
 
 int
 vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
 {
 	int available, allocated;
 	struct mem_seg *seg;
 	vm_object_t object;
 	vm_paddr_t g;
 
 	if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
 		return (EINVAL);
 	
 	available = allocated = 0;
 	g = gpa;
 	while (g < gpa + len) {
 		if (vm_mem_allocated(vm, g))
 			allocated++;
 		else
 			available++;
 
 		g += PAGE_SIZE;
 	}
 
 	/*
 	 * If there are some allocated and some available pages in the address
 	 * range then it is an error.
 	 */
 	if (allocated && available)
 		return (EINVAL);
 
 	/*
 	 * If the entire address range being requested has already been
 	 * allocated then there isn't anything more to do.
 	 */
 	if (allocated && available == 0)
 		return (0);
 
 	if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
 		return (E2BIG);
 
 	seg = &vm->mem_segs[vm->num_mem_segs];
 
 	if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL)
 		return (ENOMEM);
 
 	seg->gpa = gpa;
 	seg->len = len;
 	seg->object = object;
 	seg->wired = FALSE;
 
 	vm->num_mem_segs++;
 
 	return (0);
 }
 
 static vm_paddr_t
 vm_maxmem(struct vm *vm)
 {
 	int i;
 	vm_paddr_t gpa, maxmem;
 
 	maxmem = 0;
 	for (i = 0; i < vm->num_mem_segs; i++) {
 		gpa = vm->mem_segs[i].gpa + vm->mem_segs[i].len;
 		if (gpa > maxmem)
 			maxmem = gpa;
 	}
 	return (maxmem);
 }
 
 static void
 vm_gpa_unwire(struct vm *vm)
 {
 	int i, rv;
 	struct mem_seg *seg;
 
 	for (i = 0; i < vm->num_mem_segs; i++) {
 		seg = &vm->mem_segs[i];
 		if (!seg->wired)
 			continue;
 
 		rv = vm_map_unwire(&vm->vmspace->vm_map,
 				   seg->gpa, seg->gpa + seg->len,
 				   VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
 		KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment "
 		    "%#lx/%ld could not be unwired: %d",
 		    vm_name(vm), seg->gpa, seg->len, rv));
 
 		seg->wired = FALSE;
 	}
 }
 
 static int
 vm_gpa_wire(struct vm *vm)
 {
 	int i, rv;
 	struct mem_seg *seg;
 
 	for (i = 0; i < vm->num_mem_segs; i++) {
 		seg = &vm->mem_segs[i];
 		if (seg->wired)
 			continue;
 
 		/* XXX rlimits? */
 		rv = vm_map_wire(&vm->vmspace->vm_map,
 				 seg->gpa, seg->gpa + seg->len,
 				 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
 		if (rv != KERN_SUCCESS)
 			break;
 
 		seg->wired = TRUE;
 	}
 
 	if (i < vm->num_mem_segs) {
 		/*
 		 * Undo the wiring before returning an error.
 		 */
 		vm_gpa_unwire(vm);
 		return (EAGAIN);
 	}
 
 	return (0);
 }
 
 static void
 vm_iommu_modify(struct vm *vm, boolean_t map)
 {
 	int i, sz;
 	vm_paddr_t gpa, hpa;
 	struct mem_seg *seg;
 	void *vp, *cookie, *host_domain;
 
 	sz = PAGE_SIZE;
 	host_domain = iommu_host_domain();
 
 	for (i = 0; i < vm->num_mem_segs; i++) {
 		seg = &vm->mem_segs[i];
 		KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired",
 		    vm_name(vm), seg->gpa, seg->len));
 
 		gpa = seg->gpa;
 		while (gpa < seg->gpa + seg->len) {
 			vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE,
 					 &cookie);
 			KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
 			    vm_name(vm), gpa));
 
 			vm_gpa_release(cookie);
 
 			hpa = DMAP_TO_PHYS((uintptr_t)vp);
 			if (map) {
 				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
 				iommu_remove_mapping(host_domain, hpa, sz);
 			} else {
 				iommu_remove_mapping(vm->iommu, gpa, sz);
 				iommu_create_mapping(host_domain, hpa, hpa, sz);
 			}
 
 			gpa += PAGE_SIZE;
 		}
 	}
 
 	/*
 	 * Invalidate the cached translations associated with the domain
 	 * from which pages were removed.
 	 */
 	if (map)
 		iommu_invalidate_tlb(host_domain);
 	else
 		iommu_invalidate_tlb(vm->iommu);
 }
 
 #define	vm_iommu_unmap(vm)	vm_iommu_modify((vm), FALSE)
 #define	vm_iommu_map(vm)	vm_iommu_modify((vm), TRUE)
 
 int
 vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
 {
 	int error;
 
 	error = ppt_unassign_device(vm, bus, slot, func);
 	if (error)
 		return (error);
 
 	if (ppt_assigned_devices(vm) == 0) {
 		vm_iommu_unmap(vm);
 		vm_gpa_unwire(vm);
 	}
 	return (0);
 }
 
 int
 vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
 {
 	int error;
 	vm_paddr_t maxaddr;
 
 	/*
 	 * Virtual machines with pci passthru devices get special treatment:
 	 * - the guest physical memory is wired
 	 * - the iommu is programmed to do the 'gpa' to 'hpa' translation
 	 *
 	 * We need to do this before the first pci passthru device is attached.
 	 */
 	if (ppt_assigned_devices(vm) == 0) {
 		KASSERT(vm->iommu == NULL,
 		    ("vm_assign_pptdev: iommu must be NULL"));
 		maxaddr = vm_maxmem(vm);
 		vm->iommu = iommu_create_domain(maxaddr);
 
 		error = vm_gpa_wire(vm);
 		if (error)
 			return (error);
 
 		vm_iommu_map(vm);
 	}
 
 	error = ppt_assign_device(vm, bus, slot, func);
 	return (error);
 }
 
 void *
 vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
 	    void **cookie)
 {
 	int count, pageoff;
 	vm_page_t m;
 
 	pageoff = gpa & PAGE_MASK;
 	if (len > PAGE_SIZE - pageoff)
 		panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
 
 	count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
 	    trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
 
 	if (count == 1) {
 		*cookie = m;
 		return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
 	} else {
 		*cookie = NULL;
 		return (NULL);
 	}
 }
 
 void
 vm_gpa_release(void *cookie)
 {
 	vm_page_t m = cookie;
 
 	vm_page_lock(m);
 	vm_page_unhold(m);
 	vm_page_unlock(m);
 }
 
 int
 vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
 		  struct vm_memory_segment *seg)
 {
 	int i;
 
 	for (i = 0; i < vm->num_mem_segs; i++) {
 		if (gpabase == vm->mem_segs[i].gpa) {
 			seg->gpa = vm->mem_segs[i].gpa;
 			seg->len = vm->mem_segs[i].len;
 			seg->wired = vm->mem_segs[i].wired;
 			return (0);
 		}
 	}
 	return (-1);
 }
 
 int
 vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
 	      vm_offset_t *offset, struct vm_object **object)
 {
 	int i;
 	size_t seg_len;
 	vm_paddr_t seg_gpa;
 	vm_object_t seg_obj;
 
 	for (i = 0; i < vm->num_mem_segs; i++) {
 		if ((seg_obj = vm->mem_segs[i].object) == NULL)
 			continue;
 
 		seg_gpa = vm->mem_segs[i].gpa;
 		seg_len = vm->mem_segs[i].len;
 
 		if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) {
 			*offset = gpa - seg_gpa;
 			*object = seg_obj;
 			vm_object_reference(seg_obj);
 			return (0);
 		}
 	}
 
 	return (EINVAL);
 }
 
 int
 vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
 {
 
 	if (vcpu < 0 || vcpu >= VM_MAXCPU)
 		return (EINVAL);
 
 	if (reg >= VM_REG_LAST)
 		return (EINVAL);
 
 	return (VMGETREG(vm->cookie, vcpu, reg, retval));
 }
 
 int
 vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val)
 {
 
 	if (vcpu < 0 || vcpu >= VM_MAXCPU)
 		return (EINVAL);
 
 	if (reg >= VM_REG_LAST)
 		return (EINVAL);
 
 	return (VMSETREG(vm->cookie, vcpu, reg, val));
 }
 
 static boolean_t
 is_descriptor_table(int reg)
 {
 
 	switch (reg) {
 	case VM_REG_GUEST_IDTR:
 	case VM_REG_GUEST_GDTR:
 		return (TRUE);
 	default:
 		return (FALSE);
 	}
 }
 
 static boolean_t
 is_segment_register(int reg)
 {
 	
 	switch (reg) {
 	case VM_REG_GUEST_ES:
 	case VM_REG_GUEST_CS:
 	case VM_REG_GUEST_SS:
 	case VM_REG_GUEST_DS:
 	case VM_REG_GUEST_FS:
 	case VM_REG_GUEST_GS:
 	case VM_REG_GUEST_TR:
 	case VM_REG_GUEST_LDTR:
 		return (TRUE);
 	default:
 		return (FALSE);
 	}
 }
 
 int
 vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
 		struct seg_desc *desc)
 {
 
 	if (vcpu < 0 || vcpu >= VM_MAXCPU)
 		return (EINVAL);
 
 	if (!is_segment_register(reg) && !is_descriptor_table(reg))
 		return (EINVAL);
 
 	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
 }
 
 int
 vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
 		struct seg_desc *desc)
 {
 	if (vcpu < 0 || vcpu >= VM_MAXCPU)
 		return (EINVAL);
 
 	if (!is_segment_register(reg) && !is_descriptor_table(reg))
 		return (EINVAL);
 
 	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
 }
 
 static void
 restore_guest_fpustate(struct vcpu *vcpu)
 {
 
 	/* flush host state to the pcb */
 	fpuexit(curthread);
 
 	/* restore guest FPU state */
 	fpu_stop_emulating();
 	fpurestore(vcpu->guestfpu);
 
 	/* restore guest XCR0 if XSAVE is enabled in the host */
 	if (rcr4() & CR4_XSAVE)
 		load_xcr(0, vcpu->guest_xcr0);
 
 	/*
 	 * The FPU is now "dirty" with the guest's state so turn on emulation
 	 * to trap any access to the FPU by the host.
 	 */
 	fpu_start_emulating();
 }
 
 static void
 save_guest_fpustate(struct vcpu *vcpu)
 {
 
 	if ((rcr0() & CR0_TS) == 0)
 		panic("fpu emulation not enabled in host!");
 
 	/* save guest XCR0 and restore host XCR0 */
 	if (rcr4() & CR4_XSAVE) {
 		vcpu->guest_xcr0 = rxcr(0);
 		load_xcr(0, vmm_get_host_xcr0());
 	}
 
 	/* save guest FPU state */
 	fpu_stop_emulating();
 	fpusave(vcpu->guestfpu);
 	fpu_start_emulating();
 }
 
 static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
 
 static int
 vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
     bool from_idle)
 {
 	int error;
 
 	vcpu_assert_locked(vcpu);
 
 	/*
 	 * State transitions from the vmmdev_ioctl() must always begin from
 	 * the VCPU_IDLE state. This guarantees that there is only a single
 	 * ioctl() operating on a vcpu at any point.
 	 */
 	if (from_idle) {
 		while (vcpu->state != VCPU_IDLE)
 			msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
 	} else {
 		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
 		    "vcpu idle state"));
 	}
 
 	if (vcpu->state == VCPU_RUNNING) {
 		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
 		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
 	} else {
 		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
 		    "vcpu that is not running", vcpu->hostcpu));
 	}
 
 	/*
 	 * The following state transitions are allowed:
 	 * IDLE -> FROZEN -> IDLE
 	 * FROZEN -> RUNNING -> FROZEN
 	 * FROZEN -> SLEEPING -> FROZEN
 	 */
 	switch (vcpu->state) {
 	case VCPU_IDLE:
 	case VCPU_RUNNING:
 	case VCPU_SLEEPING:
 		error = (newstate != VCPU_FROZEN);
 		break;
 	case VCPU_FROZEN:
 		error = (newstate == VCPU_FROZEN);
 		break;
 	default:
 		error = 1;
 		break;
 	}
 
 	if (error)
 		return (EBUSY);
 
 	vcpu->state = newstate;
 	if (newstate == VCPU_RUNNING)
 		vcpu->hostcpu = curcpu;
 	else
 		vcpu->hostcpu = NOCPU;
 
 	if (newstate == VCPU_IDLE)
 		wakeup(&vcpu->state);
 
 	return (0);
 }
 
 static void
 vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
 {
 	int error;
 
 	if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
 		panic("Error %d setting state to %d\n", error, newstate);
 }
 
 static void
 vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
 {
 	int error;
 
 	if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0)
 		panic("Error %d setting state to %d", error, newstate);
 }
 
 static void
 vm_set_rendezvous_func(struct vm *vm, vm_rendezvous_func_t func)
 {
 
 	KASSERT(mtx_owned(&vm->rendezvous_mtx), ("rendezvous_mtx not locked"));
 
 	/*
 	 * Update 'rendezvous_func' and execute a write memory barrier to
 	 * ensure that it is visible across all host cpus. This is not needed
 	 * for correctness but it does ensure that all the vcpus will notice
 	 * that the rendezvous is requested immediately.
 	 */
 	vm->rendezvous_func = func;
 	wmb();
 }
 
 #define	RENDEZVOUS_CTR0(vm, vcpuid, fmt)				\
 	do {								\
 		if (vcpuid >= 0)					\
 			VCPU_CTR0(vm, vcpuid, fmt);			\
 		else							\
 			VM_CTR0(vm, fmt);				\
 	} while (0)
 
 static void
 vm_handle_rendezvous(struct vm *vm, int vcpuid)
 {
 
 	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
 	    ("vm_handle_rendezvous: invalid vcpuid %d", vcpuid));
 
 	mtx_lock(&vm->rendezvous_mtx);
 	while (vm->rendezvous_func != NULL) {
 		/* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */
 		CPU_AND(&vm->rendezvous_req_cpus, &vm->active_cpus);
 
 		if (vcpuid != -1 &&
 		    CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) &&
 		    !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) {
 			VCPU_CTR0(vm, vcpuid, "Calling rendezvous func");
 			(*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg);
 			CPU_SET(vcpuid, &vm->rendezvous_done_cpus);
 		}
 		if (CPU_CMP(&vm->rendezvous_req_cpus,
 		    &vm->rendezvous_done_cpus) == 0) {
 			VCPU_CTR0(vm, vcpuid, "Rendezvous completed");
 			vm_set_rendezvous_func(vm, NULL);
 			wakeup(&vm->rendezvous_func);
 			break;
 		}
 		RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion");
 		mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0,
 		    "vmrndv", 0);
 	}
 	mtx_unlock(&vm->rendezvous_mtx);
 }
 
 /*
  * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
  */
 static int
 vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
 {
 	struct vcpu *vcpu;
 	const char *wmesg;
-	int t, vcpu_halted, vm_halted;
+	int error, t, vcpu_halted, vm_halted;
 
 	KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));
 
 	vcpu = &vm->vcpu[vcpuid];
 	vcpu_halted = 0;
 	vm_halted = 0;
+
+	/*
+	 * The typical way to halt a cpu is to execute: "sti; hlt"
+	 *
+	 * STI sets RFLAGS.IF to enable interrupts. However, the processor
+	 * remains in an "interrupt shadow" for an additional instruction
+	 * following the STI. This guarantees that the "sti; hlt" sequence is
+	 * atomic and a pending interrupt will be recognized after the HLT.
+	 *
+	 * After the HLT emulation is done the vcpu is no longer in an
+	 * interrupt shadow and a pending interrupt can be injected on
+	 * the next entry into the guest.
+	 */
+	error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0);
+	KASSERT(error == 0, ("%s: error %d clearing interrupt shadow",
+	    __func__, error));
 
 	vcpu_lock(vcpu);
 	while (1) {
 		/*
 		 * Do a final check for pending NMI or interrupts before
 		 * really putting this thread to sleep. Also check for
 		 * software events that would cause this vcpu to wakeup.
 		 *
 		 * These interrupts/events could have happened after the
 		 * vcpu returned from VMRUN() and before it acquired the
 		 * vcpu lock above.
 		 */
 		if (vm->rendezvous_func != NULL || vm->suspend)
 			break;
 		if (vm_nmi_pending(vm, vcpuid))
 			break;
 		if (!intr_disabled) {
 			if (vm_extint_pending(vm, vcpuid) ||
 			    vlapic_pending_intr(vcpu->vlapic, NULL)) {
 				break;
 			}
 		}
 
 		/* Don't go to sleep if the vcpu thread needs to yield */
 		if (vcpu_should_yield(vm, vcpuid))
 			break;
 
 		/*
 		 * Some Linux guests implement "halt" by having all vcpus
 		 * execute HLT with interrupts disabled. 'halted_cpus' keeps
 		 * track of the vcpus that have entered this state. When all
 		 * vcpus enter the halted state the virtual machine is halted.
 		 */
 		if (intr_disabled) {
 			wmesg = "vmhalt";
 			VCPU_CTR0(vm, vcpuid, "Halted");
 			if (!vcpu_halted && halt_detection_enabled) {
 				vcpu_halted = 1;
 				CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
 			}
 			if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
 				vm_halted = 1;
 				break;
 			}
 		} else {
 			wmesg = "vmidle";
 		}
 
 		t = ticks;
 		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
 		/*
 		 * XXX msleep_spin() cannot be interrupted by signals so
 		 * wake up periodically to check pending signals.
 		 */
 		msleep_spin(vcpu, &vcpu->mtx, wmesg, hz);
 		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
 		vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
 	}
 
 	if (vcpu_halted)
 		CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);
 
 	vcpu_unlock(vcpu);
 
 	if (vm_halted)
 		vm_suspend(vm, VM_SUSPEND_HALT);
 
 	return (0);
 }
 
 static int
 vm_handle_paging(struct vm *vm, int vcpuid, bool *retu)
 {
 	int rv, ftype;
 	struct vm_map *map;
 	struct vcpu *vcpu;
 	struct vm_exit *vme;
 
 	vcpu = &vm->vcpu[vcpuid];
 	vme = &vcpu->exitinfo;
 
 	ftype = vme->u.paging.fault_type;
 	KASSERT(ftype == VM_PROT_READ ||
 	    ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
 	    ("vm_handle_paging: invalid fault_type %d", ftype));
 
 	if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
 		rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
 		    vme->u.paging.gpa, ftype);
 		if (rv == 0)
 			goto done;
 	}
 
 	map = &vm->vmspace->vm_map;
 	rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);
 
 	VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, "
 	    "ftype = %d", rv, vme->u.paging.gpa, ftype);
 
 	if (rv != KERN_SUCCESS)
 		return (EFAULT);
 done:
 	/* restart execution at the faulting instruction */
 	vme->inst_length = 0;
 
 	return (0);
 }
 
 static int
 vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
 {
 	struct vie *vie;
 	struct vcpu *vcpu;
 	struct vm_exit *vme;
 	uint64_t gla, gpa;
 	struct vm_guest_paging *paging;
 	mem_region_read_t mread;
 	mem_region_write_t mwrite;
 	enum vm_cpu_mode cpu_mode;
 	int cs_d, error;
 
 	vcpu = &vm->vcpu[vcpuid];
 	vme = &vcpu->exitinfo;
 
 	gla = vme->u.inst_emul.gla;
 	gpa = vme->u.inst_emul.gpa;
 	cs_d = vme->u.inst_emul.cs_d;
 	vie = &vme->u.inst_emul.vie;
 	paging = &vme->u.inst_emul.paging;
 	cpu_mode = paging->cpu_mode;
 
 	vie_init(vie);
 
 	/* Fetch, decode and emulate the faulting instruction */
 	error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip,
 	    vme->inst_length, vie);
 	if (error == 1)
 		return (0);		/* Resume guest to handle page fault */
 	else if (error == -1)
 		return (EFAULT);
 	else if (error != 0)
 		panic("%s: vmm_fetch_instruction error %d", __func__, error);
 
 	if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0)
 		return (EFAULT);
 
 	/* return to userland unless this is an in-kernel emulated device */
 	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
 		mread = lapic_mmio_read;
 		mwrite = lapic_mmio_write;
 	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
 		mread = vioapic_mmio_read;
 		mwrite = vioapic_mmio_write;
 	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
 		mread = vhpet_mmio_read;
 		mwrite = vhpet_mmio_write;
 	} else {
 		*retu = true;
 		return (0);
 	}
 
 	error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, paging,
 	    mread, mwrite, retu);
 
 	return (error);
 }
 
 /*
  * Handle a VM_EXITCODE_SUSPENDED exit on 'vcpuid'.
  *
  * The vcpu marks itself in 'suspended_cpus' and then waits until that set
  * compares equal to 'active_cpus'. Once every active vcpu has arrived, all
  * vcpus in 'suspended_cpus' are notified and '*retu' is set so the caller
  * returns to userspace. Always returns 0.
  */
 static int
 vm_handle_suspend(struct vm *vm, int vcpuid, bool *retu)
 {
 	int i;
 	struct vcpu *vcpu;
 
 	vcpu = &vm->vcpu[vcpuid];
 
 	CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);
 
 	/*
 	 * Wait until all 'active_cpus' have suspended themselves.
 	 *
 	 * Since a VM may be suspended at any time including when one or
 	 * more vcpus are doing a rendezvous we need to call the rendezvous
 	 * handler while we are waiting to prevent a deadlock.
 	 */
 	vcpu_lock(vcpu);
 	while (1) {
 		if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
 			VCPU_CTR0(vm, vcpuid, "All vcpus suspended");
 			break;
 		}
 
 		if (vm->rendezvous_func == NULL) {
 			/*
 			 * No rendezvous pending: sleep for up to 'hz' ticks
 			 * and then re-evaluate the suspended set.
 			 */
 			VCPU_CTR0(vm, vcpuid, "Sleeping during suspend");
 			vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
 			msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
 			vcpu_require_state_locked(vcpu, VCPU_FROZEN);
 		} else {
 			/*
 			 * The rendezvous handler is called with the vcpu
 			 * lock dropped; it is re-taken before looping.
 			 */
 			VCPU_CTR0(vm, vcpuid, "Rendezvous during suspend");
 			vcpu_unlock(vcpu);
 			vm_handle_rendezvous(vm, vcpuid);
 			vcpu_lock(vcpu);
 		}
 	}
 	vcpu_unlock(vcpu);
 
 	/*
 	 * Wakeup the other sleeping vcpus and return to userspace.
 	 */
 	for (i = 0; i < VM_MAXCPU; i++) {
 		if (CPU_ISSET(i, &vm->suspended_cpus)) {
 			vcpu_notify_event(vm, i, false);
 		}
 	}
 
 	*retu = true;
 	return (0);
 }
 
 /*
  * Record the suspend reason 'how' for the virtual machine and notify every
  * active vcpu. Returns EINVAL if 'how' is out of range, EALREADY if a
  * suspend reason has already been recorded, and 0 otherwise.
  */
 int
 vm_suspend(struct vm *vm, enum vm_suspend_how how)
 {
 	int cpu;
 
 	if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
 		return (EINVAL);
 
 	/* Only the first caller gets to install its suspend reason. */
 	if (!atomic_cmpset_int(&vm->suspend, 0, how)) {
 		VM_CTR2(vm, "virtual machine already suspended %d/%d",
 		    vm->suspend, how);
 		return (EALREADY);
 	}
 
 	VM_CTR1(vm, "virtual machine successfully suspended %d", how);
 
 	/*
 	 * Notify all active vcpus that they are now suspended.
 	 */
 	for (cpu = 0; cpu < VM_MAXCPU; cpu++) {
 		if (!CPU_ISSET(cpu, &vm->active_cpus))
 			continue;
 		vcpu_notify_event(vm, cpu, false);
 	}
 
 	return (0);
 }
 
 /*
  * Fill in the exit information of 'vcpuid' for a VM_EXITCODE_SUSPENDED
  * exit at 'rip', recording the VM's current suspend reason.
  */
 void
 vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip)
 {
 	struct vm_exit *vme;
 
 	KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
 	    ("vm_exit_suspended: invalid suspend type %d", vm->suspend));
 
 	vme = vm_exitinfo(vm, vcpuid);
 	vme->exitcode = VM_EXITCODE_SUSPENDED;
 	vme->u.suspended.how = vm->suspend;
 	vme->rip = rip;
 	vme->inst_length = 0;
 }
 
 /*
  * Fill in the exit information of 'vcpuid' for a VM_EXITCODE_RENDEZVOUS
  * exit at 'rip' and bump the VMEXIT_RENDEZVOUS statistic.
  */
 void
 vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip)
 {
 	struct vm_exit *vme;
 
 	KASSERT(vm->rendezvous_func != NULL, ("rendezvous not in progress"));
 
 	vme = vm_exitinfo(vm, vcpuid);
 	vme->exitcode = VM_EXITCODE_RENDEZVOUS;
 	vme->rip = rip;
 	vme->inst_length = 0;
 	vmm_stat_incr(vm, vcpuid, VMEXIT_RENDEZVOUS, 1);
 }
 
 /*
  * Fill in the exit information of 'vcpuid' for an AST-pending exit at
  * 'rip'. The exit is reported with VM_EXITCODE_BOGUS and counted in the
  * VMEXIT_ASTPENDING statistic.
  */
 void
 vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip)
 {
 	struct vm_exit *vme;
 
 	vme = vm_exitinfo(vm, vcpuid);
 	vme->exitcode = VM_EXITCODE_BOGUS;
 	vme->rip = rip;
 	vme->inst_length = 0;
 	vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
 }
 
 /*
  * Run the vcpu identified by 'vmrun->cpuid' starting at 'vmrun->rip'.
  *
  * The vcpu is re-entered in a loop for as long as the exit can be handled
  * here (error == 0 and 'retu' false). When an error occurs or a handler
  * asks for a return to userland, the vcpu's exit information is copied into
  * 'vmrun->vm_exit' and the error code is returned.
  *
  * Returns EINVAL if the vcpu id is out of range, the vcpu is not active, or
  * the vcpu is already marked suspended.
  */
 int
 vm_run(struct vm *vm, struct vm_run *vmrun)
 {
 	int error, vcpuid;
 	struct vcpu *vcpu;
 	struct pcb *pcb;
 	uint64_t tscval, rip;
 	struct vm_exit *vme;
 	bool retu, intr_disabled;
 	pmap_t pmap;
 	void *rptr, *sptr;
 
 	vcpuid = vmrun->cpuid;
 
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		return (EINVAL);
 
 	if (!CPU_ISSET(vcpuid, &vm->active_cpus))
 		return (EINVAL);
 
 	if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
 		return (EINVAL);
 
 	/*
 	 * Addresses of the rendezvous and suspend indicators are handed to
 	 * VMRUN() below.
 	 */
 	rptr = &vm->rendezvous_func;
 	sptr = &vm->suspend;
 	pmap = vmspace_pmap(vm->vmspace);
 	vcpu = &vm->vcpu[vcpuid];
 	vme = &vcpu->exitinfo;
 	rip = vmrun->rip;
 restart:
 	critical_enter();
 
 	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
 	    ("vm_run: absurd pm_active"));
 
 	/* Timestamp used to account the time spent in this iteration. */
 	tscval = rdtsc();
 
 	pcb = PCPU_GET(curpcb);
 	set_pcb_flags(pcb, PCB_FULL_IRET);
 
 	restore_guest_msrs(vm, vcpuid);	
 	restore_guest_fpustate(vcpu);
 
 	/* The vcpu is in the RUNNING state only around the VMRUN() call. */
 	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
 	error = VMRUN(vm->cookie, vcpuid, rip, pmap, rptr, sptr);
 	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
 
 	save_guest_fpustate(vcpu);
 	restore_host_msrs(vm, vcpuid);
 
 	vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
 
 	critical_exit();
 
 	/* Dispatch on the exit code; handlers set 'retu' to go to userland. */
 	if (error == 0) {
 		retu = false;
 		switch (vme->exitcode) {
 		case VM_EXITCODE_SUSPENDED:
 			error = vm_handle_suspend(vm, vcpuid, &retu);
 			break;
 		case VM_EXITCODE_IOAPIC_EOI:
 			vioapic_process_eoi(vm, vcpuid,
 			    vme->u.ioapic_eoi.vector);
 			break;
 		case VM_EXITCODE_RENDEZVOUS:
 			vm_handle_rendezvous(vm, vcpuid);
 			error = 0;
 			break;
 		case VM_EXITCODE_HLT:
 			intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
 			error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu);
 			break;
 		case VM_EXITCODE_PAGING:
 			error = vm_handle_paging(vm, vcpuid, &retu);
 			break;
 		case VM_EXITCODE_INST_EMUL:
 			error = vm_handle_inst_emul(vm, vcpuid, &retu);
 			break;
 		case VM_EXITCODE_INOUT:
 		case VM_EXITCODE_INOUT_STR:
 			error = vm_handle_inout(vm, vcpuid, vme, &retu);
 			break;
 		default:
 			retu = true;	/* handled in userland */
 			break;
 		}
 	}
 
 	/*
 	 * Exit fully handled in the kernel: resume the guest just past the
 	 * exiting instruction (handlers that want a restart set
 	 * 'inst_length' to 0).
 	 */
 	if (error == 0 && retu == false) {
 		rip = vme->rip + vme->inst_length;
 		goto restart;
 	}
 
 	/* copy the exit information */
 	bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
 	return (error);
 }
 
 /*
  * Record the interrupt/exception information 'info' that was pending when
  * 'vcpuid' exited. An 'info' without VM_INTINFO_VALID is stored as 0.
  * Returns EINVAL for a bad vcpu id or an inconsistent 'info', else 0.
  */
 int
 vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info)
 {
 	int type, vector;
 
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		return (EINVAL);
 
 	if ((info & VM_INTINFO_VALID) == 0) {
 		info = 0;
 	} else {
 		type = info & VM_INTINFO_TYPE;
 		vector = info & 0xff;
 		/*
 		 * Reject an NMI with a vector other than IDT_NMI, a hardware
 		 * exception with vector >= 32, and any reserved bits.
 		 */
 		if ((type == VM_INTINFO_NMI && vector != IDT_NMI) ||
 		    (type == VM_INTINFO_HWEXCEPTION && vector >= 32) ||
 		    (info & VM_INTINFO_RSVD) != 0)
 			return (EINVAL);
 	}
 
 	VCPU_CTR2(vm, vcpuid, "%s: info1(%#lx)", __func__, info);
 	vm->vcpu[vcpuid].exitintinfo = info;
 	return (0);
 }
 
 /*
  * Classification of an event as computed by exception_class(); used by
  * nested_fault() to decide whether two events combine into a double fault.
  */
 enum exc_class {
 	EXC_BENIGN,		/* interrupts, NMI and all other vectors */
 	EXC_CONTRIBUTORY,	/* #DE, #TS, #NP, #SS, #GP */
 	EXC_PAGEFAULT		/* #PF, #VE */
 };
 
 #define	IDT_VE	20	/* Virtualization Exception (Intel specific) */
 
 /*
  * Classify the valid event described by 'info' as benign, contributory or
  * page-fault class.
  */
 static enum exc_class
 exception_class(uint64_t info)
 {
 	int type, vector;
 
 	KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info));
 	type = info & VM_INTINFO_TYPE;
 	vector = info & 0xff;
 
 	/* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */
 	if (type == VM_INTINFO_HWINTR || type == VM_INTINFO_SWINTR ||
 	    type == VM_INTINFO_NMI)
 		return (EXC_BENIGN);
 
 	/*
 	 * Hardware exception.
 	 *
 	 * SVM and VT-x use identical type values to represent NMI,
 	 * hardware interrupt and software interrupt.
 	 *
 	 * SVM uses type '3' for all exceptions. VT-x uses type '3'
 	 * for exceptions except #BP and #OF. #BP and #OF use a type
 	 * value of '5' or '6'. Therefore we don't check for explicit
 	 * values of 'type' to classify 'intinfo' into a hardware
 	 * exception.
 	 */
 	switch (vector) {
 	case IDT_PF:
 	case IDT_VE:
 		return (EXC_PAGEFAULT);
 	case IDT_DE:
 	case IDT_TS:
 	case IDT_NP:
 	case IDT_SS:
 	case IDT_GP:
 		return (EXC_CONTRIBUTORY);
 	default:
 		break;
 	}
 	return (EXC_BENIGN);
 }
 
 /*
  * Combine two valid events: 'info1' (pending at VM exit) and 'info2' (the
  * newly injected exception).
  *
  * Returns 1 with '*retinfo' set to the event to deliver, or 0 with
  * '*retinfo' cleared when the combination is a triple fault, in which case
  * a VM suspend with VM_SUSPEND_TRIPLEFAULT is requested.
  */
 static int
 nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2,
     uint64_t *retinfo)
 {
 	enum exc_class class1, class2;
 	uint64_t dfinfo;
 	int type1, vector1;
 
 	KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1));
 	KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2));
 
 	/*
 	 * If an exception occurs while attempting to call the double-fault
 	 * handler the processor enters shutdown mode (aka triple fault).
 	 */
 	type1 = info1 & VM_INTINFO_TYPE;
 	vector1 = info1 & 0xff;
 	if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) {
 		VCPU_CTR2(vm, vcpuid, "triple fault: info1(%#lx), info2(%#lx)",
 		    info1, info2);
 		vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT);
 		*retinfo = 0;
 		return (0);
 	}
 
 	/*
 	 * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3
 	 */
 	class1 = exception_class(info1);
 	class2 = exception_class(info2);
 	if ((class1 == EXC_CONTRIBUTORY && class2 == EXC_CONTRIBUTORY) ||
 	    (class1 == EXC_PAGEFAULT && class2 != EXC_BENIGN)) {
 		/* Convert nested fault into a double fault. */
 		dfinfo = IDT_DF;
 		dfinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
 		dfinfo |= VM_INTINFO_DEL_ERRCODE;
 		*retinfo = dfinfo;
 		return (1);
 	}
 
 	/* Handle exceptions serially */
 	*retinfo = info2;
 	return (1);
 }
 
 /*
  * Encode the vcpu's pending exception, if any, in 'intinfo' format: vector
  * in the low 8 bits, valid and hardware-exception type flags, and the error
  * code in the upper 32 bits when one is to be delivered. Returns 0 when no
  * exception is pending.
  */
 static uint64_t
 vcpu_exception_intinfo(struct vcpu *vcpu)
 {
 	uint64_t info;
 
 	if (!vcpu->exception_pending)
 		return (0);
 
 	info = vcpu->exception.vector & 0xff;
 	info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
 	if (!vcpu->exception.error_code_valid)
 		return (info);
 
 	info |= VM_INTINFO_DEL_ERRCODE;
 	info |= (uint64_t)vcpu->exception.error_code << 32;
 	return (info);
 }
 
 /*
  * Compute the event to deliver on the next VM entry of 'vcpuid'.
  *
  * Both the saved exit-time event and any pending exception are consumed.
  * Returns 1 with '*retinfo' set when there is something to deliver and 0
  * otherwise.
  */
 int
 vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo)
 {
 	struct vcpu *vcpu;
 	uint64_t info1, info2;
 	bool have1, have2;
 	int valid;
 
 	KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid));
 
 	vcpu = &vm->vcpu[vcpuid];
 
 	/* Consume the event that was pending at the previous VM exit. */
 	info1 = vcpu->exitintinfo;
 	vcpu->exitintinfo = 0;
 
 	/* Consume the exception queued for injection, if there is one. */
 	if (vcpu->exception_pending) {
 		info2 = vcpu_exception_intinfo(vcpu);
 		vcpu->exception_pending = 0;
 		VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#lx",
 		    vcpu->exception.vector, info2);
 	} else {
 		info2 = 0;
 	}
 
 	have1 = (info1 & VM_INTINFO_VALID) != 0;
 	have2 = (info2 & VM_INTINFO_VALID) != 0;
 
 	if (have1 && have2) {
 		valid = nested_fault(vm, vcpuid, info1, info2, retinfo);
 	} else if (have1 || have2) {
 		*retinfo = have1 ? info1 : info2;
 		valid = 1;
 	} else {
 		valid = 0;
 	}
 
 	if (valid) {
 		VCPU_CTR4(vm, vcpuid, "%s: info1(%#lx), info2(%#lx), "
 		    "retinfo(%#lx)", __func__, info1, info2, *retinfo);
 	}
 
 	return (valid);
 }
 
 /*
  * Report, without consuming them, the saved exit-time event ('*info1') and
  * the encoded pending exception ('*info2') of 'vcpuid'. Returns EINVAL for
  * an out-of-range vcpu id.
  */
 int
 vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2)
 {
 	struct vcpu *vcpu;
 
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		return (EINVAL);
 
 	vcpu = &vm->vcpu[vcpuid];
 
 	*info1 = vcpu->exitintinfo;
 	*info2 = vcpu_exception_intinfo(vcpu);
 
 	return (0);
 }
 
 /*
  * Queue 'exception' for injection into 'vcpuid'.
  *
  * Returns EINVAL for a bad vcpu id or vector, EBUSY if an exception is
  * already pending on the vcpu, and 0 on success.
  */
 int
 vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception)
 {
 	struct vcpu *vcpu;
 
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		return (EINVAL);
 
 	/*
 	 * Only vectors 0..31 are accepted, and of those a double fault
 	 * exception should never be injected directly into the guest. It is
 	 * a derived exception that results from specific combinations of
 	 * nested faults.
 	 */
 	if (exception->vector < 0 || exception->vector >= 32 ||
 	    exception->vector == IDT_DF)
 		return (EINVAL);
 
 	vcpu = &vm->vcpu[vcpuid];
 	if (vcpu->exception_pending) {
 		VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to "
 		    "pending exception %d", exception->vector,
 		    vcpu->exception.vector);
 		return (EBUSY);
 	}
 
 	vcpu->exception_pending = 1;
 	vcpu->exception = *exception;
 	VCPU_CTR1(vm, vcpuid, "Exception %d pending", exception->vector);
 
 	return (0);
 }
 
 void
 vm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid,
     int errcode)
 {
 	struct vm_exception exception;
 	struct vm_exit *vmexit;
 	struct vm *vm;
 	int error;
 
 	vm = vmarg;
 
 	exception.vector = vector;
 	exception.error_code = errcode;
 	exception.error_code_valid = errcode_valid;
 	error = vm_inject_exception(vm, vcpuid, &exception);
 	KASSERT(error == 0, ("vm_inject_exception error %d", error));
 
 	/*
 	 * A fault-like exception allows the instruction to be restarted
 	 * after the exception handler returns.
 	 *
 	 * By setting the inst_length to 0 we ensure that the instruction
 	 * pointer remains at the faulting instruction.
 	 */
 	vmexit = vm_exitinfo(vm, vcpuid);
 	vmexit->inst_length = 0;
 }
 
 /*
  * Inject a page fault into 'vcpuid': load the guest's %cr2 with 'cr2' and
  * then queue IDT_PF with 'error_code' as its error code.
  */
 void
 vm_inject_pf(void *vmarg, int vcpuid, int error_code, uint64_t cr2)
 {
 	struct vm *vm = vmarg;
 	int error;
 
 	VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#lx",
 	    error_code, cr2);
 
 	/* The faulting address must be in place before the fault is queued. */
 	error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2);
 	KASSERT(error == 0, ("vm_set_register(cr2) error %d", error));
 
 	vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code);
 }
 
 static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
 
 /*
  * Mark an NMI as pending on 'vcpuid' and notify the vcpu so that it
  * notices the event. Returns EINVAL for an out-of-range vcpu id.
  */
 int
 vm_inject_nmi(struct vm *vm, int vcpuid)
 {
 
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		return (EINVAL);
 
 	vm->vcpu[vcpuid].nmi_pending = 1;
 	vcpu_notify_event(vm, vcpuid, false);
 
 	return (0);
 }
 
 /*
  * Return the 'nmi_pending' flag of 'vcpuid'. Panics on an out-of-range
  * vcpu id.
  */
 int
 vm_nmi_pending(struct vm *vm, int vcpuid)
 {
 
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
 
 	return (vm->vcpu[vcpuid].nmi_pending);
 }
 
 /*
  * Clear the pending NMI of 'vcpuid' and count it in VCPU_NMI_COUNT.
  * Panics if the vcpu id is out of range or if no NMI is pending.
  */
 void
 vm_nmi_clear(struct vm *vm, int vcpuid)
 {
 	struct vcpu *vcpu;
 
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		panic("vm_nmi_clear: invalid vcpuid %d", vcpuid);
 
 	vcpu = &vm->vcpu[vcpuid];
 
 	if (vcpu->nmi_pending == 0)
 		panic("vm_nmi_clear: inconsistent nmi_pending state");
 
 	vcpu->nmi_pending = 0;
 	vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
 }
 
 static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu");
 
 /*
  * Mark an ExtINT as pending on 'vcpuid' and notify the vcpu so that it
  * notices the event. Returns EINVAL for an out-of-range vcpu id.
  */
 int
 vm_inject_extint(struct vm *vm, int vcpuid)
 {
 
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		return (EINVAL);
 
 	vm->vcpu[vcpuid].extint_pending = 1;
 	vcpu_notify_event(vm, vcpuid, false);
 
 	return (0);
 }
 
 /*
  * Return the 'extint_pending' flag of 'vcpuid'. Panics on an out-of-range
  * vcpu id.
  */
 int
 vm_extint_pending(struct vm *vm, int vcpuid)
 {
 
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
 
 	return (vm->vcpu[vcpuid].extint_pending);
 }
 
 /*
  * Clear the pending ExtINT of 'vcpuid' and count it in VCPU_EXTINT_COUNT.
  * Panics if the vcpu id is out of range or if no ExtINT is pending.
  */
 void
 vm_extint_clear(struct vm *vm, int vcpuid)
 {
 	struct vcpu *vcpu;
 
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		panic("vm_extint_clear: invalid vcpuid %d", vcpuid);
 
 	vcpu = &vm->vcpu[vcpuid];
 
 	if (vcpu->extint_pending == 0)
 		panic("vm_extint_clear: inconsistent extint_pending state");
 
 	vcpu->extint_pending = 0;
 	vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
 }
 
 /*
  * Read capability 'type' of 'vcpu' into '*retval' via VMGETCAP().
  * Returns EINVAL if either the vcpu id or the capability is out of range.
  */
 int
 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
 {
 
 	if (vcpu < 0 || vcpu >= VM_MAXCPU || type < 0 || type >= VM_CAP_MAX)
 		return (EINVAL);
 
 	return (VMGETCAP(vm->cookie, vcpu, type, retval));
 }
 
 /*
  * Set capability 'type' of 'vcpu' to 'val' via VMSETCAP().
  * Returns EINVAL if either the vcpu id or the capability is out of range.
  */
 int
 vm_set_capability(struct vm *vm, int vcpu, int type, int val)
 {
 
 	if (vcpu < 0 || vcpu >= VM_MAXCPU || type < 0 || type >= VM_CAP_MAX)
 		return (EINVAL);
 
 	return (VMSETCAP(vm->cookie, vcpu, type, val));
 }
 
 uint64_t *
 vm_guest_msrs(struct vm *vm, int cpu)
 {
 	return (vm->vcpu[cpu].guest_msrs);
 }
 
 struct vlapic *
 vm_lapic(struct vm *vm, int cpu)
 {
 	return (vm->vcpu[cpu].vlapic);
 }
 
 /* Return the VM's 'vioapic' pointer. */
 struct vioapic *
 vm_ioapic(struct vm *vm)
 {
 	struct vioapic *vioapic;
 
 	vioapic = vm->vioapic;
 	return (vioapic);
 }
 
 /* Return the VM's 'vhpet' pointer. */
 struct vhpet *
 vm_hpet(struct vm *vm)
 {
 	struct vhpet *vhpet;
 
 	vhpet = vm->vhpet;
 	return (vhpet);
 }
 
 /*
  * Return non-zero if the device at bus/slot/func is listed in one of the
  * "pptdevs", "pptdevs2" or "pptdevs3" environment variables, and zero
  * otherwise. Each variable holds space-separated "bus/slot/func" triples.
  */
 boolean_t
 vmm_is_pptdev(int bus, int slot, int func)
 {
 	int found, i, n;
 	int b, s, f;
 	char *val, *cp, *cp2;
 
 	/*
 	 * XXX
 	 * The length of an environment variable is limited to 128 bytes which
 	 * puts an upper limit on the number of passthru devices that may be
 	 * specified using a single environment variable.
 	 *
 	 * Work around this by scanning multiple environment variable
 	 * names instead of a single one - yuck!
 	 */
 	const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };
 
 	/* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
 	found = 0;
 	for (i = 0; names[i] != NULL && !found; i++) {
 		cp = val = getenv(names[i]);
 		while (cp != NULL && *cp != '\0') {
 			/*
 			 * Temporarily NUL-terminate the current token so
 			 * that sscanf() only sees one triple.
 			 */
 			if ((cp2 = strchr(cp, ' ')) != NULL)
 				*cp2 = '\0';
 
 			n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
 			if (n == 3 && bus == b && slot == s && func == f) {
 				found = 1;
 				break;
 			}
 		
 			/* Restore the separator and step to the next token. */
 			if (cp2 != NULL)
 				*cp2++ = ' ';
 
 			cp = cp2;
 		}
 		/*
 		 * Called for every name, including when getenv() returned
 		 * NULL -- NOTE(review): relies on freeenv() accepting NULL.
 		 */
 		freeenv(val);
 	}
 	return (found);
 }
 
 /* Return the VM's 'iommu' pointer. */
 void *
 vm_iommu_domain(struct vm *vm)
 {
 	void *domain;
 
 	domain = vm->iommu;
 	return (domain);
 }
 
 /*
  * Move 'vcpuid' to 'newstate' while holding the vcpu lock and return the
  * result of vcpu_set_state_locked(). Panics on an out-of-range vcpu id.
  */
 int
 vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
     bool from_idle)
 {
 	int error;
 	struct vcpu *vcpu;
 
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		panic("vcpu_set_state: invalid vcpuid %d", vcpuid);
 
 	vcpu = &vm->vcpu[vcpuid];
 
 	vcpu_lock(vcpu);
 	error = vcpu_set_state_locked(vcpu, newstate, from_idle);
 	vcpu_unlock(vcpu);
 
 	return (error);
 }
 
 /*
  * Return the current state of 'vcpuid'. If 'hostcpu' is not NULL it also
  * receives the vcpu's 'hostcpu' field; both values are read under the vcpu
  * lock. Panics on an out-of-range vcpu id.
  */
 enum vcpu_state
 vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
 {
 	struct vcpu *vcpu;
 	enum vcpu_state state;
 
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		panic("vcpu_get_state: invalid vcpuid %d", vcpuid);
 
 	vcpu = &vm->vcpu[vcpuid];
 
 	vcpu_lock(vcpu);
 	state = vcpu->state;
 	if (hostcpu != NULL)
 		*hostcpu = vcpu->hostcpu;
 	vcpu_unlock(vcpu);
 
 	return (state);
 }
 
 /*
  * Add 'vcpuid' to the VM's set of active vcpus. Returns EINVAL for an
  * out-of-range id, EBUSY if the vcpu is already active, and 0 on success.
  */
 int
 vm_activate_cpu(struct vm *vm, int vcpuid)
 {
 
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		return (EINVAL);
 	if (CPU_ISSET(vcpuid, &vm->active_cpus))
 		return (EBUSY);
 
 	VCPU_CTR0(vm, vcpuid, "activated");
 	CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
 
 	return (0);
 }
 
 /* Return a copy of the VM's set of active vcpus. */
 cpuset_t
 vm_active_cpus(struct vm *vm)
 {
 	cpuset_t cpus;
 
 	cpus = vm->active_cpus;
 	return (cpus);
 }
 
 /* Return a copy of the VM's set of suspended vcpus. */
 cpuset_t
 vm_suspended_cpus(struct vm *vm)
 {
 	cpuset_t cpus;
 
 	cpus = vm->suspended_cpus;
 	return (cpus);
 }
 
 void *
 vcpu_stats(struct vm *vm, int vcpuid)
 {
 
 	return (vm->vcpu[vcpuid].stats);
 }
 
 int
 vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
 {
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		return (EINVAL);
 
 	*state = vm->vcpu[vcpuid].x2apic_state;
 
 	return (0);
 }
 
 /*
  * Set the x2apic state of 'vcpuid' and propagate it to the vlapic.
  * Returns EINVAL if the vcpu id is out of range or 'state' is not below
  * X2APIC_STATE_LAST, else 0.
  */
 int
 vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
 {
 
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU || state >= X2APIC_STATE_LAST)
 		return (EINVAL);
 
 	vm->vcpu[vcpuid].x2apic_state = state;
 	vlapic_set_x2apic_state(vm, vcpuid, state);
 
 	return (0);
 }
 
 /*
  * Make sure a vcpu notices a pending event as soon as possible:
  * - A vcpu thread that is sleeping is woken up.
  * - A vcpu that is running on a host cpu other than the current one gets an
  *   IPI (posted through the vlapic when 'lapic_intr' is true) so that it
  *   traps into the hypervisor.
  */
 void
 vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr)
 {
 	struct vcpu *vcpu;
 	int hostcpu;
 
 	vcpu = &vm->vcpu[vcpuid];
 
 	vcpu_lock(vcpu);
 	hostcpu = vcpu->hostcpu;
 	if (vcpu->state != VCPU_RUNNING) {
 		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
 		    "with hostcpu %d", vcpu->state, hostcpu));
 		if (vcpu->state == VCPU_SLEEPING)
 			wakeup_one(vcpu);
 	} else {
 		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
 		/*
 		 * If the 'vcpu' is running on 'curcpu' then it must
 		 * be sending a notification to itself (e.g. SELF_IPI).
 		 * The pending event will be picked up when the vcpu
 		 * transitions back to guest context, so nothing is sent.
 		 */
 		if (hostcpu != curcpu) {
 			if (lapic_intr)
 				vlapic_post_intr(vcpu->vlapic, hostcpu,
 				    vmm_ipinum);
 			else
 				ipi_cpu(hostcpu, vmm_ipinum);
 		}
 	}
 	vcpu_unlock(vcpu);
 }
 
 /* Return the VM's 'vmspace' pointer. */
 struct vmspace *
 vm_get_vmspace(struct vm *vm)
 {
 	struct vmspace *vmspace;
 
 	vmspace = vm->vmspace;
 	return (vmspace);
 }
 
 int
 vm_apicid2vcpuid(struct vm *vm, int apicid)
 {
 	/*
 	 * XXX apic id is assumed to be numerically identical to vcpu id
 	 */
 	return (apicid);
 }
 
 /*
  * Start a rendezvous in which 'func(arg)' is to be handled by the vcpus in
  * 'dest', then take part in it on behalf of 'vcpuid' (which may be -1).
  *
  * If another rendezvous is already in progress, it is serviced first and
  * the attempt is retried.
  */
 void
 vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
     vm_rendezvous_func_t func, void *arg)
 {
 	int i;
 
 	/*
 	 * Enforce that this function is called without any locks
 	 */
 	WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous");
 	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
 	    ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid));
 
 restart:
 	mtx_lock(&vm->rendezvous_mtx);
 	if (vm->rendezvous_func != NULL) {
 		/*
 		 * If a rendezvous is already in progress then we need to
 		 * call the rendezvous handler in case this 'vcpuid' is one
 		 * of the targets of the rendezvous.
 		 */
 		RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress");
 		mtx_unlock(&vm->rendezvous_mtx);
 		vm_handle_rendezvous(vm, vcpuid);
 		goto restart;
 	}
 	KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous "
 	    "rendezvous is still in progress"));
 
 	/*
 	 * Publish the new rendezvous while still holding the mutex: the
 	 * requested set, an empty done set, the argument and finally the
 	 * function itself.
 	 */
 	RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous");
 	vm->rendezvous_req_cpus = dest;
 	CPU_ZERO(&vm->rendezvous_done_cpus);
 	vm->rendezvous_arg = arg;
 	vm_set_rendezvous_func(vm, func);
 	mtx_unlock(&vm->rendezvous_mtx);
 
 	/*
 	 * Wake up any sleeping vcpus and trigger a VM-exit in any running
 	 * vcpus so they handle the rendezvous as soon as possible.
 	 */
 	for (i = 0; i < VM_MAXCPU; i++) {
 		if (CPU_ISSET(i, &dest))
 			vcpu_notify_event(vm, i, false);
 	}
 
 	/* Take part in the rendezvous that was just started. */
 	vm_handle_rendezvous(vm, vcpuid);
 }
 
 /* Return the VM's 'vatpic' pointer. */
 struct vatpic *
 vm_atpic(struct vm *vm)
 {
 	struct vatpic *vatpic;
 
 	vatpic = vm->vatpic;
 	return (vatpic);
 }
 
 /* Return the VM's 'vatpit' pointer. */
 struct vatpit *
 vm_atpit(struct vm *vm)
 {
 	struct vatpit *vatpit;
 
 	vatpit = vm->vatpit;
 	return (vatpit);
 }
 
 enum vm_reg_name
 vm_segment_name(int seg)
 {
 	static enum vm_reg_name seg_names[] = {
 		VM_REG_GUEST_ES,
 		VM_REG_GUEST_CS,
 		VM_REG_GUEST_SS,
 		VM_REG_GUEST_DS,
 		VM_REG_GUEST_FS,
 		VM_REG_GUEST_GS
 	};
 
 	KASSERT(seg >= 0 && seg < nitems(seg_names),
 	    ("%s: invalid segment encoding %d", __func__, seg));
 	return (seg_names[seg]);
 }
 
 /*
  * Release every page held through 'copyinfo' (entries with a non-NULL
  * cookie) and zero the whole array of 'num_copyinfo' entries.
  */
 void
 vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
     int num_copyinfo)
 {
 	int i;
 
 	for (i = 0; i < num_copyinfo; i++) {
 		if (copyinfo[i].cookie == NULL)
 			continue;
 		vm_gpa_release(copyinfo[i].cookie);
 	}
 
 	bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo));
 }
 
 /*
  * Prepare 'copyinfo' for copying 'len' bytes to or from the guest linear
  * address 'gla'.
  *
  * The range is split into chunks that do not cross a page boundary; each
  * chunk is translated to a guest physical address and then held with
  * vm_gpa_hold() using protection 'prot'.
  *
  * Returns 0 on success, the non-zero value returned by vmm_gla2gpa() if a
  * translation fails, or -1 if a page could not be held (in which case all
  * holds taken so far are released).
  */
 int
 vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
     uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
     int num_copyinfo)
 {
 	int error, idx, nused;
 	size_t n, off, remaining;
 	void *hva, *cookie;
 	uint64_t gpa;
 
 	bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo);
 
 	/* Pass 1: translate and record one (gpa, len) entry per page. */
 	nused = 0;
 	remaining = len;
 	while (remaining > 0) {
 		KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
 		error = vmm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa);
 		if (error)
 			return (error);
 		off = gpa & PAGE_MASK;
 		/* Never let a chunk extend past the end of its page. */
 		n = min(remaining, PAGE_SIZE - off);
 		copyinfo[nused].gpa = gpa;
 		copyinfo[nused].len = n;
 		remaining -= n;
 		gla += n;
 		nused++;
 	}
 
 	/* Pass 2: hold each page and record its host address and cookie. */
 	for (idx = 0; idx < nused; idx++) {
 		hva = vm_gpa_hold(vm, copyinfo[idx].gpa, copyinfo[idx].len,
 		    prot, &cookie);
 		if (hva == NULL)
 			break;
 		copyinfo[idx].hva = hva;
 		copyinfo[idx].cookie = cookie;
 	}
 
 	/* A short loop above means a hold failed: undo everything. */
 	if (idx != nused) {
 		vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
 		return (-1);
 	} else {
 		return (0);
 	}
 }
 
 /*
  * Copy 'len' bytes from the guest pages described by 'copyinfo' into the
  * kernel buffer 'kaddr', one 'copyinfo' entry at a time.
  */
 void
 vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr,
     size_t len)
 {
 	char *dst;
 	int idx;
 
 	dst = kaddr;
 	for (idx = 0; len > 0; idx++) {
 		bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len);
 		dst += copyinfo[idx].len;
 		len -= copyinfo[idx].len;
 	}
 }
 
 /*
  * Copy 'len' bytes from the kernel buffer 'kaddr' into the guest pages
  * described by 'copyinfo', one 'copyinfo' entry at a time.
  */
 void
 vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
     struct vm_copyinfo *copyinfo, size_t len)
 {
 	const char *src;
 	int idx;
 
 	src = kaddr;
 	for (idx = 0; len > 0; idx++) {
 		bcopy(src, copyinfo[idx].hva, copyinfo[idx].len);
 		src += copyinfo[idx].len;
 		len -= copyinfo[idx].len;
 	}
 }
 
 /*
  * Return the amount of in-use and wired memory for the VM. Since
  * these are global stats, only return the values for vCPU 0.
  */
 VMM_STAT_DECLARE(VMM_MEM_RESIDENT);
 VMM_STAT_DECLARE(VMM_MEM_WIRED);
 
 /*
  * Refresh VMM_MEM_RESIDENT with the resident page count of the VM's
  * vmspace, in bytes. Only done when called for vcpu 0.
  */
 static void
 vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
 {
 
 	if (vcpu != 0)
 		return;
 
 	vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT,
 	    PAGE_SIZE * vmspace_resident_count(vm->vmspace));
 }
 
 /*
  * Refresh VMM_MEM_WIRED with the wired page count of the VM's pmap, in
  * bytes. Only done when called for vcpu 0.
  */
 static void
 vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
 {
 
 	if (vcpu != 0)
 		return;
 
 	vmm_stat_set(vm, vcpu, VMM_MEM_WIRED,
 	    PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace)));
 }
 
 /*
  * Tie the two memory statistics to the functions that compute them.
  * NOTE(review): exact VMM_STAT_FUNC semantics are defined outside this
  * file -- confirm against vmm_stat.h.
  */
 VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
 VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt);
Index: user/ae/inet6/sys/amd64/vmm
===================================================================
--- user/ae/inet6/sys/amd64/vmm	(revision 271452)
+++ user/ae/inet6/sys/amd64/vmm	(revision 271453)

Property changes on: user/ae/inet6/sys/amd64/vmm
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head/sys/amd64/vmm:r271428-271452
Index: user/ae/inet6/sys/arm/altera/socfpga/files.socfpga
===================================================================
--- user/ae/inet6/sys/arm/altera/socfpga/files.socfpga	(revision 271452)
+++ user/ae/inet6/sys/arm/altera/socfpga/files.socfpga	(revision 271453)
@@ -1,18 +1,19 @@
 # $FreeBSD$
 
 kern/kern_clocksource.c				standard
 
 arm/arm/bus_space_generic.c			standard
 arm/arm/bus_space_asm_generic.S			standard
 arm/arm/cpufunc_asm_armv5.S			standard
 arm/arm/cpufunc_asm_arm10.S			standard
 arm/arm/cpufunc_asm_arm11.S			standard
 arm/arm/cpufunc_asm_armv7.S			standard
 
 arm/arm/bus_space-v6.c				standard
 arm/arm/gic.c					standard
 arm/arm/mpcore_timer.c				standard
 
 arm/altera/socfpga/socfpga_common.c		standard
 arm/altera/socfpga/socfpga_machdep.c		standard
 arm/altera/socfpga/socfpga_manager.c		standard
+arm/altera/socfpga/socfpga_rstmgr.c		standard
Index: user/ae/inet6/sys/arm/altera/socfpga/socfpga_common.c
===================================================================
--- user/ae/inet6/sys/arm/altera/socfpga/socfpga_common.c	(revision 271452)
+++ user/ae/inet6/sys/arm/altera/socfpga/socfpga_common.c	(revision 271453)
@@ -1,83 +1,94 @@
 /*-
  * Copyright (c) 2014 Ruslan Bukin <br@bsdpad.com>
  * All rights reserved.
  *
  * This software was developed by SRI International and the University of
  * Cambridge Computer Laboratory under DARPA/AFRL contract (FA8750-10-C-0237)
  * ("CTSRD"), as part of the DARPA CRASH research programme.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/kernel.h>
 
 #include <dev/fdt/fdt_common.h>
 #include <dev/ofw/openfirm.h>
 
 #include <machine/bus.h>
 #include <machine/fdt.h>
 
-#define	RESMAN_BASE	0xFFD05000
-#define	RESMAN_CTRL	0x4
-#define	SWWARMRSTREQ	(1 << 1)
+#include <arm/altera/socfpga/socfpga_rstmgr.h>
 
 void
 cpu_reset(void)
 {
+	uint32_t addr, paddr;
 	bus_addr_t vaddr;
+	phandle_t node;
 
-	if (bus_space_map(fdtbus_bs_tag, RESMAN_BASE, 0x10, 0, &vaddr) == 0) {
-		bus_space_write_4(fdtbus_bs_tag, vaddr,
-		    RESMAN_CTRL, SWWARMRSTREQ);
+	if (rstmgr_warmreset() == 0)
+		goto end;
+
+	node = OF_finddevice("rstmgr");
+	if (node == -1)
+		goto end;
+
+	if ((OF_getprop(node, "reg", &paddr, sizeof(paddr))) > 0) {
+		addr = fdt32_to_cpu(paddr);
+		if (bus_space_map(fdtbus_bs_tag, addr, 0x8, 0, &vaddr) == 0) {
+			bus_space_write_4(fdtbus_bs_tag, vaddr,
+			    RSTMGR_CTRL, CTRL_SWWARMRSTREQ);
+		}
 	}
 
+end:
 	while (1);
 }
 
 struct fdt_fixup_entry fdt_fixup_table[] = {
 	{ NULL, NULL }
 };
 
 static int
 fdt_pic_decode_ic(phandle_t node, pcell_t *intr, int *interrupt, int *trig,
     int *pol)
 {
 
 	if (!fdt_is_compatible(node, "arm,gic"))
 		return (ENXIO);
 
 	*interrupt = fdt32_to_cpu(intr[0]);
 	*trig = INTR_TRIGGER_CONFORM;
 	*pol = INTR_POLARITY_CONFORM;
 	return (0);
 }
 
 fdt_pic_decode_t fdt_pic_table[] = {
 	&fdt_pic_decode_ic,
 	NULL
 };
Index: user/ae/inet6/sys/arm/altera/socfpga/socfpga_l3regs.h
===================================================================
--- user/ae/inet6/sys/arm/altera/socfpga/socfpga_l3regs.h	(nonexistent)
+++ user/ae/inet6/sys/arm/altera/socfpga/socfpga_l3regs.h	(revision 271453)
@@ -0,0 +1,54 @@
+/*-
+ * Copyright (c) 2014 Ruslan Bukin <br@bsdpad.com>
+ * All rights reserved.
+ *
+ * This software was developed by SRI International and the University of
+ * Cambridge Computer Laboratory under DARPA/AFRL contract (FA8750-10-C-0237)
+ * ("CTSRD"), as part of the DARPA CRASH research programme.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#define	L3REGS_REMAP		0x0	/* Remap */
+#define	 REMAP_LWHPS2FPGA	(1 << 4)
+#define	 REMAP_HPS2FPGA		(1 << 3)
+#define	 REMAP_MPUZERO		(1 << 0)
+#define	L3REGS_L4MAIN		0x8	/* L4 main peripherals security */
+#define	L3REGS_L4SP		0xC	/* L4 SP Peripherals Security */
+#define	L3REGS_L4MP		0x10	/* L4 MP Peripherals Security */
+#define	L3REGS_L4OSC1		0x14	/* L4 OSC1 Peripherals Security */
+#define	L3REGS_L4SPIM		0x18	/* L4 SPIM Peripherals Security */
+#define	L3REGS_STM		0x1C	/* STM Peripheral Security */
+#define	L3REGS_LWHPS2FPGAREGS	0x20	/* LWHPS2FPGA AXI Bridge Security */
+#define	L3REGS_USB1		0x28	/* USB1 Peripheral Security */
+#define	L3REGS_NANDDATA		0x2C	/* NAND Flash Controller Data Sec */
+#define	L3REGS_USB0		0x80	/* USB0 Peripheral Security */
+#define	L3REGS_NANDREGS		0x84	/* NAND Flash Controller Security */
+#define	L3REGS_QSPIDATA		0x88	/* QSPI Flash Controller Data Sec */
+#define	L3REGS_FPGAMGRDATA	0x8C	/* FPGA Manager Data Peripheral Sec */
+#define	L3REGS_HPS2FPGAREGS	0x90	/* HPS2FPGA AXI Bridge Perip. Sec */
+#define	L3REGS_ACP		0x94	/* MPU ACP Peripheral Security */
+#define	L3REGS_ROM		0x98	/* ROM Peripheral Security */
+#define	L3REGS_OCRAM		0x9C	/* On-chip RAM Peripheral Security */
+#define	L3REGS_SDRDATA		0xA0	/* SDRAM Data Peripheral Security */

Property changes on: user/ae/inet6/sys/arm/altera/socfpga/socfpga_l3regs.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: user/ae/inet6/sys/arm/altera/socfpga/socfpga_rstmgr.c
===================================================================
--- user/ae/inet6/sys/arm/altera/socfpga/socfpga_rstmgr.c	(nonexistent)
+++ user/ae/inet6/sys/arm/altera/socfpga/socfpga_rstmgr.c	(revision 271453)
@@ -0,0 +1,259 @@
+/*-
+ * Copyright (c) 2014 Ruslan Bukin <br@bsdpad.com>
+ * All rights reserved.
+ *
+ * This software was developed by SRI International and the University of
+ * Cambridge Computer Laboratory under DARPA/AFRL contract (FA8750-10-C-0237)
+ * ("CTSRD"), as part of the DARPA CRASH research programme.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * SOCFPGA Reset Manager.
+ * Chapter 3, Cyclone V Device Handbook (CV-5V2 2014.07.22)
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/malloc.h>
+#include <sys/rman.h>
+#include <sys/timeet.h>
+#include <sys/timetc.h>
+#include <sys/sysctl.h>
+
+#include <dev/fdt/fdt_common.h>
+#include <dev/ofw/openfirm.h>
+#include <dev/ofw/ofw_bus.h>
+#include <dev/ofw/ofw_bus_subr.h>
+
+#include <machine/bus.h>
+#include <machine/fdt.h>
+#include <machine/cpu.h>
+#include <machine/intr.h>
+
+#include <arm/altera/socfpga/socfpga_common.h>
+#include <arm/altera/socfpga/socfpga_rstmgr.h>
+#include <arm/altera/socfpga/socfpga_l3regs.h>
+
+struct rstmgr_softc {
+	struct resource		*res[1];	/* Filled per rstmgr_spec */
+	bus_space_tag_t		bst;		/* Bus tag of res[0] */
+	bus_space_handle_t	bsh;		/* Bus handle of res[0] */
+	device_t		dev;		/* Device set at attach */
+};
+
+struct rstmgr_softc *rstmgr_sc;	/* Set in attach, for rstmgr_warmreset() */
+
+static struct resource_spec rstmgr_spec[] = {
+	{ SYS_RES_MEMORY,	0,	RF_ACTIVE },	/* Memory resource 0 */
+	{ -1, 0 }
+};
+
+enum {					/* arg2 values of rstmgr_sysctl() */
+	RSTMGR_SYSCTL_FPGA2HPS,
+	RSTMGR_SYSCTL_LWHPS2FPGA,
+	RSTMGR_SYSCTL_HPS2FPGA
+};
+
+static int
+l3remap(struct rstmgr_softc *sc, int remap, int enable)
+{
+	/* L3REGS_REMAP is write-only, so keep the last value written. */
+	static uint32_t remap_shadow = REMAP_MPUZERO;
+	bus_space_handle_t vaddr;
+	phandle_t node;
+	uint32_t paddr;
+
+	/*
+	 * Control whether bridge is visible to L3 masters or not.
+	 * Every write replaces the whole register, so the bits of the
+	 * other bridges are carried over from the shadow copy.
+	 * Returns 0 on success, 1 if the register could not be written.
+	 */
+	node = OF_finddevice("l3regs");
+	if (node == -1) {
+		device_printf(sc->dev, "Can't find l3regs node\n");
+		return (1);
+	}
+
+	if (OF_getprop(node, "reg", &paddr, sizeof(paddr)) <= 0 ||
+	    bus_space_map(fdtbus_bs_tag, fdt32_to_cpu(paddr), 0x4, 0,
+	    &vaddr) != 0)
+		return (1);
+
+	if (enable)
+		remap_shadow |= remap;
+	else
+		remap_shadow &= ~remap;
+	bus_space_write_4(fdtbus_bs_tag, vaddr, L3REGS_REMAP, remap_shadow);
+	/* Drop the temporary mapping instead of leaking one per call. */
+	bus_space_unmap(fdtbus_bs_tag, vaddr, 0x4);
+
+	return (0);
+}
+
+static int
+rstmgr_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	struct rstmgr_softc *sc;
+	int brgbit, l3bit;
+	int modrst;
+	int value;
+	int error;
+
+	sc = arg1;
+
+	/* arg2 names the bridge; look up its reset bit and L3 remap bit. */
+	if (arg2 == RSTMGR_SYSCTL_FPGA2HPS) {
+		brgbit = BRGMODRST_FPGA2HPS;
+		l3bit = 0;
+	} else if (arg2 == RSTMGR_SYSCTL_LWHPS2FPGA) {
+		brgbit = BRGMODRST_LWHPS2FPGA;
+		l3bit = REMAP_LWHPS2FPGA;
+	} else if (arg2 == RSTMGR_SYSCTL_HPS2FPGA) {
+		brgbit = BRGMODRST_HPS2FPGA;
+		l3bit = REMAP_HPS2FPGA;
+	} else
+		return (1);
+
+	/* A bridge reads back as enabled while its reset bit is clear. */
+	modrst = READ4(sc, RSTMGR_BRGMODRST);
+	value = (modrst & brgbit) == 0;
+
+	error = sysctl_handle_int(oidp, &value, 0, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+
+	/* Only 0 (set the reset bit) and 1 (clear it) are accepted. */
+	switch (value) {
+	case 0:
+		modrst |= brgbit;
+		break;
+	case 1:
+		modrst &= ~brgbit;
+		break;
+	default:
+		return (EINVAL);
+	}
+
+	WRITE4(sc, RSTMGR_BRGMODRST, modrst);
+	l3remap(sc, l3bit, value);
+	return (0);
+}
+
+int
+rstmgr_warmreset(void)
+{
+	struct rstmgr_softc *sc = rstmgr_sc;
+
+	/*
+	 * Ask the reset manager for a warm reset.  Fails with 1 when no
+	 * rstmgr device has attached yet (rstmgr_sc is still NULL).
+	 */
+	if (sc != NULL) {
+		WRITE4(sc, RSTMGR_CTRL, CTRL_SWWARMRSTREQ);
+		return (0);
+	}
+	return (1);
+}
+
+static int
+rstmgr_add_sysctl(struct rstmgr_softc *sc)
+{
+	struct sysctl_oid_list *children;
+	struct sysctl_ctx_list *ctx;
+
+	ctx = device_get_sysctl_ctx(sc->dev);
+	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
+	/* One RW sysctl per bridge; arg2 picks it in rstmgr_sysctl(). */
+	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "fpga2hps",
+	    CTLTYPE_UINT | CTLFLAG_RW, sc, RSTMGR_SYSCTL_FPGA2HPS,
+	    rstmgr_sysctl, "I", "Enable fpga2hps bridge");
+	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "lwhps2fpga",
+	    CTLTYPE_UINT | CTLFLAG_RW, sc, RSTMGR_SYSCTL_LWHPS2FPGA,
+	    rstmgr_sysctl, "I", "Enable lwhps2fpga bridge");
+	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "hps2fpga",
+	    CTLTYPE_UINT | CTLFLAG_RW, sc, RSTMGR_SYSCTL_HPS2FPGA,
+	    rstmgr_sysctl, "I", "Enable hps2fpga bridge");
+	/* Registration results are not checked; always returns 0. */
+	return (0);
+}
+
+static int
+rstmgr_probe(device_t dev)
+{
+
+	/* Match only okay-status nodes compatible with "altr,rst-mgr". */
+	if (!ofw_bus_status_okay(dev) ||
+	    !ofw_bus_is_compatible(dev, "altr,rst-mgr"))
+		return (ENXIO);
+
+	device_set_desc(dev, "Reset Manager");
+
+	return (BUS_PROBE_DEFAULT);
+}
+
+static int
+rstmgr_attach(device_t dev)
+{
+	struct rstmgr_softc *sc = device_get_softc(dev);
+	struct resource *mem;
+
+	sc->dev = dev;
+	if (bus_alloc_resources(dev, rstmgr_spec, sc->res) != 0) {
+		device_printf(dev, "could not allocate resources\n");
+		return (ENXIO);
+	}
+
+	/* Cache the bus tag and handle of the memory resource. */
+	mem = sc->res[0];
+	sc->bst = rman_get_bustag(mem);
+	sc->bsh = rman_get_bushandle(mem);
+
+	/* Publish the softc for rstmgr_warmreset(), then add the sysctls. */
+	rstmgr_sc = sc;
+	rstmgr_add_sysctl(sc);
+	return (0);
+}
+
+static device_method_t rstmgr_methods[] = {
+	DEVMETHOD(device_probe,		rstmgr_probe),
+	DEVMETHOD(device_attach,	rstmgr_attach),
+	{ 0, 0 }	/* Table terminator; only probe and attach are set */
+};
+
+static driver_t rstmgr_driver = {
+	"rstmgr",
+	rstmgr_methods,
+	sizeof(struct rstmgr_softc),	/* Softc is a struct rstmgr_softc */
+};
+
+static devclass_t rstmgr_devclass;
+/* Register rstmgr_driver with simplebus given as the bus argument. */
+DRIVER_MODULE(rstmgr, simplebus, rstmgr_driver, rstmgr_devclass, 0, 0);

Property changes on: user/ae/inet6/sys/arm/altera/socfpga/socfpga_rstmgr.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: user/ae/inet6/sys/arm/altera/socfpga/socfpga_rstmgr.h
===================================================================
--- user/ae/inet6/sys/arm/altera/socfpga/socfpga_rstmgr.h	(nonexistent)
+++ user/ae/inet6/sys/arm/altera/socfpga/socfpga_rstmgr.h	(revision 271453)
@@ -0,0 +1,46 @@
+/*-
+ * Copyright (c) 2014 Ruslan Bukin <br@bsdpad.com>
+ * All rights reserved.
+ *
+ * This software was developed by SRI International and the University of
+ * Cambridge Computer Laboratory under DARPA/AFRL contract (FA8750-10-C-0237)
+ * ("CTSRD"), as part of the DARPA CRASH research programme.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#define	RSTMGR_STAT		0x0	/* Status */
+#define	RSTMGR_CTRL		0x4	/* Control */
+#define	 CTRL_SWWARMRSTREQ	(1 << 1) /* Trigger warm reset */
+#define	RSTMGR_COUNTS		0x8	/* Reset Cycles Count */
+#define	RSTMGR_MPUMODRST	0x10	/* MPU Module Reset */
+#define	RSTMGR_PERMODRST	0x14	/* Peripheral Module Reset */
+#define	RSTMGR_PER2MODRST	0x18	/* Peripheral 2 Module Reset */
+#define	RSTMGR_BRGMODRST	0x1C	/* Bridge Module Reset */
+#define	 BRGMODRST_FPGA2HPS	(1 << 2) /* 1 = bridge disabled */
+#define	 BRGMODRST_LWHPS2FPGA	(1 << 1) /* 1 = bridge disabled */
+#define	 BRGMODRST_HPS2FPGA	(1 << 0) /* 1 = bridge disabled */
+#define	RSTMGR_MISCMODRST	0x20	/* Miscellaneous Module Reset */
+
+int rstmgr_warmreset(void);	/* 0 if requested, 1 if no rstmgr attached */

Property changes on: user/ae/inet6/sys/arm/altera/socfpga/socfpga_rstmgr.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: user/ae/inet6/sys/boot/fdt/dts/arm/socfpga.dtsi
===================================================================
--- user/ae/inet6/sys/boot/fdt/dts/arm/socfpga.dtsi	(revision 271452)
+++ user/ae/inet6/sys/boot/fdt/dts/arm/socfpga.dtsi	(revision 271453)
@@ -1,119 +1,131 @@
 /*-
  * Copyright (c) 2014 Ruslan Bukin <br@bsdpad.com>
  * All rights reserved.
  *
  * This software was developed by SRI International and the University of
  * Cambridge Computer Laboratory under DARPA/AFRL contract (FA8750-10-C-0237)
  * ("CTSRD"), as part of the DARPA CRASH research programme.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 / {
 	compatible = "altr,socfpga";
 	#address-cells = <1>;
 	#size-cells = <1>;
 
 	interrupt-parent = <&GIC>;
 
 	aliases {
 		soc = &SOC;
+		rstmgr = &rstmgr;
+		l3regs = &l3regs;
 		serial0 = &serial0;
 		serial1 = &serial1;
 	};
 
 	SOC: socfpga {
 		#address-cells = <1>;
 		#size-cells = <1>;
 		compatible = "simple-bus";
 		ranges;
 		bus-frequency = <0>;
 
 		GIC: interrupt-controller@fffed000 {
 			compatible = "arm,gic";
 			reg = < 0xfffed000 0x1000 >, /* Distributor */
 			      < 0xfffec100 0x100 >; /* CPU Interface */
 			interrupt-controller;
 			#interrupt-cells = <1>;
 		};
 
 		mp_tmr@40002100 {
 			compatible = "arm,mpcore-timers";
 			clock-frequency = <200000000>;
 			#address-cells = <1>;
 			#size-cells = <0>;
 			reg = < 0xfffec200 0x100 >, /* Global Timer */
 			      < 0xfffec600 0x100 >; /* Private Timer */
 			interrupts = < 27 29 >;
 			interrupt-parent = < &GIC >;
+		};
+
+		rstmgr: rstmgr@ffd05000 {
+			compatible = "altr,rst-mgr";
+			reg = <0xffd05000 0x1000>;
+		};
+
+		l3regs: l3regs@ff800000 {
+			compatible = "altr,l3regs";
+			reg = <0xff800000 0x1000>;
 		};
 
 		fpgamgr: fpgamgr@ff706000 {
 			compatible = "altr,fpga-mgr";
 			reg = <0xff706000 0x1000>, /* FPGAMGRREGS */
 			      <0xffb90000 0x1000>; /* FPGAMGRDATA */
 			interrupts = < 207 >;
 			interrupt-parent = <&GIC>;
 		};
 
 		serial0: serial@ffc02000 {
 			compatible = "ns16550";
 			reg = <0xffc02000 0x1000>;
 			reg-shift = <2>;
 			interrupts = <194>;
 			interrupt-parent = <&GIC>;
 			current-speed = <115200>;
 			clock-frequency = < 100000000 >;
 			status = "disabled";
 		};
 
 		serial1: serial@ffc03000 {
 			compatible = "ns16550";
 			reg = <0xffc03000 0x1000>;
 			reg-shift = <2>;
 			interrupts = <195>;
 			interrupt-parent = <&GIC>;
 			current-speed = <115200>;
 			clock-frequency = < 100000000 >;
 			status = "disabled";
 		};
 
 		usb0: usb@ffb00000 {
 			compatible = "synopsys,designware-hs-otg2";
 			reg = <0xffb00000 0xffff>;
 			interrupts = <157>;
 			interrupt-parent = <&GIC>;
 			status = "disabled";
 		};
 
 		usb1: usb@ffb40000 {
 			compatible = "synopsys,designware-hs-otg2";
 			reg = <0xffb40000 0xffff>;
 			interrupts = <160>;
 			interrupt-parent = <&GIC>;
 			dr_mode = "host";
 			status = "disabled";
 		};
 	};
 };
Index: user/ae/inet6/sys/boot
===================================================================
--- user/ae/inet6/sys/boot	(revision 271452)
+++ user/ae/inet6/sys/boot	(revision 271453)

Property changes on: user/ae/inet6/sys/boot
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head/sys/boot:r271428-271452
Index: user/ae/inet6/sys/cam/ctl/ctl.c
===================================================================
--- user/ae/inet6/sys/cam/ctl/ctl.c	(revision 271452)
+++ user/ae/inet6/sys/cam/ctl/ctl.c	(revision 271453)
@@ -1,14266 +1,14323 @@
 /*-
  * Copyright (c) 2003-2009 Silicon Graphics International Corp.
  * Copyright (c) 2012 The FreeBSD Foundation
  * All rights reserved.
  *
  * Portions of this software were developed by Edward Tomasz Napierala
  * under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions, and the following disclaimer,
  *    without modification.
  * 2. Redistributions in binary form must reproduce at minimum a disclaimer
  *    substantially similar to the "NO WARRANTY" disclaimer below
  *    ("Disclaimer") and any redistribution must be conditioned upon
  *    including a substantially similar Disclaimer requirement for further
  *    binary redistribution.
  *
  * NO WARRANTY
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGES.
  *
  * $Id: //depot/users/kenm/FreeBSD-test2/sys/cam/ctl/ctl.c#8 $
  */
 /*
  * CAM Target Layer, a SCSI device emulation subsystem.
  *
  * Author: Ken Merry <ken@FreeBSD.org>
  */
 
 #define _CTL_C
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/types.h>
 #include <sys/kthread.h>
 #include <sys/bio.h>
 #include <sys/fcntl.h>
 #include <sys/lock.h>
 #include <sys/module.h>
 #include <sys/mutex.h>
 #include <sys/condvar.h>
 #include <sys/malloc.h>
 #include <sys/conf.h>
 #include <sys/ioccom.h>
 #include <sys/queue.h>
 #include <sys/sbuf.h>
 #include <sys/smp.h>
 #include <sys/endian.h>
 #include <sys/sysctl.h>
 
 #include <cam/cam.h>
 #include <cam/scsi/scsi_all.h>
 #include <cam/scsi/scsi_da.h>
 #include <cam/ctl/ctl_io.h>
 #include <cam/ctl/ctl.h>
 #include <cam/ctl/ctl_frontend.h>
 #include <cam/ctl/ctl_frontend_internal.h>
 #include <cam/ctl/ctl_util.h>
 #include <cam/ctl/ctl_backend.h>
 #include <cam/ctl/ctl_ioctl.h>
 #include <cam/ctl/ctl_ha.h>
 #include <cam/ctl/ctl_private.h>
 #include <cam/ctl/ctl_debug.h>
 #include <cam/ctl/ctl_scsi_all.h>
 #include <cam/ctl/ctl_error.h>
 
 struct ctl_softc *control_softc = NULL;
 
 /*
  * Size and alignment macros needed for Copan-specific HA hardware.  These
  * can go away when the HA code is re-written, and uses busdma for any
  * hardware.
  */
 #define	CTL_ALIGN_8B(target, source, type)				\
 	if (((uint32_t)source & 0x7) != 0)				\
 		target = (type)(source + (0x8 - ((uint32_t)source & 0x7)));\
 	else								\
 		target = (type)source;
 
 #define	CTL_SIZE_8B(target, size)					\
 	if ((size & 0x7) != 0)						\
 		target = size + (0x8 - (size & 0x7));			\
 	else								\
 		target = size;
 
 #define CTL_ALIGN_8B_MARGIN	16
 
 /*
  * Template mode pages.
  */
 
 /*
  * Note that these are default values only.  The actual values will be
  * filled in when the user does a mode sense.
  */
 static struct copan_power_subpage power_page_default = {
 	/*page_code*/ PWR_PAGE_CODE | SMPH_SPF,
 	/*subpage*/ PWR_SUBPAGE_CODE,
 	/*page_length*/ {(sizeof(struct copan_power_subpage) - 4) & 0xff00,
 			 (sizeof(struct copan_power_subpage) - 4) & 0x00ff},
 	/*page_version*/ PWR_VERSION,
 	/* total_luns */ 26,
 	/* max_active_luns*/ PWR_DFLT_MAX_LUNS,
 	/*reserved*/ {0, 0, 0, 0, 0, 0, 0, 0, 0,
 		      0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 		      0, 0, 0, 0, 0, 0}
 };
 
 static struct copan_power_subpage power_page_changeable = {
 	/*page_code*/ PWR_PAGE_CODE | SMPH_SPF,
 	/*subpage*/ PWR_SUBPAGE_CODE,
 	/*page_length*/ {(sizeof(struct copan_power_subpage) - 4) & 0xff00,
 			 (sizeof(struct copan_power_subpage) - 4) & 0x00ff},
 	/*page_version*/ 0,
 	/* total_luns */ 0,
 	/* max_active_luns*/ 0,
 	/*reserved*/ {0, 0, 0, 0, 0, 0, 0, 0, 0,
 		      0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 		      0, 0, 0, 0, 0, 0}
 };
 
 static struct copan_aps_subpage aps_page_default = {
 	APS_PAGE_CODE | SMPH_SPF, //page_code
 	APS_SUBPAGE_CODE, //subpage
 	{(sizeof(struct copan_aps_subpage) - 4) & 0xff00,
 	 (sizeof(struct copan_aps_subpage) - 4) & 0x00ff}, //page_length
 	APS_VERSION, //page_version
 	0, //lock_active
 	{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	0, 0, 0, 0, 0} //reserved
 };
 
 static struct copan_aps_subpage aps_page_changeable = {
 	APS_PAGE_CODE | SMPH_SPF, //page_code
 	APS_SUBPAGE_CODE, //subpage
 	{(sizeof(struct copan_aps_subpage) - 4) & 0xff00,
 	 (sizeof(struct copan_aps_subpage) - 4) & 0x00ff}, //page_length
 	0, //page_version
 	0, //lock_active
 	{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	0, 0, 0, 0, 0} //reserved
 };
 
 static struct copan_debugconf_subpage debugconf_page_default = {
 	DBGCNF_PAGE_CODE | SMPH_SPF,	/* page_code */
 	DBGCNF_SUBPAGE_CODE,		/* subpage */
 	{(sizeof(struct copan_debugconf_subpage) - 4) >> 8,
 	 (sizeof(struct copan_debugconf_subpage) - 4) >> 0}, /* page_length */
 	DBGCNF_VERSION,			/* page_version */
 	{CTL_TIME_IO_DEFAULT_SECS>>8,
 	 CTL_TIME_IO_DEFAULT_SECS>>0},	/* ctl_time_io_secs */
 };
 
 static struct copan_debugconf_subpage debugconf_page_changeable = {
 	DBGCNF_PAGE_CODE | SMPH_SPF,	/* page_code */
 	DBGCNF_SUBPAGE_CODE,		/* subpage */
 	{(sizeof(struct copan_debugconf_subpage) - 4) >> 8,
 	 (sizeof(struct copan_debugconf_subpage) - 4) >> 0}, /* page_length */
 	0,				/* page_version */
 	{0xff,0xff},			/* ctl_time_io_secs */
 };
 
 static struct scsi_format_page format_page_default = {
 	/*page_code*/SMS_FORMAT_DEVICE_PAGE,
 	/*page_length*/sizeof(struct scsi_format_page) - 2,
 	/*tracks_per_zone*/ {0, 0},
 	/*alt_sectors_per_zone*/ {0, 0},
 	/*alt_tracks_per_zone*/ {0, 0},
 	/*alt_tracks_per_lun*/ {0, 0},
 	/*sectors_per_track*/ {(CTL_DEFAULT_SECTORS_PER_TRACK >> 8) & 0xff,
 			        CTL_DEFAULT_SECTORS_PER_TRACK & 0xff},
 	/*bytes_per_sector*/ {0, 0},
 	/*interleave*/ {0, 0},
 	/*track_skew*/ {0, 0},
 	/*cylinder_skew*/ {0, 0},
 	/*flags*/ SFP_HSEC,
 	/*reserved*/ {0, 0, 0}
 };
 
 static struct scsi_format_page format_page_changeable = {
 	/*page_code*/SMS_FORMAT_DEVICE_PAGE,
 	/*page_length*/sizeof(struct scsi_format_page) - 2,
 	/*tracks_per_zone*/ {0, 0},
 	/*alt_sectors_per_zone*/ {0, 0},
 	/*alt_tracks_per_zone*/ {0, 0},
 	/*alt_tracks_per_lun*/ {0, 0},
 	/*sectors_per_track*/ {0, 0},
 	/*bytes_per_sector*/ {0, 0},
 	/*interleave*/ {0, 0},
 	/*track_skew*/ {0, 0},
 	/*cylinder_skew*/ {0, 0},
 	/*flags*/ 0,
 	/*reserved*/ {0, 0, 0}
 };
 
 static struct scsi_rigid_disk_page rigid_disk_page_default = {
 	/*page_code*/SMS_RIGID_DISK_PAGE,
 	/*page_length*/sizeof(struct scsi_rigid_disk_page) - 2,
 	/*cylinders*/ {0, 0, 0},
 	/*heads*/ CTL_DEFAULT_HEADS,
 	/*start_write_precomp*/ {0, 0, 0},
 	/*start_reduced_current*/ {0, 0, 0},
 	/*step_rate*/ {0, 0},
 	/*landing_zone_cylinder*/ {0, 0, 0},
 	/*rpl*/ SRDP_RPL_DISABLED,
 	/*rotational_offset*/ 0,
 	/*reserved1*/ 0,
 	/*rotation_rate*/ {(CTL_DEFAULT_ROTATION_RATE >> 8) & 0xff,
 			   CTL_DEFAULT_ROTATION_RATE & 0xff},
 	/*reserved2*/ {0, 0}
 };
 
 static struct scsi_rigid_disk_page rigid_disk_page_changeable = {
 	/*page_code*/SMS_RIGID_DISK_PAGE,
 	/*page_length*/sizeof(struct scsi_rigid_disk_page) - 2,
 	/*cylinders*/ {0, 0, 0},
 	/*heads*/ 0,
 	/*start_write_precomp*/ {0, 0, 0},
 	/*start_reduced_current*/ {0, 0, 0},
 	/*step_rate*/ {0, 0},
 	/*landing_zone_cylinder*/ {0, 0, 0},
 	/*rpl*/ 0,
 	/*rotational_offset*/ 0,
 	/*reserved1*/ 0,
 	/*rotation_rate*/ {0, 0},
 	/*reserved2*/ {0, 0}
 };
 
 static struct scsi_caching_page caching_page_default = {
 	/*page_code*/SMS_CACHING_PAGE,
 	/*page_length*/sizeof(struct scsi_caching_page) - 2,
 	/*flags1*/ SCP_DISC | SCP_WCE,
 	/*ret_priority*/ 0,
 	/*disable_pf_transfer_len*/ {0xff, 0xff},
 	/*min_prefetch*/ {0, 0},
 	/*max_prefetch*/ {0xff, 0xff},
 	/*max_pf_ceiling*/ {0xff, 0xff},
 	/*flags2*/ 0,
 	/*cache_segments*/ 0,
 	/*cache_seg_size*/ {0, 0},
 	/*reserved*/ 0,
 	/*non_cache_seg_size*/ {0, 0, 0}
 };
 
 static struct scsi_caching_page caching_page_changeable = {
 	/*page_code*/SMS_CACHING_PAGE,
 	/*page_length*/sizeof(struct scsi_caching_page) - 2,
 	/*flags1*/ SCP_WCE | SCP_RCD,
 	/*ret_priority*/ 0,
 	/*disable_pf_transfer_len*/ {0, 0},
 	/*min_prefetch*/ {0, 0},
 	/*max_prefetch*/ {0, 0},
 	/*max_pf_ceiling*/ {0, 0},
 	/*flags2*/ 0,
 	/*cache_segments*/ 0,
 	/*cache_seg_size*/ {0, 0},
 	/*reserved*/ 0,
 	/*non_cache_seg_size*/ {0, 0, 0}
 };
 
 static struct scsi_control_page control_page_default = {
 	/*page_code*/SMS_CONTROL_MODE_PAGE,
 	/*page_length*/sizeof(struct scsi_control_page) - 2,
 	/*rlec*/0,
 	/*queue_flags*/0,
 	/*eca_and_aen*/0,
 	/*flags4*/SCP_TAS,
 	/*aen_holdoff_period*/{0, 0},
 	/*busy_timeout_period*/{0, 0},
 	/*extended_selftest_completion_time*/{0, 0}
 };
 
 static struct scsi_control_page control_page_changeable = {
 	/*page_code*/SMS_CONTROL_MODE_PAGE,
 	/*page_length*/sizeof(struct scsi_control_page) - 2,
 	/*rlec*/SCP_DSENSE,
 	/*queue_flags*/0,
 	/*eca_and_aen*/0,
 	/*flags4*/0,
 	/*aen_holdoff_period*/{0, 0},
 	/*busy_timeout_period*/{0, 0},
 	/*extended_selftest_completion_time*/{0, 0}
 };
 
 
 /*
  * XXX KDM move these into the softc.
  */
 static int rcv_sync_msg;
 static int persis_offset;
 static uint8_t ctl_pause_rtr;
 static int     ctl_is_single = 1;
 static int     index_to_aps_page;
 
 SYSCTL_NODE(_kern_cam, OID_AUTO, ctl, CTLFLAG_RD, 0, "CAM Target Layer");
 static int worker_threads = -1;
 SYSCTL_INT(_kern_cam_ctl, OID_AUTO, worker_threads, CTLFLAG_RDTUN,
     &worker_threads, 1, "Number of worker threads");
 static int verbose = 0;
 SYSCTL_INT(_kern_cam_ctl, OID_AUTO, verbose, CTLFLAG_RWTUN,
     &verbose, 0, "Show SCSI errors returned to initiator");
 
 /*
  * Supported pages (0x00), Serial number (0x80), Device ID (0x83),
- * Mode Page Policy (0x87),
+ * Extended INQUIRY Data (0x86), Mode Page Policy (0x87),
  * SCSI Ports (0x88), Third-party Copy (0x8F), Block limits (0xB0),
  * Block Device Characteristics (0xB1) and Logical Block Provisioning (0xB2)
  */
-#define SCSI_EVPD_NUM_SUPPORTED_PAGES	9
+#define SCSI_EVPD_NUM_SUPPORTED_PAGES	10
 
 static void ctl_isc_event_handler(ctl_ha_channel chanel, ctl_ha_event event,
 				  int param);
 static void ctl_copy_sense_data(union ctl_ha_msg *src, union ctl_io *dest);
 static int ctl_init(void);
 void ctl_shutdown(void);
 static int ctl_open(struct cdev *dev, int flags, int fmt, struct thread *td);
 static int ctl_close(struct cdev *dev, int flags, int fmt, struct thread *td);
 static void ctl_ioctl_online(void *arg);
 static void ctl_ioctl_offline(void *arg);
 static int ctl_ioctl_lun_enable(void *arg, struct ctl_id targ_id, int lun_id);
 static int ctl_ioctl_lun_disable(void *arg, struct ctl_id targ_id, int lun_id);
 static int ctl_ioctl_do_datamove(struct ctl_scsiio *ctsio);
 static int ctl_serialize_other_sc_cmd(struct ctl_scsiio *ctsio);
 static int ctl_ioctl_submit_wait(union ctl_io *io);
 static void ctl_ioctl_datamove(union ctl_io *io);
 static void ctl_ioctl_done(union ctl_io *io);
 static void ctl_ioctl_hard_startstop_callback(void *arg,
 					      struct cfi_metatask *metatask);
 static void ctl_ioctl_bbrread_callback(void *arg,struct cfi_metatask *metatask);
 static int ctl_ioctl_fill_ooa(struct ctl_lun *lun, uint32_t *cur_fill_num,
 			      struct ctl_ooa *ooa_hdr,
 			      struct ctl_ooa_entry *kern_entries);
 static int ctl_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag,
 		     struct thread *td);
 static uint32_t ctl_map_lun(int port_num, uint32_t lun);
 static uint32_t ctl_map_lun_back(int port_num, uint32_t lun);
 #ifdef unused
 static union ctl_io *ctl_malloc_io(ctl_io_type io_type, uint32_t targ_port,
 				   uint32_t targ_target, uint32_t targ_lun,
 				   int can_wait);
 static void ctl_kfree_io(union ctl_io *io);
 #endif /* unused */
 static int ctl_alloc_lun(struct ctl_softc *ctl_softc, struct ctl_lun *lun,
 			 struct ctl_be_lun *be_lun, struct ctl_id target_id);
 static int ctl_free_lun(struct ctl_lun *lun);
 static void ctl_create_lun(struct ctl_be_lun *be_lun);
 /**
 static void ctl_failover_change_pages(struct ctl_softc *softc,
 				      struct ctl_scsiio *ctsio, int master);
 **/
 
 static int ctl_do_mode_select(union ctl_io *io);
 static int ctl_pro_preempt(struct ctl_softc *softc, struct ctl_lun *lun,
 			   uint64_t res_key, uint64_t sa_res_key,
 			   uint8_t type, uint32_t residx,
 			   struct ctl_scsiio *ctsio,
 			   struct scsi_per_res_out *cdb,
 			   struct scsi_per_res_out_parms* param);
 static void ctl_pro_preempt_other(struct ctl_lun *lun,
 				  union ctl_ha_msg *msg);
 static void ctl_hndl_per_res_out_on_other_sc(union ctl_ha_msg *msg);
 static int ctl_inquiry_evpd_supported(struct ctl_scsiio *ctsio, int alloc_len);
 static int ctl_inquiry_evpd_serial(struct ctl_scsiio *ctsio, int alloc_len);
 static int ctl_inquiry_evpd_devid(struct ctl_scsiio *ctsio, int alloc_len);
+static int ctl_inquiry_evpd_eid(struct ctl_scsiio *ctsio, int alloc_len);
 static int ctl_inquiry_evpd_mpp(struct ctl_scsiio *ctsio, int alloc_len);
 static int ctl_inquiry_evpd_scsi_ports(struct ctl_scsiio *ctsio,
 					 int alloc_len);
 static int ctl_inquiry_evpd_block_limits(struct ctl_scsiio *ctsio,
 					 int alloc_len);
 static int ctl_inquiry_evpd_bdc(struct ctl_scsiio *ctsio, int alloc_len);
 static int ctl_inquiry_evpd_lbp(struct ctl_scsiio *ctsio, int alloc_len);
 static int ctl_inquiry_evpd(struct ctl_scsiio *ctsio);
 static int ctl_inquiry_std(struct ctl_scsiio *ctsio);
 static int ctl_get_lba_len(union ctl_io *io, uint64_t *lba, uint32_t *len);
 static ctl_action ctl_extent_check(union ctl_io *io1, union ctl_io *io2);
 static ctl_action ctl_check_for_blockage(union ctl_io *pending_io,
 					 union ctl_io *ooa_io);
 static ctl_action ctl_check_ooa(struct ctl_lun *lun, union ctl_io *pending_io,
 				union ctl_io *starting_io);
 static int ctl_check_blocked(struct ctl_lun *lun);
 static int ctl_scsiio_lun_check(struct ctl_softc *ctl_softc,
 				struct ctl_lun *lun,
 				const struct ctl_cmd_entry *entry,
 				struct ctl_scsiio *ctsio);
 //static int ctl_check_rtr(union ctl_io *pending_io, struct ctl_softc *softc);
 static void ctl_failover(void);
 static int ctl_scsiio_precheck(struct ctl_softc *ctl_softc,
 			       struct ctl_scsiio *ctsio);
 static int ctl_scsiio(struct ctl_scsiio *ctsio);
 
 static int ctl_bus_reset(struct ctl_softc *ctl_softc, union ctl_io *io);
 static int ctl_target_reset(struct ctl_softc *ctl_softc, union ctl_io *io,
 			    ctl_ua_type ua_type);
 static int ctl_lun_reset(struct ctl_lun *lun, union ctl_io *io,
 			 ctl_ua_type ua_type);
 static int ctl_abort_task(union ctl_io *io);
 static int ctl_abort_task_set(union ctl_io *io);
 static int ctl_i_t_nexus_reset(union ctl_io *io);
 static void ctl_run_task(union ctl_io *io);
 #ifdef CTL_IO_DELAY
 static void ctl_datamove_timer_wakeup(void *arg);
 static void ctl_done_timer_wakeup(void *arg);
 #endif /* CTL_IO_DELAY */
 
 static void ctl_send_datamove_done(union ctl_io *io, int have_lock);
 static void ctl_datamove_remote_write_cb(struct ctl_ha_dt_req *rq);
 static int ctl_datamove_remote_dm_write_cb(union ctl_io *io);
 static void ctl_datamove_remote_write(union ctl_io *io);
 static int ctl_datamove_remote_dm_read_cb(union ctl_io *io);
 static void ctl_datamove_remote_read_cb(struct ctl_ha_dt_req *rq);
 static int ctl_datamove_remote_sgl_setup(union ctl_io *io);
 static int ctl_datamove_remote_xfer(union ctl_io *io, unsigned command,
 				    ctl_ha_dt_cb callback);
 static void ctl_datamove_remote_read(union ctl_io *io);
 static void ctl_datamove_remote(union ctl_io *io);
 static int ctl_process_done(union ctl_io *io);
 static void ctl_lun_thread(void *arg);
 static void ctl_work_thread(void *arg);
 static void ctl_enqueue_incoming(union ctl_io *io);
 static void ctl_enqueue_rtr(union ctl_io *io);
 static void ctl_enqueue_done(union ctl_io *io);
 static void ctl_enqueue_isc(union ctl_io *io);
 static const struct ctl_cmd_entry *
     ctl_get_cmd_entry(struct ctl_scsiio *ctsio);
 static const struct ctl_cmd_entry *
     ctl_validate_command(struct ctl_scsiio *ctsio);
 static int ctl_cmd_applicable(uint8_t lun_type,
     const struct ctl_cmd_entry *entry);
 
 /*
  * Load the serialization table.  This isn't very pretty, but is probably
  * the easiest way to do it.
  */
 #include "ctl_ser_table.c"
 
 /*
  * We only need to define open, close and ioctl routines for this driver.
  */
 static struct cdevsw ctl_cdevsw = {
 	.d_version =	D_VERSION,
 	.d_flags =	0,
 	.d_open =	ctl_open,
 	.d_close =	ctl_close,
 	.d_ioctl =	ctl_ioctl,
 	.d_name =	"ctl",
 };
 
 
 /* malloc(9) types used for general CTL memory and for CTL I/O requests. */
 MALLOC_DEFINE(M_CTL, "ctlmem", "Memory used for CTL");
 MALLOC_DEFINE(M_CTLIO, "ctlio", "Memory used for CTL requests");
 
 static int ctl_module_event_handler(module_t, int /*modeventtype_t*/, void *);
 
 /*
  * Module description: name, event handler and (unused) private argument.
  */
 static moduledata_t ctl_moduledata = {
 	"ctl",
 	ctl_module_event_handler,
 	NULL
 };
 
 DECLARE_MODULE(ctl, ctl_moduledata, SI_SUB_CONFIGURE, SI_ORDER_THIRD);
 MODULE_VERSION(ctl, 1);
 
 /*
  * Frontend descriptor for the built-in ioctl front end; it is registered
  * from ctl_init() and deregistered from ctl_shutdown().
  */
 static struct ctl_frontend ioctl_frontend =
 {
 	.name = "ioctl",
 };
 
 static void
 ctl_isc_handler_finish_xfer(struct ctl_softc *ctl_softc,
 			    union ctl_ha_msg *msg_info)
 {
 	struct ctl_scsiio *ctsio;
 
 	if (msg_info->hdr.original_sc == NULL) {
 		printf("%s: original_sc == NULL!\n", __func__);
 		/* XXX KDM now what? */
 		return;
 	}
 
 	ctsio = &msg_info->hdr.original_sc->scsiio;
 	ctsio->io_hdr.flags |= CTL_FLAG_IO_ACTIVE;
 	ctsio->io_hdr.msg_type = CTL_MSG_FINISH_IO;
 	ctsio->io_hdr.status = msg_info->hdr.status;
 	ctsio->scsi_status = msg_info->scsi.scsi_status;
 	ctsio->sense_len = msg_info->scsi.sense_len;
 	ctsio->sense_residual = msg_info->scsi.sense_residual;
 	ctsio->residual = msg_info->scsi.residual;
 	memcpy(&ctsio->sense_data, &msg_info->scsi.sense_data,
 	       sizeof(ctsio->sense_data));
 	memcpy(&ctsio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN].bytes,
 	       &msg_info->scsi.lbalen, sizeof(msg_info->scsi.lbalen));
 	ctl_enqueue_isc((union ctl_io *)ctsio);
 }
 
 static void
 ctl_isc_handler_finish_ser_only(struct ctl_softc *ctl_softc,
 				union ctl_ha_msg *msg_info)
 {
 	struct ctl_scsiio *ctsio;
 
 	if (msg_info->hdr.serializing_sc == NULL) {
 		printf("%s: serializing_sc == NULL!\n", __func__);
 		/* XXX KDM now what? */
 		return;
 	}
 
 	ctsio = &msg_info->hdr.serializing_sc->scsiio;
 #if 0
 	/*
 	 * Attempt to catch the situation where an I/O has
 	 * been freed, and we're using it again.
 	 */
 	if (ctsio->io_hdr.io_type == 0xff) {
 		union ctl_io *tmp_io;
 		tmp_io = (union ctl_io *)ctsio;
 		printf("%s: %p use after free!\n", __func__,
 		       ctsio);
 		printf("%s: type %d msg %d cdb %x iptl: "
 		       "%d:%d:%d:%d tag 0x%04x "
 		       "flag %#x status %x\n",
 			__func__,
 			tmp_io->io_hdr.io_type,
 			tmp_io->io_hdr.msg_type,
 			tmp_io->scsiio.cdb[0],
 			tmp_io->io_hdr.nexus.initid.id,
 			tmp_io->io_hdr.nexus.targ_port,
 			tmp_io->io_hdr.nexus.targ_target.id,
 			tmp_io->io_hdr.nexus.targ_lun,
 			(tmp_io->io_hdr.io_type ==
 			CTL_IO_TASK) ?
 			tmp_io->taskio.tag_num :
 			tmp_io->scsiio.tag_num,
 		        tmp_io->io_hdr.flags,
 			tmp_io->io_hdr.status);
 	}
 #endif
 	ctsio->io_hdr.msg_type = CTL_MSG_FINISH_IO;
 	ctl_enqueue_isc((union ctl_io *)ctsio);
 }
 
 /*
  * ISC (Inter Shelf Communication) event handler.  Events from the HA
  * subsystem come in here.
  *
  * The channel argument is not examined; messages are always received on
  * CTL_HA_CHAN_CTL.  The event argument selects what happens:
  *
  *  - CTL_HA_EVT_MSG_RECV: one ctl_ha_msg is received without waiting and
  *    dispatched on its hdr.msg_type (see the switch below).
  *  - CTL_HA_EVT_MSG_SENT: param carries the send status; anything other
  *    than CTL_HA_STATUS_SUCCESS is logged.
  *  - CTL_HA_EVT_DISCONNECT and any unknown event: logged only.
  */
 static void
 ctl_isc_event_handler(ctl_ha_channel channel, ctl_ha_event event, int param)
 {
 	struct ctl_softc *ctl_softc;
 	union ctl_io *io;
 	struct ctl_prio *presio;
 	ctl_ha_status isc_status;
 
 	ctl_softc = control_softc;
 	io = NULL;
 
 
 #if 0
 	printf("CTL: Isc Msg event %d\n", event);
 #endif
 	if (event == CTL_HA_EVT_MSG_RECV) {
 		union ctl_ha_msg msg_info;
 
 		isc_status = ctl_ha_msg_recv(CTL_HA_CHAN_CTL, &msg_info,
 					     sizeof(msg_info), /*wait*/ 0);
 #if 0
 		printf("CTL: msg_type %d\n", msg_info.msg_type);
 #endif
 		if (isc_status != 0) {
 			printf("Error receiving message, status = %d\n",
 			       isc_status);
 			return;
 		}
 
 		switch (msg_info.hdr.msg_type) {
 		/*
 		 * The other SC asks us to serialize a command: build a new
 		 * SCSI I/O from the message and queue it.
 		 */
 		case CTL_MSG_SERIALIZE:
 #if 0
 			printf("Serialize\n");
 #endif
 			io = ctl_alloc_io((void *)ctl_softc->othersc_pool);
 			if (io == NULL) {
 				printf("ctl_isc_event_handler: can't allocate "
 				       "ctl_io!\n");
 				/* Bad Juju */
 				/* Need to set busy and send msg back */
 				msg_info.hdr.msg_type = CTL_MSG_BAD_JUJU;
 				msg_info.hdr.status = CTL_SCSI_ERROR;
 				msg_info.scsi.scsi_status = SCSI_STATUS_BUSY;
 				msg_info.scsi.sense_len = 0;
 				/*
 				 * NOTE(review): the send status is tested but
 				 * the body is empty, so a failed send is
 				 * silently ignored.
 				 */
 			        if (ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info,
 				    sizeof(msg_info), 0) > CTL_HA_STATUS_SUCCESS){
 				}
 				goto bailout;
 			}
 			ctl_zero_io(io);
 			// populate ctsio from msg_info
 			io->io_hdr.io_type = CTL_IO_SCSI;
 			io->io_hdr.msg_type = CTL_MSG_SERIALIZE;
 			io->io_hdr.original_sc = msg_info.hdr.original_sc;
 #if 0
 			printf("pOrig %x\n", (int)msg_info.original_sc);
 #endif
 			io->io_hdr.flags |= CTL_FLAG_FROM_OTHER_SC |
 					    CTL_FLAG_IO_ACTIVE;
 			/*
 			 * If we're in serialization-only mode, we don't
 			 * want to go through full done processing.  Thus
 			 * the COPY flag.
 			 *
 			 * XXX KDM add another flag that is more specific.
 			 */
 			if (ctl_softc->ha_mode == CTL_HA_MODE_SER_ONLY)
 				io->io_hdr.flags |= CTL_FLAG_INT_COPY;
 			io->io_hdr.nexus = msg_info.hdr.nexus;
 #if 0
 			printf("targ %d, port %d, iid %d, lun %d\n",
 			       io->io_hdr.nexus.targ_target.id,
 			       io->io_hdr.nexus.targ_port,
 			       io->io_hdr.nexus.initid.id,
 			       io->io_hdr.nexus.targ_lun);
 #endif
 			io->scsiio.tag_num = msg_info.scsi.tag_num;
 			io->scsiio.tag_type = msg_info.scsi.tag_type;
 			memcpy(io->scsiio.cdb, msg_info.scsi.cdb,
 			       CTL_MAX_CDBLEN);
 			/*
 			 * In XFER mode the data direction flags are taken
 			 * from the command table entry for this CDB.
 			 */
 			if (ctl_softc->ha_mode == CTL_HA_MODE_XFER) {
 				const struct ctl_cmd_entry *entry;
 
 				entry = ctl_get_cmd_entry(&io->scsiio);
 				io->io_hdr.flags &= ~CTL_FLAG_DATA_MASK;
 				io->io_hdr.flags |=
 					entry->flags & CTL_FLAG_DATA_MASK;
 			}
 			ctl_enqueue_isc(io);
 			break;
 
 		/* Performed on the Originating SC, XFER mode only */
 		case CTL_MSG_DATAMOVE: {
 			struct ctl_sg_entry *sgl;
 			int i, j;
 
 			io = msg_info.hdr.original_sc;
 			if (io == NULL) {
 				printf("%s: original_sc == NULL!\n", __func__);
 				/* XXX KDM do something here */
 				break;
 			}
 			io->io_hdr.msg_type = CTL_MSG_DATAMOVE;
 			io->io_hdr.flags |= CTL_FLAG_IO_ACTIVE;
 			/*
 			 * Keep track of this, we need to send it back over
 			 * when the datamove is complete.
 			 */
 			io->io_hdr.serializing_sc = msg_info.hdr.serializing_sc;
 
 			/*
 			 * The first message of a sequence carries the overall
 			 * transfer description; later messages only append
 			 * S/G entries to the list set up here.
 			 */
 			if (msg_info.dt.sg_sequence == 0) {
 				/*
 				 * XXX KDM we use the preallocated S/G list
 				 * here, but we'll need to change this to
 				 * dynamic allocation if we need larger S/G
 				 * lists.
 				 */
 				if (msg_info.dt.kern_sg_entries >
 				    sizeof(io->io_hdr.remote_sglist) /
 				    sizeof(io->io_hdr.remote_sglist[0])) {
 					printf("%s: number of S/G entries "
 					    "needed %u > allocated num %zd\n",
 					    __func__,
 					    msg_info.dt.kern_sg_entries,
 					    sizeof(io->io_hdr.remote_sglist)/
 					    sizeof(io->io_hdr.remote_sglist[0]));
 				
 					/*
 					 * XXX KDM send a message back to
 					 * the other side to shut down the
 					 * DMA.  The error will come back
 					 * through via the normal channel.
 					 */
 					break;
 				}
 				sgl = io->io_hdr.remote_sglist;
 				memset(sgl, 0,
 				       sizeof(io->io_hdr.remote_sglist));
 
 				io->scsiio.kern_data_ptr = (uint8_t *)sgl;
 
 				io->scsiio.kern_sg_entries =
 					msg_info.dt.kern_sg_entries;
 				io->scsiio.rem_sg_entries =
 					msg_info.dt.kern_sg_entries;
 				io->scsiio.kern_data_len =
 					msg_info.dt.kern_data_len;
 				io->scsiio.kern_total_len =
 					msg_info.dt.kern_total_len;
 				io->scsiio.kern_data_resid =
 					msg_info.dt.kern_data_resid;
 				io->scsiio.kern_rel_offset =
 					msg_info.dt.kern_rel_offset;
 				/*
 				 * Clear out per-DMA flags.
 				 */
 				io->io_hdr.flags &= ~CTL_FLAG_RDMA_MASK;
 				/*
 				 * Add per-DMA flags that are set for this
 				 * particular DMA request.
 				 */
 				io->io_hdr.flags |= msg_info.dt.flags &
 						    CTL_FLAG_RDMA_MASK;
 			} else
 				sgl = (struct ctl_sg_entry *)
 					io->scsiio.kern_data_ptr;
 
 			/*
 			 * Copy this message's S/G entries into place.
 			 *
 			 * NOTE(review): the destination index range
 			 * [sent_sg_entries, sent_sg_entries + cur_sg_entries)
 			 * comes from the message and is not checked against
 			 * the size of the S/G list here; only kern_sg_entries
 			 * is validated, and only when sg_sequence == 0.
 			 */
 			for (i = msg_info.dt.sent_sg_entries, j = 0;
 			     i < (msg_info.dt.sent_sg_entries +
 			     msg_info.dt.cur_sg_entries); i++, j++) {
 				sgl[i].addr = msg_info.dt.sg_list[j].addr;
 				sgl[i].len = msg_info.dt.sg_list[j].len;
 
 #if 0
 				printf("%s: L: %p,%d -> %p,%d j=%d, i=%d\n",
 				       __func__,
 				       msg_info.dt.sg_list[j].addr,
 				       msg_info.dt.sg_list[j].len,
 				       sgl[i].addr, sgl[i].len, j, i);
 #endif
 			}
 #if 0
 			memcpy(&sgl[msg_info.dt.sent_sg_entries],
 			       msg_info.dt.sg_list,
 			       sizeof(*sgl) * msg_info.dt.cur_sg_entries);
 #endif
 
 			/*
 			 * If this is the last piece of the I/O, we've got
 			 * the full S/G list.  Queue processing in the thread.
 			 * Otherwise wait for the next piece.
 			 */
 			if (msg_info.dt.sg_last != 0)
 				ctl_enqueue_isc(io);
 			break;
 		}
 		/* Performed on the Serializing (primary) SC, XFER mode only */
 		case CTL_MSG_DATAMOVE_DONE: {
 			if (msg_info.hdr.serializing_sc == NULL) {
 				printf("%s: serializing_sc == NULL!\n",
 				       __func__);
 				/* XXX KDM now what? */
 				break;
 			}
 			/*
 			 * We grab the sense information here in case
 			 * there was a failure, so we can return status
 			 * back to the initiator.
 			 */
 			io = msg_info.hdr.serializing_sc;
 			io->io_hdr.msg_type = CTL_MSG_DATAMOVE_DONE;
 			io->io_hdr.status = msg_info.hdr.status;
 			io->scsiio.scsi_status = msg_info.scsi.scsi_status;
 			io->scsiio.sense_len = msg_info.scsi.sense_len;
 			io->scsiio.sense_residual =msg_info.scsi.sense_residual;
 			io->io_hdr.port_status = msg_info.scsi.fetd_status;
 			io->scsiio.residual = msg_info.scsi.residual;
 			memcpy(&io->scsiio.sense_data,&msg_info.scsi.sense_data,
 			       sizeof(io->scsiio.sense_data));
 			ctl_enqueue_isc(io);
 			break;
 		}
 
 		/* Performed on Originating SC, SER_ONLY mode */
 		case CTL_MSG_R2R:
 			io = msg_info.hdr.original_sc;
 			if (io == NULL) {
 				printf("%s: Major Bummer\n", __func__);
 				return;
 			} else {
 #if 0
 				printf("pOrig %x\n",(int) ctsio);
 #endif
 			}
 			io->io_hdr.msg_type = CTL_MSG_R2R;
 			io->io_hdr.serializing_sc = msg_info.hdr.serializing_sc;
 			ctl_enqueue_isc(io);
 			break;
 
 		/*
 		 * Performed on Serializing(i.e. primary SC) SC in SER_ONLY
 		 * mode.
 		 * Performed on the Originating (i.e. secondary) SC in XFER
 		 * mode
 		 */
 		case CTL_MSG_FINISH_IO:
 			if (ctl_softc->ha_mode == CTL_HA_MODE_XFER)
 				ctl_isc_handler_finish_xfer(ctl_softc,
 							    &msg_info);
 			else
 				ctl_isc_handler_finish_ser_only(ctl_softc,
 								&msg_info);
 			break;
 
 		/* Performed on Originating SC */
 		case CTL_MSG_BAD_JUJU:
 			io = msg_info.hdr.original_sc;
 			if (io == NULL) {
 				printf("%s: Bad JUJU!, original_sc is NULL!\n",
 				       __func__);
 				break;
 			}
 			ctl_copy_sense_data(&msg_info, io);
 			/*
 			 * IO should have already been cleaned up on other
 			 * SC so clear this flag so we won't send a message
 			 * back to finish the IO there.
 			 */
 			io->io_hdr.flags &= ~CTL_FLAG_SENT_2OTHER_SC;
 			io->io_hdr.flags |= CTL_FLAG_IO_ACTIVE;
 
 			/* io = msg_info.hdr.serializing_sc; */
 			io->io_hdr.msg_type = CTL_MSG_BAD_JUJU;
 			ctl_enqueue_isc(io);
 			break;
 
 		/* Handle resets sent from the other side */
 		case CTL_MSG_MANAGE_TASKS: {
 			struct ctl_taskio *taskio;
 			taskio = (struct ctl_taskio *)ctl_alloc_io(
 				(void *)ctl_softc->othersc_pool);
 			if (taskio == NULL) {
 				printf("ctl_isc_event_handler: can't allocate "
 				       "ctl_io!\n");
 				/* Bad Juju */
 				/* should I just call the proper reset func
 				   here??? */
 				goto bailout;
 			}
 			ctl_zero_io((union ctl_io *)taskio);
 			taskio->io_hdr.io_type = CTL_IO_TASK;
 			taskio->io_hdr.flags |= CTL_FLAG_FROM_OTHER_SC;
 			taskio->io_hdr.nexus = msg_info.hdr.nexus;
 			taskio->task_action = msg_info.task.task_action;
 			taskio->tag_num = msg_info.task.tag_num;
 			taskio->tag_type = msg_info.task.tag_type;
 #ifdef CTL_TIME_IO
 			taskio->io_hdr.start_time = time_uptime;
 			getbintime(&taskio->io_hdr.start_bt);
 #if 0
 			cs_prof_gettime(&taskio->io_hdr.start_ticks);
 #endif
 #endif /* CTL_TIME_IO */
 			/* Task I/O is run directly here rather than queued. */
 			ctl_run_task((union ctl_io *)taskio);
 			break;
 		}
 		/* Persistent Reserve action which needs attention */
 		case CTL_MSG_PERS_ACTION:
 			presio = (struct ctl_prio *)ctl_alloc_io(
 				(void *)ctl_softc->othersc_pool);
 			if (presio == NULL) {
 				printf("ctl_isc_event_handler: can't allocate "
 				       "ctl_io!\n");
 				/* Bad Juju */
 				/* Need to set busy and send msg back */
 				goto bailout;
 			}
 			ctl_zero_io((union ctl_io *)presio);
 			presio->io_hdr.msg_type = CTL_MSG_PERS_ACTION;
 			presio->pr_msg = msg_info.pr;
 			ctl_enqueue_isc((union ctl_io *)presio);
 			break;
 		/* Record that the peer's frontend sync message has arrived. */
 		case CTL_MSG_SYNC_FE:
 			rcv_sync_msg = 1;
 			break;
 		case CTL_MSG_APS_LOCK: {
 			// It's quicker to execute this than to
 			// queue it.
 			struct ctl_lun *lun;
 			struct ctl_page_index *page_index;
 			struct copan_aps_subpage *current_sp;
 			uint32_t targ_lun;
 
 			/*
 			 * NOTE(review): targ_lun comes from the message and
 			 * is used to index ctl_luns without a range check,
 			 * and the resulting lun pointer is dereferenced
 			 * without a NULL check.
 			 */
 			targ_lun = msg_info.hdr.nexus.targ_mapped_lun;
 			lun = ctl_softc->ctl_luns[targ_lun];
 			mtx_lock(&lun->lun_lock);
 			page_index = &lun->mode_pages.index[index_to_aps_page];
 			current_sp = (struct copan_aps_subpage *)
 				     (page_index->page_data +
 				     (page_index->page_len * CTL_PAGE_CURRENT));
 
 			current_sp->lock_active = msg_info.aps.lock_flag;
 			mtx_unlock(&lun->lun_lock);
 		        break;
 		}
 		default:
 		        printf("How did I get here?\n");
 		}
 	} else if (event == CTL_HA_EVT_MSG_SENT) {
 		if (param != CTL_HA_STATUS_SUCCESS) {
 			printf("Bad status from ctl_ha_msg_send status %d\n",
 			       param);
 		}
 		return;
 	} else if (event == CTL_HA_EVT_DISCONNECT) {
 		printf("CTL: Got a disconnect from Isc\n");
 		return;
 	} else {
 		printf("ctl_isc_event_handler: Unknown event %d\n", event);
 		return;
 	}
 
 bailout:
 	return;
 }
 
 static void
 ctl_copy_sense_data(union ctl_ha_msg *src, union ctl_io *dest)
 {
 	struct scsi_sense_data *sense;
 
 	sense = &dest->scsiio.sense_data;
 	bcopy(&src->scsi.sense_data, sense, sizeof(*sense));
 	dest->scsiio.scsi_status = src->scsi.scsi_status;
 	dest->scsiio.sense_len = src->scsi.sense_len;
 	dest->io_hdr.status = src->hdr.status;
 }
 
 /*
  * One-time initialization of the CAM Target Layer, run on MOD_LOAD.
  *
  * Allocates the global softc, creates the "cam/ctl" device node and the
  * kern.cam.ctl sysctl tree, initializes the locks, queues and I/O pools,
  * starts the worker and LUN threads, and registers the ioctl front end.
  *
  * Returns 0 on success or an errno value on failure.  Failures that occur
  * before any CTL kernel thread has been started release everything that
  * was set up, in the same order ctl_shutdown() uses.
  */
 static int
 ctl_init(void)
 {
 	struct ctl_softc *softc;
 	struct ctl_io_pool *internal_pool, *emergency_pool, *other_pool;
 	struct ctl_io_pool *pool;
 	struct ctl_port *port;
 	uint8_t sc_id = 0;
 	int i, error;
 
 #ifdef CTL_IO_DELAY
 	/*
 	 * Sanity check the timer storage size up front, while a failure
 	 * still has nothing to undo.
 	 */
 	if (sizeof(struct callout) > CTL_TIMER_BYTES) {
 		printf("sizeof(struct callout) %zd > CTL_TIMER_BYTES %zd\n",
 		       sizeof(struct callout), CTL_TIMER_BYTES);
 		return (EINVAL);
 	}
 #endif /* CTL_IO_DELAY */
 
 	ctl_pause_rtr = 0;
 	rcv_sync_msg = 0;
 
 	control_softc = malloc(sizeof(*control_softc), M_DEVBUF,
 			       M_WAITOK | M_ZERO);
 	softc = control_softc;
 
 	softc->dev = make_dev(&ctl_cdevsw, 0, UID_ROOT, GID_OPERATOR, 0600,
 			      "cam/ctl");
 
 	softc->dev->si_drv1 = softc;
 
 	/*
 	 * By default, return a "bad LUN" peripheral qualifier for unknown
 	 * LUNs.  The user can override this default using the tunable or
 	 * sysctl.  See the comment in ctl_inquiry_std() for more details.
 	 */
 	softc->inquiry_pq_no_lun = 1;
 	TUNABLE_INT_FETCH("kern.cam.ctl.inquiry_pq_no_lun",
 			  &softc->inquiry_pq_no_lun);
 	sysctl_ctx_init(&softc->sysctl_ctx);
 	softc->sysctl_tree = SYSCTL_ADD_NODE(&softc->sysctl_ctx,
 		SYSCTL_STATIC_CHILDREN(_kern_cam), OID_AUTO, "ctl",
 		CTLFLAG_RD, 0, "CAM Target Layer");
 
 	if (softc->sysctl_tree == NULL) {
 		printf("%s: unable to allocate sysctl tree\n", __func__);
 		destroy_dev(softc->dev);
 		free(control_softc, M_DEVBUF);
 		control_softc = NULL;
 		return (ENOMEM);
 	}
 
 	SYSCTL_ADD_INT(&softc->sysctl_ctx,
 		       SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO,
 		       "inquiry_pq_no_lun", CTLFLAG_RW,
 		       &softc->inquiry_pq_no_lun, 0,
 		       "Report no lun possible for invalid LUNs");
 
 	mtx_init(&softc->ctl_lock, "CTL mutex", NULL, MTX_DEF);
 	mtx_init(&softc->pool_lock, "CTL pool mutex", NULL, MTX_DEF);
 	softc->open_count = 0;
 
 	/*
 	 * Default to actually sending a SYNCHRONIZE CACHE command down to
 	 * the drive.
 	 */
 	softc->flags = CTL_FLAG_REAL_SYNC;
 
 	/*
 	 * In Copan's HA scheme, the "master" and "slave" roles are
 	 * figured out through the slot the controller is in.  Although it
 	 * is an active/active system, someone has to be in charge.
 	 */
 #ifdef NEEDTOPORT
 	scmicro_rw(SCMICRO_GET_SHELF_ID, &sc_id);
 #endif
 
 	if (sc_id == 0) {
 		softc->flags |= CTL_FLAG_MASTER_SHELF;
 		persis_offset = 0;
 	} else
 		persis_offset = CTL_MAX_INITIATORS;
 
 	/*
 	 * XXX KDM need to figure out where we want to get our target ID
 	 * and WWID.  Is it different on each port?
 	 */
 	softc->target.id = 0;
 	softc->target.wwid[0] = 0x12345678;
 	softc->target.wwid[1] = 0x87654321;
 	STAILQ_INIT(&softc->lun_list);
 	STAILQ_INIT(&softc->pending_lun_queue);
 	STAILQ_INIT(&softc->fe_list);
 	STAILQ_INIT(&softc->port_list);
 	STAILQ_INIT(&softc->be_list);
 	STAILQ_INIT(&softc->io_pools);
 	ctl_tpc_init(softc);
 
 	if (ctl_pool_create(softc, CTL_POOL_INTERNAL, CTL_POOL_ENTRIES_INTERNAL,
 			    &internal_pool)!= 0){
 		printf("ctl: can't allocate %d entry internal pool, "
 		       "exiting\n", CTL_POOL_ENTRIES_INTERNAL);
 		error = ENOMEM;
 		goto fail;
 	}
 
 	if (ctl_pool_create(softc, CTL_POOL_EMERGENCY,
 			    CTL_POOL_ENTRIES_EMERGENCY, &emergency_pool) != 0) {
 		printf("ctl: can't allocate %d entry emergency pool, "
 		       "exiting\n", CTL_POOL_ENTRIES_EMERGENCY);
 		error = ENOMEM;
 		goto fail;
 	}
 
 	if (ctl_pool_create(softc, CTL_POOL_4OTHERSC, CTL_POOL_ENTRIES_OTHER_SC,
 	                    &other_pool) != 0)
 	{
 		printf("ctl: can't allocate %d entry other SC pool, "
 		       "exiting\n", CTL_POOL_ENTRIES_OTHER_SC);
 		error = ENOMEM;
 		goto fail;
 	}
 
 	softc->internal_pool = internal_pool;
 	softc->emergency_pool = emergency_pool;
 	softc->othersc_pool = other_pool;
 
 	/* Pick a default worker count, then clamp it to the supported range. */
 	if (worker_threads <= 0)
 		worker_threads = max(1, mp_ncpus / 4);
 	if (worker_threads > CTL_MAX_THREADS)
 		worker_threads = CTL_MAX_THREADS;
 
 	for (i = 0; i < worker_threads; i++) {
 		struct ctl_thread *thr = &softc->threads[i];
 
 		mtx_init(&thr->queue_lock, "CTL queue mutex", NULL, MTX_DEF);
 		thr->ctl_softc = softc;
 		STAILQ_INIT(&thr->incoming_queue);
 		STAILQ_INIT(&thr->rtr_queue);
 		STAILQ_INIT(&thr->done_queue);
 		STAILQ_INIT(&thr->isc_queue);
 
 		error = kproc_kthread_add(ctl_work_thread, thr,
 		    &softc->ctl_proc, &thr->thread, 0, 0, "ctl", "work%d", i);
 		if (error != 0) {
 			/*
 			 * NOTE(review): earlier worker threads may already
 			 * be running with a pointer to softc, so the softc
 			 * itself is deliberately not torn down here.
 			 */
 			printf("error creating CTL work thread!\n");
 			ctl_pool_free(internal_pool);
 			ctl_pool_free(emergency_pool);
 			ctl_pool_free(other_pool);
 			return (error);
 		}
 	}
 	error = kproc_kthread_add(ctl_lun_thread, softc,
 	    &softc->ctl_proc, NULL, 0, 0, "ctl", "lun");
 	if (error != 0) {
 		/* NOTE(review): worker threads are running; see above. */
 		printf("error creating CTL lun thread!\n");
 		ctl_pool_free(internal_pool);
 		ctl_pool_free(emergency_pool);
 		ctl_pool_free(other_pool);
 		return (error);
 	}
 	if (bootverbose)
 		printf("ctl: CAM Target Layer loaded\n");
 
 	/*
 	 * Initialize the ioctl front end.
 	 */
 	ctl_frontend_register(&ioctl_frontend);
 	port = &softc->ioctl_info.port;
 	port->frontend = &ioctl_frontend;
 	sprintf(softc->ioctl_info.port_name, "ioctl");
 	port->port_type = CTL_PORT_IOCTL;
 	port->num_requested_ctl_io = 100;
 	port->port_name = softc->ioctl_info.port_name;
 	port->port_online = ctl_ioctl_online;
 	port->port_offline = ctl_ioctl_offline;
 	port->onoff_arg = &softc->ioctl_info;
 	port->lun_enable = ctl_ioctl_lun_enable;
 	port->lun_disable = ctl_ioctl_lun_disable;
 	port->targ_lun_arg = &softc->ioctl_info;
 	port->fe_datamove = ctl_ioctl_datamove;
 	port->fe_done = ctl_ioctl_done;
 	port->max_targets = 15;
 	port->max_target_id = 15;
 
 	if (ctl_port_register(&softc->ioctl_info.port,
 	                  (softc->flags & CTL_FLAG_MASTER_SHELF)) != 0) {
 		printf("ctl: ioctl front end registration failed, will "
 		       "continue anyway\n");
 	}
 
 	return (0);
 
 fail:
 	/*
 	 * No CTL kernel thread has been created yet, so nothing else can be
 	 * holding a reference to softc.  Release everything in the same
 	 * order as ctl_shutdown().  Only pools that were successfully
 	 * created are on the io_pools list.
 	 */
 	while ((pool = STAILQ_FIRST(&softc->io_pools)) != NULL)
 		ctl_pool_free(pool);
 	ctl_tpc_shutdown(softc);
 	mtx_destroy(&softc->pool_lock);
 	mtx_destroy(&softc->ctl_lock);
 	destroy_dev(softc->dev);
 	sysctl_ctx_free(&softc->sysctl_ctx);
 	free(control_softc, M_DEVBUF);
 	control_softc = NULL;
 	return (error);
 }
 
 /*
  * Tear down the CAM Target Layer: deregister the ioctl port and front
  * end, free every LUN and I/O pool, destroy the locks, device node and
  * sysctl context, and release the global softc.
  */
 void
 ctl_shutdown(void)
 {
 	struct ctl_softc *softc;
 	struct ctl_lun *lun, *next_lun;
 	struct ctl_io_pool *pool;
 
 	softc = (struct ctl_softc *)control_softc;
 
 	if (ctl_port_deregister(&softc->ioctl_info.port) != 0)
 		printf("ctl: ioctl front end deregistration failed\n");
 
 	/*
 	 * Free up each LUN.  The successor is fetched before the current
 	 * LUN is released.
 	 */
 	mtx_lock(&softc->ctl_lock);
 	lun = STAILQ_FIRST(&softc->lun_list);
 	while (lun != NULL) {
 		next_lun = STAILQ_NEXT(lun, links);
 		ctl_free_lun(lun);
 		lun = next_lun;
 	}
 	mtx_unlock(&softc->ctl_lock);
 
 	ctl_frontend_deregister(&ioctl_frontend);
 
 	/*
 	 * This will rip the rug out from under any FETDs or anyone else
 	 * that has a pool allocated.  Since we increment our module
 	 * refcount any time someone outside the main CTL module allocates
 	 * a pool, we shouldn't have any problems here.  The user won't be
 	 * able to unload the CTL module until client modules have
 	 * successfully unloaded.
 	 */
 	for (;;) {
 		pool = STAILQ_FIRST(&softc->io_pools);
 		if (pool == NULL)
 			break;
 		ctl_pool_free(pool);
 	}
 
 #if 0
 	ctl_shutdown_thread(softc->work_thread);
 	mtx_destroy(&softc->queue_lock);
 #endif
 
 	ctl_tpc_shutdown(softc);
 	mtx_destroy(&softc->pool_lock);
 	mtx_destroy(&softc->ctl_lock);
 
 	destroy_dev(softc->dev);
 
 	sysctl_ctx_free(&softc->sysctl_ctx);
 
 	free(control_softc, M_DEVBUF);
 	control_softc = NULL;
 
 	if (bootverbose)
 		printf("ctl: CAM Target Layer unloaded\n");
 }
 
 /*
  * Module event handler.  Loading initializes CTL, unloading is refused
  * with EBUSY, and every other event is reported as unsupported.
  */
 static int
 ctl_module_event_handler(module_t mod, int what, void *arg)
 {
 
 	if (what == MOD_LOAD)
 		return (ctl_init());
 	if (what == MOD_UNLOAD)
 		return (EBUSY);
 	return (EOPNOTSUPP);
 }
 
 /*
  * XXX KDM should we do some access checks here?  Bump a reference count to
  * prevent a CTL module from being unloaded while someone has it open?
  */
 static int
 ctl_open(struct cdev *dev, int flags, int fmt, struct thread *td)
 {
 	return (0);
 }
 
 static int
 ctl_close(struct cdev *dev, int flags, int fmt, struct thread *td)
 {
 	return (0);
 }
 
 /*
  * Bring online every registered port whose type matches port_type.
  *
  * When not running in single mode, a CTL_MSG_SYNC_FE message is first sent
  * to the peer and, unless the peer's sync message has already been seen,
  * one message is received from it before the ports are brought online.
  *
  * Always returns 0.
  */
 int
 ctl_port_enable(ctl_port_type port_type)
 {
 	struct ctl_softc *softc;
 	struct ctl_port *port;
 
 	if (ctl_is_single == 0) {
 		union ctl_ha_msg msg_info;
 		int isc_retval;
 
 #if 0
 		printf("%s: HA mode, synchronizing frontend enable\n",
 		        __func__);
 #endif
 		/*
 		 * Only the message type is meaningful for a sync message,
 		 * but the whole union is transmitted.  Zero it first so that
 		 * uninitialized kernel stack contents are not sent to the
 		 * peer.
 		 */
 		memset(&msg_info, 0, sizeof(msg_info));
 		msg_info.hdr.msg_type = CTL_MSG_SYNC_FE;
 	        if ((isc_retval=ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info,
 		        sizeof(msg_info), 1 )) > CTL_HA_STATUS_SUCCESS) {
 			printf("Sync msg send error retval %d\n", isc_retval);
 		}
 		/* NOTE(review): the receive status is not examined. */
 		if (!rcv_sync_msg) {
 			isc_retval=ctl_ha_msg_recv(CTL_HA_CHAN_CTL, &msg_info,
 			        sizeof(msg_info), 1);
 		}
 #if 0
         	printf("CTL:Frontend Enable\n");
 	} else {
 		printf("%s: single mode, skipping frontend synchronization\n",
 		        __func__);
 #endif
 	}
 
 	softc = control_softc;
 
 	STAILQ_FOREACH(port, &softc->port_list, links) {
 		if (port_type & port->port_type)
 		{
 #if 0
 			printf("port %d\n", port->targ_port);
 #endif
 			ctl_port_online(port);
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Take offline every registered port whose type matches port_type.
  * Always returns 0.
  */
 int
 ctl_port_disable(ctl_port_type port_type)
 {
 	struct ctl_softc *softc = control_softc;
 	struct ctl_port *port;
 
 	STAILQ_FOREACH(port, &softc->port_list, links) {
 		/* Skip ports of a type the caller did not ask for. */
 		if ((port_type & port->port_type) == 0)
 			continue;
 		ctl_port_offline(port);
 	}
 
 	return (0);
 }
 
 /*
  * Fill the caller's array with a description of each registered port that
  * matches port_type, optionally skipping ports with a non-zero virtual
  * port number.
  *
  * Returns 0 for success, 1 for failure.
  * Currently the only failure mode is if there aren't enough entries
  * allocated.  So, in case of a failure, look at num_entries_dropped,
  * reallocate and try again.
  */
 int
 ctl_port_list(struct ctl_port_entry *entries, int num_entries_alloced,
 	      int *num_entries_filled, int *num_entries_dropped,
 	      ctl_port_type port_type, int no_virtual)
 {
 	struct ctl_softc *softc = control_softc;
 	struct ctl_port *port;
 	struct ctl_port_entry *slot;
 	int filled = 0;
 	int dropped = 0;
 
 	mtx_lock(&softc->ctl_lock);
 	STAILQ_FOREACH(port, &softc->port_list, links) {
 		/* Filter on the requested port type(s). */
 		if ((port->port_type & port_type) == 0)
 			continue;
 
 		/* Optionally leave out virtual ports. */
 		if (no_virtual != 0 && port->virtual_port != 0)
 			continue;
 
 		/* Out of room: only count what could not be reported. */
 		if (filled >= num_entries_alloced) {
 			dropped++;
 			continue;
 		}
 
 		slot = &entries[filled];
 		filled++;
 
 		slot->port_type = port->port_type;
 		strlcpy(slot->port_name, port->port_name,
 		    sizeof(slot->port_name));
 		slot->physical_port = port->physical_port;
 		slot->virtual_port = port->virtual_port;
 		slot->wwnn = port->wwnn;
 		slot->wwpn = port->wwpn;
 	}
 	mtx_unlock(&softc->ctl_lock);
 
 	*num_entries_dropped = dropped;
 	*num_entries_filled = filled;
 
 	return ((dropped > 0) ? 1 : 0);
 }
 
 /*
  * Port online callback for the ioctl front end: mark it enabled.
  * arg is the struct ctl_ioctl_info registered as the port's onoff_arg.
  */
 static void
 ctl_ioctl_online(void *arg)
 {
 	struct ctl_ioctl_info *info = (struct ctl_ioctl_info *)arg;
 
 	info->flags |= CTL_IOCTL_FLAG_ENABLED;
 }
 
 /*
  * Port offline callback for the ioctl front end: clear the enabled flag.
  * arg is the struct ctl_ioctl_info registered as the port's onoff_arg.
  */
 static void
 ctl_ioctl_offline(void *arg)
 {
 	struct ctl_ioctl_info *info = (struct ctl_ioctl_info *)arg;
 
 	info->flags &= ~CTL_IOCTL_FLAG_ENABLED;
 }
 
 /*
  * Remove an initiator by port number and initiator ID.
  *
  * Drops one use of the port's wwpn_iid[iid] slot and records the time of
  * the removal.  The slot's WWPN and name are left in place.
  *
  * Must be called without the CTL lock held.
  * Returns 0 for success, -1 if iid is not a valid slot index.
  */
 int
 ctl_remove_initiator(struct ctl_port *port, int iid)
 {
 	struct ctl_softc *softc = control_softc;
 
 	mtx_assert(&softc->ctl_lock, MA_NOTOWNED);
 
 	/*
 	 * Valid slots are 0 .. CTL_MAX_INIT_PER_PORT - 1, the same range
 	 * ctl_add_initiator() hands out.  Reject anything else before it is
 	 * used as an array index.
 	 */
 	if (iid < 0 || iid >= CTL_MAX_INIT_PER_PORT) {
 		printf("%s: initiator ID %d out of range (maximum %u)!\n",
 		       __func__, iid, CTL_MAX_INIT_PER_PORT - 1);
 		return (-1);
 	}
 
 	mtx_lock(&softc->ctl_lock);
 	port->wwpn_iid[iid].in_use--;
 	port->wwpn_iid[iid].last_use = time_uptime;
 	mtx_unlock(&softc->ctl_lock);
 
 	return (0);
 }
 
 /*
  * Add an initiator to the initiator map.
  *
  * port: port whose wwpn_iid table is updated.
  * iid:  requested slot, or a negative value to have one chosen.
  * wwpn: initiator WWPN, or 0 if unknown.
  * name: initiator name allocated from M_CTL, or NULL.  Ownership passes
  *       to this function: the name is stored in the slot on success and
  *       freed on failure.
  *
  * When iid is negative the slot is chosen in this order: a slot already
  * holding the same WWPN or name, then a completely empty slot, then the
  * least recently used slot that is not in use.
  *
  * Must be called without the CTL lock held.
  * Returns iid for success, < 0 for failure (-1: iid too large, -2: no
  * slot available).
  */
 int
 ctl_add_initiator(struct ctl_port *port, int iid, uint64_t wwpn, char *name)
 {
 	struct ctl_softc *softc = control_softc;
 	time_t best_time;
 	int i, best;
 
 	mtx_assert(&softc->ctl_lock, MA_NOTOWNED);
 
 	if (iid >= CTL_MAX_INIT_PER_PORT) {
 		printf("%s: WWPN %#jx initiator ID %u > maximum %u!\n",
 		       __func__, (uintmax_t)wwpn, iid, CTL_MAX_INIT_PER_PORT);
 		free(name, M_CTL);
 		return (-1);
 	}
 
 	mtx_lock(&softc->ctl_lock);
 
 	/* First choice: a slot that already belongs to this initiator. */
 	if (iid < 0 && (wwpn != 0 || name != NULL)) {
 		for (i = 0; i < CTL_MAX_INIT_PER_PORT; i++) {
 			if (wwpn != 0 && wwpn == port->wwpn_iid[i].wwpn) {
 				iid = i;
 				break;
 			}
 			if (name != NULL && port->wwpn_iid[i].name != NULL &&
 			    strcmp(name, port->wwpn_iid[i].name) == 0) {
 				iid = i;
 				break;
 			}
 		}
 	}
 
 	/* Second choice: a slot that has never been assigned. */
 	if (iid < 0) {
 		for (i = 0; i < CTL_MAX_INIT_PER_PORT; i++) {
 			if (port->wwpn_iid[i].in_use == 0 &&
 			    port->wwpn_iid[i].wwpn == 0 &&
 			    port->wwpn_iid[i].name == NULL) {
 				iid = i;
 				break;
 			}
 		}
 	}
 
 	/* Last choice: recycle the idle slot that was used longest ago. */
 	if (iid < 0) {
 		best = -1;
 		best_time = INT32_MAX;
 		for (i = 0; i < CTL_MAX_INIT_PER_PORT; i++) {
 			if (port->wwpn_iid[i].in_use == 0) {
 				if (port->wwpn_iid[i].last_use < best_time) {
 					best = i;
 					best_time = port->wwpn_iid[i].last_use;
 				}
 			}
 		}
 		iid = best;
 	}
 
 	if (iid < 0) {
 		mtx_unlock(&softc->ctl_lock);
 		free(name, M_CTL);
 		return (-2);
 	}
 
 	if (port->wwpn_iid[iid].in_use > 0 && (wwpn != 0 || name != NULL)) {
 		/*
 		 * This is not an error yet.
 		 */
 		if (wwpn != 0 && wwpn == port->wwpn_iid[iid].wwpn) {
 #if 0
 			printf("%s: port %d iid %u WWPN %#jx arrived"
 			    " again\n", __func__, port->targ_port,
 			    iid, (uintmax_t)wwpn);
 #endif
 			goto take;
 		}
 		if (name != NULL && port->wwpn_iid[iid].name != NULL &&
 		    strcmp(name, port->wwpn_iid[iid].name) == 0) {
 #if 0
 			printf("%s: port %d iid %u name '%s' arrived"
 			    " again\n", __func__, port->targ_port,
 			    iid, name);
 #endif
 			goto take;
 		}
 
 		/*
 		 * This is an error, but what do we do about it?  The
 		 * driver is telling us we have a new WWPN for this
 		 * initiator ID, so we pretty much need to use it.
 		 */
 		printf("%s: port %d iid %u WWPN %#jx '%s' arrived,"
 		    " but WWPN %#jx '%s' is still at that address\n",
 		    __func__, port->targ_port, iid, (uintmax_t)wwpn, name,
 		    (uintmax_t)port->wwpn_iid[iid].wwpn,
 		    port->wwpn_iid[iid].name);
 
 		/*
 		 * XXX KDM clear have_ca and ua_pending on each LUN for
 		 * this initiator.
 		 */
 	}
 take:
 	/* Replace whatever name the slot held with the caller's. */
 	free(port->wwpn_iid[iid].name, M_CTL);
 	port->wwpn_iid[iid].name = name;
 	port->wwpn_iid[iid].wwpn = wwpn;
 	port->wwpn_iid[iid].in_use++;
 	mtx_unlock(&softc->ctl_lock);
 
 	return (iid);
 }
 
 /*
  * Build a SCSI TransportID for initiator slot iid of the given port into
  * buf, in the format that matches the port type (FC, iSCSI, SAS, or SPI
  * for everything else).
  *
  * Returns the number of bytes written, or 0 when the slot has no WWPN
  * (FC, SAS) or no name (iSCSI) to report.
  */
 static int
 ctl_create_iid(struct ctl_port *port, int iid, uint8_t *buf)
 {
 	struct scsi_transportid_fcp *fcp;
 	struct scsi_transportid_iscsi_port *iscsi;
 	struct scsi_transportid_sas *sas;
 	struct scsi_transportid_spi *spi;
 	int len;
 
 	switch (port->port_type) {
 	case CTL_PORT_FC:
 		if (port->wwpn_iid[iid].wwpn == 0)
 			return (0);
 		fcp = (struct scsi_transportid_fcp *)buf;
 		memset(fcp, 0, sizeof(*fcp));
 		fcp->format_protocol = SCSI_PROTO_FC;
 		scsi_u64to8b(port->wwpn_iid[iid].wwpn, fcp->n_port_name);
 		return (sizeof(*fcp));
 
 	case CTL_PORT_ISCSI:
 		if (port->wwpn_iid[iid].name == NULL)
 			return (0);
 		iscsi = (struct scsi_transportid_iscsi_port *)buf;
 		memset(iscsi, 0, 256);
 		iscsi->format_protocol = SCSI_TRN_ISCSI_FORMAT_PORT |
 		    SCSI_PROTO_ISCSI;
 		/*
 		 * Name length including its terminator, capped at 252 and
 		 * rounded up to a multiple of four.
 		 */
 		len = strlcpy(iscsi->iscsi_name, port->wwpn_iid[iid].name,
 		    252) + 1;
 		len = roundup2(min(len, 252), 4);
 		scsi_ulto2b(len, iscsi->additional_length);
 		return (sizeof(*iscsi) + len);
 
 	case CTL_PORT_SAS:
 		if (port->wwpn_iid[iid].wwpn == 0)
 			return (0);
 		sas = (struct scsi_transportid_sas *)buf;
 		memset(sas, 0, sizeof(*sas));
 		sas->format_protocol = SCSI_PROTO_SAS;
 		scsi_u64to8b(port->wwpn_iid[iid].wwpn, sas->sas_address);
 		return (sizeof(*sas));
 
 	default:
 		/* Everything else is described by slot and target port number. */
 		spi = (struct scsi_transportid_spi *)buf;
 		memset(spi, 0, sizeof(*spi));
 		spi->format_protocol = SCSI_PROTO_SPI;
 		scsi_ulto2b(iid, spi->scsi_addr);
 		scsi_ulto2b(port->targ_port, spi->rel_trgt_port_id);
 		return (sizeof(*spi));
 	}
 }
 
 /*
  * LUN enable callback for the ioctl front end.  There is nothing to do,
  * so this always reports success.
  */
 static int
 ctl_ioctl_lun_enable(void *arg, struct ctl_id targ_id, int lun_id)
 {
 
 	/* All arguments are ignored. */
 	return (0);
 }
 
 /*
  * LUN disable callback for the ioctl front end.  There is nothing to do,
  * so this always reports success.
  */
 static int
 ctl_ioctl_lun_disable(void *arg, struct ctl_id targ_id, int lun_id)
 {
 
 	/* All arguments are ignored. */
 	return (0);
 }
 
 /*
  * Data movement routine for the CTL ioctl frontend port.
  *
  * Moves data between the external (user) buffer described by
  * ctsio->ext_data_ptr and the kernel buffer described by
  * ctsio->kern_data_ptr.  Either side may be a single buffer or a
  * scatter/gather list; both are walked in lock step.  copyout() is used
  * when the data direction flags equal CTL_FLAG_DATA_IN, copyin() otherwise.
  *
  * When the whole walk completes, the number of bytes moved is added to
  * ctsio->ext_data_filled.  If a copyin()/copyout() call fails,
  * ctl_set_internal_failure() is called on the I/O and ext_data_filled is
  * left as it was.
  *
  * Always returns CTL_RETVAL_COMPLETE; a failure is only visible through
  * whatever ctl_set_internal_failure() records on ctsio.
  */
 static int
 ctl_ioctl_do_datamove(struct ctl_scsiio *ctsio)
 {
 	struct ctl_sg_entry *ext_sglist, *kern_sglist;
 	struct ctl_sg_entry ext_entry, kern_entry;
 	int ext_sglen, ext_sg_entries, kern_sg_entries;
 	int ext_sg_start, ext_offset;
 	int len_to_copy, len_copied;
 	int kern_watermark, ext_watermark;
 	int ext_sglist_malloced;
 	int i, j;
 
 	ext_sglist_malloced = 0;
 	ext_sg_start = 0;
 	ext_offset = 0;
 
 	CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove\n"));
 
 	/*
 	 * If this flag is set, fake the data transfer.
 	 */
 	if (ctsio->io_hdr.flags & CTL_FLAG_NO_DATAMOVE) {
 		/* Report the external buffer as completely filled. */
 		ctsio->ext_data_filled = ctsio->ext_data_len;
 		goto bailout;
 	}
 
 	/*
 	 * To simplify things here, if we have a single buffer, stick it in
 	 * a S/G entry and just make it a single entry S/G list.
 	 */
 	if (ctsio->io_hdr.flags & CTL_FLAG_EDPTR_SGLIST) {
 		int len_seen;
 
 		/*
 		 * ext_data_ptr points at a user-space S/G list; copy it
 		 * into a temporary kernel buffer (freed at bailout).
 		 */
 		ext_sglen = ctsio->ext_sg_entries * sizeof(*ext_sglist);
 
 		ext_sglist = (struct ctl_sg_entry *)malloc(ext_sglen, M_CTL,
 							   M_WAITOK);
 		ext_sglist_malloced = 1;
 		if (copyin(ctsio->ext_data_ptr, ext_sglist,
 				   ext_sglen) != 0) {
 			ctl_set_internal_failure(ctsio,
 						 /*sks_valid*/ 0,
 						 /*retry_count*/ 0);
 			goto bailout;
 		}
 		ext_sg_entries = ctsio->ext_sg_entries;
 		/*
 		 * Locate the S/G entry, and the offset inside it, that
 		 * corresponds to ext_data_filled bytes into the list.  If
 		 * no entry qualifies, the start stays at entry 0, offset 0.
 		 */
 		len_seen = 0;
 		for (i = 0; i < ext_sg_entries; i++) {
 			if ((len_seen + ext_sglist[i].len) >=
 			     ctsio->ext_data_filled) {
 				ext_sg_start = i;
 				ext_offset = ctsio->ext_data_filled - len_seen;
 				break;
 			}
 			len_seen += ext_sglist[i].len;
 		}
 	} else {
 		ext_sglist = &ext_entry;
 		ext_sglist->addr = ctsio->ext_data_ptr;
 		ext_sglist->len = ctsio->ext_data_len;
 		ext_sg_entries = 1;
 		ext_sg_start = 0;
 		ext_offset = ctsio->ext_data_filled;
 	}
 
 	/*
 	 * Same treatment for the kernel side: use the S/G list when there
 	 * is one, otherwise wrap the single buffer in a one-entry list.
 	 */
 	if (ctsio->kern_sg_entries > 0) {
 		kern_sglist = (struct ctl_sg_entry *)ctsio->kern_data_ptr;
 		kern_sg_entries = ctsio->kern_sg_entries;
 	} else {
 		kern_sglist = &kern_entry;
 		kern_sglist->addr = ctsio->kern_data_ptr;
 		kern_sglist->len = ctsio->kern_data_len;
 		kern_sg_entries = 1;
 	}
 
 
 	/*
 	 * The watermarks are the number of bytes already consumed from the
 	 * current external (i) and kernel (j) S/G entries.
 	 */
 	kern_watermark = 0;
 	ext_watermark = ext_offset;
 	len_copied = 0;
 	for (i = ext_sg_start, j = 0;
 	     i < ext_sg_entries && j < kern_sg_entries;) {
 		uint8_t *ext_ptr, *kern_ptr;
 
 		/* Copy only what fits in the remainder of both entries. */
 		len_to_copy = ctl_min(ext_sglist[i].len - ext_watermark,
 				      kern_sglist[j].len - kern_watermark);
 
 		ext_ptr = (uint8_t *)ext_sglist[i].addr;
 		ext_ptr = ext_ptr + ext_watermark;
 		if (ctsio->io_hdr.flags & CTL_FLAG_BUS_ADDR) {
 			/*
 			 * XXX KDM fix this!
 			 */
 			panic("need to implement bus address support");
 #if 0
 			kern_ptr = bus_to_virt(kern_sglist[j].addr);
 #endif
 		} else
 			kern_ptr = (uint8_t *)kern_sglist[j].addr;
 		kern_ptr = kern_ptr + kern_watermark;
 
 		/*
 		 * The pointers above were computed from the old watermarks;
 		 * advance the watermarks now, before the copy itself.
 		 */
 		kern_watermark += len_to_copy;
 		ext_watermark += len_to_copy;
 
 		if ((ctsio->io_hdr.flags & CTL_FLAG_DATA_MASK) ==
 		     CTL_FLAG_DATA_IN) {
 			CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: copying %d "
 					 "bytes to user\n", len_to_copy));
 			CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: from %p "
 					 "to %p\n", kern_ptr, ext_ptr));
 			if (copyout(kern_ptr, ext_ptr, len_to_copy) != 0) {
 				ctl_set_internal_failure(ctsio,
 							 /*sks_valid*/ 0,
 							 /*retry_count*/ 0);
 				goto bailout;
 			}
 		} else {
 			CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: copying %d "
 					 "bytes from user\n", len_to_copy));
 			CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: from %p "
 					 "to %p\n", ext_ptr, kern_ptr));
 			if (copyin(ext_ptr, kern_ptr, len_to_copy)!= 0){
 				ctl_set_internal_failure(ctsio,
 							 /*sks_valid*/ 0,
 							 /*retry_count*/0);
 				goto bailout;
 			}
 		}
 
 		len_copied += len_to_copy;
 
 		/* Step to the next entry on whichever side is used up. */
 		if (ext_sglist[i].len == ext_watermark) {
 			i++;
 			ext_watermark = 0;
 		}
 
 		if (kern_sglist[j].len == kern_watermark) {
 			j++;
 			kern_watermark = 0;
 		}
 	}
 
 	/* Only reached when every copy in the loop above succeeded. */
 	ctsio->ext_data_filled += len_copied;
 
 	CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: ext_sg_entries: %d, "
 			 "kern_sg_entries: %d\n", ext_sg_entries,
 			 kern_sg_entries));
 	CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: ext_data_len = %d, "
 			 "kern_data_len = %d\n", ctsio->ext_data_len,
 			 ctsio->kern_data_len));
 
 
 	/* XXX KDM set residual?? */
 bailout:
 
 	/* Release the kernel copy of the external S/G list, if any. */
 	if (ext_sglist_malloced != 0)
 		free(ext_sglist, M_CTL);
 
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * Serialize a command that went down the "wrong" side, and so was sent to
  * this controller for execution.  The logic is a little different than the
  * standard case in ctl_scsiio_precheck().  Errors in this case need to get
  * sent back to the other side, but in the success case, we execute the
  * command on this side (XFER mode) or tell the other side to execute it
  * (SER_ONLY mode).
  */
 static int
 ctl_serialize_other_sc_cmd(struct ctl_scsiio *ctsio)
 {
 	struct ctl_softc *ctl_softc;
 	union ctl_ha_msg msg_info;
 	struct ctl_lun *lun;
 	int retval = 0;
 	uint32_t targ_lun;
 
 	ctl_softc = control_softc;
 
 	targ_lun = ctsio->io_hdr.nexus.targ_mapped_lun;
 	lun = ctl_softc->ctl_luns[targ_lun];
 	if (lun==NULL)
 	{
 		/*
 		 * Why isn't LUN defined? The other side wouldn't
 		 * send a cmd if the LUN is undefined.
 		 */
 		printf("%s: Bad JUJU!, LUN is NULL!\n", __func__);
 
 		/* "Logical unit not supported" */
 		ctl_set_sense_data(&msg_info.scsi.sense_data,
 				   lun,
 				   /*sense_format*/SSD_TYPE_NONE,
 				   /*current_error*/ 1,
 				   /*sense_key*/ SSD_KEY_ILLEGAL_REQUEST,
 				   /*asc*/ 0x25,
 				   /*ascq*/ 0x00,
 				   SSD_ELEM_NONE);
 
 		msg_info.scsi.sense_len = SSD_FULL_SIZE;
 		msg_info.scsi.scsi_status = SCSI_STATUS_CHECK_COND;
 		msg_info.hdr.status = CTL_SCSI_ERROR | CTL_AUTOSENSE;
 		msg_info.hdr.original_sc = ctsio->io_hdr.original_sc;
 		msg_info.hdr.serializing_sc = NULL;
 		msg_info.hdr.msg_type = CTL_MSG_BAD_JUJU;
 	        if (ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info,
 				sizeof(msg_info), 0 ) > CTL_HA_STATUS_SUCCESS) {
 		}
 		return(1);
 
 	}
 
 	mtx_lock(&lun->lun_lock);
     	TAILQ_INSERT_TAIL(&lun->ooa_queue, &ctsio->io_hdr, ooa_links);
 
 	switch (ctl_check_ooa(lun, (union ctl_io *)ctsio,
 		(union ctl_io *)TAILQ_PREV(&ctsio->io_hdr, ctl_ooaq,
 		 ooa_links))) {
 	case CTL_ACTION_BLOCK:
 		ctsio->io_hdr.flags |= CTL_FLAG_BLOCKED;
 		TAILQ_INSERT_TAIL(&lun->blocked_queue, &ctsio->io_hdr,
 				  blocked_links);
 		break;
 	case CTL_ACTION_PASS:
 	case CTL_ACTION_SKIP:
 		if (ctl_softc->ha_mode == CTL_HA_MODE_XFER) {
 			ctsio->io_hdr.flags |= CTL_FLAG_IS_WAS_ON_RTR;
 			ctl_enqueue_rtr((union ctl_io *)ctsio);
 		} else {
 
 			/* send msg back to other side */
 			msg_info.hdr.original_sc = ctsio->io_hdr.original_sc;
 			msg_info.hdr.serializing_sc = (union ctl_io *)ctsio;
 			msg_info.hdr.msg_type = CTL_MSG_R2R;
 #if 0
 			printf("2. pOrig %x\n", (int)msg_info.hdr.original_sc);
 #endif
 		        if (ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info,
 			    sizeof(msg_info), 0 ) > CTL_HA_STATUS_SUCCESS) {
 			}
 		}
 		break;
 	case CTL_ACTION_OVERLAP:
 		/* OVERLAPPED COMMANDS ATTEMPTED */
 		ctl_set_sense_data(&msg_info.scsi.sense_data,
 				   lun,
 				   /*sense_format*/SSD_TYPE_NONE,
 				   /*current_error*/ 1,
 				   /*sense_key*/ SSD_KEY_ILLEGAL_REQUEST,
 				   /*asc*/ 0x4E,
 				   /*ascq*/ 0x00,
 				   SSD_ELEM_NONE);
 
 		msg_info.scsi.sense_len = SSD_FULL_SIZE;
 		msg_info.scsi.scsi_status = SCSI_STATUS_CHECK_COND;
 		msg_info.hdr.status = CTL_SCSI_ERROR | CTL_AUTOSENSE;
 		msg_info.hdr.original_sc = ctsio->io_hdr.original_sc;
 		msg_info.hdr.serializing_sc = NULL;
 		msg_info.hdr.msg_type = CTL_MSG_BAD_JUJU;
 #if 0
 		printf("BAD JUJU:Major Bummer Overlap\n");
 #endif
 		TAILQ_REMOVE(&lun->ooa_queue, &ctsio->io_hdr, ooa_links);
 		retval = 1;
 		if (ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info,
 		    sizeof(msg_info), 0 ) > CTL_HA_STATUS_SUCCESS) {
 		}
 		break;
 	case CTL_ACTION_OVERLAP_TAG:
 		/* TAGGED OVERLAPPED COMMANDS (NN = QUEUE TAG) */
 		ctl_set_sense_data(&msg_info.scsi.sense_data,
 				   lun,
 				   /*sense_format*/SSD_TYPE_NONE,
 				   /*current_error*/ 1,
 				   /*sense_key*/ SSD_KEY_ILLEGAL_REQUEST,
 				   /*asc*/ 0x4D,
 				   /*ascq*/ ctsio->tag_num & 0xff,
 				   SSD_ELEM_NONE);
 
 		msg_info.scsi.sense_len = SSD_FULL_SIZE;
 		msg_info.scsi.scsi_status = SCSI_STATUS_CHECK_COND;
 		msg_info.hdr.status = CTL_SCSI_ERROR | CTL_AUTOSENSE;
 		msg_info.hdr.original_sc = ctsio->io_hdr.original_sc;
 		msg_info.hdr.serializing_sc = NULL;
 		msg_info.hdr.msg_type = CTL_MSG_BAD_JUJU;
 #if 0
 		printf("BAD JUJU:Major Bummer Overlap Tag\n");
 #endif
 		TAILQ_REMOVE(&lun->ooa_queue, &ctsio->io_hdr, ooa_links);
 		retval = 1;
 		if (ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info,
 		    sizeof(msg_info), 0 ) > CTL_HA_STATUS_SUCCESS) {
 		}
 		break;
 	case CTL_ACTION_ERROR:
 	default:
 		/* "Internal target failure" */
 		ctl_set_sense_data(&msg_info.scsi.sense_data,
 				   lun,
 				   /*sense_format*/SSD_TYPE_NONE,
 				   /*current_error*/ 1,
 				   /*sense_key*/ SSD_KEY_HARDWARE_ERROR,
 				   /*asc*/ 0x44,
 				   /*ascq*/ 0x00,
 				   SSD_ELEM_NONE);
 
 		msg_info.scsi.sense_len = SSD_FULL_SIZE;
 		msg_info.scsi.scsi_status = SCSI_STATUS_CHECK_COND;
 		msg_info.hdr.status = CTL_SCSI_ERROR | CTL_AUTOSENSE;
 		msg_info.hdr.original_sc = ctsio->io_hdr.original_sc;
 		msg_info.hdr.serializing_sc = NULL;
 		msg_info.hdr.msg_type = CTL_MSG_BAD_JUJU;
 #if 0
 		printf("BAD JUJU:Major Bummer HW Error\n");
 #endif
 		TAILQ_REMOVE(&lun->ooa_queue, &ctsio->io_hdr, ooa_links);
 		retval = 1;
 		if (ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info,
 		    sizeof(msg_info), 0 ) > CTL_HA_STATUS_SUCCESS) {
 		}
 		break;
 	}
 	mtx_unlock(&lun->lun_lock);
 	return (retval);
 }
 
 /*
  * Submit an I/O on behalf of the ioctl frontend and sleep until it is done.
  *
  * A ctl_fe_ioctl_params structure on this thread's stack is attached to the
  * I/O through ctl_private[CTL_PRIV_FRONTEND].  ctl_ioctl_datamove() and
  * ctl_ioctl_done() change its state and broadcast on its condition
  * variable; this routine reacts to each state change, performing the data
  * movement itself when asked to, until the state becomes CTL_IOCTL_DONE.
  *
  * Returns the value from ctl_queue() if queueing fails, otherwise
  * CTL_RETVAL_COMPLETE.
  */
 static int
 ctl_ioctl_submit_wait(union ctl_io *io)
 {
 	struct ctl_fe_ioctl_params params;
 	ctl_fe_ioctl_state last_state;
 	int done, retval;
 
 	retval = 0;
 
 	bzero(&params, sizeof(params));
 
 	mtx_init(&params.ioctl_mtx, "ctliocmtx", NULL, MTX_DEF);
 	cv_init(&params.sem, "ctlioccv");
 	params.state = CTL_IOCTL_INPROG;
 	last_state = params.state;
 
 	io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr = &params;
 
 	CTL_DEBUG_PRINT(("ctl_ioctl_submit_wait\n"));
 
 	/* This shouldn't happen */
 	if ((retval = ctl_queue(io)) != CTL_RETVAL_COMPLETE) {
 		/*
 		 * Don't leave an initialized mutex and condition variable
 		 * behind on the stack.  NOTE(review): this assumes a failed
 		 * ctl_queue() has not handed the I/O to anyone who could
 		 * still signal params -- confirm against ctl_queue().
 		 */
 		mtx_destroy(&params.ioctl_mtx);
 		cv_destroy(&params.sem);
 		return (retval);
 	}
 
 	done = 0;
 
 	do {
 		mtx_lock(&params.ioctl_mtx);
 		/*
 		 * Check the state here, and don't sleep if the state has
 		 * already changed (i.e. wakeup has already occurred, but we
 		 * weren't waiting yet).
 		 */
 		if (params.state == last_state) {
 			/* XXX KDM cv_wait_sig instead? */
 			cv_wait(&params.sem, &params.ioctl_mtx);
 		}
 		last_state = params.state;
 
 		switch (params.state) {
 		case CTL_IOCTL_INPROG:
 			/* Why did we wake up? */
 			/* XXX KDM error here? */
 			mtx_unlock(&params.ioctl_mtx);
 			break;
 		case CTL_IOCTL_DATAMOVE:
 			CTL_DEBUG_PRINT(("got CTL_IOCTL_DATAMOVE\n"));
 
 			/*
 			 * change last_state back to INPROG to avoid
 			 * deadlock on subsequent data moves.
 			 */
 			params.state = last_state = CTL_IOCTL_INPROG;
 
 			/* Drop the mutex before copying to/from user space. */
 			mtx_unlock(&params.ioctl_mtx);
 			ctl_ioctl_do_datamove(&io->scsiio);
 			/*
 			 * Note that in some cases, most notably writes,
 			 * this will queue the I/O and call us back later.
 			 * In other cases, generally reads, this routine
 			 * will immediately call back and wake us up,
 			 * probably using our own context.
 			 */
 			io->scsiio.be_move_done(io);
 			break;
 		case CTL_IOCTL_DONE:
 			mtx_unlock(&params.ioctl_mtx);
 			CTL_DEBUG_PRINT(("got CTL_IOCTL_DONE\n"));
 			done = 1;
 			break;
 		default:
 			mtx_unlock(&params.ioctl_mtx);
 			/* XXX KDM error here? */
 			break;
 		}
 	} while (done == 0);
 
 	mtx_destroy(&params.ioctl_mtx);
 	cv_destroy(&params.sem);
 
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * Record CTL_IOCTL_DATAMOVE in the ctl_fe_ioctl_params attached to the
  * I/O's frontend private pointer and wake every thread sleeping on its
  * condition variable (ctl_ioctl_submit_wait() sleeps there).
  */
 static void
 ctl_ioctl_datamove(union ctl_io *io)
 {
 	struct ctl_fe_ioctl_params *waiter = (struct ctl_fe_ioctl_params *)
 	    io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr;
 
 	/* The state change and the wakeup happen under the waiter's mutex. */
 	mtx_lock(&waiter->ioctl_mtx);
 	waiter->state = CTL_IOCTL_DATAMOVE;
 	cv_broadcast(&waiter->sem);
 	mtx_unlock(&waiter->ioctl_mtx);
 }
 
 /*
  * Record CTL_IOCTL_DONE in the ctl_fe_ioctl_params attached to the I/O's
  * frontend private pointer and wake every thread sleeping on its condition
  * variable (ctl_ioctl_submit_wait() sleeps there).
  */
 static void
 ctl_ioctl_done(union ctl_io *io)
 {
 	struct ctl_fe_ioctl_params *waiter = (struct ctl_fe_ioctl_params *)
 	    io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr;
 
 	/* The state change and the wakeup happen under the waiter's mutex. */
 	mtx_lock(&waiter->ioctl_mtx);
 	waiter->state = CTL_IOCTL_DONE;
 	cv_broadcast(&waiter->sem);
 	mtx_unlock(&waiter->ioctl_mtx);
 }
 
 /*
  * Metatask callback for the hard start/stop ioctls.  arg is a
  * struct ctl_fe_ioctl_startstop_info: the metatask's status and its
  * start/stop LUN counters are copied into hs_info, after which the
  * structure's condition variable is broadcast.
  */
 static void
 ctl_ioctl_hard_startstop_callback(void *arg, struct cfi_metatask *metatask)
 {
 	struct ctl_fe_ioctl_startstop_info *ss_info =
 	    (struct ctl_fe_ioctl_startstop_info *)arg;
 
 	/* Snapshot the results before waking anyone up. */
 	ss_info->hs_info.luns_failed = metatask->taskinfo.startstop.luns_failed;
 	ss_info->hs_info.luns_complete =
 	    metatask->taskinfo.startstop.luns_complete;
 	ss_info->hs_info.total_luns = metatask->taskinfo.startstop.total_luns;
 	ss_info->hs_info.status = metatask->status;
 
 	cv_broadcast(&ss_info->sem);
 }
 
 /*
  * Metatask callback for the BBR read ioctl.  arg is a
  * struct ctl_fe_ioctl_bbrread_info: under its lock, the metatask status and
  * BBR read status are stored in the caller's bbr_info and wakeup_done is
  * set; the condition variable is then broadcast with the lock released.
  */
 static void
 ctl_ioctl_bbrread_callback(void *arg, struct cfi_metatask *metatask)
 {
 	struct ctl_fe_ioctl_bbrread_info *waiter =
 	    (struct ctl_fe_ioctl_bbrread_info *)arg;
 
 	mtx_lock(waiter->lock);
 	waiter->bbr_info->bbr_status = metatask->taskinfo.bbrread.status;
 	waiter->bbr_info->status = metatask->status;
 	waiter->wakeup_done = 1;
 	mtx_unlock(waiter->lock);
 
 	cv_broadcast(&waiter->sem);
 }
 
 /*
  * Append one entry per command on lun's OOA queue to kern_entries, starting
  * at index *cur_fill_num, with the LUN lock held for the whole walk.
  *
  * *cur_fill_num is advanced for every queued command, including those that
  * no longer fit once ooa_hdr->alloc_num entries are in use, so on return it
  * counts the commands seen rather than the entries written.  Flag bits are
  * OR-ed into entry->cmd_flags; the field is not cleared here.
  *
  * The visible code always returns 0.
  */
 static int
 ctl_ioctl_fill_ooa(struct ctl_lun *lun, uint32_t *cur_fill_num,
 		   struct ctl_ooa *ooa_hdr, struct ctl_ooa_entry *kern_entries)
 {
 	struct ctl_ooa_entry *slot;
 	union ctl_io *cmd;
 
 	mtx_lock(&lun->lun_lock);
 	cmd = (union ctl_io *)TAILQ_FIRST(&lun->ooa_queue);
 	while (cmd != NULL) {
 		/* Past the caller's allocation we only keep counting. */
 		if (*cur_fill_num < ooa_hdr->alloc_num) {
 			slot = &kern_entries[*cur_fill_num];
 
 			slot->tag_num = cmd->scsiio.tag_num;
 			slot->lun_num = lun->lun;
 #ifdef CTL_TIME_IO
 			slot->start_bt = cmd->io_hdr.start_bt;
 #endif
 			bcopy(cmd->scsiio.cdb, slot->cdb,
 			    cmd->scsiio.cdb_len);
 			slot->cdb_len = cmd->scsiio.cdb_len;
 
 			/* Translate I/O header flags into OOA command flags. */
 			if (cmd->io_hdr.flags & CTL_FLAG_BLOCKED)
 				slot->cmd_flags |= CTL_OOACMD_FLAG_BLOCKED;
 			if (cmd->io_hdr.flags & CTL_FLAG_DMA_INPROG)
 				slot->cmd_flags |= CTL_OOACMD_FLAG_DMA;
 			if (cmd->io_hdr.flags & CTL_FLAG_ABORT)
 				slot->cmd_flags |= CTL_OOACMD_FLAG_ABORT;
 			if (cmd->io_hdr.flags & CTL_FLAG_IS_WAS_ON_RTR)
 				slot->cmd_flags |= CTL_OOACMD_FLAG_RTR;
 			if (cmd->io_hdr.flags & CTL_FLAG_DMA_QUEUED)
 				slot->cmd_flags |= CTL_OOACMD_FLAG_DMA_QUEUED;
 		}
 
 		(*cur_fill_num)++;
 		cmd = (union ctl_io *)TAILQ_NEXT(&cmd->io_hdr, ooa_links);
 	}
 	mtx_unlock(&lun->lun_lock);
 
 	return (0);
 }
 
 /*
  * Allocate a zeroed kernel buffer of len bytes and fill it from the user
  * address user_addr.
  *
  * Returns the buffer (to be released with free(..., M_CTL)) on success.
  * Returns NULL, with a message formatted into error_str (at most
  * error_str_len bytes), if len is negative or the copyin() fails.
  */
 static void *
 ctl_copyin_alloc(void *user_addr, int len, char *error_str,
 		 size_t error_str_len)
 {
 	void *kptr;
 
 	/*
 	 * Callers pass lengths taken from user-supplied structures.  A
 	 * negative value would be converted to an enormous allocation
 	 * size, so refuse it before it reaches malloc().
 	 */
 	if (len < 0) {
 		snprintf(error_str, error_str_len, "Invalid length %d for "
 			 "copy from user address %p", len, user_addr);
 		return (NULL);
 	}
 
 	kptr = malloc(len, M_CTL, M_WAITOK | M_ZERO);
 
 	if (copyin(user_addr, kptr, len) != 0) {
 		snprintf(error_str, error_str_len, "Error copying %d bytes "
 			 "from user address %p to kernel address %p", len,
 			 user_addr, kptr);
 		free(kptr, M_CTL);
 		return (NULL);
 	}
 
 	return (kptr);
 }
 
 /*
  * Release an argument array: the kname and kvalue buffers of each of the
  * first num_args elements, then the array itself.  A NULL args is a no-op.
  */
 static void
 ctl_free_args(int num_args, struct ctl_be_arg *args)
 {
 	int idx;
 
 	if (args != NULL) {
 		idx = 0;
 		while (idx < num_args) {
 			free(args[idx].kname, M_CTL);
 			free(args[idx].kvalue, M_CTL);
 			idx++;
 		}
 
 		free(args, M_CTL);
 	}
 }
 
 /*
  * Copy an array of num_args backend arguments in from user space.
  *
  * The array itself is copied from uargs, then for every element:
  *  - the name is copied in to kname and must be NUL-terminated;
  *  - if CTL_BEARG_RD is set, the value is copied in to kvalue, and must be
  *    NUL-terminated when CTL_BEARG_ASCII is also set;
  *  - otherwise a zeroed kvalue buffer of vallen bytes is allocated.
  *
  * Returns the kernel copy, to be released with ctl_free_args(), or NULL
  * with a message in error_str if anything is invalid or cannot be copied.
  * Everything allocated up to the point of failure is freed.
  */
 static struct ctl_be_arg *
 ctl_copyin_args(int num_args, struct ctl_be_arg *uargs,
 		char *error_str, size_t error_str_len)
 {
 	struct ctl_be_arg *args;
 	int i;
 
 	if (num_args < 0) {
 		snprintf(error_str, error_str_len, "Invalid number of "
 			 "arguments %d", num_args);
 		return (NULL);
 	}
 
 	args = ctl_copyin_alloc(uargs, num_args * sizeof(*args),
 				error_str, error_str_len);
 
 	if (args == NULL)
 		goto bailout;
 
 	/*
 	 * The kname/kvalue fields just copied in are user-controlled
 	 * garbage; clear them so the bailout path only frees what we
 	 * allocate below.
 	 */
 	for (i = 0; i < num_args; i++) {
 		args[i].kname = NULL;
 		args[i].kvalue = NULL;
 	}
 
 	for (i = 0; i < num_args; i++) {
 		uint8_t *tmpptr;
 
 		/*
 		 * The lengths come straight from user space.  A name needs
 		 * at least its terminating NUL, and kname[namelen - 1] below
 		 * would read out of bounds for a zero length.
 		 * NOTE(review): the "< 0" tests assume namelen/vallen are
 		 * signed, as ctl_copyin_alloc()'s int parameter suggests.
 		 */
 		if (args[i].namelen <= 0) {
 			snprintf(error_str, error_str_len, "Argument %d "
 				 "name length is invalid", i);
 			goto bailout;
 		}
 		if (args[i].vallen < 0) {
 			snprintf(error_str, error_str_len, "Argument %d "
 				 "value length is invalid", i);
 			goto bailout;
 		}
 
 		args[i].kname = ctl_copyin_alloc(args[i].name,
 			args[i].namelen, error_str, error_str_len);
 		if (args[i].kname == NULL)
 			goto bailout;
 
 		if (args[i].kname[args[i].namelen - 1] != '\0') {
 			snprintf(error_str, error_str_len, "Argument %d "
 				 "name is not NUL-terminated", i);
 			goto bailout;
 		}
 
 		if (args[i].flags & CTL_BEARG_RD) {
 			/* An ASCII value needs room for its NUL as well. */
 			if ((args[i].flags & CTL_BEARG_ASCII)
 			 && (args[i].vallen == 0)) {
 				snprintf(error_str, error_str_len, "Argument "
 				    "%d value length is invalid", i);
 				goto bailout;
 			}
 			tmpptr = ctl_copyin_alloc(args[i].value,
 				args[i].vallen, error_str, error_str_len);
 			if (tmpptr == NULL)
 				goto bailout;
 			/*
 			 * Hand the buffer to args[] before validating it, so
 			 * that ctl_free_args() releases it if the check
 			 * below fails.
 			 */
 			args[i].kvalue = tmpptr;
 			if ((args[i].flags & CTL_BEARG_ASCII)
 			 && (tmpptr[args[i].vallen - 1] != '\0')) {
 				snprintf(error_str, error_str_len, "Argument "
 				    "%d value is not NUL-terminated", i);
 				goto bailout;
 			}
 		} else {
 			args[i].kvalue = malloc(args[i].vallen,
 			    M_CTL, M_WAITOK | M_ZERO);
 		}
 	}
 
 	return (args);
 bailout:
 
 	ctl_free_args(num_args, args);
 
 	return (NULL);
 }
 
 /*
  * For each of the num_args arguments flagged CTL_BEARG_WR, copy vallen
  * bytes of the kernel value buffer out to the argument's user value
  * pointer.  The result of copyout() is not checked.
  */
 static void
 ctl_copyout_args(int num_args, struct ctl_be_arg *args)
 {
 	struct ctl_be_arg *arg;
 	int idx;
 
 	for (idx = 0; idx < num_args; idx++) {
 		arg = &args[idx];
 		if ((arg->flags & CTL_BEARG_WR) == 0)
 			continue;
 		copyout(arg->kvalue, arg->value, arg->vallen);
 	}
 }
 
 /*
  * Append str to sb, replacing '&', '>' and '<' with the XML entities
  * "&amp;", "&gt;" and "&lt;"; every other character is appended as is.
  *
  * Returns 0 if the whole string was appended.  Otherwise stops at the
  * first character for which sbuf_printf()/sbuf_putc() returned non-zero
  * and returns that value.
  */
 int
 ctl_sbuf_printf_esc(struct sbuf *sb, char *str)
 {
 	char *cur;
 	int error;
 
 	error = 0;
 
 	for (cur = str; error == 0 && *cur != '\0'; cur++) {
 		if (*cur == '&')
 			error = sbuf_printf(sb, "&amp;");
 		else if (*cur == '>')
 			error = sbuf_printf(sb, "&gt;");
 		else if (*cur == '<')
 			error = sbuf_printf(sb, "&lt;");
 		else
 			error = sbuf_putc(sb, *cur);
 	}
 
 	return (error);
 }
 
 static int
 ctl_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag,
 	  struct thread *td)
 {
 	struct ctl_softc *softc;
 	int retval;
 
 	softc = control_softc;
 
 	retval = 0;
 
 	switch (cmd) {
 	case CTL_IO: {
 		union ctl_io *io;
 		void *pool_tmp;
 
 		/*
 		 * If we haven't been "enabled", don't allow any SCSI I/O
 		 * to this FETD.
 		 */
 		if ((softc->ioctl_info.flags & CTL_IOCTL_FLAG_ENABLED) == 0) {
 			retval = EPERM;
 			break;
 		}
 
 		io = ctl_alloc_io(softc->ioctl_info.port.ctl_pool_ref);
 		if (io == NULL) {
 			printf("ctl_ioctl: can't allocate ctl_io!\n");
 			retval = ENOSPC;
 			break;
 		}
 
 		/*
 		 * Need to save the pool reference so it doesn't get
 		 * spammed by the user's ctl_io.
 		 */
 		pool_tmp = io->io_hdr.pool;
 
 		memcpy(io, (void *)addr, sizeof(*io));
 
 		io->io_hdr.pool = pool_tmp;
 		/*
 		 * No status yet, so make sure the status is set properly.
 		 */
 		io->io_hdr.status = CTL_STATUS_NONE;
 
 		/*
 		 * The user sets the initiator ID, target and LUN IDs.
 		 */
 		io->io_hdr.nexus.targ_port = softc->ioctl_info.port.targ_port;
 		io->io_hdr.flags |= CTL_FLAG_USER_REQ;
 		if ((io->io_hdr.io_type == CTL_IO_SCSI)
 		 && (io->scsiio.tag_type != CTL_TAG_UNTAGGED))
 			io->scsiio.tag_num = softc->ioctl_info.cur_tag_num++;
 
 		retval = ctl_ioctl_submit_wait(io);
 
 		if (retval != 0) {
 			ctl_free_io(io);
 			break;
 		}
 
 		memcpy((void *)addr, io, sizeof(*io));
 
 		/* return this to our pool */
 		ctl_free_io(io);
 
 		break;
 	}
 	case CTL_ENABLE_PORT:
 	case CTL_DISABLE_PORT:
 	case CTL_SET_PORT_WWNS: {
 		struct ctl_port *port;
 		struct ctl_port_entry *entry;
 
 		entry = (struct ctl_port_entry *)addr;
 		
 		mtx_lock(&softc->ctl_lock);
 		STAILQ_FOREACH(port, &softc->port_list, links) {
 			int action, done;
 
 			action = 0;
 			done = 0;
 
 			if ((entry->port_type == CTL_PORT_NONE)
 			 && (entry->targ_port == port->targ_port)) {
 				/*
 				 * If the user only wants to enable or
 				 * disable or set WWNs on a specific port,
 				 * do the operation and we're done.
 				 */
 				action = 1;
 				done = 1;
 			} else if (entry->port_type & port->port_type) {
 				/*
 				 * Compare the user's type mask with the
 				 * particular frontend type to see if we
 				 * have a match.
 				 */
 				action = 1;
 				done = 0;
 
 				/*
 				 * Make sure the user isn't trying to set
 				 * WWNs on multiple ports at the same time.
 				 */
 				if (cmd == CTL_SET_PORT_WWNS) {
 					printf("%s: Can't set WWNs on "
 					       "multiple ports\n", __func__);
 					retval = EINVAL;
 					break;
 				}
 			}
 			if (action != 0) {
 				/*
 				 * XXX KDM we have to drop the lock here,
 				 * because the online/offline operations
 				 * can potentially block.  We need to
 				 * reference count the frontends so they
 				 * can't go away,
 				 */
 				mtx_unlock(&softc->ctl_lock);
 
 				if (cmd == CTL_ENABLE_PORT) {
 					struct ctl_lun *lun;
 
 					STAILQ_FOREACH(lun, &softc->lun_list,
 						       links) {
 						port->lun_enable(port->targ_lun_arg,
 						    lun->target,
 						    lun->lun);
 					}
 
 					ctl_port_online(port);
 				} else if (cmd == CTL_DISABLE_PORT) {
 					struct ctl_lun *lun;
 
 					ctl_port_offline(port);
 
 					STAILQ_FOREACH(lun, &softc->lun_list,
 						       links) {
 						port->lun_disable(
 						    port->targ_lun_arg,
 						    lun->target,
 						    lun->lun);
 					}
 				}
 
 				mtx_lock(&softc->ctl_lock);
 
 				if (cmd == CTL_SET_PORT_WWNS)
 					ctl_port_set_wwns(port,
 					    (entry->flags & CTL_PORT_WWNN_VALID) ?
 					    1 : 0, entry->wwnn,
 					    (entry->flags & CTL_PORT_WWPN_VALID) ?
 					    1 : 0, entry->wwpn);
 			}
 			if (done != 0)
 				break;
 		}
 		mtx_unlock(&softc->ctl_lock);
 		break;
 	}
 	case CTL_GET_PORT_LIST: {
 		struct ctl_port *port;
 		struct ctl_port_list *list;
 		int i;
 
 		list = (struct ctl_port_list *)addr;
 
 		if (list->alloc_len != (list->alloc_num *
 		    sizeof(struct ctl_port_entry))) {
 			printf("%s: CTL_GET_PORT_LIST: alloc_len %u != "
 			       "alloc_num %u * sizeof(struct ctl_port_entry) "
 			       "%zu\n", __func__, list->alloc_len,
 			       list->alloc_num, sizeof(struct ctl_port_entry));
 			retval = EINVAL;
 			break;
 		}
 		list->fill_len = 0;
 		list->fill_num = 0;
 		list->dropped_num = 0;
 		i = 0;
 		mtx_lock(&softc->ctl_lock);
 		STAILQ_FOREACH(port, &softc->port_list, links) {
 			struct ctl_port_entry entry, *list_entry;
 
 			if (list->fill_num >= list->alloc_num) {
 				list->dropped_num++;
 				continue;
 			}
 
 			entry.port_type = port->port_type;
 			strlcpy(entry.port_name, port->port_name,
 				sizeof(entry.port_name));
 			entry.targ_port = port->targ_port;
 			entry.physical_port = port->physical_port;
 			entry.virtual_port = port->virtual_port;
 			entry.wwnn = port->wwnn;
 			entry.wwpn = port->wwpn;
 			if (port->status & CTL_PORT_STATUS_ONLINE)
 				entry.online = 1;
 			else
 				entry.online = 0;
 
 			list_entry = &list->entries[i];
 
 			retval = copyout(&entry, list_entry, sizeof(entry));
 			if (retval != 0) {
 				printf("%s: CTL_GET_PORT_LIST: copyout "
 				       "returned %d\n", __func__, retval);
 				break;
 			}
 			i++;
 			list->fill_num++;
 			list->fill_len += sizeof(entry);
 		}
 		mtx_unlock(&softc->ctl_lock);
 
 		/*
 		 * If this is non-zero, we had a copyout fault, so there's
 		 * probably no point in attempting to set the status inside
 		 * the structure.
 		 */
 		if (retval != 0)
 			break;
 
 		if (list->dropped_num > 0)
 			list->status = CTL_PORT_LIST_NEED_MORE_SPACE;
 		else
 			list->status = CTL_PORT_LIST_OK;
 		break;
 	}
 	case CTL_DUMP_OOA: {
 		struct ctl_lun *lun;
 		union ctl_io *io;
 		char printbuf[128];
 		struct sbuf sb;
 
 		mtx_lock(&softc->ctl_lock);
 		printf("Dumping OOA queues:\n");
 		STAILQ_FOREACH(lun, &softc->lun_list, links) {
 			mtx_lock(&lun->lun_lock);
 			for (io = (union ctl_io *)TAILQ_FIRST(
 			     &lun->ooa_queue); io != NULL;
 			     io = (union ctl_io *)TAILQ_NEXT(&io->io_hdr,
 			     ooa_links)) {
 				sbuf_new(&sb, printbuf, sizeof(printbuf),
 					 SBUF_FIXEDLEN);
 				sbuf_printf(&sb, "LUN %jd tag 0x%04x%s%s%s%s: ",
 					    (intmax_t)lun->lun,
 					    io->scsiio.tag_num,
 					    (io->io_hdr.flags &
 					    CTL_FLAG_BLOCKED) ? "" : " BLOCKED",
 					    (io->io_hdr.flags &
 					    CTL_FLAG_DMA_INPROG) ? " DMA" : "",
 					    (io->io_hdr.flags &
 					    CTL_FLAG_ABORT) ? " ABORT" : "",
 			                    (io->io_hdr.flags &
 		                        CTL_FLAG_IS_WAS_ON_RTR) ? " RTR" : "");
 				ctl_scsi_command_string(&io->scsiio, NULL, &sb);
 				sbuf_finish(&sb);
 				printf("%s\n", sbuf_data(&sb));
 			}
 			mtx_unlock(&lun->lun_lock);
 		}
 		printf("OOA queues dump done\n");
 		mtx_unlock(&softc->ctl_lock);
 		break;
 	}
 	case CTL_GET_OOA: {
 		struct ctl_lun *lun;
 		struct ctl_ooa *ooa_hdr;
 		struct ctl_ooa_entry *entries;
 		uint32_t cur_fill_num;
 
 		ooa_hdr = (struct ctl_ooa *)addr;
 
 		if ((ooa_hdr->alloc_len == 0)
 		 || (ooa_hdr->alloc_num == 0)) {
 			printf("%s: CTL_GET_OOA: alloc len %u and alloc num %u "
 			       "must be non-zero\n", __func__,
 			       ooa_hdr->alloc_len, ooa_hdr->alloc_num);
 			retval = EINVAL;
 			break;
 		}
 
 		if (ooa_hdr->alloc_len != (ooa_hdr->alloc_num *
 		    sizeof(struct ctl_ooa_entry))) {
 			printf("%s: CTL_GET_OOA: alloc len %u must be alloc "
 			       "num %d * sizeof(struct ctl_ooa_entry) %zd\n",
 			       __func__, ooa_hdr->alloc_len,
 			       ooa_hdr->alloc_num,sizeof(struct ctl_ooa_entry));
 			retval = EINVAL;
 			break;
 		}
 
 		entries = malloc(ooa_hdr->alloc_len, M_CTL, M_WAITOK | M_ZERO);
 		if (entries == NULL) {
 			printf("%s: could not allocate %d bytes for OOA "
 			       "dump\n", __func__, ooa_hdr->alloc_len);
 			retval = ENOMEM;
 			break;
 		}
 
 		mtx_lock(&softc->ctl_lock);
 		if (((ooa_hdr->flags & CTL_OOA_FLAG_ALL_LUNS) == 0)
 		 && ((ooa_hdr->lun_num >= CTL_MAX_LUNS)
 		  || (softc->ctl_luns[ooa_hdr->lun_num] == NULL))) {
 			mtx_unlock(&softc->ctl_lock);
 			free(entries, M_CTL);
 			printf("%s: CTL_GET_OOA: invalid LUN %ju\n",
 			       __func__, (uintmax_t)ooa_hdr->lun_num);
 			retval = EINVAL;
 			break;
 		}
 
 		cur_fill_num = 0;
 
 		if (ooa_hdr->flags & CTL_OOA_FLAG_ALL_LUNS) {
 			STAILQ_FOREACH(lun, &softc->lun_list, links) {
 				retval = ctl_ioctl_fill_ooa(lun, &cur_fill_num,
 					ooa_hdr, entries);
 				if (retval != 0)
 					break;
 			}
 			if (retval != 0) {
 				mtx_unlock(&softc->ctl_lock);
 				free(entries, M_CTL);
 				break;
 			}
 		} else {
 			lun = softc->ctl_luns[ooa_hdr->lun_num];
 
 			retval = ctl_ioctl_fill_ooa(lun, &cur_fill_num,ooa_hdr,
 						    entries);
 		}
 		mtx_unlock(&softc->ctl_lock);
 
 		ooa_hdr->fill_num = min(cur_fill_num, ooa_hdr->alloc_num);
 		ooa_hdr->fill_len = ooa_hdr->fill_num *
 			sizeof(struct ctl_ooa_entry);
 		retval = copyout(entries, ooa_hdr->entries, ooa_hdr->fill_len);
 		if (retval != 0) {
 			printf("%s: error copying out %d bytes for OOA dump\n", 
 			       __func__, ooa_hdr->fill_len);
 		}
 
 		getbintime(&ooa_hdr->cur_bt);
 
 		if (cur_fill_num > ooa_hdr->alloc_num) {
 			ooa_hdr->dropped_num = cur_fill_num -ooa_hdr->alloc_num;
 			ooa_hdr->status = CTL_OOA_NEED_MORE_SPACE;
 		} else {
 			ooa_hdr->dropped_num = 0;
 			ooa_hdr->status = CTL_OOA_OK;
 		}
 
 		free(entries, M_CTL);
 		break;
 	}
 	case CTL_CHECK_OOA: {
 		union ctl_io *io;
 		struct ctl_lun *lun;
 		struct ctl_ooa_info *ooa_info;
 
 
 		ooa_info = (struct ctl_ooa_info *)addr;
 
 		if (ooa_info->lun_id >= CTL_MAX_LUNS) {
 			ooa_info->status = CTL_OOA_INVALID_LUN;
 			break;
 		}
 		mtx_lock(&softc->ctl_lock);
 		lun = softc->ctl_luns[ooa_info->lun_id];
 		if (lun == NULL) {
 			mtx_unlock(&softc->ctl_lock);
 			ooa_info->status = CTL_OOA_INVALID_LUN;
 			break;
 		}
 		mtx_lock(&lun->lun_lock);
 		mtx_unlock(&softc->ctl_lock);
 		ooa_info->num_entries = 0;
 		for (io = (union ctl_io *)TAILQ_FIRST(&lun->ooa_queue);
 		     io != NULL; io = (union ctl_io *)TAILQ_NEXT(
 		     &io->io_hdr, ooa_links)) {
 			ooa_info->num_entries++;
 		}
 		mtx_unlock(&lun->lun_lock);
 
 		ooa_info->status = CTL_OOA_SUCCESS;
 
 		break;
 	}
 	case CTL_HARD_START:
 	case CTL_HARD_STOP: {
 		struct ctl_fe_ioctl_startstop_info ss_info;
 		struct cfi_metatask *metatask;
 		struct mtx hs_mtx;
 
 		mtx_init(&hs_mtx, "HS Mutex", NULL, MTX_DEF);
 
 		cv_init(&ss_info.sem, "hard start/stop cv" );
 
 		metatask = cfi_alloc_metatask(/*can_wait*/ 1);
 		if (metatask == NULL) {
 			retval = ENOMEM;
 			mtx_destroy(&hs_mtx);
 			break;
 		}
 
 		if (cmd == CTL_HARD_START)
 			metatask->tasktype = CFI_TASK_STARTUP;
 		else
 			metatask->tasktype = CFI_TASK_SHUTDOWN;
 
 		metatask->callback = ctl_ioctl_hard_startstop_callback;
 		metatask->callback_arg = &ss_info;
 
 		cfi_action(metatask);
 
 		/* Wait for the callback */
 		mtx_lock(&hs_mtx);
 		cv_wait_sig(&ss_info.sem, &hs_mtx);
 		mtx_unlock(&hs_mtx);
 
 		/*
 		 * All information has been copied from the metatask by the
 		 * time cv_broadcast() is called, so we free the metatask here.
 		 */
 		cfi_free_metatask(metatask);
 
 		memcpy((void *)addr, &ss_info.hs_info, sizeof(ss_info.hs_info));
 
 		mtx_destroy(&hs_mtx);
 		break;
 	}
 	case CTL_BBRREAD: {
 		struct ctl_bbrread_info *bbr_info;
 		struct ctl_fe_ioctl_bbrread_info fe_bbr_info;
 		struct mtx bbr_mtx;
 		struct cfi_metatask *metatask;
 
 		bbr_info = (struct ctl_bbrread_info *)addr;
 
 		bzero(&fe_bbr_info, sizeof(fe_bbr_info));
 
 		bzero(&bbr_mtx, sizeof(bbr_mtx));
 		mtx_init(&bbr_mtx, "BBR Mutex", NULL, MTX_DEF);
 
 		fe_bbr_info.bbr_info = bbr_info;
 		fe_bbr_info.lock = &bbr_mtx;
 
 		cv_init(&fe_bbr_info.sem, "BBR read cv");
 		metatask = cfi_alloc_metatask(/*can_wait*/ 1);
 
 		if (metatask == NULL) {
 			mtx_destroy(&bbr_mtx);
 			cv_destroy(&fe_bbr_info.sem);
 			retval = ENOMEM;
 			break;
 		}
 		metatask->tasktype = CFI_TASK_BBRREAD;
 		metatask->callback = ctl_ioctl_bbrread_callback;
 		metatask->callback_arg = &fe_bbr_info;
 		metatask->taskinfo.bbrread.lun_num = bbr_info->lun_num;
 		metatask->taskinfo.bbrread.lba = bbr_info->lba;
 		metatask->taskinfo.bbrread.len = bbr_info->len;
 
 		cfi_action(metatask);
 
 		mtx_lock(&bbr_mtx);
 		while (fe_bbr_info.wakeup_done == 0)
 			cv_wait_sig(&fe_bbr_info.sem, &bbr_mtx);
 		mtx_unlock(&bbr_mtx);
 
 		bbr_info->status = metatask->status;
 		bbr_info->bbr_status = metatask->taskinfo.bbrread.status;
 		bbr_info->scsi_status = metatask->taskinfo.bbrread.scsi_status;
 		memcpy(&bbr_info->sense_data,
 		       &metatask->taskinfo.bbrread.sense_data,
 		       ctl_min(sizeof(bbr_info->sense_data),
 			       sizeof(metatask->taskinfo.bbrread.sense_data)));
 
 		cfi_free_metatask(metatask);
 
 		mtx_destroy(&bbr_mtx);
 		cv_destroy(&fe_bbr_info.sem);
 
 		break;
 	}
 	case CTL_DELAY_IO: {
 		struct ctl_io_delay_info *delay_info;
 #ifdef CTL_IO_DELAY
 		struct ctl_lun *lun;
 #endif /* CTL_IO_DELAY */
 
 		delay_info = (struct ctl_io_delay_info *)addr;
 
 #ifdef CTL_IO_DELAY
 		mtx_lock(&softc->ctl_lock);
 
 		if ((delay_info->lun_id >= CTL_MAX_LUNS)
 		 || (softc->ctl_luns[delay_info->lun_id] == NULL)) {
 			delay_info->status = CTL_DELAY_STATUS_INVALID_LUN;
 		} else {
 			lun = softc->ctl_luns[delay_info->lun_id];
 			mtx_lock(&lun->lun_lock);
 
 			delay_info->status = CTL_DELAY_STATUS_OK;
 
 			switch (delay_info->delay_type) {
 			case CTL_DELAY_TYPE_CONT:
 				break;
 			case CTL_DELAY_TYPE_ONESHOT:
 				break;
 			default:
 				delay_info->status =
 					CTL_DELAY_STATUS_INVALID_TYPE;
 				break;
 			}
 
 			switch (delay_info->delay_loc) {
 			case CTL_DELAY_LOC_DATAMOVE:
 				lun->delay_info.datamove_type =
 					delay_info->delay_type;
 				lun->delay_info.datamove_delay =
 					delay_info->delay_secs;
 				break;
 			case CTL_DELAY_LOC_DONE:
 				lun->delay_info.done_type =
 					delay_info->delay_type;
 				lun->delay_info.done_delay =
 					delay_info->delay_secs;
 				break;
 			default:
 				delay_info->status =
 					CTL_DELAY_STATUS_INVALID_LOC;
 				break;
 			}
 			mtx_unlock(&lun->lun_lock);
 		}
 
 		mtx_unlock(&softc->ctl_lock);
 #else
 		delay_info->status = CTL_DELAY_STATUS_NOT_IMPLEMENTED;
 #endif /* CTL_IO_DELAY */
 		break;
 	}
 	case CTL_REALSYNC_SET: {
 		int *syncstate;
 
 		syncstate = (int *)addr;
 
 		mtx_lock(&softc->ctl_lock);
 		switch (*syncstate) {
 		case 0:
 			softc->flags &= ~CTL_FLAG_REAL_SYNC;
 			break;
 		case 1:
 			softc->flags |= CTL_FLAG_REAL_SYNC;
 			break;
 		default:
 			retval = EINVAL;
 			break;
 		}
 		mtx_unlock(&softc->ctl_lock);
 		break;
 	}
 	case CTL_REALSYNC_GET: {
 		int *syncstate;
 
 		syncstate = (int*)addr;
 
 		mtx_lock(&softc->ctl_lock);
 		if (softc->flags & CTL_FLAG_REAL_SYNC)
 			*syncstate = 1;
 		else
 			*syncstate = 0;
 		mtx_unlock(&softc->ctl_lock);
 
 		break;
 	}
 	case CTL_SETSYNC:
 	case CTL_GETSYNC: {
 		struct ctl_sync_info *sync_info;
 		struct ctl_lun *lun;
 
 		sync_info = (struct ctl_sync_info *)addr;
 
 		mtx_lock(&softc->ctl_lock);
 		lun = softc->ctl_luns[sync_info->lun_id];
 		if (lun == NULL) {
 			mtx_unlock(&softc->ctl_lock);
 			sync_info->status = CTL_GS_SYNC_NO_LUN;
 		}
 		/*
 		 * Get or set the sync interval.  We're not bounds checking
 		 * in the set case, hopefully the user won't do something
 		 * silly.
 		 */
 		mtx_lock(&lun->lun_lock);
 		mtx_unlock(&softc->ctl_lock);
 		if (cmd == CTL_GETSYNC)
 			sync_info->sync_interval = lun->sync_interval;
 		else
 			lun->sync_interval = sync_info->sync_interval;
 		mtx_unlock(&lun->lun_lock);
 
 		sync_info->status = CTL_GS_SYNC_OK;
 
 		break;
 	}
 	case CTL_GETSTATS: {
 		struct ctl_stats *stats;
 		struct ctl_lun *lun;
 		int i;
 
 		stats = (struct ctl_stats *)addr;
 
 		if ((sizeof(struct ctl_lun_io_stats) * softc->num_luns) >
 		     stats->alloc_len) {
 			stats->status = CTL_SS_NEED_MORE_SPACE;
 			stats->num_luns = softc->num_luns;
 			break;
 		}
 		/*
 		 * XXX KDM no locking here.  If the LUN list changes,
 		 * things can blow up.
 		 */
 		for (i = 0, lun = STAILQ_FIRST(&softc->lun_list); lun != NULL;
 		     i++, lun = STAILQ_NEXT(lun, links)) {
 			retval = copyout(&lun->stats, &stats->lun_stats[i],
 					 sizeof(lun->stats));
 			if (retval != 0)
 				break;
 		}
 		stats->num_luns = softc->num_luns;
 		stats->fill_len = sizeof(struct ctl_lun_io_stats) *
 				 softc->num_luns;
 		stats->status = CTL_SS_OK;
 #ifdef CTL_TIME_IO
 		stats->flags = CTL_STATS_FLAG_TIME_VALID;
 #else
 		stats->flags = CTL_STATS_FLAG_NONE;
 #endif
 		getnanouptime(&stats->timestamp);
 		break;
 	}
 	case CTL_ERROR_INJECT: {
 		struct ctl_error_desc *err_desc, *new_err_desc;
 		struct ctl_lun *lun;
 
 		err_desc = (struct ctl_error_desc *)addr;
 
 		new_err_desc = malloc(sizeof(*new_err_desc), M_CTL,
 				      M_WAITOK | M_ZERO);
 		bcopy(err_desc, new_err_desc, sizeof(*new_err_desc));
 
 		mtx_lock(&softc->ctl_lock);
 		lun = softc->ctl_luns[err_desc->lun_id];
 		if (lun == NULL) {
 			mtx_unlock(&softc->ctl_lock);
 			free(new_err_desc, M_CTL);
 			printf("%s: CTL_ERROR_INJECT: invalid LUN %ju\n",
 			       __func__, (uintmax_t)err_desc->lun_id);
 			retval = EINVAL;
 			break;
 		}
 		mtx_lock(&lun->lun_lock);
 		mtx_unlock(&softc->ctl_lock);
 
 		/*
 		 * We could do some checking here to verify the validity
 		 * of the request, but given the complexity of error
 		 * injection requests, the checking logic would be fairly
 		 * complex.
 		 *
 		 * For now, if the request is invalid, it just won't get
 		 * executed and might get deleted.
 		 */
 		STAILQ_INSERT_TAIL(&lun->error_list, new_err_desc, links);
 
 		/*
 		 * XXX KDM check to make sure the serial number is unique,
 		 * in case we somehow manage to wrap.  That shouldn't
 		 * happen for a very long time, but it's the right thing to
 		 * do.
 		 */
 		new_err_desc->serial = lun->error_serial;
 		err_desc->serial = lun->error_serial;
 		lun->error_serial++;
 
 		mtx_unlock(&lun->lun_lock);
 		break;
 	}
 	case CTL_ERROR_INJECT_DELETE: {
 		struct ctl_error_desc *delete_desc, *desc, *desc2;
 		struct ctl_lun *lun;
 		int delete_done;
 
 		delete_desc = (struct ctl_error_desc *)addr;
 		delete_done = 0;
 
 		mtx_lock(&softc->ctl_lock);
 		lun = softc->ctl_luns[delete_desc->lun_id];
 		if (lun == NULL) {
 			mtx_unlock(&softc->ctl_lock);
 			printf("%s: CTL_ERROR_INJECT_DELETE: invalid LUN %ju\n",
 			       __func__, (uintmax_t)delete_desc->lun_id);
 			retval = EINVAL;
 			break;
 		}
 		mtx_lock(&lun->lun_lock);
 		mtx_unlock(&softc->ctl_lock);
 		STAILQ_FOREACH_SAFE(desc, &lun->error_list, links, desc2) {
 			if (desc->serial != delete_desc->serial)
 				continue;
 
 			STAILQ_REMOVE(&lun->error_list, desc, ctl_error_desc,
 				      links);
 			free(desc, M_CTL);
 			delete_done = 1;
 		}
 		mtx_unlock(&lun->lun_lock);
 		if (delete_done == 0) {
 			printf("%s: CTL_ERROR_INJECT_DELETE: can't find "
 			       "error serial %ju on LUN %u\n", __func__, 
 			       delete_desc->serial, delete_desc->lun_id);
 			retval = EINVAL;
 			break;
 		}
 		break;
 	}
 	case CTL_DUMP_STRUCTS: {
 		int i, j, k, idx;
 		struct ctl_port *port;
 		struct ctl_frontend *fe;
 
 		mtx_lock(&softc->ctl_lock);
 		printf("CTL Persistent Reservation information start:\n");
 		for (i = 0; i < CTL_MAX_LUNS; i++) {
 			struct ctl_lun *lun;
 
 			lun = softc->ctl_luns[i];
 
 			if ((lun == NULL)
 			 || ((lun->flags & CTL_LUN_DISABLED) != 0))
 				continue;
 
 			for (j = 0; j < (CTL_MAX_PORTS * 2); j++) {
 				for (k = 0; k < CTL_MAX_INIT_PER_PORT; k++){
 					idx = j * CTL_MAX_INIT_PER_PORT + k;
 					if (lun->per_res[idx].registered == 0)
 						continue;
 					printf("  LUN %d port %d iid %d key "
 					       "%#jx\n", i, j, k,
 					       (uintmax_t)scsi_8btou64(
 					       lun->per_res[idx].res_key.key));
 				}
 			}
 		}
 		printf("CTL Persistent Reservation information end\n");
 		printf("CTL Ports:\n");
 		STAILQ_FOREACH(port, &softc->port_list, links) {
 			printf("  Port %d '%s' Frontend '%s' Type %u pp %d vp %d WWNN "
 			       "%#jx WWPN %#jx\n", port->targ_port, port->port_name,
 			       port->frontend->name, port->port_type,
 			       port->physical_port, port->virtual_port,
 			       (uintmax_t)port->wwnn, (uintmax_t)port->wwpn);
 			for (j = 0; j < CTL_MAX_INIT_PER_PORT; j++) {
 				if (port->wwpn_iid[j].in_use == 0 &&
 				    port->wwpn_iid[j].wwpn == 0 &&
 				    port->wwpn_iid[j].name == NULL)
 					continue;
 
 				printf("    iid %u use %d WWPN %#jx '%s'\n",
 				    j, port->wwpn_iid[j].in_use,
 				    (uintmax_t)port->wwpn_iid[j].wwpn,
 				    port->wwpn_iid[j].name);
 			}
 		}
 		printf("CTL Port information end\n");
 		mtx_unlock(&softc->ctl_lock);
 		/*
 		 * XXX KDM calling this without a lock.  We'd likely want
 		 * to drop the lock before calling the frontend's dump
 		 * routine anyway.
 		 */
 		printf("CTL Frontends:\n");
 		STAILQ_FOREACH(fe, &softc->fe_list, links) {
 			printf("  Frontend '%s'\n", fe->name);
 			if (fe->fe_dump != NULL)
 				fe->fe_dump();
 		}
 		printf("CTL Frontend information end\n");
 		break;
 	}
 	case CTL_LUN_REQ: {
 		struct ctl_lun_req *lun_req;
 		struct ctl_backend_driver *backend;
 
 		lun_req = (struct ctl_lun_req *)addr;
 
 		backend = ctl_backend_find(lun_req->backend);
 		if (backend == NULL) {
 			lun_req->status = CTL_LUN_ERROR;
 			snprintf(lun_req->error_str,
 				 sizeof(lun_req->error_str),
 				 "Backend \"%s\" not found.",
 				 lun_req->backend);
 			break;
 		}
 		if (lun_req->num_be_args > 0) {
 			lun_req->kern_be_args = ctl_copyin_args(
 				lun_req->num_be_args,
 				lun_req->be_args,
 				lun_req->error_str,
 				sizeof(lun_req->error_str));
 			if (lun_req->kern_be_args == NULL) {
 				lun_req->status = CTL_LUN_ERROR;
 				break;
 			}
 		}
 
 		retval = backend->ioctl(dev, cmd, addr, flag, td);
 
 		if (lun_req->num_be_args > 0) {
 			ctl_copyout_args(lun_req->num_be_args,
 				      lun_req->kern_be_args);
 			ctl_free_args(lun_req->num_be_args,
 				      lun_req->kern_be_args);
 		}
 		break;
 	}
 	case CTL_LUN_LIST: {
 		struct sbuf *sb;
 		struct ctl_lun *lun;
 		struct ctl_lun_list *list;
 		struct ctl_option *opt;
 
 		list = (struct ctl_lun_list *)addr;
 
 		/*
 		 * Allocate a fixed length sbuf here, based on the length
 		 * of the user's buffer.  We could allocate an auto-extending
 		 * buffer, and then tell the user how much larger our
 		 * amount of data is than his buffer, but that presents
 		 * some problems:
 		 *
 		 * 1.  The sbuf(9) routines use a blocking malloc, and so
 		 *     we can't hold a lock while calling them with an
 		 *     auto-extending buffer.
  		 *
 		 * 2.  There is not currently a LUN reference counting
 		 *     mechanism, outside of outstanding transactions on
 		 *     the LUN's OOA queue.  So a LUN could go away on us
 		 *     while we're getting the LUN number, backend-specific
 		 *     information, etc.  Thus, given the way things
 		 *     currently work, we need to hold the CTL lock while
 		 *     grabbing LUN information.
 		 *
 		 * So, from the user's standpoint, the best thing to do is
 		 * allocate what he thinks is a reasonable buffer length,
 		 * and then if he gets a CTL_LUN_LIST_NEED_MORE_SPACE error,
 		 * double the buffer length and try again.  (And repeat
 		 * that until he succeeds.)
 		 */
 		sb = sbuf_new(NULL, NULL, list->alloc_len, SBUF_FIXEDLEN);
 		if (sb == NULL) {
 			list->status = CTL_LUN_LIST_ERROR;
 			snprintf(list->error_str, sizeof(list->error_str),
 				 "Unable to allocate %d bytes for LUN list",
 				 list->alloc_len);
 			break;
 		}
 
 		sbuf_printf(sb, "<ctllunlist>\n");
 
 		mtx_lock(&softc->ctl_lock);
 		STAILQ_FOREACH(lun, &softc->lun_list, links) {
 			mtx_lock(&lun->lun_lock);
 			retval = sbuf_printf(sb, "<lun id=\"%ju\">\n",
 					     (uintmax_t)lun->lun);
 
 			/*
 			 * Bail out as soon as we see that we've overfilled
 			 * the buffer.
 			 */
 			if (retval != 0)
 				break;
 
 			retval = sbuf_printf(sb, "\t<backend_type>%s"
 					     "</backend_type>\n",
 					     (lun->backend == NULL) ?  "none" :
 					     lun->backend->name);
 
 			if (retval != 0)
 				break;
 
 			retval = sbuf_printf(sb, "\t<lun_type>%d</lun_type>\n",
 					     lun->be_lun->lun_type);
 
 			if (retval != 0)
 				break;
 
 			if (lun->backend == NULL) {
 				retval = sbuf_printf(sb, "</lun>\n");
 				if (retval != 0)
 					break;
 				continue;
 			}
 
 			retval = sbuf_printf(sb, "\t<size>%ju</size>\n",
 					     (lun->be_lun->maxlba > 0) ?
 					     lun->be_lun->maxlba + 1 : 0);
 
 			if (retval != 0)
 				break;
 
 			retval = sbuf_printf(sb, "\t<blocksize>%u</blocksize>\n",
 					     lun->be_lun->blocksize);
 
 			if (retval != 0)
 				break;
 
 			retval = sbuf_printf(sb, "\t<serial_number>");
 
 			if (retval != 0)
 				break;
 
 			retval = ctl_sbuf_printf_esc(sb,
 						     lun->be_lun->serial_num);
 
 			if (retval != 0)
 				break;
 
 			retval = sbuf_printf(sb, "</serial_number>\n");
 		
 			if (retval != 0)
 				break;
 
 			retval = sbuf_printf(sb, "\t<device_id>");
 
 			if (retval != 0)
 				break;
 
 			retval = ctl_sbuf_printf_esc(sb,lun->be_lun->device_id);
 
 			if (retval != 0)
 				break;
 
 			retval = sbuf_printf(sb, "</device_id>\n");
 
 			if (retval != 0)
 				break;
 
 			if (lun->backend->lun_info != NULL) {
 				retval = lun->backend->lun_info(lun->be_lun->be_lun, sb);
 				if (retval != 0)
 					break;
 			}
 			STAILQ_FOREACH(opt, &lun->be_lun->options, links) {
 				retval = sbuf_printf(sb, "\t<%s>%s</%s>\n",
 				    opt->name, opt->value, opt->name);
 				if (retval != 0)
 					break;
 			}
 
 			retval = sbuf_printf(sb, "</lun>\n");
 
 			if (retval != 0)
 				break;
 			mtx_unlock(&lun->lun_lock);
 		}
 		if (lun != NULL)
 			mtx_unlock(&lun->lun_lock);
 		mtx_unlock(&softc->ctl_lock);
 
 		if ((retval != 0)
 		 || ((retval = sbuf_printf(sb, "</ctllunlist>\n")) != 0)) {
 			retval = 0;
 			sbuf_delete(sb);
 			list->status = CTL_LUN_LIST_NEED_MORE_SPACE;
 			snprintf(list->error_str, sizeof(list->error_str),
 				 "Out of space, %d bytes is too small",
 				 list->alloc_len);
 			break;
 		}
 
 		sbuf_finish(sb);
 
 		retval = copyout(sbuf_data(sb), list->lun_xml,
 				 sbuf_len(sb) + 1);
 
 		list->fill_len = sbuf_len(sb) + 1;
 		list->status = CTL_LUN_LIST_OK;
 		sbuf_delete(sb);
 		break;
 	}
 	case CTL_ISCSI: {
 		struct ctl_iscsi *ci;
 		struct ctl_frontend *fe;
 
 		ci = (struct ctl_iscsi *)addr;
 
 		fe = ctl_frontend_find("iscsi");
 		if (fe == NULL) {
 			ci->status = CTL_ISCSI_ERROR;
 			snprintf(ci->error_str, sizeof(ci->error_str),
 			    "Frontend \"iscsi\" not found.");
 			break;
 		}
 
 		retval = fe->ioctl(dev, cmd, addr, flag, td);
 		break;
 	}
 	case CTL_PORT_REQ: {
 		struct ctl_req *req;
 		struct ctl_frontend *fe;
 
 		req = (struct ctl_req *)addr;
 
 		fe = ctl_frontend_find(req->driver);
 		if (fe == NULL) {
 			req->status = CTL_LUN_ERROR;
 			snprintf(req->error_str, sizeof(req->error_str),
 			    "Frontend \"%s\" not found.", req->driver);
 			break;
 		}
 		if (req->num_args > 0) {
 			req->kern_args = ctl_copyin_args(req->num_args,
 			    req->args, req->error_str, sizeof(req->error_str));
 			if (req->kern_args == NULL) {
 				req->status = CTL_LUN_ERROR;
 				break;
 			}
 		}
 
 		retval = fe->ioctl(dev, cmd, addr, flag, td);
 
 		if (req->num_args > 0) {
 			ctl_copyout_args(req->num_args, req->kern_args);
 			ctl_free_args(req->num_args, req->kern_args);
 		}
 		break;
 	}
 	case CTL_PORT_LIST: {
 		struct sbuf *sb;
 		struct ctl_port *port;
 		struct ctl_lun_list *list;
 		struct ctl_option *opt;
 
 		list = (struct ctl_lun_list *)addr;
 
 		sb = sbuf_new(NULL, NULL, list->alloc_len, SBUF_FIXEDLEN);
 		if (sb == NULL) {
 			list->status = CTL_LUN_LIST_ERROR;
 			snprintf(list->error_str, sizeof(list->error_str),
 				 "Unable to allocate %d bytes for LUN list",
 				 list->alloc_len);
 			break;
 		}
 
 		sbuf_printf(sb, "<ctlportlist>\n");
 
 		mtx_lock(&softc->ctl_lock);
 		STAILQ_FOREACH(port, &softc->port_list, links) {
 			retval = sbuf_printf(sb, "<targ_port id=\"%ju\">\n",
 					     (uintmax_t)port->targ_port);
 
 			/*
 			 * Bail out as soon as we see that we've overfilled
 			 * the buffer.
 			 */
 			if (retval != 0)
 				break;
 
 			retval = sbuf_printf(sb, "\t<frontend_type>%s"
 			    "</frontend_type>\n", port->frontend->name);
 			if (retval != 0)
 				break;
 
 			retval = sbuf_printf(sb, "\t<port_type>%d</port_type>\n",
 					     port->port_type);
 			if (retval != 0)
 				break;
 
 			retval = sbuf_printf(sb, "\t<online>%s</online>\n",
 			    (port->status & CTL_PORT_STATUS_ONLINE) ? "YES" : "NO");
 			if (retval != 0)
 				break;
 
 			retval = sbuf_printf(sb, "\t<port_name>%s</port_name>\n",
 			    port->port_name);
 			if (retval != 0)
 				break;
 
 			retval = sbuf_printf(sb, "\t<physical_port>%d</physical_port>\n",
 			    port->physical_port);
 			if (retval != 0)
 				break;
 
 			retval = sbuf_printf(sb, "\t<virtual_port>%d</virtual_port>\n",
 			    port->virtual_port);
 			if (retval != 0)
 				break;
 
 			retval = sbuf_printf(sb, "\t<wwnn>%#jx</wwnn>\n",
 			    (uintmax_t)port->wwnn);
 			if (retval != 0)
 				break;
 
 			retval = sbuf_printf(sb, "\t<wwpn>%#jx</wwpn>\n",
 			    (uintmax_t)port->wwpn);
 			if (retval != 0)
 				break;
 
 			if (port->port_info != NULL) {
 				retval = port->port_info(port->onoff_arg, sb);
 				if (retval != 0)
 					break;
 			}
 			STAILQ_FOREACH(opt, &port->options, links) {
 				retval = sbuf_printf(sb, "\t<%s>%s</%s>\n",
 				    opt->name, opt->value, opt->name);
 				if (retval != 0)
 					break;
 			}
 
 			retval = sbuf_printf(sb, "</targ_port>\n");
 			if (retval != 0)
 				break;
 		}
 		mtx_unlock(&softc->ctl_lock);
 
 		if ((retval != 0)
 		 || ((retval = sbuf_printf(sb, "</ctlportlist>\n")) != 0)) {
 			retval = 0;
 			sbuf_delete(sb);
 			list->status = CTL_LUN_LIST_NEED_MORE_SPACE;
 			snprintf(list->error_str, sizeof(list->error_str),
 				 "Out of space, %d bytes is too small",
 				 list->alloc_len);
 			break;
 		}
 
 		sbuf_finish(sb);
 
 		retval = copyout(sbuf_data(sb), list->lun_xml,
 				 sbuf_len(sb) + 1);
 
 		list->fill_len = sbuf_len(sb) + 1;
 		list->status = CTL_LUN_LIST_OK;
 		sbuf_delete(sb);
 		break;
 	}
 	default: {
 		/* XXX KDM should we fix this? */
 #if 0
 		struct ctl_backend_driver *backend;
 		unsigned int type;
 		int found;
 
 		found = 0;
 
 		/*
 		 * We encode the backend type as the ioctl type for backend
 		 * ioctls.  So parse it out here, and then search for a
 		 * backend of this type.
 		 */
 		type = _IOC_TYPE(cmd);
 
 		STAILQ_FOREACH(backend, &softc->be_list, links) {
 			if (backend->type == type) {
 				found = 1;
 				break;
 			}
 		}
 		if (found == 0) {
 			printf("ctl: unknown ioctl command %#lx or backend "
 			       "%d\n", cmd, type);
 			retval = EINVAL;
 			break;
 		}
 		retval = backend->ioctl(dev, cmd, addr, flag, td);
 #endif
 		retval = ENOTTY;
 		break;
 	}
 	}
 	return (retval);
 }
 
 /*
  * Compute the initiator index for a nexus: the initiator ID offset by
  * the port's slot times CTL_MAX_INIT_PER_PORT.  Port numbers at or above
  * CTL_MAX_PORTS are first reduced by CTL_MAX_PORTS, so they land in the
  * same index range as ports below CTL_MAX_PORTS.
  */
 uint32_t
 ctl_get_initindex(struct ctl_nexus *nexus)
 {
 	if (nexus->targ_port >= CTL_MAX_PORTS) {
 		/* High port number: fold it back before scaling. */
 		return (((nexus->targ_port - CTL_MAX_PORTS) *
 			 CTL_MAX_INIT_PER_PORT) + nexus->initid.id);
 	}
 
 	return ((nexus->targ_port * CTL_MAX_INIT_PER_PORT) +
 		nexus->initid.id);
 }
 
 /*
  * Compute the reservation index for a nexus.  Unlike ctl_get_initindex(),
  * the port number is used as-is (no CTL_MAX_PORTS folding).
  */
 uint32_t
 ctl_get_resindex(struct ctl_nexus *nexus)
 {
 	return ((nexus->targ_port * CTL_MAX_INIT_PER_PORT) +
 		nexus->initid.id);
 }
 
 /*
  * Translate a port number into an index: port numbers below
  * CTL_MAX_PORTS map to themselves, anything else has CTL_MAX_PORTS
  * subtracted from it.
  */
 uint32_t
 ctl_port_idx(int port_num)
 {
 	uint32_t idx;
 
 	if (port_num >= CTL_MAX_PORTS)
 		idx = port_num - CTL_MAX_PORTS;
 	else
 		idx = port_num;
 
 	return (idx);
 }
 
 /*
  * Map a LUN ID as seen on the given port through the port's lun_map
  * callback.  Returns UINT32_MAX when no port is registered at that index,
  * and the unmodified lun_id when the port has no lun_map callback.
  */
 static uint32_t
 ctl_map_lun(int port_num, uint32_t lun_id)
 {
 	struct ctl_port *p;
 
 	p = control_softc->ctl_ports[ctl_port_idx(port_num)];
 	if (p == NULL)
 		return (UINT32_MAX);
 
 	if (p->lun_map != NULL)
 		return (p->lun_map(p->targ_lun_arg, lun_id));
 
 	/* No mapping callback: identity mapping. */
 	return (lun_id);
 }
 
 /*
  * Reverse of ctl_map_lun(): find the port-visible LUN ID that the port's
  * lun_map callback translates to lun_id, by trying every ID in
  * [0, CTL_MAX_LUNS).
  *
  * Returns lun_id unchanged when the port has no lun_map callback, and
  * UINT32_MAX when no port is registered at that index or when no
  * port-visible ID maps to lun_id.
  */
 static uint32_t
 ctl_map_lun_back(int port_num, uint32_t lun_id)
 {
 	struct ctl_port *port;
 	uint32_t i;
 
 	port = control_softc->ctl_ports[ctl_port_idx(port_num)];
 	/*
 	 * The port slot may be empty; ctl_map_lun() already guards against
 	 * this, so do the same here instead of dereferencing NULL.
 	 */
 	if (port == NULL)
 		return (UINT32_MAX);
 	if (port->lun_map == NULL)
 		return (lun_id);
 	for (i = 0; i < CTL_MAX_LUNS; i++) {
 		if (port->lun_map(port->targ_lun_arg, i) == lun_id)
 			return (i);
 	}
 	return (UINT32_MAX);
 }
 
 /*
  * Find the first zero bit in a bitmask made of 32-bit words.
  *
  * mask points at the words, size is the number of bits to consider.
  * Returns the zero-based index of the first clear bit, or -1 if every
  * examined bit is set.
  *
  * Note:  This only works for bitmask sizes that are at least 32 bits, and
  * that are a power of 2.  The number of words scanned is size / 32
  * (minimum one), so bits beyond the last whole word are not examined.
  */
 int
 ctl_ffz(uint32_t *mask, uint32_t size)
 {
 	uint32_t num_chunks, num_pieces;
 	uint32_t i, j;
 
 	num_chunks = (size >> 5);
 	if (num_chunks == 0)
 		num_chunks++;
 	/* Bits to test per word: at most a full 32-bit word. */
 	num_pieces = (size < 32) ? size : 32;
 
 	for (i = 0; i < num_chunks; i++) {
 		for (j = 0; j < num_pieces; j++) {
 			/*
 			 * Shift an unsigned 1: "1 << 31" on a signed int
 			 * is undefined behavior.
 			 */
 			if ((mask[i] & ((uint32_t)1 << j)) == 0)
 				return ((int)((i << 5) + j));
 		}
 	}
 
 	return (-1);
 }
 
 /*
  * Set a bit in a bitmask made of 32-bit words.
  *
  * Returns 0 after setting the bit, or -1 (leaving the mask untouched) if
  * the bit was already set.  No bounds checking is done on bit.
  */
 int
 ctl_set_mask(uint32_t *mask, uint32_t bit)
 {
 	uint32_t chunk, piece;
 
 	chunk = bit >> 5;
 	piece = bit % (sizeof(uint32_t) * 8);
 
 	/* Use an unsigned 1 so that bit 31 does not shift into a sign bit. */
 	if ((mask[chunk] & ((uint32_t)1 << piece)) != 0)
 		return (-1);
 	else
 		mask[chunk] |= ((uint32_t)1 << piece);
 
 	return (0);
 }
 
 /*
  * Clear a bit in a bitmask made of 32-bit words.
  *
  * Returns 0 after clearing the bit, or -1 (leaving the mask untouched) if
  * the bit was already clear.  No bounds checking is done on bit.
  */
 int
 ctl_clear_mask(uint32_t *mask, uint32_t bit)
 {
 	uint32_t chunk, piece;
 
 	chunk = bit >> 5;
 	piece = bit % (sizeof(uint32_t) * 8);
 
 	/* Use an unsigned 1 so that bit 31 does not shift into a sign bit. */
 	if ((mask[chunk] & ((uint32_t)1 << piece)) == 0)
 		return (-1);
 	else
 		mask[chunk] &= ~((uint32_t)1 << piece);
 
 	return (0);
 }
 
 /*
  * Test a bit in a bitmask made of 32-bit words.
  *
  * Returns 1 if the bit is set and 0 if it is clear.  No bounds checking
  * is done on bit.
  */
 int
 ctl_is_set(uint32_t *mask, uint32_t bit)
 {
 	uint32_t chunk, piece;
 
 	chunk = bit >> 5;
 	piece = bit % (sizeof(uint32_t) * 8);
 
 	/* Use an unsigned 1 so that bit 31 does not shift into a sign bit. */
 	if ((mask[chunk] & ((uint32_t)1 << piece)) == 0)
 		return (0);
 	else
 		return (1);
 }
 
 #ifdef unused
 /*
  * Allocate a single ctl_io from the M_CTL malloc type and stamp it with
  * the given I/O type, port, target and LUN.  The bus, target and lun are
  * optional; they can be filled in later.
  *
  * can_wait selects a sleeping (M_WAITOK) allocation when non-zero and a
  * non-sleeping (M_NOWAIT) one otherwise.  Returns the new ctl_io, or NULL
  * if the allocation failed.
  */
 union ctl_io*
 ctl_malloc_io(ctl_io_type io_type, uint32_t targ_port, uint32_t targ_target,
 	      uint32_t targ_lun, int can_wait)
 {
 	union ctl_io *new_io;
 	int mflags;
 
 	mflags = can_wait ? M_WAITOK : M_NOWAIT;
 	new_io = (union ctl_io *)malloc(sizeof(*new_io), M_CTL, mflags);
 	if (new_io == NULL)
 		return (NULL);
 
 	new_io->io_hdr.io_type = io_type;
 	new_io->io_hdr.targ_port = targ_port;
 	/*
 	 * XXX KDM this needs to change/go away.  We need to move
 	 * to a preallocated pool of ctl_scsiio structures.
 	 */
 	new_io->io_hdr.nexus.targ_target.id = targ_target;
 	new_io->io_hdr.nexus.targ_lun = targ_lun;
 
 	return (new_io);
 }
 
 /*
  * Hand a ctl_io back to the M_CTL malloc type.
  */
 void
 ctl_kfree_io(union ctl_io *io)
 {
 	free(io, M_CTL);
 }
 #endif /* unused */
 
 /*
  * Create an I/O pool holding total_ctl_io preallocated ctl_io structures
  * and link it onto ctl_softc's list of pools.
  *
  * ctl_softc, pool_type, total_ctl_io are passed in.
  * npool is passed out (only written on success).
  *
  * Returns 0 on success or ENOMEM if the pool or any of its ctl_io
  * structures could not be allocated; on failure everything allocated so
  * far is freed again.  All allocations are M_NOWAIT.
  */
 int
 ctl_pool_create(struct ctl_softc *ctl_softc, ctl_pool_type pool_type,
 		uint32_t total_ctl_io, struct ctl_io_pool **npool)
 {
 	struct ctl_io_pool *new_pool;
 	union ctl_io *io;
 	uint32_t n;
 
 	new_pool = (struct ctl_io_pool *)malloc(sizeof(*new_pool), M_CTL,
 						M_NOWAIT | M_ZERO);
 	if (new_pool == NULL)
 		return (ENOMEM);
 
 	new_pool->type = pool_type;
 	new_pool->ctl_softc = ctl_softc;
 
 	/* Pool IDs are handed out under the pool lock. */
 	mtx_lock(&ctl_softc->pool_lock);
 	new_pool->id = ctl_softc->cur_pool_id++;
 	mtx_unlock(&ctl_softc->pool_lock);
 
 	new_pool->flags = CTL_POOL_FLAG_NONE;
 	new_pool->refcount = 1;		/* Reference for validity. */
 	STAILQ_INIT(&new_pool->free_queue);
 
 	/*
 	 * XXX KDM other options here:
 	 * - allocate a page at a time
 	 * - allocate one big chunk of memory.
 	 * Page allocation might work well, but would take a little more
 	 * tracking.
 	 */
 	for (n = 0; n < total_ctl_io; n++) {
 		io = (union ctl_io *)malloc(sizeof(*io), M_CTLIO, M_NOWAIT);
 		if (io == NULL) {
 			/*
 			 * Out of memory part way through: drain whatever
 			 * we already queued, then drop the pool itself.
 			 */
 			while ((io = (union ctl_io *)STAILQ_FIRST(
 			    &new_pool->free_queue)) != NULL) {
 				STAILQ_REMOVE_HEAD(&new_pool->free_queue,
 						   links);
 				free(io, M_CTLIO);
 			}
 			free(new_pool, M_CTL);
 			return (ENOMEM);
 		}
 		io->io_hdr.pool = new_pool;
 		STAILQ_INSERT_TAIL(&new_pool->free_queue, &io->io_hdr, links);
 		new_pool->total_ctl_io++;
 		new_pool->free_ctl_io++;
 	}
 
 	mtx_lock(&ctl_softc->pool_lock);
 	ctl_softc->num_pools++;
 	STAILQ_INSERT_TAIL(&ctl_softc->io_pools, new_pool, links);
 	/*
 	 * Increment our usage count if this is an external consumer, so we
 	 * can't get unloaded until the external consumer (most likely a
 	 * FETD) unloads and frees his pool.
 	 *
 	 * XXX KDM will this increment the caller's module use count, or
 	 * mine?
 	 */
 #if 0
 	if ((pool_type != CTL_POOL_EMERGENCY)
 	 && (pool_type != CTL_POOL_INTERNAL)
 	 && (pool_type != CTL_POOL_4OTHERSC))
 		MOD_INC_USE_COUNT;
 #endif
 	mtx_unlock(&ctl_softc->pool_lock);
 
 	*npool = new_pool;
 
 	return (0);
 }
 
 /*
  * Take a reference on a pool.  The caller must hold the softc pool_lock.
  * Returns 0 with the refcount bumped, or EINVAL (refcount untouched) if
  * the pool has been flagged CTL_POOL_FLAG_INVALID.
  */
 static int
 ctl_pool_acquire(struct ctl_io_pool *pool)
 {
 
 	mtx_assert(&pool->ctl_softc->pool_lock, MA_OWNED);
 
 	if ((pool->flags & CTL_POOL_FLAG_INVALID) == 0) {
 		pool->refcount++;
 		return (0);
 	}
 
 	return (EINVAL);
 }
 
 /*
  * Drop a reference on a pool.  The caller must hold the softc pool_lock.
  * When the last reference goes away, every ctl_io on the pool's free
  * queue is freed, the pool is unlinked from the softc's pool list and the
  * pool structure itself is freed -- the pointer is invalid afterwards.
  */
 static void
 ctl_pool_release(struct ctl_io_pool *pool)
 {
 	struct ctl_softc *softc;
 	union ctl_io *cur;
 
 	softc = pool->ctl_softc;
 
 	mtx_assert(&softc->pool_lock, MA_OWNED);
 
 	pool->refcount--;
 	if (pool->refcount != 0)
 		return;
 
 	/* Last reference: drain the free queue from the head. */
 	for (;;) {
 		cur = (union ctl_io *)STAILQ_FIRST(&pool->free_queue);
 		if (cur == NULL)
 			break;
 		STAILQ_REMOVE_HEAD(&pool->free_queue, links);
 		free(cur, M_CTLIO);
 	}
 
 	STAILQ_REMOVE(&softc->io_pools, pool, ctl_io_pool, links);
 	softc->num_pools--;
 
 	/*
 	 * XXX KDM will this decrement the caller's usage count or mine?
 	 */
 #if 0
 	if ((pool->type != CTL_POOL_EMERGENCY)
 	 && (pool->type != CTL_POOL_INTERNAL)
 	 && (pool->type != CTL_POOL_4OTHERSC))
 		MOD_DEC_USE_COUNT;
 #endif
 
 	free(pool, M_CTL);
 }
 
 /*
  * Mark a pool invalid so no new references can be taken on it, and drop
  * one reference under the pool lock.  A NULL pool is ignored.
  */
 void
 ctl_pool_free(struct ctl_io_pool *pool)
 {
 	struct ctl_softc *softc;
 
 	if (pool != NULL) {
 		/*
 		 * Remember the softc up front; the pool may no longer
 		 * exist once ctl_pool_release() returns.
 		 */
 		softc = pool->ctl_softc;
 
 		mtx_lock(&softc->pool_lock);
 		pool->flags |= CTL_POOL_FLAG_INVALID;
 		ctl_pool_release(pool);
 		mtx_unlock(&softc->pool_lock);
 	}
 }
 
 /*
  * This routine does not block (except for spinlocks of course).
  * It tries to allocate a ctl_io union from the caller's pool as quickly as
  * possible.
  *
  * Allocation order:
  *  1. the free queue of the caller's pool (pool_ref),
  *  2. the free queue of any CTL_POOL_EMERGENCY pool,
  *  3. a non-sleeping (M_NOWAIT) malloc.
  *
  * A ctl_io handed out from a pool keeps the pool reference taken by
  * ctl_pool_acquire(); ctl_free_io() releases it.  Returns NULL if
  * pool_ref is NULL or if all three sources fail.
  */
 union ctl_io *
 ctl_alloc_io(void *pool_ref)
 {
 	union ctl_io *io;
 	struct ctl_softc *ctl_softc;
 	struct ctl_io_pool *pool, *npool;
 	struct ctl_io_pool *emergency_pool;
 
 	pool = (struct ctl_io_pool *)pool_ref;
 
 	if (pool == NULL) {
 		printf("%s: pool is NULL\n", __func__);
 		return (NULL);
 	}
 
 	emergency_pool = NULL;
 
 	ctl_softc = pool->ctl_softc;
 
 	mtx_lock(&ctl_softc->pool_lock);
 	/*
 	 * First, try to get the io structure from the user's pool.
 	 */
 	if (ctl_pool_acquire(pool) == 0) {
 		io = (union ctl_io *)STAILQ_FIRST(&pool->free_queue);
 		if (io != NULL) {
 			STAILQ_REMOVE_HEAD(&pool->free_queue, links);
 			pool->total_allocated++;
 			pool->free_ctl_io--;
 			mtx_unlock(&ctl_softc->pool_lock);
 			return (io);
 		} else
 			ctl_pool_release(pool);
 	}
 	/*
 	 * If he doesn't have any io structures left, search for an
 	 * emergency pool and grab one from there.
 	 */
 	STAILQ_FOREACH(npool, &ctl_softc->io_pools, links) {
 		if (npool->type != CTL_POOL_EMERGENCY)
 			continue;
 
 		if (ctl_pool_acquire(npool) != 0)
 			continue;
 
 		emergency_pool = npool;
 
 		io = (union ctl_io *)STAILQ_FIRST(&npool->free_queue);
 		if (io != NULL) {
 			STAILQ_REMOVE_HEAD(&npool->free_queue, links);
 			npool->total_allocated++;
 			npool->free_ctl_io--;
 			mtx_unlock(&ctl_softc->pool_lock);
 			return (io);
 		} else
 			ctl_pool_release(npool);
 	}
 
 	/* Drop the spinlock before we malloc */
 	mtx_unlock(&ctl_softc->pool_lock);
 
 	/*
 	 * The emergency pool (if it exists) didn't have one, so try an
 	 * atomic (i.e. nonblocking) malloc and see if we get lucky.
 	 */
 	io = (union ctl_io *)malloc(sizeof(*io), M_CTLIO, M_NOWAIT);
 	if (io == NULL)
 		return (NULL);
 
 	/*
 	 * The allocation is not zeroed, so give the pool pointer a defined
 	 * value right away.  Unless we manage to attach the ctl_io to the
 	 * emergency pool below, it stays pool-less and ctl_free_io() will
 	 * simply free() it.  Previously a failed ctl_pool_acquire() left
 	 * this field uninitialized.
 	 */
 	io->io_hdr.pool = NULL;
 
 	/*
 	 * If the emergency pool exists but is empty, add this
 	 * ctl_io to its list when it gets freed.
 	 */
 	if (emergency_pool != NULL) {
 		mtx_lock(&ctl_softc->pool_lock);
 		if (ctl_pool_acquire(emergency_pool) == 0) {
 			io->io_hdr.pool = emergency_pool;
 			emergency_pool->total_ctl_io++;
 			/*
 			 * Need to bump this, otherwise
 			 * total_allocated and total_freed won't
 			 * match when we no longer have anything
 			 * outstanding.
 			 */
 			emergency_pool->total_allocated++;
 		}
 		mtx_unlock(&ctl_softc->pool_lock);
 	}
 
 	return (io);
 }
 
 void
 ctl_free_io(union ctl_io *io)
 {
 	if (io == NULL)
 		return;
 
 	/*
 	 * If this ctl_io has a pool, return it to that pool.
 	 */
 	if (io->io_hdr.pool != NULL) {
 		struct ctl_io_pool *pool;
 
 		pool = (struct ctl_io_pool *)io->io_hdr.pool;
 		mtx_lock(&pool->ctl_softc->pool_lock);
 		io->io_hdr.io_type = 0xff;
 		STAILQ_INSERT_TAIL(&pool->free_queue, &io->io_hdr, links);
 		pool->total_freed++;
 		pool->free_ctl_io++;
 		ctl_pool_release(pool);
 		mtx_unlock(&pool->ctl_softc->pool_lock);
 	} else {
 		/*
 		 * Otherwise, just free it.  We probably malloced it and
 		 * the emergency pool wasn't available.
 		 */
 		free(io, M_CTLIO);
 	}
 
 }
 
 /*
  * Clear every byte of a ctl_io while keeping its pool back-pointer, so
  * the structure can still be returned to the pool it came from.  A NULL
  * io is ignored.
  */
 void
 ctl_zero_io(union ctl_io *io)
 {
 	void *saved_pool;
 
 	if (io != NULL) {
 		/*
 		 * May need to preserve linked list pointers at some point
 		 * too.
 		 */
 		saved_pool = io->io_hdr.pool;
 		memset(io, 0, sizeof(*io));
 		io->io_hdr.pool = saved_pool;
 	}
 }
 
 /*
  * This routine is currently used for internal copies of ctl_ios that need
  * to persist for some reason after we've already returned status to the
  * FETD.  (Thus the flag set.)
  *
  * XXX XXX
  * Note that this makes a blind copy of all fields in the ctl_io, except
  * for the pool reference.  This includes any memory that has been
  * allocated!  That memory will no longer be valid after done has been
  * called, so this would be VERY DANGEROUS for command that actually does
  * any reads or writes.  Right now (11/7/2005), this is only used for immediate
  * start and stop commands, which don't transfer any data, so this is not a
  * problem.  If it is used for anything else, the caller would also need to
  * allocate data buffer space and this routine would need to be modified to
  * copy the data buffer(s) as well.
  */
 void
 ctl_copy_io(union ctl_io *src, union ctl_io *dest)
 {
 	void *saved_pool;
 
 	/* Nothing to do unless both ends of the copy exist. */
 	if ((src != NULL) && (dest != NULL)) {
 		/*
 		 * The destination keeps its own pool reference.  May need
 		 * to preserve linked list pointers at some point too.
 		 */
 		saved_pool = dest->io_hdr.pool;
 
 		memcpy(dest, src, ctl_min(sizeof(*src), sizeof(*dest)));
 
 		dest->io_hdr.pool = saved_pool;
 
 		/*
 		 * Flag the result as an internal copy, so it doesn't get
 		 * passed back to the FETD that allocated the original.
 		 */
 		dest->io_hdr.flags |= CTL_FLAG_INT_COPY;
 	}
 }
 
 #ifdef NEEDTOPORT
 /*
  * Fill in total_luns and max_active_luns in the power subpage.
  *
  * total_luns is the softc LUN count minus the processor LUN.
  * max_active_luns is looked up from the drive configuration type
  * (40, 64 or 112), the shelf power mode, whether an AOR RAIDset is
  * present and the RAID personality of RAIDset 1.  Any combination not
  * covered below keeps the default of 7.
  */
 static void
 ctl_update_power_subpage(struct copan_power_subpage *page)
 {
 	struct ctl_softc *softc;
 	cs_raidset_personality_t rs_type;
 	cs_BOOL_t aor_present, shelf_50pct_power;
 	int num_luns, num_partitions, config_type;
 	int max_active_luns;
 	int is_raid1, is_raid5;
 
 	softc = control_softc;
 
 	/* subtract out the processor LUN */
 	num_luns = softc->num_luns - 1;
 
 	/*
 	 * Default to 7 LUNs active, which was the only number we allowed
 	 * in the past.
 	 */
 	max_active_luns = 7;
 
 	num_partitions = config_GetRsPartitionInfo();
 	config_type = config_GetConfigType();
 	shelf_50pct_power = config_GetShelfPowerMode();
 	aor_present = config_IsAorRsPresent();
 	rs_type = ddb_GetRsRaidType(1);
 
 	is_raid5 = (rs_type == CS_RAIDSET_PERSONALITY_RAID5);
 	is_raid1 = (rs_type == CS_RAIDSET_PERSONALITY_RAID1);
 	if (!is_raid5 && !is_raid1) {
 		EPRINT(0, "Unsupported RS type %d!", rs_type);
 	}
 
 	page->total_luns = num_luns;
 
 	switch (config_type) {
 	case 40:
 		/*
 		 * In a 40 drive configuration, it doesn't matter what DC
 		 * cards we have, whether we have AOR enabled or not,
 		 * partitioning or not, or what type of RAIDset we have.
 		 * In that scenario, we can power up every LUN we present
 		 * to the user.
 		 */
 		max_active_luns = num_luns;
 		break;
 	case 64:
 	case 112:
 		if (shelf_50pct_power == CS_FALSE) {
 			/*
 			 * 25% power: the limits are the same for the 64
 			 * and the 112 drive configurations.
 			 */
 			if (aor_present == CS_TRUE) {
 				if (is_raid5)
 					max_active_luns = 7;
 				else if (is_raid1)
 					max_active_luns = 14;
 				/* else XXX KDM now what?? */
 			} else {
 				if (is_raid5)
 					max_active_luns = 8;
 				else if (is_raid1)
 					max_active_luns = 16;
 				/* else XXX KDM now what?? */
 			}
 		} else if (config_type == 64) {
 			/*
 			 * With 50% power in a 64 drive configuration, we
 			 * can power all LUNs we present.
 			 */
 			max_active_luns = num_luns;
 		} else if (aor_present == CS_TRUE) {
 			/* 112 drives, 50% power, AOR present */
 			if (is_raid5) {
 				max_active_luns = 14;
 			} else if (is_raid1) {
 				/*
 				 * We're assuming here that disk
 				 * caching is enabled, and so we're
 				 * able to power up half of each
 				 * LUN, and cache all writes.
 				 */
 				max_active_luns = num_luns;
 			}
 			/* else XXX KDM now what?? */
 		} else {
 			/* 112 drives, 50% power, no AOR */
 			if (is_raid5)
 				max_active_luns = 15;
 			else if (is_raid1)
 				max_active_luns = 30;
 			/* else XXX KDM now what?? */
 		}
 		break;
 	default:
 		/*
 		 * In this case, we have an unknown configuration, so we
 		 * just use the default from above.
 		 */
 		break;
 	}
 
 	page->max_active_luns = max_active_luns;
 #if 0
 	printk("%s: total_luns = %d, max_active_luns = %d\n", __func__,
 	       page->total_luns, page->max_active_luns);
 #endif
 }
 #endif /* NEEDTOPORT */
 
 /*
  * This routine could be used in the future to load default and/or saved
  * mode page parameters for a particular lun.
  */
 static int
 ctl_init_page_index(struct ctl_lun *lun)
 {
 	int i;
 	struct ctl_page_index *page_index;
 	struct ctl_softc *softc;
 
 	memcpy(&lun->mode_pages.index, page_index_template,
 	       sizeof(page_index_template));
 
 	softc = lun->ctl_softc;
 
 	for (i = 0; i < CTL_NUM_MODE_PAGES; i++) {
 
 		page_index = &lun->mode_pages.index[i];
 		/*
 		 * If this is a disk-only mode page, there's no point in
 		 * setting it up.  For some pages, we have to have some
 		 * basic information about the disk in order to calculate the
 		 * mode page data.
 		 */
 		if ((lun->be_lun->lun_type != T_DIRECT)
 		 && (page_index->page_flags & CTL_PAGE_FLAG_DISK_ONLY))
 			continue;
 
 		switch (page_index->page_code & SMPH_PC_MASK) {
 		case SMS_FORMAT_DEVICE_PAGE: {
 			struct scsi_format_page *format_page;
 
 			if (page_index->subpage != SMS_SUBPAGE_PAGE_0)
 				panic("subpage is incorrect!");
 
 			/*
 			 * Sectors per track are set above.  Bytes per
 			 * sector need to be set here on a per-LUN basis.
 			 */
 			memcpy(&lun->mode_pages.format_page[CTL_PAGE_CURRENT],
 			       &format_page_default,
 			       sizeof(format_page_default));
 			memcpy(&lun->mode_pages.format_page[
 			       CTL_PAGE_CHANGEABLE], &format_page_changeable,
 			       sizeof(format_page_changeable));
 			memcpy(&lun->mode_pages.format_page[CTL_PAGE_DEFAULT],
 			       &format_page_default,
 			       sizeof(format_page_default));
 			memcpy(&lun->mode_pages.format_page[CTL_PAGE_SAVED],
 			       &format_page_default,
 			       sizeof(format_page_default));
 
 			format_page = &lun->mode_pages.format_page[
 				CTL_PAGE_CURRENT];
 			scsi_ulto2b(lun->be_lun->blocksize,
 				    format_page->bytes_per_sector);
 
 			format_page = &lun->mode_pages.format_page[
 				CTL_PAGE_DEFAULT];
 			scsi_ulto2b(lun->be_lun->blocksize,
 				    format_page->bytes_per_sector);
 
 			format_page = &lun->mode_pages.format_page[
 				CTL_PAGE_SAVED];
 			scsi_ulto2b(lun->be_lun->blocksize,
 				    format_page->bytes_per_sector);
 
 			page_index->page_data =
 				(uint8_t *)lun->mode_pages.format_page;
 			break;
 		}
 		case SMS_RIGID_DISK_PAGE: {
 			struct scsi_rigid_disk_page *rigid_disk_page;
 			uint32_t sectors_per_cylinder;
 			uint64_t cylinders;
 #ifndef	__XSCALE__
 			int shift;
 #endif /* !__XSCALE__ */
 
 			if (page_index->subpage != SMS_SUBPAGE_PAGE_0)
 				panic("invalid subpage value %d",
 				      page_index->subpage);
 
 			/*
 			 * Rotation rate and sectors per track are set
 			 * above.  We calculate the cylinders here based on
 			 * capacity.  Due to the number of heads and
 			 * sectors per track we're using, smaller arrays
 			 * may turn out to have 0 cylinders.  Linux and
 			 * FreeBSD don't pay attention to these mode pages
 			 * to figure out capacity, but Solaris does.  It
 			 * seems to deal with 0 cylinders just fine, and
 			 * works out a fake geometry based on the capacity.
 			 */
 			memcpy(&lun->mode_pages.rigid_disk_page[
 			       CTL_PAGE_CURRENT], &rigid_disk_page_default,
 			       sizeof(rigid_disk_page_default));
 			memcpy(&lun->mode_pages.rigid_disk_page[
 			       CTL_PAGE_CHANGEABLE],&rigid_disk_page_changeable,
 			       sizeof(rigid_disk_page_changeable));
 			memcpy(&lun->mode_pages.rigid_disk_page[
 			       CTL_PAGE_DEFAULT], &rigid_disk_page_default,
 			       sizeof(rigid_disk_page_default));
 			memcpy(&lun->mode_pages.rigid_disk_page[
 			       CTL_PAGE_SAVED], &rigid_disk_page_default,
 			       sizeof(rigid_disk_page_default));
 
 			sectors_per_cylinder = CTL_DEFAULT_SECTORS_PER_TRACK *
 				CTL_DEFAULT_HEADS;
 
 			/*
 			 * The divide method here will be more accurate,
 			 * probably, but results in floating point being
 			 * used in the kernel on i386 (__udivdi3()).  On the
 			 * XScale, though, __udivdi3() is implemented in
 			 * software.
 			 *
 			 * The shift method for cylinder calculation is
 			 * accurate if sectors_per_cylinder is a power of
 			 * 2.  Otherwise it might be slightly off -- you
 			 * might have a bit of a truncation problem.
 			 */
 #ifdef	__XSCALE__
 			cylinders = (lun->be_lun->maxlba + 1) /
 				sectors_per_cylinder;
 #else
 			for (shift = 31; shift > 0; shift--) {
 				if (sectors_per_cylinder & (1 << shift))
 					break;
 			}
 			cylinders = (lun->be_lun->maxlba + 1) >> shift;
 #endif
 
 			/*
 			 * We've basically got 3 bytes, or 24 bits for the
 			 * cylinder size in the mode page.  If we're over,
 			 * just round down to 2^24.
 			 */
 			if (cylinders > 0xffffff)
 				cylinders = 0xffffff;
 
 			rigid_disk_page = &lun->mode_pages.rigid_disk_page[
 				CTL_PAGE_CURRENT];
 			scsi_ulto3b(cylinders, rigid_disk_page->cylinders);
 
 			rigid_disk_page = &lun->mode_pages.rigid_disk_page[
 				CTL_PAGE_DEFAULT];
 			scsi_ulto3b(cylinders, rigid_disk_page->cylinders);
 
 			rigid_disk_page = &lun->mode_pages.rigid_disk_page[
 				CTL_PAGE_SAVED];
 			scsi_ulto3b(cylinders, rigid_disk_page->cylinders);
 
 			page_index->page_data =
 				(uint8_t *)lun->mode_pages.rigid_disk_page;
 			break;
 		}
 		case SMS_CACHING_PAGE: {
 
 			if (page_index->subpage != SMS_SUBPAGE_PAGE_0)
 				panic("invalid subpage value %d",
 				      page_index->subpage);
 			/*
 			 * Defaults should be okay here, no calculations
 			 * needed.
 			 */
 			memcpy(&lun->mode_pages.caching_page[CTL_PAGE_CURRENT],
 			       &caching_page_default,
 			       sizeof(caching_page_default));
 			memcpy(&lun->mode_pages.caching_page[
 			       CTL_PAGE_CHANGEABLE], &caching_page_changeable,
 			       sizeof(caching_page_changeable));
 			memcpy(&lun->mode_pages.caching_page[CTL_PAGE_DEFAULT],
 			       &caching_page_default,
 			       sizeof(caching_page_default));
 			memcpy(&lun->mode_pages.caching_page[CTL_PAGE_SAVED],
 			       &caching_page_default,
 			       sizeof(caching_page_default));
 			page_index->page_data =
 				(uint8_t *)lun->mode_pages.caching_page;
 			break;
 		}
 		case SMS_CONTROL_MODE_PAGE: {
 
 			if (page_index->subpage != SMS_SUBPAGE_PAGE_0)
 				panic("invalid subpage value %d",
 				      page_index->subpage);
 
 			/*
 			 * Defaults should be okay here, no calculations
 			 * needed.
 			 */
 			memcpy(&lun->mode_pages.control_page[CTL_PAGE_CURRENT],
 			       &control_page_default,
 			       sizeof(control_page_default));
 			memcpy(&lun->mode_pages.control_page[
 			       CTL_PAGE_CHANGEABLE], &control_page_changeable,
 			       sizeof(control_page_changeable));
 			memcpy(&lun->mode_pages.control_page[CTL_PAGE_DEFAULT],
 			       &control_page_default,
 			       sizeof(control_page_default));
 			memcpy(&lun->mode_pages.control_page[CTL_PAGE_SAVED],
 			       &control_page_default,
 			       sizeof(control_page_default));
 			page_index->page_data =
 				(uint8_t *)lun->mode_pages.control_page;
 			break;
 
 		}
 		case SMS_VENDOR_SPECIFIC_PAGE:{
 			switch (page_index->subpage) {
 			case PWR_SUBPAGE_CODE: {
 				struct copan_power_subpage *current_page,
 							   *saved_page;
 
 				memcpy(&lun->mode_pages.power_subpage[
 				       CTL_PAGE_CURRENT],
 				       &power_page_default,
 				       sizeof(power_page_default));
 				memcpy(&lun->mode_pages.power_subpage[
 				       CTL_PAGE_CHANGEABLE],
 				       &power_page_changeable,
 				       sizeof(power_page_changeable));
 				memcpy(&lun->mode_pages.power_subpage[
 				       CTL_PAGE_DEFAULT],
 				       &power_page_default,
 				       sizeof(power_page_default));
 				memcpy(&lun->mode_pages.power_subpage[
 				       CTL_PAGE_SAVED],
 				       &power_page_default,
 				       sizeof(power_page_default));
 				page_index->page_data =
 				    (uint8_t *)lun->mode_pages.power_subpage;
 
 				current_page = (struct copan_power_subpage *)
 					(page_index->page_data +
 					 (page_index->page_len *
 					  CTL_PAGE_CURRENT));
 			        saved_page = (struct copan_power_subpage *)
 				        (page_index->page_data +
 					 (page_index->page_len *
 					  CTL_PAGE_SAVED));
 				break;
 			}
 			case APS_SUBPAGE_CODE: {
 				struct copan_aps_subpage *current_page,
 							 *saved_page;
 
 				// This gets set multiple times but
 				// it should always be the same. It's
 				// only done during init so who cares.
 				index_to_aps_page = i;
 
 				memcpy(&lun->mode_pages.aps_subpage[
 				       CTL_PAGE_CURRENT],
 				       &aps_page_default,
 				       sizeof(aps_page_default));
 				memcpy(&lun->mode_pages.aps_subpage[
 				       CTL_PAGE_CHANGEABLE],
 				       &aps_page_changeable,
 				       sizeof(aps_page_changeable));
 				memcpy(&lun->mode_pages.aps_subpage[
 				       CTL_PAGE_DEFAULT],
 				       &aps_page_default,
 				       sizeof(aps_page_default));
 				memcpy(&lun->mode_pages.aps_subpage[
 				       CTL_PAGE_SAVED],
 				       &aps_page_default,
 				       sizeof(aps_page_default));
 				page_index->page_data =
 					(uint8_t *)lun->mode_pages.aps_subpage;
 
 				current_page = (struct copan_aps_subpage *)
 					(page_index->page_data +
 					 (page_index->page_len *
 					  CTL_PAGE_CURRENT));
 				saved_page = (struct copan_aps_subpage *)
 					(page_index->page_data +
 					 (page_index->page_len *
 					  CTL_PAGE_SAVED));
 				break;
 			}
 			case DBGCNF_SUBPAGE_CODE: {
 				struct copan_debugconf_subpage *current_page,
 							       *saved_page;
 
 				memcpy(&lun->mode_pages.debugconf_subpage[
 				       CTL_PAGE_CURRENT],
 				       &debugconf_page_default,
 				       sizeof(debugconf_page_default));
 				memcpy(&lun->mode_pages.debugconf_subpage[
 				       CTL_PAGE_CHANGEABLE],
 				       &debugconf_page_changeable,
 				       sizeof(debugconf_page_changeable));
 				memcpy(&lun->mode_pages.debugconf_subpage[
 				       CTL_PAGE_DEFAULT],
 				       &debugconf_page_default,
 				       sizeof(debugconf_page_default));
 				memcpy(&lun->mode_pages.debugconf_subpage[
 				       CTL_PAGE_SAVED],
 				       &debugconf_page_default,
 				       sizeof(debugconf_page_default));
 				page_index->page_data =
 					(uint8_t *)lun->mode_pages.debugconf_subpage;
 
 				current_page = (struct copan_debugconf_subpage *)
 					(page_index->page_data +
 					 (page_index->page_len *
 					  CTL_PAGE_CURRENT));
 				saved_page = (struct copan_debugconf_subpage *)
 					(page_index->page_data +
 					 (page_index->page_len *
 					  CTL_PAGE_SAVED));
 				break;
 			}
 			default:
 				panic("invalid subpage value %d",
 				      page_index->subpage);
 				break;
 			}
    			break;
 		}
 		default:
 			panic("invalid page value %d",
 			      page_index->page_code & SMPH_PC_MASK);
 			break;
     	}
 	}
 
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * LUN allocation.
  *
  * Requirements:
  * - caller allocates and zeros LUN storage, or passes in a NULL LUN if he
  *   wants us to allocate the LUN and he can block.
  * - ctl_softc is always set
  * - be_lun is set if the LUN has a backend (needed for disk LUNs)
  *
  * On every failure after the be_lun NULL check, the backend is notified
  * through be_lun->lun_config_status(..., CTL_LUN_CONFIG_FAILURE); on
  * success it is notified with CTL_LUN_CONFIG_OK.
  *
  * Returns 0 for success, non-zero (errno) for failure:
  * - EINVAL if be_lun is NULL or the LUN type is not supported
  * - ENOSPC if the requested LUN number is out of range or in use, or if
  *   no LUN number is available
  */
 static int
 ctl_alloc_lun(struct ctl_softc *ctl_softc, struct ctl_lun *ctl_lun,
 	      struct ctl_be_lun *const be_lun, struct ctl_id target_id)
 {
 	struct ctl_lun *nlun, *lun;
 	struct ctl_port *port;
 	struct scsi_vpd_id_descriptor *desc;
 	struct scsi_vpd_id_t10 *t10id;
 	const char *eui, *naa, *scsiname, *vendor;
 	int lun_number, i, lun_malloced;
 	int devidlen, idlen1, idlen2 = 0, len;
 
 	if (be_lun == NULL)
 		return (EINVAL);
 
 	/*
 	 * We currently only support Direct Access or Processor LUN types.
 	 * Anything else is reported to the backend as a failure and is not
 	 * allocated.
 	 */
 	switch (be_lun->lun_type) {
 	case T_DIRECT:
 		break;
 	case T_PROCESSOR:
 		break;
 	case T_SEQUENTIAL:
 	case T_CHANGER:
 	default:
 		be_lun->lun_config_status(be_lun->be_lun,
 					  CTL_LUN_CONFIG_FAILURE);
 		return (EINVAL);
 	}
 	if (ctl_lun == NULL) {
 		lun = malloc(sizeof(*lun), M_CTL, M_WAITOK);
 		lun_malloced = 1;
 	} else {
 		lun_malloced = 0;
 		lun = ctl_lun;
 	}
 
 	memset(lun, 0, sizeof(*lun));
 	if (lun_malloced)
 		lun->flags = CTL_LUN_MALLOCED;
 
 	/*
 	 * Generate LUN ID.  The device ID buffer always holds a T10 vendor
 	 * descriptor, optionally followed by SCSI name, EUI-64 and NAA
 	 * descriptors when the matching backend options are present.
 	 */
 	devidlen = max(CTL_DEVID_MIN_LEN,
 	    strnlen(be_lun->device_id, CTL_DEVID_LEN));
 	idlen1 = sizeof(*t10id) + devidlen;
 	len = sizeof(struct scsi_vpd_id_descriptor) + idlen1;
 	scsiname = ctl_get_opt(&be_lun->options, "scsiname");
 	if (scsiname != NULL) {
 		idlen2 = roundup2(strlen(scsiname) + 1, 4);
 		len += sizeof(struct scsi_vpd_id_descriptor) + idlen2;
 	}
 	eui = ctl_get_opt(&be_lun->options, "eui");
 	if (eui != NULL) {
 		len += sizeof(struct scsi_vpd_id_descriptor) + 8;
 	}
 	naa = ctl_get_opt(&be_lun->options, "naa");
 	if (naa != NULL) {
 		len += sizeof(struct scsi_vpd_id_descriptor) + 8;
 	}
 	lun->lun_devid = malloc(sizeof(struct ctl_devid) + len,
 	    M_CTL, M_WAITOK | M_ZERO);
 	lun->lun_devid->len = len;
 	desc = (struct scsi_vpd_id_descriptor *)lun->lun_devid->data;
 	desc->proto_codeset = SVPD_ID_CODESET_ASCII;
 	desc->id_type = SVPD_ID_PIV | SVPD_ID_ASSOC_LUN | SVPD_ID_TYPE_T10;
 	desc->length = idlen1;
 	t10id = (struct scsi_vpd_id_t10 *)&desc->identifier[0];
 	memset(t10id->vendor, ' ', sizeof(t10id->vendor));
 	if ((vendor = ctl_get_opt(&be_lun->options, "vendor")) == NULL) {
 		strncpy((char *)t10id->vendor, CTL_VENDOR, sizeof(t10id->vendor));
 	} else {
 		strncpy(t10id->vendor, vendor,
 		    min(sizeof(t10id->vendor), strlen(vendor)));
 	}
 	strncpy((char *)t10id->vendor_spec_id,
 	    (char *)be_lun->device_id, devidlen);
 	if (scsiname != NULL) {
 		desc = (struct scsi_vpd_id_descriptor *)(&desc->identifier[0] +
 		    desc->length);
 		desc->proto_codeset = SVPD_ID_CODESET_UTF8;
 		desc->id_type = SVPD_ID_PIV | SVPD_ID_ASSOC_LUN |
 		    SVPD_ID_TYPE_SCSI_NAME;
 		desc->length = idlen2;
 		strlcpy(desc->identifier, scsiname, idlen2);
 	}
 	if (eui != NULL) {
 		desc = (struct scsi_vpd_id_descriptor *)(&desc->identifier[0] +
 		    desc->length);
 		desc->proto_codeset = SVPD_ID_CODESET_BINARY;
 		desc->id_type = SVPD_ID_PIV | SVPD_ID_ASSOC_LUN |
 		    SVPD_ID_TYPE_EUI64;
 		desc->length = 8;
 		scsi_u64to8b(strtouq(eui, NULL, 0), desc->identifier);
 	}
 	if (naa != NULL) {
 		desc = (struct scsi_vpd_id_descriptor *)(&desc->identifier[0] +
 		    desc->length);
 		desc->proto_codeset = SVPD_ID_CODESET_BINARY;
 		desc->id_type = SVPD_ID_PIV | SVPD_ID_ASSOC_LUN |
 		    SVPD_ID_TYPE_NAA;
 		desc->length = 8;
 		scsi_u64to8b(strtouq(naa, NULL, 0), desc->identifier);
 	}
 
 	mtx_lock(&ctl_softc->ctl_lock);
 	/*
 	 * See if the caller requested a particular LUN number.  If so, see
 	 * if it is available.  Otherwise, allocate the first available LUN.
 	 */
 	if (be_lun->flags & CTL_LUN_FLAG_ID_REQ) {
 		if ((be_lun->req_lun_id > (CTL_MAX_LUNS - 1))
 		 || (ctl_is_set(ctl_softc->ctl_lun_mask, be_lun->req_lun_id))) {
 			mtx_unlock(&ctl_softc->ctl_lock);
 			if (be_lun->req_lun_id > (CTL_MAX_LUNS - 1)) {
 				printf("ctl: requested LUN ID %d is higher "
 				       "than CTL_MAX_LUNS - 1 (%d)\n",
 				       be_lun->req_lun_id, CTL_MAX_LUNS - 1);
 			} else {
 				/*
 				 * XXX KDM return an error, or just assign
 				 * another LUN ID in this case??
 				 */
 				printf("ctl: requested LUN ID %d is already "
 				       "in use\n", be_lun->req_lun_id);
 			}
 			goto fail;
 		}
 		lun_number = be_lun->req_lun_id;
 	} else {
 		lun_number = ctl_ffz(ctl_softc->ctl_lun_mask, CTL_MAX_LUNS);
 		if (lun_number == -1) {
 			mtx_unlock(&ctl_softc->ctl_lock);
 			printf("ctl: can't allocate LUN on target %ju, out of "
 			       "LUNs\n", (uintmax_t)target_id.id);
 			goto fail;
 		}
 	}
 	ctl_set_mask(ctl_softc->ctl_lun_mask, lun_number);
 
 	mtx_init(&lun->lun_lock, "CTL LUN", NULL, MTX_DEF);
 	lun->target = target_id;
 	lun->lun = lun_number;
 	lun->be_lun = be_lun;
 	/*
 	 * The processor LUN is always enabled.  Disk LUNs come on line
 	 * disabled, and must be enabled by the backend.
 	 */
 	lun->flags |= CTL_LUN_DISABLED;
 	lun->backend = be_lun->be;
 	be_lun->ctl_lun = lun;
 	be_lun->lun_id = lun_number;
 	atomic_add_int(&be_lun->be->num_luns, 1);
 	if (be_lun->flags & CTL_LUN_FLAG_POWERED_OFF)
 		lun->flags |= CTL_LUN_STOPPED;
 
 	if (be_lun->flags & CTL_LUN_FLAG_INOPERABLE)
 		lun->flags |= CTL_LUN_INOPERABLE;
 
 	if (be_lun->flags & CTL_LUN_FLAG_PRIMARY)
 		lun->flags |= CTL_LUN_PRIMARY_SC;
 
 	lun->ctl_softc = ctl_softc;
 	TAILQ_INIT(&lun->ooa_queue);
 	TAILQ_INIT(&lun->blocked_queue);
 	STAILQ_INIT(&lun->error_list);
 	ctl_tpc_lun_init(lun);
 
 	/*
 	 * Initialize the mode page index.
 	 */
 	ctl_init_page_index(lun);
 
 	/*
 	 * Set the poweron UA for all initiators on this LUN only.
 	 */
 	for (i = 0; i < CTL_MAX_INITIATORS; i++)
 		lun->pending_ua[i] = CTL_UA_POWERON;
 
 	/*
 	 * Now, before we insert this lun on the lun list, set the lun
 	 * inventory changed UA for all other luns.
 	 */
 	STAILQ_FOREACH(nlun, &ctl_softc->lun_list, links) {
 		for (i = 0; i < CTL_MAX_INITIATORS; i++) {
 			nlun->pending_ua[i] |= CTL_UA_LUN_CHANGE;
 		}
 	}
 
 	STAILQ_INSERT_TAIL(&ctl_softc->lun_list, lun, links);
 
 	ctl_softc->ctl_luns[lun_number] = lun;
 
 	ctl_softc->num_luns++;
 
 	/* Setup statistics gathering */
 	lun->stats.device_type = be_lun->lun_type;
 	lun->stats.lun_number = lun_number;
 	if (lun->stats.device_type == T_DIRECT)
 		lun->stats.blocksize = be_lun->blocksize;
 	else
 		lun->stats.flags = CTL_LUN_STATS_NO_BLOCKSIZE;
 	for (i = 0;i < CTL_MAX_PORTS;i++)
 		lun->stats.ports[i].targ_port = i;
 
 	mtx_unlock(&ctl_softc->ctl_lock);
 
 	lun->be_lun->lun_config_status(lun->be_lun->be_lun, CTL_LUN_CONFIG_OK);
 
 	/*
 	 * Run through each registered FETD and bring it online if it isn't
 	 * already.  Enable the target ID if it hasn't been enabled, and
 	 * enable this particular LUN.
 	 */
 	STAILQ_FOREACH(port, &ctl_softc->port_list, links) {
 		int retval;
 
 		retval = port->lun_enable(port->targ_lun_arg, target_id,lun_number);
 		if (retval != 0) {
 			printf("ctl_alloc_lun: FETD %s port %d returned error "
 			       "%d for lun_enable on target %ju lun %d\n",
 			       port->port_name, port->targ_port, retval,
 			       (uintmax_t)target_id.id, lun_number);
 		} else
 			port->status |= CTL_PORT_STATUS_LUN_ONLINE;
 	}
 	return (0);
 
 fail:
 	/*
 	 * No LUN number could be assigned.  ctl_lock has already been
 	 * dropped.  Release the device ID buffer allocated above so it is
 	 * not leaked, then the LUN itself if we allocated it.
 	 */
 	free(lun->lun_devid, M_CTL);
 	if (lun->flags & CTL_LUN_MALLOCED)
 		free(lun, M_CTL);
 	else
 		lun->lun_devid = NULL;
 	be_lun->lun_config_status(be_lun->be_lun, CTL_LUN_CONFIG_FAILURE);
 	return (ENOSPC);
 }
 
 /*
  * Delete a LUN.
  * Assumptions:
  * - LUN has already been marked invalid and any pending I/O has been taken
  *   care of.
  */
 static int
 ctl_free_lun(struct ctl_lun *lun)
 {
 	struct ctl_softc *softc;
 #if 0
 	struct ctl_port *port;
 #endif
 	struct ctl_lun *nlun;
 	int i;
 
 	softc = lun->ctl_softc;
 
 	mtx_assert(&softc->ctl_lock, MA_OWNED);
 
 	STAILQ_REMOVE(&softc->lun_list, lun, ctl_lun, links);
 
 	ctl_clear_mask(softc->ctl_lun_mask, lun->lun);
 
 	softc->ctl_luns[lun->lun] = NULL;
 
 	if (!TAILQ_EMPTY(&lun->ooa_queue))
 		panic("Freeing a LUN %p with outstanding I/O!!\n", lun);
 
 	softc->num_luns--;
 
 	/*
 	 * XXX KDM this scheme only works for a single target/multiple LUN
 	 * setup.  It needs to be revamped for a multiple target scheme.
 	 *
 	 * XXX KDM this results in port->lun_disable() getting called twice,
 	 * once when ctl_disable_lun() is called, and a second time here.
 	 * We really need to re-think the LUN disable semantics.  There
 	 * should probably be several steps/levels to LUN removal:
 	 *  - disable
 	 *  - invalidate
 	 *  - free
  	 *
 	 * Right now we only have a disable method when communicating to
 	 * the front end ports, at least for individual LUNs.
 	 */
 #if 0
 	STAILQ_FOREACH(port, &softc->port_list, links) {
 		int retval;
 
 		retval = port->lun_disable(port->targ_lun_arg, lun->target,
 					 lun->lun);
 		if (retval != 0) {
 			printf("ctl_free_lun: FETD %s port %d returned error "
 			       "%d for lun_disable on target %ju lun %jd\n",
 			       port->port_name, port->targ_port, retval,
 			       (uintmax_t)lun->target.id, (intmax_t)lun->lun);
 		}
 
 		if (STAILQ_FIRST(&softc->lun_list) == NULL) {
 			port->status &= ~CTL_PORT_STATUS_LUN_ONLINE;
 
 			retval = port->targ_disable(port->targ_lun_arg,lun->target);
 			if (retval != 0) {
 				printf("ctl_free_lun: FETD %s port %d "
 				       "returned error %d for targ_disable on "
 				       "target %ju\n", port->port_name,
 				       port->targ_port, retval,
 				       (uintmax_t)lun->target.id);
 			} else
 				port->status &= ~CTL_PORT_STATUS_TARG_ONLINE;
 
 			if ((port->status & CTL_PORT_STATUS_TARG_ONLINE) != 0)
 				continue;
 
 #if 0
 			port->port_offline(port->onoff_arg);
 			port->status &= ~CTL_PORT_STATUS_ONLINE;
 #endif
 		}
 	}
 #endif
 
 	/*
 	 * Tell the backend to free resources, if this LUN has a backend.
 	 */
 	atomic_subtract_int(&lun->be_lun->be->num_luns, 1);
 	lun->be_lun->lun_shutdown(lun->be_lun->be_lun);
 
 	ctl_tpc_lun_shutdown(lun);
 	mtx_destroy(&lun->lun_lock);
 	free(lun->lun_devid, M_CTL);
 	if (lun->flags & CTL_LUN_MALLOCED)
 		free(lun, M_CTL);
 
 	STAILQ_FOREACH(nlun, &softc->lun_list, links) {
 		for (i = 0; i < CTL_MAX_INITIATORS; i++) {
 			nlun->pending_ua[i] |= CTL_UA_LUN_CHANGE;
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Allocate a CTL LUN for the given backend LUN on the global softc's
  * target, letting ctl_alloc_lun() allocate the LUN storage.
  */
 static void
 ctl_create_lun(struct ctl_be_lun *be_lun)
 {
 	struct ctl_softc *softc = control_softc;
 
 	/*
 	 * ctl_alloc_lun() should handle all potential failure cases, so
 	 * its return value is not examined here.
 	 */
 	ctl_alloc_lun(softc, /*ctl_lun*/ NULL, be_lun, softc->target);
 }
 
 /*
  * Queue a backend LUN on the softc's pending LUN queue and wake up
  * whoever sleeps on that queue.  Always returns 0.
  */
 int
 ctl_add_lun(struct ctl_be_lun *be_lun)
 {
 	struct ctl_softc *softc;
 
 	softc = control_softc;
 
 	/* The pending queue is modified under ctl_lock. */
 	mtx_lock(&softc->ctl_lock);
 	STAILQ_INSERT_TAIL(&softc->pending_lun_queue, be_lun, links);
 	mtx_unlock(&softc->ctl_lock);
 
 	wakeup(&softc->pending_lun_queue);
 
 	return (0);
 }
 
 int
 ctl_enable_lun(struct ctl_be_lun *be_lun)
 {
 	struct ctl_softc *ctl_softc;
 	struct ctl_port *port, *nport;
 	struct ctl_lun *lun;
 	int retval;
 
 	ctl_softc = control_softc;
 
 	lun = (struct ctl_lun *)be_lun->ctl_lun;
 
 	mtx_lock(&ctl_softc->ctl_lock);
 	mtx_lock(&lun->lun_lock);
 	if ((lun->flags & CTL_LUN_DISABLED) == 0) {
 		/*
 		 * eh?  Why did we get called if the LUN is already
 		 * enabled?
 		 */
 		mtx_unlock(&lun->lun_lock);
 		mtx_unlock(&ctl_softc->ctl_lock);
 		return (0);
 	}
 	lun->flags &= ~CTL_LUN_DISABLED;
 	mtx_unlock(&lun->lun_lock);
 
 	for (port = STAILQ_FIRST(&ctl_softc->port_list); port != NULL; port = nport) {
 		nport = STAILQ_NEXT(port, links);
 
 		/*
 		 * Drop the lock while we call the FETD's enable routine.
 		 * This can lead to a callback into CTL (at least in the
 		 * case of the internal initiator frontend.
 		 */
 		mtx_unlock(&ctl_softc->ctl_lock);
 		retval = port->lun_enable(port->targ_lun_arg, lun->target,lun->lun);
 		mtx_lock(&ctl_softc->ctl_lock);
 		if (retval != 0) {
 			printf("%s: FETD %s port %d returned error "
 			       "%d for lun_enable on target %ju lun %jd\n",
 			       __func__, port->port_name, port->targ_port, retval,
 			       (uintmax_t)lun->target.id, (intmax_t)lun->lun);
 		}
 #if 0
 		 else {
             /* NOTE:  TODO:  why does lun enable affect port status? */
 			port->status |= CTL_PORT_STATUS_LUN_ONLINE;
 		}
 #endif
 	}
 
 	mtx_unlock(&ctl_softc->ctl_lock);
 
 	return (0);
 }
 
 /*
  * Set CTL_LUN_DISABLED on the LUN behind be_lun and call lun_disable()
  * on every registered port.  Does nothing if the LUN is already disabled.
  * Port errors are logged but not propagated; always returns 0.
  */
 int
 ctl_disable_lun(struct ctl_be_lun *be_lun)
 {
 	struct ctl_softc *ctl_softc;
 	struct ctl_port *port;
 	struct ctl_lun *lun;
 	int retval;
 
 	ctl_softc = control_softc;
 
 	lun = (struct ctl_lun *)be_lun->ctl_lun;
 
 	mtx_lock(&ctl_softc->ctl_lock);
 	mtx_lock(&lun->lun_lock);
 	if (lun->flags & CTL_LUN_DISABLED) {
 		mtx_unlock(&lun->lun_lock);
 		mtx_unlock(&ctl_softc->ctl_lock);
 		return (0);
 	}
 	lun->flags |= CTL_LUN_DISABLED;
 	mtx_unlock(&lun->lun_lock);
 
 	STAILQ_FOREACH(port, &ctl_softc->port_list, links) {
 		mtx_unlock(&ctl_softc->ctl_lock);
 		/*
 		 * Drop the lock before we call the frontend's disable
 		 * routine, to avoid lock order reversals.
 		 *
 		 * XXX KDM what happens if the frontend list changes while
 		 * we're traversing it?  It's unlikely, but should be handled.
 		 */
 		retval = port->lun_disable(port->targ_lun_arg, lun->target,
 					 lun->lun);
 		mtx_lock(&ctl_softc->ctl_lock);
 		if (retval != 0) {
 			/* Report under this function's own name. */
 			printf("%s: FETD %s port %d returned error "
 			       "%d for lun_disable on target %ju lun %jd\n",
 			       __func__, port->port_name, port->targ_port,
 			       retval, (uintmax_t)lun->target.id,
 			       (intmax_t)lun->lun);
 		}
 	}
 
 	mtx_unlock(&ctl_softc->ctl_lock);
 
 	return (0);
 }
 
 /*
  * Clear CTL_LUN_STOPPED on the LUN behind be_lun, under the LUN lock.
  * Always returns 0.
  */
 int
 ctl_start_lun(struct ctl_be_lun *be_lun)
 {
 	struct ctl_lun *lun;
 
 	lun = (struct ctl_lun *)be_lun->ctl_lun;
 
 	mtx_lock(&lun->lun_lock);
 	lun->flags &= ~CTL_LUN_STOPPED;
 	mtx_unlock(&lun->lun_lock);
 
 	return (0);
 }
 
 /*
  * Set CTL_LUN_STOPPED on the LUN behind be_lun, under the LUN lock.
  * Always returns 0.
  */
 int
 ctl_stop_lun(struct ctl_be_lun *be_lun)
 {
 	struct ctl_lun *lun;
 
 	lun = (struct ctl_lun *)be_lun->ctl_lun;
 
 	mtx_lock(&lun->lun_lock);
 	lun->flags |= CTL_LUN_STOPPED;
 	mtx_unlock(&lun->lun_lock);
 
 	return (0);
 }
 
 /*
  * Set CTL_LUN_OFFLINE on the LUN behind be_lun, under the LUN lock.
  * Always returns 0.
  */
 int
 ctl_lun_offline(struct ctl_be_lun *be_lun)
 {
 	struct ctl_lun *lun;
 
 	lun = (struct ctl_lun *)be_lun->ctl_lun;
 
 	mtx_lock(&lun->lun_lock);
 	lun->flags |= CTL_LUN_OFFLINE;
 	mtx_unlock(&lun->lun_lock);
 
 	return (0);
 }
 
 /*
  * Clear CTL_LUN_OFFLINE on the LUN behind be_lun, under the LUN lock.
  * Always returns 0.
  */
 int
 ctl_lun_online(struct ctl_be_lun *be_lun)
 {
 	struct ctl_lun *lun;
 
 	lun = (struct ctl_lun *)be_lun->ctl_lun;
 
 	mtx_lock(&lun->lun_lock);
 	lun->flags &= ~CTL_LUN_OFFLINE;
 	mtx_unlock(&lun->lun_lock);
 
 	return (0);
 }
 
 /*
  * Mark the LUN behind be_lun invalid, and free it right away when it has
  * no I/O on its OOA queue.
  *
  * Returns -1 if the LUN has not been disabled first, 0 otherwise.
  */
 int
 ctl_invalidate_lun(struct ctl_be_lun *be_lun)
 {
 	struct ctl_softc *softc = control_softc;
 	struct ctl_lun *lun = (struct ctl_lun *)be_lun->ctl_lun;
 	int idle;
 
 	mtx_lock(&lun->lun_lock);
 
 	/* The LUN needs to be disabled before it can be marked invalid. */
 	if ((lun->flags & CTL_LUN_DISABLED) == 0) {
 		mtx_unlock(&lun->lun_lock);
 		return (-1);
 	}
 
 	lun->flags |= CTL_LUN_INVALID;
 
 	/*
 	 * Sample the OOA queue while the LUN lock is still held.  If it is
 	 * empty we free the LUN below; if not, the LUN is freed when the
 	 * last I/O completes.
 	 */
 	idle = TAILQ_EMPTY(&lun->ooa_queue);
 	mtx_unlock(&lun->lun_lock);
 
 	if (idle) {
 		/* ctl_free_lun() expects ctl_lock to be held. */
 		mtx_lock(&softc->ctl_lock);
 		ctl_free_lun(lun);
 		mtx_unlock(&softc->ctl_lock);
 	}
 
 	return (0);
 }
 
 /*
  * Set CTL_LUN_INOPERABLE on the LUN behind be_lun, under the LUN lock.
  * Always returns 0.
  */
 int
 ctl_lun_inoperable(struct ctl_be_lun *be_lun)
 {
 	struct ctl_lun *lun;
 
 	lun = (struct ctl_lun *)be_lun->ctl_lun;
 
 	mtx_lock(&lun->lun_lock);
 	lun->flags |= CTL_LUN_INOPERABLE;
 	mtx_unlock(&lun->lun_lock);
 
 	return (0);
 }
 
 /*
  * Clear CTL_LUN_INOPERABLE on the LUN behind be_lun, under the LUN lock.
  * Always returns 0.
  */
 int
 ctl_lun_operable(struct ctl_be_lun *be_lun)
 {
 	struct ctl_lun *lun;
 
 	lun = (struct ctl_lun *)be_lun->ctl_lun;
 
 	mtx_lock(&lun->lun_lock);
 	lun->flags &= ~CTL_LUN_INOPERABLE;
 	mtx_unlock(&lun->lun_lock);
 
 	return (0);
 }
 
 /*
  * Set (lock != 0) or clear (lock == 0) the APS lock on the LUN behind
  * be_lun.
  *
  * The lock state is recorded in the current copy of the LUN's APS mode
  * subpage and in softc->aps_locked_lun.  When ctl_is_single is 0, a
  * CTL_MSG_APS_LOCK message carrying the nexus is also sent over the HA
  * channel.
  *
  * Returns 0 on success, and 1 if the APS subpage cannot be found or the
  * HA message cannot be sent.  Note that in the latter case the local
  * state has already been updated.
  */
 int
 ctl_lun_power_lock(struct ctl_be_lun *be_lun, struct ctl_nexus *nexus,
 		   int lock)
 {
 	struct ctl_softc *softc;
 	struct ctl_lun *lun;
 	struct copan_aps_subpage *current_sp;
 	struct ctl_page_index *page_index;
 	int i;
 
 	softc = control_softc;
 
 	mtx_lock(&softc->ctl_lock);
 
 	lun = (struct ctl_lun *)be_lun->ctl_lun;
 	mtx_lock(&lun->lun_lock);
 
 	/* Locate the APS subpage entry in this LUN's mode page index. */
 	page_index = NULL;
 	for (i = 0; i < CTL_NUM_MODE_PAGES; i++) {
 		if ((lun->mode_pages.index[i].page_code & SMPH_PC_MASK) !=
 		     APS_PAGE_CODE)
 			continue;
 
 		if (lun->mode_pages.index[i].subpage != APS_SUBPAGE_CODE)
 			continue;
 		page_index = &lun->mode_pages.index[i];
 	}
 
 	if (page_index == NULL) {
 		mtx_unlock(&lun->lun_lock);
 		mtx_unlock(&softc->ctl_lock);
 		printf("%s: APS subpage not found for lun %ju!\n", __func__,
 		       (uintmax_t)lun->lun);
 		return (1);
 	}
 #if 0
 	if ((softc->aps_locked_lun != 0)
 	 && (softc->aps_locked_lun != lun->lun)) {
 		printf("%s: attempt to lock LUN %llu when %llu is already "
 		       "locked\n");
 		mtx_unlock(&lun->lun_lock);
 		mtx_unlock(&softc->ctl_lock);
 		return (1);
 	}
 #endif
 
 	current_sp = (struct copan_aps_subpage *)(page_index->page_data +
 		(page_index->page_len * CTL_PAGE_CURRENT));
 
 	if (lock != 0) {
 		current_sp->lock_active = APS_LOCK_ACTIVE;
 		softc->aps_locked_lun = lun->lun;
 	} else {
 		current_sp->lock_active = 0;
 		softc->aps_locked_lun = 0;
 	}
 
 
 	/*
 	 * If we're in HA mode, try to send the lock message to the other
 	 * side.
 	 */
 	if (ctl_is_single == 0) {
 		int isc_retval;
 		union ctl_ha_msg lock_msg;
 
 		/*
 		 * The whole union is transmitted, so clear it first rather
 		 * than sending whatever happened to be on the stack in the
 		 * fields we do not set.
 		 */
 		memset(&lock_msg, 0, sizeof(lock_msg));
 		lock_msg.hdr.nexus = *nexus;
 		lock_msg.hdr.msg_type = CTL_MSG_APS_LOCK;
 		if (lock != 0)
 			lock_msg.aps.lock_flag = 1;
 		else
 			lock_msg.aps.lock_flag = 0;
 		isc_retval = ctl_ha_msg_send(CTL_HA_CHAN_CTL, &lock_msg,
 					 sizeof(lock_msg), 0);
 		if (isc_retval > CTL_HA_STATUS_SUCCESS) {
 			printf("%s: APS (lock=%d) error returned from "
 			       "ctl_ha_msg_send: %d\n", __func__, lock, isc_retval);
 			mtx_unlock(&lun->lun_lock);
 			mtx_unlock(&softc->ctl_lock);
 			return (1);
 		}
 	}
 
 	mtx_unlock(&lun->lun_lock);
 	mtx_unlock(&softc->ctl_lock);
 
 	return (0);
 }
 
 /*
  * Post a CTL_UA_CAPACITY_CHANGED unit attention for every initiator on
  * the LUN behind be_lun, under the LUN lock.
  */
 void
 ctl_lun_capacity_changed(struct ctl_be_lun *be_lun)
 {
 	struct ctl_lun *lun;
 	int i;
 
 	lun = (struct ctl_lun *)be_lun->ctl_lun;
 
 	mtx_lock(&lun->lun_lock);
 
 	for (i = 0; i < CTL_MAX_INITIATORS; i++)
 		lun->pending_ua[i] |= CTL_UA_CAPACITY_CHANGED;
 
 	mtx_unlock(&lun->lun_lock);
 }
 
 /*
  * Backend "memory move is complete" callback for requests that never
  * make it down to say RAIDCore's configuration code.
  *
  * Panics if the I/O is not CTL_IO_SCSI.  Returns CTL_RETVAL_COMPLETE
  * when the I/O is finished here, or whatever ctl_scsiio() returns when
  * the command is handed back for further processing.
  */
 int
 ctl_config_move_done(union ctl_io *io)
 {
 	int aborted;
 
 	CTL_DEBUG_PRINT(("ctl_config_move_done\n"));
 	/*
 	 * XXX KDM this shouldn't happen, but what if it does?
 	 */
 	if (io->io_hdr.io_type != CTL_IO_SCSI)
 		panic("I/O type isn't CTL_IO_SCSI!");
 
 	aborted = (io->io_hdr.flags & CTL_FLAG_ABORT) != 0;
 
 	/*
 	 * Only an I/O that is neither aborted nor already carrying a
 	 * status gets one assigned from the port status.
 	 */
 	if (!aborted
 	 && ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE)) {
 		if (io->io_hdr.port_status == 0) {
 			io->io_hdr.status = CTL_SUCCESS;
 		} else {
 			/*
 			 * For hardware error sense keys, the sense key
 			 * specific value is defined to be a retry count,
 			 * but we use it to pass back an internal FETD
 			 * error code.  XXX KDM  Hopefully the FETD is only
 			 * using 16 bits for an error code, since that's
 			 * all the space we have in the sks field.
 			 */
 			ctl_set_internal_failure(&io->scsiio,
 						 /*sks_valid*/ 1,
 						 /*retry_count*/
 						 io->io_hdr.port_status);
 			if (io->io_hdr.flags & CTL_FLAG_ALLOCATED)
 				free(io->scsiio.kern_data_ptr, M_CTL);
 			ctl_done(io);
 			return (CTL_RETVAL_COMPLETE);
 		}
 	}
 
 	if (((io->io_hdr.flags & CTL_FLAG_DATA_MASK) != CTL_FLAG_DATA_IN)
 	 && ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS)
 	 && !aborted) {
 		/*
 		 * XXX KDM now we need to continue data movement.  Some
 		 * options:
 		 * - call ctl_scsiio() again?  We don't do this for data
 		 *   writes, because for those at least we know ahead of
 		 *   time where the write will go and how long it is.  For
 		 *   config writes, though, that information is largely
 		 *   contained within the write itself, thus we need to
 		 *   parse out the data again.
 		 *
 		 * - Call some other function once the data is in?
 		 *
 		 * XXX KDM call ctl_scsiio() again for now, and check flag
 		 * bits to see whether we're allocated or not.
 		 */
 		return (ctl_scsiio(&io->scsiio));
 	}
 
 	/*
 	 * XXX KDM just assuming a single pointer here, and not a
 	 * S/G list.  If we start using S/G lists for config data,
 	 * we'll need to know how to clean them up here as well.
 	 */
 	if (io->io_hdr.flags & CTL_FLAG_ALLOCATED)
 		free(io->scsiio.kern_data_ptr, M_CTL);
 	/* Hopefully the user has already set the status... */
 	ctl_done(io);
 
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * This gets called by a backend driver when it is done with a
  * data_submit method.
  *
  * The I/O is either handed to its io_cont continuation or completed
  * with ctl_done().
  */
 void
 ctl_data_submit_done(union ctl_io *io)
 {
 	/*
 	 * Without the IO_CONT flag, or once the I/O has been aborted,
 	 * there is nothing to continue: just send status back to the
 	 * initiator.
 	 */
 	if ((io->io_hdr.flags & CTL_FLAG_IO_CONT) == 0 ||
 	    (io->io_hdr.flags & CTL_FLAG_ABORT) != 0) {
 		ctl_done(io);
 		return;
 	}
 
 	/*
 	 * Call the supplied function to continue processing the I/O only
 	 * while it is still error free; any other status completes it.
 	 */
 	switch (io->io_hdr.status & CTL_STATUS_MASK) {
 	case CTL_STATUS_NONE:
 	case CTL_SUCCESS:
 		io->scsiio.io_cont(io);
 		break;
 	default:
 		ctl_done(io);
 		break;
 	}
 }
 
 /*
  * This gets called by a backend driver when it is done with a
  * configuration write.
  *
  * The I/O is either handed to its io_cont continuation or has its
  * data-out buffer freed and is completed with ctl_done().
  */
 void
 ctl_config_write_done(union ctl_io *io)
 {
 	int error_free;
 
 	error_free =
 	    ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE) ||
 	    ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS);
 
 	/*
 	 * If the IO_CONT flag is set, we need to call the supplied
 	 * function to continue processing the I/O, instead of completing
 	 * the I/O just yet.
 	 *
 	 * If there is an error, though, we don't want to keep processing.
 	 * Instead, just send status back to the initiator.
 	 */
 	if ((io->io_hdr.flags & CTL_FLAG_IO_CONT) != 0 && error_free) {
 		io->scsiio.io_cont(io);
 		return;
 	}
 
 	/*
 	 * Since a configuration write can be done for commands that actually
 	 * have data allocated, like write buffer, and commands that have
 	 * no data, like start/stop unit, we need to check here.
 	 */
 	if ((io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_OUT)
 		free(io->scsiio.kern_data_ptr, M_CTL);
 
 	ctl_done(io);
 }
 
 /*
  * SCSI release command.
  */
 int
 ctl_scsi_release(struct ctl_scsiio *ctsio)
 {
 	int length, longid, thirdparty_id, resv_id;
 	struct ctl_softc *ctl_softc;
 	struct ctl_lun *lun;
 
 	length = 0;
 	resv_id = 0;
 
 	CTL_DEBUG_PRINT(("ctl_scsi_release\n"));
 
 	lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr;
 	ctl_softc = control_softc;
 
 	switch (ctsio->cdb[0]) {
 	case RELEASE_10: {
 		struct scsi_release_10 *cdb;
 
 		cdb = (struct scsi_release_10 *)ctsio->cdb;
 
 		if (cdb->byte2 & SR10_LONGID)
 			longid = 1;
 		else
 			thirdparty_id = cdb->thirdparty_id;
 
 		resv_id = cdb->resv_id;
 		length = scsi_2btoul(cdb->length);
 		break;
 	}
 	}
 
 
 	/*
 	 * XXX KDM right now, we only support LUN reservation.  We don't
 	 * support 3rd party reservations, or extent reservations, which
 	 * might actually need the parameter list.  If we've gotten this
 	 * far, we've got a LUN reservation.  Anything else got kicked out
 	 * above.  So, according to SPC, ignore the length.
 	 */
 	length = 0;
 
 	if (((ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) == 0)
 	 && (length > 0)) {
 		ctsio->kern_data_ptr = malloc(length, M_CTL, M_WAITOK);
 		ctsio->kern_data_len = length;
 		ctsio->kern_total_len = length;
 		ctsio->kern_data_resid = 0;
 		ctsio->kern_rel_offset = 0;
 		ctsio->kern_sg_entries = 0;
 		ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 		ctsio->be_move_done = ctl_config_move_done;
 		ctl_datamove((union ctl_io *)ctsio);
 
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	if (length > 0)
 		thirdparty_id = scsi_8btou64(ctsio->kern_data_ptr);
 
 	mtx_lock(&lun->lun_lock);
 
 	/*
 	 * According to SPC, it is not an error for an intiator to attempt
 	 * to release a reservation on a LUN that isn't reserved, or that
 	 * is reserved by another initiator.  The reservation can only be
 	 * released, though, by the initiator who made it or by one of
 	 * several reset type events.
 	 */
 	if (lun->flags & CTL_LUN_RESERVED) {
 		if ((ctsio->io_hdr.nexus.initid.id == lun->rsv_nexus.initid.id)
 		 && (ctsio->io_hdr.nexus.targ_port == lun->rsv_nexus.targ_port)
 		 && (ctsio->io_hdr.nexus.targ_target.id ==
 		     lun->rsv_nexus.targ_target.id)) {
 			lun->flags &= ~CTL_LUN_RESERVED;
 		}
 	}
 
 	mtx_unlock(&lun->lun_lock);
 
 	ctsio->scsi_status = SCSI_STATUS_OK;
 	ctsio->io_hdr.status = CTL_SUCCESS;
 
 	if (ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) {
 		free(ctsio->kern_data_ptr, M_CTL);
 		ctsio->io_hdr.flags &= ~CTL_FLAG_ALLOCATED;
 	}
 
 	ctl_done((union ctl_io *)ctsio);
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * SCSI reserve command.
  *
  * Takes a LUN reservation on behalf of the requesting nexus.  If the LUN
  * is already reserved by a different nexus the command completes with
  * RESERVATION CONFLICT status; otherwise the reservation is recorded and
  * the command completes with good status.  Always returns
  * CTL_RETVAL_COMPLETE.
  */
 int
 ctl_scsi_reserve(struct ctl_scsiio *ctsio)
 {
 	int extent, thirdparty, longid;
 	int resv_id, length;
 	uint64_t thirdparty_id;
 	struct ctl_softc *ctl_softc;
 	struct ctl_lun *lun;
 
 	extent = 0;
 	thirdparty = 0;
 	longid = 0;
 	resv_id = 0;
 	length = 0;
 	thirdparty_id = 0;
 
 	CTL_DEBUG_PRINT(("ctl_reserve\n"));
 
 	lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr;
 	ctl_softc = control_softc;
 
 	/* Decode the CDB; only the 10 byte form carries extra fields. */
 	switch (ctsio->cdb[0]) {
 	case RESERVE_10: {
 		struct scsi_reserve_10 *cdb;
 
 		cdb = (struct scsi_reserve_10 *)ctsio->cdb;
 
 		if (cdb->byte2 & SR10_LONGID)
 			longid = 1;
 		else
 			thirdparty_id = cdb->thirdparty_id;
 
 		resv_id = cdb->resv_id;
 		length = scsi_2btoul(cdb->length);
 		break;
 	}
 	}
 
 	/*
 	 * XXX KDM right now, we only support LUN reservation.  We don't
 	 * support 3rd party reservations, or extent reservations, which
 	 * might actually need the parameter list.  If we've gotten this
 	 * far, we've got a LUN reservation.  Anything else got kicked out
 	 * above.  So, according to SPC, ignore the length.
 	 */
 	length = 0;
 
 	/*
 	 * With length forced to zero above, neither this data-out phase nor
 	 * the parameter list decode that follows is currently reachable.
 	 */
 	if (((ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) == 0)
 	 && (length > 0)) {
 		ctsio->kern_data_ptr = malloc(length, M_CTL, M_WAITOK);
 		ctsio->kern_data_len = length;
 		ctsio->kern_total_len = length;
 		ctsio->kern_data_resid = 0;
 		ctsio->kern_rel_offset = 0;
 		ctsio->kern_sg_entries = 0;
 		ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 		ctsio->be_move_done = ctl_config_move_done;
 		ctl_datamove((union ctl_io *)ctsio);
 
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	if (length > 0)
 		thirdparty_id = scsi_8btou64(ctsio->kern_data_ptr);
 
 	mtx_lock(&lun->lun_lock);
 	/*
 	 * An existing reservation only blocks us if it belongs to a
 	 * different nexus (initiator ID, target port or target ID differs).
 	 */
 	if (lun->flags & CTL_LUN_RESERVED) {
 		if ((ctsio->io_hdr.nexus.initid.id != lun->rsv_nexus.initid.id)
 		 || (ctsio->io_hdr.nexus.targ_port != lun->rsv_nexus.targ_port)
 		 || (ctsio->io_hdr.nexus.targ_target.id !=
 		     lun->rsv_nexus.targ_target.id)) {
 			ctsio->scsi_status = SCSI_STATUS_RESERV_CONFLICT;
 			ctsio->io_hdr.status = CTL_SCSI_ERROR;
 			goto bailout;
 		}
 	}
 
 	/* Record (or refresh) the reservation for the requesting nexus. */
 	lun->flags |= CTL_LUN_RESERVED;
 	lun->rsv_nexus = ctsio->io_hdr.nexus;
 
 	ctsio->scsi_status = SCSI_STATUS_OK;
 	ctsio->io_hdr.status = CTL_SUCCESS;
 
 bailout:
 	mtx_unlock(&lun->lun_lock);
 
 	/* Drop the parameter list buffer, if one was ever allocated. */
 	if (ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) {
 		free(ctsio->kern_data_ptr, M_CTL);
 		ctsio->io_hdr.flags &= ~CTL_FLAG_ALLOCATED;
 	}
 
 	ctl_done((union ctl_io *)ctsio);
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * Handle a START STOP UNIT CDB.
  *
  * Rejects an immediate stop, a stop that conflicts with a persistent
  * reservation, and any request for a LUN without a backend.  Otherwise
  * the request is forwarded to the backend's config_write method: directly
  * in the non-immediate case, or through a copy of the I/O in the
  * immediate case.
  *
  * Returns CTL_RETVAL_COMPLETE on the early rejection paths, otherwise
  * whatever the backend's config_write returned (or the initial value 0
  * when config_write was not called).
  */
 int
 ctl_start_stop(struct ctl_scsiio *ctsio)
 {
 	struct scsi_start_stop_unit *cdb;
 	struct ctl_lun *lun;
 	struct ctl_softc *ctl_softc;
 	int retval;
 
 	CTL_DEBUG_PRINT(("ctl_start_stop\n"));
 
 	lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr;
 	ctl_softc = control_softc;
 	retval = 0;
 
 	cdb = (struct scsi_start_stop_unit *)ctsio->cdb;
 
 	/*
 	 * XXX KDM
 	 * We don't support the immediate bit on a stop unit.  In order to
 	 * do that, we would need to code up a way to know that a stop is
 	 * pending, and hold off any new commands until it completes, one
 	 * way or another.  Then we could accept or reject those commands
 	 * depending on its status.  We would almost need to do the reverse
 	 * of what we do below for an immediate start -- return the copy of
 	 * the ctl_io to the FETD with status to send to the host (and to
 	 * free the copy!) and then free the original I/O once the stop
 	 * actually completes.  That way, the OOA queue mechanism can work
 	 * to block commands that shouldn't proceed.  Another alternative
 	 * would be to put the copy in the queue in place of the original,
 	 * and return the original back to the caller.  That could be
 	 * slightly safer..
 	 */
 	if ((cdb->byte2 & SSS_IMMED)
 	 && ((cdb->how & SSS_START) == 0)) {
 		ctl_set_invalid_field(ctsio,
 				      /*sks_valid*/ 1,
 				      /*command*/ 1,
 				      /*field*/ 1,
 				      /*bit_valid*/ 1,
 				      /*bit*/ 0);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/*
 	 * A stop on a persistently reserved LUN is refused when the
 	 * requester is not registered, or when it is not the reservation
 	 * holder and the reservation type is below 4.
 	 */
 	if ((lun->flags & CTL_LUN_PR_RESERVED)
 	 && ((cdb->how & SSS_START)==0)) {
 		uint32_t residx;
 
 		residx = ctl_get_resindex(&ctsio->io_hdr.nexus);
 		if (!lun->per_res[residx].registered
 		 || (lun->pr_res_idx!=residx && lun->res_type < 4)) {
 
 			ctl_set_reservation_conflict(ctsio);
 			ctl_done((union ctl_io *)ctsio);
 			return (CTL_RETVAL_COMPLETE);
 		}
 	}
 
 	/*
 	 * If there is no backend on this device, we can't start or stop
 	 * it.  In theory we shouldn't get any start/stop commands in the
 	 * first place at this level if the LUN doesn't have a backend.
 	 * That should get stopped by the command decode code.
 	 */
 	if (lun->backend == NULL) {
 		ctl_set_invalid_opcode(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/*
 	 * XXX KDM Copan-specific offline behavior.
 	 * Figure out a reasonable way to port this?
 	 */
 #ifdef NEEDTOPORT
 	mtx_lock(&lun->lun_lock);
 
 	if (((cdb->byte2 & SSS_ONOFFLINE) == 0)
 	 && (lun->flags & CTL_LUN_OFFLINE)) {
 		/*
 		 * If the LUN is offline, and the on/offline bit isn't set,
 		 * reject the start or stop.  Otherwise, let it through.
 		 */
 		mtx_unlock(&lun->lun_lock);
 		ctl_set_lun_not_ready(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 	} else {
 		mtx_unlock(&lun->lun_lock);
 #endif /* NEEDTOPORT */
 		/*
 		 * This could be a start or a stop when we're online,
 		 * or a stop/offline or start/online.  A start or stop when
 		 * we're offline is covered in the case above.
 		 */
 		/*
 		 * In the non-immediate case, we send the request to
 		 * the backend and return status to the user when
 		 * it is done.
 		 *
 		 * In the immediate case, we allocate a new ctl_io
 		 * to hold a copy of the request, and send that to
 		 * the backend.  We then set good status on the
 		 * user's request and return it immediately.
 		 */
 		if (cdb->byte2 & SSS_IMMED) {
 			union ctl_io *new_io;
 
 			new_io = ctl_alloc_io(ctsio->io_hdr.pool);
 			if (new_io == NULL) {
 				/* No I/O available for the copy: report busy. */
 				ctl_set_busy(ctsio);
 				ctl_done((union ctl_io *)ctsio);
 			} else {
 				ctl_copy_io((union ctl_io *)ctsio,
 					    new_io);
 				retval = lun->backend->config_write(new_io);
 				ctl_set_success(ctsio);
 				ctl_done((union ctl_io *)ctsio);
 			}
 		} else {
 			retval = lun->backend->config_write(
 				(union ctl_io *)ctsio);
 		}
 #ifdef NEEDTOPORT
 	}
 #endif
 	return (retval);
 }
 
 /*
  * We support the SYNCHRONIZE CACHE command (10 and 16 byte versions), but
  * we don't really do anything with the LBA and length fields if the user
  * passes them in.  Instead we'll just flush out the cache for the entire
  * LUN.
  *
  * Returns the backend's config_write result when the command is passed
  * down, and 0 on every path that completes the command here.
  */
 int
 ctl_sync_cache(struct ctl_scsiio *ctsio)
 {
 	struct ctl_lun *lun;
 	struct ctl_softc *ctl_softc;
 	uint64_t starting_lba;
 	uint32_t block_count;
 	int retval;
 
 	CTL_DEBUG_PRINT(("ctl_sync_cache\n"));
 
 	lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr;
 	ctl_softc = control_softc;
 	retval = 0;
 
 	switch (ctsio->cdb[0]) {
 	case SYNCHRONIZE_CACHE: {
 		struct scsi_sync_cache *cdb;
 		cdb = (struct scsi_sync_cache *)ctsio->cdb;
 
 		starting_lba = scsi_4btoul(cdb->begin_lba);
 		block_count = scsi_2btoul(cdb->lb_count);
 		break;
 	}
 	case SYNCHRONIZE_CACHE_16: {
 		struct scsi_sync_cache_16 *cdb;
 		cdb = (struct scsi_sync_cache_16 *)ctsio->cdb;
 
 		starting_lba = scsi_8btou64(cdb->begin_lba);
 		block_count = scsi_4btoul(cdb->lb_count);
 		break;
 	}
 	default:
 		ctl_set_invalid_opcode(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 		goto bailout;
 		break; /* NOTREACHED */
 	}
 
 	/*
 	 * We check the LBA and length, but don't do anything with them.
 	 * A SYNCHRONIZE CACHE will cause the entire cache for this lun to
 	 * get flushed.  This check will just help satisfy anyone who wants
 	 * to see an error for an out of range LBA.
 	 *
 	 * The second test catches wrap-around: if lba + count is less than
 	 * the lba, the 64 bit sum overflowed and the range is invalid even
 	 * though the first test alone would let it through.
 	 */
 	if (((starting_lba + block_count) > (lun->be_lun->maxlba + 1))
 	 || ((starting_lba + block_count) < starting_lba)) {
 		ctl_set_lba_out_of_range(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 		goto bailout;
 	}
 
 	/*
 	 * If this LUN has no backend, we can't flush the cache anyway.
 	 */
 	if (lun->backend == NULL) {
 		ctl_set_invalid_opcode(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 		goto bailout;
 	}
 
 	/*
 	 * Check to see whether we're configured to send the SYNCHRONIZE
 	 * CACHE command directly to the back end.  Even then, only every
 	 * sync_interval-th request is passed down; the others complete
 	 * here with good status.
 	 */
 	mtx_lock(&lun->lun_lock);
 	if ((ctl_softc->flags & CTL_FLAG_REAL_SYNC)
 	 && (++(lun->sync_count) >= lun->sync_interval)) {
 		lun->sync_count = 0;
 		mtx_unlock(&lun->lun_lock);
 		retval = lun->backend->config_write((union ctl_io *)ctsio);
 	} else {
 		mtx_unlock(&lun->lun_lock);
 		ctl_set_success(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 	}
 
 bailout:
 
 	return (retval);
 }
 
 int
 ctl_format(struct ctl_scsiio *ctsio)
 {
 	struct scsi_format *cdb;
 	struct ctl_lun *lun;
 	struct ctl_softc *ctl_softc;
 	int length, defect_list_len;
 
 	CTL_DEBUG_PRINT(("ctl_format\n"));
 
 	lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr;
 	ctl_softc = control_softc;
 
 	cdb = (struct scsi_format *)ctsio->cdb;
 
 	length = 0;
 	if (cdb->byte2 & SF_FMTDATA) {
 		if (cdb->byte2 & SF_LONGLIST)
 			length = sizeof(struct scsi_format_header_long);
 		else
 			length = sizeof(struct scsi_format_header_short);
 	}
 
 	if (((ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) == 0)
 	 && (length > 0)) {
 		ctsio->kern_data_ptr = malloc(length, M_CTL, M_WAITOK);
 		ctsio->kern_data_len = length;
 		ctsio->kern_total_len = length;
 		ctsio->kern_data_resid = 0;
 		ctsio->kern_rel_offset = 0;
 		ctsio->kern_sg_entries = 0;
 		ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 		ctsio->be_move_done = ctl_config_move_done;
 		ctl_datamove((union ctl_io *)ctsio);
 
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	defect_list_len = 0;
 
 	if (cdb->byte2 & SF_FMTDATA) {
 		if (cdb->byte2 & SF_LONGLIST) {
 			struct scsi_format_header_long *header;
 
 			header = (struct scsi_format_header_long *)
 				ctsio->kern_data_ptr;
 
 			defect_list_len = scsi_4btoul(header->defect_list_len);
 			if (defect_list_len != 0) {
 				ctl_set_invalid_field(ctsio,
 						      /*sks_valid*/ 1,
 						      /*command*/ 0,
 						      /*field*/ 2,
 						      /*bit_valid*/ 0,
 						      /*bit*/ 0);
 				goto bailout;
 			}
 		} else {
 			struct scsi_format_header_short *header;
 
 			header = (struct scsi_format_header_short *)
 				ctsio->kern_data_ptr;
 
 			defect_list_len = scsi_2btoul(header->defect_list_len);
 			if (defect_list_len != 0) {
 				ctl_set_invalid_field(ctsio,
 						      /*sks_valid*/ 1,
 						      /*command*/ 0,
 						      /*field*/ 2,
 						      /*bit_valid*/ 0,
 						      /*bit*/ 0);
 				goto bailout;
 			}
 		}
 	}
 
 	/*
 	 * The format command will clear out the "Medium format corrupted"
 	 * status if set by the configuration code.  That status is really
 	 * just a way to notify the host that we have lost the media, and
 	 * get them to issue a command that will basically make them think
 	 * they're blowing away the media.
 	 */
 	mtx_lock(&lun->lun_lock);
 	lun->flags &= ~CTL_LUN_INOPERABLE;
 	mtx_unlock(&lun->lun_lock);
 
 	ctsio->scsi_status = SCSI_STATUS_OK;
 	ctsio->io_hdr.status = CTL_SUCCESS;
 bailout:
 
 	if (ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) {
 		free(ctsio->kern_data_ptr, M_CTL);
 		ctsio->io_hdr.flags &= ~CTL_FLAG_ALLOCATED;
 	}
 
 	ctl_done((union ctl_io *)ctsio);
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * READ BUFFER handler.
  *
  * Supports the data, descriptor and echo descriptor modes.  Data mode
  * returns bytes straight out of the LUN's write_buffer; the two
  * descriptor modes return small static descriptors.  Always returns
  * CTL_RETVAL_COMPLETE.
  */
 int
 ctl_read_buffer(struct ctl_scsiio *ctsio)
 {
 	struct scsi_read_buffer *cdb;
 	struct ctl_lun *lun;
 	int buffer_offset, len;
 	int mode;
 	static uint8_t descr[4];
 	static uint8_t echo_descr[4] = { 0 };
 
 	CTL_DEBUG_PRINT(("ctl_read_buffer\n"));
 
 	lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr;
 	cdb = (struct scsi_read_buffer *)ctsio->cdb;
 
 	if (lun->flags & CTL_LUN_PR_RESERVED) {
 		uint32_t residx;
 		int conflict;
 
 		/*
 		 * XXX KDM need a lock here.
 		 */
 		residx = ctl_get_resindex(&ctsio->io_hdr.nexus);
 		conflict = (lun->res_type == SPR_TYPE_EX_AC
 			 && residx != lun->pr_res_idx)
 			|| ((lun->res_type == SPR_TYPE_EX_AC_RO
 			  || lun->res_type == SPR_TYPE_EX_AC_AR)
 			 && !lun->per_res[residx].registered);
 		if (conflict) {
 			ctl_set_reservation_conflict(ctsio);
 			ctl_done((union ctl_io *)ctsio);
 			return (CTL_RETVAL_COMPLETE);
 		}
 	}
 
 	/* Only three buffer modes are implemented. */
 	mode = cdb->byte2 & RWB_MODE;
 	if (mode != RWB_MODE_DATA &&
 	    mode != RWB_MODE_ECHO_DESCR &&
 	    mode != RWB_MODE_DESCR) {
 		ctl_set_invalid_field(ctsio,
 				      /*sks_valid*/ 1,
 				      /*command*/ 1,
 				      /*field*/ 1,
 				      /*bit_valid*/ 1,
 				      /*bit*/ 4);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	buffer_offset = scsi_3btoul(cdb->offset);
 	len = scsi_3btoul(cdb->length);
 
 	/* The requested window has to fit inside the buffer. */
 	if (buffer_offset + len > sizeof(lun->write_buffer)) {
 		ctl_set_invalid_field(ctsio,
 				      /*sks_valid*/ 1,
 				      /*command*/ 1,
 				      /*field*/ 6,
 				      /*bit_valid*/ 0,
 				      /*bit*/ 0);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/* Pick the source of the returned data according to the mode. */
 	if (mode == RWB_MODE_DESCR) {
 		/* Descriptor: a zero byte followed by the buffer capacity. */
 		descr[0] = 0;
 		scsi_ulto3b(sizeof(lun->write_buffer), &descr[1]);
 		ctsio->kern_data_ptr = descr;
 		len = min(len, sizeof(descr));
 	} else if (mode == RWB_MODE_ECHO_DESCR) {
 		ctsio->kern_data_ptr = echo_descr;
 		len = min(len, sizeof(echo_descr));
 	} else {
 		ctsio->kern_data_ptr = lun->write_buffer + buffer_offset;
 	}
 
 	ctsio->kern_total_len = len;
 	ctsio->kern_data_len = len;
 	ctsio->kern_sg_entries = 0;
 	ctsio->kern_rel_offset = 0;
 	ctsio->kern_data_resid = 0;
 	ctsio->be_move_done = ctl_config_move_done;
 	ctl_datamove((union ctl_io *)ctsio);
 
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * WRITE BUFFER handler.
  *
  * Only data mode is accepted.  The data-out phase is aimed directly at
  * the requested window of the LUN's write_buffer, so once the transfer
  * has been started there is nothing left to do but complete the command
  * when this function is entered again.  Always returns
  * CTL_RETVAL_COMPLETE.
  */
 int
 ctl_write_buffer(struct ctl_scsiio *ctsio)
 {
 	struct scsi_write_buffer *cdb;
 	struct ctl_lun *lun;
 	int buffer_offset, len;
 
 	CTL_DEBUG_PRINT(("ctl_write_buffer\n"));
 
 	lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr;
 	cdb = (struct scsi_write_buffer *)ctsio->cdb;
 
 	if ((cdb->byte2 & RWB_MODE) != RWB_MODE_DATA) {
 		ctl_set_invalid_field(ctsio,
 				      /*sks_valid*/ 1,
 				      /*command*/ 1,
 				      /*field*/ 1,
 				      /*bit_valid*/ 1,
 				      /*bit*/ 4);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	len = scsi_3btoul(cdb->length);
 	buffer_offset = scsi_3btoul(cdb->offset);
 
 	/* The requested window has to fit inside the buffer. */
 	if (buffer_offset + len > sizeof(lun->write_buffer)) {
 		ctl_set_invalid_field(ctsio,
 				      /*sks_valid*/ 1,
 				      /*command*/ 1,
 				      /*field*/ 6,
 				      /*bit_valid*/ 0,
 				      /*bit*/ 0);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/*
 	 * If the data-out phase has not been started yet, start it.  No
 	 * memory is malloced here: the transfer lands directly in the LUN's
 	 * write_buffer.  CTL_FLAG_ALLOCATED is set only to mark that the
 	 * transfer has been set up, so kern_data_ptr must not be passed
 	 * to free() for this command.
 	 */
 	if ((ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) == 0) {
 		ctsio->kern_data_ptr = lun->write_buffer + buffer_offset;
 		ctsio->kern_data_len = len;
 		ctsio->kern_total_len = len;
 		ctsio->kern_data_resid = 0;
 		ctsio->kern_rel_offset = 0;
 		ctsio->kern_sg_entries = 0;
 		ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 		ctsio->be_move_done = ctl_config_move_done;
 		ctl_datamove((union ctl_io *)ctsio);
 
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/* The transfer was already set up; just complete the command. */
 	ctl_done((union ctl_io *)ctsio);
 
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * WRITE SAME (10 and 16 byte) handler.
  *
  * Validates the block range, fetches one logical block worth of data
  * from the initiator, then hands the request to the backend's
  * config_write method with the LBA, block count and CDB flags stored in
  * the I/O's private area.  Returns CTL_RETVAL_COMPLETE, or the backend's
  * result once the request has been passed down.
  */
 int
 ctl_write_same(struct ctl_scsiio *ctsio)
 {
 	struct ctl_lun *lun;
 	struct ctl_lba_len_flags *range;
 	uint64_t start_lba;
 	uint32_t nblocks;
 	int blk_len, retval;
 	uint8_t cdb_flags;
 
 	retval = CTL_RETVAL_COMPLETE;
 
 	CTL_DEBUG_PRINT(("ctl_write_same\n"));
 
 	lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr;
 
 	if (ctsio->cdb[0] == WRITE_SAME_10) {
 		struct scsi_write_same_10 *cdb10;
 
 		cdb10 = (struct scsi_write_same_10 *)ctsio->cdb;
 
 		cdb_flags = cdb10->byte2;
 		nblocks = scsi_2btoul(cdb10->length);
 		start_lba = scsi_4btoul(cdb10->addr);
 	} else if (ctsio->cdb[0] == WRITE_SAME_16) {
 		struct scsi_write_same_16 *cdb16;
 
 		cdb16 = (struct scsi_write_same_16 *)ctsio->cdb;
 
 		cdb_flags = cdb16->byte2;
 		nblocks = scsi_4btoul(cdb16->length);
 		start_lba = scsi_8btou64(cdb16->addr);
 	} else {
 		/*
 		 * Unsupported opcode.  Not expected here: the layers
 		 * above us should already have filtered it out.
 		 */
 		ctl_set_invalid_opcode(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/*
 	 * Two range checks: the end of the range must not lie beyond the
 	 * last block, and the sum must not have wrapped around (a sum
 	 * smaller than the starting LBA means it did).
 	 */
 	if (((start_lba + nblocks) > (lun->be_lun->maxlba + 1))
 	 || ((start_lba + nblocks) < start_lba)) {
 		ctl_set_lba_out_of_range(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/* A block count of zero stands for "up to the last logical block". */
 	if (nblocks == 0) {
 		if ((lun->be_lun->maxlba + 1) - start_lba > UINT32_MAX) {
 			ctl_set_invalid_field(ctsio,
 					      /*sks_valid*/ 0,
 					      /*command*/ 1,
 					      /*field*/ 0,
 					      /*bit_valid*/ 0,
 					      /*bit*/ 0);
 			ctl_done((union ctl_io *)ctsio);
 			return (CTL_RETVAL_COMPLETE);
 		}
 		nblocks = (lun->be_lun->maxlba + 1) - start_lba;
 	}
 
 	/* The initiator sends exactly one block of pattern data. */
 	blk_len = lun->be_lun->blocksize;
 
 	/*
 	 * First pass: no buffer yet, so allocate one and start the
 	 * data-out phase.
 	 */
 	if ((ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) == 0) {
 		ctsio->kern_data_ptr = malloc(blk_len, M_CTL, M_WAITOK);
 		ctsio->kern_total_len = blk_len;
 		ctsio->kern_data_len = blk_len;
 		ctsio->kern_sg_entries = 0;
 		ctsio->kern_rel_offset = 0;
 		ctsio->kern_data_resid = 0;
 		ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 		ctsio->be_move_done = ctl_config_move_done;
 		ctl_datamove((union ctl_io *)ctsio);
 
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/* Data is in: describe the request for the backend and submit it. */
 	range = (struct ctl_lba_len_flags *)&ctsio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN];
 	range->flags = cdb_flags;
 	range->len = nblocks;
 	range->lba = start_lba;
 	retval = lun->backend->config_write((union ctl_io *)ctsio);
 
 	return (retval);
 }
 
 /*
  * UNMAP handler.
  *
  * Fetches the parameter list from the initiator, validates its header
  * and every block descriptor, and then passes the request to the
  * backend's config_write method with the descriptor array, its length
  * and the CDB flags stored in the I/O's private area.
  *
  * Returns CTL_RETVAL_COMPLETE when the command is completed (or a data
  * transfer is started) here, otherwise the backend's result.  On the
  * validation failure paths the parameter list buffer is released before
  * the command is completed, so it is not leaked.
  */
 int
 ctl_unmap(struct ctl_scsiio *ctsio)
 {
 	struct ctl_lun *lun;
 	struct scsi_unmap *cdb;
 	struct ctl_ptr_len_flags *ptrlen;
 	struct scsi_unmap_header *hdr;
 	struct scsi_unmap_desc *buf, *end;
 	uint64_t lba;
 	uint32_t num_blocks;
 	int len, retval;
 	uint8_t byte2;
 
 	retval = CTL_RETVAL_COMPLETE;
 
 	CTL_DEBUG_PRINT(("ctl_unmap\n"));
 
 	lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr;
 	cdb = (struct scsi_unmap *)ctsio->cdb;
 
 	len = scsi_2btoul(cdb->length);
 	byte2 = cdb->byte2;
 
 	/*
 	 * If we've got a kernel request that hasn't been malloced yet,
 	 * malloc it and tell the caller the data buffer is here.
 	 */
 	if ((ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) == 0) {
 		ctsio->kern_data_ptr = malloc(len, M_CTL, M_WAITOK);
 		ctsio->kern_data_len = len;
 		ctsio->kern_total_len = len;
 		ctsio->kern_data_resid = 0;
 		ctsio->kern_rel_offset = 0;
 		ctsio->kern_sg_entries = 0;
 		ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 		ctsio->be_move_done = ctl_config_move_done;
 		ctl_datamove((union ctl_io *)ctsio);
 
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/*
 	 * Validate the header against the number of bytes that actually
 	 * arrived: it must be complete, both of its length fields must fit
 	 * in the received data, and the descriptor area must be a whole
 	 * number of descriptors.
 	 */
 	len = ctsio->kern_total_len - ctsio->kern_data_resid;
 	hdr = (struct scsi_unmap_header *)ctsio->kern_data_ptr;
 	if (len < sizeof (*hdr) ||
 	    len < (scsi_2btoul(hdr->length) + sizeof(hdr->length)) ||
 	    len < (scsi_2btoul(hdr->desc_length) + sizeof (*hdr)) ||
 	    scsi_2btoul(hdr->desc_length) % sizeof(*buf) != 0) {
 		ctl_set_invalid_field(ctsio,
 				      /*sks_valid*/ 0,
 				      /*command*/ 0,
 				      /*field*/ 0,
 				      /*bit_valid*/ 0,
 				      /*bit*/ 0);
 		goto done;
 	}
 	len = scsi_2btoul(hdr->desc_length);
 	buf = (struct scsi_unmap_desc *)(hdr + 1);
 	end = buf + len / sizeof(*buf);
 
 	ptrlen = (struct ctl_ptr_len_flags *)&ctsio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN];
 	ptrlen->ptr = (void *)buf;
 	ptrlen->len = len;
 	ptrlen->flags = byte2;
 
 	/*
 	 * Every descriptor has to stay within the LUN; the second test
 	 * catches wrap-around of lba + num_blocks.
 	 */
 	for (; buf < end; buf++) {
 		lba = scsi_8btou64(buf->lba);
 		num_blocks = scsi_4btoul(buf->length);
 		if (((lba + num_blocks) > (lun->be_lun->maxlba + 1))
 		 || ((lba + num_blocks) < lba)) {
 			ctl_set_lba_out_of_range(ctsio);
 			goto done;
 		}
 	}
 
 	retval = lun->backend->config_write((union ctl_io *)ctsio);
 
 	return (retval);
 
 done:
 	/*
 	 * Completing the command here, without going through the backend:
 	 * release the parameter list buffer ourselves first.
 	 */
 	if (ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) {
 		free(ctsio->kern_data_ptr, M_CTL);
 		ctsio->io_hdr.flags &= ~CTL_FLAG_ALLOCATED;
 	}
 	ctl_done((union ctl_io *)ctsio);
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * Note that this function currently doesn't actually do anything inside
  * CTL to enforce things if the DQue bit is turned on.
  *
  * Also note that this function can't be used in the default case, because
  * the DQue bit isn't set in the changeable mask for the control mode page
  * anyway.  This is just here as an example for how to implement a page
  * handler, and a placeholder in case we want to allow the user to turn
  * tagged queueing on and off.
  *
  * The D_SENSE bit handling is functional, however, and will turn
  * descriptor sense on and off for a given LUN.
  *
  * page_ptr points at the control page supplied by the initiator; the
  * current and saved copies are located inside page_index->page_data.
  * If anything changed, every initiator other than the requester gets a
  * mode change unit attention queued.  Always returns 0.
  */
 int
 ctl_control_page_handler(struct ctl_scsiio *ctsio,
 			 struct ctl_page_index *page_index, uint8_t *page_ptr)
 {
 	struct scsi_control_page *current_cp, *saved_cp, *user_cp;
 	struct ctl_lun *lun;
 	struct ctl_softc *softc;
 	int set_ua;
 	uint32_t initidx;
 
 	lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr;
 	initidx = ctl_get_initindex(&ctsio->io_hdr.nexus);
 	set_ua = 0;
 
 	user_cp = (struct scsi_control_page *)page_ptr;
 	/* Copies of the page are page_len bytes apart within page_data. */
 	current_cp = (struct scsi_control_page *)
 		(page_index->page_data + (page_index->page_len *
 		CTL_PAGE_CURRENT));
 	saved_cp = (struct scsi_control_page *)
 		(page_index->page_data + (page_index->page_len *
 		CTL_PAGE_SAVED));
 
 	softc = control_softc;
 
 	mtx_lock(&lun->lun_lock);
 	if (((current_cp->rlec & SCP_DSENSE) == 0)
 	 && ((user_cp->rlec & SCP_DSENSE) != 0)) {
 		/*
 		 * Descriptor sense is currently turned off and the user
 		 * wants to turn it on.
 		 */
 		current_cp->rlec |= SCP_DSENSE;
 		saved_cp->rlec |= SCP_DSENSE;
 		lun->flags |= CTL_LUN_SENSE_DESC;
 		set_ua = 1;
 	} else if (((current_cp->rlec & SCP_DSENSE) != 0)
 		&& ((user_cp->rlec & SCP_DSENSE) == 0)) {
 		/*
 		 * Descriptor sense is currently turned on, and the user
 		 * wants to turn it off.
 		 */
 		current_cp->rlec &= ~SCP_DSENSE;
 		saved_cp->rlec &= ~SCP_DSENSE;
 		lun->flags &= ~CTL_LUN_SENSE_DESC;
 		set_ua = 1;
 	}
 	/*
 	 * DQue handling: only a transition between the tagged and untagged
 	 * settings modifies the current and saved pages.
 	 */
 	if (current_cp->queue_flags & SCP_QUEUE_DQUE) {
 		if (user_cp->queue_flags & SCP_QUEUE_DQUE) {
 #ifdef NEEDTOPORT
 			csevent_log(CSC_CTL | CSC_SHELF_SW |
 				    CTL_UNTAG_TO_UNTAG,
 				    csevent_LogType_Trace,
 				    csevent_Severity_Information,
 				    csevent_AlertLevel_Green,
 				    csevent_FRU_Firmware,
 				    csevent_FRU_Unknown,
 				    "Received untagged to untagged transition");
 #endif /* NEEDTOPORT */
 		} else {
 #ifdef NEEDTOPORT
 			csevent_log(CSC_CTL | CSC_SHELF_SW |
 				    CTL_UNTAG_TO_TAG,
 				    csevent_LogType_ConfigChange,
 				    csevent_Severity_Information,
 				    csevent_AlertLevel_Green,
 				    csevent_FRU_Firmware,
 				    csevent_FRU_Unknown,
 				    "Received untagged to tagged "
 				    "queueing transition");
 #endif /* NEEDTOPORT */
 
 			current_cp->queue_flags &= ~SCP_QUEUE_DQUE;
 			saved_cp->queue_flags &= ~SCP_QUEUE_DQUE;
 			set_ua = 1;
 		}
 	} else {
 		if (user_cp->queue_flags & SCP_QUEUE_DQUE) {
 #ifdef NEEDTOPORT
 			csevent_log(CSC_CTL | CSC_SHELF_SW |
 				    CTL_TAG_TO_UNTAG,
 				    csevent_LogType_ConfigChange,
 				    csevent_Severity_Warning,
 				    csevent_AlertLevel_Yellow,
 				    csevent_FRU_Firmware,
 				    csevent_FRU_Unknown,
 				    "Received tagged queueing to untagged "
 				    "transition");
 #endif /* NEEDTOPORT */
 
 			current_cp->queue_flags |= SCP_QUEUE_DQUE;
 			saved_cp->queue_flags |= SCP_QUEUE_DQUE;
 			set_ua = 1;
 		} else {
 #ifdef NEEDTOPORT
 			csevent_log(CSC_CTL | CSC_SHELF_SW |
 				    CTL_TAG_TO_TAG,
 				    csevent_LogType_Trace,
 				    csevent_Severity_Information,
 				    csevent_AlertLevel_Green,
 				    csevent_FRU_Firmware,
 				    csevent_FRU_Unknown,
 				    "Received tagged queueing to tagged "
 				    "queueing transition");
 #endif /* NEEDTOPORT */
 		}
 	}
 	if (set_ua != 0) {
 		int i;
 		/*
 		 * Let other initiators know that the mode
 		 * parameters for this LUN have changed.
 		 */
 		for (i = 0; i < CTL_MAX_INITIATORS; i++) {
 			/* The requester itself gets no unit attention. */
 			if (i == initidx)
 				continue;
 
 			lun->pending_ua[i] |= CTL_UA_MODE_CHANGE;
 		}
 	}
 	mtx_unlock(&lun->lun_lock);
 
 	return (0);
 }
 
 /*
  * MODE SELECT handler for the caching page.
  *
  * Copies the WCE and RCD bits from the initiator supplied page
  * (page_ptr) into the current and saved copies of the page.  When those
  * bits differ from the current setting, every initiator other than the
  * requester gets a mode change unit attention queued.  Always returns 0.
  */
 int
 ctl_caching_sp_handler(struct ctl_scsiio *ctsio,
 		     struct ctl_page_index *page_index, uint8_t *page_ptr)
 {
 	struct scsi_caching_page *cur_page, *sav_page, *new_page;
 	struct ctl_lun *lun;
 	int changed;
 	uint32_t initidx;
 
 	lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr;
 	initidx = ctl_get_initindex(&ctsio->io_hdr.nexus);
 
 	new_page = (struct scsi_caching_page *)page_ptr;
 	sav_page = (struct scsi_caching_page *)
 		(page_index->page_data + (page_index->page_len *
 		CTL_PAGE_SAVED));
 	cur_page = (struct scsi_caching_page *)
 		(page_index->page_data + (page_index->page_len *
 		CTL_PAGE_CURRENT));
 
 	mtx_lock(&lun->lun_lock);
 
 	/* Did the request change either of the two cache control bits? */
 	changed = ((cur_page->flags1 & (SCP_WCE | SCP_RCD)) !=
 	    (new_page->flags1 & (SCP_WCE | SCP_RCD))) ? 1 : 0;
 
 	/* Install the requested bits in the current, then the saved page. */
 	cur_page->flags1 &= ~(SCP_WCE | SCP_RCD);
 	cur_page->flags1 |= new_page->flags1 & (SCP_WCE | SCP_RCD);
 	sav_page->flags1 &= ~(SCP_WCE | SCP_RCD);
 	sav_page->flags1 |= new_page->flags1 & (SCP_WCE | SCP_RCD);
 
 	if (changed) {
 		int idx;
 
 		/*
 		 * Tell every initiator except the requester that the
 		 * mode parameters of this LUN are different now.
 		 */
 		for (idx = 0; idx < CTL_MAX_INITIATORS; idx++) {
 			if (idx != initidx)
 				lun->pending_ua[idx] |= CTL_UA_MODE_CHANGE;
 		}
 	}
 	mtx_unlock(&lun->lun_lock);
 
 	return (0);
 }
 
 /*
  * MODE SELECT handler for the power subpage.  This is a stub: it ignores
  * all of its arguments, changes nothing and always returns 0.
  */
 int
 ctl_power_sp_handler(struct ctl_scsiio *ctsio,
 		     struct ctl_page_index *page_index, uint8_t *page_ptr)
 {
 	return (0);
 }
 
 /*
  * MODE SENSE handler for the power subpage.
  *
  * pc is the page control value (the SMS_PAGE_CTRL_* constants shifted
  * right by 6) and selects which copy of the page is being reported.
  * Unless NEEDTOPORT is defined nothing is updated.  Always returns 0.
  */
 int
 ctl_power_sp_sense_handler(struct ctl_scsiio *ctsio,
 			   struct ctl_page_index *page_index, int pc)
 {
 	struct copan_power_subpage *page;
 
 	/*
 	 * Copies of the page are page_len bytes apart, so the offset must
 	 * be added to the byte pointer before the cast, not to the
 	 * already cast structure pointer.
 	 */
 	page = (struct copan_power_subpage *)(page_index->page_data +
 		(page_index->page_len * pc));
 
 	switch (pc) {
 	case SMS_PAGE_CTRL_CHANGEABLE >> 6:
 		/*
 		 * We don't update the changeable bits for this page.
 		 */
 		break;
 	case SMS_PAGE_CTRL_CURRENT >> 6:
 	case SMS_PAGE_CTRL_DEFAULT >> 6:
 	case SMS_PAGE_CTRL_SAVED >> 6:
 #ifdef NEEDTOPORT
 		ctl_update_power_subpage(page);
 #endif
 		break;
 	default:
 #ifdef NEEDTOPORT
 		EPRINT(0, "Invalid PC %d!!", pc);
 #endif
 		break;
 	}
 	return (0);
 }
 
 
 /*
  * MODE SELECT handler for the APS subpage.
  *
  * Records the page code, subpage and requested lock state in the I/O's
  * mode page private area, then decides, under the softc lock, whether
  * the lock or unlock request has to go to the backend.
  *
  * Returns CTL_RETVAL_COMPLETE when there is nothing to do, the backend's
  * config_write result when the request was passed down, and
  * CTL_RETVAL_ERROR when the command was already completed here because
  * another LUN holds the lock.
  */
 int
 ctl_aps_sp_handler(struct ctl_scsiio *ctsio,
 		   struct ctl_page_index *page_index, uint8_t *page_ptr)
 {
 	struct copan_aps_subpage *user_sp;
 	struct copan_aps_subpage *current_sp;
 	union ctl_modepage_info *modepage_info;
 	struct ctl_softc *softc;
 	struct ctl_lun *lun;
 	int retval;
 
 	retval = CTL_RETVAL_COMPLETE;
 	current_sp = (struct copan_aps_subpage *)(page_index->page_data +
 		     (page_index->page_len * CTL_PAGE_CURRENT));
 	softc = control_softc;
 	lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr;
 
 	user_sp = (struct copan_aps_subpage *)page_ptr;
 
 	modepage_info = (union ctl_modepage_info *)
 		ctsio->io_hdr.ctl_private[CTL_PRIV_MODEPAGE].bytes;
 
 	modepage_info->header.page_code = page_index->page_code & SMPH_PC_MASK;
 	modepage_info->header.subpage = page_index->subpage;
 	modepage_info->aps.lock_active = user_sp->lock_active;
 
 	/* Held across the backend call and ctl_done() below. */
 	mtx_lock(&softc->ctl_lock);
 
 	/*
 	 * If there is a request to lock the LUN and another LUN is locked
 	 * this is an error. If the requested LUN is already locked ignore
 	 * the request. If no LUN is locked attempt to lock it.
 	 * if there is a request to unlock the LUN and the LUN is currently
 	 * locked attempt to unlock it. Otherwise ignore the request. i.e.
 	 * if another LUN is locked or no LUN is locked.
 	 */
 	if (user_sp->lock_active & APS_LOCK_ACTIVE) {
 		if (softc->aps_locked_lun == lun->lun) {
 			/*
 			 * This LUN is already locked, so we're done.
 			 */
 			retval = CTL_RETVAL_COMPLETE;
 		} else if (softc->aps_locked_lun == 0) {
 			/*
 			 * No one has the lock, pass the request to the
 			 * backend.
 			 */
 			retval = lun->backend->config_write(
 				(union ctl_io *)ctsio);
 		} else {
 			/*
 			 * Someone else has the lock, throw out the request.
 			 */
 			ctl_set_already_locked(ctsio);
 			/* Release the data buffer before completing. */
 			free(ctsio->kern_data_ptr, M_CTL);
 			ctl_done((union ctl_io *)ctsio);
 
 			/*
 			 * Set the return value so that ctl_do_mode_select()
 			 * won't try to complete the command.  We already
 			 * completed it here.
 			 */
 			retval = CTL_RETVAL_ERROR;
 		}
 	} else if (softc->aps_locked_lun == lun->lun) {
 		/*
 		 * This LUN is locked, so pass the unlock request to the
 		 * backend.
 		 */
 		retval = lun->backend->config_write((union ctl_io *)ctsio);
 	}
 	mtx_unlock(&softc->ctl_lock);
 
 	return (retval);
 }
 
 /*
  * MODE SELECT handler for the debug configuration subpage.
  *
  * Loads the global ctl_time_io_secs from the two byte big-endian field
  * of the initiator supplied page, then logs the new value together with
  * the first eight bytes of the page.  Always returns 0.
  */
 int
 ctl_debugconf_sp_select_handler(struct ctl_scsiio *ctsio,
 				struct ctl_page_index *page_index,
 				uint8_t *page_ptr)
 {
 	struct copan_debugconf_subpage *user_page;
 	int idx;
 
 	user_page = (struct copan_debugconf_subpage *)page_ptr;
 
 	/* High byte first, low byte second. */
 	ctl_time_io_secs = (user_page->ctl_time_io_secs[0] << 8) |
 	    user_page->ctl_time_io_secs[1];
 
 	CTL_DEBUG_PRINT(("set ctl_time_io_secs to %d\n", ctl_time_io_secs));
 	printf("set ctl_time_io_secs to %d\n", ctl_time_io_secs);
 
 	/* Dump the leading bytes of the page in hex. */
 	printf("page data:");
 	idx = 0;
 	while (idx < 8) {
 		printf(" %.2x",page_ptr[idx]);
 		idx++;
 	}
 	printf("\n");
 
 	return (0);
 }
 
 /*
  * MODE SENSE handler for the debug configuration subpage.
  *
  * Called with the page control value 'pc' (already shifted down to
  * 0..3) so that the stored copy of the page can be refreshed before it
  * is returned.  Only the current values are updated, from the global
  * ctl_time_io_secs; the changeable, default and saved copies are left
  * untouched.  Always returns 0.
  */
 int
 ctl_debugconf_sp_sense_handler(struct ctl_scsiio *ctsio,
 			       struct ctl_page_index *page_index,
 			       int pc)
 {
 	struct copan_debugconf_subpage *page;
 
 	/*
 	 * page_data is a byte array holding one page_len-sized copy of
 	 * the page per page control value, so the offset must be added
 	 * in bytes before the cast.  Adding it after the cast would scale
 	 * it by the size of the structure.
 	 */
 	page = (struct copan_debugconf_subpage *)(page_index->page_data +
 		(page_index->page_len * pc));
 
 	switch (pc) {
 	case SMS_PAGE_CTRL_CHANGEABLE >> 6:
 	case SMS_PAGE_CTRL_DEFAULT >> 6:
 	case SMS_PAGE_CTRL_SAVED >> 6:
 		/*
 		 * We don't update the changeable or default bits for this
 		 * page.
 		 */
 		break;
 	case SMS_PAGE_CTRL_CURRENT >> 6:
 		/* Store the value most significant byte first. */
 		page->ctl_time_io_secs[0] = ctl_time_io_secs >> 8;
 		page->ctl_time_io_secs[1] = ctl_time_io_secs >> 0;
 		break;
 	default:
 #ifdef NEEDTOPORT
 		EPRINT(0, "Invalid PC %d!!", pc);
 #endif /* NEEDTOPORT */
 		break;
 	}
 	return (0);
 }
 
 
 /*
  * Process the mode pages contained in a MODE SELECT parameter list.
  *
  * The parameter data lives in ctsio->kern_data_ptr.  Progress through it
  * is tracked by the len_left/len_used counters stored in the per-I/O
  * mode page private area, so that this routine can be re-entered (it is
  * installed as the io_cont callback by ctl_mode_select()) after a page
  * handler has queued work.
  *
  * For every page: locate the matching entry in the LUN's mode page
  * index, validate the page length, verify that the initiator only
  * changed bits marked changeable, and then call the page's select
  * handler.  On any validation failure the data buffer is freed, sense
  * data is set and the I/O is completed here.
  *
  * Always returns CTL_RETVAL_COMPLETE.
  */
 static int
 ctl_do_mode_select(union ctl_io *io)
 {
 	struct scsi_mode_page_header *page_header;
 	struct ctl_page_index *page_index;
 	struct ctl_scsiio *ctsio;
 	int control_dev, page_len;
 	int page_len_offset, page_len_size;
 	union ctl_modepage_info *modepage_info;
 	struct ctl_lun *lun;
 	int *len_left, *len_used;
 	int retval, i;
 
 	ctsio = &io->scsiio;
 	page_index = NULL;
 	page_len = 0;
 	retval = CTL_RETVAL_COMPLETE;
 
 	lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr;
 
 	if (lun->be_lun->lun_type != T_DIRECT)
 		control_dev = 1;
 	else
 		control_dev = 0;
 
 	modepage_info = (union ctl_modepage_info *)
 		ctsio->io_hdr.ctl_private[CTL_PRIV_MODEPAGE].bytes;
 	len_left = &modepage_info->header.len_left;
 	len_used = &modepage_info->header.len_used;
 
 do_next_page:
 
 	page_header = (struct scsi_mode_page_header *)
 		(ctsio->kern_data_ptr + *len_used);
 
 	if (*len_left == 0) {
 		/* Nothing left to parse: the whole list was accepted. */
 		free(ctsio->kern_data_ptr, M_CTL);
 		ctl_set_success(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	} else if (*len_left < sizeof(struct scsi_mode_page_header)) {
 
 		/* Not even room for a page header. */
 		free(ctsio->kern_data_ptr, M_CTL);
 		ctl_set_param_len_error(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 
 	} else if ((page_header->page_code & SMPH_SPF)
 		&& (*len_left < sizeof(struct scsi_mode_page_header_sp))) {
 
 		/* Subpage format, but too short for the subpage header. */
 		free(ctsio->kern_data_ptr, M_CTL);
 		ctl_set_param_len_error(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 
 	/*
 	 * XXX KDM should we do something with the block descriptor?
 	 */
 	for (i = 0; i < CTL_NUM_MODE_PAGES; i++) {
 
 		if ((control_dev != 0)
 		 && (lun->mode_pages.index[i].page_flags &
 		     CTL_PAGE_FLAG_DISK_ONLY))
 			continue;
 
 		if ((lun->mode_pages.index[i].page_code & SMPH_PC_MASK) !=
 		    (page_header->page_code & SMPH_PC_MASK))
 			continue;
 
 		/*
 		 * If neither page has a subpage code, then we've got a
 		 * match.
 		 */
 		if (((lun->mode_pages.index[i].page_code & SMPH_SPF) == 0)
 		 && ((page_header->page_code & SMPH_SPF) == 0)) {
 			page_index = &lun->mode_pages.index[i];
 			page_len = page_header->page_length;
 			break;
 		}
 
 		/*
 		 * If both pages have subpages, then the subpage numbers
 		 * have to match.
 		 */
 		if ((lun->mode_pages.index[i].page_code & SMPH_SPF)
 		  && (page_header->page_code & SMPH_SPF)) {
 			struct scsi_mode_page_header_sp *sph;
 
 			sph = (struct scsi_mode_page_header_sp *)page_header;
 
 			if (lun->mode_pages.index[i].subpage ==
 			    sph->subpage) {
 				page_index = &lun->mode_pages.index[i];
 				page_len = scsi_2btoul(sph->page_length);
 				break;
 			}
 		}
 	}
 
 	/*
 	 * If we couldn't find the page, or if we don't have a mode select
 	 * handler for it, send back an error to the user.
 	 */
 	if ((page_index == NULL)
 	 || (page_index->select_handler == NULL)) {
 		ctl_set_invalid_field(ctsio,
 				      /*sks_valid*/ 1,
 				      /*command*/ 0,
 				      /*field*/ *len_used,
 				      /*bit_valid*/ 0,
 				      /*bit*/ 0);
 		free(ctsio->kern_data_ptr, M_CTL);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/*
 	 * Position and width of the page length field within the page
 	 * header; both differ between the plain and subpage formats.
 	 */
 	if (page_index->page_code & SMPH_SPF) {
 		page_len_offset = 2;
 		page_len_size = 2;
 	} else {
 		page_len_size = 1;
 		page_len_offset = 1;
 	}
 
 	/*
 	 * If the length the initiator gives us isn't the one we specify in
 	 * the mode page header, or if they didn't specify enough data in
 	 * the CDB to avoid truncating this page, kick out the request.
 	 */
 	if ((page_len != (page_index->page_len - page_len_offset -
 			  page_len_size))
 	 || (*len_left < page_index->page_len)) {
 
 
 		ctl_set_invalid_field(ctsio,
 				      /*sks_valid*/ 1,
 				      /*command*/ 0,
 				      /*field*/ *len_used + page_len_offset,
 				      /*bit_valid*/ 0,
 				      /*bit*/ 0);
 		free(ctsio->kern_data_ptr, M_CTL);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/*
 	 * Run through the mode page, checking to make sure that the bits
 	 * the user changed are actually legal for him to change.
 	 */
 	for (i = 0; i < page_index->page_len; i++) {
 		uint8_t *user_byte, *change_mask, *current_byte;
 		int bad_bit;
 		int j;
 
 		user_byte = (uint8_t *)page_header + i;
 		change_mask = page_index->page_data +
 			      (page_index->page_len * CTL_PAGE_CHANGEABLE) + i;
 		current_byte = page_index->page_data +
 			       (page_index->page_len * CTL_PAGE_CURRENT) + i;
 
 		/*
 		 * Check to see whether the user set any bits in this byte
 		 * that he is not allowed to set.
 		 */
 		if ((*user_byte & ~(*change_mask)) ==
 		    (*current_byte & ~(*change_mask)))
 			continue;
 
 		/*
 		 * Go through bit by bit to determine which one is illegal.
 		 * The bit position is indexed by j; i is the byte offset
 		 * within the page and must not be used as a shift count.
 		 */
 		bad_bit = 0;
 		for (j = 7; j >= 0; j--) {
 			if ((((1 << j) & ~(*change_mask)) & *user_byte) !=
 			    (((1 << j) & ~(*change_mask)) & *current_byte)) {
 				bad_bit = j;
 				break;
 			}
 		}
 		ctl_set_invalid_field(ctsio,
 				      /*sks_valid*/ 1,
 				      /*command*/ 0,
 				      /*field*/ *len_used + i,
 				      /*bit_valid*/ 1,
 				      /*bit*/ bad_bit);
 		free(ctsio->kern_data_ptr, M_CTL);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/*
 	 * Decrement these before we call the page handler, since we may
 	 * end up getting called back one way or another before the handler
 	 * returns to this context.
 	 */
 	*len_left -= page_index->page_len;
 	*len_used += page_index->page_len;
 
 	retval = page_index->select_handler(ctsio, page_index,
 					    (uint8_t *)page_header);
 
 	/*
 	 * If the page handler returns CTL_RETVAL_QUEUED, then we need to
 	 * wait until this queued command completes to finish processing
 	 * the mode page.  If it returns anything other than
 	 * CTL_RETVAL_COMPLETE (e.g. CTL_RETVAL_ERROR), then it should have
 	 * already set the sense information, freed the data pointer, and
 	 * completed the io for us.
 	 */
 	if (retval != CTL_RETVAL_COMPLETE)
 		goto bailout_no_done;
 
 	/*
 	 * If the initiator sent us more than one page, parse the next one.
 	 */
 	if (*len_left > 0)
 		goto do_next_page;
 
 	ctl_set_success(ctsio);
 	free(ctsio->kern_data_ptr, M_CTL);
 	ctl_done((union ctl_io *)ctsio);
 
 bailout_no_done:
 
 	return (CTL_RETVAL_COMPLETE);
 
 }
 
 int
 ctl_mode_select(struct ctl_scsiio *ctsio)
 {
 	int param_len, pf, sp;
 	int header_size, bd_len;
 	int len_left, len_used;
 	struct ctl_page_index *page_index;
 	struct ctl_lun *lun;
 	int control_dev, page_len;
 	union ctl_modepage_info *modepage_info;
 	int retval;
 
 	pf = 0;
 	sp = 0;
 	page_len = 0;
 	len_used = 0;
 	len_left = 0;
 	retval = 0;
 	bd_len = 0;
 	page_index = NULL;
 
 	lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr;
 
 	if (lun->be_lun->lun_type != T_DIRECT)
 		control_dev = 1;
 	else
 		control_dev = 0;
 
 	switch (ctsio->cdb[0]) {
 	case MODE_SELECT_6: {
 		struct scsi_mode_select_6 *cdb;
 
 		cdb = (struct scsi_mode_select_6 *)ctsio->cdb;
 
 		pf = (cdb->byte2 & SMS_PF) ? 1 : 0;
 		sp = (cdb->byte2 & SMS_SP) ? 1 : 0;
 
 		param_len = cdb->length;
 		header_size = sizeof(struct scsi_mode_header_6);
 		break;
 	}
 	case MODE_SELECT_10: {
 		struct scsi_mode_select_10 *cdb;
 
 		cdb = (struct scsi_mode_select_10 *)ctsio->cdb;
 
 		pf = (cdb->byte2 & SMS_PF) ? 1 : 0;
 		sp = (cdb->byte2 & SMS_SP) ? 1 : 0;
 
 		param_len = scsi_2btoul(cdb->length);
 		header_size = sizeof(struct scsi_mode_header_10);
 		break;
 	}
 	default:
 		ctl_set_invalid_opcode(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 		break; /* NOTREACHED */
 	}
 
 	/*
 	 * From SPC-3:
 	 * "A parameter list length of zero indicates that the Data-Out Buffer
 	 * shall be empty. This condition shall not be considered as an error."
 	 */
 	if (param_len == 0) {
 		ctl_set_success(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/*
 	 * Since we'll hit this the first time through, prior to
 	 * allocation, we don't need to free a data buffer here.
 	 */
 	if (param_len < header_size) {
 		ctl_set_param_len_error(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/*
 	 * Allocate the data buffer and grab the user's data.  In theory,
 	 * we shouldn't have to sanity check the parameter list length here
 	 * because the maximum size is 64K.  We should be able to malloc
 	 * that much without too many problems.
 	 */
 	if ((ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) == 0) {
 		ctsio->kern_data_ptr = malloc(param_len, M_CTL, M_WAITOK);
 		ctsio->kern_data_len = param_len;
 		ctsio->kern_total_len = param_len;
 		ctsio->kern_data_resid = 0;
 		ctsio->kern_rel_offset = 0;
 		ctsio->kern_sg_entries = 0;
 		ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 		ctsio->be_move_done = ctl_config_move_done;
 		ctl_datamove((union ctl_io *)ctsio);
 
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	switch (ctsio->cdb[0]) {
 	case MODE_SELECT_6: {
 		struct scsi_mode_header_6 *mh6;
 
 		mh6 = (struct scsi_mode_header_6 *)ctsio->kern_data_ptr;
 		bd_len = mh6->blk_desc_len;
 		break;
 	}
 	case MODE_SELECT_10: {
 		struct scsi_mode_header_10 *mh10;
 
 		mh10 = (struct scsi_mode_header_10 *)ctsio->kern_data_ptr;
 		bd_len = scsi_2btoul(mh10->blk_desc_len);
 		break;
 	}
 	default:
 		panic("Invalid CDB type %#x", ctsio->cdb[0]);
 		break;
 	}
 
 	if (param_len < (header_size + bd_len)) {
 		free(ctsio->kern_data_ptr, M_CTL);
 		ctl_set_param_len_error(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/*
 	 * Set the IO_CONT flag, so that if this I/O gets passed to
 	 * ctl_config_write_done(), it'll get passed back to
 	 * ctl_do_mode_select() for further processing, or completion if
 	 * we're all done.
 	 */
 	ctsio->io_hdr.flags |= CTL_FLAG_IO_CONT;
 	ctsio->io_cont = ctl_do_mode_select;
 
 	modepage_info = (union ctl_modepage_info *)
 		ctsio->io_hdr.ctl_private[CTL_PRIV_MODEPAGE].bytes;
 
 	memset(modepage_info, 0, sizeof(*modepage_info));
 
 	len_left = param_len - header_size - bd_len;
 	len_used = header_size + bd_len;
 
 	modepage_info->header.len_left = len_left;
 	modepage_info->header.len_used = len_used;
 
 	return (ctl_do_mode_select((union ctl_io *)ctsio));
 }
 
 /*
  * Handle MODE SENSE(6) and MODE SENSE(10).
  *
  * After checking for a persistent reservation conflict, the CDB is
  * decoded and the LUN's mode page index is walked twice: once to total
  * up the size of the pages matching the requested page code/subpage,
  * and once more, after the buffer has been allocated, to copy each
  * matching page (refreshed through its sense handler, if any) behind
  * the mode parameter header and optional block descriptor.  The result
  * is then handed to ctl_datamove().
  *
  * Always returns CTL_RETVAL_COMPLETE; errors are reported by setting
  * sense data and completing the I/O.
  */
 int
 ctl_mode_sense(struct ctl_scsiio *ctsio)
 {
 	struct ctl_lun *lun;
 	int pc, page_code, dbd, llba, subpage;
 	int alloc_len, page_len, header_len, total_len;
 	struct scsi_mode_block_descr *block_desc;
 	struct ctl_page_index *page_index;
 	int control_dev;
 
 	dbd = 0;
 	llba = 0;
 	block_desc = NULL;
 	page_index = NULL;
 
 	CTL_DEBUG_PRINT(("ctl_mode_sense\n"));
 
 	lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr;
 
 	/* control_dev is non-zero for anything that is not a disk. */
 	if (lun->be_lun->lun_type != T_DIRECT)
 		control_dev = 1;
 	else
 		control_dev = 0;
 
 	if (lun->flags & CTL_LUN_PR_RESERVED) {
 		uint32_t residx;
 
 		/*
 		 * XXX KDM need a lock here.
 		 */
 		residx = ctl_get_resindex(&ctsio->io_hdr.nexus);
 		if ((lun->res_type == SPR_TYPE_EX_AC
 		  && residx != lun->pr_res_idx)
 		 || ((lun->res_type == SPR_TYPE_EX_AC_RO
 		   || lun->res_type == SPR_TYPE_EX_AC_AR)
 		  && !lun->per_res[residx].registered)) {
 			ctl_set_reservation_conflict(ctsio);
 			ctl_done((union ctl_io *)ctsio);
 			return (CTL_RETVAL_COMPLETE);
 		}
 	}
 
 	switch (ctsio->cdb[0]) {
 	case MODE_SENSE_6: {
 		struct scsi_mode_sense_6 *cdb;
 
 		cdb = (struct scsi_mode_sense_6 *)ctsio->cdb;
 
 		header_len = sizeof(struct scsi_mode_hdr_6);
 		if (cdb->byte2 & SMS_DBD)
 			dbd = 1;
 		else
 			header_len += sizeof(struct scsi_mode_block_descr);
 
 		pc = (cdb->page & SMS_PAGE_CTRL_MASK) >> 6;
 		page_code = cdb->page & SMS_PAGE_CODE;
 		subpage = cdb->subpage;
 		alloc_len = cdb->length;
 		break;
 	}
 	case MODE_SENSE_10: {
 		struct scsi_mode_sense_10 *cdb;
 
 		cdb = (struct scsi_mode_sense_10 *)ctsio->cdb;
 
 		header_len = sizeof(struct scsi_mode_hdr_10);
 
 		if (cdb->byte2 & SMS_DBD)
 			dbd = 1;
 		else
 			header_len += sizeof(struct scsi_mode_block_descr);
 		if (cdb->byte2 & SMS10_LLBAA)
 			llba = 1;
 		pc = (cdb->page & SMS_PAGE_CTRL_MASK) >> 6;
 		page_code = cdb->page & SMS_PAGE_CODE;
 		subpage = cdb->subpage;
 		alloc_len = scsi_2btoul(cdb->length);
 		break;
 	}
 	default:
 		ctl_set_invalid_opcode(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 		break; /* NOTREACHED */
 	}
 
 	/*
 	 * We have to make a first pass through to calculate the size of
 	 * the pages that match the user's query.  Then we allocate enough
 	 * memory to hold it, and actually copy the data into the buffer.
 	 */
 	switch (page_code) {
 	case SMS_ALL_PAGES_PAGE: {
 		int i;
 
 		page_len = 0;
 
 		/*
 		 * At the moment, values other than 0 and 0xff here are
 		 * reserved according to SPC-3.
 		 */
 		if ((subpage != SMS_SUBPAGE_PAGE_0)
 		 && (subpage != SMS_SUBPAGE_ALL)) {
 			ctl_set_invalid_field(ctsio,
 					      /*sks_valid*/ 1,
 					      /*command*/ 1,
 					      /*field*/ 3,
 					      /*bit_valid*/ 0,
 					      /*bit*/ 0);
 			ctl_done((union ctl_io *)ctsio);
 			return (CTL_RETVAL_COMPLETE);
 		}
 
 		for (i = 0; i < CTL_NUM_MODE_PAGES; i++) {
 			if ((control_dev != 0)
 			 && (lun->mode_pages.index[i].page_flags &
 			     CTL_PAGE_FLAG_DISK_ONLY))
 				continue;
 
 			/*
 			 * We don't use this subpage if the user didn't
 			 * request all subpages.
 			 */
 			if ((lun->mode_pages.index[i].subpage != 0)
 			 && (subpage == SMS_SUBPAGE_PAGE_0))
 				continue;
 
 #if 0
 			printf("found page %#x len %d\n",
 			       lun->mode_pages.index[i].page_code &
 			       SMPH_PC_MASK,
 			       lun->mode_pages.index[i].page_len);
 #endif
 			page_len += lun->mode_pages.index[i].page_len;
 		}
 		break;
 	}
 	default: {
 		int i;
 
 		page_len = 0;
 
 		for (i = 0; i < CTL_NUM_MODE_PAGES; i++) {
 			/* Look for the right page code */
 			if ((lun->mode_pages.index[i].page_code &
 			     SMPH_PC_MASK) != page_code)
 				continue;
 
 			/* Look for the right subpage or the subpage wildcard*/
 			if ((lun->mode_pages.index[i].subpage != subpage)
 			 && (subpage != SMS_SUBPAGE_ALL))
 				continue;
 
 			/* Make sure the page is supported for this dev type */
 			if ((control_dev != 0)
 			 && (lun->mode_pages.index[i].page_flags &
 			     CTL_PAGE_FLAG_DISK_ONLY))
 				continue;
 
 #if 0
 			printf("found page %#x len %d\n",
 			       lun->mode_pages.index[i].page_code &
 			       SMPH_PC_MASK,
 			       lun->mode_pages.index[i].page_len);
 #endif
 
 			page_len += lun->mode_pages.index[i].page_len;
 		}
 
 		/* No page matched the requested page code/subpage. */
 		if (page_len == 0) {
 			ctl_set_invalid_field(ctsio,
 					      /*sks_valid*/ 1,
 					      /*command*/ 1,
 					      /*field*/ 2,
 					      /*bit_valid*/ 1,
 					      /*bit*/ 5);
 			ctl_done((union ctl_io *)ctsio);
 			return (CTL_RETVAL_COMPLETE);
 		}
 		break;
 	}
 	}
 
 	total_len = header_len + page_len;
 #if 0
 	printf("header_len = %d, page_len = %d, total_len = %d\n",
 	       header_len, page_len, total_len);
 #endif
 
 	ctsio->kern_data_ptr = malloc(total_len, M_CTL, M_WAITOK | M_ZERO);
 	ctsio->kern_sg_entries = 0;
 	ctsio->kern_data_resid = 0;
 	ctsio->kern_rel_offset = 0;
 	/* Never transfer more than the initiator's allocation length. */
 	if (total_len < alloc_len) {
 		ctsio->residual = alloc_len - total_len;
 		ctsio->kern_data_len = total_len;
 		ctsio->kern_total_len = total_len;
 	} else {
 		ctsio->residual = 0;
 		ctsio->kern_data_len = alloc_len;
 		ctsio->kern_total_len = alloc_len;
 	}
 
 	switch (ctsio->cdb[0]) {
 	case MODE_SENSE_6: {
 		struct scsi_mode_hdr_6 *header;
 
 		header = (struct scsi_mode_hdr_6 *)ctsio->kern_data_ptr;
 
 		header->datalen = ctl_min(total_len - 1, 254);
 		if (control_dev == 0)
 			header->dev_specific = 0x10; /* DPOFUA */
 		if (dbd)
 			header->block_descr_len = 0;
 		else
 			header->block_descr_len =
 				sizeof(struct scsi_mode_block_descr);
 		block_desc = (struct scsi_mode_block_descr *)&header[1];
 		break;
 	}
 	case MODE_SENSE_10: {
 		struct scsi_mode_hdr_10 *header;
 		int datalen;
 
 		header = (struct scsi_mode_hdr_10 *)ctsio->kern_data_ptr;
 
 		datalen = ctl_min(total_len - 2, 65533);
 		scsi_ulto2b(datalen, header->datalen);
 		if (control_dev == 0)
 			header->dev_specific = 0x10; /* DPOFUA */
 		if (dbd)
 			scsi_ulto2b(0, header->block_descr_len);
 		else
 			scsi_ulto2b(sizeof(struct scsi_mode_block_descr),
 				    header->block_descr_len);
 		block_desc = (struct scsi_mode_block_descr *)&header[1];
 		break;
 	}
 	default:
 		panic("invalid CDB type %#x", ctsio->cdb[0]);
 		break; /* NOTREACHED */
 	}
 
 	/*
 	 * If we've got a disk, use its blocksize in the block
 	 * descriptor.  Otherwise, just set it to 0.  A disk is the case
 	 * where control_dev is zero.
 	 */
 	if (dbd == 0) {
 		if (control_dev == 0)
 			scsi_ulto3b(lun->be_lun->blocksize,
 				    block_desc->block_len);
 		else
 			scsi_ulto3b(0, block_desc->block_len);
 	}
 
 	switch (page_code) {
 	case SMS_ALL_PAGES_PAGE: {
 		int i, data_used;
 
 		data_used = header_len;
 		for (i = 0; i < CTL_NUM_MODE_PAGES; i++) {
 			struct ctl_page_index *page_index;
 
 			page_index = &lun->mode_pages.index[i];
 
 			if ((control_dev != 0)
 			 && (page_index->page_flags &
 			    CTL_PAGE_FLAG_DISK_ONLY))
 				continue;
 
 			/*
 			 * We don't use this subpage if the user didn't
 			 * request all subpages.  We already checked (above)
 			 * to make sure the user only specified a subpage
 			 * of 0 or 0xff in the SMS_ALL_PAGES_PAGE case.
 			 */
 			if ((page_index->subpage != 0)
 			 && (subpage == SMS_SUBPAGE_PAGE_0))
 				continue;
 
 			/*
 			 * Call the handler, if it exists, to update the
 			 * page to the latest values.
 			 */
 			if (page_index->sense_handler != NULL)
 				page_index->sense_handler(ctsio, page_index,pc);
 
 			memcpy(ctsio->kern_data_ptr + data_used,
 			       page_index->page_data +
 			       (page_index->page_len * pc),
 			       page_index->page_len);
 			data_used += page_index->page_len;
 		}
 		break;
 	}
 	default: {
 		int i, data_used;
 
 		data_used = header_len;
 
 		for (i = 0; i < CTL_NUM_MODE_PAGES; i++) {
 			struct ctl_page_index *page_index;
 
 			page_index = &lun->mode_pages.index[i];
 
 			/* Look for the right page code */
 			if ((page_index->page_code & SMPH_PC_MASK) != page_code)
 				continue;
 
 			/* Look for the right subpage or the subpage wildcard*/
 			if ((page_index->subpage != subpage)
 			 && (subpage != SMS_SUBPAGE_ALL))
 				continue;
 
 			/* Make sure the page is supported for this dev type */
 			if ((control_dev != 0)
 			 && (page_index->page_flags &
 			     CTL_PAGE_FLAG_DISK_ONLY))
 				continue;
 
 			/*
 			 * Call the handler, if it exists, to update the
 			 * page to the latest values.
 			 */
 			if (page_index->sense_handler != NULL)
 				page_index->sense_handler(ctsio, page_index,pc);
 
 			memcpy(ctsio->kern_data_ptr + data_used,
 			       page_index->page_data +
 			       (page_index->page_len * pc),
 			       page_index->page_len);
 			data_used += page_index->page_len;
 		}
 		break;
 	}
 	}
 
 	ctsio->scsi_status = SCSI_STATUS_OK;
 
 	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 	ctsio->be_move_done = ctl_config_move_done;
 	ctl_datamove((union ctl_io *)ctsio);
 
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * Handle READ CAPACITY(10).
  *
  * Rejects a non-zero logical block address unless the PMI bit is set,
  * then builds an 8-byte response holding the LUN's last LBA (clamped to
  * 0xffffffff when it does not fit in 32 bits) and its block size, and
  * passes it to ctl_datamove().  Always returns CTL_RETVAL_COMPLETE.
  */
 int
 ctl_read_capacity(struct ctl_scsiio *ctsio)
 {
 	struct scsi_read_capacity_data *cap;
 	struct scsi_read_capacity *rc_cdb;
 	struct ctl_lun *lun;
 	uint32_t req_lba;
 
 	CTL_DEBUG_PRINT(("ctl_read_capacity\n"));
 
 	lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr;
 	rc_cdb = (struct scsi_read_capacity *)ctsio->cdb;
 	req_lba = scsi_4btoul(rc_cdb->addr);
 
 	/* With PMI clear, the LBA field of the CDB has to be zero. */
 	if ((req_lba != 0) && ((rc_cdb->pmi & SRC_PMI) == 0)) {
 		ctl_set_invalid_field(/*ctsio*/ ctsio,
 				      /*sks_valid*/ 1,
 				      /*command*/ 1,
 				      /*field*/ 2,
 				      /*bit_valid*/ 0,
 				      /*bit*/ 0);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/* The response is fixed size; describe it to the data mover. */
 	ctsio->kern_data_ptr = malloc(sizeof(*cap), M_CTL, M_WAITOK | M_ZERO);
 	ctsio->kern_data_len = sizeof(*cap);
 	ctsio->kern_total_len = sizeof(*cap);
 	ctsio->kern_sg_entries = 0;
 	ctsio->kern_rel_offset = 0;
 	ctsio->kern_data_resid = 0;
 	ctsio->residual = 0;
 
 	cap = (struct scsi_read_capacity_data *)ctsio->kern_data_ptr;
 
 	/*
 	 * A last LBA above 0xfffffffe cannot be reported here; return
 	 * 0xffffffff so that the initiator knows to issue a SERVICE
 	 * ACTION IN (16) command with the read capacity service action.
 	 */
 	scsi_ulto4b((lun->be_lun->maxlba > 0xfffffffe) ? 0xffffffff :
 	    lun->be_lun->maxlba, cap->addr);
 
 	/*
 	 * XXX KDM this may not be 512 bytes...
 	 */
 	scsi_ulto4b(lun->be_lun->blocksize, cap->length);
 
 	ctsio->scsi_status = SCSI_STATUS_OK;
 
 	ctsio->be_move_done = ctl_config_move_done;
 	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 	ctl_datamove((union ctl_io *)ctsio);
 
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * Handle READ CAPACITY(16) (a SERVICE ACTION IN(16) service action).
  *
  * Rejects a non-zero logical block address unless the PMI bit is set,
  * matching the check made by ctl_read_capacity().  Otherwise builds the
  * long read capacity response from the backend LUN's last LBA, block
  * size, physical block exponent and offset, and flags logical block
  * provisioning when the LUN has CTL_LUN_FLAG_UNMAP set.  The amount of
  * data moved is limited to the allocation length from the CDB.
  *
  * Always returns CTL_RETVAL_COMPLETE.
  */
 int
 ctl_read_capacity_16(struct ctl_scsiio *ctsio)
 {
 	struct scsi_read_capacity_16 *cdb;
 	struct scsi_read_capacity_data_long *data;
 	struct ctl_lun *lun;
 	uint64_t lba;
 	uint32_t alloc_len;
 
 	CTL_DEBUG_PRINT(("ctl_read_capacity_16\n"));
 
 	cdb = (struct scsi_read_capacity_16 *)ctsio->cdb;
 
 	alloc_len = scsi_4btoul(cdb->alloc_len);
 	lba = scsi_8btou64(cdb->addr);
 
 	/*
 	 * The LBA field must be zero when PMI is zero; a non-zero LBA is
 	 * only meaningful together with the PMI bit.
 	 */
 	if (((cdb->reladr & SRC16_PMI) == 0)
 	 && (lba != 0)) {
 		ctl_set_invalid_field(/*ctsio*/ ctsio,
 				      /*sks_valid*/ 1,
 				      /*command*/ 1,
 				      /*field*/ 2,
 				      /*bit_valid*/ 0,
 				      /*bit*/ 0);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr;
 
 	ctsio->kern_data_ptr = malloc(sizeof(*data), M_CTL, M_WAITOK | M_ZERO);
 	data = (struct scsi_read_capacity_data_long *)ctsio->kern_data_ptr;
 
 	/* Never transfer more than the initiator's allocation length. */
 	if (sizeof(*data) < alloc_len) {
 		ctsio->residual = alloc_len - sizeof(*data);
 		ctsio->kern_data_len = sizeof(*data);
 		ctsio->kern_total_len = sizeof(*data);
 	} else {
 		ctsio->residual = 0;
 		ctsio->kern_data_len = alloc_len;
 		ctsio->kern_total_len = alloc_len;
 	}
 	ctsio->kern_data_resid = 0;
 	ctsio->kern_rel_offset = 0;
 	ctsio->kern_sg_entries = 0;
 
 	scsi_u64to8b(lun->be_lun->maxlba, data->addr);
 	/* XXX KDM this may not be 512 bytes... */
 	scsi_ulto4b(lun->be_lun->blocksize, data->length);
 	data->prot_lbppbe = lun->be_lun->pblockexp & SRC16_LBPPBE;
 	scsi_ulto2b(lun->be_lun->pblockoff & SRC16_LALBA_A, data->lalba_lbp);
 	if (lun->be_lun->flags & CTL_LUN_FLAG_UNMAP)
 		data->lalba_lbp[0] |= SRC16_LBPME | SRC16_LBPRZ;
 
 	ctsio->scsi_status = SCSI_STATUS_OK;
 
 	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 	ctsio->be_move_done = ctl_config_move_done;
 	ctl_datamove((union ctl_io *)ctsio);
 
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * Handle REPORT TARGET PORT GROUPS (a MAINTENANCE IN service action).
  *
  * Builds either the length-only or the extended header, selected by the
  * parameter data format bits of the CDB, followed by one target port
  * group descriptor per group.  Each group lists every online port that
  * maps this LUN.  The group derived from the port the command arrived
  * on is reported as primary/optimized, any other group as
  * non-optimized.
  *
  * The port list is walked twice under ctl_lock: once to size the
  * buffer and once to fill it.  The lock is dropped in between for the
  * allocation, so the second walk is capped at the number of ports that
  * was counted in the first.
  *
  * Always returns CTL_RETVAL_COMPLETE.
  */
 int
 ctl_report_tagret_port_groups(struct ctl_scsiio *ctsio)
 {
 	struct scsi_maintenance_in *cdb;
 	int retval;
 	int alloc_len, ext, total_len = 0, g, p, pc, pg;
 	int num_target_port_groups, num_target_ports, single;
 	struct ctl_lun *lun;
 	struct ctl_softc *softc;
 	struct ctl_port *port;
 	struct scsi_target_group_data *rtg_ptr;
 	struct scsi_target_group_data_extended *rtg_ext_ptr;
 	struct scsi_target_port_group_descriptor *tpg_desc;
 
 	CTL_DEBUG_PRINT(("ctl_report_tagret_port_groups\n"));
 
 	cdb = (struct scsi_maintenance_in *)ctsio->cdb;
 	softc = control_softc;
 	lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr;
 
 	retval = CTL_RETVAL_COMPLETE;
 
 	switch (cdb->byte2 & STG_PDF_MASK) {
 	case STG_PDF_LENGTH:
 		ext = 0;
 		break;
 	case STG_PDF_EXTENDED:
 		ext = 1;
 		break;
 	default:
 		ctl_set_invalid_field(/*ctsio*/ ctsio,
 				      /*sks_valid*/ 1,
 				      /*command*/ 1,
 				      /*field*/ 2,
 				      /*bit_valid*/ 1,
 				      /*bit*/ 5);
 		ctl_done((union ctl_io *)ctsio);
 		return(retval);
 	}
 
 	single = ctl_is_single;
 	if (single)
 		num_target_port_groups = 1;
 	else
 		num_target_port_groups = NUM_TARGET_PORT_GROUPS;
 
 	/* First pass: count the online ports that map this LUN. */
 	num_target_ports = 0;
 	mtx_lock(&softc->ctl_lock);
 	STAILQ_FOREACH(port, &softc->port_list, links) {
 		if ((port->status & CTL_PORT_STATUS_ONLINE) == 0)
 			continue;
 		if (ctl_map_lun_back(port->targ_port, lun->lun) >= CTL_MAX_LUNS)
 			continue;
 		num_target_ports++;
 	}
 	mtx_unlock(&softc->ctl_lock);
 
 	if (ext)
 		total_len = sizeof(struct scsi_target_group_data_extended);
 	else
 		total_len = sizeof(struct scsi_target_group_data);
 	total_len += sizeof(struct scsi_target_port_group_descriptor) *
 		num_target_port_groups +
 	    sizeof(struct scsi_target_port_descriptor) *
 		num_target_ports * num_target_port_groups;
 
 	alloc_len = scsi_4btoul(cdb->length);
 
 	ctsio->kern_data_ptr = malloc(total_len, M_CTL, M_WAITOK | M_ZERO);
 
 	ctsio->kern_sg_entries = 0;
 
 	/* Never transfer more than the initiator's allocation length. */
 	if (total_len < alloc_len) {
 		ctsio->residual = alloc_len - total_len;
 		ctsio->kern_data_len = total_len;
 		ctsio->kern_total_len = total_len;
 	} else {
 		ctsio->residual = 0;
 		ctsio->kern_data_len = alloc_len;
 		ctsio->kern_total_len = alloc_len;
 	}
 	ctsio->kern_data_resid = 0;
 	ctsio->kern_rel_offset = 0;
 
 	/* The length field does not count its own four bytes. */
 	if (ext) {
 		rtg_ext_ptr = (struct scsi_target_group_data_extended *)
 		    ctsio->kern_data_ptr;
 		scsi_ulto4b(total_len - 4, rtg_ext_ptr->length);
 		rtg_ext_ptr->format_type = 0x10;
 		rtg_ext_ptr->implicit_transition_time = 0;
 		tpg_desc = &rtg_ext_ptr->groups[0];
 	} else {
 		rtg_ptr = (struct scsi_target_group_data *)
 		    ctsio->kern_data_ptr;
 		scsi_ulto4b(total_len - 4, rtg_ptr->length);
 		tpg_desc = &rtg_ptr->groups[0];
 	}
 
 	/* Group that the port this command arrived on belongs to. */
 	pg = ctsio->io_hdr.nexus.targ_port / CTL_MAX_PORTS;
 	mtx_lock(&softc->ctl_lock);
 	for (g = 0; g < num_target_port_groups; g++) {
 		if (g == pg)
 			tpg_desc->pref_state = TPG_PRIMARY |
 			    TPG_ASYMMETRIC_ACCESS_OPTIMIZED;
 		else
 			tpg_desc->pref_state =
 			    TPG_ASYMMETRIC_ACCESS_NONOPTIMIZED;
 		tpg_desc->support = TPG_AO_SUP;
 		if (!single)
 			tpg_desc->support |= TPG_AN_SUP;
 		scsi_ulto2b(g + 1, tpg_desc->target_port_group);
 		tpg_desc->status = TPG_IMPLICIT;
 		pc = 0;
 		STAILQ_FOREACH(port, &softc->port_list, links) {
 			/*
 			 * The buffer was sized for num_target_ports ports
 			 * per group while the lock was not held across the
 			 * allocation; never write more descriptors than
 			 * that, even if more ports qualify by now.
 			 */
 			if (pc >= num_target_ports)
 				break;
 			if ((port->status & CTL_PORT_STATUS_ONLINE) == 0)
 				continue;
 			if (ctl_map_lun_back(port->targ_port, lun->lun) >=
 			    CTL_MAX_LUNS)
 				continue;
 			p = port->targ_port % CTL_MAX_PORTS + g * CTL_MAX_PORTS;
 			scsi_ulto2b(p, tpg_desc->descriptors[pc].
 			    relative_target_port_identifier);
 			pc++;
 		}
 		tpg_desc->target_port_count = pc;
 		/* The next group descriptor follows the last port entry. */
 		tpg_desc = (struct scsi_target_port_group_descriptor *)
 		    &tpg_desc->descriptors[pc];
 	}
 	mtx_unlock(&softc->ctl_lock);
 
 	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 	ctsio->be_move_done = ctl_config_move_done;
 
 	CTL_DEBUG_PRINT(("buf = %x %x %x %x %x %x %x %x\n",
 			 ctsio->kern_data_ptr[0], ctsio->kern_data_ptr[1],
 			 ctsio->kern_data_ptr[2], ctsio->kern_data_ptr[3],
 			 ctsio->kern_data_ptr[4], ctsio->kern_data_ptr[5],
 			 ctsio->kern_data_ptr[6], ctsio->kern_data_ptr[7]));
 
 	ctl_datamove((union ctl_io *)ctsio);
 	return(retval);
 }
 
 int
 ctl_report_supported_opcodes(struct ctl_scsiio *ctsio)
 {
 	struct ctl_lun *lun;
 	struct scsi_report_supported_opcodes *cdb;
 	const struct ctl_cmd_entry *entry, *sentry;
 	struct scsi_report_supported_opcodes_all *all;
 	struct scsi_report_supported_opcodes_descr *descr;
 	struct scsi_report_supported_opcodes_one *one;
 	int retval;
 	int alloc_len, total_len;
 	int opcode, service_action, i, j, num;
 
 	CTL_DEBUG_PRINT(("ctl_report_supported_opcodes\n"));
 
 	cdb = (struct scsi_report_supported_opcodes *)ctsio->cdb;
 	lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr;
 
 	retval = CTL_RETVAL_COMPLETE;
 
 	opcode = cdb->requested_opcode;
 	service_action = scsi_2btoul(cdb->requested_service_action);
 	switch (cdb->options & RSO_OPTIONS_MASK) {
 	case RSO_OPTIONS_ALL:
 		num = 0;
 		for (i = 0; i < 256; i++) {
 			entry = &ctl_cmd_table[i];
 			if (entry->flags & CTL_CMD_FLAG_SA5) {
 				for (j = 0; j < 32; j++) {
 					sentry = &((const struct ctl_cmd_entry *)
 					    entry->execute)[j];
 					if (ctl_cmd_applicable(
 					    lun->be_lun->lun_type, sentry))
 						num++;
 				}
 			} else {
 				if (ctl_cmd_applicable(lun->be_lun->lun_type,
 				    entry))
 					num++;
 			}
 		}
 		total_len = sizeof(struct scsi_report_supported_opcodes_all) +
 		    num * sizeof(struct scsi_report_supported_opcodes_descr);
 		break;
 	case RSO_OPTIONS_OC:
 		if (ctl_cmd_table[opcode].flags & CTL_CMD_FLAG_SA5) {
 			ctl_set_invalid_field(/*ctsio*/ ctsio,
 					      /*sks_valid*/ 1,
 					      /*command*/ 1,
 					      /*field*/ 2,
 					      /*bit_valid*/ 1,
 					      /*bit*/ 2);
 			ctl_done((union ctl_io *)ctsio);
 			return (CTL_RETVAL_COMPLETE);
 		}
 		total_len = sizeof(struct scsi_report_supported_opcodes_one) + 32;
 		break;
 	case RSO_OPTIONS_OC_SA:
 		if ((ctl_cmd_table[opcode].flags & CTL_CMD_FLAG_SA5) == 0 ||
 		    service_action >= 32) {
 			ctl_set_invalid_field(/*ctsio*/ ctsio,
 					      /*sks_valid*/ 1,
 					      /*command*/ 1,
 					      /*field*/ 2,
 					      /*bit_valid*/ 1,
 					      /*bit*/ 2);
 			ctl_done((union ctl_io *)ctsio);
 			return (CTL_RETVAL_COMPLETE);
 		}
 		total_len = sizeof(struct scsi_report_supported_opcodes_one) + 32;
 		break;
 	default:
 		ctl_set_invalid_field(/*ctsio*/ ctsio,
 				      /*sks_valid*/ 1,
 				      /*command*/ 1,
 				      /*field*/ 2,
 				      /*bit_valid*/ 1,
 				      /*bit*/ 2);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	alloc_len = scsi_4btoul(cdb->length);
 
 	ctsio->kern_data_ptr = malloc(total_len, M_CTL, M_WAITOK | M_ZERO);
 
 	ctsio->kern_sg_entries = 0;
 
 	if (total_len < alloc_len) {
 		ctsio->residual = alloc_len - total_len;
 		ctsio->kern_data_len = total_len;
 		ctsio->kern_total_len = total_len;
 	} else {
 		ctsio->residual = 0;
 		ctsio->kern_data_len = alloc_len;
 		ctsio->kern_total_len = alloc_len;
 	}
 	ctsio->kern_data_resid = 0;
 	ctsio->kern_rel_offset = 0;
 
 	switch (cdb->options & RSO_OPTIONS_MASK) {
 	case RSO_OPTIONS_ALL:
 		all = (struct scsi_report_supported_opcodes_all *)
 		    ctsio->kern_data_ptr;
 		num = 0;
 		for (i = 0; i < 256; i++) {
 			entry = &ctl_cmd_table[i];
 			if (entry->flags & CTL_CMD_FLAG_SA5) {
 				for (j = 0; j < 32; j++) {
 					sentry = &((const struct ctl_cmd_entry *)
 					    entry->execute)[j];
 					if (!ctl_cmd_applicable(
 					    lun->be_lun->lun_type, sentry))
 						continue;
 					descr = &all->descr[num++];
 					descr->opcode = i;
 					scsi_ulto2b(j, descr->service_action);
 					descr->flags = RSO_SERVACTV;
 					scsi_ulto2b(sentry->length,
 					    descr->cdb_length);
 				}
 			} else {
 				if (!ctl_cmd_applicable(lun->be_lun->lun_type,
 				    entry))
 					continue;
 				descr = &all->descr[num++];
 				descr->opcode = i;
 				scsi_ulto2b(0, descr->service_action);
 				descr->flags = 0;
 				scsi_ulto2b(entry->length, descr->cdb_length);
 			}
 		}
 		scsi_ulto4b(
 		    num * sizeof(struct scsi_report_supported_opcodes_descr),
 		    all->length);
 		break;
 	case RSO_OPTIONS_OC:
 		one = (struct scsi_report_supported_opcodes_one *)
 		    ctsio->kern_data_ptr;
 		entry = &ctl_cmd_table[opcode];
 		goto fill_one;
 	case RSO_OPTIONS_OC_SA:
 		one = (struct scsi_report_supported_opcodes_one *)
 		    ctsio->kern_data_ptr;
 		entry = &ctl_cmd_table[opcode];
 		entry = &((const struct ctl_cmd_entry *)
 		    entry->execute)[service_action];
 fill_one:
 		if (ctl_cmd_applicable(lun->be_lun->lun_type, entry)) {
 			one->support = 3;
 			scsi_ulto2b(entry->length, one->cdb_length);
 			one->cdb_usage[0] = opcode;
 			memcpy(&one->cdb_usage[1], entry->usage,
 			    entry->length - 1);
 		} else
 			one->support = 1;
 		break;
 	}
 
 	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 	ctsio->be_move_done = ctl_config_move_done;
 
 	ctl_datamove((union ctl_io *)ctsio);
 	return(retval);
 }
 
 /*
  * Build the response for the command described by
  * struct scsi_report_supported_tmf: a zero-filled
  * struct scsi_report_supported_tmf_data with the supported-function bits
  * set, handed to ctl_datamove().  The transfer is limited to the smaller
  * of the response size and the allocation length from the CDB.
  *
  * Always returns CTL_RETVAL_COMPLETE.
  */
 int
 ctl_report_supported_tmf(struct ctl_scsiio *ctsio)
 {
 	struct scsi_report_supported_tmf_data *tmf_data;
 	struct scsi_report_supported_tmf *cdb;
 	struct ctl_lun *lun;
 	int resp_len, req_len, xfer_len;
 
 	CTL_DEBUG_PRINT(("ctl_report_supported_tmf\n"));
 
 	lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr;
 	cdb = (struct scsi_report_supported_tmf *)ctsio->cdb;
 
 	/* What we can report versus what the initiator asked for. */
 	resp_len = sizeof(struct scsi_report_supported_tmf_data);
 	req_len = scsi_4btoul(cdb->length);
 	xfer_len = (resp_len < req_len) ? resp_len : req_len;
 
 	ctsio->kern_data_ptr = malloc(resp_len, M_CTL, M_WAITOK | M_ZERO);
 
 	/* Single flat buffer; anything requested beyond it is residual. */
 	ctsio->kern_sg_entries = 0;
 	ctsio->residual = req_len - xfer_len;
 	ctsio->kern_data_len = xfer_len;
 	ctsio->kern_total_len = xfer_len;
 	ctsio->kern_data_resid = 0;
 	ctsio->kern_rel_offset = 0;
 
 	tmf_data =
 	    (struct scsi_report_supported_tmf_data *)ctsio->kern_data_ptr;
 	tmf_data->byte1 |= RST_ATS | RST_ATSS | RST_CTSS | RST_LURS | RST_TRS;
 	tmf_data->byte2 |= RST_ITNRS;
 
 	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 	ctsio->be_move_done = ctl_config_move_done;
 
 	ctl_datamove((union ctl_io *)ctsio);
 	return (CTL_RETVAL_COMPLETE);
 }
 
 int
 ctl_report_timestamp(struct ctl_scsiio *ctsio)
 {
 	struct ctl_lun *lun;
 	struct scsi_report_timestamp *cdb;
 	struct scsi_report_timestamp_data *data;
 	struct timeval tv;
 	int64_t timestamp;
 	int retval;
 	int alloc_len, total_len;
 
 	CTL_DEBUG_PRINT(("ctl_report_timestamp\n"));
 
 	cdb = (struct scsi_report_timestamp *)ctsio->cdb;
 	lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr;
 
 	retval = CTL_RETVAL_COMPLETE;
 
 	total_len = sizeof(struct scsi_report_timestamp_data);
 	alloc_len = scsi_4btoul(cdb->length);
 
 	ctsio->kern_data_ptr = malloc(total_len, M_CTL, M_WAITOK | M_ZERO);
 
 	ctsio->kern_sg_entries = 0;
 
 	if (total_len < alloc_len) {
 		ctsio->residual = alloc_len - total_len;
 		ctsio->kern_data_len = total_len;
 		ctsio->kern_total_len = total_len;
 	} else {
 		ctsio->residual = 0;
 		ctsio->kern_data_len = alloc_len;
 		ctsio->kern_total_len = alloc_len;
 	}
 	ctsio->kern_data_resid = 0;
 	ctsio->kern_rel_offset = 0;
 
 	data = (struct scsi_report_timestamp_data *)ctsio->kern_data_ptr;
 	scsi_ulto2b(sizeof(*data) - 2, data->length);
 	data->origin = RTS_ORIG_OUTSIDE;
 	getmicrotime(&tv);
 	timestamp = (int64_t)tv.tv_sec * 1000 + tv.tv_usec / 1000;
 	scsi_ulto4b(timestamp >> 16, data->timestamp);
 	scsi_ulto2b(timestamp & 0xffff, &data->timestamp[4]);
 
 	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 	ctsio->be_move_done = ctl_config_move_done;
 
 	ctl_datamove((union ctl_io *)ctsio);
 	return (retval);
 }
 
 /*
  * Service a persistent reserve in request (struct scsi_per_res_in CDB).
  *
  * cdb->action selects the response: SPRI_RK (read keys), SPRI_RR (read
  * reservation), SPRI_RC (report capabilities) or SPRI_RS (read full
  * status); any other value panics.  The response size is computed with
  * the LUN lock held, the lock is dropped for the M_WAITOK allocation, and
  * the size is re-validated once the lock has been re-taken.  If the
  * registration/reservation state changed in between, the buffer is freed
  * and the whole sequence is retried.  The amount handed to ctl_datamove()
  * is the smaller of the response size and the CDB allocation length.
  *
  * Always returns CTL_RETVAL_COMPLETE.
  */
 int
 ctl_persistent_reserve_in(struct ctl_scsiio *ctsio)
 {
 	struct scsi_per_res_in *cdb;
 	int alloc_len, total_len = 0;
 	/* struct scsi_per_res_in_rsrv in_data; */
 	struct ctl_lun *lun;
 	struct ctl_softc *softc;
 
 	CTL_DEBUG_PRINT(("ctl_persistent_reserve_in\n"));
 
 	softc = control_softc;
 
 	cdb = (struct scsi_per_res_in *)ctsio->cdb;
 
 	alloc_len = scsi_2btoul(cdb->length);
 
 	lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr;
 
 	/*
 	 * First pass: size the response while holding the LUN lock.  We
 	 * come back here whenever the second pass finds the size stale.
 	 */
 retry:
 	mtx_lock(&lun->lun_lock);
 	switch (cdb->action) {
 	case SPRI_RK: /* read keys */
 		total_len = sizeof(struct scsi_per_res_in_keys) +
 			lun->pr_key_count *
 			sizeof(struct scsi_per_res_key);
 		break;
 	case SPRI_RR: /* read reservation */
 		if (lun->flags & CTL_LUN_PR_RESERVED)
 			total_len = sizeof(struct scsi_per_res_in_rsrv);
 		else
 			total_len = sizeof(struct scsi_per_res_in_header);
 		break;
 	case SPRI_RC: /* report capabilities */
 		total_len = sizeof(struct scsi_per_res_cap);
 		break;
 	case SPRI_RS: /* read full status */
 		/*
 		 * 256 extra bytes are budgeted per descriptor for the
 		 * variable-length transport ID (presumably the upper bound
 		 * of what ctl_create_iid() writes -- confirm).
 		 */
 		total_len = sizeof(struct scsi_per_res_in_header) +
 		    (sizeof(struct scsi_per_res_in_full_desc) + 256) *
 		    lun->pr_key_count;
 		break;
 	default:
 		panic("Invalid PR type %x", cdb->action);
 	}
 	mtx_unlock(&lun->lun_lock);
 
 	/* Allocated unlocked because M_WAITOK may sleep. */
 	ctsio->kern_data_ptr = malloc(total_len, M_CTL, M_WAITOK | M_ZERO);
 
 	/*
 	 * Transfer no more than the initiator asked for; if it asked for
 	 * more than we have, record the shortfall as residual.
 	 */
 	if (total_len < alloc_len) {
 		ctsio->residual = alloc_len - total_len;
 		ctsio->kern_data_len = total_len;
 		ctsio->kern_total_len = total_len;
 	} else {
 		ctsio->residual = 0;
 		ctsio->kern_data_len = alloc_len;
 		ctsio->kern_total_len = alloc_len;
 	}
 
 	ctsio->kern_data_resid = 0;
 	ctsio->kern_rel_offset = 0;
 	ctsio->kern_sg_entries = 0;
 
 	/* Second pass: re-validate the size and fill in the response. */
 	mtx_lock(&lun->lun_lock);
 	switch (cdb->action) {
 	case SPRI_RK: { // read keys
         struct scsi_per_res_in_keys *res_keys;
 		int i, key_count;
 
 		res_keys = (struct scsi_per_res_in_keys*)ctsio->kern_data_ptr;
 
 		/*
 		 * We had to drop the lock to allocate our buffer, which
 		 * leaves time for someone to come in with another
 		 * persistent reservation.  (That is unlikely, though,
 		 * since this should be the only persistent reservation
 		 * command active right now.)
 		 */
 		if (total_len != (sizeof(struct scsi_per_res_in_keys) +
 		    (lun->pr_key_count *
 		     sizeof(struct scsi_per_res_key)))){
 			mtx_unlock(&lun->lun_lock);
 			free(ctsio->kern_data_ptr, M_CTL);
 			printf("%s: reservation length changed, retrying\n",
 			       __func__);
 			goto retry;
 		}
 
 		scsi_ulto4b(lun->PRGeneration, res_keys->header.generation);
 
 		scsi_ulto4b(sizeof(struct scsi_per_res_key) *
 			     lun->pr_key_count, res_keys->header.length);
 
 		/* Copy out the key of every registered nexus. */
 		for (i = 0, key_count = 0; i < 2*CTL_MAX_INITIATORS; i++) {
 			if (!lun->per_res[i].registered)
 				continue;
 
 			/*
 			 * We used lun->pr_key_count to calculate the
 			 * size to allocate.  If it turns out the number of
 			 * initiators with the registered flag set is
 			 * larger than that (i.e. they haven't been kept in
 			 * sync), we've got a problem.
 			 */
 			if (key_count >= lun->pr_key_count) {
 #ifdef NEEDTOPORT
 				csevent_log(CSC_CTL | CSC_SHELF_SW |
 					    CTL_PR_ERROR,
 					    csevent_LogType_Fault,
 					    csevent_AlertLevel_Yellow,
 					    csevent_FRU_ShelfController,
 					    csevent_FRU_Firmware,
 				        csevent_FRU_Unknown,
 					    "registered keys %d >= key "
 					    "count %d", key_count,
 					    lun->pr_key_count);
 #endif
 				/*
 				 * Keep counting but do not copy: the buffer
 				 * only has room for pr_key_count keys.
 				 */
 				key_count++;
 				continue;
 			}
 			memcpy(res_keys->keys[key_count].key,
 			       lun->per_res[i].res_key.key,
 			       ctl_min(sizeof(res_keys->keys[key_count].key),
 			       sizeof(lun->per_res[i].res_key)));
 			key_count++;
 		}
 		break;
 	}
 	case SPRI_RR: { // read reservation
 		struct scsi_per_res_in_rsrv *res;
 		int tmp_len, header_only;
 
 		res = (struct scsi_per_res_in_rsrv *)ctsio->kern_data_ptr;
 
 		scsi_ulto4b(lun->PRGeneration, res->header.generation);
 
 		/*
 		 * Recompute the size from the current reservation state so
 		 * it can be compared with what was allocated.
 		 */
 		if (lun->flags & CTL_LUN_PR_RESERVED)
 		{
 			tmp_len = sizeof(struct scsi_per_res_in_rsrv);
 			scsi_ulto4b(sizeof(struct scsi_per_res_in_rsrv_data),
 				    res->header.length);
 			header_only = 0;
 		} else {
 			tmp_len = sizeof(struct scsi_per_res_in_header);
 			scsi_ulto4b(0, res->header.length);
 			header_only = 1;
 		}
 
 		/*
 		 * We had to drop the lock to allocate our buffer, which
 		 * leaves time for someone to come in with another
 		 * persistent reservation.  (That is unlikely, though,
 		 * since this should be the only persistent reservation
 		 * command active right now.)
 		 */
 		if (tmp_len != total_len) {
 			mtx_unlock(&lun->lun_lock);
 			free(ctsio->kern_data_ptr, M_CTL);
 			printf("%s: reservation status changed, retrying\n",
 			       __func__);
 			goto retry;
 		}
 
 		/*
 		 * No reservation held, so we're done.
 		 */
 		if (header_only != 0)
 			break;
 
 		/*
 		 * If the registration is an All Registrants type, the key
 		 * is 0, since it doesn't really matter.
 		 */
 		if (lun->pr_res_idx != CTL_PR_ALL_REGISTRANTS) {
 			memcpy(res->data.reservation,
 			       &lun->per_res[lun->pr_res_idx].res_key,
 			       sizeof(struct scsi_per_res_key));
 		}
 		res->data.scopetype = lun->res_type;
 		break;
 	}
 	case SPRI_RC:     //report capabilities
 	{
 		struct scsi_per_res_cap *res_cap;
 		uint16_t type_mask;
 
 		/*
 		 * Fixed-size response that does not depend on LUN state, so
 		 * no re-validation is needed here.
 		 */
 		res_cap = (struct scsi_per_res_cap *)ctsio->kern_data_ptr;
 		scsi_ulto2b(sizeof(*res_cap), res_cap->length);
 		res_cap->flags2 |= SPRI_TMV | SPRI_ALLOW_3;
 		type_mask = SPRI_TM_WR_EX_AR |
 			    SPRI_TM_EX_AC_RO |
 			    SPRI_TM_WR_EX_RO |
 			    SPRI_TM_EX_AC |
 			    SPRI_TM_WR_EX |
 			    SPRI_TM_EX_AC_AR;
 		scsi_ulto2b(type_mask, res_cap->type_mask);
 		break;
 	}
 	case SPRI_RS: { // read full status
 		struct scsi_per_res_in_full *res_status;
 		struct scsi_per_res_in_full_desc *res_desc;
 		struct ctl_port *port;
 		int i, len;
 
 		res_status = (struct scsi_per_res_in_full*)ctsio->kern_data_ptr;
 
 		/*
 		 * We had to drop the lock to allocate our buffer, which
 		 * leaves time for someone to come in with another
 		 * persistent reservation.  (That is unlikely, though,
 		 * since this should be the only persistent reservation
 		 * command active right now.)
 		 *
 		 * Unlike the cases above, only a buffer that has become
 		 * too small forces a retry; an oversized one is kept.
 		 */
 		if (total_len < (sizeof(struct scsi_per_res_in_header) +
 		    (sizeof(struct scsi_per_res_in_full_desc) + 256) *
 		     lun->pr_key_count)){
 			mtx_unlock(&lun->lun_lock);
 			free(ctsio->kern_data_ptr, M_CTL);
 			printf("%s: reservation length changed, retrying\n",
 			       __func__);
 			goto retry;
 		}
 
 		scsi_ulto4b(lun->PRGeneration, res_status->header.generation);
 
 		/*
 		 * Emit one variable-length descriptor per registered nexus;
 		 * res_desc always points at the next free position.
 		 */
 		res_desc = &res_status->desc[0];
 		for (i = 0; i < 2*CTL_MAX_INITIATORS; i++) {
 			if (!lun->per_res[i].registered)
 				continue;
 
 			memcpy(&res_desc->res_key, &lun->per_res[i].res_key.key,
 			    sizeof(res_desc->res_key));
 			/* Flag the holder (or everyone, for All Registrants). */
 			if ((lun->flags & CTL_LUN_PR_RESERVED) &&
 			    (lun->pr_res_idx == i ||
 			     lun->pr_res_idx == CTL_PR_ALL_REGISTRANTS)) {
 				res_desc->flags = SPRI_FULL_R_HOLDER;
 				res_desc->scopetype = lun->res_type;
 			}
 			/* The index encodes port (quotient) and initiator (remainder). */
 			scsi_ulto2b(i / CTL_MAX_INIT_PER_PORT,
 			    res_desc->rel_trgt_port_id);
 			len = 0;
 			port = softc->ctl_ports[
 			    ctl_port_idx(i / CTL_MAX_INIT_PER_PORT)];
 			if (port != NULL)
 				len = ctl_create_iid(port,
 				    i % CTL_MAX_INIT_PER_PORT,
 				    res_desc->transport_id);
 			scsi_ulto4b(len, res_desc->additional_length);
 			/* The next descriptor starts right after this transport ID. */
 			res_desc = (struct scsi_per_res_in_full_desc *)
 			    &res_desc->transport_id[len];
 		}
 		/* Header length is the number of descriptor bytes written. */
 		scsi_ulto4b((uint8_t *)res_desc - (uint8_t *)&res_status->desc[0],
 		    res_status->header.length);
 		break;
 	}
 	default:
 		/*
 		 * This is a bug: the sizing switch above already panics on
 		 * an unknown action, so we should never get here.
 		 */
 		panic("Invalid PR type %x", cdb->action);
 		break; /* NOTREACHED */
 	}
 	mtx_unlock(&lun->lun_lock);
 
 	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 	ctsio->be_move_done = ctl_config_move_done;
 
 	CTL_DEBUG_PRINT(("buf = %x %x %x %x %x %x %x %x\n",
 			 ctsio->kern_data_ptr[0], ctsio->kern_data_ptr[1],
 			 ctsio->kern_data_ptr[2], ctsio->kern_data_ptr[3],
 			 ctsio->kern_data_ptr[4], ctsio->kern_data_ptr[5],
 			 ctsio->kern_data_ptr[6], ctsio->kern_data_ptr[7]));
 
 	ctl_datamove((union ctl_io *)ctsio);
 
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * Preempt processing for ctl_persistent_reserve_out().
  *
  * Depending on the service action key (sa_res_key) and the current
  * reservation state of the LUN, this removes the matching registrations,
  * posts unit attentions for the affected nexuses, possibly moves the
  * reservation to the requesting nexus (residx), and notifies the peer
  * controller with a CTL_PR_PREEMPT message.  The LUN lock is taken and
  * released internally.
  *
  * Returns 0 if ctl_persistent_reserve_out() should continue, non-zero if
  * it should return.  On every non-zero return the parameter buffer
  * (ctsio->kern_data_ptr) has been freed, the error status has been set
  * and ctl_done() has already been called, so the caller must not touch
  * the I/O again.
  */
 static int
 ctl_pro_preempt(struct ctl_softc *softc, struct ctl_lun *lun, uint64_t res_key,
 		uint64_t sa_res_key, uint8_t type, uint32_t residx,
 		struct ctl_scsiio *ctsio, struct scsi_per_res_out *cdb,
 		struct scsi_per_res_out_parms* param)
 {
 	union ctl_ha_msg persis_io;
 	int retval, i;
 	int isc_retval;
 
 	retval = 0;
 
 	mtx_lock(&lun->lun_lock);
 	if (sa_res_key == 0) {
 		if (lun->pr_res_idx == CTL_PR_ALL_REGISTRANTS) {
 			/* validate scope and type */
 			if ((cdb->scope_type & SPR_SCOPE_MASK) !=
 			     SPR_LU_SCOPE) {
 				mtx_unlock(&lun->lun_lock);
 				/* Free like every other error return. */
 				free(ctsio->kern_data_ptr, M_CTL);
 				ctl_set_invalid_field(/*ctsio*/ ctsio,
 						      /*sks_valid*/ 1,
 						      /*command*/ 1,
 						      /*field*/ 2,
 						      /*bit_valid*/ 1,
 						      /*bit*/ 4);
 				ctl_done((union ctl_io *)ctsio);
 				return (1);
 			}
 
 			if (type>8 || type==2 || type==4 || type==0) {
 				mtx_unlock(&lun->lun_lock);
 				free(ctsio->kern_data_ptr, M_CTL);
 				ctl_set_invalid_field(/*ctsio*/ ctsio,
 						      /*sks_valid*/ 1,
 						      /*command*/ 1,
 						      /*field*/ 2,
 						      /*bit_valid*/ 1,
 						      /*bit*/ 0);
 				ctl_done((union ctl_io *)ctsio);
 				return (1);
 			}
 
 			/* temporarily unregister this nexus */
 			lun->per_res[residx].registered = 0;
 
 			/*
 			 * Unregister everybody else and build UA for
 			 * them
 			 */
 			for(i=0; i < 2*CTL_MAX_INITIATORS; i++) {
 				if (lun->per_res[i].registered == 0)
 					continue;
 
 				if (!persis_offset
 				 && i <CTL_MAX_INITIATORS)
 					lun->pending_ua[i] |=
 						CTL_UA_REG_PREEMPT;
 				else if (persis_offset
 				      && i >= persis_offset)
 					lun->pending_ua[i-persis_offset] |=
 						CTL_UA_REG_PREEMPT;
 				lun->per_res[i].registered = 0;
 				memset(&lun->per_res[i].res_key, 0,
 				       sizeof(struct scsi_per_res_key));
 			}
 			lun->per_res[residx].registered = 1;
 			lun->pr_key_count = 1;
 			lun->res_type = type;
 			if (lun->res_type != SPR_TYPE_WR_EX_AR
 			 && lun->res_type != SPR_TYPE_EX_AC_AR)
 				lun->pr_res_idx = residx;
 
 			/* send msg to other side */
 			persis_io.hdr.nexus = ctsio->io_hdr.nexus;
 			persis_io.hdr.msg_type = CTL_MSG_PERS_ACTION;
 			persis_io.pr.pr_info.action = CTL_PR_PREEMPT;
 			persis_io.pr.pr_info.residx = lun->pr_res_idx;
 			persis_io.pr.pr_info.res_type = type;
 			memcpy(persis_io.pr.pr_info.sa_res_key,
 			       param->serv_act_res_key,
 			       sizeof(param->serv_act_res_key));
 			if ((isc_retval=ctl_ha_msg_send(CTL_HA_CHAN_CTL,
 			     &persis_io, sizeof(persis_io), 0)) >
 			     CTL_HA_STATUS_SUCCESS) {
 				printf("CTL:Persis Out error returned "
 				       "from ctl_ha_msg_send %d\n",
 				       isc_retval);
 			}
 		} else {
 			/* not all registrants */
 			mtx_unlock(&lun->lun_lock);
 			free(ctsio->kern_data_ptr, M_CTL);
 			ctl_set_invalid_field(ctsio,
 					      /*sks_valid*/ 1,
 					      /*command*/ 0,
 					      /*field*/ 8,
 					      /*bit_valid*/ 0,
 					      /*bit*/ 0);
 			ctl_done((union ctl_io *)ctsio);
 			return (1);
 		}
 	} else if (lun->pr_res_idx == CTL_PR_ALL_REGISTRANTS
 		|| !(lun->flags & CTL_LUN_PR_RESERVED)) {
 		int found = 0;
 
 		if (res_key == sa_res_key) {
 			/* special case */
 			/*
 			 * The spec implies this is not good but doesn't
 			 * say what to do. There are two choices either
 			 * generate a res conflict or check condition
 			 * with illegal field in parameter data. Since
 			 * that is what is done when the sa_res_key is
 			 * zero I'll take that approach since this has
 			 * to do with the sa_res_key.
 			 */
 			mtx_unlock(&lun->lun_lock);
 			free(ctsio->kern_data_ptr, M_CTL);
 			ctl_set_invalid_field(ctsio,
 					      /*sks_valid*/ 1,
 					      /*command*/ 0,
 					      /*field*/ 8,
 					      /*bit_valid*/ 0,
 					      /*bit*/ 0);
 			ctl_done((union ctl_io *)ctsio);
 			return (1);
 		}
 
 		for (i=0; i < 2*CTL_MAX_INITIATORS; i++) {
 			/*
 			 * Only registered nexuses whose key matches the
 			 * service action key are preempted.  Unregistered
 			 * slots must be skipped too, otherwise each of them
 			 * would be counted as a match and pr_key_count
 			 * would be decremented for it.
 			 */
 			if (!lun->per_res[i].registered
 			 || memcmp(param->serv_act_res_key,
 			    lun->per_res[i].res_key.key,
 			    sizeof(struct scsi_per_res_key)) != 0)
 				continue;
 
 			found = 1;
 			lun->per_res[i].registered = 0;
 			memset(&lun->per_res[i].res_key, 0,
 			       sizeof(struct scsi_per_res_key));
 			lun->pr_key_count--;
 
 			if (!persis_offset && i < CTL_MAX_INITIATORS)
 				lun->pending_ua[i] |= CTL_UA_REG_PREEMPT;
 			else if (persis_offset && i >= persis_offset)
 				lun->pending_ua[i-persis_offset] |=
 					CTL_UA_REG_PREEMPT;
 		}
 		if (!found) {
 			mtx_unlock(&lun->lun_lock);
 			free(ctsio->kern_data_ptr, M_CTL);
 			ctl_set_reservation_conflict(ctsio);
 			ctl_done((union ctl_io *)ctsio);
 			/*
 			 * The I/O is finished and its buffer is gone, so
 			 * the caller has to stop: return non-zero.
 			 */
 			return (1);
 		}
 		/* send msg to other side */
 		persis_io.hdr.nexus = ctsio->io_hdr.nexus;
 		persis_io.hdr.msg_type = CTL_MSG_PERS_ACTION;
 		persis_io.pr.pr_info.action = CTL_PR_PREEMPT;
 		persis_io.pr.pr_info.residx = lun->pr_res_idx;
 		persis_io.pr.pr_info.res_type = type;
 		memcpy(persis_io.pr.pr_info.sa_res_key,
 		       param->serv_act_res_key,
 		       sizeof(param->serv_act_res_key));
 		if ((isc_retval=ctl_ha_msg_send(CTL_HA_CHAN_CTL,
 		     &persis_io, sizeof(persis_io), 0)) >
 		     CTL_HA_STATUS_SUCCESS) {
 			printf("CTL:Persis Out error returned from "
 			       "ctl_ha_msg_send %d\n", isc_retval);
 		}
 	} else {
 		/* Reserved but not all registrants */
 		/* sa_res_key is res holder */
 		if (memcmp(param->serv_act_res_key,
 		    lun->per_res[lun->pr_res_idx].res_key.key,
 		    sizeof(struct scsi_per_res_key)) == 0) {
 			/* validate scope and type */
 			if ((cdb->scope_type & SPR_SCOPE_MASK) !=
 			     SPR_LU_SCOPE) {
 				mtx_unlock(&lun->lun_lock);
 				free(ctsio->kern_data_ptr, M_CTL);
 				ctl_set_invalid_field(/*ctsio*/ ctsio,
 						      /*sks_valid*/ 1,
 						      /*command*/ 1,
 						      /*field*/ 2,
 						      /*bit_valid*/ 1,
 						      /*bit*/ 4);
 				ctl_done((union ctl_io *)ctsio);
 				return (1);
 			}
 
 			if (type>8 || type==2 || type==4 || type==0) {
 				mtx_unlock(&lun->lun_lock);
 				free(ctsio->kern_data_ptr, M_CTL);
 				ctl_set_invalid_field(/*ctsio*/ ctsio,
 						      /*sks_valid*/ 1,
 						      /*command*/ 1,
 						      /*field*/ 2,
 						      /*bit_valid*/ 1,
 						      /*bit*/ 0);
 				ctl_done((union ctl_io *)ctsio);
 				return (1);
 			}
 
 			/*
 			 * Do the following:
 			 * if sa_res_key != res_key remove all
 			 * registrants w/sa_res_key and generate UA
 			 * for these registrants(Registrations
 			 * Preempted) if it wasn't an exclusive
 			 * reservation generate UA(Reservations
 			 * Preempted) for all other registered nexuses
 			 * if the type has changed. Establish the new
 			 * reservation and holder. If res_key and
 			 * sa_res_key are the same do the above
 			 * except don't unregister the res holder.
 			 */
 
 			/*
 			 * Temporarily unregister so it won't get
 			 * removed or UA generated
 			 */
 			lun->per_res[residx].registered = 0;
 			for(i=0; i < 2*CTL_MAX_INITIATORS; i++) {
 				if (lun->per_res[i].registered == 0)
 					continue;
 
 				if (memcmp(param->serv_act_res_key,
 				    lun->per_res[i].res_key.key,
 				    sizeof(struct scsi_per_res_key)) == 0) {
 					lun->per_res[i].registered = 0;
 					memset(&lun->per_res[i].res_key,
 					       0,
 					       sizeof(struct scsi_per_res_key));
 					lun->pr_key_count--;
 
 					if (!persis_offset
 					 && i < CTL_MAX_INITIATORS)
 						lun->pending_ua[i] |=
 							CTL_UA_REG_PREEMPT;
 					else if (persis_offset
 					      && i >= persis_offset)
 						lun->pending_ua[i-persis_offset] |=
 						  CTL_UA_REG_PREEMPT;
 				} else if (type != lun->res_type
 					&& (lun->res_type == SPR_TYPE_WR_EX_RO
 					 || lun->res_type ==SPR_TYPE_EX_AC_RO)){
 						if (!persis_offset
 						 && i < CTL_MAX_INITIATORS)
 							lun->pending_ua[i] |=
 							CTL_UA_RES_RELEASE;
 						else if (persis_offset
 						      && i >= persis_offset)
 							lun->pending_ua[
 							i-persis_offset] |=
 							CTL_UA_RES_RELEASE;
 				}
 			}
 			lun->per_res[residx].registered = 1;
 			lun->res_type = type;
 			if (lun->res_type != SPR_TYPE_WR_EX_AR
 			 && lun->res_type != SPR_TYPE_EX_AC_AR)
 				lun->pr_res_idx = residx;
 			else
 				lun->pr_res_idx = CTL_PR_ALL_REGISTRANTS;
 
 			persis_io.hdr.nexus = ctsio->io_hdr.nexus;
 			persis_io.hdr.msg_type = CTL_MSG_PERS_ACTION;
 			persis_io.pr.pr_info.action = CTL_PR_PREEMPT;
 			persis_io.pr.pr_info.residx = lun->pr_res_idx;
 			persis_io.pr.pr_info.res_type = type;
 			memcpy(persis_io.pr.pr_info.sa_res_key,
 			       param->serv_act_res_key,
 			       sizeof(param->serv_act_res_key));
 			if ((isc_retval=ctl_ha_msg_send(CTL_HA_CHAN_CTL,
 			     &persis_io, sizeof(persis_io), 0)) >
 			     CTL_HA_STATUS_SUCCESS) {
 				printf("CTL:Persis Out error returned "
 				       "from ctl_ha_msg_send %d\n",
 				       isc_retval);
 			}
 		} else {
 			/*
 			 * sa_res_key is not the res holder just
 			 * remove registrants
 			 */
 			int found=0;
 
 			/*
 			 * No explicit registered check here: sa_res_key is
 			 * non-zero in this branch and keys are zeroed on
 			 * unregistration in this function, so unregistered
 			 * slots are expected not to match.
 			 */
 			for (i=0; i < 2*CTL_MAX_INITIATORS; i++) {
 				if (memcmp(param->serv_act_res_key,
 				    lun->per_res[i].res_key.key,
 				    sizeof(struct scsi_per_res_key)) != 0)
 					continue;
 
 				found = 1;
 				lun->per_res[i].registered = 0;
 				memset(&lun->per_res[i].res_key, 0,
 				       sizeof(struct scsi_per_res_key));
 				lun->pr_key_count--;
 
 				if (!persis_offset
 				 && i < CTL_MAX_INITIATORS)
 					lun->pending_ua[i] |=
 						CTL_UA_REG_PREEMPT;
 				else if (persis_offset
 				      && i >= persis_offset)
 					lun->pending_ua[i-persis_offset] |=
 						CTL_UA_REG_PREEMPT;
 			}
 
 			if (!found) {
 				mtx_unlock(&lun->lun_lock);
 				free(ctsio->kern_data_ptr, M_CTL);
 				ctl_set_reservation_conflict(ctsio);
 				ctl_done((union ctl_io *)ctsio);
 				return (1);
 			}
 			persis_io.hdr.nexus = ctsio->io_hdr.nexus;
 			persis_io.hdr.msg_type = CTL_MSG_PERS_ACTION;
 			persis_io.pr.pr_info.action = CTL_PR_PREEMPT;
 			persis_io.pr.pr_info.residx = lun->pr_res_idx;
 			persis_io.pr.pr_info.res_type = type;
 			memcpy(persis_io.pr.pr_info.sa_res_key,
 			       param->serv_act_res_key,
 			       sizeof(param->serv_act_res_key));
 			if ((isc_retval=ctl_ha_msg_send(CTL_HA_CHAN_CTL,
 			     &persis_io, sizeof(persis_io), 0)) >
 			     CTL_HA_STATUS_SUCCESS) {
 				printf("CTL:Persis Out error returned "
 				       "from ctl_ha_msg_send %d\n",
 				       isc_retval);
 			}
 		}
 	}
 
 	lun->PRGeneration++;
 	mtx_unlock(&lun->lun_lock);
 
 	return (retval);
 }
 
 /*
  * Apply a preempt described by an HA message (msg->pr.pr_info) to the
  * local copy of the LUN's persistent reservation state: drop the
  * preempted registrations, post unit attentions for the affected nexuses,
  * update the reservation type/holder where applicable and bump
  * PRGeneration.
  *
  * No locking is done here; presumably the caller holds the LUN lock --
  * confirm against the message handler.
  */
 static void
 ctl_pro_preempt_other(struct ctl_lun *lun, union ctl_ha_msg *msg)
 {
 	int i;
 
 	if (lun->pr_res_idx == CTL_PR_ALL_REGISTRANTS
 	 || lun->pr_res_idx == CTL_PR_NO_RESERVATION
 	 || memcmp(&lun->per_res[lun->pr_res_idx].res_key,
 		   msg->pr.pr_info.sa_res_key,
 		   sizeof(struct scsi_per_res_key)) != 0) {
 		uint64_t sa_res_key;
 		sa_res_key = scsi_8btou64(msg->pr.pr_info.sa_res_key);
 
 		if (sa_res_key == 0) {
 			/* temporarily unregister this nexus */
 			lun->per_res[msg->pr.pr_info.residx].registered = 0;
 
 			/*
 			 * Unregister everybody else and build UA for
 			 * them
 			 */
 			for(i=0; i < 2*CTL_MAX_INITIATORS; i++) {
 				if (lun->per_res[i].registered == 0)
 					continue;
 
 				if (!persis_offset
 				 && i < CTL_MAX_INITIATORS)
 					lun->pending_ua[i] |=
 						CTL_UA_REG_PREEMPT;
 				else if (persis_offset && i >= persis_offset)
 					lun->pending_ua[i - persis_offset] |=
 						CTL_UA_REG_PREEMPT;
 				lun->per_res[i].registered = 0;
 				memset(&lun->per_res[i].res_key, 0,
 				       sizeof(struct scsi_per_res_key));
 			}
 
 			lun->per_res[msg->pr.pr_info.residx].registered = 1;
 			lun->pr_key_count = 1;
 			lun->res_type = msg->pr.pr_info.res_type;
 			if (lun->res_type != SPR_TYPE_WR_EX_AR
 			 && lun->res_type != SPR_TYPE_EX_AC_AR)
 				lun->pr_res_idx = msg->pr.pr_info.residx;
 		} else {
 			/* Remove every registration using the preempted key. */
 			for (i=0; i < 2*CTL_MAX_INITIATORS; i++) {
 				if (memcmp(msg->pr.pr_info.sa_res_key,
 				    lun->per_res[i].res_key.key,
 				    sizeof(struct scsi_per_res_key)) != 0)
 					continue;
 
 				lun->per_res[i].registered = 0;
 				memset(&lun->per_res[i].res_key, 0,
 				       sizeof(struct scsi_per_res_key));
 				lun->pr_key_count--;
 
 				/*
 				 * With persis_offset == 0 the bound must be
 				 * CTL_MAX_INITIATORS; comparing against
 				 * persis_offset itself can never be true
 				 * and the UA would never be posted.
 				 */
 				if (!persis_offset
 				 && i < CTL_MAX_INITIATORS)
 					lun->pending_ua[i] |=
 						CTL_UA_REG_PREEMPT;
 				else if (persis_offset
 				      && i >= persis_offset)
 					lun->pending_ua[i - persis_offset] |=
 						CTL_UA_REG_PREEMPT;
 			}
 		}
 	} else {
 		/*
 		 * Temporarily unregister so it won't get removed
 		 * or UA generated
 		 */
 		lun->per_res[msg->pr.pr_info.residx].registered = 0;
 		for (i=0; i < 2*CTL_MAX_INITIATORS; i++) {
 			if (lun->per_res[i].registered == 0)
 				continue;
 
 			if (memcmp(msg->pr.pr_info.sa_res_key,
 			    lun->per_res[i].res_key.key,
 			    sizeof(struct scsi_per_res_key)) == 0) {
 				lun->per_res[i].registered = 0;
 				memset(&lun->per_res[i].res_key, 0,
 				       sizeof(struct scsi_per_res_key));
 				lun->pr_key_count--;
 				if (!persis_offset
 				 && i < CTL_MAX_INITIATORS)
 					lun->pending_ua[i] |=
 						CTL_UA_REG_PREEMPT;
 				else if (persis_offset
 				      && i >= persis_offset)
 					lun->pending_ua[i - persis_offset] |=
 						CTL_UA_REG_PREEMPT;
 			} else if (msg->pr.pr_info.res_type != lun->res_type
 				&& (lun->res_type == SPR_TYPE_WR_EX_RO
 				 || lun->res_type == SPR_TYPE_EX_AC_RO)) {
 				/*
 				 * Same bound as above: use
 				 * CTL_MAX_INITIATORS so the UA is actually
 				 * posted when persis_offset == 0.
 				 */
 				if (!persis_offset
 				 && i < CTL_MAX_INITIATORS)
 					lun->pending_ua[i] |=
 						CTL_UA_RES_RELEASE;
 				else if (persis_offset
 				      && i >= persis_offset)
 					lun->pending_ua[i - persis_offset] |=
 						CTL_UA_RES_RELEASE;
 			}
 		}
 		lun->per_res[msg->pr.pr_info.residx].registered = 1;
 		lun->res_type = msg->pr.pr_info.res_type;
 		if (lun->res_type != SPR_TYPE_WR_EX_AR
 		 && lun->res_type != SPR_TYPE_EX_AC_AR)
 			lun->pr_res_idx = msg->pr.pr_info.residx;
 		else
 			lun->pr_res_idx = CTL_PR_ALL_REGISTRANTS;
 	}
 	lun->PRGeneration++;
 }
 
 
 int
 ctl_persistent_reserve_out(struct ctl_scsiio *ctsio)
 {
 	int retval;
 	int isc_retval;
 	u_int32_t param_len;
 	struct scsi_per_res_out *cdb;
 	struct ctl_lun *lun;
 	struct scsi_per_res_out_parms* param;
 	struct ctl_softc *softc;
 	uint32_t residx;
 	uint64_t res_key, sa_res_key;
 	uint8_t type;
 	union ctl_ha_msg persis_io;
 	int    i;
 
 	CTL_DEBUG_PRINT(("ctl_persistent_reserve_out\n"));
 
 	retval = CTL_RETVAL_COMPLETE;
 
 	softc = control_softc;
 
 	cdb = (struct scsi_per_res_out *)ctsio->cdb;
 	lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr;
 
 	/*
 	 * We only support whole-LUN scope.  The scope & type are ignored for
 	 * register, register and ignore existing key and clear.
 	 * We sometimes ignore scope and type on preempts too!!
 	 * Verify reservation type here as well.
 	 */
 	type = cdb->scope_type & SPR_TYPE_MASK;
 	if ((cdb->action == SPRO_RESERVE)
 	 || (cdb->action == SPRO_RELEASE)) {
 		if ((cdb->scope_type & SPR_SCOPE_MASK) != SPR_LU_SCOPE) {
 			ctl_set_invalid_field(/*ctsio*/ ctsio,
 					      /*sks_valid*/ 1,
 					      /*command*/ 1,
 					      /*field*/ 2,
 					      /*bit_valid*/ 1,
 					      /*bit*/ 4);
 			ctl_done((union ctl_io *)ctsio);
 			return (CTL_RETVAL_COMPLETE);
 		}
 
 		if (type>8 || type==2 || type==4 || type==0) {
 			ctl_set_invalid_field(/*ctsio*/ ctsio,
 					      /*sks_valid*/ 1,
 					      /*command*/ 1,
 					      /*field*/ 2,
 					      /*bit_valid*/ 1,
 					      /*bit*/ 0);
 			ctl_done((union ctl_io *)ctsio);
 			return (CTL_RETVAL_COMPLETE);
 		}
 	}
 
 	param_len = scsi_4btoul(cdb->length);
 
 	if ((ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) == 0) {
 		ctsio->kern_data_ptr = malloc(param_len, M_CTL, M_WAITOK);
 		ctsio->kern_data_len = param_len;
 		ctsio->kern_total_len = param_len;
 		ctsio->kern_data_resid = 0;
 		ctsio->kern_rel_offset = 0;
 		ctsio->kern_sg_entries = 0;
 		ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 		ctsio->be_move_done = ctl_config_move_done;
 		ctl_datamove((union ctl_io *)ctsio);
 
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	param = (struct scsi_per_res_out_parms *)ctsio->kern_data_ptr;
 
 	residx = ctl_get_resindex(&ctsio->io_hdr.nexus);
 	res_key = scsi_8btou64(param->res_key.key);
 	sa_res_key = scsi_8btou64(param->serv_act_res_key);
 
 	/*
 	 * Validate the reservation key here except for SPRO_REG_IGNO
 	 * This must be done for all other service actions
 	 */
 	if ((cdb->action & SPRO_ACTION_MASK) != SPRO_REG_IGNO) {
 		mtx_lock(&lun->lun_lock);
 		if (lun->per_res[residx].registered) {
 		    if (memcmp(param->res_key.key,
 			       lun->per_res[residx].res_key.key,
 			       ctl_min(sizeof(param->res_key),
 			       sizeof(lun->per_res[residx].res_key))) != 0) {
 				/*
 				 * The current key passed in doesn't match
 				 * the one the initiator previously
 				 * registered.
 				 */
 				mtx_unlock(&lun->lun_lock);
 				free(ctsio->kern_data_ptr, M_CTL);
 				ctl_set_reservation_conflict(ctsio);
 				ctl_done((union ctl_io *)ctsio);
 				return (CTL_RETVAL_COMPLETE);
 			}
 		} else if ((cdb->action & SPRO_ACTION_MASK) != SPRO_REGISTER) {
 			/*
 			 * We are not registered
 			 */
 			mtx_unlock(&lun->lun_lock);
 			free(ctsio->kern_data_ptr, M_CTL);
 			ctl_set_reservation_conflict(ctsio);
 			ctl_done((union ctl_io *)ctsio);
 			return (CTL_RETVAL_COMPLETE);
 		} else if (res_key != 0) {
 			/*
 			 * We are not registered and trying to register but
 			 * the register key isn't zero.
 			 */
 			mtx_unlock(&lun->lun_lock);
 			free(ctsio->kern_data_ptr, M_CTL);
 			ctl_set_reservation_conflict(ctsio);
 			ctl_done((union ctl_io *)ctsio);
 			return (CTL_RETVAL_COMPLETE);
 		}
 		mtx_unlock(&lun->lun_lock);
 	}
 
 	switch (cdb->action & SPRO_ACTION_MASK) {
 	case SPRO_REGISTER:
 	case SPRO_REG_IGNO: {
 
 #if 0
 		printf("Registration received\n");
 #endif
 
 		/*
 		 * We don't support any of these options, as we report in
 		 * the read capabilities request (see
 		 * ctl_persistent_reserve_in(), above).
 		 */
 		if ((param->flags & SPR_SPEC_I_PT)
 		 || (param->flags & SPR_ALL_TG_PT)
 		 || (param->flags & SPR_APTPL)) {
 			int bit_ptr;
 
 			if (param->flags & SPR_APTPL)
 				bit_ptr = 0;
 			else if (param->flags & SPR_ALL_TG_PT)
 				bit_ptr = 2;
 			else /* SPR_SPEC_I_PT */
 				bit_ptr = 3;
 
 			free(ctsio->kern_data_ptr, M_CTL);
 			ctl_set_invalid_field(ctsio,
 					      /*sks_valid*/ 1,
 					      /*command*/ 0,
 					      /*field*/ 20,
 					      /*bit_valid*/ 1,
 					      /*bit*/ bit_ptr);
 			ctl_done((union ctl_io *)ctsio);
 			return (CTL_RETVAL_COMPLETE);
 		}
 
 		mtx_lock(&lun->lun_lock);
 
 		/*
 		 * The initiator wants to clear the
 		 * key/unregister.
 		 */
 		if (sa_res_key == 0) {
 			if ((res_key == 0
 			  && (cdb->action & SPRO_ACTION_MASK) == SPRO_REGISTER)
 			 || ((cdb->action & SPRO_ACTION_MASK) == SPRO_REG_IGNO
 			  && !lun->per_res[residx].registered)) {
 				mtx_unlock(&lun->lun_lock);
 				goto done;
 			}
 
 			lun->per_res[residx].registered = 0;
 			memset(&lun->per_res[residx].res_key,
 			       0, sizeof(lun->per_res[residx].res_key));
 			lun->pr_key_count--;
 
 			if (residx == lun->pr_res_idx) {
 				lun->flags &= ~CTL_LUN_PR_RESERVED;
 				lun->pr_res_idx = CTL_PR_NO_RESERVATION;
 
 				if ((lun->res_type == SPR_TYPE_WR_EX_RO
 				  || lun->res_type == SPR_TYPE_EX_AC_RO)
 				 && lun->pr_key_count) {
 					/*
 					 * If the reservation is a registrants
 					 * only type we need to generate a UA
 					 * for other registered inits.  The
 					 * sense code should be RESERVATIONS
 					 * RELEASED
 					 */
 
 					for (i = 0; i < CTL_MAX_INITIATORS;i++){
 						if (lun->per_res[
 						    i+persis_offset].registered
 						    == 0)
 							continue;
 						lun->pending_ua[i] |=
 							CTL_UA_RES_RELEASE;
 					}
 				}
 				lun->res_type = 0;
 			} else if (lun->pr_res_idx == CTL_PR_ALL_REGISTRANTS) {
 				if (lun->pr_key_count==0) {
 					lun->flags &= ~CTL_LUN_PR_RESERVED;
 					lun->res_type = 0;
 					lun->pr_res_idx = CTL_PR_NO_RESERVATION;
 				}
 			}
 			persis_io.hdr.nexus = ctsio->io_hdr.nexus;
 			persis_io.hdr.msg_type = CTL_MSG_PERS_ACTION;
 			persis_io.pr.pr_info.action = CTL_PR_UNREG_KEY;
 			persis_io.pr.pr_info.residx = residx;
 			if ((isc_retval = ctl_ha_msg_send(CTL_HA_CHAN_CTL,
 			     &persis_io, sizeof(persis_io), 0 )) >
 			     CTL_HA_STATUS_SUCCESS) {
 				printf("CTL:Persis Out error returned from "
 				       "ctl_ha_msg_send %d\n", isc_retval);
 			}
 		} else /* sa_res_key != 0 */ {
 
 			/*
 			 * If we aren't registered currently then increment
 			 * the key count and set the registered flag.
 			 */
 			if (!lun->per_res[residx].registered) {
 				lun->pr_key_count++;
 				lun->per_res[residx].registered = 1;
 			}
 
 			memcpy(&lun->per_res[residx].res_key,
 			       param->serv_act_res_key,
 			       ctl_min(sizeof(param->serv_act_res_key),
 			       sizeof(lun->per_res[residx].res_key)));
 
 			persis_io.hdr.nexus = ctsio->io_hdr.nexus;
 			persis_io.hdr.msg_type = CTL_MSG_PERS_ACTION;
 			persis_io.pr.pr_info.action = CTL_PR_REG_KEY;
 			persis_io.pr.pr_info.residx = residx;
 			memcpy(persis_io.pr.pr_info.sa_res_key,
 			       param->serv_act_res_key,
 			       sizeof(param->serv_act_res_key));
 			if ((isc_retval=ctl_ha_msg_send(CTL_HA_CHAN_CTL,
 			     &persis_io, sizeof(persis_io), 0)) >
 			     CTL_HA_STATUS_SUCCESS) {
 				printf("CTL:Persis Out error returned from "
 				       "ctl_ha_msg_send %d\n", isc_retval);
 			}
 		}
 		lun->PRGeneration++;
 		mtx_unlock(&lun->lun_lock);
 
 		break;
 	}
 	case SPRO_RESERVE:
 #if 0
                 printf("Reserve executed type %d\n", type);
 #endif
 		mtx_lock(&lun->lun_lock);
 		if (lun->flags & CTL_LUN_PR_RESERVED) {
 			/*
 			 * if this isn't the reservation holder and it's
 			 * not a "all registrants" type or if the type is
 			 * different then we have a conflict
 			 */
 			if ((lun->pr_res_idx != residx
 			  && lun->pr_res_idx != CTL_PR_ALL_REGISTRANTS)
 			 || lun->res_type != type) {
 				mtx_unlock(&lun->lun_lock);
 				free(ctsio->kern_data_ptr, M_CTL);
 				ctl_set_reservation_conflict(ctsio);
 				ctl_done((union ctl_io *)ctsio);
 				return (CTL_RETVAL_COMPLETE);
 			}
 			mtx_unlock(&lun->lun_lock);
 		} else /* create a reservation */ {
 			/*
 			 * If it's not an "all registrants" type record
 			 * reservation holder
 			 */
 			if (type != SPR_TYPE_WR_EX_AR
 			 && type != SPR_TYPE_EX_AC_AR)
 				lun->pr_res_idx = residx; /* Res holder */
 			else
 				lun->pr_res_idx = CTL_PR_ALL_REGISTRANTS;
 
 			lun->flags |= CTL_LUN_PR_RESERVED;
 			lun->res_type = type;
 
 			mtx_unlock(&lun->lun_lock);
 
 			/* send msg to other side */
 			persis_io.hdr.nexus = ctsio->io_hdr.nexus;
 			persis_io.hdr.msg_type = CTL_MSG_PERS_ACTION;
 			persis_io.pr.pr_info.action = CTL_PR_RESERVE;
 			persis_io.pr.pr_info.residx = lun->pr_res_idx;
 			persis_io.pr.pr_info.res_type = type;
 			if ((isc_retval=ctl_ha_msg_send(CTL_HA_CHAN_CTL,
 			     &persis_io, sizeof(persis_io), 0)) >
 			     CTL_HA_STATUS_SUCCESS) {
 				printf("CTL:Persis Out error returned from "
 				       "ctl_ha_msg_send %d\n", isc_retval);
 			}
 		}
 		break;
 
 	case SPRO_RELEASE:
 		mtx_lock(&lun->lun_lock);
 		if ((lun->flags & CTL_LUN_PR_RESERVED) == 0) {
 			/* No reservation exists return good status */
 			mtx_unlock(&lun->lun_lock);
 			goto done;
 		}
 		/*
 		 * Is this nexus a reservation holder?
 		 */
 		if (lun->pr_res_idx != residx
 		 && lun->pr_res_idx != CTL_PR_ALL_REGISTRANTS) {
 			/*
 			 * not a res holder return good status but
 			 * do nothing
 			 */
 			mtx_unlock(&lun->lun_lock);
 			goto done;
 		}
 
 		if (lun->res_type != type) {
 			mtx_unlock(&lun->lun_lock);
 			free(ctsio->kern_data_ptr, M_CTL);
 			ctl_set_illegal_pr_release(ctsio);
 			ctl_done((union ctl_io *)ctsio);
 			return (CTL_RETVAL_COMPLETE);
 		}
 
 		/* okay to release */
 		lun->flags &= ~CTL_LUN_PR_RESERVED;
 		lun->pr_res_idx = CTL_PR_NO_RESERVATION;
 		lun->res_type = 0;
 
 		/*
 		 * if this isn't an exclusive access
 		 * res generate UA for all other
 		 * registrants.
 		 */
 		if (type != SPR_TYPE_EX_AC
 		 && type != SPR_TYPE_WR_EX) {
 			/*
 			 * temporarily unregister so we don't generate UA
 			 */
 			lun->per_res[residx].registered = 0;
 
 			for (i = 0; i < CTL_MAX_INITIATORS; i++) {
 				if (lun->per_res[i+persis_offset].registered
 				    == 0)
 					continue;
 				lun->pending_ua[i] |=
 					CTL_UA_RES_RELEASE;
 			}
 
 			lun->per_res[residx].registered = 1;
 		}
 		mtx_unlock(&lun->lun_lock);
 		/* Send msg to other side */
 		persis_io.hdr.nexus = ctsio->io_hdr.nexus;
 		persis_io.hdr.msg_type = CTL_MSG_PERS_ACTION;
 		persis_io.pr.pr_info.action = CTL_PR_RELEASE;
 		if ((isc_retval=ctl_ha_msg_send( CTL_HA_CHAN_CTL, &persis_io,
 		     sizeof(persis_io), 0)) > CTL_HA_STATUS_SUCCESS) {
 			printf("CTL:Persis Out error returned from "
 			       "ctl_ha_msg_send %d\n", isc_retval);
 		}
 		break;
 
 	case SPRO_CLEAR:
 		/* send msg to other side */
 
 		mtx_lock(&lun->lun_lock);
 		lun->flags &= ~CTL_LUN_PR_RESERVED;
 		lun->res_type = 0;
 		lun->pr_key_count = 0;
 		lun->pr_res_idx = CTL_PR_NO_RESERVATION;
 
 
 		memset(&lun->per_res[residx].res_key,
 		       0, sizeof(lun->per_res[residx].res_key));
 		lun->per_res[residx].registered = 0;
 
 		for (i=0; i < 2*CTL_MAX_INITIATORS; i++)
 			if (lun->per_res[i].registered) {
 				if (!persis_offset && i < CTL_MAX_INITIATORS)
 					lun->pending_ua[i] |=
 						CTL_UA_RES_PREEMPT;
 				else if (persis_offset && i >= persis_offset)
 					lun->pending_ua[i-persis_offset] |=
 					    CTL_UA_RES_PREEMPT;
 
 				memset(&lun->per_res[i].res_key,
 				       0, sizeof(struct scsi_per_res_key));
 				lun->per_res[i].registered = 0;
 			}
 		lun->PRGeneration++;
 		mtx_unlock(&lun->lun_lock);
 		persis_io.hdr.nexus = ctsio->io_hdr.nexus;
 		persis_io.hdr.msg_type = CTL_MSG_PERS_ACTION;
 		persis_io.pr.pr_info.action = CTL_PR_CLEAR;
 		if ((isc_retval=ctl_ha_msg_send(CTL_HA_CHAN_CTL, &persis_io,
 		     sizeof(persis_io), 0)) > CTL_HA_STATUS_SUCCESS) {
 			printf("CTL:Persis Out error returned from "
 			       "ctl_ha_msg_send %d\n", isc_retval);
 		}
 		break;
 
 	case SPRO_PREEMPT: {
 		int nretval;
 
 		nretval = ctl_pro_preempt(softc, lun, res_key, sa_res_key, type,
 					  residx, ctsio, cdb, param);
 		if (nretval != 0)
 			return (CTL_RETVAL_COMPLETE);
 		break;
 	}
 	default:
 		panic("Invalid PR type %x", cdb->action);
 	}
 
 done:
 	free(ctsio->kern_data_ptr, M_CTL);
 	ctl_set_success(ctsio);
 	ctl_done((union ctl_io *)ctsio);
 
 	return (retval);
 }
 
 /*
  * This routine is for handling a message from the other SC pertaining to
  * persistent reserve out. All the error checking will have been done
  * so only perorming the action need be done here to keep the two
  * in sync.
  */
 static void
 ctl_hndl_per_res_out_on_other_sc(union ctl_ha_msg *msg)
 {
 	struct ctl_lun *lun;
 	struct ctl_softc *softc;
 	int i;
 	uint32_t targ_lun;
 
 	softc = control_softc;
 
 	targ_lun = msg->hdr.nexus.targ_mapped_lun;
 	lun = softc->ctl_luns[targ_lun];
 	mtx_lock(&lun->lun_lock);
 	switch(msg->pr.pr_info.action) {
 	case CTL_PR_REG_KEY:
 		if (!lun->per_res[msg->pr.pr_info.residx].registered) {
 			lun->per_res[msg->pr.pr_info.residx].registered = 1;
 			lun->pr_key_count++;
 		}
 		lun->PRGeneration++;
 		memcpy(&lun->per_res[msg->pr.pr_info.residx].res_key,
 		       msg->pr.pr_info.sa_res_key,
 		       sizeof(struct scsi_per_res_key));
 		break;
 
 	case CTL_PR_UNREG_KEY:
 		lun->per_res[msg->pr.pr_info.residx].registered = 0;
 		memset(&lun->per_res[msg->pr.pr_info.residx].res_key,
 		       0, sizeof(struct scsi_per_res_key));
 		lun->pr_key_count--;
 
 		/* XXX Need to see if the reservation has been released */
 		/* if so do we need to generate UA? */
 		if (msg->pr.pr_info.residx == lun->pr_res_idx) {
 			lun->flags &= ~CTL_LUN_PR_RESERVED;
 			lun->pr_res_idx = CTL_PR_NO_RESERVATION;
 
 			if ((lun->res_type == SPR_TYPE_WR_EX_RO
 			  || lun->res_type == SPR_TYPE_EX_AC_RO)
 			 && lun->pr_key_count) {
 				/*
 				 * If the reservation is a registrants
 				 * only type we need to generate a UA
 				 * for other registered inits.  The
 				 * sense code should be RESERVATIONS
 				 * RELEASED
 				 */
 
 				for (i = 0; i < CTL_MAX_INITIATORS; i++) {
 					if (lun->per_res[i+
 					    persis_offset].registered == 0)
 						continue;
 
 					lun->pending_ua[i] |=
 						CTL_UA_RES_RELEASE;
 				}
 			}
 			lun->res_type = 0;
 		} else if (lun->pr_res_idx == CTL_PR_ALL_REGISTRANTS) {
 			if (lun->pr_key_count==0) {
 				lun->flags &= ~CTL_LUN_PR_RESERVED;
 				lun->res_type = 0;
 				lun->pr_res_idx = CTL_PR_NO_RESERVATION;
 			}
 		}
 		lun->PRGeneration++;
 		break;
 
 	case CTL_PR_RESERVE:
 		lun->flags |= CTL_LUN_PR_RESERVED;
 		lun->res_type = msg->pr.pr_info.res_type;
 		lun->pr_res_idx = msg->pr.pr_info.residx;
 
 		break;
 
 	case CTL_PR_RELEASE:
 		/*
 		 * if this isn't an exclusive access res generate UA for all
 		 * other registrants.
 		 */
 		if (lun->res_type != SPR_TYPE_EX_AC
 		 && lun->res_type != SPR_TYPE_WR_EX) {
 			for (i = 0; i < CTL_MAX_INITIATORS; i++)
 				if (lun->per_res[i+persis_offset].registered)
 					lun->pending_ua[i] |=
 						CTL_UA_RES_RELEASE;
 		}
 
 		lun->flags &= ~CTL_LUN_PR_RESERVED;
 		lun->pr_res_idx = CTL_PR_NO_RESERVATION;
 		lun->res_type = 0;
 		break;
 
 	case CTL_PR_PREEMPT:
 		ctl_pro_preempt_other(lun, msg);
 		break;
 	case CTL_PR_CLEAR:
 		lun->flags &= ~CTL_LUN_PR_RESERVED;
 		lun->res_type = 0;
 		lun->pr_key_count = 0;
 		lun->pr_res_idx = CTL_PR_NO_RESERVATION;
 
 		for (i=0; i < 2*CTL_MAX_INITIATORS; i++) {
 			if (lun->per_res[i].registered == 0)
 				continue;
 			if (!persis_offset
 			 && i < CTL_MAX_INITIATORS)
 				lun->pending_ua[i] |= CTL_UA_RES_PREEMPT;
 			else if (persis_offset
 			      && i >= persis_offset)
 				lun->pending_ua[i-persis_offset] |=
 					CTL_UA_RES_PREEMPT;
 			memset(&lun->per_res[i].res_key, 0,
 			       sizeof(struct scsi_per_res_key));
 			lun->per_res[i].registered = 0;
 		}
 		lun->PRGeneration++;
 		break;
 	}
 
 	mtx_unlock(&lun->lun_lock);
 }
 
 /*
  * Common handler for the READ(6/10/12/16), WRITE(6/10/12/16) and
  * WRITE AND VERIFY(10/12/16) opcodes.
  *
  * Decodes the LBA, transfer length and FUA/DPO bits from the CDB, rejects
  * reads that conflict with an exclusive-access persistent reservation,
  * checks the block range against the backend's maxlba, and then passes the
  * I/O to the backend's data_submit() method.
  *
  * Returns CTL_RETVAL_COMPLETE when the command is finished here (unsupported
  * opcode, reservation conflict, LBA out of range, or zero-length transfer);
  * otherwise returns whatever data_submit() returns.
  */
 int
 ctl_read_write(struct ctl_scsiio *ctsio)
 {
 	struct ctl_lba_len_flags *lbalen;
 	struct ctl_lun *lun;
 	uint64_t lba;
 	uint32_t num_blocks;
 	int flags;
 	int isread;
 
 	lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr;
 
 	CTL_DEBUG_PRINT(("ctl_read_write: command: %#x\n", ctsio->cdb[0]));
 
 	flags = 0;
 
 	/* Only the four plain READ opcodes are treated as reads. */
 	switch (ctsio->cdb[0]) {
 	case READ_6:
 	case READ_10:
 	case READ_12:
 	case READ_16:
 		isread = 1;
 		break;
 	default:
 		isread = 0;
 		break;
 	}
 
 	if (isread && (lun->flags & CTL_LUN_PR_RESERVED) != 0) {
 		uint32_t residx;
 		int conflict;
 
 		/*
 		 * XXX KDM need a lock here.
 		 */
 		residx = ctl_get_resindex(&ctsio->io_hdr.nexus);
 
 		/*
 		 * Plain exclusive access: only the holder may read.
 		 * Registrants-only / all-registrants exclusive access: any
 		 * registered initiator may read.
 		 */
 		conflict = 0;
 		if (lun->res_type == SPR_TYPE_EX_AC) {
 			if (residx != lun->pr_res_idx)
 				conflict = 1;
 		} else if (lun->res_type == SPR_TYPE_EX_AC_RO
 			|| lun->res_type == SPR_TYPE_EX_AC_AR) {
 			if (!lun->per_res[residx].registered)
 				conflict = 1;
 		}
 
 		if (conflict) {
 			ctl_set_reservation_conflict(ctsio);
 			ctl_done((union ctl_io *)ctsio);
 			return (CTL_RETVAL_COMPLETE);
 		}
 	}
 
 	/* Pull the address, length and cache hints out of the CDB. */
 	switch (ctsio->cdb[0]) {
 	case READ_6:
 	case WRITE_6: {
 		struct scsi_rw_6 *rw6;
 
 		rw6 = (struct scsi_rw_6 *)ctsio->cdb;
 
 		/* only 5 bits are valid in the most significant address byte */
 		lba = scsi_3btoul(rw6->addr) & 0x1fffff;
 
 		/*
 		 * A length of zero means 256 blocks; this is correct
 		 * according to SBC-2.
 		 */
 		num_blocks = rw6->length;
 		if (num_blocks == 0)
 			num_blocks = 256;
 		break;
 	}
 	case READ_10:
 	case WRITE_10: {
 		struct scsi_rw_10 *rw10;
 
 		rw10 = (struct scsi_rw_10 *)ctsio->cdb;
 		lba = scsi_4btoul(rw10->addr);
 		num_blocks = scsi_2btoul(rw10->length);
 		if (rw10->byte2 & SRW10_FUA)
 			flags |= CTL_LLF_FUA;
 		if (rw10->byte2 & SRW10_DPO)
 			flags |= CTL_LLF_DPO;
 		break;
 	}
 	case WRITE_VERIFY_10: {
 		struct scsi_write_verify_10 *wv10;
 
 		wv10 = (struct scsi_write_verify_10 *)ctsio->cdb;
 		lba = scsi_4btoul(wv10->addr);
 		num_blocks = scsi_2btoul(wv10->length);
 		/* WRITE AND VERIFY always forces unit access. */
 		flags |= CTL_LLF_FUA;
 		if (wv10->byte2 & SWV_DPO)
 			flags |= CTL_LLF_DPO;
 		break;
 	}
 	case READ_12:
 	case WRITE_12: {
 		struct scsi_rw_12 *rw12;
 
 		rw12 = (struct scsi_rw_12 *)ctsio->cdb;
 		lba = scsi_4btoul(rw12->addr);
 		num_blocks = scsi_4btoul(rw12->length);
 		if (rw12->byte2 & SRW12_FUA)
 			flags |= CTL_LLF_FUA;
 		if (rw12->byte2 & SRW12_DPO)
 			flags |= CTL_LLF_DPO;
 		break;
 	}
 	case WRITE_VERIFY_12: {
 		struct scsi_write_verify_12 *wv12;
 
 		wv12 = (struct scsi_write_verify_12 *)ctsio->cdb;
 		lba = scsi_4btoul(wv12->addr);
 		num_blocks = scsi_4btoul(wv12->length);
 		flags |= CTL_LLF_FUA;
 		if (wv12->byte2 & SWV_DPO)
 			flags |= CTL_LLF_DPO;
 		break;
 	}
 	case READ_16:
 	case WRITE_16: {
 		struct scsi_rw_16 *rw16;
 
 		rw16 = (struct scsi_rw_16 *)ctsio->cdb;
 		lba = scsi_8btou64(rw16->addr);
 		num_blocks = scsi_4btoul(rw16->length);
 		/* The 16-byte form is tested with the SRW12_* bit names. */
 		if (rw16->byte2 & SRW12_FUA)
 			flags |= CTL_LLF_FUA;
 		if (rw16->byte2 & SRW12_DPO)
 			flags |= CTL_LLF_DPO;
 		break;
 	}
 	case WRITE_VERIFY_16: {
 		struct scsi_write_verify_16 *wv16;
 
 		wv16 = (struct scsi_write_verify_16 *)ctsio->cdb;
 		lba = scsi_8btou64(wv16->addr);
 		num_blocks = scsi_4btoul(wv16->length);
 		flags |= CTL_LLF_FUA;
 		if (wv16->byte2 & SWV_DPO)
 			flags |= CTL_LLF_DPO;
 		break;
 	}
 	default:
 		/*
 		 * We got a command we don't support.  This shouldn't
 		 * happen, commands should be filtered out above us.
 		 */
 		ctl_set_invalid_opcode(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/*
 	 * Reject ranges that run past the end of the LUN, and ranges whose
 	 * end wrapped around below the starting LBA (those are invalid no
 	 * matter how large the LUN is).
 	 */
 	if (((lba + num_blocks) > (lun->be_lun->maxlba + 1))
 	 || ((lba + num_blocks) < lba)) {
 		ctl_set_lba_out_of_range(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/*
 	 * According to SBC-3, a transfer length of 0 is not an error.
 	 * Note that this cannot happen with WRITE(6) or READ(6), since 0
 	 * translates to 256 blocks for those commands.
 	 */
 	if (num_blocks == 0) {
 		ctl_set_success(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/* Set FUA and/or DPO if caches are disabled. */
 	if (isread) {
 		if (lun->mode_pages.caching_page[CTL_PAGE_CURRENT].flags1 &
 		    SCP_RCD)
 			flags |= CTL_LLF_FUA | CTL_LLF_DPO;
 	} else if (!(lun->mode_pages.caching_page[CTL_PAGE_CURRENT].flags1 &
 	    SCP_WCE)) {
 		flags |= CTL_LLF_FUA;
 	}
 
 	/* Stash the decoded request where the backend will look for it. */
 	lbalen = (struct ctl_lba_len_flags *)
 	    &ctsio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN];
 	lbalen->lba = lba;
 	lbalen->len = num_blocks;
 	lbalen->flags = (isread ? CTL_LLF_READ : CTL_LLF_WRITE) | flags;
 
 	ctsio->kern_rel_offset = 0;
 	ctsio->kern_total_len = num_blocks * lun->be_lun->blocksize;
 
 	CTL_DEBUG_PRINT(("ctl_read_write: calling data_submit()\n"));
 
 	return (lun->backend->data_submit((union ctl_io *)ctsio));
 }
 
 /*
  * Continuation for COMPARE AND WRITE, installed as ctsio->io_cont by
  * ctl_cnw().  Clears the I/O's status and its IO_CONT flag, switches the
  * stashed request flags from COMPARE to WRITE, and resubmits the same I/O
  * to the backend.  Returns whatever data_submit() returns.
  */
 static int
 ctl_cnw_cont(union ctl_io *io)
 {
 	struct ctl_lba_len_flags *lbalen;
 	struct ctl_scsiio *ctsio;
 	struct ctl_lun *lun;
 
 	ctsio = &io->scsiio;
 	lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr;
 	lbalen = (struct ctl_lba_len_flags *)
 	    &ctsio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN];
 
 	/* Reset the header so the I/O can go around a second time. */
 	ctsio->io_hdr.flags &= ~CTL_FLAG_IO_CONT;
 	ctsio->io_hdr.status = CTL_STATUS_NONE;
 
 	/* Swap the COMPARE request for a WRITE. */
 	lbalen->flags &= ~CTL_LLF_COMPARE;
 	lbalen->flags |= CTL_LLF_WRITE;
 
 	CTL_DEBUG_PRINT(("ctl_cnw_cont: calling data_submit()\n"));
 	return (lun->backend->data_submit((union ctl_io *)ctsio));
 }
 
 /*
  * Handler for COMPARE AND WRITE.  Decodes the CDB, validates the block
  * range, and submits the I/O to the backend as a COMPARE with
  * ctl_cnw_cont() installed as the continuation.
  *
  * Returns CTL_RETVAL_COMPLETE when the command is finished here (unsupported
  * opcode, LBA out of range, or zero-length transfer); otherwise returns
  * whatever data_submit() returns.
  */
 int
 ctl_cnw(struct ctl_scsiio *ctsio)
 {
 	struct scsi_compare_and_write *cdb;
 	struct ctl_lba_len_flags *lbalen;
 	struct ctl_lun *lun;
 	uint64_t lba;
 	uint32_t num_blocks;
 	int flags;
 
 	lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr;
 
 	CTL_DEBUG_PRINT(("ctl_cnw: command: %#x\n", ctsio->cdb[0]));
 
 	if (ctsio->cdb[0] != COMPARE_AND_WRITE) {
 		/*
 		 * We got a command we don't support.  This shouldn't
 		 * happen, commands should be filtered out above us.
 		 */
 		ctl_set_invalid_opcode(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	cdb = (struct scsi_compare_and_write *)ctsio->cdb;
 	lba = scsi_8btou64(cdb->addr);
 	num_blocks = cdb->length;
 
 	flags = 0;
 	if (cdb->byte2 & SRW10_FUA)
 		flags |= CTL_LLF_FUA;
 	if (cdb->byte2 & SRW10_DPO)
 		flags |= CTL_LLF_DPO;
 
 	/*
 	 * Reject ranges that run past the end of the LUN, and ranges whose
 	 * end wrapped around below the starting LBA.
 	 */
 	if (((lba + num_blocks) > (lun->be_lun->maxlba + 1))
 	 || ((lba + num_blocks) < lba)) {
 		ctl_set_lba_out_of_range(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/*
 	 * According to SBC-3, a transfer length of 0 is not an error.
 	 */
 	if (num_blocks == 0) {
 		ctl_set_success(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/* Set FUA if write cache is disabled. */
 	if (!(lun->mode_pages.caching_page[CTL_PAGE_CURRENT].flags1 &
 	    SCP_WCE))
 		flags |= CTL_LLF_FUA;
 
 	lbalen = (struct ctl_lba_len_flags *)
 	    &ctsio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN];
 	lbalen->lba = lba;
 	lbalen->len = num_blocks;
 	lbalen->flags = CTL_LLF_COMPARE | flags;
 
 	/* Twice the block count: one set to compare, one set to write. */
 	ctsio->kern_rel_offset = 0;
 	ctsio->kern_total_len = 2 * num_blocks * lun->be_lun->blocksize;
 
 	/*
 	 * Set the IO_CONT flag, so that if this I/O gets passed to
 	 * ctl_data_submit_done(), it'll get passed back to
 	 * ctl_cnw_cont() for further processing.
 	 */
 	ctsio->io_cont = ctl_cnw_cont;
 	ctsio->io_hdr.flags |= CTL_FLAG_IO_CONT;
 
 	CTL_DEBUG_PRINT(("ctl_cnw: calling data_submit()\n"));
 	return (lun->backend->data_submit((union ctl_io *)ctsio));
 }
 
 /*
  * Handler for VERIFY(10/12/16).  Decodes the LBA, length, BYTCHK and DPO
  * bits from the CDB, validates the block range, and submits the I/O to the
  * backend: as a COMPARE carrying num_blocks worth of data when BYTCHK is
  * set, or as a data-less VERIFY otherwise.  FUA is always requested.
  *
  * Returns CTL_RETVAL_COMPLETE when the command is finished here (unsupported
  * opcode, LBA out of range, or zero-length transfer); otherwise returns
  * whatever data_submit() returns.
  */
 int
 ctl_verify(struct ctl_scsiio *ctsio)
 {
 	struct ctl_lba_len_flags *lbalen;
 	struct ctl_lun *lun;
 	uint64_t lba;
 	uint32_t num_blocks;
 	int bytchk;
 	int flags;
 
 	lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr;
 
 	CTL_DEBUG_PRINT(("ctl_verify: command: %#x\n", ctsio->cdb[0]));
 
 	flags = CTL_LLF_FUA;
 	bytchk = 0;
 
 	switch (ctsio->cdb[0]) {
 	case VERIFY_10: {
 		struct scsi_verify_10 *v10;
 
 		v10 = (struct scsi_verify_10 *)ctsio->cdb;
 		lba = scsi_4btoul(v10->addr);
 		num_blocks = scsi_2btoul(v10->length);
 		if (v10->byte2 & SVFY_DPO)
 			flags |= CTL_LLF_DPO;
 		if (v10->byte2 & SVFY_BYTCHK)
 			bytchk = 1;
 		break;
 	}
 	case VERIFY_12: {
 		struct scsi_verify_12 *v12;
 
 		v12 = (struct scsi_verify_12 *)ctsio->cdb;
 		lba = scsi_4btoul(v12->addr);
 		num_blocks = scsi_4btoul(v12->length);
 		if (v12->byte2 & SVFY_DPO)
 			flags |= CTL_LLF_DPO;
 		if (v12->byte2 & SVFY_BYTCHK)
 			bytchk = 1;
 		break;
 	}
 	case VERIFY_16: {
 		/* The 16-byte form is decoded through the READ/WRITE(16) layout. */
 		struct scsi_rw_16 *v16;
 
 		v16 = (struct scsi_rw_16 *)ctsio->cdb;
 		lba = scsi_8btou64(v16->addr);
 		num_blocks = scsi_4btoul(v16->length);
 		if (v16->byte2 & SVFY_DPO)
 			flags |= CTL_LLF_DPO;
 		if (v16->byte2 & SVFY_BYTCHK)
 			bytchk = 1;
 		break;
 	}
 	default:
 		/*
 		 * We got a command we don't support.  This shouldn't
 		 * happen, commands should be filtered out above us.
 		 */
 		ctl_set_invalid_opcode(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/*
 	 * Reject ranges that run past the end of the LUN, and ranges whose
 	 * end wrapped around below the starting LBA.
 	 */
 	if (((lba + num_blocks) > (lun->be_lun->maxlba + 1))
 	 || ((lba + num_blocks) < lba)) {
 		ctl_set_lba_out_of_range(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/*
 	 * According to SBC-3, a transfer length of 0 is not an error.
 	 */
 	if (num_blocks == 0) {
 		ctl_set_success(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	lbalen = (struct ctl_lba_len_flags *)
 	    &ctsio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN];
 	lbalen->lba = lba;
 	lbalen->len = num_blocks;
 
 	/* Data is only transferred when a byte-by-byte check was requested. */
 	if (!bytchk) {
 		lbalen->flags = CTL_LLF_VERIFY | flags;
 		ctsio->kern_total_len = 0;
 	} else {
 		lbalen->flags = CTL_LLF_COMPARE | flags;
 		ctsio->kern_total_len = num_blocks * lun->be_lun->blocksize;
 	}
 	ctsio->kern_rel_offset = 0;
 
 	CTL_DEBUG_PRINT(("ctl_verify: calling data_submit()\n"));
 	return (lun->backend->data_submit((union ctl_io *)ctsio));
 }
 
 int
 ctl_report_luns(struct ctl_scsiio *ctsio)
 {
 	struct scsi_report_luns *cdb;
 	struct scsi_report_luns_data *lun_data;
 	struct ctl_lun *lun, *request_lun;
 	int num_luns, retval;
 	uint32_t alloc_len, lun_datalen;
 	int num_filled, well_known;
 	uint32_t initidx, targ_lun_id, lun_id;
 
 	retval = CTL_RETVAL_COMPLETE;
 	well_known = 0;
 
 	cdb = (struct scsi_report_luns *)ctsio->cdb;
 
 	CTL_DEBUG_PRINT(("ctl_report_luns\n"));
 
 	mtx_lock(&control_softc->ctl_lock);
 	num_luns = control_softc->num_luns;
 	mtx_unlock(&control_softc->ctl_lock);
 
 	switch (cdb->select_report) {
 	case RPL_REPORT_DEFAULT:
 	case RPL_REPORT_ALL:
 		break;
 	case RPL_REPORT_WELLKNOWN:
 		well_known = 1;
 		num_luns = 0;
 		break;
 	default:
 		ctl_set_invalid_field(ctsio,
 				      /*sks_valid*/ 1,
 				      /*command*/ 1,
 				      /*field*/ 2,
 				      /*bit_valid*/ 0,
 				      /*bit*/ 0);
 		ctl_done((union ctl_io *)ctsio);
 		return (retval);
 		break; /* NOTREACHED */
 	}
 
 	alloc_len = scsi_4btoul(cdb->length);
 	/*
 	 * The initiator has to allocate at least 16 bytes for this request,
 	 * so he can at least get the header and the first LUN.  Otherwise
 	 * we reject the request (per SPC-3 rev 14, section 6.21).
 	 */
 	if (alloc_len < (sizeof(struct scsi_report_luns_data) +
 	    sizeof(struct scsi_report_luns_lundata))) {
 		ctl_set_invalid_field(ctsio,
 				      /*sks_valid*/ 1,
 				      /*command*/ 1,
 				      /*field*/ 6,
 				      /*bit_valid*/ 0,
 				      /*bit*/ 0);
 		ctl_done((union ctl_io *)ctsio);
 		return (retval);
 	}
 
 	request_lun = (struct ctl_lun *)
 		ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr;
 
 	lun_datalen = sizeof(*lun_data) +
 		(num_luns * sizeof(struct scsi_report_luns_lundata));
 
 	ctsio->kern_data_ptr = malloc(lun_datalen, M_CTL, M_WAITOK | M_ZERO);
 	lun_data = (struct scsi_report_luns_data *)ctsio->kern_data_ptr;
 	ctsio->kern_sg_entries = 0;
 
 	initidx = ctl_get_initindex(&ctsio->io_hdr.nexus);
 
 	mtx_lock(&control_softc->ctl_lock);
 	for (targ_lun_id = 0, num_filled = 0; targ_lun_id < CTL_MAX_LUNS && num_filled < num_luns; targ_lun_id++) {
 		lun_id = ctl_map_lun(ctsio->io_hdr.nexus.targ_port, targ_lun_id);
 		if (lun_id >= CTL_MAX_LUNS)
 			continue;
 		lun = control_softc->ctl_luns[lun_id];
 		if (lun == NULL)
 			continue;
 
 		if (targ_lun_id <= 0xff) {
 			/*
 			 * Peripheral addressing method, bus number 0.
 			 */
 			lun_data->luns[num_filled].lundata[0] =
 				RPL_LUNDATA_ATYP_PERIPH;
 			lun_data->luns[num_filled].lundata[1] = targ_lun_id;
 			num_filled++;
 		} else if (targ_lun_id <= 0x3fff) {
 			/*
 			 * Flat addressing method.
 			 */
 			lun_data->luns[num_filled].lundata[0] =
 				RPL_LUNDATA_ATYP_FLAT |
 				(targ_lun_id & RPL_LUNDATA_FLAT_LUN_MASK);
 #ifdef OLDCTLHEADERS
 				(SRLD_ADDR_FLAT << SRLD_ADDR_SHIFT) |
 				(targ_lun_id & SRLD_BUS_LUN_MASK);
 #endif
 			lun_data->luns[num_filled].lundata[1] =
 #ifdef OLDCTLHEADERS
 				targ_lun_id >> SRLD_BUS_LUN_BITS;
 #endif
 				targ_lun_id >> RPL_LUNDATA_FLAT_LUN_BITS;
 			num_filled++;
 		} else {
 			printf("ctl_report_luns: bogus LUN number %jd, "
 			       "skipping\n", (intmax_t)targ_lun_id);
 		}
 		/*
 		 * According to SPC-3, rev 14 section 6.21:
 		 *
 		 * "The execution of a REPORT LUNS command to any valid and
 		 * installed logical unit shall clear the REPORTED LUNS DATA
 		 * HAS CHANGED unit attention condition for all logical
 		 * units of that target with respect to the requesting
 		 * initiator. A valid and installed logical unit is one
 		 * having a PERIPHERAL QUALIFIER of 000b in the standard
 		 * INQUIRY data (see 6.4.2)."
 		 *
 		 * If request_lun is NULL, the LUN this report luns command
 		 * was issued to is either disabled or doesn't exist. In that
 		 * case, we shouldn't clear any pending lun change unit
 		 * attention.
 		 */
 		if (request_lun != NULL) {
 			mtx_lock(&lun->lun_lock);
 			lun->pending_ua[initidx] &= ~CTL_UA_LUN_CHANGE;
 			mtx_unlock(&lun->lun_lock);
 		}
 	}
 	mtx_unlock(&control_softc->ctl_lock);
 
 	/*
 	 * It's quite possible that we've returned fewer LUNs than we allocated
 	 * space for.  Trim it.
 	 */
 	lun_datalen = sizeof(*lun_data) +
 		(num_filled * sizeof(struct scsi_report_luns_lundata));
 
 	if (lun_datalen < alloc_len) {
 		ctsio->residual = alloc_len - lun_datalen;
 		ctsio->kern_data_len = lun_datalen;
 		ctsio->kern_total_len = lun_datalen;
 	} else {
 		ctsio->residual = 0;
 		ctsio->kern_data_len = alloc_len;
 		ctsio->kern_total_len = alloc_len;
 	}
 	ctsio->kern_data_resid = 0;
 	ctsio->kern_rel_offset = 0;
 	ctsio->kern_sg_entries = 0;
 
 	/*
 	 * We set this to the actual data length, regardless of how much
 	 * space we actually have to return results.  If the user looks at
 	 * this value, he'll know whether or not he allocated enough space
 	 * and reissue the command if necessary.  We don't support well
 	 * known logical units, so if the user asks for that, return none.
 	 */
 	scsi_ulto4b(lun_datalen - 8, lun_data->length);
 
 	/*
 	 * We can only return SCSI_STATUS_CHECK_COND when we can't satisfy
 	 * this request.
 	 */
 	ctsio->scsi_status = SCSI_STATUS_OK;
 
 	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 	ctsio->be_move_done = ctl_config_move_done;
 	ctl_datamove((union ctl_io *)ctsio);
 
 	return (retval);
 }
 
 int
 ctl_request_sense(struct ctl_scsiio *ctsio)
 {
 	struct scsi_request_sense *cdb;
 	struct scsi_sense_data *sense_ptr;
 	struct ctl_lun *lun;
 	uint32_t initidx;
 	int have_error;
 	scsi_sense_data_type sense_format;
 
 	cdb = (struct scsi_request_sense *)ctsio->cdb;
 
 	lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr;
 
 	CTL_DEBUG_PRINT(("ctl_request_sense\n"));
 
 	/*
 	 * Determine which sense format the user wants.
 	 */
 	if (cdb->byte2 & SRS_DESC)
 		sense_format = SSD_TYPE_DESC;
 	else
 		sense_format = SSD_TYPE_FIXED;
 
 	ctsio->kern_data_ptr = malloc(sizeof(*sense_ptr), M_CTL, M_WAITOK);
 	sense_ptr = (struct scsi_sense_data *)ctsio->kern_data_ptr;
 	ctsio->kern_sg_entries = 0;
 
 	/*
 	 * struct scsi_sense_data, which is currently set to 256 bytes, is
 	 * larger than the largest allowed value for the length field in the
 	 * REQUEST SENSE CDB, which is 252 bytes as of SPC-4.
 	 */
 	ctsio->residual = 0;
 	ctsio->kern_data_len = cdb->length;
 	ctsio->kern_total_len = cdb->length;
 
 	ctsio->kern_data_resid = 0;
 	ctsio->kern_rel_offset = 0;
 	ctsio->kern_sg_entries = 0;
 
 	/*
 	 * If we don't have a LUN, we don't have any pending sense.
 	 */
 	if (lun == NULL)
 		goto no_sense;
 
 	have_error = 0;
 	initidx = ctl_get_initindex(&ctsio->io_hdr.nexus);
 	/*
 	 * Check for pending sense, and then for pending unit attentions.
 	 * Pending sense gets returned first, then pending unit attentions.
 	 */
 	mtx_lock(&lun->lun_lock);
 #ifdef CTL_WITH_CA
 	if (ctl_is_set(lun->have_ca, initidx)) {
 		scsi_sense_data_type stored_format;
 
 		/*
 		 * Check to see which sense format was used for the stored
 		 * sense data.
 		 */
 		stored_format = scsi_sense_type(&lun->pending_sense[initidx]);
 
 		/*
 		 * If the user requested a different sense format than the
 		 * one we stored, then we need to convert it to the other
 		 * format.  If we're going from descriptor to fixed format
 		 * sense data, we may lose things in translation, depending
 		 * on what options were used.
 		 *
 		 * If the stored format is SSD_TYPE_NONE (i.e. invalid),
 		 * for some reason we'll just copy it out as-is.
 		 */
 		if ((stored_format == SSD_TYPE_FIXED)
 		 && (sense_format == SSD_TYPE_DESC))
 			ctl_sense_to_desc((struct scsi_sense_data_fixed *)
 			    &lun->pending_sense[initidx],
 			    (struct scsi_sense_data_desc *)sense_ptr);
 		else if ((stored_format == SSD_TYPE_DESC)
 		      && (sense_format == SSD_TYPE_FIXED))
 			ctl_sense_to_fixed((struct scsi_sense_data_desc *)
 			    &lun->pending_sense[initidx],
 			    (struct scsi_sense_data_fixed *)sense_ptr);
 		else
 			memcpy(sense_ptr, &lun->pending_sense[initidx],
 			       ctl_min(sizeof(*sense_ptr),
 			       sizeof(lun->pending_sense[initidx])));
 
 		ctl_clear_mask(lun->have_ca, initidx);
 		have_error = 1;
 	} else
 #endif
 	if (lun->pending_ua[initidx] != CTL_UA_NONE) {
 		ctl_ua_type ua_type;
 
 		ua_type = ctl_build_ua(lun->pending_ua[initidx],
 				       sense_ptr, sense_format);
 		if (ua_type != CTL_UA_NONE) {
 			have_error = 1;
 			/* We're reporting this UA, so clear it */
 			lun->pending_ua[initidx] &= ~ua_type;
 		}
 	}
 	mtx_unlock(&lun->lun_lock);
 
 	/*
 	 * We already have a pending error, return it.
 	 */
 	if (have_error != 0) {
 		/*
 		 * We report the SCSI status as OK, since the status of the
 		 * request sense command itself is OK.
 		 */
 		ctsio->scsi_status = SCSI_STATUS_OK;
 
 		/*
 		 * We report 0 for the sense length, because we aren't doing
 		 * autosense in this case.  We're reporting sense as
 		 * parameter data.
 		 */
 		ctsio->sense_len = 0;
 		ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 		ctsio->be_move_done = ctl_config_move_done;
 		ctl_datamove((union ctl_io *)ctsio);
 
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 no_sense:
 
 	/*
 	 * No sense information to report, so we report that everything is
 	 * okay.
 	 */
 	ctl_set_sense_data(sense_ptr,
 			   lun,
 			   sense_format,
 			   /*current_error*/ 1,
 			   /*sense_key*/ SSD_KEY_NO_SENSE,
 			   /*asc*/ 0x00,
 			   /*ascq*/ 0x00,
 			   SSD_ELEM_NONE);
 
 	ctsio->scsi_status = SCSI_STATUS_OK;
 
 	/*
 	 * We report 0 for the sense length, because we aren't doing
 	 * autosense in this case.  We're reporting sense as parameter data.
 	 */
 	ctsio->sense_len = 0;
 	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 	ctsio->be_move_done = ctl_config_move_done;
 	ctl_datamove((union ctl_io *)ctsio);
 
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * Handler for TEST UNIT READY.  Completes the command with good status.
  *
  * Returns CTL_RETVAL_COMPLETE once the I/O has been completed, or EINVAL
  * if no LUN is attached to the I/O.
  */
 int
 ctl_tur(struct ctl_scsiio *ctsio)
 {
 	CTL_DEBUG_PRINT(("ctl_tur\n"));
 
 	/*
 	 * NOTE(review): this path returns without calling ctl_done(), so the
 	 * I/O is not completed here -- confirm callers cannot reach it.
 	 */
 	if (ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr == NULL)
 		return (EINVAL);
 
 	ctsio->io_hdr.status = CTL_SUCCESS;
 	ctsio->scsi_status = SCSI_STATUS_OK;
 
 	ctl_done((union ctl_io *)ctsio);
 
 	return (CTL_RETVAL_COMPLETE);
 }
 
 #ifdef notyet
 /*
  * Placeholder for an INQUIRY handler taking a struct ctl_scsiio.  The body
  * is empty and the function is only compiled when "notyet" is defined.
  */
 static int
 ctl_cmddt_inquiry(struct ctl_scsiio *ctsio)
 {
 
 }
 #endif
 
 /*
  * Build and return the "supported VPD pages" INQUIRY page: a header
  * followed by the list of VPD page codes filled in below.
  *
  * ctsio is the INQUIRY being serviced and alloc_len is the caller-supplied
  * allocation length; no more than alloc_len bytes are moved to the
  * initiator.  Always returns CTL_RETVAL_COMPLETE.
  */
 static int
 ctl_inquiry_evpd_supported(struct ctl_scsiio *ctsio, int alloc_len)
 {
 	struct scsi_vpd_supported_pages *pages;
 	int sup_page_size;
 	struct ctl_lun *lun;
 
 	lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr;
 
 	/*
 	 * The buffer is sized as one whole page structure per supported
 	 * page, and zero-filled.
 	 */
 	sup_page_size = sizeof(struct scsi_vpd_supported_pages) *
 	    SCSI_EVPD_NUM_SUPPORTED_PAGES;
 	ctsio->kern_data_ptr = malloc(sup_page_size, M_CTL, M_WAITOK | M_ZERO);
 	pages = (struct scsi_vpd_supported_pages *)ctsio->kern_data_ptr;
 	ctsio->kern_sg_entries = 0;
 
 	/* Never move more than the initiator asked for. */
 	if (sup_page_size < alloc_len) {
 		ctsio->residual = alloc_len - sup_page_size;
 		ctsio->kern_data_len = sup_page_size;
 		ctsio->kern_total_len = sup_page_size;
 	} else {
 		ctsio->residual = 0;
 		ctsio->kern_data_len = alloc_len;
 		ctsio->kern_total_len = alloc_len;
 	}
 	ctsio->kern_data_resid = 0;
 	ctsio->kern_rel_offset = 0;
 	ctsio->kern_sg_entries = 0;
 
 	/*
 	 * The control device is always connected.  The disk device, on the
 	 * other hand, may not be online all the time.  Need to change this
 	 * to figure out whether the disk device is actually online or not.
 	 */
 	if (lun != NULL)
 		pages->device = (SID_QUAL_LU_CONNECTED << 5) |
 				lun->be_lun->lun_type;
 	else
 		pages->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT;
 
 	/*
 	 * NOTE(review): the list below now fills indices 0 through 9, so
 	 * SCSI_EVPD_NUM_SUPPORTED_PAGES needs to be at least 10 -- confirm
 	 * against its definition.
 	 */
 	pages->length = SCSI_EVPD_NUM_SUPPORTED_PAGES;
 	/* Supported VPD pages */
 	pages->page_list[0] = SVPD_SUPPORTED_PAGES;
 	/* Serial Number */
 	pages->page_list[1] = SVPD_UNIT_SERIAL_NUMBER;
 	/* Device Identification */
 	pages->page_list[2] = SVPD_DEVICE_ID;
+	/* Extended INQUIRY Data */
+	pages->page_list[3] = SVPD_EXTENDED_INQUIRY_DATA;
 	/* Mode Page Policy */
-	pages->page_list[3] = SVPD_MODE_PAGE_POLICY;
+	pages->page_list[4] = SVPD_MODE_PAGE_POLICY;
 	/* SCSI Ports */
-	pages->page_list[4] = SVPD_SCSI_PORTS;
+	pages->page_list[5] = SVPD_SCSI_PORTS;
 	/* Third-party Copy */
-	pages->page_list[5] = SVPD_SCSI_TPC;
+	pages->page_list[6] = SVPD_SCSI_TPC;
 	/* Block limits */
-	pages->page_list[6] = SVPD_BLOCK_LIMITS;
+	pages->page_list[7] = SVPD_BLOCK_LIMITS;
 	/* Block Device Characteristics */
-	pages->page_list[7] = SVPD_BDC;
+	pages->page_list[8] = SVPD_BDC;
 	/* Logical Block Provisioning */
-	pages->page_list[8] = SVPD_LBP;
+	pages->page_list[9] = SVPD_LBP;
 
 	ctsio->scsi_status = SCSI_STATUS_OK;
 
 	/* Hand the buffer to the data-move path. */
 	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 	ctsio->be_move_done = ctl_config_move_done;
 	ctl_datamove((union ctl_io *)ctsio);
 
 	return (CTL_RETVAL_COMPLETE);
 }
 
 static int
 ctl_inquiry_evpd_serial(struct ctl_scsiio *ctsio, int alloc_len)
 {
 	struct scsi_vpd_unit_serial_number *sn_ptr;
 	struct ctl_lun *lun;
 
 	lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr;
 
 	ctsio->kern_data_ptr = malloc(sizeof(*sn_ptr), M_CTL, M_WAITOK | M_ZERO);
 	sn_ptr = (struct scsi_vpd_unit_serial_number *)ctsio->kern_data_ptr;
 	ctsio->kern_sg_entries = 0;
 
 	if (sizeof(*sn_ptr) < alloc_len) {
 		ctsio->residual = alloc_len - sizeof(*sn_ptr);
 		ctsio->kern_data_len = sizeof(*sn_ptr);
 		ctsio->kern_total_len = sizeof(*sn_ptr);
 	} else {
 		ctsio->residual = 0;
 		ctsio->kern_data_len = alloc_len;
 		ctsio->kern_total_len = alloc_len;
 	}
 	ctsio->kern_data_resid = 0;
 	ctsio->kern_rel_offset = 0;
 	ctsio->kern_sg_entries = 0;
 
 	/*
 	 * The control device is always connected.  The disk device, on the
 	 * other hand, may not be online all the time.  Need to change this
 	 * to figure out whether the disk device is actually online or not.
 	 */
 	if (lun != NULL)
 		sn_ptr->device = (SID_QUAL_LU_CONNECTED << 5) |
 				  lun->be_lun->lun_type;
 	else
 		sn_ptr->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT;
 
 	sn_ptr->page_code = SVPD_UNIT_SERIAL_NUMBER;
 	sn_ptr->length = ctl_min(sizeof(*sn_ptr) - 4, CTL_SN_LEN);
 	/*
 	 * If we don't have a LUN, we just leave the serial number as
 	 * all spaces.
 	 */
 	memset(sn_ptr->serial_num, 0x20, sizeof(sn_ptr->serial_num));
 	if (lun != NULL) {
 		strncpy((char *)sn_ptr->serial_num,
 			(char *)lun->be_lun->serial_num, CTL_SN_LEN);
 	}
 	ctsio->scsi_status = SCSI_STATUS_OK;
 
 	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 	ctsio->be_move_done = ctl_config_move_done;
 	ctl_datamove((union ctl_io *)ctsio);
 
 	return (CTL_RETVAL_COMPLETE);
 }
 
 
 static int
+ctl_inquiry_evpd_eid(struct ctl_scsiio *ctsio, int alloc_len)
+{
+	struct scsi_vpd_extended_inquiry_data *eid_ptr;
+	struct ctl_lun *lun;
+	int data_len;
+
+	lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr;
+
+	data_len = sizeof(struct scsi_vpd_mode_page_policy) +
+	    sizeof(struct scsi_vpd_mode_page_policy_descr);
+
+	ctsio->kern_data_ptr = malloc(data_len, M_CTL, M_WAITOK | M_ZERO);
+	eid_ptr = (struct scsi_vpd_extended_inquiry_data *)ctsio->kern_data_ptr;
+	ctsio->kern_sg_entries = 0;
+
+	if (data_len < alloc_len) {
+		ctsio->residual = alloc_len - data_len;
+		ctsio->kern_data_len = data_len;
+		ctsio->kern_total_len = data_len;
+	} else {
+		ctsio->residual = 0;
+		ctsio->kern_data_len = alloc_len;
+		ctsio->kern_total_len = alloc_len;
+	}
+	ctsio->kern_data_resid = 0;
+	ctsio->kern_rel_offset = 0;
+	ctsio->kern_sg_entries = 0;
+
+	/*
+	 * The control device is always connected.  The disk device, on the
+	 * other hand, may not be online all the time.
+	 */
+	if (lun != NULL)
+		eid_ptr->device = (SID_QUAL_LU_CONNECTED << 5) |
+				     lun->be_lun->lun_type;
+	else
+		eid_ptr->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT;
+	eid_ptr->page_code = SVPD_EXTENDED_INQUIRY_DATA;
+	eid_ptr->page_length = data_len - 4;
+	eid_ptr->flags2 = SVPD_EID_HEADSUP | SVPD_EID_ORDSUP | SVPD_EID_SIMPSUP;
+	eid_ptr->flags3 = SVPD_EID_V_SUP;
+
+	ctsio->scsi_status = SCSI_STATUS_OK;
+	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
+	ctsio->be_move_done = ctl_config_move_done;
+	ctl_datamove((union ctl_io *)ctsio);
+
+	return (CTL_RETVAL_COMPLETE);
+}
+
+/*
+ * Build the Mode Page Policy VPD page for an EVPD INQUIRY and start the
+ * data move.  The page carries a single policy descriptor.  At most
+ * alloc_len bytes are transferred.  Always returns CTL_RETVAL_COMPLETE.
+ */
+static int
 ctl_inquiry_evpd_mpp(struct ctl_scsiio *ctsio, int alloc_len)
 {
 	struct scsi_vpd_mode_page_policy *mpp_ptr;
 	struct scsi_vpd_mode_page_policy_descr *policy;
 	struct ctl_lun *lun;
 	int data_len;
 
 	/* Page header followed by exactly one descriptor. */
 	data_len = sizeof(struct scsi_vpd_mode_page_policy) +
 	    sizeof(struct scsi_vpd_mode_page_policy_descr);
 	lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr;
 
 	ctsio->kern_data_ptr = malloc(data_len, M_CTL, M_WAITOK | M_ZERO);
 	mpp_ptr = (struct scsi_vpd_mode_page_policy *)ctsio->kern_data_ptr;
 
 	/* Send the whole page, or only as much as the initiator asked for. */
 	if (alloc_len > data_len) {
 		ctsio->residual = alloc_len - data_len;
 		ctsio->kern_data_len = data_len;
 		ctsio->kern_total_len = data_len;
 	} else {
 		ctsio->residual = 0;
 		ctsio->kern_data_len = alloc_len;
 		ctsio->kern_total_len = alloc_len;
 	}
 	ctsio->kern_sg_entries = 0;
 	ctsio->kern_rel_offset = 0;
 	ctsio->kern_data_resid = 0;
 
 	/*
 	 * With a LUN we report it as connected with its own device type;
 	 * without one we report an offline direct access device.
 	 */
 	mpp_ptr->device = (lun != NULL) ?
 	    ((SID_QUAL_LU_CONNECTED << 5) | lun->be_lun->lun_type) :
 	    ((SID_QUAL_LU_OFFLINE << 5) | T_DIRECT);
 	mpp_ptr->page_code = SVPD_MODE_PAGE_POLICY;
 	scsi_ulto2b(data_len - 4, mpp_ptr->page_length);
 
 	policy = &mpp_ptr->descr[0];
 	policy->page_code = 0x3f;
 	policy->subpage_code = 0xff;
 	policy->policy = SVPD_MPP_SHARED;
 
 	ctsio->scsi_status = SCSI_STATUS_OK;
 	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 	ctsio->be_move_done = ctl_config_move_done;
 	ctl_datamove((union ctl_io *)ctsio);
 
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * Build the Device Identification VPD page for an EVPD INQUIRY and start
  * the data move.
  *
  * The descriptor list is laid out in this order: the LUN's descriptors
  * (when lun->lun_devid is set), the port's descriptors (when
  * port->port_devid is set), a relative target port descriptor, a target
  * port group descriptor, and the target's descriptors (when
  * port->target_devid is set).  At most alloc_len bytes are transferred.
  * Always returns CTL_RETVAL_COMPLETE.
  */
 static int
 ctl_inquiry_evpd_devid(struct ctl_scsiio *ctsio, int alloc_len)
 {
 	struct scsi_vpd_device_id *devid_ptr;
 	struct scsi_vpd_id_descriptor *desc;
 	struct ctl_softc *ctl_softc;
 	struct ctl_lun *lun;
 	struct ctl_port *port;
 	int data_len;
 	uint8_t proto;
 
 	ctl_softc = control_softc;
 
 	port = ctl_softc->ctl_ports[ctl_port_idx(ctsio->io_hdr.nexus.targ_port)];
 	lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr;
 
 	/*
 	 * Fixed part: page header plus the two descriptors generated here.
 	 * The optional, pre-built descriptor blobs are added on top.
 	 */
 	data_len = sizeof(struct scsi_vpd_device_id) +
 	    sizeof(struct scsi_vpd_id_descriptor) +
 		sizeof(struct scsi_vpd_id_rel_trgt_port_id) +
 	    sizeof(struct scsi_vpd_id_descriptor) +
 		sizeof(struct scsi_vpd_id_trgt_port_grp_id);
 	if (lun && lun->lun_devid)
 		data_len += lun->lun_devid->len;
 	if (port->port_devid)
 		data_len += port->port_devid->len;
 	if (port->target_devid)
 		data_len += port->target_devid->len;
 
 	ctsio->kern_data_ptr = malloc(data_len, M_CTL, M_WAITOK | M_ZERO);
 	devid_ptr = (struct scsi_vpd_device_id *)ctsio->kern_data_ptr;
 	ctsio->kern_sg_entries = 0;
 
 	/* Never send more than the initiator asked for. */
 	if (data_len < alloc_len) {
 		ctsio->residual = alloc_len - data_len;
 		ctsio->kern_data_len = data_len;
 		ctsio->kern_total_len = data_len;
 	} else {
 		ctsio->residual = 0;
 		ctsio->kern_data_len = alloc_len;
 		ctsio->kern_total_len = alloc_len;
 	}
 	ctsio->kern_data_resid = 0;
 	ctsio->kern_rel_offset = 0;
 	ctsio->kern_sg_entries = 0;
 
 	/*
 	 * The control device is always connected.  The disk device, on the
 	 * other hand, may not be online all the time.
 	 */
 	if (lun != NULL)
 		devid_ptr->device = (SID_QUAL_LU_CONNECTED << 5) |
 				     lun->be_lun->lun_type;
 	else
 		devid_ptr->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT;
 	devid_ptr->page_code = SVPD_DEVICE_ID;
 	scsi_ulto2b(data_len - 4, devid_ptr->length);
 
 	/* Protocol identifier goes in the upper nibble of proto_codeset. */
 	if (port->port_type == CTL_PORT_FC)
 		proto = SCSI_PROTO_FC << 4;
 	else if (port->port_type == CTL_PORT_ISCSI)
 		proto = SCSI_PROTO_ISCSI << 4;
 	else
 		proto = SCSI_PROTO_SPI << 4;
 	desc = (struct scsi_vpd_id_descriptor *)devid_ptr->desc_list;
 
 	/*
 	 * We're using a LUN association here.  i.e., this device ID is a
 	 * per-LUN identifier.
 	 */
 	if (lun && lun->lun_devid) {
 		memcpy(desc, lun->lun_devid->data, lun->lun_devid->len);
 		desc = (struct scsi_vpd_id_descriptor *)((uint8_t *)desc +
 		    lun->lun_devid->len);
 	}
 
 	/*
 	 * This is for the WWPN which is a port association.
 	 */
 	if (port->port_devid) {
 		memcpy(desc, port->port_devid->data, port->port_devid->len);
 		desc = (struct scsi_vpd_id_descriptor *)((uint8_t *)desc +
 		    port->port_devid->len);
 	}
 
 	/*
 	 * This is for the Relative Target Port(type 4h) identifier
 	 */
 	desc->proto_codeset = proto | SVPD_ID_CODESET_BINARY;
 	desc->id_type = SVPD_ID_PIV | SVPD_ID_ASSOC_PORT |
 	    SVPD_ID_TYPE_RELTARG;
 	desc->length = 4;
 	/* The port number occupies the last two bytes of the identifier. */
 	scsi_ulto2b(ctsio->io_hdr.nexus.targ_port, &desc->identifier[2]);
 	desc = (struct scsi_vpd_id_descriptor *)(&desc->identifier[0] +
 	    sizeof(struct scsi_vpd_id_rel_trgt_port_id));
 
 	/*
 	 * This is for the Target Port Group(type 5h) identifier
 	 */
 	desc->proto_codeset = proto | SVPD_ID_CODESET_BINARY;
 	desc->id_type = SVPD_ID_PIV | SVPD_ID_ASSOC_PORT |
 	    SVPD_ID_TYPE_TPORTGRP;
 	desc->length = 4;
 	/* Group numbers are 1-based: ports [0, CTL_MAX_PORTS) are group 1. */
 	scsi_ulto2b(ctsio->io_hdr.nexus.targ_port / CTL_MAX_PORTS + 1,
 	    &desc->identifier[2]);
 	desc = (struct scsi_vpd_id_descriptor *)(&desc->identifier[0] +
 	    sizeof(struct scsi_vpd_id_trgt_port_grp_id));
 
 	/*
 	 * This is for the Target identifier
 	 */
 	if (port->target_devid) {
 		memcpy(desc, port->target_devid->data, port->target_devid->len);
 	}
 
 	ctsio->scsi_status = SCSI_STATUS_OK;
 	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 	ctsio->be_move_done = ctl_config_move_done;
 	ctl_datamove((union ctl_io *)ctsio);
 
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * Build the SCSI Ports VPD page for an EVPD INQUIRY and start the data
  * move.
  *
  * One port designation is emitted per online port (restricted to ports
  * that map this LUN when a LUN is attached), repeated for each target
  * port group.  The port list is walked twice: once to size the buffer
  * and once to fill it.  At most alloc_len bytes are transferred.
  * Always returns CTL_RETVAL_COMPLETE.
  *
  * NOTE(review): ctl_lock is dropped between the sizing walk and the fill
  * walk.  If the set of online ports or their devids can change in that
  * window, the fill walk could write more than data_len bytes -- confirm.
  */
 static int
 ctl_inquiry_evpd_scsi_ports(struct ctl_scsiio *ctsio, int alloc_len)
 {
 	struct ctl_softc *softc = control_softc;
 	struct scsi_vpd_scsi_ports *sp;
 	struct scsi_vpd_port_designation *pd;
 	struct scsi_vpd_port_designation_cont *pdc;
 	struct ctl_lun *lun;
 	struct ctl_port *port;
 	int data_len, num_target_ports, iid_len, id_len, g, pg, p;
 	int num_target_port_groups, single;
 
 	lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr;
 
 	single = ctl_is_single;
 	if (single)
 		num_target_port_groups = 1;
 	else
 		num_target_port_groups = NUM_TARGET_PORT_GROUPS;
 	num_target_ports = 0;
 	iid_len = 0;
 	id_len = 0;
 	/* First pass: count eligible ports and total up their devid sizes. */
 	mtx_lock(&softc->ctl_lock);
 	STAILQ_FOREACH(port, &softc->port_list, links) {
 		if ((port->status & CTL_PORT_STATUS_ONLINE) == 0)
 			continue;
 		if (lun != NULL &&
 		    ctl_map_lun_back(port->targ_port, lun->lun) >=
 		    CTL_MAX_LUNS)
 			continue;
 		num_target_ports++;
 		if (port->init_devid)
 			iid_len += port->init_devid->len;
 		if (port->port_devid)
 			id_len += port->port_devid->len;
 	}
 	mtx_unlock(&softc->ctl_lock);
 
 	/*
 	 * Every (group, port) pair gets the fixed designation structures;
 	 * the devid payloads are only counted once because the fill loop
 	 * copies them for a single group only.
 	 */
 	data_len = sizeof(struct scsi_vpd_scsi_ports) + num_target_port_groups *
 	    num_target_ports * (sizeof(struct scsi_vpd_port_designation) +
 	     sizeof(struct scsi_vpd_port_designation_cont)) + iid_len + id_len;
 	ctsio->kern_data_ptr = malloc(data_len, M_CTL, M_WAITOK | M_ZERO);
 	sp = (struct scsi_vpd_scsi_ports *)ctsio->kern_data_ptr;
 	ctsio->kern_sg_entries = 0;
 
 	/* Never send more than the initiator asked for. */
 	if (data_len < alloc_len) {
 		ctsio->residual = alloc_len - data_len;
 		ctsio->kern_data_len = data_len;
 		ctsio->kern_total_len = data_len;
 	} else {
 		ctsio->residual = 0;
 		ctsio->kern_data_len = alloc_len;
 		ctsio->kern_total_len = alloc_len;
 	}
 	ctsio->kern_data_resid = 0;
 	ctsio->kern_rel_offset = 0;
 	ctsio->kern_sg_entries = 0;
 
 	/*
 	 * The control device is always connected.  The disk device, on the
 	 * other hand, may not be online all the time.  Need to change this
 	 * to figure out whether the disk device is actually online or not.
 	 */
 	if (lun != NULL)
 		sp->device = (SID_QUAL_LU_CONNECTED << 5) |
 				  lun->be_lun->lun_type;
 	else
 		sp->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT;
 
 	sp->page_code = SVPD_SCSI_PORTS;
 	scsi_ulto2b(data_len - sizeof(struct scsi_vpd_scsi_ports),
 	    sp->page_length);
 	pd = &sp->design[0];
 
 	/* Second pass: emit the designations. */
 	mtx_lock(&softc->ctl_lock);
 	/* pg is the group index whose entries carry the devid payloads. */
 	if (softc->flags & CTL_FLAG_MASTER_SHELF)
 		pg = 0;
 	else
 		pg = 1;
 	for (g = 0; g < num_target_port_groups; g++) {
 		STAILQ_FOREACH(port, &softc->port_list, links) {
 			if ((port->status & CTL_PORT_STATUS_ONLINE) == 0)
 				continue;
 			if (lun != NULL &&
 			    ctl_map_lun_back(port->targ_port, lun->lun) >=
 			    CTL_MAX_LUNS)
 				continue;
 			p = port->targ_port % CTL_MAX_PORTS + g * CTL_MAX_PORTS;
 			scsi_ulto2b(p, pd->relative_port_id);
 			if (port->init_devid && g == pg) {
 				iid_len = port->init_devid->len;
 				memcpy(pd->initiator_transportid,
 				    port->init_devid->data, port->init_devid->len);
 			} else
 				iid_len = 0;
 			scsi_ulto2b(iid_len, pd->initiator_transportid_length);
 			/* The continuation starts right after the transport ID. */
 			pdc = (struct scsi_vpd_port_designation_cont *)
 			    (&pd->initiator_transportid[iid_len]);
 			if (port->port_devid && g == pg) {
 				id_len = port->port_devid->len;
 				memcpy(pdc->target_port_descriptors,
 				    port->port_devid->data, port->port_devid->len);
 			} else
 				id_len = 0;
 			scsi_ulto2b(id_len, pdc->target_port_descriptors_length);
 			/* Next designation follows the descriptors just written. */
 			pd = (struct scsi_vpd_port_designation *)
 			    ((uint8_t *)pdc->target_port_descriptors + id_len);
 		}
 	}
 	mtx_unlock(&softc->ctl_lock);
 
 	ctsio->scsi_status = SCSI_STATUS_OK;
 	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 	ctsio->be_move_done = ctl_config_move_done;
 	ctl_datamove((union ctl_io *)ctsio);
 
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * Build the Block Limits VPD page for an EVPD INQUIRY and start the data
  * move.
  *
  * ctsio is the INQUIRY being serviced; alloc_len is the allocation length
  * from the CDB, and at most that many bytes are transferred.  The
  * LUN-dependent fields (optimal transfer length and the UNMAP limits) are
  * only filled in when a LUN is attached.  Always returns
  * CTL_RETVAL_COMPLETE.
  */
 static int
 ctl_inquiry_evpd_block_limits(struct ctl_scsiio *ctsio, int alloc_len)
 {
 	struct scsi_vpd_block_limits *bl_ptr;
 	struct ctl_lun *lun;
 	int bs;
 
 	lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr;
 
 	ctsio->kern_data_ptr = malloc(sizeof(*bl_ptr), M_CTL, M_WAITOK | M_ZERO);
 	bl_ptr = (struct scsi_vpd_block_limits *)ctsio->kern_data_ptr;
 	ctsio->kern_sg_entries = 0;
 
 	/* Never send more than the initiator asked for. */
 	if (sizeof(*bl_ptr) < alloc_len) {
 		ctsio->residual = alloc_len - sizeof(*bl_ptr);
 		ctsio->kern_data_len = sizeof(*bl_ptr);
 		ctsio->kern_total_len = sizeof(*bl_ptr);
 	} else {
 		ctsio->residual = 0;
 		ctsio->kern_data_len = alloc_len;
 		ctsio->kern_total_len = alloc_len;
 	}
 	ctsio->kern_data_resid = 0;
 	ctsio->kern_rel_offset = 0;
 	ctsio->kern_sg_entries = 0;
 
 	/*
 	 * The control device is always connected.  The disk device, on the
 	 * other hand, may not be online all the time.  Need to change this
 	 * to figure out whether the disk device is actually online or not.
 	 */
 	if (lun != NULL)
 		bl_ptr->device = (SID_QUAL_LU_CONNECTED << 5) |
 				  lun->be_lun->lun_type;
 	else
 		bl_ptr->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT;
 
 	bl_ptr->page_code = SVPD_BLOCK_LIMITS;
 	/*
 	 * The page length counts only the bytes that follow the 4-byte page
 	 * header, the same way the other VPD page builders in this file
 	 * report it.
 	 */
 	scsi_ulto2b(sizeof(*bl_ptr) - 4, bl_ptr->page_length);
 	bl_ptr->max_cmp_write_len = 0xff;
 	scsi_ulto4b(0xffffffff, bl_ptr->max_txfer_len);
 	if (lun != NULL) {
 		bs = lun->be_lun->blocksize;
 		/* Optimal transfer length is expressed in blocks. */
 		scsi_ulto4b(MAXPHYS / bs, bl_ptr->opt_txfer_len);
 		if (lun->be_lun->flags & CTL_LUN_FLAG_UNMAP) {
 			scsi_ulto4b(0xffffffff, bl_ptr->max_unmap_lba_cnt);
 			scsi_ulto4b(0xffffffff, bl_ptr->max_unmap_blk_cnt);
 			if (lun->be_lun->pblockexp != 0) {
 				scsi_ulto4b((1 << lun->be_lun->pblockexp),
 				    bl_ptr->opt_unmap_grain);
 				/* Top bit is set alongside the alignment value. */
 				scsi_ulto4b(0x80000000 | lun->be_lun->pblockoff,
 				    bl_ptr->unmap_grain_align);
 			}
 		}
 	}
 	scsi_u64to8b(UINT64_MAX, bl_ptr->max_write_same_length);
 
 	ctsio->scsi_status = SCSI_STATUS_OK;
 	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 	ctsio->be_move_done = ctl_config_move_done;
 	ctl_datamove((union ctl_io *)ctsio);
 
 	return (CTL_RETVAL_COMPLETE);
 }
 
 static int
 ctl_inquiry_evpd_bdc(struct ctl_scsiio *ctsio, int alloc_len)
 {
 	struct scsi_vpd_block_device_characteristics *bdc_ptr;
 	struct ctl_lun *lun;
 
 	lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr;
 
 	ctsio->kern_data_ptr = malloc(sizeof(*bdc_ptr), M_CTL, M_WAITOK | M_ZERO);
 	bdc_ptr = (struct scsi_vpd_block_device_characteristics *)ctsio->kern_data_ptr;
 	ctsio->kern_sg_entries = 0;
 
 	if (sizeof(*bdc_ptr) < alloc_len) {
 		ctsio->residual = alloc_len - sizeof(*bdc_ptr);
 		ctsio->kern_data_len = sizeof(*bdc_ptr);
 		ctsio->kern_total_len = sizeof(*bdc_ptr);
 	} else {
 		ctsio->residual = 0;
 		ctsio->kern_data_len = alloc_len;
 		ctsio->kern_total_len = alloc_len;
 	}
 	ctsio->kern_data_resid = 0;
 	ctsio->kern_rel_offset = 0;
 	ctsio->kern_sg_entries = 0;
 
 	/*
 	 * The control device is always connected.  The disk device, on the
 	 * other hand, may not be online all the time.  Need to change this
 	 * to figure out whether the disk device is actually online or not.
 	 */
 	if (lun != NULL)
 		bdc_ptr->device = (SID_QUAL_LU_CONNECTED << 5) |
 				  lun->be_lun->lun_type;
 	else
 		bdc_ptr->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT;
 	bdc_ptr->page_code = SVPD_BDC;
 	scsi_ulto2b(sizeof(*bdc_ptr) - 4, bdc_ptr->page_length);
 	scsi_ulto2b(SVPD_NON_ROTATING, bdc_ptr->medium_rotation_rate);
 	bdc_ptr->flags = SVPD_FUAB | SVPD_VBULS;
 
 	ctsio->scsi_status = SCSI_STATUS_OK;
 	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 	ctsio->be_move_done = ctl_config_move_done;
 	ctl_datamove((union ctl_io *)ctsio);
 
 	return (CTL_RETVAL_COMPLETE);
 }
 
 static int
 ctl_inquiry_evpd_lbp(struct ctl_scsiio *ctsio, int alloc_len)
 {
 	struct scsi_vpd_logical_block_prov *lbp_ptr;
 	struct ctl_lun *lun;
 
 	lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr;
 
 	ctsio->kern_data_ptr = malloc(sizeof(*lbp_ptr), M_CTL, M_WAITOK | M_ZERO);
 	lbp_ptr = (struct scsi_vpd_logical_block_prov *)ctsio->kern_data_ptr;
 	ctsio->kern_sg_entries = 0;
 
 	if (sizeof(*lbp_ptr) < alloc_len) {
 		ctsio->residual = alloc_len - sizeof(*lbp_ptr);
 		ctsio->kern_data_len = sizeof(*lbp_ptr);
 		ctsio->kern_total_len = sizeof(*lbp_ptr);
 	} else {
 		ctsio->residual = 0;
 		ctsio->kern_data_len = alloc_len;
 		ctsio->kern_total_len = alloc_len;
 	}
 	ctsio->kern_data_resid = 0;
 	ctsio->kern_rel_offset = 0;
 	ctsio->kern_sg_entries = 0;
 
 	/*
 	 * The control device is always connected.  The disk device, on the
 	 * other hand, may not be online all the time.  Need to change this
 	 * to figure out whether the disk device is actually online or not.
 	 */
 	if (lun != NULL)
 		lbp_ptr->device = (SID_QUAL_LU_CONNECTED << 5) |
 				  lun->be_lun->lun_type;
 	else
 		lbp_ptr->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT;
 
 	lbp_ptr->page_code = SVPD_LBP;
 	scsi_ulto2b(sizeof(*lbp_ptr) - 4, lbp_ptr->page_length);
 	if (lun != NULL && lun->be_lun->flags & CTL_LUN_FLAG_UNMAP) {
 		lbp_ptr->flags = SVPD_LBP_UNMAP | SVPD_LBP_WS16 |
 		    SVPD_LBP_WS10 | SVPD_LBP_RZ | SVPD_LBP_ANC_SUP;
 		lbp_ptr->prov_type = SVPD_LBP_RESOURCE;
 	}
 
 	ctsio->scsi_status = SCSI_STATUS_OK;
 	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 	ctsio->be_move_done = ctl_config_move_done;
 	ctl_datamove((union ctl_io *)ctsio);
 
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * Dispatch an EVPD INQUIRY to the builder for the requested VPD page.
  *
  * The page code and allocation length are taken from the CDB.  An
  * unrecognized page code completes the command with an invalid-field
  * error pointing at CDB byte 2.  Returns whatever the page builder
  * returns, or CTL_RETVAL_COMPLETE for the error case.
  */
 static int
 ctl_inquiry_evpd(struct ctl_scsiio *ctsio)
 {
 	struct scsi_inquiry *cdb;
 	struct ctl_lun *lun;
 	int alloc_len, retval;
 
 	/* NOTE(review): lun is assigned but not used in this function. */
 	lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr;
 	cdb = (struct scsi_inquiry *)ctsio->cdb;
 
 	retval = CTL_RETVAL_COMPLETE;
 
 	alloc_len = scsi_2btoul(cdb->length);
 
 	switch (cdb->page_code) {
 	case SVPD_SUPPORTED_PAGES:
 		retval = ctl_inquiry_evpd_supported(ctsio, alloc_len);
 		break;
 	case SVPD_UNIT_SERIAL_NUMBER:
 		retval = ctl_inquiry_evpd_serial(ctsio, alloc_len);
 		break;
 	case SVPD_DEVICE_ID:
 		retval = ctl_inquiry_evpd_devid(ctsio, alloc_len);
+		break;
+	case SVPD_EXTENDED_INQUIRY_DATA:
+		retval = ctl_inquiry_evpd_eid(ctsio, alloc_len);
 		break;
 	case SVPD_MODE_PAGE_POLICY:
 		retval = ctl_inquiry_evpd_mpp(ctsio, alloc_len);
 		break;
 	case SVPD_SCSI_PORTS:
 		retval = ctl_inquiry_evpd_scsi_ports(ctsio, alloc_len);
 		break;
 	case SVPD_SCSI_TPC:
 		retval = ctl_inquiry_evpd_tpc(ctsio, alloc_len);
 		break;
 	case SVPD_BLOCK_LIMITS:
 		retval = ctl_inquiry_evpd_block_limits(ctsio, alloc_len);
 		break;
 	case SVPD_BDC:
 		retval = ctl_inquiry_evpd_bdc(ctsio, alloc_len);
 		break;
 	case SVPD_LBP:
 		retval = ctl_inquiry_evpd_lbp(ctsio, alloc_len);
 		break;
 	default:
 		/* Unknown page: flag the page code field of the CDB. */
 		ctl_set_invalid_field(ctsio,
 				      /*sks_valid*/ 1,
 				      /*command*/ 1,
 				      /*field*/ 2,
 				      /*bit_valid*/ 0,
 				      /*bit*/ 0);
 		ctl_done((union ctl_io *)ctsio);
 		retval = CTL_RETVAL_COMPLETE;
 		break;
 	}
 
 	return (retval);
 }
 
 /*
  * Build the standard INQUIRY response and hand it to the data mover.
  *
  * The full scsi_inquiry_data structure is always allocated and filled in;
  * only the smaller of that size and the CDB allocation length is
  * transferred.  Vendor, product and revision strings come from the LUN's
  * backend options when present, otherwise from built-in defaults.  When
  * the allocation length is zero no data is moved and the command is
  * completed directly.  Always returns CTL_RETVAL_COMPLETE.
  */
 static int
 ctl_inquiry_std(struct ctl_scsiio *ctsio)
 {
 	struct scsi_inquiry_data *inq_ptr;
 	struct scsi_inquiry *cdb;
 	struct ctl_softc *ctl_softc;
 	struct ctl_lun *lun;
 	char *val;
 	uint32_t alloc_len;
 	ctl_port_type port_type;
 
 	ctl_softc = control_softc;
 
 	/*
 	 * Figure out whether we're talking to a Fibre Channel port or not.
 	 * We treat the ioctl front end, and any SCSI adapters, as packetized
 	 * SCSI front ends.
 	 */
 	port_type = ctl_softc->ctl_ports[
 	    ctl_port_idx(ctsio->io_hdr.nexus.targ_port)]->port_type;
 	if (port_type == CTL_PORT_IOCTL || port_type == CTL_PORT_INTERNAL)
 		port_type = CTL_PORT_SCSI;
 
 	lun = ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr;
 	cdb = (struct scsi_inquiry *)ctsio->cdb;
 	alloc_len = scsi_2btoul(cdb->length);
 
 	/*
 	 * We malloc the full inquiry data size here and fill it
 	 * in.  If the user only asks for less, we'll give him
 	 * that much.
 	 */
 	ctsio->kern_data_ptr = malloc(sizeof(*inq_ptr), M_CTL, M_WAITOK | M_ZERO);
 	inq_ptr = (struct scsi_inquiry_data *)ctsio->kern_data_ptr;
 	ctsio->kern_sg_entries = 0;
 	ctsio->kern_data_resid = 0;
 	ctsio->kern_rel_offset = 0;
 
 	if (sizeof(*inq_ptr) < alloc_len) {
 		ctsio->residual = alloc_len - sizeof(*inq_ptr);
 		ctsio->kern_data_len = sizeof(*inq_ptr);
 		ctsio->kern_total_len = sizeof(*inq_ptr);
 	} else {
 		ctsio->residual = 0;
 		ctsio->kern_data_len = alloc_len;
 		ctsio->kern_total_len = alloc_len;
 	}
 
 	/*
 	 * If we have a LUN configured, report it as connected.  Otherwise,
 	 * report that it is offline or no device is supported, depending
 	 * on the value of inquiry_pq_no_lun.
 	 *
 	 * According to the spec (SPC-4 r34), the peripheral qualifier
 	 * SID_QUAL_LU_OFFLINE (001b) is used in the following scenario:
 	 *
 	 * "A peripheral device having the specified peripheral device type
 	 * is not connected to this logical unit. However, the device
 	 * server is capable of supporting the specified peripheral device
 	 * type on this logical unit."
 	 *
 	 * According to the same spec, the peripheral qualifier
 	 * SID_QUAL_BAD_LU (011b) is used in this scenario:
 	 *
 	 * "The device server is not capable of supporting a peripheral
 	 * device on this logical unit. For this peripheral qualifier the
 	 * peripheral device type shall be set to 1Fh. All other peripheral
 	 * device type values are reserved for this peripheral qualifier."
 	 *
 	 * Given the text, it would seem that we probably want to report that
 	 * the LUN is offline here.  There is no LUN connected, but we can
 	 * support a LUN at the given LUN number.
 	 *
 	 * In the real world, though, it sounds like things are a little
 	 * different:
 	 *
 	 * - Linux, when presented with a LUN with the offline peripheral
 	 *   qualifier, will create an sg driver instance for it.  So when
 	 *   you attach it to CTL, you wind up with a ton of sg driver
 	 *   instances.  (One for every LUN that Linux bothered to probe.)
 	 *   Linux does this despite the fact that it issues a REPORT LUNs
 	 *   to LUN 0 to get the inventory of supported LUNs.
 	 *
 	 * - There is other anecdotal evidence (from Emulex folks) about
 	 *   arrays that use the offline peripheral qualifier for LUNs that
 	 *   are on the "passive" path in an active/passive array.
 	 *
 	 * So the solution is provide a hopefully reasonable default
 	 * (return bad/no LUN) and allow the user to change the behavior
 	 * with a tunable/sysctl variable.
 	 */
 	if (lun != NULL)
 		inq_ptr->device = (SID_QUAL_LU_CONNECTED << 5) |
 				  lun->be_lun->lun_type;
 	else if (ctl_softc->inquiry_pq_no_lun == 0)
 		inq_ptr->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT;
 	else
 		inq_ptr->device = (SID_QUAL_BAD_LU << 5) | T_NODEVICE;
 
 	/* RMB in byte 2 is 0 */
 	inq_ptr->version = SCSI_REV_SPC4;
 
 	/*
 	 * According to SAM-3, even if a device only supports a single
 	 * level of LUN addressing, it should still set the HISUP bit:
 	 *
 	 * 4.9.1 Logical unit numbers overview
 	 *
 	 * All logical unit number formats described in this standard are
 	 * hierarchical in structure even when only a single level in that
 	 * hierarchy is used. The HISUP bit shall be set to one in the
 	 * standard INQUIRY data (see SPC-2) when any logical unit number
 	 * format described in this standard is used.  Non-hierarchical
 	 * formats are outside the scope of this standard.
 	 *
 	 * Therefore we set the HiSup bit here.
 	 *
 	 * The response format is 2, per SPC-3.
 	 */
 	inq_ptr->response_format = SID_HiSup | 2;
 
 	/*
 	 * Additional length covers everything after the additional_length
 	 * byte itself, up to (not including) vendor_specific1.
 	 */
 	inq_ptr->additional_length =
 	    offsetof(struct scsi_inquiry_data, vendor_specific1) -
 	    (offsetof(struct scsi_inquiry_data, additional_length) + 1);
 	CTL_DEBUG_PRINT(("additional_length = %d\n",
 			 inq_ptr->additional_length));
 
 	inq_ptr->spc3_flags = SPC3_SID_3PC;
 	if (!ctl_is_single)
 		inq_ptr->spc3_flags |= SPC3_SID_TPGS_IMPLICIT;
 	/* 16 bit addressing */
 	if (port_type == CTL_PORT_SCSI)
 		inq_ptr->spc2_flags = SPC2_SID_ADDR16;
 	/* XXX set the SID_MultiP bit here if we're actually going to
 	   respond on multiple ports */
 	inq_ptr->spc2_flags |= SPC2_SID_MultiP;
 
 	/* 16 bit data bus, synchronous transfers */
 	if (port_type == CTL_PORT_SCSI)
 		inq_ptr->flags = SID_WBus16 | SID_Sync;
 	/*
 	 * XXX KDM do we want to support tagged queueing on the control
 	 * device at all?
 	 */
 	if ((lun == NULL)
 	 || (lun->be_lun->lun_type != T_PROCESSOR))
 		inq_ptr->flags |= SID_CmdQue;
 	/*
 	 * Per SPC-3, unused bytes in ASCII strings are filled with spaces.
 	 * We have 8 bytes for the vendor name, and 16 bytes for the device
 	 * name and 4 bytes for the revision.
 	 */
 	if (lun == NULL || (val = ctl_get_opt(&lun->be_lun->options,
 	    "vendor")) == NULL) {
 		strncpy(inq_ptr->vendor, CTL_VENDOR, sizeof(inq_ptr->vendor));
 	} else {
 		/* Space-fill first, then copy without a terminating NUL. */
 		memset(inq_ptr->vendor, ' ', sizeof(inq_ptr->vendor));
 		strncpy(inq_ptr->vendor, val,
 		    min(sizeof(inq_ptr->vendor), strlen(val)));
 	}
 	if (lun == NULL) {
 		strncpy(inq_ptr->product, CTL_DIRECT_PRODUCT,
 		    sizeof(inq_ptr->product));
 	} else if ((val = ctl_get_opt(&lun->be_lun->options, "product")) == NULL) {
 		/* No override configured: pick a default by device type. */
 		switch (lun->be_lun->lun_type) {
 		case T_DIRECT:
 			strncpy(inq_ptr->product, CTL_DIRECT_PRODUCT,
 			    sizeof(inq_ptr->product));
 			break;
 		case T_PROCESSOR:
 			strncpy(inq_ptr->product, CTL_PROCESSOR_PRODUCT,
 			    sizeof(inq_ptr->product));
 			break;
 		default:
 			strncpy(inq_ptr->product, CTL_UNKNOWN_PRODUCT,
 			    sizeof(inq_ptr->product));
 			break;
 		}
 	} else {
 		memset(inq_ptr->product, ' ', sizeof(inq_ptr->product));
 		strncpy(inq_ptr->product, val,
 		    min(sizeof(inq_ptr->product), strlen(val)));
 	}
 
 	/*
 	 * XXX make this a macro somewhere so it automatically gets
 	 * incremented when we make changes.
 	 */
 	if (lun == NULL || (val = ctl_get_opt(&lun->be_lun->options,
 	    "revision")) == NULL) {
 		strncpy(inq_ptr->revision, "0001", sizeof(inq_ptr->revision));
 	} else {
 		memset(inq_ptr->revision, ' ', sizeof(inq_ptr->revision));
 		strncpy(inq_ptr->revision, val,
 		    min(sizeof(inq_ptr->revision), strlen(val)));
 	}
 
 	/*
 	 * For parallel SCSI, we support double transition and single
 	 * transition clocking.  We also support QAS (Quick Arbitration
 	 * and Selection) and Information Unit transfers on both the
 	 * control and array devices.
 	 */
 	if (port_type == CTL_PORT_SCSI)
 		inq_ptr->spi3data = SID_SPI_CLOCK_DT_ST | SID_SPI_QAS |
 				    SID_SPI_IUS;
 
 	/* SAM-5 (no version claimed) */
 	scsi_ulto2b(0x00A0, inq_ptr->version1);
 	/* SPC-4 (no version claimed) */
 	scsi_ulto2b(0x0460, inq_ptr->version2);
 	if (port_type == CTL_PORT_FC) {
 		/* FCP-2 ANSI INCITS.350:2003 */
 		scsi_ulto2b(0x0917, inq_ptr->version3);
 	} else if (port_type == CTL_PORT_SCSI) {
 		/* SPI-4 ANSI INCITS.362:200x */
 		scsi_ulto2b(0x0B56, inq_ptr->version3);
 	} else if (port_type == CTL_PORT_ISCSI) {
 		/* iSCSI (no version claimed) */
 		scsi_ulto2b(0x0960, inq_ptr->version3);
 	} else if (port_type == CTL_PORT_SAS) {
 		/* SAS (no version claimed) */
 		scsi_ulto2b(0x0BE0, inq_ptr->version3);
 	}
 
 	if (lun == NULL) {
 		/* SBC-3 (no version claimed) */
 		scsi_ulto2b(0x04C0, inq_ptr->version4);
 	} else {
 		switch (lun->be_lun->lun_type) {
 		case T_DIRECT:
 			/* SBC-3 (no version claimed) */
 			scsi_ulto2b(0x04C0, inq_ptr->version4);
 			break;
 		case T_PROCESSOR:
 		default:
 			break;
 		}
 	}
 
 	ctsio->scsi_status = SCSI_STATUS_OK;
 	/* With a zero allocation length there is nothing to move. */
 	if (ctsio->kern_data_len > 0) {
 		ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 		ctsio->be_move_done = ctl_config_move_done;
 		ctl_datamove((union ctl_io *)ctsio);
 	} else {
 		ctsio->io_hdr.status = CTL_SUCCESS;
 		ctl_done((union ctl_io *)ctsio);
 	}
 
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * INQUIRY entry point.  Requests with the CmdDt bit set are rejected with
  * an invalid-field error; otherwise the command is routed to the EVPD or
  * the standard INQUIRY handler and that handler's return value is passed
  * back.
  */
 int
 ctl_inquiry(struct ctl_scsiio *ctsio)
 {
 	struct scsi_inquiry *cdb;
 
 	CTL_DEBUG_PRINT(("ctl_inquiry\n"));
 
 	cdb = (struct scsi_inquiry *)ctsio->cdb;
 
 	/*
 	 * CmdDt inquiry information is not supported right now.  Once it
 	 * is, this check should instead make sure that SI_EVPD and SI_CMDDT
 	 * are not both set at the same time.
 	 */
 #ifdef notyet
 	if (((cdb->byte2 & SI_EVPD)
 	 && (cdb->byte2 & SI_CMDDT)))
 #endif
 	if (cdb->byte2 & SI_CMDDT) {
 		/*
 		 * Report the SI_CMDDT bit as the offender.  Even with
 		 * SI_CMDDT support both bits would be "wrong" here, so this
 		 * can probably stay as it is.
 		 */
 		ctl_set_invalid_field(ctsio,
 				      /*sks_valid*/ 1,
 				      /*command*/ 1,
 				      /*field*/ 1,
 				      /*bit_valid*/ 1,
 				      /*bit*/ 1);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	if (cdb->byte2 & SI_EVPD)
 		return (ctl_inquiry_evpd(ctsio));
 #ifdef notyet
 	if (cdb->byte2 & SI_CMDDT)
 		return (ctl_inquiry_cmddt(ctsio));
 #endif
 	return (ctl_inquiry_std(ctsio));
 }
 
 /*
  * For known CDB types, parse the LBA and length.
  *
  * io:  the I/O whose CDB is decoded; only CTL_IO_SCSI I/Os are handled.
  * lba: out, starting logical block address taken from the CDB.
  * len: out, number of logical blocks covered by the command.
  *
  * Returns 0 when *lba and *len have been filled in, and 1 when the I/O is
  * not a SCSI I/O or the opcode is not one of the block commands decoded
  * below (in which case *lba and *len are left untouched).
  */
 static int
 ctl_get_lba_len(union ctl_io *io, uint64_t *lba, uint32_t *len)
 {
 	if (io->io_hdr.io_type != CTL_IO_SCSI)
 		return (1);
 
 	switch (io->scsiio.cdb[0]) {
 	case COMPARE_AND_WRITE: {
 		struct scsi_compare_and_write *cdb;
 
 		cdb = (struct scsi_compare_and_write *)io->scsiio.cdb;
 
 		*lba = scsi_8btou64(cdb->addr);
 		*len = cdb->length;
 		break;
 	}
 	case READ_6:
 	case WRITE_6: {
 		struct scsi_rw_6 *cdb;
 
 		cdb = (struct scsi_rw_6 *)io->scsiio.cdb;
 
 		*lba = scsi_3btoul(cdb->addr);
 		/* only 5 bits are valid in the most significant address byte */
 		*lba &= 0x1fffff;
 		*len = cdb->length;
 		/*
 		 * For the 6-byte read/write commands a transfer length of
 		 * zero means 256 blocks (SBC).  Report the real extent so
 		 * that overlap checks do not treat the command as empty.
 		 */
 		if (*len == 0)
 			*len = 256;
 		break;
 	}
 	case READ_10:
 	case WRITE_10: {
 		struct scsi_rw_10 *cdb;
 
 		cdb = (struct scsi_rw_10 *)io->scsiio.cdb;
 
 		*lba = scsi_4btoul(cdb->addr);
 		*len = scsi_2btoul(cdb->length);
 		break;
 	}
 	case WRITE_VERIFY_10: {
 		struct scsi_write_verify_10 *cdb;
 
 		cdb = (struct scsi_write_verify_10 *)io->scsiio.cdb;
 
 		*lba = scsi_4btoul(cdb->addr);
 		*len = scsi_2btoul(cdb->length);
 		break;
 	}
 	case READ_12:
 	case WRITE_12: {
 		struct scsi_rw_12 *cdb;
 
 		cdb = (struct scsi_rw_12 *)io->scsiio.cdb;
 
 		*lba = scsi_4btoul(cdb->addr);
 		*len = scsi_4btoul(cdb->length);
 		break;
 	}
 	case WRITE_VERIFY_12: {
 		struct scsi_write_verify_12 *cdb;
 
 		cdb = (struct scsi_write_verify_12 *)io->scsiio.cdb;
 
 		*lba = scsi_4btoul(cdb->addr);
 		*len = scsi_4btoul(cdb->length);
 		break;
 	}
 	case READ_16:
 	case WRITE_16: {
 		struct scsi_rw_16 *cdb;
 
 		cdb = (struct scsi_rw_16 *)io->scsiio.cdb;
 
 		*lba = scsi_8btou64(cdb->addr);
 		*len = scsi_4btoul(cdb->length);
 		break;
 	}
 	case WRITE_VERIFY_16: {
 		struct scsi_write_verify_16 *cdb;
 
 		cdb = (struct scsi_write_verify_16 *)io->scsiio.cdb;
 
 		*lba = scsi_8btou64(cdb->addr);
 		*len = scsi_4btoul(cdb->length);
 		break;
 	}
 	case WRITE_SAME_10: {
 		struct scsi_write_same_10 *cdb;
 
 		cdb = (struct scsi_write_same_10 *)io->scsiio.cdb;
 
 		*lba = scsi_4btoul(cdb->addr);
 		*len = scsi_2btoul(cdb->length);
 		break;
 	}
 	case WRITE_SAME_16: {
 		struct scsi_write_same_16 *cdb;
 
 		cdb = (struct scsi_write_same_16 *)io->scsiio.cdb;
 
 		*lba = scsi_8btou64(cdb->addr);
 		*len = scsi_4btoul(cdb->length);
 		break;
 	}
 	case VERIFY_10: {
 		struct scsi_verify_10 *cdb;
 
 		cdb = (struct scsi_verify_10 *)io->scsiio.cdb;
 
 		*lba = scsi_4btoul(cdb->addr);
 		*len = scsi_2btoul(cdb->length);
 		break;
 	}
 	case VERIFY_12: {
 		struct scsi_verify_12 *cdb;
 
 		cdb = (struct scsi_verify_12 *)io->scsiio.cdb;
 
 		*lba = scsi_4btoul(cdb->addr);
 		*len = scsi_4btoul(cdb->length);
 		break;
 	}
 	case VERIFY_16: {
 		struct scsi_verify_16 *cdb;
 
 		cdb = (struct scsi_verify_16 *)io->scsiio.cdb;
 
 		*lba = scsi_8btou64(cdb->addr);
 		*len = scsi_4btoul(cdb->length);
 		break;
 	}
 	default:
 		/* Not a command whose extent we know how to decode. */
 		return (1);
 		break; /* NOTREACHED */
 	}
 
 	return (0);
 }
 
 /*
  * Compare two block extents, each given as a starting LBA and a block
  * count.  Returns CTL_ACTION_BLOCK when the extents share at least one
  * block and CTL_ACTION_PASS when they are disjoint.
  */
 static ctl_action
 ctl_extent_check_lba(uint64_t lba1, uint32_t len1, uint64_t lba2, uint32_t len2)
 {
 	uint64_t last1, last2;
 
 	/* Last block touched by each extent (inclusive). */
 	last1 = lba1 + len1 - 1;
 	last2 = lba2 + len2 - 1;
 
 	/* They overlap only if neither one ends before the other starts. */
 	if ((last1 >= lba2) && (last2 >= lba1))
 		return (CTL_ACTION_BLOCK);
 
 	return (CTL_ACTION_PASS);
 }
 
 /*
  * Decode the LBA range of both I/Os and check whether the ranges overlap.
  * Returns CTL_ACTION_ERROR if either CDB cannot be decoded by
  * ctl_get_lba_len(), otherwise the result of ctl_extent_check_lba().
  */
 static ctl_action
 ctl_extent_check(union ctl_io *io1, union ctl_io *io2)
 {
 	uint64_t start1, start2;
 	uint32_t count1, count2;
 
 	/* io2 is only decoded when io1 decoded successfully. */
 	if ((ctl_get_lba_len(io1, &start1, &count1) != 0)
 	 || (ctl_get_lba_len(io2, &start2, &count2) != 0))
 		return (CTL_ACTION_ERROR);
 
 	return (ctl_extent_check_lba(start1, count1, start2, count2));
 }
 
 static ctl_action
 ctl_check_for_blockage(union ctl_io *pending_io, union ctl_io *ooa_io)
 {
 	const struct ctl_cmd_entry *pending_entry, *ooa_entry;
 	ctl_serialize_action *serialize_row;
 
 	/*
 	 * The initiator attempted multiple untagged commands at the same
 	 * time.  Can't do that.
 	 */
 	if ((pending_io->scsiio.tag_type == CTL_TAG_UNTAGGED)
 	 && (ooa_io->scsiio.tag_type == CTL_TAG_UNTAGGED)
 	 && ((pending_io->io_hdr.nexus.targ_port ==
 	      ooa_io->io_hdr.nexus.targ_port)
 	  && (pending_io->io_hdr.nexus.initid.id ==
 	      ooa_io->io_hdr.nexus.initid.id))
 	 && ((ooa_io->io_hdr.flags & CTL_FLAG_ABORT) == 0))
 		return (CTL_ACTION_OVERLAP);
 
 	/*
 	 * The initiator attempted to send multiple tagged commands with
 	 * the same ID.  (It's fine if different initiators have the same
 	 * tag ID.)
 	 *
 	 * Even if all of those conditions are true, we don't kill the I/O
 	 * if the command ahead of us has been aborted.  We won't end up
 	 * sending it to the FETD, and it's perfectly legal to resend a
 	 * command with the same tag number as long as the previous
 	 * instance of this tag number has been aborted somehow.
 	 */
 	if ((pending_io->scsiio.tag_type != CTL_TAG_UNTAGGED)
 	 && (ooa_io->scsiio.tag_type != CTL_TAG_UNTAGGED)
 	 && (pending_io->scsiio.tag_num == ooa_io->scsiio.tag_num)
 	 && ((pending_io->io_hdr.nexus.targ_port ==
 	      ooa_io->io_hdr.nexus.targ_port)
 	  && (pending_io->io_hdr.nexus.initid.id ==
 	      ooa_io->io_hdr.nexus.initid.id))
 	 && ((ooa_io->io_hdr.flags & CTL_FLAG_ABORT) == 0))
 		return (CTL_ACTION_OVERLAP_TAG);
 
 	/*
 	 * If we get a head of queue tag, SAM-3 says that we should
 	 * immediately execute it.
 	 *
 	 * What happens if this command would normally block for some other
 	 * reason?  e.g. a request sense with a head of queue tag
 	 * immediately after a write.  Normally that would block, but this
 	 * will result in its getting executed immediately...
 	 *
 	 * We currently return "pass" instead of "skip", so we'll end up
 	 * going through the rest of the queue to check for overlapped tags.
 	 *
 	 * XXX KDM check for other types of blockage first??
 	 */
 	if (pending_io->scsiio.tag_type == CTL_TAG_HEAD_OF_QUEUE)
 		return (CTL_ACTION_PASS);
 
 	/*
 	 * Ordered tags have to block until all items ahead of them
 	 * have completed.  If we get called with an ordered tag, we always
 	 * block, if something else is ahead of us in the queue.
 	 */
 	if (pending_io->scsiio.tag_type == CTL_TAG_ORDERED)
 		return (CTL_ACTION_BLOCK);
 
 	/*
 	 * Simple tags get blocked until all head of queue and ordered tags
 	 * ahead of them have completed.  I'm lumping untagged commands in
 	 * with simple tags here.  XXX KDM is that the right thing to do?
 	 */
 	if (((pending_io->scsiio.tag_type == CTL_TAG_UNTAGGED)
 	  || (pending_io->scsiio.tag_type == CTL_TAG_SIMPLE))
 	 && ((ooa_io->scsiio.tag_type == CTL_TAG_HEAD_OF_QUEUE)
 	  || (ooa_io->scsiio.tag_type == CTL_TAG_ORDERED)))
 		return (CTL_ACTION_BLOCK);
 
 	pending_entry = ctl_get_cmd_entry(&pending_io->scsiio);
 	ooa_entry = ctl_get_cmd_entry(&ooa_io->scsiio);
 
 	serialize_row = ctl_serialize_table[ooa_entry->seridx];
 
 	switch (serialize_row[pending_entry->seridx]) {
 	case CTL_SER_BLOCK:
 		return (CTL_ACTION_BLOCK);
 		break; /* NOTREACHED */
 	case CTL_SER_EXTENT:
 		return (ctl_extent_check(pending_io, ooa_io));
 		break; /* NOTREACHED */
 	case CTL_SER_PASS:
 		return (CTL_ACTION_PASS);
 		break; /* NOTREACHED */
 	case CTL_SER_SKIP:
 		return (CTL_ACTION_SKIP);
 		break;
 	default:
 		panic("invalid serialization value %d",
 		      serialize_row[pending_entry->seridx]);
 		break; /* NOTREACHED */
 	}
 
 	return (CTL_ACTION_ERROR);
 }
 
 /*
  * Check pending_io for blockage or overlap against the OOA (Order Of
  * Arrival) queue, walking backwards from starting_io to the queue head.
  *
  * Assumptions:
  * - pending_io is generally either incoming, or on the blocked queue
  * - starting_io is the first I/O to compare against; when it is NULL the
  *   result is CTL_ACTION_PASS.
  * - the caller holds lun->lun_lock.
  *
  * The first result other than CTL_ACTION_PASS ends the walk and is
  * returned.  Nothing is queued or dequeued here.
  */
 static ctl_action
 ctl_check_ooa(struct ctl_lun *lun, union ctl_io *pending_io,
 	      union ctl_io *starting_io)
 {
 	union ctl_io *earlier_io;
 	ctl_action verdict;
 
 	mtx_assert(&lun->lun_lock, MA_OWNED);
 
 	earlier_io = starting_io;
 	while (earlier_io != NULL) {
 		/* Is pending_io held up by this particular earlier I/O? */
 		verdict = ctl_check_for_blockage(pending_io, earlier_io);
 		switch (verdict) {
 		case CTL_ACTION_PASS:
 			/* No conflict with this one, keep looking. */
 			break;
 		case CTL_ACTION_BLOCK:
 		case CTL_ACTION_OVERLAP:
 		case CTL_ACTION_OVERLAP_TAG:
 		case CTL_ACTION_SKIP:
 		case CTL_ACTION_ERROR:
 			return (verdict);
 		default:
 			panic("invalid action %d", verdict);
 			break;  /* NOTREACHED */
 		}
 
 		earlier_io = (union ctl_io *)TAILQ_PREV(&earlier_io->io_hdr,
 		    ctl_ooaq, ooa_links);
 	}
 
 	return (CTL_ACTION_PASS);
 }
 
 /*
  * Re-evaluate every I/O on the LUN's blocked queue and release the ones
  * that are no longer blocked.
  *
  * Assumptions:
  * - An I/O has just completed, and has been removed from the per-LUN OOA
  *   queue, so some items on the blocked queue may now be unblocked.
  * - The caller holds lun->lun_lock (asserted below).
  *
  * Always returns CTL_RETVAL_COMPLETE.
  */
 static int
 ctl_check_blocked(struct ctl_lun *lun)
 {
 	union ctl_io *cur_blocked, *next_blocked;
 
 	mtx_assert(&lun->lun_lock, MA_OWNED);
 
 	/*
 	 * Run forward from the head of the blocked queue, checking each
 	 * entry against the I/Os prior to it on the OOA queue to see if
 	 * there is still any blockage.
 	 *
 	 * We cannot use the TAILQ_FOREACH() macro, because it can't deal
 	 * with our removing a variable on it while it is traversing the
 	 * list.
 	 */
 	for (cur_blocked = (union ctl_io *)TAILQ_FIRST(&lun->blocked_queue);
 	     cur_blocked != NULL; cur_blocked = next_blocked) {
 		union ctl_io *prev_ooa;
 		ctl_action action;
 
 		/* Fetch the successor first; cur_blocked may be unlinked below. */
 		next_blocked = (union ctl_io *)TAILQ_NEXT(&cur_blocked->io_hdr,
 							  blocked_links);
 
 		prev_ooa = (union ctl_io *)TAILQ_PREV(&cur_blocked->io_hdr,
 						      ctl_ooaq, ooa_links);
 
 		/*
 		 * If cur_blocked happens to be the first item in the OOA
 		 * queue now, prev_ooa will be NULL, and the action
 		 * returned will just be CTL_ACTION_PASS.
 		 */
 		action = ctl_check_ooa(lun, cur_blocked, prev_ooa);
 
 		switch (action) {
 		case CTL_ACTION_BLOCK:
 			/* Nothing to do here, still blocked */
 			break;
 		case CTL_ACTION_OVERLAP:
 		case CTL_ACTION_OVERLAP_TAG:
 			/*
 			 * This shouldn't happen!  In theory we've already
 			 * checked this command for overlap...
 			 * The I/O is left on the blocked queue untouched.
 			 */
 			break;
 		case CTL_ACTION_PASS:
 		case CTL_ACTION_SKIP: {
 			struct ctl_softc *softc;
 			const struct ctl_cmd_entry *entry;
 			uint32_t initidx;
 			int isc_retval;
 
 			/*
 			 * The skip case shouldn't happen, this transaction
 			 * should have never made it onto the blocked queue.
 			 */
 			/*
 			 * This I/O is no longer blocked, we can remove it
 			 * from the blocked queue.  Since this is a TAILQ
 			 * (doubly linked list), we can do O(1) removals
 			 * from any place on the list.
 			 */
 			TAILQ_REMOVE(&lun->blocked_queue, &cur_blocked->io_hdr,
 				     blocked_links);
 			cur_blocked->io_hdr.flags &= ~CTL_FLAG_BLOCKED;
 
 			if (cur_blocked->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC){
 				/*
 				 * Need to send IO back to original side to
 				 * run
 				 */
 				union ctl_ha_msg msg_info;
 
 				msg_info.hdr.original_sc =
 					cur_blocked->io_hdr.original_sc;
 				msg_info.hdr.serializing_sc = cur_blocked;
 				msg_info.hdr.msg_type = CTL_MSG_R2R;
 				/*
 				 * A send failure is only logged; the I/O has
 				 * already been taken off the blocked queue.
 				 */
 				if ((isc_retval=ctl_ha_msg_send(CTL_HA_CHAN_CTL,
 				     &msg_info, sizeof(msg_info), 0)) >
 				     CTL_HA_STATUS_SUCCESS) {
 					printf("CTL:Check Blocked error from "
 					       "ctl_ha_msg_send %d\n",
 					       isc_retval);
 				}
 				break;
 			}
 			entry = ctl_get_cmd_entry(&cur_blocked->scsiio);
 			softc = control_softc;
 
 			/* NOTE(review): initidx is computed but not used below. */
 			initidx = ctl_get_initindex(&cur_blocked->io_hdr.nexus);
 
 			/*
 			 * Check this I/O for LUN state changes that may
 			 * have happened while this command was blocked.
 			 * The LUN state may have been changed by a command
 			 * ahead of us in the queue, so we need to re-check
 			 * for any states that can be caused by SCSI
 			 * commands.
 			 */
 			if (ctl_scsiio_lun_check(softc, lun, entry,
 						 &cur_blocked->scsiio) == 0) {
 				/* Checks passed: hand the I/O to the RtR queue. */
 				cur_blocked->io_hdr.flags |=
 				                      CTL_FLAG_IS_WAS_ON_RTR;
 				ctl_enqueue_rtr(cur_blocked);
 			} else
 				/* The check filled in an error status; complete it. */
 				ctl_done(cur_blocked);
 			break;
 		}
 		default:
 			/*
 			 * This probably shouldn't happen -- we shouldn't
 			 * get CTL_ACTION_ERROR, or anything else.
 			 */
 			break;
 		}
 	}
 
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * This routine (with one exception) checks LUN flags that can be set by
  * commands ahead of us in the OOA queue.  These flags have to be checked
  * when a command initially comes in, and when we pull a command off the
  * blocked queue and are preparing to execute it.  The reason we have to
  * check these flags for commands on the blocked queue is that the LUN
  * state may have been changed by a command ahead of us while we're on the
  * blocked queue.
  *
  * Ordering is somewhat important with these checks, so please pay
  * careful attention to the placement of any new checks.
  */
 static int
 ctl_scsiio_lun_check(struct ctl_softc *ctl_softc, struct ctl_lun *lun,
     const struct ctl_cmd_entry *entry, struct ctl_scsiio *ctsio)
 {
 	int retval;
 
 	retval = 0;
 
 	mtx_assert(&lun->lun_lock, MA_OWNED);
 
 	/*
 	 * If this shelf is a secondary shelf controller, we have to reject
 	 * any media access commands.
 	 */
 #if 0
 	/* No longer needed for HA */
 	if (((ctl_softc->flags & CTL_FLAG_MASTER_SHELF) == 0)
 	 && ((entry->flags & CTL_CMD_FLAG_OK_ON_SECONDARY) == 0)) {
 		ctl_set_lun_standby(ctsio);
 		retval = 1;
 		goto bailout;
 	}
 #endif
 
 	/*
 	 * Check for a reservation conflict.  If this command isn't allowed
 	 * even on reserved LUNs, and if this initiator isn't the one who
 	 * reserved us, reject the command with a reservation conflict.
 	 */
 	if ((lun->flags & CTL_LUN_RESERVED)
 	 && ((entry->flags & CTL_CMD_FLAG_ALLOW_ON_RESV) == 0)) {
 		if ((ctsio->io_hdr.nexus.initid.id != lun->rsv_nexus.initid.id)
 		 || (ctsio->io_hdr.nexus.targ_port != lun->rsv_nexus.targ_port)
 		 || (ctsio->io_hdr.nexus.targ_target.id !=
 		     lun->rsv_nexus.targ_target.id)) {
 			ctsio->scsi_status = SCSI_STATUS_RESERV_CONFLICT;
 			ctsio->io_hdr.status = CTL_SCSI_ERROR;
 			retval = 1;
 			goto bailout;
 		}
 	}
 
 	if ( (lun->flags & CTL_LUN_PR_RESERVED)
 	 && ((entry->flags & CTL_CMD_FLAG_ALLOW_ON_PR_RESV) == 0)) {
 		uint32_t residx;
 
 		residx = ctl_get_resindex(&ctsio->io_hdr.nexus);
 		/*
 		 * if we aren't registered or it's a res holder type
 		 * reservation and this isn't the res holder then set a
 		 * conflict.
 		 * NOTE: Commands which might be allowed on write exclusive
 		 * type reservations are checked in the particular command
 		 * for a conflict. Read and SSU are the only ones.
 		 */
 		if (!lun->per_res[residx].registered
 		 || (residx != lun->pr_res_idx && lun->res_type < 4)) {
 			ctsio->scsi_status = SCSI_STATUS_RESERV_CONFLICT;
 			ctsio->io_hdr.status = CTL_SCSI_ERROR;
 			retval = 1;
 			goto bailout;
 		}
 
 	}
 
 	if ((lun->flags & CTL_LUN_OFFLINE)
 	 && ((entry->flags & CTL_CMD_FLAG_OK_ON_OFFLINE) == 0)) {
 		ctl_set_lun_not_ready(ctsio);
 		retval = 1;
 		goto bailout;
 	}
 
 	/*
 	 * If the LUN is stopped, see if this particular command is allowed
 	 * for a stopped lun.  Otherwise, reject it with 0x04,0x02.
 	 */
 	if ((lun->flags & CTL_LUN_STOPPED)
 	 && ((entry->flags & CTL_CMD_FLAG_OK_ON_STOPPED) == 0)) {
 		/* "Logical unit not ready, initializing cmd. required" */
 		ctl_set_lun_stopped(ctsio);
 		retval = 1;
 		goto bailout;
 	}
 
 	if ((lun->flags & CTL_LUN_INOPERABLE)
 	 && ((entry->flags & CTL_CMD_FLAG_OK_ON_INOPERABLE) == 0)) {
 		/* "Medium format corrupted" */
 		ctl_set_medium_format_corrupted(ctsio);
 		retval = 1;
 		goto bailout;
 	}
 
 bailout:
 	return (retval);
 
 }
 
 /*
  * Complete an I/O with busy status.  The have_lock argument is accepted
  * but not used.
  */
 static void
 ctl_failover_io(union ctl_io *io, int have_lock)
 {
 	struct ctl_scsiio *ctsio = &io->scsiio;
 
 	ctl_set_busy(ctsio);
 	ctl_done(io);
 }
 
 /*
  * Handle a failover event for every LUN known to this controller.
  *
  * Runs with ctl_lock held for the whole pass.  Processor LUNs are left
  * alone.  For every other LUN the work depends on whether this side is
  * currently primary for the LUN (CTL_LUN_PRIMARY_SC) and on the HA mode:
  *
  * - primary, SER_ONLY:   free every I/O that came from the other SC, then
  *                        re-run the blocked queue.
  * - primary, XFER:       mark every I/O from the other SC as aborted.
  * - secondary, XFER:     become primary; return queued I/O with busy
  *                        status (or flag active I/O with FAILOVER) and
  *                        post an asymmetric access change unit attention.
  * - secondary, SER_ONLY: become primary; re-serialize the OOA queue
  *                        locally and post the same unit attention.
  *
  * Any other combination panics.  ctl_pause_rtr is cleared at the end.
  *
  * NOTE(review): ctl_check_blocked() and ctl_check_ooa() assert that
  * lun->lun_lock is owned, but only ctl_lock is visibly taken in this
  * function -- confirm the locking before relying on this path.
  */
 static void
 ctl_failover(void)
 {
 	struct ctl_lun *lun;
 	struct ctl_softc *ctl_softc;
 	union ctl_io *next_io, *pending_io;
 	union ctl_io *io;
 	int lun_idx;
 	int i;
 
 	ctl_softc = control_softc;
 
 	mtx_lock(&ctl_softc->ctl_lock);
 	/*
 	 * Remove any cmds from the other SC from the rtr queue.  These
 	 * will obviously only be for LUNs for which we're the primary.
 	 * We can't send status or get/send data for these commands.
 	 * Since they haven't been executed yet, we can just remove them.
 	 * We'll either abort them or delete them below, depending on
 	 * which HA mode we're in.
 	 */
 #ifdef notyet
 	mtx_lock(&ctl_softc->queue_lock);
 	for (io = (union ctl_io *)STAILQ_FIRST(&ctl_softc->rtr_queue);
 	     io != NULL; io = next_io) {
 		next_io = (union ctl_io *)STAILQ_NEXT(&io->io_hdr, links);
 		if (io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC)
 			STAILQ_REMOVE(&ctl_softc->rtr_queue, &io->io_hdr,
 				      ctl_io_hdr, links);
 	}
 	mtx_unlock(&ctl_softc->queue_lock);
 #endif
 
 	for (lun_idx=0; lun_idx < ctl_softc->num_luns; lun_idx++) {
 		lun = ctl_softc->ctl_luns[lun_idx];
 		if (lun==NULL)
 			continue;
 
 		/*
 		 * Processor LUNs are primary on both sides.
 		 * XXX will this always be true?
 		 */
 		if (lun->be_lun->lun_type == T_PROCESSOR)
 			continue;
 
 		if ((lun->flags & CTL_LUN_PRIMARY_SC)
 		 && (ctl_softc->ha_mode == CTL_HA_MODE_SER_ONLY)) {
 			printf("FAILOVER: primary lun %d\n", lun_idx);
 			/*
 			 * Remove all commands from the other SC. First from the
 			 * blocked queue then from the ooa queue. Once we have
 			 * removed them. Call ctl_check_blocked to see if there
 			 * is anything that can run.
 			 */
 			for (io = (union ctl_io *)TAILQ_FIRST(
 			     &lun->blocked_queue); io != NULL; io = next_io) {
 
 				/* Grab the successor before io is freed. */
 				next_io = (union ctl_io *)TAILQ_NEXT(
 				    &io->io_hdr, blocked_links);
 
 				if (io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC) {
 					TAILQ_REMOVE(&lun->blocked_queue,
 						     &io->io_hdr,blocked_links);
 					io->io_hdr.flags &= ~CTL_FLAG_BLOCKED;
 					TAILQ_REMOVE(&lun->ooa_queue,
 						     &io->io_hdr, ooa_links);
 
 					ctl_free_io(io);
 				}
 			}
 
 			for (io = (union ctl_io *)TAILQ_FIRST(&lun->ooa_queue);
 			     io != NULL; io = next_io) {
 
 				next_io = (union ctl_io *)TAILQ_NEXT(
 				    &io->io_hdr, ooa_links);
 
 				if (io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC) {
 
 					TAILQ_REMOVE(&lun->ooa_queue,
 						&io->io_hdr,
 						ooa_links);
 
 					ctl_free_io(io);
 				}
 			}
 			ctl_check_blocked(lun);
 		} else if ((lun->flags & CTL_LUN_PRIMARY_SC)
 			&& (ctl_softc->ha_mode == CTL_HA_MODE_XFER)) {
 
 			printf("FAILOVER: primary lun %d\n", lun_idx);
 			/*
 			 * Abort all commands from the other SC.  We can't
 			 * send status back for them now.  These should get
 			 * cleaned up when they are completed or come out
 			 * for a datamove operation.
 			 */
 			for (io = (union ctl_io *)TAILQ_FIRST(&lun->ooa_queue);
 			     io != NULL; io = next_io) {
 				next_io = (union ctl_io *)TAILQ_NEXT(
 					&io->io_hdr, ooa_links);
 
 				if (io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC)
 					io->io_hdr.flags |= CTL_FLAG_ABORT;
 			}
 		} else if (((lun->flags & CTL_LUN_PRIMARY_SC) == 0)
 			&& (ctl_softc->ha_mode == CTL_HA_MODE_XFER)) {
 
 			printf("FAILOVER: secondary lun %d\n", lun_idx);
 
 			lun->flags |= CTL_LUN_PRIMARY_SC;
 
 			/*
 			 * We send all I/O that was sent to this controller
 			 * and redirected to the other side back with
 			 * busy status, and have the initiator retry it.
 			 * Figuring out how much data has been transferred,
 			 * etc. and picking up where we left off would be
 			 * very tricky.
 			 *
 			 * XXX KDM need to remove I/O from the blocked
 			 * queue as well!
 			 */
 			for (pending_io = (union ctl_io *)TAILQ_FIRST(
 			     &lun->ooa_queue); pending_io != NULL;
 			     pending_io = next_io) {
 
 				next_io =  (union ctl_io *)TAILQ_NEXT(
 					&pending_io->io_hdr, ooa_links);
 
 				pending_io->io_hdr.flags &=
 					~CTL_FLAG_SENT_2OTHER_SC;
 
 				if (pending_io->io_hdr.flags &
 				    CTL_FLAG_IO_ACTIVE) {
 					/* Active I/O is only flagged here. */
 					pending_io->io_hdr.flags |=
 						CTL_FLAG_FAILOVER;
 				} else {
 					ctl_set_busy(&pending_io->scsiio);
 					ctl_done(pending_io);
 				}
 			}
 
 			/*
 			 * Build Unit Attention
 			 */
 			for (i = 0; i < CTL_MAX_INITIATORS; i++) {
 				lun->pending_ua[i] |=
 				                     CTL_UA_ASYM_ACC_CHANGE;
 			}
 		} else if (((lun->flags & CTL_LUN_PRIMARY_SC) == 0)
 			&& (ctl_softc->ha_mode == CTL_HA_MODE_SER_ONLY)) {
 			printf("FAILOVER: secondary lun %d\n", lun_idx);
 			/*
 			 * if the first io on the OOA is not on the RtR queue
 			 * add it.
 			 */
 			lun->flags |= CTL_LUN_PRIMARY_SC;
 
 			pending_io = (union ctl_io *)TAILQ_FIRST(
 			    &lun->ooa_queue);
 			if (pending_io==NULL) {
 				printf("Nothing on OOA queue\n");
 				continue;
 			}
 
 			pending_io->io_hdr.flags &= ~CTL_FLAG_SENT_2OTHER_SC;
 			if ((pending_io->io_hdr.flags &
 			     CTL_FLAG_IS_WAS_ON_RTR) == 0) {
 				pending_io->io_hdr.flags |=
 				    CTL_FLAG_IS_WAS_ON_RTR;
 				ctl_enqueue_rtr(pending_io);
 			}
 #if 0
 			else
 			{
 				printf("Tag 0x%04x is running\n",
 				      pending_io->scsiio.tag_num);
 			}
 #endif
 
 			/*
 			 * Every remaining I/O that has not yet been on the
 			 * RtR queue is checked against the I/Os ahead of it.
 			 */
 			next_io = (union ctl_io *)TAILQ_NEXT(
 			    &pending_io->io_hdr, ooa_links);
 			for (pending_io=next_io; pending_io != NULL;
 			     pending_io = next_io) {
 				pending_io->io_hdr.flags &=
 				    ~CTL_FLAG_SENT_2OTHER_SC;
 				next_io = (union ctl_io *)TAILQ_NEXT(
 					&pending_io->io_hdr, ooa_links);
 				if (pending_io->io_hdr.flags &
 				    CTL_FLAG_IS_WAS_ON_RTR) {
 #if 0
 				        printf("Tag 0x%04x is running\n",
 				      		pending_io->scsiio.tag_num);
 #endif
 					continue;
 				}
 
 				switch (ctl_check_ooa(lun, pending_io,
 			            (union ctl_io *)TAILQ_PREV(
 				    &pending_io->io_hdr, ctl_ooaq,
 				    ooa_links))) {
 
 				case CTL_ACTION_BLOCK:
 					TAILQ_INSERT_TAIL(&lun->blocked_queue,
 							  &pending_io->io_hdr,
 							  blocked_links);
 					pending_io->io_hdr.flags |=
 					    CTL_FLAG_BLOCKED;
 					break;
 				case CTL_ACTION_PASS:
 				case CTL_ACTION_SKIP:
 					pending_io->io_hdr.flags |=
 					    CTL_FLAG_IS_WAS_ON_RTR;
 					ctl_enqueue_rtr(pending_io);
 					break;
 				case CTL_ACTION_OVERLAP:
 					ctl_set_overlapped_cmd(
 					    (struct ctl_scsiio *)pending_io);
 					ctl_done(pending_io);
 					break;
 				case CTL_ACTION_OVERLAP_TAG:
 					ctl_set_overlapped_tag(
 					    (struct ctl_scsiio *)pending_io,
 					    pending_io->scsiio.tag_num & 0xff);
 					ctl_done(pending_io);
 					break;
 				case CTL_ACTION_ERROR:
 				default:
 					ctl_set_internal_failure(
 						(struct ctl_scsiio *)pending_io,
 						0,  // sks_valid
 						0); //retry count
 					ctl_done(pending_io);
 					break;
 				}
 			}
 
 			/*
 			 * Build Unit Attention
 			 */
 			for (i = 0; i < CTL_MAX_INITIATORS; i++) {
 				lun->pending_ua[i] |=
 				                     CTL_UA_ASYM_ACC_CHANGE;
 			}
 		} else {
 			/* Both values are printed as 0x-prefixed hex. */
 			panic("Unhandled HA mode failover, LUN flags = %#x, "
 			      "ha_mode = %#x", lun->flags, ctl_softc->ha_mode);
 		}
 	}
 	ctl_pause_rtr = 0;
 	mtx_unlock(&ctl_softc->ctl_lock);
 }
 
 /*
  * Run the checks a newly arrived SCSI I/O has to pass before it may be
  * executed, and route it accordingly: completed with an error via
  * ctl_done(), placed on the LUN's blocked queue, forwarded to the other
  * controller for serialization, or placed on the RtR queue.
  *
  * When the target LUN exists and is not disabled, the I/O is appended to
  * that LUN's OOA queue and lun->lun_lock is taken at that point; each
  * return path below that is reached with a LUN releases the lock again.
  *
  * The return value is always 0 in this function; the outcome is conveyed
  * through the I/O itself.
  */
 static int
 ctl_scsiio_precheck(struct ctl_softc *ctl_softc, struct ctl_scsiio *ctsio)
 {
 	struct ctl_lun *lun;
 	const struct ctl_cmd_entry *entry;
 	uint32_t initidx, targ_lun;
 	int retval;
 
 	retval = 0;
 
 	lun = NULL;
 
 	targ_lun = ctsio->io_hdr.nexus.targ_mapped_lun;
 	if ((targ_lun < CTL_MAX_LUNS)
 	 && (ctl_softc->ctl_luns[targ_lun] != NULL)) {
 		lun = ctl_softc->ctl_luns[targ_lun];
 		/*
 		 * If the LUN is invalid, pretend that it doesn't exist.
 		 * It will go away as soon as all pending I/O has been
 		 * completed.
 		 */
 		if (lun->flags & CTL_LUN_DISABLED) {
 			lun = NULL;
 		} else {
 			ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr = lun;
 			ctsio->io_hdr.ctl_private[CTL_PRIV_BACKEND_LUN].ptr =
 				lun->be_lun;
 			if (lun->be_lun->lun_type == T_PROCESSOR) {
 				ctsio->io_hdr.flags |= CTL_FLAG_CONTROL_DEV;
 			}
 
 			/*
 			 * Every I/O goes into the OOA queue for a
 			 * particular LUN, and stays there until completion.
 			 * The LUN lock taken here stays held until one of
 			 * the exits below drops it.
 			 */
 			mtx_lock(&lun->lun_lock);
 			TAILQ_INSERT_TAIL(&lun->ooa_queue, &ctsio->io_hdr,
 			    ooa_links);
 		}
 	} else {
 		ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr = NULL;
 		ctsio->io_hdr.ctl_private[CTL_PRIV_BACKEND_LUN].ptr = NULL;
 	}
 
 	/*
 	 * Get command entry and return error if it is unsupported.
 	 * ctl_validate_command() has already completed the I/O with an
 	 * error when it returns NULL.
 	 */
 	entry = ctl_validate_command(ctsio);
 	if (entry == NULL) {
 		if (lun)
 			mtx_unlock(&lun->lun_lock);
 		return (retval);
 	}
 
 	/* Replace the data direction bits with those of the command entry. */
 	ctsio->io_hdr.flags &= ~CTL_FLAG_DATA_MASK;
 	ctsio->io_hdr.flags |= entry->flags & CTL_FLAG_DATA_MASK;
 
 	/*
 	 * Check to see whether we can send this command to LUNs that don't
 	 * exist.  This should pretty much only be the case for inquiry
 	 * and request sense.  Further checks, below, really require having
 	 * a LUN, so we can't really check the command anymore.  Just put
 	 * it on the rtr queue.
 	 */
 	if (lun == NULL) {
 		if (entry->flags & CTL_CMD_FLAG_OK_ON_ALL_LUNS) {
 			ctsio->io_hdr.flags |= CTL_FLAG_IS_WAS_ON_RTR;
 			ctl_enqueue_rtr((union ctl_io *)ctsio);
 			return (retval);
 		}
 
 		ctl_set_unsupported_lun(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 		CTL_DEBUG_PRINT(("ctl_scsiio_precheck: bailing out due to invalid LUN\n"));
 		return (retval);
 	} else {
 		/*
 		 * Make sure we support this particular command on this LUN.
 		 * e.g., we don't support writes to the control LUN.
 		 */
 		if (!ctl_cmd_applicable(lun->be_lun->lun_type, entry)) {
 			mtx_unlock(&lun->lun_lock);
 			ctl_set_invalid_opcode(ctsio);
 			ctl_done((union ctl_io *)ctsio);
 			return (retval);
 		}
 	}
 
 	/* From here on lun is known to be non-NULL. */
 	initidx = ctl_get_initindex(&ctsio->io_hdr.nexus);
 
 #ifdef CTL_WITH_CA
 	/*
 	 * If we've got a request sense, it'll clear the contingent
 	 * allegiance condition.  Otherwise, if we have a CA condition for
 	 * this initiator, clear it, because it sent down a command other
 	 * than request sense.
 	 */
 	if ((ctsio->cdb[0] != REQUEST_SENSE)
 	 && (ctl_is_set(lun->have_ca, initidx)))
 		ctl_clear_mask(lun->have_ca, initidx);
 #endif
 
 	/*
 	 * If the command has this flag set, it handles its own unit
 	 * attention reporting, we shouldn't do anything.  Otherwise we
 	 * check for any pending unit attentions, and send them back to the
 	 * initiator.  We only do this when a command initially comes in,
 	 * not when we pull it off the blocked queue.
 	 *
 	 * According to SAM-3, section 5.3.2, the order that things get
 	 * presented back to the host is basically unit attentions caused
 	 * by some sort of reset event, busy status, reservation conflicts
 	 * or task set full, and finally any other status.
 	 *
 	 * One issue here is that some of the unit attentions we report
 	 * don't fall into the "reset" category (e.g. "reported luns data
 	 * has changed").  So reporting it here, before the reservation
 	 * check, may be technically wrong.  I guess the only thing to do
 	 * would be to check for and report the reset events here, and then
 	 * check for the other unit attention types after we check for a
 	 * reservation conflict.
 	 *
 	 * XXX KDM need to fix this
 	 */
 	if ((entry->flags & CTL_CMD_FLAG_NO_SENSE) == 0) {
 		ctl_ua_type ua_type;
 
 		ua_type = lun->pending_ua[initidx];
 		if (ua_type != CTL_UA_NONE) {
 			scsi_sense_data_type sense_format;
 
 			/*
 			 * NOTE(review): lun has already been dereferenced
 			 * above, so the else branch looks unreachable.
 			 */
 			if (lun != NULL)
 				sense_format = (lun->flags &
 				    CTL_LUN_SENSE_DESC) ? SSD_TYPE_DESC :
 				    SSD_TYPE_FIXED;
 			else
 				sense_format = SSD_TYPE_FIXED;
 
 			/*
 			 * ctl_build_ua() fills in the sense data and returns
 			 * the unit attention that was reported, which is
 			 * then cleared from the pending set.
 			 */
 			ua_type = ctl_build_ua(ua_type, &ctsio->sense_data,
 					       sense_format);
 			if (ua_type != CTL_UA_NONE) {
 				ctsio->scsi_status = SCSI_STATUS_CHECK_COND;
 				ctsio->io_hdr.status = CTL_SCSI_ERROR |
 						       CTL_AUTOSENSE;
 				ctsio->sense_len = SSD_FULL_SIZE;
 				lun->pending_ua[initidx] &= ~ua_type;
 				mtx_unlock(&lun->lun_lock);
 				ctl_done((union ctl_io *)ctsio);
 				return (retval);
 			}
 		}
 	}
 
 
 	/* LUN state checks; a non-zero result means an error was recorded. */
 	if (ctl_scsiio_lun_check(ctl_softc, lun, entry, ctsio) != 0) {
 		mtx_unlock(&lun->lun_lock);
 		ctl_done((union ctl_io *)ctsio);
 		return (retval);
 	}
 
 	/*
 	 * XXX CHD this is where we want to send IO to other side if
 	 * this LUN is secondary on this SC. We will need to make a copy
 	 * of the IO and flag the IO on this side as SENT_2OTHER and the flag
 	 * the copy we send as FROM_OTHER.
 	 * We also need to stuff the address of the original IO so we can
 	 * find it easily. Something similar will need be done on the other
 	 * side so when we are done we can find the copy.
 	 */
 	if ((lun->flags & CTL_LUN_PRIMARY_SC) == 0) {
 		union ctl_ha_msg msg_info;
 		int isc_retval;
 
 		ctsio->io_hdr.flags |= CTL_FLAG_SENT_2OTHER_SC;
 
 		msg_info.hdr.msg_type = CTL_MSG_SERIALIZE;
 		msg_info.hdr.original_sc = (union ctl_io *)ctsio;
 #if 0
 		printf("1. ctsio %p\n", ctsio);
 #endif
 		msg_info.hdr.serializing_sc = NULL;
 		msg_info.hdr.nexus = ctsio->io_hdr.nexus;
 		msg_info.scsi.tag_num = ctsio->tag_num;
 		msg_info.scsi.tag_type = ctsio->tag_type;
 		memcpy(msg_info.scsi.cdb, ctsio->cdb, CTL_MAX_CDBLEN);
 
 		ctsio->io_hdr.flags &= ~CTL_FLAG_IO_ACTIVE;
 
 		/* A send failure is only logged; the I/O is not completed here. */
 		if ((isc_retval=ctl_ha_msg_send(CTL_HA_CHAN_CTL,
 		    (void *)&msg_info, sizeof(msg_info), 0)) >
 		    CTL_HA_STATUS_SUCCESS) {
 			printf("CTL:precheck, ctl_ha_msg_send returned %d\n",
 			       isc_retval);
 			printf("CTL:opcode is %x\n", ctsio->cdb[0]);
 		} else {
 #if 0
 			printf("CTL:Precheck sent msg, opcode is %x\n",opcode);
 #endif
 		}
 
 		/*
 		 * XXX KDM this I/O is off the incoming queue, but hasn't
 		 * been inserted on any other queue.  We may need to come
 		 * up with a holding queue while we wait for serialization
 		 * so that we have an idea of what we're waiting for from
 		 * the other side.
 		 */
 		mtx_unlock(&lun->lun_lock);
 		return (retval);
 	}
 
 	/*
 	 * Serialize against everything that is ahead of this I/O on the
 	 * OOA queue.  The LUN lock is dropped in every case before the I/O
 	 * is queued for execution or completed.
 	 */
 	switch (ctl_check_ooa(lun, (union ctl_io *)ctsio,
 			      (union ctl_io *)TAILQ_PREV(&ctsio->io_hdr,
 			      ctl_ooaq, ooa_links))) {
 	case CTL_ACTION_BLOCK:
 		ctsio->io_hdr.flags |= CTL_FLAG_BLOCKED;
 		TAILQ_INSERT_TAIL(&lun->blocked_queue, &ctsio->io_hdr,
 				  blocked_links);
 		mtx_unlock(&lun->lun_lock);
 		return (retval);
 	case CTL_ACTION_PASS:
 	case CTL_ACTION_SKIP:
 		ctsio->io_hdr.flags |= CTL_FLAG_IS_WAS_ON_RTR;
 		mtx_unlock(&lun->lun_lock);
 		ctl_enqueue_rtr((union ctl_io *)ctsio);
 		break;
 	case CTL_ACTION_OVERLAP:
 		mtx_unlock(&lun->lun_lock);
 		ctl_set_overlapped_cmd(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 		break;
 	case CTL_ACTION_OVERLAP_TAG:
 		mtx_unlock(&lun->lun_lock);
 		ctl_set_overlapped_tag(ctsio, ctsio->tag_num & 0xff);
 		ctl_done((union ctl_io *)ctsio);
 		break;
 	case CTL_ACTION_ERROR:
 	default:
 		mtx_unlock(&lun->lun_lock);
 		ctl_set_internal_failure(ctsio,
 					 /*sks_valid*/ 0,
 					 /*retry_count*/ 0);
 		ctl_done((union ctl_io *)ctsio);
 		break;
 	}
 	return (retval);
 }
 
 /*
  * Look up the command table entry for the opcode in cdb[0].  When that
  * entry carries CTL_CMD_FLAG_SA5, its execute pointer is treated as a
  * second-level table that is indexed by the service action taken from
  * cdb[1].
  */
 const struct ctl_cmd_entry *
 ctl_get_cmd_entry(struct ctl_scsiio *ctsio)
 {
 	const struct ctl_cmd_entry *entry, *sa_table;
 
 	entry = &ctl_cmd_table[ctsio->cdb[0]];
 	if ((entry->flags & CTL_CMD_FLAG_SA5) == 0)
 		return (entry);
 
 	sa_table = (const struct ctl_cmd_entry *)entry->execute;
 	return (&sa_table[ctsio->cdb[1] & SERVICE_ACTION_MASK]);
 }
 
 /*
  * Find the command entry for this I/O and validate the CDB against it.
  *
  * Returns the entry when the command is implemented and no CDB byte has a
  * bit set outside the entry's usage mask.  Otherwise the I/O is completed
  * with "invalid opcode" or "invalid field in CDB" (pointing at the
  * offending byte and its highest offending bit) and NULL is returned.
  */
 const struct ctl_cmd_entry *
 ctl_validate_command(struct ctl_scsiio *ctsio)
 {
 	const struct ctl_cmd_entry *entry;
 	uint8_t stray_bits;
 	int byte_idx;
 
 	entry = ctl_get_cmd_entry(ctsio);
 	if (entry->execute == NULL) {
 		ctl_set_invalid_opcode(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 		return (NULL);
 	}
 	KASSERT(entry->length > 0,
 	    ("Not defined length for command 0x%02x/0x%02x",
 	     ctsio->cdb[0], ctsio->cdb[1]));
 
 	/* usage[] has no slot for the opcode, hence the index shift by one. */
 	for (byte_idx = 1; byte_idx < entry->length; byte_idx++) {
 		stray_bits = ctsio->cdb[byte_idx] & ~entry->usage[byte_idx - 1];
 		if (stray_bits != 0) {
 			ctl_set_invalid_field(ctsio, /*sks_valid*/ 1,
 			    /*command*/ 1, /*field*/ byte_idx,
 			    /*bit_valid*/ 1, /*bit*/ fls(stray_bits) - 1);
 			ctl_done((union ctl_io *)ctsio);
 			return (NULL);
 		}
 	}
 
 	return (entry);
 }
 
 /*
  * Report whether a command may be issued to a LUN of the given type.
  * Returns 1 if the entry is flagged as valid for that LUN type or for all
  * LUNs, and 0 otherwise (including for LUN types other than processor
  * and direct access).
  */
 static int
 ctl_cmd_applicable(uint8_t lun_type, const struct ctl_cmd_entry *entry)
 {
 
 	switch (lun_type) {
 	case T_PROCESSOR:
 		return ((entry->flags & (CTL_CMD_FLAG_OK_ON_PROC |
 		    CTL_CMD_FLAG_OK_ON_ALL_LUNS)) != 0);
 	case T_DIRECT:
 		return ((entry->flags & (CTL_CMD_FLAG_OK_ON_SLUN |
 		    CTL_CMD_FLAG_OK_ON_ALL_LUNS)) != 0);
 	default:
 		return (0);
 	}
 }
 
 /*
  * Execute a SCSI I/O.
  *
  * An I/O that carries CTL_FLAG_ABORT is completed through ctl_done()
  * without being executed and CTL_RETVAL_COMPLETE is returned.  Otherwise
  * the return value is whatever the command's execute handler returns.
  */
 static int
 ctl_scsiio(struct ctl_scsiio *ctsio)
 {
 	const struct ctl_cmd_entry *cmd;
 
 	CTL_DEBUG_PRINT(("ctl_scsiio cdb[0]=%02X\n", ctsio->cdb[0]));
 
 	cmd = ctl_get_cmd_entry(ctsio);
 
 	/* Aborted I/O goes straight to completion. */
 	if (ctsio->io_hdr.flags & CTL_FLAG_ABORT) {
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/*
 	 * ctl_scsiio_precheck() is expected to have done all validation
 	 * already, so the handler is simply invoked here.
 	 */
 	return (cmd->execute(ctsio));
 }
 
 /*
  * Since we only implement one target right now, a bus reset simply resets
  * our single target.
  */
 static int
 ctl_bus_reset(struct ctl_softc *ctl_softc, union ctl_io *io)
 {
 	return(ctl_target_reset(ctl_softc, io, CTL_UA_BUS_RESET));
 }
 
 /*
  * Reset every LUN on the LUN list.
  *
  * Unless the request itself arrived from the other controller
  * (CTL_FLAG_FROM_OTHER_SC), a CTL_MSG_MANAGE_TASKS message describing the
  * reset is first sent to the other controller.  ua_type selects both the
  * task action put in that message and the unit attention handed to
  * ctl_lun_reset() for each LUN.
  *
  * Returns the sum of the per-LUN ctl_lun_reset() return values.
  */
 static int
 ctl_target_reset(struct ctl_softc *ctl_softc, union ctl_io *io,
 		 ctl_ua_type ua_type)
 {
 	struct ctl_lun *lun;
 	int retval;
 
 	if (!(io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC)) {
 		union ctl_ha_msg msg_info;
 
 		/*
 		 * The entire union is handed to ctl_ha_msg_send(), but only
 		 * a few fields are filled in below.  Zero it first so no
 		 * uninitialized stack contents are sent along.
 		 */
 		memset(&msg_info, 0, sizeof(msg_info));
 
 		io->io_hdr.flags |= CTL_FLAG_SENT_2OTHER_SC;
 		msg_info.hdr.nexus = io->io_hdr.nexus;
 		if (ua_type == CTL_UA_TARG_RESET)
 			msg_info.task.task_action = CTL_TASK_TARGET_RESET;
 		else
 			msg_info.task.task_action = CTL_TASK_BUS_RESET;
 		msg_info.hdr.msg_type = CTL_MSG_MANAGE_TASKS;
 		msg_info.hdr.original_sc = NULL;
 		msg_info.hdr.serializing_sc = NULL;
 
 		/*
 		 * Best effort: the local reset proceeds whether or not the
 		 * message could be delivered.
 		 */
 		(void)ctl_ha_msg_send(CTL_HA_CHAN_CTL, (void *)&msg_info,
 		    sizeof(msg_info), 0);
 	}
 	retval = 0;
 
 	mtx_lock(&ctl_softc->ctl_lock);
 	STAILQ_FOREACH(lun, &ctl_softc->lun_list, links)
 		retval += ctl_lun_reset(lun, io, ua_type);
 	mtx_unlock(&ctl_softc->ctl_lock);
 
 	return (retval);
 }
 
 /*
  * Reset one LUN: flag every I/O on its OOA queue as aborted, drop the
  * RESERVE/RELEASE reservation flag, and post the given unit attention for
  * every initiator.  Always returns 0.
  *
  * The LUN should always be set.  The I/O is optional, and is used to
  * distinguish between I/Os sent by this initiator, and by other
  * initiators.  We set unit attention for initiators other than this one.
  * SAM-3 is vague on this point.  It does say that a unit attention should
  * be established for other initiators when a LUN is reset (see section
  * 5.7.3), but it doesn't specifically say that the unit attention should
  * be established for this particular initiator when a LUN is reset.  Here
  * is the relevant text, from SAM-3 rev 8:
  *
  * 5.7.2 When a SCSI initiator port aborts its own tasks
  *
  * When a SCSI initiator port causes its own task(s) to be aborted, no
  * notification that the task(s) have been aborted shall be returned to
  * the SCSI initiator port other than the completion response for the
  * command or task management function action that caused the task(s) to
  * be aborted and notification(s) associated with related effects of the
  * action (e.g., a reset unit attention condition).
  *
  * XXX KDM for now, we're setting unit attention for all initiators.
  */
 static int
 ctl_lun_reset(struct ctl_lun *lun, union ctl_io *io, ctl_ua_type ua_type)
 {
 	struct ctl_io_hdr *hdr;
 #if 0
 	uint32_t initindex;
 #endif
 	int init;
 
 	mtx_lock(&lun->lun_lock);
 
 	/* Walk the OOA queue and mark each I/O aborted. */
 	TAILQ_FOREACH(hdr, &lun->ooa_queue, ooa_links)
 		hdr->flags |= CTL_FLAG_ABORT | CTL_FLAG_ABORT_STATUS;
 
 	/*
 	 * Disabled alternative: post the unit attention for every
 	 * initiator except the one that sent the reset.
 	 */
 #if 0
 	initindex = ctl_get_initindex(&io->io_hdr.nexus);
 	for (init = 0; init < CTL_MAX_INITIATORS; init++) {
 		if (initindex == init)
 			continue;
 		lun->pending_ua[init] |= ua_type;
 	}
 #endif
 
 	/*
 	 * Any kind of reset clears reservations made with RESERVE/RELEASE.
 	 * Reservations made with PERSISTENT RESERVE OUT are not cleared,
 	 * but we don't support that at the moment anyway.  See SPC-2,
 	 * section 5.6.  SPC-3 doesn't address reservations made with the
 	 * RESERVE/RELEASE commands, because those commands are obsolete in
 	 * SPC-3.
 	 */
 	lun->flags &= ~CTL_LUN_RESERVED;
 
 	init = 0;
 	while (init < CTL_MAX_INITIATORS) {
 #ifdef CTL_WITH_CA
 		ctl_clear_mask(lun->have_ca, init);
 #endif
 		lun->pending_ua[init] |= ua_type;
 		init++;
 	}
 
 	mtx_unlock(&lun->lun_lock);
 
 	return (0);
 }
 
 /*
  * Flag as aborted every I/O on the LUN's OOA queue that matches the given
  * target port and initiator ID.  The caller must hold the LUN lock.
  *
  * Passing UINT32_MAX for targ_port or init_id makes that field match any
  * value.  An I/O that matched only through such a wildcard additionally
  * gets CTL_FLAG_ABORT_STATUS.  When other_sc is zero and the LUN is not
  * flagged CTL_LUN_PRIMARY_SC, an ABORT TASK message for each matching I/O
  * is sent to the other controller.
  */
 static void
 ctl_abort_tasks_lun(struct ctl_lun *lun, uint32_t targ_port, uint32_t init_id,
     int other_sc)
 {
 	union ctl_io *xio;
 
 	mtx_assert(&lun->lun_lock, MA_OWNED);
 
 	for (xio = (union ctl_io *)TAILQ_FIRST(&lun->ooa_queue); xio != NULL;
 	     xio = (union ctl_io *)TAILQ_NEXT(&xio->io_hdr, ooa_links)) {
 
 		if ((targ_port == UINT32_MAX ||
 		     targ_port == xio->io_hdr.nexus.targ_port) &&
 		    (init_id == UINT32_MAX ||
 		     init_id == xio->io_hdr.nexus.initid.id)) {
 			/* Matched via wildcard rather than exact nexus. */
 			if (targ_port != xio->io_hdr.nexus.targ_port ||
 			    init_id != xio->io_hdr.nexus.initid.id)
 				xio->io_hdr.flags |= CTL_FLAG_ABORT_STATUS;
 			xio->io_hdr.flags |= CTL_FLAG_ABORT;
 			if (!other_sc && !(lun->flags & CTL_LUN_PRIMARY_SC)) {
 				union ctl_ha_msg msg_info;
 
 				/*
 				 * The whole union is sent; zero it so the
 				 * fields not set below do not carry
 				 * uninitialized stack contents.
 				 */
 				memset(&msg_info, 0, sizeof(msg_info));
 				msg_info.hdr.nexus = xio->io_hdr.nexus;
 				msg_info.task.task_action = CTL_TASK_ABORT_TASK;
 				msg_info.task.tag_num = xio->scsiio.tag_num;
 				msg_info.task.tag_type = xio->scsiio.tag_type;
 				msg_info.hdr.msg_type = CTL_MSG_MANAGE_TASKS;
 				msg_info.hdr.original_sc = NULL;
 				msg_info.hdr.serializing_sc = NULL;
 				/* Best effort; the result is not checked. */
 				(void)ctl_ha_msg_send(CTL_HA_CHAN_CTL,
 				    (void *)&msg_info, sizeof(msg_info), 0);
 			}
 		}
 	}
 }
 
 /*
  * Handle ABORT TASK SET and CLEAR TASK SET for the LUN named in the
  * request's nexus.
  *
  * ABORT TASK SET aborts only the I/Os from the requesting port and
  * initiator; any other task action (CLEAR TASK SET) aborts every I/O on
  * the LUN.  Returns 0 on success, or 1 when the LUN does not exist.
  */
 static int
 ctl_abort_task_set(union ctl_io *io)
 {
 	struct ctl_softc *softc = control_softc;
 	struct ctl_lun *lun;
 	uint32_t lun_id, port, initiator;
 	int from_peer;
 
 	/* Find the LUN; the softc lock covers the lookup. */
 	lun_id = io->io_hdr.nexus.targ_mapped_lun;
 	mtx_lock(&softc->ctl_lock);
 	if (lun_id >= CTL_MAX_LUNS || softc->ctl_luns[lun_id] == NULL) {
 		mtx_unlock(&softc->ctl_lock);
 		return (1);
 	}
 	lun = softc->ctl_luns[lun_id];
 
 	/* Take the LUN lock before letting go of the softc lock. */
 	mtx_lock(&lun->lun_lock);
 	mtx_unlock(&softc->ctl_lock);
 
 	from_peer = (io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC) != 0;
 	if (io->taskio.task_action == CTL_TASK_ABORT_TASK_SET) {
 		port = io->io_hdr.nexus.targ_port;
 		initiator = io->io_hdr.nexus.initid.id;
 	} else {
 		/* CTL_TASK_CLEAR_TASK_SET: match every port and initiator. */
 		port = UINT32_MAX;
 		initiator = UINT32_MAX;
 	}
 	ctl_abort_tasks_lun(lun, port, initiator, from_peer);
 
 	mtx_unlock(&lun->lun_lock);
 	return (0);
 }
 
 static int
 ctl_i_t_nexus_reset(union ctl_io *io)
 {
 	struct ctl_softc *softc = control_softc;
 	struct ctl_lun *lun;
 	uint32_t initindex;
 
 	initindex = ctl_get_initindex(&io->io_hdr.nexus);
 	mtx_lock(&softc->ctl_lock);
 	STAILQ_FOREACH(lun, &softc->lun_list, links) {
 		mtx_lock(&lun->lun_lock);
 		ctl_abort_tasks_lun(lun, io->io_hdr.nexus.targ_port,
 		    io->io_hdr.nexus.initid.id,
 		    (io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC) != 0);
 #ifdef CTL_WITH_CA
 		ctl_clear_mask(lun->have_ca, initindex);
 #endif
 		lun->pending_ua[initindex] |= CTL_UA_I_T_NEXUS_LOSS;
 		mtx_unlock(&lun->lun_lock);
 	}
 	mtx_unlock(&softc->ctl_lock);
 	return (0);
 }
 
 /*
  * Handle an ABORT TASK request.
  *
  * Looks up the LUN named in the request's nexus and searches its OOA queue
  * for an I/O from the same target port and initiator with the same tag
  * number.  The first match gets CTL_FLAG_ABORT; when the request did not
  * come from the other controller and the LUN is not flagged
  * CTL_LUN_PRIMARY_SC, the abort is also forwarded to the other controller.
  *
  * Returns 1 when the LUN does not exist, and 0 otherwise (whether or not
  * a matching I/O was found).
  */
 static int
 ctl_abort_task(union ctl_io *io)
 {
 	union ctl_io *xio;
 	struct ctl_lun *lun;
 	struct ctl_softc *ctl_softc;
 #if 0
 	struct sbuf sb;
 	char printbuf[128];
 #endif
 	int found;
 	uint32_t targ_lun;
 
 	ctl_softc = control_softc;
 	found = 0;
 
 	/*
 	 * Look up the LUN.
 	 */
 	targ_lun = io->io_hdr.nexus.targ_mapped_lun;
 	mtx_lock(&ctl_softc->ctl_lock);
 	if ((targ_lun < CTL_MAX_LUNS)
 	 && (ctl_softc->ctl_luns[targ_lun] != NULL))
 		lun = ctl_softc->ctl_luns[targ_lun];
 	else {
 		mtx_unlock(&ctl_softc->ctl_lock);
 		return (1);
 	}
 
 #if 0
 	printf("ctl_abort_task: called for lun %lld, tag %d type %d\n",
 	       lun->lun, io->taskio.tag_num, io->taskio.tag_type);
 #endif
 
 	/* Take the LUN lock before dropping the softc lock. */
 	mtx_lock(&lun->lun_lock);
 	mtx_unlock(&ctl_softc->ctl_lock);
 	/*
 	 * Run through the OOA queue and attempt to find the given I/O.
 	 * The target port, initiator ID and tag number have to match the
 	 * values that we got from the initiator.  The tag type is not
 	 * compared (see below).
 	 */
 #if 0
 	TAILQ_FOREACH((struct ctl_io_hdr *)xio, &lun->ooa_queue, ooa_links) {
 #endif
 	for (xio = (union ctl_io *)TAILQ_FIRST(&lun->ooa_queue); xio != NULL;
 	     xio = (union ctl_io *)TAILQ_NEXT(&xio->io_hdr, ooa_links)) {
 #if 0
 		sbuf_new(&sb, printbuf, sizeof(printbuf), SBUF_FIXEDLEN);
 
 		sbuf_printf(&sb, "LUN %lld tag %d type %d%s%s%s%s: ",
 			    lun->lun, xio->scsiio.tag_num,
 			    xio->scsiio.tag_type,
 			    (xio->io_hdr.blocked_links.tqe_prev
 			    == NULL) ? "" : " BLOCKED",
 			    (xio->io_hdr.flags &
 			    CTL_FLAG_DMA_INPROG) ? " DMA" : "",
 			    (xio->io_hdr.flags &
 			    CTL_FLAG_ABORT) ? " ABORT" : "",
 			    (xio->io_hdr.flags &
 			    CTL_FLAG_IS_WAS_ON_RTR ? " RTR" : ""));
 		ctl_scsi_command_string(&xio->scsiio, NULL, &sb);
 		sbuf_finish(&sb);
 		printf("%s\n", sbuf_data(&sb));
 #endif
 
 		if ((xio->io_hdr.nexus.targ_port == io->io_hdr.nexus.targ_port)
 		 && (xio->io_hdr.nexus.initid.id ==
 		     io->io_hdr.nexus.initid.id)) {
 			/*
 			 * If the abort says that the task is untagged, the
 			 * task in the queue must be untagged.  Otherwise,
 			 * we just check to see whether the tag numbers
 			 * match.  This is because the QLogic firmware
 			 * doesn't pass back the tag type in an abort
 			 * request.
 			 */
 #if 0
 			if (((xio->scsiio.tag_type == CTL_TAG_UNTAGGED)
 			  && (io->taskio.tag_type == CTL_TAG_UNTAGGED))
 			 || (xio->scsiio.tag_num == io->taskio.tag_num)) {
 #endif
 			/*
 			 * XXX KDM we've got problems with FC, because it
 			 * doesn't send down a tag type with aborts.  So we
 			 * can only really go by the tag number...
 			 * This may cause problems with parallel SCSI.
 			 * Need to figure that out!!
 			 */
 			if (xio->scsiio.tag_num == io->taskio.tag_num) {
 				xio->io_hdr.flags |= CTL_FLAG_ABORT;
 				found = 1;
 				if ((io->io_hdr.flags &
 				     CTL_FLAG_FROM_OTHER_SC) == 0 &&
 				    !(lun->flags & CTL_LUN_PRIMARY_SC)) {
 					union ctl_ha_msg msg_info;
 
 					/*
 					 * The whole union is sent; zero it
 					 * so unset fields do not carry
 					 * uninitialized stack contents.
 					 */
 					memset(&msg_info, 0, sizeof(msg_info));
 					io->io_hdr.flags |=
 					                CTL_FLAG_SENT_2OTHER_SC;
 					msg_info.hdr.nexus = io->io_hdr.nexus;
 					msg_info.task.task_action =
 						CTL_TASK_ABORT_TASK;
 					msg_info.task.tag_num =
 						io->taskio.tag_num;
 					msg_info.task.tag_type =
 						io->taskio.tag_type;
 					msg_info.hdr.msg_type =
 						CTL_MSG_MANAGE_TASKS;
 					msg_info.hdr.original_sc = NULL;
 					msg_info.hdr.serializing_sc = NULL;
 #if 0
 					printf("Sent Abort to other side\n");
 #endif
 					/*
 					 * Best effort; the local abort
 					 * stands even if the send fails.
 					 */
 					(void)ctl_ha_msg_send(CTL_HA_CHAN_CTL,
 					    (void *)&msg_info,
 					    sizeof(msg_info), 0);
 				}
 #if 0
 				printf("ctl_abort_task: found I/O to abort\n");
 #endif
 				break;
 			}
 		}
 	}
 	mtx_unlock(&lun->lun_lock);
 
 	if (found == 0) {
 		/*
 		 * This isn't really an error.  It's entirely possible for
 		 * the abort and command completion to cross on the wire.
 		 * This is more of an informative/diagnostic error.
 		 */
 #if 0
 		printf("ctl_abort_task: ABORT sent for nonexistent I/O: "
 		       "%d:%d:%d:%d tag %d type %d\n",
 		       io->io_hdr.nexus.initid.id,
 		       io->io_hdr.nexus.targ_port,
 		       io->io_hdr.nexus.targ_target.id,
 		       io->io_hdr.nexus.targ_lun, io->taskio.tag_num,
 		       io->taskio.tag_type);
 #endif
 	}
 	return (0);
 }
 
 /*
  * Execute a task management I/O and complete it.
  *
  * The task action is dispatched to the matching handler.  The I/O status
  * is set to CTL_SUCCESS when the handler returns 0 and to CTL_ERROR
  * otherwise; actions that have no handler here (CLEAR ACA, port
  * login/logout, unknown actions) leave the initial non-zero value in
  * place and therefore complete with CTL_ERROR.  The I/O is always passed
  * to ctl_done().
  */
 static void
 ctl_run_task(union ctl_io *io)
 {
 	struct ctl_softc *ctl_softc = control_softc;
 	int retval = 1;
 	const char *task_desc;
 
 	CTL_DEBUG_PRINT(("ctl_run_task\n"));
 
 	KASSERT(io->io_hdr.io_type == CTL_IO_TASK,
 	    ("ctl_run_task: Unextected io_type %d\n",
 	     io->io_hdr.io_type));
 
 	/* The description is only consumed by the NEEDTOPORT logging. */
 	task_desc = ctl_scsi_task_string(&io->taskio);
 	if (task_desc != NULL) {
 #ifdef NEEDTOPORT
 		csevent_log(CSC_CTL | CSC_SHELF_SW |
 			    CTL_TASK_REPORT,
 			    csevent_LogType_Trace,
 			    csevent_Severity_Information,
 			    csevent_AlertLevel_Green,
 			    csevent_FRU_Firmware,
 			    csevent_FRU_Unknown,
 			    "CTL: received task: %s",task_desc);
 #endif
 	} else {
 #ifdef NEEDTOPORT
 		csevent_log(CSC_CTL | CSC_SHELF_SW |
 			    CTL_TASK_REPORT,
 			    csevent_LogType_Trace,
 			    csevent_Severity_Information,
 			    csevent_AlertLevel_Green,
 			    csevent_FRU_Firmware,
 			    csevent_FRU_Unknown,
 			    "CTL: received unknown task "
 			    "type: %d (%#x)",
 			    io->taskio.task_action,
 			    io->taskio.task_action);
 #endif
 	}
 	switch (io->taskio.task_action) {
 	case CTL_TASK_ABORT_TASK:
 		retval = ctl_abort_task(io);
 		break;
 	case CTL_TASK_ABORT_TASK_SET:
 	case CTL_TASK_CLEAR_TASK_SET:
 		retval = ctl_abort_task_set(io);
 		break;
 	case CTL_TASK_CLEAR_ACA:
 		break;
 	case CTL_TASK_I_T_NEXUS_RESET:
 		retval = ctl_i_t_nexus_reset(io);
 		break;
 	case CTL_TASK_LUN_RESET: {
 		struct ctl_lun *lun;
 		uint32_t targ_lun;
 
 		targ_lun = io->io_hdr.nexus.targ_mapped_lun;
 		mtx_lock(&ctl_softc->ctl_lock);
 		if ((targ_lun < CTL_MAX_LUNS)
 		 && (ctl_softc->ctl_luns[targ_lun] != NULL))
 			lun = ctl_softc->ctl_luns[targ_lun];
 		else {
 			mtx_unlock(&ctl_softc->ctl_lock);
 			retval = 1;
 			break;
 		}
 
 		if (!(io->io_hdr.flags &
 		    CTL_FLAG_FROM_OTHER_SC)) {
 			union ctl_ha_msg msg_info;
 
 			/*
 			 * The whole union is sent; zero it so the fields
 			 * not set below do not carry uninitialized stack
 			 * contents.
 			 */
 			memset(&msg_info, 0, sizeof(msg_info));
 			io->io_hdr.flags |=
 				CTL_FLAG_SENT_2OTHER_SC;
 			msg_info.hdr.msg_type =
 				CTL_MSG_MANAGE_TASKS;
 			msg_info.hdr.nexus = io->io_hdr.nexus;
 			msg_info.task.task_action =
 				CTL_TASK_LUN_RESET;
 			msg_info.hdr.original_sc = NULL;
 			msg_info.hdr.serializing_sc = NULL;
 			/*
 			 * Best effort; the local reset proceeds even if
 			 * the send fails.
 			 */
 			(void)ctl_ha_msg_send(CTL_HA_CHAN_CTL,
 			    (void *)&msg_info,
 			    sizeof(msg_info), 0);
 		}
 
 		/* ctl_lun_reset() is called with the softc lock held. */
 		retval = ctl_lun_reset(lun, io,
 				       CTL_UA_LUN_RESET);
 		mtx_unlock(&ctl_softc->ctl_lock);
 		break;
 	}
 	case CTL_TASK_TARGET_RESET:
 		retval = ctl_target_reset(ctl_softc, io, CTL_UA_TARG_RESET);
 		break;
 	case CTL_TASK_BUS_RESET:
 		retval = ctl_bus_reset(ctl_softc, io);
 		break;
 	case CTL_TASK_PORT_LOGIN:
 		break;
 	case CTL_TASK_PORT_LOGOUT:
 		break;
 	default:
 		printf("ctl_run_task: got unknown task management event %d\n",
 		       io->taskio.task_action);
 		break;
 	}
 	if (retval == 0)
 		io->io_hdr.status = CTL_SUCCESS;
 	else
 		io->io_hdr.status = CTL_ERROR;
 	ctl_done(io);
 }
 
 /*
  * For HA operation.  Handle commands that come in from the other
  * controller.
  */
 static void
 ctl_handle_isc(union ctl_io *io)
 {
 	int free_io;
 	struct ctl_lun *lun;
 	struct ctl_softc *ctl_softc;
 	uint32_t targ_lun;
 
 	ctl_softc = control_softc;
 
 	targ_lun = io->io_hdr.nexus.targ_mapped_lun;
 	lun = ctl_softc->ctl_luns[targ_lun];
 
 	switch (io->io_hdr.msg_type) {
 	case CTL_MSG_SERIALIZE:
 		free_io = ctl_serialize_other_sc_cmd(&io->scsiio);
 		break;
 	case CTL_MSG_R2R: {
 		const struct ctl_cmd_entry *entry;
 
 		/*
 		 * This is only used in SER_ONLY mode.
 		 */
 		free_io = 0;
 		entry = ctl_get_cmd_entry(&io->scsiio);
 		mtx_lock(&lun->lun_lock);
 		if (ctl_scsiio_lun_check(ctl_softc, lun,
 		    entry, (struct ctl_scsiio *)io) != 0) {
 			mtx_unlock(&lun->lun_lock);
 			ctl_done(io);
 			break;
 		}
 		io->io_hdr.flags |= CTL_FLAG_IS_WAS_ON_RTR;
 		mtx_unlock(&lun->lun_lock);
 		ctl_enqueue_rtr(io);
 		break;
 	}
 	case CTL_MSG_FINISH_IO:
 		if (ctl_softc->ha_mode == CTL_HA_MODE_XFER) {
 			free_io = 0;
 			ctl_done(io);
 		} else {
 			free_io = 1;
 			mtx_lock(&lun->lun_lock);
 			TAILQ_REMOVE(&lun->ooa_queue, &io->io_hdr,
 				     ooa_links);
 			ctl_check_blocked(lun);
 			mtx_unlock(&lun->lun_lock);
 		}
 		break;
 	case CTL_MSG_PERS_ACTION:
 		ctl_hndl_per_res_out_on_other_sc(
 			(union ctl_ha_msg *)&io->presio.pr_msg);
 		free_io = 1;
 		break;
 	case CTL_MSG_BAD_JUJU:
 		free_io = 0;
 		ctl_done(io);
 		break;
 	case CTL_MSG_DATAMOVE:
 		/* Only used in XFER mode */
 		free_io = 0;
 		ctl_datamove_remote(io);
 		break;
 	case CTL_MSG_DATAMOVE_DONE:
 		/* Only used in XFER mode */
 		free_io = 0;
 		io->scsiio.be_move_done(io);
 		break;
 	default:
 		free_io = 1;
 		printf("%s: Invalid message type %d\n",
 		       __func__, io->io_hdr.msg_type);
 		break;
 	}
 	if (free_io)
 		ctl_free_io(io);
 
 }
 
 
 /*
  * Match a command against an error injection descriptor.
  *
  * Returns the match type when the command matches, or CTL_LUN_PAT_NONE
  * when it does not.
  */
 static ctl_lun_error_pattern
 ctl_cmd_pattern_match(struct ctl_scsiio *ctsio, struct ctl_error_desc *desc)
 {
 	const struct ctl_cmd_entry *cmd;
 	ctl_lun_error_pattern wanted, matched;
 	uint64_t cmd_lba;
 	uint32_t cmd_len;
 
 	wanted = desc->error_pattern;
 
 	/*
 	 * XXX KDM we need more data passed into this function to match a
 	 * custom pattern, and we actually need to implement custom pattern
 	 * matching.  For now a custom pattern matches unconditionally.
 	 */
 	if (wanted & CTL_LUN_PAT_CMD)
 		return (CTL_LUN_PAT_CMD);
 
 	if ((wanted & CTL_LUN_PAT_MASK) == CTL_LUN_PAT_ANY)
 		return (CTL_LUN_PAT_ANY);
 
 	cmd = ctl_get_cmd_entry(ctsio);
 	matched = cmd->pattern & wanted;
 
 	/*
 	 * Flags outside CTL_LUN_PAT_MASK (e.g. CTL_LUN_PAT_RANGE) that the
 	 * user asked for must all be supported by the command.  When the
 	 * user asked for none, both sides are zero and this passes.
 	 */
 	if ((matched & ~CTL_LUN_PAT_MASK) != (wanted & ~CTL_LUN_PAT_MASK))
 		return (CTL_LUN_PAT_NONE);
 
 	/* No range check requested: the filtered pattern is the answer. */
 	if ((matched & CTL_LUN_PAT_RANGE) == 0)
 		return (matched);
 
 	/*
 	 * Range check: the command's LBA range has to overlap the range in
 	 * the descriptor.
 	 */
 	if (ctl_get_lba_len((union ctl_io *)ctsio, &cmd_lba, &cmd_len) != 0)
 		return (CTL_LUN_PAT_NONE);
 
 	/* A "pass" from the extent check means the ranges are disjoint. */
 	if (ctl_extent_check_lba(cmd_lba, cmd_len, desc->lba_range.lba,
 	    desc->lba_range.len) == CTL_ACTION_PASS)
 		return (CTL_LUN_PAT_NONE);
 
 	return (matched);
 }
 
 static void
 ctl_inject_error(struct ctl_lun *lun, union ctl_io *io)
 {
 	struct ctl_error_desc *desc, *desc2;
 
 	mtx_assert(&lun->lun_lock, MA_OWNED);
 
 	STAILQ_FOREACH_SAFE(desc, &lun->error_list, links, desc2) {
 		ctl_lun_error_pattern pattern;
 		/*
 		 * Check to see whether this particular command matches
 		 * the pattern in the descriptor.
 		 */
 		pattern = ctl_cmd_pattern_match(&io->scsiio, desc);
 		if ((pattern & CTL_LUN_PAT_MASK) == CTL_LUN_PAT_NONE)
 			continue;
 
 		switch (desc->lun_error & CTL_LUN_INJ_TYPE) {
 		case CTL_LUN_INJ_ABORTED:
 			ctl_set_aborted(&io->scsiio);
 			break;
 		case CTL_LUN_INJ_MEDIUM_ERR:
 			ctl_set_medium_error(&io->scsiio);
 			break;
 		case CTL_LUN_INJ_UA:
 			/* 29h/00h  POWER ON, RESET, OR BUS DEVICE RESET
 			 * OCCURRED */
 			ctl_set_ua(&io->scsiio, 0x29, 0x00);
 			break;
 		case CTL_LUN_INJ_CUSTOM:
 			/*
 			 * We're assuming the user knows what he is doing.
 			 * Just copy the sense information without doing
 			 * checks.
 			 */
 			bcopy(&desc->custom_sense, &io->scsiio.sense_data,
 			      ctl_min(sizeof(desc->custom_sense),
 				      sizeof(io->scsiio.sense_data)));
 			io->scsiio.scsi_status = SCSI_STATUS_CHECK_COND;
 			io->scsiio.sense_len = SSD_FULL_SIZE;
 			io->io_hdr.status = CTL_SCSI_ERROR | CTL_AUTOSENSE;
 			break;
 		case CTL_LUN_INJ_NONE:
 		default:
 			/*
 			 * If this is an error injection type we don't know
 			 * about, clear the continuous flag (if it is set)
 			 * so it will get deleted below.
 			 */
 			desc->lun_error &= ~CTL_LUN_INJ_CONTINUOUS;
 			break;
 		}
 		/*
 		 * By default, each error injection action is a one-shot
 		 */
 		if (desc->lun_error & CTL_LUN_INJ_CONTINUOUS)
 			continue;
 
 		STAILQ_REMOVE(&lun->error_list, desc, ctl_error_desc, links);
 
 		free(desc, M_CTL);
 	}
 }
 
 #ifdef CTL_IO_DELAY
 /*
  * Callout handler for a delayed data move: arg is the I/O, which is
  * passed to ctl_datamove() again.
  */
 static void
 ctl_datamove_timer_wakeup(void *arg)
 {
 
 	ctl_datamove((union ctl_io *)arg);
 }
 #endif /* CTL_IO_DELAY */
 
 /*
  * Start the data movement phase of an I/O.
  *
  * Must be called without the softc lock held.  Depending on build options
  * the call may first log slow I/O (CTL_TIME_IO) or re-schedule itself via
  * a callout to inject a delay (CTL_IO_DELAY).  An aborted I/O is failed
  * back to the backend through be_move_done().  In XFER HA mode an I/O that
  * came from the other controller has its S/G list sent to that controller
  * in one or more CTL_MSG_DATAMOVE messages; otherwise the front end's
  * fe_datamove() routine for the I/O's target port is called.
  */
 void
 ctl_datamove(union ctl_io *io)
 {
 	void (*fe_datamove)(union ctl_io *io);
 
 	mtx_assert(&control_softc->ctl_lock, MA_NOTOWNED);
 
 	CTL_DEBUG_PRINT(("ctl_datamove\n"));
 
 #ifdef CTL_TIME_IO
 	if ((time_uptime - io->io_hdr.start_time) > ctl_time_io_secs) {
 		char str[256];
 		char path_str[64];
 		struct sbuf sb;
 
 		ctl_scsi_path_string(io, path_str, sizeof(path_str));
 		sbuf_new(&sb, str, sizeof(str), SBUF_FIXEDLEN);
 
 		sbuf_cat(&sb, path_str);
 		switch (io->io_hdr.io_type) {
 		case CTL_IO_SCSI:
 			ctl_scsi_command_string(&io->scsiio, NULL, &sb);
 			sbuf_printf(&sb, "\n");
 			sbuf_cat(&sb, path_str);
 			sbuf_printf(&sb, "Tag: 0x%04x, type %d\n",
 				    io->scsiio.tag_num, io->scsiio.tag_type);
 			break;
 		case CTL_IO_TASK:
 			sbuf_printf(&sb, "Task I/O type: %d, Tag: 0x%04x, "
 				    "Tag Type: %d\n", io->taskio.task_action,
 				    io->taskio.tag_num, io->taskio.tag_type);
 			break;
 		default:
 			printf("Invalid CTL I/O type %d\n", io->io_hdr.io_type);
 			panic("Invalid CTL I/O type %d\n", io->io_hdr.io_type);
 			break;
 		}
 		sbuf_cat(&sb, path_str);
 		sbuf_printf(&sb, "ctl_datamove: %jd seconds\n",
 			    (intmax_t)time_uptime - io->io_hdr.start_time);
 		sbuf_finish(&sb);
 		printf("%s", sbuf_data(&sb));
 	}
 #endif /* CTL_TIME_IO */
 
 #ifdef CTL_IO_DELAY
 	if (io->io_hdr.flags & CTL_FLAG_DELAY_DONE) {
 		/* The delay already happened; carry on with the move. */
 		io->io_hdr.flags &= ~CTL_FLAG_DELAY_DONE;
 	} else {
 		struct ctl_lun *lun;
 
 		lun =(struct ctl_lun *)io->io_hdr.ctl_private[CTL_PRIV_LUN].ptr;
 		if ((lun != NULL)
 		 && (lun->delay_info.datamove_delay > 0)) {
 			struct callout *callout;
 
 			/*
 			 * Come back through ctl_datamove_timer_wakeup()
 			 * once the configured delay has elapsed.
 			 */
 			callout = (struct callout *)&io->io_hdr.timer_bytes;
 			callout_init(callout, /*mpsafe*/ 1);
 			io->io_hdr.flags |= CTL_FLAG_DELAY_DONE;
 			callout_reset(callout,
 				      lun->delay_info.datamove_delay * hz,
 				      ctl_datamove_timer_wakeup, io);
 			if (lun->delay_info.datamove_type ==
 			    CTL_DELAY_TYPE_ONESHOT)
 				lun->delay_info.datamove_delay = 0;
 			return;
 		}
 	}
 #endif
 
 	/*
 	 * This command has been aborted.  Set the port status, so we fail
 	 * the data move.
 	 */
 	if (io->io_hdr.flags & CTL_FLAG_ABORT) {
 		printf("ctl_datamove: tag 0x%04x on (%ju:%d:%ju:%d) aborted\n",
 		       io->scsiio.tag_num,(uintmax_t)io->io_hdr.nexus.initid.id,
 		       io->io_hdr.nexus.targ_port,
 		       (uintmax_t)io->io_hdr.nexus.targ_target.id,
 		       io->io_hdr.nexus.targ_lun);
 		io->io_hdr.port_status = 31337;
 		/*
 		 * Note that the backend, in this case, will get the
 		 * callback in its context.  In other cases it may get
 		 * called in the frontend's interrupt thread context.
 		 */
 		io->scsiio.be_move_done(io);
 		return;
 	}
 
 	/*
 	 * If we're in XFER mode and this I/O is from the other shelf
 	 * controller, we need to send the DMA to the other side to
 	 * actually transfer the data to/from the host.  In serialize only
 	 * mode the transfer happens below CTL and ctl_datamove() is only
 	 * called on the machine that originally received the I/O.
 	 */
 	if ((control_softc->ha_mode == CTL_HA_MODE_XFER)
 	 && (io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC)) {
 		union ctl_ha_msg msg;
 		uint32_t sg_entries_sent;
 		int do_sg_copy;
 		int i;
 
 		memset(&msg, 0, sizeof(msg));
 		msg.hdr.msg_type = CTL_MSG_DATAMOVE;
 		msg.hdr.original_sc = io->io_hdr.original_sc;
 		msg.hdr.serializing_sc = io;
 		msg.hdr.nexus = io->io_hdr.nexus;
 		msg.dt.flags = io->io_hdr.flags;
 		/*
 		 * We convert everything into a S/G list here.  We can't
 		 * pass by reference, only by value between controllers.
 		 * So we can't pass a pointer to the S/G list, only as many
 		 * S/G entries as we can fit in here.  If it's possible for
 		 * us to get more than CTL_HA_MAX_SG_ENTRIES S/G entries,
 		 * then we need to break this up into multiple transfers.
 		 */
 		if (io->scsiio.kern_sg_entries == 0) {
 			msg.dt.kern_sg_entries = 1;
 			/*
 			 * If this is in cached memory, flush the cache
 			 * before we send the DMA request to the other
 			 * controller.  We want to do this in either the
 			 * read or the write case.  The read case is
 			 * straightforward.  In the write case, we want to
 			 * make sure nothing is in the local cache that
 			 * could overwrite the DMAed data.
 			 */
 			if ((io->io_hdr.flags & CTL_FLAG_NO_DATASYNC) == 0) {
 				/*
 				 * XXX KDM use bus_dmamap_sync() here.
 				 */
 			}
 
 			/*
 			 * Convert to a physical address if this is a
 			 * virtual address.
 			 */
 			if (io->io_hdr.flags & CTL_FLAG_BUS_ADDR) {
 				msg.dt.sg_list[0].addr =
 					io->scsiio.kern_data_ptr;
 			} else {
 				/*
 				 * XXX KDM use busdma here!
 				 */
 #if 0
 				msg.dt.sg_list[0].addr = (void *)
 					vtophys(io->scsiio.kern_data_ptr);
 #endif
 			}
 
 			msg.dt.sg_list[0].len = io->scsiio.kern_data_len;
 			do_sg_copy = 0;
 		} else {
 			do_sg_copy = 1;
 			msg.dt.kern_sg_entries = io->scsiio.kern_sg_entries;
 			if ((io->io_hdr.flags & CTL_FLAG_NO_DATASYNC) == 0) {
 				/*
 				 * XXX KDM use bus_dmamap_sync() here.
 				 */
 			}
 		}
 
 		msg.dt.kern_data_len = io->scsiio.kern_data_len;
 		msg.dt.kern_total_len = io->scsiio.kern_total_len;
 		msg.dt.kern_data_resid = io->scsiio.kern_data_resid;
 		msg.dt.kern_rel_offset = io->scsiio.kern_rel_offset;
 		msg.dt.sg_sequence = 0;
 
 		/*
 		 * Loop until we've sent all of the S/G entries.  Each pass
 		 * sends one message holding up to as many entries as fit in
 		 * msg.dt.sg_list; the other end is expected to put the
 		 * pieces back together.
 		 */
 		for (sg_entries_sent = 0; sg_entries_sent <
 		     msg.dt.kern_sg_entries; msg.dt.sg_sequence++) {
 			msg.dt.cur_sg_entries = ctl_min((sizeof(msg.dt.sg_list)/
 				sizeof(msg.dt.sg_list[0])),
 				msg.dt.kern_sg_entries - sg_entries_sent);
 
 			if (do_sg_copy != 0) {
 				struct ctl_sg_entry *sgl;
 				int j;
 
 				sgl = (struct ctl_sg_entry *)
 					io->scsiio.kern_data_ptr;
 				/*
 				 * If this is in cached memory, flush the cache
 				 * before we send the DMA request to the other
 				 * controller.  We want to do this in either
 				 * the read or the write case.  The read
 				 * case is straightforward.  In the write
 				 * case, we want to make sure nothing is
 				 * in the local cache that could overwrite
 				 * the DMAed data.
 				 */
 
 				/*
 				 * Copy this batch: i walks the source list
 				 * starting where the previous batch stopped,
 				 * j walks the message's list from 0.  The
 				 * bound is on j so that every batch, not
 				 * only the first, copies cur_sg_entries
 				 * entries.
 				 */
 				for (i = sg_entries_sent, j = 0;
 				     j < msg.dt.cur_sg_entries; i++, j++) {
 					if ((io->io_hdr.flags &
 					     CTL_FLAG_NO_DATASYNC) == 0) {
 						/*
 						 * XXX KDM use bus_dmamap_sync()
 						 */
 					}
 					if ((io->io_hdr.flags &
 					     CTL_FLAG_BUS_ADDR) == 0) {
 						/*
 						 * XXX KDM use busdma.
 						 */
 #if 0
 						msg.dt.sg_list[j].addr =(void *)
 						       vtophys(sgl[i].addr);
 #endif
 					} else {
 						msg.dt.sg_list[j].addr =
 							sgl[i].addr;
 					}
 					msg.dt.sg_list[j].len = sgl[i].len;
 				}
 			}
 
 			sg_entries_sent += msg.dt.cur_sg_entries;
 			if (sg_entries_sent >= msg.dt.kern_sg_entries)
 				msg.dt.sg_last = 1;
 			else
 				msg.dt.sg_last = 0;
 
 			/*
 			 * XXX KDM drop and reacquire the lock here?
 			 */
 			if (ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg,
 			    sizeof(msg), 0) > CTL_HA_STATUS_SUCCESS) {
 				/*
 				 * XXX do something here.
 				 */
 			}
 
 			/*
 			 * Updated after the send, so each message carries
 			 * the number of entries sent before it.
 			 */
 			msg.dt.sent_sg_entries = sg_entries_sent;
 		}
 		io->io_hdr.flags &= ~CTL_FLAG_IO_ACTIVE;
 		if (io->io_hdr.flags & CTL_FLAG_FAILOVER)
 			ctl_failover_io(io, /*have_lock*/ 0);
 
 	} else {
 
 		/*
 		 * Lookup the fe_datamove() function for this particular
 		 * front end.
 		 */
 		fe_datamove =
 		    control_softc->ctl_ports[ctl_port_idx(io->io_hdr.nexus.targ_port)]->fe_datamove;
 
 		fe_datamove(io);
 	}
 }
 
 static void
 ctl_send_datamove_done(union ctl_io *io, int have_lock)
 {
 	union ctl_ha_msg msg;
 	int isc_status;
 
 	memset(&msg, 0, sizeof(msg));
 
 	msg.hdr.msg_type = CTL_MSG_DATAMOVE_DONE;
 	msg.hdr.original_sc = io;
 	msg.hdr.serializing_sc = io->io_hdr.serializing_sc;
 	msg.hdr.nexus = io->io_hdr.nexus;
 	msg.hdr.status = io->io_hdr.status;
 	msg.scsi.tag_num = io->scsiio.tag_num;
 	msg.scsi.tag_type = io->scsiio.tag_type;
 	msg.scsi.scsi_status = io->scsiio.scsi_status;
 	memcpy(&msg.scsi.sense_data, &io->scsiio.sense_data,
 	       sizeof(io->scsiio.sense_data));
 	msg.scsi.sense_len = io->scsiio.sense_len;
 	msg.scsi.sense_residual = io->scsiio.sense_residual;
 	msg.scsi.fetd_status = io->io_hdr.port_status;
 	msg.scsi.residual = io->scsiio.residual;
 	io->io_hdr.flags &= ~CTL_FLAG_IO_ACTIVE;
 
 	if (io->io_hdr.flags & CTL_FLAG_FAILOVER) {
 		ctl_failover_io(io, /*have_lock*/ have_lock);
 		return;
 	}
 
 	isc_status = ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg, sizeof(msg), 0);
 	if (isc_status > CTL_HA_STATUS_SUCCESS) {
 		/* XXX do something if this fails */
 	}
 
 }
 
 /*
  * The DMA to the remote side is done, now we need to tell the other side
  * we're done so it can continue with its data movement.
  */
 static void
 ctl_datamove_remote_write_cb(struct ctl_ha_dt_req *rq)
 {
 	union ctl_io *io;
 
 	io = rq->context;
 
 	if (rq->ret != CTL_HA_STATUS_SUCCESS) {
 		printf("%s: ISC DMA write failed with error %d", __func__,
 		       rq->ret);
 		ctl_set_internal_failure(&io->scsiio,
 					 /*sks_valid*/ 1,
 					 /*retry_count*/ rq->ret);
 	}
 
 	ctl_dt_req_free(rq);
 
 	/*
 	 * In this case, we had to malloc the memory locally.  Free it.
 	 */
 	if ((io->io_hdr.flags & CTL_FLAG_AUTO_MIRROR) == 0) {
 		int i;
 		for (i = 0; i < io->scsiio.kern_sg_entries; i++)
 			free(io->io_hdr.local_sglist[i].addr, M_CTL);
 	}
 	/*
 	 * The data is in local and remote memory, so now we need to send
 	 * status (good or back) back to the other side.
 	 */
 	ctl_send_datamove_done(io, /*have_lock*/ 0);
 }
 
 /*
  * Move-done callback for a remote write: the data has arrived in local
  * memory from the host/controller, so start the transfer that pushes it
  * to the remote controller's memory.  Returns the result of starting
  * that transfer.
  */
 static int
 ctl_datamove_remote_dm_write_cb(union ctl_io *io)
 {
 
 	return (ctl_datamove_remote_xfer(io, CTL_HA_DT_CMD_WRITE,
 	    ctl_datamove_remote_write_cb));
 }
 
 /*
  * Start a write on behalf of the other controller.  The sequence is:
  *
  * - Get the data from the host/HBA into local memory.
  * - DMA memory from the local controller to the remote controller.
  * - Send status back to the remote controller.
  *
  * This routine performs the first step; the rest is driven from the
  * move-done callback installed here.  Nothing is done when the S/G list
  * setup fails.
  */
 static void
 ctl_datamove_remote_write(union ctl_io *io)
 {
 	void (*port_datamove)(union ctl_io *io);
 
 	if (ctl_datamove_remote_sgl_setup(io) != 0)
 		return;
 
 	/* Point the FETD at the local S/G list. */
 	io->scsiio.kern_data_ptr = (uint8_t *)io->io_hdr.local_sglist;
 
 	/*
 	 * Completion has to go back to the other controller rather than to
 	 * the backend on this side, hence the custom move-done callback.
 	 */
 	io->scsiio.be_move_done = ctl_datamove_remote_dm_write_cb;
 
 	port_datamove = control_softc->ctl_ports[ctl_port_idx(io->io_hdr.nexus.targ_port)]->fe_datamove;
 	port_datamove(io);
 }
 
 /*
  * Move-done callback for a remote read: release the locally allocated
  * S/G buffers (unless CTL_FLAG_AUTO_MIRROR is set) and report completion
  * to the other controller.  Always returns 0.
  */
 static int
 ctl_datamove_remote_dm_read_cb(union ctl_io *io)
 {
 #if 0
 	char str[256];
 	char path_str[64];
 	struct sbuf sb;
 #endif
 	int idx;
 
 	/* The buffers were malloc'ed locally in this case; free them. */
 	if ((io->io_hdr.flags & CTL_FLAG_AUTO_MIRROR) == 0) {
 		idx = 0;
 		while (idx < io->scsiio.kern_sg_entries) {
 			free(io->io_hdr.local_sglist[idx].addr, M_CTL);
 			idx++;
 		}
 	}
 
 #if 0
 	scsi_path_string(io, path_str, sizeof(path_str));
 	sbuf_new(&sb, str, sizeof(str), SBUF_FIXEDLEN);
 	sbuf_cat(&sb, path_str);
 	scsi_command_string(&io->scsiio, NULL, &sb);
 	sbuf_printf(&sb, "\n");
 	sbuf_cat(&sb, path_str);
 	sbuf_printf(&sb, "Tag: 0x%04x, type %d\n",
 		    io->scsiio.tag_num, io->scsiio.tag_type);
 	sbuf_cat(&sb, path_str);
 	sbuf_printf(&sb, "%s: flags %#x, status %#x\n", __func__,
 		    io->io_hdr.flags, io->io_hdr.status);
 	sbuf_finish(&sb);
 	printk("%s", sbuf_data(&sb));
 #endif
 
 	/* Report the outcome, good or bad, to the other side. */
 	ctl_send_datamove_done(io, /*have_lock*/ 0);
 
 	return (0);
 }
 
 /*
  * Completion callback for the DMA that pulls read data from the remote
  * controller into local memory.
  *
  * rq - the finished data-transfer request; rq->context is the I/O and
  *      rq->ret the transfer status.  The request is freed here.
  *
  * On a transfer error the I/O is marked with an internal failure (the
  * error code is carried in the sense-key-specific field).  Either way the
  * I/O is then handed to the frontend port's fe_datamove routine, with
  * ctl_datamove_remote_dm_read_cb() installed as the move-done callback.
  */
 static void
 ctl_datamove_remote_read_cb(struct ctl_ha_dt_req *rq)
 {
 	union ctl_io *io;
 	void (*fe_datamove)(union ctl_io *io);
 
 	io = rq->context;
 
 	if (rq->ret != CTL_HA_STATUS_SUCCESS) {
 		/* Terminate the line so it does not run into the next message */
 		printf("%s: ISC DMA read failed with error %d\n", __func__,
 		       rq->ret);
 		ctl_set_internal_failure(&io->scsiio,
 					 /*sks_valid*/ 1,
 					 /*retry_count*/ rq->ret);
 	}
 
 	ctl_dt_req_free(rq);
 
 	/* Switch the pointer over so the FETD knows what to do */
 	io->scsiio.kern_data_ptr = (uint8_t *)io->io_hdr.local_sglist;
 
 	/*
 	 * Use a custom move done callback, since we need to send completion
 	 * back to the other controller, not to the backend on this side.
 	 */
 	io->scsiio.be_move_done = ctl_datamove_remote_dm_read_cb;
 
 	/* XXX KDM add checks like the ones in ctl_datamove? */
 
 	fe_datamove = control_softc->ctl_ports[ctl_port_idx(io->io_hdr.nexus.targ_port)]->fe_datamove;
 
 	fe_datamove(io);
 }
 
 /*
  * Prepare io->io_hdr.local_sglist (and local_dma_sglist) for a datamove
  * requested by the other controller.
  *
  * With CTL_FLAG_AUTO_MIRROR set, every local entry takes its length from
  * the matching remote entry (the address mapping itself is only compiled
  * under NEEDTOPORT).  Otherwise buffers of at most 131072 bytes each are
  * malloc'd until kern_data_len is covered or the remote_sglist array size
  * is reached, and kern_sg_entries is reset to the number of local entries
  * that were created.
  *
  * Returns 0 on success.  On allocation failure the I/O is marked with an
  * internal failure, ctl_send_datamove_done() is called and 1 is returned.
  */
 static int
 ctl_datamove_remote_sgl_setup(union ctl_io *io)
 {
 	struct ctl_sg_entry *local_sglist, *remote_sglist;
 	struct ctl_sg_entry *local_dma_sglist, *remote_dma_sglist;
 	struct ctl_softc *softc;
 	int retval;
 	int i;
 
 	retval = 0;
 	softc = control_softc;
 
 	local_sglist = io->io_hdr.local_sglist;
 	local_dma_sglist = io->io_hdr.local_dma_sglist;
 	remote_sglist = io->io_hdr.remote_sglist;
 	remote_dma_sglist = io->io_hdr.remote_dma_sglist;
 
 	if (io->io_hdr.flags & CTL_FLAG_AUTO_MIRROR) {
 		/* Mirror memory case: lengths are copied one-to-one. */
 		for (i = 0; i < io->scsiio.kern_sg_entries; i++) {
 			local_sglist[i].len = remote_sglist[i].len;
 
 			/*
 			 * XXX Detect the situation where the RS-level I/O
 			 * redirector on the other side has already read the
 			 * data off of the AOR RS on this side, and
 			 * transferred it to remote (mirror) memory on the
 			 * other side.  Since we already have the data in
 			 * memory here, we just need to use it.
 			 *
 			 * XXX KDM this can probably be removed once we
 			 * get the cache device code in and take the
 			 * current AOR implementation out.
 			 */
 #ifdef NEEDTOPORT
 			if ((remote_sglist[i].addr >=
 			     (void *)vtophys(softc->mirr->addr))
 			 && (remote_sglist[i].addr <
 			     ((void *)vtophys(softc->mirr->addr) +
 			     CacheMirrorOffset))) {
 				local_sglist[i].addr = remote_sglist[i].addr -
 					CacheMirrorOffset;
 				if ((io->io_hdr.flags & CTL_FLAG_DATA_MASK) ==
 				     CTL_FLAG_DATA_IN)
 					io->io_hdr.flags |= CTL_FLAG_REDIR_DONE;
 			} else {
 				local_sglist[i].addr = remote_sglist[i].addr +
 					CacheMirrorOffset;
 			}
 #endif
 #if 0
 			printf("%s: local %p, remote %p, len %d\n",
 			       __func__, local_sglist[i].addr,
 			       remote_sglist[i].addr, local_sglist[i].len);
 #endif
 		}
 	} else {
 		uint32_t len_to_go;
 
 		/*
 		 * In this case, we don't have automatically allocated
 		 * memory for this I/O on this controller.  This typically
 		 * happens with internal CTL I/O -- e.g. inquiry, mode
 		 * sense, etc.  Anything coming from RAIDCore will have
 		 * a mirror area available.
 		 */
 		len_to_go = io->scsiio.kern_data_len;
 
 		/*
 		 * Clear the no datasync flag, we have to use malloced
 		 * buffers.
 		 */
 		io->io_hdr.flags &= ~CTL_FLAG_NO_DATASYNC;
 
 		/*
 		 * The difficult thing here is that the size of the various
 		 * S/G segments may be different than the size from the
 		 * remote controller.  That'll make it harder when DMAing
 		 * the data back to the other side.
 		 */
 		/*
 		 * The entry count is bounded by the number of elements in
 		 * the remote_sglist array, not by kern_sg_entries.
 		 */
 		for (i = 0; (i < sizeof(io->io_hdr.remote_sglist) /
 		     sizeof(io->io_hdr.remote_sglist[0])) &&
 		     (len_to_go > 0); i++) {
 			local_sglist[i].len = ctl_min(len_to_go, 131072);
 			/*
 			 * The DMA length is derived from the data length by
 			 * CTL_SIZE_8B and is the size actually allocated.
 			 */
 			CTL_SIZE_8B(local_dma_sglist[i].len,
 				    local_sglist[i].len);
 			local_sglist[i].addr =
 				malloc(local_dma_sglist[i].len, M_CTL,M_WAITOK);
 
 			local_dma_sglist[i].addr = local_sglist[i].addr;
 
 			/*
 			 * NOTE(review): M_WAITOK requests normally sleep
 			 * rather than fail, so this branch is presumably
 			 * never taken -- confirm.
 			 */
 			if (local_sglist[i].addr == NULL) {
 				int j;
 
 				printf("malloc failed for %zd bytes!",
 				       local_dma_sglist[i].len);
 				/* Undo the allocations made so far. */
 				for (j = 0; j < i; j++) {
 					free(local_sglist[j].addr, M_CTL);
 				}
 				ctl_set_internal_failure(&io->scsiio,
 							 /*sks_valid*/ 1,
 							 /*retry_count*/ 4857);
 				retval = 1;
 				goto bailout_error;
 				
 			}
 			/* XXX KDM do we need a sync here? */
 
 			len_to_go -= local_sglist[i].len;
 		}
 		/*
 		 * Reset the number of S/G entries accordingly.  The
 		 * original number of S/G entries is available in
 		 * rem_sg_entries.
 		 */
 		io->scsiio.kern_sg_entries = i;
 
 #if 0
 		printf("%s: kern_sg_entries = %d\n", __func__,
 		       io->scsiio.kern_sg_entries);
 		for (i = 0; i < io->scsiio.kern_sg_entries; i++)
 			printf("%s: sg[%d] = %p, %d (DMA: %d)\n", __func__, i,
 			       local_sglist[i].addr, local_sglist[i].len,
 			       local_dma_sglist[i].len);
 #endif
 	}
 
 
 	return (retval);
 
 bailout_error:
 
 	/* Tell the other controller the datamove failed. */
 	ctl_send_datamove_done(io, /*have_lock*/ 0);
 
 	return (retval);
 }
 
 /*
  * Move data between the local S/G list and the remote controller's S/G
  * list, issuing one ctl_dt_single() request per chunk.
  *
  * io       - I/O whose local_sglist/remote_sglist describe the transfer.
  * command  - transfer command stored in each request (callers in this
  *            file pass CTL_HA_DT_CMD_READ or CTL_HA_DT_CMD_WRITE).
  * callback - installed as rq->callback for the final chunk, and called
  *            directly when CTL_FLAG_REDIR_DONE is set or when
  *            ctl_dt_single() returns anything but CTL_HA_STATUS_WAIT.
  *
  * Returns 0 normally.  Returns 1 when the transfer is not attempted
  * because the I/O already has a status set, or because the request could
  * not be allocated (busy status is set in that case); then
  * ctl_send_datamove_done() has already been called.
  */
 static int
 ctl_datamove_remote_xfer(union ctl_io *io, unsigned command,
 			 ctl_ha_dt_cb callback)
 {
 	struct ctl_ha_dt_req *rq;
 	struct ctl_sg_entry *remote_sglist, *local_sglist;
 	struct ctl_sg_entry *remote_dma_sglist, *local_dma_sglist;
 	uint32_t local_used, remote_used, total_used;
 	int retval;
 	int i, j;
 
 	retval = 0;
 
 	rq = ctl_dt_req_alloc();
 
 	/*
 	 * If we failed to allocate the request, and if the DMA didn't fail
 	 * anyway, set busy status.  This is just a resource allocation
 	 * failure.
 	 *
 	 * The status test must be "== CTL_STATUS_NONE": only an I/O that
 	 * has no error yet gets busy status.  That also guarantees the
 	 * check below bails out instead of dereferencing a NULL request.
 	 */
 	if ((rq == NULL)
 	 && ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE))
 		ctl_set_busy(&io->scsiio);
 
 	if ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE) {
 
 		if (rq != NULL)
 			ctl_dt_req_free(rq);
 
 		/*
 		 * The data move failed.  We need to return status back
 		 * to the other controller.  No point in trying to DMA
 		 * data to the remote controller.
 		 */
 
 		ctl_send_datamove_done(io, /*have_lock*/ 0);
 
 		retval = 1;
 
 		goto bailout;
 	}
 
 	local_sglist = io->io_hdr.local_sglist;
 	local_dma_sglist = io->io_hdr.local_dma_sglist;
 	remote_sglist = io->io_hdr.remote_sglist;
 	remote_dma_sglist = io->io_hdr.remote_dma_sglist;
 	local_used = 0;
 	remote_used = 0;
 	total_used = 0;
 
 	/*
 	 * The data is already in place; report success through the
 	 * callback without touching the wire.
 	 */
 	if (io->io_hdr.flags & CTL_FLAG_REDIR_DONE) {
 		rq->ret = CTL_HA_STATUS_SUCCESS;
 		rq->context = io;
 		callback(rq);
 		goto bailout;
 	}
 
 	/*
 	 * Pull/push the data over the wire from/to the other controller.
 	 * This takes into account the possibility that the local and
 	 * remote sglists may not be identical in terms of the size of
 	 * the elements and the number of elements.
 	 *
 	 * One fundamental assumption here is that the length allocated for
 	 * both the local and remote sglists is identical.  Otherwise, we've
 	 * essentially got a coding error of some sort.
 	 */
 	for (i = 0, j = 0; total_used < io->scsiio.kern_data_len; ) {
 		int isc_ret;
 		uint32_t cur_len, dma_length;
 		uint8_t *tmp_ptr;
 
 		rq->id = CTL_HA_DATA_CTL;
 		rq->command = command;
 		rq->context = io;
 
 		/*
 		 * Both pointers should be aligned.  But it is possible
 		 * that the allocation length is not.  They should both
 		 * also have enough slack left over at the end, though,
 		 * to round up to the next 8 byte boundary.
 		 */
 		cur_len = ctl_min(local_sglist[i].len - local_used,
 				  remote_sglist[j].len - remote_used);
 
 		/*
 		 * In this case, we have a size issue and need to decrease
 		 * the size, except in the case where we actually have less
 		 * than 8 bytes left.  In that case, we need to increase
 		 * the DMA length to get the last bit.
 		 */
 		if ((cur_len & 0x7) != 0) {
 			if (cur_len > 0x7) {
 				cur_len = cur_len - (cur_len & 0x7);
 				dma_length = cur_len;
 			} else {
 				CTL_SIZE_8B(dma_length, cur_len);
 			}
 
 		} else
 			dma_length = cur_len;
 
 		/*
 		 * If we had to allocate memory for this I/O, instead of using
 		 * the non-cached mirror memory, we'll need to flush the cache
 		 * before trying to DMA to the other controller.
 		 *
 		 * We could end up doing this multiple times for the same
 		 * segment if we have a larger local segment than remote
 		 * segment.  That shouldn't be an issue.
 		 */
 		if ((io->io_hdr.flags & CTL_FLAG_NO_DATASYNC) == 0) {
 			/*
 			 * XXX KDM use bus_dmamap_sync() here.
 			 */
 		}
 
 		rq->size = dma_length;
 
 		tmp_ptr = (uint8_t *)local_sglist[i].addr;
 		tmp_ptr += local_used;
 
 		/* Use physical addresses when talking to ISC hardware */
 		if ((io->io_hdr.flags & CTL_FLAG_BUS_ADDR) == 0) {
 			/* XXX KDM use busdma */
 #if 0
 			rq->local = vtophys(tmp_ptr);
 #endif
 		} else
 			rq->local = tmp_ptr;
 
 		tmp_ptr = (uint8_t *)remote_sglist[j].addr;
 		tmp_ptr += remote_used;
 		rq->remote = tmp_ptr;
 
 		rq->callback = NULL;
 
 		/* Advance to the next local/remote segment once one is used up. */
 		local_used += cur_len;
 		if (local_used >= local_sglist[i].len) {
 			i++;
 			local_used = 0;
 		}
 
 		remote_used += cur_len;
 		if (remote_used >= remote_sglist[j].len) {
 			j++;
 			remote_used = 0;
 		}
 		total_used += cur_len;
 
 		/* Only the final chunk carries the completion callback. */
 		if (total_used >= io->scsiio.kern_data_len)
 			rq->callback = callback;
 
 		if ((rq->size & 0x7) != 0) {
 			printf("%s: warning: size %d is not on 8b boundary\n",
 			       __func__, rq->size);
 		}
 		if (((uintptr_t)rq->local & 0x7) != 0) {
 			printf("%s: warning: local %p not on 8b boundary\n",
 			       __func__, rq->local);
 		}
 		if (((uintptr_t)rq->remote & 0x7) != 0) {
 			/* Report the remote pointer, the one that was tested. */
 			printf("%s: warning: remote %p not on 8b boundary\n",
 			       __func__, rq->remote);
 		}
 #if 0
 		printf("%s: %s: local %#x remote %#x size %d\n", __func__,
 		       (command == CTL_HA_DT_CMD_WRITE) ? "WRITE" : "READ",
 		       rq->local, rq->remote, rq->size);
 #endif
 
 		isc_ret = ctl_dt_single(rq);
 		if (isc_ret == CTL_HA_STATUS_WAIT)
 			continue;
 
 		/*
 		 * Any other return ends the transfer here: a disconnect is
 		 * reported to the callback as success, everything else is
 		 * passed through as-is.
 		 */
 		if (isc_ret == CTL_HA_STATUS_DISCONNECT) {
 			rq->ret = CTL_HA_STATUS_SUCCESS;
 		} else {
 			rq->ret = isc_ret;
 		}
 		callback(rq);
 		goto bailout;
 	}
 
 bailout:
 	return (retval);
 
 }
 
 /*
  * Handle a remote datamove in the read direction: set up the local S/G
  * list and start the transfer that pulls the data from the remote
  * controller.  The rest of the work happens in
  * ctl_datamove_remote_read_cb().
  */
 static void
 ctl_datamove_remote_read(union ctl_io *io)
 {
 	int sg;
 
 	/* A setup failure has already been reported to the other side. */
 	if (ctl_datamove_remote_sgl_setup(io) != 0)
 		return;
 
 	if (ctl_datamove_remote_xfer(io, CTL_HA_DT_CMD_READ,
 	    ctl_datamove_remote_read_cb) == 0)
 		return;
 
 	/*
 	 * The transfer routine has already sent the datamove done message
 	 * (or run the callback with an error).  All that is left is to
 	 * release buffers that were malloc'd locally, which is only the
 	 * case when auto-mirror memory was not used.
 	 */
 	if ((io->io_hdr.flags & CTL_FLAG_AUTO_MIRROR) != 0)
 		return;
 
 	for (sg = 0; sg < io->scsiio.kern_sg_entries; sg++)
 		free(io->io_hdr.local_sglist[sg].addr, M_CTL);
 }
 
 /*
  * Process a datamove request from the other controller.  This is used for
  * XFER mode only, not SER_ONLY mode.  For writes, we DMA into local memory
  * first.  Once that is complete, the data gets DMAed into the remote
  * controller's memory.  For reads, we DMA from the remote controller's
  * memory into our memory first, and then move it out to the FETD.
  */
 static void
 ctl_datamove_remote(union ctl_io *io)
 {
 	struct ctl_softc *softc;
 
 	softc = control_softc;
 
 	mtx_assert(&softc->ctl_lock, MA_NOTOWNED);
 
 	/*
 	 * Note that we look for an aborted I/O here, but don't do some of
 	 * the other checks that ctl_datamove() normally does.
 	 * We don't need to run the datamove delay code, since that should
 	 * have been done if need be on the other controller.
 	 */
 	if (io->io_hdr.flags & CTL_FLAG_ABORT) {
 		printf("%s: tag 0x%04x on (%d:%d:%d:%d) aborted\n", __func__,
 		       io->scsiio.tag_num, io->io_hdr.nexus.initid.id,
 		       io->io_hdr.nexus.targ_port,
 		       io->io_hdr.nexus.targ_target.id,
 		       io->io_hdr.nexus.targ_lun);
 		io->io_hdr.port_status = 31338;
 		ctl_send_datamove_done(io, /*have_lock*/ 0);
 		return;
 	}
 
 	if ((io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_OUT) {
 		ctl_datamove_remote_write(io);
 	} else if ((io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_IN){
 		ctl_datamove_remote_read(io);
 	} else {
 		union ctl_ha_msg msg;
 		struct scsi_sense_data *sense;
 		uint8_t sks[3];
 		int retry_count;
 
 		memset(&msg, 0, sizeof(msg));
 
 		msg.hdr.msg_type = CTL_MSG_BAD_JUJU;
 		msg.hdr.status = CTL_SCSI_ERROR;
 		msg.scsi.scsi_status = SCSI_STATUS_CHECK_COND;
 
 		retry_count = 4243;
 
 		sense = &msg.scsi.sense_data;
 		sks[0] = SSD_SCS_VALID;
 		sks[1] = (retry_count >> 8) & 0xff;
 		sks[2] = retry_count & 0xff;
 
 		/* "Internal target failure" */
 		scsi_set_sense_data(sense,
 				    /*sense_format*/ SSD_TYPE_NONE,
 				    /*current_error*/ 1,
 				    /*sense_key*/ SSD_KEY_HARDWARE_ERROR,
 				    /*asc*/ 0x44,
 				    /*ascq*/ 0x00,
 				    /*type*/ SSD_ELEM_SKS,
 				    /*size*/ sizeof(sks),
 				    /*data*/ sks,
 				    SSD_ELEM_NONE);
 
 		io->io_hdr.flags &= ~CTL_FLAG_IO_ACTIVE;
 		if (io->io_hdr.flags & CTL_FLAG_FAILOVER) {
 			ctl_failover_io(io, /*have_lock*/ 1);
 			return;
 		}
 
 		if (ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg, sizeof(msg), 0) >
 		    CTL_HA_STATUS_SUCCESS) {
 			/* XXX KDM what to do if this fails? */
 		}
 		return;
 	}
 	
 }
 
 /*
  * Final completion processing for an I/O taken off a worker thread's done
  * queue.
  *
  * Task management I/O is finished right away: it is either freed (when it
  * came from the other controller) or handed to the frontend's fe_done.
  * For SCSI I/O this routine, in order: optionally injects errors, updates
  * the per-port LUN statistics for successful commands, removes the I/O
  * from the LUN's OOA queue, re-evaluates blocked commands, frees an
  * invalidated LUN whose OOA queue has drained, sets aborted status if the
  * command was aborted, prints errors (rate limited, unit attentions
  * filtered), and finally either sends CTL_MSG_FINISH_IO to the other
  * controller (XFER mode, I/O from the other controller) or calls fe_done.
  *
  * Always returns CTL_RETVAL_COMPLETE.
  */
 static int
 ctl_process_done(union ctl_io *io)
 {
 	struct ctl_lun *lun;
 	struct ctl_softc *ctl_softc;
 	void (*fe_done)(union ctl_io *io);
 	uint32_t targ_port = ctl_port_idx(io->io_hdr.nexus.targ_port);
 
 	CTL_DEBUG_PRINT(("ctl_process_done\n"));
 
 	fe_done =
 	    control_softc->ctl_ports[targ_port]->fe_done;
 
 #ifdef CTL_TIME_IO
 	/* Log commands that took longer than ctl_time_io_secs to finish. */
 	if ((time_uptime - io->io_hdr.start_time) > ctl_time_io_secs) {
 		char str[256];
 		char path_str[64];
 		struct sbuf sb;
 
 		ctl_scsi_path_string(io, path_str, sizeof(path_str));
 		sbuf_new(&sb, str, sizeof(str), SBUF_FIXEDLEN);
 
 		sbuf_cat(&sb, path_str);
 		switch (io->io_hdr.io_type) {
 		case CTL_IO_SCSI:
 			ctl_scsi_command_string(&io->scsiio, NULL, &sb);
 			sbuf_printf(&sb, "\n");
 			sbuf_cat(&sb, path_str);
 			sbuf_printf(&sb, "Tag: 0x%04x, type %d\n",
 				    io->scsiio.tag_num, io->scsiio.tag_type);
 			break;
 		case CTL_IO_TASK:
 			sbuf_printf(&sb, "Task I/O type: %d, Tag: 0x%04x, "
 				    "Tag Type: %d\n", io->taskio.task_action,
 				    io->taskio.tag_num, io->taskio.tag_type);
 			break;
 		default:
 			printf("Invalid CTL I/O type %d\n", io->io_hdr.io_type);
 			panic("Invalid CTL I/O type %d\n", io->io_hdr.io_type);
 			break;
 		}
 		sbuf_cat(&sb, path_str);
 		sbuf_printf(&sb, "ctl_process_done: %jd seconds\n",
 			    (intmax_t)time_uptime - io->io_hdr.start_time);
 		sbuf_finish(&sb);
 		printf("%s", sbuf_data(&sb));
 	}
 #endif /* CTL_TIME_IO */
 
 	/* Task management I/O is completed here and never goes further. */
 	switch (io->io_hdr.io_type) {
 	case CTL_IO_SCSI:
 		break;
 	case CTL_IO_TASK:
 		if (bootverbose || verbose > 0)
 			ctl_io_error_print(io, NULL);
 		if (io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC)
 			ctl_free_io(io);
 		else
 			fe_done(io);
 		return (CTL_RETVAL_COMPLETE);
 		break;
 	default:
 		printf("ctl_process_done: invalid io type %d\n",
 		       io->io_hdr.io_type);
 		panic("ctl_process_done: invalid io type %d\n",
 		      io->io_hdr.io_type);
 		break; /* NOTREACHED */
 	}
 
 	/* Without a LUN there is no per-LUN state to update; just complete. */
 	lun = (struct ctl_lun *)io->io_hdr.ctl_private[CTL_PRIV_LUN].ptr;
 	if (lun == NULL) {
 		CTL_DEBUG_PRINT(("NULL LUN for lun %d\n",
 				 io->io_hdr.nexus.targ_mapped_lun));
 		fe_done(io);
 		goto bailout;
 	}
 	ctl_softc = lun->ctl_softc;
 
 	mtx_lock(&lun->lun_lock);
 
 	/*
 	 * Check to see if we have any errors to inject here.  We only
 	 * inject errors for commands that don't already have errors set.
 	 */
 	if ((STAILQ_FIRST(&lun->error_list) != NULL)
 	 && ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS))
 		ctl_inject_error(lun, io);
 
 	/*
 	 * XXX KDM how do we treat commands that aren't completed
 	 * successfully?
 	 *
 	 * XXX KDM should we also track I/O latency?
 	 */
 	if ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS &&
 	    io->io_hdr.io_type == CTL_IO_SCSI) {
 #ifdef CTL_TIME_IO
 		struct bintime cur_bt;
 #endif
 		int type;
 
 		/* Pick the statistics bucket from the data direction. */
 		if ((io->io_hdr.flags & CTL_FLAG_DATA_MASK) ==
 		    CTL_FLAG_DATA_IN)
 			type = CTL_STATS_READ;
 		else if ((io->io_hdr.flags & CTL_FLAG_DATA_MASK) ==
 		    CTL_FLAG_DATA_OUT)
 			type = CTL_STATS_WRITE;
 		else
 			type = CTL_STATS_NO_IO;
 
 		lun->stats.ports[targ_port].bytes[type] +=
 		    io->scsiio.kern_total_len;
 		lun->stats.ports[targ_port].operations[type]++;
 #ifdef CTL_TIME_IO
 		bintime_add(&lun->stats.ports[targ_port].dma_time[type],
 		   &io->io_hdr.dma_bt);
 		lun->stats.ports[targ_port].num_dmas[type] +=
 		    io->io_hdr.num_dmas;
 		getbintime(&cur_bt);
 		bintime_sub(&cur_bt, &io->io_hdr.start_bt);
 		bintime_add(&lun->stats.ports[targ_port].time[type], &cur_bt);
 #endif
 	}
 
 	/*
 	 * Remove this from the OOA queue.
 	 */
 	TAILQ_REMOVE(&lun->ooa_queue, &io->io_hdr, ooa_links);
 
 	/*
 	 * Run through the blocked queue on this LUN and see if anything
 	 * has become unblocked, now that this transaction is done.
 	 */
 	ctl_check_blocked(lun);
 
 	/*
 	 * If the LUN has been invalidated, free it if there is nothing
 	 * left on its OOA queue.
 	 */
 	if ((lun->flags & CTL_LUN_INVALID)
 	 && TAILQ_EMPTY(&lun->ooa_queue)) {
 		/* The LUN lock is dropped before ctl_lock is taken. */
 		mtx_unlock(&lun->lun_lock);
 		mtx_lock(&ctl_softc->ctl_lock);
 		ctl_free_lun(lun);
 		mtx_unlock(&ctl_softc->ctl_lock);
 	} else
 		mtx_unlock(&lun->lun_lock);
 
 	/*
 	 * If this command has been aborted, make sure we set the status
 	 * properly.  The FETD is responsible for freeing the I/O and doing
 	 * whatever it needs to do to clean up its state.
 	 */
 	if (io->io_hdr.flags & CTL_FLAG_ABORT)
 		ctl_set_task_aborted(&io->scsiio);
 
 	/*
 	 * We print out status for every task management command.  For SCSI
 	 * commands, we filter out any unit attention errors; they happen
 	 * on every boot, and would clutter up the log.  Note:  task
 	 * management commands aren't printed here, they are printed above,
 	 * since they should never even make it down here.
 	 */
 	switch (io->io_hdr.io_type) {
 	case CTL_IO_SCSI: {
 		int error_code, sense_key, asc, ascq;
 
 		sense_key = 0;
 
 		if (((io->io_hdr.status & CTL_STATUS_MASK) == CTL_SCSI_ERROR)
 		 && (io->scsiio.scsi_status == SCSI_STATUS_CHECK_COND)) {
 			/*
 			 * Since this is just for printing, no need to
 			 * show errors here.
 			 */
 			scsi_extract_sense_len(&io->scsiio.sense_data,
 					       io->scsiio.sense_len,
 					       &error_code,
 					       &sense_key,
 					       &asc,
 					       &ascq,
 					       /*show_errors*/ 0);
 		}
 
 		/*
 		 * Print anything that failed, except a check condition
 		 * whose sense key is unit attention.
 		 */
 		if (((io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS)
 		 && (((io->io_hdr.status & CTL_STATUS_MASK) != CTL_SCSI_ERROR)
 		  || (io->scsiio.scsi_status != SCSI_STATUS_CHECK_COND)
 		  || (sense_key != SSD_KEY_UNIT_ATTENTION))) {
 
 			/*
 			 * Rate limit: only print once time_uptime has moved
 			 * past the last print; count the rest as skipped.
 			 */
 			if ((time_uptime - ctl_softc->last_print_jiffies) <= 0){
 				ctl_softc->skipped_prints++;
 			} else {
 				uint32_t skipped_prints;
 
 				skipped_prints = ctl_softc->skipped_prints;
 
 				ctl_softc->skipped_prints = 0;
 				ctl_softc->last_print_jiffies = time_uptime;
 
 				if (skipped_prints > 0) {
 #ifdef NEEDTOPORT
 					csevent_log(CSC_CTL | CSC_SHELF_SW |
 					    CTL_ERROR_REPORT,
 					    csevent_LogType_Trace,
 					    csevent_Severity_Information,
 					    csevent_AlertLevel_Green,
 					    csevent_FRU_Firmware,
 					    csevent_FRU_Unknown,
 					    "High CTL error volume, %d prints "
 					    "skipped", skipped_prints);
 #endif
 				}
 				if (bootverbose || verbose > 0)
 					ctl_io_error_print(io, NULL);
 			}
 		}
 		break;
 	}
 	case CTL_IO_TASK:
 		if (bootverbose || verbose > 0)
 			ctl_io_error_print(io, NULL);
 		break;
 	default:
 		break;
 	}
 
 	/*
 	 * Tell the FETD or the other shelf controller we're done with this
 	 * command.  Note that only SCSI commands get to this point.  Task
 	 * management commands are completed above.
 	 *
 	 * We only send status to the other controller if we're in XFER
 	 * mode.  In SER_ONLY mode, the I/O is done on the controller that
 	 * received the I/O (from CTL's perspective), and so the status is
 	 * generated there.
 	 * 
 	 * XXX KDM if we hold the lock here, we could cause a deadlock
 	 * if the frontend comes back in in this context to queue
 	 * something.
 	 */
 	if ((ctl_softc->ha_mode == CTL_HA_MODE_XFER)
 	 && (io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC)) {
 		union ctl_ha_msg msg;
 
 		/* Build the finish message from the I/O's final state. */
 		memset(&msg, 0, sizeof(msg));
 		msg.hdr.msg_type = CTL_MSG_FINISH_IO;
 		msg.hdr.original_sc = io->io_hdr.original_sc;
 		msg.hdr.nexus = io->io_hdr.nexus;
 		msg.hdr.status = io->io_hdr.status;
 		msg.scsi.scsi_status = io->scsiio.scsi_status;
 		msg.scsi.tag_num = io->scsiio.tag_num;
 		msg.scsi.tag_type = io->scsiio.tag_type;
 		msg.scsi.sense_len = io->scsiio.sense_len;
 		msg.scsi.sense_residual = io->scsiio.sense_residual;
 		msg.scsi.residual = io->scsiio.residual;
 		memcpy(&msg.scsi.sense_data, &io->scsiio.sense_data,
 		       sizeof(io->scsiio.sense_data));
 		/*
 		 * We copy this whether or not this is an I/O-related
 		 * command.  Otherwise, we'd have to go and check to see
 		 * whether it's a read/write command, and it really isn't
 		 * worth it.
 		 */
 		memcpy(&msg.scsi.lbalen,
 		       &io->io_hdr.ctl_private[CTL_PRIV_LBA_LEN].bytes,
 		       sizeof(msg.scsi.lbalen));
 
 		if (ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg,
 				sizeof(msg), 0) > CTL_HA_STATUS_SUCCESS) {
 			/* XXX do something here */
 		}
 
 		/* The local copy of the I/O is released on this side. */
 		ctl_free_io(io);
 	} else 
 		fe_done(io);
 
 bailout:
 
 	return (CTL_RETVAL_COMPLETE);
 }
 
 #ifdef CTL_WITH_CA
 /*
  * Called by a front end that does not do autosense.  The sense data
  * carried by the I/O is stored as pending sense for the initiator so it
  * can be returned when the initiator's REQUEST SENSE arrives.
  *
  * The sense data is dropped when the LUN does not exist or when a CA is
  * already recorded for this initiator.  The I/O is always freed and
  * CTL_RETVAL_COMPLETE is always returned.
  */
 int
 ctl_queue_sense(union ctl_io *io)
 {
 	struct ctl_softc *softc;
 	struct ctl_lun *lun;
 	uint32_t lun_id, init_idx;
 
 	softc = control_softc;
 
 	CTL_DEBUG_PRINT(("ctl_queue_sense\n"));
 
 	/*
 	 * The LUN lookup will likely move to ctl_work_thread() once the
 	 * new queueing infrastructure (no initial per-LUN queue) exists,
 	 * so that things like an INQUIRY to a LUN that isn't enabled can
 	 * be handled.  That is not possible right now.
 	 */
 	mtx_lock(&softc->ctl_lock);
 
 	/* No LUN for this nexus: just toss the sense information. */
 	lun_id = ctl_map_lun(io->io_hdr.nexus.targ_port,
 	    io->io_hdr.nexus.targ_lun);
 	if ((lun_id >= CTL_MAX_LUNS) || (softc->ctl_luns[lun_id] == NULL))
 		goto bailout;
 	lun = softc->ctl_luns[lun_id];
 
 	init_idx = ctl_get_initindex(&io->io_hdr.nexus);
 
 	mtx_lock(&lun->lun_lock);
 	if (ctl_is_set(lun->have_ca, init_idx)) {
 		/* A CA is already set for this LUN; toss the sense data. */
 		mtx_unlock(&lun->lun_lock);
 		goto bailout;
 	}
 	memcpy(&lun->pending_sense[init_idx], &io->scsiio.sense_data,
 	    ctl_min(sizeof(lun->pending_sense[init_idx]),
 	    sizeof(io->scsiio.sense_data)));
 	ctl_set_mask(lun->have_ca, init_idx);
 	mtx_unlock(&lun->lun_lock);
 
 bailout:
 	mtx_unlock(&softc->ctl_lock);
 
 	ctl_free_io(io);
 
 	return (CTL_RETVAL_COMPLETE);
 }
 #endif
 
 /*
  * Main entry point for commands coming from frontend ports; every SCSI
  * and task I/O request has to pass through here.
  *
  * Stamps the start time (CTL_TIME_IO only), maps the frontend-specific
  * LUN ID to the global one and queues the I/O for a worker thread.
  *
  * Returns CTL_RETVAL_COMPLETE once queued, or EINVAL for an I/O type
  * other than CTL_IO_SCSI or CTL_IO_TASK.
  */
 int
 ctl_queue(union ctl_io *io)
 {
 
 	CTL_DEBUG_PRINT(("ctl_queue cdb[0]=%02X\n", io->scsiio.cdb[0]));
 
 #ifdef CTL_TIME_IO
 	io->io_hdr.start_time = time_uptime;
 	getbintime(&io->io_hdr.start_bt);
 #endif /* CTL_TIME_IO */
 
 	/* Translate the FE-specific LUN ID into the global one. */
 	io->io_hdr.nexus.targ_mapped_lun = ctl_map_lun(
 	    io->io_hdr.nexus.targ_port, io->io_hdr.nexus.targ_lun);
 
 	if ((io->io_hdr.io_type != CTL_IO_SCSI) &&
 	    (io->io_hdr.io_type != CTL_IO_TASK)) {
 		printf("ctl_queue: unknown I/O type %d\n", io->io_hdr.io_type);
 		return (EINVAL);
 	}
 
 	ctl_enqueue_incoming(io);
 
 	return (CTL_RETVAL_COMPLETE);
 }
 
 #ifdef CTL_IO_DELAY
 /*
  * Callout handler for a delayed completion: arg is the I/O whose
  * completion was postponed; pass it to ctl_done() again.
  */
 static void
 ctl_done_timer_wakeup(void *arg)
 {
 
 	ctl_done((union ctl_io *)arg);
 }
 #endif /* CTL_IO_DELAY */
 
 /*
  * Mark an I/O as finished and queue it for completion processing.
  *
  * Internal copies (CTL_FLAG_INT_COPY) are ignored.  If the I/O was sent
  * to the other controller for serialization and is not a task management
  * command, a CTL_MSG_FINISH_IO message is sent to that controller first.
  * With CTL_IO_DELAY, a LUN with a configured done delay postpones the
  * completion through a callout that re-enters this function.  Otherwise
  * the I/O is placed on a worker thread's done queue.
  */
 void
 ctl_done(union ctl_io *io)
 {
 	struct ctl_softc *ctl_softc;
 
 	ctl_softc = control_softc;
 
 	/*
 	 * Enable this to catch duplicate completion issues.
 	 */
 #if 0
 	if (io->io_hdr.flags & CTL_FLAG_ALREADY_DONE) {
 		printf("%s: type %d msg %d cdb %x iptl: "
 		       "%d:%d:%d:%d tag 0x%04x "
 		       "flag %#x status %x\n",
 			__func__,
 			io->io_hdr.io_type,
 			io->io_hdr.msg_type,
 			io->scsiio.cdb[0],
 			io->io_hdr.nexus.initid.id,
 			io->io_hdr.nexus.targ_port,
 			io->io_hdr.nexus.targ_target.id,
 			io->io_hdr.nexus.targ_lun,
 			(io->io_hdr.io_type ==
 			CTL_IO_TASK) ?
 			io->taskio.tag_num :
 			io->scsiio.tag_num,
 		        io->io_hdr.flags,
 			io->io_hdr.status);
 	} else
 		io->io_hdr.flags |= CTL_FLAG_ALREADY_DONE;
 #endif
 
 	/*
 	 * This is an internal copy of an I/O, and should not go through
 	 * the normal done processing logic.
 	 */
 	if (io->io_hdr.flags & CTL_FLAG_INT_COPY)
 		return;
 
 	/*
 	 * We need to send a msg to the serializing shelf to finish the IO
 	 * as well.  We don't send a finish message to the other shelf if
 	 * this is a task management command.  Task management commands
 	 * aren't serialized in the OOA queue, but rather just executed on
 	 * both shelf controllers for commands that originated on that
 	 * controller.
 	 */
 	if ((io->io_hdr.flags & CTL_FLAG_SENT_2OTHER_SC)
 	 && (io->io_hdr.io_type != CTL_IO_TASK)) {
 		union ctl_ha_msg msg_io;
 
 		/*
 		 * Zero the whole message first: the full union is sent,
 		 * and only two header fields are filled in below.
 		 */
 		memset(&msg_io, 0, sizeof(msg_io));
 		msg_io.hdr.msg_type = CTL_MSG_FINISH_IO;
 		msg_io.hdr.serializing_sc = io->io_hdr.serializing_sc;
 		if (ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_io,
 		    sizeof(msg_io), 0 ) != CTL_HA_STATUS_SUCCESS) {
 			/* A failed send is ignored (best effort). */
 		}
 		/* continue on to finish IO */
 	}
 #ifdef CTL_IO_DELAY
 	if (io->io_hdr.flags & CTL_FLAG_DELAY_DONE) {
 		struct ctl_lun *lun;
 
 		lun =(struct ctl_lun *)io->io_hdr.ctl_private[CTL_PRIV_LUN].ptr;
 
 		/* Second pass, after the delay: just clear the marker. */
 		io->io_hdr.flags &= ~CTL_FLAG_DELAY_DONE;
 	} else {
 		struct ctl_lun *lun;
 
 		lun =(struct ctl_lun *)io->io_hdr.ctl_private[CTL_PRIV_LUN].ptr;
 
 		if ((lun != NULL)
 		 && (lun->delay_info.done_delay > 0)) {
 			struct callout *callout;
 
 			/* The callout lives in the I/O's timer_bytes area. */
 			callout = (struct callout *)&io->io_hdr.timer_bytes;
 			callout_init(callout, /*mpsafe*/ 1);
 			io->io_hdr.flags |= CTL_FLAG_DELAY_DONE;
 			callout_reset(callout,
 				      lun->delay_info.done_delay * hz,
 				      ctl_done_timer_wakeup, io);
 			/* A one-shot delay is cleared once it has been armed. */
 			if (lun->delay_info.done_type == CTL_DELAY_TYPE_ONESHOT)
 				lun->delay_info.done_delay = 0;
 			return;
 		}
 	}
 #endif /* CTL_IO_DELAY */
 
 	ctl_enqueue_done(io);
 }
 
 /*
  * Hand a SCSI I/O to the data_submit method of the backend that owns the
  * I/O's LUN and return that method's result.
  */
 int
 ctl_isc(struct ctl_scsiio *ctsio)
 {
 	struct ctl_lun *lun;
 
 	lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr;
 
 	CTL_DEBUG_PRINT(("ctl_isc: command: %02x\n", ctsio->cdb[0]));
 	CTL_DEBUG_PRINT(("ctl_isc: calling data_submit()\n"));
 
 	return (lun->backend->data_submit((union ctl_io *)ctsio));
 }
 
 
 /*
  * Main loop of a CTL worker thread; arg is the thread's struct ctl_thread.
  *
  * On every pass the queues are checked in this order and the first I/O
  * found is processed with the queue lock dropped:
  *  - ISC queue
  *  - done queue (frees resources, unblocks other commands)
  *  - incoming queue
  *  - RtR queue (skipped while ctl_pause_rtr is set)
  *
  * When nothing is found the thread sleeps on its own structure until one
  * of the enqueue routines wakes it up.
  */
 static void
 ctl_work_thread(void *arg)
 {
 	struct ctl_thread *thr;
 	struct ctl_softc *softc;
 	union ctl_io *io;
 
 	thr = (struct ctl_thread *)arg;
 	softc = thr->ctl_softc;
 
 	CTL_DEBUG_PRINT(("ctl_work_thread starting\n"));
 
 	for (;;) {
 		mtx_lock(&thr->queue_lock);
 
 		if ((io = (union ctl_io *)STAILQ_FIRST(&thr->isc_queue)) !=
 		    NULL) {
 			STAILQ_REMOVE_HEAD(&thr->isc_queue, links);
 			mtx_unlock(&thr->queue_lock);
 			ctl_handle_isc(io);
 			continue;
 		}
 
 		if ((io = (union ctl_io *)STAILQ_FIRST(&thr->done_queue)) !=
 		    NULL) {
 			STAILQ_REMOVE_HEAD(&thr->done_queue, links);
 			mtx_unlock(&thr->queue_lock);
 			/* Clears any blocked commands and calls fe_done. */
 			(void)ctl_process_done(io);
 			continue;
 		}
 
 		if ((io = (union ctl_io *)STAILQ_FIRST(&thr->incoming_queue)) !=
 		    NULL) {
 			STAILQ_REMOVE_HEAD(&thr->incoming_queue, links);
 			mtx_unlock(&thr->queue_lock);
 			if (io->io_hdr.io_type != CTL_IO_TASK)
 				ctl_scsiio_precheck(softc, &io->scsiio);
 			else
 				ctl_run_task(io);
 			continue;
 		}
 
 		if (!ctl_pause_rtr &&
 		    (io = (union ctl_io *)STAILQ_FIRST(&thr->rtr_queue)) !=
 		    NULL) {
 			STAILQ_REMOVE_HEAD(&thr->rtr_queue, links);
 			mtx_unlock(&thr->queue_lock);
 			if (ctl_scsiio(&io->scsiio) != CTL_RETVAL_COMPLETE)
 				CTL_DEBUG_PRINT(("ctl_scsiio failed\n"));
 			continue;
 		}
 
 		/* Nothing to do: sleep; PDROP releases the queue lock. */
 		mtx_sleep(thr, &thr->queue_lock, PDROP | PRIBIO, "-", 0);
 	}
 }
 
 /*
  * Thread that creates LUNs queued on softc->pending_lun_queue; arg is the
  * CTL softc.  Each pending backend LUN is removed from the queue under
  * ctl_lock and passed to ctl_create_lun() with the lock released.  The
  * thread sleeps on the queue while it is empty.
  */
 static void
 ctl_lun_thread(void *arg)
 {
 	struct ctl_softc *softc;
 	struct ctl_be_lun *pending;
 
 	softc = (struct ctl_softc *)arg;
 
 	CTL_DEBUG_PRINT(("ctl_lun_thread starting\n"));
 
 	for (;;) {
 		mtx_lock(&softc->ctl_lock);
 		pending = STAILQ_FIRST(&softc->pending_lun_queue);
 		if (pending == NULL) {
 			/* Nothing queued: sleep; PDROP releases ctl_lock. */
 			mtx_sleep(&softc->pending_lun_queue, &softc->ctl_lock,
 			    PDROP | PRIBIO, "-", 0);
 			continue;
 		}
 
 		STAILQ_REMOVE_HEAD(&softc->pending_lun_queue, links);
 		mtx_unlock(&softc->ctl_lock);
 		ctl_create_lun(pending);
 	}
 }
 
 static void
 ctl_enqueue_incoming(union ctl_io *io)
 {
 	struct ctl_softc *softc = control_softc;
 	struct ctl_thread *thr;
 	u_int idx;
 
 	idx = (io->io_hdr.nexus.targ_port * 127 +
 	       io->io_hdr.nexus.initid.id) % worker_threads;
 	thr = &softc->threads[idx];
 	mtx_lock(&thr->queue_lock);
 	STAILQ_INSERT_TAIL(&thr->incoming_queue, &io->io_hdr, links);
 	mtx_unlock(&thr->queue_lock);
 	wakeup(thr);
 }
 
 static void
 ctl_enqueue_rtr(union ctl_io *io)
 {
 	struct ctl_softc *softc = control_softc;
 	struct ctl_thread *thr;
 
 	thr = &softc->threads[io->io_hdr.nexus.targ_mapped_lun % worker_threads];
 	mtx_lock(&thr->queue_lock);
 	STAILQ_INSERT_TAIL(&thr->rtr_queue, &io->io_hdr, links);
 	mtx_unlock(&thr->queue_lock);
 	wakeup(thr);
 }
 
 static void
 ctl_enqueue_done(union ctl_io *io)
 {
 	struct ctl_softc *softc = control_softc;
 	struct ctl_thread *thr;
 
 	thr = &softc->threads[io->io_hdr.nexus.targ_mapped_lun % worker_threads];
 	mtx_lock(&thr->queue_lock);
 	STAILQ_INSERT_TAIL(&thr->done_queue, &io->io_hdr, links);
 	mtx_unlock(&thr->queue_lock);
 	wakeup(thr);
 }
 
 static void
 ctl_enqueue_isc(union ctl_io *io)
 {
 	struct ctl_softc *softc = control_softc;
 	struct ctl_thread *thr;
 
 	thr = &softc->threads[io->io_hdr.nexus.targ_mapped_lun % worker_threads];
 	mtx_lock(&thr->queue_lock);
 	STAILQ_INSERT_TAIL(&thr->isc_queue, &io->io_hdr, links);
 	mtx_unlock(&thr->queue_lock);
 	wakeup(thr);
 }
 
 /* Initialization and failover */
 
 /*
  * Does nothing except print a console message noting that it was called.
  */
 void
 ctl_init_isc_msg(void)
 {
 	printf("CTL: Still calling this thing\n");
 }
 
 /*
  * Init component
  *	Initializes the component into the configuration defined by bootMode
  *	(see hasc-sv.c).  Nothing has to be set up here, so the component
  *	status is simply set to OK.
  *	Returns ctl_ha_comp_status:
  *		OK
  *		ERROR - fatal error (never returned by this implementation)
  */
 static ctl_ha_comp_status
 ctl_isc_init(struct ctl_ha_component *c)
 {
 
 	c->status = CTL_HA_COMP_STATUS_OK;
 	return (CTL_HA_COMP_STATUS_OK);
 }
 
 /*
  * Start component
  *	Starts the component in the requested state.  If the component
  *	starts successfully, it must set its own state to the requested
  *	state.  When the requested state is HASC_STATE_HA, the component may
  *	refine it by adding _SLAVE or _MASTER flags.
  *	Currently allowed state transitions are:
  *	UNKNOWN->HA	- initial startup
  *	UNKNOWN->SINGLE	- initial startup when no partner detected
  *	HA->SINGLE	- failover
  * Returns ctl_ha_comp_status:
  *		OK	- component successfully started in requested state
  *		FAILED	- could not start the requested state, failover may
  *			  be possible
  *		ERROR	- fatal error detected, no future startup possible
  *
  * The component's state is set to the requested state and its status to
  * the returned value in every case, including errors.
  */
 static ctl_ha_comp_status
 ctl_isc_start(struct ctl_ha_component *c, ctl_ha_state state)
 {
 	ctl_ha_comp_status status;
 
 	status = CTL_HA_COMP_STATUS_OK;
 
 	printf("%s: go\n", __func__);
 
 	if (c->state == CTL_HA_STATE_UNKNOWN ) {
 		/* Bootstrap: UNKNOWN->HA or UNKNOWN->SINGLE. */
 		ctl_is_single = 0;
 		if (ctl_ha_msg_create(CTL_HA_CHAN_CTL, ctl_isc_event_handler)
 		    != CTL_HA_STATUS_SUCCESS) {
 			printf("ctl_isc_start: ctl_ha_msg_create failed.\n");
 			status = CTL_HA_COMP_STATUS_ERROR;
 		}
 	} else if (CTL_HA_STATE_IS_HA(c->state)
 		&& CTL_HA_STATE_IS_SINGLE(state)){
 		/* HA->SINGLE transition, i.e. a failover. */
 		ctl_failover();
 		ctl_is_single = 1;
 	} else {
 		printf("ctl_isc_start:Invalid state transition %X->%X\n",
 		       c->state, state);
 		status = CTL_HA_COMP_STATUS_ERROR;
 	}
 
 	/* Any request for a SINGLE state marks this controller as single. */
 	if (CTL_HA_STATE_IS_SINGLE(state))
 		ctl_is_single = 1;
 
 	c->state = state;
 	c->status = status;
 
 	return (status);
 }
 
 /*
  * Quiesce component
  * The component must clear any error conditions (set status to OK) and
  * prepare itself for another Start call.  Here that means setting
  * ctl_pause_rtr, which stops the worker threads from servicing their RtR
  * queues.
  * Returns ctl_ha_comp_status:
  *	OK
  *	ERROR (never returned by this implementation)
  */
 static ctl_ha_comp_status
 ctl_isc_quiesce(struct ctl_ha_component *c)
 {
 
 	ctl_pause_rtr = 1;
 	c->status = CTL_HA_COMP_STATUS_OK;
 
 	return (CTL_HA_COMP_STATUS_OK);
 }
 
 /*
  * HA component descriptor for CTL's ISC handling.  It starts out in the
  * UNKNOWN state and wires up the init, start and quiesce hooks defined
  * above.
  */
 struct ctl_ha_component ctl_ha_component_ctlisc =
 {
 	.name = "CTL ISC",
 	.state = CTL_HA_STATE_UNKNOWN,
 	.init = ctl_isc_init,
 	.start = ctl_isc_start,
 	.quiesce = ctl_isc_quiesce
 };
 
 /*
  *  vim: ts=8
  */
Index: user/ae/inet6/sys/cam/scsi/scsi_all.h
===================================================================
--- user/ae/inet6/sys/cam/scsi/scsi_all.h	(revision 271452)
+++ user/ae/inet6/sys/cam/scsi/scsi_all.h	(revision 271453)
@@ -1,3672 +1,3721 @@
 /*-
  * Largely written by Julian Elischer (julian@tfs.com)
  * for TRW Financial Systems.
  *
  * TRW Financial Systems, in accordance with their agreement with Carnegie
  * Mellon University, makes this software available to CMU to distribute
  * or use in any manner that they see fit as long as this message is kept with
  * the software. For this reason TFS also grants any other persons or
  * organisations permission to use or modify this software.
  *
  * TFS supplies this software to be publicly redistributed
  * on the understanding that TFS is not responsible for the correct
  * functioning of this software in any circumstances.
  *
  * Ported to run under 386BSD by Julian Elischer (julian@tfs.com) Sept 1992
  *
  * $FreeBSD$
  */
 
 /*
  * SCSI general  interface description
  */
 
 #ifndef	_SCSI_SCSI_ALL_H
 #define	_SCSI_SCSI_ALL_H 1
 
 #include <sys/cdefs.h>
 #include <machine/stdarg.h>
 
 #ifdef _KERNEL
 /*
  * This is the number of seconds we wait for devices to settle after a SCSI
  * bus reset.
  */
 extern int scsi_delay;
 #endif /* _KERNEL */
 
 /*
  * SCSI command format
  */
 
 /*
  * Define some bits that are in ALL (or a lot of) scsi commands
  */
 #define	SCSI_CTL_LINK		0x01
 #define	SCSI_CTL_FLAG		0x02
 #define	SCSI_CTL_VENDOR		0xC0
 #define	SCSI_CMD_LUN		0xA0	/* these two should not be needed */
 #define	SCSI_CMD_LUN_SHIFT	5	/* LUN in the cmd is no longer SCSI */
 
 #define	SCSI_MAX_CDBLEN		16	/* 
 					 * 16 byte commands are in the 
 					 * SCSI-3 spec 
 					 */
 #if defined(CAM_MAX_CDBLEN) && (CAM_MAX_CDBLEN < SCSI_MAX_CDBLEN)
 #error "CAM_MAX_CDBLEN cannot be less than SCSI_MAX_CDBLEN"
 #endif
 
 /*
  * 6byte CDBs special case 0 length to be 256.
  * The argument is parenthesized at every use so any expression can be
  * passed; note that it is evaluated twice when it is non-zero.
  */
 #define	SCSI_CDB6_LEN(len)	((len) == 0 ? 256 : (len))
 
 /*
  * This type defines actions to be taken when a particular sense code is
  * received.  The action occupies bits 16-23 (SS_MASK), the qualifier flags
  * defined below occupy bits 8-15 (SSQ_MASK) and the error status value
  * occupies bits 0-7 (SS_ERRMASK), so a combined value uses 24 bits.
  */
 typedef enum {
 	SS_NOP      = 0x000000,	/* Do nothing */
 	SS_RETRY    = 0x010000,	/* Retry the command */
 	SS_FAIL     = 0x020000,	/* Bail out */
 	SS_START    = 0x030000,	/* Send a Start Unit command to the device,
 				 * then retry the original command.
 				 */
 	SS_TUR      = 0x040000,	/* Send a Test Unit Ready command to the
 				 * device, then retry the original command.
 				 */
 	SS_MASK     = 0xff0000
 } scsi_sense_action;
 
 /*
  * Qualifier flags that are OR'ed together with a scsi_sense_action.
  */
 typedef enum {
 	SSQ_NONE		= 0x0000,
 	SSQ_DECREMENT_COUNT	= 0x0100,  /* Decrement the retry count */
 	SSQ_MANY		= 0x0200,  /* send lots of recovery commands */
 	SSQ_RANGE		= 0x0400,  /*
 					    * This table entry represents the
 					    * end of a range of ASCQs that
 					    * have identical error actions
 					    * and text.
 					    */
 	SSQ_PRINT_SENSE		= 0x0800,
 	SSQ_UA			= 0x1000,  /* Broadcast UA. */
 	SSQ_RESCAN		= 0x2000,  /* Rescan target for LUNs. */
 	SSQ_LOST		= 0x4000,  /* Destroy the LUNs. */
 	SSQ_MASK		= 0xff00
 } scsi_sense_action_qualifier;
 
 /* Mask for error status values */
 #define	SS_ERRMASK	0xff
 
 /*
  * The composite actions below are fully parenthesized so that they keep
  * their value when combined with other operators, e.g. (SS_RDEF & SS_MASK).
  */
 
 /* The default, retryable, error action */
 #define	SS_RDEF		(SS_RETRY|SSQ_DECREMENT_COUNT|SSQ_PRINT_SENSE|EIO)
 
 /* The retryable, error action, with table specified error code */
 #define	SS_RET		(SS_RETRY|SSQ_DECREMENT_COUNT|SSQ_PRINT_SENSE)
 
 /* Fatal error action, with table specified error code */
 #define	SS_FATAL	(SS_FAIL|SSQ_PRINT_SENSE)
 
 struct scsi_generic
 {
 	u_int8_t opcode;
 	u_int8_t bytes[11];
 };
 
 struct scsi_request_sense
 {
 	u_int8_t opcode;
 	u_int8_t byte2;
 #define	SRS_DESC	0x01
 	u_int8_t unused[2];
 	u_int8_t length;
 	u_int8_t control;
 };
 
 struct scsi_test_unit_ready
 {
 	u_int8_t opcode;
 	u_int8_t byte2;
 	u_int8_t unused[3];
 	u_int8_t control;
 };
 
 struct scsi_receive_diag {
 	uint8_t opcode;
 	uint8_t byte2;
 #define SRD_PCV		0x01
 	uint8_t page_code;
 	uint8_t length[2]; 
 	uint8_t control;
 };
 
 struct scsi_send_diag {
 	uint8_t opcode;
 	uint8_t byte2;
 #define SSD_UNITOFFL				0x01
 #define SSD_DEVOFFL				0x02
 #define SSD_SELFTEST				0x04
 #define SSD_PF					0x10
 #define SSD_SELF_TEST_CODE_MASK			0xE0
 #define SSD_SELF_TEST_CODE_SHIFT		5
 #define		SSD_SELF_TEST_CODE_NONE		0x00
 #define		SSD_SELF_TEST_CODE_BG_SHORT	0x01
 #define		SSD_SELF_TEST_CODE_BG_EXTENDED	0x02
 #define		SSD_SELF_TEST_CODE_BG_ABORT	0x04
 #define		SSD_SELF_TEST_CODE_FG_SHORT	0x05
 #define		SSD_SELF_TEST_CODE_FG_EXTENDED	0x06
 	uint8_t	reserved;
 	uint8_t	length[2];
 	uint8_t control;
 };
 
 struct scsi_sense
 {
 	u_int8_t opcode;
 	u_int8_t byte2;
 	u_int8_t unused[2];
 	u_int8_t length;
 	u_int8_t control;
 };
 
 struct scsi_inquiry
 {
 	u_int8_t opcode;
 	u_int8_t byte2;
 #define	SI_EVPD 	0x01
 #define	SI_CMDDT	0x02
 	u_int8_t page_code;
 	u_int8_t length[2];
 	u_int8_t control;
 };
 
 struct scsi_mode_sense_6
 {
 	u_int8_t opcode;
 	u_int8_t byte2;
 #define	SMS_DBD				0x08
 	u_int8_t page;
 #define	SMS_PAGE_CODE 			0x3F
 #define	SMS_VENDOR_SPECIFIC_PAGE	0x00
 #define	SMS_DISCONNECT_RECONNECT_PAGE	0x02
 #define	SMS_FORMAT_DEVICE_PAGE		0x03
 #define	SMS_GEOMETRY_PAGE		0x04
 #define	SMS_CACHE_PAGE			0x08
 #define	SMS_PERIPHERAL_DEVICE_PAGE	0x09
 #define	SMS_CONTROL_MODE_PAGE		0x0A
 #define	SMS_PROTO_SPECIFIC_PAGE		0x19
 #define	SMS_INFO_EXCEPTIONS_PAGE	0x1C
 #define	SMS_ALL_PAGES_PAGE		0x3F
 #define	SMS_PAGE_CTRL_MASK		0xC0
 #define	SMS_PAGE_CTRL_CURRENT 		0x00
 #define	SMS_PAGE_CTRL_CHANGEABLE 	0x40
 #define	SMS_PAGE_CTRL_DEFAULT 		0x80
 #define	SMS_PAGE_CTRL_SAVED 		0xC0
 	u_int8_t subpage;
 #define	SMS_SUBPAGE_PAGE_0		0x00
 #define	SMS_SUBPAGE_ALL			0xff
 	u_int8_t length;
 	u_int8_t control;
 };
 
 struct scsi_mode_sense_10
 {
 	u_int8_t opcode;
 	u_int8_t byte2;		/* same bits as small version */
 #define	SMS10_LLBAA			0x10
 	u_int8_t page; 		/* same bits as small version */
 	u_int8_t subpage;
 	u_int8_t unused[3];
 	u_int8_t length[2];
 	u_int8_t control;
 };
 
 struct scsi_mode_select_6
 {
 	u_int8_t opcode;
 	u_int8_t byte2;
 #define	SMS_SP	0x01
 #define	SMS_PF	0x10
 	u_int8_t unused[2];
 	u_int8_t length;
 	u_int8_t control;
 };
 
 struct scsi_mode_select_10
 {
 	u_int8_t opcode;
 	u_int8_t byte2;		/* same bits as small version */
 	u_int8_t unused[5];
 	u_int8_t length[2];
 	u_int8_t control;
 };
 
 /*
  * When sending a mode select to a tape drive, the medium type must be 0.
  */
 struct scsi_mode_hdr_6
 {
 	u_int8_t datalen;
 	u_int8_t medium_type;
 	u_int8_t dev_specific;
 	u_int8_t block_descr_len;
 };
 
 struct scsi_mode_hdr_10
 {
 	u_int8_t datalen[2];
 	u_int8_t medium_type;
 	u_int8_t dev_specific;
 	u_int8_t reserved[2];
 	u_int8_t block_descr_len[2];
 };
 
 struct scsi_mode_block_descr
 {
 	u_int8_t density_code;
 	u_int8_t num_blocks[3];
 	u_int8_t reserved;
 	u_int8_t block_len[3];
 };
 
 struct scsi_per_res_in
 {
 	u_int8_t opcode;
 	u_int8_t action;
 #define	SPRI_RK	0x00
 #define	SPRI_RR	0x01
 #define	SPRI_RC	0x02
 #define	SPRI_RS	0x03
 	u_int8_t reserved[5];
 	u_int8_t length[2];
 #define	SPRI_MAX_LEN		0xffff
 	u_int8_t control;
 };
 
 struct scsi_per_res_in_header
 {
 	u_int8_t generation[4];
 	u_int8_t length[4];
 };
 
 struct scsi_per_res_key
 {
 	u_int8_t key[8];
 };
 
 struct scsi_per_res_in_keys
 {
 	struct scsi_per_res_in_header header;
 	struct scsi_per_res_key keys[0];
 };
 
 struct scsi_per_res_cap
 {
 	uint8_t length[2];
 	uint8_t flags1;
 #define	SPRI_RLR_C		0x80
 #define	SPRI_CRH		0x10
 #define	SPRI_SIP_C		0x08
 #define	SPRI_ATP_C		0x04
 #define	SPRI_PTPL_C		0x01
 	uint8_t flags2;
 #define	SPRI_TMV		0x80
 #define	SPRI_ALLOW_CMD_MASK	0x70
 #define	SPRI_ALLOW_CMD_SHIFT	4
 #define	SPRI_ALLOW_NA		0x00
 #define	SPRI_ALLOW_1		0x10
 #define	SPRI_ALLOW_2		0x20
 #define	SPRI_ALLOW_3		0x30
 #define	SPRI_ALLOW_4		0x40
 #define	SPRI_PTPL_A		0x01
 	uint8_t type_mask[2];
 #define	SPRI_TM_WR_EX_AR	0x8000
 #define	SPRI_TM_EX_AC_RO	0x4000
 #define	SPRI_TM_WR_EX_RO	0x2000
 #define	SPRI_TM_EX_AC		0x0800
 #define	SPRI_TM_WR_EX		0x0200
 #define	SPRI_TM_EX_AC_AR	0x0001
 	uint8_t reserved[2];
 };
 
 struct scsi_per_res_in_rsrv_data
 {
 	uint8_t reservation[8];
 	uint8_t scope_addr[4];
 	uint8_t reserved;
 	uint8_t scopetype;
 #define	SPRT_WE    0x01
 #define	SPRT_EA    0x03
 #define	SPRT_WERO  0x05
 #define	SPRT_EARO  0x06
 #define	SPRT_WEAR  0x07
 #define	SPRT_EAAR  0x08
 	uint8_t extent_length[2];
 };
 
 struct scsi_per_res_in_rsrv
 {
 	struct scsi_per_res_in_header header;
 	struct scsi_per_res_in_rsrv_data data;
 };
 
 struct scsi_per_res_in_full_desc
 {
 	struct scsi_per_res_key res_key;
 	uint8_t reserved1[4];
 	uint8_t flags;
 #define	SPRI_FULL_ALL_TG_PT	0x02
 #define	SPRI_FULL_R_HOLDER	0x01
 	uint8_t scopetype;
 	uint8_t reserved2[4];
 	uint8_t rel_trgt_port_id[2];
 	uint8_t additional_length[4];
 	uint8_t transport_id[];
 };
 
 struct scsi_per_res_in_full
 {
 	struct scsi_per_res_in_header header;
 	struct scsi_per_res_in_full_desc desc[];
 };
 
 struct scsi_per_res_out
 {
 	u_int8_t opcode;
 	u_int8_t action;
 #define	SPRO_REGISTER		0x00
 #define	SPRO_RESERVE		0x01
 #define	SPRO_RELEASE		0x02
 #define	SPRO_CLEAR		0x03
 #define	SPRO_PREEMPT		0x04
 #define	SPRO_PRE_ABO		0x05
 #define	SPRO_REG_IGNO		0x06
 #define	SPRO_REG_MOVE		0x07
 #define	SPRO_REPL_LOST_RES	0x08
 #define	SPRO_ACTION_MASK	0x1f
 	u_int8_t scope_type;
 #define	SPR_SCOPE_MASK		0xf0
 #define	SPR_SCOPE_SHIFT		4
 #define	SPR_LU_SCOPE		0x00
 #define	SPR_EXTENT_SCOPE	0x10
 #define	SPR_ELEMENT_SCOPE	0x20
 #define	SPR_TYPE_MASK		0x0f
 #define	SPR_TYPE_RD_SHARED	0x00
 #define	SPR_TYPE_WR_EX		0x01
 #define	SPR_TYPE_RD_EX		0x02
 #define	SPR_TYPE_EX_AC		0x03
 #define	SPR_TYPE_SHARED		0x04
 #define	SPR_TYPE_WR_EX_RO	0x05
 #define	SPR_TYPE_EX_AC_RO	0x06
 #define	SPR_TYPE_WR_EX_AR	0x07
 #define	SPR_TYPE_EX_AC_AR	0x08
 	u_int8_t reserved[2];
 	u_int8_t length[4];
 	u_int8_t control;
 };
 
 struct scsi_per_res_out_parms
 {
 	struct scsi_per_res_key res_key;
 	u_int8_t serv_act_res_key[8];
 	u_int8_t scope_spec_address[4];
 	u_int8_t flags;
 #define	SPR_SPEC_I_PT		0x08
 #define	SPR_ALL_TG_PT		0x04
 #define	SPR_APTPL		0x01
 	u_int8_t reserved1;
 	u_int8_t extent_length[2];
 	u_int8_t transport_id_list[];
 };
 
 struct scsi_per_res_out_trans_ids {
 	u_int8_t additional_length[4];
 	u_int8_t transport_ids[];
 };
 
 /*
  * Used with the REGISTER AND MOVE service action of the PERSISTENT RESERVE
  * OUT command.
  */
 struct scsi_per_res_reg_move
 {
 	struct scsi_per_res_key res_key;
 	u_int8_t serv_act_res_key[8];
 	u_int8_t reserved;
 	u_int8_t flags;
 #define	SPR_REG_MOVE_UNREG	0x02
 #define	SPR_REG_MOVE_APTPL	0x01
 	u_int8_t rel_trgt_port_id[2];
 	u_int8_t transport_id_length[4];
 	u_int8_t transport_id[];
 };
 
 struct scsi_transportid_header
 {
 	uint8_t format_protocol;
 #define	SCSI_TRN_FORMAT_MASK		0xc0
 #define	SCSI_TRN_FORMAT_SHIFT		6
 #define	SCSI_TRN_PROTO_MASK		0x0f
 };
 
 struct scsi_transportid_fcp
 {
 	uint8_t format_protocol;
 #define	SCSI_TRN_FCP_FORMAT_DEFAULT	0x00
 	uint8_t reserved1[7];
 	uint8_t n_port_name[8];
 	uint8_t reserved2[8];
 };
 
 struct scsi_transportid_spi
 {
 	uint8_t format_protocol;
 #define	SCSI_TRN_SPI_FORMAT_DEFAULT	0x00
 	uint8_t reserved1;
 	uint8_t scsi_addr[2];
 	uint8_t obsolete[2];
 	uint8_t rel_trgt_port_id[2];
 	uint8_t reserved2[16];
 };
 
 struct scsi_transportid_1394
 {
 	uint8_t format_protocol;
 #define	SCSI_TRN_1394_FORMAT_DEFAULT	0x00
 	uint8_t reserved1[7];
 	uint8_t eui64[8];
 	uint8_t reserved2[8];
 };
 
 struct scsi_transportid_rdma
 {
 	uint8_t format_protocol;
 #define	SCSI_TRN_RDMA_FORMAT_DEFAULT	0x00
 	uint8_t reserved[7];
 #define	SCSI_TRN_RDMA_PORT_LEN		16
 	uint8_t initiator_port_id[SCSI_TRN_RDMA_PORT_LEN];
 };
 
 struct scsi_transportid_iscsi_device
 {
 	uint8_t format_protocol;
 #define	SCSI_TRN_ISCSI_FORMAT_DEVICE	0x00
 	uint8_t reserved;
 	uint8_t additional_length[2];
 	uint8_t iscsi_name[];
 };
 
 struct scsi_transportid_iscsi_port
 {
 	uint8_t format_protocol;
 #define	SCSI_TRN_ISCSI_FORMAT_PORT	0x40
 	uint8_t reserved;
 	uint8_t additional_length[2];
 	uint8_t iscsi_name[];
 	/*
 	 * Followed by a separator and iSCSI initiator session ID
 	 */
 };
 
 struct scsi_transportid_sas
 {
 	uint8_t format_protocol;
 #define	SCSI_TRN_SAS_FORMAT_DEFAULT	0x00
 	uint8_t reserved1[3];
 	uint8_t sas_address[8];
 	uint8_t reserved2[12];
 };
 
 struct scsi_sop_routing_id_norm {
 	uint8_t bus;
 	uint8_t devfunc;
 #define	SCSI_TRN_SOP_BUS_MAX		0xff
 #define	SCSI_TRN_SOP_DEV_MAX		0x1f
 #define	SCSI_TRN_SOP_DEV_MASK		0xf8
 #define	SCSI_TRN_SOP_DEV_SHIFT		3
 #define	SCSI_TRN_SOP_FUNC_NORM_MASK	0x07
 #define	SCSI_TRN_SOP_FUNC_NORM_MAX	0x07
 };
 
 struct scsi_sop_routing_id_alt {
 	uint8_t bus;
 	uint8_t function;
 #define	SCSI_TRN_SOP_FUNC_ALT_MAX	0xff
 };
 
 struct scsi_transportid_sop
 {
 	uint8_t format_protocol;
 #define	SCSI_TRN_SOP_FORMAT_DEFAULT	0x00
 	uint8_t reserved1;
 	uint8_t routing_id[2];
 	uint8_t reserved2[20];
 };
 
 struct scsi_log_sense
 {
 	u_int8_t opcode;
 	u_int8_t byte2;
 #define	SLS_SP				0x01
 #define	SLS_PPC				0x02
 	u_int8_t page;
 #define	SLS_PAGE_CODE 			0x3F
 #define	SLS_ALL_PAGES_PAGE		0x00
 #define	SLS_OVERRUN_PAGE		0x01
 #define	SLS_ERROR_WRITE_PAGE		0x02
 #define	SLS_ERROR_READ_PAGE		0x03
 #define	SLS_ERROR_READREVERSE_PAGE	0x04
 #define	SLS_ERROR_VERIFY_PAGE		0x05
 #define	SLS_ERROR_NONMEDIUM_PAGE	0x06
 #define	SLS_ERROR_LASTN_PAGE		0x07
 #define	SLS_SELF_TEST_PAGE		0x10
 #define	SLS_IE_PAGE			0x2f
 #define	SLS_PAGE_CTRL_MASK		0xC0
 #define	SLS_PAGE_CTRL_THRESHOLD		0x00
 #define	SLS_PAGE_CTRL_CUMULATIVE	0x40
 #define	SLS_PAGE_CTRL_THRESH_DEFAULT	0x80
 #define	SLS_PAGE_CTRL_CUMUL_DEFAULT	0xC0
 	u_int8_t reserved[2];
 	u_int8_t paramptr[2];
 	u_int8_t length[2];
 	u_int8_t control;
 };
 
 struct scsi_log_select
 {
 	u_int8_t opcode;
 	u_int8_t byte2;
 /*	SLS_SP				0x01 */
 #define	SLS_PCR				0x02
 	u_int8_t page;
 /*	SLS_PAGE_CTRL_MASK		0xC0 */
 /*	SLS_PAGE_CTRL_THRESHOLD		0x00 */
 /*	SLS_PAGE_CTRL_CUMULATIVE	0x40 */
 /*	SLS_PAGE_CTRL_THRESH_DEFAULT	0x80 */
 /*	SLS_PAGE_CTRL_CUMUL_DEFAULT	0xC0 */
 	u_int8_t reserved[4];
 	u_int8_t length[2];
 	u_int8_t control;
 };
 
 struct scsi_log_header
 {
 	u_int8_t page;
 	u_int8_t reserved;
 	u_int8_t datalen[2];
 };
 
 struct scsi_log_param_header {
 	u_int8_t param_code[2];
 	u_int8_t param_control;
 #define	SLP_LP				0x01
 #define	SLP_LBIN			0x02
 #define	SLP_TMC_MASK			0x0C
 #define	SLP_TMC_ALWAYS			0x00
 #define	SLP_TMC_EQUAL			0x04
 #define	SLP_TMC_NOTEQUAL		0x08
 #define	SLP_TMC_GREATER			0x0C
 #define	SLP_ETC				0x10
 #define	SLP_TSD				0x20
 #define	SLP_DS				0x40
 #define	SLP_DU				0x80
 	u_int8_t param_len;
 };
 
 struct scsi_control_page {
 	u_int8_t page_code;
 	u_int8_t page_length;
 	u_int8_t rlec;
 #define	SCP_RLEC			0x01	/*Report Log Exception Cond*/
 #define	SCP_GLTSD			0x02	/*Global Logging target
 						  save disable */
 #define	SCP_DSENSE			0x04	/*Descriptor Sense */
 #define	SCP_DPICZ			0x08	/*Disable Prot. Info Check
 						  if Prot. Field is Zero */
 #define	SCP_TMF_ONLY			0x10	/*TM Functions Only*/
 #define	SCP_TST_MASK			0xE0	/*Task Set Type Mask*/
 #define	SCP_TST_ONE			0x00	/*One Task Set*/
 #define	SCP_TST_SEPARATE		0x20	/*Separate Task Sets*/
 	u_int8_t queue_flags;
 #define	SCP_QUEUE_ALG_MASK		0xF0
 #define	SCP_QUEUE_ALG_RESTRICTED	0x00
 #define	SCP_QUEUE_ALG_UNRESTRICTED	0x10
 #define	SCP_NUAR			0x08	/*No UA on release*/
 #define	SCP_QUEUE_ERR			0x02	/*Queued I/O aborted for CACs*/
 #define	SCP_QUEUE_DQUE			0x01	/*Queued I/O disabled*/
 	u_int8_t eca_and_aen;
 #define	SCP_EECA			0x80	/*Enable Extended CA*/
 #define	SCP_RAC				0x40	/*Report a check*/
 #define	SCP_SWP				0x08	/*Software Write Protect*/
 #define	SCP_RAENP			0x04	/*Ready AEN Permission*/
 #define	SCP_UAAENP			0x02	/*UA AEN Permission*/
 #define	SCP_EAENP			0x01	/*Error AEN Permission*/
 	u_int8_t flags4;
 #define	SCP_ATO				0x80	/*Application tag owner*/
 #define	SCP_TAS				0x40	/*Task aborted status*/
 #define	SCP_ATMPE			0x20	/*Application tag mode page*/
 #define	SCP_RWWP			0x10	/*Reject write without prot*/
 	u_int8_t aen_holdoff_period[2];
 	u_int8_t busy_timeout_period[2];
 	u_int8_t extended_selftest_completion_time[2];
 };
 
 struct scsi_cache_page {
 	u_int8_t page_code;
 #define	SCHP_PAGE_SAVABLE		0x80	/* Page is savable */
 	u_int8_t page_length;
 	u_int8_t cache_flags;
 #define	SCHP_FLAGS_WCE			0x04	/* Write Cache Enable */
 #define	SCHP_FLAGS_MF			0x02	/* Multiplication factor */
 #define	SCHP_FLAGS_RCD			0x01	/* Read Cache Disable */
 	u_int8_t rw_cache_policy;
 	u_int8_t dis_prefetch[2];
 	u_int8_t min_prefetch[2];
 	u_int8_t max_prefetch[2];
 	u_int8_t max_prefetch_ceil[2];
 };
 
 /*
  * XXX KDM
  * Updated version of the cache page, as of SBC.  Update this to SBC-3 and
  * rationalize the two.
  */
 struct scsi_caching_page {
 	uint8_t page_code;
 #define	SMS_CACHING_PAGE		0x08
 	uint8_t page_length;
 	uint8_t flags1;
 #define	SCP_IC		0x80
 #define	SCP_ABPF	0x40
 #define	SCP_CAP		0x20
 #define	SCP_DISC	0x10
 #define	SCP_SIZE	0x08
 #define	SCP_WCE		0x04
 #define	SCP_MF		0x02
 #define	SCP_RCD		0x01
 	uint8_t ret_priority;
 	uint8_t disable_pf_transfer_len[2];
 	uint8_t min_prefetch[2];
 	uint8_t max_prefetch[2];
 	uint8_t max_pf_ceiling[2];
 	uint8_t flags2;
 #define	SCP_FSW		0x80
 #define	SCP_LBCSS	0x40
 #define	SCP_DRA		0x20
 #define	SCP_VS1		0x10
 #define	SCP_VS2		0x08
 	uint8_t cache_segments;
 	uint8_t cache_seg_size[2];
 	uint8_t reserved;
 	uint8_t non_cache_seg_size[3];
 };
 
 /*
  * XXX KDM move this off to a vendor shim.
  */
 struct copan_power_subpage {
 	uint8_t page_code;
 #define	PWR_PAGE_CODE		0x00
 	uint8_t subpage;
 #define	PWR_SUBPAGE_CODE	0x02
 	uint8_t page_length[2];
 	uint8_t page_version;
 #define	PWR_VERSION		    0x01
 	uint8_t total_luns;
 	uint8_t max_active_luns;
 #define	PWR_DFLT_MAX_LUNS	    0x07
 	uint8_t reserved[25];
 };
 
 /*
  * XXX KDM move this off to a vendor shim.
  */
 struct copan_aps_subpage {
 	uint8_t page_code;
 #define	APS_PAGE_CODE		0x00
 	uint8_t subpage;
 #define	APS_SUBPAGE_CODE	0x03
 	uint8_t page_length[2];
 	uint8_t page_version;
 #define	APS_VERSION		    0x00
 	uint8_t lock_active;
 #define	APS_LOCK_ACTIVE	    0x01
 #define	APS_LOCK_INACTIVE	0x00
 	uint8_t reserved[26];
 };
 
 /*
  * XXX KDM move this off to a vendor shim.
  */
 struct copan_debugconf_subpage {
 	uint8_t page_code;
 #define DBGCNF_PAGE_CODE		0x00
 	uint8_t subpage;
 #define DBGCNF_SUBPAGE_CODE	0xF0
 	uint8_t page_length[2];
 	uint8_t page_version;
 #define DBGCNF_VERSION			0x00
 	uint8_t ctl_time_io_secs[2];
 };
 
 
 struct scsi_info_exceptions_page {
 	u_int8_t page_code;
 #define	SIEP_PAGE_SAVABLE		0x80	/* Page is savable */
 	u_int8_t page_length;
 	u_int8_t info_flags;
 #define	SIEP_FLAGS_PERF			0x80
 #define	SIEP_FLAGS_EBF			0x20
 #define	SIEP_FLAGS_EWASC		0x10
 #define	SIEP_FLAGS_DEXCPT		0x08
 #define	SIEP_FLAGS_TEST			0x04
 #define	SIEP_FLAGS_EBACKERR		0x02
 #define	SIEP_FLAGS_LOGERR		0x01
 	u_int8_t mrie;
 	u_int8_t interval_timer[4];
 	u_int8_t report_count[4];
 };
 
 /*
  * SCSI protocol identifier values, current as of SPC4r36l.
  * SCSI_PROTO_ISCSI/SCSI_PROTO_iSCSI and SCSI_PROTO_ADT/SCSI_PROTO_ADITP are
  * pairs of names for the same value.
  */
 #define	SCSI_PROTO_FC		0x00	/* Fibre Channel */
 #define	SCSI_PROTO_SPI		0x01	/* Parallel SCSI */
 #define	SCSI_PROTO_SSA		0x02	/* Serial Storage Arch. */
 #define	SCSI_PROTO_1394		0x03	/* IEEE 1394 (Firewire) */
 #define	SCSI_PROTO_RDMA		0x04	/* SCSI RDMA Protocol */
 #define	SCSI_PROTO_ISCSI	0x05	/* Internet SCSI */
 #define	SCSI_PROTO_iSCSI	0x05	/* Internet SCSI */
 #define	SCSI_PROTO_SAS		0x06	/* SAS Serial SCSI Protocol */
 #define	SCSI_PROTO_ADT		0x07	/* Automation/Drive Int. Trans. Prot.*/
 #define	SCSI_PROTO_ADITP	0x07	/* Automation/Drive Int. Trans. Prot.*/
 #define	SCSI_PROTO_ATA		0x08	/* AT Attachment Interface */
 #define	SCSI_PROTO_UAS		0x09	/* USB Attached SCSI */
 #define	SCSI_PROTO_SOP		0x0a	/* SCSI over PCI Express */
 #define	SCSI_PROTO_NONE		0x0f	/* No specific protocol */
 
 struct scsi_proto_specific_page {
 	u_int8_t page_code;
 #define	SPSP_PAGE_SAVABLE		0x80	/* Page is savable */
 	u_int8_t page_length;
 	u_int8_t protocol;
 #define	SPSP_PROTO_FC			SCSI_PROTO_FC
 #define	SPSP_PROTO_SPI			SCSI_PROTO_SPI
 #define	SPSP_PROTO_SSA			SCSI_PROTO_SSA
 #define	SPSP_PROTO_1394			SCSI_PROTO_1394
 #define	SPSP_PROTO_RDMA			SCSI_PROTO_RDMA
 #define	SPSP_PROTO_ISCSI		SCSI_PROTO_ISCSI
 #define	SPSP_PROTO_SAS			SCSI_PROTO_SAS
 #define	SPSP_PROTO_ADT			SCSI_PROTO_ADITP
 #define	SPSP_PROTO_ATA			SCSI_PROTO_ATA
 #define	SPSP_PROTO_UAS			SCSI_PROTO_UAS
 #define	SPSP_PROTO_SOP			SCSI_PROTO_SOP
 #define	SPSP_PROTO_NONE			SCSI_PROTO_NONE
 };
 
 struct scsi_reserve
 {
 	u_int8_t opcode;
 	u_int8_t byte2;
 #define	SR_EXTENT	0x01
 #define	SR_ID_MASK	0x0e
 #define	SR_3RDPTY	0x10
 #define	SR_LUN_MASK	0xe0
 	u_int8_t resv_id;
 	u_int8_t length[2];
 	u_int8_t control;
 };
 
 struct scsi_reserve_10 {
 	uint8_t	opcode;
 	uint8_t	byte2;
 #define	SR10_3RDPTY	0x10
 #define	SR10_LONGID	0x02
 #define	SR10_EXTENT	0x01
 	uint8_t resv_id;
 	uint8_t thirdparty_id;
 	uint8_t reserved[3];
 	uint8_t length[2];
 	uint8_t control;
 };
 
 
 struct scsi_release
 {
 	u_int8_t opcode;
 	u_int8_t byte2;
 	u_int8_t resv_id;
 	u_int8_t unused[1];
 	u_int8_t length;
 	u_int8_t control;
 };
 
 struct scsi_release_10 {
 	uint8_t opcode;
 	uint8_t byte2;
 	uint8_t resv_id;
 	uint8_t thirdparty_id;
 	uint8_t reserved[3];
 	uint8_t length[2];
 	uint8_t control;
 };
 
 struct scsi_prevent
 {
 	u_int8_t opcode;
 	u_int8_t byte2;
 	u_int8_t unused[2];
 	u_int8_t how;
 	u_int8_t control;
 };
 #define	PR_PREVENT 0x01
 #define	PR_ALLOW   0x00
 
 struct scsi_sync_cache
 {
 	u_int8_t opcode;
 	u_int8_t byte2;
 #define	SSC_IMMED	0x02
 #define	SSC_RELADR	0x01
 	u_int8_t begin_lba[4];
 	u_int8_t reserved;
 	u_int8_t lb_count[2];
 	u_int8_t control;	
 };
 
 struct scsi_sync_cache_16
 {
 	uint8_t opcode;
 	uint8_t byte2;
 	uint8_t begin_lba[8];
 	uint8_t lb_count[4];
 	uint8_t reserved;
 	uint8_t control;
 };
 
 struct scsi_format {
 	uint8_t opcode;
 	uint8_t byte2;
 #define	SF_LONGLIST		0x20
 #define	SF_FMTDATA		0x10
 #define	SF_CMPLIST		0x08
 #define	SF_FORMAT_MASK		0x07
 #define	SF_FORMAT_BLOCK		0x00
 #define	SF_FORMAT_LONG_BLOCK	0x03
 #define	SF_FORMAT_BFI		0x04
 #define	SF_FORMAT_PHYS		0x05
 	uint8_t vendor;
 	uint8_t interleave[2];
 	uint8_t control;
 };
 
 struct scsi_format_header_short {
 	uint8_t reserved;
 #define	SF_DATA_FOV	0x80
 #define	SF_DATA_DPRY	0x40
 #define	SF_DATA_DCRT	0x20
 #define	SF_DATA_STPF	0x10
 #define	SF_DATA_IP	0x08
 #define	SF_DATA_DSP	0x04
 #define	SF_DATA_IMMED	0x02
 #define	SF_DATA_VS	0x01
 	uint8_t byte2;
 	uint8_t defect_list_len[2];
 };
 
 struct scsi_format_header_long {
 	uint8_t reserved;
 	uint8_t byte2;
 	uint8_t reserved2[2];
 	uint8_t defect_list_len[4];
 };
 
 struct scsi_changedef
 {
 	u_int8_t opcode;
 	u_int8_t byte2;
 	u_int8_t unused1;
 	u_int8_t how;
 	u_int8_t unused[4];
 	u_int8_t datalen;
 	u_int8_t control;
 };
 
 /*
  * Ten-byte layout: opcode, byte2, buffer_id, a 3-byte offset, a 3-byte
  * length and control.  RWB_MODE is a 0x1F mask; the RWB_MODE_* values fit
  * inside it (presumably they select the mode held in byte2).
  */
 struct scsi_read_buffer
 {
 	u_int8_t opcode;
 	u_int8_t byte2;
 #define	RWB_MODE		0x1F
 #define	RWB_MODE_HDR_DATA	0x00
 #define	RWB_MODE_VENDOR		0x01
 #define	RWB_MODE_DATA		0x02
 #define	RWB_MODE_DESCR		0x03
 #define	RWB_MODE_DOWNLOAD	0x04
 #define	RWB_MODE_DOWNLOAD_SAVE	0x05
 #define	RWB_MODE_ECHO		0x0A
 #define	RWB_MODE_ECHO_DESCR	0x0B
 #define	RWB_MODE_ERROR_HISTORY	0x1C
 	u_int8_t buffer_id;
 	u_int8_t offset[3];
 	u_int8_t length[3];
 	u_int8_t control;
 };
 
 struct scsi_write_buffer
 {
 	u_int8_t opcode;
 	u_int8_t byte2;
 	u_int8_t buffer_id;
 	u_int8_t offset[3];
 	u_int8_t length[3];
 	u_int8_t control;
 };
 
 struct scsi_rw_6
 {
 	u_int8_t opcode;
 	u_int8_t addr[3];
 /* only 5 bits are valid in the MSB address byte */
 #define	SRW_TOPADDR	0x1F
 	u_int8_t length;
 	u_int8_t control;
 };
 
 struct scsi_rw_10
 {
 	u_int8_t opcode;
 #define	SRW10_RELADDR	0x01
 /* EBP defined for WRITE(10) only */
 #define	SRW10_EBP	0x04
 #define	SRW10_FUA	0x08
 #define	SRW10_DPO	0x10
 	u_int8_t byte2;
 	u_int8_t addr[4];
 	u_int8_t reserved;
 	u_int8_t length[2];
 	u_int8_t control;
 };
 
 struct scsi_rw_12
 {
 	u_int8_t opcode;
 #define	SRW12_RELADDR	0x01
 #define	SRW12_FUA	0x08
 #define	SRW12_DPO	0x10
 	u_int8_t byte2;
 	u_int8_t addr[4];
 	u_int8_t length[4];
 	u_int8_t reserved;
 	u_int8_t control;
 };
 
 struct scsi_rw_16
 {
 	u_int8_t opcode;
 #define	SRW16_RELADDR	0x01
 #define	SRW16_FUA	0x08
 #define	SRW16_DPO	0x10
 	u_int8_t byte2;
 	u_int8_t addr[8];
 	u_int8_t length[4];
 	u_int8_t reserved;
 	u_int8_t control;
 };
 
 struct scsi_write_same_10
 {
 	uint8_t	opcode;
 	uint8_t	byte2;
 #define	SWS_LBDATA	0x02
 #define	SWS_PBDATA	0x04
 #define	SWS_UNMAP	0x08
 #define	SWS_ANCHOR	0x10
 	uint8_t	addr[4];
 	uint8_t	group;
 	uint8_t	length[2];
 	uint8_t	control;
 };
 
 struct scsi_write_same_16
 {
 	uint8_t	opcode;
 	uint8_t	byte2;
 	uint8_t	addr[8];
 	uint8_t	length[4];
 	uint8_t	group;
 	uint8_t	control;
 };
 
 struct scsi_unmap
 {
 	uint8_t	opcode;
 	uint8_t	byte2;
 #define	SU_ANCHOR	0x01
 	uint8_t	reserved[4];
 	uint8_t	group;
 	uint8_t	length[2];
 	uint8_t	control;
 };
 
 struct scsi_unmap_header
 {
 	uint8_t	length[2];
 	uint8_t	desc_length[2];
 	uint8_t	reserved[4];
 };
 
 struct scsi_unmap_desc
 {
 	uint8_t	lba[8];
 	uint8_t	length[4];
 	uint8_t	reserved[4];
 };
 
 /*
  * Ten-byte layout: opcode, byte2, a 4-byte addr, group, a 2-byte length and
  * control.
  */
 struct scsi_write_verify_10
 {
 	uint8_t	opcode;
 	uint8_t	byte2;
 #define	SWV_BYTCHK		0x02
 #define	SWV_DPO			0x10
 #define	SWV_WRPROTECT_MASK	0xe0
 /* Original, misspelled name kept so that existing users still compile. */
 #define	SWV_WRPROECT_MASK	SWV_WRPROTECT_MASK
 	uint8_t	addr[4];
 	uint8_t	group;
 	uint8_t length[2];
 	uint8_t	control;
 };
 
 struct scsi_write_verify_12
 {
 	uint8_t	opcode;
 	uint8_t	byte2;
 	uint8_t	addr[4];
 	uint8_t	length[4];
 	uint8_t	group;
 	uint8_t	control;
 };
 
 struct scsi_write_verify_16
 {
 	uint8_t	opcode;
 	uint8_t	byte2;
 	uint8_t	addr[8];
 	uint8_t	length[4];
 	uint8_t	group;
 	uint8_t	control;
 };
 
 
 struct scsi_start_stop_unit
 {
 	u_int8_t opcode;
 	u_int8_t byte2;
 #define	SSS_IMMED		0x01
 	u_int8_t reserved[2];
 	u_int8_t how;
 #define	SSS_START		0x01
 #define	SSS_LOEJ		0x02
 #define	SSS_PC_MASK		0xf0
 #define	SSS_PC_START_VALID	0x00
 #define	SSS_PC_ACTIVE		0x10
 #define	SSS_PC_IDLE		0x20
 #define	SSS_PC_STANDBY		0x30
 #define	SSS_PC_LU_CONTROL	0x70
 #define	SSS_PC_FORCE_IDLE_0	0xa0
 #define	SSS_PC_FORCE_STANDBY_0	0xb0
 	u_int8_t control;
 };
 
 struct ata_pass_12 {
 	u_int8_t opcode;
 	u_int8_t protocol;
 #define	AP_PROTO_HARD_RESET	(0x00 << 1)
 #define	AP_PROTO_SRST		(0x01 << 1)
 #define	AP_PROTO_NON_DATA	(0x03 << 1)
 #define	AP_PROTO_PIO_IN		(0x04 << 1)
 #define	AP_PROTO_PIO_OUT	(0x05 << 1)
 #define	AP_PROTO_DMA		(0x06 << 1)
 #define	AP_PROTO_DMA_QUEUED	(0x07 << 1)
 #define	AP_PROTO_DEVICE_DIAG	(0x08 << 1)
 #define	AP_PROTO_DEVICE_RESET	(0x09 << 1)
 #define	AP_PROTO_UDMA_IN	(0x0a << 1)
 #define	AP_PROTO_UDMA_OUT	(0x0b << 1)
 #define	AP_PROTO_FPDMA		(0x0c << 1)
 #define	AP_PROTO_RESP_INFO	(0x0f << 1)
 #define	AP_MULTI	0xe0
 	u_int8_t flags;
 #define	AP_T_LEN	0x03
 #define	AP_BB		0x04
 #define	AP_T_DIR	0x08
 #define	AP_CK_COND	0x20
 #define	AP_OFFLINE	0x60
 	u_int8_t features;
 	u_int8_t sector_count;
 	u_int8_t lba_low;
 	u_int8_t lba_mid;
 	u_int8_t lba_high;
 	u_int8_t device;
 	u_int8_t command;
 	u_int8_t reserved;
 	u_int8_t control;
 };
 
 /*
  * Twelve-byte layout: opcode, byte2, 4 reserved bytes, a 4-byte length,
  * 1 reserved byte and control.  SERVICE_ACTION_MASK is a 0x1f mask and
  * SA_RPRT_TRGT_GRP (0x0a) fits inside it.
  */
 struct scsi_maintenance_in
 {
 	uint8_t  opcode;
 	uint8_t  byte2;
 #define SERVICE_ACTION_MASK  0x1f
 #define SA_RPRT_TRGT_GRP     0x0a
 	uint8_t  reserved[4];
 	uint8_t  length[4];
 	uint8_t  reserved1;
 	uint8_t  control;
 };
 
 /*
  * Twelve-byte layout: opcode, service_action, options, requested_opcode,
  * a 2-byte requested_service_action, a 4-byte length, 1 reserved byte and
  * control.  RSO_OPTIONS_MASK is a 0x07 mask covering the RSO_OPTIONS_*
  * values; RSO_RCTD is the separate 0x80 bit.
  */
 struct scsi_report_supported_opcodes
 {
 	uint8_t  opcode;
 	uint8_t  service_action;
 	uint8_t  options;
 #define RSO_RCTD		0x80
 #define RSO_OPTIONS_MASK	0x07
 #define RSO_OPTIONS_ALL		0x00
 #define RSO_OPTIONS_OC		0x01
 #define RSO_OPTIONS_OC_SA	0x02
 	uint8_t  requested_opcode;
 	uint8_t  requested_service_action[2];
 	uint8_t  length[4];
 	uint8_t  reserved1;
 	uint8_t  control;
 };
 
 struct scsi_report_supported_opcodes_timeout
 {
 	uint8_t  length[2];
 	uint8_t  reserved;
 	uint8_t  cmd_specific;
 	uint8_t  nominal_time[4];
 	uint8_t  recommended_time[4];
 };
 
 struct scsi_report_supported_opcodes_descr
 {
 	uint8_t  opcode;
 	uint8_t  reserved;
 	uint8_t  service_action[2];
 	uint8_t  reserved2;
 	uint8_t  flags;
 #define RSO_SERVACTV		0x01
 #define RSO_CTDP		0x02
 	uint8_t  cdb_length[2];
 	struct scsi_report_supported_opcodes_timeout timeout[0];
 };
 
 struct scsi_report_supported_opcodes_all
 {
 	uint8_t  length[4];
 	struct scsi_report_supported_opcodes_descr descr[0];
 };
 
 struct scsi_report_supported_opcodes_one
 {
 	uint8_t  reserved;
 	uint8_t  support;
 #define RSO_ONE_CTDP		0x80
 	uint8_t  cdb_length[2];
 	uint8_t  cdb_usage[];
 };
 
 struct scsi_report_supported_tmf
 {
 	uint8_t  opcode;
 	uint8_t  service_action;
 	uint8_t  reserved[4];
 	uint8_t  length[4];
 	uint8_t  reserved1;
 	uint8_t  control;
 };
 
 struct scsi_report_supported_tmf_data
 {
 	uint8_t  byte1;
 #define RST_WAKES		0x01
 #define RST_TRS			0x02
 #define RST_QTS			0x04
 #define RST_LURS		0x08
 #define RST_CTSS		0x10
 #define RST_CACAS		0x20
 #define RST_ATSS		0x40
 #define RST_ATS			0x80
 	uint8_t  byte2;
 #define RST_ITNRS		0x01
 #define RST_QTSS		0x02
 #define RST_QAES		0x04
 	uint8_t  reserved[2];
 };
 
 struct scsi_report_timestamp
 {
 	uint8_t  opcode;
 	uint8_t  service_action;
 	uint8_t  reserved[4];
 	uint8_t  length[4];
 	uint8_t  reserved1;
 	uint8_t  control;
 };
 
 /*
  * Twelve-byte layout: a 2-byte length, origin, 1 reserved byte, a 6-byte
  * timestamp and 2 more reserved bytes.
  */
 struct scsi_report_timestamp_data
 {
 	uint8_t  length[2];
 	uint8_t  origin;
 /*
  * The mask must cover the RTS_ORIG_* values below; a mask of 0x00 could
  * never extract RTS_ORIG_SET or RTS_ORIG_OUTSIDE.  0x07 assumes the origin
  * is the low three bits of this byte (per SPC-4 -- confirm against the
  * spec revision in use).
  */
 #define RTS_ORIG_MASK		0x07
 #define RTS_ORIG_ZERO		0x00
 #define RTS_ORIG_SET		0x02
 #define RTS_ORIG_OUTSIDE	0x03
 	uint8_t  reserved;
 	uint8_t  timestamp[6];
 	uint8_t  reserve2[2];
 };
 
 struct scsi_receive_copy_status_lid1
 {
 	uint8_t  opcode;
 	uint8_t  service_action;
 #define RCS_RCS_LID1		0x00
 	uint8_t  list_identifier;
 	uint8_t  reserved[7];
 	uint8_t  length[4];
 	uint8_t  reserved1;
 	uint8_t  control;
 };
 
 struct scsi_receive_copy_status_lid1_data
 {
 	uint8_t  available_data[4];
 	uint8_t  copy_command_status;
 #define RCS_CCS_INPROG		0x00
 #define RCS_CCS_COMPLETED	0x01
 #define RCS_CCS_ERROR		0x02
 	uint8_t  segments_processed[2];
 	uint8_t  transfer_count_units;
 #define RCS_TC_BYTES		0x00
 #define RCS_TC_KBYTES		0x01
 #define RCS_TC_MBYTES		0x02
 #define RCS_TC_GBYTES		0x03
 #define RCS_TC_TBYTES		0x04
 #define RCS_TC_PBYTES		0x05
 #define RCS_TC_EBYTES		0x06
 #define RCS_TC_LBAS		0xf1
 	uint8_t  transfer_count[4];
 };
 
 struct scsi_receive_copy_failure_details
 {
 	uint8_t  opcode;
 	uint8_t  service_action;
 #define RCS_RCFD		0x04
 	uint8_t  list_identifier;
 	uint8_t  reserved[7];
 	uint8_t  length[4];
 	uint8_t  reserved1;
 	uint8_t  control;
 };
 
 struct scsi_receive_copy_failure_details_data
 {
 	uint8_t  available_data[4];
 	uint8_t  reserved[52];
 	uint8_t  copy_command_status;
 	uint8_t  reserved2;
 	uint8_t  sense_data_length[2];
 	uint8_t  sense_data[];
 };
 
 struct scsi_receive_copy_status_lid4
 {
 	uint8_t  opcode;
 	uint8_t  service_action;
 #define RCS_RCS_LID4		0x05
 	uint8_t  list_identifier[4];
 	uint8_t  reserved[4];
 	uint8_t  length[4];
 	uint8_t  reserved1;
 	uint8_t  control;
 };
 
 struct scsi_receive_copy_status_lid4_data
 {
 	uint8_t  available_data[4];
 	uint8_t  response_to_service_action;
 	uint8_t  copy_command_status;
 #define RCS_CCS_COMPLETED_PROD	0x03
 #define RCS_CCS_COMPLETED_RESID	0x04
 #define RCS_CCS_INPROG_FGBG	0x10
 #define RCS_CCS_INPROG_FG	0x11
 #define RCS_CCS_INPROG_BG	0x12
 #define RCS_CCS_ABORTED		0x60
 	uint8_t  operation_counter[2];
 	uint8_t  estimated_status_update_delay[4];
 	uint8_t  extended_copy_completion_status;
 	uint8_t  length_of_the_sense_data_field;
 	uint8_t  sense_data_length;
 	uint8_t  transfer_count_units;
 	uint8_t  transfer_count[8];
 	uint8_t  segments_processed[2];
 	uint8_t  reserved[6];
 	uint8_t  sense_data[];
 };
 
 struct scsi_receive_copy_operating_parameters
 {
 	uint8_t  opcode;
 	uint8_t  service_action;
 #define RCS_RCOP		0x03
 	uint8_t  reserved[8];
 	uint8_t  length[4];
 	uint8_t  reserved1;
 	uint8_t  control;
 };
 
 struct scsi_receive_copy_operating_parameters_data
 {
 	uint8_t  length[4];
 	uint8_t  snlid;
 #define RCOP_SNLID		0x01
 	uint8_t  reserved[3];
 	uint8_t  maximum_cscd_descriptor_count[2];
 	uint8_t  maximum_segment_descriptor_count[2];
 	uint8_t  maximum_descriptor_list_length[4];
 	uint8_t  maximum_segment_length[4];
 	uint8_t  maximum_inline_data_length[4];
 	uint8_t  held_data_limit[4];
 	uint8_t  maximum_stream_device_transfer_size[4];
 	uint8_t  reserved2[2];
 	uint8_t  total_concurrent_copies[2];
 	uint8_t  maximum_concurrent_copies;
 	uint8_t  data_segment_granularity;
 	uint8_t  inline_data_granularity;
 	uint8_t  held_data_granularity;
 	uint8_t  reserved3[3];
 	uint8_t  implemented_descriptor_list_length;
 	uint8_t  list_of_implemented_descriptor_type_codes[0];
 };
 
 struct scsi_extended_copy
 {
 	uint8_t  opcode;
 	uint8_t  service_action;
 #define EC_EC_LID1		0x00
 #define EC_EC_LID4		0x01
 	uint8_t  reserved[8];
 	uint8_t  length[4];
 	uint8_t  reserved1;
 	uint8_t  control;
 };
 
 struct scsi_ec_cscd_dtsp
 {
 	uint8_t  flags;
 #define EC_CSCD_FIXED		0x01
 #define EC_CSCD_PAD		0x04
 	uint8_t  block_length[3];
 };
 
 struct scsi_ec_cscd
 {
 	uint8_t  type_code;
 #define EC_CSCD_EXT		0xff
 	uint8_t  luidt_pdt;
 #define EC_LUIDT_MASK		0xc0
 #define EC_LUIDT_LUN		0x00
 #define EC_LUIDT_PROXY_TOKEN	0x40
 	uint8_t  relative_initiator_port[2];
 	uint8_t  cscd_params[24];
 	struct scsi_ec_cscd_dtsp dtsp;
 };
 
 struct scsi_ec_cscd_id
 {
 	uint8_t  type_code;
 #define EC_CSCD_ID		0xe4
 	uint8_t  luidt_pdt;
 	uint8_t  relative_initiator_port[2];
 	uint8_t  codeset;
 	uint8_t  id_type;
 	uint8_t  reserved;
 	uint8_t  length;
 	uint8_t  designator[20];
 	struct scsi_ec_cscd_dtsp dtsp;
 };
 
 struct scsi_ec_segment
 {
 	uint8_t  type_code;
 	uint8_t  flags;
 #define EC_SEG_DC		0x02
 #define EC_SEG_CAT		0x01
 	uint8_t  descr_length[2];
 	uint8_t  params[];
 };
 
 struct scsi_ec_segment_b2b
 {
 	uint8_t  type_code;
 #define EC_SEG_B2B		0x02
 	uint8_t  flags;
 	uint8_t  descr_length[2];
 	uint8_t  src_cscd[2];
 	uint8_t  dst_cscd[2];
 	uint8_t  reserved[2];
 	uint8_t  number_of_blocks[2];
 	uint8_t  src_lba[8];
 	uint8_t  dst_lba[8];
 };
 
 struct scsi_ec_segment_verify
 {
 	uint8_t  type_code;
 #define EC_SEG_VERIFY		0x07
 	uint8_t  reserved;
 	uint8_t  descr_length[2];
 	uint8_t  src_cscd[2];
 	uint8_t  reserved2[2];
 	uint8_t  tur;
 	uint8_t  reserved3[3];
 };
 
 struct scsi_ec_segment_register_key
 {
 	uint8_t  type_code;
 #define EC_SEG_REGISTER_KEY	0x14
 	uint8_t  reserved;
 	uint8_t  descr_length[2];
 	uint8_t  reserved2[2];
 	uint8_t  dst_cscd[2];
 	uint8_t  res_key[8];
 	uint8_t  sa_res_key[8];
 	uint8_t  reserved3[4];
 };
 
 struct scsi_extended_copy_lid1_data
 {
 	uint8_t  list_identifier;
 	uint8_t  flags;
 #define EC_PRIORITY		0x07
 #define EC_LIST_ID_USAGE_MASK	0x18
 #define EC_LIST_ID_USAGE_FULL	0x08
 #define EC_LIST_ID_USAGE_NOHOLD	0x10
 #define EC_LIST_ID_USAGE_NONE	0x18
 #define EC_STR			0x20
 	uint8_t  cscd_list_length[2];
 	uint8_t  reserved[4];
 	uint8_t  segment_list_length[4];
 	uint8_t  inline_data_length[4];
 	uint8_t  data[];
 };
 
 struct scsi_extended_copy_lid4_data
 {
 	uint8_t  list_format;
 #define EC_LIST_FORMAT		0x01
 	uint8_t  flags;
 	uint8_t  header_cscd_list_length[2];
 	uint8_t  reserved[11];
 	uint8_t  flags2;
 #define EC_IMMED		0x01
 #define EC_G_SENSE		0x02
 	uint8_t  header_cscd_type_code;
 	uint8_t  reserved2[3];
 	uint8_t  list_identifier[4];
 	uint8_t  reserved3[18];
 	uint8_t  cscd_list_length[2];
 	uint8_t  segment_list_length[2];
 	uint8_t  inline_data_length[2];
 	uint8_t  data[];
 };
 
 struct scsi_copy_operation_abort
 {
 	uint8_t  opcode;
 	uint8_t  service_action;
 #define EC_COA			0x1c
 	uint8_t  list_identifier[4];
 	uint8_t  reserved[9];
 	uint8_t  control;
 };
 
 struct scsi_populate_token
 {
 	uint8_t  opcode;
 	uint8_t  service_action;
 #define EC_PT			0x10
 	uint8_t  reserved[4];
 	uint8_t  list_identifier[4];
 	uint8_t  length[4];
 	uint8_t  group_number;
 	uint8_t  control;
 };
 
 struct scsi_range_desc
 {
 	uint8_t	lba[8];
 	uint8_t	length[4];
 	uint8_t	reserved[4];
 };
 
 struct scsi_populate_token_data
 {
 	uint8_t  length[2];
 	uint8_t  flags;
 #define EC_PT_IMMED			0x01
 #define EC_PT_RTV			0x02
 	uint8_t  reserved;
 	uint8_t  inactivity_timeout[4];
 	uint8_t  rod_type[4];
 	uint8_t  reserved2[2];
 	uint8_t  range_descriptor_length[2];
 	struct scsi_range_desc desc[];
 };
 
 struct scsi_write_using_token
 {
 	uint8_t  opcode;
 	uint8_t  service_action;
 #define EC_WUT			0x11
 	uint8_t  reserved[4];
 	uint8_t  list_identifier[4];
 	uint8_t  length[4];
 	uint8_t  group_number;
 	uint8_t  control;
 };
 
 struct scsi_write_using_token_data
 {
 	uint8_t  length[2];
 	uint8_t  flags;
 #define EC_WUT_IMMED			0x01
 #define EC_WUT_DEL_TKN			0x02
 	uint8_t  reserved[5];
 	uint8_t  offset_into_rod[8];
 	uint8_t  rod_token[512];
 	uint8_t  reserved2[6];
 	uint8_t  range_descriptor_length[2];
 	struct scsi_range_desc desc[];
 };
 
 struct scsi_receive_rod_token_information
 {
 	uint8_t  opcode;
 	uint8_t  service_action;
 #define RCS_RRTI		0x07
 	uint8_t  list_identifier[4];
 	uint8_t  reserved[4];
 	uint8_t  length[4];
 	uint8_t  reserved2;
 	uint8_t  control;
 };
 
 struct scsi_token
 {
 	uint8_t  type[4];
 #define ROD_TYPE_INTERNAL	0x00000000
 #define ROD_TYPE_AUR		0x00001000
 #define ROD_TYPE_PIT_DEF	0x00080000
 #define ROD_TYPE_PIT_VULN	0x00080001
 #define ROD_TYPE_PIT_PERS	0x00080002
 #define ROD_TYPE_PIT_ANY	0x0008FFFF
 #define ROD_TYPE_BLOCK_ZERO	0xFFFF0001
 	uint8_t  reserved[2];
 	uint8_t  length[2];
 	uint8_t  body[0];
 };
 
 struct scsi_report_all_rod_tokens
 {
 	uint8_t  opcode;
 	uint8_t  service_action;
 #define RCS_RART		0x08
 	uint8_t  reserved[8];
 	uint8_t  length[4];
 	uint8_t  reserved2;
 	uint8_t  control;
 };
 
 struct scsi_report_all_rod_tokens_data
 {
 	uint8_t  available_data[4];
 	uint8_t  reserved[4];
 	uint8_t  rod_management_token_list[];
 };
 
 struct ata_pass_16 {
 	u_int8_t opcode;
 	u_int8_t protocol;
 #define	AP_EXTEND	0x01
 	u_int8_t flags;
 #define	AP_FLAG_TLEN_NO_DATA	(0 << 0)
 #define	AP_FLAG_TLEN_FEAT	(1 << 0)
 #define	AP_FLAG_TLEN_SECT_CNT	(2 << 0)
 #define	AP_FLAG_TLEN_STPSIU	(3 << 0)
 #define	AP_FLAG_BYT_BLOK_BYTES	(0 << 2)  
 #define	AP_FLAG_BYT_BLOK_BLOCKS	(1 << 2)  
 #define	AP_FLAG_TDIR_TO_DEV	(0 << 3)  
 #define	AP_FLAG_TDIR_FROM_DEV	(1 << 3)  
 #define	AP_FLAG_CHK_COND	(1 << 5)  
 	u_int8_t features_ext;
 	u_int8_t features;
 	u_int8_t sector_count_ext;
 	u_int8_t sector_count;
 	u_int8_t lba_low_ext;
 	u_int8_t lba_low;
 	u_int8_t lba_mid_ext;
 	u_int8_t lba_mid;
 	u_int8_t lba_high_ext;
 	u_int8_t lba_high;
 	u_int8_t device;
 	u_int8_t command;
 	u_int8_t control;
 };
 
 #define	SC_SCSI_1 0x01
 #define	SC_SCSI_2 0x03
 
 /*
  * Opcodes
  */
 
 #define	TEST_UNIT_READY		0x00
 #define	REQUEST_SENSE		0x03
 #define	READ_6			0x08
 #define	WRITE_6			0x0A
 #define	INQUIRY			0x12
 #define	MODE_SELECT_6		0x15
 #define	MODE_SENSE_6		0x1A
 #define	START_STOP_UNIT		0x1B
 #define	START_STOP		0x1B
 #define	RESERVE      		0x16
 #define	RELEASE      		0x17
 #define	RECEIVE_DIAGNOSTIC	0x1C
 #define	SEND_DIAGNOSTIC		0x1D
 #define	PREVENT_ALLOW		0x1E
 #define	READ_CAPACITY		0x25
 #define	READ_10			0x28
 #define	WRITE_10		0x2A
 #define	POSITION_TO_ELEMENT	0x2B
 #define	WRITE_VERIFY_10		0x2E
 #define	VERIFY_10		0x2F
 #define	SYNCHRONIZE_CACHE	0x35
 #define	READ_DEFECT_DATA_10	0x37
 #define	WRITE_BUFFER            0x3B
 #define	READ_BUFFER             0x3C
 #define	CHANGE_DEFINITION	0x40
 #define	WRITE_SAME_10		0x41
 #define	UNMAP			0x42
 #define	LOG_SELECT		0x4C
 #define	LOG_SENSE		0x4D
 #define	MODE_SELECT_10		0x55
 #define	RESERVE_10		0x56
 #define	RELEASE_10		0x57
 #define	MODE_SENSE_10		0x5A
 #define	PERSISTENT_RES_IN	0x5E
 #define	PERSISTENT_RES_OUT	0x5F
 #define	EXTENDED_COPY		0x83
 #define	RECEIVE_COPY_STATUS	0x84
 #define	ATA_PASS_16		0x85
 #define	READ_16			0x88
 #define	COMPARE_AND_WRITE	0x89
 #define	WRITE_16		0x8A
 #define	WRITE_VERIFY_16		0x8E
 #define	VERIFY_16		0x8F
 #define	SYNCHRONIZE_CACHE_16	0x91
 #define	WRITE_SAME_16		0x93
 #define	SERVICE_ACTION_IN	0x9E
 #define	REPORT_LUNS		0xA0
 #define	ATA_PASS_12		0xA1
 #define	MAINTENANCE_IN		0xA3
 #define	MAINTENANCE_OUT		0xA4
 #define	MOVE_MEDIUM     	0xA5
 #define	READ_12			0xA8
 #define	WRITE_12		0xAA
 #define	WRITE_VERIFY_12		0xAE
 #define	VERIFY_12		0xAF
 #define	READ_ELEMENT_STATUS	0xB8
 #define	READ_CD			0xBE
 
 /* Maintenance In Service Action Codes */
 #define	REPORT_IDENTIFYING_INFRMATION		0x05
 #define	REPORT_TARGET_PORT_GROUPS		0x0A
 #define	REPORT_ALIASES				0x0B
 #define	REPORT_SUPPORTED_OPERATION_CODES	0x0C
 #define	REPORT_SUPPORTED_TASK_MANAGEMENT_FUNCTIONS	0x0D
 #define	REPORT_PRIORITY				0x0E
 #define	REPORT_TIMESTAMP			0x0F
 #define	MANAGEMENT_PROTOCOL_IN			0x10
 /* Maintenance Out Service Action Codes */
 #define	SET_IDENTIFY_INFORMATION		0x06
 #define	SET_TARGET_PORT_GROUPS			0x0A
 #define	CHANGE_ALIASES				0x0B
 #define	SET_PRIORITY				0x0E
 #define	SET_TIMESTAMP				0x0F
 #define	MANGAEMENT_PROTOCOL_OUT			0x10
 
 /*
  * Device Types
  */
 #define	T_DIRECT	0x00
 #define	T_SEQUENTIAL	0x01
 #define	T_PRINTER	0x02
 #define	T_PROCESSOR	0x03
 #define	T_WORM		0x04
 #define	T_CDROM		0x05
 #define	T_SCANNER	0x06
 #define	T_OPTICAL 	0x07
 #define	T_CHANGER	0x08
 #define	T_COMM		0x09
 #define	T_ASC0		0x0a
 #define	T_ASC1		0x0b
 #define	T_STORARRAY	0x0c
 #define	T_ENCLOSURE	0x0d
 #define	T_RBC		0x0e
 #define	T_OCRW		0x0f
 #define	T_OSD		0x11
 #define	T_ADC		0x12
 #define	T_NODEVICE	0x1f
 #define	T_ANY		0xff	/* Used in Quirk table matches */
 
 #define	T_REMOV		1
 #define	T_FIXED		0
 
 /*
  * This length is the initial inquiry length used by the probe code, as    
  * well as the length necessary for scsi_print_inquiry() to function 
  * correctly.  If either use requires a different length in the future, 
  * the two values should be de-coupled.
  */
 #define	SHORT_INQUIRY_LENGTH	36
 
 struct scsi_inquiry_data
 {
 	u_int8_t device;
 #define	SID_TYPE(inq_data) ((inq_data)->device & 0x1f)
 #define	SID_QUAL(inq_data) (((inq_data)->device & 0xE0) >> 5)
 #define	SID_QUAL_LU_CONNECTED	0x00	/*
 					 * The specified peripheral device
 					 * type is currently connected to
 					 * logical unit.  If the target cannot
 					 * determine whether or not a physical
 					 * device is currently connected, it
 					 * shall also use this peripheral
 					 * qualifier when returning the INQUIRY
 					 * data.  This peripheral qualifier
 					 * does not mean that the device is
 					 * ready for access by the initiator.
 					 */
 #define	SID_QUAL_LU_OFFLINE	0x01	/*
 					 * The target is capable of supporting
 					 * the specified peripheral device type
 					 * on this logical unit; however, the
 					 * physical device is not currently
 					 * connected to this logical unit.
 					 */
 #define	SID_QUAL_RSVD		0x02
 #define	SID_QUAL_BAD_LU		0x03	/*
 					 * The target is not capable of
 					 * supporting a physical device on
 					 * this logical unit. For this
 					 * peripheral qualifier the peripheral
 					 * device type shall be set to 1Fh to
 					 * provide compatibility with previous
 					 * versions of SCSI. All other
 					 * peripheral device type values are
 					 * reserved for this peripheral
 					 * qualifier.
 					 */
 #define	SID_QUAL_IS_VENDOR_UNIQUE(inq_data) ((SID_QUAL(inq_data) & 0x08) != 0)
 	u_int8_t dev_qual2;
 #define	SID_QUAL2	0x7F
 #define	SID_LU_CONG	0x40
 #define	SID_RMB		0x80
 #define	SID_IS_REMOVABLE(inq_data) (((inq_data)->dev_qual2 & SID_RMB) != 0)
 	u_int8_t version;
 #define	SID_ANSI_REV(inq_data) ((inq_data)->version & 0x07)
 #define		SCSI_REV_0		0
 #define		SCSI_REV_CCS		1
 #define		SCSI_REV_2		2
 #define		SCSI_REV_SPC		3
 #define		SCSI_REV_SPC2		4
 #define		SCSI_REV_SPC3		5
 #define		SCSI_REV_SPC4		6
 
 #define	SID_ECMA	0x38
 #define	SID_ISO		0xC0
 	u_int8_t response_format;
 #define	SID_AENC	0x80
 #define	SID_TrmIOP	0x40
 #define	SID_NormACA	0x20
 #define	SID_HiSup	0x10
 	u_int8_t additional_length;
 #define	SID_ADDITIONAL_LENGTH(iqd)					\
 	((iqd)->additional_length +					\
 	__offsetof(struct scsi_inquiry_data, additional_length) + 1)
 	u_int8_t spc3_flags;
 #define	SPC3_SID_PROTECT	0x01
 #define	SPC3_SID_3PC		0x08
 #define	SPC3_SID_TPGS_MASK	0x30
 #define	SPC3_SID_TPGS_IMPLICIT	0x10
 #define	SPC3_SID_TPGS_EXPLICIT	0x20
 #define	SPC3_SID_ACC		0x40
 #define	SPC3_SID_SCCS		0x80
 	u_int8_t spc2_flags;
 #define	SPC2_SID_ADDR16		0x01
 #define	SPC2_SID_MChngr 	0x08
 #define	SPC2_SID_MultiP 	0x10
 #define	SPC2_SID_EncServ	0x40
 #define	SPC2_SID_BQueue		0x80
 
 #define	INQ_DATA_TQ_ENABLED(iqd)				\
     ((SID_ANSI_REV(iqd) < SCSI_REV_SPC2)? ((iqd)->flags & SID_CmdQue) :	\
     (((iqd)->flags & SID_CmdQue) && !((iqd)->spc2_flags & SPC2_SID_BQueue)) || \
     (!((iqd)->flags & SID_CmdQue) && ((iqd)->spc2_flags & SPC2_SID_BQueue)))
 
 	u_int8_t flags;
 #define	SID_SftRe	0x01
 #define	SID_CmdQue	0x02
 #define	SID_Linked	0x08
 #define	SID_Sync	0x10
 #define	SID_WBus16	0x20
 #define	SID_WBus32	0x40
 #define	SID_RelAdr	0x80
 #define	SID_VENDOR_SIZE   8
 	char	 vendor[SID_VENDOR_SIZE];
 #define	SID_PRODUCT_SIZE  16
 	char	 product[SID_PRODUCT_SIZE];
 #define	SID_REVISION_SIZE 4
 	char	 revision[SID_REVISION_SIZE];
 	/*
 	 * The following fields were taken from SCSI Primary Commands - 2
 	 * (SPC-2) Revision 14, Dated 11 November 1999
 	 */
 #define	SID_VENDOR_SPECIFIC_0_SIZE	20
 	u_int8_t vendor_specific0[SID_VENDOR_SPECIFIC_0_SIZE];
 	/*
 	 * An extension of SCSI Parallel Specific Values
 	 */
 #define	SID_SPI_IUS		0x01
 #define	SID_SPI_QAS		0x02
 #define	SID_SPI_CLOCK_ST	0x00
 #define	SID_SPI_CLOCK_DT	0x04
 #define	SID_SPI_CLOCK_DT_ST	0x0C
 #define	SID_SPI_MASK		0x0F
 	u_int8_t spi3data;
 	u_int8_t reserved2;
 	/*
 	 * Version Descriptors, stored 2 byte values.
 	 */
 	u_int8_t version1[2];
 	u_int8_t version2[2];
 	u_int8_t version3[2];
 	u_int8_t version4[2];
 	u_int8_t version5[2];
 	u_int8_t version6[2];
 	u_int8_t version7[2];
 	u_int8_t version8[2];
 
 	u_int8_t reserved3[22];
 
 #define	SID_VENDOR_SPECIFIC_1_SIZE	160
 	u_int8_t vendor_specific1[SID_VENDOR_SPECIFIC_1_SIZE];
 };
 
 /*
  * This structure is more suited to initiator operation, because the
  * maximum number of supported pages is already allocated.
  */
 struct scsi_vpd_supported_page_list
 {
 	u_int8_t device;
 	u_int8_t page_code;
 #define	SVPD_SUPPORTED_PAGE_LIST	0x00
 #define	SVPD_SUPPORTED_PAGES_HDR_LEN	4
 	u_int8_t reserved;
 	u_int8_t length;	/* number of VPD entries */
 #define	SVPD_SUPPORTED_PAGES_SIZE	251
 	u_int8_t list[SVPD_SUPPORTED_PAGES_SIZE];
 };
 
 /*
  * This structure is more suited to target operation, because the
  * number of supported pages is left to the user to allocate.
  */
 struct scsi_vpd_supported_pages
 {
 	u_int8_t device;
 	u_int8_t page_code;
 	u_int8_t reserved;
 #define	SVPD_SUPPORTED_PAGES	0x00
 	u_int8_t length;
 	u_int8_t page_list[0];
 };
 
 
 struct scsi_vpd_unit_serial_number
 {
 	u_int8_t device;
 	u_int8_t page_code;
 #define	SVPD_UNIT_SERIAL_NUMBER	0x80
 	u_int8_t reserved;
 	u_int8_t length; /* serial number length */
 #define	SVPD_SERIAL_NUM_SIZE 251
 	u_int8_t serial_num[SVPD_SERIAL_NUM_SIZE];
 };
 
 struct scsi_vpd_device_id
 {
 	u_int8_t device;
 	u_int8_t page_code;
 #define	SVPD_DEVICE_ID			0x83
 #define	SVPD_DEVICE_ID_MAX_SIZE		252
 #define	SVPD_DEVICE_ID_HDR_LEN \
     __offsetof(struct scsi_vpd_device_id, desc_list)
 	u_int8_t length[2];
 	u_int8_t desc_list[];
 };
 
 struct scsi_vpd_id_descriptor
 {
 	u_int8_t	proto_codeset;
 	/*
 	 * See the SCSI_PROTO definitions above for the protocols.
 	 */
 #define	SVPD_ID_PROTO_SHIFT	4
 #define	SVPD_ID_CODESET_BINARY	0x01
 #define	SVPD_ID_CODESET_ASCII	0x02
 #define	SVPD_ID_CODESET_UTF8	0x03
 #define	SVPD_ID_CODESET_MASK	0x0f
 	u_int8_t	id_type;
 #define	SVPD_ID_PIV		0x80
 #define	SVPD_ID_ASSOC_LUN	0x00
 #define	SVPD_ID_ASSOC_PORT	0x10
 #define	SVPD_ID_ASSOC_TARGET	0x20
 #define	SVPD_ID_ASSOC_MASK	0x30
 #define	SVPD_ID_TYPE_VENDOR	0x00
 #define	SVPD_ID_TYPE_T10	0x01
 #define	SVPD_ID_TYPE_EUI64	0x02
 #define	SVPD_ID_TYPE_NAA	0x03
 #define	SVPD_ID_TYPE_RELTARG	0x04
 #define	SVPD_ID_TYPE_TPORTGRP	0x05
 #define	SVPD_ID_TYPE_LUNGRP	0x06
 #define	SVPD_ID_TYPE_MD5_LUN_ID	0x07
 #define	SVPD_ID_TYPE_SCSI_NAME	0x08
 #define	SVPD_ID_TYPE_MASK	0x0f
 	u_int8_t	reserved;
 	u_int8_t	length;
 #define	SVPD_DEVICE_ID_DESC_HDR_LEN \
     __offsetof(struct scsi_vpd_id_descriptor, identifier) 
 	u_int8_t	identifier[];
 };
 
 struct scsi_vpd_id_t10
 {
 	u_int8_t	vendor[8];
 	u_int8_t	vendor_spec_id[0];
 };
 
 struct scsi_vpd_id_eui64
 {
 	u_int8_t	ieee_company_id[3];
 	u_int8_t	extension_id[5];
 };
 
 struct scsi_vpd_id_naa_basic
 {
 	uint8_t naa;
 	/* big endian, packed:
 	uint8_t	naa : 4;
 	uint8_t naa_desig : 4;
 	*/
 #define	SVPD_ID_NAA_NAA_SHIFT		4
 #define	SVPD_ID_NAA_IEEE_EXT		0x02
 #define	SVPD_ID_NAA_LOCAL_REG		0x03
 #define	SVPD_ID_NAA_IEEE_REG		0x05
 #define	SVPD_ID_NAA_IEEE_REG_EXT	0x06
 	uint8_t	naa_data[];
 };
 
 struct scsi_vpd_id_naa_ieee_extended_id
 {
 	uint8_t naa;
 	uint8_t vendor_specific_id_a;
 	uint8_t ieee_company_id[3];
 	uint8_t vendor_specific_id_b[4];
 };
 
 struct scsi_vpd_id_naa_local_reg
 {
 	uint8_t naa;
 	uint8_t local_value[7];
 };
 
 struct scsi_vpd_id_naa_ieee_reg
 {
 	uint8_t naa;
 	uint8_t reg_value[7];
 	/* big endian, packed:
 	uint8_t naa_basic : 4;
 	uint8_t ieee_company_id_0 : 4;
 	uint8_t ieee_company_id_1[2];
 	uint8_t ieee_company_id_2 : 4;
 	uint8_t vendor_specific_id_0 : 4;
 	uint8_t vendor_specific_id_1[4];
 	*/
 };
 
 struct scsi_vpd_id_naa_ieee_reg_extended
 {
 	uint8_t naa;
 	uint8_t reg_value[15];
 	/* big endian, packed:
 	uint8_t naa_basic : 4;
 	uint8_t ieee_company_id_0 : 4;
 	uint8_t ieee_company_id_1[2];
 	uint8_t ieee_company_id_2 : 4;
 	uint8_t vendor_specific_id_0 : 4;
 	uint8_t vendor_specific_id_1[4];
 	uint8_t vendor_specific_id_ext[8];
 	*/
 };
 
 struct scsi_vpd_id_rel_trgt_port_id
 {
 	uint8_t obsolete[2];
 	uint8_t rel_trgt_port_id[2];
 };
 
 struct scsi_vpd_id_trgt_port_grp_id
 {
 	uint8_t reserved[2];
 	uint8_t trgt_port_grp[2];
 };
 
 struct scsi_vpd_id_lun_grp_id
 {
 	uint8_t reserved[2];
 	uint8_t log_unit_grp[2];
 };
 
 struct scsi_vpd_id_md5_lun_id
 {
 	uint8_t lun_id[16];
 };
 
 struct scsi_vpd_id_scsi_name
 {
 	uint8_t name_string[256];
 };
 
 struct scsi_service_action_in
 {
 	uint8_t opcode;
 	uint8_t service_action;
 	uint8_t action_dependent[13];
 	uint8_t control;
 };
 
+struct scsi_vpd_extended_inquiry_data	/* Extended INQUIRY Data VPD page */
+{
+	uint8_t device;
+	uint8_t page_code;
+#define	SVPD_EXTENDED_INQUIRY_DATA	0x86
+	uint8_t reserved;
+	uint8_t page_length;
+	uint8_t flags1;
+#define	SVPD_EID_AM		0xC0
+#define	SVPD_EID_SPT		0x38	/* mask; SVPD_EID_SPT_* are its values */
+#define	SVPD_EID_SPT_1		0x00
+#define	SVPD_EID_SPT_12		0x08
+#define	SVPD_EID_SPT_2		0x10
+#define	SVPD_EID_SPT_13		0x18
+#define	SVPD_EID_SPT_3		0x20
+#define	SVPD_EID_SPT_23		0x28
+#define	SVPD_EID_SPT_123	0x38
+#define	SVPD_EID_GRD_CHK	0x04
+#define	SVPD_EID_APP_CHK	0x02
+#define	SVPD_EID_REF_CHK	0x01
+	uint8_t flags2;
+#define	SVPD_EID_UASK_SUP	0x20
+#define	SVPD_EID_GROUP_SUP	0x10
+#define	SVPD_EID_PRIOR_SUP	0x08
+#define	SVPD_EID_HEADSUP	0x04
+#define	SVPD_EID_ORDSUP		0x02
+#define	SVPD_EID_SIMPSUP	0x01
+	uint8_t flags3;
+#define	SVPD_EID_WU_SUP		0x08
+#define	SVPD_EID_CRD_SUP	0x04
+#define	SVPD_EID_NV_SUP		0x02
+#define	SVPD_EID_V_SUP		0x01
+	uint8_t flags4;
+#define	SVPD_EID_P_I_I_SUP	0x10
+#define	SVPD_EID_LUICLT		0x01
+	uint8_t flags5;
+#define	SVPD_EID_R_SUP		0x10
+#define	SVPD_EID_CBCS		0x01
+	uint8_t flags6;
+#define	SVPD_EID_MULTI_I_T_FW	0x0F	/* 4-bit field mask, not a single flag */
+	uint8_t est[2];
+	uint8_t flags7;	/* three distinct single-bit flags: bits 7, 6 and 5 */
+#define	SVPD_EID_POA_SUP	0x80
+#define	SVPD_EID_HRA_SUP	0x40
+#define	SVPD_EID_VSA_SUP	0x20
+	uint8_t max_sense_length;
+	uint8_t reserved2[50];	/* pads the structure to 64 bytes */
+};
+
 struct scsi_vpd_mode_page_policy_descr
 {
 	uint8_t page_code;
 	uint8_t subpage_code;
 	uint8_t policy;
 #define	SVPD_MPP_SHARED		0x00
 #define	SVPD_MPP_PORT		0x01
 #define	SVPD_MPP_I_T		0x03
 #define	SVPD_MPP_MLUS		0x80
 	uint8_t reserved;
 };
 
 struct scsi_vpd_mode_page_policy
 {
 	uint8_t device;
 	uint8_t page_code;
 #define	SVPD_MODE_PAGE_POLICY	0x87
 	uint8_t page_length[2];
 	struct scsi_vpd_mode_page_policy_descr descr[0];
 };
 
 struct scsi_diag_page {
 	uint8_t page_code;
 	uint8_t page_specific_flags;
 	uint8_t length[2];
 	uint8_t params[0];
 };
 
 struct scsi_vpd_port_designation
 {
 	uint8_t reserved[2];
 	uint8_t relative_port_id[2];
 	uint8_t reserved2[2];
 	uint8_t initiator_transportid_length[2];
 	uint8_t initiator_transportid[0];
 };
 
 struct scsi_vpd_port_designation_cont
 {
 	uint8_t reserved[2];
 	uint8_t target_port_descriptors_length[2];
 	struct scsi_vpd_id_descriptor target_port_descriptors[0];
 };
 
 struct scsi_vpd_scsi_ports
 {
 	u_int8_t device;
 	u_int8_t page_code;
 #define	SVPD_SCSI_PORTS		0x88
 	u_int8_t page_length[2];
 	struct scsi_vpd_port_designation design[];
 };
 
 /*
  * ATA Information VPD Page based on
  * T10/2126-D Revision 04
  */
 #define SVPD_ATA_INFORMATION		0x89
 
 
 struct scsi_vpd_tpc_descriptor
 {
 	uint8_t desc_type[2];
 	uint8_t desc_length[2];
 	uint8_t parameters[];
 };
 
 struct scsi_vpd_tpc_descriptor_bdrl
 {
 	uint8_t desc_type[2];
 #define	SVPD_TPC_BDRL			0x0000
 	uint8_t desc_length[2];
 	uint8_t vendor_specific[6];
 	uint8_t maximum_ranges[2];
 	uint8_t maximum_inactivity_timeout[4];
 	uint8_t default_inactivity_timeout[4];
 	uint8_t maximum_token_transfer_size[8];
 	uint8_t optimal_transfer_count[8];
 };
 
 struct scsi_vpd_tpc_descriptor_sc_descr
 {
 	uint8_t opcode;
 	uint8_t sa_length;
 	uint8_t supported_service_actions[0];
 };
 
 struct scsi_vpd_tpc_descriptor_sc
 {
 	uint8_t desc_type[2];
 #define	SVPD_TPC_SC			0x0001
 	uint8_t desc_length[2];
 	uint8_t list_length;
 	struct scsi_vpd_tpc_descriptor_sc_descr descr[];
 };
 
 struct scsi_vpd_tpc_descriptor_pd
 {
 	uint8_t desc_type[2];
 #define	SVPD_TPC_PD			0x0004
 	uint8_t desc_length[2];
 	uint8_t reserved[4];
 	uint8_t maximum_cscd_descriptor_count[2];
 	uint8_t maximum_segment_descriptor_count[2];
 	uint8_t maximum_descriptor_list_length[4];
 	uint8_t maximum_inline_data_length[4];
 	uint8_t reserved2[12];
 };
 
 struct scsi_vpd_tpc_descriptor_sd
 {
 	uint8_t desc_type[2];
 #define	SVPD_TPC_SD			0x0008
 	uint8_t desc_length[2];
 	uint8_t list_length;
 	uint8_t supported_descriptor_codes[];
 };
 
 struct scsi_vpd_tpc_descriptor_sdid
 {
 	uint8_t desc_type[2];
 #define	SVPD_TPC_SDID			0x000C
 	uint8_t desc_length[2];
 	uint8_t list_length[2];
 	uint8_t supported_descriptor_ids[];
 };
 
 struct scsi_vpd_tpc_descriptor_rtf_block
 {
 	uint8_t type_format;
 #define	SVPD_TPC_RTF_BLOCK			0x00
 	uint8_t reserved;
 	uint8_t desc_length[2];
 	uint8_t reserved2[2];
 	uint8_t optimal_length_granularity[2];
 	uint8_t maximum_bytes[8];
 	uint8_t optimal_bytes[8];
 	uint8_t optimal_bytes_to_token_per_segment[8];
 	uint8_t optimal_bytes_from_token_per_segment[8];
 	uint8_t reserved3[8];
 };
 
 struct scsi_vpd_tpc_descriptor_rtf
 {
 	uint8_t desc_type[2];
 #define	SVPD_TPC_RTF			0x0106
 	uint8_t desc_length[2];
 	uint8_t remote_tokens;
 	uint8_t reserved[11];
 	uint8_t minimum_token_lifetime[4];
 	uint8_t maximum_token_lifetime[4];
 	uint8_t maximum_token_inactivity_timeout[4];
 	uint8_t reserved2[18];
 	uint8_t type_specific_features_length[2];
 	uint8_t type_specific_features[0];
 };
 
 struct scsi_vpd_tpc_descriptor_srtd
 {
 	uint8_t rod_type[4];
 	uint8_t flags;
 #define	SVPD_TPC_SRTD_TOUT		0x01
 #define	SVPD_TPC_SRTD_TIN		0x02
 #define	SVPD_TPC_SRTD_ECPY		0x80
 	uint8_t reserved;
 	uint8_t preference_indicator[2];
 	uint8_t reserved2[56];
 };
 
 struct scsi_vpd_tpc_descriptor_srt
 {
 	uint8_t desc_type[2];
 #define	SVPD_TPC_SRT			0x0108
 	uint8_t desc_length[2];
 	uint8_t reserved[2];
 	uint8_t rod_type_descriptors_length[2];
 	uint8_t rod_type_descriptors[0];
 };
 
 struct scsi_vpd_tpc_descriptor_gco
 {
 	uint8_t desc_type[2];
 #define	SVPD_TPC_GCO			0x8001
 	uint8_t desc_length[2];
 	uint8_t total_concurrent_copies[4];
 	uint8_t maximum_identified_concurrent_copies[4];
 	uint8_t maximum_segment_length[4];
 	uint8_t data_segment_granularity;
 	uint8_t inline_data_granularity;
 	uint8_t reserved[18];
 };
 
 struct scsi_vpd_tpc
 {
 	uint8_t device;
 	uint8_t page_code;
 #define	SVPD_SCSI_TPC			0x8F
 	uint8_t page_length[2];
 	struct scsi_vpd_tpc_descriptor descr[];
 };
 
 /*
  * Block Device Characteristics VPD Page based on
  * T10/1799-D Revision 31
  */
 struct scsi_vpd_block_characteristics
 {
 	u_int8_t device;
 	u_int8_t page_code;
 #define SVPD_BDC			0xB1
 	u_int8_t page_length[2];
 	u_int8_t medium_rotation_rate[2];
 #define SVPD_BDC_RATE_NOT_REPORTED	0x00
 #define SVPD_BDC_RATE_NON_ROTATING	0x01
 	u_int8_t reserved1;
 	u_int8_t nominal_form_factor;
 #define SVPD_BDC_FORM_NOT_REPORTED	0x00
 #define SVPD_BDC_FORM_5_25INCH		0x01
 #define SVPD_BDC_FORM_3_5INCH		0x02
 #define SVPD_BDC_FORM_2_5INCH		0x03
 #define SVPD_BDC_FORM_1_5INCH		0x04
 #define SVPD_BDC_FORM_LESSTHAN_1_5INCH	0x05
 	u_int8_t reserved2[56];
 };
 
 /*
  * Block Device Characteristics VPD Page
  */
 struct scsi_vpd_block_device_characteristics
 {
 	uint8_t device;
 	uint8_t page_code;
 #define	SVPD_BDC		0xB1
 	uint8_t page_length[2];
 	uint8_t medium_rotation_rate[2];
 #define	SVPD_NOT_REPORTED	0x0000
 #define	SVPD_NON_ROTATING	0x0001
 	uint8_t product_type;
 	uint8_t wab_wac_ff;
 	uint8_t flags;
 #define	SVPD_VBULS		0x01
 #define	SVPD_FUAB		0x02
 #define	SVPD_HAW_ZBC		0x10
 	uint8_t reserved[55];
 };
 
 /*
  * Logical Block Provisioning VPD Page based on
  * T10/1799-D Revision 31
  */
 struct scsi_vpd_logical_block_prov
 {
 	u_int8_t device;
 	u_int8_t page_code;
 #define	SVPD_LBP		0xB2
 	u_int8_t page_length[2];
 #define SVPD_LBP_PL_BASIC	0x04
 	u_int8_t threshold_exponent;
 	u_int8_t flags;
 #define SVPD_LBP_UNMAP		0x80
 #define SVPD_LBP_WS16		0x40
 #define SVPD_LBP_WS10		0x20
 #define SVPD_LBP_RZ		0x04
 #define SVPD_LBP_ANC_SUP	0x02
 #define SVPD_LBP_DP		0x01
 	u_int8_t prov_type;
 #define SVPD_LBP_RESOURCE	0x01
 #define SVPD_LBP_THIN		0x02
 	u_int8_t reserved;
 	/*
 	 * Provisioning Group Descriptor can be here if SVPD_LBP_DP is set
 	 * Its size can be determined from page_length - 4
 	 */
 };
 
 /*
  * Block Limits VDP Page based on
  * T10/1799-D Revision 31
  */
 struct scsi_vpd_block_limits
 {
 	u_int8_t device;
 	u_int8_t page_code;
 #define	SVPD_BLOCK_LIMITS	0xB0
 	u_int8_t page_length[2];
 #define SVPD_BL_PL_BASIC	0x10
 #define SVPD_BL_PL_TP		0x3C
 	u_int8_t reserved1;
 	u_int8_t max_cmp_write_len;
 	u_int8_t opt_txfer_len_grain[2];
 	u_int8_t max_txfer_len[4];
 	u_int8_t opt_txfer_len[4];
 	u_int8_t max_prefetch[4];
 	u_int8_t max_unmap_lba_cnt[4];
 	u_int8_t max_unmap_blk_cnt[4];
 	u_int8_t opt_unmap_grain[4];
 	u_int8_t unmap_grain_align[4];
 	u_int8_t max_write_same_length[8];
 	u_int8_t reserved2[20];
 };
 
 struct scsi_read_capacity
 {
 	u_int8_t opcode;
 	u_int8_t byte2;
 #define	SRC_RELADR	0x01
 	u_int8_t addr[4];
 	u_int8_t unused[2];
 	u_int8_t pmi;
 #define	SRC_PMI		0x01
 	u_int8_t control;
 };
 
 struct scsi_read_capacity_16
 {
 	uint8_t opcode;
 #define	SRC16_SERVICE_ACTION	0x10
 	uint8_t service_action;
 	uint8_t addr[8];
 	uint8_t alloc_len[4];
 #define	SRC16_PMI		0x01
 #define	SRC16_RELADR		0x02
 	uint8_t reladr;
 	uint8_t control;
 };
 
 struct scsi_read_capacity_data
 {
 	u_int8_t addr[4];
 	u_int8_t length[4];
 };
 
 struct scsi_read_capacity_data_long
 {
 	uint8_t addr[8];
 	uint8_t length[4];
 #define	SRC16_PROT_EN		0x01
 #define	SRC16_P_TYPE		0x0e
 #define	SRC16_PTYPE_1		0x00
 #define	SRC16_PTYPE_2		0x02
 #define	SRC16_PTYPE_3		0x04
 	uint8_t prot;
 #define	SRC16_LBPPBE		0x0f
 #define	SRC16_PI_EXPONENT	0xf0
 #define	SRC16_PI_EXPONENT_SHIFT	4
 	uint8_t prot_lbppbe;
 #define	SRC16_LALBA		0x3f
 #define	SRC16_LBPRZ		0x40
 #define	SRC16_LBPME		0x80
 /*
  * Alternate versions of these macros that are intended for use on a 16-bit
  * version of the lalba_lbp field instead of the array of 2 8 bit numbers.
  */
 #define	SRC16_LALBA_A		0x3fff
 #define	SRC16_LBPRZ_A		0x4000
 #define	SRC16_LBPME_A		0x8000
 	uint8_t lalba_lbp[2];
 	uint8_t	reserved[16];
 };
 
 struct scsi_report_luns
 {
 	uint8_t opcode;
 	uint8_t reserved1;
 #define	RPL_REPORT_DEFAULT	0x00
 #define	RPL_REPORT_WELLKNOWN	0x01
 #define	RPL_REPORT_ALL		0x02
 	uint8_t select_report;
 	uint8_t reserved2[3];
 	uint8_t length[4];
 	uint8_t reserved3;
 	uint8_t control;
 };
 
 struct scsi_report_luns_lundata {
 	uint8_t lundata[8];
 #define	RPL_LUNDATA_PERIPH_BUS_MASK	0x3f
 #define	RPL_LUNDATA_FLAT_LUN_MASK	0x3f
 #define	RPL_LUNDATA_FLAT_LUN_BITS	0x06
 #define	RPL_LUNDATA_LUN_TARG_MASK	0x3f
 #define	RPL_LUNDATA_LUN_BUS_MASK	0xe0
 #define	RPL_LUNDATA_LUN_LUN_MASK	0x1f
 #define	RPL_LUNDATA_EXT_LEN_MASK	0x30
 #define	RPL_LUNDATA_EXT_EAM_MASK	0x0f
 #define	RPL_LUNDATA_EXT_EAM_WK		0x01
 #define	RPL_LUNDATA_EXT_EAM_NOT_SPEC	0x0f
 #define	RPL_LUNDATA_ATYP_MASK	0xc0	/* MBZ for type 0 lun */
 #define	RPL_LUNDATA_ATYP_PERIPH	0x00
 #define	RPL_LUNDATA_ATYP_FLAT	0x40
 #define	RPL_LUNDATA_ATYP_LUN	0x80
 #define	RPL_LUNDATA_ATYP_EXTLUN	0xc0
 };
 
 struct scsi_report_luns_data {
 	u_int8_t length[4];	/* length of LUN inventory, in bytes */
 	u_int8_t reserved[4];	/* unused */
 	/*
 	 * LUN inventory- we only support the type zero form for now.
 	 */
 	struct scsi_report_luns_lundata luns[0];
 };
 
 struct scsi_target_group
 {
 	uint8_t opcode;
 	uint8_t service_action;
 #define	STG_PDF_MASK		0xe0
 #define	STG_PDF_LENGTH		0x00
 #define	STG_PDF_EXTENDED	0x20
 	uint8_t reserved1[4];
 	uint8_t length[4];
 	uint8_t reserved2;
 	uint8_t control;
 };
 
 struct scsi_target_port_descriptor {
 	uint8_t	reserved[2];
 	uint8_t	relative_target_port_identifier[2];
 	uint8_t desc_list[];
 };
 
 struct scsi_target_port_group_descriptor {
 	uint8_t	pref_state;
 #define	TPG_PRIMARY				0x80
 #define	TPG_ASYMMETRIC_ACCESS_STATE_MASK	0xf
 #define	TPG_ASYMMETRIC_ACCESS_OPTIMIZED		0x0
 #define	TPG_ASYMMETRIC_ACCESS_NONOPTIMIZED	0x1
 #define	TPG_ASYMMETRIC_ACCESS_STANDBY		0x2
 #define	TPG_ASYMMETRIC_ACCESS_UNAVAILABLE	0x3
 #define	TPG_ASYMMETRIC_ACCESS_LBA_DEPENDENT	0x4
 #define	TPG_ASYMMETRIC_ACCESS_OFFLINE		0xE
 #define	TPG_ASYMMETRIC_ACCESS_TRANSITIONING	0xF
 	uint8_t support;
 #define	TPG_AO_SUP	0x01
 #define	TPG_AN_SUP	0x02
 #define	TPG_S_SUP	0x04
 #define	TPG_U_SUP	0x08
 #define	TPG_LBD_SUP	0x10
 #define	TPG_O_SUP	0x40
 #define	TPG_T_SUP	0x80
 	uint8_t target_port_group[2];
 	uint8_t reserved;
 	uint8_t status;
 #define TPG_UNAVLBL      0
 #define TPG_SET_BY_STPG  0x01
 #define TPG_IMPLICIT     0x02
 	uint8_t vendor_specific;
 	uint8_t	target_port_count;
 	struct scsi_target_port_descriptor descriptors[];
 };
 
 struct scsi_target_group_data {
 	uint8_t length[4];	/* length of returned data, in bytes */
 	struct scsi_target_port_group_descriptor groups[];
 };
 
 struct scsi_target_group_data_extended {
 	uint8_t length[4];	/* length of returned data, in bytes */
 	uint8_t format_type;	/* STG_PDF_LENGTH or STG_PDF_EXTENDED */
 	uint8_t	implicit_transition_time;
 	uint8_t reserved[2];
 	struct scsi_target_port_group_descriptor groups[];
 };
 
 
 typedef enum {
 	SSD_TYPE_NONE,
 	SSD_TYPE_FIXED,
 	SSD_TYPE_DESC
 } scsi_sense_data_type;
 
 typedef enum {
 	SSD_ELEM_NONE,
 	SSD_ELEM_SKIP,
 	SSD_ELEM_DESC,
 	SSD_ELEM_SKS,
 	SSD_ELEM_COMMAND,
 	SSD_ELEM_INFO,
 	SSD_ELEM_FRU,
 	SSD_ELEM_STREAM,
 	SSD_ELEM_MAX
 } scsi_sense_elem_type;
 
 
 struct scsi_sense_data
 {
 	uint8_t error_code;
 	/*
 	 * SPC-4 says that the maximum length of sense data is 252 bytes.
 	 * So this structure is exactly 252 bytes log.
 	 */
 #define	SSD_FULL_SIZE 252
 	uint8_t sense_buf[SSD_FULL_SIZE - 1];
 	/*
 	 * XXX KDM is this still a reasonable minimum size?
 	 */
 #define	SSD_MIN_SIZE 18
 	/*
 	 * Maximum value for the extra_len field in the sense data.
 	 */
 #define	SSD_EXTRA_MAX 244
 };
 
 /*
  * Fixed format sense data.
  */
 struct scsi_sense_data_fixed
 {
 	/* SSD_ERRCODE masks the error code; SSD_ERRCODE_VALID is the top bit. */
 	u_int8_t error_code;
 #define	SSD_ERRCODE			0x7F
 #define		SSD_CURRENT_ERROR	0x70
 #define		SSD_DEFERRED_ERROR	0x71
 #define	SSD_ERRCODE_VALID	0x80
 	u_int8_t segment;
 	/* Sense key in the low nibble (SSD_KEY) plus the ILI/EOM/FILEMARK bits. */
 	u_int8_t flags;
 #define	SSD_KEY				0x0F
 #define		SSD_KEY_NO_SENSE	0x00
 #define		SSD_KEY_RECOVERED_ERROR	0x01
 #define		SSD_KEY_NOT_READY	0x02
 #define		SSD_KEY_MEDIUM_ERROR	0x03
 #define		SSD_KEY_HARDWARE_ERROR	0x04
 #define		SSD_KEY_ILLEGAL_REQUEST	0x05
 #define		SSD_KEY_UNIT_ATTENTION	0x06
 #define		SSD_KEY_DATA_PROTECT	0x07
 #define		SSD_KEY_BLANK_CHECK	0x08
 #define		SSD_KEY_Vendor_Specific	0x09
 #define		SSD_KEY_COPY_ABORTED	0x0a
 #define		SSD_KEY_ABORTED_COMMAND	0x0b
 #define		SSD_KEY_EQUAL		0x0c
 #define		SSD_KEY_VOLUME_OVERFLOW	0x0d
 #define		SSD_KEY_MISCOMPARE	0x0e
 #define		SSD_KEY_COMPLETED	0x0f
 #define	SSD_ILI		0x20
 #define	SSD_EOM		0x40
 #define	SSD_FILEMARK	0x80
 	u_int8_t info[4];
 	/* Number of sense bytes that follow this field (see SSD_FIXED_IS_FILLED). */
 	u_int8_t extra_len;
 	u_int8_t cmd_spec_info[4];
 	u_int8_t add_sense_code;
 	u_int8_t add_sense_code_qual;
 	u_int8_t fru;
 	u_int8_t sense_key_spec[3];
 #define	SSD_SCS_VALID		0x80
 #define	SSD_FIELDPTR_CMD	0x40
 #define	SSD_BITPTR_VALID	0x08
 #define	SSD_BITPTR_VALUE	0x07
 	u_int8_t extra_bytes[14];
 /*
  * SSD_FIXED_IS_PRESENT(sense, length, field) is 1 when a buffer holding
  * "length" bytes of fixed format sense data is long enough to contain
  * "field" in full, otherwise 0.
  *
  * SSD_FIXED_IS_FILLED(sense, field) is 1 when sense->extra_len reaches
  * at least to the end of "field", otherwise 0.
  *
  * "sense" is a pointer to struct scsi_sense_data_fixed.  The arguments are
  * parenthesized so that expressions such as "&buf" or "c ? a : b" expand
  * with the intended grouping.
  */
 #define	SSD_FIXED_IS_PRESENT(sense, length, field) 			\
 	(((length) >= (offsetof(struct scsi_sense_data_fixed, field) +	\
 	sizeof((sense)->field))) ? 1 :0)
 #define	SSD_FIXED_IS_FILLED(sense, field) 				\
 	((((offsetof(struct scsi_sense_data_fixed, field) +		\
 	sizeof((sense)->field)) -					\
 	(offsetof(struct scsi_sense_data_fixed, extra_len) +		\
 	sizeof((sense)->extra_len))) <= (sense)->extra_len) ? 1 : 0)
 };
 
 /*
  * Descriptor format sense data definitions.
  * Introduced in SPC-3.
  */
 struct scsi_sense_data_desc
 {
 	uint8_t	error_code;
 #define	SSD_DESC_CURRENT_ERROR	0x72
 #define	SSD_DESC_DEFERRED_ERROR	0x73
 	uint8_t sense_key;
 	uint8_t	add_sense_code;
 	uint8_t	add_sense_code_qual;
 	uint8_t	reserved[3];
 	/*
 	 * Note that SPC-4, section 4.5.2.1 says that the extra_len field
 	 * must be less than or equal to 244.
 	 */
 	uint8_t	extra_len;
 	/* Start of the variable-length descriptor area; adds no size. */
 	uint8_t	sense_desc[0];
 /*
  * SSD_DESC_IS_PRESENT(sense, length, field) is 1 when a buffer holding
  * "length" bytes of descriptor format sense data is long enough to
  * contain "field" in full, otherwise 0.  "sense" is a pointer to struct
  * scsi_sense_data_desc.  The arguments are parenthesized so that
  * expressions such as "&buf" or "c ? a : b" expand with the intended
  * grouping.
  */
 #define	SSD_DESC_IS_PRESENT(sense, length, field) 			\
 	(((length) >= (offsetof(struct scsi_sense_data_desc, field) +	\
 	sizeof((sense)->field))) ? 1 :0)
 };
 
 /*
  * Leading two bytes shared by the sense descriptor structures in this
  * file; scsi_desc_iterate() passes a pointer to one to its callback.
  */
 struct scsi_sense_desc_header
 {
 	uint8_t desc_type;	/* descriptor type code (SSD_DESC_*) */
 	uint8_t length;		/* NOTE(review): presumably bytes after this header -- confirm */
 };
 /*
  * The information provided in the Information descriptor is device type or
  * command specific information, and defined in a command standard.
  *
  * Note that any changes to the field names or positions in this structure,
  * even reserved fields, should be accompanied by an examination of the
  * code in ctl_set_sense() that uses them.
  *
  * Maximum descriptors allowed: 1 (as of SPC-4)
  */
 struct scsi_sense_info
 {
 	uint8_t	desc_type;	/* SSD_DESC_INFO */
 #define	SSD_DESC_INFO	0x00
 	uint8_t	length;
 	uint8_t	byte2;		/* flag byte; SSD_INFO_VALID is its top bit */
 #define	SSD_INFO_VALID	0x80
 	uint8_t	reserved;
 	uint8_t	info[8];	/* 8-byte information value */
 };
 
 /*
  * Command-specific information depends on the command for which the
  * reported condition occurred.
  *
  * Note that any changes to the field names or positions in this structure,
  * even reserved fields, should be accompanied by an examination of the
  * code in ctl_set_sense() that uses them.
  *
  * Maximum descriptors allowed: 1 (as of SPC-4)
  */
 struct scsi_sense_command
 {
 	uint8_t	desc_type;	/* SSD_DESC_COMMAND */
 #define	SSD_DESC_COMMAND	0x01
 	uint8_t	length;
 	uint8_t	reserved[2];
 	uint8_t	command_info[8];	/* 8-byte command-specific value */
 };
 
 /*
  * Sense key specific descriptor.  The sense key specific data format
  * depends on the sense key in question.
  *
  * Maximum descriptors allowed: 1 (as of SPC-4)
  */
 struct scsi_sense_sks
 {
 	uint8_t	desc_type;	/* SSD_DESC_SKS */
 #define	SSD_DESC_SKS		0x02
 	uint8_t	length;
 	uint8_t reserved1[2];
 	/*
 	 * Three bytes whose layout depends on the sense key; the
 	 * scsi_sense_sks_* structures in this file are each three bytes.
 	 * SSD_SKS_VALID (0x80) presumably applies to the first byte --
 	 * confirm.
 	 */
 	uint8_t	sense_key_spec[3];
 #define	SSD_SKS_VALID		0x80
 	uint8_t reserved2;
 };
 
 /*
  * This is used for the Illegal Request sense key (0x05) only.
  */
 struct scsi_sense_sks_field
 {
 	/* Flag bits plus a 3-bit value selected by SSD_SKS_BIT_VALUE. */
 	uint8_t	byte0;
 #define	SSD_SKS_FIELD_VALID	0x80
 #define	SSD_SKS_FIELD_CMD	0x40
 #define	SSD_SKS_BPV		0x08
 #define	SSD_SKS_BIT_VALUE	0x07
 	/* NOTE(review): presumably a 2-byte pointer to the offending field -- confirm */
 	uint8_t	field[2];
 };
 
 
 /* 
  * This is used for the Hardware Error (0x04), Medium Error (0x03) and
  * Recovered Error (0x01) sense keys.
  */
 struct scsi_sense_sks_retry
 {
 	uint8_t byte0;			/* SSD_SKS_RETRY_VALID is the top bit */
 #define	SSD_SKS_RETRY_VALID	0x80
 	uint8_t actual_retry_count[2];	/* 2-byte retry count */
 };
 
 /*
  * Used with the NO Sense (0x00) or Not Ready (0x02) sense keys.
  */
 struct scsi_sense_sks_progress
 {
 	uint8_t byte0;		/* SSD_SKS_PROGRESS_VALID is the top bit */
 #define	SSD_SKS_PROGRESS_VALID	0x80
 	/*
 	 * 2-byte progress value.  NOTE(review): presumably a numerator over
 	 * SSD_SKS_PROGRESS_DENOM -- confirm.
 	 */
 	uint8_t progress[2];
 #define	SSD_SKS_PROGRESS_DENOM	0x10000
 };
 
 /*
  * Used with the Copy Aborted (0x0a) sense key.
  */
 struct scsi_sense_sks_segment
 {
 	/* Flag bits plus a 3-bit value selected by SSD_SKS_SEGMENT_BITPTR. */
 	uint8_t byte0;
 #define	SSD_SKS_SEGMENT_VALID	0x80
 #define	SSD_SKS_SEGMENT_SD	0x20
 #define	SSD_SKS_SEGMENT_BPV	0x08
 #define	SSD_SKS_SEGMENT_BITPTR	0x07
 	/* NOTE(review): presumably a 2-byte field pointer -- confirm */
 	uint8_t field[2];
 };
 
 /*
  * Used with the Unit Attention (0x06) sense key.
  *
  * This is currently used to indicate that the unit attention condition
  * queue has overflowed (when the overflow bit is set).
  */
 struct scsi_sense_sks_overflow
 {
 	/* SSD_SKS_OVERFLOW_VALID is the top bit, SSD_SKS_OVERFLOW_SET the lowest. */
 	uint8_t byte0;
 #define	SSD_SKS_OVERFLOW_VALID	0x80
 #define	SSD_SKS_OVERFLOW_SET	0x01
 	uint8_t	reserved[2];
 };
 
 /*
  * This specifies which component is associated with the sense data.  There
  * is no standard meaning for the fru value.
  *
  * Maximum descriptors allowed: 1 (as of SPC-4)
  */
 struct scsi_sense_fru
 {
 	uint8_t	desc_type;	/* SSD_DESC_FRU */
 #define	SSD_DESC_FRU		0x03
 	uint8_t	length;
 	uint8_t reserved;
 	uint8_t fru;		/* single-byte fru value */
 };
 
 /*
  * Used for Stream commands, defined in SSC-4.
  *
  * Maximum descriptors allowed: 1 (as of SPC-4)
  */
  
 struct scsi_sense_stream
 {
 	uint8_t	desc_type;	/* SSD_DESC_STREAM */
 #define	SSD_DESC_STREAM		0x04
 	uint8_t	length;
 	uint8_t	reserved;
 	uint8_t	byte3;		/* carries the FM, EOM and ILI flag bits below */
 #define	SSD_DESC_STREAM_FM	0x80
 #define	SSD_DESC_STREAM_EOM	0x40
 #define	SSD_DESC_STREAM_ILI	0x20
 };
 
 /*
  * Used for Block commands, defined in SBC-3.
  *
  * This is currently (as of SBC-3) only used for the Incorrect Length
  * Indication (ILI) bit, which says that the data length requested in the
  * READ LONG or WRITE LONG command did not match the length of the logical
  * block.
  *
  * Maximum descriptors allowed: 1 (as of SPC-4)
  */
 struct scsi_sense_block
 {
 	uint8_t	desc_type;	/* SSD_DESC_BLOCK */
 #define	SSD_DESC_BLOCK		0x05
 	uint8_t	length;
 	uint8_t	reserved;
 	uint8_t	byte3;		/* carries the SSD_DESC_BLOCK_ILI flag bit */
 #define	SSD_DESC_BLOCK_ILI	0x20
 };
 
 /*
  * Used for Object-Based Storage Devices (OSD-3).
  *
  * Maximum descriptors allowed: 1 (as of SPC-4)
  */
 struct scsi_sense_osd_objid
 {
 	uint8_t	desc_type;	/* SSD_DESC_OSD_OBJID */
 #define	SSD_DESC_OSD_OBJID	0x06
 	uint8_t	length;
 	uint8_t	reserved[6];
 	/*
 	 * XXX KDM provide the bit definitions here?  There are a lot of
 	 * them, and we don't have an OSD driver yet.
 	 */
 	uint8_t	not_init_cmds[4];	/* 4-byte bit field; bits not defined here */
 	uint8_t	completed_cmds[4];	/* 4-byte bit field; bits not defined here */
 	uint8_t	partition_id[8];	/* 8-byte partition identifier */
 	uint8_t	object_id[8];		/* 8-byte object identifier */
 };
 
 /*
  * Used for Object-Based Storage Devices (OSD-3).
  *
  * Maximum descriptors allowed: 1 (as of SPC-4)
  */
 struct scsi_sense_osd_integrity
 {
 	uint8_t	desc_type;	/* SSD_DESC_OSD_INTEGRITY */
 #define	SSD_DESC_OSD_INTEGRITY	0x07
 	uint8_t	length;
 	uint8_t	integ_check_val[32];	/* 32-byte integrity check value */
 };
 
 /*
  * Used for Object-Based Storage Devices (OSD-3).
  *
  * Maximum descriptors allowed: 1 (as of SPC-4)
  */
 struct scsi_sense_osd_attr_id
 {
 	uint8_t	desc_type;	/* SSD_DESC_OSD_ATTR_ID */
 #define	SSD_DESC_OSD_ATTR_ID	0x08
 	uint8_t	length;
 	uint8_t	reserved[2];
 	/* Start of the variable-length attribute data; adds no size. */
 	uint8_t	attr_desc[0];
 };
 
 /*
  * Used with Sense keys No Sense (0x00) and Not Ready (0x02).
  *
  * Maximum descriptors allowed: 32 (as of SPC-4)
  */
 struct scsi_sense_progress
 {
 	uint8_t	desc_type;	/* SSD_DESC_PROGRESS */
 #define	SSD_DESC_PROGRESS	0x0a
 	uint8_t	length;
 	/* This descriptor carries its own sense key / ASC / ASCQ triple. */
 	uint8_t	sense_key;
 	uint8_t	add_sense_code;
 	uint8_t	add_sense_code_qual;
 	uint8_t reserved;
 	uint8_t	progress[2];	/* 2-byte progress value */
 };
 
 /*
  * This is typically forwarded as the result of an EXTENDED COPY command.
  *
  * Maximum descriptors allowed: 2 (as of SPC-4)
  */
 struct scsi_sense_forwarded
 {
 	uint8_t	desc_type;	/* SSD_DESC_FORWARDED */
 #define	SSD_DESC_FORWARDED	0x0c
 	uint8_t	length;
 	/*
 	 * Top bit is SSD_FORWARDED_FSDT; the low nibble
 	 * (SSD_FORWARDED_SDS_MASK) holds one of the SSD_FORWARDED_SDS_*
 	 * values.
 	 */
 	uint8_t	byte2;
 #define	SSD_FORWARDED_FSDT	0x80
 #define	SSD_FORWARDED_SDS_MASK	0x0f
 #define	SSD_FORWARDED_SDS_UNK	0x00
 #define	SSD_FORWARDED_SDS_EXSRC	0x01
 #define	SSD_FORWARDED_SDS_EXDST	0x02
 };
 
 /*
  * Vendor-specific sense descriptor.  The desc_type field will be in the
  * range between MIN and MAX inclusive.
  */
 struct scsi_sense_vendor
 {
 	uint8_t	desc_type;	/* SSD_DESC_VENDOR_MIN .. SSD_DESC_VENDOR_MAX */
 #define	SSD_DESC_VENDOR_MIN	0x80
 #define	SSD_DESC_VENDOR_MAX	0xff
 	uint8_t length;
 	/* Start of the vendor-defined payload; adds no size. */
 	uint8_t	data[0];
 };
 
 /*
  * Four-byte mode parameter header.  find_mode_page_6() skips this header
  * plus blk_desc_len bytes to reach the first mode page.
  */
 struct scsi_mode_header_6
 {
 	u_int8_t data_length;	/* Sense data length */
 	u_int8_t medium_type;
 	u_int8_t dev_spec;
 	u_int8_t blk_desc_len;	/* bytes between this header and the first page */
 };
 
 /*
  * Eight-byte mode parameter header.  find_mode_page_10() skips this
  * header plus the decoded blk_desc_len to reach the first mode page.
  */
 struct scsi_mode_header_10
 {
 	u_int8_t data_length[2];/* Sense data length */
 	u_int8_t medium_type;
 	u_int8_t dev_spec;
 	u_int8_t unused[2];
 	u_int8_t blk_desc_len[2];	/* decoded with scsi_2btoul() */
 };
 
 /* Two-byte header at the start of a mode page. */
 struct scsi_mode_page_header
 {
 	/* SMPH_PC_MASK selects the page code; SMPH_PS and SMPH_SPF are flag bits. */
 	u_int8_t page_code;
 #define	SMPH_PS		0x80
 #define	SMPH_SPF	0x40
 #define	SMPH_PC_MASK	0x3f
 	u_int8_t page_length;
 };
 
 /*
  * Mode page header variant with a subpage byte and a two-byte length.
  * NOTE(review): presumably used when SMPH_SPF is set in page_code --
  * confirm.
  */
 struct scsi_mode_page_header_sp
 {
 	uint8_t page_code;
 	uint8_t subpage;
 	uint8_t page_length[2];
 };
 
 
 /* Eight-byte mode parameter block descriptor. */
 struct scsi_mode_blk_desc
 {
 	u_int8_t density;	/* see SCSI_DEFAULT_DENSITY / SCSI_SAME_DENSITY */
 	u_int8_t nblocks[3];	/* 3-byte block count */
 	u_int8_t reserved;
 	u_int8_t blklen[3];	/* 3-byte block length */
 };
 
 #define	SCSI_DEFAULT_DENSITY	0x00	/* use 'default' density */
 #define	SCSI_SAME_DENSITY	0x7f	/* use 'same' density- >= SCSI-2 only */
 
 
 /*
  * Status Byte
  */
 #define	SCSI_STATUS_OK			0x00
 #define	SCSI_STATUS_CHECK_COND		0x02
 #define	SCSI_STATUS_COND_MET		0x04
 #define	SCSI_STATUS_BUSY		0x08
 #define	SCSI_STATUS_INTERMED		0x10
 #define	SCSI_STATUS_INTERMED_COND_MET	0x14
 #define	SCSI_STATUS_RESERV_CONFLICT	0x18
 #define	SCSI_STATUS_CMD_TERMINATED	0x22	/* Obsolete in SAM-2 */
 #define	SCSI_STATUS_QUEUE_FULL		0x28
 #define	SCSI_STATUS_ACA_ACTIVE		0x30
 #define	SCSI_STATUS_TASK_ABORTED	0x40
 
 /*
  * Inquiry match pattern whose strings are referenced by pointer; embedded
  * in the quirk entry structures in this file.
  */
 struct scsi_inquiry_pattern {
 	u_int8_t   type;
 	u_int8_t   media_type;	/* SIP_MEDIA_* bits */
 #define	SIP_MEDIA_REMOVABLE	0x01
 #define	SIP_MEDIA_FIXED		0x02
 	const char *vendor;
 	const char *product;
 	const char *revision;
 }; 
 
 /*
  * Same fields as scsi_inquiry_pattern, but with the strings stored inline.
  * NOTE(review): the +1 on each array is presumably room for a NUL
  * terminator -- confirm.
  */
 struct scsi_static_inquiry_pattern {
 	u_int8_t   type;
 	u_int8_t   media_type;
 	char       vendor[SID_VENDOR_SIZE+1];
 	char       product[SID_PRODUCT_SIZE+1];
 	char       revision[SID_REVISION_SIZE+1];
 };
 
 /*
  * Pairs an inquiry pattern with sense key and ASC tables.
  * NOTE(review): the two counts presumably give the lengths of the
  * respective tables -- confirm.
  */
 struct scsi_sense_quirk_entry {
 	struct scsi_inquiry_pattern	inq_pat;
 	int				num_sense_keys;
 	int				num_ascs;
 	struct sense_key_table_entry	*sense_key_info;
 	struct asc_table_entry		*asc_info;
 };
 
 /* One table row: a sense key with its action value and description text. */
 struct sense_key_table_entry {
 	u_int8_t    sense_key;
 	u_int32_t   action;	/* NOTE(review): encoding not shown here -- confirm */
 	const char *desc;
 };
 
 /* One table row: an ASC/ASCQ pair with its action value and description text. */
 struct asc_table_entry {
 	u_int8_t    asc;
 	u_int8_t    ascq;
 	u_int32_t   action;	/* NOTE(review): encoding not shown here -- confirm */
 	const char *desc;
 };
 
 /* One table row: an opcode with a mask value and description text. */
 struct op_table_entry {
 	u_int8_t    opcode;
 	u_int32_t   opmask;	/* NOTE(review): meaning of the mask bits not shown here -- confirm */
 	const char  *desc;
 };
 
 /*
  * Pairs an inquiry pattern with an opcode table.
  * NOTE(review): num_ops presumably gives the length of op_table -- confirm.
  */
 struct scsi_op_quirk_entry {
 	struct scsi_inquiry_pattern	inq_pat;
 	int				num_ops;
 	struct op_table_entry		*op_table;
 };
 
 /* Flag bits accepted by scsi_sense_sbuf() through its flags argument. */
 typedef enum {
 	SSS_FLAG_NONE		= 0x00,
 	SSS_FLAG_PRINT_COMMAND	= 0x01
 } scsi_sense_string_flags;
 
 /*
  * Name/value pair.  Tables of these are passed to scsi_nv_to_str()
  * together with a value, and to scsi_get_nv() together with a name.
  */
 struct scsi_nv {
 	const char *name;
 	uint64_t value;
 };
 
 /* Result type of scsi_get_nv(). */
 typedef enum {
 	SCSI_NV_FOUND		= 0,
 	SCSI_NV_AMBIGUOUS	= 1,
 	SCSI_NV_NOT_FOUND	= 2
 } scsi_nv_status;
 
 /* Flag bits accepted by scsi_get_nv() through its flags argument. */
 typedef enum {
 	SCSI_NV_FLAG_NONE	= 0x00,
 	SCSI_NV_FLAG_IG_CASE	= 0x01	/* Case insensitive comparison */
 } scsi_nv_flags;
 
 struct ccb_scsiio;
 struct cam_periph;
 union  ccb;
 #ifndef _KERNEL
 struct cam_device;
 #endif
 
 extern const char *scsi_sense_key_text[];
 
 struct sbuf;
 
 __BEGIN_DECLS
 void scsi_sense_desc(int sense_key, int asc, int ascq,
 		     struct scsi_inquiry_data *inq_data,
 		     const char **sense_key_desc, const char **asc_desc);
 scsi_sense_action scsi_error_action(struct ccb_scsiio* csio,
 				    struct scsi_inquiry_data *inq_data,
 				    u_int32_t sense_flags);
 const char *	scsi_status_string(struct ccb_scsiio *csio);
 
 void scsi_desc_iterate(struct scsi_sense_data_desc *sense, u_int sense_len,
 		       int (*iter_func)(struct scsi_sense_data_desc *sense,
 					u_int, struct scsi_sense_desc_header *,
 					void *), void *arg);
 uint8_t *scsi_find_desc(struct scsi_sense_data_desc *sense, u_int sense_len,
 			uint8_t desc_type);
 void scsi_set_sense_data(struct scsi_sense_data *sense_data, 
 			 scsi_sense_data_type sense_format, int current_error,
 			 int sense_key, int asc, int ascq, ...) ;
 void scsi_set_sense_data_va(struct scsi_sense_data *sense_data,
 			    scsi_sense_data_type sense_format,
 			    int current_error, int sense_key, int asc,
 			    int ascq, va_list ap);
 int scsi_get_sense_info(struct scsi_sense_data *sense_data, u_int sense_len,
 			uint8_t info_type, uint64_t *info,
 			int64_t *signed_info);
 int scsi_get_sks(struct scsi_sense_data *sense_data, u_int sense_len,
 		 uint8_t *sks);
 int scsi_get_block_info(struct scsi_sense_data *sense_data, u_int sense_len,
 			struct scsi_inquiry_data *inq_data,
 			uint8_t *block_bits);
 int scsi_get_stream_info(struct scsi_sense_data *sense_data, u_int sense_len,
 			 struct scsi_inquiry_data *inq_data,
 			 uint8_t *stream_bits);
 void scsi_info_sbuf(struct sbuf *sb, uint8_t *cdb, int cdb_len,
 		    struct scsi_inquiry_data *inq_data, uint64_t info);
 void scsi_command_sbuf(struct sbuf *sb, uint8_t *cdb, int cdb_len,
 		       struct scsi_inquiry_data *inq_data, uint64_t csi);
 void scsi_progress_sbuf(struct sbuf *sb, uint16_t progress);
 int scsi_sks_sbuf(struct sbuf *sb, int sense_key, uint8_t *sks);
 void scsi_fru_sbuf(struct sbuf *sb, uint64_t fru);
 void scsi_stream_sbuf(struct sbuf *sb, uint8_t stream_bits, uint64_t info);
 void scsi_block_sbuf(struct sbuf *sb, uint8_t block_bits, uint64_t info);
 void scsi_sense_info_sbuf(struct sbuf *sb, struct scsi_sense_data *sense,
 			  u_int sense_len, uint8_t *cdb, int cdb_len,
 			  struct scsi_inquiry_data *inq_data,
 			  struct scsi_sense_desc_header *header);
 
 void scsi_sense_command_sbuf(struct sbuf *sb, struct scsi_sense_data *sense,
 			     u_int sense_len, uint8_t *cdb, int cdb_len,
 			     struct scsi_inquiry_data *inq_data,
 			     struct scsi_sense_desc_header *header);
 void scsi_sense_sks_sbuf(struct sbuf *sb, struct scsi_sense_data *sense,
 			 u_int sense_len, uint8_t *cdb, int cdb_len,
 			 struct scsi_inquiry_data *inq_data,
 			 struct scsi_sense_desc_header *header);
 void scsi_sense_fru_sbuf(struct sbuf *sb, struct scsi_sense_data *sense,
 			 u_int sense_len, uint8_t *cdb, int cdb_len,
 			 struct scsi_inquiry_data *inq_data,
 			 struct scsi_sense_desc_header *header);
 void scsi_sense_stream_sbuf(struct sbuf *sb, struct scsi_sense_data *sense,
 			    u_int sense_len, uint8_t *cdb, int cdb_len,
 			    struct scsi_inquiry_data *inq_data,
 			    struct scsi_sense_desc_header *header);
 void scsi_sense_block_sbuf(struct sbuf *sb, struct scsi_sense_data *sense,
 			   u_int sense_len, uint8_t *cdb, int cdb_len,
 			   struct scsi_inquiry_data *inq_data,
 			   struct scsi_sense_desc_header *header);
 void scsi_sense_progress_sbuf(struct sbuf *sb, struct scsi_sense_data *sense,
 			      u_int sense_len, uint8_t *cdb, int cdb_len,
 			      struct scsi_inquiry_data *inq_data,
 			      struct scsi_sense_desc_header *header);
 void scsi_sense_generic_sbuf(struct sbuf *sb, struct scsi_sense_data *sense,
 			     u_int sense_len, uint8_t *cdb, int cdb_len,
 			     struct scsi_inquiry_data *inq_data,
 			     struct scsi_sense_desc_header *header);
 void scsi_sense_desc_sbuf(struct sbuf *sb, struct scsi_sense_data *sense,
 			  u_int sense_len, uint8_t *cdb, int cdb_len,
 			  struct scsi_inquiry_data *inq_data,
 			  struct scsi_sense_desc_header *header);
 scsi_sense_data_type scsi_sense_type(struct scsi_sense_data *sense_data);
 
 void scsi_sense_only_sbuf(struct scsi_sense_data *sense, u_int sense_len,
 			  struct sbuf *sb, char *path_str,
 			  struct scsi_inquiry_data *inq_data, uint8_t *cdb,
 			  int cdb_len);
 
 #ifdef _KERNEL
 int		scsi_command_string(struct ccb_scsiio *csio, struct sbuf *sb);
 int		scsi_sense_sbuf(struct ccb_scsiio *csio, struct sbuf *sb,
 				scsi_sense_string_flags flags);
 char *		scsi_sense_string(struct ccb_scsiio *csio,
 				  char *str, int str_len);
 void		scsi_sense_print(struct ccb_scsiio *csio);
 int 		scsi_vpd_supported_page(struct cam_periph *periph,
 					uint8_t page_id);
 #else /* _KERNEL */
 int		scsi_command_string(struct cam_device *device,
 				    struct ccb_scsiio *csio, struct sbuf *sb);
 int		scsi_sense_sbuf(struct cam_device *device, 
 				struct ccb_scsiio *csio, struct sbuf *sb,
 				scsi_sense_string_flags flags);
 char *		scsi_sense_string(struct cam_device *device, 
 				  struct ccb_scsiio *csio,
 				  char *str, int str_len);
 void		scsi_sense_print(struct cam_device *device, 
 				 struct ccb_scsiio *csio, FILE *ofile);
 #endif /* _KERNEL */
 
 const char *	scsi_op_desc(u_int16_t opcode, 
 			     struct scsi_inquiry_data *inq_data);
 char *		scsi_cdb_string(u_int8_t *cdb_ptr, char *cdb_string,
 				size_t len);
 
 void		scsi_print_inquiry(struct scsi_inquiry_data *inq_data);
 void		scsi_print_inquiry_short(struct scsi_inquiry_data *inq_data);
 
 u_int		scsi_calc_syncsrate(u_int period_factor);
 u_int		scsi_calc_syncparam(u_int period);
 
 typedef int	(*scsi_devid_checkfn_t)(uint8_t *);
 int		scsi_devid_is_naa_ieee_reg(uint8_t *bufp);
 int		scsi_devid_is_sas_target(uint8_t *bufp);
 int		scsi_devid_is_lun_eui64(uint8_t *bufp);
 int		scsi_devid_is_lun_naa(uint8_t *bufp);
 int		scsi_devid_is_lun_name(uint8_t *bufp);
 int		scsi_devid_is_lun_t10(uint8_t *bufp);
 struct scsi_vpd_id_descriptor *
 		scsi_get_devid(struct scsi_vpd_device_id *id, uint32_t len,
 			       scsi_devid_checkfn_t ck_fn);
 struct scsi_vpd_id_descriptor *
 		scsi_get_devid_desc(struct scsi_vpd_id_descriptor *desc, uint32_t len,
 			       scsi_devid_checkfn_t ck_fn);
 
 int		scsi_transportid_sbuf(struct sbuf *sb,
 				      struct scsi_transportid_header *hdr,
 				      uint32_t valid_len);
 
 const char *	scsi_nv_to_str(struct scsi_nv *table, int num_table_entries,
 			       uint64_t value);
 
 scsi_nv_status	scsi_get_nv(struct scsi_nv *table, int num_table_entries,
 			    char *name, int *table_entry, scsi_nv_flags flags);
 
 int	scsi_parse_transportid_64bit(int proto_id, char *id_str,
 				     struct scsi_transportid_header **hdr,
 				     unsigned int *alloc_len,
 #ifdef _KERNEL
 				     struct malloc_type *type, int flags,
 #endif
 				     char *error_str, int error_str_len);
 
 int	scsi_parse_transportid_spi(char *id_str,
 				   struct scsi_transportid_header **hdr,
 				   unsigned int *alloc_len,
 #ifdef _KERNEL
 				   struct malloc_type *type, int flags,
 #endif
 				   char *error_str, int error_str_len);
 
 int	scsi_parse_transportid_rdma(char *id_str,
 				    struct scsi_transportid_header **hdr,
 				    unsigned int *alloc_len,
 #ifdef _KERNEL
 				    struct malloc_type *type, int flags,
 #endif
 				    char *error_str, int error_str_len);
 
 int	scsi_parse_transportid_iscsi(char *id_str,
 				     struct scsi_transportid_header **hdr,
 				     unsigned int *alloc_len,
 #ifdef _KERNEL
 				     struct malloc_type *type, int flags,
 #endif
 				     char *error_str,int error_str_len);
 
 int	scsi_parse_transportid_sop(char *id_str,
 				   struct scsi_transportid_header **hdr,
 				   unsigned int *alloc_len,
 #ifdef _KERNEL
 				   struct malloc_type *type, int flags,
 #endif
 				   char *error_str,int error_str_len);
 
 int	scsi_parse_transportid(char *transportid_str,
 			       struct scsi_transportid_header **hdr,
 			       unsigned int *alloc_len,
 #ifdef _KERNEL
 			       struct malloc_type *type, int flags,
 #endif
 			       char *error_str, int error_str_len);
 
 void		scsi_test_unit_ready(struct ccb_scsiio *csio, u_int32_t retries,
 				     void (*cbfcnp)(struct cam_periph *, 
 						    union ccb *),
 				     u_int8_t tag_action, 
 				     u_int8_t sense_len, u_int32_t timeout);
 
 void		scsi_request_sense(struct ccb_scsiio *csio, u_int32_t retries,
 				   void (*cbfcnp)(struct cam_periph *, 
 						  union ccb *),
 				   void *data_ptr, u_int8_t dxfer_len,
 				   u_int8_t tag_action, u_int8_t sense_len,
 				   u_int32_t timeout);
 
 void		scsi_inquiry(struct ccb_scsiio *csio, u_int32_t retries,
 			     void (*cbfcnp)(struct cam_periph *, union ccb *),
 			     u_int8_t tag_action, u_int8_t *inq_buf, 
 			     u_int32_t inq_len, int evpd, u_int8_t page_code,
 			     u_int8_t sense_len, u_int32_t timeout);
 
 void		scsi_mode_sense(struct ccb_scsiio *csio, u_int32_t retries,
 				void (*cbfcnp)(struct cam_periph *,
 					       union ccb *),
 				u_int8_t tag_action, int dbd,
 				u_int8_t page_code, u_int8_t page,
 				u_int8_t *param_buf, u_int32_t param_len,
 				u_int8_t sense_len, u_int32_t timeout);
 
 void		scsi_mode_sense_len(struct ccb_scsiio *csio, u_int32_t retries,
 				    void (*cbfcnp)(struct cam_periph *,
 						   union ccb *),
 				    u_int8_t tag_action, int dbd,
 				    u_int8_t page_code, u_int8_t page,
 				    u_int8_t *param_buf, u_int32_t param_len,
 				    int minimum_cmd_size, u_int8_t sense_len,
 				    u_int32_t timeout);
 
 void		scsi_mode_select(struct ccb_scsiio *csio, u_int32_t retries,
 				 void (*cbfcnp)(struct cam_periph *,
 						union ccb *),
 				 u_int8_t tag_action, int scsi_page_fmt,
 				 int save_pages, u_int8_t *param_buf,
 				 u_int32_t param_len, u_int8_t sense_len,
 				 u_int32_t timeout);
 
 void		scsi_mode_select_len(struct ccb_scsiio *csio, u_int32_t retries,
 				     void (*cbfcnp)(struct cam_periph *,
 						    union ccb *),
 				     u_int8_t tag_action, int scsi_page_fmt,
 				     int save_pages, u_int8_t *param_buf,
 				     u_int32_t param_len, int minimum_cmd_size,
 				     u_int8_t sense_len, u_int32_t timeout);
 
 void		scsi_log_sense(struct ccb_scsiio *csio, u_int32_t retries,
 			       void (*cbfcnp)(struct cam_periph *, union ccb *),
 			       u_int8_t tag_action, u_int8_t page_code,
 			       u_int8_t page, int save_pages, int ppc,
 			       u_int32_t paramptr, u_int8_t *param_buf,
 			       u_int32_t param_len, u_int8_t sense_len,
 			       u_int32_t timeout);
 
 void		scsi_log_select(struct ccb_scsiio *csio, u_int32_t retries,
 				void (*cbfcnp)(struct cam_periph *,
 				union ccb *), u_int8_t tag_action,
 				u_int8_t page_code, int save_pages,
 				int pc_reset, u_int8_t *param_buf,
 				u_int32_t param_len, u_int8_t sense_len,
 				u_int32_t timeout);
 
 void		scsi_prevent(struct ccb_scsiio *csio, u_int32_t retries,
 			     void (*cbfcnp)(struct cam_periph *, union ccb *),
 			     u_int8_t tag_action, u_int8_t action,
 			     u_int8_t sense_len, u_int32_t timeout);
 
 void		scsi_read_capacity(struct ccb_scsiio *csio, u_int32_t retries,
 				   void (*cbfcnp)(struct cam_periph *, 
 				   union ccb *), u_int8_t tag_action, 
 				   struct scsi_read_capacity_data *,
 				   u_int8_t sense_len, u_int32_t timeout);
 void		scsi_read_capacity_16(struct ccb_scsiio *csio, uint32_t retries,
 				      void (*cbfcnp)(struct cam_periph *,
 				      union ccb *), uint8_t tag_action,
 				      uint64_t lba, int reladr, int pmi,
 				      uint8_t *rcap_buf, int rcap_buf_len,
 				      uint8_t sense_len, uint32_t timeout);
 
 void		scsi_report_luns(struct ccb_scsiio *csio, u_int32_t retries,
 				 void (*cbfcnp)(struct cam_periph *, 
 				 union ccb *), u_int8_t tag_action, 
 				 u_int8_t select_report,
 				 struct scsi_report_luns_data *rpl_buf,
 				 u_int32_t alloc_len, u_int8_t sense_len,
 				 u_int32_t timeout);
 
 void		scsi_report_target_group(struct ccb_scsiio *csio, u_int32_t retries,
 				 void (*cbfcnp)(struct cam_periph *, 
 				 union ccb *), u_int8_t tag_action, 
 				 u_int8_t pdf,
 				 void *buf,
 				 u_int32_t alloc_len, u_int8_t sense_len,
 				 u_int32_t timeout);
 
 void		scsi_set_target_group(struct ccb_scsiio *csio, u_int32_t retries,
 				 void (*cbfcnp)(struct cam_periph *, 
 				 union ccb *), u_int8_t tag_action, void *buf,
 				 u_int32_t alloc_len, u_int8_t sense_len,
 				 u_int32_t timeout);
 
 void		scsi_synchronize_cache(struct ccb_scsiio *csio, 
 				       u_int32_t retries,
 				       void (*cbfcnp)(struct cam_periph *, 
 				       union ccb *), u_int8_t tag_action, 
 				       u_int32_t begin_lba, u_int16_t lb_count,
 				       u_int8_t sense_len, u_int32_t timeout);
 
 void scsi_receive_diagnostic_results(struct ccb_scsiio *csio, u_int32_t retries,
 				     void (*cbfcnp)(struct cam_periph *,
 						    union ccb*),
 				     uint8_t tag_action, int pcv,
 				     uint8_t page_code, uint8_t *data_ptr,
 				     uint16_t allocation_length,
 				     uint8_t sense_len, uint32_t timeout);
 
 void scsi_send_diagnostic(struct ccb_scsiio *csio, u_int32_t retries,
 			  void (*cbfcnp)(struct cam_periph *, union ccb *),
 			  uint8_t tag_action, int unit_offline,
 			  int device_offline, int self_test, int page_format,
 			  int self_test_code, uint8_t *data_ptr,
 			  uint16_t param_list_length, uint8_t sense_len,
 			  uint32_t timeout);
 
 void scsi_read_buffer(struct ccb_scsiio *csio, u_int32_t retries,
 			void (*cbfcnp)(struct cam_periph *, union ccb*),
 			uint8_t tag_action, int mode,
 			uint8_t buffer_id, u_int32_t offset,
 			uint8_t *data_ptr, uint32_t allocation_length,
 			uint8_t sense_len, uint32_t timeout);
 
 void scsi_write_buffer(struct ccb_scsiio *csio, u_int32_t retries,
 			void (*cbfcnp)(struct cam_periph *, union ccb *),
 			uint8_t tag_action, int mode,
 			uint8_t buffer_id, u_int32_t offset,
 			uint8_t *data_ptr, uint32_t param_list_length,
 			uint8_t sense_len, uint32_t timeout);
 
 #define	SCSI_RW_READ	0x0001
 #define	SCSI_RW_WRITE	0x0002
 #define	SCSI_RW_DIRMASK	0x0003
 #define	SCSI_RW_BIO	0x1000
 void scsi_read_write(struct ccb_scsiio *csio, u_int32_t retries,
 		     void (*cbfcnp)(struct cam_periph *, union ccb *),
 		     u_int8_t tag_action, int readop, u_int8_t byte2, 
 		     int minimum_cmd_size, u_int64_t lba,
 		     u_int32_t block_count, u_int8_t *data_ptr,
 		     u_int32_t dxfer_len, u_int8_t sense_len,
 		     u_int32_t timeout);
 
 void scsi_write_same(struct ccb_scsiio *csio, u_int32_t retries,
 		     void (*cbfcnp)(struct cam_periph *, union ccb *),
 		     u_int8_t tag_action, u_int8_t byte2, 
 		     int minimum_cmd_size, u_int64_t lba,
 		     u_int32_t block_count, u_int8_t *data_ptr,
 		     u_int32_t dxfer_len, u_int8_t sense_len,
 		     u_int32_t timeout);
 
 void scsi_ata_identify(struct ccb_scsiio *csio, u_int32_t retries,
 		       void (*cbfcnp)(struct cam_periph *, union ccb *),
 		       u_int8_t tag_action, u_int8_t *data_ptr,
 		       u_int16_t dxfer_len, u_int8_t sense_len,
 		       u_int32_t timeout);
 
 void scsi_ata_trim(struct ccb_scsiio *csio, u_int32_t retries,
 	           void (*cbfcnp)(struct cam_periph *, union ccb *),
 	           u_int8_t tag_action, u_int16_t block_count,
 	           u_int8_t *data_ptr, u_int16_t dxfer_len,
 	           u_int8_t sense_len, u_int32_t timeout);
 
 void scsi_ata_pass_16(struct ccb_scsiio *csio, u_int32_t retries,
 		      void (*cbfcnp)(struct cam_periph *, union ccb *),
 		      u_int32_t flags, u_int8_t tag_action,
 		      u_int8_t protocol, u_int8_t ata_flags, u_int16_t features,
 		      u_int16_t sector_count, uint64_t lba, u_int8_t command,
 		      u_int8_t control, u_int8_t *data_ptr, u_int16_t dxfer_len,
 		      u_int8_t sense_len, u_int32_t timeout);
 
 void scsi_unmap(struct ccb_scsiio *csio, u_int32_t retries,
 		void (*cbfcnp)(struct cam_periph *, union ccb *),
 		u_int8_t tag_action, u_int8_t byte2,
 		u_int8_t *data_ptr, u_int16_t dxfer_len,
 		u_int8_t sense_len, u_int32_t timeout);
 
 void scsi_start_stop(struct ccb_scsiio *csio, u_int32_t retries,
 		     void (*cbfcnp)(struct cam_periph *, union ccb *),
 		     u_int8_t tag_action, int start, int load_eject,
 		     int immediate, u_int8_t sense_len, u_int32_t timeout);
 
 void scsi_persistent_reserve_in(struct ccb_scsiio *csio, uint32_t retries, 
 				void (*cbfcnp)(struct cam_periph *,union ccb *),
 				uint8_t tag_action, int service_action,
 				uint8_t *data_ptr, uint32_t dxfer_len,
 				int sense_len, int timeout);
 
 void scsi_persistent_reserve_out(struct ccb_scsiio *csio, uint32_t retries, 
 				 void (*cbfcnp)(struct cam_periph *,
 				       union ccb *),
 				 uint8_t tag_action, int service_action,
 				 int scope, int res_type, uint8_t *data_ptr,
 				 uint32_t dxfer_len, int sense_len,
 				 int timeout);
 
 int		scsi_inquiry_match(caddr_t inqbuffer, caddr_t table_entry);
 int		scsi_static_inquiry_match(caddr_t inqbuffer,
 					  caddr_t table_entry);
 int		scsi_devid_match(uint8_t *rhs, size_t rhs_len,
 				 uint8_t *lhs, size_t lhs_len);
 
 void scsi_extract_sense(struct scsi_sense_data *sense, int *error_code,
 			int *sense_key, int *asc, int *ascq);
 int scsi_extract_sense_ccb(union ccb *ccb, int *error_code, int *sense_key,
 			   int *asc, int *ascq);
 void scsi_extract_sense_len(struct scsi_sense_data *sense,
 			    u_int sense_len, int *error_code, int *sense_key,
 			    int *asc, int *ascq, int show_errors);
 int scsi_get_sense_key(struct scsi_sense_data *sense, u_int sense_len,
 		       int show_errors);
 int scsi_get_asc(struct scsi_sense_data *sense, u_int sense_len,
 		 int show_errors);
 int scsi_get_ascq(struct scsi_sense_data *sense, u_int sense_len,
 		  int show_errors);
 static __inline void scsi_ulto2b(u_int32_t val, u_int8_t *bytes);
 static __inline void scsi_ulto3b(u_int32_t val, u_int8_t *bytes);
 static __inline void scsi_ulto4b(u_int32_t val, u_int8_t *bytes);
 static __inline void scsi_u64to8b(u_int64_t val, u_int8_t *bytes);
 static __inline uint32_t scsi_2btoul(const uint8_t *bytes);
 static __inline uint32_t scsi_3btoul(const uint8_t *bytes);
 static __inline int32_t scsi_3btol(const uint8_t *bytes);
 static __inline uint32_t scsi_4btoul(const uint8_t *bytes);
 static __inline uint64_t scsi_8btou64(const uint8_t *bytes);
 static __inline void *find_mode_page_6(struct scsi_mode_header_6 *mode_header);
 static __inline void *find_mode_page_10(struct scsi_mode_header_10 *mode_header);
 
 /*
  * Store the low 16 bits of val into bytes[0..1], most significant byte
  * first.  Higher bits of val are ignored.
  */
 static __inline void
 scsi_ulto2b(u_int32_t val, u_int8_t *bytes)
 {
 	u_int8_t hi, lo;
 
 	hi = (val >> 8) & 0xff;
 	lo = val & 0xff;
 	bytes[0] = hi;
 	bytes[1] = lo;
 }
 
 /*
  * Store the low 24 bits of val into bytes[0..2], most significant byte
  * first.  Higher bits of val are ignored.
  */
 static __inline void
 scsi_ulto3b(u_int32_t val, u_int8_t *bytes)
 {
 	int idx;
 
 	/* Fill from the last byte backwards, peeling off 8 bits at a time. */
 	for (idx = 2; idx >= 0; idx--) {
 		bytes[idx] = val & 0xff;
 		val >>= 8;
 	}
 }
 
 /* Store val into bytes[0..3], most significant byte first. */
 static __inline void
 scsi_ulto4b(u_int32_t val, u_int8_t *bytes)
 {
 	u_int8_t *out = bytes;
 
 	*out++ = (val >> 24) & 0xff;
 	*out++ = (val >> 16) & 0xff;
 	*out++ = (val >> 8) & 0xff;
 	*out = val & 0xff;
 }
 
 /* Store val into bytes[0..7], most significant byte first. */
 static __inline void
 scsi_u64to8b(u_int64_t val, u_int8_t *bytes)
 {
 	int pos, shift;
 
 	/* bytes[0] takes bits 63..56, bytes[7] takes bits 7..0. */
 	for (pos = 0, shift = 56; pos < 8; pos++, shift -= 8)
 		bytes[pos] = (val >> shift) & 0xff;
 }
 
 /* Decode the 2-byte big-endian value at bytes[0..1]. */
 static __inline uint32_t
 scsi_2btoul(const uint8_t *bytes)
 {
 	const uint32_t high = bytes[0];
 	const uint32_t low = bytes[1];
 
 	return ((high << 8) | low);
 }
 
 /* Decode the 3-byte big-endian value at bytes[0..2]. */
 static __inline uint32_t
 scsi_3btoul(const uint8_t *bytes)
 {
 	uint32_t acc = 0;
 	int idx;
 
 	/* Shift in one byte at a time, most significant first. */
 	for (idx = 0; idx < 3; idx++)
 		acc = (acc << 8) | bytes[idx];
 	return (acc);
 }
 
 /*
  * Decode the 3-byte big-endian value at bytes[0..2] as a signed 24-bit
  * quantity, sign-extended to 32 bits.
  */
 static __inline int32_t
 scsi_3btol(const uint8_t *bytes)
 {
 	uint32_t raw;
 
 	raw = scsi_3btoul(bytes);
 	/* Bit 23 is the sign bit of the 24-bit value: extend it through bit 31. */
 	if ((raw & 0x00800000) != 0)
 		raw |= 0xff000000;
 	return ((int32_t)raw);
 }
 
 /*
  * Decode the 4-byte big-endian value at bytes[0..3].
  *
  * Each byte is widened to uint32_t before shifting.  Without that, the
  * uint8_t operand is promoted to int, and shifting a value >= 0x80 left
  * by 24 moves a bit into the sign position, which is undefined in C.
  */
 static __inline uint32_t
 scsi_4btoul(const uint8_t *bytes)
 {
 	uint32_t rv;
 
 	rv = ((uint32_t)bytes[0] << 24) |
 	     ((uint32_t)bytes[1] << 16) |
 	     ((uint32_t)bytes[2] << 8) |
 	     (uint32_t)bytes[3];
 	return (rv);
 }
 
 /* Decode the 8-byte big-endian value at bytes[0..7]. */
 static __inline uint64_t
 scsi_8btou64(const uint8_t *bytes)
 {
 	uint64_t acc = 0;
 	int idx;
 
 	/* Shift in one byte at a time, most significant first. */
 	for (idx = 0; idx < 8; idx++)
 		acc = (acc << 8) | bytes[idx];
 	return (acc);
 }
 
 /*
  * Given the pointer to a returned mode sense buffer, return a pointer to
  * the start of the first mode page.
  */
 static __inline void *
 find_mode_page_6(struct scsi_mode_header_6 *mode_header)
 {
 	u_int8_t *after_header;
 
 	/* Step over the header itself, then over blk_desc_len more bytes. */
 	after_header = (u_int8_t *)(mode_header + 1);
 	return ((void *)(after_header + mode_header->blk_desc_len));
 }
 
 /*
  * Given the pointer to a returned mode sense buffer with a
  * scsi_mode_header_10 header, return a pointer to the start of the first
  * mode page.
  */
 static __inline void *
 find_mode_page_10(struct scsi_mode_header_10 *mode_header)
 {
 	u_int8_t *after_header;
 
 	/* blk_desc_len is a 2-byte field here and must be decoded first. */
 	after_header = (u_int8_t *)(mode_header + 1);
 	return ((void *)(after_header +
 	    scsi_2btoul(mode_header->blk_desc_len)));
 }
 
 __END_DECLS
 
 #endif /*_SCSI_SCSI_ALL_H*/
Index: user/ae/inet6/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
===================================================================
--- user/ae/inet6/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c	(revision 271452)
+++ user/ae/inet6/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c	(revision 271453)
@@ -1,3491 +1,3494 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/fm/fs/zfs.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/dmu.h>
 #include <sys/dmu_tx.h>
 #include <sys/vdev_impl.h>
 #include <sys/uberblock_impl.h>
 #include <sys/metaslab.h>
 #include <sys/metaslab_impl.h>
 #include <sys/space_map.h>
 #include <sys/space_reftree.h>
 #include <sys/zio.h>
 #include <sys/zap.h>
 #include <sys/fs/zfs.h>
 #include <sys/arc.h>
 #include <sys/zil.h>
 #include <sys/dsl_scan.h>
 #include <sys/trim_map.h>
 
 SYSCTL_DECL(_vfs_zfs);
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV");
 
 /*
  * Virtual device management.
  */
 
 /*
  * The limit for ZFS to automatically increase a top-level vdev's ashift
  * from logical ashift to physical ashift.
  *
  * Example: one or more 512B emulation child vdevs
  *          child->vdev_ashift = 9 (512 bytes)
  *          child->vdev_physical_ashift = 12 (4096 bytes)
  *          zfs_max_auto_ashift = 11 (2048 bytes)
  *          zfs_min_auto_ashift = 9 (512 bytes)
  *
  * On pool creation or the addition of a new top-level vdev, ZFS will
  * increase the ashift of the top-level vdev to 11 (2048 bytes), as
  * limited by zfs_max_auto_ashift.
  *
  * Example: one or more 512B emulation child vdevs
  *          child->vdev_ashift = 9 (512 bytes)
  *          child->vdev_physical_ashift = 12 (4096 bytes)
  *          zfs_max_auto_ashift = 13 (8192 bytes)
  *          zfs_min_auto_ashift = 9 (512 bytes)
  *
  * On pool creation or the addition of a new top-level vdev, ZFS will
  * increase the ashift of the top-level vdev to 12 (4096 bytes) to match
  * the max vdev_physical_ashift.
  *
  * Example: one or more 512B emulation child vdevs
  *          child->vdev_ashift = 9 (512 bytes)
  *          child->vdev_physical_ashift = 9 (512 bytes)
  *          zfs_max_auto_ashift = 13 (8192 bytes)
  *          zfs_min_auto_ashift = 12 (4096 bytes)
  *
  * On pool creation or the addition of a new top-level vdev, ZFS will
  * increase the ashift of the top-level vdev to 12 (4096 bytes) to match
  * zfs_min_auto_ashift.
  */
 static uint64_t zfs_max_auto_ashift = SPA_MAXASHIFT;
 static uint64_t zfs_min_auto_ashift = SPA_MINASHIFT;
 
 static int
 sysctl_vfs_zfs_max_auto_ashift(SYSCTL_HANDLER_ARGS)
 {
 	uint64_t val;
 	int err;
 
 	val = zfs_max_auto_ashift;
 	err = sysctl_handle_64(oidp, &val, 0, req);
 	if (err != 0 || req->newptr == NULL)
 		return (err);
 
 	if (val > SPA_MAXASHIFT || val < zfs_min_auto_ashift)
 		return (EINVAL);
 
 	zfs_max_auto_ashift = val;
 
 	return (0);
 }
 SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift,
     CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
     sysctl_vfs_zfs_max_auto_ashift, "QU",
     "Max ashift used when optimising for logical -> physical sectors size on "
     "new top-level vdevs.");
 
 static int
 sysctl_vfs_zfs_min_auto_ashift(SYSCTL_HANDLER_ARGS)
 {
 	uint64_t val;
 	int err;
 
 	val = zfs_min_auto_ashift;
 	err = sysctl_handle_64(oidp, &val, 0, req);
 	if (err != 0 || req->newptr == NULL)
 		return (err);
 
 	if (val < SPA_MINASHIFT || val > zfs_max_auto_ashift)
 		return (EINVAL);
 
 	zfs_min_auto_ashift = val;
 
 	return (0);
 }
 SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift,
     CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
     sysctl_vfs_zfs_min_auto_ashift, "QU",
     "Min ashift used when creating new top-level vdevs.");
 
 /*
  * NULL-terminated table of the vdev ops vectors known to this build;
  * vdev_getops() searches it by type name.  Kernel builds (_KERNEL) list
  * vdev_geom_ops, all other builds list vdev_disk_ops in its place.
  */
 static vdev_ops_t *vdev_ops_table[] = {
 	&vdev_root_ops,
 	&vdev_raidz_ops,
 	&vdev_mirror_ops,
 	&vdev_replacing_ops,
 	&vdev_spare_ops,
 #ifdef _KERNEL
 	&vdev_geom_ops,
 #else
 	&vdev_disk_ops,
 #endif
 	&vdev_file_ops,
 	&vdev_missing_ops,
 	&vdev_hole_ops,
 	NULL
 };
 
 
 /*
  * Look up the ops vector whose vdev_op_type string equals 'type'.
  * Returns NULL when no entry of vdev_ops_table matches.
  */
 static vdev_ops_t *
 vdev_getops(const char *type)
 {
 	vdev_ops_t **slot;
 
 	for (slot = vdev_ops_table; *slot != NULL; slot++) {
 		if (strcmp((*slot)->vdev_op_type, type) == 0)
 			return (*slot);
 	}
 
 	return (NULL);
 }
 
 /*
  * Default asize function: start from psize rounded up (P2ROUNDUP) to the
  * top-level vdev's ashift and return the MAX of that with the asize
  * reported by every child.  This is what's used by anything other than
  * RAID-Z.
  */
 uint64_t
 vdev_default_asize(vdev_t *vd, uint64_t psize)
 {
 	uint64_t best = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
 
 	for (int i = 0; i < vd->vdev_children; i++) {
 		uint64_t child_asize;
 
 		child_asize = vdev_psize_to_asize(vd->vdev_child[i], psize);
 		if (child_asize > best)
 			best = child_asize;
 	}
 
 	return (best);
 }
 
 /*
  * Get the minimum allocatable size. We define the allocatable size as
  * the vdev's asize rounded to the nearest metaslab. This allows us to
  * replace or attach devices which don't have the same physical size but
  * can still satisfy the same number of allocations.
  *
  * The result depends on where vd sits in the tree:
  *   - no parent:            vd's own asize;
  *   - top-level vdev:       vd's asize aligned (P2ALIGN) to its metaslab
  *                           size (1 << vdev_ms_shift);
  *   - child of a raidz:     the parent's min_asize divided by the parent's
  *                           number of children;
  *   - any other child:      the parent's min_asize.
  */
 uint64_t
 vdev_get_min_asize(vdev_t *vd)
 {
 	vdev_t *parent = vd->vdev_parent;
 	uint64_t min_asize;
 
 	if (parent == NULL) {
 		min_asize = vd->vdev_asize;
 	} else if (vd == vd->vdev_top) {
 		min_asize = P2ALIGN(vd->vdev_asize,
 		    1ULL << vd->vdev_ms_shift);
 	} else if (parent->vdev_ops == &vdev_raidz_ops) {
 		min_asize = parent->vdev_min_asize / parent->vdev_children;
 	} else {
 		min_asize = parent->vdev_min_asize;
 	}
 
 	return (min_asize);
 }
 
 /*
  * Recompute vdev_min_asize for vd and then for every vdev below it.  The
  * parent is updated first because vdev_get_min_asize() derives a child's
  * value from its parent's vdev_min_asize.
  */
 void
 vdev_set_min_asize(vdev_t *vd)
 {
 	int i;
 
 	vd->vdev_min_asize = vdev_get_min_asize(vd);
 
 	for (i = 0; i < vd->vdev_children; i++) {
 		vdev_t *child = vd->vdev_child[i];
 
 		vdev_set_min_asize(child);
 	}
 }
 
 /*
  * Return the child of the pool's root vdev at index 'vdev', or NULL when
  * the index is not below the root's child count.
  */
 vdev_t *
 vdev_lookup_top(spa_t *spa, uint64_t vdev)
 {
 	vdev_t *root = spa->spa_root_vdev;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
 
 	if (vdev >= root->vdev_children)
 		return (NULL);
 
 	ASSERT(root->vdev_child[vdev] != NULL);
 	return (root->vdev_child[vdev]);
 }
 
 /*
  * Depth-first search of the subtree rooted at vd for a vdev whose
  * vdev_guid equals 'guid'.  Returns the first match, or NULL if none.
  */
 vdev_t *
 vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
 {
 	if (vd->vdev_guid == guid)
 		return (vd);
 
 	for (int i = 0; i < vd->vdev_children; i++) {
 		vdev_t *hit = vdev_lookup_by_guid(vd->vdev_child[i], guid);
 
 		if (hit != NULL)
 			return (hit);
 	}
 
 	return (NULL);
 }
 
 /*
  * Attach cvd to pvd as the child at index cvd->vdev_id, growing pvd's
  * child array when that index is beyond its current size, and add cvd's
  * guid sum to pvd and every ancestor of pvd.
  *
  * When pvd is NULL only cvd->vdev_parent is (re)set to NULL and nothing
  * else happens.  The SCL_ALL config lock must be held as writer and cvd
  * must not currently have a parent.
  */
 void
 vdev_add_child(vdev_t *pvd, vdev_t *cvd)
 {
 	size_t oldsize, newsize;
 	uint64_t id = cvd->vdev_id;
 	vdev_t **newchild;
 
 	ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 	ASSERT(cvd->vdev_parent == NULL);
 
 	cvd->vdev_parent = pvd;
 
 	if (pvd == NULL)
 		return;
 
 	/* The target slot must either be new or currently empty. */
 	ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);
 
 	oldsize = pvd->vdev_children * sizeof (vdev_t *);
 	pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
 	newsize = pvd->vdev_children * sizeof (vdev_t *);
 
 	/*
 	 * The child array is always reallocated, even if it does not grow.
 	 * It is zero-filled, so any slots beyond the old array start as NULL.
 	 */
 	newchild = kmem_zalloc(newsize, KM_SLEEP);
 	if (pvd->vdev_child != NULL) {
 		bcopy(pvd->vdev_child, newchild, oldsize);
 		kmem_free(pvd->vdev_child, oldsize);
 	}
 
 	pvd->vdev_child = newchild;
 	pvd->vdev_child[id] = cvd;
 
 	/*
 	 * The child inherits the parent's top-level vdev; if the parent has
 	 * none, the child is its own top-level vdev.
 	 */
 	cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd);
 	ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);
 
 	/*
 	 * Walk up all ancestors to update guid sum.
 	 */
 	for (; pvd != NULL; pvd = pvd->vdev_parent)
 		pvd->vdev_guid_sum += cvd->vdev_guid_sum;
 }
 
 /*
  * Detach cvd from its parent pvd: clear cvd's slot in pvd's child array,
  * reset cvd->vdev_parent, and subtract cvd's guid sum from pvd and every
  * ancestor of pvd.  The child array is not shrunk; it is only freed (and
  * vdev_children reset to 0) once every slot in it is NULL.
  *
  * Does nothing when pvd is NULL.
  */
 void
 vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
 {
 	int c;
 	uint_t id = cvd->vdev_id;
 
 	ASSERT(cvd->vdev_parent == pvd);
 
 	if (pvd == NULL)
 		return;
 
 	ASSERT(id < pvd->vdev_children);
 	ASSERT(pvd->vdev_child[id] == cvd);
 
 	pvd->vdev_child[id] = NULL;
 	cvd->vdev_parent = NULL;
 
 	/* Look for any remaining (non-NULL) child. */
 	for (c = 0; c < pvd->vdev_children; c++)
 		if (pvd->vdev_child[c])
 			break;
 
 	/* No children left: release the now-empty array. */
 	if (c == pvd->vdev_children) {
 		kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
 		pvd->vdev_child = NULL;
 		pvd->vdev_children = 0;
 	}
 
 	/*
 	 * Walk up all ancestors to update guid sum.
 	 */
 	for (; pvd != NULL; pvd = pvd->vdev_parent)
 		pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
 }
 
 /*
  * Remove any holes in the child array.
  *
  * The surviving children are moved, in order, into a freshly allocated
  * array sized for exactly that many entries, and each child's vdev_id is
  * rewritten to its new index.  The old array is freed.
  *
  * If pvd has no child array (vdev_children == 0) nothing is done.  If every
  * slot is a hole, vdev_child is set to NULL and vdev_children to 0, which
  * matches the state vdev_remove_child() leaves behind for a childless vdev
  * and avoids a zero-byte allocation.
  *
  * The SCL_ALL config lock must be held as writer.
  */
 void
 vdev_compact_children(vdev_t *pvd)
 {
 	vdev_t **newchild, *cvd;
 	int oldc = pvd->vdev_children;
 	int newc;
 
 	ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	/* No child array at all: nothing to compact, allocate or free. */
 	if (oldc == 0)
 		return;
 
 	/* First pass: count the children that are actually present. */
 	for (int c = newc = 0; c < oldc; c++)
 		if (pvd->vdev_child[c])
 			newc++;
 
 	if (newc > 0) {
 		newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);
 
 		/* Second pass: pack the survivors and renumber them. */
 		for (int c = newc = 0; c < oldc; c++) {
 			if ((cvd = pvd->vdev_child[c]) != NULL) {
 				newchild[newc] = cvd;
 				cvd->vdev_id = newc++;
 			}
 		}
 	} else {
 		/* Only holes were present; a childless vdev has no array. */
 		newchild = NULL;
 	}
 
 	kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
 	pvd->vdev_child = newchild;
 	pvd->vdev_children = newc;
 }
 
 /*
  * Allocate and minimally initialize a vdev_t.
  *
  * spa:  pool the vdev belongs to.  If the pool has no root vdev yet, the
  *       new vdev becomes the root (ops must then be vdev_root_ops) and a
  *       new spa_load_guid is generated.
  * id:   index stored in vdev_id.
  * guid: guid to use; 0 requests a freshly generated one, except for hole
  *       vdevs, which keep guid 0.
  * ops:  ops vector for the vdev.
  *
  * The returned vdev is zero-filled apart from the fields set here, is in
  * state VDEV_STATE_CLOSED, and is not yet linked to any parent.
  */
 vdev_t *
 vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
 {
 	vdev_t *vd;
 
 	vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
 
 	if (spa->spa_root_vdev == NULL) {
 		ASSERT(ops == &vdev_root_ops);
 		spa->spa_root_vdev = vd;
 		spa->spa_load_guid = spa_generate_guid(NULL);
 	}
 
 	if (guid == 0 && ops != &vdev_hole_ops) {
 		if (spa->spa_root_vdev == vd) {
 			/*
 			 * The root vdev's guid will also be the pool guid,
 			 * which must be unique among all pools.
 			 */
 			guid = spa_generate_guid(NULL);
 		} else {
 			/*
 			 * Any other vdev's guid must be unique within the pool.
 			 */
 			guid = spa_generate_guid(spa);
 		}
 		ASSERT(!spa_guid_exists(spa_guid(spa), guid));
 	}
 
 	vd->vdev_spa = spa;
 	vd->vdev_id = id;
 	vd->vdev_guid = guid;
 	/* With no children yet, the guid sum is just this vdev's own guid. */
 	vd->vdev_guid_sum = guid;
 	vd->vdev_ops = ops;
 	vd->vdev_state = VDEV_STATE_CLOSED;
 	vd->vdev_ishole = (ops == &vdev_hole_ops);
 
 	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
 	/* One range tree per DTL type, all sharing vdev_dtl_lock. */
 	for (int t = 0; t < DTL_TYPES; t++) {
 		vd->vdev_dtl[t] = range_tree_create(NULL, NULL,
 		    &vd->vdev_dtl_lock);
 	}
 	txg_list_create(&vd->vdev_ms_list,
 	    offsetof(struct metaslab, ms_txg_node));
 	txg_list_create(&vd->vdev_dtl_list,
 	    offsetof(struct vdev, vdev_dtl_node));
 	vd->vdev_stat.vs_timestamp = gethrtime();
 	vdev_queue_init(vd);
 	vdev_cache_init(vd);
 
 	return (vd);
 }
 
 /*
  * Allocate a new vdev.  The 'alloctype' is used to control whether we are
  * creating a new vdev or loading an existing one - the behavior is slightly
  * different for each case.
  *
  * spa:       pool the vdev belongs to; SCL_ALL must be held as writer.
  * vdp:       on success, receives the newly allocated vdev.
  * nv:        nvlist describing the vdev (type, guid, paths, ...).
  * parent:    vdev to attach the new vdev to; passed to vdev_add_child().
  * id:        child index of the new vdev.
  * alloctype: one of the VDEV_ALLOC_* values.
  *
  * Returns 0 on success, EINVAL when the nvlist is missing required entries
  * or is inconsistent, and ENOTSUP when the pool version does not support
  * the requested feature.  All validation happens before the vdev is
  * allocated, so nothing needs to be undone on the error paths.
  */
 int
 vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
     int alloctype)
 {
 	vdev_ops_t *ops;
 	char *type;
 	uint64_t guid = 0, islog, nparity;
 	vdev_t *vd;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
 		return (SET_ERROR(EINVAL));
 
 	if ((ops = vdev_getops(type)) == NULL)
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * If this is a load, get the vdev guid from the nvlist.
 	 * Otherwise, vdev_alloc_common() will generate one for us.
 	 * Spare, l2cache and root-pool allocations also require the guid to
 	 * be present in the nvlist; a load additionally requires the id
 	 * recorded in the nvlist to match 'id'.
 	 */
 	if (alloctype == VDEV_ALLOC_LOAD) {
 		uint64_t label_id;
 
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
 		    label_id != id)
 			return (SET_ERROR(EINVAL));
 
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
 			return (SET_ERROR(EINVAL));
 	} else if (alloctype == VDEV_ALLOC_SPARE) {
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
 			return (SET_ERROR(EINVAL));
 	} else if (alloctype == VDEV_ALLOC_L2CACHE) {
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
 			return (SET_ERROR(EINVAL));
 	} else if (alloctype == VDEV_ALLOC_ROOTPOOL) {
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
 			return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * The first allocated vdev must be of type 'root'.
 	 */
 	if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * Determine whether we're a log vdev.
 	 */
 	islog = 0;
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
 	if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
 		return (SET_ERROR(ENOTSUP));
 
 	if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
 		return (SET_ERROR(ENOTSUP));
 
 	/*
 	 * Set the nparity property for RAID-Z vdevs.
 	 * -1ULL is a sentinel so the ASSERT below can verify that every
 	 * branch assigned a real value.
 	 */
 	nparity = -1ULL;
 	if (ops == &vdev_raidz_ops) {
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
 		    &nparity) == 0) {
 			if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
 				return (SET_ERROR(EINVAL));
 			/*
 			 * Older pool versions support fewer parity devices:
 			 * more than one requires SPA_VERSION_RAIDZ2 and more
 			 * than two requires SPA_VERSION_RAIDZ3.
 			 */
 			if (nparity > 1 &&
 			    spa_version(spa) < SPA_VERSION_RAIDZ2)
 				return (SET_ERROR(ENOTSUP));
 			if (nparity > 2 &&
 			    spa_version(spa) < SPA_VERSION_RAIDZ3)
 				return (SET_ERROR(ENOTSUP));
 		} else {
 			/*
 			 * We require the parity to be specified for SPAs that
 			 * support multiple parity levels.
 			 */
 			if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
 				return (SET_ERROR(EINVAL));
 			/*
 			 * Otherwise, we default to 1 parity device for RAID-Z.
 			 */
 			nparity = 1;
 		}
 	} else {
 		nparity = 0;
 	}
 	ASSERT(nparity != -1ULL);
 
 	vd = vdev_alloc_common(spa, id, guid, ops);
 
 	vd->vdev_islog = islog;
 	vd->vdev_nparity = nparity;
 
 	/*
 	 * The strings returned by nvlist_lookup_string() are duplicated with
 	 * spa_strdup() so the vdev holds its own copies.
 	 */
 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
 		vd->vdev_path = spa_strdup(vd->vdev_path);
 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
 		vd->vdev_devid = spa_strdup(vd->vdev_devid);
 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
 	    &vd->vdev_physpath) == 0)
 		vd->vdev_physpath = spa_strdup(vd->vdev_physpath);
 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0)
 		vd->vdev_fru = spa_strdup(vd->vdev_fru);
 
 	/*
 	 * Set the whole_disk property.  If it's not specified, leave the value
 	 * as -1.
 	 */
 	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
 	    &vd->vdev_wholedisk) != 0)
 		vd->vdev_wholedisk = -1ULL;
 
 	/*
 	 * Look for the 'not present' flag.  This will only be set if the device
 	 * was not present at the time of import.
 	 */
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
 	    &vd->vdev_not_present);
 
 	/*
 	 * Get the alignment requirement.
 	 */
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);
 
 	/*
 	 * Retrieve the vdev creation time.
 	 */
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
 	    &vd->vdev_crtxg);
 
 	/*
 	 * If we're a top-level vdev, try to load the allocation parameters.
 	 * "Top-level" here means the parent exists and itself has no parent.
 	 */
 	if (parent && !parent->vdev_parent &&
 	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
 		    &vd->vdev_ms_array);
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
 		    &vd->vdev_ms_shift);
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
 		    &vd->vdev_asize);
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
 		    &vd->vdev_removing);
 	}
 
 	/*
 	 * Every top-level vdev except one created for an attach gets a
 	 * metaslab group, in the log class for log vdevs and in the normal
 	 * class otherwise.
 	 */
 	if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) {
 		ASSERT(alloctype == VDEV_ALLOC_LOAD ||
 		    alloctype == VDEV_ALLOC_ADD ||
 		    alloctype == VDEV_ALLOC_SPLIT ||
 		    alloctype == VDEV_ALLOC_ROOTPOOL);
 		vd->vdev_mg = metaslab_group_create(islog ?
 		    spa_log_class(spa) : spa_normal_class(spa), vd);
 	}
 
 	/*
 	 * If we're a leaf vdev, try to load the DTL object and other state.
 	 */
 	if (vd->vdev_ops->vdev_op_leaf &&
 	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
 	    alloctype == VDEV_ALLOC_ROOTPOOL)) {
 		if (alloctype == VDEV_ALLOC_LOAD) {
 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
 			    &vd->vdev_dtl_object);
 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
 			    &vd->vdev_unspare);
 		}
 
 		if (alloctype == VDEV_ALLOC_ROOTPOOL) {
 			uint64_t spare = 0;
 
 			if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
 			    &spare) == 0 && spare)
 				spa_spare_add(vd);
 		}
 
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
 		    &vd->vdev_offline);
 
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
 		    &vd->vdev_resilver_txg);
 
 		/*
 		 * When importing a pool, we want to ignore the persistent fault
 		 * state, as the diagnosis made on another system may not be
 		 * valid in the current context.  Local vdevs will
 		 * remain in the faulted state.
 		 */
 		if (spa_load_state(spa) == SPA_LOAD_OPEN) {
 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
 			    &vd->vdev_faulted);
 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
 			    &vd->vdev_degraded);
 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
 			    &vd->vdev_removed);
 
 			if (vd->vdev_faulted || vd->vdev_degraded) {
 				char *aux;
 
 				/*
 				 * Default to "errors exceeded" unless the
 				 * config records an "external" aux state.
 				 */
 				vd->vdev_label_aux =
 				    VDEV_AUX_ERR_EXCEEDED;
 				if (nvlist_lookup_string(nv,
 				    ZPOOL_CONFIG_AUX_STATE, &aux) == 0 &&
 				    strcmp(aux, "external") == 0)
 					vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
 			}
 		}
 	}
 
 	/*
 	 * Add ourselves to the parent's list of children.
 	 */
 	vdev_add_child(parent, vd);
 
 	*vdp = vd;
 
 	return (0);
 }
 
 /*
  * Close and free vd together with every vdev below it.  The vdev is
  * detached from its parent, its metaslab state, strings, txg lists, DTL
  * range trees and locks are released, and the pool's root pointer is
  * cleared if vd was the root.  vd must not be used after this returns.
  */
 void
 vdev_free(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	/*
 	 * vdev_free() implies closing the vdev first.  This is simpler than
 	 * trying to ensure complicated semantics for all callers.
 	 */
 	vdev_close(vd);
 
 	ASSERT(!list_link_active(&vd->vdev_config_dirty_node));
 	ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
 
 	/*
 	 * Free all children.
 	 * Each child's vdev_free() calls vdev_remove_child(), which NULLs the
 	 * child's slot and, once the last slot is empty, frees the child
 	 * array and resets vdev_children to 0; that is what ends this loop
 	 * and what the assertions below check.
 	 */
 	for (int c = 0; c < vd->vdev_children; c++)
 		vdev_free(vd->vdev_child[c]);
 
 	ASSERT(vd->vdev_child == NULL);
 	ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
 
 	/*
 	 * Discard allocation state.
 	 */
 	if (vd->vdev_mg != NULL) {
 		vdev_metaslab_fini(vd);
 		metaslab_group_destroy(vd->vdev_mg);
 	}
 
 	ASSERT0(vd->vdev_stat.vs_space);
 	ASSERT0(vd->vdev_stat.vs_dspace);
 	ASSERT0(vd->vdev_stat.vs_alloc);
 
 	/*
 	 * Remove this vdev from its parent's child list.
 	 */
 	vdev_remove_child(vd->vdev_parent, vd);
 
 	ASSERT(vd->vdev_parent == NULL);
 
 	/*
 	 * Clean up vdev structure.
 	 */
 	vdev_queue_fini(vd);
 	vdev_cache_fini(vd);
 
 	if (vd->vdev_path)
 		spa_strfree(vd->vdev_path);
 	if (vd->vdev_devid)
 		spa_strfree(vd->vdev_devid);
 	if (vd->vdev_physpath)
 		spa_strfree(vd->vdev_physpath);
 	if (vd->vdev_fru)
 		spa_strfree(vd->vdev_fru);
 
 	if (vd->vdev_isspare)
 		spa_spare_remove(vd);
 	if (vd->vdev_isl2cache)
 		spa_l2cache_remove(vd);
 
 	txg_list_destroy(&vd->vdev_ms_list);
 	txg_list_destroy(&vd->vdev_dtl_list);
 
 	/* The DTL space map and range trees are torn down under the DTL lock. */
 	mutex_enter(&vd->vdev_dtl_lock);
 	space_map_close(vd->vdev_dtl_sm);
 	for (int t = 0; t < DTL_TYPES; t++) {
 		range_tree_vacate(vd->vdev_dtl[t], NULL, NULL);
 		range_tree_destroy(vd->vdev_dtl[t]);
 	}
 	mutex_exit(&vd->vdev_dtl_lock);
 
 	mutex_destroy(&vd->vdev_dtl_lock);
 	mutex_destroy(&vd->vdev_stat_lock);
 	mutex_destroy(&vd->vdev_probe_lock);
 
 	if (vd == spa->spa_root_vdev)
 		spa->spa_root_vdev = NULL;
 
 	kmem_free(vd, sizeof (vdev_t));
 }
 
 /*
  * Transfer top-level vdev state from svd to tvd.
  *
  * Moved are: the metaslab array/shift/count, the metaslab group and
  * metaslab pointers, the space statistics, all per-txg list membership
  * (metaslab list, DTL list and the pool's vdev txg list), config/state
  * dirty status, the deflate ratio and the log flag.  Each moved field is
  * cleared in svd.  tvd must already be a top-level vdev.
  */
 static void
 vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
 {
 	spa_t *spa = svd->vdev_spa;
 	metaslab_t *msp;
 	vdev_t *vd;
 	int t;
 
 	ASSERT(tvd == tvd->vdev_top);
 
 	tvd->vdev_ms_array = svd->vdev_ms_array;
 	tvd->vdev_ms_shift = svd->vdev_ms_shift;
 	tvd->vdev_ms_count = svd->vdev_ms_count;
 
 	svd->vdev_ms_array = 0;
 	svd->vdev_ms_shift = 0;
 	svd->vdev_ms_count = 0;
 
 	/* If tvd already has a metaslab group it must be svd's. */
 	if (tvd->vdev_mg)
 		ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg);
 	tvd->vdev_mg = svd->vdev_mg;
 	tvd->vdev_ms = svd->vdev_ms;
 
 	svd->vdev_mg = NULL;
 	svd->vdev_ms = NULL;
 
 	/* Point the metaslab group back at its new owner. */
 	if (tvd->vdev_mg != NULL)
 		tvd->vdev_mg->mg_vd = tvd;
 
 	tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
 	tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
 	tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;
 
 	svd->vdev_stat.vs_alloc = 0;
 	svd->vdev_stat.vs_space = 0;
 	svd->vdev_stat.vs_dspace = 0;
 
 	/* Re-home every per-txg list entry from svd to tvd. */
 	for (t = 0; t < TXG_SIZE; t++) {
 		while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
 			(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
 		while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
 			(void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
 		if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
 			(void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
 	}
 
 	if (list_link_active(&svd->vdev_config_dirty_node)) {
 		vdev_config_clean(svd);
 		vdev_config_dirty(tvd);
 	}
 
 	if (list_link_active(&svd->vdev_state_dirty_node)) {
 		vdev_state_clean(svd);
 		vdev_state_dirty(tvd);
 	}
 
 	tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
 	svd->vdev_deflate_ratio = 0;
 
 	tvd->vdev_islog = svd->vdev_islog;
 	svd->vdev_islog = 0;
 }
 
 /*
  * Set vdev_top to tvd for vd and, recursively, for every vdev below it.
  * A NULL vd is ignored.
  */
 static void
 vdev_top_update(vdev_t *tvd, vdev_t *vd)
 {
 	if (vd != NULL) {
 		vd->vdev_top = tvd;
 
 		for (int i = 0; i < vd->vdev_children; i++)
 			vdev_top_update(tvd, vd->vdev_child[i]);
 	}
 }
 
 /*
  * Add a mirror/replacing vdev above an existing vdev.
  *
  * A new interior vdev (mvd) using 'ops' is allocated with cvd's id and
  * sizing/ashift/state/creation-txg values, takes cvd's place under cvd's
  * former parent, and cvd becomes a child of mvd.  If mvd ends up being a
  * top-level vdev, cvd's top-level state is transferred to it.
  *
  * The SCL_ALL config lock must be held as writer.  Returns mvd.
  */
 vdev_t *
 vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
 {
 	spa_t *spa = cvd->vdev_spa;
 	vdev_t *pvd = cvd->vdev_parent;
 	vdev_t *mvd;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	/* guid 0: let vdev_alloc_common() generate a guid for mvd. */
 	mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
 
 	mvd->vdev_asize = cvd->vdev_asize;
 	mvd->vdev_min_asize = cvd->vdev_min_asize;
 	mvd->vdev_max_asize = cvd->vdev_max_asize;
 	mvd->vdev_ashift = cvd->vdev_ashift;
 	mvd->vdev_logical_ashift = cvd->vdev_logical_ashift;
 	mvd->vdev_physical_ashift = cvd->vdev_physical_ashift;
 	mvd->vdev_state = cvd->vdev_state;
 	mvd->vdev_crtxg = cvd->vdev_crtxg;
 
 	/* Splice mvd in where cvd was, then hang cvd underneath mvd. */
 	vdev_remove_child(pvd, cvd);
 	vdev_add_child(pvd, mvd);
 	/* mvd's child count at this point is the next free child index. */
 	cvd->vdev_id = mvd->vdev_children;
 	vdev_add_child(mvd, cvd);
 	vdev_top_update(cvd->vdev_top, cvd->vdev_top);
 
 	if (mvd == mvd->vdev_top)
 		vdev_top_transfer(cvd, mvd);
 
 	return (mvd);
 }
 
 /*
  * Remove a 1-way mirror/replacing vdev from the tree.
  *
  * cvd's parent (mvd) must be a mirror, replacing or spare vdev with cvd as
  * its only child.  cvd inherits mvd's ashift values and id, takes mvd's
  * place under mvd's parent, and mvd is freed.  If cvd thereby becomes a
  * top-level vdev, mvd's top-level state is transferred to it.
  *
  * The SCL_ALL config lock must be held as writer.
  */
 void
 vdev_remove_parent(vdev_t *cvd)
 {
 	vdev_t *mvd = cvd->vdev_parent;
 	vdev_t *pvd = mvd->vdev_parent;
 
 	ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	ASSERT(mvd->vdev_children == 1);
 	ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
 	    mvd->vdev_ops == &vdev_replacing_ops ||
 	    mvd->vdev_ops == &vdev_spare_ops);
 	cvd->vdev_ashift = mvd->vdev_ashift;
 	cvd->vdev_logical_ashift = mvd->vdev_logical_ashift;
 	cvd->vdev_physical_ashift = mvd->vdev_physical_ashift;
 
 	vdev_remove_child(mvd, cvd);
 	vdev_remove_child(pvd, mvd);
 
 	/*
 	 * If cvd will replace mvd as a top-level vdev, preserve mvd's guid.
 	 * Otherwise, we could have detached an offline device, and when we
 	 * go to import the pool we'll think we have two top-level vdevs,
 	 * instead of a different version of the same top-level vdev.
 	 * The original guid is remembered in vdev_orig_guid, and the guid
 	 * sum is adjusted by the same delta so it stays consistent.
 	 */
 	if (mvd->vdev_top == mvd) {
 		uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid;
 		cvd->vdev_orig_guid = cvd->vdev_guid;
 		cvd->vdev_guid += guid_delta;
 		cvd->vdev_guid_sum += guid_delta;
 	}
 	cvd->vdev_id = mvd->vdev_id;
 	vdev_add_child(pvd, cvd);
 	vdev_top_update(cvd->vdev_top, cvd->vdev_top);
 
 	if (cvd == cvd->vdev_top)
 		vdev_top_transfer(mvd, cvd);
 
 	ASSERT(mvd->vdev_children == 0);
 	vdev_free(mvd);
 }
 
 /*
  * Create, or grow, vd's metaslab array so that it covers
  * vdev_asize >> vdev_ms_shift metaslabs, and initialize the new entries.
  *
  * txg: 0 when called without an open txg; in that case the object number
  *      for each new metaslab is read from the vdev_ms_array object in the
  *      meta-objset, and SCL_ALLOC is taken as writer around the group
  *      activation.  For a non-zero txg the caller must hold SCL_ALLOC as
  *      writer and object number 0 is passed to metaslab_init().
  *
  * Returns 0 on success (including the case where vdev_ms_shift is 0 and
  * nothing is done) or the error from dmu_read().  Note that on a dmu_read()
  * failure the new array and count have already been installed in vd; the
  * entries not yet initialized are NULL.
  */
 int
 vdev_metaslab_init(vdev_t *vd, uint64_t txg)
 {
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa->spa_meta_objset;
 	uint64_t m;
 	uint64_t oldc = vd->vdev_ms_count;
 	uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
 	metaslab_t **mspp;
 	int error;
 
 	ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));
 
 	/*
 	 * This vdev is not being allocated from yet or is a hole.
 	 */
 	if (vd->vdev_ms_shift == 0)
 		return (0);
 
 	ASSERT(!vd->vdev_ishole);
 
 	/*
 	 * Compute the raidz-deflation ratio.  Note, we hard-code
 	 * in 128k (1 << 17) because it is the current "typical" blocksize.
 	 * Even if SPA_MAXBLOCKSIZE changes, this algorithm must never change,
 	 * or we will inconsistently account for existing bp's.
 	 */
 	vd->vdev_deflate_ratio = (1 << 17) /
 	    (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
 
 	/* The metaslab array only ever grows. */
 	ASSERT(oldc <= newc);
 
 	mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
 
 	/* Carry the existing metaslab pointers over into the larger array. */
 	if (oldc != 0) {
 		bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
 		kmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
 	}
 
 	vd->vdev_ms = mspp;
 	vd->vdev_ms_count = newc;
 
 	for (m = oldc; m < newc; m++) {
 		uint64_t object = 0;
 
 		if (txg == 0) {
 			error = dmu_read(mos, vd->vdev_ms_array,
 			    m * sizeof (uint64_t), sizeof (uint64_t), &object,
 			    DMU_READ_PREFETCH);
 			if (error)
 				return (error);
 		}
 		vd->vdev_ms[m] = metaslab_init(vd->vdev_mg, m, object, txg);
 	}
 
 	if (txg == 0)
 		spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);
 
 	/*
 	 * If the vdev is being removed we don't activate
 	 * the metaslabs since we want to ensure that no new
 	 * allocations are performed on this device.
 	 */
 	if (oldc == 0 && !vd->vdev_removing)
 		metaslab_group_activate(vd->vdev_mg);
 
 	if (txg == 0)
 		spa_config_exit(spa, SCL_ALLOC, FTAG);
 
 	return (0);
 }
 
 /*
  * Tear down vd's metaslabs: passivate the metaslab group, finalize every
  * metaslab that exists, free the metaslab array and clear vdev_ms.
  * Does nothing when vd has no metaslab array.
  */
 void
 vdev_metaslab_fini(vdev_t *vd)
 {
 	uint64_t nslabs = vd->vdev_ms_count;
 
 	if (vd->vdev_ms == NULL)
 		return;
 
 	metaslab_group_passivate(vd->vdev_mg);
 
 	for (uint64_t i = 0; i < nslabs; i++) {
 		metaslab_t *slab = vd->vdev_ms[i];
 
 		/* Slots may be empty; skip those. */
 		if (slab == NULL)
 			continue;
 		metaslab_fini(slab);
 	}
 
 	kmem_free(vd->vdev_ms, nslabs * sizeof (metaslab_t *));
 	vd->vdev_ms = NULL;
 }
 
 /*
  * State shared by all the zios of one probe (see vdev_probe() and
  * vdev_probe_done()).
  */
 typedef struct vdev_probe_stats {
 	boolean_t	vps_readable;	/* set once a probe read succeeds */
 	boolean_t	vps_writeable;	/* set once a probe write succeeds */
 	int		vps_flags;	/* zio flags used for the probe I/Os */
 } vdev_probe_stats_t;
 
 /*
  * Completion callback shared by every zio of a probe: the per-label reads,
  * the follow-up writes, and the parent null zio created by vdev_probe().
  *
  *   READ:  record readability; on success in a writeable pool, write the
  *          same buffer back to the same location (the buffer is then freed
  *          by the write's completion), otherwise free the buffer.
  *   WRITE: record writeability and free the buffer.
  *   NULL:  all children are done; fold the results into the vdev, set the
  *          probe's final error, detach the probe from the vdev, fail any
  *          parent zio that can no longer access the vdev, and free the
  *          shared stats.
  */
 static void
 vdev_probe_done(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	vdev_t *vd = zio->io_vd;
 	vdev_probe_stats_t *vps = zio->io_private;
 
 	ASSERT(vd->vdev_probe_zio != NULL);
 
 	if (zio->io_type == ZIO_TYPE_READ) {
 		if (zio->io_error == 0)
 			vps->vps_readable = 1;
 		if (zio->io_error == 0 && spa_writeable(spa)) {
 			zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd,
 			    zio->io_offset, zio->io_size, zio->io_data,
 			    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
 			    ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE));
 		} else {
 			zio_buf_free(zio->io_data, zio->io_size);
 		}
 	} else if (zio->io_type == ZIO_TYPE_WRITE) {
 		if (zio->io_error == 0)
 			vps->vps_writeable = 1;
 		zio_buf_free(zio->io_data, zio->io_size);
 	} else if (zio->io_type == ZIO_TYPE_NULL) {
 		zio_t *pio;
 
 		/* These flags are only ever set here, never cleared. */
 		vd->vdev_cant_read |= !vps->vps_readable;
 		vd->vdev_cant_write |= !vps->vps_writeable;
 
 		if (vdev_readable(vd) &&
 		    (vdev_writeable(vd) || !spa_writeable(spa))) {
 			zio->io_error = 0;
 		} else {
 			ASSERT(zio->io_error != 0);
 			zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
 			    spa, vd, NULL, 0, 0);
 			zio->io_error = SET_ERROR(ENXIO);
 		}
 
 		mutex_enter(&vd->vdev_probe_lock);
 		ASSERT(vd->vdev_probe_zio == zio);
 		vd->vdev_probe_zio = NULL;
 		mutex_exit(&vd->vdev_probe_lock);
 
 		while ((pio = zio_walk_parents(zio)) != NULL)
 			if (!vdev_accessible(vd, pio))
 				pio->io_error = SET_ERROR(ENXIO);
 
 		kmem_free(vps, sizeof (*vps));
 	}
 }
 
 /*
  * Determine whether this device is accessible.
  *
  * Read and write to several known locations: the pad regions of each
  * vdev label but the first, which we leave alone in case it contains
  * a VTOC.
  *
  * vd:  leaf vdev to probe.
  * zio: the zio on whose behalf the probe is made, or NULL.
  *
  * With a non-NULL zio the probe is made a child of zio and issued here,
  * and NULL is returned; if a probe is already in flight, zio simply joins
  * it.  With a NULL zio the newly created probe zio is returned without
  * having been issued.  A zio that is itself a probe I/O is never probed
  * (NULL is returned immediately).
  */
 zio_t *
 vdev_probe(vdev_t *vd, zio_t *zio)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_probe_stats_t *vps = NULL;
 	zio_t *pio;
 
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
 
 	/*
 	 * Don't probe the probe.
 	 */
 	if (zio && (zio->io_flags & ZIO_FLAG_PROBE))
 		return (NULL);
 
 	/*
 	 * To prevent 'probe storms' when a device fails, we create
 	 * just one probe i/o at a time.  All zios that want to probe
 	 * this vdev will become parents of the probe io.
 	 */
 	mutex_enter(&vd->vdev_probe_lock);
 
 	if ((pio = vd->vdev_probe_zio) == NULL) {
 		vps = kmem_zalloc(sizeof (*vps), KM_SLEEP);
 
 		vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
 		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE |
 		    ZIO_FLAG_TRYHARD;
 
 		if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
 			/*
 			 * vdev_cant_read and vdev_cant_write can only
 			 * transition from TRUE to FALSE when we have the
 			 * SCL_ZIO lock as writer; otherwise they can only
 			 * transition from FALSE to TRUE.  This ensures that
 			 * any zio looking at these values can assume that
 			 * failures persist for the life of the I/O.  That's
 			 * important because when a device has intermittent
 			 * connectivity problems, we want to ensure that
 			 * they're ascribed to the device (ENXIO) and not
 			 * the zio (EIO).
 			 *
 			 * Since we hold SCL_ZIO as writer here, clear both
 			 * values so the probe can reevaluate from first
 			 * principles.
 			 */
 			vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER;
 			vd->vdev_cant_read = B_FALSE;
 			vd->vdev_cant_write = B_FALSE;
 		}
 
 		vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
 		    vdev_probe_done, vps,
 		    vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);
 
 		/*
 		 * We can't change the vdev state in this context, so we
 		 * kick off an async task to do it on our behalf.
 		 */
 		if (zio != NULL) {
 			vd->vdev_probe_wanted = B_TRUE;
 			spa_async_request(spa, SPA_ASYNC_PROBE);
 		}
 	}
 
 	if (zio != NULL)
 		zio_add_child(zio, pio);
 
 	mutex_exit(&vd->vdev_probe_lock);
 
 	/*
 	 * vps is only allocated when this call created the probe; NULL here
 	 * means a probe was already in flight and zio has just joined it.
 	 */
 	if (vps == NULL) {
 		ASSERT(zio != NULL);
 		return (NULL);
 	}
 
 	/* Label 0 is skipped on purpose (see the function comment). */
 	for (int l = 1; l < VDEV_LABELS; l++) {
 		zio_nowait(zio_read_phys(pio, vd,
 		    vdev_label_offset(vd->vdev_psize, l,
 		    offsetof(vdev_label_t, vl_pad2)),
 		    VDEV_PAD_SIZE, zio_buf_alloc(VDEV_PAD_SIZE),
 		    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
 		    ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
 	}
 
 	/* No requesting zio: hand the un-issued probe zio to the caller. */
 	if (zio == NULL)
 		return (pio);
 
 	zio_nowait(pio);
 	return (NULL);
 }
 
 /*
  * Task function used by vdev_open_children(): open the vdev passed as
  * 'arg' and store the result in its vdev_open_error.  vdev_open_thread is
  * set to the current thread for the duration of the open.
  */
 static void
 vdev_open_child(void *arg)
 {
 	vdev_t *child = arg;
 	int result;
 
 	child->vdev_open_thread = curthread;
 	result = vdev_open(child);
 	child->vdev_open_error = result;
 	child->vdev_open_thread = NULL;
 }
 
 /*
  * Return B_TRUE if vd, or any vdev below it, has a path that begins with
  * ZVOL_DIR; B_FALSE otherwise.
  */
 boolean_t
 vdev_uses_zvols(vdev_t *vd)
 {
 	const char *path = vd->vdev_path;
 
 	if (path != NULL && strncmp(path, ZVOL_DIR, strlen(ZVOL_DIR)) == 0)
 		return (B_TRUE);
 
 	for (int i = 0; i < vd->vdev_children; i++) {
 		if (vdev_uses_zvols(vd->vdev_child[i]))
 			return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Open every child of vd, storing each result in that child's
  * vdev_open_error.
  *
  * NOTE: the "B_TRUE ||" in the condition below forces the sequential path
  * unconditionally, so the taskq-based parallel open that follows it is
  * currently unreachable.
  */
 void
 vdev_open_children(vdev_t *vd)
 {
 	taskq_t *tq;
 	int children = vd->vdev_children;
 
 	/*
 	 * In order to handle pools on top of zvols, do the opens
 	 * in a single thread so that the same thread holds the
 	 * spa_namespace_lock.
 	 */
 	if (B_TRUE || vdev_uses_zvols(vd)) {
 		for (int c = 0; c < children; c++)
 			vd->vdev_child[c]->vdev_open_error =
 			    vdev_open(vd->vdev_child[c]);
 		return;
 	}
 	/* Parallel path: one task per child; taskq_destroy() is called last. */
 	tq = taskq_create("vdev_open", children, minclsyspri,
 	    children, children, TASKQ_PREPOPULATE);
 
 	for (int c = 0; c < children; c++)
 		VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c],
 		    TQ_SLEEP) != 0);
 
 	taskq_destroy(tq);
 }
 
 /*
  * Prepare a virtual device for access.
  *
  * Invokes the vdev's vdev_op_open method, then derives and records the
  * physical size, allocatable size and ashift values, sets the resulting
  * vdev state and, for leaf vdevs, issues a probe before declaring success.
  *
  * Returns 0 on success (including for hole and missing vdevs) or an errno
  * value on failure.  Every failure path updates the vdev state through
  * vdev_set_state() before returning.
  */
 int
 vdev_open(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	int error;
 	uint64_t osize = 0;
 	uint64_t max_osize = 0;
 	uint64_t asize, max_asize, psize;
 	uint64_t logical_ashift = 0;
 	uint64_t physical_ashift = 0;
 
 	ASSERT(vd->vdev_open_thread == curthread ||
 	    spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
 	ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
 	    vd->vdev_state == VDEV_STATE_CANT_OPEN ||
 	    vd->vdev_state == VDEV_STATE_OFFLINE);
 
 	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
 	vd->vdev_cant_read = B_FALSE;
 	vd->vdev_cant_write = B_FALSE;
 	vd->vdev_min_asize = vdev_get_min_asize(vd);
 
 	/*
 	 * If this vdev is not removed, check its fault status.  If it's
 	 * faulted, bail out of the open.
 	 */
 	if (!vd->vdev_removed && vd->vdev_faulted) {
 		ASSERT(vd->vdev_children == 0);
 		ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
 		    vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
 		    vd->vdev_label_aux);
 		return (SET_ERROR(ENXIO));
 	} else if (vd->vdev_offline) {
 		ASSERT(vd->vdev_children == 0);
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
 		return (SET_ERROR(ENXIO));
 	}
 
 	error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize,
 	    &logical_ashift, &physical_ashift);
 
 	/*
 	 * Reset the vdev_reopening flag so that we actually close
 	 * the vdev on error.
 	 */
 	vd->vdev_reopening = B_FALSE;
 	if (zio_injection_enabled && error == 0)
 		error = zio_handle_device_injection(vd, NULL, ENXIO);
 
 	if (error) {
 		if (vd->vdev_removed &&
 		    vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
 			vd->vdev_removed = B_FALSE;
 
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 		    vd->vdev_stat.vs_aux);
 		return (error);
 	}
 
 	vd->vdev_removed = B_FALSE;
 
 	/*
 	 * Recheck the faulted flag now that we have confirmed that
 	 * the vdev is accessible.  If we're faulted, bail.
 	 */
 	if (vd->vdev_faulted) {
 		ASSERT(vd->vdev_children == 0);
 		ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
 		    vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
 		    vd->vdev_label_aux);
 		return (SET_ERROR(ENXIO));
 	}
 
 	if (vd->vdev_degraded) {
 		ASSERT(vd->vdev_children == 0);
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
 		    VDEV_AUX_ERR_EXCEEDED);
 	} else {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0);
 	}
 
 	/*
 	 * For hole or missing vdevs we just return success.
 	 */
 	if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
 		return (0);
 
 	if (vd->vdev_ops->vdev_op_leaf) {
 		vd->vdev_notrim = B_FALSE;
 		trim_map_create(vd);
 	}
 
 	/*
 	 * An interior vdev with any child that is not healthy is itself
 	 * marked degraded.
 	 */
 	for (int c = 0; c < vd->vdev_children; c++) {
 		if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
 			    VDEV_AUX_NONE);
 			break;
 		}
 	}
 
 	/* Round the reported sizes down to a multiple of the label size. */
 	osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
 	max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t));
 
 	if (vd->vdev_children == 0) {
 		if (osize < SPA_MINDEVSIZE) {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_TOO_SMALL);
 			return (SET_ERROR(EOVERFLOW));
 		}
 		/*
 		 * For a vdev without children the allocatable size excludes
 		 * the space reserved for the front and back labels.
 		 */
 		psize = osize;
 		asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
 		max_asize = max_osize - (VDEV_LABEL_START_SIZE +
 		    VDEV_LABEL_END_SIZE);
 	} else {
 		if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE -
 		    (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_TOO_SMALL);
 			return (SET_ERROR(EOVERFLOW));
 		}
 		psize = 0;
 		asize = osize;
 		max_asize = max_osize;
 	}
 
 	vd->vdev_psize = psize;
 
 	/*
 	 * Make sure the allocatable size hasn't shrunk.
 	 */
 	if (asize < vd->vdev_min_asize) {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_BAD_LABEL);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Never lower an ashift value that is already recorded on the
 	 * vdev: each field keeps the larger of its current value and the
 	 * value just reported by the open.
 	 */
 	vd->vdev_physical_ashift =
 	    MAX(physical_ashift, vd->vdev_physical_ashift);
 	vd->vdev_logical_ashift = MAX(logical_ashift, vd->vdev_logical_ashift);
 	vd->vdev_ashift = MAX(vd->vdev_logical_ashift, vd->vdev_ashift);
 
 	if (vd->vdev_logical_ashift > SPA_MAXASHIFT) {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_ASHIFT_TOO_BIG);
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (vd->vdev_asize == 0) {
 		/*
 		 * This is the first-ever open, so use the computed values.
 		 */
 		vd->vdev_asize = asize;
 		vd->vdev_max_asize = max_asize;
 	} else {
 		/*
 		 * Make sure the alignment requirement hasn't increased.
 		 */
 		if (vd->vdev_ashift > vd->vdev_top->vdev_ashift &&
 		    vd->vdev_ops->vdev_op_leaf) {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_BAD_LABEL);
 			return (SET_ERROR(EINVAL));
 		}
 		vd->vdev_max_asize = max_asize;
 	}
 
 	/*
 	 * If all children are healthy and the asize has increased,
 	 * then we've experienced dynamic LUN growth.  If automatic
 	 * expansion is enabled then use the additional space.
 	 */
 	if (vd->vdev_state == VDEV_STATE_HEALTHY && asize > vd->vdev_asize &&
 	    (vd->vdev_expanding || spa->spa_autoexpand))
 		vd->vdev_asize = asize;
 
 	vdev_set_min_asize(vd);
 
 	/*
 	 * Ensure we can issue some IO before declaring the
 	 * vdev open for business.
 	 */
 	if (vd->vdev_ops->vdev_op_leaf &&
 	    (error = zio_wait(vdev_probe(vd, NULL))) != 0) {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
 		    VDEV_AUX_ERR_EXCEEDED);
 		return (error);
 	}
 
 	/*
 	 * If a leaf vdev has a DTL, and seems healthy, then kick off a
 	 * resilver.  But don't do this if we are doing a reopen for a scrub,
 	 * since this would just restart the scrub we are already doing.
 	 */
 	if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen &&
 	    vdev_resilver_needed(vd, NULL, NULL))
 		spa_async_request(spa, SPA_ASYNC_RESILVER);
 
 	return (0);
 }
 
 /*
  * Called once the vdevs are all opened, this routine validates the label
  * contents.  This needs to be done before vdev_load() so that we don't
  * inadvertently do repair I/Os to the wrong device.
  *
  * If 'strict' is false ignore the spa guid check. This is necessary because
  * if the machine crashed during a re-guid the new guid might have been written
  * to all of the vdev labels, but not the cached config. The strict check
  * will be performed when the pool is opened again using the mos config.
  *
  * This function will only return failure if one of the vdevs indicates that it
  * has since been destroyed or exported.  This is only possible if
  * /etc/zfs/zpool.cache was readonly at the time.  Otherwise, the vdev state
  * will be updated but the function will return 0.
  *
  * Children are validated first (recursively); any child failure is reported
  * to the caller as EBADF.  The label checks themselves only run for leaf
  * vdevs that are currently readable.
  */
 int
 vdev_validate(vdev_t *vd, boolean_t strict)
 {
 	spa_t *spa = vd->vdev_spa;
 	nvlist_t *label;
 	uint64_t guid = 0, top_guid;
 	uint64_t state;
 
 	for (int c = 0; c < vd->vdev_children; c++)
 		if (vdev_validate(vd->vdev_child[c], strict) != 0)
 			return (SET_ERROR(EBADF));
 
 	/*
 	 * If the device has already failed, or was marked offline, don't do
 	 * any further validation.  Otherwise, label I/O will fail and we will
 	 * overwrite the previous state.
 	 */
 	if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
 		uint64_t aux_guid = 0;
 		nvlist_t *nvl;
 		/*
 		 * Read the label as of the last synced txg, or pass -1ULL
 		 * when no txg has been synced yet.
 		 */
 		uint64_t txg = spa_last_synced_txg(spa) != 0 ?
 		    spa_last_synced_txg(spa) : -1ULL;
 
 		if ((label = vdev_label_read_config(vd, txg)) == NULL) {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_BAD_LABEL);
 			return (0);
 		}
 
 		/*
 		 * Determine if this vdev has been split off into another
 		 * pool.  If so, then refuse to open it.
 		 */
 		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
 		    &aux_guid) == 0 && aux_guid == spa_guid(spa)) {
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_SPLIT_POOL);
 			nvlist_free(label);
 			return (0);
 		}
 
 		/*
 		 * In strict mode the label must carry a pool guid and it
 		 * must match this pool's guid.
 		 */
 		if (strict && (nvlist_lookup_uint64(label,
 		    ZPOOL_CONFIG_POOL_GUID, &guid) != 0 ||
 		    guid != spa_guid(spa))) {
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_CORRUPT_DATA);
 			nvlist_free(label);
 			return (0);
 		}
 
 		/*
 		 * Fetch the original guid from the label's vdev tree, if
 		 * present; aux_guid is zeroed when either lookup fails.
 		 */
 		if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl)
 		    != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID,
 		    &aux_guid) != 0)
 			aux_guid = 0;
 
 		/*
 		 * If this vdev just became a top-level vdev because its
 		 * sibling was detached, it will have adopted the parent's
 		 * vdev guid -- but the label may or may not be on disk yet.
 		 * Fortunately, either version of the label will have the
 		 * same top guid, so if we're a top-level vdev, we can
 		 * safely compare to that instead.
 		 *
 		 * If we split this vdev off instead, then we also check the
 		 * original pool's guid.  We don't want to consider the vdev
 		 * corrupt if it is partway through a split operation.
 		 */
 		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
 		    &guid) != 0 ||
 		    nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID,
 		    &top_guid) != 0 ||
 		    ((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) &&
 		    (vd->vdev_guid != top_guid || vd != vd->vdev_top))) {
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_CORRUPT_DATA);
 			nvlist_free(label);
 			return (0);
 		}
 
 		/* A label without a pool state is treated as corrupt. */
 		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
 		    &state) != 0) {
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_CORRUPT_DATA);
 			nvlist_free(label);
 			return (0);
 		}
 
 		/* Everything needed has been copied out of the label. */
 		nvlist_free(label);
 
 		/*
 		 * If this is a verbatim import, no need to check the
 		 * state of the pool.
 		 */
 		if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
 		    spa_load_state(spa) == SPA_LOAD_OPEN &&
 		    state != POOL_STATE_ACTIVE)
 			return (SET_ERROR(EBADF));
 
 		/*
 		 * If we were able to open and validate a vdev that was
 		 * previously marked permanently unavailable, clear that state
 		 * now.
 		 */
 		if (vd->vdev_not_present)
 			vd->vdev_not_present = 0;
 	}
 
 	return (0);
 }
 
 /*
  * Close a virtual device: invoke its vdev_op_close method, purge its vdev
  * cache, destroy the trim map of a leaf, and move the vdev to the OFFLINE
  * or CLOSED state.
  */
 void
 vdev_close(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_t *parent = vd->vdev_parent;
 
 	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
 
 	/*
 	 * A reopening parent makes its children reopen too, except for a
 	 * child that is on its way offline.
 	 */
 	if (parent != NULL && parent->vdev_reopening) {
 		vd->vdev_reopening =
 		    (parent->vdev_reopening && !vd->vdev_offline);
 	}
 
 	vd->vdev_ops->vdev_op_close(vd);
 	vdev_cache_purge(vd);
 	if (vd->vdev_ops->vdev_op_leaf)
 		trim_map_destroy(vd);
 
 	/*
 	 * Remember the state we had before closing.  A subsequent reopen()
 	 * uses it to avoid generating FMA ereports for a device that is
 	 * simply still faulted.
 	 */
 	vd->vdev_prevstate = vd->vdev_state;
 
 	vd->vdev_state = vd->vdev_offline ?
 	    VDEV_STATE_OFFLINE : VDEV_STATE_CLOSED;
 	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
 }
 
 /*
  * Recursively invoke the vdev_op_hold method on every leaf vdev at or
  * below 'vd'.  Does nothing while the pool state is still
  * POOL_STATE_UNINITIALIZED.
  */
 void
 vdev_hold(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	int c;
 
 	ASSERT(spa_is_root(spa));
 	if (spa->spa_state == POOL_STATE_UNINITIALIZED)
 		return;
 
 	/* Children first, then this vdev if it is a leaf. */
 	for (c = 0; c < vd->vdev_children; c++) {
 		vdev_hold(vd->vdev_child[c]);
 	}
 
 	if (vd->vdev_ops->vdev_op_leaf) {
 		vd->vdev_ops->vdev_op_hold(vd);
 	}
 }
 
 /*
  * Recursively invoke the vdev_op_rele method on every leaf vdev at or
  * below 'vd'.
  */
 void
 vdev_rele(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	int c;
 
 	ASSERT(spa_is_root(spa));
 
 	/* Children first, then this vdev if it is a leaf. */
 	for (c = 0; c < vd->vdev_children; c++) {
 		vdev_rele(vd->vdev_child[c]);
 	}
 
 	if (vd->vdev_ops->vdev_op_leaf) {
 		vd->vdev_ops->vdev_op_rele(vd);
 	}
 }
 
 /*
  * Reopen all interior vdevs and any unopened leaves.  Leaf vdevs that were
  * already open are not really reopened, since doing so might deadlock on
  * the spa_config_lock; only their physical size is obtained.  A leaf that
  * has never been opened is opened as usual.
  */
 void
 vdev_reopen(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
 
 	/* A vdev that is being taken offline is not flagged as reopening. */
 	vd->vdev_reopening = !vd->vdev_offline;
 	vdev_close(vd);
 	(void) vdev_open(vd);
 
 	/*
 	 * Validate after the open to make sure this is still the same
 	 * device; without this, a device carrying an invalid label could be
 	 * opened successfully by vdev_reopen().
 	 */
 	if (vd->vdev_aux == NULL) {
 		(void) vdev_validate(vd, B_TRUE);
 	} else {
 		(void) vdev_validate_aux(vd);
 
 		/* Add a usable l2cache device that is not yet registered. */
 		if (vdev_readable(vd) && vdev_writeable(vd) &&
 		    vd->vdev_aux == &spa->spa_l2cache &&
 		    !l2arc_vdev_present(vd)) {
 			l2arc_add_vdev(spa, vd);
 		}
 	}
 
 	/* Let the parent recompute its health from the new child state. */
 	vdev_propagate_state(vd);
 }
 
 /*
  * Open a vdev that is being created, load its DTLs and initialize its
  * labels.  'isreplacing' selects VDEV_LABEL_REPLACE instead of
  * VDEV_LABEL_CREATE for the label initialization.
  *
  * Returns 0 on success.  On any failure the vdev is closed again and an
  * errno value is returned.
  */
 int
 vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
 {
 	int err;
 
 	/*
 	 * A partial open (e.g. of a mirror) is normally acceptable, but a
 	 * create must fail if any component cannot be opened, so anything
 	 * short of HEALTHY is treated as an error here.
 	 */
 	err = vdev_open(vd);
 	if (err != 0 || vd->vdev_state != VDEV_STATE_HEALTHY) {
 		vdev_close(vd);
 		if (err == 0)
 			err = ENXIO;
 		return (err);
 	}
 
 	/*
 	 * Recursively load the DTLs and, if that worked, initialize all
 	 * of the labels.
 	 */
 	err = vdev_dtl_load(vd);
 	if (err == 0) {
 		err = vdev_label_init(vd, txg, isreplacing ?
 		    VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE);
 	}
 
 	if (err != 0)
 		vdev_close(vd);
 
 	return (err);
 }
 
 /*
  * Choose the metaslab size shift (vdev_ms_shift) for a vdev from its
  * allocatable size.
  */
 void
 vdev_metaslab_set_size(vdev_t *vd)
 {
 	/*
 	 * Target roughly 200 metaslabs per vdev, but never use a shift
 	 * smaller than SPA_MAXBLOCKSHIFT.
 	 */
 	vd->vdev_ms_shift = highbit64(vd->vdev_asize / 200);
 	if (vd->vdev_ms_shift < SPA_MAXBLOCKSHIFT)
 		vd->vdev_ms_shift = SPA_MAXBLOCKSHIFT;
 }
 
 /*
  * For top-level vdevs, raise the configured ashift towards the physical
  * ashift to maximize performance, within the administrator-defined
  * zfs_min_auto_ashift / zfs_max_auto_ashift limits and never below the
  * logical ashift.  Vdevs that are not top-level are left untouched.
  */
 void
 vdev_ashift_optimize(vdev_t *vd)
 {
 	if (vd != vd->vdev_top)
 		return;
 
 	if (vd->vdev_ashift >= vd->vdev_physical_ashift) {
 		/*
 		 * Unusual case: the logical ashift is not below the
 		 * physical ashift.  Capping at the max ashift here would
 		 * cause failures, so only the min ashift is enforced.
 		 */
 		vd->vdev_ashift = MAX(zfs_min_auto_ashift,
 		    vd->vdev_ashift);
 		return;
 	}
 
 	vd->vdev_ashift = MIN(
 	    MAX(zfs_max_auto_ashift, vd->vdev_ashift),
 	    MAX(zfs_min_auto_ashift, vd->vdev_physical_ashift));
 }
 
 /*
  * Mark a top-level vdev dirty for 'txg'.  'flags' must be a single bit:
  * with VDD_METASLAB, 'arg' is added to the vdev's metaslab list; with
  * VDD_DTL, 'arg' is added to its DTL list.  In all cases the vdev itself
  * is added to the pool's per-txg vdev list.
  */
 void
 vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	ASSERT(vd == vd->vdev_top);
 	ASSERT(!vd->vdev_ishole);
 	ASSERT(ISP2(flags));
 	ASSERT(spa_writeable(vd->vdev_spa));
 
 	if ((flags & VDD_METASLAB) != 0) {
 		(void) txg_list_add(&vd->vdev_ms_list, arg, txg);
 	}
 
 	if ((flags & VDD_DTL) != 0) {
 		(void) txg_list_add(&vd->vdev_dtl_list, arg, txg);
 	}
 
 	(void) txg_list_add(&spa->spa_vdev_txg_list, vd, txg);
 }
 
 /*
  * Walk the subtree rooted at 'vd' and, for every leaf found, dirty that
  * leaf's top-level vdev with the given flags, passing the leaf as the
  * argument.
  */
 void
 vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg)
 {
 	int c;
 
 	for (c = 0; c < vd->vdev_children; c++) {
 		vdev_dirty_leaves(vd->vdev_child[c], flags, txg);
 	}
 
 	if (vd->vdev_ops->vdev_op_leaf) {
 		vdev_dirty(vd->vdev_top, flags, vd, txg);
 	}
 }
 
 /*
  * DTLs.
  *
  * A vdev's DTL (dirty time log) is the set of transaction groups for which
  * the vdev has less than perfect replication.  There are four kinds of DTL:
  *
  * DTL_MISSING: txgs for which the vdev has no valid copies of the data
  *
  * DTL_PARTIAL: txgs for which data is available, but not fully replicated
  *
  * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
  *	scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
  *	txgs that was scrubbed.
  *
  * DTL_OUTAGE: txgs which cannot currently be read, whether due to
  *	persistent errors or just some device being offline.
  *	Unlike the other three, the DTL_OUTAGE map is not generally
  *	maintained; it's only computed when needed, typically to
  *	determine whether a device can be detached.
  *
  * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
  * either has the data or it doesn't.
  *
  * For interior vdevs such as mirror and RAID-Z the picture is more complex.
  * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
  * if any child is less than fully replicated, then so is its parent.
  * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
  * comprising only those txgs which appear in more than 'maxfaults' children;
  * those are the txgs we don't have enough replication to read.  For example,
  * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
  * thus, its DTL_MISSING consists of the set of txgs that appear in more than
  * two child DTL_MISSING maps.
  *
  * It should be clear from the above that to compute the DTLs and outage maps
  * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
  * Therefore, that is all we keep on disk.  When loading the pool, or after
  * a configuration change, we generate all other DTLs from first principles.
  */
 /*
  * Add the range [txg, txg + size) to the vdev's DTL of type 't', unless
  * the range tree already contains it.
  */
 void
 vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
 {
 	range_tree_t *tree = vd->vdev_dtl[t];
 
 	ASSERT(t < DTL_TYPES);
 	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
 	ASSERT(spa_writeable(vd->vdev_spa));
 
 	mutex_enter(tree->rt_lock);
 	if (range_tree_contains(tree, txg, size) == B_FALSE) {
 		range_tree_add(tree, txg, size);
 	}
 	mutex_exit(tree->rt_lock);
 }
 
 /*
  * Report whether the vdev's DTL of type 't' contains the range
  * [txg, txg + size).  An empty DTL never contains anything.
  */
 boolean_t
 vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
 {
 	range_tree_t *tree = vd->vdev_dtl[t];
 	boolean_t found;
 
 	ASSERT(t < DTL_TYPES);
 	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
 
 	mutex_enter(tree->rt_lock);
 	found = (range_tree_space(tree) == 0) ? B_FALSE :
 	    range_tree_contains(tree, txg, size);
 	mutex_exit(tree->rt_lock);
 
 	return (found);
 }
 
 /*
  * Report whether the vdev's DTL of type 't' covers no space at all.
  */
 boolean_t
 vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
 {
 	range_tree_t *tree = vd->vdev_dtl[t];
 	uint64_t space;
 
 	mutex_enter(tree->rt_lock);
 	space = range_tree_space(tree);
 	mutex_exit(tree->rt_lock);
 
 	return (space == 0);
 }
 
 /*
  * Returns the lower bound of the leaf vdev's DTL_MISSING range: one less
  * than the start of its first segment.  The caller must hold
  * vdev_dtl_lock and the DTL must not be empty.
  */
 static uint64_t
 vdev_dtl_min(vdev_t *vd)
 {
 	range_seg_t *first;
 
 	ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
 	ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
 	ASSERT0(vd->vdev_children);
 
 	first = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root);
 
 	return (first->rs_start - 1);
 }
 
 /*
  * Returns the upper bound of the leaf vdev's DTL_MISSING range: the end of
  * its last segment.  The caller must hold vdev_dtl_lock and the DTL must
  * not be empty.
  */
 static uint64_t
 vdev_dtl_max(vdev_t *vd)
 {
 	range_seg_t *last;
 
 	ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
 	ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
 	ASSERT0(vd->vdev_children);
 
 	last = avl_last(&vd->vdev_dtl[DTL_MISSING]->rt_root);
 
 	return (last->rs_end);
 }
 
 /*
  * Decide whether a resilvering leaf vdev may remove DTL entries from its
  * range.  A vdev that was resilvering for the whole duration of the scan
  * should excise that range from its DTLs; otherwise it is only partially
  * resilvered and must keep its DTL entries.  How the excision itself is
  * done is described in vdev_dtl_reassess().
  */
 static boolean_t
 vdev_dtl_should_excise(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
 
 	ASSERT0(scn->scn_phys.scn_errors);
 	ASSERT0(vd->vdev_children);
 
 	/* Not resilvering, or nothing missing: always eligible. */
 	if (vd->vdev_resilver_txg == 0)
 		return (B_TRUE);
 	if (range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0)
 		return (B_TRUE);
 
 	/*
 	 * When a resilver starts, the scan sets scn_max_txg to the highest
 	 * txg present in any DTL.  A device whose max DTL lies outside the
 	 * range (scn_min_txg, scn_max_txg] was not covered by this scan and
 	 * is therefore not eligible for excision.
 	 */
 	if (vdev_dtl_max(vd) > scn->scn_phys.scn_max_txg)
 		return (B_FALSE);
 
 	ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd));
 	ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg);
 	ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg);
 
 	return (B_TRUE);
 }
 
 /*
  * Reassess DTLs after a config change or scrub completion.
  *
  * Children are reassessed first (recursively).  Nothing further is done
  * for the root vdev, hole vdevs or aux vdevs.  For a leaf, the in-core
  * DTL_PARTIAL and DTL_OUTAGE maps are rebuilt from DTL_MISSING, and the
  * leaf is dirtied for 'txg' when 'txg' is non-zero.  For an interior vdev,
  * every DTL type except DTL_SCRUB is regenerated from the children's maps.
  *
  * 'scrub_txg', when non-zero, is the txg up to which a scan completed;
  * 'scrub_done' requests that the leaf DTL_SCRUB maps be emptied.
  */
 void
 vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
 {
 	spa_t *spa = vd->vdev_spa;
 	avl_tree_t reftree;
 	int minref;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
 
 	for (int c = 0; c < vd->vdev_children; c++)
 		vdev_dtl_reassess(vd->vdev_child[c], txg,
 		    scrub_txg, scrub_done);
 
 	if (vd == spa->spa_root_vdev || vd->vdev_ishole || vd->vdev_aux)
 		return;
 
 	if (vd->vdev_ops->vdev_op_leaf) {
 		dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
 
 		mutex_enter(&vd->vdev_dtl_lock);
 
 		/*
 		 * If we've completed a scan cleanly then determine
 		 * if this vdev should remove any DTLs. We only want to
 		 * excise regions on vdevs that were available during
 		 * the entire duration of this scan.
 		 */
 		if (scrub_txg != 0 &&
 		    (spa->spa_scrub_started ||
 		    (scn != NULL && scn->scn_phys.scn_errors == 0)) &&
 		    vdev_dtl_should_excise(vd)) {
 			/*
 			 * We completed a scrub up to scrub_txg.  If we
 			 * did it without rebooting, then the scrub dtl
 			 * will be valid, so excise the old region and
 			 * fold in the scrub dtl.  Otherwise, leave the
 			 * dtl as-is if there was an error.
 			 *
 			 * There's a little trick here: to excise the beginning
 			 * of the DTL_MISSING map, we put it into a reference
 			 * tree and then add a segment with refcnt -1 that
 			 * covers the range [0, scrub_txg).  This means
 			 * that each txg in that range has refcnt -1 or 0.
 			 * We then add DTL_SCRUB with a refcnt of 2, so that
 			 * entries in the range [0, scrub_txg) will have a
 			 * positive refcnt -- either 1 or 2.  We then convert
 			 * the reference tree into the new DTL_MISSING map.
 			 */
 			space_reftree_create(&reftree);
 			space_reftree_add_map(&reftree,
 			    vd->vdev_dtl[DTL_MISSING], 1);
 			space_reftree_add_seg(&reftree, 0, scrub_txg, -1);
 			space_reftree_add_map(&reftree,
 			    vd->vdev_dtl[DTL_SCRUB], 2);
 			space_reftree_generate_map(&reftree,
 			    vd->vdev_dtl[DTL_MISSING], 1);
 			space_reftree_destroy(&reftree);
 		}
 		/* For a leaf, DTL_PARTIAL is rebuilt as a copy of DTL_MISSING. */
 		range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
 		range_tree_walk(vd->vdev_dtl[DTL_MISSING],
 		    range_tree_add, vd->vdev_dtl[DTL_PARTIAL]);
 		if (scrub_done)
 			range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
 		/*
 		 * DTL_OUTAGE covers every txg when the leaf is unreadable;
 		 * otherwise it is a copy of DTL_MISSING.
 		 */
 		range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
 		if (!vdev_readable(vd))
 			range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
 		else
 			range_tree_walk(vd->vdev_dtl[DTL_MISSING],
 			    range_tree_add, vd->vdev_dtl[DTL_OUTAGE]);
 
 		/*
 		 * If the vdev was resilvering and no longer has any
-		 * DTLs then reset its resilvering flag.
+		 * DTLs then reset its resilvering flag and dirty
+		 * the top level so that we persist the change.
 		 */
 		if (vd->vdev_resilver_txg != 0 &&
 		    range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0 &&
-		    range_tree_space(vd->vdev_dtl[DTL_OUTAGE]) == 0)
+		    range_tree_space(vd->vdev_dtl[DTL_OUTAGE]) == 0) {
 			vd->vdev_resilver_txg = 0;
+			vdev_config_dirty(vd->vdev_top);
+		}
 
 		mutex_exit(&vd->vdev_dtl_lock);
 
 		if (txg != 0)
 			vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
 		return;
 	}
 
 	/*
 	 * Interior vdev: regenerate each DTL from the children's maps,
 	 * keeping only the txgs referenced by at least 'minref' children.
 	 */
 	mutex_enter(&vd->vdev_dtl_lock);
 	for (int t = 0; t < DTL_TYPES; t++) {
 		/* account for child's outage in parent's missing map */
 		int s = (t == DTL_MISSING) ? DTL_OUTAGE: t;
 		if (t == DTL_SCRUB)
 			continue;			/* leaf vdevs only */
 		if (t == DTL_PARTIAL)
 			minref = 1;			/* i.e. non-zero */
 		else if (vd->vdev_nparity != 0)
 			minref = vd->vdev_nparity + 1;	/* RAID-Z */
 		else
 			minref = vd->vdev_children;	/* any kind of mirror */
 		space_reftree_create(&reftree);
 		for (int c = 0; c < vd->vdev_children; c++) {
 			vdev_t *cvd = vd->vdev_child[c];
 			mutex_enter(&cvd->vdev_dtl_lock);
 			space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1);
 			mutex_exit(&cvd->vdev_dtl_lock);
 		}
 		space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref);
 		space_reftree_destroy(&reftree);
 	}
 	mutex_exit(&vd->vdev_dtl_lock);
 }
 
 /*
  * Load the on-disk DTL.  For a leaf vdev that has a DTL object, open its
  * space map and load it into the in-core DTL_MISSING range tree.  For any
  * other vdev, recurse into the children and stop at the first failure.
  *
  * Returns 0 on success or the error from the failing step.
  */
 int
 vdev_dtl_load(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa->spa_meta_objset;
 	int err = 0;
 	int c;
 
 	if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_dtl_object == 0) {
 		for (c = 0; c < vd->vdev_children; c++) {
 			err = vdev_dtl_load(vd->vdev_child[c]);
 			if (err != 0)
 				break;
 		}
 		return (err);
 	}
 
 	ASSERT(!vd->vdev_ishole);
 
 	err = space_map_open(&vd->vdev_dtl_sm, mos,
 	    vd->vdev_dtl_object, 0, -1ULL, 0, &vd->vdev_dtl_lock);
 	if (err != 0)
 		return (err);
 	ASSERT(vd->vdev_dtl_sm != NULL);
 
 	mutex_enter(&vd->vdev_dtl_lock);
 
 	/*
 	 * The space map has just been opened, so bring it up to date
 	 * before loading it into the in-core DTL.
 	 */
 	space_map_update(vd->vdev_dtl_sm);
 
 	err = space_map_load(vd->vdev_dtl_sm,
 	    vd->vdev_dtl[DTL_MISSING], SM_ALLOC);
 
 	mutex_exit(&vd->vdev_dtl_lock);
 
 	return (err);
 }
 
 /*
  * Write a leaf vdev's DTL_MISSING range tree to its on-disk space map as
  * part of 'txg'.
  *
  * If the vdev has been detached, or its top-level vdev is being removed,
  * the space map is freed and closed instead.  If no space map is open yet,
  * a new one is allocated first.  When the space map's object number ends
  * up different from the one seen on entry, the top-level vdev's config is
  * dirtied so that the change is recorded.
  */
 void
 vdev_dtl_sync(vdev_t *vd, uint64_t txg)
 {
 	spa_t *spa = vd->vdev_spa;
 	range_tree_t *rt = vd->vdev_dtl[DTL_MISSING];
 	objset_t *mos = spa->spa_meta_objset;
 	range_tree_t *rtsync;
 	kmutex_t rtlock;
 	dmu_tx_t *tx;
 	/* Object number on entry; compared below to detect a change. */
 	uint64_t object = space_map_object(vd->vdev_dtl_sm);
 
 	ASSERT(!vd->vdev_ishole);
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
 
 	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
 
 	if (vd->vdev_detached || vd->vdev_top->vdev_removing) {
 		mutex_enter(&vd->vdev_dtl_lock);
 		space_map_free(vd->vdev_dtl_sm, tx);
 		space_map_close(vd->vdev_dtl_sm);
 		vd->vdev_dtl_sm = NULL;
 		mutex_exit(&vd->vdev_dtl_lock);
 		dmu_tx_commit(tx);
 		return;
 	}
 
 	if (vd->vdev_dtl_sm == NULL) {
 		uint64_t new_object;
 
 		new_object = space_map_alloc(mos, tx);
 		VERIFY3U(new_object, !=, 0);
 
 		VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object,
 		    0, -1ULL, 0, &vd->vdev_dtl_lock));
 		ASSERT(vd->vdev_dtl_sm != NULL);
 	}
 
 	/*
 	 * Build a private copy of the DTL, protected by its own local
 	 * lock.  vdev_dtl_lock is only held while the copy is made, not
 	 * while the space map is written.
 	 */
 	bzero(&rtlock, sizeof(rtlock));
 	mutex_init(&rtlock, NULL, MUTEX_DEFAULT, NULL);
 
 	rtsync = range_tree_create(NULL, NULL, &rtlock);
 
 	mutex_enter(&rtlock);
 
 	mutex_enter(&vd->vdev_dtl_lock);
 	range_tree_walk(rt, range_tree_add, rtsync);
 	mutex_exit(&vd->vdev_dtl_lock);
 
 	/* Replace the on-disk contents with the copy, then discard it. */
 	space_map_truncate(vd->vdev_dtl_sm, tx);
 	space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, tx);
 	range_tree_vacate(rtsync, NULL, NULL);
 
 	range_tree_destroy(rtsync);
 
 	mutex_exit(&rtlock);
 	mutex_destroy(&rtlock);
 
 	/*
 	 * If the object for the space map has changed then dirty
 	 * the top level so that we update the config.
 	 */
 	if (object != space_map_object(vd->vdev_dtl_sm)) {
 		zfs_dbgmsg("txg %llu, spa %s, DTL old object %llu, "
 		    "new object %llu", txg, spa_name(spa), object,
 		    space_map_object(vd->vdev_dtl_sm));
 		vdev_config_dirty(vd->vdev_top);
 	}
 
 	dmu_tx_commit(tx);
 
 	mutex_enter(&vd->vdev_dtl_lock);
 	space_map_update(vd->vdev_dtl_sm);
 	mutex_exit(&vd->vdev_dtl_lock);
 }
 
 /*
  * Determine whether the specified vdev is required, i.e. whether it
  * cannot be offlined/detached/removed without losing data.  The root vdev
  * and top-level vdevs are always reported as required.
  */
 boolean_t
 vdev_dtl_required(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_t *top = vd->vdev_top;
 	uint8_t saved_cant_read = vd->vdev_cant_read;
 	boolean_t needed;
 
 	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
 
 	if (vd == spa->spa_root_vdev || vd == top)
 		return (B_TRUE);
 
 	/*
 	 * Pretend for a moment that the device cannot be read and see
 	 * whether the top-level vdev then has any DTL outage.  If it has
 	 * none, the device can safely go away.  The original flag is
 	 * restored and the DTLs are reassessed again afterwards.
 	 */
 	vd->vdev_cant_read = B_TRUE;
 	vdev_dtl_reassess(top, 0, 0, B_FALSE);
 	needed = !vdev_dtl_empty(top, DTL_OUTAGE);
 	vd->vdev_cant_read = saved_cant_read;
 	vdev_dtl_reassess(top, 0, 0, B_FALSE);
 
 	/* Fault injection may still force the device to look required. */
 	if (!needed) {
 		if (zio_injection_enabled) {
 			needed = !!zio_handle_device_injection(vd, NULL,
 			    ECHILD);
 		}
 	}
 
 	return (needed);
 }
 
 /*
  * Determine if resilver is needed, and if so the txg range.
  *
  * A vdev without children needs a resilver when its DTL_MISSING map is
  * non-empty and the vdev is writeable.  A vdev with children needs one
  * when any child does, and its range spans the ranges of those children.
  *
  * Returns B_TRUE when a resilver is needed.  In that case the lower bound
  * is stored through 'minp' and the upper bound through 'maxp'; either
  * pointer may be NULL if the caller does not want that value.  When
  * B_FALSE is returned neither pointer is written.
  */
 boolean_t
 vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
 {
 	boolean_t needed = B_FALSE;
 	uint64_t thismin = UINT64_MAX;
 	uint64_t thismax = 0;
 
 	if (vd->vdev_children == 0) {
 		mutex_enter(&vd->vdev_dtl_lock);
 		if (range_tree_space(vd->vdev_dtl[DTL_MISSING]) != 0 &&
 		    vdev_writeable(vd)) {
 
 			thismin = vdev_dtl_min(vd);
 			thismax = vdev_dtl_max(vd);
 			needed = B_TRUE;
 		}
 		mutex_exit(&vd->vdev_dtl_lock);
 	} else {
 		for (int c = 0; c < vd->vdev_children; c++) {
 			vdev_t *cvd = vd->vdev_child[c];
 			uint64_t cmin, cmax;
 
 			/* cmin/cmax are only valid when the call succeeds. */
 			if (vdev_resilver_needed(cvd, &cmin, &cmax)) {
 				thismin = MIN(thismin, cmin);
 				thismax = MAX(thismax, cmax);
 				needed = B_TRUE;
 			}
 		}
 	}
 
 	/*
 	 * Check each output pointer on its own, so that a caller passing
 	 * only one of them does not cause a NULL dereference.
 	 */
 	if (needed) {
 		if (minp != NULL)
 			*minp = thismin;
 		if (maxp != NULL)
 			*maxp = thismax;
 	}
 	return (needed);
 }
 
 /*
  * Load a vdev tree: recursively load all children, initialize the
  * metaslabs of top-level vdevs, and load the DTL of leaf vdevs.  A vdev
  * for which any of this fails is put into the CANT_OPEN state with
  * VDEV_AUX_CORRUPT_DATA.
  */
 void
 vdev_load(vdev_t *vd)
 {
 	int c;
 
 	/* Children are loaded before the vdev itself. */
 	for (c = 0; c < vd->vdev_children; c++) {
 		vdev_load(vd->vdev_child[c]);
 	}
 
 	/*
 	 * A top-level vdev that is not a hole needs a non-zero ashift and
 	 * asize, and must be able to initialize its metaslabs.
 	 */
 	if (vd == vd->vdev_top && !vd->vdev_ishole) {
 		if (vd->vdev_ashift == 0 || vd->vdev_asize == 0 ||
 		    vdev_metaslab_init(vd, 0) != 0) {
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_CORRUPT_DATA);
 		}
 	}
 
 	/* A leaf vdev must be able to load its DTL. */
 	if (vd->vdev_ops->vdev_op_leaf) {
 		if (vdev_dtl_load(vd) != 0) {
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_CORRUPT_DATA);
 		}
 	}
 }
 
 /*
  * Validation for the special vdevs: hot spares and l2cache devices.  Its
  * sole purpose is to set the vdev state for the associated vdev.  Once
  * the underlying device is known to be readable, the label is read and
  * checked for sanity and for not having been repurposed to another pool.
  *
  * Returns 0 if the vdev is unreadable or the label checks out, and -1
  * (with the vdev state set to CANT_OPEN / CORRUPT_DATA) otherwise.
  */
 int
 vdev_validate_aux(vdev_t *vd)
 {
 	nvlist_t *label;
 	uint64_t guid, version;
 	uint64_t state;
 
 	if (!vdev_readable(vd))
 		return (0);
 
 	label = vdev_label_read_config(vd, -1ULL);
 	if (label == NULL) {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		return (-1);
 	}
 
 	/*
 	 * The label is good when it has a supported version, a guid equal
 	 * to ours, and a pool state.  The pool state value itself is not
 	 * checked here: if the device is in fact in use by another pool,
 	 * that is discovered on the fly when requested.
 	 */
 	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) == 0 &&
 	    SPA_VERSION_IS_SUPPORTED(version) &&
 	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0 &&
 	    guid == vd->vdev_guid &&
 	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) == 0) {
 		nvlist_free(label);
 		return (0);
 	}
 
 	vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 	    VDEV_AUX_CORRUPT_DATA);
 	nvlist_free(label);
 	return (-1);
 }
 
 /*
  * Free the on-disk metadata that belongs to a vdev, as part of 'txg': the
  * space map of every metaslab that has one, and the metaslab array object.
  * Each space map is verified to have nothing allocated before it is freed.
  */
 void
 vdev_remove(vdev_t *vd, uint64_t txg)
 {
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa->spa_meta_objset;
 	dmu_tx_t *tx;
 
 	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
 
 	if (vd->vdev_ms != NULL) {
 		metaslab_group_t *mg = vd->vdev_mg;
 
 		metaslab_group_histogram_verify(mg);
 		metaslab_class_histogram_verify(mg->mg_class);
 
 		for (int m = 0; m < vd->vdev_ms_count; m++) {
 			metaslab_t *msp = vd->vdev_ms[m];
 
 			/* Skip slots with no metaslab or no space map. */
 			if (msp == NULL || msp->ms_sm == NULL)
 				continue;
 
 			mutex_enter(&msp->ms_lock);
 			/*
 			 * If the metaslab was not loaded when the vdev
 			 * was removed then the histogram accounting may
 			 * not be accurate. Update the histogram information
 			 * here so that we ensure that the metaslab group
 			 * and metaslab class are up-to-date.
 			 */
 			metaslab_group_histogram_remove(mg, msp);
 
 			VERIFY0(space_map_allocated(msp->ms_sm));
 			space_map_free(msp->ms_sm, tx);
 			space_map_close(msp->ms_sm);
 			msp->ms_sm = NULL;
 			mutex_exit(&msp->ms_lock);
 		}
 
 		/* With every metaslab removed the group histogram is empty. */
 		metaslab_group_histogram_verify(mg);
 		metaslab_class_histogram_verify(mg->mg_class);
 		for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
 			ASSERT0(mg->mg_histogram[i]);
 
 	}
 
 	/* Free the metaslab array object itself, if there is one. */
 	if (vd->vdev_ms_array) {
 		(void) dmu_object_free(mos, vd->vdev_ms_array, tx);
 		vd->vdev_ms_array = 0;
 	}
 	dmu_tx_commit(tx);
 }
 
 /*
  * Finish syncing 'txg' for this vdev: call metaslab_sync_done() on every
  * metaslab queued on the TXG_CLEAN(txg) slot of vdev_ms_list, then reassess
  * the metaslab group if that slot was non-empty on entry.
  */
 void
 vdev_sync_done(vdev_t *vd, uint64_t txg)
 {
 	metaslab_t *msp;
 	/* Sample emptiness now; the loop below drains the list. */
 	boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));
 
 	ASSERT(!vd->vdev_ishole);
 
 	/*
 	 * Compare against NULL explicitly so the assignment in the loop
 	 * condition is clearly intentional (matches vdev_sync()).
 	 */
 	while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
 	    != NULL)
 		metaslab_sync_done(msp, txg);
 
 	if (reassess)
 		metaslab_sync_reassess(vd->vdev_mg);
 }
 
 /*
  * Sync a vdev for the given txg.  In order, this: allocates the metaslab
  * array object if the vdev has a metaslab shift but no array yet; removes
  * the vdev's metadata if it is being removed and has no allocated space;
  * syncs every metaslab queued on vdev_ms_list for this txg and requeues it
  * on the TXG_CLEAN(txg) slot; syncs every vdev queued on vdev_dtl_list for
  * this txg; and finally queues the vdev itself on the spa's vdev txg list
  * under TXG_CLEAN(txg).
  */
 void
 vdev_sync(vdev_t *vd, uint64_t txg)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_t *lvd;
 	metaslab_t *msp;
 	dmu_tx_t *tx;
 
 	ASSERT(!vd->vdev_ishole);
 
 	/*
 	 * Allocate the metaslab array object on first use and dirty the
 	 * config so the new object number gets written out.
 	 */
 	if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) {
 		ASSERT(vd == vd->vdev_top);
 		tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
 		vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
 		    DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
 		ASSERT(vd->vdev_ms_array != 0);
 		vdev_config_dirty(vd);
 		dmu_tx_commit(tx);
 	}
 
 	/*
 	 * Remove the metadata associated with this vdev once it's empty.
 	 */
 	if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing)
 		vdev_remove(vd, txg);
 
 	/*
 	 * Sync each metaslab queued for this txg, then requeue it on the
 	 * TXG_CLEAN(txg) slot, which is the slot vdev_sync_done() drains.
 	 */
 	while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
 		metaslab_sync(msp, txg);
 		(void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
 	}
 
 	while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
 		vdev_dtl_sync(lvd, txg);
 
 	(void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
 }
 
 /*
  * Convert a physical size into an allocated size by delegating to the
  * vdev_op_asize callback of this vdev's ops vector.
  */
 uint64_t
 vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
 {
 	uint64_t asize;
 
 	asize = vd->vdev_ops->vdev_op_asize(vd, psize);
 
 	return (asize);
 }
 
 /*
  * Mark the given vdev faulted.  A faulted vdev behaves as if the device could
  * not be opened, and no I/O is attempted.
  *
  * 'guid' identifies the vdev and 'aux' records why it is being faulted.
  * The function finishes through spa_vdev_state_exit() with ENODEV if no
  * vdev has that guid, ENOTSUP if the vdev is not a leaf, and 0 otherwise.
  */
 int
 vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
 {
 	vdev_t *vd, *tvd;
 
 	spa_vdev_state_enter(spa, SCL_NONE);
 
 	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
 		return (spa_vdev_state_exit(spa, NULL, ENODEV));
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
 
 	tvd = vd->vdev_top;
 
 	/*
 	 * We don't directly use the aux state here, but if we do a
 	 * vdev_reopen(), we need this value to be present to remember why we
 	 * were faulted.
 	 */
 	vd->vdev_label_aux = aux;
 
 	/*
 	 * Faulted state takes precedence over degraded.
 	 */
 	vd->vdev_delayed_close = B_FALSE;
 	vd->vdev_faulted = 1ULL;
 	vd->vdev_degraded = 0ULL;
 	vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux);
 
 	/*
 	 * If this device has the only valid copy of the data, then
 	 * back off and simply mark the vdev as degraded instead.
 	 * Log and aux devices are exempt from this check.
 	 */
 	if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) {
 		vd->vdev_degraded = 1ULL;
 		vd->vdev_faulted = 0ULL;
 
 		/*
 		 * If we reopen the device and it's not dead, only then do we
 		 * mark it degraded.
 		 */
 		vdev_reopen(tvd);
 
 		if (vdev_readable(vd))
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux);
 	}
 
 	return (spa_vdev_state_exit(spa, vd, 0));
 }
 
 /*
  * Flag the leaf vdev identified by 'guid' as degraded.  Degraded is only a
  * hint to the user that something is wrong; as far as I/O is concerned the
  * vdev keeps operating normally.  'aux' is the reason passed on to
  * vdev_set_state().
  */
 int
 vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux)
 {
 	vdev_t *leaf;
 
 	spa_vdev_state_enter(spa, SCL_NONE);
 
 	leaf = spa_lookup_by_guid(spa, guid, B_TRUE);
 	if (leaf == NULL)
 		return (spa_vdev_state_exit(spa, NULL, ENODEV));
 
 	if (!leaf->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
 
 	/*
 	 * Nothing to do if the vdev is already faulted or degraded.
 	 */
 	if (leaf->vdev_faulted || leaf->vdev_degraded)
 		return (spa_vdev_state_exit(spa, NULL, 0));
 
 	leaf->vdev_degraded = 1ULL;
 
 	/* Only a live vdev has its visible state changed. */
 	if (!vdev_is_dead(leaf)) {
 		vdev_set_state(leaf, B_FALSE, VDEV_STATE_DEGRADED, aux);
 	}
 
 	return (spa_vdev_state_exit(spa, leaf, 0));
 }
 
 /*
  * Online the given vdev.
  *
  * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things.  First, any attached
  * spare device should be detached when the device finishes resilvering.
  * Second, the online should be treated like a 'test' online case, so no FMA
  * events are generated if the device fails to open.
  *
  * If 'newstate' is non-NULL it receives the vdev's state as it is after the
  * top-level vdev has been reopened.  The function finishes through
  * spa_vdev_state_exit() with ENODEV if no vdev has the given guid and
  * ENOTSUP if the vdev is not a leaf, or if expansion applies to an aux vdev.
  */
 int
 vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
 {
 	vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev;
 
 	spa_vdev_state_enter(spa, SCL_NONE);
 
 	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
 		return (spa_vdev_state_exit(spa, NULL, ENODEV));
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
 
 	tvd = vd->vdev_top;
 	vd->vdev_offline = B_FALSE;
 	vd->vdev_tmpoffline = B_FALSE;
 	/* These two flags only apply to the reopen below. */
 	vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE);
 	vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT);
 
 	/* XXX - L2ARC 1.0 does not support expansion */
 	if (!vd->vdev_aux) {
 		/* Mark the vdev and each ancestor below the root. */
 		for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
 			pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND);
 	}
 
 	vdev_reopen(tvd);
 	vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;
 
 	/* The expanding flag is likewise cleared once the reopen is done. */
 	if (!vd->vdev_aux) {
 		for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
 			pvd->vdev_expanding = B_FALSE;
 	}
 
 	if (newstate)
 		*newstate = vd->vdev_state;
 	/*
 	 * Request an unspare only when the vdev came back alive and it is
 	 * the first child of a spare vdev.
 	 */
 	if ((flags & ZFS_ONLINE_UNSPARE) &&
 	    !vdev_is_dead(vd) && vd->vdev_parent &&
 	    vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
 	    vd->vdev_parent->vdev_child[0] == vd)
 		vd->vdev_unspare = B_TRUE;
 
 	if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) {
 
 		/* XXX - L2ARC 1.0 does not support expansion */
 		if (vd->vdev_aux)
 			return (spa_vdev_state_exit(spa, vd, ENOTSUP));
 		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 	}
 	return (spa_vdev_state_exit(spa, vd, 0));
 }
 
 /*
  * Take the leaf vdev identified by 'guid' offline.  vdev_offline() calls
  * this with spa_vdev_top_lock held.  The request is refused with EBUSY if
  * the device holds the only valid copy of some data, or if offlining it
  * leaves a non-log, non-aux top-level vdev dead.  ZFS_OFFLINE_TEMPORARY in
  * 'flags' is recorded in vdev_tmpoffline.  Other failures are ENODEV (no
  * vdev with that guid), ENOTSUP (not a leaf) or the error returned by
  * spa_offline_log().
  */
 static int
 vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
 {
 	vdev_t *vd, *tvd;
 	int error = 0;
 	uint64_t generation;
 	metaslab_group_t *mg;
 
 top:
 	spa_vdev_state_enter(spa, SCL_ALLOC);
 
 	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
 		return (spa_vdev_state_exit(spa, NULL, ENODEV));
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
 
 	tvd = vd->vdev_top;
 	mg = tvd->vdev_mg;
 	/*
 	 * Generation expected after the slog path below exits and
 	 * re-enters the state lock; any other value means the config
 	 * changed in between.
 	 */
 	generation = spa->spa_config_generation + 1;
 
 	/*
 	 * If the device isn't already offline, try to offline it.
 	 */
 	if (!vd->vdev_offline) {
 		/*
 		 * If this device has the only valid copy of some data,
 		 * don't allow it to be offlined. Log devices are always
 		 * expendable.
 		 */
 		if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
 		    vdev_dtl_required(vd))
 			return (spa_vdev_state_exit(spa, NULL, EBUSY));
 
 		/*
 		 * If the top-level is a slog and it has had allocations
 		 * then proceed.  We check that the vdev's metaslab group
 		 * is not NULL since it's possible that we may have just
 		 * added this vdev but not yet initialized its metaslabs.
 		 */
 		if (tvd->vdev_islog && mg != NULL) {
 			/*
 			 * Prevent any future allocations.
 			 */
 			metaslab_group_passivate(mg);
 			(void) spa_vdev_state_exit(spa, vd, 0);
 
 			error = spa_offline_log(spa);
 
 			spa_vdev_state_enter(spa, SCL_ALLOC);
 
 			/*
 			 * Check to see if the config has changed.  On an
 			 * error we give up; on a config change we start
 			 * over from the top.
 			 */
 			if (error || generation != spa->spa_config_generation) {
 				metaslab_group_activate(mg);
 				if (error)
 					return (spa_vdev_state_exit(spa,
 					    vd, error));
 				(void) spa_vdev_state_exit(spa, vd, 0);
 				goto top;
 			}
 			ASSERT0(tvd->vdev_stat.vs_alloc);
 		}
 
 		/*
 		 * Offline this device and reopen its top-level vdev.
 		 * If the top-level vdev is a log device then just offline
 		 * it. Otherwise, if this action results in the top-level
 		 * vdev becoming unusable, undo it and fail the request.
 		 */
 		vd->vdev_offline = B_TRUE;
 		vdev_reopen(tvd);
 
 		if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
 		    vdev_is_dead(tvd)) {
 			vd->vdev_offline = B_FALSE;
 			vdev_reopen(tvd);
 			return (spa_vdev_state_exit(spa, NULL, EBUSY));
 		}
 
 		/*
 		 * Add the device back into the metaslab rotor so that
 		 * once we online the device it's open for business.
 		 */
 		if (tvd->vdev_islog && mg != NULL)
 			metaslab_group_activate(mg);
 	}
 
 	vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);
 
 	return (spa_vdev_state_exit(spa, vd, 0));
 }
 
 /*
  * Offline the vdev identified by 'guid'.  This is a thin wrapper that runs
  * vdev_offline_locked() while holding spa_vdev_top_lock and hands back its
  * result unchanged.
  */
 int
 vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
 {
 	int rc;
 
 	mutex_enter(&spa->spa_vdev_top_lock);
 	rc = vdev_offline_locked(spa, guid, flags);
 	mutex_exit(&spa->spa_vdev_top_lock);
 
 	return (rc);
 }
 
 /*
  * Clear the error counts associated with this vdev.  Unlike vdev_online() and
  * vdev_offline(), we assume the spa config is locked.  We also clear all
  * children.  If 'vd' is NULL, then the user wants to clear all vdevs.
  */
 void
 vdev_clear(spa_t *spa, vdev_t *vd)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 
 	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
 
 	if (vd == NULL)
 		vd = rvd;
 
 	vd->vdev_stat.vs_read_errors = 0;
 	vd->vdev_stat.vs_write_errors = 0;
 	vd->vdev_stat.vs_checksum_errors = 0;
 
 	for (int c = 0; c < vd->vdev_children; c++)
 		vdev_clear(spa, vd->vdev_child[c]);
 
 	/*
 	 * The l2cache and spare vdevs are reached through the spa rather
 	 * than through the root's children, so clear them explicitly.
 	 */
 	if (vd == rvd) {
 		for (int c = 0; c < spa->spa_l2cache.sav_count; c++)
 			vdev_clear(spa, spa->spa_l2cache.sav_vdevs[c]);
 
 		for (int c = 0; c < spa->spa_spares.sav_count; c++)
 			vdev_clear(spa, spa->spa_spares.sav_vdevs[c]);
 	}
 
 	/*
 	 * If we're in the FAULTED state or have experienced failed I/O, then
 	 * clear the persistent state and attempt to reopen the device.  We
 	 * also mark the vdev config dirty, so that the new faulted state is
 	 * written out to disk.
 	 */
 	if (vd->vdev_faulted || vd->vdev_degraded ||
 	    !vdev_readable(vd) || !vdev_writeable(vd)) {
 
 		/*
 		 * When reopening in response to a clear event, it may be due
 		 * to a fmadm repair request.  In this case, if the device is
 		 * still broken, we want to still post the ereport again.
 		 */
 		vd->vdev_forcefault = B_TRUE;
 
 		vd->vdev_faulted = vd->vdev_degraded = 0ULL;
 		vd->vdev_cant_read = B_FALSE;
 		vd->vdev_cant_write = B_FALSE;
 
 		vdev_reopen(vd == rvd ? rvd : vd->vdev_top);
 
 		vd->vdev_forcefault = B_FALSE;
 
 		if (vd != rvd && vdev_writeable(vd->vdev_top))
 			vdev_state_dirty(vd->vdev_top);
 
 		/* Aux vdevs and dead vdevs do not trigger a resilver. */
 		if (vd->vdev_aux == NULL && !vdev_is_dead(vd))
 			spa_async_request(spa, SPA_ASYNC_RESILVER);
 
 		spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR);
 	}
 
 	/*
 	 * When clearing a FMA-diagnosed fault, we always want to
 	 * unspare the device, as we assume that the original spare was
 	 * done in response to the FMA fault.
 	 */
 	if (!vdev_is_dead(vd) && vd->vdev_parent != NULL &&
 	    vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
 	    vd->vdev_parent->vdev_child[0] == vd)
 		vd->vdev_unspare = B_TRUE;
 }
 
 /*
  * Report whether a vdev counts as "dead".  Besides any vdev whose state is
  * below VDEV_STATE_DEGRADED, holes and missing devices are always treated
  * as dead.  That keeps the various code paths free of special checks for
  * those device types: they simply skip dead devices before issuing I/O.
  */
 boolean_t
 vdev_is_dead(vdev_t *vd)
 {
 	if (vd->vdev_state < VDEV_STATE_DEGRADED)
 		return (B_TRUE);
 
 	if (vd->vdev_ishole)
 		return (B_TRUE);
 
 	return (vd->vdev_ops == &vdev_missing_ops);
 }
 
 /*
  * A vdev is readable when it is not dead and its vdev_cant_read flag is
  * clear.
  */
 boolean_t
 vdev_readable(vdev_t *vd)
 {
 	if (vdev_is_dead(vd))
 		return (B_FALSE);
 
 	return (!vd->vdev_cant_read);
 }
 
 /*
  * A vdev is writeable when it is not dead and its vdev_cant_write flag is
  * clear.
  */
 boolean_t
 vdev_writeable(vdev_t *vd)
 {
 	if (vdev_is_dead(vd))
 		return (B_FALSE);
 
 	return (!vd->vdev_cant_write);
 }
 
 /*
  * Decide whether allocations may be made from this vdev.
  */
 boolean_t
 vdev_allocatable(vdev_t *vd)
 {
 	/*
 	 * The state is read once into a local: it changes atomically, but
 	 * two separate questions are asked about it below and both must see
 	 * the same value.
 	 */
 	uint64_t snapshot = vd->vdev_state;
 	boolean_t state_ok;
 
 	/*
 	 * Vdevs that may be in the process of reopening (i.e. in
 	 * VDEV_STATE_CLOSED) are currently still allowed.  Should the
 	 * reopen fail, that is caught later while the proper locks are
 	 * held.
 	 */
 	state_ok = (snapshot >= VDEV_STATE_DEGRADED ||
 	    snapshot == VDEV_STATE_CLOSED);
 
 	return (state_ok && !vd->vdev_cant_write && !vd->vdev_ishole);
 }
 
 /*
  * Decide whether 'zio' may be issued to 'vd'.  Dead vdevs and vdevs with a
  * pending removal request are never accessible; otherwise reads and writes
  * are gated by vdev_cant_read and vdev_cant_write respectively, and every
  * other I/O type is allowed.
  */
 boolean_t
 vdev_accessible(vdev_t *vd, zio_t *zio)
 {
 	ASSERT(zio->io_vd == vd);
 
 	if (vdev_is_dead(vd) || vd->vdev_remove_wanted)
 		return (B_FALSE);
 
 	switch (zio->io_type) {
 	case ZIO_TYPE_READ:
 		return (!vd->vdev_cant_read);
 	case ZIO_TYPE_WRITE:
 		return (!vd->vdev_cant_write);
 	default:
 		return (B_TRUE);
 	}
 }
 
 /*
  * Get statistics for the given vdev.
  *
  * Copies vd->vdev_stat into 'vs' under vdev_stat_lock and then fills in the
  * derived fields: elapsed time since the stored timestamp, current state,
  * sizes, ashift values and, for top-level non-aux, non-hole vdevs that have
  * a metaslab group, the fragmentation.  For the root vdev the per-type op
  * and byte counts of every top-level child are added in as well.
  */
 void
 vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
 
 	mutex_enter(&vd->vdev_stat_lock);
 	bcopy(&vd->vdev_stat, vs, sizeof (*vs));
 	vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
 	vs->vs_state = vd->vdev_state;
 	vs->vs_rsize = vdev_get_min_asize(vd);
 	/* Leaves additionally account for the front and back label areas. */
 	if (vd->vdev_ops->vdev_op_leaf)
 		vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
 	vs->vs_esize = vd->vdev_max_asize - vd->vdev_asize;
 	vs->vs_configured_ashift = vd->vdev_top != NULL
 	    ? vd->vdev_top->vdev_ashift : vd->vdev_ashift;
 	vs->vs_logical_ashift = vd->vdev_logical_ashift;
 	vs->vs_physical_ashift = vd->vdev_physical_ashift;
 
 	/*
 	 * A top-level vdev may not have a metaslab group yet (for example
 	 * when it was just added and its metaslabs are not initialized), so
 	 * only read the fragmentation when the group exists.  Otherwise
 	 * 'vs' keeps the value copied from vdev_stat above.
 	 */
 	if (vd->vdev_aux == NULL && vd == vd->vdev_top && !vd->vdev_ishole &&
 	    vd->vdev_mg != NULL) {
 		vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation;
 	}
 
 	/*
 	 * If we're getting stats on the root vdev, aggregate the I/O counts
 	 * over all top-level vdevs (i.e. the direct children of the root).
 	 */
 	if (vd == rvd) {
 		for (int c = 0; c < rvd->vdev_children; c++) {
 			vdev_t *cvd = rvd->vdev_child[c];
 			vdev_stat_t *cvs = &cvd->vdev_stat;
 
 			for (int t = 0; t < ZIO_TYPES; t++) {
 				vs->vs_ops[t] += cvs->vs_ops[t];
 				vs->vs_bytes[t] += cvs->vs_bytes[t];
 			}
 			/* Refresh the child's own "removing" stat as we go. */
 			cvs->vs_scan_removing = cvd->vdev_removing;
 		}
 	}
 	mutex_exit(&vd->vdev_stat_lock);
 }
 
 /*
  * Zero the space accounting (allocated, total and deflated space) kept in
  * this vdev's stats.  The three counters are reset together while holding
  * vdev_stat_lock.
  */
 void
 vdev_clear_stats(vdev_t *vd)
 {
 	mutex_enter(&vd->vdev_stat_lock);
 
 	vd->vdev_stat.vs_alloc = 0;
 	vd->vdev_stat.vs_space = 0;
 	vd->vdev_stat.vs_dspace = 0;
 
 	mutex_exit(&vd->vdev_stat_lock);
 }
 
 /*
  * Reset the scan-processed byte count of this vdev and of every vdev below
  * it.  Children are handled first; each vdev's counter is cleared under its
  * own vdev_stat_lock.
  */
 void
 vdev_scan_stat_init(vdev_t *vd)
 {
 	int child = 0;
 
 	while (child < vd->vdev_children) {
 		vdev_scan_stat_init(vd->vdev_child[child]);
 		child++;
 	}
 
 	mutex_enter(&vd->vdev_stat_lock);
 	vd->vdev_stat.vs_scan_processed = 0;
 	mutex_exit(&vd->vdev_stat_lock);
 }
 
 /*
  * Update vdev statistics for a completed zio.  'psize' is the byte count
  * credited to the I/O.  On success the op and byte counters of the zio's
  * vdev are bumped, along with the scan and self-heal byte counts for repair
  * I/O.  On failure the read, write or checksum error counter is bumped and,
  * for failed writes that carry a txg, the txg is recorded in the DTLs.
  */
 void
 vdev_stat_update(zio_t *zio, uint64_t psize)
 {
 	spa_t *spa = zio->io_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 	/* A zio without a vdev is accounted against the root vdev. */
 	vdev_t *vd = zio->io_vd ? zio->io_vd : rvd;
 	vdev_t *pvd;
 	uint64_t txg = zio->io_txg;
 	vdev_stat_t *vs = &vd->vdev_stat;
 	zio_type_t type = zio->io_type;
 	int flags = zio->io_flags;
 
 	/*
 	 * If this i/o is a gang leader, it didn't do any actual work.
 	 */
 	if (zio->io_gang_tree)
 		return;
 
 	if (zio->io_error == 0) {
 		/*
 		 * If this is a root i/o, don't count it -- we've already
 		 * counted the top-level vdevs, and vdev_get_stats() will
 		 * aggregate them when asked.  This reduces contention on
 		 * the root vdev_stat_lock and implicitly handles blocks
 		 * that compress away to holes, for which there is no i/o.
 		 * (Holes never create vdev children, so all the counters
 		 * remain zero, which is what we want.)
 		 *
 		 * Note: this only applies to successful i/o (io_error == 0)
 		 * because unlike i/o counts, errors are not additive.
 		 * When reading a ditto block, for example, failure of
 		 * one top-level vdev does not imply a root-level error.
 		 */
 		if (vd == rvd)
 			return;
 
 		ASSERT(vd == zio->io_vd);
 
 		if (flags & ZIO_FLAG_IO_BYPASS)
 			return;
 
 		mutex_enter(&vd->vdev_stat_lock);
 
 		if (flags & ZIO_FLAG_IO_REPAIR) {
 			if (flags & ZIO_FLAG_SCAN_THREAD) {
 				dsl_scan_phys_t *scn_phys =
 				    &spa->spa_dsl_pool->dp_scan->scn_phys;
 				uint64_t *processed = &scn_phys->scn_processed;
 
 				/* XXX cleanup? */
 				/*
 				 * Only leaves add to the pool-wide scan
 				 * counter; every vdev updates its own.
 				 */
 				if (vd->vdev_ops->vdev_op_leaf)
 					atomic_add_64(processed, psize);
 				vs->vs_scan_processed += psize;
 			}
 
 			if (flags & ZIO_FLAG_SELF_HEAL)
 				vs->vs_self_healed += psize;
 		}
 
 		vs->vs_ops[type]++;
 		vs->vs_bytes[type] += psize;
 
 		mutex_exit(&vd->vdev_stat_lock);
 		return;
 	}
 
 	/* From here on the zio failed.  Speculative I/O is not counted. */
 	if (flags & ZIO_FLAG_SPECULATIVE)
 		return;
 
 	/*
 	 * If this is an I/O error that is going to be retried, then ignore the
 	 * error.  Otherwise, the user may interpret B_FAILFAST I/O errors as
 	 * hard errors, when in reality they can happen for any number of
 	 * innocuous reasons (bus resets, MPxIO link failure, etc).
 	 */
 	if (zio->io_error == EIO &&
 	    !(zio->io_flags & ZIO_FLAG_IO_RETRY))
 		return;
 
 	/*
 	 * Intent logs writes won't propagate their error to the root
 	 * I/O so don't mark these types of failures as pool-level
 	 * errors.
 	 */
 	if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
 		return;
 
 	/* Errors are only charged to vdevs that are not dead. */
 	mutex_enter(&vd->vdev_stat_lock);
 	if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) {
 		if (zio->io_error == ECKSUM)
 			vs->vs_checksum_errors++;
 		else
 			vs->vs_read_errors++;
 	}
 	if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd))
 		vs->vs_write_errors++;
 	mutex_exit(&vd->vdev_stat_lock);
 
 	if (type == ZIO_TYPE_WRITE && txg != 0 &&
 	    (!(flags & ZIO_FLAG_IO_REPAIR) ||
 	    (flags & ZIO_FLAG_SCAN_THREAD) ||
 	    spa->spa_claiming)) {
 		/*
 		 * This is either a normal write (not a repair), or it's
 		 * a repair induced by the scrub thread, or it's a repair
 		 * made by zil_claim() during spa_load() in the first txg.
 		 * In the normal case, we commit the DTL change in the same
 		 * txg as the block was born.  In the scrub-induced repair
 		 * case, we know that scrubs run in first-pass syncing context,
 		 * so we commit the DTL change in spa_syncing_txg(spa).
 		 * In the zil_claim() case, we commit in spa_first_txg(spa).
 		 *
 		 * We currently do not make DTL entries for failed spontaneous
 		 * self-healing writes triggered by normal (non-scrubbing)
 		 * reads, because we have no transactional context in which to
 		 * do so -- and it's not clear that it'd be desirable anyway.
 		 */
 		if (vd->vdev_ops->vdev_op_leaf) {
 			uint64_t commit_txg = txg;
 			if (flags & ZIO_FLAG_SCAN_THREAD) {
 				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
 				ASSERT(spa_sync_pass(spa) == 1);
 				vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
 				commit_txg = spa_syncing_txg(spa);
 			} else if (spa->spa_claiming) {
 				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
 				commit_txg = spa_first_txg(spa);
 			}
 			ASSERT(commit_txg >= spa_syncing_txg(spa));
 			/* Already recorded as missing: nothing to add. */
 			if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
 				return;
 			/* Mark the leaf and each ancestor below the root. */
 			for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
 				vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1);
 			vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg);
 		}
 		if (vd != rvd)
 			vdev_dtl_dirty(vd, DTL_MISSING, txg, 1);
 	}
 }
 
 /*
  * Apply space deltas to the in-core usage stats of a top-level vdev, and
  * propagate them to the root vdev (normal class only) and to the vdev's
  * metaslab class (when it has one).
  */
 void
 vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
     int64_t space_delta)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 	metaslab_group_t *mg = vd->vdev_mg;
 	metaslab_class_t *mc = NULL;
 	int64_t dspace_delta;
 
 	if (mg != NULL)
 		mc = mg->mg_class;
 
 	ASSERT(vd == vd->vdev_top);
 
 	/*
 	 * The deflated delta is the raw delta with the inverse of the
 	 * psize-to-asize (ie. RAID-Z) space-expansion factor applied.  It
 	 * has to be computed here rather than at the root vdev, because the
 	 * root's psize-to-asize is just the max over its children and so
 	 * is not accurate enough.
 	 */
 	ASSERT((space_delta & (SPA_MINBLOCKSIZE-1)) == 0);
 	ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
 	dspace_delta = (space_delta >> SPA_MINBLOCKSHIFT) *
 	    vd->vdev_deflate_ratio;
 
 	/* This vdev's own counters. */
 	mutex_enter(&vd->vdev_stat_lock);
 	vd->vdev_stat.vs_alloc += alloc_delta;
 	vd->vdev_stat.vs_space += space_delta;
 	vd->vdev_stat.vs_dspace += dspace_delta;
 	mutex_exit(&vd->vdev_stat_lock);
 
 	/* The root only aggregates vdevs of the normal class. */
 	if (mc == spa_normal_class(spa)) {
 		mutex_enter(&rvd->vdev_stat_lock);
 		rvd->vdev_stat.vs_alloc += alloc_delta;
 		rvd->vdev_stat.vs_space += space_delta;
 		rvd->vdev_stat.vs_dspace += dspace_delta;
 		mutex_exit(&rvd->vdev_stat_lock);
 	}
 
 	if (mc != NULL) {
 		ASSERT(rvd == vd->vdev_parent);
 		ASSERT(vd->vdev_ms_count != 0);
 
 		metaslab_class_space_update(mc,
 		    alloc_delta, defer_delta, space_delta, dspace_delta);
 	}
 }
 
 /*
  * Mark a top-level vdev's config as dirty, placing it on the dirty list
  * so that it will be written out next time the vdev configuration is synced.
  * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
  *
  * Aux vdevs are handled separately: their entry in the aux config nvlist
  * array is regenerated in place and the aux sync flag is set.
  */
 void
 vdev_config_dirty(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 	int c;
 
 	ASSERT(spa_writeable(spa));
 
 	/*
 	 * If this is an aux vdev (as with l2cache and spare devices), then we
 	 * update the vdev config manually and set the sync flag.
 	 */
 	if (vd->vdev_aux != NULL) {
 		spa_aux_vdev_t *sav = vd->vdev_aux;
 		nvlist_t **aux;
 		uint_t naux;
 
 		/* Find this vdev's index in the aux vdev array. */
 		for (c = 0; c < sav->sav_count; c++) {
 			if (sav->sav_vdevs[c] == vd)
 				break;
 		}
 
 		if (c == sav->sav_count) {
 			/*
 			 * We're being removed.  There's nothing more to do.
 			 */
 			ASSERT(sav->sav_sync == B_TRUE);
 			return;
 		}
 
 		sav->sav_sync = B_TRUE;
 
 		/*
 		 * The array is stored under the l2cache key or, failing
 		 * that, the spares key; one of the two must be present.
 		 */
 		if (nvlist_lookup_nvlist_array(sav->sav_config,
 		    ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) {
 			VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
 			    ZPOOL_CONFIG_SPARES, &aux, &naux) == 0);
 		}
 
 		ASSERT(c < naux);
 
 		/*
 		 * Setting the nvlist in the middle of the array is a little
 		 * sketchy, but it will work.
 		 */
 		nvlist_free(aux[c]);
 		aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0);
 
 		return;
 	}
 
 	/*
 	 * The dirty list is protected by the SCL_CONFIG lock.  The caller
 	 * must either hold SCL_CONFIG as writer, or must be the sync thread
 	 * (which holds SCL_CONFIG as reader).  There's only one sync thread,
 	 * so this is sufficient to ensure mutual exclusion.
 	 */
 	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
 	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
 	    spa_config_held(spa, SCL_CONFIG, RW_READER)));
 
 	if (vd == rvd) {
 		for (c = 0; c < rvd->vdev_children; c++)
 			vdev_config_dirty(rvd->vdev_child[c]);
 	} else {
 		ASSERT(vd == vd->vdev_top);
 
 		/* Holes and vdevs already on the list are not added. */
 		if (!list_link_active(&vd->vdev_config_dirty_node) &&
 		    !vd->vdev_ishole)
 			list_insert_head(&spa->spa_config_dirty_list, vd);
 	}
 }
 
 /*
  * Take a vdev off the spa's config dirty list.  The vdev must currently be
  * on that list, and the caller must hold SCL_CONFIG as writer or be the
  * sync thread holding it as reader.
  */
 void
 vdev_config_clean(vdev_t *vd)
 {
 	spa_t *spa;
 
 	spa = vd->vdev_spa;
 
 	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
 	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
 	    spa_config_held(spa, SCL_CONFIG, RW_READER)));
 	ASSERT(list_link_active(&vd->vdev_config_dirty_node));
 
 	list_remove(&spa->spa_config_dirty_list, vd);
 }
 
 /*
  * Flag a top-level vdev's state as dirty so that the next pass of
  * spa_sync() can turn it into a vdev_config_dirty().  State changes are
  * kept apart from larger config changes because they need far less locking
  * and are frequently required by administrative actions.
  */
 void
 vdev_state_dirty(vdev_t *vd)
 {
 	spa_t *spa;
 
 	spa = vd->vdev_spa;
 
 	ASSERT(spa_writeable(spa));
 	ASSERT(vd == vd->vdev_top);
 
 	/*
 	 * SCL_STATE guards the state dirty list.  Either the caller holds
 	 * it as writer, or the caller is the sync thread holding it as
 	 * reader; since only one sync thread exists, that still gives
 	 * mutual exclusion.
 	 */
 	ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
 	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
 	    spa_config_held(spa, SCL_STATE, RW_READER)));
 
 	/* Already queued, or a hole: nothing to add. */
 	if (list_link_active(&vd->vdev_state_dirty_node) || vd->vdev_ishole)
 		return;
 
 	list_insert_head(&spa->spa_state_dirty_list, vd);
 }
 
 /*
  * Take a vdev off the spa's state dirty list.  The vdev must currently be
  * on that list, and the caller must hold SCL_STATE as writer or be the
  * sync thread holding it as reader.
  */
 void
 vdev_state_clean(vdev_t *vd)
 {
 	spa_t *spa;
 
 	spa = vd->vdev_spa;
 
 	ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
 	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
 	    spa_config_held(spa, SCL_STATE, RW_READER)));
 	ASSERT(list_link_active(&vd->vdev_state_dirty_node));
 
 	list_remove(&spa->spa_state_dirty_list, vd);
 }
 
 /*
  * Propagate vdev state up from children to parent.
  *
  * For a vdev with children, count how many of them are faulted and how
  * many are degraded, hand those counts to the vdev's vdev_op_state_change
  * callback, and then repeat the process on the parent until the top of the
  * tree is reached.
  */
 void
 vdev_propagate_state(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 	int degraded = 0, faulted = 0;
 	int corrupted = 0;
 	vdev_t *child;
 
 	if (vd->vdev_children > 0) {
 		for (int c = 0; c < vd->vdev_children; c++) {
 			child = vd->vdev_child[c];
 
 			/*
 			 * Don't factor holes into the decision.
 			 */
 			if (child->vdev_ishole)
 				continue;
 
 			/*
 			 * An unreadable child counts as faulted, and so
 			 * does an unwriteable one when the pool itself is
 			 * writeable.
 			 */
 			if (!vdev_readable(child) ||
 			    (!vdev_writeable(child) && spa_writeable(spa))) {
 				/*
 				 * Root special: if there is a top-level log
 				 * device, treat the root vdev as if it were
 				 * degraded.
 				 */
 				if (child->vdev_islog && vd == rvd)
 					degraded++;
 				else
 					faulted++;
 			} else if (child->vdev_state <= VDEV_STATE_DEGRADED) {
 				degraded++;
 			}
 
 			if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
 				corrupted++;
 		}
 
 		vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);
 
 		/*
 		 * Root special: if there is a top-level vdev that cannot be
 		 * opened due to corrupted metadata, then propagate the root
 		 * vdev's aux state as 'corrupt' rather than 'insufficient
 		 * replicas'.
 		 */
 		if (corrupted && vd == rvd &&
 		    rvd->vdev_state == VDEV_STATE_CANT_OPEN)
 			vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_CORRUPT_DATA);
 	}
 
 	if (vd->vdev_parent)
 		vdev_propagate_state(vd->vdev_parent);
 }
 
 /*
  * Set a vdev's state.  If this is during an open, we don't update the parent
  * state, because we're in the process of opening children depth-first.
  * Otherwise, we propagate the change to the parent.
  *
  * If this routine places a device in a faulted state, an appropriate ereport is
  * generated.
  *
  * 'isopen' is B_TRUE when called as part of an open, 'state' is the new
  * state and 'aux' the reason stored in vs_aux.  If the state is unchanged,
  * only the aux value is updated.
  */
 void
 vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
 {
 	uint64_t save_state;
 	spa_t *spa = vd->vdev_spa;
 
 	if (state == vd->vdev_state) {
 		vd->vdev_stat.vs_aux = aux;
 		return;
 	}
 
 	/* Remember the old state; it is passed along with the ereport. */
 	save_state = vd->vdev_state;
 
 	vd->vdev_state = state;
 	vd->vdev_stat.vs_aux = aux;
 
 	/*
 	 * If we are setting the vdev state to anything but an open state, then
 	 * always close the underlying device unless the device has requested
 	 * a delayed close (i.e. we're about to remove or fault the device).
 	 * Otherwise, we keep accessible but invalid devices open forever.
 	 * We don't call vdev_close() itself, because that implies some extra
 	 * checks (offline, etc) that we don't want here.  This is limited to
 	 * leaf devices, because otherwise closing the device will affect other
 	 * children.
 	 */
 	if (!vd->vdev_delayed_close && vdev_is_dead(vd) &&
 	    vd->vdev_ops->vdev_op_leaf)
 		vd->vdev_ops->vdev_op_close(vd);
 
 	/*
 	 * If we have brought this vdev back into service, we need
 	 * to notify fmd so that it can gracefully repair any outstanding
 	 * cases due to a missing device.  We do this in all cases, even those
 	 * that probably don't correlate to a repaired fault.  This is sure to
 	 * catch all cases, and we let the zfs-retire agent sort it out.  If
 	 * this is a transient state it's OK, as the retire agent will
 	 * double-check the state of the vdev before repairing it.
 	 */
 	if (state == VDEV_STATE_HEALTHY && vd->vdev_ops->vdev_op_leaf &&
 	    vd->vdev_prevstate != state)
 		zfs_post_state_change(spa, vd);
 
 	if (vd->vdev_removed &&
 	    state == VDEV_STATE_CANT_OPEN &&
 	    (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
 		/*
 		 * If the previous state is set to VDEV_STATE_REMOVED, then this
 		 * device was previously marked removed and someone attempted to
 		 * reopen it.  If this failed due to a nonexistent device, then
 		 * keep the device in the REMOVED state.  We also let this be if
 		 * it is one of our special test online cases, which is only
 		 * attempting to online the device and shouldn't generate an FMA
 		 * fault.
 		 */
 		vd->vdev_state = VDEV_STATE_REMOVED;
 		vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
 	} else if (state == VDEV_STATE_REMOVED) {
 		vd->vdev_removed = B_TRUE;
 	} else if (state == VDEV_STATE_CANT_OPEN) {
 		/*
 		 * If we fail to open a vdev during an import or recovery, we
 		 * mark it as "not available", which signifies that it was
 		 * never there to begin with.  Failure to open such a device
 		 * is not considered an error.
 		 */
 		if ((spa_load_state(spa) == SPA_LOAD_IMPORT ||
 		    spa_load_state(spa) == SPA_LOAD_RECOVER) &&
 		    vd->vdev_ops->vdev_op_leaf)
 			vd->vdev_not_present = 1;
 
 		/*
 		 * Post the appropriate ereport.  If the 'prevstate' field is
 		 * set to something other than VDEV_STATE_UNKNOWN, it indicates
 		 * that this is part of a vdev_reopen().  In this case, we don't
 		 * want to post the ereport if the device was already in the
 		 * CANT_OPEN state beforehand.
 		 *
 		 * If the 'checkremove' flag is set, then this is an attempt to
 		 * online the device in response to an insertion event.  If we
 		 * hit this case, then we have detected an insertion event for a
 		 * faulted or offline device that wasn't in the removed state.
 		 * In this scenario, we don't post an ereport because we are
 		 * about to replace the device, or attempt an online with
 		 * vdev_forcefault, which will generate the fault for us.
 		 */
 		if ((vd->vdev_prevstate != state || vd->vdev_forcefault) &&
 		    !vd->vdev_not_present && !vd->vdev_checkremove &&
 		    vd != spa->spa_root_vdev) {
 			const char *class;
 
 			/* Map the aux reason onto an ereport class. */
 			switch (aux) {
 			case VDEV_AUX_OPEN_FAILED:
 				class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
 				break;
 			case VDEV_AUX_CORRUPT_DATA:
 				class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
 				break;
 			case VDEV_AUX_NO_REPLICAS:
 				class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
 				break;
 			case VDEV_AUX_BAD_GUID_SUM:
 				class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
 				break;
 			case VDEV_AUX_TOO_SMALL:
 				class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
 				break;
 			case VDEV_AUX_BAD_LABEL:
 				class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
 				break;
 			default:
 				/* Any other reason is reported as unknown. */
 				class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
 			}
 
 			zfs_ereport_post(class, spa, vd, NULL, save_state, 0);
 		}
 
 		/* Erase any notion of persistent removed state */
 		vd->vdev_removed = B_FALSE;
 	} else {
 		vd->vdev_removed = B_FALSE;
 	}
 
 	if (!isopen && vd->vdev_parent)
 		vdev_propagate_state(vd->vdev_parent);
 }
 
 /*
  * Decide whether a vdev configuration can back a root pool.
  *
  * Solaris supports neither RAID-Z nor partial configurations for this, and
  * additionally permits only one top-level vdev and no whole-disk leaves.
  *
  * FreeBSD can boot from any configuration.  The boot filesystem does have
  * to be either uncompressed or compressed with lzjb, but it is not clear
  * how that could be enforced here.
  */
 boolean_t
 vdev_is_bootable(vdev_t *vd)
 {
 #ifdef sun
 	if (vd->vdev_ops->vdev_op_leaf) {
 		/* Leaves: whole disks are not allowed. */
 		if (vd->vdev_wholedisk == 1)
 			return (B_FALSE);
 	} else {
 		char *type = vd->vdev_ops->vdev_op_type;
 
 		/* The root may have at most one top-level vdev. */
 		if (strcmp(type, VDEV_TYPE_ROOT) == 0 &&
 		    vd->vdev_children > 1)
 			return (B_FALSE);
 
 		if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 ||
 		    strcmp(type, VDEV_TYPE_MISSING) == 0)
 			return (B_FALSE);
 	}
 
 	/* Every child has to be bootable as well. */
 	for (int c = 0; c < vd->vdev_children; c++) {
 		if (!vdev_is_bootable(vd->vdev_child[c]))
 			return (B_FALSE);
 	}
 #endif	/* sun */
 	return (B_TRUE);
 }
 
 /*
  * Carry persistent state over from the original vdev tree to the current
  * one.  'ovd' belongs to the tree retrieved from the MOS config object and
  * 'nvd' is its counterpart in the current tree; if the original vdev was
  * offline or faulted, the current one ends up in that state too.
  */
 void
 vdev_load_log_state(vdev_t *nvd, vdev_t *ovd)
 {
 	spa_t *spa = nvd->vdev_spa;
 	int c;
 
 	ASSERT(nvd->vdev_top->vdev_islog);
 	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
 	ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid);
 
 	/* Children first, paired up by index. */
 	for (c = 0; c < nvd->vdev_children; c++)
 		vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]);
 
 	if (!nvd->vdev_ops->vdev_op_leaf)
 		return;
 
 	/*
 	 * Only leaves have the persistent vdev state restored.
 	 */
 	nvd->vdev_offline = ovd->vdev_offline;
 	nvd->vdev_faulted = ovd->vdev_faulted;
 	nvd->vdev_degraded = ovd->vdev_degraded;
 	nvd->vdev_removed = ovd->vdev_removed;
 }
 
 /*
  * Tell whether a log device still holds valid content.  A vdev that was
  * removed or faulted in the MOS config is known to have had its log content
  * written to the pool already, so only a leaf that is neither counts; a
  * non-qualifying vdev is valid if any vdev below it is.
  */
 boolean_t
 vdev_log_state_valid(vdev_t *vd)
 {
 	int c = 0;
 
 	if (vd->vdev_ops->vdev_op_leaf &&
 	    !(vd->vdev_faulted || vd->vdev_removed))
 		return (B_TRUE);
 
 	while (c < vd->vdev_children) {
 		if (vdev_log_state_valid(vd->vdev_child[c]))
 			return (B_TRUE);
 		c++;
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Grow a top-level vdev when its asize has room for more metaslabs than it
  * currently has: the metaslabs are (re)initialized for 'txg' and the vdev's
  * config is marked dirty.  Otherwise nothing happens.
  */
 void
 vdev_expand(vdev_t *vd, uint64_t txg)
 {
 	ASSERT(vd->vdev_top == vd);
 	ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	/* No additional metaslab fits: leave the vdev alone. */
 	if ((vd->vdev_asize >> vd->vdev_ms_shift) <= vd->vdev_ms_count)
 		return;
 
 	VERIFY(vdev_metaslab_init(vd, txg) == 0);
 	vdev_config_dirty(vd);
 }
 
 /*
  * Split a vdev off from its parent.  The vdev is removed from the parent's
  * child array, which is then compacted.  If exactly one child remains, the
  * parent is removed from above it and that child is marked as splitting.
  * State is then propagated upward starting from the parent's first child.
  */
 void
 vdev_split(vdev_t *vd)
 {
 	vdev_t *pvd = vd->vdev_parent;
 	vdev_t *survivor;
 
 	vdev_remove_child(pvd, vd);
 	vdev_compact_children(pvd);
 
 	survivor = pvd->vdev_child[0];
 	if (pvd->vdev_children == 1) {
 		vdev_remove_parent(survivor);
 		survivor->vdev_splitting = B_TRUE;
 	}
 
 	vdev_propagate_state(survivor);
 }
 
 /*
  * Deadman check.  Walks the vdev tree and, for each leaf with active I/O,
  * looks at the first zio in the queue's active tree.  If that zio has been
  * outstanding for longer than spa_deadman_synctime(), a debug message is
  * logged and the system is panicked.
  */
 void
 vdev_deadman(vdev_t *vd)
 {
 	vdev_queue_t *vq;
 	int c;
 
 	for (c = 0; c < vd->vdev_children; c++)
 		vdev_deadman(vd->vdev_child[c]);
 
 	/* Only leaves are checked. */
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return;
 
 	vq = &vd->vdev_queue;
 
 	mutex_enter(&vq->vq_lock);
 	if (avl_numnodes(&vq->vq_active_tree) > 0) {
 		spa_t *spa = vd->vdev_spa;
 		zio_t *fio;
 		uint64_t delta;
 
 		/*
 		 * Take the zio at the head of the active tree and work out
 		 * how long it has been outstanding; beyond the
 		 * spa_deadman_synctime we panic the system.
 		 */
 		fio = avl_first(&vq->vq_active_tree);
 		delta = gethrtime() - fio->io_timestamp;
 
 		if (delta > spa_deadman_synctime(spa)) {
 			zfs_dbgmsg("SLOW IO: zio timestamp %lluns, "
 			    "delta %lluns, last io %lluns",
 			    fio->io_timestamp, delta,
 			    vq->vq_io_complete_ts);
 			fm_panic("I/O to pool '%s' appears to be "
 			    "hung on vdev guid %llu at '%s'.",
 			    spa_name(spa),
 			    (long long unsigned int) vd->vdev_guid,
 			    vd->vdev_path);
 		}
 	}
 	mutex_exit(&vq->vq_lock);
 }
Index: user/ae/inet6/sys/cddl/contrib/opensolaris
===================================================================
--- user/ae/inet6/sys/cddl/contrib/opensolaris	(revision 271452)
+++ user/ae/inet6/sys/cddl/contrib/opensolaris	(revision 271453)

Property changes on: user/ae/inet6/sys/cddl/contrib/opensolaris
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head/sys/cddl/contrib/opensolaris:r271428-271452
Index: user/ae/inet6/sys/dev/cxgbe/t4_main.c
===================================================================
--- user/ae/inet6/sys/dev/cxgbe/t4_main.c	(revision 271452)
+++ user/ae/inet6/sys/dev/cxgbe/t4_main.c	(revision 271453)
@@ -1,8388 +1,8467 @@
 /*-
  * Copyright (c) 2011 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: Navdeep Parhar <np@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/conf.h>
 #include <sys/priv.h>
 #include <sys/kernel.h>
 #include <sys/bus.h>
 #include <sys/module.h>
 #include <sys/malloc.h>
 #include <sys/queue.h>
 #include <sys/taskqueue.h>
 #include <sys/pciio.h>
 #include <dev/pci/pcireg.h>
 #include <dev/pci/pcivar.h>
 #include <dev/pci/pci_private.h>
 #include <sys/firmware.h>
 #include <sys/sbuf.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/sysctl.h>
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_types.h>
 #include <net/if_dl.h>
 #include <net/if_vlan_var.h>
 #if defined(__i386__) || defined(__amd64__)
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #endif
 
 #include "common/common.h"
 #include "common/t4_msg.h"
 #include "common/t4_regs.h"
 #include "common/t4_regs_values.h"
 #include "t4_ioctl.h"
 #include "t4_l2t.h"
 
 /* T4 bus driver interface */
 static int t4_probe(device_t);
 static int t4_attach(device_t);
 static int t4_detach(device_t);
 static device_method_t t4_methods[] = {
 	DEVMETHOD(device_probe,		t4_probe),
 	DEVMETHOD(device_attach,	t4_attach),
 	DEVMETHOD(device_detach,	t4_detach),
 
 	DEVMETHOD_END
 };
 static driver_t t4_driver = {
 	"t4nex",
 	t4_methods,
 	sizeof(struct adapter)
 };
 
 
 /* T4 port (cxgbe) interface */
 static int cxgbe_probe(device_t);
 static int cxgbe_attach(device_t);
 static int cxgbe_detach(device_t);
 static device_method_t cxgbe_methods[] = {
 	DEVMETHOD(device_probe,		cxgbe_probe),
 	DEVMETHOD(device_attach,	cxgbe_attach),
 	DEVMETHOD(device_detach,	cxgbe_detach),
 	{ 0, 0 }
 };
 static driver_t cxgbe_driver = {
 	"cxgbe",
 	cxgbe_methods,
 	sizeof(struct port_info)
 };
 
 static d_ioctl_t t4_ioctl;
 static d_open_t t4_open;
 static d_close_t t4_close;
 
 static struct cdevsw t4_cdevsw = {
        .d_version = D_VERSION,
        .d_flags = 0,
        .d_open = t4_open,
        .d_close = t4_close,
        .d_ioctl = t4_ioctl,
        .d_name = "t4nex",
 };
 
 /* T5 bus driver interface */
 static int t5_probe(device_t);
 static device_method_t t5_methods[] = {
 	DEVMETHOD(device_probe,		t5_probe),
 	DEVMETHOD(device_attach,	t4_attach),
 	DEVMETHOD(device_detach,	t4_detach),
 
 	DEVMETHOD_END
 };
 static driver_t t5_driver = {
 	"t5nex",
 	t5_methods,
 	sizeof(struct adapter)
 };
 
 
 /* T5 port (cxl) interface */
 static driver_t cxl_driver = {
 	"cxl",
 	cxgbe_methods,
 	sizeof(struct port_info)
 };
 
 static struct cdevsw t5_cdevsw = {
        .d_version = D_VERSION,
        .d_flags = 0,
        .d_open = t4_open,
        .d_close = t4_close,
        .d_ioctl = t4_ioctl,
        .d_name = "t5nex",
 };
 
 /* ifnet + media interface */
 static void cxgbe_init(void *);
 static int cxgbe_ioctl(struct ifnet *, unsigned long, caddr_t);
 static int cxgbe_transmit(struct ifnet *, struct mbuf *);
 static void cxgbe_qflush(struct ifnet *);
 static int cxgbe_media_change(struct ifnet *);
 static void cxgbe_media_status(struct ifnet *, struct ifmediareq *);
 
 MALLOC_DEFINE(M_CXGBE, "cxgbe", "Chelsio T4/T5 Ethernet driver and services");
 
 /*
  * Correct lock order when you need to acquire multiple locks is t4_list_lock,
  * then ADAPTER_LOCK, then t4_uld_list_lock.
  */
 static struct sx t4_list_lock;
 SLIST_HEAD(, adapter) t4_list;
 #ifdef TCP_OFFLOAD
 static struct sx t4_uld_list_lock;
 SLIST_HEAD(, uld_info) t4_uld_list;
 #endif
 
 /*
  * Tunables.  See tweak_tunables() too.
  *
  * Each tunable is set to a default value here if it's known at compile-time.
  * Otherwise it is set to -1 as an indication to tweak_tunables() that it should
  * provide a reasonable default when the driver is loaded.
  *
  * Tunables applicable to both T4 and T5 are under hw.cxgbe.  Those specific to
  * T5 are under hw.cxl.
  */
 
 /*
  * Number of queues for tx and rx, 10G and 1G, NIC and offload.
  */
 #define NTXQ_10G 16
 static int t4_ntxq10g = -1;
 TUNABLE_INT("hw.cxgbe.ntxq10g", &t4_ntxq10g);
 
 #define NRXQ_10G 8
 static int t4_nrxq10g = -1;
 TUNABLE_INT("hw.cxgbe.nrxq10g", &t4_nrxq10g);
 
 #define NTXQ_1G 4
 static int t4_ntxq1g = -1;
 TUNABLE_INT("hw.cxgbe.ntxq1g", &t4_ntxq1g);
 
 #define NRXQ_1G 2
 static int t4_nrxq1g = -1;
 TUNABLE_INT("hw.cxgbe.nrxq1g", &t4_nrxq1g);
 
 static int t4_rsrv_noflowq = 0;
 TUNABLE_INT("hw.cxgbe.rsrv_noflowq", &t4_rsrv_noflowq);
 
 #ifdef TCP_OFFLOAD
 #define NOFLDTXQ_10G 8
 static int t4_nofldtxq10g = -1;
 TUNABLE_INT("hw.cxgbe.nofldtxq10g", &t4_nofldtxq10g);
 
 #define NOFLDRXQ_10G 2
 static int t4_nofldrxq10g = -1;
 TUNABLE_INT("hw.cxgbe.nofldrxq10g", &t4_nofldrxq10g);
 
 #define NOFLDTXQ_1G 2
 static int t4_nofldtxq1g = -1;
 TUNABLE_INT("hw.cxgbe.nofldtxq1g", &t4_nofldtxq1g);
 
 #define NOFLDRXQ_1G 1
 static int t4_nofldrxq1g = -1;
 TUNABLE_INT("hw.cxgbe.nofldrxq1g", &t4_nofldrxq1g);
 #endif
 
 #ifdef DEV_NETMAP
 #define NNMTXQ_10G 2
 static int t4_nnmtxq10g = -1;
 TUNABLE_INT("hw.cxgbe.nnmtxq10g", &t4_nnmtxq10g);
 
 #define NNMRXQ_10G 2
 static int t4_nnmrxq10g = -1;
 TUNABLE_INT("hw.cxgbe.nnmrxq10g", &t4_nnmrxq10g);
 
 #define NNMTXQ_1G 1
 static int t4_nnmtxq1g = -1;
 TUNABLE_INT("hw.cxgbe.nnmtxq1g", &t4_nnmtxq1g);
 
 #define NNMRXQ_1G 1
 static int t4_nnmrxq1g = -1;
 TUNABLE_INT("hw.cxgbe.nnmrxq1g", &t4_nnmrxq1g);
 #endif
 
 /*
  * Holdoff parameters for 10G and 1G ports.
  */
 #define TMR_IDX_10G 1
 static int t4_tmr_idx_10g = TMR_IDX_10G;
 TUNABLE_INT("hw.cxgbe.holdoff_timer_idx_10G", &t4_tmr_idx_10g);
 
 #define PKTC_IDX_10G (-1)
 static int t4_pktc_idx_10g = PKTC_IDX_10G;
 TUNABLE_INT("hw.cxgbe.holdoff_pktc_idx_10G", &t4_pktc_idx_10g);
 
 #define TMR_IDX_1G 1
 static int t4_tmr_idx_1g = TMR_IDX_1G;
 TUNABLE_INT("hw.cxgbe.holdoff_timer_idx_1G", &t4_tmr_idx_1g);
 
 #define PKTC_IDX_1G (-1)
 static int t4_pktc_idx_1g = PKTC_IDX_1G;
 TUNABLE_INT("hw.cxgbe.holdoff_pktc_idx_1G", &t4_pktc_idx_1g);
 
 /*
  * Size (# of entries) of each tx and rx queue.
  */
 static unsigned int t4_qsize_txq = TX_EQ_QSIZE;
 TUNABLE_INT("hw.cxgbe.qsize_txq", &t4_qsize_txq);
 
 static unsigned int t4_qsize_rxq = RX_IQ_QSIZE;
 TUNABLE_INT("hw.cxgbe.qsize_rxq", &t4_qsize_rxq);
 
 /*
  * Interrupt types allowed (bits 0, 1, 2 = INTx, MSI, MSI-X respectively).
  */
 static int t4_intr_types = INTR_MSIX | INTR_MSI | INTR_INTX;
 TUNABLE_INT("hw.cxgbe.interrupt_types", &t4_intr_types);
 
 /*
  * Configuration file.
  */
 #define DEFAULT_CF	"default"
 #define FLASH_CF	"flash"
 #define UWIRE_CF	"uwire"
 #define FPGA_CF		"fpga"
 static char t4_cfg_file[32] = DEFAULT_CF;
 TUNABLE_STR("hw.cxgbe.config_file", t4_cfg_file, sizeof(t4_cfg_file));
 
 /*
+ * PAUSE settings (bit 0, 1 = rx_pause, tx_pause respectively).
+ * rx_pause = 1 to heed incoming PAUSE frames, 0 to ignore them.
+ * tx_pause = 1 to emit PAUSE frames when the rx FIFO reaches its high water
+ *            mark or when signalled to do so, 0 to never emit PAUSE.
+ */
+static int t4_pause_settings = PAUSE_TX | PAUSE_RX;
+TUNABLE_INT("hw.cxgbe.pause_settings", &t4_pause_settings);
+
+/*
  * Firmware auto-install by driver during attach (0, 1, 2 = prohibited, allowed,
  * encouraged respectively).
  */
 static unsigned int t4_fw_install = 1;
 TUNABLE_INT("hw.cxgbe.fw_install", &t4_fw_install);
 
 /*
  * ASIC features that will be used.  Disable the ones you don't want so that the
  * chip resources aren't wasted on features that will not be used.
  */
 static int t4_linkcaps_allowed = 0;	/* No DCBX, PPP, etc. by default */
 TUNABLE_INT("hw.cxgbe.linkcaps_allowed", &t4_linkcaps_allowed);
 
 static int t4_niccaps_allowed = FW_CAPS_CONFIG_NIC;
 TUNABLE_INT("hw.cxgbe.niccaps_allowed", &t4_niccaps_allowed);
 
 static int t4_toecaps_allowed = -1;
 TUNABLE_INT("hw.cxgbe.toecaps_allowed", &t4_toecaps_allowed);
 
 static int t4_rdmacaps_allowed = 0;
 TUNABLE_INT("hw.cxgbe.rdmacaps_allowed", &t4_rdmacaps_allowed);
 
 static int t4_iscsicaps_allowed = 0;
 TUNABLE_INT("hw.cxgbe.iscsicaps_allowed", &t4_iscsicaps_allowed);
 
 static int t4_fcoecaps_allowed = 0;
 TUNABLE_INT("hw.cxgbe.fcoecaps_allowed", &t4_fcoecaps_allowed);
 
 static int t5_write_combine = 0;
 TUNABLE_INT("hw.cxl.write_combine", &t5_write_combine);
 
 /*
  * Interrupt and queue configuration for the adapter.  Filled in by
  * cfg_itype_and_nqueues() and consumed by t4_attach(), which uses it to size
  * the queue arrays and to assign per-port queue counts and interrupt flags.
  */
 struct intrs_and_queues {
 	uint16_t intr_type;	/* INTx, MSI, or MSI-X */
 	uint16_t nirq;		/* Total # of vectors */
 	uint16_t intr_flags_10g;/* Interrupt flags for each 10G port */
 	uint16_t intr_flags_1g;	/* Interrupt flags for each 1G port */
 	uint16_t ntxq10g;	/* # of NIC txq's for each 10G port */
 	uint16_t nrxq10g;	/* # of NIC rxq's for each 10G port */
 	uint16_t ntxq1g;	/* # of NIC txq's for each 1G port */
 	uint16_t nrxq1g;	/* # of NIC rxq's for each 1G port */
 	uint16_t rsrv_noflowq;	/* Flag whether to reserve queue 0 */
 #ifdef TCP_OFFLOAD
 	uint16_t nofldtxq10g;	/* # of TOE txq's for each 10G port */
 	uint16_t nofldrxq10g;	/* # of TOE rxq's for each 10G port */
 	uint16_t nofldtxq1g;	/* # of TOE txq's for each 1G port */
 	uint16_t nofldrxq1g;	/* # of TOE rxq's for each 1G port */
 #endif
 #ifdef DEV_NETMAP
 	uint16_t nnmtxq10g;	/* # of netmap txq's for each 10G port */
 	uint16_t nnmrxq10g;	/* # of netmap rxq's for each 10G port */
 	uint16_t nnmtxq1g;	/* # of netmap txq's for each 1G port */
 	uint16_t nnmrxq1g;	/* # of netmap rxq's for each 1G port */
 #endif
 };
 
 /*
  * Driver-side record for a single filter: status bits, the table entries it
  * refers to, and the filter specification itself.
  */
 struct filter_entry {
         uint32_t valid:1;	/* filter allocated and valid */
         uint32_t locked:1;	/* filter is administratively locked */
         uint32_t pending:1;	/* filter action is pending firmware reply */
 	uint32_t smtidx:8;	/* Source MAC Table index for smac */
 	struct l2t_entry *l2t;	/* Layer Two Table entry for dmac */
 
         /* presumably the specification this filter was created from */
         struct t4_filter_specification fs;
 };
 
 static int map_bars_0_and_4(struct adapter *);
 static int map_bar_2(struct adapter *);
 static void setup_memwin(struct adapter *);
 static int validate_mem_range(struct adapter *, uint32_t, int);
 static int fwmtype_to_hwmtype(int);
 static int validate_mt_off_len(struct adapter *, int, uint32_t, int,
     uint32_t *);
 static void memwin_info(struct adapter *, int, uint32_t *, uint32_t *);
 static uint32_t position_memwin(struct adapter *, int, uint32_t);
 static int cfg_itype_and_nqueues(struct adapter *, int, int,
     struct intrs_and_queues *);
 static int prep_firmware(struct adapter *);
 static int partition_resources(struct adapter *, const struct firmware *,
     const char *);
 static int get_params__pre_init(struct adapter *);
 static int get_params__post_init(struct adapter *);
 static int set_params__post_init(struct adapter *);
 static void t4_set_desc(struct adapter *);
 static void build_medialist(struct port_info *, struct ifmedia *);
 static int cxgbe_init_synchronized(struct port_info *);
 static int cxgbe_uninit_synchronized(struct port_info *);
 static int setup_intr_handlers(struct adapter *);
 static void quiesce_eq(struct adapter *, struct sge_eq *);
 static void quiesce_iq(struct adapter *, struct sge_iq *);
 static void quiesce_fl(struct adapter *, struct sge_fl *);
 static int t4_alloc_irq(struct adapter *, struct irq *, int rid,
     driver_intr_t *, void *, char *);
 static int t4_free_irq(struct adapter *, struct irq *);
 static void reg_block_dump(struct adapter *, uint8_t *, unsigned int,
     unsigned int);
 static void t4_get_regs(struct adapter *, struct t4_regdump *, uint8_t *);
 static void cxgbe_tick(void *);
 static void cxgbe_vlan_config(void *, struct ifnet *, uint16_t);
 static int cpl_not_handled(struct sge_iq *, const struct rss_header *,
     struct mbuf *);
 static int an_not_handled(struct sge_iq *, const struct rsp_ctrl *);
 static int fw_msg_not_handled(struct adapter *, const __be64 *);
 static int t4_sysctls(struct adapter *);
 static int cxgbe_sysctls(struct port_info *);
 static int sysctl_int_array(SYSCTL_HANDLER_ARGS);
 static int sysctl_bitfield(SYSCTL_HANDLER_ARGS);
 static int sysctl_btphy(SYSCTL_HANDLER_ARGS);
 static int sysctl_noflowq(SYSCTL_HANDLER_ARGS);
 static int sysctl_holdoff_tmr_idx(SYSCTL_HANDLER_ARGS);
 static int sysctl_holdoff_pktc_idx(SYSCTL_HANDLER_ARGS);
 static int sysctl_qsize_rxq(SYSCTL_HANDLER_ARGS);
 static int sysctl_qsize_txq(SYSCTL_HANDLER_ARGS);
+static int sysctl_pause_settings(SYSCTL_HANDLER_ARGS);
 static int sysctl_handle_t4_reg64(SYSCTL_HANDLER_ARGS);
 static int sysctl_temperature(SYSCTL_HANDLER_ARGS);
 #ifdef SBUF_DRAIN
 static int sysctl_cctrl(SYSCTL_HANDLER_ARGS);
 static int sysctl_cim_ibq_obq(SYSCTL_HANDLER_ARGS);
 static int sysctl_cim_la(SYSCTL_HANDLER_ARGS);
 static int sysctl_cim_ma_la(SYSCTL_HANDLER_ARGS);
 static int sysctl_cim_pif_la(SYSCTL_HANDLER_ARGS);
 static int sysctl_cim_qcfg(SYSCTL_HANDLER_ARGS);
 static int sysctl_cpl_stats(SYSCTL_HANDLER_ARGS);
 static int sysctl_ddp_stats(SYSCTL_HANDLER_ARGS);
 static int sysctl_devlog(SYSCTL_HANDLER_ARGS);
 static int sysctl_fcoe_stats(SYSCTL_HANDLER_ARGS);
 static int sysctl_hw_sched(SYSCTL_HANDLER_ARGS);
 static int sysctl_lb_stats(SYSCTL_HANDLER_ARGS);
 static int sysctl_linkdnrc(SYSCTL_HANDLER_ARGS);
 static int sysctl_meminfo(SYSCTL_HANDLER_ARGS);
 static int sysctl_mps_tcam(SYSCTL_HANDLER_ARGS);
 static int sysctl_path_mtus(SYSCTL_HANDLER_ARGS);
 static int sysctl_pm_stats(SYSCTL_HANDLER_ARGS);
 static int sysctl_rdma_stats(SYSCTL_HANDLER_ARGS);
 static int sysctl_tcp_stats(SYSCTL_HANDLER_ARGS);
 static int sysctl_tids(SYSCTL_HANDLER_ARGS);
 static int sysctl_tp_err_stats(SYSCTL_HANDLER_ARGS);
 static int sysctl_tp_la(SYSCTL_HANDLER_ARGS);
 static int sysctl_tx_rate(SYSCTL_HANDLER_ARGS);
 static int sysctl_ulprx_la(SYSCTL_HANDLER_ARGS);
 static int sysctl_wcwr_stats(SYSCTL_HANDLER_ARGS);
 #endif
 static inline void txq_start(struct ifnet *, struct sge_txq *);
 static uint32_t fconf_to_mode(uint32_t);
 static uint32_t mode_to_fconf(uint32_t);
 static uint32_t fspec_to_fconf(struct t4_filter_specification *);
 static int get_filter_mode(struct adapter *, uint32_t *);
 static int set_filter_mode(struct adapter *, uint32_t);
 static inline uint64_t get_filter_hits(struct adapter *, uint32_t);
 static int get_filter(struct adapter *, struct t4_filter *);
 static int set_filter(struct adapter *, struct t4_filter *);
 static int del_filter(struct adapter *, struct t4_filter *);
 static void clear_filter(struct filter_entry *);
 static int set_filter_wr(struct adapter *, int);
 static int del_filter_wr(struct adapter *, int);
 static int get_sge_context(struct adapter *, struct t4_sge_context *);
 static int load_fw(struct adapter *, struct t4_data *);
 static int read_card_mem(struct adapter *, int, struct t4_mem_range *);
 static int read_i2c(struct adapter *, struct t4_i2c_data *);
 static int set_sched_class(struct adapter *, struct t4_sched_params *);
 static int set_sched_queue(struct adapter *, struct t4_sched_queue *);
 #ifdef TCP_OFFLOAD
 static int toe_capability(struct port_info *, int);
 #endif
 static int mod_event(module_t, int, void *);
 
 /*
  * PCI device ids (and the description set on a match) recognized by
  * t4_probe() and t5_probe() respectively.  The vendor id is checked
  * separately in the probe routines.
  */
 struct {
 	uint16_t device;
 	char *desc;
 } t4_pciids[] = {
 	{0xa000, "Chelsio Terminator 4 FPGA"},
 	{0x4400, "Chelsio T440-dbg"},
 	{0x4401, "Chelsio T420-CR"},
 	{0x4402, "Chelsio T422-CR"},
 	{0x4403, "Chelsio T440-CR"},
 	{0x4404, "Chelsio T420-BCH"},
 	{0x4405, "Chelsio T440-BCH"},
 	{0x4406, "Chelsio T440-CH"},
 	{0x4407, "Chelsio T420-SO"},
 	{0x4408, "Chelsio T420-CX"},
 	{0x4409, "Chelsio T420-BT"},
 	{0x440a, "Chelsio T404-BT"},
 	{0x440e, "Chelsio T440-LP-CR"},
 }, t5_pciids[] = {
 	{0xb000, "Chelsio Terminator 5 FPGA"},
 	{0x5400, "Chelsio T580-dbg"},
 	{0x5401,  "Chelsio T520-CR"},		/* 2 x 10G */
 	{0x5402,  "Chelsio T522-CR"},		/* 2 x 10G, 2 X 1G */
 	{0x5403,  "Chelsio T540-CR"},		/* 4 x 10G */
 	{0x5407,  "Chelsio T520-SO"},		/* 2 x 10G, nomem */
 	{0x5409,  "Chelsio T520-BT"},		/* 2 x 10GBaseT */
 	{0x540a,  "Chelsio T504-BT"},		/* 4 x 1G */
 	{0x540d,  "Chelsio T580-CR"},		/* 2 x 40G */
 	{0x540e,  "Chelsio T540-LP-CR"},	/* 4 x 10G */
 	{0x5410,  "Chelsio T580-LP-CR"},	/* 2 x 40G */
 	{0x5411,  "Chelsio T520-LL-CR"},	/* 2 x 10G */
 	{0x5412,  "Chelsio T560-CR"},		/* 1 x 40G, 2 x 10G */
 	{0x5414,  "Chelsio T580-LP-SO-CR"},	/* 2 x 40G, nomem */
 	/* The ids below are compiled in only if "notyet" is defined. */
 #ifdef notyet
 	{0x5404,  "Chelsio T520-BCH"},
 	{0x5405,  "Chelsio T540-BCH"},
 	{0x5406,  "Chelsio T540-CH"},
 	{0x5408,  "Chelsio T520-CX"},
 	{0x540b,  "Chelsio B520-SR"},
 	{0x540c,  "Chelsio B504-BT"},
 	{0x540f,  "Chelsio Amsterdam"},
 	{0x5413,  "Chelsio T580-CHR"},
 #endif
 };
 
 #ifdef TCP_OFFLOAD
 /*
  * service_iq() has an iq and needs the fl.  Offset of fl from the iq should be
  * exactly the same for both rxq and ofld_rxq.
  */
 CTASSERT(offsetof(struct sge_ofld_rxq, iq) == offsetof(struct sge_rxq, iq));
 CTASSERT(offsetof(struct sge_ofld_rxq, fl) == offsetof(struct sge_rxq, fl));
 #endif
 
 /* No easy way to include t4_msg.h before adapter.h so we check this way */
 CTASSERT(nitems(((struct adapter *)0)->cpl_handler) == NUM_CPL_CMDS);
 CTASSERT(nitems(((struct adapter *)0)->fw_msg_handler) == NUM_FW6_TYPES);
 
 CTASSERT(sizeof(struct cluster_metadata) <= CL_METADATA_SIZE);
 
 /*
  * Probe method for the T4 nexus.  Accepts a Chelsio device whose PCI device
  * id is listed in t4_pciids[] and sets its description; everything else is
  * rejected with ENXIO.
  */
 static int
 t4_probe(device_t dev)
 {
 	const uint16_t vendor = pci_get_vendor(dev);
 	const uint16_t devid = pci_get_device(dev);
 	const uint8_t func = pci_get_function(dev);
 	int idx;
 
 	/* Not Chelsio, or a function other than PF0 of the FPGA. */
 	if (vendor != PCI_VENDOR_ID_CHELSIO || (devid == 0xa000 && func != 0))
 		return (ENXIO);
 
 	for (idx = 0; idx < nitems(t4_pciids); idx++) {
 		if (t4_pciids[idx].device != devid)
 			continue;
 
 		device_set_desc(dev, t4_pciids[idx].desc);
 		return (BUS_PROBE_DEFAULT);
 	}
 
 	return (ENXIO);
 }
 
 /*
  * Probe method for the T5 nexus.  Accepts a Chelsio device whose PCI device
  * id is listed in t5_pciids[] and sets its description; everything else is
  * rejected with ENXIO.
  */
 static int
 t5_probe(device_t dev)
 {
 	const uint16_t vendor = pci_get_vendor(dev);
 	const uint16_t devid = pci_get_device(dev);
 	const uint8_t func = pci_get_function(dev);
 	int idx;
 
 	/* Not Chelsio, or a function other than PF0 of the FPGA. */
 	if (vendor != PCI_VENDOR_ID_CHELSIO || (devid == 0xb000 && func != 0))
 		return (ENXIO);
 
 	for (idx = 0; idx < nitems(t5_pciids); idx++) {
 		if (t5_pciids[idx].device != devid)
 			continue;
 
 		device_set_desc(dev, t5_pciids[idx].desc);
 		return (BUS_PROBE_DEFAULT);
 	}
 
 	return (ENXIO);
 }
 
 /*
  * Attach method shared by the T4 (t4nex) and T5 (t5nex) nexus drivers.
  *
  * Brings the adapter up in stages: PCIe tuning, locks and membership in
  * t4_list, BAR mapping, default CPL/async/firmware message handlers, adapter
  * preparation, the nexus character device, firmware and parameter setup, a
  * first pass over the ports (port init, link start, child device creation),
  * interrupt/queue sizing and allocation of the queue arrays, a second pass
  * that assigns each port its range of queues, and finally the interrupt
  * handlers and attachment of the child port devices.
  *
  * Returns 0 on success.  If something fails after the character device has
  * been created the failure is reported and 0 is still returned, leaving the
  * adapter in "recovery mode"; in any other failure case t4_detach() is called
  * to undo the partial setup and the error is returned.
  */
 static int
 t4_attach(device_t dev)
 {
 	struct adapter *sc;
 	int rc = 0, i, n10g, n1g, rqidx, tqidx;
 	struct intrs_and_queues iaq;
 	struct sge *s;
 #ifdef TCP_OFFLOAD
 	int ofld_rqidx, ofld_tqidx;
 #endif
 #ifdef DEV_NETMAP
 	int nm_rqidx, nm_tqidx;
 #endif
 	const char *pcie_ts;
 
 	sc = device_get_softc(dev);
 	sc->dev = dev;
 
 	pci_enable_busmaster(dev);
 	if (pci_find_cap(dev, PCIY_EXPRESS, &i) == 0) {
 		uint32_t v;
 
 		pci_set_max_read_req(dev, 4096);
 		v = pci_read_config(dev, i + PCIER_DEVICE_CTL, 2);
 		v |= PCIEM_CTL_RELAXED_ORD_ENABLE;
 		pci_write_config(dev, i + PCIER_DEVICE_CTL, v, 2);
 	}
 
 	sc->traceq = -1;
 	/*
 	 * Format the lock name before passing it to mtx_init() so the name is
 	 * already valid when the mutex is initialized (same order as is used
 	 * for sc_lock below).
 	 */
 	snprintf(sc->ifp_lockname, sizeof(sc->ifp_lockname), "%s tracer",
 	    device_get_nameunit(dev));
 	mtx_init(&sc->ifp_lock, sc->ifp_lockname, 0, MTX_DEF);
 
 	snprintf(sc->lockname, sizeof(sc->lockname), "%s",
 	    device_get_nameunit(dev));
 	mtx_init(&sc->sc_lock, sc->lockname, 0, MTX_DEF);
 	sx_xlock(&t4_list_lock);
 	SLIST_INSERT_HEAD(&t4_list, sc, link);
 	sx_xunlock(&t4_list_lock);
 
 	mtx_init(&sc->sfl_lock, "starving freelists", 0, MTX_DEF);
 	TAILQ_INIT(&sc->sfl);
 	callout_init(&sc->sfl_callout, CALLOUT_MPSAFE);
 
 	rc = map_bars_0_and_4(sc);
 	if (rc != 0)
 		goto done; /* error message displayed already */
 
 	/*
 	 * This is the real PF# to which we're attaching.  Works from within PCI
 	 * passthrough environments too, where pci_get_function() could return a
 	 * different PF# depending on the passthrough configuration.  We need to
 	 * use the real PF# in all our communication with the firmware.
 	 */
 	sc->pf = G_SOURCEPF(t4_read_reg(sc, A_PL_WHOAMI));
 	sc->mbox = sc->pf;
 
 	/* Start with every handler pointing at its "not handled" stub. */
 	memset(sc->chan_map, 0xff, sizeof(sc->chan_map));
 	sc->an_handler = an_not_handled;
 	for (i = 0; i < nitems(sc->cpl_handler); i++)
 		sc->cpl_handler[i] = cpl_not_handled;
 	for (i = 0; i < nitems(sc->fw_msg_handler); i++)
 		sc->fw_msg_handler[i] = fw_msg_not_handled;
 	t4_register_cpl_handler(sc, CPL_SET_TCB_RPL, t4_filter_rpl);
 	t4_register_cpl_handler(sc, CPL_TRACE_PKT, t4_trace_pkt);
 	t4_register_cpl_handler(sc, CPL_TRACE_PKT_T5, t5_trace_pkt);
 	t4_init_sge_cpl_handlers(sc);
 
 	/* Prepare the adapter for operation */
 	rc = -t4_prep_adapter(sc);
 	if (rc != 0) {
 		device_printf(dev, "failed to prepare adapter: %d.\n", rc);
 		goto done;
 	}
 
 	/*
 	 * Do this really early, with the memory windows set up even before the
 	 * character device.  The userland tool's register i/o and mem read
 	 * will work even in "recovery mode".
 	 */
 	setup_memwin(sc);
 	sc->cdev = make_dev(is_t4(sc) ? &t4_cdevsw : &t5_cdevsw,
 	    device_get_unit(dev), UID_ROOT, GID_WHEEL, 0600, "%s",
 	    device_get_nameunit(dev));
 	if (sc->cdev == NULL)
 		device_printf(dev, "failed to create nexus char device.\n");
 	else
 		sc->cdev->si_drv1 = sc;
 
 	/* Go no further if recovery mode has been requested. */
 	if (TUNABLE_INT_FETCH("hw.cxgbe.sos", &i) && i != 0) {
 		device_printf(dev, "recovery mode.\n");
 		goto done;
 	}
 
 	/* Prepare the firmware for operation */
 	rc = prep_firmware(sc);
 	if (rc != 0)
 		goto done; /* error message displayed already */
 
 	rc = get_params__post_init(sc);
 	if (rc != 0)
 		goto done; /* error message displayed already */
 
 	rc = set_params__post_init(sc);
 	if (rc != 0)
 		goto done; /* error message displayed already */
 
 	rc = map_bar_2(sc);
 	if (rc != 0)
 		goto done; /* error message displayed already */
 
 	rc = t4_create_dma_tag(sc);
 	if (rc != 0)
 		goto done; /* error message displayed already */
 
 	/*
 	 * First pass over all the ports - allocate VIs and initialize some
 	 * basic parameters like mac address, port type, etc.  We also figure
 	 * out whether a port is 10G or 1G and use that information when
 	 * calculating how many interrupts to attempt to allocate.
 	 */
 	n10g = n1g = 0;
 	for_each_port(sc, i) {
 		struct port_info *pi;
 
 		pi = malloc(sizeof(*pi), M_CXGBE, M_ZERO | M_WAITOK);
 		sc->port[i] = pi;
 
 		/* These must be set before t4_port_init */
 		pi->adapter = sc;
 		pi->port_id = i;
 
 		/* Allocate the vi and initialize parameters like mac addr */
 		rc = -t4_port_init(pi, sc->mbox, sc->pf, 0);
 		if (rc != 0) {
 			device_printf(dev, "unable to initialize port %d: %d\n",
 			    i, rc);
 			free(pi, M_CXGBE);
 			sc->port[i] = NULL;
 			goto done;
 		}
+
+		pi->link_cfg.requested_fc &= ~(PAUSE_TX | PAUSE_RX);
+		pi->link_cfg.requested_fc |= t4_pause_settings;
+		pi->link_cfg.fc &= ~(PAUSE_TX | PAUSE_RX);
+		pi->link_cfg.fc |= t4_pause_settings;
+
 		rc = -t4_link_start(sc, sc->mbox, pi->tx_chan, &pi->link_cfg);
 		if (rc != 0) {
 			device_printf(dev, "port %d l1cfg failed: %d\n", i, rc);
 			free(pi, M_CXGBE);
 			sc->port[i] = NULL;
 			goto done;
 		}
 
 		snprintf(pi->lockname, sizeof(pi->lockname), "%sp%d",
 		    device_get_nameunit(dev), i);
 		mtx_init(&pi->pi_lock, pi->lockname, 0, MTX_DEF);
 		sc->chan_map[pi->tx_chan] = i;
 
 		/* 40G ports are counted and tuned together with 10G ports. */
 		if (is_10G_port(pi) || is_40G_port(pi)) {
 			n10g++;
 			pi->tmr_idx = t4_tmr_idx_10g;
 			pi->pktc_idx = t4_pktc_idx_10g;
 		} else {
 			n1g++;
 			pi->tmr_idx = t4_tmr_idx_1g;
 			pi->pktc_idx = t4_pktc_idx_1g;
 		}
 
 		pi->xact_addr_filt = -1;
 		pi->linkdnrc = -1;
 
 		pi->qsize_rxq = t4_qsize_rxq;
 		pi->qsize_txq = t4_qsize_txq;
 
 		pi->dev = device_add_child(dev, is_t4(sc) ? "cxgbe" : "cxl", -1);
 		if (pi->dev == NULL) {
 			device_printf(dev,
 			    "failed to add device for port %d.\n", i);
 			rc = ENXIO;
 			goto done;
 		}
 		device_set_softc(pi->dev, pi);
 	}
 
 	/*
 	 * Interrupt type, # of interrupts, # of rx/tx queues, etc.
 	 */
 	rc = cfg_itype_and_nqueues(sc, n10g, n1g, &iaq);
 	if (rc != 0)
 		goto done; /* error message displayed already */
 
 	sc->intr_type = iaq.intr_type;
 	sc->intr_count = iaq.nirq;
 
 	s = &sc->sge;
 	s->nrxq = n10g * iaq.nrxq10g + n1g * iaq.nrxq1g;
 	s->ntxq = n10g * iaq.ntxq10g + n1g * iaq.ntxq1g;
 	s->neq = s->ntxq + s->nrxq;	/* the free list in an rxq is an eq */
 	s->neq += sc->params.nports + 1;/* ctrl queues: 1 per port + 1 mgmt */
 	s->niq = s->nrxq + 1;		/* 1 extra for firmware event queue */
 #ifdef TCP_OFFLOAD
 	if (is_offload(sc)) {
 		s->nofldrxq = n10g * iaq.nofldrxq10g + n1g * iaq.nofldrxq1g;
 		s->nofldtxq = n10g * iaq.nofldtxq10g + n1g * iaq.nofldtxq1g;
 		s->neq += s->nofldtxq + s->nofldrxq;
 		s->niq += s->nofldrxq;
 
 		s->ofld_rxq = malloc(s->nofldrxq * sizeof(struct sge_ofld_rxq),
 		    M_CXGBE, M_ZERO | M_WAITOK);
 		s->ofld_txq = malloc(s->nofldtxq * sizeof(struct sge_wrq),
 		    M_CXGBE, M_ZERO | M_WAITOK);
 	}
 #endif
 #ifdef DEV_NETMAP
 	s->nnmrxq = n10g * iaq.nnmrxq10g + n1g * iaq.nnmrxq1g;
 	s->nnmtxq = n10g * iaq.nnmtxq10g + n1g * iaq.nnmtxq1g;
 	s->neq += s->nnmtxq + s->nnmrxq;
 	s->niq += s->nnmrxq;
 
 	s->nm_rxq = malloc(s->nnmrxq * sizeof(struct sge_nm_rxq),
 	    M_CXGBE, M_ZERO | M_WAITOK);
 	s->nm_txq = malloc(s->nnmtxq * sizeof(struct sge_nm_txq),
 	    M_CXGBE, M_ZERO | M_WAITOK);
 #endif
 
 	s->ctrlq = malloc(sc->params.nports * sizeof(struct sge_wrq), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 	s->rxq = malloc(s->nrxq * sizeof(struct sge_rxq), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 	s->txq = malloc(s->ntxq * sizeof(struct sge_txq), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 	s->iqmap = malloc(s->niq * sizeof(struct sge_iq *), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 	s->eqmap = malloc(s->neq * sizeof(struct sge_eq *), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 
 	sc->irq = malloc(sc->intr_count * sizeof(struct irq), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 
 	t4_init_l2t(sc, M_WAITOK);
 
 	/*
 	 * Second pass over the ports.  This time we know the number of rx and
 	 * tx queues that each port should get.
 	 */
 	rqidx = tqidx = 0;
 #ifdef TCP_OFFLOAD
 	ofld_rqidx = ofld_tqidx = 0;
 #endif
 #ifdef DEV_NETMAP
 	nm_rqidx = nm_tqidx = 0;
 #endif
 	for_each_port(sc, i) {
 		struct port_info *pi = sc->port[i];
 
 		if (pi == NULL)
 			continue;
 
 		pi->first_rxq = rqidx;
 		pi->first_txq = tqidx;
 		if (is_10G_port(pi) || is_40G_port(pi)) {
 			pi->flags |= iaq.intr_flags_10g;
 			pi->nrxq = iaq.nrxq10g;
 			pi->ntxq = iaq.ntxq10g;
 		} else {
 			pi->flags |= iaq.intr_flags_1g;
 			pi->nrxq = iaq.nrxq1g;
 			pi->ntxq = iaq.ntxq1g;
 		}
 
 		/* Reserving a txq is only honored with more than one txq. */
 		if (pi->ntxq > 1)
 			pi->rsrv_noflowq = iaq.rsrv_noflowq ? 1 : 0;
 		else
 			pi->rsrv_noflowq = 0;
 
 		rqidx += pi->nrxq;
 		tqidx += pi->ntxq;
 #ifdef TCP_OFFLOAD
 		if (is_offload(sc)) {
 			pi->first_ofld_rxq = ofld_rqidx;
 			pi->first_ofld_txq = ofld_tqidx;
 			if (is_10G_port(pi) || is_40G_port(pi)) {
 				pi->nofldrxq = iaq.nofldrxq10g;
 				pi->nofldtxq = iaq.nofldtxq10g;
 			} else {
 				pi->nofldrxq = iaq.nofldrxq1g;
 				pi->nofldtxq = iaq.nofldtxq1g;
 			}
 			ofld_rqidx += pi->nofldrxq;
 			ofld_tqidx += pi->nofldtxq;
 		}
 #endif
 #ifdef DEV_NETMAP
 		pi->first_nm_rxq = nm_rqidx;
 		pi->first_nm_txq = nm_tqidx;
 		if (is_10G_port(pi) || is_40G_port(pi)) {
 			pi->nnmrxq = iaq.nnmrxq10g;
 			pi->nnmtxq = iaq.nnmtxq10g;
 		} else {
 			pi->nnmrxq = iaq.nnmrxq1g;
 			pi->nnmtxq = iaq.nnmtxq1g;
 		}
 		nm_rqidx += pi->nnmrxq;
 		nm_tqidx += pi->nnmtxq;
 #endif
 	}
 
 	rc = setup_intr_handlers(sc);
 	if (rc != 0) {
 		device_printf(dev,
 		    "failed to setup interrupt handlers: %d\n", rc);
 		goto done;
 	}
 
 	rc = bus_generic_attach(dev);
 	if (rc != 0) {
 		device_printf(dev,
 		    "failed to attach all child ports: %d\n", rc);
 		goto done;
 	}
 
 	/* Translate the PCIe link speed code into text for the banner. */
 	switch (sc->params.pci.speed) {
 		case 0x1:
 			pcie_ts = "2.5";
 			break;
 		case 0x2:
 			pcie_ts = "5.0";
 			break;
 		case 0x3:
 			pcie_ts = "8.0";
 			break;
 		default:
 			pcie_ts = "??";
 			break;
 	}
 	device_printf(dev,
 	    "PCIe x%d (%s GTS/s) (%d), %d ports, %d %s interrupt%s, %d eq, %d iq\n",
 	    sc->params.pci.width, pcie_ts, sc->params.pci.speed,
 	    sc->params.nports, sc->intr_count,
 	    sc->intr_type == INTR_MSIX ? "MSI-X" :
 	    (sc->intr_type == INTR_MSI ? "MSI" : "INTx"),
 	    sc->intr_count > 1 ? "s" : "", sc->sge.neq, sc->sge.niq);
 
 	t4_set_desc(sc);
 
 done:
 	if (rc != 0 && sc->cdev) {
 		/* cdev was created and so cxgbetool works; recover that way. */
 		device_printf(dev,
 		    "error during attach, adapter is now in recovery mode.\n");
 		rc = 0;
 	}
 
 	if (rc != 0)
 		t4_detach(dev);
 	else
 		t4_sysctls(sc);
 
 	return (rc);
 }
 
 /*
  * Detach method for the nexus; also called by t4_attach() to undo a partial
  * attach.
  *
  * Idempotent: each resource is released only if the softc shows that it was
  * set up, and the softc is zeroed at the end.
  *
  * Returns 0 on success, or the error from bus_generic_detach() if the child
  * devices could not be detached (in which case the rest of the teardown is
  * not performed).
  */
 static int
 t4_detach(device_t dev)
 {
 	struct adapter *sc;
 	struct port_info *pi;
 	int i, rc;
 
 	sc = device_get_softc(dev);
 
 	if (sc->flags & FULL_INIT_DONE)
 		t4_intr_disable(sc);
 
 	if (sc->cdev) {
 		destroy_dev(sc->cdev);
 		sc->cdev = NULL;
 	}
 
 	/* Detach the child (port) devices before tearing anything else down. */
 	rc = bus_generic_detach(dev);
 	if (rc) {
 		device_printf(dev,
 		    "failed to detach child devices: %d\n", rc);
 		return (rc);
 	}
 
 	for (i = 0; i < sc->intr_count; i++)
 		t4_free_irq(sc, &sc->irq[i]);
 
 	/* Release every port that was allocated during attach. */
 	for (i = 0; i < MAX_NPORTS; i++) {
 		pi = sc->port[i];
 		if (pi) {
 			t4_free_vi(sc, sc->mbox, sc->pf, 0, pi->viid);
 			if (pi->dev)
 				device_delete_child(dev, pi->dev);
 
 			mtx_destroy(&pi->pi_lock);
 			free(pi, M_CXGBE);
 		}
 	}
 
 	if (sc->flags & FULL_INIT_DONE)
 		adapter_full_uninit(sc);
 
 	if (sc->flags & FW_OK)
 		t4_fw_bye(sc, sc->mbox);
 
 	if (sc->intr_type == INTR_MSI || sc->intr_type == INTR_MSIX)
 		pci_release_msi(dev);
 
 	/* Release whichever memory resources were mapped. */
 	if (sc->regs_res)
 		bus_release_resource(dev, SYS_RES_MEMORY, sc->regs_rid,
 		    sc->regs_res);
 
 	if (sc->udbs_res)
 		bus_release_resource(dev, SYS_RES_MEMORY, sc->udbs_rid,
 		    sc->udbs_res);
 
 	if (sc->msix_res)
 		bus_release_resource(dev, SYS_RES_MEMORY, sc->msix_rid,
 		    sc->msix_res);
 
 	if (sc->l2t)
 		t4_free_l2t(sc->l2t);
 
 	/* Free the queue arrays and maps allocated by t4_attach(). */
 #ifdef TCP_OFFLOAD
 	free(sc->sge.ofld_rxq, M_CXGBE);
 	free(sc->sge.ofld_txq, M_CXGBE);
 #endif
 #ifdef DEV_NETMAP
 	free(sc->sge.nm_rxq, M_CXGBE);
 	free(sc->sge.nm_txq, M_CXGBE);
 #endif
 	free(sc->irq, M_CXGBE);
 	free(sc->sge.rxq, M_CXGBE);
 	free(sc->sge.txq, M_CXGBE);
 	free(sc->sge.ctrlq, M_CXGBE);
 	free(sc->sge.iqmap, M_CXGBE);
 	free(sc->sge.eqmap, M_CXGBE);
 	free(sc->tids.ftid_tab, M_CXGBE);
 	t4_destroy_dma_tag(sc);
 	/*
 	 * t4_attach() adds the adapter to t4_list right after initializing
 	 * sc_lock, so an initialized sc_lock is used as the indication that
 	 * the adapter has to be removed from the list.
 	 */
 	if (mtx_initialized(&sc->sc_lock)) {
 		sx_xlock(&t4_list_lock);
 		SLIST_REMOVE(&t4_list, sc, adapter, link);
 		sx_xunlock(&t4_list_lock);
 		mtx_destroy(&sc->sc_lock);
 	}
 
 	if (mtx_initialized(&sc->tids.ftid_lock))
 		mtx_destroy(&sc->tids.ftid_lock);
 	if (mtx_initialized(&sc->sfl_lock))
 		mtx_destroy(&sc->sfl_lock);
 	if (mtx_initialized(&sc->ifp_lock))
 		mtx_destroy(&sc->ifp_lock);
 
 	/* Wipe the softc so that a repeated call finds nothing to release. */
 	bzero(sc, sizeof(*sc));
 
 	return (0);
 }
 
 /*
  * Probe method for a port device.  Sets the description to
  * "port <port_id>" (taken from the port's softc) and always accepts the
  * device.
  */
 static int
 cxgbe_probe(device_t dev)
 {
 	struct port_info *port;
 	char desc[128];
 
 	port = device_get_softc(dev);
 	snprintf(desc, sizeof(desc), "port %d", port->port_id);
 	device_set_desc_copy(dev, desc);
 
 	return (BUS_PROBE_DEFAULT);
 }
 
 /*
  * T4_CAP is the set of interface capabilities that cxgbe_attach() stores in
  * if_capabilities; T4_CAP_ENABLE is the set it stores in if_capenable (it is
  * currently the same set).
  */
 #define T4_CAP (IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_HWCSUM | \
     IFCAP_VLAN_HWCSUM | IFCAP_TSO | IFCAP_JUMBO_MTU | IFCAP_LRO | \
     IFCAP_VLAN_HWTSO | IFCAP_LINKSTATE | IFCAP_HWCSUM_IPV6 | IFCAP_HWSTATS)
 #define T4_CAP_ENABLE (T4_CAP)
 
 /*
  * Attach method for a port device.  Allocates the port's ifnet, fills in its
  * methods, capabilities and media list, attaches it to the Ethernet layer,
  * and prints a one-line summary of the port's queue counts.
  *
  * Returns 0 on success, or ENOMEM if the ifnet cannot be allocated.
  */
 static int
 cxgbe_attach(device_t dev)
 {
 	struct port_info *pi = device_get_softc(dev);
 	struct ifnet *ifp;
 	char *s;
 	int n, o;
 
 	/* Allocate an ifnet and set it up */
 	ifp = if_alloc(IFT_ETHER);
 	if (ifp == NULL) {
 		device_printf(dev, "Cannot allocate ifnet\n");
 		return (ENOMEM);
 	}
 	pi->ifp = ifp;
 	ifp->if_softc = pi;
 
 	callout_init(&pi->tick, CALLOUT_MPSAFE);
 
 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
 
 	ifp->if_init = cxgbe_init;
 	ifp->if_ioctl = cxgbe_ioctl;
 	ifp->if_transmit = cxgbe_transmit;
 	ifp->if_qflush = cxgbe_qflush;
 
 	/* IFCAP_TOE is advertised (when offload-capable) but not enabled. */
 	ifp->if_capabilities = T4_CAP;
 #ifdef TCP_OFFLOAD
 	if (is_offload(pi->adapter))
 		ifp->if_capabilities |= IFCAP_TOE;
 #endif
 	ifp->if_capenable = T4_CAP_ENABLE;
 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_IP | CSUM_TSO |
 	    CSUM_UDP_IPV6 | CSUM_TCP_IPV6;
 
 	/* Initialize ifmedia for this port */
 	ifmedia_init(&pi->media, IFM_IMASK, cxgbe_media_change,
 	    cxgbe_media_status);
 	build_medialist(pi, &pi->media);
 
 	pi->vlan_c = EVENTHANDLER_REGISTER(vlan_config, cxgbe_vlan_config, ifp,
 	    EVENTHANDLER_PRI_ANY);
 
 	ether_ifattach(ifp, pi->hw_addr);
 
 	/*
 	 * Build the queue-count summary in a temporary buffer.  'o' tracks the
 	 * number of characters written so far; the MPASS checks assert that
 	 * the text has not been truncated.
 	 */
 	n = 128;
 	s = malloc(n, M_CXGBE, M_WAITOK);
 	o = snprintf(s, n, "%d txq, %d rxq (NIC)", pi->ntxq, pi->nrxq);
 	MPASS(n > o);
 #ifdef TCP_OFFLOAD
 	if (is_offload(pi->adapter)) {
 		o += snprintf(s + o, n - o, "; %d txq, %d rxq (TOE)",
 		    pi->nofldtxq, pi->nofldrxq);
 		MPASS(n > o);
 	}
 #endif
 #ifdef DEV_NETMAP
 	o += snprintf(s + o, n - o, "; %d txq, %d rxq (netmap)", pi->nnmtxq,
 	    pi->nnmrxq);
 	MPASS(n > o);
 #endif
 	device_printf(dev, "%s\n", s);
 	free(s, M_CXGBE);
 
 #ifdef DEV_NETMAP
 	/* nm_media handled here to keep implementation private to this file */
 	ifmedia_init(&pi->nm_media, IFM_IMASK, cxgbe_media_change,
 	    cxgbe_media_status);
 	build_medialist(pi, &pi->nm_media);
 	create_netmap_ifnet(pi);	/* logs errors if something fails */
 #endif
 	cxgbe_sysctls(pi);
 
 	return (0);
 }
 
 /*
  * Detach method for a port device.  Marks the port as doomed, waits until
  * the adapter is no longer busy and then marks it busy for the duration of
  * the teardown, stops the tick callout, uninitializes the port, detaches and
  * frees the ifnet, and finally clears the busy state and wakes up anyone
  * sleeping on sc->flags.
  *
  * Always returns 0.
  */
 static int
 cxgbe_detach(device_t dev)
 {
 	struct port_info *pi = device_get_softc(dev);
 	struct adapter *sc = pi->adapter;
 	struct ifnet *ifp = pi->ifp;
 
 	/* Tell if_ioctl and if_init that the port is going away */
 	ADAPTER_LOCK(sc);
 	SET_DOOMED(pi);
 	wakeup(&sc->flags);
 	/* mtx_sleep is passed sc->sc_lock as the interlock while waiting. */
 	while (IS_BUSY(sc))
 		mtx_sleep(&sc->flags, &sc->sc_lock, 0, "t4detach", 0);
 	SET_BUSY(sc);
 #ifdef INVARIANTS
 	sc->last_op = "t4detach";
 	sc->last_op_thr = curthread;
 #endif
 	ADAPTER_UNLOCK(sc);
 
 	if (pi->flags & HAS_TRACEQ) {
 		sc->traceq = -1;	/* cloner should not create ifnet */
 		t4_tracer_port_detach(sc);
 	}
 
 	if (pi->vlan_c)
 		EVENTHANDLER_DEREGISTER(vlan_config, pi->vlan_c);
 
 	/*
 	 * Clear IFF_DRV_RUNNING and stop the tick under the port lock, then
 	 * drain the callout after the lock has been dropped.
 	 */
 	PORT_LOCK(pi);
 	ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 	callout_stop(&pi->tick);
 	PORT_UNLOCK(pi);
 	callout_drain(&pi->tick);
 
 	/* Let detach proceed even if these fail. */
 	cxgbe_uninit_synchronized(pi);
 	port_full_uninit(pi);
 
 	ifmedia_removeall(&pi->media);
 	ether_ifdetach(pi->ifp);
 	if_free(pi->ifp);
 
 #ifdef DEV_NETMAP
 	/* XXXNM: equivalent of cxgbe_uninit_synchronized to ifdown nm_ifp */
 	destroy_netmap_ifnet(pi);
 #endif
 
 	/* Teardown complete: release the busy state taken above. */
 	ADAPTER_LOCK(sc);
 	CLR_BUSY(sc);
 	wakeup(&sc->flags);
 	ADAPTER_UNLOCK(sc);
 
 	return (0);
 }
 
 static void
 cxgbe_init(void *arg)
 {
 	struct port_info *pi = arg;
 	struct adapter *sc = pi->adapter;
 
 	if (begin_synchronized_op(sc, pi, SLEEP_OK | INTR_OK, "t4init") != 0)
 		return;
 	cxgbe_init_synchronized(pi);
 	end_synchronized_op(sc, 0);
 }
 
 /*
  * if_ioctl method for a port's ifnet.
  *
  * Handles SIOCSIFMTU, SIOCSIFFLAGS, SIOCADDMULTI/SIOCDELMULTI, SIOCSIFCAP and
  * SIOCSIFMEDIA/SIOCGIFMEDIA itself and hands everything else to
  * ether_ioctl().  Requests that change port state run inside a synchronized
  * operation (begin_synchronized_op/end_synchronized_op); if that operation
  * cannot be started its error is returned immediately.
  *
  * Returns 0 on success or an errno value.
  */
 static int
 cxgbe_ioctl(struct ifnet *ifp, unsigned long cmd, caddr_t data)
 {
 	int rc = 0, mtu, flags, can_sleep;
 	struct port_info *pi = ifp->if_softc;
 	struct adapter *sc = pi->adapter;
 	struct ifreq *ifr = (struct ifreq *)data;
 	uint32_t mask;
 
 	switch (cmd) {
 	case SIOCSIFMTU:
 		mtu = ifr->ifr_mtu;
 		if ((mtu < ETHERMIN) || (mtu > ETHERMTU_JUMBO))
 			return (EINVAL);
 
 		rc = begin_synchronized_op(sc, pi, SLEEP_OK | INTR_OK, "t4mtu");
 		if (rc)
 			return (rc);
 		ifp->if_mtu = mtu;
 		if (pi->flags & PORT_INIT_DONE) {
 			t4_update_fl_bufsize(ifp);
 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
 				rc = update_mac_settings(ifp, XGMAC_MTU);
 		}
 		end_synchronized_op(sc, 0);
 		break;
 
 	case SIOCSIFFLAGS:
 		/*
 		 * Start with the lock held (can_sleep == 0).  Whenever the
 		 * work that turns out to be needed requires the other mode,
 		 * end the operation, flip can_sleep, and start over.
 		 */
 		can_sleep = 0;
 redo_sifflags:
 		rc = begin_synchronized_op(sc, pi,
 		    can_sleep ? (SLEEP_OK | INTR_OK) : HOLD_LOCK, "t4flg");
 		if (rc)
 			return (rc);
 
 		if (ifp->if_flags & IFF_UP) {
 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
 				flags = pi->if_flags;
 				if ((ifp->if_flags ^ flags) &
 				    (IFF_PROMISC | IFF_ALLMULTI)) {
 					if (can_sleep == 1) {
 						end_synchronized_op(sc, 0);
 						can_sleep = 0;
 						goto redo_sifflags;
 					}
 					rc = update_mac_settings(ifp,
 					    XGMAC_PROMISC | XGMAC_ALLMULTI);
 				}
 			} else {
 				if (can_sleep == 0) {
 					end_synchronized_op(sc, LOCK_HELD);
 					can_sleep = 1;
 					goto redo_sifflags;
 				}
 				rc = cxgbe_init_synchronized(pi);
 			}
 			pi->if_flags = ifp->if_flags;
 		} else if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
 			if (can_sleep == 0) {
 				end_synchronized_op(sc, LOCK_HELD);
 				can_sleep = 1;
 				goto redo_sifflags;
 			}
 			rc = cxgbe_uninit_synchronized(pi);
 		}
 		end_synchronized_op(sc, can_sleep ? 0 : LOCK_HELD);
 		break;
 
 	case SIOCADDMULTI:
 	case SIOCDELMULTI: /* these two are called with a mutex held :-( */
 		rc = begin_synchronized_op(sc, pi, HOLD_LOCK, "t4multi");
 		if (rc)
 			return (rc);
 		if (ifp->if_drv_flags & IFF_DRV_RUNNING)
 			rc = update_mac_settings(ifp, XGMAC_MCADDRS);
 		end_synchronized_op(sc, LOCK_HELD);
 		break;
 
 	case SIOCSIFCAP:
 		rc = begin_synchronized_op(sc, pi, SLEEP_OK | INTR_OK, "t4cap");
 		if (rc)
 			return (rc);
 
 		/* mask has a bit set for every capability being toggled. */
 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
 		if (mask & IFCAP_TXCSUM) {
 			ifp->if_capenable ^= IFCAP_TXCSUM;
 			ifp->if_hwassist ^= (CSUM_TCP | CSUM_UDP | CSUM_IP);
 
 			if (IFCAP_TSO4 & ifp->if_capenable &&
 			    !(IFCAP_TXCSUM & ifp->if_capenable)) {
 				ifp->if_capenable &= ~IFCAP_TSO4;
 				if_printf(ifp,
 				    "tso4 disabled due to -txcsum.\n");
 			}
 		}
 		if (mask & IFCAP_TXCSUM_IPV6) {
 			ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
 			ifp->if_hwassist ^= (CSUM_UDP_IPV6 | CSUM_TCP_IPV6);
 
 			if (IFCAP_TSO6 & ifp->if_capenable &&
 			    !(IFCAP_TXCSUM_IPV6 & ifp->if_capenable)) {
 				ifp->if_capenable &= ~IFCAP_TSO6;
 				if_printf(ifp,
 				    "tso6 disabled due to -txcsum6.\n");
 			}
 		}
 		if (mask & IFCAP_RXCSUM)
 			ifp->if_capenable ^= IFCAP_RXCSUM;
 		if (mask & IFCAP_RXCSUM_IPV6)
 			ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
 
 		/*
 		 * Note that we leave CSUM_TSO alone (it is always set).  The
 		 * kernel takes both IFCAP_TSOx and CSUM_TSO into account before
 		 * sending a TSO request our way, so it's sufficient to toggle
 		 * IFCAP_TSOx only.
 		 */
 		if (mask & IFCAP_TSO4) {
 			if (!(IFCAP_TSO4 & ifp->if_capenable) &&
 			    !(IFCAP_TXCSUM & ifp->if_capenable)) {
 				if_printf(ifp, "enable txcsum first.\n");
 				rc = EAGAIN;
 				goto fail;
 			}
 			ifp->if_capenable ^= IFCAP_TSO4;
 		}
 		if (mask & IFCAP_TSO6) {
 			if (!(IFCAP_TSO6 & ifp->if_capenable) &&
 			    !(IFCAP_TXCSUM_IPV6 & ifp->if_capenable)) {
 				if_printf(ifp, "enable txcsum6 first.\n");
 				rc = EAGAIN;
 				goto fail;
 			}
 			ifp->if_capenable ^= IFCAP_TSO6;
 		}
 		if (mask & IFCAP_LRO) {
 #if defined(INET) || defined(INET6)
 			int i;
 			struct sge_rxq *rxq;
 
 			ifp->if_capenable ^= IFCAP_LRO;
 			for_each_rxq(pi, i, rxq) {
 				if (ifp->if_capenable & IFCAP_LRO)
 					rxq->iq.flags |= IQ_LRO_ENABLED;
 				else
 					rxq->iq.flags &= ~IQ_LRO_ENABLED;
 			}
 #endif
 		}
 #ifdef TCP_OFFLOAD
 		if (mask & IFCAP_TOE) {
 			int enable = (ifp->if_capenable ^ mask) & IFCAP_TOE;
 
 			rc = toe_capability(pi, enable);
 			if (rc != 0)
 				goto fail;
 
 			/*
 			 * Toggle the TOE bits only.  Every other bit in mask
 			 * is flipped by its own handler in this case; flipping
 			 * the whole mask here would undo the changes made
 			 * above and cancel out the ones made below.
 			 */
 			ifp->if_capenable ^= (mask & IFCAP_TOE);
 		}
 #endif
 		if (mask & IFCAP_VLAN_HWTAGGING) {
 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
 				rc = update_mac_settings(ifp, XGMAC_VLANEX);
 		}
 		if (mask & IFCAP_VLAN_MTU) {
 			ifp->if_capenable ^= IFCAP_VLAN_MTU;
 
 			/* Need to find out how to disable auto-mtu-inflation */
 		}
 		if (mask & IFCAP_VLAN_HWTSO)
 			ifp->if_capenable ^= IFCAP_VLAN_HWTSO;
 		if (mask & IFCAP_VLAN_HWCSUM)
 			ifp->if_capenable ^= IFCAP_VLAN_HWCSUM;
 
 #ifdef VLAN_CAPABILITIES
 		VLAN_CAPABILITIES(ifp);
 #endif
 fail:
 		end_synchronized_op(sc, 0);
 		break;
 
 	case SIOCSIFMEDIA:
 	case SIOCGIFMEDIA:
 		/* Propagate the result so that a failed request is reported. */
 		rc = ifmedia_ioctl(ifp, ifr, &pi->media, cmd);
 		break;
 
 	default:
 		rc = ether_ioctl(ifp, cmd, data);
 	}
 
 	return (rc);
 }
 
 /*
  * if_transmit method.
  *
  * Frees the mbuf and returns ENETDOWN if the link is not up.  Otherwise a tx
  * queue is selected (the port's first txq, or one derived from the mbuf's
  * flowid when M_FLOWID is set) and the mbuf is either enqueued on that
  * queue's buf_ring or handed to t4_eth_tx() directly.
  *
  * The return value reports what happened to the mbuf passed in.
  */
 static int
 cxgbe_transmit(struct ifnet *ifp, struct mbuf *m)
 {
 	struct port_info *pi = ifp->if_softc;
 	struct adapter *sc = pi->adapter;
 	struct sge_txq *txq = &sc->sge.txq[pi->first_txq];
 	struct buf_ring *br;
 	int rc;
 
 	M_ASSERTPKTHDR(m);
 
 	if (__predict_false(pi->link_cfg.link_ok == 0)) {
 		m_freem(m);
 		return (ENETDOWN);
 	}
 
 	/*
 	 * Packets with a flowid are spread over the queues that follow the
 	 * first rsrv_noflowq queues of the port.
 	 */
 	if (m->m_flags & M_FLOWID)
 		txq += ((m->m_pkthdr.flowid % (pi->ntxq - pi->rsrv_noflowq))
 		    + pi->rsrv_noflowq);
 	br = txq->br;
 
 	/* NOTE(review): a zero return presumably means the lock was not acquired. */
 	if (TXQ_TRYLOCK(txq) == 0) {
 		struct sge_eq *eq = &txq->eq;
 
 		/*
 		 * It is possible that t4_eth_tx finishes up and releases the
 		 * lock between the TRYLOCK above and the drbr_enqueue here.  We
 		 * need to make sure that this mbuf doesn't just sit there in
 		 * the drbr.
 		 */
 
 		rc = drbr_enqueue(ifp, br, m);
 		if (rc == 0 && callout_pending(&eq->tx_callout) == 0 &&
 		    !(eq->flags & EQ_DOOMED))
 			callout_reset(&eq->tx_callout, 1, t4_tx_callout, eq);
 		return (rc);
 	}
 
 	/*
 	 * txq->m is the mbuf that is held up due to a temporary shortage of
 	 * resources and it should be put on the wire first.  Then what's in
 	 * drbr and finally the mbuf that was just passed in to us.
 	 *
 	 * Return code should indicate the fate of the mbuf that was passed in
 	 * this time.
 	 */
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
 	if (drbr_needs_enqueue(ifp, br) || txq->m) {
 
 		/* Queued for transmission. */
 
 		rc = drbr_enqueue(ifp, br, m);
 		m = txq->m ? txq->m : drbr_dequeue(ifp, br);
 		/* rc is the enqueue result; the t4_eth_tx result is ignored. */
 		(void) t4_eth_tx(ifp, txq, m);
 		TXQ_UNLOCK(txq);
 		return (rc);
 	}
 
 	/* Direct transmission. */
 	rc = t4_eth_tx(ifp, txq, m);
 	if (rc != 0 && txq->m)
 		rc = 0;	/* held, will be transmitted soon (hopefully) */
 
 	TXQ_UNLOCK(txq);
 	return (rc);
 }
 
 /*
  * if_qflush method.  For every tx queue of the port, frees the held-up mbuf
  * (txq->m) and everything still sitting in the queue's buf_ring, then calls
  * if_qflush() on the ifnet.
  */
 static void
 cxgbe_qflush(struct ifnet *ifp)
 {
 	struct port_info *port = ifp->if_softc;
 	struct sge_txq *q;
 	struct mbuf *pkt;
 	int idx;
 
 	/* queues do not exist if !PORT_INIT_DONE. */
 	if ((port->flags & PORT_INIT_DONE) != 0) {
 		for_each_txq(port, idx, q) {
 			TXQ_LOCK(q);
 
 			m_freem(q->m);
 			q->m = NULL;
 
 			/* Drain the buf_ring one mbuf at a time. */
 			for (;;) {
 				pkt = buf_ring_dequeue_sc(q->br);
 				if (pkt == NULL)
 					break;
 				m_freem(pkt);
 			}
 
 			TXQ_UNLOCK(q);
 		}
 	}
 
 	if_qflush(ifp);
 }
 
 /*
  * ifmedia change callback.  Changing the media is not implemented: a message
  * is logged against the port device and EOPNOTSUPP is returned.
  */
 static int
 cxgbe_media_change(struct ifnet *ifp)
 {
 	struct port_info *port;
 
 	port = ifp->if_softc;
 	device_printf(port->dev, "%s unimplemented.\n", __func__);
 
 	return (EOPNOTSUPP);
 }
 
 /*
  * ifmedia status callback, shared by the port's regular ifnet and (with
  * DEV_NETMAP) its netmap ifnet.  Fills in ifmr->ifm_status, and, when the
  * link is up and the current media is autoselect, ifmr->ifm_active based on
  * the link speed.
  */
 static void
 cxgbe_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
 {
 	struct port_info *pi = ifp->if_softc;
 	struct ifmedia *media = NULL;
 	struct ifmedia_entry *cur;
 	int speed = pi->link_cfg.speed;
 	int data = (pi->port_type << 8) | pi->mod_type;
 
 	/* Pick the media list that belongs to the ifnet we were called for. */
 	if (ifp == pi->ifp)
 		media = &pi->media;
 #ifdef DEV_NETMAP
 	else if (ifp == pi->nm_ifp)
 		media = &pi->nm_media;
 #endif
 	MPASS(media != NULL);
 
 	/*
 	 * Rebuild the media list if the current entry's ifm_data no longer
 	 * matches the port type / module type combination computed above.
 	 */
 	cur = media->ifm_cur;
 	if (cur->ifm_data != data) {
 		build_medialist(pi, media);
 		cur = media->ifm_cur;
 	}
 
 	ifmr->ifm_status = IFM_AVALID;
 	if (!pi->link_cfg.link_ok)
 		return;
 
 	ifmr->ifm_status |= IFM_ACTIVE;
 
 	/* active and current will differ iff current media is autoselect. */
 	if (IFM_SUBTYPE(cur->ifm_media) != IFM_AUTO)
 		return;
 
 	ifmr->ifm_active = IFM_ETHER | IFM_FDX;
 	if (speed == SPEED_10000)
 		ifmr->ifm_active |= IFM_10G_T;
 	else if (speed == SPEED_1000)
 		ifmr->ifm_active |= IFM_1000_T;
 	else if (speed == SPEED_100)
 		ifmr->ifm_active |= IFM_100_TX;
 	else if (speed == SPEED_10)
 		ifmr->ifm_active |= IFM_10_T;
 	else
 		KASSERT(0, ("%s: link up but speed unknown (%u)", __func__,
 			    speed));
 }
 
 /*
  * Called on a fatal adapter error: updates the F_GLOBALENABLE field of
  * A_SGE_CONTROL, disables interrupts, and logs an emergency message naming
  * the adapter.
  */
 void
 t4_fatal_err(struct adapter *sc)
 {
 	const char *unit;
 
 	t4_set_reg_field(sc, A_SGE_CONTROL, F_GLOBALENABLE, 0);
 	t4_intr_disable(sc);
 
 	unit = device_get_nameunit(sc->dev);
 	log(LOG_EMERG, "%s: encountered fatal error, adapter stopped.\n",
 	    unit);
 }
 
 static int
 map_bars_0_and_4(struct adapter *sc)
 {
 	sc->regs_rid = PCIR_BAR(0);
 	sc->regs_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
 	    &sc->regs_rid, RF_ACTIVE);
 	if (sc->regs_res == NULL) {
 		device_printf(sc->dev, "cannot map registers.\n");
 		return (ENXIO);
 	}
 	sc->bt = rman_get_bustag(sc->regs_res);
 	sc->bh = rman_get_bushandle(sc->regs_res);
 	sc->mmio_len = rman_get_size(sc->regs_res);
 	setbit(&sc->doorbells, DOORBELL_KDB);
 
 	sc->msix_rid = PCIR_BAR(4);
 	sc->msix_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
 	    &sc->msix_rid, RF_ACTIVE);
 	if (sc->msix_res == NULL) {
 		device_printf(sc->dev, "cannot map MSI-X BAR.\n");
 		return (ENXIO);
 	}
 
 	return (0);
 }
 
 /*
  * Maps BAR2, the userspace doorbell BAR, and updates sc->doorbells with the
  * doorbell types that are usable.  Nothing is mapped on a T4 that has no
  * RDMA capabilities.
  *
  * Returns 0 on success (including when the mapping is skipped) or ENXIO if
  * the BAR cannot be mapped.
  */
 static int
 map_bar_2(struct adapter *sc)
 {
 
 	/*
 	 * T4: only iWARP driver uses the userspace doorbells.  There is no need
 	 * to map it if RDMA is disabled.
 	 */
 	if (is_t4(sc) && sc->rdmacaps == 0)
 		return (0);
 
 	sc->udbs_rid = PCIR_BAR(2);
 	sc->udbs_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
 	    &sc->udbs_rid, RF_ACTIVE);
 	if (sc->udbs_res == NULL) {
 		device_printf(sc->dev, "cannot map doorbell BAR.\n");
 		return (ENXIO);
 	}
 	sc->udbs_base = rman_get_virtual(sc->udbs_res);
 
 	if (is_t5(sc)) {
 		setbit(&sc->doorbells, DOORBELL_UDB);
 #if defined(__i386__) || defined(__amd64__)
 		if (t5_write_combine) {
 			int rc;
 
 			/*
 			 * Enable write combining on BAR2.  This is the
 			 * userspace doorbell BAR and is split into 128B
 			 * (UDBS_SEG_SIZE) doorbell regions, each associated
 			 * with an egress queue.  The first 64B has the doorbell
 			 * and the second 64B can be used to submit a tx work
 			 * request with an implicit doorbell.
 			 */
 
 			rc = pmap_change_attr((vm_offset_t)sc->udbs_base,
 			    rman_get_size(sc->udbs_res), PAT_WRITE_COMBINING);
 			if (rc == 0) {
 				/* WC is on: replace UDB with the WC doorbells. */
 				clrbit(&sc->doorbells, DOORBELL_UDB);
 				setbit(&sc->doorbells, DOORBELL_WCWR);
 				setbit(&sc->doorbells, DOORBELL_UDBWC);
 			} else {
 				/* Not fatal: carry on with DOORBELL_UDB. */
 				device_printf(sc->dev,
 				    "couldn't enable write combining: %d\n",
 				    rc);
 			}
 
 			/* Written whether or not write combining was enabled. */
 			t4_write_reg(sc, A_SGE_STAT_CFG,
 			    V_STATSOURCE_T5(7) | V_STATMODE(0));
 		}
 #endif
 	}
 
 	return (0);
 }
 
 /*
  * Base and aperture of each memory window, one table per chip generation.
  * setup_memwin() programs the hardware from these and memwin_info() looks
  * entries up by window number.
  */
 static const struct memwin t4_memwin[] = {
 	{ MEMWIN0_BASE, MEMWIN0_APERTURE },
 	{ MEMWIN1_BASE, MEMWIN1_APERTURE },
 	{ MEMWIN2_BASE_T4, MEMWIN2_APERTURE_T4 }
 };
 
 /* Same layout as t4_memwin; only window 2 uses T5-specific values. */
 static const struct memwin t5_memwin[] = {
 	{ MEMWIN0_BASE, MEMWIN0_APERTURE },
 	{ MEMWIN1_BASE, MEMWIN1_APERTURE },
 	{ MEMWIN2_BASE_T5, MEMWIN2_APERTURE_T5 },
 };
 
 static void
 setup_memwin(struct adapter *sc)
 {
 	const struct memwin *mw;
 	int i, n;
 	uint32_t bar0;
 
 	if (is_t4(sc)) {
 		/*
 		 * Read low 32b of bar0 indirectly via the hardware backdoor
 		 * mechanism.  Works from within PCI passthrough environments
 		 * too, where rman_get_start() can return a different value.  We
 		 * need to program the T4 memory window decoders with the actual
 		 * addresses that will be coming across the PCIe link.
 		 */
 		bar0 = t4_hw_pci_read_cfg4(sc, PCIR_BAR(0));
 		bar0 &= (uint32_t) PCIM_BAR_MEM_BASE;
 
 		mw = &t4_memwin[0];
 		n = nitems(t4_memwin);
 	} else {
 		/* T5 uses the relative offset inside the PCIe BAR */
 		bar0 = 0;
 
 		mw = &t5_memwin[0];
 		n = nitems(t5_memwin);
 	}
 
 	for (i = 0; i < n; i++, mw++) {
 		t4_write_reg(sc,
 		    PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_BASE_WIN, i),
 		    (mw->base + bar0) | V_BIR(0) |
 		    V_WINDOW(ilog2(mw->aperture) - 10));
 	}
 
 	/* flush */
 	t4_read_reg(sc, PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_BASE_WIN, 2));
 }
 
 /*
  * Returns non-zero iff the range [addr, addr + len) lies entirely within the
  * region [base, base + size).  Only subtractions that cannot underflow are
  * used, so the result is correct even when addr + len or base + size would
  * not fit in 32 bits.
  */
 static inline int
 mem_range_within(uint32_t addr, uint32_t len, uint32_t base, uint32_t size)
 {
 	uint32_t off;
 
 	if (size == 0 || addr < base)
 		return (0);
 	off = addr - base;	/* offset of addr inside the region */
 
 	return (off < size && len <= size - off);
 }
 
 /*
  * Verify that the memory range specified by the addr/len pair is valid and lies
  * entirely within a single region (EDCx or MCx).
  *
  * Returns 0 if the range is valid, EINVAL if addr or len is not a multiple of
  * 4 or len is not positive, and EFAULT if the range does not fit inside any
  * enabled memory.
  */
 static int
 validate_mem_range(struct adapter *sc, uint32_t addr, int len)
 {
 	uint32_t em, addr_len, maddr, mlen;
 
 	/*
 	 * Memory can only be accessed in naturally aligned 4 byte units.  A
 	 * negative length is rejected as well: it would pass the alignment
 	 * test and then be treated as a huge unsigned length.
 	 */
 	if (addr & 3 || len & 3 || len <= 0)
 		return (EINVAL);
 
 	/* Enabled memories */
 	em = t4_read_reg(sc, A_MA_TARGET_MEM_ENABLE);
 	if (em & F_EDRAM0_ENABLE) {
 		addr_len = t4_read_reg(sc, A_MA_EDRAM0_BAR);
 		maddr = G_EDRAM0_BASE(addr_len) << 20;
 		mlen = G_EDRAM0_SIZE(addr_len) << 20;
 		if (mem_range_within(addr, len, maddr, mlen))
 			return (0);
 	}
 	if (em & F_EDRAM1_ENABLE) {
 		addr_len = t4_read_reg(sc, A_MA_EDRAM1_BAR);
 		maddr = G_EDRAM1_BASE(addr_len) << 20;
 		mlen = G_EDRAM1_SIZE(addr_len) << 20;
 		if (mem_range_within(addr, len, maddr, mlen))
 			return (0);
 	}
 	if (em & F_EXT_MEM_ENABLE) {
 		addr_len = t4_read_reg(sc, A_MA_EXT_MEMORY_BAR);
 		maddr = G_EXT_MEM_BASE(addr_len) << 20;
 		mlen = G_EXT_MEM_SIZE(addr_len) << 20;
 		if (mem_range_within(addr, len, maddr, mlen))
 			return (0);
 	}
 	/* The second external memory is not checked on T4. */
 	if (!is_t4(sc) && em & F_EXT_MEM1_ENABLE) {
 		addr_len = t4_read_reg(sc, A_MA_EXT_MEMORY1_BAR);
 		maddr = G_EXT_MEM1_BASE(addr_len) << 20;
 		mlen = G_EXT_MEM1_SIZE(addr_len) << 20;
 		if (mem_range_within(addr, len, maddr, mlen))
 			return (0);
 	}
 
 	return (EFAULT);
 }
 
 /*
  * Translates a firmware memory type (FW_MEMTYPE_*) to the corresponding
  * MEM_* value.  Panics if the firmware memory type is not recognized.
  */
 static int
 fwmtype_to_hwmtype(int mtype)
 {
 	int hwmtype;
 
 	switch (mtype) {
 	case FW_MEMTYPE_EDC0:
 		hwmtype = MEM_EDC0;
 		break;
 	case FW_MEMTYPE_EDC1:
 		hwmtype = MEM_EDC1;
 		break;
 	case FW_MEMTYPE_EXTMEM:
 		hwmtype = MEM_MC0;
 		break;
 	case FW_MEMTYPE_EXTMEM1:
 		hwmtype = MEM_MC1;
 		break;
 	default:
 		panic("%s: cannot translate fw mtype %d.", __func__, mtype);
 	}
 
 	return (hwmtype);
 }
 
 /*
  * Verify that the memory range specified by the memtype/offset/len pair is
  * valid and lies entirely within the memtype specified.  The global address of
  * the start of the range is returned in addr.
  *
  * Returns 0 if the range is valid (and sets *addr), EINVAL if off or len is
  * not a multiple of 4, len is not positive, or the memory type is not enabled
  * or not available on this chip, and EFAULT if the range does not fit inside
  * the memory.  *addr is not modified on failure.
  */
 static int
 validate_mt_off_len(struct adapter *sc, int mtype, uint32_t off, int len,
     uint32_t *addr)
 {
 	uint32_t em, addr_len, maddr, mlen;
 
 	/*
 	 * Memory can only be accessed in naturally aligned 4 byte units.  A
 	 * negative length is rejected as well: it would pass the alignment
 	 * test and then be treated as a huge unsigned length.
 	 */
 	if (off & 3 || len & 3 || len <= 0)
 		return (EINVAL);
 
 	em = t4_read_reg(sc, A_MA_TARGET_MEM_ENABLE);
 	switch (fwmtype_to_hwmtype(mtype)) {
 	case MEM_EDC0:
 		if (!(em & F_EDRAM0_ENABLE))
 			return (EINVAL);
 		addr_len = t4_read_reg(sc, A_MA_EDRAM0_BAR);
 		maddr = G_EDRAM0_BASE(addr_len) << 20;
 		mlen = G_EDRAM0_SIZE(addr_len) << 20;
 		break;
 	case MEM_EDC1:
 		if (!(em & F_EDRAM1_ENABLE))
 			return (EINVAL);
 		addr_len = t4_read_reg(sc, A_MA_EDRAM1_BAR);
 		maddr = G_EDRAM1_BASE(addr_len) << 20;
 		mlen = G_EDRAM1_SIZE(addr_len) << 20;
 		break;
 	case MEM_MC:
 		if (!(em & F_EXT_MEM_ENABLE))
 			return (EINVAL);
 		addr_len = t4_read_reg(sc, A_MA_EXT_MEMORY_BAR);
 		maddr = G_EXT_MEM_BASE(addr_len) << 20;
 		mlen = G_EXT_MEM_SIZE(addr_len) << 20;
 		break;
 	case MEM_MC1:
 		if (is_t4(sc) || !(em & F_EXT_MEM1_ENABLE))
 			return (EINVAL);
 		addr_len = t4_read_reg(sc, A_MA_EXT_MEMORY1_BAR);
 		maddr = G_EXT_MEM1_BASE(addr_len) << 20;
 		mlen = G_EXT_MEM1_SIZE(addr_len) << 20;
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	/*
 	 * off < mlen guarantees that mlen - off cannot underflow, and comparing
 	 * len against it avoids the 32-bit wraparound that off + len could
 	 * suffer.
 	 */
 	if (mlen > 0 && off < mlen && (uint32_t)len <= mlen - off) {
 		*addr = maddr + off;	/* global address */
 		return (0);
 	}
 
 	return (EFAULT);
 }
 
 /*
  * Looks up memory window 'win' in the chip's memwin table and stores its
  * base and aperture through the 'base' and 'aperture' pointers.  Either
  * pointer may be NULL if the caller does not need that value.
  */
 static void
 memwin_info(struct adapter *sc, int win, uint32_t *base, uint32_t *aperture)
 {
 	const struct memwin *tbl;
 	int nwin;
 
 	if (is_t4(sc)) {
 		tbl = t4_memwin;
 		nwin = nitems(t4_memwin);
 	} else {
 		tbl = t5_memwin;
 		nwin = nitems(t5_memwin);
 	}
 	KASSERT(win >= 0 && win < nwin,
 	    ("%s: incorrect memwin# (%d)", __func__, win));
 
 	if (base != NULL)
 		*base = tbl[win].base;
 	if (aperture != NULL)
 		*aperture = tbl[win].aperture;
 }
 
 /*
  * Moves memory window 'n' so that the given address in the chip's address
  * space is reachable through it.  The window start is addr rounded down to
  * the alignment the chip requires (16B on T4, 128B otherwise).
  *
  * Returns the offset of addr from the start of the window.
  */
 static uint32_t
 position_memwin(struct adapter *sc, int n, uint32_t addr)
 {
 	uint32_t align_mask, pfbits, win_start, off_reg;
 
 	KASSERT(n >= 0 && n <= 3,
 	    ("%s: invalid window %d.", __func__, n));
 	KASSERT((addr & 3) == 0,
 	    ("%s: addr (0x%x) is not at a 4B boundary.", __func__, addr));
 
 	if (is_t4(sc)) {
 		align_mask = 0xf;	/* start must be 16B aligned */
 		pfbits = 0;
 	} else {
 		align_mask = 0x7f;	/* start must be 128B aligned */
 		pfbits = V_PFNUM(sc->pf);
 	}
 	win_start = addr & ~align_mask;
 
 	off_reg = PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_OFFSET, n);
 	t4_write_reg(sc, off_reg, win_start | pfbits);
 	t4_read_reg(sc, off_reg);	/* read back after the write */
 
 	return (addr - win_start);
 }
 
 /*
  * Chooses an interrupt type and the number of interrupt vectors and rx/tx
  * queues to use, and allocates the vectors.
  *
  * Interrupt types are tried starting at INTR_MSIX and shifting right, skipping
  * the ones not allowed by t4_intr_types.  For each type the queue counts
  * requested via the t4_* tunables are scaled down, if necessary, through a
  * series of progressively less desirable layouts until the vector count fits
  * what is available.  n10g and n1g are used as per-port multipliers for the
  * 10G and 1G queue counts (NOTE(review): presumably the number of 10G and 1G
  * ports -- confirm against the caller).
  *
  * The result is stored in *iaq.  Returns 0 on success or ENXIO if no usable
  * interrupt type could be found.
  */
 static int
 cfg_itype_and_nqueues(struct adapter *sc, int n10g, int n1g,
     struct intrs_and_queues *iaq)
 {
 	int rc, itype, navail, nrxq10g, nrxq1g, n;
 	int nofldrxq10g = 0, nofldrxq1g = 0;
 	int nnmrxq10g = 0, nnmrxq1g = 0;
 
 	bzero(iaq, sizeof(*iaq));
 
 	iaq->ntxq10g = t4_ntxq10g;
 	iaq->ntxq1g = t4_ntxq1g;
 	iaq->nrxq10g = nrxq10g = t4_nrxq10g;
 	iaq->nrxq1g = nrxq1g = t4_nrxq1g;
 	iaq->rsrv_noflowq = t4_rsrv_noflowq;
 #ifdef TCP_OFFLOAD
 	if (is_offload(sc)) {
 		iaq->nofldtxq10g = t4_nofldtxq10g;
 		iaq->nofldtxq1g = t4_nofldtxq1g;
 		iaq->nofldrxq10g = nofldrxq10g = t4_nofldrxq10g;
 		iaq->nofldrxq1g = nofldrxq1g = t4_nofldrxq1g;
 	}
 #endif
 #ifdef DEV_NETMAP
 	iaq->nnmtxq10g = t4_nnmtxq10g;
 	iaq->nnmtxq1g = t4_nnmtxq1g;
 	iaq->nnmrxq10g = nnmrxq10g = t4_nnmrxq10g;
 	iaq->nnmrxq1g = nnmrxq1g = t4_nnmrxq1g;
 #endif
 
 	for (itype = INTR_MSIX; itype; itype >>= 1) {
 
 		if ((itype & t4_intr_types) == 0)
 			continue;	/* not allowed */
 
 		if (itype == INTR_MSIX)
 			navail = pci_msix_count(sc->dev);
 		else if (itype == INTR_MSI)
 			navail = pci_msi_count(sc->dev);
 		else
 			navail = 1;
 restart:
 		if (navail == 0)
 			continue;
 
 		iaq->intr_type = itype;
 		iaq->intr_flags_10g = 0;
 		iaq->intr_flags_1g = 0;
 
 		/*
 		 * Best option: an interrupt vector for errors, one for the
 		 * firmware event queue, and one for every rxq (NIC, TOE, and
 		 * netmap).
 		 */
 		iaq->nirq = T4_EXTRA_INTR;
 		iaq->nirq += n10g * (nrxq10g + nofldrxq10g + nnmrxq10g);
 		iaq->nirq += n1g * (nrxq1g + nofldrxq1g + nnmrxq1g);
 		if (iaq->nirq <= navail &&
 		    (itype != INTR_MSI || powerof2(iaq->nirq))) {
 			iaq->intr_flags_10g = INTR_ALL;
 			iaq->intr_flags_1g = INTR_ALL;
 			goto allocate;
 		}
 
 		/*
 		 * Second best option: a vector for errors, one for the firmware
 		 * event queue, and vectors for either all the NIC rx queues or
 		 * all the TOE rx queues.  The queues that don't get vectors
 		 * will forward their interrupts to those that do.
 		 *
 		 * Note: netmap rx queues cannot be created early and so they
 		 * can't be setup to receive forwarded interrupts for others.
 		 */
 		iaq->nirq = T4_EXTRA_INTR;
 		if (nrxq10g >= nofldrxq10g) {
 			iaq->intr_flags_10g = INTR_RXQ;
 			iaq->nirq += n10g * nrxq10g;
 #ifdef DEV_NETMAP
 			iaq->nnmrxq10g = min(nnmrxq10g, nrxq10g);
 #endif
 		} else {
 			iaq->intr_flags_10g = INTR_OFLD_RXQ;
 			iaq->nirq += n10g * nofldrxq10g;
 #ifdef DEV_NETMAP
 			iaq->nnmrxq10g = min(nnmrxq10g, nofldrxq10g);
 #endif
 		}
 		if (nrxq1g >= nofldrxq1g) {
 			iaq->intr_flags_1g = INTR_RXQ;
 			iaq->nirq += n1g * nrxq1g;
 #ifdef DEV_NETMAP
 			iaq->nnmrxq1g = min(nnmrxq1g, nrxq1g);
 #endif
 		} else {
 			iaq->intr_flags_1g = INTR_OFLD_RXQ;
 			iaq->nirq += n1g * nofldrxq1g;
 #ifdef DEV_NETMAP
 			iaq->nnmrxq1g = min(nnmrxq1g, nofldrxq1g);
 #endif
 		}
 		if (iaq->nirq <= navail &&
 		    (itype != INTR_MSI || powerof2(iaq->nirq)))
 			goto allocate;
 
 		/*
 		 * Next best option: an interrupt vector for errors, one for the
 		 * firmware event queue, and at least one per port.  At this
 		 * point we know we'll have to downsize nrxq and/or nofldrxq
 		 * and/or nnmrxq to fit what's available to us.
 		 */
 		iaq->nirq = T4_EXTRA_INTR;
 		iaq->nirq += n10g + n1g;
 		if (iaq->nirq <= navail) {
 			int leftover = navail - iaq->nirq;
 
 			if (n10g > 0) {
 				int target = max(nrxq10g, nofldrxq10g);
 
 				iaq->intr_flags_10g = nrxq10g >= nofldrxq10g ?
 				    INTR_RXQ : INTR_OFLD_RXQ;
 
 				n = 1;
 				while (n < target && leftover >= n10g) {
 					leftover -= n10g;
 					iaq->nirq += n10g;
 					n++;
 				}
 				iaq->nrxq10g = min(n, nrxq10g);
 #ifdef TCP_OFFLOAD
 				iaq->nofldrxq10g = min(n, nofldrxq10g);
 #endif
 #ifdef DEV_NETMAP
 				iaq->nnmrxq10g = min(n, nnmrxq10g);
 #endif
 			}
 
 			if (n1g > 0) {
 				int target = max(nrxq1g, nofldrxq1g);
 
 				iaq->intr_flags_1g = nrxq1g >= nofldrxq1g ?
 				    INTR_RXQ : INTR_OFLD_RXQ;
 
 				n = 1;
 				while (n < target && leftover >= n1g) {
 					leftover -= n1g;
 					iaq->nirq += n1g;
 					n++;
 				}
 				iaq->nrxq1g = min(n, nrxq1g);
 #ifdef TCP_OFFLOAD
 				iaq->nofldrxq1g = min(n, nofldrxq1g);
 #endif
 #ifdef DEV_NETMAP
 				iaq->nnmrxq1g = min(n, nnmrxq1g);
 #endif
 			}
 
 			if (itype != INTR_MSI || powerof2(iaq->nirq))
 				goto allocate;
 		}
 
 		/*
 		 * Least desirable option: one interrupt vector for everything.
 		 */
 		iaq->nirq = iaq->nrxq10g = iaq->nrxq1g = 1;
 		iaq->intr_flags_10g = iaq->intr_flags_1g = 0;
 #ifdef TCP_OFFLOAD
 		if (is_offload(sc))
 			iaq->nofldrxq10g = iaq->nofldrxq1g = 1;
 #endif
 #ifdef DEV_NETMAP
 		iaq->nnmrxq10g = iaq->nnmrxq1g = 1;
 #endif
 
 allocate:
 		navail = iaq->nirq;
 		rc = 0;
 		if (itype == INTR_MSIX)
 			rc = pci_alloc_msix(sc->dev, &navail);
 		else if (itype == INTR_MSI)
 			rc = pci_alloc_msi(sc->dev, &navail);
 
 		if (rc == 0) {
 			if (navail == iaq->nirq)
 				return (0);
 
 			/*
 			 * Didn't get the number requested.  Use whatever number
 			 * the kernel is willing to allocate (it's in navail).
 			 */
 			device_printf(sc->dev, "fewer vectors than requested, "
 			    "type=%d, req=%d, rcvd=%d; will downshift req.\n",
 			    itype, iaq->nirq, navail);
 			pci_release_msi(sc->dev);
 			goto restart;
 		}
 
 		/* The error code goes with "vectors:", the type with "type=". */
 		device_printf(sc->dev,
 		    "failed to allocate vectors:%d, type=%d, req=%d, rcvd=%d\n",
 		    rc, itype, iaq->nirq, navail);
 	}
 
 	device_printf(sc->dev,
 	    "failed to find a usable interrupt type.  "
 	    "allowed=%d, msi-x=%d, msi=%d, intx=1\n", t4_intr_types,
 	    pci_msix_count(sc->dev), pci_msi_count(sc->dev));
 
 	return (ENXIO);
 }
 
 /*
  * FW_VERSION(chip) packs the <chip>FW_VERSION_{MAJOR,MINOR,MICRO,BUILD}
  * constants into a single firmware version word; 'chip' is a token prefix
  * such as T4 or T5.  FW_INTFVER(chip, intf) expands to the
  * <chip>FW_HDR_INTFVER_<intf> constant.
  */
 #define FW_VERSION(chip) ( \
     V_FW_HDR_FW_VER_MAJOR(chip##FW_VERSION_MAJOR) | \
     V_FW_HDR_FW_VER_MINOR(chip##FW_VERSION_MINOR) | \
     V_FW_HDR_FW_VER_MICRO(chip##FW_VERSION_MICRO) | \
     V_FW_HDR_FW_VER_BUILD(chip##FW_VERSION_BUILD))
 #define FW_INTFVER(chip, intf) (chip##FW_HDR_INTFVER_##intf)
 
 /*
  * Per-chip firmware information: the names passed to firmware_get() for the
  * default config file (kld_name) and the firmware image (fw_mod_name), and
  * the firmware header the driver was compiled against.  Entries are looked
  * up by chip with find_fw_info().
  */
 struct fw_info {
 	uint8_t chip;		/* CHELSIO_T4, CHELSIO_T5 */
 	char *kld_name;		/* also the default config's module name */
 	char *fw_mod_name;	/* module that holds the firmware image */
 	struct fw_hdr fw_hdr;	/* XXX: waste of space, need a sparse struct */
 } fw_info[] = {
 	{
 		.chip = CHELSIO_T4,
 		.kld_name = "t4fw_cfg",
 		.fw_mod_name = "t4fw",
 		.fw_hdr = {
 			.chip = FW_HDR_CHIP_T4,
 			.fw_ver = htobe32_const(FW_VERSION(T4)),
 			.intfver_nic = FW_INTFVER(T4, NIC),
 			.intfver_vnic = FW_INTFVER(T4, VNIC),
 			.intfver_ofld = FW_INTFVER(T4, OFLD),
 			.intfver_ri = FW_INTFVER(T4, RI),
 			.intfver_iscsipdu = FW_INTFVER(T4, ISCSIPDU),
 			.intfver_iscsi = FW_INTFVER(T4, ISCSI),
 			.intfver_fcoepdu = FW_INTFVER(T4, FCOEPDU),
 			.intfver_fcoe = FW_INTFVER(T4, FCOE),
 		},
 	}, {
 		.chip = CHELSIO_T5,
 		.kld_name = "t5fw_cfg",
 		.fw_mod_name = "t5fw",
 		.fw_hdr = {
 			.chip = FW_HDR_CHIP_T5,
 			.fw_ver = htobe32_const(FW_VERSION(T5)),
 			.intfver_nic = FW_INTFVER(T5, NIC),
 			.intfver_vnic = FW_INTFVER(T5, VNIC),
 			.intfver_ofld = FW_INTFVER(T5, OFLD),
 			.intfver_ri = FW_INTFVER(T5, RI),
 			.intfver_iscsipdu = FW_INTFVER(T5, ISCSIPDU),
 			.intfver_iscsi = FW_INTFVER(T5, ISCSI),
 			.intfver_fcoepdu = FW_INTFVER(T5, FCOEPDU),
 			.intfver_fcoe = FW_INTFVER(T5, FCOE),
 		},
 	}
 };
 
 static struct fw_info *
 find_fw_info(int chip)
 {
 	int i;
 
 	for (i = 0; i < nitems(fw_info); i++) {
 		if (fw_info[i].chip == chip)
 			return (&fw_info[i]);
 	}
 	return (NULL);
 }
 
 /*
  * Compares two firmware headers and reports whether they are API compatible.
  * They are if they are for the same chip and either have the exact same
  * firmware version or agree on every interface version.
  *
  * Returns 1 if compatible, 0 otherwise.
  */
 static int
 fw_compatible(const struct fw_hdr *hdr1, const struct fw_hdr *hdr2)
 {
 
 	/* Nothing is compatible across chips. */
 	if (hdr1->chip != hdr2->chip)
 		return (0);
 
 	/* short circuit if it's the exact same firmware version */
 	if (hdr1->fw_ver == hdr2->fw_ver)
 		return (1);
 
 	/*
 	 * XXX: Is this too conservative?  Perhaps I should limit this to the
 	 * features that are supported in the driver.
 	 */
 #define SAME_INTF(x) (hdr1->intfver_##x == hdr2->intfver_##x)
 	return (SAME_INTF(nic) && SAME_INTF(vnic) && SAME_INTF(ofld) &&
 	    SAME_INTF(ri) && SAME_INTF(iscsipdu) && SAME_INTF(iscsi) &&
 	    SAME_INTF(fcoepdu) && SAME_INTF(fcoe));
 #undef SAME_INTF
 }
 
 /*
  * The firmware in the KLD is usable, but should it be installed?  This routine
  * explains itself in detail if it indicates the KLD firmware should be
  * installed.
  */
 static int
 should_install_kld_fw(struct adapter *sc, int card_fw_usable, int k, int c)
 {
 	const char *reason;
 
 	if (!card_fw_usable) {
 		reason = "incompatible or unusable";
 		goto install;
 	}
 
 	if (k > c) {
 		reason = "older than the version bundled with this driver";
 		goto install;
 	}
 
 	if (t4_fw_install == 2 && k != c) {
 		reason = "different than the version bundled with this driver";
 		goto install;
 	}
 
 	return (0);
 
 install:
 	if (t4_fw_install == 0) {
 		device_printf(sc->dev, "firmware on card (%u.%u.%u.%u) is %s, "
 		    "but the driver is prohibited from installing a different "
 		    "firmware on the card.\n",
 		    G_FW_HDR_FW_VER_MAJOR(c), G_FW_HDR_FW_VER_MINOR(c),
 		    G_FW_HDR_FW_VER_MICRO(c), G_FW_HDR_FW_VER_BUILD(c), reason);
 
 		return (0);
 	}
 
 	device_printf(sc->dev, "firmware on card (%u.%u.%u.%u) is %s, "
 	    "installing firmware %u.%u.%u.%u on card.\n",
 	    G_FW_HDR_FW_VER_MAJOR(c), G_FW_HDR_FW_VER_MINOR(c),
 	    G_FW_HDR_FW_VER_MICRO(c), G_FW_HDR_FW_VER_BUILD(c), reason,
 	    G_FW_HDR_FW_VER_MAJOR(k), G_FW_HDR_FW_VER_MINOR(k),
 	    G_FW_HDR_FW_VER_MICRO(k), G_FW_HDR_FW_VER_BUILD(k));
 
 	return (1);
 }
 /*
  * Establish contact with the firmware and determine if we are the master driver
  * or not, and whether we are responsible for chip initialization.
  */
 static int
 prep_firmware(struct adapter *sc)
 {
 	const struct firmware *fw = NULL, *default_cfg;
 	int rc, pf, card_fw_usable, kld_fw_usable, need_fw_reset = 1;
 	enum dev_state state;
 	struct fw_info *fw_info;
 	struct fw_hdr *card_fw;		/* fw on the card */
 	const struct fw_hdr *kld_fw;	/* fw in the KLD */
 	const struct fw_hdr *drv_fw;	/* fw header the driver was compiled
 					   against */
 
 	/* Contact firmware. */
 	rc = t4_fw_hello(sc, sc->mbox, sc->mbox, MASTER_MAY, &state);
 	if (rc < 0 || state == DEV_STATE_ERR) {
 		rc = -rc;
 		device_printf(sc->dev,
 		    "failed to connect to the firmware: %d, %d.\n", rc, state);
 		return (rc);
 	}
 	pf = rc;
 	if (pf == sc->mbox)
 		sc->flags |= MASTER_PF;
 	else if (state == DEV_STATE_UNINIT) {
 		/*
 		 * We didn't get to be the master so we definitely won't be
 		 * configuring the chip.  It's a bug if someone else hasn't
 		 * configured it already.
 		 */
 		device_printf(sc->dev, "couldn't be master(%d), "
 		    "device not already initialized either(%d).\n", rc, state);
 		return (EDOOFUS);
 	}
 
 	/* This is the firmware whose headers the driver was compiled against */
 	fw_info = find_fw_info(chip_id(sc));
 	if (fw_info == NULL) {
 		device_printf(sc->dev,
 		    "unable to look up firmware information for chip %d.\n",
 		    chip_id(sc));
 		return (EINVAL);
 	}
 	drv_fw = &fw_info->fw_hdr;
 
 	/*
 	 * The firmware KLD contains many modules.  The KLD name is also the
 	 * name of the module that contains the default config file.
 	 */
 	default_cfg = firmware_get(fw_info->kld_name);
 
 	/* Read the header of the firmware on the card */
 	card_fw = malloc(sizeof(*card_fw), M_CXGBE, M_ZERO | M_WAITOK);
 	rc = -t4_read_flash(sc, FLASH_FW_START,
 	    sizeof (*card_fw) / sizeof (uint32_t), (uint32_t *)card_fw, 1);
 	if (rc == 0)
 		card_fw_usable = fw_compatible(drv_fw, (const void*)card_fw);
 	else {
 		device_printf(sc->dev,
 		    "Unable to read card's firmware header: %d\n", rc);
 		card_fw_usable = 0;
 	}
 
 	/* This is the firmware in the KLD */
 	fw = firmware_get(fw_info->fw_mod_name);
 	if (fw != NULL) {
 		kld_fw = (const void *)fw->data;
 		kld_fw_usable = fw_compatible(drv_fw, kld_fw);
 	} else {
 		kld_fw = NULL;
 		kld_fw_usable = 0;
 	}
 
 	if (card_fw_usable && card_fw->fw_ver == drv_fw->fw_ver &&
 	    (!kld_fw_usable || kld_fw->fw_ver == drv_fw->fw_ver)) {
 		/*
 		 * Common case: the firmware on the card is an exact match and
 		 * the KLD is an exact match too, or the KLD is
 		 * absent/incompatible.  Note that t4_fw_install = 2 is ignored
 		 * here -- use cxgbetool loadfw if you want to reinstall the
 		 * same firmware as the one on the card.
 		 */
 	} else if (kld_fw_usable && state == DEV_STATE_UNINIT &&
 	    should_install_kld_fw(sc, card_fw_usable, be32toh(kld_fw->fw_ver),
 	    be32toh(card_fw->fw_ver))) {
 
 		rc = -t4_fw_upgrade(sc, sc->mbox, fw->data, fw->datasize, 0);
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to install firmware: %d\n", rc);
 			goto done;
 		}
 
 		/* Installed successfully, update the cached header too. */
 		memcpy(card_fw, kld_fw, sizeof(*card_fw));
 		card_fw_usable = 1;
 		need_fw_reset = 0;	/* already reset as part of load_fw */
 	}
 
 	if (!card_fw_usable) {
 		uint32_t d, c, k;
 
 		d = ntohl(drv_fw->fw_ver);
 		c = ntohl(card_fw->fw_ver);
 		k = kld_fw ? ntohl(kld_fw->fw_ver) : 0;
 
 		device_printf(sc->dev, "Cannot find a usable firmware: "
 		    "fw_install %d, chip state %d, "
 		    "driver compiled with %d.%d.%d.%d, "
 		    "card has %d.%d.%d.%d, KLD has %d.%d.%d.%d\n",
 		    t4_fw_install, state,
 		    G_FW_HDR_FW_VER_MAJOR(d), G_FW_HDR_FW_VER_MINOR(d),
 		    G_FW_HDR_FW_VER_MICRO(d), G_FW_HDR_FW_VER_BUILD(d),
 		    G_FW_HDR_FW_VER_MAJOR(c), G_FW_HDR_FW_VER_MINOR(c),
 		    G_FW_HDR_FW_VER_MICRO(c), G_FW_HDR_FW_VER_BUILD(c),
 		    G_FW_HDR_FW_VER_MAJOR(k), G_FW_HDR_FW_VER_MINOR(k),
 		    G_FW_HDR_FW_VER_MICRO(k), G_FW_HDR_FW_VER_BUILD(k));
 		rc = EINVAL;
 		goto done;
 	}
 
 	/* We're using whatever's on the card and it's known to be good. */
 	sc->params.fw_vers = ntohl(card_fw->fw_ver);
 	snprintf(sc->fw_version, sizeof(sc->fw_version), "%u.%u.%u.%u",
 	    G_FW_HDR_FW_VER_MAJOR(sc->params.fw_vers),
 	    G_FW_HDR_FW_VER_MINOR(sc->params.fw_vers),
 	    G_FW_HDR_FW_VER_MICRO(sc->params.fw_vers),
 	    G_FW_HDR_FW_VER_BUILD(sc->params.fw_vers));
 	t4_get_tp_version(sc, &sc->params.tp_vers);
 
 	/* Reset device */
 	if (need_fw_reset &&
 	    (rc = -t4_fw_reset(sc, sc->mbox, F_PIORSTMODE | F_PIORST)) != 0) {
 		device_printf(sc->dev, "firmware reset failed: %d.\n", rc);
 		if (rc != ETIMEDOUT && rc != EIO)
 			t4_fw_bye(sc, sc->mbox);
 		goto done;
 	}
 	sc->flags |= FW_OK;
 
 	rc = get_params__pre_init(sc);
 	if (rc != 0)
 		goto done; /* error message displayed already */
 
 	/* Partition adapter resources as specified in the config file. */
 	if (state == DEV_STATE_UNINIT) {
 
 		KASSERT(sc->flags & MASTER_PF,
 		    ("%s: trying to change chip settings when not master.",
 		    __func__));
 
 		rc = partition_resources(sc, default_cfg, fw_info->kld_name);
 		if (rc != 0)
 			goto done;	/* error message displayed already */
 
 		t4_tweak_chip_settings(sc);
 
 		/* get basic stuff going */
 		rc = -t4_fw_initialize(sc, sc->mbox);
 		if (rc != 0) {
 			device_printf(sc->dev, "fw init failed: %d.\n", rc);
 			goto done;
 		}
 	} else {
 		snprintf(sc->cfg_file, sizeof(sc->cfg_file), "pf%d", pf);
 		sc->cfcsum = 0;
 	}
 
 done:
 	free(card_fw, M_CXGBE);
 	if (fw != NULL)
 		firmware_put(fw, FIRMWARE_UNLOAD);
 	if (default_cfg != NULL)
 		firmware_put(default_cfg, FIRMWARE_UNLOAD);
 
 	return (rc);
 }
 
 /*
  * Shorthands that build a firmware parameter identifier from the DEV or PFVF
  * mnemonic and the token-pasted FW_PARAMS_PARAM_{DEV,PFVF}_<param> constant.
  */
 #define FW_PARAM_DEV(param) \
 	(V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) | \
 	 V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_##param))
 #define FW_PARAM_PFVF(param) \
 	(V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_PFVF) | \
 	 V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_PFVF_##param))
 
 /*
  * Partition chip resources for use between various PFs, VFs, etc.
  *
  * Selects a configuration profile (sc->cfg_file), uploads the matching config
  * file to the location the firmware asks for (or points the firmware at the
  * copy on the card's flash), has the firmware pre-process it, and then writes
  * back the capabilities after masking them with the t4_*_allowed tunables.
  *
  * default_cfg is the module holding the default config file (may be NULL) and
  * name_prefix is the prefix used to build the name of a profile specific
  * module ("<name_prefix>_<profile>").
  *
  * Returns 0 on success and a positive errno on failure.
  */
 static int
 partition_resources(struct adapter *sc, const struct firmware *default_cfg,
     const char *name_prefix)
 {
 	const struct firmware *cfg = NULL;
 	int rc = 0;
 	struct fw_caps_config_cmd caps;
 	uint32_t mtype, moff, finicsum, cfcsum;
 
 	/*
 	 * Figure out what configuration file to use.  Pick the default config
 	 * file for the card if the user hasn't specified one explicitly.
 	 */
 	snprintf(sc->cfg_file, sizeof(sc->cfg_file), "%s", t4_cfg_file);
 	if (strncmp(t4_cfg_file, DEFAULT_CF, sizeof(t4_cfg_file)) == 0) {
 		/* Card specific overrides go here. */
 		if (pci_get_device(sc->dev) == 0x440a)
 			snprintf(sc->cfg_file, sizeof(sc->cfg_file), UWIRE_CF);
 		if (is_fpga(sc))
 			snprintf(sc->cfg_file, sizeof(sc->cfg_file), FPGA_CF);
 	}
 
 	/*
 	 * We need to load another module if the profile is anything except
 	 * "default" or "flash".
 	 */
 	if (strncmp(sc->cfg_file, DEFAULT_CF, sizeof(sc->cfg_file)) != 0 &&
 	    strncmp(sc->cfg_file, FLASH_CF, sizeof(sc->cfg_file)) != 0) {
 		char s[32];
 
 		snprintf(s, sizeof(s), "%s_%s", name_prefix, sc->cfg_file);
 		cfg = firmware_get(s);
 		if (cfg == NULL) {
 			/* Fall back to the default config, else to the flash. */
 			if (default_cfg != NULL) {
 				device_printf(sc->dev,
 				    "unable to load module \"%s\" for "
 				    "configuration profile \"%s\", will use "
 				    "the default config file instead.\n",
 				    s, sc->cfg_file);
 				snprintf(sc->cfg_file, sizeof(sc->cfg_file),
 				    "%s", DEFAULT_CF);
 			} else {
 				device_printf(sc->dev,
 				    "unable to load module \"%s\" for "
 				    "configuration profile \"%s\", will use "
 				    "the config file on the card's flash "
 				    "instead.\n", s, sc->cfg_file);
 				snprintf(sc->cfg_file, sizeof(sc->cfg_file),
 				    "%s", FLASH_CF);
 			}
 		}
 	}
 
 	/* The default profile was chosen but its module isn't available. */
 	if (strncmp(sc->cfg_file, DEFAULT_CF, sizeof(sc->cfg_file)) == 0 &&
 	    default_cfg == NULL) {
 		device_printf(sc->dev,
 		    "default config file not available, will use the config "
 		    "file on the card's flash instead.\n");
 		snprintf(sc->cfg_file, sizeof(sc->cfg_file), "%s", FLASH_CF);
 	}
 
 	if (strncmp(sc->cfg_file, FLASH_CF, sizeof(sc->cfg_file)) != 0) {
 		u_int cflen, i, n;
 		const uint32_t *cfdata;
 		uint32_t param, val, addr, off, mw_base, mw_aperture;
 
 		KASSERT(cfg != NULL || default_cfg != NULL,
 		    ("%s: no config to upload", __func__));
 
 		/*
 		 * Ask the firmware where it wants us to upload the config file.
 		 */
 		param = FW_PARAM_DEV(CF);
 		rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val);
 		if (rc != 0) {
 			/* No support for config file?  Shouldn't happen. */
 			device_printf(sc->dev,
 			    "failed to query config file location: %d.\n", rc);
 			goto done;
 		}
 		mtype = G_FW_PARAMS_PARAM_Y(val);
 		moff = G_FW_PARAMS_PARAM_Z(val) << 16;
 
 		/*
 		 * XXX: sheer laziness.  We deliberately added 4 bytes of
 		 * useless stuffing/comments at the end of the config file so
 		 * it's ok to simply throw away the last remaining bytes when
 		 * the config file is not an exact multiple of 4.  This also
 		 * helps with the validate_mt_off_len check.
 		 */
 		if (cfg != NULL) {
 			cflen = cfg->datasize & ~3;
 			cfdata = cfg->data;
 		} else {
 			cflen = default_cfg->datasize & ~3;
 			cfdata = default_cfg->data;
 		}
 
 		if (cflen > FLASH_CFG_MAX_SIZE) {
 			device_printf(sc->dev,
 			    "config file too long (%d, max allowed is %d).  "
 			    "Will try to use the config on the card, if any.\n",
 			    cflen, FLASH_CFG_MAX_SIZE);
 			goto use_config_on_flash;
 		}
 
 		rc = validate_mt_off_len(sc, mtype, moff, cflen, &addr);
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "%s: addr (%d/0x%x) or len %d is not valid: %d.  "
 			    "Will try to use the config on the card, if any.\n",
 			    __func__, mtype, moff, cflen, rc);
 			goto use_config_on_flash;
 		}
 
 		/*
 		 * Copy the config file out 4 bytes at a time through memory
 		 * window 2, repositioning the window whenever the current
 		 * aperture has been filled.
 		 */
 		memwin_info(sc, 2, &mw_base, &mw_aperture);
 		while (cflen) {
 			off = position_memwin(sc, 2, addr);
 			n = min(cflen, mw_aperture - off);
 			for (i = 0; i < n; i += 4)
 				t4_write_reg(sc, mw_base + off + i, *cfdata++);
 			cflen -= n;
 			addr += n;
 		}
 	} else {
 		/* Also the landing spot when an upload had to be abandoned. */
 use_config_on_flash:
 		mtype = FW_MEMTYPE_FLASH;
 		moff = t4_flash_cfg_addr(sc);
 	}
 
 	/* Have the firmware pre-process the config file at mtype/moff. */
 	bzero(&caps, sizeof(caps));
 	caps.op_to_write = htobe32(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) |
 	    F_FW_CMD_REQUEST | F_FW_CMD_READ);
 	caps.cfvalid_to_len16 = htobe32(F_FW_CAPS_CONFIG_CMD_CFVALID |
 	    V_FW_CAPS_CONFIG_CMD_MEMTYPE_CF(mtype) |
 	    V_FW_CAPS_CONFIG_CMD_MEMADDR64K_CF(moff >> 16) | FW_LEN16(caps));
 	rc = -t4_wr_mbox(sc, sc->mbox, &caps, sizeof(caps), &caps);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to pre-process config file: %d "
 		    "(mtype %d, moff 0x%x).\n", rc, mtype, moff);
 		goto done;
 	}
 
 	/* A checksum mismatch is only warned about; it is not an error. */
 	finicsum = be32toh(caps.finicsum);
 	cfcsum = be32toh(caps.cfcsum);
 	if (finicsum != cfcsum) {
 		device_printf(sc->dev,
 		    "WARNING: config file checksum mismatch: %08x %08x\n",
 		    finicsum, cfcsum);
 	}
 	sc->cfcsum = cfcsum;
 
 	/* Mask a capability field with the matching t4_<x>_allowed tunable. */
 #define LIMIT_CAPS(x) do { \
 	caps.x &= htobe16(t4_##x##_allowed); \
 } while (0)
 
 	/*
 	 * Let the firmware know what features will (not) be used so it can tune
 	 * things accordingly.
 	 */
 	LIMIT_CAPS(linkcaps);
 	LIMIT_CAPS(niccaps);
 	LIMIT_CAPS(toecaps);
 	LIMIT_CAPS(rdmacaps);
 	LIMIT_CAPS(iscsicaps);
 	LIMIT_CAPS(fcoecaps);
 #undef LIMIT_CAPS
 
 	/* Write the (possibly reduced) capabilities back to the firmware. */
 	caps.op_to_write = htobe32(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) |
 	    F_FW_CMD_REQUEST | F_FW_CMD_WRITE);
 	caps.cfvalid_to_len16 = htobe32(FW_LEN16(caps));
 	rc = -t4_wr_mbox(sc, sc->mbox, &caps, sizeof(caps), NULL);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to process config file: %d.\n", rc);
 	}
 done:
 	/* Only the profile module loaded here is released; not default_cfg. */
 	if (cfg != NULL)
 		firmware_put(cfg, FIRMWARE_UNLOAD);
 	return (rc);
 }
 
 /*
  * Retrieve parameters that are needed (or nice to have) very early.
  */
 static int
 get_params__pre_init(struct adapter *sc)
 {
 	int rc;
 	uint32_t param[2], val[2];
 	struct fw_devlog_cmd cmd;
 	struct devlog_params *dlog = &sc->params.devlog;
 
 	param[0] = FW_PARAM_DEV(PORTVEC);
 	param[1] = FW_PARAM_DEV(CCLK);
 	rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 2, param, val);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to query parameters (pre_init): %d.\n", rc);
 		return (rc);
 	}
 
 	sc->params.portvec = val[0];
 	sc->params.nports = bitcount32(val[0]);
 	sc->params.vpd.cclk = val[1];
 
 	/* Read device log parameters. */
 	bzero(&cmd, sizeof(cmd));
 	cmd.op_to_write = htobe32(V_FW_CMD_OP(FW_DEVLOG_CMD) |
 	    F_FW_CMD_REQUEST | F_FW_CMD_READ);
 	cmd.retval_len16 = htobe32(FW_LEN16(cmd));
 	rc = -t4_wr_mbox(sc, sc->mbox, &cmd, sizeof(cmd), &cmd);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to get devlog parameters: %d.\n", rc);
 		bzero(dlog, sizeof (*dlog));
 		rc = 0;	/* devlog isn't critical for device operation */
 	} else {
 		val[0] = be32toh(cmd.memtype_devlog_memaddr16_devlog);
 		dlog->memtype = G_FW_DEVLOG_CMD_MEMTYPE_DEVLOG(val[0]);
 		dlog->start = G_FW_DEVLOG_CMD_MEMADDR16_DEVLOG(val[0]) << 4;
 		dlog->size = be32toh(cmd.memsize_devlog);
 	}
 
 	return (rc);
 }
 
 /*
  * Retrieve various parameters that are of interest to the driver.  The device
  * has been initialized by the firmware at this point.
  *
  * Queries the queue/filter/L2T ranges, reads the capabilities granted by the
  * firmware, and then queries the ranges that go with each enabled capability
  * (ETHOFLD, TOE, RDMA, iSCSI).  Finishes by reading settings directly from
  * the chip.
  *
  * Returns 0 on success and a positive errno if a firmware query fails;
  * otherwise whatever t4_read_chip_settings returns.
  */
 static int
 get_params__post_init(struct adapter *sc)
 {
 	int rc;
 	uint32_t param[7], val[7];
 	struct fw_caps_config_cmd caps;
 
 	param[0] = FW_PARAM_PFVF(IQFLINT_START);
 	param[1] = FW_PARAM_PFVF(EQ_START);
 	param[2] = FW_PARAM_PFVF(FILTER_START);
 	param[3] = FW_PARAM_PFVF(FILTER_END);
 	param[4] = FW_PARAM_PFVF(L2T_START);
 	param[5] = FW_PARAM_PFVF(L2T_END);
 	rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 6, param, val);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to query parameters (post_init): %d.\n", rc);
 		return (rc);
 	}
 
 	/* The START/END pairs are inclusive, hence the +1 in the sizes. */
 	sc->sge.iq_start = val[0];
 	sc->sge.eq_start = val[1];
 	sc->tids.ftid_base = val[2];
 	sc->tids.nftids = val[3] - val[2] + 1;
 	sc->params.ftid_min = val[2];
 	sc->params.ftid_max = val[3];
 	sc->vres.l2t.start = val[4];
 	sc->vres.l2t.size = val[5] - val[4] + 1;
 	KASSERT(sc->vres.l2t.size <= L2T_SIZE,
 	    ("%s: L2 table size (%u) larger than expected (%u)",
 	    __func__, sc->vres.l2t.size, L2T_SIZE));
 
 	/* get capabilities */
 	bzero(&caps, sizeof(caps));
 	caps.op_to_write = htobe32(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) |
 	    F_FW_CMD_REQUEST | F_FW_CMD_READ);
 	caps.cfvalid_to_len16 = htobe32(FW_LEN16(caps));
 	rc = -t4_wr_mbox(sc, sc->mbox, &caps, sizeof(caps), &caps);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to get card capabilities: %d.\n", rc);
 		return (rc);
 	}
 
 	/* Copy a 16-bit capability field into the softc in host byte order. */
 #define READ_CAPS(x) do { \
 	sc->x = be16toh(caps.x); \
 } while (0)
 	READ_CAPS(linkcaps);
 	READ_CAPS(niccaps);
 	READ_CAPS(toecaps);
 	READ_CAPS(rdmacaps);
 	READ_CAPS(iscsicaps);
 	READ_CAPS(fcoecaps);
 #undef READ_CAPS
 
 	if (sc->niccaps & FW_CAPS_CONFIG_NIC_ETHOFLD) {
 		param[0] = FW_PARAM_PFVF(ETHOFLD_START);
 		param[1] = FW_PARAM_PFVF(ETHOFLD_END);
 		param[2] = FW_PARAM_DEV(FLOWC_BUFFIFO_SZ);
 		rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 3, param, val);
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to query NIC parameters: %d.\n", rc);
 			return (rc);
 		}
 		sc->tids.etid_base = val[0];
 		sc->params.etid_min = val[0];
 		sc->tids.netids = val[1] - val[0] + 1;
 		sc->params.netids = sc->tids.netids;
 		sc->params.eo_wr_cred = val[2];
 		sc->params.ethoffload = 1;
 	}
 
 	if (sc->toecaps) {
 		/* query offload-related parameters */
 		param[0] = FW_PARAM_DEV(NTID);
 		param[1] = FW_PARAM_PFVF(SERVER_START);
 		param[2] = FW_PARAM_PFVF(SERVER_END);
 		param[3] = FW_PARAM_PFVF(TDDP_START);
 		param[4] = FW_PARAM_PFVF(TDDP_END);
 		param[5] = FW_PARAM_DEV(FLOWC_BUFFIFO_SZ);
 		rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 6, param, val);
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to query TOE parameters: %d.\n", rc);
 			return (rc);
 		}
 		sc->tids.ntids = val[0];
 		sc->tids.natids = min(sc->tids.ntids / 2, MAX_ATIDS);
 		sc->tids.stid_base = val[1];
 		sc->tids.nstids = val[2] - val[1] + 1;
 		sc->vres.ddp.start = val[3];
 		sc->vres.ddp.size = val[4] - val[3] + 1;
 		sc->params.ofldq_wr_cred = val[5];
 		sc->params.offload = 1;
 	}
 	if (sc->rdmacaps) {
 		/* RDMA needs two rounds of queries, 6 parameters each. */
 		param[0] = FW_PARAM_PFVF(STAG_START);
 		param[1] = FW_PARAM_PFVF(STAG_END);
 		param[2] = FW_PARAM_PFVF(RQ_START);
 		param[3] = FW_PARAM_PFVF(RQ_END);
 		param[4] = FW_PARAM_PFVF(PBL_START);
 		param[5] = FW_PARAM_PFVF(PBL_END);
 		rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 6, param, val);
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to query RDMA parameters(1): %d.\n", rc);
 			return (rc);
 		}
 		sc->vres.stag.start = val[0];
 		sc->vres.stag.size = val[1] - val[0] + 1;
 		sc->vres.rq.start = val[2];
 		sc->vres.rq.size = val[3] - val[2] + 1;
 		sc->vres.pbl.start = val[4];
 		sc->vres.pbl.size = val[5] - val[4] + 1;
 
 		param[0] = FW_PARAM_PFVF(SQRQ_START);
 		param[1] = FW_PARAM_PFVF(SQRQ_END);
 		param[2] = FW_PARAM_PFVF(CQ_START);
 		param[3] = FW_PARAM_PFVF(CQ_END);
 		param[4] = FW_PARAM_PFVF(OCQ_START);
 		param[5] = FW_PARAM_PFVF(OCQ_END);
 		rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 6, param, val);
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to query RDMA parameters(2): %d.\n", rc);
 			return (rc);
 		}
 		sc->vres.qp.start = val[0];
 		sc->vres.qp.size = val[1] - val[0] + 1;
 		sc->vres.cq.start = val[2];
 		sc->vres.cq.size = val[3] - val[2] + 1;
 		sc->vres.ocq.start = val[4];
 		sc->vres.ocq.size = val[5] - val[4] + 1;
 	}
 	if (sc->iscsicaps) {
 		param[0] = FW_PARAM_PFVF(ISCSI_START);
 		param[1] = FW_PARAM_PFVF(ISCSI_END);
 		rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 2, param, val);
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to query iSCSI parameters: %d.\n", rc);
 			return (rc);
 		}
 		sc->vres.iscsi.start = val[0];
 		sc->vres.iscsi.size = val[1] - val[0] + 1;
 	}
 
 	/*
 	 * We've got the params we wanted to query via the firmware.  Now grab
 	 * some others directly from the chip.
 	 */
 	rc = t4_read_chip_settings(sc);
 
 	return (rc);
 }
 
 /*
  * Set the parameters the driver wants changed once the firmware is up.  The
  * request is best-effort: its result is ignored and 0 is always returned.
  */
 static int
 set_params__post_init(struct adapter *sc)
 {
 	/* ask for encapsulated CPLs */
 	uint32_t encap_param = FW_PARAM_PFVF(CPLFW4MSG_ENCAP);
 	uint32_t enable = 1;
 
 	(void)t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &encap_param,
 	    &enable);
 
 	return (0);
 }
 
 /* The FW_PARAM_* shorthands are not available past this point. */
 #undef FW_PARAM_PFVF
 #undef FW_PARAM_DEV
 
 /*
  * Build the device description from the VPD strings and the chip revision
  * and attach a copy of it to the device.
  */
 static void
 t4_set_desc(struct adapter *sc)
 {
 	char desc[128];
 	const char *ofld_tag;
 
 	/* Offload capable adapters are described as "RNIC". */
 	ofld_tag = is_offload(sc) ? "R" : "";
 
 	snprintf(desc, sizeof(desc), "Chelsio %s %sNIC (rev %d), S/N:%s, "
 	    "P/N:%s, E/C:%s", sc->params.vpd.id, ofld_tag, chip_rev(sc),
 	    sc->params.vpd.sn, sc->params.vpd.pn, sc->params.vpd.ec);
 
 	device_set_desc_copy(sc->dev, desc);
 }
 
 /*
  * Rebuild the ifmedia list for a port from its port_type and mod_type.  All
  * existing entries are removed first and the whole operation runs with the
  * port lock held.  Every entry's data word carries (port_type << 8) |
  * mod_type.
  */
 static void
 build_medialist(struct port_info *pi, struct ifmedia *media)
 {
 	int data, m;
 
 	PORT_LOCK(pi);
 
 	ifmedia_removeall(media);
 
 	m = IFM_ETHER | IFM_FDX;
 	data = (pi->port_type << 8) | pi->mod_type;
 
 	switch(pi->port_type) {
 	case FW_PORT_TYPE_BT_XFI:
 		/*
 		 * NOTE(review): unlike every other case this one adds a media
 		 * entry without calling ifmedia_set -- confirm that is intended.
 		 */
 		ifmedia_add(media, m | IFM_10G_T, data, NULL);
 		break;
 
 	case FW_PORT_TYPE_BT_XAUI:
 		ifmedia_add(media, m | IFM_10G_T, data, NULL);
 		/* fall through */
 
 	case FW_PORT_TYPE_BT_SGMII:
 		ifmedia_add(media, m | IFM_1000_T, data, NULL);
 		ifmedia_add(media, m | IFM_100_TX, data, NULL);
 		ifmedia_add(media, IFM_ETHER | IFM_AUTO, data, NULL);
 		ifmedia_set(media, IFM_ETHER | IFM_AUTO);
 		break;
 
 	case FW_PORT_TYPE_CX4:
 		ifmedia_add(media, m | IFM_10G_CX4, data, NULL);
 		ifmedia_set(media, m | IFM_10G_CX4);
 		break;
 
 	/* 10G ports with a module: the media depends on the module type. */
 	case FW_PORT_TYPE_QSFP_10G:
 	case FW_PORT_TYPE_SFP:
 	case FW_PORT_TYPE_FIBER_XFI:
 	case FW_PORT_TYPE_FIBER_XAUI:
 		switch (pi->mod_type) {
 
 		case FW_PORT_MOD_TYPE_LR:
 			ifmedia_add(media, m | IFM_10G_LR, data, NULL);
 			ifmedia_set(media, m | IFM_10G_LR);
 			break;
 
 		case FW_PORT_MOD_TYPE_SR:
 			ifmedia_add(media, m | IFM_10G_SR, data, NULL);
 			ifmedia_set(media, m | IFM_10G_SR);
 			break;
 
 		case FW_PORT_MOD_TYPE_LRM:
 			ifmedia_add(media, m | IFM_10G_LRM, data, NULL);
 			ifmedia_set(media, m | IFM_10G_LRM);
 			break;
 
 		case FW_PORT_MOD_TYPE_TWINAX_PASSIVE:
 		case FW_PORT_MOD_TYPE_TWINAX_ACTIVE:
 			ifmedia_add(media, m | IFM_10G_TWINAX, data, NULL);
 			ifmedia_set(media, m | IFM_10G_TWINAX);
 			break;
 
 		case FW_PORT_MOD_TYPE_NONE:
 			/* No module: the entry is IFM_NONE without IFM_FDX. */
 			m &= ~IFM_FDX;
 			ifmedia_add(media, m | IFM_NONE, data, NULL);
 			ifmedia_set(media, m | IFM_NONE);
 			break;
 
 		case FW_PORT_MOD_TYPE_NA:
 		case FW_PORT_MOD_TYPE_ER:
 		default:
 			device_printf(pi->dev,
 			    "unknown port_type (%d), mod_type (%d)\n",
 			    pi->port_type, pi->mod_type);
 			ifmedia_add(media, m | IFM_UNKNOWN, data, NULL);
 			ifmedia_set(media, m | IFM_UNKNOWN);
 			break;
 		}
 		break;
 
 	/* Same idea as above but with the 40G media types. */
 	case FW_PORT_TYPE_QSFP:
 		switch (pi->mod_type) {
 
 		case FW_PORT_MOD_TYPE_LR:
 			ifmedia_add(media, m | IFM_40G_LR4, data, NULL);
 			ifmedia_set(media, m | IFM_40G_LR4);
 			break;
 
 		case FW_PORT_MOD_TYPE_SR:
 			ifmedia_add(media, m | IFM_40G_SR4, data, NULL);
 			ifmedia_set(media, m | IFM_40G_SR4);
 			break;
 
 		case FW_PORT_MOD_TYPE_TWINAX_PASSIVE:
 		case FW_PORT_MOD_TYPE_TWINAX_ACTIVE:
 			ifmedia_add(media, m | IFM_40G_CR4, data, NULL);
 			ifmedia_set(media, m | IFM_40G_CR4);
 			break;
 
 		case FW_PORT_MOD_TYPE_NONE:
 			m &= ~IFM_FDX;
 			ifmedia_add(media, m | IFM_NONE, data, NULL);
 			ifmedia_set(media, m | IFM_NONE);
 			break;
 
 		default:
 			device_printf(pi->dev,
 			    "unknown port_type (%d), mod_type (%d)\n",
 			    pi->port_type, pi->mod_type);
 			ifmedia_add(media, m | IFM_UNKNOWN, data, NULL);
 			ifmedia_set(media, m | IFM_UNKNOWN);
 			break;
 		}
 		break;
 
 	default:
 		device_printf(pi->dev,
 		    "unknown port_type (%d), mod_type (%d)\n", pi->port_type,
 		    pi->mod_type);
 		ifmedia_add(media, m | IFM_UNKNOWN, data, NULL);
 		ifmedia_set(media, m | IFM_UNKNOWN);
 		break;
 	}
 
 	PORT_UNLOCK(pi);
 }
 
 /* Max number of multicast addresses handed to t4_alloc_mac_filt per call. */
 #define FW_MAC_EXACT_CHUNK	7
 
 /*
  * Program the port's XGMAC based on parameters in ifnet.  The caller also
  * indicates which parameters should be programmed (the rest are left alone).
  *
  * flags is a non-zero combination of XGMAC_MTU, XGMAC_PROMISC,
  * XGMAC_ALLMULTI, XGMAC_VLANEX, XGMAC_UCADDR, and XGMAC_MCADDRS.  Must be
  * called from within a synchronized operation.
  *
  * Returns 0 on success and a positive errno on failure.
  */
 int
 update_mac_settings(struct ifnet *ifp, int flags)
 {
 	int rc = 0;
 	struct port_info *pi = ifp->if_softc;
 	struct adapter *sc = pi->adapter;
 	int mtu = -1, promisc = -1, allmulti = -1, vlanex = -1;
 	uint16_t viid = 0xffff;
 	int16_t *xact_addr_filt = NULL;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 	KASSERT(flags, ("%s: not told what to update.", __func__));
 
 	/*
 	 * Pick the VI (and its exact-match filter index) that goes with this
 	 * ifnet.  NOTE(review): if ifp matches neither of the port's ifnets
 	 * xact_addr_filt stays NULL and XGMAC_UCADDR would dereference it --
 	 * confirm that callers never pass such an ifp.
 	 */
 	if (ifp == pi->ifp) {
 		viid = pi->viid;
 		xact_addr_filt = &pi->xact_addr_filt;
 	}
 #ifdef DEV_NETMAP
 	else if (ifp == pi->nm_ifp) {
 		viid = pi->nm_viid;
 		xact_addr_filt = &pi->nm_xact_addr_filt;
 	}
 #endif
 	/* Settings that were not asked for stay at -1. */
 	if (flags & XGMAC_MTU)
 		mtu = ifp->if_mtu;
 
 	if (flags & XGMAC_PROMISC)
 		promisc = ifp->if_flags & IFF_PROMISC ? 1 : 0;
 
 	if (flags & XGMAC_ALLMULTI)
 		allmulti = ifp->if_flags & IFF_ALLMULTI ? 1 : 0;
 
 	if (flags & XGMAC_VLANEX)
 		vlanex = ifp->if_capenable & IFCAP_VLAN_HWTAGGING ? 1 : 0;
 
 	if (flags & (XGMAC_MTU|XGMAC_PROMISC|XGMAC_ALLMULTI|XGMAC_VLANEX)) {
 		rc = -t4_set_rxmode(sc, sc->mbox, viid, mtu, promisc, allmulti,
 		    1, vlanex, false);
 		if (rc) {
 			if_printf(ifp, "set_rxmode (%x) failed: %d\n", flags,
 			    rc);
 			return (rc);
 		}
 	}
 
 	if (flags & XGMAC_UCADDR) {
 		uint8_t ucaddr[ETHER_ADDR_LEN];
 
 		bcopy(IF_LLADDR(ifp), ucaddr, sizeof(ucaddr));
 		rc = t4_change_mac(sc, sc->mbox, viid, *xact_addr_filt, ucaddr,
 		    true, true);
 		if (rc < 0) {
 			rc = -rc;
 			if_printf(ifp, "change_mac failed: %d\n", rc);
 			return (rc);
 		} else {
 			/* A non-negative return is the new filter index. */
 			*xact_addr_filt = rc;
 			rc = 0;
 		}
 	}
 
 	if (flags & XGMAC_MCADDRS) {
 		const uint8_t *mcaddr[FW_MAC_EXACT_CHUNK];
 		int del = 1;
 		uint64_t hash = 0;
 		struct ifmultiaddr *ifma;
 		int i = 0, j;
 
 		/*
 		 * Walk the multicast list and program the link-layer addresses
 		 * in chunks of FW_MAC_EXACT_CHUNK.  del is 1 for the first
 		 * chunk only.
 		 */
 		if_maddr_rlock(ifp);
 		TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 			if (ifma->ifma_addr->sa_family != AF_LINK)
 				continue;
 			mcaddr[i++] =
 			    LLADDR((struct sockaddr_dl *)ifma->ifma_addr);
 
 			if (i == FW_MAC_EXACT_CHUNK) {
 				rc = t4_alloc_mac_filt(sc, sc->mbox, viid, del,
 				    i, mcaddr, NULL, &hash, 0);
 				if (rc < 0) {
 					rc = -rc;
 					for (j = 0; j < i; j++) {
 						if_printf(ifp,
 						    "failed to add mc address"
 						    " %02x:%02x:%02x:"
 						    "%02x:%02x:%02x rc=%d\n",
 						    mcaddr[j][0], mcaddr[j][1],
 						    mcaddr[j][2], mcaddr[j][3],
 						    mcaddr[j][4], mcaddr[j][5],
 						    rc);
 					}
 					goto mcfail;
 				}
 				del = 0;
 				i = 0;
 			}
 		}
 		/* Whatever is left over after the last full chunk. */
 		if (i > 0) {
 			rc = t4_alloc_mac_filt(sc, sc->mbox, viid, del, i,
 			    mcaddr, NULL, &hash, 0);
 			if (rc < 0) {
 				rc = -rc;
 				for (j = 0; j < i; j++) {
 					if_printf(ifp,
 					    "failed to add mc address"
 					    " %02x:%02x:%02x:"
 					    "%02x:%02x:%02x rc=%d\n",
 					    mcaddr[j][0], mcaddr[j][1],
 					    mcaddr[j][2], mcaddr[j][3],
 					    mcaddr[j][4], mcaddr[j][5],
 					    rc);
 				}
 				goto mcfail;
 			}
 		}
 
 		rc = -t4_set_addr_hash(sc, sc->mbox, viid, 0, hash, 0);
 		if (rc != 0) {
 			/* This message was missing its trailing newline. */
 			if_printf(ifp, "failed to set mc address hash: %d\n",
 			    rc);
 		}
 mcfail:
 		if_maddr_runlock(ifp);
 	}
 
 	return (rc);
 }
 
 /*
  * Mark the adapter busy on behalf of the caller, waiting for the current
  * owner to finish if the caller allows it.
  *
  * flags: SLEEP_OK    - the caller may sleep waiting for the adapter,
  *        INTR_OK     - that sleep may be interrupted by a signal,
  *        HOLD_LOCK   - return with the adapter lock held on success.
  * wmesg is the wait message used for the sleep.
  *
  * Returns 0 on success, ENXIO if the port is doomed, EBUSY if the adapter is
  * busy and the caller can't sleep, and EINTR if the sleep did not complete
  * normally.  The adapter lock is never held on return when an error is
  * returned.
  */
 int
 begin_synchronized_op(struct adapter *sc, struct port_info *pi, int flags,
     char *wmesg)
 {
 	int rc, pri;
 
 #ifdef WITNESS
 	/* the caller thinks it's ok to sleep, but is it really? */
 	if (flags & SLEEP_OK)
 		pause("t4slptst", 1);
 #endif
 
 	/*
 	 * Only catch signals if the caller asked for it.  Testing the INTR_OK
 	 * constant by itself (instead of the caller's flags) made every sleep
 	 * interruptible.
 	 */
 	if (flags & INTR_OK)
 		pri = PCATCH;
 	else
 		pri = 0;
 
 	ADAPTER_LOCK(sc);
 	for (;;) {
 
 		if (pi && IS_DOOMED(pi)) {
 			rc = ENXIO;
 			goto done;
 		}
 
 		if (!IS_BUSY(sc)) {
 			rc = 0;
 			break;
 		}
 
 		if (!(flags & SLEEP_OK)) {
 			rc = EBUSY;
 			goto done;
 		}
 
 		/* Woken up by end_synchronized_op's wakeup on sc->flags. */
 		if (mtx_sleep(&sc->flags, &sc->sc_lock, pri, wmesg, 0)) {
 			rc = EINTR;
 			goto done;
 		}
 	}
 
 	KASSERT(!IS_BUSY(sc), ("%s: controller busy.", __func__));
 	SET_BUSY(sc);
 #ifdef INVARIANTS
 	sc->last_op = wmesg;
 	sc->last_op_thr = curthread;
 #endif
 
 done:
 	if (!(flags & HOLD_LOCK) || rc)
 		ADAPTER_UNLOCK(sc);
 
 	return (rc);
 }
 
 /*
  * Release the adapter at the end of a synchronized operation and wake up
  * anyone sleeping on sc->flags.  LOCK_HELD in flags means the caller already
  * owns the adapter lock; the lock is dropped before returning either way.
  */
 void
 end_synchronized_op(struct adapter *sc, int flags)
 {
 
 	if ((flags & LOCK_HELD) == 0) {
 		ADAPTER_LOCK(sc);
 	} else {
 		ADAPTER_LOCK_ASSERT_OWNED(sc);
 	}
 
 	KASSERT(IS_BUSY(sc), ("%s: controller not busy.", __func__));
 	CLR_BUSY(sc);
 	wakeup(&sc->flags);
 
 	ADAPTER_UNLOCK(sc);
 }
 
 /*
  * Bring a port up: complete any pending adapter/port initialization, program
  * the MAC, enable the VI, and mark the interface as running.  Must be called
  * from within a synchronized operation.
  *
  * Returns 0 on success (or if the port is up already) and a non-zero error
  * otherwise.
  */
 static int
 cxgbe_init_synchronized(struct port_info *pi)
 {
 	struct adapter *sc = pi->adapter;
 	struct ifnet *ifp = pi->ifp;
 	int rc;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	/* Nothing to do if the port is running already. */
 	if (isset(&sc->open_device_map, pi->port_id)) {
 		KASSERT(ifp->if_drv_flags & IFF_DRV_RUNNING,
 		    ("mismatch between open_device_map and if_drv_flags"));
 		return (0);	/* already running */
 	}
 
 	/* One-time initialization of the adapter, then of the port. */
 	if ((sc->flags & FULL_INIT_DONE) == 0) {
 		rc = adapter_full_init(sc);
 		if (rc != 0)
 			return (rc);	/* error message displayed already */
 	}
 
 	if ((pi->flags & PORT_INIT_DONE) == 0) {
 		rc = port_full_init(pi);
 		if (rc != 0)
 			return (rc); /* error message displayed already */
 	}
 
 	rc = update_mac_settings(ifp, XGMAC_ALL);
 	if (rc != 0)
 		goto fail;	/* error message displayed already */
 
 	rc = -t4_enable_vi(sc, sc->mbox, pi->viid, true, true);
 	if (rc != 0) {
 		if_printf(ifp, "enable_vi failed: %d\n", rc);
 		goto fail;
 	}
 
 	/*
 	 * The first iq of the first port to come up is used for tracing.
 	 */
 	if (sc->traceq < 0) {
 		sc->traceq = sc->sge.rxq[pi->first_rxq].iq.abs_id;
 		t4_write_reg(sc, is_t4(sc) ?  A_MPS_TRC_RSS_CONTROL :
 		    A_MPS_T5_TRC_RSS_CONTROL, V_RSSCONTROL(pi->tx_chan) |
 		    V_QUEUENUMBER(sc->traceq));
 		pi->flags |= HAS_TRACEQ;
 	}
 
 	/* all ok */
 	setbit(&sc->open_device_map, pi->port_id);
 	PORT_LOCK(pi);
 	ifp->if_drv_flags |= IFF_DRV_RUNNING;
 	PORT_UNLOCK(pi);
 
 	/* Start the port's tick, first firing hz ticks from now. */
 	callout_reset(&pi->tick, hz, cxgbe_tick, pi);
 
 	return (0);
 
 fail:
 	cxgbe_uninit_synchronized(pi);
 
 	return (rc);
 }
 
 /*
  * Idempotent.
  */
 static int
 cxgbe_uninit_synchronized(struct port_info *pi)
 {
 	struct adapter *sc = pi->adapter;
 	struct ifnet *ifp = pi->ifp;
 	int rc;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	/*
 	 * Disable the VI so that all its data in either direction is discarded
 	 * by the MPS.  Leave everything else (the queues, interrupts, and 1Hz
 	 * tick) intact as the TP can deliver negative advice or data that it's
 	 * holding in its RAM (for an offloaded connection) even after the VI is
 	 * disabled.
 	 */
 	rc = -t4_enable_vi(sc, sc->mbox, pi->viid, false, false);
 	if (rc) {
 		if_printf(ifp, "disable_vi failed: %d\n", rc);
 		return (rc);
 	}
 
 	clrbit(&sc->open_device_map, pi->port_id);
 	PORT_LOCK(pi);
 	ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 	PORT_UNLOCK(pi);
 
 	pi->link_cfg.link_ok = 0;
 	pi->link_cfg.speed = 0;
 	pi->linkdnrc = -1;
 	t4_os_link_changed(sc, pi->port_id, 0, -1);
 
 	return (0);
 }
 
 /*
  * It is ok for this function to fail midway and return right away.  t4_detach
  * will walk the entire sc->irq list and clean up whatever is valid.
  *
  * With a single interrupt everything is handled by t4_intr_all.  Otherwise
  * the interrupts are assigned in this order: errors, the firmware event
  * queue, and then for each port its NIC rx queues, offload rx queues
  * (TCP_OFFLOAD), and netmap rx queues (DEV_NETMAP), each group only if the
  * corresponding INTR_* flag is set on the port.
  *
  * Returns 0 on success or the first non-zero value returned by t4_alloc_irq.
  */
 static int
 setup_intr_handlers(struct adapter *sc)
 {
 	int rc, rid, p, q;
 	char s[8];
 	struct irq *irq;
 	struct port_info *pi;
 	struct sge_rxq *rxq;
 #ifdef TCP_OFFLOAD
 	struct sge_ofld_rxq *ofld_rxq;
 #endif
 #ifdef DEV_NETMAP
 	struct sge_nm_rxq *nm_rxq;
 #endif
 
 	/*
 	 * Setup interrupts.
 	 */
 	irq = &sc->irq[0];
 	/* The first rid is 0 for INTx and 1 otherwise. */
 	rid = sc->intr_type == INTR_INTX ? 0 : 1;
 	if (sc->intr_count == 1)
 		return (t4_alloc_irq(sc, irq, rid, t4_intr_all, sc, "all"));
 
 	/* Multiple interrupts. */
 	KASSERT(sc->intr_count >= T4_EXTRA_INTR + sc->params.nports,
 	    ("%s: too few intr.", __func__));
 
 	/* The first one is always error intr */
 	rc = t4_alloc_irq(sc, irq, rid, t4_intr_err, sc, "err");
 	if (rc != 0)
 		return (rc);
 	irq++;
 	rid++;
 
 	/* The second one is always the firmware event queue */
 	rc = t4_alloc_irq(sc, irq, rid, t4_intr_evt, &sc->sge.fwq, "evt");
 	if (rc != 0)
 		return (rc);
 	irq++;
 	rid++;
 
 	/*
 	 * The names are "<port><sep><queue>" where the separator tells the
 	 * queue types apart: '.' for NIC, ',' for offload, '-' for netmap.
 	 */
 	for_each_port(sc, p) {
 		pi = sc->port[p];
 
 		if (pi->flags & INTR_RXQ) {
 			for_each_rxq(pi, q, rxq) {
 				snprintf(s, sizeof(s), "%d.%d", p, q);
 				rc = t4_alloc_irq(sc, irq, rid, t4_intr, rxq,
 				    s);
 				if (rc != 0)
 					return (rc);
 				irq++;
 				rid++;
 			}
 		}
 #ifdef TCP_OFFLOAD
 		if (pi->flags & INTR_OFLD_RXQ) {
 			for_each_ofld_rxq(pi, q, ofld_rxq) {
 				snprintf(s, sizeof(s), "%d,%d", p, q);
 				rc = t4_alloc_irq(sc, irq, rid, t4_intr,
 				    ofld_rxq, s);
 				if (rc != 0)
 					return (rc);
 				irq++;
 				rid++;
 			}
 		}
 #endif
 #ifdef DEV_NETMAP
 		if (pi->flags & INTR_NM_RXQ) {
 			for_each_nm_rxq(pi, q, nm_rxq) {
 				snprintf(s, sizeof(s), "%d-%d", p, q);
 				rc = t4_alloc_irq(sc, irq, rid, t4_nm_intr,
 				    nm_rxq, s);
 				if (rc != 0)
 					return (rc);
 				irq++;
 				rid++;
 			}
 		}
 #endif
 	}
 	/* Every one of the intr_count interrupts must have been used up. */
 	MPASS(irq == &sc->irq[sc->intr_count]);
 
 	return (0);
 }
 
 /*
  * One-time, adapter-wide initialization: the adapter's own queues, the
  * driver's taskqueues (one thread each), and interrupts.  Sets FULL_INIT_DONE
  * on success.  On failure everything is undone via adapter_full_uninit and a
  * non-zero error is returned.
  */
 int
 adapter_full_init(struct adapter *sc)
 {
 	int rc, tqidx;
 
 	ADAPTER_LOCK_ASSERT_NOTOWNED(sc);
 	KASSERT(!(sc->flags & FULL_INIT_DONE),
 	    ("%s: FULL_INIT_DONE already", __func__));
 
 	/*
 	 * queues that belong to the adapter (not any particular port).
 	 */
 	rc = t4_setup_adapter_queues(sc);
 	if (rc != 0)
 		goto fail;
 
 	for (tqidx = 0; tqidx < nitems(sc->tq); tqidx++) {
 		sc->tq[tqidx] = taskqueue_create("t4 taskq", M_NOWAIT,
 		    taskqueue_thread_enqueue, &sc->tq[tqidx]);
 		if (sc->tq[tqidx] == NULL) {
 			device_printf(sc->dev,
 			    "failed to allocate task queue %d\n", tqidx);
 			rc = ENOMEM;
 			goto fail;
 		}
 		taskqueue_start_threads(&sc->tq[tqidx], 1, PI_NET, "%s tq%d",
 		    device_get_nameunit(sc->dev), tqidx);
 	}
 
 	t4_intr_enable(sc);
 	sc->flags |= FULL_INIT_DONE;
 
 	return (0);
 
 fail:
 	adapter_full_uninit(sc);
 
 	return (rc);
 }
 
 /*
  * Undo adapter_full_init: tear down the adapter's queues, free the
  * taskqueues, and clear FULL_INIT_DONE.  Always returns 0.
  */
 int
 adapter_full_uninit(struct adapter *sc)
 {
 	int tqidx;
 
 	ADAPTER_LOCK_ASSERT_NOTOWNED(sc);
 
 	t4_teardown_adapter_queues(sc);
 
 	/* Stop at the first taskqueue that doesn't exist. */
 	for (tqidx = 0; tqidx < nitems(sc->tq); tqidx++) {
 		if (sc->tq[tqidx] == NULL)
 			break;
 		taskqueue_free(sc->tq[tqidx]);
 		sc->tq[tqidx] = NULL;
 	}
 
 	sc->flags &= ~FULL_INIT_DONE;
 
 	return (0);
 }
 
 int
 port_full_init(struct port_info *pi)
 {
 	struct adapter *sc = pi->adapter;
 	struct ifnet *ifp = pi->ifp;
 	uint16_t *rss;
 	struct sge_rxq *rxq;
 	int rc, i, j;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 	KASSERT((pi->flags & PORT_INIT_DONE) == 0,
 	    ("%s: PORT_INIT_DONE already", __func__));
 
 	sysctl_ctx_init(&pi->ctx);
 	pi->flags |= PORT_SYSCTL_CTX;
 
 	/*
 	 * Allocate tx/rx/fl queues for this port.
 	 */
 	rc = t4_setup_port_queues(pi);
 	if (rc != 0)
 		goto done;	/* error message displayed already */
 
 	/*
 	 * Setup RSS for this port.  Save a copy of the RSS table for later use.
 	 */
 	rss = malloc(pi->rss_size * sizeof (*rss), M_CXGBE, M_ZERO | M_WAITOK);
 	for (i = 0; i < pi->rss_size;) {
 		for_each_rxq(pi, j, rxq) {
 			rss[i++] = rxq->iq.abs_id;
 			if (i == pi->rss_size)
 				break;
 		}
 	}
 
 	rc = -t4_config_rss_range(sc, sc->mbox, pi->viid, 0, pi->rss_size, rss,
 	    pi->rss_size);
 	if (rc != 0) {
 		if_printf(ifp, "rss_config failed: %d\n", rc);
 		goto done;
 	}
 
 	pi->rss = rss;
 	pi->flags |= PORT_INIT_DONE;
 done:
 	if (rc != 0)
 		port_full_uninit(pi);
 
 	return (rc);
 }
 
 /*
  * Idempotent.
  *
  * Undo port_full_init().  If the port was fully initialized
  * (PORT_INIT_DONE), every tx, rx and (with TCP_OFFLOAD) offload queue of the
  * port is quiesced first and the saved RSS table is freed.  The port queues
  * are then torn down and PORT_INIT_DONE is cleared regardless of whether the
  * flag was set on entry.  Always returns 0.
  */
 int
 port_full_uninit(struct port_info *pi)
 {
 	struct adapter *sc = pi->adapter;
 	int i;
 	struct sge_rxq *rxq;
 	struct sge_txq *txq;
 #ifdef TCP_OFFLOAD
 	struct sge_ofld_rxq *ofld_rxq;
 	struct sge_wrq *ofld_txq;
 #endif
 
 	if (pi->flags & PORT_INIT_DONE) {
 
 		/* Need to quiesce queues.  XXX: ctrl queues? */
 
 		/* Egress (tx) queues first. */
 		for_each_txq(pi, i, txq) {
 			quiesce_eq(sc, &txq->eq);
 		}
 
 #ifdef TCP_OFFLOAD
 		for_each_ofld_txq(pi, i, ofld_txq) {
 			quiesce_eq(sc, &ofld_txq->eq);
 		}
 #endif
 
 		/* Then each rx queue's ingress queue and its free list. */
 		for_each_rxq(pi, i, rxq) {
 			quiesce_iq(sc, &rxq->iq);
 			quiesce_fl(sc, &rxq->fl);
 		}
 
 #ifdef TCP_OFFLOAD
 		for_each_ofld_rxq(pi, i, ofld_rxq) {
 			quiesce_iq(sc, &ofld_rxq->iq);
 			quiesce_fl(sc, &ofld_rxq->fl);
 		}
 #endif
 		/* RSS table saved by port_full_init(). */
 		free(pi->rss, M_CXGBE);
 	}
 
 	t4_teardown_port_queues(pi);
 	pi->flags &= ~PORT_INIT_DONE;
 
 	return (0);
 }
 
 /*
  * Quiesce an egress queue: mark it EQ_DOOMED, wait for any pending credit
  * flush response, then drain the queue's tx callout and its tx task on the
  * adapter task queue selected by eq->tx_chan.
  */
 static void
 quiesce_eq(struct adapter *sc, struct sge_eq *eq)
 {
 	EQ_LOCK(eq);
 	eq->flags |= EQ_DOOMED;
 
 	/*
 	 * Wait for the response to a credit flush if one's
 	 * pending.
 	 */
 	/* Sleeps on eq with no timeout until EQ_CRFLUSHED is clear. */
 	while (eq->flags & EQ_CRFLUSHED)
 		mtx_sleep(eq, &eq->eq_lock, 0, "crflush", 0);
 	EQ_UNLOCK(eq);
 
 	callout_drain(&eq->tx_callout);	/* XXX: iffy */
 	pause("callout", 10);		/* Still iffy */
 
 	taskqueue_drain(sc->tq[eq->tx_chan], &eq->tx_task);
 }
 
 /*
  * Quiesce an ingress queue by moving its state from IQS_IDLE to
  * IQS_DISABLED, retrying (with a short pause) until the swap succeeds.
  */
 static void
 quiesce_iq(struct adapter *sc, struct sge_iq *iq)
 {
 	(void) sc;	/* unused */
 
 	/* Synchronize with the interrupt handler */
 	for (;;) {
 		if (atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_DISABLED))
 			break;
 		pause("iqfree", 1);
 	}
 }
 
 /*
  * Quiesce a free list: mark it FL_DOOMED while holding both the adapter's
  * sfl_lock and the free list's own lock, then drain the adapter's sfl
  * callout.  The free list is asserted not to be FL_STARVING afterwards.
  */
 static void
 quiesce_fl(struct adapter *sc, struct sge_fl *fl)
 {
 	/* sfl_lock is taken first, then the free list lock. */
 	mtx_lock(&sc->sfl_lock);
 	FL_LOCK(fl);
 	fl->flags |= FL_DOOMED;
 	FL_UNLOCK(fl);
 	mtx_unlock(&sc->sfl_lock);
 
 	callout_drain(&sc->sfl_callout);
 	KASSERT((fl->flags & FL_STARVING) == 0,
 	    ("%s: still starving", __func__));
 }
 
 /*
  * Allocate the IRQ resource for the given rid and attach the handler to it.
  * If a name is supplied and setup succeeds, the interrupt is described with
  * that name.
  *
  * Returns 0 on success, ENOMEM if the IRQ resource could not be allocated,
  * or the error from bus_setup_intr().  The resource is not released here
  * when bus_setup_intr() fails.
  */
 static int
 t4_alloc_irq(struct adapter *sc, struct irq *irq, int rid,
     driver_intr_t *handler, void *arg, char *name)
 {
 	int error;
 
 	irq->rid = rid;
 	irq->res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ, &irq->rid,
 	    RF_SHAREABLE | RF_ACTIVE);
 	if (irq->res == NULL) {
 		device_printf(sc->dev,
 		    "failed to allocate IRQ for rid %d, name %s.\n", rid, name);
 		return (ENOMEM);
 	}
 
 	error = bus_setup_intr(sc->dev, irq->res, INTR_MPSAFE | INTR_TYPE_NET,
 	    NULL, handler, arg, &irq->tag);
 	if (error != 0) {
 		device_printf(sc->dev,
 		    "failed to setup interrupt for rid %d, name %s: %d\n",
 		    rid, name, error);
 		return (error);
 	}
 
 	/* The description is optional. */
 	if (name)
 		bus_describe_intr(sc->dev, irq->res, irq->tag, name);
 
 	return (0);
 }
 
 static int
 t4_free_irq(struct adapter *sc, struct irq *irq)
 {
 	if (irq->tag)
 		bus_teardown_intr(sc->dev, irq->res, irq->tag);
 	if (irq->res)
 		bus_release_resource(sc->dev, SYS_RES_IRQ, irq->rid, irq->res);
 
 	bzero(irq, sizeof(*irq));
 
 	return (0);
 }
 
 /*
  * Read every 32-bit register in the inclusive address range [start, end]
  * and store the values in buf, beginning at byte offset start.
  */
 static void
 reg_block_dump(struct adapter *sc, uint8_t *buf, unsigned int start,
     unsigned int end)
 {
 	uint32_t *dst;
 
 	dst = (uint32_t *)(buf + start);
 	while (start <= end) {
 		*dst++ = t4_read_reg(sc, start);
 		start += sizeof(uint32_t);
 	}
 }
 
 /*
  * Fill buf with a dump of the adapter's registers and record the chip
  * version in regs->version.
  *
  * The registers to dump come from one of two static tables, selected by
  * is_t4(sc).  Each table is a flat list of (first, last) register address
  * pairs; both ends of a pair are inclusive.  Every range is handed to
  * reg_block_dump(), which stores the values in buf at the byte offset equal
  * to the register address, so buf must be large enough for the highest
  * address in the selected table.  No bounds check is done here.
  */
 static void
 t4_get_regs(struct adapter *sc, struct t4_regdump *regs, uint8_t *buf)
 {
 	int i, n;
 	const unsigned int *reg_ranges;
 	/* Register ranges dumped when is_t4(sc) is true. */
 	static const unsigned int t4_reg_ranges[] = {
 		0x1008, 0x1108,
 		0x1180, 0x11b4,
 		0x11fc, 0x123c,
 		0x1300, 0x173c,
 		0x1800, 0x18fc,
 		0x3000, 0x30d8,
 		0x30e0, 0x5924,
 		0x5960, 0x59d4,
 		0x5a00, 0x5af8,
 		0x6000, 0x6098,
 		0x6100, 0x6150,
 		0x6200, 0x6208,
 		0x6240, 0x6248,
 		0x6280, 0x6338,
 		0x6370, 0x638c,
 		0x6400, 0x643c,
 		0x6500, 0x6524,
 		0x6a00, 0x6a38,
 		0x6a60, 0x6a78,
 		0x6b00, 0x6b84,
 		0x6bf0, 0x6c84,
 		0x6cf0, 0x6d84,
 		0x6df0, 0x6e84,
 		0x6ef0, 0x6f84,
 		0x6ff0, 0x7084,
 		0x70f0, 0x7184,
 		0x71f0, 0x7284,
 		0x72f0, 0x7384,
 		0x73f0, 0x7450,
 		0x7500, 0x7530,
 		0x7600, 0x761c,
 		0x7680, 0x76cc,
 		0x7700, 0x7798,
 		0x77c0, 0x77fc,
 		0x7900, 0x79fc,
 		0x7b00, 0x7c38,
 		0x7d00, 0x7efc,
 		0x8dc0, 0x8e1c,
 		0x8e30, 0x8e78,
 		0x8ea0, 0x8f6c,
 		0x8fc0, 0x9074,
 		0x90fc, 0x90fc,
 		0x9400, 0x9458,
 		0x9600, 0x96bc,
 		0x9800, 0x9808,
 		0x9820, 0x983c,
 		0x9850, 0x9864,
 		0x9c00, 0x9c6c,
 		0x9c80, 0x9cec,
 		0x9d00, 0x9d6c,
 		0x9d80, 0x9dec,
 		0x9e00, 0x9e6c,
 		0x9e80, 0x9eec,
 		0x9f00, 0x9f6c,
 		0x9f80, 0x9fec,
 		0xd004, 0xd03c,
 		0xdfc0, 0xdfe0,
 		0xe000, 0xea7c,
 		0xf000, 0x11110,
 		0x11118, 0x11190,
 		0x19040, 0x1906c,
 		0x19078, 0x19080,
 		0x1908c, 0x19124,
 		0x19150, 0x191b0,
 		0x191d0, 0x191e8,
 		0x19238, 0x1924c,
 		0x193f8, 0x19474,
 		0x19490, 0x194f8,
 		0x19800, 0x19f30,
 		0x1a000, 0x1a06c,
 		0x1a0b0, 0x1a120,
 		0x1a128, 0x1a138,
 		0x1a190, 0x1a1c4,
 		0x1a1fc, 0x1a1fc,
 		0x1e040, 0x1e04c,
 		0x1e284, 0x1e28c,
 		0x1e2c0, 0x1e2c0,
 		0x1e2e0, 0x1e2e0,
 		0x1e300, 0x1e384,
 		0x1e3c0, 0x1e3c8,
 		0x1e440, 0x1e44c,
 		0x1e684, 0x1e68c,
 		0x1e6c0, 0x1e6c0,
 		0x1e6e0, 0x1e6e0,
 		0x1e700, 0x1e784,
 		0x1e7c0, 0x1e7c8,
 		0x1e840, 0x1e84c,
 		0x1ea84, 0x1ea8c,
 		0x1eac0, 0x1eac0,
 		0x1eae0, 0x1eae0,
 		0x1eb00, 0x1eb84,
 		0x1ebc0, 0x1ebc8,
 		0x1ec40, 0x1ec4c,
 		0x1ee84, 0x1ee8c,
 		0x1eec0, 0x1eec0,
 		0x1eee0, 0x1eee0,
 		0x1ef00, 0x1ef84,
 		0x1efc0, 0x1efc8,
 		0x1f040, 0x1f04c,
 		0x1f284, 0x1f28c,
 		0x1f2c0, 0x1f2c0,
 		0x1f2e0, 0x1f2e0,
 		0x1f300, 0x1f384,
 		0x1f3c0, 0x1f3c8,
 		0x1f440, 0x1f44c,
 		0x1f684, 0x1f68c,
 		0x1f6c0, 0x1f6c0,
 		0x1f6e0, 0x1f6e0,
 		0x1f700, 0x1f784,
 		0x1f7c0, 0x1f7c8,
 		0x1f840, 0x1f84c,
 		0x1fa84, 0x1fa8c,
 		0x1fac0, 0x1fac0,
 		0x1fae0, 0x1fae0,
 		0x1fb00, 0x1fb84,
 		0x1fbc0, 0x1fbc8,
 		0x1fc40, 0x1fc4c,
 		0x1fe84, 0x1fe8c,
 		0x1fec0, 0x1fec0,
 		0x1fee0, 0x1fee0,
 		0x1ff00, 0x1ff84,
 		0x1ffc0, 0x1ffc8,
 		0x20000, 0x2002c,
 		0x20100, 0x2013c,
 		0x20190, 0x201c8,
 		0x20200, 0x20318,
 		0x20400, 0x20528,
 		0x20540, 0x20614,
 		0x21000, 0x21040,
 		0x2104c, 0x21060,
 		0x210c0, 0x210ec,
 		0x21200, 0x21268,
 		0x21270, 0x21284,
 		0x212fc, 0x21388,
 		0x21400, 0x21404,
 		0x21500, 0x21518,
 		0x2152c, 0x2153c,
 		0x21550, 0x21554,
 		0x21600, 0x21600,
 		0x21608, 0x21628,
 		0x21630, 0x2163c,
 		0x21700, 0x2171c,
 		0x21780, 0x2178c,
 		0x21800, 0x21c38,
 		0x21c80, 0x21d7c,
 		0x21e00, 0x21e04,
 		0x22000, 0x2202c,
 		0x22100, 0x2213c,
 		0x22190, 0x221c8,
 		0x22200, 0x22318,
 		0x22400, 0x22528,
 		0x22540, 0x22614,
 		0x23000, 0x23040,
 		0x2304c, 0x23060,
 		0x230c0, 0x230ec,
 		0x23200, 0x23268,
 		0x23270, 0x23284,
 		0x232fc, 0x23388,
 		0x23400, 0x23404,
 		0x23500, 0x23518,
 		0x2352c, 0x2353c,
 		0x23550, 0x23554,
 		0x23600, 0x23600,
 		0x23608, 0x23628,
 		0x23630, 0x2363c,
 		0x23700, 0x2371c,
 		0x23780, 0x2378c,
 		0x23800, 0x23c38,
 		0x23c80, 0x23d7c,
 		0x23e00, 0x23e04,
 		0x24000, 0x2402c,
 		0x24100, 0x2413c,
 		0x24190, 0x241c8,
 		0x24200, 0x24318,
 		0x24400, 0x24528,
 		0x24540, 0x24614,
 		0x25000, 0x25040,
 		0x2504c, 0x25060,
 		0x250c0, 0x250ec,
 		0x25200, 0x25268,
 		0x25270, 0x25284,
 		0x252fc, 0x25388,
 		0x25400, 0x25404,
 		0x25500, 0x25518,
 		0x2552c, 0x2553c,
 		0x25550, 0x25554,
 		0x25600, 0x25600,
 		0x25608, 0x25628,
 		0x25630, 0x2563c,
 		0x25700, 0x2571c,
 		0x25780, 0x2578c,
 		0x25800, 0x25c38,
 		0x25c80, 0x25d7c,
 		0x25e00, 0x25e04,
 		0x26000, 0x2602c,
 		0x26100, 0x2613c,
 		0x26190, 0x261c8,
 		0x26200, 0x26318,
 		0x26400, 0x26528,
 		0x26540, 0x26614,
 		0x27000, 0x27040,
 		0x2704c, 0x27060,
 		0x270c0, 0x270ec,
 		0x27200, 0x27268,
 		0x27270, 0x27284,
 		0x272fc, 0x27388,
 		0x27400, 0x27404,
 		0x27500, 0x27518,
 		0x2752c, 0x2753c,
 		0x27550, 0x27554,
 		0x27600, 0x27600,
 		0x27608, 0x27628,
 		0x27630, 0x2763c,
 		0x27700, 0x2771c,
 		0x27780, 0x2778c,
 		0x27800, 0x27c38,
 		0x27c80, 0x27d7c,
 		0x27e00, 0x27e04
 	};
 	/* Register ranges dumped otherwise (presumably T5 — see name). */
 	static const unsigned int t5_reg_ranges[] = {
 		0x1008, 0x1148,
 		0x1180, 0x11b4,
 		0x11fc, 0x123c,
 		0x1280, 0x173c,
 		0x1800, 0x18fc,
 		0x3000, 0x3028,
 		0x3060, 0x30d8,
 		0x30e0, 0x30fc,
 		0x3140, 0x357c,
 		0x35a8, 0x35cc,
 		0x35ec, 0x35ec,
 		0x3600, 0x5624,
 		0x56cc, 0x575c,
 		0x580c, 0x5814,
 		0x5890, 0x58bc,
 		0x5940, 0x59dc,
 		0x59fc, 0x5a18,
 		0x5a60, 0x5a9c,
 		0x5b94, 0x5bfc,
 		0x6000, 0x6040,
 		0x6058, 0x614c,
 		0x7700, 0x7798,
 		0x77c0, 0x78fc,
 		0x7b00, 0x7c54,
 		0x7d00, 0x7efc,
 		0x8dc0, 0x8de0,
 		0x8df8, 0x8e84,
 		0x8ea0, 0x8f84,
 		0x8fc0, 0x90f8,
 		0x9400, 0x9470,
 		0x9600, 0x96f4,
 		0x9800, 0x9808,
 		0x9820, 0x983c,
 		0x9850, 0x9864,
 		0x9c00, 0x9c6c,
 		0x9c80, 0x9cec,
 		0x9d00, 0x9d6c,
 		0x9d80, 0x9dec,
 		0x9e00, 0x9e6c,
 		0x9e80, 0x9eec,
 		0x9f00, 0x9f6c,
 		0x9f80, 0xa020,
 		0xd004, 0xd03c,
 		0xdfc0, 0xdfe0,
 		0xe000, 0x11088,
 		0x1109c, 0x11110,
 		0x11118, 0x1117c,
 		0x11190, 0x11204,
 		0x19040, 0x1906c,
 		0x19078, 0x19080,
 		0x1908c, 0x19124,
 		0x19150, 0x191b0,
 		0x191d0, 0x191e8,
 		0x19238, 0x19290,
 		0x193f8, 0x19474,
 		0x19490, 0x194cc,
 		0x194f0, 0x194f8,
 		0x19c00, 0x19c60,
 		0x19c94, 0x19e10,
 		0x19e50, 0x19f34,
 		0x19f40, 0x19f50,
 		0x19f90, 0x19fe4,
 		0x1a000, 0x1a06c,
 		0x1a0b0, 0x1a120,
 		0x1a128, 0x1a138,
 		0x1a190, 0x1a1c4,
 		0x1a1fc, 0x1a1fc,
 		0x1e008, 0x1e00c,
 		0x1e040, 0x1e04c,
 		0x1e284, 0x1e290,
 		0x1e2c0, 0x1e2c0,
 		0x1e2e0, 0x1e2e0,
 		0x1e300, 0x1e384,
 		0x1e3c0, 0x1e3c8,
 		0x1e408, 0x1e40c,
 		0x1e440, 0x1e44c,
 		0x1e684, 0x1e690,
 		0x1e6c0, 0x1e6c0,
 		0x1e6e0, 0x1e6e0,
 		0x1e700, 0x1e784,
 		0x1e7c0, 0x1e7c8,
 		0x1e808, 0x1e80c,
 		0x1e840, 0x1e84c,
 		0x1ea84, 0x1ea90,
 		0x1eac0, 0x1eac0,
 		0x1eae0, 0x1eae0,
 		0x1eb00, 0x1eb84,
 		0x1ebc0, 0x1ebc8,
 		0x1ec08, 0x1ec0c,
 		0x1ec40, 0x1ec4c,
 		0x1ee84, 0x1ee90,
 		0x1eec0, 0x1eec0,
 		0x1eee0, 0x1eee0,
 		0x1ef00, 0x1ef84,
 		0x1efc0, 0x1efc8,
 		0x1f008, 0x1f00c,
 		0x1f040, 0x1f04c,
 		0x1f284, 0x1f290,
 		0x1f2c0, 0x1f2c0,
 		0x1f2e0, 0x1f2e0,
 		0x1f300, 0x1f384,
 		0x1f3c0, 0x1f3c8,
 		0x1f408, 0x1f40c,
 		0x1f440, 0x1f44c,
 		0x1f684, 0x1f690,
 		0x1f6c0, 0x1f6c0,
 		0x1f6e0, 0x1f6e0,
 		0x1f700, 0x1f784,
 		0x1f7c0, 0x1f7c8,
 		0x1f808, 0x1f80c,
 		0x1f840, 0x1f84c,
 		0x1fa84, 0x1fa90,
 		0x1fac0, 0x1fac0,
 		0x1fae0, 0x1fae0,
 		0x1fb00, 0x1fb84,
 		0x1fbc0, 0x1fbc8,
 		0x1fc08, 0x1fc0c,
 		0x1fc40, 0x1fc4c,
 		0x1fe84, 0x1fe90,
 		0x1fec0, 0x1fec0,
 		0x1fee0, 0x1fee0,
 		0x1ff00, 0x1ff84,
 		0x1ffc0, 0x1ffc8,
 		0x30000, 0x30030,
 		0x30100, 0x30144,
 		0x30190, 0x301d0,
 		0x30200, 0x30318,
 		0x30400, 0x3052c,
 		0x30540, 0x3061c,
 		0x30800, 0x30834,
 		0x308c0, 0x30908,
 		0x30910, 0x309ac,
 		0x30a00, 0x30a2c,
 		0x30a44, 0x30a50,
 		0x30a74, 0x30c24,
 		0x30d00, 0x30d00,
 		0x30d08, 0x30d14,
 		0x30d1c, 0x30d20,
 		0x30d3c, 0x30d50,
 		0x31200, 0x3120c,
 		0x31220, 0x31220,
 		0x31240, 0x31240,
 		0x31600, 0x3160c,
 		0x31a00, 0x31a1c,
 		0x31e00, 0x31e20,
 		0x31e38, 0x31e3c,
 		0x31e80, 0x31e80,
 		0x31e88, 0x31ea8,
 		0x31eb0, 0x31eb4,
 		0x31ec8, 0x31ed4,
 		0x31fb8, 0x32004,
 		0x32200, 0x32200,
 		0x32208, 0x32240,
 		0x32248, 0x32280,
 		0x32288, 0x322c0,
 		0x322c8, 0x322fc,
 		0x32600, 0x32630,
 		0x32a00, 0x32abc,
 		0x32b00, 0x32b70,
 		0x33000, 0x33048,
 		0x33060, 0x3309c,
 		0x330f0, 0x33148,
 		0x33160, 0x3319c,
 		0x331f0, 0x332e4,
 		0x332f8, 0x333e4,
 		0x333f8, 0x33448,
 		0x33460, 0x3349c,
 		0x334f0, 0x33548,
 		0x33560, 0x3359c,
 		0x335f0, 0x336e4,
 		0x336f8, 0x337e4,
 		0x337f8, 0x337fc,
 		0x33814, 0x33814,
 		0x3382c, 0x3382c,
 		0x33880, 0x3388c,
 		0x338e8, 0x338ec,
 		0x33900, 0x33948,
 		0x33960, 0x3399c,
 		0x339f0, 0x33ae4,
 		0x33af8, 0x33b10,
 		0x33b28, 0x33b28,
 		0x33b3c, 0x33b50,
 		0x33bf0, 0x33c10,
 		0x33c28, 0x33c28,
 		0x33c3c, 0x33c50,
 		0x33cf0, 0x33cfc,
 		0x34000, 0x34030,
 		0x34100, 0x34144,
 		0x34190, 0x341d0,
 		0x34200, 0x34318,
 		0x34400, 0x3452c,
 		0x34540, 0x3461c,
 		0x34800, 0x34834,
 		0x348c0, 0x34908,
 		0x34910, 0x349ac,
 		0x34a00, 0x34a2c,
 		0x34a44, 0x34a50,
 		0x34a74, 0x34c24,
 		0x34d00, 0x34d00,
 		0x34d08, 0x34d14,
 		0x34d1c, 0x34d20,
 		0x34d3c, 0x34d50,
 		0x35200, 0x3520c,
 		0x35220, 0x35220,
 		0x35240, 0x35240,
 		0x35600, 0x3560c,
 		0x35a00, 0x35a1c,
 		0x35e00, 0x35e20,
 		0x35e38, 0x35e3c,
 		0x35e80, 0x35e80,
 		0x35e88, 0x35ea8,
 		0x35eb0, 0x35eb4,
 		0x35ec8, 0x35ed4,
 		0x35fb8, 0x36004,
 		0x36200, 0x36200,
 		0x36208, 0x36240,
 		0x36248, 0x36280,
 		0x36288, 0x362c0,
 		0x362c8, 0x362fc,
 		0x36600, 0x36630,
 		0x36a00, 0x36abc,
 		0x36b00, 0x36b70,
 		0x37000, 0x37048,
 		0x37060, 0x3709c,
 		0x370f0, 0x37148,
 		0x37160, 0x3719c,
 		0x371f0, 0x372e4,
 		0x372f8, 0x373e4,
 		0x373f8, 0x37448,
 		0x37460, 0x3749c,
 		0x374f0, 0x37548,
 		0x37560, 0x3759c,
 		0x375f0, 0x376e4,
 		0x376f8, 0x377e4,
 		0x377f8, 0x377fc,
 		0x37814, 0x37814,
 		0x3782c, 0x3782c,
 		0x37880, 0x3788c,
 		0x378e8, 0x378ec,
 		0x37900, 0x37948,
 		0x37960, 0x3799c,
 		0x379f0, 0x37ae4,
 		0x37af8, 0x37b10,
 		0x37b28, 0x37b28,
 		0x37b3c, 0x37b50,
 		0x37bf0, 0x37c10,
 		0x37c28, 0x37c28,
 		0x37c3c, 0x37c50,
 		0x37cf0, 0x37cfc,
 		0x38000, 0x38030,
 		0x38100, 0x38144,
 		0x38190, 0x381d0,
 		0x38200, 0x38318,
 		0x38400, 0x3852c,
 		0x38540, 0x3861c,
 		0x38800, 0x38834,
 		0x388c0, 0x38908,
 		0x38910, 0x389ac,
 		0x38a00, 0x38a2c,
 		0x38a44, 0x38a50,
 		0x38a74, 0x38c24,
 		0x38d00, 0x38d00,
 		0x38d08, 0x38d14,
 		0x38d1c, 0x38d20,
 		0x38d3c, 0x38d50,
 		0x39200, 0x3920c,
 		0x39220, 0x39220,
 		0x39240, 0x39240,
 		0x39600, 0x3960c,
 		0x39a00, 0x39a1c,
 		0x39e00, 0x39e20,
 		0x39e38, 0x39e3c,
 		0x39e80, 0x39e80,
 		0x39e88, 0x39ea8,
 		0x39eb0, 0x39eb4,
 		0x39ec8, 0x39ed4,
 		0x39fb8, 0x3a004,
 		0x3a200, 0x3a200,
 		0x3a208, 0x3a240,
 		0x3a248, 0x3a280,
 		0x3a288, 0x3a2c0,
 		0x3a2c8, 0x3a2fc,
 		0x3a600, 0x3a630,
 		0x3aa00, 0x3aabc,
 		0x3ab00, 0x3ab70,
 		0x3b000, 0x3b048,
 		0x3b060, 0x3b09c,
 		0x3b0f0, 0x3b148,
 		0x3b160, 0x3b19c,
 		0x3b1f0, 0x3b2e4,
 		0x3b2f8, 0x3b3e4,
 		0x3b3f8, 0x3b448,
 		0x3b460, 0x3b49c,
 		0x3b4f0, 0x3b548,
 		0x3b560, 0x3b59c,
 		0x3b5f0, 0x3b6e4,
 		0x3b6f8, 0x3b7e4,
 		0x3b7f8, 0x3b7fc,
 		0x3b814, 0x3b814,
 		0x3b82c, 0x3b82c,
 		0x3b880, 0x3b88c,
 		0x3b8e8, 0x3b8ec,
 		0x3b900, 0x3b948,
 		0x3b960, 0x3b99c,
 		0x3b9f0, 0x3bae4,
 		0x3baf8, 0x3bb10,
 		0x3bb28, 0x3bb28,
 		0x3bb3c, 0x3bb50,
 		0x3bbf0, 0x3bc10,
 		0x3bc28, 0x3bc28,
 		0x3bc3c, 0x3bc50,
 		0x3bcf0, 0x3bcfc,
 		0x3c000, 0x3c030,
 		0x3c100, 0x3c144,
 		0x3c190, 0x3c1d0,
 		0x3c200, 0x3c318,
 		0x3c400, 0x3c52c,
 		0x3c540, 0x3c61c,
 		0x3c800, 0x3c834,
 		0x3c8c0, 0x3c908,
 		0x3c910, 0x3c9ac,
 		0x3ca00, 0x3ca2c,
 		0x3ca44, 0x3ca50,
 		0x3ca74, 0x3cc24,
 		0x3cd00, 0x3cd00,
 		0x3cd08, 0x3cd14,
 		0x3cd1c, 0x3cd20,
 		0x3cd3c, 0x3cd50,
 		0x3d200, 0x3d20c,
 		0x3d220, 0x3d220,
 		0x3d240, 0x3d240,
 		0x3d600, 0x3d60c,
 		0x3da00, 0x3da1c,
 		0x3de00, 0x3de20,
 		0x3de38, 0x3de3c,
 		0x3de80, 0x3de80,
 		0x3de88, 0x3dea8,
 		0x3deb0, 0x3deb4,
 		0x3dec8, 0x3ded4,
 		0x3dfb8, 0x3e004,
 		0x3e200, 0x3e200,
 		0x3e208, 0x3e240,
 		0x3e248, 0x3e280,
 		0x3e288, 0x3e2c0,
 		0x3e2c8, 0x3e2fc,
 		0x3e600, 0x3e630,
 		0x3ea00, 0x3eabc,
 		0x3eb00, 0x3eb70,
 		0x3f000, 0x3f048,
 		0x3f060, 0x3f09c,
 		0x3f0f0, 0x3f148,
 		0x3f160, 0x3f19c,
 		0x3f1f0, 0x3f2e4,
 		0x3f2f8, 0x3f3e4,
 		0x3f3f8, 0x3f448,
 		0x3f460, 0x3f49c,
 		0x3f4f0, 0x3f548,
 		0x3f560, 0x3f59c,
 		0x3f5f0, 0x3f6e4,
 		0x3f6f8, 0x3f7e4,
 		0x3f7f8, 0x3f7fc,
 		0x3f814, 0x3f814,
 		0x3f82c, 0x3f82c,
 		0x3f880, 0x3f88c,
 		0x3f8e8, 0x3f8ec,
 		0x3f900, 0x3f948,
 		0x3f960, 0x3f99c,
 		0x3f9f0, 0x3fae4,
 		0x3faf8, 0x3fb10,
 		0x3fb28, 0x3fb28,
 		0x3fb3c, 0x3fb50,
 		0x3fbf0, 0x3fc10,
 		0x3fc28, 0x3fc28,
 		0x3fc3c, 0x3fc50,
 		0x3fcf0, 0x3fcfc,
 		0x40000, 0x4000c,
 		0x40040, 0x40068,
 		0x4007c, 0x40144,
 		0x40180, 0x4018c,
 		0x40200, 0x40298,
 		0x402ac, 0x4033c,
 		0x403f8, 0x403fc,
 		0x41304, 0x413c4,
 		0x41400, 0x4141c,
 		0x41480, 0x414d0,
 		0x44000, 0x44078,
 		0x440c0, 0x44278,
 		0x442c0, 0x44478,
 		0x444c0, 0x44678,
 		0x446c0, 0x44878,
 		0x448c0, 0x449fc,
 		0x45000, 0x45068,
 		0x45080, 0x45084,
 		0x450a0, 0x450b0,
 		0x45200, 0x45268,
 		0x45280, 0x45284,
 		0x452a0, 0x452b0,
 		0x460c0, 0x460e4,
 		0x47000, 0x4708c,
 		0x47200, 0x47250,
 		0x47400, 0x47420,
 		0x47600, 0x47618,
 		0x47800, 0x47814,
 		0x48000, 0x4800c,
 		0x48040, 0x48068,
 		0x4807c, 0x48144,
 		0x48180, 0x4818c,
 		0x48200, 0x48298,
 		0x482ac, 0x4833c,
 		0x483f8, 0x483fc,
 		0x49304, 0x493c4,
 		0x49400, 0x4941c,
 		0x49480, 0x494d0,
 		0x4c000, 0x4c078,
 		0x4c0c0, 0x4c278,
 		0x4c2c0, 0x4c478,
 		0x4c4c0, 0x4c678,
 		0x4c6c0, 0x4c878,
 		0x4c8c0, 0x4c9fc,
 		0x4d000, 0x4d068,
 		0x4d080, 0x4d084,
 		0x4d0a0, 0x4d0b0,
 		0x4d200, 0x4d268,
 		0x4d280, 0x4d284,
 		0x4d2a0, 0x4d2b0,
 		0x4e0c0, 0x4e0e4,
 		0x4f000, 0x4f08c,
 		0x4f200, 0x4f250,
 		0x4f400, 0x4f420,
 		0x4f600, 0x4f618,
 		0x4f800, 0x4f814,
 		0x50000, 0x500cc,
 		0x50400, 0x50400,
 		0x50800, 0x508cc,
 		0x50c00, 0x50c00,
 		0x51000, 0x5101c,
 		0x51300, 0x51308,
 	};
 
 	/* Pick the table for this chip; n counts entries, not pairs. */
 	if (is_t4(sc)) {
 		reg_ranges = &t4_reg_ranges[0];
 		n = nitems(t4_reg_ranges);
 	} else {
 		reg_ranges = &t5_reg_ranges[0];
 		n = nitems(t5_reg_ranges);
 	}
 
 	/* Chip id in the low bits, chip revision shifted left by 10. */
 	regs->version = chip_id(sc) | chip_rev(sc) << 10;
 	/* Entries are consumed two at a time: (first, last) of a range. */
 	for (i = 0; i < n; i += 2)
 		reg_block_dump(sc, buf, reg_ranges[i], reg_ranges[i + 1]);
 }
 
 /*
  * Periodic callout for a port.  With the port lock held it refreshes the
  * port's hardware statistics, derives the ifnet counters from them and
  * reschedules itself to run again in hz ticks.  If the interface is no
  * longer IFF_DRV_RUNNING it returns without rescheduling.
  */
 static void
 cxgbe_tick(void *arg)
 {
 	struct port_info *pi = arg;
 	struct adapter *sc = pi->adapter;
 	struct ifnet *ifp = pi->ifp;
 	struct port_stats *stats = &pi->stats;
 	struct sge_txq *txq;
 	int chan, q, oqdrops;
 
 	PORT_LOCK(pi);
 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
 		PORT_UNLOCK(pi);
 		return;	/* without scheduling another callout */
 	}
 
 	t4_get_port_stats(sc, pi->tx_chan, stats);
 
 	/* Pause frames are excluded from the packet and byte counts. */
 	ifp->if_opackets = stats->tx_frames - stats->tx_pause;
 	ifp->if_ipackets = stats->rx_frames - stats->rx_pause;
 	ifp->if_obytes = stats->tx_octets - stats->tx_pause * 64;
 	ifp->if_ibytes = stats->rx_octets - stats->rx_pause * 64;
 	ifp->if_omcasts = stats->tx_mcast_frames - stats->tx_pause;
 	ifp->if_imcasts = stats->rx_mcast_frames - stats->rx_pause;
 	ifp->if_iqdrops = stats->rx_ovflow0 + stats->rx_ovflow1 +
 	    stats->rx_ovflow2 + stats->rx_ovflow3 + stats->rx_trunc0 +
 	    stats->rx_trunc1 + stats->rx_trunc2 + stats->rx_trunc3;
 
 	/* Add the drop counter of every rx channel mapped to this port. */
 	for (chan = 0; chan < 4; chan++) {
 		uint32_t cng_drops;
 
 		if ((pi->rx_chan_map & (1 << chan)) == 0)
 			continue;
 
 		/*
 		 * XXX: indirect reads from the same ADDR/DATA pair can
 		 * race with each other.
 		 */
 		t4_read_indirect(sc, A_TP_MIB_INDEX, A_TP_MIB_DATA, &cng_drops,
 		    1, A_TP_MIB_TNL_CNG_DROP_0 + chan);
 		ifp->if_iqdrops += cng_drops;
 	}
 
 	/* Output drops: hardware tx drops plus each tx queue's br drops. */
 	oqdrops = stats->tx_drop;
 	for_each_txq(pi, q, txq) {
 		oqdrops += txq->br->br_drops;
 	}
 	ifp->if_oqdrops = oqdrops;
 
 	ifp->if_oerrors = stats->tx_error_frames;
 	ifp->if_ierrors = stats->rx_jabber + stats->rx_runt +
 	    stats->rx_too_long + stats->rx_fcs_err + stats->rx_len_err;
 
 	callout_schedule(&pi->tick, hz);
 	PORT_UNLOCK(pi);
 }
 
 /*
  * VLAN configuration callback.  When the event is for this interface (arg)
  * and the interface is of type IFT_ETHER, the parent ifnet is stored as the
  * cookie of the vlan device found at the given vid.
  */
 static void
 cxgbe_vlan_config(void *arg, struct ifnet *ifp, uint16_t vid)
 {
 	struct ifnet *vlan_ifp;
 
 	/* Not our interface. */
 	if (arg != ifp)
 		return;
 	if (ifp->if_type != IFT_ETHER)
 		return;
 
 	vlan_ifp = VLAN_DEVAT(ifp, vid);
 	VLAN_SETCOOKIE(vlan_ifp, ifp);
 }
 
 /*
  * Default CPL handler, installed by t4_register_cpl_handler() when no
  * handler is supplied.  With INVARIANTS it panics; otherwise it logs the
  * opcode, frees the payload mbuf and returns EDOOFUS.
  */
 static int
 cpl_not_handled(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 
 #ifdef INVARIANTS
 	panic("%s: opcode 0x%02x on iq %p with payload %p",
 	    __func__, rss->opcode, iq, m);
 #else
 	log(LOG_ERR, "%s: opcode 0x%02x on iq %p with payload %p\n",
 	    __func__, rss->opcode, iq, m);
 	/* Nobody else will consume the payload. */
 	m_freem(m);
 #endif
 	return (EDOOFUS);
 }
 
 /*
  * Install h as the handler for the given CPL opcode.  A NULL h installs
  * cpl_not_handled instead.  The table slot is updated with a release store.
  *
  * Returns 0 on success, or EINVAL if opcode is outside sc->cpl_handler.
  */
 int
 t4_register_cpl_handler(struct adapter *sc, int opcode, cpl_handler_t h)
 {
 	uintptr_t handler;
 
 	if (opcode >= nitems(sc->cpl_handler))
 		return (EINVAL);
 
 	if (h)
 		handler = (uintptr_t)h;
 	else
 		handler = (uintptr_t)cpl_not_handled;
 	atomic_store_rel_ptr((uintptr_t *) &sc->cpl_handler[opcode], handler);
 
 	return (0);
 }
 
 /*
  * Default async notification handler, installed by t4_register_an_handler()
  * when no handler is supplied.  With INVARIANTS it panics; otherwise it logs
  * the event and returns EDOOFUS.
  */
 static int
 an_not_handled(struct sge_iq *iq, const struct rsp_ctrl *ctrl)
 {
 
 #ifdef INVARIANTS
 	panic("%s: async notification on iq %p (ctrl %p)", __func__, iq, ctrl);
 #else
 	log(LOG_ERR, "%s: async notification on iq %p (ctrl %p)\n",
 	    __func__, iq, ctrl);
 #endif
 	return (EDOOFUS);
 }
 
 /*
  * Install h as the adapter's async notification handler.  A NULL h installs
  * an_not_handled instead.  The slot is updated with a release store.
  * Always returns 0.
  */
 int
 t4_register_an_handler(struct adapter *sc, an_handler_t h)
 {
 	uintptr_t handler;
 
 	if (h)
 		handler = (uintptr_t)h;
 	else
 		handler = (uintptr_t)an_not_handled;
 	atomic_store_rel_ptr((uintptr_t *) &sc->an_handler, handler);
 
 	return (0);
 }
 
 /*
  * Default firmware message handler, installed by
  * t4_register_fw_msg_handler() when no handler is supplied.  With INVARIANTS
  * it panics; otherwise it logs the message type and returns EDOOFUS.
  */
 static int
 fw_msg_not_handled(struct adapter *sc, const __be64 *rpl)
 {
 	/* rpl points at data[0] of the enclosing cpl_fw6_msg. */
 	const struct cpl_fw6_msg *cpl =
 	    __containerof(rpl, struct cpl_fw6_msg, data[0]);
 
 #ifdef INVARIANTS
 	panic("%s: fw_msg type %d", __func__, cpl->type);
 #else
 	log(LOG_ERR, "%s: fw_msg type %d\n", __func__, cpl->type);
 #endif
 	return (EDOOFUS);
 }
 
 /*
  * Install h as the handler for firmware messages of the given type.  A NULL
  * h installs fw_msg_not_handled instead.  The table slot is updated with a
  * release store.
  *
  * Returns 0 on success, or EINVAL if type is outside sc->fw_msg_handler or
  * is one of the RSSCPL subtypes.
  */
 int
 t4_register_fw_msg_handler(struct adapter *sc, int type, fw_msg_handler_t h)
 {
 	uintptr_t handler;
 
 	if (type >= nitems(sc->fw_msg_handler))
 		return (EINVAL);
 
 	/*
 	 * These are dispatched by the handler for FW{4|6}_CPL_MSG using the CPL
 	 * handler dispatch table.  Reject any attempt to install a handler for
 	 * this subtype.
 	 */
 	switch (type) {
 	case FW_TYPE_RSSCPL:
 	case FW6_TYPE_RSSCPL:
 		return (EINVAL);
 	default:
 		break;
 	}
 
 	if (h)
 		handler = (uintptr_t)h;
 	else
 		handler = (uintptr_t)fw_msg_not_handled;
 	atomic_store_rel_ptr((uintptr_t *) &sc->fw_msg_handler[type], handler);
 
 	return (0);
 }
 
 /*
  * Create the adapter-level sysctl nodes under the device's sysctl tree
  * (dev.t4nex.X).  This also sets the initial values of the tunables it
  * exposes read-write (sc_do_rxcopy, lro_timeout and, for offload-capable
  * adapters built with TCP_OFFLOAD, the sc->tt parameters).
  *
  * The "misc" subtree is created only when SBUF_DRAIN is defined and the
  * "toe" subtree only when TCP_OFFLOAD is defined and is_offload(sc) is true.
  * Always returns 0.
  */
 static int
 t4_sysctls(struct adapter *sc)
 {
 	struct sysctl_ctx_list *ctx;
 	struct sysctl_oid *oid;
 	struct sysctl_oid_list *children, *c0;
 	/*
 	 * Bit-name strings handed to sysctl_bitfield together with the
 	 * capability word they describe.  NOTE(review): these look like
 	 * kernel printf "%b" descriptions -- confirm against sysctl_bitfield.
 	 */
 	static char *caps[] = {
 		"\20\1PPP\2QFC\3DCBX",			/* caps[0] linkcaps */
 		"\20\1NIC\2VM\3IDS\4UM\5UM_ISGL"	/* caps[1] niccaps */
 		    "\6HASHFILTER\7ETHOFLD",
 		"\20\1TOE",				/* caps[2] toecaps */
 		"\20\1RDDP\2RDMAC",			/* caps[3] rdmacaps */
 		"\20\1INITIATOR_PDU\2TARGET_PDU"	/* caps[4] iscsicaps */
 		    "\3INITIATOR_CNXOFLD\4TARGET_CNXOFLD"
 		    "\5INITIATOR_SSNOFLD\6TARGET_SSNOFLD",
 		"\20\1INITIATOR\2TARGET\3CTRL_OFLD"	/* caps[5] fcoecaps */
 		    "\4PO_INITIATOR\5PO_TARGET"
 	};
 	static char *doorbells = {"\20\1UDB\2WCWR\3UDBWC\4KDB"};
 
 	ctx = device_get_sysctl_ctx(sc->dev);
 
 	/*
 	 * dev.t4nex.X.
 	 */
 	oid = device_get_sysctl_tree(sc->dev);
 	/* c0 remembers the top level so subtrees can be attached later. */
 	c0 = children = SYSCTL_CHILDREN(oid);
 
 	sc->sc_do_rxcopy = 1;
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "do_rx_copy", CTLFLAG_RW,
 	    &sc->sc_do_rxcopy, 1, "Do RX copy of small frames");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nports", CTLFLAG_RD, NULL,
 	    sc->params.nports, "# of ports");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "hw_revision", CTLFLAG_RD,
 	    NULL, chip_rev(sc), "chip hardware revision");
 
 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "firmware_version",
 	    CTLFLAG_RD, &sc->fw_version, 0, "firmware version");
 
 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "cf",
 	    CTLFLAG_RD, &sc->cfg_file, 0, "configuration file");
 
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cfcsum", CTLFLAG_RD, NULL,
 	    sc->cfcsum, "config file checksum");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "doorbells",
 	    CTLTYPE_STRING | CTLFLAG_RD, doorbells, sc->doorbells,
 	    sysctl_bitfield, "A", "available doorbells");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "linkcaps",
 	    CTLTYPE_STRING | CTLFLAG_RD, caps[0], sc->linkcaps,
 	    sysctl_bitfield, "A", "available link capabilities");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "niccaps",
 	    CTLTYPE_STRING | CTLFLAG_RD, caps[1], sc->niccaps,
 	    sysctl_bitfield, "A", "available NIC capabilities");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "toecaps",
 	    CTLTYPE_STRING | CTLFLAG_RD, caps[2], sc->toecaps,
 	    sysctl_bitfield, "A", "available TCP offload capabilities");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rdmacaps",
 	    CTLTYPE_STRING | CTLFLAG_RD, caps[3], sc->rdmacaps,
 	    sysctl_bitfield, "A", "available RDMA capabilities");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "iscsicaps",
 	    CTLTYPE_STRING | CTLFLAG_RD, caps[4], sc->iscsicaps,
 	    sysctl_bitfield, "A", "available iSCSI capabilities");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "fcoecaps",
 	    CTLTYPE_STRING | CTLFLAG_RD, caps[5], sc->fcoecaps,
 	    sysctl_bitfield, "A", "available FCoE capabilities");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "core_clock", CTLFLAG_RD, NULL,
 	    sc->params.vpd.cclk, "core clock frequency (in KHz)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_timers",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc->sge.timer_val,
 	    sizeof(sc->sge.timer_val), sysctl_int_array, "A",
 	    "interrupt holdoff timer values (us)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_pkt_counts",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc->sge.counter_val,
 	    sizeof(sc->sge.counter_val), sysctl_int_array, "A",
 	    "interrupt holdoff packet counter values");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nfilters", CTLFLAG_RD,
 	    NULL, sc->tids.nftids, "number of filters");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "temperature", CTLTYPE_INT |
 	    CTLFLAG_RD, sc, 0, sysctl_temperature, "I",
 	    "chip temperature (in Celsius)");
 
 	t4_sge_sysctls(sc, ctx, children);
 
 	sc->lro_timeout = 100;
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "lro_timeout", CTLFLAG_RW,
 	    &sc->lro_timeout, 0, "lro inactive-flush timeout (in us)");
 
 #ifdef SBUF_DRAIN
 	/*
 	 * dev.t4nex.X.misc.  Marked CTLFLAG_SKIP to avoid information overload.
 	 */
 	oid = SYSCTL_ADD_NODE(ctx, c0, OID_AUTO, "misc",
 	    CTLFLAG_RD | CTLFLAG_SKIP, NULL,
 	    "logs and miscellaneous information");
 	children = SYSCTL_CHILDREN(oid);
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cctrl",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_cctrl, "A", "congestion control");
 
 	/*
 	 * The CIM queue nodes share one handler; arg2 selects the queue.
 	 * IBQs use 0..5 and OBQs use their index plus CIM_NUM_IBQ.
 	 */
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_tp0",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_cim_ibq_obq, "A", "CIM IBQ 0 (TP0)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_tp1",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 1,
 	    sysctl_cim_ibq_obq, "A", "CIM IBQ 1 (TP1)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_ulp",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 2,
 	    sysctl_cim_ibq_obq, "A", "CIM IBQ 2 (ULP)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_sge0",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 3,
 	    sysctl_cim_ibq_obq, "A", "CIM IBQ 3 (SGE0)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_sge1",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 4,
 	    sysctl_cim_ibq_obq, "A", "CIM IBQ 4 (SGE1)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_ncsi",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 5,
 	    sysctl_cim_ibq_obq, "A", "CIM IBQ 5 (NCSI)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_la",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_cim_la, "A", "CIM logic analyzer");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ma_la",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_cim_ma_la, "A", "CIM MA logic analyzer");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ulp0",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0 + CIM_NUM_IBQ,
 	    sysctl_cim_ibq_obq, "A", "CIM OBQ 0 (ULP0)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ulp1",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 1 + CIM_NUM_IBQ,
 	    sysctl_cim_ibq_obq, "A", "CIM OBQ 1 (ULP1)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ulp2",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 2 + CIM_NUM_IBQ,
 	    sysctl_cim_ibq_obq, "A", "CIM OBQ 2 (ULP2)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ulp3",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 3 + CIM_NUM_IBQ,
 	    sysctl_cim_ibq_obq, "A", "CIM OBQ 3 (ULP3)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_sge",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 4 + CIM_NUM_IBQ,
 	    sysctl_cim_ibq_obq, "A", "CIM OBQ 4 (SGE)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ncsi",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 5 + CIM_NUM_IBQ,
 	    sysctl_cim_ibq_obq, "A", "CIM OBQ 5 (NCSI)");
 
 	/* Two more OBQ nodes are exposed on T5 only. */
 	if (is_t5(sc)) {
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_sge0_rx",
 		    CTLTYPE_STRING | CTLFLAG_RD, sc, 6 + CIM_NUM_IBQ,
 		    sysctl_cim_ibq_obq, "A", "CIM OBQ 6 (SGE0-RX)");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_sge1_rx",
 		    CTLTYPE_STRING | CTLFLAG_RD, sc, 7 + CIM_NUM_IBQ,
 		    sysctl_cim_ibq_obq, "A", "CIM OBQ 7 (SGE1-RX)");
 	}
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_pif_la",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_cim_pif_la, "A", "CIM PIF logic analyzer");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_qcfg",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_cim_qcfg, "A", "CIM queue configuration");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cpl_stats",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_cpl_stats, "A", "CPL statistics");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "ddp_stats",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_ddp_stats, "A", "DDP statistics");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "devlog",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_devlog, "A", "firmware's device log");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "fcoe_stats",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_fcoe_stats, "A", "FCoE statistics");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "hw_sched",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_hw_sched, "A", "hardware scheduler ");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "l2t",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_l2t, "A", "hardware L2 table");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "lb_stats",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_lb_stats, "A", "loopback statistics");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "meminfo",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_meminfo, "A", "memory regions");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "mps_tcam",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_mps_tcam, "A", "MPS TCAM entries");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "path_mtus",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_path_mtus, "A", "path MTUs");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "pm_stats",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_pm_stats, "A", "PM statistics");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rdma_stats",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_rdma_stats, "A", "RDMA statistics");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tcp_stats",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_tcp_stats, "A", "TCP statistics");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tids",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_tids, "A", "TID information");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tp_err_stats",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_tp_err_stats, "A", "TP error statistics");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tp_la",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_tp_la, "A", "TP logic analyzer");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tx_rate",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_tx_rate, "A", "Tx rate");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "ulprx_la",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_ulprx_la, "A", "ULPRX logic analyzer");
 
 	if (is_t5(sc)) {
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "wcwr_stats",
 		    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 		    sysctl_wcwr_stats, "A", "write combined work requests");
 	}
 #endif
 
 #ifdef TCP_OFFLOAD
 	if (is_offload(sc)) {
 		/*
 		 * dev.t4nex.X.toe.
 		 */
 		oid = SYSCTL_ADD_NODE(ctx, c0, OID_AUTO, "toe", CTLFLAG_RD,
 		    NULL, "TOE parameters");
 		children = SYSCTL_CHILDREN(oid);
 
 		sc->tt.sndbuf = 256 * 1024;
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "sndbuf", CTLFLAG_RW,
 		    &sc->tt.sndbuf, 0, "max hardware send buffer size");
 
 		sc->tt.ddp = 0;
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "ddp", CTLFLAG_RW,
 		    &sc->tt.ddp, 0, "DDP allowed");
 
 		/* The next two defaults are read back from the hardware. */
 		sc->tt.indsz = G_INDICATESIZE(t4_read_reg(sc, A_TP_PARA_REG5));
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "indsz", CTLFLAG_RW,
 		    &sc->tt.indsz, 0, "DDP max indicate size allowed");
 
 		sc->tt.ddp_thres =
 		    G_RXCOALESCESIZE(t4_read_reg(sc, A_TP_PARA_REG2));
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "ddp_thres", CTLFLAG_RW,
 		    &sc->tt.ddp_thres, 0, "DDP threshold");
 
 		sc->tt.rx_coalesce = 1;
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_coalesce",
 		    CTLFLAG_RW, &sc->tt.rx_coalesce, 0, "receive coalescing");
 	}
 #endif
 
 	return (0);
 }
 
 /*
  * Register the per-port sysctl nodes for a cxgbe port.  Settings and queue
  * layout values are added directly under the port device's sysctl tree; a
  * "stats" child node is then created and filled with MPS port statistics
  * registers plus a few counters read from pi->stats.  Always returns 0.
  */
 static int
 cxgbe_sysctls(struct port_info *pi)
 {
 	struct sysctl_ctx_list *ctx;
 	struct sysctl_oid *oid;
 	struct sysctl_oid_list *children;
 	struct adapter *sc = pi->adapter;
 
 	ctx = device_get_sysctl_ctx(pi->dev);
 
 	/*
 	 * dev.cxgbe.X.
 	 */
 	oid = device_get_sysctl_tree(pi->dev);
 	children = SYSCTL_CHILDREN(oid);
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "linkdnrc", CTLTYPE_STRING |
 	   CTLFLAG_RD, pi, 0, sysctl_linkdnrc, "A", "reason why link is down");
 	/*
 	 * Both BT PHY nodes share sysctl_btphy; arg2 (0 or 1) selects which
 	 * value the handler reads.
 	 */
 	if (pi->port_type == FW_PORT_TYPE_BT_XAUI) {
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "temperature",
 		    CTLTYPE_INT | CTLFLAG_RD, pi, 0, sysctl_btphy, "I",
 		    "PHY temperature (in Celsius)");
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "fw_version",
 		    CTLTYPE_INT | CTLFLAG_RD, pi, 1, sysctl_btphy, "I",
 		    "PHY firmware version");
 	}
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nrxq", CTLFLAG_RD,
 	    &pi->nrxq, 0, "# of rx queues");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "ntxq", CTLFLAG_RD,
 	    &pi->ntxq, 0, "# of tx queues");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_rxq", CTLFLAG_RD,
 	    &pi->first_rxq, 0, "index of first rx queue");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_txq", CTLFLAG_RD,
 	    &pi->first_txq, 0, "index of first tx queue");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rsrv_noflowq", CTLTYPE_INT |
 	    CTLFLAG_RW, pi, 0, sysctl_noflowq, "IU",
 	    "Reserve queue 0 for non-flowid packets");
 
 #ifdef TCP_OFFLOAD
 	if (is_offload(sc)) {
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nofldrxq", CTLFLAG_RD,
 		    &pi->nofldrxq, 0,
 		    "# of rx queues for offloaded TCP connections");
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nofldtxq", CTLFLAG_RD,
 		    &pi->nofldtxq, 0,
 		    "# of tx queues for offloaded TCP connections");
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_ofld_rxq",
 		    CTLFLAG_RD, &pi->first_ofld_rxq, 0,
 		    "index of first TOE rx queue");
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_ofld_txq",
 		    CTLFLAG_RD, &pi->first_ofld_txq, 0,
 		    "index of first TOE tx queue");
 	}
 #endif
 #ifdef DEV_NETMAP
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nnmrxq", CTLFLAG_RD,
 	    &pi->nnmrxq, 0, "# of rx queues for netmap");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nnmtxq", CTLFLAG_RD,
 	    &pi->nnmtxq, 0, "# of tx queues for netmap");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_nm_rxq",
 	    CTLFLAG_RD, &pi->first_nm_rxq, 0,
 	    "index of first netmap rx queue");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_nm_txq",
 	    CTLFLAG_RD, &pi->first_nm_txq, 0,
 	    "index of first netmap tx queue");
 #endif
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_tmr_idx",
 	    CTLTYPE_INT | CTLFLAG_RW, pi, 0, sysctl_holdoff_tmr_idx, "I",
 	    "holdoff timer index");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_pktc_idx",
 	    CTLTYPE_INT | CTLFLAG_RW, pi, 0, sysctl_holdoff_pktc_idx, "I",
 	    "holdoff packet counter index");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "qsize_rxq",
 	    CTLTYPE_INT | CTLFLAG_RW, pi, 0, sysctl_qsize_rxq, "I",
 	    "rx queue size");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "qsize_txq",
 	    CTLTYPE_INT | CTLFLAG_RW, pi, 0, sysctl_qsize_txq, "I",
 	    "tx queue size");
 
+	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "pause_settings",
+	    CTLTYPE_STRING | CTLFLAG_RW, pi, PAUSE_TX, sysctl_pause_settings,
+	    "A", "PAUSE settings (bit 0 = rx_pause, bit 1 = tx_pause)");
+
 	/*
 	 * dev.cxgbe.X.stats.
 	 */
 	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "stats", CTLFLAG_RD,
 	    NULL, "port statistics");
 	/* From here on "children" refers to the stats node's children. */
 	children = SYSCTL_CHILDREN(oid);
 
 /*
  * Adds a read-only 64-bit node served by sysctl_handle_t4_reg64, with the
  * adapter as arg1 and the register address as arg2.  The macro's pi
  * parameter is not used in its expansion.
  */
 #define SYSCTL_ADD_T4_REG64(pi, name, desc, reg) \
 	SYSCTL_ADD_OID(ctx, children, OID_AUTO, name, \
 	    CTLTYPE_U64 | CTLFLAG_RD, sc, reg, \
 	    sysctl_handle_t4_reg64, "QU", desc)
 
 	SYSCTL_ADD_T4_REG64(pi, "tx_octets", "# of octets in good frames",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_BYTES_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_frames", "total # of good frames",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_FRAMES_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_bcast_frames", "# of broadcast frames",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_BCAST_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_mcast_frames", "# of multicast frames",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_MCAST_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_ucast_frames", "# of unicast frames",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_UCAST_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_error_frames", "# of error frames",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_ERROR_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_frames_64",
 	    "# of tx frames in this range",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_64B_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_frames_65_127",
 	    "# of tx frames in this range",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_65B_127B_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_frames_128_255",
 	    "# of tx frames in this range",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_128B_255B_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_frames_256_511",
 	    "# of tx frames in this range",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_256B_511B_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_frames_512_1023",
 	    "# of tx frames in this range",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_512B_1023B_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_frames_1024_1518",
 	    "# of tx frames in this range",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_1024B_1518B_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_frames_1519_max",
 	    "# of tx frames in this range",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_1519B_MAX_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_drop", "# of dropped tx frames",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_DROP_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_pause", "# of pause frames transmitted",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PAUSE_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_ppp0", "# of PPP prio 0 frames transmitted",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP0_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_ppp1", "# of PPP prio 1 frames transmitted",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP1_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_ppp2", "# of PPP prio 2 frames transmitted",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP2_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_ppp3", "# of PPP prio 3 frames transmitted",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP3_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_ppp4", "# of PPP prio 4 frames transmitted",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP4_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_ppp5", "# of PPP prio 5 frames transmitted",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP5_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_ppp6", "# of PPP prio 6 frames transmitted",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP6_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_ppp7", "# of PPP prio 7 frames transmitted",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP7_L));
 
 	/* The rx registers below are also addressed via pi->tx_chan. */
 	SYSCTL_ADD_T4_REG64(pi, "rx_octets", "# of octets in good frames",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_BYTES_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_frames", "total # of good frames",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_FRAMES_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_bcast_frames", "# of broadcast frames",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_BCAST_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_mcast_frames", "# of multicast frames",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_MCAST_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_ucast_frames", "# of unicast frames",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_UCAST_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_too_long", "# of frames exceeding MTU",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_MTU_ERROR_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_jabber", "# of jabber frames",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_MTU_CRC_ERROR_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_fcs_err",
 	    "# of frames received with bad FCS",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_CRC_ERROR_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_len_err",
 	    "# of frames received with length error",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_LEN_ERROR_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_symbol_err", "symbol errors",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_SYM_ERROR_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_runt", "# of short frames received",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_LESS_64B_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_frames_64",
 	    "# of rx frames in this range",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_64B_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_frames_65_127",
 	    "# of rx frames in this range",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_65B_127B_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_frames_128_255",
 	    "# of rx frames in this range",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_128B_255B_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_frames_256_511",
 	    "# of rx frames in this range",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_256B_511B_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_frames_512_1023",
 	    "# of rx frames in this range",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_512B_1023B_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_frames_1024_1518",
 	    "# of rx frames in this range",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_1024B_1518B_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_frames_1519_max",
 	    "# of rx frames in this range",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_1519B_MAX_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_pause", "# of pause frames received",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PAUSE_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_ppp0", "# of PPP prio 0 frames received",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP0_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_ppp1", "# of PPP prio 1 frames received",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP1_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_ppp2", "# of PPP prio 2 frames received",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP2_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_ppp3", "# of PPP prio 3 frames received",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP3_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_ppp4", "# of PPP prio 4 frames received",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP4_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_ppp5", "# of PPP prio 5 frames received",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP5_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_ppp6", "# of PPP prio 6 frames received",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP6_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_ppp7", "# of PPP prio 7 frames received",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP7_L));
 
 #undef SYSCTL_ADD_T4_REG64
 
 /* Exposes the pi->stats member of the same name as a read-only u64 node. */
 #define SYSCTL_ADD_T4_PORTSTAT(name, desc) \
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, #name, CTLFLAG_RD, \
 	    &pi->stats.name, desc)
 
 	/* We get these from port_stats and they may be stale by upto 1s */
 	SYSCTL_ADD_T4_PORTSTAT(rx_ovflow0,
 	    "# drops due to buffer-group 0 overflows");
 	SYSCTL_ADD_T4_PORTSTAT(rx_ovflow1,
 	    "# drops due to buffer-group 1 overflows");
 	SYSCTL_ADD_T4_PORTSTAT(rx_ovflow2,
 	    "# drops due to buffer-group 2 overflows");
 	SYSCTL_ADD_T4_PORTSTAT(rx_ovflow3,
 	    "# drops due to buffer-group 3 overflows");
 	SYSCTL_ADD_T4_PORTSTAT(rx_trunc0,
 	    "# of buffer-group 0 truncated packets");
 	SYSCTL_ADD_T4_PORTSTAT(rx_trunc1,
 	    "# of buffer-group 1 truncated packets");
 	SYSCTL_ADD_T4_PORTSTAT(rx_trunc2,
 	    "# of buffer-group 2 truncated packets");
 	SYSCTL_ADD_T4_PORTSTAT(rx_trunc3,
 	    "# of buffer-group 3 truncated packets");
 
 #undef SYSCTL_ADD_T4_PORTSTAT
 
 	return (0);
 }
 
 /*
  * Format an array of ints as a space-separated string and pass it to
  * sysctl_handle_string.  arg1 points at the first int and arg2 is the size
  * of the array in bytes.
  */
 static int
 sysctl_int_array(SYSCTL_HANDLER_ARGS)
 {
 	struct sbuf sb;
 	int *elem = arg1;
 	int rc;
 
 	sbuf_new(&sb, NULL, 32, SBUF_AUTOEXTEND);
 	/* Consume arg2 one int at a time until no bytes remain. */
 	while (arg2) {
 		sbuf_printf(&sb, "%d ", *elem);
 		elem++;
 		arg2 -= sizeof(int);
 	}
 	sbuf_trim(&sb);		/* drop the trailing separator */
 	sbuf_finish(&sb);
 	rc = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
 	sbuf_delete(&sb);
 	return (rc);
 }
 
 static int
 sysctl_bitfield(SYSCTL_HANDLER_ARGS)
 {
 	int rc;
 	struct sbuf *sb;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return(rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	sbuf_printf(sb, "%b", (int)arg2, (char *)arg1);
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_btphy(SYSCTL_HANDLER_ARGS)
 {
 	struct port_info *pi = arg1;
 	int op = arg2;
 	struct adapter *sc = pi->adapter;
 	u_int v;
 	int rc;
 
 	rc = begin_synchronized_op(sc, pi, SLEEP_OK | INTR_OK, "t4btt");
 	if (rc)
 		return (rc);
 	/* XXX: magic numbers */
 	rc = -t4_mdio_rd(sc, sc->mbox, pi->mdio_addr, 0x1e, op ? 0x20 : 0xc820,
 	    &v);
 	end_synchronized_op(sc, 0);
 	if (rc)
 		return (rc);
 	if (op == 0)
 		v /= 256;
 
 	rc = sysctl_handle_int(oidp, &v, 0, req);
 	return (rc);
 }
 
 /*
  * Sysctl handler for pi->rsrv_noflowq.  A write stores 1 only when the new
  * value is at least 1 and the port has more than one tx queue; any other
  * write stores 0.
  */
 static int
 sysctl_noflowq(SYSCTL_HANDLER_ARGS)
 {
 	struct port_info *pi = arg1;
 	int requested = pi->rsrv_noflowq;
 	int rc;
 
 	rc = sysctl_handle_int(oidp, &requested, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	pi->rsrv_noflowq = (requested >= 1 && pi->ntxq > 1) ? 1 : 0;
 
 	return (rc);
 }
 
 /*
  * Sysctl handler for a port's holdoff timer index.  Reads report
  * pi->tmr_idx.  Writes are range-checked against SGE_NTIMERS, then the new
  * interrupt parameters are stored into every rx queue of the port (and
  * every offload rx queue when TCP_OFFLOAD is defined) before pi->tmr_idx is
  * updated.  Returns 0 on success or an errno value.
  */
 static int
 sysctl_holdoff_tmr_idx(SYSCTL_HANDLER_ARGS)
 {
 	struct port_info *pi = arg1;
 	struct adapter *sc = pi->adapter;
 	int idx, rc, i;
 	struct sge_rxq *rxq;
 #ifdef TCP_OFFLOAD
 	struct sge_ofld_rxq *ofld_rxq;
 #endif
 	uint8_t v;
 
 	idx = pi->tmr_idx;
 
 	rc = sysctl_handle_int(oidp, &idx, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	if (idx < 0 || idx >= SGE_NTIMERS)
 		return (EINVAL);
 
 	rc = begin_synchronized_op(sc, pi, HOLD_LOCK | SLEEP_OK | INTR_OK,
 	    "t4tmr");
 	if (rc)
 		return (rc);
 
 	/*
 	 * New timer index combined with the V_QINTR_CNT_EN field, which is
 	 * set whenever pktc_idx is not -1.
 	 */
 	v = V_QINTR_TIMER_IDX(idx) | V_QINTR_CNT_EN(pi->pktc_idx != -1);
 	for_each_rxq(pi, i, rxq) {
 		/* Use the 8-bit release store when it is available. */
 #ifdef atomic_store_rel_8
 		atomic_store_rel_8(&rxq->iq.intr_params, v);
 #else
 		rxq->iq.intr_params = v;
 #endif
 	}
 #ifdef TCP_OFFLOAD
 	for_each_ofld_rxq(pi, i, ofld_rxq) {
 #ifdef atomic_store_rel_8
 		atomic_store_rel_8(&ofld_rxq->iq.intr_params, v);
 #else
 		ofld_rxq->iq.intr_params = v;
 #endif
 	}
 #endif
 	pi->tmr_idx = idx;
 
 	end_synchronized_op(sc, LOCK_HELD);
 	return (0);
 }
 
 static int
 sysctl_holdoff_pktc_idx(SYSCTL_HANDLER_ARGS)
 {
 	struct port_info *pi = arg1;
 	struct adapter *sc = pi->adapter;
 	int idx, rc;
 
 	idx = pi->pktc_idx;
 
 	rc = sysctl_handle_int(oidp, &idx, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	if (idx < -1 || idx >= SGE_NCOUNTERS)
 		return (EINVAL);
 
 	rc = begin_synchronized_op(sc, pi, HOLD_LOCK | SLEEP_OK | INTR_OK,
 	    "t4pktc");
 	if (rc)
 		return (rc);
 
 	if (pi->flags & PORT_INIT_DONE)
 		rc = EBUSY; /* cannot be changed once the queues are created */
 	else
 		pi->pktc_idx = idx;
 
 	end_synchronized_op(sc, LOCK_HELD);
 	return (rc);
 }
 
 static int
 sysctl_qsize_rxq(SYSCTL_HANDLER_ARGS)
 {
 	struct port_info *pi = arg1;
 	struct adapter *sc = pi->adapter;
 	int qsize, rc;
 
 	qsize = pi->qsize_rxq;
 
 	rc = sysctl_handle_int(oidp, &qsize, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	if (qsize < 128 || (qsize & 7))
 		return (EINVAL);
 
 	rc = begin_synchronized_op(sc, pi, HOLD_LOCK | SLEEP_OK | INTR_OK,
 	    "t4rxqs");
 	if (rc)
 		return (rc);
 
 	if (pi->flags & PORT_INIT_DONE)
 		rc = EBUSY; /* cannot be changed once the queues are created */
 	else
 		pi->qsize_rxq = qsize;
 
 	end_synchronized_op(sc, LOCK_HELD);
 	return (rc);
 }
 
 static int
 sysctl_qsize_txq(SYSCTL_HANDLER_ARGS)
 {
 	struct port_info *pi = arg1;
 	struct adapter *sc = pi->adapter;
 	int qsize, rc;
 
 	qsize = pi->qsize_txq;
 
 	rc = sysctl_handle_int(oidp, &qsize, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	/* bufring size must be powerof2 */
 	if (qsize < 128 || !powerof2(qsize))
 		return (EINVAL);
 
 	rc = begin_synchronized_op(sc, pi, HOLD_LOCK | SLEEP_OK | INTR_OK,
 	    "t4txqs");
 	if (rc)
 		return (rc);
 
 	if (pi->flags & PORT_INIT_DONE)
 		rc = EBUSY; /* cannot be changed once the queues are created */
 	else
 		pi->qsize_txq = qsize;
 
 	end_synchronized_op(sc, LOCK_HELD);
+	return (rc);
+}
+
+/*
+ * Sysctl handler for a port's PAUSE (flow control) settings.  A read
+ * reports the PAUSE_RX/PAUSE_TX bits of lc->fc as a "%b" string.  A write
+ * takes a single decimal digit whose bits may only be PAUSE_RX and/or
+ * PAUSE_TX; if it differs from the requested setting, requested_fc is
+ * updated and the link is restarted with t4_link_start.
+ */
+static int
+sysctl_pause_settings(SYSCTL_HANDLER_ARGS)
+{
+	static char *fc_bits = "\20\1PAUSE_RX\2PAUSE_TX";
+	const int fc_mask = PAUSE_TX | PAUSE_RX;
+	struct port_info *pi = arg1;
+	struct adapter *sc = pi->adapter;
+	struct link_config *lc = &pi->link_cfg;
+	struct sbuf *sb;
+	char s[2];
+	int n, rc, saved_link_ok;
+
+	/* Read request: format the current settings and finish. */
+	if (req->newptr == NULL) {
+		rc = sysctl_wire_old_buffer(req, 0);
+		if (rc != 0)
+			return (rc);
+
+		sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
+		if (sb == NULL)
+			return (ENOMEM);
+
+		sbuf_printf(sb, "%b", lc->fc & fc_mask, fc_bits);
+		rc = sbuf_finish(sb);
+		sbuf_delete(sb);
+		return (rc);
+	}
+
+	/* Write request: seed the buffer with the requested settings. */
+	s[0] = '0' + (lc->requested_fc & fc_mask);
+	s[1] = 0;
+
+	rc = sysctl_handle_string(oidp, s, sizeof(s), req);
+	if (rc != 0)
+		return (rc);
+
+	/* Exactly one character, and it must be a decimal digit. */
+	if (s[1] != 0 || s[0] < '0' || s[0] > '9')
+		return (EINVAL);
+	n = s[0] - '0';
+	if ((n & ~fc_mask) != 0)
+		return (EINVAL);	/* some other bit is set too */
+
+	rc = begin_synchronized_op(sc, pi, SLEEP_OK | INTR_OK, "t4PAUSE");
+	if (rc)
+		return (rc);
+	if ((lc->requested_fc & fc_mask) != n) {
+		saved_link_ok = lc->link_ok;
+
+		lc->requested_fc = (lc->requested_fc & ~fc_mask) | n;
+		rc = -t4_link_start(sc, sc->mbox, pi->tx_chan, lc);
+		lc->link_ok = saved_link_ok;	/* restore */
+	}
+	end_synchronized_op(sc, 0);
+
 	return (rc);
 }
 
 static int
 sysctl_handle_t4_reg64(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	int reg = arg2;
 	uint64_t val;
 
 	val = t4_read_reg64(sc, reg);
 
 	return (sysctl_handle_64(oidp, &val, 0, req));
 }
 
 /*
  * Query the FW_PARAM_DEV_DIAG_TMP device parameter and report it as an
  * int.  A returned value of 0 is reported as -1.
  */
 static int
 sysctl_temperature(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	uint32_t param, val;
 	int rc, temp;
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4temp");
 	if (rc)
 		return (rc);
 	param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) |
 	    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_DIAG) |
 	    V_FW_PARAMS_PARAM_Y(FW_PARAM_DEV_DIAG_TMP);
 	rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val);
 	end_synchronized_op(sc, 0);
 	if (rc)
 		return (rc);
 
 	/* unknown is returned as 0 but we display -1 in that case */
 	if (val == 0)
 		temp = -1;
 	else
 		temp = val;
 
 	return (sysctl_handle_int(oidp, &temp, 0, req));
 }
 
 #ifdef SBUF_DRAIN
 /*
  * Dump the congestion control table filled in by t4_read_cong_tbl.  Each of
  * the NCCTRL_WIN windows is printed as two lines: the increments
  * incr[0..7][i], then incr[8..15][i] followed by the window's a_wnd value
  * and the dec_fac string selected by its b_wnd value.
  */
 static int
 sysctl_cctrl(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc, i;
 	uint16_t incr[NMTUS][NCCTRL_WIN];
 	/* NOTE(review): 8 entries; assumes b_wnd[i] is always < 8 - confirm. */
 	static const char *dec_fac[] = {
 		"0.5", "0.5625", "0.625", "0.6875", "0.75", "0.8125", "0.875",
 		"0.9375"
 	};
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	t4_read_cong_tbl(sc, incr);
 
 	/*
 	 * NOTE(review): rows 0..15 of incr are indexed explicitly, so this
 	 * presumably relies on NMTUS being at least 16 - confirm.
 	 */
 	for (i = 0; i < NCCTRL_WIN; ++i) {
 		sbuf_printf(sb, "%2d: %4u %4u %4u %4u %4u %4u %4u %4u\n", i,
 		    incr[0][i], incr[1][i], incr[2][i], incr[3][i], incr[4][i],
 		    incr[5][i], incr[6][i], incr[7][i]);
 		sbuf_printf(sb, "%8u %4u %4u %4u %4u %4u %4u %4u %5u %s\n",
 		    incr[8][i], incr[9][i], incr[10][i], incr[11][i],
 		    incr[12][i], incr[13][i], incr[14][i], incr[15][i],
 		    sc->params.a_wnd[i], dec_fac[sc->params.b_wnd[i]]);
 	}
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 /*
  * Display names for the CIM queues, indexed by queue id: the CIM_NUM_IBQ
  * inbound queues come first, followed by the outbound queues.
  */
 static const char *qname[CIM_NUM_IBQ + CIM_NUM_OBQ_T5] = {
 	"TP0", "TP1", "ULP", "SGE0", "SGE1", "NC-SI",	/* ibq's */
 	"ULP0", "ULP1", "ULP2", "ULP3", "SGE", "NC-SI",	/* obq's */
 	"SGE0-RX", "SGE1-RX"	/* additional obq's (T5 onwards) */
 };
 
 /*
  * Dump the contents of one CIM queue.  arg2 is the queue id: ids below
  * CIM_NUM_IBQ select an inbound queue, the remaining ids select an outbound
  * queue (after subtracting CIM_NUM_IBQ).  The output is a header naming the
  * queue followed by rows of four 32-bit words, each prefixed with its byte
  * offset.  Returns 0 or an errno value.
  */
 static int
 sysctl_cim_ibq_obq(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc, i, n, qid = arg2;
 	uint32_t *buf, *p;
 	char *qtype;
 	u_int cim_num_obq = is_t4(sc) ? CIM_NUM_OBQ : CIM_NUM_OBQ_T5;
 
 	KASSERT(qid >= 0 && qid < CIM_NUM_IBQ + cim_num_obq,
 	    ("%s: bad qid %d\n", __func__, qid));
 
 	if (qid < CIM_NUM_IBQ) {
 		/* inbound queue */
 		qtype = "IBQ";
 		n = 4 * CIM_IBQ_SIZE;
 		buf = malloc(n * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK);
 		rc = t4_read_cim_ibq(sc, qid, buf, n);
 	} else {
 		/* outbound queue */
 		qtype = "OBQ";
 		qid -= CIM_NUM_IBQ;
 		n = 4 * cim_num_obq * CIM_OBQ_SIZE;
 		buf = malloc(n * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK);
 		rc = t4_read_cim_obq(sc, qid, buf, n);
 	}
 
 	/* A negative result is an error; flip it into a positive errno. */
 	if (rc < 0) {
 		rc = -rc;
 		goto done;
 	}
 	n = rc * sizeof(uint32_t);	/* rc has # of words actually read */
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		goto done;
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, PAGE_SIZE, req);
 	if (sb == NULL) {
 		rc = ENOMEM;
 		goto done;
 	}
 
 	/* qid is relative to its queue type here; arg2 is still the raw id. */
 	sbuf_printf(sb, "%s%d %s", qtype , qid, qname[arg2]);
 	/* n is now a byte count: 16 bytes (4 words) per output row. */
 	for (i = 0, p = buf; i < n; i += 16, p += 4)
 		sbuf_printf(sb, "\n%#06x: %08x %08x %08x %08x", i, p[0], p[1],
 		    p[2], p[3]);
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 done:
 	free(buf, M_CXGBE);
 	return (rc);
 }
 
 /*
  * Dump the CIM logic analyzer.  The LA configuration register is read first
  * to learn whether only PCs were captured (F_UPDBGLACAPTPCONLY), which
  * selects the column header and the per-entry layout.  The LA contents are
  * read into a temporary buffer of sc->params.cim_la_size words and printed
  * eight words at a time.  Returns 0 or an errno value.
  */
 static int
 sysctl_cim_la(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	u_int cfg;
 	struct sbuf *sb;
 	uint32_t *buf, *p;
 	int rc;
 
 	rc = -t4_cim_read(sc, A_UP_UP_DBG_LA_CFG, 1, &cfg);
 	if (rc != 0)
 		return (rc);
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	buf = malloc(sc->params.cim_la_size * sizeof(uint32_t), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 
 	rc = -t4_cim_read_la(sc, buf, NULL);
 	if (rc != 0) {
 		/* The sbuf already exists; release it before bailing out. */
 		sbuf_delete(sb);
 		goto done;
 	}
 
 	sbuf_printf(sb, "Status   Data      PC%s",
 	    cfg & F_UPDBGLACAPTPCONLY ? "" :
 	    "     LS0Stat  LS0Addr             LS0Data");
 
 	KASSERT((sc->params.cim_la_size & 7) == 0,
 	    ("%s: p will walk off the end of buf", __func__));
 
 	/* Each LA entry is 8 words. */
 	for (p = buf; p < &buf[sc->params.cim_la_size]; p += 8) {
 		if (cfg & F_UPDBGLACAPTPCONLY) {
 			/* PC-only capture: three output rows per entry. */
 			sbuf_printf(sb, "\n  %02x   %08x %08x", p[5] & 0xff,
 			    p[6], p[7]);
 			sbuf_printf(sb, "\n  %02x   %02x%06x %02x%06x",
 			    (p[3] >> 8) & 0xff, p[3] & 0xff, p[4] >> 8,
 			    p[4] & 0xff, p[5] >> 8);
 			sbuf_printf(sb, "\n  %02x   %x%07x %x%07x",
 			    (p[0] >> 4) & 0xff, p[0] & 0xf, p[1] >> 4,
 			    p[1] & 0xf, p[2] >> 4);
 		} else {
 			sbuf_printf(sb,
 			    "\n  %02x   %x%07x %x%07x %08x %08x "
 			    "%08x%08x%08x%08x",
 			    (p[0] >> 4) & 0xff, p[0] & 0xf, p[1] >> 4,
 			    p[1] & 0xf, p[2] >> 4, p[2] & 0xf, p[3], p[4], p[5],
 			    p[6], p[7]);
 		}
 	}
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 done:
 	free(buf, M_CXGBE);
 	return (rc);
 }
 
 /*
  * Dump the CIM MA logic analyzer.  The temporary buffer holds two regions
  * of CIM_MALA_SIZE entries, 5 words per entry; t4_cim_read_ma_la is handed
  * the start of each region.  The first region is printed as raw hex and the
  * second is decoded into the Cnt/ID/Tag/UE/Data/RDY/VLD columns.
  */
 static int
 sysctl_cim_ma_la(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	u_int i;
 	struct sbuf *sb;
 	uint32_t *buf, *p;
 	int rc;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	buf = malloc(2 * CIM_MALA_SIZE * 5 * sizeof(uint32_t), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 
 	t4_cim_read_ma_la(sc, buf, buf + 5 * CIM_MALA_SIZE);
 	p = buf;
 
 	for (i = 0; i < CIM_MALA_SIZE; i++, p += 5) {
 		sbuf_printf(sb, "\n%02x%08x%08x%08x%08x", p[4], p[3], p[2],
 		    p[1], p[0]);
 	}
 
 	/* p is not reset: it now points at the start of the second region. */
 	sbuf_printf(sb, "\n\nCnt ID Tag UE       Data       RDY VLD");
 	for (i = 0; i < CIM_MALA_SIZE; i++, p += 5) {
 		sbuf_printf(sb, "\n%3u %2u  %x   %u %08x%08x  %u   %u",
 		    (p[2] >> 10) & 0xff, (p[2] >> 7) & 7,
 		    (p[2] >> 3) & 0xf, (p[2] >> 2) & 1,
 		    (p[1] >> 2) | ((p[2] & 3) << 30),
 		    (p[0] >> 2) | ((p[1] & 3) << 30), (p[0] >> 1) & 1,
 		    p[0] & 1);
 	}
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 	free(buf, M_CXGBE);
 	return (rc);
 }
 
 /*
  * Dump the CIM PIF logic analyzer.  The temporary buffer holds two regions
  * of CIM_PIFLA_SIZE entries, 6 words per entry; t4_cim_read_pif_la is
  * handed the start of each region.  The first region is printed with the
  * Cntl/ID/DataBE/Addr/Data columns and the second with Cntl/ID/Data.
  */
 static int
 sysctl_cim_pif_la(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	u_int i;
 	struct sbuf *sb;
 	uint32_t *buf, *p;
 	int rc;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	buf = malloc(2 * CIM_PIFLA_SIZE * 6 * sizeof(uint32_t), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 
 	t4_cim_read_pif_la(sc, buf, buf + 6 * CIM_PIFLA_SIZE, NULL, NULL);
 	p = buf;
 
 	/*
 	 * Walk exactly the CIM_PIFLA_SIZE entries each region was sized for,
 	 * so the loops stay in step with the allocation above.
 	 */
 	sbuf_printf(sb, "Cntl ID DataBE   Addr                 Data");
 	for (i = 0; i < CIM_PIFLA_SIZE; i++, p += 6) {
 		sbuf_printf(sb, "\n %02x  %02x  %04x  %08x %08x%08x%08x%08x",
 		    (p[5] >> 22) & 0xff, (p[5] >> 16) & 0x3f, p[5] & 0xffff,
 		    p[4], p[3], p[2], p[1], p[0]);
 	}
 
 	/* p is not reset: it now points at the start of the second region. */
 	sbuf_printf(sb, "\n\nCntl ID               Data");
 	for (i = 0; i < CIM_PIFLA_SIZE; i++, p += 6) {
 		sbuf_printf(sb, "\n %02x  %02x %08x%08x%08x%08x",
 		    (p[4] >> 6) & 0xff, p[4] & 0x3f, p[3], p[2], p[1], p[0]);
 	}
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 	free(buf, M_CXGBE);
 	return (rc);
 }
 
 /*
  * Print one row per CIM queue with its base, size, read/write pointers and
  * SOP/EOP/available counts.  The register addresses and the number of
  * outbound queues depend on whether the adapter is a T4.  Inbound queue
  * rows also carry a threshold; outbound queue rows take their write
  * pointer from a separate read (obq_wr).
  */
 static int
 sysctl_cim_qcfg(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc, i;
 	uint16_t base[CIM_NUM_IBQ + CIM_NUM_OBQ_T5];
 	uint16_t size[CIM_NUM_IBQ + CIM_NUM_OBQ_T5];
 	uint16_t thres[CIM_NUM_IBQ];
 	/* 2 words per outbound queue, walked with wr. */
 	uint32_t obq_wr[2 * CIM_NUM_OBQ_T5], *wr = obq_wr;
 	/* 4 words per queue (inbound and outbound), walked with p. */
 	uint32_t stat[4 * (CIM_NUM_IBQ + CIM_NUM_OBQ_T5)], *p = stat;
 	u_int cim_num_obq, ibq_rdaddr, obq_rdaddr, nq;
 
 	if (is_t4(sc)) {
 		cim_num_obq = CIM_NUM_OBQ;
 		ibq_rdaddr = A_UP_IBQ_0_RDADDR;
 		obq_rdaddr = A_UP_OBQ_0_REALADDR;
 	} else {
 		cim_num_obq = CIM_NUM_OBQ_T5;
 		ibq_rdaddr = A_UP_IBQ_0_SHADOW_RDADDR;
 		obq_rdaddr = A_UP_OBQ_0_SHADOW_REALADDR;
 	}
 	nq = CIM_NUM_IBQ + cim_num_obq;
 
 	rc = -t4_cim_read(sc, ibq_rdaddr, 4 * nq, stat);
 	if (rc == 0)
 		rc = -t4_cim_read(sc, obq_rdaddr, 2 * cim_num_obq, obq_wr);
 	if (rc != 0)
 		return (rc);
 
 	t4_read_cimq_cfg(sc, base, size, thres);
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, PAGE_SIZE, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	sbuf_printf(sb, "Queue  Base  Size Thres RdPtr WrPtr  SOP  EOP Avail");
 
 	/* Inbound queues. */
 	for (i = 0; i < CIM_NUM_IBQ; i++, p += 4)
 		sbuf_printf(sb, "\n%7s %5x %5u %5u %6x  %4x %4u %4u %5u",
 		    qname[i], base[i], size[i], thres[i], G_IBQRDADDR(p[0]),
 		    G_IBQWRADDR(p[1]), G_QUESOPCNT(p[3]), G_QUEEOPCNT(p[3]),
 		    G_QUEREMFLITS(p[2]) * 16);
 	/* Outbound queues: i and p continue from where the loop above ended. */
 	for ( ; i < nq; i++, p += 4, wr += 2)
 		sbuf_printf(sb, "\n%7s %5x %5u %12x  %4x %4u %4u %5u", qname[i],
 		    base[i], size[i], G_QUERDADDR(p[0]) & 0x3fff,
 		    wr[0] - base[i], G_QUESOPCNT(p[3]), G_QUEEOPCNT(p[3]),
 		    G_QUEREMFLITS(p[2]) * 16);
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_cpl_stats(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc;
 	struct tp_cpl_stats stats;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 256, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	t4_tp_get_cpl_stats(sc, &stats);
 
 	sbuf_printf(sb, "                 channel 0  channel 1  channel 2  "
 	    "channel 3\n");
 	sbuf_printf(sb, "CPL requests:   %10u %10u %10u %10u\n",
 		   stats.req[0], stats.req[1], stats.req[2], stats.req[3]);
 	sbuf_printf(sb, "CPL responses:  %10u %10u %10u %10u",
 		   stats.rsp[0], stats.rsp[1], stats.rsp[2], stats.rsp[3]);
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_ddp_stats(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc;
 	struct tp_usm_stats stats;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return(rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 256, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	t4_get_usm_stats(sc, &stats);
 
 	sbuf_printf(sb, "Frames: %u\n", stats.frames);
 	sbuf_printf(sb, "Octets: %ju\n", stats.octets);
 	sbuf_printf(sb, "Drops:  %u", stats.drops);
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 /*
  * Printable names for firmware devlog levels, indexed by the
  * FW_DEVLOG_LEVEL_* value of an entry.
  */
 const char *devlog_level_strings[] = {
 	[FW_DEVLOG_LEVEL_EMERG]		= "EMERG",
 	[FW_DEVLOG_LEVEL_CRIT]		= "CRIT",
 	[FW_DEVLOG_LEVEL_ERR]		= "ERR",
 	[FW_DEVLOG_LEVEL_NOTICE]	= "NOTICE",
 	[FW_DEVLOG_LEVEL_INFO]		= "INFO",
 	[FW_DEVLOG_LEVEL_DEBUG]		= "DEBUG"
 };
 
 /*
  * Printable names for firmware devlog facilities, indexed by the
  * FW_DEVLOG_FACILITY_* value of an entry.
  */
 const char *devlog_facility_strings[] = {
 	[FW_DEVLOG_FACILITY_CORE]	= "CORE",
 	[FW_DEVLOG_FACILITY_CF]		= "CF",
 	[FW_DEVLOG_FACILITY_SCHED]	= "SCHED",
 	[FW_DEVLOG_FACILITY_TIMER]	= "TIMER",
 	[FW_DEVLOG_FACILITY_RES]	= "RES",
 	[FW_DEVLOG_FACILITY_HW]		= "HW",
 	[FW_DEVLOG_FACILITY_FLR]	= "FLR",
 	[FW_DEVLOG_FACILITY_DMAQ]	= "DMAQ",
 	[FW_DEVLOG_FACILITY_PHY]	= "PHY",
 	[FW_DEVLOG_FACILITY_MAC]	= "MAC",
 	[FW_DEVLOG_FACILITY_PORT]	= "PORT",
 	[FW_DEVLOG_FACILITY_VI]		= "VI",
 	[FW_DEVLOG_FACILITY_FILTER]	= "FILTER",
 	[FW_DEVLOG_FACILITY_ACL]	= "ACL",
 	[FW_DEVLOG_FACILITY_TM]		= "TM",
 	[FW_DEVLOG_FACILITY_QFC]	= "QFC",
 	[FW_DEVLOG_FACILITY_DCB]	= "DCB",
 	[FW_DEVLOG_FACILITY_ETH]	= "ETH",
 	[FW_DEVLOG_FACILITY_OFLD]	= "OFLD",
 	[FW_DEVLOG_FACILITY_RI]		= "RI",
 	[FW_DEVLOG_FACILITY_ISCSI]	= "ISCSI",
 	[FW_DEVLOG_FACILITY_FCOE]	= "FCOE",
 	[FW_DEVLOG_FACILITY_FOISCSI]	= "FOISCSI",
 	[FW_DEVLOG_FACILITY_FOFCOE]	= "FOFCOE"
 };
 
 /*
  * Dump the firmware device log.  The whole log is copied out of adapter
  * memory, every used entry is converted from big-endian in place, and the
  * entries are then printed starting at the one with the smallest timestamp
  * and wrapping around the buffer.  Returns 0 (also when the log is empty)
  * or an errno value.
  */
 static int
 sysctl_devlog(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct devlog_params *dparams = &sc->params.devlog;
 	struct fw_devlog_e *buf, *e;
 	int i, j, rc, nentries, first = 0, m;
 	struct sbuf *sb;
 	uint64_t ftstamp = UINT64_MAX;
 
 	/* No log location recorded: fall back to these hard-coded values. */
 	if (dparams->start == 0) {
 		dparams->memtype = FW_MEMTYPE_EDC0;
 		dparams->start = 0x84000;
 		dparams->size = 32768;
 	}
 
 	nentries = dparams->size / sizeof(struct fw_devlog_e);
 
 	buf = malloc(dparams->size, M_CXGBE, M_NOWAIT);
 	if (buf == NULL)
 		return (ENOMEM);
 
 	m = fwmtype_to_hwmtype(dparams->memtype);
 	rc = -t4_mem_read(sc, m, dparams->start, dparams->size, (void *)buf);
 	if (rc != 0)
 		goto done;
 
 	/*
 	 * First pass: byte-swap each used entry and remember the index of the
 	 * entry with the smallest timestamp.
 	 */
 	for (i = 0; i < nentries; i++) {
 		e = &buf[i];
 
 		if (e->timestamp == 0)
 			break;	/* end */
 
 		e->timestamp = be64toh(e->timestamp);
 		e->seqno = be32toh(e->seqno);
 		for (j = 0; j < 8; j++)
 			e->params[j] = be32toh(e->params[j]);
 
 		if (e->timestamp < ftstamp) {
 			ftstamp = e->timestamp;
 			first = i;
 		}
 	}
 
 	if (buf[first].timestamp == 0)
 		goto done;	/* nothing in the log */
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		goto done;
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL) {
 		rc = ENOMEM;
 		goto done;
 	}
 	sbuf_printf(sb, "%10s  %15s  %8s  %8s  %s\n",
 	    "Seq#", "Tstamp", "Level", "Facility", "Message");
 
 	/* Second pass: print from "first", wrapping at the end of the buffer. */
 	i = first;
 	do {
 		e = &buf[i];
 		if (e->timestamp == 0)
 			break;	/* end */
 
 		sbuf_printf(sb, "%10d  %15ju  %8s  %8s  ",
 		    e->seqno, e->timestamp,
 		    (e->level < nitems(devlog_level_strings) ?
 			devlog_level_strings[e->level] : "UNKNOWN"),
 		    (e->facility < nitems(devlog_facility_strings) ?
 			devlog_facility_strings[e->facility] : "UNKNOWN"));
 		/*
 		 * NOTE(review): e->fmt comes straight from the buffer filled by
 		 * t4_mem_read and is used as the format string; presumably it
 		 * is trusted, NUL-terminated firmware data - confirm.
 		 */
 		sbuf_printf(sb, e->fmt, e->params[0], e->params[1],
 		    e->params[2], e->params[3], e->params[4],
 		    e->params[5], e->params[6], e->params[7]);
 
 		if (++i == nentries)
 			i = 0;
 	} while (i != first);
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 done:
 	free(buf, M_CXGBE);
 	return (rc);
 }
 
 /*
  * Print the FCoE DDP octet, DDP frame and dropped frame counters returned
  * by t4_get_fcoe_stats, one column per channel (0-3).
  */
 static int
 sysctl_fcoe_stats(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct tp_fcoe_stats stats[4];
 	struct sbuf *sb;
 	int rc, ch;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 256, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	/* Collect the counters for channels 0 through 3, in order. */
 	for (ch = 0; ch < 4; ch++)
 		t4_get_fcoe_stats(sc, ch, &stats[ch]);
 
 	sbuf_printf(sb, "                   channel 0        channel 1        "
 	    "channel 2        channel 3\n");
 	sbuf_printf(sb, "octetsDDP:  %16ju %16ju %16ju %16ju\n",
 	    stats[0].octetsDDP, stats[1].octetsDDP,
 	    stats[2].octetsDDP, stats[3].octetsDDP);
 	sbuf_printf(sb, "framesDDP:  %16u %16u %16u %16u\n",
 	    stats[0].framesDDP, stats[1].framesDDP,
 	    stats[2].framesDDP, stats[3].framesDDP);
 	sbuf_printf(sb, "framesDrop: %16u %16u %16u %16u",
 	    stats[0].framesDrop, stats[1].framesDrop,
 	    stats[2].framesDrop, stats[3].framesDrop);
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 /*
  * Sysctl handler: print one row per Tx scheduler (NTX_SCHED rows) showing
  * its timer mode, mapped channel, rate, class IPG and the pace-table entry.
  * Zero values are shown as "disabled".
  */
 static int
 sysctl_hw_sched(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	unsigned int pace_tab[NTX_SCHED];
 	unsigned int map, mode, kbps, ipg;
 	int rc, i;
 
 	if ((rc = sysctl_wire_old_buffer(req, 0)) != 0)
 		return (rc);
 
 	if ((sb = sbuf_new_for_sysctl(NULL, NULL, 256, req)) == NULL)
 		return (ENOMEM);
 
 	map = t4_read_reg(sc, A_TP_TX_MOD_QUEUE_REQ_MAP);
 	mode = G_TIMERMODE(t4_read_reg(sc, A_TP_MOD_CONFIG));
 	t4_read_pace_tbl(sc, pace_tab);
 
 	sbuf_printf(sb, "Scheduler  Mode   Channel  Rate (Kbps)   "
 	    "Class IPG (0.1 ns)   Flow IPG (us)");
 
 	i = 0;
 	while (i < NTX_SCHED) {
 		/* Bit i of the timer mode selects "flow" over "class". */
 		const char *kind = (mode & (1 << i)) ? "flow" : "class";
 
 		t4_get_tx_sched(sc, i, &kbps, &ipg);
 		sbuf_printf(sb, "\n    %u      %-5s     %u     ", i,
 		    kind, map & 3);
 
 		if (kbps == 0)
 			sbuf_printf(sb, " disabled     ");
 		else
 			sbuf_printf(sb, "%9u     ", kbps);
 
 		if (ipg == 0)
 			sbuf_printf(sb, "     disabled        ");
 		else
 			sbuf_printf(sb, "%13u        ", ipg);
 
 		if (pace_tab[i] == 0)
 			sbuf_printf(sb, "  disabled");
 		else
 			sbuf_printf(sb, "%10u", pace_tab[i]);
 
 		/* The channel map carries two bits per scheduler. */
 		map >>= 2;
 		i++;
 	}
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 /*
  * Sysctl handler: print the loopback port statistics, two loopback ports
  * side by side per table (0/1, then 2/3).
  */
 static int
 sysctl_lb_stats(SYSCTL_HANDLER_ARGS)
 {
 	static const char *stat_name[] = {
 		"OctetsOK:", "FramesOK:", "BcastFrames:", "McastFrames:",
 		"UcastFrames:", "ErrorFrames:", "Frames64:", "Frames65To127:",
 		"Frames128To255:", "Frames256To511:", "Frames512To1023:",
 		"Frames1024To1518:", "Frames1519ToMax:", "FramesDropped:",
 		"BG0FramesDropped:", "BG1FramesDropped:", "BG2FramesDropped:",
 		"BG3FramesDropped:", "BG0FramesTrunc:", "BG1FramesTrunc:",
 		"BG2FramesTrunc:", "BG3FramesTrunc:"
 	};
 	struct adapter *sc = arg1;
 	struct lb_port_stats s[2];
 	struct sbuf *sb;
 	uint64_t *left, *right;
 	int rc, port, row;
 
 	if ((rc = sysctl_wire_old_buffer(req, 0)) != 0)
 		return (rc);
 
 	if ((sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req)) == NULL)
 		return (ENOMEM);
 
 	memset(s, 0, sizeof(s));
 
 	for (port = 0; port < 4; port += 2) {
 		t4_get_lb_stats(sc, port, &s[0]);
 		t4_get_lb_stats(sc, port + 1, &s[1]);
 
 		sbuf_printf(sb, "%s                       Loopback %u"
 		    "           Loopback %u", port == 0 ? "" : "\n", port,
 		    port + 1);
 
 		/*
 		 * The counters are walked as consecutive uint64_t values
 		 * starting at .octets, one per entry of stat_name[].
 		 */
 		left = &s[0].octets;
 		right = &s[1].octets;
 		for (row = 0; row < nitems(stat_name); row++) {
 			sbuf_printf(sb, "\n%-17s %20ju %20ju", stat_name[row],
 			    *left, *right);
 			left++;
 			right++;
 		}
 	}
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 /*
  * Sysctl handler: print the port's link-down reason code.  Negative codes
  * print as "n/a", codes covered by the table print by name, and anything
  * else prints as a decimal number.
  */
 static int
 sysctl_linkdnrc(SYSCTL_HANDLER_ARGS)
 {
 	static const char *linkdnreasons[] = {
 		"non-specific", "remote fault", "autoneg failed", "reserved3",
 		"PHY overheated", "unknown", "rx los", "reserved7"
 	};
 	struct port_info *pi = arg1;
 	struct sbuf *sb;
 	int rc;
 
 	if ((rc = sysctl_wire_old_buffer(req, 0)) != 0)
 		return (rc);
 	if ((sb = sbuf_new_for_sysctl(NULL, NULL, 64, req)) == NULL)
 		return (ENOMEM);
 
 	if (pi->linkdnrc >= 0 && pi->linkdnrc < nitems(linkdnreasons))
 		sbuf_printf(sb, "%s", linkdnreasons[pi->linkdnrc]);
 	else if (pi->linkdnrc < 0)
 		sbuf_printf(sb, "n/a");
 	else
 		sbuf_printf(sb, "%d", pi->linkdnrc);
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 /*
  * One address range in the adapter's memory map.  'idx' selects the name
  * used when the range is displayed.
  */
 struct mem_desc {
 	unsigned int base;	/* first address of the range */
 	unsigned int limit;	/* last address; 0 means "not known yet" */
 	unsigned int idx;	/* index into the caller's name table */
 };
 
 /*
  * qsort(3) comparator ordering mem_desc entries by ascending base.
  *
  * The bases are compared explicitly instead of being subtracted: the
  * difference of two unsigned values converted to int has the wrong sign
  * whenever the bases are 2^31 or more apart.
  */
 static int
 mem_desc_cmp(const void *a, const void *b)
 {
 	const unsigned int v1 = ((const struct mem_desc *)a)->base;
 	const unsigned int v2 = ((const struct mem_desc *)b)->base;
 
 	if (v1 < v2)
 		return (-1);
 	if (v1 > v2)
 		return (1);
 	return (0);
 }
 
 /*
  * Append one "name from-to [size]" line for the inclusive range
  * [from, to].  Nothing is printed when the computed size wraps to zero.
  */
 static void
 mem_region_show(struct sbuf *sb, const char *name, unsigned int from,
     unsigned int to)
 {
 	const unsigned int size = to - from + 1;
 
 	/* XXX: need humanize_number(3) in libkern for a more readable 'size' */
 	if (size != 0)
 		sbuf_printf(sb, "%-15s %#x-%#x [%u]\n", name, from, to, size);
 }
 
 /*
  * Sysctl handler: describe how the adapter's memory is laid out.
  *
  * The output has four parts: the enabled memories (EDC0/EDC1/MC...), the
  * hardware regions sorted by base address, the uP RAM / Extmem2 windows,
  * and finally the Rx/Tx page configuration plus per-port and per-loopback
  * page usage.
  *
  * Every exit taken after the sbuf has been created goes through the
  * common finish/delete path so the sbuf is never leaked.
  */
 static int
 sysctl_meminfo(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc, i, n;
 	uint32_t lo, hi, used, alloc;
 	static const char *memory[] = {"EDC0:", "EDC1:", "MC:", "MC0:", "MC1:"};
 	static const char *region[] = {
 		"DBQ contexts:", "IMSG contexts:", "FLM cache:", "TCBs:",
 		"Pstructs:", "Timers:", "Rx FL:", "Tx FL:", "Pstruct FL:",
 		"Tx payload:", "Rx payload:", "LE hash:", "iSCSI region:",
 		"TDDP region:", "TPT region:", "STAG region:", "RQ region:",
 		"RQUDP region:", "PBL region:", "TXPBL region:",
 		"DBVFIFO region:", "ULPRX state:", "ULPTX state:",
 		"On-chip queues:"
 	};
 	struct mem_desc avail[4];
 	struct mem_desc mem[nitems(region) + 3];	/* up to 3 holes */
 	struct mem_desc *md = mem;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	/*
 	 * limit == 0 means "derive the limit from the next region's base"
 	 * (see the display loop below).  idx doubles as the index into
 	 * region[]; entries whose idx is >= nitems(region) are not shown.
 	 */
 	for (i = 0; i < nitems(mem); i++) {
 		mem[i].limit = 0;
 		mem[i].idx = i;
 	}
 
 	/* Find and sort the populated memory ranges */
 	i = 0;
 	lo = t4_read_reg(sc, A_MA_TARGET_MEM_ENABLE);
 	if (lo & F_EDRAM0_ENABLE) {
 		hi = t4_read_reg(sc, A_MA_EDRAM0_BAR);
 		avail[i].base = G_EDRAM0_BASE(hi) << 20;
 		avail[i].limit = avail[i].base + (G_EDRAM0_SIZE(hi) << 20);
 		avail[i].idx = 0;
 		i++;
 	}
 	if (lo & F_EDRAM1_ENABLE) {
 		hi = t4_read_reg(sc, A_MA_EDRAM1_BAR);
 		avail[i].base = G_EDRAM1_BASE(hi) << 20;
 		avail[i].limit = avail[i].base + (G_EDRAM1_SIZE(hi) << 20);
 		avail[i].idx = 1;
 		i++;
 	}
 	if (lo & F_EXT_MEM_ENABLE) {
 		hi = t4_read_reg(sc, A_MA_EXT_MEMORY_BAR);
 		avail[i].base = G_EXT_MEM_BASE(hi) << 20;
 		avail[i].limit = avail[i].base +
 		    (G_EXT_MEM_SIZE(hi) << 20);
 		avail[i].idx = is_t4(sc) ? 2 : 3;	/* Call it MC for T4 */
 		i++;
 	}
 	if (!is_t4(sc) && lo & F_EXT_MEM1_ENABLE) {
 		hi = t4_read_reg(sc, A_MA_EXT_MEMORY1_BAR);
 		avail[i].base = G_EXT_MEM1_BASE(hi) << 20;
 		avail[i].limit = avail[i].base +
 		    (G_EXT_MEM1_SIZE(hi) << 20);
 		avail[i].idx = 4;
 		i++;
 	}
 	if (!i)                                    /* no memory available */
 		goto done;	/* rc is still 0; release the sbuf below */
 	qsort(avail, i, sizeof(struct mem_desc), mem_desc_cmp);
 
 	/* Regions below are filled in the same order as region[]. */
 	(md++)->base = t4_read_reg(sc, A_SGE_DBQ_CTXT_BADDR);
 	(md++)->base = t4_read_reg(sc, A_SGE_IMSG_CTXT_BADDR);
 	(md++)->base = t4_read_reg(sc, A_SGE_FLM_CACHE_BADDR);
 	(md++)->base = t4_read_reg(sc, A_TP_CMM_TCB_BASE);
 	(md++)->base = t4_read_reg(sc, A_TP_CMM_MM_BASE);
 	(md++)->base = t4_read_reg(sc, A_TP_CMM_TIMER_BASE);
 	(md++)->base = t4_read_reg(sc, A_TP_CMM_MM_RX_FLST_BASE);
 	(md++)->base = t4_read_reg(sc, A_TP_CMM_MM_TX_FLST_BASE);
 	(md++)->base = t4_read_reg(sc, A_TP_CMM_MM_PS_FLST_BASE);
 
 	/* the next few have explicit upper bounds */
 	md->base = t4_read_reg(sc, A_TP_PMM_TX_BASE);
 	md->limit = md->base - 1 +
 		    t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE) *
 		    G_PMTXMAXPAGE(t4_read_reg(sc, A_TP_PMM_TX_MAX_PAGE));
 	md++;
 
 	md->base = t4_read_reg(sc, A_TP_PMM_RX_BASE);
 	md->limit = md->base - 1 +
 		    t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE) *
 		    G_PMRXMAXPAGE(t4_read_reg(sc, A_TP_PMM_RX_MAX_PAGE));
 	md++;
 
 	if (t4_read_reg(sc, A_LE_DB_CONFIG) & F_HASHEN) {
 		hi = t4_read_reg(sc, A_LE_DB_TID_HASHBASE) / 4;
 		md->base = t4_read_reg(sc, A_LE_DB_HASH_TID_BASE);
 		md->limit = (sc->tids.ntids - hi) * 16 + md->base - 1;
 	} else {
 		md->base = 0;
 		md->idx = nitems(region);  /* hide it */
 	}
 	md++;
 
 #define ulp_region(reg) \
 	md->base = t4_read_reg(sc, A_ULP_ ## reg ## _LLIMIT);\
 	(md++)->limit = t4_read_reg(sc, A_ULP_ ## reg ## _ULIMIT)
 
 	ulp_region(RX_ISCSI);
 	ulp_region(RX_TDDP);
 	ulp_region(TX_TPT);
 	ulp_region(RX_STAG);
 	ulp_region(RX_RQ);
 	ulp_region(RX_RQUDP);
 	ulp_region(RX_PBL);
 	ulp_region(TX_PBL);
 #undef ulp_region
 
 	/* DBVFIFO: hidden unless this is not a T4 and the VFIFO is enabled. */
 	md->base = 0;
 	md->idx = nitems(region);
 	if (!is_t4(sc) && t4_read_reg(sc, A_SGE_CONTROL2) & F_VFIFO_ENABLE) {
 		md->base = G_BASEADDR(t4_read_reg(sc, A_SGE_DBVFIFO_BADDR));
 		md->limit = md->base + (G_DBVFIFO_SIZE((t4_read_reg(sc,
 		    A_SGE_DBVFIFO_SIZE))) << 2) - 1;
 	}
 	md++;
 
 	md->base = t4_read_reg(sc, A_ULP_RX_CTX_BASE);
 	md->limit = md->base + sc->tids.ntids - 1;
 	md++;
 	md->base = t4_read_reg(sc, A_ULP_TX_ERR_TABLE_BASE);
 	md->limit = md->base + sc->tids.ntids - 1;
 	md++;
 
 	md->base = sc->vres.ocq.start;
 	if (sc->vres.ocq.size)
 		md->limit = md->base + sc->vres.ocq.size - 1;
 	else
 		md->idx = nitems(region);  /* hide it */
 	md++;
 
 	/* add any address-space holes, there can be up to 3 */
 	for (n = 0; n < i - 1; n++)
 		if (avail[n].limit < avail[n + 1].base)
 			(md++)->base = avail[n].limit;
 	if (avail[n].limit)
 		(md++)->base = avail[n].limit;
 
 	n = md - mem;
 	qsort(mem, n, sizeof(struct mem_desc), mem_desc_cmp);
 
 	for (lo = 0; lo < i; lo++)
 		mem_region_show(sb, memory[avail[lo].idx], avail[lo].base,
 				avail[lo].limit - 1);
 
 	sbuf_printf(sb, "\n");
 	for (i = 0; i < n; i++) {
 		if (mem[i].idx >= nitems(region))
 			continue;                        /* skip holes */
 		/* No explicit limit: the region runs up to the next base. */
 		if (!mem[i].limit)
 			mem[i].limit = i < n - 1 ? mem[i + 1].base - 1 : ~0;
 		mem_region_show(sb, region[mem[i].idx], mem[i].base,
 				mem[i].limit);
 	}
 
 	sbuf_printf(sb, "\n");
 	lo = t4_read_reg(sc, A_CIM_SDRAM_BASE_ADDR);
 	hi = t4_read_reg(sc, A_CIM_SDRAM_ADDR_SIZE) + lo - 1;
 	mem_region_show(sb, "uP RAM:", lo, hi);
 
 	lo = t4_read_reg(sc, A_CIM_EXTMEM2_BASE_ADDR);
 	hi = t4_read_reg(sc, A_CIM_EXTMEM2_ADDR_SIZE) + lo - 1;
 	mem_region_show(sb, "uP Extmem2:", lo, hi);
 
 	lo = t4_read_reg(sc, A_TP_PMM_RX_MAX_PAGE);
 	sbuf_printf(sb, "\n%u Rx pages of size %uKiB for %u channels\n",
 		   G_PMRXMAXPAGE(lo),
 		   t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE) >> 10,
 		   (lo & F_PMRXNUMCHN) ? 2 : 1);
 
 	lo = t4_read_reg(sc, A_TP_PMM_TX_MAX_PAGE);
 	hi = t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE);
 	sbuf_printf(sb, "%u Tx pages of size %u%ciB for %u channels\n",
 		   G_PMTXMAXPAGE(lo),
 		   hi >= (1 << 20) ? (hi >> 20) : (hi >> 10),
 		   hi >= (1 << 20) ? 'M' : 'K', 1 << G_PMTXNUMCHN(lo));
 	sbuf_printf(sb, "%u p-structs\n",
 		   t4_read_reg(sc, A_TP_CMM_MM_MAX_PSTRUCT));
 
 	for (i = 0; i < 4; i++) {
 		lo = t4_read_reg(sc, A_MPS_RX_PG_RSV0 + i * 4);
 		if (is_t4(sc)) {
 			used = G_USED(lo);
 			alloc = G_ALLOC(lo);
 		} else {
 			used = G_T5_USED(lo);
 			alloc = G_T5_ALLOC(lo);
 		}
 		sbuf_printf(sb, "\nPort %d using %u pages out of %u allocated",
 			   i, used, alloc);
 	}
 	for (i = 0; i < 4; i++) {
 		lo = t4_read_reg(sc, A_MPS_RX_PG_RSV4 + i * 4);
 		if (is_t4(sc)) {
 			used = G_USED(lo);
 			alloc = G_ALLOC(lo);
 		} else {
 			used = G_T5_USED(lo);
 			alloc = G_T5_ALLOC(lo);
 		}
 		sbuf_printf(sb,
 			   "\nLoopback %d using %u pages out of %u allocated",
 			   i, used, alloc);
 	}
 
 done:
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static inline void
 tcamxy2valmask(uint64_t x, uint64_t y, uint8_t *addr, uint64_t *mask)
 {
 	*mask = x | y;
 	y = htobe64(y);
 	memcpy(addr, (char *)&y + 2, ETHER_ADDR_LEN);
 }
 
 /*
  * Sysctl handler: dump the MPS TCAM as a text table, one row per entry.
  *
  * For entries with F_REPLICATE set, the replication bitmap is read through
  * a FW_LDST_CMD mailbox command.  A mailbox failure is reported inline in
  * that row and the dump continues; failing to begin the synchronized
  * operation stops the dump and that error is returned.
  */
 static int
 sysctl_mps_tcam(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc, i, n;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	sbuf_printf(sb,
 	    "Idx  Ethernet address     Mask     Vld Ports PF"
 	    "  VF              Replication             P0 P1 P2 P3  ML");
 	/* The number of entries depends on the chip generation. */
 	n = is_t4(sc) ? NUM_MPS_CLS_SRAM_L_INSTANCES :
 	    NUM_MPS_T5_CLS_SRAM_L_INSTANCES;
 	for (i = 0; i < n; i++) {
 		uint64_t tcamx, tcamy, mask;
 		uint32_t cls_lo, cls_hi;
 		uint8_t addr[ETHER_ADDR_LEN];
 
 		tcamy = t4_read_reg64(sc, MPS_CLS_TCAM_Y_L(i));
 		tcamx = t4_read_reg64(sc, MPS_CLS_TCAM_X_L(i));
 		cls_lo = t4_read_reg(sc, MPS_CLS_SRAM_L(i));
 		cls_hi = t4_read_reg(sc, MPS_CLS_SRAM_H(i));
 
 		/*
 		 * Entries whose X and Y words share a set bit are skipped.
 		 * NOTE(review): presumably this marks an unused entry --
 		 * confirm against the hardware documentation.
 		 */
 		if (tcamx & tcamy)
 			continue;
 
 		tcamxy2valmask(tcamx, tcamy, addr, &mask);
 		sbuf_printf(sb, "\n%3u %02x:%02x:%02x:%02x:%02x:%02x %012jx"
 			   "  %c   %#x%4u%4d", i, addr[0], addr[1], addr[2],
 			   addr[3], addr[4], addr[5], (uintmax_t)mask,
 			   (cls_lo & F_SRAM_VLD) ? 'Y' : 'N',
 			   G_PORTMAP(cls_hi), G_PF(cls_lo),
 			   (cls_lo & F_VF_VALID) ? G_VF(cls_lo) : -1);
 
 		if (cls_lo & F_REPLICATE) {
 			struct fw_ldst_cmd ldst_cmd;
 
 			/* Build a read of the MPS replication map for entry i. */
 			memset(&ldst_cmd, 0, sizeof(ldst_cmd));
 			ldst_cmd.op_to_addrspace =
 			    htobe32(V_FW_CMD_OP(FW_LDST_CMD) |
 				F_FW_CMD_REQUEST | F_FW_CMD_READ |
 				V_FW_LDST_CMD_ADDRSPACE(FW_LDST_ADDRSPC_MPS));
 			ldst_cmd.cycles_to_len16 = htobe32(FW_LEN16(ldst_cmd));
 			ldst_cmd.u.mps.fid_ctl =
 			    htobe16(V_FW_LDST_CMD_FID(FW_LDST_MPS_RPLC) |
 				V_FW_LDST_CMD_CTL(i));
 
 			/* The mailbox is only used inside a synchronized op. */
 			rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK,
 			    "t4mps");
 			if (rc)
 				break;
 			/* The reply overwrites the command buffer. */
 			rc = -t4_wr_mbox(sc, sc->mbox, &ldst_cmd,
 			    sizeof(ldst_cmd), &ldst_cmd);
 			end_synchronized_op(sc, 0);
 
 			if (rc != 0) {
 				sbuf_printf(sb,
 				    " ------------ error %3u ------------", rc);
 				/* Reported inline; do not fail the whole dump. */
 				rc = 0;
 			} else {
 				sbuf_printf(sb, " %08x %08x %08x %08x",
 				    be32toh(ldst_cmd.u.mps.rplc127_96),
 				    be32toh(ldst_cmd.u.mps.rplc95_64),
 				    be32toh(ldst_cmd.u.mps.rplc63_32),
 				    be32toh(ldst_cmd.u.mps.rplc31_0));
 			}
 		} else
 			sbuf_printf(sb, "%36s", "");
 
 		sbuf_printf(sb, "%4u%3u%3u%3u %#3x", G_SRAM_PRIO0(cls_lo),
 		    G_SRAM_PRIO1(cls_lo), G_SRAM_PRIO2(cls_lo),
 		    G_SRAM_PRIO3(cls_lo), (cls_lo >> S_MULTILISTEN0) & 0xf);
 	}
 
 	/*
 	 * If the loop was aborted, still finish the sbuf but keep the
 	 * original error instead of sbuf_finish()'s return value.
 	 */
 	if (rc)
 		(void) sbuf_finish(sb);
 	else
 		rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 /*
  * Sysctl handler: print the first 16 entries of the adapter's MTU table
  * on a single line, separated by spaces.
  */
 static int
 sysctl_path_mtus(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	uint16_t mtus[NMTUS];
 	struct sbuf *sb;
 	int rc;
 
 	if ((rc = sysctl_wire_old_buffer(req, 0)) != 0)
 		return (rc);
 
 	if ((sb = sbuf_new_for_sysctl(NULL, NULL, 256, req)) == NULL)
 		return (ENOMEM);
 
 	t4_read_mtu_tbl(sc, mtus, NULL);
 
 	sbuf_printf(sb, "%u %u %u %u %u %u %u %u %u %u %u %u %u %u %u %u",
 	    mtus[0], mtus[1], mtus[2], mtus[3],
 	    mtus[4], mtus[5], mtus[6], mtus[7],
 	    mtus[8], mtus[9], mtus[10], mtus[11],
 	    mtus[12], mtus[13], mtus[14], mtus[15]);
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 /*
  * Sysctl handler: print the PM Tx statistics followed by the PM Rx
  * statistics, each as a table with a command count and a byte count per
  * row.
  */
 static int
 sysctl_pm_stats(SYSCTL_HANDLER_ARGS)
 {
 	static const char *tx_stats[] = {
 		"Read:", "Write bypass:", "Write mem:", "Bypass + mem:"
 	};
 	static const char *rx_stats[] = {
 		"Read:", "Write bypass:", "Write mem:", "Flush:"
 	};
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	uint64_t cyc[PM_NSTATS];
 	uint32_t cnt[PM_NSTATS];
 	int rc, row;
 
 	if ((rc = sysctl_wire_old_buffer(req, 0)) != 0)
 		return (rc);
 
 	if ((sb = sbuf_new_for_sysctl(NULL, NULL, 256, req)) == NULL)
 		return (ENOMEM);
 
 	/* Tx first; cnt[] and cyc[] are reused for the Rx pass. */
 	t4_pmtx_get_stats(sc, cnt, cyc);
 	sbuf_printf(sb, "                Tx pcmds             Tx bytes");
 	row = 0;
 	while (row < ARRAY_SIZE(tx_stats)) {
 		sbuf_printf(sb, "\n%-13s %10u %20ju", tx_stats[row], cnt[row],
 		    cyc[row]);
 		row++;
 	}
 
 	t4_pmrx_get_stats(sc, cnt, cyc);
 	sbuf_printf(sb, "\n                Rx pcmds             Rx bytes");
 	row = 0;
 	while (row < ARRAY_SIZE(rx_stats)) {
 		sbuf_printf(sb, "\n%-13s %10u %20ju", rx_stats[row], cnt[row],
 		    cyc[row]);
 		row++;
 	}
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_rdma_stats(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc;
 	struct tp_rdma_stats stats;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 256, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	t4_tp_get_rdma_stats(sc, &stats);
 	sbuf_printf(sb, "NoRQEModDefferals: %u\n", stats.rqe_dfr_mod);
 	sbuf_printf(sb, "NoRQEPktDefferals: %u", stats.rqe_dfr_pkt);
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_tcp_stats(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc;
 	struct tp_tcp_stats v4, v6;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 256, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	t4_tp_get_tcp_stats(sc, &v4, &v6);
 	sbuf_printf(sb,
 	    "                                IP                 IPv6\n");
 	sbuf_printf(sb, "OutRsts:      %20u %20u\n",
 	    v4.tcpOutRsts, v6.tcpOutRsts);
 	sbuf_printf(sb, "InSegs:       %20ju %20ju\n",
 	    v4.tcpInSegs, v6.tcpInSegs);
 	sbuf_printf(sb, "OutSegs:      %20ju %20ju\n",
 	    v4.tcpOutSegs, v6.tcpOutSegs);
 	sbuf_printf(sb, "RetransSegs:  %20ju %20ju",
 	    v4.tcpRetransSegs, v6.tcpRetransSegs);
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 /*
  * Sysctl handler: print the ranges (and, where tracked, the usage) of the
  * ATIDs, TIDs, STIDs, FTIDs and ETIDs, followed by the hardware's count
  * of IPv4 and IPv6 TID users.  A range is listed only when its size is
  * non-zero.
  */
 static int
 sysctl_tids(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct tid_info *t = &sc->tids;
 	struct sbuf *sb;
 	int rc;
 
 	if ((rc = sysctl_wire_old_buffer(req, 0)) != 0)
 		return (rc);
 
 	if ((sb = sbuf_new_for_sysctl(NULL, NULL, 256, req)) == NULL)
 		return (ENOMEM);
 
 	if (t->natids)
 		sbuf_printf(sb, "ATID range: 0-%u, in use: %u\n",
 		    t->natids - 1, t->atids_in_use);
 
 	if (t->ntids) {
 		if (!(t4_read_reg(sc, A_LE_DB_CONFIG) & F_HASHEN)) {
 			sbuf_printf(sb, "TID range: 0-%u", t->ntids - 1);
 		} else {
 			/* With hashing enabled the TID space may be split. */
 			uint32_t srv, hash;
 
 			srv = t4_read_reg(sc, A_LE_DB_SERVER_INDEX) / 4;
 			hash = t4_read_reg(sc, A_LE_DB_TID_HASHBASE) / 4;
 			if (srv != 0)
 				sbuf_printf(sb, "TID range: 0-%u, %u-%u",
 				    srv - 1, hash, t->ntids - 1);
 			else
 				sbuf_printf(sb, "TID range: %u-%u", hash,
 				    t->ntids - 1);
 		}
 		sbuf_printf(sb, ", in use: %u\n",
 		    atomic_load_acq_int(&t->tids_in_use));
 	}
 
 	if (t->nstids)
 		sbuf_printf(sb, "STID range: %u-%u, in use: %u\n",
 		    t->stid_base, t->stid_base + t->nstids - 1,
 		    t->stids_in_use);
 
 	if (t->nftids)
 		sbuf_printf(sb, "FTID range: %u-%u\n", t->ftid_base,
 		    t->ftid_base + t->nftids - 1);
 
 	if (t->netids)
 		sbuf_printf(sb, "ETID range: %u-%u\n", t->etid_base,
 		    t->etid_base + t->netids - 1);
 
 	sbuf_printf(sb, "HW TID usage: %u IP users, %u IPv6 users",
 	    t4_read_reg(sc, A_LE_DB_ACT_CNT_IPV4),
 	    t4_read_reg(sc, A_LE_DB_ACT_CNT_IPV6));
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 /*
  * Sysctl handler: print the TP error counters.  The per-channel counters
  * form a four-column table (channels 0-3); the two adapter-wide offload
  * counters follow on their own lines.
  */
 static int
 sysctl_tp_err_stats(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct tp_err_stats stats;
 	struct sbuf *sb;
 	int rc;
 
 	if ((rc = sysctl_wire_old_buffer(req, 0)) != 0)
 		return (rc);
 
 	if ((sb = sbuf_new_for_sysctl(NULL, NULL, 256, req)) == NULL)
 		return (ENOMEM);
 
 	t4_tp_get_err_stats(sc, &stats);
 
 /* Emit one table row: the four per-channel values of stats.<f>[]. */
 #define	ERR_ROW(fmt, f)							\
 	sbuf_printf(sb, fmt, stats.f[0], stats.f[1], stats.f[2], stats.f[3])
 
 	sbuf_printf(sb, "                 channel 0  channel 1  channel 2  "
 		      "channel 3\n");
 	ERR_ROW("macInErrs:      %10u %10u %10u %10u\n", macInErrs);
 	ERR_ROW("hdrInErrs:      %10u %10u %10u %10u\n", hdrInErrs);
 	ERR_ROW("tcpInErrs:      %10u %10u %10u %10u\n", tcpInErrs);
 	ERR_ROW("tcp6InErrs:     %10u %10u %10u %10u\n", tcp6InErrs);
 	ERR_ROW("tnlCongDrops:   %10u %10u %10u %10u\n", tnlCongDrops);
 	ERR_ROW("tnlTxDrops:     %10u %10u %10u %10u\n", tnlTxDrops);
 	ERR_ROW("ofldVlanDrops:  %10u %10u %10u %10u\n", ofldVlanDrops);
 	ERR_ROW("ofldChanDrops:  %10u %10u %10u %10u\n\n", ofldChanDrops);
 #undef ERR_ROW
 
 	sbuf_printf(sb, "ofldNoNeigh:    %u\nofldCongDefer:  %u",
 	    stats.ofldNoNeigh, stats.ofldCongDefer);
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 /*
  * Describes one bit field inside a 64-bit word, for field_desc_show().
  * Tables of these are terminated by an entry whose name is NULL.
  */
 struct field_desc {
 	const char *name;	/* label printed in front of the value */
 	u_int start;		/* position of the field's lowest bit */
 	u_int width;		/* number of bits in the field */
 };
 
 /*
  * Decode the 64-bit value 'v' using the NULL-terminated table 'f' and
  * append "name: value " for every field.  Output is wrapped before it
  * reaches 79 columns; continuation lines are indented by 8 spaces.  A
  * newline is always appended at the end.
  */
 static void
 field_desc_show(struct sbuf *sb, uint64_t v, const struct field_desc *f)
 {
 	char text[32];
 	int col = 0;
 
 	for (; f->name != NULL; f++) {
 		const uint64_t mask = (1ULL << f->width) - 1;
 		const int len = snprintf(text, sizeof(text), "%s: %ju",
 		    f->name, ((uintmax_t)v >> f->start) & mask);
 
 		/* Start a fresh, indented line if this field would not fit. */
 		if (col + len >= 79) {
 			col = 8;
 			sbuf_printf(sb, "\n        ");
 		}
 		sbuf_printf(sb, "%s ", text);
 		col += len + 1;
 	}
 	sbuf_printf(sb, "\n");
 }
 
 /*
  * Field layout ({ name, lowest bit, width }) used by the tp_la_show*()
  * functions to decode the first 64-bit word of a TP logic analyzer entry
  * (tp_la_show2() also uses it for the second word).  Listed from the most
  * significant field down; terminated by a NULL name.
  */
 static struct field_desc tp_la0[] = {
 	{ "RcfOpCodeOut", 60, 4 },
 	{ "State", 56, 4 },
 	{ "WcfState", 52, 4 },
 	{ "RcfOpcSrcOut", 50, 2 },
 	{ "CRxError", 49, 1 },
 	{ "ERxError", 48, 1 },
 	{ "SanityFailed", 47, 1 },
 	{ "SpuriousMsg", 46, 1 },
 	{ "FlushInputMsg", 45, 1 },
 	{ "FlushInputCpl", 44, 1 },
 	{ "RssUpBit", 43, 1 },
 	{ "RssFilterHit", 42, 1 },
 	{ "Tid", 32, 10 },
 	{ "InitTcb", 31, 1 },
 	{ "LineNumber", 24, 7 },
 	{ "Emsg", 23, 1 },
 	{ "EdataOut", 22, 1 },
 	{ "Cmsg", 21, 1 },
 	{ "CdataOut", 20, 1 },
 	{ "EreadPdu", 19, 1 },
 	{ "CreadPdu", 18, 1 },
 	{ "TunnelPkt", 17, 1 },
 	{ "RcfPeerFin", 16, 1 },
 	{ "RcfReasonOut", 12, 4 },
 	{ "TxCchannel", 10, 2 },
 	{ "RcfTxChannel", 8, 2 },
 	{ "RxEchannel", 6, 2 },
 	{ "RcfRxChannel", 5, 1 },
 	{ "RcfDataOutSrdy", 4, 1 },
 	{ "RxDvld", 3, 1 },
 	{ "RxOoDvld", 2, 1 },
 	{ "RxCongestion", 1, 1 },
 	{ "TxCongestion", 0, 1 },
 	{ NULL }
 };
 
 /*
  * Field layout ({ name, lowest bit, width }) for the second 64-bit word of
  * a TP logic analyzer entry.  tp_la_show3() selects this table when bit 17
  * of the first word is clear.  Terminated by a NULL name.
  */
 static struct field_desc tp_la1[] = {
 	{ "CplCmdIn", 56, 8 },
 	{ "CplCmdOut", 48, 8 },
 	{ "ESynOut", 47, 1 },
 	{ "EAckOut", 46, 1 },
 	{ "EFinOut", 45, 1 },
 	{ "ERstOut", 44, 1 },
 	{ "SynIn", 43, 1 },
 	{ "AckIn", 42, 1 },
 	{ "FinIn", 41, 1 },
 	{ "RstIn", 40, 1 },
 	{ "DataIn", 39, 1 },
 	{ "DataInVld", 38, 1 },
 	{ "PadIn", 37, 1 },
 	{ "RxBufEmpty", 36, 1 },
 	{ "RxDdp", 35, 1 },
 	{ "RxFbCongestion", 34, 1 },
 	{ "TxFbCongestion", 33, 1 },
 	{ "TxPktSumSrdy", 32, 1 },
 	{ "RcfUlpType", 28, 4 },
 	{ "Eread", 27, 1 },
 	{ "Ebypass", 26, 1 },
 	{ "Esave", 25, 1 },
 	{ "Static0", 24, 1 },
 	{ "Cread", 23, 1 },
 	{ "Cbypass", 22, 1 },
 	{ "Csave", 21, 1 },
 	{ "CPktOut", 20, 1 },
 	{ "RxPagePoolFull", 18, 2 },
 	{ "RxLpbkPkt", 17, 1 },
 	{ "TxLpbkPkt", 16, 1 },
 	{ "RxVfValid", 15, 1 },
 	{ "SynLearned", 14, 1 },
 	{ "SetDelEntry", 13, 1 },
 	{ "SetInvEntry", 12, 1 },
 	{ "CpcmdDvld", 11, 1 },
 	{ "CpcmdSave", 10, 1 },
 	{ "RxPstructsFull", 8, 2 },
 	{ "EpcmdDvld", 7, 1 },
 	{ "EpcmdFlush", 6, 1 },
 	{ "EpcmdTrimPrefix", 5, 1 },
 	{ "EpcmdTrimPostfix", 4, 1 },
 	{ "ERssIp4Pkt", 3, 1 },
 	{ "ERssIp6Pkt", 2, 1 },
 	{ "ERssTcpUdpPkt", 1, 1 },
 	{ "ERssFceFipPkt", 0, 1 },
 	{ NULL }
 };
 
 /*
  * Alternate field layout ({ name, lowest bit, width }) for the second
  * 64-bit word of a TP logic analyzer entry.  tp_la_show3() selects this
  * table when bit 17 of the first word is set.  It differs from tp_la1
  * only in bits 44-55 (MpsVfVld/MpsPf/MpsVf).  Terminated by a NULL name.
  */
 static struct field_desc tp_la2[] = {
 	{ "CplCmdIn", 56, 8 },
 	{ "MpsVfVld", 55, 1 },
 	{ "MpsPf", 52, 3 },
 	{ "MpsVf", 44, 8 },
 	{ "SynIn", 43, 1 },
 	{ "AckIn", 42, 1 },
 	{ "FinIn", 41, 1 },
 	{ "RstIn", 40, 1 },
 	{ "DataIn", 39, 1 },
 	{ "DataInVld", 38, 1 },
 	{ "PadIn", 37, 1 },
 	{ "RxBufEmpty", 36, 1 },
 	{ "RxDdp", 35, 1 },
 	{ "RxFbCongestion", 34, 1 },
 	{ "TxFbCongestion", 33, 1 },
 	{ "TxPktSumSrdy", 32, 1 },
 	{ "RcfUlpType", 28, 4 },
 	{ "Eread", 27, 1 },
 	{ "Ebypass", 26, 1 },
 	{ "Esave", 25, 1 },
 	{ "Static0", 24, 1 },
 	{ "Cread", 23, 1 },
 	{ "Cbypass", 22, 1 },
 	{ "Csave", 21, 1 },
 	{ "CPktOut", 20, 1 },
 	{ "RxPagePoolFull", 18, 2 },
 	{ "RxLpbkPkt", 17, 1 },
 	{ "TxLpbkPkt", 16, 1 },
 	{ "RxVfValid", 15, 1 },
 	{ "SynLearned", 14, 1 },
 	{ "SetDelEntry", 13, 1 },
 	{ "SetInvEntry", 12, 1 },
 	{ "CpcmdDvld", 11, 1 },
 	{ "CpcmdSave", 10, 1 },
 	{ "RxPstructsFull", 8, 2 },
 	{ "EpcmdDvld", 7, 1 },
 	{ "EpcmdFlush", 6, 1 },
 	{ "EpcmdTrimPrefix", 5, 1 },
 	{ "EpcmdTrimPostfix", 4, 1 },
 	{ "ERssIp4Pkt", 3, 1 },
 	{ "ERssIp6Pkt", 2, 1 },
 	{ "ERssTcpUdpPkt", 1, 1 },
 	{ "ERssFceFipPkt", 0, 1 },
 	{ NULL }
 };
 
 /*
  * Show one single-word TP logic analyzer entry, decoded with tp_la0.
  * 'idx' is unused; it is part of the common show-function signature.
  */
 static void
 tp_la_show(struct sbuf *sb, uint64_t *p, int idx)
 {
 
 	field_desc_show(sb, p[0], tp_la0);
 }
 
 /*
  * Show one two-word TP logic analyzer entry, both words decoded with
  * tp_la0.  Entries after the first are preceded by a blank line.  The
  * second word of the very last entry is omitted when it is all ones.
  */
 static void
 tp_la_show2(struct sbuf *sb, uint64_t *p, int idx)
 {
 
 	if (idx != 0)
 		sbuf_printf(sb, "\n");
 	field_desc_show(sb, p[0], tp_la0);
 	if (!(idx >= (TPLA_SIZE / 2 - 1) && p[1] == ~0ULL))
 		field_desc_show(sb, p[1], tp_la0);
 }
 
 /*
  * Show one two-word TP logic analyzer entry.  The first word is decoded
  * with tp_la0; the second with tp_la2 if bit 17 of the first word is set
  * and with tp_la1 otherwise.  Entries after the first are preceded by a
  * blank line.  The second word of the very last entry is omitted when it
  * is all ones.
  */
 static void
 tp_la_show3(struct sbuf *sb, uint64_t *p, int idx)
 {
 
 	if (idx != 0)
 		sbuf_printf(sb, "\n");
 	field_desc_show(sb, p[0], tp_la0);
 	if (!(idx >= (TPLA_SIZE / 2 - 1) && p[1] == ~0ULL)) {
 		if (p[0] & (1 << 17))
 			field_desc_show(sb, p[1], tp_la2);
 		else
 			field_desc_show(sb, p[1], tp_la1);
 	}
 }
 
 /*
  * Sysctl handler: read the TP logic analyzer (TPLA_SIZE 64-bit words) and
  * print it decoded.  The DBGLAMODE field of A_TP_DBG_LA_CONFIG decides
  * whether entries are one or two words long and which decoder is used.
  */
 static int
 sysctl_tp_la(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	uint64_t *buf;
 	void (*show_func)(struct sbuf *, uint64_t *, int);
 	u_int i, inc, nentries;
 	int rc;
 
 	if ((rc = sysctl_wire_old_buffer(req, 0)) != 0)
 		return (rc);
 
 	if ((sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req)) == NULL)
 		return (ENOMEM);
 
 	buf = malloc(TPLA_SIZE * sizeof(uint64_t), M_CXGBE, M_ZERO | M_WAITOK);
 
 	t4_tp_read_la(sc, buf, NULL);
 
 	/* Modes 2 and 3 use two-word entries; everything else one word. */
 	switch (G_DBGLAMODE(t4_read_reg(sc, A_TP_DBG_LA_CONFIG))) {
 	case 3:
 		show_func = tp_la_show3;
 		inc = 2;
 		break;
 	case 2:
 		show_func = tp_la_show2;
 		inc = 2;
 		break;
 	default:
 		show_func = tp_la_show;
 		inc = 1;
 		break;
 	}
 
 	nentries = TPLA_SIZE / inc;
 	for (i = 0; i < nentries; i++)
 		(*show_func)(sb, &buf[i * inc], i);
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 	free(buf, M_CXGBE);
 	return (rc);
 }
 
 /*
  * Sysctl handler: print the NIC and offload transmit rates (in B/s) of
  * channels 0-3 as a two-row table.
  */
 static int
 sysctl_tx_rate(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	u64 nic[NCHAN], ofld[NCHAN];
 	struct sbuf *sb;
 	int rc;
 
 	if ((rc = sysctl_wire_old_buffer(req, 0)) != 0)
 		return (rc);
 
 	if ((sb = sbuf_new_for_sysctl(NULL, NULL, 256, req)) == NULL)
 		return (ENOMEM);
 
 	t4_get_chan_txrate(sc, nic, ofld);
 
 	sbuf_printf(sb, "              channel 0   channel 1   channel 2   "
 		 "channel 3\n");
 	sbuf_printf(sb, "NIC B/s:     %10ju  %10ju  %10ju  %10ju\n",
 	    nic[0], nic[1], nic[2], nic[3]);
 	sbuf_printf(sb, "Offload B/s: %10ju  %10ju  %10ju  %10ju",
 	    ofld[0], ofld[1], ofld[2], ofld[3]);
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_ulprx_la(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	uint32_t *buf, *p;
 	int rc, i;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	buf = malloc(ULPRX_LA_SIZE * 8 * sizeof(uint32_t), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 
 	t4_ulprx_read_la(sc, buf);
 	p = buf;
 
 	sbuf_printf(sb, "      Pcmd        Type   Message"
 	    "                Data");
 	for (i = 0; i < ULPRX_LA_SIZE; i++, p += 8) {
 		sbuf_printf(sb, "\n%08x%08x  %4x  %08x  %08x%08x%08x%08x",
 		    p[1], p[0], p[2], p[3], p[7], p[6], p[5], p[4]);
 	}
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 	free(buf, M_CXGBE);
 	return (rc);
 }
 
 /*
  * Sysctl handler: print the SGE statistics counters when the statistics
  * source field of A_SGE_STAT_CFG is 7.  The stat mode (0 or 1) decides
  * how the "match" counter is labelled; in every other configuration the
  * output is empty.
  */
 static int
 sysctl_wcwr_stats(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc, v;
 
 	if ((rc = sysctl_wire_old_buffer(req, 0)) != 0)
 		return (rc);
 
 	if ((sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req)) == NULL)
 		return (ENOMEM);
 
 	v = t4_read_reg(sc, A_SGE_STAT_CFG);
 	if (G_STATSOURCE_T5(v) == 7) {
 		switch (G_STATMODE(v)) {
 		case 0:
 			sbuf_printf(sb, "total %d, incomplete %d",
 			    t4_read_reg(sc, A_SGE_STAT_TOTAL),
 			    t4_read_reg(sc, A_SGE_STAT_MATCH));
 			break;
 		case 1:
 			sbuf_printf(sb, "total %d, data overflow %d",
 			    t4_read_reg(sc, A_SGE_STAT_TOTAL),
 			    t4_read_reg(sc, A_SGE_STAT_MATCH));
 			break;
 		default:
 			break;
 		}
 	}
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 #endif
 
 static inline void
 txq_start(struct ifnet *ifp, struct sge_txq *txq)
 {
 	struct buf_ring *br;
 	struct mbuf *m;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
 
 	br = txq->br;
 	m = txq->m ? txq->m : drbr_dequeue(ifp, br);
 	if (m)
 		t4_eth_tx(ifp, txq, m);
 }
 
 /*
  * Callout handler for an egress queue ('arg' is the struct sge_eq).
  *
  * If the queue lock cannot be taken without blocking, or the queue is
  * stalled and still cannot resume, the callout is rescheduled one tick
  * later -- unless the queue has been marked EQ_DOOMED, in which case it
  * is left to stop.  Otherwise the queue's tx task is enqueued on the
  * taskqueue of its tx channel.
  */
 void
 t4_tx_callout(void *arg)
 {
 	struct sge_eq *eq = arg;
 	struct adapter *sc;
 
 	if (EQ_TRYLOCK(eq) == 0)
 		goto reschedule;
 
 	if (eq->flags & EQ_STALLED && !can_resume_tx(eq)) {
 		EQ_UNLOCK(eq);
 reschedule:
 		/*
 		 * Reached without the lock held.  The flag test must be a
 		 * bitwise AND: a logical AND would only check that flags
 		 * is non-zero rather than that the EQ_DOOMED bit is set.
 		 */
 		if (__predict_true(!(eq->flags & EQ_DOOMED)))
 			callout_schedule(&eq->tx_callout, 1);
 		return;
 	}
 
 	EQ_LOCK_ASSERT_OWNED(eq);
 
 	if (__predict_true((eq->flags & EQ_DOOMED) == 0)) {
 
 		/* 'arg' is reinterpreted according to the queue type. */
 		if ((eq->flags & EQ_TYPEMASK) == EQ_ETH) {
 			struct sge_txq *txq = arg;
 			struct port_info *pi = txq->ifp->if_softc;
 
 			sc = pi->adapter;
 		} else {
 			struct sge_wrq *wrq = arg;
 
 			sc = wrq->adapter;
 		}
 
 		taskqueue_enqueue(sc->tq[eq->tx_chan], &eq->tx_task);
 	}
 
 	EQ_UNLOCK(eq);
 }
 
 /*
  * Task handler for an egress queue ('arg' is the struct sge_eq).  With
  * the queue locked, Ethernet queues restart transmission via txq_start();
  * every other queue type is treated as a work request queue.
  */
 void
 t4_tx_task(void *arg, int count)
 {
 	struct sge_eq *eq = arg;
 
 	EQ_LOCK(eq);
 	if ((eq->flags & EQ_TYPEMASK) != EQ_ETH) {
 		struct sge_wrq *wrq = arg;
 
 		t4_wrq_tx_locked(wrq->adapter, wrq, NULL);
 	} else {
 		struct sge_txq *txq = arg;
 
 		txq_start(txq->ifp, txq);
 	}
 	EQ_UNLOCK(eq);
 }
 
 /*
  * Translate a hardware filter configuration word (F_* bits) into the
  * driver's T4_FILTER_* mode bits.  The IPv4/IPv6 and address/port bits
  * are always set; each remaining bit mirrors one fconf bit.
  */
 static uint32_t
 fconf_to_mode(uint32_t fconf)
 {
 	const struct {
 		uint32_t fconf_bit;
 		uint32_t mode_bit;
 	} optional[] = {
 		{ F_FRAGMENTATION,	T4_FILTER_IP_FRAGMENT },
 		{ F_MPSHITTYPE,		T4_FILTER_MPS_HIT_TYPE },
 		{ F_MACMATCH,		T4_FILTER_MAC_IDX },
 		{ F_ETHERTYPE,		T4_FILTER_ETH_TYPE },
 		{ F_PROTOCOL,		T4_FILTER_IP_PROTO },
 		{ F_TOS,		T4_FILTER_IP_TOS },
 		{ F_VLAN,		T4_FILTER_VLAN },
 		{ F_VNIC_ID,		T4_FILTER_VNIC },
 		{ F_PORT,		T4_FILTER_PORT },
 		{ F_FCOE,		T4_FILTER_FCoE },
 	};
 	uint32_t mode;
 	u_int k;
 
 	mode = T4_FILTER_IPv4 | T4_FILTER_IPv6 | T4_FILTER_IP_SADDR |
 	    T4_FILTER_IP_DADDR | T4_FILTER_IP_SPORT | T4_FILTER_IP_DPORT;
 
 	for (k = 0; k < nitems(optional); k++) {
 		if (fconf & optional[k].fconf_bit)
 			mode |= optional[k].mode_bit;
 	}
 
 	return (mode);
 }
 
 /*
  * Translate the driver's T4_FILTER_* mode bits into a hardware filter
  * configuration word (F_* bits).  Mode bits without an fconf counterpart
  * are ignored.
  */
 static uint32_t
 mode_to_fconf(uint32_t mode)
 {
 	const struct {
 		uint32_t mode_bit;
 		uint32_t fconf_bit;
 	} map[] = {
 		{ T4_FILTER_IP_FRAGMENT,	F_FRAGMENTATION },
 		{ T4_FILTER_MPS_HIT_TYPE,	F_MPSHITTYPE },
 		{ T4_FILTER_MAC_IDX,		F_MACMATCH },
 		{ T4_FILTER_ETH_TYPE,		F_ETHERTYPE },
 		{ T4_FILTER_IP_PROTO,		F_PROTOCOL },
 		{ T4_FILTER_IP_TOS,		F_TOS },
 		{ T4_FILTER_VLAN,		F_VLAN },
 		{ T4_FILTER_VNIC,		F_VNIC_ID },
 		{ T4_FILTER_PORT,		F_PORT },
 		{ T4_FILTER_FCoE,		F_FCOE },
 	};
 	uint32_t fconf = 0;
 	u_int k;
 
 	for (k = 0; k < nitems(map); k++) {
 		if (mode & map[k].mode_bit)
 			fconf |= map[k].fconf_bit;
 	}
 
 	return (fconf);
 }
 
 static uint32_t
 fspec_to_fconf(struct t4_filter_specification *fs)
 {
 	uint32_t fconf = 0;
 
 	if (fs->val.frag || fs->mask.frag)
 		fconf |= F_FRAGMENTATION;
 
 	if (fs->val.matchtype || fs->mask.matchtype)
 		fconf |= F_MPSHITTYPE;
 
 	if (fs->val.macidx || fs->mask.macidx)
 		fconf |= F_MACMATCH;
 
 	if (fs->val.ethtype || fs->mask.ethtype)
 		fconf |= F_ETHERTYPE;
 
 	if (fs->val.proto || fs->mask.proto)
 		fconf |= F_PROTOCOL;
 
 	if (fs->val.tos || fs->mask.tos)
 		fconf |= F_TOS;
 
 	if (fs->val.vlan_vld || fs->mask.vlan_vld)
 		fconf |= F_VLAN;
 
 	if (fs->val.vnic_vld || fs->mask.vnic_vld)
 		fconf |= F_VNIC_ID;
 
 	if (fs->val.iport || fs->mask.iport)
 		fconf |= F_PORT;
 
 	if (fs->val.fcoe || fs->mask.fcoe)
 		fconf |= F_FCOE;
 
 	return (fconf);
 }
 
 /*
  * Read the filter mode from the hardware (A_TP_VLAN_PRI_MAP) and return
  * it in '*mode' as T4_FILTER_* bits.  If the cached copy in sc->params
  * disagrees with the hardware, a warning is logged and the cache is
  * refreshed.  Returns 0, or the error from begin_synchronized_op().
  */
 static int
 get_filter_mode(struct adapter *sc, uint32_t *mode)
 {
 	uint32_t hw_fconf;
 	int rc;
 
 	rc = begin_synchronized_op(sc, NULL, HOLD_LOCK | SLEEP_OK | INTR_OK,
 	    "t4getfm");
 	if (rc != 0)
 		return (rc);
 
 	t4_read_indirect(sc, A_TP_PIO_ADDR, A_TP_PIO_DATA, &hw_fconf, 1,
 	    A_TP_VLAN_PRI_MAP);
 
 	if (hw_fconf != sc->params.tp.vlan_pri_map) {
 		log(LOG_WARNING, "%s: cached filter mode out of sync %x %x.\n",
 		    device_get_nameunit(sc->dev), sc->params.tp.vlan_pri_map,
 		    hw_fconf);
 		sc->params.tp.vlan_pri_map = hw_fconf;
 	}
 
 	*mode = fconf_to_mode(sc->params.tp.vlan_pri_map);
 
 	end_synchronized_op(sc, LOCK_HELD);
 	return (0);
 }
 
 /*
  * Change the global filter mode.  Refused with EBUSY while any filter is
  * in use or (with TCP_OFFLOAD) while any port has offload enabled.  Unless
  * built with "notyet" the request always ends in ENOTSUP.
  */
 static int
 set_filter_mode(struct adapter *sc, uint32_t mode)
 {
 	uint32_t fconf;
 	int error;
 
 	fconf = mode_to_fconf(mode);
 
 	error = begin_synchronized_op(sc, NULL,
 	    HOLD_LOCK | SLEEP_OK | INTR_OK, "t4setfm");
 	if (error != 0)
 		return (error);
 
 	if (sc->tids.ftids_in_use > 0)
 		error = EBUSY;
 #ifdef TCP_OFFLOAD
 	else if (sc->offload_map)
 		error = EBUSY;
 #endif
 	else {
 #ifdef notyet
 		error = -t4_set_filter_mode(sc, fconf);
 		if (error == 0)
 			sc->filter_mode = fconf;
 #else
 		error = ENOTSUP;
 #endif
 	}
 
 	end_synchronized_op(sc, LOCK_HELD);
 	return (error);
 }
 
 /*
  * Read the hit counter of filter fid through memory window 0.  The window
  * is positioned at tcb_base + (fid + ftid_base) * TCB_SIZE; a T4 chip is
  * read as a 64-bit big-endian value at offset 16, anything else as a
  * 32-bit big-endian value at offset 24.
  */
 static inline uint64_t
 get_filter_hits(struct adapter *sc, uint32_t fid)
 {
 	uint32_t win_base, win_off, tcb_addr;
 
 	tcb_addr = t4_read_reg(sc, A_TP_CMM_TCB_BASE) +
 	    (fid + sc->tids.ftid_base) * TCB_SIZE;
 
 	memwin_info(sc, 0, &win_base, NULL);
 	win_off = position_memwin(sc, 0, tcb_addr);
 
 	if (is_t4(sc))
 		return (be64toh(t4_read_reg64(sc, win_base + win_off + 16)));
 
 	return (be32toh(t4_read_reg(sc, win_base + win_off + 24)));
 }
 
 static int
 get_filter(struct adapter *sc, struct t4_filter *t)
 {
 	int i, rc, nfilters = sc->tids.nftids;
 	struct filter_entry *f;
 
 	rc = begin_synchronized_op(sc, NULL, HOLD_LOCK | SLEEP_OK | INTR_OK,
 	    "t4getf");
 	if (rc)
 		return (rc);
 
 	if (sc->tids.ftids_in_use == 0 || sc->tids.ftid_tab == NULL ||
 	    t->idx >= nfilters) {
 		t->idx = 0xffffffff;
 		goto done;
 	}
 
 	f = &sc->tids.ftid_tab[t->idx];
 	for (i = t->idx; i < nfilters; i++, f++) {
 		if (f->valid) {
 			t->idx = i;
 			t->l2tidx = f->l2t ? f->l2t->idx : 0;
 			t->smtidx = f->smtidx;
 			if (f->fs.hitcnts)
 				t->hits = get_filter_hits(sc, t->idx);
 			else
 				t->hits = UINT64_MAX;
 			t->fs = f->fs;
 
 			goto done;
 		}
 	}
 
 	t->idx = 0xffffffff;
 done:
 	end_synchronized_op(sc, LOCK_HELD);
 	return (0);
 }
 
 /*
  * Install the filter described by t at index t->idx.
  *
  * The request is validated and the work request is queued (set_filter_wr)
  * inside a synchronized op.  If that succeeded, the function then sleeps
  * interruptibly on the filter table until the entry is no longer pending.
  *
  * Returns 0 if the filter ended up valid, EIO if it did not, EINPROGRESS
  * if the sleep was interrupted, or the errno chosen by one of the checks
  * below / returned by set_filter_wr.
  */
 static int
 set_filter(struct adapter *sc, struct t4_filter *t)
 {
 	unsigned int nfilters, nports;
 	struct filter_entry *f;
 	int i, rc;
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4setf");
 	if (rc)
 		return (rc);
 
 	nfilters = sc->tids.nftids;
 	nports = sc->params.nports;
 
 	if (nfilters == 0) {
 		rc = ENOTSUP;
 		goto done;
 	}
 
 	if (!(sc->flags & FULL_INIT_DONE)) {
 		rc = EAGAIN;
 		goto done;
 	}
 
 	if (t->idx >= nfilters) {
 		rc = EINVAL;
 		goto done;
 	}
 
 	/*
 	 * Validate against the global filter mode: every field the
 	 * specification uses must already be present in vlan_pri_map.
 	 */
 	if ((sc->params.tp.vlan_pri_map | fspec_to_fconf(&t->fs)) !=
 	    sc->params.tp.vlan_pri_map) {
 		rc = E2BIG;
 		goto done;
 	}
 
 	if (t->fs.action == FILTER_SWITCH && t->fs.eport >= nports) {
 		rc = EINVAL;
 		goto done;
 	}
 
 	if (t->fs.val.iport >= nports) {
 		rc = EINVAL;
 		goto done;
 	}
 
 	/* Can't specify an iq if not steering to it */
 	if (!t->fs.dirsteer && t->fs.iq) {
 		rc = EINVAL;
 		goto done;
 	}
 
 	/* IPv6 filter idx must be 4 aligned */
 	if (t->fs.type == 1 &&
 	    ((t->idx & 0x3) || t->idx + 4 >= nfilters)) {
 		rc = EINVAL;
 		goto done;
 	}
 
 	/* The filter table and its lock are created on first use. */
 	if (sc->tids.ftid_tab == NULL) {
 		KASSERT(sc->tids.ftids_in_use == 0,
 		    ("%s: no memory allocated but filters_in_use > 0",
 		    __func__));
 
 		sc->tids.ftid_tab = malloc(sizeof (struct filter_entry) *
 		    nfilters, M_CXGBE, M_NOWAIT | M_ZERO);
 		if (sc->tids.ftid_tab == NULL) {
 			rc = ENOMEM;
 			goto done;
 		}
 		mtx_init(&sc->tids.ftid_lock, "T4 filters", 0, MTX_DEF);
 	}
 
 	/*
 	 * Check that the slot(s) are free.  A type 0 filter only needs slot
 	 * t->idx; any other type is checked across t->idx .. t->idx + 3.
 	 */
 	for (i = 0; i < 4; i++) {
 		f = &sc->tids.ftid_tab[t->idx + i];
 
 		if (f->pending || f->valid) {
 			rc = EBUSY;
 			goto done;
 		}
 		if (f->locked) {
 			rc = EPERM;
 			goto done;
 		}
 
 		if (t->fs.type == 0)
 			break;
 	}
 
 	f = &sc->tids.ftid_tab[t->idx];
 	f->fs = t->fs;
 
 	rc = set_filter_wr(sc, t->idx);
 done:
 	end_synchronized_op(sc, 0);
 
 	/*
 	 * rc == 0 here means the work request was queued (so f is set); wait
 	 * for the entry to leave the pending state.
 	 */
 	if (rc == 0) {
 		mtx_lock(&sc->tids.ftid_lock);
 		for (;;) {
 			if (f->pending == 0) {
 				rc = f->valid ? 0 : EIO;
 				break;
 			}
 
 			if (mtx_sleep(&sc->tids.ftid_tab, &sc->tids.ftid_lock,
 			    PCATCH, "t4setfw", 0)) {
 				rc = EINPROGRESS;
 				break;
 			}
 		}
 		mtx_unlock(&sc->tids.ftid_lock);
 	}
 	return (rc);
 }
 
 /*
  * Delete the filter at index t->idx.
  *
  * If the entry is valid its specification is copied into t->fs and a
  * delete work request is queued (del_filter_wr); the function then sleeps
  * interruptibly until the entry is no longer pending.  An entry that is
  * neither pending, locked nor valid results in 0 without any request
  * being sent.
  *
  * Returns 0 on success, EIO if the entry is still valid afterwards,
  * EINPROGRESS if the sleep was interrupted, or the errno chosen by one of
  * the checks below / returned by del_filter_wr.
  */
 static int
 del_filter(struct adapter *sc, struct t4_filter *t)
 {
 	unsigned int nfilters;
 	struct filter_entry *f;
 	int rc;
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4delf");
 	if (rc)
 		return (rc);
 
 	nfilters = sc->tids.nftids;
 
 	if (nfilters == 0) {
 		rc = ENOTSUP;
 		goto done;
 	}
 
 	if (sc->tids.ftid_tab == NULL || sc->tids.ftids_in_use == 0 ||
 	    t->idx >= nfilters) {
 		rc = EINVAL;
 		goto done;
 	}
 
 	if (!(sc->flags & FULL_INIT_DONE)) {
 		rc = EAGAIN;
 		goto done;
 	}
 
 	f = &sc->tids.ftid_tab[t->idx];
 
 	if (f->pending) {
 		rc = EBUSY;
 		goto done;
 	}
 	if (f->locked) {
 		rc = EPERM;
 		goto done;
 	}
 
 	if (f->valid) {
 		t->fs = f->fs;	/* extra info for the caller */
 		rc = del_filter_wr(sc, t->idx);
 	}
 
 done:
 	end_synchronized_op(sc, 0);
 
 	/*
 	 * rc == 0 is only possible once f has been set above; wait for any
 	 * pending delete to complete.
 	 */
 	if (rc == 0) {
 		mtx_lock(&sc->tids.ftid_lock);
 		for (;;) {
 			if (f->pending == 0) {
 				rc = f->valid ? EIO : 0;
 				break;
 			}
 
 			if (mtx_sleep(&sc->tids.ftid_tab, &sc->tids.ftid_lock,
 			    PCATCH, "t4delfw", 0)) {
 				rc = EINPROGRESS;
 				break;
 			}
 		}
 		mtx_unlock(&sc->tids.ftid_lock);
 	}
 
 	return (rc);
 }
 
 static void
 clear_filter(struct filter_entry *f)
 {
 	if (f->l2t)
 		t4_l2t_release(f->l2t);
 
 	bzero(f, sizeof (*f));
 }
 
 /*
  * Build and queue the FW_FILTER_WR work request that installs the filter
  * stored in sc->tids.ftid_tab[fidx].
  *
  * If the filter rewrites the destination MAC or the VLAN, a switching L2T
  * entry is allocated and programmed first.  On success the entry is marked
  * pending, ftids_in_use is incremented and 0 is returned.  Failures are
  * EAGAIN (no L2T entry available) or ENOMEM (L2T programming failed or no
  * work request could be allocated); on every failure path the L2T entry
  * obtained here is released again so that it is neither leaked nor left
  * dangling in the filter entry.
  *
  * Must be called from within a synchronized op.
  */
 static int
 set_filter_wr(struct adapter *sc, int fidx)
 {
 	struct filter_entry *f = &sc->tids.ftid_tab[fidx];
 	struct wrqe *wr;
 	struct fw_filter_wr *fwr;
 	unsigned int ftid;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	if (f->fs.newdmac || f->fs.newvlan) {
 		/* This filter needs an L2T entry; allocate one. */
 		f->l2t = t4_l2t_alloc_switching(sc->l2t);
 		if (f->l2t == NULL)
 			return (EAGAIN);
 		if (t4_l2t_set_switching(sc, f->l2t, f->fs.vlan, f->fs.eport,
 		    f->fs.dmac)) {
 			t4_l2t_release(f->l2t);
 			f->l2t = NULL;
 			return (ENOMEM);
 		}
 	}
 
 	ftid = sc->tids.ftid_base + fidx;
 
 	wr = alloc_wrqe(sizeof(*fwr), &sc->sge.mgmtq);
 	if (wr == NULL) {
 		/* Don't hold on to the L2T entry of a filter that failed. */
 		if (f->l2t != NULL) {
 			t4_l2t_release(f->l2t);
 			f->l2t = NULL;
 		}
 		return (ENOMEM);
 	}
 
 	fwr = wrtod(wr);
 	bzero(fwr, sizeof (*fwr));
 
 	/* Header: opcode, length, filter tid, and destination queue. */
 	fwr->op_pkd = htobe32(V_FW_WR_OP(FW_FILTER_WR));
 	fwr->len16_pkd = htobe32(FW_LEN16(*fwr));
 	fwr->tid_to_iq =
 	    htobe32(V_FW_FILTER_WR_TID(ftid) |
 		V_FW_FILTER_WR_RQTYPE(f->fs.type) |
 		V_FW_FILTER_WR_NOREPLY(0) |
 		V_FW_FILTER_WR_IQ(f->fs.iq));
 
 	/* Action to take on a match, including any header rewrites. */
 	fwr->del_filter_to_l2tix =
 	    htobe32(V_FW_FILTER_WR_RPTTID(f->fs.rpttid) |
 		V_FW_FILTER_WR_DROP(f->fs.action == FILTER_DROP) |
 		V_FW_FILTER_WR_DIRSTEER(f->fs.dirsteer) |
 		V_FW_FILTER_WR_MASKHASH(f->fs.maskhash) |
 		V_FW_FILTER_WR_DIRSTEERHASH(f->fs.dirsteerhash) |
 		V_FW_FILTER_WR_LPBK(f->fs.action == FILTER_SWITCH) |
 		V_FW_FILTER_WR_DMAC(f->fs.newdmac) |
 		V_FW_FILTER_WR_SMAC(f->fs.newsmac) |
 		V_FW_FILTER_WR_INSVLAN(f->fs.newvlan == VLAN_INSERT ||
 		    f->fs.newvlan == VLAN_REWRITE) |
 		V_FW_FILTER_WR_RMVLAN(f->fs.newvlan == VLAN_REMOVE ||
 		    f->fs.newvlan == VLAN_REWRITE) |
 		V_FW_FILTER_WR_HITCNTS(f->fs.hitcnts) |
 		V_FW_FILTER_WR_TXCHAN(f->fs.eport) |
 		V_FW_FILTER_WR_PRIO(f->fs.prio) |
 		V_FW_FILTER_WR_L2TIX(f->l2t ? f->l2t->idx : 0));
 
 	/* Match criteria: each field is given as a value and a mask. */
 	fwr->ethtype = htobe16(f->fs.val.ethtype);
 	fwr->ethtypem = htobe16(f->fs.mask.ethtype);
 	fwr->frag_to_ovlan_vldm =
 	    (V_FW_FILTER_WR_FRAG(f->fs.val.frag) |
 		V_FW_FILTER_WR_FRAGM(f->fs.mask.frag) |
 		V_FW_FILTER_WR_IVLAN_VLD(f->fs.val.vlan_vld) |
 		V_FW_FILTER_WR_OVLAN_VLD(f->fs.val.vnic_vld) |
 		V_FW_FILTER_WR_IVLAN_VLDM(f->fs.mask.vlan_vld) |
 		V_FW_FILTER_WR_OVLAN_VLDM(f->fs.mask.vnic_vld));
 	fwr->smac_sel = 0;
 	/* The reply is directed at the firmware event queue (fwq). */
 	fwr->rx_chan_rx_rpl_iq = htobe16(V_FW_FILTER_WR_RX_CHAN(0) |
 	    V_FW_FILTER_WR_RX_RPL_IQ(sc->sge.fwq.abs_id));
 	fwr->maci_to_matchtypem =
 	    htobe32(V_FW_FILTER_WR_MACI(f->fs.val.macidx) |
 		V_FW_FILTER_WR_MACIM(f->fs.mask.macidx) |
 		V_FW_FILTER_WR_FCOE(f->fs.val.fcoe) |
 		V_FW_FILTER_WR_FCOEM(f->fs.mask.fcoe) |
 		V_FW_FILTER_WR_PORT(f->fs.val.iport) |
 		V_FW_FILTER_WR_PORTM(f->fs.mask.iport) |
 		V_FW_FILTER_WR_MATCHTYPE(f->fs.val.matchtype) |
 		V_FW_FILTER_WR_MATCHTYPEM(f->fs.mask.matchtype));
 	fwr->ptcl = f->fs.val.proto;
 	fwr->ptclm = f->fs.mask.proto;
 	fwr->ttyp = f->fs.val.tos;
 	fwr->ttypm = f->fs.mask.tos;
 	fwr->ivlan = htobe16(f->fs.val.vlan);
 	fwr->ivlanm = htobe16(f->fs.mask.vlan);
 	fwr->ovlan = htobe16(f->fs.val.vnic);
 	fwr->ovlanm = htobe16(f->fs.mask.vnic);
 	/* The filter's dip/dport go to lip/lp, sip/sport to fip/fp. */
 	bcopy(f->fs.val.dip, fwr->lip, sizeof (fwr->lip));
 	bcopy(f->fs.mask.dip, fwr->lipm, sizeof (fwr->lipm));
 	bcopy(f->fs.val.sip, fwr->fip, sizeof (fwr->fip));
 	bcopy(f->fs.mask.sip, fwr->fipm, sizeof (fwr->fipm));
 	fwr->lp = htobe16(f->fs.val.dport);
 	fwr->lpm = htobe16(f->fs.mask.dport);
 	fwr->fp = htobe16(f->fs.val.sport);
 	fwr->fpm = htobe16(f->fs.mask.sport);
 	if (f->fs.newsmac)
 		bcopy(f->fs.smac, fwr->sma, sizeof (fwr->sma));
 
 	/* Mark the entry busy before the request can possibly complete. */
 	f->pending = 1;
 	sc->tids.ftids_in_use++;
 
 	t4_wrq_tx(sc, wr);
 	return (0);
 }
 
 /*
  * Queue the work request that deletes filter fidx and mark the entry
  * pending.  Returns ENOMEM if no work request could be allocated, 0
  * otherwise.
  */
 static int
 del_filter_wr(struct adapter *sc, int fidx)
 {
 	struct filter_entry *fe = &sc->tids.ftid_tab[fidx];
 	unsigned int ftid = sc->tids.ftid_base + fidx;
 	struct fw_filter_wr *req;
 	struct wrqe *wr;
 
 	wr = alloc_wrqe(sizeof(*req), &sc->sge.mgmtq);
 	if (wr == NULL)
 		return (ENOMEM);
 
 	req = wrtod(wr);
 	bzero(req, sizeof (*req));
 	t4_mk_filtdelwr(ftid, req, sc->sge.fwq.abs_id);
 
 	fe->pending = 1;
 	t4_wrq_tx(sc, wr);
 
 	return (0);
 }
 
 /*
  * Handle the reply to a filter work request.  Replies whose tid is not a
  * filter tid are ignored.  A FW_FILTER_WR_FLT_ADDED status turns the
  * pending entry into a valid one; any other status clears the entry (and
  * is logged unless it is FW_FILTER_WR_FLT_DELETED).  Threads sleeping on
  * the filter table are woken up in either case.  Always returns 0.
  */
 int
 t4_filter_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_set_tcb_rpl *rpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(rpl);
 	unsigned int status;
 	struct filter_entry *fe;
 
 	KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__,
 	    rss->opcode));
 
 	if (!is_ftid(sc, tid))
 		return (0);
 
 	/* Convert the tid into an index into the filter table. */
 	tid -= sc->tids.ftid_base;
 	fe = &sc->tids.ftid_tab[tid];
 	status = G_COOKIE(rpl->cookie);
 
 	mtx_lock(&sc->tids.ftid_lock);
 	if (status != FW_FILTER_WR_FLT_ADDED) {
 		if (status != FW_FILTER_WR_FLT_DELETED) {
 			/* Add or delete failed, display an error */
 			log(LOG_ERR,
 			    "filter %u setup failed with error %u\n",
 			    tid, status);
 		}
 
 		clear_filter(fe);
 		sc->tids.ftids_in_use--;
 	} else {
 		KASSERT(fe->pending, ("%s: filter[%u] isn't pending.",
 		    __func__, tid));
 		fe->smtidx = (be64toh(rpl->oldval) >> 24) & 0xff;
 		fe->pending = 0;  /* asynchronous setup completed */
 		fe->valid = 1;
 	}
 	wakeup(&sc->tids.ftid_tab);
 	mtx_unlock(&sc->tids.ftid_lock);
 
 	return (0);
 }
 
 static int
 get_sge_context(struct adapter *sc, struct t4_sge_context *cntxt)
 {
 	int rc;
 
 	if (cntxt->cid > M_CTXTQID)
 		return (EINVAL);
 
 	if (cntxt->mem_id != CTXT_EGRESS && cntxt->mem_id != CTXT_INGRESS &&
 	    cntxt->mem_id != CTXT_FLM && cntxt->mem_id != CTXT_CNM)
 		return (EINVAL);
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4ctxt");
 	if (rc)
 		return (rc);
 
 	if (sc->flags & FW_OK) {
 		rc = -t4_sge_ctxt_rd(sc, sc->mbox, cntxt->cid, cntxt->mem_id,
 		    &cntxt->data[0]);
 		if (rc == 0)
 			goto done;
 	}
 
 	/*
 	 * Read via firmware failed or wasn't even attempted.  Read directly via
 	 * the backdoor.
 	 */
 	rc = -t4_sge_ctxt_rd_bd(sc, cntxt->cid, cntxt->mem_id, &cntxt->data[0]);
 done:
 	end_synchronized_op(sc, 0);
 	return (rc);
 }
 
 /*
  * Copy a firmware image of fw->len bytes in from the user address
  * fw->data and hand it to t4_load_fw.  Refused with EBUSY once the
  * adapter is fully initialized.  Returns 0 or an errno (the copyin error,
  * or the negated t4_load_fw result).
  */
 static int
 load_fw(struct adapter *sc, struct t4_data *fw)
 {
 	int rc;
 	uint8_t *fw_data;
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4ldfw");
 	if (rc)
 		return (rc);
 
 	if (sc->flags & FULL_INIT_DONE) {
 		rc = EBUSY;
 		goto done;
 	}
 
 	/*
 	 * An M_WAITOK allocation sleeps until it can be satisfied and never
 	 * returns NULL, so there is no allocation failure to handle here.
 	 */
 	fw_data = malloc(fw->len, M_CXGBE, M_WAITOK);
 
 	rc = copyin(fw->data, fw_data, fw->len);
 	if (rc == 0)
 		rc = -t4_load_fw(sc, fw_data, fw->len);
 
 	free(fw_data, M_CXGBE);
 done:
 	end_synchronized_op(sc, 0);
 	return (rc);
 }
 
 static int
 read_card_mem(struct adapter *sc, int win, struct t4_mem_range *mr)
 {
 	uint32_t addr, off, remaining, i, n;
 	uint32_t *buf, *b;
 	uint32_t mw_base, mw_aperture;
 	int rc;
 	uint8_t *dst;
 
 	rc = validate_mem_range(sc, mr->addr, mr->len);
 	if (rc != 0)
 		return (rc);
 
 	memwin_info(sc, win, &mw_base, &mw_aperture);
 	buf = b = malloc(min(mr->len, mw_aperture), M_CXGBE, M_WAITOK);
 	addr = mr->addr;
 	remaining = mr->len;
 	dst = (void *)mr->data;
 
 	while (remaining) {
 		off = position_memwin(sc, win, addr);
 
 		/* number of bytes that we'll copy in the inner loop */
 		n = min(remaining, mw_aperture - off);
 		for (i = 0; i < n; i += 4)
 			*b++ = t4_read_reg(sc, mw_base + off + i);
 
 		rc = copyout(buf, dst, n);
 		if (rc != 0)
 			break;
 
 		b = buf;
 		dst += n;
 		remaining -= n;
 		addr += n;
 	}
 
 	free(buf, M_CXGBE);
 	return (rc);
 }
 
 static int
 read_i2c(struct adapter *sc, struct t4_i2c_data *i2cd)
 {
 	int rc;
 
 	if (i2cd->len == 0 || i2cd->port_id >= sc->params.nports)
 		return (EINVAL);
 
 	if (i2cd->len > sizeof(i2cd->data))
 		return (EFBIG);
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4i2crd");
 	if (rc)
 		return (rc);
 	rc = -t4_i2c_rd(sc, sc->mbox, i2cd->port_id, i2cd->dev_addr,
 	    i2cd->offset, i2cd->len, &i2cd->data[0]);
 	end_synchronized_op(sc, 0);
 
 	return (rc);
 }
 
 /*
  * Returns 1 if val is negative or lies within [lo, hi] (inclusive), and 0
  * otherwise.  Negative values are accepted regardless of the bounds.
  */
 static int
 in_range(int val, int lo, int hi)
 {
 
 	if (val < 0)
 		return (1);
 
 	return (lo <= val && val <= hi);
 }
 
 /*
  * Configure a TX scheduling class according to p.
  *
  * Two sub-commands are handled: SCHED_CLASS_SUBCMD_CONFIG, which passes
  * p->u.config.minmax to t4_sched_config, and SCHED_CLASS_SUBCMD_PARAMS,
  * which translates and validates p->u.params and passes the result to
  * t4_sched_params.  Only SCHED_CLASS_TYPE_PACKET is accepted.
  *
  * Returns EAGAIN before full initialization, EINVAL for unknown or missing
  * parameters, ERANGE for out-of-range values, or the negated result of
  * the firmware call.  Note that p->u.params may be modified: optional
  * parameters that were left negative are replaced with 0.
  */
 static int
 set_sched_class(struct adapter *sc, struct t4_sched_params *p)
 {
 	int fw_subcmd, fw_type, rc;
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4setsc");
 	if (rc)
 		return (rc);
 
 	if (!(sc->flags & FULL_INIT_DONE)) {
 		rc = EAGAIN;
 		goto done;
 	}
 
 	/*
 	 * Translate the cxgbetool parameters into T4 firmware parameters.  (The
 	 * sub-command and type are in common locations.)
 	 */
 	if (p->subcmd == SCHED_CLASS_SUBCMD_CONFIG)
 		fw_subcmd = FW_SCHED_SC_CONFIG;
 	else if (p->subcmd == SCHED_CLASS_SUBCMD_PARAMS)
 		fw_subcmd = FW_SCHED_SC_PARAMS;
 	else {
 		rc = EINVAL;
 		goto done;
 	}
 	if (p->type == SCHED_CLASS_TYPE_PACKET)
 		fw_type = FW_SCHED_TYPE_PKTSCHED;
 	else {
 		rc = EINVAL;
 		goto done;
 	}
 
 	if (fw_subcmd == FW_SCHED_SC_CONFIG) {
 		/* Vet our parameters ..*/
 		if (p->u.config.minmax < 0) {
 			rc = EINVAL;
 			goto done;
 		}
 
 		/* And pass the request to the firmware ...*/
 		rc = -t4_sched_config(sc, fw_type, p->u.config.minmax, 1);
 		goto done;
 	}
 
 	if (fw_subcmd == FW_SCHED_SC_PARAMS) {
 		int fw_level;
 		int fw_mode;
 		int fw_rateunit;
 		int fw_ratemode;
 
 		/* Map each SCHED_CLASS_* selector to its FW_SCHED_PARAMS_*. */
 		if (p->u.params.level == SCHED_CLASS_LEVEL_CL_RL)
 			fw_level = FW_SCHED_PARAMS_LEVEL_CL_RL;
 		else if (p->u.params.level == SCHED_CLASS_LEVEL_CL_WRR)
 			fw_level = FW_SCHED_PARAMS_LEVEL_CL_WRR;
 		else if (p->u.params.level == SCHED_CLASS_LEVEL_CH_RL)
 			fw_level = FW_SCHED_PARAMS_LEVEL_CH_RL;
 		else {
 			rc = EINVAL;
 			goto done;
 		}
 
 		if (p->u.params.mode == SCHED_CLASS_MODE_CLASS)
 			fw_mode = FW_SCHED_PARAMS_MODE_CLASS;
 		else if (p->u.params.mode == SCHED_CLASS_MODE_FLOW)
 			fw_mode = FW_SCHED_PARAMS_MODE_FLOW;
 		else {
 			rc = EINVAL;
 			goto done;
 		}
 
 		if (p->u.params.rateunit == SCHED_CLASS_RATEUNIT_BITS)
 			fw_rateunit = FW_SCHED_PARAMS_UNIT_BITRATE;
 		else if (p->u.params.rateunit == SCHED_CLASS_RATEUNIT_PKTS)
 			fw_rateunit = FW_SCHED_PARAMS_UNIT_PKTRATE;
 		else {
 			rc = EINVAL;
 			goto done;
 		}
 
 		if (p->u.params.ratemode == SCHED_CLASS_RATEMODE_REL)
 			fw_ratemode = FW_SCHED_PARAMS_RATE_REL;
 		else if (p->u.params.ratemode == SCHED_CLASS_RATEMODE_ABS)
 			fw_ratemode = FW_SCHED_PARAMS_RATE_ABS;
 		else {
 			rc = EINVAL;
 			goto done;
 		}
 
 		/*
 		 * Vet our parameters ...  (in_range also accepts negative
 		 * values; those are dealt with separately further down.)
 		 */
 		if (!in_range(p->u.params.channel, 0, 3) ||
 		    !in_range(p->u.params.cl, 0, is_t4(sc) ? 15 : 16) ||
 		    !in_range(p->u.params.minrate, 0, 10000000) ||
 		    !in_range(p->u.params.maxrate, 0, 10000000) ||
 		    !in_range(p->u.params.weight, 0, 100)) {
 			rc = ERANGE;
 			goto done;
 		}
 
 		/*
 		 * Translate any unset parameters into the firmware's
 		 * nomenclature and/or fail the call if the parameters
 		 * are required ...
 		 */
 		if (p->u.params.rateunit < 0 || p->u.params.ratemode < 0 ||
 		    p->u.params.channel < 0 || p->u.params.cl < 0) {
 			rc = EINVAL;
 			goto done;
 		}
 		if (p->u.params.minrate < 0)
 			p->u.params.minrate = 0;
 		/* maxrate is mandatory for the two rate-limit levels. */
 		if (p->u.params.maxrate < 0) {
 			if (p->u.params.level == SCHED_CLASS_LEVEL_CL_RL ||
 			    p->u.params.level == SCHED_CLASS_LEVEL_CH_RL) {
 				rc = EINVAL;
 				goto done;
 			} else
 				p->u.params.maxrate = 0;
 		}
 		/* weight is mandatory for the CL_WRR level. */
 		if (p->u.params.weight < 0) {
 			if (p->u.params.level == SCHED_CLASS_LEVEL_CL_WRR) {
 				rc = EINVAL;
 				goto done;
 			} else
 				p->u.params.weight = 0;
 		}
 		/* pktsize is mandatory for the two rate-limit levels. */
 		if (p->u.params.pktsize < 0) {
 			if (p->u.params.level == SCHED_CLASS_LEVEL_CL_RL ||
 			    p->u.params.level == SCHED_CLASS_LEVEL_CH_RL) {
 				rc = EINVAL;
 				goto done;
 			} else
 				p->u.params.pktsize = 0;
 		}
 
 		/* See what the firmware thinks of the request ... */
 		rc = -t4_sched_params(sc, fw_type, fw_level, fw_mode,
 		    fw_rateunit, fw_ratemode, p->u.params.channel,
 		    p->u.params.cl, p->u.params.minrate, p->u.params.maxrate,
 		    p->u.params.weight, p->u.params.pktsize, 1);
 		goto done;
 	}
 
 	/* Not reached for the two sub-commands accepted above. */
 	rc = EINVAL;
 done:
 	end_synchronized_op(sc, 0);
 	return (rc);
 }
 
 /*
  * Bind TX queue(s) of port p->port to scheduling class p->cl.  A
  * non-negative p->queue selects a single queue of the port; a negative
  * one applies the change to every TX queue of the port.  A negative
  * p->cl is sent to the firmware as 0xffffffff.
  *
  * Returns EAGAIN before full initialization, EINVAL for a bad port, queue
  * or class, or the negated result of t4_set_params.
  */
 static int
 set_sched_queue(struct adapter *sc, struct t4_sched_queue *p)
 {
 	struct port_info *pi = NULL;
 	struct sge_txq *txq;
 	uint32_t fw_mnem, fw_queue, fw_class;
 	int i, rc;
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4setsq");
 	if (rc != 0)
 		return (rc);
 
 	if ((sc->flags & FULL_INIT_DONE) == 0) {
 		rc = EAGAIN;
 		goto done;
 	}
 
 	if (p->port >= sc->params.nports) {
 		rc = EINVAL;
 		goto done;
 	}
 
 	pi = sc->port[p->port];
 	if (!in_range(p->queue, 0, pi->ntxq - 1) || !in_range(p->cl, 0, 7)) {
 		rc = EINVAL;
 		goto done;
 	}
 
 	/*
 	 * Create a template for the FW_PARAMS_CMD mnemonic and value (TX
 	 * Scheduling Class in this case).
 	 */
 	fw_mnem = (V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) |
 	    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_EQ_SCHEDCLASS_ETH));
 	fw_class = p->cl < 0 ? 0xffffffff : p->cl;
 
 	if (p->queue >= 0) {
 		/* Only the one TX queue that was asked for. */
 		txq = &sc->sge.txq[pi->first_txq + p->queue];
 		fw_queue = (fw_mnem | V_FW_PARAMS_PARAM_YZ(txq->eq.cntxt_id));
 		rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &fw_queue,
 		    &fw_class);
 	} else {
 		/* Every TX queue of the interface; stop at the first error. */
 		for_each_txq(pi, i, txq) {
 			fw_queue = (fw_mnem |
 			    V_FW_PARAMS_PARAM_YZ(txq->eq.cntxt_id));
 			rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1,
 			    &fw_queue, &fw_class);
 			if (rc != 0)
 				goto done;
 		}
 		rc = 0;
 	}
 done:
 	end_synchronized_op(sc, 0);
 	return (rc);
 }
 
 /*
  * Returns the value pci_find_cap reports for capability cap on the
  * adapter's device, or 0 if pci_find_cap fails.
  */
 int
 t4_os_find_pci_capability(struct adapter *sc, int cap)
 {
 	int where;
 
 	if (pci_find_cap(sc->dev, cap, &where) != 0)
 		return (0);
 
 	return (where);
 }
 
 /*
  * Save the adapter's PCI configuration via pci_cfg_save.  Always returns 0.
  */
 int
 t4_os_pci_save_state(struct adapter *sc)
 {
 	device_t dev = sc->dev;
 	struct pci_devinfo *dinfo = device_get_ivars(dev);
 
 	pci_cfg_save(dev, dinfo, 0);
 
 	return (0);
 }
 
 /*
  * Restore the adapter's PCI configuration via pci_cfg_restore.  Always
  * returns 0.
  */
 int
 t4_os_pci_restore_state(struct adapter *sc)
 {
 	device_t dev = sc->dev;
 	struct pci_devinfo *dinfo = device_get_ivars(dev);
 
 	pci_cfg_restore(dev, dinfo);
 
 	return (0);
 }
 
 /*
  * Print a message on port idx's interface describing its current
  * transceiver module type (pi->mod_type).
  */
 void
 t4_os_portmod_changed(const struct adapter *sc, int idx)
 {
 	/* Names of the module types that have one, indexed by type. */
 	static const char *mod_str[] = {
 		NULL, "LR", "SR", "ER", "TWINAX", "active TWINAX", "LRM"
 	};
 	struct port_info *pi = sc->port[idx];
 
 	switch (pi->mod_type) {
 	case FW_PORT_MOD_TYPE_NONE:
 		if_printf(pi->ifp, "transceiver unplugged.\n");
 		break;
 	case FW_PORT_MOD_TYPE_UNKNOWN:
 		if_printf(pi->ifp, "unknown transceiver inserted.\n");
 		break;
 	case FW_PORT_MOD_TYPE_NOTSUPPORTED:
 		if_printf(pi->ifp, "unsupported transceiver inserted.\n");
 		break;
 	default:
 		if (pi->mod_type > 0 && pi->mod_type < nitems(mod_str)) {
 			if_printf(pi->ifp, "%s transceiver inserted.\n",
 			    mod_str[pi->mod_type]);
 		} else {
 			if_printf(pi->ifp, "transceiver (type %d) inserted.\n",
 			    pi->mod_type);
 		}
 		break;
 	}
 }
 
 /*
  * Propagate a link state change on port idx to its ifnet.  On link up the
  * recorded link-down reason is reset to -1 and the baudrate is refreshed
  * from link_cfg.speed; on link down a non-negative reason is recorded.
  */
 void
 t4_os_link_changed(struct adapter *sc, int idx, int link_stat, int reason)
 {
 	struct port_info *pi = sc->port[idx];
 	struct ifnet *ifp = pi->ifp;
 
 	if (!link_stat) {
 		if (reason >= 0)
 			pi->linkdnrc = reason;
 		if_link_state_change(ifp, LINK_STATE_DOWN);
 		return;
 	}
 
 	pi->linkdnrc = -1;
 	ifp->if_baudrate = IF_Mbps(pi->link_cfg.speed);
 	if_link_state_change(ifp, LINK_STATE_UP);
 }
 
 void
 t4_iterate(void (*func)(struct adapter *, void *), void *arg)
 {
 	struct adapter *sc;
 
 	sx_slock(&t4_list_lock);
 	SLIST_FOREACH(sc, &t4_list, link) {
 		/*
 		 * func should not make any assumptions about what state sc is
 		 * in - the only guarantee is that sc->sc_lock is a valid lock.
 		 */
 		func(sc, arg);
 	}
 	sx_sunlock(&t4_list_lock);
 }
 
 /* Open handler for the cdev: nothing to set up, always succeeds. */
 static int
 t4_open(struct cdev *dev, int flags, int type, struct thread *td)
 {
 
 	return (0);
 }
 
 /* Close handler for the cdev: nothing to tear down, always succeeds. */
 static int
 t4_close(struct cdev *dev, int flags, int type, struct thread *td)
 {
 
 	return (0);
 }
 
 /*
  * ioctl handler for the adapter's cdev.
  *
  * The caller must pass priv_check(td, PRIV_DRIVER).  The adapter is taken
  * from dev->si_drv1 and data points at the command-specific structure.
  * Most commands are handed to a helper function; register access, the
  * register dump and statistics clearing are handled inline.  Unknown
  * commands return EINVAL.
  */
 static int
 t4_ioctl(struct cdev *dev, unsigned long cmd, caddr_t data, int fflag,
     struct thread *td)
 {
 	int rc;
 	struct adapter *sc = dev->si_drv1;
 
 	rc = priv_check(td, PRIV_DRIVER);
 	if (rc != 0)
 		return (rc);
 
 	switch (cmd) {
 	case CHELSIO_T4_GETREG: {
 		struct t4_reg *edata = (struct t4_reg *)data;
 
 		/* The address must be 4-byte aligned and below mmio_len. */
 		if ((edata->addr & 0x3) != 0 || edata->addr >= sc->mmio_len)
 			return (EFAULT);
 
 		if (edata->size == 4)
 			edata->val = t4_read_reg(sc, edata->addr);
 		else if (edata->size == 8)
 			edata->val = t4_read_reg64(sc, edata->addr);
 		else
 			return (EINVAL);
 
 		break;
 	}
 	case CHELSIO_T4_SETREG: {
 		struct t4_reg *edata = (struct t4_reg *)data;
 
 		/* The address must be 4-byte aligned and below mmio_len. */
 		if ((edata->addr & 0x3) != 0 || edata->addr >= sc->mmio_len)
 			return (EFAULT);
 
 		if (edata->size == 4) {
 			/* A 32-bit write must not carry any upper bits. */
 			if (edata->val & 0xffffffff00000000)
 				return (EINVAL);
 			t4_write_reg(sc, edata->addr, (uint32_t) edata->val);
 		} else if (edata->size == 8)
 			t4_write_reg64(sc, edata->addr, edata->val);
 		else
 			return (EINVAL);
 		break;
 	}
 	case CHELSIO_T4_REGDUMP: {
 		struct t4_regdump *regs = (struct t4_regdump *)data;
 		int reglen = is_t4(sc) ? T4_REGDUMP_SIZE : T5_REGDUMP_SIZE;
 		uint8_t *buf;
 
 		if (regs->len < reglen) {
 			regs->len = reglen; /* hint to the caller */
 			return (ENOBUFS);
 		}
 
 		/* Dump into a zeroed kernel buffer, then copy it out. */
 		regs->len = reglen;
 		buf = malloc(reglen, M_CXGBE, M_WAITOK | M_ZERO);
 		t4_get_regs(sc, regs, buf);
 		rc = copyout(buf, regs->data, reglen);
 		free(buf, M_CXGBE);
 		break;
 	}
 	case CHELSIO_T4_GET_FILTER_MODE:
 		rc = get_filter_mode(sc, (uint32_t *)data);
 		break;
 	case CHELSIO_T4_SET_FILTER_MODE:
 		rc = set_filter_mode(sc, *(uint32_t *)data);
 		break;
 	case CHELSIO_T4_GET_FILTER:
 		rc = get_filter(sc, (struct t4_filter *)data);
 		break;
 	case CHELSIO_T4_SET_FILTER:
 		rc = set_filter(sc, (struct t4_filter *)data);
 		break;
 	case CHELSIO_T4_DEL_FILTER:
 		rc = del_filter(sc, (struct t4_filter *)data);
 		break;
 	case CHELSIO_T4_GET_SGE_CONTEXT:
 		rc = get_sge_context(sc, (struct t4_sge_context *)data);
 		break;
 	case CHELSIO_T4_LOAD_FW:
 		rc = load_fw(sc, (struct t4_data *)data);
 		break;
 	case CHELSIO_T4_GET_MEM:
 		/* Card memory is read through memory window 2. */
 		rc = read_card_mem(sc, 2, (struct t4_mem_range *)data);
 		break;
 	case CHELSIO_T4_GET_I2C:
 		rc = read_i2c(sc, (struct t4_i2c_data *)data);
 		break;
 	case CHELSIO_T4_CLEAR_STATS: {
 		int i;
 		u_int port_id = *(uint32_t *)data;
 		struct port_info *pi;
 
 		if (port_id >= sc->params.nports)
 			return (EINVAL);
 		pi = sc->port[port_id];
 
 		/* MAC stats */
 		t4_clr_port_stats(sc, pi->tx_chan);
 
 		/* Per-queue software counters, once the port has queues. */
 		if (pi->flags & PORT_INIT_DONE) {
 			struct sge_rxq *rxq;
 			struct sge_txq *txq;
 			struct sge_wrq *wrq;
 
 			for_each_rxq(pi, i, rxq) {
 #if defined(INET) || defined(INET6)
 				rxq->lro.lro_queued = 0;
 				rxq->lro.lro_flushed = 0;
 #endif
 				rxq->rxcsum = 0;
 				rxq->vlan_extraction = 0;
 			}
 
 			for_each_txq(pi, i, txq) {
 				txq->txcsum = 0;
 				txq->tso_wrs = 0;
 				txq->vlan_insertion = 0;
 				txq->imm_wrs = 0;
 				txq->sgl_wrs = 0;
 				txq->txpkt_wrs = 0;
 				txq->txpkts_wrs = 0;
 				txq->txpkts_pkts = 0;
 				txq->br->br_drops = 0;
 				txq->no_dmamap = 0;
 				txq->no_desc = 0;
 			}
 
 #ifdef TCP_OFFLOAD
 			/* nothing to clear for each ofld_rxq */
 
 			for_each_ofld_txq(pi, i, wrq) {
 				wrq->tx_wrs = 0;
 				wrq->no_desc = 0;
 			}
 #endif
 			/* The port's control queue. */
 			wrq = &sc->sge.ctrlq[pi->port_id];
 			wrq->tx_wrs = 0;
 			wrq->no_desc = 0;
 		}
 		break;
 	}
 	case CHELSIO_T4_SCHED_CLASS:
 		rc = set_sched_class(sc, (struct t4_sched_params *)data);
 		break;
 	case CHELSIO_T4_SCHED_QUEUE:
 		rc = set_sched_queue(sc, (struct t4_sched_queue *)data);
 		break;
 	case CHELSIO_T4_GET_TRACER:
 		rc = t4_get_tracer(sc, (struct t4_tracer *)data);
 		break;
 	case CHELSIO_T4_SET_TRACER:
 		rc = t4_set_tracer(sc, (struct t4_tracer *)data);
 		break;
 	default:
 		rc = EINVAL;
 	}
 
 	return (rc);
 }
 
 #ifdef TCP_OFFLOAD
 void
 t4_iscsi_init(struct ifnet *ifp, unsigned int tag_mask,
     const unsigned int *pgsz_order)
 {
 	struct port_info *pi = ifp->if_softc;
 	struct adapter *sc = pi->adapter;
 
 	t4_write_reg(sc, A_ULP_RX_ISCSI_TAGMASK, tag_mask);
 	t4_write_reg(sc, A_ULP_RX_ISCSI_PSZ, V_HPZ0(pgsz_order[0]) |
 		V_HPZ1(pgsz_order[1]) | V_HPZ2(pgsz_order[2]) |
 		V_HPZ3(pgsz_order[3]));
 }
 
 static int
 toe_capability(struct port_info *pi, int enable)
 {
 	int rc;
 	struct adapter *sc = pi->adapter;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	if (!is_offload(sc))
 		return (ENODEV);
 
 	if (enable) {
 		if (!(sc->flags & FULL_INIT_DONE)) {
 			rc = cxgbe_init_synchronized(pi);
 			if (rc)
 				return (rc);
 		}
 
 		if (isset(&sc->offload_map, pi->port_id))
 			return (0);
 
 		if (!(sc->flags & TOM_INIT_DONE)) {
 			rc = t4_activate_uld(sc, ULD_TOM);
 			if (rc == EAGAIN) {
 				log(LOG_WARNING,
 				    "You must kldload t4_tom.ko before trying "
 				    "to enable TOE on a cxgbe interface.\n");
 			}
 			if (rc != 0)
 				return (rc);
 			KASSERT(sc->tom_softc != NULL,
 			    ("%s: TOM activated but softc NULL", __func__));
 			KASSERT(sc->flags & TOM_INIT_DONE,
 			    ("%s: TOM activated but flag not set", __func__));
 		}
 
 		setbit(&sc->offload_map, pi->port_id);
 	} else {
 		if (!isset(&sc->offload_map, pi->port_id))
 			return (0);
 
 		KASSERT(sc->flags & TOM_INIT_DONE,
 		    ("%s: TOM never initialized?", __func__));
 		clrbit(&sc->offload_map, pi->port_id);
 	}
 
 	return (0);
 }
 
 /*
  * Add an upper layer driver to the global list.
  */
 int
 t4_register_uld(struct uld_info *ui)
 {
 	int rc = 0;
 	struct uld_info *u;
 
 	sx_xlock(&t4_uld_list_lock);
 	SLIST_FOREACH(u, &t4_uld_list, link) {
 	    if (u->uld_id == ui->uld_id) {
 		    rc = EEXIST;
 		    goto done;
 	    }
 	}
 
 	SLIST_INSERT_HEAD(&t4_uld_list, ui, link);
 	ui->refcount = 0;
 done:
 	sx_xunlock(&t4_uld_list_lock);
 	return (rc);
 }
 
 int
 t4_unregister_uld(struct uld_info *ui)
 {
 	int rc = EINVAL;
 	struct uld_info *u;
 
 	sx_xlock(&t4_uld_list_lock);
 
 	SLIST_FOREACH(u, &t4_uld_list, link) {
 	    if (u == ui) {
 		    if (ui->refcount > 0) {
 			    rc = EBUSY;
 			    goto done;
 		    }
 
 		    SLIST_REMOVE(&t4_uld_list, ui, uld_info, link);
 		    rc = 0;
 		    goto done;
 	    }
 	}
 done:
 	sx_xunlock(&t4_uld_list_lock);
 	return (rc);
 }
 
 int
 t4_activate_uld(struct adapter *sc, int id)
 {
 	int rc = EAGAIN;
 	struct uld_info *ui;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	sx_slock(&t4_uld_list_lock);
 
 	SLIST_FOREACH(ui, &t4_uld_list, link) {
 		if (ui->uld_id == id) {
 			rc = ui->activate(sc);
 			if (rc == 0)
 				ui->refcount++;
 			goto done;
 		}
 	}
 done:
 	sx_sunlock(&t4_uld_list_lock);
 
 	return (rc);
 }
 
 int
 t4_deactivate_uld(struct adapter *sc, int id)
 {
 	int rc = EINVAL;
 	struct uld_info *ui;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	sx_slock(&t4_uld_list_lock);
 
 	SLIST_FOREACH(ui, &t4_uld_list, link) {
 		if (ui->uld_id == id) {
 			rc = ui->deactivate(sc);
 			if (rc == 0)
 				ui->refcount--;
 			goto done;
 		}
 	}
 done:
 	sx_sunlock(&t4_uld_list_lock);
 
 	return (rc);
 }
 #endif
 
 /*
  * Come up with reasonable defaults for some of the tunables, provided they're
  * not set by the user (in which case we'll use the values as is).  Values
  * that are out of range are replaced or adjusted as well.
  */
 static void
 tweak_tunables(void)
 {
 	int nc = mp_ncpus;	/* our snapshot of the number of CPUs */
 
 /* Queue counts below 1 default to min(number of CPUs, limit). */
 #define	DEFAULT_NQ(tunable, limit) do {					\
 	if ((tunable) < 1)						\
 		(tunable) = min(nc, (limit));				\
 } while (0)
 /* Indices outside [lowest, count) are replaced with dflt. */
 #define	FIX_IDX(tunable, lowest, count, dflt) do {			\
 	if ((tunable) < (lowest) || (tunable) >= (count))		\
 		(tunable) = (dflt);					\
 } while (0)
 
 	DEFAULT_NQ(t4_ntxq10g, NTXQ_10G);
 	DEFAULT_NQ(t4_ntxq1g, NTXQ_1G);
 	DEFAULT_NQ(t4_nrxq10g, NRXQ_10G);
 	DEFAULT_NQ(t4_nrxq1g, NRXQ_1G);
 
 #ifdef TCP_OFFLOAD
 	DEFAULT_NQ(t4_nofldtxq10g, NOFLDTXQ_10G);
 	DEFAULT_NQ(t4_nofldtxq1g, NOFLDTXQ_1G);
 	DEFAULT_NQ(t4_nofldrxq10g, NOFLDRXQ_10G);
 	DEFAULT_NQ(t4_nofldrxq1g, NOFLDRXQ_1G);
 
 	if (t4_toecaps_allowed == -1)
 		t4_toecaps_allowed = FW_CAPS_CONFIG_TOE;
 #else
 	if (t4_toecaps_allowed == -1)
 		t4_toecaps_allowed = 0;
 #endif
 
 #ifdef DEV_NETMAP
 	DEFAULT_NQ(t4_nnmtxq10g, NNMTXQ_10G);
 	DEFAULT_NQ(t4_nnmtxq1g, NNMTXQ_1G);
 	DEFAULT_NQ(t4_nnmrxq10g, NNMRXQ_10G);
 	DEFAULT_NQ(t4_nnmrxq1g, NNMRXQ_1G);
 #endif
 
 	/* Timer indices start at 0; -1 is accepted for the counter index. */
 	FIX_IDX(t4_tmr_idx_10g, 0, SGE_NTIMERS, TMR_IDX_10G);
 	FIX_IDX(t4_pktc_idx_10g, -1, SGE_NCOUNTERS, PKTC_IDX_10G);
 	FIX_IDX(t4_tmr_idx_1g, 0, SGE_NTIMERS, TMR_IDX_1G);
 	FIX_IDX(t4_pktc_idx_1g, -1, SGE_NCOUNTERS, PKTC_IDX_1G);
 
 #undef FIX_IDX
 #undef DEFAULT_NQ
 
 	/* Queue sizes: at least 128, rx size bumped up to a multiple of 8. */
 	if (t4_qsize_txq < 128)
 		t4_qsize_txq = 128;
 
 	if (t4_qsize_rxq < 128)
 		t4_qsize_rxq = 128;
 	for (; (t4_qsize_rxq & 7) != 0; t4_qsize_rxq++)
 		continue;
 
 	/* Drop any interrupt type bits other than the three known ones. */
 	t4_intr_types &= INTR_MSIX | INTR_MSI | INTR_INTX;
 }
 
 static struct sx mlu;	/* mod load unload */
 SX_SYSINIT(cxgbe_mlu, &mlu, "cxgbe mod load/unload");
 
 /*
  * Module event handler, registered for both the t4nex and the t5nex
  * modules.  "loaded" counts the MOD_LOAD events seen so that the shared
  * state (SGE, adapter/ULD lists and their locks, tracer, tunables) is set
  * up on the first load and torn down on the last unload only.
  *
  * The final unload is refused with EBUSY while any adapter or ULD is
  * still on its list, or while clusters with the driver's custom free
  * routine are still outstanding after a few retries.  Every refusal
  * restores "loaded" to its previous value so that a later unload attempt
  * goes through the same checks and teardown again, instead of being
  * mistaken for a non-final unload.
  */
 static int
 mod_event(module_t mod, int cmd, void *arg)
 {
 	int rc = 0;
 	static int loaded = 0;
 
 	switch (cmd) {
 	case MOD_LOAD:
 		sx_xlock(&mlu);
 		if (loaded++ == 0) {
 			t4_sge_modload();
 			sx_init(&t4_list_lock, "T4/T5 adapters");
 			SLIST_INIT(&t4_list);
 #ifdef TCP_OFFLOAD
 			sx_init(&t4_uld_list_lock, "T4/T5 ULDs");
 			SLIST_INIT(&t4_uld_list);
 #endif
 			t4_tracer_modload();
 			tweak_tunables();
 		}
 		sx_xunlock(&mlu);
 		break;
 
 	case MOD_UNLOAD:
 		sx_xlock(&mlu);
 		if (--loaded == 0) {
 			int tries;
 
 			sx_slock(&t4_list_lock);
 			if (!SLIST_EMPTY(&t4_list)) {
 				rc = EBUSY;
 				loaded++;	/* undo earlier decrement */
 				sx_sunlock(&t4_list_lock);
 				goto done_unload;
 			}
 #ifdef TCP_OFFLOAD
 			sx_slock(&t4_uld_list_lock);
 			if (!SLIST_EMPTY(&t4_uld_list)) {
 				rc = EBUSY;
 				loaded++;	/* undo earlier decrement */
 				sx_sunlock(&t4_uld_list_lock);
 				sx_sunlock(&t4_list_lock);
 				goto done_unload;
 			}
 #endif
 			/* Give outstanding clusters some time to be freed. */
 			tries = 0;
 			while (tries++ < 5 && t4_sge_extfree_refs() != 0) {
 				uprintf("%ju clusters with custom free routine "
 				    "still is use.\n", t4_sge_extfree_refs());
 				pause("t4unload", 2 * hz);
 			}
 #ifdef TCP_OFFLOAD
 			sx_sunlock(&t4_uld_list_lock);
 #endif
 			sx_sunlock(&t4_list_lock);
 
 			if (t4_sge_extfree_refs() == 0) {
 				t4_tracer_modunload();
 #ifdef TCP_OFFLOAD
 				sx_destroy(&t4_uld_list_lock);
 #endif
 				sx_destroy(&t4_list_lock);
 				t4_sge_modunload();
 				loaded = 0;
 			} else {
 				rc = EBUSY;
 				loaded++;	/* undo earlier decrement */
 			}
 		}
 done_unload:
 		sx_xunlock(&mlu);
 		break;
 	}
 
 	return (rc);
 }
 
 /* Device classes, one per DRIVER_MODULE declaration below. */
 static devclass_t t4_devclass, t5_devclass;
 static devclass_t cxgbe_devclass, cxl_devclass;
 
 /* t4nex: declared on the pci bus, with mod_event as its event handler. */
 DRIVER_MODULE(t4nex, pci, t4_driver, t4_devclass, mod_event, 0);
 MODULE_VERSION(t4nex, 1);
 MODULE_DEPEND(t4nex, firmware, 1, 1, 1);
 
 /* t5nex: same arrangement as t4nex, sharing the mod_event handler. */
 DRIVER_MODULE(t5nex, pci, t5_driver, t5_devclass, mod_event, 0);
 MODULE_VERSION(t5nex, 1);
 MODULE_DEPEND(t5nex, firmware, 1, 1, 1);
 
 /* cxgbe: declared with t4nex as its parent; no event handler. */
 DRIVER_MODULE(cxgbe, t4nex, cxgbe_driver, cxgbe_devclass, 0, 0);
 MODULE_VERSION(cxgbe, 1);
 
 /* cxl: declared with t5nex as its parent; no event handler. */
 DRIVER_MODULE(cxl, t5nex, cxl_driver, cxl_devclass, 0, 0);
 MODULE_VERSION(cxl, 1);
Index: user/ae/inet6/sys/geom/geom_map.c
===================================================================
--- user/ae/inet6/sys/geom/geom_map.c	(revision 271452)
+++ user/ae/inet6/sys/geom/geom_map.c	(revision 271453)
@@ -1,394 +1,394 @@
 /*-
  * Copyright (c) 2010-2011 Aleksandr Rybalko <ray@dlink.ua>
  *   based on geom_redboot.c
  * Copyright (c) 2009 Sam Leffler, Errno Consulting
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer,
  *    without modification.
  * 2. Redistributions in binary form must reproduce at minimum a disclaimer
  *    similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any
  *    redistribution must be conditioned upon including a substantially
  *    similar Disclaimer requirement for further binary redistribution.
  *
  * NO WARRANTY
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY
  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
  * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY,
  * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
  * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  * THE POSSIBILITY OF SUCH DAMAGES.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/errno.h>
 #include <sys/endian.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/fcntl.h>
 #include <sys/malloc.h>
 #include <sys/bio.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/sbuf.h>
 
 #include <geom/geom.h>
 #include <geom/geom_slice.h>
 
 #define	MAP_CLASS_NAME	"MAP"
 #define	MAP_MAXSLICE	64
 #define	MAP_MAX_MARKER_LEN	64
 
 struct g_map_softc {
 	off_t		 offset[MAP_MAXSLICE];	/* offset in flash */
 	off_t		 size[MAP_MAXSLICE];	/* image size in bytes */
 	off_t		 entry[MAP_MAXSLICE];
 	off_t		 dsize[MAP_MAXSLICE];
 	uint8_t		 readonly[MAP_MAXSLICE];
 	g_access_t	*parent_access;
 };
 
 static int
 g_map_access(struct g_provider *pp, int dread, int dwrite, int dexcl)
 {
 	struct g_geom *gp;
 	struct g_slicer *gsp;
 	struct g_map_softc *sc;
 
 	gp = pp->geom;
 	gsp = gp->softc;
 	sc = gsp->softc;
 
 	if (dwrite > 0 && sc->readonly[pp->index])
 		return (EPERM);
 
 	return (sc->parent_access(pp, dread, dwrite, dexcl)); 
 }
 
 static int
 g_map_start(struct bio *bp)
 {
 	struct g_provider *pp;
 	struct g_geom *gp;
 	struct g_map_softc *sc;
 	struct g_slicer *gsp;
 	int idx;
 
 	pp = bp->bio_to;
 	idx = pp->index;
 	gp = pp->geom;
 	gsp = gp->softc;
 	sc = gsp->softc;
 
 	if (bp->bio_cmd == BIO_GETATTR) {
 		if (g_handleattr_int(bp, MAP_CLASS_NAME "::entry",
 		    sc->entry[idx])) {
 			return (1);
 		}
 		if (g_handleattr_int(bp, MAP_CLASS_NAME "::dsize",
 		    sc->dsize[idx])) {
 			return (1);
 		}
 	}
 
 	return (0);
 }
 
 static void
 g_map_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
     struct g_consumer *cp __unused, struct g_provider *pp)
 {
 	struct g_map_softc *sc;
 	struct g_slicer *gsp;
 
 	gsp = gp->softc;
 	sc = gsp->softc;
 	g_slice_dumpconf(sb, indent, gp, cp, pp);
 	if (pp != NULL) {
 		if (indent == NULL) {
 			sbuf_printf(sb, " entry %jd", (intmax_t)sc->entry[pp->index]);
 			sbuf_printf(sb, " dsize %jd", (intmax_t)sc->dsize[pp->index]);
 		} else {
 			sbuf_printf(sb, "%s<entry>%jd</entry>\n", indent,
 			    (intmax_t)sc->entry[pp->index]);
 			sbuf_printf(sb, "%s<dsize>%jd</dsize>\n", indent,
 			    (intmax_t)sc->dsize[pp->index]);
 		}
 	}
 }
 
 static int
 find_marker(struct g_consumer *cp, const char *line, off_t *offset)
 {
 	off_t search_start, search_offset, search_step;
 	size_t sectorsize;
 	uint8_t *buf;
 	char *op, key[MAP_MAX_MARKER_LEN], search_key[MAP_MAX_MARKER_LEN];
 	int ret, c;
 
 	/* Try convert to numeric first */
 	*offset = strtouq(line, &op, 0);
 	if (*op == '\0') 
 		return (0);
 
 	bzero(search_key, MAP_MAX_MARKER_LEN);
 	sectorsize = cp->provider->sectorsize;
 
 	ret = sscanf(line, "search:%qi:%qi:%63c",
 	    &search_start, &search_step, search_key);
 	if (ret < 3)
 		return (1);
 
 	if (bootverbose) {
-		printf("MAP: search key \"%s\" from 0x%jx, step 0x%jx\n",
-		    search_key, (intmax_t)search_start, (intmax_t)search_step);
+		printf("MAP: search %s for key \"%s\" from 0x%jx, step 0x%jx\n",
+		    cp->geom->name, search_key, (intmax_t)search_start, (intmax_t)search_step);
 	}
 
 	/* error if search_key is empty */
 	if (strlen(search_key) < 1)
 		return (1);
 
 	/* sscanf successful, and we start marker search */
 	for (search_offset = search_start;
 	     search_offset < cp->provider->mediasize;
 	     search_offset += search_step) {
 
 		g_topology_unlock();
 		buf = g_read_data(cp, rounddown(search_offset, sectorsize),
 		    roundup(strlen(search_key), sectorsize), NULL);
 		g_topology_lock();
 
 		/* Wildcard, replace '.' with byte from data */
 		/* TODO: add support wildcard escape '\.' */
 
 		strncpy(key, search_key, MAP_MAX_MARKER_LEN);
 
 		for (c = 0; c < MAP_MAX_MARKER_LEN && key[c]; c++) {
 			if (key[c] == '.') {
 				key[c] = ((char *)(buf + 
 				    (search_offset % sectorsize)))[c];
 			}
 		}
 
 		if (buf != NULL && strncmp(buf + search_offset % sectorsize,
 		    key, strlen(search_key)) == 0) {
 			g_free(buf);
 			/* Marker found, so return their offset */
 			*offset = search_offset;
 			return (0);
 		}
 		g_free(buf);
 	}
 
 	/* Marker not found */
 	return (1);
 }
 
 static int
 g_map_parse_part(struct g_class *mp, struct g_provider *pp,
     struct g_consumer *cp, struct g_geom *gp, struct g_map_softc *sc, int i)
 {
 	const char *value, *name;
 	char *op;
 	off_t start, end, offset, size, dsize;
 	int readonly, ret;
 
 	/* hint.map.0.at="cfid0" - bind to cfid0 media */
 	if (resource_string_value("map", i, "at", &value) != 0)
 		return (1);
 
 	/* Check if this correct provider */
 	if (strcmp(pp->name, value) != 0)
 		return (1);
 
 	/*
 	 * hint.map.0.name="uboot" - name of partition, will be available
 	 * as "/dev/map/uboot"
 	 */
 	if (resource_string_value("map", i, "name", &name) != 0) {
 		if (bootverbose)
 			printf("MAP: hint.map.%d has no name\n", i);
 		return (1);
 	}
 
 	/*
 	 * hint.map.0.start="0x00010000" - partition start at 0x00010000
 	 * or hint.map.0.start="search:0x00010000:0x200:marker text" -
 	 * search for text "marker text", begin at 0x10000, step 0x200
 	 * until we found marker or end of media reached
 	 */ 
 	if (resource_string_value("map", i, "start", &value) != 0) {
 		if (bootverbose)
 			printf("MAP: \"%s\" has no start value\n", name);
 		return (1);
 	}
 	if (find_marker(cp, value, &start) != 0) {
 		if (bootverbose) {
 			printf("MAP: \"%s\" can't parse/use start value\n",
 			    name);
 		}
 		return (1);
 	}
 
 	/* like "start" */
 	if (resource_string_value("map", i, "end", &value) != 0) {
 		if (bootverbose)
 			printf("MAP: \"%s\" has no end value\n", name);
 		return (1);
 	}
 	if (find_marker(cp, value, &end) != 0) {
 		if (bootverbose) {
 			printf("MAP: \"%s\" can't parse/use start value\n",
 			    name);
 		}
 		return (1);
 	}
 
 	/* variable readonly optional, disable write access */
 	if (resource_int_value("map", i, "readonly", &readonly) != 0)
 		readonly = 0;
 
 	/* offset of partition data, from partition begin */
 	if (resource_string_value("map", i, "offset", &value) == 0) {
 		offset = strtouq(value, &op, 0);
 		if (*op != '\0') {
 			if (bootverbose) {
 				printf("MAP: \"%s\" can't parse offset\n",
 				    name);
 			}
 			return (1);
 		}
 	} else {
 		offset = 0;
 	}
 
 	/* partition data size */
 	if (resource_string_value("map", i, "dsize", &value) == 0) {
 		dsize = strtouq(value, &op, 0);
 		if (*op != '\0') {
 			if (bootverbose) {
 				printf("MAP: \"%s\" can't parse dsize\n", 
 				    name);
 			}
 			return (1);
 		}
 	} else {
 		dsize = 0;
 	}
 
 	size = end - start;
 	if (dsize == 0)
 		dsize = size - offset;
 
 	/* end is 0 or size is 0, No MAP - so next */
 	if (end < start) {
 		if (bootverbose) {
 			printf("MAP: \"%s\", \"end\" less than "
 			    "\"start\"\n", name);
 		}
 		return (1);
 	}
 
 	if (offset + dsize > size) {
 		if (bootverbose) {
 			printf("MAP: \"%s\", \"dsize\" bigger than "
 			    "partition - offset\n", name);
 		}
 		return (1);
 	}
 
 	ret = g_slice_config(gp, i, G_SLICE_CONFIG_SET, start + offset,
 	    dsize, cp->provider->sectorsize, "map/%s", name);
 	if (ret != 0) {
 		if (bootverbose) {
 			printf("MAP: g_slice_config returns %d for \"%s\"\n", 
 			    ret, name);
 		}
 		return (1);
 	}
 
 	if (bootverbose) {
-		printf("MAP: %jxx%jx, data=%jxx%jx "
+		printf("MAP: %s: %jxx%jx, data=%jxx%jx "
 		    "\"/dev/map/%s\"\n",
-		    (intmax_t)start, (intmax_t)size, (intmax_t)offset,
+		    cp->geom->name, (intmax_t)start, (intmax_t)size, (intmax_t)offset,
 		    (intmax_t)dsize, name);
 	}
 
 	sc->offset[i] = start;
 	sc->size[i] = size;
 	sc->entry[i] = offset;
 	sc->dsize[i] = dsize;
 	sc->readonly[i] = readonly ? 1 : 0;
 
 	return (0);
 }
 
 static struct g_geom *
 g_map_taste(struct g_class *mp, struct g_provider *pp, int insist __unused)
 {
 	struct g_map_softc *sc;
 	struct g_consumer *cp;
 	struct g_geom *gp;
 	int i;
 
 	g_trace(G_T_TOPOLOGY, "map_taste(%s,%s)", mp->name, pp->name);
 	g_topology_assert();
 	if (strcmp(pp->geom->class->name, MAP_CLASS_NAME) == 0)
 		return (NULL);
 
 	gp = g_slice_new(mp, MAP_MAXSLICE, pp, &cp, &sc, sizeof(*sc),
 	    g_map_start);
 	if (gp == NULL)
 		return (NULL);
 
 	/* interpose our access method */
 	sc->parent_access = gp->access;
 	gp->access = g_map_access;
 
 	for (i = 0; i < MAP_MAXSLICE; i++)
 		g_map_parse_part(mp, pp, cp, gp, sc, i);
 
 
 	g_access(cp, -1, 0, 0);
 	if (LIST_EMPTY(&gp->provider)) {
 		if (bootverbose)
 			printf("MAP: No valid partition found at %s\n", pp->name);
 		g_slice_spoiled(cp);
 		return (NULL);
 	}
 	return (gp);
 }
 
 static void
 g_map_config(struct gctl_req *req, struct g_class *mp, const char *verb)
 {
 	struct g_geom *gp;
 
 	g_topology_assert();
 	gp = gctl_get_geom(req, mp, "geom");
 	if (gp == NULL)
 		return;
 	gctl_error(req, "Unknown verb");
 }
 
 static struct g_class g_map_class = {
 	.name = MAP_CLASS_NAME,
 	.version = G_VERSION,
 	.taste = g_map_taste,
 	.dumpconf = g_map_dumpconf,
 	.ctlreq = g_map_config,
 };
 DECLARE_GEOM_CLASS(g_map_class, g_map);
Index: user/ae/inet6/sys/net/if.c
===================================================================
--- user/ae/inet6/sys/net/if.c	(revision 271452)
+++ user/ae/inet6/sys/net/if.c	(revision 271453)
@@ -1,4098 +1,4086 @@
 /*-
  * Copyright (c) 1980, 1986, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)if.c	8.5 (Berkeley) 1/9/95
  * $FreeBSD$
  */
 
 #include "opt_compat.h"
 #include "opt_inet6.h"
 #include "opt_inet.h"
 
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/conf.h>
 #include <sys/malloc.h>
 #include <sys/sbuf.h>
 #include <sys/bus.h>
 #include <sys/mbuf.h>
 #include <sys/systm.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/protosw.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/refcount.h>
 #include <sys/module.h>
 #include <sys/rwlock.h>
 #include <sys/sockio.h>
 #include <sys/syslog.h>
 #include <sys/sysctl.h>
 #include <sys/taskqueue.h>
 #include <sys/domain.h>
 #include <sys/jail.h>
 #include <sys/priv.h>
 
 #include <machine/stdarg.h>
 #include <vm/uma.h>
 
 #include <net/bpf.h>
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_arp.h>
 #include <net/if_clone.h>
 #include <net/if_dl.h>
 #include <net/if_types.h>
 #include <net/if_var.h>
 #include <net/if_media.h>
 #include <net/if_vlan_var.h>
 #include <net/radix.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #if defined(INET) || defined(INET6)
 #include <net/ethernet.h>
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_carp.h>
 #ifdef INET
 #include <netinet/if_ether.h>
 #endif /* INET */
 #ifdef INET6
 #include <netinet6/in6_var.h>
 #include <netinet6/in6_ifattach.h>
 #include <netinet6/scope6_var.h>
 #endif /* INET6 */
 #endif /* INET || INET6 */
 
 #include <security/mac/mac_framework.h>
 
 #ifdef COMPAT_FREEBSD32
 #include <sys/mount.h>
 #include <compat/freebsd32/freebsd32.h>
 #endif
 
 struct ifindex_entry {
 	struct  ifnet *ife_ifnet;
 };
 
 SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW, 0, "Link layers");
 SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW, 0, "Generic link-management");
 
 SYSCTL_INT(_net_link, OID_AUTO, ifqmaxlen, CTLFLAG_RDTUN,
     &ifqmaxlen, 0, "max send queue size");
 
 /* Log link state change events */
 static int log_link_state_change = 1;
 
 SYSCTL_INT(_net_link, OID_AUTO, log_link_state_change, CTLFLAG_RW,
 	&log_link_state_change, 0,
 	"log interface link state change events");
 
 /* Interface description */
 static unsigned int ifdescr_maxlen = 1024;
 SYSCTL_UINT(_net, OID_AUTO, ifdescr_maxlen, CTLFLAG_RW,
 	&ifdescr_maxlen, 0,
 	"administrative maximum length for interface description");
 
 static MALLOC_DEFINE(M_IFDESCR, "ifdescr", "ifnet descriptions");
 
 /* global sx for non-critical path ifdescr */
 static struct sx ifdescr_sx;
 SX_SYSINIT(ifdescr_sx, &ifdescr_sx, "ifnet descr");
 
 void	(*bridge_linkstate_p)(struct ifnet *ifp);
 void	(*ng_ether_link_state_p)(struct ifnet *ifp, int state);
 void	(*lagg_linkstate_p)(struct ifnet *ifp, int state);
 /* These are external hooks for CARP. */
 void	(*carp_linkstate_p)(struct ifnet *ifp);
 void	(*carp_demote_adj_p)(int, char *);
 int	(*carp_master_p)(struct ifaddr *);
 #if defined(INET) || defined(INET6)
 int	(*carp_forus_p)(struct ifnet *ifp, u_char *dhost);
 int	(*carp_output_p)(struct ifnet *ifp, struct mbuf *m,
     const struct sockaddr *sa);
 int	(*carp_ioctl_p)(struct ifreq *, u_long, struct thread *);   
 int	(*carp_attach_p)(struct ifaddr *, int);
 void	(*carp_detach_p)(struct ifaddr *);
 #endif
 #ifdef INET
 int	(*carp_iamatch_p)(struct ifaddr *, uint8_t **);
 #endif
 #ifdef INET6
 struct ifaddr *(*carp_iamatch6_p)(struct ifnet *ifp, struct in6_addr *taddr6);
 caddr_t	(*carp_macmatch6_p)(struct ifnet *ifp, struct mbuf *m,
     const struct in6_addr *taddr);
 #endif
 
 struct mbuf *(*tbr_dequeue_ptr)(struct ifaltq *, int) = NULL;
 
 /*
  * XXX: Style; these should be sorted alphabetically, and unprototyped
  * static functions should be prototyped. Currently they are sorted by
  * declaration order.
  */
 static void	if_attachdomain(void *);
 static void	if_attachdomain1(struct ifnet *);
 static int	ifconf(u_long, caddr_t);
 static void	if_freemulti(struct ifmultiaddr *);
 static void	if_init(void *);
 static void	if_grow(void);
 static void	if_route(struct ifnet *, int flag, int fam);
 static int	if_setflag(struct ifnet *, int, int, int *, int);
 static int	if_transmit(struct ifnet *ifp, struct mbuf *m);
 static void	if_unroute(struct ifnet *, int flag, int fam);
 static void	link_rtrequest(int, struct rtentry *, struct rt_addrinfo *);
 static int	if_rtdel(struct radix_node *, void *);
 static int	ifhwioctl(u_long, struct ifnet *, caddr_t, struct thread *);
 static int	if_delmulti_locked(struct ifnet *, struct ifmultiaddr *, int);
 static void	do_link_state_change(void *, int);
 static int	if_getgroup(struct ifgroupreq *, struct ifnet *);
 static int	if_getgroupmembers(struct ifgroupreq *);
 static void	if_delgroups(struct ifnet *);
 static void	if_attach_internal(struct ifnet *, int);
 static void	if_detach_internal(struct ifnet *, int);
 
 #ifdef INET6
 /*
  * XXX: declare here to avoid to include many inet6 related files..
  * should be more generalized?
  */
 extern void	nd6_setmtu(struct ifnet *);
 #endif
 
 VNET_DEFINE(int, if_index);
 int	ifqmaxlen = IFQ_MAXLEN;
 VNET_DEFINE(struct ifnethead, ifnet);	/* depend on static init XXX */
 VNET_DEFINE(struct ifgrouphead, ifg_head);
 
 static VNET_DEFINE(int, if_indexlim) = 8;
 
 /* Table of ifnet by index. */
 VNET_DEFINE(struct ifindex_entry *, ifindex_table);
 
 #define	V_if_indexlim		VNET(if_indexlim)
 #define	V_ifindex_table		VNET(ifindex_table)
 
 /*
  * The global network interface list (V_ifnet) and related state (such as
  * if_index, if_indexlim, and ifindex_table) are protected by an sxlock and
  * an rwlock.  Either may be acquired shared to stablize the list, but both
  * must be acquired writable to modify the list.  This model allows us to
  * both stablize the interface list during interrupt thread processing, but
  * also to stablize it over long-running ioctls, without introducing priority
  * inversions and deadlocks.
  */
 struct rwlock ifnet_rwlock;
 struct sx ifnet_sxlock;
 
 /*
  * The allocation of network interfaces is a rather non-atomic affair; we
  * need to select an index before we are ready to expose the interface for
  * use, so will use this pointer value to indicate reservation.
  */
 #define	IFNET_HOLD	(void *)(uintptr_t)(-1)
 
 static	if_com_alloc_t *if_com_alloc[256];
 static	if_com_free_t *if_com_free[256];
 
 static MALLOC_DEFINE(M_IFNET, "ifnet", "interface internals");
 MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address");
 MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address");
 
 struct ifnet *
 ifnet_byindex_locked(u_short idx)
 {
 
 	if (idx > V_if_index)
 		return (NULL);
 	if (V_ifindex_table[idx].ife_ifnet == IFNET_HOLD)
 		return (NULL);
 	return (V_ifindex_table[idx].ife_ifnet);
 }
 
 struct ifnet *
 ifnet_byindex(u_short idx)
 {
 	struct ifnet *ifp;
 
 	IFNET_RLOCK_NOSLEEP();
 	ifp = ifnet_byindex_locked(idx);
 	IFNET_RUNLOCK_NOSLEEP();
 	return (ifp);
 }
 
 struct ifnet *
 ifnet_byindex_ref(u_short idx)
 {
 	struct ifnet *ifp;
 
 	IFNET_RLOCK_NOSLEEP();
 	ifp = ifnet_byindex_locked(idx);
 	if (ifp == NULL || (ifp->if_flags & IFF_DYING)) {
 		IFNET_RUNLOCK_NOSLEEP();
 		return (NULL);
 	}
 	if_ref(ifp);
 	IFNET_RUNLOCK_NOSLEEP();
 	return (ifp);
 }
 
 /*
  * Allocate an ifindex array entry; return 0 on success or an error on
  * failure.
  */
 static int
 ifindex_alloc_locked(u_short *idxp)
 {
 	u_short idx;
 
 	IFNET_WLOCK_ASSERT();
 
 retry:
 	/*
 	 * Try to find an empty slot below V_if_index.  If we fail, take the
 	 * next slot.
 	 */
 	for (idx = 1; idx <= V_if_index; idx++) {
 		if (V_ifindex_table[idx].ife_ifnet == NULL)
 			break;
 	}
 
 	/* Catch if_index overflow. */
 	if (idx >= V_if_indexlim) {
 		if_grow();
 		goto retry;
 	}
 	if (idx > V_if_index)
 		V_if_index = idx;
 	*idxp = idx;
 	return (0);
 }
 
 static void
 ifindex_free_locked(u_short idx)
 {
 
 	IFNET_WLOCK_ASSERT();
 
 	V_ifindex_table[idx].ife_ifnet = NULL;
 	while (V_if_index > 0 &&
 	    V_ifindex_table[V_if_index].ife_ifnet == NULL)
 		V_if_index--;
 }
 
 static void
 ifindex_free(u_short idx)
 {
 
 	IFNET_WLOCK();
 	ifindex_free_locked(idx);
 	IFNET_WUNLOCK();
 }
 
 static void
 ifnet_setbyindex_locked(u_short idx, struct ifnet *ifp)
 {
 
 	IFNET_WLOCK_ASSERT();
 
 	V_ifindex_table[idx].ife_ifnet = ifp;
 }
 
 static void
 ifnet_setbyindex(u_short idx, struct ifnet *ifp)
 {
 
 	IFNET_WLOCK();
 	ifnet_setbyindex_locked(idx, ifp);
 	IFNET_WUNLOCK();
 }
 
 struct ifaddr *
 ifaddr_byindex(u_short idx)
 {
 	struct ifaddr *ifa;
 
 	IFNET_RLOCK_NOSLEEP();
 	ifa = ifnet_byindex_locked(idx)->if_addr;
 	if (ifa != NULL)
 		ifa_ref(ifa);
 	IFNET_RUNLOCK_NOSLEEP();
 	return (ifa);
 }
 
 /*
  * Network interface utility routines.
  *
  * Routines with ifa_ifwith* names take sockaddr *'s as
  * parameters.
  */
 
 static void
 vnet_if_init(const void *unused __unused)
 {
 
 	TAILQ_INIT(&V_ifnet);
 	TAILQ_INIT(&V_ifg_head);
 	IFNET_WLOCK();
 	if_grow();				/* create initial table */
 	IFNET_WUNLOCK();
 	vnet_if_clone_init();
 }
 VNET_SYSINIT(vnet_if_init, SI_SUB_INIT_IF, SI_ORDER_SECOND, vnet_if_init,
     NULL);
 
 /* ARGSUSED*/
 static void
 if_init(void *dummy __unused)
 {
 
 	IFNET_LOCK_INIT();
 	if_clone_init();
 }
 SYSINIT(interfaces, SI_SUB_INIT_IF, SI_ORDER_FIRST, if_init, NULL);
 
 
 #ifdef VIMAGE
 static void
 vnet_if_uninit(const void *unused __unused)
 {
 
 	VNET_ASSERT(TAILQ_EMPTY(&V_ifnet), ("%s:%d tailq &V_ifnet=%p "
 	    "not empty", __func__, __LINE__, &V_ifnet));
 	VNET_ASSERT(TAILQ_EMPTY(&V_ifg_head), ("%s:%d tailq &V_ifg_head=%p "
 	    "not empty", __func__, __LINE__, &V_ifg_head));
 
 	free((caddr_t)V_ifindex_table, M_IFNET);
 }
 VNET_SYSUNINIT(vnet_if_uninit, SI_SUB_INIT_IF, SI_ORDER_FIRST,
     vnet_if_uninit, NULL);
 #endif
 
 static void
 if_grow(void)
 {
 	int oldlim;
 	u_int n;
 	struct ifindex_entry *e;
 
 	IFNET_WLOCK_ASSERT();
 	oldlim = V_if_indexlim;
 	IFNET_WUNLOCK();
 	n = (oldlim << 1) * sizeof(*e);
 	e = malloc(n, M_IFNET, M_WAITOK | M_ZERO);
 	IFNET_WLOCK();
 	if (V_if_indexlim != oldlim) {
 		free(e, M_IFNET);
 		return;
 	}
 	if (V_ifindex_table != NULL) {
 		memcpy((caddr_t)e, (caddr_t)V_ifindex_table, n/2);
 		free((caddr_t)V_ifindex_table, M_IFNET);
 	}
 	V_if_indexlim <<= 1;
 	V_ifindex_table = e;
 }
 
 /*
  * Allocate a struct ifnet and an index for an interface.  A layer 2
  * common structure will also be allocated if an allocation routine is
  * registered for the passed type.
  */
 struct ifnet *
 if_alloc(u_char type)
 {
 	struct ifnet *ifp;
 	u_short idx;
 
 	ifp = malloc(sizeof(struct ifnet), M_IFNET, M_WAITOK|M_ZERO);
 	IFNET_WLOCK();
 	if (ifindex_alloc_locked(&idx) != 0) {
 		IFNET_WUNLOCK();
 		free(ifp, M_IFNET);
 		return (NULL);
 	}
 	ifnet_setbyindex_locked(idx, IFNET_HOLD);
 	IFNET_WUNLOCK();
 	ifp->if_index = idx;
 	ifp->if_type = type;
 	ifp->if_alloctype = type;
 	if (if_com_alloc[type] != NULL) {
 		ifp->if_l2com = if_com_alloc[type](type, ifp);
 		if (ifp->if_l2com == NULL) {
 			free(ifp, M_IFNET);
 			ifindex_free(idx);
 			return (NULL);
 		}
 	}
 
 	IF_ADDR_LOCK_INIT(ifp);
 	TASK_INIT(&ifp->if_linktask, 0, do_link_state_change, ifp);
 	ifp->if_afdata_initialized = 0;
 	IF_AFDATA_LOCK_INIT(ifp);
 	TAILQ_INIT(&ifp->if_addrhead);
 	TAILQ_INIT(&ifp->if_multiaddrs);
 	TAILQ_INIT(&ifp->if_groups);
 #ifdef MAC
 	mac_ifnet_init(ifp);
 #endif
 	ifq_init(&ifp->if_snd, ifp);
 
 	refcount_init(&ifp->if_refcount, 1);	/* Index reference. */
 	ifnet_setbyindex(ifp->if_index, ifp);
 	return (ifp);
 }
 
 /*
  * Do the actual work of freeing a struct ifnet, and layer 2 common
  * structure.  This call is made when the last reference to an
  * interface is released.
  */
 static void
 if_free_internal(struct ifnet *ifp)
 {
 
 	KASSERT((ifp->if_flags & IFF_DYING),
 	    ("if_free_internal: interface not dying"));
 
 	if (if_com_free[ifp->if_alloctype] != NULL)
 		if_com_free[ifp->if_alloctype](ifp->if_l2com,
 		    ifp->if_alloctype);
 
 #ifdef MAC
 	mac_ifnet_destroy(ifp);
 #endif /* MAC */
 	if (ifp->if_description != NULL)
 		free(ifp->if_description, M_IFDESCR);
 	IF_AFDATA_DESTROY(ifp);
 	IF_ADDR_LOCK_DESTROY(ifp);
 	ifq_delete(&ifp->if_snd);
 	free(ifp, M_IFNET);
 }
 
 /*
  * Deregister an interface and free the associated storage.
  */
 void
 if_free(struct ifnet *ifp)
 {
 
 	ifp->if_flags |= IFF_DYING;			/* XXX: Locking */
 
 	CURVNET_SET_QUIET(ifp->if_vnet);
 	IFNET_WLOCK();
 	KASSERT(ifp == ifnet_byindex_locked(ifp->if_index),
 	    ("%s: freeing unallocated ifnet", ifp->if_xname));
 
 	ifindex_free_locked(ifp->if_index);
 	IFNET_WUNLOCK();
 
 	if (refcount_release(&ifp->if_refcount))
 		if_free_internal(ifp);
 	CURVNET_RESTORE();
 }
 
 /*
  * Interfaces to keep an ifnet type-stable despite the possibility of the
  * driver calling if_free().  If there are additional references, we defer
  * freeing the underlying data structure.
  */
 void
 if_ref(struct ifnet *ifp)
 {
 
 	/* We don't assert the ifnet list lock here, but arguably should. */
 	refcount_acquire(&ifp->if_refcount);
 }
 
 void
 if_rele(struct ifnet *ifp)
 {
 
 	if (!refcount_release(&ifp->if_refcount))
 		return;
 	if_free_internal(ifp);
 }
 
 void
 ifq_init(struct ifaltq *ifq, struct ifnet *ifp)
 {
 	
 	mtx_init(&ifq->ifq_mtx, ifp->if_xname, "if send queue", MTX_DEF);
 
 	if (ifq->ifq_maxlen == 0) 
 		ifq->ifq_maxlen = ifqmaxlen;
 
 	ifq->altq_type = 0;
 	ifq->altq_disc = NULL;
 	ifq->altq_flags &= ALTQF_CANTCHANGE;
 	ifq->altq_tbr  = NULL;
 	ifq->altq_ifp  = ifp;
 }
 
 void
 ifq_delete(struct ifaltq *ifq)
 {
 	mtx_destroy(&ifq->ifq_mtx);
 }
 
 /*
  * Perform generic interface initalization tasks and attach the interface
  * to the list of "active" interfaces.  If vmove flag is set on entry
  * to if_attach_internal(), perform only a limited subset of initialization
  * tasks, given that we are moving from one vnet to another an ifnet which
  * has already been fully initialized.
  *
  * XXX:
  *  - The decision to return void and thus require this function to
  *    succeed is questionable.
  *  - We should probably do more sanity checking.  For instance we don't
  *    do anything to insure if_xname is unique or non-empty.
  */
 void
 if_attach(struct ifnet *ifp)
 {
 
 	if_attach_internal(ifp, 0);
 }
 
 static void
 if_attach_internal(struct ifnet *ifp, int vmove)
 {
 	unsigned socksize, ifasize;
 	int namelen, masklen;
 	struct sockaddr_dl *sdl;
 	struct ifaddr *ifa;
 
 	if (ifp->if_index == 0 || ifp != ifnet_byindex(ifp->if_index))
 		panic ("%s: BUG: if_attach called without if_alloc'd input()\n",
 		    ifp->if_xname);
 
 #ifdef VIMAGE
 	ifp->if_vnet = curvnet;
 	if (ifp->if_home_vnet == NULL)
 		ifp->if_home_vnet = curvnet;
 #endif
 
 	if_addgroup(ifp, IFG_ALL);
 
 	getmicrotime(&ifp->if_lastchange);
 	ifp->if_epoch = time_uptime;
 
 	KASSERT((ifp->if_transmit == NULL && ifp->if_qflush == NULL) ||
 	    (ifp->if_transmit != NULL && ifp->if_qflush != NULL),
 	    ("transmit and qflush must both either be set or both be NULL"));
 	if (ifp->if_transmit == NULL) {
 		ifp->if_transmit = if_transmit;
 		ifp->if_qflush = if_qflush;
 	}
 
 	if (ifp->if_get_counter == NULL)
 		ifp->if_get_counter = if_get_counter_compat;
 
 	if (!vmove) {
 #ifdef MAC
 		mac_ifnet_create(ifp);
 #endif
 
 		/*
 		 * Create a Link Level name for this device.
 		 */
 		namelen = strlen(ifp->if_xname);
 		/*
 		 * Always save enough space for any possiable name so we
 		 * can do a rename in place later.
 		 */
 		masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + IFNAMSIZ;
 		socksize = masklen + ifp->if_addrlen;
 		if (socksize < sizeof(*sdl))
 			socksize = sizeof(*sdl);
 		socksize = roundup2(socksize, sizeof(long));
 		ifasize = sizeof(*ifa) + 2 * socksize;
 		ifa = ifa_alloc(ifasize, M_WAITOK);
 		sdl = (struct sockaddr_dl *)(ifa + 1);
 		sdl->sdl_len = socksize;
 		sdl->sdl_family = AF_LINK;
 		bcopy(ifp->if_xname, sdl->sdl_data, namelen);
 		sdl->sdl_nlen = namelen;
 		sdl->sdl_index = ifp->if_index;
 		sdl->sdl_type = ifp->if_type;
 		ifp->if_addr = ifa;
 		ifa->ifa_ifp = ifp;
 		ifa->ifa_rtrequest = link_rtrequest;
 		ifa->ifa_addr = (struct sockaddr *)sdl;
 		sdl = (struct sockaddr_dl *)(socksize + (caddr_t)sdl);
 		ifa->ifa_netmask = (struct sockaddr *)sdl;
 		sdl->sdl_len = masklen;
 		while (namelen != 0)
 			sdl->sdl_data[--namelen] = 0xff;
 		TAILQ_INSERT_HEAD(&ifp->if_addrhead, ifa, ifa_link);
 		/* Reliably crash if used uninitialized. */
 		ifp->if_broadcastaddr = NULL;
 
 #if defined(INET) || defined(INET6)
 		/* Initialize to max value. */
 		if (ifp->if_hw_tsomax == 0)
 			ifp->if_hw_tsomax = min(IP_MAXPACKET, 32 * MCLBYTES -
 			    (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN));
 		KASSERT(ifp->if_hw_tsomax <= IP_MAXPACKET &&
 		    ifp->if_hw_tsomax >= IP_MAXPACKET / 8,
 		    ("%s: tsomax outside of range", __func__));
 #endif
 	}
 #ifdef VIMAGE
 	else {
 		/*
 		 * Update the interface index in the link layer address
 		 * of the interface.
 		 */
 		for (ifa = ifp->if_addr; ifa != NULL;
 		    ifa = TAILQ_NEXT(ifa, ifa_link)) {
 			if (ifa->ifa_addr->sa_family == AF_LINK) {
 				sdl = (struct sockaddr_dl *)ifa->ifa_addr;
 				sdl->sdl_index = ifp->if_index;
 			}
 		}
 	}
 #endif
 
 	IFNET_WLOCK();
 	TAILQ_INSERT_TAIL(&V_ifnet, ifp, if_link);
 #ifdef VIMAGE
 	curvnet->vnet_ifcnt++;
 #endif
 	IFNET_WUNLOCK();
 
 	if (domain_init_status >= 2)
 		if_attachdomain1(ifp);
 
 	EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp);
 	if (IS_DEFAULT_VNET(curvnet))
 		devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL);
 
 	/* Announce the interface. */
 	rt_ifannouncemsg(ifp, IFAN_ARRIVAL);
 }
 
 static void
 if_attachdomain(void *dummy)
 {
 	struct ifnet *ifp;
 
 	TAILQ_FOREACH(ifp, &V_ifnet, if_link)
 		if_attachdomain1(ifp);
 }
 SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_SECOND,
     if_attachdomain, NULL);
 
 static void
 if_attachdomain1(struct ifnet *ifp)
 {
 	struct domain *dp;
 
 	/*
 	 * Since dp->dom_ifattach calls malloc() with M_WAITOK, we
 	 * cannot lock ifp->if_afdata initialization, entirely.
 	 */
 	if (IF_AFDATA_TRYLOCK(ifp) == 0)
 		return;
 	if (ifp->if_afdata_initialized >= domain_init_status) {
 		IF_AFDATA_UNLOCK(ifp);
 		log(LOG_WARNING, "%s called more than once on %s\n",
 		    __func__, ifp->if_xname);
 		return;
 	}
 	ifp->if_afdata_initialized = domain_init_status;
 	IF_AFDATA_UNLOCK(ifp);
 
 	/* address family dependent data region */
 	bzero(ifp->if_afdata, sizeof(ifp->if_afdata));
 	for (dp = domains; dp; dp = dp->dom_next) {
 		if (dp->dom_ifattach)
 			ifp->if_afdata[dp->dom_family] =
 			    (*dp->dom_ifattach)(ifp);
 	}
 }
 
 /*
  * Remove any unicast or broadcast network addresses from an interface.
  */
 void
 if_purgeaddrs(struct ifnet *ifp)
 {
 	struct ifaddr *ifa, *next;
 
 	/* _SAFE iteration: entries are unlinked while walking the list. */
 	TAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, next) {
 		/* Link-level addresses are left on the list. */
 		if (ifa->ifa_addr->sa_family == AF_LINK)
 			continue;
 #ifdef INET
 		/* XXX: Ugly!! ad hoc just for INET */
 		if (ifa->ifa_addr->sa_family == AF_INET) {
 			struct ifaliasreq ifr;
 
 			/* Build a SIOCDIFADDR request describing this address. */
 			bzero(&ifr, sizeof(ifr));
 			ifr.ifra_addr = *ifa->ifa_addr;
 			if (ifa->ifa_dstaddr)
 				ifr.ifra_broadaddr = *ifa->ifa_dstaddr;
 			/*
 			 * On success move on to the next address; on failure
 			 * fall through to the generic unlink-and-free below.
 			 */
 			if (in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp,
 			    NULL) == 0)
 				continue;
 		}
 #endif /* INET */
 #ifdef INET6
 		if (ifa->ifa_addr->sa_family == AF_INET6) {
 			in6_purgeaddr(ifa);
 			/* ifp_addrhead is already updated */
 			continue;
 		}
 #endif /* INET6 */
 		/* Generic path: unlink the address and drop the list's reference. */
 		TAILQ_REMOVE(&ifp->if_addrhead, ifa, ifa_link);
 		ifa_free(ifa);
 	}
 }
 
 /*
  * Remove any multicast network addresses from an interface when an ifnet
  * is going away.
  */
 static void
 if_purgemaddrs(struct ifnet *ifp)
 {
 	struct ifmultiaddr *ifma;
 	struct ifmultiaddr *next;
 
 	IF_ADDR_WLOCK(ifp);
 	TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next)
 		if_delmulti_locked(ifp, ifma, 1);
 	IF_ADDR_WUNLOCK(ifp);
 }
 
 /*
  * Detach an interface, removing it from the list of "active" interfaces.
  * If vmove flag is set on entry to if_detach_internal(), perform only a
  * limited subset of cleanup tasks, given that we are moving an ifnet from
  * one vnet to another, where it must be fully operational.
  *
  * XXXRW: There are some significant questions about event ordering, and
  * how to prevent things from starting to use the interface during detach.
  */
 void
 if_detach(struct ifnet *ifp)
 {
 
 	/* Run the full (vmove == 0) detach in the interface's own vnet. */
 	CURVNET_SET_QUIET(ifp->if_vnet);
 	if_detach_internal(ifp, 0);
 	CURVNET_RESTORE();
 }
 
 /*
  * Worker for if_detach() and if_vmove().  Unlinks ifp from V_ifnet, tears
  * down addresses, routes, groups and per-domain data.  When vmove is
  * non-zero the interface is not marked dead and its link-level ifaddr is
  * kept, and a missing V_ifnet entry is treated as fatal.
  */
 static void
 if_detach_internal(struct ifnet *ifp, int vmove)
 {
 	struct ifaddr *ifa;
 	struct radix_node_head	*rnh;
 	int i, j;
 	struct domain *dp;
  	struct ifnet *iter;
  	int found = 0;
 
 	/* Unlink ifp from the global list only if it is actually on it. */
 	IFNET_WLOCK();
 	TAILQ_FOREACH(iter, &V_ifnet, if_link)
 		if (iter == ifp) {
 			TAILQ_REMOVE(&V_ifnet, ifp, if_link);
 			found = 1;
 			break;
 		}
 #ifdef VIMAGE
 	if (found)
 		curvnet->vnet_ifcnt--;
 #endif
 	IFNET_WUNLOCK();
 	if (!found) {
 		if (vmove)
 			panic("%s: ifp=%p not on the ifnet tailq %p",
 			    __func__, ifp, &V_ifnet);
 		else
 			return; /* XXX this should panic as well? */
 	}
 
 	/*
 	 * Remove/wait for pending events.
 	 */
 	taskqueue_drain(taskqueue_swi, &ifp->if_linktask);
 
 	/*
 	 * Remove routes and flush queues.
 	 */
 	if_down(ifp);
 #ifdef ALTQ
 	if (ALTQ_IS_ENABLED(&ifp->if_snd))
 		altq_disable(&ifp->if_snd);
 	if (ALTQ_IS_ATTACHED(&ifp->if_snd))
 		altq_detach(&ifp->if_snd);
 #endif
 
 	if_purgeaddrs(ifp);
 
 #ifdef INET
 	in_ifdetach(ifp);
 #endif
 
 #ifdef INET6
 	/*
 	 * Remove all IPv6 kernel structs related to ifp.  This should be done
 	 * before removing routing entries below, since IPv6 interface direct
 	 * routes are expected to be removed by the IPv6-specific kernel API.
 	 * Otherwise, the kernel will detect some inconsistency and bark it.
 	 */
 	in6_ifdetach(ifp);
 #endif
 	if_purgemaddrs(ifp);
 
 	/* Announce that the interface is gone. */
 	rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
 	EVENTHANDLER_INVOKE(ifnet_departure_event, ifp);
 	if (IS_DEFAULT_VNET(curvnet))
 		devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL);
 
 	if (!vmove) {
 		/*
 		 * Prevent further calls into the device driver via ifnet.
 		 */
 		if_dead(ifp);
 
 		/*
 		 * Remove link ifaddr pointer and maybe decrement if_index.
 		 * Clean up all addresses.
 		 */
 		ifp->if_addr = NULL;
 
 		/* We can now free link ifaddr. */
 		if (!TAILQ_EMPTY(&ifp->if_addrhead)) {
 			ifa = TAILQ_FIRST(&ifp->if_addrhead);
 			TAILQ_REMOVE(&ifp->if_addrhead, ifa, ifa_link);
 			ifa_free(ifa);
 		}
 	}
 
 	/*
 	 * Delete all remaining routes using this interface
 	 * Unfortunately the only way to do this is to slog through
 	 * the entire routing table looking for routes which point
 	 * to this interface...oh well...
 	 */
 	for (i = 1; i <= AF_MAX; i++) {
 		for (j = 0; j < rt_numfibs; j++) {
 			rnh = rt_tables_get_rnh(j, i);
 			if (rnh == NULL)
 				continue;
 			RADIX_NODE_HEAD_LOCK(rnh);
 			(void) rnh->rnh_walktree(rnh, if_rtdel, ifp);
 			RADIX_NODE_HEAD_UNLOCK(rnh);
 		}
 	}
 
 	if_delgroups(ifp);
 
 	/*
 	 * We cannot hold the lock over dom_ifdetach calls as they might
 	 * sleep, for example trying to drain a callout, thus open up the
 	 * theoretical race with re-attaching.
 	 */
 	IF_AFDATA_LOCK(ifp);
 	i = ifp->if_afdata_initialized;
 	ifp->if_afdata_initialized = 0;
 	IF_AFDATA_UNLOCK(ifp);
 	/* i > 0 means afdata had been initialized; otherwise skip the loop. */
 	for (dp = domains; i > 0 && dp; dp = dp->dom_next) {
 		if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family])
 			(*dp->dom_ifdetach)(ifp,
 			    ifp->if_afdata[dp->dom_family]);
 	}
 }
 
 #ifdef VIMAGE
 /*
  * if_vmove() performs a limited version of if_detach() in current
  * vnet and if_attach()es the ifnet to the vnet specified as 2nd arg.
  * An attempt is made to shrink if_index in current vnet, find an
  * unused if_index in target vnet and calls if_grow() if necessary,
  * and finally find an unused if_xname for the target vnet.
  */
 void
 if_vmove(struct ifnet *ifp, struct vnet *new_vnet)
 {
 	u_short new_idx;
 
 	/*
 	 * Limited detach (vmove == 1) from the current vnet: LLADDR info
 	 * is preserved and the ifnet is not marked dead, so it can be
 	 * attached again below.
 	 */
 	if_detach_internal(ifp, 1);
 
 	/*
 	 * Give the old if_index back to the current vnet's
 	 * ifindex_table[], shrinking if_index there when possible.
 	 *
 	 * NOTE: IFNET_WLOCK/IFNET_WUNLOCK() are assumed to be unvirtualized,
 	 * or we'd lock on one vnet and unlock on another.
 	 */
 	IFNET_WLOCK();
 	ifindex_free_locked(ifp->if_index);
 	IFNET_WUNLOCK();
 
 	/* Let the driver do its own reassignment work, if it has any. */
 	if (ifp->if_reassign != NULL) {
 		ifp->if_reassign(ifp, new_vnet, NULL);
 	}
 
 	/* Everything from here on runs in the target vnet's context. */
 	CURVNET_SET_QUIET(new_vnet);
 
 	/* Pick a free index in the target vnet and register ifp under it. */
 	IFNET_WLOCK();
 	if (ifindex_alloc_locked(&new_idx) != 0) {
 		IFNET_WUNLOCK();
 		panic("if_index overflow");
 	}
 	ifp->if_index = new_idx;
 	ifnet_setbyindex_locked(ifp->if_index, ifp);
 	IFNET_WUNLOCK();
 
 	if_attach_internal(ifp, 1);
 
 	CURVNET_RESTORE();
 }
 
 /*
  * Move an ifnet to or from another child prison/vnet, specified by the jail id.
  */
 /*
  * Returns 0 on success, ENXIO when no child prison matches jid, and EEXIST
  * when the target vnet is the interface's current one or already has an
  * interface named ifname.  On success ifname is overwritten with the
  * interface's if_xname.
  */
 static int
 if_vmove_loan(struct thread *td, struct ifnet *ifp, char *ifname, int jid)
 {
 	struct prison *pr;
 	struct ifnet *difp;
 
 	/* Try to find the prison within our visibility. */
 	sx_slock(&allprison_lock);
 	pr = prison_find_child(td->td_ucred->cr_prison, jid);
 	sx_sunlock(&allprison_lock);
 	if (pr == NULL)
 		return (ENXIO);
 	/*
 	 * Take a reference so pr stays valid, then drop pr_mtx.
 	 * NOTE(review): pr_mtx is evidently held on return from
 	 * prison_find_child() -- confirm against its definition.
 	 */
 	prison_hold_locked(pr);
 	mtx_unlock(&pr->pr_mtx);
 
 	/* Do not try to move the iface from and to the same prison. */
 	if (pr->pr_vnet == ifp->if_vnet) {
 		prison_free(pr);
 		return (EEXIST);
 	}
 
 	/* Make sure the named iface does not exist in the dst. prison/vnet. */
 	/* XXX Lock interfaces to avoid races. */
 	CURVNET_SET_QUIET(pr->pr_vnet);
 	difp = ifunit(ifname);
 	CURVNET_RESTORE();
 	if (difp != NULL) {
 		prison_free(pr);
 		return (EEXIST);
 	}
 
 	/* Move the interface into the child jail/vnet. */
 	if_vmove(ifp, pr->pr_vnet);
 
 	/* Report the new if_xname back to the userland. */
 	sprintf(ifname, "%s", ifp->if_xname);
 
 	prison_free(pr);
 	return (0);
 }
 
 /*
  * Move the interface named ifname from the child prison identified by jid
  * back into the calling thread's vnet.  Returns 0 on success, ENXIO when
  * the prison or the interface cannot be found, and EEXIST when the
  * interface already lives in the caller's vnet.  On success ifname is
  * overwritten with the interface's if_xname.
  */
 static int
 if_vmove_reclaim(struct thread *td, char *ifname, int jid)
 {
 	struct prison *pr;
 	struct vnet *vnet_dst;
 	struct ifnet *ifp;
 
 	/* Try to find the prison within our visibility. */
 	sx_slock(&allprison_lock);
 	pr = prison_find_child(td->td_ucred->cr_prison, jid);
 	sx_sunlock(&allprison_lock);
 	if (pr == NULL)
 		return (ENXIO);
 	/* Hold a reference on pr before releasing its mutex. */
 	prison_hold_locked(pr);
 	mtx_unlock(&pr->pr_mtx);
 
 	/* Make sure the named iface exists in the source prison/vnet. */
 	CURVNET_SET(pr->pr_vnet);
 	ifp = ifunit(ifname);		/* XXX Lock to avoid races. */
 	if (ifp == NULL) {
 		CURVNET_RESTORE();
 		prison_free(pr);
 		return (ENXIO);
 	}
 
 	/* Do not try to move the iface from and to the same prison. */
 	vnet_dst = TD_TO_VNET(td);
 	if (vnet_dst == ifp->if_vnet) {
 		CURVNET_RESTORE();
 		prison_free(pr);
 		return (EEXIST);
 	}
 
 	/* Get interface back from child jail/vnet. */
 	if_vmove(ifp, vnet_dst);
 	CURVNET_RESTORE();
 
 	/* Report the new if_xname back to the userland. */
 	sprintf(ifname, "%s", ifp->if_xname);
 
 	prison_free(pr);
 	return (0);
 }
 #endif /* VIMAGE */
 
 /*
  * Add a group to an interface
  */
 int
 if_addgroup(struct ifnet *ifp, const char *groupname)
 {
 	struct ifg_list		*ifgl;
 	struct ifg_group	*ifg = NULL;
 	struct ifg_member	*ifgm;
 	int 			 new = 0;
 
 	if (groupname[0] && groupname[strlen(groupname) - 1] >= '0' &&
 	    groupname[strlen(groupname) - 1] <= '9')
 		return (EINVAL);
 
 	IFNET_WLOCK();
 	TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
 		if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) {
 			IFNET_WUNLOCK();
 			return (EEXIST);
 		}
 
 	if ((ifgl = (struct ifg_list *)malloc(sizeof(struct ifg_list), M_TEMP,
 	    M_NOWAIT)) == NULL) {
 	    	IFNET_WUNLOCK();
 		return (ENOMEM);
 	}
 
 	if ((ifgm = (struct ifg_member *)malloc(sizeof(struct ifg_member),
 	    M_TEMP, M_NOWAIT)) == NULL) {
 		free(ifgl, M_TEMP);
 		IFNET_WUNLOCK();
 		return (ENOMEM);
 	}
 
 	TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next)
 		if (!strcmp(ifg->ifg_group, groupname))
 			break;
 
 	if (ifg == NULL) {
 		if ((ifg = (struct ifg_group *)malloc(sizeof(struct ifg_group),
 		    M_TEMP, M_NOWAIT)) == NULL) {
 			free(ifgl, M_TEMP);
 			free(ifgm, M_TEMP);
 			IFNET_WUNLOCK();
 			return (ENOMEM);
 		}
 		strlcpy(ifg->ifg_group, groupname, sizeof(ifg->ifg_group));
 		ifg->ifg_refcnt = 0;
 		TAILQ_INIT(&ifg->ifg_members);
 		TAILQ_INSERT_TAIL(&V_ifg_head, ifg, ifg_next);
 		new = 1;
 	}
 
 	ifg->ifg_refcnt++;
 	ifgl->ifgl_group = ifg;
 	ifgm->ifgm_ifp = ifp;
 
 	IF_ADDR_WLOCK(ifp);
 	TAILQ_INSERT_TAIL(&ifg->ifg_members, ifgm, ifgm_next);
 	TAILQ_INSERT_TAIL(&ifp->if_groups, ifgl, ifgl_next);
 	IF_ADDR_WUNLOCK(ifp);
 
 	IFNET_WUNLOCK();
 
 	if (new)
 		EVENTHANDLER_INVOKE(group_attach_event, ifg);
 	EVENTHANDLER_INVOKE(group_change_event, groupname);
 
 	return (0);
 }
 
 /*
  * Remove a group from an interface
  */
 int
 if_delgroup(struct ifnet *ifp, const char *groupname)
 {
 	struct ifg_list		*ifgl;
 	struct ifg_member	*ifgm;
 
 	IFNET_WLOCK();
 	TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
 		if (!strcmp(ifgl->ifgl_group->ifg_group, groupname))
 			break;
 	if (ifgl == NULL) {
 		IFNET_WUNLOCK();
 		return (ENOENT);
 	}
 
 	IF_ADDR_WLOCK(ifp);
 	TAILQ_REMOVE(&ifp->if_groups, ifgl, ifgl_next);
 	IF_ADDR_WUNLOCK(ifp);
 
 	TAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next)
 		if (ifgm->ifgm_ifp == ifp)
 			break;
 
 	if (ifgm != NULL) {
 		TAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifgm_next);
 		free(ifgm, M_TEMP);
 	}
 
 	if (--ifgl->ifgl_group->ifg_refcnt == 0) {
 		TAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_next);
 		IFNET_WUNLOCK();
 		EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group);
 		free(ifgl->ifgl_group, M_TEMP);
 	} else
 		IFNET_WUNLOCK();
 
 	free(ifgl, M_TEMP);
 
 	EVENTHANDLER_INVOKE(group_change_event, groupname);
 
 	return (0);
 }
 
 /*
  * Remove an interface from all groups
  */
 static void
 if_delgroups(struct ifnet *ifp)
 {
 	struct ifg_list		*ifgl;
 	struct ifg_member	*ifgm;
 	char groupname[IFNAMSIZ];
 
 	IFNET_WLOCK();
 	while (!TAILQ_EMPTY(&ifp->if_groups)) {
 		ifgl = TAILQ_FIRST(&ifp->if_groups);
 
 		strlcpy(groupname, ifgl->ifgl_group->ifg_group, IFNAMSIZ);
 
 		IF_ADDR_WLOCK(ifp);
 		TAILQ_REMOVE(&ifp->if_groups, ifgl, ifgl_next);
 		IF_ADDR_WUNLOCK(ifp);
 
 		TAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next)
 			if (ifgm->ifgm_ifp == ifp)
 				break;
 
 		if (ifgm != NULL) {
 			TAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm,
 			    ifgm_next);
 			free(ifgm, M_TEMP);
 		}
 
 		if (--ifgl->ifgl_group->ifg_refcnt == 0) {
 			TAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_next);
 			IFNET_WUNLOCK();
 			EVENTHANDLER_INVOKE(group_detach_event,
 			    ifgl->ifgl_group);
 			free(ifgl->ifgl_group, M_TEMP);
 		} else
 			IFNET_WUNLOCK();
 
 		free(ifgl, M_TEMP);
 
 		EVENTHANDLER_INVOKE(group_change_event, groupname);
 
 		IFNET_WLOCK();
 	}
 	IFNET_WUNLOCK();
 }
 
 /*
  * Stores all groups from an interface in memory pointed
  * to by data
  */
 static int
 if_getgroup(struct ifgroupreq *data, struct ifnet *ifp)
 {
 	int			 len, error;
 	struct ifg_list		*ifgl;
 	struct ifg_req		 ifgrq, *ifgp;
 	struct ifgroupreq	*ifgr = data;
 
 	if (ifgr->ifgr_len == 0) {
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
 			ifgr->ifgr_len += sizeof(struct ifg_req);
 		IF_ADDR_RUNLOCK(ifp);
 		return (0);
 	}
 
 	len = ifgr->ifgr_len;
 	ifgp = ifgr->ifgr_groups;
 	/* XXX: wire */
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) {
 		if (len < sizeof(ifgrq)) {
 			IF_ADDR_RUNLOCK(ifp);
 			return (EINVAL);
 		}
 		bzero(&ifgrq, sizeof ifgrq);
 		strlcpy(ifgrq.ifgrq_group, ifgl->ifgl_group->ifg_group,
 		    sizeof(ifgrq.ifgrq_group));
 		if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) {
 		    	IF_ADDR_RUNLOCK(ifp);
 			return (error);
 		}
 		len -= sizeof(ifgrq);
 		ifgp++;
 	}
 	IF_ADDR_RUNLOCK(ifp);
 
 	return (0);
 }
 
 /*
  * Stores all members of a group in memory pointed to by data
  */
 static int
 if_getgroupmembers(struct ifgroupreq *data)
 {
 	struct ifgroupreq	*ifgr = data;
 	struct ifg_group	*ifg;
 	struct ifg_member	*ifgm;
 	struct ifg_req		 ifgrq, *ifgp;
 	int			 len, error;
 
 	IFNET_RLOCK();
 	TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next)
 		if (!strcmp(ifg->ifg_group, ifgr->ifgr_name))
 			break;
 	if (ifg == NULL) {
 		IFNET_RUNLOCK();
 		return (ENOENT);
 	}
 
 	if (ifgr->ifgr_len == 0) {
 		TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next)
 			ifgr->ifgr_len += sizeof(ifgrq);
 		IFNET_RUNLOCK();
 		return (0);
 	}
 
 	len = ifgr->ifgr_len;
 	ifgp = ifgr->ifgr_groups;
 	TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) {
 		if (len < sizeof(ifgrq)) {
 			IFNET_RUNLOCK();
 			return (EINVAL);
 		}
 		bzero(&ifgrq, sizeof ifgrq);
 		strlcpy(ifgrq.ifgrq_member, ifgm->ifgm_ifp->if_xname,
 		    sizeof(ifgrq.ifgrq_member));
 		if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) {
 			IFNET_RUNLOCK();
 			return (error);
 		}
 		len -= sizeof(ifgrq);
 		ifgp++;
 	}
 	IFNET_RUNLOCK();
 
 	return (0);
 }
 
 /*
  * Delete Routes for a Network Interface
  *
  * Called for each routing entry via the rnh->rnh_walktree() call above
  * to delete all route entries referencing a detaching network interface.
  *
  * Arguments:
  *	rn	pointer to node in the routing table
  *	arg	argument passed to rnh->rnh_walktree() - detaching interface
  *
  * Returns:
  *	0	successful
  *	errno	failed - reason indicated
  *
  */
 static int
 if_rtdel(struct radix_node *rn, void *arg)
 {
 	struct rtentry	*rt = (struct rtentry *)rn;
 	struct ifnet	*ifp = arg;
 	int		err;
 
 	if (rt->rt_ifp == ifp) {
 
 		/*
 		 * Protect (sorta) against walktree recursion problems
 		 * with cloned routes
 		 */
 		if ((rt->rt_flags & RTF_UP) == 0)
 			return (0);
 
 		err = rtrequest_fib(RTM_DELETE, rt_key(rt), rt->rt_gateway,
 				rt_mask(rt),
 				rt->rt_flags|RTF_RNH_LOCKED|RTF_PINNED,
 				(struct rtentry **) NULL, rt->rt_fibnum);
 		if (err) {
 			log(LOG_WARNING, "if_rtdel: error %d\n", err);
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Return counter values from old racy non-pcpu counters.
  */
 uint64_t
 if_get_counter_compat(struct ifnet *ifp, ifnet_counter cnt)
 {
 
 	/* Map each counter id onto the matching legacy ifnet field. */
 	switch (cnt) {
 	case IFCOUNTER_IPACKETS:
 		return (ifp->if_ipackets);
 	case IFCOUNTER_IERRORS:
 		return (ifp->if_ierrors);
 	case IFCOUNTER_OPACKETS:
 		return (ifp->if_opackets);
 	case IFCOUNTER_OERRORS:
 		return (ifp->if_oerrors);
 	case IFCOUNTER_COLLISIONS:
 		return (ifp->if_collisions);
 	case IFCOUNTER_IBYTES:
 		return (ifp->if_ibytes);
 	case IFCOUNTER_OBYTES:
 		return (ifp->if_obytes);
 	case IFCOUNTER_IMCASTS:
 		return (ifp->if_imcasts);
 	case IFCOUNTER_OMCASTS:
 		return (ifp->if_omcasts);
 	case IFCOUNTER_IQDROPS:
 		return (ifp->if_iqdrops);
 	case IFCOUNTER_OQDROPS:
 		return (ifp->if_oqdrops);
 	case IFCOUNTER_NOPROTO:
 		return (ifp->if_noproto);
 	}
 
 	/* Any id not handled above is a programming error. */
 	panic("%s: unknown counter %d", __func__, cnt);
 }
 
 /*
  * Copy data from ifnet to userland API structure if_data.
  */
 void
 if_data_copy(struct ifnet *ifp, struct if_data *ifd)
 {
 
 	/* Plain attributes are copied field by field from the ifnet. */
 	ifd->ifi_type = ifp->if_type;
 	ifd->ifi_physical = 0;		/* always reported as zero here */
 	ifd->ifi_addrlen = ifp->if_addrlen;
 	ifd->ifi_hdrlen = ifp->if_hdrlen;
 	ifd->ifi_link_state = ifp->if_link_state;
 	ifd->ifi_vhid = 0;		/* always reported as zero here */
 	ifd->ifi_datalen = sizeof(struct if_data);
 	ifd->ifi_mtu = ifp->if_mtu;
 	ifd->ifi_metric = ifp->if_metric;
 	ifd->ifi_baudrate = ifp->if_baudrate;
 	ifd->ifi_hwassist = ifp->if_hwassist;
 	ifd->ifi_epoch = ifp->if_epoch;
 	ifd->ifi_lastchange = ifp->if_lastchange;
 
 	/* Statistics are fetched through the interface's if_get_counter hook. */
 	ifd->ifi_ipackets = ifp->if_get_counter(ifp, IFCOUNTER_IPACKETS);
 	ifd->ifi_ierrors = ifp->if_get_counter(ifp, IFCOUNTER_IERRORS);
 	ifd->ifi_opackets = ifp->if_get_counter(ifp, IFCOUNTER_OPACKETS);
 	ifd->ifi_oerrors = ifp->if_get_counter(ifp, IFCOUNTER_OERRORS);
 	ifd->ifi_collisions = ifp->if_get_counter(ifp, IFCOUNTER_COLLISIONS);
 	ifd->ifi_ibytes = ifp->if_get_counter(ifp, IFCOUNTER_IBYTES);
 	ifd->ifi_obytes = ifp->if_get_counter(ifp, IFCOUNTER_OBYTES);
 	ifd->ifi_imcasts = ifp->if_get_counter(ifp, IFCOUNTER_IMCASTS);
 	ifd->ifi_omcasts = ifp->if_get_counter(ifp, IFCOUNTER_OMCASTS);
 	ifd->ifi_iqdrops = ifp->if_get_counter(ifp, IFCOUNTER_IQDROPS);
 	ifd->ifi_oqdrops = ifp->if_get_counter(ifp, IFCOUNTER_OQDROPS);
 	ifd->ifi_noproto = ifp->if_get_counter(ifp, IFCOUNTER_NOPROTO);
 }
 
 /*
  * Wrapper functions for struct ifnet address list locking macros.  These are
  * used by kernel modules to avoid encoding programming interface or binary
  * interface assumptions that may be violated when kernel-internal locking
  * approaches change.
  */
 /* Function form of IF_ADDR_RLOCK() for the given interface. */
 void
 if_addr_rlock(struct ifnet *ifp)
 {
 
 	IF_ADDR_RLOCK(ifp);
 }
 
 /* Function form of IF_ADDR_RUNLOCK() for the given interface. */
 void
 if_addr_runlock(struct ifnet *ifp)
 {
 
 	IF_ADDR_RUNLOCK(ifp);
 }
 
 /*
  * Same as if_addr_rlock() but takes an if_t handle: it applies
  * IF_ADDR_RLOCK() to the handle cast to struct ifnet *.
  */
 void
 if_maddr_rlock(if_t ifp)
 {
 
 	IF_ADDR_RLOCK((struct ifnet *)ifp);
 }
 
 /*
  * Same as if_addr_runlock() but takes an if_t handle: it applies
  * IF_ADDR_RUNLOCK() to the handle cast to struct ifnet *.
  */
 void
 if_maddr_runlock(if_t ifp)
 {
 
 	IF_ADDR_RUNLOCK((struct ifnet *)ifp);
 }
 
 /*
  * Initialization, destruction and refcounting functions for ifaddrs.
  */
 struct ifaddr *
 ifa_alloc(size_t size, int flags)
 {
 	struct ifaddr *addr;
 
 	KASSERT(size >= sizeof(struct ifaddr),
 	    ("%s: invalid size %zu", __func__, size));
 
 	addr = malloc(size, M_IFADDR, M_ZERO | flags);
 	if (addr == NULL)
 		return (NULL);
 
 	/*
 	 * Allocate the four per-address counters in order; the chain stops
 	 * at the first failure, leaving the remaining pointers NULL thanks
 	 * to M_ZERO.
 	 */
 	if ((addr->ifa_opackets = counter_u64_alloc(flags)) != NULL &&
 	    (addr->ifa_ipackets = counter_u64_alloc(flags)) != NULL &&
 	    (addr->ifa_obytes = counter_u64_alloc(flags)) != NULL &&
 	    (addr->ifa_ibytes = counter_u64_alloc(flags)) != NULL) {
 		/* The caller receives the initial reference. */
 		refcount_init(&addr->ifa_refcnt, 1);
 		return (addr);
 	}
 
 	/* Undo a partial allocation; free(NULL) is okay. */
 	counter_u64_free(addr->ifa_opackets);
 	counter_u64_free(addr->ifa_ipackets);
 	counter_u64_free(addr->ifa_obytes);
 	counter_u64_free(addr->ifa_ibytes);
 	free(addr, M_IFADDR);
 
 	return (NULL);
 }
 
 /* Take an additional reference on ifa; paired with ifa_free(). */
 void
 ifa_ref(struct ifaddr *ifa)
 {
 
 	refcount_acquire(&ifa->ifa_refcnt);
 }
 
 void
 ifa_free(struct ifaddr *ifa)
 {
 
 	/* Nothing more to do unless this was the last reference. */
 	if (!refcount_release(&ifa->ifa_refcnt))
 		return;
 
 	/* Last reference gone: release the counters, then the ifaddr. */
 	counter_u64_free(ifa->ifa_opackets);
 	counter_u64_free(ifa->ifa_ipackets);
 	counter_u64_free(ifa->ifa_obytes);
 	counter_u64_free(ifa->ifa_ibytes);
 	free(ifa, M_IFADDR);
 }
 
 int
 ifa_add_loopback_route(struct ifaddr *ifa, struct sockaddr *ia)
 {
 	int error = 0;
 	struct rtentry *rt = NULL;
 	struct rt_addrinfo info;
 	static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK};
 
 	bzero(&info, sizeof(info));
 	info.rti_ifp = V_loif;
 	info.rti_flags = ifa->ifa_flags | RTF_HOST | RTF_STATIC;
 	info.rti_info[RTAX_DST] = ia;
 	info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&null_sdl;
 	error = rtrequest1_fib(RTM_ADD, &info, &rt, ifa->ifa_ifp->if_fib);
 
 	if (error == 0 && rt != NULL) {
 		RT_LOCK(rt);
 		((struct sockaddr_dl *)rt->rt_gateway)->sdl_type  =
 			ifa->ifa_ifp->if_type;
 		((struct sockaddr_dl *)rt->rt_gateway)->sdl_index =
 			ifa->ifa_ifp->if_index;
 		RT_REMREF(rt);
 		RT_UNLOCK(rt);
 	} else if (error != 0)
 		log(LOG_DEBUG, "%s: insertion failed: %u\n", __func__, error);
 
 	return (error);
 }
 
 /*
  * Delete the host route for "ia" whose gateway is a link-level sockaddr
  * carrying the type and index of ifa's interface.  Returns the
  * rtrequest1_fib() result; failures are also logged at LOG_DEBUG.
  */
 int
 ifa_del_loopback_route(struct ifaddr *ifa, struct sockaddr *ia)
 {
 	int error = 0;
 	struct rt_addrinfo info;
 	struct sockaddr_dl null_sdl;
 
 	/* Build the gateway sockaddr_dl describing ifa's interface. */
 	bzero(&null_sdl, sizeof(null_sdl));
 	null_sdl.sdl_len = sizeof(null_sdl);
 	null_sdl.sdl_family = AF_LINK;
 	null_sdl.sdl_type = ifa->ifa_ifp->if_type;
 	null_sdl.sdl_index = ifa->ifa_ifp->if_index;
 	/* Describe the route to delete: same flags as used when adding it. */
 	bzero(&info, sizeof(info));
 	info.rti_flags = ifa->ifa_flags | RTF_HOST | RTF_STATIC;
 	info.rti_info[RTAX_DST] = ia;
 	info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&null_sdl;
 	error = rtrequest1_fib(RTM_DELETE, &info, NULL, ifa->ifa_ifp->if_fib);
 
 	if (error != 0)
 		log(LOG_DEBUG, "%s: deletion failed: %u\n", __func__, error);
 
 	return (error);
 }
 
 /*
  * Re-point an existing route for "sa" in the given fib at ifa's interface
  * by rewriting the type and index stored in the route's link-level gateway.
  * Returns 0 on success or EHOSTUNREACH when no route for "sa" is found.
  */
 int
 ifa_switch_loopback_route(struct ifaddr *ifa, struct sockaddr *sa, int fib)
 {
 	struct rtentry *rt;
 
 	rt = rtalloc1_fib(sa, 0, 0, fib);
 	if (rt == NULL) {
 		/* Terminate the message so it does not run into the next one. */
 		log(LOG_DEBUG, "%s: fail\n", __func__);
 		return (EHOSTUNREACH);
 	}
 	((struct sockaddr_dl *)rt->rt_gateway)->sdl_type =
 	    ifa->ifa_ifp->if_type;
 	((struct sockaddr_dl *)rt->rt_gateway)->sdl_index =
 	    ifa->ifa_ifp->if_index;
 	/* Release the route obtained from rtalloc1_fib(). */
 	RTFREE_LOCKED(rt);
 
 	return (0);
 }
 
 /*
  * XXX: Because sockaddr_dl has deeper structure than the sockaddr
  * structs used to represent other address families, it is necessary
  * to perform a different comparison.
  */
 
 /*
  * True when both sockaddr_dl arguments have the same sdl_len and their
  * link-level address bytes (LLADDR()) match over a1's sdl_alen bytes.
  * Both arguments are evaluated more than once.
  */
 #define	sa_dl_equal(a1, a2)	\
 	((((struct sockaddr_dl *)(a1))->sdl_len ==			\
 	 ((struct sockaddr_dl *)(a2))->sdl_len) &&			\
 	 (bcmp(LLADDR((struct sockaddr_dl *)(a1)),			\
 	       LLADDR((struct sockaddr_dl *)(a2)),			\
 	       ((struct sockaddr_dl *)(a1))->sdl_alen) == 0))
 
 /*
  * Locate an interface based on a complete address.
  */
 /*ARGSUSED*/
 static struct ifaddr *
 ifa_ifwithaddr_internal(struct sockaddr *addr, int getref)
 {
 	struct ifnet *ifp;
 	struct ifaddr *ifa;
 
 	IFNET_RLOCK_NOSLEEP();
 	TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != addr->sa_family)
 				continue;
 			/*
 			 * A hit is either the address itself or, on a
 			 * broadcast interface, its non-empty broadcast
 			 * address (IP6 doesn't have broadcast).
 			 */
 			if (sa_equal(addr, ifa->ifa_addr) ||
 			    ((ifp->if_flags & IFF_BROADCAST) &&
 			    ifa->ifa_broadaddr &&
 			    ifa->ifa_broadaddr->sa_len != 0 &&
 			    sa_equal(ifa->ifa_broadaddr, addr))) {
 				/* Reference is taken while the list is locked. */
 				if (getref)
 					ifa_ref(ifa);
 				IF_ADDR_RUNLOCK(ifp);
 				goto done;
 			}
 		}
 		IF_ADDR_RUNLOCK(ifp);
 	}
 	/* Nothing matched on any interface. */
 	ifa = NULL;
 done:
 	IFNET_RUNLOCK_NOSLEEP();
 	return (ifa);
 }
 
 /*
  * Look up the ifaddr matching addr and return it with a reference held
  * (getref == 1), or NULL when there is no match.
  */
 struct ifaddr *
 ifa_ifwithaddr(struct sockaddr *addr)
 {
 
 	return (ifa_ifwithaddr_internal(addr, 1));
 }
 
 /*
  * Existence test only: returns non-zero when some ifaddr matches addr.
  * No reference is taken (getref == 0).
  */
 int
 ifa_ifwithaddr_check(struct sockaddr *addr)
 {
 
 	return (ifa_ifwithaddr_internal(addr, 0) != NULL);
 }
 
 /*
  * Locate an interface based on the broadcast address.
  */
 /* ARGSUSED */
 struct ifaddr *
-ifa_ifwithbroadaddr(struct sockaddr *addr)
+ifa_ifwithbroadaddr(struct sockaddr *addr, int fibnum)
 {
 	struct ifnet *ifp;
 	struct ifaddr *ifa;
 
 	IFNET_RLOCK_NOSLEEP();
 	TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
+		if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum))
+			continue;
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != addr->sa_family)
 				continue;
 			/* Match only a non-empty broadcast address on a broadcast interface. */
 			if ((ifp->if_flags & IFF_BROADCAST) &&
 			    ifa->ifa_broadaddr &&
 			    ifa->ifa_broadaddr->sa_len != 0 &&
 			    sa_equal(ifa->ifa_broadaddr, addr)) {
 				/* Returned with a reference held. */
 				ifa_ref(ifa);
 				IF_ADDR_RUNLOCK(ifp);
 				goto done;
 			}
 		}
 		IF_ADDR_RUNLOCK(ifp);
 	}
 	/* No interface carries this broadcast address. */
 	ifa = NULL;
 done:
 	IFNET_RUNLOCK_NOSLEEP();
 	return (ifa);
 }
 
 /*
  * Locate the point to point interface with a given destination address.
  */
 /*ARGSUSED*/
 struct ifaddr *
-ifa_ifwithdstaddr_fib(struct sockaddr *addr, int fibnum)
+ifa_ifwithdstaddr(struct sockaddr *addr, int fibnum)
 {
 	struct ifnet *ifp;
 	struct ifaddr *ifa;
 
 	IFNET_RLOCK_NOSLEEP();
 	TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		/* Only point-to-point interfaces have a destination address. */
 		if ((ifp->if_flags & IFF_POINTOPOINT) == 0)
 			continue;
 		/* RT_ALL_FIBS disables the per-fib filter. */
 		if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum))
 			continue;
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != addr->sa_family)
 				continue;
 			if (ifa->ifa_dstaddr != NULL &&
 			    sa_equal(addr, ifa->ifa_dstaddr)) {
 				/* Returned with a reference held. */
 				ifa_ref(ifa);
 				IF_ADDR_RUNLOCK(ifp);
 				goto done;
 			}
 		}
 		IF_ADDR_RUNLOCK(ifp);
 	}
 	/* No matching destination address found. */
 	ifa = NULL;
 done:
 	IFNET_RUNLOCK_NOSLEEP();
 	return (ifa);
 }
 
-struct ifaddr *
-ifa_ifwithdstaddr(struct sockaddr *addr)
-{
-
-	return (ifa_ifwithdstaddr_fib(addr, RT_ALL_FIBS));
-}
-
 /*
  * Find an interface on a specific network.  If many, choice
  * is most specific found.
  */
 struct ifaddr *
-ifa_ifwithnet_fib(struct sockaddr *addr, int ignore_ptp, int fibnum)
+ifa_ifwithnet(struct sockaddr *addr, int ignore_ptp, int fibnum)
 {
 	struct ifnet *ifp;
 	struct ifaddr *ifa;
 	struct ifaddr *ifa_maybe = NULL;
 	u_int af = addr->sa_family;
 	char *addr_data = addr->sa_data, *cplim;
 
 	/*
 	 * AF_LINK addresses can be looked up directly by their index number,
 	 * so do that if we can.
 	 */
 	if (af == AF_LINK) {
 	    struct sockaddr_dl *sdl = (struct sockaddr_dl *)addr;
 	    if (sdl->sdl_index && sdl->sdl_index <= V_if_index)
 		return (ifaddr_byindex(sdl->sdl_index));
 	}
 #ifdef INET6
 	/* Scoped link-local IPv6: resolve via the interface named by the scope id. */
 	if (af == AF_INET6) {
 		struct sockaddr_in6 *sin6;
 
 		sin6 = (struct sockaddr_in6 *)addr;
 		if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr) &&
 		    sin6->sin6_scope_id != 0) {
 			ifp = in6_getlinkifnet(sin6->sin6_scope_id);
 			if (ifp != NULL)
 				return ((struct ifaddr *)
 				    in6ifa_ifpforlinklocal(ifp, 0));
 		}
 	}
 #endif
 	/*
 	 * Scan through each interface, looking for ones that have addresses
 	 * in this address family and the requested fib.  Maintain a reference
 	 * on ifa_maybe once we find one, as we release the IF_ADDR_RLOCK() that
 	 * kept it stable when we move onto the next interface.
 	 */
 	IFNET_RLOCK_NOSLEEP();
 	TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum))
 			continue;
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			char *cp, *cp2, *cp3;
 
 			/*
 			 * The "next" label lets the netmask comparison below
 			 * jump straight to this continue statement.
 			 */
 			if (ifa->ifa_addr->sa_family != af)
 next:				continue;
 			if (af == AF_INET && 
 			    ifp->if_flags & IFF_POINTOPOINT && !ignore_ptp) {
 				/*
 				 * This is a bit broken as it doesn't
 				 * take into account that the remote end may
 				 * be a single node in the network we are
 				 * looking for.
 				 * The trouble is that we don't know the
 				 * netmask for the remote end.
 				 */
 				if (ifa->ifa_dstaddr != NULL &&
 				    sa_equal(addr, ifa->ifa_dstaddr)) {
 					ifa_ref(ifa);
 					IF_ADDR_RUNLOCK(ifp);
 					goto done;
 				}
 			} else {
 				/*
 				 * Scan all the bits in the ifa's address.
 				 * If a bit disagrees with what we are
 				 * looking for, mask it with the netmask
 				 * to see if it really matters.
 				 * (A byte at a time)
 				 */
 				if (ifa->ifa_netmask == 0)
 					continue;
 				cp = addr_data;
 				cp2 = ifa->ifa_addr->sa_data;
 				cp3 = ifa->ifa_netmask->sa_data;
 				cplim = ifa->ifa_netmask->sa_len
 					+ (char *)ifa->ifa_netmask;
 				while (cp3 < cplim)
 					if ((*cp++ ^ *cp2++) & *cp3++)
 						goto next; /* next address! */
 				/*
 				 * If the netmask of what we just found
 				 * is more specific than what we had before
 				 * (if we had one), or if the virtual status
 				 * of new prefix is better than of the old one,
 				 * then remember the new one before continuing
 				 * to search for an even better one.
 				 */
 				if (ifa_maybe == NULL ||
 				    ifa_preferred(ifa_maybe, ifa) ||
 				    rn_refines((caddr_t)ifa->ifa_netmask,
 				    (caddr_t)ifa_maybe->ifa_netmask)) {
 					if (ifa_maybe != NULL)
 						ifa_free(ifa_maybe);
 					ifa_maybe = ifa;
 					ifa_ref(ifa_maybe);
 				}
 			}
 		}
 		IF_ADDR_RUNLOCK(ifp);
 	}
 	/*
 	 * No exact hit: hand the best candidate (and its reference) to the
 	 * caller and clear ifa_maybe so it is not released below.
 	 */
 	ifa = ifa_maybe;
 	ifa_maybe = NULL;
 done:
 	IFNET_RUNLOCK_NOSLEEP();
 	/* An exact hit jumped here; drop any candidate that was superseded. */
 	if (ifa_maybe != NULL)
 		ifa_free(ifa_maybe);
 	return (ifa);
-}
-
-struct ifaddr *
-ifa_ifwithnet(struct sockaddr *addr, int ignore_ptp)
-{
-
-	return (ifa_ifwithnet_fib(addr, ignore_ptp, RT_ALL_FIBS));
 }
 
 /*
  * Find an interface address specific to an interface best matching
  * a given address.
  */
 /*
  * Returns, with a reference held, the address on ifp that best matches
  * addr: an exact or netmask match if there is one, otherwise the first
  * address of the same family.  Returns NULL when addr's family is out of
  * range or ifp has no address in that family.
  */
 struct ifaddr *
 ifaof_ifpforaddr(struct sockaddr *addr, struct ifnet *ifp)
 {
 	struct ifaddr *ifa;
 	char *cp, *cp2, *cp3;
 	char *cplim;
 	struct ifaddr *ifa_maybe = NULL;
 	u_int af = addr->sa_family;
 
 	if (af >= AF_MAX)
 		return (NULL);
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family != af)
 			continue;
 		/* Remember the first same-family address as the fallback. */
 		if (ifa_maybe == NULL)
 			ifa_maybe = ifa;
 		/* Without a netmask only an exact local or peer match counts. */
 		if (ifa->ifa_netmask == 0) {
 			if (sa_equal(addr, ifa->ifa_addr) ||
 			    (ifa->ifa_dstaddr &&
 			    sa_equal(addr, ifa->ifa_dstaddr)))
 				goto done;
 			continue;
 		}
 		if (ifp->if_flags & IFF_POINTOPOINT) {
 			/*
 			 * The destination address may be unset; check it
 			 * before comparing, as the other lookups here do.
 			 */
 			if (ifa->ifa_dstaddr != NULL &&
 			    sa_equal(addr, ifa->ifa_dstaddr))
 				goto done;
 		} else {
 			/* Compare byte by byte under the netmask. */
 			cp = addr->sa_data;
 			cp2 = ifa->ifa_addr->sa_data;
 			cp3 = ifa->ifa_netmask->sa_data;
 			cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask;
 			for (; cp3 < cplim; cp3++)
 				if ((*cp++ ^ *cp2++) & *cp3)
 					break;
 			/* Reaching the end of the mask means every byte matched. */
 			if (cp3 == cplim)
 				goto done;
 		}
 	}
 	ifa = ifa_maybe;
 done:
 	/* The reference is taken while the address list is still locked. */
 	if (ifa != NULL)
 		ifa_ref(ifa);
 	IF_ADDR_RUNLOCK(ifp);
 	return (ifa);
 }
 
 /*
  * See whether new ifa is better than current one:
  * 1) A non-virtual one is preferred over virtual.
  * 2) A virtual in master state preferred over any other state.
  *
  * Used in several address selecting functions.
  */
 int
 ifa_preferred(struct ifaddr *cur, struct ifaddr *next)
 {
 
 	/* A non-virtual current address is never displaced. */
 	if (!cur->ifa_carp)
 		return (0);
 	/* A non-virtual candidate beats a virtual current address. */
 	if (!next->ifa_carp)
 		return (1);
 	/* Both virtual: prefer the candidate only if it alone is master. */
 	return ((*carp_master_p)(next) && !(*carp_master_p)(cur));
 }
 
 #include <net/if_llatbl.h>
 
 /*
  * Default action when installing a route with a Link Level gateway.
  * Lookup an appropriate real ifa to point to.
  * This should be moved to /sys/net/link.c eventually.
  */
 static void
 link_rtrequest(int cmd, struct rtentry *rt, struct rt_addrinfo *info)
 {
 	struct ifaddr *ifa, *oifa;
 	struct sockaddr *dst;
 	struct ifnet *ifp;
 
 	RT_LOCK_ASSERT(rt);
 
 	if (cmd != RTM_ADD || ((ifa = rt->rt_ifa) == 0) ||
 	    ((ifp = ifa->ifa_ifp) == 0) || ((dst = rt_key(rt)) == 0))
 		return;
 	ifa = ifaof_ifpforaddr(dst, ifp);
 	if (ifa) {
 		oifa = rt->rt_ifa;
 		rt->rt_ifa = ifa;
 		ifa_free(oifa);
 		if (ifa->ifa_rtrequest && ifa->ifa_rtrequest != link_rtrequest)
 			ifa->ifa_rtrequest(cmd, rt, info);
 	}
 }
 
 /*
  * Allocate size bytes from M_TEMP for use as a sockaddr_dl.  The malloc
  * flags are passed through unchanged; the result is returned unchecked.
  */
 struct sockaddr_dl *
 link_alloc_sdl(size_t size, int flags)
 {
 
 	return (malloc(size, M_TEMP, flags));
 }
 
 /* Release a sockaddr back to M_TEMP, the type link_alloc_sdl() allocates from. */
 void
 link_free_sdl(struct sockaddr *sa)
 {
 	free(sa, M_TEMP);
 }
 
 /*
  * Fills in given sdl with interface basic info.
  * Returns pointer to filled sdl.
  */
 struct sockaddr_dl *
 link_init_sdl(struct ifnet *ifp, struct sockaddr *paddr, u_char iftype)
 {
 	struct sockaddr_dl *out = (struct sockaddr_dl *)paddr;
 
 	/* Start from an all-zero sockaddr_dl, then set the basic fields. */
 	memset(out, 0, sizeof(struct sockaddr_dl));
 	out->sdl_len = sizeof(struct sockaddr_dl);
 	out->sdl_family = AF_LINK;
 	out->sdl_index = ifp->if_index;
 	out->sdl_type = iftype;
 
 	return (out);
 }
 
 /*
  * Mark an interface down and notify protocols of
  * the transition.
  */
 static void
 if_unroute(struct ifnet *ifp, int flag, int fam)
 {
 	struct ifaddr *ifa;
 
 	KASSERT(flag == IFF_UP, ("if_unroute: flag != IFF_UP"));
 
 	ifp->if_flags &= ~flag;
 	getmicrotime(&ifp->if_lastchange);
 	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
 		if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
 			pfctlinput(PRC_IFDOWN, ifa->ifa_addr);
 	ifp->if_qflush(ifp);
 
 	if (ifp->if_carp)
 		(*carp_linkstate_p)(ifp);
 	rt_ifmsg(ifp);
 }
 
 /*
  * Mark an interface up and notify protocols of
  * the transition.
  */
 static void
 if_route(struct ifnet *ifp, int flag, int fam)
 {
 	struct ifaddr *ifa;
 
 	KASSERT(flag == IFF_UP, ("if_route: flag != IFF_UP"));
 
 	ifp->if_flags |= flag;
 	getmicrotime(&ifp->if_lastchange);
 	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
 		if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
 			pfctlinput(PRC_IFUP, ifa->ifa_addr);
 	if (ifp->if_carp)
 		(*carp_linkstate_p)(ifp);
 	rt_ifmsg(ifp);
 #ifdef INET6
 	in6_if_up(ifp);
 #endif
 }
 
 /*
  * Function pointer hooks for the vlan code.  They are only defined here;
  * do_link_state_change() calls vlan_link_state_p when the interface has
  * a vlan trunk attached.
  * NOTE(review): presumably assigned by if_vlan when it is loaded -- confirm.
  */
 void	(*vlan_link_state_p)(struct ifnet *);	/* XXX: private from if_vlan */
 void	(*vlan_trunk_cap_p)(struct ifnet *);		/* XXX: private from if_vlan */
 struct ifnet *(*vlan_trunkdev_p)(struct ifnet *);
 struct	ifnet *(*vlan_devat_p)(struct ifnet *, uint16_t);
 int	(*vlan_tag_p)(struct ifnet *, uint16_t *);
 int	(*vlan_setcookie_p)(struct ifnet *, void *);
 void	*(*vlan_cookie_p)(struct ifnet *);
 
 /*
  * Record a new link state for the interface.  The follow-up work is not
  * done here: it is deferred to do_link_state_change() through the swi
  * taskqueue, which avoids LORs between the driver lock and upper layer
  * locks as well as possible recursion.
  */
 void
 if_link_state_change(struct ifnet *ifp, int link_state)
 {
 
 	/* Only an actual transition is recorded and posted. */
 	if (ifp->if_link_state != link_state) {
 		ifp->if_link_state = link_state;
 		taskqueue_enqueue(taskqueue_swi, &ifp->if_linktask);
 	}
 }
 
 /*
  * Task handler behind if_link_state_change().  'arg' is the ifnet whose
  * link state changed, 'pending' the number of times the task was queued
  * before it ran.  Runs in the interface's vnet and fans the new state
  * out to the routing socket, vlan, netgraph, carp, bridge, lagg, devctl,
  * the log and the ifnet_link_event handlers.
  */
 static void
 do_link_state_change(void *arg, int pending)
 {
 	struct ifnet *ifp = arg;
 	int state = ifp->if_link_state;
 	int is_up = (state == LINK_STATE_UP);
 	CURVNET_SET(ifp->if_vnet);
 
 	/* Notify that the link state has changed. */
 	rt_ifmsg(ifp);
 	if (ifp->if_vlantrunk != NULL) {
 		(*vlan_link_state_p)(ifp);
 	}
 
 	if (ifp->if_type == IFT_ETHER || ifp->if_type == IFT_L2VLAN) {
 		if (IFP2AC(ifp)->ac_netgraph != NULL)
 			(*ng_ether_link_state_p)(ifp, state);
 	}
 	if (ifp->if_carp) {
 		(*carp_linkstate_p)(ifp);
 	}
 	if (ifp->if_bridge) {
 		(*bridge_linkstate_p)(ifp);
 	}
 	if (ifp->if_lagg) {
 		(*lagg_linkstate_p)(ifp, state);
 	}
 
 	if (IS_DEFAULT_VNET(curvnet)) {
 		devctl_notify("IFNET", ifp->if_xname,
 		    is_up ? "LINK_UP" : "LINK_DOWN", NULL);
 	}
 	if (pending > 1) {
 		if_printf(ifp, "%d link states coalesced\n", pending);
 	}
 	if (log_link_state_change) {
 		log(LOG_NOTICE, "%s: link state changed to %s\n", ifp->if_xname,
 		    is_up ? "UP" : "DOWN");
 	}
 	/* The handlers get the state as currently stored in the ifnet. */
 	EVENTHANDLER_INVOKE(ifnet_link_event, ifp, ifp->if_link_state);
 	CURVNET_RESTORE();
 }
 
 /*
  * Public entry point for taking an interface down: clears IFF_UP via
  * if_unroute() with no address family restriction.
  */
 void
 if_down(struct ifnet *ifp)
 {
 
 	if_unroute(ifp, IFF_UP, AF_UNSPEC);
 }
 
 /*
  * Public entry point for bringing an interface up: sets IFF_UP via
  * if_route() with no address family restriction.
  */
 void
 if_up(struct ifnet *ifp)
 {
 
 	if_route(ifp, IFF_UP, AF_UNSPEC);
 }
 
 /*
  * Discard every packet waiting on the interface send queue and reset
  * the queue to empty.  With ALTQ compiled in and enabled on the queue,
  * the ALTQ discipline is purged first.
  */
 void
 if_qflush(struct ifnet *ifp)
 {
 	struct ifaltq *sendq = &ifp->if_snd;
 	struct mbuf *pkt, *following;
 
 	IFQ_LOCK(sendq);
 #ifdef ALTQ
 	if (ALTQ_IS_ENABLED(sendq))
 		ALTQ_PURGE(sendq);
 #endif
 	/* Remember the successor before the current packet is freed. */
 	for (pkt = sendq->ifq_head; pkt != 0; pkt = following) {
 		following = pkt->m_nextpkt;
 		m_freem(pkt);
 	}
 	sendq->ifq_head = 0;
 	sendq->ifq_tail = 0;
 	sendq->ifq_len = 0;
 	IFQ_UNLOCK(sendq);
 }
 
 /*
  * Look an interface up by name and return it with a reference taken
  * through if_ref().  Interfaces flagged IFF_DYING are never returned.
  * Returns NULL when no interface matches.
  */
 struct ifnet *
 ifunit_ref(const char *name)
 {
 	struct ifnet *ifp;
 
 	IFNET_RLOCK_NOSLEEP();
 	TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if (strncmp(name, ifp->if_xname, IFNAMSIZ) != 0)
 			continue;
 		if ((ifp->if_flags & IFF_DYING) == 0) {
 			/* Take the reference while the list lock is held. */
 			if_ref(ifp);
 			break;
 		}
 	}
 	IFNET_RUNLOCK_NOSLEEP();
 	return (ifp);
 }
 
 /*
  * Look an interface up by name without taking a reference on it.
  * Returns NULL when no interface matches.
  */
 struct ifnet *
 ifunit(const char *name)
 {
 	struct ifnet *ifp;
 
 	IFNET_RLOCK_NOSLEEP();
 	TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if (strncmp(name, ifp->if_xname, IFNAMSIZ) != 0)
 			continue;
 		break;
 	}
 	IFNET_RUNLOCK_NOSLEEP();
 	return (ifp);
 }
 
 /*
  * Hardware specific interface ioctls.
  *
  * Handles the requests that operate on the ifnet itself.  'data' is
  * interpreted as a struct ifreq (as a struct ifgroupreq for the group
  * requests).  Requests that modify the interface are checked with
  * priv_check() against the calling thread first.
  *
  * Returns 0 or an errno value.  ENOIOCTL means 'cmd' is not one of the
  * commands handled here.
  */
 static int
 ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td)
 {
 	struct ifreq *ifr;
 	int error = 0;
 	int new_flags, temp_flags;
 	size_t namelen, onamelen;
 	size_t descrlen;
 	char *descrbuf, *odescrbuf;
 	char new_name[IFNAMSIZ];
 	struct ifaddr *ifa;
 	struct sockaddr_dl *sdl;
 
 	ifr = (struct ifreq *)data;
 	switch (cmd) {
 	case SIOCGIFINDEX:
 		ifr->ifr_index = ifp->if_index;
 		break;
 
 	case SIOCGIFFLAGS:
 		/* Stack and driver flags are reported as one 32-bit word. */
 		temp_flags = ifp->if_flags | ifp->if_drv_flags;
 		ifr->ifr_flags = temp_flags & 0xffff;
 		ifr->ifr_flagshigh = temp_flags >> 16;
 		break;
 
 	case SIOCGIFCAP:
 		ifr->ifr_reqcap = ifp->if_capabilities;
 		ifr->ifr_curcap = ifp->if_capenable;
 		break;
 
 #ifdef MAC
 	case SIOCGIFMAC:
 		error = mac_ifnet_ioctl_get(td->td_ucred, ifr, ifp);
 		break;
 #endif
 
 	case SIOCGIFMETRIC:
 		ifr->ifr_metric = ifp->if_metric;
 		break;
 
 	case SIOCGIFMTU:
 		ifr->ifr_mtu = ifp->if_mtu;
 		break;
 
 	case SIOCGIFPHYS:
 		/* XXXGL: did this ever work? */
 		ifr->ifr_phys = 0;
 		break;
 
 	case SIOCGIFDESCR:
 		error = 0;
 		sx_slock(&ifdescr_sx);
 		if (ifp->if_description == NULL)
 			error = ENOMSG;
 		else {
 			/* space for terminating nul */
 			descrlen = strlen(ifp->if_description) + 1;
 			/*
 			 * A too small user buffer is signalled by a NULL
 			 * buffer pointer; the needed length is reported
 			 * either way.
 			 */
 			if (ifr->ifr_buffer.length < descrlen)
 				ifr->ifr_buffer.buffer = NULL;
 			else
 				error = copyout(ifp->if_description,
 				    ifr->ifr_buffer.buffer, descrlen);
 			ifr->ifr_buffer.length = descrlen;
 		}
 		sx_sunlock(&ifdescr_sx);
 		break;
 
 	case SIOCSIFDESCR:
 		error = priv_check(td, PRIV_NET_SETIFDESCR);
 		if (error)
 			return (error);
 
 		/*
 		 * Copy only (length-1) bytes to make sure that
 		 * if_description is always nul terminated.  The
 		 * length parameter is supposed to count the
 		 * terminating nul in.
 		 */
 		if (ifr->ifr_buffer.length > ifdescr_maxlen)
 			return (ENAMETOOLONG);
 		else if (ifr->ifr_buffer.length == 0)
 			descrbuf = NULL;
 		else {
 			descrbuf = malloc(ifr->ifr_buffer.length, M_IFDESCR,
 			    M_WAITOK | M_ZERO);
 			error = copyin(ifr->ifr_buffer.buffer, descrbuf,
 			    ifr->ifr_buffer.length - 1);
 			if (error) {
 				free(descrbuf, M_IFDESCR);
 				break;
 			}
 		}
 
 		/* Swap in the new description, free the old one unlocked. */
 		sx_xlock(&ifdescr_sx);
 		odescrbuf = ifp->if_description;
 		ifp->if_description = descrbuf;
 		sx_xunlock(&ifdescr_sx);
 
 		getmicrotime(&ifp->if_lastchange);
 		free(odescrbuf, M_IFDESCR);
 		break;
 
 	case SIOCGIFFIB:
 		ifr->ifr_fib = ifp->if_fib;
 		break;
 
 	case SIOCSIFFIB:
 		error = priv_check(td, PRIV_NET_SETIFFIB);
 		if (error)
 			return (error);
 		if (ifr->ifr_fib >= rt_numfibs)
 			return (EINVAL);
 
 		ifp->if_fib = ifr->ifr_fib;
 		break;
 
 	case SIOCSIFFLAGS:
 		error = priv_check(td, PRIV_NET_SETIFFLAGS);
 		if (error)
 			return (error);
 		/*
 		 * Currently, no driver owned flags pass the IFF_CANTCHANGE
 		 * check, so we don't need special handling here yet.
 		 */
 		new_flags = (ifr->ifr_flags & 0xffff) |
 		    (ifr->ifr_flagshigh << 16);
 		if (ifp->if_flags & IFF_UP &&
 		    (new_flags & IFF_UP) == 0) {
 			if_down(ifp);
 		} else if (new_flags & IFF_UP &&
 		    (ifp->if_flags & IFF_UP) == 0) {
 			if_up(ifp);
 		}
 		/* See if permanently promiscuous mode bit is about to flip */
 		if ((ifp->if_flags ^ new_flags) & IFF_PPROMISC) {
 			if (new_flags & IFF_PPROMISC)
 				ifp->if_flags |= IFF_PROMISC;
 			else if (ifp->if_pcount == 0)
 				ifp->if_flags &= ~IFF_PROMISC;
 			log(LOG_INFO, "%s: permanently promiscuous mode %s\n",
 			    ifp->if_xname,
 			    (new_flags & IFF_PPROMISC) ? "enabled" : "disabled");
 		}
 		/* Bits in IFF_CANTCHANGE keep their current value. */
 		ifp->if_flags = (ifp->if_flags & IFF_CANTCHANGE) |
 			(new_flags &~ IFF_CANTCHANGE);
 		if (ifp->if_ioctl) {
 			(void) (*ifp->if_ioctl)(ifp, cmd, data);
 		}
 		getmicrotime(&ifp->if_lastchange);
 		break;
 
 	case SIOCSIFCAP:
 		error = priv_check(td, PRIV_NET_SETIFCAP);
 		if (error)
 			return (error);
 		if (ifp->if_ioctl == NULL)
 			return (EOPNOTSUPP);
 		/* Refuse capabilities the interface does not advertise. */
 		if (ifr->ifr_reqcap & ~ifp->if_capabilities)
 			return (EINVAL);
 		error = (*ifp->if_ioctl)(ifp, cmd, data);
 		if (error == 0)
 			getmicrotime(&ifp->if_lastchange);
 		break;
 
 #ifdef MAC
 	case SIOCSIFMAC:
 		error = mac_ifnet_ioctl_set(td->td_ucred, ifr, ifp);
 		break;
 #endif
 
 	case SIOCSIFNAME:
 		error = priv_check(td, PRIV_NET_SETIFNAME);
 		if (error)
 			return (error);
 		error = copyinstr(ifr->ifr_data, new_name, IFNAMSIZ, NULL);
 		if (error != 0)
 			return (error);
 		if (new_name[0] == '\0')
 			return (EINVAL);
 		if (ifunit(new_name) != NULL)
 			return (EEXIST);
 
 		/*
 		 * XXX: Locking.  Nothing else seems to lock if_flags,
 		 * and there are numerous other races with the
 		 * ifunit() checks not being atomic with namespace
 		 * changes (renames, vmoves, if_attach, etc).
 		 */
 		ifp->if_flags |= IFF_RENAMING;
 
 		/* Announce the departure of the interface. */
 		rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
 		EVENTHANDLER_INVOKE(ifnet_departure_event, ifp);
 
 		log(LOG_INFO, "%s: changing name to '%s'\n",
 		    ifp->if_xname, new_name);
 
 		IF_ADDR_WLOCK(ifp);
 		strlcpy(ifp->if_xname, new_name, sizeof(ifp->if_xname));
 		ifa = ifp->if_addr;
 		sdl = (struct sockaddr_dl *)ifa->ifa_addr;
 		namelen = strlen(new_name);
 		onamelen = sdl->sdl_nlen;
 		/*
 		 * Move the address if needed.  This is safe because we
 		 * allocate space for a name of length IFNAMSIZ when we
 		 * create this in if_attach().
 		 */
 		if (namelen != onamelen) {
 			bcopy(sdl->sdl_data + onamelen,
 			    sdl->sdl_data + namelen, sdl->sdl_alen);
 		}
 		bcopy(new_name, sdl->sdl_data, namelen);
 		sdl->sdl_nlen = namelen;
 		/* Rebuild the netmask: 0xff over the bytes of the new name. */
 		sdl = (struct sockaddr_dl *)ifa->ifa_netmask;
 		bzero(sdl->sdl_data, onamelen);
 		while (namelen != 0)
 			sdl->sdl_data[--namelen] = 0xff;
 		IF_ADDR_WUNLOCK(ifp);
 
 		EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp);
 		/* Announce the return of the interface. */
 		rt_ifannouncemsg(ifp, IFAN_ARRIVAL);
 
 		ifp->if_flags &= ~IFF_RENAMING;
 		break;
 
 #ifdef VIMAGE
 	case SIOCSIFVNET:
 		error = priv_check(td, PRIV_NET_SETIFVNET);
 		if (error)
 			return (error);
 		error = if_vmove_loan(td, ifp, ifr->ifr_name, ifr->ifr_jid);
 		break;
 #endif
 
 	case SIOCSIFMETRIC:
 		error = priv_check(td, PRIV_NET_SETIFMETRIC);
 		if (error)
 			return (error);
 		ifp->if_metric = ifr->ifr_metric;
 		getmicrotime(&ifp->if_lastchange);
 		break;
 
 	case SIOCSIFPHYS:
 		error = priv_check(td, PRIV_NET_SETIFPHYS);
 		if (error)
 			return (error);
 		if (ifp->if_ioctl == NULL)
 			return (EOPNOTSUPP);
 		error = (*ifp->if_ioctl)(ifp, cmd, data);
 		if (error == 0)
 			getmicrotime(&ifp->if_lastchange);
 		break;
 
 	case SIOCSIFMTU:
 	{
 		u_long oldmtu = ifp->if_mtu;
 
 		error = priv_check(td, PRIV_NET_SETIFMTU);
 		if (error)
 			return (error);
 		if (ifr->ifr_mtu < IF_MINMTU || ifr->ifr_mtu > IF_MAXMTU)
 			return (EINVAL);
 		if (ifp->if_ioctl == NULL)
 			return (EOPNOTSUPP);
 		error = (*ifp->if_ioctl)(ifp, cmd, data);
 		if (error == 0) {
 			getmicrotime(&ifp->if_lastchange);
 			rt_ifmsg(ifp);
 		}
 		/*
 		 * If the link MTU changed, do network layer specific procedure.
 		 */
 		if (ifp->if_mtu != oldmtu) {
 #ifdef INET6
 			nd6_setmtu(ifp);
 #endif
 		}
 		break;
 	}
 
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		if (cmd == SIOCADDMULTI)
 			error = priv_check(td, PRIV_NET_ADDMULTI);
 		else
 			error = priv_check(td, PRIV_NET_DELMULTI);
 		if (error)
 			return (error);
 
 		/* Don't allow group membership on non-multicast interfaces. */
 		if ((ifp->if_flags & IFF_MULTICAST) == 0)
 			return (EOPNOTSUPP);
 
 		/* Don't let users screw up protocols' entries. */
 		if (ifr->ifr_addr.sa_family != AF_LINK)
 			return (EINVAL);
 
 		if (cmd == SIOCADDMULTI) {
 			struct ifmultiaddr *ifma;
 
 			/*
 			 * Userland is only permitted to join groups once
 			 * via the if_addmulti() KPI, because it cannot hold
 			 * struct ifmultiaddr * between calls. It may also
 			 * lose a race while we check if the membership
 			 * already exists.
 			 */
 			IF_ADDR_RLOCK(ifp);
 			ifma = if_findmulti(ifp, &ifr->ifr_addr);
 			IF_ADDR_RUNLOCK(ifp);
 			if (ifma != NULL)
 				error = EADDRINUSE;
 			else
 				error = if_addmulti(ifp, &ifr->ifr_addr, &ifma);
 		} else {
 			error = if_delmulti(ifp, &ifr->ifr_addr);
 		}
 		if (error == 0)
 			getmicrotime(&ifp->if_lastchange);
 		break;
 
 	case SIOCSIFPHYADDR:
 	case SIOCDIFPHYADDR:
 #ifdef INET6
 	case SIOCSIFPHYADDR_IN6:
 #endif
 	case SIOCSIFMEDIA:
 	case SIOCSIFGENERIC:
 		error = priv_check(td, PRIV_NET_HWIOCTL);
 		if (error)
 			return (error);
 		if (ifp->if_ioctl == NULL)
 			return (EOPNOTSUPP);
 		error = (*ifp->if_ioctl)(ifp, cmd, data);
 		if (error == 0)
 			getmicrotime(&ifp->if_lastchange);
 		break;
 
 	case SIOCGIFSTATUS:
 	case SIOCGIFPSRCADDR:
 	case SIOCGIFPDSTADDR:
 	case SIOCGIFMEDIA:
 	case SIOCGIFGENERIC:
 		/* Read-only requests handed straight to the driver. */
 		if (ifp->if_ioctl == NULL)
 			return (EOPNOTSUPP);
 		error = (*ifp->if_ioctl)(ifp, cmd, data);
 		break;
 
 	case SIOCSIFLLADDR:
 		error = priv_check(td, PRIV_NET_SETLLADDR);
 		if (error)
 			return (error);
 		error = if_setlladdr(ifp,
 		    ifr->ifr_addr.sa_data, ifr->ifr_addr.sa_len);
 		/*
 		 * Only tell the iflladdr_event listeners about a change
 		 * that actually took place; a rejected request leaves the
 		 * address as it was.
 		 */
 		if (error == 0)
 			EVENTHANDLER_INVOKE(iflladdr_event, ifp);
 		break;
 
 	case SIOCAIFGROUP:
 	{
 		struct ifgroupreq *ifgr = (struct ifgroupreq *)ifr;
 
 		error = priv_check(td, PRIV_NET_ADDIFGROUP);
 		if (error)
 			return (error);
 		if ((error = if_addgroup(ifp, ifgr->ifgr_group)))
 			return (error);
 		break;
 	}
 
 	case SIOCGIFGROUP:
 		if ((error = if_getgroup((struct ifgroupreq *)ifr, ifp)))
 			return (error);
 		break;
 
 	case SIOCDIFGROUP:
 	{
 		struct ifgroupreq *ifgr = (struct ifgroupreq *)ifr;
 
 		error = priv_check(td, PRIV_NET_DELIFGROUP);
 		if (error)
 			return (error);
 		if ((error = if_delgroup(ifp, ifgr->ifgr_group)))
 			return (error);
 		break;
 	}
 
 	default:
 		/* Not ours; the caller tries the other handlers. */
 		error = ENOIOCTL;
 		break;
 	}
 	return (error);
 }
 
 #ifdef COMPAT_FREEBSD32
 /*
  * Layout of the SIOCGIFCONF argument with the buffer pointer held in a
  * 32-bit field.  ifioctl() converts it into a native struct ifconf
  * before calling ifconf().
  */
 struct ifconf32 {
 	int32_t	ifc_len;
 	union {
 		uint32_t	ifcu_buf;
 		uint32_t	ifcu_req;
 	} ifc_ifcu;
 };
 /* Same group and number as used for SIOCGIFCONF, sized for ifconf32. */
 #define	SIOCGIFCONF32	_IOWR('i', 36, struct ifconf32)
 #endif
 
 /*
  * Interface ioctls.
  *
  * Entry point for interface ioctls issued on socket 'so'.  The current
  * vnet is set from the socket for the duration of the call and every
  * return path restores it with CURVNET_RESTORE().
  *
  * Processing order:
  *  1. SIOCGIFCONF (and its 32-bit compat form) is answered by ifconf().
  *  2. Requests handled before any interface lookup (vnet reclaim, clone
  *     create/destroy/list, group members, carp) are dispatched directly.
  *  3. The interface named in the ifreq is looked up with a reference;
  *     ENXIO is returned if it does not exist.
  *  4. ifhwioctl() gets the request; any result but ENOIOCTL is final.
  *  5. Otherwise the socket's protocol control method is called, with a
  *     fallback to the driver's if_ioctl on EOPNOTSUPP.
  *
  * Returns 0 or an errno value.
  */
 int
 ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td)
 {
 	struct ifnet *ifp;
 	struct ifreq *ifr;
 	int error;
 	int oif_flags;
 
 	CURVNET_SET(so->so_vnet);
 	switch (cmd) {
 	case SIOCGIFCONF:
 		error = ifconf(cmd, data);
 		CURVNET_RESTORE();
 		return (error);
 
 #ifdef COMPAT_FREEBSD32
 	case SIOCGIFCONF32:
 		{
 			struct ifconf32 *ifc32;
 			struct ifconf ifc;
 
 			/* Build a native ifconf from the 32-bit request. */
 			ifc32 = (struct ifconf32 *)data;
 			ifc.ifc_len = ifc32->ifc_len;
 			ifc.ifc_buf = PTRIN(ifc32->ifc_buf);
 
 			error = ifconf(SIOCGIFCONF, (void *)&ifc);
 			CURVNET_RESTORE();
 			/* Report the resulting length back on success. */
 			if (error == 0)
 				ifc32->ifc_len = ifc.ifc_len;
 			return (error);
 		}
 #endif
 	}
 	ifr = (struct ifreq *)data;
 
 	/* Requests that are served without looking up an ifnet first. */
 	switch (cmd) {
 #ifdef VIMAGE
 	case SIOCSIFRVNET:
 		error = priv_check(td, PRIV_NET_SETIFVNET);
 		if (error == 0)
 			error = if_vmove_reclaim(td, ifr->ifr_name,
 			    ifr->ifr_jid);
 		CURVNET_RESTORE();
 		return (error);
 #endif
 	case SIOCIFCREATE:
 	case SIOCIFCREATE2:
 		error = priv_check(td, PRIV_NET_IFCREATE);
 		if (error == 0)
 			error = if_clone_create(ifr->ifr_name,
 			    sizeof(ifr->ifr_name),
 			    cmd == SIOCIFCREATE2 ? ifr->ifr_data : NULL);
 		CURVNET_RESTORE();
 		return (error);
 	case SIOCIFDESTROY:
 		error = priv_check(td, PRIV_NET_IFDESTROY);
 		if (error == 0)
 			error = if_clone_destroy(ifr->ifr_name);
 		CURVNET_RESTORE();
 		return (error);
 
 	case SIOCIFGCLONERS:
 		error = if_clone_list((struct if_clonereq *)data);
 		CURVNET_RESTORE();
 		return (error);
 	case SIOCGIFGMEMB:
 		error = if_getgroupmembers((struct ifgroupreq *)data);
 		CURVNET_RESTORE();
 		return (error);
 #if defined(INET) || defined(INET6)
 	case SIOCSVH:
 	case SIOCGVH:
 		/* carp requests need the carp hook to be installed. */
 		if (carp_ioctl_p == NULL)
 			error = EPROTONOSUPPORT;
 		else
 			error = (*carp_ioctl_p)(ifr, cmd, td);
 		CURVNET_RESTORE();
 		return (error);
 #endif
 	}
 
 	/* Everything else operates on a named interface; hold a reference. */
 	ifp = ifunit_ref(ifr->ifr_name);
 	if (ifp == NULL) {
 		CURVNET_RESTORE();
 		return (ENXIO);
 	}
 
 	/* ENOIOCTL means ifhwioctl() does not know the command. */
 	error = ifhwioctl(cmd, ifp, data, td);
 	if (error != ENOIOCTL) {
 		if_rele(ifp);
 		CURVNET_RESTORE();
 		return (error);
 	}
 
 	/* Remember the flags so an IFF_UP change can be detected below. */
 	oif_flags = ifp->if_flags;
 	if (so->so_proto == NULL) {
 		if_rele(ifp);
 		CURVNET_RESTORE();
 		return (EOPNOTSUPP);
 	}
 
 	/*
 	 * Pass the request on to the socket control method, and if the
 	 * latter returns EOPNOTSUPP, directly to the interface.
 	 *
 	 * Make an exception for the legacy SIOCSIF* requests.  Drivers
 	 * trust SIOCSIFADDR et al to come from an already privileged
 	 * layer, and do not perform any credentials checks or input
 	 * validation.
 	 */
 	error = ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, data,
 	    ifp, td));
 	if (error == EOPNOTSUPP && ifp != NULL && ifp->if_ioctl != NULL &&
 	    cmd != SIOCSIFADDR && cmd != SIOCSIFBRDADDR &&
 	    cmd != SIOCSIFDSTADDR && cmd != SIOCSIFNETMASK)
 		error = (*ifp->if_ioctl)(ifp, cmd, data);
 
 	/* If the request flipped IFF_UP and it is now set, tell IPv6. */
 	if ((oif_flags ^ ifp->if_flags) & IFF_UP) {
 #ifdef INET6
 		if (ifp->if_flags & IFF_UP)
 			in6_if_up(ifp);
 #endif
 	}
 	if_rele(ifp);
 	CURVNET_RESTORE();
 	return (error);
 }
 
 /*
  * Shared implementation of the reference counted interface flags, used
  * by ifpromisc() and if_allmulti().
  *
  * 'flag' is the if_flags bit to manage and '*refcount' its counter;
  * 'onswitch' selects between taking (non-zero) and dropping (zero) a
  * reference.  'pflag' names a permanent-mode flag such as IFF_PPROMISC,
  * or 0 if there is none: while it is set only the counter is touched.
  * The flag itself changes, and the driver is called, only for the first
  * reference taken and the last one dropped.  If the driver call fails
  * both the counter and if_flags are put back.
  *
  * Only to be used on stack-owned flags, not driver-owned flags.
  */
 static int
 if_setflag(struct ifnet *ifp, int flag, int pflag, int *refcount, int onswitch)
 {
 	struct ifreq ifr;
 	int saved_flags, saved_count;
 	int error;
 
 	/* Sanity checks to catch programming errors */
 	KASSERT((flag & (IFF_DRV_OACTIVE|IFF_DRV_RUNNING)) == 0,
 	    ("%s: setting driver-owned flag %d", __func__, flag));
 
 	if (onswitch) {
 		KASSERT(*refcount >= 0,
 		    ("%s: increment negative refcount %d for flag %d",
 		    __func__, *refcount, flag));
 	} else {
 		KASSERT(*refcount > 0,
 		    ("%s: decrement non-positive refcount %d for flag %d",
 		    __func__, *refcount, flag));
 	}
 
 	/* Permanent mode: the flag stays, only the counter moves. */
 	if (ifp->if_flags & pflag) {
 		if (onswitch)
 			(*refcount)++;
 		else
 			(*refcount)--;
 		return (0);
 	}
 
 	/* Keep the old state around in case the driver refuses. */
 	saved_count = *refcount;
 	saved_flags = ifp->if_flags;
 
 	if (onswitch) {
 		(*refcount)++;
 		/* Somebody else already switched the flag on. */
 		if (saved_count != 0)
 			return (0);
 		ifp->if_flags |= flag;
 	} else {
 		(*refcount)--;
 		/* Other users remain, leave the flag alone. */
 		if (*refcount != 0)
 			return (0);
 		ifp->if_flags &= ~flag;
 	}
 
 	/* The flags changed, so the driver has to be told. */
 	if (ifp->if_ioctl != NULL) {
 		ifr.ifr_flags = ifp->if_flags & 0xffff;
 		ifr.ifr_flagshigh = ifp->if_flags >> 16;
 		error = (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr);
 	} else {
 		error = EOPNOTSUPP;
 	}
 	if (error == 0) {
 		/* Notify userland that interface flags have changed */
 		rt_ifmsg(ifp);
 		return (0);
 	}
 
 	/* Driver error (or no driver ioctl): undo everything. */
 	*refcount = saved_count;
 	ifp->if_flags = saved_flags;
 	return (error);
 }
 
 /*
  * Take (pswitch != 0) or drop (pswitch == 0) a promiscuous mode
  * reference on ifp.  The requests are counted, so only the first "on"
  * and the final "off" actually change the interface.  Unbalanced
  * requests give undefined results.  A change of the IFF_PROMISC bit is
  * logged.
  */
 int
 ifpromisc(struct ifnet *ifp, int pswitch)
 {
 	int before, error;
 
 	before = ifp->if_flags;
 	error = if_setflag(ifp, IFF_PROMISC, IFF_PPROMISC,
 	    &ifp->if_pcount, pswitch);
 	if (error != 0)
 		return (error);
 
 	/* If promiscuous mode status has changed, log a message */
 	if (((before ^ ifp->if_flags) & IFF_PROMISC) != 0) {
 		log(LOG_INFO, "%s: promiscuous mode %s\n",
 		    ifp->if_xname,
 		    (ifp->if_flags & IFF_PROMISC) ? "enabled" : "disabled");
 	}
 	return (0);
 }
 
 /*
  * Return interface configuration
  * of system.  List may be used
  * in later ioctl's (above) to get
  * other information.
  *
  * 'data' is a struct ifconf.  One record is produced per address of
  * every interface (addresses rejected by prison_if() are skipped), and
  * one record with a zeroed address for an interface without any visible
  * address.  The records are collected in a fixed-length sbuf and copied
  * out to ifc_req; ifc_len is updated to the number of bytes returned.
  *
  * Returns 0, EINVAL for a non-positive ifc_len, ENAMETOOLONG if an
  * interface name does not fit into ifr_name, or the copyout() error.
  */
 /*ARGSUSED*/
 static int
 ifconf(u_long cmd, caddr_t data)
 {
 	struct ifconf *ifc = (struct ifconf *)data;
 	struct ifnet *ifp;
 	struct ifaddr *ifa;
 	struct ifreq ifr;
 	struct sbuf *sb;
 	int error, full = 0, valid_len, max_len;
 
 	/* Limit initial buffer size to MAXPHYS to avoid DoS from userspace. */
 	max_len = MAXPHYS - 1;
 
 	/* Prevent hostile input from being able to crash the system */
 	if (ifc->ifc_len <= 0)
 		return (EINVAL);
 
 again:
 	/* Never collect more than the caller has room for. */
 	if (ifc->ifc_len <= max_len) {
 		max_len = ifc->ifc_len;
 		full = 1;
 	}
 	sb = sbuf_new(NULL, NULL, max_len + 1, SBUF_FIXEDLEN);
 	/*
 	 * From here on max_len counts the bytes the full list needs, and
 	 * valid_len the bytes that actually fit into the sbuf.
 	 */
 	max_len = 0;
 	valid_len = 0;
 
 	IFNET_RLOCK();
 	TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		int addrs;
 
 		/*
 		 * Zero the ifr_name buffer to make sure we don't
 		 * disclose the contents of the stack.
 		 */
 		memset(ifr.ifr_name, 0, sizeof(ifr.ifr_name));
 
 		if (strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name))
 		    >= sizeof(ifr.ifr_name)) {
 			sbuf_delete(sb);
 			IFNET_RUNLOCK();
 			return (ENAMETOOLONG);
 		}
 
 		addrs = 0;
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			struct sockaddr *sa = ifa->ifa_addr;
 
 			if (prison_if(curthread->td_ucred, sa) != 0)
 				continue;
 			addrs++;
 			if (sa->sa_len <= sizeof(*sa)) {
 				if (sa->sa_len < sizeof(*sa)) {
 					/*
 					 * The address is shorter than a
 					 * struct sockaddr.  A structure
 					 * copy would read past its end and
 					 * export those bytes to userland,
 					 * so copy sa_len bytes only and
 					 * leave the rest zeroed.
 					 */
 					memset(&ifr.ifr_addr, 0,
 					    sizeof(ifr.ifr_addr));
 					bcopy(sa, &ifr.ifr_addr, sa->sa_len);
 				} else
 					ifr.ifr_addr = *sa;
 				sbuf_bcat(sb, &ifr, sizeof(ifr));
 				max_len += sizeof(ifr);
 			} else {
 				/*
 				 * Oversized address: emit the part of the
 				 * ifreq in front of ifr_addr, followed by
 				 * the complete address.
 				 */
 				sbuf_bcat(sb, &ifr,
 				    offsetof(struct ifreq, ifr_addr));
 				max_len += offsetof(struct ifreq, ifr_addr);
 				sbuf_bcat(sb, sa, sa->sa_len);
 				max_len += sa->sa_len;
 			}
 
 			/* Only count records that went in completely. */
 			if (sbuf_error(sb) == 0)
 				valid_len = sbuf_len(sb);
 		}
 		IF_ADDR_RUNLOCK(ifp);
 		if (addrs == 0) {
 			bzero((caddr_t)&ifr.ifr_addr, sizeof(ifr.ifr_addr));
 			sbuf_bcat(sb, &ifr, sizeof(ifr));
 			max_len += sizeof(ifr);
 
 			if (sbuf_error(sb) == 0)
 				valid_len = sbuf_len(sb);
 		}
 	}
 	IFNET_RUNLOCK();
 
 	/*
 	 * If we didn't allocate enough space (uncommon), try again.  If
 	 * we have already allocated as much space as we are allowed,
 	 * return what we've got.
 	 */
 	if (valid_len != max_len && !full) {
 		sbuf_delete(sb);
 		goto again;
 	}
 
 	ifc->ifc_len = valid_len;
 	sbuf_finish(sb);
 	error = copyout(sbuf_data(sb), ifc->ifc_req, ifc->ifc_len);
 	sbuf_delete(sb);
 	return (error);
 }
 
 /*
  * Just like ifpromisc(), but for all-multicast-reception mode.
  */
 int
 if_allmulti(struct ifnet *ifp, int onswitch)
 {
 
 	return (if_setflag(ifp, IFF_ALLMULTI, 0, &ifp->if_amcount, onswitch));
 }
 
 /*
  * Search the multicast address list of ifp for 'sa'.  AF_LINK addresses
  * are compared with sa_dl_equal(), everything else with sa_equal().
  * The interface address lock must be held.  Returns the matching entry
  * or NULL.
  */
 struct ifmultiaddr *
 if_findmulti(struct ifnet *ifp, struct sockaddr *sa)
 {
 	struct ifmultiaddr *ifma;
 	int match;
 
 	IF_ADDR_LOCK_ASSERT(ifp);
 
 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		if (sa->sa_family == AF_LINK)
 			match = sa_dl_equal(ifma->ifma_addr, sa);
 		else
 			match = sa_equal(ifma->ifma_addr, sa);
 		if (match)
 			break;
 	}
 
 	return (ifma);
 }
 
 /*
  * Create a new ifmultiaddr for 'sa' and, if 'llsa' is given, its link
  * layer address.  Both sockaddrs are copied.  The result starts out
  * with a reference count of 1 and is NOT linked onto the interface's
  * multicast list; that and any further setup (such as notifying the
  * device driver) is left to the caller.  'mflags' is passed to every
  * malloc() call.  Returns NULL if any allocation fails.
  */
 static struct ifmultiaddr *
 if_allocmulti(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr *llsa,
     int mflags)
 {
 	struct ifmultiaddr *ifma;
 	struct sockaddr *addr_copy, *lladdr_copy;
 
 	ifma = malloc(sizeof(*ifma), M_IFMADDR, mflags | M_ZERO);
 	if (ifma == NULL)
 		return (NULL);
 
 	addr_copy = malloc(sa->sa_len, M_IFMADDR, mflags);
 	if (addr_copy == NULL) {
 		free(ifma, M_IFMADDR);
 		return (NULL);
 	}
 	bcopy(sa, addr_copy, sa->sa_len);
 
 	ifma->ifma_addr = addr_copy;
 	ifma->ifma_ifp = ifp;
 	ifma->ifma_refcount = 1;
 	ifma->ifma_protospec = NULL;
 	ifma->ifma_lladdr = NULL;
 
 	if (llsa != NULL) {
 		lladdr_copy = malloc(llsa->sa_len, M_IFMADDR, mflags);
 		if (lladdr_copy == NULL) {
 			/* Give back what was set up so far. */
 			free(addr_copy, M_IFMADDR);
 			free(ifma, M_IFMADDR);
 			return (NULL);
 		}
 		bcopy(llsa, lladdr_copy, llsa->sa_len);
 		ifma->ifma_lladdr = lladdr_copy;
 	}
 
 	return (ifma);
 }
 
 /*
  * if_freemulti: release an ifmultiaddr together with its address copy
  * and, if present, its link layer address copy.  The reference count
  * must already be zero.  Reference counting, driver notification,
  * routing messages and dependent link layer state are the caller's
  * business.
  */
 static void
 if_freemulti(struct ifmultiaddr *ifma)
 {
 	struct sockaddr *lladdr = ifma->ifma_lladdr;
 
 	KASSERT(ifma->ifma_refcount == 0, ("if_freemulti: refcount %d",
 	    ifma->ifma_refcount));
 
 	if (lladdr != NULL) {
 		free(lladdr, M_IFMADDR);
 	}
 	free(ifma->ifma_addr, M_IFMADDR);
 	free(ifma, M_IFMADDR);
 }
 
 /*
  * Register an additional multicast address with a network interface.
  *
  * - If the address is already present, bump the reference count on the
  *   address and return.
  * - If the address is not link-layer, look up a link layer address.
  * - Allocate address structures for one or both addresses, and attach to the
  *   multicast address list on the interface.  If automatically adding a link
  *   layer address, the protocol address will own a reference to the link
  *   layer address, to be freed when it is freed.
  * - Notify the network device driver of an addition to the multicast address
  *   list.
  *
  * 'sa' points to caller-owned memory with the desired multicast address.
  *
  * 'retifma' will be used to return a pointer to the resulting multicast
  * address reference, if desired.
  *
  * Returns 0 on success, the error reported by the interface's
  * if_resolvemulti method, or ENOMEM if an allocation fails.
  */
 int
 if_addmulti(struct ifnet *ifp, struct sockaddr *sa,
     struct ifmultiaddr **retifma)
 {
 	struct ifmultiaddr *ifma, *ll_ifma;
 	struct sockaddr *llsa;
 	struct sockaddr_dl sdl;
 	int error;
 
 	/*
 	 * If the address is already present, return a new reference to it;
 	 * otherwise, allocate storage and set up a new address.
 	 */
 	IF_ADDR_WLOCK(ifp);
 	ifma = if_findmulti(ifp, sa);
 	if (ifma != NULL) {
 		ifma->ifma_refcount++;
 		if (retifma != NULL)
 			*retifma = ifma;
 		IF_ADDR_WUNLOCK(ifp);
 		return (0);
 	}
 
 	/*
 	 * The address isn't already present; resolve the protocol address
 	 * into a link layer address, and then look that up, bump its
 	 * refcount or allocate an ifma for that also.
 	 * Most link layer resolving functions returns address data which
 	 * fits inside default sockaddr_dl structure. However callback
 	 * can allocate another sockaddr structure, in that case we need to
 	 * free it later.
 	 */
 	llsa = NULL;
 	ll_ifma = NULL;
 	if (ifp->if_resolvemulti != NULL) {
 		/* Provide called function with buffer size information */
 		sdl.sdl_len = sizeof(sdl);
 		llsa = (struct sockaddr *)&sdl;
 		error = ifp->if_resolvemulti(ifp, &llsa, sa);
 		if (error)
 			goto unlock_out;
 	}
 
 	/*
 	 * Allocate the new address.  Don't hook it up yet, as we may also
 	 * need to allocate a link layer multicast address.
 	 */
 	ifma = if_allocmulti(ifp, sa, llsa, M_NOWAIT);
 	if (ifma == NULL) {
 		error = ENOMEM;
 		goto free_llsa_out;
 	}
 
 	/*
 	 * If a link layer address is found, we'll need to see if it's
 	 * already present in the address list, or allocate is as well.
 	 * When this block finishes, the link layer address will be on the
 	 * list.
 	 */
 	if (llsa != NULL) {
 		ll_ifma = if_findmulti(ifp, llsa);
 		if (ll_ifma == NULL) {
 			ll_ifma = if_allocmulti(ifp, llsa, NULL, M_NOWAIT);
 			if (ll_ifma == NULL) {
 				/* if_freemulti() insists on a zero refcount. */
 				--ifma->ifma_refcount;
 				if_freemulti(ifma);
 				error = ENOMEM;
 				goto free_llsa_out;
 			}
 			TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ll_ifma,
 			    ifma_link);
 		} else
 			ll_ifma->ifma_refcount++;
 		ifma->ifma_llifma = ll_ifma;
 	}
 
 	/*
 	 * We now have a new multicast address, ifma, and possibly a new or
 	 * referenced link layer address.  Add the primary address to the
 	 * ifnet address list.
 	 */
 	TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link);
 
 	if (retifma != NULL)
 		*retifma = ifma;
 
 	/*
 	 * Must generate the message while holding the lock so that 'ifma'
 	 * pointer is still valid.
 	 */
 	rt_newmaddrmsg(RTM_NEWMADDR, ifma);
 	IF_ADDR_WUNLOCK(ifp);
 
 	/*
 	 * We are certain we have added something, so call down to the
 	 * interface to let them know about it.
 	 */
 	if (ifp->if_ioctl != NULL) {
 		(void) (*ifp->if_ioctl)(ifp, SIOCADDMULTI, 0);
 	}
 
 	/* Free llsa only if the resolver replaced our on-stack sdl. */
 	if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl))
 		link_free_sdl(llsa);
 
 	return (0);
 
 	/* Error exits; the address lock is still held on both paths. */
 free_llsa_out:
 	if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl))
 		link_free_sdl(llsa);
 
 unlock_out:
 	IF_ADDR_WUNLOCK(ifp);
 	return (error);
 }
 
 /*
  * Drop a multicast group membership identified by its network-layer
  * group address.
  *
  * Returns ENOENT when no such entry exists.  Results are undefined if
  * ifp no longer exists, so this entry point is only for subsystems that
  * do appropriate locking to hold ifp for the duration of the call.
  * Network-layer protocol domains must use if_delmulti_ifma().
  */
 int
 if_delmulti(struct ifnet *ifp, struct sockaddr *sa)
 {
 	struct ifmultiaddr *ifma;
 	int lastref;
 #ifdef INVARIANTS
 	struct ifnet *known;
 
 	/* Verify that ifp is still on the list of interfaces. */
 	IFNET_RLOCK_NOSLEEP();
 	TAILQ_FOREACH(known, &V_ifnet, if_link) {
 		if (known == ifp)
 			break;
 	}
 	if (known != ifp)
 		ifp = NULL;
 	IFNET_RUNLOCK_NOSLEEP();
 
 	KASSERT(ifp != NULL, ("%s: ifnet went away", __func__));
 #endif
 	if (ifp == NULL)
 		return (ENOENT);
 
 	IF_ADDR_WLOCK(ifp);
 	ifma = if_findmulti(ifp, sa);
 	if (ifma == NULL) {
 		IF_ADDR_WUNLOCK(ifp);
 		return (ENOENT);
 	}
 	lastref = if_delmulti_locked(ifp, ifma, 0);
 	IF_ADDR_WUNLOCK(ifp);
 
 	/* The last reference went away: let the driver know. */
 	if (lastref && ifp->if_ioctl != NULL) {
 		(void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0);
 	}
 
 	return (0);
 }
 
 /*
  * Delete all multicast group membership for an interface.
  * Should be used to quickly flush all multicast filters.
  */
 void
 if_delallmulti(struct ifnet *ifp)
 {
 	struct ifmultiaddr *ifma;
 	struct ifmultiaddr *next;
 
 	IF_ADDR_WLOCK(ifp);
 	TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next)
 		if_delmulti_locked(ifp, ifma, 0);
 	IF_ADDR_WUNLOCK(ifp);
 }
 
 /*
  * Drop a multicast group membership identified by its membership
  * pointer.  Network-layer protocol domains must use this routine.
  *
  * It is safe to call this routine if the ifp disappeared: in that case
  * no address lock is taken and the driver is not called.
  */
 void
 if_delmulti_ifma(struct ifmultiaddr *ifma)
 {
 	struct ifnet *ifp = ifma->ifma_ifp;
 	int lastref;
 
 #ifdef DIAGNOSTIC
 	if (ifp != NULL) {
 		struct ifnet *known;
 
 		/* Check that ifp is still on the list of interfaces. */
 		IFNET_RLOCK_NOSLEEP();
 		TAILQ_FOREACH(known, &V_ifnet, if_link) {
 			if (known == ifp)
 				break;
 		}
 		if (known != ifp) {
 			printf("%s: ifnet %p disappeared\n", __func__, ifp);
 			ifp = NULL;
 		}
 		IFNET_RUNLOCK_NOSLEEP();
 	} else {
 		printf("%s: ifma_ifp seems to be detached\n", __func__);
 	}
 #endif
 	/* Without an ifnet there is nothing to lock or to notify. */
 	if (ifp == NULL) {
 		(void)if_delmulti_locked(ifp, ifma, 0);
 		return;
 	}
 
 	IF_ADDR_WLOCK(ifp);
 	lastref = if_delmulti_locked(ifp, ifma, 0);
 	IF_ADDR_WUNLOCK(ifp);
 
 	/* If the group was left: update the hardware hash filter. */
 	if (lastref && ifp->if_ioctl != NULL) {
 		(void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0);
 	}
 }
 
 /*
  * Perform deletion of network-layer and/or link-layer multicast address.
  *
  * 'ifp' may be NULL.  When both it and ifma->ifma_ifp are set they must
  * agree and the interface address write lock must be held.  A non-zero
  * 'detaching' makes the function send the RTM_DELMADDR message and clear
  * the ifnet back pointers of the entries.
  *
  * Return 0 if the reference count was decremented.
  * Return 1 if the final reference was released, indicating that the
  * hardware hash filter should be reprogrammed.
  */
 static int
 if_delmulti_locked(struct ifnet *ifp, struct ifmultiaddr *ifma, int detaching)
 {
 	struct ifmultiaddr *ll_ifma;
 
 	if (ifp != NULL && ifma->ifma_ifp != NULL) {
 		KASSERT(ifma->ifma_ifp == ifp,
 		    ("%s: inconsistent ifp %p", __func__, ifp));
 		IF_ADDR_WLOCK_ASSERT(ifp);
 	}
 
 	/* From here on work with the ifnet recorded in the entry itself. */
 	ifp = ifma->ifma_ifp;
 
 	/*
 	 * If the ifnet is detaching, null out references to ifnet,
 	 * so that upper protocol layers will notice, and not attempt
 	 * to obtain locks for an ifnet which no longer exists. The
 	 * routing socket announcement must happen before the ifnet
 	 * instance is detached from the system.
 	 */
 	if (detaching) {
 #ifdef DIAGNOSTIC
 		printf("%s: detaching ifnet instance %p\n", __func__, ifp);
 #endif
 		/*
 		 * ifp may already be nulled out if we are being reentered
 		 * to delete the ll_ifma.
 		 */
 		if (ifp != NULL) {
 			rt_newmaddrmsg(RTM_DELMADDR, ifma);
 			ifma->ifma_ifp = NULL;
 		}
 	}
 
 	/* Other references remain: nothing is unlinked or freed. */
 	if (--ifma->ifma_refcount > 0)
 		return 0;
 
 	/*
 	 * If this ifma is a network-layer ifma, a link-layer ifma may
 	 * have been associated with it. Release it first if so.
 	 */
 	ll_ifma = ifma->ifma_llifma;
 	if (ll_ifma != NULL) {
 		KASSERT(ifma->ifma_lladdr != NULL,
 		    ("%s: llifma w/o lladdr", __func__));
 		if (detaching)
 			ll_ifma->ifma_ifp = NULL;	/* XXX */
 		if (--ll_ifma->ifma_refcount == 0) {
 			/* Unlink only if there is a list to unlink from. */
 			if (ifp != NULL) {
 				TAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma,
 				    ifma_link);
 			}
 			if_freemulti(ll_ifma);
 		}
 	}
 
 	if (ifp != NULL)
 		TAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifma_link);
 
 	if_freemulti(ifma);
 
 	/*
 	 * The last reference to this instance of struct ifmultiaddr
 	 * was released; the hardware should be notified of this change.
 	 */
 	return 1;
 }
 
 /*
  * Set the link layer address on an interface.
  *
  * Only the interface types listed in the switch below are supported
  * (ENODEV otherwise), and the address length may not change: len must
  * equal the current sdl_alen (EINVAL otherwise).  EINVAL is also returned
  * when the interface has no if_addr or that entry carries no address.
  * Returns 0 on success.
  */
 int
 if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len)
 {
 	struct sockaddr_dl *sdl;
 	struct ifaddr *ifa;
 	struct ifreq ifr;
 
 	/* Hold our own reference on if_addr across the unlock. */
 	IF_ADDR_RLOCK(ifp);
 	ifa = ifp->if_addr;
 	if (ifa != NULL)
 		ifa_ref(ifa);
 	IF_ADDR_RUNLOCK(ifp);
 	if (ifa == NULL)
 		return (EINVAL);
 
 	/* Reject a missing address and any change of length. */
 	sdl = (struct sockaddr_dl *)ifa->ifa_addr;
 	if (sdl == NULL || len != sdl->sdl_alen) {
 		ifa_free(ifa);
 		return (EINVAL);
 	}
 
 	switch (ifp->if_type) {
 	case IFT_ETHER:
 	case IFT_FDDI:
 	case IFT_XETHER:
 	case IFT_ISO88025:
 	case IFT_L2VLAN:
 	case IFT_BRIDGE:
 	case IFT_ARCNET:
 	case IFT_IEEE8023ADLAG:
 	case IFT_IEEE80211:
 		/* Copy first, then drop the reference taken above. */
 		bcopy(lladdr, LLADDR(sdl), len);
 		ifa_free(ifa);
 		break;
 	default:
 		ifa_free(ifa);
 		return (ENODEV);
 	}
 
 	/* An interface that is down needs no further work. */
 	if ((ifp->if_flags & IFF_UP) == 0)
 		return (0);
 
 	/*
 	 * The interface is up: take it down and bring it back up through
 	 * the driver's ioctl so that its address filter is reprogrammed.
 	 */
 	if (ifp->if_ioctl != NULL) {
 		ifp->if_flags &= ~IFF_UP;
 		ifr.ifr_flags = ifp->if_flags & 0xffff;
 		ifr.ifr_flagshigh = ifp->if_flags >> 16;
 		(*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr);
 		ifp->if_flags |= IFF_UP;
 		ifr.ifr_flags = ifp->if_flags & 0xffff;
 		ifr.ifr_flagshigh = ifp->if_flags >> 16;
 		(*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr);
 	}
 #ifdef INET
 	/*
 	 * Announce the change to other nodes with a gratuitous ARP for
 	 * every AF_INET address on the interface.
 	 */
 	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family == AF_INET)
 			arp_ifinit(ifp, ifa);
 	}
 #endif
 	return (0);
 }
 
 /*
  * Record the driver name and unit on the interface and build if_xname
  * from them: "<name><unit>", or just "<name>" when unit is IF_DUNIT_NONE.
  *
  * The name pointer is stored, not copied, so it must refer to storage
  * that lasts as long as the interface does.  For physical devices the
  * result of device_get_name(dev) is a good choice; for pseudo-devices a
  * static string works well.
  */
 void
 if_initname(struct ifnet *ifp, const char *name, int unit)
 {
 	ifp->if_dname = name;
 	ifp->if_dunit = unit;
 	if (unit == IF_DUNIT_NONE)
 		strlcpy(ifp->if_xname, name, IFNAMSIZ);
 	else
 		snprintf(ifp->if_xname, IFNAMSIZ, "%s%d", name, unit);
 }
 
 /*
  * printf() variant that prefixes the message with "<if_xname>: ".
  * Returns the sum of the values returned by the two print calls.
  */
 int
 if_printf(struct ifnet *ifp, const char * fmt, ...)
 {
 	va_list args;
 	int prefix_len, body_len;
 
 	prefix_len = printf("%s: ", ifp->if_xname);
 	va_start(args, fmt);
 	body_len = vprintf(fmt, args);
 	va_end(args);
 	return (prefix_len + body_len);
 }
 
 /* Invoke the interface's if_start method on itself. */
 void
 if_start(struct ifnet *ifp)
 {
 
 	ifp->if_start(ifp);
 }
 
 /*
  * Backwards compatibility transmit routine for drivers that have not
  * implemented one of their own: hand the mbuf to IFQ_HANDOFF() and
  * return the error it reports.
  */
 static int
 if_transmit(struct ifnet *ifp, struct mbuf *m)
 {
 	int rc;
 
 	IFQ_HANDOFF(ifp, m, rc);
 	return (rc);
 }
 
 /*
  * Enqueue mbuf m on ifq.
  *
  * If the queue is full the drop is recorded, the mbuf is freed and 0 is
  * returned.  Otherwise the mbuf is enqueued and 1 is returned.  When ifp
  * is non-NULL its if_obytes (packet length plus adjust) and, for
  * broadcast/multicast mbufs, if_omcasts are updated under the queue lock,
  * and the interface's if_start routine is called after unlocking unless
  * IFF_DRV_OACTIVE was set.
  */
 int
 if_handoff(struct ifqueue *ifq, struct mbuf *m, struct ifnet *ifp, int adjust)
 {
 	int oactive;
 
 	oactive = 0;
 	IF_LOCK(ifq);
 	if (_IF_QFULL(ifq)) {
 		_IF_DROP(ifq);
 		IF_UNLOCK(ifq);
 		m_freem(m);
 		return (0);
 	}
 	if (ifp != NULL) {
 		ifp->if_obytes += m->m_pkthdr.len + adjust;
 		if ((m->m_flags & (M_BCAST|M_MCAST)) != 0)
 			ifp->if_omcasts++;
 		oactive = ifp->if_drv_flags & IFF_DRV_OACTIVE;
 	}
 	_IF_ENQUEUE(ifq, m);
 	IF_UNLOCK(ifq);
 	/* Kick the transmitter only if it was not already active. */
 	if (ifp != NULL && oactive == 0)
 		ifp->if_start(ifp);
 	return (1);
 }
 
 /*
  * Register the allocate/free callback pair for interface type 'type'.
  * Both table slots are asserted to be empty beforehand.
  */
 void
 if_register_com_alloc(u_char type, if_com_alloc_t *a, if_com_free_t *f)
 {
 
 	KASSERT(if_com_alloc[type] == NULL,
 	    ("if_register_com_alloc: %d already registered", type));
 	KASSERT(if_com_free[type] == NULL,
 	    ("if_register_com_alloc: %d free already registered", type));
 
 	if_com_free[type] = f;
 	if_com_alloc[type] = a;
 }
 
 /*
  * Remove the allocate/free callback pair registered for interface type
  * 'type'.  Both table slots are asserted to be populated beforehand.
  */
 void
 if_deregister_com_alloc(u_char type)
 {
 
 	KASSERT(if_com_alloc[type] != NULL,
 	    ("if_deregister_com_alloc: %d not registered", type));
 	KASSERT(if_com_free[type] != NULL,
 	    ("if_deregister_com_alloc: %d free not registered", type));
 
 	if_com_free[type] = NULL;
 	if_com_alloc[type] = NULL;
 }
 
 /* API for driver access to network stack owned ifnet.*/
 /* Set the interface baud rate and return the value it replaced. */
 uint64_t
 if_setbaudrate(struct ifnet *ifp, uint64_t baudrate)
 {
 	const uint64_t previous = ifp->if_baudrate;
 
 	ifp->if_baudrate = baudrate;
 	return (previous);
 }
 
 /* Return the interface's if_baudrate. */
 uint64_t
 if_getbaudrate(if_t ifp)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	return (ifn->if_baudrate);
 }
 
 /* Overwrite if_capabilities with the given value.  Always returns 0. */
 int
 if_setcapabilities(if_t ifp, int capabilities)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_capabilities = capabilities;
 	return (0);
 }
 
 /*
  * Set the bits in setbit, then clear the bits in clearbit, in
  * if_capabilities.  Always returns 0.
  */
 int
 if_setcapabilitiesbit(if_t ifp, int setbit, int clearbit)
 {
 	int *capsp = &((struct ifnet *)ifp)->if_capabilities;
 
 	*capsp |= setbit;
 	*capsp &= ~clearbit;
 	return (0);
 }
 
 /* Return the interface's if_capabilities. */
 int
 if_getcapabilities(if_t ifp)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	return (ifn->if_capabilities);
 }
 
 /* Overwrite if_capenable with the given value.  Always returns 0. */
 int
 if_setcapenable(if_t ifp, int capabilities)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_capenable = capabilities;
 	return (0);
 }
 
 /*
  * Set the bits in setcap, then clear the bits in clearcap, in
  * if_capenable.  A zero mask skips the corresponding step.
  * Always returns 0.
  */
 int
 if_setcapenablebit(if_t ifp, int setcap, int clearcap)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	if (setcap != 0)
 		ifn->if_capenable |= setcap;
 	if (clearcap != 0)
 		ifn->if_capenable &= ~clearcap;
 	return (0);
 }
 
 /* Return the driver name pointer stored in if_dname. */
 const char *
 if_getdname(if_t ifp)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	return (ifn->if_dname);
 }
 
 /* Flip the bits in togglecap within if_capenable.  Always returns 0. */
 int
 if_togglecapenable(if_t ifp, int togglecap)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_capenable ^= togglecap;
 	return (0);
 }
 
 /* Return the interface's if_capenable. */
 int
 if_getcapenable(if_t ifp)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	return (ifn->if_capenable);
 }
 
 /*
  * This is largely undesirable because it ties ifnet to a device, but does
  * provide flexibility for an embedded product vendor. Should be used with
  * the understanding that it violates the interface boundaries, and should be
  * a last resort only.
  *
  * NOTE: as written this is a stub: both arguments are ignored and 0 is
  * always returned.
  */
 int
 if_setdev(if_t ifp, void *dev)
 {
 	return (0);
 }
 
 /*
  * Set the bits in set_flags, then clear the bits in clear_flags, in
  * if_drv_flags.  Always returns 0.
  */
 int
 if_setdrvflagbits(if_t ifp, int set_flags, int clear_flags)
 {
 	int *drvflagsp = &((struct ifnet *)ifp)->if_drv_flags;
 
 	*drvflagsp |= set_flags;
 	*drvflagsp &= ~clear_flags;
 	return (0);
 }
 
 /* Return the interface's if_drv_flags. */
 int
 if_getdrvflags(if_t ifp)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	return (ifn->if_drv_flags);
 }
  
 /* Overwrite if_drv_flags with the given value.  Always returns 0. */
 int
 if_setdrvflags(if_t ifp, int flags)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_drv_flags = flags;
 	return (0);
 }
 
 
 /* Overwrite if_flags with the given value.  Always returns 0. */
 int
 if_setflags(if_t ifp, int flags)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_flags = flags;
 	return (0);
 }
 
 /*
  * Set the bits in 'set', then clear the bits in 'clear', in if_flags.
  * Always returns 0.
  */
 int
 if_setflagbits(if_t ifp, int set, int clear)
 {
 	int *flagsp = &((struct ifnet *)ifp)->if_flags;
 
 	*flagsp |= set;
 	*flagsp &= ~clear;
 	return (0);
 }
 
 /* Return the interface's if_flags. */
 int
 if_getflags(if_t ifp)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	return (ifn->if_flags);
 }
 
 /* Reset if_hwassist to zero.  Always returns 0. */
 int
 if_clearhwassist(if_t ifp)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_hwassist = 0;
 	return (0);
 }
 
 /*
  * Set the bits in toset, then clear the bits in toclear, in
  * if_hwassist.  Always returns 0.
  */
 int
 if_sethwassistbits(if_t ifp, int toset, int toclear)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_hwassist |= toset;
 	ifn->if_hwassist &= ~toclear;
 	return (0);
 }
 
 /* Overwrite if_hwassist with the given value.  Always returns 0. */
 int
 if_sethwassist(if_t ifp, int hwassist_bit)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_hwassist = hwassist_bit;
 	return (0);
 }
 
 /* Return if_hwassist, converted to the int return type. */
 int
 if_gethwassist(if_t ifp)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	return (ifn->if_hwassist);
 }
 
 /* Overwrite if_mtu with the given value.  Always returns 0. */
 int
 if_setmtu(if_t ifp, int mtu)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_mtu = mtu;
 	return (0);
 }
 
 /* Return if_mtu, converted to the int return type. */
 int
 if_getmtu(if_t ifp)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	return (ifn->if_mtu);
 }
 
 /* Store the driver state pointer in if_softc.  Always returns 0. */
 int
 if_setsoftc(if_t ifp, void *softc)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_softc = softc;
 	return (0);
 }
 
 /* Return the driver state pointer stored in if_softc. */
 void *
 if_getsoftc(if_t ifp)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	return (ifn->if_softc);
 }
 
 /* Record ifp as the receive interface in the mbuf packet header. */
 void
 if_setrcvif(struct mbuf *m, if_t ifp)
 {
 	struct ifnet *rcv = (struct ifnet *)ifp;
 
 	m->m_pkthdr.rcvif = rcv;
 }
 
 /* Store tag in the ether_vtag field of the mbuf packet header. */
 void
 if_setvtag(struct mbuf *m, uint16_t tag)
 {
 
 	m->m_pkthdr.ether_vtag = tag;
 }
 
 /* Return the ether_vtag field of the mbuf packet header. */
 uint16_t
 if_getvtag(struct mbuf *m)
 {
 	uint16_t tag;
 
 	tag = m->m_pkthdr.ether_vtag;
 	return (tag);
 }
 
 /* Statistics */
 /* Add pkts to if_ipackets.  Always returns 0. */
 int
 if_incipackets(if_t ifp, int pkts)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_ipackets += pkts;
 	return (0);
 }
 
 /* Add pkts to if_opackets.  Always returns 0. */
 int
 if_incopackets(if_t ifp, int pkts)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_opackets += pkts;
 	return (0);
 }
 
 /* Add ierrors to if_ierrors.  Always returns 0. */
 int
 if_incierrors(if_t ifp, int ierrors)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_ierrors += ierrors;
 	return (0);
 }
 
 
 /* Overwrite if_ierrors with the given value.  Always returns 0. */
 int
 if_setierrors(if_t ifp, int ierrors)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_ierrors = ierrors;
 	return (0);
 }
 
 /* Overwrite if_oerrors with the given value.  Always returns 0. */
 int
 if_setoerrors(if_t ifp, int oerrors)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_oerrors = oerrors;
 	return (0);
 }
 
 /* Add oerrors to if_oerrors.  Always returns 0. */
 int
 if_incoerrors(if_t ifp, int oerrors)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_oerrors += oerrors;
 	return (0);
 }
 
 /* Add val to if_iqdrops.  Always returns 0. */
 int
 if_inciqdrops(if_t ifp, int val)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_iqdrops += val;
 	return (0);
 }
 
 /* Overwrite if_collisions with the given value.  Always returns 0. */
 int
 if_setcollisions(if_t ifp, int collisions)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_collisions = collisions;
 	return (0);
 }
 
 /* Add collisions to if_collisions.  Always returns 0. */
 int
 if_inccollisions(if_t ifp, int collisions)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_collisions += collisions;
 	return (0);
 }
  
 /* Overwrite if_ipackets with the given value.  Always returns 0. */
 int
 if_setipackets(if_t ifp, int pkts)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_ipackets = pkts;
 	return (0);
 }
 
 /* Overwrite if_opackets with the given value.  Always returns 0. */
 int
 if_setopackets(if_t ifp, int pkts)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_opackets = pkts;
 	return (0);
 }
 
 /* Add bytes to if_obytes.  Always returns 0. */
 int
 if_incobytes(if_t ifp, int bytes)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_obytes += bytes;
 	return (0);
 }
 
 /* Overwrite if_ibytes with the given value.  Always returns 0. */
 int
 if_setibytes(if_t ifp, int bytes)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_ibytes = bytes;
 	return (0);
 }
 
 /* Overwrite if_obytes with the given value.  Always returns 0. */
 int
 if_setobytes(if_t ifp, int bytes)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_obytes = bytes;
 	return (0);
 }
 
 
 /* Return the result of IFQ_DRV_IS_EMPTY() on the interface's if_snd. */
 int
 if_sendq_empty(if_t ifp)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	return (IFQ_DRV_IS_EMPTY(&ifn->if_snd));
 }
 
 /* Return if_iqdrops, converted to the int return type. */
 int
 if_getiqdrops(if_t ifp)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	return (ifn->if_iqdrops);
 }
 
 /* Add mcast to if_imcasts.  Always returns 0. */
 int
 if_incimcasts(if_t ifp, int mcast)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_imcasts += mcast;
 	return (0);
 }
 
 
 /* Add mcast to if_omcasts.  Always returns 0. */
 int
 if_incomcasts(if_t ifp, int mcast)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_omcasts += mcast;
 	return (0);
 }
 
 /* Overwrite if_imcasts with the given value.  Always returns 0. */
 int
 if_setimcasts(if_t ifp, int mcast)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_imcasts = mcast;
 	return (0);
 }
 
 
 /* Return the interface's if_addr pointer; no reference is taken here. */
 struct ifaddr *
 if_getifaddr(if_t ifp)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	return (ifn->if_addr);
 }
 
 /* Return the interface's if_amcount. */
 int
 if_getamcount(if_t ifp)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	return (ifn->if_amcount);
 }
 
 
 /* Apply IFQ_SET_READY() to the interface's if_snd.  Always returns 0. */
 int
 if_setsendqready(if_t ifp)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	IFQ_SET_READY(&ifn->if_snd);
 	return (0);
 }
 
 /*
  * Size the send queue: tx_desc_count is applied through IFQ_SET_MAXLEN()
  * and also stored in ifq_drv_maxlen.  Always returns 0.
  */
 int
 if_setsendqlen(if_t ifp, int tx_desc_count)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	IFQ_SET_MAXLEN(&ifn->if_snd, tx_desc_count);
 	ifn->if_snd.ifq_drv_maxlen = tx_desc_count;
 	return (0);
 }
 
 /* Return 1 if if_vlantrunk is non-NULL, 0 otherwise. */
 int
 if_vlantrunkinuse(if_t ifp)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	if (ifn->if_vlantrunk != NULL)
 		return (1);
 	return (0);
 }
 
 /*
  * Pass the mbuf to the interface's if_input method.
  * Always returns 0.
  */
 int
 if_input(if_t ifp, struct mbuf* sendmp)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_input(ifn, sendmp);
 	return (0);
 }
 
 /* XXX */
 #ifndef ETH_ADDR_LEN
 #define ETH_ADDR_LEN 6
 #endif
 
 /*
  * Copy the AF_LINK multicast addresses of ifp into the flat byte array
  * mta, ETH_ADDR_LEN bytes per entry, and stop at the first AF_LINK entry
  * found once max entries have been stored.  The number of entries written
  * is returned through cnt; the function itself always returns 0.
  *
  * No lock is taken here; if_multiaddr_array() wraps this call in
  * if_maddr_rlock()/if_maddr_runlock().
  */
 int 
 if_setupmultiaddr(if_t ifp, void *mta, int *cnt, int max)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 	struct ifmultiaddr *ifma;
 	uint8_t *dst = (uint8_t *)mta;
 	int stored = 0;
 
 	TAILQ_FOREACH(ifma, &ifn->if_multiaddrs, ifma_link) {
 		if (ifma->ifma_addr->sa_family == AF_LINK) {
 			if (stored == max)
 				break;
 			bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
 			    dst + stored * ETH_ADDR_LEN, ETH_ADDR_LEN);
 			stored++;
 		}
 	}
 	*cnt = stored;
 
 	return (0);
 }
 
 /*
  * Locked wrapper around if_setupmultiaddr(): the call is made between
  * if_maddr_rlock() and if_maddr_runlock(), and its result is returned.
  */
 int
 if_multiaddr_array(if_t ifp, void *mta, int *cnt, int max)
 {
 	int rc;
 
 	if_maddr_rlock(ifp);
 	rc = if_setupmultiaddr(ifp, mta, cnt, max);
 	if_maddr_runlock(ifp);
 	return (rc);
 }
 
 /*
  * Count the AF_LINK entries on the interface's multicast address list,
  * under if_maddr_rlock().  Counting stops as soon as the count equals max;
  * a max the count never equals (zero or negative) therefore does not
  * limit the count.
  */
 int
 if_multiaddr_count(if_t ifp, int max)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 	struct ifmultiaddr *ifma;
 	int nlink = 0;
 
 	if_maddr_rlock(ifp);
 	TAILQ_FOREACH(ifma, &ifn->if_multiaddrs, ifma_link) {
 		if (ifma->ifma_addr->sa_family == AF_LINK && ++nlink == max)
 			break;
 	}
 	if_maddr_runlock(ifp);
 	return (nlink);
 }
 
 /* Return the mbuf produced by IFQ_DRV_DEQUEUE() on the send queue. */
 struct mbuf *
 if_dequeue(if_t ifp)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 	struct mbuf *head;
 
 	IFQ_DRV_DEQUEUE(&ifn->if_snd, head);
 	return (head);
 }
 
 /* Put m back on the send queue via IFQ_DRV_PREPEND().  Always returns 0. */
 int
 if_sendq_prepend(if_t ifp, struct mbuf *m)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	IFQ_DRV_PREPEND(&ifn->if_snd, m);
 	return (0);
 }
 
 /* Overwrite if_hdrlen with the given value.  Always returns 0. */
 int
 if_setifheaderlen(if_t ifp, int len)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_hdrlen = len;
 	return (0);
 }
 
 /* Return the result of IF_LLADDR() for the interface. */
 caddr_t
 if_getlladdr(if_t ifp)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	return (IF_LLADDR(ifn));
 }
 
 /* Allocate an interface of the given type with if_alloc() and return it. */
 void *
 if_gethandle(u_char type)
 {
 	void *handle;
 
 	handle = if_alloc(type);
 	return (handle);
 }
 
 /* Function wrapper around the BPF_MTAP() macro. */
 void
 if_bpfmtap(if_t ifh, struct mbuf *m)
 {
 	struct ifnet *ifn;
 
 	ifn = (struct ifnet *)ifh;
 	BPF_MTAP(ifn, m);
 }
 
 /* Function wrapper around the ETHER_BPF_MTAP() macro. */
 void
 if_etherbpfmtap(if_t ifh, struct mbuf *m)
 {
 	struct ifnet *ifn;
 
 	ifn = (struct ifnet *)ifh;
 	ETHER_BPF_MTAP(ifn, m);
 }
 
 /* Function wrapper around the VLAN_CAPABILITIES() macro. */
 void
 if_vlancap(if_t ifh)
 {
 	struct ifnet *ifn;
 
 	ifn = (struct ifnet *)ifh;
 	VLAN_CAPABILITIES(ifn);
 }
 
 /* Install init_fn as the interface's if_init routine. */
 void
 if_setinitfn(if_t ifp, void (*init_fn)(void *))
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_init = init_fn;
 }
 
 /*
  * Install ioctl_fn as the interface's if_ioctl routine.
  *
  * The parameter has the same signature as the if_ioctl member, so it is
  * assigned directly.  No cast through void * is used: it would hide any
  * future signature mismatch from the compiler, and ISO C does not define
  * conversions between function pointers and void *.
  */
 void
 if_setioctlfn(if_t ifp, int (*ioctl_fn)(if_t, u_long, caddr_t))
 {
 	((struct ifnet *)ifp)->if_ioctl = ioctl_fn;
 }
 
 /*
  * Install start_fn as the interface's if_start routine.
  *
  * The parameter has the same signature as the if_start member, so it is
  * assigned directly.  No cast through void * is used: it would hide any
  * future signature mismatch from the compiler, and ISO C does not define
  * conversions between function pointers and void *.
  */
 void
 if_setstartfn(if_t ifp, void (*start_fn)(if_t))
 {
 	((struct ifnet *)ifp)->if_start = start_fn;
 }
 
 /* Install start_fn as the interface's if_transmit routine. */
 void
 if_settransmitfn(if_t ifp, if_transmit_fn_t start_fn)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_transmit = start_fn;
 }
 
 /* Install flush_fn as the interface's if_qflush routine. */
 void
 if_setqflushfn(if_t ifp, if_qflush_fn_t flush_fn)
 {
 	struct ifnet *ifn = (struct ifnet *)ifp;
 
 	ifn->if_qflush = flush_fn;
 }
 
 /* Revisit these - These are inline functions originally. */
 /*
  * Out-of-line wrapper that forwards to the drbr_inuse() inline and
  * returns its result.
  */
 int
 drbr_inuse_drv(if_t ifh, struct buf_ring *br)
 {
 	/*
 	 * This must call drbr_inuse(), not drbr_inuse_drv(): calling the
 	 * wrapper itself recurses without end.
 	 */
 	return (drbr_inuse(ifh, br));
 }
 
 /* Out-of-line wrapper that forwards to drbr_dequeue(). */
 struct mbuf*
 drbr_dequeue_drv(if_t ifh, struct buf_ring *br)
 {
 	struct mbuf *m;
 
 	m = drbr_dequeue(ifh, br);
 	return (m);
 }
 
 /* Out-of-line wrapper that forwards to drbr_needs_enqueue(). */
 int
 drbr_needs_enqueue_drv(if_t ifh, struct buf_ring *br)
 {
 	int needed;
 
 	needed = drbr_needs_enqueue(ifh, br);
 	return (needed);
 }
 
 /* Out-of-line wrapper that forwards to drbr_enqueue(). */
 int
 drbr_enqueue_drv(if_t ifh, struct buf_ring *br, struct mbuf *m)
 {
 	int rc;
 
 	rc = drbr_enqueue(ifh, br, m);
 	return (rc);
 }
Index: user/ae/inet6/sys/net/if_var.h
===================================================================
--- user/ae/inet6/sys/net/if_var.h	(revision 271452)
+++ user/ae/inet6/sys/net/if_var.h	(revision 271453)
@@ -1,622 +1,619 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	From: @(#)if.h	8.1 (Berkeley) 6/10/93
  * $FreeBSD$
  */
 
 #ifndef	_NET_IF_VAR_H_
 #define	_NET_IF_VAR_H_
 
 /*
  * Structures defining a network interface, providing a packet
  * transport mechanism (ala level 0 of the PUP protocols).
  *
  * Each interface accepts output datagrams of a specified maximum
  * length, and provides higher level routines with input datagrams
  * received from its medium.
  *
  * Output occurs when the routine if_output is called, with three parameters:
  *	(*ifp->if_output)(ifp, m, dst, rt)
  * Here m is the mbuf chain to be sent and dst is the destination address.
  * The output routine encapsulates the supplied datagram if necessary,
  * and then transmits it on its medium.
  *
  * On input, each interface unwraps the data received by it, and either
  * places it on the input queue of an internetwork datagram routine
  * and posts the associated software interrupt, or passes the datagram to a raw
  * packet input routine.
  *
  * Routines exist for locating interfaces by their addresses
  * or for locating an interface on a certain network, as well as more general
  * routing and gateway routines maintaining information used to locate
  * interfaces.  These routines live in the files if.c and route.c
  */
 
 struct	rtentry;		/* ifa_rtrequest */
 struct	rt_addrinfo;		/* ifa_rtrequest */
 struct	socket;
 struct	carp_if;
 struct	carp_softc;
 struct  ifvlantrunk;
 struct	route;			/* if_output */
 struct	vnet;
 struct	ifmedia;
 struct	netmap_adapter;
 
 #ifdef _KERNEL
 #include <sys/mbuf.h>		/* ifqueue only? */
 #include <sys/buf_ring.h>
 #include <net/vnet.h>
 #endif /* _KERNEL */
 #include <sys/counter.h>
 #include <sys/lock.h>		/* XXX */
 #include <sys/mutex.h>		/* struct ifqueue */
 #include <sys/rwlock.h>		/* XXX */
 #include <sys/sx.h>		/* XXX */
 #include <sys/_task.h>		/* if_link_task */
 
 #define	IF_DUNIT_NONE	-1
 
 #include <altq/if_altq.h>
 
 TAILQ_HEAD(ifnethead, ifnet);	/* we use TAILQs so that the order of */
 TAILQ_HEAD(ifaddrhead, ifaddr);	/* instantiation is preserved in the list */
 TAILQ_HEAD(ifmultihead, ifmultiaddr);
 TAILQ_HEAD(ifgrouphead, ifg_group);
 
 #ifdef _KERNEL
 VNET_DECLARE(struct pfil_head, link_pfil_hook);	/* packet filter hooks */
 #define	V_link_pfil_hook	VNET(link_pfil_hook)
 #endif /* _KERNEL */
 
 /*
  * Identifiers for the per-interface counters; this is the selector type
  * taken by an if_get_counter_t callback.  Values start at 1 and are
  * spelled out so that the numbering is visible at a glance.
  */
 typedef enum {
 	IFCOUNTER_IPACKETS	= 1,
 	IFCOUNTER_IERRORS	= 2,
 	IFCOUNTER_OPACKETS	= 3,
 	IFCOUNTER_OERRORS	= 4,
 	IFCOUNTER_COLLISIONS	= 5,
 	IFCOUNTER_IBYTES	= 6,
 	IFCOUNTER_OBYTES	= 7,
 	IFCOUNTER_IMCASTS	= 8,
 	IFCOUNTER_OMCASTS	= 9,
 	IFCOUNTER_IQDROPS	= 10,
 	IFCOUNTER_OQDROPS	= 11,
 	IFCOUNTER_NOPROTO	= 12,
 } ifnet_counter;
 
 /* Handle type for an interface; it is simply a struct ifnet pointer. */
 typedef struct ifnet * if_t;
 
 /* Signatures of the driver-supplied methods stored in struct ifnet. */
 typedef	void (*if_start_fn_t)(if_t);
 typedef	int (*if_ioctl_fn_t)(if_t, u_long, caddr_t);
 typedef	void (*if_init_fn_t)(void *);
 typedef void (*if_qflush_fn_t)(if_t);
 typedef int (*if_transmit_fn_t)(if_t, struct mbuf *);
 /* Returns the value of the counter selected by the ifnet_counter. */
 typedef	uint64_t (*if_get_counter_t)(if_t, ifnet_counter);
 
 /*
  * Structure defining a network interface.
  *
  * Size ILP32:  592 (approx)
  *	 LP64: 1048 (approx)
  *
  * NOTE(review): the sizes above are approximate and may be stale after
  * fields were added or removed; confirm before relying on them.
  */
 struct ifnet {
 	/* General book keeping of interface lists. */
 	TAILQ_ENTRY(ifnet) if_link; 	/* all struct ifnets are chained */
 	LIST_ENTRY(ifnet) if_clones;	/* interfaces of a cloner */
 	TAILQ_HEAD(, ifg_list) if_groups; /* linked list of groups per if */
 					/* protected by if_addr_lock */
 	u_char	if_alloctype;		/* if_type at time of allocation */
 
 	/* Driver and protocol specific information that remains stable. */
 	void	*if_softc;		/* pointer to driver state */
 	void	*if_llsoftc;		/* link layer softc */
 	void	*if_l2com;		/* pointer to protocol bits */
 	const char *if_dname;		/* driver name */
 	int	if_dunit;		/* unit or IF_DUNIT_NONE */
 	u_short	if_index;		/* numeric abbreviation for this if  */
 	short	if_index_reserved;	/* spare space to grow if_index */
 	char	if_xname[IFNAMSIZ];	/* external name (name + unit) */
 	char	*if_description;	/* interface description */
 
 	/* Variable fields that are touched by the stack and drivers. */
 	int	if_flags;		/* up/down, broadcast, etc. */
 	int	if_drv_flags;		/* driver-managed status flags */
 	int	if_capabilities;	/* interface features & capabilities */
 	int	if_capenable;		/* enabled features & capabilities */
 	void	*if_linkmib;		/* link-type-specific MIB data */
 	size_t	if_linkmiblen;		/* length of above data */
 	u_int	if_refcount;		/* reference count */
 
 	/* These fields are shared with struct if_data. */
 	uint8_t		if_type;	/* ethernet, tokenring, etc */
 	uint8_t		if_addrlen;	/* media address length */
 	uint8_t		if_hdrlen;	/* media header length */
 	uint8_t		if_link_state;	/* current link state */
 	uint32_t	if_mtu;		/* maximum transmission unit */
 	uint32_t	if_metric;	/* routing metric (external only) */
 	uint64_t	if_baudrate;	/* linespeed */
 	uint64_t	if_hwassist;	/* HW offload capabilities, see IFCAP */
 	time_t		if_epoch;	/* uptime at attach or stat reset */
 	struct timeval	if_lastchange;	/* time of last administrative change */
 
 	struct  ifaltq if_snd;		/* output queue (includes altq) */
 	struct	task if_linktask;	/* task for link change events */
 
 	/* Addresses of different protocol families assigned to this if. */
 	struct	rwlock if_addr_lock;	/* lock to protect address lists */
 		/*
 		 * if_addrhead is the list of all addresses associated to
 		 * an interface.
 		 * Some code in the kernel assumes that first element
 		 * of the list has type AF_LINK, and contains sockaddr_dl
 		 * addresses which store the link-level address and the name
 		 * of the interface.
 		 * However, access to the AF_LINK address through this
 		 * field is deprecated. Use if_addr or ifaddr_byindex() instead.
 		 */
 	struct	ifaddrhead if_addrhead;	/* linked list of addresses per if */
 	struct	ifmultihead if_multiaddrs; /* multicast addresses configured */
 	int	if_amcount;		/* number of all-multicast requests */
 	struct	ifaddr	*if_addr;	/* pointer to link-level address */
 	const u_int8_t *if_broadcastaddr; /* linklevel broadcast bytestring */
 	struct	rwlock if_afdata_lock;	/* see the IF_AFDATA_* macros */
 	void	*if_afdata[AF_MAX];	/* one slot per address family */
 	int	if_afdata_initialized;
 
 	/* Additional features hung off the interface. */
 	u_int	if_fib;			/* interface FIB */
 	struct	vnet *if_vnet;		/* pointer to network stack instance */
 	struct	vnet *if_home_vnet;	/* where this ifnet originates from */
 	struct  ifvlantrunk *if_vlantrunk; /* pointer to 802.1q data */
 	struct	bpf_if *if_bpf;		/* packet filter structure */
 	int	if_pcount;		/* number of promiscuous listeners */
 	void	*if_bridge;		/* bridge glue */
 	void	*if_lagg;		/* lagg glue */
 	void	*if_pf_kif;		/* pf glue */
 	struct	carp_if *if_carp;	/* carp interface structure */
 	struct	label *if_label;	/* interface MAC label */
 	struct	netmap_adapter *if_netmap; /* netmap(4) softc */
 
 	/* Various procedures of the layer2 encapsulation and drivers. */
 	int	(*if_output)		/* output routine (enqueue) */
 		(struct ifnet *, struct mbuf *, const struct sockaddr *,
 		     struct route *);
 	void	(*if_input)		/* input routine (from h/w driver) */
 		(struct ifnet *, struct mbuf *);
 	if_start_fn_t	if_start;	/* initiate output routine */
 	if_ioctl_fn_t	if_ioctl;	/* ioctl routine */
 	if_init_fn_t	if_init;	/* Init routine */
 	int	(*if_resolvemulti)	/* validate/resolve multicast */
 		(struct ifnet *, struct sockaddr **, struct sockaddr *);
 	if_qflush_fn_t	if_qflush;	/* flush any queue */	
 	if_transmit_fn_t if_transmit;   /* initiate output routine */
 
 	void	(*if_reassign)		/* reassign to vnet routine */
 		(struct ifnet *, struct vnet *, char *);
 	if_get_counter_t if_get_counter; /* get counter values */
 
 	/* Stuff that's only temporary and doesn't belong here. */
 	u_int	if_hw_tsomax;		/* tso burst length limit, the minimum
 					 * is (IP_MAXPACKET / 8).
 					 * XXXAO: Have to find a better place
 					 * for it eventually. */
 	/*
 	 * Old, racy and expensive statistics, should not be used in
 	 * new drivers.
 	 */
 	uint64_t	if_ipackets;	/* packets received on interface */
 	uint64_t	if_ierrors;	/* input errors on interface */
 	uint64_t	if_opackets;	/* packets sent on interface */
 	uint64_t	if_oerrors;	/* output errors on interface */
 	uint64_t	if_collisions;	/* collisions on csma interfaces */
 	uint64_t	if_ibytes;	/* total number of octets received */
 	uint64_t	if_obytes;	/* total number of octets sent */
 	uint64_t	if_imcasts;	/* packets received via multicast */
 	uint64_t	if_omcasts;	/* packets sent via multicast */
 	uint64_t	if_iqdrops;	/* dropped on input */
 	uint64_t	if_oqdrops;	/* dropped on output */
 	uint64_t	if_noproto;	/* destined for unsupported protocol */
 
 	/*
 	 * Spare fields to be added before branching a stable branch, so
 	 * that structure can be enhanced without changing the kernel
 	 * binary interface.
 	 *
 	 * NOTE: no spare fields are currently declared here; this comment
 	 * only marks where they are to be placed.
 	 */
 };
 
 #include <net/ifq.h>	/* XXXAO: temporary unconditional include */
 
 /* for compatibility with other BSDs */
 #define	if_addrlist	if_addrhead
 #define	if_list		if_link
 #define	if_name(ifp)	((ifp)->if_xname)
 
 /*
  * Locks for address lists on the network interface.
  *
  * Every macro operates on the if_addr_lock rwlock embedded in the ifnet
  * passed as ifp.
  */
 #define	IF_ADDR_LOCK_INIT(ifp)	rw_init(&(ifp)->if_addr_lock, "if_addr_lock")
 #define	IF_ADDR_LOCK_DESTROY(ifp)	rw_destroy(&(ifp)->if_addr_lock)
 #define	IF_ADDR_WLOCK(ifp)	rw_wlock(&(ifp)->if_addr_lock)
 #define	IF_ADDR_WUNLOCK(ifp)	rw_wunlock(&(ifp)->if_addr_lock)
 #define	IF_ADDR_RLOCK(ifp)	rw_rlock(&(ifp)->if_addr_lock)
 #define	IF_ADDR_RUNLOCK(ifp)	rw_runlock(&(ifp)->if_addr_lock)
 #define	IF_ADDR_LOCK_ASSERT(ifp) rw_assert(&(ifp)->if_addr_lock, RA_LOCKED)
 #define	IF_ADDR_WLOCK_ASSERT(ifp) rw_assert(&(ifp)->if_addr_lock, RA_WLOCKED)
 
 /*
  * Function variations on locking macros intended to be used by loadable
  * kernel modules in order to divorce them from the internals of address list
  * locking.
  */
 void	if_addr_rlock(struct ifnet *ifp);	/* if_addrhead */
 void	if_addr_runlock(struct ifnet *ifp);	/* if_addrhead */
 void	if_maddr_rlock(if_t ifp);	/* if_multiaddrs */
 void	if_maddr_runlock(if_t ifp);	/* if_multiaddrs */
 
 #ifdef _KERNEL
 #ifdef _SYS_EVENTHANDLER_H_
 /* interface link layer address change event */
 typedef void (*iflladdr_event_handler_t)(void *, struct ifnet *);
 EVENTHANDLER_DECLARE(iflladdr_event, iflladdr_event_handler_t);
 /* interface address change event */
 typedef void (*ifaddr_event_handler_t)(void *, struct ifnet *);
 EVENTHANDLER_DECLARE(ifaddr_event, ifaddr_event_handler_t);
 /* new interface arrival event */
 typedef void (*ifnet_arrival_event_handler_t)(void *, struct ifnet *);
 EVENTHANDLER_DECLARE(ifnet_arrival_event, ifnet_arrival_event_handler_t);
 /* interface departure event */
 typedef void (*ifnet_departure_event_handler_t)(void *, struct ifnet *);
 EVENTHANDLER_DECLARE(ifnet_departure_event, ifnet_departure_event_handler_t);
 /* Interface link state change event */
 typedef void (*ifnet_link_event_handler_t)(void *, struct ifnet *, int);
 EVENTHANDLER_DECLARE(ifnet_link_event, ifnet_link_event_handler_t);
 #endif /* _SYS_EVENTHANDLER_H_ */
 
 /*
  * interface groups
  *
  * One ifg_group per group.  It heads a list of ifg_member entries and is
  * itself linked through ifg_next (presumably onto the ifg_head list
  * declared in this header; confirm against if.c).
  */
 struct ifg_group {
 	char				 ifg_group[IFNAMSIZ];	/* presumably the group name */
 	u_int				 ifg_refcnt;
 	void				*ifg_pf_kif;
 	TAILQ_HEAD(, ifg_member)	 ifg_members;	/* list of ifg_member */
 	TAILQ_ENTRY(ifg_group)		 ifg_next;	/* linkage between groups */
 };
 
 /* One entry on a group's ifg_members list; points at a member interface. */
 struct ifg_member {
 	TAILQ_ENTRY(ifg_member)	 ifgm_next;	/* linkage on ifg_members */
 	struct ifnet		*ifgm_ifp;	/* the interface */
 };
 
 /* One entry on an interface's if_groups list; points at a group. */
 struct ifg_list {
 	struct ifg_group	*ifgl_group;	/* the group */
 	TAILQ_ENTRY(ifg_list)	 ifgl_next;	/* linkage on if_groups */
 };
 
 #ifdef _SYS_EVENTHANDLER_H_
 /* group attach event */
 typedef void (*group_attach_event_handler_t)(void *, struct ifg_group *);
 EVENTHANDLER_DECLARE(group_attach_event, group_attach_event_handler_t);
 /* group detach event */
 typedef void (*group_detach_event_handler_t)(void *, struct ifg_group *);
 EVENTHANDLER_DECLARE(group_detach_event, group_detach_event_handler_t);
 /* group change event */
 typedef void (*group_change_event_handler_t)(void *, const char *);
 EVENTHANDLER_DECLARE(group_change_event, group_change_event_handler_t);
 #endif /* _SYS_EVENTHANDLER_H_ */
 
 /*
  * Locking macros for the if_afdata_lock rwlock embedded in the ifnet.
  * IF_AFDATA_LOCK/IF_AFDATA_UNLOCK are aliases for the write-lock forms,
  * and IF_AFDATA_TRYLOCK attempts the write lock.
  */
 #define	IF_AFDATA_LOCK_INIT(ifp)	\
 	rw_init(&(ifp)->if_afdata_lock, "if_afdata")
 
 #define	IF_AFDATA_WLOCK(ifp)	rw_wlock(&(ifp)->if_afdata_lock)
 #define	IF_AFDATA_RLOCK(ifp)	rw_rlock(&(ifp)->if_afdata_lock)
 #define	IF_AFDATA_WUNLOCK(ifp)	rw_wunlock(&(ifp)->if_afdata_lock)
 #define	IF_AFDATA_RUNLOCK(ifp)	rw_runlock(&(ifp)->if_afdata_lock)
 #define	IF_AFDATA_LOCK(ifp)	IF_AFDATA_WLOCK(ifp)
 #define	IF_AFDATA_UNLOCK(ifp)	IF_AFDATA_WUNLOCK(ifp)
 #define	IF_AFDATA_TRYLOCK(ifp)	rw_try_wlock(&(ifp)->if_afdata_lock)
 #define	IF_AFDATA_DESTROY(ifp)	rw_destroy(&(ifp)->if_afdata_lock)
 
 /* Assertions on the state of if_afdata_lock. */
 #define	IF_AFDATA_LOCK_ASSERT(ifp)	rw_assert(&(ifp)->if_afdata_lock, RA_LOCKED)
 #define	IF_AFDATA_RLOCK_ASSERT(ifp)	rw_assert(&(ifp)->if_afdata_lock, RA_RLOCKED)
 #define	IF_AFDATA_WLOCK_ASSERT(ifp)	rw_assert(&(ifp)->if_afdata_lock, RA_WLOCKED)
 #define	IF_AFDATA_UNLOCK_ASSERT(ifp)	rw_assert(&(ifp)->if_afdata_lock, RA_UNLOCKED)
 
 /*
  * 72 was chosen below because it is the size of a TCP/IP
  * header (40) + the minimum mss (32).
  */
 #define	IF_MINMTU	72
 #define	IF_MAXMTU	65535
 
 #define	TOEDEV(ifp)	((ifp)->if_llsoftc)
 
 #endif /* _KERNEL */
 
 /*
  * The ifaddr structure contains information about one address
  * of an interface.  They are maintained by the different address families,
  * are allocated and attached when an address is set, and are linked
  * together so all addresses for an interface can be located.
  *
  * NOTE: a 'struct ifaddr' is always at the beginning of a larger
  * chunk of malloc'ed memory, where we store the three addresses
  * (ifa_addr, ifa_dstaddr and ifa_netmask) referenced here.
  *
  * The definition is visible to the kernel, and to other code that
  * defines _WANT_IFADDR before including this header.
  */
 #if defined(_KERNEL) || defined(_WANT_IFADDR)
 struct ifaddr {
 	struct	sockaddr *ifa_addr;	/* address of interface */
 	struct	sockaddr *ifa_dstaddr;	/* other end of p-to-p link */
 #define	ifa_broadaddr	ifa_dstaddr	/* broadcast address interface */
 	struct	sockaddr *ifa_netmask;	/* used to determine subnet */
 	struct	ifnet *ifa_ifp;		/* back-pointer to interface */
 	struct	carp_softc *ifa_carp;	/* pointer to CARP data */
 	TAILQ_ENTRY(ifaddr) ifa_link;	/* queue macro glue */
 	void	(*ifa_rtrequest)	/* check or clean routes (+ or -)'d */
 		(int, struct rtentry *, struct rt_addrinfo *);
 	u_short	ifa_flags;		/* mostly rt_flags for cloning */
 	u_int	ifa_refcnt;		/* references to this structure */
 
 	/* Per-address traffic counters (counter_u64_t handles). */
 	counter_u64_t	ifa_ipackets;
 	counter_u64_t	ifa_opackets;	 
 	counter_u64_t	ifa_ibytes;
 	counter_u64_t	ifa_obytes;
 };
 #endif
 
 #ifdef _KERNEL
 #define	IFA_ROUTE	RTF_UP		/* route installed */
 #define	IFA_RTSELF	RTF_HOST	/* loopback route to self installed */
 
 /* For compatibility with other BSDs. SCTP uses it. */
 #define	ifa_list	ifa_link
 
 struct ifaddr *	ifa_alloc(size_t size, int flags);
 void	ifa_free(struct ifaddr *ifa);
 void	ifa_ref(struct ifaddr *ifa);
 #endif /* _KERNEL */
 
 /*
  * Multicast address structure.  This is analogous to the ifaddr
  * structure except that it keeps track of multicast addresses.
  *
  * A network-layer entry may point, through ifma_llifma, at a separate
  * link-layer entry that carries its own reference count.
  */
 struct ifmultiaddr {
 	TAILQ_ENTRY(ifmultiaddr) ifma_link; /* queue macro glue */
 	struct	sockaddr *ifma_addr; 	/* address this membership is for */
 	struct	sockaddr *ifma_lladdr;	/* link-layer translation, if any */
 	struct	ifnet *ifma_ifp;	/* back-pointer to interface; NULL once
 					 * the interface has been detached */
 	u_int	ifma_refcount;		/* reference count */
 	void	*ifma_protospec;	/* protocol-specific state, if any */
 	struct	ifmultiaddr *ifma_llifma; /* pointer to ifma for ifma_lladdr */
 };
 
 #ifdef _KERNEL
 
 extern	struct rwlock ifnet_rwlock;
 extern	struct sx ifnet_sxlock;
 
 /*
  * The ifnet list is protected by a pair of locks: the ifnet_sxlock sx
  * lock and the ifnet_rwlock rwlock.  Both are initialized as recursive.
  */
 #define	IFNET_LOCK_INIT() do {						\
 	rw_init_flags(&ifnet_rwlock, "ifnet_rw",  RW_RECURSE);		\
 	sx_init_flags(&ifnet_sxlock, "ifnet_sx",  SX_RECURSE);		\
 } while(0)
 
 /* Writers take both locks: the sx lock first, then the rwlock. */
 #define	IFNET_WLOCK() do {						\
 	sx_xlock(&ifnet_sxlock);					\
 	rw_wlock(&ifnet_rwlock);					\
 } while (0)
 
 /* Release in the reverse order of IFNET_WLOCK(). */
 #define	IFNET_WUNLOCK() do {						\
 	rw_wunlock(&ifnet_rwlock);					\
 	sx_xunlock(&ifnet_sxlock);					\
 } while (0)
 
 /*
  * To assert the ifnet lock, you must know not only whether it's for read or
  * write, but also whether it was acquired with sleep support or not.
  */
 #define	IFNET_RLOCK_ASSERT()		sx_assert(&ifnet_sxlock, SA_SLOCKED)
 #define	IFNET_RLOCK_NOSLEEP_ASSERT()	rw_assert(&ifnet_rwlock, RA_RLOCKED)
 #define	IFNET_WLOCK_ASSERT() do {					\
 	sx_assert(&ifnet_sxlock, SA_XLOCKED);				\
 	rw_assert(&ifnet_rwlock, RA_WLOCKED);				\
 } while (0)
 
 /*
  * Readers take only one of the two locks: the sx lock for the plain
  * variant, the rwlock for the _NOSLEEP variant.
  */
 #define	IFNET_RLOCK()		sx_slock(&ifnet_sxlock)
 #define	IFNET_RLOCK_NOSLEEP()	rw_rlock(&ifnet_rwlock)
 #define	IFNET_RUNLOCK()		sx_sunlock(&ifnet_sxlock)
 #define	IFNET_RUNLOCK_NOSLEEP()	rw_runlock(&ifnet_rwlock)
 
 /*
  * Look up an ifnet given its index; the _ref variant also acquires a
  * reference that must be freed using if_rele().  It is almost always a bug
  * to call ifnet_byindex() instead of ifnet_byindex_ref().
  */
 struct ifnet	*ifnet_byindex(u_short idx);
 struct ifnet	*ifnet_byindex_locked(u_short idx);
 struct ifnet	*ifnet_byindex_ref(u_short idx);
 
 /*
  * Given the index, ifaddr_byindex() returns the one and only
  * link-level ifaddr for the interface. You are not supposed to use
  * it to traverse the list of addresses associated to the interface.
  */
 struct ifaddr	*ifaddr_byindex(u_short idx);
 
 VNET_DECLARE(struct ifnethead, ifnet);
 VNET_DECLARE(struct ifgrouphead, ifg_head);
 VNET_DECLARE(int, if_index);
 VNET_DECLARE(struct ifnet *, loif);	/* first loopback interface */
 
 #define	V_ifnet		VNET(ifnet)
 #define	V_ifg_head	VNET(ifg_head)
 #define	V_if_index	VNET(if_index)
 #define	V_loif		VNET(loif)
 
 int	if_addgroup(struct ifnet *, const char *);
 int	if_delgroup(struct ifnet *, const char *);
 int	if_addmulti(struct ifnet *, struct sockaddr *, struct ifmultiaddr **);
 int	if_allmulti(struct ifnet *, int);
 struct	ifnet* if_alloc(u_char);
 void	if_attach(struct ifnet *);
 void	if_dead(struct ifnet *);
 int	if_delmulti(struct ifnet *, struct sockaddr *);
 void	if_delmulti_ifma(struct ifmultiaddr *);
 void	if_detach(struct ifnet *);
 void	if_vmove(struct ifnet *, struct vnet *);
 void	if_purgeaddrs(struct ifnet *);
 void	if_delallmulti(struct ifnet *);
 void	if_down(struct ifnet *);
 struct ifmultiaddr *
 	if_findmulti(struct ifnet *, struct sockaddr *);
 void	if_free(struct ifnet *);
 void	if_initname(struct ifnet *, const char *, int);
 void	if_link_state_change(struct ifnet *, int);
 int	if_printf(struct ifnet *, const char *, ...) __printflike(2, 3);
 void	if_ref(struct ifnet *);
 void	if_rele(struct ifnet *);
 int	if_setlladdr(struct ifnet *, const u_char *, int);
 void	if_up(struct ifnet *);
 int	ifioctl(struct socket *, u_long, caddr_t, struct thread *);
 int	ifpromisc(struct ifnet *, int);
 struct	ifnet *ifunit(const char *);
 struct	ifnet *ifunit_ref(const char *);
 
 int	ifa_add_loopback_route(struct ifaddr *, struct sockaddr *);
 int	ifa_del_loopback_route(struct ifaddr *, struct sockaddr *);
 int	ifa_switch_loopback_route(struct ifaddr *, struct sockaddr *, int fib);
 
 struct	ifaddr *ifa_ifwithaddr(struct sockaddr *);
 int		ifa_ifwithaddr_check(struct sockaddr *);
-struct	ifaddr *ifa_ifwithbroadaddr(struct sockaddr *);
-struct	ifaddr *ifa_ifwithdstaddr(struct sockaddr *);
-struct	ifaddr *ifa_ifwithdstaddr_fib(struct sockaddr *, int);
-struct	ifaddr *ifa_ifwithnet(struct sockaddr *, int);
-struct	ifaddr *ifa_ifwithnet_fib(struct sockaddr *, int, int);
-struct	ifaddr *ifa_ifwithroute(int, struct sockaddr *, struct sockaddr *);
-struct	ifaddr *ifa_ifwithroute_fib(int, struct sockaddr *, struct sockaddr *, u_int);
+struct	ifaddr *ifa_ifwithbroadaddr(struct sockaddr *, int);
+struct	ifaddr *ifa_ifwithdstaddr(struct sockaddr *, int);
+struct	ifaddr *ifa_ifwithnet(struct sockaddr *, int, int);
+struct	ifaddr *ifa_ifwithroute(int, struct sockaddr *, struct sockaddr *, u_int);
 struct	ifaddr *ifaof_ifpforaddr(struct sockaddr *, struct ifnet *);
 int	ifa_preferred(struct ifaddr *, struct ifaddr *);
 
 int	if_simloop(struct ifnet *ifp, struct mbuf *m, int af, int hlen);
 
 typedef	void *if_com_alloc_t(u_char type, struct ifnet *ifp);
 typedef	void if_com_free_t(void *com, u_char type);
 void	if_register_com_alloc(u_char type, if_com_alloc_t *a, if_com_free_t *f);
 void	if_deregister_com_alloc(u_char type);
 void	if_data_copy(struct ifnet *, struct if_data *);
 uint64_t if_get_counter_compat(struct ifnet *, ifnet_counter);
 
 #define IF_LLADDR(ifp)							\
     LLADDR((struct sockaddr_dl *)((ifp)->if_addr->ifa_addr))
 
 uint64_t if_setbaudrate(if_t ifp, uint64_t baudrate);
 uint64_t if_getbaudrate(if_t ifp);
 int if_setcapabilities(if_t ifp, int capabilities);
 int if_setcapabilitiesbit(if_t ifp, int setbit, int clearbit);
 int if_getcapabilities(if_t ifp);
 int if_togglecapenable(if_t ifp, int togglecap);
 int if_setcapenable(if_t ifp, int capenable);
 int if_setcapenablebit(if_t ifp, int setcap, int clearcap);
 int if_getcapenable(if_t ifp);
 const char *if_getdname(if_t ifp);
 int if_setdev(if_t ifp, void *dev);
 int if_setdrvflagbits(if_t ifp, int if_setflags, int clear_flags);
 int if_getdrvflags(if_t ifp);
 int if_setdrvflags(if_t ifp, int flags);
 int if_clearhwassist(if_t ifp);
 int if_sethwassistbits(if_t ifp, int toset, int toclear);
 int if_sethwassist(if_t ifp, int hwassist_bit);
 int if_gethwassist(if_t ifp);
 int if_setsoftc(if_t ifp, void *softc);
 void *if_getsoftc(if_t ifp);
 int if_setflags(if_t ifp, int flags);
 int if_setmtu(if_t ifp, int mtu);
 int if_getmtu(if_t ifp);
 int if_setflagbits(if_t ifp, int set, int clear);
 int if_getflags(if_t ifp);
 int if_sendq_empty(if_t ifp);
 int if_setsendqready(if_t ifp);
 int if_setsendqlen(if_t ifp, int tx_desc_count);
 int if_input(if_t ifp, struct mbuf* sendmp);
 int if_sendq_prepend(if_t ifp, struct mbuf *m);
 struct mbuf *if_dequeue(if_t ifp);
 int if_setifheaderlen(if_t ifp, int len);
 void if_setrcvif(struct mbuf *m, if_t ifp);
 void if_setvtag(struct mbuf *m, u_int16_t tag);
 u_int16_t if_getvtag(struct mbuf *m);
 int if_vlantrunkinuse(if_t ifp);
 caddr_t if_getlladdr(if_t ifp);
 void *if_gethandle(u_char);
 void if_bpfmtap(if_t ifp, struct mbuf *m);
 void if_etherbpfmtap(if_t ifp, struct mbuf *m);
 void if_vlancap(if_t ifp);
 
 int if_setupmultiaddr(if_t ifp, void *mta, int *cnt, int max);
 int if_multiaddr_array(if_t ifp, void *mta, int *cnt, int max);
 int if_multiaddr_count(if_t ifp, int max);
 
 int if_getamcount(if_t ifp);
 struct ifaddr * if_getifaddr(if_t ifp);
 
 /* Statistics */
 
 int if_incipackets(if_t ifp, int pkt);
 int if_incopackets(if_t ifp, int pkts);
 int if_incierrors(if_t ifp, int ierrors);
 int if_incoerrors(if_t ifp, int oerrors);
 int if_inciqdrops(if_t ifp, int val);
 int if_setierrors(if_t ifp, int ierrors);
 int if_setoerrors(if_t ifp, int oerrors);
 int if_setcollisions(if_t ifp, int collisions);
 int if_inccollisions(if_t ifp, int collisions);
 int if_incobytes(if_t ifp, int bytes);
 int if_getiqdrops(if_t ifp);
 int if_incimcasts(if_t ifp, int imcasts);
 int if_incomcasts(if_t ifp, int imcasts);
 int if_setipackets(if_t ifp, int pkts);
 int if_setopackets(if_t ifp, int pkts);
 int if_setibytes(if_t ifp, int bytes);
 int if_setobytes(if_t ifp, int bytes);
 int if_setimcasts(if_t ifp, int pkts);
 
 /* Functions */
 void if_setinitfn(if_t ifp, void (*)(void *));
 void if_setioctlfn(if_t ifp, int (*)(if_t, u_long, caddr_t));
 void if_setstartfn(if_t ifp, void (*)(if_t));
 void if_settransmitfn(if_t ifp, if_transmit_fn_t);
 void if_setqflushfn(if_t ifp, if_qflush_fn_t);
  
 /* Revisit the below. These are inline functions originally */
 int drbr_inuse_drv(if_t ifp, struct buf_ring *br);
 struct mbuf* drbr_dequeue_drv(if_t ifp, struct buf_ring *br);
 int drbr_needs_enqueue_drv(if_t ifp, struct buf_ring *br);
 int drbr_enqueue_drv(if_t ifp, struct buf_ring *br, struct mbuf *m);
 
 #endif /* _KERNEL */
 #endif /* !_NET_IF_VAR_H_ */
Index: user/ae/inet6/sys/net/route.c
===================================================================
--- user/ae/inet6/sys/net/route.c	(revision 271452)
+++ user/ae/inet6/sys/net/route.c	(revision 271453)
@@ -1,1934 +1,1924 @@
 /*-
  * Copyright (c) 1980, 1986, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)route.c	8.3.1.1 (Berkeley) 2/23/95
  * $FreeBSD$
  */
 /************************************************************************
  * Note: In this file a 'fib' is a "forwarding information base"	*
  * Which is the new name for an in kernel routing (next hop) table.	*
  ***********************************************************************/
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_route.h"
 #include "opt_sctp.h"
 #include "opt_mrouting.h"
 #include "opt_mpath.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/syslog.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/sysproto.h>
 #include <sys/proc.h>
 #include <sys/domain.h>
 #include <sys/kernel.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/route.h>
 #include <net/vnet.h>
 #include <net/flowtable.h>
 
 #ifdef RADIX_MPATH
 #include <net/radix_mpath.h>
 #endif
 
 #include <netinet/in.h>
 #include <netinet/ip_mroute.h>
 
 #include <vm/uma.h>
 
 #define	RT_MAXFIBS	UINT16_MAX
 
 /* Kernel config default option. */
 #ifdef ROUTETABLES
 #if ROUTETABLES <= 0
 #error "ROUTETABLES defined too low"
 #endif
 #if ROUTETABLES > RT_MAXFIBS
 #error "ROUTETABLES defined too big"
 #endif
 #define	RT_NUMFIBS	ROUTETABLES
 #endif /* ROUTETABLES */
 /* Initialize to default if not otherwise set. */
 #ifndef	RT_NUMFIBS
 #define	RT_NUMFIBS	1
 #endif
 
 #if defined(INET) || defined(INET6)
 #ifdef SCTP
 extern void sctp_addr_change(struct ifaddr *ifa, int cmd);
 #endif /* SCTP */
 #endif
 
 
 /*
  * Number of FIBs in use.  Starts out as the compile-time RT_NUMFIBS and
  * is exported as net.fibs with CTLFLAG_RDTUN, i.e. it is read-only
  * through sysctl.
  */
 u_int rt_numfibs = RT_NUMFIBS;
 SYSCTL_UINT(_net, OID_AUTO, fibs, CTLFLAG_RDTUN, &rt_numfibs, 0, "");
 
 /*
  * By default add routes to all fibs for new interfaces.
  * Once this is set to 0 then only allocate routes on interface
  * changes for the FIB of the caller when adding a new set of addresses
  * to an interface.  XXX this is a shotgun approach to a problem that needs
  * a more fine grained solution.. that will come.
  * XXX also has the problems getting the FIB from curthread which will not
  * always work given the fib can be overridden and prefixes can be added
  * from the network stack context.
  */
 u_int rt_add_addr_allfibs = 1;
 SYSCTL_UINT(_net, OID_AUTO, add_addr_allfibs, CTLFLAG_RWTUN,
     &rt_add_addr_allfibs, 0, "");
 
 VNET_DEFINE(struct rtstat, rtstat);
 #define	V_rtstat	VNET(rtstat)
 
 VNET_DEFINE(struct radix_node_head *, rt_tables);
 #define	V_rt_tables	VNET(rt_tables)
 
 VNET_DEFINE(int, rttrash);		/* routes not in table but not freed */
 #define	V_rttrash	VNET(rttrash)
 
 
 /*
  * Convert a 'struct radix_node *' to a 'struct rtentry *'.
  * The operation can be done safely (in this code) because a
  * 'struct rtentry' starts with two 'struct radix_node''s, the first
  * one representing leaf nodes in the routing tree, which is
  * what the code in radix.c passes us as a 'struct radix_node'.
  *
  * But because there are a lot of assumptions in this conversion,
  * do not cast explicitly, but always use the macro below.
  */
 #define RNTORT(p)	((struct rtentry *)(p))
 
 static VNET_DEFINE(uma_zone_t, rtzone);		/* Routing table UMA zone. */
 #define	V_rtzone	VNET(rtzone)
 
 static int rtrequest1_fib_change(struct radix_node_head *, struct rt_addrinfo *,
     struct rtentry **, u_int);
 static void rt_setmetrics(const struct rt_addrinfo *, struct rtentry *);
 
 /*
  * Sysctl handler behind net.my_fibnum: reports the p_fibnum of the
  * process that owns the current thread.
  */
 static int
 sysctl_my_fibnum(SYSCTL_HANDLER_ARGS)
 {
 	int caller_fib = curthread->td_proc->p_fibnum;
 
 	/* Hand a private copy to the generic integer handler. */
 	return (sysctl_handle_int(oidp, &caller_fib, 0, req));
 }
 
 SYSCTL_PROC(_net, OID_AUTO, my_fibnum, CTLTYPE_INT|CTLFLAG_RD,
             NULL, 0, &sysctl_my_fibnum, "I", "default FIB of caller");
 
 /*
  * Return the address of the slot that holds the radix head for
  * (table, fam).  V_rt_tables is treated as a flat array in which every
  * FIB owns AF_MAX+1 consecutive slots, so slot [fib 0][af 0] is the
  * start of the array.
  */
 static __inline struct radix_node_head **
 rt_tables_get_rnh_ptr(int table, int fam)
 {
 	struct radix_node_head **base;
 
 	KASSERT(table >= 0 && table < rt_numfibs, ("%s: table out of bounds.",
 	    __func__));
 	KASSERT(fam >= 0 && fam < (AF_MAX+1), ("%s: fam out of bounds.",
 	    __func__));
 
 	base = (struct radix_node_head **)V_rt_tables;
 
 	/* Skip 'table' whole rows, then 'fam' entries within the row. */
 	return (base + (table * (AF_MAX+1) + fam));
 }
 
 /*
  * Fetch the radix head stored for (table, fam).  Callers in this file
  * test the result against NULL, so an empty slot is a normal outcome.
  */
 struct radix_node_head *
 rt_tables_get_rnh(int table, int fam)
 {
 	struct radix_node_head **slot;
 
 	slot = rt_tables_get_rnh_ptr(table, fam);
 	return (*slot);
 }
 
 /*
  * Route initialization must occur before ip6_init2(), which happens at
  * SI_ORDER_MIDDLE.
  */
 static void
 route_init(void)
 {
 
 	/* Clamp the tunable FIB count into the range [1, RT_MAXFIBS]. */
 	if (rt_numfibs == 0)
 		rt_numfibs = 1;
 	else if (rt_numfibs > RT_MAXFIBS)
 		rt_numfibs = RT_MAXFIBS;
 }
 SYSINIT(route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, route_init, 0);
 
 /*
  * Zone "init" hook for rtentry items (registered with uma_zcreate() in
  * vnet_route_init()): allocate the rt_pksent counter and set up the
  * per-entry lock.  Returns ENOMEM if the counter cannot be allocated,
  * otherwise 0.
  */
 static int
 rtentry_zinit(void *mem, int size, int how)
 {
 	struct rtentry *entry;
 
 	entry = mem;
 	if ((entry->rt_pksent = counter_u64_alloc(how)) == NULL)
 		return (ENOMEM);
 	RT_LOCK_INIT(entry);
 
 	return (0);
 }
 
 /*
  * Zone "fini" hook for rtentry items: undo rtentry_zinit() by destroying
  * the per-entry lock and releasing the rt_pksent counter.
  */
 static void
 rtentry_zfini(void *mem, int size)
 {
 	struct rtentry *entry;
 
 	entry = mem;
 	RT_LOCK_DESTROY(entry);
 	counter_u64_free(entry->rt_pksent);
 }
 
 /*
  * Zone constructor for rtentry items: clear every field that precedes
  * rt_endzero and reset the packet counter.  Always returns 0.
  */
 static int
 rtentry_ctor(void *mem, int size, void *arg, int how)
 {
 	struct rtentry *entry;
 
 	entry = mem;
 	bzero(entry, offsetof(struct rtentry, rt_endzero));
 	counter_u64_zero(entry->rt_pksent);
 
 	return (0);
 }
 
 /*
  * Zone destructor for rtentry items: all it does is invoke
  * RT_UNLOCK_COND() on the entry being released.
  */
 static void
 rtentry_dtor(void *mem, int size, void *arg)
 {
 	struct rtentry *entry;
 
 	entry = mem;
 	RT_UNLOCK_COND(entry);
 }
 
 /*
  * Per-vnet routing setup: allocate the zeroed array of radix head
  * pointers (rt_numfibs rows of AF_MAX+1 slots), create the rtentry zone,
  * and let every domain that supplies dom_rtattach populate its slots.
  */
 static void
 vnet_route_init(const void *unused __unused)
 {
 	struct radix_node_head **slot;
 	struct domain *dp;
 	int af, fib;
 
 	V_rt_tables = malloc(rt_numfibs * (AF_MAX+1) *
 	    sizeof(struct radix_node_head *), M_RTABLE, M_WAITOK|M_ZERO);
 
 	V_rtzone = uma_zcreate("rtentry", sizeof(struct rtentry),
 	    rtentry_ctor, rtentry_dtor,
 	    rtentry_zinit, rtentry_zfini, UMA_ALIGN_PTR, 0);
 
 	for (dp = domains; dp != NULL; dp = dp->dom_next) {
 		if (dp->dom_rtattach == NULL)
 			continue;
 
 		af = dp->dom_family;
 		for (fib = 0; fib < rt_numfibs; fib++) {
 			/* Only AF_INET and AF_INET6 get tables past fib 0. */
 			if (fib != 0 && af != AF_INET && af != AF_INET6)
 				break;
 
 			/*
 			 * XXX MRT rtattach will be also called from
 			 * vfs_export.c but the offset will be 0 (only for
 			 * AF_INET and AF_INET6 which don't need it anyhow).
 			 */
 			slot = rt_tables_get_rnh_ptr(fib, af);
 			if (slot == NULL)
 				panic("%s: rnh NULL", __func__);
 			dp->dom_rtattach((void **)slot, dp->dom_rtoffset);
 		}
 	}
 }
 VNET_SYSINIT(vnet_route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH,
     vnet_route_init, 0);
 
 #ifdef VIMAGE
 /*
  * Per-vnet routing teardown (VIMAGE kernels only): let every domain that
  * supplies dom_rtdetach release its tables, then free the pointer array
  * and destroy the rtentry zone.
  */
 static void
 vnet_route_uninit(const void *unused __unused)
 {
 	struct radix_node_head **slot;
 	struct domain *dp;
 	int af, fib;
 
 	for (dp = domains; dp != NULL; dp = dp->dom_next) {
 		if (dp->dom_rtdetach == NULL)
 			continue;
 
 		af = dp->dom_family;
 		for (fib = 0; fib < rt_numfibs; fib++) {
 			/* Same table selection rule as vnet_route_init(). */
 			if (fib != 0 && af != AF_INET && af != AF_INET6)
 				break;
 
 			slot = rt_tables_get_rnh_ptr(fib, af);
 			if (slot == NULL)
 				panic("%s: rnh NULL", __func__);
 			dp->dom_rtdetach((void **)slot, dp->dom_rtoffset);
 		}
 	}
 
 	free(V_rt_tables, M_RTABLE);
 	uma_zdestroy(V_rtzone);
 }
 VNET_SYSUNINIT(vnet_route_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD,
     vnet_route_uninit, 0);
 #endif
 
 #ifndef _SYS_SYSPROTO_H_
 struct setfib_args {
 	int     fibnum;
 };
 #endif
 /*
  * Store uap->fibnum as the p_fibnum of td's process.  Returns EINVAL
  * when the requested number is negative or not below rt_numfibs, and 0
  * on success.
  */
 int
 sys_setfib(struct thread *td, struct setfib_args *uap)
 {
 	int wanted = uap->fibnum;
 
 	if (wanted < 0 || wanted >= rt_numfibs)
 		return (EINVAL);
 	td->td_proc->p_fibnum = wanted;
 	return (0);
 }
 
 /*
  * Packet routing routines.
  */
 
 /* Fill in ro->ro_rt from the default FIB, ignoring no flags. */
 void
 rtalloc(struct route *ro)
 {
 
 	rtalloc_fib(ro, RT_DEFAULT_FIB);
 }
 
 /* Same as rtalloc(), but the FIB to search is chosen by the caller. */
 void
 rtalloc_fib(struct route *ro, u_int fibnum)
 {
 
 	rtalloc_ign_fib(ro, 0UL, fibnum);
 }
 
 /*
  * Default-FIB form of rtalloc_ign_fib(): a cached ro->ro_rt that still
  * has an interface and RTF_UP is kept, otherwise it is released and a
  * fresh lookup of ro->ro_dst is stored (unlocked) in its place.
  */
 void
 rtalloc_ign(struct route *ro, u_long ignore)
 {
 
 	rtalloc_ign_fib(ro, ignore, RT_DEFAULT_FIB);
 }
 
 /*
  * Make sure ro->ro_rt refers to a usable route for ro->ro_dst in FIB
  * 'fibnum'.  A cached entry that has an interface and RTF_UP is left
  * alone; anything else is released and replaced by the result of a new
  * lookup, which is unlocked before returning (ro_rt may end up NULL).
  */
 void
 rtalloc_ign_fib(struct route *ro, u_long ignore, u_int fibnum)
 {
 	struct rtentry *cached, *found;
 
 	cached = ro->ro_rt;
 	if (cached != NULL) {
 		if (cached->rt_ifp != NULL && (cached->rt_flags & RTF_UP) != 0)
 			return;
 		RTFREE(cached);
 		ro->ro_rt = NULL;
 	}
 
 	found = rtalloc1_fib(&ro->ro_dst, 1, ignore, fibnum);
 	ro->ro_rt = found;
 	if (found != NULL)
 		RT_UNLOCK(found);
 }
 
 /*
  * Look up the route that matches 'dst' in the default FIB.
  *
  * This is rtalloc1_fib() with RT_DEFAULT_FIB; the route that comes
  * back, if any, is locked.
  */
 struct rtentry *
 rtalloc1(struct sockaddr *dst, int report, u_long ignflags)
 {
 	struct rtentry *rt;
 
 	rt = rtalloc1_fib(dst, report, ignflags, RT_DEFAULT_FIB);
 	return (rt);
 }
 
 /*
  * FIB-aware route lookup.
  *
  * dst      - destination to match; its sa_family selects the table.
  * report   - when non-zero, a miss is announced with an RTM_MISS
  *            message through rt_missmsg_fib().
  * ignflags - only RTF_RNH_LOCKED is examined here: when it is set the
  *            radix head lock is not taken by this function (and is
  *            asserted under INVARIANTS instead).
  * fibnum   - routing table to search; asserted to be below rt_numfibs.
  *
  * Returns the matching rtentry locked and with a reference added, or
  * NULL when the family has no table or only a root node matched.
  */
 struct rtentry *
 rtalloc1_fib(struct sockaddr *dst, int report, u_long ignflags,
 		    u_int fibnum)
 {
 	struct radix_node_head *rnh;
 	struct radix_node *rn;
 	struct rtentry *newrt;
 	struct rt_addrinfo info;
 	/* Neither value is modified below; both only feed the miss report. */
 	int err = 0, msgtype = RTM_MISS;
 	int needlock;
 
 	KASSERT((fibnum < rt_numfibs), ("rtalloc1_fib: bad fibnum"));
 	rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
 	newrt = NULL;
 	if (rnh == NULL)
 		goto miss;
 
 	/*
 	 * Look up the address in the table for that Address Family
 	 */
 	needlock = !(ignflags & RTF_RNH_LOCKED);
 	if (needlock)
 		RADIX_NODE_HEAD_RLOCK(rnh);
 #ifdef INVARIANTS	
 	else
 		RADIX_NODE_HEAD_LOCK_ASSERT(rnh);
 #endif
 	rn = rnh->rnh_matchaddr(dst, rnh);
 	if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) {
 		/*
 		 * Lock and reference the entry before the radix head lock
 		 * (if we took it) is dropped.
 		 */
 		newrt = RNTORT(rn);
 		RT_LOCK(newrt);
 		RT_ADDREF(newrt);
 		if (needlock)
 			RADIX_NODE_HEAD_RUNLOCK(rnh);
 		goto done;
 
 	} else if (needlock)
 		RADIX_NODE_HEAD_RUNLOCK(rnh);
 	
 	/*
 	 * Either we hit the root or couldn't find any match,
 	 * Which basically means
 	 * "caint get there frm here"
 	 */
 miss:
 	V_rtstat.rts_unreach++;
 
 	if (report) {
 		/*
 		 * If required, report the failure to the supervising
 		 * Authorities.
 		 * For a delete, this is not an error. (report == 0)
 		 */
 		bzero(&info, sizeof(info));
 		info.rti_info[RTAX_DST] = dst;
 		rt_missmsg_fib(msgtype, &info, 0, err, fibnum);
 	}	
 done:
 	if (newrt)
 		RT_LOCK_ASSERT(newrt);
 	return (newrt);
 }
 
 /*
  * Remove a reference count from an rtentry.
  * If the count gets low enough, take it out of the routing table
  *
  * The entry must be locked on entry (asserted below).  On return the
  * lock has either been dropped or the entry has been handed back to the
  * rtentry zone, so the caller must not touch 'rt' afterwards.
  */
 void
 rtfree(struct rtentry *rt)
 {
 	struct radix_node_head *rnh;
 
 	KASSERT(rt != NULL,("%s: NULL rt", __func__));
 	rnh = rt_tables_get_rnh(rt->rt_fibnum, rt_key(rt)->sa_family);
 	KASSERT(rnh != NULL,("%s: NULL rnh", __func__));
 
 	RT_LOCK_ASSERT(rt);
 
 	/*
 	 * The callers should use RTFREE_LOCKED() or RTFREE(), so
 	 * we should come here exactly with the last reference.
 	 */
 	RT_REMREF(rt);
 	if (rt->rt_refcnt > 0) {
 		/* Other references remain: log it and only unlock. */
 		log(LOG_DEBUG, "%s: %p has %d refs\n", __func__, rt, rt->rt_refcnt);
 		goto done;
 	}
 
 	/*
 	 * On last reference give the "close method" a chance
 	 * to cleanup private state.  This also permits (for
 	 * IPv4 and IPv6) a chance to decide if the routing table
 	 * entry should be purged immediately or at a later time.
 	 * When an immediate purge is to happen the close routine
 	 * typically calls rtexpunge which clears the RTF_UP flag
 	 * on the entry so that the code below reclaims the storage.
 	 */
 	if (rt->rt_refcnt == 0 && rnh->rnh_close)
 		rnh->rnh_close((struct radix_node *)rt, rnh);
 
 	/*
 	 * If we are no longer "up" (and ref == 0)
 	 * then we can free the resources associated
 	 * with the route.
 	 */
 	if ((rt->rt_flags & RTF_UP) == 0) {
 		/* An entry still linked into the tree must never be freed. */
 		if (rt->rt_nodes->rn_flags & (RNF_ACTIVE | RNF_ROOT))
 			panic("rtfree 2");
 		/*
 		 * the rtentry must have been removed from the routing table
 		 * so it is represented in rttrash.. remove that now.
 		 */
 		V_rttrash--;
 #ifdef	DIAGNOSTIC
 		if (rt->rt_refcnt < 0) {
 			printf("rtfree: %p not freed (neg refs)\n", rt);
 			goto done;
 		}
 #endif
 		/*
 		 * release references on items we hold them on..
 		 * e.g other routes and ifaddrs.
 		 */
 		if (rt->rt_ifa)
 			ifa_free(rt->rt_ifa);
 		/*
 		 * The key is separately alloc'd so free it (see rt_setgate()).
 		 * This also frees the gateway, as they are always malloc'd
 		 * together.
 		 */
 		Free(rt_key(rt));
 
 		/*
 		 * and the rtentry itself of course
 		 */
 		uma_zfree(V_rtzone, rt);
 		return;
 	}
 done:
 	RT_UNLOCK(rt);
 }
 
 
 /*
  * Force a routing table entry to the specified destination to go
  * through the given gateway, using the default FIB.  Normally called as
  * a result of a routing redirect message from the network layer; the
  * work itself is done by rtredirect_fib().
  */
 void
 rtredirect(struct sockaddr *dst, struct sockaddr *gateway,
     struct sockaddr *netmask, int flags, struct sockaddr *src)
 {
 
 	rtredirect_fib(dst, gateway, netmask, flags, src, RT_DEFAULT_FIB);
 }
 
 /*
  * Process a routing redirect for 'dst' within FIB 'fibnum'.
  *
  * dst, gateway, netmask - destination, new next hop and mask carried by
  *                         the redirect.
  * flags                 - RTF_* flags from the caller; RTF_DONE skips
  *                         the check that 'src' is our current router.
  * src                   - address of the router that sent the redirect.
  * fibnum                - routing table to operate on.
  *
  * Nothing is returned; the outcome is counted in V_rtstat and always
  * announced with an RTM_REDIRECT routing message carrying the error.
  */
 void
 rtredirect_fib(struct sockaddr *dst,
 	struct sockaddr *gateway,
 	struct sockaddr *netmask,
 	int flags,
 	struct sockaddr *src,
 	u_int fibnum)
 {
 	struct rtentry *rt, *rt0 = NULL;
 	int error = 0;
 	short *stat = NULL;
 	struct rt_addrinfo info;
 	struct ifaddr *ifa;
 	struct radix_node_head *rnh;
 
 	ifa = NULL;
 	rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
 	if (rnh == NULL) {
 		error = EAFNOSUPPORT;
 		goto out;
 	}
 
 	/* verify the gateway is directly reachable */
-	if ((ifa = ifa_ifwithnet_fib(gateway, 0, fibnum)) == NULL) {
+	if ((ifa = ifa_ifwithnet(gateway, 0, fibnum)) == NULL) {
 		error = ENETUNREACH;
 		goto out;
 	}
 	rt = rtalloc1_fib(dst, 0, 0UL, fibnum);	/* NB: rt is locked */
 	/*
 	 * If the redirect isn't from our current router for this dst,
 	 * it's either old or wrong.  If it redirects us to ourselves,
 	 * we have a routing loop, perhaps as a result of an interface
 	 * going down recently.
 	 */
 	if (!(flags & RTF_DONE) && rt &&
 	     (!sa_equal(src, rt->rt_gateway) || rt->rt_ifa != ifa))
 		error = EINVAL;
 	else if (ifa_ifwithaddr_check(gateway))
 		error = EHOSTUNREACH;
 	if (error)
 		goto done;
 	/*
 	 * Create a new entry if we just got back a wildcard entry
 	 * or the lookup failed.  This is necessary for hosts
 	 * which use routing redirects generated by smart gateways
 	 * to dynamically build the routing tables.
 	 */
 	if (rt == NULL || (rt_mask(rt) && rt_mask(rt)->sa_len < 2))
 		goto create;
 	/*
 	 * Don't listen to the redirect if it's
 	 * for a route to an interface.
 	 */
 	if (rt->rt_flags & RTF_GATEWAY) {
 		if (((rt->rt_flags & RTF_HOST) == 0) && (flags & RTF_HOST)) {
 			/*
 			 * Changing from route to net => route to host.
 			 * Create new route, rather than smashing route to net.
 			 */
 		create:
 			rt0 = rt;
 			rt = NULL;
 		
 			flags |=  RTF_GATEWAY | RTF_DYNAMIC;
 			bzero((caddr_t)&info, sizeof(info));
 			info.rti_info[RTAX_DST] = dst;
 			info.rti_info[RTAX_GATEWAY] = gateway;
 			info.rti_info[RTAX_NETMASK] = netmask;
 			info.rti_ifa = ifa;
 			info.rti_flags = flags;
 			if (rt0 != NULL)
 				RT_UNLOCK(rt0);	/* drop lock to avoid LOR with RNH */
 			error = rtrequest1_fib(RTM_ADD, &info, &rt, fibnum);
 			if (rt != NULL) {
 				RT_LOCK(rt);
 				if (rt0 != NULL)
 					EVENTHANDLER_INVOKE(route_redirect_event, rt0, rt, dst);
 				flags = rt->rt_flags;
 			}
 			if (rt0 != NULL)
 				RTFREE(rt0);
 			
 			stat = &V_rtstat.rts_dynamic;
 		} else {
 			struct rtentry *gwrt;
 
 			/*
 			 * Smash the current notion of the gateway to
 			 * this destination.  Should check about netmask!!!
 			 */
 			rt->rt_flags |= RTF_MODIFIED;
 			flags |= RTF_MODIFIED;
 			stat = &V_rtstat.rts_newgateway;
 			/*
 			 * add the key and gateway (in one malloc'd chunk).
 			 */
 			RT_UNLOCK(rt);
 			RADIX_NODE_HEAD_LOCK(rnh);
 			RT_LOCK(rt);
 			rt_setgate(rt, rt_key(rt), gateway);
 			/*
 			 * The radix head we hold belongs to 'fibnum', so the
 			 * RTF_RNH_LOCKED lookup has to target that same FIB
 			 * rather than the default one.
 			 */
 			gwrt = rtalloc1_fib(gateway, 1, RTF_RNH_LOCKED,
 			    fibnum);
 			RADIX_NODE_HEAD_UNLOCK(rnh);
 			EVENTHANDLER_INVOKE(route_redirect_event, rt, gwrt, dst);
 			/* The lookup may miss; RTFREE_LOCKED() needs a route. */
 			if (gwrt != NULL)
 				RTFREE_LOCKED(gwrt);
 		}
 	} else
 		error = EHOSTUNREACH;
 done:
 	if (rt)
 		RTFREE_LOCKED(rt);
 out:
 	if (error)
 		V_rtstat.rts_badredirect++;
 	else if (stat != NULL)
 		(*stat)++;
 	bzero((caddr_t)&info, sizeof(info));
 	info.rti_info[RTAX_DST] = dst;
 	info.rti_info[RTAX_GATEWAY] = gateway;
 	info.rti_info[RTAX_NETMASK] = netmask;
 	info.rti_info[RTAX_AUTHOR] = src;
 	rt_missmsg_fib(RTM_REDIRECT, &info, flags, error, fibnum);
 	/* Drop the reference obtained from ifa_ifwithnet() above. */
 	if (ifa != NULL)
 		ifa_free(ifa);
 }
 
 /* Routing table ioctl entry point for the default FIB. */
 int
 rtioctl(u_long req, caddr_t data)
 {
 	int error;
 
 	error = rtioctl_fib(req, data, RT_DEFAULT_FIB);
 	return (error);
 }
 
 /*
  * Routing table ioctl interface.
  *
  * With INET the request is passed to the mrt_ioctl hook when one is
  * installed (EOPNOTSUPP otherwise); without INET the result is ENXIO.
  */
 int
 rtioctl_fib(u_long req, caddr_t data, u_int fibnum)
 {
 
 	/*
 	 * If more ioctl commands are added here, make sure the proper
 	 * super-user checks are being performed because it is possible for
 	 * prison-root to make it this far if raw sockets have been enabled
 	 * in jails.
 	 */
 #ifdef INET
 	/* Multicast goop, grrr... */
 	if (mrt_ioctl == NULL)
 		return (EOPNOTSUPP);
 	return (mrt_ioctl(req, data, fibnum));
 #else /* INET */
 	return (ENXIO);
 #endif /* INET */
 }
 
-/*
- * For both ifa_ifwithroute() routines, 'ifa' is returned referenced.
- */
 /*
  * Pick the interface address to attach to a route described by
  * (flags, dst, gateway) in FIB 'fibnum'.  Returns NULL when nothing
  * suitable is found.  The address taken from a route lookup is
  * explicitly referenced with ifa_ref(); the ones obtained from the
  * ifa_ifwith*() helpers are assumed to come back referenced as well --
  * confirm against their definitions.
  */
 struct ifaddr *
-ifa_ifwithroute(int flags, struct sockaddr *dst, struct sockaddr *gateway)
-{
-
-	return (ifa_ifwithroute_fib(flags, dst, gateway, RT_DEFAULT_FIB));
-}
-
-struct ifaddr *
-ifa_ifwithroute_fib(int flags, struct sockaddr *dst, struct sockaddr *gateway,
+ifa_ifwithroute(int flags, struct sockaddr *dst, struct sockaddr *gateway,
 				u_int fibnum)
 {
 	struct ifaddr *ifa;
 	int not_found = 0;
 
 	if ((flags & RTF_GATEWAY) == 0) {
 		/*
 		 * If we are adding a route to an interface,
 		 * and the interface is a pt to pt link
 		 * we should search for the destination
 		 * as our clue to the interface.  Otherwise
 		 * we can use the local address.
 		 */
 		ifa = NULL;
 		if (flags & RTF_HOST)
-			ifa = ifa_ifwithdstaddr_fib(dst, fibnum);
+			ifa = ifa_ifwithdstaddr(dst, fibnum);
 		if (ifa == NULL)
 			ifa = ifa_ifwithaddr(gateway);
 	} else {
 		/*
 		 * If we are adding a route to a remote net
 		 * or host, the gateway may still be on the
 		 * other end of a pt to pt link.
 		 */
-		ifa = ifa_ifwithdstaddr_fib(gateway, fibnum);
+		ifa = ifa_ifwithdstaddr(gateway, fibnum);
 	}
 	/* Next try: an interface whose network contains the gateway. */
 	if (ifa == NULL)
-		ifa = ifa_ifwithnet_fib(gateway, 0, fibnum);
+		ifa = ifa_ifwithnet(gateway, 0, fibnum);
 	if (ifa == NULL) {
 		/*
 		 * Last resort: route towards the gateway and borrow that
 		 * route's ifa.  NOTE(review): RTF_RNH_LOCKED is passed
 		 * unconditionally, so the lookup does not take the radix
 		 * head lock itself -- confirm every caller holds it.
 		 */
 		struct rtentry *rt = rtalloc1_fib(gateway, 0, RTF_RNH_LOCKED, fibnum);
 		if (rt == NULL)
 			return (NULL);
 		/*
 		 * dismiss a gateway that is reachable only
 		 * through the default router
 		 */
 		switch (gateway->sa_family) {
 		case AF_INET:
 			if (satosin(rt_key(rt))->sin_addr.s_addr == INADDR_ANY)
 				not_found = 1;
 			break;
 		case AF_INET6:
 			if (IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(rt))->sin6_addr))
 				not_found = 1;
 			break;
 		default:
 			break;
 		}
 		if (!not_found && rt->rt_ifa != NULL) {
 			ifa = rt->rt_ifa;
 			ifa_ref(ifa);
 		}
 		/* Give back the reference and lock from rtalloc1_fib(). */
 		RT_REMREF(rt);
 		RT_UNLOCK(rt);
 		if (not_found || ifa == NULL)
 			return (NULL);
 	}
 	/*
 	 * Prefer an address of dst's family on the same interface; when
 	 * one is found the previously chosen ifa is released.
 	 */
 	if (ifa->ifa_addr->sa_family != dst->sa_family) {
 		struct ifaddr *oifa = ifa;
 		ifa = ifaof_ifpforaddr(dst, ifa->ifa_ifp);
 		if (ifa == NULL)
 			ifa = oifa;
 		else
 			ifa_free(oifa);
 	}
 	return (ifa);
 }
 
 /*
  * Do appropriate manipulations of a routing tree given
  * all the bits of info needed
  */
 int
 rtrequest(int req,
 	struct sockaddr *dst,
 	struct sockaddr *gateway,
 	struct sockaddr *netmask,
 	int flags,
 	struct rtentry **ret_nrt)
 {
 
 	return (rtrequest_fib(req, dst, gateway, netmask, flags, ret_nrt,
 	    RT_DEFAULT_FIB));
 }
 
 /*
  * Pack the loose arguments into a zeroed rt_addrinfo and pass the
  * request on to rtrequest1_fib() for FIB 'fibnum'.  Returns EINVAL when
  * dst has a zero sa_len, otherwise whatever rtrequest1_fib() returns.
  */
 int
 rtrequest_fib(int req, struct sockaddr *dst, struct sockaddr *gateway,
     struct sockaddr *netmask, int flags, struct rtentry **ret_nrt,
     u_int fibnum)
 {
 	struct rt_addrinfo rti;
 
 	if (dst->sa_len == 0)
 		return (EINVAL);
 
 	bzero(&rti, sizeof(rti));
 	rti.rti_info[RTAX_DST] = dst;
 	rti.rti_info[RTAX_GATEWAY] = gateway;
 	rti.rti_info[RTAX_NETMASK] = netmask;
 	rti.rti_flags = flags;
 
 	return (rtrequest1_fib(req, &rti, ret_nrt, fibnum));
 }
 
 /*
  * These (questionable) definitions of apparent local variables apply
  * to the next two functions.  XXXXXX!!!
  */
 #define	dst	info->rti_info[RTAX_DST]
 #define	gateway	info->rti_info[RTAX_GATEWAY]
 #define	netmask	info->rti_info[RTAX_NETMASK]
 #define	ifaaddr	info->rti_info[RTAX_IFA]
 #define	ifpaddr	info->rti_info[RTAX_IFP]
 #define	flags	info->rti_flags
 
 int
 rt_getifa(struct rt_addrinfo *info)
 {
 
 	return (rt_getifa_fib(info, RT_DEFAULT_FIB));
 }
 
 /*
  * Look up rt_addrinfo for a specific fib.  Note that if rti_ifa is defined,
  * it will be referenced so the caller must free it.
  *
  * Fills in info->rti_ifp and info->rti_ifa when they are not already
  * set.  Returns 0 when an ifa is available afterwards and ENETUNREACH
  * otherwise.  The names dst, gateway, ifaaddr, ifpaddr and flags used
  * below are the file-local macros that expand to fields of 'info'.
  */
 int
 rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum)
 {
 	struct ifaddr *ifa;
 	int error = 0;
 
 	/*
 	 * ifp may be specified by sockaddr_dl
 	 * when protocol address is ambiguous.
 	 */
 	if (info->rti_ifp == NULL && ifpaddr != NULL &&
 	    ifpaddr->sa_family == AF_LINK &&
-	    (ifa = ifa_ifwithnet_fib(ifpaddr, 0, fibnum)) != NULL) {
+	    (ifa = ifa_ifwithnet(ifpaddr, 0, fibnum)) != NULL) {
 		/* Only the interface is wanted here; release the ifa. */
 		info->rti_ifp = ifa->ifa_ifp;
 		ifa_free(ifa);
 	}
 	/* An explicit interface address takes precedence. */
 	if (info->rti_ifa == NULL && ifaaddr != NULL)
 		info->rti_ifa = ifa_ifwithaddr(ifaaddr);
 	if (info->rti_ifa == NULL) {
 		struct sockaddr *sa;
 
 		/* Best available hint: ifaaddr, then gateway, then dst. */
 		sa = ifaaddr != NULL ? ifaaddr :
 		    (gateway != NULL ? gateway : dst);
 		if (sa != NULL && info->rti_ifp != NULL)
 			info->rti_ifa = ifaof_ifpforaddr(sa, info->rti_ifp);
 		else if (dst != NULL && gateway != NULL)
-			info->rti_ifa = ifa_ifwithroute_fib(flags, dst, gateway,
+			info->rti_ifa = ifa_ifwithroute(flags, dst, gateway,
 							fibnum);
 		else if (sa != NULL)
-			info->rti_ifa = ifa_ifwithroute_fib(flags, sa, sa,
+			info->rti_ifa = ifa_ifwithroute(flags, sa, sa,
 							fibnum);
 	}
 	if ((ifa = info->rti_ifa) != NULL) {
 		/* Derive the interface from the ifa if still unknown. */
 		if (info->rti_ifp == NULL)
 			info->rti_ifp = ifa->ifa_ifp;
 	} else
 		error = ENETUNREACH;
 	return (error);
 }
 
 /*
  * Expunges references to a route that's about to be reclaimed.
  * The route must be locked.
  *
  * Both the route lock and the radix head lock are asserted on entry.
  * On success the entry has been unlinked from the tree, RTF_UP has been
  * cleared, the owning ifa's ifa_rtrequest hook (if any) has seen an
  * RTM_DELETE, and V_rttrash has been incremented; the rtentry itself is
  * not freed here.  Returns 0, ESRCH when the entry was not in the tree
  * (non-MPATH build), or the rtrequest1_fib() error (MPATH build).
  */
 int
 rt_expunge(struct radix_node_head *rnh, struct rtentry *rt)
 {
 #if !defined(RADIX_MPATH)
 	struct radix_node *rn;
 #else
 	struct rt_addrinfo info;
 	int fib;
 	struct rtentry *rt0;
 #endif
 	struct ifaddr *ifa;
 	int error = 0;
 
 	RT_LOCK_ASSERT(rt);
 	RADIX_NODE_HEAD_LOCK_ASSERT(rnh);
 
 #ifdef RADIX_MPATH
 	/*
 	 * Multipath build: removal goes through rtrequest1_fib() with
 	 * RTF_RNH_LOCKED set; the route lock is dropped around that call.
 	 */
 	fib = rt->rt_fibnum;
 	bzero(&info, sizeof(info));
 	info.rti_ifp = rt->rt_ifp;
 	info.rti_flags = RTF_RNH_LOCKED;
 	info.rti_info[RTAX_DST] = rt_key(rt);
 	info.rti_info[RTAX_GATEWAY] = rt->rt_ifa->ifa_addr;
 
 	RT_UNLOCK(rt);
 	error = rtrequest1_fib(RTM_DELETE, &info, &rt0, fib);
 
 	if (error == 0 && rt0 != NULL) {
 		/* Continue with the entry the delete request handed back. */
 		rt = rt0;
 		RT_LOCK(rt);
 	} else if (error != 0) {
 		/* Re-take the lock so the caller's view is unchanged. */
 		RT_LOCK(rt);
 		return (error);
 	}
 #else
 	/*
 	 * Remove the item from the tree; it should be there,
 	 * but when callers invoke us blindly it may not (sigh).
 	 */
 	rn = rnh->rnh_deladdr(rt_key(rt), rt_mask(rt), rnh);
 	if (rn == NULL) {
 		error = ESRCH;
 		goto bad;
 	}
 	KASSERT((rn->rn_flags & (RNF_ACTIVE | RNF_ROOT)) == 0,
 		("unexpected flags 0x%x", rn->rn_flags));
 	KASSERT(rt == RNTORT(rn),
 		("lookup mismatch, rt %p rn %p", rt, rn));
 #endif /* RADIX_MPATH */
 
 	rt->rt_flags &= ~RTF_UP;
 
 	/*
 	 * Give the protocol a chance to keep things in sync.
 	 */
 	if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest) {
 		struct rt_addrinfo info;
 
 		bzero((caddr_t)&info, sizeof(info));
 		info.rti_flags = rt->rt_flags;
 		info.rti_info[RTAX_DST] = rt_key(rt);
 		info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
 		info.rti_info[RTAX_NETMASK] = rt_mask(rt);
 		ifa->ifa_rtrequest(RTM_DELETE, rt, &info);
 	}
 
 	/*
 	 * one more rtentry floating around that is not
 	 * linked to the routing table.
 	 */
 	V_rttrash++;
 #if !defined(RADIX_MPATH)
 bad:
 #endif
 	return (error);
 }
 
 #if 0
 int p_sockaddr(char *buf, int buflen, struct sockaddr *s);
 int rt_print(char *buf, int buflen, struct rtentry *rt);
 
 /*
  * Format the address held in an AF_INET or AF_INET6 sockaddr into buf via
  * inet_ntop().  Returns the length of the resulting string, or 0 when the
  * address family is not handled or the conversion fails.
  */
 int
 p_sockaddr(char *buf, int buflen, struct sockaddr *s)
 {
 	void *paddr;
 
 	/* Locate the raw address bytes for the families we know about. */
 	if (s->sa_family == AF_INET)
 		paddr = &((struct sockaddr_in *)s)->sin_addr;
 	else if (s->sa_family == AF_INET6)
 		paddr = &((struct sockaddr_in6 *)s)->sin6_addr;
 	else
 		return (0);
 
 	if (inet_ntop(s->sa_family, paddr, buf, buflen) == NULL)
 		return (0);
 
 	return (strlen(buf));
 }
 
 /*
  * Render a route into buf: the key, then '/' and the mask for non-host
  * routes, then '>' and the gateway for gateway routes.  Each address is
  * formatted by p_sockaddr().  Returns the accumulated offset into buf.
  */
 int
 rt_print(char *buf, int buflen, struct rtentry *rt)
 {
 	int off;
 
 	off = p_sockaddr(buf, buflen, rt_key(rt));
 
 	if ((rt->rt_flags & RTF_HOST) == 0) {
 		buf[off] = '/';
 		off++;
 		off += p_sockaddr(buf + off, buflen - off, rt_mask(rt));
 	}
 
 	if ((rt->rt_flags & RTF_GATEWAY) != 0) {
 		buf[off] = '>';
 		off++;
 		off += p_sockaddr(buf + off, buflen - off, rt->rt_gateway);
 	}
 
 	return (off);
 }
 #endif
 
 #ifdef RADIX_MPATH
 /*
  * Multipath-aware part of a route request.  Looks up dst/netmask in rnh and
  * selects the chain entry whose gateway matches RTAX_GATEWAY.
  *
  * Returns ESRCH when no such entry exists, ENOENT when the matched entry is
  * the first in its chain and the request is RTM_DELETE (the code below
  * expects the "normal delete code" to handle that case), or 0 after
  * unlinking the entry.  Any request other than RTM_DELETE ends in panic().
  * On the 0 path the entry is handed back through ret_nrt if non-NULL,
  * otherwise it is released with RTFREE_LOCKED().
  */
 static int
 rn_mpath_update(int req, struct rt_addrinfo *info,
     struct radix_node_head *rnh, struct rtentry **ret_nrt)
 {
 	/*
 	 * if we got multipath routes, we require users to specify
 	 * a matching RTAX_GATEWAY.
 	 */
 	struct rtentry *rt, *rto = NULL;
 	struct radix_node *rn;
 	int error = 0;
 
 	rn = rnh->rnh_lookup(dst, netmask, rnh);
 	if (rn == NULL)
 		return (ESRCH);
 	/* rto remembers the head of the chain; rt becomes the matched entry. */
 	rto = rt = RNTORT(rn);
 
 	rt = rt_mpath_matchgate(rt, gateway);
 	if (rt == NULL)
 		return (ESRCH);
 	/*
 	 * this is the first entry in the chain
 	 */
 	if (rto == rt) {
 		rn = rn_mpath_next((struct radix_node *)rt);
 		/*
 		 * there is another entry, now it's active
 		 */
 		if (rn) {
 			rto = RNTORT(rn);
 			RT_LOCK(rto);
 			rto->rt_flags |= RTF_UP;
 			RT_UNLOCK(rto);
 		} else if (rt->rt_flags & RTF_GATEWAY) {
 			/*
 			 * For gateway routes, we need to
 			 * make sure that we are deleting
 			 * the correct gateway.
 			 * rt_mpath_matchgate() does not
 			 * check the case when there is only
 			 * one route in the chain.
 			 *
 			 * NOTE(review): the ESRCH set on a mismatch is not
 			 * returned directly; control falls through to the
 			 * code below, which for RTM_DELETE overwrites it
 			 * with ENOENT -- confirm this is intended.
 			 */
 			if (gateway &&
 			    (rt->rt_gateway->sa_len != gateway->sa_len ||
 				memcmp(rt->rt_gateway, gateway, gateway->sa_len)))
 				error = ESRCH;
 			else {
 				/*
 				 * remove from tree before returning it
 				 * to the caller
 				 */
 				rn = rnh->rnh_deladdr(dst, netmask, rnh);
 				KASSERT(rt == RNTORT(rn), ("radix node disappeared"));
 				goto gwdelete;
 			}
 			
 		}
 		/*
 		 * use the normal delete code to remove
 		 * the first entry
 		 */
 		if (req != RTM_DELETE) 
 			goto nondelete;
 
 		error = ENOENT;
 		goto done;
 	}
 		
 	/*
 	 * if the entry is 2nd and on up
 	 */
 	if ((req == RTM_DELETE) && !rt_mpath_deldup(rto, rt))
 		panic ("rtrequest1: rt_mpath_deldup");
 gwdelete:
 	/* Take a reference so the entry stays valid while it is handed back. */
 	RT_LOCK(rt);
 	RT_ADDREF(rt);
 	if (req == RTM_DELETE) {
 		rt->rt_flags &= ~RTF_UP;
 		/*
 		 * One more rtentry floating around that is not
 		 * linked to the routing table. rttrash will be decremented
 		 * when RTFREE(rt) is eventually called.
 		 */
 		V_rttrash++;
 	}
 	
 nondelete:
 	/* Only RTM_DELETE is implemented here; anything else is fatal. */
 	if (req != RTM_DELETE)
 		panic("unrecognized request %d", req);
 	
 
 	/*
 	 * If the caller wants it, then it can have it,
 	 * but it's up to it to free the rtentry as we won't be
 	 * doing it.
 	 */
 	if (ret_nrt) {
 		*ret_nrt = rt;
 		RT_UNLOCK(rt);
 	} else
 		RTFREE_LOCKED(rt);
 done:
 	return (error);
 }
 #endif
 
 /*
  * Carry out a routing request (RTM_DELETE, RTM_RESOLVE, RTM_ADD or
  * RTM_CHANGE) described by *info against the table for the given fib.
  *
  * The radix head lock is taken here unless RTF_RNH_LOCKED is set in the
  * request flags, in which case the caller must already hold it.  If ret_nrt
  * is non-NULL it receives the affected rtentry (for RTM_DELETE the caller
  * then owns freeing it; for RTM_ADD it carries one extra reference).
  *
  * Returns 0 on success or an errno value: EAFNOSUPPORT when there is no
  * table for the address family, EOPNOTSUPP for an unknown request, and the
  * per-request errors noted below.
  *
  * The bare names dst, gateway, netmask and flags used in this function are
  * the file-local macros over *info, so assigning to them modifies *info.
  */
 int
 rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt,
 				u_int fibnum)
 {
 	int error = 0, needlock = 0;
 	struct rtentry *rt;
 #ifdef FLOWTABLE
 	struct rtentry *rt0;
 #endif
 	struct radix_node *rn;
 	struct radix_node_head *rnh;
 	struct ifaddr *ifa;
 	struct sockaddr *ndst;
 	struct sockaddr_storage mdst;
 #define senderr(x) { error = x ; goto bad; }
 
 	KASSERT((fibnum < rt_numfibs), ("rtrequest1_fib: bad fibnum"));
 	/* Families other than INET/INET6 always use the default fib. */
 	switch (dst->sa_family) {
 	case AF_INET6:
 	case AF_INET:
 		/* We support multiple FIBs. */
 		break;
 	default:
 		fibnum = RT_DEFAULT_FIB;
 		break;
 	}
 
 	/*
 	 * Find the correct routing tree to use for this Address Family
 	 */
 	rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
 	if (rnh == NULL)
 		return (EAFNOSUPPORT);
 	/*
 	 * RTF_RNH_LOCKED is a request-only flag: remember whether we have to
 	 * lock the head ourselves, then strip it so it is not stored.
 	 */
 	needlock = ((flags & RTF_RNH_LOCKED) == 0);
 	flags &= ~RTF_RNH_LOCKED;
 	if (needlock)
 		RADIX_NODE_HEAD_LOCK(rnh);
 	else
 		RADIX_NODE_HEAD_LOCK_ASSERT(rnh);
 	/*
 	 * If we are adding a host route then we don't want to put
 	 * a netmask in the tree, nor do we want to clone it.
 	 */
 	if (flags & RTF_HOST)
 		netmask = NULL;
 
 	switch (req) {
 	case RTM_DELETE:
 		/*
 		 * Look up by the masked destination.
 		 * NOTE(review): dst is redirected to the local mdst here; if
 		 * dst is the macro over info->rti_info[RTAX_DST], the caller's
 		 * info still points at this stack buffer after return --
 		 * confirm callers do not reuse it.
 		 */
 		if (netmask) {
 			rt_maskedcopy(dst, (struct sockaddr *)&mdst, netmask);
 			dst = (struct sockaddr *)&mdst;
 		}
 #ifdef RADIX_MPATH
 		if (rn_mpath_capable(rnh)) {
 			error = rn_mpath_update(req, info, rnh, ret_nrt);
 			/*
 			 * "bad" holds true for the success case
 			 * as well
 			 */
 			if (error != ENOENT)
 				goto bad;
 			/* ENOENT: fall through to the regular delete below. */
 			error = 0;
 		}
 #endif
 		/* Pinned routes may only be removed by RTF_PINNED requests. */
 		if ((flags & RTF_PINNED) == 0) {
 			/* Check if target route can be deleted */
 			rt = (struct rtentry *)rnh->rnh_lookup(dst,
 			    netmask, rnh);
 			if ((rt != NULL) && (rt->rt_flags & RTF_PINNED))
 				senderr(EADDRINUSE);
 		}
 
 		/*
 		 * Remove the item from the tree and return it.
 		 * Complain if it is not there and do no more processing.
 		 */
 		rn = rnh->rnh_deladdr(dst, netmask, rnh);
 		if (rn == NULL)
 			senderr(ESRCH);
 		if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT))
 			panic ("rtrequest delete");
 		rt = RNTORT(rn);
 		RT_LOCK(rt);
 		RT_ADDREF(rt);
 		rt->rt_flags &= ~RTF_UP;
 
 		/*
 		 * give the protocol a chance to keep things in sync.
 		 */
 		if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest)
 			ifa->ifa_rtrequest(RTM_DELETE, rt, info);
 
 		/*
 		 * One more rtentry floating around that is not
 		 * linked to the routing table. rttrash will be decremented
 		 * when RTFREE(rt) is eventually called.
 		 */
 		V_rttrash++;
 
 		/*
 		 * If the caller wants it, then it can have it,
 		 * but it's up to it to free the rtentry as we won't be
 		 * doing it.
 		 */
 		if (ret_nrt) {
 			*ret_nrt = rt;
 			RT_UNLOCK(rt);
 		} else
 			RTFREE_LOCKED(rt);
 		break;
 	case RTM_RESOLVE:
 		/*
 		 * resolve was only used for route cloning
 		 * here for compat
 		 */
 		break;
 	case RTM_ADD:
 		/*
 		 * Sanity checks: a gateway route needs a gateway, and the
 		 * gateway family must match dst unless it is AF_UNSPEC or
 		 * AF_LINK.
 		 */
 		if ((flags & RTF_GATEWAY) && !gateway)
 			senderr(EINVAL);
 		if (dst && gateway && (dst->sa_family != gateway->sa_family) && 
 		    (gateway->sa_family != AF_UNSPEC) && (gateway->sa_family != AF_LINK))
 			senderr(EINVAL);
 
 		/*
 		 * Either way we end up holding one ifa reference, which is
 		 * handed to the new route or released on each error path.
 		 */
 		if (info->rti_ifa == NULL) {
 			error = rt_getifa_fib(info, fibnum);
 			if (error)
 				senderr(error);
 		} else
 			ifa_ref(info->rti_ifa);
 		ifa = info->rti_ifa;
 		rt = uma_zalloc(V_rtzone, M_NOWAIT);
 		if (rt == NULL) {
 			ifa_free(ifa);
 			senderr(ENOBUFS);
 		}
 		rt->rt_flags = RTF_UP | flags;
 		rt->rt_fibnum = fibnum;
 		/*
 		 * Add the gateway. Possibly re-malloc-ing the storage for it.
 		 */
 		RT_LOCK(rt);
 		if ((error = rt_setgate(rt, dst, gateway)) != 0) {
 			ifa_free(ifa);
 			uma_zfree(V_rtzone, rt);
 			senderr(error);
 		}
 
 		/*
 		 * point to the (possibly newly malloc'd) dest address.
 		 */
 		ndst = (struct sockaddr *)rt_key(rt);
 
 		/*
 		 * make sure it contains the value we want (masked if needed).
 		 */
 		if (netmask) {
 			rt_maskedcopy(dst, ndst, netmask);
 		} else
 			bcopy(dst, ndst, dst->sa_len);
 
 		/*
 		 * We use the ifa reference returned by rt_getifa_fib().
 		 * This moved from below so that rnh->rnh_addaddr() can
 		 * examine the ifa and ifa->ifa_ifp if it so desires.
 		 */
 		rt->rt_ifa = ifa;
 		rt->rt_ifp = ifa->ifa_ifp;
 		rt->rt_weight = 1;
 
 #ifdef RADIX_MPATH
 		/* do not permit exactly the same dst/mask/gw pair */
 		if (rn_mpath_capable(rnh) &&
 			rt_mpath_conflict(rnh, rt, netmask)) {
 			ifa_free(rt->rt_ifa);
 			Free(rt_key(rt));
 			uma_zfree(V_rtzone, rt);
 			senderr(EEXIST);
 		}
 #endif
 
 #ifdef FLOWTABLE
 		/*
 		 * Find the existing route (rt0) that currently covers dst, so
 		 * that flow-table entries using it can be flushed once the
 		 * new route is in the tree.
 		 */
 		rt0 = NULL;
 		/* "flow-table" only supports IPv6 and IPv4 at the moment. */
 		switch (dst->sa_family) {
 #ifdef INET6
 		case AF_INET6:
 #endif
 #ifdef INET
 		case AF_INET:
 #endif
 #if defined(INET6) || defined(INET)
 			rn = rnh->rnh_matchaddr(dst, rnh);
 			if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) {
 				struct sockaddr *mask;
 				u_char *m, *n;
 				int len;
 				
 				/*
 				 * compare mask to see if the new route is
 				 * more specific than the existing one
 				 */
 				rt0 = RNTORT(rn);
 				RT_LOCK(rt0);
 				RT_ADDREF(rt0);
 				RT_UNLOCK(rt0);
 				/*
 				 * A host route is already present, so
 				 * leave the flow-table entries as is.
 				 */
 				if (rt0->rt_flags & RTF_HOST) {
 					RTFREE(rt0);
 					rt0 = NULL;
 				} else if (!(flags & RTF_HOST) && netmask) {
 					mask = rt_mask(rt0);
 					len = mask->sa_len;
 					m = (u_char *)mask;
 					n = (u_char *)netmask;
 					while (len-- > 0) {
 						if (*n != *m)
 							break;
 						n++;
 						m++;
 					}
 					/*
 					 * NOTE(review): when the loop runs to
 					 * completion len is -1, not 0, so
 					 * 'len == 0' only holds if the first
 					 * differing byte is the last one, and
 					 * '*n < *m' is then evaluated one byte
 					 * past both masks -- confirm intent.
 					 */
 					if (len == 0 || (*n < *m)) {
 						RTFREE(rt0);
 						rt0 = NULL;
 					}
 				}
 			}
 #endif/* INET6 || INET */
 		}
 #endif /* FLOWTABLE */
 
 		/* XXX mtu manipulation will be done in rnh_addaddr -- itojun */
 		rn = rnh->rnh_addaddr(ndst, netmask, rnh, rt->rt_nodes);
 		/*
 		 * If it still failed to go into the tree,
 		 * then un-make it (this should be a function)
 		 */
 		if (rn == NULL) {
 			ifa_free(rt->rt_ifa);
 			Free(rt_key(rt));
 			uma_zfree(V_rtzone, rt);
 #ifdef FLOWTABLE
 			if (rt0 != NULL)
 				RTFREE(rt0);
 #endif
 			senderr(EEXIST);
 		} 
 #ifdef FLOWTABLE
 		else if (rt0 != NULL) {
 			flowtable_route_flush(dst->sa_family, rt0);
 			RTFREE(rt0);
 		}
 #endif
 
 		/*
 		 * If this protocol has something to add to this then
 		 * allow it to do that as well.
 		 */
 		if (ifa->ifa_rtrequest)
 			ifa->ifa_rtrequest(req, rt, info);
 
 		rt_setmetrics(info, rt);
 
 		/*
 		 * actually return a resultant rtentry and
 		 * give the caller a single reference.
 		 */
 		if (ret_nrt) {
 			*ret_nrt = rt;
 			RT_ADDREF(rt);
 		}
 		RT_UNLOCK(rt);
 		break;
 	case RTM_CHANGE:
 		error = rtrequest1_fib_change(rnh, info, ret_nrt, fibnum);
 		break;
 	default:
 		error = EOPNOTSUPP;
 	}
 bad:
 	/* Common exit for success and failure: undo our own head lock only. */
 	if (needlock)
 		RADIX_NODE_HEAD_UNLOCK(rnh);
 	return (error);
 #undef senderr
 }
 
 #undef dst
 #undef gateway
 #undef netmask
 #undef ifaaddr
 #undef ifpaddr
 #undef flags
 
 /*
  * RTM_CHANGE handler: update an existing route in place.
  *
  * Looks up the route for RTAX_DST/RTAX_NETMASK in rnh (for multipath-capable
  * tables, the chain entry matching RTAX_GATEWAY) and, with the route locked,
  * updates its gateway, ifa/ifp, changeable flags and metrics from *info.
  * If ret_nrt is non-NULL it receives the route with one extra reference.
  *
  * Returns 0 on success, ESRCH if no matching route exists, or the error
  * reported by rt_getifa_fib() or rt_setgate().
  *
  * The route's reference on its previous ifa is released at the point where
  * rt_ifa is actually replaced.  That covers old ifas without an
  * ifa_rtrequest hook as well, and keeps rt_ifa referenced if rt_setgate()
  * fails before the replacement.
  */
 static int
 rtrequest1_fib_change(struct radix_node_head *rnh, struct rt_addrinfo *info,
     struct rtentry **ret_nrt, u_int fibnum)
 {
 	struct rtentry *rt = NULL;
 	int error = 0;
 	int free_ifa = 0;
 
 	rt = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST],
 	    info->rti_info[RTAX_NETMASK], rnh);
 
 	if (rt == NULL)
 		return (ESRCH);
 
 #ifdef RADIX_MPATH
 	/*
 	 * If we got multipath routes,
 	 * we require users to specify a matching RTAX_GATEWAY.
 	 */
 	if (rn_mpath_capable(rnh)) {
 		rt = rt_mpath_matchgate(rt, info->rti_info[RTAX_GATEWAY]);
 		if (rt == NULL)
 			return (ESRCH);
 	}
 #endif
 
 	RT_LOCK(rt);
 
 	/*
 	 * New gateway could require new ifaddr, ifp;
 	 * flags may also be different; ifp may be specified
 	 * by ll sockaddr when protocol address is ambiguous
 	 */
 	if (((rt->rt_flags & RTF_GATEWAY) &&
 	    info->rti_info[RTAX_GATEWAY] != NULL) ||
 	    info->rti_info[RTAX_IFP] != NULL ||
 	    (info->rti_info[RTAX_IFA] != NULL &&
 	     !sa_equal(info->rti_info[RTAX_IFA], rt->rt_ifa->ifa_addr))) {
 
 		error = rt_getifa_fib(info, fibnum);
 		/*
 		 * Whatever ifa ends up in info after the lookup is released
 		 * again on the way out (see 'bad'), even on error.
 		 */
 		if (info->rti_ifa != NULL)
 			free_ifa = 1;
 
 		if (error != 0)
 			goto bad;
 	}
 
 	/*
 	 * Check if outgoing interface has changed.  If so, tell the old ifa
 	 * that the route is leaving it; the reference itself is dropped
 	 * further down, once the new ifa has been installed.
 	 */
 	if (info->rti_ifa != NULL && info->rti_ifa != rt->rt_ifa &&
 	    rt->rt_ifa != NULL && rt->rt_ifa->ifa_rtrequest != NULL)
 		rt->rt_ifa->ifa_rtrequest(RTM_DELETE, rt, info);
 
 	/* Update gateway address */
 	if (info->rti_info[RTAX_GATEWAY] != NULL) {
 		error = rt_setgate(rt, rt_key(rt), info->rti_info[RTAX_GATEWAY]);
 		if (error != 0)
 			goto bad;
 
 		/* RTF_GATEWAY follows whatever the request says. */
 		rt->rt_flags &= ~RTF_GATEWAY;
 		rt->rt_flags |= (RTF_GATEWAY & info->rti_flags);
 	}
 
 	if (info->rti_ifa != NULL && info->rti_ifa != rt->rt_ifa) {
 		struct ifaddr *oifa = rt->rt_ifa;
 
 		/*
 		 * Swap in the new ifa with its own reference, then release
 		 * the reference the route held on the old one.
 		 */
 		ifa_ref(info->rti_ifa);
 		rt->rt_ifa = info->rti_ifa;
 		rt->rt_ifp = info->rti_ifp;
 		if (oifa != NULL)
 			ifa_free(oifa);
 	}
 	/* Allow some flags to be toggled on change. */
 	rt->rt_flags &= ~RTF_FMASK;
 	rt->rt_flags |= info->rti_flags & RTF_FMASK;
 
 	/* Let the (possibly new) ifa set up its per-route state. */
 	if (rt->rt_ifa && rt->rt_ifa->ifa_rtrequest != NULL)
 	       rt->rt_ifa->ifa_rtrequest(RTM_ADD, rt, info);
 
 	rt_setmetrics(info, rt);
 
 	if (ret_nrt) {
 		*ret_nrt = rt;
 		RT_ADDREF(rt);
 	}
 bad:
 	RT_UNLOCK(rt);
 	if (free_ifa != 0)
 		ifa_free(info->rti_ifa);
 	return (error);
 }
 
 /*
  * Copy the metrics selected by info->rti_mflags from info->rti_rmx into rt.
  * RTV_MTU and RTV_WEIGHT are copied as-is.  A non-zero RTV_EXPIRE value is
  * rebased by (time_uptime - time_second) before being stored; zero is
  * stored unchanged.
  */
 static void
 rt_setmetrics(const struct rt_addrinfo *info, struct rtentry *rt)
 {
 
 	if ((info->rti_mflags & RTV_MTU) != 0)
 		rt->rt_mtu = info->rti_rmx->rmx_mtu;
 
 	if ((info->rti_mflags & RTV_WEIGHT) != 0)
 		rt->rt_weight = info->rti_rmx->rmx_weight;
 
 	if ((info->rti_mflags & RTV_EXPIRE) == 0)
 		return;
 
 	/* Timebase conversion for the expiry time; 0 is kept as 0. */
 	if (info->rti_rmx->rmx_expire)
 		rt->rt_expire =
 		    info->rti_rmx->rmx_expire - time_second + time_uptime;
 	else
 		rt->rt_expire = 0;
 }
 
 /*
  * Store 'gate' as the gateway of rt.
  *
  * The route key and the gateway share one allocation, key first.  When the
  * route has no gateway yet, or the new gateway does not fit in the space of
  * the current one, a new chunk is allocated, 'dst' is copied in as the key,
  * and the old chunk is freed.  Otherwise the gateway is overwritten in place.
  * Returns 0, or ENOBUFS if the allocation fails.
  */
 int
 rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate)
 {
 	/* XXX dst may be overwritten, so both sizes are taken up front. */
 	int dlen = SA_SIZE(dst);
 	int glen = SA_SIZE(gate);
 #ifdef INVARIANTS
 	struct radix_node_head *rnh;
 
 	rnh = rt_tables_get_rnh(rt->rt_fibnum, dst->sa_family);
 #endif
 
 	RT_LOCK_ASSERT(rt);
 	RADIX_NODE_HEAD_LOCK_ASSERT(rnh);
 
 	if (rt->rt_gateway == NULL || glen > SA_SIZE(rt->rt_gateway)) {
 		caddr_t blk;
 
 		/* No room in the current chunk: get one for key + gateway. */
 		R_Malloc(blk, caddr_t, dlen + glen);
 		if (blk == NULL)
 			return ENOBUFS;
 		/*
 		 * Copy the key from *dst rather than from rt_key(rt): for a
 		 * freshly allocated route rt_key(rt) is still NULL (as is
 		 * rt->rt_gateway).  The copy is made before the old chunk is
 		 * released; Free() copes with a NULL argument.
 		 */
 		bcopy(dst, blk, dlen);
 		Free(rt_key(rt));
 		rt_key(rt) = (struct sockaddr *)blk;
 		rt->rt_gateway = (struct sockaddr *)(blk + dlen);
 	}
 
 	/* Either way rt_gateway now has room for the new value. */
 	bcopy(gate, rt->rt_gateway, glen);
 
 	return (0);
 }
 
 void
 rt_maskedcopy(struct sockaddr *src, struct sockaddr *dst, struct sockaddr *netmask)
 {
 	u_char *cp1 = (u_char *)src;
 	u_char *cp2 = (u_char *)dst;
 	u_char *cp3 = (u_char *)netmask;
 	u_char *cplim = cp2 + *cp3;
 	u_char *cplim2 = cp2 + *cp1;
 
 	*cp2++ = *cp1++; *cp2++ = *cp1++; /* copies sa_len & sa_family */
 	cp3 += 2;
 	if (cplim > cplim2)
 		cplim = cplim2;
 	while (cp2 < cplim)
 		*cp2++ = *cp1++ & *cp3++;
 	if (cp2 < cplim2)
 		bzero((caddr_t)cp2, (unsigned)(cplim2 - cp2));
 }
 
 /*
  * Set up a routing table entry, normally
  * for an interface.
  */
 #define _SOCKADDR_TMPSIZE 128 /* Not too big.. kernel stack size is limited */
 /*
  * Add or delete the route that goes with interface address 'ifa'.
  *
  * cmd is the routing request (the code below distinguishes RTM_ADD and
  * RTM_DELETE), flags are route flags (RTF_HOST selects ifa_dstaddr as the
  * destination with no netmask), and fibnum is either one fib or RT_ALL_FIBS.
  * The request is issued once per selected fib and a routing-socket message
  * is generated for every fib where it succeeded.
  *
  * Returns 0 on success.  For RTM_DELETE an error (EHOSTUNREACH or
  * ENETUNREACH) is returned only if nothing was deleted in any fib; for
  * other commands the last per-fib failure is returned.
  */
 static inline  int
 rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum)
 {
 	struct sockaddr *dst;
 	struct sockaddr *netmask;
 	struct rtentry *rt = NULL;
 	struct rt_addrinfo info;
 	int error = 0;
 	int startfib, endfib;
 	char tempbuf[_SOCKADDR_TMPSIZE];
 	int didwork = 0;
 	int a_failure = 0;
 	static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK};
 	struct radix_node_head *rnh;
 
 	if (flags & RTF_HOST) {
 		dst = ifa->ifa_dstaddr;
 		netmask = NULL;
 	} else {
 		dst = ifa->ifa_addr;
 		netmask = ifa->ifa_netmask;
 	}
 	if (dst->sa_len == 0)
 		return(EINVAL);
 	/* Families other than INET/INET6 only ever use the default fib. */
 	switch (dst->sa_family) {
 	case AF_INET6:
 	case AF_INET:
 		/* We support multiple FIBs. */
 		break;
 	default:
 		fibnum = RT_DEFAULT_FIB;
 		break;
 	}
 	/*
 	 * Work out the range of fibs to visit.  For RT_ALL_FIBS an add is
 	 * confined to the interface's own fib when rt_add_addr_allfibs is 0;
 	 * everything else walks every fib.
 	 */
 	if (fibnum == RT_ALL_FIBS) {
 		if (rt_add_addr_allfibs == 0 && cmd == (int)RTM_ADD) {
 			startfib = endfib = ifa->ifa_ifp->if_fib;
 		} else {
 			startfib = 0;
 			endfib = rt_numfibs - 1;
 		}
 	} else {
 		KASSERT((fibnum < rt_numfibs), ("rtinit1: bad fibnum"));
 		startfib = fibnum;
 		endfib = fibnum;
 	}
 
 	/*
 	 * If it's a delete, check that if it exists,
 	 * it's on the correct interface or we might scrub
 	 * a route to another ifa which would
 	 * be confusing at best and possibly worse.
 	 */
 	if (cmd == RTM_DELETE) {
 		/*
 		 * It's a delete, so it should already exist..
 		 * If it's a net, mask off the host bits
 		 * (Assuming we have a mask)
 		 * XXX this is kinda inet specific..
 		 *
 		 * NOTE(review): the masked copy goes into tempbuf, which is
 		 * _SOCKADDR_TMPSIZE bytes; this presumes dst->sa_len never
 		 * exceeds that -- confirm.
 		 */
 		if (netmask != NULL) {
 			rt_maskedcopy(dst, (struct sockaddr *)tempbuf, netmask);
 			dst = (struct sockaddr *)tempbuf;
 		}
 	}
 	/*
 	 * Now go through all the requested tables (fibs) and do the
 	 * requested action. Realistically, this will either be fib 0
 	 * for protocols that don't do multiple tables or all the
 	 * tables for those that do.
 	 */
 	for ( fibnum = startfib; fibnum <= endfib; fibnum++) {
 		if (cmd == RTM_DELETE) {
 			struct radix_node *rn;
 			/*
 			 * Look up an rtentry that is in the routing tree and
 			 * contains the correct info.
 			 */
 			rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
 			if (rnh == NULL)
 				/* this table doesn't exist but others might */
 				continue;
 			RADIX_NODE_HEAD_RLOCK(rnh);
 			rn = rnh->rnh_lookup(dst, netmask, rnh);
 #ifdef RADIX_MPATH
 			if (rn_mpath_capable(rnh)) {
 
 				if (rn == NULL) 
 					error = ESRCH;
 				else {
 					rt = RNTORT(rn);
 					/*
 					 * for interface route the
 					 * rt->rt_gateway is sockaddr_intf
 					 * for cloning ARP entries, so
 					 * rt_mpath_matchgate must use the
 					 * interface address
 					 */
 					rt = rt_mpath_matchgate(rt,
 					    ifa->ifa_addr);
 					if (rt == NULL) 
 						error = ESRCH;
 				}
 			}
 #endif
 			/*
 			 * Skip this fib unless the entry exists, is not a
 			 * root node and belongs to this ifa.
 			 * NOTE(review): this assignment overwrites any ESRCH
 			 * set in the RADIX_MPATH block above, and tests
 			 * RNTORT(rn) rather than the matchgate result --
 			 * confirm that is intended.
 			 */
 			error = (rn == NULL ||
 			    (rn->rn_flags & RNF_ROOT) ||
 			    RNTORT(rn)->rt_ifa != ifa);
 			RADIX_NODE_HEAD_RUNLOCK(rnh);
 			if (error) {
 				/* this is only an error if bad on ALL tables */
 				continue;
 			}
 		}
 		/*
 		 * Do the actual request
 		 */
 		bzero((caddr_t)&info, sizeof(info));
 		info.rti_ifa = ifa;
 		info.rti_flags = flags |
 		    (ifa->ifa_flags & ~IFA_RTSELF) | RTF_PINNED;
 		info.rti_info[RTAX_DST] = dst;
 		/*
 		 * doing this for compatibility reasons
 		 * (adds use an empty AF_LINK sockaddr as gateway; its type
 		 * and index are filled in after the route is created)
 		 */
 		if (cmd == RTM_ADD)
 			info.rti_info[RTAX_GATEWAY] =
 			    (struct sockaddr *)&null_sdl;
 		else
 			info.rti_info[RTAX_GATEWAY] = ifa->ifa_addr;
 		info.rti_info[RTAX_NETMASK] = netmask;
 		error = rtrequest1_fib(cmd, &info, &rt, fibnum);
 
 		if ((error == EEXIST) && (cmd == RTM_ADD)) {
 			/*
 			 * Interface route addition failed.
 			 * Atomically delete current prefix generating
 			 * RTM_DELETE message, and retry adding
 			 * interface prefix.
 			 * The head lock is held across both requests, so
 			 * they are issued with RTF_RNH_LOCKED.
 			 */
 			rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
 			RADIX_NODE_HEAD_LOCK(rnh);
 
 			/* Delete old prefix */
 			info.rti_ifa = NULL;
 			info.rti_flags = RTF_RNH_LOCKED;
 
 			error = rtrequest1_fib(RTM_DELETE, &info, NULL, fibnum);
 			if (error == 0) {
 				info.rti_ifa = ifa;
 				info.rti_flags = flags | RTF_RNH_LOCKED |
 				    (ifa->ifa_flags & ~IFA_RTSELF) | RTF_PINNED;
 				error = rtrequest1_fib(cmd, &info, &rt, fibnum);
 			}
 
 			RADIX_NODE_HEAD_UNLOCK(rnh);
 		}
 
 
 		if (error == 0 && rt != NULL) {
 			/*
 			 * notify any listening routing agents of the change
 			 */
 			RT_LOCK(rt);
 #ifdef RADIX_MPATH
 			/*
 			 * in case address alias finds the first address
 			 * e.g. ifconfig bge0 192.0.2.246/24
 			 * e.g. ifconfig bge0 192.0.2.247/24
 			 * the address set in the route is 192.0.2.246
 			 * so we need to replace it with 192.0.2.247
 			 */
 			if (memcmp(rt->rt_ifa->ifa_addr,
 			    ifa->ifa_addr, ifa->ifa_addr->sa_len)) {
 				ifa_free(rt->rt_ifa);
 				ifa_ref(ifa);
 				rt->rt_ifp = ifa->ifa_ifp;
 				rt->rt_ifa = ifa;
 			}
 #endif
 			/*
 			 * doing this for compatibility reasons
 			 */
 			if (cmd == RTM_ADD) {
 			    ((struct sockaddr_dl *)rt->rt_gateway)->sdl_type  =
 				rt->rt_ifp->if_type;
 			    ((struct sockaddr_dl *)rt->rt_gateway)->sdl_index =
 				rt->rt_ifp->if_index;
 			}
 			/*
 			 * Hold a temporary reference so the route lock can be
 			 * dropped while the message is generated.
 			 */
 			RT_ADDREF(rt);
 			RT_UNLOCK(rt);
 			rt_newaddrmsg_fib(cmd, ifa, error, rt, fibnum);
 			RT_LOCK(rt);
 			RT_REMREF(rt);
 			if (cmd == RTM_DELETE) {
 				/*
 				 * If we are deleting, and we found an entry,
 				 * then it's been removed from the tree..
 				 * now throw it away.
 				 */
 				RTFREE_LOCKED(rt);
 			} else {
 				if (cmd == RTM_ADD) {
 					/*
 					 * We just wanted to add it..
 					 * we don't actually need a reference.
 					 */
 					RT_REMREF(rt);
 				}
 				RT_UNLOCK(rt);
 			}
 			didwork = 1;
 		}
 		if (error)
 			a_failure = error;
 	}
 	if (cmd == RTM_DELETE) {
 		if (didwork) {
 			error = 0;
 		} else {
 			/* we only give an error if it wasn't in any table */
 			error = ((flags & RTF_HOST) ?
 			    EHOSTUNREACH : ENETUNREACH);
 		}
 	} else {
 		if (a_failure) {
 			/* return an error if any of them failed */
 			error = a_failure;
 		}
 	}
 	return (error);
 }
 
 /*
  * Set up a routing table entry, normally
  * for an interface.
  */
 /*
  * Public entry point for interface routes: picks the fib scope from the
  * destination's address family and hands off to rtinit1().  AF_INET and
  * AF_INET6 use RT_ALL_FIBS; every other family uses RT_DEFAULT_FIB.
  */
 int
 rtinit(struct ifaddr *ifa, int cmd, int flags)
 {
 	struct sockaddr *sa;
 	int fib;
 
 	/* Host routes are keyed on the peer address, others on our own. */
 	sa = (flags & RTF_HOST) ? ifa->ifa_dstaddr : ifa->ifa_addr;
 
 	if (sa->sa_family == AF_INET6 || sa->sa_family == AF_INET) {
 		/* We do support multiple FIBs. */
 		fib = RT_ALL_FIBS;
 	} else
 		fib = RT_DEFAULT_FIB;
 
 	return (rtinit1(ifa, cmd, flags, fib));
 }
 
 /*
  * Announce interface address arrival/withdrawal.
  * Returns 0 on success.
  */
 int
 rt_addrmsg(int cmd, struct ifaddr *ifa, int fibnum)
 {
 
 	KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE,
 	    ("unexpected cmd %d", cmd));
 	
 	KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs),
 	    ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs));
 
 #if defined(INET) || defined(INET6)
 #ifdef SCTP
 	/*
 	 * notify the SCTP stack
 	 * this will only get called when an address is added/deleted
 	 * XXX pass the ifaddr struct instead if ifa->ifa_addr...
 	 */
 	sctp_addr_change(ifa, cmd);
 #endif /* SCTP */
 #endif
 	return (rtsock_addrmsg(cmd, ifa, fibnum));
 }
 
 /*
  * Announce route addition/removal.
  * Users of this function MUST validate input data BEFORE calling.
  * However we have to be able to handle invalid data:
  * if some userland app sends us "invalid" route message (invalid mask,
  * no dst, wrong address families, etc...) we need to pass it back
  * to app (and any other rtsock consumers) with rtm_errno field set to
  * non-zero value.
  * Returns 0 on success.
  */
 int
 rt_routemsg(int cmd, struct ifnet *ifp, int error, struct rtentry *rt,
     int fibnum)
 {
 
 	KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE,
 	    ("unexpected cmd %d", cmd));
 	
 	KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs),
 	    ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs));
 
 	KASSERT(rt_key(rt) != NULL, (":%s: rt_key must be supplied", __func__));
 
 	return (rtsock_routemsg(cmd, ifp, error, rt, fibnum));
 }
 
 void
 rt_newaddrmsg(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt)
 {
 
 	rt_newaddrmsg_fib(cmd, ifa, error, rt, RT_ALL_FIBS);
 }
 
 /*
  * This is called to generate messages from the routing socket
  * indicating a network interface has had addresses associated with it.
  */
 void
 rt_newaddrmsg_fib(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt,
     int fibnum)
 {
 	const int adding = (cmd == RTM_ADD);
 
 	KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE,
 		("unexpected cmd %u", cmd));
 	KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs),
 	    ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs));
 
 	/*
 	 * Ordering matters: on add the address is announced before its
 	 * route, on delete the route is announced before the address.
 	 * The route message is only sent when a route was supplied.
 	 */
 	if (adding)
 		rt_addrmsg(cmd, ifa, fibnum);
 	if (rt != NULL)
 		rt_routemsg(cmd, ifa->ifa_ifp, error, rt, fibnum);
 	if (!adding)
 		rt_addrmsg(cmd, ifa, fibnum);
 }
 
Index: user/ae/inet6/sys/net/rtsock.c
===================================================================
--- user/ae/inet6/sys/net/rtsock.c	(revision 271452)
+++ user/ae/inet6/sys/net/rtsock.c	(revision 271453)
@@ -1,1934 +1,1935 @@
 /*-
  * Copyright (c) 1988, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)rtsock.c	8.7 (Berkeley) 10/12/95
  * $FreeBSD$
  */
 #include "opt_compat.h"
 #include "opt_mpath.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/domain.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/rwlock.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/if_llatbl.h>
 #include <net/if_types.h>
 #include <net/netisr.h>
 #include <net/raw_cb.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/if_ether.h>
 #include <netinet/ip_carp.h>
 #ifdef INET6
 #include <netinet6/in6_var.h>
 #include <netinet6/scope6_var.h>
 #endif
 
 #ifdef COMPAT_FREEBSD32
 #include <sys/mount.h>
 #include <compat/freebsd32/freebsd32.h>
 
 /*
  * Interface message header used when COMPAT_FREEBSD32 is defined; its
  * size is what rtsock_msg_buffer() uses for RTM_IFINFO when the request
  * has SCTL_MASK32 set.
  * NOTE(review): presumably mirrors struct if_msghdr with fixed-width
  * fields -- confirm against net/if.h.
  */
 struct if_msghdr32 {
 	uint16_t ifm_msglen;
 	uint8_t	ifm_version;
 	uint8_t	ifm_type;
 	int32_t	ifm_addrs;
 	int32_t	ifm_flags;
 	uint16_t ifm_index;
 	struct	if_data ifm_data;
 };
 
 /*
  * Extended interface message header for COMPAT_FREEBSD32; selected by
  * rtsock_msg_buffer() for RTM_IFINFO when the request has SCTL_MASK32
  * set and the operation is NET_RT_IFLISTL.
  * NOTE(review): presumably the 32-bit counterpart of struct if_msghdrl
  * -- confirm against net/if.h.
  */
 struct if_msghdrl32 {
 	uint16_t ifm_msglen;
 	uint8_t	ifm_version;
 	uint8_t	ifm_type;
 	int32_t	ifm_addrs;
 	int32_t	ifm_flags;
 	uint16_t ifm_index;
 	uint16_t _ifm_spare1;
 	uint16_t ifm_len;
 	uint16_t ifm_data_off;
 	struct	if_data ifm_data;
 };
 
 /*
  * Extended interface-address message header for COMPAT_FREEBSD32;
  * selected by rtsock_msg_buffer() for RTM_NEWADDR/RTM_DELADDR when the
  * operation is NET_RT_IFLISTL and the request has SCTL_MASK32 set.
  * NOTE(review): presumably the 32-bit counterpart of struct ifa_msghdrl
  * -- confirm against net/if.h.
  */
 struct ifa_msghdrl32 {
 	uint16_t ifam_msglen;
 	uint8_t	ifam_version;
 	uint8_t	ifam_type;
 	int32_t	ifam_addrs;
 	int32_t	ifam_flags;
 	uint16_t ifam_index;
 	uint16_t _ifam_spare1;
 	uint16_t ifam_len;
 	uint16_t ifam_data_off;
 	int32_t	ifam_metric;
 	struct	if_data ifam_data;
 };
 #endif /* COMPAT_FREEBSD32 */
 
 /* Malloc type for routing table allocations. */
 MALLOC_DEFINE(M_RTABLE, "routetbl", "routing tables");
 
 /*
  * NB: these are not modified.
  * route_src is the source address handed to raw_input_ext() by
  * rts_input(); sa_zero is the placeholder rt_xaddrs() points at when it
  * meets a zero-length sockaddr.
  */
 static struct	sockaddr route_src = { 2, PF_ROUTE, };
 static struct	sockaddr sa_zero   = { sizeof(sa_zero), AF_INET, };
 
 /* These are external hooks for CARP. */
 int	(*carp_get_vhid_p)(struct ifaddr *);
 
 /*
  * Used by rtsock/raw_input callback code to decide whether to filter the update
  * notification to a socket bound to a particular FIB.
  * Set on an mbuf together with M_SETFIB(); tested by raw_input_rts_cb().
  */
 #define	RTS_FILTER_FIB	M_PROTO8
 
 /*
  * Per-vnet counters of attached routing sockets.  Updated under
  * RTSOCK_LOCK() in rts_attach()/rts_detach(); any_count is also read
  * without the lock to skip message generation when nobody listens.
  */
 typedef struct {
 	int	ip_count;	/* attached w/ AF_INET */
 	int	ip6_count;	/* attached w/ AF_INET6 */
 	int	any_count;	/* total attached */
 } route_cb_t;
 static VNET_DEFINE(route_cb_t, route_cb);
 #define	V_route_cb VNET(route_cb)
 
 /* Mutex serializing updates of the route_cb listener counters. */
 struct mtx rtsock_mtx;
 MTX_SYSINIT(rtsock, &rtsock_mtx, "rtsock route_cb lock", MTX_DEF);
 
 #define	RTSOCK_LOCK()	mtx_lock(&rtsock_mtx)
 #define	RTSOCK_UNLOCK()	mtx_unlock(&rtsock_mtx)
 #define	RTSOCK_LOCK_ASSERT()	mtx_assert(&rtsock_mtx, MA_OWNED)
 
 /* Parent node for the net.route sysctl tree. */
 static SYSCTL_NODE(_net, OID_AUTO, route, CTLFLAG_RD, 0, "");
 
 /*
  * Output-buffer descriptor passed to rtsock_msg_buffer() (and to the
  * sysctl walkers declared below).
  */
 struct walkarg {
 	int	w_tmemsize;	/* size of the w_tmem buffer in bytes */
 	int	w_op, w_arg;	/* w_op: request kind, e.g. NET_RT_IFLISTL */
 	caddr_t	w_tmem;		/* buffer the message is written into */
 	struct sysctl_req *w_req;	/* request; flags tested for SCTL_MASK32 */
 };
 
 /* Forward declarations of file-local (static) functions. */
 static void	rts_input(struct mbuf *m);
 static struct mbuf *rtsock_msg_mbuf(int type, struct rt_addrinfo *rtinfo);
 static int	rtsock_msg_buffer(int type, struct rt_addrinfo *rtinfo,
 			struct walkarg *w, int *plen);
 static int	rt_xaddrs(caddr_t cp, caddr_t cplim,
 			struct rt_addrinfo *rtinfo);
 static int	sysctl_dumpentry(struct radix_node *rn, void *vw);
 static int	sysctl_iflist(int af, struct walkarg *w);
 static int	sysctl_ifmalist(int af, struct walkarg *w);
 static int	route_output(struct mbuf *m, struct socket *so, ...);
 static void	rt_getmetrics(const struct rtentry *rt, struct rt_metrics *out);
 static void	rt_dispatch(struct mbuf *, sa_family_t);
 static struct sockaddr	*rtsock_fix_netmask(struct sockaddr *dst,
 			struct sockaddr *smask, struct sockaddr_storage *dmask);
 
 /*
  * netisr handler descriptor for routing socket messages: queued mbufs
  * are delivered to rts_input().  Registered in rts_init().
  */
 static struct netisr_handler rtsock_nh = {
 	.nh_name = "rtsock",
 	.nh_handler = rts_input,
 	.nh_proto = NETISR_ROUTE,
 	.nh_policy = NETISR_POLICY_SOURCE,
 };
 
 static int
 sysctl_route_netisr_maxqlen(SYSCTL_HANDLER_ARGS)
 {
 	int error, qlimit;
 
 	netisr_getqlimit(&rtsock_nh, &qlimit);
 	error = sysctl_handle_int(oidp, &qlimit, 0, req);
         if (error || !req->newptr)
                 return (error);
 	if (qlimit < 1)
 		return (EINVAL);
 	return (netisr_setqlimit(&rtsock_nh, qlimit));
 }
 SYSCTL_PROC(_net_route, OID_AUTO, netisr_maxqlen, CTLTYPE_INT|CTLFLAG_RW,
     0, 0, sysctl_route_netisr_maxqlen, "I",
     "maximum routing socket dispatch queue length");
 
 /*
  * Boot-time setup: honour the net.route.netisr_maxqlen tunable if it is
  * set, then register the rtsock netisr handler.
  */
 static void
 rts_init(void)
 {
 	int maxqlen;
 
 	if (TUNABLE_INT_FETCH("net.route.netisr_maxqlen", &maxqlen) != 0)
 		rtsock_nh.nh_qlimit = maxqlen;
 
 	netisr_register(&rtsock_nh);
 }
 SYSINIT(rtsock, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rts_init, 0);
 
 static int
 raw_input_rts_cb(struct mbuf *m, struct sockproto *proto, struct sockaddr *src,
     struct rawcb *rp)
 {
 	int fibnum;
 
 	KASSERT(m != NULL, ("%s: m is NULL", __func__));
 	KASSERT(proto != NULL, ("%s: proto is NULL", __func__));
 	KASSERT(rp != NULL, ("%s: rp is NULL", __func__));
 
 	/* No filtering requested. */
 	if ((m->m_flags & RTS_FILTER_FIB) == 0)
 		return (0);
 
 	/* Check if it is a rts and the fib matches the one of the socket. */
 	fibnum = M_GETFIB(m);
 	if (proto->sp_family != PF_ROUTE ||
 	    rp->rcb_socket == NULL ||
 	    rp->rcb_socket->so_fibnum == fibnum)
 		return (0);
 
 	/* Filtering requested and no match, the socket shall be skipped. */
 	return (1);
 }
 
 static void
 rts_input(struct mbuf *m)
 {
 	struct sockproto route_proto;
 	unsigned short *family;
 	struct m_tag *tag;
 
 	route_proto.sp_family = PF_ROUTE;
 	tag = m_tag_find(m, PACKET_TAG_RTSOCKFAM, NULL);
 	if (tag != NULL) {
 		family = (unsigned short *)(tag + 1);
 		route_proto.sp_protocol = *family;
 		m_tag_delete(m, tag);
 	} else
 		route_proto.sp_protocol = 0;
 
 	raw_input_ext(m, &route_proto, &route_src, raw_input_rts_cb);
 }
 
 /*
  * It really doesn't make any sense at all for this code to share much
  * with raw_usrreq.c, since its functionality is so restricted.  XXX
  */
 /* pru_abort handler: forwards to raw_usrreqs.pru_abort. */
 static void
 rts_abort(struct socket *so)
 {
 
 	raw_usrreqs.pru_abort(so);
 }
 
 /* pru_close handler: forwards to raw_usrreqs.pru_close. */
 static void
 rts_close(struct socket *so)
 {
 
 	raw_usrreqs.pru_close(so);
 }
 
 /* pru_accept is EOPNOTSUPP */
 
 /*
  * pru_attach handler.  Allocates the raw control block, records the
  * creating process' FIB in the socket, attaches through raw_attach()
  * and accounts for the new listener in V_route_cb.  The socket is then
  * marked connected with SO_USELOOPBACK set.
  *
  * Returns 0 on success or the error from raw_attach().
  */
 static int
 rts_attach(struct socket *so, int proto, struct thread *td)
 {
 	struct rawcb *rp;
 	int error;
 
 	KASSERT(so->so_pcb == NULL, ("rts_attach: so_pcb != NULL"));
 
 	/*
 	 * XXX
 	 * An M_WAITOK allocation sleeps until it succeeds and never
 	 * returns NULL, so no failure check is needed here.
 	 */
 	rp = malloc(sizeof *rp, M_PCB, M_WAITOK | M_ZERO);
 
 	so->so_pcb = (caddr_t)rp;
 	so->so_fibnum = td->td_proc->p_fibnum;
 	error = raw_attach(so, proto);
 	rp = sotorawcb(so);
 	if (error) {
 		/* Undo the pcb hookup before reporting the failure. */
 		so->so_pcb = NULL;
 		free(rp, M_PCB);
 		return (error);
 	}
 	RTSOCK_LOCK();
 	switch(rp->rcb_proto.sp_protocol) {
 	case AF_INET:
 		V_route_cb.ip_count++;
 		break;
 	case AF_INET6:
 		V_route_cb.ip6_count++;
 		break;
 	}
 	V_route_cb.any_count++;
 	RTSOCK_UNLOCK();
 	soisconnected(so);
 	so->so_options |= SO_USELOOPBACK;
 	return (0);
 }
 
 /* pru_bind handler: forwards to raw_usrreqs.pru_bind. */
 static int
 rts_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 
 	return (raw_usrreqs.pru_bind(so, nam, td)); /* xxx just EINVAL */
 }
 
 /* pru_connect handler: forwards to raw_usrreqs.pru_connect. */
 static int
 rts_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 
 	return (raw_usrreqs.pru_connect(so, nam, td)); /* XXX just EINVAL */
 }
 
 /* pru_connect2 is EOPNOTSUPP */
 /* pru_control is EOPNOTSUPP */
 
 /*
  * pru_detach handler.  Drops this socket from the V_route_cb listener
  * counters (the per-family one matching its protocol, plus the total)
  * under RTSOCK_LOCK(), then hands off to raw_usrreqs.pru_detach.
  */
 static void
 rts_detach(struct socket *so)
 {
 	struct rawcb *rp;
 	int af;
 
 	rp = sotorawcb(so);
 	KASSERT(rp != NULL, ("rts_detach: rp == NULL"));
 
 	RTSOCK_LOCK();
 	af = rp->rcb_proto.sp_protocol;
 	if (af == AF_INET)
 		V_route_cb.ip_count--;
 	else if (af == AF_INET6)
 		V_route_cb.ip6_count--;
 	V_route_cb.any_count--;
 	RTSOCK_UNLOCK();
 
 	raw_usrreqs.pru_detach(so);
 }
 
 /* pru_disconnect handler: forwards to raw_usrreqs.pru_disconnect. */
 static int
 rts_disconnect(struct socket *so)
 {
 
 	return (raw_usrreqs.pru_disconnect(so));
 }
 
 /* pru_listen is EOPNOTSUPP */
 
 /* pru_peeraddr handler: forwards to raw_usrreqs.pru_peeraddr. */
 static int
 rts_peeraddr(struct socket *so, struct sockaddr **nam)
 {
 
 	return (raw_usrreqs.pru_peeraddr(so, nam));
 }
 
 /* pru_rcvd is EOPNOTSUPP */
 /* pru_rcvoob is EOPNOTSUPP */
 
 /* pru_send handler: forwards all arguments to raw_usrreqs.pru_send. */
 static int
 rts_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
 	 struct mbuf *control, struct thread *td)
 {
 
 	return (raw_usrreqs.pru_send(so, flags, m, nam, control, td));
 }
 
 /* pru_sense is null */
 
 /* pru_shutdown handler: forwards to raw_usrreqs.pru_shutdown. */
 static int
 rts_shutdown(struct socket *so)
 {
 
 	return (raw_usrreqs.pru_shutdown(so));
 }
 
 /* pru_sockaddr handler: forwards to raw_usrreqs.pru_sockaddr. */
 static int
 rts_sockaddr(struct socket *so, struct sockaddr **nam)
 {
 
 	return (raw_usrreqs.pru_sockaddr(so, nam));
 }
 
 /*
  * User-request switch for routing sockets, built from the rts_*
  * handlers above.  Requests not listed here are left unset; the
  * comments between the handlers note which ones (accept, connect2,
  * control, listen, rcvd, rcvoob, sense).
  */
 static struct pr_usrreqs route_usrreqs = {
 	.pru_abort =		rts_abort,
 	.pru_attach =		rts_attach,
 	.pru_bind =		rts_bind,
 	.pru_connect =		rts_connect,
 	.pru_detach =		rts_detach,
 	.pru_disconnect =	rts_disconnect,
 	.pru_peeraddr =		rts_peeraddr,
 	.pru_send =		rts_send,
 	.pru_shutdown =		rts_shutdown,
 	.pru_sockaddr =		rts_sockaddr,
 	.pru_close =		rts_close,
 };
 
 /* Guarded so the union is defined only if no earlier header did so. */
 #ifndef _SOCKADDR_UNION_DEFINED
 #define	_SOCKADDR_UNION_DEFINED
 /*
  * The union of all possible address formats we handle.
  * Used as scratch storage for the RTAX_IFA address built by
  * rtm_get_jailed().
  */
 union sockaddr_union {
 	struct sockaddr		sa;
 	struct sockaddr_in	sin;
 	struct sockaddr_in6	sin6;
 };
 #endif /* _SOCKADDR_UNION_DEFINED */
 
 /*
  * Choose the RTAX_IFA address reported for a route so that it is one
  * the credential's jail is allowed to see.
  *
  * If the route's own interface address passes prison_if() it is used
  * as is.  Otherwise the addresses of ifp are scanned for one of the
  * destination's family (AF_INET/AF_INET6) that belongs to the jail; if
  * none is found the jail's default address (prison_get_ip4/ip6) is
  * used.  The chosen address is built in *saun and info->rti_info[RTAX_IFA]
  * is pointed at it, so saun must outlive the use of info.
  *
  * Returns 0 on success, ESRCH when no suitable address exists or the
  * destination family is not handled.
  */
 static int
 rtm_get_jailed(struct rt_addrinfo *info, struct ifnet *ifp,
     struct rtentry *rt, union sockaddr_union *saun, struct ucred *cred)
 {
 
 	/* First, see if the returned address is part of the jail. */
 	if (prison_if(cred, rt->rt_ifa->ifa_addr) == 0) {
 		info->rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr;
 		return (0);
 	}
 
 	switch (info->rti_info[RTAX_DST]->sa_family) {
 #ifdef INET
 	case AF_INET:
 	{
 		struct in_addr ia;
 		struct ifaddr *ifa;
 		int found;
 
 		found = 0;
 		/*
 		 * Try to find an address on the given outgoing interface
 		 * that belongs to the jail.
 		 */
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			struct sockaddr *sa;
 			sa = ifa->ifa_addr;
 			if (sa->sa_family != AF_INET)
 				continue;
 			/* ia is copied by value, so it stays valid after unlock. */
 			ia = ((struct sockaddr_in *)sa)->sin_addr;
 			if (prison_check_ip4(cred, &ia) == 0) {
 				found = 1;
 				break;
 			}
 		}
 		IF_ADDR_RUNLOCK(ifp);
 		if (!found) {
 			/*
 			 * As a last resort return the 'default' jail address.
 			 */
 			ia = ((struct sockaddr_in *)rt->rt_ifa->ifa_addr)->
 			    sin_addr;
 			if (prison_get_ip4(cred, &ia) != 0)
 				return (ESRCH);
 		}
 		bzero(&saun->sin, sizeof(struct sockaddr_in));
 		saun->sin.sin_len = sizeof(struct sockaddr_in);
 		saun->sin.sin_family = AF_INET;
 		saun->sin.sin_addr.s_addr = ia.s_addr;
 		info->rti_info[RTAX_IFA] = (struct sockaddr *)&saun->sin;
 		break;
 	}
 #endif
 #ifdef INET6
 	case AF_INET6:
 	{
 		struct ifaddr *ifa;
 		int found;
 
 		found = 0;
 		/*
 		 * Try to find an address on the given outgoing interface
 		 * that belongs to the jail.
 		 */
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != AF_INET6)
 				continue;
 			if (prison_if(cred, ifa->ifa_addr) == 0) {
 				found = 1;
 				break;
 			}
 		}
 		IF_ADDR_RUNLOCK(ifp);
 		bzero(&saun->sin6, sizeof(struct sockaddr_in6));
 		saun->sin6.sin6_len = sizeof(struct sockaddr_in6);
 		saun->sin6.sin6_family = AF_INET6;
 		if (!found) {
 			/*
 			 * As a last resort return the 'default' jail address.
 			 */
 			if (prison_get_ip6(cred, &saun->sin6) != 0)
 				return (ESRCH);
 		} else {
 			struct sockaddr_in6 *sin6;
 
 			/*
 			 * NOTE(review): ifa is dereferenced here after
 			 * IF_ADDR_RUNLOCK(); confirm the address cannot
 			 * go away in between.
 			 */
 			sin6 = (struct sockaddr_in6 *)ifa->ifa_addr;
 			saun->sin6.sin6_addr = sin6->sin6_addr;
 			saun->sin6.sin6_scope_id = sin6->sin6_scope_id;
 		}
 		info->rti_info[RTAX_IFA] = (struct sockaddr *)&saun->sin6;
 		break;
 	}
 #endif
 	default:
 		return (ESRCH);
 	}
 	return (0);
 }
 
 #ifdef INET6
 /*
  * Build the reply to an RTM_GET whose destination is an IPv6
  * link-local address.
  *
  * The outgoing interface is taken from sin6_scope_id of the
  * destination.  If a link-layer entry exists for the destination, the
  * gateway is reported as an AF_LINK sockaddr holding its address and
  * the RTF_ANNOUNCE/RTF_STATIC flags and expire time follow the entry;
  * otherwise the gateway is the interface's link-level address.  The
  * message in *ortm is rewritten in place and replaced by a larger
  * buffer when the reply does not fit (in which case *ortm is updated).
  *
  * Returns 0 on success, EOPNOTSUPP for any message type other than
  * RTM_GET, EADDRNOTAVAIL when the scope id is zero, ESRCH when no
  * interface matches the scope id, ENOBUFS on allocation failure.
  *
  * NOTE(review): the replacement buffer comes from R_Malloc()/Free()
  * while route_output() allocates and frees the message with M_TEMP --
  * confirm the malloc types agree.
  */
 static int
 in6_rt_handle_lla(struct rt_msghdr **ortm, struct rt_addrinfo *info)
 {
 	struct sockaddr_dl sdl;
 	struct sockaddr_in6 *sin6;
 	struct rt_msghdr *rtm = *ortm;
 	struct llentry *lle;
 	struct ifnet *ifp;
 	int i;
 
 	if (rtm->rtm_type != RTM_GET)
 		return (EOPNOTSUPP);
 	sin6 = (struct sockaddr_in6 *)info->rti_info[RTAX_DST];
 	if (sin6->sin6_scope_id == 0)
 		return (EADDRNOTAVAIL);
 	ifp = in6_getlinkifnet(sin6->sin6_scope_id);
 	if (ifp == NULL)
 		return (ESRCH);
 	/* Clear all sockaddr pointers except DST */
 	for (i = RTAX_GATEWAY; i < RTAX_MAX; i++)
 		info->rti_info[i] = NULL;
 
 	IF_AFDATA_RLOCK(ifp);
 	lle = lla_lookup(LLTABLE6(ifp), 0, info->rti_info[RTAX_DST]);
 	IF_AFDATA_RUNLOCK(ifp);
 	if (lle != NULL) {
 		/* sdl lives on this stack; it is consumed by rt_msg2() below. */
 		bzero(&sdl, sizeof(sdl));
 		info->rti_info[RTAX_GATEWAY] = (struct sockaddr *)&sdl;
 		sdl.sdl_family = AF_LINK;
 		sdl.sdl_len = sizeof(sdl);
 		sdl.sdl_alen = ifp->if_addrlen;
 		sdl.sdl_index = ifp->if_index;
 		sdl.sdl_type = ifp->if_type;
 		bcopy(&lle->ll_addr, LLADDR(&sdl), ifp->if_addrlen);
 		if (lle->la_flags & LLE_PUB)
 			rtm->rtm_flags |= RTF_ANNOUNCE;
 		if (lle->la_flags & LLE_STATIC) {
 			rtm->rtm_flags |= RTF_STATIC;
 			rtm->rtm_rmx.rmx_expire = 0;
 		} else
 			rtm->rtm_rmx.rmx_expire = lle->la_expire;
 		LLE_RUNLOCK(lle);
 	} else
 		info->rti_info[RTAX_GATEWAY] = ifp->if_addr->ifa_addr;
 	rtm->rtm_flags |= RTF_UP | RTF_HOST;
 	rtm->rtm_index = ifp->if_index;
 	if (rtm->rtm_addrs & (RTA_IFA | RTA_IFP))
 		info->rti_info[RTAX_IFP] = ifp->if_addr->ifa_addr;
 	/* First pass only sizes the reply. */
 	i = rt_msg2(rtm->rtm_type, info, NULL, NULL);
 	if (i > rtm->rtm_msglen) {
 		R_Malloc(rtm, struct rt_msghdr *, i);
 		if (rtm == NULL)
 			return (ENOBUFS);
 		/*
 		 * Carry over only what the old message holds; copying the
 		 * new (larger) size would read past the old contents.
 		 */
 		bcopy(*ortm, rtm, (*ortm)->rtm_msglen);
 		Free(*ortm);
 		*ortm = rtm;
 	}
 	/* Second pass writes header and sockaddrs into the buffer. */
 	rt_msg2(rtm->rtm_type, info, (caddr_t)rtm, NULL);
 	rtm->rtm_addrs = info->rti_addrs;
 	return (0);
 }
 #endif
 
 /*ARGSUSED*/
 static int
 route_output(struct mbuf *m, struct socket *so, ...)
 {
 	struct rt_msghdr *rtm = NULL;
 	struct rtentry *rt = NULL;
 	struct radix_node_head *rnh;
 	struct rt_addrinfo info;
 	int alloc_len = 0, len, error = 0, fibnum;
 	struct ifnet *ifp = NULL;
 	union sockaddr_union saun;
 	sa_family_t saf = AF_UNSPEC;
 	struct rawcb *rp = NULL;
 	struct walkarg w;
 
 	fibnum = so->so_fibnum;
 
 #define senderr(e) { error = e; goto flush;}
 	if (m == NULL || ((m->m_len < sizeof(long)) &&
 		       (m = m_pullup(m, sizeof(long))) == NULL))
 		return (ENOBUFS);
 	if ((m->m_flags & M_PKTHDR) == 0)
 		panic("route_output");
 	len = m->m_pkthdr.len;
 	if (len < sizeof(*rtm) ||
 	    len != mtod(m, struct rt_msghdr *)->rtm_msglen)
 		senderr(EINVAL);
 
 	/*
 	 * Most of current messages are in range 200-240 bytes,
 	 * minimize possible re-allocation on reply using larger size
 	 * buffer aligned on 1k boundaty.
 	 */
 	alloc_len = roundup2(len, 1024);
 	if ((rtm = malloc(alloc_len, M_TEMP, M_NOWAIT)) == NULL)
 		senderr(ENOBUFS);
 
 	m_copydata(m, 0, len, (caddr_t)rtm);
 	bzero(&info, sizeof(info));
 	bzero(&w, sizeof(w));
 
 	if (rtm->rtm_version != RTM_VERSION) {
 		/* Do not touch message since format is unknown */
 		free(rtm, M_TEMP);
 		rtm = NULL;
 		senderr(EPROTONOSUPPORT);
 	}
 
 	/*
 	 * Starting from here, it is possible
 	 * to alter original message and insert
 	 * caller PID and error value.
 	 */
 
 	rtm->rtm_pid = curproc->p_pid;
 	info.rti_addrs = rtm->rtm_addrs;
 
 	info.rti_mflags = rtm->rtm_inits;
 	info.rti_rmx = &rtm->rtm_rmx;
 
 	/*
 	 * rt_xaddrs() performs s6_addr[2] := sin6_scope_id for AF_INET6
 	 * link-local address because rtrequest requires addresses with
 	 * embedded scope id.
 	 */
 	if (rt_xaddrs((caddr_t)(rtm + 1), len + (caddr_t)rtm, &info))
 		senderr(EINVAL);
 
 	info.rti_flags = rtm->rtm_flags;
 	if (info.rti_info[RTAX_DST] == NULL ||
 	    info.rti_info[RTAX_DST]->sa_family >= AF_MAX ||
 	    (info.rti_info[RTAX_GATEWAY] != NULL &&
 	     info.rti_info[RTAX_GATEWAY]->sa_family >= AF_MAX))
 		senderr(EINVAL);
 	saf = info.rti_info[RTAX_DST]->sa_family;
 	/*
 	 * Verify that the caller has the appropriate privilege; RTM_GET
 	 * is the only operation the non-superuser is allowed.
 	 */
 	if (rtm->rtm_type != RTM_GET) {
 		error = priv_check(curthread, PRIV_NET_ROUTE);
 		if (error)
 			senderr(error);
 	}
 
 	/*
 	 * The given gateway address may be an interface address.
 	 * For example, issuing a "route change" command on a route
 	 * entry that was created from a tunnel, and the gateway
 	 * address given is the local end point. In this case the 
 	 * RTF_GATEWAY flag must be cleared or the destination will
 	 * not be reachable even though there is no error message.
 	 */
 	if (info.rti_info[RTAX_GATEWAY] != NULL &&
 	    info.rti_info[RTAX_GATEWAY]->sa_family != AF_LINK) {
 		struct route gw_ro;
 
 		bzero(&gw_ro, sizeof(gw_ro));
 		gw_ro.ro_dst = *info.rti_info[RTAX_GATEWAY];
 		rtalloc_ign_fib(&gw_ro, 0, fibnum);
 		/* 
 		 * A host route through the loopback interface is 
 		 * installed for each interface adddress. In pre 8.0
 		 * releases the interface address of a PPP link type
 		 * is not reachable locally. This behavior is fixed as 
 		 * part of the new L2/L3 redesign and rewrite work. The
 		 * signature of this interface address route is the
 		 * AF_LINK sa_family type of the rt_gateway, and the
 		 * rt_ifp has the IFF_LOOPBACK flag set.
 		 */
 		if (gw_ro.ro_rt != NULL &&
 		    gw_ro.ro_rt->rt_gateway->sa_family == AF_LINK &&
 		    gw_ro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) {
 			info.rti_flags &= ~RTF_GATEWAY;
 			info.rti_flags |= RTF_GWFLAG_COMPAT;
 		}
 		if (gw_ro.ro_rt != NULL)
 			RTFREE(gw_ro.ro_rt);
 	}
 
 	switch (rtm->rtm_type) {
 		struct rtentry *saved_nrt;
 
 	case RTM_ADD:
 	case RTM_CHANGE:
 		if (info.rti_info[RTAX_GATEWAY] == NULL)
 			senderr(EINVAL);
 		saved_nrt = NULL;
 
 		/* support for new ARP code */
 		if (info.rti_info[RTAX_GATEWAY]->sa_family == AF_LINK &&
 		    (rtm->rtm_flags & RTF_LLDATA) != 0) {
 			error = lla_rt_output(rtm, &info);
 			break;
 		}
 		error = rtrequest1_fib(rtm->rtm_type, &info, &saved_nrt,
 		    fibnum);
 		if (error == 0 && saved_nrt != NULL) {
 			RT_LOCK(saved_nrt);
 			rtm->rtm_index = saved_nrt->rt_ifp->if_index;
 			RT_REMREF(saved_nrt);
 			RT_UNLOCK(saved_nrt);
 		}
 		break;
 
 	case RTM_DELETE:
 		saved_nrt = NULL;
 		/* support for new ARP code */
 		if (info.rti_info[RTAX_GATEWAY] && 
 		    (info.rti_info[RTAX_GATEWAY]->sa_family == AF_LINK) &&
 		    (rtm->rtm_flags & RTF_LLDATA) != 0) {
 			error = lla_rt_output(rtm, &info);
 			break;
 		}
 		error = rtrequest1_fib(RTM_DELETE, &info, &saved_nrt, fibnum);
 		if (error == 0) {
 			RT_LOCK(saved_nrt);
 			rt = saved_nrt;
 			goto report;
 		}
 		break;
 
 	case RTM_GET:
 #ifdef INET6
 		if (info.rti_info[RTAX_DST]->sa_family == AF_INET6) {
 			struct sockaddr_in6 *sin6;
 
 			sin6 = (struct sockaddr_in6 *)info.rti_info[RTAX_DST];
 			if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) {
 				error = in6_rt_handle_lla(&rtm, &info);
 				break;
 			}
 		}
 #endif
 		rnh = rt_tables_get_rnh(fibnum, saf);
 		if (rnh == NULL)
 			senderr(EAFNOSUPPORT);
 
 		RADIX_NODE_HEAD_RLOCK(rnh);
 
 		if (info.rti_info[RTAX_NETMASK] == NULL &&
 		    rtm->rtm_type == RTM_GET) {
 			/*
 			 * Provide logest prefix match for
 			 * address lookup (no mask).
 			 * 'route -n get addr'
 			 */
 			rt = (struct rtentry *) rnh->rnh_matchaddr(
 			    info.rti_info[RTAX_DST], rnh);
 		} else
 			rt = (struct rtentry *) rnh->rnh_lookup(
 			    info.rti_info[RTAX_DST],
 			    info.rti_info[RTAX_NETMASK], rnh);
 
 		if (rt == NULL) {
 			RADIX_NODE_HEAD_RUNLOCK(rnh);
 			senderr(ESRCH);
 		}
 #ifdef RADIX_MPATH
 		/*
 		 * for RTM_CHANGE/LOCK, if we got multipath routes,
 		 * we require users to specify a matching RTAX_GATEWAY.
 		 *
 		 * for RTM_GET, gate is optional even with multipath.
 		 * if gate == NULL the first match is returned.
 		 * (no need to call rt_mpath_matchgate if gate == NULL)
 		 */
 		if (rn_mpath_capable(rnh) &&
 		    (rtm->rtm_type != RTM_GET || info.rti_info[RTAX_GATEWAY])) {
 			rt = rt_mpath_matchgate(rt, info.rti_info[RTAX_GATEWAY]);
 			if (!rt) {
 				RADIX_NODE_HEAD_RUNLOCK(rnh);
 				senderr(ESRCH);
 			}
 		}
 #endif
 		/*
 		 * If performing proxied L2 entry insertion, and
 		 * the actual PPP host entry is found, perform
 		 * another search to retrieve the prefix route of
 		 * the local end point of the PPP link.
 		 */
 		if (rtm->rtm_flags & RTF_ANNOUNCE) {
 			struct sockaddr laddr;
 
 			if (rt->rt_ifp != NULL && 
 			    rt->rt_ifp->if_type == IFT_PROPVIRTUAL) {
 				struct ifaddr *ifa;
 
-				ifa = ifa_ifwithnet(info.rti_info[RTAX_DST], 1);
+				ifa = ifa_ifwithnet(info.rti_info[RTAX_DST], 1,
+						RT_ALL_FIBS);
 				if (ifa != NULL)
 					rt_maskedcopy(ifa->ifa_addr,
 						      &laddr,
 						      ifa->ifa_netmask);
 			} else
 				rt_maskedcopy(rt->rt_ifa->ifa_addr,
 					      &laddr,
 					      rt->rt_ifa->ifa_netmask);
 			/* 
 			 * refactor rt and no lock operation necessary
 			 */
 			rt = (struct rtentry *)rnh->rnh_matchaddr(&laddr, rnh);
 			if (rt == NULL) {
 				RADIX_NODE_HEAD_RUNLOCK(rnh);
 				senderr(ESRCH);
 			}
 		} 
 		RT_LOCK(rt);
 		RT_ADDREF(rt);
 		RADIX_NODE_HEAD_RUNLOCK(rnh);
 
 report:
 		RT_LOCK_ASSERT(rt);
 		if ((rt->rt_flags & RTF_HOST) == 0
 		    ? jailed_without_vnet(curthread->td_ucred)
 		    : prison_if(curthread->td_ucred,
 		    rt_key(rt)) != 0) {
 			RT_UNLOCK(rt);
 			senderr(ESRCH);
 		}
 		info.rti_info[RTAX_DST] = rt_key(rt);
 		info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
 		info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(rt_key(rt),
 		    rt_mask(rt), &ss);
 		info.rti_info[RTAX_GENMASK] = 0;
 		if (rtm->rtm_addrs & (RTA_IFP | RTA_IFA)) {
 			ifp = rt->rt_ifp;
 			if (ifp) {
 				info.rti_info[RTAX_IFP] =
 				    ifp->if_addr->ifa_addr;
 				error = rtm_get_jailed(&info, ifp, rt,
 				    &saun, curthread->td_ucred);
 				if (error != 0) {
 					RT_UNLOCK(rt);
 					senderr(error);
 				}
 				if (ifp->if_flags & IFF_POINTOPOINT)
 					info.rti_info[RTAX_BRD] =
 					    rt->rt_ifa->ifa_dstaddr;
 				rtm->rtm_index = ifp->if_index;
 			} else {
 				info.rti_info[RTAX_IFP] = NULL;
 				info.rti_info[RTAX_IFA] = NULL;
 			}
 		} else if ((ifp = rt->rt_ifp) != NULL) {
 			rtm->rtm_index = ifp->if_index;
 		}
 
 		/* Check if we need to realloc storage */
 		rtsock_msg_buffer(rtm->rtm_type, &info, NULL, &len);
 		if (len > alloc_len) {
 			struct rt_msghdr *new_rtm;
 			new_rtm = malloc(len, M_TEMP, M_NOWAIT);
 			if (new_rtm == NULL) {
 				RT_UNLOCK(rt);
 				senderr(ENOBUFS);
 			}
 			bcopy(rtm, new_rtm, rtm->rtm_msglen);
 			free(rtm, M_TEMP);
 			rtm = new_rtm;
 			alloc_len = len;
 		}
 
 		w.w_tmem = (caddr_t)rtm;
 		w.w_tmemsize = alloc_len;
 		rtsock_msg_buffer(rtm->rtm_type, &info, &w, &len);
 
 		if (rt->rt_flags & RTF_GWFLAG_COMPAT)
 			rtm->rtm_flags = RTF_GATEWAY | 
 				(rt->rt_flags & ~RTF_GWFLAG_COMPAT);
 		else
 			rtm->rtm_flags = rt->rt_flags;
 		rt_getmetrics(rt, &rtm->rtm_rmx);
 		rtm->rtm_addrs = info.rti_addrs;
 
 		RT_UNLOCK(rt);
 		break;
 
 	default:
 		senderr(EOPNOTSUPP);
 	}
 
 flush:
 	if (rt != NULL)
 		RTFREE(rt);
 	/*
 	 * Check to see if we don't want our own messages.
 	 */
 	if ((so->so_options & SO_USELOOPBACK) == 0) {
 		if (V_route_cb.any_count <= 1) {
 			if (rtm != NULL)
 				free(rtm, M_TEMP);
 			m_freem(m);
 			return (error);
 		}
 		/* There is another listener, so construct message */
 		rp = sotorawcb(so);
 	}
 
 	if (rtm != NULL) {
 		if (error != 0)
 			rtm->rtm_errno = error;
 		else
 			rtm->rtm_flags |= RTF_DONE;
 
 		m_copyback(m, 0, rtm->rtm_msglen, (caddr_t)rtm);
 		if (m->m_pkthdr.len < rtm->rtm_msglen) {
 			m_freem(m);
 			m = NULL;
 		} else if (m->m_pkthdr.len > rtm->rtm_msglen)
 			m_adj(m, rtm->rtm_msglen - m->m_pkthdr.len);
 
 		free(rtm, M_TEMP);
 	}
 	if (m != NULL) {
 		M_SETFIB(m, fibnum);
 		m->m_flags |= RTS_FILTER_FIB;
 		if (rp) {
 			/*
 			 * XXX insure we don't get a copy by
 			 * invalidating our protocol
 			 */
 			unsigned short family = rp->rcb_proto.sp_family;
 			rp->rcb_proto.sp_family = 0;
 			rt_dispatch(m, saf);
 			rp->rcb_proto.sp_family = family;
 		} else
 			rt_dispatch(m, saf);
 	}
 
 	return (error);
 }
 
 /*
  * Export the metrics of route rt into *out.  All fields not set here
  * are zeroed.  A zero rt_expire is reported as 0; any other value is
  * converted from the kernel uptime base to wall-clock seconds.
  */
 static void
 rt_getmetrics(const struct rtentry *rt, struct rt_metrics *out)
 {
 
 	bzero(out, sizeof(*out));
 	out->rmx_mtu = rt->rt_mtu;
 	out->rmx_weight = rt->rt_weight;
 	out->rmx_pksent = counter_u64_fetch(rt->rt_pksent);
 	/* Kernel -> userland timebase conversion; 0 stays 0 from bzero(). */
 	if (rt->rt_expire != 0)
 		out->rmx_expire = rt->rt_expire - time_uptime + time_second;
 }
 
 /*
  * Extract the addresses of the passed sockaddrs.
  * Do a little sanity checking so as to avoid bad memory references.
  * This data is derived straight from userland.
  *
  * cp/cplim delimit the sockaddr area of the message.  For every bit
  * set in rtinfo->rti_addrs (lowest index first) the next sockaddr is
  * recorded in rtinfo->rti_info[]; the cursor advances by SA_SIZE().
  * Pointers stored in rti_info[] refer into the caller's buffer.
  *
  * Returns 0 on success, EINVAL when a sockaddr would extend past cplim.
  */
 static int
 rt_xaddrs(caddr_t cp, caddr_t cplim, struct rt_addrinfo *rtinfo)
 {
 	struct sockaddr *sa;
 	int i;
 
 	/* The loop condition guarantees sa_len itself is readable. */
 	for (i = 0; i < RTAX_MAX && cp < cplim; i++) {
 		if ((rtinfo->rti_addrs & (1 << i)) == 0)
 			continue;
 		sa = (struct sockaddr *)cp;
 		/*
 		 * It won't fit.
 		 */
 		if (cp + sa->sa_len > cplim)
 			return (EINVAL);
 		/*
 		 * there are no more.. quit now
 		 * If there are more bits, they are in error.
 		 * I've seen this. route(1) can evidently generate these. 
 		 * This causes kernel to core dump.
 		 * for compatibility, If we see this, point to a safe address.
 		 */
 		if (sa->sa_len == 0) {
 			rtinfo->rti_info[i] = &sa_zero;
 			return (0); /* should be EINVAL but for compat */
 		}
 		/* accept it */
 #ifdef INET6
 		/*
 		 * XXX: some software use embedded scope ids.
 		 * We remove id from address and initialize sin6_scope_id
 		 * instead.
 		 * Note that this modifies the sockaddr in the caller's buffer.
 		 */
 		if (sa->sa_family == AF_INET6)
 			sa6_recoverscope((struct sockaddr_in6 *)sa);
 #endif
 		rtinfo->rti_info[i] = sa;
 		cp += SA_SIZE(sa);
 	}
 	return (0);
 }
 
 /*
  * Build in @dmask a netmask sockaddr that carries the length and
  * address family of @dst and the leading smask->sa_len bytes of
  * @smask; @smask itself is left untouched.  Mostly used with radix
  * netmasks.
  *
  * Returns @dmask cast to a sockaddr pointer, or NULL when either @dst
  * or @smask is NULL.
  */
 static struct sockaddr *
 rtsock_fix_netmask(struct sockaddr *dst, struct sockaddr *smask,
     struct sockaddr_storage *dmask)
 {
 
 	if (dst != NULL && smask != NULL) {
 		/* Zero the part that matters, then overlay the mask bytes. */
 		memset(dmask, 0, dst->sa_len);
 		memcpy(dmask, smask, smask->sa_len);
 		/* Header fields always come from the destination. */
 		dmask->ss_family = dst->sa_family;
 		dmask->ss_len = dst->sa_len;
 		return ((struct sockaddr *)dmask);
 	}
 
 	return (NULL);
 }
 
 /*
  * Writes information related to @rtinfo object to newly-allocated mbuf.
  * Assumes MCLBYTES is enough to construct any message.
  * Used for OS notifications of various events (if/ifa announces, etc.)
  *
  * @type selects the fixed header size and is stored in rtm_type.  Every
  * non-NULL sockaddr in rtinfo->rti_info[] is appended after the header
  * and its bit is OR-ed into rtinfo->rti_addrs.  The caller fills in the
  * remaining header fields.
  *
  * Returns allocated mbuf or NULL on failure.
  */
 static struct mbuf *
 rtsock_msg_mbuf(int type, struct rt_addrinfo *rtinfo)
 {
 	struct rt_msghdr *rtm;
 	struct mbuf *m;
 	struct sockaddr *sa;
 	int len, i;
 
 	/* Pick the fixed header size for this message type. */
 	switch (type) {
 
 	case RTM_DELADDR:
 	case RTM_NEWADDR:
 		len = sizeof(struct ifa_msghdr);
 		break;
 
 	case RTM_DELMADDR:
 	case RTM_NEWMADDR:
 		len = sizeof(struct ifma_msghdr);
 		break;
 
 	case RTM_IFINFO:
 		len = sizeof(struct if_msghdr);
 		break;
 
 	case RTM_IFANNOUNCE:
 	case RTM_IEEE80211:
 		len = sizeof(struct if_announcemsghdr);
 		break;
 
 	default:
 		len = sizeof(struct rt_msghdr);
 	}
 
 	/* XXXGL: can we use MJUMPAGESIZE cluster here? */
 	KASSERT(len <= MCLBYTES, ("%s: message too big", __func__));
 	if (len > MHLEN)
 		m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 	else
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		return (m);
 
 	m->m_pkthdr.len = m->m_len = len;
 	rtm = mtod(m, struct rt_msghdr *);
 	bzero((caddr_t)rtm, len);
 	for (i = 0; i < RTAX_MAX; i++) {
 		if ((sa = rtinfo->rti_info[i]) == NULL)
 			continue;
 		rtinfo->rti_addrs |= (1 << i);
 		m_copyback(m, len, SA_SIZE(sa), (caddr_t)sa);
 		len += SA_SIZE(sa);
 	}
 	/* A length mismatch means m_copyback() could not append everything. */
 	if (m->m_pkthdr.len != len) {
 		m_freem(m);
 		return (NULL);
 	}
 	rtm->rtm_msglen = len;
 	rtm->rtm_version = RTM_VERSION;
 	rtm->rtm_type = type;
 	return (m);
 }
 
 /*
  * Writes information related to @rtinfo object to preallocated buffer.
  * Stores needed size in @plen. If @w is NULL, calculates size without
  * writing.
  * Used for sysctl dumps and rtsock answers (RTM_DEL/RTM_GET) generation.
  *
  * @type selects the fixed header size (for some types it also depends
  * on w->w_op and, with COMPAT_FREEBSD32, on the request's SCTL_MASK32
  * flag).  rtinfo->rti_addrs is rebuilt from the non-NULL entries of
  * rtinfo->rti_info[].  When @w is given, the message is written into
  * w->w_tmem (w->w_tmemsize bytes); the header fields version, type and
  * msglen are filled in only if everything fits.
  *
  * Returns 0 on success, ENOBUFS when @w is given but its buffer is too
  * small (@plen still holds the size needed).
  */
 static int
 rtsock_msg_buffer(int type, struct rt_addrinfo *rtinfo, struct walkarg *w, int *plen)
 {
 	caddr_t cp = NULL;
 	struct rt_msghdr *rtm = NULL;
 	int len, buflen = 0, i;
 
 	switch (type) {
 
 	case RTM_DELADDR:
 	case RTM_NEWADDR:
 		if (w != NULL && w->w_op == NET_RT_IFLISTL) {
 #ifdef COMPAT_FREEBSD32
 			if (w->w_req->flags & SCTL_MASK32)
 				len = sizeof(struct ifa_msghdrl32);
 			else
 #endif
 				len = sizeof(struct ifa_msghdrl);
 		} else
 			len = sizeof(struct ifa_msghdr);
 		break;
 
 	case RTM_IFINFO:
 #ifdef COMPAT_FREEBSD32
 		if (w != NULL && w->w_req->flags & SCTL_MASK32) {
 			if (w->w_op == NET_RT_IFLISTL)
 				len = sizeof(struct if_msghdrl32);
 			else
 				len = sizeof(struct if_msghdr32);
 			break;
 		}
 #endif
 		if (w != NULL && w->w_op == NET_RT_IFLISTL)
 			len = sizeof(struct if_msghdrl);
 		else
 			len = sizeof(struct if_msghdr);
 		break;
 
 	case RTM_NEWMADDR:
 		len = sizeof(struct ifma_msghdr);
 		break;
 
 	default:
 		len = sizeof(struct rt_msghdr);
 	}
 
 	if (w != NULL) {
 		rtm = (struct rt_msghdr *)w->w_tmem;
 		/*
 		 * A buffer that cannot even hold the fixed header is
 		 * treated as too small: leave cp NULL so only the size
 		 * is computed, instead of working with a negative
 		 * remaining length.
 		 */
 		if (w->w_tmemsize >= len) {
 			buflen = w->w_tmemsize - len;
 			cp = (caddr_t)w->w_tmem + len;
 		}
 	}
 
 	rtinfo->rti_addrs = 0;
 	for (i = 0; i < RTAX_MAX; i++) {
 		struct sockaddr *sa;
 
 		if ((sa = rtinfo->rti_info[i]) == NULL)
 			continue;
 		rtinfo->rti_addrs |= (1 << i);
 		if (cp != NULL && buflen >= SA_SIZE(sa)) {
 			bcopy((caddr_t)sa, cp, SA_SIZE(sa));
 			cp += SA_SIZE(sa);
 			buflen -= SA_SIZE(sa);
 		} else if (cp != NULL) {
 			/*
 			 * Buffer too small. Count needed size
 			 * and return with error.
 			 */
 			cp = NULL;
 		}
 		len += SA_SIZE(sa);
 	}
 
 	/* Account for the padding that aligns the total length. */
 	if (cp != NULL) {
 		if (buflen < ALIGN(len) - len)
 			cp = NULL;
 		else
 			buflen -= ALIGN(len) - len;
 	}
 	len = ALIGN(len);
 
 	if (cp != NULL) {
 		/* fill header iff buffer is large enough */
 		rtm->rtm_version = RTM_VERSION;
 		rtm->rtm_type = type;
 		rtm->rtm_msglen = len;
 	}
 
 	*plen = len;
 
 	if (w != NULL && cp == NULL)
 		return (ENOBUFS);
 
 	return (0);
 }
 
 /*
  * This routine is called to generate a message from the routing
  * socket indicating that a redirect has occured, a routing lookup
  * has failed, or that a protocol has detected timeouts to a particular
  * destination.
  */
 void
 rt_missmsg_fib(int type, struct rt_addrinfo *rtinfo, int flags, int error,
     int fibnum)
 {
 	struct rt_msghdr *rtm;
 	struct mbuf *m;
 	struct sockaddr *sa = rtinfo->rti_info[RTAX_DST];
 
 	if (V_route_cb.any_count == 0)
 		return;
 	m = rtsock_msg_mbuf(type, rtinfo);
 	if (m == NULL)
 		return;
 
 	if (fibnum != RT_ALL_FIBS) {
 		KASSERT(fibnum >= 0 && fibnum < rt_numfibs, ("%s: fibnum out "
 		    "of range 0 <= %d < %d", __func__, fibnum, rt_numfibs));
 		M_SETFIB(m, fibnum);
 		m->m_flags |= RTS_FILTER_FIB;
 	}
 
 	rtm = mtod(m, struct rt_msghdr *);
 	rtm->rtm_flags = RTF_DONE | flags;
 	rtm->rtm_errno = error;
 	rtm->rtm_addrs = rtinfo->rti_addrs;
 	rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC);
 }
 
 /*
  * Convenience wrapper: same as rt_missmsg_fib() with fibnum set to
  * RT_ALL_FIBS, i.e. the message is not tagged with a particular fib.
  */
 void
 rt_missmsg(int type, struct rt_addrinfo *rtinfo, int flags, int error)
 {
 
 	rt_missmsg_fib(type, rtinfo, flags, error, RT_ALL_FIBS);
 }
 
 /*
  * This routine is called to generate a message from the routing
  * socket indicating that the status of a network interface has changed.
  */
 void
 rt_ifmsg(struct ifnet *ifp)
 {
 	struct if_msghdr *ifm;
 	struct mbuf *m;
 	struct rt_addrinfo info;
 
 	if (V_route_cb.any_count == 0)
 		return;
 	bzero((caddr_t)&info, sizeof(info));
 	m = rtsock_msg_mbuf(RTM_IFINFO, &info);
 	if (m == NULL)
 		return;
 	ifm = mtod(m, struct if_msghdr *);
 	ifm->ifm_index = ifp->if_index;
 	ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags;
 	if_data_copy(ifp, &ifm->ifm_data);
 	ifm->ifm_addrs = 0;
 	rt_dispatch(m, AF_UNSPEC);
 }
 
 /*
  * Announce interface address arrival/withdraw.
  * Please do not call directly, use rt_addrmsg().
  * Assume input data to be valid.
  * Returns 0 on success.
  */
 int
 rtsock_addrmsg(int cmd, struct ifaddr *ifa, int fibnum)
 {
 	struct rt_addrinfo info;
 	struct sockaddr *sa;
 	int ncmd;
 	struct mbuf *m;
 	struct ifa_msghdr *ifam;
 	struct ifnet *ifp = ifa->ifa_ifp;
 	struct sockaddr_storage ss;
 
 	if (V_route_cb.any_count == 0)
 		return (0);
 
 	ncmd = cmd == RTM_ADD ? RTM_NEWADDR : RTM_DELADDR;
 
 	bzero((caddr_t)&info, sizeof(info));
 	info.rti_info[RTAX_IFA] = sa = ifa->ifa_addr;
 	info.rti_info[RTAX_IFP] = ifp->if_addr->ifa_addr;
 	info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(
 	    info.rti_info[RTAX_IFP], ifa->ifa_netmask, &ss);
 	info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr;
 	if ((m = rtsock_msg_mbuf(ncmd, &info)) == NULL)
 		return (ENOBUFS);
 	ifam = mtod(m, struct ifa_msghdr *);
 	ifam->ifam_index = ifp->if_index;
 	ifam->ifam_metric = ifa->ifa_ifp->if_metric;
 	ifam->ifam_flags = ifa->ifa_flags;
 	ifam->ifam_addrs = info.rti_addrs;
 
 	if (fibnum != RT_ALL_FIBS) {
 		M_SETFIB(m, fibnum);
 		m->m_flags |= RTS_FILTER_FIB;
 	}
 
 	rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC);
 
 	return (0);
 }
 
 /*
  * Announce route addition/removal.
  * Please do not call directly, use rt_routemsg().
  * Note that @rt data MAY be inconsistent/invalid:
  * if some userland app sends us "invalid" route message (invalid mask,
  * no dst, wrong address families, etc...) we need to pass it back
  * to app (and any other rtsock consumers) with rtm_errno field set to
  * non-zero value.
  *
  * Returns 0 on success.
  */
 int
 rtsock_routemsg(int cmd, struct ifnet *ifp, int error, struct rtentry *rt,
     int fibnum)
 {
 	struct rt_addrinfo info;
 	struct sockaddr *sa;
 	struct mbuf *m;
 	struct rt_msghdr *rtm;
 	struct sockaddr_storage ss;
 
 	if (V_route_cb.any_count == 0)
 		return (0);
 
 	bzero((caddr_t)&info, sizeof(info));
 	info.rti_info[RTAX_DST] = sa = rt_key(rt);
 	info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(sa, rt_mask(rt), &ss);
 	info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
 	if ((m = rtsock_msg_mbuf(cmd, &info)) == NULL)
 		return (ENOBUFS);
 	rtm = mtod(m, struct rt_msghdr *);
 	rtm->rtm_index = ifp->if_index;
 	rtm->rtm_flags |= rt->rt_flags;
 	rtm->rtm_errno = error;
 	rtm->rtm_addrs = info.rti_addrs;
 
 	if (fibnum != RT_ALL_FIBS) {
 		M_SETFIB(m, fibnum);
 		m->m_flags |= RTS_FILTER_FIB;
 	}
 
 	rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC);
 
 	return (0);
 }
 
 /*
  * This is the analogue to the rt_newaddrmsg which performs the same
  * function but for multicast group memberhips.  This is easier since
  * there is no route state to worry about.
  */
 void
 rt_newmaddrmsg(int cmd, struct ifmultiaddr *ifma)
 {
 	struct rt_addrinfo info;
 	struct mbuf *m = NULL;
 	struct ifnet *ifp = ifma->ifma_ifp;
 	struct ifma_msghdr *ifmam;
 
 	if (V_route_cb.any_count == 0)
 		return;
 
 	bzero((caddr_t)&info, sizeof(info));
 	info.rti_info[RTAX_IFA] = ifma->ifma_addr;
 	info.rti_info[RTAX_IFP] = ifp ? ifp->if_addr->ifa_addr : NULL;
 	/*
 	 * If a link-layer address is present, present it as a ``gateway''
 	 * (similarly to how ARP entries, e.g., are presented).
 	 */
 	info.rti_info[RTAX_GATEWAY] = ifma->ifma_lladdr;
 	m = rtsock_msg_mbuf(cmd, &info);
 	if (m == NULL)
 		return;
 	ifmam = mtod(m, struct ifma_msghdr *);
 	KASSERT(ifp != NULL, ("%s: link-layer multicast address w/o ifp\n",
 	    __func__));
 	ifmam->ifmam_index = ifp->if_index;
 	ifmam->ifmam_addrs = info.rti_addrs;
 	rt_dispatch(m, ifma->ifma_addr ? ifma->ifma_addr->sa_family : AF_UNSPEC);
 }
 
 static struct mbuf *
 rt_makeifannouncemsg(struct ifnet *ifp, int type, int what,
 	struct rt_addrinfo *info)
 {
 	struct if_announcemsghdr *ifan;
 	struct mbuf *m;
 
 	if (V_route_cb.any_count == 0)
 		return NULL;
 	bzero((caddr_t)info, sizeof(*info));
 	m = rtsock_msg_mbuf(type, info);
 	if (m != NULL) {
 		ifan = mtod(m, struct if_announcemsghdr *);
 		ifan->ifan_index = ifp->if_index;
 		strlcpy(ifan->ifan_name, ifp->if_xname,
 			sizeof(ifan->ifan_name));
 		ifan->ifan_what = what;
 	}
 	return m;
 }
 
 /*
  * This is called to generate routing socket messages indicating
  * IEEE80211 wireless events.
  * XXX we piggyback on the RTM_IFANNOUNCE msg format in a clumsy way.
  *
  * `data'/`data_len' describe an opaque payload appended after the
  * if_announcemsghdr.  The payload is placed in the trailing space of the
  * header mbuf when it fits, otherwise in one additional plain mbuf.  A
  * payload that does not fit in that additional mbuf either cannot be
  * represented and the whole message is dropped rather than written past
  * the end of the mbuf's storage.
  */
 void
 rt_ieee80211msg(struct ifnet *ifp, int what, void *data, size_t data_len)
 {
 	struct mbuf *m;
 	struct rt_addrinfo info;
 
 	m = rt_makeifannouncemsg(ifp, RTM_IEEE80211, what, &info);
 	if (m == NULL)
 		return;
 
 	/*
 	 * Append the ieee80211 data.  Try to stick it in the
 	 * mbuf containing the ifannounce msg; otherwise allocate
 	 * a new mbuf and append.
 	 *
 	 * NB: we assume m is a single mbuf.
 	 */
 	if (data_len > M_TRAILINGSPACE(m)) {
 		struct mbuf *n = m_get(M_NOWAIT, MT_DATA);
 
 		if (n == NULL) {
 			m_freem(m);
 			return;
 		}
 		/*
 		 * The fresh mbuf has a fixed amount of storage; refuse to
 		 * copy more than it can hold.
 		 */
 		if (data_len > M_TRAILINGSPACE(n)) {
 			m_freem(n);
 			m_freem(m);
 			return;
 		}
 		bcopy(data, mtod(n, void *), data_len);
 		n->m_len = data_len;
 		m->m_next = n;
 	} else if (data_len > 0) {
 		bcopy(data, mtod(m, u_int8_t *) + m->m_len, data_len);
 		m->m_len += data_len;
 	}
 
 	/* Account for the payload in the packet and message lengths. */
 	if (m->m_flags & M_PKTHDR)
 		m->m_pkthdr.len += data_len;
 	mtod(m, struct if_announcemsghdr *)->ifan_msglen += data_len;
 	rt_dispatch(m, AF_UNSPEC);
 }
 
 /*
  * This is called to generate routing socket messages indicating
  * network interface arrival and departure.
  */
 void
 rt_ifannouncemsg(struct ifnet *ifp, int what)
 {
 	struct mbuf *m;
 	struct rt_addrinfo info;
 
 	m = rt_makeifannouncemsg(ifp, RTM_IFANNOUNCE, what, &info);
 	if (m != NULL)
 		rt_dispatch(m, AF_UNSPEC);
 }
 
 /*
  * Queue a routing message mbuf to the NETISR_ROUTE netisr.  The mbuf is
  * consumed in all cases: it is freed here if the family tag cannot be
  * allocated or (VIMAGE) if there is no V_loif to use as receive interface.
  */
 static void
 rt_dispatch(struct mbuf *m, sa_family_t saf)
 {
 	struct m_tag *famtag;
 	unsigned short *famp;
 
 	/*
 	 * Preserve the family from the sockaddr, if any, in an m_tag for
 	 * use when injecting the mbuf into the routing socket buffer from
 	 * the netisr.
 	 */
 	if (saf != AF_UNSPEC) {
 		famtag = m_tag_get(PACKET_TAG_RTSOCKFAM,
 		    sizeof(unsigned short), M_NOWAIT);
 		if (famtag == NULL) {
 			m_freem(m);
 			return;
 		}
 		/* The tag payload lives directly behind the m_tag header. */
 		famp = (unsigned short *)(famtag + 1);
 		*famp = saf;
 		m_tag_prepend(m, famtag);
 	}
 #ifdef VIMAGE
 	if (V_loif == NULL) {
 		m_freem(m);
 		return;
 	}
 	m->m_pkthdr.rcvif = V_loif;
 #endif
 	netisr_queue(NETISR_ROUTE, m);	/* mbuf is free'd on failure. */
 }
 
 /*
  * This is used in dumping the kernel table via sysctl().
  */
 static int
 sysctl_dumpentry(struct radix_node *rn, void *vw)
 {
 	struct walkarg *w = vw;
 	struct rtentry *rt = (struct rtentry *)rn;
 	int error = 0, size;
 	struct rt_addrinfo info;
 	struct sockaddr_storage ss;
 
 	if (w->w_op == NET_RT_FLAGS && !(rt->rt_flags & w->w_arg))
 		return 0;
 	if ((rt->rt_flags & RTF_HOST) == 0
 	    ? jailed_without_vnet(w->w_req->td->td_ucred)
 	    : prison_if(w->w_req->td->td_ucred, rt_key(rt)) != 0)
 		return (0);
 	bzero((caddr_t)&info, sizeof(info));
 	info.rti_info[RTAX_DST] = rt_key(rt);
 	info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
 	info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(rt_key(rt),
 	    rt_mask(rt), &ss);
 	info.rti_info[RTAX_GENMASK] = 0;
 	if (rt->rt_ifp) {
 		info.rti_info[RTAX_IFP] = rt->rt_ifp->if_addr->ifa_addr;
 		info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr;
 		if (rt->rt_ifp->if_flags & IFF_POINTOPOINT)
 			info.rti_info[RTAX_BRD] = rt->rt_ifa->ifa_dstaddr;
 	}
 	if ((error = rtsock_msg_buffer(RTM_GET, &info, w, &size)) != 0)
 		return (error);
 	if (w->w_req && w->w_tmem) {
 		struct rt_msghdr *rtm = (struct rt_msghdr *)w->w_tmem;
 
 		if (rt->rt_flags & RTF_GWFLAG_COMPAT)
 			rtm->rtm_flags = RTF_GATEWAY | 
 				(rt->rt_flags & ~RTF_GWFLAG_COMPAT);
 		else
 			rtm->rtm_flags = rt->rt_flags;
 		rt_getmetrics(rt, &rtm->rtm_rmx);
 		rtm->rtm_index = rt->rt_ifp->if_index;
 		rtm->rtm_errno = rtm->rtm_pid = rtm->rtm_seq = 0;
 		rtm->rtm_addrs = info.rti_addrs;
 		error = SYSCTL_OUT(w->w_req, (caddr_t)rtm, size);
 		return (error);
 	}
 	return (error);
 }
 
 static int
 sysctl_iflist_ifml(struct ifnet *ifp, struct rt_addrinfo *info,
     struct walkarg *w, int len)
 {
 	struct if_msghdrl *ifm;
 	struct if_data *ifd;
 
 	ifm = (struct if_msghdrl *)w->w_tmem;
 
 #ifdef COMPAT_FREEBSD32
 	if (w->w_req->flags & SCTL_MASK32) {
 		struct if_msghdrl32 *ifm32;
 
 		ifm32 = (struct if_msghdrl32 *)ifm;
 		ifm32->ifm_addrs = info->rti_addrs;
 		ifm32->ifm_flags = ifp->if_flags | ifp->if_drv_flags;
 		ifm32->ifm_index = ifp->if_index;
 		ifm32->_ifm_spare1 = 0;
 		ifm32->ifm_len = sizeof(*ifm32);
 		ifm32->ifm_data_off = offsetof(struct if_msghdrl32, ifm_data);
 		ifd = &ifm32->ifm_data;
 	} else
 #endif
 	{
 		ifm->ifm_addrs = info->rti_addrs;
 		ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags;
 		ifm->ifm_index = ifp->if_index;
 		ifm->_ifm_spare1 = 0;
 		ifm->ifm_len = sizeof(*ifm);
 		ifm->ifm_data_off = offsetof(struct if_msghdrl, ifm_data);
 		ifd = &ifm->ifm_data;
 	}
 
 	if_data_copy(ifp, ifd);
 
 	/* Some drivers still use ifqueue(9), add its stats. */
 	ifd->ifi_oqdrops += ifp->if_snd.ifq_drops;
 
 	return (SYSCTL_OUT(w->w_req, (caddr_t)ifm, len));
 }
 
 static int
 sysctl_iflist_ifm(struct ifnet *ifp, struct rt_addrinfo *info,
     struct walkarg *w, int len)
 {
 	struct if_msghdr *ifm;
 	struct if_data *ifd;
 
 	ifm = (struct if_msghdr *)w->w_tmem;
 
 #ifdef COMPAT_FREEBSD32
 	if (w->w_req->flags & SCTL_MASK32) {
 		struct if_msghdr32 *ifm32;
 
 		ifm32 = (struct if_msghdr32 *)ifm;
 		ifm32->ifm_addrs = info->rti_addrs;
 		ifm32->ifm_flags = ifp->if_flags | ifp->if_drv_flags;
 		ifm32->ifm_index = ifp->if_index;
 		ifd = &ifm32->ifm_data;
 	} else
 #endif
 	{
 		ifm->ifm_addrs = info->rti_addrs;
 		ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags;
 		ifm->ifm_index = ifp->if_index;
 		ifd = &ifm->ifm_data;
 	}
 
 	if_data_copy(ifp, ifd);
 
 	/* Some drivers still use ifqueue(9), add its stats. */
 	ifd->ifi_oqdrops += ifp->if_snd.ifq_drops;
 
 	return (SYSCTL_OUT(w->w_req, (caddr_t)ifm, len));
 }
 
 /*
  * Finish the address record already laid out in w->w_tmem using the
  * ifa_msghdrl layout (or ifa_msghdrl32 for SCTL_MASK32 requests under
  * COMPAT_FREEBSD32), fill its if_data with the per-address counters and
  * copy `len' bytes out to the requester.  Returns the result of
  * SYSCTL_OUT().
  */
 static int
 sysctl_iflist_ifaml(struct ifaddr *ifa, struct rt_addrinfo *info,
     struct walkarg *w, int len)
 {
 	struct ifa_msghdrl *hdr;
 	struct if_data *stats;
 
 	hdr = (struct ifa_msghdrl *)w->w_tmem;
 
 #ifdef COMPAT_FREEBSD32
 	if (w->w_req->flags & SCTL_MASK32) {
 		struct ifa_msghdrl32 *hdr32 = (struct ifa_msghdrl32 *)hdr;
 
 		hdr32->ifam_len = sizeof(*hdr32);
 		hdr32->ifam_data_off =
 		    offsetof(struct ifa_msghdrl32, ifam_data);
 		hdr32->_ifam_spare1 = 0;
 		hdr32->ifam_index = ifa->ifa_ifp->if_index;
 		hdr32->ifam_metric = ifa->ifa_ifp->if_metric;
 		hdr32->ifam_flags = ifa->ifa_flags;
 		hdr32->ifam_addrs = info->rti_addrs;
 		stats = &hdr32->ifam_data;
 	} else
 #endif
 	{
 		hdr->ifam_len = sizeof(*hdr);
 		hdr->ifam_data_off = offsetof(struct ifa_msghdrl, ifam_data);
 		hdr->_ifam_spare1 = 0;
 		hdr->ifam_index = ifa->ifa_ifp->if_index;
 		hdr->ifam_metric = ifa->ifa_ifp->if_metric;
 		hdr->ifam_flags = ifa->ifa_flags;
 		hdr->ifam_addrs = info->rti_addrs;
 		stats = &hdr->ifam_data;
 	}
 
 	/* Only the four packet/byte counters are kept per address. */
 	bzero(stats, sizeof(*stats));
 	stats->ifi_datalen = sizeof(struct if_data);
 	stats->ifi_ipackets = counter_u64_fetch(ifa->ifa_ipackets);
 	stats->ifi_opackets = counter_u64_fetch(ifa->ifa_opackets);
 	stats->ifi_ibytes = counter_u64_fetch(ifa->ifa_ibytes);
 	stats->ifi_obytes = counter_u64_fetch(ifa->ifa_obytes);
 
 	/* Fixup if_data carp(4) vhid. */
 	if (carp_get_vhid_p != NULL)
 		stats->ifi_vhid = (*carp_get_vhid_p)(ifa);
 
 	return (SYSCTL_OUT(w->w_req, w->w_tmem, len));
 }
 
 static int
 sysctl_iflist_ifam(struct ifaddr *ifa, struct rt_addrinfo *info,
     struct walkarg *w, int len)
 {
 	struct ifa_msghdr *ifam;
 
 	ifam = (struct ifa_msghdr *)w->w_tmem;
 	ifam->ifam_addrs = info->rti_addrs;
 	ifam->ifam_flags = ifa->ifa_flags;
 	ifam->ifam_index = ifa->ifa_ifp->if_index;
 	ifam->ifam_metric = ifa->ifa_ifp->if_metric;
 
 	return (SYSCTL_OUT(w->w_req, w->w_tmem, len));
 }
 
 /*
  * Handler for NET_RT_IFLIST / NET_RT_IFLISTL: for every interface on
  * V_ifnet (or only the one whose index equals w->w_arg when that is
  * non-zero) emit one RTM_IFINFO record followed by one RTM_NEWADDR record
  * for each address that follows ifp->if_addr on the address list.
  * Addresses are skipped when `af' is non-zero and does not match, or when
  * prison_if() rejects them for the requesting credential.
  *
  * Returns 0 or the first error from rtsock_msg_buffer() / the copy-out
  * helpers.
  */
 static int
 sysctl_iflist(int af, struct walkarg *w)
 {
 	struct ifnet *ifp;
 	struct ifaddr *ifa;
 	struct rt_addrinfo info;
 	int len, error = 0;
 	struct sockaddr_storage ss;
 
 	bzero((caddr_t)&info, sizeof(info));
 	IFNET_RLOCK_NOSLEEP();
 	TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if (w->w_arg && w->w_arg != ifp->if_index)
 			continue;
 		IF_ADDR_RLOCK(ifp);
 		/* The interface record carries only the RTAX_IFP address. */
 		ifa = ifp->if_addr;
 		info.rti_info[RTAX_IFP] = ifa->ifa_addr;
 		error = rtsock_msg_buffer(RTM_IFINFO, &info, w, &len);
 		if (error != 0)
 			goto done;
 		/* RTAX_IFP is not repeated in the address records below. */
 		info.rti_info[RTAX_IFP] = NULL;
 		if (w->w_req && w->w_tmem) {
 			if (w->w_op == NET_RT_IFLISTL)
 				error = sysctl_iflist_ifml(ifp, &info, w, len);
 			else
 				error = sysctl_iflist_ifm(ifp, &info, w, len);
 			if (error)
 				goto done;
 		}
 		/* Walk the addresses that come after ifp->if_addr. */
 		while ((ifa = TAILQ_NEXT(ifa, ifa_link)) != NULL) {
 			if (af && af != ifa->ifa_addr->sa_family)
 				continue;
 			if (prison_if(w->w_req->td->td_ucred,
 			    ifa->ifa_addr) != 0)
 				continue;
 			info.rti_info[RTAX_IFA] = ifa->ifa_addr;
 			info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(
 			    ifa->ifa_addr, ifa->ifa_netmask, &ss);
 			info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr;
 			error = rtsock_msg_buffer(RTM_NEWADDR, &info, w, &len);
 			if (error != 0)
 				goto done;
 			if (w->w_req && w->w_tmem) {
 				if (w->w_op == NET_RT_IFLISTL)
 					error = sysctl_iflist_ifaml(ifa, &info,
 					    w, len);
 				else
 					error = sysctl_iflist_ifam(ifa, &info,
 					    w, len);
 				if (error)
 					goto done;
 			}
 		}
 		IF_ADDR_RUNLOCK(ifp);
 		/* Reset the per-address slots before the next interface. */
 		info.rti_info[RTAX_IFA] = NULL;
 		info.rti_info[RTAX_NETMASK] = NULL;
 		info.rti_info[RTAX_BRD] = NULL;
 	}
 done:
 	/*
 	 * Every "goto done" above happens with the address lock of the
 	 * current ifp held, while a loop that ran to completion leaves ifp
 	 * NULL with no address lock held; hence the conditional unlock.
 	 */
 	if (ifp != NULL)
 		IF_ADDR_RUNLOCK(ifp);
 	IFNET_RUNLOCK_NOSLEEP();
 	return (error);
 }
 
 /*
  * Handler for NET_RT_IFMALIST: for every interface on V_ifnet (or only
  * the one whose index equals w->w_arg when that is non-zero) emit one
  * RTM_NEWMADDR record per multicast address.  Addresses are skipped when
  * `af' is non-zero and does not match, or when prison_if() rejects them
  * for the requesting credential.
  *
  * Returns 0 or the first error from rtsock_msg_buffer() / SYSCTL_OUT().
  * Every exit path, including both error paths, leaves the inner loop via
  * `break' so that the interface address lock taken for the current ifp is
  * always released before the interface list lock is dropped.
  */
 static int
 sysctl_ifmalist(int af, struct walkarg *w)
 {
 	struct ifnet *ifp;
 	struct ifmultiaddr *ifma;
 	struct	rt_addrinfo info;
 	int	len, error = 0;
 	struct ifaddr *ifa;
 
 	bzero((caddr_t)&info, sizeof(info));
 	IFNET_RLOCK_NOSLEEP();
 	TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if (w->w_arg && w->w_arg != ifp->if_index)
 			continue;
 		ifa = ifp->if_addr;
 		info.rti_info[RTAX_IFP] = ifa ? ifa->ifa_addr : NULL;
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 			if (af && af != ifma->ifma_addr->sa_family)
 				continue;
 			if (prison_if(w->w_req->td->td_ucred,
 			    ifma->ifma_addr) != 0)
 				continue;
 			info.rti_info[RTAX_IFA] = ifma->ifma_addr;
 			/* Link-layer groups have no separate lladdr to show. */
 			info.rti_info[RTAX_GATEWAY] =
 			    (ifma->ifma_addr->sa_family != AF_LINK) ?
 			    ifma->ifma_lladdr : NULL;
 			error = rtsock_msg_buffer(RTM_NEWMADDR, &info, w, &len);
 			if (error != 0)
 				break;
 			if (w->w_req && w->w_tmem) {
 				struct ifma_msghdr *ifmam;
 
 				ifmam = (struct ifma_msghdr *)w->w_tmem;
 				ifmam->ifmam_index = ifma->ifma_ifp->if_index;
 				ifmam->ifmam_flags = 0;
 				ifmam->ifmam_addrs = info.rti_addrs;
 				error = SYSCTL_OUT(w->w_req, w->w_tmem, len);
 				if (error != 0)
 					break;
 			}
 		}
 		IF_ADDR_RUNLOCK(ifp);
 		if (error != 0)
 			break;
 	}
 	IFNET_RUNLOCK_NOSLEEP();
 	return (error);
 }
 
 /*
  * sysctl handler for the net.routetable node (read-only: a request with
  * a new value is rejected with EPERM).
  *
  * After the first name component is skipped, the remaining components
  * are interpreted as:
  *   name[0]  address family (0 = all; values above AF_MAX give EINVAL)
  *   name[1]  operation (NET_RT_DUMP, NET_RT_FLAGS, NET_RT_IFLIST,
  *            NET_RT_IFLISTL, NET_RT_IFMALIST)
  *   name[2]  operation argument, stored in w.w_arg
  *   name[3]  optional fib number, accepted for NET_RT_DUMP only
  *
  * A 64KB scratch buffer is allocated for building the reply records and
  * released before returning.
  */
 static int
 sysctl_rtsock(SYSCTL_HANDLER_ARGS)
 {
 	int	*name = (int *)arg1;
 	u_int	namelen = arg2;
 	struct radix_node_head *rnh = NULL; /* silence compiler. */
 	int	i, lim, error = EINVAL;
 	int	fib = 0;
 	u_char	af;
 	struct	walkarg w;
 
 	/* Skip the first component of the name. */
 	name ++;
 	namelen--;
 	if (req->newptr)
 		return (EPERM);
 	/*
 	 * NOTE(review): name[1] is examined here before namelen has been
 	 * validated; confirm the caller guarantees enough components.
 	 */
 	if (name[1] == NET_RT_DUMP) {
 		/* Without an explicit fib, use the calling process's fib. */
 		if (namelen == 3)
 			fib = req->td->td_proc->p_fibnum;
 		else if (namelen == 4)
 			fib = (name[3] == RT_ALL_FIBS) ?
 			    req->td->td_proc->p_fibnum : name[3];
 		else
 			return ((namelen < 3) ? EISDIR : ENOTDIR);
 		if (fib < 0 || fib >= rt_numfibs)
 			return (EINVAL);
 	} else if (namelen != 3)
 		return ((namelen < 3) ? EISDIR : ENOTDIR);
 	af = name[0];
 	if (af > AF_MAX)
 		return (EINVAL);
 	bzero(&w, sizeof(w));
 	w.w_op = name[1];
 	w.w_arg = name[2];
 	w.w_req = req;
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error)
 		return (error);
 	
 	/*
 	 * Allocate the reply buffer in advance.
 	 * The length of any rtsock message fits in a u_short, so 64KB
 	 * is enough for a single message.
 	 */
 	w.w_tmemsize = 65536;
 	w.w_tmem = malloc(w.w_tmemsize, M_TEMP, M_WAITOK);
 
 	/*
 	 * There is no default case: for an operation not listed below
 	 * `error' keeps the 0 stored by the successful
 	 * sysctl_wire_old_buffer() call above.
 	 */
 	switch (w.w_op) {
 
 	case NET_RT_DUMP:
 	case NET_RT_FLAGS:
 		if (af == 0) {			/* dump all tables */
 			i = 1;
 			lim = AF_MAX;
 		} else				/* dump only one table */
 			i = lim = af;
 
 		/*
 		 * take care of llinfo entries, the caller must
 		 * specify an AF
 		 */
 		if (w.w_op == NET_RT_FLAGS &&
 		    (w.w_arg == 0 || w.w_arg & RTF_LLINFO)) {
 			if (af != 0)
 				error = lltable_sysctl_dumparp(af, w.w_req);
 			else
 				error = EINVAL;
 			break;
 		}
 		/*
 		 * take care of routing entries
 		 */
 		for (error = 0; error == 0 && i <= lim; i++) {
 			rnh = rt_tables_get_rnh(fib, i);
 			if (rnh != NULL) {
 				RADIX_NODE_HEAD_RLOCK(rnh); 
 			    	error = rnh->rnh_walktree(rnh,
 				    sysctl_dumpentry, &w);
 				RADIX_NODE_HEAD_RUNLOCK(rnh);
 			} else if (af != 0)
 				/* A specific family was asked for but has no table. */
 				error = EAFNOSUPPORT;
 		}
 		break;
 
 	case NET_RT_IFLIST:
 	case NET_RT_IFLISTL:
 		error = sysctl_iflist(af, &w);
 		break;
 
 	case NET_RT_IFMALIST:
 		error = sysctl_ifmalist(af, &w);
 		break;
 	}
 
 	free(w.w_tmem, M_TEMP);
 	return (error);
 }
 
 /* Read-only sysctl node under _net, served by sysctl_rtsock(). */
 static SYSCTL_NODE(_net, PF_ROUTE, routetable, CTLFLAG_RD, sysctl_rtsock, "");
 
 /*
  * Definitions of protocols supported in the ROUTE domain.
  */
 
 static struct domain routedomain;		/* or at least forward */
 
 /* The ROUTE domain has a single protocol: raw routing sockets. */
 static struct protosw routesw[] = {
 {
 	.pr_type =		SOCK_RAW,
 	.pr_domain =		&routedomain,
 	.pr_flags =		PR_ATOMIC|PR_ADDR,
 	.pr_output =		route_output,
 	.pr_ctlinput =		raw_ctlinput,
 	.pr_init =		raw_init,
 	.pr_usrreqs =		&route_usrreqs
 }
 };
 
 static struct domain routedomain = {
 	.dom_family =		PF_ROUTE,
 	.dom_name =		 "route",
 	.dom_protosw =		routesw,
 	/* One past the last element of routesw[]. */
 	.dom_protoswNPROTOSW =	&routesw[sizeof(routesw)/sizeof(routesw[0])]
 };
 
 VNET_DOMAIN_SET(route);
Index: user/ae/inet6/sys/netinet/in_pcb.c
===================================================================
--- user/ae/inet6/sys/netinet/in_pcb.c	(revision 271452)
+++ user/ae/inet6/sys/netinet/in_pcb.c	(revision 271453)
@@ -1,2630 +1,2633 @@
 /*-
  * Copyright (c) 1982, 1986, 1991, 1993, 1995
  *	The Regents of the University of California.
  * Copyright (c) 2007-2009 Robert N. M. Watson
  * Copyright (c) 2010-2011 Juniper Networks, Inc.
  * All rights reserved.
  *
  * Portions of this software were developed by Robert N. M. Watson under
  * contract to Juniper Networks, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)in_pcb.c	8.4 (Berkeley) 5/24/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_ipsec.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_pcbgroup.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/callout.h>
 #include <sys/domain.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/refcount.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #include <vm/uma.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_types.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #if defined(INET) || defined(INET6)
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_rss.h>
 #include <netinet/ip_var.h>
 #include <netinet/tcp_var.h>
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 #endif
 #ifdef INET
 #include <netinet/in_var.h>
 #endif
 #ifdef INET6
 #include <netinet/ip6.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/ip6_var.h>
 #endif /* INET6 */
 
 
 #ifdef IPSEC
 #include <netipsec/ipsec.h>
 #include <netipsec/key.h>
 #endif /* IPSEC */
 
 #include <security/mac/mac_framework.h>
 
 /*
  * Callout handle.
  * NOTE(review): presumably used to schedule ipport_tick() -- confirm
  * against the rest of the file.
  */
 static struct callout	ipport_tick_callout;
 
 /*
  * These configure the range of local port addresses assigned to
  * "unspecified" outgoing connections/packets/whatever.
  */
 VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1;	/* 1023 */
 VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART;	/* 600 */
 VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST;	/* 10000 */
 VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST;	/* 65535 */
 VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO;	/* 49152 */
 VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO;	/* 65535 */
 
 /*
  * Reserved ports accessible only to root. There are significant
  * security considerations that must be accounted for when changing these,
  * but the security benefits can be great. Please be careful.
  */
 VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1;	/* 1023 */
 VNET_DEFINE(int, ipport_reservedlow);
 
 /* Variables dealing with random ephemeral port allocation. */
 VNET_DEFINE(int, ipport_randomized) = 1;	/* user controlled via sysctl */
 VNET_DEFINE(int, ipport_randomcps) = 10;	/* user controlled via sysctl */
 VNET_DEFINE(int, ipport_randomtime) = 45;	/* user controlled via sysctl */
 VNET_DEFINE(int, ipport_stoprandom);		/* toggled by ipport_tick */
 VNET_DEFINE(int, ipport_tcpallocs);
 static VNET_DEFINE(int, ipport_tcplastcount);
 
 /* Shorthand for reaching ipport_tcplastcount through the VNET() macro. */
 #define	V_ipport_tcplastcount		VNET(ipport_tcplastcount)
 
 /* Forward declarations of file-local helpers defined further down. */
 static void	in_pcbremlists(struct inpcb *inp);
 #ifdef INET
 static struct inpcb	*in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo,
 			    struct in_addr faddr, u_int fport_arg,
 			    struct in_addr laddr, u_int lport_arg,
 			    int lookupflags, struct ifnet *ifp);
 
 /* Clamp `var' in place so that it lies within [min, max]. */
 #define RANGECHK(var, min, max) do { \
 	if ((var) < (min)) \
 		(var) = (min); \
 	else if ((var) > (max)) \
 		(var) = (max); \
 } while (0)
 
 /*
  * sysctl handler shared by the port range bounds: let
  * sysctl_handle_int() process the request and, if that succeeded,
  * clamp all six range variables back into their permitted intervals.
  * Returns the result of sysctl_handle_int().
  */
 static int
 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 
 	error = sysctl_handle_int(oidp, arg1, arg2, req);
 	if (error != 0)
 		return (error);
 
 	/* Low (reserved) range: 1 .. IPPORT_RESERVED - 1. */
 	RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
 	RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
 	/* Default and high ranges: IPPORT_RESERVED .. IPPORT_MAX. */
 	RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX);
 	RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX);
 	RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX);
 	RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX);
 
 	return (0);
 }
 
 #undef RANGECHK
 
 /*
  * Port range sysctls, all children of the "portrange" node declared
  * first.  The six range bounds (lowfirst, lowlast, first, last, hifirst,
  * hilast) share sysctl_net_ipport_check() as their handler, which clamps
  * every bound after any successful access; the remaining knobs are plain
  * integers.
  */
 static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0,
     "IP Ports");
 
 SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
 	CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_lowfirstauto), 0,
 	&sysctl_net_ipport_check, "I", "");
 SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast,
 	CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_lowlastauto), 0,
 	&sysctl_net_ipport_check, "I", "");
 SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, first,
 	CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_firstauto), 0,
 	&sysctl_net_ipport_check, "I", "");
 SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, last,
 	CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_lastauto), 0,
 	&sysctl_net_ipport_check, "I", "");
 SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst,
 	CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_hifirstauto), 0,
 	&sysctl_net_ipport_check, "I", "");
 SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, hilast,
 	CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_hilastauto), 0,
 	&sysctl_net_ipport_check, "I", "");
 /* The reserved-port bounds additionally carry CTLFLAG_SECURE. */
 SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh,
 	CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedhigh), 0, "");
 SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow,
 	CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, "");
 /* Knobs for random ephemeral port allocation. */
 SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, randomized, CTLFLAG_RW,
 	&VNET_NAME(ipport_randomized), 0, "Enable random port allocation");
 SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, randomcps, CTLFLAG_RW,
 	&VNET_NAME(ipport_randomcps), 0, "Maximum number of random port "
 	"allocations before switching to a sequental one");
 SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, randomtime, CTLFLAG_RW,
 	&VNET_NAME(ipport_randomtime), 0,
 	"Minimum time to keep sequental port "
 	"allocation before switching to a random one");
 #endif /* INET */
 
 /*
  * in_pcb.c: manage the Protocol Control Blocks.
  *
  * NOTE: It is assumed that most of these functions will be called with
  * the pcbinfo lock held, and often, the inpcb lock held, as these utility
  * functions often modify hash chains or addresses in pcbs.
  */
 
 /*
  * Initialize an inpcbinfo: its locks, the (empty) pcb list, the
  * connection and port hash tables, optionally the pcbgroups, and the UMA
  * zone inpcbs are allocated from (capped at maxsockets).  We should be
  * able to reduce the number of arguments in time.
  */
 void
 in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name,
     struct inpcbhead *listhead, int hash_nelements, int porthash_nelements,
     char *inpcbzone_name, uma_init inpcbzone_init, uma_fini inpcbzone_fini,
     uint32_t inpcbzone_flags, u_int hashfields)
 {
 	uma_zone_t zone;
 
 	/* Locks first. */
 	INP_INFO_LOCK_INIT(pcbinfo, name);
 	INP_HASH_LOCK_INIT(pcbinfo, "pcbinfohash");	/* XXXRW: argument? */
 #ifdef VIMAGE
 	pcbinfo->ipi_vnet = curvnet;
 #endif
 
 	/* Start out with an empty list of pcbs. */
 	pcbinfo->ipi_count = 0;
 	pcbinfo->ipi_listhead = listhead;
 	LIST_INIT(listhead);
 
 	/* Lookup hashes. */
 	pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB,
 	    &pcbinfo->ipi_hashmask);
 	pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
 	    &pcbinfo->ipi_porthashmask);
 #ifdef PCBGROUP
 	in_pcbgroup_init(pcbinfo, hashfields, hash_nelements);
 #endif
 
 	/* Backing zone for the inpcbs themselves. */
 	zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb), NULL, NULL,
 	    inpcbzone_init, inpcbzone_fini, UMA_ALIGN_PTR, inpcbzone_flags);
 	pcbinfo->ipi_zone = zone;
 	uma_zone_set_max(zone, maxsockets);
 	uma_zone_set_warning(zone, "kern.ipc.maxsockets limit reached");
 }
 
 /*
  * Destroy an inpcbinfo: release the two hash tables, the pcbgroups (when
  * compiled with PCBGROUP), the UMA zone and finally both locks.  The
  * structure must no longer hold any pcbs (asserted via ipi_count).
  */
 void
 in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
 {
 
 	KASSERT(pcbinfo->ipi_count == 0,
 	    ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count));
 
 	hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask);
 	hashdestroy(pcbinfo->ipi_porthashbase, M_PCB,
 	    pcbinfo->ipi_porthashmask);
 #ifdef PCBGROUP
 	in_pcbgroup_destroy(pcbinfo);
 #endif
 	uma_zdestroy(pcbinfo->ipi_zone);
 	/* Locks go last, in the reverse of the order they are set up in. */
 	INP_HASH_LOCK_DESTROY(pcbinfo);
 	INP_INFO_LOCK_DESTROY(pcbinfo);
 }
 
 /*
  * Allocate a PCB and associate it with the socket.
  * On success return with the PCB locked.
  *
  * The caller must hold the pcbinfo write lock (asserted).  Returns
  * ENOBUFS when the zone allocation fails, or the error from the MAC /
  * IPSEC initialization when those are compiled in; on any error the
  * partially set up pcb is released again and so->so_pcb is left
  * untouched.
  */
 int
 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
 {
 	struct inpcb *inp;
 	int error;
 
 	INP_INFO_WLOCK_ASSERT(pcbinfo);
 	error = 0;
 	inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT);
 	if (inp == NULL)
 		return (ENOBUFS);
 	/* Only the first inp_zero_size bytes are cleared. */
 	bzero(inp, inp_zero_size);
 	inp->inp_pcbinfo = pcbinfo;
 	inp->inp_socket = so;
 	/* The pcb keeps its own reference on the socket's credential. */
 	inp->inp_cred = crhold(so->so_cred);
 	inp->inp_inc.inc_fibnum = so->so_fibnum;
 #ifdef MAC
 	error = mac_inpcb_init(inp, M_NOWAIT);
 	if (error != 0)
 		goto out;
 	mac_inpcb_create(so, inp);
 #endif
 #ifdef IPSEC
 	error = ipsec_init_policy(so, &inp->inp_sp);
 	if (error != 0) {
 #ifdef MAC
 		/* Undo the MAC initialization done just above. */
 		mac_inpcb_destroy(inp);
 #endif
 		goto out;
 	}
 #endif /*IPSEC*/
 #ifdef INET6
 	if (INP_SOCKAF(so) == AF_INET6) {
 		inp->inp_vflag |= INP_IPV6PROTO;
 		if (V_ip6_v6only)
 			inp->inp_flags |= IN6P_IPV6_V6ONLY;
 	}
 #endif
 	/* From here on the pcb is visible on the pcbinfo list. */
 	LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list);
 	pcbinfo->ipi_count++;
 	so->so_pcb = (caddr_t)inp;
 #ifdef INET6
 	if (V_ip6_auto_flowlabel)
 		inp->inp_flags |= IN6P_AUTOFLOWLABEL;
 #endif
 	INP_WLOCK(inp);
 	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
 	refcount_init(&inp->inp_refcount, 1);	/* Reference from inpcbinfo */
 	/*
 	 * The label and the cleanup below exist only when IPSEC or MAC is
 	 * compiled in, since those are the only sources of "goto out".
 	 */
 #if defined(IPSEC) || defined(MAC)
 out:
 	if (error != 0) {
 		crfree(inp->inp_cred);
 		uma_zfree(pcbinfo->ipi_zone, inp);
 	}
 #endif
 	return (error);
 }
 
 #ifdef INET
 /*
  * Bind a not-yet-bound IPv4 pcb to the local address/port requested in
  * `nam' (which may be NULL) and insert it into the hash.  The caller
  * holds the inpcb write lock and the pcbinfo hash write lock (asserted).
  *
  * Returns EINVAL if the pcb already has a local port or address, the
  * error from in_pcbbind_setup(), EAGAIN if hashing failed (the local
  * address and port are reset in that case), or 0 on success.
  */
 int
 in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
 {
 	struct sockaddr_in *sin;
 	int anonport, error;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
 
 	if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
 		return (EINVAL);
 
 	/* No address, or port 0, means the port gets picked for the caller. */
 	sin = (struct sockaddr_in *)nam;
 	anonport = (sin == NULL || sin->sin_port == 0);
 
 	error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr,
 	    &inp->inp_lport, cred);
 	if (error != 0)
 		return (error);
 
 	if (in_pcbinshash(inp) != 0) {
 		/* Roll back what in_pcbbind_setup() stored. */
 		inp->inp_lport = 0;
 		inp->inp_laddr.s_addr = INADDR_ANY;
 		return (EAGAIN);
 	}
 
 	if (anonport)
 		inp->inp_flags |= INP_ANONPORT;
 	return (0);
 }
 #endif
 
 /*
  * Select a local port (number) to use.
  *
  * The port range and the per-pcbinfo "last port" cursor are chosen from the
  * INP_HIGHPORT / INP_LOWPORT flags on the inp; an INP_LOWPORT request must
  * additionally pass the PRIV_NETINET_RESERVEDPORT privilege check.
  * Candidate ports are probed with in6_pcblookup_local() or
  * in_pcblookup_local() until one is found for which no PCB is returned.
  *
  * inp		PCB requesting a port; asserted locked.
  * laddrp	local IPv4 address to probe with; must be non-NULL when the
  *		inp is IPv4-only and is not dereferenced otherwise.
  * lportp	receives the selected port in network byte order.
  * cred		credential used for the privilege check and the lookups.
  * lookupflags	flags handed through to the local PCB lookup.
  *
  * Returns 0 on success, the priv_check_cred() error for a refused low-port
  * request, or EADDRNOTAVAIL when every port in the range has been tried.
  * *lportp is only written on success.
  */
 #if defined(INET) || defined(INET6)
 int
 in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
     struct ucred *cred, int lookupflags)
 {
 	struct inpcbinfo *pcbinfo;
 	struct inpcb *tmpinp;
 	unsigned short *lastport;
 	int count, dorandom, error;
 	u_short aux, first, last, lport;
 #ifdef INET
 	struct in_addr laddr;
 #endif
 
 	pcbinfo = inp->inp_pcbinfo;
 
 	/*
 	 * Because no actual state changes occur here, a global write lock on
 	 * the pcbinfo isn't required.
 	 */
 	INP_LOCK_ASSERT(inp);
 	INP_HASH_LOCK_ASSERT(pcbinfo);
 
 	if (inp->inp_flags & INP_HIGHPORT) {
 		first = V_ipport_hifirstauto;	/* sysctl */
 		last  = V_ipport_hilastauto;
 		lastport = &pcbinfo->ipi_lasthi;
 	} else if (inp->inp_flags & INP_LOWPORT) {
 		error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0);
 		if (error)
 			return (error);
 		first = V_ipport_lowfirstauto;	/* 1023 */
 		last  = V_ipport_lowlastauto;	/* 600 */
 		lastport = &pcbinfo->ipi_lastlow;
 	} else {
 		first = V_ipport_firstauto;	/* sysctl */
 		last  = V_ipport_lastauto;
 		lastport = &pcbinfo->ipi_lastport;
 	}
 	/*
 	 * For UDP(-Lite), use random port allocation as long as the user
 	 * allows it.  For TCP (and as of yet unknown) connections,
 	 * use random port allocation only if the user allows it AND
 	 * ipport_tick() allows it.
 	 */
 	if (V_ipport_randomized &&
 	    (!V_ipport_stoprandom || pcbinfo == &V_udbinfo ||
 	    pcbinfo == &V_ulitecbinfo))
 		dorandom = 1;
 	else
 		dorandom = 0;
 	/*
 	 * It makes no sense to do random port allocation if
 	 * we have the only port available.
 	 */
 	if (first == last)
 		dorandom = 0;
 	/*
 	 * Make sure to not include UDP(-Lite) packets in the count: the
 	 * counter is bumped only when pcbinfo is neither of the two UDP
 	 * tables.  Both comparisons must hold, hence "&&"; joining them
 	 * with "||" would be true for every pcbinfo.
 	 */
 	if (pcbinfo != &V_udbinfo && pcbinfo != &V_ulitecbinfo)
 		V_ipport_tcpallocs++;
 	/*
 	 * Instead of having two loops further down counting up or down
 	 * make sure that first is always <= last and go with only one
 	 * code path implementing all logic.
 	 */
 	if (first > last) {
 		aux = first;
 		first = last;
 		last = aux;
 	}
 
 #ifdef INET
 	/* Make the compiler happy. */
 	laddr.s_addr = 0;
 	if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) {
 		KASSERT(laddrp != NULL, ("%s: laddrp NULL for v4 inp %p",
 		    __func__, inp));
 		laddr = *laddrp;
 	}
 #endif
 	tmpinp = NULL;	/* Make compiler happy. */
 	lport = *lportp;
 
 	/*
 	 * Start the scan at a random point of the range.  dorandom is only
 	 * set when first != last, so the modulus is never zero.
 	 */
 	if (dorandom)
 		*lastport = first + (arc4random() % (last - first));
 
 	count = last - first;
 
 	/* Advance the cursor, wrapping inside [first, last], until free. */
 	do {
 		if (count-- < 0)	/* completely used? */
 			return (EADDRNOTAVAIL);
 		++*lastport;
 		if (*lastport < first || *lastport > last)
 			*lastport = first;
 		lport = htons(*lastport);
 
 #ifdef INET6
 		if ((inp->inp_vflag & INP_IPV6) != 0)
 			tmpinp = in6_pcblookup_local(pcbinfo,
 			    &inp->in6p_laddr, inp->in6p_zoneid, lport,
 			    lookupflags, cred);
 #endif
 #if defined(INET) && defined(INET6)
 		else
 #endif
 #ifdef INET
 			tmpinp = in_pcblookup_local(pcbinfo, laddr,
 			    lport, lookupflags, cred);
 #endif
 	} while (tmpinp != NULL);
 
 #ifdef INET
 	if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4)
 		laddrp->s_addr = laddr.s_addr;
 #endif
 	*lportp = lport;
 
 	return (0);
 }
 
 /*
  * Translate the reuse flags cached in inp_flags2 (INP_REUSEADDR and
  * INP_REUSEPORT) into the matching SO_REUSEADDR / SO_REUSEPORT bits.
  */
 short
 inp_so_options(const struct inpcb *inp)
 {
 	short opts = 0;
 
 	if (inp->inp_flags2 & INP_REUSEADDR)
 		opts |= SO_REUSEADDR;
 	if (inp->inp_flags2 & INP_REUSEPORT)
 		opts |= SO_REUSEPORT;
 
 	return (opts);
 }
 #endif /* INET || INET6 */
 
 /*
  * Decide whether a new socket may share a binding with an existing one
  * under the BINDMULTI rules.
  *
  * ni points to the new inp.
  * oi points to the existing inp.
  *
  * Returns 1 when the new inp does not have INP_BINDMULTI set, or when it
  * does and the existing inp both has INP_BINDMULTI set and carries the same
  * credential uid.  Returns 0 otherwise.
  */
 int
 in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi)
 {
 
 	/* Nothing to check unless the new inp asks for BINDMULTI. */
 	if ((ni->inp_flags2 & INP_BINDMULTI) == 0)
 		return (1);
 
 	/* The owning uids must match. */
 	if (ni->inp_cred->cr_uid != oi->inp_cred->cr_uid)
 		return (0);
 
 	/* The existing inp must have been bound with BINDMULTI as well. */
 	if ((oi->inp_flags2 & INP_BINDMULTI) == 0)
 		return (0);
 
 	return (1);
 }
 
 #ifdef INET
 /*
  * Set up a bind operation on a PCB, performing port allocation
  * as required, but do not actually modify the PCB. Callers can
  * either complete the bind by setting inp_laddr/inp_lport and
  * calling in_pcbinshash(), or they can just use the resulting
  * port and address to authorise the sending of a once-off packet.
  *
  * inp		PCB being bound; asserted locked.
  * nam		requested address as a sockaddr_in, or NULL to keep *laddrp
  *		(subject to prison_local_ip4()) and pick a port automatically.
  *		Note that sin_port and sin_zero in nam may be overwritten.
  * laddrp	in: current local address; out: address to bind to.
  * lportp	in: current local port; out: port to bind to.
  * cred		credential used for jail and privilege checks.
  *
  * Returns 0 on success or an errno value (EADDRNOTAVAIL, EINVAL, EACCES,
  * EADDRINUSE, or an error from prison_local_ip4() / in_pcb_lport()).
  *
  * On error, the values of *laddrp and *lportp are not changed.
  */
 int
 in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
     u_short *lportp, struct ucred *cred)
 {
 	struct socket *so = inp->inp_socket;
 	struct sockaddr_in *sin;
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 	struct in_addr laddr;
 	u_short lport = 0;
 	int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT);
 	int error;
 
 	/*
 	 * No state changes, so read locks are sufficient here.
 	 */
 	INP_LOCK_ASSERT(inp);
 	INP_HASH_LOCK_ASSERT(pcbinfo);
 
 	if (TAILQ_EMPTY(&V_in_ifaddrhead)) /* XXX broken! */
 		return (EADDRNOTAVAIL);
 	laddr.s_addr = *laddrp;
 	/* An explicit address may not be given once one is already set. */
 	if (nam != NULL && laddr.s_addr != INADDR_ANY)
 		return (EINVAL);
 	/* Without any reuse option, conflict lookups also match wildcards. */
 	if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
 		lookupflags = INPLOOKUP_WILDCARD;
 	if (nam == NULL) {
 		if ((error = prison_local_ip4(cred, &laddr)) != 0)
 			return (error);
 	} else {
 		sin = (struct sockaddr_in *)nam;
 		if (nam->sa_len != sizeof (*sin))
 			return (EINVAL);
 #ifdef notdef
 		/*
 		 * We should check the family, but old programs
 		 * incorrectly fail to initialize it.
 		 */
 		if (sin->sin_family != AF_INET)
 			return (EAFNOSUPPORT);
 #endif
 		error = prison_local_ip4(cred, &sin->sin_addr);
 		if (error)
 			return (error);
 		if (sin->sin_port != *lportp) {
 			/* Don't allow the port to change. */
 			if (*lportp != 0)
 				return (EINVAL);
 			lport = sin->sin_port;
 		}
 		/* NB: lport is left as 0 if the port isn't being changed. */
 		if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
 			/*
 			 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
 			 * allow complete duplication of binding if
 			 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
 			 * and a multicast address is bound on both
 			 * new and duplicated sockets.
 			 */
 			if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0)
 				reuseport = SO_REUSEADDR|SO_REUSEPORT;
 		} else if (sin->sin_addr.s_addr != INADDR_ANY) {
 			/*
 			 * Clear the port and padding so that only the address
 			 * takes part in the interface address comparison
 			 * below; the requested port was saved in lport above.
 			 */
 			sin->sin_port = 0;		/* yech... */
 			bzero(&sin->sin_zero, sizeof(sin->sin_zero));
 			/*
 			 * Is the address a local IP address?
 			 * If INP_BINDANY is set, then the socket may be bound
 			 * to any endpoint address, local or not.
 			 */
 			if ((inp->inp_flags & INP_BINDANY) == 0 &&
 			    ifa_ifwithaddr_check((struct sockaddr *)sin) == 0) 
 				return (EADDRNOTAVAIL);
 		}
 		laddr = sin->sin_addr;
 		if (lport) {
 			struct inpcb *t;
 			struct tcptw *tw;
 
 			/* GROSS */
 			if (ntohs(lport) <= V_ipport_reservedhigh &&
 			    ntohs(lport) >= V_ipport_reservedlow &&
 			    priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT,
 			    0))
 				return (EACCES);
 			/*
 			 * Callers lacking PRIV_NETINET_REUSEPORT get an extra
 			 * check against bindings owned by a different uid.
 			 */
 			if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
 			    priv_check_cred(inp->inp_cred,
 			    PRIV_NETINET_REUSEPORT, 0) != 0) {
 				t = in_pcblookup_local(pcbinfo, sin->sin_addr,
 				    lport, INPLOOKUP_WILDCARD, cred);
 	/*
 	 * XXX
 	 * This entire block sorely needs a rewrite.
 	 */
 				if (t &&
 				    ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
 				    ((t->inp_flags & INP_TIMEWAIT) == 0) &&
 				    (so->so_type != SOCK_STREAM ||
 				     ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
 				    (ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
 				     ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
 				     (t->inp_flags2 & INP_REUSEPORT) == 0) &&
 				    (inp->inp_cred->cr_uid !=
 				     t->inp_cred->cr_uid))
 					return (EADDRINUSE);
 
 				/*
 				 * If the socket is a BINDMULTI socket, then
 				 * the credentials need to match and the
 				 * original socket also has to have been bound
 				 * with BINDMULTI.
 				 */
 				if (t && (! in_pcbbind_check_bindmulti(inp, t)))
 					return (EADDRINUSE);
 			}
 			/* Second lookup honours the socket's reuse options. */
 			t = in_pcblookup_local(pcbinfo, sin->sin_addr,
 			    lport, lookupflags, cred);
 			if (t && (t->inp_flags & INP_TIMEWAIT)) {
 				/*
 				 * XXXRW: If an inpcb has had its timewait
 				 * state recycled, we treat the address as
 				 * being in use (for now).  This is better
 				 * than a panic, but not desirable.
 				 */
 				tw = intotw(t);
 				if (tw == NULL ||
 				    (reuseport & tw->tw_so_options) == 0)
 					return (EADDRINUSE);
 			} else if (t &&
 			    ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
 			    (reuseport & inp_so_options(t)) == 0) {
 				/*
 				 * NB: the "if" below exists only under INET6
 				 * and guards the return that follows it.
 				 * Without INET6 that return is unconditional
 				 * and the BINDMULTI check after it is
 				 * unreachable.
 				 */
 #ifdef INET6
 				if (ntohl(sin->sin_addr.s_addr) !=
 				    INADDR_ANY ||
 				    ntohl(t->inp_laddr.s_addr) !=
 				    INADDR_ANY ||
 				    (inp->inp_vflag & INP_IPV6PROTO) == 0 ||
 				    (t->inp_vflag & INP_IPV6PROTO) == 0)
 #endif
 				return (EADDRINUSE);
 				if (t && (! in_pcbbind_check_bindmulti(inp, t)))
 					return (EADDRINUSE);
 			}
 		}
 	}
 	/* Keep an already assigned port; otherwise allocate one if needed. */
 	if (*lportp != 0)
 		lport = *lportp;
 	if (lport == 0) {
 		error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags);
 		if (error != 0)
 			return (error);
 
 	}
 	/* Success: only now publish the results to the caller. */
 	*laddrp = laddr.s_addr;
 	*lportp = lport;
 	return (0);
 }
 
 /*
  * Connect a PCB to the foreign address and port given in nam.  If the PCB
  * has no local address and port yet, the ones selected by
  * in_pcbconnect_setup() are installed and the PCB is inserted into the
  * hash first.  The mbuf m is passed on to in_pcbrehash_mbuf().
  *
  * Returns 0 on success, the error from in_pcbconnect_setup(), or EAGAIN
  * if the initial hash insertion fails.
  */
 int
 in_pcbconnect_mbuf(struct inpcb *inp, struct sockaddr *nam,
     struct ucred *cred, struct mbuf *m)
 {
 	in_addr_t faddr, laddr;
 	u_short fport, lport;
 	int anonport, error;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
 
 	/* Seed the setup call with the PCB's current local binding. */
 	laddr = inp->inp_laddr.s_addr;
 	lport = inp->inp_lport;
 	anonport = (inp->inp_lport == 0);
 
 	error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport,
 	    NULL, cred);
 	if (error != 0)
 		return (error);
 
 	/* A completely unbound PCB gets its initial local binding here. */
 	if (inp->inp_lport == 0 && inp->inp_laddr.s_addr == INADDR_ANY) {
 		inp->inp_laddr.s_addr = laddr;
 		inp->inp_lport = lport;
 		if (in_pcbinshash(inp) != 0) {
 			/* Roll the binding back on failure. */
 			inp->inp_lport = 0;
 			inp->inp_laddr.s_addr = INADDR_ANY;
 			return (EAGAIN);
 		}
 	}
 
 	/* Commit both endpoints and move the PCB to its new hash chain. */
 	inp->inp_laddr.s_addr = laddr;
 	inp->inp_lport = lport;
 	inp->inp_faddr.s_addr = faddr;
 	inp->inp_fport = fport;
 	in_pcbrehash_mbuf(inp, m);
 
 	if (anonport)
 		inp->inp_flags |= INP_ANONPORT;
 
 	return (0);
 }
 
 int
 in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
 {
 
 	return (in_pcbconnect_mbuf(inp, nam, cred, NULL));
 }
 
 /*
  * Do proper source address selection on an unbound socket in case
  * of connect. Take jails into account as well.
  *
  * inp		PCB being connected; its socket's SO_DONTROUTE option and
  *		its inc_fibnum are consulted for the route lookup.
  * faddr	destination address.
  * laddr	receives the selected source address; must not be NULL.
  * cred		credential for jail checks; may be NULL, in which case no
  *		jail restrictions are applied.
  *
  * Returns 0 on success, ENETUNREACH when no usable interface address is
  * found, or the error returned by prison_get_ip4().
  */
 int
 in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
     struct ucred *cred)
 {
 	struct ifaddr *ifa;
 	struct sockaddr *sa;
 	struct sockaddr_in *sin;
 	struct route sro;
 	int error;
 
 	KASSERT(laddr != NULL, ("%s: laddr NULL", __func__));
 
 	/*
 	 * Bypass source address selection and use the primary jail IP
 	 * if requested.
 	 */
 	if (cred != NULL && !prison_saddrsel_ip4(cred, laddr))
 		return (0);
 
 	error = 0;
 	bzero(&sro, sizeof(sro));
 
 	/* Build the destination sockaddr inside the route structure. */
 	sin = (struct sockaddr_in *)&sro.ro_dst;
 	sin->sin_family = AF_INET;
 	sin->sin_len = sizeof(struct sockaddr_in);
 	sin->sin_addr.s_addr = faddr->s_addr;
 
 	/*
 	 * If route is known our src addr is taken from the i/f,
 	 * else punt.
 	 *
 	 * Find out route to destination.
 	 */
 	if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0)
 		in_rtalloc_ign(&sro, 0, inp->inp_inc.inc_fibnum);
 
 	/*
 	 * If we found a route, use the address corresponding to
 	 * the outgoing interface.
 	 *
 	 * Otherwise assume faddr is reachable on a directly connected
 	 * network and try to find a corresponding interface to take
 	 * the source address from.
 	 */
 	if (sro.ro_rt == NULL || sro.ro_rt->rt_ifp == NULL) {
 		struct in_ifaddr *ia;
 		struct ifnet *ifp;
 
-		ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin));
+		ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin,
+					RT_ALL_FIBS));
 		if (ia == NULL)
-			ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0));
+			ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0,
+						RT_ALL_FIBS));
 		if (ia == NULL) {
 			error = ENETUNREACH;
 			goto done;
 		}
 
 		/* Not jailed (or no IPv4 restriction): use what we found. */
 		if (cred == NULL || !prison_flag(cred, PR_IP4)) {
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			ifa_free(&ia->ia_ifa);
 			goto done;
 		}
 
 		/*
 		 * Jailed: remember the interface, drop the address found
 		 * above and search that interface for an address the jail
 		 * is allowed to use.
 		 */
 		ifp = ia->ia_ifp;
 		ifa_free(&ia->ia_ifa);
 		ia = NULL;
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 
 			sa = ifa->ifa_addr;
 			if (sa->sa_family != AF_INET)
 				continue;
 			sin = (struct sockaddr_in *)sa;
 			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
 				ia = (struct in_ifaddr *)ifa;
 				break;
 			}
 		}
 		if (ia != NULL) {
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			IF_ADDR_RUNLOCK(ifp);
 			goto done;
 		}
 		IF_ADDR_RUNLOCK(ifp);
 
 		/* 3. As a last resort return the 'default' jail address. */
 		error = prison_get_ip4(cred, laddr);
 		goto done;
 	}
 
 	/*
 	 * If the outgoing interface on the route found is not
 	 * a loopback interface, use the address from that interface.
 	 * In case of jails do those three steps:
 	 * 1. check if the interface address belongs to the jail. If so use it.
 	 * 2. check if we have any address on the outgoing interface
 	 *    belonging to this jail. If so use it.
 	 * 3. as a last resort return the 'default' jail address.
 	 */
 	if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0) {
 		struct in_ifaddr *ia;
 		struct ifnet *ifp;
 
 		/* If not jailed, use the default returned. */
 		if (cred == NULL || !prison_flag(cred, PR_IP4)) {
 			ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa;
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			goto done;
 		}
 
 		/* Jailed. */
 		/* 1. Check if the iface address belongs to the jail. */
 		sin = (struct sockaddr_in *)sro.ro_rt->rt_ifa->ifa_addr;
 		if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
 			ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa;
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			goto done;
 		}
 
 		/*
 		 * 2. Check if we have any address on the outgoing interface
 		 *    belonging to this jail.
 		 */
 		ia = NULL;
 		ifp = sro.ro_rt->rt_ifp;
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			sa = ifa->ifa_addr;
 			if (sa->sa_family != AF_INET)
 				continue;
 			sin = (struct sockaddr_in *)sa;
 			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
 				ia = (struct in_ifaddr *)ifa;
 				break;
 			}
 		}
 		if (ia != NULL) {
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			IF_ADDR_RUNLOCK(ifp);
 			goto done;
 		}
 		IF_ADDR_RUNLOCK(ifp);
 
 		/* 3. As a last resort return the 'default' jail address. */
 		error = prison_get_ip4(cred, laddr);
 		goto done;
 	}
 
 	/*
 	 * The outgoing interface is marked with 'loopback net', so a route
 	 * to ourselves is here.
 	 * Try to find the interface of the destination address and then
 	 * take the address from there. That interface is not necessarily
 	 * a loopback interface.
 	 * In case of jails, check that it is an address of the jail
 	 * and if we cannot find, fall back to the 'default' jail address.
 	 */
 	if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) != 0) {
 		struct sockaddr_in sain;
 		struct in_ifaddr *ia;
 
 		bzero(&sain, sizeof(struct sockaddr_in));
 		sain.sin_family = AF_INET;
 		sain.sin_len = sizeof(struct sockaddr_in);
 		sain.sin_addr.s_addr = faddr->s_addr;
 
 		/* Try the three interface lookups in turn until one hits. */
-		ia = ifatoia(ifa_ifwithdstaddr(sintosa(&sain)));
+		ia = ifatoia(ifa_ifwithdstaddr(sintosa(&sain), RT_ALL_FIBS));
 		if (ia == NULL)
-			ia = ifatoia(ifa_ifwithnet(sintosa(&sain), 0));
+			ia = ifatoia(ifa_ifwithnet(sintosa(&sain), 0,
+						RT_ALL_FIBS));
 		if (ia == NULL)
 			ia = ifatoia(ifa_ifwithaddr(sintosa(&sain)));
 
 		if (cred == NULL || !prison_flag(cred, PR_IP4)) {
 			if (ia == NULL) {
 				error = ENETUNREACH;
 				goto done;
 			}
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			ifa_free(&ia->ia_ifa);
 			goto done;
 		}
 
 		/* Jailed. */
 		if (ia != NULL) {
 			struct ifnet *ifp;
 
 			ifp = ia->ia_ifp;
 			ifa_free(&ia->ia_ifa);
 			ia = NULL;
 			IF_ADDR_RLOCK(ifp);
 			TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 
 				sa = ifa->ifa_addr;
 				if (sa->sa_family != AF_INET)
 					continue;
 				sin = (struct sockaddr_in *)sa;
 				if (prison_check_ip4(cred,
 				    &sin->sin_addr) == 0) {
 					ia = (struct in_ifaddr *)ifa;
 					break;
 				}
 			}
 			if (ia != NULL) {
 				laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 				IF_ADDR_RUNLOCK(ifp);
 				goto done;
 			}
 			IF_ADDR_RUNLOCK(ifp);
 		}
 
 		/* 3. As a last resort return the 'default' jail address. */
 		error = prison_get_ip4(cred, laddr);
 		goto done;
 	}
 
 	/* Common exit: release the route obtained above, if any. */
 done:
 	if (sro.ro_rt != NULL)
 		RTFREE(sro.ro_rt);
 	return (error);
 }
 
 /*
  * Set up for a connect from a socket to the specified address.
  * On entry, *laddrp and *lportp should contain the current local
  * address and port for the PCB; these are updated to the values
  * that should be placed in inp_laddr and inp_lport to complete
  * the connect.
  *
  * On success, *faddrp and *fportp will be set to the remote address
  * and port. These are not updated in the error case.
  *
  * If the operation fails because the connection already exists,
  * *oinpp will be set to the PCB of that connection so that the
  * caller can decide to override it. In all other cases, *oinpp
  * is set to NULL.
  *
  * oinpp itself may be NULL when the caller is not interested in the
  * conflicting PCB.
  *
  * Returns 0 on success; EINVAL for a wrongly sized nam, EAFNOSUPPORT for
  * a family other than AF_INET, EADDRNOTAVAIL for a zero port or when no
  * address is found on the selected multicast interface, EADDRINUSE when
  * the connection already exists, or an error from prison_get_ip4(),
  * in_pcbladdr() or in_pcbbind_setup().
  */
 int
 in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam,
     in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp,
     struct inpcb **oinpp, struct ucred *cred)
 {
 	struct sockaddr_in *sin = (struct sockaddr_in *)nam;
 	struct in_ifaddr *ia;
 	struct inpcb *oinp;
 	struct in_addr laddr, faddr;
 	u_short lport, fport;
 	int error;
 
 	/*
 	 * Because a global state change doesn't actually occur here, a read
 	 * lock is sufficient.
 	 */
 	INP_LOCK_ASSERT(inp);
 	INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);
 
 	if (oinpp != NULL)
 		*oinpp = NULL;
 	if (nam->sa_len != sizeof (*sin))
 		return (EINVAL);
 	if (sin->sin_family != AF_INET)
 		return (EAFNOSUPPORT);
 	if (sin->sin_port == 0)
 		return (EADDRNOTAVAIL);
 	/* Work on local copies; the caller's values are written at the end. */
 	laddr.s_addr = *laddrp;
 	lport = *lportp;
 	faddr = sin->sin_addr;
 	fport = sin->sin_port;
 
 	if (!TAILQ_EMPTY(&V_in_ifaddrhead)) {
 		/*
 		 * If the destination address is INADDR_ANY,
 		 * use the primary local address.
 		 * If the supplied address is INADDR_BROADCAST,
 		 * and the primary interface supports broadcast,
 		 * choose the broadcast address for that interface.
 		 */
 		if (faddr.s_addr == INADDR_ANY) {
 			IN_IFADDR_RLOCK();
 			faddr =
 			    IA_SIN(TAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
 			IN_IFADDR_RUNLOCK();
 			if (cred != NULL &&
 			    (error = prison_get_ip4(cred, &faddr)) != 0)
 				return (error);
 		} else if (faddr.s_addr == (u_long)INADDR_BROADCAST) {
 			IN_IFADDR_RLOCK();
 			if (TAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags &
 			    IFF_BROADCAST)
 				faddr = satosin(&TAILQ_FIRST(
 				    &V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
 			IN_IFADDR_RUNLOCK();
 		}
 	}
 	if (laddr.s_addr == INADDR_ANY) {
 		/*
 		 * NB: the result of in_pcbladdr() is deliberately not
 		 * checked until after the multicast handling below, which
 		 * may replace both laddr and error.
 		 */
 		error = in_pcbladdr(inp, &faddr, &laddr, cred);
 		/*
 		 * If the destination address is multicast and an outgoing
 		 * interface has been set as a multicast option, prefer the
 		 * address of that interface as our source address.
 		 */
 		if (IN_MULTICAST(ntohl(faddr.s_addr)) &&
 		    inp->inp_moptions != NULL) {
 			struct ip_moptions *imo;
 			struct ifnet *ifp;
 
 			imo = inp->inp_moptions;
 			if (imo->imo_multicast_ifp != NULL) {
 				ifp = imo->imo_multicast_ifp;
 				IN_IFADDR_RLOCK();
 				/*
 				 * Find the first address on that interface
 				 * which passes the jail check (if any).
 				 */
 				TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
 					if ((ia->ia_ifp == ifp) &&
 					    (cred == NULL ||
 					    prison_check_ip4(cred,
 					    &ia->ia_addr.sin_addr) == 0))
 						break;
 				}
 				if (ia == NULL)
 					error = EADDRNOTAVAIL;
 				else {
 					laddr = ia->ia_addr.sin_addr;
 					error = 0;
 				}
 				IN_IFADDR_RUNLOCK();
 			}
 		}
 		if (error)
 			return (error);
 	}
 	/* Refuse to create a duplicate of an existing connection. */
 	oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr, fport,
 	    laddr, lport, 0, NULL);
 	if (oinp != NULL) {
 		if (oinpp != NULL)
 			*oinpp = oinp;
 		return (EADDRINUSE);
 	}
 	/* No local port yet: have in_pcbbind_setup() choose one. */
 	if (lport == 0) {
 		error = in_pcbbind_setup(inp, NULL, &laddr.s_addr, &lport,
 		    cred);
 		if (error)
 			return (error);
 	}
 	*laddrp = laddr.s_addr;
 	*lportp = lport;
 	*faddrp = faddr.s_addr;
 	*fportp = fport;
 	return (0);
 }
 
 /*
  * Clear the foreign address and port of a PCB and rehash it.
  * Both the inp write lock and the hash write lock are asserted.
  */
 void
 in_pcbdisconnect(struct inpcb *inp)
 {
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
 
 	/* Reset the remote endpoint before moving hash chains. */
 	inp->inp_fport = 0;
 	inp->inp_faddr.s_addr = INADDR_ANY;
 
 	in_pcbrehash(inp);
 }
 #endif /* INET */
 
 /*
  * in_pcbdetach() is responsibe for disassociating a socket from an inpcb.
  * For most protocols, this will be invoked immediately prior to calling
  * in_pcbfree().  However, with TCP the inpcb may significantly outlive the
  * socket, in which case in_pcbfree() is deferred.
  */
 void
 in_pcbdetach(struct inpcb *inp)
 {
 
 	KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
 
 	inp->inp_socket->so_pcb = NULL;
 	inp->inp_socket = NULL;
 }
 
 /*
  * in_pcbref() takes an additional reference on an inpcb so that the
  * pointer stays valid while the inpcb lock is not held.  TCP uses this when
  * the inpcbinfo lock has to be acquired or upgraded while the inpcb lock
  * may already be held, and when a reference is obtained via a pcbgroup.
  *
  * The reference is meant to provide brief memory stability only.  It must
  * be followed by INP_WLOCK() and in_pcbrele(), which garbage collects the
  * inpcb if in_pcbfree() was called on it from another context in the
  * meantime.  Until in_pcbrele() reports that the inpcb is still valid,
  * locking and releasing are the *only* operations that are safe on it.
  *
  * Holding a reference prevents the free but not state changes: after the
  * lock has been dropped and reacquired the caller has to revalidate any
  * cached connection state.  Drop the reference with in_pcbrele().
  */
 void
 in_pcbref(struct inpcb *inp)
 {
 
 	/* The caller must already be holding a valid reference. */
 	KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
 	refcount_acquire(&inp->inp_refcount);
 }
 
 /*
  * Release a reference taken with in_pcbref().  Since in_pcbfree() may have
  * run between in_pcbref() and this call, the return value tells the caller
  * whether the inpcb is still usable:
  *
  *	0	the inpcb remains valid and the read lock is still held;
  *	1	the inpcb was freed (INP_FREED set, or this was the last
  *		reference and the memory has been returned to the zone);
  *		the read lock has been dropped.
  *
  * Unlike in_pcbref(), the inpcb lock must be held to drop a reference.
  * Historically more work was done here (actually, in in_pcbfree_internal())
  * but it has been moved to in_pcbfree() so that the pcbinfo lock is not
  * needed in in_pcbrele().  Deferring the free is entirely about memory
  * stability (and continued use of the lock).
  */
 int
 in_pcbrele_rlocked(struct inpcb *inp)
 {
 
 	KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
 
 	INP_RLOCK_ASSERT(inp);
 
 	if (refcount_release(&inp->inp_refcount) != 0) {
 		/* Last reference: unlock and give the memory back. */
 		KASSERT(inp->inp_socket == NULL,
 		    ("%s: inp_socket != NULL", __func__));
 
 		INP_RUNLOCK(inp);
 		uma_zfree(inp->inp_pcbinfo->ipi_zone, inp);
 		return (1);
 	}
 
 	/*
 	 * Other references remain.  Still report a freed inpcb to the
 	 * caller so that it stops using it.
 	 */
 	if ((inp->inp_flags2 & INP_FREED) != 0) {
 		INP_RUNLOCK(inp);
 		return (1);
 	}
 	return (0);
 }
 
 /*
  * Write-locked variant of the reference release.  Returns 0 with the write
  * lock still held when other references remain; returns 1 after dropping
  * the write lock and returning the inpcb to its zone when this was the
  * last reference.
  */
 int
 in_pcbrele_wlocked(struct inpcb *inp)
 {
 
 	KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
 
 	INP_WLOCK_ASSERT(inp);
 
 	if (refcount_release(&inp->inp_refcount) != 0) {
 		/* Last reference: unlock and give the memory back. */
 		KASSERT(inp->inp_socket == NULL,
 		    ("%s: inp_socket != NULL", __func__));
 
 		INP_WUNLOCK(inp);
 		uma_zfree(inp->inp_pcbinfo->ipi_zone, inp);
 		return (1);
 	}
 	return (0);
 }
 
 /*
  * Temporary wrapper: forwards to in_pcbrele_wlocked() and returns its
  * result unchanged.
  */
 int
 in_pcbrele(struct inpcb *inp)
 {
 	int freed;
 
 	freed = in_pcbrele_wlocked(inp);
 	return (freed);
 }
 
 /*
  * Unconditionally schedule an inpcb to be freed by dropping a reference on
  * it.  This should happen only after the inpcb has been detached from its
  * socket.  If another thread holds a temporary reference (taken with
  * in_pcbref()), the actual free is deferred until that reference is
  * released with in_pcbrele(); the inpcb is unlocked here in either case.
  * Almost all of the teardown work, including removal from the global
  * lists, is performed in this context, where the pcbinfo lock is held.
  */
 void
 in_pcbfree(struct inpcb *inp)
 {
 	struct inpcbinfo *pcbinfo;
 
 	pcbinfo = inp->inp_pcbinfo;
 
 	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
 
 	INP_INFO_WLOCK_ASSERT(pcbinfo);
 	INP_WLOCK_ASSERT(inp);
 
 	/* XXXRW: Do as much as possible here. */
 #ifdef IPSEC
 	if (inp->inp_sp != NULL)
 		ipsec_delete_pcbpolicy(inp);
 #endif
 	/* Stamp a new generation count and unlink from the lists. */
 	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
 	in_pcbremlists(inp);
 #ifdef INET6
 	if ((inp->inp_vflag & INP_IPV6PROTO) != 0) {
 		ip6_freepcbopts(inp->in6p_outputopts);
 		if (inp->in6p_moptions != NULL)
 			ip6_freemoptions(inp->in6p_moptions);
 	}
 #endif
 	if (inp->inp_options != NULL)
 		(void)m_free(inp->inp_options);
 #ifdef INET
 	if (inp->inp_moptions != NULL)
 		inp_freemoptions(inp->inp_moptions);
 #endif
 	/* Mark the inpcb as freed for remaining reference holders. */
 	inp->inp_vflag = 0;
 	inp->inp_flags2 |= INP_FREED;
 	crfree(inp->inp_cred);
 #ifdef MAC
 	mac_inpcb_destroy(inp);
 #endif
 	/* If the release did not unlock (references remain), do it here. */
 	if (in_pcbrele_wlocked(inp) == 0)
 		INP_WUNLOCK(inp);
 }
 
 /*
  * in_pcbdrop() marks an inpcb as dropped and takes it off the hashed
  * lists, which releases its address and port reservation and keeps it from
  * being returned by inpcb lookups.
  *
  * TCP uses this to mark an inpcb as unused and to avoid further packet
  * delivery or event notification while the socket remains open although
  * TCP has closed, e.g. after a shutdown()-initiated close or a RST on the
  * wire.  The port binding can then be reused, while the invariant that
  * so_pcb always points to a valid inpcb until in_pcbdetach() still holds.
  *
  * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by
  * in_pcbnotifyall() and in_pcbpurgeif0()?
  */
 void
 in_pcbdrop(struct inpcb *inp)
 {
 	struct inpcbport *phd;
 
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * XXXRW: Possibly we should protect the setting of INP_DROPPED with
 	 * the hash lock...?
 	 */
 	inp->inp_flags |= INP_DROPPED;
 
 	/* Nothing more to do if the inpcb is not on the hash lists. */
 	if ((inp->inp_flags & INP_INHASHLIST) == 0)
 		return;
 
 	phd = inp->inp_phd;
 
 	INP_HASH_WLOCK(inp->inp_pcbinfo);
 	LIST_REMOVE(inp, inp_hash);
 	LIST_REMOVE(inp, inp_portlist);
 	/* Free the port head once its last inpcb is gone. */
 	if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
 		LIST_REMOVE(phd, phd_hash);
 		free(phd, M_PCB);
 	}
 	INP_HASH_WUNLOCK(inp->inp_pcbinfo);
 
 	inp->inp_flags &= ~INP_INHASHLIST;
 #ifdef PCBGROUP
 	in_pcbgroup_remove(inp);
 #endif
 }
 
 #ifdef INET
 /*
  * Common routines to return the socket addresses associated with inpcbs.
  */
 /*
  * Allocate (M_SONAME, M_WAITOK | M_ZERO) a sockaddr_in and fill it with
  * the given port and the address that addr_p points to.  The port is
  * stored as passed in.  The result is returned as a generic sockaddr.
  */
 struct sockaddr *
 in_sockaddr(in_port_t port, struct in_addr *addr_p)
 {
 	struct sockaddr_in *sin;
 
 	sin = malloc(sizeof(*sin), M_SONAME, M_WAITOK | M_ZERO);
 
 	sin->sin_len = sizeof(*sin);
 	sin->sin_family = AF_INET;
 	sin->sin_port = port;
 	sin->sin_addr = *addr_p;
 
 	return ((struct sockaddr *)sin);
 }
 
 int
 in_getsockaddr(struct socket *so, struct sockaddr **nam)
 {
 	struct inpcb *inp;
 	struct in_addr addr;
 	in_port_t port;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL"));
 
 	INP_RLOCK(inp);
 	port = inp->inp_lport;
 	addr = inp->inp_laddr;
 	INP_RUNLOCK(inp);
 
 	*nam = in_sockaddr(port, &addr);
 	return 0;
 }
 
 int
 in_getpeeraddr(struct socket *so, struct sockaddr **nam)
 {
 	struct inpcb *inp;
 	struct in_addr addr;
 	in_port_t port;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL"));
 
 	INP_RLOCK(inp);
 	port = inp->inp_fport;
 	addr = inp->inp_faddr;
 	INP_RUNLOCK(inp);
 
 	*nam = in_sockaddr(port, &addr);
 	return 0;
 }
 
 /*
  * Call notify(inp, errno) on every inpcb in pcbinfo that still has a
  * socket and whose foreign address equals faddr (with INET6, only inpcbs
  * that have INP_IPV4 set).  Each inpcb is write-locked for the call.  When
  * notify returns NULL the inpcb is not unlocked here.
  */
 void
 in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno,
     struct inpcb *(*notify)(struct inpcb *, int))
 {
 	struct inpcb *inp, *next;
 
 	INP_INFO_WLOCK(pcbinfo);
 	LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, next) {
 		INP_WLOCK(inp);
 #ifdef INET6
 		if ((inp->inp_vflag & INP_IPV4) == 0) {
 			INP_WUNLOCK(inp);
 			continue;
 		}
 #endif
 		if (inp->inp_faddr.s_addr == faddr.s_addr &&
 		    inp->inp_socket != NULL) {
 			/* A NULL return means there is nothing to unlock. */
 			if ((*notify)(inp, errno) == NULL)
 				continue;
 		}
 		INP_WUNLOCK(inp);
 	}
 	INP_INFO_WUNLOCK(pcbinfo);
 }
 
 /*
  * Remove the references to interface ifp from the IPv4 multicast options
  * of every inpcb in pcbinfo: the outgoing multicast interface is unset if
  * it is ifp, and group memberships joined through ifp are dropped.
  */
 void
 in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
 {
 	struct inpcb *inp;
 	struct ip_moptions *imo;
 	int i, gap;
 
 	INP_INFO_RLOCK(pcbinfo);
 	LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
 		INP_WLOCK(inp);
 		imo = inp->inp_moptions;
 		if ((inp->inp_vflag & INP_IPV4) == 0 || imo == NULL) {
 			INP_WUNLOCK(inp);
 			continue;
 		}
 
 		/*
 		 * Unselect the outgoing interface if it is being
 		 * detached.
 		 */
 		if (imo->imo_multicast_ifp == ifp)
 			imo->imo_multicast_ifp = NULL;
 
 		/*
 		 * Drop multicast group membership if we joined
 		 * through the interface being detached.  Surviving
 		 * entries are shifted down over the removed ones.
 		 */
 		gap = 0;
 		for (i = 0; i < imo->imo_num_memberships; i++) {
 			if (imo->imo_membership[i]->inm_ifp == ifp) {
 				in_delmulti(imo->imo_membership[i]);
 				gap++;
 			} else if (gap != 0) {
 				imo->imo_membership[i - gap] =
 				    imo->imo_membership[i];
 			}
 		}
 		imo->imo_num_memberships -= gap;
 
 		INP_WUNLOCK(inp);
 	}
 	INP_INFO_RUNLOCK(pcbinfo);
 }
 
 /*
  * Lookup a PCB based on the local address and port.  Caller must hold the
  * hash lock.  No inpcb locks or references are acquired.
  *
  * pcbinfo	PCB table to search.
  * laddr	local address to match.
  * lport	local port to match; compared directly against inp_lport.
  * lookupflags	0 or INPLOOKUP_WILDCARD (anything else trips the KASSERT).
  * cred		if non-NULL, only PCBs whose prison compares equal under
  *		prison_equal_ip4() are considered.
  *
  * Without INPLOOKUP_WILDCARD only an unconnected PCB with exactly this
  * local address and port is returned.  With INPLOOKUP_WILDCARD the PCB on
  * the port with the lowest mismatch cost ("wildcard" below) is returned.
  * Returns NULL when nothing matches.
  */
 #define INP_LOOKUP_MAPPED_PCB_COST	3
 struct inpcb *
 in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
     u_short lport, int lookupflags, struct ucred *cred)
 {
 	struct inpcb *inp;
 	/* Best cost so far; starts above the highest cost computed below. */
 #ifdef INET6
 	int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST;
 #else
 	int matchwild = 3;
 #endif
 	int wildcard;
 
 	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
 
 	INP_HASH_LOCK_ASSERT(pcbinfo);
 
 	if ((lookupflags & INPLOOKUP_WILDCARD) == 0) {
 		struct inpcbhead *head;
 		/*
 		 * Look for an unconnected (wildcard foreign addr) PCB that
 		 * matches the local address and port we're looking for.
 		 */
 		head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
 		    0, pcbinfo->ipi_hashmask)];
 		LIST_FOREACH(inp, head, inp_hash) {
 #ifdef INET6
 			/* XXX inp locking */
 			if ((inp->inp_vflag & INP_IPV4) == 0)
 				continue;
 #endif
 			if (inp->inp_faddr.s_addr == INADDR_ANY &&
 			    inp->inp_laddr.s_addr == laddr.s_addr &&
 			    inp->inp_lport == lport) {
 				/*
 				 * Found?
 				 */
 				if (cred == NULL ||
 				    prison_equal_ip4(cred->cr_prison,
 					inp->inp_cred->cr_prison))
 					return (inp);
 			}
 		}
 		/*
 		 * Not found.
 		 */
 		return (NULL);
 	} else {
 		struct inpcbporthead *porthash;
 		struct inpcbport *phd;
 		struct inpcb *match = NULL;
 		/*
 		 * Best fit PCB lookup.
 		 *
 		 * First see if this local port is in use by looking on the
 		 * port hash list.
 		 */
 		porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
 		    pcbinfo->ipi_porthashmask)];
 		LIST_FOREACH(phd, porthash, phd_hash) {
 			if (phd->phd_port == lport)
 				break;
 		}
 		if (phd != NULL) {
 			/*
 			 * Port is in use by one or more PCBs. Look for best
 			 * fit.
 			 */
 			LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
 				/* Cost of this candidate; lower is better. */
 				wildcard = 0;
 				if (cred != NULL &&
 				    !prison_equal_ip4(inp->inp_cred->cr_prison,
 					cred->cr_prison))
 					continue;
 #ifdef INET6
 				/* XXX inp locking */
 				if ((inp->inp_vflag & INP_IPV4) == 0)
 					continue;
 				/*
 				 * We never select the PCB that has
 				 * INP_IPV6 flag and is bound to :: if
 				 * we have another PCB which is bound
 				 * to 0.0.0.0.  If a PCB has the
 				 * INP_IPV6 flag, then we set its cost
 				 * higher than IPv4 only PCBs.
 				 *
 				 * Note that the case only happens
 				 * when a socket is bound to ::, under
 				 * the condition that the use of the
 				 * mapped address is allowed.
 				 */
 				if ((inp->inp_vflag & INP_IPV6) != 0)
 					wildcard += INP_LOOKUP_MAPPED_PCB_COST;
 #endif
 				/* A connected PCB costs one point. */
 				if (inp->inp_faddr.s_addr != INADDR_ANY)
 					wildcard++;
 				/*
 				 * Local address: two different specific
 				 * addresses never match; a specific address
 				 * on one side and a wildcard on the other
 				 * costs one point.
 				 */
 				if (inp->inp_laddr.s_addr != INADDR_ANY) {
 					if (laddr.s_addr == INADDR_ANY)
 						wildcard++;
 					else if (inp->inp_laddr.s_addr != laddr.s_addr)
 						continue;
 				} else {
 					if (laddr.s_addr != INADDR_ANY)
 						wildcard++;
 				}
 				/* Keep the cheapest; stop on a perfect match. */
 				if (wildcard < matchwild) {
 					match = inp;
 					matchwild = wildcard;
 					if (matchwild == 0)
 						break;
 				}
 			}
 		}
 		return (match);
 	}
 }
 #undef INP_LOOKUP_MAPPED_PCB_COST
 
 #ifdef PCBGROUP
 /*
  * Lookup PCB in hash list, using pcbgroup tables.
  *
  * The search runs with the pcbgroup lock held.  An exact 4-tuple match in
  * the group's hash table is tried first.  If that fails and
  * INPLOOKUP_WILDCARD is set in lookupflags, pcbs whose foreign address is
  * INADDR_ANY are considered -- under RSS in the group's own table, and
  * then in the pcbinfo-wide ipi_wildbase table.
  *
  * On success the inpcb is returned write- or read-locked according to
  * INPLOOKUP_WLOCKPCB / INPLOOKUP_RLOCKPCB; if neither is set the function
  * panics.  NULL is returned when nothing matches, or when
  * in_pcbrele_{w,r}locked() returns non-zero after the inpcb lock is taken.
  */
 static struct inpcb *
 in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup,
     struct in_addr faddr, u_int fport_arg, struct in_addr laddr,
     u_int lport_arg, int lookupflags, struct ifnet *ifp)
 {
 	struct inpcbhead *head;
 	struct inpcb *inp, *tmpinp;
 	u_short fport = fport_arg, lport = lport_arg;
 
 	/*
 	 * First look for an exact match.
 	 */
 	tmpinp = NULL;
 	INP_GROUP_LOCK(pcbgroup);
 	head = &pcbgroup->ipg_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
 	    pcbgroup->ipg_hashmask)];
 	LIST_FOREACH(inp, head, inp_pcbgrouphash) {
 #ifdef INET6
 		/* XXX inp locking */
 		if ((inp->inp_vflag & INP_IPV4) == 0)
 			continue;
 #endif
 		if (inp->inp_faddr.s_addr == faddr.s_addr &&
 		    inp->inp_laddr.s_addr == laddr.s_addr &&
 		    inp->inp_fport == fport &&
 		    inp->inp_lport == lport) {
 			/*
 			 * XXX We should be able to directly return
 			 * the inp here, without any checks.
 			 * Well unless both bound with SO_REUSEPORT?
 			 */
 			/*
 			 * A pcb whose credential carries the PR_IP4 prison
 			 * flag wins immediately; otherwise only the first
 			 * matching pcb is remembered.
 			 */
 			if (prison_flag(inp->inp_cred, PR_IP4))
 				goto found;
 			if (tmpinp == NULL)
 				tmpinp = inp;
 		}
 	}
 	if (tmpinp != NULL) {
 		inp = tmpinp;
 		goto found;
 	}
 
 #ifdef	RSS
 	/*
 	 * For incoming connections, we may wish to do a wildcard
 	 * match for an RSS-local socket.
 	 */
 	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
 		struct inpcb *local_wild = NULL, *local_exact = NULL;
 #ifdef INET6
 		struct inpcb *local_wild_mapped = NULL;
 #endif
 		struct inpcb *jail_wild = NULL;
 		/* NB: shadows the function-scope 'head'. */
 		struct inpcbhead *head;
 		int injail;
 
 		/*
 		 * Order of socket selection - we always prefer jails.
 		 *      1. jailed, non-wild.
 		 *      2. jailed, wild.
 		 *      3. non-jailed, non-wild.
 		 *      4. non-jailed, wild.
 		 */
 
 		head = &pcbgroup->ipg_hashbase[INP_PCBHASH(INADDR_ANY,
 		    lport, 0, pcbgroup->ipg_hashmask)];
 		LIST_FOREACH(inp, head, inp_pcbgrouphash) {
 #ifdef INET6
 			/* XXX inp locking */
 			if ((inp->inp_vflag & INP_IPV4) == 0)
 				continue;
 #endif
 			if (inp->inp_faddr.s_addr != INADDR_ANY ||
 			    inp->inp_lport != lport)
 				continue;
 
 			/* XXX inp locking */
 			if (ifp && ifp->if_type == IFT_FAITH &&
 			    (inp->inp_flags & INP_FAITH) == 0)
 				continue;
 
 			injail = prison_flag(inp->inp_cred, PR_IP4);
 			if (injail) {
 				if (prison_check_ip4(inp->inp_cred,
 				    &laddr) != 0)
 					continue;
 			} else {
 				/* Only the first non-jailed exact match is kept. */
 				if (local_exact != NULL)
 					continue;
 			}
 
 			if (inp->inp_laddr.s_addr == laddr.s_addr) {
 				if (injail)
 					goto found;
 				else
 					local_exact = inp;
 			} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
 #ifdef INET6
 				/* XXX inp locking, NULL check */
 				if (inp->inp_vflag & INP_IPV6PROTO)
 					local_wild_mapped = inp;
 				else
 #endif
 					if (injail)
 						jail_wild = inp;
 					else
 						local_wild = inp;
 			}
 		} /* LIST_FOREACH */
 
 		/*
 		 * Fall back in order: jail_wild, local_exact, local_wild and
 		 * finally (INET6 only) local_wild_mapped.
 		 */
 		inp = jail_wild;
 		if (inp == NULL)
 			inp = local_exact;
 		if (inp == NULL)
 			inp = local_wild;
 #ifdef INET6
 		if (inp == NULL)
 			inp = local_wild_mapped;
 #endif
 		if (inp != NULL)
 			goto found;
 	}
 #endif
 
 	/*
 	 * Then look for a wildcard match, if requested.
 	 */
 	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
 		struct inpcb *local_wild = NULL, *local_exact = NULL;
 #ifdef INET6
 		struct inpcb *local_wild_mapped = NULL;
 #endif
 		struct inpcb *jail_wild = NULL;
 		/* NB: shadows the function-scope 'head'. */
 		struct inpcbhead *head;
 		int injail;
 
 		/*
 		 * Order of socket selection - we always prefer jails.
 		 *      1. jailed, non-wild.
 		 *      2. jailed, wild.
 		 *      3. non-jailed, non-wild.
 		 *      4. non-jailed, wild.
 		 */
 		/* This pass walks the pcbinfo-wide wildcard table. */
 		head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, lport,
 		    0, pcbinfo->ipi_wildmask)];
 		LIST_FOREACH(inp, head, inp_pcbgroup_wild) {
 #ifdef INET6
 			/* XXX inp locking */
 			if ((inp->inp_vflag & INP_IPV4) == 0)
 				continue;
 #endif
 			if (inp->inp_faddr.s_addr != INADDR_ANY ||
 			    inp->inp_lport != lport)
 				continue;
 
 			/* XXX inp locking */
 			if (ifp && ifp->if_type == IFT_FAITH &&
 			    (inp->inp_flags & INP_FAITH) == 0)
 				continue;
 
 			injail = prison_flag(inp->inp_cred, PR_IP4);
 			if (injail) {
 				if (prison_check_ip4(inp->inp_cred,
 				    &laddr) != 0)
 					continue;
 			} else {
 				/* Only the first non-jailed exact match is kept. */
 				if (local_exact != NULL)
 					continue;
 			}
 
 			if (inp->inp_laddr.s_addr == laddr.s_addr) {
 				if (injail)
 					goto found;
 				else
 					local_exact = inp;
 			} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
 #ifdef INET6
 				/* XXX inp locking, NULL check */
 				if (inp->inp_vflag & INP_IPV6PROTO)
 					local_wild_mapped = inp;
 				else
 #endif
 					if (injail)
 						jail_wild = inp;
 					else
 						local_wild = inp;
 			}
 		} /* LIST_FOREACH */
 		/*
 		 * Fall back in order: jail_wild, local_exact, local_wild and
 		 * finally (INET6 only) local_wild_mapped.
 		 */
 		inp = jail_wild;
 		if (inp == NULL)
 			inp = local_exact;
 		if (inp == NULL)
 			inp = local_wild;
 #ifdef INET6
 		if (inp == NULL)
 			inp = local_wild_mapped;
 #endif
 		if (inp != NULL)
 			goto found;
 	} /* if (lookupflags & INPLOOKUP_WILDCARD) */
 	INP_GROUP_UNLOCK(pcbgroup);
 	return (NULL);
 
 found:
 	/*
 	 * Take a reference before dropping the group lock, then acquire the
 	 * requested inpcb lock and release the reference again.  A non-zero
 	 * return from in_pcbrele_*locked() turns the lookup into a miss.
 	 */
 	in_pcbref(inp);
 	INP_GROUP_UNLOCK(pcbgroup);
 	if (lookupflags & INPLOOKUP_WLOCKPCB) {
 		INP_WLOCK(inp);
 		if (in_pcbrele_wlocked(inp))
 			return (NULL);
 	} else if (lookupflags & INPLOOKUP_RLOCKPCB) {
 		INP_RLOCK(inp);
 		if (in_pcbrele_rlocked(inp))
 			return (NULL);
 	} else
 		panic("%s: locking bug", __func__);
 	return (inp);
 }
 #endif /* PCBGROUP */
 
 /*
  * Lookup PCB in hash list, using pcbinfo tables.  This variation assumes
  * that the caller has locked the hash list, and will not perform any further
  * locking or reference operations on either the hash list or the connection.
  *
  * INPLOOKUP_WILDCARD is the only bit accepted in lookupflags (enforced by
  * KASSERT).  Returns the selected inpcb, or NULL when nothing matches.
  */
 static struct inpcb *
 in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
     u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
     struct ifnet *ifp)
 {
 	struct inpcbhead *head;
 	struct inpcb *inp, *tmpinp;
 	u_short fport = fport_arg, lport = lport_arg;
 
 	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
 
 	INP_HASH_LOCK_ASSERT(pcbinfo);
 
 	/*
 	 * First look for an exact match.
 	 */
 	tmpinp = NULL;
 	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
 	    pcbinfo->ipi_hashmask)];
 	LIST_FOREACH(inp, head, inp_hash) {
 #ifdef INET6
 		/* XXX inp locking */
 		if ((inp->inp_vflag & INP_IPV4) == 0)
 			continue;
 #endif
 		if (inp->inp_faddr.s_addr == faddr.s_addr &&
 		    inp->inp_laddr.s_addr == laddr.s_addr &&
 		    inp->inp_fport == fport &&
 		    inp->inp_lport == lport) {
 			/*
 			 * XXX We should be able to directly return
 			 * the inp here, without any checks.
 			 * Well unless both bound with SO_REUSEPORT?
 			 */
 			/*
 			 * A pcb whose credential carries the PR_IP4 prison
 			 * flag is returned at once; otherwise only the first
 			 * matching pcb is remembered.
 			 */
 			if (prison_flag(inp->inp_cred, PR_IP4))
 				return (inp);
 			if (tmpinp == NULL)
 				tmpinp = inp;
 		}
 	}
 	if (tmpinp != NULL)
 		return (tmpinp);
 
 	/*
 	 * Then look for a wildcard match, if requested.
 	 */
 	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
 		struct inpcb *local_wild = NULL, *local_exact = NULL;
 #ifdef INET6
 		struct inpcb *local_wild_mapped = NULL;
 #endif
 		struct inpcb *jail_wild = NULL;
 		int injail;
 
 		/*
 		 * Order of socket selection - we always prefer jails.
 		 *      1. jailed, non-wild.
 		 *      2. jailed, wild.
 		 *      3. non-jailed, non-wild.
 		 *      4. non-jailed, wild.
 		 */
 
 		head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
 		    0, pcbinfo->ipi_hashmask)];
 		LIST_FOREACH(inp, head, inp_hash) {
 #ifdef INET6
 			/* XXX inp locking */
 			if ((inp->inp_vflag & INP_IPV4) == 0)
 				continue;
 #endif
 			if (inp->inp_faddr.s_addr != INADDR_ANY ||
 			    inp->inp_lport != lport)
 				continue;
 
 			/* XXX inp locking */
 			if (ifp && ifp->if_type == IFT_FAITH &&
 			    (inp->inp_flags & INP_FAITH) == 0)
 				continue;
 
 			injail = prison_flag(inp->inp_cred, PR_IP4);
 			if (injail) {
 				if (prison_check_ip4(inp->inp_cred,
 				    &laddr) != 0)
 					continue;
 			} else {
 				/* Only the first non-jailed exact match is kept. */
 				if (local_exact != NULL)
 					continue;
 			}
 
 			if (inp->inp_laddr.s_addr == laddr.s_addr) {
 				if (injail)
 					return (inp);
 				else
 					local_exact = inp;
 			} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
 #ifdef INET6
 				/* XXX inp locking, NULL check */
 				if (inp->inp_vflag & INP_IPV6PROTO)
 					local_wild_mapped = inp;
 				else
 #endif
 					if (injail)
 						jail_wild = inp;
 					else
 						local_wild = inp;
 			}
 		} /* LIST_FOREACH */
 		/*
 		 * Fall back in order: jail_wild, local_exact, local_wild and
 		 * finally (INET6 only) local_wild_mapped.
 		 */
 		if (jail_wild != NULL)
 			return (jail_wild);
 		if (local_exact != NULL)
 			return (local_exact);
 		if (local_wild != NULL)
 			return (local_wild);
 #ifdef INET6
 		if (local_wild_mapped != NULL)
 			return (local_wild_mapped);
 #endif
 	} /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */
 
 	return (NULL);
 }
 
 /*
  * Lookup PCB in hash list, using pcbinfo tables.  This variant takes the
  * hash read lock itself and hands the inpcb back locked, so lookupflags
  * must carry INPLOOKUP_RLOCKPCB or INPLOOKUP_WLOCKPCB (anything else is a
  * panic once a pcb has been found).  Returns NULL on a miss, or when
  * in_pcbrele_{r,w}locked() returns non-zero after the inpcb lock is taken.
  */
 static struct inpcb *
 in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
     u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
     struct ifnet *ifp)
 {
 	struct inpcb *inp;
 	int hashflags;
 
 	/* The locked helper does not accept the lock-request bits. */
 	hashflags = lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB);
 
 	INP_HASH_RLOCK(pcbinfo);
 	inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
 	    hashflags, ifp);
 	if (inp == NULL) {
 		INP_HASH_RUNLOCK(pcbinfo);
 		return (NULL);
 	}
 
 	/* Reference the pcb before the hash lock is dropped. */
 	in_pcbref(inp);
 	INP_HASH_RUNLOCK(pcbinfo);
 
 	if ((lookupflags & INPLOOKUP_WLOCKPCB) != 0) {
 		INP_WLOCK(inp);
 		if (in_pcbrele_wlocked(inp))
 			return (NULL);
 	} else if ((lookupflags & INPLOOKUP_RLOCKPCB) != 0) {
 		INP_RLOCK(inp);
 		if (in_pcbrele_rlocked(inp))
 			return (NULL);
 	} else
 		panic("%s: locking bug", __func__);
 	return (inp);
 }
 
 /*
  * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf
  * from which a pre-calculated hash value may be extracted.
  *
  * Possibly more of this logic should be in in_pcbgroup.c.
  */
 /*
  * 4-tuple lookup without an mbuf.  lookupflags must stay within
  * INPLOOKUP_MASK and request either a read or a write lock on the result.
  */
 struct inpcb *
 in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport,
     struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp)
 {
 
 	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
 	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
 	    ("%s: LOCKPCB not set", __func__));
 
 	/*
 	 * Without RSS, connection groups are preferred over the reservation
 	 * table for 4-tuple lookups.  With RSS only the reservation table
 	 * is used, because of the cost of computing the Toeplitz hash in
 	 * software.
 	 *
 	 * XXXRW: This policy belongs in the pcbgroup code, as in principle
 	 * we could be doing RSS with a non-Toeplitz hash that is affordable
 	 * in software.
 	 */
 #if defined(PCBGROUP) && !defined(RSS)
 	if (in_pcbgroup_enabled(pcbinfo)) {
 		struct inpcbgroup *pcbgroup;
 
 		pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr,
 		    fport);
 		return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport,
 		    laddr, lport, lookupflags, ifp));
 	}
 #endif
 	return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
 	    lookupflags, ifp));
 }
 
 /*
  * 4-tuple lookup that may use the hash type and flowid recorded in mbuf m
  * to pick the connection group.  lookupflags must stay within
  * INPLOOKUP_MASK and request either a read or a write lock on the result.
  */
 struct inpcb *
 in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr,
     u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
     struct ifnet *ifp, struct mbuf *m)
 {
 
 	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
 	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
 	    ("%s: LOCKPCB not set", __func__));
 
 #ifdef PCBGROUP
 	/*
 	 * When the mbuf carries a hash type other than M_HASHTYPE_NONE, try
 	 * to select the connection group from that hash.  If no group is
 	 * found, fall back on a software hash of the tuple -- or, with RSS,
 	 * on the reservation table below.
 	 *
 	 * XXXRW: As above, that policy belongs in the pcbgroup code.
 	 */
 	if (in_pcbgroup_enabled(pcbinfo) &&
 	    !(M_HASHTYPE_TEST(m, M_HASHTYPE_NONE))) {
 		struct inpcbgroup *pcbgroup;
 
 		pcbgroup = in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m),
 		    m->m_pkthdr.flowid);
 		if (pcbgroup != NULL)
 			return (in_pcblookup_group(pcbinfo, pcbgroup, faddr,
 			    fport, laddr, lport, lookupflags, ifp));
 #ifndef RSS
 		pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr,
 		    fport);
 		return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport,
 		    laddr, lport, lookupflags, ifp));
 #endif
 	}
 #endif
 	return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
 	    lookupflags, ifp));
 }
 #endif /* INET */
 
 /*
  * Insert PCB onto various hash lists: the connection hash bucket and the
  * per-local-port list.  The caller must hold the inpcb write lock and the
  * hash write lock, and the pcb must not already be on the hash lists.
  *
  * Returns 0 on success, or ENOBUFS if a new port head is needed and cannot
  * be allocated.  When PCBGROUP is configured and do_pcbgroup_update is
  * non-zero, the pcb's connection group is updated as well.
  */
 static int
 in_pcbinshash_internal(struct inpcb *inp, int do_pcbgroup_update)
 {
 	struct inpcbinfo *pcbinfo;
 	struct inpcbhead *bucket;
 	struct inpcbporthead *portbucket;
 	struct inpcbport *phd;
 	u_int32_t fkey;
 
 	pcbinfo = inp->inp_pcbinfo;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(pcbinfo);
 
 	KASSERT((inp->inp_flags & INP_INHASHLIST) == 0,
 	    ("in_pcbinshash: INP_INHASHLIST"));
 
 	/* Foreign-address hash key: the IPv4 address unless this is IPv6. */
 	fkey = inp->inp_faddr.s_addr;
 #ifdef INET6
 	if (inp->inp_vflag & INP_IPV6)
 		fkey = INP6_PCBHASHKEY(&inp->in6p_faddr);
 #endif
 
 	bucket = &pcbinfo->ipi_hashbase[INP_PCBHASH(fkey, inp->inp_lport,
 	    inp->inp_fport, pcbinfo->ipi_hashmask)];
 	portbucket = &pcbinfo->ipi_porthashbase[
 	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];
 
 	/*
 	 * Find the port head for this local port; phd is left NULL if the
 	 * list is exhausted without a match.
 	 */
 	LIST_FOREACH(phd, portbucket, phd_hash) {
 		if (phd->phd_port == inp->inp_lport)
 			break;
 	}
 
 	/*
 	 * No head yet for this port: allocate one and link it in.
 	 */
 	if (phd == NULL) {
 		phd = malloc(sizeof(*phd), M_PCB, M_NOWAIT);
 		if (phd == NULL)
 			return (ENOBUFS); /* XXX */
 		phd->phd_port = inp->inp_lport;
 		LIST_INIT(&phd->phd_pcblist);
 		LIST_INSERT_HEAD(portbucket, phd, phd_hash);
 	}
 
 	inp->inp_phd = phd;
 	LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
 	LIST_INSERT_HEAD(bucket, inp, inp_hash);
 	inp->inp_flags |= INP_INHASHLIST;
 #ifdef PCBGROUP
 	if (do_pcbgroup_update)
 		in_pcbgroup_update(inp);
 #endif
 	return (0);
 }
 
 /*
  * For now, there are two public interfaces to insert an inpcb into the hash
  * lists -- one that does update pcbgroups, and one that doesn't.  The latter
  * is used only in the TCP syncache, where in_pcbinshash is called before the
  * full 4-tuple is set for the inpcb, and we don't want to install in the
  * pcbgroup until later.
  *
  * XXXRW: This seems like a misfeature.  in_pcbinshash should always update
  * connection groups, and partially initialised inpcbs should not be exposed
  * to either reservation hash tables or pcbgroups.
  */
 /*
  * Insert inp into the hash lists and request a pcbgroup update.
  * Returns the result of in_pcbinshash_internal().
  */
 int
 in_pcbinshash(struct inpcb *inp)
 {
 
 	return (in_pcbinshash_internal(inp, 1));
 }
 
 /*
  * Insert inp into the hash lists without requesting a pcbgroup update.
  * Returns the result of in_pcbinshash_internal().
  */
 int
 in_pcbinshash_nopcbgroup(struct inpcb *inp)
 {
 
 	return (in_pcbinshash_internal(inp, 0));
 }
 
 /*
  * Move PCB to the proper hash bucket when { faddr, fport } have been
  * changed.  NOTE: a change of lport is not handled here (the hashed port
  * list would have to be updated as well), so the lport must not change
  * after in_pcbinshash() has been called.
  *
  * The caller must hold the inpcb write lock and the hash write lock.  With
  * PCBGROUP, the connection group is refreshed from m when m is non-NULL
  * and from the pcb alone otherwise.
  */
 void
 in_pcbrehash_mbuf(struct inpcb *inp, struct mbuf *m)
 {
 	struct inpcbinfo *pcbinfo;
 	struct inpcbhead *bucket;
 	u_int32_t fkey;
 
 	pcbinfo = inp->inp_pcbinfo;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(pcbinfo);
 
 	KASSERT(inp->inp_flags & INP_INHASHLIST,
 	    ("in_pcbrehash: !INP_INHASHLIST"));
 
 	/* Foreign-address hash key: the IPv4 address unless this is IPv6. */
 	fkey = inp->inp_faddr.s_addr;
 #ifdef INET6
 	if (inp->inp_vflag & INP_IPV6)
 		fkey = INP6_PCBHASHKEY(&inp->in6p_faddr);
 #endif
 
 	bucket = &pcbinfo->ipi_hashbase[INP_PCBHASH(fkey, inp->inp_lport,
 	    inp->inp_fport, pcbinfo->ipi_hashmask)];
 
 	/* Unlink from the old bucket and put at the head of the new one. */
 	LIST_REMOVE(inp, inp_hash);
 	LIST_INSERT_HEAD(bucket, inp, inp_hash);
 
 #ifdef PCBGROUP
 	if (m == NULL)
 		in_pcbgroup_update(inp);
 	else
 		in_pcbgroup_update_mbuf(inp, m);
 #endif
 }
 
 /*
  * Rehash inp without an mbuf; equivalent to in_pcbrehash_mbuf(inp, NULL).
  */
 void
 in_pcbrehash(struct inpcb *inp)
 {
 
 	in_pcbrehash_mbuf(inp, NULL);
 }
 
 /*
  * Remove PCB from various lists: the hash and port lists (if it is on
  * them), the pcbinfo-wide list and, with PCBGROUP, its connection group.
  * The caller must hold the pcbinfo write lock and the inpcb write lock;
  * the hash write lock is taken here while the hash lists are edited.
  */
 static void
 in_pcbremlists(struct inpcb *inp)
 {
 	struct inpcbinfo *pcbinfo;
 	struct inpcbport *porthead;
 
 	pcbinfo = inp->inp_pcbinfo;
 
 	INP_INFO_WLOCK_ASSERT(pcbinfo);
 	INP_WLOCK_ASSERT(inp);
 
 	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
 	if ((inp->inp_flags & INP_INHASHLIST) != 0) {
 		porthead = inp->inp_phd;
 
 		INP_HASH_WLOCK(pcbinfo);
 		LIST_REMOVE(inp, inp_hash);
 		LIST_REMOVE(inp, inp_portlist);
 		/* Last pcb on this port: the port head goes away too. */
 		if (LIST_EMPTY(&porthead->phd_pcblist)) {
 			LIST_REMOVE(porthead, phd_hash);
 			free(porthead, M_PCB);
 		}
 		INP_HASH_WUNLOCK(pcbinfo);
 		inp->inp_flags &= ~INP_INHASHLIST;
 	}
 	LIST_REMOVE(inp, inp_list);
 	pcbinfo->ipi_count--;
 #ifdef PCBGROUP
 	in_pcbgroup_remove(inp);
 #endif
 }
 
 /*
  * A set label operation has occurred at the socket layer, propagate the
  * label change into the in_pcb for the socket.
  *
  * This is a no-op unless the kernel is built with MAC.  The inpcb write
  * lock is taken first, then the socket lock, around the call to
  * mac_inpcb_sosetlabel().
  */
 void
 in_pcbsosetlabel(struct socket *so)
 {
 #ifdef MAC
 	struct inpcb *inp;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL"));
 
 	INP_WLOCK(inp);
 	SOCK_LOCK(so);
 	mac_inpcb_sosetlabel(so, inp);
 	SOCK_UNLOCK(so);
 	INP_WUNLOCK(inp);
 #endif
 }
 
 /*
  * ipport_tick runs once per second and decides whether random port
  * allocation should continue.  If more than ipport_randomcps ports were
  * allocated during the last second, allocation becomes sequential; it
  * returns to random only after the rate has stayed at or below
  * ipport_randomcps for ipport_randomtime seconds.
  *
  * The callout re-arms itself for hz ticks later; xtp is not used.
  */
 static void
 ipport_tick(void *xtp)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);	/* XXX appease INVARIANTS here */
 		if (V_ipport_tcpallocs >
 		    V_ipport_tcplastcount + V_ipport_randomcps) {
 			/* Over the limit: (re)start the sequential period. */
 			V_ipport_stoprandom = V_ipport_randomtime;
 		} else if (V_ipport_stoprandom > 0) {
 			/* Within the limit: count the period down. */
 			V_ipport_stoprandom--;
 		}
 		V_ipport_tcplastcount = V_ipport_tcpallocs;
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 	callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL);
 }
 
 /*
  * Stop the ipport_tick callout.  Registered as a shutdown_pre_sync event
  * handler by ipport_tick_init(); xtp is not used.
  */
 static void
 ip_fini(void *xtp)
 {
 
 	callout_stop(&ipport_tick_callout);
 }
 
 /*
  * The ipport_tick_callout should start running at about the time we attach
  * the inet or inet6 domains.
  *
  * Initializes the callout, schedules the first ipport_tick() one tick from
  * now, and registers ip_fini() to stop it at shutdown_pre_sync.
  */
 static void
 ipport_tick_init(const void *unused __unused)
 {
 
 	/* Start ipport_tick. */
 	callout_init(&ipport_tick_callout, CALLOUT_MPSAFE);
 	callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL);
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL,
 		SHUTDOWN_PRI_DEFAULT);
 }
 SYSINIT(ipport_tick_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, 
     ipport_tick_init, NULL);
 
 /* Function wrapper around the INP_WLOCK() macro: write-lock inp. */
 void
 inp_wlock(struct inpcb *inp)
 {
 
 	INP_WLOCK(inp);
 }
 
 /* Function wrapper around the INP_WUNLOCK() macro: drop the write lock. */
 void
 inp_wunlock(struct inpcb *inp)
 {
 
 	INP_WUNLOCK(inp);
 }
 
 /* Function wrapper around the INP_RLOCK() macro: read-lock inp. */
 void
 inp_rlock(struct inpcb *inp)
 {
 
 	INP_RLOCK(inp);
 }
 
 /* Function wrapper around the INP_RUNLOCK() macro: drop the read lock. */
 void
 inp_runlock(struct inpcb *inp)
 {
 
 	INP_RUNLOCK(inp);
 }
 
 #ifdef INVARIANTS
 /*
  * Function wrapper around INP_WLOCK_ASSERT().  Note that, despite the
  * generic name, it is the write lock that is asserted.
  */
 void
 inp_lock_assert(struct inpcb *inp)
 {
 
 	INP_WLOCK_ASSERT(inp);
 }
 
 /* Function wrapper around the INP_UNLOCK_ASSERT() macro. */
 void
 inp_unlock_assert(struct inpcb *inp)
 {
 
 	INP_UNLOCK_ASSERT(inp);
 }
 #endif
 
 /*
  * Call func(inp, arg) for every inpcb on the V_tcbinfo list.  The pcbinfo
  * read lock is held for the whole walk and each inpcb is write-locked
  * around its callback.
  */
 void
 inp_apply_all(void (*func)(struct inpcb *, void *), void *arg)
 {
 	struct inpcb *inp;
 
 	INP_INFO_RLOCK(&V_tcbinfo);
 	LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) {
 		INP_WLOCK(inp);
 		func(inp, arg);
 		INP_WUNLOCK(inp);
 	}
 	INP_INFO_RUNLOCK(&V_tcbinfo);
 }
 
 /*
  * Return inp->inp_socket.  The caller must hold the inpcb write lock
  * (asserted).
  */
 struct socket *
 inp_inpcbtosocket(struct inpcb *inp)
 {
 
 	INP_WLOCK_ASSERT(inp);
 	return (inp->inp_socket);
 }
 
 /*
  * Return inp->inp_ppcb cast to a tcpcb pointer.  The caller must hold the
  * inpcb write lock (asserted).  No check is made here that the pcb really
  * belongs to TCP.
  */
 struct tcpcb *
 inp_inpcbtotcpcb(struct inpcb *inp)
 {
 
 	INP_WLOCK_ASSERT(inp);
 	return ((struct tcpcb *)inp->inp_ppcb);
 }
 
 /* Return inp->inp_ip_tos.  No lock assertion is made here. */
 int
 inp_ip_tos_get(const struct inpcb *inp)
 {
 
 	return (inp->inp_ip_tos);
 }
 
 /* Store val in inp->inp_ip_tos.  No lock assertion is made here. */
 void
 inp_ip_tos_set(struct inpcb *inp, int val)
 {
 
 	inp->inp_ip_tos = val;
 }
 
 /*
  * Copy the pcb's IPv4 4-tuple into the caller's variables: local address
  * and port into *laddr / *lp, foreign address and port into *faddr / *fp.
  * Values are copied as stored in the pcb.  The inpcb must be locked
  * (asserted).
  */
 void
 inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
     uint32_t *faddr, uint16_t *fp)
 {
 
 	INP_LOCK_ASSERT(inp);
 
 	/* Local endpoint. */
 	*laddr = inp->inp_laddr.s_addr;
 	*lp = inp->inp_lport;
 
 	/* Foreign endpoint. */
 	*faddr = inp->inp_faddr.s_addr;
 	*fp = inp->inp_fport;
 }
 
 /* Function wrapper around sotoinpcb(). */
 struct inpcb *
 so_sotoinpcb(struct socket *so)
 {
 
 	return (sotoinpcb(so));
 }
 
 /* Function wrapper around sototcpcb(). */
 struct tcpcb *
 so_sototcpcb(struct socket *so)
 {
 
 	return (sototcpcb(so));
 }
 
 #ifdef DDB
 /* Emit 'indent' spaces on the debugger console (none if indent <= 0). */
 static void
 db_print_indent(int indent)
 {
 
 	while (indent-- > 0)
 		db_printf(" ");
 }
 
 /*
  * Print an in_conninfo for DDB: a "<name> at <ptr>" header at 'indent',
  * followed by the local and foreign address/port pairs two columns deeper.
  * Addresses are formatted as IPv6 when INC_ISIPV6 is set (INET6 kernels
  * only) and as IPv4 otherwise; ports are converted with ntohs().
  */
 static void
 db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent)
 {
 	char foreign[48], local[48];
 	int sub;
 
 	db_print_indent(indent);
 	db_printf("%s at %p\n", name, inc);
 
 	sub = indent + 2;
 
 #ifdef INET6
 	if (inc->inc_flags & INC_ISIPV6) {
 		/* IPv6. */
 		ip6_sprintf(local, &inc->inc6_laddr);
 		ip6_sprintf(foreign, &inc->inc6_faddr);
 	} else
 #endif
 	{
 		/* IPv4. */
 		inet_ntoa_r(inc->inc_laddr, local);
 		inet_ntoa_r(inc->inc_faddr, foreign);
 	}
 
 	db_print_indent(sub);
 	db_printf("inc_laddr %s   inc_lport %u\n", local,
 	    ntohs(inc->inc_lport));
 
 	db_print_indent(sub);
 	db_printf("inc_faddr %s   inc_fport %u\n", foreign,
 	    ntohs(inc->inc_fport));
 }
 
 static void
 db_print_inpflags(int inp_flags)
 {
 	int comma;
 
 	comma = 0;
 	if (inp_flags & INP_RECVOPTS) {
 		db_printf("%sINP_RECVOPTS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_RECVRETOPTS) {
 		db_printf("%sINP_RECVRETOPTS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_RECVDSTADDR) {
 		db_printf("%sINP_RECVDSTADDR", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_HDRINCL) {
 		db_printf("%sINP_HDRINCL", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_HIGHPORT) {
 		db_printf("%sINP_HIGHPORT", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_LOWPORT) {
 		db_printf("%sINP_LOWPORT", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_ANONPORT) {
 		db_printf("%sINP_ANONPORT", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_RECVIF) {
 		db_printf("%sINP_RECVIF", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_MTUDISC) {
 		db_printf("%sINP_MTUDISC", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_FAITH) {
 		db_printf("%sINP_FAITH", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_RECVTTL) {
 		db_printf("%sINP_RECVTTL", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_DONTFRAG) {
 		db_printf("%sINP_DONTFRAG", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_RECVTOS) {
 		db_printf("%sINP_RECVTOS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_IPV6_V6ONLY) {
 		db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_PKTINFO) {
 		db_printf("%sIN6P_PKTINFO", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_HOPLIMIT) {
 		db_printf("%sIN6P_HOPLIMIT", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_HOPOPTS) {
 		db_printf("%sIN6P_HOPOPTS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_DSTOPTS) {
 		db_printf("%sIN6P_DSTOPTS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_RTHDR) {
 		db_printf("%sIN6P_RTHDR", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_RTHDRDSTOPTS) {
 		db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_TCLASS) {
 		db_printf("%sIN6P_TCLASS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_AUTOFLOWLABEL) {
 		db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_TIMEWAIT) {
 		db_printf("%sINP_TIMEWAIT", comma ? ", " : "");
 		comma  = 1;
 	}
 	if (inp_flags & INP_ONESBCAST) {
 		db_printf("%sINP_ONESBCAST", comma ? ", " : "");
 		comma  = 1;
 	}
 	if (inp_flags & INP_DROPPED) {
 		db_printf("%sINP_DROPPED", comma ? ", " : "");
 		comma  = 1;
 	}
 	if (inp_flags & INP_SOCKREF) {
 		db_printf("%sINP_SOCKREF", comma ? ", " : "");
 		comma  = 1;
 	}
 	if (inp_flags & IN6P_RFC2292) {
 		db_printf("%sIN6P_RFC2292", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_MTU) {
 		db_printf("IN6P_MTU%s", comma ? ", " : "");
 		comma = 1;
 	}
 }
 
 /*
  * Print the symbolic name of every bit set in inp_vflag (INP_IPV4,
  * INP_IPV6, INP_IPV6PROTO) as a ", "-separated list, with no trailing
  * newline.
  */
 static void
 db_print_inpvflag(u_char inp_vflag)
 {
 	const char *sep;
 
 	/* Empty before the first name, ", " before every later one. */
 	sep = "";
 	if (inp_vflag & INP_IPV4) {
 		db_printf("%sINP_IPV4", sep);
 		sep = ", ";
 	}
 	if (inp_vflag & INP_IPV6) {
 		db_printf("%sINP_IPV6", sep);
 		sep = ", ";
 	}
 	if (inp_vflag & INP_IPV6PROTO) {
 		db_printf("%sINP_IPV6PROTO", sep);
 		sep = ", ";
 	}
 }
 
 /*
  * Dump an inpcb for DDB: a "<name> at <ptr>" header at 'indent', then the
  * individual fields two columns deeper, one group per line.  inp_flags and
  * inp_vflag are shown both in hex and decoded into symbolic names.  The
  * option pointers printed depend on whether INP_IPV6 is set in inp_vflag
  * (INET6 kernels only).
  */
 static void
 db_print_inpcb(struct inpcb *inp, const char *name, int indent)
 {
 
 	db_print_indent(indent);
 	db_printf("%s at %p\n", name, inp);
 
 	indent += 2;
 
 	db_print_indent(indent);
 	db_printf("inp_flow: 0x%x\n", inp->inp_flow);
 
 	db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent);
 
 	db_print_indent(indent);
 	db_printf("inp_ppcb: %p   inp_pcbinfo: %p   inp_socket: %p\n",
 	    inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket);
 
 	db_print_indent(indent);
 	db_printf("inp_label: %p   inp_flags: 0x%x (",
 	    inp->inp_label, inp->inp_flags);
 	db_print_inpflags(inp->inp_flags);
 	db_printf(")\n");
 
 	db_print_indent(indent);
 	db_printf("inp_sp: %p   inp_vflag: 0x%x (", inp->inp_sp,
 	    inp->inp_vflag);
 	db_print_inpvflag(inp->inp_vflag);
 	db_printf(")\n");
 
 	db_print_indent(indent);
 	db_printf("inp_ip_ttl: %d   inp_ip_p: %d   inp_ip_minttl: %d\n",
 	    inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl);
 
 	db_print_indent(indent);
 #ifdef INET6
 	if (inp->inp_vflag & INP_IPV6) {
 		db_printf("in6p_options: %p   in6p_outputopts: %p   "
 		    "in6p_moptions: %p\n", inp->in6p_options,
 		    inp->in6p_outputopts, inp->in6p_moptions);
 		/*
 		 * The IPv6 case prints two lines; indent the second one as
 		 * well so that it lines up with the rest of the dump.
 		 */
 		db_print_indent(indent);
 		db_printf("in6p_icmp6filt: %p   in6p_cksum %d   "
 		    "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum,
 		    inp->in6p_hops);
 	} else
 #endif
 	{
 		db_printf("inp_ip_tos: %d   inp_ip_options: %p   "
 		    "inp_ip_moptions: %p\n", inp->inp_ip_tos,
 		    inp->inp_options, inp->inp_moptions);
 	}
 
 	db_print_indent(indent);
 	db_printf("inp_phd: %p   inp_gencnt: %ju\n", inp->inp_phd,
 	    (uintmax_t)inp->inp_gencnt);
 }
 
 /*
  * DDB "show inpcb <addr>": treat addr as a struct inpcb pointer and dump
  * it; print a usage line when no address was supplied.
  */
 DB_SHOW_COMMAND(inpcb, db_show_inpcb)
 {
 
 	if (have_addr)
 		db_print_inpcb((struct inpcb *)addr, "inpcb", 0);
 	else
 		db_printf("usage: show inpcb <addr>\n");
 }
 #endif /* DDB */
Index: user/ae/inet6/sys/netinet/ip_options.c
===================================================================
--- user/ae/inet6/sys/netinet/ip_options.c	(revision 271452)
+++ user/ae/inet6/sys/netinet/ip_options.c	(revision 271453)
@@ -1,737 +1,740 @@
 /*
  * Copyright (c) 1982, 1986, 1988, 1993
  *      The Regents of the University of California.
  * Copyright (c) 2005 Andre Oppermann, Internet Business Solutions AG.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ipstealth.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/mbuf.h>
 #include <sys/domain.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/time.h>
 #include <sys/kernel.h>
 #include <sys/syslog.h>
 #include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/if_types.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/route.h>
 #include <net/netisr.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
 #include <netinet/ip_icmp.h>
 #include <machine/in_cksum.h>
 
 #include <sys/socketvar.h>
 
 static int	ip_dosourceroute = 0;
 SYSCTL_INT(_net_inet_ip, IPCTL_SOURCEROUTE, sourceroute, CTLFLAG_RW,
     &ip_dosourceroute, 0, "Enable forwarding source routed IP packets");
 
 static int	ip_acceptsourceroute = 0;
 SYSCTL_INT(_net_inet_ip, IPCTL_ACCEPTSOURCEROUTE, accept_sourceroute, 
     CTLFLAG_RW, &ip_acceptsourceroute, 0, 
     "Enable accepting source routed IP packets");
 
 int		ip_doopts = 1;	/* 0 = ignore, 1 = process, 2 = reject */
 SYSCTL_INT(_net_inet_ip, OID_AUTO, process_options, CTLFLAG_RW,
     &ip_doopts, 0, "Enable IP options processing ([LS]SRR, RR, TS)");
 
 static void	save_rte(struct mbuf *m, u_char *, struct in_addr);
 
 /*
  * Do option processing on a datagram, possibly discarding it if bad options
  * are encountered, or forwarding it if source-routed.
  *
  * The pass argument is used when operating in the IPSTEALTH mode to tell
  * what options to process: [LS]SRR (pass 0) or the others (pass 1).  The
  * reason for as many as two passes is that when doing IPSTEALTH, non-routing
  * options should be processed only if the packet is for us.
  *
  * Returns 1 if packet has been forwarded/freed, 0 if the packet should be
  * processed further.
  */
 int
 ip_dooptions(struct mbuf *m, int pass)
 {
 	struct ip *ip = mtod(m, struct ip *);
 	u_char *cp;
 	struct in_ifaddr *ia;
 	int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0;
 	struct in_addr *sin, dst;
 	uint32_t ntime;
 	struct	sockaddr_in ipaddr = { sizeof(ipaddr), AF_INET };
 
 	/* Ignore or reject packets with IP options. */
 	if (ip_doopts == 0)
 		return 0;
 	else if (ip_doopts == 2) {
 		type = ICMP_UNREACH;
 		code = ICMP_UNREACH_FILTER_PROHIB;
 		goto bad;
 	}
 
 	dst = ip->ip_dst;
 	cp = (u_char *)(ip + 1);
 	cnt = (ip->ip_hl << 2) - sizeof (struct ip);
 	for (; cnt > 0; cnt -= optlen, cp += optlen) {
 		opt = cp[IPOPT_OPTVAL];
 		if (opt == IPOPT_EOL)
 			break;
 		if (opt == IPOPT_NOP)
 			optlen = 1;
 		else {
 			if (cnt < IPOPT_OLEN + sizeof(*cp)) {
 				code = &cp[IPOPT_OLEN] - (u_char *)ip;
 				goto bad;
 			}
 			optlen = cp[IPOPT_OLEN];
 			if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) {
 				code = &cp[IPOPT_OLEN] - (u_char *)ip;
 				goto bad;
 			}
 		}
 		switch (opt) {
 
 		default:
 			break;
 
 		/*
 		 * Source routing with record.  Find interface with current
 		 * destination address.  If none on this machine then drop if
 		 * strictly routed, or do nothing if loosely routed.  Record
 		 * interface address and bring up next address component.  If
 		 * strictly routed make sure next address is on directly
 		 * accessible net.
 		 */
 		case IPOPT_LSRR:
 		case IPOPT_SSRR:
 #ifdef IPSTEALTH
 			if (V_ipstealth && pass > 0)
 				break;
 #endif
 			if (optlen < IPOPT_OFFSET + sizeof(*cp)) {
 				code = &cp[IPOPT_OLEN] - (u_char *)ip;
 				goto bad;
 			}
 			if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
 				code = &cp[IPOPT_OFFSET] - (u_char *)ip;
 				goto bad;
 			}
 			ipaddr.sin_addr = ip->ip_dst;
 			if (ifa_ifwithaddr_check((struct sockaddr *)&ipaddr)
 			    == 0) {
 				if (opt == IPOPT_SSRR) {
 					type = ICMP_UNREACH;
 					code = ICMP_UNREACH_SRCFAIL;
 					goto bad;
 				}
 				if (!ip_dosourceroute)
 					goto nosourcerouting;
 				/*
 				 * Loose routing, and not at next destination
 				 * yet; nothing to do except forward.
 				 */
 				break;
 			}
 			off--;			/* 0 origin */
 			if (off > optlen - (int)sizeof(struct in_addr)) {
 				/*
 				 * End of source route.  Should be for us.
 				 */
 				if (!ip_acceptsourceroute)
 					goto nosourcerouting;
 				save_rte(m, cp, ip->ip_src);
 				break;
 			}
 #ifdef IPSTEALTH
 			if (V_ipstealth)
 				goto dropit;
 #endif
 			if (!ip_dosourceroute) {
 				if (V_ipforwarding) {
 					char buf[16]; /* aaa.bbb.ccc.ddd\0 */
 					/*
 					 * Acting as a router, so generate
 					 * ICMP
 					 */
 nosourcerouting:
 					strcpy(buf, inet_ntoa(ip->ip_dst));
 					log(LOG_WARNING, 
 					    "attempted source route from %s to %s\n",
 					    inet_ntoa(ip->ip_src), buf);
 					type = ICMP_UNREACH;
 					code = ICMP_UNREACH_SRCFAIL;
 					goto bad;
 				} else {
 					/*
 					 * Not acting as a router, so
 					 * silently drop.
 					 */
 #ifdef IPSTEALTH
 dropit:
 #endif
 					IPSTAT_INC(ips_cantforward);
 					m_freem(m);
 					return (1);
 				}
 			}
 
 			/*
 			 * locate outgoing interface
 			 */
 			(void)memcpy(&ipaddr.sin_addr, cp + off,
 			    sizeof(ipaddr.sin_addr));
 
 			if (opt == IPOPT_SSRR) {
 #define	INA	struct in_ifaddr *
 #define	SA	struct sockaddr *
-			    if ((ia = (INA)ifa_ifwithdstaddr((SA)&ipaddr)) == NULL)
-				    ia = (INA)ifa_ifwithnet((SA)&ipaddr, 0);
+			    ia = (INA)ifa_ifwithdstaddr((SA)&ipaddr,
+					    RT_ALL_FIBS);
+			    if (ia == NULL)
+				    ia = (INA)ifa_ifwithnet((SA)&ipaddr, 0,
+						    RT_ALL_FIBS);
 			} else
 /* XXX MRT 0 for routing */
 				ia = ip_rtaddr(ipaddr.sin_addr, M_GETFIB(m));
 			if (ia == NULL) {
 				type = ICMP_UNREACH;
 				code = ICMP_UNREACH_SRCFAIL;
 				goto bad;
 			}
 			ip->ip_dst = ipaddr.sin_addr;
 			(void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr),
 			    sizeof(struct in_addr));
 			ifa_free(&ia->ia_ifa);
 			cp[IPOPT_OFFSET] += sizeof(struct in_addr);
 			/*
 			 * Let ip_intr's mcast routing check handle mcast pkts
 			 */
 			forward = !IN_MULTICAST(ntohl(ip->ip_dst.s_addr));
 			break;
 
 		case IPOPT_RR:
 #ifdef IPSTEALTH
 			if (V_ipstealth && pass == 0)
 				break;
 #endif
 			if (optlen < IPOPT_OFFSET + sizeof(*cp)) {
 				code = &cp[IPOPT_OFFSET] - (u_char *)ip;
 				goto bad;
 			}
 			if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
 				code = &cp[IPOPT_OFFSET] - (u_char *)ip;
 				goto bad;
 			}
 			/*
 			 * If no space remains, ignore.
 			 */
 			off--;			/* 0 origin */
 			if (off > optlen - (int)sizeof(struct in_addr))
 				break;
 			(void)memcpy(&ipaddr.sin_addr, &ip->ip_dst,
 			    sizeof(ipaddr.sin_addr));
 			/*
 			 * Locate outgoing interface; if we're the
 			 * destination, use the incoming interface (should be
 			 * same).
 			 */
 			if ((ia = (INA)ifa_ifwithaddr((SA)&ipaddr)) == NULL &&
 			    (ia = ip_rtaddr(ipaddr.sin_addr, M_GETFIB(m))) == NULL) {
 				type = ICMP_UNREACH;
 				code = ICMP_UNREACH_HOST;
 				goto bad;
 			}
 			(void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr),
 			    sizeof(struct in_addr));
 			ifa_free(&ia->ia_ifa);
 			cp[IPOPT_OFFSET] += sizeof(struct in_addr);
 			break;
 
 		case IPOPT_TS:
 #ifdef IPSTEALTH
 			if (V_ipstealth && pass == 0)
 				break;
 #endif
 			code = cp - (u_char *)ip;
 			if (optlen < 4 || optlen > 40) {
 				code = &cp[IPOPT_OLEN] - (u_char *)ip;
 				goto bad;
 			}
 			if ((off = cp[IPOPT_OFFSET]) < 5) {
 				code = &cp[IPOPT_OLEN] - (u_char *)ip;
 				goto bad;
 			}
 			if (off > optlen - (int)sizeof(int32_t)) {
 				cp[IPOPT_OFFSET + 1] += (1 << 4);
 				if ((cp[IPOPT_OFFSET + 1] & 0xf0) == 0) {
 					code = &cp[IPOPT_OFFSET] - (u_char *)ip;
 					goto bad;
 				}
 				break;
 			}
 			off--;				/* 0 origin */
 			sin = (struct in_addr *)(cp + off);
 			switch (cp[IPOPT_OFFSET + 1] & 0x0f) {
 
 			case IPOPT_TS_TSONLY:
 				break;
 
 			case IPOPT_TS_TSANDADDR:
 				if (off + sizeof(uint32_t) +
 				    sizeof(struct in_addr) > optlen) {
 					code = &cp[IPOPT_OFFSET] - (u_char *)ip;
 					goto bad;
 				}
 				ipaddr.sin_addr = dst;
 				ia = (INA)ifaof_ifpforaddr((SA)&ipaddr,
 							    m->m_pkthdr.rcvif);
 				if (ia == NULL)
 					continue;
 				(void)memcpy(sin, &IA_SIN(ia)->sin_addr,
 				    sizeof(struct in_addr));
 				ifa_free(&ia->ia_ifa);
 				cp[IPOPT_OFFSET] += sizeof(struct in_addr);
 				off += sizeof(struct in_addr);
 				break;
 
 			case IPOPT_TS_PRESPEC:
 				if (off + sizeof(uint32_t) +
 				    sizeof(struct in_addr) > optlen) {
 					code = &cp[IPOPT_OFFSET] - (u_char *)ip;
 					goto bad;
 				}
 				(void)memcpy(&ipaddr.sin_addr, sin,
 				    sizeof(struct in_addr));
 				if (ifa_ifwithaddr_check((SA)&ipaddr) == 0)
 					continue;
 				cp[IPOPT_OFFSET] += sizeof(struct in_addr);
 				off += sizeof(struct in_addr);
 				break;
 
 			default:
 				code = &cp[IPOPT_OFFSET + 1] - (u_char *)ip;
 				goto bad;
 			}
 			ntime = iptime();
 			(void)memcpy(cp + off, &ntime, sizeof(uint32_t));
 			cp[IPOPT_OFFSET] += sizeof(uint32_t);
 		}
 	}
 	if (forward && V_ipforwarding) {
 		ip_forward(m, 1);
 		return (1);
 	}
 	return (0);
 bad:
 	icmp_error(m, type, code, 0, 0);
 	IPSTAT_INC(ips_badoptions);
 	return (1);
 }
 
 /*
  * Save incoming source route for use in replies, to be picked up later by
  * ip_srcroute if the receiver is interested.
  */
 static void
 save_rte(struct mbuf *m, u_char *option, struct in_addr dst)
 {
 	unsigned olen;
 	struct ipopt_tag *opts;
 
 	opts = (struct ipopt_tag *)m_tag_get(PACKET_TAG_IPOPTIONS,
 	    sizeof(struct ipopt_tag), M_NOWAIT);
 	if (opts == NULL)
 		return;
 
 	olen = option[IPOPT_OLEN];
 	if (olen > sizeof(opts->ip_srcrt) - (1 + sizeof(dst))) {
 		m_tag_free((struct m_tag *)opts);
 		return;
 	}
 	bcopy(option, opts->ip_srcrt.srcopt, olen);
 	opts->ip_nhops = (olen - IPOPT_OFFSET - 1) / sizeof(struct in_addr);
 	opts->ip_srcrt.dst = dst;
 	m_tag_prepend(m, (struct m_tag *)opts);
 }
 
 /*
  * Retrieve incoming source route for use in replies, in the same form used
  * by setsockopt.  The first hop is placed before the options, will be
  * removed later.
  */
 struct mbuf *
 ip_srcroute(struct mbuf *m0)
 {
 	struct in_addr *p, *q;
 	struct mbuf *m;
 	struct ipopt_tag *opts;
 
 	opts = (struct ipopt_tag *)m_tag_find(m0, PACKET_TAG_IPOPTIONS, NULL);
 	if (opts == NULL)
 		return (NULL);
 
 	if (opts->ip_nhops == 0)
 		return (NULL);
 	m = m_get(M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		return (NULL);
 
 #define OPTSIZ	(sizeof(opts->ip_srcrt.nop) + sizeof(opts->ip_srcrt.srcopt))
 
 	/* length is (nhops+1)*sizeof(addr) + sizeof(nop + srcrt header) */
 	m->m_len = opts->ip_nhops * sizeof(struct in_addr) +
 	    sizeof(struct in_addr) + OPTSIZ;
 
 	/*
 	 * First, save first hop for return route.
 	 */
 	p = &(opts->ip_srcrt.route[opts->ip_nhops - 1]);
 	*(mtod(m, struct in_addr *)) = *p--;
 
 	/*
 	 * Copy option fields and padding (nop) to mbuf.
 	 */
 	opts->ip_srcrt.nop = IPOPT_NOP;
 	opts->ip_srcrt.srcopt[IPOPT_OFFSET] = IPOPT_MINOFF;
 	(void)memcpy(mtod(m, caddr_t) + sizeof(struct in_addr),
 	    &(opts->ip_srcrt.nop), OPTSIZ);
 	q = (struct in_addr *)(mtod(m, caddr_t) +
 	    sizeof(struct in_addr) + OPTSIZ);
 #undef OPTSIZ
 	/*
 	 * Record return path as an IP source route, reversing the path
 	 * (pointers are now aligned).
 	 */
 	while (p >= opts->ip_srcrt.route) {
 		*q++ = *p--;
 	}
 	/*
 	 * Last hop goes to final destination.
 	 */
 	*q = opts->ip_srcrt.dst;
 	m_tag_delete(m0, (struct m_tag *)opts);
 	return (m);
 }
 
 /*
  * Strip out IP options, at higher level protocol in the kernel.
  */
 void
 ip_stripoptions(struct mbuf *m)
 {
 	struct ip *ip = mtod(m, struct ip *);
 	int olen;
 
 	olen = (ip->ip_hl << 2) - sizeof(struct ip);
 	m->m_len -= olen;
 	if (m->m_flags & M_PKTHDR)
 		m->m_pkthdr.len -= olen;
 	ip->ip_len = htons(ntohs(ip->ip_len) - olen);
 	ip->ip_hl = sizeof(struct ip) >> 2;
 
 	bcopy((char *)ip + sizeof(struct ip) + olen, (ip + 1),
 	    (size_t )(m->m_len - sizeof(struct ip)));
 }
 
 /*
  * Insert IP options into preformed packet.  Adjust IP destination as
  * required for IP source routing, as indicated by a non-zero in_addr at the
  * start of the options.
  *
  * XXX This routine assumes that the packet has no options in place.
  */
 struct mbuf *
 ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen)
 {
 	struct ipoption *p = mtod(opt, struct ipoption *);
 	struct mbuf *n;
 	struct ip *ip = mtod(m, struct ip *);
 	unsigned optlen;
 
 	optlen = opt->m_len - sizeof(p->ipopt_dst);
 	if (optlen + ntohs(ip->ip_len) > IP_MAXPACKET) {
 		*phlen = 0;
 		return (m);		/* XXX should fail */
 	}
 	if (p->ipopt_dst.s_addr)
 		ip->ip_dst = p->ipopt_dst;
 	if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) {
 		n = m_gethdr(M_NOWAIT, MT_DATA);
 		if (n == NULL) {
 			*phlen = 0;
 			return (m);
 		}
 		m_move_pkthdr(n, m);
 		n->m_pkthdr.rcvif = NULL;
 		n->m_pkthdr.len += optlen;
 		m->m_len -= sizeof(struct ip);
 		m->m_data += sizeof(struct ip);
 		n->m_next = m;
 		m = n;
 		m->m_len = optlen + sizeof(struct ip);
 		m->m_data += max_linkhdr;
 		bcopy(ip, mtod(m, void *), sizeof(struct ip));
 	} else {
 		m->m_data -= optlen;
 		m->m_len += optlen;
 		m->m_pkthdr.len += optlen;
 		bcopy(ip, mtod(m, void *), sizeof(struct ip));
 	}
 	ip = mtod(m, struct ip *);
 	bcopy(p->ipopt_list, ip + 1, optlen);
 	*phlen = sizeof(struct ip) + optlen;
 	ip->ip_v = IPVERSION;
 	ip->ip_hl = *phlen >> 2;
 	ip->ip_len = htons(ntohs(ip->ip_len) + optlen);
 	return (m);
 }
 
 /*
  * Copy options from ip to jp, omitting those not copied during
  * fragmentation.
  */
 int
 ip_optcopy(struct ip *ip, struct ip *jp)
 {
 	u_char *cp, *dp;
 	int opt, optlen, cnt;
 
 	cp = (u_char *)(ip + 1);
 	dp = (u_char *)(jp + 1);
 	cnt = (ip->ip_hl << 2) - sizeof (struct ip);
 	for (; cnt > 0; cnt -= optlen, cp += optlen) {
 		opt = cp[0];
 		if (opt == IPOPT_EOL)
 			break;
 		if (opt == IPOPT_NOP) {
 			/* Preserve for IP mcast tunnel's LSRR alignment. */
 			*dp++ = IPOPT_NOP;
 			optlen = 1;
 			continue;
 		}
 
 		KASSERT(cnt >= IPOPT_OLEN + sizeof(*cp),
 		    ("ip_optcopy: malformed ipv4 option"));
 		optlen = cp[IPOPT_OLEN];
 		KASSERT(optlen >= IPOPT_OLEN + sizeof(*cp) && optlen <= cnt,
 		    ("ip_optcopy: malformed ipv4 option"));
 
 		/* Bogus lengths should have been caught by ip_dooptions. */
 		if (optlen > cnt)
 			optlen = cnt;
 		if (IPOPT_COPIED(opt)) {
 			bcopy(cp, dp, optlen);
 			dp += optlen;
 		}
 	}
 	for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++)
 		*dp++ = IPOPT_EOL;
 	return (optlen);
 }
 
 /*
  * Set up IP options in pcb for insertion in output packets.  Store in mbuf
  * with pointer in pcbopt, adding pseudo-option with destination address if
  * source routed.
  */
 int
 ip_pcbopts(struct inpcb *inp, int optname, struct mbuf *m)
 {
 	int cnt, optlen;
 	u_char *cp;
 	struct mbuf **pcbopt;
 	u_char opt;
 
 	INP_WLOCK_ASSERT(inp);
 
 	pcbopt = &inp->inp_options;
 
 	/* turn off any old options */
 	if (*pcbopt)
 		(void)m_free(*pcbopt);
 	*pcbopt = 0;
 	if (m == NULL || m->m_len == 0) {
 		/*
 		 * Only turning off any previous options.
 		 */
 		if (m != NULL)
 			(void)m_free(m);
 		return (0);
 	}
 
 	if (m->m_len % sizeof(int32_t))
 		goto bad;
 	/*
 	 * IP first-hop destination address will be stored before actual
 	 * options; move other options back and clear it when none present.
 	 */
 	if (m->m_data + m->m_len + sizeof(struct in_addr) >= &m->m_dat[MLEN])
 		goto bad;
 	cnt = m->m_len;
 	m->m_len += sizeof(struct in_addr);
 	cp = mtod(m, u_char *) + sizeof(struct in_addr);
 	bcopy(mtod(m, void *), cp, (unsigned)cnt);
 	bzero(mtod(m, void *), sizeof(struct in_addr));
 
 	for (; cnt > 0; cnt -= optlen, cp += optlen) {
 		opt = cp[IPOPT_OPTVAL];
 		if (opt == IPOPT_EOL)
 			break;
 		if (opt == IPOPT_NOP)
 			optlen = 1;
 		else {
 			if (cnt < IPOPT_OLEN + sizeof(*cp))
 				goto bad;
 			optlen = cp[IPOPT_OLEN];
 			if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt)
 				goto bad;
 		}
 		switch (opt) {
 
 		default:
 			break;
 
 		case IPOPT_LSRR:
 		case IPOPT_SSRR:
 			/*
 			 * User process specifies route as:
 			 *
 			 *	->A->B->C->D
 			 *
 			 * D must be our final destination (but we can't
 			 * check that since we may not have connected yet).
 			 * A is first hop destination, which doesn't appear
 			 * in actual IP option, but is stored before the
 			 * options.
 			 */
 			/* XXX-BZ PRIV_NETINET_SETHDROPTS? */
 			if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr))
 				goto bad;
 			m->m_len -= sizeof(struct in_addr);
 			cnt -= sizeof(struct in_addr);
 			optlen -= sizeof(struct in_addr);
 			cp[IPOPT_OLEN] = optlen;
 			/*
 			 * Move first hop before start of options.
 			 */
 			bcopy((caddr_t)&cp[IPOPT_OFFSET+1], mtod(m, caddr_t),
 			    sizeof(struct in_addr));
 			/*
 			 * Then copy rest of options back
 			 * to close up the deleted entry.
 			 */
 			bcopy((&cp[IPOPT_OFFSET+1] + sizeof(struct in_addr)),
 			    &cp[IPOPT_OFFSET+1],
 			    (unsigned)cnt - (IPOPT_MINOFF - 1));
 			break;
 		}
 	}
 	if (m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr))
 		goto bad;
 	*pcbopt = m;
 	return (0);
 
 bad:
 	(void)m_free(m);
 	return (EINVAL);
 }
 
 /*
  * Check for the presence of the IP Router Alert option [RFC2113]
  * in the header of an IPv4 datagram.
  *
  * This call is not intended for use from the forwarding path; it is here
  * so that protocol domains may check for the presence of the option.
  * Given how FreeBSD's IPv4 stack is currently structured, the Router Alert
  * option does not have much relevance to the implementation, though this
  * may change in future.
  * Router alert options SHOULD be passed if running in IPSTEALTH mode and
  * we are not the endpoint.
  * Length checks on individual options should already have been peformed
  * by ip_dooptions() therefore they are folded under INVARIANTS here.
  *
  * Return zero if not present or options are invalid, non-zero if present.
  */
 int
 ip_checkrouteralert(struct mbuf *m)
 {
 	struct ip *ip = mtod(m, struct ip *);
 	u_char *cp;
 	int opt, optlen, cnt, found_ra;
 
 	found_ra = 0;
 	cp = (u_char *)(ip + 1);
 	cnt = (ip->ip_hl << 2) - sizeof (struct ip);
 	for (; cnt > 0; cnt -= optlen, cp += optlen) {
 		opt = cp[IPOPT_OPTVAL];
 		if (opt == IPOPT_EOL)
 			break;
 		if (opt == IPOPT_NOP)
 			optlen = 1;
 		else {
 #ifdef INVARIANTS
 			if (cnt < IPOPT_OLEN + sizeof(*cp))
 				break;
 #endif
 			optlen = cp[IPOPT_OLEN];
 #ifdef INVARIANTS
 			if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt)
 				break;
 #endif
 		}
 		switch (opt) {
 		case IPOPT_RA:
 #ifdef INVARIANTS
 			if (optlen != IPOPT_OFFSET + sizeof(uint16_t) ||
 			    (*((uint16_t *)&cp[IPOPT_OFFSET]) != 0))
 			    break;
 			else
 #endif
 			found_ra = 1;
 			break;
 		default:
 			break;
 		}
 	}
 
 	return (found_ra);
 }
Index: user/ae/inet6/sys/netinet/ip_output.c
===================================================================
--- user/ae/inet6/sys/netinet/ip_output.c	(revision 271452)
+++ user/ae/inet6/sys/netinet/ip_output.c	(revision 271453)
@@ -1,1416 +1,1420 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ip_output.c	8.3 (Berkeley) 1/21/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_ipfw.h"
 #include "opt_ipsec.h"
 #include "opt_mbuf_stress_test.h"
 #include "opt_mpath.h"
 #include "opt_route.h"
 #include "opt_sctp.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/sdt.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/ucred.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_llatbl.h>
 #include <net/netisr.h>
 #include <net/pfil.h>
 #include <net/route.h>
 #include <net/flowtable.h>
 #ifdef RADIX_MPATH
 #include <net/radix_mpath.h>
 #endif
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_rss.h>
 #include <netinet/in_var.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
 #ifdef SCTP
 #include <netinet/sctp.h>
 #include <netinet/sctp_crc32.h>
 #endif
 
 #ifdef IPSEC
 #include <netinet/ip_ipsec.h>
 #include <netipsec/ipsec.h>
 #endif /* IPSEC*/
 
 #include <machine/in_cksum.h>
 
 #include <security/mac/mac_framework.h>
 
 VNET_DEFINE(u_short, ip_id);
 
 #ifdef MBUF_STRESS_TEST
 static int mbuf_frag_size = 0;
 SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW,
 	&mbuf_frag_size, 0, "Fragment outgoing mbufs to this size");
 #endif
 
 static void	ip_mloopback
 	(struct ifnet *, struct mbuf *, struct sockaddr_in *, int);
 
 
 extern int in_mcast_loop;
 extern	struct protosw inetsw[];
 
 /*
  * IP output.  The packet in mbuf chain m contains a skeletal IP
  * header (with len, off, ttl, proto, tos, src, dst).
  * The mbuf chain containing the packet will be freed.
  * The mbuf opt, if present, will not be freed.
  * If route ro is present and has ro_rt initialized, route lookup would be
  * skipped and ro->ro_rt would be used. If ro is present but ro->ro_rt is NULL,
  * then result of route lookup is stored in ro->ro_rt.
  *
  * In the IP forwarding case, the packet will arrive with options already
  * inserted, so must have a NULL opt pointer.
  */
 int
 ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
     struct ip_moptions *imo, struct inpcb *inp)
 {
 	struct ip *ip;
 	struct ifnet *ifp = NULL;	/* keep compiler happy */
 	struct mbuf *m0;
 	int hlen = sizeof (struct ip);
 	int mtu;
 	int error = 0;
 	struct sockaddr_in *dst;
 	const struct sockaddr_in *gw;
 	struct in_ifaddr *ia;
 	int isbroadcast;
 	uint16_t ip_len, ip_off;
 	struct route iproute;
 	struct rtentry *rte;	/* cache for ro->ro_rt */
 	struct in_addr odst;
 	struct m_tag *fwd_tag = NULL;
 	int have_ia_ref;
 #ifdef IPSEC
 	int no_route_but_check_spd = 0;
 #endif
 	M_ASSERTPKTHDR(m);
 
 	if (inp != NULL) {
 		INP_LOCK_ASSERT(inp);
 		M_SETFIB(m, inp->inp_inc.inc_fibnum);
 		if (((flags & IP_NODEFAULTFLOWID) == 0) &&
 		    inp->inp_flags & (INP_HW_FLOWID|INP_SW_FLOWID)) {
 			m->m_pkthdr.flowid = inp->inp_flowid;
 			M_HASHTYPE_SET(m, inp->inp_flowtype);
 			m->m_flags |= M_FLOWID;
 		}
 	}
 
 	if (ro == NULL) {
 		ro = &iproute;
 		bzero(ro, sizeof (*ro));
 	}
 
 #ifdef FLOWTABLE
 	if (ro->ro_rt == NULL)
 		(void )flowtable_lookup(AF_INET, m, ro);
 #endif
 
 	if (opt) {
 		int len = 0;
 		m = ip_insertoptions(m, opt, &len);
 		if (len != 0)
 			hlen = len; /* ip->ip_hl is updated above */
 	}
 	ip = mtod(m, struct ip *);
 	ip_len = ntohs(ip->ip_len);
 	ip_off = ntohs(ip->ip_off);
 
 	/*
 	 * Fill in IP header.  If we are not allowing fragmentation,
 	 * then the ip_id field is meaningless, but we don't set it
 	 * to zero.  Doing so causes various problems when devices along
 	 * the path (routers, load balancers, firewalls, etc.) illegally
 	 * disable DF on our packet.  Note that a 16-bit counter
 	 * will wrap around in less than 10 seconds at 100 Mbit/s on a
 	 * medium with MTU 1500.  See Steven M. Bellovin, "A Technique
 	 * for Counting NATted Hosts", Proc. IMW'02, available at
 	 * <http://www.cs.columbia.edu/~smb/papers/fnat.pdf>.
 	 */
 	if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
 		ip->ip_v = IPVERSION;
 		ip->ip_hl = hlen >> 2;
 		ip->ip_id = ip_newid();
 		IPSTAT_INC(ips_localout);
 	} else {
 		/* Header already set, fetch hlen from there */
 		hlen = ip->ip_hl << 2;
 	}
 
 	/*
 	 * dst/gw handling:
 	 *
 	 * dst can be rewritten but always points to &ro->ro_dst.
 	 * gw is readonly but can point either to dst OR rt_gateway,
 	 * therefore we need restore gw if we're redoing lookup.
 	 */
 	gw = dst = (struct sockaddr_in *)&ro->ro_dst;
 again:
 	ia = NULL;
 	have_ia_ref = 0;
 	/*
 	 * If there is a cached route, check that it is to the same
 	 * destination and is still up.  If not, free it and try again.
 	 * The address family should also be checked in case of sharing
 	 * the cache with IPv6.
 	 */
 	rte = ro->ro_rt;
 	if (rte && ((rte->rt_flags & RTF_UP) == 0 ||
 		    rte->rt_ifp == NULL ||
 		    !RT_LINK_IS_UP(rte->rt_ifp) ||
 			  dst->sin_family != AF_INET ||
 			  dst->sin_addr.s_addr != ip->ip_dst.s_addr)) {
 		RO_RTFREE(ro);
 		ro->ro_lle = NULL;
 		rte = NULL;
 		gw = dst;
 	}
 	if (rte == NULL && fwd_tag == NULL) {
 		bzero(dst, sizeof(*dst));
 		dst->sin_family = AF_INET;
 		dst->sin_len = sizeof(*dst);
 		dst->sin_addr = ip->ip_dst;
 	}
 	/*
 	 * If routing to interface only, short circuit routing lookup.
 	 * The use of an all-ones broadcast address implies this; an
 	 * interface is specified by the broadcast address of an interface,
 	 * or the destination address of a ptp interface.
 	 */
 	if (flags & IP_SENDONES) {
-		if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst)))) == NULL &&
-		    (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL) {
+		if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst),
+						      RT_ALL_FIBS))) == NULL &&
+		    (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst),
+						    RT_ALL_FIBS))) == NULL) {
 			IPSTAT_INC(ips_noroute);
 			error = ENETUNREACH;
 			goto bad;
 		}
 		have_ia_ref = 1;
 		ip->ip_dst.s_addr = INADDR_BROADCAST;
 		dst->sin_addr = ip->ip_dst;
 		ifp = ia->ia_ifp;
 		ip->ip_ttl = 1;
 		isbroadcast = 1;
 	} else if (flags & IP_ROUTETOIF) {
-		if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL &&
-		    (ia = ifatoia(ifa_ifwithnet(sintosa(dst), 0))) == NULL) {
+		if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst),
+						    RT_ALL_FIBS))) == NULL &&
+		    (ia = ifatoia(ifa_ifwithnet(sintosa(dst), 0,
+						RT_ALL_FIBS))) == NULL) {
 			IPSTAT_INC(ips_noroute);
 			error = ENETUNREACH;
 			goto bad;
 		}
 		have_ia_ref = 1;
 		ifp = ia->ia_ifp;
 		ip->ip_ttl = 1;
 		isbroadcast = in_broadcast(dst->sin_addr, ifp);
 	} else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
 	    imo != NULL && imo->imo_multicast_ifp != NULL) {
 		/*
 		 * Bypass the normal routing lookup for multicast
 		 * packets if the interface is specified.
 		 */
 		ifp = imo->imo_multicast_ifp;
 		IFP_TO_IA(ifp, ia);
 		if (ia)
 			have_ia_ref = 1;
 		isbroadcast = 0;	/* fool gcc */
 	} else {
 		/*
 		 * We want to do any cloning requested by the link layer,
 		 * as this is probably required in all cases for correct
 		 * operation (as it is for ARP).
 		 */
 		if (rte == NULL) {
 #ifdef RADIX_MPATH
 			rtalloc_mpath_fib(ro,
 			    ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr),
 			    inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m));
 #else
 			in_rtalloc_ign(ro, 0,
 			    inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m));
 #endif
 			rte = ro->ro_rt;
 		}
 		if (rte == NULL ||
 		    rte->rt_ifp == NULL ||
 		    !RT_LINK_IS_UP(rte->rt_ifp)) {
 #ifdef IPSEC
 			/*
 			 * There is no route for this packet, but it is
 			 * possible that a matching SPD entry exists.
 			 */
 			no_route_but_check_spd = 1;
 			mtu = 0; /* Silence GCC warning. */
 			goto sendit;
 #endif
 			IPSTAT_INC(ips_noroute);
 			error = EHOSTUNREACH;
 			goto bad;
 		}
 		ia = ifatoia(rte->rt_ifa);
 		ifp = rte->rt_ifp;
 		counter_u64_add(rte->rt_pksent, 1);
 		if (rte->rt_flags & RTF_GATEWAY)
 			gw = (struct sockaddr_in *)rte->rt_gateway;
 		if (rte->rt_flags & RTF_HOST)
 			isbroadcast = (rte->rt_flags & RTF_BROADCAST);
 		else
 			isbroadcast = in_broadcast(gw->sin_addr, ifp);
 	}
 	/*
 	 * Calculate MTU.  If we have a route that is up, use that,
 	 * otherwise use the interface's MTU.
 	 */
 	if (rte != NULL && (rte->rt_flags & (RTF_UP|RTF_HOST))) {
 		/*
 		 * This case can happen if the user changed the MTU
 		 * of an interface after enabling IP on it.  Because
 		 * most netifs don't keep track of routes pointing to
 		 * them, there is no way for one to update all its
 		 * routes when the MTU is changed.
 		 */
 		if (rte->rt_mtu > ifp->if_mtu)
 			rte->rt_mtu = ifp->if_mtu;
 		mtu = rte->rt_mtu;
 	} else {
 		mtu = ifp->if_mtu;
 	}
 	/* Catch a possible divide by zero later. */
 	KASSERT(mtu > 0, ("%s: mtu %d <= 0, rte=%p (rt_flags=0x%08x) ifp=%p",
 	    __func__, mtu, rte, (rte != NULL) ? rte->rt_flags : 0, ifp));
 	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
 		m->m_flags |= M_MCAST;
 		/*
 		 * IP destination address is multicast.  Make sure "gw"
 		 * still points to the address in "ro".  (It may have been
 		 * changed to point to a gateway address, above.)
 		 */
 		gw = dst;
 		/*
 		 * See if the caller provided any multicast options
 		 */
 		if (imo != NULL) {
 			ip->ip_ttl = imo->imo_multicast_ttl;
 			if (imo->imo_multicast_vif != -1)
 				ip->ip_src.s_addr =
 				    ip_mcast_src ?
 				    ip_mcast_src(imo->imo_multicast_vif) :
 				    INADDR_ANY;
 		} else
 			ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
 		/*
 		 * Confirm that the outgoing interface supports multicast.
 		 */
 		if ((imo == NULL) || (imo->imo_multicast_vif == -1)) {
 			if ((ifp->if_flags & IFF_MULTICAST) == 0) {
 				IPSTAT_INC(ips_noroute);
 				error = ENETUNREACH;
 				goto bad;
 			}
 		}
 		/*
 		 * If source address not specified yet, use address
 		 * of outgoing interface.
 		 */
 		if (ip->ip_src.s_addr == INADDR_ANY) {
 			/* Interface may have no addresses. */
 			if (ia != NULL)
 				ip->ip_src = IA_SIN(ia)->sin_addr;
 		}
 
 		if ((imo == NULL && in_mcast_loop) ||
 		    (imo && imo->imo_multicast_loop)) {
 			/*
 			 * Loop back multicast datagram if not expressly
 			 * forbidden to do so, even if we are not a member
 			 * of the group; ip_input() will filter it later,
 			 * thus deferring a hash lookup and mutex acquisition
 			 * at the expense of a cheap copy using m_copym().
 			 */
 			ip_mloopback(ifp, m, dst, hlen);
 		} else {
 			/*
 			 * If we are acting as a multicast router, perform
 			 * multicast forwarding as if the packet had just
 			 * arrived on the interface to which we are about
 			 * to send.  The multicast forwarding function
 			 * recursively calls this function, using the
 			 * IP_FORWARDING flag to prevent infinite recursion.
 			 *
 			 * Multicasts that are looped back by ip_mloopback(),
 			 * above, will be forwarded by the ip_input() routine,
 			 * if necessary.
 			 */
 			if (V_ip_mrouter && (flags & IP_FORWARDING) == 0) {
 				/*
 				 * If rsvp daemon is not running, do not
 				 * set ip_moptions. This ensures that the packet
 				 * is multicast and not just sent down one link
 				 * as prescribed by rsvpd.
 				 */
 				if (!V_rsvp_on)
 					imo = NULL;
 				if (ip_mforward &&
 				    ip_mforward(ip, ifp, m, imo) != 0) {
 					m_freem(m);
 					goto done;
 				}
 			}
 		}
 
 		/*
 		 * Multicasts with a time-to-live of zero may be looped-
 		 * back, above, but must not be transmitted on a network.
 		 * Also, multicasts addressed to the loopback interface
 		 * are not sent -- the above call to ip_mloopback() will
 		 * loop back a copy. ip_input() will drop the copy if
 		 * this host does not belong to the destination group on
 		 * the loopback interface.
 		 */
 		if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
 			m_freem(m);
 			goto done;
 		}
 
 		goto sendit;
 	}
 
 	/*
 	 * If the source address is not specified yet, use the address
 	 * of the outgoing interface.
 	 */
 	if (ip->ip_src.s_addr == INADDR_ANY) {
 		/* Interface may have no addresses. */
 		if (ia != NULL) {
 			ip->ip_src = IA_SIN(ia)->sin_addr;
 		}
 	}
 
 	/*
 	 * In the SMP, pre-emption and if_transmit() world,
 	 * the following code doesn't really function as intended any longer.
 	 *
 	 * + There can and will be multiple CPUs running this code path
 	 *   in parallel, and we do no lock holding when checking the
 	 *   queue depth;
 	 * + And since other threads can be running concurrently, even if
 	 *   we do pass this check, another thread may queue some frames
 	 *   before this thread does and it will end up partially or fully
 	 *   failing to send anyway;
 	 * + if_transmit() based drivers don't necessarily set ifq_len
 	 *   at all.
 	 *
 	 * This should be replaced with a method of pushing an entire list
 	 * of fragment frames to the driver and have the driver decide
 	 * whether it can queue or not queue the entire set.
 	 */
 #if 0
 	/*
 	 * Verify that we have any chance at all of being able to queue the
 	 * packet or packet fragments, unless ALTQ is enabled on the given
 	 * interface in which case packetdrop should be done by queueing.
 	 */
 	n = ip_len / mtu + 1; /* how many fragments ? */
 	if (
 #ifdef ALTQ
 	    (!ALTQ_IS_ENABLED(&ifp->if_snd)) &&
 #endif /* ALTQ */
 	    (ifp->if_snd.ifq_len + n) >= ifp->if_snd.ifq_maxlen ) {
 		error = ENOBUFS;
 		IPSTAT_INC(ips_odropped);
 		ifp->if_snd.ifq_drops += n;
 		goto bad;
 	}
 #endif
 
 	/*
 	 * Look for broadcast address and
 	 * verify user is allowed to send
 	 * such a packet.
 	 */
 	if (isbroadcast) {
 		if ((ifp->if_flags & IFF_BROADCAST) == 0) {
 			error = EADDRNOTAVAIL;
 			goto bad;
 		}
 		if ((flags & IP_ALLOWBROADCAST) == 0) {
 			error = EACCES;
 			goto bad;
 		}
 		/* don't allow broadcast messages to be fragmented */
 		if (ip_len > mtu) {
 			error = EMSGSIZE;
 			goto bad;
 		}
 		m->m_flags |= M_BCAST;
 	} else {
 		m->m_flags &= ~M_BCAST;
 	}
 
 sendit:
 #ifdef IPSEC
 	switch(ip_ipsec_output(&m, inp, &flags, &error)) {
 	case 1:
 		goto bad;
 	case -1:
 		goto done;
 	case 0:
 	default:
 		break;	/* Continue with packet processing. */
 	}
 	/*
 	 * Check if there was a route for this packet; return error if not.
 	 */
 	if (no_route_but_check_spd) {
 		IPSTAT_INC(ips_noroute);
 		error = EHOSTUNREACH;
 		goto bad;
 	}
 	/* Update variables that are affected by ipsec4_output(). */
 	ip = mtod(m, struct ip *);
 	hlen = ip->ip_hl << 2;
 #endif /* IPSEC */
 
 	/* Jump over all PFIL processing if hooks are not active. */
 	if (!PFIL_HOOKED(&V_inet_pfil_hook))
 		goto passout;
 
 	/* Run through list of hooks for output packets. */
 	odst.s_addr = ip->ip_dst.s_addr;
 	error = pfil_run_hooks(&V_inet_pfil_hook, &m, ifp, PFIL_OUT, inp);
 	if (error != 0 || m == NULL)
 		goto done;
 
 	ip = mtod(m, struct ip *);
 
 	/* See if destination IP address was changed by packet filter. */
 	if (odst.s_addr != ip->ip_dst.s_addr) {
 		m->m_flags |= M_SKIP_FIREWALL;
 		/* If destination is now ourself drop to ip_input(). */
 		if (in_localip(ip->ip_dst)) {
 			m->m_flags |= M_FASTFWD_OURS;
 			if (m->m_pkthdr.rcvif == NULL)
 				m->m_pkthdr.rcvif = V_loif;
 			if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
 				m->m_pkthdr.csum_flags |=
 				    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
 				m->m_pkthdr.csum_data = 0xffff;
 			}
 			m->m_pkthdr.csum_flags |=
 			    CSUM_IP_CHECKED | CSUM_IP_VALID;
 #ifdef SCTP
 			if (m->m_pkthdr.csum_flags & CSUM_SCTP)
 				m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
 #endif
 			error = netisr_queue(NETISR_IP, m);
 			goto done;
 		} else {
 			if (have_ia_ref)
 				ifa_free(&ia->ia_ifa);
 			goto again;	/* Redo the routing table lookup. */
 		}
 	}
 
 	/* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */
 	if (m->m_flags & M_FASTFWD_OURS) {
 		if (m->m_pkthdr.rcvif == NULL)
 			m->m_pkthdr.rcvif = V_loif;
 		if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
 			m->m_pkthdr.csum_flags |=
 			    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
 			m->m_pkthdr.csum_data = 0xffff;
 		}
 #ifdef SCTP
 		if (m->m_pkthdr.csum_flags & CSUM_SCTP)
 			m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
 #endif
 		m->m_pkthdr.csum_flags |=
 			    CSUM_IP_CHECKED | CSUM_IP_VALID;
 
 		error = netisr_queue(NETISR_IP, m);
 		goto done;
 	}
 	/* Or forward to some other address? */
 	if ((m->m_flags & M_IP_NEXTHOP) &&
 	    (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) {
 		bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in));
 		m->m_flags |= M_SKIP_FIREWALL;
 		m->m_flags &= ~M_IP_NEXTHOP;
 		m_tag_delete(m, fwd_tag);
 		if (have_ia_ref)
 			ifa_free(&ia->ia_ifa);
 		goto again;
 	}
 
 passout:
 	/* 127/8 must not appear on wire - RFC1122. */
 	if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
 	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
 		if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
 			IPSTAT_INC(ips_badaddr);
 			error = EADDRNOTAVAIL;
 			goto bad;
 		}
 	}
 
 	m->m_pkthdr.csum_flags |= CSUM_IP;
 	if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) {
 		in_delayed_cksum(m);
 		m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
 	}
 #ifdef SCTP
 	if (m->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) {
 		sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2));
 		m->m_pkthdr.csum_flags &= ~CSUM_SCTP;
 	}
 #endif
 
 	/*
 	 * If small enough for interface, or the interface will take
 	 * care of the fragmentation for us, we can just send directly.
 	 */
 	if (ip_len <= mtu ||
 	    (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0) {
 		ip->ip_sum = 0;
 		if (m->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) {
 			ip->ip_sum = in_cksum(m, hlen);
 			m->m_pkthdr.csum_flags &= ~CSUM_IP;
 		}
 
 		/*
 		 * Record statistics for this interface address.
 		 * With CSUM_TSO the byte/packet count will be slightly
 		 * incorrect because we count the IP+TCP headers only
 		 * once instead of for every generated packet.
 		 */
 		if (!(flags & IP_FORWARDING) && ia) {
 			if (m->m_pkthdr.csum_flags & CSUM_TSO)
 				counter_u64_add(ia->ia_ifa.ifa_opackets,
 				    m->m_pkthdr.len / m->m_pkthdr.tso_segsz);
 			else
 				counter_u64_add(ia->ia_ifa.ifa_opackets, 1);
 
 			counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len);
 		}
 #ifdef MBUF_STRESS_TEST
 		if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size)
 			m = m_fragment(m, M_NOWAIT, mbuf_frag_size);
 #endif
 		/*
 		 * Reset layer specific mbuf flags
 		 * to avoid confusing lower layers.
 		 */
 		m_clrprotoflags(m);
 		IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL);
 		error = (*ifp->if_output)(ifp, m,
 		    (const struct sockaddr *)gw, ro);
 		goto done;
 	}
 
 	/* Balk when DF bit is set or the interface didn't support TSO. */
 	if ((ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) {
 		error = EMSGSIZE;
 		IPSTAT_INC(ips_cantfrag);
 		goto bad;
 	}
 
 	/*
 	 * Too large for interface; fragment if possible. If successful,
 	 * on return, m will point to a list of packets to be sent.
 	 */
 	error = ip_fragment(ip, &m, mtu, ifp->if_hwassist);
 	if (error)
 		goto bad;
 	for (; m; m = m0) {
 		m0 = m->m_nextpkt;
 		m->m_nextpkt = 0;
 		if (error == 0) {
 			/* Record statistics for this interface address. */
 			if (ia != NULL) {
 				counter_u64_add(ia->ia_ifa.ifa_opackets, 1);
 				counter_u64_add(ia->ia_ifa.ifa_obytes,
 				    m->m_pkthdr.len);
 			}
 			/*
 			 * Reset layer specific mbuf flags
 			 * to avoid confusing upper layers.
 			 */
 			m_clrprotoflags(m);
 
 			IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL);
 			error = (*ifp->if_output)(ifp, m,
 			    (const struct sockaddr *)gw, ro);
 		} else
 			m_freem(m);
 	}
 
 	if (error == 0)
 		IPSTAT_INC(ips_fragmented);
 
 done:
 	if (ro == &iproute)
 		RO_RTFREE(ro);
 	if (have_ia_ref)
 		ifa_free(&ia->ia_ifa);
 	return (error);
 bad:
 	m_freem(m);
 	goto done;
 }
 
 /*
  * Create a chain of fragments which fit the given mtu. m_frag points to the
  * mbuf to be fragmented; on return it points to the chain with the fragments.
  * Return 0 if no error. If error, m_frag may contain a partially built
  * chain of fragments that should be freed by the caller.
  *
  * ip points at the IP header of the packet; it is rewritten in place and
  * becomes the header of the first fragment.  The remaining fragments are
  * freshly allocated mbufs linked through m_nextpkt, starting at the
  * original mbuf.
  *
  * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist)
  * and is only used to decide whether the IP header checksum of each fragment
  * has to be computed in software.
  *
  * Errors returned:
  *	EMSGSIZE  the DF bit is set, or the mtu leaves room for fewer than
  *		  8 payload bytes per fragment.
  *	ENOBUFS	  an mbuf allocation or payload copy failed.
  */
 int
 ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
     u_long if_hwassist_flags)
 {
 	int error = 0;
 	int hlen = ip->ip_hl << 2;	/* header length in bytes */
 	int len = (mtu - hlen) & ~7;	/* size of payload in each fragment */
 	int off;			/* byte offset into the original packet */
 	struct mbuf *m0 = *m_frag;	/* the original packet		*/
 	int firstlen;			/* payload bytes kept in the first fragment */
 	struct mbuf **mnext;		/* where to link the next fragment */
 	int nfrags;
 	uint16_t ip_len, ip_off;	/* host byte order copies */
 
 	ip_len = ntohs(ip->ip_len);
 	ip_off = ntohs(ip->ip_off);
 
 	if (ip_off & IP_DF) {	/* Fragmentation not allowed */
 		IPSTAT_INC(ips_cantfrag);
 		return EMSGSIZE;
 	}
 
 	/*
 	 * Must be able to put at least 8 bytes per fragment.
 	 */
 	if (len < 8)
 		return EMSGSIZE;
 
 	/*
 	 * If the interface will not calculate checksums on
 	 * fragmented packets, then do it here.
 	 *
 	 * Note that if_hwassist_flags is not consulted for this decision:
 	 * any pending delayed data checksum is always completed in software
 	 * before the packet is split.
 	 */
 	if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
 		in_delayed_cksum(m0);
 		m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
 	}
 #ifdef SCTP
 	if (m0->m_pkthdr.csum_flags & CSUM_SCTP) {
 		sctp_delayed_cksum(m0, hlen);
 		m0->m_pkthdr.csum_flags &= ~CSUM_SCTP;
 	}
 #endif
 	if (len > PAGE_SIZE) {
 		/*
 		 * Fragment large datagrams such that each segment
 		 * contains a multiple of PAGE_SIZE amount of data,
 		 * plus headers. This enables a receiver to perform
 		 * page-flipping zero-copy optimizations.
 		 *
 		 * XXX When does this help given that sender and receiver
 		 * could have different page sizes, and also mtu could
 		 * be less than the receiver's page size ?
 		 */
 		int newlen;
 		struct mbuf *m;
 
 		/*
 		 * Sum the lengths of the leading mbufs for as long as the
 		 * running total still fits in one mtu; the first fragment
 		 * is cut on that mbuf boundary.
 		 */
 		for (m = m0, off = 0; m && (off+m->m_len) <= mtu; m = m->m_next)
 			off += m->m_len;
 
 		/*
 		 * firstlen (off - hlen) must be aligned on an
 		 * 8-byte boundary
 		 */
 		if (off < hlen)
 			goto smart_frag_failure;
 		off = ((off - hlen) & ~7) + hlen;
 		/* Payload size for the remaining fragments. */
 		newlen = (~PAGE_MASK) & mtu;
 		if ((newlen + sizeof (struct ip)) > mtu) {
 			/* we failed, go back the default */
 smart_frag_failure:
 			newlen = len;
 			off = hlen + len;
 		}
 		len = newlen;
 
 	} else {
 		off = hlen + len;
 	}
 
 	firstlen = off - hlen;
 	mnext = &m0->m_nextpkt;		/* pointer to next packet */
 
 	/*
 	 * Loop through length of segment after first fragment,
 	 * make new header and copy data of each part and link onto chain.
 	 * Here, m0 is the original packet, m is the fragment being created.
 	 * The fragments are linked off the m_nextpkt of the original
 	 * packet, which after processing serves as the first fragment.
 	 *
 	 * nfrags starts at 1 to account for that first fragment.
 	 */
 	for (nfrags = 1; off < ip_len; off += len, nfrags++) {
 		struct ip *mhip;	/* ip header on the fragment */
 		struct mbuf *m;
 		int mhlen = sizeof (struct ip);
 
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (m == NULL) {
 			error = ENOBUFS;
 			IPSTAT_INC(ips_odropped);
 			goto done;
 		}
 		m->m_flags |= (m0->m_flags & M_MCAST);
 		/*
 		 * In the first mbuf, leave room for the link header, then
 		 * copy the original IP header including options. The payload
 		 * goes into an additional mbuf chain returned by m_copym().
 		 */
 		m->m_data += max_linkhdr;
 		mhip = mtod(m, struct ip *);
 		*mhip = *ip;
 		if (hlen > sizeof (struct ip)) {
 			/*
 			 * The original header carries options; the fragment's
 			 * header length is taken from what ip_optcopy()
 			 * reports it copied.
 			 */
 			mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
 			mhip->ip_v = IPVERSION;
 			mhip->ip_hl = mhlen >> 2;
 		}
 		m->m_len = mhlen;
 		/* XXX do we need to add ip_off below ? */
 		/* Kept in host byte order until the htons() further down. */
 		mhip->ip_off = ((off - hlen) >> 3) + ip_off;
 		if (off + len >= ip_len)
 			len = ip_len - off;	/* last fragment: take the rest */
 		else
 			mhip->ip_off |= IP_MF;
 		mhip->ip_len = htons((u_short)(len + mhlen));
 		m->m_next = m_copym(m0, off, len, M_NOWAIT);
 		if (m->m_next == NULL) {	/* copy failed */
 			/* m is not linked into the chain yet; free it here. */
 			m_free(m);
 			error = ENOBUFS;	/* ??? */
 			IPSTAT_INC(ips_odropped);
 			goto done;
 		}
 		m->m_pkthdr.len = mhlen + len;
 		m->m_pkthdr.rcvif = NULL;
 #ifdef MAC
 		mac_netinet_fragment(m0, m);
 #endif
 		m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;
 		mhip->ip_off = htons(mhip->ip_off);
 		mhip->ip_sum = 0;
 		/* Checksum the header in software unless the hw offers CSUM_IP. */
 		if (m->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) {
 			mhip->ip_sum = in_cksum(m, mhlen);
 			m->m_pkthdr.csum_flags &= ~CSUM_IP;
 		}
 		*mnext = m;
 		mnext = &m->m_nextpkt;
 	}
 	/* Only reached when every fragment was built successfully. */
 	IPSTAT_ADD(ips_ofragments, nfrags);
 
 	/*
 	 * Update first fragment by trimming what's been copied out
 	 * and updating header.
 	 *
 	 * The length handed to m_adj() is hlen + firstlen - ip_len.
 	 * NOTE(review): this is presumably negative here so that m_adj()
 	 * trims from the tail of the chain -- confirm against m_adj(9).
 	 */
 	m_adj(m0, hlen + firstlen - ip_len);
 	m0->m_pkthdr.len = hlen + firstlen;
 	ip->ip_len = htons((u_short)m0->m_pkthdr.len);
 	ip->ip_off = htons(ip_off | IP_MF);
 	ip->ip_sum = 0;
 	if (m0->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) {
 		ip->ip_sum = in_cksum(m0, hlen);
 		m0->m_pkthdr.csum_flags &= ~CSUM_IP;
 	}
 
 done:
 	/*
 	 * Also reached on allocation failure, in which case m0 still heads
 	 * whatever fragments were linked before the error.
 	 */
 	*m_frag = m0;
 	return error;
 }
 
 /*
  * Finish a checksum that was deferred on an outgoing IPv4 packet.
  *
  * The checksum is computed over the packet from the end of the IP header
  * up to ip_len and stored at the position given by the IP header length
  * plus m_pkthdr.csum_data.  For packets flagged CSUM_UDP a computed value
  * of zero is stored as 0xffff.  The two bytes receiving the checksum must
  * live in a single mbuf of the chain.
  */
 void
 in_delayed_cksum(struct mbuf *m)
 {
 	struct ip *iph = mtod(m, struct ip *);
 	uint16_t sum, where, pktlen;
 
 	where = iph->ip_hl << 2;
 	pktlen = ntohs(iph->ip_len);
 	sum = in_cksum_skip(m, pktlen, where);
 	if ((m->m_pkthdr.csum_flags & CSUM_UDP) != 0 && sum == 0)
 		sum = 0xffff;
 
 	/* From here on "where" is the offset of the checksum field itself. */
 	where += m->m_pkthdr.csum_data;
 
 	/* Walk the chain until "where" falls inside the current mbuf. */
 	for (; m != NULL && where >= m->m_len; m = m->m_next)
 		where -= m->m_len;
 
 	KASSERT(m != NULL, ("in_delayed_cksum: checksum outside mbuf chain."));
 	KASSERT(where + sizeof(u_short) <= m->m_len, ("in_delayed_cksum: checksum split between mbufs."));
 	*(u_short *)(m->m_data + where) = sum;
 }
 
 /*
  * IP socket option processing.
  *
  * Handles setsockopt()/getsockopt() style requests described by sopt for
  * the inpcb attached to so.
  *
  * Requests whose level is not IPPROTO_IP are rejected with EINVAL, except
  * for SOL_SOCKET sets of SO_REUSEADDR, SO_REUSEPORT and SO_SETFIB, which
  * copy the corresponding socket state into the inpcb and return 0.
  *
  * For IPPROTO_IP the request is dispatched on direction (SOPT_SET or
  * SOPT_GET) and option name.  Multicast options are handed to
  * inp_setmoptions()/inp_getmoptions(); unknown options yield ENOPROTOOPT.
  *
  * Returns 0 on success or an errno value.
  */
 int
 ip_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	struct	inpcb *inp = sotoinpcb(so);
 	int	error, optval;
 #ifdef	RSS
 	uint32_t rss_bucket;
 	int retval;
 #endif
 
 	error = optval = 0;
 	if (sopt->sopt_level != IPPROTO_IP) {
 		error = EINVAL;
 
 		/*
 		 * Mirror selected socket-level settings into the inpcb;
 		 * the values are read back from the socket itself.
 		 */
 		if (sopt->sopt_level == SOL_SOCKET &&
 		    sopt->sopt_dir == SOPT_SET) {
 			switch (sopt->sopt_name) {
 			case SO_REUSEADDR:
 				INP_WLOCK(inp);
 				if ((so->so_options & SO_REUSEADDR) != 0)
 					inp->inp_flags2 |= INP_REUSEADDR;
 				else
 					inp->inp_flags2 &= ~INP_REUSEADDR;
 				INP_WUNLOCK(inp);
 				error = 0;
 				break;
 			case SO_REUSEPORT:
 				INP_WLOCK(inp);
 				if ((so->so_options & SO_REUSEPORT) != 0)
 					inp->inp_flags2 |= INP_REUSEPORT;
 				else
 					inp->inp_flags2 &= ~INP_REUSEPORT;
 				INP_WUNLOCK(inp);
 				error = 0;
 				break;
 			case SO_SETFIB:
 				INP_WLOCK(inp);
 				inp->inp_inc.inc_fibnum = so->so_fibnum;
 				INP_WUNLOCK(inp);
 				error = 0;
 				break;
 			default:
 				break;
 			}
 		}
 		return (error);
 	}
 
 	switch (sopt->sopt_dir) {
 	case SOPT_SET:
 		switch (sopt->sopt_name) {
 		case IP_OPTIONS:
 #ifdef notyet
 		case IP_RETOPTS:
 #endif
 		{
 			struct mbuf *m;
 			/* The option bytes are staged in one plain mbuf. */
 			if (sopt->sopt_valsize > MLEN) {
 				error = EMSGSIZE;
 				break;
 			}
 			/* Only sleep for memory when a thread is attached. */
 			m = m_get(sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
 			if (m == NULL) {
 				error = ENOBUFS;
 				break;
 			}
 			m->m_len = sopt->sopt_valsize;
 			error = sooptcopyin(sopt, mtod(m, char *), m->m_len,
 					    m->m_len);
 			if (error) {
 				m_free(m);
 				break;
 			}
 			/*
 			 * NOTE(review): m is not freed on this path;
 			 * presumably ip_pcbopts() takes ownership of it --
 			 * confirm.
 			 */
 			INP_WLOCK(inp);
 			error = ip_pcbopts(inp, sopt->sopt_name, m);
 			INP_WUNLOCK(inp);
 			return (error);
 		}
 
 		case IP_BINDANY:
 			/* Privilege is checked only when a thread is attached. */
 			if (sopt->sopt_td != NULL) {
 				error = priv_check(sopt->sopt_td,
 				    PRIV_NETINET_BINDANY);
 				if (error)
 					break;
 			}
 			/* FALLTHROUGH */
 		case IP_BINDMULTI:
 #ifdef	RSS
 		case IP_RSS_LISTEN_BUCKET:
 #endif
 		case IP_TOS:
 		case IP_TTL:
 		case IP_MINTTL:
 		case IP_RECVOPTS:
 		case IP_RECVRETOPTS:
 		case IP_RECVDSTADDR:
 		case IP_RECVTTL:
 		case IP_RECVIF:
 		case IP_FAITH:
 		case IP_ONESBCAST:
 		case IP_DONTFRAG:
 		case IP_RECVTOS:
 		case IP_RECVFLOWID:
 #ifdef	RSS
 		case IP_RECVRSSBUCKETID:
 #endif
 			/* All of these take a single int argument. */
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 					    sizeof optval);
 			if (error)
 				break;
 
 			/*
 			 * NOTE(review): IP_TOS, IP_TTL and IP_MINTTL are
 			 * stored without taking INP_WLOCK, unlike the flag
 			 * options handled through OPTSET()/OPTSET2() --
 			 * confirm that this is intended.
 			 */
 			switch (sopt->sopt_name) {
 			case IP_TOS:
 				inp->inp_ip_tos = optval;
 				break;
 
 			case IP_TTL:
 				inp->inp_ip_ttl = optval;
 				break;
 
 			case IP_MINTTL:
 				if (optval >= 0 && optval <= MAXTTL)
 					inp->inp_ip_minttl = optval;
 				else
 					error = EINVAL;
 				break;
 
 /* Set or clear "bit" in inp_flags according to optval, under INP_WLOCK. */
 #define	OPTSET(bit) do {						\
 	INP_WLOCK(inp);							\
 	if (optval)							\
 		inp->inp_flags |= bit;					\
 	else								\
 		inp->inp_flags &= ~bit;					\
 	INP_WUNLOCK(inp);						\
 } while (0)
 
 /* Set or clear "bit" in inp_flags2 according to val, under INP_WLOCK. */
 #define	OPTSET2(bit, val) do {						\
 	INP_WLOCK(inp);							\
 	if (val)							\
 		inp->inp_flags2 |= bit;					\
 	else								\
 		inp->inp_flags2 &= ~bit;				\
 	INP_WUNLOCK(inp);						\
 } while (0)
 
 			case IP_RECVOPTS:
 				OPTSET(INP_RECVOPTS);
 				break;
 
 			case IP_RECVRETOPTS:
 				OPTSET(INP_RECVRETOPTS);
 				break;
 
 			case IP_RECVDSTADDR:
 				OPTSET(INP_RECVDSTADDR);
 				break;
 
 			case IP_RECVTTL:
 				OPTSET(INP_RECVTTL);
 				break;
 
 			case IP_RECVIF:
 				OPTSET(INP_RECVIF);
 				break;
 
 			case IP_FAITH:
 				OPTSET(INP_FAITH);
 				break;
 
 			case IP_ONESBCAST:
 				OPTSET(INP_ONESBCAST);
 				break;
 			case IP_DONTFRAG:
 				OPTSET(INP_DONTFRAG);
 				break;
 			case IP_BINDANY:
 				OPTSET(INP_BINDANY);
 				break;
 			case IP_RECVTOS:
 				OPTSET(INP_RECVTOS);
 				break;
 			case IP_BINDMULTI:
 				OPTSET2(INP_BINDMULTI, optval);
 				break;
 			case IP_RECVFLOWID:
 				OPTSET2(INP_RECVFLOWID, optval);
 				break;
 #ifdef	RSS
 			case IP_RSS_LISTEN_BUCKET:
 				if ((optval >= 0) &&
 				    (optval < rss_getnumbuckets())) {
 					inp->inp_rss_listen_bucket = optval;
 					OPTSET2(INP_RSS_BUCKET_SET, 1);
 				} else {
 					error = EINVAL;
 				}
 				break;
 			case IP_RECVRSSBUCKETID:
 				OPTSET2(INP_RECVRSSBUCKETID, optval);
 				break;
 #endif
 			}
 			break;
 #undef OPTSET
 #undef OPTSET2
 
 		/*
 		 * Multicast socket options are processed by the in_mcast
 		 * module.
 		 */
 		case IP_MULTICAST_IF:
 		case IP_MULTICAST_VIF:
 		case IP_MULTICAST_TTL:
 		case IP_MULTICAST_LOOP:
 		case IP_ADD_MEMBERSHIP:
 		case IP_DROP_MEMBERSHIP:
 		case IP_ADD_SOURCE_MEMBERSHIP:
 		case IP_DROP_SOURCE_MEMBERSHIP:
 		case IP_BLOCK_SOURCE:
 		case IP_UNBLOCK_SOURCE:
 		case IP_MSFILTER:
 		case MCAST_JOIN_GROUP:
 		case MCAST_LEAVE_GROUP:
 		case MCAST_JOIN_SOURCE_GROUP:
 		case MCAST_LEAVE_SOURCE_GROUP:
 		case MCAST_BLOCK_SOURCE:
 		case MCAST_UNBLOCK_SOURCE:
 			error = inp_setmoptions(inp, sopt);
 			break;
 
 		case IP_PORTRANGE:
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 					    sizeof optval);
 			if (error)
 				break;
 
 			/* INP_LOWPORT and INP_HIGHPORT are mutually exclusive. */
 			INP_WLOCK(inp);
 			switch (optval) {
 			case IP_PORTRANGE_DEFAULT:
 				inp->inp_flags &= ~(INP_LOWPORT);
 				inp->inp_flags &= ~(INP_HIGHPORT);
 				break;
 
 			case IP_PORTRANGE_HIGH:
 				inp->inp_flags &= ~(INP_LOWPORT);
 				inp->inp_flags |= INP_HIGHPORT;
 				break;
 
 			case IP_PORTRANGE_LOW:
 				inp->inp_flags &= ~(INP_HIGHPORT);
 				inp->inp_flags |= INP_LOWPORT;
 				break;
 
 			default:
 				error = EINVAL;
 				break;
 			}
 			INP_WUNLOCK(inp);
 			break;
 
 #ifdef IPSEC
 		case IP_IPSEC_POLICY:
 		{
 			caddr_t req;
 			struct mbuf *m;
 
 			/*
 			 * NOTE(review): m is not freed here when either
 			 * helper fails; presumably the helpers release it
 			 * themselves on error -- confirm.
 			 */
 			if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */
 				break;
 			if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */
 				break;
 			req = mtod(m, caddr_t);
 			error = ipsec_set_policy(inp, sopt->sopt_name, req,
 			    m->m_len, (sopt->sopt_td != NULL) ?
 			    sopt->sopt_td->td_ucred : NULL);
 			m_freem(m);
 			break;
 		}
 #endif /* IPSEC */
 
 		default:
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 
 	case SOPT_GET:
 		switch (sopt->sopt_name) {
 		case IP_OPTIONS:
 		case IP_RETOPTS:
 			/* No stored options is reported as a zero-length value. */
 			if (inp->inp_options)
 				error = sooptcopyout(sopt,
 						     mtod(inp->inp_options,
 							  char *),
 						     inp->inp_options->m_len);
 			else
 				sopt->sopt_valsize = 0;
 			break;
 
 		case IP_TOS:
 		case IP_TTL:
 		case IP_MINTTL:
 		case IP_RECVOPTS:
 		case IP_RECVRETOPTS:
 		case IP_RECVDSTADDR:
 		case IP_RECVTTL:
 		case IP_RECVIF:
 		case IP_PORTRANGE:
 		case IP_FAITH:
 		case IP_ONESBCAST:
 		case IP_DONTFRAG:
 		case IP_BINDANY:
 		case IP_RECVTOS:
 		case IP_BINDMULTI:
 		case IP_FLOWID:
 		case IP_FLOWTYPE:
 		case IP_RECVFLOWID:
 #ifdef	RSS
 		case IP_RSSBUCKETID:
 		case IP_RECVRSSBUCKETID:
 #endif
 			/* All of these are returned as a single int. */
 			switch (sopt->sopt_name) {
 
 			case IP_TOS:
 				optval = inp->inp_ip_tos;
 				break;
 
 			case IP_TTL:
 				optval = inp->inp_ip_ttl;
 				break;
 
 			case IP_MINTTL:
 				optval = inp->inp_ip_minttl;
 				break;
 
 /* Report a flag from inp_flags / inp_flags2 as 0 or 1. */
 #define	OPTBIT(bit)	(inp->inp_flags & bit ? 1 : 0)
 #define	OPTBIT2(bit)	(inp->inp_flags2 & bit ? 1 : 0)
 
 			case IP_RECVOPTS:
 				optval = OPTBIT(INP_RECVOPTS);
 				break;
 
 			case IP_RECVRETOPTS:
 				optval = OPTBIT(INP_RECVRETOPTS);
 				break;
 
 			case IP_RECVDSTADDR:
 				optval = OPTBIT(INP_RECVDSTADDR);
 				break;
 
 			case IP_RECVTTL:
 				optval = OPTBIT(INP_RECVTTL);
 				break;
 
 			case IP_RECVIF:
 				optval = OPTBIT(INP_RECVIF);
 				break;
 
 			case IP_PORTRANGE:
 				if (inp->inp_flags & INP_HIGHPORT)
 					optval = IP_PORTRANGE_HIGH;
 				else if (inp->inp_flags & INP_LOWPORT)
 					optval = IP_PORTRANGE_LOW;
 				else
 					optval = 0;
 				break;
 
 			case IP_FAITH:
 				optval = OPTBIT(INP_FAITH);
 				break;
 
 			case IP_ONESBCAST:
 				optval = OPTBIT(INP_ONESBCAST);
 				break;
 			case IP_DONTFRAG:
 				optval = OPTBIT(INP_DONTFRAG);
 				break;
 			case IP_BINDANY:
 				optval = OPTBIT(INP_BINDANY);
 				break;
 			case IP_RECVTOS:
 				optval = OPTBIT(INP_RECVTOS);
 				break;
 			case IP_FLOWID:
 				optval = inp->inp_flowid;
 				break;
 			case IP_FLOWTYPE:
 				optval = inp->inp_flowtype;
 				break;
 			case IP_RECVFLOWID:
 				optval = OPTBIT2(INP_RECVFLOWID);
 				break;
 #ifdef	RSS
 			case IP_RSSBUCKETID:
 				retval = rss_hash2bucket(inp->inp_flowid,
 				    inp->inp_flowtype,
 				    &rss_bucket);
 				if (retval == 0)
 					optval = rss_bucket;
 				else
 					error = EINVAL;
 				break;
 			case IP_RECVRSSBUCKETID:
 				optval = OPTBIT2(INP_RECVRSSBUCKETID);
 				break;
 #endif
 			case IP_BINDMULTI:
 				optval = OPTBIT2(INP_BINDMULTI);
 				break;
 			}
 			/*
 			 * Only copy the value out when it was obtained
 			 * successfully; an unconditional copyout would
 			 * overwrite the EINVAL set by IP_RSSBUCKETID above
 			 * and report a stale optval as success.
 			 */
 			if (error == 0)
 				error = sooptcopyout(sopt, &optval,
 				    sizeof optval);
 			break;
 
 		/*
 		 * Multicast socket options are processed by the in_mcast
 		 * module.
 		 */
 		case IP_MULTICAST_IF:
 		case IP_MULTICAST_VIF:
 		case IP_MULTICAST_TTL:
 		case IP_MULTICAST_LOOP:
 		case IP_MSFILTER:
 			error = inp_getmoptions(inp, sopt);
 			break;
 
 #ifdef IPSEC
 		case IP_IPSEC_POLICY:
 		{
 			struct mbuf *m = NULL;
 			caddr_t req = NULL;
 			size_t len = 0;
 
 			/*
 			 * m was just initialised to NULL, so this branch is
 			 * never taken and req/len stay NULL/0.
 			 */
 			if (m != 0) {
 				req = mtod(m, caddr_t);
 				len = m->m_len;
 			}
 			error = ipsec_get_policy(sotoinpcb(so), req, len, &m);
 			if (error == 0)
 				error = soopt_mcopyout(sopt, m); /* XXX */
 			/*
 			 * NOTE(review): m is freed only on success;
 			 * presumably soopt_mcopyout() disposes of it on
 			 * failure -- confirm.
 			 */
 			if (error == 0)
 				m_freem(m);
 			break;
 		}
 #endif /* IPSEC */
 
 		default:
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 	}
 	return (error);
 }
 
 /*
  * Loop a copy of an outgoing IP multicast packet back to the input side.
  *
  * Called from ip_output().  A private duplicate of m is made, its pending
  * delayed data checksum (if any) is completed and marked valid, the IP
  * header checksum is recomputed over hlen bytes, and the duplicate is
  * handed to if_simloop() on ifp.  Note that ifp might NOT be a loopback
  * interface -- evil, but easier than replicating that code here.
  *
  * The original packet m is not consumed.  If the duplicate cannot be
  * allocated the function returns silently without looping anything back.
  */
 static void
 ip_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in *dst,
     int hlen)
 {
 	struct mbuf *dup;
 	struct ip *iph;
 
 	/*
 	 * Work on a deep copy: the packet is about to be modified in
 	 * order to generate checksums.
 	 */
 	dup = m_dup(m, M_NOWAIT);
 	if (dup == NULL)
 		return;
 
 	/* Pull the header up when it is in external storage or too short. */
 	if ((dup->m_flags & M_EXT) != 0 || dup->m_len < hlen) {
 		dup = m_pullup(dup, hlen);
 		if (dup == NULL)
 			return;
 	}
 
 	/* If needed, compute the checksum and mark it as valid. */
 	if ((dup->m_pkthdr.csum_flags & CSUM_DELAY_DATA) != 0) {
 		in_delayed_cksum(dup);
 		dup->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
 		dup->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
 		dup->m_pkthdr.csum_data = 0xffff;
 	}
 
 	/*
 	 * No fragmentation is attempted even when the IP length exceeds
 	 * the interface's MTU.  Can this possibly matter?
 	 */
 	iph = mtod(dup, struct ip *);
 	iph->ip_sum = 0;
 	iph->ip_sum = in_cksum(dup, hlen);
 #if 1 /* XXX */
 	if (dst->sin_family != AF_INET) {
 		printf("ip_mloopback: bad address family %d\n",
 		    dst->sin_family);
 		dst->sin_family = AF_INET;
 	}
 #endif
 	if_simloop(ifp, dup, dst->sin_family, 0);
 }
Index: user/ae/inet6/sys/sys/param.h
===================================================================
--- user/ae/inet6/sys/sys/param.h	(revision 271452)
+++ user/ae/inet6/sys/sys/param.h	(revision 271453)
@@ -1,347 +1,347 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)param.h	8.3 (Berkeley) 4/4/95
  * $FreeBSD$
  */
 
 #ifndef _SYS_PARAM_H_
 #define _SYS_PARAM_H_
 
 #include <sys/_null.h>
 
 #define	BSD	199506		/* System version (year & month). */
 #define BSD4_3	1
 #define BSD4_4	1
 
 /* 
  * __FreeBSD_version numbers are documented in the Porter's Handbook.
  * If you bump the version for any reason, you should update the documentation
  * there.
  * Currently this lives here in the doc/ repository:
  *
  *	head/en_US.ISO8859-1/books/porters-handbook/book.xml
  *
  * scheme is:  <major><two digit minor>Rxx
  *		'R' is in the range 0 to 4 if this is a release branch or
  *		x.0-CURRENT before RELENG_*_0 is created, otherwise 'R' is
  *		in the range 5 to 9.
  */
 #undef __FreeBSD_version
-#define __FreeBSD_version 1100031	/* Master, propagated to newvers */
+#define __FreeBSD_version 1100032	/* Master, propagated to newvers */
 
 /*
  * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD,
  * which by definition is always true on FreeBSD. This macro is also defined
  * on other systems that use the kernel of FreeBSD, such as GNU/kFreeBSD.
  *
  * It is tempting to use this macro in userland code when we want to enable
  * kernel-specific routines, and in fact it's fine to do this in code that
  * is part of FreeBSD itself.  However, be aware that as presence of this
  * macro is still not widespread (e.g. older FreeBSD versions, 3rd party
  * compilers, etc), it is STRONGLY DISCOURAGED to check for this macro in
  * external applications without also checking for __FreeBSD__ as an
  * alternative.
  */
 #undef __FreeBSD_kernel__
 #define __FreeBSD_kernel__
 
 #ifdef _KERNEL
 #define	P_OSREL_SIGWAIT		700000
 #define	P_OSREL_SIGSEGV		700004
 #define	P_OSREL_MAP_ANON	800104
 
 #define	P_OSREL_MAJOR(x)	((x) / 100000)
 #endif
 
 #ifndef LOCORE
 #include <sys/types.h>
 #endif
 
 /*
  * Machine-independent constants (some used in following include files).
  * Redefined constants are from POSIX 1003.1 limits file.
  *
  * MAXCOMLEN should be >= sizeof(ac_comm) (see <acct.h>)
  */
 #include <sys/syslimits.h>
 
 #define	MAXCOMLEN	19		/* max command name remembered */
 #define	MAXINTERP	PATH_MAX	/* max interpreter file name length */
 #define	MAXLOGNAME	33		/* max login name length (incl. NUL) */
 #define	MAXUPRC		CHILD_MAX	/* max simultaneous processes */
 #define	NCARGS		ARG_MAX		/* max bytes for an exec function */
 #define	NGROUPS		(NGROUPS_MAX+1)	/* max number groups */
 #define	NOFILE		OPEN_MAX	/* max open files per process */
 #define	NOGROUP		65535		/* marker for empty group set member */
 #define MAXHOSTNAMELEN	256		/* max hostname size */
 #define SPECNAMELEN	63		/* max length of devicename */
 
 /* More types and definitions used throughout the kernel. */
 #ifdef _KERNEL
 #include <sys/cdefs.h>
 #include <sys/errno.h>
 #ifndef LOCORE
 #include <sys/time.h>
 #include <sys/priority.h>
 #endif
 
 #ifndef FALSE
 #define	FALSE	0
 #endif
 #ifndef TRUE
 #define	TRUE	1
 #endif
 #endif
 
 #ifndef _KERNEL
 /* Signals. */
 #include <sys/signal.h>
 #endif
 
 /* Machine type dependent parameters. */
 #include <machine/param.h>
 #ifndef _KERNEL
 #include <sys/limits.h>
 #endif
 
 #ifndef DEV_BSHIFT
 #define	DEV_BSHIFT	9		/* log2(DEV_BSIZE) */
 #endif
 #define	DEV_BSIZE	(1<<DEV_BSHIFT)
 
 #ifndef BLKDEV_IOSIZE
 #define BLKDEV_IOSIZE  PAGE_SIZE	/* default block device I/O size */
 #endif
 #ifndef DFLTPHYS
 #define DFLTPHYS	(64 * 1024)	/* default max raw I/O transfer size */
 #endif
 #ifndef MAXPHYS
 #define MAXPHYS		(128 * 1024)	/* max raw I/O transfer size */
 #endif
 #ifndef MAXDUMPPGS
 #define MAXDUMPPGS	(DFLTPHYS/PAGE_SIZE)
 #endif
 
 /*
  * Constants related to network buffer management.
  * MCLBYTES must be no larger than PAGE_SIZE.
  */
 #ifndef	MSIZE
 #define	MSIZE		256		/* size of an mbuf */
 #endif
 
 #ifndef	MCLSHIFT
 #define MCLSHIFT	11		/* convert bytes to mbuf clusters */
 #endif	/* MCLSHIFT */
 
 #define MCLBYTES	(1 << MCLSHIFT)	/* size of an mbuf cluster */
 
 #if PAGE_SIZE < 2048
 #define	MJUMPAGESIZE	MCLBYTES
 #elif PAGE_SIZE <= 8192
 #define	MJUMPAGESIZE	PAGE_SIZE
 #else
 #define	MJUMPAGESIZE	(8 * 1024)
 #endif
 
 #define	MJUM9BYTES	(9 * 1024)	/* jumbo cluster 9k */
 #define	MJUM16BYTES	(16 * 1024)	/* jumbo cluster 16k */
 
 /*
  * Some macros for units conversion
  */
 
 /* clicks to bytes */
 #ifndef ctob
 #define ctob(x)	((x)<<PAGE_SHIFT)
 #endif
 
 /* bytes to clicks */
 #ifndef btoc
 #define btoc(x)	(((vm_offset_t)(x)+PAGE_MASK)>>PAGE_SHIFT)
 #endif
 
 /*
  * btodb() is messy and perhaps slow because `bytes' may be an off_t.  We
  * want to shift an unsigned type to avoid sign extension and we don't
  * want to widen `bytes' unnecessarily.  Assume that the result fits in
  * a daddr_t.
  */
 #ifndef btodb
 #define btodb(bytes)	 		/* calculates (bytes / DEV_BSIZE) */ \
 	(sizeof (bytes) > sizeof(long) \
 	 ? (daddr_t)((unsigned long long)(bytes) >> DEV_BSHIFT) \
 	 : (daddr_t)((unsigned long)(bytes) >> DEV_BSHIFT))
 #endif
 
 #ifndef dbtob
 #define dbtob(db)			/* calculates (db * DEV_BSIZE) */ \
 	((off_t)(db) << DEV_BSHIFT)
 #endif
 
 #define	PRIMASK	0x0ff
 #define	PCATCH	0x100		/* OR'd with pri for tsleep to check signals */
 #define	PDROP	0x200	/* OR'd with pri to stop re-entry of interlock mutex */
 
 #define	NZERO	0		/* default "nice" */
 
 #define	NBBY	8		/* number of bits in a byte */
 #define	NBPW	sizeof(int)	/* number of bytes per word (integer) */
 
 #define	CMASK	022		/* default file mask: S_IWGRP|S_IWOTH */
 
 #define	NODEV	(dev_t)(-1)	/* non-existent device */
 
 /*
  * File system parameters and macros.
  *
  * MAXBSIZE -	Filesystems are made out of blocks of at most MAXBSIZE bytes
  *		per block.  MAXBSIZE may be made larger without effecting
  *		any existing filesystems as long as it does not exceed MAXPHYS,
  *		and may be made smaller at the risk of not being able to use
  *		filesystems which require a block size exceeding MAXBSIZE.
  *
  * BKVASIZE -	Nominal buffer space per buffer, in bytes.  BKVASIZE is the
  *		minimum KVM memory reservation the kernel is willing to make.
  *		Filesystems can of course request smaller chunks.  Actual 
  *		backing memory uses a chunk size of a page (PAGE_SIZE).
  *
  *		If you make BKVASIZE too small you risk seriously fragmenting
  *		the buffer KVM map which may slow things down a bit.  If you
  *		make it too big the kernel will not be able to optimally use 
  *		the KVM memory reserved for the buffer cache and will wind 
  *		up with too-few buffers.
  *
  *		The default is 16384, roughly 2x the block size used by a
  *		normal UFS filesystem.
  */
 #define MAXBSIZE	65536	/* must be power of 2 */
 #define BKVASIZE	16384	/* must be power of 2 */
 #define BKVAMASK	(BKVASIZE-1)
 
 /*
  * MAXPATHLEN defines the longest permissible path length after expanding
  * symbolic links. It is used to allocate a temporary buffer from the buffer
  * pool in which to do the name expansion, hence should be a power of two,
  * and must be less than or equal to MAXBSIZE.  MAXSYMLINKS defines the
  * maximum number of symbolic links that may be expanded in a path name.
  * It should be set high enough to allow all legitimate uses, but halt
  * infinite loops reasonably quickly.
  */
 #define	MAXPATHLEN	PATH_MAX
 #define MAXSYMLINKS	32
 
 /* Bit map related macros. */
 #define	setbit(a,i)	(((unsigned char *)(a))[(i)/NBBY] |= 1<<((i)%NBBY))
 #define	clrbit(a,i)	(((unsigned char *)(a))[(i)/NBBY] &= ~(1<<((i)%NBBY)))
 #define	isset(a,i)							\
 	(((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY)))
 #define	isclr(a,i)							\
 	((((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY))) == 0)
 
 /* Macros for counting and rounding. */
 #ifndef howmany
 #define	howmany(x, y)	(((x)+((y)-1))/(y))
 #endif
 #define	nitems(x)	(sizeof((x)) / sizeof((x)[0]))
 #define	rounddown(x, y)	(((x)/(y))*(y))
 #define	rounddown2(x, y) ((x)&(~((y)-1)))          /* if y is power of two */
 #define	roundup(x, y)	((((x)+((y)-1))/(y))*(y))  /* to any y */
 #define	roundup2(x, y)	(((x)+((y)-1))&(~((y)-1))) /* if y is powers of two */
 #define powerof2(x)	((((x)-1)&(x))==0)
 
 /* Macros for min/max. */
 #define	MIN(a,b) (((a)<(b))?(a):(b))
 #define	MAX(a,b) (((a)>(b))?(a):(b))
 
 #ifdef _KERNEL
 /*
  * Basic byte order function prototypes for non-inline functions.
  */
 #ifndef LOCORE
 #ifndef _BYTEORDER_PROTOTYPED
 #define	_BYTEORDER_PROTOTYPED
 __BEGIN_DECLS
 __uint32_t	 htonl(__uint32_t);
 __uint16_t	 htons(__uint16_t);
 __uint32_t	 ntohl(__uint32_t);
 __uint16_t	 ntohs(__uint16_t);
 __END_DECLS
 #endif
 #endif
 
 #ifndef lint
 #ifndef _BYTEORDER_FUNC_DEFINED
 #define	_BYTEORDER_FUNC_DEFINED
 #define	htonl(x)	__htonl(x)
 #define	htons(x)	__htons(x)
 #define	ntohl(x)	__ntohl(x)
 #define	ntohs(x)	__ntohs(x)
 #endif /* !_BYTEORDER_FUNC_DEFINED */
 #endif /* lint */
 #endif /* _KERNEL */
 
 /*
  * Scale factor for scaled integers used to count %cpu time and load avgs.
  *
  * The number of CPU `tick's that map to a unique `%age' can be expressed
  * by the formula (1 / (2 ^ (FSHIFT - 11))).  The maximum load average that
  * can be calculated (assuming 32 bits) can be closely approximated using
  * the formula (2 ^ (2 * (16 - FSHIFT))) for (FSHIFT < 15).
  *
  * For the scheduler to maintain a 1:1 mapping of CPU `tick' to `%age',
  * FSHIFT must be at least 11; this gives us a maximum load avg of ~1024.
  */
 #define	FSHIFT	11		/* bits to right of fixed binary point */
 #define FSCALE	(1<<FSHIFT)
 
 #define dbtoc(db)			/* calculates devblks to pages */ \
 	((db + (ctodb(1) - 1)) >> (PAGE_SHIFT - DEV_BSHIFT))
  
 #define ctodb(db)			/* calculates pages to devblks */ \
 	((db) << (PAGE_SHIFT - DEV_BSHIFT))
 
 /*
  * Old spelling of __containerof().
  */
 #define	member2struct(s, m, x)						\
 	((struct s *)(void *)((char *)(x) - offsetof(struct s, m)))
 
 /*
  * Access a variable length array that has been declared as a fixed
  * length array.
  */
 #define __PAST_END(array, offset) (((__typeof__(*(array)) *)(array))[offset])
 
 #endif	/* _SYS_PARAM_H_ */
Index: user/ae/inet6/sys
===================================================================
--- user/ae/inet6/sys	(revision 271452)
+++ user/ae/inet6/sys	(revision 271453)

Property changes on: user/ae/inet6/sys
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head/sys:r271428-271452
Index: user/ae/inet6/usr.bin/mkimg/bsd.c
===================================================================
--- user/ae/inet6/usr.bin/mkimg/bsd.c	(revision 271452)
+++ user/ae/inet6/usr.bin/mkimg/bsd.c	(revision 271453)
@@ -1,135 +1,141 @@
 /*-
  * Copyright (c) 2014 Juniper Networks, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/types.h>
 #include <sys/disklabel.h>
 #include <sys/endian.h>
 #include <sys/errno.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 
 #include "image.h"
 #include "mkimg.h"
 #include "scheme.h"
 
 #ifndef FS_NANDFS
 #define	FS_NANDFS	30
 #endif
 
 static struct mkimg_alias bsd_aliases[] = {
     {	ALIAS_FREEBSD_NANDFS, ALIAS_INT2TYPE(FS_NANDFS) },
     {	ALIAS_FREEBSD_SWAP, ALIAS_INT2TYPE(FS_SWAP) },
     {	ALIAS_FREEBSD_UFS, ALIAS_INT2TYPE(FS_BSDFFS) },
     {	ALIAS_FREEBSD_VINUM, ALIAS_INT2TYPE(FS_VINUM) },
     {	ALIAS_FREEBSD_ZFS, ALIAS_INT2TYPE(FS_ZFS) },
     {	ALIAS_NONE, 0 }
 };
 
 static u_int
 bsd_metadata(u_int where)
 {
 	u_int secs;
 
 	secs = BBSIZE / secsz;
 	return ((where == SCHEME_META_IMG_START) ? secs : 0);
 }
 
 static int
 bsd_write(lba_t imgsz, void *bootcode)
 {
 	u_char *buf, *p;
 	struct disklabel *d;
 	struct partition *dp;
 	struct part *part;
-	int error, n;
+	int bsdparts, error, n;
 	uint16_t checksum;
 
 	buf = malloc(BBSIZE);
 	if (buf == NULL)
 		return (ENOMEM);
 	if (bootcode != NULL) {
 		memcpy(buf, bootcode, BBSIZE);
-		memset(buf + secsz, 0, secsz);
+		memset(buf + secsz, 0, sizeof(struct disklabel));
 	} else
 		memset(buf, 0, BBSIZE);
 
+	bsdparts = nparts + 1;	/* Account for c partition */
+	if (bsdparts < MAXPARTITIONS)
+		bsdparts = MAXPARTITIONS;
 	imgsz = (lba_t)ncyls * nheads * nsecs;
 	error = image_set_size(imgsz);
 	if (error) {
 		free(buf);
 		return (error);
 	}
 
 	d = (void *)(buf + secsz);
 	le32enc(&d->d_magic, DISKMAGIC);
 	le32enc(&d->d_secsize, secsz);
 	le32enc(&d->d_nsectors, nsecs);
 	le32enc(&d->d_ntracks, nheads);
 	le32enc(&d->d_ncylinders, ncyls);
 	le32enc(&d->d_secpercyl, nsecs * nheads);
 	le32enc(&d->d_secperunit, imgsz);
 	le16enc(&d->d_rpm, 3600);
 	le32enc(&d->d_magic2, DISKMAGIC);
-	le16enc(&d->d_npartitions, (8 > nparts + 1) ? 8 : nparts + 1);
+	le16enc(&d->d_npartitions, bsdparts);
 	le32enc(&d->d_bbsize, BBSIZE);
 
 	dp = &d->d_partitions[RAW_PART];
 	le32enc(&dp->p_size, imgsz);
 	STAILQ_FOREACH(part, &partlist, link) {
 		n = part->index + ((part->index >= RAW_PART) ? 1 : 0);
 		dp = &d->d_partitions[n];
 		le32enc(&dp->p_size, part->size);
 		le32enc(&dp->p_offset, part->block);
+		le32enc(&dp->p_fsize, 0);
 		dp->p_fstype = ALIAS_TYPE2INT(part->type);
+		dp->p_frag = 0;
+		le16enc(&dp->p_cpg, 0);
 	}
 
-	dp = &d->d_partitions[nparts + 1];
+	dp = &d->d_partitions[bsdparts];
 	checksum = 0;
-	for (p = buf; p < (u_char *)dp; p += 2)
+	for (p = (void *)d; p < (u_char *)dp; p += 2)
 		checksum ^= le16dec(p);
 	le16enc(&d->d_checksum, checksum);
 
 	error = image_write(0, buf, BBSIZE / secsz);
 	free(buf);
 	return (error);
 }
 
 static struct mkimg_scheme bsd_scheme = {
 	.name = "bsd",
 	.description = "BSD disk label",
 	.aliases = bsd_aliases,
 	.metadata = bsd_metadata,
 	.write = bsd_write,
 	.nparts = 19,
 	.bootcode = BBSIZE,
 	.maxsecsz = 512
 };
 
 SCHEME_DEFINE(bsd_scheme);
Index: user/ae/inet6/usr.bin/mkimg
===================================================================
--- user/ae/inet6/usr.bin/mkimg	(revision 271452)
+++ user/ae/inet6/usr.bin/mkimg	(revision 271453)

Property changes on: user/ae/inet6/usr.bin/mkimg
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head/usr.bin/mkimg:r271428-271452
Index: user/ae/inet6/usr.bin/rctl/rctl.8
===================================================================
--- user/ae/inet6/usr.bin/rctl/rctl.8	(revision 271452)
+++ user/ae/inet6/usr.bin/rctl/rctl.8	(revision 271453)
@@ -1,276 +1,276 @@
 .\"-
 .\" Copyright (c) 2009 Edward Tomasz Napierala
 .\" All rights reserved.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice, this list of conditions and the following disclaimer.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 .\" ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR THE VOICES IN HIS HEAD BE
 .\" LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 .\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 .\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 .\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 .\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 .\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 .\" POSSIBILITY OF SUCH DAMAGE.
 .\"
 .\" $FreeBSD$
 .\"
-.Dd February 16, 2014
+.Dd September 11, 2014
 .Dt RCTL 8
 .Os
 .Sh NAME
 .Nm rctl
 .Nd display and update resource limits database
 .Sh SYNOPSIS
 .Nm
 .Op Fl h
 .Op Fl n
 .Op Ar filter
 .Nm
 .Fl a
 .Op Ar rule
 .Nm
 .Fl l
 .Op Fl h
 .Op Fl n
 .Op Ar filter
 .Nm
 .Fl r
 .Op Ar filter
 .Nm
 .Fl u
 .Op Fl h
 .Op Ar filter
 .Pp
 .Nm
 requires the kernel to be compiled with:
 .Bd -ragged -offset indent
 .Cd "options RACCT"
 .Cd "options RCTL"
 .Ed
 .Sh DESCRIPTION
 When called without options, the
 .Nm
 command writes currently defined RCTL rules to standard output.
 .Pp
 If a
 .Ar filter
 argument is specified, only rules matching the filter are displayed.
 The options are as follows:
 .Bl -tag -width indent
 .It Fl a Ar rule
 Add
 .Ar rule
 to the RCTL database.
 .It Fl l Ar filter
 Display rules applicable to the process defined by
 .Ar filter .
 Note that this is different from showing the rules when called without
 any options, as it shows not just the rules with subject equal to that
 of process, but also rules for the user, jail, and login class applicable
 to the process.
 .It Fl r Ar filter
 Remove rules matching
 .Ar filter
 from the RCTL database.
 .It Fl u Ar filter
 Display resource usage for a subject
 .Po
 .Sy process ,
 .Sy user ,
 .Sy loginclass
 or
 .Sy jail
 .Pc
 matching the
 .Ar filter .
 .It Fl h
 "Human-readable" output.
 Use unit suffixes: Byte, Kilobyte, Megabyte,
 Gigabyte, Terabyte and Petabyte.
 .It Fl n
 Display user IDs numerically rather than converting them to a user name.
 .El
 .Pp
 Modifying rules affects all currently running and future processes matching
 the rule.
 .Sh RULE SYNTAX
 Syntax for a rule is subject:subject-id:resource:action=amount/per.
 .Pp
 .Bl -tag -width "subject-id" -compact -offset indent
 .It subject
 defines the kind of entity the rule applies to.
 It can be either
 .Sy process ,
 .Sy user ,
 .Sy loginclass ,
 or
 .Sy jail .
 .It subject-id
 identifies the
 .Em subject .
 It can be a process ID, user name, numerical user ID, login class name from
 .Xr login.conf 5 ,
 or jail name.
 .It resource
 identifies the resource the rule controls.
 See the
 .Sx RESOURCES
 section below for details.
 .It action
 defines what will happen when a process exceeds the allowed
 .Em amount .
 See the
 .Sx ACTIONS
 section below for details.
 .It amount
 defines how much of the resource a process can use before
 the defined
 .Em action
 triggers.
 Resources which limit bytes may use prefixes from
 .Xr expand_number 3 .
 .It per
 defines what entity the
 .Em amount
 gets accounted for.
 For example, rule "loginclass:users:vmem:deny=100M/process" means
 that each process of any user belonging to login class "users" may allocate
 up to 100MB of virtual memory.
 Rule "loginclass:users:vmem:deny=100M/user" would mean that for each
 user belonging to the login class "users", the sum of virtual memory allocated
 by all the processes of that user will not exceed 100MB.
 Rule "loginclass:users:vmem:deny=100M/loginclass" would mean that the sum of
 virtual memory allocated by all processes of all users belonging to that login
 class will not exceed 100MB.
 .El
 .Pp
 A valid rule has all those fields specified, except for
 .Em per ,
 which defaults
 to the value of
 .Em subject .
 .Pp
 A filter is a rule for which one of more fields other than
 .Em per
 is left empty.
 For example, a filter that matches every rule could be written as ":::=/",
 or, in short, ":".
 A filter that matches all the login classes would be "loginclass:".
 A filter that matches all defined rules for
 .Sy maxproc
 resource would be
 "::maxproc".
 .Sh SUBJECTS
 .Bl -column -offset 3n "pseudoterminals" ".Sy username or numerical User ID"
 .It Em subject Ta Em subject-id
 .It Sy process Ta numerical Process ID
 .It Sy user Ta user name or numerical User ID
 .It Sy loginclass Ta login class from
 .Xr login.conf 5
 .It Sy jail Ta jail name
 .El
 .Sh RESOURCES
 .Bl -column -offset 3n "pseudoterminals"
 .It Em resource
 .It Sy cputime Ta "CPU time, in seconds"
 .It Sy datasize Ta "data size, in bytes"
 .It Sy stacksize Ta "stack size, in bytes"
 .It Sy coredumpsize Ta "core dump size, in bytes"
 .It Sy memoryuse Ta "resident set size, in bytes"
 .It Sy memorylocked Ta "locked memory, in bytes"
 .It Sy maxproc Ta "number of processes"
 .It Sy openfiles Ta "file descriptor table size"
 .It Sy vmemoryuse Ta "address space limit, in bytes"
 .It Sy pseudoterminals Ta "number of PTYs"
 .It Sy swapuse Ta "swap usage, in bytes"
 .It Sy nthr Ta "number of threads"
 .It Sy msgqqueued Ta "number of queued SysV messages"
 .It Sy msgqsize Ta "SysV message queue size, in bytes"
 .It Sy nmsgq Ta "number of SysV message queues"
 .It Sy nsem Ta "number of SysV semaphores"
 .It Sy nsemop Ta "number of SysV semaphores modified in a single semop(2) call"
 .It Sy nshm Ta "number of SysV shared memory segments"
 .It Sy shmsize Ta "SysV shared memory size, in bytes"
 .It Sy wallclock Ta "wallclock time, in seconds"
 .It Sy pcpu Ta "%CPU, in percents of a single CPU core"
 .El
 .Sh ACTIONS
 .Bl -column -offset 3n "pseudoterminals"
 .It Em action
 .It Sy deny Ta deny the allocation; not supported for
-.Sy cpu
+.Sy cputime
 and
 .Sy wallclock
 .It Sy log Ta "log a warning to the console"
 .It Sy devctl Ta "send notification to"
 .Xr devd 8
 using
 .Sy system
 = "RCTL",
 .Sy subsystem
 = "rule",
 .Sy type
 = "matched"
 .It sig*	e.g.
 .Sy sigterm ;
 send a signal to the offending process.
 See
 .Xr signal 3
 for a list of supported signals
 .El
 .Pp
 Not all actions are supported for all resources.
 Attempting to add a rule with an action not supported by a given resource will
 result in error.
 .Sh EXIT STATUS
 .Ex -std
 .Sh EXAMPLES
 Prevent user "joe" from allocating more than 1GB of virtual memory:
 .Dl Nm Fl a Ar user:joe:vmemoryuse:deny=1g
 .Pp
 Remove all RCTL rules:
 .Dl Nm Fl r Ar \&:
 .Pp
 Display resource usage information for jail named "www":
 .Dl Nm Fl hu Ar jail:www
 .Pp
 Display all the rules applicable to process with PID 512:
 .Dl Nm Fl l Ar process:512
 .Pp
 Display all rules:
 .Dl Nm
 .Pp
 Display all rules matching user "joe":
 .Dl Nm Ar user:joe
 .Pp
 Display all rules matching login classes:
 .Dl Nm Ar loginclass:
 .Sh SEE ALSO
 .Xr rctl.conf 5
 .Sh HISTORY
 The
 .Nm
 command appeared in
 .Fx 9.0 .
 .Sh AUTHORS
 .An -nosplit
 The
 .Nm
 was developed by
 .An Edward Tomasz Napierala Aq Mt trasz@FreeBSD.org
 under sponsorship from the FreeBSD Foundation.
 .Sh BUGS
 Limiting
 .Sy memoryuse
 may kill the machine due to thrashing.
Index: user/ae/inet6/usr.sbin/bhyve/block_if.c
===================================================================
--- user/ae/inet6/usr.sbin/bhyve/block_if.c	(revision 271452)
+++ user/ae/inet6/usr.sbin/bhyve/block_if.c	(revision 271453)
@@ -1,474 +1,475 @@
 /*-
  * Copyright (c) 2013  Peter Grehan <grehan@freebsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/queue.h>
 #include <sys/errno.h>
 #include <sys/stat.h>
 #include <sys/ioctl.h>
 #include <sys/disk.h>
 
 #include <assert.h>
 #include <fcntl.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <pthread.h>
 #include <pthread_np.h>
 #include <unistd.h>
 
 #include "bhyverun.h"
 #include "block_if.h"
 
 #define BLOCKIF_SIG	0xb109b109
 
 #define BLOCKIF_MAXREQ	32
 
 enum blockop {
 	BOP_READ,
 	BOP_WRITE,
 	BOP_FLUSH,
 	BOP_CANCEL
 };
 
 enum blockstat {
 	BST_FREE,
 	BST_INUSE
 };
 
 struct blockif_elem {
 	TAILQ_ENTRY(blockif_elem) be_link;
 	struct blockif_req  *be_req;
 	enum blockop	     be_op;
 	enum blockstat	     be_status;
 };
 
 struct blockif_ctxt {
 	int			bc_magic;
 	int			bc_fd;
 	int			bc_rdonly;
 	off_t			bc_size;
 	int			bc_sectsz;
 	pthread_t		bc_btid;
         pthread_mutex_t		bc_mtx;
         pthread_cond_t		bc_cond;
 	int			bc_closing;
 
 	/* Request elements and free/inuse queues */
 	TAILQ_HEAD(, blockif_elem) bc_freeq;       
 	TAILQ_HEAD(, blockif_elem) bc_inuseq;       
 	u_int			bc_req_count;
 	struct blockif_elem	bc_reqs[BLOCKIF_MAXREQ];
 };
 
 static int
 blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq,
 		enum blockop op)
 {
 	struct blockif_elem *be;
 
 	assert(bc->bc_req_count < BLOCKIF_MAXREQ);
 
 	be = TAILQ_FIRST(&bc->bc_freeq);
 	assert(be != NULL);
 	assert(be->be_status == BST_FREE);
 
 	TAILQ_REMOVE(&bc->bc_freeq, be, be_link);
 	be->be_status = BST_INUSE;
 	be->be_req = breq;
 	be->be_op = op;
 	TAILQ_INSERT_TAIL(&bc->bc_inuseq, be, be_link);
 
 	bc->bc_req_count++;
 
 	return (0);
 }
 
 static int
 blockif_dequeue(struct blockif_ctxt *bc, struct blockif_elem *el)
 {
 	struct blockif_elem *be;
 
 	if (bc->bc_req_count == 0)
 		return (ENOENT);
 
 	be = TAILQ_FIRST(&bc->bc_inuseq);
 	assert(be != NULL);
 	assert(be->be_status == BST_INUSE);
 	*el = *be;
 
 	TAILQ_REMOVE(&bc->bc_inuseq, be, be_link);
 	be->be_status = BST_FREE;
 	be->be_req = NULL;
 	TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
 	
 	bc->bc_req_count--;
 
 	return (0);
 }
 
 static void
 blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be)
 {
 	struct blockif_req *br;
 	int err;
 
 	br = be->be_req;
 	err = 0;
 
 	switch (be->be_op) {
 	case BOP_READ:
 		if (preadv(bc->bc_fd, br->br_iov, br->br_iovcnt,
 			   br->br_offset) < 0)
 			err = errno;
 		break;
 	case BOP_WRITE:
 		if (bc->bc_rdonly)
 			err = EROFS;
 		else if (pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt,
 			     br->br_offset) < 0)
 			err = errno;
 		break;
 	case BOP_FLUSH:
 		break;
 	case BOP_CANCEL:
 		err = EINTR;
 		break;
 	default:
 		err = EINVAL;
 		break;
 	}
 
 	(*br->br_callback)(br, err);
 }
 
 static void *
 blockif_thr(void *arg)
 {
 	struct blockif_ctxt *bc;
 	struct blockif_elem req;
 
 	bc = arg;
 
 	for (;;) {
 		pthread_mutex_lock(&bc->bc_mtx);
 		while (!blockif_dequeue(bc, &req)) {
 			pthread_mutex_unlock(&bc->bc_mtx);
 			blockif_proc(bc, &req);
 			pthread_mutex_lock(&bc->bc_mtx);
 		}
 		pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
 		pthread_mutex_unlock(&bc->bc_mtx);
 
 		/*
 		 * Check ctxt status here to see if exit requested
 		 */
 		if (bc->bc_closing)
 			pthread_exit(NULL);
 	}
 
 	/* Not reached */
 	return (NULL);
 }
 
 /*
  * Open a backing file or raw device and start its block i/o thread.
  *
  * 'optstr' is a comma-separated string whose first element is the pathname;
  * the recognised options are "nocache" (O_DIRECT), "sync" (O_SYNC) and "ro"
  * (read-only open).  If a read/write open fails, a read-only open is tried
  * and the context is marked read-only.  'ident' is used to build the worker
  * thread name ("blk-<ident>").
  *
  * Returns the new context, or NULL on any failure.  Everything acquired
  * along the way (option string copy, file descriptor, context) is released
  * on the failure paths.
  */
 struct blockif_ctxt *
 blockif_open(const char *optstr, const char *ident)
 {
 	char tname[MAXCOMLEN + 1];
 	char *nopt, *xopts;
 	struct blockif_ctxt *bc;
 	struct stat sbuf;
 	off_t size;
 	int extra, fd, i, sectsz;
 	int nocache, sync, ro;
 
 	nocache = 0;
 	sync = 0;
 	ro = 0;
 
 	/*
 	 * The first element in the optstring is always a pathname.
 	 * Optional elements follow
 	 */
 	nopt = strdup(optstr);
 	if (nopt == NULL) {
 		perror("Could not copy backing file options");
 		return (NULL);
 	}
 	for (xopts = strtok(nopt, ",");
 	     xopts != NULL;
 	     xopts = strtok(NULL, ",")) {
 		if (!strcmp(xopts, "nocache"))
 			nocache = 1;
 		else if (!strcmp(xopts, "sync"))
 			sync = 1;
 		else if (!strcmp(xopts, "ro"))
 			ro = 1;
 	}
 
 	extra = 0;
 	if (nocache)
 		extra |= O_DIRECT;
 	if (sync)
 		extra |= O_SYNC;
 
 	/*
 	 * strtok() terminated the string at the first comma, so nopt now
 	 * reads as just the pathname.
 	 */
 	fd = open(nopt, (ro ? O_RDONLY : O_RDWR) | extra);
 	if (fd < 0 && !ro) {
 		/* Attempt a r/w fail with a r/o open */
 		fd = open(nopt, O_RDONLY | extra);
 		ro = 1;
 	}
 
 	if (fd < 0) {
 		/* Report before free() so errno is still that of open() */
 		perror("Could not open backing file");
 		free(nopt);
 		return (NULL);
 	}
 
 	/* The pathname copy is not needed past this point */
 	free(nopt);
 
 	if (fstat(fd, &sbuf) < 0) {
 		perror("Could not stat backing file");
 		close(fd);
 		return (NULL);
 	}
 
 	/*
 	 * Deal with raw devices
 	 */
 	size = sbuf.st_size;
 	sectsz = DEV_BSIZE;
 	if (S_ISCHR(sbuf.st_mode)) {
 		if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
 		    ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
 			perror("Could not fetch dev blk/sector size");
 			close(fd);
 			return (NULL);
 		}
 		assert(size != 0);
 		assert(sectsz != 0);
 	}
 
 	bc = calloc(1, sizeof(struct blockif_ctxt));
 	if (bc == NULL) {
 		close(fd);
 		return (NULL);
 	}
 
 	bc->bc_magic = BLOCKIF_SIG;
 	bc->bc_fd = fd;
+	bc->bc_rdonly = ro;
 	bc->bc_size = size;
 	bc->bc_sectsz = sectsz;
 	pthread_mutex_init(&bc->bc_mtx, NULL);
 	pthread_cond_init(&bc->bc_cond, NULL);
 	TAILQ_INIT(&bc->bc_freeq);
 	TAILQ_INIT(&bc->bc_inuseq);
 	bc->bc_req_count = 0;
 	for (i = 0; i < BLOCKIF_MAXREQ; i++) {
 		bc->bc_reqs[i].be_status = BST_FREE;
 		TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link);
 	}
 
 	/*
 	 * Without a worker thread the context is unusable (and
 	 * blockif_close() would join an invalid thread id), so undo
 	 * everything if the thread cannot be started.
 	 */
 	if (pthread_create(&bc->bc_btid, NULL, blockif_thr, bc) != 0) {
 		pthread_cond_destroy(&bc->bc_cond);
 		pthread_mutex_destroy(&bc->bc_mtx);
 		close(fd);
 		free(bc);
 		return (NULL);
 	}
 
 	snprintf(tname, sizeof(tname), "blk-%s", ident);
 	pthread_set_name_np(bc->bc_btid, tname);
 
 	return (bc);
 }
 
 /*
  * Submit 'breq' with operation 'op' to the block i/o thread of 'bc'.
  *
  * Returns 0 when the request was queued, or E2BIG when the context already
  * holds BLOCKIF_MAXREQ requests.
  */
 static int
 blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
 		enum blockop op)
 {
 	int rc;
 
 	pthread_mutex_lock(&bc->bc_mtx);
 	if (bc->bc_req_count >= BLOCKIF_MAXREQ) {
 		/*
 		 * The queue limit is a hard one: callers that try to
 		 * exceed it get an error back instead of being queued.
 		 */
 		rc = E2BIG;
 	} else {
 		/* Queue the request and wake the block i/o thread */
 		blockif_enqueue(bc, breq, op);
 		pthread_cond_signal(&bc->bc_cond);
 		rc = 0;
 	}
 	pthread_mutex_unlock(&bc->bc_mtx);
 
 	return (rc);
 }
 
 /* Queue a read request; see blockif_request() for the return value. */
 int
 blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
 {
 	int rc;
 
 	assert(bc->bc_magic == BLOCKIF_SIG);
 	rc = blockif_request(bc, breq, BOP_READ);
 	return (rc);
 }
 
 /* Queue a write request; see blockif_request() for the return value. */
 int
 blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
 {
 	int rc;
 
 	assert(bc->bc_magic == BLOCKIF_SIG);
 	rc = blockif_request(bc, breq, BOP_WRITE);
 	return (rc);
 }
 
 /* Queue a flush request; see blockif_request() for the return value. */
 int
 blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
 {
 	int rc;
 
 	assert(bc->bc_magic == BLOCKIF_SIG);
 	rc = blockif_request(bc, breq, BOP_FLUSH);
 	return (rc);
 }
 
 /* Queue a cancel request; see blockif_request() for the return value. */
 int
 blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq)
 {
 	int rc;
 
 	assert(bc->bc_magic == BLOCKIF_SIG);
 	rc = blockif_request(bc, breq, BOP_CANCEL);
 	return (rc);
 }
 
 /*
  * Stop the block i/o thread of 'bc' and release the context.
  *
  * The context must not be used after this call.  Always returns 0.
  */
 int
 blockif_close(struct blockif_ctxt *bc)
 {
 	void *jval;
 
 	assert(bc->bc_magic == BLOCKIF_SIG);
 
 	/*
 	 * Stop the block i/o thread.  The flag is raised and signalled with
 	 * the mutex held so that the request cannot slip in between the
 	 * thread finding its queue empty and going to sleep on bc_cond
 	 * (it holds the mutex across both), which would lose the wakeup
 	 * and leave pthread_join() waiting forever.
 	 */
 	pthread_mutex_lock(&bc->bc_mtx);
 	bc->bc_closing = 1;
 	pthread_cond_signal(&bc->bc_cond);
 	pthread_mutex_unlock(&bc->bc_mtx);
 	pthread_join(bc->bc_btid, &jval);
 
 	/* XXX Cancel queued i/o's ??? */
 
 	/*
 	 * Release resources.  The worker has exited, so nothing else is
 	 * using the synchronization objects any more.
 	 */
 	pthread_cond_destroy(&bc->bc_cond);
 	pthread_mutex_destroy(&bc->bc_mtx);
 	bc->bc_magic = 0;
 	close(bc->bc_fd);
 	free(bc);
 
 	return (0);
 }
 
 /*
  * Compute a virtual cylinders/heads/sectors-per-track geometry for the
  * block device behind 'bc', following the algorithm given in the VHD
  * specification.  The results are stored through 'c', 'h' and 's'.
  */
 void
 blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
 {
 	off_t nsect;		/* total sectors of the block dev */
 	off_t cylheads;		/* cylinders times heads */
 	uint16_t spt;		/* sectors per track */
 	uint8_t nheads;
 
 	assert(bc->bc_magic == BLOCKIF_SIG);
 
 	nsect = bc->bc_size / bc->bc_sectsz;
 
 	/* Anything beyond the largest CHS-addressable size is clamped */
 	if (nsect > 65535UL*16*255)
 		nsect = 65535UL*16*255;
 
 	if (nsect < 65536UL*16*63) {
 		/* Start from 17 sectors per track and grow as required */
 		spt = 17;
 		cylheads = nsect / spt;
 		nheads = (cylheads + 1023) / 1024;
 
 		if (nheads < 4)
 			nheads = 4;
 
 		if (cylheads >= (nheads * 1024) || nheads > 16) {
 			spt = 31;
 			nheads = 16;
 			cylheads = nsect / spt;
 		}
 		if (cylheads >= (nheads * 1024)) {
 			spt = 63;
 			nheads = 16;
 			cylheads = nsect / spt;
 		}
 	} else {
 		/* Large device: use the maximum track size right away */
 		spt = 255;
 		nheads = 16;
 		cylheads = nsect / spt;
 	}
 
 	*c = cylheads / nheads;
 	*h = nheads;
 	*s = spt;
 }
 
 /*
  * Accessors
  */
 /* Return the size recorded for the backing store when it was opened. */
 off_t
 blockif_size(struct blockif_ctxt *bc)
 {
 	off_t bytes;
 
 	assert(bc->bc_magic == BLOCKIF_SIG);
 	bytes = bc->bc_size;
 	return (bytes);
 }
 
 /* Return the sector size recorded for the backing store. */
 int
 blockif_sectsz(struct blockif_ctxt *bc)
 {
 	int sector_size;
 
 	assert(bc->bc_magic == BLOCKIF_SIG);
 	sector_size = bc->bc_sectsz;
 	return (sector_size);
 }
 
 /* Return the request queue limit, which is fixed at BLOCKIF_MAXREQ. */
 int
 blockif_queuesz(struct blockif_ctxt *bc)
 {
 	const int limit = BLOCKIF_MAXREQ;
 
 	assert(bc->bc_magic == BLOCKIF_SIG);
 	return (limit);
 }
 
 /* Return the read-only flag recorded when the backing store was opened. */
 int
 blockif_is_ro(struct blockif_ctxt *bc)
 {
 	int rdonly;
 
 	assert(bc->bc_magic == BLOCKIF_SIG);
 	rdonly = bc->bc_rdonly;
 	return (rdonly);
 }
Index: user/ae/inet6/usr.sbin/bhyve
===================================================================
--- user/ae/inet6/usr.sbin/bhyve	(revision 271452)
+++ user/ae/inet6/usr.sbin/bhyve	(revision 271453)

Property changes on: user/ae/inet6/usr.sbin/bhyve
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head/usr.sbin/bhyve:r271428-271452
Index: user/ae/inet6/usr.sbin/ctld/ctl.conf.5
===================================================================
--- user/ae/inet6/usr.sbin/ctld/ctl.conf.5	(revision 271452)
+++ user/ae/inet6/usr.sbin/ctld/ctl.conf.5	(revision 271453)
@@ -1,290 +1,357 @@
 .\" Copyright (c) 2012 The FreeBSD Foundation
 .\" All rights reserved.
 .\"
 .\" This software was developed by Edward Tomasz Napierala under sponsorship
 .\" from the FreeBSD Foundation.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice, this list of conditions and the following disclaimer.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 .\" ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
 .\" $FreeBSD$
 .\"
-.Dd September 5, 2014
+.Dd September 11, 2014
 .Dt CTL.CONF 5
 .Os
 .Sh NAME
 .Nm ctl.conf
 .Nd CAM Target Layer / iSCSI target daemon configuration file
 .Sh DESCRIPTION
 The
 .Nm
 configuration file is used by the
 .Xr ctld 8
 daemon.
 Lines starting with
 .Ql #
 are interpreted as comments.
 The general syntax of the
 .Nm
 file is:
 .Bd -literal -offset indent
-pidfile <path>
+.No pidfile Ar path
 
-auth-group <name> {
-	chap <user> <secret>
-	...
+.No auth-group Ar name No {
+.Dl chap Ar user Ar secret
+.Dl ...
 }
 
-portal-group <name> {
-	listen <address>
-	listen-iser <address>
-	discovery-auth-group <name>
-	...
+.No portal-group Ar name No {
+.Dl listen Ar address
+.Dl listen-iser Ar address
+.Dl discovery-auth-group Ar name
+.Dl ...
 }
 
-target <name> {
-	auth-group <name>
-	portal-group <name>
-	lun <number> {
-		path <path>
-	}
-	...
+.No target Ar name No {
+.Dl auth-group Ar name
+.Dl portal-group Ar name
+.Dl lun Ar number No {
+.Dl 	path Ar path
+.Dl }
+.Dl ...
 }
 .Ed
-.Ss global level
-The following statements are available at the global level:
+.Ss Global Context
 .Bl -tag -width indent
-.It Ic auth-group Aq Ar name
-Opens an auth-group section, defining an authentication group,
+.It Ic auth-group Ar name
+Create an
+.Sy auth-group
+configuration context,
 which can then be assigned to any number of targets.
-.It Ic debug Aq Ar level
-Specifies debug level.
+.It Ic debug Ar level
+The debug verbosity level.
 The default is 0.
-.It Ic maxproc Aq Ar number
-Specifies limit for concurrently running child processes handling
+.It Ic maxproc Ar number
+The limit for concurrently running child processes handling
 incoming connections.
 The default is 30.
-Setting it to 0 disables the limit.
-.It Ic pidfile Aq Ar path
-Specifies path to pidfile.
+A setting of 0 disables the limit.
+.It Ic pidfile Ar path
+The path to the pidfile.
 The default is
 .Pa /var/run/ctld.pid .
-.It Ic portal-group Aq Ar name
-Opens a portal-group section, defining a portal group,
+.It Ic portal-group Ar name
+Create a
+.Sy portal-group
+configuration context,
 which can then be assigned to any number of targets.
-.It Ic target Aq Ar name
-Opens a target configuration section.
-.It Ic timeout Aq Ar seconds
-Specifies timeout for login session, after which the connection
+.It Ic target Ar name
+Create a
+.Sy target
+configuration context, which can contain one or more
+.Sy lun
+contexts.
+.It Ic timeout Ar seconds
+The timeout for login sessions, after which the connection
 will be forcibly terminated.
 The default is 60.
-Setting it to 0 disables the timeout.
+A setting of 0 disables the timeout.
 .El
-.Ss auth-group level
-The following statements are available at the auth-group level:
+.Ss auth-group Context
 .Bl -tag -width indent
-.It Ic auth-type Ao Ar type Ac
-Specifies authentication type.
-Type can be either "none", "deny", "chap", or "chap-mutual".
+
+.It Ic auth-type Ar type
+Sets the authentication type.
+Type can be either
+.Qq Ar none ,
+.Qq Ar deny ,
+.Qq Ar chap ,
+or
+.Qq Ar chap-mutual .
 In most cases it is not necessary to set the type using this clause;
-it is usually used to disable authentication for a given auth-group.
-.It Ic chap Ao Ar user Ac Aq Ar secret
-Specifies CHAP authentication credentials.
-.It Ic chap-mutual Ao Ar user Ac Ao Ar secret Ac Ao Ar mutualuser Ac Aq Ar mutualsecret
-Specifies mutual CHAP authentication credentials.
-Note that for any auth-group, configuration may contain either chap,
-or chap-mutual entries; it is an error to mix them.
-.It Ic initiator-name Ao Ar initiator-name Ac
-Specifies iSCSI initiator name.
+it is usually used to disable authentication for a given
+.Sy auth-group .
+.It Ic chap Ar user Ar secret
+A set of CHAP authentication credentials.
+Note that for any
+.Sy auth-group ,
+the configuration may only contain either
+.Sy chap
+or
+.Sy chap-mutual
+entries; it is an error to mix them.
+.It Ic chap-mutual Ar user Ar secret Ar mutualuser Ar mutualsecret
+A set of mutual CHAP authentication credentials.
+Note that for any
+.Sy auth-group ,
+the configuration may only contain either
+.Sy chap
+or
+.Sy chap-mutual
+entries; it is an error to mix them.
+.It Ic initiator-name Ar initiator-name
+An iSCSI initiator name.
+Only initiators with a name matching one of the defined
+names will be allowed to connect.
 If not defined, there will be no restrictions based on initiator
 name.
-Otherwise, only initiators with names matching one of defined
-ones will be allowed to connect.
-.It Ic initiator-portal Ao Ar address Ac Ao Ar / prefixlen Ac
-Specifies the iSCSI initiator portal: an IPv4 or IPv6 address, optionally
-followed by slash and prefix length.
+.It Ic initiator-portal Ar address Ns Op / Ns Ar prefixlen
+An iSCSI initiator portal: an IPv4 or IPv6 address, optionally
+followed by a literal slash and a prefix length.
+Only initiators with an address matching one of the defined
+addresses will be allowed to connect.
 If not defined, there will be no restrictions based on initiator
 address.
-Otherwise, only initiators with addresses matching one of defined
-ones will be allowed to connect.
 .El
-.Ss portal-group level
-The following statements are available at the portal-group level:
+.Ss portal-group Context
 .Bl -tag -width indent
-.It Ic discovery-auth-group Aq Ar name
-Assigns previously defined authentication group to the portal group,
+.It Ic discovery-auth-group Ar name
+Assign a previously defined authentication group to the portal group,
 to be used for target discovery.
 By default, portal groups that do not specify their own auth settings,
-using clauses such as "chap" or "initiator-name", are assigned
-predefined auth-group "default", which denies discovery.
-Another predefined auth-group, "no-authentication", may be used
+using clauses such as
+.Sy chap
+or
+.Sy initiator-name ,
+are assigned
+predefined
+.Sy auth-group
+.Qq Ar default ,
+which denies discovery.
+Another predefined
+.Sy auth-group ,
+.Qq Ar no-authentication ,
+may be used
 to permit discovery without authentication.
-.It Ic listen Aq Ar address
-Specifies IPv4 or IPv6 address and port to listen on for incoming connections.
-.It Ic listen-iser Aq Ar address
-Specifies IPv4 or IPv6 address and port to listen on for incoming connections
+.It Ic listen Ar address
+An IPv4 or IPv6 address and port to listen on for incoming connections.
+.It Ic listen-iser Ar address
+An IPv4 or IPv6 address and port to listen on for incoming connections
 using iSER (iSCSI over RDMA) protocol.
 .El
-.Ss target level:
-The following statements are available at the target level:
+.Ss target Context
 .Bl -tag -width indent
-.It Ic alias Aq Ar text
-Assigns human-readable description to the target.
+.It Ic alias Ar text
+Assign a human-readable description to the target.
 There is no default.
-.It Ic auth-group Aq Ar name
-Assigns previously defined authentication group to the target.
+.It Ic auth-group Ar name
+Assign a previously defined authentication group to the target.
 By default, targets that do not specify their own auth settings,
-using clauses such as "chap" or "initiator-name", are assigned
-predefined auth-group "default", which denies all access.
-Another predefined auth-group, "no-authentication", may be used to permit access
+using clauses such as
+.Sy chap
+or
+.Sy initiator-name ,
+are assigned
+predefined
+.Sy auth-group
+.Qq Ar default ,
+which denies all access.
+Another predefined
+.Sy auth-group ,
+.Qq Ar no-authentication ,
+may be used to permit access
 without authentication.
-.It Ic auth-type Ao Ar type Ac
-Specifies authentication type.
-Type can be either "none", "deny", "chap", or "chap-mutual".
+Note that targets must only use one of
+.Sy auth-group , chap , No or Sy chap-mutual ;
+it is a configuration error to mix multiple types in one target.
+.It Ic auth-type Ar type
+Sets the authentication type.
+Type can be either
+.Qq Ar none ,
+.Qq Ar deny ,
+.Qq Ar chap ,
+or
+.Qq Ar chap-mutual .
 In most cases it is not necessary to set the type using this clause;
-it is usually used to disable authentication for a given target.
-This clause is mutually exclusive with auth-group; one cannot use
+it is usually used to disable authentication for a given
+.Sy target .
+This clause is mutually exclusive with
+.Sy auth-group ;
+one cannot use
 both in a single target.
-.It Ic chap Ao Ar user Ac Aq Ar secret
-Specifies CHAP authentication credentials.
-Note that targets must use either auth-group, or chap,
-or chap-mutual clauses; it is a configuration error to mix them in one target.
-.It Ic chap-mutual Ao Ar user Ac Ao Ar secret Ac Ao Ar mutualuser Ac Aq Ar mutualsecret
-Specifies mutual CHAP authentication credentials.
-Note that targets must use either auth-group, chap, or
-chap-mutual clauses; it is a configuration error to mix them in one target.
-.It Ic initiator-name Ao Ar initiator-name Ac
-Specifies iSCSI initiator name.
+.It Ic chap Ar user Ar secret
+A set of CHAP authentication credentials.
+Note that targets must only use one of
+.Sy auth-group , chap , No or Sy chap-mutual ;
+it is a configuration error to mix multiple types in one target.
+.It Ic chap-mutual Ar user Ar secret Ar mutualuser Ar mutualsecret
+A set of mutual CHAP authentication credentials.
+Note that targets must only use one of
+.Sy auth-group , chap , No or Sy chap-mutual ;
+it is a configuration error to mix multiple types in one target.
+.It Ic initiator-name Ar initiator-name
+An iSCSI initiator name.
+Only initiators with a name matching one of the defined
+names will be allowed to connect.
 If not defined, there will be no restrictions based on initiator
 name.
-Otherwise, only initiators with names matching one of defined
-ones will be allowed to connect.
-This clause is mutually exclusive with auth-group; one cannot use
+This clause is mutually exclusive with
+.Sy auth-group ;
+one cannot use
 both in a single target.
-.It Ic initiator-portal Ao Ar address Ac Ao Ar / prefixlen Ac
-Specifies the iSCSI initiator portal: an IPv4 or IPv6 address, optionally
-followed by slash and prefix length.
+.It Ic initiator-portal Ar address Ns Op / Ns Ar prefixlen
+An iSCSI initiator portal: an IPv4 or IPv6 address, optionally
+followed by a literal slash and a prefix length.
+Only initiators with an address matching one of the defined
+addresses will be allowed to connect.
 If not defined, there will be no restrictions based on initiator
 address.
-Otherwise, only initiators with addresses matching one of defined
-ones will be allowed to connect.
-This clause is mutually exclusive with auth-group; one cannot use
+This clause is mutually exclusive with
+.Sy auth-group ;
+one cannot use
 both in a single target.
-.It Ic portal-group Aq Ar name
-Assigns previously defined portal group to the target.
-Default portal group is "default", which makes the target available
+.It Ic portal-group Ar name
+Assign a previously defined portal group to the target.
+The default portal group is
+.Qq Ar default ,
+which makes the target available
 on TCP port 3260 on all configured IPv4 and IPv6 addresses.
-.It Ic lun Aq Ar number
-Opens a lun configuration section, defining LUN exported by a target.
+.It Ic lun Ar number
+Create a
+.Sy lun
+configuration context, defining a LUN exported by the parent target.
 .El
-.Ss lun level
-The following statements are available at the lun level:
+.Ss lun Context
 .Bl -tag -width indent
-.It Ic backend Ao Ar block | Ar ramdisk Ac
-Specifies the CTL backend to use for a given LUN.
+.It Ic backend Ar block No | Ar ramdisk
+The CTL backend to use for a given LUN.
 Valid choices are
-.Dq block
+.Qq Ar block
 and
-.Dq ramdisk ;
+.Qq Ar ramdisk ;
 block is used for LUNs backed
 by files or disk device nodes; ramdisk is a bitsink device, used mostly for
 testing.
 The default backend is block.
-.It Ic blocksize Aq Ar size
-Specifies blocksize visible to the initiator.
+.It Ic blocksize Ar size
+The blocksize visible to the initiator.
 The default blocksize is 512.
-.It Ic device-id Aq Ar string
-Specifies SCSI Device Identification string presented to the initiator.
-.It Ic option Ao Ar name Ac Aq Ar value
-Specifies CTL-specific options passed to the kernel.
-.It Ic path Aq Ar path
-Specifies path to file or device node used to back the LUN.
-.It Ic serial Aq Ar string
-Specifies SCSI serial number presented to the initiator.
-.It Ic size Aq Ar size
-Specifies LUN size, in bytes.
+.It Ic device-id Ar string
+The SCSI Device Identification string presented to the initiator.
+.It Ic option Ar name Ar value
+The CTL-specific options passed to the kernel.
+All CTL-specific options are documented in the
+.Sx OPTIONS
+section of
+.Xr ctladm 8 .
+.It Ic path Ar path
+The path to the file or device node used to back the LUN.
+.It Ic serial Ar string
+The SCSI serial number presented to the initiator.
+.It Ic size Ar size
+The LUN size, in bytes.
 .El
 .Sh FILES
 .Bl -tag -width ".Pa /etc/ctl.conf" -compact
 .It Pa /etc/ctl.conf
 The default location of the
 .Xr ctld 8
 configuration file.
 .El
 .Sh EXAMPLES
 .Bd -literal
 pidfile	/var/run/ctld.pid
 
 auth-group example2 {
 	chap-mutual "user" "secret" "mutualuser" "mutualsecret"
 	chap-mutual "user2" "secret2" "mutualuser" "mutualsecret"
 	initiator-portal 192.168.1.1/16
 }
 
 portal-group example2 {
 	discovery-auth-group no-authentication
 	listen 127.0.0.1
 	listen 0.0.0.0:3261
 	listen [::]:3261
 	listen [fe80::be:ef]
 }
 
 target iqn.2012-06.com.example:target0 {
 	alias "Example target"
 	auth-group no-authentication
 	lun 0 {
 		path /dev/zvol/example_0
 		blocksize 4096
 		size 4G
 	}
 }
 
 target iqn.2012-06.com.example:target3 {
 	chap chapuser chapsecret
 	lun 0 {
 		path /dev/zvol/example_3
 	}
 }
 
 target iqn.2012-06.com.example:target2 {
 	auth-group example2
 	portal-group example2
 	lun 0 {
 		path /dev/zvol/example2_0
 	}
 	lun 1 {
 		path /dev/zvol/example2_1
 		option foo bar
 	}
 }
 .Ed
 .Sh SEE ALSO
 .Xr ctl 4 ,
 .Xr ctladm 8 ,
 .Xr ctld 8
 .Sh AUTHORS
 The
 .Nm
 configuration file functionality for
 .Xr ctld 8
 was developed by
 .An Edward Tomasz Napierala Aq Mt trasz@FreeBSD.org
 under sponsorship from the FreeBSD Foundation.
Index: user/ae/inet6/usr.sbin/iscsid/login.c
===================================================================
--- user/ae/inet6/usr.sbin/iscsid/login.c	(revision 271452)
+++ user/ae/inet6/usr.sbin/iscsid/login.c	(revision 271453)
@@ -1,959 +1,981 @@
 /*-
  * Copyright (c) 2012 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed by Edward Tomasz Napierala under sponsorship
  * from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/types.h>
 #include <sys/ioctl.h>
 #include <assert.h>
 #include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <netinet/in.h>
 #include <openssl/err.h>
 #include <openssl/md5.h>
 #include <openssl/rand.h>
 
 #include "iscsid.h"
 #include "iscsi_proto.h"
 
 static int
 login_nsg(const struct pdu *response)
 {
 	struct iscsi_bhs_login_response *bhslr;
 
 	bhslr = (struct iscsi_bhs_login_response *)response->pdu_bhs;
 
 	return (bhslr->bhslr_flags & 0x03);
 }
 
 /*
  * Store the next stage 'nsg' in the two low-order bits of the flags byte
  * of a Login Request, leaving the other flag bits untouched.
  */
 static void
 login_set_nsg(struct pdu *request, int nsg)
 {
 	struct iscsi_bhs_login_request *hdr;
 
 	assert(nsg == BHSLR_STAGE_SECURITY_NEGOTIATION ||
 	    nsg == BHSLR_STAGE_OPERATIONAL_NEGOTIATION ||
 	    nsg == BHSLR_STAGE_FULL_FEATURE_PHASE);
 
 	hdr = (struct iscsi_bhs_login_request *)request->pdu_bhs;
 
 	/* Clear bits 0-1, then merge in the new stage */
 	hdr->bhslr_flags = (hdr->bhslr_flags & 0xFC) | nsg;
 }
 
 /*
  * Store the current stage 'csg' in bits 2-3 of the flags byte of a Login
  * Request, leaving the other flag bits untouched.
  */
 static void
 login_set_csg(struct pdu *request, int csg)
 {
 	struct iscsi_bhs_login_request *hdr;
 
 	assert(csg == BHSLR_STAGE_SECURITY_NEGOTIATION ||
 	    csg == BHSLR_STAGE_OPERATIONAL_NEGOTIATION ||
 	    csg == BHSLR_STAGE_FULL_FEATURE_PHASE);
 
 	hdr = (struct iscsi_bhs_login_request *)request->pdu_bhs;
 
 	/* Clear bits 2-3, then merge in the new stage */
 	hdr->bhslr_flags = (hdr->bhslr_flags & 0xF3) | (csg << 2);
 }
 
 /*
  * Translate the Status-Class/Status-Detail pair of a Login Response into
  * a human-readable string (RFC 3720, 10.13.5.  Status-Class and
  * Status-Detail).
  *
  * Known combinations yield a string constant.  Anything else is formatted
  * into a static buffer, so that result is overwritten by the next call
  * that has to format a message.
  */
 static const char *
 login_target_error_str(int class, int detail)
 {
 	/* Status-Class 0x01, indexed by Status-Detail - 1 */
 	static const char *const redirection[] = {
 		"Target moved temporarily",
 		"Target moved permanently",
 	};
 	/* Status-Class 0x02, indexed by Status-Detail */
 	static const char *const initiator[] = {
 		"Initiator error",
 		"Authentication failure",
 		"Authorization failure",
 		"Not found",
 		"Target removed",
 		"Unsupported version",
 		"Too many connections",
 		"Missing parameter",
 		"Can't include in session",
 		"Session type not supported",
 		"Session does not exist",
 		"Invalid during login",
 	};
 	/* Status-Class 0x03, indexed by Status-Detail */
 	static const char *const target[] = {
 		"Target error",
 		"Service unavailable",
 		"Out of resources",
 	};
 	static char msg[128];
 
 	switch (class) {
 	case 0x01:
 		if (detail >= 0x01 && detail <= 0x02)
 			return (redirection[detail - 0x01]);
 		snprintf(msg, sizeof(msg), "unknown redirection; "
 		    "Status-Class 0x%x, Status-Detail 0x%x",
 		    class, detail);
 		break;
 	case 0x02:
 		if (detail >= 0x00 && detail <= 0x0b)
 			return (initiator[detail]);
 		snprintf(msg, sizeof(msg), "unknown initiator error; "
 		    "Status-Class 0x%x, Status-Detail 0x%x",
 		    class, detail);
 		break;
 	case 0x03:
 		if (detail >= 0x00 && detail <= 0x02)
 			return (target[detail]);
 		snprintf(msg, sizeof(msg), "unknown target error; "
 		    "Status-Class 0x%x, Status-Detail 0x%x",
 		    class, detail);
 		break;
 	default:
 		snprintf(msg, sizeof(msg), "unknown error; "
 		    "Status-Class 0x%x, Status-Detail 0x%x",
 		    class, detail);
 		break;
 	}
 
 	return (msg);
 }
 
 /*
  * Issue ISCSISMODIFY for the session of 'conn', passing the connection's
  * current configuration with the target address replaced by
  * 'target_address'.  A failing ioctl is reported through log_err().
  */
 static void
 kernel_modify(const struct connection *conn, const char *target_address)
 {
 	struct iscsi_session_modify ism;
 	int error;
 
 	memset(&ism, 0, sizeof(ism));
 	ism.ism_session_id = conn->conn_session_id;
 	memcpy(&ism.ism_conf, &conn->conn_conf, sizeof(ism.ism_conf));
 	/*
 	 * Bound the copy by the size of the field being written; using the
 	 * size of isc_target here would let an over-long address run past
 	 * the end of isc_target_addr.
 	 */
 	strlcpy(ism.ism_conf.isc_target_addr, target_address,
 	    sizeof(ism.ism_conf.isc_target_addr));
 	error = ioctl(conn->conn_iscsi_fd, ISCSISMODIFY, &ism);
 	if (error != 0) {
 		log_err(1, "failed to redirect to %s: ISCSISMODIFY",
 		    target_address);
 	}
 }
 
 /*
  * Handle a Login Response with Status-Class 1 (redirection): look up the
  * TargetAddress key in the response and pass it to kernel_modify().
  * A missing, empty or over-long address is reported through log_errx().
  *
  * XXX:	The way it works is suboptimal; what should happen is described
  *	in draft-gilligan-iscsi-fault-tolerance-00.  That, however, would
  *	be much more complicated: we would need to keep "dependencies"
  *	for sessions, so that, in case described in draft and using draft
  *	terminology, we would have three sessions: one for discovery,
  *	one for initial target portal, and one for redirect portal.
  *	This would allow us to "backtrack" on connection failure,
  *	as described in draft.
  */
 static void
 login_handle_redirection(struct connection *conn, struct pdu *response)
 {
 	struct iscsi_bhs_login_response *hdr;
 	struct keys *keys;
 	const char *addr;
 	size_t addr_max;
 
 	hdr = (struct iscsi_bhs_login_response *)response->pdu_bhs;
 	assert (hdr->bhslr_status_class == 1);
 
 	keys = keys_new();
 	keys_load(keys, response);
 
 	addr_max = sizeof(conn->conn_conf.isc_target_addr) - 1;
 	addr = keys_find(keys, "TargetAddress");
 	if (addr == NULL)
 		log_errx(1, "received redirection without TargetAddress");
 	if (*addr == '\0')
 		log_errx(1, "received redirection with empty TargetAddress");
 	if (strlen(addr) >= addr_max)
 		log_errx(1, "received TargetAddress is too long");
 
 	/* 'addr' points into 'keys', so use it before releasing them */
 	log_debugx("received redirection to \"%s\"", addr);
 	kernel_modify(conn, addr);
 	keys_delete(keys);
 }
 
 /*
  * Receive one Login Response PDU on 'conn' and validate it.
  *
  * A wrong opcode, the "C" flag, a non-zero Version-max/Version-active or
  * an error Status-Class are reported through log_errx().  A redirection
  * (Status-Class 1) is handled and then the process exits with status 0.
  * On success the TSIH and StatSN of the response are recorded in 'conn'
  * and the PDU is returned.
  */
 static struct pdu *
 login_receive(struct connection *conn)
 {
 	/* No StatSN to compare against until one response has been seen */
 	static bool first_pdu = true;
 	struct iscsi_bhs_login_response *hdr;
 	struct pdu *pdu;
 	const char *reason;
 
 	pdu = pdu_new(conn);
 	pdu_receive(pdu);
 	if (pdu->pdu_bhs->bhs_opcode != ISCSI_BHS_OPCODE_LOGIN_RESPONSE) {
 		log_errx(1, "protocol error: received invalid opcode 0x%x",
 		    pdu->pdu_bhs->bhs_opcode);
 	}
 	hdr = (struct iscsi_bhs_login_response *)pdu->pdu_bhs;
 
 	/*
 	 * XXX: Implement the C flag some day.
 	 */
 	if ((hdr->bhslr_flags & BHSLR_FLAGS_CONTINUE) != 0)
 		log_errx(1, "received Login PDU with unsupported \"C\" flag");
 	if (hdr->bhslr_version_max != 0x00)
 		log_errx(1, "received Login PDU with unsupported "
 		    "Version-max 0x%x", hdr->bhslr_version_max);
 	if (hdr->bhslr_version_active != 0x00)
 		log_errx(1, "received Login PDU with unsupported "
 		    "Version-active 0x%x", hdr->bhslr_version_active);
 
 	if (hdr->bhslr_status_class == 1) {
 		login_handle_redirection(conn, pdu);
 		log_debugx("redirection handled; exiting");
 		exit(0);
 	} else if (hdr->bhslr_status_class != 0) {
 		reason = login_target_error_str(hdr->bhslr_status_class,
 		    hdr->bhslr_status_detail);
 		fail(conn, reason);
 		log_errx(1, "target returned error: %s", reason);
 	}
 
 	if (!first_pdu &&
 	    ntohl(hdr->bhslr_statsn) != conn->conn_statsn + 1) {
 		/*
 		 * It's a warning, not an error, to work around what seems
 		 * to be bug in NetBSD iSCSI target.
 		 */
 		log_warnx("received Login PDU with wrong StatSN: "
 		    "is %d, should be %d", ntohl(hdr->bhslr_statsn),
 		    conn->conn_statsn + 1);
 	}
 	conn->conn_tsih = ntohs(hdr->bhslr_tsih);
 	conn->conn_statsn = ntohl(hdr->bhslr_statsn);
 
 	first_pdu = false;
 
 	return (pdu);
 }
 
 /*
  * Allocate a Login Request PDU for 'conn' in stage 'csg'.
  *
  * The request carries the Transit flag and names the stage following
  * 'csg' as its next stage; ISID, TSIH and ExpStatSN are filled in from
  * the connection.  'csg' must be the security or the operational
  * negotiation stage.
  */
 static struct pdu *
 login_new_request(struct connection *conn, int csg)
 {
 	struct iscsi_bhs_login_request *hdr;
 	struct pdu *pdu;
 	int next_stage;
 
 	pdu = pdu_new(conn);
 	hdr = (struct iscsi_bhs_login_request *)pdu->pdu_bhs;
 	hdr->bhslr_opcode = ISCSI_BHS_OPCODE_LOGIN_REQUEST |
 	    ISCSI_BHS_OPCODE_IMMEDIATE;
 
 	hdr->bhslr_flags = BHSLR_FLAGS_TRANSIT;
 	if (csg == BHSLR_STAGE_SECURITY_NEGOTIATION) {
 		next_stage = BHSLR_STAGE_OPERATIONAL_NEGOTIATION;
 	} else if (csg == BHSLR_STAGE_OPERATIONAL_NEGOTIATION) {
 		next_stage = BHSLR_STAGE_FULL_FEATURE_PHASE;
 	} else {
 		assert(!"invalid csg");
 		log_errx(1, "invalid csg %d", csg);
 	}
 	login_set_csg(pdu, csg);
 	login_set_nsg(pdu, next_stage);
 
 	memcpy(hdr->bhslr_isid, &conn->conn_isid, sizeof(hdr->bhslr_isid));
 	hdr->bhslr_tsih = htons(conn->conn_tsih);
 	hdr->bhslr_initiator_task_tag = 0;
 	hdr->bhslr_cmdsn = 0;
 	hdr->bhslr_expstatsn = htonl(conn->conn_statsn + 1);
 
 	return (pdu);
 }
 
 /*
  * Scan the comma-separated 'list' from the front and report which of the
  * two choices shows up first: 1 for 'choice1', 2 for 'choice2', or -1
  * when the list contains neither.
  */
 static int
 login_list_prefers(const char *list,
     const char *choice1, const char *choice2)
 {
 	char *copy, *cursor, *item;
 	int which;
 
 	/* strsep() modifies its input, so work on a private copy */
 	copy = cursor = checked_strdup(list);
 
 	which = -1;
 	while (which == -1 && (item = strsep(&cursor, ",")) != NULL) {
 		if (strcmp(item, choice1) == 0)
 			which = 1;
 		else if (strcmp(item, choice2) == 0)
 			which = 2;
 	}
 
 	free(copy);
 	return (which);
 }
 
 /*
  * Convert one hexadecimal digit (either case) to its value, 0 through 15.
  * Returns -1 when 'hex' is not a hexadecimal digit.
  */
 static int
 login_hex2int(const char hex)
 {
 
 	if (hex >= '0' && hex <= '9')
 		return (hex - '0');
 	if (hex >= 'a' && hex <= 'f')
 		return (hex - 'a' + 0x0a);
 	if (hex >= 'A' && hex <= 'F')
 		return (hex - 'A' + 0x0a);
 
 	return (-1);
 }
 
 /*
  * Decode a "0x"-prefixed (case-insensitive) string of hexadecimal digits
  * into a newly allocated byte buffer.
  *
  * Digits are consumed from the end of the string, so an odd number of
  * digits yields a leading byte holding only a low nibble.  On success
  * returns 0 and stores the buffer and its length through 'binp' and
  * 'bin_lenp'; the caller owns the buffer.  On malformed input a warning is
  * logged, -1 is returned and the outputs are left untouched.
  *
  * XXX: Review this _carefully_.
  */
 static int
 login_hex2bin(const char *hex, char **binp, size_t *bin_lenp)
 {
 	int i, hex_len, nibble;
 	bool lo = true; /* As opposed to 'hi'. */
 	char *bin;
 	size_t bin_off, bin_len;
 
 	if (strncasecmp(hex, "0x", strlen("0x")) != 0) {
 		log_warnx("malformed variable, should start with \"0x\"");
 		return (-1);
 	}
 
 	hex += strlen("0x");
 	hex_len = strlen(hex);
 	if (hex_len < 1) {
 		log_warnx("malformed variable; doesn't contain anything "
 		    "but \"0x\"");
 		return (-1);
 	}
 
 	/* Two digits per byte, rounded up for an odd digit count */
 	bin_len = hex_len / 2 + hex_len % 2;
 	bin = calloc(bin_len, 1);
 	if (bin == NULL)
 		log_err(1, "calloc");
 
 	bin_off = bin_len - 1;
 	for (i = hex_len - 1; i >= 0; i--) {
 		nibble = login_hex2int(hex[i]);
 		if (nibble < 0) {
 			log_warnx("malformed variable, invalid char \"%c\"",
 			    hex[i]);
 			/* Don't leak the partially filled buffer */
 			free(bin);
 			return (-1);
 		}
 
 		assert(bin_off < bin_len);
 		if (lo) {
 			bin[bin_off] = nibble;
 			lo = false;
 		} else {
 			bin[bin_off] |= nibble << 4;
 			bin_off--;
 			lo = true;
 		}
 	}
 
 	*binp = bin;
 	*bin_lenp = bin_len;
 	return (0);
 }
 
 /*
  * Render bin_len bytes of "bin" as a NUL-terminated string made of "0x"
  * followed by two lowercase hexadecimal digits per byte.
  *
  * The string is obtained from malloc(3); the caller is expected to
  * free(3) it.
  */
 static char *
 login_bin2hex(const char *bin, size_t bin_len)
 {
 	char *hex, *tmp;
 	unsigned char ch;
 	size_t hex_len;
 	size_t i;
 
 	hex_len = bin_len * 2 + 3; /* +2 for "0x", +1 for '\0'. */
 	hex = malloc(hex_len);
 	if (hex == NULL)
 		log_err(1, "malloc");
 
 	tmp = hex;
 	tmp += sprintf(tmp, "0x");
 	for (i = 0; i < bin_len; i++) {
 		/*
 		 * Go through an unsigned char, so that a byte with the
 		 * top bit set is not sign-extended when printed.
 		 */
 		ch = bin[i];
 		tmp += sprintf(tmp, "%02x", ch);
 	}
 
 	return (hex);
 }
 
 /*
  * Hash, in this order, the single "id" byte, the bytes of the
  * NUL-terminated "secret" (terminator excluded) and challenge_len bytes
  * of "challenge" with MD5, storing the digest in "response".
  *
  * response_len must be exactly MD5_DIGEST_LENGTH.
  */
 static void
 login_compute_md5(const char id, const char *secret,
     const void *challenge, size_t challenge_len, void *response,
     size_t response_len)
 {
 	MD5_CTX md5;
 
 	assert(response_len == MD5_DIGEST_LENGTH);
 
 	MD5_Init(&md5);
 	MD5_Update(&md5, &id, sizeof(id));
 	MD5_Update(&md5, secret, strlen(secret));
 	MD5_Update(&md5, challenge, challenge_len);
 
 	if (MD5_Final(response, &md5) != 1)
 		log_errx(1, "MD5_Final");
 }
 
 /*
  * Apply a single "name=value" pair received from the target to the
  * connection state.
  *
  * TargetAlias is always stored.  For every other key a value of
  * "Irrelevant" is skipped.  Digest, InitialR2T, ImmediateData and the
  * length keys update "conn"; a fixed set of keys is accepted without any
  * action; anything else is logged at debug level and ignored.
  */
 static void
 login_negotiate_key(struct connection *conn, const char *name,
     const char *value)
 {
 	/* Keys that are recognized, but deliberately not acted upon. */
 	static const char *const ignored[] = {
 		"MaxConnections",
 		"DefaultTime2Wait",
 		"DefaultTime2Retain",
 		"MaxOutstandingR2T",
 		"DataPDUInOrder",
 		"DataSequenceInOrder",
 		"ErrorRecoveryLevel",
 		"OFMarker",
 		"IFMarker",
 		"TargetPortalGroupTag",
 	};
 	size_t n;
 	int choice, number;
 
 	if (strcmp(name, "TargetAlias") == 0) {
 		strlcpy(conn->conn_target_alias, value,
 		    sizeof(conn->conn_target_alias));
 		return;
 	}
 
 	if (strcmp(value, "Irrelevant") == 0)
 		return;
 
 	if (strcmp(name, "HeaderDigest") == 0) {
 		choice = login_list_prefers(value, "CRC32C", "None");
 		if (choice == 1) {
 			log_debugx("target prefers CRC32C "
 			    "for header digest; we'll use it");
 			conn->conn_header_digest = CONN_DIGEST_CRC32C;
 		} else if (choice == 2) {
 			log_debugx("target prefers not to do "
 			    "header digest; we'll comply");
 		} else {
 			log_warnx("target sent unrecognized "
 			    "HeaderDigest value \"%s\"; will use None", value);
 		}
 		return;
 	}
 
 	if (strcmp(name, "DataDigest") == 0) {
 		choice = login_list_prefers(value, "CRC32C", "None");
 		if (choice == 1) {
 			log_debugx("target prefers CRC32C "
 			    "for data digest; we'll use it");
 			conn->conn_data_digest = CONN_DIGEST_CRC32C;
 		} else if (choice == 2) {
 			log_debugx("target prefers not to do "
 			    "data digest; we'll comply");
 		} else {
 			log_warnx("target sent unrecognized "
 			    "DataDigest value \"%s\"; will use None", value);
 		}
 		return;
 	}
 
 	if (strcmp(name, "InitialR2T") == 0) {
 		conn->conn_initial_r2t = (strcmp(value, "Yes") == 0);
 		return;
 	}
 
 	if (strcmp(name, "ImmediateData") == 0) {
 		conn->conn_immediate_data = (strcmp(value, "Yes") == 0);
 		return;
 	}
 
 	if (strcmp(name, "MaxRecvDataSegmentLength") == 0) {
 		number = strtoul(value, NULL, 10);
 		if (number <= 0)
 			log_errx(1, "received invalid "
 			    "MaxRecvDataSegmentLength");
 		conn->conn_max_data_segment_length = number;
 		return;
 	}
 
 	if (strcmp(name, "MaxBurstLength") == 0) {
 		/* Only looked at when immediate data is enabled. */
 		if (!conn->conn_immediate_data)
 			return;
 		number = strtoul(value, NULL, 10);
 		if (number <= 0)
 			log_errx(1, "received invalid MaxBurstLength");
 		conn->conn_max_burst_length = number;
 		return;
 	}
 
 	if (strcmp(name, "FirstBurstLength") == 0) {
 		number = strtoul(value, NULL, 10);
 		if (number <= 0)
 			log_errx(1, "received invalid FirstBurstLength");
 		conn->conn_first_burst_length = number;
 		return;
 	}
 
 	for (n = 0; n < sizeof(ignored) / sizeof(ignored[0]); n++) {
 		if (strcmp(name, ignored[n]) == 0)
 			return;
 	}
 
 	log_debugx("unknown key \"%s\"; ignoring",  name);
 }
 
 /*
  * Run the operational parameter negotiation stage of the login: send a
  * request with the keys we propose, feed every key of the target's
  * response to login_negotiate_key(), and warn if the response does not
  * name the Full Feature phase as the next stage.
  *
  * A response lacking the "T" (transit) flag is dealt with after the keys
  * have been applied.
  */
 static void
 login_negotiate(struct connection *conn)
 {
 	struct pdu *request, *response;
 	struct keys *request_keys, *response_keys;
 	struct iscsi_bhs_login_response *bhslr;
-	int i;
+	int i, nrequests = 0;
 
 	log_debugx("beginning operational parameter negotiation");
 	request = login_new_request(conn, BHSLR_STAGE_OPERATIONAL_NEGOTIATION);
 	request_keys = keys_new();
 
 	/*
 	 * The following keys are irrelevant for discovery sessions.
 	 */
 	if (conn->conn_conf.isc_discovery == 0) {
 		if (conn->conn_conf.isc_header_digest != 0)
 			keys_add(request_keys, "HeaderDigest", "CRC32C");
 		else
 			keys_add(request_keys, "HeaderDigest", "None");
 		if (conn->conn_conf.isc_data_digest != 0)
 			keys_add(request_keys, "DataDigest", "CRC32C");
 		else
 			keys_add(request_keys, "DataDigest", "None");
 
 		keys_add(request_keys, "ImmediateData", "Yes");
 		keys_add_int(request_keys, "MaxBurstLength",
 		    ISCSI_MAX_DATA_SEGMENT_LENGTH);
 		keys_add_int(request_keys, "FirstBurstLength",
 		    ISCSI_MAX_DATA_SEGMENT_LENGTH);
 		keys_add(request_keys, "InitialR2T", "Yes");
 	} else {
 		keys_add(request_keys, "HeaderDigest", "None");
 		keys_add(request_keys, "DataDigest", "None");
 	}
 
 	/* The remaining keys are sent regardless of the session type. */
 	keys_add_int(request_keys, "MaxRecvDataSegmentLength",
 	    ISCSI_MAX_DATA_SEGMENT_LENGTH);
 	keys_add(request_keys, "DefaultTime2Wait", "0");
 	keys_add(request_keys, "DefaultTime2Retain", "0");
 	keys_add(request_keys, "ErrorRecoveryLevel", "0");
 	keys_add(request_keys, "MaxOutstandingR2T", "1");
 	keys_save(request_keys, request);
 	keys_delete(request_keys);
 	request_keys = NULL;
 	pdu_send(request);
 	pdu_delete(request);
 	request = NULL;
 
 	/*
 	 * Apply every key the target sent back; the first unused slot
 	 * marks the end of the list.
 	 */
 	response = login_receive(conn);
 	response_keys = keys_new();
 	keys_load(response_keys, response);
 	for (i = 0; i < KEYS_MAX; i++) {
 		if (response_keys->keys_names[i] == NULL)
 			break;
 
 		login_negotiate_key(conn,
 		    response_keys->keys_names[i], response_keys->keys_values[i]);
 	}
 
-	bhslr = (struct iscsi_bhs_login_response *)response->pdu_bhs;
-	if ((bhslr->bhslr_flags & BHSLR_FLAGS_TRANSIT) == 0)
-		log_warnx("received final login response "
-		    "without the \"T\" flag");
-	else if (login_nsg(response) != BHSLR_STAGE_FULL_FEATURE_PHASE)
+	keys_delete(response_keys);
+	response_keys = NULL;
+
+	for (;;) {
+		bhslr = (struct iscsi_bhs_login_response *)response->pdu_bhs;
+		if ((bhslr->bhslr_flags & BHSLR_FLAGS_TRANSIT) != 0)
+			break;
+
+		nrequests++;
+		if (nrequests > 5) {
+			log_warnx("received login response "
+			    "without the \"T\" flag too many times; giving up");
+			break;
+		}
+
+		log_debugx("received login response "
+		    "without the \"T\" flag; sending another request");
+
+		pdu_delete(response);
+
+		request = login_new_request(conn,
+		    BHSLR_STAGE_OPERATIONAL_NEGOTIATION);
+		pdu_send(request);
+		pdu_delete(request);
+
+		response = login_receive(conn);
+	}
+
+	if (login_nsg(response) != BHSLR_STAGE_FULL_FEATURE_PHASE)
 		log_warnx("received final login response with wrong NSG 0x%x",
 		    login_nsg(response));
+	pdu_delete(response);
 
 	log_debugx("operational parameter negotiation done; "
 	    "transitioning to Full Feature phase");
-
-	keys_delete(response_keys);
-	pdu_delete(response);
 }
 
 /*
  * Send a Security Negotiation stage login request whose only key is
  * "CHAP_A" with the value "5".
  */
 static void
 login_send_chap_a(struct connection *conn)
 {
 	struct pdu *pdu;
 	struct keys *keys;
 
 	pdu = login_new_request(conn, BHSLR_STAGE_SECURITY_NEGOTIATION);
 
 	/* The key set is only needed until it is saved into the PDU. */
 	keys = keys_new();
 	keys_add(keys, "CHAP_A", "5");
 	keys_save(keys, pdu);
 	keys_delete(keys);
 
 	pdu_send(pdu);
 	pdu_delete(pdu);
 }
 
 /*
  * Answer the target's CHAP challenge carried by "response".
  *
  * The PDU must contain CHAP_A (only "5" is accepted), CHAP_C and CHAP_I;
  * a missing or malformed key is reported through log_errx(1, ...).  The
  * reply carries CHAP_N (our user name) and CHAP_R (the hex-encoded MD5 of
  * the identifier, our secret and the decoded challenge).  When a mutual
  * user is configured, a random CHAP_I/CHAP_C pair is generated, remembered
  * in the connection and sent along, so that the target's answer can be
  * checked later.
  *
  * "response" is only read; the caller keeps ownership of it.
  */
 static void
 login_send_chap_r(struct pdu *response)
 {
 	struct connection *conn;
 	struct pdu *request;
 	struct keys *request_keys, *response_keys;
 	const char *chap_a, *chap_c, *chap_i;
 	char *chap_r, *challenge, response_bin[MD5_DIGEST_LENGTH];
 	size_t challenge_len;
 	int error, rv;
 	unsigned char id;
 	char *mutual_chap_c, mutual_chap_i[4];
 
 	/*
 	 * As in the rest of the initiator, 'request' means
 	 * 'initiator -> target', and 'response' means 'target -> initiator',
 	 *
 	 * So, here the 'response' from the target is the packet that contains
 	 * CHAP challenge; our CHAP response goes into 'request'.
 	 */
 
 	conn = response->pdu_connection;
 
 	response_keys = keys_new();
 	keys_load(response_keys, response);
 
 	/*
 	 * First, compute the response.
 	 */
 	chap_a = keys_find(response_keys, "CHAP_A");
 	if (chap_a == NULL)
 		log_errx(1, "received CHAP packet without CHAP_A");
 	chap_c = keys_find(response_keys, "CHAP_C");
 	if (chap_c == NULL)
 		log_errx(1, "received CHAP packet without CHAP_C");
 	chap_i = keys_find(response_keys, "CHAP_I");
 	if (chap_i == NULL)
 		log_errx(1, "received CHAP packet without CHAP_I");
 
 	if (strcmp(chap_a, "5") != 0)
 		log_errx(1, "received CHAP packet "
 		    "with unsupported CHAP_A \"%s\"", chap_a);
 	id = strtoul(chap_i, NULL, 10);
 	error = login_hex2bin(chap_c, &challenge, &challenge_len);
 	if (error != 0)
 		log_errx(1, "received CHAP packet with malformed CHAP_C");
 	login_compute_md5(id, conn->conn_conf.isc_secret,
 	    challenge, challenge_len, response_bin, sizeof(response_bin));
 	free(challenge);
 	chap_r = login_bin2hex(response_bin, sizeof(response_bin));
 
 	/* chap_a, chap_c and chap_i are not used past this point. */
 	keys_delete(response_keys);
 
 	request = login_new_request(conn, BHSLR_STAGE_SECURITY_NEGOTIATION);
 	request_keys = keys_new();
 	keys_add(request_keys, "CHAP_N", conn->conn_conf.isc_user);
 	keys_add(request_keys, "CHAP_R", chap_r);
 	free(chap_r);
 
 	/*
 	 * If we want mutual authentication, we're expected to send
 	 * our CHAP_I/CHAP_C now.
 	 */
 	if (conn->conn_conf.isc_mutual_user[0] != '\0') {
 		/* sizeof yields a size_t, which is printed with %zu. */
 		log_debugx("requesting mutual authentication; "
 		    "binary challenge size is %zu bytes",
 		    sizeof(conn->conn_mutual_challenge));
 
 		rv = RAND_bytes(conn->conn_mutual_challenge,
 		    sizeof(conn->conn_mutual_challenge));
 		if (rv != 1) {
 			log_errx(1, "RAND_bytes failed: %s",
 			    ERR_error_string(ERR_get_error(), NULL));
 		}
 		rv = RAND_bytes(&conn->conn_mutual_id,
 		    sizeof(conn->conn_mutual_id));
 		if (rv != 1) {
 			log_errx(1, "RAND_bytes failed: %s",
 			    ERR_error_string(ERR_get_error(), NULL));
 		}
 		mutual_chap_c = login_bin2hex(conn->conn_mutual_challenge,
 		    sizeof(conn->conn_mutual_challenge));
 		snprintf(mutual_chap_i, sizeof(mutual_chap_i),
 		    "%d", conn->conn_mutual_id);
 		keys_add(request_keys, "CHAP_I", mutual_chap_i);
 		keys_add(request_keys, "CHAP_C", mutual_chap_c);
 		free(mutual_chap_c);
 	}
 
 	keys_save(request_keys, request);
 	keys_delete(request_keys);
 	pdu_send(request);
 	pdu_delete(request);
 }
 
 /*
  * Check the target's half of mutual CHAP.
  *
  * "response" must carry CHAP_N and CHAP_R.  CHAP_N has to equal the
  * configured mutual user name, and the decoded CHAP_R has to be exactly
  * the MD5 digest computed over the identifier and challenge we generated
  * earlier together with the configured mutual secret.  Any mismatch is
  * reported through fail() followed by log_errx(1, ...).
  */
 static void
 login_verify_mutual(const struct pdu *response)
 {
 	struct connection *conn;
 	struct keys *response_keys;
 	const char *chap_n, *chap_r;
 	char *response_bin, expected_response_bin[MD5_DIGEST_LENGTH];
 	size_t response_bin_len;
 	int error;
 
 	conn = response->pdu_connection;
 
 	response_keys = keys_new();
 	keys_load(response_keys, response);
 
 	chap_n = keys_find(response_keys, "CHAP_N");
 	if (chap_n == NULL)
 		log_errx(1, "received CHAP Response PDU without CHAP_N");
 	chap_r = keys_find(response_keys, "CHAP_R");
 	if (chap_r == NULL)
 		log_errx(1, "received CHAP Response PDU without CHAP_R");
 	error = login_hex2bin(chap_r, &response_bin, &response_bin_len);
 	if (error != 0)
 		log_errx(1, "received CHAP Response PDU with malformed CHAP_R");
 
 	if (strcmp(chap_n, conn->conn_conf.isc_mutual_user) != 0) {
 		fail(conn, "Mutual CHAP failed");
 		log_errx(1, "mutual CHAP authentication failed: wrong user");
 	}
 
 	login_compute_md5(conn->conn_mutual_id,
 	    conn->conn_conf.isc_mutual_secret, conn->conn_mutual_challenge,
 	    sizeof(conn->conn_mutual_challenge), expected_response_bin,
 	    sizeof(expected_response_bin));
 
 	/*
 	 * The length comes from the target, so check it before comparing:
 	 * a CHAP_R shorter than a digest would otherwise make memcmp()
 	 * read past the end of the decoded buffer.
 	 */
 	if (response_bin_len != sizeof(expected_response_bin) ||
 	    memcmp(response_bin, expected_response_bin,
 	    sizeof(expected_response_bin)) != 0) {
 		fail(conn, "Mutual CHAP failed");
 		log_errx(1, "mutual CHAP authentication failed: wrong secret");
 	}
 
 	keys_delete(response_keys);
 	free(response_bin);
 
 	log_debugx("mutual CHAP authentication succeeded");
 }
 
 /*
  * Carry out the CHAP exchange on "conn": send CHAP_A, answer the
  * challenge the target returns, then wait for the result, verifying the
  * target's own answer when a mutual user is configured.
  */
 static void
 login_chap(struct connection *conn)
 {
 	struct pdu *challenge_pdu, *result_pdu;
 
 	log_debugx("beginning CHAP authentication; sending CHAP_A");
 	login_send_chap_a(conn);
 
 	log_debugx("waiting for CHAP_A/CHAP_C/CHAP_I");
 	challenge_pdu = login_receive(conn);
 
 	log_debugx("sending CHAP_N/CHAP_R");
 	login_send_chap_r(challenge_pdu);
 	pdu_delete(challenge_pdu);
 
 	/*
 	 * XXX: Make sure this is not susceptible to MITM.
 	 */
 
 	log_debugx("waiting for CHAP result");
 	result_pdu = login_receive(conn);
 	if (conn->conn_conf.isc_mutual_user[0] != '\0') {
 		/* Mutual CHAP was requested; check the target's answer. */
 		login_verify_mutual(result_pdu);
 	}
 	pdu_delete(result_pdu);
 
 	log_debugx("CHAP authentication done");
 }
 
 /*
  * Perform the Login phase on "conn".
  *
  * The first request offers an AuthMethod chosen from the configuration
  * ("CHAP" when a mutual user is set, "None,CHAP" when only a user is
  * set, "None" otherwise) together with the initiator name, the optional
  * alias, the session type and, for normal sessions, the target name.
  *
  * Every key of the response except AuthMethod is applied through
  * login_negotiate_key().  After that:
  *  - if the target asks to transit to operational negotiation, or
  *    answers AuthMethod=None, login_negotiate() is run directly; both
  *    cases are refused when mutual CHAP is required;
  *  - if the target answers AuthMethod=CHAP, login_chap() is run first
  *    and login_negotiate() afterwards; this needs both a user and
  *    a secret;
  *  - any other AuthMethod is rejected.
  */
 void
 login(struct connection *conn)
 {
 	struct pdu *request, *response;
 	struct keys *request_keys, *response_keys;
 	struct iscsi_bhs_login_response *bhslr2;
 	const char *auth_method;
 	int i;
 
 	log_debugx("beginning Login phase; sending Login PDU");
 	request = login_new_request(conn, BHSLR_STAGE_SECURITY_NEGOTIATION);
 	request_keys = keys_new();
 	if (conn->conn_conf.isc_mutual_user[0] != '\0') {
 		keys_add(request_keys, "AuthMethod", "CHAP");
 	} else if (conn->conn_conf.isc_user[0] != '\0') {
 		/*
 		 * Give target a chance to skip authentication if it
 		 * doesn't feel like it.
 		 *
 		 * None is first, CHAP second; this is to work around
 		 * what seems to be LIO (Linux target) bug: otherwise,
 		 * if target is configured with no authentication,
 		 * and we are configured to authenticate, the target
 		 * will erroneously respond with AuthMethod=CHAP
 		 * instead of AuthMethod=None, and will subsequently
 		 * fail the connection.  This usually happens with
 		 * Discovery sessions, which default to no authentication.
 		 */
 		keys_add(request_keys, "AuthMethod", "None,CHAP");
 	} else {
 		keys_add(request_keys, "AuthMethod", "None");
 	}
 	keys_add(request_keys, "InitiatorName",
 	    conn->conn_conf.isc_initiator);
 	if (conn->conn_conf.isc_initiator_alias[0] != '\0') {
 		keys_add(request_keys, "InitiatorAlias",
 		    conn->conn_conf.isc_initiator_alias);
 	}
 	if (conn->conn_conf.isc_discovery == 0) {
 		keys_add(request_keys, "SessionType", "Normal");
 		keys_add(request_keys,
 		    "TargetName", conn->conn_conf.isc_target);
 	} else {
 		keys_add(request_keys, "SessionType", "Discovery");
 	}
 	keys_save(request_keys, request);
 	keys_delete(request_keys);
 	pdu_send(request);
 	pdu_delete(request);
 
 	response = login_receive(conn);
 
 	response_keys = keys_new();
 	keys_load(response_keys, response);
 
 	for (i = 0; i < KEYS_MAX; i++) {
 		if (response_keys->keys_names[i] == NULL)
 			break;
 
 		/*
 		 * Not interested in AuthMethod at this point; we only need
 		 * to parse things such as TargetAlias.
 		 *
 		 * XXX: This is somewhat ugly.  We should have a way to apply
 		 * 	all the keys to the session and use that by default
 		 * 	instead of discarding them.
 		 */
 		if (strcmp(response_keys->keys_names[i], "AuthMethod") == 0)
 			continue;
 
 		login_negotiate_key(conn,
 		    response_keys->keys_names[i], response_keys->keys_values[i]);
 	}
 
 	/* The target may skip authentication by transiting right away. */
 	bhslr2 = (struct iscsi_bhs_login_response *)response->pdu_bhs;
 	if ((bhslr2->bhslr_flags & BHSLR_FLAGS_TRANSIT) != 0 &&
 	    login_nsg(response) == BHSLR_STAGE_OPERATIONAL_NEGOTIATION) {
 		if (conn->conn_conf.isc_mutual_user[0] != '\0') {
 			log_errx(1, "target requested transition "
 			    "to operational parameter negotiation, "
 			    "but we require mutual CHAP");
 		}
 
 		log_debugx("target requested transition "
 		    "to operational parameter negotiation");
 		keys_delete(response_keys);
 		pdu_delete(response);
 		login_negotiate(conn);
 		return;
 	}
 
 	auth_method = keys_find(response_keys, "AuthMethod");
 	if (auth_method == NULL)
 		log_errx(1, "received response without AuthMethod");
 	if (strcmp(auth_method, "None") == 0) {
 		if (conn->conn_conf.isc_mutual_user[0] != '\0') {
 			log_errx(1, "target does not require authentication, "
 			    "but we require mutual CHAP");
 		}
 
 		log_debugx("target does not require authentication");
 		keys_delete(response_keys);
 		pdu_delete(response);
 		login_negotiate(conn);
 		return;
 	}
 
 	if (strcmp(auth_method, "CHAP") != 0) {
 		fail(conn, "Unsupported AuthMethod");
 		log_errx(1, "received response "
 		    "with unsupported AuthMethod \"%s\"", auth_method);
 	}
 
 	if (conn->conn_conf.isc_user[0] == '\0' ||
 	    conn->conn_conf.isc_secret[0] == '\0') {
 		fail(conn, "Authentication required");
 		log_errx(1, "target requests CHAP authentication, but we don't "
 		    "have user and secret");
 	}
 
 	/* auth_method points into response_keys; it is not used below. */
 	keys_delete(response_keys);
 	response_keys = NULL;
 	pdu_delete(response);
 	response = NULL;
 
 	login_chap(conn);
 	login_negotiate(conn);
 }
Index: user/ae/inet6
===================================================================
--- user/ae/inet6	(revision 271452)
+++ user/ae/inet6	(revision 271453)

Property changes on: user/ae/inet6
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head:r271428-271452