Index: user/ae/inet6/contrib/llvm/patches/patch-r271432-clang-r205331-debug-info-crash.diff =================================================================== --- user/ae/inet6/contrib/llvm/patches/patch-r271432-clang-r205331-debug-info-crash.diff (nonexistent) +++ user/ae/inet6/contrib/llvm/patches/patch-r271432-clang-r205331-debug-info-crash.diff (revision 271453) @@ -0,0 +1,46 @@ +commit 96365aef99ec463375dfdaf6eb260823e0477b6a +Author: Adrian Prantl +Date: Tue Apr 1 17:52:06 2014 +0000 + + Debug info: fix a crash when emitting IndirectFieldDecls, which were + previously not handled at all. + rdar://problem/16348575 + + git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@205331 91177308-0d34-0410-b5e6-96231b3b80d8 + +diff --git a/lib/CodeGen/CGDebugInfo.cpp b/lib/CodeGen/CGDebugInfo.cpp +index 82db942..2556cf9 100644 +--- tools/clang/lib/CodeGen/CGDebugInfo.cpp ++++ tools/clangb/lib/CodeGen/CGDebugInfo.cpp +@@ -1252,7 +1252,7 @@ CollectTemplateParams(const TemplateParameterList *TPList, + V = CGM.GetAddrOfFunction(FD); + // Member data pointers have special handling too to compute the fixed + // offset within the object. +- if (isa(D)) { ++ if (isa(D) || isa(D)) { + // These five lines (& possibly the above member function pointer + // handling) might be able to be refactored to use similar code in + // CodeGenModule::getMemberPointerConstant +diff --git a/test/CodeGenCXX/debug-info-indirect-field-decl.cpp b/test/CodeGenCXX/debug-info-indirect-field-decl.cpp +new file mode 100644 +index 0000000..131ceba +--- /dev/null ++++ tools/clang/test/CodeGenCXX/debug-info-indirect-field-decl.cpp +@@ -0,0 +1,17 @@ ++// RUN: %clang_cc1 -emit-llvm -g -triple x86_64-apple-darwin %s -o - | FileCheck %s ++// ++// Test that indirect field decls are handled gracefully. ++// rdar://problem/16348575 ++// ++template class Foo { }; ++ ++struct Bar { ++ int i1; ++ // CHECK: [ DW_TAG_member ] [line [[@LINE+1]], size 32, align 32, offset 32] [from _ZTSN3BarUt_E] ++ union { ++ // CHECK: [ DW_TAG_member ] [i2] [line [[@LINE+1]], size 32, align 32, offset 0] [from int] ++ int i2; ++ }; ++}; ++ ++Foo the_foo; Index: user/ae/inet6/contrib/llvm/tools/clang/lib/CodeGen/CGDebugInfo.cpp =================================================================== --- user/ae/inet6/contrib/llvm/tools/clang/lib/CodeGen/CGDebugInfo.cpp (revision 271452) +++ user/ae/inet6/contrib/llvm/tools/clang/lib/CodeGen/CGDebugInfo.cpp (revision 271453) @@ -1,3294 +1,3294 @@ //===--- CGDebugInfo.cpp - Emit Debug Information for a Module ------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This coordinates the debug information generation while generating code. // //===----------------------------------------------------------------------===// #include "CGDebugInfo.h" #include "CGBlocks.h" #include "CGCXXABI.h" #include "CGObjCRuntime.h" #include "CodeGenFunction.h" #include "CodeGenModule.h" #include "clang/AST/ASTContext.h" #include "clang/AST/DeclFriend.h" #include "clang/AST/DeclObjC.h" #include "clang/AST/DeclTemplate.h" #include "clang/AST/Expr.h" #include "clang/AST/RecordLayout.h" #include "clang/Basic/FileManager.h" #include "clang/Basic/SourceManager.h" #include "clang/Basic/Version.h" #include "clang/Frontend/CodeGenOptions.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/Support/Dwarf.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" using namespace clang; using namespace clang::CodeGen; CGDebugInfo::CGDebugInfo(CodeGenModule &CGM) : CGM(CGM), DebugKind(CGM.getCodeGenOpts().getDebugInfo()), DBuilder(CGM.getModule()) { CreateCompileUnit(); } CGDebugInfo::~CGDebugInfo() { assert(LexicalBlockStack.empty() && "Region stack mismatch, stack not empty!"); } NoLocation::NoLocation(CodeGenFunction &CGF, CGBuilderTy &B) : DI(CGF.getDebugInfo()), Builder(B) { if (DI) { SavedLoc = DI->getLocation(); DI->CurLoc = SourceLocation(); Builder.SetCurrentDebugLocation(llvm::DebugLoc()); } } NoLocation::~NoLocation() { if (DI) { assert(Builder.getCurrentDebugLocation().isUnknown()); DI->CurLoc = SavedLoc; } } ArtificialLocation::ArtificialLocation(CodeGenFunction &CGF, CGBuilderTy &B) : DI(CGF.getDebugInfo()), Builder(B) { if (DI) { SavedLoc = DI->getLocation(); DI->CurLoc = SourceLocation(); Builder.SetCurrentDebugLocation(llvm::DebugLoc()); } } void ArtificialLocation::Emit() { if (DI) { // Sync the Builder. DI->EmitLocation(Builder, SavedLoc); DI->CurLoc = SourceLocation(); // Construct a location that has a valid scope, but no line info. assert(!DI->LexicalBlockStack.empty()); llvm::DIDescriptor Scope(DI->LexicalBlockStack.back()); Builder.SetCurrentDebugLocation(llvm::DebugLoc::get(0, 0, Scope)); } } ArtificialLocation::~ArtificialLocation() { if (DI) { assert(Builder.getCurrentDebugLocation().getLine() == 0); DI->CurLoc = SavedLoc; } } void CGDebugInfo::setLocation(SourceLocation Loc) { // If the new location isn't valid return. if (Loc.isInvalid()) return; CurLoc = CGM.getContext().getSourceManager().getExpansionLoc(Loc); // If we've changed files in the middle of a lexical scope go ahead // and create a new lexical scope with file node if it's different // from the one in the scope. if (LexicalBlockStack.empty()) return; SourceManager &SM = CGM.getContext().getSourceManager(); PresumedLoc PCLoc = SM.getPresumedLoc(CurLoc); PresumedLoc PPLoc = SM.getPresumedLoc(PrevLoc); if (PCLoc.isInvalid() || PPLoc.isInvalid() || !strcmp(PPLoc.getFilename(), PCLoc.getFilename())) return; llvm::MDNode *LB = LexicalBlockStack.back(); llvm::DIScope Scope = llvm::DIScope(LB); if (Scope.isLexicalBlockFile()) { llvm::DILexicalBlockFile LBF = llvm::DILexicalBlockFile(LB); llvm::DIDescriptor D = DBuilder.createLexicalBlockFile(LBF.getScope(), getOrCreateFile(CurLoc)); llvm::MDNode *N = D; LexicalBlockStack.pop_back(); LexicalBlockStack.push_back(N); } else if (Scope.isLexicalBlock() || Scope.isSubprogram()) { llvm::DIDescriptor D = DBuilder.createLexicalBlockFile(Scope, getOrCreateFile(CurLoc)); llvm::MDNode *N = D; LexicalBlockStack.pop_back(); LexicalBlockStack.push_back(N); } } /// getContextDescriptor - Get context info for the decl. llvm::DIScope CGDebugInfo::getContextDescriptor(const Decl *Context) { if (!Context) return TheCU; llvm::DenseMap::iterator I = RegionMap.find(Context); if (I != RegionMap.end()) { llvm::Value *V = I->second; return llvm::DIScope(dyn_cast_or_null(V)); } // Check namespace. if (const NamespaceDecl *NSDecl = dyn_cast(Context)) return getOrCreateNameSpace(NSDecl); if (const RecordDecl *RDecl = dyn_cast(Context)) if (!RDecl->isDependentType()) return getOrCreateType(CGM.getContext().getTypeDeclType(RDecl), getOrCreateMainFile()); return TheCU; } /// getFunctionName - Get function name for the given FunctionDecl. If the /// name is constructed on demand (e.g. C++ destructor) then the name /// is stored on the side. StringRef CGDebugInfo::getFunctionName(const FunctionDecl *FD) { assert (FD && "Invalid FunctionDecl!"); IdentifierInfo *FII = FD->getIdentifier(); FunctionTemplateSpecializationInfo *Info = FD->getTemplateSpecializationInfo(); if (!Info && FII) return FII->getName(); // Otherwise construct human readable name for debug info. SmallString<128> NS; llvm::raw_svector_ostream OS(NS); FD->printName(OS); // Add any template specialization args. if (Info) { const TemplateArgumentList *TArgs = Info->TemplateArguments; const TemplateArgument *Args = TArgs->data(); unsigned NumArgs = TArgs->size(); PrintingPolicy Policy(CGM.getLangOpts()); TemplateSpecializationType::PrintTemplateArgumentList(OS, Args, NumArgs, Policy); } // Copy this name on the side and use its reference. return internString(OS.str()); } StringRef CGDebugInfo::getObjCMethodName(const ObjCMethodDecl *OMD) { SmallString<256> MethodName; llvm::raw_svector_ostream OS(MethodName); OS << (OMD->isInstanceMethod() ? '-' : '+') << '['; const DeclContext *DC = OMD->getDeclContext(); if (const ObjCImplementationDecl *OID = dyn_cast(DC)) { OS << OID->getName(); } else if (const ObjCInterfaceDecl *OID = dyn_cast(DC)) { OS << OID->getName(); } else if (const ObjCCategoryImplDecl *OCD = dyn_cast(DC)){ OS << ((const NamedDecl *)OCD)->getIdentifier()->getNameStart() << '(' << OCD->getIdentifier()->getNameStart() << ')'; } else if (isa(DC)) { // We can extract the type of the class from the self pointer. if (ImplicitParamDecl* SelfDecl = OMD->getSelfDecl()) { QualType ClassTy = cast(SelfDecl->getType())->getPointeeType(); ClassTy.print(OS, PrintingPolicy(LangOptions())); } } OS << ' ' << OMD->getSelector().getAsString() << ']'; return internString(OS.str()); } /// getSelectorName - Return selector name. This is used for debugging /// info. StringRef CGDebugInfo::getSelectorName(Selector S) { return internString(S.getAsString()); } /// getClassName - Get class name including template argument list. StringRef CGDebugInfo::getClassName(const RecordDecl *RD) { const ClassTemplateSpecializationDecl *Spec = dyn_cast(RD); if (!Spec) return RD->getName(); const TemplateArgument *Args; unsigned NumArgs; if (TypeSourceInfo *TAW = Spec->getTypeAsWritten()) { const TemplateSpecializationType *TST = cast(TAW->getType()); Args = TST->getArgs(); NumArgs = TST->getNumArgs(); } else { const TemplateArgumentList &TemplateArgs = Spec->getTemplateArgs(); Args = TemplateArgs.data(); NumArgs = TemplateArgs.size(); } StringRef Name = RD->getIdentifier()->getName(); PrintingPolicy Policy(CGM.getLangOpts()); SmallString<128> TemplateArgList; { llvm::raw_svector_ostream OS(TemplateArgList); TemplateSpecializationType::PrintTemplateArgumentList(OS, Args, NumArgs, Policy); } // Copy this name on the side and use its reference. return internString(Name, TemplateArgList); } /// getOrCreateFile - Get the file debug info descriptor for the input location. llvm::DIFile CGDebugInfo::getOrCreateFile(SourceLocation Loc) { if (!Loc.isValid()) // If Location is not valid then use main input file. return DBuilder.createFile(TheCU.getFilename(), TheCU.getDirectory()); SourceManager &SM = CGM.getContext().getSourceManager(); PresumedLoc PLoc = SM.getPresumedLoc(Loc); if (PLoc.isInvalid() || StringRef(PLoc.getFilename()).empty()) // If the location is not valid then use main input file. return DBuilder.createFile(TheCU.getFilename(), TheCU.getDirectory()); // Cache the results. const char *fname = PLoc.getFilename(); llvm::DenseMap::iterator it = DIFileCache.find(fname); if (it != DIFileCache.end()) { // Verify that the information still exists. if (llvm::Value *V = it->second) return llvm::DIFile(cast(V)); } llvm::DIFile F = DBuilder.createFile(PLoc.getFilename(), getCurrentDirname()); DIFileCache[fname] = F; return F; } /// getOrCreateMainFile - Get the file info for main compile unit. llvm::DIFile CGDebugInfo::getOrCreateMainFile() { return DBuilder.createFile(TheCU.getFilename(), TheCU.getDirectory()); } /// getLineNumber - Get line number for the location. If location is invalid /// then use current location. unsigned CGDebugInfo::getLineNumber(SourceLocation Loc) { if (Loc.isInvalid() && CurLoc.isInvalid()) return 0; SourceManager &SM = CGM.getContext().getSourceManager(); PresumedLoc PLoc = SM.getPresumedLoc(Loc.isValid() ? Loc : CurLoc); return PLoc.isValid()? PLoc.getLine() : 0; } /// getColumnNumber - Get column number for the location. unsigned CGDebugInfo::getColumnNumber(SourceLocation Loc, bool Force) { // We may not want column information at all. if (!Force && !CGM.getCodeGenOpts().DebugColumnInfo) return 0; // If the location is invalid then use the current column. if (Loc.isInvalid() && CurLoc.isInvalid()) return 0; SourceManager &SM = CGM.getContext().getSourceManager(); PresumedLoc PLoc = SM.getPresumedLoc(Loc.isValid() ? Loc : CurLoc); return PLoc.isValid()? PLoc.getColumn() : 0; } StringRef CGDebugInfo::getCurrentDirname() { if (!CGM.getCodeGenOpts().DebugCompilationDir.empty()) return CGM.getCodeGenOpts().DebugCompilationDir; if (!CWDName.empty()) return CWDName; SmallString<256> CWD; llvm::sys::fs::current_path(CWD); return CWDName = internString(CWD); } /// CreateCompileUnit - Create new compile unit. void CGDebugInfo::CreateCompileUnit() { // Get absolute path name. SourceManager &SM = CGM.getContext().getSourceManager(); std::string MainFileName = CGM.getCodeGenOpts().MainFileName; if (MainFileName.empty()) MainFileName = ""; // The main file name provided via the "-main-file-name" option contains just // the file name itself with no path information. This file name may have had // a relative path, so we look into the actual file entry for the main // file to determine the real absolute path for the file. std::string MainFileDir; if (const FileEntry *MainFile = SM.getFileEntryForID(SM.getMainFileID())) { MainFileDir = MainFile->getDir()->getName(); if (MainFileDir != ".") { llvm::SmallString<1024> MainFileDirSS(MainFileDir); llvm::sys::path::append(MainFileDirSS, MainFileName); MainFileName = MainFileDirSS.str(); } } // Save filename string. StringRef Filename = internString(MainFileName); // Save split dwarf file string. std::string SplitDwarfFile = CGM.getCodeGenOpts().SplitDwarfFile; StringRef SplitDwarfFilename = internString(SplitDwarfFile); unsigned LangTag; const LangOptions &LO = CGM.getLangOpts(); if (LO.CPlusPlus) { if (LO.ObjC1) LangTag = llvm::dwarf::DW_LANG_ObjC_plus_plus; else LangTag = llvm::dwarf::DW_LANG_C_plus_plus; } else if (LO.ObjC1) { LangTag = llvm::dwarf::DW_LANG_ObjC; } else if (LO.C99) { LangTag = llvm::dwarf::DW_LANG_C99; } else { LangTag = llvm::dwarf::DW_LANG_C89; } std::string Producer = getClangFullVersion(); // Figure out which version of the ObjC runtime we have. unsigned RuntimeVers = 0; if (LO.ObjC1) RuntimeVers = LO.ObjCRuntime.isNonFragile() ? 2 : 1; // Create new compile unit. // FIXME - Eliminate TheCU. TheCU = DBuilder.createCompileUnit(LangTag, Filename, getCurrentDirname(), Producer, LO.Optimize, CGM.getCodeGenOpts().DwarfDebugFlags, RuntimeVers, SplitDwarfFilename); } /// CreateType - Get the Basic type from the cache or create a new /// one if necessary. llvm::DIType CGDebugInfo::CreateType(const BuiltinType *BT) { unsigned Encoding = 0; StringRef BTName; switch (BT->getKind()) { #define BUILTIN_TYPE(Id, SingletonId) #define PLACEHOLDER_TYPE(Id, SingletonId) \ case BuiltinType::Id: #include "clang/AST/BuiltinTypes.def" case BuiltinType::Dependent: llvm_unreachable("Unexpected builtin type"); case BuiltinType::NullPtr: return DBuilder.createNullPtrType(); case BuiltinType::Void: return llvm::DIType(); case BuiltinType::ObjCClass: if (ClassTy) return ClassTy; ClassTy = DBuilder.createForwardDecl(llvm::dwarf::DW_TAG_structure_type, "objc_class", TheCU, getOrCreateMainFile(), 0); return ClassTy; case BuiltinType::ObjCId: { // typedef struct objc_class *Class; // typedef struct objc_object { // Class isa; // } *id; if (ObjTy) return ObjTy; if (!ClassTy) ClassTy = DBuilder.createForwardDecl(llvm::dwarf::DW_TAG_structure_type, "objc_class", TheCU, getOrCreateMainFile(), 0); unsigned Size = CGM.getContext().getTypeSize(CGM.getContext().VoidPtrTy); llvm::DIType ISATy = DBuilder.createPointerType(ClassTy, Size); ObjTy = DBuilder.createStructType(TheCU, "objc_object", getOrCreateMainFile(), 0, 0, 0, 0, llvm::DIType(), llvm::DIArray()); ObjTy.setTypeArray(DBuilder.getOrCreateArray(&*DBuilder.createMemberType( ObjTy, "isa", getOrCreateMainFile(), 0, Size, 0, 0, 0, ISATy))); return ObjTy; } case BuiltinType::ObjCSel: { if (SelTy) return SelTy; SelTy = DBuilder.createForwardDecl(llvm::dwarf::DW_TAG_structure_type, "objc_selector", TheCU, getOrCreateMainFile(), 0); return SelTy; } case BuiltinType::OCLImage1d: return getOrCreateStructPtrType("opencl_image1d_t", OCLImage1dDITy); case BuiltinType::OCLImage1dArray: return getOrCreateStructPtrType("opencl_image1d_array_t", OCLImage1dArrayDITy); case BuiltinType::OCLImage1dBuffer: return getOrCreateStructPtrType("opencl_image1d_buffer_t", OCLImage1dBufferDITy); case BuiltinType::OCLImage2d: return getOrCreateStructPtrType("opencl_image2d_t", OCLImage2dDITy); case BuiltinType::OCLImage2dArray: return getOrCreateStructPtrType("opencl_image2d_array_t", OCLImage2dArrayDITy); case BuiltinType::OCLImage3d: return getOrCreateStructPtrType("opencl_image3d_t", OCLImage3dDITy); case BuiltinType::OCLSampler: return DBuilder.createBasicType("opencl_sampler_t", CGM.getContext().getTypeSize(BT), CGM.getContext().getTypeAlign(BT), llvm::dwarf::DW_ATE_unsigned); case BuiltinType::OCLEvent: return getOrCreateStructPtrType("opencl_event_t", OCLEventDITy); case BuiltinType::UChar: case BuiltinType::Char_U: Encoding = llvm::dwarf::DW_ATE_unsigned_char; break; case BuiltinType::Char_S: case BuiltinType::SChar: Encoding = llvm::dwarf::DW_ATE_signed_char; break; case BuiltinType::Char16: case BuiltinType::Char32: Encoding = llvm::dwarf::DW_ATE_UTF; break; case BuiltinType::UShort: case BuiltinType::UInt: case BuiltinType::UInt128: case BuiltinType::ULong: case BuiltinType::WChar_U: case BuiltinType::ULongLong: Encoding = llvm::dwarf::DW_ATE_unsigned; break; case BuiltinType::Short: case BuiltinType::Int: case BuiltinType::Int128: case BuiltinType::Long: case BuiltinType::WChar_S: case BuiltinType::LongLong: Encoding = llvm::dwarf::DW_ATE_signed; break; case BuiltinType::Bool: Encoding = llvm::dwarf::DW_ATE_boolean; break; case BuiltinType::Half: case BuiltinType::Float: case BuiltinType::LongDouble: case BuiltinType::Double: Encoding = llvm::dwarf::DW_ATE_float; break; } switch (BT->getKind()) { case BuiltinType::Long: BTName = "long int"; break; case BuiltinType::LongLong: BTName = "long long int"; break; case BuiltinType::ULong: BTName = "long unsigned int"; break; case BuiltinType::ULongLong: BTName = "long long unsigned int"; break; default: BTName = BT->getName(CGM.getLangOpts()); break; } // Bit size, align and offset of the type. uint64_t Size = CGM.getContext().getTypeSize(BT); uint64_t Align = CGM.getContext().getTypeAlign(BT); llvm::DIType DbgTy = DBuilder.createBasicType(BTName, Size, Align, Encoding); return DbgTy; } llvm::DIType CGDebugInfo::CreateType(const ComplexType *Ty) { // Bit size, align and offset of the type. unsigned Encoding = llvm::dwarf::DW_ATE_complex_float; if (Ty->isComplexIntegerType()) Encoding = llvm::dwarf::DW_ATE_lo_user; uint64_t Size = CGM.getContext().getTypeSize(Ty); uint64_t Align = CGM.getContext().getTypeAlign(Ty); llvm::DIType DbgTy = DBuilder.createBasicType("complex", Size, Align, Encoding); return DbgTy; } /// CreateCVRType - Get the qualified type from the cache or create /// a new one if necessary. llvm::DIType CGDebugInfo::CreateQualifiedType(QualType Ty, llvm::DIFile Unit) { QualifierCollector Qc; const Type *T = Qc.strip(Ty); // Ignore these qualifiers for now. Qc.removeObjCGCAttr(); Qc.removeAddressSpace(); Qc.removeObjCLifetime(); // We will create one Derived type for one qualifier and recurse to handle any // additional ones. unsigned Tag; if (Qc.hasConst()) { Tag = llvm::dwarf::DW_TAG_const_type; Qc.removeConst(); } else if (Qc.hasVolatile()) { Tag = llvm::dwarf::DW_TAG_volatile_type; Qc.removeVolatile(); } else if (Qc.hasRestrict()) { Tag = llvm::dwarf::DW_TAG_restrict_type; Qc.removeRestrict(); } else { assert(Qc.empty() && "Unknown type qualifier for debug info"); return getOrCreateType(QualType(T, 0), Unit); } llvm::DIType FromTy = getOrCreateType(Qc.apply(CGM.getContext(), T), Unit); // No need to fill in the Name, Line, Size, Alignment, Offset in case of // CVR derived types. llvm::DIType DbgTy = DBuilder.createQualifiedType(Tag, FromTy); return DbgTy; } llvm::DIType CGDebugInfo::CreateType(const ObjCObjectPointerType *Ty, llvm::DIFile Unit) { // The frontend treats 'id' as a typedef to an ObjCObjectType, // whereas 'id' is treated as an ObjCPointerType. For the // debug info, we want to emit 'id' in both cases. if (Ty->isObjCQualifiedIdType()) return getOrCreateType(CGM.getContext().getObjCIdType(), Unit); llvm::DIType DbgTy = CreatePointerLikeType(llvm::dwarf::DW_TAG_pointer_type, Ty, Ty->getPointeeType(), Unit); return DbgTy; } llvm::DIType CGDebugInfo::CreateType(const PointerType *Ty, llvm::DIFile Unit) { return CreatePointerLikeType(llvm::dwarf::DW_TAG_pointer_type, Ty, Ty->getPointeeType(), Unit); } /// In C++ mode, types have linkage, so we can rely on the ODR and /// on their mangled names, if they're external. static SmallString<256> getUniqueTagTypeName(const TagType *Ty, CodeGenModule &CGM, llvm::DICompileUnit TheCU) { SmallString<256> FullName; // FIXME: ODR should apply to ObjC++ exactly the same wasy it does to C++. // For now, only apply ODR with C++. const TagDecl *TD = Ty->getDecl(); if (TheCU.getLanguage() != llvm::dwarf::DW_LANG_C_plus_plus || !TD->isExternallyVisible()) return FullName; // Microsoft Mangler does not have support for mangleCXXRTTIName yet. if (CGM.getTarget().getCXXABI().isMicrosoft()) return FullName; // TODO: This is using the RTTI name. Is there a better way to get // a unique string for a type? llvm::raw_svector_ostream Out(FullName); CGM.getCXXABI().getMangleContext().mangleCXXRTTIName(QualType(Ty, 0), Out); Out.flush(); return FullName; } // Creates a forward declaration for a RecordDecl in the given context. llvm::DICompositeType CGDebugInfo::getOrCreateRecordFwdDecl(const RecordType *Ty, llvm::DIDescriptor Ctx) { const RecordDecl *RD = Ty->getDecl(); if (llvm::DIType T = getTypeOrNull(CGM.getContext().getRecordType(RD))) return llvm::DICompositeType(T); llvm::DIFile DefUnit = getOrCreateFile(RD->getLocation()); unsigned Line = getLineNumber(RD->getLocation()); StringRef RDName = getClassName(RD); unsigned Tag = 0; if (RD->isStruct() || RD->isInterface()) Tag = llvm::dwarf::DW_TAG_structure_type; else if (RD->isUnion()) Tag = llvm::dwarf::DW_TAG_union_type; else { assert(RD->isClass()); Tag = llvm::dwarf::DW_TAG_class_type; } // Create the type. SmallString<256> FullName = getUniqueTagTypeName(Ty, CGM, TheCU); return DBuilder.createForwardDecl(Tag, RDName, Ctx, DefUnit, Line, 0, 0, 0, FullName); } llvm::DIType CGDebugInfo::CreatePointerLikeType(unsigned Tag, const Type *Ty, QualType PointeeTy, llvm::DIFile Unit) { if (Tag == llvm::dwarf::DW_TAG_reference_type || Tag == llvm::dwarf::DW_TAG_rvalue_reference_type) return DBuilder.createReferenceType(Tag, getOrCreateType(PointeeTy, Unit)); // Bit size, align and offset of the type. // Size is always the size of a pointer. We can't use getTypeSize here // because that does not return the correct value for references. unsigned AS = CGM.getContext().getTargetAddressSpace(PointeeTy); uint64_t Size = CGM.getTarget().getPointerWidth(AS); uint64_t Align = CGM.getContext().getTypeAlign(Ty); return DBuilder.createPointerType(getOrCreateType(PointeeTy, Unit), Size, Align); } llvm::DIType CGDebugInfo::getOrCreateStructPtrType(StringRef Name, llvm::DIType &Cache) { if (Cache) return Cache; Cache = DBuilder.createForwardDecl(llvm::dwarf::DW_TAG_structure_type, Name, TheCU, getOrCreateMainFile(), 0); unsigned Size = CGM.getContext().getTypeSize(CGM.getContext().VoidPtrTy); Cache = DBuilder.createPointerType(Cache, Size); return Cache; } llvm::DIType CGDebugInfo::CreateType(const BlockPointerType *Ty, llvm::DIFile Unit) { if (BlockLiteralGeneric) return BlockLiteralGeneric; SmallVector EltTys; llvm::DIType FieldTy; QualType FType; uint64_t FieldSize, FieldOffset; unsigned FieldAlign; llvm::DIArray Elements; llvm::DIType EltTy, DescTy; FieldOffset = 0; FType = CGM.getContext().UnsignedLongTy; EltTys.push_back(CreateMemberType(Unit, FType, "reserved", &FieldOffset)); EltTys.push_back(CreateMemberType(Unit, FType, "Size", &FieldOffset)); Elements = DBuilder.getOrCreateArray(EltTys); EltTys.clear(); unsigned Flags = llvm::DIDescriptor::FlagAppleBlock; unsigned LineNo = getLineNumber(CurLoc); EltTy = DBuilder.createStructType(Unit, "__block_descriptor", Unit, LineNo, FieldOffset, 0, Flags, llvm::DIType(), Elements); // Bit size, align and offset of the type. uint64_t Size = CGM.getContext().getTypeSize(Ty); DescTy = DBuilder.createPointerType(EltTy, Size); FieldOffset = 0; FType = CGM.getContext().getPointerType(CGM.getContext().VoidTy); EltTys.push_back(CreateMemberType(Unit, FType, "__isa", &FieldOffset)); FType = CGM.getContext().IntTy; EltTys.push_back(CreateMemberType(Unit, FType, "__flags", &FieldOffset)); EltTys.push_back(CreateMemberType(Unit, FType, "__reserved", &FieldOffset)); FType = CGM.getContext().getPointerType(CGM.getContext().VoidTy); EltTys.push_back(CreateMemberType(Unit, FType, "__FuncPtr", &FieldOffset)); FType = CGM.getContext().getPointerType(CGM.getContext().VoidTy); FieldTy = DescTy; FieldSize = CGM.getContext().getTypeSize(Ty); FieldAlign = CGM.getContext().getTypeAlign(Ty); FieldTy = DBuilder.createMemberType(Unit, "__descriptor", Unit, LineNo, FieldSize, FieldAlign, FieldOffset, 0, FieldTy); EltTys.push_back(FieldTy); FieldOffset += FieldSize; Elements = DBuilder.getOrCreateArray(EltTys); EltTy = DBuilder.createStructType(Unit, "__block_literal_generic", Unit, LineNo, FieldOffset, 0, Flags, llvm::DIType(), Elements); BlockLiteralGeneric = DBuilder.createPointerType(EltTy, Size); return BlockLiteralGeneric; } llvm::DIType CGDebugInfo::CreateType(const TypedefType *Ty, llvm::DIFile Unit) { // Typedefs are derived from some other type. If we have a typedef of a // typedef, make sure to emit the whole chain. llvm::DIType Src = getOrCreateType(Ty->getDecl()->getUnderlyingType(), Unit); if (!Src) return llvm::DIType(); // We don't set size information, but do specify where the typedef was // declared. unsigned Line = getLineNumber(Ty->getDecl()->getLocation()); const TypedefNameDecl *TyDecl = Ty->getDecl(); llvm::DIDescriptor TypedefContext = getContextDescriptor(cast(Ty->getDecl()->getDeclContext())); return DBuilder.createTypedef(Src, TyDecl->getName(), Unit, Line, TypedefContext); } llvm::DIType CGDebugInfo::CreateType(const FunctionType *Ty, llvm::DIFile Unit) { SmallVector EltTys; // Add the result type at least. EltTys.push_back(getOrCreateType(Ty->getResultType(), Unit)); // Set up remainder of arguments if there is a prototype. // FIXME: IF NOT, HOW IS THIS REPRESENTED? llvm-gcc doesn't represent '...'! if (isa(Ty)) EltTys.push_back(DBuilder.createUnspecifiedParameter()); else if (const FunctionProtoType *FPT = dyn_cast(Ty)) { for (unsigned i = 0, e = FPT->getNumArgs(); i != e; ++i) EltTys.push_back(getOrCreateType(FPT->getArgType(i), Unit)); if (FPT->isVariadic()) EltTys.push_back(DBuilder.createUnspecifiedParameter()); } llvm::DIArray EltTypeArray = DBuilder.getOrCreateArray(EltTys); return DBuilder.createSubroutineType(Unit, EltTypeArray); } llvm::DIType CGDebugInfo::createFieldType(StringRef name, QualType type, uint64_t sizeInBitsOverride, SourceLocation loc, AccessSpecifier AS, uint64_t offsetInBits, llvm::DIFile tunit, llvm::DIScope scope) { llvm::DIType debugType = getOrCreateType(type, tunit); // Get the location for the field. llvm::DIFile file = getOrCreateFile(loc); unsigned line = getLineNumber(loc); uint64_t sizeInBits = 0; unsigned alignInBits = 0; if (!type->isIncompleteArrayType()) { llvm::tie(sizeInBits, alignInBits) = CGM.getContext().getTypeInfo(type); if (sizeInBitsOverride) sizeInBits = sizeInBitsOverride; } unsigned flags = 0; if (AS == clang::AS_private) flags |= llvm::DIDescriptor::FlagPrivate; else if (AS == clang::AS_protected) flags |= llvm::DIDescriptor::FlagProtected; return DBuilder.createMemberType(scope, name, file, line, sizeInBits, alignInBits, offsetInBits, flags, debugType); } /// CollectRecordLambdaFields - Helper for CollectRecordFields. void CGDebugInfo:: CollectRecordLambdaFields(const CXXRecordDecl *CXXDecl, SmallVectorImpl &elements, llvm::DIType RecordTy) { // For C++11 Lambdas a Field will be the same as a Capture, but the Capture // has the name and the location of the variable so we should iterate over // both concurrently. const ASTRecordLayout &layout = CGM.getContext().getASTRecordLayout(CXXDecl); RecordDecl::field_iterator Field = CXXDecl->field_begin(); unsigned fieldno = 0; for (CXXRecordDecl::capture_const_iterator I = CXXDecl->captures_begin(), E = CXXDecl->captures_end(); I != E; ++I, ++Field, ++fieldno) { const LambdaExpr::Capture C = *I; if (C.capturesVariable()) { VarDecl *V = C.getCapturedVar(); llvm::DIFile VUnit = getOrCreateFile(C.getLocation()); StringRef VName = V->getName(); uint64_t SizeInBitsOverride = 0; if (Field->isBitField()) { SizeInBitsOverride = Field->getBitWidthValue(CGM.getContext()); assert(SizeInBitsOverride && "found named 0-width bitfield"); } llvm::DIType fieldType = createFieldType(VName, Field->getType(), SizeInBitsOverride, C.getLocation(), Field->getAccess(), layout.getFieldOffset(fieldno), VUnit, RecordTy); elements.push_back(fieldType); } else { // TODO: Need to handle 'this' in some way by probably renaming the // this of the lambda class and having a field member of 'this' or // by using AT_object_pointer for the function and having that be // used as 'this' for semantic references. assert(C.capturesThis() && "Field that isn't captured and isn't this?"); FieldDecl *f = *Field; llvm::DIFile VUnit = getOrCreateFile(f->getLocation()); QualType type = f->getType(); llvm::DIType fieldType = createFieldType("this", type, 0, f->getLocation(), f->getAccess(), layout.getFieldOffset(fieldno), VUnit, RecordTy); elements.push_back(fieldType); } } } /// Helper for CollectRecordFields. llvm::DIDerivedType CGDebugInfo::CreateRecordStaticField(const VarDecl *Var, llvm::DIType RecordTy) { // Create the descriptor for the static variable, with or without // constant initializers. llvm::DIFile VUnit = getOrCreateFile(Var->getLocation()); llvm::DIType VTy = getOrCreateType(Var->getType(), VUnit); unsigned LineNumber = getLineNumber(Var->getLocation()); StringRef VName = Var->getName(); llvm::Constant *C = NULL; if (Var->getInit()) { const APValue *Value = Var->evaluateValue(); if (Value) { if (Value->isInt()) C = llvm::ConstantInt::get(CGM.getLLVMContext(), Value->getInt()); if (Value->isFloat()) C = llvm::ConstantFP::get(CGM.getLLVMContext(), Value->getFloat()); } } unsigned Flags = 0; AccessSpecifier Access = Var->getAccess(); if (Access == clang::AS_private) Flags |= llvm::DIDescriptor::FlagPrivate; else if (Access == clang::AS_protected) Flags |= llvm::DIDescriptor::FlagProtected; llvm::DIDerivedType GV = DBuilder.createStaticMemberType( RecordTy, VName, VUnit, LineNumber, VTy, Flags, C); StaticDataMemberCache[Var->getCanonicalDecl()] = llvm::WeakVH(GV); return GV; } /// CollectRecordNormalField - Helper for CollectRecordFields. void CGDebugInfo:: CollectRecordNormalField(const FieldDecl *field, uint64_t OffsetInBits, llvm::DIFile tunit, SmallVectorImpl &elements, llvm::DIType RecordTy) { StringRef name = field->getName(); QualType type = field->getType(); // Ignore unnamed fields unless they're anonymous structs/unions. if (name.empty() && !type->isRecordType()) return; uint64_t SizeInBitsOverride = 0; if (field->isBitField()) { SizeInBitsOverride = field->getBitWidthValue(CGM.getContext()); assert(SizeInBitsOverride && "found named 0-width bitfield"); } llvm::DIType fieldType = createFieldType(name, type, SizeInBitsOverride, field->getLocation(), field->getAccess(), OffsetInBits, tunit, RecordTy); elements.push_back(fieldType); } /// CollectRecordFields - A helper function to collect debug info for /// record fields. This is used while creating debug info entry for a Record. void CGDebugInfo::CollectRecordFields(const RecordDecl *record, llvm::DIFile tunit, SmallVectorImpl &elements, llvm::DICompositeType RecordTy) { const CXXRecordDecl *CXXDecl = dyn_cast(record); if (CXXDecl && CXXDecl->isLambda()) CollectRecordLambdaFields(CXXDecl, elements, RecordTy); else { const ASTRecordLayout &layout = CGM.getContext().getASTRecordLayout(record); // Field number for non-static fields. unsigned fieldNo = 0; // Static and non-static members should appear in the same order as // the corresponding declarations in the source program. for (RecordDecl::decl_iterator I = record->decls_begin(), E = record->decls_end(); I != E; ++I) if (const VarDecl *V = dyn_cast(*I)) { // Reuse the existing static member declaration if one exists llvm::DenseMap::iterator MI = StaticDataMemberCache.find(V->getCanonicalDecl()); if (MI != StaticDataMemberCache.end()) { assert(MI->second && "Static data member declaration should still exist"); elements.push_back( llvm::DIDerivedType(cast(MI->second))); } else elements.push_back(CreateRecordStaticField(V, RecordTy)); } else if (FieldDecl *field = dyn_cast(*I)) { CollectRecordNormalField(field, layout.getFieldOffset(fieldNo), tunit, elements, RecordTy); // Bump field number for next field. ++fieldNo; } } } /// getOrCreateMethodType - CXXMethodDecl's type is a FunctionType. This /// function type is not updated to include implicit "this" pointer. Use this /// routine to get a method type which includes "this" pointer. llvm::DICompositeType CGDebugInfo::getOrCreateMethodType(const CXXMethodDecl *Method, llvm::DIFile Unit) { const FunctionProtoType *Func = Method->getType()->getAs(); if (Method->isStatic()) return llvm::DICompositeType(getOrCreateType(QualType(Func, 0), Unit)); return getOrCreateInstanceMethodType(Method->getThisType(CGM.getContext()), Func, Unit); } llvm::DICompositeType CGDebugInfo::getOrCreateInstanceMethodType( QualType ThisPtr, const FunctionProtoType *Func, llvm::DIFile Unit) { // Add "this" pointer. llvm::DIArray Args = llvm::DICompositeType( getOrCreateType(QualType(Func, 0), Unit)).getTypeArray(); assert (Args.getNumElements() && "Invalid number of arguments!"); SmallVector Elts; // First element is always return type. For 'void' functions it is NULL. Elts.push_back(Args.getElement(0)); // "this" pointer is always first argument. const CXXRecordDecl *RD = ThisPtr->getPointeeCXXRecordDecl(); if (isa(RD)) { // Create pointer type directly in this case. const PointerType *ThisPtrTy = cast(ThisPtr); QualType PointeeTy = ThisPtrTy->getPointeeType(); unsigned AS = CGM.getContext().getTargetAddressSpace(PointeeTy); uint64_t Size = CGM.getTarget().getPointerWidth(AS); uint64_t Align = CGM.getContext().getTypeAlign(ThisPtrTy); llvm::DIType PointeeType = getOrCreateType(PointeeTy, Unit); llvm::DIType ThisPtrType = DBuilder.createPointerType(PointeeType, Size, Align); TypeCache[ThisPtr.getAsOpaquePtr()] = ThisPtrType; // TODO: This and the artificial type below are misleading, the // types aren't artificial the argument is, but the current // metadata doesn't represent that. ThisPtrType = DBuilder.createObjectPointerType(ThisPtrType); Elts.push_back(ThisPtrType); } else { llvm::DIType ThisPtrType = getOrCreateType(ThisPtr, Unit); TypeCache[ThisPtr.getAsOpaquePtr()] = ThisPtrType; ThisPtrType = DBuilder.createObjectPointerType(ThisPtrType); Elts.push_back(ThisPtrType); } // Copy rest of the arguments. for (unsigned i = 1, e = Args.getNumElements(); i != e; ++i) Elts.push_back(Args.getElement(i)); llvm::DIArray EltTypeArray = DBuilder.getOrCreateArray(Elts); return DBuilder.createSubroutineType(Unit, EltTypeArray); } /// isFunctionLocalClass - Return true if CXXRecordDecl is defined /// inside a function. static bool isFunctionLocalClass(const CXXRecordDecl *RD) { if (const CXXRecordDecl *NRD = dyn_cast(RD->getDeclContext())) return isFunctionLocalClass(NRD); if (isa(RD->getDeclContext())) return true; return false; } /// CreateCXXMemberFunction - A helper function to create a DISubprogram for /// a single member function GlobalDecl. llvm::DISubprogram CGDebugInfo::CreateCXXMemberFunction(const CXXMethodDecl *Method, llvm::DIFile Unit, llvm::DIType RecordTy) { bool IsCtorOrDtor = isa(Method) || isa(Method); StringRef MethodName = getFunctionName(Method); llvm::DICompositeType MethodTy = getOrCreateMethodType(Method, Unit); // Since a single ctor/dtor corresponds to multiple functions, it doesn't // make sense to give a single ctor/dtor a linkage name. StringRef MethodLinkageName; if (!IsCtorOrDtor && !isFunctionLocalClass(Method->getParent())) MethodLinkageName = CGM.getMangledName(Method); // Get the location for the method. llvm::DIFile MethodDefUnit; unsigned MethodLine = 0; if (!Method->isImplicit()) { MethodDefUnit = getOrCreateFile(Method->getLocation()); MethodLine = getLineNumber(Method->getLocation()); } // Collect virtual method info. llvm::DIType ContainingType; unsigned Virtuality = 0; unsigned VIndex = 0; if (Method->isVirtual()) { if (Method->isPure()) Virtuality = llvm::dwarf::DW_VIRTUALITY_pure_virtual; else Virtuality = llvm::dwarf::DW_VIRTUALITY_virtual; // It doesn't make sense to give a virtual destructor a vtable index, // since a single destructor has two entries in the vtable. // FIXME: Add proper support for debug info for virtual calls in // the Microsoft ABI, where we may use multiple vptrs to make a vftable // lookup if we have multiple or virtual inheritance. if (!isa(Method) && !CGM.getTarget().getCXXABI().isMicrosoft()) VIndex = CGM.getItaniumVTableContext().getMethodVTableIndex(Method); ContainingType = RecordTy; } unsigned Flags = 0; if (Method->isImplicit()) Flags |= llvm::DIDescriptor::FlagArtificial; AccessSpecifier Access = Method->getAccess(); if (Access == clang::AS_private) Flags |= llvm::DIDescriptor::FlagPrivate; else if (Access == clang::AS_protected) Flags |= llvm::DIDescriptor::FlagProtected; if (const CXXConstructorDecl *CXXC = dyn_cast(Method)) { if (CXXC->isExplicit()) Flags |= llvm::DIDescriptor::FlagExplicit; } else if (const CXXConversionDecl *CXXC = dyn_cast(Method)) { if (CXXC->isExplicit()) Flags |= llvm::DIDescriptor::FlagExplicit; } if (Method->hasPrototype()) Flags |= llvm::DIDescriptor::FlagPrototyped; llvm::DIArray TParamsArray = CollectFunctionTemplateParams(Method, Unit); llvm::DISubprogram SP = DBuilder.createMethod(RecordTy, MethodName, MethodLinkageName, MethodDefUnit, MethodLine, MethodTy, /*isLocalToUnit=*/false, /* isDefinition=*/ false, Virtuality, VIndex, ContainingType, Flags, CGM.getLangOpts().Optimize, NULL, TParamsArray); SPCache[Method->getCanonicalDecl()] = llvm::WeakVH(SP); return SP; } /// CollectCXXMemberFunctions - A helper function to collect debug info for /// C++ member functions. This is used while creating debug info entry for /// a Record. void CGDebugInfo:: CollectCXXMemberFunctions(const CXXRecordDecl *RD, llvm::DIFile Unit, SmallVectorImpl &EltTys, llvm::DIType RecordTy) { // Since we want more than just the individual member decls if we // have templated functions iterate over every declaration to gather // the functions. for(DeclContext::decl_iterator I = RD->decls_begin(), E = RD->decls_end(); I != E; ++I) { if (const CXXMethodDecl *Method = dyn_cast(*I)) { // Reuse the existing member function declaration if it exists. // It may be associated with the declaration of the type & should be // reused as we're building the definition. // // This situation can arise in the vtable-based debug info reduction where // implicit members are emitted in a non-vtable TU. llvm::DenseMap::iterator MI = SPCache.find(Method->getCanonicalDecl()); if (MI == SPCache.end()) { // If the member is implicit, lazily create it when we see the // definition, not before. (an ODR-used implicit default ctor that's // never actually code generated should not produce debug info) if (!Method->isImplicit()) EltTys.push_back(CreateCXXMemberFunction(Method, Unit, RecordTy)); } else EltTys.push_back(MI->second); } else if (const FunctionTemplateDecl *FTD = dyn_cast(*I)) { // Add any template specializations that have already been seen. Like // implicit member functions, these may have been added to a declaration // in the case of vtable-based debug info reduction. for (FunctionTemplateDecl::spec_iterator SI = FTD->spec_begin(), SE = FTD->spec_end(); SI != SE; ++SI) { llvm::DenseMap::iterator MI = SPCache.find(cast(*SI)->getCanonicalDecl()); if (MI != SPCache.end()) EltTys.push_back(MI->second); } } } } /// CollectCXXBases - A helper function to collect debug info for /// C++ base classes. This is used while creating debug info entry for /// a Record. void CGDebugInfo:: CollectCXXBases(const CXXRecordDecl *RD, llvm::DIFile Unit, SmallVectorImpl &EltTys, llvm::DIType RecordTy) { const ASTRecordLayout &RL = CGM.getContext().getASTRecordLayout(RD); for (CXXRecordDecl::base_class_const_iterator BI = RD->bases_begin(), BE = RD->bases_end(); BI != BE; ++BI) { unsigned BFlags = 0; uint64_t BaseOffset; const CXXRecordDecl *Base = cast(BI->getType()->getAs()->getDecl()); if (BI->isVirtual()) { // virtual base offset offset is -ve. The code generator emits dwarf // expression where it expects +ve number. BaseOffset = 0 - CGM.getItaniumVTableContext() .getVirtualBaseOffsetOffset(RD, Base).getQuantity(); BFlags = llvm::DIDescriptor::FlagVirtual; } else BaseOffset = CGM.getContext().toBits(RL.getBaseClassOffset(Base)); // FIXME: Inconsistent units for BaseOffset. It is in bytes when // BI->isVirtual() and bits when not. AccessSpecifier Access = BI->getAccessSpecifier(); if (Access == clang::AS_private) BFlags |= llvm::DIDescriptor::FlagPrivate; else if (Access == clang::AS_protected) BFlags |= llvm::DIDescriptor::FlagProtected; llvm::DIType DTy = DBuilder.createInheritance(RecordTy, getOrCreateType(BI->getType(), Unit), BaseOffset, BFlags); EltTys.push_back(DTy); } } /// CollectTemplateParams - A helper function to collect template parameters. llvm::DIArray CGDebugInfo:: CollectTemplateParams(const TemplateParameterList *TPList, ArrayRef TAList, llvm::DIFile Unit) { SmallVector TemplateParams; for (unsigned i = 0, e = TAList.size(); i != e; ++i) { const TemplateArgument &TA = TAList[i]; StringRef Name; if (TPList) Name = TPList->getParam(i)->getName(); switch (TA.getKind()) { case TemplateArgument::Type: { llvm::DIType TTy = getOrCreateType(TA.getAsType(), Unit); llvm::DITemplateTypeParameter TTP = DBuilder.createTemplateTypeParameter(TheCU, Name, TTy); TemplateParams.push_back(TTP); } break; case TemplateArgument::Integral: { llvm::DIType TTy = getOrCreateType(TA.getIntegralType(), Unit); llvm::DITemplateValueParameter TVP = DBuilder.createTemplateValueParameter( TheCU, Name, TTy, llvm::ConstantInt::get(CGM.getLLVMContext(), TA.getAsIntegral())); TemplateParams.push_back(TVP); } break; case TemplateArgument::Declaration: { const ValueDecl *D = TA.getAsDecl(); bool InstanceMember = D->isCXXInstanceMember(); QualType T = InstanceMember ? CGM.getContext().getMemberPointerType( D->getType(), cast(D->getDeclContext()) ->getTypeForDecl()) : CGM.getContext().getPointerType(D->getType()); llvm::DIType TTy = getOrCreateType(T, Unit); llvm::Value *V = 0; // Variable pointer template parameters have a value that is the address // of the variable. if (const VarDecl *VD = dyn_cast(D)) V = CGM.GetAddrOfGlobalVar(VD); // Member function pointers have special support for building them, though // this is currently unsupported in LLVM CodeGen. if (InstanceMember) { if (const CXXMethodDecl *method = dyn_cast(D)) V = CGM.getCXXABI().EmitMemberPointer(method); } else if (const FunctionDecl *FD = dyn_cast(D)) V = CGM.GetAddrOfFunction(FD); // Member data pointers have special handling too to compute the fixed // offset within the object. - if (isa(D)) { + if (isa(D) || isa(D)) { // These five lines (& possibly the above member function pointer // handling) might be able to be refactored to use similar code in // CodeGenModule::getMemberPointerConstant uint64_t fieldOffset = CGM.getContext().getFieldOffset(D); CharUnits chars = CGM.getContext().toCharUnitsFromBits((int64_t) fieldOffset); V = CGM.getCXXABI().EmitMemberDataPointer( cast(T.getTypePtr()), chars); } llvm::DITemplateValueParameter TVP = DBuilder.createTemplateValueParameter(TheCU, Name, TTy, V->stripPointerCasts()); TemplateParams.push_back(TVP); } break; case TemplateArgument::NullPtr: { QualType T = TA.getNullPtrType(); llvm::DIType TTy = getOrCreateType(T, Unit); llvm::Value *V = 0; // Special case member data pointer null values since they're actually -1 // instead of zero. if (const MemberPointerType *MPT = dyn_cast(T.getTypePtr())) // But treat member function pointers as simple zero integers because // it's easier than having a special case in LLVM's CodeGen. If LLVM // CodeGen grows handling for values of non-null member function // pointers then perhaps we could remove this special case and rely on // EmitNullMemberPointer for member function pointers. if (MPT->isMemberDataPointer()) V = CGM.getCXXABI().EmitNullMemberPointer(MPT); if (!V) V = llvm::ConstantInt::get(CGM.Int8Ty, 0); llvm::DITemplateValueParameter TVP = DBuilder.createTemplateValueParameter(TheCU, Name, TTy, V); TemplateParams.push_back(TVP); } break; case TemplateArgument::Template: { llvm::DITemplateValueParameter TVP = DBuilder.createTemplateTemplateParameter( TheCU, Name, llvm::DIType(), TA.getAsTemplate().getAsTemplateDecl() ->getQualifiedNameAsString()); TemplateParams.push_back(TVP); } break; case TemplateArgument::Pack: { llvm::DITemplateValueParameter TVP = DBuilder.createTemplateParameterPack( TheCU, Name, llvm::DIType(), CollectTemplateParams(NULL, TA.getPackAsArray(), Unit)); TemplateParams.push_back(TVP); } break; case TemplateArgument::Expression: { const Expr *E = TA.getAsExpr(); QualType T = E->getType(); llvm::Value *V = CGM.EmitConstantExpr(E, T); assert(V && "Expression in template argument isn't constant"); llvm::DIType TTy = getOrCreateType(T, Unit); llvm::DITemplateValueParameter TVP = DBuilder.createTemplateValueParameter(TheCU, Name, TTy, V->stripPointerCasts()); TemplateParams.push_back(TVP); } break; // And the following should never occur: case TemplateArgument::TemplateExpansion: case TemplateArgument::Null: llvm_unreachable( "These argument types shouldn't exist in concrete types"); } } return DBuilder.getOrCreateArray(TemplateParams); } /// CollectFunctionTemplateParams - A helper function to collect debug /// info for function template parameters. llvm::DIArray CGDebugInfo:: CollectFunctionTemplateParams(const FunctionDecl *FD, llvm::DIFile Unit) { if (FD->getTemplatedKind() == FunctionDecl::TK_FunctionTemplateSpecialization) { const TemplateParameterList *TList = FD->getTemplateSpecializationInfo()->getTemplate() ->getTemplateParameters(); return CollectTemplateParams( TList, FD->getTemplateSpecializationArgs()->asArray(), Unit); } return llvm::DIArray(); } /// CollectCXXTemplateParams - A helper function to collect debug info for /// template parameters. llvm::DIArray CGDebugInfo:: CollectCXXTemplateParams(const ClassTemplateSpecializationDecl *TSpecial, llvm::DIFile Unit) { llvm::PointerUnion PU = TSpecial->getSpecializedTemplateOrPartial(); TemplateParameterList *TPList = PU.is() ? PU.get()->getTemplateParameters() : PU.get()->getTemplateParameters(); const TemplateArgumentList &TAList = TSpecial->getTemplateInstantiationArgs(); return CollectTemplateParams(TPList, TAList.asArray(), Unit); } /// getOrCreateVTablePtrType - Return debug info descriptor for vtable. llvm::DIType CGDebugInfo::getOrCreateVTablePtrType(llvm::DIFile Unit) { if (VTablePtrType.isValid()) return VTablePtrType; ASTContext &Context = CGM.getContext(); /* Function type */ llvm::Value *STy = getOrCreateType(Context.IntTy, Unit); llvm::DIArray SElements = DBuilder.getOrCreateArray(STy); llvm::DIType SubTy = DBuilder.createSubroutineType(Unit, SElements); unsigned Size = Context.getTypeSize(Context.VoidPtrTy); llvm::DIType vtbl_ptr_type = DBuilder.createPointerType(SubTy, Size, 0, "__vtbl_ptr_type"); VTablePtrType = DBuilder.createPointerType(vtbl_ptr_type, Size); return VTablePtrType; } /// getVTableName - Get vtable name for the given Class. StringRef CGDebugInfo::getVTableName(const CXXRecordDecl *RD) { // Copy the gdb compatible name on the side and use its reference. return internString("_vptr$", RD->getNameAsString()); } /// CollectVTableInfo - If the C++ class has vtable info then insert appropriate /// debug info entry in EltTys vector. void CGDebugInfo:: CollectVTableInfo(const CXXRecordDecl *RD, llvm::DIFile Unit, SmallVectorImpl &EltTys) { const ASTRecordLayout &RL = CGM.getContext().getASTRecordLayout(RD); // If there is a primary base then it will hold vtable info. if (RL.getPrimaryBase()) return; // If this class is not dynamic then there is not any vtable info to collect. if (!RD->isDynamicClass()) return; unsigned Size = CGM.getContext().getTypeSize(CGM.getContext().VoidPtrTy); llvm::DIType VPTR = DBuilder.createMemberType(Unit, getVTableName(RD), Unit, 0, Size, 0, 0, llvm::DIDescriptor::FlagArtificial, getOrCreateVTablePtrType(Unit)); EltTys.push_back(VPTR); } /// getOrCreateRecordType - Emit record type's standalone debug info. llvm::DIType CGDebugInfo::getOrCreateRecordType(QualType RTy, SourceLocation Loc) { assert(DebugKind >= CodeGenOptions::LimitedDebugInfo); llvm::DIType T = getOrCreateType(RTy, getOrCreateFile(Loc)); return T; } /// getOrCreateInterfaceType - Emit an objective c interface type standalone /// debug info. llvm::DIType CGDebugInfo::getOrCreateInterfaceType(QualType D, SourceLocation Loc) { assert(DebugKind >= CodeGenOptions::LimitedDebugInfo); llvm::DIType T = getOrCreateType(D, getOrCreateFile(Loc)); RetainedTypes.push_back(D.getAsOpaquePtr()); return T; } void CGDebugInfo::completeType(const RecordDecl *RD) { if (DebugKind > CodeGenOptions::LimitedDebugInfo || !CGM.getLangOpts().CPlusPlus) completeRequiredType(RD); } void CGDebugInfo::completeRequiredType(const RecordDecl *RD) { if (const CXXRecordDecl *CXXDecl = dyn_cast(RD)) if (CXXDecl->isDynamicClass()) return; QualType Ty = CGM.getContext().getRecordType(RD); llvm::DIType T = getTypeOrNull(Ty); if (T && T.isForwardDecl()) completeClassData(RD); } void CGDebugInfo::completeClassData(const RecordDecl *RD) { if (DebugKind <= CodeGenOptions::DebugLineTablesOnly) return; QualType Ty = CGM.getContext().getRecordType(RD); void* TyPtr = Ty.getAsOpaquePtr(); if (CompletedTypeCache.count(TyPtr)) return; llvm::DIType Res = CreateTypeDefinition(Ty->castAs()); assert(!Res.isForwardDecl()); CompletedTypeCache[TyPtr] = Res; TypeCache[TyPtr] = Res; } /// CreateType - get structure or union type. llvm::DIType CGDebugInfo::CreateType(const RecordType *Ty) { RecordDecl *RD = Ty->getDecl(); const CXXRecordDecl *CXXDecl = dyn_cast(RD); // Always emit declarations for types that aren't required to be complete when // in limit-debug-info mode. If the type is later found to be required to be // complete this declaration will be upgraded to a definition by // `completeRequiredType`. // If the type is dynamic, only emit the definition in TUs that require class // data. This is handled by `completeClassData`. llvm::DICompositeType T(getTypeOrNull(QualType(Ty, 0))); // If we've already emitted the type, just use that, even if it's only a // declaration. The completeType, completeRequiredType, and completeClassData // callbacks will handle promoting the declaration to a definition. if (T || // Under -flimit-debug-info: (DebugKind <= CodeGenOptions::LimitedDebugInfo && // Emit only a forward declaration unless the type is required. ((!RD->isCompleteDefinitionRequired() && CGM.getLangOpts().CPlusPlus) || // If the class is dynamic, only emit a declaration. A definition will be // emitted whenever the vtable is emitted. (CXXDecl && CXXDecl->hasDefinition() && CXXDecl->isDynamicClass())))) { llvm::DIDescriptor FDContext = getContextDescriptor(cast(RD->getDeclContext())); if (!T) T = getOrCreateRecordFwdDecl(Ty, FDContext); return T; } return CreateTypeDefinition(Ty); } llvm::DIType CGDebugInfo::CreateTypeDefinition(const RecordType *Ty) { RecordDecl *RD = Ty->getDecl(); // Get overall information about the record type for the debug info. llvm::DIFile DefUnit = getOrCreateFile(RD->getLocation()); // Records and classes and unions can all be recursive. To handle them, we // first generate a debug descriptor for the struct as a forward declaration. // Then (if it is a definition) we go through and get debug info for all of // its members. Finally, we create a descriptor for the complete type (which // may refer to the forward decl if the struct is recursive) and replace all // uses of the forward declaration with the final definition. llvm::DICompositeType FwdDecl(getOrCreateLimitedType(Ty, DefUnit)); assert(FwdDecl.isCompositeType() && "The debug type of a RecordType should be a llvm::DICompositeType"); if (FwdDecl.isForwardDecl()) return FwdDecl; if (const CXXRecordDecl *CXXDecl = dyn_cast(RD)) CollectContainingType(CXXDecl, FwdDecl); // Push the struct on region stack. LexicalBlockStack.push_back(&*FwdDecl); RegionMap[Ty->getDecl()] = llvm::WeakVH(FwdDecl); // Add this to the completed-type cache while we're completing it recursively. CompletedTypeCache[QualType(Ty, 0).getAsOpaquePtr()] = FwdDecl; // Convert all the elements. SmallVector EltTys; // what about nested types? // Note: The split of CXXDecl information here is intentional, the // gdb tests will depend on a certain ordering at printout. The debug // information offsets are still correct if we merge them all together // though. const CXXRecordDecl *CXXDecl = dyn_cast(RD); if (CXXDecl) { CollectCXXBases(CXXDecl, DefUnit, EltTys, FwdDecl); CollectVTableInfo(CXXDecl, DefUnit, EltTys); } // Collect data fields (including static variables and any initializers). CollectRecordFields(RD, DefUnit, EltTys, FwdDecl); if (CXXDecl) CollectCXXMemberFunctions(CXXDecl, DefUnit, EltTys, FwdDecl); LexicalBlockStack.pop_back(); RegionMap.erase(Ty->getDecl()); llvm::DIArray Elements = DBuilder.getOrCreateArray(EltTys); FwdDecl.setTypeArray(Elements); RegionMap[Ty->getDecl()] = llvm::WeakVH(FwdDecl); return FwdDecl; } /// CreateType - get objective-c object type. llvm::DIType CGDebugInfo::CreateType(const ObjCObjectType *Ty, llvm::DIFile Unit) { // Ignore protocols. return getOrCreateType(Ty->getBaseType(), Unit); } /// \return true if Getter has the default name for the property PD. static bool hasDefaultGetterName(const ObjCPropertyDecl *PD, const ObjCMethodDecl *Getter) { assert(PD); if (!Getter) return true; assert(Getter->getDeclName().isObjCZeroArgSelector()); return PD->getName() == Getter->getDeclName().getObjCSelector().getNameForSlot(0); } /// \return true if Setter has the default name for the property PD. static bool hasDefaultSetterName(const ObjCPropertyDecl *PD, const ObjCMethodDecl *Setter) { assert(PD); if (!Setter) return true; assert(Setter->getDeclName().isObjCOneArgSelector()); return SelectorTable::constructSetterName(PD->getName()) == Setter->getDeclName().getObjCSelector().getNameForSlot(0); } /// CreateType - get objective-c interface type. llvm::DIType CGDebugInfo::CreateType(const ObjCInterfaceType *Ty, llvm::DIFile Unit) { ObjCInterfaceDecl *ID = Ty->getDecl(); if (!ID) return llvm::DIType(); // Get overall information about the record type for the debug info. llvm::DIFile DefUnit = getOrCreateFile(ID->getLocation()); unsigned Line = getLineNumber(ID->getLocation()); unsigned RuntimeLang = TheCU.getLanguage(); // If this is just a forward declaration return a special forward-declaration // debug type since we won't be able to lay out the entire type. ObjCInterfaceDecl *Def = ID->getDefinition(); if (!Def) { llvm::DIType FwdDecl = DBuilder.createForwardDecl(llvm::dwarf::DW_TAG_structure_type, ID->getName(), TheCU, DefUnit, Line, RuntimeLang); return FwdDecl; } ID = Def; // Bit size, align and offset of the type. uint64_t Size = CGM.getContext().getTypeSize(Ty); uint64_t Align = CGM.getContext().getTypeAlign(Ty); unsigned Flags = 0; if (ID->getImplementation()) Flags |= llvm::DIDescriptor::FlagObjcClassComplete; llvm::DICompositeType RealDecl = DBuilder.createStructType(Unit, ID->getName(), DefUnit, Line, Size, Align, Flags, llvm::DIType(), llvm::DIArray(), RuntimeLang); // Otherwise, insert it into the CompletedTypeCache so that recursive uses // will find it and we're emitting the complete type. QualType QualTy = QualType(Ty, 0); CompletedTypeCache[QualTy.getAsOpaquePtr()] = RealDecl; // Push the struct on region stack. LexicalBlockStack.push_back(static_cast(RealDecl)); RegionMap[Ty->getDecl()] = llvm::WeakVH(RealDecl); // Convert all the elements. SmallVector EltTys; ObjCInterfaceDecl *SClass = ID->getSuperClass(); if (SClass) { llvm::DIType SClassTy = getOrCreateType(CGM.getContext().getObjCInterfaceType(SClass), Unit); if (!SClassTy.isValid()) return llvm::DIType(); llvm::DIType InhTag = DBuilder.createInheritance(RealDecl, SClassTy, 0, 0); EltTys.push_back(InhTag); } // Create entries for all of the properties. for (ObjCContainerDecl::prop_iterator I = ID->prop_begin(), E = ID->prop_end(); I != E; ++I) { const ObjCPropertyDecl *PD = *I; SourceLocation Loc = PD->getLocation(); llvm::DIFile PUnit = getOrCreateFile(Loc); unsigned PLine = getLineNumber(Loc); ObjCMethodDecl *Getter = PD->getGetterMethodDecl(); ObjCMethodDecl *Setter = PD->getSetterMethodDecl(); llvm::MDNode *PropertyNode = DBuilder.createObjCProperty(PD->getName(), PUnit, PLine, hasDefaultGetterName(PD, Getter) ? "" : getSelectorName(PD->getGetterName()), hasDefaultSetterName(PD, Setter) ? "" : getSelectorName(PD->getSetterName()), PD->getPropertyAttributes(), getOrCreateType(PD->getType(), PUnit)); EltTys.push_back(PropertyNode); } const ASTRecordLayout &RL = CGM.getContext().getASTObjCInterfaceLayout(ID); unsigned FieldNo = 0; for (ObjCIvarDecl *Field = ID->all_declared_ivar_begin(); Field; Field = Field->getNextIvar(), ++FieldNo) { llvm::DIType FieldTy = getOrCreateType(Field->getType(), Unit); if (!FieldTy.isValid()) return llvm::DIType(); StringRef FieldName = Field->getName(); // Ignore unnamed fields. if (FieldName.empty()) continue; // Get the location for the field. llvm::DIFile FieldDefUnit = getOrCreateFile(Field->getLocation()); unsigned FieldLine = getLineNumber(Field->getLocation()); QualType FType = Field->getType(); uint64_t FieldSize = 0; unsigned FieldAlign = 0; if (!FType->isIncompleteArrayType()) { // Bit size, align and offset of the type. FieldSize = Field->isBitField() ? Field->getBitWidthValue(CGM.getContext()) : CGM.getContext().getTypeSize(FType); FieldAlign = CGM.getContext().getTypeAlign(FType); } uint64_t FieldOffset; if (CGM.getLangOpts().ObjCRuntime.isNonFragile()) { // We don't know the runtime offset of an ivar if we're using the // non-fragile ABI. For bitfields, use the bit offset into the first // byte of storage of the bitfield. For other fields, use zero. if (Field->isBitField()) { FieldOffset = CGM.getObjCRuntime().ComputeBitfieldBitOffset( CGM, ID, Field); FieldOffset %= CGM.getContext().getCharWidth(); } else { FieldOffset = 0; } } else { FieldOffset = RL.getFieldOffset(FieldNo); } unsigned Flags = 0; if (Field->getAccessControl() == ObjCIvarDecl::Protected) Flags = llvm::DIDescriptor::FlagProtected; else if (Field->getAccessControl() == ObjCIvarDecl::Private) Flags = llvm::DIDescriptor::FlagPrivate; llvm::MDNode *PropertyNode = NULL; if (ObjCImplementationDecl *ImpD = ID->getImplementation()) { if (ObjCPropertyImplDecl *PImpD = ImpD->FindPropertyImplIvarDecl(Field->getIdentifier())) { if (ObjCPropertyDecl *PD = PImpD->getPropertyDecl()) { SourceLocation Loc = PD->getLocation(); llvm::DIFile PUnit = getOrCreateFile(Loc); unsigned PLine = getLineNumber(Loc); ObjCMethodDecl *Getter = PD->getGetterMethodDecl(); ObjCMethodDecl *Setter = PD->getSetterMethodDecl(); PropertyNode = DBuilder.createObjCProperty(PD->getName(), PUnit, PLine, hasDefaultGetterName(PD, Getter) ? "" : getSelectorName(PD->getGetterName()), hasDefaultSetterName(PD, Setter) ? "" : getSelectorName(PD->getSetterName()), PD->getPropertyAttributes(), getOrCreateType(PD->getType(), PUnit)); } } } FieldTy = DBuilder.createObjCIVar(FieldName, FieldDefUnit, FieldLine, FieldSize, FieldAlign, FieldOffset, Flags, FieldTy, PropertyNode); EltTys.push_back(FieldTy); } llvm::DIArray Elements = DBuilder.getOrCreateArray(EltTys); RealDecl.setTypeArray(Elements); // If the implementation is not yet set, we do not want to mark it // as complete. An implementation may declare additional // private ivars that we would miss otherwise. if (ID->getImplementation() == 0) CompletedTypeCache.erase(QualTy.getAsOpaquePtr()); LexicalBlockStack.pop_back(); return RealDecl; } llvm::DIType CGDebugInfo::CreateType(const VectorType *Ty, llvm::DIFile Unit) { llvm::DIType ElementTy = getOrCreateType(Ty->getElementType(), Unit); int64_t Count = Ty->getNumElements(); if (Count == 0) // If number of elements are not known then this is an unbounded array. // Use Count == -1 to express such arrays. Count = -1; llvm::Value *Subscript = DBuilder.getOrCreateSubrange(0, Count); llvm::DIArray SubscriptArray = DBuilder.getOrCreateArray(Subscript); uint64_t Size = CGM.getContext().getTypeSize(Ty); uint64_t Align = CGM.getContext().getTypeAlign(Ty); return DBuilder.createVectorType(Size, Align, ElementTy, SubscriptArray); } llvm::DIType CGDebugInfo::CreateType(const ArrayType *Ty, llvm::DIFile Unit) { uint64_t Size; uint64_t Align; // FIXME: make getTypeAlign() aware of VLAs and incomplete array types if (const VariableArrayType *VAT = dyn_cast(Ty)) { Size = 0; Align = CGM.getContext().getTypeAlign(CGM.getContext().getBaseElementType(VAT)); } else if (Ty->isIncompleteArrayType()) { Size = 0; if (Ty->getElementType()->isIncompleteType()) Align = 0; else Align = CGM.getContext().getTypeAlign(Ty->getElementType()); } else if (Ty->isIncompleteType()) { Size = 0; Align = 0; } else { // Size and align of the whole array, not the element type. Size = CGM.getContext().getTypeSize(Ty); Align = CGM.getContext().getTypeAlign(Ty); } // Add the dimensions of the array. FIXME: This loses CV qualifiers from // interior arrays, do we care? Why aren't nested arrays represented the // obvious/recursive way? SmallVector Subscripts; QualType EltTy(Ty, 0); while ((Ty = dyn_cast(EltTy))) { // If the number of elements is known, then count is that number. Otherwise, // it's -1. This allows us to represent a subrange with an array of 0 // elements, like this: // // struct foo { // int x[0]; // }; int64_t Count = -1; // Count == -1 is an unbounded array. if (const ConstantArrayType *CAT = dyn_cast(Ty)) Count = CAT->getSize().getZExtValue(); // FIXME: Verify this is right for VLAs. Subscripts.push_back(DBuilder.getOrCreateSubrange(0, Count)); EltTy = Ty->getElementType(); } llvm::DIArray SubscriptArray = DBuilder.getOrCreateArray(Subscripts); llvm::DIType DbgTy = DBuilder.createArrayType(Size, Align, getOrCreateType(EltTy, Unit), SubscriptArray); return DbgTy; } llvm::DIType CGDebugInfo::CreateType(const LValueReferenceType *Ty, llvm::DIFile Unit) { return CreatePointerLikeType(llvm::dwarf::DW_TAG_reference_type, Ty, Ty->getPointeeType(), Unit); } llvm::DIType CGDebugInfo::CreateType(const RValueReferenceType *Ty, llvm::DIFile Unit) { return CreatePointerLikeType(llvm::dwarf::DW_TAG_rvalue_reference_type, Ty, Ty->getPointeeType(), Unit); } llvm::DIType CGDebugInfo::CreateType(const MemberPointerType *Ty, llvm::DIFile U) { llvm::DIType ClassType = getOrCreateType(QualType(Ty->getClass(), 0), U); if (!Ty->getPointeeType()->isFunctionType()) return DBuilder.createMemberPointerType( getOrCreateType(Ty->getPointeeType(), U), ClassType); return DBuilder.createMemberPointerType(getOrCreateInstanceMethodType( CGM.getContext().getPointerType( QualType(Ty->getClass(), Ty->getPointeeType().getCVRQualifiers())), Ty->getPointeeType()->getAs(), U), ClassType); } llvm::DIType CGDebugInfo::CreateType(const AtomicType *Ty, llvm::DIFile U) { // Ignore the atomic wrapping // FIXME: What is the correct representation? return getOrCreateType(Ty->getValueType(), U); } /// CreateEnumType - get enumeration type. llvm::DIType CGDebugInfo::CreateEnumType(const EnumType *Ty) { const EnumDecl *ED = Ty->getDecl(); uint64_t Size = 0; uint64_t Align = 0; if (!ED->getTypeForDecl()->isIncompleteType()) { Size = CGM.getContext().getTypeSize(ED->getTypeForDecl()); Align = CGM.getContext().getTypeAlign(ED->getTypeForDecl()); } SmallString<256> FullName = getUniqueTagTypeName(Ty, CGM, TheCU); // If this is just a forward declaration, construct an appropriately // marked node and just return it. if (!ED->getDefinition()) { llvm::DIDescriptor EDContext; EDContext = getContextDescriptor(cast(ED->getDeclContext())); llvm::DIFile DefUnit = getOrCreateFile(ED->getLocation()); unsigned Line = getLineNumber(ED->getLocation()); StringRef EDName = ED->getName(); return DBuilder.createForwardDecl(llvm::dwarf::DW_TAG_enumeration_type, EDName, EDContext, DefUnit, Line, 0, Size, Align, FullName); } // Create DIEnumerator elements for each enumerator. SmallVector Enumerators; ED = ED->getDefinition(); for (EnumDecl::enumerator_iterator Enum = ED->enumerator_begin(), EnumEnd = ED->enumerator_end(); Enum != EnumEnd; ++Enum) { Enumerators.push_back( DBuilder.createEnumerator(Enum->getName(), Enum->getInitVal().getSExtValue())); } // Return a CompositeType for the enum itself. llvm::DIArray EltArray = DBuilder.getOrCreateArray(Enumerators); llvm::DIFile DefUnit = getOrCreateFile(ED->getLocation()); unsigned Line = getLineNumber(ED->getLocation()); llvm::DIDescriptor EnumContext = getContextDescriptor(cast(ED->getDeclContext())); llvm::DIType ClassTy = ED->isFixed() ? getOrCreateType(ED->getIntegerType(), DefUnit) : llvm::DIType(); llvm::DIType DbgTy = DBuilder.createEnumerationType(EnumContext, ED->getName(), DefUnit, Line, Size, Align, EltArray, ClassTy, FullName); return DbgTy; } static QualType UnwrapTypeForDebugInfo(QualType T, const ASTContext &C) { Qualifiers Quals; do { Qualifiers InnerQuals = T.getLocalQualifiers(); // Qualifiers::operator+() doesn't like it if you add a Qualifier // that is already there. Quals += Qualifiers::removeCommonQualifiers(Quals, InnerQuals); Quals += InnerQuals; QualType LastT = T; switch (T->getTypeClass()) { default: return C.getQualifiedType(T.getTypePtr(), Quals); case Type::TemplateSpecialization: T = cast(T)->desugar(); break; case Type::TypeOfExpr: T = cast(T)->getUnderlyingExpr()->getType(); break; case Type::TypeOf: T = cast(T)->getUnderlyingType(); break; case Type::Decltype: T = cast(T)->getUnderlyingType(); break; case Type::UnaryTransform: T = cast(T)->getUnderlyingType(); break; case Type::Attributed: T = cast(T)->getEquivalentType(); break; case Type::Elaborated: T = cast(T)->getNamedType(); break; case Type::Paren: T = cast(T)->getInnerType(); break; case Type::SubstTemplateTypeParm: T = cast(T)->getReplacementType(); break; case Type::Auto: QualType DT = cast(T)->getDeducedType(); if (DT.isNull()) return T; T = DT; break; } assert(T != LastT && "Type unwrapping failed to unwrap!"); (void)LastT; } while (true); } /// getType - Get the type from the cache or return null type if it doesn't /// exist. llvm::DIType CGDebugInfo::getTypeOrNull(QualType Ty) { // Unwrap the type as needed for debug information. Ty = UnwrapTypeForDebugInfo(Ty, CGM.getContext()); // Check for existing entry. if (Ty->getTypeClass() == Type::ObjCInterface) { llvm::Value *V = getCachedInterfaceTypeOrNull(Ty); if (V) return llvm::DIType(cast(V)); else return llvm::DIType(); } llvm::DenseMap::iterator it = TypeCache.find(Ty.getAsOpaquePtr()); if (it != TypeCache.end()) { // Verify that the debug info still exists. if (llvm::Value *V = it->second) return llvm::DIType(cast(V)); } return llvm::DIType(); } /// getCompletedTypeOrNull - Get the type from the cache or return null if it /// doesn't exist. llvm::DIType CGDebugInfo::getCompletedTypeOrNull(QualType Ty) { // Unwrap the type as needed for debug information. Ty = UnwrapTypeForDebugInfo(Ty, CGM.getContext()); // Check for existing entry. llvm::Value *V = 0; llvm::DenseMap::iterator it = CompletedTypeCache.find(Ty.getAsOpaquePtr()); if (it != CompletedTypeCache.end()) V = it->second; else { V = getCachedInterfaceTypeOrNull(Ty); } // Verify that any cached debug info still exists. return llvm::DIType(cast_or_null(V)); } /// getCachedInterfaceTypeOrNull - Get the type from the interface /// cache, unless it needs to regenerated. Otherwise return null. llvm::Value *CGDebugInfo::getCachedInterfaceTypeOrNull(QualType Ty) { // Is there a cached interface that hasn't changed? llvm::DenseMap > ::iterator it1 = ObjCInterfaceCache.find(Ty.getAsOpaquePtr()); if (it1 != ObjCInterfaceCache.end()) if (ObjCInterfaceDecl* Decl = getObjCInterfaceDecl(Ty)) if (Checksum(Decl) == it1->second.second) // Return cached forward declaration. return it1->second.first; return 0; } /// getOrCreateType - Get the type from the cache or create a new /// one if necessary. llvm::DIType CGDebugInfo::getOrCreateType(QualType Ty, llvm::DIFile Unit) { if (Ty.isNull()) return llvm::DIType(); // Unwrap the type as needed for debug information. Ty = UnwrapTypeForDebugInfo(Ty, CGM.getContext()); if (llvm::DIType T = getCompletedTypeOrNull(Ty)) return T; // Otherwise create the type. llvm::DIType Res = CreateTypeNode(Ty, Unit); void* TyPtr = Ty.getAsOpaquePtr(); // And update the type cache. TypeCache[TyPtr] = Res; // FIXME: this getTypeOrNull call seems silly when we just inserted the type // into the cache - but getTypeOrNull has a special case for cached interface // types. We should probably just pull that out as a special case for the // "else" block below & skip the otherwise needless lookup. llvm::DIType TC = getTypeOrNull(Ty); if (TC && TC.isForwardDecl()) ReplaceMap.push_back(std::make_pair(TyPtr, static_cast(TC))); else if (ObjCInterfaceDecl* Decl = getObjCInterfaceDecl(Ty)) { // Interface types may have elements added to them by a // subsequent implementation or extension, so we keep them in // the ObjCInterfaceCache together with a checksum. Instead of // the (possibly) incomplete interface type, we return a forward // declaration that gets RAUW'd in CGDebugInfo::finalize(). std::pair &V = ObjCInterfaceCache[TyPtr]; if (V.first) return llvm::DIType(cast(V.first)); TC = DBuilder.createForwardDecl(llvm::dwarf::DW_TAG_structure_type, Decl->getName(), TheCU, Unit, getLineNumber(Decl->getLocation()), TheCU.getLanguage()); // Store the forward declaration in the cache. V.first = TC; V.second = Checksum(Decl); // Register the type for replacement in finalize(). ReplaceMap.push_back(std::make_pair(TyPtr, static_cast(TC))); return TC; } if (!Res.isForwardDecl()) CompletedTypeCache[TyPtr] = Res; return Res; } /// Currently the checksum of an interface includes the number of /// ivars and property accessors. unsigned CGDebugInfo::Checksum(const ObjCInterfaceDecl *ID) { // The assumption is that the number of ivars can only increase // monotonically, so it is safe to just use their current number as // a checksum. unsigned Sum = 0; for (const ObjCIvarDecl *Ivar = ID->all_declared_ivar_begin(); Ivar != 0; Ivar = Ivar->getNextIvar()) ++Sum; return Sum; } ObjCInterfaceDecl *CGDebugInfo::getObjCInterfaceDecl(QualType Ty) { switch (Ty->getTypeClass()) { case Type::ObjCObjectPointer: return getObjCInterfaceDecl(cast(Ty) ->getPointeeType()); case Type::ObjCInterface: return cast(Ty)->getDecl(); default: return 0; } } /// CreateTypeNode - Create a new debug type node. llvm::DIType CGDebugInfo::CreateTypeNode(QualType Ty, llvm::DIFile Unit) { // Handle qualifiers, which recursively handles what they refer to. if (Ty.hasLocalQualifiers()) return CreateQualifiedType(Ty, Unit); const char *Diag = 0; // Work out details of type. switch (Ty->getTypeClass()) { #define TYPE(Class, Base) #define ABSTRACT_TYPE(Class, Base) #define NON_CANONICAL_TYPE(Class, Base) #define DEPENDENT_TYPE(Class, Base) case Type::Class: #include "clang/AST/TypeNodes.def" llvm_unreachable("Dependent types cannot show up in debug information"); case Type::ExtVector: case Type::Vector: return CreateType(cast(Ty), Unit); case Type::ObjCObjectPointer: return CreateType(cast(Ty), Unit); case Type::ObjCObject: return CreateType(cast(Ty), Unit); case Type::ObjCInterface: return CreateType(cast(Ty), Unit); case Type::Builtin: return CreateType(cast(Ty)); case Type::Complex: return CreateType(cast(Ty)); case Type::Pointer: return CreateType(cast(Ty), Unit); case Type::Decayed: // Decayed types are just pointers in LLVM and DWARF. return CreateType( cast(cast(Ty)->getDecayedType()), Unit); case Type::BlockPointer: return CreateType(cast(Ty), Unit); case Type::Typedef: return CreateType(cast(Ty), Unit); case Type::Record: return CreateType(cast(Ty)); case Type::Enum: return CreateEnumType(cast(Ty)); case Type::FunctionProto: case Type::FunctionNoProto: return CreateType(cast(Ty), Unit); case Type::ConstantArray: case Type::VariableArray: case Type::IncompleteArray: return CreateType(cast(Ty), Unit); case Type::LValueReference: return CreateType(cast(Ty), Unit); case Type::RValueReference: return CreateType(cast(Ty), Unit); case Type::MemberPointer: return CreateType(cast(Ty), Unit); case Type::Atomic: return CreateType(cast(Ty), Unit); case Type::Attributed: case Type::TemplateSpecialization: case Type::Elaborated: case Type::Paren: case Type::SubstTemplateTypeParm: case Type::TypeOfExpr: case Type::TypeOf: case Type::Decltype: case Type::UnaryTransform: case Type::PackExpansion: llvm_unreachable("type should have been unwrapped!"); case Type::Auto: Diag = "auto"; break; } assert(Diag && "Fall through without a diagnostic?"); unsigned DiagID = CGM.getDiags().getCustomDiagID(DiagnosticsEngine::Error, "debug information for %0 is not yet supported"); CGM.getDiags().Report(DiagID) << Diag; return llvm::DIType(); } /// getOrCreateLimitedType - Get the type from the cache or create a new /// limited type if necessary. llvm::DIType CGDebugInfo::getOrCreateLimitedType(const RecordType *Ty, llvm::DIFile Unit) { QualType QTy(Ty, 0); llvm::DICompositeType T(getTypeOrNull(QTy)); // We may have cached a forward decl when we could have created // a non-forward decl. Go ahead and create a non-forward decl // now. if (T && !T.isForwardDecl()) return T; // Otherwise create the type. llvm::DICompositeType Res = CreateLimitedType(Ty); // Propagate members from the declaration to the definition // CreateType(const RecordType*) will overwrite this with the members in the // correct order if the full type is needed. Res.setTypeArray(T.getTypeArray()); if (T && T.isForwardDecl()) ReplaceMap.push_back( std::make_pair(QTy.getAsOpaquePtr(), static_cast(T))); // And update the type cache. TypeCache[QTy.getAsOpaquePtr()] = Res; return Res; } // TODO: Currently used for context chains when limiting debug info. llvm::DICompositeType CGDebugInfo::CreateLimitedType(const RecordType *Ty) { RecordDecl *RD = Ty->getDecl(); // Get overall information about the record type for the debug info. llvm::DIFile DefUnit = getOrCreateFile(RD->getLocation()); unsigned Line = getLineNumber(RD->getLocation()); StringRef RDName = getClassName(RD); llvm::DIDescriptor RDContext = getContextDescriptor(cast(RD->getDeclContext())); // If we ended up creating the type during the context chain construction, // just return that. // FIXME: this could be dealt with better if the type was recorded as // completed before we started this (see the CompletedTypeCache usage in // CGDebugInfo::CreateTypeDefinition(const RecordType*) - that would need to // be pushed to before context creation, but after it was known to be // destined for completion (might still have an issue if this caller only // required a declaration but the context construction ended up creating a // definition) llvm::DICompositeType T(getTypeOrNull(CGM.getContext().getRecordType(RD))); if (T && (!T.isForwardDecl() || !RD->getDefinition())) return T; // If this is just a forward or incomplete declaration, construct an // appropriately marked node and just return it. const RecordDecl *D = RD->getDefinition(); if (!D || !D->isCompleteDefinition()) return getOrCreateRecordFwdDecl(Ty, RDContext); uint64_t Size = CGM.getContext().getTypeSize(Ty); uint64_t Align = CGM.getContext().getTypeAlign(Ty); llvm::DICompositeType RealDecl; SmallString<256> FullName = getUniqueTagTypeName(Ty, CGM, TheCU); if (RD->isUnion()) RealDecl = DBuilder.createUnionType(RDContext, RDName, DefUnit, Line, Size, Align, 0, llvm::DIArray(), 0, FullName); else if (RD->isClass()) { // FIXME: This could be a struct type giving a default visibility different // than C++ class type, but needs llvm metadata changes first. RealDecl = DBuilder.createClassType(RDContext, RDName, DefUnit, Line, Size, Align, 0, 0, llvm::DIType(), llvm::DIArray(), llvm::DIType(), llvm::DIArray(), FullName); } else RealDecl = DBuilder.createStructType(RDContext, RDName, DefUnit, Line, Size, Align, 0, llvm::DIType(), llvm::DIArray(), 0, llvm::DIType(), FullName); RegionMap[Ty->getDecl()] = llvm::WeakVH(RealDecl); TypeCache[QualType(Ty, 0).getAsOpaquePtr()] = RealDecl; if (const ClassTemplateSpecializationDecl *TSpecial = dyn_cast(RD)) RealDecl.setTypeArray(llvm::DIArray(), CollectCXXTemplateParams(TSpecial, DefUnit)); return RealDecl; } void CGDebugInfo::CollectContainingType(const CXXRecordDecl *RD, llvm::DICompositeType RealDecl) { // A class's primary base or the class itself contains the vtable. llvm::DICompositeType ContainingType; const ASTRecordLayout &RL = CGM.getContext().getASTRecordLayout(RD); if (const CXXRecordDecl *PBase = RL.getPrimaryBase()) { // Seek non virtual primary base root. while (1) { const ASTRecordLayout &BRL = CGM.getContext().getASTRecordLayout(PBase); const CXXRecordDecl *PBT = BRL.getPrimaryBase(); if (PBT && !BRL.isPrimaryBaseVirtual()) PBase = PBT; else break; } ContainingType = llvm::DICompositeType( getOrCreateType(QualType(PBase->getTypeForDecl(), 0), getOrCreateFile(RD->getLocation()))); } else if (RD->isDynamicClass()) ContainingType = RealDecl; RealDecl.setContainingType(ContainingType); } /// CreateMemberType - Create new member and increase Offset by FType's size. llvm::DIType CGDebugInfo::CreateMemberType(llvm::DIFile Unit, QualType FType, StringRef Name, uint64_t *Offset) { llvm::DIType FieldTy = CGDebugInfo::getOrCreateType(FType, Unit); uint64_t FieldSize = CGM.getContext().getTypeSize(FType); unsigned FieldAlign = CGM.getContext().getTypeAlign(FType); llvm::DIType Ty = DBuilder.createMemberType(Unit, Name, Unit, 0, FieldSize, FieldAlign, *Offset, 0, FieldTy); *Offset += FieldSize; return Ty; } llvm::DIDescriptor CGDebugInfo::getDeclarationOrDefinition(const Decl *D) { // We only need a declaration (not a definition) of the type - so use whatever // we would otherwise do to get a type for a pointee. (forward declarations in // limited debug info, full definitions (if the type definition is available) // in unlimited debug info) if (const TypeDecl *TD = dyn_cast(D)) return getOrCreateType(CGM.getContext().getTypeDeclType(TD), getOrCreateFile(TD->getLocation())); // Otherwise fall back to a fairly rudimentary cache of existing declarations. // This doesn't handle providing declarations (for functions or variables) for // entities without definitions in this TU, nor when the definition proceeds // the call to this function. // FIXME: This should be split out into more specific maps with support for // emitting forward declarations and merging definitions with declarations, // the same way as we do for types. llvm::DenseMap::iterator I = DeclCache.find(D->getCanonicalDecl()); if (I == DeclCache.end()) return llvm::DIDescriptor(); llvm::Value *V = I->second; return llvm::DIDescriptor(dyn_cast_or_null(V)); } /// getFunctionDeclaration - Return debug info descriptor to describe method /// declaration for the given method definition. llvm::DISubprogram CGDebugInfo::getFunctionDeclaration(const Decl *D) { if (!D || DebugKind == CodeGenOptions::DebugLineTablesOnly) return llvm::DISubprogram(); const FunctionDecl *FD = dyn_cast(D); if (!FD) return llvm::DISubprogram(); // Setup context. llvm::DIScope S = getContextDescriptor(cast(D->getDeclContext())); llvm::DenseMap::iterator MI = SPCache.find(FD->getCanonicalDecl()); if (MI == SPCache.end()) { if (const CXXMethodDecl *MD = dyn_cast(FD->getCanonicalDecl())) { llvm::DICompositeType T(S); llvm::DISubprogram SP = CreateCXXMemberFunction(MD, getOrCreateFile(MD->getLocation()), T); T.addMember(SP); return SP; } } if (MI != SPCache.end()) { llvm::Value *V = MI->second; llvm::DISubprogram SP(dyn_cast_or_null(V)); if (SP.isSubprogram() && !SP.isDefinition()) return SP; } for (FunctionDecl::redecl_iterator I = FD->redecls_begin(), E = FD->redecls_end(); I != E; ++I) { const FunctionDecl *NextFD = *I; llvm::DenseMap::iterator MI = SPCache.find(NextFD->getCanonicalDecl()); if (MI != SPCache.end()) { llvm::Value *V = MI->second; llvm::DISubprogram SP(dyn_cast_or_null(V)); if (SP.isSubprogram() && !SP.isDefinition()) return SP; } } return llvm::DISubprogram(); } // getOrCreateFunctionType - Construct DIType. If it is a c++ method, include // implicit parameter "this". llvm::DICompositeType CGDebugInfo::getOrCreateFunctionType(const Decl *D, QualType FnType, llvm::DIFile F) { if (!D || DebugKind == CodeGenOptions::DebugLineTablesOnly) // Create fake but valid subroutine type. Otherwise // llvm::DISubprogram::Verify() would return false, and // subprogram DIE will miss DW_AT_decl_file and // DW_AT_decl_line fields. return DBuilder.createSubroutineType(F, DBuilder.getOrCreateArray(None)); if (const CXXMethodDecl *Method = dyn_cast(D)) return getOrCreateMethodType(Method, F); if (const ObjCMethodDecl *OMethod = dyn_cast(D)) { // Add "self" and "_cmd" SmallVector Elts; // First element is always return type. For 'void' functions it is NULL. QualType ResultTy = OMethod->getResultType(); // Replace the instancetype keyword with the actual type. if (ResultTy == CGM.getContext().getObjCInstanceType()) ResultTy = CGM.getContext().getPointerType( QualType(OMethod->getClassInterface()->getTypeForDecl(), 0)); Elts.push_back(getOrCreateType(ResultTy, F)); // "self" pointer is always first argument. QualType SelfDeclTy = OMethod->getSelfDecl()->getType(); llvm::DIType SelfTy = getOrCreateType(SelfDeclTy, F); Elts.push_back(CreateSelfType(SelfDeclTy, SelfTy)); // "_cmd" pointer is always second argument. llvm::DIType CmdTy = getOrCreateType(OMethod->getCmdDecl()->getType(), F); Elts.push_back(DBuilder.createArtificialType(CmdTy)); // Get rest of the arguments. for (ObjCMethodDecl::param_const_iterator PI = OMethod->param_begin(), PE = OMethod->param_end(); PI != PE; ++PI) Elts.push_back(getOrCreateType((*PI)->getType(), F)); llvm::DIArray EltTypeArray = DBuilder.getOrCreateArray(Elts); return DBuilder.createSubroutineType(F, EltTypeArray); } // Variadic function. if (const FunctionDecl *FD = dyn_cast(D)) if (FD->isVariadic()) { SmallVector EltTys; EltTys.push_back(getOrCreateType(FD->getResultType(), F)); if (const FunctionProtoType *FPT = dyn_cast(FnType)) for (unsigned i = 0, e = FPT->getNumArgs(); i != e; ++i) EltTys.push_back(getOrCreateType(FPT->getArgType(i), F)); EltTys.push_back(DBuilder.createUnspecifiedParameter()); llvm::DIArray EltTypeArray = DBuilder.getOrCreateArray(EltTys); return DBuilder.createSubroutineType(F, EltTypeArray); } return llvm::DICompositeType(getOrCreateType(FnType, F)); } /// EmitFunctionStart - Constructs the debug code for entering a function. void CGDebugInfo::EmitFunctionStart(GlobalDecl GD, QualType FnType, llvm::Function *Fn, CGBuilderTy &Builder) { StringRef Name; StringRef LinkageName; FnBeginRegionCount.push_back(LexicalBlockStack.size()); const Decl *D = GD.getDecl(); // Function may lack declaration in source code if it is created by Clang // CodeGen (examples: _GLOBAL__I_a, __cxx_global_array_dtor, thunk). bool HasDecl = (D != 0); // Use the location of the declaration. SourceLocation Loc; if (HasDecl) Loc = D->getLocation(); unsigned Flags = 0; llvm::DIFile Unit = getOrCreateFile(Loc); llvm::DIDescriptor FDContext(Unit); llvm::DIArray TParamsArray; if (!HasDecl) { // Use llvm function name. LinkageName = Fn->getName(); } else if (const FunctionDecl *FD = dyn_cast(D)) { // If there is a DISubprogram for this function available then use it. llvm::DenseMap::iterator FI = SPCache.find(FD->getCanonicalDecl()); if (FI != SPCache.end()) { llvm::Value *V = FI->second; llvm::DIDescriptor SP(dyn_cast_or_null(V)); if (SP.isSubprogram() && llvm::DISubprogram(SP).isDefinition()) { llvm::MDNode *SPN = SP; LexicalBlockStack.push_back(SPN); RegionMap[D] = llvm::WeakVH(SP); return; } } Name = getFunctionName(FD); // Use mangled name as linkage name for C/C++ functions. if (FD->hasPrototype()) { LinkageName = CGM.getMangledName(GD); Flags |= llvm::DIDescriptor::FlagPrototyped; } // No need to replicate the linkage name if it isn't different from the // subprogram name, no need to have it at all unless coverage is enabled or // debug is set to more than just line tables. if (LinkageName == Name || (!CGM.getCodeGenOpts().EmitGcovArcs && !CGM.getCodeGenOpts().EmitGcovNotes && DebugKind <= CodeGenOptions::DebugLineTablesOnly)) LinkageName = StringRef(); if (DebugKind >= CodeGenOptions::LimitedDebugInfo) { if (const NamespaceDecl *NSDecl = dyn_cast_or_null(FD->getDeclContext())) FDContext = getOrCreateNameSpace(NSDecl); else if (const RecordDecl *RDecl = dyn_cast_or_null(FD->getDeclContext())) FDContext = getContextDescriptor(cast(RDecl)); // Collect template parameters. TParamsArray = CollectFunctionTemplateParams(FD, Unit); } } else if (const ObjCMethodDecl *OMD = dyn_cast(D)) { Name = getObjCMethodName(OMD); Flags |= llvm::DIDescriptor::FlagPrototyped; } else { // Use llvm function name. Name = Fn->getName(); Flags |= llvm::DIDescriptor::FlagPrototyped; } if (!Name.empty() && Name[0] == '\01') Name = Name.substr(1); unsigned LineNo = getLineNumber(Loc); if (!HasDecl || D->isImplicit()) Flags |= llvm::DIDescriptor::FlagArtificial; llvm::DISubprogram SP = DBuilder.createFunction(FDContext, Name, LinkageName, Unit, LineNo, getOrCreateFunctionType(D, FnType, Unit), Fn->hasInternalLinkage(), true /*definition*/, getLineNumber(CurLoc), Flags, CGM.getLangOpts().Optimize, Fn, TParamsArray, getFunctionDeclaration(D)); if (HasDecl) DeclCache.insert(std::make_pair(D->getCanonicalDecl(), llvm::WeakVH(SP))); // Push function on region stack. llvm::MDNode *SPN = SP; LexicalBlockStack.push_back(SPN); if (HasDecl) RegionMap[D] = llvm::WeakVH(SP); } /// EmitLocation - Emit metadata to indicate a change in line/column /// information in the source file. If the location is invalid, the /// previous location will be reused. void CGDebugInfo::EmitLocation(CGBuilderTy &Builder, SourceLocation Loc, bool ForceColumnInfo) { // Update our current location setLocation(Loc); if (CurLoc.isInvalid() || CurLoc.isMacroID()) return; // Don't bother if things are the same as last time. SourceManager &SM = CGM.getContext().getSourceManager(); if (CurLoc == PrevLoc || SM.getExpansionLoc(CurLoc) == SM.getExpansionLoc(PrevLoc)) // New Builder may not be in sync with CGDebugInfo. if (!Builder.getCurrentDebugLocation().isUnknown() && Builder.getCurrentDebugLocation().getScope(CGM.getLLVMContext()) == LexicalBlockStack.back()) return; // Update last state. PrevLoc = CurLoc; llvm::MDNode *Scope = LexicalBlockStack.back(); Builder.SetCurrentDebugLocation(llvm::DebugLoc::get (getLineNumber(CurLoc), getColumnNumber(CurLoc, ForceColumnInfo), Scope)); } /// CreateLexicalBlock - Creates a new lexical block node and pushes it on /// the stack. void CGDebugInfo::CreateLexicalBlock(SourceLocation Loc) { llvm::DIDescriptor D = DBuilder.createLexicalBlock(LexicalBlockStack.empty() ? llvm::DIDescriptor() : llvm::DIDescriptor(LexicalBlockStack.back()), getOrCreateFile(CurLoc), getLineNumber(CurLoc), getColumnNumber(CurLoc)); llvm::MDNode *DN = D; LexicalBlockStack.push_back(DN); } /// EmitLexicalBlockStart - Constructs the debug code for entering a declarative /// region - beginning of a DW_TAG_lexical_block. void CGDebugInfo::EmitLexicalBlockStart(CGBuilderTy &Builder, SourceLocation Loc) { // Set our current location. setLocation(Loc); // Create a new lexical block and push it on the stack. CreateLexicalBlock(Loc); // Emit a line table change for the current location inside the new scope. Builder.SetCurrentDebugLocation(llvm::DebugLoc::get(getLineNumber(Loc), getColumnNumber(Loc), LexicalBlockStack.back())); } /// EmitLexicalBlockEnd - Constructs the debug code for exiting a declarative /// region - end of a DW_TAG_lexical_block. void CGDebugInfo::EmitLexicalBlockEnd(CGBuilderTy &Builder, SourceLocation Loc) { assert(!LexicalBlockStack.empty() && "Region stack mismatch, stack empty!"); // Provide an entry in the line table for the end of the block. EmitLocation(Builder, Loc); LexicalBlockStack.pop_back(); } /// EmitFunctionEnd - Constructs the debug code for exiting a function. void CGDebugInfo::EmitFunctionEnd(CGBuilderTy &Builder) { assert(!LexicalBlockStack.empty() && "Region stack mismatch, stack empty!"); unsigned RCount = FnBeginRegionCount.back(); assert(RCount <= LexicalBlockStack.size() && "Region stack mismatch"); // Pop all regions for this function. while (LexicalBlockStack.size() != RCount) EmitLexicalBlockEnd(Builder, CurLoc); FnBeginRegionCount.pop_back(); } // EmitTypeForVarWithBlocksAttr - Build up structure info for the byref. // See BuildByRefType. llvm::DIType CGDebugInfo::EmitTypeForVarWithBlocksAttr(const VarDecl *VD, uint64_t *XOffset) { SmallVector EltTys; QualType FType; uint64_t FieldSize, FieldOffset; unsigned FieldAlign; llvm::DIFile Unit = getOrCreateFile(VD->getLocation()); QualType Type = VD->getType(); FieldOffset = 0; FType = CGM.getContext().getPointerType(CGM.getContext().VoidTy); EltTys.push_back(CreateMemberType(Unit, FType, "__isa", &FieldOffset)); EltTys.push_back(CreateMemberType(Unit, FType, "__forwarding", &FieldOffset)); FType = CGM.getContext().IntTy; EltTys.push_back(CreateMemberType(Unit, FType, "__flags", &FieldOffset)); EltTys.push_back(CreateMemberType(Unit, FType, "__size", &FieldOffset)); bool HasCopyAndDispose = CGM.getContext().BlockRequiresCopying(Type, VD); if (HasCopyAndDispose) { FType = CGM.getContext().getPointerType(CGM.getContext().VoidTy); EltTys.push_back(CreateMemberType(Unit, FType, "__copy_helper", &FieldOffset)); EltTys.push_back(CreateMemberType(Unit, FType, "__destroy_helper", &FieldOffset)); } bool HasByrefExtendedLayout; Qualifiers::ObjCLifetime Lifetime; if (CGM.getContext().getByrefLifetime(Type, Lifetime, HasByrefExtendedLayout) && HasByrefExtendedLayout) { FType = CGM.getContext().getPointerType(CGM.getContext().VoidTy); EltTys.push_back(CreateMemberType(Unit, FType, "__byref_variable_layout", &FieldOffset)); } CharUnits Align = CGM.getContext().getDeclAlign(VD); if (Align > CGM.getContext().toCharUnitsFromBits( CGM.getTarget().getPointerAlign(0))) { CharUnits FieldOffsetInBytes = CGM.getContext().toCharUnitsFromBits(FieldOffset); CharUnits AlignedOffsetInBytes = FieldOffsetInBytes.RoundUpToAlignment(Align); CharUnits NumPaddingBytes = AlignedOffsetInBytes - FieldOffsetInBytes; if (NumPaddingBytes.isPositive()) { llvm::APInt pad(32, NumPaddingBytes.getQuantity()); FType = CGM.getContext().getConstantArrayType(CGM.getContext().CharTy, pad, ArrayType::Normal, 0); EltTys.push_back(CreateMemberType(Unit, FType, "", &FieldOffset)); } } FType = Type; llvm::DIType FieldTy = CGDebugInfo::getOrCreateType(FType, Unit); FieldSize = CGM.getContext().getTypeSize(FType); FieldAlign = CGM.getContext().toBits(Align); *XOffset = FieldOffset; FieldTy = DBuilder.createMemberType(Unit, VD->getName(), Unit, 0, FieldSize, FieldAlign, FieldOffset, 0, FieldTy); EltTys.push_back(FieldTy); FieldOffset += FieldSize; llvm::DIArray Elements = DBuilder.getOrCreateArray(EltTys); unsigned Flags = llvm::DIDescriptor::FlagBlockByrefStruct; return DBuilder.createStructType(Unit, "", Unit, 0, FieldOffset, 0, Flags, llvm::DIType(), Elements); } /// EmitDeclare - Emit local variable declaration debug info. void CGDebugInfo::EmitDeclare(const VarDecl *VD, unsigned Tag, llvm::Value *Storage, unsigned ArgNo, CGBuilderTy &Builder) { assert(DebugKind >= CodeGenOptions::LimitedDebugInfo); assert(!LexicalBlockStack.empty() && "Region stack mismatch, stack empty!"); bool Unwritten = VD->isImplicit() || (isa(VD->getDeclContext()) && cast(VD->getDeclContext())->isImplicit()); llvm::DIFile Unit; if (!Unwritten) Unit = getOrCreateFile(VD->getLocation()); llvm::DIType Ty; uint64_t XOffset = 0; if (VD->hasAttr()) Ty = EmitTypeForVarWithBlocksAttr(VD, &XOffset); else Ty = getOrCreateType(VD->getType(), Unit); // If there is no debug info for this type then do not emit debug info // for this variable. if (!Ty) return; // Get location information. unsigned Line = 0; unsigned Column = 0; if (!Unwritten) { Line = getLineNumber(VD->getLocation()); Column = getColumnNumber(VD->getLocation()); } unsigned Flags = 0; if (VD->isImplicit()) Flags |= llvm::DIDescriptor::FlagArtificial; // If this is the first argument and it is implicit then // give it an object pointer flag. // FIXME: There has to be a better way to do this, but for static // functions there won't be an implicit param at arg1 and // otherwise it is 'self' or 'this'. if (isa(VD) && ArgNo == 1) Flags |= llvm::DIDescriptor::FlagObjectPointer; if (llvm::Argument *Arg = dyn_cast(Storage)) if (Arg->getType()->isPointerTy() && !Arg->hasByValAttr() && !VD->getType()->isPointerType()) Flags |= llvm::DIDescriptor::FlagIndirectVariable; llvm::MDNode *Scope = LexicalBlockStack.back(); StringRef Name = VD->getName(); if (!Name.empty()) { if (VD->hasAttr()) { CharUnits offset = CharUnits::fromQuantity(32); SmallVector addr; llvm::Type *Int64Ty = CGM.Int64Ty; addr.push_back(llvm::ConstantInt::get(Int64Ty, llvm::DIBuilder::OpPlus)); // offset of __forwarding field offset = CGM.getContext().toCharUnitsFromBits( CGM.getTarget().getPointerWidth(0)); addr.push_back(llvm::ConstantInt::get(Int64Ty, offset.getQuantity())); addr.push_back(llvm::ConstantInt::get(Int64Ty, llvm::DIBuilder::OpDeref)); addr.push_back(llvm::ConstantInt::get(Int64Ty, llvm::DIBuilder::OpPlus)); // offset of x field offset = CGM.getContext().toCharUnitsFromBits(XOffset); addr.push_back(llvm::ConstantInt::get(Int64Ty, offset.getQuantity())); // Create the descriptor for the variable. llvm::DIVariable D = DBuilder.createComplexVariable(Tag, llvm::DIDescriptor(Scope), VD->getName(), Unit, Line, Ty, addr, ArgNo); // Insert an llvm.dbg.declare into the current block. llvm::Instruction *Call = DBuilder.insertDeclare(Storage, D, Builder.GetInsertBlock()); Call->setDebugLoc(llvm::DebugLoc::get(Line, Column, Scope)); return; } else if (isa(VD->getType())) Flags |= llvm::DIDescriptor::FlagIndirectVariable; } else if (const RecordType *RT = dyn_cast(VD->getType())) { // If VD is an anonymous union then Storage represents value for // all union fields. const RecordDecl *RD = cast(RT->getDecl()); if (RD->isUnion() && RD->isAnonymousStructOrUnion()) { for (RecordDecl::field_iterator I = RD->field_begin(), E = RD->field_end(); I != E; ++I) { FieldDecl *Field = *I; llvm::DIType FieldTy = getOrCreateType(Field->getType(), Unit); StringRef FieldName = Field->getName(); // Ignore unnamed fields. Do not ignore unnamed records. if (FieldName.empty() && !isa(Field->getType())) continue; // Use VarDecl's Tag, Scope and Line number. llvm::DIVariable D = DBuilder.createLocalVariable(Tag, llvm::DIDescriptor(Scope), FieldName, Unit, Line, FieldTy, CGM.getLangOpts().Optimize, Flags, ArgNo); // Insert an llvm.dbg.declare into the current block. llvm::Instruction *Call = DBuilder.insertDeclare(Storage, D, Builder.GetInsertBlock()); Call->setDebugLoc(llvm::DebugLoc::get(Line, Column, Scope)); } return; } } // Create the descriptor for the variable. llvm::DIVariable D = DBuilder.createLocalVariable(Tag, llvm::DIDescriptor(Scope), Name, Unit, Line, Ty, CGM.getLangOpts().Optimize, Flags, ArgNo); // Insert an llvm.dbg.declare into the current block. llvm::Instruction *Call = DBuilder.insertDeclare(Storage, D, Builder.GetInsertBlock()); Call->setDebugLoc(llvm::DebugLoc::get(Line, Column, Scope)); } void CGDebugInfo::EmitDeclareOfAutoVariable(const VarDecl *VD, llvm::Value *Storage, CGBuilderTy &Builder) { assert(DebugKind >= CodeGenOptions::LimitedDebugInfo); EmitDeclare(VD, llvm::dwarf::DW_TAG_auto_variable, Storage, 0, Builder); } /// Look up the completed type for a self pointer in the TypeCache and /// create a copy of it with the ObjectPointer and Artificial flags /// set. If the type is not cached, a new one is created. This should /// never happen though, since creating a type for the implicit self /// argument implies that we already parsed the interface definition /// and the ivar declarations in the implementation. llvm::DIType CGDebugInfo::CreateSelfType(const QualType &QualTy, llvm::DIType Ty) { llvm::DIType CachedTy = getTypeOrNull(QualTy); if (CachedTy) Ty = CachedTy; else DEBUG(llvm::dbgs() << "No cached type for self."); return DBuilder.createObjectPointerType(Ty); } void CGDebugInfo::EmitDeclareOfBlockDeclRefVariable(const VarDecl *VD, llvm::Value *Storage, CGBuilderTy &Builder, const CGBlockInfo &blockInfo) { assert(DebugKind >= CodeGenOptions::LimitedDebugInfo); assert(!LexicalBlockStack.empty() && "Region stack mismatch, stack empty!"); if (Builder.GetInsertBlock() == 0) return; bool isByRef = VD->hasAttr(); uint64_t XOffset = 0; llvm::DIFile Unit = getOrCreateFile(VD->getLocation()); llvm::DIType Ty; if (isByRef) Ty = EmitTypeForVarWithBlocksAttr(VD, &XOffset); else Ty = getOrCreateType(VD->getType(), Unit); // Self is passed along as an implicit non-arg variable in a // block. Mark it as the object pointer. if (isa(VD) && VD->getName() == "self") Ty = CreateSelfType(VD->getType(), Ty); // Get location information. unsigned Line = getLineNumber(VD->getLocation()); unsigned Column = getColumnNumber(VD->getLocation()); const llvm::DataLayout &target = CGM.getDataLayout(); CharUnits offset = CharUnits::fromQuantity( target.getStructLayout(blockInfo.StructureType) ->getElementOffset(blockInfo.getCapture(VD).getIndex())); SmallVector addr; llvm::Type *Int64Ty = CGM.Int64Ty; if (isa(Storage)) addr.push_back(llvm::ConstantInt::get(Int64Ty, llvm::DIBuilder::OpDeref)); addr.push_back(llvm::ConstantInt::get(Int64Ty, llvm::DIBuilder::OpPlus)); addr.push_back(llvm::ConstantInt::get(Int64Ty, offset.getQuantity())); if (isByRef) { addr.push_back(llvm::ConstantInt::get(Int64Ty, llvm::DIBuilder::OpDeref)); addr.push_back(llvm::ConstantInt::get(Int64Ty, llvm::DIBuilder::OpPlus)); // offset of __forwarding field offset = CGM.getContext() .toCharUnitsFromBits(target.getPointerSizeInBits(0)); addr.push_back(llvm::ConstantInt::get(Int64Ty, offset.getQuantity())); addr.push_back(llvm::ConstantInt::get(Int64Ty, llvm::DIBuilder::OpDeref)); addr.push_back(llvm::ConstantInt::get(Int64Ty, llvm::DIBuilder::OpPlus)); // offset of x field offset = CGM.getContext().toCharUnitsFromBits(XOffset); addr.push_back(llvm::ConstantInt::get(Int64Ty, offset.getQuantity())); } // Create the descriptor for the variable. llvm::DIVariable D = DBuilder.createComplexVariable(llvm::dwarf::DW_TAG_auto_variable, llvm::DIDescriptor(LexicalBlockStack.back()), VD->getName(), Unit, Line, Ty, addr); // Insert an llvm.dbg.declare into the current block. llvm::Instruction *Call = DBuilder.insertDeclare(Storage, D, Builder.GetInsertPoint()); Call->setDebugLoc(llvm::DebugLoc::get(Line, Column, LexicalBlockStack.back())); } /// EmitDeclareOfArgVariable - Emit call to llvm.dbg.declare for an argument /// variable declaration. void CGDebugInfo::EmitDeclareOfArgVariable(const VarDecl *VD, llvm::Value *AI, unsigned ArgNo, CGBuilderTy &Builder) { assert(DebugKind >= CodeGenOptions::LimitedDebugInfo); EmitDeclare(VD, llvm::dwarf::DW_TAG_arg_variable, AI, ArgNo, Builder); } namespace { struct BlockLayoutChunk { uint64_t OffsetInBits; const BlockDecl::Capture *Capture; }; bool operator<(const BlockLayoutChunk &l, const BlockLayoutChunk &r) { return l.OffsetInBits < r.OffsetInBits; } } void CGDebugInfo::EmitDeclareOfBlockLiteralArgVariable(const CGBlockInfo &block, llvm::Value *Arg, llvm::Value *LocalAddr, CGBuilderTy &Builder) { assert(DebugKind >= CodeGenOptions::LimitedDebugInfo); ASTContext &C = CGM.getContext(); const BlockDecl *blockDecl = block.getBlockDecl(); // Collect some general information about the block's location. SourceLocation loc = blockDecl->getCaretLocation(); llvm::DIFile tunit = getOrCreateFile(loc); unsigned line = getLineNumber(loc); unsigned column = getColumnNumber(loc); // Build the debug-info type for the block literal. getContextDescriptor(cast(blockDecl->getDeclContext())); const llvm::StructLayout *blockLayout = CGM.getDataLayout().getStructLayout(block.StructureType); SmallVector fields; fields.push_back(createFieldType("__isa", C.VoidPtrTy, 0, loc, AS_public, blockLayout->getElementOffsetInBits(0), tunit, tunit)); fields.push_back(createFieldType("__flags", C.IntTy, 0, loc, AS_public, blockLayout->getElementOffsetInBits(1), tunit, tunit)); fields.push_back(createFieldType("__reserved", C.IntTy, 0, loc, AS_public, blockLayout->getElementOffsetInBits(2), tunit, tunit)); fields.push_back(createFieldType("__FuncPtr", C.VoidPtrTy, 0, loc, AS_public, blockLayout->getElementOffsetInBits(3), tunit, tunit)); fields.push_back(createFieldType("__descriptor", C.getPointerType(block.NeedsCopyDispose ? C.getBlockDescriptorExtendedType() : C.getBlockDescriptorType()), 0, loc, AS_public, blockLayout->getElementOffsetInBits(4), tunit, tunit)); // We want to sort the captures by offset, not because DWARF // requires this, but because we're paranoid about debuggers. SmallVector chunks; // 'this' capture. if (blockDecl->capturesCXXThis()) { BlockLayoutChunk chunk; chunk.OffsetInBits = blockLayout->getElementOffsetInBits(block.CXXThisIndex); chunk.Capture = 0; chunks.push_back(chunk); } // Variable captures. for (BlockDecl::capture_const_iterator i = blockDecl->capture_begin(), e = blockDecl->capture_end(); i != e; ++i) { const BlockDecl::Capture &capture = *i; const VarDecl *variable = capture.getVariable(); const CGBlockInfo::Capture &captureInfo = block.getCapture(variable); // Ignore constant captures. if (captureInfo.isConstant()) continue; BlockLayoutChunk chunk; chunk.OffsetInBits = blockLayout->getElementOffsetInBits(captureInfo.getIndex()); chunk.Capture = &capture; chunks.push_back(chunk); } // Sort by offset. llvm::array_pod_sort(chunks.begin(), chunks.end()); for (SmallVectorImpl::iterator i = chunks.begin(), e = chunks.end(); i != e; ++i) { uint64_t offsetInBits = i->OffsetInBits; const BlockDecl::Capture *capture = i->Capture; // If we have a null capture, this must be the C++ 'this' capture. if (!capture) { const CXXMethodDecl *method = cast(blockDecl->getNonClosureContext()); QualType type = method->getThisType(C); fields.push_back(createFieldType("this", type, 0, loc, AS_public, offsetInBits, tunit, tunit)); continue; } const VarDecl *variable = capture->getVariable(); StringRef name = variable->getName(); llvm::DIType fieldType; if (capture->isByRef()) { std::pair ptrInfo = C.getTypeInfo(C.VoidPtrTy); // FIXME: this creates a second copy of this type! uint64_t xoffset; fieldType = EmitTypeForVarWithBlocksAttr(variable, &xoffset); fieldType = DBuilder.createPointerType(fieldType, ptrInfo.first); fieldType = DBuilder.createMemberType(tunit, name, tunit, line, ptrInfo.first, ptrInfo.second, offsetInBits, 0, fieldType); } else { fieldType = createFieldType(name, variable->getType(), 0, loc, AS_public, offsetInBits, tunit, tunit); } fields.push_back(fieldType); } SmallString<36> typeName; llvm::raw_svector_ostream(typeName) << "__block_literal_" << CGM.getUniqueBlockCount(); llvm::DIArray fieldsArray = DBuilder.getOrCreateArray(fields); llvm::DIType type = DBuilder.createStructType(tunit, typeName.str(), tunit, line, CGM.getContext().toBits(block.BlockSize), CGM.getContext().toBits(block.BlockAlign), 0, llvm::DIType(), fieldsArray); type = DBuilder.createPointerType(type, CGM.PointerWidthInBits); // Get overall information about the block. unsigned flags = llvm::DIDescriptor::FlagArtificial; llvm::MDNode *scope = LexicalBlockStack.back(); // Create the descriptor for the parameter. llvm::DIVariable debugVar = DBuilder.createLocalVariable(llvm::dwarf::DW_TAG_arg_variable, llvm::DIDescriptor(scope), Arg->getName(), tunit, line, type, CGM.getLangOpts().Optimize, flags, cast(Arg)->getArgNo() + 1); if (LocalAddr) { // Insert an llvm.dbg.value into the current block. llvm::Instruction *DbgVal = DBuilder.insertDbgValueIntrinsic(LocalAddr, 0, debugVar, Builder.GetInsertBlock()); DbgVal->setDebugLoc(llvm::DebugLoc::get(line, column, scope)); } // Insert an llvm.dbg.declare into the current block. llvm::Instruction *DbgDecl = DBuilder.insertDeclare(Arg, debugVar, Builder.GetInsertBlock()); DbgDecl->setDebugLoc(llvm::DebugLoc::get(line, column, scope)); } /// If D is an out-of-class definition of a static data member of a class, find /// its corresponding in-class declaration. llvm::DIDerivedType CGDebugInfo::getOrCreateStaticDataMemberDeclarationOrNull(const VarDecl *D) { if (!D->isStaticDataMember()) return llvm::DIDerivedType(); llvm::DenseMap::iterator MI = StaticDataMemberCache.find(D->getCanonicalDecl()); if (MI != StaticDataMemberCache.end()) { assert(MI->second && "Static data member declaration should still exist"); return llvm::DIDerivedType(cast(MI->second)); } // If the member wasn't found in the cache, lazily construct and add it to the // type (used when a limited form of the type is emitted). llvm::DICompositeType Ctxt( getContextDescriptor(cast(D->getDeclContext()))); llvm::DIDerivedType T = CreateRecordStaticField(D, Ctxt); Ctxt.addMember(T); return T; } /// EmitGlobalVariable - Emit information about a global variable. void CGDebugInfo::EmitGlobalVariable(llvm::GlobalVariable *Var, const VarDecl *D) { assert(DebugKind >= CodeGenOptions::LimitedDebugInfo); // Create global variable debug descriptor. llvm::DIFile Unit = getOrCreateFile(D->getLocation()); unsigned LineNo = getLineNumber(D->getLocation()); setLocation(D->getLocation()); QualType T = D->getType(); if (T->isIncompleteArrayType()) { // CodeGen turns int[] into int[1] so we'll do the same here. llvm::APInt ConstVal(32, 1); QualType ET = CGM.getContext().getAsArrayType(T)->getElementType(); T = CGM.getContext().getConstantArrayType(ET, ConstVal, ArrayType::Normal, 0); } StringRef DeclName = D->getName(); StringRef LinkageName; if (D->getDeclContext() && !isa(D->getDeclContext()) && !isa(D->getDeclContext())) LinkageName = Var->getName(); if (LinkageName == DeclName) LinkageName = StringRef(); llvm::DIDescriptor DContext = getContextDescriptor(dyn_cast(D->getDeclContext())); llvm::DIGlobalVariable GV = DBuilder.createStaticVariable( DContext, DeclName, LinkageName, Unit, LineNo, getOrCreateType(T, Unit), Var->hasInternalLinkage(), Var, getOrCreateStaticDataMemberDeclarationOrNull(D)); DeclCache.insert(std::make_pair(D->getCanonicalDecl(), llvm::WeakVH(GV))); } /// EmitGlobalVariable - Emit information about an objective-c interface. void CGDebugInfo::EmitGlobalVariable(llvm::GlobalVariable *Var, ObjCInterfaceDecl *ID) { assert(DebugKind >= CodeGenOptions::LimitedDebugInfo); // Create global variable debug descriptor. llvm::DIFile Unit = getOrCreateFile(ID->getLocation()); unsigned LineNo = getLineNumber(ID->getLocation()); StringRef Name = ID->getName(); QualType T = CGM.getContext().getObjCInterfaceType(ID); if (T->isIncompleteArrayType()) { // CodeGen turns int[] into int[1] so we'll do the same here. llvm::APInt ConstVal(32, 1); QualType ET = CGM.getContext().getAsArrayType(T)->getElementType(); T = CGM.getContext().getConstantArrayType(ET, ConstVal, ArrayType::Normal, 0); } DBuilder.createGlobalVariable(Name, Unit, LineNo, getOrCreateType(T, Unit), Var->hasInternalLinkage(), Var); } /// EmitGlobalVariable - Emit global variable's debug info. void CGDebugInfo::EmitGlobalVariable(const ValueDecl *VD, llvm::Constant *Init) { assert(DebugKind >= CodeGenOptions::LimitedDebugInfo); // Create the descriptor for the variable. llvm::DIFile Unit = getOrCreateFile(VD->getLocation()); StringRef Name = VD->getName(); llvm::DIType Ty = getOrCreateType(VD->getType(), Unit); if (const EnumConstantDecl *ECD = dyn_cast(VD)) { const EnumDecl *ED = cast(ECD->getDeclContext()); assert(isa(ED->getTypeForDecl()) && "Enum without EnumType?"); Ty = getOrCreateType(QualType(ED->getTypeForDecl(), 0), Unit); } // Do not use DIGlobalVariable for enums. if (Ty.getTag() == llvm::dwarf::DW_TAG_enumeration_type) return; llvm::DIGlobalVariable GV = DBuilder.createStaticVariable( Unit, Name, Name, Unit, getLineNumber(VD->getLocation()), Ty, true, Init, getOrCreateStaticDataMemberDeclarationOrNull(cast(VD))); DeclCache.insert(std::make_pair(VD->getCanonicalDecl(), llvm::WeakVH(GV))); } llvm::DIScope CGDebugInfo::getCurrentContextDescriptor(const Decl *D) { if (!LexicalBlockStack.empty()) return llvm::DIScope(LexicalBlockStack.back()); return getContextDescriptor(D); } void CGDebugInfo::EmitUsingDirective(const UsingDirectiveDecl &UD) { if (CGM.getCodeGenOpts().getDebugInfo() < CodeGenOptions::LimitedDebugInfo) return; DBuilder.createImportedModule( getCurrentContextDescriptor(cast(UD.getDeclContext())), getOrCreateNameSpace(UD.getNominatedNamespace()), getLineNumber(UD.getLocation())); } void CGDebugInfo::EmitUsingDecl(const UsingDecl &UD) { if (CGM.getCodeGenOpts().getDebugInfo() < CodeGenOptions::LimitedDebugInfo) return; assert(UD.shadow_size() && "We shouldn't be codegening an invalid UsingDecl containing no decls"); // Emitting one decl is sufficient - debuggers can detect that this is an // overloaded name & provide lookup for all the overloads. const UsingShadowDecl &USD = **UD.shadow_begin(); if (llvm::DIDescriptor Target = getDeclarationOrDefinition(USD.getUnderlyingDecl())) DBuilder.createImportedDeclaration( getCurrentContextDescriptor(cast(USD.getDeclContext())), Target, getLineNumber(USD.getLocation())); } llvm::DIImportedEntity CGDebugInfo::EmitNamespaceAlias(const NamespaceAliasDecl &NA) { if (CGM.getCodeGenOpts().getDebugInfo() < CodeGenOptions::LimitedDebugInfo) return llvm::DIImportedEntity(0); llvm::WeakVH &VH = NamespaceAliasCache[&NA]; if (VH) return llvm::DIImportedEntity(cast(VH)); llvm::DIImportedEntity R(0); if (const NamespaceAliasDecl *Underlying = dyn_cast(NA.getAliasedNamespace())) // This could cache & dedup here rather than relying on metadata deduping. R = DBuilder.createImportedModule( getCurrentContextDescriptor(cast(NA.getDeclContext())), EmitNamespaceAlias(*Underlying), getLineNumber(NA.getLocation()), NA.getName()); else R = DBuilder.createImportedModule( getCurrentContextDescriptor(cast(NA.getDeclContext())), getOrCreateNameSpace(cast(NA.getAliasedNamespace())), getLineNumber(NA.getLocation()), NA.getName()); VH = R; return R; } /// getOrCreateNamesSpace - Return namespace descriptor for the given /// namespace decl. llvm::DINameSpace CGDebugInfo::getOrCreateNameSpace(const NamespaceDecl *NSDecl) { NSDecl = NSDecl->getCanonicalDecl(); llvm::DenseMap::iterator I = NameSpaceCache.find(NSDecl); if (I != NameSpaceCache.end()) return llvm::DINameSpace(cast(I->second)); unsigned LineNo = getLineNumber(NSDecl->getLocation()); llvm::DIFile FileD = getOrCreateFile(NSDecl->getLocation()); llvm::DIDescriptor Context = getContextDescriptor(dyn_cast(NSDecl->getDeclContext())); llvm::DINameSpace NS = DBuilder.createNameSpace(Context, NSDecl->getName(), FileD, LineNo); NameSpaceCache[NSDecl] = llvm::WeakVH(NS); return NS; } void CGDebugInfo::finalize() { for (std::vector >::const_iterator VI = ReplaceMap.begin(), VE = ReplaceMap.end(); VI != VE; ++VI) { llvm::DIType Ty, RepTy; // Verify that the debug info still exists. if (llvm::Value *V = VI->second) Ty = llvm::DIType(cast(V)); llvm::DenseMap::iterator it = TypeCache.find(VI->first); if (it != TypeCache.end()) { // Verify that the debug info still exists. if (llvm::Value *V = it->second) RepTy = llvm::DIType(cast(V)); } if (Ty && Ty.isForwardDecl() && RepTy) Ty.replaceAllUsesWith(RepTy); } // We keep our own list of retained types, because we need to look // up the final type in the type cache. for (std::vector::const_iterator RI = RetainedTypes.begin(), RE = RetainedTypes.end(); RI != RE; ++RI) DBuilder.retainType(llvm::DIType(cast(TypeCache[*RI]))); DBuilder.finalize(); } Index: user/ae/inet6/contrib/llvm/tools/clang =================================================================== --- user/ae/inet6/contrib/llvm/tools/clang (revision 271452) +++ user/ae/inet6/contrib/llvm/tools/clang (revision 271453) Property changes on: user/ae/inet6/contrib/llvm/tools/clang ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/contrib/llvm/tools/clang:r271428-271452 Index: user/ae/inet6/contrib/llvm =================================================================== --- user/ae/inet6/contrib/llvm (revision 271452) +++ user/ae/inet6/contrib/llvm (revision 271453) Property changes on: user/ae/inet6/contrib/llvm ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/contrib/llvm:r271428-271452 Index: user/ae/inet6/etc/motd =================================================================== --- user/ae/inet6/etc/motd (revision 271452) +++ user/ae/inet6/etc/motd (revision 271453) @@ -1,25 +1,21 @@ FreeBSD ?.?.? (UNKNOWN) -Welcome to FreeBSD! +Welcome to FreeBSD! Handy technical support resources: -Before seeking technical support, please use the following resources: +Security advisories and errata: https://www.FreeBSD.org/releases/ +Handbook: https://www.FreeBSD.org/handbook/ +FAQ: https://www.FreeBSD.org/faq/ +Mailing list: https://lists.FreeBSD.org/mailman/listinfo/freebsd-questions/ +Forums: https://forums.FreeBSD.org/ -o Security advisories and updated errata information for all releases are - at http://www.FreeBSD.org/releases/ - always consult the ERRATA section - for your release first as it's updated frequently. +Documents installed with the system are in the /usr/local/share/doc/freebsd/ +directory, or can be installed later with: pkg install en-freebsd-doc +For other languages, replace "en" with a language code like de or fr. -o The Handbook and FAQ documents are at http://www.FreeBSD.org/ and, - along with the mailing lists, can be searched by going to - http://www.FreeBSD.org/search/. If the doc package has been installed - (or fetched via pkg install lang-freebsd-doc, where lang is the - 2-letter language code, e.g. en), they are also available formatted - in /usr/local/share/doc/freebsd. +Show the version of FreeBSD installed: uname -a +Please include that output and any error messages when posting questions. -If you still have a question or problem, please take the output of -`uname -a', along with any relevant error messages, and email it -as a question to the questions@FreeBSD.org mailing list. If you are -unfamiliar with FreeBSD's directory layout, please refer to the hier(7) -manual page. If you are not familiar with manual pages, type `man man'. +Introduction to manual pages: man man +FreeBSD directory layout: man hier Edit /etc/motd to change this login announcement. - Index: user/ae/inet6/etc =================================================================== --- user/ae/inet6/etc (revision 271452) +++ user/ae/inet6/etc (revision 271453) Property changes on: user/ae/inet6/etc ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/etc:r271428-271452 Index: user/ae/inet6/share/examples/bhyve/vmrun.sh =================================================================== --- user/ae/inet6/share/examples/bhyve/vmrun.sh (revision 271452) +++ user/ae/inet6/share/examples/bhyve/vmrun.sh (revision 271453) @@ -1,254 +1,254 @@ #!/bin/sh # # Copyright (c) 2013 NetApp, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. # # $FreeBSD$ # LOADER=/usr/sbin/bhyveload BHYVECTL=/usr/sbin/bhyvectl FBSDRUN=/usr/sbin/bhyve DEFAULT_MEMSIZE=512M DEFAULT_CPUS=2 DEFAULT_TAPDEV=tap0 DEFAULT_CONSOLE=stdio DEFAULT_VIRTIO_DISK="./diskdev" DEFAULT_ISOFILE="./release.iso" usage() { echo "Usage: vmrun.sh [-ahi] [-c ] [-C ] [-d ]" echo " [-e ] [-g ] [-H ]" echo " [-I ] [-m ]" echo " [-t ] " echo "" echo " -h: display this help message" echo " -a: force memory mapped local APIC access" echo " -c: number of virtual cpus (default is ${DEFAULT_CPUS})" echo " -C: console device (default is ${DEFAULT_CONSOLE})" echo " -d: virtio diskdev file (default is ${DEFAULT_VIRTIO_DISK})" echo " -e: set FreeBSD loader environment variable" echo " -g: listen for connection from kgdb at " echo " -H: host filesystem to export to the loader" echo " -i: force boot of the Installation CDROM image" echo " -I: Installation CDROM image location (default is ${DEFAULT_ISOFILE})" echo " -m: memory size (default is ${DEFAULT_MEMSIZE})" echo " -t: tap device for virtio-net (default is $DEFAULT_TAPDEV)" echo "" echo " This script needs to be executed with superuser privileges" echo "" exit 1 } if [ `id -u` -ne 0 ]; then usage fi kldstat -n vmm > /dev/null 2>&1 if [ $? -ne 0 ]; then echo "vmm.ko is not loaded!" exit 1 fi force_install=0 isofile=${DEFAULT_ISOFILE} memsize=${DEFAULT_MEMSIZE} console=${DEFAULT_CONSOLE} cpus=${DEFAULT_CPUS} tap_total=0 disk_total=0 apic_opt="" gdbport=0 loader_opt="" while getopts ac:C:d:e:g:hH:iI:m:t: c ; do case $c in a) apic_opt="-a" ;; c) cpus=${OPTARG} ;; C) console=${OPTARG} ;; d) eval "disk_dev${disk_total}=\"${OPTARG}\"" disk_total=$(($disk_total + 1)) ;; e) loader_opt="${loader_opt} -e ${OPTARG}" ;; g) gdbport=${OPTARG} ;; H) host_base=`realpath ${OPTARG}` ;; i) force_install=1 ;; I) isofile=${OPTARG} ;; m) memsize=${OPTARG} ;; t) eval "tap_dev${tap_total}=\"${OPTARG}\"" tap_total=$(($tap_total + 1)) ;; *) usage ;; esac done if [ $tap_total -eq 0 ] ; then tap_total=1 tap_dev0="${DEFAULT_TAPDEV}" fi if [ $disk_total -eq 0 ] ; then disk_total=1 disk_dev0="${DEFAULT_VIRTIO_DISK}" fi shift $((${OPTIND} - 1)) if [ $# -ne 1 ]; then usage fi vmname="$1" if [ -n "${host_base}" ]; then loader_opt="${loader_opt} -h ${host_base}" fi make_and_check_diskdev() { local virtio_diskdev="$1" # Create the virtio diskdev file if needed if [ ! -f ${virtio_diskdev} ]; then echo "virtio disk device file \"${virtio_diskdev}\" does not exist." echo "Creating it ..." truncate -s 8G ${virtio_diskdev} > /dev/null fi if [ ! -r ${virtio_diskdev} ]; then echo "virtio disk device file \"${virtio_diskdev}\" is not readable" exit 1 fi if [ ! -w ${virtio_diskdev} ]; then echo "virtio disk device file \"${virtio_diskdev}\" is not writable" exit 1 fi } echo "Launching virtual machine \"$vmname\" ..." virtio_diskdev="$disk_dev0" ${BHYVECTL} --vm=${vmname} --destroy > /dev/null 2>&1 while [ 1 ]; do file -s ${virtio_diskdev} | grep "boot sector" > /dev/null rc=$? if [ $rc -ne 0 ]; then file -s ${virtio_diskdev} | grep ": Unix Fast File sys" > /dev/null rc=$? fi if [ $rc -ne 0 ]; then need_install=1 else need_install=0 fi if [ $force_install -eq 1 -o $need_install -eq 1 ]; then if [ ! -r ${isofile} ]; then echo -n "Installation CDROM image \"${isofile}\" " echo "is not readable" exit 1 fi BOOTDISK=${isofile} - installer_opt="-s 31:0,virtio-blk,${BOOTDISK}" + installer_opt="-s 31:0,ahci-cd,${BOOTDISK}" else BOOTDISK=${virtio_diskdev} installer_opt="" fi ${LOADER} -c ${console} -m ${memsize} -d ${BOOTDISK} ${loader_opt} \ ${vmname} if [ $? -ne 0 ]; then break fi # # Build up args for additional tap and disk devices now. # nextslot=2 # slot 0 is hostbridge, slot 1 is lpc devargs="" # accumulate disk/tap args here i=0 while [ $i -lt $tap_total ] ; do eval "tapname=\$tap_dev${i}" devargs="$devargs -s $nextslot:0,virtio-net,${tapname} " nextslot=$(($nextslot + 1)) i=$(($i + 1)) done i=0 while [ $i -lt $disk_total ] ; do eval "disk=\$disk_dev${i}" make_and_check_diskdev "${disk}" devargs="$devargs -s $nextslot:0,virtio-blk,${disk} " nextslot=$(($nextslot + 1)) i=$(($i + 1)) done ${FBSDRUN} -c ${cpus} -m ${memsize} ${apic_opt} -A -H -P \ -g ${gdbport} \ -s 0:0,hostbridge \ -s 1:0,lpc \ ${devargs} \ -l com1,${console} \ ${installer_opt} \ ${vmname} # bhyve returns the following status codes: # 0 - VM has been reset # 1 - VM has been powered off # 2 - VM has been halted # 3 - VM generated a triple fault # all other non-zero status codes are errors # if [ $? -ne 0 ]; then break fi done exit 99 Index: user/ae/inet6/share/man/man4/cxgbe.4 =================================================================== --- user/ae/inet6/share/man/man4/cxgbe.4 (revision 271452) +++ user/ae/inet6/share/man/man4/cxgbe.4 (revision 271453) @@ -1,317 +1,328 @@ .\" Copyright (c) 2011-2014, Chelsio Inc .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions are met: .\" .\" 1. Redistributions of source code must retain the above copyright notice, .\" this list of conditions and the following disclaimer. .\" .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" 3. Neither the name of the Chelsio Inc nor the names of its .\" contributors may be used to endorse or promote products derived from .\" this software without specific prior written permission. .\" .\" THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" .\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE .\" LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR .\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF .\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS .\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN .\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) .\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE .\" POSSIBILITY OF SUCH DAMAGE. .\" .\" * Other names and brands may be claimed as the property of others. .\" .\" $FreeBSD$ .\" .Dd March 20, 2014 .Dt CXGBE 4 .Os .Sh NAME .Nm cxgbe .Nd "Chelsio T4 and T5 based 40Gb, 10Gb, and 1Gb Ethernet adapter driver" .Sh SYNOPSIS To compile this driver into the kernel, place the following lines in your kernel configuration file: .Bd -ragged -offset indent .Cd "device cxgbe" .Ed .Pp To load the driver as a module at boot time, place the following lines in .Xr loader.conf 5 : .Bd -literal -offset indent t4fw_cfg_load="YES" t5fw_cfg_load="YES" if_cxgbe_load="YES" .Ed .Sh DESCRIPTION The .Nm driver provides support for PCI Express Ethernet adapters based on the Chelsio Terminator 4 and Terminator 5 ASICs (T4 and T5). The driver supports Jumbo Frames, Transmit/Receive checksum offload, TCP segmentation offload (TSO), Large Receive Offload (LRO), VLAN tag insertion/extraction, VLAN checksum offload, VLAN TSO, and Receive Side Steering (RSS). For further hardware information and questions related to hardware requirements, see .Pa http://www.chelsio.com/ . .Pp Note that ports of T5 cards are named cxl and attach to a t5nex parent device (in contrast to ports named cxgbe that attach to a t4nex parent for a T4 card). Loader tunables with the hw.cxgbe prefix apply to both T4 and T5 cards. The sysctl MIBs are at dev.t5nex and dev.cxl for T5 cards and at dev.t4nex and dev.cxgbe for T4 cards. .Pp For more information on configuring this device, see .Xr ifconfig 8 . .Sh HARDWARE The .Nm driver supports 40Gb, 10Gb and 1Gb Ethernet adapters based on the T5 ASIC (ports will be named cxl): .Pp .Bl -bullet -compact .It Chelsio T580-CR .It Chelsio T580-LP-CR .It Chelsio T580-LP-SO-CR .It Chelsio T560-CR .It Chelsio T540-CR .It Chelsio T540-LP-CR .It Chelsio T522-CR .It Chelsio T520-LL-CR .It Chelsio T520-CR .It Chelsio T520-SO .It Chelsio T520-BT .It Chelsio T504-BT .El .Pp The .Nm driver supports 10Gb and 1Gb Ethernet adapters based on the T4 ASIC: .Pp .Bl -bullet -compact .It Chelsio T420-CR .It Chelsio T422-CR .It Chelsio T440-CR .It Chelsio T420-BCH .It Chelsio T440-BCH .It Chelsio T440-CH .It Chelsio T420-SO .It Chelsio T420-CX .It Chelsio T420-BT .It Chelsio T404-BT .El .Sh LOADER TUNABLES Tunables can be set at the .Xr loader 8 prompt before booting the kernel or stored in .Xr loader.conf 5 . .Bl -tag -width indent .It Va hw.cxgbe.ntxq10g The number of tx queues to use for a 10Gb or 40Gb port. The default is 16 or the number of CPU cores in the system, whichever is less. .It Va hw.cxgbe.nrxq10g The number of rx queues to use for a 10Gb or 40Gb port. The default is 8 or the number of CPU cores in the system, whichever is less. .It Va hw.cxgbe.ntxq1g The number of tx queues to use for a 1Gb port. The default is 4 or the number of CPU cores in the system, whichever is less. .It Va hw.cxgbe.nrxq1g The number of rx queues to use for a 1Gb port. The default is 2 or the number of CPU cores in the system, whichever is less. .It Va hw.cxgbe.nofldtxq10g The number of TOE tx queues to use for a 10Gb or 40Gb port. The default is 8 or the number of CPU cores in the system, whichever is less. .It Va hw.cxgbe.nofldrxq10g The number of TOE rx queues to use for a 10Gb or 40Gb port. The default is 2 or the number of CPU cores in the system, whichever is less. .It Va hw.cxgbe.nofldtxq1g The number of TOE tx queues to use for a 1Gb port. The default is 2 or the number of CPU cores in the system, whichever is less. .It Va hw.cxgbe.nofldrxq1g The number of TOE rx queues to use for a 1Gb port. The default is 1. .It Va hw.cxgbe.holdoff_timer_idx_10G .It Va hw.cxgbe.holdoff_timer_idx_1G The timer index value to use to delay interrupts. The holdoff timer list has the values 1, 5, 10, 50, 100, and 200 by default (all values are in microseconds) and the index selects a value from this list. The default value is 1 which means the timer value is 5us. Different interfaces can be assigned different values at any time via the dev.cxgbe.X.holdoff_tmr_idx or dev.cxl.X.holdoff_tmr_idx sysctl. .It Va hw.cxgbe.holdoff_pktc_idx_10G .It Va hw.cxgbe.holdoff_pktc_idx_1G The packet-count index value to use to delay interrupts. The packet-count list has the values 1, 8, 16, and 32 by default and the index selects a value from this list. The default value is -1 which means packet counting is disabled and interrupts are generated based solely on the holdoff timer value. Different interfaces can be assigned different values via the dev.cxgbe.X.holdoff_pktc_idx or dev.cxl.X.holdoff_pktc_idx sysctl. This sysctl works only when the interface has never been marked up (as done by ifconfig up). .It Va hw.cxgbe.qsize_txq The size, in number of entries, of the descriptor ring used for a tx queue. A buf_ring of the same size is also allocated for additional software queuing. See .Xr ifnet 9 . The default value is 1024. Different interfaces can be assigned different values via the dev.cxgbe.X.qsize_txq sysctl or dev.cxl.X.qsize_txq sysctl. This sysctl works only when the interface has never been marked up (as done by ifconfig up). .It Va hw.cxgbe.qsize_rxq The size, in number of entries, of the descriptor ring used for an rx queue. The default value is 1024. Different interfaces can be assigned different values via the dev.cxgbe.X.qsize_rxq or dev.cxl.X.qsize_rxq sysctl. This sysctl works only when the interface has never been marked up (as done by ifconfig up). .It Va hw.cxgbe.interrupt_types The interrupt types that the driver is allowed to use. Bit 0 represents INTx (line interrupts), bit 1 MSI, bit 2 MSI-X. The default is 7 (all allowed). The driver will select the best possible type out of the allowed types by itself. .It Va hw.cxgbe.fw_install 0 prohibits the driver from installing a firmware on the card. 1 allows the driver to install a new firmware if internal driver heuristics indicate that the new firmware is preferable to the one already on the card. 2 instructs the driver to always install the new firmware on the card as long as it is compatible with the driver and is a different version than the one already on the card. The default is 1. .It Va hw.cxgbe.fl_pktshift The number of bytes of padding inserted before the begining of an Ethernet frame in the receive buffer. The default value of 2 ensures that the Ethernet payload (usually the IP header) is at a 4 byte aligned address. 0-7 are all valid values. .It Va hw.cxgbe.fl_pad A non-zero value ensures that writes from the hardware to a receive buffer are padded up to the specified boundary. The default is -1 which lets the driver pick a pad boundary. 0 disables trailer padding completely. .It Va hw.cxgbe.cong_drop Controls the hardware response to congestion. -1 disables congestion feedback and is not recommended. 0 instructs the hardware to backpressure its pipeline on congestion. -This usually results in the port emitting pause frames. +This usually results in the port emitting PAUSE frames. 1 instructs the hardware to drop frames destined for congested queues. +.It Va hw.cxgbe.pause_settings +PAUSE frame settings. +Bit 0 is rx_pause, bit 1 is tx_pause. +rx_pause = 1 instructs the hardware to heed incoming PAUSE frames, 0 instructs +it to ignore them. +tx_pause = 1 allows the hardware to emit PAUSE frames when its receive FIFO +reaches a high threshold, 0 prohibits the hardware from emitting PAUSE frames. +The default is 3 (both rx_pause and tx_pause = 1). +This tunable establishes the default PAUSE settings for all ports. +Settings can be displayed and controlled on a per-port basis via the +dev.cxgbe.X.pause_settings (dev.cxl.X.pause_settings for T5 cards) sysctl. .It Va hw.cxgbe.buffer_packing Allow the hardware to deliver multiple frames in the same receive buffer opportunistically. The default is -1 which lets the driver decide. 0 or 1 explicitly disable or enable this feature. .It Va hw.cxgbe.allow_mbufs_in_cluster 1 allows the driver to lay down one or more mbufs within the receive buffer opportunistically. This is the default. 0 prohibits the driver from doing so. .It Va hw.cxgbe.largest_rx_cluster .It Va hw.cxgbe.safest_rx_cluster Sizes of rx clusters. Each of these must be set to one of the sizes available (usually 2048, 4096, 9216, and 16384) and largest_rx_cluster must be greater than or equal to safest_rx_cluster. The defaults are 16384 and 4096 respectively. The driver will never attempt to allocate a receive buffer larger than largest_rx_cluster and will fall back to allocating buffers of safest_rx_cluster size if an allocation larger than safest_rx_cluster fails. Note that largest_rx_cluster merely establishes a ceiling -- the driver is allowed to allocate buffers of smaller sizes. .It Va hw.cxgbe.config_file Select a pre-packaged device configuration file. A configuration file contains a recipe for partitioning and configuring the hardware resources on the card. This tunable is for specialized applications only and should not be used in normal operation. The configuration profile currently in use is available in the dev.t4nex.X.cf and dev.t4nex.X.cfcsum (dev.t5nex for T5 cards) sysctls. .It Va hw.cxgbe.linkcaps_allowed .It Va hw.cxgbe.niccaps_allowed .It Va hw.cxgbe.toecaps_allowed .It Va hw.cxgbe.rdmacaps_allowed .It Va hw.cxgbe.iscsicaps_allowed .It Va hw.cxgbe.fcoecaps_allowed Disallowing capabilities provides a hint to the driver and firmware to not reserve hardware resources for that feature. Each of these is a bit field with a bit for each sub-capability within the capability. This tunable is for specialized applications only and should not be used in normal operation. The capabilities for which hardware resources have been reserved are listed in dev.t4nex.X.*caps or dev.t5nex.X.*caps sysctls. .El .Sh SUPPORT For general information and support, go to the Chelsio support website at: .Pa http://www.chelsio.com/ . .Pp If an issue is identified with this driver with a supported adapter, email all the specific information related to the issue to .Aq Mt support@chelsio.com . .Sh SEE ALSO .Xr altq 4 , .Xr arp 4 , .Xr cxgb 4 , .Xr netintro 4 , .Xr ng_ether 4 , .Xr ifconfig 8 .Sh HISTORY The .Nm device driver first appeared in .Fx 9.0 . Support for T5 cards first appeared in .Fx 9.2 and .Fx 10.0 . .Sh AUTHORS .An -nosplit The .Nm driver was written by .An Navdeep Parhar Aq Mt np@FreeBSD.org . Index: user/ae/inet6/share/man/man4 =================================================================== --- user/ae/inet6/share/man/man4 (revision 271452) +++ user/ae/inet6/share/man/man4 (revision 271453) Property changes on: user/ae/inet6/share/man/man4 ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/share/man/man4:r271428-271452 Index: user/ae/inet6/share/man/man9/ifnet.9 =================================================================== --- user/ae/inet6/share/man/man9/ifnet.9 (revision 271452) +++ user/ae/inet6/share/man/man9/ifnet.9 (revision 271453) @@ -1,1517 +1,1529 @@ .\" -*- Nroff -*- .\" Copyright 1996, 1997 Massachusetts Institute of Technology .\" .\" Permission to use, copy, modify, and distribute this software and .\" its documentation for any purpose and without fee is hereby .\" granted, provided that both the above copyright notice and this .\" permission notice appear in all copies, that both the above .\" copyright notice and this permission notice appear in all .\" supporting documentation, and that the name of M.I.T. not be used .\" in advertising or publicity pertaining to distribution of the .\" software without specific, written prior permission. M.I.T. makes .\" no representations about the suitability of this software for any .\" purpose. It is provided "as is" without express or implied .\" warranty. .\" .\" THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS .\" ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, .\" INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF .\" MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT .\" SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, .\" SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT .\" LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF .\" USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND .\" ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, .\" OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT .\" OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" $FreeBSD$ .\" .Dd July 29, 2014 .Dt IFNET 9 .Os .Sh NAME .Nm ifnet , .Nm ifaddr , .Nm ifqueue , .Nm if_data .Nd kernel interfaces for manipulating network interfaces .Sh SYNOPSIS .In sys/param.h .In sys/time.h .In sys/socket.h .In net/if.h .In net/if_var.h .In net/if_types.h .\" .Ss "Interface Manipulation Functions" .Ft "struct ifnet *" .Fn if_alloc "u_char type" .Ft void .Fn if_attach "struct ifnet *ifp" .Ft void .Fn if_detach "struct ifnet *ifp" .Ft void .Fn if_free "struct ifnet *ifp" .Ft void .Fn if_free_type "struct ifnet *ifp" "u_char type" .Ft void .Fn if_down "struct ifnet *ifp" .Ft int .Fn ifioctl "struct socket *so" "u_long cmd" "caddr_t data" "struct thread *td" .Ft int .Fn ifpromisc "struct ifnet *ifp" "int pswitch" .Ft int .Fn if_allmulti "struct ifnet *ifp" "int amswitch" .Ft "struct ifnet *" .Fn ifunit "const char *name" .Ft "struct ifnet *" .Fn ifunit_ref "const char *name" .Ft void .Fn if_up "struct ifnet *ifp" .\" .Ss "Interface Address Functions" .Ft "struct ifaddr *" .Fn ifaddr_byindex "u_short idx" .Ft "struct ifaddr *" .Fn ifa_ifwithaddr "struct sockaddr *addr" .Ft "struct ifaddr *" -.Fn ifa_ifwithdstaddr "struct sockaddr *addr" +.Fn ifa_ifwithdstaddr "struct sockaddr *addr" "int fib" .Ft "struct ifaddr *" -.Fn ifa_ifwithnet "struct sockaddr *addr" "int ignore_ptp" +.Fn ifa_ifwithnet "struct sockaddr *addr" "int ignore_ptp" "int fib" .Ft "struct ifaddr *" .Fn ifaof_ifpforaddr "struct sockaddr *addr" "struct ifnet *ifp" .Ft void .Fn ifa_ref "struct ifaddr *ifa" .Ft void .Fn ifa_free "struct ifaddr *ifa" .\" .Ss "Interface Multicast Address Functions" .Ft int .Fn if_addmulti "struct ifnet *ifp" "struct sockaddr *sa" "struct ifmultiaddr **ifmap" .Ft int .Fn if_delmulti "struct ifnet *ifp" "struct sockaddr *sa" .Ft "struct ifmultiaddr *" .Fn if_findmulti "struct ifnet *ifp" "struct sockaddr *sa" .Ss "Output queue macros" .Fn IF_DEQUEUE "struct ifqueue *ifq" "struct mbuf *m" .\" .Ss "struct ifnet Member Functions" .Ft void .Fn \*(lp*if_input\*(rp "struct ifnet *ifp" "struct mbuf *m" .Ft int .Fo \*(lp*if_output\*(rp .Fa "struct ifnet *ifp" "struct mbuf *m" .Fa "const struct sockaddr *dst" "struct route *ro" .Fc .Ft void .Fn \*(lp*if_start\*(rp "struct ifnet *ifp" .Ft int .Fn \*(lp*if_transmit\*(rp "struct ifnet *ifp" "struct mbuf *m" .Ft void .Fn \*(lp*if_qflush\*(rp "struct ifnet *ifp" .Ft int .Fn \*(lp*if_ioctl\*(rp "struct ifnet *ifp" "u_long cmd" "caddr_t data" .Ft void .Fn \*(lp*if_init\*(rp "void *if_softc" .Ft int .Fo \*(lp*if_resolvemulti\*(rp .Fa "struct ifnet *ifp" "struct sockaddr **retsa" "struct sockaddr *addr" .Fc .Ss "struct ifaddr member function" .Ft void .Fo \*(lp*ifa_rtrequest\*(rp .Fa "int cmd" "struct rtentry *rt" "struct rt_addrinfo *info" .Fc .\" .Ss "Global Variables" .Vt extern struct ifnethead ifnet ; .\" extern struct ifindex_entry *ifindex_table ; .Vt extern int if_index ; .Vt extern int ifqmaxlen ; .Sh DATA STRUCTURES The kernel mechanisms for handling network interfaces reside primarily in the .Vt ifnet , if_data , ifaddr , and .Vt ifmultiaddr structures in .In net/if.h and .In net/if_var.h and the functions named above and defined in .Pa /sys/net/if.c . Those interfaces which are intended to be used by user programs are defined in .In net/if.h ; these include the interface flags, the .Vt if_data structure, and the structures defining the appearance of interface-related messages on the .Xr route 4 routing socket and in .Xr sysctl 3 . The header file .In net/if_var.h defines the kernel-internal interfaces, including the .Vt ifnet , ifaddr , and .Vt ifmultiaddr structures and the functions which manipulate them. (A few user programs will need .In net/if_var.h because it is the prerequisite of some other header file like .In netinet/if_ether.h . Most references to those two files in particular can be replaced by .In net/ethernet.h . ) .Pp The system keeps a linked list of interfaces using the .Li TAILQ macros defined in .Xr queue 3 ; this list is headed by a .Vt "struct ifnethead" called .Va ifnet . The elements of this list are of type .Vt "struct ifnet" , and most kernel routines which manipulate interface as such accept or return pointers to these structures. Each interface structure contains an .Vt if_data structure used for statistics and information. Each interface also has a .Li TAILQ of interface addresses, described by .Vt ifaddr structures. An .Dv AF_LINK address (see .Xr link_addr 3 ) describing the link layer implemented by the interface (if any) is accessed by the .Fn ifaddr_byindex function or .Va if_addr structure. (Some trivial interfaces do not provide any link layer addresses; this structure, while still present, serves only to identify the interface name and index.) .Pp Finally, those interfaces supporting reception of multicast datagrams have a .Li TAILQ of multicast group memberships, described by .Vt ifmultiaddr structures. These memberships are reference-counted. .Pp Interfaces are also associated with an output queue, defined as a .Vt "struct ifqueue" ; this structure is used to hold packets while the interface is in the process of sending another. .Pp .Ss The Vt ifnet Ss structure The fields of .Vt "struct ifnet" are as follows: .Bl -tag -width ".Va if_capabilities" -offset indent .It Va if_softc .Pq Vt "void *" A pointer to the driver's private state block. (Initialized by driver.) .It Va if_l2com .Pq Vt "void *" A pointer to the common data for the interface's layer 2 protocol. (Initialized by .Fn if_alloc . ) .It Va if_vnet .Pq Vt "struct vnet *" A pointer to the virtual network stack instance. (Initialized by .Fn if_attach . ) .It Va if_home_vnet .Pq Vt "struct vnet *" A pointer to the parent virtual network stack, where this .Vt "struct ifnet" originates from. (Initialized by .Fn if_attach . ) .It Va if_link .Pq Fn TAILQ_ENTRY ifnet .Xr queue 3 macro glue. .It Va if_xname .Pq Vt "char *" The name of the interface, (e.g., .Dq Li fxp0 or .Dq Li lo0 ) . (Initialized by driver (usually via .Fn if_initname ) . ) .It Va if_dname .Pq Vt "const char *" The name of the driver. (Initialized by driver (usually via .Fn if_initname ) . ) .It Va if_dunit .Pq Vt int A unique number assigned to each interface managed by a particular driver. Drivers may choose to set this to .Dv IF_DUNIT_NONE if a unit number is not associated with the device. (Initialized by driver (usually via .Fn if_initname ) . ) .It Va if_refcount .Pq Vt u_int The reference count. (Initialized by .Fn if_alloc . ) .It Va if_addrhead .Pq Vt "struct ifaddrhead" The head of the .Xr queue 3 .Li TAILQ containing the list of addresses assigned to this interface. .It Va if_pcount .Pq Vt int A count of promiscuous listeners on this interface, used to reference-count the .Dv IFF_PROMISC flag. .It Va if_carp .Pq Vt "struct carp_if *" A pointer to the CARP interface structure, .Xr carp 4 . (Initialized by the driver-specific .Fn if_ioctl routine.) .It Va if_bpf .Pq Vt "struct bpf_if *" Opaque per-interface data for the packet filter, .Xr bpf 4 . (Initialized by .Fn bpf_attach . ) .It Va if_index .Pq Vt u_short A unique number assigned to each interface in sequence as it is attached. This number can be used in a .Vt "struct sockaddr_dl" to refer to a particular interface by index (see .Xr link_addr 3 ) . (Initialized by .Fn if_alloc . ) .It Va if_vlantrunk .Pq Vt struct ifvlantrunk * A pointer to 802.1Q trunk structure, .Xr vlan 4 . (Initialized by the driver-specific .Fn if_ioctl routine.) .It Va if_flags .Pq Vt int Flags describing operational parameters of this interface (see below). (Manipulated by generic code.) .It Va if_drv_flags .Pq Vt int Flags describing operational status of this interface (see below). (Manipulated by driver.) .It Va if_capabilities .Pq Vt int Flags describing the capabilities the interface supports (see below). .It Va if_capenable .Pq Vt int Flags describing the enabled capabilities of the interface (see below). .It Va if_linkmib .Pq Vt "void *" A pointer to an interface-specific MIB structure exported by .Xr ifmib 4 . (Initialized by driver.) .It Va if_linkmiblen .Pq Vt size_t The size of said structure. (Initialized by driver.) .It Va if_data .Pq Vt "struct if_data" More statistics and information; see .Sx "The if_data structure" , below. (Initialized by driver, manipulated by both driver and generic code.) .It Va if_multiaddrs .Pq Vt struct ifmultihead The head of the .Xr queue 3 .Li TAILQ containing the list of multicast addresses assigned to this interface. .It Va if_amcount .Pq Vt int A number of multicast requests on this interface, used to reference-count the .Dv IFF_ALLMULTI flag. .It Va if_addr .Pq Vt "struct ifaddr *" A pointer to the link-level interface address. (Initialized by .Fn if_alloc . ) .\" .It Va if_llsoftc .\" .Pq Vt "void *" .\" The purpose of the field is unclear. .It Va if_snd .Pq Vt "struct ifaltq" The output queue. (Manipulated by driver.) .It Va if_broadcastaddr .Pq Vt "const u_int8_t *" A link-level broadcast bytestring for protocols with variable address length. .It Va if_bridge .Pq Vt "void *" A pointer to the bridge interface structure, .Xr if_bridge 4 . (Initialized by the driver-specific .Fn if_ioctl routine.) .It Va if_label .Pq Vt "struct label *" A pointer to the MAC Framework label structure, .Xr mac 4 . (Initialized by .Fn if_alloc . ) .It Va if_afdata .Pq Vt "void *" An address family dependent data region. .It Va if_afdata_initialized .Pq Vt int Used to track the current state of address family initialization. .It Va if_afdata_lock .Pq Vt "struct rwlock" An .Xr rwlock 9 lock used to protect .Va if_afdata internals. .It Va if_linktask .Pq Vt "struct task" A .Xr taskqueue 9 task scheduled for link state change events of the interface. .It Va if_addr_lock .Pq Vt "struct rwlock" An .Xr rwlock 9 lock used to protect interface-related address lists. .It Va if_clones .Pq Fn LIST_ENTRY ifnet .Xr queue 3 macro glue for the list of clonable network interfaces. .It Va if_groups .Pq Fn TAILQ_HEAD ", ifg_list" The head of the .Xr queue 3 .Li TAILQ containing the list of groups per interface. .It Va if_pf_kif .Pq Vt "void *" A pointer to the structure used for interface abstraction by .Xr pf 4 . .It Va if_lagg .Pq Vt "void *" A pointer to the .Xr lagg 4 interface structure. .It Va if_alloctype .Pq Vt u_char The type of the interface as it was at the time of its allocation. It is used to cache the type passed to .Fn if_alloc , but unlike .Va if_type , it would not be changed by drivers. .El .Pp References to .Vt ifnet structures are gained by calling the .Fn if_ref function and released by calling the .Fn if_rele function. They are used to allow kernel code walking global interface lists to release the .Vt ifnet lock yet keep the .Vt ifnet structure stable. .Pp There are in addition a number of function pointers which the driver must initialize to complete its interface with the generic interface layer: .Bl -ohang -offset indent .It Fn if_input Pass a packet to an appropriate upper layer as determined from the link-layer header of the packet. This routine is to be called from an interrupt handler or used to emulate reception of a packet on this interface. A single function implementing .Fn if_input can be shared among multiple drivers utilizing the same link-layer framing, e.g., Ethernet. .It Fn if_output Output a packet on interface .Fa ifp , or queue it on the output queue if the interface is already active. .It Fn if_transmit Transmit a packet on an interface or queue it if the interface is in use. This function will return .Dv ENOBUFS if the devices software and hardware queues are both full. This function must be installed after .Fn if_attach to override the default implementation. This function is exposed in order to allow drivers to manage their own queues and to reduce the latency caused by a frequently gratuitous enqueue / dequeue pair to ifq. The suggested internal software queueing mechanism is buf_ring. .It Fn if_qflush Free mbufs in internally managed queues when the interface is marked down. This function must be installed after .Fn if_attach to override the default implementation. This function is exposed in order to allow drivers to manage their own queues and to reduce the latency caused by a frequently gratuitous enqueue / dequeue pair to ifq. The suggested internal software queueing mechanism is buf_ring. .It Fn if_start Start queued output on an interface. This function is exposed in order to provide for some interface classes to share a .Fn if_output among all drivers. .Fn if_start may only be called when the .Dv IFF_DRV_OACTIVE flag is not set. (Thus, .Dv IFF_DRV_OACTIVE does not literally mean that output is active, but rather that the device's internal output queue is full.) Please note that this function will soon be deprecated. .It Fn if_ioctl Process interface-related .Xr ioctl 2 requests (defined in .In sys/sockio.h ) . Preliminary processing is done by the generic routine .Fn ifioctl to check for appropriate privileges, locate the interface being manipulated, and perform certain generic operations like twiddling flags and flushing queues. See the description of .Fn ifioctl below for more information. .It Fn if_init Initialize and bring up the hardware, e.g., reset the chip and enable the receiver unit. Should mark the interface running, but not active .Dv ( IFF_DRV_RUNNING , ~IIF_DRV_OACTIVE ) . .It Fn if_resolvemulti Check the requested multicast group membership, .Fa addr , for validity, and if necessary compute a link-layer group which corresponds to that address which is returned in .Fa *retsa . Returns zero on success, or an error code on failure. .El .Ss "Interface Flags" Interface flags are used for a number of different purposes. Some flags simply indicate information about the type of interface and its capabilities; others are dynamically manipulated to reflect the current state of the interface. Flags of the former kind are marked .Aq S in this table; the latter are marked .Aq D . Flags which begin with .Dq IFF_DRV_ are stored in .Va if_drv_flags ; all other flags are stored in .Va if_flags . .Pp The macro .Dv IFF_CANTCHANGE defines the bits which cannot be set by a user program using the .Dv SIOCSIFFLAGS command to .Xr ioctl 2 ; these are indicated by an asterisk .Pq Ql * in the following listing. .Pp .Bl -tag -width ".Dv IFF_POINTOPOINT" -offset indent -compact .It Dv IFF_UP .Aq D The interface has been configured up by the user-level code. .It Dv IFF_BROADCAST .Aq S* The interface supports broadcast. .It Dv IFF_DEBUG .Aq D Used to enable/disable driver debugging code. .It Dv IFF_LOOPBACK .Aq S The interface is a loopback device. .It Dv IFF_POINTOPOINT .Aq S* The interface is point-to-point; .Dq broadcast address is actually the address of the other end. .It Dv IFF_DRV_RUNNING .Aq D* The interface has been configured and dynamic resources were successfully allocated. Probably only useful internal to the interface. .It Dv IFF_NOARP .Aq D Disable network address resolution on this interface. .It Dv IFF_PROMISC .Aq D* This interface is in promiscuous mode. .It Dv IFF_PPROMISC .Aq D This interface is in the permanently promiscuous mode (implies .Dv IFF_PROMISC ) . .It Dv IFF_ALLMULTI .Aq D* This interface is in all-multicasts mode (used by multicast routers). .It Dv IFF_DRV_OACTIVE .Aq D* The interface's hardware output queue (if any) is full; output packets are to be queued. .It Dv IFF_SIMPLEX .Aq S* The interface cannot hear its own transmissions. .It Dv IFF_LINK0 .It Dv IFF_LINK1 .It Dv IFF_LINK2 .Aq D Control flags for the link layer. (Currently abused to select among multiple physical layers on some devices.) .It Dv IFF_MULTICAST .Aq S* This interface supports multicast. .It Dv IFF_CANTCONFIG .Aq S* The interface is not configurable in a meaningful way. Primarily useful for .Dv IFT_USB interfaces registered at the interface list. .It Dv IFF_MONITOR .Aq D This interface blocks transmission of packets and discards incoming packets after BPF processing. Used to monitor network traffic but not interact with the network in question. .It Dv IFF_STATICARP .Aq D Used to enable/disable ARP requests on this interface. .It Dv IFF_DYING .Aq D* Set when the .Vt ifnet structure of this interface is being released and still has .Va if_refcount references. .It Dv IFF_RENAMING .Aq D* Set when this interface is being renamed. .El .Ss "Interface Capabilities Flags" Interface capabilities are specialized features an interface may or may not support. These capabilities are very hardware-specific and allow, when enabled, to offload specific network processing to the interface or to offer a particular feature for use by other kernel parts. .Pp It should be stressed that a capability can be completely uncontrolled (i.e., stay always enabled with no way to disable it) or allow limited control over itself (e.g., depend on another capability's state.) Such peculiarities are determined solely by the hardware and driver of a particular interface. Only the driver possesses the knowledge on whether and how the interface capabilities can be controlled. Consequently, capabilities flags in .Va if_capenable should never be modified directly by kernel code other than the interface driver. The command .Dv SIOCSIFCAP to .Fn ifioctl is the dedicated means to attempt altering .Va if_capenable on an interface. Userland code shall use .Xr ioctl 2 . .Pp The following capabilities are currently supported by the system: .Bl -tag -width ".Dv IFCAP_POLLING_NOCOUNT" -offset indent .It Dv IFCAP_RXCSUM This interface can do checksum validation on receiving data. Some interfaces do not have sufficient buffer storage to store frames above a certain MTU-size completely. The driver for the interface might disable hardware checksum validation if the MTU is set above the hardcoded limit. .It Dv IFCAP_TXCSUM This interface can do checksum calculation on transmitting data. .It Dv IFCAP_HWCSUM A shorthand for .Pq Dv IFCAP_RXCSUM | IFCAP_TXCSUM . .It Dv IFCAP_NETCONS This interface can be a network console. .It Dv IFCAP_VLAN_MTU The .Xr vlan 4 driver can operate over this interface in software tagging mode without having to decrease MTU on .Xr vlan 4 interfaces below 1500 bytes. This implies the ability of this interface to cope with frames somewhat longer than permitted by the Ethernet specification. .It Dv IFCAP_VLAN_HWTAGGING This interface can do VLAN tagging on output and demultiplex frames by their VLAN tag on input. .It Dv IFCAP_JUMBO_MTU This Ethernet interface can transmit and receive frames up to 9000 bytes long. .It Dv IFCAP_POLLING This interface supports .Xr polling 4 . See below for details. .It Dv IFCAP_VLAN_HWCSUM This interface can do checksum calculation on both transmitting and receiving data on .Xr vlan 4 interfaces (implies .Dv IFCAP_HWCSUM ) . .It Dv IFCAP_TSO4 This Ethernet interface supports TCP4 Segmentation offloading. .It Dv IFCAP_TSO6 This Ethernet interface supports TCP6 Segmentation offloading. .It Dv IFCAP_TSO A shorthand for .Pq Dv IFCAP_TSO4 | IFCAP_TSO6 . .It Dv IFCAP_TOE4 This Ethernet interface supports TCP offloading. .It Dv IFCAP_TOE6 This Ethernet interface supports TCP6 offloading. .It Dv IFCAP_TOE A shorthand for .Pq Dv IFCAP_TOE4 | IFCAP_TOE6 . .It Dv IFCAP_WOL_UCAST This Ethernet interface supports waking up on any Unicast packet. .It Dv IFCAP_WOL_MCAST This Ethernet interface supports waking up on any Multicast packet. .It Dv IFCAP_WOL_MAGIC This Ethernet interface supports waking up on any Magic packet such as those sent by .Xr wake 8 . .It Dv IFCAP_WOL A shorthand for .Pq Dv IFCAP_WOL_UCAST | IFCAP_WOL_MCAST | IFCAP_WOL_MAGIC . .It Dv IFCAP_TOE4 This Ethernet interface supports TCP4 Offload Engine. .It Dv IFCAP_TOE6 This Ethernet interface supports TCP6 Offload Engine. .It Dv IFCAP_TOE A shorthand for .Pq Dv IFCAP_TOE4 | IFCAP_TOE6 . .It Dv IFCAP_VLAN_HWFILTER This interface supports frame filtering in hardware on .Xr vlan 4 interfaces. .It Dv IFCAP_POLLING_NOCOUNT The return value for the number of processed packets should be skipped for this interface. .It Dv IFCAP_VLAN_HWTSO This interface supports TCP Segmentation offloading on .Xr vlan 4 interfaces (implies .Dv IFCAP_TSO ) . .It Dv IFCAP_LINKSTATE This Ethernet interface supports dynamic link state changes. .El .Pp The ability of advanced network interfaces to offload certain computational tasks from the host CPU to the board is limited mostly to TCP/IP. Therefore a separate field associated with an interface (see .Va ifnet.if_data.ifi_hwassist below) keeps a detailed description of its enabled capabilities specific to TCP/IP processing. The TCP/IP module consults the field to see which tasks can be done on an .Em outgoing packet by the interface. The flags defined for that field are a superset of those for .Va mbuf.m_pkthdr.csum_flags , namely: .Bl -tag -width ".Dv CSUM_FRAGMENT" -offset indent .It Dv CSUM_IP The interface will compute IP checksums. .It Dv CSUM_TCP The interface will compute TCP checksums. .It Dv CSUM_UDP The interface will compute UDP checksums. .It Dv CSUM_IP_FRAGS The interface can compute a TCP or UDP checksum for a packet fragmented by the host CPU. Makes sense only along with .Dv CSUM_TCP or .Dv CSUM_UDP . .It Dv CSUM_FRAGMENT The interface will do the fragmentation of IP packets if necessary. The host CPU does not need to care about MTU on this interface as long as a packet to transmit through it is an IP one and it does not exceed the size of the hardware buffer. .El .Pp An interface notifies the TCP/IP module about the tasks the former has performed on an .Em incoming packet by setting the corresponding flags in the field .Va mbuf.m_pkthdr.csum_flags of the .Vt mbuf chain containing the packet. See .Xr mbuf 9 for details. .Pp The capability of a network interface to operate in .Xr polling 4 mode involves several flags in different global variables and per-interface fields. The capability flag .Dv IFCAP_POLLING set in interface's .Va if_capabilities indicates support for .Xr polling 4 on the particular interface. If set in .Va if_capabilities , the same flag can be marked or cleared in the interface's .Va if_capenable within .Fn ifioctl , thus initiating switch of the interface to .Xr polling 4 mode or interrupt mode, respectively. The actual mode change is managed by the driver-specific .Fn if_ioctl routine. The .Xr polling handler returns the number of packets processed. .Ss The Vt if_data Ss Structure The .Vt if_data structure contains statistics and identifying information used by management programs, and which is exported to user programs by way of the .Xr ifmib 4 branch of the .Xr sysctl 3 MIB. The following elements of the .Vt if_data structure are initialized by the interface and are not expected to change significantly over the course of normal operation: .Bl -tag -width ".Va ifi_lastchange" -offset indent .It Va ifi_type .Pq Vt u_char The type of the interface, as defined in .In net/if_types.h and described below in the .Sx "Interface Types" section. .It Va ifi_physical .Pq Vt u_char Intended to represent a selection of physical layers on devices which support more than one; never implemented. .It Va ifi_addrlen .Pq Vt u_char Length of a link-layer address on this device, or zero if there are none. Used to initialized the address length field in .Vt sockaddr_dl structures referring to this interface. .It Va ifi_hdrlen .Pq Vt u_char Maximum length of any link-layer header which might be prepended by the driver to a packet before transmission. The generic code computes the maximum over all interfaces and uses that value to influence the placement of data in .Vt mbuf Ns s to attempt to ensure that there is always sufficient space to prepend a link-layer header without allocating an additional .Vt mbuf . .It Va ifi_datalen .Pq Vt u_char Length of the .Vt if_data structure. Allows some stabilization of the routing socket ABI in the face of increases in the length of .Vt struct ifdata . .It Va ifi_mtu .Pq Vt u_long The maximum transmission unit of the medium, exclusive of any link-layer overhead. .It Va ifi_metric .Pq Vt u_long A dimensionless metric interpreted by a user-mode routing process. .It Va ifi_baudrate .Pq Vt u_long The line rate of the interface, in bits per second. .It Va ifi_hwassist .Pq Vt u_long A detailed interpretation of the capabilities to offload computational tasks for .Em outgoing packets. The interface driver must keep this field in accord with the current value of .Va if_capenable . .It Va ifi_epoch .Pq Vt time_t The system uptime when interface was attached or the statistics below were reset. This is intended to be used to set the SNMP variable .Va ifCounterDiscontinuityTime . It may also be used to determine if two successive queries for an interface of the same index have returned results for the same interface. .El .Pp The structure additionally contains generic statistics applicable to a variety of different interface types (except as noted, all members are of type .Vt u_long ) : .Bl -tag -width ".Va ifi_lastchange" -offset indent .It Va ifi_link_state .Pq Vt u_char The current link state of Ethernet interfaces. See the .Sx Interface Link States section for possible values. .It Va ifi_ipackets Number of packets received. .It Va ifi_ierrors Number of receive errors detected (e.g., FCS errors, DMA overruns, etc.). More detailed breakdowns can often be had by way of a link-specific MIB. .It Va ifi_opackets Number of packets transmitted. .It Va ifi_oerrors Number of output errors detected (e.g., late collisions, DMA overruns, etc.). More detailed breakdowns can often be had by way of a link-specific MIB. .It Va ifi_collisions Total number of collisions detected on output for CSMA interfaces. (This member is sometimes [ab]used by other types of interfaces for other output error counts.) .It Va ifi_ibytes Total traffic received, in bytes. .It Va ifi_obytes Total traffic transmitted, in bytes. .It Va ifi_imcasts Number of packets received which were sent by link-layer multicast. .It Va ifi_omcasts Number of packets sent by link-layer multicast. .It Va ifi_iqdrops Number of packets dropped on input. Rarely implemented. .It Va ifi_noproto Number of packets received for unknown network-layer protocol. .It Va ifi_lastchange .Pq Vt "struct timeval" The time of the last administrative change to the interface (as required for .Tn SNMP ) . .El .Ss Interface Types The header file .In net/if_types.h defines symbolic constants for a number of different types of interfaces. The most common are: .Pp .Bl -tag -offset indent -width ".Dv IFT_PROPVIRTUAL" -compact .It Dv IFT_OTHER none of the following .It Dv IFT_ETHER Ethernet .It Dv IFT_ISO88023 ISO 8802-3 CSMA/CD .It Dv IFT_ISO88024 ISO 8802-4 Token Bus .It Dv IFT_ISO88025 ISO 8802-5 Token Ring .It Dv IFT_ISO88026 ISO 8802-6 DQDB MAN .It Dv IFT_FDDI FDDI .It Dv IFT_PPP Internet Point-to-Point Protocol .Pq Xr ppp 8 .It Dv IFT_LOOP The loopback .Pq Xr lo 4 interface .It Dv IFT_SLIP Serial Line IP .It Dv IFT_PARA Parallel-port IP .Pq Dq Tn PLIP .It Dv IFT_ATM Asynchronous Transfer Mode .It Dv IFT_USB USB Interface .El .Ss Interface Link States The following link states are currently defined: .Pp .Bl -tag -offset indent -width ".Dv LINK_STATE_UNKNOWN" -compact .It Dv LINK_STATE_UNKNOWN The link is in an invalid or unknown state. .It Dv LINK_STATE_DOWN The link is down. .It Dv LINK_STATE_UP The link is up. .El .Ss The Vt ifaddr Ss Structure Every interface is associated with a list (or, rather, a .Li TAILQ ) of addresses, rooted at the interface structure's .Va if_addrlist member. The first element in this list is always an .Dv AF_LINK address representing the interface itself; multi-access network drivers should complete this structure by filling in their link-layer addresses after calling .Fn if_attach . Other members of the structure represent network-layer addresses which have been configured by means of the .Dv SIOCAIFADDR command to .Xr ioctl 2 , called on a socket of the appropriate protocol family. The elements of this list consist of .Vt ifaddr structures. Most protocols will declare their own protocol-specific interface address structures, but all begin with a .Vt "struct ifaddr" which provides the most-commonly-needed functionality across all protocols. Interface addresses are reference-counted. .Pp The members of .Vt "struct ifaddr" are as follows: .Bl -tag -width ".Va ifa_rtrequest" -offset indent .It Va ifa_addr .Pq Vt "struct sockaddr *" The local address of the interface. .It Va ifa_dstaddr .Pq Vt "struct sockaddr *" The remote address of point-to-point interfaces, and the broadcast address of broadcast interfaces. .Va ( ifa_broadaddr is a macro for .Va ifa_dstaddr . ) .It Va ifa_netmask .Pq Vt "struct sockaddr *" The network mask for multi-access interfaces, and the confusion generator for point-to-point interfaces. .It Va ifa_ifp .Pq Vt "struct ifnet *" A link back to the interface structure. .It Va ifa_link .Pq Fn TAILQ_ENTRY ifaddr .Xr queue 3 glue for list of addresses on each interface. .It Va ifa_rtrequest See below. .It Va ifa_flags .Pq Vt u_short Some of the flags which would be used for a route representing this address in the route table. .It Va ifa_refcnt .Pq Vt short The reference count. .El .Pp References to .Vt ifaddr structures are gained by calling the .Fn ifa_ref function and released by calling the .Fn ifa_free function. .Pp .Fn ifa_rtrequest is a pointer to a function which receives callouts from the routing code .Pq Fn rtrequest to perform link-layer-specific actions upon requests to add, or delete routes. The .Fa cmd argument indicates the request in question: .Dv RTM_ADD , or .Dv RTM_DELETE . The .Fa rt argument is the route in question; the .Fa info argument contains the specific destination being manipulated. .Sh FUNCTIONS The functions provided by the generic interface code can be divided into two groups: those which manipulate interfaces, and those which manipulate interface addresses. In addition to these functions, there may also be link-layer support routines which are used by a number of drivers implementing a specific link layer over different hardware; see the documentation for that link layer for more details. .Ss The Vt ifmultiaddr Ss Structure Every multicast-capable interface is associated with a list of multicast group memberships, which indicate at a low level which link-layer multicast addresses (if any) should be accepted, and at a high level, in which network-layer multicast groups a user process has expressed interest. .Pp The elements of the structure are as follows: .Bl -tag -width ".Va ifma_refcount" -offset indent .It Va ifma_link .Pq Fn LIST_ENTRY ifmultiaddr .Xr queue 3 macro glue. .It Va ifma_addr .Pq Vt "struct sockaddr *" A pointer to the address which this record represents. The memberships for various address families are stored in arbitrary order. .It Va ifma_lladdr .Pq Vt "struct sockaddr *" A pointer to the link-layer multicast address, if any, to which the network-layer multicast address in .Va ifma_addr is mapped, else a null pointer. If this element is non-nil, this membership also holds an invisible reference to another membership for that link-layer address. .It Va ifma_refcount .Pq Vt u_int A reference count of requests for this particular membership. .El .Ss Interface Manipulation Functions .Bl -ohang -offset indent .It Fn if_alloc Allocate and initialize .Vt "struct ifnet" . Initialization includes the allocation of an interface index and may include the allocation of a .Fa type specific structure in .Va if_l2com . .It Fn if_attach Link the specified interface .Fa ifp into the list of network interfaces. Also initialize the list of addresses on that interface, and create a link-layer .Vt ifaddr structure to be the first element in that list. (A pointer to this address structure is saved in the .Vt ifnet structure and shall be accessed by the .Fn ifaddr_byindex function.) The .Fa ifp must have been allocated by .Fn if_alloc . .It Fn if_detach Shut down and unlink the specified .Fa ifp from the interface list. .It Fn if_free Free the given .Fa ifp back to the system. The interface must have been previously detached if it was ever attached. .It Fn if_free_type Identical to .Fn if_free except that the given .Fa type is used to free .Va if_l2com instead of the type in .Va if_type . This is intended for use with drivers that change their interface type. .It Fn if_down Mark the interface .Fa ifp as down (i.e., .Dv IFF_UP is not set), flush its output queue, notify protocols of the transition, and generate a message from the .Xr route 4 routing socket. .It Fn if_up Mark the interface .Fa ifp as up, notify protocols of the transition, and generate a message from the .Xr route 4 routing socket. .It Fn ifpromisc Add or remove a promiscuous reference to .Fa ifp . If .Fa pswitch is true, add a reference; if it is false, remove a reference. On reference count transitions from zero to one and one to zero, set the .Dv IFF_PROMISC flag appropriately and call .Fn if_ioctl to set up the interface in the desired mode. .It Fn if_allmulti As .Fn ifpromisc , but for the all-multicasts .Pq Dv IFF_ALLMULTI flag instead of the promiscuous flag. .It Fn ifunit Return an .Vt ifnet pointer for the interface named .Fa name . .It Fn ifunit_ref Return a reference-counted (via .Fn ifa_ref ) .Vt ifnet pointer for the interface named .Fa name . This is the preferred function over .Fn ifunit . The caller is responsible for releasing the reference with .Fn if_rele when it is finished with the ifnet. .It Fn ifioctl Process the ioctl request .Fa cmd , issued on socket .Fa so by thread .Fa td , with data parameter .Fa data . This is the main routine for handling all interface configuration requests from user mode. It is ordinarily only called from the socket-layer .Xr ioctl 2 handler, and only for commands with class .Sq Li i . Any unrecognized commands will be passed down to socket .Fa so Ns 's protocol for further interpretation. The following commands are handled by .Fn ifioctl : .Pp .Bl -tag -width ".Dv SIOCGIFNETMASK" -offset indent -compact .It Dv SIOCGIFCONF Get interface configuration. (No call-down to driver.) .Pp .It Dv SIOCSIFNAME Set the interface name. .Dv RTM_IFANNOUNCE departure and arrival messages are sent so that routing code that relies on the interface name will update its interface list. Caller must have appropriate privilege. (No call-down to driver.) .It Dv SIOCGIFCAP .It Dv SIOCGIFFIB .It Dv SIOCGIFFLAGS .It Dv SIOCGIFMETRIC .It Dv SIOCGIFMTU .It Dv SIOCGIFPHYS Get interface capabilities, FIB, flags, metric, MTU, medium selection. (No call-down to driver.) .Pp .It Dv SIOCSIFCAP Enable or disable interface capabilities. Caller must have appropriate privilege. Before a call to the driver-specific .Fn if_ioctl routine, the requested mask for enabled capabilities is checked against the mask of capabilities supported by the interface, .Va if_capabilities . Requesting to enable an unsupported capability is invalid. The rest is supposed to be done by the driver, which includes updating .Va if_capenable and .Va if_data.ifi_hwassist appropriately. .Pp .It Dv SIOCSIFFIB Sets interface FIB. Caller must have appropriate privilege. FIB values start at 0 and values greater or equals than .Va net.fibs are considered invalid. .It Dv SIOCSIFFLAGS Change interface flags. Caller must have appropriate privilege. If a change to the .Dv IFF_UP flag is requested, .Fn if_up or .Fn if_down is called as appropriate. Flags listed in .Dv IFF_CANTCHANGE are masked off, and the field .Va if_flags in the interface structure is updated. Finally, the driver .Fn if_ioctl routine is called to perform any setup requested. .Pp .It Dv SIOCSIFMETRIC .It Dv SIOCSIFPHYS Change interface metric or medium. Caller must have appropriate privilege. .Pp .It Dv SIOCSIFMTU Change interface MTU. Caller must have appropriate privilege. MTU values less than 72 or greater than 65535 are considered invalid. The driver .Fn if_ioctl routine is called to implement the change; it is responsible for any additional sanity checking and for actually modifying the MTU in the interface structure. .Pp .It Dv SIOCADDMULTI .It Dv SIOCDELMULTI Add or delete permanent multicast group memberships on the interface. Caller must have appropriate privilege. The .Fn if_addmulti or .Fn if_delmulti function is called to perform the operation; qq.v. .Pp .It Dv SIOCAIFADDR .It Dv SIOCDIFADDR The socket's protocol control routine is called to implement the requested action. .El .El .Pp .Fn if_down , .Fn ifioctl , .Fn ifpromisc , and .Fn if_up must be called at .Fn splnet or higher. .Ss "Interface Address Functions" Several functions exist to look up an interface address structure given an address. .Fn ifa_ifwithaddr returns an interface address with either a local address or a broadcast address precisely matching the parameter .Fa addr . .Fn ifa_ifwithdstaddr returns an interface address for a point-to-point interface whose remote .Pq Dq destination address is -.Fa addr . +.Fa addr +and a fib is +.Fa fib . +If +.Fa fib +is +.Dv RT_ALL_FIBS , +then the first interface address matching +.Fa addr +will be returned. .Pp .Fn ifa_ifwithnet returns the most specific interface address which matches the specified address, .Fa addr , subject to its configured netmask, or a point-to-point interface address whose remote address is .Fa addr if one is found. If .Fa ignore_ptp -is true, skip point-to-point interface addresses. +is true, skip point-to-point interface addresses. The +.Fa fib +parameter is handled the same way as by +.Fn ifa_ifwithdstaddr . .Pp .Fn ifaof_ifpforaddr returns the most specific address configured on interface .Fa ifp which matches address .Fa addr , subject to its configured netmask. If the interface is point-to-point, only an interface address whose remote address is precisely .Fa addr will be returned. .Pp .Fn ifaddr_byindex returns the link-level address of the interface with the given index .Fa idx . .Pp All of these functions return a null pointer if no such address can be found. .Ss "Interface Multicast Address Functions" The .Fn if_addmulti , .Fn if_delmulti , and .Fn if_findmulti functions provide support for requesting and relinquishing multicast group memberships, and for querying an interface's membership list, respectively. The .Fn if_addmulti function takes a pointer to an interface, .Fa ifp , and a generic address, .Fa sa . It also takes a pointer to a .Vt "struct ifmultiaddr *" which is filled in on successful return with the address of the group membership control block. The .Fn if_addmulti function performs the following four-step process: .Bl -enum -offset indent .It Call the interface's .Fn if_resolvemulti entry point to determine the link-layer address, if any, corresponding to this membership request, and also to give the link layer an opportunity to veto this membership request should it so desire. .It Check the interface's group membership list for a pre-existing membership for this group. If one is not found, allocate a new one; if one is, increment its reference count. .It If the .Fn if_resolvemulti routine returned a link-layer address corresponding to the group, repeat the previous step for that address as well. .It If the interface's multicast address filter needs to be changed because a new membership was added, call the interface's .Fn if_ioctl routine (with a .Fa cmd argument of .Dv SIOCADDMULTI ) to request that it do so. .El .Pp The .Fn if_delmulti function, given an interface .Fa ifp and an address, .Fa sa , reverses this process. Both functions return zero on success, or a standard error number on failure. .Pp The .Fn if_findmulti function examines the membership list of interface .Fa ifp for an address matching .Fa sa , and returns a pointer to that .Vt "struct ifmultiaddr" if one is found, else it returns a null pointer. .Sh SEE ALSO .Xr ioctl 2 , .Xr link_addr 3 , .Xr queue 3 , .Xr sysctl 3 , .Xr bpf 4 , .Xr ifmib 4 , .Xr lo 4 , .Xr netintro 4 , .Xr polling 4 , .Xr config 8 , .Xr ppp 8 , .Xr mbuf 9 , .Xr rtentry 9 .Rs .%A Gary R. Wright .%A W. Richard Stevens .%B TCP/IP Illustrated .%V Vol. 2 .%O Addison-Wesley, ISBN 0-201-63354-X .Re .Sh AUTHORS This manual page was written by .An Garrett A. Wollman . Index: user/ae/inet6/share/vt/keymaps/Makefile =================================================================== --- user/ae/inet6/share/vt/keymaps/Makefile (revision 271452) +++ user/ae/inet6/share/vt/keymaps/Makefile (revision 271453) @@ -1,91 +1,93 @@ # $FreeBSD$ FILES= INDEX.keymaps \ am.kbd \ be.acc.kbd \ be.kbd \ bg.bds.kbd \ bg.phonetic.kbd \ br.kbd \ br.noacc.kbd \ by.kbd \ ca.kbd \ ca-fr.kbd \ centraleuropean.kbd \ centraleuropean.qwerty.kbd \ ch-fr.acc.kbd \ ch-fr.kbd \ ch.acc.kbd \ ch.kbd \ ch.macbook.acc.kbd \ colemak.acc.kbd \ cz.kbd \ de.acc.kbd \ de.noacc.kbd \ de.kbd \ dk.acc.kbd \ dk.kbd \ dk.macbook.kbd \ ee.kbd \ es.acc.kbd \ es.dvorak.kbd \ es.kbd \ fi.kbd \ + fr.acc.kbd \ fr.dvorak.acc.kbd \ fr.dvorak.kbd \ + fr.kbd \ fr.macbook.kbd \ gr.101.acc.kbd \ gr.elot.acc.kbd \ gr.kbd \ hr.kbd \ hu.101.kbd \ hu.102.kbd \ il.kbd \ is.acc.kbd \ is.kbd \ it.kbd \ jp.capsctrl.kbd \ jp.kbd \ jp.pc98.iso.kbd \ jp.pc98.kbd \ kz.io.kbd \ kz.kst.kbd \ latinamerican.acc.kbd \ latinamerican.kbd \ lt.kbd \ nl.kbd \ no.dvorak.kbd \ no.kbd \ nordic.asus-eee.kbd \ pl.dvorak.kbd \ pl.kbd \ pt.acc.kbd \ pt.kbd \ ru.kbd \ ru.shift.kbd \ ru.win.kbd \ se.kbd \ si.kbd \ sk.kbd \ tr.kbd \ ua.kbd \ ua.shift.alt.kbd \ uk.capsctrl.kbd \ uk.dvorak.kbd \ uk.kbd \ us.acc.kbd \ us.ctrl.kbd \ us.dvorak.kbd \ us.dvorakl.kbd \ us.dvorakp.kbd \ us.dvorakr.kbd \ us.dvorakx.kbd \ us.emacs.kbd \ us.kbd \ us.unix.kbd \ FILESDIR= ${SHAREDIR}/vt/keymaps NO_OBJ= .include Index: user/ae/inet6/share =================================================================== --- user/ae/inet6/share (revision 271452) +++ user/ae/inet6/share (revision 271453) Property changes on: user/ae/inet6/share ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/share:r271428-271452 Index: user/ae/inet6/sys/amd64/include/vmm.h =================================================================== --- user/ae/inet6/sys/amd64/include/vmm.h (revision 271452) +++ user/ae/inet6/sys/amd64/include/vmm.h (revision 271453) @@ -1,616 +1,617 @@ /*- * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VMM_H_ #define _VMM_H_ #include enum vm_suspend_how { VM_SUSPEND_NONE, VM_SUSPEND_RESET, VM_SUSPEND_POWEROFF, VM_SUSPEND_HALT, VM_SUSPEND_TRIPLEFAULT, VM_SUSPEND_LAST }; /* * Identifiers for architecturally defined registers. */ enum vm_reg_name { VM_REG_GUEST_RAX, VM_REG_GUEST_RBX, VM_REG_GUEST_RCX, VM_REG_GUEST_RDX, VM_REG_GUEST_RSI, VM_REG_GUEST_RDI, VM_REG_GUEST_RBP, VM_REG_GUEST_R8, VM_REG_GUEST_R9, VM_REG_GUEST_R10, VM_REG_GUEST_R11, VM_REG_GUEST_R12, VM_REG_GUEST_R13, VM_REG_GUEST_R14, VM_REG_GUEST_R15, VM_REG_GUEST_CR0, VM_REG_GUEST_CR3, VM_REG_GUEST_CR4, VM_REG_GUEST_DR7, VM_REG_GUEST_RSP, VM_REG_GUEST_RIP, VM_REG_GUEST_RFLAGS, VM_REG_GUEST_ES, VM_REG_GUEST_CS, VM_REG_GUEST_SS, VM_REG_GUEST_DS, VM_REG_GUEST_FS, VM_REG_GUEST_GS, VM_REG_GUEST_LDTR, VM_REG_GUEST_TR, VM_REG_GUEST_IDTR, VM_REG_GUEST_GDTR, VM_REG_GUEST_EFER, VM_REG_GUEST_CR2, VM_REG_GUEST_PDPTE0, VM_REG_GUEST_PDPTE1, VM_REG_GUEST_PDPTE2, VM_REG_GUEST_PDPTE3, + VM_REG_GUEST_INTR_SHADOW, VM_REG_LAST }; enum x2apic_state { X2APIC_DISABLED, X2APIC_ENABLED, X2APIC_STATE_LAST }; #define VM_INTINFO_VECTOR(info) ((info) & 0xff) #define VM_INTINFO_DEL_ERRCODE 0x800 #define VM_INTINFO_RSVD 0x7ffff000 #define VM_INTINFO_VALID 0x80000000 #define VM_INTINFO_TYPE 0x700 #define VM_INTINFO_HWINTR (0 << 8) #define VM_INTINFO_NMI (2 << 8) #define VM_INTINFO_HWEXCEPTION (3 << 8) #define VM_INTINFO_SWINTR (4 << 8) #ifdef _KERNEL #define VM_MAX_NAMELEN 32 struct vm; struct vm_exception; struct vm_memory_segment; struct seg_desc; struct vm_exit; struct vm_run; struct vhpet; struct vioapic; struct vlapic; struct vmspace; struct vm_object; struct vm_guest_paging; struct pmap; typedef int (*vmm_init_func_t)(int ipinum); typedef int (*vmm_cleanup_func_t)(void); typedef void (*vmm_resume_func_t)(void); typedef void * (*vmi_init_func_t)(struct vm *vm, struct pmap *pmap); typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip, struct pmap *pmap, void *rendezvous_cookie, void *suspend_cookie); typedef void (*vmi_cleanup_func_t)(void *vmi); typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num, uint64_t *retval); typedef int (*vmi_set_register_t)(void *vmi, int vcpu, int num, uint64_t val); typedef int (*vmi_get_desc_t)(void *vmi, int vcpu, int num, struct seg_desc *desc); typedef int (*vmi_set_desc_t)(void *vmi, int vcpu, int num, struct seg_desc *desc); typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval); typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val); typedef struct vmspace * (*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max); typedef void (*vmi_vmspace_free)(struct vmspace *vmspace); typedef struct vlapic * (*vmi_vlapic_init)(void *vmi, int vcpu); typedef void (*vmi_vlapic_cleanup)(void *vmi, struct vlapic *vlapic); struct vmm_ops { vmm_init_func_t init; /* module wide initialization */ vmm_cleanup_func_t cleanup; vmm_resume_func_t resume; vmi_init_func_t vminit; /* vm-specific initialization */ vmi_run_func_t vmrun; vmi_cleanup_func_t vmcleanup; vmi_get_register_t vmgetreg; vmi_set_register_t vmsetreg; vmi_get_desc_t vmgetdesc; vmi_set_desc_t vmsetdesc; vmi_get_cap_t vmgetcap; vmi_set_cap_t vmsetcap; vmi_vmspace_alloc vmspace_alloc; vmi_vmspace_free vmspace_free; vmi_vlapic_init vlapic_init; vmi_vlapic_cleanup vlapic_cleanup; }; extern struct vmm_ops vmm_ops_intel; extern struct vmm_ops vmm_ops_amd; int vm_create(const char *name, struct vm **retvm); void vm_destroy(struct vm *vm); int vm_reinit(struct vm *vm); const char *vm_name(struct vm *vm); int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len); int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len); void *vm_gpa_hold(struct vm *, vm_paddr_t gpa, size_t len, int prot, void **cookie); void vm_gpa_release(void *cookie); int vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase, struct vm_memory_segment *seg); int vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len, vm_offset_t *offset, struct vm_object **object); boolean_t vm_mem_allocated(struct vm *vm, vm_paddr_t gpa); int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval); int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val); int vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *ret_desc); int vm_set_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc); int vm_run(struct vm *vm, struct vm_run *vmrun); int vm_suspend(struct vm *vm, enum vm_suspend_how how); int vm_inject_nmi(struct vm *vm, int vcpu); int vm_nmi_pending(struct vm *vm, int vcpuid); void vm_nmi_clear(struct vm *vm, int vcpuid); int vm_inject_extint(struct vm *vm, int vcpu); int vm_extint_pending(struct vm *vm, int vcpuid); void vm_extint_clear(struct vm *vm, int vcpuid); uint64_t *vm_guest_msrs(struct vm *vm, int cpu); struct vlapic *vm_lapic(struct vm *vm, int cpu); struct vioapic *vm_ioapic(struct vm *vm); struct vhpet *vm_hpet(struct vm *vm); int vm_get_capability(struct vm *vm, int vcpu, int type, int *val); int vm_set_capability(struct vm *vm, int vcpu, int type, int val); int vm_get_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state *state); int vm_set_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state state); int vm_apicid2vcpuid(struct vm *vm, int apicid); int vm_activate_cpu(struct vm *vm, int vcpu); cpuset_t vm_active_cpus(struct vm *vm); cpuset_t vm_suspended_cpus(struct vm *vm); struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid); void vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip); void vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip); void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip); /* * Rendezvous all vcpus specified in 'dest' and execute 'func(arg)'. * The rendezvous 'func(arg)' is not allowed to do anything that will * cause the thread to be put to sleep. * * If the rendezvous is being initiated from a vcpu context then the * 'vcpuid' must refer to that vcpu, otherwise it should be set to -1. * * The caller cannot hold any locks when initiating the rendezvous. * * The implementation of this API may cause vcpus other than those specified * by 'dest' to be stalled. The caller should not rely on any vcpus making * forward progress when the rendezvous is in progress. */ typedef void (*vm_rendezvous_func_t)(struct vm *vm, int vcpuid, void *arg); void vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest, vm_rendezvous_func_t func, void *arg); static __inline int vcpu_rendezvous_pending(void *rendezvous_cookie) { return (*(uintptr_t *)rendezvous_cookie != 0); } static __inline int vcpu_suspended(void *suspend_cookie) { return (*(int *)suspend_cookie); } /* * Return 1 if device indicated by bus/slot/func is supposed to be a * pci passthrough device. * * Return 0 otherwise. */ int vmm_is_pptdev(int bus, int slot, int func); void *vm_iommu_domain(struct vm *vm); enum vcpu_state { VCPU_IDLE, VCPU_FROZEN, VCPU_RUNNING, VCPU_SLEEPING, }; int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state, bool from_idle); enum vcpu_state vcpu_get_state(struct vm *vm, int vcpu, int *hostcpu); static int __inline vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu) { return (vcpu_get_state(vm, vcpu, hostcpu) == VCPU_RUNNING); } #ifdef _SYS_PROC_H_ static int __inline vcpu_should_yield(struct vm *vm, int vcpu) { return (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED)); } #endif void *vcpu_stats(struct vm *vm, int vcpu); void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr); struct vmspace *vm_get_vmspace(struct vm *vm); int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func); int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func); struct vatpic *vm_atpic(struct vm *vm); struct vatpit *vm_atpit(struct vm *vm); /* * Inject exception 'vme' into the guest vcpu. This function returns 0 on * success and non-zero on failure. * * Wrapper functions like 'vm_inject_gp()' should be preferred to calling * this function directly because they enforce the trap-like or fault-like * behavior of an exception. * * This function should only be called in the context of the thread that is * executing this vcpu. */ int vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *vme); /* * This function is called after a VM-exit that occurred during exception or * interrupt delivery through the IDT. The format of 'intinfo' is described * in Figure 15-1, "EXITINTINFO for All Intercepts", APM, Vol 2. * * If a VM-exit handler completes the event delivery successfully then it * should call vm_exit_intinfo() to extinguish the pending event. For e.g., * if the task switch emulation is triggered via a task gate then it should * call this function with 'intinfo=0' to indicate that the external event * is not pending anymore. * * Return value is 0 on success and non-zero on failure. */ int vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t intinfo); /* * This function is called before every VM-entry to retrieve a pending * event that should be injected into the guest. This function combines * nested events into a double or triple fault. * * Returns 0 if there are no events that need to be injected into the guest * and non-zero otherwise. */ int vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *info); int vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2); enum vm_reg_name vm_segment_name(int seg_encoding); struct vm_copyinfo { uint64_t gpa; size_t len; void *hva; void *cookie; }; /* * Set up 'copyinfo[]' to copy to/from guest linear address space starting * at 'gla' and 'len' bytes long. The 'prot' should be set to PROT_READ for * a copyin or PROT_WRITE for a copyout. * * Returns 0 on success. * Returns 1 if an exception was injected into the guest. * Returns -1 otherwise. * * The 'copyinfo[]' can be passed to 'vm_copyin()' or 'vm_copyout()' only if * the return value is 0. The 'copyinfo[]' resources should be freed by calling * 'vm_copy_teardown()' after the copy is done. */ int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, int num_copyinfo); void vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, int num_copyinfo); void vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr, size_t len); void vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, struct vm_copyinfo *copyinfo, size_t len); #endif /* KERNEL */ #define VM_MAXCPU 16 /* maximum virtual cpus */ /* * Identifiers for optional vmm capabilities */ enum vm_cap_type { VM_CAP_HALT_EXIT, VM_CAP_MTRAP_EXIT, VM_CAP_PAUSE_EXIT, VM_CAP_UNRESTRICTED_GUEST, VM_CAP_ENABLE_INVPCID, VM_CAP_MAX }; enum vm_intr_trigger { EDGE_TRIGGER, LEVEL_TRIGGER }; /* * The 'access' field has the format specified in Table 21-2 of the Intel * Architecture Manual vol 3b. * * XXX The contents of the 'access' field are architecturally defined except * bit 16 - Segment Unusable. */ struct seg_desc { uint64_t base; uint32_t limit; uint32_t access; }; #define SEG_DESC_TYPE(access) ((access) & 0x001f) #define SEG_DESC_DPL(access) (((access) >> 5) & 0x3) #define SEG_DESC_PRESENT(access) (((access) & 0x0080) ? 1 : 0) #define SEG_DESC_DEF32(access) (((access) & 0x4000) ? 1 : 0) #define SEG_DESC_GRANULARITY(access) (((access) & 0x8000) ? 1 : 0) #define SEG_DESC_UNUSABLE(access) (((access) & 0x10000) ? 1 : 0) enum vm_cpu_mode { CPU_MODE_REAL, CPU_MODE_PROTECTED, CPU_MODE_COMPATIBILITY, /* IA-32E mode (CS.L = 0) */ CPU_MODE_64BIT, /* IA-32E mode (CS.L = 1) */ }; enum vm_paging_mode { PAGING_MODE_FLAT, PAGING_MODE_32, PAGING_MODE_PAE, PAGING_MODE_64, }; struct vm_guest_paging { uint64_t cr3; int cpl; enum vm_cpu_mode cpu_mode; enum vm_paging_mode paging_mode; }; /* * The data structures 'vie' and 'vie_op' are meant to be opaque to the * consumers of instruction decoding. The only reason why their contents * need to be exposed is because they are part of the 'vm_exit' structure. */ struct vie_op { uint8_t op_byte; /* actual opcode byte */ uint8_t op_type; /* type of operation (e.g. MOV) */ uint16_t op_flags; }; #define VIE_INST_SIZE 15 struct vie { uint8_t inst[VIE_INST_SIZE]; /* instruction bytes */ uint8_t num_valid; /* size of the instruction */ uint8_t num_processed; uint8_t addrsize:4, opsize:4; /* address and operand sizes */ uint8_t rex_w:1, /* REX prefix */ rex_r:1, rex_x:1, rex_b:1, rex_present:1, opsize_override:1, /* Operand size override */ addrsize_override:1; /* Address size override */ uint8_t mod:2, /* ModRM byte */ reg:4, rm:4; uint8_t ss:2, /* SIB byte */ index:4, base:4; uint8_t disp_bytes; uint8_t imm_bytes; uint8_t scale; int base_register; /* VM_REG_GUEST_xyz */ int index_register; /* VM_REG_GUEST_xyz */ int64_t displacement; /* optional addr displacement */ int64_t immediate; /* optional immediate operand */ uint8_t decoded; /* set to 1 if successfully decoded */ struct vie_op op; /* opcode description */ }; enum vm_exitcode { VM_EXITCODE_INOUT, VM_EXITCODE_VMX, VM_EXITCODE_BOGUS, VM_EXITCODE_RDMSR, VM_EXITCODE_WRMSR, VM_EXITCODE_HLT, VM_EXITCODE_MTRAP, VM_EXITCODE_PAUSE, VM_EXITCODE_PAGING, VM_EXITCODE_INST_EMUL, VM_EXITCODE_SPINUP_AP, VM_EXITCODE_DEPRECATED1, /* used to be SPINDOWN_CPU */ VM_EXITCODE_RENDEZVOUS, VM_EXITCODE_IOAPIC_EOI, VM_EXITCODE_SUSPENDED, VM_EXITCODE_INOUT_STR, VM_EXITCODE_TASK_SWITCH, VM_EXITCODE_MAX }; struct vm_inout { uint16_t bytes:3; /* 1 or 2 or 4 */ uint16_t in:1; uint16_t string:1; uint16_t rep:1; uint16_t port; uint32_t eax; /* valid for out */ }; struct vm_inout_str { struct vm_inout inout; /* must be the first element */ struct vm_guest_paging paging; uint64_t rflags; uint64_t cr0; uint64_t index; uint64_t count; /* rep=1 (%rcx), rep=0 (1) */ int addrsize; enum vm_reg_name seg_name; struct seg_desc seg_desc; }; enum task_switch_reason { TSR_CALL, TSR_IRET, TSR_JMP, TSR_IDT_GATE, /* task gate in IDT */ }; struct vm_task_switch { uint16_t tsssel; /* new TSS selector */ int ext; /* task switch due to external event */ uint32_t errcode; int errcode_valid; /* push 'errcode' on the new stack */ enum task_switch_reason reason; struct vm_guest_paging paging; }; struct vm_exit { enum vm_exitcode exitcode; int inst_length; /* 0 means unknown */ uint64_t rip; union { struct vm_inout inout; struct vm_inout_str inout_str; struct { uint64_t gpa; int fault_type; } paging; struct { uint64_t gpa; uint64_t gla; int cs_d; /* CS.D */ struct vm_guest_paging paging; struct vie vie; } inst_emul; /* * VMX specific payload. Used when there is no "better" * exitcode to represent the VM-exit. */ struct { int status; /* vmx inst status */ /* * 'exit_reason' and 'exit_qualification' are valid * only if 'status' is zero. */ uint32_t exit_reason; uint64_t exit_qualification; /* * 'inst_error' and 'inst_type' are valid * only if 'status' is non-zero. */ int inst_type; int inst_error; } vmx; struct { uint32_t code; /* ecx value */ uint64_t wval; } msr; struct { int vcpu; uint64_t rip; } spinup_ap; struct { uint64_t rflags; } hlt; struct { int vector; } ioapic_eoi; struct { enum vm_suspend_how how; } suspended; struct vm_task_switch task_switch; } u; }; /* APIs to inject faults into the guest */ void vm_inject_fault(void *vm, int vcpuid, int vector, int errcode_valid, int errcode); static __inline void vm_inject_ud(void *vm, int vcpuid) { vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0); } static __inline void vm_inject_gp(void *vm, int vcpuid) { vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0); } static __inline void vm_inject_ac(void *vm, int vcpuid, int errcode) { vm_inject_fault(vm, vcpuid, IDT_AC, 1, errcode); } static __inline void vm_inject_ss(void *vm, int vcpuid, int errcode) { vm_inject_fault(vm, vcpuid, IDT_SS, 1, errcode); } void vm_inject_pf(void *vm, int vcpuid, int error_code, uint64_t cr2); #endif /* _VMM_H_ */ Property changes on: user/ae/inet6/sys/amd64/include/vmm.h ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/sys/amd64/include/vmm.h:r271428-271452 Index: user/ae/inet6/sys/amd64/vmm/intel/vmx.c =================================================================== --- user/ae/inet6/sys/amd64/vmm/intel/vmx.c (revision 271452) +++ user/ae/inet6/sys/amd64/vmm/intel/vmx.c (revision 271453) @@ -1,3308 +1,3354 @@ /*- * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmm_host.h" #include "vmm_ioport.h" #include "vmm_ipi.h" #include "vmm_msr.h" #include "vmm_ktr.h" #include "vmm_stat.h" #include "vatpic.h" #include "vlapic.h" #include "vlapic_priv.h" #include "vmx_msr.h" #include "ept.h" #include "vmx_cpufunc.h" #include "vmx.h" #include "x86.h" #include "vmx_controls.h" #define PINBASED_CTLS_ONE_SETTING \ (PINBASED_EXTINT_EXITING | \ PINBASED_NMI_EXITING | \ PINBASED_VIRTUAL_NMI) #define PINBASED_CTLS_ZERO_SETTING 0 #define PROCBASED_CTLS_WINDOW_SETTING \ (PROCBASED_INT_WINDOW_EXITING | \ PROCBASED_NMI_WINDOW_EXITING) #define PROCBASED_CTLS_ONE_SETTING \ (PROCBASED_SECONDARY_CONTROLS | \ PROCBASED_IO_EXITING | \ PROCBASED_MSR_BITMAPS | \ PROCBASED_CTLS_WINDOW_SETTING | \ PROCBASED_CR8_LOAD_EXITING | \ PROCBASED_CR8_STORE_EXITING) #define PROCBASED_CTLS_ZERO_SETTING \ (PROCBASED_CR3_LOAD_EXITING | \ PROCBASED_CR3_STORE_EXITING | \ PROCBASED_IO_BITMAPS) #define PROCBASED_CTLS2_ONE_SETTING PROCBASED2_ENABLE_EPT #define PROCBASED_CTLS2_ZERO_SETTING 0 #define VM_EXIT_CTLS_ONE_SETTING_NO_PAT \ (VM_EXIT_HOST_LMA | \ VM_EXIT_SAVE_EFER | \ VM_EXIT_LOAD_EFER) #define VM_EXIT_CTLS_ONE_SETTING \ (VM_EXIT_CTLS_ONE_SETTING_NO_PAT | \ VM_EXIT_ACKNOWLEDGE_INTERRUPT | \ VM_EXIT_SAVE_PAT | \ VM_EXIT_LOAD_PAT) #define VM_EXIT_CTLS_ZERO_SETTING VM_EXIT_SAVE_DEBUG_CONTROLS #define VM_ENTRY_CTLS_ONE_SETTING_NO_PAT VM_ENTRY_LOAD_EFER #define VM_ENTRY_CTLS_ONE_SETTING \ (VM_ENTRY_CTLS_ONE_SETTING_NO_PAT | \ VM_ENTRY_LOAD_PAT) #define VM_ENTRY_CTLS_ZERO_SETTING \ (VM_ENTRY_LOAD_DEBUG_CONTROLS | \ VM_ENTRY_INTO_SMM | \ VM_ENTRY_DEACTIVATE_DUAL_MONITOR) #define guest_msr_rw(vmx, msr) \ msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW) #define guest_msr_ro(vmx, msr) \ msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_READ) #define HANDLED 1 #define UNHANDLED 0 static MALLOC_DEFINE(M_VMX, "vmx", "vmx"); static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic"); SYSCTL_DECL(_hw_vmm); SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL); int vmxon_enabled[MAXCPU]; static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2; static uint32_t exit_ctls, entry_ctls; static uint64_t cr0_ones_mask, cr0_zeros_mask; SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD, &cr0_ones_mask, 0, NULL); SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD, &cr0_zeros_mask, 0, NULL); static uint64_t cr4_ones_mask, cr4_zeros_mask; SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD, &cr4_ones_mask, 0, NULL); SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD, &cr4_zeros_mask, 0, NULL); static int vmx_initialized; SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD, &vmx_initialized, 0, "Intel VMX initialized"); /* * Optional capabilities */ static SYSCTL_NODE(_hw_vmm_vmx, OID_AUTO, cap, CTLFLAG_RW, NULL, NULL); static int vmx_patmsr; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, patmsr, CTLFLAG_RD, &vmx_patmsr, 0, "PAT MSR saved and restored in VCMS"); static int cap_halt_exit; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, halt_exit, CTLFLAG_RD, &cap_halt_exit, 0, "HLT triggers a VM-exit"); static int cap_pause_exit; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, pause_exit, CTLFLAG_RD, &cap_pause_exit, 0, "PAUSE triggers a VM-exit"); static int cap_unrestricted_guest; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, unrestricted_guest, CTLFLAG_RD, &cap_unrestricted_guest, 0, "Unrestricted guests"); static int cap_monitor_trap; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, monitor_trap, CTLFLAG_RD, &cap_monitor_trap, 0, "Monitor trap flag"); static int cap_invpcid; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, invpcid, CTLFLAG_RD, &cap_invpcid, 0, "Guests are allowed to use INVPCID"); static int virtual_interrupt_delivery; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD, &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support"); static int posted_interrupts; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, posted_interrupts, CTLFLAG_RD, &posted_interrupts, 0, "APICv posted interrupt support"); static int pirvec; SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupt_vector, CTLFLAG_RD, &pirvec, 0, "APICv posted interrupt vector"); static struct unrhdr *vpid_unr; static u_int vpid_alloc_failed; SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD, &vpid_alloc_failed, 0, NULL); /* * Use the last page below 4GB as the APIC access address. This address is * occupied by the boot firmware so it is guaranteed that it will not conflict * with a page in system memory. */ #define APIC_ACCESS_ADDRESS 0xFFFFF000 static int vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc); static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval); static void vmx_inject_pir(struct vlapic *vlapic); #ifdef KTR static const char * exit_reason_to_str(int reason) { static char reasonbuf[32]; switch (reason) { case EXIT_REASON_EXCEPTION: return "exception"; case EXIT_REASON_EXT_INTR: return "extint"; case EXIT_REASON_TRIPLE_FAULT: return "triplefault"; case EXIT_REASON_INIT: return "init"; case EXIT_REASON_SIPI: return "sipi"; case EXIT_REASON_IO_SMI: return "iosmi"; case EXIT_REASON_SMI: return "smi"; case EXIT_REASON_INTR_WINDOW: return "intrwindow"; case EXIT_REASON_NMI_WINDOW: return "nmiwindow"; case EXIT_REASON_TASK_SWITCH: return "taskswitch"; case EXIT_REASON_CPUID: return "cpuid"; case EXIT_REASON_GETSEC: return "getsec"; case EXIT_REASON_HLT: return "hlt"; case EXIT_REASON_INVD: return "invd"; case EXIT_REASON_INVLPG: return "invlpg"; case EXIT_REASON_RDPMC: return "rdpmc"; case EXIT_REASON_RDTSC: return "rdtsc"; case EXIT_REASON_RSM: return "rsm"; case EXIT_REASON_VMCALL: return "vmcall"; case EXIT_REASON_VMCLEAR: return "vmclear"; case EXIT_REASON_VMLAUNCH: return "vmlaunch"; case EXIT_REASON_VMPTRLD: return "vmptrld"; case EXIT_REASON_VMPTRST: return "vmptrst"; case EXIT_REASON_VMREAD: return "vmread"; case EXIT_REASON_VMRESUME: return "vmresume"; case EXIT_REASON_VMWRITE: return "vmwrite"; case EXIT_REASON_VMXOFF: return "vmxoff"; case EXIT_REASON_VMXON: return "vmxon"; case EXIT_REASON_CR_ACCESS: return "craccess"; case EXIT_REASON_DR_ACCESS: return "draccess"; case EXIT_REASON_INOUT: return "inout"; case EXIT_REASON_RDMSR: return "rdmsr"; case EXIT_REASON_WRMSR: return "wrmsr"; case EXIT_REASON_INVAL_VMCS: return "invalvmcs"; case EXIT_REASON_INVAL_MSR: return "invalmsr"; case EXIT_REASON_MWAIT: return "mwait"; case EXIT_REASON_MTF: return "mtf"; case EXIT_REASON_MONITOR: return "monitor"; case EXIT_REASON_PAUSE: return "pause"; case EXIT_REASON_MCE: return "mce"; case EXIT_REASON_TPR: return "tpr"; case EXIT_REASON_APIC_ACCESS: return "apic-access"; case EXIT_REASON_GDTR_IDTR: return "gdtridtr"; case EXIT_REASON_LDTR_TR: return "ldtrtr"; case EXIT_REASON_EPT_FAULT: return "eptfault"; case EXIT_REASON_EPT_MISCONFIG: return "eptmisconfig"; case EXIT_REASON_INVEPT: return "invept"; case EXIT_REASON_RDTSCP: return "rdtscp"; case EXIT_REASON_VMX_PREEMPT: return "vmxpreempt"; case EXIT_REASON_INVVPID: return "invvpid"; case EXIT_REASON_WBINVD: return "wbinvd"; case EXIT_REASON_XSETBV: return "xsetbv"; case EXIT_REASON_APIC_WRITE: return "apic-write"; default: snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason); return (reasonbuf); } } #endif /* KTR */ static int vmx_allow_x2apic_msrs(struct vmx *vmx) { int i, error; error = 0; /* * Allow readonly access to the following x2APIC MSRs from the guest. */ error += guest_msr_ro(vmx, MSR_APIC_ID); error += guest_msr_ro(vmx, MSR_APIC_VERSION); error += guest_msr_ro(vmx, MSR_APIC_LDR); error += guest_msr_ro(vmx, MSR_APIC_SVR); for (i = 0; i < 8; i++) error += guest_msr_ro(vmx, MSR_APIC_ISR0 + i); for (i = 0; i < 8; i++) error += guest_msr_ro(vmx, MSR_APIC_TMR0 + i); for (i = 0; i < 8; i++) error += guest_msr_ro(vmx, MSR_APIC_IRR0 + i); error += guest_msr_ro(vmx, MSR_APIC_ESR); error += guest_msr_ro(vmx, MSR_APIC_LVT_TIMER); error += guest_msr_ro(vmx, MSR_APIC_LVT_THERMAL); error += guest_msr_ro(vmx, MSR_APIC_LVT_PCINT); error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT0); error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT1); error += guest_msr_ro(vmx, MSR_APIC_LVT_ERROR); error += guest_msr_ro(vmx, MSR_APIC_ICR_TIMER); error += guest_msr_ro(vmx, MSR_APIC_DCR_TIMER); error += guest_msr_ro(vmx, MSR_APIC_ICR); /* * Allow TPR, EOI and SELF_IPI MSRs to be read and written by the guest. * * These registers get special treatment described in the section * "Virtualizing MSR-Based APIC Accesses". */ error += guest_msr_rw(vmx, MSR_APIC_TPR); error += guest_msr_rw(vmx, MSR_APIC_EOI); error += guest_msr_rw(vmx, MSR_APIC_SELF_IPI); return (error); } u_long vmx_fix_cr0(u_long cr0) { return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask); } u_long vmx_fix_cr4(u_long cr4) { return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask); } static void vpid_free(int vpid) { if (vpid < 0 || vpid > 0xffff) panic("vpid_free: invalid vpid %d", vpid); /* * VPIDs [0,VM_MAXCPU] are special and are not allocated from * the unit number allocator. */ if (vpid > VM_MAXCPU) free_unr(vpid_unr, vpid); } static void vpid_alloc(uint16_t *vpid, int num) { int i, x; if (num <= 0 || num > VM_MAXCPU) panic("invalid number of vpids requested: %d", num); /* * If the "enable vpid" execution control is not enabled then the * VPID is required to be 0 for all vcpus. */ if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) { for (i = 0; i < num; i++) vpid[i] = 0; return; } /* * Allocate a unique VPID for each vcpu from the unit number allocator. */ for (i = 0; i < num; i++) { x = alloc_unr(vpid_unr); if (x == -1) break; else vpid[i] = x; } if (i < num) { atomic_add_int(&vpid_alloc_failed, 1); /* * If the unit number allocator does not have enough unique * VPIDs then we need to allocate from the [1,VM_MAXCPU] range. * * These VPIDs are not be unique across VMs but this does not * affect correctness because the combined mappings are also * tagged with the EP4TA which is unique for each VM. * * It is still sub-optimal because the invvpid will invalidate * combined mappings for a particular VPID across all EP4TAs. */ while (i-- > 0) vpid_free(vpid[i]); for (i = 0; i < num; i++) vpid[i] = i + 1; } } static void vpid_init(void) { /* * VPID 0 is required when the "enable VPID" execution control is * disabled. * * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the * unit number allocator does not have sufficient unique VPIDs to * satisfy the allocation. * * The remaining VPIDs are managed by the unit number allocator. */ vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL); } static void msr_save_area_init(struct msr_entry *g_area, int *g_count) { int cnt; static struct msr_entry guest_msrs[] = { { MSR_KGSBASE, 0, 0 }, }; cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]); if (cnt > GUEST_MSR_MAX_ENTRIES) panic("guest msr save area overrun"); bcopy(guest_msrs, g_area, sizeof(guest_msrs)); *g_count = cnt; } static void vmx_disable(void *arg __unused) { struct invvpid_desc invvpid_desc = { 0 }; struct invept_desc invept_desc = { 0 }; if (vmxon_enabled[curcpu]) { /* * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b. * * VMXON or VMXOFF are not required to invalidate any TLB * caching structures. This prevents potential retention of * cached information in the TLB between distinct VMX episodes. */ invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc); invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc); vmxoff(); } load_cr4(rcr4() & ~CR4_VMXE); } static int vmx_cleanup(void) { if (pirvec != 0) vmm_ipi_free(pirvec); if (vpid_unr != NULL) { delete_unrhdr(vpid_unr); vpid_unr = NULL; } smp_rendezvous(NULL, vmx_disable, NULL, NULL); return (0); } static void vmx_enable(void *arg __unused) { int error; uint64_t feature_control; feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 || (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) { wrmsr(MSR_IA32_FEATURE_CONTROL, feature_control | IA32_FEATURE_CONTROL_VMX_EN | IA32_FEATURE_CONTROL_LOCK); } load_cr4(rcr4() | CR4_VMXE); *(uint32_t *)vmxon_region[curcpu] = vmx_revision(); error = vmxon(vmxon_region[curcpu]); if (error == 0) vmxon_enabled[curcpu] = 1; } static void vmx_restore(void) { if (vmxon_enabled[curcpu]) vmxon(vmxon_region[curcpu]); } static int vmx_init(int ipinum) { int error, use_tpr_shadow; uint64_t basic, fixed0, fixed1, feature_control; uint32_t tmp, procbased2_vid_bits; /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */ if (!(cpu_feature2 & CPUID2_VMX)) { printf("vmx_init: processor does not support VMX operation\n"); return (ENXIO); } /* * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits * are set (bits 0 and 2 respectively). */ feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 1 && (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) { printf("vmx_init: VMX operation disabled by BIOS\n"); return (ENXIO); } /* * Verify capabilities MSR_VMX_BASIC: * - bit 54 indicates support for INS/OUTS decoding */ basic = rdmsr(MSR_VMX_BASIC); if ((basic & (1UL << 54)) == 0) { printf("vmx_init: processor does not support desired basic " "capabilities\n"); return (EINVAL); } /* Check support for primary processor-based VM-execution controls */ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_CTLS_ONE_SETTING, PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls); if (error) { printf("vmx_init: processor does not support desired primary " "processor-based controls\n"); return (error); } /* Clear the processor-based ctl bits that are set on demand */ procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING; /* Check support for secondary processor-based VM-execution controls */ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, PROCBASED_CTLS2_ONE_SETTING, PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2); if (error) { printf("vmx_init: processor does not support desired secondary " "processor-based controls\n"); return (error); } /* Check support for VPID */ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_VPID, 0, &tmp); if (error == 0) procbased_ctls2 |= PROCBASED2_ENABLE_VPID; /* Check support for pin-based VM-execution controls */ error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_CTLS_ONE_SETTING, PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls); if (error) { printf("vmx_init: processor does not support desired " "pin-based controls\n"); return (error); } /* Check support for VM-exit controls */ vmx_patmsr = 1; error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS, VM_EXIT_CTLS_ONE_SETTING, VM_EXIT_CTLS_ZERO_SETTING, &exit_ctls); if (error) { /* Try again without the PAT MSR bits */ error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS, VM_EXIT_CTLS_ONE_SETTING_NO_PAT, VM_EXIT_CTLS_ZERO_SETTING, &exit_ctls); if (error) { printf("vmx_init: processor does not support desired " "exit controls\n"); return (error); } else { if (bootverbose) printf("vmm: PAT MSR access not supported\n"); guest_msr_valid(MSR_PAT); vmx_patmsr = 0; } } /* Check support for VM-entry controls */ if (vmx_patmsr) { error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS, VM_ENTRY_CTLS_ONE_SETTING, VM_ENTRY_CTLS_ZERO_SETTING, &entry_ctls); } else { error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS, VM_ENTRY_CTLS_ONE_SETTING_NO_PAT, VM_ENTRY_CTLS_ZERO_SETTING, &entry_ctls); } if (error) { printf("vmx_init: processor does not support desired " "entry controls\n"); return (error); } /* * Check support for optional features by testing them * as individual bits */ cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_HLT_EXITING, 0, &tmp) == 0); cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, MSR_VMX_PROCBASED_CTLS, PROCBASED_MTF, 0, &tmp) == 0); cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_PAUSE_EXITING, 0, &tmp) == 0); cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, PROCBASED2_UNRESTRICTED_GUEST, 0, &tmp) == 0); cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0, &tmp) == 0); /* * Check support for virtual interrupt delivery. */ procbased2_vid_bits = (PROCBASED2_VIRTUALIZE_APIC_ACCESSES | PROCBASED2_VIRTUALIZE_X2APIC_MODE | PROCBASED2_APIC_REGISTER_VIRTUALIZATION | PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY); use_tpr_shadow = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0, &tmp) == 0); error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, procbased2_vid_bits, 0, &tmp); if (error == 0 && use_tpr_shadow) { virtual_interrupt_delivery = 1; TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid", &virtual_interrupt_delivery); } if (virtual_interrupt_delivery) { procbased_ctls |= PROCBASED_USE_TPR_SHADOW; procbased_ctls2 |= procbased2_vid_bits; procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE; /* * No need to emulate accesses to %CR8 if virtual * interrupt delivery is enabled. */ procbased_ctls &= ~PROCBASED_CR8_LOAD_EXITING; procbased_ctls &= ~PROCBASED_CR8_STORE_EXITING; /* * Check for Posted Interrupts only if Virtual Interrupt * Delivery is enabled. */ error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0, &tmp); if (error == 0) { pirvec = vmm_ipi_alloc(); if (pirvec == 0) { if (bootverbose) { printf("vmx_init: unable to allocate " "posted interrupt vector\n"); } } else { posted_interrupts = 1; TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_pir", &posted_interrupts); } } } if (posted_interrupts) pinbased_ctls |= PINBASED_POSTED_INTERRUPT; /* Initialize EPT */ error = ept_init(ipinum); if (error) { printf("vmx_init: ept initialization failed (%d)\n", error); return (error); } /* * Stash the cr0 and cr4 bits that must be fixed to 0 or 1 */ fixed0 = rdmsr(MSR_VMX_CR0_FIXED0); fixed1 = rdmsr(MSR_VMX_CR0_FIXED1); cr0_ones_mask = fixed0 & fixed1; cr0_zeros_mask = ~fixed0 & ~fixed1; /* * CR0_PE and CR0_PG can be set to zero in VMX non-root operation * if unrestricted guest execution is allowed. */ if (cap_unrestricted_guest) cr0_ones_mask &= ~(CR0_PG | CR0_PE); /* * Do not allow the guest to set CR0_NW or CR0_CD. */ cr0_zeros_mask |= (CR0_NW | CR0_CD); fixed0 = rdmsr(MSR_VMX_CR4_FIXED0); fixed1 = rdmsr(MSR_VMX_CR4_FIXED1); cr4_ones_mask = fixed0 & fixed1; cr4_zeros_mask = ~fixed0 & ~fixed1; vpid_init(); /* enable VMX operation */ smp_rendezvous(NULL, vmx_enable, NULL, NULL); vmx_initialized = 1; return (0); } static void vmx_trigger_hostintr(int vector) { uintptr_t func; struct gate_descriptor *gd; gd = &idt[vector]; KASSERT(vector >= 32 && vector <= 255, ("vmx_trigger_hostintr: " "invalid vector %d", vector)); KASSERT(gd->gd_p == 1, ("gate descriptor for vector %d not present", vector)); KASSERT(gd->gd_type == SDT_SYSIGT, ("gate descriptor for vector %d " "has invalid type %d", vector, gd->gd_type)); KASSERT(gd->gd_dpl == SEL_KPL, ("gate descriptor for vector %d " "has invalid dpl %d", vector, gd->gd_dpl)); KASSERT(gd->gd_selector == GSEL(GCODE_SEL, SEL_KPL), ("gate descriptor " "for vector %d has invalid selector %d", vector, gd->gd_selector)); KASSERT(gd->gd_ist == 0, ("gate descriptor for vector %d has invalid " "IST %d", vector, gd->gd_ist)); func = ((long)gd->gd_hioffset << 16 | gd->gd_looffset); vmx_call_isr(func); } static int vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial) { int error, mask_ident, shadow_ident; uint64_t mask_value; if (which != 0 && which != 4) panic("vmx_setup_cr_shadow: unknown cr%d", which); if (which == 0) { mask_ident = VMCS_CR0_MASK; mask_value = cr0_ones_mask | cr0_zeros_mask; shadow_ident = VMCS_CR0_SHADOW; } else { mask_ident = VMCS_CR4_MASK; mask_value = cr4_ones_mask | cr4_zeros_mask; shadow_ident = VMCS_CR4_SHADOW; } error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value); if (error) return (error); error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial); if (error) return (error); return (0); } #define vmx_setup_cr0_shadow(vmcs,init) vmx_setup_cr_shadow(0, (vmcs), (init)) #define vmx_setup_cr4_shadow(vmcs,init) vmx_setup_cr_shadow(4, (vmcs), (init)) static void * vmx_vminit(struct vm *vm, pmap_t pmap) { uint16_t vpid[VM_MAXCPU]; int i, error, guest_msr_count; struct vmx *vmx; struct vmcs *vmcs; vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO); if ((uintptr_t)vmx & PAGE_MASK) { panic("malloc of struct vmx not aligned on %d byte boundary", PAGE_SIZE); } vmx->vm = vm; vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4)); /* * Clean up EPTP-tagged guest physical and combined mappings * * VMX transitions are not required to invalidate any guest physical * mappings. So, it may be possible for stale guest physical mappings * to be present in the processor TLBs. * * Combined mappings for this EP4TA are also invalidated for all VPIDs. */ ept_invalidate_mappings(vmx->eptp); msr_bitmap_initialize(vmx->msr_bitmap); /* * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE. * The guest FSBASE and GSBASE are saved and restored during * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are * always restored from the vmcs host state area on vm-exit. * * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in * how they are saved/restored so can be directly accessed by the * guest. * * Guest KGSBASE is saved and restored in the guest MSR save area. * Host KGSBASE is restored before returning to userland from the pcb. * There will be a window of time when we are executing in the host * kernel context with a value of KGSBASE from the guest. This is ok * because the value of KGSBASE is inconsequential in kernel context. * * MSR_EFER is saved and restored in the guest VMCS area on a * VM exit and entry respectively. It is also restored from the * host VMCS area on a VM exit. * * The TSC MSR is exposed read-only. Writes are disallowed as that * will impact the host TSC. * XXX Writes would be implemented with a wrmsr trap, and * then modifying the TSC offset in the VMCS. */ if (guest_msr_rw(vmx, MSR_GSBASE) || guest_msr_rw(vmx, MSR_FSBASE) || guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) || guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) || guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) || guest_msr_rw(vmx, MSR_KGSBASE) || guest_msr_rw(vmx, MSR_EFER) || guest_msr_ro(vmx, MSR_TSC)) panic("vmx_vminit: error setting guest msr access"); /* * MSR_PAT is saved and restored in the guest VMCS are on a VM exit * and entry respectively. It is also restored from the host VMCS * area on a VM exit. However, if running on a system with no * MSR_PAT save/restore support, leave access disabled so accesses * will be trapped. */ if (vmx_patmsr && guest_msr_rw(vmx, MSR_PAT)) panic("vmx_vminit: error setting guest pat msr access"); vpid_alloc(vpid, VM_MAXCPU); if (virtual_interrupt_delivery) { error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE, APIC_ACCESS_ADDRESS); /* XXX this should really return an error to the caller */ KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error)); } for (i = 0; i < VM_MAXCPU; i++) { vmcs = &vmx->vmcs[i]; vmcs->identifier = vmx_revision(); error = vmclear(vmcs); if (error != 0) { panic("vmx_vminit: vmclear error %d on vcpu %d\n", error, i); } error = vmcs_init(vmcs); KASSERT(error == 0, ("vmcs_init error %d", error)); VMPTRLD(vmcs); error = 0; error += vmwrite(VMCS_HOST_RSP, (u_long)&vmx->ctx[i]); error += vmwrite(VMCS_EPTP, vmx->eptp); error += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls); error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls); error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2); error += vmwrite(VMCS_EXIT_CTLS, exit_ctls); error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls); error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap)); error += vmwrite(VMCS_VPID, vpid[i]); if (virtual_interrupt_delivery) { error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS); error += vmwrite(VMCS_VIRTUAL_APIC, vtophys(&vmx->apic_page[i])); error += vmwrite(VMCS_EOI_EXIT0, 0); error += vmwrite(VMCS_EOI_EXIT1, 0); error += vmwrite(VMCS_EOI_EXIT2, 0); error += vmwrite(VMCS_EOI_EXIT3, 0); } if (posted_interrupts) { error += vmwrite(VMCS_PIR_VECTOR, pirvec); error += vmwrite(VMCS_PIR_DESC, vtophys(&vmx->pir_desc[i])); } VMCLEAR(vmcs); KASSERT(error == 0, ("vmx_vminit: error customizing the vmcs")); vmx->cap[i].set = 0; vmx->cap[i].proc_ctls = procbased_ctls; vmx->cap[i].proc_ctls2 = procbased_ctls2; vmx->state[i].lastcpu = NOCPU; vmx->state[i].vpid = vpid[i]; msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count); error = vmcs_set_msr_save(vmcs, vtophys(vmx->guest_msrs[i]), guest_msr_count); if (error != 0) panic("vmcs_set_msr_save error %d", error); /* * Set up the CR0/4 shadows, and init the read shadow * to the power-on register value from the Intel Sys Arch. * CR0 - 0x60000010 * CR4 - 0 */ error = vmx_setup_cr0_shadow(vmcs, 0x60000010); if (error != 0) panic("vmx_setup_cr0_shadow %d", error); error = vmx_setup_cr4_shadow(vmcs, 0); if (error != 0) panic("vmx_setup_cr4_shadow %d", error); vmx->ctx[i].pmap = pmap; } return (vmx); } static int vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx) { int handled, func; func = vmxctx->guest_rax; handled = x86_emulate_cpuid(vm, vcpu, (uint32_t*)(&vmxctx->guest_rax), (uint32_t*)(&vmxctx->guest_rbx), (uint32_t*)(&vmxctx->guest_rcx), (uint32_t*)(&vmxctx->guest_rdx)); return (handled); } static __inline void vmx_run_trace(struct vmx *vmx, int vcpu) { #ifdef KTR VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %#lx", vmcs_guest_rip()); #endif } static __inline void vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason, int handled) { #ifdef KTR VCPU_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx", handled ? "handled" : "unhandled", exit_reason_to_str(exit_reason), rip); #endif } static __inline void vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip) { #ifdef KTR VCPU_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip); #endif } static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved"); static VMM_STAT_INTEL(VCPU_INVVPID_DONE, "Number of vpid invalidations done"); /* * Invalidate guest mappings identified by its vpid from the TLB. */ static __inline void vmx_invvpid(struct vmx *vmx, int vcpu, pmap_t pmap, int running) { struct vmxstate *vmxstate; struct invvpid_desc invvpid_desc; vmxstate = &vmx->state[vcpu]; if (vmxstate->vpid == 0) return; if (!running) { /* * Set the 'lastcpu' to an invalid host cpu. * * This will invalidate TLB entries tagged with the vcpu's * vpid the next time it runs via vmx_set_pcpu_defaults(). */ vmxstate->lastcpu = NOCPU; return; } KASSERT(curthread->td_critnest > 0, ("%s: vcpu %d running outside " "critical section", __func__, vcpu)); /* * Invalidate all mappings tagged with 'vpid' * * We do this because this vcpu was executing on a different host * cpu when it last ran. We do not track whether it invalidated * mappings associated with its 'vpid' during that run. So we must * assume that the mappings associated with 'vpid' on 'curcpu' are * stale and invalidate them. * * Note that we incur this penalty only when the scheduler chooses to * move the thread associated with this vcpu between host cpus. * * Note also that this will invalidate mappings tagged with 'vpid' * for "all" EP4TAs. */ if (pmap->pm_eptgen == vmx->eptgen[curcpu]) { invvpid_desc._res1 = 0; invvpid_desc._res2 = 0; invvpid_desc.vpid = vmxstate->vpid; invvpid_desc.linear_addr = 0; invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc); vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_DONE, 1); } else { /* * The invvpid can be skipped if an invept is going to * be performed before entering the guest. The invept * will invalidate combined mappings tagged with * 'vmx->eptp' for all vpids. */ vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1); } } static void vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap) { struct vmxstate *vmxstate; vmxstate = &vmx->state[vcpu]; if (vmxstate->lastcpu == curcpu) return; vmxstate->lastcpu = curcpu; vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase()); vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase()); vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase()); vmx_invvpid(vmx, vcpu, pmap, 1); } /* * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set. */ CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0); static void __inline vmx_set_int_window_exiting(struct vmx *vmx, int vcpu) { if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) == 0) { vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING; vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting"); } } static void __inline vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu) { KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0, ("intr_window_exiting not set: %#x", vmx->cap[vcpu].proc_ctls)); vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING; vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting"); } static void __inline vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu) { if ((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) == 0) { vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING; vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting"); } } static void __inline vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu) { KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0, ("nmi_window_exiting not set %#x", vmx->cap[vcpu].proc_ctls)); vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING; vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting"); } #define NMI_BLOCKING (VMCS_INTERRUPTIBILITY_NMI_BLOCKING | \ VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) #define HWINTR_BLOCKING (VMCS_INTERRUPTIBILITY_STI_BLOCKING | \ VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) static void vmx_inject_nmi(struct vmx *vmx, int vcpu) { uint32_t gi, info; gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest " "interruptibility-state %#x", gi)); info = vmcs_read(VMCS_ENTRY_INTR_INFO); KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid " "VM-entry interruption information %#x", info)); /* * Inject the virtual NMI. The vector must be the NMI IDT entry * or the VMCS entry check will fail. */ info = IDT_NMI | VMCS_INTR_T_NMI | VMCS_INTR_VALID; vmcs_write(VMCS_ENTRY_INTR_INFO, info); VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI"); /* Clear the request */ vm_nmi_clear(vmx->vm, vcpu); } static void vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic) { int vector, need_nmi_exiting, extint_pending; uint64_t rflags, entryinfo; uint32_t gi, info; if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) { KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry " "intinfo is not valid: %#lx", __func__, entryinfo)); info = vmcs_read(VMCS_ENTRY_INTR_INFO); KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject " "pending exception: %#lx/%#x", __func__, entryinfo, info)); info = entryinfo; vector = info & 0xff; if (vector == IDT_BP || vector == IDT_OF) { /* * VT-x requires #BP and #OF to be injected as software * exceptions. */ info &= ~VMCS_INTR_T_MASK; info |= VMCS_INTR_T_SWEXCEPTION; } if (info & VMCS_INTR_DEL_ERRCODE) vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32); vmcs_write(VMCS_ENTRY_INTR_INFO, info); } if (vm_nmi_pending(vmx->vm, vcpu)) { /* * If there are no conditions blocking NMI injection then * inject it directly here otherwise enable "NMI window * exiting" to inject it as soon as we can. * * We also check for STI_BLOCKING because some implementations * don't allow NMI injection in this case. If we are running * on a processor that doesn't have this restriction it will * immediately exit and the NMI will be injected in the * "NMI window exiting" handler. */ need_nmi_exiting = 1; gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) { info = vmcs_read(VMCS_ENTRY_INTR_INFO); if ((info & VMCS_INTR_VALID) == 0) { vmx_inject_nmi(vmx, vcpu); need_nmi_exiting = 0; } else { VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI " "due to VM-entry intr info %#x", info); } } else { VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI due to " "Guest Interruptibility-state %#x", gi); } if (need_nmi_exiting) vmx_set_nmi_window_exiting(vmx, vcpu); } extint_pending = vm_extint_pending(vmx->vm, vcpu); if (!extint_pending && virtual_interrupt_delivery) { vmx_inject_pir(vlapic); return; } /* * If interrupt-window exiting is already in effect then don't bother * checking for pending interrupts. This is just an optimization and * not needed for correctness. */ if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0) { VCPU_CTR0(vmx->vm, vcpu, "Skip interrupt injection due to " "pending int_window_exiting"); return; } if (!extint_pending) { /* Ask the local apic for a vector to inject */ if (!vlapic_pending_intr(vlapic, &vector)) return; /* * From the Intel SDM, Volume 3, Section "Maskable * Hardware Interrupts": * - maskable interrupt vectors [16,255] can be delivered * through the local APIC. */ KASSERT(vector >= 16 && vector <= 255, ("invalid vector %d from local APIC", vector)); } else { /* Ask the legacy pic for a vector to inject */ vatpic_pending_intr(vmx->vm, &vector); /* * From the Intel SDM, Volume 3, Section "Maskable * Hardware Interrupts": * - maskable interrupt vectors [0,255] can be delivered * through the INTR pin. */ KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d from INTR", vector)); } /* Check RFLAGS.IF and the interruptibility state of the guest */ rflags = vmcs_read(VMCS_GUEST_RFLAGS); if ((rflags & PSL_I) == 0) { VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " "rflags %#lx", vector, rflags); goto cantinject; } gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); if (gi & HWINTR_BLOCKING) { VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " "Guest Interruptibility-state %#x", vector, gi); goto cantinject; } info = vmcs_read(VMCS_ENTRY_INTR_INFO); if (info & VMCS_INTR_VALID) { /* * This is expected and could happen for multiple reasons: * - A vectoring VM-entry was aborted due to astpending * - A VM-exit happened during event injection. * - An exception was injected above. * - An NMI was injected above or after "NMI window exiting" */ VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " "VM-entry intr info %#x", vector, info); goto cantinject; } /* Inject the interrupt */ info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID; info |= vector; vmcs_write(VMCS_ENTRY_INTR_INFO, info); if (!extint_pending) { /* Update the Local APIC ISR */ vlapic_intr_accepted(vlapic, vector); } else { vm_extint_clear(vmx->vm, vcpu); vatpic_intr_accepted(vmx->vm, vector); /* * After we accepted the current ExtINT the PIC may * have posted another one. If that is the case, set * the Interrupt Window Exiting execution control so * we can inject that one too. * * Also, interrupt window exiting allows us to inject any * pending APIC vector that was preempted by the ExtINT * as soon as possible. This applies both for the software * emulated vlapic and the hardware assisted virtual APIC. */ vmx_set_int_window_exiting(vmx, vcpu); } VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector); return; cantinject: /* * Set the Interrupt Window Exiting execution control so we can inject * the interrupt as soon as blocking condition goes away. */ vmx_set_int_window_exiting(vmx, vcpu); } /* * If the Virtual NMIs execution control is '1' then the logical processor * tracks virtual-NMI blocking in the Guest Interruptibility-state field of * the VMCS. An IRET instruction in VMX non-root operation will remove any * virtual-NMI blocking. * * This unblocking occurs even if the IRET causes a fault. In this case the * hypervisor needs to restore virtual-NMI blocking before resuming the guest. */ static void vmx_restore_nmi_blocking(struct vmx *vmx, int vcpuid) { uint32_t gi; VCPU_CTR0(vmx->vm, vcpuid, "Restore Virtual-NMI blocking"); gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); gi |= VMCS_INTERRUPTIBILITY_NMI_BLOCKING; vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); } static void vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid) { uint32_t gi; VCPU_CTR0(vmx->vm, vcpuid, "Clear Virtual-NMI blocking"); gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); gi &= ~VMCS_INTERRUPTIBILITY_NMI_BLOCKING; vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); } static void vmx_assert_nmi_blocking(struct vmx *vmx, int vcpuid) { uint32_t gi; gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); KASSERT(gi & VMCS_INTERRUPTIBILITY_NMI_BLOCKING, ("NMI blocking is not in effect %#x", gi)); } static int vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) { struct vmxctx *vmxctx; uint64_t xcrval; const struct xsave_limits *limits; vmxctx = &vmx->ctx[vcpu]; limits = vmm_get_xsave_limits(); /* * Note that the processor raises a GP# fault on its own if * xsetbv is executed for CPL != 0, so we do not have to * emulate that fault here. */ /* Only xcr0 is supported. */ if (vmxctx->guest_rcx != 0) { vm_inject_gp(vmx->vm, vcpu); return (HANDLED); } /* We only handle xcr0 if both the host and guest have XSAVE enabled. */ if (!limits->xsave_enabled || !(vmcs_read(VMCS_GUEST_CR4) & CR4_XSAVE)) { vm_inject_ud(vmx->vm, vcpu); return (HANDLED); } xcrval = vmxctx->guest_rdx << 32 | (vmxctx->guest_rax & 0xffffffff); if ((xcrval & ~limits->xcr0_allowed) != 0) { vm_inject_gp(vmx->vm, vcpu); return (HANDLED); } if (!(xcrval & XFEATURE_ENABLED_X87)) { vm_inject_gp(vmx->vm, vcpu); return (HANDLED); } /* AVX (YMM_Hi128) requires SSE. */ if (xcrval & XFEATURE_ENABLED_AVX && (xcrval & XFEATURE_AVX) != XFEATURE_AVX) { vm_inject_gp(vmx->vm, vcpu); return (HANDLED); } /* * AVX512 requires base AVX (YMM_Hi128) as well as OpMask, * ZMM_Hi256, and Hi16_ZMM. */ if (xcrval & XFEATURE_AVX512 && (xcrval & (XFEATURE_AVX512 | XFEATURE_AVX)) != (XFEATURE_AVX512 | XFEATURE_AVX)) { vm_inject_gp(vmx->vm, vcpu); return (HANDLED); } /* * Intel MPX requires both bound register state flags to be * set. */ if (((xcrval & XFEATURE_ENABLED_BNDREGS) != 0) != ((xcrval & XFEATURE_ENABLED_BNDCSR) != 0)) { vm_inject_gp(vmx->vm, vcpu); return (HANDLED); } /* * This runs "inside" vmrun() with the guest's FPU state, so * modifying xcr0 directly modifies the guest's xcr0, not the * host's. */ load_xcr(0, xcrval); return (HANDLED); } static uint64_t vmx_get_guest_reg(struct vmx *vmx, int vcpu, int ident) { const struct vmxctx *vmxctx; vmxctx = &vmx->ctx[vcpu]; switch (ident) { case 0: return (vmxctx->guest_rax); case 1: return (vmxctx->guest_rcx); case 2: return (vmxctx->guest_rdx); case 3: return (vmxctx->guest_rbx); case 4: return (vmcs_read(VMCS_GUEST_RSP)); case 5: return (vmxctx->guest_rbp); case 6: return (vmxctx->guest_rsi); case 7: return (vmxctx->guest_rdi); case 8: return (vmxctx->guest_r8); case 9: return (vmxctx->guest_r9); case 10: return (vmxctx->guest_r10); case 11: return (vmxctx->guest_r11); case 12: return (vmxctx->guest_r12); case 13: return (vmxctx->guest_r13); case 14: return (vmxctx->guest_r14); case 15: return (vmxctx->guest_r15); default: panic("invalid vmx register %d", ident); } } static void vmx_set_guest_reg(struct vmx *vmx, int vcpu, int ident, uint64_t regval) { struct vmxctx *vmxctx; vmxctx = &vmx->ctx[vcpu]; switch (ident) { case 0: vmxctx->guest_rax = regval; break; case 1: vmxctx->guest_rcx = regval; break; case 2: vmxctx->guest_rdx = regval; break; case 3: vmxctx->guest_rbx = regval; break; case 4: vmcs_write(VMCS_GUEST_RSP, regval); break; case 5: vmxctx->guest_rbp = regval; break; case 6: vmxctx->guest_rsi = regval; break; case 7: vmxctx->guest_rdi = regval; break; case 8: vmxctx->guest_r8 = regval; break; case 9: vmxctx->guest_r9 = regval; break; case 10: vmxctx->guest_r10 = regval; break; case 11: vmxctx->guest_r11 = regval; break; case 12: vmxctx->guest_r12 = regval; break; case 13: vmxctx->guest_r13 = regval; break; case 14: vmxctx->guest_r14 = regval; break; case 15: vmxctx->guest_r15 = regval; break; default: panic("invalid vmx register %d", ident); } } static int vmx_emulate_cr0_access(struct vmx *vmx, int vcpu, uint64_t exitqual) { uint64_t crval, regval; /* We only handle mov to %cr0 at this time */ if ((exitqual & 0xf0) != 0x00) return (UNHANDLED); regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf); vmcs_write(VMCS_CR0_SHADOW, regval); crval = regval | cr0_ones_mask; crval &= ~cr0_zeros_mask; vmcs_write(VMCS_GUEST_CR0, crval); if (regval & CR0_PG) { uint64_t efer, entry_ctls; /* * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and * the "IA-32e mode guest" bit in VM-entry control must be * equal. */ efer = vmcs_read(VMCS_GUEST_IA32_EFER); if (efer & EFER_LME) { efer |= EFER_LMA; vmcs_write(VMCS_GUEST_IA32_EFER, efer); entry_ctls = vmcs_read(VMCS_ENTRY_CTLS); entry_ctls |= VM_ENTRY_GUEST_LMA; vmcs_write(VMCS_ENTRY_CTLS, entry_ctls); } } return (HANDLED); } static int vmx_emulate_cr4_access(struct vmx *vmx, int vcpu, uint64_t exitqual) { uint64_t crval, regval; /* We only handle mov to %cr4 at this time */ if ((exitqual & 0xf0) != 0x00) return (UNHANDLED); regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf); vmcs_write(VMCS_CR4_SHADOW, regval); crval = regval | cr4_ones_mask; crval &= ~cr4_zeros_mask; vmcs_write(VMCS_GUEST_CR4, crval); return (HANDLED); } static int vmx_emulate_cr8_access(struct vmx *vmx, int vcpu, uint64_t exitqual) { struct vlapic *vlapic; uint64_t cr8; int regnum; /* We only handle mov %cr8 to/from a register at this time. */ if ((exitqual & 0xe0) != 0x00) { return (UNHANDLED); } vlapic = vm_lapic(vmx->vm, vcpu); regnum = (exitqual >> 8) & 0xf; if (exitqual & 0x10) { cr8 = vlapic_get_cr8(vlapic); vmx_set_guest_reg(vmx, vcpu, regnum, cr8); } else { cr8 = vmx_get_guest_reg(vmx, vcpu, regnum); vlapic_set_cr8(vlapic, cr8); } return (HANDLED); } /* * From section "Guest Register State" in the Intel SDM: CPL = SS.DPL */ static int vmx_cpl(void) { uint32_t ssar; ssar = vmcs_read(VMCS_GUEST_SS_ACCESS_RIGHTS); return ((ssar >> 5) & 0x3); } static enum vm_cpu_mode vmx_cpu_mode(void) { uint32_t csar; if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA) { csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS); if (csar & 0x2000) return (CPU_MODE_64BIT); /* CS.L = 1 */ else return (CPU_MODE_COMPATIBILITY); } else if (vmcs_read(VMCS_GUEST_CR0) & CR0_PE) { return (CPU_MODE_PROTECTED); } else { return (CPU_MODE_REAL); } } static enum vm_paging_mode vmx_paging_mode(void) { if (!(vmcs_read(VMCS_GUEST_CR0) & CR0_PG)) return (PAGING_MODE_FLAT); if (!(vmcs_read(VMCS_GUEST_CR4) & CR4_PAE)) return (PAGING_MODE_32); if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LME) return (PAGING_MODE_64); else return (PAGING_MODE_PAE); } static uint64_t inout_str_index(struct vmx *vmx, int vcpuid, int in) { uint64_t val; int error; enum vm_reg_name reg; reg = in ? VM_REG_GUEST_RDI : VM_REG_GUEST_RSI; error = vmx_getreg(vmx, vcpuid, reg, &val); KASSERT(error == 0, ("%s: vmx_getreg error %d", __func__, error)); return (val); } static uint64_t inout_str_count(struct vmx *vmx, int vcpuid, int rep) { uint64_t val; int error; if (rep) { error = vmx_getreg(vmx, vcpuid, VM_REG_GUEST_RCX, &val); KASSERT(!error, ("%s: vmx_getreg error %d", __func__, error)); } else { val = 1; } return (val); } static int inout_str_addrsize(uint32_t inst_info) { uint32_t size; size = (inst_info >> 7) & 0x7; switch (size) { case 0: return (2); /* 16 bit */ case 1: return (4); /* 32 bit */ case 2: return (8); /* 64 bit */ default: panic("%s: invalid size encoding %d", __func__, size); } } static void inout_str_seginfo(struct vmx *vmx, int vcpuid, uint32_t inst_info, int in, struct vm_inout_str *vis) { int error, s; if (in) { vis->seg_name = VM_REG_GUEST_ES; } else { s = (inst_info >> 15) & 0x7; vis->seg_name = vm_segment_name(s); } error = vmx_getdesc(vmx, vcpuid, vis->seg_name, &vis->seg_desc); KASSERT(error == 0, ("%s: vmx_getdesc error %d", __func__, error)); /* XXX modify svm.c to update bit 16 of seg_desc.access (unusable) */ } static void vmx_paging_info(struct vm_guest_paging *paging) { paging->cr3 = vmcs_guest_cr3(); paging->cpl = vmx_cpl(); paging->cpu_mode = vmx_cpu_mode(); paging->paging_mode = vmx_paging_mode(); } static void vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla) { struct vm_guest_paging *paging; uint32_t csar; paging = &vmexit->u.inst_emul.paging; vmexit->exitcode = VM_EXITCODE_INST_EMUL; vmexit->u.inst_emul.gpa = gpa; vmexit->u.inst_emul.gla = gla; vmx_paging_info(paging); switch (paging->cpu_mode) { case CPU_MODE_PROTECTED: case CPU_MODE_COMPATIBILITY: csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS); vmexit->u.inst_emul.cs_d = SEG_DESC_DEF32(csar); break; default: vmexit->u.inst_emul.cs_d = 0; break; } } static int ept_fault_type(uint64_t ept_qual) { int fault_type; if (ept_qual & EPT_VIOLATION_DATA_WRITE) fault_type = VM_PROT_WRITE; else if (ept_qual & EPT_VIOLATION_INST_FETCH) fault_type = VM_PROT_EXECUTE; else fault_type= VM_PROT_READ; return (fault_type); } static boolean_t ept_emulation_fault(uint64_t ept_qual) { int read, write; /* EPT fault on an instruction fetch doesn't make sense here */ if (ept_qual & EPT_VIOLATION_INST_FETCH) return (FALSE); /* EPT fault must be a read fault or a write fault */ read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0; write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0; if ((read | write) == 0) return (FALSE); /* * The EPT violation must have been caused by accessing a * guest-physical address that is a translation of a guest-linear * address. */ if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 || (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) { return (FALSE); } return (TRUE); } static __inline int apic_access_virtualization(struct vmx *vmx, int vcpuid) { uint32_t proc_ctls2; proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) ? 1 : 0); } static __inline int x2apic_virtualization(struct vmx *vmx, int vcpuid) { uint32_t proc_ctls2; proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_X2APIC_MODE) ? 1 : 0); } static int vmx_handle_apic_write(struct vmx *vmx, int vcpuid, struct vlapic *vlapic, uint64_t qual) { int error, handled, offset; uint32_t *apic_regs, vector; bool retu; handled = HANDLED; offset = APIC_WRITE_OFFSET(qual); if (!apic_access_virtualization(vmx, vcpuid)) { /* * In general there should not be any APIC write VM-exits * unless APIC-access virtualization is enabled. * * However self-IPI virtualization can legitimately trigger * an APIC-write VM-exit so treat it specially. */ if (x2apic_virtualization(vmx, vcpuid) && offset == APIC_OFFSET_SELF_IPI) { apic_regs = (uint32_t *)(vlapic->apic_page); vector = apic_regs[APIC_OFFSET_SELF_IPI / 4]; vlapic_self_ipi_handler(vlapic, vector); return (HANDLED); } else return (UNHANDLED); } switch (offset) { case APIC_OFFSET_ID: vlapic_id_write_handler(vlapic); break; case APIC_OFFSET_LDR: vlapic_ldr_write_handler(vlapic); break; case APIC_OFFSET_DFR: vlapic_dfr_write_handler(vlapic); break; case APIC_OFFSET_SVR: vlapic_svr_write_handler(vlapic); break; case APIC_OFFSET_ESR: vlapic_esr_write_handler(vlapic); break; case APIC_OFFSET_ICR_LOW: retu = false; error = vlapic_icrlo_write_handler(vlapic, &retu); if (error != 0 || retu) handled = UNHANDLED; break; case APIC_OFFSET_CMCI_LVT: case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: vlapic_lvt_write_handler(vlapic, offset); break; case APIC_OFFSET_TIMER_ICR: vlapic_icrtmr_write_handler(vlapic); break; case APIC_OFFSET_TIMER_DCR: vlapic_dcr_write_handler(vlapic); break; default: handled = UNHANDLED; break; } return (handled); } static bool apic_access_fault(struct vmx *vmx, int vcpuid, uint64_t gpa) { if (apic_access_virtualization(vmx, vcpuid) && (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE)) return (true); else return (false); } static int vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) { uint64_t qual; int access_type, offset, allowed; if (!apic_access_virtualization(vmx, vcpuid)) return (UNHANDLED); qual = vmexit->u.vmx.exit_qualification; access_type = APIC_ACCESS_TYPE(qual); offset = APIC_ACCESS_OFFSET(qual); allowed = 0; if (access_type == 0) { /* * Read data access to the following registers is expected. */ switch (offset) { case APIC_OFFSET_APR: case APIC_OFFSET_PPR: case APIC_OFFSET_RRR: case APIC_OFFSET_CMCI_LVT: case APIC_OFFSET_TIMER_CCR: allowed = 1; break; default: break; } } else if (access_type == 1) { /* * Write data access to the following registers is expected. */ switch (offset) { case APIC_OFFSET_VER: case APIC_OFFSET_APR: case APIC_OFFSET_PPR: case APIC_OFFSET_RRR: case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: case APIC_OFFSET_CMCI_LVT: case APIC_OFFSET_TIMER_CCR: allowed = 1; break; default: break; } } if (allowed) { vmexit_inst_emul(vmexit, DEFAULT_APIC_BASE + offset, VIE_INVALID_GLA); } /* * Regardless of whether the APIC-access is allowed this handler * always returns UNHANDLED: * - if the access is allowed then it is handled by emulating the * instruction that caused the VM-exit (outside the critical section) * - if the access is not allowed then it will be converted to an * exitcode of VM_EXITCODE_VMX and will be dealt with in userland. */ return (UNHANDLED); } static enum task_switch_reason vmx_task_switch_reason(uint64_t qual) { int reason; reason = (qual >> 30) & 0x3; switch (reason) { case 0: return (TSR_CALL); case 1: return (TSR_IRET); case 2: return (TSR_JMP); case 3: return (TSR_IDT_GATE); default: panic("%s: invalid reason %d", __func__, reason); } } static int vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) { int error, handled, in; struct vmxctx *vmxctx; struct vlapic *vlapic; struct vm_inout_str *vis; struct vm_task_switch *ts; uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info; uint32_t intr_type, reason; uint64_t exitintinfo, qual, gpa; bool retu; CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0); CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0); handled = UNHANDLED; vmxctx = &vmx->ctx[vcpu]; qual = vmexit->u.vmx.exit_qualification; reason = vmexit->u.vmx.exit_reason; vmexit->exitcode = VM_EXITCODE_BOGUS; vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1); /* * VM exits that can be triggered during event delivery need to * be handled specially by re-injecting the event if the IDT * vectoring information field's valid bit is set. * * See "Information for VM Exits During Event Delivery" in Intel SDM * for details. */ idtvec_info = vmcs_idt_vectoring_info(); if (idtvec_info & VMCS_IDT_VEC_VALID) { idtvec_info &= ~(1 << 12); /* clear undefined bit */ exitintinfo = idtvec_info; if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { idtvec_err = vmcs_idt_vectoring_err(); exitintinfo |= (uint64_t)idtvec_err << 32; } error = vm_exit_intinfo(vmx->vm, vcpu, exitintinfo); KASSERT(error == 0, ("%s: vm_set_intinfo error %d", __func__, error)); /* * If 'virtual NMIs' are being used and the VM-exit * happened while injecting an NMI during the previous * VM-entry, then clear "blocking by NMI" in the * Guest Interruptibility-State so the NMI can be * reinjected on the subsequent VM-entry. * * However, if the NMI was being delivered through a task * gate, then the new task must start execution with NMIs * blocked so don't clear NMI blocking in this case. */ intr_type = idtvec_info & VMCS_INTR_T_MASK; if (intr_type == VMCS_INTR_T_NMI) { if (reason != EXIT_REASON_TASK_SWITCH) vmx_clear_nmi_blocking(vmx, vcpu); else vmx_assert_nmi_blocking(vmx, vcpu); } /* * Update VM-entry instruction length if the event being * delivered was a software interrupt or software exception. */ if (intr_type == VMCS_INTR_T_SWINTR || intr_type == VMCS_INTR_T_PRIV_SWEXCEPTION || intr_type == VMCS_INTR_T_SWEXCEPTION) { vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); } } switch (reason) { case EXIT_REASON_TASK_SWITCH: ts = &vmexit->u.task_switch; ts->tsssel = qual & 0xffff; ts->reason = vmx_task_switch_reason(qual); ts->ext = 0; ts->errcode_valid = 0; vmx_paging_info(&ts->paging); /* * If the task switch was due to a CALL, JMP, IRET, software * interrupt (INT n) or software exception (INT3, INTO), * then the saved %rip references the instruction that caused * the task switch. The instruction length field in the VMCS * is valid in this case. * * In all other cases (e.g., NMI, hardware exception) the * saved %rip is one that would have been saved in the old TSS * had the task switch completed normally so the instruction * length field is not needed in this case and is explicitly * set to 0. */ if (ts->reason == TSR_IDT_GATE) { KASSERT(idtvec_info & VMCS_IDT_VEC_VALID, ("invalid idtvec_info %#x for IDT task switch", idtvec_info)); intr_type = idtvec_info & VMCS_INTR_T_MASK; if (intr_type != VMCS_INTR_T_SWINTR && intr_type != VMCS_INTR_T_SWEXCEPTION && intr_type != VMCS_INTR_T_PRIV_SWEXCEPTION) { /* Task switch triggered by external event */ ts->ext = 1; vmexit->inst_length = 0; if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { ts->errcode_valid = 1; ts->errcode = vmcs_idt_vectoring_err(); } } } vmexit->exitcode = VM_EXITCODE_TASK_SWITCH; VCPU_CTR4(vmx->vm, vcpu, "task switch reason %d, tss 0x%04x, " "%s errcode 0x%016lx", ts->reason, ts->tsssel, ts->ext ? "external" : "internal", ((uint64_t)ts->errcode << 32) | ts->errcode_valid); break; case EXIT_REASON_CR_ACCESS: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1); switch (qual & 0xf) { case 0: handled = vmx_emulate_cr0_access(vmx, vcpu, qual); break; case 4: handled = vmx_emulate_cr4_access(vmx, vcpu, qual); break; case 8: handled = vmx_emulate_cr8_access(vmx, vcpu, qual); break; } break; case EXIT_REASON_RDMSR: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1); retu = false; ecx = vmxctx->guest_rcx; VCPU_CTR1(vmx->vm, vcpu, "rdmsr 0x%08x", ecx); error = emulate_rdmsr(vmx->vm, vcpu, ecx, &retu); if (error) { vmexit->exitcode = VM_EXITCODE_RDMSR; vmexit->u.msr.code = ecx; } else if (!retu) { handled = HANDLED; } else { /* Return to userspace with a valid exitcode */ KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, ("emulate_wrmsr retu with bogus exitcode")); } break; case EXIT_REASON_WRMSR: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1); retu = false; eax = vmxctx->guest_rax; ecx = vmxctx->guest_rcx; edx = vmxctx->guest_rdx; VCPU_CTR2(vmx->vm, vcpu, "wrmsr 0x%08x value 0x%016lx", ecx, (uint64_t)edx << 32 | eax); error = emulate_wrmsr(vmx->vm, vcpu, ecx, (uint64_t)edx << 32 | eax, &retu); if (error) { vmexit->exitcode = VM_EXITCODE_WRMSR; vmexit->u.msr.code = ecx; vmexit->u.msr.wval = (uint64_t)edx << 32 | eax; } else if (!retu) { handled = HANDLED; } else { /* Return to userspace with a valid exitcode */ KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, ("emulate_wrmsr retu with bogus exitcode")); } break; case EXIT_REASON_HLT: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1); vmexit->exitcode = VM_EXITCODE_HLT; vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS); break; case EXIT_REASON_MTF: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1); vmexit->exitcode = VM_EXITCODE_MTRAP; break; case EXIT_REASON_PAUSE: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1); vmexit->exitcode = VM_EXITCODE_PAUSE; break; case EXIT_REASON_INTR_WINDOW: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1); vmx_clear_int_window_exiting(vmx, vcpu); return (1); case EXIT_REASON_EXT_INTR: /* * External interrupts serve only to cause VM exits and allow * the host interrupt handler to run. * * If this external interrupt triggers a virtual interrupt * to a VM, then that state will be recorded by the * host interrupt handler in the VM's softc. We will inject * this virtual interrupt during the subsequent VM enter. */ intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); /* * XXX: Ignore this exit if VMCS_INTR_VALID is not set. * This appears to be a bug in VMware Fusion? */ if (!(intr_info & VMCS_INTR_VALID)) return (1); KASSERT((intr_info & VMCS_INTR_VALID) != 0 && (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR, ("VM exit interruption info invalid: %#x", intr_info)); vmx_trigger_hostintr(intr_info & 0xff); /* * This is special. We want to treat this as an 'handled' * VM-exit but not increment the instruction pointer. */ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1); return (1); case EXIT_REASON_NMI_WINDOW: /* Exit to allow the pending virtual NMI to be injected */ if (vm_nmi_pending(vmx->vm, vcpu)) vmx_inject_nmi(vmx, vcpu); vmx_clear_nmi_window_exiting(vmx, vcpu); vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1); return (1); case EXIT_REASON_INOUT: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1); vmexit->exitcode = VM_EXITCODE_INOUT; vmexit->u.inout.bytes = (qual & 0x7) + 1; vmexit->u.inout.in = in = (qual & 0x8) ? 1 : 0; vmexit->u.inout.string = (qual & 0x10) ? 1 : 0; vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0; vmexit->u.inout.port = (uint16_t)(qual >> 16); vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax); if (vmexit->u.inout.string) { inst_info = vmcs_read(VMCS_EXIT_INSTRUCTION_INFO); vmexit->exitcode = VM_EXITCODE_INOUT_STR; vis = &vmexit->u.inout_str; vmx_paging_info(&vis->paging); vis->rflags = vmcs_read(VMCS_GUEST_RFLAGS); vis->cr0 = vmcs_read(VMCS_GUEST_CR0); vis->index = inout_str_index(vmx, vcpu, in); vis->count = inout_str_count(vmx, vcpu, vis->inout.rep); vis->addrsize = inout_str_addrsize(inst_info); inout_str_seginfo(vmx, vcpu, inst_info, in, vis); } break; case EXIT_REASON_CPUID: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1); handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx); break; case EXIT_REASON_EXCEPTION: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXCEPTION, 1); intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); KASSERT((intr_info & VMCS_INTR_VALID) != 0, ("VM exit interruption info invalid: %#x", intr_info)); /* * If Virtual NMIs control is 1 and the VM-exit is due to a * fault encountered during the execution of IRET then we must * restore the state of "virtual-NMI blocking" before resuming * the guest. * * See "Resuming Guest Software after Handling an Exception". * See "Information for VM Exits Due to Vectored Events". */ if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && (intr_info & 0xff) != IDT_DF && (intr_info & EXIT_QUAL_NMIUDTI) != 0) vmx_restore_nmi_blocking(vmx, vcpu); /* * The NMI has already been handled in vmx_exit_handle_nmi(). */ if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) return (1); break; case EXIT_REASON_EPT_FAULT: /* * If 'gpa' lies within the address space allocated to * memory then this must be a nested page fault otherwise * this must be an instruction that accesses MMIO space. */ gpa = vmcs_gpa(); if (vm_mem_allocated(vmx->vm, gpa) || apic_access_fault(vmx, vcpu, gpa)) { vmexit->exitcode = VM_EXITCODE_PAGING; vmexit->u.paging.gpa = gpa; vmexit->u.paging.fault_type = ept_fault_type(qual); vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1); } else if (ept_emulation_fault(qual)) { vmexit_inst_emul(vmexit, gpa, vmcs_gla()); vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INST_EMUL, 1); } /* * If Virtual NMIs control is 1 and the VM-exit is due to an * EPT fault during the execution of IRET then we must restore * the state of "virtual-NMI blocking" before resuming. * * See description of "NMI unblocking due to IRET" in * "Exit Qualification for EPT Violations". */ if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && (qual & EXIT_QUAL_NMIUDTI) != 0) vmx_restore_nmi_blocking(vmx, vcpu); break; case EXIT_REASON_VIRTUALIZED_EOI: vmexit->exitcode = VM_EXITCODE_IOAPIC_EOI; vmexit->u.ioapic_eoi.vector = qual & 0xFF; vmexit->inst_length = 0; /* trap-like */ break; case EXIT_REASON_APIC_ACCESS: handled = vmx_handle_apic_access(vmx, vcpu, vmexit); break; case EXIT_REASON_APIC_WRITE: /* * APIC-write VM exit is trap-like so the %rip is already * pointing to the next instruction. */ vmexit->inst_length = 0; vlapic = vm_lapic(vmx->vm, vcpu); handled = vmx_handle_apic_write(vmx, vcpu, vlapic, qual); break; case EXIT_REASON_XSETBV: handled = vmx_emulate_xsetbv(vmx, vcpu, vmexit); break; default: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1); break; } if (handled) { /* * It is possible that control is returned to userland * even though we were able to handle the VM exit in the * kernel. * * In such a case we want to make sure that the userland * restarts guest execution at the instruction *after* * the one we just processed. Therefore we update the * guest rip in the VMCS and in 'vmexit'. */ vmexit->rip += vmexit->inst_length; vmexit->inst_length = 0; vmcs_write(VMCS_GUEST_RIP, vmexit->rip); } else { if (vmexit->exitcode == VM_EXITCODE_BOGUS) { /* * If this VM exit was not claimed by anybody then * treat it as a generic VMX exit. */ vmexit->exitcode = VM_EXITCODE_VMX; vmexit->u.vmx.status = VM_SUCCESS; vmexit->u.vmx.inst_type = 0; vmexit->u.vmx.inst_error = 0; } else { /* * The exitcode and collateral have been populated. * The VM exit will be processed further in userland. */ } } return (handled); } static __inline void vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit) { KASSERT(vmxctx->inst_fail_status != VM_SUCCESS, ("vmx_exit_inst_error: invalid inst_fail_status %d", vmxctx->inst_fail_status)); vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_VMX; vmexit->u.vmx.status = vmxctx->inst_fail_status; vmexit->u.vmx.inst_error = vmcs_instruction_error(); vmexit->u.vmx.exit_reason = ~0; vmexit->u.vmx.exit_qualification = ~0; switch (rc) { case VMX_VMRESUME_ERROR: case VMX_VMLAUNCH_ERROR: case VMX_INVEPT_ERROR: vmexit->u.vmx.inst_type = rc; break; default: panic("vm_exit_inst_error: vmx_enter_guest returned %d", rc); } } /* * If the NMI-exiting VM execution control is set to '1' then an NMI in * non-root operation causes a VM-exit. NMI blocking is in effect so it is * sufficient to simply vector to the NMI handler via a software interrupt. * However, this must be done before maskable interrupts are enabled * otherwise the "iret" issued by an interrupt handler will incorrectly * clear NMI blocking. */ static __inline void vmx_exit_handle_nmi(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) { uint32_t intr_info; KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled")); if (vmexit->u.vmx.exit_reason != EXIT_REASON_EXCEPTION) return; intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); KASSERT((intr_info & VMCS_INTR_VALID) != 0, ("VM exit interruption info invalid: %#x", intr_info)); if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) { KASSERT((intr_info & 0xff) == IDT_NMI, ("VM exit due " "to NMI has invalid vector: %#x", intr_info)); VCPU_CTR0(vmx->vm, vcpuid, "Vectoring to NMI handler"); __asm __volatile("int $2"); } } static int vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap, void *rendezvous_cookie, void *suspend_cookie) { int rc, handled, launched; struct vmx *vmx; struct vm *vm; struct vmxctx *vmxctx; struct vmcs *vmcs; struct vm_exit *vmexit; struct vlapic *vlapic; uint64_t rip; uint32_t exit_reason; vmx = arg; vm = vmx->vm; vmcs = &vmx->vmcs[vcpu]; vmxctx = &vmx->ctx[vcpu]; vlapic = vm_lapic(vm, vcpu); vmexit = vm_exitinfo(vm, vcpu); launched = 0; KASSERT(vmxctx->pmap == pmap, ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap)); VMPTRLD(vmcs); /* * XXX * We do this every time because we may setup the virtual machine * from a different process than the one that actually runs it. * * If the life of a virtual machine was spent entirely in the context * of a single process we could do this once in vmx_vminit(). */ vmcs_write(VMCS_HOST_CR3, rcr3()); vmcs_write(VMCS_GUEST_RIP, startrip); vmx_set_pcpu_defaults(vmx, vcpu, pmap); do { handled = UNHANDLED; /* * Interrupts are disabled from this point on until the * guest starts executing. This is done for the following * reasons: * * If an AST is asserted on this thread after the check below, * then the IPI_AST notification will not be lost, because it * will cause a VM exit due to external interrupt as soon as * the guest state is loaded. * * A posted interrupt after 'vmx_inject_interrupts()' will * not be "lost" because it will be held pending in the host * APIC because interrupts are disabled. The pending interrupt * will be recognized as soon as the guest state is loaded. * * The same reasoning applies to the IPI generated by * pmap_invalidate_ept(). */ disable_intr(); vmx_inject_interrupts(vmx, vcpu, vlapic); /* * Check for vcpu suspension after injecting events because * vmx_inject_interrupts() can suspend the vcpu due to a * triple fault. */ if (vcpu_suspended(suspend_cookie)) { enable_intr(); vm_exit_suspended(vmx->vm, vcpu, vmcs_guest_rip()); break; } if (vcpu_rendezvous_pending(rendezvous_cookie)) { enable_intr(); vm_exit_rendezvous(vmx->vm, vcpu, vmcs_guest_rip()); break; } if (vcpu_should_yield(vm, vcpu)) { enable_intr(); vm_exit_astpending(vmx->vm, vcpu, vmcs_guest_rip()); vmx_astpending_trace(vmx, vcpu, vmexit->rip); handled = HANDLED; break; } vmx_run_trace(vmx, vcpu); rc = vmx_enter_guest(vmxctx, vmx, launched); /* Collect some information for VM exit processing */ vmexit->rip = rip = vmcs_guest_rip(); vmexit->inst_length = vmexit_instruction_length(); vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason(); vmexit->u.vmx.exit_qualification = vmcs_exit_qualification(); if (rc == VMX_GUEST_VMEXIT) { vmx_exit_handle_nmi(vmx, vcpu, vmexit); enable_intr(); handled = vmx_exit_process(vmx, vcpu, vmexit); } else { enable_intr(); vmx_exit_inst_error(vmxctx, rc, vmexit); } launched = 1; vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled); } while (handled); /* * If a VM exit has been handled then the exitcode must be BOGUS * If a VM exit is not handled then the exitcode must not be BOGUS */ if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) || (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) { panic("Mismatch between handled (%d) and exitcode (%d)", handled, vmexit->exitcode); } if (!handled) vmm_stat_incr(vm, vcpu, VMEXIT_USERSPACE, 1); VCPU_CTR1(vm, vcpu, "returning from vmx_run: exitcode %d", vmexit->exitcode); VMCLEAR(vmcs); return (0); } static void vmx_vmcleanup(void *arg) { int i; struct vmx *vmx = arg; if (apic_access_virtualization(vmx, 0)) vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE); for (i = 0; i < VM_MAXCPU; i++) vpid_free(vmx->state[i].vpid); free(vmx, M_VMX); return; } static register_t * vmxctx_regptr(struct vmxctx *vmxctx, int reg) { switch (reg) { case VM_REG_GUEST_RAX: return (&vmxctx->guest_rax); case VM_REG_GUEST_RBX: return (&vmxctx->guest_rbx); case VM_REG_GUEST_RCX: return (&vmxctx->guest_rcx); case VM_REG_GUEST_RDX: return (&vmxctx->guest_rdx); case VM_REG_GUEST_RSI: return (&vmxctx->guest_rsi); case VM_REG_GUEST_RDI: return (&vmxctx->guest_rdi); case VM_REG_GUEST_RBP: return (&vmxctx->guest_rbp); case VM_REG_GUEST_R8: return (&vmxctx->guest_r8); case VM_REG_GUEST_R9: return (&vmxctx->guest_r9); case VM_REG_GUEST_R10: return (&vmxctx->guest_r10); case VM_REG_GUEST_R11: return (&vmxctx->guest_r11); case VM_REG_GUEST_R12: return (&vmxctx->guest_r12); case VM_REG_GUEST_R13: return (&vmxctx->guest_r13); case VM_REG_GUEST_R14: return (&vmxctx->guest_r14); case VM_REG_GUEST_R15: return (&vmxctx->guest_r15); case VM_REG_GUEST_CR2: return (&vmxctx->guest_cr2); default: break; } return (NULL); } static int vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval) { register_t *regp; if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { *retval = *regp; return (0); } else return (EINVAL); } static int vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val) { register_t *regp; if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { *regp = val; return (0); } else return (EINVAL); } static int +vmx_get_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t *retval) +{ + uint64_t gi; + int error; + + error = vmcs_getreg(&vmx->vmcs[vcpu], running, + VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY), &gi); + *retval = (gi & HWINTR_BLOCKING) ? 1 : 0; + return (error); +} + +static int +vmx_modify_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t val) +{ + struct vmcs *vmcs; + uint64_t gi; + int error, ident; + + /* + * Forcing the vcpu into an interrupt shadow is not supported. + */ + if (val) { + error = EINVAL; + goto done; + } + + vmcs = &vmx->vmcs[vcpu]; + ident = VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY); + error = vmcs_getreg(vmcs, running, ident, &gi); + if (error == 0) { + gi &= ~HWINTR_BLOCKING; + error = vmcs_setreg(vmcs, running, ident, gi); + } +done: + VCPU_CTR2(vmx->vm, vcpu, "Setting intr_shadow to %#lx %s", val, + error ? "failed" : "succeeded"); + return (error); +} + +static int vmx_shadow_reg(int reg) { int shreg; shreg = -1; switch (reg) { case VM_REG_GUEST_CR0: shreg = VMCS_CR0_SHADOW; break; case VM_REG_GUEST_CR4: shreg = VMCS_CR4_SHADOW; break; default: break; } return (shreg); } static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval) { int running, hostcpu; struct vmx *vmx = arg; running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); if (running && hostcpu != curcpu) panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu); + if (reg == VM_REG_GUEST_INTR_SHADOW) + return (vmx_get_intr_shadow(vmx, vcpu, running, retval)); + if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0) return (0); return (vmcs_getreg(&vmx->vmcs[vcpu], running, reg, retval)); } static int vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) { int error, hostcpu, running, shadow; uint64_t ctls; pmap_t pmap; struct vmx *vmx = arg; running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); if (running && hostcpu != curcpu) panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu); + + if (reg == VM_REG_GUEST_INTR_SHADOW) + return (vmx_modify_intr_shadow(vmx, vcpu, running, val)); if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0) return (0); error = vmcs_setreg(&vmx->vmcs[vcpu], running, reg, val); if (error == 0) { /* * If the "load EFER" VM-entry control is 1 then the * value of EFER.LMA must be identical to "IA-32e mode guest" * bit in the VM-entry control. */ if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 && (reg == VM_REG_GUEST_EFER)) { vmcs_getreg(&vmx->vmcs[vcpu], running, VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls); if (val & EFER_LMA) ctls |= VM_ENTRY_GUEST_LMA; else ctls &= ~VM_ENTRY_GUEST_LMA; vmcs_setreg(&vmx->vmcs[vcpu], running, VMCS_IDENT(VMCS_ENTRY_CTLS), ctls); } shadow = vmx_shadow_reg(reg); if (shadow > 0) { /* * Store the unmodified value in the shadow */ error = vmcs_setreg(&vmx->vmcs[vcpu], running, VMCS_IDENT(shadow), val); } if (reg == VM_REG_GUEST_CR3) { /* * Invalidate the guest vcpu's TLB mappings to emulate * the behavior of updating %cr3. * * XXX the processor retains global mappings when %cr3 * is updated but vmx_invvpid() does not. */ pmap = vmx->ctx[vcpu].pmap; vmx_invvpid(vmx, vcpu, pmap, running); } } return (error); } static int vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) { int hostcpu, running; struct vmx *vmx = arg; running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); if (running && hostcpu != curcpu) panic("vmx_getdesc: %s%d is running", vm_name(vmx->vm), vcpu); return (vmcs_getdesc(&vmx->vmcs[vcpu], running, reg, desc)); } static int vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) { int hostcpu, running; struct vmx *vmx = arg; running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); if (running && hostcpu != curcpu) panic("vmx_setdesc: %s%d is running", vm_name(vmx->vm), vcpu); return (vmcs_setdesc(&vmx->vmcs[vcpu], running, reg, desc)); } static int vmx_getcap(void *arg, int vcpu, int type, int *retval) { struct vmx *vmx = arg; int vcap; int ret; ret = ENOENT; vcap = vmx->cap[vcpu].set; switch (type) { case VM_CAP_HALT_EXIT: if (cap_halt_exit) ret = 0; break; case VM_CAP_PAUSE_EXIT: if (cap_pause_exit) ret = 0; break; case VM_CAP_MTRAP_EXIT: if (cap_monitor_trap) ret = 0; break; case VM_CAP_UNRESTRICTED_GUEST: if (cap_unrestricted_guest) ret = 0; break; case VM_CAP_ENABLE_INVPCID: if (cap_invpcid) ret = 0; break; default: break; } if (ret == 0) *retval = (vcap & (1 << type)) ? 1 : 0; return (ret); } static int vmx_setcap(void *arg, int vcpu, int type, int val) { struct vmx *vmx = arg; struct vmcs *vmcs = &vmx->vmcs[vcpu]; uint32_t baseval; uint32_t *pptr; int error; int flag; int reg; int retval; retval = ENOENT; pptr = NULL; switch (type) { case VM_CAP_HALT_EXIT: if (cap_halt_exit) { retval = 0; pptr = &vmx->cap[vcpu].proc_ctls; baseval = *pptr; flag = PROCBASED_HLT_EXITING; reg = VMCS_PRI_PROC_BASED_CTLS; } break; case VM_CAP_MTRAP_EXIT: if (cap_monitor_trap) { retval = 0; pptr = &vmx->cap[vcpu].proc_ctls; baseval = *pptr; flag = PROCBASED_MTF; reg = VMCS_PRI_PROC_BASED_CTLS; } break; case VM_CAP_PAUSE_EXIT: if (cap_pause_exit) { retval = 0; pptr = &vmx->cap[vcpu].proc_ctls; baseval = *pptr; flag = PROCBASED_PAUSE_EXITING; reg = VMCS_PRI_PROC_BASED_CTLS; } break; case VM_CAP_UNRESTRICTED_GUEST: if (cap_unrestricted_guest) { retval = 0; pptr = &vmx->cap[vcpu].proc_ctls2; baseval = *pptr; flag = PROCBASED2_UNRESTRICTED_GUEST; reg = VMCS_SEC_PROC_BASED_CTLS; } break; case VM_CAP_ENABLE_INVPCID: if (cap_invpcid) { retval = 0; pptr = &vmx->cap[vcpu].proc_ctls2; baseval = *pptr; flag = PROCBASED2_ENABLE_INVPCID; reg = VMCS_SEC_PROC_BASED_CTLS; } break; default: break; } if (retval == 0) { if (val) { baseval |= flag; } else { baseval &= ~flag; } VMPTRLD(vmcs); error = vmwrite(reg, baseval); VMCLEAR(vmcs); if (error) { retval = error; } else { /* * Update optional stored flags, and record * setting */ if (pptr != NULL) { *pptr = baseval; } if (val) { vmx->cap[vcpu].set |= (1 << type); } else { vmx->cap[vcpu].set &= ~(1 << type); } } } return (retval); } struct vlapic_vtx { struct vlapic vlapic; struct pir_desc *pir_desc; struct vmx *vmx; }; #define VMX_CTR_PIR(vm, vcpuid, pir_desc, notify, vector, level, msg) \ do { \ VCPU_CTR2(vm, vcpuid, msg " assert %s-triggered vector %d", \ level ? "level" : "edge", vector); \ VCPU_CTR1(vm, vcpuid, msg " pir0 0x%016lx", pir_desc->pir[0]); \ VCPU_CTR1(vm, vcpuid, msg " pir1 0x%016lx", pir_desc->pir[1]); \ VCPU_CTR1(vm, vcpuid, msg " pir2 0x%016lx", pir_desc->pir[2]); \ VCPU_CTR1(vm, vcpuid, msg " pir3 0x%016lx", pir_desc->pir[3]); \ VCPU_CTR1(vm, vcpuid, msg " notify: %s", notify ? "yes" : "no");\ } while (0) /* * vlapic->ops handlers that utilize the APICv hardware assist described in * Chapter 29 of the Intel SDM. */ static int vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level) { struct vlapic_vtx *vlapic_vtx; struct pir_desc *pir_desc; uint64_t mask; int idx, notify; vlapic_vtx = (struct vlapic_vtx *)vlapic; pir_desc = vlapic_vtx->pir_desc; /* * Keep track of interrupt requests in the PIR descriptor. This is * because the virtual APIC page pointed to by the VMCS cannot be * modified if the vcpu is running. */ idx = vector / 64; mask = 1UL << (vector % 64); atomic_set_long(&pir_desc->pir[idx], mask); notify = atomic_cmpset_long(&pir_desc->pending, 0, 1); VMX_CTR_PIR(vlapic->vm, vlapic->vcpuid, pir_desc, notify, vector, level, "vmx_set_intr_ready"); return (notify); } static int vmx_pending_intr(struct vlapic *vlapic, int *vecptr) { struct vlapic_vtx *vlapic_vtx; struct pir_desc *pir_desc; struct LAPIC *lapic; uint64_t pending, pirval; uint32_t ppr, vpr; int i; /* * This function is only expected to be called from the 'HLT' exit * handler which does not care about the vector that is pending. */ KASSERT(vecptr == NULL, ("vmx_pending_intr: vecptr must be NULL")); vlapic_vtx = (struct vlapic_vtx *)vlapic; pir_desc = vlapic_vtx->pir_desc; pending = atomic_load_acq_long(&pir_desc->pending); if (!pending) return (0); /* common case */ /* * If there is an interrupt pending then it will be recognized only * if its priority is greater than the processor priority. * * Special case: if the processor priority is zero then any pending * interrupt will be recognized. */ lapic = vlapic->apic_page; ppr = lapic->ppr & 0xf0; if (ppr == 0) return (1); VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "HLT with non-zero PPR %d", lapic->ppr); for (i = 3; i >= 0; i--) { pirval = pir_desc->pir[i]; if (pirval != 0) { vpr = (i * 64 + flsl(pirval) - 1) & 0xf0; return (vpr > ppr); } } return (0); } static void vmx_intr_accepted(struct vlapic *vlapic, int vector) { panic("vmx_intr_accepted: not expected to be called"); } static void vmx_set_tmr(struct vlapic *vlapic, int vector, bool level) { struct vlapic_vtx *vlapic_vtx; struct vmx *vmx; struct vmcs *vmcs; uint64_t mask, val; KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector)); KASSERT(!vcpu_is_running(vlapic->vm, vlapic->vcpuid, NULL), ("vmx_set_tmr: vcpu cannot be running")); vlapic_vtx = (struct vlapic_vtx *)vlapic; vmx = vlapic_vtx->vmx; vmcs = &vmx->vmcs[vlapic->vcpuid]; mask = 1UL << (vector % 64); VMPTRLD(vmcs); val = vmcs_read(VMCS_EOI_EXIT(vector)); if (level) val |= mask; else val &= ~mask; vmcs_write(VMCS_EOI_EXIT(vector), val); VMCLEAR(vmcs); } static void vmx_enable_x2apic_mode(struct vlapic *vlapic) { struct vmx *vmx; struct vmcs *vmcs; uint32_t proc_ctls2; int vcpuid, error; vcpuid = vlapic->vcpuid; vmx = ((struct vlapic_vtx *)vlapic)->vmx; vmcs = &vmx->vmcs[vcpuid]; proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; KASSERT((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) != 0, ("%s: invalid proc_ctls2 %#x", __func__, proc_ctls2)); proc_ctls2 &= ~PROCBASED2_VIRTUALIZE_APIC_ACCESSES; proc_ctls2 |= PROCBASED2_VIRTUALIZE_X2APIC_MODE; vmx->cap[vcpuid].proc_ctls2 = proc_ctls2; VMPTRLD(vmcs); vmcs_write(VMCS_SEC_PROC_BASED_CTLS, proc_ctls2); VMCLEAR(vmcs); if (vlapic->vcpuid == 0) { /* * The nested page table mappings are shared by all vcpus * so unmap the APIC access page just once. */ error = vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE); KASSERT(error == 0, ("%s: vm_unmap_mmio error %d", __func__, error)); /* * The MSR bitmap is shared by all vcpus so modify it only * once in the context of vcpu 0. */ error = vmx_allow_x2apic_msrs(vmx); KASSERT(error == 0, ("%s: vmx_allow_x2apic_msrs error %d", __func__, error)); } } static void vmx_post_intr(struct vlapic *vlapic, int hostcpu) { ipi_cpu(hostcpu, pirvec); } /* * Transfer the pending interrupts in the PIR descriptor to the IRR * in the virtual APIC page. */ static void vmx_inject_pir(struct vlapic *vlapic) { struct vlapic_vtx *vlapic_vtx; struct pir_desc *pir_desc; struct LAPIC *lapic; uint64_t val, pirval; int rvi, pirbase = -1; uint16_t intr_status_old, intr_status_new; vlapic_vtx = (struct vlapic_vtx *)vlapic; pir_desc = vlapic_vtx->pir_desc; if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) { VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: " "no posted interrupt pending"); return; } pirval = 0; pirbase = -1; lapic = vlapic->apic_page; val = atomic_readandclear_long(&pir_desc->pir[0]); if (val != 0) { lapic->irr0 |= val; lapic->irr1 |= val >> 32; pirbase = 0; pirval = val; } val = atomic_readandclear_long(&pir_desc->pir[1]); if (val != 0) { lapic->irr2 |= val; lapic->irr3 |= val >> 32; pirbase = 64; pirval = val; } val = atomic_readandclear_long(&pir_desc->pir[2]); if (val != 0) { lapic->irr4 |= val; lapic->irr5 |= val >> 32; pirbase = 128; pirval = val; } val = atomic_readandclear_long(&pir_desc->pir[3]); if (val != 0) { lapic->irr6 |= val; lapic->irr7 |= val >> 32; pirbase = 192; pirval = val; } VLAPIC_CTR_IRR(vlapic, "vmx_inject_pir"); /* * Update RVI so the processor can evaluate pending virtual * interrupts on VM-entry. * * It is possible for pirval to be 0 here, even though the * pending bit has been set. The scenario is: * CPU-Y is sending a posted interrupt to CPU-X, which * is running a guest and processing posted interrupts in h/w. * CPU-X will eventually exit and the state seen in s/w is * the pending bit set, but no PIR bits set. * * CPU-X CPU-Y * (vm running) (host running) * rx posted interrupt * CLEAR pending bit * SET PIR bit * READ/CLEAR PIR bits * SET pending bit * (vm exit) * pending bit set, PIR 0 */ if (pirval != 0) { rvi = pirbase + flsl(pirval) - 1; intr_status_old = vmcs_read(VMCS_GUEST_INTR_STATUS); intr_status_new = (intr_status_old & 0xFF00) | rvi; if (intr_status_new > intr_status_old) { vmcs_write(VMCS_GUEST_INTR_STATUS, intr_status_new); VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: " "guest_intr_status changed from 0x%04x to 0x%04x", intr_status_old, intr_status_new); } } } static struct vlapic * vmx_vlapic_init(void *arg, int vcpuid) { struct vmx *vmx; struct vlapic *vlapic; struct vlapic_vtx *vlapic_vtx; vmx = arg; vlapic = malloc(sizeof(struct vlapic_vtx), M_VLAPIC, M_WAITOK | M_ZERO); vlapic->vm = vmx->vm; vlapic->vcpuid = vcpuid; vlapic->apic_page = (struct LAPIC *)&vmx->apic_page[vcpuid]; vlapic_vtx = (struct vlapic_vtx *)vlapic; vlapic_vtx->pir_desc = &vmx->pir_desc[vcpuid]; vlapic_vtx->vmx = vmx; if (virtual_interrupt_delivery) { vlapic->ops.set_intr_ready = vmx_set_intr_ready; vlapic->ops.pending_intr = vmx_pending_intr; vlapic->ops.intr_accepted = vmx_intr_accepted; vlapic->ops.set_tmr = vmx_set_tmr; vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode; } if (posted_interrupts) vlapic->ops.post_intr = vmx_post_intr; vlapic_init(vlapic); return (vlapic); } static void vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic) { vlapic_cleanup(vlapic); free(vlapic, M_VLAPIC); } struct vmm_ops vmm_ops_intel = { vmx_init, vmx_cleanup, vmx_restore, vmx_vminit, vmx_run, vmx_vmcleanup, vmx_getreg, vmx_setreg, vmx_getdesc, vmx_setdesc, vmx_getcap, vmx_setcap, ept_vmspace_alloc, ept_vmspace_free, vmx_vlapic_init, vmx_vlapic_cleanup, }; Index: user/ae/inet6/sys/amd64/vmm/vmm.c =================================================================== --- user/ae/inet6/sys/amd64/vmm/vmm.c (revision 271452) +++ user/ae/inet6/sys/amd64/vmm/vmm.c (revision 271453) @@ -1,2310 +1,2326 @@ /*- * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmm_ioport.h" #include "vmm_ktr.h" #include "vmm_host.h" #include "vmm_mem.h" #include "vmm_util.h" #include "vatpic.h" #include "vatpit.h" #include "vhpet.h" #include "vioapic.h" #include "vlapic.h" #include "vmm_msr.h" #include "vmm_ipi.h" #include "vmm_stat.h" #include "vmm_lapic.h" #include "io/ppt.h" #include "io/iommu.h" struct vlapic; /* * Initialization: * (a) allocated when vcpu is created * (i) initialized when vcpu is created and when it is reinitialized * (o) initialized the first time the vcpu is created * (x) initialized before use */ struct vcpu { struct mtx mtx; /* (o) protects 'state' and 'hostcpu' */ enum vcpu_state state; /* (o) vcpu state */ int hostcpu; /* (o) vcpu's host cpu */ struct vlapic *vlapic; /* (i) APIC device model */ enum x2apic_state x2apic_state; /* (i) APIC mode */ uint64_t exitintinfo; /* (i) events pending at VM exit */ int nmi_pending; /* (i) NMI pending */ int extint_pending; /* (i) INTR pending */ struct vm_exception exception; /* (x) exception collateral */ int exception_pending; /* (i) exception pending */ struct savefpu *guestfpu; /* (a,i) guest fpu state */ uint64_t guest_xcr0; /* (i) guest %xcr0 register */ void *stats; /* (a,i) statistics */ uint64_t guest_msrs[VMM_MSR_NUM]; /* (i) emulated MSRs */ struct vm_exit exitinfo; /* (x) exit reason and collateral */ }; #define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx)) #define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN) #define vcpu_lock(v) mtx_lock_spin(&((v)->mtx)) #define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx)) #define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED) struct mem_seg { vm_paddr_t gpa; size_t len; boolean_t wired; vm_object_t object; }; #define VM_MAX_MEMORY_SEGMENTS 2 /* * Initialization: * (o) initialized the first time the VM is created * (i) initialized when VM is created and when it is reinitialized * (x) initialized before use */ struct vm { void *cookie; /* (i) cpu-specific data */ void *iommu; /* (x) iommu-specific data */ struct vhpet *vhpet; /* (i) virtual HPET */ struct vioapic *vioapic; /* (i) virtual ioapic */ struct vatpic *vatpic; /* (i) virtual atpic */ struct vatpit *vatpit; /* (i) virtual atpit */ volatile cpuset_t active_cpus; /* (i) active vcpus */ int suspend; /* (i) stop VM execution */ volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */ volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */ cpuset_t rendezvous_req_cpus; /* (x) rendezvous requested */ cpuset_t rendezvous_done_cpus; /* (x) rendezvous finished */ void *rendezvous_arg; /* (x) rendezvous func/arg */ vm_rendezvous_func_t rendezvous_func; struct mtx rendezvous_mtx; /* (o) rendezvous lock */ int num_mem_segs; /* (o) guest memory segments */ struct mem_seg mem_segs[VM_MAX_MEMORY_SEGMENTS]; struct vmspace *vmspace; /* (o) guest's address space */ char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */ struct vcpu vcpu[VM_MAXCPU]; /* (i) guest vcpus */ }; static int vmm_initialized; static struct vmm_ops *ops; #define VMM_INIT(num) (ops != NULL ? (*ops->init)(num) : 0) #define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0) #define VMM_RESUME() (ops != NULL ? (*ops->resume)() : 0) #define VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL) #define VMRUN(vmi, vcpu, rip, pmap, rptr, sptr) \ (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, rptr, sptr) : ENXIO) #define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL) #define VMSPACE_ALLOC(min, max) \ (ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL) #define VMSPACE_FREE(vmspace) \ (ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO) #define VMGETREG(vmi, vcpu, num, retval) \ (ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO) #define VMSETREG(vmi, vcpu, num, val) \ (ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO) #define VMGETDESC(vmi, vcpu, num, desc) \ (ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO) #define VMSETDESC(vmi, vcpu, num, desc) \ (ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO) #define VMGETCAP(vmi, vcpu, num, retval) \ (ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO) #define VMSETCAP(vmi, vcpu, num, val) \ (ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO) #define VLAPIC_INIT(vmi, vcpu) \ (ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL) #define VLAPIC_CLEANUP(vmi, vlapic) \ (ops != NULL ? (*ops->vlapic_cleanup)(vmi, vlapic) : NULL) #define fpu_start_emulating() load_cr0(rcr0() | CR0_TS) #define fpu_stop_emulating() clts() static MALLOC_DEFINE(M_VM, "vm", "vm"); CTASSERT(VMM_MSR_NUM <= 64); /* msr_mask can keep track of up to 64 msrs */ /* statistics */ static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime"); SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL); /* * Halt the guest if all vcpus are executing a HLT instruction with * interrupts disabled. */ static int halt_detection_enabled = 1; SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN, &halt_detection_enabled, 0, "Halt VM if all vcpus execute HLT with interrupts disabled"); static int vmm_ipinum; SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0, "IPI vector used for vcpu notifications"); static void vcpu_cleanup(struct vm *vm, int i, bool destroy) { struct vcpu *vcpu = &vm->vcpu[i]; VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic); if (destroy) { vmm_stat_free(vcpu->stats); fpu_save_area_free(vcpu->guestfpu); } } static void vcpu_init(struct vm *vm, int vcpu_id, bool create) { struct vcpu *vcpu; KASSERT(vcpu_id >= 0 && vcpu_id < VM_MAXCPU, ("vcpu_init: invalid vcpu %d", vcpu_id)); vcpu = &vm->vcpu[vcpu_id]; if (create) { KASSERT(!vcpu_lock_initialized(vcpu), ("vcpu %d already " "initialized", vcpu_id)); vcpu_lock_init(vcpu); vcpu->state = VCPU_IDLE; vcpu->hostcpu = NOCPU; vcpu->guestfpu = fpu_save_area_alloc(); vcpu->stats = vmm_stat_alloc(); } vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id); vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED); vcpu->exitintinfo = 0; vcpu->nmi_pending = 0; vcpu->extint_pending = 0; vcpu->exception_pending = 0; vcpu->guest_xcr0 = XFEATURE_ENABLED_X87; fpu_save_area_reset(vcpu->guestfpu); vmm_stat_init(vcpu->stats); guest_msrs_init(vm, vcpu_id); } struct vm_exit * vm_exitinfo(struct vm *vm, int cpuid) { struct vcpu *vcpu; if (cpuid < 0 || cpuid >= VM_MAXCPU) panic("vm_exitinfo: invalid cpuid %d", cpuid); vcpu = &vm->vcpu[cpuid]; return (&vcpu->exitinfo); } static void vmm_resume(void) { VMM_RESUME(); } static int vmm_init(void) { int error; vmm_host_state_init(); vmm_ipinum = vmm_ipi_alloc(); if (vmm_ipinum == 0) vmm_ipinum = IPI_AST; error = vmm_mem_init(); if (error) return (error); if (vmm_is_intel()) ops = &vmm_ops_intel; else if (vmm_is_amd()) ops = &vmm_ops_amd; else return (ENXIO); vmm_msr_init(); vmm_resume_p = vmm_resume; return (VMM_INIT(vmm_ipinum)); } static int vmm_handler(module_t mod, int what, void *arg) { int error; switch (what) { case MOD_LOAD: vmmdev_init(); if (ppt_avail_devices() > 0) iommu_init(); error = vmm_init(); if (error == 0) vmm_initialized = 1; break; case MOD_UNLOAD: error = vmmdev_cleanup(); if (error == 0) { vmm_resume_p = NULL; iommu_cleanup(); if (vmm_ipinum != IPI_AST) vmm_ipi_free(vmm_ipinum); error = VMM_CLEANUP(); /* * Something bad happened - prevent new * VMs from being created */ if (error) vmm_initialized = 0; } break; default: error = 0; break; } return (error); } static moduledata_t vmm_kmod = { "vmm", vmm_handler, NULL }; /* * vmm initialization has the following dependencies: * * - iommu initialization must happen after the pci passthru driver has had * a chance to attach to any passthru devices (after SI_SUB_CONFIGURE). * * - VT-x initialization requires smp_rendezvous() and therefore must happen * after SMP is fully functional (after SI_SUB_SMP). */ DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY); MODULE_VERSION(vmm, 1); static void vm_init(struct vm *vm, bool create) { int i; vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace)); vm->iommu = NULL; vm->vioapic = vioapic_init(vm); vm->vhpet = vhpet_init(vm); vm->vatpic = vatpic_init(vm); vm->vatpit = vatpit_init(vm); CPU_ZERO(&vm->active_cpus); vm->suspend = 0; CPU_ZERO(&vm->suspended_cpus); for (i = 0; i < VM_MAXCPU; i++) vcpu_init(vm, i, create); } int vm_create(const char *name, struct vm **retvm) { struct vm *vm; struct vmspace *vmspace; /* * If vmm.ko could not be successfully initialized then don't attempt * to create the virtual machine. */ if (!vmm_initialized) return (ENXIO); if (name == NULL || strlen(name) >= VM_MAX_NAMELEN) return (EINVAL); vmspace = VMSPACE_ALLOC(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS); if (vmspace == NULL) return (ENOMEM); vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO); strcpy(vm->name, name); vm->num_mem_segs = 0; vm->vmspace = vmspace; mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF); vm_init(vm, true); *retvm = vm; return (0); } static void vm_free_mem_seg(struct vm *vm, struct mem_seg *seg) { if (seg->object != NULL) vmm_mem_free(vm->vmspace, seg->gpa, seg->len); bzero(seg, sizeof(*seg)); } static void vm_cleanup(struct vm *vm, bool destroy) { int i; ppt_unassign_all(vm); if (vm->iommu != NULL) iommu_destroy_domain(vm->iommu); vatpit_cleanup(vm->vatpit); vhpet_cleanup(vm->vhpet); vatpic_cleanup(vm->vatpic); vioapic_cleanup(vm->vioapic); for (i = 0; i < VM_MAXCPU; i++) vcpu_cleanup(vm, i, destroy); VMCLEANUP(vm->cookie); if (destroy) { for (i = 0; i < vm->num_mem_segs; i++) vm_free_mem_seg(vm, &vm->mem_segs[i]); vm->num_mem_segs = 0; VMSPACE_FREE(vm->vmspace); vm->vmspace = NULL; } } void vm_destroy(struct vm *vm) { vm_cleanup(vm, true); free(vm, M_VM); } int vm_reinit(struct vm *vm) { int error; /* * A virtual machine can be reset only if all vcpus are suspended. */ if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { vm_cleanup(vm, false); vm_init(vm, false); error = 0; } else { error = EBUSY; } return (error); } const char * vm_name(struct vm *vm) { return (vm->name); } int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) { vm_object_t obj; if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL) return (ENOMEM); else return (0); } int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len) { vmm_mmio_free(vm->vmspace, gpa, len); return (0); } boolean_t vm_mem_allocated(struct vm *vm, vm_paddr_t gpa) { int i; vm_paddr_t gpabase, gpalimit; for (i = 0; i < vm->num_mem_segs; i++) { gpabase = vm->mem_segs[i].gpa; gpalimit = gpabase + vm->mem_segs[i].len; if (gpa >= gpabase && gpa < gpalimit) return (TRUE); /* 'gpa' is regular memory */ } if (ppt_is_mmio(vm, gpa)) return (TRUE); /* 'gpa' is pci passthru mmio */ return (FALSE); } int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len) { int available, allocated; struct mem_seg *seg; vm_object_t object; vm_paddr_t g; if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0) return (EINVAL); available = allocated = 0; g = gpa; while (g < gpa + len) { if (vm_mem_allocated(vm, g)) allocated++; else available++; g += PAGE_SIZE; } /* * If there are some allocated and some available pages in the address * range then it is an error. */ if (allocated && available) return (EINVAL); /* * If the entire address range being requested has already been * allocated then there isn't anything more to do. */ if (allocated && available == 0) return (0); if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS) return (E2BIG); seg = &vm->mem_segs[vm->num_mem_segs]; if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL) return (ENOMEM); seg->gpa = gpa; seg->len = len; seg->object = object; seg->wired = FALSE; vm->num_mem_segs++; return (0); } static vm_paddr_t vm_maxmem(struct vm *vm) { int i; vm_paddr_t gpa, maxmem; maxmem = 0; for (i = 0; i < vm->num_mem_segs; i++) { gpa = vm->mem_segs[i].gpa + vm->mem_segs[i].len; if (gpa > maxmem) maxmem = gpa; } return (maxmem); } static void vm_gpa_unwire(struct vm *vm) { int i, rv; struct mem_seg *seg; for (i = 0; i < vm->num_mem_segs; i++) { seg = &vm->mem_segs[i]; if (!seg->wired) continue; rv = vm_map_unwire(&vm->vmspace->vm_map, seg->gpa, seg->gpa + seg->len, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment " "%#lx/%ld could not be unwired: %d", vm_name(vm), seg->gpa, seg->len, rv)); seg->wired = FALSE; } } static int vm_gpa_wire(struct vm *vm) { int i, rv; struct mem_seg *seg; for (i = 0; i < vm->num_mem_segs; i++) { seg = &vm->mem_segs[i]; if (seg->wired) continue; /* XXX rlimits? */ rv = vm_map_wire(&vm->vmspace->vm_map, seg->gpa, seg->gpa + seg->len, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); if (rv != KERN_SUCCESS) break; seg->wired = TRUE; } if (i < vm->num_mem_segs) { /* * Undo the wiring before returning an error. */ vm_gpa_unwire(vm); return (EAGAIN); } return (0); } static void vm_iommu_modify(struct vm *vm, boolean_t map) { int i, sz; vm_paddr_t gpa, hpa; struct mem_seg *seg; void *vp, *cookie, *host_domain; sz = PAGE_SIZE; host_domain = iommu_host_domain(); for (i = 0; i < vm->num_mem_segs; i++) { seg = &vm->mem_segs[i]; KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired", vm_name(vm), seg->gpa, seg->len)); gpa = seg->gpa; while (gpa < seg->gpa + seg->len) { vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE, &cookie); KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx", vm_name(vm), gpa)); vm_gpa_release(cookie); hpa = DMAP_TO_PHYS((uintptr_t)vp); if (map) { iommu_create_mapping(vm->iommu, gpa, hpa, sz); iommu_remove_mapping(host_domain, hpa, sz); } else { iommu_remove_mapping(vm->iommu, gpa, sz); iommu_create_mapping(host_domain, hpa, hpa, sz); } gpa += PAGE_SIZE; } } /* * Invalidate the cached translations associated with the domain * from which pages were removed. */ if (map) iommu_invalidate_tlb(host_domain); else iommu_invalidate_tlb(vm->iommu); } #define vm_iommu_unmap(vm) vm_iommu_modify((vm), FALSE) #define vm_iommu_map(vm) vm_iommu_modify((vm), TRUE) int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func) { int error; error = ppt_unassign_device(vm, bus, slot, func); if (error) return (error); if (ppt_assigned_devices(vm) == 0) { vm_iommu_unmap(vm); vm_gpa_unwire(vm); } return (0); } int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func) { int error; vm_paddr_t maxaddr; /* * Virtual machines with pci passthru devices get special treatment: * - the guest physical memory is wired * - the iommu is programmed to do the 'gpa' to 'hpa' translation * * We need to do this before the first pci passthru device is attached. */ if (ppt_assigned_devices(vm) == 0) { KASSERT(vm->iommu == NULL, ("vm_assign_pptdev: iommu must be NULL")); maxaddr = vm_maxmem(vm); vm->iommu = iommu_create_domain(maxaddr); error = vm_gpa_wire(vm); if (error) return (error); vm_iommu_map(vm); } error = ppt_assign_device(vm, bus, slot, func); return (error); } void * vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot, void **cookie) { int count, pageoff; vm_page_t m; pageoff = gpa & PAGE_MASK; if (len > PAGE_SIZE - pageoff) panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len); count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map, trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1); if (count == 1) { *cookie = m; return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff)); } else { *cookie = NULL; return (NULL); } } void vm_gpa_release(void *cookie) { vm_page_t m = cookie; vm_page_lock(m); vm_page_unhold(m); vm_page_unlock(m); } int vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase, struct vm_memory_segment *seg) { int i; for (i = 0; i < vm->num_mem_segs; i++) { if (gpabase == vm->mem_segs[i].gpa) { seg->gpa = vm->mem_segs[i].gpa; seg->len = vm->mem_segs[i].len; seg->wired = vm->mem_segs[i].wired; return (0); } } return (-1); } int vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len, vm_offset_t *offset, struct vm_object **object) { int i; size_t seg_len; vm_paddr_t seg_gpa; vm_object_t seg_obj; for (i = 0; i < vm->num_mem_segs; i++) { if ((seg_obj = vm->mem_segs[i].object) == NULL) continue; seg_gpa = vm->mem_segs[i].gpa; seg_len = vm->mem_segs[i].len; if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) { *offset = gpa - seg_gpa; *object = seg_obj; vm_object_reference(seg_obj); return (0); } } return (EINVAL); } int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval) { if (vcpu < 0 || vcpu >= VM_MAXCPU) return (EINVAL); if (reg >= VM_REG_LAST) return (EINVAL); return (VMGETREG(vm->cookie, vcpu, reg, retval)); } int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val) { if (vcpu < 0 || vcpu >= VM_MAXCPU) return (EINVAL); if (reg >= VM_REG_LAST) return (EINVAL); return (VMSETREG(vm->cookie, vcpu, reg, val)); } static boolean_t is_descriptor_table(int reg) { switch (reg) { case VM_REG_GUEST_IDTR: case VM_REG_GUEST_GDTR: return (TRUE); default: return (FALSE); } } static boolean_t is_segment_register(int reg) { switch (reg) { case VM_REG_GUEST_ES: case VM_REG_GUEST_CS: case VM_REG_GUEST_SS: case VM_REG_GUEST_DS: case VM_REG_GUEST_FS: case VM_REG_GUEST_GS: case VM_REG_GUEST_TR: case VM_REG_GUEST_LDTR: return (TRUE); default: return (FALSE); } } int vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc) { if (vcpu < 0 || vcpu >= VM_MAXCPU) return (EINVAL); if (!is_segment_register(reg) && !is_descriptor_table(reg)) return (EINVAL); return (VMGETDESC(vm->cookie, vcpu, reg, desc)); } int vm_set_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc) { if (vcpu < 0 || vcpu >= VM_MAXCPU) return (EINVAL); if (!is_segment_register(reg) && !is_descriptor_table(reg)) return (EINVAL); return (VMSETDESC(vm->cookie, vcpu, reg, desc)); } static void restore_guest_fpustate(struct vcpu *vcpu) { /* flush host state to the pcb */ fpuexit(curthread); /* restore guest FPU state */ fpu_stop_emulating(); fpurestore(vcpu->guestfpu); /* restore guest XCR0 if XSAVE is enabled in the host */ if (rcr4() & CR4_XSAVE) load_xcr(0, vcpu->guest_xcr0); /* * The FPU is now "dirty" with the guest's state so turn on emulation * to trap any access to the FPU by the host. */ fpu_start_emulating(); } static void save_guest_fpustate(struct vcpu *vcpu) { if ((rcr0() & CR0_TS) == 0) panic("fpu emulation not enabled in host!"); /* save guest XCR0 and restore host XCR0 */ if (rcr4() & CR4_XSAVE) { vcpu->guest_xcr0 = rxcr(0); load_xcr(0, vmm_get_host_xcr0()); } /* save guest FPU state */ fpu_stop_emulating(); fpusave(vcpu->guestfpu); fpu_start_emulating(); } static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle"); static int vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate, bool from_idle) { int error; vcpu_assert_locked(vcpu); /* * State transitions from the vmmdev_ioctl() must always begin from * the VCPU_IDLE state. This guarantees that there is only a single * ioctl() operating on a vcpu at any point. */ if (from_idle) { while (vcpu->state != VCPU_IDLE) msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz); } else { KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from " "vcpu idle state")); } if (vcpu->state == VCPU_RUNNING) { KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d " "mismatch for running vcpu", curcpu, vcpu->hostcpu)); } else { KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a " "vcpu that is not running", vcpu->hostcpu)); } /* * The following state transitions are allowed: * IDLE -> FROZEN -> IDLE * FROZEN -> RUNNING -> FROZEN * FROZEN -> SLEEPING -> FROZEN */ switch (vcpu->state) { case VCPU_IDLE: case VCPU_RUNNING: case VCPU_SLEEPING: error = (newstate != VCPU_FROZEN); break; case VCPU_FROZEN: error = (newstate == VCPU_FROZEN); break; default: error = 1; break; } if (error) return (EBUSY); vcpu->state = newstate; if (newstate == VCPU_RUNNING) vcpu->hostcpu = curcpu; else vcpu->hostcpu = NOCPU; if (newstate == VCPU_IDLE) wakeup(&vcpu->state); return (0); } static void vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate) { int error; if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0) panic("Error %d setting state to %d\n", error, newstate); } static void vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate) { int error; if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0) panic("Error %d setting state to %d", error, newstate); } static void vm_set_rendezvous_func(struct vm *vm, vm_rendezvous_func_t func) { KASSERT(mtx_owned(&vm->rendezvous_mtx), ("rendezvous_mtx not locked")); /* * Update 'rendezvous_func' and execute a write memory barrier to * ensure that it is visible across all host cpus. This is not needed * for correctness but it does ensure that all the vcpus will notice * that the rendezvous is requested immediately. */ vm->rendezvous_func = func; wmb(); } #define RENDEZVOUS_CTR0(vm, vcpuid, fmt) \ do { \ if (vcpuid >= 0) \ VCPU_CTR0(vm, vcpuid, fmt); \ else \ VM_CTR0(vm, fmt); \ } while (0) static void vm_handle_rendezvous(struct vm *vm, int vcpuid) { KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU), ("vm_handle_rendezvous: invalid vcpuid %d", vcpuid)); mtx_lock(&vm->rendezvous_mtx); while (vm->rendezvous_func != NULL) { /* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */ CPU_AND(&vm->rendezvous_req_cpus, &vm->active_cpus); if (vcpuid != -1 && CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) && !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) { VCPU_CTR0(vm, vcpuid, "Calling rendezvous func"); (*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg); CPU_SET(vcpuid, &vm->rendezvous_done_cpus); } if (CPU_CMP(&vm->rendezvous_req_cpus, &vm->rendezvous_done_cpus) == 0) { VCPU_CTR0(vm, vcpuid, "Rendezvous completed"); vm_set_rendezvous_func(vm, NULL); wakeup(&vm->rendezvous_func); break; } RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion"); mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0, "vmrndv", 0); } mtx_unlock(&vm->rendezvous_mtx); } /* * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run. */ static int vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu) { struct vcpu *vcpu; const char *wmesg; - int t, vcpu_halted, vm_halted; + int error, t, vcpu_halted, vm_halted; KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted")); vcpu = &vm->vcpu[vcpuid]; vcpu_halted = 0; vm_halted = 0; + + /* + * The typical way to halt a cpu is to execute: "sti; hlt" + * + * STI sets RFLAGS.IF to enable interrupts. However, the processor + * remains in an "interrupt shadow" for an additional instruction + * following the STI. This guarantees that "sti; hlt" sequence is + * atomic and a pending interrupt will be recognized after the HLT. + * + * After the HLT emulation is done the vcpu is no longer in an + * interrupt shadow and a pending interrupt can be injected on + * the next entry into the guest. + */ + error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0); + KASSERT(error == 0, ("%s: error %d clearing interrupt shadow", + __func__, error)); vcpu_lock(vcpu); while (1) { /* * Do a final check for pending NMI or interrupts before * really putting this thread to sleep. Also check for * software events that would cause this vcpu to wakeup. * * These interrupts/events could have happened after the * vcpu returned from VMRUN() and before it acquired the * vcpu lock above. */ if (vm->rendezvous_func != NULL || vm->suspend) break; if (vm_nmi_pending(vm, vcpuid)) break; if (!intr_disabled) { if (vm_extint_pending(vm, vcpuid) || vlapic_pending_intr(vcpu->vlapic, NULL)) { break; } } /* Don't go to sleep if the vcpu thread needs to yield */ if (vcpu_should_yield(vm, vcpuid)) break; /* * Some Linux guests implement "halt" by having all vcpus * execute HLT with interrupts disabled. 'halted_cpus' keeps * track of the vcpus that have entered this state. When all * vcpus enter the halted state the virtual machine is halted. */ if (intr_disabled) { wmesg = "vmhalt"; VCPU_CTR0(vm, vcpuid, "Halted"); if (!vcpu_halted && halt_detection_enabled) { vcpu_halted = 1; CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus); } if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) { vm_halted = 1; break; } } else { wmesg = "vmidle"; } t = ticks; vcpu_require_state_locked(vcpu, VCPU_SLEEPING); /* * XXX msleep_spin() cannot be interrupted by signals so * wake up periodically to check pending signals. */ msleep_spin(vcpu, &vcpu->mtx, wmesg, hz); vcpu_require_state_locked(vcpu, VCPU_FROZEN); vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t); } if (vcpu_halted) CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus); vcpu_unlock(vcpu); if (vm_halted) vm_suspend(vm, VM_SUSPEND_HALT); return (0); } static int vm_handle_paging(struct vm *vm, int vcpuid, bool *retu) { int rv, ftype; struct vm_map *map; struct vcpu *vcpu; struct vm_exit *vme; vcpu = &vm->vcpu[vcpuid]; vme = &vcpu->exitinfo; ftype = vme->u.paging.fault_type; KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE, ("vm_handle_paging: invalid fault_type %d", ftype)); if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) { rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace), vme->u.paging.gpa, ftype); if (rv == 0) goto done; } map = &vm->vmspace->vm_map; rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL); VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, " "ftype = %d", rv, vme->u.paging.gpa, ftype); if (rv != KERN_SUCCESS) return (EFAULT); done: /* restart execution at the faulting instruction */ vme->inst_length = 0; return (0); } static int vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu) { struct vie *vie; struct vcpu *vcpu; struct vm_exit *vme; uint64_t gla, gpa; struct vm_guest_paging *paging; mem_region_read_t mread; mem_region_write_t mwrite; enum vm_cpu_mode cpu_mode; int cs_d, error; vcpu = &vm->vcpu[vcpuid]; vme = &vcpu->exitinfo; gla = vme->u.inst_emul.gla; gpa = vme->u.inst_emul.gpa; cs_d = vme->u.inst_emul.cs_d; vie = &vme->u.inst_emul.vie; paging = &vme->u.inst_emul.paging; cpu_mode = paging->cpu_mode; vie_init(vie); /* Fetch, decode and emulate the faulting instruction */ error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip, vme->inst_length, vie); if (error == 1) return (0); /* Resume guest to handle page fault */ else if (error == -1) return (EFAULT); else if (error != 0) panic("%s: vmm_fetch_instruction error %d", __func__, error); if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0) return (EFAULT); /* return to userland unless this is an in-kernel emulated device */ if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) { mread = lapic_mmio_read; mwrite = lapic_mmio_write; } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) { mread = vioapic_mmio_read; mwrite = vioapic_mmio_write; } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) { mread = vhpet_mmio_read; mwrite = vhpet_mmio_write; } else { *retu = true; return (0); } error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, paging, mread, mwrite, retu); return (error); } static int vm_handle_suspend(struct vm *vm, int vcpuid, bool *retu) { int i, done; struct vcpu *vcpu; done = 0; vcpu = &vm->vcpu[vcpuid]; CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus); /* * Wait until all 'active_cpus' have suspended themselves. * * Since a VM may be suspended at any time including when one or * more vcpus are doing a rendezvous we need to call the rendezvous * handler while we are waiting to prevent a deadlock. */ vcpu_lock(vcpu); while (1) { if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { VCPU_CTR0(vm, vcpuid, "All vcpus suspended"); break; } if (vm->rendezvous_func == NULL) { VCPU_CTR0(vm, vcpuid, "Sleeping during suspend"); vcpu_require_state_locked(vcpu, VCPU_SLEEPING); msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz); vcpu_require_state_locked(vcpu, VCPU_FROZEN); } else { VCPU_CTR0(vm, vcpuid, "Rendezvous during suspend"); vcpu_unlock(vcpu); vm_handle_rendezvous(vm, vcpuid); vcpu_lock(vcpu); } } vcpu_unlock(vcpu); /* * Wakeup the other sleeping vcpus and return to userspace. */ for (i = 0; i < VM_MAXCPU; i++) { if (CPU_ISSET(i, &vm->suspended_cpus)) { vcpu_notify_event(vm, i, false); } } *retu = true; return (0); } int vm_suspend(struct vm *vm, enum vm_suspend_how how) { int i; if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST) return (EINVAL); if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) { VM_CTR2(vm, "virtual machine already suspended %d/%d", vm->suspend, how); return (EALREADY); } VM_CTR1(vm, "virtual machine successfully suspended %d", how); /* * Notify all active vcpus that they are now suspended. */ for (i = 0; i < VM_MAXCPU; i++) { if (CPU_ISSET(i, &vm->active_cpus)) vcpu_notify_event(vm, i, false); } return (0); } void vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip) { struct vm_exit *vmexit; KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST, ("vm_exit_suspended: invalid suspend type %d", vm->suspend)); vmexit = vm_exitinfo(vm, vcpuid); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_SUSPENDED; vmexit->u.suspended.how = vm->suspend; } void vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip) { struct vm_exit *vmexit; KASSERT(vm->rendezvous_func != NULL, ("rendezvous not in progress")); vmexit = vm_exitinfo(vm, vcpuid); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_RENDEZVOUS; vmm_stat_incr(vm, vcpuid, VMEXIT_RENDEZVOUS, 1); } void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip) { struct vm_exit *vmexit; vmexit = vm_exitinfo(vm, vcpuid); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_BOGUS; vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1); } int vm_run(struct vm *vm, struct vm_run *vmrun) { int error, vcpuid; struct vcpu *vcpu; struct pcb *pcb; uint64_t tscval, rip; struct vm_exit *vme; bool retu, intr_disabled; pmap_t pmap; void *rptr, *sptr; vcpuid = vmrun->cpuid; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); if (!CPU_ISSET(vcpuid, &vm->active_cpus)) return (EINVAL); if (CPU_ISSET(vcpuid, &vm->suspended_cpus)) return (EINVAL); rptr = &vm->rendezvous_func; sptr = &vm->suspend; pmap = vmspace_pmap(vm->vmspace); vcpu = &vm->vcpu[vcpuid]; vme = &vcpu->exitinfo; rip = vmrun->rip; restart: critical_enter(); KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), ("vm_run: absurd pm_active")); tscval = rdtsc(); pcb = PCPU_GET(curpcb); set_pcb_flags(pcb, PCB_FULL_IRET); restore_guest_msrs(vm, vcpuid); restore_guest_fpustate(vcpu); vcpu_require_state(vm, vcpuid, VCPU_RUNNING); error = VMRUN(vm->cookie, vcpuid, rip, pmap, rptr, sptr); vcpu_require_state(vm, vcpuid, VCPU_FROZEN); save_guest_fpustate(vcpu); restore_host_msrs(vm, vcpuid); vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval); critical_exit(); if (error == 0) { retu = false; switch (vme->exitcode) { case VM_EXITCODE_SUSPENDED: error = vm_handle_suspend(vm, vcpuid, &retu); break; case VM_EXITCODE_IOAPIC_EOI: vioapic_process_eoi(vm, vcpuid, vme->u.ioapic_eoi.vector); break; case VM_EXITCODE_RENDEZVOUS: vm_handle_rendezvous(vm, vcpuid); error = 0; break; case VM_EXITCODE_HLT: intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0); error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu); break; case VM_EXITCODE_PAGING: error = vm_handle_paging(vm, vcpuid, &retu); break; case VM_EXITCODE_INST_EMUL: error = vm_handle_inst_emul(vm, vcpuid, &retu); break; case VM_EXITCODE_INOUT: case VM_EXITCODE_INOUT_STR: error = vm_handle_inout(vm, vcpuid, vme, &retu); break; default: retu = true; /* handled in userland */ break; } } if (error == 0 && retu == false) { rip = vme->rip + vme->inst_length; goto restart; } /* copy the exit information */ bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit)); return (error); } int vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info) { struct vcpu *vcpu; int type, vector; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; if (info & VM_INTINFO_VALID) { type = info & VM_INTINFO_TYPE; vector = info & 0xff; if (type == VM_INTINFO_NMI && vector != IDT_NMI) return (EINVAL); if (type == VM_INTINFO_HWEXCEPTION && vector >= 32) return (EINVAL); if (info & VM_INTINFO_RSVD) return (EINVAL); } else { info = 0; } VCPU_CTR2(vm, vcpuid, "%s: info1(%#lx)", __func__, info); vcpu->exitintinfo = info; return (0); } enum exc_class { EXC_BENIGN, EXC_CONTRIBUTORY, EXC_PAGEFAULT }; #define IDT_VE 20 /* Virtualization Exception (Intel specific) */ static enum exc_class exception_class(uint64_t info) { int type, vector; KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info)); type = info & VM_INTINFO_TYPE; vector = info & 0xff; /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */ switch (type) { case VM_INTINFO_HWINTR: case VM_INTINFO_SWINTR: case VM_INTINFO_NMI: return (EXC_BENIGN); default: /* * Hardware exception. * * SVM and VT-x use identical type values to represent NMI, * hardware interrupt and software interrupt. * * SVM uses type '3' for all exceptions. VT-x uses type '3' * for exceptions except #BP and #OF. #BP and #OF use a type * value of '5' or '6'. Therefore we don't check for explicit * values of 'type' to classify 'intinfo' into a hardware * exception. */ break; } switch (vector) { case IDT_PF: case IDT_VE: return (EXC_PAGEFAULT); case IDT_DE: case IDT_TS: case IDT_NP: case IDT_SS: case IDT_GP: return (EXC_CONTRIBUTORY); default: return (EXC_BENIGN); } } static int nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2, uint64_t *retinfo) { enum exc_class exc1, exc2; int type1, vector1; KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1)); KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2)); /* * If an exception occurs while attempting to call the double-fault * handler the processor enters shutdown mode (aka triple fault). */ type1 = info1 & VM_INTINFO_TYPE; vector1 = info1 & 0xff; if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) { VCPU_CTR2(vm, vcpuid, "triple fault: info1(%#lx), info2(%#lx)", info1, info2); vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT); *retinfo = 0; return (0); } /* * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3 */ exc1 = exception_class(info1); exc2 = exception_class(info2); if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) || (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) { /* Convert nested fault into a double fault. */ *retinfo = IDT_DF; *retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; *retinfo |= VM_INTINFO_DEL_ERRCODE; } else { /* Handle exceptions serially */ *retinfo = info2; } return (1); } static uint64_t vcpu_exception_intinfo(struct vcpu *vcpu) { uint64_t info = 0; if (vcpu->exception_pending) { info = vcpu->exception.vector & 0xff; info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; if (vcpu->exception.error_code_valid) { info |= VM_INTINFO_DEL_ERRCODE; info |= (uint64_t)vcpu->exception.error_code << 32; } } return (info); } int vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo) { struct vcpu *vcpu; uint64_t info1, info2; int valid; KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid)); vcpu = &vm->vcpu[vcpuid]; info1 = vcpu->exitintinfo; vcpu->exitintinfo = 0; info2 = 0; if (vcpu->exception_pending) { info2 = vcpu_exception_intinfo(vcpu); vcpu->exception_pending = 0; VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#lx", vcpu->exception.vector, info2); } if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) { valid = nested_fault(vm, vcpuid, info1, info2, retinfo); } else if (info1 & VM_INTINFO_VALID) { *retinfo = info1; valid = 1; } else if (info2 & VM_INTINFO_VALID) { *retinfo = info2; valid = 1; } else { valid = 0; } if (valid) { VCPU_CTR4(vm, vcpuid, "%s: info1(%#lx), info2(%#lx), " "retinfo(%#lx)", __func__, info1, info2, *retinfo); } return (valid); } int vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; *info1 = vcpu->exitintinfo; *info2 = vcpu_exception_intinfo(vcpu); return (0); } int vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); if (exception->vector < 0 || exception->vector >= 32) return (EINVAL); /* * A double fault exception should never be injected directly into * the guest. It is a derived exception that results from specific * combinations of nested faults. */ if (exception->vector == IDT_DF) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; if (vcpu->exception_pending) { VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to " "pending exception %d", exception->vector, vcpu->exception.vector); return (EBUSY); } vcpu->exception_pending = 1; vcpu->exception = *exception; VCPU_CTR1(vm, vcpuid, "Exception %d pending", exception->vector); return (0); } void vm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid, int errcode) { struct vm_exception exception; struct vm_exit *vmexit; struct vm *vm; int error; vm = vmarg; exception.vector = vector; exception.error_code = errcode; exception.error_code_valid = errcode_valid; error = vm_inject_exception(vm, vcpuid, &exception); KASSERT(error == 0, ("vm_inject_exception error %d", error)); /* * A fault-like exception allows the instruction to be restarted * after the exception handler returns. * * By setting the inst_length to 0 we ensure that the instruction * pointer remains at the faulting instruction. */ vmexit = vm_exitinfo(vm, vcpuid); vmexit->inst_length = 0; } void vm_inject_pf(void *vmarg, int vcpuid, int error_code, uint64_t cr2) { struct vm *vm; int error; vm = vmarg; VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#lx", error_code, cr2); error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2); KASSERT(error == 0, ("vm_set_register(cr2) error %d", error)); vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code); } static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu"); int vm_inject_nmi(struct vm *vm, int vcpuid) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; vcpu->nmi_pending = 1; vcpu_notify_event(vm, vcpuid, false); return (0); } int vm_nmi_pending(struct vm *vm, int vcpuid) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; return (vcpu->nmi_pending); } void vm_nmi_clear(struct vm *vm, int vcpuid) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; if (vcpu->nmi_pending == 0) panic("vm_nmi_clear: inconsistent nmi_pending state"); vcpu->nmi_pending = 0; vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1); } static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu"); int vm_inject_extint(struct vm *vm, int vcpuid) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; vcpu->extint_pending = 1; vcpu_notify_event(vm, vcpuid, false); return (0); } int vm_extint_pending(struct vm *vm, int vcpuid) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) panic("vm_extint_pending: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; return (vcpu->extint_pending); } void vm_extint_clear(struct vm *vm, int vcpuid) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) panic("vm_extint_pending: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; if (vcpu->extint_pending == 0) panic("vm_extint_clear: inconsistent extint_pending state"); vcpu->extint_pending = 0; vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1); } int vm_get_capability(struct vm *vm, int vcpu, int type, int *retval) { if (vcpu < 0 || vcpu >= VM_MAXCPU) return (EINVAL); if (type < 0 || type >= VM_CAP_MAX) return (EINVAL); return (VMGETCAP(vm->cookie, vcpu, type, retval)); } int vm_set_capability(struct vm *vm, int vcpu, int type, int val) { if (vcpu < 0 || vcpu >= VM_MAXCPU) return (EINVAL); if (type < 0 || type >= VM_CAP_MAX) return (EINVAL); return (VMSETCAP(vm->cookie, vcpu, type, val)); } uint64_t * vm_guest_msrs(struct vm *vm, int cpu) { return (vm->vcpu[cpu].guest_msrs); } struct vlapic * vm_lapic(struct vm *vm, int cpu) { return (vm->vcpu[cpu].vlapic); } struct vioapic * vm_ioapic(struct vm *vm) { return (vm->vioapic); } struct vhpet * vm_hpet(struct vm *vm) { return (vm->vhpet); } boolean_t vmm_is_pptdev(int bus, int slot, int func) { int found, i, n; int b, s, f; char *val, *cp, *cp2; /* * XXX * The length of an environment variable is limited to 128 bytes which * puts an upper limit on the number of passthru devices that may be * specified using a single environment variable. * * Work around this by scanning multiple environment variable * names instead of a single one - yuck! */ const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL }; /* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */ found = 0; for (i = 0; names[i] != NULL && !found; i++) { cp = val = getenv(names[i]); while (cp != NULL && *cp != '\0') { if ((cp2 = strchr(cp, ' ')) != NULL) *cp2 = '\0'; n = sscanf(cp, "%d/%d/%d", &b, &s, &f); if (n == 3 && bus == b && slot == s && func == f) { found = 1; break; } if (cp2 != NULL) *cp2++ = ' '; cp = cp2; } freeenv(val); } return (found); } void * vm_iommu_domain(struct vm *vm) { return (vm->iommu); } int vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate, bool from_idle) { int error; struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) panic("vm_set_run_state: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; vcpu_lock(vcpu); error = vcpu_set_state_locked(vcpu, newstate, from_idle); vcpu_unlock(vcpu); return (error); } enum vcpu_state vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu) { struct vcpu *vcpu; enum vcpu_state state; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) panic("vm_get_run_state: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; vcpu_lock(vcpu); state = vcpu->state; if (hostcpu != NULL) *hostcpu = vcpu->hostcpu; vcpu_unlock(vcpu); return (state); } int vm_activate_cpu(struct vm *vm, int vcpuid) { if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); if (CPU_ISSET(vcpuid, &vm->active_cpus)) return (EBUSY); VCPU_CTR0(vm, vcpuid, "activated"); CPU_SET_ATOMIC(vcpuid, &vm->active_cpus); return (0); } cpuset_t vm_active_cpus(struct vm *vm) { return (vm->active_cpus); } cpuset_t vm_suspended_cpus(struct vm *vm) { return (vm->suspended_cpus); } void * vcpu_stats(struct vm *vm, int vcpuid) { return (vm->vcpu[vcpuid].stats); } int vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state) { if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); *state = vm->vcpu[vcpuid].x2apic_state; return (0); } int vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) { if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); if (state >= X2APIC_STATE_LAST) return (EINVAL); vm->vcpu[vcpuid].x2apic_state = state; vlapic_set_x2apic_state(vm, vcpuid, state); return (0); } /* * This function is called to ensure that a vcpu "sees" a pending event * as soon as possible: * - If the vcpu thread is sleeping then it is woken up. * - If the vcpu is running on a different host_cpu then an IPI will be directed * to the host_cpu to cause the vcpu to trap into the hypervisor. */ void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr) { int hostcpu; struct vcpu *vcpu; vcpu = &vm->vcpu[vcpuid]; vcpu_lock(vcpu); hostcpu = vcpu->hostcpu; if (vcpu->state == VCPU_RUNNING) { KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu")); if (hostcpu != curcpu) { if (lapic_intr) { vlapic_post_intr(vcpu->vlapic, hostcpu, vmm_ipinum); } else { ipi_cpu(hostcpu, vmm_ipinum); } } else { /* * If the 'vcpu' is running on 'curcpu' then it must * be sending a notification to itself (e.g. SELF_IPI). * The pending event will be picked up when the vcpu * transitions back to guest context. */ } } else { KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent " "with hostcpu %d", vcpu->state, hostcpu)); if (vcpu->state == VCPU_SLEEPING) wakeup_one(vcpu); } vcpu_unlock(vcpu); } struct vmspace * vm_get_vmspace(struct vm *vm) { return (vm->vmspace); } int vm_apicid2vcpuid(struct vm *vm, int apicid) { /* * XXX apic id is assumed to be numerically identical to vcpu id */ return (apicid); } void vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest, vm_rendezvous_func_t func, void *arg) { int i; /* * Enforce that this function is called without any locks */ WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous"); KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU), ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid)); restart: mtx_lock(&vm->rendezvous_mtx); if (vm->rendezvous_func != NULL) { /* * If a rendezvous is already in progress then we need to * call the rendezvous handler in case this 'vcpuid' is one * of the targets of the rendezvous. */ RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress"); mtx_unlock(&vm->rendezvous_mtx); vm_handle_rendezvous(vm, vcpuid); goto restart; } KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous " "rendezvous is still in progress")); RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous"); vm->rendezvous_req_cpus = dest; CPU_ZERO(&vm->rendezvous_done_cpus); vm->rendezvous_arg = arg; vm_set_rendezvous_func(vm, func); mtx_unlock(&vm->rendezvous_mtx); /* * Wake up any sleeping vcpus and trigger a VM-exit in any running * vcpus so they handle the rendezvous as soon as possible. */ for (i = 0; i < VM_MAXCPU; i++) { if (CPU_ISSET(i, &dest)) vcpu_notify_event(vm, i, false); } vm_handle_rendezvous(vm, vcpuid); } struct vatpic * vm_atpic(struct vm *vm) { return (vm->vatpic); } struct vatpit * vm_atpit(struct vm *vm) { return (vm->vatpit); } enum vm_reg_name vm_segment_name(int seg) { static enum vm_reg_name seg_names[] = { VM_REG_GUEST_ES, VM_REG_GUEST_CS, VM_REG_GUEST_SS, VM_REG_GUEST_DS, VM_REG_GUEST_FS, VM_REG_GUEST_GS }; KASSERT(seg >= 0 && seg < nitems(seg_names), ("%s: invalid segment encoding %d", __func__, seg)); return (seg_names[seg]); } void vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, int num_copyinfo) { int idx; for (idx = 0; idx < num_copyinfo; idx++) { if (copyinfo[idx].cookie != NULL) vm_gpa_release(copyinfo[idx].cookie); } bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo)); } int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, int num_copyinfo) { int error, idx, nused; size_t n, off, remaining; void *hva, *cookie; uint64_t gpa; bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo); nused = 0; remaining = len; while (remaining > 0) { KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo")); error = vmm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa); if (error) return (error); off = gpa & PAGE_MASK; n = min(remaining, PAGE_SIZE - off); copyinfo[nused].gpa = gpa; copyinfo[nused].len = n; remaining -= n; gla += n; nused++; } for (idx = 0; idx < nused; idx++) { hva = vm_gpa_hold(vm, copyinfo[idx].gpa, copyinfo[idx].len, prot, &cookie); if (hva == NULL) break; copyinfo[idx].hva = hva; copyinfo[idx].cookie = cookie; } if (idx != nused) { vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo); return (-1); } else { return (0); } } void vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr, size_t len) { char *dst; int idx; dst = kaddr; idx = 0; while (len > 0) { bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len); len -= copyinfo[idx].len; dst += copyinfo[idx].len; idx++; } } void vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, struct vm_copyinfo *copyinfo, size_t len) { const char *src; int idx; src = kaddr; idx = 0; while (len > 0) { bcopy(src, copyinfo[idx].hva, copyinfo[idx].len); len -= copyinfo[idx].len; src += copyinfo[idx].len; idx++; } } /* * Return the amount of in-use and wired memory for the VM. Since * these are global stats, only return the values with for vCPU 0 */ VMM_STAT_DECLARE(VMM_MEM_RESIDENT); VMM_STAT_DECLARE(VMM_MEM_WIRED); static void vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat) { if (vcpu == 0) { vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT, PAGE_SIZE * vmspace_resident_count(vm->vmspace)); } } static void vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat) { if (vcpu == 0) { vmm_stat_set(vm, vcpu, VMM_MEM_WIRED, PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace))); } } VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt); VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt); Index: user/ae/inet6/sys/amd64/vmm =================================================================== --- user/ae/inet6/sys/amd64/vmm (revision 271452) +++ user/ae/inet6/sys/amd64/vmm (revision 271453) Property changes on: user/ae/inet6/sys/amd64/vmm ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/sys/amd64/vmm:r271428-271452 Index: user/ae/inet6/sys/arm/altera/socfpga/files.socfpga =================================================================== --- user/ae/inet6/sys/arm/altera/socfpga/files.socfpga (revision 271452) +++ user/ae/inet6/sys/arm/altera/socfpga/files.socfpga (revision 271453) @@ -1,18 +1,19 @@ # $FreeBSD$ kern/kern_clocksource.c standard arm/arm/bus_space_generic.c standard arm/arm/bus_space_asm_generic.S standard arm/arm/cpufunc_asm_armv5.S standard arm/arm/cpufunc_asm_arm10.S standard arm/arm/cpufunc_asm_arm11.S standard arm/arm/cpufunc_asm_armv7.S standard arm/arm/bus_space-v6.c standard arm/arm/gic.c standard arm/arm/mpcore_timer.c standard arm/altera/socfpga/socfpga_common.c standard arm/altera/socfpga/socfpga_machdep.c standard arm/altera/socfpga/socfpga_manager.c standard +arm/altera/socfpga/socfpga_rstmgr.c standard Index: user/ae/inet6/sys/arm/altera/socfpga/socfpga_common.c =================================================================== --- user/ae/inet6/sys/arm/altera/socfpga/socfpga_common.c (revision 271452) +++ user/ae/inet6/sys/arm/altera/socfpga/socfpga_common.c (revision 271453) @@ -1,83 +1,94 @@ /*- * Copyright (c) 2014 Ruslan Bukin * All rights reserved. * * This software was developed by SRI International and the University of * Cambridge Computer Laboratory under DARPA/AFRL contract (FA8750-10-C-0237) * ("CTSRD"), as part of the DARPA CRASH research programme. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include -#define RESMAN_BASE 0xFFD05000 -#define RESMAN_CTRL 0x4 -#define SWWARMRSTREQ (1 << 1) +#include void cpu_reset(void) { + uint32_t addr, paddr; bus_addr_t vaddr; + phandle_t node; - if (bus_space_map(fdtbus_bs_tag, RESMAN_BASE, 0x10, 0, &vaddr) == 0) { - bus_space_write_4(fdtbus_bs_tag, vaddr, - RESMAN_CTRL, SWWARMRSTREQ); + if (rstmgr_warmreset() == 0) + goto end; + + node = OF_finddevice("rstmgr"); + if (node == -1) + goto end; + + if ((OF_getprop(node, "reg", &paddr, sizeof(paddr))) > 0) { + addr = fdt32_to_cpu(paddr); + if (bus_space_map(fdtbus_bs_tag, addr, 0x8, 0, &vaddr) == 0) { + bus_space_write_4(fdtbus_bs_tag, vaddr, + RSTMGR_CTRL, CTRL_SWWARMRSTREQ); + } } +end: while (1); } struct fdt_fixup_entry fdt_fixup_table[] = { { NULL, NULL } }; static int fdt_pic_decode_ic(phandle_t node, pcell_t *intr, int *interrupt, int *trig, int *pol) { if (!fdt_is_compatible(node, "arm,gic")) return (ENXIO); *interrupt = fdt32_to_cpu(intr[0]); *trig = INTR_TRIGGER_CONFORM; *pol = INTR_POLARITY_CONFORM; return (0); } fdt_pic_decode_t fdt_pic_table[] = { &fdt_pic_decode_ic, NULL }; Index: user/ae/inet6/sys/arm/altera/socfpga/socfpga_l3regs.h =================================================================== --- user/ae/inet6/sys/arm/altera/socfpga/socfpga_l3regs.h (nonexistent) +++ user/ae/inet6/sys/arm/altera/socfpga/socfpga_l3regs.h (revision 271453) @@ -0,0 +1,54 @@ +/*- + * Copyright (c) 2014 Ruslan Bukin + * All rights reserved. + * + * This software was developed by SRI International and the University of + * Cambridge Computer Laboratory under DARPA/AFRL contract (FA8750-10-C-0237) + * ("CTSRD"), as part of the DARPA CRASH research programme. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#define L3REGS_REMAP 0x0 /* Remap */ +#define REMAP_LWHPS2FPGA (1 << 4) +#define REMAP_HPS2FPGA (1 << 3) +#define REMAP_MPUZERO (1 << 0) +#define L3REGS_L4MAIN 0x8 /* L4 main peripherals security */ +#define L3REGS_L4SP 0xC /* L4 SP Peripherals Security */ +#define L3REGS_L4MP 0x10 /* L4 MP Peripherals Security */ +#define L3REGS_L4OSC1 0x14 /* L4 OSC1 Peripherals Security */ +#define L3REGS_L4SPIM 0x18 /* L4 SPIM Peripherals Security */ +#define L3REGS_STM 0x1C /* STM Peripheral Security */ +#define L3REGS_LWHPS2FPGAREGS 0x20 /* LWHPS2FPGA AXI Bridge Security */ +#define L3REGS_USB1 0x28 /* USB1 Peripheral Security */ +#define L3REGS_NANDDATA 0x2C /* NAND Flash Controller Data Sec */ +#define L3REGS_USB0 0x80 /* USB0 Peripheral Security */ +#define L3REGS_NANDREGS 0x84 /* NAND Flash Controller Security */ +#define L3REGS_QSPIDATA 0x88 /* QSPI Flash Controller Data Sec */ +#define L3REGS_FPGAMGRDATA 0x8C /* FPGA Manager Data Peripheral Sec */ +#define L3REGS_HPS2FPGAREGS 0x90 /* HPS2FPGA AXI Bridge Perip. Sec */ +#define L3REGS_ACP 0x94 /* MPU ACP Peripheral Security */ +#define L3REGS_ROM 0x98 /* ROM Peripheral Security */ +#define L3REGS_OCRAM 0x9C /* On-chip RAM Peripheral Security */ +#define L3REGS_SDRDATA 0xA0 /* SDRAM Data Peripheral Security */ Property changes on: user/ae/inet6/sys/arm/altera/socfpga/socfpga_l3regs.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: user/ae/inet6/sys/arm/altera/socfpga/socfpga_rstmgr.c =================================================================== --- user/ae/inet6/sys/arm/altera/socfpga/socfpga_rstmgr.c (nonexistent) +++ user/ae/inet6/sys/arm/altera/socfpga/socfpga_rstmgr.c (revision 271453) @@ -0,0 +1,259 @@ +/*- + * Copyright (c) 2014 Ruslan Bukin + * All rights reserved. + * + * This software was developed by SRI International and the University of + * Cambridge Computer Laboratory under DARPA/AFRL contract (FA8750-10-C-0237) + * ("CTSRD"), as part of the DARPA CRASH research programme. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * SOCFPGA Reset Manager. + * Chapter 3, Cyclone V Device Handbook (CV-5V2 2014.07.22) + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +struct rstmgr_softc { + struct resource *res[1]; + bus_space_tag_t bst; + bus_space_handle_t bsh; + device_t dev; +}; + +struct rstmgr_softc *rstmgr_sc; + +static struct resource_spec rstmgr_spec[] = { + { SYS_RES_MEMORY, 0, RF_ACTIVE }, + { -1, 0 } +}; + +enum { + RSTMGR_SYSCTL_FPGA2HPS, + RSTMGR_SYSCTL_LWHPS2FPGA, + RSTMGR_SYSCTL_HPS2FPGA +}; + +static int +l3remap(struct rstmgr_softc *sc, int remap, int enable) +{ + uint32_t addr, paddr; + bus_addr_t vaddr; + phandle_t node; + int reg; + + /* + * Control whether bridge is visible to L3 masters or not. + * Register is write-only. + */ + + reg = REMAP_MPUZERO; + if (enable) + reg |= (remap); + else + reg &= ~(remap); + + node = OF_finddevice("l3regs"); + if (node == -1) { + device_printf(sc->dev, "Can't find l3regs node\n"); + return (1); + } + + if ((OF_getprop(node, "reg", &paddr, sizeof(paddr))) > 0) { + addr = fdt32_to_cpu(paddr); + if (bus_space_map(fdtbus_bs_tag, addr, 0x4, 0, &vaddr) == 0) { + bus_space_write_4(fdtbus_bs_tag, vaddr, + L3REGS_REMAP, reg); + return (0); + } + } + + return (1); +} + +static int +rstmgr_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct rstmgr_softc *sc; + int enable; + int remap; + int err; + int reg; + int bit; + + sc = arg1; + + switch (arg2) { + case RSTMGR_SYSCTL_FPGA2HPS: + bit = BRGMODRST_FPGA2HPS; + remap = 0; + break; + case RSTMGR_SYSCTL_LWHPS2FPGA: + bit = BRGMODRST_LWHPS2FPGA; + remap = REMAP_LWHPS2FPGA; + break; + case RSTMGR_SYSCTL_HPS2FPGA: + bit = BRGMODRST_HPS2FPGA; + remap = REMAP_HPS2FPGA; + break; + default: + return (1); + }; + + reg = READ4(sc, RSTMGR_BRGMODRST); + enable = reg & bit ? 0 : 1; + + err = sysctl_handle_int(oidp, &enable, 0, req); + if (err || !req->newptr) + return (err); + + if (enable == 1) + reg &= ~(bit); + else if (enable == 0) + reg |= (bit); + else + return (EINVAL); + + WRITE4(sc, RSTMGR_BRGMODRST, reg); + l3remap(sc, remap, enable); + + return (0); +} + +int +rstmgr_warmreset(void) +{ + struct rstmgr_softc *sc; + + sc = rstmgr_sc; + if (sc == NULL) + return (1); + + /* Request warm reset */ + WRITE4(sc, RSTMGR_CTRL, + CTRL_SWWARMRSTREQ); + + return (0); +} + +static int +rstmgr_add_sysctl(struct rstmgr_softc *sc) +{ + struct sysctl_oid_list *children; + struct sysctl_ctx_list *ctx; + + ctx = device_get_sysctl_ctx(sc->dev); + children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev)); + + SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "fpga2hps", + CTLTYPE_UINT | CTLFLAG_RW, sc, RSTMGR_SYSCTL_FPGA2HPS, + rstmgr_sysctl, "I", "Enable fpga2hps bridge"); + SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "lwhps2fpga", + CTLTYPE_UINT | CTLFLAG_RW, sc, RSTMGR_SYSCTL_LWHPS2FPGA, + rstmgr_sysctl, "I", "Enable lwhps2fpga bridge"); + SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "hps2fpga", + CTLTYPE_UINT | CTLFLAG_RW, sc, RSTMGR_SYSCTL_HPS2FPGA, + rstmgr_sysctl, "I", "Enable hps2fpga bridge"); + + return (0); +} + +static int +rstmgr_probe(device_t dev) +{ + + if (!ofw_bus_status_okay(dev)) + return (ENXIO); + + if (!ofw_bus_is_compatible(dev, "altr,rst-mgr")) + return (ENXIO); + + device_set_desc(dev, "Reset Manager"); + return (BUS_PROBE_DEFAULT); +} + +static int +rstmgr_attach(device_t dev) +{ + struct rstmgr_softc *sc; + + sc = device_get_softc(dev); + sc->dev = dev; + + if (bus_alloc_resources(dev, rstmgr_spec, sc->res)) { + device_printf(dev, "could not allocate resources\n"); + return (ENXIO); + } + + /* Memory interface */ + sc->bst = rman_get_bustag(sc->res[0]); + sc->bsh = rman_get_bushandle(sc->res[0]); + + rstmgr_sc = sc; + rstmgr_add_sysctl(sc); + + return (0); +} + +static device_method_t rstmgr_methods[] = { + DEVMETHOD(device_probe, rstmgr_probe), + DEVMETHOD(device_attach, rstmgr_attach), + { 0, 0 } +}; + +static driver_t rstmgr_driver = { + "rstmgr", + rstmgr_methods, + sizeof(struct rstmgr_softc), +}; + +static devclass_t rstmgr_devclass; + +DRIVER_MODULE(rstmgr, simplebus, rstmgr_driver, rstmgr_devclass, 0, 0); Property changes on: user/ae/inet6/sys/arm/altera/socfpga/socfpga_rstmgr.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: user/ae/inet6/sys/arm/altera/socfpga/socfpga_rstmgr.h =================================================================== --- user/ae/inet6/sys/arm/altera/socfpga/socfpga_rstmgr.h (nonexistent) +++ user/ae/inet6/sys/arm/altera/socfpga/socfpga_rstmgr.h (revision 271453) @@ -0,0 +1,46 @@ +/*- + * Copyright (c) 2014 Ruslan Bukin + * All rights reserved. + * + * This software was developed by SRI International and the University of + * Cambridge Computer Laboratory under DARPA/AFRL contract (FA8750-10-C-0237) + * ("CTSRD"), as part of the DARPA CRASH research programme. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#define RSTMGR_STAT 0x0 /* Status */ +#define RSTMGR_CTRL 0x4 /* Control */ +#define CTRL_SWWARMRSTREQ (1 << 1) /* Trigger warm reset */ +#define RSTMGR_COUNTS 0x8 /* Reset Cycles Count */ +#define RSTMGR_MPUMODRST 0x10 /* MPU Module Reset */ +#define RSTMGR_PERMODRST 0x14 /* Peripheral Module Reset */ +#define RSTMGR_PER2MODRST 0x18 /* Peripheral 2 Module Reset */ +#define RSTMGR_BRGMODRST 0x1C /* Bridge Module Reset */ +#define BRGMODRST_FPGA2HPS (1 << 2) +#define BRGMODRST_LWHPS2FPGA (1 << 1) +#define BRGMODRST_HPS2FPGA (1 << 0) +#define RSTMGR_MISCMODRST 0x20 /* Miscellaneous Module Reset */ + +int rstmgr_warmreset(void); Property changes on: user/ae/inet6/sys/arm/altera/socfpga/socfpga_rstmgr.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: user/ae/inet6/sys/boot/fdt/dts/arm/socfpga.dtsi =================================================================== --- user/ae/inet6/sys/boot/fdt/dts/arm/socfpga.dtsi (revision 271452) +++ user/ae/inet6/sys/boot/fdt/dts/arm/socfpga.dtsi (revision 271453) @@ -1,119 +1,131 @@ /*- * Copyright (c) 2014 Ruslan Bukin * All rights reserved. * * This software was developed by SRI International and the University of * Cambridge Computer Laboratory under DARPA/AFRL contract (FA8750-10-C-0237) * ("CTSRD"), as part of the DARPA CRASH research programme. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ / { compatible = "altr,socfpga"; #address-cells = <1>; #size-cells = <1>; interrupt-parent = <&GIC>; aliases { soc = &SOC; + rstmgr = &rstmgr; + l3regs = &l3regs; serial0 = &serial0; serial1 = &serial1; }; SOC: socfpga { #address-cells = <1>; #size-cells = <1>; compatible = "simple-bus"; ranges; bus-frequency = <0>; GIC: interrupt-controller@fffed000 { compatible = "arm,gic"; reg = < 0xfffed000 0x1000 >, /* Distributor */ < 0xfffec100 0x100 >; /* CPU Interface */ interrupt-controller; #interrupt-cells = <1>; }; mp_tmr@40002100 { compatible = "arm,mpcore-timers"; clock-frequency = <200000000>; #address-cells = <1>; #size-cells = <0>; reg = < 0xfffec200 0x100 >, /* Global Timer */ < 0xfffec600 0x100 >; /* Private Timer */ interrupts = < 27 29 >; interrupt-parent = < &GIC >; + }; + + rstmgr: rstmgr@ffd05000 { + compatible = "altr,rst-mgr"; + reg = <0xffd05000 0x1000>; + }; + + l3regs: l3regs@ff800000 { + compatible = "altr,l3regs"; + reg = <0xff800000 0x1000>; }; fpgamgr: fpgamgr@ff706000 { compatible = "altr,fpga-mgr"; reg = <0xff706000 0x1000>, /* FPGAMGRREGS */ <0xffb90000 0x1000>; /* FPGAMGRDATA */ interrupts = < 207 >; interrupt-parent = <&GIC>; }; serial0: serial@ffc02000 { compatible = "ns16550"; reg = <0xffc02000 0x1000>; reg-shift = <2>; interrupts = <194>; interrupt-parent = <&GIC>; current-speed = <115200>; clock-frequency = < 100000000 >; status = "disabled"; }; serial1: serial@ffc03000 { compatible = "ns16550"; reg = <0xffc03000 0x1000>; reg-shift = <2>; interrupts = <195>; interrupt-parent = <&GIC>; current-speed = <115200>; clock-frequency = < 100000000 >; status = "disabled"; }; usb0: usb@ffb00000 { compatible = "synopsys,designware-hs-otg2"; reg = <0xffb00000 0xffff>; interrupts = <157>; interrupt-parent = <&GIC>; status = "disabled"; }; usb1: usb@ffb40000 { compatible = "synopsys,designware-hs-otg2"; reg = <0xffb40000 0xffff>; interrupts = <160>; interrupt-parent = <&GIC>; dr_mode = "host"; status = "disabled"; }; }; }; Index: user/ae/inet6/sys/boot =================================================================== --- user/ae/inet6/sys/boot (revision 271452) +++ user/ae/inet6/sys/boot (revision 271453) Property changes on: user/ae/inet6/sys/boot ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/sys/boot:r271428-271452 Index: user/ae/inet6/sys/cam/ctl/ctl.c =================================================================== --- user/ae/inet6/sys/cam/ctl/ctl.c (revision 271452) +++ user/ae/inet6/sys/cam/ctl/ctl.c (revision 271453) @@ -1,14266 +1,14323 @@ /*- * Copyright (c) 2003-2009 Silicon Graphics International Corp. * Copyright (c) 2012 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by Edward Tomasz Napierala * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. * * $Id: //depot/users/kenm/FreeBSD-test2/sys/cam/ctl/ctl.c#8 $ */ /* * CAM Target Layer, a SCSI device emulation subsystem. * * Author: Ken Merry */ #define _CTL_C #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct ctl_softc *control_softc = NULL; /* * Size and alignment macros needed for Copan-specific HA hardware. These * can go away when the HA code is re-written, and uses busdma for any * hardware. */ #define CTL_ALIGN_8B(target, source, type) \ if (((uint32_t)source & 0x7) != 0) \ target = (type)(source + (0x8 - ((uint32_t)source & 0x7)));\ else \ target = (type)source; #define CTL_SIZE_8B(target, size) \ if ((size & 0x7) != 0) \ target = size + (0x8 - (size & 0x7)); \ else \ target = size; #define CTL_ALIGN_8B_MARGIN 16 /* * Template mode pages. */ /* * Note that these are default values only. The actual values will be * filled in when the user does a mode sense. */ static struct copan_power_subpage power_page_default = { /*page_code*/ PWR_PAGE_CODE | SMPH_SPF, /*subpage*/ PWR_SUBPAGE_CODE, /*page_length*/ {(sizeof(struct copan_power_subpage) - 4) & 0xff00, (sizeof(struct copan_power_subpage) - 4) & 0x00ff}, /*page_version*/ PWR_VERSION, /* total_luns */ 26, /* max_active_luns*/ PWR_DFLT_MAX_LUNS, /*reserved*/ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} }; static struct copan_power_subpage power_page_changeable = { /*page_code*/ PWR_PAGE_CODE | SMPH_SPF, /*subpage*/ PWR_SUBPAGE_CODE, /*page_length*/ {(sizeof(struct copan_power_subpage) - 4) & 0xff00, (sizeof(struct copan_power_subpage) - 4) & 0x00ff}, /*page_version*/ 0, /* total_luns */ 0, /* max_active_luns*/ 0, /*reserved*/ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} }; static struct copan_aps_subpage aps_page_default = { APS_PAGE_CODE | SMPH_SPF, //page_code APS_SUBPAGE_CODE, //subpage {(sizeof(struct copan_aps_subpage) - 4) & 0xff00, (sizeof(struct copan_aps_subpage) - 4) & 0x00ff}, //page_length APS_VERSION, //page_version 0, //lock_active {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} //reserved }; static struct copan_aps_subpage aps_page_changeable = { APS_PAGE_CODE | SMPH_SPF, //page_code APS_SUBPAGE_CODE, //subpage {(sizeof(struct copan_aps_subpage) - 4) & 0xff00, (sizeof(struct copan_aps_subpage) - 4) & 0x00ff}, //page_length 0, //page_version 0, //lock_active {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} //reserved }; static struct copan_debugconf_subpage debugconf_page_default = { DBGCNF_PAGE_CODE | SMPH_SPF, /* page_code */ DBGCNF_SUBPAGE_CODE, /* subpage */ {(sizeof(struct copan_debugconf_subpage) - 4) >> 8, (sizeof(struct copan_debugconf_subpage) - 4) >> 0}, /* page_length */ DBGCNF_VERSION, /* page_version */ {CTL_TIME_IO_DEFAULT_SECS>>8, CTL_TIME_IO_DEFAULT_SECS>>0}, /* ctl_time_io_secs */ }; static struct copan_debugconf_subpage debugconf_page_changeable = { DBGCNF_PAGE_CODE | SMPH_SPF, /* page_code */ DBGCNF_SUBPAGE_CODE, /* subpage */ {(sizeof(struct copan_debugconf_subpage) - 4) >> 8, (sizeof(struct copan_debugconf_subpage) - 4) >> 0}, /* page_length */ 0, /* page_version */ {0xff,0xff}, /* ctl_time_io_secs */ }; static struct scsi_format_page format_page_default = { /*page_code*/SMS_FORMAT_DEVICE_PAGE, /*page_length*/sizeof(struct scsi_format_page) - 2, /*tracks_per_zone*/ {0, 0}, /*alt_sectors_per_zone*/ {0, 0}, /*alt_tracks_per_zone*/ {0, 0}, /*alt_tracks_per_lun*/ {0, 0}, /*sectors_per_track*/ {(CTL_DEFAULT_SECTORS_PER_TRACK >> 8) & 0xff, CTL_DEFAULT_SECTORS_PER_TRACK & 0xff}, /*bytes_per_sector*/ {0, 0}, /*interleave*/ {0, 0}, /*track_skew*/ {0, 0}, /*cylinder_skew*/ {0, 0}, /*flags*/ SFP_HSEC, /*reserved*/ {0, 0, 0} }; static struct scsi_format_page format_page_changeable = { /*page_code*/SMS_FORMAT_DEVICE_PAGE, /*page_length*/sizeof(struct scsi_format_page) - 2, /*tracks_per_zone*/ {0, 0}, /*alt_sectors_per_zone*/ {0, 0}, /*alt_tracks_per_zone*/ {0, 0}, /*alt_tracks_per_lun*/ {0, 0}, /*sectors_per_track*/ {0, 0}, /*bytes_per_sector*/ {0, 0}, /*interleave*/ {0, 0}, /*track_skew*/ {0, 0}, /*cylinder_skew*/ {0, 0}, /*flags*/ 0, /*reserved*/ {0, 0, 0} }; static struct scsi_rigid_disk_page rigid_disk_page_default = { /*page_code*/SMS_RIGID_DISK_PAGE, /*page_length*/sizeof(struct scsi_rigid_disk_page) - 2, /*cylinders*/ {0, 0, 0}, /*heads*/ CTL_DEFAULT_HEADS, /*start_write_precomp*/ {0, 0, 0}, /*start_reduced_current*/ {0, 0, 0}, /*step_rate*/ {0, 0}, /*landing_zone_cylinder*/ {0, 0, 0}, /*rpl*/ SRDP_RPL_DISABLED, /*rotational_offset*/ 0, /*reserved1*/ 0, /*rotation_rate*/ {(CTL_DEFAULT_ROTATION_RATE >> 8) & 0xff, CTL_DEFAULT_ROTATION_RATE & 0xff}, /*reserved2*/ {0, 0} }; static struct scsi_rigid_disk_page rigid_disk_page_changeable = { /*page_code*/SMS_RIGID_DISK_PAGE, /*page_length*/sizeof(struct scsi_rigid_disk_page) - 2, /*cylinders*/ {0, 0, 0}, /*heads*/ 0, /*start_write_precomp*/ {0, 0, 0}, /*start_reduced_current*/ {0, 0, 0}, /*step_rate*/ {0, 0}, /*landing_zone_cylinder*/ {0, 0, 0}, /*rpl*/ 0, /*rotational_offset*/ 0, /*reserved1*/ 0, /*rotation_rate*/ {0, 0}, /*reserved2*/ {0, 0} }; static struct scsi_caching_page caching_page_default = { /*page_code*/SMS_CACHING_PAGE, /*page_length*/sizeof(struct scsi_caching_page) - 2, /*flags1*/ SCP_DISC | SCP_WCE, /*ret_priority*/ 0, /*disable_pf_transfer_len*/ {0xff, 0xff}, /*min_prefetch*/ {0, 0}, /*max_prefetch*/ {0xff, 0xff}, /*max_pf_ceiling*/ {0xff, 0xff}, /*flags2*/ 0, /*cache_segments*/ 0, /*cache_seg_size*/ {0, 0}, /*reserved*/ 0, /*non_cache_seg_size*/ {0, 0, 0} }; static struct scsi_caching_page caching_page_changeable = { /*page_code*/SMS_CACHING_PAGE, /*page_length*/sizeof(struct scsi_caching_page) - 2, /*flags1*/ SCP_WCE | SCP_RCD, /*ret_priority*/ 0, /*disable_pf_transfer_len*/ {0, 0}, /*min_prefetch*/ {0, 0}, /*max_prefetch*/ {0, 0}, /*max_pf_ceiling*/ {0, 0}, /*flags2*/ 0, /*cache_segments*/ 0, /*cache_seg_size*/ {0, 0}, /*reserved*/ 0, /*non_cache_seg_size*/ {0, 0, 0} }; static struct scsi_control_page control_page_default = { /*page_code*/SMS_CONTROL_MODE_PAGE, /*page_length*/sizeof(struct scsi_control_page) - 2, /*rlec*/0, /*queue_flags*/0, /*eca_and_aen*/0, /*flags4*/SCP_TAS, /*aen_holdoff_period*/{0, 0}, /*busy_timeout_period*/{0, 0}, /*extended_selftest_completion_time*/{0, 0} }; static struct scsi_control_page control_page_changeable = { /*page_code*/SMS_CONTROL_MODE_PAGE, /*page_length*/sizeof(struct scsi_control_page) - 2, /*rlec*/SCP_DSENSE, /*queue_flags*/0, /*eca_and_aen*/0, /*flags4*/0, /*aen_holdoff_period*/{0, 0}, /*busy_timeout_period*/{0, 0}, /*extended_selftest_completion_time*/{0, 0} }; /* * XXX KDM move these into the softc. */ static int rcv_sync_msg; static int persis_offset; static uint8_t ctl_pause_rtr; static int ctl_is_single = 1; static int index_to_aps_page; SYSCTL_NODE(_kern_cam, OID_AUTO, ctl, CTLFLAG_RD, 0, "CAM Target Layer"); static int worker_threads = -1; SYSCTL_INT(_kern_cam_ctl, OID_AUTO, worker_threads, CTLFLAG_RDTUN, &worker_threads, 1, "Number of worker threads"); static int verbose = 0; SYSCTL_INT(_kern_cam_ctl, OID_AUTO, verbose, CTLFLAG_RWTUN, &verbose, 0, "Show SCSI errors returned to initiator"); /* * Supported pages (0x00), Serial number (0x80), Device ID (0x83), - * Mode Page Policy (0x87), + * Extended INQUIRY Data (0x86), Mode Page Policy (0x87), * SCSI Ports (0x88), Third-party Copy (0x8F), Block limits (0xB0), * Block Device Characteristics (0xB1) and Logical Block Provisioning (0xB2) */ -#define SCSI_EVPD_NUM_SUPPORTED_PAGES 9 +#define SCSI_EVPD_NUM_SUPPORTED_PAGES 10 static void ctl_isc_event_handler(ctl_ha_channel chanel, ctl_ha_event event, int param); static void ctl_copy_sense_data(union ctl_ha_msg *src, union ctl_io *dest); static int ctl_init(void); void ctl_shutdown(void); static int ctl_open(struct cdev *dev, int flags, int fmt, struct thread *td); static int ctl_close(struct cdev *dev, int flags, int fmt, struct thread *td); static void ctl_ioctl_online(void *arg); static void ctl_ioctl_offline(void *arg); static int ctl_ioctl_lun_enable(void *arg, struct ctl_id targ_id, int lun_id); static int ctl_ioctl_lun_disable(void *arg, struct ctl_id targ_id, int lun_id); static int ctl_ioctl_do_datamove(struct ctl_scsiio *ctsio); static int ctl_serialize_other_sc_cmd(struct ctl_scsiio *ctsio); static int ctl_ioctl_submit_wait(union ctl_io *io); static void ctl_ioctl_datamove(union ctl_io *io); static void ctl_ioctl_done(union ctl_io *io); static void ctl_ioctl_hard_startstop_callback(void *arg, struct cfi_metatask *metatask); static void ctl_ioctl_bbrread_callback(void *arg,struct cfi_metatask *metatask); static int ctl_ioctl_fill_ooa(struct ctl_lun *lun, uint32_t *cur_fill_num, struct ctl_ooa *ooa_hdr, struct ctl_ooa_entry *kern_entries); static int ctl_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td); static uint32_t ctl_map_lun(int port_num, uint32_t lun); static uint32_t ctl_map_lun_back(int port_num, uint32_t lun); #ifdef unused static union ctl_io *ctl_malloc_io(ctl_io_type io_type, uint32_t targ_port, uint32_t targ_target, uint32_t targ_lun, int can_wait); static void ctl_kfree_io(union ctl_io *io); #endif /* unused */ static int ctl_alloc_lun(struct ctl_softc *ctl_softc, struct ctl_lun *lun, struct ctl_be_lun *be_lun, struct ctl_id target_id); static int ctl_free_lun(struct ctl_lun *lun); static void ctl_create_lun(struct ctl_be_lun *be_lun); /** static void ctl_failover_change_pages(struct ctl_softc *softc, struct ctl_scsiio *ctsio, int master); **/ static int ctl_do_mode_select(union ctl_io *io); static int ctl_pro_preempt(struct ctl_softc *softc, struct ctl_lun *lun, uint64_t res_key, uint64_t sa_res_key, uint8_t type, uint32_t residx, struct ctl_scsiio *ctsio, struct scsi_per_res_out *cdb, struct scsi_per_res_out_parms* param); static void ctl_pro_preempt_other(struct ctl_lun *lun, union ctl_ha_msg *msg); static void ctl_hndl_per_res_out_on_other_sc(union ctl_ha_msg *msg); static int ctl_inquiry_evpd_supported(struct ctl_scsiio *ctsio, int alloc_len); static int ctl_inquiry_evpd_serial(struct ctl_scsiio *ctsio, int alloc_len); static int ctl_inquiry_evpd_devid(struct ctl_scsiio *ctsio, int alloc_len); +static int ctl_inquiry_evpd_eid(struct ctl_scsiio *ctsio, int alloc_len); static int ctl_inquiry_evpd_mpp(struct ctl_scsiio *ctsio, int alloc_len); static int ctl_inquiry_evpd_scsi_ports(struct ctl_scsiio *ctsio, int alloc_len); static int ctl_inquiry_evpd_block_limits(struct ctl_scsiio *ctsio, int alloc_len); static int ctl_inquiry_evpd_bdc(struct ctl_scsiio *ctsio, int alloc_len); static int ctl_inquiry_evpd_lbp(struct ctl_scsiio *ctsio, int alloc_len); static int ctl_inquiry_evpd(struct ctl_scsiio *ctsio); static int ctl_inquiry_std(struct ctl_scsiio *ctsio); static int ctl_get_lba_len(union ctl_io *io, uint64_t *lba, uint32_t *len); static ctl_action ctl_extent_check(union ctl_io *io1, union ctl_io *io2); static ctl_action ctl_check_for_blockage(union ctl_io *pending_io, union ctl_io *ooa_io); static ctl_action ctl_check_ooa(struct ctl_lun *lun, union ctl_io *pending_io, union ctl_io *starting_io); static int ctl_check_blocked(struct ctl_lun *lun); static int ctl_scsiio_lun_check(struct ctl_softc *ctl_softc, struct ctl_lun *lun, const struct ctl_cmd_entry *entry, struct ctl_scsiio *ctsio); //static int ctl_check_rtr(union ctl_io *pending_io, struct ctl_softc *softc); static void ctl_failover(void); static int ctl_scsiio_precheck(struct ctl_softc *ctl_softc, struct ctl_scsiio *ctsio); static int ctl_scsiio(struct ctl_scsiio *ctsio); static int ctl_bus_reset(struct ctl_softc *ctl_softc, union ctl_io *io); static int ctl_target_reset(struct ctl_softc *ctl_softc, union ctl_io *io, ctl_ua_type ua_type); static int ctl_lun_reset(struct ctl_lun *lun, union ctl_io *io, ctl_ua_type ua_type); static int ctl_abort_task(union ctl_io *io); static int ctl_abort_task_set(union ctl_io *io); static int ctl_i_t_nexus_reset(union ctl_io *io); static void ctl_run_task(union ctl_io *io); #ifdef CTL_IO_DELAY static void ctl_datamove_timer_wakeup(void *arg); static void ctl_done_timer_wakeup(void *arg); #endif /* CTL_IO_DELAY */ static void ctl_send_datamove_done(union ctl_io *io, int have_lock); static void ctl_datamove_remote_write_cb(struct ctl_ha_dt_req *rq); static int ctl_datamove_remote_dm_write_cb(union ctl_io *io); static void ctl_datamove_remote_write(union ctl_io *io); static int ctl_datamove_remote_dm_read_cb(union ctl_io *io); static void ctl_datamove_remote_read_cb(struct ctl_ha_dt_req *rq); static int ctl_datamove_remote_sgl_setup(union ctl_io *io); static int ctl_datamove_remote_xfer(union ctl_io *io, unsigned command, ctl_ha_dt_cb callback); static void ctl_datamove_remote_read(union ctl_io *io); static void ctl_datamove_remote(union ctl_io *io); static int ctl_process_done(union ctl_io *io); static void ctl_lun_thread(void *arg); static void ctl_work_thread(void *arg); static void ctl_enqueue_incoming(union ctl_io *io); static void ctl_enqueue_rtr(union ctl_io *io); static void ctl_enqueue_done(union ctl_io *io); static void ctl_enqueue_isc(union ctl_io *io); static const struct ctl_cmd_entry * ctl_get_cmd_entry(struct ctl_scsiio *ctsio); static const struct ctl_cmd_entry * ctl_validate_command(struct ctl_scsiio *ctsio); static int ctl_cmd_applicable(uint8_t lun_type, const struct ctl_cmd_entry *entry); /* * Load the serialization table. This isn't very pretty, but is probably * the easiest way to do it. */ #include "ctl_ser_table.c" /* * We only need to define open, close and ioctl routines for this driver. */ static struct cdevsw ctl_cdevsw = { .d_version = D_VERSION, .d_flags = 0, .d_open = ctl_open, .d_close = ctl_close, .d_ioctl = ctl_ioctl, .d_name = "ctl", }; MALLOC_DEFINE(M_CTL, "ctlmem", "Memory used for CTL"); MALLOC_DEFINE(M_CTLIO, "ctlio", "Memory used for CTL requests"); static int ctl_module_event_handler(module_t, int /*modeventtype_t*/, void *); static moduledata_t ctl_moduledata = { "ctl", ctl_module_event_handler, NULL }; DECLARE_MODULE(ctl, ctl_moduledata, SI_SUB_CONFIGURE, SI_ORDER_THIRD); MODULE_VERSION(ctl, 1); static struct ctl_frontend ioctl_frontend = { .name = "ioctl", }; static void ctl_isc_handler_finish_xfer(struct ctl_softc *ctl_softc, union ctl_ha_msg *msg_info) { struct ctl_scsiio *ctsio; if (msg_info->hdr.original_sc == NULL) { printf("%s: original_sc == NULL!\n", __func__); /* XXX KDM now what? */ return; } ctsio = &msg_info->hdr.original_sc->scsiio; ctsio->io_hdr.flags |= CTL_FLAG_IO_ACTIVE; ctsio->io_hdr.msg_type = CTL_MSG_FINISH_IO; ctsio->io_hdr.status = msg_info->hdr.status; ctsio->scsi_status = msg_info->scsi.scsi_status; ctsio->sense_len = msg_info->scsi.sense_len; ctsio->sense_residual = msg_info->scsi.sense_residual; ctsio->residual = msg_info->scsi.residual; memcpy(&ctsio->sense_data, &msg_info->scsi.sense_data, sizeof(ctsio->sense_data)); memcpy(&ctsio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN].bytes, &msg_info->scsi.lbalen, sizeof(msg_info->scsi.lbalen)); ctl_enqueue_isc((union ctl_io *)ctsio); } static void ctl_isc_handler_finish_ser_only(struct ctl_softc *ctl_softc, union ctl_ha_msg *msg_info) { struct ctl_scsiio *ctsio; if (msg_info->hdr.serializing_sc == NULL) { printf("%s: serializing_sc == NULL!\n", __func__); /* XXX KDM now what? */ return; } ctsio = &msg_info->hdr.serializing_sc->scsiio; #if 0 /* * Attempt to catch the situation where an I/O has * been freed, and we're using it again. */ if (ctsio->io_hdr.io_type == 0xff) { union ctl_io *tmp_io; tmp_io = (union ctl_io *)ctsio; printf("%s: %p use after free!\n", __func__, ctsio); printf("%s: type %d msg %d cdb %x iptl: " "%d:%d:%d:%d tag 0x%04x " "flag %#x status %x\n", __func__, tmp_io->io_hdr.io_type, tmp_io->io_hdr.msg_type, tmp_io->scsiio.cdb[0], tmp_io->io_hdr.nexus.initid.id, tmp_io->io_hdr.nexus.targ_port, tmp_io->io_hdr.nexus.targ_target.id, tmp_io->io_hdr.nexus.targ_lun, (tmp_io->io_hdr.io_type == CTL_IO_TASK) ? tmp_io->taskio.tag_num : tmp_io->scsiio.tag_num, tmp_io->io_hdr.flags, tmp_io->io_hdr.status); } #endif ctsio->io_hdr.msg_type = CTL_MSG_FINISH_IO; ctl_enqueue_isc((union ctl_io *)ctsio); } /* * ISC (Inter Shelf Communication) event handler. Events from the HA * subsystem come in here. */ static void ctl_isc_event_handler(ctl_ha_channel channel, ctl_ha_event event, int param) { struct ctl_softc *ctl_softc; union ctl_io *io; struct ctl_prio *presio; ctl_ha_status isc_status; ctl_softc = control_softc; io = NULL; #if 0 printf("CTL: Isc Msg event %d\n", event); #endif if (event == CTL_HA_EVT_MSG_RECV) { union ctl_ha_msg msg_info; isc_status = ctl_ha_msg_recv(CTL_HA_CHAN_CTL, &msg_info, sizeof(msg_info), /*wait*/ 0); #if 0 printf("CTL: msg_type %d\n", msg_info.msg_type); #endif if (isc_status != 0) { printf("Error receiving message, status = %d\n", isc_status); return; } switch (msg_info.hdr.msg_type) { case CTL_MSG_SERIALIZE: #if 0 printf("Serialize\n"); #endif io = ctl_alloc_io((void *)ctl_softc->othersc_pool); if (io == NULL) { printf("ctl_isc_event_handler: can't allocate " "ctl_io!\n"); /* Bad Juju */ /* Need to set busy and send msg back */ msg_info.hdr.msg_type = CTL_MSG_BAD_JUJU; msg_info.hdr.status = CTL_SCSI_ERROR; msg_info.scsi.scsi_status = SCSI_STATUS_BUSY; msg_info.scsi.sense_len = 0; if (ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info, sizeof(msg_info), 0) > CTL_HA_STATUS_SUCCESS){ } goto bailout; } ctl_zero_io(io); // populate ctsio from msg_info io->io_hdr.io_type = CTL_IO_SCSI; io->io_hdr.msg_type = CTL_MSG_SERIALIZE; io->io_hdr.original_sc = msg_info.hdr.original_sc; #if 0 printf("pOrig %x\n", (int)msg_info.original_sc); #endif io->io_hdr.flags |= CTL_FLAG_FROM_OTHER_SC | CTL_FLAG_IO_ACTIVE; /* * If we're in serialization-only mode, we don't * want to go through full done processing. Thus * the COPY flag. * * XXX KDM add another flag that is more specific. */ if (ctl_softc->ha_mode == CTL_HA_MODE_SER_ONLY) io->io_hdr.flags |= CTL_FLAG_INT_COPY; io->io_hdr.nexus = msg_info.hdr.nexus; #if 0 printf("targ %d, port %d, iid %d, lun %d\n", io->io_hdr.nexus.targ_target.id, io->io_hdr.nexus.targ_port, io->io_hdr.nexus.initid.id, io->io_hdr.nexus.targ_lun); #endif io->scsiio.tag_num = msg_info.scsi.tag_num; io->scsiio.tag_type = msg_info.scsi.tag_type; memcpy(io->scsiio.cdb, msg_info.scsi.cdb, CTL_MAX_CDBLEN); if (ctl_softc->ha_mode == CTL_HA_MODE_XFER) { const struct ctl_cmd_entry *entry; entry = ctl_get_cmd_entry(&io->scsiio); io->io_hdr.flags &= ~CTL_FLAG_DATA_MASK; io->io_hdr.flags |= entry->flags & CTL_FLAG_DATA_MASK; } ctl_enqueue_isc(io); break; /* Performed on the Originating SC, XFER mode only */ case CTL_MSG_DATAMOVE: { struct ctl_sg_entry *sgl; int i, j; io = msg_info.hdr.original_sc; if (io == NULL) { printf("%s: original_sc == NULL!\n", __func__); /* XXX KDM do something here */ break; } io->io_hdr.msg_type = CTL_MSG_DATAMOVE; io->io_hdr.flags |= CTL_FLAG_IO_ACTIVE; /* * Keep track of this, we need to send it back over * when the datamove is complete. */ io->io_hdr.serializing_sc = msg_info.hdr.serializing_sc; if (msg_info.dt.sg_sequence == 0) { /* * XXX KDM we use the preallocated S/G list * here, but we'll need to change this to * dynamic allocation if we need larger S/G * lists. */ if (msg_info.dt.kern_sg_entries > sizeof(io->io_hdr.remote_sglist) / sizeof(io->io_hdr.remote_sglist[0])) { printf("%s: number of S/G entries " "needed %u > allocated num %zd\n", __func__, msg_info.dt.kern_sg_entries, sizeof(io->io_hdr.remote_sglist)/ sizeof(io->io_hdr.remote_sglist[0])); /* * XXX KDM send a message back to * the other side to shut down the * DMA. The error will come back * through via the normal channel. */ break; } sgl = io->io_hdr.remote_sglist; memset(sgl, 0, sizeof(io->io_hdr.remote_sglist)); io->scsiio.kern_data_ptr = (uint8_t *)sgl; io->scsiio.kern_sg_entries = msg_info.dt.kern_sg_entries; io->scsiio.rem_sg_entries = msg_info.dt.kern_sg_entries; io->scsiio.kern_data_len = msg_info.dt.kern_data_len; io->scsiio.kern_total_len = msg_info.dt.kern_total_len; io->scsiio.kern_data_resid = msg_info.dt.kern_data_resid; io->scsiio.kern_rel_offset = msg_info.dt.kern_rel_offset; /* * Clear out per-DMA flags. */ io->io_hdr.flags &= ~CTL_FLAG_RDMA_MASK; /* * Add per-DMA flags that are set for this * particular DMA request. */ io->io_hdr.flags |= msg_info.dt.flags & CTL_FLAG_RDMA_MASK; } else sgl = (struct ctl_sg_entry *) io->scsiio.kern_data_ptr; for (i = msg_info.dt.sent_sg_entries, j = 0; i < (msg_info.dt.sent_sg_entries + msg_info.dt.cur_sg_entries); i++, j++) { sgl[i].addr = msg_info.dt.sg_list[j].addr; sgl[i].len = msg_info.dt.sg_list[j].len; #if 0 printf("%s: L: %p,%d -> %p,%d j=%d, i=%d\n", __func__, msg_info.dt.sg_list[j].addr, msg_info.dt.sg_list[j].len, sgl[i].addr, sgl[i].len, j, i); #endif } #if 0 memcpy(&sgl[msg_info.dt.sent_sg_entries], msg_info.dt.sg_list, sizeof(*sgl) * msg_info.dt.cur_sg_entries); #endif /* * If this is the last piece of the I/O, we've got * the full S/G list. Queue processing in the thread. * Otherwise wait for the next piece. */ if (msg_info.dt.sg_last != 0) ctl_enqueue_isc(io); break; } /* Performed on the Serializing (primary) SC, XFER mode only */ case CTL_MSG_DATAMOVE_DONE: { if (msg_info.hdr.serializing_sc == NULL) { printf("%s: serializing_sc == NULL!\n", __func__); /* XXX KDM now what? */ break; } /* * We grab the sense information here in case * there was a failure, so we can return status * back to the initiator. */ io = msg_info.hdr.serializing_sc; io->io_hdr.msg_type = CTL_MSG_DATAMOVE_DONE; io->io_hdr.status = msg_info.hdr.status; io->scsiio.scsi_status = msg_info.scsi.scsi_status; io->scsiio.sense_len = msg_info.scsi.sense_len; io->scsiio.sense_residual =msg_info.scsi.sense_residual; io->io_hdr.port_status = msg_info.scsi.fetd_status; io->scsiio.residual = msg_info.scsi.residual; memcpy(&io->scsiio.sense_data,&msg_info.scsi.sense_data, sizeof(io->scsiio.sense_data)); ctl_enqueue_isc(io); break; } /* Preformed on Originating SC, SER_ONLY mode */ case CTL_MSG_R2R: io = msg_info.hdr.original_sc; if (io == NULL) { printf("%s: Major Bummer\n", __func__); return; } else { #if 0 printf("pOrig %x\n",(int) ctsio); #endif } io->io_hdr.msg_type = CTL_MSG_R2R; io->io_hdr.serializing_sc = msg_info.hdr.serializing_sc; ctl_enqueue_isc(io); break; /* * Performed on Serializing(i.e. primary SC) SC in SER_ONLY * mode. * Performed on the Originating (i.e. secondary) SC in XFER * mode */ case CTL_MSG_FINISH_IO: if (ctl_softc->ha_mode == CTL_HA_MODE_XFER) ctl_isc_handler_finish_xfer(ctl_softc, &msg_info); else ctl_isc_handler_finish_ser_only(ctl_softc, &msg_info); break; /* Preformed on Originating SC */ case CTL_MSG_BAD_JUJU: io = msg_info.hdr.original_sc; if (io == NULL) { printf("%s: Bad JUJU!, original_sc is NULL!\n", __func__); break; } ctl_copy_sense_data(&msg_info, io); /* * IO should have already been cleaned up on other * SC so clear this flag so we won't send a message * back to finish the IO there. */ io->io_hdr.flags &= ~CTL_FLAG_SENT_2OTHER_SC; io->io_hdr.flags |= CTL_FLAG_IO_ACTIVE; /* io = msg_info.hdr.serializing_sc; */ io->io_hdr.msg_type = CTL_MSG_BAD_JUJU; ctl_enqueue_isc(io); break; /* Handle resets sent from the other side */ case CTL_MSG_MANAGE_TASKS: { struct ctl_taskio *taskio; taskio = (struct ctl_taskio *)ctl_alloc_io( (void *)ctl_softc->othersc_pool); if (taskio == NULL) { printf("ctl_isc_event_handler: can't allocate " "ctl_io!\n"); /* Bad Juju */ /* should I just call the proper reset func here??? */ goto bailout; } ctl_zero_io((union ctl_io *)taskio); taskio->io_hdr.io_type = CTL_IO_TASK; taskio->io_hdr.flags |= CTL_FLAG_FROM_OTHER_SC; taskio->io_hdr.nexus = msg_info.hdr.nexus; taskio->task_action = msg_info.task.task_action; taskio->tag_num = msg_info.task.tag_num; taskio->tag_type = msg_info.task.tag_type; #ifdef CTL_TIME_IO taskio->io_hdr.start_time = time_uptime; getbintime(&taskio->io_hdr.start_bt); #if 0 cs_prof_gettime(&taskio->io_hdr.start_ticks); #endif #endif /* CTL_TIME_IO */ ctl_run_task((union ctl_io *)taskio); break; } /* Persistent Reserve action which needs attention */ case CTL_MSG_PERS_ACTION: presio = (struct ctl_prio *)ctl_alloc_io( (void *)ctl_softc->othersc_pool); if (presio == NULL) { printf("ctl_isc_event_handler: can't allocate " "ctl_io!\n"); /* Bad Juju */ /* Need to set busy and send msg back */ goto bailout; } ctl_zero_io((union ctl_io *)presio); presio->io_hdr.msg_type = CTL_MSG_PERS_ACTION; presio->pr_msg = msg_info.pr; ctl_enqueue_isc((union ctl_io *)presio); break; case CTL_MSG_SYNC_FE: rcv_sync_msg = 1; break; case CTL_MSG_APS_LOCK: { // It's quicker to execute this then to // queue it. struct ctl_lun *lun; struct ctl_page_index *page_index; struct copan_aps_subpage *current_sp; uint32_t targ_lun; targ_lun = msg_info.hdr.nexus.targ_mapped_lun; lun = ctl_softc->ctl_luns[targ_lun]; mtx_lock(&lun->lun_lock); page_index = &lun->mode_pages.index[index_to_aps_page]; current_sp = (struct copan_aps_subpage *) (page_index->page_data + (page_index->page_len * CTL_PAGE_CURRENT)); current_sp->lock_active = msg_info.aps.lock_flag; mtx_unlock(&lun->lun_lock); break; } default: printf("How did I get here?\n"); } } else if (event == CTL_HA_EVT_MSG_SENT) { if (param != CTL_HA_STATUS_SUCCESS) { printf("Bad status from ctl_ha_msg_send status %d\n", param); } return; } else if (event == CTL_HA_EVT_DISCONNECT) { printf("CTL: Got a disconnect from Isc\n"); return; } else { printf("ctl_isc_event_handler: Unknown event %d\n", event); return; } bailout: return; } static void ctl_copy_sense_data(union ctl_ha_msg *src, union ctl_io *dest) { struct scsi_sense_data *sense; sense = &dest->scsiio.sense_data; bcopy(&src->scsi.sense_data, sense, sizeof(*sense)); dest->scsiio.scsi_status = src->scsi.scsi_status; dest->scsiio.sense_len = src->scsi.sense_len; dest->io_hdr.status = src->hdr.status; } static int ctl_init(void) { struct ctl_softc *softc; struct ctl_io_pool *internal_pool, *emergency_pool, *other_pool; struct ctl_port *port; uint8_t sc_id =0; int i, error, retval; //int isc_retval; retval = 0; ctl_pause_rtr = 0; rcv_sync_msg = 0; control_softc = malloc(sizeof(*control_softc), M_DEVBUF, M_WAITOK | M_ZERO); softc = control_softc; softc->dev = make_dev(&ctl_cdevsw, 0, UID_ROOT, GID_OPERATOR, 0600, "cam/ctl"); softc->dev->si_drv1 = softc; /* * By default, return a "bad LUN" peripheral qualifier for unknown * LUNs. The user can override this default using the tunable or * sysctl. See the comment in ctl_inquiry_std() for more details. */ softc->inquiry_pq_no_lun = 1; TUNABLE_INT_FETCH("kern.cam.ctl.inquiry_pq_no_lun", &softc->inquiry_pq_no_lun); sysctl_ctx_init(&softc->sysctl_ctx); softc->sysctl_tree = SYSCTL_ADD_NODE(&softc->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_kern_cam), OID_AUTO, "ctl", CTLFLAG_RD, 0, "CAM Target Layer"); if (softc->sysctl_tree == NULL) { printf("%s: unable to allocate sysctl tree\n", __func__); destroy_dev(softc->dev); free(control_softc, M_DEVBUF); control_softc = NULL; return (ENOMEM); } SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "inquiry_pq_no_lun", CTLFLAG_RW, &softc->inquiry_pq_no_lun, 0, "Report no lun possible for invalid LUNs"); mtx_init(&softc->ctl_lock, "CTL mutex", NULL, MTX_DEF); mtx_init(&softc->pool_lock, "CTL pool mutex", NULL, MTX_DEF); softc->open_count = 0; /* * Default to actually sending a SYNCHRONIZE CACHE command down to * the drive. */ softc->flags = CTL_FLAG_REAL_SYNC; /* * In Copan's HA scheme, the "master" and "slave" roles are * figured out through the slot the controller is in. Although it * is an active/active system, someone has to be in charge. */ #ifdef NEEDTOPORT scmicro_rw(SCMICRO_GET_SHELF_ID, &sc_id); #endif if (sc_id == 0) { softc->flags |= CTL_FLAG_MASTER_SHELF; persis_offset = 0; } else persis_offset = CTL_MAX_INITIATORS; /* * XXX KDM need to figure out where we want to get our target ID * and WWID. Is it different on each port? */ softc->target.id = 0; softc->target.wwid[0] = 0x12345678; softc->target.wwid[1] = 0x87654321; STAILQ_INIT(&softc->lun_list); STAILQ_INIT(&softc->pending_lun_queue); STAILQ_INIT(&softc->fe_list); STAILQ_INIT(&softc->port_list); STAILQ_INIT(&softc->be_list); STAILQ_INIT(&softc->io_pools); ctl_tpc_init(softc); if (ctl_pool_create(softc, CTL_POOL_INTERNAL, CTL_POOL_ENTRIES_INTERNAL, &internal_pool)!= 0){ printf("ctl: can't allocate %d entry internal pool, " "exiting\n", CTL_POOL_ENTRIES_INTERNAL); return (ENOMEM); } if (ctl_pool_create(softc, CTL_POOL_EMERGENCY, CTL_POOL_ENTRIES_EMERGENCY, &emergency_pool) != 0) { printf("ctl: can't allocate %d entry emergency pool, " "exiting\n", CTL_POOL_ENTRIES_EMERGENCY); ctl_pool_free(internal_pool); return (ENOMEM); } if (ctl_pool_create(softc, CTL_POOL_4OTHERSC, CTL_POOL_ENTRIES_OTHER_SC, &other_pool) != 0) { printf("ctl: can't allocate %d entry other SC pool, " "exiting\n", CTL_POOL_ENTRIES_OTHER_SC); ctl_pool_free(internal_pool); ctl_pool_free(emergency_pool); return (ENOMEM); } softc->internal_pool = internal_pool; softc->emergency_pool = emergency_pool; softc->othersc_pool = other_pool; if (worker_threads <= 0) worker_threads = max(1, mp_ncpus / 4); if (worker_threads > CTL_MAX_THREADS) worker_threads = CTL_MAX_THREADS; for (i = 0; i < worker_threads; i++) { struct ctl_thread *thr = &softc->threads[i]; mtx_init(&thr->queue_lock, "CTL queue mutex", NULL, MTX_DEF); thr->ctl_softc = softc; STAILQ_INIT(&thr->incoming_queue); STAILQ_INIT(&thr->rtr_queue); STAILQ_INIT(&thr->done_queue); STAILQ_INIT(&thr->isc_queue); error = kproc_kthread_add(ctl_work_thread, thr, &softc->ctl_proc, &thr->thread, 0, 0, "ctl", "work%d", i); if (error != 0) { printf("error creating CTL work thread!\n"); ctl_pool_free(internal_pool); ctl_pool_free(emergency_pool); ctl_pool_free(other_pool); return (error); } } error = kproc_kthread_add(ctl_lun_thread, softc, &softc->ctl_proc, NULL, 0, 0, "ctl", "lun"); if (error != 0) { printf("error creating CTL lun thread!\n"); ctl_pool_free(internal_pool); ctl_pool_free(emergency_pool); ctl_pool_free(other_pool); return (error); } if (bootverbose) printf("ctl: CAM Target Layer loaded\n"); /* * Initialize the ioctl front end. */ ctl_frontend_register(&ioctl_frontend); port = &softc->ioctl_info.port; port->frontend = &ioctl_frontend; sprintf(softc->ioctl_info.port_name, "ioctl"); port->port_type = CTL_PORT_IOCTL; port->num_requested_ctl_io = 100; port->port_name = softc->ioctl_info.port_name; port->port_online = ctl_ioctl_online; port->port_offline = ctl_ioctl_offline; port->onoff_arg = &softc->ioctl_info; port->lun_enable = ctl_ioctl_lun_enable; port->lun_disable = ctl_ioctl_lun_disable; port->targ_lun_arg = &softc->ioctl_info; port->fe_datamove = ctl_ioctl_datamove; port->fe_done = ctl_ioctl_done; port->max_targets = 15; port->max_target_id = 15; if (ctl_port_register(&softc->ioctl_info.port, (softc->flags & CTL_FLAG_MASTER_SHELF)) != 0) { printf("ctl: ioctl front end registration failed, will " "continue anyway\n"); } #ifdef CTL_IO_DELAY if (sizeof(struct callout) > CTL_TIMER_BYTES) { printf("sizeof(struct callout) %zd > CTL_TIMER_BYTES %zd\n", sizeof(struct callout), CTL_TIMER_BYTES); return (EINVAL); } #endif /* CTL_IO_DELAY */ return (0); } void ctl_shutdown(void) { struct ctl_softc *softc; struct ctl_lun *lun, *next_lun; struct ctl_io_pool *pool; softc = (struct ctl_softc *)control_softc; if (ctl_port_deregister(&softc->ioctl_info.port) != 0) printf("ctl: ioctl front end deregistration failed\n"); mtx_lock(&softc->ctl_lock); /* * Free up each LUN. */ for (lun = STAILQ_FIRST(&softc->lun_list); lun != NULL; lun = next_lun){ next_lun = STAILQ_NEXT(lun, links); ctl_free_lun(lun); } mtx_unlock(&softc->ctl_lock); ctl_frontend_deregister(&ioctl_frontend); /* * This will rip the rug out from under any FETDs or anyone else * that has a pool allocated. Since we increment our module * refcount any time someone outside the main CTL module allocates * a pool, we shouldn't have any problems here. The user won't be * able to unload the CTL module until client modules have * successfully unloaded. */ while ((pool = STAILQ_FIRST(&softc->io_pools)) != NULL) ctl_pool_free(pool); #if 0 ctl_shutdown_thread(softc->work_thread); mtx_destroy(&softc->queue_lock); #endif ctl_tpc_shutdown(softc); mtx_destroy(&softc->pool_lock); mtx_destroy(&softc->ctl_lock); destroy_dev(softc->dev); sysctl_ctx_free(&softc->sysctl_ctx); free(control_softc, M_DEVBUF); control_softc = NULL; if (bootverbose) printf("ctl: CAM Target Layer unloaded\n"); } static int ctl_module_event_handler(module_t mod, int what, void *arg) { switch (what) { case MOD_LOAD: return (ctl_init()); case MOD_UNLOAD: return (EBUSY); default: return (EOPNOTSUPP); } } /* * XXX KDM should we do some access checks here? Bump a reference count to * prevent a CTL module from being unloaded while someone has it open? */ static int ctl_open(struct cdev *dev, int flags, int fmt, struct thread *td) { return (0); } static int ctl_close(struct cdev *dev, int flags, int fmt, struct thread *td) { return (0); } int ctl_port_enable(ctl_port_type port_type) { struct ctl_softc *softc; struct ctl_port *port; if (ctl_is_single == 0) { union ctl_ha_msg msg_info; int isc_retval; #if 0 printf("%s: HA mode, synchronizing frontend enable\n", __func__); #endif msg_info.hdr.msg_type = CTL_MSG_SYNC_FE; if ((isc_retval=ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info, sizeof(msg_info), 1 )) > CTL_HA_STATUS_SUCCESS) { printf("Sync msg send error retval %d\n", isc_retval); } if (!rcv_sync_msg) { isc_retval=ctl_ha_msg_recv(CTL_HA_CHAN_CTL, &msg_info, sizeof(msg_info), 1); } #if 0 printf("CTL:Frontend Enable\n"); } else { printf("%s: single mode, skipping frontend synchronization\n", __func__); #endif } softc = control_softc; STAILQ_FOREACH(port, &softc->port_list, links) { if (port_type & port->port_type) { #if 0 printf("port %d\n", port->targ_port); #endif ctl_port_online(port); } } return (0); } int ctl_port_disable(ctl_port_type port_type) { struct ctl_softc *softc; struct ctl_port *port; softc = control_softc; STAILQ_FOREACH(port, &softc->port_list, links) { if (port_type & port->port_type) ctl_port_offline(port); } return (0); } /* * Returns 0 for success, 1 for failure. * Currently the only failure mode is if there aren't enough entries * allocated. So, in case of a failure, look at num_entries_dropped, * reallocate and try again. */ int ctl_port_list(struct ctl_port_entry *entries, int num_entries_alloced, int *num_entries_filled, int *num_entries_dropped, ctl_port_type port_type, int no_virtual) { struct ctl_softc *softc; struct ctl_port *port; int entries_dropped, entries_filled; int retval; int i; softc = control_softc; retval = 0; entries_filled = 0; entries_dropped = 0; i = 0; mtx_lock(&softc->ctl_lock); STAILQ_FOREACH(port, &softc->port_list, links) { struct ctl_port_entry *entry; if ((port->port_type & port_type) == 0) continue; if ((no_virtual != 0) && (port->virtual_port != 0)) continue; if (entries_filled >= num_entries_alloced) { entries_dropped++; continue; } entry = &entries[i]; entry->port_type = port->port_type; strlcpy(entry->port_name, port->port_name, sizeof(entry->port_name)); entry->physical_port = port->physical_port; entry->virtual_port = port->virtual_port; entry->wwnn = port->wwnn; entry->wwpn = port->wwpn; i++; entries_filled++; } mtx_unlock(&softc->ctl_lock); if (entries_dropped > 0) retval = 1; *num_entries_dropped = entries_dropped; *num_entries_filled = entries_filled; return (retval); } static void ctl_ioctl_online(void *arg) { struct ctl_ioctl_info *ioctl_info; ioctl_info = (struct ctl_ioctl_info *)arg; ioctl_info->flags |= CTL_IOCTL_FLAG_ENABLED; } static void ctl_ioctl_offline(void *arg) { struct ctl_ioctl_info *ioctl_info; ioctl_info = (struct ctl_ioctl_info *)arg; ioctl_info->flags &= ~CTL_IOCTL_FLAG_ENABLED; } /* * Remove an initiator by port number and initiator ID. * Returns 0 for success, -1 for failure. */ int ctl_remove_initiator(struct ctl_port *port, int iid) { struct ctl_softc *softc = control_softc; mtx_assert(&softc->ctl_lock, MA_NOTOWNED); if (iid > CTL_MAX_INIT_PER_PORT) { printf("%s: initiator ID %u > maximun %u!\n", __func__, iid, CTL_MAX_INIT_PER_PORT); return (-1); } mtx_lock(&softc->ctl_lock); port->wwpn_iid[iid].in_use--; port->wwpn_iid[iid].last_use = time_uptime; mtx_unlock(&softc->ctl_lock); return (0); } /* * Add an initiator to the initiator map. * Returns iid for success, < 0 for failure. */ int ctl_add_initiator(struct ctl_port *port, int iid, uint64_t wwpn, char *name) { struct ctl_softc *softc = control_softc; time_t best_time; int i, best; mtx_assert(&softc->ctl_lock, MA_NOTOWNED); if (iid >= CTL_MAX_INIT_PER_PORT) { printf("%s: WWPN %#jx initiator ID %u > maximum %u!\n", __func__, wwpn, iid, CTL_MAX_INIT_PER_PORT); free(name, M_CTL); return (-1); } mtx_lock(&softc->ctl_lock); if (iid < 0 && (wwpn != 0 || name != NULL)) { for (i = 0; i < CTL_MAX_INIT_PER_PORT; i++) { if (wwpn != 0 && wwpn == port->wwpn_iid[i].wwpn) { iid = i; break; } if (name != NULL && port->wwpn_iid[i].name != NULL && strcmp(name, port->wwpn_iid[i].name) == 0) { iid = i; break; } } } if (iid < 0) { for (i = 0; i < CTL_MAX_INIT_PER_PORT; i++) { if (port->wwpn_iid[i].in_use == 0 && port->wwpn_iid[i].wwpn == 0 && port->wwpn_iid[i].name == NULL) { iid = i; break; } } } if (iid < 0) { best = -1; best_time = INT32_MAX; for (i = 0; i < CTL_MAX_INIT_PER_PORT; i++) { if (port->wwpn_iid[i].in_use == 0) { if (port->wwpn_iid[i].last_use < best_time) { best = i; best_time = port->wwpn_iid[i].last_use; } } } iid = best; } if (iid < 0) { mtx_unlock(&softc->ctl_lock); free(name, M_CTL); return (-2); } if (port->wwpn_iid[iid].in_use > 0 && (wwpn != 0 || name != NULL)) { /* * This is not an error yet. */ if (wwpn != 0 && wwpn == port->wwpn_iid[iid].wwpn) { #if 0 printf("%s: port %d iid %u WWPN %#jx arrived" " again\n", __func__, port->targ_port, iid, (uintmax_t)wwpn); #endif goto take; } if (name != NULL && port->wwpn_iid[iid].name != NULL && strcmp(name, port->wwpn_iid[iid].name) == 0) { #if 0 printf("%s: port %d iid %u name '%s' arrived" " again\n", __func__, port->targ_port, iid, name); #endif goto take; } /* * This is an error, but what do we do about it? The * driver is telling us we have a new WWPN for this * initiator ID, so we pretty much need to use it. */ printf("%s: port %d iid %u WWPN %#jx '%s' arrived," " but WWPN %#jx '%s' is still at that address\n", __func__, port->targ_port, iid, wwpn, name, (uintmax_t)port->wwpn_iid[iid].wwpn, port->wwpn_iid[iid].name); /* * XXX KDM clear have_ca and ua_pending on each LUN for * this initiator. */ } take: free(port->wwpn_iid[iid].name, M_CTL); port->wwpn_iid[iid].name = name; port->wwpn_iid[iid].wwpn = wwpn; port->wwpn_iid[iid].in_use++; mtx_unlock(&softc->ctl_lock); return (iid); } static int ctl_create_iid(struct ctl_port *port, int iid, uint8_t *buf) { int len; switch (port->port_type) { case CTL_PORT_FC: { struct scsi_transportid_fcp *id = (struct scsi_transportid_fcp *)buf; if (port->wwpn_iid[iid].wwpn == 0) return (0); memset(id, 0, sizeof(*id)); id->format_protocol = SCSI_PROTO_FC; scsi_u64to8b(port->wwpn_iid[iid].wwpn, id->n_port_name); return (sizeof(*id)); } case CTL_PORT_ISCSI: { struct scsi_transportid_iscsi_port *id = (struct scsi_transportid_iscsi_port *)buf; if (port->wwpn_iid[iid].name == NULL) return (0); memset(id, 0, 256); id->format_protocol = SCSI_TRN_ISCSI_FORMAT_PORT | SCSI_PROTO_ISCSI; len = strlcpy(id->iscsi_name, port->wwpn_iid[iid].name, 252) + 1; len = roundup2(min(len, 252), 4); scsi_ulto2b(len, id->additional_length); return (sizeof(*id) + len); } case CTL_PORT_SAS: { struct scsi_transportid_sas *id = (struct scsi_transportid_sas *)buf; if (port->wwpn_iid[iid].wwpn == 0) return (0); memset(id, 0, sizeof(*id)); id->format_protocol = SCSI_PROTO_SAS; scsi_u64to8b(port->wwpn_iid[iid].wwpn, id->sas_address); return (sizeof(*id)); } default: { struct scsi_transportid_spi *id = (struct scsi_transportid_spi *)buf; memset(id, 0, sizeof(*id)); id->format_protocol = SCSI_PROTO_SPI; scsi_ulto2b(iid, id->scsi_addr); scsi_ulto2b(port->targ_port, id->rel_trgt_port_id); return (sizeof(*id)); } } } static int ctl_ioctl_lun_enable(void *arg, struct ctl_id targ_id, int lun_id) { return (0); } static int ctl_ioctl_lun_disable(void *arg, struct ctl_id targ_id, int lun_id) { return (0); } /* * Data movement routine for the CTL ioctl frontend port. */ static int ctl_ioctl_do_datamove(struct ctl_scsiio *ctsio) { struct ctl_sg_entry *ext_sglist, *kern_sglist; struct ctl_sg_entry ext_entry, kern_entry; int ext_sglen, ext_sg_entries, kern_sg_entries; int ext_sg_start, ext_offset; int len_to_copy, len_copied; int kern_watermark, ext_watermark; int ext_sglist_malloced; int i, j; ext_sglist_malloced = 0; ext_sg_start = 0; ext_offset = 0; CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove\n")); /* * If this flag is set, fake the data transfer. */ if (ctsio->io_hdr.flags & CTL_FLAG_NO_DATAMOVE) { ctsio->ext_data_filled = ctsio->ext_data_len; goto bailout; } /* * To simplify things here, if we have a single buffer, stick it in * a S/G entry and just make it a single entry S/G list. */ if (ctsio->io_hdr.flags & CTL_FLAG_EDPTR_SGLIST) { int len_seen; ext_sglen = ctsio->ext_sg_entries * sizeof(*ext_sglist); ext_sglist = (struct ctl_sg_entry *)malloc(ext_sglen, M_CTL, M_WAITOK); ext_sglist_malloced = 1; if (copyin(ctsio->ext_data_ptr, ext_sglist, ext_sglen) != 0) { ctl_set_internal_failure(ctsio, /*sks_valid*/ 0, /*retry_count*/ 0); goto bailout; } ext_sg_entries = ctsio->ext_sg_entries; len_seen = 0; for (i = 0; i < ext_sg_entries; i++) { if ((len_seen + ext_sglist[i].len) >= ctsio->ext_data_filled) { ext_sg_start = i; ext_offset = ctsio->ext_data_filled - len_seen; break; } len_seen += ext_sglist[i].len; } } else { ext_sglist = &ext_entry; ext_sglist->addr = ctsio->ext_data_ptr; ext_sglist->len = ctsio->ext_data_len; ext_sg_entries = 1; ext_sg_start = 0; ext_offset = ctsio->ext_data_filled; } if (ctsio->kern_sg_entries > 0) { kern_sglist = (struct ctl_sg_entry *)ctsio->kern_data_ptr; kern_sg_entries = ctsio->kern_sg_entries; } else { kern_sglist = &kern_entry; kern_sglist->addr = ctsio->kern_data_ptr; kern_sglist->len = ctsio->kern_data_len; kern_sg_entries = 1; } kern_watermark = 0; ext_watermark = ext_offset; len_copied = 0; for (i = ext_sg_start, j = 0; i < ext_sg_entries && j < kern_sg_entries;) { uint8_t *ext_ptr, *kern_ptr; len_to_copy = ctl_min(ext_sglist[i].len - ext_watermark, kern_sglist[j].len - kern_watermark); ext_ptr = (uint8_t *)ext_sglist[i].addr; ext_ptr = ext_ptr + ext_watermark; if (ctsio->io_hdr.flags & CTL_FLAG_BUS_ADDR) { /* * XXX KDM fix this! */ panic("need to implement bus address support"); #if 0 kern_ptr = bus_to_virt(kern_sglist[j].addr); #endif } else kern_ptr = (uint8_t *)kern_sglist[j].addr; kern_ptr = kern_ptr + kern_watermark; kern_watermark += len_to_copy; ext_watermark += len_to_copy; if ((ctsio->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_IN) { CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: copying %d " "bytes to user\n", len_to_copy)); CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: from %p " "to %p\n", kern_ptr, ext_ptr)); if (copyout(kern_ptr, ext_ptr, len_to_copy) != 0) { ctl_set_internal_failure(ctsio, /*sks_valid*/ 0, /*retry_count*/ 0); goto bailout; } } else { CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: copying %d " "bytes from user\n", len_to_copy)); CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: from %p " "to %p\n", ext_ptr, kern_ptr)); if (copyin(ext_ptr, kern_ptr, len_to_copy)!= 0){ ctl_set_internal_failure(ctsio, /*sks_valid*/ 0, /*retry_count*/0); goto bailout; } } len_copied += len_to_copy; if (ext_sglist[i].len == ext_watermark) { i++; ext_watermark = 0; } if (kern_sglist[j].len == kern_watermark) { j++; kern_watermark = 0; } } ctsio->ext_data_filled += len_copied; CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: ext_sg_entries: %d, " "kern_sg_entries: %d\n", ext_sg_entries, kern_sg_entries)); CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: ext_data_len = %d, " "kern_data_len = %d\n", ctsio->ext_data_len, ctsio->kern_data_len)); /* XXX KDM set residual?? */ bailout: if (ext_sglist_malloced != 0) free(ext_sglist, M_CTL); return (CTL_RETVAL_COMPLETE); } /* * Serialize a command that went down the "wrong" side, and so was sent to * this controller for execution. The logic is a little different than the * standard case in ctl_scsiio_precheck(). Errors in this case need to get * sent back to the other side, but in the success case, we execute the * command on this side (XFER mode) or tell the other side to execute it * (SER_ONLY mode). */ static int ctl_serialize_other_sc_cmd(struct ctl_scsiio *ctsio) { struct ctl_softc *ctl_softc; union ctl_ha_msg msg_info; struct ctl_lun *lun; int retval = 0; uint32_t targ_lun; ctl_softc = control_softc; targ_lun = ctsio->io_hdr.nexus.targ_mapped_lun; lun = ctl_softc->ctl_luns[targ_lun]; if (lun==NULL) { /* * Why isn't LUN defined? The other side wouldn't * send a cmd if the LUN is undefined. */ printf("%s: Bad JUJU!, LUN is NULL!\n", __func__); /* "Logical unit not supported" */ ctl_set_sense_data(&msg_info.scsi.sense_data, lun, /*sense_format*/SSD_TYPE_NONE, /*current_error*/ 1, /*sense_key*/ SSD_KEY_ILLEGAL_REQUEST, /*asc*/ 0x25, /*ascq*/ 0x00, SSD_ELEM_NONE); msg_info.scsi.sense_len = SSD_FULL_SIZE; msg_info.scsi.scsi_status = SCSI_STATUS_CHECK_COND; msg_info.hdr.status = CTL_SCSI_ERROR | CTL_AUTOSENSE; msg_info.hdr.original_sc = ctsio->io_hdr.original_sc; msg_info.hdr.serializing_sc = NULL; msg_info.hdr.msg_type = CTL_MSG_BAD_JUJU; if (ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info, sizeof(msg_info), 0 ) > CTL_HA_STATUS_SUCCESS) { } return(1); } mtx_lock(&lun->lun_lock); TAILQ_INSERT_TAIL(&lun->ooa_queue, &ctsio->io_hdr, ooa_links); switch (ctl_check_ooa(lun, (union ctl_io *)ctsio, (union ctl_io *)TAILQ_PREV(&ctsio->io_hdr, ctl_ooaq, ooa_links))) { case CTL_ACTION_BLOCK: ctsio->io_hdr.flags |= CTL_FLAG_BLOCKED; TAILQ_INSERT_TAIL(&lun->blocked_queue, &ctsio->io_hdr, blocked_links); break; case CTL_ACTION_PASS: case CTL_ACTION_SKIP: if (ctl_softc->ha_mode == CTL_HA_MODE_XFER) { ctsio->io_hdr.flags |= CTL_FLAG_IS_WAS_ON_RTR; ctl_enqueue_rtr((union ctl_io *)ctsio); } else { /* send msg back to other side */ msg_info.hdr.original_sc = ctsio->io_hdr.original_sc; msg_info.hdr.serializing_sc = (union ctl_io *)ctsio; msg_info.hdr.msg_type = CTL_MSG_R2R; #if 0 printf("2. pOrig %x\n", (int)msg_info.hdr.original_sc); #endif if (ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info, sizeof(msg_info), 0 ) > CTL_HA_STATUS_SUCCESS) { } } break; case CTL_ACTION_OVERLAP: /* OVERLAPPED COMMANDS ATTEMPTED */ ctl_set_sense_data(&msg_info.scsi.sense_data, lun, /*sense_format*/SSD_TYPE_NONE, /*current_error*/ 1, /*sense_key*/ SSD_KEY_ILLEGAL_REQUEST, /*asc*/ 0x4E, /*ascq*/ 0x00, SSD_ELEM_NONE); msg_info.scsi.sense_len = SSD_FULL_SIZE; msg_info.scsi.scsi_status = SCSI_STATUS_CHECK_COND; msg_info.hdr.status = CTL_SCSI_ERROR | CTL_AUTOSENSE; msg_info.hdr.original_sc = ctsio->io_hdr.original_sc; msg_info.hdr.serializing_sc = NULL; msg_info.hdr.msg_type = CTL_MSG_BAD_JUJU; #if 0 printf("BAD JUJU:Major Bummer Overlap\n"); #endif TAILQ_REMOVE(&lun->ooa_queue, &ctsio->io_hdr, ooa_links); retval = 1; if (ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info, sizeof(msg_info), 0 ) > CTL_HA_STATUS_SUCCESS) { } break; case CTL_ACTION_OVERLAP_TAG: /* TAGGED OVERLAPPED COMMANDS (NN = QUEUE TAG) */ ctl_set_sense_data(&msg_info.scsi.sense_data, lun, /*sense_format*/SSD_TYPE_NONE, /*current_error*/ 1, /*sense_key*/ SSD_KEY_ILLEGAL_REQUEST, /*asc*/ 0x4D, /*ascq*/ ctsio->tag_num & 0xff, SSD_ELEM_NONE); msg_info.scsi.sense_len = SSD_FULL_SIZE; msg_info.scsi.scsi_status = SCSI_STATUS_CHECK_COND; msg_info.hdr.status = CTL_SCSI_ERROR | CTL_AUTOSENSE; msg_info.hdr.original_sc = ctsio->io_hdr.original_sc; msg_info.hdr.serializing_sc = NULL; msg_info.hdr.msg_type = CTL_MSG_BAD_JUJU; #if 0 printf("BAD JUJU:Major Bummer Overlap Tag\n"); #endif TAILQ_REMOVE(&lun->ooa_queue, &ctsio->io_hdr, ooa_links); retval = 1; if (ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info, sizeof(msg_info), 0 ) > CTL_HA_STATUS_SUCCESS) { } break; case CTL_ACTION_ERROR: default: /* "Internal target failure" */ ctl_set_sense_data(&msg_info.scsi.sense_data, lun, /*sense_format*/SSD_TYPE_NONE, /*current_error*/ 1, /*sense_key*/ SSD_KEY_HARDWARE_ERROR, /*asc*/ 0x44, /*ascq*/ 0x00, SSD_ELEM_NONE); msg_info.scsi.sense_len = SSD_FULL_SIZE; msg_info.scsi.scsi_status = SCSI_STATUS_CHECK_COND; msg_info.hdr.status = CTL_SCSI_ERROR | CTL_AUTOSENSE; msg_info.hdr.original_sc = ctsio->io_hdr.original_sc; msg_info.hdr.serializing_sc = NULL; msg_info.hdr.msg_type = CTL_MSG_BAD_JUJU; #if 0 printf("BAD JUJU:Major Bummer HW Error\n"); #endif TAILQ_REMOVE(&lun->ooa_queue, &ctsio->io_hdr, ooa_links); retval = 1; if (ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info, sizeof(msg_info), 0 ) > CTL_HA_STATUS_SUCCESS) { } break; } mtx_unlock(&lun->lun_lock); return (retval); } static int ctl_ioctl_submit_wait(union ctl_io *io) { struct ctl_fe_ioctl_params params; ctl_fe_ioctl_state last_state; int done, retval; retval = 0; bzero(¶ms, sizeof(params)); mtx_init(¶ms.ioctl_mtx, "ctliocmtx", NULL, MTX_DEF); cv_init(¶ms.sem, "ctlioccv"); params.state = CTL_IOCTL_INPROG; last_state = params.state; io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr = ¶ms; CTL_DEBUG_PRINT(("ctl_ioctl_submit_wait\n")); /* This shouldn't happen */ if ((retval = ctl_queue(io)) != CTL_RETVAL_COMPLETE) return (retval); done = 0; do { mtx_lock(¶ms.ioctl_mtx); /* * Check the state here, and don't sleep if the state has * already changed (i.e. wakeup has already occured, but we * weren't waiting yet). */ if (params.state == last_state) { /* XXX KDM cv_wait_sig instead? */ cv_wait(¶ms.sem, ¶ms.ioctl_mtx); } last_state = params.state; switch (params.state) { case CTL_IOCTL_INPROG: /* Why did we wake up? */ /* XXX KDM error here? */ mtx_unlock(¶ms.ioctl_mtx); break; case CTL_IOCTL_DATAMOVE: CTL_DEBUG_PRINT(("got CTL_IOCTL_DATAMOVE\n")); /* * change last_state back to INPROG to avoid * deadlock on subsequent data moves. */ params.state = last_state = CTL_IOCTL_INPROG; mtx_unlock(¶ms.ioctl_mtx); ctl_ioctl_do_datamove(&io->scsiio); /* * Note that in some cases, most notably writes, * this will queue the I/O and call us back later. * In other cases, generally reads, this routine * will immediately call back and wake us up, * probably using our own context. */ io->scsiio.be_move_done(io); break; case CTL_IOCTL_DONE: mtx_unlock(¶ms.ioctl_mtx); CTL_DEBUG_PRINT(("got CTL_IOCTL_DONE\n")); done = 1; break; default: mtx_unlock(¶ms.ioctl_mtx); /* XXX KDM error here? */ break; } } while (done == 0); mtx_destroy(¶ms.ioctl_mtx); cv_destroy(¶ms.sem); return (CTL_RETVAL_COMPLETE); } static void ctl_ioctl_datamove(union ctl_io *io) { struct ctl_fe_ioctl_params *params; params = (struct ctl_fe_ioctl_params *) io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr; mtx_lock(¶ms->ioctl_mtx); params->state = CTL_IOCTL_DATAMOVE; cv_broadcast(¶ms->sem); mtx_unlock(¶ms->ioctl_mtx); } static void ctl_ioctl_done(union ctl_io *io) { struct ctl_fe_ioctl_params *params; params = (struct ctl_fe_ioctl_params *) io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr; mtx_lock(¶ms->ioctl_mtx); params->state = CTL_IOCTL_DONE; cv_broadcast(¶ms->sem); mtx_unlock(¶ms->ioctl_mtx); } static void ctl_ioctl_hard_startstop_callback(void *arg, struct cfi_metatask *metatask) { struct ctl_fe_ioctl_startstop_info *sd_info; sd_info = (struct ctl_fe_ioctl_startstop_info *)arg; sd_info->hs_info.status = metatask->status; sd_info->hs_info.total_luns = metatask->taskinfo.startstop.total_luns; sd_info->hs_info.luns_complete = metatask->taskinfo.startstop.luns_complete; sd_info->hs_info.luns_failed = metatask->taskinfo.startstop.luns_failed; cv_broadcast(&sd_info->sem); } static void ctl_ioctl_bbrread_callback(void *arg, struct cfi_metatask *metatask) { struct ctl_fe_ioctl_bbrread_info *fe_bbr_info; fe_bbr_info = (struct ctl_fe_ioctl_bbrread_info *)arg; mtx_lock(fe_bbr_info->lock); fe_bbr_info->bbr_info->status = metatask->status; fe_bbr_info->bbr_info->bbr_status = metatask->taskinfo.bbrread.status; fe_bbr_info->wakeup_done = 1; mtx_unlock(fe_bbr_info->lock); cv_broadcast(&fe_bbr_info->sem); } /* * Returns 0 for success, errno for failure. */ static int ctl_ioctl_fill_ooa(struct ctl_lun *lun, uint32_t *cur_fill_num, struct ctl_ooa *ooa_hdr, struct ctl_ooa_entry *kern_entries) { union ctl_io *io; int retval; retval = 0; mtx_lock(&lun->lun_lock); for (io = (union ctl_io *)TAILQ_FIRST(&lun->ooa_queue); (io != NULL); (*cur_fill_num)++, io = (union ctl_io *)TAILQ_NEXT(&io->io_hdr, ooa_links)) { struct ctl_ooa_entry *entry; /* * If we've got more than we can fit, just count the * remaining entries. */ if (*cur_fill_num >= ooa_hdr->alloc_num) continue; entry = &kern_entries[*cur_fill_num]; entry->tag_num = io->scsiio.tag_num; entry->lun_num = lun->lun; #ifdef CTL_TIME_IO entry->start_bt = io->io_hdr.start_bt; #endif bcopy(io->scsiio.cdb, entry->cdb, io->scsiio.cdb_len); entry->cdb_len = io->scsiio.cdb_len; if (io->io_hdr.flags & CTL_FLAG_BLOCKED) entry->cmd_flags |= CTL_OOACMD_FLAG_BLOCKED; if (io->io_hdr.flags & CTL_FLAG_DMA_INPROG) entry->cmd_flags |= CTL_OOACMD_FLAG_DMA; if (io->io_hdr.flags & CTL_FLAG_ABORT) entry->cmd_flags |= CTL_OOACMD_FLAG_ABORT; if (io->io_hdr.flags & CTL_FLAG_IS_WAS_ON_RTR) entry->cmd_flags |= CTL_OOACMD_FLAG_RTR; if (io->io_hdr.flags & CTL_FLAG_DMA_QUEUED) entry->cmd_flags |= CTL_OOACMD_FLAG_DMA_QUEUED; } mtx_unlock(&lun->lun_lock); return (retval); } static void * ctl_copyin_alloc(void *user_addr, int len, char *error_str, size_t error_str_len) { void *kptr; kptr = malloc(len, M_CTL, M_WAITOK | M_ZERO); if (copyin(user_addr, kptr, len) != 0) { snprintf(error_str, error_str_len, "Error copying %d bytes " "from user address %p to kernel address %p", len, user_addr, kptr); free(kptr, M_CTL); return (NULL); } return (kptr); } static void ctl_free_args(int num_args, struct ctl_be_arg *args) { int i; if (args == NULL) return; for (i = 0; i < num_args; i++) { free(args[i].kname, M_CTL); free(args[i].kvalue, M_CTL); } free(args, M_CTL); } static struct ctl_be_arg * ctl_copyin_args(int num_args, struct ctl_be_arg *uargs, char *error_str, size_t error_str_len) { struct ctl_be_arg *args; int i; args = ctl_copyin_alloc(uargs, num_args * sizeof(*args), error_str, error_str_len); if (args == NULL) goto bailout; for (i = 0; i < num_args; i++) { args[i].kname = NULL; args[i].kvalue = NULL; } for (i = 0; i < num_args; i++) { uint8_t *tmpptr; args[i].kname = ctl_copyin_alloc(args[i].name, args[i].namelen, error_str, error_str_len); if (args[i].kname == NULL) goto bailout; if (args[i].kname[args[i].namelen - 1] != '\0') { snprintf(error_str, error_str_len, "Argument %d " "name is not NUL-terminated", i); goto bailout; } if (args[i].flags & CTL_BEARG_RD) { tmpptr = ctl_copyin_alloc(args[i].value, args[i].vallen, error_str, error_str_len); if (tmpptr == NULL) goto bailout; if ((args[i].flags & CTL_BEARG_ASCII) && (tmpptr[args[i].vallen - 1] != '\0')) { snprintf(error_str, error_str_len, "Argument " "%d value is not NUL-terminated", i); goto bailout; } args[i].kvalue = tmpptr; } else { args[i].kvalue = malloc(args[i].vallen, M_CTL, M_WAITOK | M_ZERO); } } return (args); bailout: ctl_free_args(num_args, args); return (NULL); } static void ctl_copyout_args(int num_args, struct ctl_be_arg *args) { int i; for (i = 0; i < num_args; i++) { if (args[i].flags & CTL_BEARG_WR) copyout(args[i].kvalue, args[i].value, args[i].vallen); } } /* * Escape characters that are illegal or not recommended in XML. */ int ctl_sbuf_printf_esc(struct sbuf *sb, char *str) { int retval; retval = 0; for (; *str; str++) { switch (*str) { case '&': retval = sbuf_printf(sb, "&"); break; case '>': retval = sbuf_printf(sb, ">"); break; case '<': retval = sbuf_printf(sb, "<"); break; default: retval = sbuf_putc(sb, *str); break; } if (retval != 0) break; } return (retval); } static int ctl_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td) { struct ctl_softc *softc; int retval; softc = control_softc; retval = 0; switch (cmd) { case CTL_IO: { union ctl_io *io; void *pool_tmp; /* * If we haven't been "enabled", don't allow any SCSI I/O * to this FETD. */ if ((softc->ioctl_info.flags & CTL_IOCTL_FLAG_ENABLED) == 0) { retval = EPERM; break; } io = ctl_alloc_io(softc->ioctl_info.port.ctl_pool_ref); if (io == NULL) { printf("ctl_ioctl: can't allocate ctl_io!\n"); retval = ENOSPC; break; } /* * Need to save the pool reference so it doesn't get * spammed by the user's ctl_io. */ pool_tmp = io->io_hdr.pool; memcpy(io, (void *)addr, sizeof(*io)); io->io_hdr.pool = pool_tmp; /* * No status yet, so make sure the status is set properly. */ io->io_hdr.status = CTL_STATUS_NONE; /* * The user sets the initiator ID, target and LUN IDs. */ io->io_hdr.nexus.targ_port = softc->ioctl_info.port.targ_port; io->io_hdr.flags |= CTL_FLAG_USER_REQ; if ((io->io_hdr.io_type == CTL_IO_SCSI) && (io->scsiio.tag_type != CTL_TAG_UNTAGGED)) io->scsiio.tag_num = softc->ioctl_info.cur_tag_num++; retval = ctl_ioctl_submit_wait(io); if (retval != 0) { ctl_free_io(io); break; } memcpy((void *)addr, io, sizeof(*io)); /* return this to our pool */ ctl_free_io(io); break; } case CTL_ENABLE_PORT: case CTL_DISABLE_PORT: case CTL_SET_PORT_WWNS: { struct ctl_port *port; struct ctl_port_entry *entry; entry = (struct ctl_port_entry *)addr; mtx_lock(&softc->ctl_lock); STAILQ_FOREACH(port, &softc->port_list, links) { int action, done; action = 0; done = 0; if ((entry->port_type == CTL_PORT_NONE) && (entry->targ_port == port->targ_port)) { /* * If the user only wants to enable or * disable or set WWNs on a specific port, * do the operation and we're done. */ action = 1; done = 1; } else if (entry->port_type & port->port_type) { /* * Compare the user's type mask with the * particular frontend type to see if we * have a match. */ action = 1; done = 0; /* * Make sure the user isn't trying to set * WWNs on multiple ports at the same time. */ if (cmd == CTL_SET_PORT_WWNS) { printf("%s: Can't set WWNs on " "multiple ports\n", __func__); retval = EINVAL; break; } } if (action != 0) { /* * XXX KDM we have to drop the lock here, * because the online/offline operations * can potentially block. We need to * reference count the frontends so they * can't go away, */ mtx_unlock(&softc->ctl_lock); if (cmd == CTL_ENABLE_PORT) { struct ctl_lun *lun; STAILQ_FOREACH(lun, &softc->lun_list, links) { port->lun_enable(port->targ_lun_arg, lun->target, lun->lun); } ctl_port_online(port); } else if (cmd == CTL_DISABLE_PORT) { struct ctl_lun *lun; ctl_port_offline(port); STAILQ_FOREACH(lun, &softc->lun_list, links) { port->lun_disable( port->targ_lun_arg, lun->target, lun->lun); } } mtx_lock(&softc->ctl_lock); if (cmd == CTL_SET_PORT_WWNS) ctl_port_set_wwns(port, (entry->flags & CTL_PORT_WWNN_VALID) ? 1 : 0, entry->wwnn, (entry->flags & CTL_PORT_WWPN_VALID) ? 1 : 0, entry->wwpn); } if (done != 0) break; } mtx_unlock(&softc->ctl_lock); break; } case CTL_GET_PORT_LIST: { struct ctl_port *port; struct ctl_port_list *list; int i; list = (struct ctl_port_list *)addr; if (list->alloc_len != (list->alloc_num * sizeof(struct ctl_port_entry))) { printf("%s: CTL_GET_PORT_LIST: alloc_len %u != " "alloc_num %u * sizeof(struct ctl_port_entry) " "%zu\n", __func__, list->alloc_len, list->alloc_num, sizeof(struct ctl_port_entry)); retval = EINVAL; break; } list->fill_len = 0; list->fill_num = 0; list->dropped_num = 0; i = 0; mtx_lock(&softc->ctl_lock); STAILQ_FOREACH(port, &softc->port_list, links) { struct ctl_port_entry entry, *list_entry; if (list->fill_num >= list->alloc_num) { list->dropped_num++; continue; } entry.port_type = port->port_type; strlcpy(entry.port_name, port->port_name, sizeof(entry.port_name)); entry.targ_port = port->targ_port; entry.physical_port = port->physical_port; entry.virtual_port = port->virtual_port; entry.wwnn = port->wwnn; entry.wwpn = port->wwpn; if (port->status & CTL_PORT_STATUS_ONLINE) entry.online = 1; else entry.online = 0; list_entry = &list->entries[i]; retval = copyout(&entry, list_entry, sizeof(entry)); if (retval != 0) { printf("%s: CTL_GET_PORT_LIST: copyout " "returned %d\n", __func__, retval); break; } i++; list->fill_num++; list->fill_len += sizeof(entry); } mtx_unlock(&softc->ctl_lock); /* * If this is non-zero, we had a copyout fault, so there's * probably no point in attempting to set the status inside * the structure. */ if (retval != 0) break; if (list->dropped_num > 0) list->status = CTL_PORT_LIST_NEED_MORE_SPACE; else list->status = CTL_PORT_LIST_OK; break; } case CTL_DUMP_OOA: { struct ctl_lun *lun; union ctl_io *io; char printbuf[128]; struct sbuf sb; mtx_lock(&softc->ctl_lock); printf("Dumping OOA queues:\n"); STAILQ_FOREACH(lun, &softc->lun_list, links) { mtx_lock(&lun->lun_lock); for (io = (union ctl_io *)TAILQ_FIRST( &lun->ooa_queue); io != NULL; io = (union ctl_io *)TAILQ_NEXT(&io->io_hdr, ooa_links)) { sbuf_new(&sb, printbuf, sizeof(printbuf), SBUF_FIXEDLEN); sbuf_printf(&sb, "LUN %jd tag 0x%04x%s%s%s%s: ", (intmax_t)lun->lun, io->scsiio.tag_num, (io->io_hdr.flags & CTL_FLAG_BLOCKED) ? "" : " BLOCKED", (io->io_hdr.flags & CTL_FLAG_DMA_INPROG) ? " DMA" : "", (io->io_hdr.flags & CTL_FLAG_ABORT) ? " ABORT" : "", (io->io_hdr.flags & CTL_FLAG_IS_WAS_ON_RTR) ? " RTR" : ""); ctl_scsi_command_string(&io->scsiio, NULL, &sb); sbuf_finish(&sb); printf("%s\n", sbuf_data(&sb)); } mtx_unlock(&lun->lun_lock); } printf("OOA queues dump done\n"); mtx_unlock(&softc->ctl_lock); break; } case CTL_GET_OOA: { struct ctl_lun *lun; struct ctl_ooa *ooa_hdr; struct ctl_ooa_entry *entries; uint32_t cur_fill_num; ooa_hdr = (struct ctl_ooa *)addr; if ((ooa_hdr->alloc_len == 0) || (ooa_hdr->alloc_num == 0)) { printf("%s: CTL_GET_OOA: alloc len %u and alloc num %u " "must be non-zero\n", __func__, ooa_hdr->alloc_len, ooa_hdr->alloc_num); retval = EINVAL; break; } if (ooa_hdr->alloc_len != (ooa_hdr->alloc_num * sizeof(struct ctl_ooa_entry))) { printf("%s: CTL_GET_OOA: alloc len %u must be alloc " "num %d * sizeof(struct ctl_ooa_entry) %zd\n", __func__, ooa_hdr->alloc_len, ooa_hdr->alloc_num,sizeof(struct ctl_ooa_entry)); retval = EINVAL; break; } entries = malloc(ooa_hdr->alloc_len, M_CTL, M_WAITOK | M_ZERO); if (entries == NULL) { printf("%s: could not allocate %d bytes for OOA " "dump\n", __func__, ooa_hdr->alloc_len); retval = ENOMEM; break; } mtx_lock(&softc->ctl_lock); if (((ooa_hdr->flags & CTL_OOA_FLAG_ALL_LUNS) == 0) && ((ooa_hdr->lun_num >= CTL_MAX_LUNS) || (softc->ctl_luns[ooa_hdr->lun_num] == NULL))) { mtx_unlock(&softc->ctl_lock); free(entries, M_CTL); printf("%s: CTL_GET_OOA: invalid LUN %ju\n", __func__, (uintmax_t)ooa_hdr->lun_num); retval = EINVAL; break; } cur_fill_num = 0; if (ooa_hdr->flags & CTL_OOA_FLAG_ALL_LUNS) { STAILQ_FOREACH(lun, &softc->lun_list, links) { retval = ctl_ioctl_fill_ooa(lun, &cur_fill_num, ooa_hdr, entries); if (retval != 0) break; } if (retval != 0) { mtx_unlock(&softc->ctl_lock); free(entries, M_CTL); break; } } else { lun = softc->ctl_luns[ooa_hdr->lun_num]; retval = ctl_ioctl_fill_ooa(lun, &cur_fill_num,ooa_hdr, entries); } mtx_unlock(&softc->ctl_lock); ooa_hdr->fill_num = min(cur_fill_num, ooa_hdr->alloc_num); ooa_hdr->fill_len = ooa_hdr->fill_num * sizeof(struct ctl_ooa_entry); retval = copyout(entries, ooa_hdr->entries, ooa_hdr->fill_len); if (retval != 0) { printf("%s: error copying out %d bytes for OOA dump\n", __func__, ooa_hdr->fill_len); } getbintime(&ooa_hdr->cur_bt); if (cur_fill_num > ooa_hdr->alloc_num) { ooa_hdr->dropped_num = cur_fill_num -ooa_hdr->alloc_num; ooa_hdr->status = CTL_OOA_NEED_MORE_SPACE; } else { ooa_hdr->dropped_num = 0; ooa_hdr->status = CTL_OOA_OK; } free(entries, M_CTL); break; } case CTL_CHECK_OOA: { union ctl_io *io; struct ctl_lun *lun; struct ctl_ooa_info *ooa_info; ooa_info = (struct ctl_ooa_info *)addr; if (ooa_info->lun_id >= CTL_MAX_LUNS) { ooa_info->status = CTL_OOA_INVALID_LUN; break; } mtx_lock(&softc->ctl_lock); lun = softc->ctl_luns[ooa_info->lun_id]; if (lun == NULL) { mtx_unlock(&softc->ctl_lock); ooa_info->status = CTL_OOA_INVALID_LUN; break; } mtx_lock(&lun->lun_lock); mtx_unlock(&softc->ctl_lock); ooa_info->num_entries = 0; for (io = (union ctl_io *)TAILQ_FIRST(&lun->ooa_queue); io != NULL; io = (union ctl_io *)TAILQ_NEXT( &io->io_hdr, ooa_links)) { ooa_info->num_entries++; } mtx_unlock(&lun->lun_lock); ooa_info->status = CTL_OOA_SUCCESS; break; } case CTL_HARD_START: case CTL_HARD_STOP: { struct ctl_fe_ioctl_startstop_info ss_info; struct cfi_metatask *metatask; struct mtx hs_mtx; mtx_init(&hs_mtx, "HS Mutex", NULL, MTX_DEF); cv_init(&ss_info.sem, "hard start/stop cv" ); metatask = cfi_alloc_metatask(/*can_wait*/ 1); if (metatask == NULL) { retval = ENOMEM; mtx_destroy(&hs_mtx); break; } if (cmd == CTL_HARD_START) metatask->tasktype = CFI_TASK_STARTUP; else metatask->tasktype = CFI_TASK_SHUTDOWN; metatask->callback = ctl_ioctl_hard_startstop_callback; metatask->callback_arg = &ss_info; cfi_action(metatask); /* Wait for the callback */ mtx_lock(&hs_mtx); cv_wait_sig(&ss_info.sem, &hs_mtx); mtx_unlock(&hs_mtx); /* * All information has been copied from the metatask by the * time cv_broadcast() is called, so we free the metatask here. */ cfi_free_metatask(metatask); memcpy((void *)addr, &ss_info.hs_info, sizeof(ss_info.hs_info)); mtx_destroy(&hs_mtx); break; } case CTL_BBRREAD: { struct ctl_bbrread_info *bbr_info; struct ctl_fe_ioctl_bbrread_info fe_bbr_info; struct mtx bbr_mtx; struct cfi_metatask *metatask; bbr_info = (struct ctl_bbrread_info *)addr; bzero(&fe_bbr_info, sizeof(fe_bbr_info)); bzero(&bbr_mtx, sizeof(bbr_mtx)); mtx_init(&bbr_mtx, "BBR Mutex", NULL, MTX_DEF); fe_bbr_info.bbr_info = bbr_info; fe_bbr_info.lock = &bbr_mtx; cv_init(&fe_bbr_info.sem, "BBR read cv"); metatask = cfi_alloc_metatask(/*can_wait*/ 1); if (metatask == NULL) { mtx_destroy(&bbr_mtx); cv_destroy(&fe_bbr_info.sem); retval = ENOMEM; break; } metatask->tasktype = CFI_TASK_BBRREAD; metatask->callback = ctl_ioctl_bbrread_callback; metatask->callback_arg = &fe_bbr_info; metatask->taskinfo.bbrread.lun_num = bbr_info->lun_num; metatask->taskinfo.bbrread.lba = bbr_info->lba; metatask->taskinfo.bbrread.len = bbr_info->len; cfi_action(metatask); mtx_lock(&bbr_mtx); while (fe_bbr_info.wakeup_done == 0) cv_wait_sig(&fe_bbr_info.sem, &bbr_mtx); mtx_unlock(&bbr_mtx); bbr_info->status = metatask->status; bbr_info->bbr_status = metatask->taskinfo.bbrread.status; bbr_info->scsi_status = metatask->taskinfo.bbrread.scsi_status; memcpy(&bbr_info->sense_data, &metatask->taskinfo.bbrread.sense_data, ctl_min(sizeof(bbr_info->sense_data), sizeof(metatask->taskinfo.bbrread.sense_data))); cfi_free_metatask(metatask); mtx_destroy(&bbr_mtx); cv_destroy(&fe_bbr_info.sem); break; } case CTL_DELAY_IO: { struct ctl_io_delay_info *delay_info; #ifdef CTL_IO_DELAY struct ctl_lun *lun; #endif /* CTL_IO_DELAY */ delay_info = (struct ctl_io_delay_info *)addr; #ifdef CTL_IO_DELAY mtx_lock(&softc->ctl_lock); if ((delay_info->lun_id >= CTL_MAX_LUNS) || (softc->ctl_luns[delay_info->lun_id] == NULL)) { delay_info->status = CTL_DELAY_STATUS_INVALID_LUN; } else { lun = softc->ctl_luns[delay_info->lun_id]; mtx_lock(&lun->lun_lock); delay_info->status = CTL_DELAY_STATUS_OK; switch (delay_info->delay_type) { case CTL_DELAY_TYPE_CONT: break; case CTL_DELAY_TYPE_ONESHOT: break; default: delay_info->status = CTL_DELAY_STATUS_INVALID_TYPE; break; } switch (delay_info->delay_loc) { case CTL_DELAY_LOC_DATAMOVE: lun->delay_info.datamove_type = delay_info->delay_type; lun->delay_info.datamove_delay = delay_info->delay_secs; break; case CTL_DELAY_LOC_DONE: lun->delay_info.done_type = delay_info->delay_type; lun->delay_info.done_delay = delay_info->delay_secs; break; default: delay_info->status = CTL_DELAY_STATUS_INVALID_LOC; break; } mtx_unlock(&lun->lun_lock); } mtx_unlock(&softc->ctl_lock); #else delay_info->status = CTL_DELAY_STATUS_NOT_IMPLEMENTED; #endif /* CTL_IO_DELAY */ break; } case CTL_REALSYNC_SET: { int *syncstate; syncstate = (int *)addr; mtx_lock(&softc->ctl_lock); switch (*syncstate) { case 0: softc->flags &= ~CTL_FLAG_REAL_SYNC; break; case 1: softc->flags |= CTL_FLAG_REAL_SYNC; break; default: retval = EINVAL; break; } mtx_unlock(&softc->ctl_lock); break; } case CTL_REALSYNC_GET: { int *syncstate; syncstate = (int*)addr; mtx_lock(&softc->ctl_lock); if (softc->flags & CTL_FLAG_REAL_SYNC) *syncstate = 1; else *syncstate = 0; mtx_unlock(&softc->ctl_lock); break; } case CTL_SETSYNC: case CTL_GETSYNC: { struct ctl_sync_info *sync_info; struct ctl_lun *lun; sync_info = (struct ctl_sync_info *)addr; mtx_lock(&softc->ctl_lock); lun = softc->ctl_luns[sync_info->lun_id]; if (lun == NULL) { mtx_unlock(&softc->ctl_lock); sync_info->status = CTL_GS_SYNC_NO_LUN; } /* * Get or set the sync interval. We're not bounds checking * in the set case, hopefully the user won't do something * silly. */ mtx_lock(&lun->lun_lock); mtx_unlock(&softc->ctl_lock); if (cmd == CTL_GETSYNC) sync_info->sync_interval = lun->sync_interval; else lun->sync_interval = sync_info->sync_interval; mtx_unlock(&lun->lun_lock); sync_info->status = CTL_GS_SYNC_OK; break; } case CTL_GETSTATS: { struct ctl_stats *stats; struct ctl_lun *lun; int i; stats = (struct ctl_stats *)addr; if ((sizeof(struct ctl_lun_io_stats) * softc->num_luns) > stats->alloc_len) { stats->status = CTL_SS_NEED_MORE_SPACE; stats->num_luns = softc->num_luns; break; } /* * XXX KDM no locking here. If the LUN list changes, * things can blow up. */ for (i = 0, lun = STAILQ_FIRST(&softc->lun_list); lun != NULL; i++, lun = STAILQ_NEXT(lun, links)) { retval = copyout(&lun->stats, &stats->lun_stats[i], sizeof(lun->stats)); if (retval != 0) break; } stats->num_luns = softc->num_luns; stats->fill_len = sizeof(struct ctl_lun_io_stats) * softc->num_luns; stats->status = CTL_SS_OK; #ifdef CTL_TIME_IO stats->flags = CTL_STATS_FLAG_TIME_VALID; #else stats->flags = CTL_STATS_FLAG_NONE; #endif getnanouptime(&stats->timestamp); break; } case CTL_ERROR_INJECT: { struct ctl_error_desc *err_desc, *new_err_desc; struct ctl_lun *lun; err_desc = (struct ctl_error_desc *)addr; new_err_desc = malloc(sizeof(*new_err_desc), M_CTL, M_WAITOK | M_ZERO); bcopy(err_desc, new_err_desc, sizeof(*new_err_desc)); mtx_lock(&softc->ctl_lock); lun = softc->ctl_luns[err_desc->lun_id]; if (lun == NULL) { mtx_unlock(&softc->ctl_lock); free(new_err_desc, M_CTL); printf("%s: CTL_ERROR_INJECT: invalid LUN %ju\n", __func__, (uintmax_t)err_desc->lun_id); retval = EINVAL; break; } mtx_lock(&lun->lun_lock); mtx_unlock(&softc->ctl_lock); /* * We could do some checking here to verify the validity * of the request, but given the complexity of error * injection requests, the checking logic would be fairly * complex. * * For now, if the request is invalid, it just won't get * executed and might get deleted. */ STAILQ_INSERT_TAIL(&lun->error_list, new_err_desc, links); /* * XXX KDM check to make sure the serial number is unique, * in case we somehow manage to wrap. That shouldn't * happen for a very long time, but it's the right thing to * do. */ new_err_desc->serial = lun->error_serial; err_desc->serial = lun->error_serial; lun->error_serial++; mtx_unlock(&lun->lun_lock); break; } case CTL_ERROR_INJECT_DELETE: { struct ctl_error_desc *delete_desc, *desc, *desc2; struct ctl_lun *lun; int delete_done; delete_desc = (struct ctl_error_desc *)addr; delete_done = 0; mtx_lock(&softc->ctl_lock); lun = softc->ctl_luns[delete_desc->lun_id]; if (lun == NULL) { mtx_unlock(&softc->ctl_lock); printf("%s: CTL_ERROR_INJECT_DELETE: invalid LUN %ju\n", __func__, (uintmax_t)delete_desc->lun_id); retval = EINVAL; break; } mtx_lock(&lun->lun_lock); mtx_unlock(&softc->ctl_lock); STAILQ_FOREACH_SAFE(desc, &lun->error_list, links, desc2) { if (desc->serial != delete_desc->serial) continue; STAILQ_REMOVE(&lun->error_list, desc, ctl_error_desc, links); free(desc, M_CTL); delete_done = 1; } mtx_unlock(&lun->lun_lock); if (delete_done == 0) { printf("%s: CTL_ERROR_INJECT_DELETE: can't find " "error serial %ju on LUN %u\n", __func__, delete_desc->serial, delete_desc->lun_id); retval = EINVAL; break; } break; } case CTL_DUMP_STRUCTS: { int i, j, k, idx; struct ctl_port *port; struct ctl_frontend *fe; mtx_lock(&softc->ctl_lock); printf("CTL Persistent Reservation information start:\n"); for (i = 0; i < CTL_MAX_LUNS; i++) { struct ctl_lun *lun; lun = softc->ctl_luns[i]; if ((lun == NULL) || ((lun->flags & CTL_LUN_DISABLED) != 0)) continue; for (j = 0; j < (CTL_MAX_PORTS * 2); j++) { for (k = 0; k < CTL_MAX_INIT_PER_PORT; k++){ idx = j * CTL_MAX_INIT_PER_PORT + k; if (lun->per_res[idx].registered == 0) continue; printf(" LUN %d port %d iid %d key " "%#jx\n", i, j, k, (uintmax_t)scsi_8btou64( lun->per_res[idx].res_key.key)); } } } printf("CTL Persistent Reservation information end\n"); printf("CTL Ports:\n"); STAILQ_FOREACH(port, &softc->port_list, links) { printf(" Port %d '%s' Frontend '%s' Type %u pp %d vp %d WWNN " "%#jx WWPN %#jx\n", port->targ_port, port->port_name, port->frontend->name, port->port_type, port->physical_port, port->virtual_port, (uintmax_t)port->wwnn, (uintmax_t)port->wwpn); for (j = 0; j < CTL_MAX_INIT_PER_PORT; j++) { if (port->wwpn_iid[j].in_use == 0 && port->wwpn_iid[j].wwpn == 0 && port->wwpn_iid[j].name == NULL) continue; printf(" iid %u use %d WWPN %#jx '%s'\n", j, port->wwpn_iid[j].in_use, (uintmax_t)port->wwpn_iid[j].wwpn, port->wwpn_iid[j].name); } } printf("CTL Port information end\n"); mtx_unlock(&softc->ctl_lock); /* * XXX KDM calling this without a lock. We'd likely want * to drop the lock before calling the frontend's dump * routine anyway. */ printf("CTL Frontends:\n"); STAILQ_FOREACH(fe, &softc->fe_list, links) { printf(" Frontend '%s'\n", fe->name); if (fe->fe_dump != NULL) fe->fe_dump(); } printf("CTL Frontend information end\n"); break; } case CTL_LUN_REQ: { struct ctl_lun_req *lun_req; struct ctl_backend_driver *backend; lun_req = (struct ctl_lun_req *)addr; backend = ctl_backend_find(lun_req->backend); if (backend == NULL) { lun_req->status = CTL_LUN_ERROR; snprintf(lun_req->error_str, sizeof(lun_req->error_str), "Backend \"%s\" not found.", lun_req->backend); break; } if (lun_req->num_be_args > 0) { lun_req->kern_be_args = ctl_copyin_args( lun_req->num_be_args, lun_req->be_args, lun_req->error_str, sizeof(lun_req->error_str)); if (lun_req->kern_be_args == NULL) { lun_req->status = CTL_LUN_ERROR; break; } } retval = backend->ioctl(dev, cmd, addr, flag, td); if (lun_req->num_be_args > 0) { ctl_copyout_args(lun_req->num_be_args, lun_req->kern_be_args); ctl_free_args(lun_req->num_be_args, lun_req->kern_be_args); } break; } case CTL_LUN_LIST: { struct sbuf *sb; struct ctl_lun *lun; struct ctl_lun_list *list; struct ctl_option *opt; list = (struct ctl_lun_list *)addr; /* * Allocate a fixed length sbuf here, based on the length * of the user's buffer. We could allocate an auto-extending * buffer, and then tell the user how much larger our * amount of data is than his buffer, but that presents * some problems: * * 1. The sbuf(9) routines use a blocking malloc, and so * we can't hold a lock while calling them with an * auto-extending buffer. * * 2. There is not currently a LUN reference counting * mechanism, outside of outstanding transactions on * the LUN's OOA queue. So a LUN could go away on us * while we're getting the LUN number, backend-specific * information, etc. Thus, given the way things * currently work, we need to hold the CTL lock while * grabbing LUN information. * * So, from the user's standpoint, the best thing to do is * allocate what he thinks is a reasonable buffer length, * and then if he gets a CTL_LUN_LIST_NEED_MORE_SPACE error, * double the buffer length and try again. (And repeat * that until he succeeds.) */ sb = sbuf_new(NULL, NULL, list->alloc_len, SBUF_FIXEDLEN); if (sb == NULL) { list->status = CTL_LUN_LIST_ERROR; snprintf(list->error_str, sizeof(list->error_str), "Unable to allocate %d bytes for LUN list", list->alloc_len); break; } sbuf_printf(sb, "\n"); mtx_lock(&softc->ctl_lock); STAILQ_FOREACH(lun, &softc->lun_list, links) { mtx_lock(&lun->lun_lock); retval = sbuf_printf(sb, "\n", (uintmax_t)lun->lun); /* * Bail out as soon as we see that we've overfilled * the buffer. */ if (retval != 0) break; retval = sbuf_printf(sb, "\t%s" "\n", (lun->backend == NULL) ? "none" : lun->backend->name); if (retval != 0) break; retval = sbuf_printf(sb, "\t%d\n", lun->be_lun->lun_type); if (retval != 0) break; if (lun->backend == NULL) { retval = sbuf_printf(sb, "\n"); if (retval != 0) break; continue; } retval = sbuf_printf(sb, "\t%ju\n", (lun->be_lun->maxlba > 0) ? lun->be_lun->maxlba + 1 : 0); if (retval != 0) break; retval = sbuf_printf(sb, "\t%u\n", lun->be_lun->blocksize); if (retval != 0) break; retval = sbuf_printf(sb, "\t"); if (retval != 0) break; retval = ctl_sbuf_printf_esc(sb, lun->be_lun->serial_num); if (retval != 0) break; retval = sbuf_printf(sb, "\n"); if (retval != 0) break; retval = sbuf_printf(sb, "\t"); if (retval != 0) break; retval = ctl_sbuf_printf_esc(sb,lun->be_lun->device_id); if (retval != 0) break; retval = sbuf_printf(sb, "\n"); if (retval != 0) break; if (lun->backend->lun_info != NULL) { retval = lun->backend->lun_info(lun->be_lun->be_lun, sb); if (retval != 0) break; } STAILQ_FOREACH(opt, &lun->be_lun->options, links) { retval = sbuf_printf(sb, "\t<%s>%s\n", opt->name, opt->value, opt->name); if (retval != 0) break; } retval = sbuf_printf(sb, "\n"); if (retval != 0) break; mtx_unlock(&lun->lun_lock); } if (lun != NULL) mtx_unlock(&lun->lun_lock); mtx_unlock(&softc->ctl_lock); if ((retval != 0) || ((retval = sbuf_printf(sb, "\n")) != 0)) { retval = 0; sbuf_delete(sb); list->status = CTL_LUN_LIST_NEED_MORE_SPACE; snprintf(list->error_str, sizeof(list->error_str), "Out of space, %d bytes is too small", list->alloc_len); break; } sbuf_finish(sb); retval = copyout(sbuf_data(sb), list->lun_xml, sbuf_len(sb) + 1); list->fill_len = sbuf_len(sb) + 1; list->status = CTL_LUN_LIST_OK; sbuf_delete(sb); break; } case CTL_ISCSI: { struct ctl_iscsi *ci; struct ctl_frontend *fe; ci = (struct ctl_iscsi *)addr; fe = ctl_frontend_find("iscsi"); if (fe == NULL) { ci->status = CTL_ISCSI_ERROR; snprintf(ci->error_str, sizeof(ci->error_str), "Frontend \"iscsi\" not found."); break; } retval = fe->ioctl(dev, cmd, addr, flag, td); break; } case CTL_PORT_REQ: { struct ctl_req *req; struct ctl_frontend *fe; req = (struct ctl_req *)addr; fe = ctl_frontend_find(req->driver); if (fe == NULL) { req->status = CTL_LUN_ERROR; snprintf(req->error_str, sizeof(req->error_str), "Frontend \"%s\" not found.", req->driver); break; } if (req->num_args > 0) { req->kern_args = ctl_copyin_args(req->num_args, req->args, req->error_str, sizeof(req->error_str)); if (req->kern_args == NULL) { req->status = CTL_LUN_ERROR; break; } } retval = fe->ioctl(dev, cmd, addr, flag, td); if (req->num_args > 0) { ctl_copyout_args(req->num_args, req->kern_args); ctl_free_args(req->num_args, req->kern_args); } break; } case CTL_PORT_LIST: { struct sbuf *sb; struct ctl_port *port; struct ctl_lun_list *list; struct ctl_option *opt; list = (struct ctl_lun_list *)addr; sb = sbuf_new(NULL, NULL, list->alloc_len, SBUF_FIXEDLEN); if (sb == NULL) { list->status = CTL_LUN_LIST_ERROR; snprintf(list->error_str, sizeof(list->error_str), "Unable to allocate %d bytes for LUN list", list->alloc_len); break; } sbuf_printf(sb, "\n"); mtx_lock(&softc->ctl_lock); STAILQ_FOREACH(port, &softc->port_list, links) { retval = sbuf_printf(sb, "\n", (uintmax_t)port->targ_port); /* * Bail out as soon as we see that we've overfilled * the buffer. */ if (retval != 0) break; retval = sbuf_printf(sb, "\t%s" "\n", port->frontend->name); if (retval != 0) break; retval = sbuf_printf(sb, "\t%d\n", port->port_type); if (retval != 0) break; retval = sbuf_printf(sb, "\t%s\n", (port->status & CTL_PORT_STATUS_ONLINE) ? "YES" : "NO"); if (retval != 0) break; retval = sbuf_printf(sb, "\t%s\n", port->port_name); if (retval != 0) break; retval = sbuf_printf(sb, "\t%d\n", port->physical_port); if (retval != 0) break; retval = sbuf_printf(sb, "\t%d\n", port->virtual_port); if (retval != 0) break; retval = sbuf_printf(sb, "\t%#jx\n", (uintmax_t)port->wwnn); if (retval != 0) break; retval = sbuf_printf(sb, "\t%#jx\n", (uintmax_t)port->wwpn); if (retval != 0) break; if (port->port_info != NULL) { retval = port->port_info(port->onoff_arg, sb); if (retval != 0) break; } STAILQ_FOREACH(opt, &port->options, links) { retval = sbuf_printf(sb, "\t<%s>%s\n", opt->name, opt->value, opt->name); if (retval != 0) break; } retval = sbuf_printf(sb, "\n"); if (retval != 0) break; } mtx_unlock(&softc->ctl_lock); if ((retval != 0) || ((retval = sbuf_printf(sb, "\n")) != 0)) { retval = 0; sbuf_delete(sb); list->status = CTL_LUN_LIST_NEED_MORE_SPACE; snprintf(list->error_str, sizeof(list->error_str), "Out of space, %d bytes is too small", list->alloc_len); break; } sbuf_finish(sb); retval = copyout(sbuf_data(sb), list->lun_xml, sbuf_len(sb) + 1); list->fill_len = sbuf_len(sb) + 1; list->status = CTL_LUN_LIST_OK; sbuf_delete(sb); break; } default: { /* XXX KDM should we fix this? */ #if 0 struct ctl_backend_driver *backend; unsigned int type; int found; found = 0; /* * We encode the backend type as the ioctl type for backend * ioctls. So parse it out here, and then search for a * backend of this type. */ type = _IOC_TYPE(cmd); STAILQ_FOREACH(backend, &softc->be_list, links) { if (backend->type == type) { found = 1; break; } } if (found == 0) { printf("ctl: unknown ioctl command %#lx or backend " "%d\n", cmd, type); retval = EINVAL; break; } retval = backend->ioctl(dev, cmd, addr, flag, td); #endif retval = ENOTTY; break; } } return (retval); } uint32_t ctl_get_initindex(struct ctl_nexus *nexus) { if (nexus->targ_port < CTL_MAX_PORTS) return (nexus->initid.id + (nexus->targ_port * CTL_MAX_INIT_PER_PORT)); else return (nexus->initid.id + ((nexus->targ_port - CTL_MAX_PORTS) * CTL_MAX_INIT_PER_PORT)); } uint32_t ctl_get_resindex(struct ctl_nexus *nexus) { return (nexus->initid.id + (nexus->targ_port * CTL_MAX_INIT_PER_PORT)); } uint32_t ctl_port_idx(int port_num) { if (port_num < CTL_MAX_PORTS) return(port_num); else return(port_num - CTL_MAX_PORTS); } static uint32_t ctl_map_lun(int port_num, uint32_t lun_id) { struct ctl_port *port; port = control_softc->ctl_ports[ctl_port_idx(port_num)]; if (port == NULL) return (UINT32_MAX); if (port->lun_map == NULL) return (lun_id); return (port->lun_map(port->targ_lun_arg, lun_id)); } static uint32_t ctl_map_lun_back(int port_num, uint32_t lun_id) { struct ctl_port *port; uint32_t i; port = control_softc->ctl_ports[ctl_port_idx(port_num)]; if (port->lun_map == NULL) return (lun_id); for (i = 0; i < CTL_MAX_LUNS; i++) { if (port->lun_map(port->targ_lun_arg, i) == lun_id) return (i); } return (UINT32_MAX); } /* * Note: This only works for bitmask sizes that are at least 32 bits, and * that are a power of 2. */ int ctl_ffz(uint32_t *mask, uint32_t size) { uint32_t num_chunks, num_pieces; int i, j; num_chunks = (size >> 5); if (num_chunks == 0) num_chunks++; num_pieces = ctl_min((sizeof(uint32_t) * 8), size); for (i = 0; i < num_chunks; i++) { for (j = 0; j < num_pieces; j++) { if ((mask[i] & (1 << j)) == 0) return ((i << 5) + j); } } return (-1); } int ctl_set_mask(uint32_t *mask, uint32_t bit) { uint32_t chunk, piece; chunk = bit >> 5; piece = bit % (sizeof(uint32_t) * 8); if ((mask[chunk] & (1 << piece)) != 0) return (-1); else mask[chunk] |= (1 << piece); return (0); } int ctl_clear_mask(uint32_t *mask, uint32_t bit) { uint32_t chunk, piece; chunk = bit >> 5; piece = bit % (sizeof(uint32_t) * 8); if ((mask[chunk] & (1 << piece)) == 0) return (-1); else mask[chunk] &= ~(1 << piece); return (0); } int ctl_is_set(uint32_t *mask, uint32_t bit) { uint32_t chunk, piece; chunk = bit >> 5; piece = bit % (sizeof(uint32_t) * 8); if ((mask[chunk] & (1 << piece)) == 0) return (0); else return (1); } #ifdef unused /* * The bus, target and lun are optional, they can be filled in later. * can_wait is used to determine whether we can wait on the malloc or not. */ union ctl_io* ctl_malloc_io(ctl_io_type io_type, uint32_t targ_port, uint32_t targ_target, uint32_t targ_lun, int can_wait) { union ctl_io *io; if (can_wait) io = (union ctl_io *)malloc(sizeof(*io), M_CTL, M_WAITOK); else io = (union ctl_io *)malloc(sizeof(*io), M_CTL, M_NOWAIT); if (io != NULL) { io->io_hdr.io_type = io_type; io->io_hdr.targ_port = targ_port; /* * XXX KDM this needs to change/go away. We need to move * to a preallocated pool of ctl_scsiio structures. */ io->io_hdr.nexus.targ_target.id = targ_target; io->io_hdr.nexus.targ_lun = targ_lun; } return (io); } void ctl_kfree_io(union ctl_io *io) { free(io, M_CTL); } #endif /* unused */ /* * ctl_softc, pool_type, total_ctl_io are passed in. * npool is passed out. */ int ctl_pool_create(struct ctl_softc *ctl_softc, ctl_pool_type pool_type, uint32_t total_ctl_io, struct ctl_io_pool **npool) { uint32_t i; union ctl_io *cur_io, *next_io; struct ctl_io_pool *pool; int retval; retval = 0; pool = (struct ctl_io_pool *)malloc(sizeof(*pool), M_CTL, M_NOWAIT | M_ZERO); if (pool == NULL) { retval = ENOMEM; goto bailout; } pool->type = pool_type; pool->ctl_softc = ctl_softc; mtx_lock(&ctl_softc->pool_lock); pool->id = ctl_softc->cur_pool_id++; mtx_unlock(&ctl_softc->pool_lock); pool->flags = CTL_POOL_FLAG_NONE; pool->refcount = 1; /* Reference for validity. */ STAILQ_INIT(&pool->free_queue); /* * XXX KDM other options here: * - allocate a page at a time * - allocate one big chunk of memory. * Page allocation might work well, but would take a little more * tracking. */ for (i = 0; i < total_ctl_io; i++) { cur_io = (union ctl_io *)malloc(sizeof(*cur_io), M_CTLIO, M_NOWAIT); if (cur_io == NULL) { retval = ENOMEM; break; } cur_io->io_hdr.pool = pool; STAILQ_INSERT_TAIL(&pool->free_queue, &cur_io->io_hdr, links); pool->total_ctl_io++; pool->free_ctl_io++; } if (retval != 0) { for (cur_io = (union ctl_io *)STAILQ_FIRST(&pool->free_queue); cur_io != NULL; cur_io = next_io) { next_io = (union ctl_io *)STAILQ_NEXT(&cur_io->io_hdr, links); STAILQ_REMOVE(&pool->free_queue, &cur_io->io_hdr, ctl_io_hdr, links); free(cur_io, M_CTLIO); } free(pool, M_CTL); goto bailout; } mtx_lock(&ctl_softc->pool_lock); ctl_softc->num_pools++; STAILQ_INSERT_TAIL(&ctl_softc->io_pools, pool, links); /* * Increment our usage count if this is an external consumer, so we * can't get unloaded until the external consumer (most likely a * FETD) unloads and frees his pool. * * XXX KDM will this increment the caller's module use count, or * mine? */ #if 0 if ((pool_type != CTL_POOL_EMERGENCY) && (pool_type != CTL_POOL_INTERNAL) && (pool_type != CTL_POOL_4OTHERSC)) MOD_INC_USE_COUNT; #endif mtx_unlock(&ctl_softc->pool_lock); *npool = pool; bailout: return (retval); } static int ctl_pool_acquire(struct ctl_io_pool *pool) { mtx_assert(&pool->ctl_softc->pool_lock, MA_OWNED); if (pool->flags & CTL_POOL_FLAG_INVALID) return (EINVAL); pool->refcount++; return (0); } static void ctl_pool_release(struct ctl_io_pool *pool) { struct ctl_softc *ctl_softc = pool->ctl_softc; union ctl_io *io; mtx_assert(&ctl_softc->pool_lock, MA_OWNED); if (--pool->refcount != 0) return; while ((io = (union ctl_io *)STAILQ_FIRST(&pool->free_queue)) != NULL) { STAILQ_REMOVE(&pool->free_queue, &io->io_hdr, ctl_io_hdr, links); free(io, M_CTLIO); } STAILQ_REMOVE(&ctl_softc->io_pools, pool, ctl_io_pool, links); ctl_softc->num_pools--; /* * XXX KDM will this decrement the caller's usage count or mine? */ #if 0 if ((pool->type != CTL_POOL_EMERGENCY) && (pool->type != CTL_POOL_INTERNAL) && (pool->type != CTL_POOL_4OTHERSC)) MOD_DEC_USE_COUNT; #endif free(pool, M_CTL); } void ctl_pool_free(struct ctl_io_pool *pool) { struct ctl_softc *ctl_softc; if (pool == NULL) return; ctl_softc = pool->ctl_softc; mtx_lock(&ctl_softc->pool_lock); pool->flags |= CTL_POOL_FLAG_INVALID; ctl_pool_release(pool); mtx_unlock(&ctl_softc->pool_lock); } /* * This routine does not block (except for spinlocks of course). * It tries to allocate a ctl_io union from the caller's pool as quickly as * possible. */ union ctl_io * ctl_alloc_io(void *pool_ref) { union ctl_io *io; struct ctl_softc *ctl_softc; struct ctl_io_pool *pool, *npool; struct ctl_io_pool *emergency_pool; pool = (struct ctl_io_pool *)pool_ref; if (pool == NULL) { printf("%s: pool is NULL\n", __func__); return (NULL); } emergency_pool = NULL; ctl_softc = pool->ctl_softc; mtx_lock(&ctl_softc->pool_lock); /* * First, try to get the io structure from the user's pool. */ if (ctl_pool_acquire(pool) == 0) { io = (union ctl_io *)STAILQ_FIRST(&pool->free_queue); if (io != NULL) { STAILQ_REMOVE_HEAD(&pool->free_queue, links); pool->total_allocated++; pool->free_ctl_io--; mtx_unlock(&ctl_softc->pool_lock); return (io); } else ctl_pool_release(pool); } /* * If he doesn't have any io structures left, search for an * emergency pool and grab one from there. */ STAILQ_FOREACH(npool, &ctl_softc->io_pools, links) { if (npool->type != CTL_POOL_EMERGENCY) continue; if (ctl_pool_acquire(npool) != 0) continue; emergency_pool = npool; io = (union ctl_io *)STAILQ_FIRST(&npool->free_queue); if (io != NULL) { STAILQ_REMOVE_HEAD(&npool->free_queue, links); npool->total_allocated++; npool->free_ctl_io--; mtx_unlock(&ctl_softc->pool_lock); return (io); } else ctl_pool_release(npool); } /* Drop the spinlock before we malloc */ mtx_unlock(&ctl_softc->pool_lock); /* * The emergency pool (if it exists) didn't have one, so try an * atomic (i.e. nonblocking) malloc and see if we get lucky. */ io = (union ctl_io *)malloc(sizeof(*io), M_CTLIO, M_NOWAIT); if (io != NULL) { /* * If the emergency pool exists but is empty, add this * ctl_io to its list when it gets freed. */ if (emergency_pool != NULL) { mtx_lock(&ctl_softc->pool_lock); if (ctl_pool_acquire(emergency_pool) == 0) { io->io_hdr.pool = emergency_pool; emergency_pool->total_ctl_io++; /* * Need to bump this, otherwise * total_allocated and total_freed won't * match when we no longer have anything * outstanding. */ emergency_pool->total_allocated++; } mtx_unlock(&ctl_softc->pool_lock); } else io->io_hdr.pool = NULL; } return (io); } void ctl_free_io(union ctl_io *io) { if (io == NULL) return; /* * If this ctl_io has a pool, return it to that pool. */ if (io->io_hdr.pool != NULL) { struct ctl_io_pool *pool; pool = (struct ctl_io_pool *)io->io_hdr.pool; mtx_lock(&pool->ctl_softc->pool_lock); io->io_hdr.io_type = 0xff; STAILQ_INSERT_TAIL(&pool->free_queue, &io->io_hdr, links); pool->total_freed++; pool->free_ctl_io++; ctl_pool_release(pool); mtx_unlock(&pool->ctl_softc->pool_lock); } else { /* * Otherwise, just free it. We probably malloced it and * the emergency pool wasn't available. */ free(io, M_CTLIO); } } void ctl_zero_io(union ctl_io *io) { void *pool_ref; if (io == NULL) return; /* * May need to preserve linked list pointers at some point too. */ pool_ref = io->io_hdr.pool; memset(io, 0, sizeof(*io)); io->io_hdr.pool = pool_ref; } /* * This routine is currently used for internal copies of ctl_ios that need * to persist for some reason after we've already returned status to the * FETD. (Thus the flag set.) * * XXX XXX * Note that this makes a blind copy of all fields in the ctl_io, except * for the pool reference. This includes any memory that has been * allocated! That memory will no longer be valid after done has been * called, so this would be VERY DANGEROUS for command that actually does * any reads or writes. Right now (11/7/2005), this is only used for immediate * start and stop commands, which don't transfer any data, so this is not a * problem. If it is used for anything else, the caller would also need to * allocate data buffer space and this routine would need to be modified to * copy the data buffer(s) as well. */ void ctl_copy_io(union ctl_io *src, union ctl_io *dest) { void *pool_ref; if ((src == NULL) || (dest == NULL)) return; /* * May need to preserve linked list pointers at some point too. */ pool_ref = dest->io_hdr.pool; memcpy(dest, src, ctl_min(sizeof(*src), sizeof(*dest))); dest->io_hdr.pool = pool_ref; /* * We need to know that this is an internal copy, and doesn't need * to get passed back to the FETD that allocated it. */ dest->io_hdr.flags |= CTL_FLAG_INT_COPY; } #ifdef NEEDTOPORT static void ctl_update_power_subpage(struct copan_power_subpage *page) { int num_luns, num_partitions, config_type; struct ctl_softc *softc; cs_BOOL_t aor_present, shelf_50pct_power; cs_raidset_personality_t rs_type; int max_active_luns; softc = control_softc; /* subtract out the processor LUN */ num_luns = softc->num_luns - 1; /* * Default to 7 LUNs active, which was the only number we allowed * in the past. */ max_active_luns = 7; num_partitions = config_GetRsPartitionInfo(); config_type = config_GetConfigType(); shelf_50pct_power = config_GetShelfPowerMode(); aor_present = config_IsAorRsPresent(); rs_type = ddb_GetRsRaidType(1); if ((rs_type != CS_RAIDSET_PERSONALITY_RAID5) && (rs_type != CS_RAIDSET_PERSONALITY_RAID1)) { EPRINT(0, "Unsupported RS type %d!", rs_type); } page->total_luns = num_luns; switch (config_type) { case 40: /* * In a 40 drive configuration, it doesn't matter what DC * cards we have, whether we have AOR enabled or not, * partitioning or not, or what type of RAIDset we have. * In that scenario, we can power up every LUN we present * to the user. */ max_active_luns = num_luns; break; case 64: if (shelf_50pct_power == CS_FALSE) { /* 25% power */ if (aor_present == CS_TRUE) { if (rs_type == CS_RAIDSET_PERSONALITY_RAID5) { max_active_luns = 7; } else if (rs_type == CS_RAIDSET_PERSONALITY_RAID1){ max_active_luns = 14; } else { /* XXX KDM now what?? */ } } else { if (rs_type == CS_RAIDSET_PERSONALITY_RAID5) { max_active_luns = 8; } else if (rs_type == CS_RAIDSET_PERSONALITY_RAID1){ max_active_luns = 16; } else { /* XXX KDM now what?? */ } } } else { /* 50% power */ /* * With 50% power in a 64 drive configuration, we * can power all LUNs we present. */ max_active_luns = num_luns; } break; case 112: if (shelf_50pct_power == CS_FALSE) { /* 25% power */ if (aor_present == CS_TRUE) { if (rs_type == CS_RAIDSET_PERSONALITY_RAID5) { max_active_luns = 7; } else if (rs_type == CS_RAIDSET_PERSONALITY_RAID1){ max_active_luns = 14; } else { /* XXX KDM now what?? */ } } else { if (rs_type == CS_RAIDSET_PERSONALITY_RAID5) { max_active_luns = 8; } else if (rs_type == CS_RAIDSET_PERSONALITY_RAID1){ max_active_luns = 16; } else { /* XXX KDM now what?? */ } } } else { /* 50% power */ if (aor_present == CS_TRUE) { if (rs_type == CS_RAIDSET_PERSONALITY_RAID5) { max_active_luns = 14; } else if (rs_type == CS_RAIDSET_PERSONALITY_RAID1){ /* * We're assuming here that disk * caching is enabled, and so we're * able to power up half of each * LUN, and cache all writes. */ max_active_luns = num_luns; } else { /* XXX KDM now what?? */ } } else { if (rs_type == CS_RAIDSET_PERSONALITY_RAID5) { max_active_luns = 15; } else if (rs_type == CS_RAIDSET_PERSONALITY_RAID1){ max_active_luns = 30; } else { /* XXX KDM now what?? */ } } } break; default: /* * In this case, we have an unknown configuration, so we * just use the default from above. */ break; } page->max_active_luns = max_active_luns; #if 0 printk("%s: total_luns = %d, max_active_luns = %d\n", __func__, page->total_luns, page->max_active_luns); #endif } #endif /* NEEDTOPORT */ /* * This routine could be used in the future to load default and/or saved * mode page parameters for a particuar lun. */ static int ctl_init_page_index(struct ctl_lun *lun) { int i; struct ctl_page_index *page_index; struct ctl_softc *softc; memcpy(&lun->mode_pages.index, page_index_template, sizeof(page_index_template)); softc = lun->ctl_softc; for (i = 0; i < CTL_NUM_MODE_PAGES; i++) { page_index = &lun->mode_pages.index[i]; /* * If this is a disk-only mode page, there's no point in * setting it up. For some pages, we have to have some * basic information about the disk in order to calculate the * mode page data. */ if ((lun->be_lun->lun_type != T_DIRECT) && (page_index->page_flags & CTL_PAGE_FLAG_DISK_ONLY)) continue; switch (page_index->page_code & SMPH_PC_MASK) { case SMS_FORMAT_DEVICE_PAGE: { struct scsi_format_page *format_page; if (page_index->subpage != SMS_SUBPAGE_PAGE_0) panic("subpage is incorrect!"); /* * Sectors per track are set above. Bytes per * sector need to be set here on a per-LUN basis. */ memcpy(&lun->mode_pages.format_page[CTL_PAGE_CURRENT], &format_page_default, sizeof(format_page_default)); memcpy(&lun->mode_pages.format_page[ CTL_PAGE_CHANGEABLE], &format_page_changeable, sizeof(format_page_changeable)); memcpy(&lun->mode_pages.format_page[CTL_PAGE_DEFAULT], &format_page_default, sizeof(format_page_default)); memcpy(&lun->mode_pages.format_page[CTL_PAGE_SAVED], &format_page_default, sizeof(format_page_default)); format_page = &lun->mode_pages.format_page[ CTL_PAGE_CURRENT]; scsi_ulto2b(lun->be_lun->blocksize, format_page->bytes_per_sector); format_page = &lun->mode_pages.format_page[ CTL_PAGE_DEFAULT]; scsi_ulto2b(lun->be_lun->blocksize, format_page->bytes_per_sector); format_page = &lun->mode_pages.format_page[ CTL_PAGE_SAVED]; scsi_ulto2b(lun->be_lun->blocksize, format_page->bytes_per_sector); page_index->page_data = (uint8_t *)lun->mode_pages.format_page; break; } case SMS_RIGID_DISK_PAGE: { struct scsi_rigid_disk_page *rigid_disk_page; uint32_t sectors_per_cylinder; uint64_t cylinders; #ifndef __XSCALE__ int shift; #endif /* !__XSCALE__ */ if (page_index->subpage != SMS_SUBPAGE_PAGE_0) panic("invalid subpage value %d", page_index->subpage); /* * Rotation rate and sectors per track are set * above. We calculate the cylinders here based on * capacity. Due to the number of heads and * sectors per track we're using, smaller arrays * may turn out to have 0 cylinders. Linux and * FreeBSD don't pay attention to these mode pages * to figure out capacity, but Solaris does. It * seems to deal with 0 cylinders just fine, and * works out a fake geometry based on the capacity. */ memcpy(&lun->mode_pages.rigid_disk_page[ CTL_PAGE_CURRENT], &rigid_disk_page_default, sizeof(rigid_disk_page_default)); memcpy(&lun->mode_pages.rigid_disk_page[ CTL_PAGE_CHANGEABLE],&rigid_disk_page_changeable, sizeof(rigid_disk_page_changeable)); memcpy(&lun->mode_pages.rigid_disk_page[ CTL_PAGE_DEFAULT], &rigid_disk_page_default, sizeof(rigid_disk_page_default)); memcpy(&lun->mode_pages.rigid_disk_page[ CTL_PAGE_SAVED], &rigid_disk_page_default, sizeof(rigid_disk_page_default)); sectors_per_cylinder = CTL_DEFAULT_SECTORS_PER_TRACK * CTL_DEFAULT_HEADS; /* * The divide method here will be more accurate, * probably, but results in floating point being * used in the kernel on i386 (__udivdi3()). On the * XScale, though, __udivdi3() is implemented in * software. * * The shift method for cylinder calculation is * accurate if sectors_per_cylinder is a power of * 2. Otherwise it might be slightly off -- you * might have a bit of a truncation problem. */ #ifdef __XSCALE__ cylinders = (lun->be_lun->maxlba + 1) / sectors_per_cylinder; #else for (shift = 31; shift > 0; shift--) { if (sectors_per_cylinder & (1 << shift)) break; } cylinders = (lun->be_lun->maxlba + 1) >> shift; #endif /* * We've basically got 3 bytes, or 24 bits for the * cylinder size in the mode page. If we're over, * just round down to 2^24. */ if (cylinders > 0xffffff) cylinders = 0xffffff; rigid_disk_page = &lun->mode_pages.rigid_disk_page[ CTL_PAGE_CURRENT]; scsi_ulto3b(cylinders, rigid_disk_page->cylinders); rigid_disk_page = &lun->mode_pages.rigid_disk_page[ CTL_PAGE_DEFAULT]; scsi_ulto3b(cylinders, rigid_disk_page->cylinders); rigid_disk_page = &lun->mode_pages.rigid_disk_page[ CTL_PAGE_SAVED]; scsi_ulto3b(cylinders, rigid_disk_page->cylinders); page_index->page_data = (uint8_t *)lun->mode_pages.rigid_disk_page; break; } case SMS_CACHING_PAGE: { if (page_index->subpage != SMS_SUBPAGE_PAGE_0) panic("invalid subpage value %d", page_index->subpage); /* * Defaults should be okay here, no calculations * needed. */ memcpy(&lun->mode_pages.caching_page[CTL_PAGE_CURRENT], &caching_page_default, sizeof(caching_page_default)); memcpy(&lun->mode_pages.caching_page[ CTL_PAGE_CHANGEABLE], &caching_page_changeable, sizeof(caching_page_changeable)); memcpy(&lun->mode_pages.caching_page[CTL_PAGE_DEFAULT], &caching_page_default, sizeof(caching_page_default)); memcpy(&lun->mode_pages.caching_page[CTL_PAGE_SAVED], &caching_page_default, sizeof(caching_page_default)); page_index->page_data = (uint8_t *)lun->mode_pages.caching_page; break; } case SMS_CONTROL_MODE_PAGE: { if (page_index->subpage != SMS_SUBPAGE_PAGE_0) panic("invalid subpage value %d", page_index->subpage); /* * Defaults should be okay here, no calculations * needed. */ memcpy(&lun->mode_pages.control_page[CTL_PAGE_CURRENT], &control_page_default, sizeof(control_page_default)); memcpy(&lun->mode_pages.control_page[ CTL_PAGE_CHANGEABLE], &control_page_changeable, sizeof(control_page_changeable)); memcpy(&lun->mode_pages.control_page[CTL_PAGE_DEFAULT], &control_page_default, sizeof(control_page_default)); memcpy(&lun->mode_pages.control_page[CTL_PAGE_SAVED], &control_page_default, sizeof(control_page_default)); page_index->page_data = (uint8_t *)lun->mode_pages.control_page; break; } case SMS_VENDOR_SPECIFIC_PAGE:{ switch (page_index->subpage) { case PWR_SUBPAGE_CODE: { struct copan_power_subpage *current_page, *saved_page; memcpy(&lun->mode_pages.power_subpage[ CTL_PAGE_CURRENT], &power_page_default, sizeof(power_page_default)); memcpy(&lun->mode_pages.power_subpage[ CTL_PAGE_CHANGEABLE], &power_page_changeable, sizeof(power_page_changeable)); memcpy(&lun->mode_pages.power_subpage[ CTL_PAGE_DEFAULT], &power_page_default, sizeof(power_page_default)); memcpy(&lun->mode_pages.power_subpage[ CTL_PAGE_SAVED], &power_page_default, sizeof(power_page_default)); page_index->page_data = (uint8_t *)lun->mode_pages.power_subpage; current_page = (struct copan_power_subpage *) (page_index->page_data + (page_index->page_len * CTL_PAGE_CURRENT)); saved_page = (struct copan_power_subpage *) (page_index->page_data + (page_index->page_len * CTL_PAGE_SAVED)); break; } case APS_SUBPAGE_CODE: { struct copan_aps_subpage *current_page, *saved_page; // This gets set multiple times but // it should always be the same. It's // only done during init so who cares. index_to_aps_page = i; memcpy(&lun->mode_pages.aps_subpage[ CTL_PAGE_CURRENT], &aps_page_default, sizeof(aps_page_default)); memcpy(&lun->mode_pages.aps_subpage[ CTL_PAGE_CHANGEABLE], &aps_page_changeable, sizeof(aps_page_changeable)); memcpy(&lun->mode_pages.aps_subpage[ CTL_PAGE_DEFAULT], &aps_page_default, sizeof(aps_page_default)); memcpy(&lun->mode_pages.aps_subpage[ CTL_PAGE_SAVED], &aps_page_default, sizeof(aps_page_default)); page_index->page_data = (uint8_t *)lun->mode_pages.aps_subpage; current_page = (struct copan_aps_subpage *) (page_index->page_data + (page_index->page_len * CTL_PAGE_CURRENT)); saved_page = (struct copan_aps_subpage *) (page_index->page_data + (page_index->page_len * CTL_PAGE_SAVED)); break; } case DBGCNF_SUBPAGE_CODE: { struct copan_debugconf_subpage *current_page, *saved_page; memcpy(&lun->mode_pages.debugconf_subpage[ CTL_PAGE_CURRENT], &debugconf_page_default, sizeof(debugconf_page_default)); memcpy(&lun->mode_pages.debugconf_subpage[ CTL_PAGE_CHANGEABLE], &debugconf_page_changeable, sizeof(debugconf_page_changeable)); memcpy(&lun->mode_pages.debugconf_subpage[ CTL_PAGE_DEFAULT], &debugconf_page_default, sizeof(debugconf_page_default)); memcpy(&lun->mode_pages.debugconf_subpage[ CTL_PAGE_SAVED], &debugconf_page_default, sizeof(debugconf_page_default)); page_index->page_data = (uint8_t *)lun->mode_pages.debugconf_subpage; current_page = (struct copan_debugconf_subpage *) (page_index->page_data + (page_index->page_len * CTL_PAGE_CURRENT)); saved_page = (struct copan_debugconf_subpage *) (page_index->page_data + (page_index->page_len * CTL_PAGE_SAVED)); break; } default: panic("invalid subpage value %d", page_index->subpage); break; } break; } default: panic("invalid page value %d", page_index->page_code & SMPH_PC_MASK); break; } } return (CTL_RETVAL_COMPLETE); } /* * LUN allocation. * * Requirements: * - caller allocates and zeros LUN storage, or passes in a NULL LUN if he * wants us to allocate the LUN and he can block. * - ctl_softc is always set * - be_lun is set if the LUN has a backend (needed for disk LUNs) * * Returns 0 for success, non-zero (errno) for failure. */ static int ctl_alloc_lun(struct ctl_softc *ctl_softc, struct ctl_lun *ctl_lun, struct ctl_be_lun *const be_lun, struct ctl_id target_id) { struct ctl_lun *nlun, *lun; struct ctl_port *port; struct scsi_vpd_id_descriptor *desc; struct scsi_vpd_id_t10 *t10id; const char *eui, *naa, *scsiname, *vendor; int lun_number, i, lun_malloced; int devidlen, idlen1, idlen2 = 0, len; if (be_lun == NULL) return (EINVAL); /* * We currently only support Direct Access or Processor LUN types. */ switch (be_lun->lun_type) { case T_DIRECT: break; case T_PROCESSOR: break; case T_SEQUENTIAL: case T_CHANGER: default: be_lun->lun_config_status(be_lun->be_lun, CTL_LUN_CONFIG_FAILURE); break; } if (ctl_lun == NULL) { lun = malloc(sizeof(*lun), M_CTL, M_WAITOK); lun_malloced = 1; } else { lun_malloced = 0; lun = ctl_lun; } memset(lun, 0, sizeof(*lun)); if (lun_malloced) lun->flags = CTL_LUN_MALLOCED; /* Generate LUN ID. */ devidlen = max(CTL_DEVID_MIN_LEN, strnlen(be_lun->device_id, CTL_DEVID_LEN)); idlen1 = sizeof(*t10id) + devidlen; len = sizeof(struct scsi_vpd_id_descriptor) + idlen1; scsiname = ctl_get_opt(&be_lun->options, "scsiname"); if (scsiname != NULL) { idlen2 = roundup2(strlen(scsiname) + 1, 4); len += sizeof(struct scsi_vpd_id_descriptor) + idlen2; } eui = ctl_get_opt(&be_lun->options, "eui"); if (eui != NULL) { len += sizeof(struct scsi_vpd_id_descriptor) + 8; } naa = ctl_get_opt(&be_lun->options, "naa"); if (naa != NULL) { len += sizeof(struct scsi_vpd_id_descriptor) + 8; } lun->lun_devid = malloc(sizeof(struct ctl_devid) + len, M_CTL, M_WAITOK | M_ZERO); lun->lun_devid->len = len; desc = (struct scsi_vpd_id_descriptor *)lun->lun_devid->data; desc->proto_codeset = SVPD_ID_CODESET_ASCII; desc->id_type = SVPD_ID_PIV | SVPD_ID_ASSOC_LUN | SVPD_ID_TYPE_T10; desc->length = idlen1; t10id = (struct scsi_vpd_id_t10 *)&desc->identifier[0]; memset(t10id->vendor, ' ', sizeof(t10id->vendor)); if ((vendor = ctl_get_opt(&be_lun->options, "vendor")) == NULL) { strncpy((char *)t10id->vendor, CTL_VENDOR, sizeof(t10id->vendor)); } else { strncpy(t10id->vendor, vendor, min(sizeof(t10id->vendor), strlen(vendor))); } strncpy((char *)t10id->vendor_spec_id, (char *)be_lun->device_id, devidlen); if (scsiname != NULL) { desc = (struct scsi_vpd_id_descriptor *)(&desc->identifier[0] + desc->length); desc->proto_codeset = SVPD_ID_CODESET_UTF8; desc->id_type = SVPD_ID_PIV | SVPD_ID_ASSOC_LUN | SVPD_ID_TYPE_SCSI_NAME; desc->length = idlen2; strlcpy(desc->identifier, scsiname, idlen2); } if (eui != NULL) { desc = (struct scsi_vpd_id_descriptor *)(&desc->identifier[0] + desc->length); desc->proto_codeset = SVPD_ID_CODESET_BINARY; desc->id_type = SVPD_ID_PIV | SVPD_ID_ASSOC_LUN | SVPD_ID_TYPE_EUI64; desc->length = 8; scsi_u64to8b(strtouq(eui, NULL, 0), desc->identifier); } if (naa != NULL) { desc = (struct scsi_vpd_id_descriptor *)(&desc->identifier[0] + desc->length); desc->proto_codeset = SVPD_ID_CODESET_BINARY; desc->id_type = SVPD_ID_PIV | SVPD_ID_ASSOC_LUN | SVPD_ID_TYPE_NAA; desc->length = 8; scsi_u64to8b(strtouq(naa, NULL, 0), desc->identifier); } mtx_lock(&ctl_softc->ctl_lock); /* * See if the caller requested a particular LUN number. If so, see * if it is available. Otherwise, allocate the first available LUN. */ if (be_lun->flags & CTL_LUN_FLAG_ID_REQ) { if ((be_lun->req_lun_id > (CTL_MAX_LUNS - 1)) || (ctl_is_set(ctl_softc->ctl_lun_mask, be_lun->req_lun_id))) { mtx_unlock(&ctl_softc->ctl_lock); if (be_lun->req_lun_id > (CTL_MAX_LUNS - 1)) { printf("ctl: requested LUN ID %d is higher " "than CTL_MAX_LUNS - 1 (%d)\n", be_lun->req_lun_id, CTL_MAX_LUNS - 1); } else { /* * XXX KDM return an error, or just assign * another LUN ID in this case?? */ printf("ctl: requested LUN ID %d is already " "in use\n", be_lun->req_lun_id); } if (lun->flags & CTL_LUN_MALLOCED) free(lun, M_CTL); be_lun->lun_config_status(be_lun->be_lun, CTL_LUN_CONFIG_FAILURE); return (ENOSPC); } lun_number = be_lun->req_lun_id; } else { lun_number = ctl_ffz(ctl_softc->ctl_lun_mask, CTL_MAX_LUNS); if (lun_number == -1) { mtx_unlock(&ctl_softc->ctl_lock); printf("ctl: can't allocate LUN on target %ju, out of " "LUNs\n", (uintmax_t)target_id.id); if (lun->flags & CTL_LUN_MALLOCED) free(lun, M_CTL); be_lun->lun_config_status(be_lun->be_lun, CTL_LUN_CONFIG_FAILURE); return (ENOSPC); } } ctl_set_mask(ctl_softc->ctl_lun_mask, lun_number); mtx_init(&lun->lun_lock, "CTL LUN", NULL, MTX_DEF); lun->target = target_id; lun->lun = lun_number; lun->be_lun = be_lun; /* * The processor LUN is always enabled. Disk LUNs come on line * disabled, and must be enabled by the backend. */ lun->flags |= CTL_LUN_DISABLED; lun->backend = be_lun->be; be_lun->ctl_lun = lun; be_lun->lun_id = lun_number; atomic_add_int(&be_lun->be->num_luns, 1); if (be_lun->flags & CTL_LUN_FLAG_POWERED_OFF) lun->flags |= CTL_LUN_STOPPED; if (be_lun->flags & CTL_LUN_FLAG_INOPERABLE) lun->flags |= CTL_LUN_INOPERABLE; if (be_lun->flags & CTL_LUN_FLAG_PRIMARY) lun->flags |= CTL_LUN_PRIMARY_SC; lun->ctl_softc = ctl_softc; TAILQ_INIT(&lun->ooa_queue); TAILQ_INIT(&lun->blocked_queue); STAILQ_INIT(&lun->error_list); ctl_tpc_lun_init(lun); /* * Initialize the mode page index. */ ctl_init_page_index(lun); /* * Set the poweron UA for all initiators on this LUN only. */ for (i = 0; i < CTL_MAX_INITIATORS; i++) lun->pending_ua[i] = CTL_UA_POWERON; /* * Now, before we insert this lun on the lun list, set the lun * inventory changed UA for all other luns. */ STAILQ_FOREACH(nlun, &ctl_softc->lun_list, links) { for (i = 0; i < CTL_MAX_INITIATORS; i++) { nlun->pending_ua[i] |= CTL_UA_LUN_CHANGE; } } STAILQ_INSERT_TAIL(&ctl_softc->lun_list, lun, links); ctl_softc->ctl_luns[lun_number] = lun; ctl_softc->num_luns++; /* Setup statistics gathering */ lun->stats.device_type = be_lun->lun_type; lun->stats.lun_number = lun_number; if (lun->stats.device_type == T_DIRECT) lun->stats.blocksize = be_lun->blocksize; else lun->stats.flags = CTL_LUN_STATS_NO_BLOCKSIZE; for (i = 0;i < CTL_MAX_PORTS;i++) lun->stats.ports[i].targ_port = i; mtx_unlock(&ctl_softc->ctl_lock); lun->be_lun->lun_config_status(lun->be_lun->be_lun, CTL_LUN_CONFIG_OK); /* * Run through each registered FETD and bring it online if it isn't * already. Enable the target ID if it hasn't been enabled, and * enable this particular LUN. */ STAILQ_FOREACH(port, &ctl_softc->port_list, links) { int retval; retval = port->lun_enable(port->targ_lun_arg, target_id,lun_number); if (retval != 0) { printf("ctl_alloc_lun: FETD %s port %d returned error " "%d for lun_enable on target %ju lun %d\n", port->port_name, port->targ_port, retval, (uintmax_t)target_id.id, lun_number); } else port->status |= CTL_PORT_STATUS_LUN_ONLINE; } return (0); } /* * Delete a LUN. * Assumptions: * - LUN has already been marked invalid and any pending I/O has been taken * care of. */ static int ctl_free_lun(struct ctl_lun *lun) { struct ctl_softc *softc; #if 0 struct ctl_port *port; #endif struct ctl_lun *nlun; int i; softc = lun->ctl_softc; mtx_assert(&softc->ctl_lock, MA_OWNED); STAILQ_REMOVE(&softc->lun_list, lun, ctl_lun, links); ctl_clear_mask(softc->ctl_lun_mask, lun->lun); softc->ctl_luns[lun->lun] = NULL; if (!TAILQ_EMPTY(&lun->ooa_queue)) panic("Freeing a LUN %p with outstanding I/O!!\n", lun); softc->num_luns--; /* * XXX KDM this scheme only works for a single target/multiple LUN * setup. It needs to be revamped for a multiple target scheme. * * XXX KDM this results in port->lun_disable() getting called twice, * once when ctl_disable_lun() is called, and a second time here. * We really need to re-think the LUN disable semantics. There * should probably be several steps/levels to LUN removal: * - disable * - invalidate * - free * * Right now we only have a disable method when communicating to * the front end ports, at least for individual LUNs. */ #if 0 STAILQ_FOREACH(port, &softc->port_list, links) { int retval; retval = port->lun_disable(port->targ_lun_arg, lun->target, lun->lun); if (retval != 0) { printf("ctl_free_lun: FETD %s port %d returned error " "%d for lun_disable on target %ju lun %jd\n", port->port_name, port->targ_port, retval, (uintmax_t)lun->target.id, (intmax_t)lun->lun); } if (STAILQ_FIRST(&softc->lun_list) == NULL) { port->status &= ~CTL_PORT_STATUS_LUN_ONLINE; retval = port->targ_disable(port->targ_lun_arg,lun->target); if (retval != 0) { printf("ctl_free_lun: FETD %s port %d " "returned error %d for targ_disable on " "target %ju\n", port->port_name, port->targ_port, retval, (uintmax_t)lun->target.id); } else port->status &= ~CTL_PORT_STATUS_TARG_ONLINE; if ((port->status & CTL_PORT_STATUS_TARG_ONLINE) != 0) continue; #if 0 port->port_offline(port->onoff_arg); port->status &= ~CTL_PORT_STATUS_ONLINE; #endif } } #endif /* * Tell the backend to free resources, if this LUN has a backend. */ atomic_subtract_int(&lun->be_lun->be->num_luns, 1); lun->be_lun->lun_shutdown(lun->be_lun->be_lun); ctl_tpc_lun_shutdown(lun); mtx_destroy(&lun->lun_lock); free(lun->lun_devid, M_CTL); if (lun->flags & CTL_LUN_MALLOCED) free(lun, M_CTL); STAILQ_FOREACH(nlun, &softc->lun_list, links) { for (i = 0; i < CTL_MAX_INITIATORS; i++) { nlun->pending_ua[i] |= CTL_UA_LUN_CHANGE; } } return (0); } static void ctl_create_lun(struct ctl_be_lun *be_lun) { struct ctl_softc *ctl_softc; ctl_softc = control_softc; /* * ctl_alloc_lun() should handle all potential failure cases. */ ctl_alloc_lun(ctl_softc, NULL, be_lun, ctl_softc->target); } int ctl_add_lun(struct ctl_be_lun *be_lun) { struct ctl_softc *ctl_softc = control_softc; mtx_lock(&ctl_softc->ctl_lock); STAILQ_INSERT_TAIL(&ctl_softc->pending_lun_queue, be_lun, links); mtx_unlock(&ctl_softc->ctl_lock); wakeup(&ctl_softc->pending_lun_queue); return (0); } int ctl_enable_lun(struct ctl_be_lun *be_lun) { struct ctl_softc *ctl_softc; struct ctl_port *port, *nport; struct ctl_lun *lun; int retval; ctl_softc = control_softc; lun = (struct ctl_lun *)be_lun->ctl_lun; mtx_lock(&ctl_softc->ctl_lock); mtx_lock(&lun->lun_lock); if ((lun->flags & CTL_LUN_DISABLED) == 0) { /* * eh? Why did we get called if the LUN is already * enabled? */ mtx_unlock(&lun->lun_lock); mtx_unlock(&ctl_softc->ctl_lock); return (0); } lun->flags &= ~CTL_LUN_DISABLED; mtx_unlock(&lun->lun_lock); for (port = STAILQ_FIRST(&ctl_softc->port_list); port != NULL; port = nport) { nport = STAILQ_NEXT(port, links); /* * Drop the lock while we call the FETD's enable routine. * This can lead to a callback into CTL (at least in the * case of the internal initiator frontend. */ mtx_unlock(&ctl_softc->ctl_lock); retval = port->lun_enable(port->targ_lun_arg, lun->target,lun->lun); mtx_lock(&ctl_softc->ctl_lock); if (retval != 0) { printf("%s: FETD %s port %d returned error " "%d for lun_enable on target %ju lun %jd\n", __func__, port->port_name, port->targ_port, retval, (uintmax_t)lun->target.id, (intmax_t)lun->lun); } #if 0 else { /* NOTE: TODO: why does lun enable affect port status? */ port->status |= CTL_PORT_STATUS_LUN_ONLINE; } #endif } mtx_unlock(&ctl_softc->ctl_lock); return (0); } int ctl_disable_lun(struct ctl_be_lun *be_lun) { struct ctl_softc *ctl_softc; struct ctl_port *port; struct ctl_lun *lun; int retval; ctl_softc = control_softc; lun = (struct ctl_lun *)be_lun->ctl_lun; mtx_lock(&ctl_softc->ctl_lock); mtx_lock(&lun->lun_lock); if (lun->flags & CTL_LUN_DISABLED) { mtx_unlock(&lun->lun_lock); mtx_unlock(&ctl_softc->ctl_lock); return (0); } lun->flags |= CTL_LUN_DISABLED; mtx_unlock(&lun->lun_lock); STAILQ_FOREACH(port, &ctl_softc->port_list, links) { mtx_unlock(&ctl_softc->ctl_lock); /* * Drop the lock before we call the frontend's disable * routine, to avoid lock order reversals. * * XXX KDM what happens if the frontend list changes while * we're traversing it? It's unlikely, but should be handled. */ retval = port->lun_disable(port->targ_lun_arg, lun->target, lun->lun); mtx_lock(&ctl_softc->ctl_lock); if (retval != 0) { printf("ctl_alloc_lun: FETD %s port %d returned error " "%d for lun_disable on target %ju lun %jd\n", port->port_name, port->targ_port, retval, (uintmax_t)lun->target.id, (intmax_t)lun->lun); } } mtx_unlock(&ctl_softc->ctl_lock); return (0); } int ctl_start_lun(struct ctl_be_lun *be_lun) { struct ctl_softc *ctl_softc; struct ctl_lun *lun; ctl_softc = control_softc; lun = (struct ctl_lun *)be_lun->ctl_lun; mtx_lock(&lun->lun_lock); lun->flags &= ~CTL_LUN_STOPPED; mtx_unlock(&lun->lun_lock); return (0); } int ctl_stop_lun(struct ctl_be_lun *be_lun) { struct ctl_softc *ctl_softc; struct ctl_lun *lun; ctl_softc = control_softc; lun = (struct ctl_lun *)be_lun->ctl_lun; mtx_lock(&lun->lun_lock); lun->flags |= CTL_LUN_STOPPED; mtx_unlock(&lun->lun_lock); return (0); } int ctl_lun_offline(struct ctl_be_lun *be_lun) { struct ctl_softc *ctl_softc; struct ctl_lun *lun; ctl_softc = control_softc; lun = (struct ctl_lun *)be_lun->ctl_lun; mtx_lock(&lun->lun_lock); lun->flags |= CTL_LUN_OFFLINE; mtx_unlock(&lun->lun_lock); return (0); } int ctl_lun_online(struct ctl_be_lun *be_lun) { struct ctl_softc *ctl_softc; struct ctl_lun *lun; ctl_softc = control_softc; lun = (struct ctl_lun *)be_lun->ctl_lun; mtx_lock(&lun->lun_lock); lun->flags &= ~CTL_LUN_OFFLINE; mtx_unlock(&lun->lun_lock); return (0); } int ctl_invalidate_lun(struct ctl_be_lun *be_lun) { struct ctl_softc *ctl_softc; struct ctl_lun *lun; ctl_softc = control_softc; lun = (struct ctl_lun *)be_lun->ctl_lun; mtx_lock(&lun->lun_lock); /* * The LUN needs to be disabled before it can be marked invalid. */ if ((lun->flags & CTL_LUN_DISABLED) == 0) { mtx_unlock(&lun->lun_lock); return (-1); } /* * Mark the LUN invalid. */ lun->flags |= CTL_LUN_INVALID; /* * If there is nothing in the OOA queue, go ahead and free the LUN. * If we have something in the OOA queue, we'll free it when the * last I/O completes. */ if (TAILQ_EMPTY(&lun->ooa_queue)) { mtx_unlock(&lun->lun_lock); mtx_lock(&ctl_softc->ctl_lock); ctl_free_lun(lun); mtx_unlock(&ctl_softc->ctl_lock); } else mtx_unlock(&lun->lun_lock); return (0); } int ctl_lun_inoperable(struct ctl_be_lun *be_lun) { struct ctl_softc *ctl_softc; struct ctl_lun *lun; ctl_softc = control_softc; lun = (struct ctl_lun *)be_lun->ctl_lun; mtx_lock(&lun->lun_lock); lun->flags |= CTL_LUN_INOPERABLE; mtx_unlock(&lun->lun_lock); return (0); } int ctl_lun_operable(struct ctl_be_lun *be_lun) { struct ctl_softc *ctl_softc; struct ctl_lun *lun; ctl_softc = control_softc; lun = (struct ctl_lun *)be_lun->ctl_lun; mtx_lock(&lun->lun_lock); lun->flags &= ~CTL_LUN_INOPERABLE; mtx_unlock(&lun->lun_lock); return (0); } int ctl_lun_power_lock(struct ctl_be_lun *be_lun, struct ctl_nexus *nexus, int lock) { struct ctl_softc *softc; struct ctl_lun *lun; struct copan_aps_subpage *current_sp; struct ctl_page_index *page_index; int i; softc = control_softc; mtx_lock(&softc->ctl_lock); lun = (struct ctl_lun *)be_lun->ctl_lun; mtx_lock(&lun->lun_lock); page_index = NULL; for (i = 0; i < CTL_NUM_MODE_PAGES; i++) { if ((lun->mode_pages.index[i].page_code & SMPH_PC_MASK) != APS_PAGE_CODE) continue; if (lun->mode_pages.index[i].subpage != APS_SUBPAGE_CODE) continue; page_index = &lun->mode_pages.index[i]; } if (page_index == NULL) { mtx_unlock(&lun->lun_lock); mtx_unlock(&softc->ctl_lock); printf("%s: APS subpage not found for lun %ju!\n", __func__, (uintmax_t)lun->lun); return (1); } #if 0 if ((softc->aps_locked_lun != 0) && (softc->aps_locked_lun != lun->lun)) { printf("%s: attempt to lock LUN %llu when %llu is already " "locked\n"); mtx_unlock(&lun->lun_lock); mtx_unlock(&softc->ctl_lock); return (1); } #endif current_sp = (struct copan_aps_subpage *)(page_index->page_data + (page_index->page_len * CTL_PAGE_CURRENT)); if (lock != 0) { current_sp->lock_active = APS_LOCK_ACTIVE; softc->aps_locked_lun = lun->lun; } else { current_sp->lock_active = 0; softc->aps_locked_lun = 0; } /* * If we're in HA mode, try to send the lock message to the other * side. */ if (ctl_is_single == 0) { int isc_retval; union ctl_ha_msg lock_msg; lock_msg.hdr.nexus = *nexus; lock_msg.hdr.msg_type = CTL_MSG_APS_LOCK; if (lock != 0) lock_msg.aps.lock_flag = 1; else lock_msg.aps.lock_flag = 0; isc_retval = ctl_ha_msg_send(CTL_HA_CHAN_CTL, &lock_msg, sizeof(lock_msg), 0); if (isc_retval > CTL_HA_STATUS_SUCCESS) { printf("%s: APS (lock=%d) error returned from " "ctl_ha_msg_send: %d\n", __func__, lock, isc_retval); mtx_unlock(&lun->lun_lock); mtx_unlock(&softc->ctl_lock); return (1); } } mtx_unlock(&lun->lun_lock); mtx_unlock(&softc->ctl_lock); return (0); } void ctl_lun_capacity_changed(struct ctl_be_lun *be_lun) { struct ctl_lun *lun; struct ctl_softc *softc; int i; softc = control_softc; lun = (struct ctl_lun *)be_lun->ctl_lun; mtx_lock(&lun->lun_lock); for (i = 0; i < CTL_MAX_INITIATORS; i++) lun->pending_ua[i] |= CTL_UA_CAPACITY_CHANGED; mtx_unlock(&lun->lun_lock); } /* * Backend "memory move is complete" callback for requests that never * make it down to say RAIDCore's configuration code. */ int ctl_config_move_done(union ctl_io *io) { int retval; retval = CTL_RETVAL_COMPLETE; CTL_DEBUG_PRINT(("ctl_config_move_done\n")); /* * XXX KDM this shouldn't happen, but what if it does? */ if (io->io_hdr.io_type != CTL_IO_SCSI) panic("I/O type isn't CTL_IO_SCSI!"); if ((io->io_hdr.port_status == 0) && ((io->io_hdr.flags & CTL_FLAG_ABORT) == 0) && ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE)) io->io_hdr.status = CTL_SUCCESS; else if ((io->io_hdr.port_status != 0) && ((io->io_hdr.flags & CTL_FLAG_ABORT) == 0) && ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE)){ /* * For hardware error sense keys, the sense key * specific value is defined to be a retry count, * but we use it to pass back an internal FETD * error code. XXX KDM Hopefully the FETD is only * using 16 bits for an error code, since that's * all the space we have in the sks field. */ ctl_set_internal_failure(&io->scsiio, /*sks_valid*/ 1, /*retry_count*/ io->io_hdr.port_status); if (io->io_hdr.flags & CTL_FLAG_ALLOCATED) free(io->scsiio.kern_data_ptr, M_CTL); ctl_done(io); goto bailout; } if (((io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_IN) || ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS) || ((io->io_hdr.flags & CTL_FLAG_ABORT) != 0)) { /* * XXX KDM just assuming a single pointer here, and not a * S/G list. If we start using S/G lists for config data, * we'll need to know how to clean them up here as well. */ if (io->io_hdr.flags & CTL_FLAG_ALLOCATED) free(io->scsiio.kern_data_ptr, M_CTL); /* Hopefully the user has already set the status... */ ctl_done(io); } else { /* * XXX KDM now we need to continue data movement. Some * options: * - call ctl_scsiio() again? We don't do this for data * writes, because for those at least we know ahead of * time where the write will go and how long it is. For * config writes, though, that information is largely * contained within the write itself, thus we need to * parse out the data again. * * - Call some other function once the data is in? */ /* * XXX KDM call ctl_scsiio() again for now, and check flag * bits to see whether we're allocated or not. */ retval = ctl_scsiio(&io->scsiio); } bailout: return (retval); } /* * This gets called by a backend driver when it is done with a * data_submit method. */ void ctl_data_submit_done(union ctl_io *io) { /* * If the IO_CONT flag is set, we need to call the supplied * function to continue processing the I/O, instead of completing * the I/O just yet. * * If there is an error, though, we don't want to keep processing. * Instead, just send status back to the initiator. */ if ((io->io_hdr.flags & CTL_FLAG_IO_CONT) && (io->io_hdr.flags & CTL_FLAG_ABORT) == 0 && ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE || (io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS)) { io->scsiio.io_cont(io); return; } ctl_done(io); } /* * This gets called by a backend driver when it is done with a * configuration write. */ void ctl_config_write_done(union ctl_io *io) { /* * If the IO_CONT flag is set, we need to call the supplied * function to continue processing the I/O, instead of completing * the I/O just yet. * * If there is an error, though, we don't want to keep processing. * Instead, just send status back to the initiator. */ if ((io->io_hdr.flags & CTL_FLAG_IO_CONT) && (((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE) || ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS))) { io->scsiio.io_cont(io); return; } /* * Since a configuration write can be done for commands that actually * have data allocated, like write buffer, and commands that have * no data, like start/stop unit, we need to check here. */ if ((io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_OUT) free(io->scsiio.kern_data_ptr, M_CTL); ctl_done(io); } /* * SCSI release command. */ int ctl_scsi_release(struct ctl_scsiio *ctsio) { int length, longid, thirdparty_id, resv_id; struct ctl_softc *ctl_softc; struct ctl_lun *lun; length = 0; resv_id = 0; CTL_DEBUG_PRINT(("ctl_scsi_release\n")); lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; ctl_softc = control_softc; switch (ctsio->cdb[0]) { case RELEASE_10: { struct scsi_release_10 *cdb; cdb = (struct scsi_release_10 *)ctsio->cdb; if (cdb->byte2 & SR10_LONGID) longid = 1; else thirdparty_id = cdb->thirdparty_id; resv_id = cdb->resv_id; length = scsi_2btoul(cdb->length); break; } } /* * XXX KDM right now, we only support LUN reservation. We don't * support 3rd party reservations, or extent reservations, which * might actually need the parameter list. If we've gotten this * far, we've got a LUN reservation. Anything else got kicked out * above. So, according to SPC, ignore the length. */ length = 0; if (((ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) == 0) && (length > 0)) { ctsio->kern_data_ptr = malloc(length, M_CTL, M_WAITOK); ctsio->kern_data_len = length; ctsio->kern_total_len = length; ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } if (length > 0) thirdparty_id = scsi_8btou64(ctsio->kern_data_ptr); mtx_lock(&lun->lun_lock); /* * According to SPC, it is not an error for an intiator to attempt * to release a reservation on a LUN that isn't reserved, or that * is reserved by another initiator. The reservation can only be * released, though, by the initiator who made it or by one of * several reset type events. */ if (lun->flags & CTL_LUN_RESERVED) { if ((ctsio->io_hdr.nexus.initid.id == lun->rsv_nexus.initid.id) && (ctsio->io_hdr.nexus.targ_port == lun->rsv_nexus.targ_port) && (ctsio->io_hdr.nexus.targ_target.id == lun->rsv_nexus.targ_target.id)) { lun->flags &= ~CTL_LUN_RESERVED; } } mtx_unlock(&lun->lun_lock); ctsio->scsi_status = SCSI_STATUS_OK; ctsio->io_hdr.status = CTL_SUCCESS; if (ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) { free(ctsio->kern_data_ptr, M_CTL); ctsio->io_hdr.flags &= ~CTL_FLAG_ALLOCATED; } ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } int ctl_scsi_reserve(struct ctl_scsiio *ctsio) { int extent, thirdparty, longid; int resv_id, length; uint64_t thirdparty_id; struct ctl_softc *ctl_softc; struct ctl_lun *lun; extent = 0; thirdparty = 0; longid = 0; resv_id = 0; length = 0; thirdparty_id = 0; CTL_DEBUG_PRINT(("ctl_reserve\n")); lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; ctl_softc = control_softc; switch (ctsio->cdb[0]) { case RESERVE_10: { struct scsi_reserve_10 *cdb; cdb = (struct scsi_reserve_10 *)ctsio->cdb; if (cdb->byte2 & SR10_LONGID) longid = 1; else thirdparty_id = cdb->thirdparty_id; resv_id = cdb->resv_id; length = scsi_2btoul(cdb->length); break; } } /* * XXX KDM right now, we only support LUN reservation. We don't * support 3rd party reservations, or extent reservations, which * might actually need the parameter list. If we've gotten this * far, we've got a LUN reservation. Anything else got kicked out * above. So, according to SPC, ignore the length. */ length = 0; if (((ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) == 0) && (length > 0)) { ctsio->kern_data_ptr = malloc(length, M_CTL, M_WAITOK); ctsio->kern_data_len = length; ctsio->kern_total_len = length; ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } if (length > 0) thirdparty_id = scsi_8btou64(ctsio->kern_data_ptr); mtx_lock(&lun->lun_lock); if (lun->flags & CTL_LUN_RESERVED) { if ((ctsio->io_hdr.nexus.initid.id != lun->rsv_nexus.initid.id) || (ctsio->io_hdr.nexus.targ_port != lun->rsv_nexus.targ_port) || (ctsio->io_hdr.nexus.targ_target.id != lun->rsv_nexus.targ_target.id)) { ctsio->scsi_status = SCSI_STATUS_RESERV_CONFLICT; ctsio->io_hdr.status = CTL_SCSI_ERROR; goto bailout; } } lun->flags |= CTL_LUN_RESERVED; lun->rsv_nexus = ctsio->io_hdr.nexus; ctsio->scsi_status = SCSI_STATUS_OK; ctsio->io_hdr.status = CTL_SUCCESS; bailout: mtx_unlock(&lun->lun_lock); if (ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) { free(ctsio->kern_data_ptr, M_CTL); ctsio->io_hdr.flags &= ~CTL_FLAG_ALLOCATED; } ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } int ctl_start_stop(struct ctl_scsiio *ctsio) { struct scsi_start_stop_unit *cdb; struct ctl_lun *lun; struct ctl_softc *ctl_softc; int retval; CTL_DEBUG_PRINT(("ctl_start_stop\n")); lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; ctl_softc = control_softc; retval = 0; cdb = (struct scsi_start_stop_unit *)ctsio->cdb; /* * XXX KDM * We don't support the immediate bit on a stop unit. In order to * do that, we would need to code up a way to know that a stop is * pending, and hold off any new commands until it completes, one * way or another. Then we could accept or reject those commands * depending on its status. We would almost need to do the reverse * of what we do below for an immediate start -- return the copy of * the ctl_io to the FETD with status to send to the host (and to * free the copy!) and then free the original I/O once the stop * actually completes. That way, the OOA queue mechanism can work * to block commands that shouldn't proceed. Another alternative * would be to put the copy in the queue in place of the original, * and return the original back to the caller. That could be * slightly safer.. */ if ((cdb->byte2 & SSS_IMMED) && ((cdb->how & SSS_START) == 0)) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 1, /*bit_valid*/ 1, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } if ((lun->flags & CTL_LUN_PR_RESERVED) && ((cdb->how & SSS_START)==0)) { uint32_t residx; residx = ctl_get_resindex(&ctsio->io_hdr.nexus); if (!lun->per_res[residx].registered || (lun->pr_res_idx!=residx && lun->res_type < 4)) { ctl_set_reservation_conflict(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } } /* * If there is no backend on this device, we can't start or stop * it. In theory we shouldn't get any start/stop commands in the * first place at this level if the LUN doesn't have a backend. * That should get stopped by the command decode code. */ if (lun->backend == NULL) { ctl_set_invalid_opcode(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * XXX KDM Copan-specific offline behavior. * Figure out a reasonable way to port this? */ #ifdef NEEDTOPORT mtx_lock(&lun->lun_lock); if (((cdb->byte2 & SSS_ONOFFLINE) == 0) && (lun->flags & CTL_LUN_OFFLINE)) { /* * If the LUN is offline, and the on/offline bit isn't set, * reject the start or stop. Otherwise, let it through. */ mtx_unlock(&lun->lun_lock); ctl_set_lun_not_ready(ctsio); ctl_done((union ctl_io *)ctsio); } else { mtx_unlock(&lun->lun_lock); #endif /* NEEDTOPORT */ /* * This could be a start or a stop when we're online, * or a stop/offline or start/online. A start or stop when * we're offline is covered in the case above. */ /* * In the non-immediate case, we send the request to * the backend and return status to the user when * it is done. * * In the immediate case, we allocate a new ctl_io * to hold a copy of the request, and send that to * the backend. We then set good status on the * user's request and return it immediately. */ if (cdb->byte2 & SSS_IMMED) { union ctl_io *new_io; new_io = ctl_alloc_io(ctsio->io_hdr.pool); if (new_io == NULL) { ctl_set_busy(ctsio); ctl_done((union ctl_io *)ctsio); } else { ctl_copy_io((union ctl_io *)ctsio, new_io); retval = lun->backend->config_write(new_io); ctl_set_success(ctsio); ctl_done((union ctl_io *)ctsio); } } else { retval = lun->backend->config_write( (union ctl_io *)ctsio); } #ifdef NEEDTOPORT } #endif return (retval); } /* * We support the SYNCHRONIZE CACHE command (10 and 16 byte versions), but * we don't really do anything with the LBA and length fields if the user * passes them in. Instead we'll just flush out the cache for the entire * LUN. */ int ctl_sync_cache(struct ctl_scsiio *ctsio) { struct ctl_lun *lun; struct ctl_softc *ctl_softc; uint64_t starting_lba; uint32_t block_count; int retval; CTL_DEBUG_PRINT(("ctl_sync_cache\n")); lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; ctl_softc = control_softc; retval = 0; switch (ctsio->cdb[0]) { case SYNCHRONIZE_CACHE: { struct scsi_sync_cache *cdb; cdb = (struct scsi_sync_cache *)ctsio->cdb; starting_lba = scsi_4btoul(cdb->begin_lba); block_count = scsi_2btoul(cdb->lb_count); break; } case SYNCHRONIZE_CACHE_16: { struct scsi_sync_cache_16 *cdb; cdb = (struct scsi_sync_cache_16 *)ctsio->cdb; starting_lba = scsi_8btou64(cdb->begin_lba); block_count = scsi_4btoul(cdb->lb_count); break; } default: ctl_set_invalid_opcode(ctsio); ctl_done((union ctl_io *)ctsio); goto bailout; break; /* NOTREACHED */ } /* * We check the LBA and length, but don't do anything with them. * A SYNCHRONIZE CACHE will cause the entire cache for this lun to * get flushed. This check will just help satisfy anyone who wants * to see an error for an out of range LBA. */ if ((starting_lba + block_count) > (lun->be_lun->maxlba + 1)) { ctl_set_lba_out_of_range(ctsio); ctl_done((union ctl_io *)ctsio); goto bailout; } /* * If this LUN has no backend, we can't flush the cache anyway. */ if (lun->backend == NULL) { ctl_set_invalid_opcode(ctsio); ctl_done((union ctl_io *)ctsio); goto bailout; } /* * Check to see whether we're configured to send the SYNCHRONIZE * CACHE command directly to the back end. */ mtx_lock(&lun->lun_lock); if ((ctl_softc->flags & CTL_FLAG_REAL_SYNC) && (++(lun->sync_count) >= lun->sync_interval)) { lun->sync_count = 0; mtx_unlock(&lun->lun_lock); retval = lun->backend->config_write((union ctl_io *)ctsio); } else { mtx_unlock(&lun->lun_lock); ctl_set_success(ctsio); ctl_done((union ctl_io *)ctsio); } bailout: return (retval); } int ctl_format(struct ctl_scsiio *ctsio) { struct scsi_format *cdb; struct ctl_lun *lun; struct ctl_softc *ctl_softc; int length, defect_list_len; CTL_DEBUG_PRINT(("ctl_format\n")); lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; ctl_softc = control_softc; cdb = (struct scsi_format *)ctsio->cdb; length = 0; if (cdb->byte2 & SF_FMTDATA) { if (cdb->byte2 & SF_LONGLIST) length = sizeof(struct scsi_format_header_long); else length = sizeof(struct scsi_format_header_short); } if (((ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) == 0) && (length > 0)) { ctsio->kern_data_ptr = malloc(length, M_CTL, M_WAITOK); ctsio->kern_data_len = length; ctsio->kern_total_len = length; ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } defect_list_len = 0; if (cdb->byte2 & SF_FMTDATA) { if (cdb->byte2 & SF_LONGLIST) { struct scsi_format_header_long *header; header = (struct scsi_format_header_long *) ctsio->kern_data_ptr; defect_list_len = scsi_4btoul(header->defect_list_len); if (defect_list_len != 0) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 0, /*field*/ 2, /*bit_valid*/ 0, /*bit*/ 0); goto bailout; } } else { struct scsi_format_header_short *header; header = (struct scsi_format_header_short *) ctsio->kern_data_ptr; defect_list_len = scsi_2btoul(header->defect_list_len); if (defect_list_len != 0) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 0, /*field*/ 2, /*bit_valid*/ 0, /*bit*/ 0); goto bailout; } } } /* * The format command will clear out the "Medium format corrupted" * status if set by the configuration code. That status is really * just a way to notify the host that we have lost the media, and * get them to issue a command that will basically make them think * they're blowing away the media. */ mtx_lock(&lun->lun_lock); lun->flags &= ~CTL_LUN_INOPERABLE; mtx_unlock(&lun->lun_lock); ctsio->scsi_status = SCSI_STATUS_OK; ctsio->io_hdr.status = CTL_SUCCESS; bailout: if (ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) { free(ctsio->kern_data_ptr, M_CTL); ctsio->io_hdr.flags &= ~CTL_FLAG_ALLOCATED; } ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } int ctl_read_buffer(struct ctl_scsiio *ctsio) { struct scsi_read_buffer *cdb; struct ctl_lun *lun; int buffer_offset, len; static uint8_t descr[4]; static uint8_t echo_descr[4] = { 0 }; CTL_DEBUG_PRINT(("ctl_read_buffer\n")); lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; cdb = (struct scsi_read_buffer *)ctsio->cdb; if (lun->flags & CTL_LUN_PR_RESERVED) { uint32_t residx; /* * XXX KDM need a lock here. */ residx = ctl_get_resindex(&ctsio->io_hdr.nexus); if ((lun->res_type == SPR_TYPE_EX_AC && residx != lun->pr_res_idx) || ((lun->res_type == SPR_TYPE_EX_AC_RO || lun->res_type == SPR_TYPE_EX_AC_AR) && !lun->per_res[residx].registered)) { ctl_set_reservation_conflict(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } } if ((cdb->byte2 & RWB_MODE) != RWB_MODE_DATA && (cdb->byte2 & RWB_MODE) != RWB_MODE_ECHO_DESCR && (cdb->byte2 & RWB_MODE) != RWB_MODE_DESCR) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 1, /*bit_valid*/ 1, /*bit*/ 4); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } len = scsi_3btoul(cdb->length); buffer_offset = scsi_3btoul(cdb->offset); if (buffer_offset + len > sizeof(lun->write_buffer)) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 6, /*bit_valid*/ 0, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } if ((cdb->byte2 & RWB_MODE) == RWB_MODE_DESCR) { descr[0] = 0; scsi_ulto3b(sizeof(lun->write_buffer), &descr[1]); ctsio->kern_data_ptr = descr; len = min(len, sizeof(descr)); } else if ((cdb->byte2 & RWB_MODE) == RWB_MODE_ECHO_DESCR) { ctsio->kern_data_ptr = echo_descr; len = min(len, sizeof(echo_descr)); } else ctsio->kern_data_ptr = lun->write_buffer + buffer_offset; ctsio->kern_data_len = len; ctsio->kern_total_len = len; ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } int ctl_write_buffer(struct ctl_scsiio *ctsio) { struct scsi_write_buffer *cdb; struct ctl_lun *lun; int buffer_offset, len; CTL_DEBUG_PRINT(("ctl_write_buffer\n")); lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; cdb = (struct scsi_write_buffer *)ctsio->cdb; if ((cdb->byte2 & RWB_MODE) != RWB_MODE_DATA) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 1, /*bit_valid*/ 1, /*bit*/ 4); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } len = scsi_3btoul(cdb->length); buffer_offset = scsi_3btoul(cdb->offset); if (buffer_offset + len > sizeof(lun->write_buffer)) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 6, /*bit_valid*/ 0, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * If we've got a kernel request that hasn't been malloced yet, * malloc it and tell the caller the data buffer is here. */ if ((ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) == 0) { ctsio->kern_data_ptr = lun->write_buffer + buffer_offset; ctsio->kern_data_len = len; ctsio->kern_total_len = len; ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } int ctl_write_same(struct ctl_scsiio *ctsio) { struct ctl_lun *lun; struct ctl_lba_len_flags *lbalen; uint64_t lba; uint32_t num_blocks; int len, retval; uint8_t byte2; retval = CTL_RETVAL_COMPLETE; CTL_DEBUG_PRINT(("ctl_write_same\n")); lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; switch (ctsio->cdb[0]) { case WRITE_SAME_10: { struct scsi_write_same_10 *cdb; cdb = (struct scsi_write_same_10 *)ctsio->cdb; lba = scsi_4btoul(cdb->addr); num_blocks = scsi_2btoul(cdb->length); byte2 = cdb->byte2; break; } case WRITE_SAME_16: { struct scsi_write_same_16 *cdb; cdb = (struct scsi_write_same_16 *)ctsio->cdb; lba = scsi_8btou64(cdb->addr); num_blocks = scsi_4btoul(cdb->length); byte2 = cdb->byte2; break; } default: /* * We got a command we don't support. This shouldn't * happen, commands should be filtered out above us. */ ctl_set_invalid_opcode(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); break; /* NOTREACHED */ } /* * The first check is to make sure we're in bounds, the second * check is to catch wrap-around problems. If the lba + num blocks * is less than the lba, then we've wrapped around and the block * range is invalid anyway. */ if (((lba + num_blocks) > (lun->be_lun->maxlba + 1)) || ((lba + num_blocks) < lba)) { ctl_set_lba_out_of_range(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* Zero number of blocks means "to the last logical block" */ if (num_blocks == 0) { if ((lun->be_lun->maxlba + 1) - lba > UINT32_MAX) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 0, /*command*/ 1, /*field*/ 0, /*bit_valid*/ 0, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } num_blocks = (lun->be_lun->maxlba + 1) - lba; } len = lun->be_lun->blocksize; /* * If we've got a kernel request that hasn't been malloced yet, * malloc it and tell the caller the data buffer is here. */ if ((ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) == 0) { ctsio->kern_data_ptr = malloc(len, M_CTL, M_WAITOK);; ctsio->kern_data_len = len; ctsio->kern_total_len = len; ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } lbalen = (struct ctl_lba_len_flags *)&ctsio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; lbalen->lba = lba; lbalen->len = num_blocks; lbalen->flags = byte2; retval = lun->backend->config_write((union ctl_io *)ctsio); return (retval); } int ctl_unmap(struct ctl_scsiio *ctsio) { struct ctl_lun *lun; struct scsi_unmap *cdb; struct ctl_ptr_len_flags *ptrlen; struct scsi_unmap_header *hdr; struct scsi_unmap_desc *buf, *end; uint64_t lba; uint32_t num_blocks; int len, retval; uint8_t byte2; retval = CTL_RETVAL_COMPLETE; CTL_DEBUG_PRINT(("ctl_unmap\n")); lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; cdb = (struct scsi_unmap *)ctsio->cdb; len = scsi_2btoul(cdb->length); byte2 = cdb->byte2; /* * If we've got a kernel request that hasn't been malloced yet, * malloc it and tell the caller the data buffer is here. */ if ((ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) == 0) { ctsio->kern_data_ptr = malloc(len, M_CTL, M_WAITOK);; ctsio->kern_data_len = len; ctsio->kern_total_len = len; ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } len = ctsio->kern_total_len - ctsio->kern_data_resid; hdr = (struct scsi_unmap_header *)ctsio->kern_data_ptr; if (len < sizeof (*hdr) || len < (scsi_2btoul(hdr->length) + sizeof(hdr->length)) || len < (scsi_2btoul(hdr->desc_length) + sizeof (*hdr)) || scsi_2btoul(hdr->desc_length) % sizeof(*buf) != 0) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 0, /*command*/ 0, /*field*/ 0, /*bit_valid*/ 0, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } len = scsi_2btoul(hdr->desc_length); buf = (struct scsi_unmap_desc *)(hdr + 1); end = buf + len / sizeof(*buf); ptrlen = (struct ctl_ptr_len_flags *)&ctsio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; ptrlen->ptr = (void *)buf; ptrlen->len = len; ptrlen->flags = byte2; for (; buf < end; buf++) { lba = scsi_8btou64(buf->lba); num_blocks = scsi_4btoul(buf->length); if (((lba + num_blocks) > (lun->be_lun->maxlba + 1)) || ((lba + num_blocks) < lba)) { ctl_set_lba_out_of_range(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } } retval = lun->backend->config_write((union ctl_io *)ctsio); return (retval); } /* * Note that this function currently doesn't actually do anything inside * CTL to enforce things if the DQue bit is turned on. * * Also note that this function can't be used in the default case, because * the DQue bit isn't set in the changeable mask for the control mode page * anyway. This is just here as an example for how to implement a page * handler, and a placeholder in case we want to allow the user to turn * tagged queueing on and off. * * The D_SENSE bit handling is functional, however, and will turn * descriptor sense on and off for a given LUN. */ int ctl_control_page_handler(struct ctl_scsiio *ctsio, struct ctl_page_index *page_index, uint8_t *page_ptr) { struct scsi_control_page *current_cp, *saved_cp, *user_cp; struct ctl_lun *lun; struct ctl_softc *softc; int set_ua; uint32_t initidx; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; initidx = ctl_get_initindex(&ctsio->io_hdr.nexus); set_ua = 0; user_cp = (struct scsi_control_page *)page_ptr; current_cp = (struct scsi_control_page *) (page_index->page_data + (page_index->page_len * CTL_PAGE_CURRENT)); saved_cp = (struct scsi_control_page *) (page_index->page_data + (page_index->page_len * CTL_PAGE_SAVED)); softc = control_softc; mtx_lock(&lun->lun_lock); if (((current_cp->rlec & SCP_DSENSE) == 0) && ((user_cp->rlec & SCP_DSENSE) != 0)) { /* * Descriptor sense is currently turned off and the user * wants to turn it on. */ current_cp->rlec |= SCP_DSENSE; saved_cp->rlec |= SCP_DSENSE; lun->flags |= CTL_LUN_SENSE_DESC; set_ua = 1; } else if (((current_cp->rlec & SCP_DSENSE) != 0) && ((user_cp->rlec & SCP_DSENSE) == 0)) { /* * Descriptor sense is currently turned on, and the user * wants to turn it off. */ current_cp->rlec &= ~SCP_DSENSE; saved_cp->rlec &= ~SCP_DSENSE; lun->flags &= ~CTL_LUN_SENSE_DESC; set_ua = 1; } if (current_cp->queue_flags & SCP_QUEUE_DQUE) { if (user_cp->queue_flags & SCP_QUEUE_DQUE) { #ifdef NEEDTOPORT csevent_log(CSC_CTL | CSC_SHELF_SW | CTL_UNTAG_TO_UNTAG, csevent_LogType_Trace, csevent_Severity_Information, csevent_AlertLevel_Green, csevent_FRU_Firmware, csevent_FRU_Unknown, "Received untagged to untagged transition"); #endif /* NEEDTOPORT */ } else { #ifdef NEEDTOPORT csevent_log(CSC_CTL | CSC_SHELF_SW | CTL_UNTAG_TO_TAG, csevent_LogType_ConfigChange, csevent_Severity_Information, csevent_AlertLevel_Green, csevent_FRU_Firmware, csevent_FRU_Unknown, "Received untagged to tagged " "queueing transition"); #endif /* NEEDTOPORT */ current_cp->queue_flags &= ~SCP_QUEUE_DQUE; saved_cp->queue_flags &= ~SCP_QUEUE_DQUE; set_ua = 1; } } else { if (user_cp->queue_flags & SCP_QUEUE_DQUE) { #ifdef NEEDTOPORT csevent_log(CSC_CTL | CSC_SHELF_SW | CTL_TAG_TO_UNTAG, csevent_LogType_ConfigChange, csevent_Severity_Warning, csevent_AlertLevel_Yellow, csevent_FRU_Firmware, csevent_FRU_Unknown, "Received tagged queueing to untagged " "transition"); #endif /* NEEDTOPORT */ current_cp->queue_flags |= SCP_QUEUE_DQUE; saved_cp->queue_flags |= SCP_QUEUE_DQUE; set_ua = 1; } else { #ifdef NEEDTOPORT csevent_log(CSC_CTL | CSC_SHELF_SW | CTL_TAG_TO_TAG, csevent_LogType_Trace, csevent_Severity_Information, csevent_AlertLevel_Green, csevent_FRU_Firmware, csevent_FRU_Unknown, "Received tagged queueing to tagged " "queueing transition"); #endif /* NEEDTOPORT */ } } if (set_ua != 0) { int i; /* * Let other initiators know that the mode * parameters for this LUN have changed. */ for (i = 0; i < CTL_MAX_INITIATORS; i++) { if (i == initidx) continue; lun->pending_ua[i] |= CTL_UA_MODE_CHANGE; } } mtx_unlock(&lun->lun_lock); return (0); } int ctl_caching_sp_handler(struct ctl_scsiio *ctsio, struct ctl_page_index *page_index, uint8_t *page_ptr) { struct scsi_caching_page *current_cp, *saved_cp, *user_cp; struct ctl_lun *lun; int set_ua; uint32_t initidx; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; initidx = ctl_get_initindex(&ctsio->io_hdr.nexus); set_ua = 0; user_cp = (struct scsi_caching_page *)page_ptr; current_cp = (struct scsi_caching_page *) (page_index->page_data + (page_index->page_len * CTL_PAGE_CURRENT)); saved_cp = (struct scsi_caching_page *) (page_index->page_data + (page_index->page_len * CTL_PAGE_SAVED)); mtx_lock(&lun->lun_lock); if ((current_cp->flags1 & (SCP_WCE | SCP_RCD)) != (user_cp->flags1 & (SCP_WCE | SCP_RCD))) set_ua = 1; current_cp->flags1 &= ~(SCP_WCE | SCP_RCD); current_cp->flags1 |= user_cp->flags1 & (SCP_WCE | SCP_RCD); saved_cp->flags1 &= ~(SCP_WCE | SCP_RCD); saved_cp->flags1 |= user_cp->flags1 & (SCP_WCE | SCP_RCD); if (set_ua != 0) { int i; /* * Let other initiators know that the mode * parameters for this LUN have changed. */ for (i = 0; i < CTL_MAX_INITIATORS; i++) { if (i == initidx) continue; lun->pending_ua[i] |= CTL_UA_MODE_CHANGE; } } mtx_unlock(&lun->lun_lock); return (0); } int ctl_power_sp_handler(struct ctl_scsiio *ctsio, struct ctl_page_index *page_index, uint8_t *page_ptr) { return (0); } int ctl_power_sp_sense_handler(struct ctl_scsiio *ctsio, struct ctl_page_index *page_index, int pc) { struct copan_power_subpage *page; page = (struct copan_power_subpage *)page_index->page_data + (page_index->page_len * pc); switch (pc) { case SMS_PAGE_CTRL_CHANGEABLE >> 6: /* * We don't update the changable bits for this page. */ break; case SMS_PAGE_CTRL_CURRENT >> 6: case SMS_PAGE_CTRL_DEFAULT >> 6: case SMS_PAGE_CTRL_SAVED >> 6: #ifdef NEEDTOPORT ctl_update_power_subpage(page); #endif break; default: #ifdef NEEDTOPORT EPRINT(0, "Invalid PC %d!!", pc); #endif break; } return (0); } int ctl_aps_sp_handler(struct ctl_scsiio *ctsio, struct ctl_page_index *page_index, uint8_t *page_ptr) { struct copan_aps_subpage *user_sp; struct copan_aps_subpage *current_sp; union ctl_modepage_info *modepage_info; struct ctl_softc *softc; struct ctl_lun *lun; int retval; retval = CTL_RETVAL_COMPLETE; current_sp = (struct copan_aps_subpage *)(page_index->page_data + (page_index->page_len * CTL_PAGE_CURRENT)); softc = control_softc; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; user_sp = (struct copan_aps_subpage *)page_ptr; modepage_info = (union ctl_modepage_info *) ctsio->io_hdr.ctl_private[CTL_PRIV_MODEPAGE].bytes; modepage_info->header.page_code = page_index->page_code & SMPH_PC_MASK; modepage_info->header.subpage = page_index->subpage; modepage_info->aps.lock_active = user_sp->lock_active; mtx_lock(&softc->ctl_lock); /* * If there is a request to lock the LUN and another LUN is locked * this is an error. If the requested LUN is already locked ignore * the request. If no LUN is locked attempt to lock it. * if there is a request to unlock the LUN and the LUN is currently * locked attempt to unlock it. Otherwise ignore the request. i.e. * if another LUN is locked or no LUN is locked. */ if (user_sp->lock_active & APS_LOCK_ACTIVE) { if (softc->aps_locked_lun == lun->lun) { /* * This LUN is already locked, so we're done. */ retval = CTL_RETVAL_COMPLETE; } else if (softc->aps_locked_lun == 0) { /* * No one has the lock, pass the request to the * backend. */ retval = lun->backend->config_write( (union ctl_io *)ctsio); } else { /* * Someone else has the lock, throw out the request. */ ctl_set_already_locked(ctsio); free(ctsio->kern_data_ptr, M_CTL); ctl_done((union ctl_io *)ctsio); /* * Set the return value so that ctl_do_mode_select() * won't try to complete the command. We already * completed it here. */ retval = CTL_RETVAL_ERROR; } } else if (softc->aps_locked_lun == lun->lun) { /* * This LUN is locked, so pass the unlock request to the * backend. */ retval = lun->backend->config_write((union ctl_io *)ctsio); } mtx_unlock(&softc->ctl_lock); return (retval); } int ctl_debugconf_sp_select_handler(struct ctl_scsiio *ctsio, struct ctl_page_index *page_index, uint8_t *page_ptr) { uint8_t *c; int i; c = ((struct copan_debugconf_subpage *)page_ptr)->ctl_time_io_secs; ctl_time_io_secs = (c[0] << 8) | (c[1] << 0) | 0; CTL_DEBUG_PRINT(("set ctl_time_io_secs to %d\n", ctl_time_io_secs)); printf("set ctl_time_io_secs to %d\n", ctl_time_io_secs); printf("page data:"); for (i=0; i<8; i++) printf(" %.2x",page_ptr[i]); printf("\n"); return (0); } int ctl_debugconf_sp_sense_handler(struct ctl_scsiio *ctsio, struct ctl_page_index *page_index, int pc) { struct copan_debugconf_subpage *page; page = (struct copan_debugconf_subpage *)page_index->page_data + (page_index->page_len * pc); switch (pc) { case SMS_PAGE_CTRL_CHANGEABLE >> 6: case SMS_PAGE_CTRL_DEFAULT >> 6: case SMS_PAGE_CTRL_SAVED >> 6: /* * We don't update the changable or default bits for this page. */ break; case SMS_PAGE_CTRL_CURRENT >> 6: page->ctl_time_io_secs[0] = ctl_time_io_secs >> 8; page->ctl_time_io_secs[1] = ctl_time_io_secs >> 0; break; default: #ifdef NEEDTOPORT EPRINT(0, "Invalid PC %d!!", pc); #endif /* NEEDTOPORT */ break; } return (0); } static int ctl_do_mode_select(union ctl_io *io) { struct scsi_mode_page_header *page_header; struct ctl_page_index *page_index; struct ctl_scsiio *ctsio; int control_dev, page_len; int page_len_offset, page_len_size; union ctl_modepage_info *modepage_info; struct ctl_lun *lun; int *len_left, *len_used; int retval, i; ctsio = &io->scsiio; page_index = NULL; page_len = 0; retval = CTL_RETVAL_COMPLETE; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; if (lun->be_lun->lun_type != T_DIRECT) control_dev = 1; else control_dev = 0; modepage_info = (union ctl_modepage_info *) ctsio->io_hdr.ctl_private[CTL_PRIV_MODEPAGE].bytes; len_left = &modepage_info->header.len_left; len_used = &modepage_info->header.len_used; do_next_page: page_header = (struct scsi_mode_page_header *) (ctsio->kern_data_ptr + *len_used); if (*len_left == 0) { free(ctsio->kern_data_ptr, M_CTL); ctl_set_success(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } else if (*len_left < sizeof(struct scsi_mode_page_header)) { free(ctsio->kern_data_ptr, M_CTL); ctl_set_param_len_error(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } else if ((page_header->page_code & SMPH_SPF) && (*len_left < sizeof(struct scsi_mode_page_header_sp))) { free(ctsio->kern_data_ptr, M_CTL); ctl_set_param_len_error(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * XXX KDM should we do something with the block descriptor? */ for (i = 0; i < CTL_NUM_MODE_PAGES; i++) { if ((control_dev != 0) && (lun->mode_pages.index[i].page_flags & CTL_PAGE_FLAG_DISK_ONLY)) continue; if ((lun->mode_pages.index[i].page_code & SMPH_PC_MASK) != (page_header->page_code & SMPH_PC_MASK)) continue; /* * If neither page has a subpage code, then we've got a * match. */ if (((lun->mode_pages.index[i].page_code & SMPH_SPF) == 0) && ((page_header->page_code & SMPH_SPF) == 0)) { page_index = &lun->mode_pages.index[i]; page_len = page_header->page_length; break; } /* * If both pages have subpages, then the subpage numbers * have to match. */ if ((lun->mode_pages.index[i].page_code & SMPH_SPF) && (page_header->page_code & SMPH_SPF)) { struct scsi_mode_page_header_sp *sph; sph = (struct scsi_mode_page_header_sp *)page_header; if (lun->mode_pages.index[i].subpage == sph->subpage) { page_index = &lun->mode_pages.index[i]; page_len = scsi_2btoul(sph->page_length); break; } } } /* * If we couldn't find the page, or if we don't have a mode select * handler for it, send back an error to the user. */ if ((page_index == NULL) || (page_index->select_handler == NULL)) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 0, /*field*/ *len_used, /*bit_valid*/ 0, /*bit*/ 0); free(ctsio->kern_data_ptr, M_CTL); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } if (page_index->page_code & SMPH_SPF) { page_len_offset = 2; page_len_size = 2; } else { page_len_size = 1; page_len_offset = 1; } /* * If the length the initiator gives us isn't the one we specify in * the mode page header, or if they didn't specify enough data in * the CDB to avoid truncating this page, kick out the request. */ if ((page_len != (page_index->page_len - page_len_offset - page_len_size)) || (*len_left < page_index->page_len)) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 0, /*field*/ *len_used + page_len_offset, /*bit_valid*/ 0, /*bit*/ 0); free(ctsio->kern_data_ptr, M_CTL); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * Run through the mode page, checking to make sure that the bits * the user changed are actually legal for him to change. */ for (i = 0; i < page_index->page_len; i++) { uint8_t *user_byte, *change_mask, *current_byte; int bad_bit; int j; user_byte = (uint8_t *)page_header + i; change_mask = page_index->page_data + (page_index->page_len * CTL_PAGE_CHANGEABLE) + i; current_byte = page_index->page_data + (page_index->page_len * CTL_PAGE_CURRENT) + i; /* * Check to see whether the user set any bits in this byte * that he is not allowed to set. */ if ((*user_byte & ~(*change_mask)) == (*current_byte & ~(*change_mask))) continue; /* * Go through bit by bit to determine which one is illegal. */ bad_bit = 0; for (j = 7; j >= 0; j--) { if ((((1 << i) & ~(*change_mask)) & *user_byte) != (((1 << i) & ~(*change_mask)) & *current_byte)) { bad_bit = i; break; } } ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 0, /*field*/ *len_used + i, /*bit_valid*/ 1, /*bit*/ bad_bit); free(ctsio->kern_data_ptr, M_CTL); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * Decrement these before we call the page handler, since we may * end up getting called back one way or another before the handler * returns to this context. */ *len_left -= page_index->page_len; *len_used += page_index->page_len; retval = page_index->select_handler(ctsio, page_index, (uint8_t *)page_header); /* * If the page handler returns CTL_RETVAL_QUEUED, then we need to * wait until this queued command completes to finish processing * the mode page. If it returns anything other than * CTL_RETVAL_COMPLETE (e.g. CTL_RETVAL_ERROR), then it should have * already set the sense information, freed the data pointer, and * completed the io for us. */ if (retval != CTL_RETVAL_COMPLETE) goto bailout_no_done; /* * If the initiator sent us more than one page, parse the next one. */ if (*len_left > 0) goto do_next_page; ctl_set_success(ctsio); free(ctsio->kern_data_ptr, M_CTL); ctl_done((union ctl_io *)ctsio); bailout_no_done: return (CTL_RETVAL_COMPLETE); } int ctl_mode_select(struct ctl_scsiio *ctsio) { int param_len, pf, sp; int header_size, bd_len; int len_left, len_used; struct ctl_page_index *page_index; struct ctl_lun *lun; int control_dev, page_len; union ctl_modepage_info *modepage_info; int retval; pf = 0; sp = 0; page_len = 0; len_used = 0; len_left = 0; retval = 0; bd_len = 0; page_index = NULL; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; if (lun->be_lun->lun_type != T_DIRECT) control_dev = 1; else control_dev = 0; switch (ctsio->cdb[0]) { case MODE_SELECT_6: { struct scsi_mode_select_6 *cdb; cdb = (struct scsi_mode_select_6 *)ctsio->cdb; pf = (cdb->byte2 & SMS_PF) ? 1 : 0; sp = (cdb->byte2 & SMS_SP) ? 1 : 0; param_len = cdb->length; header_size = sizeof(struct scsi_mode_header_6); break; } case MODE_SELECT_10: { struct scsi_mode_select_10 *cdb; cdb = (struct scsi_mode_select_10 *)ctsio->cdb; pf = (cdb->byte2 & SMS_PF) ? 1 : 0; sp = (cdb->byte2 & SMS_SP) ? 1 : 0; param_len = scsi_2btoul(cdb->length); header_size = sizeof(struct scsi_mode_header_10); break; } default: ctl_set_invalid_opcode(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); break; /* NOTREACHED */ } /* * From SPC-3: * "A parameter list length of zero indicates that the Data-Out Buffer * shall be empty. This condition shall not be considered as an error." */ if (param_len == 0) { ctl_set_success(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * Since we'll hit this the first time through, prior to * allocation, we don't need to free a data buffer here. */ if (param_len < header_size) { ctl_set_param_len_error(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * Allocate the data buffer and grab the user's data. In theory, * we shouldn't have to sanity check the parameter list length here * because the maximum size is 64K. We should be able to malloc * that much without too many problems. */ if ((ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) == 0) { ctsio->kern_data_ptr = malloc(param_len, M_CTL, M_WAITOK); ctsio->kern_data_len = param_len; ctsio->kern_total_len = param_len; ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } switch (ctsio->cdb[0]) { case MODE_SELECT_6: { struct scsi_mode_header_6 *mh6; mh6 = (struct scsi_mode_header_6 *)ctsio->kern_data_ptr; bd_len = mh6->blk_desc_len; break; } case MODE_SELECT_10: { struct scsi_mode_header_10 *mh10; mh10 = (struct scsi_mode_header_10 *)ctsio->kern_data_ptr; bd_len = scsi_2btoul(mh10->blk_desc_len); break; } default: panic("Invalid CDB type %#x", ctsio->cdb[0]); break; } if (param_len < (header_size + bd_len)) { free(ctsio->kern_data_ptr, M_CTL); ctl_set_param_len_error(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * Set the IO_CONT flag, so that if this I/O gets passed to * ctl_config_write_done(), it'll get passed back to * ctl_do_mode_select() for further processing, or completion if * we're all done. */ ctsio->io_hdr.flags |= CTL_FLAG_IO_CONT; ctsio->io_cont = ctl_do_mode_select; modepage_info = (union ctl_modepage_info *) ctsio->io_hdr.ctl_private[CTL_PRIV_MODEPAGE].bytes; memset(modepage_info, 0, sizeof(*modepage_info)); len_left = param_len - header_size - bd_len; len_used = header_size + bd_len; modepage_info->header.len_left = len_left; modepage_info->header.len_used = len_used; return (ctl_do_mode_select((union ctl_io *)ctsio)); } int ctl_mode_sense(struct ctl_scsiio *ctsio) { struct ctl_lun *lun; int pc, page_code, dbd, llba, subpage; int alloc_len, page_len, header_len, total_len; struct scsi_mode_block_descr *block_desc; struct ctl_page_index *page_index; int control_dev; dbd = 0; llba = 0; block_desc = NULL; page_index = NULL; CTL_DEBUG_PRINT(("ctl_mode_sense\n")); lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; if (lun->be_lun->lun_type != T_DIRECT) control_dev = 1; else control_dev = 0; if (lun->flags & CTL_LUN_PR_RESERVED) { uint32_t residx; /* * XXX KDM need a lock here. */ residx = ctl_get_resindex(&ctsio->io_hdr.nexus); if ((lun->res_type == SPR_TYPE_EX_AC && residx != lun->pr_res_idx) || ((lun->res_type == SPR_TYPE_EX_AC_RO || lun->res_type == SPR_TYPE_EX_AC_AR) && !lun->per_res[residx].registered)) { ctl_set_reservation_conflict(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } } switch (ctsio->cdb[0]) { case MODE_SENSE_6: { struct scsi_mode_sense_6 *cdb; cdb = (struct scsi_mode_sense_6 *)ctsio->cdb; header_len = sizeof(struct scsi_mode_hdr_6); if (cdb->byte2 & SMS_DBD) dbd = 1; else header_len += sizeof(struct scsi_mode_block_descr); pc = (cdb->page & SMS_PAGE_CTRL_MASK) >> 6; page_code = cdb->page & SMS_PAGE_CODE; subpage = cdb->subpage; alloc_len = cdb->length; break; } case MODE_SENSE_10: { struct scsi_mode_sense_10 *cdb; cdb = (struct scsi_mode_sense_10 *)ctsio->cdb; header_len = sizeof(struct scsi_mode_hdr_10); if (cdb->byte2 & SMS_DBD) dbd = 1; else header_len += sizeof(struct scsi_mode_block_descr); if (cdb->byte2 & SMS10_LLBAA) llba = 1; pc = (cdb->page & SMS_PAGE_CTRL_MASK) >> 6; page_code = cdb->page & SMS_PAGE_CODE; subpage = cdb->subpage; alloc_len = scsi_2btoul(cdb->length); break; } default: ctl_set_invalid_opcode(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); break; /* NOTREACHED */ } /* * We have to make a first pass through to calculate the size of * the pages that match the user's query. Then we allocate enough * memory to hold it, and actually copy the data into the buffer. */ switch (page_code) { case SMS_ALL_PAGES_PAGE: { int i; page_len = 0; /* * At the moment, values other than 0 and 0xff here are * reserved according to SPC-3. */ if ((subpage != SMS_SUBPAGE_PAGE_0) && (subpage != SMS_SUBPAGE_ALL)) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 3, /*bit_valid*/ 0, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } for (i = 0; i < CTL_NUM_MODE_PAGES; i++) { if ((control_dev != 0) && (lun->mode_pages.index[i].page_flags & CTL_PAGE_FLAG_DISK_ONLY)) continue; /* * We don't use this subpage if the user didn't * request all subpages. */ if ((lun->mode_pages.index[i].subpage != 0) && (subpage == SMS_SUBPAGE_PAGE_0)) continue; #if 0 printf("found page %#x len %d\n", lun->mode_pages.index[i].page_code & SMPH_PC_MASK, lun->mode_pages.index[i].page_len); #endif page_len += lun->mode_pages.index[i].page_len; } break; } default: { int i; page_len = 0; for (i = 0; i < CTL_NUM_MODE_PAGES; i++) { /* Look for the right page code */ if ((lun->mode_pages.index[i].page_code & SMPH_PC_MASK) != page_code) continue; /* Look for the right subpage or the subpage wildcard*/ if ((lun->mode_pages.index[i].subpage != subpage) && (subpage != SMS_SUBPAGE_ALL)) continue; /* Make sure the page is supported for this dev type */ if ((control_dev != 0) && (lun->mode_pages.index[i].page_flags & CTL_PAGE_FLAG_DISK_ONLY)) continue; #if 0 printf("found page %#x len %d\n", lun->mode_pages.index[i].page_code & SMPH_PC_MASK, lun->mode_pages.index[i].page_len); #endif page_len += lun->mode_pages.index[i].page_len; } if (page_len == 0) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 1, /*bit*/ 5); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } break; } } total_len = header_len + page_len; #if 0 printf("header_len = %d, page_len = %d, total_len = %d\n", header_len, page_len, total_len); #endif ctsio->kern_data_ptr = malloc(total_len, M_CTL, M_WAITOK | M_ZERO); ctsio->kern_sg_entries = 0; ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; if (total_len < alloc_len) { ctsio->residual = alloc_len - total_len; ctsio->kern_data_len = total_len; ctsio->kern_total_len = total_len; } else { ctsio->residual = 0; ctsio->kern_data_len = alloc_len; ctsio->kern_total_len = alloc_len; } switch (ctsio->cdb[0]) { case MODE_SENSE_6: { struct scsi_mode_hdr_6 *header; header = (struct scsi_mode_hdr_6 *)ctsio->kern_data_ptr; header->datalen = ctl_min(total_len - 1, 254); if (control_dev == 0) header->dev_specific = 0x10; /* DPOFUA */ if (dbd) header->block_descr_len = 0; else header->block_descr_len = sizeof(struct scsi_mode_block_descr); block_desc = (struct scsi_mode_block_descr *)&header[1]; break; } case MODE_SENSE_10: { struct scsi_mode_hdr_10 *header; int datalen; header = (struct scsi_mode_hdr_10 *)ctsio->kern_data_ptr; datalen = ctl_min(total_len - 2, 65533); scsi_ulto2b(datalen, header->datalen); if (control_dev == 0) header->dev_specific = 0x10; /* DPOFUA */ if (dbd) scsi_ulto2b(0, header->block_descr_len); else scsi_ulto2b(sizeof(struct scsi_mode_block_descr), header->block_descr_len); block_desc = (struct scsi_mode_block_descr *)&header[1]; break; } default: panic("invalid CDB type %#x", ctsio->cdb[0]); break; /* NOTREACHED */ } /* * If we've got a disk, use its blocksize in the block * descriptor. Otherwise, just set it to 0. */ if (dbd == 0) { if (control_dev != 0) scsi_ulto3b(lun->be_lun->blocksize, block_desc->block_len); else scsi_ulto3b(0, block_desc->block_len); } switch (page_code) { case SMS_ALL_PAGES_PAGE: { int i, data_used; data_used = header_len; for (i = 0; i < CTL_NUM_MODE_PAGES; i++) { struct ctl_page_index *page_index; page_index = &lun->mode_pages.index[i]; if ((control_dev != 0) && (page_index->page_flags & CTL_PAGE_FLAG_DISK_ONLY)) continue; /* * We don't use this subpage if the user didn't * request all subpages. We already checked (above) * to make sure the user only specified a subpage * of 0 or 0xff in the SMS_ALL_PAGES_PAGE case. */ if ((page_index->subpage != 0) && (subpage == SMS_SUBPAGE_PAGE_0)) continue; /* * Call the handler, if it exists, to update the * page to the latest values. */ if (page_index->sense_handler != NULL) page_index->sense_handler(ctsio, page_index,pc); memcpy(ctsio->kern_data_ptr + data_used, page_index->page_data + (page_index->page_len * pc), page_index->page_len); data_used += page_index->page_len; } break; } default: { int i, data_used; data_used = header_len; for (i = 0; i < CTL_NUM_MODE_PAGES; i++) { struct ctl_page_index *page_index; page_index = &lun->mode_pages.index[i]; /* Look for the right page code */ if ((page_index->page_code & SMPH_PC_MASK) != page_code) continue; /* Look for the right subpage or the subpage wildcard*/ if ((page_index->subpage != subpage) && (subpage != SMS_SUBPAGE_ALL)) continue; /* Make sure the page is supported for this dev type */ if ((control_dev != 0) && (page_index->page_flags & CTL_PAGE_FLAG_DISK_ONLY)) continue; /* * Call the handler, if it exists, to update the * page to the latest values. */ if (page_index->sense_handler != NULL) page_index->sense_handler(ctsio, page_index,pc); memcpy(ctsio->kern_data_ptr + data_used, page_index->page_data + (page_index->page_len * pc), page_index->page_len); data_used += page_index->page_len; } break; } } ctsio->scsi_status = SCSI_STATUS_OK; ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } int ctl_read_capacity(struct ctl_scsiio *ctsio) { struct scsi_read_capacity *cdb; struct scsi_read_capacity_data *data; struct ctl_lun *lun; uint32_t lba; CTL_DEBUG_PRINT(("ctl_read_capacity\n")); cdb = (struct scsi_read_capacity *)ctsio->cdb; lba = scsi_4btoul(cdb->addr); if (((cdb->pmi & SRC_PMI) == 0) && (lba != 0)) { ctl_set_invalid_field(/*ctsio*/ ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 0, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; ctsio->kern_data_ptr = malloc(sizeof(*data), M_CTL, M_WAITOK | M_ZERO); data = (struct scsi_read_capacity_data *)ctsio->kern_data_ptr; ctsio->residual = 0; ctsio->kern_data_len = sizeof(*data); ctsio->kern_total_len = sizeof(*data); ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; /* * If the maximum LBA is greater than 0xfffffffe, the user must * issue a SERVICE ACTION IN (16) command, with the read capacity * serivce action set. */ if (lun->be_lun->maxlba > 0xfffffffe) scsi_ulto4b(0xffffffff, data->addr); else scsi_ulto4b(lun->be_lun->maxlba, data->addr); /* * XXX KDM this may not be 512 bytes... */ scsi_ulto4b(lun->be_lun->blocksize, data->length); ctsio->scsi_status = SCSI_STATUS_OK; ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } int ctl_read_capacity_16(struct ctl_scsiio *ctsio) { struct scsi_read_capacity_16 *cdb; struct scsi_read_capacity_data_long *data; struct ctl_lun *lun; uint64_t lba; uint32_t alloc_len; CTL_DEBUG_PRINT(("ctl_read_capacity_16\n")); cdb = (struct scsi_read_capacity_16 *)ctsio->cdb; alloc_len = scsi_4btoul(cdb->alloc_len); lba = scsi_8btou64(cdb->addr); if ((cdb->reladr & SRC16_PMI) && (lba != 0)) { ctl_set_invalid_field(/*ctsio*/ ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 0, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; ctsio->kern_data_ptr = malloc(sizeof(*data), M_CTL, M_WAITOK | M_ZERO); data = (struct scsi_read_capacity_data_long *)ctsio->kern_data_ptr; if (sizeof(*data) < alloc_len) { ctsio->residual = alloc_len - sizeof(*data); ctsio->kern_data_len = sizeof(*data); ctsio->kern_total_len = sizeof(*data); } else { ctsio->residual = 0; ctsio->kern_data_len = alloc_len; ctsio->kern_total_len = alloc_len; } ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; scsi_u64to8b(lun->be_lun->maxlba, data->addr); /* XXX KDM this may not be 512 bytes... */ scsi_ulto4b(lun->be_lun->blocksize, data->length); data->prot_lbppbe = lun->be_lun->pblockexp & SRC16_LBPPBE; scsi_ulto2b(lun->be_lun->pblockoff & SRC16_LALBA_A, data->lalba_lbp); if (lun->be_lun->flags & CTL_LUN_FLAG_UNMAP) data->lalba_lbp[0] |= SRC16_LBPME | SRC16_LBPRZ; ctsio->scsi_status = SCSI_STATUS_OK; ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } int ctl_report_tagret_port_groups(struct ctl_scsiio *ctsio) { struct scsi_maintenance_in *cdb; int retval; int alloc_len, ext, total_len = 0, g, p, pc, pg; int num_target_port_groups, num_target_ports, single; struct ctl_lun *lun; struct ctl_softc *softc; struct ctl_port *port; struct scsi_target_group_data *rtg_ptr; struct scsi_target_group_data_extended *rtg_ext_ptr; struct scsi_target_port_group_descriptor *tpg_desc; CTL_DEBUG_PRINT(("ctl_report_tagret_port_groups\n")); cdb = (struct scsi_maintenance_in *)ctsio->cdb; softc = control_softc; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; retval = CTL_RETVAL_COMPLETE; switch (cdb->byte2 & STG_PDF_MASK) { case STG_PDF_LENGTH: ext = 0; break; case STG_PDF_EXTENDED: ext = 1; break; default: ctl_set_invalid_field(/*ctsio*/ ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 1, /*bit*/ 5); ctl_done((union ctl_io *)ctsio); return(retval); } single = ctl_is_single; if (single) num_target_port_groups = 1; else num_target_port_groups = NUM_TARGET_PORT_GROUPS; num_target_ports = 0; mtx_lock(&softc->ctl_lock); STAILQ_FOREACH(port, &softc->port_list, links) { if ((port->status & CTL_PORT_STATUS_ONLINE) == 0) continue; if (ctl_map_lun_back(port->targ_port, lun->lun) >= CTL_MAX_LUNS) continue; num_target_ports++; } mtx_unlock(&softc->ctl_lock); if (ext) total_len = sizeof(struct scsi_target_group_data_extended); else total_len = sizeof(struct scsi_target_group_data); total_len += sizeof(struct scsi_target_port_group_descriptor) * num_target_port_groups + sizeof(struct scsi_target_port_descriptor) * num_target_ports * num_target_port_groups; alloc_len = scsi_4btoul(cdb->length); ctsio->kern_data_ptr = malloc(total_len, M_CTL, M_WAITOK | M_ZERO); ctsio->kern_sg_entries = 0; if (total_len < alloc_len) { ctsio->residual = alloc_len - total_len; ctsio->kern_data_len = total_len; ctsio->kern_total_len = total_len; } else { ctsio->residual = 0; ctsio->kern_data_len = alloc_len; ctsio->kern_total_len = alloc_len; } ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; if (ext) { rtg_ext_ptr = (struct scsi_target_group_data_extended *) ctsio->kern_data_ptr; scsi_ulto4b(total_len - 4, rtg_ext_ptr->length); rtg_ext_ptr->format_type = 0x10; rtg_ext_ptr->implicit_transition_time = 0; tpg_desc = &rtg_ext_ptr->groups[0]; } else { rtg_ptr = (struct scsi_target_group_data *) ctsio->kern_data_ptr; scsi_ulto4b(total_len - 4, rtg_ptr->length); tpg_desc = &rtg_ptr->groups[0]; } pg = ctsio->io_hdr.nexus.targ_port / CTL_MAX_PORTS; mtx_lock(&softc->ctl_lock); for (g = 0; g < num_target_port_groups; g++) { if (g == pg) tpg_desc->pref_state = TPG_PRIMARY | TPG_ASYMMETRIC_ACCESS_OPTIMIZED; else tpg_desc->pref_state = TPG_ASYMMETRIC_ACCESS_NONOPTIMIZED; tpg_desc->support = TPG_AO_SUP; if (!single) tpg_desc->support |= TPG_AN_SUP; scsi_ulto2b(g + 1, tpg_desc->target_port_group); tpg_desc->status = TPG_IMPLICIT; pc = 0; STAILQ_FOREACH(port, &softc->port_list, links) { if ((port->status & CTL_PORT_STATUS_ONLINE) == 0) continue; if (ctl_map_lun_back(port->targ_port, lun->lun) >= CTL_MAX_LUNS) continue; p = port->targ_port % CTL_MAX_PORTS + g * CTL_MAX_PORTS; scsi_ulto2b(p, tpg_desc->descriptors[pc]. relative_target_port_identifier); pc++; } tpg_desc->target_port_count = pc; tpg_desc = (struct scsi_target_port_group_descriptor *) &tpg_desc->descriptors[pc]; } mtx_unlock(&softc->ctl_lock); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; CTL_DEBUG_PRINT(("buf = %x %x %x %x %x %x %x %x\n", ctsio->kern_data_ptr[0], ctsio->kern_data_ptr[1], ctsio->kern_data_ptr[2], ctsio->kern_data_ptr[3], ctsio->kern_data_ptr[4], ctsio->kern_data_ptr[5], ctsio->kern_data_ptr[6], ctsio->kern_data_ptr[7])); ctl_datamove((union ctl_io *)ctsio); return(retval); } int ctl_report_supported_opcodes(struct ctl_scsiio *ctsio) { struct ctl_lun *lun; struct scsi_report_supported_opcodes *cdb; const struct ctl_cmd_entry *entry, *sentry; struct scsi_report_supported_opcodes_all *all; struct scsi_report_supported_opcodes_descr *descr; struct scsi_report_supported_opcodes_one *one; int retval; int alloc_len, total_len; int opcode, service_action, i, j, num; CTL_DEBUG_PRINT(("ctl_report_supported_opcodes\n")); cdb = (struct scsi_report_supported_opcodes *)ctsio->cdb; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; retval = CTL_RETVAL_COMPLETE; opcode = cdb->requested_opcode; service_action = scsi_2btoul(cdb->requested_service_action); switch (cdb->options & RSO_OPTIONS_MASK) { case RSO_OPTIONS_ALL: num = 0; for (i = 0; i < 256; i++) { entry = &ctl_cmd_table[i]; if (entry->flags & CTL_CMD_FLAG_SA5) { for (j = 0; j < 32; j++) { sentry = &((const struct ctl_cmd_entry *) entry->execute)[j]; if (ctl_cmd_applicable( lun->be_lun->lun_type, sentry)) num++; } } else { if (ctl_cmd_applicable(lun->be_lun->lun_type, entry)) num++; } } total_len = sizeof(struct scsi_report_supported_opcodes_all) + num * sizeof(struct scsi_report_supported_opcodes_descr); break; case RSO_OPTIONS_OC: if (ctl_cmd_table[opcode].flags & CTL_CMD_FLAG_SA5) { ctl_set_invalid_field(/*ctsio*/ ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 1, /*bit*/ 2); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } total_len = sizeof(struct scsi_report_supported_opcodes_one) + 32; break; case RSO_OPTIONS_OC_SA: if ((ctl_cmd_table[opcode].flags & CTL_CMD_FLAG_SA5) == 0 || service_action >= 32) { ctl_set_invalid_field(/*ctsio*/ ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 1, /*bit*/ 2); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } total_len = sizeof(struct scsi_report_supported_opcodes_one) + 32; break; default: ctl_set_invalid_field(/*ctsio*/ ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 1, /*bit*/ 2); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } alloc_len = scsi_4btoul(cdb->length); ctsio->kern_data_ptr = malloc(total_len, M_CTL, M_WAITOK | M_ZERO); ctsio->kern_sg_entries = 0; if (total_len < alloc_len) { ctsio->residual = alloc_len - total_len; ctsio->kern_data_len = total_len; ctsio->kern_total_len = total_len; } else { ctsio->residual = 0; ctsio->kern_data_len = alloc_len; ctsio->kern_total_len = alloc_len; } ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; switch (cdb->options & RSO_OPTIONS_MASK) { case RSO_OPTIONS_ALL: all = (struct scsi_report_supported_opcodes_all *) ctsio->kern_data_ptr; num = 0; for (i = 0; i < 256; i++) { entry = &ctl_cmd_table[i]; if (entry->flags & CTL_CMD_FLAG_SA5) { for (j = 0; j < 32; j++) { sentry = &((const struct ctl_cmd_entry *) entry->execute)[j]; if (!ctl_cmd_applicable( lun->be_lun->lun_type, sentry)) continue; descr = &all->descr[num++]; descr->opcode = i; scsi_ulto2b(j, descr->service_action); descr->flags = RSO_SERVACTV; scsi_ulto2b(sentry->length, descr->cdb_length); } } else { if (!ctl_cmd_applicable(lun->be_lun->lun_type, entry)) continue; descr = &all->descr[num++]; descr->opcode = i; scsi_ulto2b(0, descr->service_action); descr->flags = 0; scsi_ulto2b(entry->length, descr->cdb_length); } } scsi_ulto4b( num * sizeof(struct scsi_report_supported_opcodes_descr), all->length); break; case RSO_OPTIONS_OC: one = (struct scsi_report_supported_opcodes_one *) ctsio->kern_data_ptr; entry = &ctl_cmd_table[opcode]; goto fill_one; case RSO_OPTIONS_OC_SA: one = (struct scsi_report_supported_opcodes_one *) ctsio->kern_data_ptr; entry = &ctl_cmd_table[opcode]; entry = &((const struct ctl_cmd_entry *) entry->execute)[service_action]; fill_one: if (ctl_cmd_applicable(lun->be_lun->lun_type, entry)) { one->support = 3; scsi_ulto2b(entry->length, one->cdb_length); one->cdb_usage[0] = opcode; memcpy(&one->cdb_usage[1], entry->usage, entry->length - 1); } else one->support = 1; break; } ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return(retval); } int ctl_report_supported_tmf(struct ctl_scsiio *ctsio) { struct ctl_lun *lun; struct scsi_report_supported_tmf *cdb; struct scsi_report_supported_tmf_data *data; int retval; int alloc_len, total_len; CTL_DEBUG_PRINT(("ctl_report_supported_tmf\n")); cdb = (struct scsi_report_supported_tmf *)ctsio->cdb; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; retval = CTL_RETVAL_COMPLETE; total_len = sizeof(struct scsi_report_supported_tmf_data); alloc_len = scsi_4btoul(cdb->length); ctsio->kern_data_ptr = malloc(total_len, M_CTL, M_WAITOK | M_ZERO); ctsio->kern_sg_entries = 0; if (total_len < alloc_len) { ctsio->residual = alloc_len - total_len; ctsio->kern_data_len = total_len; ctsio->kern_total_len = total_len; } else { ctsio->residual = 0; ctsio->kern_data_len = alloc_len; ctsio->kern_total_len = alloc_len; } ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; data = (struct scsi_report_supported_tmf_data *)ctsio->kern_data_ptr; data->byte1 |= RST_ATS | RST_ATSS | RST_CTSS | RST_LURS | RST_TRS; data->byte2 |= RST_ITNRS; ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (retval); } int ctl_report_timestamp(struct ctl_scsiio *ctsio) { struct ctl_lun *lun; struct scsi_report_timestamp *cdb; struct scsi_report_timestamp_data *data; struct timeval tv; int64_t timestamp; int retval; int alloc_len, total_len; CTL_DEBUG_PRINT(("ctl_report_timestamp\n")); cdb = (struct scsi_report_timestamp *)ctsio->cdb; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; retval = CTL_RETVAL_COMPLETE; total_len = sizeof(struct scsi_report_timestamp_data); alloc_len = scsi_4btoul(cdb->length); ctsio->kern_data_ptr = malloc(total_len, M_CTL, M_WAITOK | M_ZERO); ctsio->kern_sg_entries = 0; if (total_len < alloc_len) { ctsio->residual = alloc_len - total_len; ctsio->kern_data_len = total_len; ctsio->kern_total_len = total_len; } else { ctsio->residual = 0; ctsio->kern_data_len = alloc_len; ctsio->kern_total_len = alloc_len; } ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; data = (struct scsi_report_timestamp_data *)ctsio->kern_data_ptr; scsi_ulto2b(sizeof(*data) - 2, data->length); data->origin = RTS_ORIG_OUTSIDE; getmicrotime(&tv); timestamp = (int64_t)tv.tv_sec * 1000 + tv.tv_usec / 1000; scsi_ulto4b(timestamp >> 16, data->timestamp); scsi_ulto2b(timestamp & 0xffff, &data->timestamp[4]); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (retval); } int ctl_persistent_reserve_in(struct ctl_scsiio *ctsio) { struct scsi_per_res_in *cdb; int alloc_len, total_len = 0; /* struct scsi_per_res_in_rsrv in_data; */ struct ctl_lun *lun; struct ctl_softc *softc; CTL_DEBUG_PRINT(("ctl_persistent_reserve_in\n")); softc = control_softc; cdb = (struct scsi_per_res_in *)ctsio->cdb; alloc_len = scsi_2btoul(cdb->length); lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; retry: mtx_lock(&lun->lun_lock); switch (cdb->action) { case SPRI_RK: /* read keys */ total_len = sizeof(struct scsi_per_res_in_keys) + lun->pr_key_count * sizeof(struct scsi_per_res_key); break; case SPRI_RR: /* read reservation */ if (lun->flags & CTL_LUN_PR_RESERVED) total_len = sizeof(struct scsi_per_res_in_rsrv); else total_len = sizeof(struct scsi_per_res_in_header); break; case SPRI_RC: /* report capabilities */ total_len = sizeof(struct scsi_per_res_cap); break; case SPRI_RS: /* read full status */ total_len = sizeof(struct scsi_per_res_in_header) + (sizeof(struct scsi_per_res_in_full_desc) + 256) * lun->pr_key_count; break; default: panic("Invalid PR type %x", cdb->action); } mtx_unlock(&lun->lun_lock); ctsio->kern_data_ptr = malloc(total_len, M_CTL, M_WAITOK | M_ZERO); if (total_len < alloc_len) { ctsio->residual = alloc_len - total_len; ctsio->kern_data_len = total_len; ctsio->kern_total_len = total_len; } else { ctsio->residual = 0; ctsio->kern_data_len = alloc_len; ctsio->kern_total_len = alloc_len; } ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; mtx_lock(&lun->lun_lock); switch (cdb->action) { case SPRI_RK: { // read keys struct scsi_per_res_in_keys *res_keys; int i, key_count; res_keys = (struct scsi_per_res_in_keys*)ctsio->kern_data_ptr; /* * We had to drop the lock to allocate our buffer, which * leaves time for someone to come in with another * persistent reservation. (That is unlikely, though, * since this should be the only persistent reservation * command active right now.) */ if (total_len != (sizeof(struct scsi_per_res_in_keys) + (lun->pr_key_count * sizeof(struct scsi_per_res_key)))){ mtx_unlock(&lun->lun_lock); free(ctsio->kern_data_ptr, M_CTL); printf("%s: reservation length changed, retrying\n", __func__); goto retry; } scsi_ulto4b(lun->PRGeneration, res_keys->header.generation); scsi_ulto4b(sizeof(struct scsi_per_res_key) * lun->pr_key_count, res_keys->header.length); for (i = 0, key_count = 0; i < 2*CTL_MAX_INITIATORS; i++) { if (!lun->per_res[i].registered) continue; /* * We used lun->pr_key_count to calculate the * size to allocate. If it turns out the number of * initiators with the registered flag set is * larger than that (i.e. they haven't been kept in * sync), we've got a problem. */ if (key_count >= lun->pr_key_count) { #ifdef NEEDTOPORT csevent_log(CSC_CTL | CSC_SHELF_SW | CTL_PR_ERROR, csevent_LogType_Fault, csevent_AlertLevel_Yellow, csevent_FRU_ShelfController, csevent_FRU_Firmware, csevent_FRU_Unknown, "registered keys %d >= key " "count %d", key_count, lun->pr_key_count); #endif key_count++; continue; } memcpy(res_keys->keys[key_count].key, lun->per_res[i].res_key.key, ctl_min(sizeof(res_keys->keys[key_count].key), sizeof(lun->per_res[i].res_key))); key_count++; } break; } case SPRI_RR: { // read reservation struct scsi_per_res_in_rsrv *res; int tmp_len, header_only; res = (struct scsi_per_res_in_rsrv *)ctsio->kern_data_ptr; scsi_ulto4b(lun->PRGeneration, res->header.generation); if (lun->flags & CTL_LUN_PR_RESERVED) { tmp_len = sizeof(struct scsi_per_res_in_rsrv); scsi_ulto4b(sizeof(struct scsi_per_res_in_rsrv_data), res->header.length); header_only = 0; } else { tmp_len = sizeof(struct scsi_per_res_in_header); scsi_ulto4b(0, res->header.length); header_only = 1; } /* * We had to drop the lock to allocate our buffer, which * leaves time for someone to come in with another * persistent reservation. (That is unlikely, though, * since this should be the only persistent reservation * command active right now.) */ if (tmp_len != total_len) { mtx_unlock(&lun->lun_lock); free(ctsio->kern_data_ptr, M_CTL); printf("%s: reservation status changed, retrying\n", __func__); goto retry; } /* * No reservation held, so we're done. */ if (header_only != 0) break; /* * If the registration is an All Registrants type, the key * is 0, since it doesn't really matter. */ if (lun->pr_res_idx != CTL_PR_ALL_REGISTRANTS) { memcpy(res->data.reservation, &lun->per_res[lun->pr_res_idx].res_key, sizeof(struct scsi_per_res_key)); } res->data.scopetype = lun->res_type; break; } case SPRI_RC: //report capabilities { struct scsi_per_res_cap *res_cap; uint16_t type_mask; res_cap = (struct scsi_per_res_cap *)ctsio->kern_data_ptr; scsi_ulto2b(sizeof(*res_cap), res_cap->length); res_cap->flags2 |= SPRI_TMV | SPRI_ALLOW_3; type_mask = SPRI_TM_WR_EX_AR | SPRI_TM_EX_AC_RO | SPRI_TM_WR_EX_RO | SPRI_TM_EX_AC | SPRI_TM_WR_EX | SPRI_TM_EX_AC_AR; scsi_ulto2b(type_mask, res_cap->type_mask); break; } case SPRI_RS: { // read full status struct scsi_per_res_in_full *res_status; struct scsi_per_res_in_full_desc *res_desc; struct ctl_port *port; int i, len; res_status = (struct scsi_per_res_in_full*)ctsio->kern_data_ptr; /* * We had to drop the lock to allocate our buffer, which * leaves time for someone to come in with another * persistent reservation. (That is unlikely, though, * since this should be the only persistent reservation * command active right now.) */ if (total_len < (sizeof(struct scsi_per_res_in_header) + (sizeof(struct scsi_per_res_in_full_desc) + 256) * lun->pr_key_count)){ mtx_unlock(&lun->lun_lock); free(ctsio->kern_data_ptr, M_CTL); printf("%s: reservation length changed, retrying\n", __func__); goto retry; } scsi_ulto4b(lun->PRGeneration, res_status->header.generation); res_desc = &res_status->desc[0]; for (i = 0; i < 2*CTL_MAX_INITIATORS; i++) { if (!lun->per_res[i].registered) continue; memcpy(&res_desc->res_key, &lun->per_res[i].res_key.key, sizeof(res_desc->res_key)); if ((lun->flags & CTL_LUN_PR_RESERVED) && (lun->pr_res_idx == i || lun->pr_res_idx == CTL_PR_ALL_REGISTRANTS)) { res_desc->flags = SPRI_FULL_R_HOLDER; res_desc->scopetype = lun->res_type; } scsi_ulto2b(i / CTL_MAX_INIT_PER_PORT, res_desc->rel_trgt_port_id); len = 0; port = softc->ctl_ports[ ctl_port_idx(i / CTL_MAX_INIT_PER_PORT)]; if (port != NULL) len = ctl_create_iid(port, i % CTL_MAX_INIT_PER_PORT, res_desc->transport_id); scsi_ulto4b(len, res_desc->additional_length); res_desc = (struct scsi_per_res_in_full_desc *) &res_desc->transport_id[len]; } scsi_ulto4b((uint8_t *)res_desc - (uint8_t *)&res_status->desc[0], res_status->header.length); break; } default: /* * This is a bug, because we just checked for this above, * and should have returned an error. */ panic("Invalid PR type %x", cdb->action); break; /* NOTREACHED */ } mtx_unlock(&lun->lun_lock); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; CTL_DEBUG_PRINT(("buf = %x %x %x %x %x %x %x %x\n", ctsio->kern_data_ptr[0], ctsio->kern_data_ptr[1], ctsio->kern_data_ptr[2], ctsio->kern_data_ptr[3], ctsio->kern_data_ptr[4], ctsio->kern_data_ptr[5], ctsio->kern_data_ptr[6], ctsio->kern_data_ptr[7])); ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * Returns 0 if ctl_persistent_reserve_out() should continue, non-zero if * it should return. */ static int ctl_pro_preempt(struct ctl_softc *softc, struct ctl_lun *lun, uint64_t res_key, uint64_t sa_res_key, uint8_t type, uint32_t residx, struct ctl_scsiio *ctsio, struct scsi_per_res_out *cdb, struct scsi_per_res_out_parms* param) { union ctl_ha_msg persis_io; int retval, i; int isc_retval; retval = 0; mtx_lock(&lun->lun_lock); if (sa_res_key == 0) { if (lun->pr_res_idx == CTL_PR_ALL_REGISTRANTS) { /* validate scope and type */ if ((cdb->scope_type & SPR_SCOPE_MASK) != SPR_LU_SCOPE) { mtx_unlock(&lun->lun_lock); ctl_set_invalid_field(/*ctsio*/ ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 1, /*bit*/ 4); ctl_done((union ctl_io *)ctsio); return (1); } if (type>8 || type==2 || type==4 || type==0) { mtx_unlock(&lun->lun_lock); ctl_set_invalid_field(/*ctsio*/ ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 1, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (1); } /* temporarily unregister this nexus */ lun->per_res[residx].registered = 0; /* * Unregister everybody else and build UA for * them */ for(i=0; i < 2*CTL_MAX_INITIATORS; i++) { if (lun->per_res[i].registered == 0) continue; if (!persis_offset && i pending_ua[i] |= CTL_UA_REG_PREEMPT; else if (persis_offset && i >= persis_offset) lun->pending_ua[i-persis_offset] |= CTL_UA_REG_PREEMPT; lun->per_res[i].registered = 0; memset(&lun->per_res[i].res_key, 0, sizeof(struct scsi_per_res_key)); } lun->per_res[residx].registered = 1; lun->pr_key_count = 1; lun->res_type = type; if (lun->res_type != SPR_TYPE_WR_EX_AR && lun->res_type != SPR_TYPE_EX_AC_AR) lun->pr_res_idx = residx; /* send msg to other side */ persis_io.hdr.nexus = ctsio->io_hdr.nexus; persis_io.hdr.msg_type = CTL_MSG_PERS_ACTION; persis_io.pr.pr_info.action = CTL_PR_PREEMPT; persis_io.pr.pr_info.residx = lun->pr_res_idx; persis_io.pr.pr_info.res_type = type; memcpy(persis_io.pr.pr_info.sa_res_key, param->serv_act_res_key, sizeof(param->serv_act_res_key)); if ((isc_retval=ctl_ha_msg_send(CTL_HA_CHAN_CTL, &persis_io, sizeof(persis_io), 0)) > CTL_HA_STATUS_SUCCESS) { printf("CTL:Persis Out error returned " "from ctl_ha_msg_send %d\n", isc_retval); } } else { /* not all registrants */ mtx_unlock(&lun->lun_lock); free(ctsio->kern_data_ptr, M_CTL); ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 0, /*field*/ 8, /*bit_valid*/ 0, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (1); } } else if (lun->pr_res_idx == CTL_PR_ALL_REGISTRANTS || !(lun->flags & CTL_LUN_PR_RESERVED)) { int found = 0; if (res_key == sa_res_key) { /* special case */ /* * The spec implies this is not good but doesn't * say what to do. There are two choices either * generate a res conflict or check condition * with illegal field in parameter data. Since * that is what is done when the sa_res_key is * zero I'll take that approach since this has * to do with the sa_res_key. */ mtx_unlock(&lun->lun_lock); free(ctsio->kern_data_ptr, M_CTL); ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 0, /*field*/ 8, /*bit_valid*/ 0, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (1); } for (i=0; i < 2*CTL_MAX_INITIATORS; i++) { if (lun->per_res[i].registered && memcmp(param->serv_act_res_key, lun->per_res[i].res_key.key, sizeof(struct scsi_per_res_key)) != 0) continue; found = 1; lun->per_res[i].registered = 0; memset(&lun->per_res[i].res_key, 0, sizeof(struct scsi_per_res_key)); lun->pr_key_count--; if (!persis_offset && i < CTL_MAX_INITIATORS) lun->pending_ua[i] |= CTL_UA_REG_PREEMPT; else if (persis_offset && i >= persis_offset) lun->pending_ua[i-persis_offset] |= CTL_UA_REG_PREEMPT; } if (!found) { mtx_unlock(&lun->lun_lock); free(ctsio->kern_data_ptr, M_CTL); ctl_set_reservation_conflict(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* send msg to other side */ persis_io.hdr.nexus = ctsio->io_hdr.nexus; persis_io.hdr.msg_type = CTL_MSG_PERS_ACTION; persis_io.pr.pr_info.action = CTL_PR_PREEMPT; persis_io.pr.pr_info.residx = lun->pr_res_idx; persis_io.pr.pr_info.res_type = type; memcpy(persis_io.pr.pr_info.sa_res_key, param->serv_act_res_key, sizeof(param->serv_act_res_key)); if ((isc_retval=ctl_ha_msg_send(CTL_HA_CHAN_CTL, &persis_io, sizeof(persis_io), 0)) > CTL_HA_STATUS_SUCCESS) { printf("CTL:Persis Out error returned from " "ctl_ha_msg_send %d\n", isc_retval); } } else { /* Reserved but not all registrants */ /* sa_res_key is res holder */ if (memcmp(param->serv_act_res_key, lun->per_res[lun->pr_res_idx].res_key.key, sizeof(struct scsi_per_res_key)) == 0) { /* validate scope and type */ if ((cdb->scope_type & SPR_SCOPE_MASK) != SPR_LU_SCOPE) { mtx_unlock(&lun->lun_lock); ctl_set_invalid_field(/*ctsio*/ ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 1, /*bit*/ 4); ctl_done((union ctl_io *)ctsio); return (1); } if (type>8 || type==2 || type==4 || type==0) { mtx_unlock(&lun->lun_lock); ctl_set_invalid_field(/*ctsio*/ ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 1, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (1); } /* * Do the following: * if sa_res_key != res_key remove all * registrants w/sa_res_key and generate UA * for these registrants(Registrations * Preempted) if it wasn't an exclusive * reservation generate UA(Reservations * Preempted) for all other registered nexuses * if the type has changed. Establish the new * reservation and holder. If res_key and * sa_res_key are the same do the above * except don't unregister the res holder. */ /* * Temporarily unregister so it won't get * removed or UA generated */ lun->per_res[residx].registered = 0; for(i=0; i < 2*CTL_MAX_INITIATORS; i++) { if (lun->per_res[i].registered == 0) continue; if (memcmp(param->serv_act_res_key, lun->per_res[i].res_key.key, sizeof(struct scsi_per_res_key)) == 0) { lun->per_res[i].registered = 0; memset(&lun->per_res[i].res_key, 0, sizeof(struct scsi_per_res_key)); lun->pr_key_count--; if (!persis_offset && i < CTL_MAX_INITIATORS) lun->pending_ua[i] |= CTL_UA_REG_PREEMPT; else if (persis_offset && i >= persis_offset) lun->pending_ua[i-persis_offset] |= CTL_UA_REG_PREEMPT; } else if (type != lun->res_type && (lun->res_type == SPR_TYPE_WR_EX_RO || lun->res_type ==SPR_TYPE_EX_AC_RO)){ if (!persis_offset && i < CTL_MAX_INITIATORS) lun->pending_ua[i] |= CTL_UA_RES_RELEASE; else if (persis_offset && i >= persis_offset) lun->pending_ua[ i-persis_offset] |= CTL_UA_RES_RELEASE; } } lun->per_res[residx].registered = 1; lun->res_type = type; if (lun->res_type != SPR_TYPE_WR_EX_AR && lun->res_type != SPR_TYPE_EX_AC_AR) lun->pr_res_idx = residx; else lun->pr_res_idx = CTL_PR_ALL_REGISTRANTS; persis_io.hdr.nexus = ctsio->io_hdr.nexus; persis_io.hdr.msg_type = CTL_MSG_PERS_ACTION; persis_io.pr.pr_info.action = CTL_PR_PREEMPT; persis_io.pr.pr_info.residx = lun->pr_res_idx; persis_io.pr.pr_info.res_type = type; memcpy(persis_io.pr.pr_info.sa_res_key, param->serv_act_res_key, sizeof(param->serv_act_res_key)); if ((isc_retval=ctl_ha_msg_send(CTL_HA_CHAN_CTL, &persis_io, sizeof(persis_io), 0)) > CTL_HA_STATUS_SUCCESS) { printf("CTL:Persis Out error returned " "from ctl_ha_msg_send %d\n", isc_retval); } } else { /* * sa_res_key is not the res holder just * remove registrants */ int found=0; for (i=0; i < 2*CTL_MAX_INITIATORS; i++) { if (memcmp(param->serv_act_res_key, lun->per_res[i].res_key.key, sizeof(struct scsi_per_res_key)) != 0) continue; found = 1; lun->per_res[i].registered = 0; memset(&lun->per_res[i].res_key, 0, sizeof(struct scsi_per_res_key)); lun->pr_key_count--; if (!persis_offset && i < CTL_MAX_INITIATORS) lun->pending_ua[i] |= CTL_UA_REG_PREEMPT; else if (persis_offset && i >= persis_offset) lun->pending_ua[i-persis_offset] |= CTL_UA_REG_PREEMPT; } if (!found) { mtx_unlock(&lun->lun_lock); free(ctsio->kern_data_ptr, M_CTL); ctl_set_reservation_conflict(ctsio); ctl_done((union ctl_io *)ctsio); return (1); } persis_io.hdr.nexus = ctsio->io_hdr.nexus; persis_io.hdr.msg_type = CTL_MSG_PERS_ACTION; persis_io.pr.pr_info.action = CTL_PR_PREEMPT; persis_io.pr.pr_info.residx = lun->pr_res_idx; persis_io.pr.pr_info.res_type = type; memcpy(persis_io.pr.pr_info.sa_res_key, param->serv_act_res_key, sizeof(param->serv_act_res_key)); if ((isc_retval=ctl_ha_msg_send(CTL_HA_CHAN_CTL, &persis_io, sizeof(persis_io), 0)) > CTL_HA_STATUS_SUCCESS) { printf("CTL:Persis Out error returned " "from ctl_ha_msg_send %d\n", isc_retval); } } } lun->PRGeneration++; mtx_unlock(&lun->lun_lock); return (retval); } static void ctl_pro_preempt_other(struct ctl_lun *lun, union ctl_ha_msg *msg) { int i; if (lun->pr_res_idx == CTL_PR_ALL_REGISTRANTS || lun->pr_res_idx == CTL_PR_NO_RESERVATION || memcmp(&lun->per_res[lun->pr_res_idx].res_key, msg->pr.pr_info.sa_res_key, sizeof(struct scsi_per_res_key)) != 0) { uint64_t sa_res_key; sa_res_key = scsi_8btou64(msg->pr.pr_info.sa_res_key); if (sa_res_key == 0) { /* temporarily unregister this nexus */ lun->per_res[msg->pr.pr_info.residx].registered = 0; /* * Unregister everybody else and build UA for * them */ for(i=0; i < 2*CTL_MAX_INITIATORS; i++) { if (lun->per_res[i].registered == 0) continue; if (!persis_offset && i < CTL_MAX_INITIATORS) lun->pending_ua[i] |= CTL_UA_REG_PREEMPT; else if (persis_offset && i >= persis_offset) lun->pending_ua[i - persis_offset] |= CTL_UA_REG_PREEMPT; lun->per_res[i].registered = 0; memset(&lun->per_res[i].res_key, 0, sizeof(struct scsi_per_res_key)); } lun->per_res[msg->pr.pr_info.residx].registered = 1; lun->pr_key_count = 1; lun->res_type = msg->pr.pr_info.res_type; if (lun->res_type != SPR_TYPE_WR_EX_AR && lun->res_type != SPR_TYPE_EX_AC_AR) lun->pr_res_idx = msg->pr.pr_info.residx; } else { for (i=0; i < 2*CTL_MAX_INITIATORS; i++) { if (memcmp(msg->pr.pr_info.sa_res_key, lun->per_res[i].res_key.key, sizeof(struct scsi_per_res_key)) != 0) continue; lun->per_res[i].registered = 0; memset(&lun->per_res[i].res_key, 0, sizeof(struct scsi_per_res_key)); lun->pr_key_count--; if (!persis_offset && i < persis_offset) lun->pending_ua[i] |= CTL_UA_REG_PREEMPT; else if (persis_offset && i >= persis_offset) lun->pending_ua[i - persis_offset] |= CTL_UA_REG_PREEMPT; } } } else { /* * Temporarily unregister so it won't get removed * or UA generated */ lun->per_res[msg->pr.pr_info.residx].registered = 0; for (i=0; i < 2*CTL_MAX_INITIATORS; i++) { if (lun->per_res[i].registered == 0) continue; if (memcmp(msg->pr.pr_info.sa_res_key, lun->per_res[i].res_key.key, sizeof(struct scsi_per_res_key)) == 0) { lun->per_res[i].registered = 0; memset(&lun->per_res[i].res_key, 0, sizeof(struct scsi_per_res_key)); lun->pr_key_count--; if (!persis_offset && i < CTL_MAX_INITIATORS) lun->pending_ua[i] |= CTL_UA_REG_PREEMPT; else if (persis_offset && i >= persis_offset) lun->pending_ua[i - persis_offset] |= CTL_UA_REG_PREEMPT; } else if (msg->pr.pr_info.res_type != lun->res_type && (lun->res_type == SPR_TYPE_WR_EX_RO || lun->res_type == SPR_TYPE_EX_AC_RO)) { if (!persis_offset && i < persis_offset) lun->pending_ua[i] |= CTL_UA_RES_RELEASE; else if (persis_offset && i >= persis_offset) lun->pending_ua[i - persis_offset] |= CTL_UA_RES_RELEASE; } } lun->per_res[msg->pr.pr_info.residx].registered = 1; lun->res_type = msg->pr.pr_info.res_type; if (lun->res_type != SPR_TYPE_WR_EX_AR && lun->res_type != SPR_TYPE_EX_AC_AR) lun->pr_res_idx = msg->pr.pr_info.residx; else lun->pr_res_idx = CTL_PR_ALL_REGISTRANTS; } lun->PRGeneration++; } int ctl_persistent_reserve_out(struct ctl_scsiio *ctsio) { int retval; int isc_retval; u_int32_t param_len; struct scsi_per_res_out *cdb; struct ctl_lun *lun; struct scsi_per_res_out_parms* param; struct ctl_softc *softc; uint32_t residx; uint64_t res_key, sa_res_key; uint8_t type; union ctl_ha_msg persis_io; int i; CTL_DEBUG_PRINT(("ctl_persistent_reserve_out\n")); retval = CTL_RETVAL_COMPLETE; softc = control_softc; cdb = (struct scsi_per_res_out *)ctsio->cdb; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; /* * We only support whole-LUN scope. The scope & type are ignored for * register, register and ignore existing key and clear. * We sometimes ignore scope and type on preempts too!! * Verify reservation type here as well. */ type = cdb->scope_type & SPR_TYPE_MASK; if ((cdb->action == SPRO_RESERVE) || (cdb->action == SPRO_RELEASE)) { if ((cdb->scope_type & SPR_SCOPE_MASK) != SPR_LU_SCOPE) { ctl_set_invalid_field(/*ctsio*/ ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 1, /*bit*/ 4); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } if (type>8 || type==2 || type==4 || type==0) { ctl_set_invalid_field(/*ctsio*/ ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 1, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } } param_len = scsi_4btoul(cdb->length); if ((ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) == 0) { ctsio->kern_data_ptr = malloc(param_len, M_CTL, M_WAITOK); ctsio->kern_data_len = param_len; ctsio->kern_total_len = param_len; ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } param = (struct scsi_per_res_out_parms *)ctsio->kern_data_ptr; residx = ctl_get_resindex(&ctsio->io_hdr.nexus); res_key = scsi_8btou64(param->res_key.key); sa_res_key = scsi_8btou64(param->serv_act_res_key); /* * Validate the reservation key here except for SPRO_REG_IGNO * This must be done for all other service actions */ if ((cdb->action & SPRO_ACTION_MASK) != SPRO_REG_IGNO) { mtx_lock(&lun->lun_lock); if (lun->per_res[residx].registered) { if (memcmp(param->res_key.key, lun->per_res[residx].res_key.key, ctl_min(sizeof(param->res_key), sizeof(lun->per_res[residx].res_key))) != 0) { /* * The current key passed in doesn't match * the one the initiator previously * registered. */ mtx_unlock(&lun->lun_lock); free(ctsio->kern_data_ptr, M_CTL); ctl_set_reservation_conflict(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } } else if ((cdb->action & SPRO_ACTION_MASK) != SPRO_REGISTER) { /* * We are not registered */ mtx_unlock(&lun->lun_lock); free(ctsio->kern_data_ptr, M_CTL); ctl_set_reservation_conflict(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } else if (res_key != 0) { /* * We are not registered and trying to register but * the register key isn't zero. */ mtx_unlock(&lun->lun_lock); free(ctsio->kern_data_ptr, M_CTL); ctl_set_reservation_conflict(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } mtx_unlock(&lun->lun_lock); } switch (cdb->action & SPRO_ACTION_MASK) { case SPRO_REGISTER: case SPRO_REG_IGNO: { #if 0 printf("Registration received\n"); #endif /* * We don't support any of these options, as we report in * the read capabilities request (see * ctl_persistent_reserve_in(), above). */ if ((param->flags & SPR_SPEC_I_PT) || (param->flags & SPR_ALL_TG_PT) || (param->flags & SPR_APTPL)) { int bit_ptr; if (param->flags & SPR_APTPL) bit_ptr = 0; else if (param->flags & SPR_ALL_TG_PT) bit_ptr = 2; else /* SPR_SPEC_I_PT */ bit_ptr = 3; free(ctsio->kern_data_ptr, M_CTL); ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 0, /*field*/ 20, /*bit_valid*/ 1, /*bit*/ bit_ptr); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } mtx_lock(&lun->lun_lock); /* * The initiator wants to clear the * key/unregister. */ if (sa_res_key == 0) { if ((res_key == 0 && (cdb->action & SPRO_ACTION_MASK) == SPRO_REGISTER) || ((cdb->action & SPRO_ACTION_MASK) == SPRO_REG_IGNO && !lun->per_res[residx].registered)) { mtx_unlock(&lun->lun_lock); goto done; } lun->per_res[residx].registered = 0; memset(&lun->per_res[residx].res_key, 0, sizeof(lun->per_res[residx].res_key)); lun->pr_key_count--; if (residx == lun->pr_res_idx) { lun->flags &= ~CTL_LUN_PR_RESERVED; lun->pr_res_idx = CTL_PR_NO_RESERVATION; if ((lun->res_type == SPR_TYPE_WR_EX_RO || lun->res_type == SPR_TYPE_EX_AC_RO) && lun->pr_key_count) { /* * If the reservation is a registrants * only type we need to generate a UA * for other registered inits. The * sense code should be RESERVATIONS * RELEASED */ for (i = 0; i < CTL_MAX_INITIATORS;i++){ if (lun->per_res[ i+persis_offset].registered == 0) continue; lun->pending_ua[i] |= CTL_UA_RES_RELEASE; } } lun->res_type = 0; } else if (lun->pr_res_idx == CTL_PR_ALL_REGISTRANTS) { if (lun->pr_key_count==0) { lun->flags &= ~CTL_LUN_PR_RESERVED; lun->res_type = 0; lun->pr_res_idx = CTL_PR_NO_RESERVATION; } } persis_io.hdr.nexus = ctsio->io_hdr.nexus; persis_io.hdr.msg_type = CTL_MSG_PERS_ACTION; persis_io.pr.pr_info.action = CTL_PR_UNREG_KEY; persis_io.pr.pr_info.residx = residx; if ((isc_retval = ctl_ha_msg_send(CTL_HA_CHAN_CTL, &persis_io, sizeof(persis_io), 0 )) > CTL_HA_STATUS_SUCCESS) { printf("CTL:Persis Out error returned from " "ctl_ha_msg_send %d\n", isc_retval); } } else /* sa_res_key != 0 */ { /* * If we aren't registered currently then increment * the key count and set the registered flag. */ if (!lun->per_res[residx].registered) { lun->pr_key_count++; lun->per_res[residx].registered = 1; } memcpy(&lun->per_res[residx].res_key, param->serv_act_res_key, ctl_min(sizeof(param->serv_act_res_key), sizeof(lun->per_res[residx].res_key))); persis_io.hdr.nexus = ctsio->io_hdr.nexus; persis_io.hdr.msg_type = CTL_MSG_PERS_ACTION; persis_io.pr.pr_info.action = CTL_PR_REG_KEY; persis_io.pr.pr_info.residx = residx; memcpy(persis_io.pr.pr_info.sa_res_key, param->serv_act_res_key, sizeof(param->serv_act_res_key)); if ((isc_retval=ctl_ha_msg_send(CTL_HA_CHAN_CTL, &persis_io, sizeof(persis_io), 0)) > CTL_HA_STATUS_SUCCESS) { printf("CTL:Persis Out error returned from " "ctl_ha_msg_send %d\n", isc_retval); } } lun->PRGeneration++; mtx_unlock(&lun->lun_lock); break; } case SPRO_RESERVE: #if 0 printf("Reserve executed type %d\n", type); #endif mtx_lock(&lun->lun_lock); if (lun->flags & CTL_LUN_PR_RESERVED) { /* * if this isn't the reservation holder and it's * not a "all registrants" type or if the type is * different then we have a conflict */ if ((lun->pr_res_idx != residx && lun->pr_res_idx != CTL_PR_ALL_REGISTRANTS) || lun->res_type != type) { mtx_unlock(&lun->lun_lock); free(ctsio->kern_data_ptr, M_CTL); ctl_set_reservation_conflict(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } mtx_unlock(&lun->lun_lock); } else /* create a reservation */ { /* * If it's not an "all registrants" type record * reservation holder */ if (type != SPR_TYPE_WR_EX_AR && type != SPR_TYPE_EX_AC_AR) lun->pr_res_idx = residx; /* Res holder */ else lun->pr_res_idx = CTL_PR_ALL_REGISTRANTS; lun->flags |= CTL_LUN_PR_RESERVED; lun->res_type = type; mtx_unlock(&lun->lun_lock); /* send msg to other side */ persis_io.hdr.nexus = ctsio->io_hdr.nexus; persis_io.hdr.msg_type = CTL_MSG_PERS_ACTION; persis_io.pr.pr_info.action = CTL_PR_RESERVE; persis_io.pr.pr_info.residx = lun->pr_res_idx; persis_io.pr.pr_info.res_type = type; if ((isc_retval=ctl_ha_msg_send(CTL_HA_CHAN_CTL, &persis_io, sizeof(persis_io), 0)) > CTL_HA_STATUS_SUCCESS) { printf("CTL:Persis Out error returned from " "ctl_ha_msg_send %d\n", isc_retval); } } break; case SPRO_RELEASE: mtx_lock(&lun->lun_lock); if ((lun->flags & CTL_LUN_PR_RESERVED) == 0) { /* No reservation exists return good status */ mtx_unlock(&lun->lun_lock); goto done; } /* * Is this nexus a reservation holder? */ if (lun->pr_res_idx != residx && lun->pr_res_idx != CTL_PR_ALL_REGISTRANTS) { /* * not a res holder return good status but * do nothing */ mtx_unlock(&lun->lun_lock); goto done; } if (lun->res_type != type) { mtx_unlock(&lun->lun_lock); free(ctsio->kern_data_ptr, M_CTL); ctl_set_illegal_pr_release(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* okay to release */ lun->flags &= ~CTL_LUN_PR_RESERVED; lun->pr_res_idx = CTL_PR_NO_RESERVATION; lun->res_type = 0; /* * if this isn't an exclusive access * res generate UA for all other * registrants. */ if (type != SPR_TYPE_EX_AC && type != SPR_TYPE_WR_EX) { /* * temporarily unregister so we don't generate UA */ lun->per_res[residx].registered = 0; for (i = 0; i < CTL_MAX_INITIATORS; i++) { if (lun->per_res[i+persis_offset].registered == 0) continue; lun->pending_ua[i] |= CTL_UA_RES_RELEASE; } lun->per_res[residx].registered = 1; } mtx_unlock(&lun->lun_lock); /* Send msg to other side */ persis_io.hdr.nexus = ctsio->io_hdr.nexus; persis_io.hdr.msg_type = CTL_MSG_PERS_ACTION; persis_io.pr.pr_info.action = CTL_PR_RELEASE; if ((isc_retval=ctl_ha_msg_send( CTL_HA_CHAN_CTL, &persis_io, sizeof(persis_io), 0)) > CTL_HA_STATUS_SUCCESS) { printf("CTL:Persis Out error returned from " "ctl_ha_msg_send %d\n", isc_retval); } break; case SPRO_CLEAR: /* send msg to other side */ mtx_lock(&lun->lun_lock); lun->flags &= ~CTL_LUN_PR_RESERVED; lun->res_type = 0; lun->pr_key_count = 0; lun->pr_res_idx = CTL_PR_NO_RESERVATION; memset(&lun->per_res[residx].res_key, 0, sizeof(lun->per_res[residx].res_key)); lun->per_res[residx].registered = 0; for (i=0; i < 2*CTL_MAX_INITIATORS; i++) if (lun->per_res[i].registered) { if (!persis_offset && i < CTL_MAX_INITIATORS) lun->pending_ua[i] |= CTL_UA_RES_PREEMPT; else if (persis_offset && i >= persis_offset) lun->pending_ua[i-persis_offset] |= CTL_UA_RES_PREEMPT; memset(&lun->per_res[i].res_key, 0, sizeof(struct scsi_per_res_key)); lun->per_res[i].registered = 0; } lun->PRGeneration++; mtx_unlock(&lun->lun_lock); persis_io.hdr.nexus = ctsio->io_hdr.nexus; persis_io.hdr.msg_type = CTL_MSG_PERS_ACTION; persis_io.pr.pr_info.action = CTL_PR_CLEAR; if ((isc_retval=ctl_ha_msg_send(CTL_HA_CHAN_CTL, &persis_io, sizeof(persis_io), 0)) > CTL_HA_STATUS_SUCCESS) { printf("CTL:Persis Out error returned from " "ctl_ha_msg_send %d\n", isc_retval); } break; case SPRO_PREEMPT: { int nretval; nretval = ctl_pro_preempt(softc, lun, res_key, sa_res_key, type, residx, ctsio, cdb, param); if (nretval != 0) return (CTL_RETVAL_COMPLETE); break; } default: panic("Invalid PR type %x", cdb->action); } done: free(ctsio->kern_data_ptr, M_CTL); ctl_set_success(ctsio); ctl_done((union ctl_io *)ctsio); return (retval); } /* * This routine is for handling a message from the other SC pertaining to * persistent reserve out. All the error checking will have been done * so only perorming the action need be done here to keep the two * in sync. */ static void ctl_hndl_per_res_out_on_other_sc(union ctl_ha_msg *msg) { struct ctl_lun *lun; struct ctl_softc *softc; int i; uint32_t targ_lun; softc = control_softc; targ_lun = msg->hdr.nexus.targ_mapped_lun; lun = softc->ctl_luns[targ_lun]; mtx_lock(&lun->lun_lock); switch(msg->pr.pr_info.action) { case CTL_PR_REG_KEY: if (!lun->per_res[msg->pr.pr_info.residx].registered) { lun->per_res[msg->pr.pr_info.residx].registered = 1; lun->pr_key_count++; } lun->PRGeneration++; memcpy(&lun->per_res[msg->pr.pr_info.residx].res_key, msg->pr.pr_info.sa_res_key, sizeof(struct scsi_per_res_key)); break; case CTL_PR_UNREG_KEY: lun->per_res[msg->pr.pr_info.residx].registered = 0; memset(&lun->per_res[msg->pr.pr_info.residx].res_key, 0, sizeof(struct scsi_per_res_key)); lun->pr_key_count--; /* XXX Need to see if the reservation has been released */ /* if so do we need to generate UA? */ if (msg->pr.pr_info.residx == lun->pr_res_idx) { lun->flags &= ~CTL_LUN_PR_RESERVED; lun->pr_res_idx = CTL_PR_NO_RESERVATION; if ((lun->res_type == SPR_TYPE_WR_EX_RO || lun->res_type == SPR_TYPE_EX_AC_RO) && lun->pr_key_count) { /* * If the reservation is a registrants * only type we need to generate a UA * for other registered inits. The * sense code should be RESERVATIONS * RELEASED */ for (i = 0; i < CTL_MAX_INITIATORS; i++) { if (lun->per_res[i+ persis_offset].registered == 0) continue; lun->pending_ua[i] |= CTL_UA_RES_RELEASE; } } lun->res_type = 0; } else if (lun->pr_res_idx == CTL_PR_ALL_REGISTRANTS) { if (lun->pr_key_count==0) { lun->flags &= ~CTL_LUN_PR_RESERVED; lun->res_type = 0; lun->pr_res_idx = CTL_PR_NO_RESERVATION; } } lun->PRGeneration++; break; case CTL_PR_RESERVE: lun->flags |= CTL_LUN_PR_RESERVED; lun->res_type = msg->pr.pr_info.res_type; lun->pr_res_idx = msg->pr.pr_info.residx; break; case CTL_PR_RELEASE: /* * if this isn't an exclusive access res generate UA for all * other registrants. */ if (lun->res_type != SPR_TYPE_EX_AC && lun->res_type != SPR_TYPE_WR_EX) { for (i = 0; i < CTL_MAX_INITIATORS; i++) if (lun->per_res[i+persis_offset].registered) lun->pending_ua[i] |= CTL_UA_RES_RELEASE; } lun->flags &= ~CTL_LUN_PR_RESERVED; lun->pr_res_idx = CTL_PR_NO_RESERVATION; lun->res_type = 0; break; case CTL_PR_PREEMPT: ctl_pro_preempt_other(lun, msg); break; case CTL_PR_CLEAR: lun->flags &= ~CTL_LUN_PR_RESERVED; lun->res_type = 0; lun->pr_key_count = 0; lun->pr_res_idx = CTL_PR_NO_RESERVATION; for (i=0; i < 2*CTL_MAX_INITIATORS; i++) { if (lun->per_res[i].registered == 0) continue; if (!persis_offset && i < CTL_MAX_INITIATORS) lun->pending_ua[i] |= CTL_UA_RES_PREEMPT; else if (persis_offset && i >= persis_offset) lun->pending_ua[i-persis_offset] |= CTL_UA_RES_PREEMPT; memset(&lun->per_res[i].res_key, 0, sizeof(struct scsi_per_res_key)); lun->per_res[i].registered = 0; } lun->PRGeneration++; break; } mtx_unlock(&lun->lun_lock); } int ctl_read_write(struct ctl_scsiio *ctsio) { struct ctl_lun *lun; struct ctl_lba_len_flags *lbalen; uint64_t lba; uint32_t num_blocks; int flags, retval; int isread; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; CTL_DEBUG_PRINT(("ctl_read_write: command: %#x\n", ctsio->cdb[0])); flags = 0; retval = CTL_RETVAL_COMPLETE; isread = ctsio->cdb[0] == READ_6 || ctsio->cdb[0] == READ_10 || ctsio->cdb[0] == READ_12 || ctsio->cdb[0] == READ_16; if (lun->flags & CTL_LUN_PR_RESERVED && isread) { uint32_t residx; /* * XXX KDM need a lock here. */ residx = ctl_get_resindex(&ctsio->io_hdr.nexus); if ((lun->res_type == SPR_TYPE_EX_AC && residx != lun->pr_res_idx) || ((lun->res_type == SPR_TYPE_EX_AC_RO || lun->res_type == SPR_TYPE_EX_AC_AR) && !lun->per_res[residx].registered)) { ctl_set_reservation_conflict(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } } switch (ctsio->cdb[0]) { case READ_6: case WRITE_6: { struct scsi_rw_6 *cdb; cdb = (struct scsi_rw_6 *)ctsio->cdb; lba = scsi_3btoul(cdb->addr); /* only 5 bits are valid in the most significant address byte */ lba &= 0x1fffff; num_blocks = cdb->length; /* * This is correct according to SBC-2. */ if (num_blocks == 0) num_blocks = 256; break; } case READ_10: case WRITE_10: { struct scsi_rw_10 *cdb; cdb = (struct scsi_rw_10 *)ctsio->cdb; if (cdb->byte2 & SRW10_FUA) flags |= CTL_LLF_FUA; if (cdb->byte2 & SRW10_DPO) flags |= CTL_LLF_DPO; lba = scsi_4btoul(cdb->addr); num_blocks = scsi_2btoul(cdb->length); break; } case WRITE_VERIFY_10: { struct scsi_write_verify_10 *cdb; cdb = (struct scsi_write_verify_10 *)ctsio->cdb; flags |= CTL_LLF_FUA; if (cdb->byte2 & SWV_DPO) flags |= CTL_LLF_DPO; lba = scsi_4btoul(cdb->addr); num_blocks = scsi_2btoul(cdb->length); break; } case READ_12: case WRITE_12: { struct scsi_rw_12 *cdb; cdb = (struct scsi_rw_12 *)ctsio->cdb; if (cdb->byte2 & SRW12_FUA) flags |= CTL_LLF_FUA; if (cdb->byte2 & SRW12_DPO) flags |= CTL_LLF_DPO; lba = scsi_4btoul(cdb->addr); num_blocks = scsi_4btoul(cdb->length); break; } case WRITE_VERIFY_12: { struct scsi_write_verify_12 *cdb; cdb = (struct scsi_write_verify_12 *)ctsio->cdb; flags |= CTL_LLF_FUA; if (cdb->byte2 & SWV_DPO) flags |= CTL_LLF_DPO; lba = scsi_4btoul(cdb->addr); num_blocks = scsi_4btoul(cdb->length); break; } case READ_16: case WRITE_16: { struct scsi_rw_16 *cdb; cdb = (struct scsi_rw_16 *)ctsio->cdb; if (cdb->byte2 & SRW12_FUA) flags |= CTL_LLF_FUA; if (cdb->byte2 & SRW12_DPO) flags |= CTL_LLF_DPO; lba = scsi_8btou64(cdb->addr); num_blocks = scsi_4btoul(cdb->length); break; } case WRITE_VERIFY_16: { struct scsi_write_verify_16 *cdb; cdb = (struct scsi_write_verify_16 *)ctsio->cdb; flags |= CTL_LLF_FUA; if (cdb->byte2 & SWV_DPO) flags |= CTL_LLF_DPO; lba = scsi_8btou64(cdb->addr); num_blocks = scsi_4btoul(cdb->length); break; } default: /* * We got a command we don't support. This shouldn't * happen, commands should be filtered out above us. */ ctl_set_invalid_opcode(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); break; /* NOTREACHED */ } /* * The first check is to make sure we're in bounds, the second * check is to catch wrap-around problems. If the lba + num blocks * is less than the lba, then we've wrapped around and the block * range is invalid anyway. */ if (((lba + num_blocks) > (lun->be_lun->maxlba + 1)) || ((lba + num_blocks) < lba)) { ctl_set_lba_out_of_range(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * According to SBC-3, a transfer length of 0 is not an error. * Note that this cannot happen with WRITE(6) or READ(6), since 0 * translates to 256 blocks for those commands. */ if (num_blocks == 0) { ctl_set_success(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* Set FUA and/or DPO if caches are disabled. */ if (isread) { if ((lun->mode_pages.caching_page[CTL_PAGE_CURRENT].flags1 & SCP_RCD) != 0) flags |= CTL_LLF_FUA | CTL_LLF_DPO; } else { if ((lun->mode_pages.caching_page[CTL_PAGE_CURRENT].flags1 & SCP_WCE) == 0) flags |= CTL_LLF_FUA; } lbalen = (struct ctl_lba_len_flags *) &ctsio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; lbalen->lba = lba; lbalen->len = num_blocks; lbalen->flags = (isread ? CTL_LLF_READ : CTL_LLF_WRITE) | flags; ctsio->kern_total_len = num_blocks * lun->be_lun->blocksize; ctsio->kern_rel_offset = 0; CTL_DEBUG_PRINT(("ctl_read_write: calling data_submit()\n")); retval = lun->backend->data_submit((union ctl_io *)ctsio); return (retval); } static int ctl_cnw_cont(union ctl_io *io) { struct ctl_scsiio *ctsio; struct ctl_lun *lun; struct ctl_lba_len_flags *lbalen; int retval; ctsio = &io->scsiio; ctsio->io_hdr.status = CTL_STATUS_NONE; ctsio->io_hdr.flags &= ~CTL_FLAG_IO_CONT; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; lbalen = (struct ctl_lba_len_flags *) &ctsio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; lbalen->flags &= ~CTL_LLF_COMPARE; lbalen->flags |= CTL_LLF_WRITE; CTL_DEBUG_PRINT(("ctl_cnw_cont: calling data_submit()\n")); retval = lun->backend->data_submit((union ctl_io *)ctsio); return (retval); } int ctl_cnw(struct ctl_scsiio *ctsio) { struct ctl_lun *lun; struct ctl_lba_len_flags *lbalen; uint64_t lba; uint32_t num_blocks; int flags, retval; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; CTL_DEBUG_PRINT(("ctl_cnw: command: %#x\n", ctsio->cdb[0])); flags = 0; retval = CTL_RETVAL_COMPLETE; switch (ctsio->cdb[0]) { case COMPARE_AND_WRITE: { struct scsi_compare_and_write *cdb; cdb = (struct scsi_compare_and_write *)ctsio->cdb; if (cdb->byte2 & SRW10_FUA) flags |= CTL_LLF_FUA; if (cdb->byte2 & SRW10_DPO) flags |= CTL_LLF_DPO; lba = scsi_8btou64(cdb->addr); num_blocks = cdb->length; break; } default: /* * We got a command we don't support. This shouldn't * happen, commands should be filtered out above us. */ ctl_set_invalid_opcode(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); break; /* NOTREACHED */ } /* * The first check is to make sure we're in bounds, the second * check is to catch wrap-around problems. If the lba + num blocks * is less than the lba, then we've wrapped around and the block * range is invalid anyway. */ if (((lba + num_blocks) > (lun->be_lun->maxlba + 1)) || ((lba + num_blocks) < lba)) { ctl_set_lba_out_of_range(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * According to SBC-3, a transfer length of 0 is not an error. */ if (num_blocks == 0) { ctl_set_success(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* Set FUA if write cache is disabled. */ if ((lun->mode_pages.caching_page[CTL_PAGE_CURRENT].flags1 & SCP_WCE) == 0) flags |= CTL_LLF_FUA; ctsio->kern_total_len = 2 * num_blocks * lun->be_lun->blocksize; ctsio->kern_rel_offset = 0; /* * Set the IO_CONT flag, so that if this I/O gets passed to * ctl_data_submit_done(), it'll get passed back to * ctl_ctl_cnw_cont() for further processing. */ ctsio->io_hdr.flags |= CTL_FLAG_IO_CONT; ctsio->io_cont = ctl_cnw_cont; lbalen = (struct ctl_lba_len_flags *) &ctsio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; lbalen->lba = lba; lbalen->len = num_blocks; lbalen->flags = CTL_LLF_COMPARE | flags; CTL_DEBUG_PRINT(("ctl_cnw: calling data_submit()\n")); retval = lun->backend->data_submit((union ctl_io *)ctsio); return (retval); } int ctl_verify(struct ctl_scsiio *ctsio) { struct ctl_lun *lun; struct ctl_lba_len_flags *lbalen; uint64_t lba; uint32_t num_blocks; int bytchk, flags; int retval; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; CTL_DEBUG_PRINT(("ctl_verify: command: %#x\n", ctsio->cdb[0])); bytchk = 0; flags = CTL_LLF_FUA; retval = CTL_RETVAL_COMPLETE; switch (ctsio->cdb[0]) { case VERIFY_10: { struct scsi_verify_10 *cdb; cdb = (struct scsi_verify_10 *)ctsio->cdb; if (cdb->byte2 & SVFY_BYTCHK) bytchk = 1; if (cdb->byte2 & SVFY_DPO) flags |= CTL_LLF_DPO; lba = scsi_4btoul(cdb->addr); num_blocks = scsi_2btoul(cdb->length); break; } case VERIFY_12: { struct scsi_verify_12 *cdb; cdb = (struct scsi_verify_12 *)ctsio->cdb; if (cdb->byte2 & SVFY_BYTCHK) bytchk = 1; if (cdb->byte2 & SVFY_DPO) flags |= CTL_LLF_DPO; lba = scsi_4btoul(cdb->addr); num_blocks = scsi_4btoul(cdb->length); break; } case VERIFY_16: { struct scsi_rw_16 *cdb; cdb = (struct scsi_rw_16 *)ctsio->cdb; if (cdb->byte2 & SVFY_BYTCHK) bytchk = 1; if (cdb->byte2 & SVFY_DPO) flags |= CTL_LLF_DPO; lba = scsi_8btou64(cdb->addr); num_blocks = scsi_4btoul(cdb->length); break; } default: /* * We got a command we don't support. This shouldn't * happen, commands should be filtered out above us. */ ctl_set_invalid_opcode(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * The first check is to make sure we're in bounds, the second * check is to catch wrap-around problems. If the lba + num blocks * is less than the lba, then we've wrapped around and the block * range is invalid anyway. */ if (((lba + num_blocks) > (lun->be_lun->maxlba + 1)) || ((lba + num_blocks) < lba)) { ctl_set_lba_out_of_range(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * According to SBC-3, a transfer length of 0 is not an error. */ if (num_blocks == 0) { ctl_set_success(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } lbalen = (struct ctl_lba_len_flags *) &ctsio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; lbalen->lba = lba; lbalen->len = num_blocks; if (bytchk) { lbalen->flags = CTL_LLF_COMPARE | flags; ctsio->kern_total_len = num_blocks * lun->be_lun->blocksize; } else { lbalen->flags = CTL_LLF_VERIFY | flags; ctsio->kern_total_len = 0; } ctsio->kern_rel_offset = 0; CTL_DEBUG_PRINT(("ctl_verify: calling data_submit()\n")); retval = lun->backend->data_submit((union ctl_io *)ctsio); return (retval); } int ctl_report_luns(struct ctl_scsiio *ctsio) { struct scsi_report_luns *cdb; struct scsi_report_luns_data *lun_data; struct ctl_lun *lun, *request_lun; int num_luns, retval; uint32_t alloc_len, lun_datalen; int num_filled, well_known; uint32_t initidx, targ_lun_id, lun_id; retval = CTL_RETVAL_COMPLETE; well_known = 0; cdb = (struct scsi_report_luns *)ctsio->cdb; CTL_DEBUG_PRINT(("ctl_report_luns\n")); mtx_lock(&control_softc->ctl_lock); num_luns = control_softc->num_luns; mtx_unlock(&control_softc->ctl_lock); switch (cdb->select_report) { case RPL_REPORT_DEFAULT: case RPL_REPORT_ALL: break; case RPL_REPORT_WELLKNOWN: well_known = 1; num_luns = 0; break; default: ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 0, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (retval); break; /* NOTREACHED */ } alloc_len = scsi_4btoul(cdb->length); /* * The initiator has to allocate at least 16 bytes for this request, * so he can at least get the header and the first LUN. Otherwise * we reject the request (per SPC-3 rev 14, section 6.21). */ if (alloc_len < (sizeof(struct scsi_report_luns_data) + sizeof(struct scsi_report_luns_lundata))) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 6, /*bit_valid*/ 0, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (retval); } request_lun = (struct ctl_lun *) ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; lun_datalen = sizeof(*lun_data) + (num_luns * sizeof(struct scsi_report_luns_lundata)); ctsio->kern_data_ptr = malloc(lun_datalen, M_CTL, M_WAITOK | M_ZERO); lun_data = (struct scsi_report_luns_data *)ctsio->kern_data_ptr; ctsio->kern_sg_entries = 0; initidx = ctl_get_initindex(&ctsio->io_hdr.nexus); mtx_lock(&control_softc->ctl_lock); for (targ_lun_id = 0, num_filled = 0; targ_lun_id < CTL_MAX_LUNS && num_filled < num_luns; targ_lun_id++) { lun_id = ctl_map_lun(ctsio->io_hdr.nexus.targ_port, targ_lun_id); if (lun_id >= CTL_MAX_LUNS) continue; lun = control_softc->ctl_luns[lun_id]; if (lun == NULL) continue; if (targ_lun_id <= 0xff) { /* * Peripheral addressing method, bus number 0. */ lun_data->luns[num_filled].lundata[0] = RPL_LUNDATA_ATYP_PERIPH; lun_data->luns[num_filled].lundata[1] = targ_lun_id; num_filled++; } else if (targ_lun_id <= 0x3fff) { /* * Flat addressing method. */ lun_data->luns[num_filled].lundata[0] = RPL_LUNDATA_ATYP_FLAT | (targ_lun_id & RPL_LUNDATA_FLAT_LUN_MASK); #ifdef OLDCTLHEADERS (SRLD_ADDR_FLAT << SRLD_ADDR_SHIFT) | (targ_lun_id & SRLD_BUS_LUN_MASK); #endif lun_data->luns[num_filled].lundata[1] = #ifdef OLDCTLHEADERS targ_lun_id >> SRLD_BUS_LUN_BITS; #endif targ_lun_id >> RPL_LUNDATA_FLAT_LUN_BITS; num_filled++; } else { printf("ctl_report_luns: bogus LUN number %jd, " "skipping\n", (intmax_t)targ_lun_id); } /* * According to SPC-3, rev 14 section 6.21: * * "The execution of a REPORT LUNS command to any valid and * installed logical unit shall clear the REPORTED LUNS DATA * HAS CHANGED unit attention condition for all logical * units of that target with respect to the requesting * initiator. A valid and installed logical unit is one * having a PERIPHERAL QUALIFIER of 000b in the standard * INQUIRY data (see 6.4.2)." * * If request_lun is NULL, the LUN this report luns command * was issued to is either disabled or doesn't exist. In that * case, we shouldn't clear any pending lun change unit * attention. */ if (request_lun != NULL) { mtx_lock(&lun->lun_lock); lun->pending_ua[initidx] &= ~CTL_UA_LUN_CHANGE; mtx_unlock(&lun->lun_lock); } } mtx_unlock(&control_softc->ctl_lock); /* * It's quite possible that we've returned fewer LUNs than we allocated * space for. Trim it. */ lun_datalen = sizeof(*lun_data) + (num_filled * sizeof(struct scsi_report_luns_lundata)); if (lun_datalen < alloc_len) { ctsio->residual = alloc_len - lun_datalen; ctsio->kern_data_len = lun_datalen; ctsio->kern_total_len = lun_datalen; } else { ctsio->residual = 0; ctsio->kern_data_len = alloc_len; ctsio->kern_total_len = alloc_len; } ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; /* * We set this to the actual data length, regardless of how much * space we actually have to return results. If the user looks at * this value, he'll know whether or not he allocated enough space * and reissue the command if necessary. We don't support well * known logical units, so if the user asks for that, return none. */ scsi_ulto4b(lun_datalen - 8, lun_data->length); /* * We can only return SCSI_STATUS_CHECK_COND when we can't satisfy * this request. */ ctsio->scsi_status = SCSI_STATUS_OK; ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (retval); } int ctl_request_sense(struct ctl_scsiio *ctsio) { struct scsi_request_sense *cdb; struct scsi_sense_data *sense_ptr; struct ctl_lun *lun; uint32_t initidx; int have_error; scsi_sense_data_type sense_format; cdb = (struct scsi_request_sense *)ctsio->cdb; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; CTL_DEBUG_PRINT(("ctl_request_sense\n")); /* * Determine which sense format the user wants. */ if (cdb->byte2 & SRS_DESC) sense_format = SSD_TYPE_DESC; else sense_format = SSD_TYPE_FIXED; ctsio->kern_data_ptr = malloc(sizeof(*sense_ptr), M_CTL, M_WAITOK); sense_ptr = (struct scsi_sense_data *)ctsio->kern_data_ptr; ctsio->kern_sg_entries = 0; /* * struct scsi_sense_data, which is currently set to 256 bytes, is * larger than the largest allowed value for the length field in the * REQUEST SENSE CDB, which is 252 bytes as of SPC-4. */ ctsio->residual = 0; ctsio->kern_data_len = cdb->length; ctsio->kern_total_len = cdb->length; ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; /* * If we don't have a LUN, we don't have any pending sense. */ if (lun == NULL) goto no_sense; have_error = 0; initidx = ctl_get_initindex(&ctsio->io_hdr.nexus); /* * Check for pending sense, and then for pending unit attentions. * Pending sense gets returned first, then pending unit attentions. */ mtx_lock(&lun->lun_lock); #ifdef CTL_WITH_CA if (ctl_is_set(lun->have_ca, initidx)) { scsi_sense_data_type stored_format; /* * Check to see which sense format was used for the stored * sense data. */ stored_format = scsi_sense_type(&lun->pending_sense[initidx]); /* * If the user requested a different sense format than the * one we stored, then we need to convert it to the other * format. If we're going from descriptor to fixed format * sense data, we may lose things in translation, depending * on what options were used. * * If the stored format is SSD_TYPE_NONE (i.e. invalid), * for some reason we'll just copy it out as-is. */ if ((stored_format == SSD_TYPE_FIXED) && (sense_format == SSD_TYPE_DESC)) ctl_sense_to_desc((struct scsi_sense_data_fixed *) &lun->pending_sense[initidx], (struct scsi_sense_data_desc *)sense_ptr); else if ((stored_format == SSD_TYPE_DESC) && (sense_format == SSD_TYPE_FIXED)) ctl_sense_to_fixed((struct scsi_sense_data_desc *) &lun->pending_sense[initidx], (struct scsi_sense_data_fixed *)sense_ptr); else memcpy(sense_ptr, &lun->pending_sense[initidx], ctl_min(sizeof(*sense_ptr), sizeof(lun->pending_sense[initidx]))); ctl_clear_mask(lun->have_ca, initidx); have_error = 1; } else #endif if (lun->pending_ua[initidx] != CTL_UA_NONE) { ctl_ua_type ua_type; ua_type = ctl_build_ua(lun->pending_ua[initidx], sense_ptr, sense_format); if (ua_type != CTL_UA_NONE) { have_error = 1; /* We're reporting this UA, so clear it */ lun->pending_ua[initidx] &= ~ua_type; } } mtx_unlock(&lun->lun_lock); /* * We already have a pending error, return it. */ if (have_error != 0) { /* * We report the SCSI status as OK, since the status of the * request sense command itself is OK. */ ctsio->scsi_status = SCSI_STATUS_OK; /* * We report 0 for the sense length, because we aren't doing * autosense in this case. We're reporting sense as * parameter data. */ ctsio->sense_len = 0; ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } no_sense: /* * No sense information to report, so we report that everything is * okay. */ ctl_set_sense_data(sense_ptr, lun, sense_format, /*current_error*/ 1, /*sense_key*/ SSD_KEY_NO_SENSE, /*asc*/ 0x00, /*ascq*/ 0x00, SSD_ELEM_NONE); ctsio->scsi_status = SCSI_STATUS_OK; /* * We report 0 for the sense length, because we aren't doing * autosense in this case. We're reporting sense as parameter data. */ ctsio->sense_len = 0; ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } int ctl_tur(struct ctl_scsiio *ctsio) { struct ctl_lun *lun; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; CTL_DEBUG_PRINT(("ctl_tur\n")); if (lun == NULL) return (EINVAL); ctsio->scsi_status = SCSI_STATUS_OK; ctsio->io_hdr.status = CTL_SUCCESS; ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } #ifdef notyet static int ctl_cmddt_inquiry(struct ctl_scsiio *ctsio) { } #endif static int ctl_inquiry_evpd_supported(struct ctl_scsiio *ctsio, int alloc_len) { struct scsi_vpd_supported_pages *pages; int sup_page_size; struct ctl_lun *lun; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; sup_page_size = sizeof(struct scsi_vpd_supported_pages) * SCSI_EVPD_NUM_SUPPORTED_PAGES; ctsio->kern_data_ptr = malloc(sup_page_size, M_CTL, M_WAITOK | M_ZERO); pages = (struct scsi_vpd_supported_pages *)ctsio->kern_data_ptr; ctsio->kern_sg_entries = 0; if (sup_page_size < alloc_len) { ctsio->residual = alloc_len - sup_page_size; ctsio->kern_data_len = sup_page_size; ctsio->kern_total_len = sup_page_size; } else { ctsio->residual = 0; ctsio->kern_data_len = alloc_len; ctsio->kern_total_len = alloc_len; } ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; /* * The control device is always connected. The disk device, on the * other hand, may not be online all the time. Need to change this * to figure out whether the disk device is actually online or not. */ if (lun != NULL) pages->device = (SID_QUAL_LU_CONNECTED << 5) | lun->be_lun->lun_type; else pages->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT; pages->length = SCSI_EVPD_NUM_SUPPORTED_PAGES; /* Supported VPD pages */ pages->page_list[0] = SVPD_SUPPORTED_PAGES; /* Serial Number */ pages->page_list[1] = SVPD_UNIT_SERIAL_NUMBER; /* Device Identification */ pages->page_list[2] = SVPD_DEVICE_ID; + /* Extended INQUIRY Data */ + pages->page_list[3] = SVPD_EXTENDED_INQUIRY_DATA; /* Mode Page Policy */ - pages->page_list[3] = SVPD_MODE_PAGE_POLICY; + pages->page_list[4] = SVPD_MODE_PAGE_POLICY; /* SCSI Ports */ - pages->page_list[4] = SVPD_SCSI_PORTS; + pages->page_list[5] = SVPD_SCSI_PORTS; /* Third-party Copy */ - pages->page_list[5] = SVPD_SCSI_TPC; + pages->page_list[6] = SVPD_SCSI_TPC; /* Block limits */ - pages->page_list[6] = SVPD_BLOCK_LIMITS; + pages->page_list[7] = SVPD_BLOCK_LIMITS; /* Block Device Characteristics */ - pages->page_list[7] = SVPD_BDC; + pages->page_list[8] = SVPD_BDC; /* Logical Block Provisioning */ - pages->page_list[8] = SVPD_LBP; + pages->page_list[9] = SVPD_LBP; ctsio->scsi_status = SCSI_STATUS_OK; ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } static int ctl_inquiry_evpd_serial(struct ctl_scsiio *ctsio, int alloc_len) { struct scsi_vpd_unit_serial_number *sn_ptr; struct ctl_lun *lun; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; ctsio->kern_data_ptr = malloc(sizeof(*sn_ptr), M_CTL, M_WAITOK | M_ZERO); sn_ptr = (struct scsi_vpd_unit_serial_number *)ctsio->kern_data_ptr; ctsio->kern_sg_entries = 0; if (sizeof(*sn_ptr) < alloc_len) { ctsio->residual = alloc_len - sizeof(*sn_ptr); ctsio->kern_data_len = sizeof(*sn_ptr); ctsio->kern_total_len = sizeof(*sn_ptr); } else { ctsio->residual = 0; ctsio->kern_data_len = alloc_len; ctsio->kern_total_len = alloc_len; } ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; /* * The control device is always connected. The disk device, on the * other hand, may not be online all the time. Need to change this * to figure out whether the disk device is actually online or not. */ if (lun != NULL) sn_ptr->device = (SID_QUAL_LU_CONNECTED << 5) | lun->be_lun->lun_type; else sn_ptr->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT; sn_ptr->page_code = SVPD_UNIT_SERIAL_NUMBER; sn_ptr->length = ctl_min(sizeof(*sn_ptr) - 4, CTL_SN_LEN); /* * If we don't have a LUN, we just leave the serial number as * all spaces. */ memset(sn_ptr->serial_num, 0x20, sizeof(sn_ptr->serial_num)); if (lun != NULL) { strncpy((char *)sn_ptr->serial_num, (char *)lun->be_lun->serial_num, CTL_SN_LEN); } ctsio->scsi_status = SCSI_STATUS_OK; ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } static int +ctl_inquiry_evpd_eid(struct ctl_scsiio *ctsio, int alloc_len) +{ + struct scsi_vpd_extended_inquiry_data *eid_ptr; + struct ctl_lun *lun; + int data_len; + + lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; + + data_len = sizeof(struct scsi_vpd_mode_page_policy) + + sizeof(struct scsi_vpd_mode_page_policy_descr); + + ctsio->kern_data_ptr = malloc(data_len, M_CTL, M_WAITOK | M_ZERO); + eid_ptr = (struct scsi_vpd_extended_inquiry_data *)ctsio->kern_data_ptr; + ctsio->kern_sg_entries = 0; + + if (data_len < alloc_len) { + ctsio->residual = alloc_len - data_len; + ctsio->kern_data_len = data_len; + ctsio->kern_total_len = data_len; + } else { + ctsio->residual = 0; + ctsio->kern_data_len = alloc_len; + ctsio->kern_total_len = alloc_len; + } + ctsio->kern_data_resid = 0; + ctsio->kern_rel_offset = 0; + ctsio->kern_sg_entries = 0; + + /* + * The control device is always connected. The disk device, on the + * other hand, may not be online all the time. + */ + if (lun != NULL) + eid_ptr->device = (SID_QUAL_LU_CONNECTED << 5) | + lun->be_lun->lun_type; + else + eid_ptr->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT; + eid_ptr->page_code = SVPD_EXTENDED_INQUIRY_DATA; + eid_ptr->page_length = data_len - 4; + eid_ptr->flags2 = SVPD_EID_HEADSUP | SVPD_EID_ORDSUP | SVPD_EID_SIMPSUP; + eid_ptr->flags3 = SVPD_EID_V_SUP; + + ctsio->scsi_status = SCSI_STATUS_OK; + ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; + ctsio->be_move_done = ctl_config_move_done; + ctl_datamove((union ctl_io *)ctsio); + + return (CTL_RETVAL_COMPLETE); +} + +static int ctl_inquiry_evpd_mpp(struct ctl_scsiio *ctsio, int alloc_len) { struct scsi_vpd_mode_page_policy *mpp_ptr; struct ctl_lun *lun; int data_len; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; data_len = sizeof(struct scsi_vpd_mode_page_policy) + sizeof(struct scsi_vpd_mode_page_policy_descr); ctsio->kern_data_ptr = malloc(data_len, M_CTL, M_WAITOK | M_ZERO); mpp_ptr = (struct scsi_vpd_mode_page_policy *)ctsio->kern_data_ptr; ctsio->kern_sg_entries = 0; if (data_len < alloc_len) { ctsio->residual = alloc_len - data_len; ctsio->kern_data_len = data_len; ctsio->kern_total_len = data_len; } else { ctsio->residual = 0; ctsio->kern_data_len = alloc_len; ctsio->kern_total_len = alloc_len; } ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; /* * The control device is always connected. The disk device, on the * other hand, may not be online all the time. */ if (lun != NULL) mpp_ptr->device = (SID_QUAL_LU_CONNECTED << 5) | lun->be_lun->lun_type; else mpp_ptr->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT; mpp_ptr->page_code = SVPD_MODE_PAGE_POLICY; scsi_ulto2b(data_len - 4, mpp_ptr->page_length); mpp_ptr->descr[0].page_code = 0x3f; mpp_ptr->descr[0].subpage_code = 0xff; mpp_ptr->descr[0].policy = SVPD_MPP_SHARED; ctsio->scsi_status = SCSI_STATUS_OK; ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } static int ctl_inquiry_evpd_devid(struct ctl_scsiio *ctsio, int alloc_len) { struct scsi_vpd_device_id *devid_ptr; struct scsi_vpd_id_descriptor *desc; struct ctl_softc *ctl_softc; struct ctl_lun *lun; struct ctl_port *port; int data_len; uint8_t proto; ctl_softc = control_softc; port = ctl_softc->ctl_ports[ctl_port_idx(ctsio->io_hdr.nexus.targ_port)]; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; data_len = sizeof(struct scsi_vpd_device_id) + sizeof(struct scsi_vpd_id_descriptor) + sizeof(struct scsi_vpd_id_rel_trgt_port_id) + sizeof(struct scsi_vpd_id_descriptor) + sizeof(struct scsi_vpd_id_trgt_port_grp_id); if (lun && lun->lun_devid) data_len += lun->lun_devid->len; if (port->port_devid) data_len += port->port_devid->len; if (port->target_devid) data_len += port->target_devid->len; ctsio->kern_data_ptr = malloc(data_len, M_CTL, M_WAITOK | M_ZERO); devid_ptr = (struct scsi_vpd_device_id *)ctsio->kern_data_ptr; ctsio->kern_sg_entries = 0; if (data_len < alloc_len) { ctsio->residual = alloc_len - data_len; ctsio->kern_data_len = data_len; ctsio->kern_total_len = data_len; } else { ctsio->residual = 0; ctsio->kern_data_len = alloc_len; ctsio->kern_total_len = alloc_len; } ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; /* * The control device is always connected. The disk device, on the * other hand, may not be online all the time. */ if (lun != NULL) devid_ptr->device = (SID_QUAL_LU_CONNECTED << 5) | lun->be_lun->lun_type; else devid_ptr->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT; devid_ptr->page_code = SVPD_DEVICE_ID; scsi_ulto2b(data_len - 4, devid_ptr->length); if (port->port_type == CTL_PORT_FC) proto = SCSI_PROTO_FC << 4; else if (port->port_type == CTL_PORT_ISCSI) proto = SCSI_PROTO_ISCSI << 4; else proto = SCSI_PROTO_SPI << 4; desc = (struct scsi_vpd_id_descriptor *)devid_ptr->desc_list; /* * We're using a LUN association here. i.e., this device ID is a * per-LUN identifier. */ if (lun && lun->lun_devid) { memcpy(desc, lun->lun_devid->data, lun->lun_devid->len); desc = (struct scsi_vpd_id_descriptor *)((uint8_t *)desc + lun->lun_devid->len); } /* * This is for the WWPN which is a port association. */ if (port->port_devid) { memcpy(desc, port->port_devid->data, port->port_devid->len); desc = (struct scsi_vpd_id_descriptor *)((uint8_t *)desc + port->port_devid->len); } /* * This is for the Relative Target Port(type 4h) identifier */ desc->proto_codeset = proto | SVPD_ID_CODESET_BINARY; desc->id_type = SVPD_ID_PIV | SVPD_ID_ASSOC_PORT | SVPD_ID_TYPE_RELTARG; desc->length = 4; scsi_ulto2b(ctsio->io_hdr.nexus.targ_port, &desc->identifier[2]); desc = (struct scsi_vpd_id_descriptor *)(&desc->identifier[0] + sizeof(struct scsi_vpd_id_rel_trgt_port_id)); /* * This is for the Target Port Group(type 5h) identifier */ desc->proto_codeset = proto | SVPD_ID_CODESET_BINARY; desc->id_type = SVPD_ID_PIV | SVPD_ID_ASSOC_PORT | SVPD_ID_TYPE_TPORTGRP; desc->length = 4; scsi_ulto2b(ctsio->io_hdr.nexus.targ_port / CTL_MAX_PORTS + 1, &desc->identifier[2]); desc = (struct scsi_vpd_id_descriptor *)(&desc->identifier[0] + sizeof(struct scsi_vpd_id_trgt_port_grp_id)); /* * This is for the Target identifier */ if (port->target_devid) { memcpy(desc, port->target_devid->data, port->target_devid->len); } ctsio->scsi_status = SCSI_STATUS_OK; ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } static int ctl_inquiry_evpd_scsi_ports(struct ctl_scsiio *ctsio, int alloc_len) { struct ctl_softc *softc = control_softc; struct scsi_vpd_scsi_ports *sp; struct scsi_vpd_port_designation *pd; struct scsi_vpd_port_designation_cont *pdc; struct ctl_lun *lun; struct ctl_port *port; int data_len, num_target_ports, iid_len, id_len, g, pg, p; int num_target_port_groups, single; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; single = ctl_is_single; if (single) num_target_port_groups = 1; else num_target_port_groups = NUM_TARGET_PORT_GROUPS; num_target_ports = 0; iid_len = 0; id_len = 0; mtx_lock(&softc->ctl_lock); STAILQ_FOREACH(port, &softc->port_list, links) { if ((port->status & CTL_PORT_STATUS_ONLINE) == 0) continue; if (lun != NULL && ctl_map_lun_back(port->targ_port, lun->lun) >= CTL_MAX_LUNS) continue; num_target_ports++; if (port->init_devid) iid_len += port->init_devid->len; if (port->port_devid) id_len += port->port_devid->len; } mtx_unlock(&softc->ctl_lock); data_len = sizeof(struct scsi_vpd_scsi_ports) + num_target_port_groups * num_target_ports * (sizeof(struct scsi_vpd_port_designation) + sizeof(struct scsi_vpd_port_designation_cont)) + iid_len + id_len; ctsio->kern_data_ptr = malloc(data_len, M_CTL, M_WAITOK | M_ZERO); sp = (struct scsi_vpd_scsi_ports *)ctsio->kern_data_ptr; ctsio->kern_sg_entries = 0; if (data_len < alloc_len) { ctsio->residual = alloc_len - data_len; ctsio->kern_data_len = data_len; ctsio->kern_total_len = data_len; } else { ctsio->residual = 0; ctsio->kern_data_len = alloc_len; ctsio->kern_total_len = alloc_len; } ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; /* * The control device is always connected. The disk device, on the * other hand, may not be online all the time. Need to change this * to figure out whether the disk device is actually online or not. */ if (lun != NULL) sp->device = (SID_QUAL_LU_CONNECTED << 5) | lun->be_lun->lun_type; else sp->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT; sp->page_code = SVPD_SCSI_PORTS; scsi_ulto2b(data_len - sizeof(struct scsi_vpd_scsi_ports), sp->page_length); pd = &sp->design[0]; mtx_lock(&softc->ctl_lock); if (softc->flags & CTL_FLAG_MASTER_SHELF) pg = 0; else pg = 1; for (g = 0; g < num_target_port_groups; g++) { STAILQ_FOREACH(port, &softc->port_list, links) { if ((port->status & CTL_PORT_STATUS_ONLINE) == 0) continue; if (lun != NULL && ctl_map_lun_back(port->targ_port, lun->lun) >= CTL_MAX_LUNS) continue; p = port->targ_port % CTL_MAX_PORTS + g * CTL_MAX_PORTS; scsi_ulto2b(p, pd->relative_port_id); if (port->init_devid && g == pg) { iid_len = port->init_devid->len; memcpy(pd->initiator_transportid, port->init_devid->data, port->init_devid->len); } else iid_len = 0; scsi_ulto2b(iid_len, pd->initiator_transportid_length); pdc = (struct scsi_vpd_port_designation_cont *) (&pd->initiator_transportid[iid_len]); if (port->port_devid && g == pg) { id_len = port->port_devid->len; memcpy(pdc->target_port_descriptors, port->port_devid->data, port->port_devid->len); } else id_len = 0; scsi_ulto2b(id_len, pdc->target_port_descriptors_length); pd = (struct scsi_vpd_port_designation *) ((uint8_t *)pdc->target_port_descriptors + id_len); } } mtx_unlock(&softc->ctl_lock); ctsio->scsi_status = SCSI_STATUS_OK; ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } static int ctl_inquiry_evpd_block_limits(struct ctl_scsiio *ctsio, int alloc_len) { struct scsi_vpd_block_limits *bl_ptr; struct ctl_lun *lun; int bs; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; ctsio->kern_data_ptr = malloc(sizeof(*bl_ptr), M_CTL, M_WAITOK | M_ZERO); bl_ptr = (struct scsi_vpd_block_limits *)ctsio->kern_data_ptr; ctsio->kern_sg_entries = 0; if (sizeof(*bl_ptr) < alloc_len) { ctsio->residual = alloc_len - sizeof(*bl_ptr); ctsio->kern_data_len = sizeof(*bl_ptr); ctsio->kern_total_len = sizeof(*bl_ptr); } else { ctsio->residual = 0; ctsio->kern_data_len = alloc_len; ctsio->kern_total_len = alloc_len; } ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; /* * The control device is always connected. The disk device, on the * other hand, may not be online all the time. Need to change this * to figure out whether the disk device is actually online or not. */ if (lun != NULL) bl_ptr->device = (SID_QUAL_LU_CONNECTED << 5) | lun->be_lun->lun_type; else bl_ptr->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT; bl_ptr->page_code = SVPD_BLOCK_LIMITS; scsi_ulto2b(sizeof(*bl_ptr), bl_ptr->page_length); bl_ptr->max_cmp_write_len = 0xff; scsi_ulto4b(0xffffffff, bl_ptr->max_txfer_len); if (lun != NULL) { bs = lun->be_lun->blocksize; scsi_ulto4b(MAXPHYS / bs, bl_ptr->opt_txfer_len); if (lun->be_lun->flags & CTL_LUN_FLAG_UNMAP) { scsi_ulto4b(0xffffffff, bl_ptr->max_unmap_lba_cnt); scsi_ulto4b(0xffffffff, bl_ptr->max_unmap_blk_cnt); if (lun->be_lun->pblockexp != 0) { scsi_ulto4b((1 << lun->be_lun->pblockexp), bl_ptr->opt_unmap_grain); scsi_ulto4b(0x80000000 | lun->be_lun->pblockoff, bl_ptr->unmap_grain_align); } } } scsi_u64to8b(UINT64_MAX, bl_ptr->max_write_same_length); ctsio->scsi_status = SCSI_STATUS_OK; ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } static int ctl_inquiry_evpd_bdc(struct ctl_scsiio *ctsio, int alloc_len) { struct scsi_vpd_block_device_characteristics *bdc_ptr; struct ctl_lun *lun; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; ctsio->kern_data_ptr = malloc(sizeof(*bdc_ptr), M_CTL, M_WAITOK | M_ZERO); bdc_ptr = (struct scsi_vpd_block_device_characteristics *)ctsio->kern_data_ptr; ctsio->kern_sg_entries = 0; if (sizeof(*bdc_ptr) < alloc_len) { ctsio->residual = alloc_len - sizeof(*bdc_ptr); ctsio->kern_data_len = sizeof(*bdc_ptr); ctsio->kern_total_len = sizeof(*bdc_ptr); } else { ctsio->residual = 0; ctsio->kern_data_len = alloc_len; ctsio->kern_total_len = alloc_len; } ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; /* * The control device is always connected. The disk device, on the * other hand, may not be online all the time. Need to change this * to figure out whether the disk device is actually online or not. */ if (lun != NULL) bdc_ptr->device = (SID_QUAL_LU_CONNECTED << 5) | lun->be_lun->lun_type; else bdc_ptr->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT; bdc_ptr->page_code = SVPD_BDC; scsi_ulto2b(sizeof(*bdc_ptr) - 4, bdc_ptr->page_length); scsi_ulto2b(SVPD_NON_ROTATING, bdc_ptr->medium_rotation_rate); bdc_ptr->flags = SVPD_FUAB | SVPD_VBULS; ctsio->scsi_status = SCSI_STATUS_OK; ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } static int ctl_inquiry_evpd_lbp(struct ctl_scsiio *ctsio, int alloc_len) { struct scsi_vpd_logical_block_prov *lbp_ptr; struct ctl_lun *lun; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; ctsio->kern_data_ptr = malloc(sizeof(*lbp_ptr), M_CTL, M_WAITOK | M_ZERO); lbp_ptr = (struct scsi_vpd_logical_block_prov *)ctsio->kern_data_ptr; ctsio->kern_sg_entries = 0; if (sizeof(*lbp_ptr) < alloc_len) { ctsio->residual = alloc_len - sizeof(*lbp_ptr); ctsio->kern_data_len = sizeof(*lbp_ptr); ctsio->kern_total_len = sizeof(*lbp_ptr); } else { ctsio->residual = 0; ctsio->kern_data_len = alloc_len; ctsio->kern_total_len = alloc_len; } ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; /* * The control device is always connected. The disk device, on the * other hand, may not be online all the time. Need to change this * to figure out whether the disk device is actually online or not. */ if (lun != NULL) lbp_ptr->device = (SID_QUAL_LU_CONNECTED << 5) | lun->be_lun->lun_type; else lbp_ptr->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT; lbp_ptr->page_code = SVPD_LBP; scsi_ulto2b(sizeof(*lbp_ptr) - 4, lbp_ptr->page_length); if (lun != NULL && lun->be_lun->flags & CTL_LUN_FLAG_UNMAP) { lbp_ptr->flags = SVPD_LBP_UNMAP | SVPD_LBP_WS16 | SVPD_LBP_WS10 | SVPD_LBP_RZ | SVPD_LBP_ANC_SUP; lbp_ptr->prov_type = SVPD_LBP_RESOURCE; } ctsio->scsi_status = SCSI_STATUS_OK; ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } static int ctl_inquiry_evpd(struct ctl_scsiio *ctsio) { struct scsi_inquiry *cdb; struct ctl_lun *lun; int alloc_len, retval; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; cdb = (struct scsi_inquiry *)ctsio->cdb; retval = CTL_RETVAL_COMPLETE; alloc_len = scsi_2btoul(cdb->length); switch (cdb->page_code) { case SVPD_SUPPORTED_PAGES: retval = ctl_inquiry_evpd_supported(ctsio, alloc_len); break; case SVPD_UNIT_SERIAL_NUMBER: retval = ctl_inquiry_evpd_serial(ctsio, alloc_len); break; case SVPD_DEVICE_ID: retval = ctl_inquiry_evpd_devid(ctsio, alloc_len); + break; + case SVPD_EXTENDED_INQUIRY_DATA: + retval = ctl_inquiry_evpd_eid(ctsio, alloc_len); break; case SVPD_MODE_PAGE_POLICY: retval = ctl_inquiry_evpd_mpp(ctsio, alloc_len); break; case SVPD_SCSI_PORTS: retval = ctl_inquiry_evpd_scsi_ports(ctsio, alloc_len); break; case SVPD_SCSI_TPC: retval = ctl_inquiry_evpd_tpc(ctsio, alloc_len); break; case SVPD_BLOCK_LIMITS: retval = ctl_inquiry_evpd_block_limits(ctsio, alloc_len); break; case SVPD_BDC: retval = ctl_inquiry_evpd_bdc(ctsio, alloc_len); break; case SVPD_LBP: retval = ctl_inquiry_evpd_lbp(ctsio, alloc_len); break; default: ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 0, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); retval = CTL_RETVAL_COMPLETE; break; } return (retval); } static int ctl_inquiry_std(struct ctl_scsiio *ctsio) { struct scsi_inquiry_data *inq_ptr; struct scsi_inquiry *cdb; struct ctl_softc *ctl_softc; struct ctl_lun *lun; char *val; uint32_t alloc_len; ctl_port_type port_type; ctl_softc = control_softc; /* * Figure out whether we're talking to a Fibre Channel port or not. * We treat the ioctl front end, and any SCSI adapters, as packetized * SCSI front ends. */ port_type = ctl_softc->ctl_ports[ ctl_port_idx(ctsio->io_hdr.nexus.targ_port)]->port_type; if (port_type == CTL_PORT_IOCTL || port_type == CTL_PORT_INTERNAL) port_type = CTL_PORT_SCSI; lun = ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; cdb = (struct scsi_inquiry *)ctsio->cdb; alloc_len = scsi_2btoul(cdb->length); /* * We malloc the full inquiry data size here and fill it * in. If the user only asks for less, we'll give him * that much. */ ctsio->kern_data_ptr = malloc(sizeof(*inq_ptr), M_CTL, M_WAITOK | M_ZERO); inq_ptr = (struct scsi_inquiry_data *)ctsio->kern_data_ptr; ctsio->kern_sg_entries = 0; ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; if (sizeof(*inq_ptr) < alloc_len) { ctsio->residual = alloc_len - sizeof(*inq_ptr); ctsio->kern_data_len = sizeof(*inq_ptr); ctsio->kern_total_len = sizeof(*inq_ptr); } else { ctsio->residual = 0; ctsio->kern_data_len = alloc_len; ctsio->kern_total_len = alloc_len; } /* * If we have a LUN configured, report it as connected. Otherwise, * report that it is offline or no device is supported, depending * on the value of inquiry_pq_no_lun. * * According to the spec (SPC-4 r34), the peripheral qualifier * SID_QUAL_LU_OFFLINE (001b) is used in the following scenario: * * "A peripheral device having the specified peripheral device type * is not connected to this logical unit. However, the device * server is capable of supporting the specified peripheral device * type on this logical unit." * * According to the same spec, the peripheral qualifier * SID_QUAL_BAD_LU (011b) is used in this scenario: * * "The device server is not capable of supporting a peripheral * device on this logical unit. For this peripheral qualifier the * peripheral device type shall be set to 1Fh. All other peripheral * device type values are reserved for this peripheral qualifier." * * Given the text, it would seem that we probably want to report that * the LUN is offline here. There is no LUN connected, but we can * support a LUN at the given LUN number. * * In the real world, though, it sounds like things are a little * different: * * - Linux, when presented with a LUN with the offline peripheral * qualifier, will create an sg driver instance for it. So when * you attach it to CTL, you wind up with a ton of sg driver * instances. (One for every LUN that Linux bothered to probe.) * Linux does this despite the fact that it issues a REPORT LUNs * to LUN 0 to get the inventory of supported LUNs. * * - There is other anecdotal evidence (from Emulex folks) about * arrays that use the offline peripheral qualifier for LUNs that * are on the "passive" path in an active/passive array. * * So the solution is provide a hopefully reasonable default * (return bad/no LUN) and allow the user to change the behavior * with a tunable/sysctl variable. */ if (lun != NULL) inq_ptr->device = (SID_QUAL_LU_CONNECTED << 5) | lun->be_lun->lun_type; else if (ctl_softc->inquiry_pq_no_lun == 0) inq_ptr->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT; else inq_ptr->device = (SID_QUAL_BAD_LU << 5) | T_NODEVICE; /* RMB in byte 2 is 0 */ inq_ptr->version = SCSI_REV_SPC4; /* * According to SAM-3, even if a device only supports a single * level of LUN addressing, it should still set the HISUP bit: * * 4.9.1 Logical unit numbers overview * * All logical unit number formats described in this standard are * hierarchical in structure even when only a single level in that * hierarchy is used. The HISUP bit shall be set to one in the * standard INQUIRY data (see SPC-2) when any logical unit number * format described in this standard is used. Non-hierarchical * formats are outside the scope of this standard. * * Therefore we set the HiSup bit here. * * The reponse format is 2, per SPC-3. */ inq_ptr->response_format = SID_HiSup | 2; inq_ptr->additional_length = offsetof(struct scsi_inquiry_data, vendor_specific1) - (offsetof(struct scsi_inquiry_data, additional_length) + 1); CTL_DEBUG_PRINT(("additional_length = %d\n", inq_ptr->additional_length)); inq_ptr->spc3_flags = SPC3_SID_3PC; if (!ctl_is_single) inq_ptr->spc3_flags |= SPC3_SID_TPGS_IMPLICIT; /* 16 bit addressing */ if (port_type == CTL_PORT_SCSI) inq_ptr->spc2_flags = SPC2_SID_ADDR16; /* XXX set the SID_MultiP bit here if we're actually going to respond on multiple ports */ inq_ptr->spc2_flags |= SPC2_SID_MultiP; /* 16 bit data bus, synchronous transfers */ if (port_type == CTL_PORT_SCSI) inq_ptr->flags = SID_WBus16 | SID_Sync; /* * XXX KDM do we want to support tagged queueing on the control * device at all? */ if ((lun == NULL) || (lun->be_lun->lun_type != T_PROCESSOR)) inq_ptr->flags |= SID_CmdQue; /* * Per SPC-3, unused bytes in ASCII strings are filled with spaces. * We have 8 bytes for the vendor name, and 16 bytes for the device * name and 4 bytes for the revision. */ if (lun == NULL || (val = ctl_get_opt(&lun->be_lun->options, "vendor")) == NULL) { strncpy(inq_ptr->vendor, CTL_VENDOR, sizeof(inq_ptr->vendor)); } else { memset(inq_ptr->vendor, ' ', sizeof(inq_ptr->vendor)); strncpy(inq_ptr->vendor, val, min(sizeof(inq_ptr->vendor), strlen(val))); } if (lun == NULL) { strncpy(inq_ptr->product, CTL_DIRECT_PRODUCT, sizeof(inq_ptr->product)); } else if ((val = ctl_get_opt(&lun->be_lun->options, "product")) == NULL) { switch (lun->be_lun->lun_type) { case T_DIRECT: strncpy(inq_ptr->product, CTL_DIRECT_PRODUCT, sizeof(inq_ptr->product)); break; case T_PROCESSOR: strncpy(inq_ptr->product, CTL_PROCESSOR_PRODUCT, sizeof(inq_ptr->product)); break; default: strncpy(inq_ptr->product, CTL_UNKNOWN_PRODUCT, sizeof(inq_ptr->product)); break; } } else { memset(inq_ptr->product, ' ', sizeof(inq_ptr->product)); strncpy(inq_ptr->product, val, min(sizeof(inq_ptr->product), strlen(val))); } /* * XXX make this a macro somewhere so it automatically gets * incremented when we make changes. */ if (lun == NULL || (val = ctl_get_opt(&lun->be_lun->options, "revision")) == NULL) { strncpy(inq_ptr->revision, "0001", sizeof(inq_ptr->revision)); } else { memset(inq_ptr->revision, ' ', sizeof(inq_ptr->revision)); strncpy(inq_ptr->revision, val, min(sizeof(inq_ptr->revision), strlen(val))); } /* * For parallel SCSI, we support double transition and single * transition clocking. We also support QAS (Quick Arbitration * and Selection) and Information Unit transfers on both the * control and array devices. */ if (port_type == CTL_PORT_SCSI) inq_ptr->spi3data = SID_SPI_CLOCK_DT_ST | SID_SPI_QAS | SID_SPI_IUS; /* SAM-5 (no version claimed) */ scsi_ulto2b(0x00A0, inq_ptr->version1); /* SPC-4 (no version claimed) */ scsi_ulto2b(0x0460, inq_ptr->version2); if (port_type == CTL_PORT_FC) { /* FCP-2 ANSI INCITS.350:2003 */ scsi_ulto2b(0x0917, inq_ptr->version3); } else if (port_type == CTL_PORT_SCSI) { /* SPI-4 ANSI INCITS.362:200x */ scsi_ulto2b(0x0B56, inq_ptr->version3); } else if (port_type == CTL_PORT_ISCSI) { /* iSCSI (no version claimed) */ scsi_ulto2b(0x0960, inq_ptr->version3); } else if (port_type == CTL_PORT_SAS) { /* SAS (no version claimed) */ scsi_ulto2b(0x0BE0, inq_ptr->version3); } if (lun == NULL) { /* SBC-3 (no version claimed) */ scsi_ulto2b(0x04C0, inq_ptr->version4); } else { switch (lun->be_lun->lun_type) { case T_DIRECT: /* SBC-3 (no version claimed) */ scsi_ulto2b(0x04C0, inq_ptr->version4); break; case T_PROCESSOR: default: break; } } ctsio->scsi_status = SCSI_STATUS_OK; if (ctsio->kern_data_len > 0) { ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); } else { ctsio->io_hdr.status = CTL_SUCCESS; ctl_done((union ctl_io *)ctsio); } return (CTL_RETVAL_COMPLETE); } int ctl_inquiry(struct ctl_scsiio *ctsio) { struct scsi_inquiry *cdb; int retval; cdb = (struct scsi_inquiry *)ctsio->cdb; retval = 0; CTL_DEBUG_PRINT(("ctl_inquiry\n")); /* * Right now, we don't support the CmdDt inquiry information. * This would be nice to support in the future. When we do * support it, we should change this test so that it checks to make * sure SI_EVPD and SI_CMDDT aren't both set at the same time. */ #ifdef notyet if (((cdb->byte2 & SI_EVPD) && (cdb->byte2 & SI_CMDDT))) #endif if (cdb->byte2 & SI_CMDDT) { /* * Point to the SI_CMDDT bit. We might change this * when we support SI_CMDDT, but since both bits would be * "wrong", this should probably just stay as-is then. */ ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 1, /*bit_valid*/ 1, /*bit*/ 1); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } if (cdb->byte2 & SI_EVPD) retval = ctl_inquiry_evpd(ctsio); #ifdef notyet else if (cdb->byte2 & SI_CMDDT) retval = ctl_inquiry_cmddt(ctsio); #endif else retval = ctl_inquiry_std(ctsio); return (retval); } /* * For known CDB types, parse the LBA and length. */ static int ctl_get_lba_len(union ctl_io *io, uint64_t *lba, uint32_t *len) { if (io->io_hdr.io_type != CTL_IO_SCSI) return (1); switch (io->scsiio.cdb[0]) { case COMPARE_AND_WRITE: { struct scsi_compare_and_write *cdb; cdb = (struct scsi_compare_and_write *)io->scsiio.cdb; *lba = scsi_8btou64(cdb->addr); *len = cdb->length; break; } case READ_6: case WRITE_6: { struct scsi_rw_6 *cdb; cdb = (struct scsi_rw_6 *)io->scsiio.cdb; *lba = scsi_3btoul(cdb->addr); /* only 5 bits are valid in the most significant address byte */ *lba &= 0x1fffff; *len = cdb->length; break; } case READ_10: case WRITE_10: { struct scsi_rw_10 *cdb; cdb = (struct scsi_rw_10 *)io->scsiio.cdb; *lba = scsi_4btoul(cdb->addr); *len = scsi_2btoul(cdb->length); break; } case WRITE_VERIFY_10: { struct scsi_write_verify_10 *cdb; cdb = (struct scsi_write_verify_10 *)io->scsiio.cdb; *lba = scsi_4btoul(cdb->addr); *len = scsi_2btoul(cdb->length); break; } case READ_12: case WRITE_12: { struct scsi_rw_12 *cdb; cdb = (struct scsi_rw_12 *)io->scsiio.cdb; *lba = scsi_4btoul(cdb->addr); *len = scsi_4btoul(cdb->length); break; } case WRITE_VERIFY_12: { struct scsi_write_verify_12 *cdb; cdb = (struct scsi_write_verify_12 *)io->scsiio.cdb; *lba = scsi_4btoul(cdb->addr); *len = scsi_4btoul(cdb->length); break; } case READ_16: case WRITE_16: { struct scsi_rw_16 *cdb; cdb = (struct scsi_rw_16 *)io->scsiio.cdb; *lba = scsi_8btou64(cdb->addr); *len = scsi_4btoul(cdb->length); break; } case WRITE_VERIFY_16: { struct scsi_write_verify_16 *cdb; cdb = (struct scsi_write_verify_16 *)io->scsiio.cdb; *lba = scsi_8btou64(cdb->addr); *len = scsi_4btoul(cdb->length); break; } case WRITE_SAME_10: { struct scsi_write_same_10 *cdb; cdb = (struct scsi_write_same_10 *)io->scsiio.cdb; *lba = scsi_4btoul(cdb->addr); *len = scsi_2btoul(cdb->length); break; } case WRITE_SAME_16: { struct scsi_write_same_16 *cdb; cdb = (struct scsi_write_same_16 *)io->scsiio.cdb; *lba = scsi_8btou64(cdb->addr); *len = scsi_4btoul(cdb->length); break; } case VERIFY_10: { struct scsi_verify_10 *cdb; cdb = (struct scsi_verify_10 *)io->scsiio.cdb; *lba = scsi_4btoul(cdb->addr); *len = scsi_2btoul(cdb->length); break; } case VERIFY_12: { struct scsi_verify_12 *cdb; cdb = (struct scsi_verify_12 *)io->scsiio.cdb; *lba = scsi_4btoul(cdb->addr); *len = scsi_4btoul(cdb->length); break; } case VERIFY_16: { struct scsi_verify_16 *cdb; cdb = (struct scsi_verify_16 *)io->scsiio.cdb; *lba = scsi_8btou64(cdb->addr); *len = scsi_4btoul(cdb->length); break; } default: return (1); break; /* NOTREACHED */ } return (0); } static ctl_action ctl_extent_check_lba(uint64_t lba1, uint32_t len1, uint64_t lba2, uint32_t len2) { uint64_t endlba1, endlba2; endlba1 = lba1 + len1 - 1; endlba2 = lba2 + len2 - 1; if ((endlba1 < lba2) || (endlba2 < lba1)) return (CTL_ACTION_PASS); else return (CTL_ACTION_BLOCK); } static ctl_action ctl_extent_check(union ctl_io *io1, union ctl_io *io2) { uint64_t lba1, lba2; uint32_t len1, len2; int retval; retval = ctl_get_lba_len(io1, &lba1, &len1); if (retval != 0) return (CTL_ACTION_ERROR); retval = ctl_get_lba_len(io2, &lba2, &len2); if (retval != 0) return (CTL_ACTION_ERROR); return (ctl_extent_check_lba(lba1, len1, lba2, len2)); } static ctl_action ctl_check_for_blockage(union ctl_io *pending_io, union ctl_io *ooa_io) { const struct ctl_cmd_entry *pending_entry, *ooa_entry; ctl_serialize_action *serialize_row; /* * The initiator attempted multiple untagged commands at the same * time. Can't do that. */ if ((pending_io->scsiio.tag_type == CTL_TAG_UNTAGGED) && (ooa_io->scsiio.tag_type == CTL_TAG_UNTAGGED) && ((pending_io->io_hdr.nexus.targ_port == ooa_io->io_hdr.nexus.targ_port) && (pending_io->io_hdr.nexus.initid.id == ooa_io->io_hdr.nexus.initid.id)) && ((ooa_io->io_hdr.flags & CTL_FLAG_ABORT) == 0)) return (CTL_ACTION_OVERLAP); /* * The initiator attempted to send multiple tagged commands with * the same ID. (It's fine if different initiators have the same * tag ID.) * * Even if all of those conditions are true, we don't kill the I/O * if the command ahead of us has been aborted. We won't end up * sending it to the FETD, and it's perfectly legal to resend a * command with the same tag number as long as the previous * instance of this tag number has been aborted somehow. */ if ((pending_io->scsiio.tag_type != CTL_TAG_UNTAGGED) && (ooa_io->scsiio.tag_type != CTL_TAG_UNTAGGED) && (pending_io->scsiio.tag_num == ooa_io->scsiio.tag_num) && ((pending_io->io_hdr.nexus.targ_port == ooa_io->io_hdr.nexus.targ_port) && (pending_io->io_hdr.nexus.initid.id == ooa_io->io_hdr.nexus.initid.id)) && ((ooa_io->io_hdr.flags & CTL_FLAG_ABORT) == 0)) return (CTL_ACTION_OVERLAP_TAG); /* * If we get a head of queue tag, SAM-3 says that we should * immediately execute it. * * What happens if this command would normally block for some other * reason? e.g. a request sense with a head of queue tag * immediately after a write. Normally that would block, but this * will result in its getting executed immediately... * * We currently return "pass" instead of "skip", so we'll end up * going through the rest of the queue to check for overlapped tags. * * XXX KDM check for other types of blockage first?? */ if (pending_io->scsiio.tag_type == CTL_TAG_HEAD_OF_QUEUE) return (CTL_ACTION_PASS); /* * Ordered tags have to block until all items ahead of them * have completed. If we get called with an ordered tag, we always * block, if something else is ahead of us in the queue. */ if (pending_io->scsiio.tag_type == CTL_TAG_ORDERED) return (CTL_ACTION_BLOCK); /* * Simple tags get blocked until all head of queue and ordered tags * ahead of them have completed. I'm lumping untagged commands in * with simple tags here. XXX KDM is that the right thing to do? */ if (((pending_io->scsiio.tag_type == CTL_TAG_UNTAGGED) || (pending_io->scsiio.tag_type == CTL_TAG_SIMPLE)) && ((ooa_io->scsiio.tag_type == CTL_TAG_HEAD_OF_QUEUE) || (ooa_io->scsiio.tag_type == CTL_TAG_ORDERED))) return (CTL_ACTION_BLOCK); pending_entry = ctl_get_cmd_entry(&pending_io->scsiio); ooa_entry = ctl_get_cmd_entry(&ooa_io->scsiio); serialize_row = ctl_serialize_table[ooa_entry->seridx]; switch (serialize_row[pending_entry->seridx]) { case CTL_SER_BLOCK: return (CTL_ACTION_BLOCK); break; /* NOTREACHED */ case CTL_SER_EXTENT: return (ctl_extent_check(pending_io, ooa_io)); break; /* NOTREACHED */ case CTL_SER_PASS: return (CTL_ACTION_PASS); break; /* NOTREACHED */ case CTL_SER_SKIP: return (CTL_ACTION_SKIP); break; default: panic("invalid serialization value %d", serialize_row[pending_entry->seridx]); break; /* NOTREACHED */ } return (CTL_ACTION_ERROR); } /* * Check for blockage or overlaps against the OOA (Order Of Arrival) queue. * Assumptions: * - pending_io is generally either incoming, or on the blocked queue * - starting I/O is the I/O we want to start the check with. */ static ctl_action ctl_check_ooa(struct ctl_lun *lun, union ctl_io *pending_io, union ctl_io *starting_io) { union ctl_io *ooa_io; ctl_action action; mtx_assert(&lun->lun_lock, MA_OWNED); /* * Run back along the OOA queue, starting with the current * blocked I/O and going through every I/O before it on the * queue. If starting_io is NULL, we'll just end up returning * CTL_ACTION_PASS. */ for (ooa_io = starting_io; ooa_io != NULL; ooa_io = (union ctl_io *)TAILQ_PREV(&ooa_io->io_hdr, ctl_ooaq, ooa_links)){ /* * This routine just checks to see whether * cur_blocked is blocked by ooa_io, which is ahead * of it in the queue. It doesn't queue/dequeue * cur_blocked. */ action = ctl_check_for_blockage(pending_io, ooa_io); switch (action) { case CTL_ACTION_BLOCK: case CTL_ACTION_OVERLAP: case CTL_ACTION_OVERLAP_TAG: case CTL_ACTION_SKIP: case CTL_ACTION_ERROR: return (action); break; /* NOTREACHED */ case CTL_ACTION_PASS: break; default: panic("invalid action %d", action); break; /* NOTREACHED */ } } return (CTL_ACTION_PASS); } /* * Assumptions: * - An I/O has just completed, and has been removed from the per-LUN OOA * queue, so some items on the blocked queue may now be unblocked. */ static int ctl_check_blocked(struct ctl_lun *lun) { union ctl_io *cur_blocked, *next_blocked; mtx_assert(&lun->lun_lock, MA_OWNED); /* * Run forward from the head of the blocked queue, checking each * entry against the I/Os prior to it on the OOA queue to see if * there is still any blockage. * * We cannot use the TAILQ_FOREACH() macro, because it can't deal * with our removing a variable on it while it is traversing the * list. */ for (cur_blocked = (union ctl_io *)TAILQ_FIRST(&lun->blocked_queue); cur_blocked != NULL; cur_blocked = next_blocked) { union ctl_io *prev_ooa; ctl_action action; next_blocked = (union ctl_io *)TAILQ_NEXT(&cur_blocked->io_hdr, blocked_links); prev_ooa = (union ctl_io *)TAILQ_PREV(&cur_blocked->io_hdr, ctl_ooaq, ooa_links); /* * If cur_blocked happens to be the first item in the OOA * queue now, prev_ooa will be NULL, and the action * returned will just be CTL_ACTION_PASS. */ action = ctl_check_ooa(lun, cur_blocked, prev_ooa); switch (action) { case CTL_ACTION_BLOCK: /* Nothing to do here, still blocked */ break; case CTL_ACTION_OVERLAP: case CTL_ACTION_OVERLAP_TAG: /* * This shouldn't happen! In theory we've already * checked this command for overlap... */ break; case CTL_ACTION_PASS: case CTL_ACTION_SKIP: { struct ctl_softc *softc; const struct ctl_cmd_entry *entry; uint32_t initidx; int isc_retval; /* * The skip case shouldn't happen, this transaction * should have never made it onto the blocked queue. */ /* * This I/O is no longer blocked, we can remove it * from the blocked queue. Since this is a TAILQ * (doubly linked list), we can do O(1) removals * from any place on the list. */ TAILQ_REMOVE(&lun->blocked_queue, &cur_blocked->io_hdr, blocked_links); cur_blocked->io_hdr.flags &= ~CTL_FLAG_BLOCKED; if (cur_blocked->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC){ /* * Need to send IO back to original side to * run */ union ctl_ha_msg msg_info; msg_info.hdr.original_sc = cur_blocked->io_hdr.original_sc; msg_info.hdr.serializing_sc = cur_blocked; msg_info.hdr.msg_type = CTL_MSG_R2R; if ((isc_retval=ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info, sizeof(msg_info), 0)) > CTL_HA_STATUS_SUCCESS) { printf("CTL:Check Blocked error from " "ctl_ha_msg_send %d\n", isc_retval); } break; } entry = ctl_get_cmd_entry(&cur_blocked->scsiio); softc = control_softc; initidx = ctl_get_initindex(&cur_blocked->io_hdr.nexus); /* * Check this I/O for LUN state changes that may * have happened while this command was blocked. * The LUN state may have been changed by a command * ahead of us in the queue, so we need to re-check * for any states that can be caused by SCSI * commands. */ if (ctl_scsiio_lun_check(softc, lun, entry, &cur_blocked->scsiio) == 0) { cur_blocked->io_hdr.flags |= CTL_FLAG_IS_WAS_ON_RTR; ctl_enqueue_rtr(cur_blocked); } else ctl_done(cur_blocked); break; } default: /* * This probably shouldn't happen -- we shouldn't * get CTL_ACTION_ERROR, or anything else. */ break; } } return (CTL_RETVAL_COMPLETE); } /* * This routine (with one exception) checks LUN flags that can be set by * commands ahead of us in the OOA queue. These flags have to be checked * when a command initially comes in, and when we pull a command off the * blocked queue and are preparing to execute it. The reason we have to * check these flags for commands on the blocked queue is that the LUN * state may have been changed by a command ahead of us while we're on the * blocked queue. * * Ordering is somewhat important with these checks, so please pay * careful attention to the placement of any new checks. */ static int ctl_scsiio_lun_check(struct ctl_softc *ctl_softc, struct ctl_lun *lun, const struct ctl_cmd_entry *entry, struct ctl_scsiio *ctsio) { int retval; retval = 0; mtx_assert(&lun->lun_lock, MA_OWNED); /* * If this shelf is a secondary shelf controller, we have to reject * any media access commands. */ #if 0 /* No longer needed for HA */ if (((ctl_softc->flags & CTL_FLAG_MASTER_SHELF) == 0) && ((entry->flags & CTL_CMD_FLAG_OK_ON_SECONDARY) == 0)) { ctl_set_lun_standby(ctsio); retval = 1; goto bailout; } #endif /* * Check for a reservation conflict. If this command isn't allowed * even on reserved LUNs, and if this initiator isn't the one who * reserved us, reject the command with a reservation conflict. */ if ((lun->flags & CTL_LUN_RESERVED) && ((entry->flags & CTL_CMD_FLAG_ALLOW_ON_RESV) == 0)) { if ((ctsio->io_hdr.nexus.initid.id != lun->rsv_nexus.initid.id) || (ctsio->io_hdr.nexus.targ_port != lun->rsv_nexus.targ_port) || (ctsio->io_hdr.nexus.targ_target.id != lun->rsv_nexus.targ_target.id)) { ctsio->scsi_status = SCSI_STATUS_RESERV_CONFLICT; ctsio->io_hdr.status = CTL_SCSI_ERROR; retval = 1; goto bailout; } } if ( (lun->flags & CTL_LUN_PR_RESERVED) && ((entry->flags & CTL_CMD_FLAG_ALLOW_ON_PR_RESV) == 0)) { uint32_t residx; residx = ctl_get_resindex(&ctsio->io_hdr.nexus); /* * if we aren't registered or it's a res holder type * reservation and this isn't the res holder then set a * conflict. * NOTE: Commands which might be allowed on write exclusive * type reservations are checked in the particular command * for a conflict. Read and SSU are the only ones. */ if (!lun->per_res[residx].registered || (residx != lun->pr_res_idx && lun->res_type < 4)) { ctsio->scsi_status = SCSI_STATUS_RESERV_CONFLICT; ctsio->io_hdr.status = CTL_SCSI_ERROR; retval = 1; goto bailout; } } if ((lun->flags & CTL_LUN_OFFLINE) && ((entry->flags & CTL_CMD_FLAG_OK_ON_OFFLINE) == 0)) { ctl_set_lun_not_ready(ctsio); retval = 1; goto bailout; } /* * If the LUN is stopped, see if this particular command is allowed * for a stopped lun. Otherwise, reject it with 0x04,0x02. */ if ((lun->flags & CTL_LUN_STOPPED) && ((entry->flags & CTL_CMD_FLAG_OK_ON_STOPPED) == 0)) { /* "Logical unit not ready, initializing cmd. required" */ ctl_set_lun_stopped(ctsio); retval = 1; goto bailout; } if ((lun->flags & CTL_LUN_INOPERABLE) && ((entry->flags & CTL_CMD_FLAG_OK_ON_INOPERABLE) == 0)) { /* "Medium format corrupted" */ ctl_set_medium_format_corrupted(ctsio); retval = 1; goto bailout; } bailout: return (retval); } static void ctl_failover_io(union ctl_io *io, int have_lock) { ctl_set_busy(&io->scsiio); ctl_done(io); } static void ctl_failover(void) { struct ctl_lun *lun; struct ctl_softc *ctl_softc; union ctl_io *next_io, *pending_io; union ctl_io *io; int lun_idx; int i; ctl_softc = control_softc; mtx_lock(&ctl_softc->ctl_lock); /* * Remove any cmds from the other SC from the rtr queue. These * will obviously only be for LUNs for which we're the primary. * We can't send status or get/send data for these commands. * Since they haven't been executed yet, we can just remove them. * We'll either abort them or delete them below, depending on * which HA mode we're in. */ #ifdef notyet mtx_lock(&ctl_softc->queue_lock); for (io = (union ctl_io *)STAILQ_FIRST(&ctl_softc->rtr_queue); io != NULL; io = next_io) { next_io = (union ctl_io *)STAILQ_NEXT(&io->io_hdr, links); if (io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC) STAILQ_REMOVE(&ctl_softc->rtr_queue, &io->io_hdr, ctl_io_hdr, links); } mtx_unlock(&ctl_softc->queue_lock); #endif for (lun_idx=0; lun_idx < ctl_softc->num_luns; lun_idx++) { lun = ctl_softc->ctl_luns[lun_idx]; if (lun==NULL) continue; /* * Processor LUNs are primary on both sides. * XXX will this always be true? */ if (lun->be_lun->lun_type == T_PROCESSOR) continue; if ((lun->flags & CTL_LUN_PRIMARY_SC) && (ctl_softc->ha_mode == CTL_HA_MODE_SER_ONLY)) { printf("FAILOVER: primary lun %d\n", lun_idx); /* * Remove all commands from the other SC. First from the * blocked queue then from the ooa queue. Once we have * removed them. Call ctl_check_blocked to see if there * is anything that can run. */ for (io = (union ctl_io *)TAILQ_FIRST( &lun->blocked_queue); io != NULL; io = next_io) { next_io = (union ctl_io *)TAILQ_NEXT( &io->io_hdr, blocked_links); if (io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC) { TAILQ_REMOVE(&lun->blocked_queue, &io->io_hdr,blocked_links); io->io_hdr.flags &= ~CTL_FLAG_BLOCKED; TAILQ_REMOVE(&lun->ooa_queue, &io->io_hdr, ooa_links); ctl_free_io(io); } } for (io = (union ctl_io *)TAILQ_FIRST(&lun->ooa_queue); io != NULL; io = next_io) { next_io = (union ctl_io *)TAILQ_NEXT( &io->io_hdr, ooa_links); if (io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC) { TAILQ_REMOVE(&lun->ooa_queue, &io->io_hdr, ooa_links); ctl_free_io(io); } } ctl_check_blocked(lun); } else if ((lun->flags & CTL_LUN_PRIMARY_SC) && (ctl_softc->ha_mode == CTL_HA_MODE_XFER)) { printf("FAILOVER: primary lun %d\n", lun_idx); /* * Abort all commands from the other SC. We can't * send status back for them now. These should get * cleaned up when they are completed or come out * for a datamove operation. */ for (io = (union ctl_io *)TAILQ_FIRST(&lun->ooa_queue); io != NULL; io = next_io) { next_io = (union ctl_io *)TAILQ_NEXT( &io->io_hdr, ooa_links); if (io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC) io->io_hdr.flags |= CTL_FLAG_ABORT; } } else if (((lun->flags & CTL_LUN_PRIMARY_SC) == 0) && (ctl_softc->ha_mode == CTL_HA_MODE_XFER)) { printf("FAILOVER: secondary lun %d\n", lun_idx); lun->flags |= CTL_LUN_PRIMARY_SC; /* * We send all I/O that was sent to this controller * and redirected to the other side back with * busy status, and have the initiator retry it. * Figuring out how much data has been transferred, * etc. and picking up where we left off would be * very tricky. * * XXX KDM need to remove I/O from the blocked * queue as well! */ for (pending_io = (union ctl_io *)TAILQ_FIRST( &lun->ooa_queue); pending_io != NULL; pending_io = next_io) { next_io = (union ctl_io *)TAILQ_NEXT( &pending_io->io_hdr, ooa_links); pending_io->io_hdr.flags &= ~CTL_FLAG_SENT_2OTHER_SC; if (pending_io->io_hdr.flags & CTL_FLAG_IO_ACTIVE) { pending_io->io_hdr.flags |= CTL_FLAG_FAILOVER; } else { ctl_set_busy(&pending_io->scsiio); ctl_done(pending_io); } } /* * Build Unit Attention */ for (i = 0; i < CTL_MAX_INITIATORS; i++) { lun->pending_ua[i] |= CTL_UA_ASYM_ACC_CHANGE; } } else if (((lun->flags & CTL_LUN_PRIMARY_SC) == 0) && (ctl_softc->ha_mode == CTL_HA_MODE_SER_ONLY)) { printf("FAILOVER: secondary lun %d\n", lun_idx); /* * if the first io on the OOA is not on the RtR queue * add it. */ lun->flags |= CTL_LUN_PRIMARY_SC; pending_io = (union ctl_io *)TAILQ_FIRST( &lun->ooa_queue); if (pending_io==NULL) { printf("Nothing on OOA queue\n"); continue; } pending_io->io_hdr.flags &= ~CTL_FLAG_SENT_2OTHER_SC; if ((pending_io->io_hdr.flags & CTL_FLAG_IS_WAS_ON_RTR) == 0) { pending_io->io_hdr.flags |= CTL_FLAG_IS_WAS_ON_RTR; ctl_enqueue_rtr(pending_io); } #if 0 else { printf("Tag 0x%04x is running\n", pending_io->scsiio.tag_num); } #endif next_io = (union ctl_io *)TAILQ_NEXT( &pending_io->io_hdr, ooa_links); for (pending_io=next_io; pending_io != NULL; pending_io = next_io) { pending_io->io_hdr.flags &= ~CTL_FLAG_SENT_2OTHER_SC; next_io = (union ctl_io *)TAILQ_NEXT( &pending_io->io_hdr, ooa_links); if (pending_io->io_hdr.flags & CTL_FLAG_IS_WAS_ON_RTR) { #if 0 printf("Tag 0x%04x is running\n", pending_io->scsiio.tag_num); #endif continue; } switch (ctl_check_ooa(lun, pending_io, (union ctl_io *)TAILQ_PREV( &pending_io->io_hdr, ctl_ooaq, ooa_links))) { case CTL_ACTION_BLOCK: TAILQ_INSERT_TAIL(&lun->blocked_queue, &pending_io->io_hdr, blocked_links); pending_io->io_hdr.flags |= CTL_FLAG_BLOCKED; break; case CTL_ACTION_PASS: case CTL_ACTION_SKIP: pending_io->io_hdr.flags |= CTL_FLAG_IS_WAS_ON_RTR; ctl_enqueue_rtr(pending_io); break; case CTL_ACTION_OVERLAP: ctl_set_overlapped_cmd( (struct ctl_scsiio *)pending_io); ctl_done(pending_io); break; case CTL_ACTION_OVERLAP_TAG: ctl_set_overlapped_tag( (struct ctl_scsiio *)pending_io, pending_io->scsiio.tag_num & 0xff); ctl_done(pending_io); break; case CTL_ACTION_ERROR: default: ctl_set_internal_failure( (struct ctl_scsiio *)pending_io, 0, // sks_valid 0); //retry count ctl_done(pending_io); break; } } /* * Build Unit Attention */ for (i = 0; i < CTL_MAX_INITIATORS; i++) { lun->pending_ua[i] |= CTL_UA_ASYM_ACC_CHANGE; } } else { panic("Unhandled HA mode failover, LUN flags = %#x, " "ha_mode = #%x", lun->flags, ctl_softc->ha_mode); } } ctl_pause_rtr = 0; mtx_unlock(&ctl_softc->ctl_lock); } static int ctl_scsiio_precheck(struct ctl_softc *ctl_softc, struct ctl_scsiio *ctsio) { struct ctl_lun *lun; const struct ctl_cmd_entry *entry; uint32_t initidx, targ_lun; int retval; retval = 0; lun = NULL; targ_lun = ctsio->io_hdr.nexus.targ_mapped_lun; if ((targ_lun < CTL_MAX_LUNS) && (ctl_softc->ctl_luns[targ_lun] != NULL)) { lun = ctl_softc->ctl_luns[targ_lun]; /* * If the LUN is invalid, pretend that it doesn't exist. * It will go away as soon as all pending I/O has been * completed. */ if (lun->flags & CTL_LUN_DISABLED) { lun = NULL; } else { ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr = lun; ctsio->io_hdr.ctl_private[CTL_PRIV_BACKEND_LUN].ptr = lun->be_lun; if (lun->be_lun->lun_type == T_PROCESSOR) { ctsio->io_hdr.flags |= CTL_FLAG_CONTROL_DEV; } /* * Every I/O goes into the OOA queue for a * particular LUN, and stays there until completion. */ mtx_lock(&lun->lun_lock); TAILQ_INSERT_TAIL(&lun->ooa_queue, &ctsio->io_hdr, ooa_links); } } else { ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr = NULL; ctsio->io_hdr.ctl_private[CTL_PRIV_BACKEND_LUN].ptr = NULL; } /* Get command entry and return error if it is unsuppotyed. */ entry = ctl_validate_command(ctsio); if (entry == NULL) { if (lun) mtx_unlock(&lun->lun_lock); return (retval); } ctsio->io_hdr.flags &= ~CTL_FLAG_DATA_MASK; ctsio->io_hdr.flags |= entry->flags & CTL_FLAG_DATA_MASK; /* * Check to see whether we can send this command to LUNs that don't * exist. This should pretty much only be the case for inquiry * and request sense. Further checks, below, really require having * a LUN, so we can't really check the command anymore. Just put * it on the rtr queue. */ if (lun == NULL) { if (entry->flags & CTL_CMD_FLAG_OK_ON_ALL_LUNS) { ctsio->io_hdr.flags |= CTL_FLAG_IS_WAS_ON_RTR; ctl_enqueue_rtr((union ctl_io *)ctsio); return (retval); } ctl_set_unsupported_lun(ctsio); ctl_done((union ctl_io *)ctsio); CTL_DEBUG_PRINT(("ctl_scsiio_precheck: bailing out due to invalid LUN\n")); return (retval); } else { /* * Make sure we support this particular command on this LUN. * e.g., we don't support writes to the control LUN. */ if (!ctl_cmd_applicable(lun->be_lun->lun_type, entry)) { mtx_unlock(&lun->lun_lock); ctl_set_invalid_opcode(ctsio); ctl_done((union ctl_io *)ctsio); return (retval); } } initidx = ctl_get_initindex(&ctsio->io_hdr.nexus); #ifdef CTL_WITH_CA /* * If we've got a request sense, it'll clear the contingent * allegiance condition. Otherwise, if we have a CA condition for * this initiator, clear it, because it sent down a command other * than request sense. */ if ((ctsio->cdb[0] != REQUEST_SENSE) && (ctl_is_set(lun->have_ca, initidx))) ctl_clear_mask(lun->have_ca, initidx); #endif /* * If the command has this flag set, it handles its own unit * attention reporting, we shouldn't do anything. Otherwise we * check for any pending unit attentions, and send them back to the * initiator. We only do this when a command initially comes in, * not when we pull it off the blocked queue. * * According to SAM-3, section 5.3.2, the order that things get * presented back to the host is basically unit attentions caused * by some sort of reset event, busy status, reservation conflicts * or task set full, and finally any other status. * * One issue here is that some of the unit attentions we report * don't fall into the "reset" category (e.g. "reported luns data * has changed"). So reporting it here, before the reservation * check, may be technically wrong. I guess the only thing to do * would be to check for and report the reset events here, and then * check for the other unit attention types after we check for a * reservation conflict. * * XXX KDM need to fix this */ if ((entry->flags & CTL_CMD_FLAG_NO_SENSE) == 0) { ctl_ua_type ua_type; ua_type = lun->pending_ua[initidx]; if (ua_type != CTL_UA_NONE) { scsi_sense_data_type sense_format; if (lun != NULL) sense_format = (lun->flags & CTL_LUN_SENSE_DESC) ? SSD_TYPE_DESC : SSD_TYPE_FIXED; else sense_format = SSD_TYPE_FIXED; ua_type = ctl_build_ua(ua_type, &ctsio->sense_data, sense_format); if (ua_type != CTL_UA_NONE) { ctsio->scsi_status = SCSI_STATUS_CHECK_COND; ctsio->io_hdr.status = CTL_SCSI_ERROR | CTL_AUTOSENSE; ctsio->sense_len = SSD_FULL_SIZE; lun->pending_ua[initidx] &= ~ua_type; mtx_unlock(&lun->lun_lock); ctl_done((union ctl_io *)ctsio); return (retval); } } } if (ctl_scsiio_lun_check(ctl_softc, lun, entry, ctsio) != 0) { mtx_unlock(&lun->lun_lock); ctl_done((union ctl_io *)ctsio); return (retval); } /* * XXX CHD this is where we want to send IO to other side if * this LUN is secondary on this SC. We will need to make a copy * of the IO and flag the IO on this side as SENT_2OTHER and the flag * the copy we send as FROM_OTHER. * We also need to stuff the address of the original IO so we can * find it easily. Something similar will need be done on the other * side so when we are done we can find the copy. */ if ((lun->flags & CTL_LUN_PRIMARY_SC) == 0) { union ctl_ha_msg msg_info; int isc_retval; ctsio->io_hdr.flags |= CTL_FLAG_SENT_2OTHER_SC; msg_info.hdr.msg_type = CTL_MSG_SERIALIZE; msg_info.hdr.original_sc = (union ctl_io *)ctsio; #if 0 printf("1. ctsio %p\n", ctsio); #endif msg_info.hdr.serializing_sc = NULL; msg_info.hdr.nexus = ctsio->io_hdr.nexus; msg_info.scsi.tag_num = ctsio->tag_num; msg_info.scsi.tag_type = ctsio->tag_type; memcpy(msg_info.scsi.cdb, ctsio->cdb, CTL_MAX_CDBLEN); ctsio->io_hdr.flags &= ~CTL_FLAG_IO_ACTIVE; if ((isc_retval=ctl_ha_msg_send(CTL_HA_CHAN_CTL, (void *)&msg_info, sizeof(msg_info), 0)) > CTL_HA_STATUS_SUCCESS) { printf("CTL:precheck, ctl_ha_msg_send returned %d\n", isc_retval); printf("CTL:opcode is %x\n", ctsio->cdb[0]); } else { #if 0 printf("CTL:Precheck sent msg, opcode is %x\n",opcode); #endif } /* * XXX KDM this I/O is off the incoming queue, but hasn't * been inserted on any other queue. We may need to come * up with a holding queue while we wait for serialization * so that we have an idea of what we're waiting for from * the other side. */ mtx_unlock(&lun->lun_lock); return (retval); } switch (ctl_check_ooa(lun, (union ctl_io *)ctsio, (union ctl_io *)TAILQ_PREV(&ctsio->io_hdr, ctl_ooaq, ooa_links))) { case CTL_ACTION_BLOCK: ctsio->io_hdr.flags |= CTL_FLAG_BLOCKED; TAILQ_INSERT_TAIL(&lun->blocked_queue, &ctsio->io_hdr, blocked_links); mtx_unlock(&lun->lun_lock); return (retval); case CTL_ACTION_PASS: case CTL_ACTION_SKIP: ctsio->io_hdr.flags |= CTL_FLAG_IS_WAS_ON_RTR; mtx_unlock(&lun->lun_lock); ctl_enqueue_rtr((union ctl_io *)ctsio); break; case CTL_ACTION_OVERLAP: mtx_unlock(&lun->lun_lock); ctl_set_overlapped_cmd(ctsio); ctl_done((union ctl_io *)ctsio); break; case CTL_ACTION_OVERLAP_TAG: mtx_unlock(&lun->lun_lock); ctl_set_overlapped_tag(ctsio, ctsio->tag_num & 0xff); ctl_done((union ctl_io *)ctsio); break; case CTL_ACTION_ERROR: default: mtx_unlock(&lun->lun_lock); ctl_set_internal_failure(ctsio, /*sks_valid*/ 0, /*retry_count*/ 0); ctl_done((union ctl_io *)ctsio); break; } return (retval); } const struct ctl_cmd_entry * ctl_get_cmd_entry(struct ctl_scsiio *ctsio) { const struct ctl_cmd_entry *entry; int service_action; entry = &ctl_cmd_table[ctsio->cdb[0]]; if (entry->flags & CTL_CMD_FLAG_SA5) { service_action = ctsio->cdb[1] & SERVICE_ACTION_MASK; entry = &((const struct ctl_cmd_entry *) entry->execute)[service_action]; } return (entry); } const struct ctl_cmd_entry * ctl_validate_command(struct ctl_scsiio *ctsio) { const struct ctl_cmd_entry *entry; int i; uint8_t diff; entry = ctl_get_cmd_entry(ctsio); if (entry->execute == NULL) { ctl_set_invalid_opcode(ctsio); ctl_done((union ctl_io *)ctsio); return (NULL); } KASSERT(entry->length > 0, ("Not defined length for command 0x%02x/0x%02x", ctsio->cdb[0], ctsio->cdb[1])); for (i = 1; i < entry->length; i++) { diff = ctsio->cdb[i] & ~entry->usage[i - 1]; if (diff == 0) continue; ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ i, /*bit_valid*/ 1, /*bit*/ fls(diff) - 1); ctl_done((union ctl_io *)ctsio); return (NULL); } return (entry); } static int ctl_cmd_applicable(uint8_t lun_type, const struct ctl_cmd_entry *entry) { switch (lun_type) { case T_PROCESSOR: if (((entry->flags & CTL_CMD_FLAG_OK_ON_PROC) == 0) && ((entry->flags & CTL_CMD_FLAG_OK_ON_ALL_LUNS) == 0)) return (0); break; case T_DIRECT: if (((entry->flags & CTL_CMD_FLAG_OK_ON_SLUN) == 0) && ((entry->flags & CTL_CMD_FLAG_OK_ON_ALL_LUNS) == 0)) return (0); break; default: return (0); } return (1); } static int ctl_scsiio(struct ctl_scsiio *ctsio) { int retval; const struct ctl_cmd_entry *entry; retval = CTL_RETVAL_COMPLETE; CTL_DEBUG_PRINT(("ctl_scsiio cdb[0]=%02X\n", ctsio->cdb[0])); entry = ctl_get_cmd_entry(ctsio); /* * If this I/O has been aborted, just send it straight to * ctl_done() without executing it. */ if (ctsio->io_hdr.flags & CTL_FLAG_ABORT) { ctl_done((union ctl_io *)ctsio); goto bailout; } /* * All the checks should have been handled by ctl_scsiio_precheck(). * We should be clear now to just execute the I/O. */ retval = entry->execute(ctsio); bailout: return (retval); } /* * Since we only implement one target right now, a bus reset simply resets * our single target. */ static int ctl_bus_reset(struct ctl_softc *ctl_softc, union ctl_io *io) { return(ctl_target_reset(ctl_softc, io, CTL_UA_BUS_RESET)); } static int ctl_target_reset(struct ctl_softc *ctl_softc, union ctl_io *io, ctl_ua_type ua_type) { struct ctl_lun *lun; int retval; if (!(io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC)) { union ctl_ha_msg msg_info; io->io_hdr.flags |= CTL_FLAG_SENT_2OTHER_SC; msg_info.hdr.nexus = io->io_hdr.nexus; if (ua_type==CTL_UA_TARG_RESET) msg_info.task.task_action = CTL_TASK_TARGET_RESET; else msg_info.task.task_action = CTL_TASK_BUS_RESET; msg_info.hdr.msg_type = CTL_MSG_MANAGE_TASKS; msg_info.hdr.original_sc = NULL; msg_info.hdr.serializing_sc = NULL; if (CTL_HA_STATUS_SUCCESS != ctl_ha_msg_send(CTL_HA_CHAN_CTL, (void *)&msg_info, sizeof(msg_info), 0)) { } } retval = 0; mtx_lock(&ctl_softc->ctl_lock); STAILQ_FOREACH(lun, &ctl_softc->lun_list, links) retval += ctl_lun_reset(lun, io, ua_type); mtx_unlock(&ctl_softc->ctl_lock); return (retval); } /* * The LUN should always be set. The I/O is optional, and is used to * distinguish between I/Os sent by this initiator, and by other * initiators. We set unit attention for initiators other than this one. * SAM-3 is vague on this point. It does say that a unit attention should * be established for other initiators when a LUN is reset (see section * 5.7.3), but it doesn't specifically say that the unit attention should * be established for this particular initiator when a LUN is reset. Here * is the relevant text, from SAM-3 rev 8: * * 5.7.2 When a SCSI initiator port aborts its own tasks * * When a SCSI initiator port causes its own task(s) to be aborted, no * notification that the task(s) have been aborted shall be returned to * the SCSI initiator port other than the completion response for the * command or task management function action that caused the task(s) to * be aborted and notification(s) associated with related effects of the * action (e.g., a reset unit attention condition). * * XXX KDM for now, we're setting unit attention for all initiators. */ static int ctl_lun_reset(struct ctl_lun *lun, union ctl_io *io, ctl_ua_type ua_type) { union ctl_io *xio; #if 0 uint32_t initindex; #endif int i; mtx_lock(&lun->lun_lock); /* * Run through the OOA queue and abort each I/O. */ #if 0 TAILQ_FOREACH((struct ctl_io_hdr *)xio, &lun->ooa_queue, ooa_links) { #endif for (xio = (union ctl_io *)TAILQ_FIRST(&lun->ooa_queue); xio != NULL; xio = (union ctl_io *)TAILQ_NEXT(&xio->io_hdr, ooa_links)) { xio->io_hdr.flags |= CTL_FLAG_ABORT | CTL_FLAG_ABORT_STATUS; } /* * This version sets unit attention for every */ #if 0 initindex = ctl_get_initindex(&io->io_hdr.nexus); for (i = 0; i < CTL_MAX_INITIATORS; i++) { if (initindex == i) continue; lun->pending_ua[i] |= ua_type; } #endif /* * A reset (any kind, really) clears reservations established with * RESERVE/RELEASE. It does not clear reservations established * with PERSISTENT RESERVE OUT, but we don't support that at the * moment anyway. See SPC-2, section 5.6. SPC-3 doesn't address * reservations made with the RESERVE/RELEASE commands, because * those commands are obsolete in SPC-3. */ lun->flags &= ~CTL_LUN_RESERVED; for (i = 0; i < CTL_MAX_INITIATORS; i++) { #ifdef CTL_WITH_CA ctl_clear_mask(lun->have_ca, i); #endif lun->pending_ua[i] |= ua_type; } mtx_unlock(&lun->lun_lock); return (0); } static void ctl_abort_tasks_lun(struct ctl_lun *lun, uint32_t targ_port, uint32_t init_id, int other_sc) { union ctl_io *xio; mtx_assert(&lun->lun_lock, MA_OWNED); /* * Run through the OOA queue and attempt to find the given I/O. * The target port, initiator ID, tag type and tag number have to * match the values that we got from the initiator. If we have an * untagged command to abort, simply abort the first untagged command * we come to. We only allow one untagged command at a time of course. */ for (xio = (union ctl_io *)TAILQ_FIRST(&lun->ooa_queue); xio != NULL; xio = (union ctl_io *)TAILQ_NEXT(&xio->io_hdr, ooa_links)) { if ((targ_port == UINT32_MAX || targ_port == xio->io_hdr.nexus.targ_port) && (init_id == UINT32_MAX || init_id == xio->io_hdr.nexus.initid.id)) { if (targ_port != xio->io_hdr.nexus.targ_port || init_id != xio->io_hdr.nexus.initid.id) xio->io_hdr.flags |= CTL_FLAG_ABORT_STATUS; xio->io_hdr.flags |= CTL_FLAG_ABORT; if (!other_sc && !(lun->flags & CTL_LUN_PRIMARY_SC)) { union ctl_ha_msg msg_info; msg_info.hdr.nexus = xio->io_hdr.nexus; msg_info.task.task_action = CTL_TASK_ABORT_TASK; msg_info.task.tag_num = xio->scsiio.tag_num; msg_info.task.tag_type = xio->scsiio.tag_type; msg_info.hdr.msg_type = CTL_MSG_MANAGE_TASKS; msg_info.hdr.original_sc = NULL; msg_info.hdr.serializing_sc = NULL; ctl_ha_msg_send(CTL_HA_CHAN_CTL, (void *)&msg_info, sizeof(msg_info), 0); } } } } static int ctl_abort_task_set(union ctl_io *io) { struct ctl_softc *softc = control_softc; struct ctl_lun *lun; uint32_t targ_lun; /* * Look up the LUN. */ targ_lun = io->io_hdr.nexus.targ_mapped_lun; mtx_lock(&softc->ctl_lock); if ((targ_lun < CTL_MAX_LUNS) && (softc->ctl_luns[targ_lun] != NULL)) lun = softc->ctl_luns[targ_lun]; else { mtx_unlock(&softc->ctl_lock); return (1); } mtx_lock(&lun->lun_lock); mtx_unlock(&softc->ctl_lock); if (io->taskio.task_action == CTL_TASK_ABORT_TASK_SET) { ctl_abort_tasks_lun(lun, io->io_hdr.nexus.targ_port, io->io_hdr.nexus.initid.id, (io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC) != 0); } else { /* CTL_TASK_CLEAR_TASK_SET */ ctl_abort_tasks_lun(lun, UINT32_MAX, UINT32_MAX, (io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC) != 0); } mtx_unlock(&lun->lun_lock); return (0); } static int ctl_i_t_nexus_reset(union ctl_io *io) { struct ctl_softc *softc = control_softc; struct ctl_lun *lun; uint32_t initindex; initindex = ctl_get_initindex(&io->io_hdr.nexus); mtx_lock(&softc->ctl_lock); STAILQ_FOREACH(lun, &softc->lun_list, links) { mtx_lock(&lun->lun_lock); ctl_abort_tasks_lun(lun, io->io_hdr.nexus.targ_port, io->io_hdr.nexus.initid.id, (io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC) != 0); #ifdef CTL_WITH_CA ctl_clear_mask(lun->have_ca, initindex); #endif lun->pending_ua[initindex] |= CTL_UA_I_T_NEXUS_LOSS; mtx_unlock(&lun->lun_lock); } mtx_unlock(&softc->ctl_lock); return (0); } static int ctl_abort_task(union ctl_io *io) { union ctl_io *xio; struct ctl_lun *lun; struct ctl_softc *ctl_softc; #if 0 struct sbuf sb; char printbuf[128]; #endif int found; uint32_t targ_lun; ctl_softc = control_softc; found = 0; /* * Look up the LUN. */ targ_lun = io->io_hdr.nexus.targ_mapped_lun; mtx_lock(&ctl_softc->ctl_lock); if ((targ_lun < CTL_MAX_LUNS) && (ctl_softc->ctl_luns[targ_lun] != NULL)) lun = ctl_softc->ctl_luns[targ_lun]; else { mtx_unlock(&ctl_softc->ctl_lock); return (1); } #if 0 printf("ctl_abort_task: called for lun %lld, tag %d type %d\n", lun->lun, io->taskio.tag_num, io->taskio.tag_type); #endif mtx_lock(&lun->lun_lock); mtx_unlock(&ctl_softc->ctl_lock); /* * Run through the OOA queue and attempt to find the given I/O. * The target port, initiator ID, tag type and tag number have to * match the values that we got from the initiator. If we have an * untagged command to abort, simply abort the first untagged command * we come to. We only allow one untagged command at a time of course. */ #if 0 TAILQ_FOREACH((struct ctl_io_hdr *)xio, &lun->ooa_queue, ooa_links) { #endif for (xio = (union ctl_io *)TAILQ_FIRST(&lun->ooa_queue); xio != NULL; xio = (union ctl_io *)TAILQ_NEXT(&xio->io_hdr, ooa_links)) { #if 0 sbuf_new(&sb, printbuf, sizeof(printbuf), SBUF_FIXEDLEN); sbuf_printf(&sb, "LUN %lld tag %d type %d%s%s%s%s: ", lun->lun, xio->scsiio.tag_num, xio->scsiio.tag_type, (xio->io_hdr.blocked_links.tqe_prev == NULL) ? "" : " BLOCKED", (xio->io_hdr.flags & CTL_FLAG_DMA_INPROG) ? " DMA" : "", (xio->io_hdr.flags & CTL_FLAG_ABORT) ? " ABORT" : "", (xio->io_hdr.flags & CTL_FLAG_IS_WAS_ON_RTR ? " RTR" : "")); ctl_scsi_command_string(&xio->scsiio, NULL, &sb); sbuf_finish(&sb); printf("%s\n", sbuf_data(&sb)); #endif if ((xio->io_hdr.nexus.targ_port == io->io_hdr.nexus.targ_port) && (xio->io_hdr.nexus.initid.id == io->io_hdr.nexus.initid.id)) { /* * If the abort says that the task is untagged, the * task in the queue must be untagged. Otherwise, * we just check to see whether the tag numbers * match. This is because the QLogic firmware * doesn't pass back the tag type in an abort * request. */ #if 0 if (((xio->scsiio.tag_type == CTL_TAG_UNTAGGED) && (io->taskio.tag_type == CTL_TAG_UNTAGGED)) || (xio->scsiio.tag_num == io->taskio.tag_num)) { #endif /* * XXX KDM we've got problems with FC, because it * doesn't send down a tag type with aborts. So we * can only really go by the tag number... * This may cause problems with parallel SCSI. * Need to figure that out!! */ if (xio->scsiio.tag_num == io->taskio.tag_num) { xio->io_hdr.flags |= CTL_FLAG_ABORT; found = 1; if ((io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC) == 0 && !(lun->flags & CTL_LUN_PRIMARY_SC)) { union ctl_ha_msg msg_info; io->io_hdr.flags |= CTL_FLAG_SENT_2OTHER_SC; msg_info.hdr.nexus = io->io_hdr.nexus; msg_info.task.task_action = CTL_TASK_ABORT_TASK; msg_info.task.tag_num = io->taskio.tag_num; msg_info.task.tag_type = io->taskio.tag_type; msg_info.hdr.msg_type = CTL_MSG_MANAGE_TASKS; msg_info.hdr.original_sc = NULL; msg_info.hdr.serializing_sc = NULL; #if 0 printf("Sent Abort to other side\n"); #endif if (CTL_HA_STATUS_SUCCESS != ctl_ha_msg_send(CTL_HA_CHAN_CTL, (void *)&msg_info, sizeof(msg_info), 0)) { } } #if 0 printf("ctl_abort_task: found I/O to abort\n"); #endif break; } } } mtx_unlock(&lun->lun_lock); if (found == 0) { /* * This isn't really an error. It's entirely possible for * the abort and command completion to cross on the wire. * This is more of an informative/diagnostic error. */ #if 0 printf("ctl_abort_task: ABORT sent for nonexistent I/O: " "%d:%d:%d:%d tag %d type %d\n", io->io_hdr.nexus.initid.id, io->io_hdr.nexus.targ_port, io->io_hdr.nexus.targ_target.id, io->io_hdr.nexus.targ_lun, io->taskio.tag_num, io->taskio.tag_type); #endif } return (0); } static void ctl_run_task(union ctl_io *io) { struct ctl_softc *ctl_softc = control_softc; int retval = 1; const char *task_desc; CTL_DEBUG_PRINT(("ctl_run_task\n")); KASSERT(io->io_hdr.io_type == CTL_IO_TASK, ("ctl_run_task: Unextected io_type %d\n", io->io_hdr.io_type)); task_desc = ctl_scsi_task_string(&io->taskio); if (task_desc != NULL) { #ifdef NEEDTOPORT csevent_log(CSC_CTL | CSC_SHELF_SW | CTL_TASK_REPORT, csevent_LogType_Trace, csevent_Severity_Information, csevent_AlertLevel_Green, csevent_FRU_Firmware, csevent_FRU_Unknown, "CTL: received task: %s",task_desc); #endif } else { #ifdef NEEDTOPORT csevent_log(CSC_CTL | CSC_SHELF_SW | CTL_TASK_REPORT, csevent_LogType_Trace, csevent_Severity_Information, csevent_AlertLevel_Green, csevent_FRU_Firmware, csevent_FRU_Unknown, "CTL: received unknown task " "type: %d (%#x)", io->taskio.task_action, io->taskio.task_action); #endif } switch (io->taskio.task_action) { case CTL_TASK_ABORT_TASK: retval = ctl_abort_task(io); break; case CTL_TASK_ABORT_TASK_SET: case CTL_TASK_CLEAR_TASK_SET: retval = ctl_abort_task_set(io); break; case CTL_TASK_CLEAR_ACA: break; case CTL_TASK_I_T_NEXUS_RESET: retval = ctl_i_t_nexus_reset(io); break; case CTL_TASK_LUN_RESET: { struct ctl_lun *lun; uint32_t targ_lun; targ_lun = io->io_hdr.nexus.targ_mapped_lun; mtx_lock(&ctl_softc->ctl_lock); if ((targ_lun < CTL_MAX_LUNS) && (ctl_softc->ctl_luns[targ_lun] != NULL)) lun = ctl_softc->ctl_luns[targ_lun]; else { mtx_unlock(&ctl_softc->ctl_lock); retval = 1; break; } if (!(io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC)) { union ctl_ha_msg msg_info; io->io_hdr.flags |= CTL_FLAG_SENT_2OTHER_SC; msg_info.hdr.msg_type = CTL_MSG_MANAGE_TASKS; msg_info.hdr.nexus = io->io_hdr.nexus; msg_info.task.task_action = CTL_TASK_LUN_RESET; msg_info.hdr.original_sc = NULL; msg_info.hdr.serializing_sc = NULL; if (CTL_HA_STATUS_SUCCESS != ctl_ha_msg_send(CTL_HA_CHAN_CTL, (void *)&msg_info, sizeof(msg_info), 0)) { } } retval = ctl_lun_reset(lun, io, CTL_UA_LUN_RESET); mtx_unlock(&ctl_softc->ctl_lock); break; } case CTL_TASK_TARGET_RESET: retval = ctl_target_reset(ctl_softc, io, CTL_UA_TARG_RESET); break; case CTL_TASK_BUS_RESET: retval = ctl_bus_reset(ctl_softc, io); break; case CTL_TASK_PORT_LOGIN: break; case CTL_TASK_PORT_LOGOUT: break; default: printf("ctl_run_task: got unknown task management event %d\n", io->taskio.task_action); break; } if (retval == 0) io->io_hdr.status = CTL_SUCCESS; else io->io_hdr.status = CTL_ERROR; ctl_done(io); } /* * For HA operation. Handle commands that come in from the other * controller. */ static void ctl_handle_isc(union ctl_io *io) { int free_io; struct ctl_lun *lun; struct ctl_softc *ctl_softc; uint32_t targ_lun; ctl_softc = control_softc; targ_lun = io->io_hdr.nexus.targ_mapped_lun; lun = ctl_softc->ctl_luns[targ_lun]; switch (io->io_hdr.msg_type) { case CTL_MSG_SERIALIZE: free_io = ctl_serialize_other_sc_cmd(&io->scsiio); break; case CTL_MSG_R2R: { const struct ctl_cmd_entry *entry; /* * This is only used in SER_ONLY mode. */ free_io = 0; entry = ctl_get_cmd_entry(&io->scsiio); mtx_lock(&lun->lun_lock); if (ctl_scsiio_lun_check(ctl_softc, lun, entry, (struct ctl_scsiio *)io) != 0) { mtx_unlock(&lun->lun_lock); ctl_done(io); break; } io->io_hdr.flags |= CTL_FLAG_IS_WAS_ON_RTR; mtx_unlock(&lun->lun_lock); ctl_enqueue_rtr(io); break; } case CTL_MSG_FINISH_IO: if (ctl_softc->ha_mode == CTL_HA_MODE_XFER) { free_io = 0; ctl_done(io); } else { free_io = 1; mtx_lock(&lun->lun_lock); TAILQ_REMOVE(&lun->ooa_queue, &io->io_hdr, ooa_links); ctl_check_blocked(lun); mtx_unlock(&lun->lun_lock); } break; case CTL_MSG_PERS_ACTION: ctl_hndl_per_res_out_on_other_sc( (union ctl_ha_msg *)&io->presio.pr_msg); free_io = 1; break; case CTL_MSG_BAD_JUJU: free_io = 0; ctl_done(io); break; case CTL_MSG_DATAMOVE: /* Only used in XFER mode */ free_io = 0; ctl_datamove_remote(io); break; case CTL_MSG_DATAMOVE_DONE: /* Only used in XFER mode */ free_io = 0; io->scsiio.be_move_done(io); break; default: free_io = 1; printf("%s: Invalid message type %d\n", __func__, io->io_hdr.msg_type); break; } if (free_io) ctl_free_io(io); } /* * Returns the match type in the case of a match, or CTL_LUN_PAT_NONE if * there is no match. */ static ctl_lun_error_pattern ctl_cmd_pattern_match(struct ctl_scsiio *ctsio, struct ctl_error_desc *desc) { const struct ctl_cmd_entry *entry; ctl_lun_error_pattern filtered_pattern, pattern; pattern = desc->error_pattern; /* * XXX KDM we need more data passed into this function to match a * custom pattern, and we actually need to implement custom pattern * matching. */ if (pattern & CTL_LUN_PAT_CMD) return (CTL_LUN_PAT_CMD); if ((pattern & CTL_LUN_PAT_MASK) == CTL_LUN_PAT_ANY) return (CTL_LUN_PAT_ANY); entry = ctl_get_cmd_entry(ctsio); filtered_pattern = entry->pattern & pattern; /* * If the user requested specific flags in the pattern (e.g. * CTL_LUN_PAT_RANGE), make sure the command supports all of those * flags. * * If the user did not specify any flags, it doesn't matter whether * or not the command supports the flags. */ if ((filtered_pattern & ~CTL_LUN_PAT_MASK) != (pattern & ~CTL_LUN_PAT_MASK)) return (CTL_LUN_PAT_NONE); /* * If the user asked for a range check, see if the requested LBA * range overlaps with this command's LBA range. */ if (filtered_pattern & CTL_LUN_PAT_RANGE) { uint64_t lba1; uint32_t len1; ctl_action action; int retval; retval = ctl_get_lba_len((union ctl_io *)ctsio, &lba1, &len1); if (retval != 0) return (CTL_LUN_PAT_NONE); action = ctl_extent_check_lba(lba1, len1, desc->lba_range.lba, desc->lba_range.len); /* * A "pass" means that the LBA ranges don't overlap, so * this doesn't match the user's range criteria. */ if (action == CTL_ACTION_PASS) return (CTL_LUN_PAT_NONE); } return (filtered_pattern); } static void ctl_inject_error(struct ctl_lun *lun, union ctl_io *io) { struct ctl_error_desc *desc, *desc2; mtx_assert(&lun->lun_lock, MA_OWNED); STAILQ_FOREACH_SAFE(desc, &lun->error_list, links, desc2) { ctl_lun_error_pattern pattern; /* * Check to see whether this particular command matches * the pattern in the descriptor. */ pattern = ctl_cmd_pattern_match(&io->scsiio, desc); if ((pattern & CTL_LUN_PAT_MASK) == CTL_LUN_PAT_NONE) continue; switch (desc->lun_error & CTL_LUN_INJ_TYPE) { case CTL_LUN_INJ_ABORTED: ctl_set_aborted(&io->scsiio); break; case CTL_LUN_INJ_MEDIUM_ERR: ctl_set_medium_error(&io->scsiio); break; case CTL_LUN_INJ_UA: /* 29h/00h POWER ON, RESET, OR BUS DEVICE RESET * OCCURRED */ ctl_set_ua(&io->scsiio, 0x29, 0x00); break; case CTL_LUN_INJ_CUSTOM: /* * We're assuming the user knows what he is doing. * Just copy the sense information without doing * checks. */ bcopy(&desc->custom_sense, &io->scsiio.sense_data, ctl_min(sizeof(desc->custom_sense), sizeof(io->scsiio.sense_data))); io->scsiio.scsi_status = SCSI_STATUS_CHECK_COND; io->scsiio.sense_len = SSD_FULL_SIZE; io->io_hdr.status = CTL_SCSI_ERROR | CTL_AUTOSENSE; break; case CTL_LUN_INJ_NONE: default: /* * If this is an error injection type we don't know * about, clear the continuous flag (if it is set) * so it will get deleted below. */ desc->lun_error &= ~CTL_LUN_INJ_CONTINUOUS; break; } /* * By default, each error injection action is a one-shot */ if (desc->lun_error & CTL_LUN_INJ_CONTINUOUS) continue; STAILQ_REMOVE(&lun->error_list, desc, ctl_error_desc, links); free(desc, M_CTL); } } #ifdef CTL_IO_DELAY static void ctl_datamove_timer_wakeup(void *arg) { union ctl_io *io; io = (union ctl_io *)arg; ctl_datamove(io); } #endif /* CTL_IO_DELAY */ void ctl_datamove(union ctl_io *io) { void (*fe_datamove)(union ctl_io *io); mtx_assert(&control_softc->ctl_lock, MA_NOTOWNED); CTL_DEBUG_PRINT(("ctl_datamove\n")); #ifdef CTL_TIME_IO if ((time_uptime - io->io_hdr.start_time) > ctl_time_io_secs) { char str[256]; char path_str[64]; struct sbuf sb; ctl_scsi_path_string(io, path_str, sizeof(path_str)); sbuf_new(&sb, str, sizeof(str), SBUF_FIXEDLEN); sbuf_cat(&sb, path_str); switch (io->io_hdr.io_type) { case CTL_IO_SCSI: ctl_scsi_command_string(&io->scsiio, NULL, &sb); sbuf_printf(&sb, "\n"); sbuf_cat(&sb, path_str); sbuf_printf(&sb, "Tag: 0x%04x, type %d\n", io->scsiio.tag_num, io->scsiio.tag_type); break; case CTL_IO_TASK: sbuf_printf(&sb, "Task I/O type: %d, Tag: 0x%04x, " "Tag Type: %d\n", io->taskio.task_action, io->taskio.tag_num, io->taskio.tag_type); break; default: printf("Invalid CTL I/O type %d\n", io->io_hdr.io_type); panic("Invalid CTL I/O type %d\n", io->io_hdr.io_type); break; } sbuf_cat(&sb, path_str); sbuf_printf(&sb, "ctl_datamove: %jd seconds\n", (intmax_t)time_uptime - io->io_hdr.start_time); sbuf_finish(&sb); printf("%s", sbuf_data(&sb)); } #endif /* CTL_TIME_IO */ #ifdef CTL_IO_DELAY if (io->io_hdr.flags & CTL_FLAG_DELAY_DONE) { struct ctl_lun *lun; lun =(struct ctl_lun *)io->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; io->io_hdr.flags &= ~CTL_FLAG_DELAY_DONE; } else { struct ctl_lun *lun; lun =(struct ctl_lun *)io->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; if ((lun != NULL) && (lun->delay_info.datamove_delay > 0)) { struct callout *callout; callout = (struct callout *)&io->io_hdr.timer_bytes; callout_init(callout, /*mpsafe*/ 1); io->io_hdr.flags |= CTL_FLAG_DELAY_DONE; callout_reset(callout, lun->delay_info.datamove_delay * hz, ctl_datamove_timer_wakeup, io); if (lun->delay_info.datamove_type == CTL_DELAY_TYPE_ONESHOT) lun->delay_info.datamove_delay = 0; return; } } #endif /* * This command has been aborted. Set the port status, so we fail * the data move. */ if (io->io_hdr.flags & CTL_FLAG_ABORT) { printf("ctl_datamove: tag 0x%04x on (%ju:%d:%ju:%d) aborted\n", io->scsiio.tag_num,(uintmax_t)io->io_hdr.nexus.initid.id, io->io_hdr.nexus.targ_port, (uintmax_t)io->io_hdr.nexus.targ_target.id, io->io_hdr.nexus.targ_lun); io->io_hdr.port_status = 31337; /* * Note that the backend, in this case, will get the * callback in its context. In other cases it may get * called in the frontend's interrupt thread context. */ io->scsiio.be_move_done(io); return; } /* * If we're in XFER mode and this I/O is from the other shelf * controller, we need to send the DMA to the other side to * actually transfer the data to/from the host. In serialize only * mode the transfer happens below CTL and ctl_datamove() is only * called on the machine that originally received the I/O. */ if ((control_softc->ha_mode == CTL_HA_MODE_XFER) && (io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC)) { union ctl_ha_msg msg; uint32_t sg_entries_sent; int do_sg_copy; int i; memset(&msg, 0, sizeof(msg)); msg.hdr.msg_type = CTL_MSG_DATAMOVE; msg.hdr.original_sc = io->io_hdr.original_sc; msg.hdr.serializing_sc = io; msg.hdr.nexus = io->io_hdr.nexus; msg.dt.flags = io->io_hdr.flags; /* * We convert everything into a S/G list here. We can't * pass by reference, only by value between controllers. * So we can't pass a pointer to the S/G list, only as many * S/G entries as we can fit in here. If it's possible for * us to get more than CTL_HA_MAX_SG_ENTRIES S/G entries, * then we need to break this up into multiple transfers. */ if (io->scsiio.kern_sg_entries == 0) { msg.dt.kern_sg_entries = 1; /* * If this is in cached memory, flush the cache * before we send the DMA request to the other * controller. We want to do this in either the * read or the write case. The read case is * straightforward. In the write case, we want to * make sure nothing is in the local cache that * could overwrite the DMAed data. */ if ((io->io_hdr.flags & CTL_FLAG_NO_DATASYNC) == 0) { /* * XXX KDM use bus_dmamap_sync() here. */ } /* * Convert to a physical address if this is a * virtual address. */ if (io->io_hdr.flags & CTL_FLAG_BUS_ADDR) { msg.dt.sg_list[0].addr = io->scsiio.kern_data_ptr; } else { /* * XXX KDM use busdma here! */ #if 0 msg.dt.sg_list[0].addr = (void *) vtophys(io->scsiio.kern_data_ptr); #endif } msg.dt.sg_list[0].len = io->scsiio.kern_data_len; do_sg_copy = 0; } else { struct ctl_sg_entry *sgl; do_sg_copy = 1; msg.dt.kern_sg_entries = io->scsiio.kern_sg_entries; sgl = (struct ctl_sg_entry *)io->scsiio.kern_data_ptr; if ((io->io_hdr.flags & CTL_FLAG_NO_DATASYNC) == 0) { /* * XXX KDM use bus_dmamap_sync() here. */ } } msg.dt.kern_data_len = io->scsiio.kern_data_len; msg.dt.kern_total_len = io->scsiio.kern_total_len; msg.dt.kern_data_resid = io->scsiio.kern_data_resid; msg.dt.kern_rel_offset = io->scsiio.kern_rel_offset; msg.dt.sg_sequence = 0; /* * Loop until we've sent all of the S/G entries. On the * other end, we'll recompose these S/G entries into one * contiguous list before passing it to the */ for (sg_entries_sent = 0; sg_entries_sent < msg.dt.kern_sg_entries; msg.dt.sg_sequence++) { msg.dt.cur_sg_entries = ctl_min((sizeof(msg.dt.sg_list)/ sizeof(msg.dt.sg_list[0])), msg.dt.kern_sg_entries - sg_entries_sent); if (do_sg_copy != 0) { struct ctl_sg_entry *sgl; int j; sgl = (struct ctl_sg_entry *) io->scsiio.kern_data_ptr; /* * If this is in cached memory, flush the cache * before we send the DMA request to the other * controller. We want to do this in either * the * read or the write case. The read * case is straightforward. In the write * case, we want to make sure nothing is * in the local cache that could overwrite * the DMAed data. */ for (i = sg_entries_sent, j = 0; i < msg.dt.cur_sg_entries; i++, j++) { if ((io->io_hdr.flags & CTL_FLAG_NO_DATASYNC) == 0) { /* * XXX KDM use bus_dmamap_sync() */ } if ((io->io_hdr.flags & CTL_FLAG_BUS_ADDR) == 0) { /* * XXX KDM use busdma. */ #if 0 msg.dt.sg_list[j].addr =(void *) vtophys(sgl[i].addr); #endif } else { msg.dt.sg_list[j].addr = sgl[i].addr; } msg.dt.sg_list[j].len = sgl[i].len; } } sg_entries_sent += msg.dt.cur_sg_entries; if (sg_entries_sent >= msg.dt.kern_sg_entries) msg.dt.sg_last = 1; else msg.dt.sg_last = 0; /* * XXX KDM drop and reacquire the lock here? */ if (ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg, sizeof(msg), 0) > CTL_HA_STATUS_SUCCESS) { /* * XXX do something here. */ } msg.dt.sent_sg_entries = sg_entries_sent; } io->io_hdr.flags &= ~CTL_FLAG_IO_ACTIVE; if (io->io_hdr.flags & CTL_FLAG_FAILOVER) ctl_failover_io(io, /*have_lock*/ 0); } else { /* * Lookup the fe_datamove() function for this particular * front end. */ fe_datamove = control_softc->ctl_ports[ctl_port_idx(io->io_hdr.nexus.targ_port)]->fe_datamove; fe_datamove(io); } } static void ctl_send_datamove_done(union ctl_io *io, int have_lock) { union ctl_ha_msg msg; int isc_status; memset(&msg, 0, sizeof(msg)); msg.hdr.msg_type = CTL_MSG_DATAMOVE_DONE; msg.hdr.original_sc = io; msg.hdr.serializing_sc = io->io_hdr.serializing_sc; msg.hdr.nexus = io->io_hdr.nexus; msg.hdr.status = io->io_hdr.status; msg.scsi.tag_num = io->scsiio.tag_num; msg.scsi.tag_type = io->scsiio.tag_type; msg.scsi.scsi_status = io->scsiio.scsi_status; memcpy(&msg.scsi.sense_data, &io->scsiio.sense_data, sizeof(io->scsiio.sense_data)); msg.scsi.sense_len = io->scsiio.sense_len; msg.scsi.sense_residual = io->scsiio.sense_residual; msg.scsi.fetd_status = io->io_hdr.port_status; msg.scsi.residual = io->scsiio.residual; io->io_hdr.flags &= ~CTL_FLAG_IO_ACTIVE; if (io->io_hdr.flags & CTL_FLAG_FAILOVER) { ctl_failover_io(io, /*have_lock*/ have_lock); return; } isc_status = ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg, sizeof(msg), 0); if (isc_status > CTL_HA_STATUS_SUCCESS) { /* XXX do something if this fails */ } } /* * The DMA to the remote side is done, now we need to tell the other side * we're done so it can continue with its data movement. */ static void ctl_datamove_remote_write_cb(struct ctl_ha_dt_req *rq) { union ctl_io *io; io = rq->context; if (rq->ret != CTL_HA_STATUS_SUCCESS) { printf("%s: ISC DMA write failed with error %d", __func__, rq->ret); ctl_set_internal_failure(&io->scsiio, /*sks_valid*/ 1, /*retry_count*/ rq->ret); } ctl_dt_req_free(rq); /* * In this case, we had to malloc the memory locally. Free it. */ if ((io->io_hdr.flags & CTL_FLAG_AUTO_MIRROR) == 0) { int i; for (i = 0; i < io->scsiio.kern_sg_entries; i++) free(io->io_hdr.local_sglist[i].addr, M_CTL); } /* * The data is in local and remote memory, so now we need to send * status (good or back) back to the other side. */ ctl_send_datamove_done(io, /*have_lock*/ 0); } /* * We've moved the data from the host/controller into local memory. Now we * need to push it over to the remote controller's memory. */ static int ctl_datamove_remote_dm_write_cb(union ctl_io *io) { int retval; retval = 0; retval = ctl_datamove_remote_xfer(io, CTL_HA_DT_CMD_WRITE, ctl_datamove_remote_write_cb); return (retval); } static void ctl_datamove_remote_write(union ctl_io *io) { int retval; void (*fe_datamove)(union ctl_io *io); /* * - Get the data from the host/HBA into local memory. * - DMA memory from the local controller to the remote controller. * - Send status back to the remote controller. */ retval = ctl_datamove_remote_sgl_setup(io); if (retval != 0) return; /* Switch the pointer over so the FETD knows what to do */ io->scsiio.kern_data_ptr = (uint8_t *)io->io_hdr.local_sglist; /* * Use a custom move done callback, since we need to send completion * back to the other controller, not to the backend on this side. */ io->scsiio.be_move_done = ctl_datamove_remote_dm_write_cb; fe_datamove = control_softc->ctl_ports[ctl_port_idx(io->io_hdr.nexus.targ_port)]->fe_datamove; fe_datamove(io); return; } static int ctl_datamove_remote_dm_read_cb(union ctl_io *io) { #if 0 char str[256]; char path_str[64]; struct sbuf sb; #endif /* * In this case, we had to malloc the memory locally. Free it. */ if ((io->io_hdr.flags & CTL_FLAG_AUTO_MIRROR) == 0) { int i; for (i = 0; i < io->scsiio.kern_sg_entries; i++) free(io->io_hdr.local_sglist[i].addr, M_CTL); } #if 0 scsi_path_string(io, path_str, sizeof(path_str)); sbuf_new(&sb, str, sizeof(str), SBUF_FIXEDLEN); sbuf_cat(&sb, path_str); scsi_command_string(&io->scsiio, NULL, &sb); sbuf_printf(&sb, "\n"); sbuf_cat(&sb, path_str); sbuf_printf(&sb, "Tag: 0x%04x, type %d\n", io->scsiio.tag_num, io->scsiio.tag_type); sbuf_cat(&sb, path_str); sbuf_printf(&sb, "%s: flags %#x, status %#x\n", __func__, io->io_hdr.flags, io->io_hdr.status); sbuf_finish(&sb); printk("%s", sbuf_data(&sb)); #endif /* * The read is done, now we need to send status (good or bad) back * to the other side. */ ctl_send_datamove_done(io, /*have_lock*/ 0); return (0); } static void ctl_datamove_remote_read_cb(struct ctl_ha_dt_req *rq) { union ctl_io *io; void (*fe_datamove)(union ctl_io *io); io = rq->context; if (rq->ret != CTL_HA_STATUS_SUCCESS) { printf("%s: ISC DMA read failed with error %d", __func__, rq->ret); ctl_set_internal_failure(&io->scsiio, /*sks_valid*/ 1, /*retry_count*/ rq->ret); } ctl_dt_req_free(rq); /* Switch the pointer over so the FETD knows what to do */ io->scsiio.kern_data_ptr = (uint8_t *)io->io_hdr.local_sglist; /* * Use a custom move done callback, since we need to send completion * back to the other controller, not to the backend on this side. */ io->scsiio.be_move_done = ctl_datamove_remote_dm_read_cb; /* XXX KDM add checks like the ones in ctl_datamove? */ fe_datamove = control_softc->ctl_ports[ctl_port_idx(io->io_hdr.nexus.targ_port)]->fe_datamove; fe_datamove(io); } static int ctl_datamove_remote_sgl_setup(union ctl_io *io) { struct ctl_sg_entry *local_sglist, *remote_sglist; struct ctl_sg_entry *local_dma_sglist, *remote_dma_sglist; struct ctl_softc *softc; int retval; int i; retval = 0; softc = control_softc; local_sglist = io->io_hdr.local_sglist; local_dma_sglist = io->io_hdr.local_dma_sglist; remote_sglist = io->io_hdr.remote_sglist; remote_dma_sglist = io->io_hdr.remote_dma_sglist; if (io->io_hdr.flags & CTL_FLAG_AUTO_MIRROR) { for (i = 0; i < io->scsiio.kern_sg_entries; i++) { local_sglist[i].len = remote_sglist[i].len; /* * XXX Detect the situation where the RS-level I/O * redirector on the other side has already read the * data off of the AOR RS on this side, and * transferred it to remote (mirror) memory on the * other side. Since we already have the data in * memory here, we just need to use it. * * XXX KDM this can probably be removed once we * get the cache device code in and take the * current AOR implementation out. */ #ifdef NEEDTOPORT if ((remote_sglist[i].addr >= (void *)vtophys(softc->mirr->addr)) && (remote_sglist[i].addr < ((void *)vtophys(softc->mirr->addr) + CacheMirrorOffset))) { local_sglist[i].addr = remote_sglist[i].addr - CacheMirrorOffset; if ((io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_IN) io->io_hdr.flags |= CTL_FLAG_REDIR_DONE; } else { local_sglist[i].addr = remote_sglist[i].addr + CacheMirrorOffset; } #endif #if 0 printf("%s: local %p, remote %p, len %d\n", __func__, local_sglist[i].addr, remote_sglist[i].addr, local_sglist[i].len); #endif } } else { uint32_t len_to_go; /* * In this case, we don't have automatically allocated * memory for this I/O on this controller. This typically * happens with internal CTL I/O -- e.g. inquiry, mode * sense, etc. Anything coming from RAIDCore will have * a mirror area available. */ len_to_go = io->scsiio.kern_data_len; /* * Clear the no datasync flag, we have to use malloced * buffers. */ io->io_hdr.flags &= ~CTL_FLAG_NO_DATASYNC; /* * The difficult thing here is that the size of the various * S/G segments may be different than the size from the * remote controller. That'll make it harder when DMAing * the data back to the other side. */ for (i = 0; (i < sizeof(io->io_hdr.remote_sglist) / sizeof(io->io_hdr.remote_sglist[0])) && (len_to_go > 0); i++) { local_sglist[i].len = ctl_min(len_to_go, 131072); CTL_SIZE_8B(local_dma_sglist[i].len, local_sglist[i].len); local_sglist[i].addr = malloc(local_dma_sglist[i].len, M_CTL,M_WAITOK); local_dma_sglist[i].addr = local_sglist[i].addr; if (local_sglist[i].addr == NULL) { int j; printf("malloc failed for %zd bytes!", local_dma_sglist[i].len); for (j = 0; j < i; j++) { free(local_sglist[j].addr, M_CTL); } ctl_set_internal_failure(&io->scsiio, /*sks_valid*/ 1, /*retry_count*/ 4857); retval = 1; goto bailout_error; } /* XXX KDM do we need a sync here? */ len_to_go -= local_sglist[i].len; } /* * Reset the number of S/G entries accordingly. The * original number of S/G entries is available in * rem_sg_entries. */ io->scsiio.kern_sg_entries = i; #if 0 printf("%s: kern_sg_entries = %d\n", __func__, io->scsiio.kern_sg_entries); for (i = 0; i < io->scsiio.kern_sg_entries; i++) printf("%s: sg[%d] = %p, %d (DMA: %d)\n", __func__, i, local_sglist[i].addr, local_sglist[i].len, local_dma_sglist[i].len); #endif } return (retval); bailout_error: ctl_send_datamove_done(io, /*have_lock*/ 0); return (retval); } static int ctl_datamove_remote_xfer(union ctl_io *io, unsigned command, ctl_ha_dt_cb callback) { struct ctl_ha_dt_req *rq; struct ctl_sg_entry *remote_sglist, *local_sglist; struct ctl_sg_entry *remote_dma_sglist, *local_dma_sglist; uint32_t local_used, remote_used, total_used; int retval; int i, j; retval = 0; rq = ctl_dt_req_alloc(); /* * If we failed to allocate the request, and if the DMA didn't fail * anyway, set busy status. This is just a resource allocation * failure. */ if ((rq == NULL) && ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE)) ctl_set_busy(&io->scsiio); if ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE) { if (rq != NULL) ctl_dt_req_free(rq); /* * The data move failed. We need to return status back * to the other controller. No point in trying to DMA * data to the remote controller. */ ctl_send_datamove_done(io, /*have_lock*/ 0); retval = 1; goto bailout; } local_sglist = io->io_hdr.local_sglist; local_dma_sglist = io->io_hdr.local_dma_sglist; remote_sglist = io->io_hdr.remote_sglist; remote_dma_sglist = io->io_hdr.remote_dma_sglist; local_used = 0; remote_used = 0; total_used = 0; if (io->io_hdr.flags & CTL_FLAG_REDIR_DONE) { rq->ret = CTL_HA_STATUS_SUCCESS; rq->context = io; callback(rq); goto bailout; } /* * Pull/push the data over the wire from/to the other controller. * This takes into account the possibility that the local and * remote sglists may not be identical in terms of the size of * the elements and the number of elements. * * One fundamental assumption here is that the length allocated for * both the local and remote sglists is identical. Otherwise, we've * essentially got a coding error of some sort. */ for (i = 0, j = 0; total_used < io->scsiio.kern_data_len; ) { int isc_ret; uint32_t cur_len, dma_length; uint8_t *tmp_ptr; rq->id = CTL_HA_DATA_CTL; rq->command = command; rq->context = io; /* * Both pointers should be aligned. But it is possible * that the allocation length is not. They should both * also have enough slack left over at the end, though, * to round up to the next 8 byte boundary. */ cur_len = ctl_min(local_sglist[i].len - local_used, remote_sglist[j].len - remote_used); /* * In this case, we have a size issue and need to decrease * the size, except in the case where we actually have less * than 8 bytes left. In that case, we need to increase * the DMA length to get the last bit. */ if ((cur_len & 0x7) != 0) { if (cur_len > 0x7) { cur_len = cur_len - (cur_len & 0x7); dma_length = cur_len; } else { CTL_SIZE_8B(dma_length, cur_len); } } else dma_length = cur_len; /* * If we had to allocate memory for this I/O, instead of using * the non-cached mirror memory, we'll need to flush the cache * before trying to DMA to the other controller. * * We could end up doing this multiple times for the same * segment if we have a larger local segment than remote * segment. That shouldn't be an issue. */ if ((io->io_hdr.flags & CTL_FLAG_NO_DATASYNC) == 0) { /* * XXX KDM use bus_dmamap_sync() here. */ } rq->size = dma_length; tmp_ptr = (uint8_t *)local_sglist[i].addr; tmp_ptr += local_used; /* Use physical addresses when talking to ISC hardware */ if ((io->io_hdr.flags & CTL_FLAG_BUS_ADDR) == 0) { /* XXX KDM use busdma */ #if 0 rq->local = vtophys(tmp_ptr); #endif } else rq->local = tmp_ptr; tmp_ptr = (uint8_t *)remote_sglist[j].addr; tmp_ptr += remote_used; rq->remote = tmp_ptr; rq->callback = NULL; local_used += cur_len; if (local_used >= local_sglist[i].len) { i++; local_used = 0; } remote_used += cur_len; if (remote_used >= remote_sglist[j].len) { j++; remote_used = 0; } total_used += cur_len; if (total_used >= io->scsiio.kern_data_len) rq->callback = callback; if ((rq->size & 0x7) != 0) { printf("%s: warning: size %d is not on 8b boundary\n", __func__, rq->size); } if (((uintptr_t)rq->local & 0x7) != 0) { printf("%s: warning: local %p not on 8b boundary\n", __func__, rq->local); } if (((uintptr_t)rq->remote & 0x7) != 0) { printf("%s: warning: remote %p not on 8b boundary\n", __func__, rq->local); } #if 0 printf("%s: %s: local %#x remote %#x size %d\n", __func__, (command == CTL_HA_DT_CMD_WRITE) ? "WRITE" : "READ", rq->local, rq->remote, rq->size); #endif isc_ret = ctl_dt_single(rq); if (isc_ret == CTL_HA_STATUS_WAIT) continue; if (isc_ret == CTL_HA_STATUS_DISCONNECT) { rq->ret = CTL_HA_STATUS_SUCCESS; } else { rq->ret = isc_ret; } callback(rq); goto bailout; } bailout: return (retval); } static void ctl_datamove_remote_read(union ctl_io *io) { int retval; int i; /* * This will send an error to the other controller in the case of a * failure. */ retval = ctl_datamove_remote_sgl_setup(io); if (retval != 0) return; retval = ctl_datamove_remote_xfer(io, CTL_HA_DT_CMD_READ, ctl_datamove_remote_read_cb); if ((retval != 0) && ((io->io_hdr.flags & CTL_FLAG_AUTO_MIRROR) == 0)) { /* * Make sure we free memory if there was an error.. The * ctl_datamove_remote_xfer() function will send the * datamove done message, or call the callback with an * error if there is a problem. */ for (i = 0; i < io->scsiio.kern_sg_entries; i++) free(io->io_hdr.local_sglist[i].addr, M_CTL); } return; } /* * Process a datamove request from the other controller. This is used for * XFER mode only, not SER_ONLY mode. For writes, we DMA into local memory * first. Once that is complete, the data gets DMAed into the remote * controller's memory. For reads, we DMA from the remote controller's * memory into our memory first, and then move it out to the FETD. */ static void ctl_datamove_remote(union ctl_io *io) { struct ctl_softc *softc; softc = control_softc; mtx_assert(&softc->ctl_lock, MA_NOTOWNED); /* * Note that we look for an aborted I/O here, but don't do some of * the other checks that ctl_datamove() normally does. * We don't need to run the datamove delay code, since that should * have been done if need be on the other controller. */ if (io->io_hdr.flags & CTL_FLAG_ABORT) { printf("%s: tag 0x%04x on (%d:%d:%d:%d) aborted\n", __func__, io->scsiio.tag_num, io->io_hdr.nexus.initid.id, io->io_hdr.nexus.targ_port, io->io_hdr.nexus.targ_target.id, io->io_hdr.nexus.targ_lun); io->io_hdr.port_status = 31338; ctl_send_datamove_done(io, /*have_lock*/ 0); return; } if ((io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_OUT) { ctl_datamove_remote_write(io); } else if ((io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_IN){ ctl_datamove_remote_read(io); } else { union ctl_ha_msg msg; struct scsi_sense_data *sense; uint8_t sks[3]; int retry_count; memset(&msg, 0, sizeof(msg)); msg.hdr.msg_type = CTL_MSG_BAD_JUJU; msg.hdr.status = CTL_SCSI_ERROR; msg.scsi.scsi_status = SCSI_STATUS_CHECK_COND; retry_count = 4243; sense = &msg.scsi.sense_data; sks[0] = SSD_SCS_VALID; sks[1] = (retry_count >> 8) & 0xff; sks[2] = retry_count & 0xff; /* "Internal target failure" */ scsi_set_sense_data(sense, /*sense_format*/ SSD_TYPE_NONE, /*current_error*/ 1, /*sense_key*/ SSD_KEY_HARDWARE_ERROR, /*asc*/ 0x44, /*ascq*/ 0x00, /*type*/ SSD_ELEM_SKS, /*size*/ sizeof(sks), /*data*/ sks, SSD_ELEM_NONE); io->io_hdr.flags &= ~CTL_FLAG_IO_ACTIVE; if (io->io_hdr.flags & CTL_FLAG_FAILOVER) { ctl_failover_io(io, /*have_lock*/ 1); return; } if (ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg, sizeof(msg), 0) > CTL_HA_STATUS_SUCCESS) { /* XXX KDM what to do if this fails? */ } return; } } static int ctl_process_done(union ctl_io *io) { struct ctl_lun *lun; struct ctl_softc *ctl_softc; void (*fe_done)(union ctl_io *io); uint32_t targ_port = ctl_port_idx(io->io_hdr.nexus.targ_port); CTL_DEBUG_PRINT(("ctl_process_done\n")); fe_done = control_softc->ctl_ports[targ_port]->fe_done; #ifdef CTL_TIME_IO if ((time_uptime - io->io_hdr.start_time) > ctl_time_io_secs) { char str[256]; char path_str[64]; struct sbuf sb; ctl_scsi_path_string(io, path_str, sizeof(path_str)); sbuf_new(&sb, str, sizeof(str), SBUF_FIXEDLEN); sbuf_cat(&sb, path_str); switch (io->io_hdr.io_type) { case CTL_IO_SCSI: ctl_scsi_command_string(&io->scsiio, NULL, &sb); sbuf_printf(&sb, "\n"); sbuf_cat(&sb, path_str); sbuf_printf(&sb, "Tag: 0x%04x, type %d\n", io->scsiio.tag_num, io->scsiio.tag_type); break; case CTL_IO_TASK: sbuf_printf(&sb, "Task I/O type: %d, Tag: 0x%04x, " "Tag Type: %d\n", io->taskio.task_action, io->taskio.tag_num, io->taskio.tag_type); break; default: printf("Invalid CTL I/O type %d\n", io->io_hdr.io_type); panic("Invalid CTL I/O type %d\n", io->io_hdr.io_type); break; } sbuf_cat(&sb, path_str); sbuf_printf(&sb, "ctl_process_done: %jd seconds\n", (intmax_t)time_uptime - io->io_hdr.start_time); sbuf_finish(&sb); printf("%s", sbuf_data(&sb)); } #endif /* CTL_TIME_IO */ switch (io->io_hdr.io_type) { case CTL_IO_SCSI: break; case CTL_IO_TASK: if (bootverbose || verbose > 0) ctl_io_error_print(io, NULL); if (io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC) ctl_free_io(io); else fe_done(io); return (CTL_RETVAL_COMPLETE); break; default: printf("ctl_process_done: invalid io type %d\n", io->io_hdr.io_type); panic("ctl_process_done: invalid io type %d\n", io->io_hdr.io_type); break; /* NOTREACHED */ } lun = (struct ctl_lun *)io->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; if (lun == NULL) { CTL_DEBUG_PRINT(("NULL LUN for lun %d\n", io->io_hdr.nexus.targ_mapped_lun)); fe_done(io); goto bailout; } ctl_softc = lun->ctl_softc; mtx_lock(&lun->lun_lock); /* * Check to see if we have any errors to inject here. We only * inject errors for commands that don't already have errors set. */ if ((STAILQ_FIRST(&lun->error_list) != NULL) && ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS)) ctl_inject_error(lun, io); /* * XXX KDM how do we treat commands that aren't completed * successfully? * * XXX KDM should we also track I/O latency? */ if ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS && io->io_hdr.io_type == CTL_IO_SCSI) { #ifdef CTL_TIME_IO struct bintime cur_bt; #endif int type; if ((io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_IN) type = CTL_STATS_READ; else if ((io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_OUT) type = CTL_STATS_WRITE; else type = CTL_STATS_NO_IO; lun->stats.ports[targ_port].bytes[type] += io->scsiio.kern_total_len; lun->stats.ports[targ_port].operations[type]++; #ifdef CTL_TIME_IO bintime_add(&lun->stats.ports[targ_port].dma_time[type], &io->io_hdr.dma_bt); lun->stats.ports[targ_port].num_dmas[type] += io->io_hdr.num_dmas; getbintime(&cur_bt); bintime_sub(&cur_bt, &io->io_hdr.start_bt); bintime_add(&lun->stats.ports[targ_port].time[type], &cur_bt); #endif } /* * Remove this from the OOA queue. */ TAILQ_REMOVE(&lun->ooa_queue, &io->io_hdr, ooa_links); /* * Run through the blocked queue on this LUN and see if anything * has become unblocked, now that this transaction is done. */ ctl_check_blocked(lun); /* * If the LUN has been invalidated, free it if there is nothing * left on its OOA queue. */ if ((lun->flags & CTL_LUN_INVALID) && TAILQ_EMPTY(&lun->ooa_queue)) { mtx_unlock(&lun->lun_lock); mtx_lock(&ctl_softc->ctl_lock); ctl_free_lun(lun); mtx_unlock(&ctl_softc->ctl_lock); } else mtx_unlock(&lun->lun_lock); /* * If this command has been aborted, make sure we set the status * properly. The FETD is responsible for freeing the I/O and doing * whatever it needs to do to clean up its state. */ if (io->io_hdr.flags & CTL_FLAG_ABORT) ctl_set_task_aborted(&io->scsiio); /* * We print out status for every task management command. For SCSI * commands, we filter out any unit attention errors; they happen * on every boot, and would clutter up the log. Note: task * management commands aren't printed here, they are printed above, * since they should never even make it down here. */ switch (io->io_hdr.io_type) { case CTL_IO_SCSI: { int error_code, sense_key, asc, ascq; sense_key = 0; if (((io->io_hdr.status & CTL_STATUS_MASK) == CTL_SCSI_ERROR) && (io->scsiio.scsi_status == SCSI_STATUS_CHECK_COND)) { /* * Since this is just for printing, no need to * show errors here. */ scsi_extract_sense_len(&io->scsiio.sense_data, io->scsiio.sense_len, &error_code, &sense_key, &asc, &ascq, /*show_errors*/ 0); } if (((io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS) && (((io->io_hdr.status & CTL_STATUS_MASK) != CTL_SCSI_ERROR) || (io->scsiio.scsi_status != SCSI_STATUS_CHECK_COND) || (sense_key != SSD_KEY_UNIT_ATTENTION))) { if ((time_uptime - ctl_softc->last_print_jiffies) <= 0){ ctl_softc->skipped_prints++; } else { uint32_t skipped_prints; skipped_prints = ctl_softc->skipped_prints; ctl_softc->skipped_prints = 0; ctl_softc->last_print_jiffies = time_uptime; if (skipped_prints > 0) { #ifdef NEEDTOPORT csevent_log(CSC_CTL | CSC_SHELF_SW | CTL_ERROR_REPORT, csevent_LogType_Trace, csevent_Severity_Information, csevent_AlertLevel_Green, csevent_FRU_Firmware, csevent_FRU_Unknown, "High CTL error volume, %d prints " "skipped", skipped_prints); #endif } if (bootverbose || verbose > 0) ctl_io_error_print(io, NULL); } } break; } case CTL_IO_TASK: if (bootverbose || verbose > 0) ctl_io_error_print(io, NULL); break; default: break; } /* * Tell the FETD or the other shelf controller we're done with this * command. Note that only SCSI commands get to this point. Task * management commands are completed above. * * We only send status to the other controller if we're in XFER * mode. In SER_ONLY mode, the I/O is done on the controller that * received the I/O (from CTL's perspective), and so the status is * generated there. * * XXX KDM if we hold the lock here, we could cause a deadlock * if the frontend comes back in in this context to queue * something. */ if ((ctl_softc->ha_mode == CTL_HA_MODE_XFER) && (io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC)) { union ctl_ha_msg msg; memset(&msg, 0, sizeof(msg)); msg.hdr.msg_type = CTL_MSG_FINISH_IO; msg.hdr.original_sc = io->io_hdr.original_sc; msg.hdr.nexus = io->io_hdr.nexus; msg.hdr.status = io->io_hdr.status; msg.scsi.scsi_status = io->scsiio.scsi_status; msg.scsi.tag_num = io->scsiio.tag_num; msg.scsi.tag_type = io->scsiio.tag_type; msg.scsi.sense_len = io->scsiio.sense_len; msg.scsi.sense_residual = io->scsiio.sense_residual; msg.scsi.residual = io->scsiio.residual; memcpy(&msg.scsi.sense_data, &io->scsiio.sense_data, sizeof(io->scsiio.sense_data)); /* * We copy this whether or not this is an I/O-related * command. Otherwise, we'd have to go and check to see * whether it's a read/write command, and it really isn't * worth it. */ memcpy(&msg.scsi.lbalen, &io->io_hdr.ctl_private[CTL_PRIV_LBA_LEN].bytes, sizeof(msg.scsi.lbalen)); if (ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg, sizeof(msg), 0) > CTL_HA_STATUS_SUCCESS) { /* XXX do something here */ } ctl_free_io(io); } else fe_done(io); bailout: return (CTL_RETVAL_COMPLETE); } #ifdef CTL_WITH_CA /* * Front end should call this if it doesn't do autosense. When the request * sense comes back in from the initiator, we'll dequeue this and send it. */ int ctl_queue_sense(union ctl_io *io) { struct ctl_lun *lun; struct ctl_softc *ctl_softc; uint32_t initidx, targ_lun; ctl_softc = control_softc; CTL_DEBUG_PRINT(("ctl_queue_sense\n")); /* * LUN lookup will likely move to the ctl_work_thread() once we * have our new queueing infrastructure (that doesn't put things on * a per-LUN queue initially). That is so that we can handle * things like an INQUIRY to a LUN that we don't have enabled. We * can't deal with that right now. */ mtx_lock(&ctl_softc->ctl_lock); /* * If we don't have a LUN for this, just toss the sense * information. */ targ_lun = io->io_hdr.nexus.targ_lun; targ_lun = ctl_map_lun(io->io_hdr.nexus.targ_port, targ_lun); if ((targ_lun < CTL_MAX_LUNS) && (ctl_softc->ctl_luns[targ_lun] != NULL)) lun = ctl_softc->ctl_luns[targ_lun]; else goto bailout; initidx = ctl_get_initindex(&io->io_hdr.nexus); mtx_lock(&lun->lun_lock); /* * Already have CA set for this LUN...toss the sense information. */ if (ctl_is_set(lun->have_ca, initidx)) { mtx_unlock(&lun->lun_lock); goto bailout; } memcpy(&lun->pending_sense[initidx], &io->scsiio.sense_data, ctl_min(sizeof(lun->pending_sense[initidx]), sizeof(io->scsiio.sense_data))); ctl_set_mask(lun->have_ca, initidx); mtx_unlock(&lun->lun_lock); bailout: mtx_unlock(&ctl_softc->ctl_lock); ctl_free_io(io); return (CTL_RETVAL_COMPLETE); } #endif /* * Primary command inlet from frontend ports. All SCSI and task I/O * requests must go through this function. */ int ctl_queue(union ctl_io *io) { struct ctl_softc *ctl_softc; CTL_DEBUG_PRINT(("ctl_queue cdb[0]=%02X\n", io->scsiio.cdb[0])); ctl_softc = control_softc; #ifdef CTL_TIME_IO io->io_hdr.start_time = time_uptime; getbintime(&io->io_hdr.start_bt); #endif /* CTL_TIME_IO */ /* Map FE-specific LUN ID into global one. */ io->io_hdr.nexus.targ_mapped_lun = ctl_map_lun(io->io_hdr.nexus.targ_port, io->io_hdr.nexus.targ_lun); switch (io->io_hdr.io_type) { case CTL_IO_SCSI: case CTL_IO_TASK: ctl_enqueue_incoming(io); break; default: printf("ctl_queue: unknown I/O type %d\n", io->io_hdr.io_type); return (EINVAL); } return (CTL_RETVAL_COMPLETE); } #ifdef CTL_IO_DELAY static void ctl_done_timer_wakeup(void *arg) { union ctl_io *io; io = (union ctl_io *)arg; ctl_done(io); } #endif /* CTL_IO_DELAY */ void ctl_done(union ctl_io *io) { struct ctl_softc *ctl_softc; ctl_softc = control_softc; /* * Enable this to catch duplicate completion issues. */ #if 0 if (io->io_hdr.flags & CTL_FLAG_ALREADY_DONE) { printf("%s: type %d msg %d cdb %x iptl: " "%d:%d:%d:%d tag 0x%04x " "flag %#x status %x\n", __func__, io->io_hdr.io_type, io->io_hdr.msg_type, io->scsiio.cdb[0], io->io_hdr.nexus.initid.id, io->io_hdr.nexus.targ_port, io->io_hdr.nexus.targ_target.id, io->io_hdr.nexus.targ_lun, (io->io_hdr.io_type == CTL_IO_TASK) ? io->taskio.tag_num : io->scsiio.tag_num, io->io_hdr.flags, io->io_hdr.status); } else io->io_hdr.flags |= CTL_FLAG_ALREADY_DONE; #endif /* * This is an internal copy of an I/O, and should not go through * the normal done processing logic. */ if (io->io_hdr.flags & CTL_FLAG_INT_COPY) return; /* * We need to send a msg to the serializing shelf to finish the IO * as well. We don't send a finish message to the other shelf if * this is a task management command. Task management commands * aren't serialized in the OOA queue, but rather just executed on * both shelf controllers for commands that originated on that * controller. */ if ((io->io_hdr.flags & CTL_FLAG_SENT_2OTHER_SC) && (io->io_hdr.io_type != CTL_IO_TASK)) { union ctl_ha_msg msg_io; msg_io.hdr.msg_type = CTL_MSG_FINISH_IO; msg_io.hdr.serializing_sc = io->io_hdr.serializing_sc; if (ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_io, sizeof(msg_io), 0 ) != CTL_HA_STATUS_SUCCESS) { } /* continue on to finish IO */ } #ifdef CTL_IO_DELAY if (io->io_hdr.flags & CTL_FLAG_DELAY_DONE) { struct ctl_lun *lun; lun =(struct ctl_lun *)io->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; io->io_hdr.flags &= ~CTL_FLAG_DELAY_DONE; } else { struct ctl_lun *lun; lun =(struct ctl_lun *)io->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; if ((lun != NULL) && (lun->delay_info.done_delay > 0)) { struct callout *callout; callout = (struct callout *)&io->io_hdr.timer_bytes; callout_init(callout, /*mpsafe*/ 1); io->io_hdr.flags |= CTL_FLAG_DELAY_DONE; callout_reset(callout, lun->delay_info.done_delay * hz, ctl_done_timer_wakeup, io); if (lun->delay_info.done_type == CTL_DELAY_TYPE_ONESHOT) lun->delay_info.done_delay = 0; return; } } #endif /* CTL_IO_DELAY */ ctl_enqueue_done(io); } int ctl_isc(struct ctl_scsiio *ctsio) { struct ctl_lun *lun; int retval; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; CTL_DEBUG_PRINT(("ctl_isc: command: %02x\n", ctsio->cdb[0])); CTL_DEBUG_PRINT(("ctl_isc: calling data_submit()\n")); retval = lun->backend->data_submit((union ctl_io *)ctsio); return (retval); } static void ctl_work_thread(void *arg) { struct ctl_thread *thr = (struct ctl_thread *)arg; struct ctl_softc *softc = thr->ctl_softc; union ctl_io *io; int retval; CTL_DEBUG_PRINT(("ctl_work_thread starting\n")); for (;;) { retval = 0; /* * We handle the queues in this order: * - ISC * - done queue (to free up resources, unblock other commands) * - RtR queue * - incoming queue * * If those queues are empty, we break out of the loop and * go to sleep. */ mtx_lock(&thr->queue_lock); io = (union ctl_io *)STAILQ_FIRST(&thr->isc_queue); if (io != NULL) { STAILQ_REMOVE_HEAD(&thr->isc_queue, links); mtx_unlock(&thr->queue_lock); ctl_handle_isc(io); continue; } io = (union ctl_io *)STAILQ_FIRST(&thr->done_queue); if (io != NULL) { STAILQ_REMOVE_HEAD(&thr->done_queue, links); /* clear any blocked commands, call fe_done */ mtx_unlock(&thr->queue_lock); retval = ctl_process_done(io); continue; } io = (union ctl_io *)STAILQ_FIRST(&thr->incoming_queue); if (io != NULL) { STAILQ_REMOVE_HEAD(&thr->incoming_queue, links); mtx_unlock(&thr->queue_lock); if (io->io_hdr.io_type == CTL_IO_TASK) ctl_run_task(io); else ctl_scsiio_precheck(softc, &io->scsiio); continue; } if (!ctl_pause_rtr) { io = (union ctl_io *)STAILQ_FIRST(&thr->rtr_queue); if (io != NULL) { STAILQ_REMOVE_HEAD(&thr->rtr_queue, links); mtx_unlock(&thr->queue_lock); retval = ctl_scsiio(&io->scsiio); if (retval != CTL_RETVAL_COMPLETE) CTL_DEBUG_PRINT(("ctl_scsiio failed\n")); continue; } } /* Sleep until we have something to do. */ mtx_sleep(thr, &thr->queue_lock, PDROP | PRIBIO, "-", 0); } } static void ctl_lun_thread(void *arg) { struct ctl_softc *softc = (struct ctl_softc *)arg; struct ctl_be_lun *be_lun; int retval; CTL_DEBUG_PRINT(("ctl_lun_thread starting\n")); for (;;) { retval = 0; mtx_lock(&softc->ctl_lock); be_lun = STAILQ_FIRST(&softc->pending_lun_queue); if (be_lun != NULL) { STAILQ_REMOVE_HEAD(&softc->pending_lun_queue, links); mtx_unlock(&softc->ctl_lock); ctl_create_lun(be_lun); continue; } /* Sleep until we have something to do. */ mtx_sleep(&softc->pending_lun_queue, &softc->ctl_lock, PDROP | PRIBIO, "-", 0); } } static void ctl_enqueue_incoming(union ctl_io *io) { struct ctl_softc *softc = control_softc; struct ctl_thread *thr; u_int idx; idx = (io->io_hdr.nexus.targ_port * 127 + io->io_hdr.nexus.initid.id) % worker_threads; thr = &softc->threads[idx]; mtx_lock(&thr->queue_lock); STAILQ_INSERT_TAIL(&thr->incoming_queue, &io->io_hdr, links); mtx_unlock(&thr->queue_lock); wakeup(thr); } static void ctl_enqueue_rtr(union ctl_io *io) { struct ctl_softc *softc = control_softc; struct ctl_thread *thr; thr = &softc->threads[io->io_hdr.nexus.targ_mapped_lun % worker_threads]; mtx_lock(&thr->queue_lock); STAILQ_INSERT_TAIL(&thr->rtr_queue, &io->io_hdr, links); mtx_unlock(&thr->queue_lock); wakeup(thr); } static void ctl_enqueue_done(union ctl_io *io) { struct ctl_softc *softc = control_softc; struct ctl_thread *thr; thr = &softc->threads[io->io_hdr.nexus.targ_mapped_lun % worker_threads]; mtx_lock(&thr->queue_lock); STAILQ_INSERT_TAIL(&thr->done_queue, &io->io_hdr, links); mtx_unlock(&thr->queue_lock); wakeup(thr); } static void ctl_enqueue_isc(union ctl_io *io) { struct ctl_softc *softc = control_softc; struct ctl_thread *thr; thr = &softc->threads[io->io_hdr.nexus.targ_mapped_lun % worker_threads]; mtx_lock(&thr->queue_lock); STAILQ_INSERT_TAIL(&thr->isc_queue, &io->io_hdr, links); mtx_unlock(&thr->queue_lock); wakeup(thr); } /* Initialization and failover */ void ctl_init_isc_msg(void) { printf("CTL: Still calling this thing\n"); } /* * Init component * Initializes component into configuration defined by bootMode * (see hasc-sv.c) * returns hasc_Status: * OK * ERROR - fatal error */ static ctl_ha_comp_status ctl_isc_init(struct ctl_ha_component *c) { ctl_ha_comp_status ret = CTL_HA_COMP_STATUS_OK; c->status = ret; return ret; } /* Start component * Starts component in state requested. If component starts successfully, * it must set its own state to the requestrd state * When requested state is HASC_STATE_HA, the component may refine it * by adding _SLAVE or _MASTER flags. * Currently allowed state transitions are: * UNKNOWN->HA - initial startup * UNKNOWN->SINGLE - initial startup when no parter detected * HA->SINGLE - failover * returns ctl_ha_comp_status: * OK - component successfully started in requested state * FAILED - could not start the requested state, failover may * be possible * ERROR - fatal error detected, no future startup possible */ static ctl_ha_comp_status ctl_isc_start(struct ctl_ha_component *c, ctl_ha_state state) { ctl_ha_comp_status ret = CTL_HA_COMP_STATUS_OK; printf("%s: go\n", __func__); // UNKNOWN->HA or UNKNOWN->SINGLE (bootstrap) if (c->state == CTL_HA_STATE_UNKNOWN ) { ctl_is_single = 0; if (ctl_ha_msg_create(CTL_HA_CHAN_CTL, ctl_isc_event_handler) != CTL_HA_STATUS_SUCCESS) { printf("ctl_isc_start: ctl_ha_msg_create failed.\n"); ret = CTL_HA_COMP_STATUS_ERROR; } } else if (CTL_HA_STATE_IS_HA(c->state) && CTL_HA_STATE_IS_SINGLE(state)){ // HA->SINGLE transition ctl_failover(); ctl_is_single = 1; } else { printf("ctl_isc_start:Invalid state transition %X->%X\n", c->state, state); ret = CTL_HA_COMP_STATUS_ERROR; } if (CTL_HA_STATE_IS_SINGLE(state)) ctl_is_single = 1; c->state = state; c->status = ret; return ret; } /* * Quiesce component * The component must clear any error conditions (set status to OK) and * prepare itself to another Start call * returns ctl_ha_comp_status: * OK * ERROR */ static ctl_ha_comp_status ctl_isc_quiesce(struct ctl_ha_component *c) { int ret = CTL_HA_COMP_STATUS_OK; ctl_pause_rtr = 1; c->status = ret; return ret; } struct ctl_ha_component ctl_ha_component_ctlisc = { .name = "CTL ISC", .state = CTL_HA_STATE_UNKNOWN, .init = ctl_isc_init, .start = ctl_isc_start, .quiesce = ctl_isc_quiesce }; /* * vim: ts=8 */ Index: user/ae/inet6/sys/cam/scsi/scsi_all.h =================================================================== --- user/ae/inet6/sys/cam/scsi/scsi_all.h (revision 271452) +++ user/ae/inet6/sys/cam/scsi/scsi_all.h (revision 271453) @@ -1,3672 +1,3721 @@ /*- * Largely written by Julian Elischer (julian@tfs.com) * for TRW Financial Systems. * * TRW Financial Systems, in accordance with their agreement with Carnegie * Mellon University, makes this software available to CMU to distribute * or use in any manner that they see fit as long as this message is kept with * the software. For this reason TFS also grants any other persons or * organisations permission to use or modify this software. * * TFS supplies this software to be publicly redistributed * on the understanding that TFS is not responsible for the correct * functioning of this software in any circumstances. * * Ported to run under 386BSD by Julian Elischer (julian@tfs.com) Sept 1992 * * $FreeBSD$ */ /* * SCSI general interface description */ #ifndef _SCSI_SCSI_ALL_H #define _SCSI_SCSI_ALL_H 1 #include #include #ifdef _KERNEL /* * This is the number of seconds we wait for devices to settle after a SCSI * bus reset. */ extern int scsi_delay; #endif /* _KERNEL */ /* * SCSI command format */ /* * Define dome bits that are in ALL (or a lot of) scsi commands */ #define SCSI_CTL_LINK 0x01 #define SCSI_CTL_FLAG 0x02 #define SCSI_CTL_VENDOR 0xC0 #define SCSI_CMD_LUN 0xA0 /* these two should not be needed */ #define SCSI_CMD_LUN_SHIFT 5 /* LUN in the cmd is no longer SCSI */ #define SCSI_MAX_CDBLEN 16 /* * 16 byte commands are in the * SCSI-3 spec */ #if defined(CAM_MAX_CDBLEN) && (CAM_MAX_CDBLEN < SCSI_MAX_CDBLEN) #error "CAM_MAX_CDBLEN cannot be less than SCSI_MAX_CDBLEN" #endif /* 6byte CDBs special case 0 length to be 256 */ #define SCSI_CDB6_LEN(len) ((len) == 0 ? 256 : len) /* * This type defines actions to be taken when a particular sense code is * received. Right now, these flags are only defined to take up 16 bits, * but can be expanded in the future if necessary. */ typedef enum { SS_NOP = 0x000000, /* Do nothing */ SS_RETRY = 0x010000, /* Retry the command */ SS_FAIL = 0x020000, /* Bail out */ SS_START = 0x030000, /* Send a Start Unit command to the device, * then retry the original command. */ SS_TUR = 0x040000, /* Send a Test Unit Ready command to the * device, then retry the original command. */ SS_MASK = 0xff0000 } scsi_sense_action; typedef enum { SSQ_NONE = 0x0000, SSQ_DECREMENT_COUNT = 0x0100, /* Decrement the retry count */ SSQ_MANY = 0x0200, /* send lots of recovery commands */ SSQ_RANGE = 0x0400, /* * This table entry represents the * end of a range of ASCQs that * have identical error actions * and text. */ SSQ_PRINT_SENSE = 0x0800, SSQ_UA = 0x1000, /* Broadcast UA. */ SSQ_RESCAN = 0x2000, /* Rescan target for LUNs. */ SSQ_LOST = 0x4000, /* Destroy the LUNs. */ SSQ_MASK = 0xff00 } scsi_sense_action_qualifier; /* Mask for error status values */ #define SS_ERRMASK 0xff /* The default, retyable, error action */ #define SS_RDEF SS_RETRY|SSQ_DECREMENT_COUNT|SSQ_PRINT_SENSE|EIO /* The retyable, error action, with table specified error code */ #define SS_RET SS_RETRY|SSQ_DECREMENT_COUNT|SSQ_PRINT_SENSE /* Fatal error action, with table specified error code */ #define SS_FATAL SS_FAIL|SSQ_PRINT_SENSE struct scsi_generic { u_int8_t opcode; u_int8_t bytes[11]; }; struct scsi_request_sense { u_int8_t opcode; u_int8_t byte2; #define SRS_DESC 0x01 u_int8_t unused[2]; u_int8_t length; u_int8_t control; }; struct scsi_test_unit_ready { u_int8_t opcode; u_int8_t byte2; u_int8_t unused[3]; u_int8_t control; }; struct scsi_receive_diag { uint8_t opcode; uint8_t byte2; #define SRD_PCV 0x01 uint8_t page_code; uint8_t length[2]; uint8_t control; }; struct scsi_send_diag { uint8_t opcode; uint8_t byte2; #define SSD_UNITOFFL 0x01 #define SSD_DEVOFFL 0x02 #define SSD_SELFTEST 0x04 #define SSD_PF 0x10 #define SSD_SELF_TEST_CODE_MASK 0xE0 #define SSD_SELF_TEST_CODE_SHIFT 5 #define SSD_SELF_TEST_CODE_NONE 0x00 #define SSD_SELF_TEST_CODE_BG_SHORT 0x01 #define SSD_SELF_TEST_CODE_BG_EXTENDED 0x02 #define SSD_SELF_TEST_CODE_BG_ABORT 0x04 #define SSD_SELF_TEST_CODE_FG_SHORT 0x05 #define SSD_SELF_TEST_CODE_FG_EXTENDED 0x06 uint8_t reserved; uint8_t length[2]; uint8_t control; }; struct scsi_sense { u_int8_t opcode; u_int8_t byte2; u_int8_t unused[2]; u_int8_t length; u_int8_t control; }; struct scsi_inquiry { u_int8_t opcode; u_int8_t byte2; #define SI_EVPD 0x01 #define SI_CMDDT 0x02 u_int8_t page_code; u_int8_t length[2]; u_int8_t control; }; struct scsi_mode_sense_6 { u_int8_t opcode; u_int8_t byte2; #define SMS_DBD 0x08 u_int8_t page; #define SMS_PAGE_CODE 0x3F #define SMS_VENDOR_SPECIFIC_PAGE 0x00 #define SMS_DISCONNECT_RECONNECT_PAGE 0x02 #define SMS_FORMAT_DEVICE_PAGE 0x03 #define SMS_GEOMETRY_PAGE 0x04 #define SMS_CACHE_PAGE 0x08 #define SMS_PERIPHERAL_DEVICE_PAGE 0x09 #define SMS_CONTROL_MODE_PAGE 0x0A #define SMS_PROTO_SPECIFIC_PAGE 0x19 #define SMS_INFO_EXCEPTIONS_PAGE 0x1C #define SMS_ALL_PAGES_PAGE 0x3F #define SMS_PAGE_CTRL_MASK 0xC0 #define SMS_PAGE_CTRL_CURRENT 0x00 #define SMS_PAGE_CTRL_CHANGEABLE 0x40 #define SMS_PAGE_CTRL_DEFAULT 0x80 #define SMS_PAGE_CTRL_SAVED 0xC0 u_int8_t subpage; #define SMS_SUBPAGE_PAGE_0 0x00 #define SMS_SUBPAGE_ALL 0xff u_int8_t length; u_int8_t control; }; struct scsi_mode_sense_10 { u_int8_t opcode; u_int8_t byte2; /* same bits as small version */ #define SMS10_LLBAA 0x10 u_int8_t page; /* same bits as small version */ u_int8_t subpage; u_int8_t unused[3]; u_int8_t length[2]; u_int8_t control; }; struct scsi_mode_select_6 { u_int8_t opcode; u_int8_t byte2; #define SMS_SP 0x01 #define SMS_PF 0x10 u_int8_t unused[2]; u_int8_t length; u_int8_t control; }; struct scsi_mode_select_10 { u_int8_t opcode; u_int8_t byte2; /* same bits as small version */ u_int8_t unused[5]; u_int8_t length[2]; u_int8_t control; }; /* * When sending a mode select to a tape drive, the medium type must be 0. */ struct scsi_mode_hdr_6 { u_int8_t datalen; u_int8_t medium_type; u_int8_t dev_specific; u_int8_t block_descr_len; }; struct scsi_mode_hdr_10 { u_int8_t datalen[2]; u_int8_t medium_type; u_int8_t dev_specific; u_int8_t reserved[2]; u_int8_t block_descr_len[2]; }; struct scsi_mode_block_descr { u_int8_t density_code; u_int8_t num_blocks[3]; u_int8_t reserved; u_int8_t block_len[3]; }; struct scsi_per_res_in { u_int8_t opcode; u_int8_t action; #define SPRI_RK 0x00 #define SPRI_RR 0x01 #define SPRI_RC 0x02 #define SPRI_RS 0x03 u_int8_t reserved[5]; u_int8_t length[2]; #define SPRI_MAX_LEN 0xffff u_int8_t control; }; struct scsi_per_res_in_header { u_int8_t generation[4]; u_int8_t length[4]; }; struct scsi_per_res_key { u_int8_t key[8]; }; struct scsi_per_res_in_keys { struct scsi_per_res_in_header header; struct scsi_per_res_key keys[0]; }; struct scsi_per_res_cap { uint8_t length[2]; uint8_t flags1; #define SPRI_RLR_C 0x80 #define SPRI_CRH 0x10 #define SPRI_SIP_C 0x08 #define SPRI_ATP_C 0x04 #define SPRI_PTPL_C 0x01 uint8_t flags2; #define SPRI_TMV 0x80 #define SPRI_ALLOW_CMD_MASK 0x70 #define SPRI_ALLOW_CMD_SHIFT 4 #define SPRI_ALLOW_NA 0x00 #define SPRI_ALLOW_1 0x10 #define SPRI_ALLOW_2 0x20 #define SPRI_ALLOW_3 0x30 #define SPRI_ALLOW_4 0x40 #define SPRI_PTPL_A 0x01 uint8_t type_mask[2]; #define SPRI_TM_WR_EX_AR 0x8000 #define SPRI_TM_EX_AC_RO 0x4000 #define SPRI_TM_WR_EX_RO 0x2000 #define SPRI_TM_EX_AC 0x0800 #define SPRI_TM_WR_EX 0x0200 #define SPRI_TM_EX_AC_AR 0x0001 uint8_t reserved[2]; }; struct scsi_per_res_in_rsrv_data { uint8_t reservation[8]; uint8_t scope_addr[4]; uint8_t reserved; uint8_t scopetype; #define SPRT_WE 0x01 #define SPRT_EA 0x03 #define SPRT_WERO 0x05 #define SPRT_EARO 0x06 #define SPRT_WEAR 0x07 #define SPRT_EAAR 0x08 uint8_t extent_length[2]; }; struct scsi_per_res_in_rsrv { struct scsi_per_res_in_header header; struct scsi_per_res_in_rsrv_data data; }; struct scsi_per_res_in_full_desc { struct scsi_per_res_key res_key; uint8_t reserved1[4]; uint8_t flags; #define SPRI_FULL_ALL_TG_PT 0x02 #define SPRI_FULL_R_HOLDER 0x01 uint8_t scopetype; uint8_t reserved2[4]; uint8_t rel_trgt_port_id[2]; uint8_t additional_length[4]; uint8_t transport_id[]; }; struct scsi_per_res_in_full { struct scsi_per_res_in_header header; struct scsi_per_res_in_full_desc desc[]; }; struct scsi_per_res_out { u_int8_t opcode; u_int8_t action; #define SPRO_REGISTER 0x00 #define SPRO_RESERVE 0x01 #define SPRO_RELEASE 0x02 #define SPRO_CLEAR 0x03 #define SPRO_PREEMPT 0x04 #define SPRO_PRE_ABO 0x05 #define SPRO_REG_IGNO 0x06 #define SPRO_REG_MOVE 0x07 #define SPRO_REPL_LOST_RES 0x08 #define SPRO_ACTION_MASK 0x1f u_int8_t scope_type; #define SPR_SCOPE_MASK 0xf0 #define SPR_SCOPE_SHIFT 4 #define SPR_LU_SCOPE 0x00 #define SPR_EXTENT_SCOPE 0x10 #define SPR_ELEMENT_SCOPE 0x20 #define SPR_TYPE_MASK 0x0f #define SPR_TYPE_RD_SHARED 0x00 #define SPR_TYPE_WR_EX 0x01 #define SPR_TYPE_RD_EX 0x02 #define SPR_TYPE_EX_AC 0x03 #define SPR_TYPE_SHARED 0x04 #define SPR_TYPE_WR_EX_RO 0x05 #define SPR_TYPE_EX_AC_RO 0x06 #define SPR_TYPE_WR_EX_AR 0x07 #define SPR_TYPE_EX_AC_AR 0x08 u_int8_t reserved[2]; u_int8_t length[4]; u_int8_t control; }; struct scsi_per_res_out_parms { struct scsi_per_res_key res_key; u_int8_t serv_act_res_key[8]; u_int8_t scope_spec_address[4]; u_int8_t flags; #define SPR_SPEC_I_PT 0x08 #define SPR_ALL_TG_PT 0x04 #define SPR_APTPL 0x01 u_int8_t reserved1; u_int8_t extent_length[2]; u_int8_t transport_id_list[]; }; struct scsi_per_res_out_trans_ids { u_int8_t additional_length[4]; u_int8_t transport_ids[]; }; /* * Used with REGISTER AND MOVE serivce action of the PERSISTENT RESERVE OUT * command. */ struct scsi_per_res_reg_move { struct scsi_per_res_key res_key; u_int8_t serv_act_res_key[8]; u_int8_t reserved; u_int8_t flags; #define SPR_REG_MOVE_UNREG 0x02 #define SPR_REG_MOVE_APTPL 0x01 u_int8_t rel_trgt_port_id[2]; u_int8_t transport_id_length[4]; u_int8_t transport_id[]; }; struct scsi_transportid_header { uint8_t format_protocol; #define SCSI_TRN_FORMAT_MASK 0xc0 #define SCSI_TRN_FORMAT_SHIFT 6 #define SCSI_TRN_PROTO_MASK 0x0f }; struct scsi_transportid_fcp { uint8_t format_protocol; #define SCSI_TRN_FCP_FORMAT_DEFAULT 0x00 uint8_t reserved1[7]; uint8_t n_port_name[8]; uint8_t reserved2[8]; }; struct scsi_transportid_spi { uint8_t format_protocol; #define SCSI_TRN_SPI_FORMAT_DEFAULT 0x00 uint8_t reserved1; uint8_t scsi_addr[2]; uint8_t obsolete[2]; uint8_t rel_trgt_port_id[2]; uint8_t reserved2[16]; }; struct scsi_transportid_1394 { uint8_t format_protocol; #define SCSI_TRN_1394_FORMAT_DEFAULT 0x00 uint8_t reserved1[7]; uint8_t eui64[8]; uint8_t reserved2[8]; }; struct scsi_transportid_rdma { uint8_t format_protocol; #define SCSI_TRN_RDMA_FORMAT_DEFAULT 0x00 uint8_t reserved[7]; #define SCSI_TRN_RDMA_PORT_LEN 16 uint8_t initiator_port_id[SCSI_TRN_RDMA_PORT_LEN]; }; struct scsi_transportid_iscsi_device { uint8_t format_protocol; #define SCSI_TRN_ISCSI_FORMAT_DEVICE 0x00 uint8_t reserved; uint8_t additional_length[2]; uint8_t iscsi_name[]; }; struct scsi_transportid_iscsi_port { uint8_t format_protocol; #define SCSI_TRN_ISCSI_FORMAT_PORT 0x40 uint8_t reserved; uint8_t additional_length[2]; uint8_t iscsi_name[]; /* * Followed by a separator and iSCSI initiator session ID */ }; struct scsi_transportid_sas { uint8_t format_protocol; #define SCSI_TRN_SAS_FORMAT_DEFAULT 0x00 uint8_t reserved1[3]; uint8_t sas_address[8]; uint8_t reserved2[12]; }; struct scsi_sop_routing_id_norm { uint8_t bus; uint8_t devfunc; #define SCSI_TRN_SOP_BUS_MAX 0xff #define SCSI_TRN_SOP_DEV_MAX 0x1f #define SCSI_TRN_SOP_DEV_MASK 0xf8 #define SCSI_TRN_SOP_DEV_SHIFT 3 #define SCSI_TRN_SOP_FUNC_NORM_MASK 0x07 #define SCSI_TRN_SOP_FUNC_NORM_MAX 0x07 }; struct scsi_sop_routing_id_alt { uint8_t bus; uint8_t function; #define SCSI_TRN_SOP_FUNC_ALT_MAX 0xff }; struct scsi_transportid_sop { uint8_t format_protocol; #define SCSI_TRN_SOP_FORMAT_DEFAULT 0x00 uint8_t reserved1; uint8_t routing_id[2]; uint8_t reserved2[20]; }; struct scsi_log_sense { u_int8_t opcode; u_int8_t byte2; #define SLS_SP 0x01 #define SLS_PPC 0x02 u_int8_t page; #define SLS_PAGE_CODE 0x3F #define SLS_ALL_PAGES_PAGE 0x00 #define SLS_OVERRUN_PAGE 0x01 #define SLS_ERROR_WRITE_PAGE 0x02 #define SLS_ERROR_READ_PAGE 0x03 #define SLS_ERROR_READREVERSE_PAGE 0x04 #define SLS_ERROR_VERIFY_PAGE 0x05 #define SLS_ERROR_NONMEDIUM_PAGE 0x06 #define SLS_ERROR_LASTN_PAGE 0x07 #define SLS_SELF_TEST_PAGE 0x10 #define SLS_IE_PAGE 0x2f #define SLS_PAGE_CTRL_MASK 0xC0 #define SLS_PAGE_CTRL_THRESHOLD 0x00 #define SLS_PAGE_CTRL_CUMULATIVE 0x40 #define SLS_PAGE_CTRL_THRESH_DEFAULT 0x80 #define SLS_PAGE_CTRL_CUMUL_DEFAULT 0xC0 u_int8_t reserved[2]; u_int8_t paramptr[2]; u_int8_t length[2]; u_int8_t control; }; struct scsi_log_select { u_int8_t opcode; u_int8_t byte2; /* SLS_SP 0x01 */ #define SLS_PCR 0x02 u_int8_t page; /* SLS_PAGE_CTRL_MASK 0xC0 */ /* SLS_PAGE_CTRL_THRESHOLD 0x00 */ /* SLS_PAGE_CTRL_CUMULATIVE 0x40 */ /* SLS_PAGE_CTRL_THRESH_DEFAULT 0x80 */ /* SLS_PAGE_CTRL_CUMUL_DEFAULT 0xC0 */ u_int8_t reserved[4]; u_int8_t length[2]; u_int8_t control; }; struct scsi_log_header { u_int8_t page; u_int8_t reserved; u_int8_t datalen[2]; }; struct scsi_log_param_header { u_int8_t param_code[2]; u_int8_t param_control; #define SLP_LP 0x01 #define SLP_LBIN 0x02 #define SLP_TMC_MASK 0x0C #define SLP_TMC_ALWAYS 0x00 #define SLP_TMC_EQUAL 0x04 #define SLP_TMC_NOTEQUAL 0x08 #define SLP_TMC_GREATER 0x0C #define SLP_ETC 0x10 #define SLP_TSD 0x20 #define SLP_DS 0x40 #define SLP_DU 0x80 u_int8_t param_len; }; struct scsi_control_page { u_int8_t page_code; u_int8_t page_length; u_int8_t rlec; #define SCP_RLEC 0x01 /*Report Log Exception Cond*/ #define SCP_GLTSD 0x02 /*Global Logging target save disable */ #define SCP_DSENSE 0x04 /*Descriptor Sense */ #define SCP_DPICZ 0x08 /*Disable Prot. Info Check if Prot. Field is Zero */ #define SCP_TMF_ONLY 0x10 /*TM Functions Only*/ #define SCP_TST_MASK 0xE0 /*Task Set Type Mask*/ #define SCP_TST_ONE 0x00 /*One Task Set*/ #define SCP_TST_SEPARATE 0x20 /*Separate Task Sets*/ u_int8_t queue_flags; #define SCP_QUEUE_ALG_MASK 0xF0 #define SCP_QUEUE_ALG_RESTRICTED 0x00 #define SCP_QUEUE_ALG_UNRESTRICTED 0x10 #define SCP_NUAR 0x08 /*No UA on release*/ #define SCP_QUEUE_ERR 0x02 /*Queued I/O aborted for CACs*/ #define SCP_QUEUE_DQUE 0x01 /*Queued I/O disabled*/ u_int8_t eca_and_aen; #define SCP_EECA 0x80 /*Enable Extended CA*/ #define SCP_RAC 0x40 /*Report a check*/ #define SCP_SWP 0x08 /*Software Write Protect*/ #define SCP_RAENP 0x04 /*Ready AEN Permission*/ #define SCP_UAAENP 0x02 /*UA AEN Permission*/ #define SCP_EAENP 0x01 /*Error AEN Permission*/ u_int8_t flags4; #define SCP_ATO 0x80 /*Application tag owner*/ #define SCP_TAS 0x40 /*Task aborted status*/ #define SCP_ATMPE 0x20 /*Application tag mode page*/ #define SCP_RWWP 0x10 /*Reject write without prot*/ u_int8_t aen_holdoff_period[2]; u_int8_t busy_timeout_period[2]; u_int8_t extended_selftest_completion_time[2]; }; struct scsi_cache_page { u_int8_t page_code; #define SCHP_PAGE_SAVABLE 0x80 /* Page is savable */ u_int8_t page_length; u_int8_t cache_flags; #define SCHP_FLAGS_WCE 0x04 /* Write Cache Enable */ #define SCHP_FLAGS_MF 0x02 /* Multiplication factor */ #define SCHP_FLAGS_RCD 0x01 /* Read Cache Disable */ u_int8_t rw_cache_policy; u_int8_t dis_prefetch[2]; u_int8_t min_prefetch[2]; u_int8_t max_prefetch[2]; u_int8_t max_prefetch_ceil[2]; }; /* * XXX KDM * Updated version of the cache page, as of SBC. Update this to SBC-3 and * rationalize the two. */ struct scsi_caching_page { uint8_t page_code; #define SMS_CACHING_PAGE 0x08 uint8_t page_length; uint8_t flags1; #define SCP_IC 0x80 #define SCP_ABPF 0x40 #define SCP_CAP 0x20 #define SCP_DISC 0x10 #define SCP_SIZE 0x08 #define SCP_WCE 0x04 #define SCP_MF 0x02 #define SCP_RCD 0x01 uint8_t ret_priority; uint8_t disable_pf_transfer_len[2]; uint8_t min_prefetch[2]; uint8_t max_prefetch[2]; uint8_t max_pf_ceiling[2]; uint8_t flags2; #define SCP_FSW 0x80 #define SCP_LBCSS 0x40 #define SCP_DRA 0x20 #define SCP_VS1 0x10 #define SCP_VS2 0x08 uint8_t cache_segments; uint8_t cache_seg_size[2]; uint8_t reserved; uint8_t non_cache_seg_size[3]; }; /* * XXX KDM move this off to a vendor shim. */ struct copan_power_subpage { uint8_t page_code; #define PWR_PAGE_CODE 0x00 uint8_t subpage; #define PWR_SUBPAGE_CODE 0x02 uint8_t page_length[2]; uint8_t page_version; #define PWR_VERSION 0x01 uint8_t total_luns; uint8_t max_active_luns; #define PWR_DFLT_MAX_LUNS 0x07 uint8_t reserved[25]; }; /* * XXX KDM move this off to a vendor shim. */ struct copan_aps_subpage { uint8_t page_code; #define APS_PAGE_CODE 0x00 uint8_t subpage; #define APS_SUBPAGE_CODE 0x03 uint8_t page_length[2]; uint8_t page_version; #define APS_VERSION 0x00 uint8_t lock_active; #define APS_LOCK_ACTIVE 0x01 #define APS_LOCK_INACTIVE 0x00 uint8_t reserved[26]; }; /* * XXX KDM move this off to a vendor shim. */ struct copan_debugconf_subpage { uint8_t page_code; #define DBGCNF_PAGE_CODE 0x00 uint8_t subpage; #define DBGCNF_SUBPAGE_CODE 0xF0 uint8_t page_length[2]; uint8_t page_version; #define DBGCNF_VERSION 0x00 uint8_t ctl_time_io_secs[2]; }; struct scsi_info_exceptions_page { u_int8_t page_code; #define SIEP_PAGE_SAVABLE 0x80 /* Page is savable */ u_int8_t page_length; u_int8_t info_flags; #define SIEP_FLAGS_PERF 0x80 #define SIEP_FLAGS_EBF 0x20 #define SIEP_FLAGS_EWASC 0x10 #define SIEP_FLAGS_DEXCPT 0x08 #define SIEP_FLAGS_TEST 0x04 #define SIEP_FLAGS_EBACKERR 0x02 #define SIEP_FLAGS_LOGERR 0x01 u_int8_t mrie; u_int8_t interval_timer[4]; u_int8_t report_count[4]; }; /* * SCSI protocol identifier values, current as of SPC4r36l. */ #define SCSI_PROTO_FC 0x00 /* Fibre Channel */ #define SCSI_PROTO_SPI 0x01 /* Parallel SCSI */ #define SCSI_PROTO_SSA 0x02 /* Serial Storage Arch. */ #define SCSI_PROTO_1394 0x03 /* IEEE 1394 (Firewire) */ #define SCSI_PROTO_RDMA 0x04 /* SCSI RDMA Protocol */ #define SCSI_PROTO_ISCSI 0x05 /* Internet SCSI */ #define SCSI_PROTO_iSCSI 0x05 /* Internet SCSI */ #define SCSI_PROTO_SAS 0x06 /* SAS Serial SCSI Protocol */ #define SCSI_PROTO_ADT 0x07 /* Automation/Drive Int. Trans. Prot.*/ #define SCSI_PROTO_ADITP 0x07 /* Automation/Drive Int. Trans. Prot.*/ #define SCSI_PROTO_ATA 0x08 /* AT Attachment Interface */ #define SCSI_PROTO_UAS 0x09 /* USB Atached SCSI */ #define SCSI_PROTO_SOP 0x0a /* SCSI over PCI Express */ #define SCSI_PROTO_NONE 0x0f /* No specific protocol */ struct scsi_proto_specific_page { u_int8_t page_code; #define SPSP_PAGE_SAVABLE 0x80 /* Page is savable */ u_int8_t page_length; u_int8_t protocol; #define SPSP_PROTO_FC SCSI_PROTO_FC #define SPSP_PROTO_SPI SCSI_PROTO_SPI #define SPSP_PROTO_SSA SCSI_PROTO_SSA #define SPSP_PROTO_1394 SCSI_PROTO_1394 #define SPSP_PROTO_RDMA SCSI_PROTO_RDMA #define SPSP_PROTO_ISCSI SCSI_PROTO_ISCSI #define SPSP_PROTO_SAS SCSI_PROTO_SAS #define SPSP_PROTO_ADT SCSI_PROTO_ADITP #define SPSP_PROTO_ATA SCSI_PROTO_ATA #define SPSP_PROTO_UAS SCSI_PROTO_UAS #define SPSP_PROTO_SOP SCSI_PROTO_SOP #define SPSP_PROTO_NONE SCSI_PROTO_NONE }; struct scsi_reserve { u_int8_t opcode; u_int8_t byte2; #define SR_EXTENT 0x01 #define SR_ID_MASK 0x0e #define SR_3RDPTY 0x10 #define SR_LUN_MASK 0xe0 u_int8_t resv_id; u_int8_t length[2]; u_int8_t control; }; struct scsi_reserve_10 { uint8_t opcode; uint8_t byte2; #define SR10_3RDPTY 0x10 #define SR10_LONGID 0x02 #define SR10_EXTENT 0x01 uint8_t resv_id; uint8_t thirdparty_id; uint8_t reserved[3]; uint8_t length[2]; uint8_t control; }; struct scsi_release { u_int8_t opcode; u_int8_t byte2; u_int8_t resv_id; u_int8_t unused[1]; u_int8_t length; u_int8_t control; }; struct scsi_release_10 { uint8_t opcode; uint8_t byte2; uint8_t resv_id; uint8_t thirdparty_id; uint8_t reserved[3]; uint8_t length[2]; uint8_t control; }; struct scsi_prevent { u_int8_t opcode; u_int8_t byte2; u_int8_t unused[2]; u_int8_t how; u_int8_t control; }; #define PR_PREVENT 0x01 #define PR_ALLOW 0x00 struct scsi_sync_cache { u_int8_t opcode; u_int8_t byte2; #define SSC_IMMED 0x02 #define SSC_RELADR 0x01 u_int8_t begin_lba[4]; u_int8_t reserved; u_int8_t lb_count[2]; u_int8_t control; }; struct scsi_sync_cache_16 { uint8_t opcode; uint8_t byte2; uint8_t begin_lba[8]; uint8_t lb_count[4]; uint8_t reserved; uint8_t control; }; struct scsi_format { uint8_t opcode; uint8_t byte2; #define SF_LONGLIST 0x20 #define SF_FMTDATA 0x10 #define SF_CMPLIST 0x08 #define SF_FORMAT_MASK 0x07 #define SF_FORMAT_BLOCK 0x00 #define SF_FORMAT_LONG_BLOCK 0x03 #define SF_FORMAT_BFI 0x04 #define SF_FORMAT_PHYS 0x05 uint8_t vendor; uint8_t interleave[2]; uint8_t control; }; struct scsi_format_header_short { uint8_t reserved; #define SF_DATA_FOV 0x80 #define SF_DATA_DPRY 0x40 #define SF_DATA_DCRT 0x20 #define SF_DATA_STPF 0x10 #define SF_DATA_IP 0x08 #define SF_DATA_DSP 0x04 #define SF_DATA_IMMED 0x02 #define SF_DATA_VS 0x01 uint8_t byte2; uint8_t defect_list_len[2]; }; struct scsi_format_header_long { uint8_t reserved; uint8_t byte2; uint8_t reserved2[2]; uint8_t defect_list_len[4]; }; struct scsi_changedef { u_int8_t opcode; u_int8_t byte2; u_int8_t unused1; u_int8_t how; u_int8_t unused[4]; u_int8_t datalen; u_int8_t control; }; struct scsi_read_buffer { u_int8_t opcode; u_int8_t byte2; #define RWB_MODE 0x1F #define RWB_MODE_HDR_DATA 0x00 #define RWB_MODE_VENDOR 0x01 #define RWB_MODE_DATA 0x02 #define RWB_MODE_DESCR 0x03 #define RWB_MODE_DOWNLOAD 0x04 #define RWB_MODE_DOWNLOAD_SAVE 0x05 #define RWB_MODE_ECHO 0x0A #define RWB_MODE_ECHO_DESCR 0x0B #define RWB_MODE_ERROR_HISTORY 0x1C u_int8_t buffer_id; u_int8_t offset[3]; u_int8_t length[3]; u_int8_t control; }; struct scsi_write_buffer { u_int8_t opcode; u_int8_t byte2; u_int8_t buffer_id; u_int8_t offset[3]; u_int8_t length[3]; u_int8_t control; }; struct scsi_rw_6 { u_int8_t opcode; u_int8_t addr[3]; /* only 5 bits are valid in the MSB address byte */ #define SRW_TOPADDR 0x1F u_int8_t length; u_int8_t control; }; struct scsi_rw_10 { u_int8_t opcode; #define SRW10_RELADDR 0x01 /* EBP defined for WRITE(10) only */ #define SRW10_EBP 0x04 #define SRW10_FUA 0x08 #define SRW10_DPO 0x10 u_int8_t byte2; u_int8_t addr[4]; u_int8_t reserved; u_int8_t length[2]; u_int8_t control; }; struct scsi_rw_12 { u_int8_t opcode; #define SRW12_RELADDR 0x01 #define SRW12_FUA 0x08 #define SRW12_DPO 0x10 u_int8_t byte2; u_int8_t addr[4]; u_int8_t length[4]; u_int8_t reserved; u_int8_t control; }; struct scsi_rw_16 { u_int8_t opcode; #define SRW16_RELADDR 0x01 #define SRW16_FUA 0x08 #define SRW16_DPO 0x10 u_int8_t byte2; u_int8_t addr[8]; u_int8_t length[4]; u_int8_t reserved; u_int8_t control; }; struct scsi_write_same_10 { uint8_t opcode; uint8_t byte2; #define SWS_LBDATA 0x02 #define SWS_PBDATA 0x04 #define SWS_UNMAP 0x08 #define SWS_ANCHOR 0x10 uint8_t addr[4]; uint8_t group; uint8_t length[2]; uint8_t control; }; struct scsi_write_same_16 { uint8_t opcode; uint8_t byte2; uint8_t addr[8]; uint8_t length[4]; uint8_t group; uint8_t control; }; struct scsi_unmap { uint8_t opcode; uint8_t byte2; #define SU_ANCHOR 0x01 uint8_t reserved[4]; uint8_t group; uint8_t length[2]; uint8_t control; }; struct scsi_unmap_header { uint8_t length[2]; uint8_t desc_length[2]; uint8_t reserved[4]; }; struct scsi_unmap_desc { uint8_t lba[8]; uint8_t length[4]; uint8_t reserved[4]; }; struct scsi_write_verify_10 { uint8_t opcode; uint8_t byte2; #define SWV_BYTCHK 0x02 #define SWV_DPO 0x10 #define SWV_WRPROECT_MASK 0xe0 uint8_t addr[4]; uint8_t group; uint8_t length[2]; uint8_t control; }; struct scsi_write_verify_12 { uint8_t opcode; uint8_t byte2; uint8_t addr[4]; uint8_t length[4]; uint8_t group; uint8_t control; }; struct scsi_write_verify_16 { uint8_t opcode; uint8_t byte2; uint8_t addr[8]; uint8_t length[4]; uint8_t group; uint8_t control; }; struct scsi_start_stop_unit { u_int8_t opcode; u_int8_t byte2; #define SSS_IMMED 0x01 u_int8_t reserved[2]; u_int8_t how; #define SSS_START 0x01 #define SSS_LOEJ 0x02 #define SSS_PC_MASK 0xf0 #define SSS_PC_START_VALID 0x00 #define SSS_PC_ACTIVE 0x10 #define SSS_PC_IDLE 0x20 #define SSS_PC_STANDBY 0x30 #define SSS_PC_LU_CONTROL 0x70 #define SSS_PC_FORCE_IDLE_0 0xa0 #define SSS_PC_FORCE_STANDBY_0 0xb0 u_int8_t control; }; struct ata_pass_12 { u_int8_t opcode; u_int8_t protocol; #define AP_PROTO_HARD_RESET (0x00 << 1) #define AP_PROTO_SRST (0x01 << 1) #define AP_PROTO_NON_DATA (0x03 << 1) #define AP_PROTO_PIO_IN (0x04 << 1) #define AP_PROTO_PIO_OUT (0x05 << 1) #define AP_PROTO_DMA (0x06 << 1) #define AP_PROTO_DMA_QUEUED (0x07 << 1) #define AP_PROTO_DEVICE_DIAG (0x08 << 1) #define AP_PROTO_DEVICE_RESET (0x09 << 1) #define AP_PROTO_UDMA_IN (0x0a << 1) #define AP_PROTO_UDMA_OUT (0x0b << 1) #define AP_PROTO_FPDMA (0x0c << 1) #define AP_PROTO_RESP_INFO (0x0f << 1) #define AP_MULTI 0xe0 u_int8_t flags; #define AP_T_LEN 0x03 #define AP_BB 0x04 #define AP_T_DIR 0x08 #define AP_CK_COND 0x20 #define AP_OFFLINE 0x60 u_int8_t features; u_int8_t sector_count; u_int8_t lba_low; u_int8_t lba_mid; u_int8_t lba_high; u_int8_t device; u_int8_t command; u_int8_t reserved; u_int8_t control; }; struct scsi_maintenance_in { uint8_t opcode; uint8_t byte2; #define SERVICE_ACTION_MASK 0x1f #define SA_RPRT_TRGT_GRP 0x0a uint8_t reserved[4]; uint8_t length[4]; uint8_t reserved1; uint8_t control; }; struct scsi_report_supported_opcodes { uint8_t opcode; uint8_t service_action; uint8_t options; #define RSO_RCTD 0x80 #define RSO_OPTIONS_MASK 0x07 #define RSO_OPTIONS_ALL 0x00 #define RSO_OPTIONS_OC 0x01 #define RSO_OPTIONS_OC_SA 0x02 uint8_t requested_opcode; uint8_t requested_service_action[2]; uint8_t length[4]; uint8_t reserved1; uint8_t control; }; struct scsi_report_supported_opcodes_timeout { uint8_t length[2]; uint8_t reserved; uint8_t cmd_specific; uint8_t nominal_time[4]; uint8_t recommended_time[4]; }; struct scsi_report_supported_opcodes_descr { uint8_t opcode; uint8_t reserved; uint8_t service_action[2]; uint8_t reserved2; uint8_t flags; #define RSO_SERVACTV 0x01 #define RSO_CTDP 0x02 uint8_t cdb_length[2]; struct scsi_report_supported_opcodes_timeout timeout[0]; }; struct scsi_report_supported_opcodes_all { uint8_t length[4]; struct scsi_report_supported_opcodes_descr descr[0]; }; struct scsi_report_supported_opcodes_one { uint8_t reserved; uint8_t support; #define RSO_ONE_CTDP 0x80 uint8_t cdb_length[2]; uint8_t cdb_usage[]; }; struct scsi_report_supported_tmf { uint8_t opcode; uint8_t service_action; uint8_t reserved[4]; uint8_t length[4]; uint8_t reserved1; uint8_t control; }; struct scsi_report_supported_tmf_data { uint8_t byte1; #define RST_WAKES 0x01 #define RST_TRS 0x02 #define RST_QTS 0x04 #define RST_LURS 0x08 #define RST_CTSS 0x10 #define RST_CACAS 0x20 #define RST_ATSS 0x40 #define RST_ATS 0x80 uint8_t byte2; #define RST_ITNRS 0x01 #define RST_QTSS 0x02 #define RST_QAES 0x04 uint8_t reserved[2]; }; struct scsi_report_timestamp { uint8_t opcode; uint8_t service_action; uint8_t reserved[4]; uint8_t length[4]; uint8_t reserved1; uint8_t control; }; struct scsi_report_timestamp_data { uint8_t length[2]; uint8_t origin; #define RTS_ORIG_MASK 0x00 #define RTS_ORIG_ZERO 0x00 #define RTS_ORIG_SET 0x02 #define RTS_ORIG_OUTSIDE 0x03 uint8_t reserved; uint8_t timestamp[6]; uint8_t reserve2[2]; }; struct scsi_receive_copy_status_lid1 { uint8_t opcode; uint8_t service_action; #define RCS_RCS_LID1 0x00 uint8_t list_identifier; uint8_t reserved[7]; uint8_t length[4]; uint8_t reserved1; uint8_t control; }; struct scsi_receive_copy_status_lid1_data { uint8_t available_data[4]; uint8_t copy_command_status; #define RCS_CCS_INPROG 0x00 #define RCS_CCS_COMPLETED 0x01 #define RCS_CCS_ERROR 0x02 uint8_t segments_processed[2]; uint8_t transfer_count_units; #define RCS_TC_BYTES 0x00 #define RCS_TC_KBYTES 0x01 #define RCS_TC_MBYTES 0x02 #define RCS_TC_GBYTES 0x03 #define RCS_TC_TBYTES 0x04 #define RCS_TC_PBYTES 0x05 #define RCS_TC_EBYTES 0x06 #define RCS_TC_LBAS 0xf1 uint8_t transfer_count[4]; }; struct scsi_receive_copy_failure_details { uint8_t opcode; uint8_t service_action; #define RCS_RCFD 0x04 uint8_t list_identifier; uint8_t reserved[7]; uint8_t length[4]; uint8_t reserved1; uint8_t control; }; struct scsi_receive_copy_failure_details_data { uint8_t available_data[4]; uint8_t reserved[52]; uint8_t copy_command_status; uint8_t reserved2; uint8_t sense_data_length[2]; uint8_t sense_data[]; }; struct scsi_receive_copy_status_lid4 { uint8_t opcode; uint8_t service_action; #define RCS_RCS_LID4 0x05 uint8_t list_identifier[4]; uint8_t reserved[4]; uint8_t length[4]; uint8_t reserved1; uint8_t control; }; struct scsi_receive_copy_status_lid4_data { uint8_t available_data[4]; uint8_t response_to_service_action; uint8_t copy_command_status; #define RCS_CCS_COMPLETED_PROD 0x03 #define RCS_CCS_COMPLETED_RESID 0x04 #define RCS_CCS_INPROG_FGBG 0x10 #define RCS_CCS_INPROG_FG 0x11 #define RCS_CCS_INPROG_BG 0x12 #define RCS_CCS_ABORTED 0x60 uint8_t operation_counter[2]; uint8_t estimated_status_update_delay[4]; uint8_t extended_copy_completion_status; uint8_t length_of_the_sense_data_field; uint8_t sense_data_length; uint8_t transfer_count_units; uint8_t transfer_count[8]; uint8_t segments_processed[2]; uint8_t reserved[6]; uint8_t sense_data[]; }; struct scsi_receive_copy_operating_parameters { uint8_t opcode; uint8_t service_action; #define RCS_RCOP 0x03 uint8_t reserved[8]; uint8_t length[4]; uint8_t reserved1; uint8_t control; }; struct scsi_receive_copy_operating_parameters_data { uint8_t length[4]; uint8_t snlid; #define RCOP_SNLID 0x01 uint8_t reserved[3]; uint8_t maximum_cscd_descriptor_count[2]; uint8_t maximum_segment_descriptor_count[2]; uint8_t maximum_descriptor_list_length[4]; uint8_t maximum_segment_length[4]; uint8_t maximum_inline_data_length[4]; uint8_t held_data_limit[4]; uint8_t maximum_stream_device_transfer_size[4]; uint8_t reserved2[2]; uint8_t total_concurrent_copies[2]; uint8_t maximum_concurrent_copies; uint8_t data_segment_granularity; uint8_t inline_data_granularity; uint8_t held_data_granularity; uint8_t reserved3[3]; uint8_t implemented_descriptor_list_length; uint8_t list_of_implemented_descriptor_type_codes[0]; }; struct scsi_extended_copy { uint8_t opcode; uint8_t service_action; #define EC_EC_LID1 0x00 #define EC_EC_LID4 0x01 uint8_t reserved[8]; uint8_t length[4]; uint8_t reserved1; uint8_t control; }; struct scsi_ec_cscd_dtsp { uint8_t flags; #define EC_CSCD_FIXED 0x01 #define EC_CSCD_PAD 0x04 uint8_t block_length[3]; }; struct scsi_ec_cscd { uint8_t type_code; #define EC_CSCD_EXT 0xff uint8_t luidt_pdt; #define EC_LUIDT_MASK 0xc0 #define EC_LUIDT_LUN 0x00 #define EC_LUIDT_PROXY_TOKEN 0x40 uint8_t relative_initiator_port[2]; uint8_t cscd_params[24]; struct scsi_ec_cscd_dtsp dtsp; }; struct scsi_ec_cscd_id { uint8_t type_code; #define EC_CSCD_ID 0xe4 uint8_t luidt_pdt; uint8_t relative_initiator_port[2]; uint8_t codeset; uint8_t id_type; uint8_t reserved; uint8_t length; uint8_t designator[20]; struct scsi_ec_cscd_dtsp dtsp; }; struct scsi_ec_segment { uint8_t type_code; uint8_t flags; #define EC_SEG_DC 0x02 #define EC_SEG_CAT 0x01 uint8_t descr_length[2]; uint8_t params[]; }; struct scsi_ec_segment_b2b { uint8_t type_code; #define EC_SEG_B2B 0x02 uint8_t flags; uint8_t descr_length[2]; uint8_t src_cscd[2]; uint8_t dst_cscd[2]; uint8_t reserved[2]; uint8_t number_of_blocks[2]; uint8_t src_lba[8]; uint8_t dst_lba[8]; }; struct scsi_ec_segment_verify { uint8_t type_code; #define EC_SEG_VERIFY 0x07 uint8_t reserved; uint8_t descr_length[2]; uint8_t src_cscd[2]; uint8_t reserved2[2]; uint8_t tur; uint8_t reserved3[3]; }; struct scsi_ec_segment_register_key { uint8_t type_code; #define EC_SEG_REGISTER_KEY 0x14 uint8_t reserved; uint8_t descr_length[2]; uint8_t reserved2[2]; uint8_t dst_cscd[2]; uint8_t res_key[8]; uint8_t sa_res_key[8]; uint8_t reserved3[4]; }; struct scsi_extended_copy_lid1_data { uint8_t list_identifier; uint8_t flags; #define EC_PRIORITY 0x07 #define EC_LIST_ID_USAGE_MASK 0x18 #define EC_LIST_ID_USAGE_FULL 0x08 #define EC_LIST_ID_USAGE_NOHOLD 0x10 #define EC_LIST_ID_USAGE_NONE 0x18 #define EC_STR 0x20 uint8_t cscd_list_length[2]; uint8_t reserved[4]; uint8_t segment_list_length[4]; uint8_t inline_data_length[4]; uint8_t data[]; }; struct scsi_extended_copy_lid4_data { uint8_t list_format; #define EC_LIST_FORMAT 0x01 uint8_t flags; uint8_t header_cscd_list_length[2]; uint8_t reserved[11]; uint8_t flags2; #define EC_IMMED 0x01 #define EC_G_SENSE 0x02 uint8_t header_cscd_type_code; uint8_t reserved2[3]; uint8_t list_identifier[4]; uint8_t reserved3[18]; uint8_t cscd_list_length[2]; uint8_t segment_list_length[2]; uint8_t inline_data_length[2]; uint8_t data[]; }; struct scsi_copy_operation_abort { uint8_t opcode; uint8_t service_action; #define EC_COA 0x1c uint8_t list_identifier[4]; uint8_t reserved[9]; uint8_t control; }; struct scsi_populate_token { uint8_t opcode; uint8_t service_action; #define EC_PT 0x10 uint8_t reserved[4]; uint8_t list_identifier[4]; uint8_t length[4]; uint8_t group_number; uint8_t control; }; struct scsi_range_desc { uint8_t lba[8]; uint8_t length[4]; uint8_t reserved[4]; }; struct scsi_populate_token_data { uint8_t length[2]; uint8_t flags; #define EC_PT_IMMED 0x01 #define EC_PT_RTV 0x02 uint8_t reserved; uint8_t inactivity_timeout[4]; uint8_t rod_type[4]; uint8_t reserved2[2]; uint8_t range_descriptor_length[2]; struct scsi_range_desc desc[]; }; struct scsi_write_using_token { uint8_t opcode; uint8_t service_action; #define EC_WUT 0x11 uint8_t reserved[4]; uint8_t list_identifier[4]; uint8_t length[4]; uint8_t group_number; uint8_t control; }; struct scsi_write_using_token_data { uint8_t length[2]; uint8_t flags; #define EC_WUT_IMMED 0x01 #define EC_WUT_DEL_TKN 0x02 uint8_t reserved[5]; uint8_t offset_into_rod[8]; uint8_t rod_token[512]; uint8_t reserved2[6]; uint8_t range_descriptor_length[2]; struct scsi_range_desc desc[]; }; struct scsi_receive_rod_token_information { uint8_t opcode; uint8_t service_action; #define RCS_RRTI 0x07 uint8_t list_identifier[4]; uint8_t reserved[4]; uint8_t length[4]; uint8_t reserved2; uint8_t control; }; struct scsi_token { uint8_t type[4]; #define ROD_TYPE_INTERNAL 0x00000000 #define ROD_TYPE_AUR 0x00001000 #define ROD_TYPE_PIT_DEF 0x00080000 #define ROD_TYPE_PIT_VULN 0x00080001 #define ROD_TYPE_PIT_PERS 0x00080002 #define ROD_TYPE_PIT_ANY 0x0008FFFF #define ROD_TYPE_BLOCK_ZERO 0xFFFF0001 uint8_t reserved[2]; uint8_t length[2]; uint8_t body[0]; }; struct scsi_report_all_rod_tokens { uint8_t opcode; uint8_t service_action; #define RCS_RART 0x08 uint8_t reserved[8]; uint8_t length[4]; uint8_t reserved2; uint8_t control; }; struct scsi_report_all_rod_tokens_data { uint8_t available_data[4]; uint8_t reserved[4]; uint8_t rod_management_token_list[]; }; struct ata_pass_16 { u_int8_t opcode; u_int8_t protocol; #define AP_EXTEND 0x01 u_int8_t flags; #define AP_FLAG_TLEN_NO_DATA (0 << 0) #define AP_FLAG_TLEN_FEAT (1 << 0) #define AP_FLAG_TLEN_SECT_CNT (2 << 0) #define AP_FLAG_TLEN_STPSIU (3 << 0) #define AP_FLAG_BYT_BLOK_BYTES (0 << 2) #define AP_FLAG_BYT_BLOK_BLOCKS (1 << 2) #define AP_FLAG_TDIR_TO_DEV (0 << 3) #define AP_FLAG_TDIR_FROM_DEV (1 << 3) #define AP_FLAG_CHK_COND (1 << 5) u_int8_t features_ext; u_int8_t features; u_int8_t sector_count_ext; u_int8_t sector_count; u_int8_t lba_low_ext; u_int8_t lba_low; u_int8_t lba_mid_ext; u_int8_t lba_mid; u_int8_t lba_high_ext; u_int8_t lba_high; u_int8_t device; u_int8_t command; u_int8_t control; }; #define SC_SCSI_1 0x01 #define SC_SCSI_2 0x03 /* * Opcodes */ #define TEST_UNIT_READY 0x00 #define REQUEST_SENSE 0x03 #define READ_6 0x08 #define WRITE_6 0x0A #define INQUIRY 0x12 #define MODE_SELECT_6 0x15 #define MODE_SENSE_6 0x1A #define START_STOP_UNIT 0x1B #define START_STOP 0x1B #define RESERVE 0x16 #define RELEASE 0x17 #define RECEIVE_DIAGNOSTIC 0x1C #define SEND_DIAGNOSTIC 0x1D #define PREVENT_ALLOW 0x1E #define READ_CAPACITY 0x25 #define READ_10 0x28 #define WRITE_10 0x2A #define POSITION_TO_ELEMENT 0x2B #define WRITE_VERIFY_10 0x2E #define VERIFY_10 0x2F #define SYNCHRONIZE_CACHE 0x35 #define READ_DEFECT_DATA_10 0x37 #define WRITE_BUFFER 0x3B #define READ_BUFFER 0x3C #define CHANGE_DEFINITION 0x40 #define WRITE_SAME_10 0x41 #define UNMAP 0x42 #define LOG_SELECT 0x4C #define LOG_SENSE 0x4D #define MODE_SELECT_10 0x55 #define RESERVE_10 0x56 #define RELEASE_10 0x57 #define MODE_SENSE_10 0x5A #define PERSISTENT_RES_IN 0x5E #define PERSISTENT_RES_OUT 0x5F #define EXTENDED_COPY 0x83 #define RECEIVE_COPY_STATUS 0x84 #define ATA_PASS_16 0x85 #define READ_16 0x88 #define COMPARE_AND_WRITE 0x89 #define WRITE_16 0x8A #define WRITE_VERIFY_16 0x8E #define VERIFY_16 0x8F #define SYNCHRONIZE_CACHE_16 0x91 #define WRITE_SAME_16 0x93 #define SERVICE_ACTION_IN 0x9E #define REPORT_LUNS 0xA0 #define ATA_PASS_12 0xA1 #define MAINTENANCE_IN 0xA3 #define MAINTENANCE_OUT 0xA4 #define MOVE_MEDIUM 0xA5 #define READ_12 0xA8 #define WRITE_12 0xAA #define WRITE_VERIFY_12 0xAE #define VERIFY_12 0xAF #define READ_ELEMENT_STATUS 0xB8 #define READ_CD 0xBE /* Maintenance In Service Action Codes */ #define REPORT_IDENTIFYING_INFRMATION 0x05 #define REPORT_TARGET_PORT_GROUPS 0x0A #define REPORT_ALIASES 0x0B #define REPORT_SUPPORTED_OPERATION_CODES 0x0C #define REPORT_SUPPORTED_TASK_MANAGEMENT_FUNCTIONS 0x0D #define REPORT_PRIORITY 0x0E #define REPORT_TIMESTAMP 0x0F #define MANAGEMENT_PROTOCOL_IN 0x10 /* Maintenance Out Service Action Codes */ #define SET_IDENTIFY_INFORMATION 0x06 #define SET_TARGET_PORT_GROUPS 0x0A #define CHANGE_ALIASES 0x0B #define SET_PRIORITY 0x0E #define SET_TIMESTAMP 0x0F #define MANGAEMENT_PROTOCOL_OUT 0x10 /* * Device Types */ #define T_DIRECT 0x00 #define T_SEQUENTIAL 0x01 #define T_PRINTER 0x02 #define T_PROCESSOR 0x03 #define T_WORM 0x04 #define T_CDROM 0x05 #define T_SCANNER 0x06 #define T_OPTICAL 0x07 #define T_CHANGER 0x08 #define T_COMM 0x09 #define T_ASC0 0x0a #define T_ASC1 0x0b #define T_STORARRAY 0x0c #define T_ENCLOSURE 0x0d #define T_RBC 0x0e #define T_OCRW 0x0f #define T_OSD 0x11 #define T_ADC 0x12 #define T_NODEVICE 0x1f #define T_ANY 0xff /* Used in Quirk table matches */ #define T_REMOV 1 #define T_FIXED 0 /* * This length is the initial inquiry length used by the probe code, as * well as the length necessary for scsi_print_inquiry() to function * correctly. If either use requires a different length in the future, * the two values should be de-coupled. */ #define SHORT_INQUIRY_LENGTH 36 struct scsi_inquiry_data { u_int8_t device; #define SID_TYPE(inq_data) ((inq_data)->device & 0x1f) #define SID_QUAL(inq_data) (((inq_data)->device & 0xE0) >> 5) #define SID_QUAL_LU_CONNECTED 0x00 /* * The specified peripheral device * type is currently connected to * logical unit. If the target cannot * determine whether or not a physical * device is currently connected, it * shall also use this peripheral * qualifier when returning the INQUIRY * data. This peripheral qualifier * does not mean that the device is * ready for access by the initiator. */ #define SID_QUAL_LU_OFFLINE 0x01 /* * The target is capable of supporting * the specified peripheral device type * on this logical unit; however, the * physical device is not currently * connected to this logical unit. */ #define SID_QUAL_RSVD 0x02 #define SID_QUAL_BAD_LU 0x03 /* * The target is not capable of * supporting a physical device on * this logical unit. For this * peripheral qualifier the peripheral * device type shall be set to 1Fh to * provide compatibility with previous * versions of SCSI. All other * peripheral device type values are * reserved for this peripheral * qualifier. */ #define SID_QUAL_IS_VENDOR_UNIQUE(inq_data) ((SID_QUAL(inq_data) & 0x08) != 0) u_int8_t dev_qual2; #define SID_QUAL2 0x7F #define SID_LU_CONG 0x40 #define SID_RMB 0x80 #define SID_IS_REMOVABLE(inq_data) (((inq_data)->dev_qual2 & SID_RMB) != 0) u_int8_t version; #define SID_ANSI_REV(inq_data) ((inq_data)->version & 0x07) #define SCSI_REV_0 0 #define SCSI_REV_CCS 1 #define SCSI_REV_2 2 #define SCSI_REV_SPC 3 #define SCSI_REV_SPC2 4 #define SCSI_REV_SPC3 5 #define SCSI_REV_SPC4 6 #define SID_ECMA 0x38 #define SID_ISO 0xC0 u_int8_t response_format; #define SID_AENC 0x80 #define SID_TrmIOP 0x40 #define SID_NormACA 0x20 #define SID_HiSup 0x10 u_int8_t additional_length; #define SID_ADDITIONAL_LENGTH(iqd) \ ((iqd)->additional_length + \ __offsetof(struct scsi_inquiry_data, additional_length) + 1) u_int8_t spc3_flags; #define SPC3_SID_PROTECT 0x01 #define SPC3_SID_3PC 0x08 #define SPC3_SID_TPGS_MASK 0x30 #define SPC3_SID_TPGS_IMPLICIT 0x10 #define SPC3_SID_TPGS_EXPLICIT 0x20 #define SPC3_SID_ACC 0x40 #define SPC3_SID_SCCS 0x80 u_int8_t spc2_flags; #define SPC2_SID_ADDR16 0x01 #define SPC2_SID_MChngr 0x08 #define SPC2_SID_MultiP 0x10 #define SPC2_SID_EncServ 0x40 #define SPC2_SID_BQueue 0x80 #define INQ_DATA_TQ_ENABLED(iqd) \ ((SID_ANSI_REV(iqd) < SCSI_REV_SPC2)? ((iqd)->flags & SID_CmdQue) : \ (((iqd)->flags & SID_CmdQue) && !((iqd)->spc2_flags & SPC2_SID_BQueue)) || \ (!((iqd)->flags & SID_CmdQue) && ((iqd)->spc2_flags & SPC2_SID_BQueue))) u_int8_t flags; #define SID_SftRe 0x01 #define SID_CmdQue 0x02 #define SID_Linked 0x08 #define SID_Sync 0x10 #define SID_WBus16 0x20 #define SID_WBus32 0x40 #define SID_RelAdr 0x80 #define SID_VENDOR_SIZE 8 char vendor[SID_VENDOR_SIZE]; #define SID_PRODUCT_SIZE 16 char product[SID_PRODUCT_SIZE]; #define SID_REVISION_SIZE 4 char revision[SID_REVISION_SIZE]; /* * The following fields were taken from SCSI Primary Commands - 2 * (SPC-2) Revision 14, Dated 11 November 1999 */ #define SID_VENDOR_SPECIFIC_0_SIZE 20 u_int8_t vendor_specific0[SID_VENDOR_SPECIFIC_0_SIZE]; /* * An extension of SCSI Parallel Specific Values */ #define SID_SPI_IUS 0x01 #define SID_SPI_QAS 0x02 #define SID_SPI_CLOCK_ST 0x00 #define SID_SPI_CLOCK_DT 0x04 #define SID_SPI_CLOCK_DT_ST 0x0C #define SID_SPI_MASK 0x0F u_int8_t spi3data; u_int8_t reserved2; /* * Version Descriptors, stored 2 byte values. */ u_int8_t version1[2]; u_int8_t version2[2]; u_int8_t version3[2]; u_int8_t version4[2]; u_int8_t version5[2]; u_int8_t version6[2]; u_int8_t version7[2]; u_int8_t version8[2]; u_int8_t reserved3[22]; #define SID_VENDOR_SPECIFIC_1_SIZE 160 u_int8_t vendor_specific1[SID_VENDOR_SPECIFIC_1_SIZE]; }; /* * This structure is more suited to initiator operation, because the * maximum number of supported pages is already allocated. */ struct scsi_vpd_supported_page_list { u_int8_t device; u_int8_t page_code; #define SVPD_SUPPORTED_PAGE_LIST 0x00 #define SVPD_SUPPORTED_PAGES_HDR_LEN 4 u_int8_t reserved; u_int8_t length; /* number of VPD entries */ #define SVPD_SUPPORTED_PAGES_SIZE 251 u_int8_t list[SVPD_SUPPORTED_PAGES_SIZE]; }; /* * This structure is more suited to target operation, because the * number of supported pages is left to the user to allocate. */ struct scsi_vpd_supported_pages { u_int8_t device; u_int8_t page_code; u_int8_t reserved; #define SVPD_SUPPORTED_PAGES 0x00 u_int8_t length; u_int8_t page_list[0]; }; struct scsi_vpd_unit_serial_number { u_int8_t device; u_int8_t page_code; #define SVPD_UNIT_SERIAL_NUMBER 0x80 u_int8_t reserved; u_int8_t length; /* serial number length */ #define SVPD_SERIAL_NUM_SIZE 251 u_int8_t serial_num[SVPD_SERIAL_NUM_SIZE]; }; struct scsi_vpd_device_id { u_int8_t device; u_int8_t page_code; #define SVPD_DEVICE_ID 0x83 #define SVPD_DEVICE_ID_MAX_SIZE 252 #define SVPD_DEVICE_ID_HDR_LEN \ __offsetof(struct scsi_vpd_device_id, desc_list) u_int8_t length[2]; u_int8_t desc_list[]; }; struct scsi_vpd_id_descriptor { u_int8_t proto_codeset; /* * See the SCSI_PROTO definitions above for the protocols. */ #define SVPD_ID_PROTO_SHIFT 4 #define SVPD_ID_CODESET_BINARY 0x01 #define SVPD_ID_CODESET_ASCII 0x02 #define SVPD_ID_CODESET_UTF8 0x03 #define SVPD_ID_CODESET_MASK 0x0f u_int8_t id_type; #define SVPD_ID_PIV 0x80 #define SVPD_ID_ASSOC_LUN 0x00 #define SVPD_ID_ASSOC_PORT 0x10 #define SVPD_ID_ASSOC_TARGET 0x20 #define SVPD_ID_ASSOC_MASK 0x30 #define SVPD_ID_TYPE_VENDOR 0x00 #define SVPD_ID_TYPE_T10 0x01 #define SVPD_ID_TYPE_EUI64 0x02 #define SVPD_ID_TYPE_NAA 0x03 #define SVPD_ID_TYPE_RELTARG 0x04 #define SVPD_ID_TYPE_TPORTGRP 0x05 #define SVPD_ID_TYPE_LUNGRP 0x06 #define SVPD_ID_TYPE_MD5_LUN_ID 0x07 #define SVPD_ID_TYPE_SCSI_NAME 0x08 #define SVPD_ID_TYPE_MASK 0x0f u_int8_t reserved; u_int8_t length; #define SVPD_DEVICE_ID_DESC_HDR_LEN \ __offsetof(struct scsi_vpd_id_descriptor, identifier) u_int8_t identifier[]; }; struct scsi_vpd_id_t10 { u_int8_t vendor[8]; u_int8_t vendor_spec_id[0]; }; struct scsi_vpd_id_eui64 { u_int8_t ieee_company_id[3]; u_int8_t extension_id[5]; }; struct scsi_vpd_id_naa_basic { uint8_t naa; /* big endian, packed: uint8_t naa : 4; uint8_t naa_desig : 4; */ #define SVPD_ID_NAA_NAA_SHIFT 4 #define SVPD_ID_NAA_IEEE_EXT 0x02 #define SVPD_ID_NAA_LOCAL_REG 0x03 #define SVPD_ID_NAA_IEEE_REG 0x05 #define SVPD_ID_NAA_IEEE_REG_EXT 0x06 uint8_t naa_data[]; }; struct scsi_vpd_id_naa_ieee_extended_id { uint8_t naa; uint8_t vendor_specific_id_a; uint8_t ieee_company_id[3]; uint8_t vendor_specific_id_b[4]; }; struct scsi_vpd_id_naa_local_reg { uint8_t naa; uint8_t local_value[7]; }; struct scsi_vpd_id_naa_ieee_reg { uint8_t naa; uint8_t reg_value[7]; /* big endian, packed: uint8_t naa_basic : 4; uint8_t ieee_company_id_0 : 4; uint8_t ieee_company_id_1[2]; uint8_t ieee_company_id_2 : 4; uint8_t vendor_specific_id_0 : 4; uint8_t vendor_specific_id_1[4]; */ }; struct scsi_vpd_id_naa_ieee_reg_extended { uint8_t naa; uint8_t reg_value[15]; /* big endian, packed: uint8_t naa_basic : 4; uint8_t ieee_company_id_0 : 4; uint8_t ieee_company_id_1[2]; uint8_t ieee_company_id_2 : 4; uint8_t vendor_specific_id_0 : 4; uint8_t vendor_specific_id_1[4]; uint8_t vendor_specific_id_ext[8]; */ }; struct scsi_vpd_id_rel_trgt_port_id { uint8_t obsolete[2]; uint8_t rel_trgt_port_id[2]; }; struct scsi_vpd_id_trgt_port_grp_id { uint8_t reserved[2]; uint8_t trgt_port_grp[2]; }; struct scsi_vpd_id_lun_grp_id { uint8_t reserved[2]; uint8_t log_unit_grp[2]; }; struct scsi_vpd_id_md5_lun_id { uint8_t lun_id[16]; }; struct scsi_vpd_id_scsi_name { uint8_t name_string[256]; }; struct scsi_service_action_in { uint8_t opcode; uint8_t service_action; uint8_t action_dependent[13]; uint8_t control; }; +struct scsi_vpd_extended_inquiry_data +{ + uint8_t device; + uint8_t page_code; +#define SVPD_EXTENDED_INQUIRY_DATA 0x86 + uint8_t reserved; + uint8_t page_length; + uint8_t flags1; +#define SVPD_EID_AM 0xC0 +#define SVPD_EID_SPT 0x38 +#define SVPD_EID_SPT_1 0x00 +#define SVPD_EID_SPT_12 0x08 +#define SVPD_EID_SPT_2 0x10 +#define SVPD_EID_SPT_13 0x18 +#define SVPD_EID_SPT_3 0x20 +#define SVPD_EID_SPT_23 0x28 +#define SVPD_EID_SPT_123 0x38 +#define SVPD_EID_GRD_CHK 0x04 +#define SVPD_EID_APP_CHK 0x02 +#define SVPD_EID_REF_CHK 0x01 + uint8_t flags2; +#define SVPD_EID_UASK_SUP 0x20 +#define SVPD_EID_GROUP_SUP 0x10 +#define SVPD_EID_PRIOR_SUP 0x08 +#define SVPD_EID_HEADSUP 0x04 +#define SVPD_EID_ORDSUP 0x02 +#define SVPD_EID_SIMPSUP 0x01 + uint8_t flags3; +#define SVPD_EID_WU_SUP 0x08 +#define SVPD_EID_CRD_SUP 0x04 +#define SVPD_EID_NV_SUP 0x02 +#define SVPD_EID_V_SUP 0x01 + uint8_t flags4; +#define SVPD_EID_P_I_I_SUP 0x10 +#define SVPD_EID_LUICLT 0x01 + uint8_t flags5; +#define SVPD_EID_R_SUP 0x10 +#define SVPD_EID_CBCS 0x01 + uint8_t flags6; +#define SVPD_EID_MULTI_I_T_FW 0x0F + uint8_t est[2]; + uint8_t flags7; +#define SVPD_EID_POA_SUP 0x80 +#define SVPD_EID_HRA_SUP 0x80 +#define SVPD_EID_VSA_SUP 0x80 + uint8_t max_sense_length; + uint8_t reserved2[50]; +}; + struct scsi_vpd_mode_page_policy_descr { uint8_t page_code; uint8_t subpage_code; uint8_t policy; #define SVPD_MPP_SHARED 0x00 #define SVPD_MPP_PORT 0x01 #define SVPD_MPP_I_T 0x03 #define SVPD_MPP_MLUS 0x80 uint8_t reserved; }; struct scsi_vpd_mode_page_policy { uint8_t device; uint8_t page_code; #define SVPD_MODE_PAGE_POLICY 0x87 uint8_t page_length[2]; struct scsi_vpd_mode_page_policy_descr descr[0]; }; struct scsi_diag_page { uint8_t page_code; uint8_t page_specific_flags; uint8_t length[2]; uint8_t params[0]; }; struct scsi_vpd_port_designation { uint8_t reserved[2]; uint8_t relative_port_id[2]; uint8_t reserved2[2]; uint8_t initiator_transportid_length[2]; uint8_t initiator_transportid[0]; }; struct scsi_vpd_port_designation_cont { uint8_t reserved[2]; uint8_t target_port_descriptors_length[2]; struct scsi_vpd_id_descriptor target_port_descriptors[0]; }; struct scsi_vpd_scsi_ports { u_int8_t device; u_int8_t page_code; #define SVPD_SCSI_PORTS 0x88 u_int8_t page_length[2]; struct scsi_vpd_port_designation design[]; }; /* * ATA Information VPD Page based on * T10/2126-D Revision 04 */ #define SVPD_ATA_INFORMATION 0x89 struct scsi_vpd_tpc_descriptor { uint8_t desc_type[2]; uint8_t desc_length[2]; uint8_t parameters[]; }; struct scsi_vpd_tpc_descriptor_bdrl { uint8_t desc_type[2]; #define SVPD_TPC_BDRL 0x0000 uint8_t desc_length[2]; uint8_t vendor_specific[6]; uint8_t maximum_ranges[2]; uint8_t maximum_inactivity_timeout[4]; uint8_t default_inactivity_timeout[4]; uint8_t maximum_token_transfer_size[8]; uint8_t optimal_transfer_count[8]; }; struct scsi_vpd_tpc_descriptor_sc_descr { uint8_t opcode; uint8_t sa_length; uint8_t supported_service_actions[0]; }; struct scsi_vpd_tpc_descriptor_sc { uint8_t desc_type[2]; #define SVPD_TPC_SC 0x0001 uint8_t desc_length[2]; uint8_t list_length; struct scsi_vpd_tpc_descriptor_sc_descr descr[]; }; struct scsi_vpd_tpc_descriptor_pd { uint8_t desc_type[2]; #define SVPD_TPC_PD 0x0004 uint8_t desc_length[2]; uint8_t reserved[4]; uint8_t maximum_cscd_descriptor_count[2]; uint8_t maximum_segment_descriptor_count[2]; uint8_t maximum_descriptor_list_length[4]; uint8_t maximum_inline_data_length[4]; uint8_t reserved2[12]; }; struct scsi_vpd_tpc_descriptor_sd { uint8_t desc_type[2]; #define SVPD_TPC_SD 0x0008 uint8_t desc_length[2]; uint8_t list_length; uint8_t supported_descriptor_codes[]; }; struct scsi_vpd_tpc_descriptor_sdid { uint8_t desc_type[2]; #define SVPD_TPC_SDID 0x000C uint8_t desc_length[2]; uint8_t list_length[2]; uint8_t supported_descriptor_ids[]; }; struct scsi_vpd_tpc_descriptor_rtf_block { uint8_t type_format; #define SVPD_TPC_RTF_BLOCK 0x00 uint8_t reserved; uint8_t desc_length[2]; uint8_t reserved2[2]; uint8_t optimal_length_granularity[2]; uint8_t maximum_bytes[8]; uint8_t optimal_bytes[8]; uint8_t optimal_bytes_to_token_per_segment[8]; uint8_t optimal_bytes_from_token_per_segment[8]; uint8_t reserved3[8]; }; struct scsi_vpd_tpc_descriptor_rtf { uint8_t desc_type[2]; #define SVPD_TPC_RTF 0x0106 uint8_t desc_length[2]; uint8_t remote_tokens; uint8_t reserved[11]; uint8_t minimum_token_lifetime[4]; uint8_t maximum_token_lifetime[4]; uint8_t maximum_token_inactivity_timeout[4]; uint8_t reserved2[18]; uint8_t type_specific_features_length[2]; uint8_t type_specific_features[0]; }; struct scsi_vpd_tpc_descriptor_srtd { uint8_t rod_type[4]; uint8_t flags; #define SVPD_TPC_SRTD_TOUT 0x01 #define SVPD_TPC_SRTD_TIN 0x02 #define SVPD_TPC_SRTD_ECPY 0x80 uint8_t reserved; uint8_t preference_indicator[2]; uint8_t reserved2[56]; }; struct scsi_vpd_tpc_descriptor_srt { uint8_t desc_type[2]; #define SVPD_TPC_SRT 0x0108 uint8_t desc_length[2]; uint8_t reserved[2]; uint8_t rod_type_descriptors_length[2]; uint8_t rod_type_descriptors[0]; }; struct scsi_vpd_tpc_descriptor_gco { uint8_t desc_type[2]; #define SVPD_TPC_GCO 0x8001 uint8_t desc_length[2]; uint8_t total_concurrent_copies[4]; uint8_t maximum_identified_concurrent_copies[4]; uint8_t maximum_segment_length[4]; uint8_t data_segment_granularity; uint8_t inline_data_granularity; uint8_t reserved[18]; }; struct scsi_vpd_tpc { uint8_t device; uint8_t page_code; #define SVPD_SCSI_TPC 0x8F uint8_t page_length[2]; struct scsi_vpd_tpc_descriptor descr[]; }; /* * Block Device Characteristics VPD Page based on * T10/1799-D Revision 31 */ struct scsi_vpd_block_characteristics { u_int8_t device; u_int8_t page_code; #define SVPD_BDC 0xB1 u_int8_t page_length[2]; u_int8_t medium_rotation_rate[2]; #define SVPD_BDC_RATE_NOT_REPORTED 0x00 #define SVPD_BDC_RATE_NON_ROTATING 0x01 u_int8_t reserved1; u_int8_t nominal_form_factor; #define SVPD_BDC_FORM_NOT_REPORTED 0x00 #define SVPD_BDC_FORM_5_25INCH 0x01 #define SVPD_BDC_FORM_3_5INCH 0x02 #define SVPD_BDC_FORM_2_5INCH 0x03 #define SVPD_BDC_FORM_1_5INCH 0x04 #define SVPD_BDC_FORM_LESSTHAN_1_5INCH 0x05 u_int8_t reserved2[56]; }; /* * Block Device Characteristics VPD Page */ struct scsi_vpd_block_device_characteristics { uint8_t device; uint8_t page_code; #define SVPD_BDC 0xB1 uint8_t page_length[2]; uint8_t medium_rotation_rate[2]; #define SVPD_NOT_REPORTED 0x0000 #define SVPD_NON_ROTATING 0x0001 uint8_t product_type; uint8_t wab_wac_ff; uint8_t flags; #define SVPD_VBULS 0x01 #define SVPD_FUAB 0x02 #define SVPD_HAW_ZBC 0x10 uint8_t reserved[55]; }; /* * Logical Block Provisioning VPD Page based on * T10/1799-D Revision 31 */ struct scsi_vpd_logical_block_prov { u_int8_t device; u_int8_t page_code; #define SVPD_LBP 0xB2 u_int8_t page_length[2]; #define SVPD_LBP_PL_BASIC 0x04 u_int8_t threshold_exponent; u_int8_t flags; #define SVPD_LBP_UNMAP 0x80 #define SVPD_LBP_WS16 0x40 #define SVPD_LBP_WS10 0x20 #define SVPD_LBP_RZ 0x04 #define SVPD_LBP_ANC_SUP 0x02 #define SVPD_LBP_DP 0x01 u_int8_t prov_type; #define SVPD_LBP_RESOURCE 0x01 #define SVPD_LBP_THIN 0x02 u_int8_t reserved; /* * Provisioning Group Descriptor can be here if SVPD_LBP_DP is set * Its size can be determined from page_length - 4 */ }; /* * Block Limits VDP Page based on * T10/1799-D Revision 31 */ struct scsi_vpd_block_limits { u_int8_t device; u_int8_t page_code; #define SVPD_BLOCK_LIMITS 0xB0 u_int8_t page_length[2]; #define SVPD_BL_PL_BASIC 0x10 #define SVPD_BL_PL_TP 0x3C u_int8_t reserved1; u_int8_t max_cmp_write_len; u_int8_t opt_txfer_len_grain[2]; u_int8_t max_txfer_len[4]; u_int8_t opt_txfer_len[4]; u_int8_t max_prefetch[4]; u_int8_t max_unmap_lba_cnt[4]; u_int8_t max_unmap_blk_cnt[4]; u_int8_t opt_unmap_grain[4]; u_int8_t unmap_grain_align[4]; u_int8_t max_write_same_length[8]; u_int8_t reserved2[20]; }; struct scsi_read_capacity { u_int8_t opcode; u_int8_t byte2; #define SRC_RELADR 0x01 u_int8_t addr[4]; u_int8_t unused[2]; u_int8_t pmi; #define SRC_PMI 0x01 u_int8_t control; }; struct scsi_read_capacity_16 { uint8_t opcode; #define SRC16_SERVICE_ACTION 0x10 uint8_t service_action; uint8_t addr[8]; uint8_t alloc_len[4]; #define SRC16_PMI 0x01 #define SRC16_RELADR 0x02 uint8_t reladr; uint8_t control; }; struct scsi_read_capacity_data { u_int8_t addr[4]; u_int8_t length[4]; }; struct scsi_read_capacity_data_long { uint8_t addr[8]; uint8_t length[4]; #define SRC16_PROT_EN 0x01 #define SRC16_P_TYPE 0x0e #define SRC16_PTYPE_1 0x00 #define SRC16_PTYPE_2 0x02 #define SRC16_PTYPE_3 0x04 uint8_t prot; #define SRC16_LBPPBE 0x0f #define SRC16_PI_EXPONENT 0xf0 #define SRC16_PI_EXPONENT_SHIFT 4 uint8_t prot_lbppbe; #define SRC16_LALBA 0x3f #define SRC16_LBPRZ 0x40 #define SRC16_LBPME 0x80 /* * Alternate versions of these macros that are intended for use on a 16-bit * version of the lalba_lbp field instead of the array of 2 8 bit numbers. */ #define SRC16_LALBA_A 0x3fff #define SRC16_LBPRZ_A 0x4000 #define SRC16_LBPME_A 0x8000 uint8_t lalba_lbp[2]; uint8_t reserved[16]; }; struct scsi_report_luns { uint8_t opcode; uint8_t reserved1; #define RPL_REPORT_DEFAULT 0x00 #define RPL_REPORT_WELLKNOWN 0x01 #define RPL_REPORT_ALL 0x02 uint8_t select_report; uint8_t reserved2[3]; uint8_t length[4]; uint8_t reserved3; uint8_t control; }; struct scsi_report_luns_lundata { uint8_t lundata[8]; #define RPL_LUNDATA_PERIPH_BUS_MASK 0x3f #define RPL_LUNDATA_FLAT_LUN_MASK 0x3f #define RPL_LUNDATA_FLAT_LUN_BITS 0x06 #define RPL_LUNDATA_LUN_TARG_MASK 0x3f #define RPL_LUNDATA_LUN_BUS_MASK 0xe0 #define RPL_LUNDATA_LUN_LUN_MASK 0x1f #define RPL_LUNDATA_EXT_LEN_MASK 0x30 #define RPL_LUNDATA_EXT_EAM_MASK 0x0f #define RPL_LUNDATA_EXT_EAM_WK 0x01 #define RPL_LUNDATA_EXT_EAM_NOT_SPEC 0x0f #define RPL_LUNDATA_ATYP_MASK 0xc0 /* MBZ for type 0 lun */ #define RPL_LUNDATA_ATYP_PERIPH 0x00 #define RPL_LUNDATA_ATYP_FLAT 0x40 #define RPL_LUNDATA_ATYP_LUN 0x80 #define RPL_LUNDATA_ATYP_EXTLUN 0xc0 }; struct scsi_report_luns_data { u_int8_t length[4]; /* length of LUN inventory, in bytes */ u_int8_t reserved[4]; /* unused */ /* * LUN inventory- we only support the type zero form for now. */ struct scsi_report_luns_lundata luns[0]; }; struct scsi_target_group { uint8_t opcode; uint8_t service_action; #define STG_PDF_MASK 0xe0 #define STG_PDF_LENGTH 0x00 #define STG_PDF_EXTENDED 0x20 uint8_t reserved1[4]; uint8_t length[4]; uint8_t reserved2; uint8_t control; }; struct scsi_target_port_descriptor { uint8_t reserved[2]; uint8_t relative_target_port_identifier[2]; uint8_t desc_list[]; }; struct scsi_target_port_group_descriptor { uint8_t pref_state; #define TPG_PRIMARY 0x80 #define TPG_ASYMMETRIC_ACCESS_STATE_MASK 0xf #define TPG_ASYMMETRIC_ACCESS_OPTIMIZED 0x0 #define TPG_ASYMMETRIC_ACCESS_NONOPTIMIZED 0x1 #define TPG_ASYMMETRIC_ACCESS_STANDBY 0x2 #define TPG_ASYMMETRIC_ACCESS_UNAVAILABLE 0x3 #define TPG_ASYMMETRIC_ACCESS_LBA_DEPENDENT 0x4 #define TPG_ASYMMETRIC_ACCESS_OFFLINE 0xE #define TPG_ASYMMETRIC_ACCESS_TRANSITIONING 0xF uint8_t support; #define TPG_AO_SUP 0x01 #define TPG_AN_SUP 0x02 #define TPG_S_SUP 0x04 #define TPG_U_SUP 0x08 #define TPG_LBD_SUP 0x10 #define TPG_O_SUP 0x40 #define TPG_T_SUP 0x80 uint8_t target_port_group[2]; uint8_t reserved; uint8_t status; #define TPG_UNAVLBL 0 #define TPG_SET_BY_STPG 0x01 #define TPG_IMPLICIT 0x02 uint8_t vendor_specific; uint8_t target_port_count; struct scsi_target_port_descriptor descriptors[]; }; struct scsi_target_group_data { uint8_t length[4]; /* length of returned data, in bytes */ struct scsi_target_port_group_descriptor groups[]; }; struct scsi_target_group_data_extended { uint8_t length[4]; /* length of returned data, in bytes */ uint8_t format_type; /* STG_PDF_LENGTH or STG_PDF_EXTENDED */ uint8_t implicit_transition_time; uint8_t reserved[2]; struct scsi_target_port_group_descriptor groups[]; }; typedef enum { SSD_TYPE_NONE, SSD_TYPE_FIXED, SSD_TYPE_DESC } scsi_sense_data_type; typedef enum { SSD_ELEM_NONE, SSD_ELEM_SKIP, SSD_ELEM_DESC, SSD_ELEM_SKS, SSD_ELEM_COMMAND, SSD_ELEM_INFO, SSD_ELEM_FRU, SSD_ELEM_STREAM, SSD_ELEM_MAX } scsi_sense_elem_type; struct scsi_sense_data { uint8_t error_code; /* * SPC-4 says that the maximum length of sense data is 252 bytes. * So this structure is exactly 252 bytes log. */ #define SSD_FULL_SIZE 252 uint8_t sense_buf[SSD_FULL_SIZE - 1]; /* * XXX KDM is this still a reasonable minimum size? */ #define SSD_MIN_SIZE 18 /* * Maximum value for the extra_len field in the sense data. */ #define SSD_EXTRA_MAX 244 }; /* * Fixed format sense data. */ struct scsi_sense_data_fixed { u_int8_t error_code; #define SSD_ERRCODE 0x7F #define SSD_CURRENT_ERROR 0x70 #define SSD_DEFERRED_ERROR 0x71 #define SSD_ERRCODE_VALID 0x80 u_int8_t segment; u_int8_t flags; #define SSD_KEY 0x0F #define SSD_KEY_NO_SENSE 0x00 #define SSD_KEY_RECOVERED_ERROR 0x01 #define SSD_KEY_NOT_READY 0x02 #define SSD_KEY_MEDIUM_ERROR 0x03 #define SSD_KEY_HARDWARE_ERROR 0x04 #define SSD_KEY_ILLEGAL_REQUEST 0x05 #define SSD_KEY_UNIT_ATTENTION 0x06 #define SSD_KEY_DATA_PROTECT 0x07 #define SSD_KEY_BLANK_CHECK 0x08 #define SSD_KEY_Vendor_Specific 0x09 #define SSD_KEY_COPY_ABORTED 0x0a #define SSD_KEY_ABORTED_COMMAND 0x0b #define SSD_KEY_EQUAL 0x0c #define SSD_KEY_VOLUME_OVERFLOW 0x0d #define SSD_KEY_MISCOMPARE 0x0e #define SSD_KEY_COMPLETED 0x0f #define SSD_ILI 0x20 #define SSD_EOM 0x40 #define SSD_FILEMARK 0x80 u_int8_t info[4]; u_int8_t extra_len; u_int8_t cmd_spec_info[4]; u_int8_t add_sense_code; u_int8_t add_sense_code_qual; u_int8_t fru; u_int8_t sense_key_spec[3]; #define SSD_SCS_VALID 0x80 #define SSD_FIELDPTR_CMD 0x40 #define SSD_BITPTR_VALID 0x08 #define SSD_BITPTR_VALUE 0x07 u_int8_t extra_bytes[14]; #define SSD_FIXED_IS_PRESENT(sense, length, field) \ ((length >= (offsetof(struct scsi_sense_data_fixed, field) + \ sizeof(sense->field))) ? 1 :0) #define SSD_FIXED_IS_FILLED(sense, field) \ ((((offsetof(struct scsi_sense_data_fixed, field) + \ sizeof(sense->field)) - \ (offsetof(struct scsi_sense_data_fixed, extra_len) + \ sizeof(sense->extra_len))) <= sense->extra_len) ? 1 : 0) }; /* * Descriptor format sense data definitions. * Introduced in SPC-3. */ struct scsi_sense_data_desc { uint8_t error_code; #define SSD_DESC_CURRENT_ERROR 0x72 #define SSD_DESC_DEFERRED_ERROR 0x73 uint8_t sense_key; uint8_t add_sense_code; uint8_t add_sense_code_qual; uint8_t reserved[3]; /* * Note that SPC-4, section 4.5.2.1 says that the extra_len field * must be less than or equal to 244. */ uint8_t extra_len; uint8_t sense_desc[0]; #define SSD_DESC_IS_PRESENT(sense, length, field) \ ((length >= (offsetof(struct scsi_sense_data_desc, field) + \ sizeof(sense->field))) ? 1 :0) }; struct scsi_sense_desc_header { uint8_t desc_type; uint8_t length; }; /* * The information provide in the Information descriptor is device type or * command specific information, and defined in a command standard. * * Note that any changes to the field names or positions in this structure, * even reserved fields, should be accompanied by an examination of the * code in ctl_set_sense() that uses them. * * Maximum descriptors allowed: 1 (as of SPC-4) */ struct scsi_sense_info { uint8_t desc_type; #define SSD_DESC_INFO 0x00 uint8_t length; uint8_t byte2; #define SSD_INFO_VALID 0x80 uint8_t reserved; uint8_t info[8]; }; /* * Command-specific information depends on the command for which the * reported condition occured. * * Note that any changes to the field names or positions in this structure, * even reserved fields, should be accompanied by an examination of the * code in ctl_set_sense() that uses them. * * Maximum descriptors allowed: 1 (as of SPC-4) */ struct scsi_sense_command { uint8_t desc_type; #define SSD_DESC_COMMAND 0x01 uint8_t length; uint8_t reserved[2]; uint8_t command_info[8]; }; /* * Sense key specific descriptor. The sense key specific data format * depends on the sense key in question. * * Maximum descriptors allowed: 1 (as of SPC-4) */ struct scsi_sense_sks { uint8_t desc_type; #define SSD_DESC_SKS 0x02 uint8_t length; uint8_t reserved1[2]; uint8_t sense_key_spec[3]; #define SSD_SKS_VALID 0x80 uint8_t reserved2; }; /* * This is used for the Illegal Request sense key (0x05) only. */ struct scsi_sense_sks_field { uint8_t byte0; #define SSD_SKS_FIELD_VALID 0x80 #define SSD_SKS_FIELD_CMD 0x40 #define SSD_SKS_BPV 0x08 #define SSD_SKS_BIT_VALUE 0x07 uint8_t field[2]; }; /* * This is used for the Hardware Error (0x04), Medium Error (0x03) and * Recovered Error (0x01) sense keys. */ struct scsi_sense_sks_retry { uint8_t byte0; #define SSD_SKS_RETRY_VALID 0x80 uint8_t actual_retry_count[2]; }; /* * Used with the NO Sense (0x00) or Not Ready (0x02) sense keys. */ struct scsi_sense_sks_progress { uint8_t byte0; #define SSD_SKS_PROGRESS_VALID 0x80 uint8_t progress[2]; #define SSD_SKS_PROGRESS_DENOM 0x10000 }; /* * Used with the Copy Aborted (0x0a) sense key. */ struct scsi_sense_sks_segment { uint8_t byte0; #define SSD_SKS_SEGMENT_VALID 0x80 #define SSD_SKS_SEGMENT_SD 0x20 #define SSD_SKS_SEGMENT_BPV 0x08 #define SSD_SKS_SEGMENT_BITPTR 0x07 uint8_t field[2]; }; /* * Used with the Unit Attention (0x06) sense key. * * This is currently used to indicate that the unit attention condition * queue has overflowed (when the overflow bit is set). */ struct scsi_sense_sks_overflow { uint8_t byte0; #define SSD_SKS_OVERFLOW_VALID 0x80 #define SSD_SKS_OVERFLOW_SET 0x01 uint8_t reserved[2]; }; /* * This specifies which component is associated with the sense data. There * is no standard meaning for the fru value. * * Maximum descriptors allowed: 1 (as of SPC-4) */ struct scsi_sense_fru { uint8_t desc_type; #define SSD_DESC_FRU 0x03 uint8_t length; uint8_t reserved; uint8_t fru; }; /* * Used for Stream commands, defined in SSC-4. * * Maximum descriptors allowed: 1 (as of SPC-4) */ struct scsi_sense_stream { uint8_t desc_type; #define SSD_DESC_STREAM 0x04 uint8_t length; uint8_t reserved; uint8_t byte3; #define SSD_DESC_STREAM_FM 0x80 #define SSD_DESC_STREAM_EOM 0x40 #define SSD_DESC_STREAM_ILI 0x20 }; /* * Used for Block commands, defined in SBC-3. * * This is currently (as of SBC-3) only used for the Incorrect Length * Indication (ILI) bit, which says that the data length requested in the * READ LONG or WRITE LONG command did not match the length of the logical * block. * * Maximum descriptors allowed: 1 (as of SPC-4) */ struct scsi_sense_block { uint8_t desc_type; #define SSD_DESC_BLOCK 0x05 uint8_t length; uint8_t reserved; uint8_t byte3; #define SSD_DESC_BLOCK_ILI 0x20 }; /* * Used for Object-Based Storage Devices (OSD-3). * * Maximum descriptors allowed: 1 (as of SPC-4) */ struct scsi_sense_osd_objid { uint8_t desc_type; #define SSD_DESC_OSD_OBJID 0x06 uint8_t length; uint8_t reserved[6]; /* * XXX KDM provide the bit definitions here? There are a lot of * them, and we don't have an OSD driver yet. */ uint8_t not_init_cmds[4]; uint8_t completed_cmds[4]; uint8_t partition_id[8]; uint8_t object_id[8]; }; /* * Used for Object-Based Storage Devices (OSD-3). * * Maximum descriptors allowed: 1 (as of SPC-4) */ struct scsi_sense_osd_integrity { uint8_t desc_type; #define SSD_DESC_OSD_INTEGRITY 0x07 uint8_t length; uint8_t integ_check_val[32]; }; /* * Used for Object-Based Storage Devices (OSD-3). * * Maximum descriptors allowed: 1 (as of SPC-4) */ struct scsi_sense_osd_attr_id { uint8_t desc_type; #define SSD_DESC_OSD_ATTR_ID 0x08 uint8_t length; uint8_t reserved[2]; uint8_t attr_desc[0]; }; /* * Used with Sense keys No Sense (0x00) and Not Ready (0x02). * * Maximum descriptors allowed: 32 (as of SPC-4) */ struct scsi_sense_progress { uint8_t desc_type; #define SSD_DESC_PROGRESS 0x0a uint8_t length; uint8_t sense_key; uint8_t add_sense_code; uint8_t add_sense_code_qual; uint8_t reserved; uint8_t progress[2]; }; /* * This is typically forwarded as the result of an EXTENDED COPY command. * * Maximum descriptors allowed: 2 (as of SPC-4) */ struct scsi_sense_forwarded { uint8_t desc_type; #define SSD_DESC_FORWARDED 0x0c uint8_t length; uint8_t byte2; #define SSD_FORWARDED_FSDT 0x80 #define SSD_FORWARDED_SDS_MASK 0x0f #define SSD_FORWARDED_SDS_UNK 0x00 #define SSD_FORWARDED_SDS_EXSRC 0x01 #define SSD_FORWARDED_SDS_EXDST 0x02 }; /* * Vendor-specific sense descriptor. The desc_type field will be in the * range bewteen MIN and MAX inclusive. */ struct scsi_sense_vendor { uint8_t desc_type; #define SSD_DESC_VENDOR_MIN 0x80 #define SSD_DESC_VENDOR_MAX 0xff uint8_t length; uint8_t data[0]; }; struct scsi_mode_header_6 { u_int8_t data_length; /* Sense data length */ u_int8_t medium_type; u_int8_t dev_spec; u_int8_t blk_desc_len; }; struct scsi_mode_header_10 { u_int8_t data_length[2];/* Sense data length */ u_int8_t medium_type; u_int8_t dev_spec; u_int8_t unused[2]; u_int8_t blk_desc_len[2]; }; struct scsi_mode_page_header { u_int8_t page_code; #define SMPH_PS 0x80 #define SMPH_SPF 0x40 #define SMPH_PC_MASK 0x3f u_int8_t page_length; }; struct scsi_mode_page_header_sp { uint8_t page_code; uint8_t subpage; uint8_t page_length[2]; }; struct scsi_mode_blk_desc { u_int8_t density; u_int8_t nblocks[3]; u_int8_t reserved; u_int8_t blklen[3]; }; #define SCSI_DEFAULT_DENSITY 0x00 /* use 'default' density */ #define SCSI_SAME_DENSITY 0x7f /* use 'same' density- >= SCSI-2 only */ /* * Status Byte */ #define SCSI_STATUS_OK 0x00 #define SCSI_STATUS_CHECK_COND 0x02 #define SCSI_STATUS_COND_MET 0x04 #define SCSI_STATUS_BUSY 0x08 #define SCSI_STATUS_INTERMED 0x10 #define SCSI_STATUS_INTERMED_COND_MET 0x14 #define SCSI_STATUS_RESERV_CONFLICT 0x18 #define SCSI_STATUS_CMD_TERMINATED 0x22 /* Obsolete in SAM-2 */ #define SCSI_STATUS_QUEUE_FULL 0x28 #define SCSI_STATUS_ACA_ACTIVE 0x30 #define SCSI_STATUS_TASK_ABORTED 0x40 struct scsi_inquiry_pattern { u_int8_t type; u_int8_t media_type; #define SIP_MEDIA_REMOVABLE 0x01 #define SIP_MEDIA_FIXED 0x02 const char *vendor; const char *product; const char *revision; }; struct scsi_static_inquiry_pattern { u_int8_t type; u_int8_t media_type; char vendor[SID_VENDOR_SIZE+1]; char product[SID_PRODUCT_SIZE+1]; char revision[SID_REVISION_SIZE+1]; }; struct scsi_sense_quirk_entry { struct scsi_inquiry_pattern inq_pat; int num_sense_keys; int num_ascs; struct sense_key_table_entry *sense_key_info; struct asc_table_entry *asc_info; }; struct sense_key_table_entry { u_int8_t sense_key; u_int32_t action; const char *desc; }; struct asc_table_entry { u_int8_t asc; u_int8_t ascq; u_int32_t action; const char *desc; }; struct op_table_entry { u_int8_t opcode; u_int32_t opmask; const char *desc; }; struct scsi_op_quirk_entry { struct scsi_inquiry_pattern inq_pat; int num_ops; struct op_table_entry *op_table; }; typedef enum { SSS_FLAG_NONE = 0x00, SSS_FLAG_PRINT_COMMAND = 0x01 } scsi_sense_string_flags; struct scsi_nv { const char *name; uint64_t value; }; typedef enum { SCSI_NV_FOUND, SCSI_NV_AMBIGUOUS, SCSI_NV_NOT_FOUND } scsi_nv_status; typedef enum { SCSI_NV_FLAG_NONE = 0x00, SCSI_NV_FLAG_IG_CASE = 0x01 /* Case insensitive comparison */ } scsi_nv_flags; struct ccb_scsiio; struct cam_periph; union ccb; #ifndef _KERNEL struct cam_device; #endif extern const char *scsi_sense_key_text[]; struct sbuf; __BEGIN_DECLS void scsi_sense_desc(int sense_key, int asc, int ascq, struct scsi_inquiry_data *inq_data, const char **sense_key_desc, const char **asc_desc); scsi_sense_action scsi_error_action(struct ccb_scsiio* csio, struct scsi_inquiry_data *inq_data, u_int32_t sense_flags); const char * scsi_status_string(struct ccb_scsiio *csio); void scsi_desc_iterate(struct scsi_sense_data_desc *sense, u_int sense_len, int (*iter_func)(struct scsi_sense_data_desc *sense, u_int, struct scsi_sense_desc_header *, void *), void *arg); uint8_t *scsi_find_desc(struct scsi_sense_data_desc *sense, u_int sense_len, uint8_t desc_type); void scsi_set_sense_data(struct scsi_sense_data *sense_data, scsi_sense_data_type sense_format, int current_error, int sense_key, int asc, int ascq, ...) ; void scsi_set_sense_data_va(struct scsi_sense_data *sense_data, scsi_sense_data_type sense_format, int current_error, int sense_key, int asc, int ascq, va_list ap); int scsi_get_sense_info(struct scsi_sense_data *sense_data, u_int sense_len, uint8_t info_type, uint64_t *info, int64_t *signed_info); int scsi_get_sks(struct scsi_sense_data *sense_data, u_int sense_len, uint8_t *sks); int scsi_get_block_info(struct scsi_sense_data *sense_data, u_int sense_len, struct scsi_inquiry_data *inq_data, uint8_t *block_bits); int scsi_get_stream_info(struct scsi_sense_data *sense_data, u_int sense_len, struct scsi_inquiry_data *inq_data, uint8_t *stream_bits); void scsi_info_sbuf(struct sbuf *sb, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, uint64_t info); void scsi_command_sbuf(struct sbuf *sb, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, uint64_t csi); void scsi_progress_sbuf(struct sbuf *sb, uint16_t progress); int scsi_sks_sbuf(struct sbuf *sb, int sense_key, uint8_t *sks); void scsi_fru_sbuf(struct sbuf *sb, uint64_t fru); void scsi_stream_sbuf(struct sbuf *sb, uint8_t stream_bits, uint64_t info); void scsi_block_sbuf(struct sbuf *sb, uint8_t block_bits, uint64_t info); void scsi_sense_info_sbuf(struct sbuf *sb, struct scsi_sense_data *sense, u_int sense_len, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, struct scsi_sense_desc_header *header); void scsi_sense_command_sbuf(struct sbuf *sb, struct scsi_sense_data *sense, u_int sense_len, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, struct scsi_sense_desc_header *header); void scsi_sense_sks_sbuf(struct sbuf *sb, struct scsi_sense_data *sense, u_int sense_len, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, struct scsi_sense_desc_header *header); void scsi_sense_fru_sbuf(struct sbuf *sb, struct scsi_sense_data *sense, u_int sense_len, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, struct scsi_sense_desc_header *header); void scsi_sense_stream_sbuf(struct sbuf *sb, struct scsi_sense_data *sense, u_int sense_len, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, struct scsi_sense_desc_header *header); void scsi_sense_block_sbuf(struct sbuf *sb, struct scsi_sense_data *sense, u_int sense_len, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, struct scsi_sense_desc_header *header); void scsi_sense_progress_sbuf(struct sbuf *sb, struct scsi_sense_data *sense, u_int sense_len, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, struct scsi_sense_desc_header *header); void scsi_sense_generic_sbuf(struct sbuf *sb, struct scsi_sense_data *sense, u_int sense_len, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, struct scsi_sense_desc_header *header); void scsi_sense_desc_sbuf(struct sbuf *sb, struct scsi_sense_data *sense, u_int sense_len, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, struct scsi_sense_desc_header *header); scsi_sense_data_type scsi_sense_type(struct scsi_sense_data *sense_data); void scsi_sense_only_sbuf(struct scsi_sense_data *sense, u_int sense_len, struct sbuf *sb, char *path_str, struct scsi_inquiry_data *inq_data, uint8_t *cdb, int cdb_len); #ifdef _KERNEL int scsi_command_string(struct ccb_scsiio *csio, struct sbuf *sb); int scsi_sense_sbuf(struct ccb_scsiio *csio, struct sbuf *sb, scsi_sense_string_flags flags); char * scsi_sense_string(struct ccb_scsiio *csio, char *str, int str_len); void scsi_sense_print(struct ccb_scsiio *csio); int scsi_vpd_supported_page(struct cam_periph *periph, uint8_t page_id); #else /* _KERNEL */ int scsi_command_string(struct cam_device *device, struct ccb_scsiio *csio, struct sbuf *sb); int scsi_sense_sbuf(struct cam_device *device, struct ccb_scsiio *csio, struct sbuf *sb, scsi_sense_string_flags flags); char * scsi_sense_string(struct cam_device *device, struct ccb_scsiio *csio, char *str, int str_len); void scsi_sense_print(struct cam_device *device, struct ccb_scsiio *csio, FILE *ofile); #endif /* _KERNEL */ const char * scsi_op_desc(u_int16_t opcode, struct scsi_inquiry_data *inq_data); char * scsi_cdb_string(u_int8_t *cdb_ptr, char *cdb_string, size_t len); void scsi_print_inquiry(struct scsi_inquiry_data *inq_data); void scsi_print_inquiry_short(struct scsi_inquiry_data *inq_data); u_int scsi_calc_syncsrate(u_int period_factor); u_int scsi_calc_syncparam(u_int period); typedef int (*scsi_devid_checkfn_t)(uint8_t *); int scsi_devid_is_naa_ieee_reg(uint8_t *bufp); int scsi_devid_is_sas_target(uint8_t *bufp); int scsi_devid_is_lun_eui64(uint8_t *bufp); int scsi_devid_is_lun_naa(uint8_t *bufp); int scsi_devid_is_lun_name(uint8_t *bufp); int scsi_devid_is_lun_t10(uint8_t *bufp); struct scsi_vpd_id_descriptor * scsi_get_devid(struct scsi_vpd_device_id *id, uint32_t len, scsi_devid_checkfn_t ck_fn); struct scsi_vpd_id_descriptor * scsi_get_devid_desc(struct scsi_vpd_id_descriptor *desc, uint32_t len, scsi_devid_checkfn_t ck_fn); int scsi_transportid_sbuf(struct sbuf *sb, struct scsi_transportid_header *hdr, uint32_t valid_len); const char * scsi_nv_to_str(struct scsi_nv *table, int num_table_entries, uint64_t value); scsi_nv_status scsi_get_nv(struct scsi_nv *table, int num_table_entries, char *name, int *table_entry, scsi_nv_flags flags); int scsi_parse_transportid_64bit(int proto_id, char *id_str, struct scsi_transportid_header **hdr, unsigned int *alloc_len, #ifdef _KERNEL struct malloc_type *type, int flags, #endif char *error_str, int error_str_len); int scsi_parse_transportid_spi(char *id_str, struct scsi_transportid_header **hdr, unsigned int *alloc_len, #ifdef _KERNEL struct malloc_type *type, int flags, #endif char *error_str, int error_str_len); int scsi_parse_transportid_rdma(char *id_str, struct scsi_transportid_header **hdr, unsigned int *alloc_len, #ifdef _KERNEL struct malloc_type *type, int flags, #endif char *error_str, int error_str_len); int scsi_parse_transportid_iscsi(char *id_str, struct scsi_transportid_header **hdr, unsigned int *alloc_len, #ifdef _KERNEL struct malloc_type *type, int flags, #endif char *error_str,int error_str_len); int scsi_parse_transportid_sop(char *id_str, struct scsi_transportid_header **hdr, unsigned int *alloc_len, #ifdef _KERNEL struct malloc_type *type, int flags, #endif char *error_str,int error_str_len); int scsi_parse_transportid(char *transportid_str, struct scsi_transportid_header **hdr, unsigned int *alloc_len, #ifdef _KERNEL struct malloc_type *type, int flags, #endif char *error_str, int error_str_len); void scsi_test_unit_ready(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t sense_len, u_int32_t timeout); void scsi_request_sense(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), void *data_ptr, u_int8_t dxfer_len, u_int8_t tag_action, u_int8_t sense_len, u_int32_t timeout); void scsi_inquiry(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t *inq_buf, u_int32_t inq_len, int evpd, u_int8_t page_code, u_int8_t sense_len, u_int32_t timeout); void scsi_mode_sense(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, int dbd, u_int8_t page_code, u_int8_t page, u_int8_t *param_buf, u_int32_t param_len, u_int8_t sense_len, u_int32_t timeout); void scsi_mode_sense_len(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, int dbd, u_int8_t page_code, u_int8_t page, u_int8_t *param_buf, u_int32_t param_len, int minimum_cmd_size, u_int8_t sense_len, u_int32_t timeout); void scsi_mode_select(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, int scsi_page_fmt, int save_pages, u_int8_t *param_buf, u_int32_t param_len, u_int8_t sense_len, u_int32_t timeout); void scsi_mode_select_len(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, int scsi_page_fmt, int save_pages, u_int8_t *param_buf, u_int32_t param_len, int minimum_cmd_size, u_int8_t sense_len, u_int32_t timeout); void scsi_log_sense(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t page_code, u_int8_t page, int save_pages, int ppc, u_int32_t paramptr, u_int8_t *param_buf, u_int32_t param_len, u_int8_t sense_len, u_int32_t timeout); void scsi_log_select(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t page_code, int save_pages, int pc_reset, u_int8_t *param_buf, u_int32_t param_len, u_int8_t sense_len, u_int32_t timeout); void scsi_prevent(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t action, u_int8_t sense_len, u_int32_t timeout); void scsi_read_capacity(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, struct scsi_read_capacity_data *, u_int8_t sense_len, u_int32_t timeout); void scsi_read_capacity_16(struct ccb_scsiio *csio, uint32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, uint64_t lba, int reladr, int pmi, uint8_t *rcap_buf, int rcap_buf_len, uint8_t sense_len, uint32_t timeout); void scsi_report_luns(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t select_report, struct scsi_report_luns_data *rpl_buf, u_int32_t alloc_len, u_int8_t sense_len, u_int32_t timeout); void scsi_report_target_group(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t pdf, void *buf, u_int32_t alloc_len, u_int8_t sense_len, u_int32_t timeout); void scsi_set_target_group(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, void *buf, u_int32_t alloc_len, u_int8_t sense_len, u_int32_t timeout); void scsi_synchronize_cache(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int32_t begin_lba, u_int16_t lb_count, u_int8_t sense_len, u_int32_t timeout); void scsi_receive_diagnostic_results(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb*), uint8_t tag_action, int pcv, uint8_t page_code, uint8_t *data_ptr, uint16_t allocation_length, uint8_t sense_len, uint32_t timeout); void scsi_send_diagnostic(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, int unit_offline, int device_offline, int self_test, int page_format, int self_test_code, uint8_t *data_ptr, uint16_t param_list_length, uint8_t sense_len, uint32_t timeout); void scsi_read_buffer(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb*), uint8_t tag_action, int mode, uint8_t buffer_id, u_int32_t offset, uint8_t *data_ptr, uint32_t allocation_length, uint8_t sense_len, uint32_t timeout); void scsi_write_buffer(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, int mode, uint8_t buffer_id, u_int32_t offset, uint8_t *data_ptr, uint32_t param_list_length, uint8_t sense_len, uint32_t timeout); #define SCSI_RW_READ 0x0001 #define SCSI_RW_WRITE 0x0002 #define SCSI_RW_DIRMASK 0x0003 #define SCSI_RW_BIO 0x1000 void scsi_read_write(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, int readop, u_int8_t byte2, int minimum_cmd_size, u_int64_t lba, u_int32_t block_count, u_int8_t *data_ptr, u_int32_t dxfer_len, u_int8_t sense_len, u_int32_t timeout); void scsi_write_same(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t byte2, int minimum_cmd_size, u_int64_t lba, u_int32_t block_count, u_int8_t *data_ptr, u_int32_t dxfer_len, u_int8_t sense_len, u_int32_t timeout); void scsi_ata_identify(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t *data_ptr, u_int16_t dxfer_len, u_int8_t sense_len, u_int32_t timeout); void scsi_ata_trim(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int16_t block_count, u_int8_t *data_ptr, u_int16_t dxfer_len, u_int8_t sense_len, u_int32_t timeout); void scsi_ata_pass_16(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int32_t flags, u_int8_t tag_action, u_int8_t protocol, u_int8_t ata_flags, u_int16_t features, u_int16_t sector_count, uint64_t lba, u_int8_t command, u_int8_t control, u_int8_t *data_ptr, u_int16_t dxfer_len, u_int8_t sense_len, u_int32_t timeout); void scsi_unmap(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t byte2, u_int8_t *data_ptr, u_int16_t dxfer_len, u_int8_t sense_len, u_int32_t timeout); void scsi_start_stop(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, int start, int load_eject, int immediate, u_int8_t sense_len, u_int32_t timeout); void scsi_persistent_reserve_in(struct ccb_scsiio *csio, uint32_t retries, void (*cbfcnp)(struct cam_periph *,union ccb *), uint8_t tag_action, int service_action, uint8_t *data_ptr, uint32_t dxfer_len, int sense_len, int timeout); void scsi_persistent_reserve_out(struct ccb_scsiio *csio, uint32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, int service_action, int scope, int res_type, uint8_t *data_ptr, uint32_t dxfer_len, int sense_len, int timeout); int scsi_inquiry_match(caddr_t inqbuffer, caddr_t table_entry); int scsi_static_inquiry_match(caddr_t inqbuffer, caddr_t table_entry); int scsi_devid_match(uint8_t *rhs, size_t rhs_len, uint8_t *lhs, size_t lhs_len); void scsi_extract_sense(struct scsi_sense_data *sense, int *error_code, int *sense_key, int *asc, int *ascq); int scsi_extract_sense_ccb(union ccb *ccb, int *error_code, int *sense_key, int *asc, int *ascq); void scsi_extract_sense_len(struct scsi_sense_data *sense, u_int sense_len, int *error_code, int *sense_key, int *asc, int *ascq, int show_errors); int scsi_get_sense_key(struct scsi_sense_data *sense, u_int sense_len, int show_errors); int scsi_get_asc(struct scsi_sense_data *sense, u_int sense_len, int show_errors); int scsi_get_ascq(struct scsi_sense_data *sense, u_int sense_len, int show_errors); static __inline void scsi_ulto2b(u_int32_t val, u_int8_t *bytes); static __inline void scsi_ulto3b(u_int32_t val, u_int8_t *bytes); static __inline void scsi_ulto4b(u_int32_t val, u_int8_t *bytes); static __inline void scsi_u64to8b(u_int64_t val, u_int8_t *bytes); static __inline uint32_t scsi_2btoul(const uint8_t *bytes); static __inline uint32_t scsi_3btoul(const uint8_t *bytes); static __inline int32_t scsi_3btol(const uint8_t *bytes); static __inline uint32_t scsi_4btoul(const uint8_t *bytes); static __inline uint64_t scsi_8btou64(const uint8_t *bytes); static __inline void *find_mode_page_6(struct scsi_mode_header_6 *mode_header); static __inline void *find_mode_page_10(struct scsi_mode_header_10 *mode_header); static __inline void scsi_ulto2b(u_int32_t val, u_int8_t *bytes) { bytes[0] = (val >> 8) & 0xff; bytes[1] = val & 0xff; } static __inline void scsi_ulto3b(u_int32_t val, u_int8_t *bytes) { bytes[0] = (val >> 16) & 0xff; bytes[1] = (val >> 8) & 0xff; bytes[2] = val & 0xff; } static __inline void scsi_ulto4b(u_int32_t val, u_int8_t *bytes) { bytes[0] = (val >> 24) & 0xff; bytes[1] = (val >> 16) & 0xff; bytes[2] = (val >> 8) & 0xff; bytes[3] = val & 0xff; } static __inline void scsi_u64to8b(u_int64_t val, u_int8_t *bytes) { bytes[0] = (val >> 56) & 0xff; bytes[1] = (val >> 48) & 0xff; bytes[2] = (val >> 40) & 0xff; bytes[3] = (val >> 32) & 0xff; bytes[4] = (val >> 24) & 0xff; bytes[5] = (val >> 16) & 0xff; bytes[6] = (val >> 8) & 0xff; bytes[7] = val & 0xff; } static __inline uint32_t scsi_2btoul(const uint8_t *bytes) { uint32_t rv; rv = (bytes[0] << 8) | bytes[1]; return (rv); } static __inline uint32_t scsi_3btoul(const uint8_t *bytes) { uint32_t rv; rv = (bytes[0] << 16) | (bytes[1] << 8) | bytes[2]; return (rv); } static __inline int32_t scsi_3btol(const uint8_t *bytes) { uint32_t rc = scsi_3btoul(bytes); if (rc & 0x00800000) rc |= 0xff000000; return (int32_t) rc; } static __inline uint32_t scsi_4btoul(const uint8_t *bytes) { uint32_t rv; rv = (bytes[0] << 24) | (bytes[1] << 16) | (bytes[2] << 8) | bytes[3]; return (rv); } static __inline uint64_t scsi_8btou64(const uint8_t *bytes) { uint64_t rv; rv = (((uint64_t)bytes[0]) << 56) | (((uint64_t)bytes[1]) << 48) | (((uint64_t)bytes[2]) << 40) | (((uint64_t)bytes[3]) << 32) | (((uint64_t)bytes[4]) << 24) | (((uint64_t)bytes[5]) << 16) | (((uint64_t)bytes[6]) << 8) | bytes[7]; return (rv); } /* * Given the pointer to a returned mode sense buffer, return a pointer to * the start of the first mode page. */ static __inline void * find_mode_page_6(struct scsi_mode_header_6 *mode_header) { void *page_start; page_start = (void *)((u_int8_t *)&mode_header[1] + mode_header->blk_desc_len); return(page_start); } static __inline void * find_mode_page_10(struct scsi_mode_header_10 *mode_header) { void *page_start; page_start = (void *)((u_int8_t *)&mode_header[1] + scsi_2btoul(mode_header->blk_desc_len)); return(page_start); } __END_DECLS #endif /*_SCSI_SCSI_ALL_H*/ Index: user/ae/inet6/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c =================================================================== --- user/ae/inet6/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c (revision 271452) +++ user/ae/inet6/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c (revision 271453) @@ -1,3491 +1,3494 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved. * Copyright 2013 Martin Matuska . All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_DECL(_vfs_zfs); SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV"); /* * Virtual device management. */ /* * The limit for ZFS to automatically increase a top-level vdev's ashift * from logical ashift to physical ashift. * * Example: one or more 512B emulation child vdevs * child->vdev_ashift = 9 (512 bytes) * child->vdev_physical_ashift = 12 (4096 bytes) * zfs_max_auto_ashift = 11 (2048 bytes) * zfs_min_auto_ashift = 9 (512 bytes) * * On pool creation or the addition of a new top-level vdev, ZFS will * increase the ashift of the top-level vdev to 2048 as limited by * zfs_max_auto_ashift. * * Example: one or more 512B emulation child vdevs * child->vdev_ashift = 9 (512 bytes) * child->vdev_physical_ashift = 12 (4096 bytes) * zfs_max_auto_ashift = 13 (8192 bytes) * zfs_min_auto_ashift = 9 (512 bytes) * * On pool creation or the addition of a new top-level vdev, ZFS will * increase the ashift of the top-level vdev to 4096 to match the * max vdev_physical_ashift. * * Example: one or more 512B emulation child vdevs * child->vdev_ashift = 9 (512 bytes) * child->vdev_physical_ashift = 9 (512 bytes) * zfs_max_auto_ashift = 13 (8192 bytes) * zfs_min_auto_ashift = 12 (4096 bytes) * * On pool creation or the addition of a new top-level vdev, ZFS will * increase the ashift of the top-level vdev to 4096 to match the * zfs_min_auto_ashift. */ static uint64_t zfs_max_auto_ashift = SPA_MAXASHIFT; static uint64_t zfs_min_auto_ashift = SPA_MINASHIFT; static int sysctl_vfs_zfs_max_auto_ashift(SYSCTL_HANDLER_ARGS) { uint64_t val; int err; val = zfs_max_auto_ashift; err = sysctl_handle_64(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (val > SPA_MAXASHIFT || val < zfs_min_auto_ashift) return (EINVAL); zfs_max_auto_ashift = val; return (0); } SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift, CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t), sysctl_vfs_zfs_max_auto_ashift, "QU", "Max ashift used when optimising for logical -> physical sectors size on " "new top-level vdevs."); static int sysctl_vfs_zfs_min_auto_ashift(SYSCTL_HANDLER_ARGS) { uint64_t val; int err; val = zfs_min_auto_ashift; err = sysctl_handle_64(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (val < SPA_MINASHIFT || val > zfs_max_auto_ashift) return (EINVAL); zfs_min_auto_ashift = val; return (0); } SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift, CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t), sysctl_vfs_zfs_min_auto_ashift, "QU", "Min ashift used when creating new top-level vdevs."); static vdev_ops_t *vdev_ops_table[] = { &vdev_root_ops, &vdev_raidz_ops, &vdev_mirror_ops, &vdev_replacing_ops, &vdev_spare_ops, #ifdef _KERNEL &vdev_geom_ops, #else &vdev_disk_ops, #endif &vdev_file_ops, &vdev_missing_ops, &vdev_hole_ops, NULL }; /* * Given a vdev type, return the appropriate ops vector. */ static vdev_ops_t * vdev_getops(const char *type) { vdev_ops_t *ops, **opspp; for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++) if (strcmp(ops->vdev_op_type, type) == 0) break; return (ops); } /* * Default asize function: return the MAX of psize with the asize of * all children. This is what's used by anything other than RAID-Z. */ uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize) { uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); uint64_t csize; for (int c = 0; c < vd->vdev_children; c++) { csize = vdev_psize_to_asize(vd->vdev_child[c], psize); asize = MAX(asize, csize); } return (asize); } /* * Get the minimum allocatable size. We define the allocatable size as * the vdev's asize rounded to the nearest metaslab. This allows us to * replace or attach devices which don't have the same physical size but * can still satisfy the same number of allocations. */ uint64_t vdev_get_min_asize(vdev_t *vd) { vdev_t *pvd = vd->vdev_parent; /* * If our parent is NULL (inactive spare or cache) or is the root, * just return our own asize. */ if (pvd == NULL) return (vd->vdev_asize); /* * The top-level vdev just returns the allocatable size rounded * to the nearest metaslab. */ if (vd == vd->vdev_top) return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift)); /* * The allocatable space for a raidz vdev is N * sizeof(smallest child), * so each child must provide at least 1/Nth of its asize. */ if (pvd->vdev_ops == &vdev_raidz_ops) return (pvd->vdev_min_asize / pvd->vdev_children); return (pvd->vdev_min_asize); } void vdev_set_min_asize(vdev_t *vd) { vd->vdev_min_asize = vdev_get_min_asize(vd); for (int c = 0; c < vd->vdev_children; c++) vdev_set_min_asize(vd->vdev_child[c]); } vdev_t * vdev_lookup_top(spa_t *spa, uint64_t vdev) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); if (vdev < rvd->vdev_children) { ASSERT(rvd->vdev_child[vdev] != NULL); return (rvd->vdev_child[vdev]); } return (NULL); } vdev_t * vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) { vdev_t *mvd; if (vd->vdev_guid == guid) return (vd); for (int c = 0; c < vd->vdev_children; c++) if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != NULL) return (mvd); return (NULL); } void vdev_add_child(vdev_t *pvd, vdev_t *cvd) { size_t oldsize, newsize; uint64_t id = cvd->vdev_id; vdev_t **newchild; ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(cvd->vdev_parent == NULL); cvd->vdev_parent = pvd; if (pvd == NULL) return; ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL); oldsize = pvd->vdev_children * sizeof (vdev_t *); pvd->vdev_children = MAX(pvd->vdev_children, id + 1); newsize = pvd->vdev_children * sizeof (vdev_t *); newchild = kmem_zalloc(newsize, KM_SLEEP); if (pvd->vdev_child != NULL) { bcopy(pvd->vdev_child, newchild, oldsize); kmem_free(pvd->vdev_child, oldsize); } pvd->vdev_child = newchild; pvd->vdev_child[id] = cvd; cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL); /* * Walk up all ancestors to update guid sum. */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum += cvd->vdev_guid_sum; } void vdev_remove_child(vdev_t *pvd, vdev_t *cvd) { int c; uint_t id = cvd->vdev_id; ASSERT(cvd->vdev_parent == pvd); if (pvd == NULL) return; ASSERT(id < pvd->vdev_children); ASSERT(pvd->vdev_child[id] == cvd); pvd->vdev_child[id] = NULL; cvd->vdev_parent = NULL; for (c = 0; c < pvd->vdev_children; c++) if (pvd->vdev_child[c]) break; if (c == pvd->vdev_children) { kmem_free(pvd->vdev_child, c * sizeof (vdev_t *)); pvd->vdev_child = NULL; pvd->vdev_children = 0; } /* * Walk up all ancestors to update guid sum. */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum -= cvd->vdev_guid_sum; } /* * Remove any holes in the child array. */ void vdev_compact_children(vdev_t *pvd) { vdev_t **newchild, *cvd; int oldc = pvd->vdev_children; int newc; ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); for (int c = newc = 0; c < oldc; c++) if (pvd->vdev_child[c]) newc++; newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP); for (int c = newc = 0; c < oldc; c++) { if ((cvd = pvd->vdev_child[c]) != NULL) { newchild[newc] = cvd; cvd->vdev_id = newc++; } } kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *)); pvd->vdev_child = newchild; pvd->vdev_children = newc; } /* * Allocate and minimally initialize a vdev_t. */ vdev_t * vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) { vdev_t *vd; vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); if (spa->spa_root_vdev == NULL) { ASSERT(ops == &vdev_root_ops); spa->spa_root_vdev = vd; spa->spa_load_guid = spa_generate_guid(NULL); } if (guid == 0 && ops != &vdev_hole_ops) { if (spa->spa_root_vdev == vd) { /* * The root vdev's guid will also be the pool guid, * which must be unique among all pools. */ guid = spa_generate_guid(NULL); } else { /* * Any other vdev's guid must be unique within the pool. */ guid = spa_generate_guid(spa); } ASSERT(!spa_guid_exists(spa_guid(spa), guid)); } vd->vdev_spa = spa; vd->vdev_id = id; vd->vdev_guid = guid; vd->vdev_guid_sum = guid; vd->vdev_ops = ops; vd->vdev_state = VDEV_STATE_CLOSED; vd->vdev_ishole = (ops == &vdev_hole_ops); mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); for (int t = 0; t < DTL_TYPES; t++) { vd->vdev_dtl[t] = range_tree_create(NULL, NULL, &vd->vdev_dtl_lock); } txg_list_create(&vd->vdev_ms_list, offsetof(struct metaslab, ms_txg_node)); txg_list_create(&vd->vdev_dtl_list, offsetof(struct vdev, vdev_dtl_node)); vd->vdev_stat.vs_timestamp = gethrtime(); vdev_queue_init(vd); vdev_cache_init(vd); return (vd); } /* * Allocate a new vdev. The 'alloctype' is used to control whether we are * creating a new vdev or loading an existing one - the behavior is slightly * different for each case. */ int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int alloctype) { vdev_ops_t *ops; char *type; uint64_t guid = 0, islog, nparity; vdev_t *vd; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) return (SET_ERROR(EINVAL)); if ((ops = vdev_getops(type)) == NULL) return (SET_ERROR(EINVAL)); /* * If this is a load, get the vdev guid from the nvlist. * Otherwise, vdev_alloc_common() will generate one for us. */ if (alloctype == VDEV_ALLOC_LOAD) { uint64_t label_id; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) || label_id != id) return (SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_SPARE) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_L2CACHE) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_ROOTPOOL) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } /* * The first allocated vdev must be of type 'root'. */ if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) return (SET_ERROR(EINVAL)); /* * Determine whether we're a log vdev. */ islog = 0; (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog); if (islog && spa_version(spa) < SPA_VERSION_SLOGS) return (SET_ERROR(ENOTSUP)); if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) return (SET_ERROR(ENOTSUP)); /* * Set the nparity property for RAID-Z vdevs. */ nparity = -1ULL; if (ops == &vdev_raidz_ops) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) return (SET_ERROR(EINVAL)); /* * Previous versions could only support 1 or 2 parity * device. */ if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2) return (SET_ERROR(ENOTSUP)); if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3) return (SET_ERROR(ENOTSUP)); } else { /* * We require the parity to be specified for SPAs that * support multiple parity levels. */ if (spa_version(spa) >= SPA_VERSION_RAIDZ2) return (SET_ERROR(EINVAL)); /* * Otherwise, we default to 1 parity device for RAID-Z. */ nparity = 1; } } else { nparity = 0; } ASSERT(nparity != -1ULL); vd = vdev_alloc_common(spa, id, guid, ops); vd->vdev_islog = islog; vd->vdev_nparity = nparity; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) vd->vdev_path = spa_strdup(vd->vdev_path); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) vd->vdev_devid = spa_strdup(vd->vdev_devid); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, &vd->vdev_physpath) == 0) vd->vdev_physpath = spa_strdup(vd->vdev_physpath); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0) vd->vdev_fru = spa_strdup(vd->vdev_fru); /* * Set the whole_disk property. If it's not specified, leave the value * as -1. */ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &vd->vdev_wholedisk) != 0) vd->vdev_wholedisk = -1ULL; /* * Look for the 'not present' flag. This will only be set if the device * was not present at the time of import. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, &vd->vdev_not_present); /* * Get the alignment requirement. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); /* * Retrieve the vdev creation time. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, &vd->vdev_crtxg); /* * If we're a top-level vdev, try to load the allocation parameters. */ if (parent && !parent->vdev_parent && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, &vd->vdev_ms_array); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, &vd->vdev_ms_shift); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, &vd->vdev_asize); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING, &vd->vdev_removing); } if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) { ASSERT(alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_ADD || alloctype == VDEV_ALLOC_SPLIT || alloctype == VDEV_ALLOC_ROOTPOOL); vd->vdev_mg = metaslab_group_create(islog ? spa_log_class(spa) : spa_normal_class(spa), vd); } /* * If we're a leaf vdev, try to load the DTL object and other state. */ if (vd->vdev_ops->vdev_op_leaf && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE || alloctype == VDEV_ALLOC_ROOTPOOL)) { if (alloctype == VDEV_ALLOC_LOAD) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, &vd->vdev_dtl_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, &vd->vdev_unspare); } if (alloctype == VDEV_ALLOC_ROOTPOOL) { uint64_t spare = 0; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, &spare) == 0 && spare) spa_spare_add(vd); } (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &vd->vdev_offline); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, &vd->vdev_resilver_txg); /* * When importing a pool, we want to ignore the persistent fault * state, as the diagnosis made on another system may not be * valid in the current context. Local vdevs will * remain in the faulted state. */ if (spa_load_state(spa) == SPA_LOAD_OPEN) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &vd->vdev_faulted); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, &vd->vdev_degraded); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &vd->vdev_removed); if (vd->vdev_faulted || vd->vdev_degraded) { char *aux; vd->vdev_label_aux = VDEV_AUX_ERR_EXCEEDED; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_AUX_STATE, &aux) == 0 && strcmp(aux, "external") == 0) vd->vdev_label_aux = VDEV_AUX_EXTERNAL; } } } /* * Add ourselves to the parent's list of children. */ vdev_add_child(parent, vd); *vdp = vd; return (0); } void vdev_free(vdev_t *vd) { spa_t *spa = vd->vdev_spa; /* * vdev_free() implies closing the vdev first. This is simpler than * trying to ensure complicated semantics for all callers. */ vdev_close(vd); ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); /* * Free all children. */ for (int c = 0; c < vd->vdev_children; c++) vdev_free(vd->vdev_child[c]); ASSERT(vd->vdev_child == NULL); ASSERT(vd->vdev_guid_sum == vd->vdev_guid); /* * Discard allocation state. */ if (vd->vdev_mg != NULL) { vdev_metaslab_fini(vd); metaslab_group_destroy(vd->vdev_mg); } ASSERT0(vd->vdev_stat.vs_space); ASSERT0(vd->vdev_stat.vs_dspace); ASSERT0(vd->vdev_stat.vs_alloc); /* * Remove this vdev from its parent's child list. */ vdev_remove_child(vd->vdev_parent, vd); ASSERT(vd->vdev_parent == NULL); /* * Clean up vdev structure. */ vdev_queue_fini(vd); vdev_cache_fini(vd); if (vd->vdev_path) spa_strfree(vd->vdev_path); if (vd->vdev_devid) spa_strfree(vd->vdev_devid); if (vd->vdev_physpath) spa_strfree(vd->vdev_physpath); if (vd->vdev_fru) spa_strfree(vd->vdev_fru); if (vd->vdev_isspare) spa_spare_remove(vd); if (vd->vdev_isl2cache) spa_l2cache_remove(vd); txg_list_destroy(&vd->vdev_ms_list); txg_list_destroy(&vd->vdev_dtl_list); mutex_enter(&vd->vdev_dtl_lock); space_map_close(vd->vdev_dtl_sm); for (int t = 0; t < DTL_TYPES; t++) { range_tree_vacate(vd->vdev_dtl[t], NULL, NULL); range_tree_destroy(vd->vdev_dtl[t]); } mutex_exit(&vd->vdev_dtl_lock); mutex_destroy(&vd->vdev_dtl_lock); mutex_destroy(&vd->vdev_stat_lock); mutex_destroy(&vd->vdev_probe_lock); if (vd == spa->spa_root_vdev) spa->spa_root_vdev = NULL; kmem_free(vd, sizeof (vdev_t)); } /* * Transfer top-level vdev state from svd to tvd. */ static void vdev_top_transfer(vdev_t *svd, vdev_t *tvd) { spa_t *spa = svd->vdev_spa; metaslab_t *msp; vdev_t *vd; int t; ASSERT(tvd == tvd->vdev_top); tvd->vdev_ms_array = svd->vdev_ms_array; tvd->vdev_ms_shift = svd->vdev_ms_shift; tvd->vdev_ms_count = svd->vdev_ms_count; svd->vdev_ms_array = 0; svd->vdev_ms_shift = 0; svd->vdev_ms_count = 0; if (tvd->vdev_mg) ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg); tvd->vdev_mg = svd->vdev_mg; tvd->vdev_ms = svd->vdev_ms; svd->vdev_mg = NULL; svd->vdev_ms = NULL; if (tvd->vdev_mg != NULL) tvd->vdev_mg->mg_vd = tvd; tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; svd->vdev_stat.vs_alloc = 0; svd->vdev_stat.vs_space = 0; svd->vdev_stat.vs_dspace = 0; for (t = 0; t < TXG_SIZE; t++) { while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) (void) txg_list_add(&tvd->vdev_ms_list, msp, t); while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL) (void) txg_list_add(&tvd->vdev_dtl_list, vd, t); if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t)) (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t); } if (list_link_active(&svd->vdev_config_dirty_node)) { vdev_config_clean(svd); vdev_config_dirty(tvd); } if (list_link_active(&svd->vdev_state_dirty_node)) { vdev_state_clean(svd); vdev_state_dirty(tvd); } tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; svd->vdev_deflate_ratio = 0; tvd->vdev_islog = svd->vdev_islog; svd->vdev_islog = 0; } static void vdev_top_update(vdev_t *tvd, vdev_t *vd) { if (vd == NULL) return; vd->vdev_top = tvd; for (int c = 0; c < vd->vdev_children; c++) vdev_top_update(tvd, vd->vdev_child[c]); } /* * Add a mirror/replacing vdev above an existing vdev. */ vdev_t * vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) { spa_t *spa = cvd->vdev_spa; vdev_t *pvd = cvd->vdev_parent; vdev_t *mvd; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); mvd->vdev_asize = cvd->vdev_asize; mvd->vdev_min_asize = cvd->vdev_min_asize; mvd->vdev_max_asize = cvd->vdev_max_asize; mvd->vdev_ashift = cvd->vdev_ashift; mvd->vdev_logical_ashift = cvd->vdev_logical_ashift; mvd->vdev_physical_ashift = cvd->vdev_physical_ashift; mvd->vdev_state = cvd->vdev_state; mvd->vdev_crtxg = cvd->vdev_crtxg; vdev_remove_child(pvd, cvd); vdev_add_child(pvd, mvd); cvd->vdev_id = mvd->vdev_children; vdev_add_child(mvd, cvd); vdev_top_update(cvd->vdev_top, cvd->vdev_top); if (mvd == mvd->vdev_top) vdev_top_transfer(cvd, mvd); return (mvd); } /* * Remove a 1-way mirror/replacing vdev from the tree. */ void vdev_remove_parent(vdev_t *cvd) { vdev_t *mvd = cvd->vdev_parent; vdev_t *pvd = mvd->vdev_parent; ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(mvd->vdev_children == 1); ASSERT(mvd->vdev_ops == &vdev_mirror_ops || mvd->vdev_ops == &vdev_replacing_ops || mvd->vdev_ops == &vdev_spare_ops); cvd->vdev_ashift = mvd->vdev_ashift; cvd->vdev_logical_ashift = mvd->vdev_logical_ashift; cvd->vdev_physical_ashift = mvd->vdev_physical_ashift; vdev_remove_child(mvd, cvd); vdev_remove_child(pvd, mvd); /* * If cvd will replace mvd as a top-level vdev, preserve mvd's guid. * Otherwise, we could have detached an offline device, and when we * go to import the pool we'll think we have two top-level vdevs, * instead of a different version of the same top-level vdev. */ if (mvd->vdev_top == mvd) { uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid; cvd->vdev_orig_guid = cvd->vdev_guid; cvd->vdev_guid += guid_delta; cvd->vdev_guid_sum += guid_delta; } cvd->vdev_id = mvd->vdev_id; vdev_add_child(pvd, cvd); vdev_top_update(cvd->vdev_top, cvd->vdev_top); if (cvd == cvd->vdev_top) vdev_top_transfer(mvd, cvd); ASSERT(mvd->vdev_children == 0); vdev_free(mvd); } int vdev_metaslab_init(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; uint64_t m; uint64_t oldc = vd->vdev_ms_count; uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; metaslab_t **mspp; int error; ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER)); /* * This vdev is not being allocated from yet or is a hole. */ if (vd->vdev_ms_shift == 0) return (0); ASSERT(!vd->vdev_ishole); /* * Compute the raidz-deflation ratio. Note, we hard-code * in 128k (1 << 17) because it is the current "typical" blocksize. * Even if SPA_MAXBLOCKSIZE changes, this algorithm must never change, * or we will inconsistently account for existing bp's. */ vd->vdev_deflate_ratio = (1 << 17) / (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT); ASSERT(oldc <= newc); mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); if (oldc != 0) { bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp)); kmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); } vd->vdev_ms = mspp; vd->vdev_ms_count = newc; for (m = oldc; m < newc; m++) { uint64_t object = 0; if (txg == 0) { error = dmu_read(mos, vd->vdev_ms_array, m * sizeof (uint64_t), sizeof (uint64_t), &object, DMU_READ_PREFETCH); if (error) return (error); } vd->vdev_ms[m] = metaslab_init(vd->vdev_mg, m, object, txg); } if (txg == 0) spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); /* * If the vdev is being removed we don't activate * the metaslabs since we want to ensure that no new * allocations are performed on this device. */ if (oldc == 0 && !vd->vdev_removing) metaslab_group_activate(vd->vdev_mg); if (txg == 0) spa_config_exit(spa, SCL_ALLOC, FTAG); return (0); } void vdev_metaslab_fini(vdev_t *vd) { uint64_t m; uint64_t count = vd->vdev_ms_count; if (vd->vdev_ms != NULL) { metaslab_group_passivate(vd->vdev_mg); for (m = 0; m < count; m++) { metaslab_t *msp = vd->vdev_ms[m]; if (msp != NULL) metaslab_fini(msp); } kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); vd->vdev_ms = NULL; } } typedef struct vdev_probe_stats { boolean_t vps_readable; boolean_t vps_writeable; int vps_flags; } vdev_probe_stats_t; static void vdev_probe_done(zio_t *zio) { spa_t *spa = zio->io_spa; vdev_t *vd = zio->io_vd; vdev_probe_stats_t *vps = zio->io_private; ASSERT(vd->vdev_probe_zio != NULL); if (zio->io_type == ZIO_TYPE_READ) { if (zio->io_error == 0) vps->vps_readable = 1; if (zio->io_error == 0 && spa_writeable(spa)) { zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd, zio->io_offset, zio->io_size, zio->io_data, ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); } else { zio_buf_free(zio->io_data, zio->io_size); } } else if (zio->io_type == ZIO_TYPE_WRITE) { if (zio->io_error == 0) vps->vps_writeable = 1; zio_buf_free(zio->io_data, zio->io_size); } else if (zio->io_type == ZIO_TYPE_NULL) { zio_t *pio; vd->vdev_cant_read |= !vps->vps_readable; vd->vdev_cant_write |= !vps->vps_writeable; if (vdev_readable(vd) && (vdev_writeable(vd) || !spa_writeable(spa))) { zio->io_error = 0; } else { ASSERT(zio->io_error != 0); zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, spa, vd, NULL, 0, 0); zio->io_error = SET_ERROR(ENXIO); } mutex_enter(&vd->vdev_probe_lock); ASSERT(vd->vdev_probe_zio == zio); vd->vdev_probe_zio = NULL; mutex_exit(&vd->vdev_probe_lock); while ((pio = zio_walk_parents(zio)) != NULL) if (!vdev_accessible(vd, pio)) pio->io_error = SET_ERROR(ENXIO); kmem_free(vps, sizeof (*vps)); } } /* * Determine whether this device is accessible. * * Read and write to several known locations: the pad regions of each * vdev label but the first, which we leave alone in case it contains * a VTOC. */ zio_t * vdev_probe(vdev_t *vd, zio_t *zio) { spa_t *spa = vd->vdev_spa; vdev_probe_stats_t *vps = NULL; zio_t *pio; ASSERT(vd->vdev_ops->vdev_op_leaf); /* * Don't probe the probe. */ if (zio && (zio->io_flags & ZIO_FLAG_PROBE)) return (NULL); /* * To prevent 'probe storms' when a device fails, we create * just one probe i/o at a time. All zios that want to probe * this vdev will become parents of the probe io. */ mutex_enter(&vd->vdev_probe_lock); if ((pio = vd->vdev_probe_zio) == NULL) { vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD; if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { /* * vdev_cant_read and vdev_cant_write can only * transition from TRUE to FALSE when we have the * SCL_ZIO lock as writer; otherwise they can only * transition from FALSE to TRUE. This ensures that * any zio looking at these values can assume that * failures persist for the life of the I/O. That's * important because when a device has intermittent * connectivity problems, we want to ensure that * they're ascribed to the device (ENXIO) and not * the zio (EIO). * * Since we hold SCL_ZIO as writer here, clear both * values so the probe can reevaluate from first * principles. */ vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; } vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, vdev_probe_done, vps, vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); /* * We can't change the vdev state in this context, so we * kick off an async task to do it on our behalf. */ if (zio != NULL) { vd->vdev_probe_wanted = B_TRUE; spa_async_request(spa, SPA_ASYNC_PROBE); } } if (zio != NULL) zio_add_child(zio, pio); mutex_exit(&vd->vdev_probe_lock); if (vps == NULL) { ASSERT(zio != NULL); return (NULL); } for (int l = 1; l < VDEV_LABELS; l++) { zio_nowait(zio_read_phys(pio, vd, vdev_label_offset(vd->vdev_psize, l, offsetof(vdev_label_t, vl_pad2)), VDEV_PAD_SIZE, zio_buf_alloc(VDEV_PAD_SIZE), ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); } if (zio == NULL) return (pio); zio_nowait(pio); return (NULL); } static void vdev_open_child(void *arg) { vdev_t *vd = arg; vd->vdev_open_thread = curthread; vd->vdev_open_error = vdev_open(vd); vd->vdev_open_thread = NULL; } boolean_t vdev_uses_zvols(vdev_t *vd) { if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR, strlen(ZVOL_DIR)) == 0) return (B_TRUE); for (int c = 0; c < vd->vdev_children; c++) if (vdev_uses_zvols(vd->vdev_child[c])) return (B_TRUE); return (B_FALSE); } void vdev_open_children(vdev_t *vd) { taskq_t *tq; int children = vd->vdev_children; /* * in order to handle pools on top of zvols, do the opens * in a single thread so that the same thread holds the * spa_namespace_lock */ if (B_TRUE || vdev_uses_zvols(vd)) { for (int c = 0; c < children; c++) vd->vdev_child[c]->vdev_open_error = vdev_open(vd->vdev_child[c]); return; } tq = taskq_create("vdev_open", children, minclsyspri, children, children, TASKQ_PREPOPULATE); for (int c = 0; c < children; c++) VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c], TQ_SLEEP) != 0); taskq_destroy(tq); } /* * Prepare a virtual device for access. */ int vdev_open(vdev_t *vd) { spa_t *spa = vd->vdev_spa; int error; uint64_t osize = 0; uint64_t max_osize = 0; uint64_t asize, max_asize, psize; uint64_t logical_ashift = 0; uint64_t physical_ashift = 0; ASSERT(vd->vdev_open_thread == curthread || spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || vd->vdev_state == VDEV_STATE_CANT_OPEN || vd->vdev_state == VDEV_STATE_OFFLINE); vd->vdev_stat.vs_aux = VDEV_AUX_NONE; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; vd->vdev_min_asize = vdev_get_min_asize(vd); /* * If this vdev is not removed, check its fault status. If it's * faulted, bail out of the open. */ if (!vd->vdev_removed && vd->vdev_faulted) { ASSERT(vd->vdev_children == 0); ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || vd->vdev_label_aux == VDEV_AUX_EXTERNAL); vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, vd->vdev_label_aux); return (SET_ERROR(ENXIO)); } else if (vd->vdev_offline) { ASSERT(vd->vdev_children == 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); return (SET_ERROR(ENXIO)); } error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &logical_ashift, &physical_ashift); /* * Reset the vdev_reopening flag so that we actually close * the vdev on error. */ vd->vdev_reopening = B_FALSE; if (zio_injection_enabled && error == 0) error = zio_handle_device_injection(vd, NULL, ENXIO); if (error) { if (vd->vdev_removed && vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED) vd->vdev_removed = B_FALSE; vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, vd->vdev_stat.vs_aux); return (error); } vd->vdev_removed = B_FALSE; /* * Recheck the faulted flag now that we have confirmed that * the vdev is accessible. If we're faulted, bail. */ if (vd->vdev_faulted) { ASSERT(vd->vdev_children == 0); ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || vd->vdev_label_aux == VDEV_AUX_EXTERNAL); vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, vd->vdev_label_aux); return (SET_ERROR(ENXIO)); } if (vd->vdev_degraded) { ASSERT(vd->vdev_children == 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_ERR_EXCEEDED); } else { vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0); } /* * For hole or missing vdevs we just return success. */ if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) return (0); if (vd->vdev_ops->vdev_op_leaf) { vd->vdev_notrim = B_FALSE; trim_map_create(vd); } for (int c = 0; c < vd->vdev_children; c++) { if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); break; } } osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t)); if (vd->vdev_children == 0) { if (osize < SPA_MINDEVSIZE) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_TOO_SMALL); return (SET_ERROR(EOVERFLOW)); } psize = osize; asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); max_asize = max_osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); } else { if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_TOO_SMALL); return (SET_ERROR(EOVERFLOW)); } psize = 0; asize = osize; max_asize = max_osize; } vd->vdev_psize = psize; /* * Make sure the allocatable size hasn't shrunk. */ if (asize < vd->vdev_min_asize) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (SET_ERROR(EINVAL)); } vd->vdev_physical_ashift = MAX(physical_ashift, vd->vdev_physical_ashift); vd->vdev_logical_ashift = MAX(logical_ashift, vd->vdev_logical_ashift); vd->vdev_ashift = MAX(vd->vdev_logical_ashift, vd->vdev_ashift); if (vd->vdev_logical_ashift > SPA_MAXASHIFT) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_ASHIFT_TOO_BIG); return (EINVAL); } if (vd->vdev_asize == 0) { /* * This is the first-ever open, so use the computed values. * For testing purposes, a higher ashift can be requested. */ vd->vdev_asize = asize; vd->vdev_max_asize = max_asize; } else { /* * Make sure the alignment requirement hasn't increased. */ if (vd->vdev_ashift > vd->vdev_top->vdev_ashift && vd->vdev_ops->vdev_op_leaf) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (EINVAL); } vd->vdev_max_asize = max_asize; } /* * If all children are healthy and the asize has increased, * then we've experienced dynamic LUN growth. If automatic * expansion is enabled then use the additional space. */ if (vd->vdev_state == VDEV_STATE_HEALTHY && asize > vd->vdev_asize && (vd->vdev_expanding || spa->spa_autoexpand)) vd->vdev_asize = asize; vdev_set_min_asize(vd); /* * Ensure we can issue some IO before declaring the * vdev open for business. */ if (vd->vdev_ops->vdev_op_leaf && (error = zio_wait(vdev_probe(vd, NULL))) != 0) { vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, VDEV_AUX_ERR_EXCEEDED); return (error); } /* * If a leaf vdev has a DTL, and seems healthy, then kick off a * resilver. But don't do this if we are doing a reopen for a scrub, * since this would just restart the scrub we are already doing. */ if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen && vdev_resilver_needed(vd, NULL, NULL)) spa_async_request(spa, SPA_ASYNC_RESILVER); return (0); } /* * Called once the vdevs are all opened, this routine validates the label * contents. This needs to be done before vdev_load() so that we don't * inadvertently do repair I/Os to the wrong device. * * If 'strict' is false ignore the spa guid check. This is necessary because * if the machine crashed during a re-guid the new guid might have been written * to all of the vdev labels, but not the cached config. The strict check * will be performed when the pool is opened again using the mos config. * * This function will only return failure if one of the vdevs indicates that it * has since been destroyed or exported. This is only possible if * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state * will be updated but the function will return 0. */ int vdev_validate(vdev_t *vd, boolean_t strict) { spa_t *spa = vd->vdev_spa; nvlist_t *label; uint64_t guid = 0, top_guid; uint64_t state; for (int c = 0; c < vd->vdev_children; c++) if (vdev_validate(vd->vdev_child[c], strict) != 0) return (SET_ERROR(EBADF)); /* * If the device has already failed, or was marked offline, don't do * any further validation. Otherwise, label I/O will fail and we will * overwrite the previous state. */ if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { uint64_t aux_guid = 0; nvlist_t *nvl; uint64_t txg = spa_last_synced_txg(spa) != 0 ? spa_last_synced_txg(spa) : -1ULL; if ((label = vdev_label_read_config(vd, txg)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (0); } /* * Determine if this vdev has been split off into another * pool. If so, then refuse to open it. */ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID, &aux_guid) == 0 && aux_guid == spa_guid(spa)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_SPLIT_POOL); nvlist_free(label); return (0); } if (strict && (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || guid != spa_guid(spa))) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); return (0); } if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID, &aux_guid) != 0) aux_guid = 0; /* * If this vdev just became a top-level vdev because its * sibling was detached, it will have adopted the parent's * vdev guid -- but the label may or may not be on disk yet. * Fortunately, either version of the label will have the * same top guid, so if we're a top-level vdev, we can * safely compare to that instead. * * If we split this vdev off instead, then we also check the * original pool's guid. We don't want to consider the vdev * corrupt if it is partway through a split operation. */ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid) != 0 || ((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) && (vd->vdev_guid != top_guid || vd != vd->vdev_top))) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); return (0); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); return (0); } nvlist_free(label); /* * If this is a verbatim import, no need to check the * state of the pool. */ if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) && spa_load_state(spa) == SPA_LOAD_OPEN && state != POOL_STATE_ACTIVE) return (SET_ERROR(EBADF)); /* * If we were able to open and validate a vdev that was * previously marked permanently unavailable, clear that state * now. */ if (vd->vdev_not_present) vd->vdev_not_present = 0; } return (0); } /* * Close a virtual device. */ void vdev_close(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *pvd = vd->vdev_parent; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); /* * If our parent is reopening, then we are as well, unless we are * going offline. */ if (pvd != NULL && pvd->vdev_reopening) vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline); vd->vdev_ops->vdev_op_close(vd); vdev_cache_purge(vd); if (vd->vdev_ops->vdev_op_leaf) trim_map_destroy(vd); /* * We record the previous state before we close it, so that if we are * doing a reopen(), we don't generate FMA ereports if we notice that * it's still faulted. */ vd->vdev_prevstate = vd->vdev_state; if (vd->vdev_offline) vd->vdev_state = VDEV_STATE_OFFLINE; else vd->vdev_state = VDEV_STATE_CLOSED; vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } void vdev_hold(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_is_root(spa)); if (spa->spa_state == POOL_STATE_UNINITIALIZED) return; for (int c = 0; c < vd->vdev_children; c++) vdev_hold(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf) vd->vdev_ops->vdev_op_hold(vd); } void vdev_rele(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_is_root(spa)); for (int c = 0; c < vd->vdev_children; c++) vdev_rele(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf) vd->vdev_ops->vdev_op_rele(vd); } /* * Reopen all interior vdevs and any unopened leaves. We don't actually * reopen leaf vdevs which had previously been opened as they might deadlock * on the spa_config_lock. Instead we only obtain the leaf's physical size. * If the leaf has never been opened then open it, as usual. */ void vdev_reopen(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); /* set the reopening flag unless we're taking the vdev offline */ vd->vdev_reopening = !vd->vdev_offline; vdev_close(vd); (void) vdev_open(vd); /* * Call vdev_validate() here to make sure we have the same device. * Otherwise, a device with an invalid label could be successfully * opened in response to vdev_reopen(). */ if (vd->vdev_aux) { (void) vdev_validate_aux(vd); if (vdev_readable(vd) && vdev_writeable(vd) && vd->vdev_aux == &spa->spa_l2cache && !l2arc_vdev_present(vd)) l2arc_add_vdev(spa, vd); } else { (void) vdev_validate(vd, B_TRUE); } /* * Reassess parent vdev's health. */ vdev_propagate_state(vd); } int vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing) { int error; /* * Normally, partial opens (e.g. of a mirror) are allowed. * For a create, however, we want to fail the request if * there are any components we can't open. */ error = vdev_open(vd); if (error || vd->vdev_state != VDEV_STATE_HEALTHY) { vdev_close(vd); return (error ? error : ENXIO); } /* * Recursively load DTLs and initialize all labels. */ if ((error = vdev_dtl_load(vd)) != 0 || (error = vdev_label_init(vd, txg, isreplacing ? VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) { vdev_close(vd); return (error); } return (0); } void vdev_metaslab_set_size(vdev_t *vd) { /* * Aim for roughly 200 metaslabs per vdev. */ vd->vdev_ms_shift = highbit64(vd->vdev_asize / 200); vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT); } /* * Maximize performance by inflating the configured ashift for top level * vdevs to be as close to the physical ashift as possible while maintaining * administrator defined limits and ensuring it doesn't go below the * logical ashift. */ void vdev_ashift_optimize(vdev_t *vd) { if (vd == vd->vdev_top) { if (vd->vdev_ashift < vd->vdev_physical_ashift) { vd->vdev_ashift = MIN( MAX(zfs_max_auto_ashift, vd->vdev_ashift), MAX(zfs_min_auto_ashift, vd->vdev_physical_ashift)); } else { /* * Unusual case where logical ashift > physical ashift * so we can't cap the calculated ashift based on max * ashift as that would cause failures. * We still check if we need to increase it to match * the min ashift. */ vd->vdev_ashift = MAX(zfs_min_auto_ashift, vd->vdev_ashift); } } } void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) { ASSERT(vd == vd->vdev_top); ASSERT(!vd->vdev_ishole); ASSERT(ISP2(flags)); ASSERT(spa_writeable(vd->vdev_spa)); if (flags & VDD_METASLAB) (void) txg_list_add(&vd->vdev_ms_list, arg, txg); if (flags & VDD_DTL) (void) txg_list_add(&vd->vdev_dtl_list, arg, txg); (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg); } void vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg) { for (int c = 0; c < vd->vdev_children; c++) vdev_dirty_leaves(vd->vdev_child[c], flags, txg); if (vd->vdev_ops->vdev_op_leaf) vdev_dirty(vd->vdev_top, flags, vd, txg); } /* * DTLs. * * A vdev's DTL (dirty time log) is the set of transaction groups for which * the vdev has less than perfect replication. There are four kinds of DTL: * * DTL_MISSING: txgs for which the vdev has no valid copies of the data * * DTL_PARTIAL: txgs for which data is available, but not fully replicated * * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon * scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of * txgs that was scrubbed. * * DTL_OUTAGE: txgs which cannot currently be read, whether due to * persistent errors or just some device being offline. * Unlike the other three, the DTL_OUTAGE map is not generally * maintained; it's only computed when needed, typically to * determine whether a device can be detached. * * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device * either has the data or it doesn't. * * For interior vdevs such as mirror and RAID-Z the picture is more complex. * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because * if any child is less than fully replicated, then so is its parent. * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs, * comprising only those txgs which appear in 'maxfaults' or more children; * those are the txgs we don't have enough replication to read. For example, * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2); * thus, its DTL_MISSING consists of the set of txgs that appear in more than * two child DTL_MISSING maps. * * It should be clear from the above that to compute the DTLs and outage maps * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps. * Therefore, that is all we keep on disk. When loading the pool, or after * a configuration change, we generate all other DTLs from first principles. */ void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { range_tree_t *rt = vd->vdev_dtl[t]; ASSERT(t < DTL_TYPES); ASSERT(vd != vd->vdev_spa->spa_root_vdev); ASSERT(spa_writeable(vd->vdev_spa)); mutex_enter(rt->rt_lock); if (!range_tree_contains(rt, txg, size)) range_tree_add(rt, txg, size); mutex_exit(rt->rt_lock); } boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { range_tree_t *rt = vd->vdev_dtl[t]; boolean_t dirty = B_FALSE; ASSERT(t < DTL_TYPES); ASSERT(vd != vd->vdev_spa->spa_root_vdev); mutex_enter(rt->rt_lock); if (range_tree_space(rt) != 0) dirty = range_tree_contains(rt, txg, size); mutex_exit(rt->rt_lock); return (dirty); } boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) { range_tree_t *rt = vd->vdev_dtl[t]; boolean_t empty; mutex_enter(rt->rt_lock); empty = (range_tree_space(rt) == 0); mutex_exit(rt->rt_lock); return (empty); } /* * Returns the lowest txg in the DTL range. */ static uint64_t vdev_dtl_min(vdev_t *vd) { range_seg_t *rs; ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); rs = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root); return (rs->rs_start - 1); } /* * Returns the highest txg in the DTL. */ static uint64_t vdev_dtl_max(vdev_t *vd) { range_seg_t *rs; ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); rs = avl_last(&vd->vdev_dtl[DTL_MISSING]->rt_root); return (rs->rs_end); } /* * Determine if a resilvering vdev should remove any DTL entries from * its range. If the vdev was resilvering for the entire duration of the * scan then it should excise that range from its DTLs. Otherwise, this * vdev is considered partially resilvered and should leave its DTL * entries intact. The comment in vdev_dtl_reassess() describes how we * excise the DTLs. */ static boolean_t vdev_dtl_should_excise(vdev_t *vd) { spa_t *spa = vd->vdev_spa; dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; ASSERT0(scn->scn_phys.scn_errors); ASSERT0(vd->vdev_children); if (vd->vdev_resilver_txg == 0 || range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0) return (B_TRUE); /* * When a resilver is initiated the scan will assign the scn_max_txg * value to the highest txg value that exists in all DTLs. If this * device's max DTL is not part of this scan (i.e. it is not in * the range (scn_min_txg, scn_max_txg] then it is not eligible * for excision. */ if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) { ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd)); ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg); ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg); return (B_TRUE); } return (B_FALSE); } /* * Reassess DTLs after a config change or scrub completion. */ void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) { spa_t *spa = vd->vdev_spa; avl_tree_t reftree; int minref; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); for (int c = 0; c < vd->vdev_children; c++) vdev_dtl_reassess(vd->vdev_child[c], txg, scrub_txg, scrub_done); if (vd == spa->spa_root_vdev || vd->vdev_ishole || vd->vdev_aux) return; if (vd->vdev_ops->vdev_op_leaf) { dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; mutex_enter(&vd->vdev_dtl_lock); /* * If we've completed a scan cleanly then determine * if this vdev should remove any DTLs. We only want to * excise regions on vdevs that were available during * the entire duration of this scan. */ if (scrub_txg != 0 && (spa->spa_scrub_started || (scn != NULL && scn->scn_phys.scn_errors == 0)) && vdev_dtl_should_excise(vd)) { /* * We completed a scrub up to scrub_txg. If we * did it without rebooting, then the scrub dtl * will be valid, so excise the old region and * fold in the scrub dtl. Otherwise, leave the * dtl as-is if there was an error. * * There's little trick here: to excise the beginning * of the DTL_MISSING map, we put it into a reference * tree and then add a segment with refcnt -1 that * covers the range [0, scrub_txg). This means * that each txg in that range has refcnt -1 or 0. * We then add DTL_SCRUB with a refcnt of 2, so that * entries in the range [0, scrub_txg) will have a * positive refcnt -- either 1 or 2. We then convert * the reference tree into the new DTL_MISSING map. */ space_reftree_create(&reftree); space_reftree_add_map(&reftree, vd->vdev_dtl[DTL_MISSING], 1); space_reftree_add_seg(&reftree, 0, scrub_txg, -1); space_reftree_add_map(&reftree, vd->vdev_dtl[DTL_SCRUB], 2); space_reftree_generate_map(&reftree, vd->vdev_dtl[DTL_MISSING], 1); space_reftree_destroy(&reftree); } range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL); range_tree_walk(vd->vdev_dtl[DTL_MISSING], range_tree_add, vd->vdev_dtl[DTL_PARTIAL]); if (scrub_done) range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL); range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL); if (!vdev_readable(vd)) range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL); else range_tree_walk(vd->vdev_dtl[DTL_MISSING], range_tree_add, vd->vdev_dtl[DTL_OUTAGE]); /* * If the vdev was resilvering and no longer has any - * DTLs then reset its resilvering flag. + * DTLs then reset its resilvering flag and dirty + * the top level so that we persist the change. */ if (vd->vdev_resilver_txg != 0 && range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0 && - range_tree_space(vd->vdev_dtl[DTL_OUTAGE]) == 0) + range_tree_space(vd->vdev_dtl[DTL_OUTAGE]) == 0) { vd->vdev_resilver_txg = 0; + vdev_config_dirty(vd->vdev_top); + } mutex_exit(&vd->vdev_dtl_lock); if (txg != 0) vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); return; } mutex_enter(&vd->vdev_dtl_lock); for (int t = 0; t < DTL_TYPES; t++) { /* account for child's outage in parent's missing map */ int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; if (t == DTL_SCRUB) continue; /* leaf vdevs only */ if (t == DTL_PARTIAL) minref = 1; /* i.e. non-zero */ else if (vd->vdev_nparity != 0) minref = vd->vdev_nparity + 1; /* RAID-Z */ else minref = vd->vdev_children; /* any kind of mirror */ space_reftree_create(&reftree); for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; mutex_enter(&cvd->vdev_dtl_lock); space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1); mutex_exit(&cvd->vdev_dtl_lock); } space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref); space_reftree_destroy(&reftree); } mutex_exit(&vd->vdev_dtl_lock); } int vdev_dtl_load(vdev_t *vd) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; int error = 0; if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) { ASSERT(!vd->vdev_ishole); error = space_map_open(&vd->vdev_dtl_sm, mos, vd->vdev_dtl_object, 0, -1ULL, 0, &vd->vdev_dtl_lock); if (error) return (error); ASSERT(vd->vdev_dtl_sm != NULL); mutex_enter(&vd->vdev_dtl_lock); /* * Now that we've opened the space_map we need to update * the in-core DTL. */ space_map_update(vd->vdev_dtl_sm); error = space_map_load(vd->vdev_dtl_sm, vd->vdev_dtl[DTL_MISSING], SM_ALLOC); mutex_exit(&vd->vdev_dtl_lock); return (error); } for (int c = 0; c < vd->vdev_children; c++) { error = vdev_dtl_load(vd->vdev_child[c]); if (error != 0) break; } return (error); } void vdev_dtl_sync(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; range_tree_t *rt = vd->vdev_dtl[DTL_MISSING]; objset_t *mos = spa->spa_meta_objset; range_tree_t *rtsync; kmutex_t rtlock; dmu_tx_t *tx; uint64_t object = space_map_object(vd->vdev_dtl_sm); ASSERT(!vd->vdev_ishole); ASSERT(vd->vdev_ops->vdev_op_leaf); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); if (vd->vdev_detached || vd->vdev_top->vdev_removing) { mutex_enter(&vd->vdev_dtl_lock); space_map_free(vd->vdev_dtl_sm, tx); space_map_close(vd->vdev_dtl_sm); vd->vdev_dtl_sm = NULL; mutex_exit(&vd->vdev_dtl_lock); dmu_tx_commit(tx); return; } if (vd->vdev_dtl_sm == NULL) { uint64_t new_object; new_object = space_map_alloc(mos, tx); VERIFY3U(new_object, !=, 0); VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object, 0, -1ULL, 0, &vd->vdev_dtl_lock)); ASSERT(vd->vdev_dtl_sm != NULL); } bzero(&rtlock, sizeof(rtlock)); mutex_init(&rtlock, NULL, MUTEX_DEFAULT, NULL); rtsync = range_tree_create(NULL, NULL, &rtlock); mutex_enter(&rtlock); mutex_enter(&vd->vdev_dtl_lock); range_tree_walk(rt, range_tree_add, rtsync); mutex_exit(&vd->vdev_dtl_lock); space_map_truncate(vd->vdev_dtl_sm, tx); space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, tx); range_tree_vacate(rtsync, NULL, NULL); range_tree_destroy(rtsync); mutex_exit(&rtlock); mutex_destroy(&rtlock); /* * If the object for the space map has changed then dirty * the top level so that we update the config. */ if (object != space_map_object(vd->vdev_dtl_sm)) { zfs_dbgmsg("txg %llu, spa %s, DTL old object %llu, " "new object %llu", txg, spa_name(spa), object, space_map_object(vd->vdev_dtl_sm)); vdev_config_dirty(vd->vdev_top); } dmu_tx_commit(tx); mutex_enter(&vd->vdev_dtl_lock); space_map_update(vd->vdev_dtl_sm); mutex_exit(&vd->vdev_dtl_lock); } /* * Determine whether the specified vdev can be offlined/detached/removed * without losing data. */ boolean_t vdev_dtl_required(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *tvd = vd->vdev_top; uint8_t cant_read = vd->vdev_cant_read; boolean_t required; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (vd == spa->spa_root_vdev || vd == tvd) return (B_TRUE); /* * Temporarily mark the device as unreadable, and then determine * whether this results in any DTL outages in the top-level vdev. * If not, we can safely offline/detach/remove the device. */ vd->vdev_cant_read = B_TRUE; vdev_dtl_reassess(tvd, 0, 0, B_FALSE); required = !vdev_dtl_empty(tvd, DTL_OUTAGE); vd->vdev_cant_read = cant_read; vdev_dtl_reassess(tvd, 0, 0, B_FALSE); if (!required && zio_injection_enabled) required = !!zio_handle_device_injection(vd, NULL, ECHILD); return (required); } /* * Determine if resilver is needed, and if so the txg range. */ boolean_t vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) { boolean_t needed = B_FALSE; uint64_t thismin = UINT64_MAX; uint64_t thismax = 0; if (vd->vdev_children == 0) { mutex_enter(&vd->vdev_dtl_lock); if (range_tree_space(vd->vdev_dtl[DTL_MISSING]) != 0 && vdev_writeable(vd)) { thismin = vdev_dtl_min(vd); thismax = vdev_dtl_max(vd); needed = B_TRUE; } mutex_exit(&vd->vdev_dtl_lock); } else { for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; uint64_t cmin, cmax; if (vdev_resilver_needed(cvd, &cmin, &cmax)) { thismin = MIN(thismin, cmin); thismax = MAX(thismax, cmax); needed = B_TRUE; } } } if (needed && minp) { *minp = thismin; *maxp = thismax; } return (needed); } void vdev_load(vdev_t *vd) { /* * Recursively load all children. */ for (int c = 0; c < vd->vdev_children; c++) vdev_load(vd->vdev_child[c]); /* * If this is a top-level vdev, initialize its metaslabs. */ if (vd == vd->vdev_top && !vd->vdev_ishole && (vd->vdev_ashift == 0 || vd->vdev_asize == 0 || vdev_metaslab_init(vd, 0) != 0)) vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); /* * If this is a leaf vdev, load its DTL. */ if (vd->vdev_ops->vdev_op_leaf && vdev_dtl_load(vd) != 0) vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); } /* * The special vdev case is used for hot spares and l2cache devices. Its * sole purpose it to set the vdev state for the associated vdev. To do this, * we make sure that we can open the underlying device, then try to read the * label, and make sure that the label is sane and that it hasn't been * repurposed to another pool. */ int vdev_validate_aux(vdev_t *vd) { nvlist_t *label; uint64_t guid, version; uint64_t state; if (!vdev_readable(vd)) return (0); if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); return (-1); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 || !SPA_VERSION_IS_SUPPORTED(version) || nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || guid != vd->vdev_guid || nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); return (-1); } /* * We don't actually check the pool state here. If it's in fact in * use by another pool, we update this fact on the fly when requested. */ nvlist_free(label); return (0); } void vdev_remove(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; dmu_tx_t *tx; tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); if (vd->vdev_ms != NULL) { metaslab_group_t *mg = vd->vdev_mg; metaslab_group_histogram_verify(mg); metaslab_class_histogram_verify(mg->mg_class); for (int m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; if (msp == NULL || msp->ms_sm == NULL) continue; mutex_enter(&msp->ms_lock); /* * If the metaslab was not loaded when the vdev * was removed then the histogram accounting may * not be accurate. Update the histogram information * here so that we ensure that the metaslab group * and metaslab class are up-to-date. */ metaslab_group_histogram_remove(mg, msp); VERIFY0(space_map_allocated(msp->ms_sm)); space_map_free(msp->ms_sm, tx); space_map_close(msp->ms_sm); msp->ms_sm = NULL; mutex_exit(&msp->ms_lock); } metaslab_group_histogram_verify(mg); metaslab_class_histogram_verify(mg->mg_class); for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) ASSERT0(mg->mg_histogram[i]); } if (vd->vdev_ms_array) { (void) dmu_object_free(mos, vd->vdev_ms_array, tx); vd->vdev_ms_array = 0; } dmu_tx_commit(tx); } void vdev_sync_done(vdev_t *vd, uint64_t txg) { metaslab_t *msp; boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg)); ASSERT(!vd->vdev_ishole); while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) metaslab_sync_done(msp, txg); if (reassess) metaslab_sync_reassess(vd->vdev_mg); } void vdev_sync(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; vdev_t *lvd; metaslab_t *msp; dmu_tx_t *tx; ASSERT(!vd->vdev_ishole); if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) { ASSERT(vd == vd->vdev_top); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); ASSERT(vd->vdev_ms_array != 0); vdev_config_dirty(vd); dmu_tx_commit(tx); } /* * Remove the metadata associated with this vdev once it's empty. */ if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) vdev_remove(vd, txg); while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { metaslab_sync(msp, txg); (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); } while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) vdev_dtl_sync(lvd, txg); (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); } uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize) { return (vd->vdev_ops->vdev_op_asize(vd, psize)); } /* * Mark the given vdev faulted. A faulted vdev behaves as if the device could * not be opened, and no I/O is attempted. */ int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) { vdev_t *vd, *tvd; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); tvd = vd->vdev_top; /* * We don't directly use the aux state here, but if we do a * vdev_reopen(), we need this value to be present to remember why we * were faulted. */ vd->vdev_label_aux = aux; /* * Faulted state takes precedence over degraded. */ vd->vdev_delayed_close = B_FALSE; vd->vdev_faulted = 1ULL; vd->vdev_degraded = 0ULL; vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux); /* * If this device has the only valid copy of the data, then * back off and simply mark the vdev as degraded instead. */ if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { vd->vdev_degraded = 1ULL; vd->vdev_faulted = 0ULL; /* * If we reopen the device and it's not dead, only then do we * mark it degraded. */ vdev_reopen(tvd); if (vdev_readable(vd)) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); } return (spa_vdev_state_exit(spa, vd, 0)); } /* * Mark the given vdev degraded. A degraded vdev is purely an indication to the * user that something is wrong. The vdev continues to operate as normal as far * as I/O is concerned. */ int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) { vdev_t *vd; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); /* * If the vdev is already faulted, then don't do anything. */ if (vd->vdev_faulted || vd->vdev_degraded) return (spa_vdev_state_exit(spa, NULL, 0)); vd->vdev_degraded = 1ULL; if (!vdev_is_dead(vd)) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); return (spa_vdev_state_exit(spa, vd, 0)); } /* * Online the given vdev. * * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached * spare device should be detached when the device finishes resilvering. * Second, the online should be treated like a 'test' online case, so no FMA * events are generated if the device fails to open. */ int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) { vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); tvd = vd->vdev_top; vd->vdev_offline = B_FALSE; vd->vdev_tmpoffline = B_FALSE; vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE); vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT); /* XXX - L2ARC 1.0 does not support expansion */ if (!vd->vdev_aux) { for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND); } vdev_reopen(tvd); vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; if (!vd->vdev_aux) { for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) pvd->vdev_expanding = B_FALSE; } if (newstate) *newstate = vd->vdev_state; if ((flags & ZFS_ONLINE_UNSPARE) && !vdev_is_dead(vd) && vd->vdev_parent && vd->vdev_parent->vdev_ops == &vdev_spare_ops && vd->vdev_parent->vdev_child[0] == vd) vd->vdev_unspare = B_TRUE; if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) { /* XXX - L2ARC 1.0 does not support expansion */ if (vd->vdev_aux) return (spa_vdev_state_exit(spa, vd, ENOTSUP)); spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } return (spa_vdev_state_exit(spa, vd, 0)); } static int vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) { vdev_t *vd, *tvd; int error = 0; uint64_t generation; metaslab_group_t *mg; top: spa_vdev_state_enter(spa, SCL_ALLOC); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); tvd = vd->vdev_top; mg = tvd->vdev_mg; generation = spa->spa_config_generation + 1; /* * If the device isn't already offline, try to offline it. */ if (!vd->vdev_offline) { /* * If this device has the only valid copy of some data, * don't allow it to be offlined. Log devices are always * expendable. */ if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) return (spa_vdev_state_exit(spa, NULL, EBUSY)); /* * If the top-level is a slog and it has had allocations * then proceed. We check that the vdev's metaslab group * is not NULL since it's possible that we may have just * added this vdev but not yet initialized its metaslabs. */ if (tvd->vdev_islog && mg != NULL) { /* * Prevent any future allocations. */ metaslab_group_passivate(mg); (void) spa_vdev_state_exit(spa, vd, 0); error = spa_offline_log(spa); spa_vdev_state_enter(spa, SCL_ALLOC); /* * Check to see if the config has changed. */ if (error || generation != spa->spa_config_generation) { metaslab_group_activate(mg); if (error) return (spa_vdev_state_exit(spa, vd, error)); (void) spa_vdev_state_exit(spa, vd, 0); goto top; } ASSERT0(tvd->vdev_stat.vs_alloc); } /* * Offline this device and reopen its top-level vdev. * If the top-level vdev is a log device then just offline * it. Otherwise, if this action results in the top-level * vdev becoming unusable, undo it and fail the request. */ vd->vdev_offline = B_TRUE; vdev_reopen(tvd); if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_is_dead(tvd)) { vd->vdev_offline = B_FALSE; vdev_reopen(tvd); return (spa_vdev_state_exit(spa, NULL, EBUSY)); } /* * Add the device back into the metaslab rotor so that * once we online the device it's open for business. */ if (tvd->vdev_islog && mg != NULL) metaslab_group_activate(mg); } vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); return (spa_vdev_state_exit(spa, vd, 0)); } int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) { int error; mutex_enter(&spa->spa_vdev_top_lock); error = vdev_offline_locked(spa, guid, flags); mutex_exit(&spa->spa_vdev_top_lock); return (error); } /* * Clear the error counts associated with this vdev. Unlike vdev_online() and * vdev_offline(), we assume the spa config is locked. We also clear all * children. If 'vd' is NULL, then the user wants to clear all vdevs. */ void vdev_clear(spa_t *spa, vdev_t *vd) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (vd == NULL) vd = rvd; vd->vdev_stat.vs_read_errors = 0; vd->vdev_stat.vs_write_errors = 0; vd->vdev_stat.vs_checksum_errors = 0; for (int c = 0; c < vd->vdev_children; c++) vdev_clear(spa, vd->vdev_child[c]); if (vd == rvd) { for (int c = 0; c < spa->spa_l2cache.sav_count; c++) vdev_clear(spa, spa->spa_l2cache.sav_vdevs[c]); for (int c = 0; c < spa->spa_spares.sav_count; c++) vdev_clear(spa, spa->spa_spares.sav_vdevs[c]); } /* * If we're in the FAULTED state or have experienced failed I/O, then * clear the persistent state and attempt to reopen the device. We * also mark the vdev config dirty, so that the new faulted state is * written out to disk. */ if (vd->vdev_faulted || vd->vdev_degraded || !vdev_readable(vd) || !vdev_writeable(vd)) { /* * When reopening in reponse to a clear event, it may be due to * a fmadm repair request. In this case, if the device is * still broken, we want to still post the ereport again. */ vd->vdev_forcefault = B_TRUE; vd->vdev_faulted = vd->vdev_degraded = 0ULL; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; vdev_reopen(vd == rvd ? rvd : vd->vdev_top); vd->vdev_forcefault = B_FALSE; if (vd != rvd && vdev_writeable(vd->vdev_top)) vdev_state_dirty(vd->vdev_top); if (vd->vdev_aux == NULL && !vdev_is_dead(vd)) spa_async_request(spa, SPA_ASYNC_RESILVER); spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR); } /* * When clearing a FMA-diagnosed fault, we always want to * unspare the device, as we assume that the original spare was * done in response to the FMA fault. */ if (!vdev_is_dead(vd) && vd->vdev_parent != NULL && vd->vdev_parent->vdev_ops == &vdev_spare_ops && vd->vdev_parent->vdev_child[0] == vd) vd->vdev_unspare = B_TRUE; } boolean_t vdev_is_dead(vdev_t *vd) { /* * Holes and missing devices are always considered "dead". * This simplifies the code since we don't have to check for * these types of devices in the various code paths. * Instead we rely on the fact that we skip over dead devices * before issuing I/O to them. */ return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops); } boolean_t vdev_readable(vdev_t *vd) { return (!vdev_is_dead(vd) && !vd->vdev_cant_read); } boolean_t vdev_writeable(vdev_t *vd) { return (!vdev_is_dead(vd) && !vd->vdev_cant_write); } boolean_t vdev_allocatable(vdev_t *vd) { uint64_t state = vd->vdev_state; /* * We currently allow allocations from vdevs which may be in the * process of reopening (i.e. VDEV_STATE_CLOSED). If the device * fails to reopen then we'll catch it later when we're holding * the proper locks. Note that we have to get the vdev state * in a local variable because although it changes atomically, * we're asking two separate questions about it. */ return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && !vd->vdev_cant_write && !vd->vdev_ishole); } boolean_t vdev_accessible(vdev_t *vd, zio_t *zio) { ASSERT(zio->io_vd == vd); if (vdev_is_dead(vd) || vd->vdev_remove_wanted) return (B_FALSE); if (zio->io_type == ZIO_TYPE_READ) return (!vd->vdev_cant_read); if (zio->io_type == ZIO_TYPE_WRITE) return (!vd->vdev_cant_write); return (B_TRUE); } /* * Get statistics for the given vdev. */ void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); mutex_enter(&vd->vdev_stat_lock); bcopy(&vd->vdev_stat, vs, sizeof (*vs)); vs->vs_timestamp = gethrtime() - vs->vs_timestamp; vs->vs_state = vd->vdev_state; vs->vs_rsize = vdev_get_min_asize(vd); if (vd->vdev_ops->vdev_op_leaf) vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; vs->vs_esize = vd->vdev_max_asize - vd->vdev_asize; vs->vs_configured_ashift = vd->vdev_top != NULL ? vd->vdev_top->vdev_ashift : vd->vdev_ashift; vs->vs_logical_ashift = vd->vdev_logical_ashift; vs->vs_physical_ashift = vd->vdev_physical_ashift; if (vd->vdev_aux == NULL && vd == vd->vdev_top && !vd->vdev_ishole) { vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation; } /* * If we're getting stats on the root vdev, aggregate the I/O counts * over all top-level vdevs (i.e. the direct children of the root). */ if (vd == rvd) { for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *cvd = rvd->vdev_child[c]; vdev_stat_t *cvs = &cvd->vdev_stat; for (int t = 0; t < ZIO_TYPES; t++) { vs->vs_ops[t] += cvs->vs_ops[t]; vs->vs_bytes[t] += cvs->vs_bytes[t]; } cvs->vs_scan_removing = cvd->vdev_removing; } } mutex_exit(&vd->vdev_stat_lock); } void vdev_clear_stats(vdev_t *vd) { mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_space = 0; vd->vdev_stat.vs_dspace = 0; vd->vdev_stat.vs_alloc = 0; mutex_exit(&vd->vdev_stat_lock); } void vdev_scan_stat_init(vdev_t *vd) { vdev_stat_t *vs = &vd->vdev_stat; for (int c = 0; c < vd->vdev_children; c++) vdev_scan_stat_init(vd->vdev_child[c]); mutex_enter(&vd->vdev_stat_lock); vs->vs_scan_processed = 0; mutex_exit(&vd->vdev_stat_lock); } void vdev_stat_update(zio_t *zio, uint64_t psize) { spa_t *spa = zio->io_spa; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd = zio->io_vd ? zio->io_vd : rvd; vdev_t *pvd; uint64_t txg = zio->io_txg; vdev_stat_t *vs = &vd->vdev_stat; zio_type_t type = zio->io_type; int flags = zio->io_flags; /* * If this i/o is a gang leader, it didn't do any actual work. */ if (zio->io_gang_tree) return; if (zio->io_error == 0) { /* * If this is a root i/o, don't count it -- we've already * counted the top-level vdevs, and vdev_get_stats() will * aggregate them when asked. This reduces contention on * the root vdev_stat_lock and implicitly handles blocks * that compress away to holes, for which there is no i/o. * (Holes never create vdev children, so all the counters * remain zero, which is what we want.) * * Note: this only applies to successful i/o (io_error == 0) * because unlike i/o counts, errors are not additive. * When reading a ditto block, for example, failure of * one top-level vdev does not imply a root-level error. */ if (vd == rvd) return; ASSERT(vd == zio->io_vd); if (flags & ZIO_FLAG_IO_BYPASS) return; mutex_enter(&vd->vdev_stat_lock); if (flags & ZIO_FLAG_IO_REPAIR) { if (flags & ZIO_FLAG_SCAN_THREAD) { dsl_scan_phys_t *scn_phys = &spa->spa_dsl_pool->dp_scan->scn_phys; uint64_t *processed = &scn_phys->scn_processed; /* XXX cleanup? */ if (vd->vdev_ops->vdev_op_leaf) atomic_add_64(processed, psize); vs->vs_scan_processed += psize; } if (flags & ZIO_FLAG_SELF_HEAL) vs->vs_self_healed += psize; } vs->vs_ops[type]++; vs->vs_bytes[type] += psize; mutex_exit(&vd->vdev_stat_lock); return; } if (flags & ZIO_FLAG_SPECULATIVE) return; /* * If this is an I/O error that is going to be retried, then ignore the * error. Otherwise, the user may interpret B_FAILFAST I/O errors as * hard errors, when in reality they can happen for any number of * innocuous reasons (bus resets, MPxIO link failure, etc). */ if (zio->io_error == EIO && !(zio->io_flags & ZIO_FLAG_IO_RETRY)) return; /* * Intent logs writes won't propagate their error to the root * I/O so don't mark these types of failures as pool-level * errors. */ if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) return; mutex_enter(&vd->vdev_stat_lock); if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) { if (zio->io_error == ECKSUM) vs->vs_checksum_errors++; else vs->vs_read_errors++; } if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd)) vs->vs_write_errors++; mutex_exit(&vd->vdev_stat_lock); if (type == ZIO_TYPE_WRITE && txg != 0 && (!(flags & ZIO_FLAG_IO_REPAIR) || (flags & ZIO_FLAG_SCAN_THREAD) || spa->spa_claiming)) { /* * This is either a normal write (not a repair), or it's * a repair induced by the scrub thread, or it's a repair * made by zil_claim() during spa_load() in the first txg. * In the normal case, we commit the DTL change in the same * txg as the block was born. In the scrub-induced repair * case, we know that scrubs run in first-pass syncing context, * so we commit the DTL change in spa_syncing_txg(spa). * In the zil_claim() case, we commit in spa_first_txg(spa). * * We currently do not make DTL entries for failed spontaneous * self-healing writes triggered by normal (non-scrubbing) * reads, because we have no transactional context in which to * do so -- and it's not clear that it'd be desirable anyway. */ if (vd->vdev_ops->vdev_op_leaf) { uint64_t commit_txg = txg; if (flags & ZIO_FLAG_SCAN_THREAD) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); ASSERT(spa_sync_pass(spa) == 1); vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1); commit_txg = spa_syncing_txg(spa); } else if (spa->spa_claiming) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); commit_txg = spa_first_txg(spa); } ASSERT(commit_txg >= spa_syncing_txg(spa)); if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1)) return; for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1); vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg); } if (vd != rvd) vdev_dtl_dirty(vd, DTL_MISSING, txg, 1); } } /* * Update the in-core space usage stats for this vdev, its metaslab class, * and the root vdev. */ void vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta) { int64_t dspace_delta = space_delta; spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; metaslab_group_t *mg = vd->vdev_mg; metaslab_class_t *mc = mg ? mg->mg_class : NULL; ASSERT(vd == vd->vdev_top); /* * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion * factor. We must calculate this here and not at the root vdev * because the root vdev's psize-to-asize is simply the max of its * childrens', thus not accurate enough for us. */ ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0); ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio; mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_alloc += alloc_delta; vd->vdev_stat.vs_space += space_delta; vd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&vd->vdev_stat_lock); if (mc == spa_normal_class(spa)) { mutex_enter(&rvd->vdev_stat_lock); rvd->vdev_stat.vs_alloc += alloc_delta; rvd->vdev_stat.vs_space += space_delta; rvd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&rvd->vdev_stat_lock); } if (mc != NULL) { ASSERT(rvd == vd->vdev_parent); ASSERT(vd->vdev_ms_count != 0); metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta, dspace_delta); } } /* * Mark a top-level vdev's config as dirty, placing it on the dirty list * so that it will be written out next time the vdev configuration is synced. * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs. */ void vdev_config_dirty(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; int c; ASSERT(spa_writeable(spa)); /* * If this is an aux vdev (as with l2cache and spare devices), then we * update the vdev config manually and set the sync flag. */ if (vd->vdev_aux != NULL) { spa_aux_vdev_t *sav = vd->vdev_aux; nvlist_t **aux; uint_t naux; for (c = 0; c < sav->sav_count; c++) { if (sav->sav_vdevs[c] == vd) break; } if (c == sav->sav_count) { /* * We're being removed. There's nothing more to do. */ ASSERT(sav->sav_sync == B_TRUE); return; } sav->sav_sync = B_TRUE; if (nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) { VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_SPARES, &aux, &naux) == 0); } ASSERT(c < naux); /* * Setting the nvlist in the middle if the array is a little * sketchy, but it will work. */ nvlist_free(aux[c]); aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0); return; } /* * The dirty list is protected by the SCL_CONFIG lock. The caller * must either hold SCL_CONFIG as writer, or must be the sync thread * (which holds SCL_CONFIG as reader). There's only one sync thread, * so this is sufficient to ensure mutual exclusion. */ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_CONFIG, RW_READER))); if (vd == rvd) { for (c = 0; c < rvd->vdev_children; c++) vdev_config_dirty(rvd->vdev_child[c]); } else { ASSERT(vd == vd->vdev_top); if (!list_link_active(&vd->vdev_config_dirty_node) && !vd->vdev_ishole) list_insert_head(&spa->spa_config_dirty_list, vd); } } void vdev_config_clean(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_CONFIG, RW_READER))); ASSERT(list_link_active(&vd->vdev_config_dirty_node)); list_remove(&spa->spa_config_dirty_list, vd); } /* * Mark a top-level vdev's state as dirty, so that the next pass of * spa_sync() can convert this into vdev_config_dirty(). We distinguish * the state changes from larger config changes because they require * much less locking, and are often needed for administrative actions. */ void vdev_state_dirty(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_writeable(spa)); ASSERT(vd == vd->vdev_top); /* * The state list is protected by the SCL_STATE lock. The caller * must either hold SCL_STATE as writer, or must be the sync thread * (which holds SCL_STATE as reader). There's only one sync thread, * so this is sufficient to ensure mutual exclusion. */ ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_STATE, RW_READER))); if (!list_link_active(&vd->vdev_state_dirty_node) && !vd->vdev_ishole) list_insert_head(&spa->spa_state_dirty_list, vd); } void vdev_state_clean(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_STATE, RW_READER))); ASSERT(list_link_active(&vd->vdev_state_dirty_node)); list_remove(&spa->spa_state_dirty_list, vd); } /* * Propagate vdev state up from children to parent. */ void vdev_propagate_state(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; int degraded = 0, faulted = 0; int corrupted = 0; vdev_t *child; if (vd->vdev_children > 0) { for (int c = 0; c < vd->vdev_children; c++) { child = vd->vdev_child[c]; /* * Don't factor holes into the decision. */ if (child->vdev_ishole) continue; if (!vdev_readable(child) || (!vdev_writeable(child) && spa_writeable(spa))) { /* * Root special: if there is a top-level log * device, treat the root vdev as if it were * degraded. */ if (child->vdev_islog && vd == rvd) degraded++; else faulted++; } else if (child->vdev_state <= VDEV_STATE_DEGRADED) { degraded++; } if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) corrupted++; } vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); /* * Root special: if there is a top-level vdev that cannot be * opened due to corrupted metadata, then propagate the root * vdev's aux state as 'corrupt' rather than 'insufficient * replicas'. */ if (corrupted && vd == rvd && rvd->vdev_state == VDEV_STATE_CANT_OPEN) vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); } if (vd->vdev_parent) vdev_propagate_state(vd->vdev_parent); } /* * Set a vdev's state. If this is during an open, we don't update the parent * state, because we're in the process of opening children depth-first. * Otherwise, we propagate the change to the parent. * * If this routine places a device in a faulted state, an appropriate ereport is * generated. */ void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) { uint64_t save_state; spa_t *spa = vd->vdev_spa; if (state == vd->vdev_state) { vd->vdev_stat.vs_aux = aux; return; } save_state = vd->vdev_state; vd->vdev_state = state; vd->vdev_stat.vs_aux = aux; /* * If we are setting the vdev state to anything but an open state, then * always close the underlying device unless the device has requested * a delayed close (i.e. we're about to remove or fault the device). * Otherwise, we keep accessible but invalid devices open forever. * We don't call vdev_close() itself, because that implies some extra * checks (offline, etc) that we don't want here. This is limited to * leaf devices, because otherwise closing the device will affect other * children. */ if (!vd->vdev_delayed_close && vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf) vd->vdev_ops->vdev_op_close(vd); /* * If we have brought this vdev back into service, we need * to notify fmd so that it can gracefully repair any outstanding * cases due to a missing device. We do this in all cases, even those * that probably don't correlate to a repaired fault. This is sure to * catch all cases, and we let the zfs-retire agent sort it out. If * this is a transient state it's OK, as the retire agent will * double-check the state of the vdev before repairing it. */ if (state == VDEV_STATE_HEALTHY && vd->vdev_ops->vdev_op_leaf && vd->vdev_prevstate != state) zfs_post_state_change(spa, vd); if (vd->vdev_removed && state == VDEV_STATE_CANT_OPEN && (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { /* * If the previous state is set to VDEV_STATE_REMOVED, then this * device was previously marked removed and someone attempted to * reopen it. If this failed due to a nonexistent device, then * keep the device in the REMOVED state. We also let this be if * it is one of our special test online cases, which is only * attempting to online the device and shouldn't generate an FMA * fault. */ vd->vdev_state = VDEV_STATE_REMOVED; vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } else if (state == VDEV_STATE_REMOVED) { vd->vdev_removed = B_TRUE; } else if (state == VDEV_STATE_CANT_OPEN) { /* * If we fail to open a vdev during an import or recovery, we * mark it as "not available", which signifies that it was * never there to begin with. Failure to open such a device * is not considered an error. */ if ((spa_load_state(spa) == SPA_LOAD_IMPORT || spa_load_state(spa) == SPA_LOAD_RECOVER) && vd->vdev_ops->vdev_op_leaf) vd->vdev_not_present = 1; /* * Post the appropriate ereport. If the 'prevstate' field is * set to something other than VDEV_STATE_UNKNOWN, it indicates * that this is part of a vdev_reopen(). In this case, we don't * want to post the ereport if the device was already in the * CANT_OPEN state beforehand. * * If the 'checkremove' flag is set, then this is an attempt to * online the device in response to an insertion event. If we * hit this case, then we have detected an insertion event for a * faulted or offline device that wasn't in the removed state. * In this scenario, we don't post an ereport because we are * about to replace the device, or attempt an online with * vdev_forcefault, which will generate the fault for us. */ if ((vd->vdev_prevstate != state || vd->vdev_forcefault) && !vd->vdev_not_present && !vd->vdev_checkremove && vd != spa->spa_root_vdev) { const char *class; switch (aux) { case VDEV_AUX_OPEN_FAILED: class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED; break; case VDEV_AUX_CORRUPT_DATA: class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA; break; case VDEV_AUX_NO_REPLICAS: class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS; break; case VDEV_AUX_BAD_GUID_SUM: class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM; break; case VDEV_AUX_TOO_SMALL: class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL; break; case VDEV_AUX_BAD_LABEL: class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; break; default: class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; } zfs_ereport_post(class, spa, vd, NULL, save_state, 0); } /* Erase any notion of persistent removed state */ vd->vdev_removed = B_FALSE; } else { vd->vdev_removed = B_FALSE; } if (!isopen && vd->vdev_parent) vdev_propagate_state(vd->vdev_parent); } /* * Check the vdev configuration to ensure that it's capable of supporting * a root pool. * * On Solaris, we do not support RAID-Z or partial configuration. In * addition, only a single top-level vdev is allowed and none of the * leaves can be wholedisks. * * For FreeBSD, we can boot from any configuration. There is a * limitation that the boot filesystem must be either uncompressed or * compresses with lzjb compression but I'm not sure how to enforce * that here. */ boolean_t vdev_is_bootable(vdev_t *vd) { #ifdef sun if (!vd->vdev_ops->vdev_op_leaf) { char *vdev_type = vd->vdev_ops->vdev_op_type; if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 && vd->vdev_children > 1) { return (B_FALSE); } else if (strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 || strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) { return (B_FALSE); } } else if (vd->vdev_wholedisk == 1) { return (B_FALSE); } for (int c = 0; c < vd->vdev_children; c++) { if (!vdev_is_bootable(vd->vdev_child[c])) return (B_FALSE); } #endif /* sun */ return (B_TRUE); } /* * Load the state from the original vdev tree (ovd) which * we've retrieved from the MOS config object. If the original * vdev was offline or faulted then we transfer that state to the * device in the current vdev tree (nvd). */ void vdev_load_log_state(vdev_t *nvd, vdev_t *ovd) { spa_t *spa = nvd->vdev_spa; ASSERT(nvd->vdev_top->vdev_islog); ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid); for (int c = 0; c < nvd->vdev_children; c++) vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]); if (nvd->vdev_ops->vdev_op_leaf) { /* * Restore the persistent vdev state */ nvd->vdev_offline = ovd->vdev_offline; nvd->vdev_faulted = ovd->vdev_faulted; nvd->vdev_degraded = ovd->vdev_degraded; nvd->vdev_removed = ovd->vdev_removed; } } /* * Determine if a log device has valid content. If the vdev was * removed or faulted in the MOS config then we know that * the content on the log device has already been written to the pool. */ boolean_t vdev_log_state_valid(vdev_t *vd) { if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && !vd->vdev_removed) return (B_TRUE); for (int c = 0; c < vd->vdev_children; c++) if (vdev_log_state_valid(vd->vdev_child[c])) return (B_TRUE); return (B_FALSE); } /* * Expand a vdev if possible. */ void vdev_expand(vdev_t *vd, uint64_t txg) { ASSERT(vd->vdev_top == vd); ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) { VERIFY(vdev_metaslab_init(vd, txg) == 0); vdev_config_dirty(vd); } } /* * Split a vdev. */ void vdev_split(vdev_t *vd) { vdev_t *cvd, *pvd = vd->vdev_parent; vdev_remove_child(pvd, vd); vdev_compact_children(pvd); cvd = pvd->vdev_child[0]; if (pvd->vdev_children == 1) { vdev_remove_parent(cvd); cvd->vdev_splitting = B_TRUE; } vdev_propagate_state(cvd); } void vdev_deadman(vdev_t *vd) { for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; vdev_deadman(cvd); } if (vd->vdev_ops->vdev_op_leaf) { vdev_queue_t *vq = &vd->vdev_queue; mutex_enter(&vq->vq_lock); if (avl_numnodes(&vq->vq_active_tree) > 0) { spa_t *spa = vd->vdev_spa; zio_t *fio; uint64_t delta; /* * Look at the head of all the pending queues, * if any I/O has been outstanding for longer than * the spa_deadman_synctime we panic the system. */ fio = avl_first(&vq->vq_active_tree); delta = gethrtime() - fio->io_timestamp; if (delta > spa_deadman_synctime(spa)) { zfs_dbgmsg("SLOW IO: zio timestamp %lluns, " "delta %lluns, last io %lluns", fio->io_timestamp, delta, vq->vq_io_complete_ts); fm_panic("I/O to pool '%s' appears to be " "hung on vdev guid %llu at '%s'.", spa_name(spa), (long long unsigned int) vd->vdev_guid, vd->vdev_path); } } mutex_exit(&vq->vq_lock); } } Index: user/ae/inet6/sys/cddl/contrib/opensolaris =================================================================== --- user/ae/inet6/sys/cddl/contrib/opensolaris (revision 271452) +++ user/ae/inet6/sys/cddl/contrib/opensolaris (revision 271453) Property changes on: user/ae/inet6/sys/cddl/contrib/opensolaris ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/sys/cddl/contrib/opensolaris:r271428-271452 Index: user/ae/inet6/sys/dev/cxgbe/t4_main.c =================================================================== --- user/ae/inet6/sys/dev/cxgbe/t4_main.c (revision 271452) +++ user/ae/inet6/sys/dev/cxgbe/t4_main.c (revision 271453) @@ -1,8388 +1,8467 @@ /*- * Copyright (c) 2011 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__i386__) || defined(__amd64__) #include #include #endif #include "common/common.h" #include "common/t4_msg.h" #include "common/t4_regs.h" #include "common/t4_regs_values.h" #include "t4_ioctl.h" #include "t4_l2t.h" /* T4 bus driver interface */ static int t4_probe(device_t); static int t4_attach(device_t); static int t4_detach(device_t); static device_method_t t4_methods[] = { DEVMETHOD(device_probe, t4_probe), DEVMETHOD(device_attach, t4_attach), DEVMETHOD(device_detach, t4_detach), DEVMETHOD_END }; static driver_t t4_driver = { "t4nex", t4_methods, sizeof(struct adapter) }; /* T4 port (cxgbe) interface */ static int cxgbe_probe(device_t); static int cxgbe_attach(device_t); static int cxgbe_detach(device_t); static device_method_t cxgbe_methods[] = { DEVMETHOD(device_probe, cxgbe_probe), DEVMETHOD(device_attach, cxgbe_attach), DEVMETHOD(device_detach, cxgbe_detach), { 0, 0 } }; static driver_t cxgbe_driver = { "cxgbe", cxgbe_methods, sizeof(struct port_info) }; static d_ioctl_t t4_ioctl; static d_open_t t4_open; static d_close_t t4_close; static struct cdevsw t4_cdevsw = { .d_version = D_VERSION, .d_flags = 0, .d_open = t4_open, .d_close = t4_close, .d_ioctl = t4_ioctl, .d_name = "t4nex", }; /* T5 bus driver interface */ static int t5_probe(device_t); static device_method_t t5_methods[] = { DEVMETHOD(device_probe, t5_probe), DEVMETHOD(device_attach, t4_attach), DEVMETHOD(device_detach, t4_detach), DEVMETHOD_END }; static driver_t t5_driver = { "t5nex", t5_methods, sizeof(struct adapter) }; /* T5 port (cxl) interface */ static driver_t cxl_driver = { "cxl", cxgbe_methods, sizeof(struct port_info) }; static struct cdevsw t5_cdevsw = { .d_version = D_VERSION, .d_flags = 0, .d_open = t4_open, .d_close = t4_close, .d_ioctl = t4_ioctl, .d_name = "t5nex", }; /* ifnet + media interface */ static void cxgbe_init(void *); static int cxgbe_ioctl(struct ifnet *, unsigned long, caddr_t); static int cxgbe_transmit(struct ifnet *, struct mbuf *); static void cxgbe_qflush(struct ifnet *); static int cxgbe_media_change(struct ifnet *); static void cxgbe_media_status(struct ifnet *, struct ifmediareq *); MALLOC_DEFINE(M_CXGBE, "cxgbe", "Chelsio T4/T5 Ethernet driver and services"); /* * Correct lock order when you need to acquire multiple locks is t4_list_lock, * then ADAPTER_LOCK, then t4_uld_list_lock. */ static struct sx t4_list_lock; SLIST_HEAD(, adapter) t4_list; #ifdef TCP_OFFLOAD static struct sx t4_uld_list_lock; SLIST_HEAD(, uld_info) t4_uld_list; #endif /* * Tunables. See tweak_tunables() too. * * Each tunable is set to a default value here if it's known at compile-time. * Otherwise it is set to -1 as an indication to tweak_tunables() that it should * provide a reasonable default when the driver is loaded. * * Tunables applicable to both T4 and T5 are under hw.cxgbe. Those specific to * T5 are under hw.cxl. */ /* * Number of queues for tx and rx, 10G and 1G, NIC and offload. */ #define NTXQ_10G 16 static int t4_ntxq10g = -1; TUNABLE_INT("hw.cxgbe.ntxq10g", &t4_ntxq10g); #define NRXQ_10G 8 static int t4_nrxq10g = -1; TUNABLE_INT("hw.cxgbe.nrxq10g", &t4_nrxq10g); #define NTXQ_1G 4 static int t4_ntxq1g = -1; TUNABLE_INT("hw.cxgbe.ntxq1g", &t4_ntxq1g); #define NRXQ_1G 2 static int t4_nrxq1g = -1; TUNABLE_INT("hw.cxgbe.nrxq1g", &t4_nrxq1g); static int t4_rsrv_noflowq = 0; TUNABLE_INT("hw.cxgbe.rsrv_noflowq", &t4_rsrv_noflowq); #ifdef TCP_OFFLOAD #define NOFLDTXQ_10G 8 static int t4_nofldtxq10g = -1; TUNABLE_INT("hw.cxgbe.nofldtxq10g", &t4_nofldtxq10g); #define NOFLDRXQ_10G 2 static int t4_nofldrxq10g = -1; TUNABLE_INT("hw.cxgbe.nofldrxq10g", &t4_nofldrxq10g); #define NOFLDTXQ_1G 2 static int t4_nofldtxq1g = -1; TUNABLE_INT("hw.cxgbe.nofldtxq1g", &t4_nofldtxq1g); #define NOFLDRXQ_1G 1 static int t4_nofldrxq1g = -1; TUNABLE_INT("hw.cxgbe.nofldrxq1g", &t4_nofldrxq1g); #endif #ifdef DEV_NETMAP #define NNMTXQ_10G 2 static int t4_nnmtxq10g = -1; TUNABLE_INT("hw.cxgbe.nnmtxq10g", &t4_nnmtxq10g); #define NNMRXQ_10G 2 static int t4_nnmrxq10g = -1; TUNABLE_INT("hw.cxgbe.nnmrxq10g", &t4_nnmrxq10g); #define NNMTXQ_1G 1 static int t4_nnmtxq1g = -1; TUNABLE_INT("hw.cxgbe.nnmtxq1g", &t4_nnmtxq1g); #define NNMRXQ_1G 1 static int t4_nnmrxq1g = -1; TUNABLE_INT("hw.cxgbe.nnmrxq1g", &t4_nnmrxq1g); #endif /* * Holdoff parameters for 10G and 1G ports. */ #define TMR_IDX_10G 1 static int t4_tmr_idx_10g = TMR_IDX_10G; TUNABLE_INT("hw.cxgbe.holdoff_timer_idx_10G", &t4_tmr_idx_10g); #define PKTC_IDX_10G (-1) static int t4_pktc_idx_10g = PKTC_IDX_10G; TUNABLE_INT("hw.cxgbe.holdoff_pktc_idx_10G", &t4_pktc_idx_10g); #define TMR_IDX_1G 1 static int t4_tmr_idx_1g = TMR_IDX_1G; TUNABLE_INT("hw.cxgbe.holdoff_timer_idx_1G", &t4_tmr_idx_1g); #define PKTC_IDX_1G (-1) static int t4_pktc_idx_1g = PKTC_IDX_1G; TUNABLE_INT("hw.cxgbe.holdoff_pktc_idx_1G", &t4_pktc_idx_1g); /* * Size (# of entries) of each tx and rx queue. */ static unsigned int t4_qsize_txq = TX_EQ_QSIZE; TUNABLE_INT("hw.cxgbe.qsize_txq", &t4_qsize_txq); static unsigned int t4_qsize_rxq = RX_IQ_QSIZE; TUNABLE_INT("hw.cxgbe.qsize_rxq", &t4_qsize_rxq); /* * Interrupt types allowed (bits 0, 1, 2 = INTx, MSI, MSI-X respectively). */ static int t4_intr_types = INTR_MSIX | INTR_MSI | INTR_INTX; TUNABLE_INT("hw.cxgbe.interrupt_types", &t4_intr_types); /* * Configuration file. */ #define DEFAULT_CF "default" #define FLASH_CF "flash" #define UWIRE_CF "uwire" #define FPGA_CF "fpga" static char t4_cfg_file[32] = DEFAULT_CF; TUNABLE_STR("hw.cxgbe.config_file", t4_cfg_file, sizeof(t4_cfg_file)); /* + * PAUSE settings (bit 0, 1 = rx_pause, tx_pause respectively). + * rx_pause = 1 to heed incoming PAUSE frames, 0 to ignore them. + * tx_pause = 1 to emit PAUSE frames when the rx FIFO reaches its high water + * mark or when signalled to do so, 0 to never emit PAUSE. + */ +static int t4_pause_settings = PAUSE_TX | PAUSE_RX; +TUNABLE_INT("hw.cxgbe.pause_settings", &t4_pause_settings); + +/* * Firmware auto-install by driver during attach (0, 1, 2 = prohibited, allowed, * encouraged respectively). */ static unsigned int t4_fw_install = 1; TUNABLE_INT("hw.cxgbe.fw_install", &t4_fw_install); /* * ASIC features that will be used. Disable the ones you don't want so that the * chip resources aren't wasted on features that will not be used. */ static int t4_linkcaps_allowed = 0; /* No DCBX, PPP, etc. by default */ TUNABLE_INT("hw.cxgbe.linkcaps_allowed", &t4_linkcaps_allowed); static int t4_niccaps_allowed = FW_CAPS_CONFIG_NIC; TUNABLE_INT("hw.cxgbe.niccaps_allowed", &t4_niccaps_allowed); static int t4_toecaps_allowed = -1; TUNABLE_INT("hw.cxgbe.toecaps_allowed", &t4_toecaps_allowed); static int t4_rdmacaps_allowed = 0; TUNABLE_INT("hw.cxgbe.rdmacaps_allowed", &t4_rdmacaps_allowed); static int t4_iscsicaps_allowed = 0; TUNABLE_INT("hw.cxgbe.iscsicaps_allowed", &t4_iscsicaps_allowed); static int t4_fcoecaps_allowed = 0; TUNABLE_INT("hw.cxgbe.fcoecaps_allowed", &t4_fcoecaps_allowed); static int t5_write_combine = 0; TUNABLE_INT("hw.cxl.write_combine", &t5_write_combine); struct intrs_and_queues { uint16_t intr_type; /* INTx, MSI, or MSI-X */ uint16_t nirq; /* Total # of vectors */ uint16_t intr_flags_10g;/* Interrupt flags for each 10G port */ uint16_t intr_flags_1g; /* Interrupt flags for each 1G port */ uint16_t ntxq10g; /* # of NIC txq's for each 10G port */ uint16_t nrxq10g; /* # of NIC rxq's for each 10G port */ uint16_t ntxq1g; /* # of NIC txq's for each 1G port */ uint16_t nrxq1g; /* # of NIC rxq's for each 1G port */ uint16_t rsrv_noflowq; /* Flag whether to reserve queue 0 */ #ifdef TCP_OFFLOAD uint16_t nofldtxq10g; /* # of TOE txq's for each 10G port */ uint16_t nofldrxq10g; /* # of TOE rxq's for each 10G port */ uint16_t nofldtxq1g; /* # of TOE txq's for each 1G port */ uint16_t nofldrxq1g; /* # of TOE rxq's for each 1G port */ #endif #ifdef DEV_NETMAP uint16_t nnmtxq10g; /* # of netmap txq's for each 10G port */ uint16_t nnmrxq10g; /* # of netmap rxq's for each 10G port */ uint16_t nnmtxq1g; /* # of netmap txq's for each 1G port */ uint16_t nnmrxq1g; /* # of netmap rxq's for each 1G port */ #endif }; struct filter_entry { uint32_t valid:1; /* filter allocated and valid */ uint32_t locked:1; /* filter is administratively locked */ uint32_t pending:1; /* filter action is pending firmware reply */ uint32_t smtidx:8; /* Source MAC Table index for smac */ struct l2t_entry *l2t; /* Layer Two Table entry for dmac */ struct t4_filter_specification fs; }; static int map_bars_0_and_4(struct adapter *); static int map_bar_2(struct adapter *); static void setup_memwin(struct adapter *); static int validate_mem_range(struct adapter *, uint32_t, int); static int fwmtype_to_hwmtype(int); static int validate_mt_off_len(struct adapter *, int, uint32_t, int, uint32_t *); static void memwin_info(struct adapter *, int, uint32_t *, uint32_t *); static uint32_t position_memwin(struct adapter *, int, uint32_t); static int cfg_itype_and_nqueues(struct adapter *, int, int, struct intrs_and_queues *); static int prep_firmware(struct adapter *); static int partition_resources(struct adapter *, const struct firmware *, const char *); static int get_params__pre_init(struct adapter *); static int get_params__post_init(struct adapter *); static int set_params__post_init(struct adapter *); static void t4_set_desc(struct adapter *); static void build_medialist(struct port_info *, struct ifmedia *); static int cxgbe_init_synchronized(struct port_info *); static int cxgbe_uninit_synchronized(struct port_info *); static int setup_intr_handlers(struct adapter *); static void quiesce_eq(struct adapter *, struct sge_eq *); static void quiesce_iq(struct adapter *, struct sge_iq *); static void quiesce_fl(struct adapter *, struct sge_fl *); static int t4_alloc_irq(struct adapter *, struct irq *, int rid, driver_intr_t *, void *, char *); static int t4_free_irq(struct adapter *, struct irq *); static void reg_block_dump(struct adapter *, uint8_t *, unsigned int, unsigned int); static void t4_get_regs(struct adapter *, struct t4_regdump *, uint8_t *); static void cxgbe_tick(void *); static void cxgbe_vlan_config(void *, struct ifnet *, uint16_t); static int cpl_not_handled(struct sge_iq *, const struct rss_header *, struct mbuf *); static int an_not_handled(struct sge_iq *, const struct rsp_ctrl *); static int fw_msg_not_handled(struct adapter *, const __be64 *); static int t4_sysctls(struct adapter *); static int cxgbe_sysctls(struct port_info *); static int sysctl_int_array(SYSCTL_HANDLER_ARGS); static int sysctl_bitfield(SYSCTL_HANDLER_ARGS); static int sysctl_btphy(SYSCTL_HANDLER_ARGS); static int sysctl_noflowq(SYSCTL_HANDLER_ARGS); static int sysctl_holdoff_tmr_idx(SYSCTL_HANDLER_ARGS); static int sysctl_holdoff_pktc_idx(SYSCTL_HANDLER_ARGS); static int sysctl_qsize_rxq(SYSCTL_HANDLER_ARGS); static int sysctl_qsize_txq(SYSCTL_HANDLER_ARGS); +static int sysctl_pause_settings(SYSCTL_HANDLER_ARGS); static int sysctl_handle_t4_reg64(SYSCTL_HANDLER_ARGS); static int sysctl_temperature(SYSCTL_HANDLER_ARGS); #ifdef SBUF_DRAIN static int sysctl_cctrl(SYSCTL_HANDLER_ARGS); static int sysctl_cim_ibq_obq(SYSCTL_HANDLER_ARGS); static int sysctl_cim_la(SYSCTL_HANDLER_ARGS); static int sysctl_cim_ma_la(SYSCTL_HANDLER_ARGS); static int sysctl_cim_pif_la(SYSCTL_HANDLER_ARGS); static int sysctl_cim_qcfg(SYSCTL_HANDLER_ARGS); static int sysctl_cpl_stats(SYSCTL_HANDLER_ARGS); static int sysctl_ddp_stats(SYSCTL_HANDLER_ARGS); static int sysctl_devlog(SYSCTL_HANDLER_ARGS); static int sysctl_fcoe_stats(SYSCTL_HANDLER_ARGS); static int sysctl_hw_sched(SYSCTL_HANDLER_ARGS); static int sysctl_lb_stats(SYSCTL_HANDLER_ARGS); static int sysctl_linkdnrc(SYSCTL_HANDLER_ARGS); static int sysctl_meminfo(SYSCTL_HANDLER_ARGS); static int sysctl_mps_tcam(SYSCTL_HANDLER_ARGS); static int sysctl_path_mtus(SYSCTL_HANDLER_ARGS); static int sysctl_pm_stats(SYSCTL_HANDLER_ARGS); static int sysctl_rdma_stats(SYSCTL_HANDLER_ARGS); static int sysctl_tcp_stats(SYSCTL_HANDLER_ARGS); static int sysctl_tids(SYSCTL_HANDLER_ARGS); static int sysctl_tp_err_stats(SYSCTL_HANDLER_ARGS); static int sysctl_tp_la(SYSCTL_HANDLER_ARGS); static int sysctl_tx_rate(SYSCTL_HANDLER_ARGS); static int sysctl_ulprx_la(SYSCTL_HANDLER_ARGS); static int sysctl_wcwr_stats(SYSCTL_HANDLER_ARGS); #endif static inline void txq_start(struct ifnet *, struct sge_txq *); static uint32_t fconf_to_mode(uint32_t); static uint32_t mode_to_fconf(uint32_t); static uint32_t fspec_to_fconf(struct t4_filter_specification *); static int get_filter_mode(struct adapter *, uint32_t *); static int set_filter_mode(struct adapter *, uint32_t); static inline uint64_t get_filter_hits(struct adapter *, uint32_t); static int get_filter(struct adapter *, struct t4_filter *); static int set_filter(struct adapter *, struct t4_filter *); static int del_filter(struct adapter *, struct t4_filter *); static void clear_filter(struct filter_entry *); static int set_filter_wr(struct adapter *, int); static int del_filter_wr(struct adapter *, int); static int get_sge_context(struct adapter *, struct t4_sge_context *); static int load_fw(struct adapter *, struct t4_data *); static int read_card_mem(struct adapter *, int, struct t4_mem_range *); static int read_i2c(struct adapter *, struct t4_i2c_data *); static int set_sched_class(struct adapter *, struct t4_sched_params *); static int set_sched_queue(struct adapter *, struct t4_sched_queue *); #ifdef TCP_OFFLOAD static int toe_capability(struct port_info *, int); #endif static int mod_event(module_t, int, void *); struct { uint16_t device; char *desc; } t4_pciids[] = { {0xa000, "Chelsio Terminator 4 FPGA"}, {0x4400, "Chelsio T440-dbg"}, {0x4401, "Chelsio T420-CR"}, {0x4402, "Chelsio T422-CR"}, {0x4403, "Chelsio T440-CR"}, {0x4404, "Chelsio T420-BCH"}, {0x4405, "Chelsio T440-BCH"}, {0x4406, "Chelsio T440-CH"}, {0x4407, "Chelsio T420-SO"}, {0x4408, "Chelsio T420-CX"}, {0x4409, "Chelsio T420-BT"}, {0x440a, "Chelsio T404-BT"}, {0x440e, "Chelsio T440-LP-CR"}, }, t5_pciids[] = { {0xb000, "Chelsio Terminator 5 FPGA"}, {0x5400, "Chelsio T580-dbg"}, {0x5401, "Chelsio T520-CR"}, /* 2 x 10G */ {0x5402, "Chelsio T522-CR"}, /* 2 x 10G, 2 X 1G */ {0x5403, "Chelsio T540-CR"}, /* 4 x 10G */ {0x5407, "Chelsio T520-SO"}, /* 2 x 10G, nomem */ {0x5409, "Chelsio T520-BT"}, /* 2 x 10GBaseT */ {0x540a, "Chelsio T504-BT"}, /* 4 x 1G */ {0x540d, "Chelsio T580-CR"}, /* 2 x 40G */ {0x540e, "Chelsio T540-LP-CR"}, /* 4 x 10G */ {0x5410, "Chelsio T580-LP-CR"}, /* 2 x 40G */ {0x5411, "Chelsio T520-LL-CR"}, /* 2 x 10G */ {0x5412, "Chelsio T560-CR"}, /* 1 x 40G, 2 x 10G */ {0x5414, "Chelsio T580-LP-SO-CR"}, /* 2 x 40G, nomem */ #ifdef notyet {0x5404, "Chelsio T520-BCH"}, {0x5405, "Chelsio T540-BCH"}, {0x5406, "Chelsio T540-CH"}, {0x5408, "Chelsio T520-CX"}, {0x540b, "Chelsio B520-SR"}, {0x540c, "Chelsio B504-BT"}, {0x540f, "Chelsio Amsterdam"}, {0x5413, "Chelsio T580-CHR"}, #endif }; #ifdef TCP_OFFLOAD /* * service_iq() has an iq and needs the fl. Offset of fl from the iq should be * exactly the same for both rxq and ofld_rxq. */ CTASSERT(offsetof(struct sge_ofld_rxq, iq) == offsetof(struct sge_rxq, iq)); CTASSERT(offsetof(struct sge_ofld_rxq, fl) == offsetof(struct sge_rxq, fl)); #endif /* No easy way to include t4_msg.h before adapter.h so we check this way */ CTASSERT(nitems(((struct adapter *)0)->cpl_handler) == NUM_CPL_CMDS); CTASSERT(nitems(((struct adapter *)0)->fw_msg_handler) == NUM_FW6_TYPES); CTASSERT(sizeof(struct cluster_metadata) <= CL_METADATA_SIZE); static int t4_probe(device_t dev) { int i; uint16_t v = pci_get_vendor(dev); uint16_t d = pci_get_device(dev); uint8_t f = pci_get_function(dev); if (v != PCI_VENDOR_ID_CHELSIO) return (ENXIO); /* Attach only to PF0 of the FPGA */ if (d == 0xa000 && f != 0) return (ENXIO); for (i = 0; i < nitems(t4_pciids); i++) { if (d == t4_pciids[i].device) { device_set_desc(dev, t4_pciids[i].desc); return (BUS_PROBE_DEFAULT); } } return (ENXIO); } static int t5_probe(device_t dev) { int i; uint16_t v = pci_get_vendor(dev); uint16_t d = pci_get_device(dev); uint8_t f = pci_get_function(dev); if (v != PCI_VENDOR_ID_CHELSIO) return (ENXIO); /* Attach only to PF0 of the FPGA */ if (d == 0xb000 && f != 0) return (ENXIO); for (i = 0; i < nitems(t5_pciids); i++) { if (d == t5_pciids[i].device) { device_set_desc(dev, t5_pciids[i].desc); return (BUS_PROBE_DEFAULT); } } return (ENXIO); } static int t4_attach(device_t dev) { struct adapter *sc; int rc = 0, i, n10g, n1g, rqidx, tqidx; struct intrs_and_queues iaq; struct sge *s; #ifdef TCP_OFFLOAD int ofld_rqidx, ofld_tqidx; #endif #ifdef DEV_NETMAP int nm_rqidx, nm_tqidx; #endif const char *pcie_ts; sc = device_get_softc(dev); sc->dev = dev; pci_enable_busmaster(dev); if (pci_find_cap(dev, PCIY_EXPRESS, &i) == 0) { uint32_t v; pci_set_max_read_req(dev, 4096); v = pci_read_config(dev, i + PCIER_DEVICE_CTL, 2); v |= PCIEM_CTL_RELAXED_ORD_ENABLE; pci_write_config(dev, i + PCIER_DEVICE_CTL, v, 2); } sc->traceq = -1; mtx_init(&sc->ifp_lock, sc->ifp_lockname, 0, MTX_DEF); snprintf(sc->ifp_lockname, sizeof(sc->ifp_lockname), "%s tracer", device_get_nameunit(dev)); snprintf(sc->lockname, sizeof(sc->lockname), "%s", device_get_nameunit(dev)); mtx_init(&sc->sc_lock, sc->lockname, 0, MTX_DEF); sx_xlock(&t4_list_lock); SLIST_INSERT_HEAD(&t4_list, sc, link); sx_xunlock(&t4_list_lock); mtx_init(&sc->sfl_lock, "starving freelists", 0, MTX_DEF); TAILQ_INIT(&sc->sfl); callout_init(&sc->sfl_callout, CALLOUT_MPSAFE); rc = map_bars_0_and_4(sc); if (rc != 0) goto done; /* error message displayed already */ /* * This is the real PF# to which we're attaching. Works from within PCI * passthrough environments too, where pci_get_function() could return a * different PF# depending on the passthrough configuration. We need to * use the real PF# in all our communication with the firmware. */ sc->pf = G_SOURCEPF(t4_read_reg(sc, A_PL_WHOAMI)); sc->mbox = sc->pf; memset(sc->chan_map, 0xff, sizeof(sc->chan_map)); sc->an_handler = an_not_handled; for (i = 0; i < nitems(sc->cpl_handler); i++) sc->cpl_handler[i] = cpl_not_handled; for (i = 0; i < nitems(sc->fw_msg_handler); i++) sc->fw_msg_handler[i] = fw_msg_not_handled; t4_register_cpl_handler(sc, CPL_SET_TCB_RPL, t4_filter_rpl); t4_register_cpl_handler(sc, CPL_TRACE_PKT, t4_trace_pkt); t4_register_cpl_handler(sc, CPL_TRACE_PKT_T5, t5_trace_pkt); t4_init_sge_cpl_handlers(sc); /* Prepare the adapter for operation */ rc = -t4_prep_adapter(sc); if (rc != 0) { device_printf(dev, "failed to prepare adapter: %d.\n", rc); goto done; } /* * Do this really early, with the memory windows set up even before the * character device. The userland tool's register i/o and mem read * will work even in "recovery mode". */ setup_memwin(sc); sc->cdev = make_dev(is_t4(sc) ? &t4_cdevsw : &t5_cdevsw, device_get_unit(dev), UID_ROOT, GID_WHEEL, 0600, "%s", device_get_nameunit(dev)); if (sc->cdev == NULL) device_printf(dev, "failed to create nexus char device.\n"); else sc->cdev->si_drv1 = sc; /* Go no further if recovery mode has been requested. */ if (TUNABLE_INT_FETCH("hw.cxgbe.sos", &i) && i != 0) { device_printf(dev, "recovery mode.\n"); goto done; } /* Prepare the firmware for operation */ rc = prep_firmware(sc); if (rc != 0) goto done; /* error message displayed already */ rc = get_params__post_init(sc); if (rc != 0) goto done; /* error message displayed already */ rc = set_params__post_init(sc); if (rc != 0) goto done; /* error message displayed already */ rc = map_bar_2(sc); if (rc != 0) goto done; /* error message displayed already */ rc = t4_create_dma_tag(sc); if (rc != 0) goto done; /* error message displayed already */ /* * First pass over all the ports - allocate VIs and initialize some * basic parameters like mac address, port type, etc. We also figure * out whether a port is 10G or 1G and use that information when * calculating how many interrupts to attempt to allocate. */ n10g = n1g = 0; for_each_port(sc, i) { struct port_info *pi; pi = malloc(sizeof(*pi), M_CXGBE, M_ZERO | M_WAITOK); sc->port[i] = pi; /* These must be set before t4_port_init */ pi->adapter = sc; pi->port_id = i; /* Allocate the vi and initialize parameters like mac addr */ rc = -t4_port_init(pi, sc->mbox, sc->pf, 0); if (rc != 0) { device_printf(dev, "unable to initialize port %d: %d\n", i, rc); free(pi, M_CXGBE); sc->port[i] = NULL; goto done; } + + pi->link_cfg.requested_fc &= ~(PAUSE_TX | PAUSE_RX); + pi->link_cfg.requested_fc |= t4_pause_settings; + pi->link_cfg.fc &= ~(PAUSE_TX | PAUSE_RX); + pi->link_cfg.fc |= t4_pause_settings; + rc = -t4_link_start(sc, sc->mbox, pi->tx_chan, &pi->link_cfg); if (rc != 0) { device_printf(dev, "port %d l1cfg failed: %d\n", i, rc); free(pi, M_CXGBE); sc->port[i] = NULL; goto done; } snprintf(pi->lockname, sizeof(pi->lockname), "%sp%d", device_get_nameunit(dev), i); mtx_init(&pi->pi_lock, pi->lockname, 0, MTX_DEF); sc->chan_map[pi->tx_chan] = i; if (is_10G_port(pi) || is_40G_port(pi)) { n10g++; pi->tmr_idx = t4_tmr_idx_10g; pi->pktc_idx = t4_pktc_idx_10g; } else { n1g++; pi->tmr_idx = t4_tmr_idx_1g; pi->pktc_idx = t4_pktc_idx_1g; } pi->xact_addr_filt = -1; pi->linkdnrc = -1; pi->qsize_rxq = t4_qsize_rxq; pi->qsize_txq = t4_qsize_txq; pi->dev = device_add_child(dev, is_t4(sc) ? "cxgbe" : "cxl", -1); if (pi->dev == NULL) { device_printf(dev, "failed to add device for port %d.\n", i); rc = ENXIO; goto done; } device_set_softc(pi->dev, pi); } /* * Interrupt type, # of interrupts, # of rx/tx queues, etc. */ rc = cfg_itype_and_nqueues(sc, n10g, n1g, &iaq); if (rc != 0) goto done; /* error message displayed already */ sc->intr_type = iaq.intr_type; sc->intr_count = iaq.nirq; s = &sc->sge; s->nrxq = n10g * iaq.nrxq10g + n1g * iaq.nrxq1g; s->ntxq = n10g * iaq.ntxq10g + n1g * iaq.ntxq1g; s->neq = s->ntxq + s->nrxq; /* the free list in an rxq is an eq */ s->neq += sc->params.nports + 1;/* ctrl queues: 1 per port + 1 mgmt */ s->niq = s->nrxq + 1; /* 1 extra for firmware event queue */ #ifdef TCP_OFFLOAD if (is_offload(sc)) { s->nofldrxq = n10g * iaq.nofldrxq10g + n1g * iaq.nofldrxq1g; s->nofldtxq = n10g * iaq.nofldtxq10g + n1g * iaq.nofldtxq1g; s->neq += s->nofldtxq + s->nofldrxq; s->niq += s->nofldrxq; s->ofld_rxq = malloc(s->nofldrxq * sizeof(struct sge_ofld_rxq), M_CXGBE, M_ZERO | M_WAITOK); s->ofld_txq = malloc(s->nofldtxq * sizeof(struct sge_wrq), M_CXGBE, M_ZERO | M_WAITOK); } #endif #ifdef DEV_NETMAP s->nnmrxq = n10g * iaq.nnmrxq10g + n1g * iaq.nnmrxq1g; s->nnmtxq = n10g * iaq.nnmtxq10g + n1g * iaq.nnmtxq1g; s->neq += s->nnmtxq + s->nnmrxq; s->niq += s->nnmrxq; s->nm_rxq = malloc(s->nnmrxq * sizeof(struct sge_nm_rxq), M_CXGBE, M_ZERO | M_WAITOK); s->nm_txq = malloc(s->nnmtxq * sizeof(struct sge_nm_txq), M_CXGBE, M_ZERO | M_WAITOK); #endif s->ctrlq = malloc(sc->params.nports * sizeof(struct sge_wrq), M_CXGBE, M_ZERO | M_WAITOK); s->rxq = malloc(s->nrxq * sizeof(struct sge_rxq), M_CXGBE, M_ZERO | M_WAITOK); s->txq = malloc(s->ntxq * sizeof(struct sge_txq), M_CXGBE, M_ZERO | M_WAITOK); s->iqmap = malloc(s->niq * sizeof(struct sge_iq *), M_CXGBE, M_ZERO | M_WAITOK); s->eqmap = malloc(s->neq * sizeof(struct sge_eq *), M_CXGBE, M_ZERO | M_WAITOK); sc->irq = malloc(sc->intr_count * sizeof(struct irq), M_CXGBE, M_ZERO | M_WAITOK); t4_init_l2t(sc, M_WAITOK); /* * Second pass over the ports. This time we know the number of rx and * tx queues that each port should get. */ rqidx = tqidx = 0; #ifdef TCP_OFFLOAD ofld_rqidx = ofld_tqidx = 0; #endif #ifdef DEV_NETMAP nm_rqidx = nm_tqidx = 0; #endif for_each_port(sc, i) { struct port_info *pi = sc->port[i]; if (pi == NULL) continue; pi->first_rxq = rqidx; pi->first_txq = tqidx; if (is_10G_port(pi) || is_40G_port(pi)) { pi->flags |= iaq.intr_flags_10g; pi->nrxq = iaq.nrxq10g; pi->ntxq = iaq.ntxq10g; } else { pi->flags |= iaq.intr_flags_1g; pi->nrxq = iaq.nrxq1g; pi->ntxq = iaq.ntxq1g; } if (pi->ntxq > 1) pi->rsrv_noflowq = iaq.rsrv_noflowq ? 1 : 0; else pi->rsrv_noflowq = 0; rqidx += pi->nrxq; tqidx += pi->ntxq; #ifdef TCP_OFFLOAD if (is_offload(sc)) { pi->first_ofld_rxq = ofld_rqidx; pi->first_ofld_txq = ofld_tqidx; if (is_10G_port(pi) || is_40G_port(pi)) { pi->nofldrxq = iaq.nofldrxq10g; pi->nofldtxq = iaq.nofldtxq10g; } else { pi->nofldrxq = iaq.nofldrxq1g; pi->nofldtxq = iaq.nofldtxq1g; } ofld_rqidx += pi->nofldrxq; ofld_tqidx += pi->nofldtxq; } #endif #ifdef DEV_NETMAP pi->first_nm_rxq = nm_rqidx; pi->first_nm_txq = nm_tqidx; if (is_10G_port(pi) || is_40G_port(pi)) { pi->nnmrxq = iaq.nnmrxq10g; pi->nnmtxq = iaq.nnmtxq10g; } else { pi->nnmrxq = iaq.nnmrxq1g; pi->nnmtxq = iaq.nnmtxq1g; } nm_rqidx += pi->nnmrxq; nm_tqidx += pi->nnmtxq; #endif } rc = setup_intr_handlers(sc); if (rc != 0) { device_printf(dev, "failed to setup interrupt handlers: %d\n", rc); goto done; } rc = bus_generic_attach(dev); if (rc != 0) { device_printf(dev, "failed to attach all child ports: %d\n", rc); goto done; } switch (sc->params.pci.speed) { case 0x1: pcie_ts = "2.5"; break; case 0x2: pcie_ts = "5.0"; break; case 0x3: pcie_ts = "8.0"; break; default: pcie_ts = "??"; break; } device_printf(dev, "PCIe x%d (%s GTS/s) (%d), %d ports, %d %s interrupt%s, %d eq, %d iq\n", sc->params.pci.width, pcie_ts, sc->params.pci.speed, sc->params.nports, sc->intr_count, sc->intr_type == INTR_MSIX ? "MSI-X" : (sc->intr_type == INTR_MSI ? "MSI" : "INTx"), sc->intr_count > 1 ? "s" : "", sc->sge.neq, sc->sge.niq); t4_set_desc(sc); done: if (rc != 0 && sc->cdev) { /* cdev was created and so cxgbetool works; recover that way. */ device_printf(dev, "error during attach, adapter is now in recovery mode.\n"); rc = 0; } if (rc != 0) t4_detach(dev); else t4_sysctls(sc); return (rc); } /* * Idempotent */ static int t4_detach(device_t dev) { struct adapter *sc; struct port_info *pi; int i, rc; sc = device_get_softc(dev); if (sc->flags & FULL_INIT_DONE) t4_intr_disable(sc); if (sc->cdev) { destroy_dev(sc->cdev); sc->cdev = NULL; } rc = bus_generic_detach(dev); if (rc) { device_printf(dev, "failed to detach child devices: %d\n", rc); return (rc); } for (i = 0; i < sc->intr_count; i++) t4_free_irq(sc, &sc->irq[i]); for (i = 0; i < MAX_NPORTS; i++) { pi = sc->port[i]; if (pi) { t4_free_vi(sc, sc->mbox, sc->pf, 0, pi->viid); if (pi->dev) device_delete_child(dev, pi->dev); mtx_destroy(&pi->pi_lock); free(pi, M_CXGBE); } } if (sc->flags & FULL_INIT_DONE) adapter_full_uninit(sc); if (sc->flags & FW_OK) t4_fw_bye(sc, sc->mbox); if (sc->intr_type == INTR_MSI || sc->intr_type == INTR_MSIX) pci_release_msi(dev); if (sc->regs_res) bus_release_resource(dev, SYS_RES_MEMORY, sc->regs_rid, sc->regs_res); if (sc->udbs_res) bus_release_resource(dev, SYS_RES_MEMORY, sc->udbs_rid, sc->udbs_res); if (sc->msix_res) bus_release_resource(dev, SYS_RES_MEMORY, sc->msix_rid, sc->msix_res); if (sc->l2t) t4_free_l2t(sc->l2t); #ifdef TCP_OFFLOAD free(sc->sge.ofld_rxq, M_CXGBE); free(sc->sge.ofld_txq, M_CXGBE); #endif #ifdef DEV_NETMAP free(sc->sge.nm_rxq, M_CXGBE); free(sc->sge.nm_txq, M_CXGBE); #endif free(sc->irq, M_CXGBE); free(sc->sge.rxq, M_CXGBE); free(sc->sge.txq, M_CXGBE); free(sc->sge.ctrlq, M_CXGBE); free(sc->sge.iqmap, M_CXGBE); free(sc->sge.eqmap, M_CXGBE); free(sc->tids.ftid_tab, M_CXGBE); t4_destroy_dma_tag(sc); if (mtx_initialized(&sc->sc_lock)) { sx_xlock(&t4_list_lock); SLIST_REMOVE(&t4_list, sc, adapter, link); sx_xunlock(&t4_list_lock); mtx_destroy(&sc->sc_lock); } if (mtx_initialized(&sc->tids.ftid_lock)) mtx_destroy(&sc->tids.ftid_lock); if (mtx_initialized(&sc->sfl_lock)) mtx_destroy(&sc->sfl_lock); if (mtx_initialized(&sc->ifp_lock)) mtx_destroy(&sc->ifp_lock); bzero(sc, sizeof(*sc)); return (0); } static int cxgbe_probe(device_t dev) { char buf[128]; struct port_info *pi = device_get_softc(dev); snprintf(buf, sizeof(buf), "port %d", pi->port_id); device_set_desc_copy(dev, buf); return (BUS_PROBE_DEFAULT); } #define T4_CAP (IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_HWCSUM | \ IFCAP_VLAN_HWCSUM | IFCAP_TSO | IFCAP_JUMBO_MTU | IFCAP_LRO | \ IFCAP_VLAN_HWTSO | IFCAP_LINKSTATE | IFCAP_HWCSUM_IPV6 | IFCAP_HWSTATS) #define T4_CAP_ENABLE (T4_CAP) static int cxgbe_attach(device_t dev) { struct port_info *pi = device_get_softc(dev); struct ifnet *ifp; char *s; int n, o; /* Allocate an ifnet and set it up */ ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { device_printf(dev, "Cannot allocate ifnet\n"); return (ENOMEM); } pi->ifp = ifp; ifp->if_softc = pi; callout_init(&pi->tick, CALLOUT_MPSAFE); if_initname(ifp, device_get_name(dev), device_get_unit(dev)); ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_init = cxgbe_init; ifp->if_ioctl = cxgbe_ioctl; ifp->if_transmit = cxgbe_transmit; ifp->if_qflush = cxgbe_qflush; ifp->if_capabilities = T4_CAP; #ifdef TCP_OFFLOAD if (is_offload(pi->adapter)) ifp->if_capabilities |= IFCAP_TOE; #endif ifp->if_capenable = T4_CAP_ENABLE; ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_IP | CSUM_TSO | CSUM_UDP_IPV6 | CSUM_TCP_IPV6; /* Initialize ifmedia for this port */ ifmedia_init(&pi->media, IFM_IMASK, cxgbe_media_change, cxgbe_media_status); build_medialist(pi, &pi->media); pi->vlan_c = EVENTHANDLER_REGISTER(vlan_config, cxgbe_vlan_config, ifp, EVENTHANDLER_PRI_ANY); ether_ifattach(ifp, pi->hw_addr); n = 128; s = malloc(n, M_CXGBE, M_WAITOK); o = snprintf(s, n, "%d txq, %d rxq (NIC)", pi->ntxq, pi->nrxq); MPASS(n > o); #ifdef TCP_OFFLOAD if (is_offload(pi->adapter)) { o += snprintf(s + o, n - o, "; %d txq, %d rxq (TOE)", pi->nofldtxq, pi->nofldrxq); MPASS(n > o); } #endif #ifdef DEV_NETMAP o += snprintf(s + o, n - o, "; %d txq, %d rxq (netmap)", pi->nnmtxq, pi->nnmrxq); MPASS(n > o); #endif device_printf(dev, "%s\n", s); free(s, M_CXGBE); #ifdef DEV_NETMAP /* nm_media handled here to keep implementation private to this file */ ifmedia_init(&pi->nm_media, IFM_IMASK, cxgbe_media_change, cxgbe_media_status); build_medialist(pi, &pi->nm_media); create_netmap_ifnet(pi); /* logs errors it something fails */ #endif cxgbe_sysctls(pi); return (0); } static int cxgbe_detach(device_t dev) { struct port_info *pi = device_get_softc(dev); struct adapter *sc = pi->adapter; struct ifnet *ifp = pi->ifp; /* Tell if_ioctl and if_init that the port is going away */ ADAPTER_LOCK(sc); SET_DOOMED(pi); wakeup(&sc->flags); while (IS_BUSY(sc)) mtx_sleep(&sc->flags, &sc->sc_lock, 0, "t4detach", 0); SET_BUSY(sc); #ifdef INVARIANTS sc->last_op = "t4detach"; sc->last_op_thr = curthread; #endif ADAPTER_UNLOCK(sc); if (pi->flags & HAS_TRACEQ) { sc->traceq = -1; /* cloner should not create ifnet */ t4_tracer_port_detach(sc); } if (pi->vlan_c) EVENTHANDLER_DEREGISTER(vlan_config, pi->vlan_c); PORT_LOCK(pi); ifp->if_drv_flags &= ~IFF_DRV_RUNNING; callout_stop(&pi->tick); PORT_UNLOCK(pi); callout_drain(&pi->tick); /* Let detach proceed even if these fail. */ cxgbe_uninit_synchronized(pi); port_full_uninit(pi); ifmedia_removeall(&pi->media); ether_ifdetach(pi->ifp); if_free(pi->ifp); #ifdef DEV_NETMAP /* XXXNM: equivalent of cxgbe_uninit_synchronized to ifdown nm_ifp */ destroy_netmap_ifnet(pi); #endif ADAPTER_LOCK(sc); CLR_BUSY(sc); wakeup(&sc->flags); ADAPTER_UNLOCK(sc); return (0); } static void cxgbe_init(void *arg) { struct port_info *pi = arg; struct adapter *sc = pi->adapter; if (begin_synchronized_op(sc, pi, SLEEP_OK | INTR_OK, "t4init") != 0) return; cxgbe_init_synchronized(pi); end_synchronized_op(sc, 0); } static int cxgbe_ioctl(struct ifnet *ifp, unsigned long cmd, caddr_t data) { int rc = 0, mtu, flags, can_sleep; struct port_info *pi = ifp->if_softc; struct adapter *sc = pi->adapter; struct ifreq *ifr = (struct ifreq *)data; uint32_t mask; switch (cmd) { case SIOCSIFMTU: mtu = ifr->ifr_mtu; if ((mtu < ETHERMIN) || (mtu > ETHERMTU_JUMBO)) return (EINVAL); rc = begin_synchronized_op(sc, pi, SLEEP_OK | INTR_OK, "t4mtu"); if (rc) return (rc); ifp->if_mtu = mtu; if (pi->flags & PORT_INIT_DONE) { t4_update_fl_bufsize(ifp); if (ifp->if_drv_flags & IFF_DRV_RUNNING) rc = update_mac_settings(ifp, XGMAC_MTU); } end_synchronized_op(sc, 0); break; case SIOCSIFFLAGS: can_sleep = 0; redo_sifflags: rc = begin_synchronized_op(sc, pi, can_sleep ? (SLEEP_OK | INTR_OK) : HOLD_LOCK, "t4flg"); if (rc) return (rc); if (ifp->if_flags & IFF_UP) { if (ifp->if_drv_flags & IFF_DRV_RUNNING) { flags = pi->if_flags; if ((ifp->if_flags ^ flags) & (IFF_PROMISC | IFF_ALLMULTI)) { if (can_sleep == 1) { end_synchronized_op(sc, 0); can_sleep = 0; goto redo_sifflags; } rc = update_mac_settings(ifp, XGMAC_PROMISC | XGMAC_ALLMULTI); } } else { if (can_sleep == 0) { end_synchronized_op(sc, LOCK_HELD); can_sleep = 1; goto redo_sifflags; } rc = cxgbe_init_synchronized(pi); } pi->if_flags = ifp->if_flags; } else if (ifp->if_drv_flags & IFF_DRV_RUNNING) { if (can_sleep == 0) { end_synchronized_op(sc, LOCK_HELD); can_sleep = 1; goto redo_sifflags; } rc = cxgbe_uninit_synchronized(pi); } end_synchronized_op(sc, can_sleep ? 0 : LOCK_HELD); break; case SIOCADDMULTI: case SIOCDELMULTI: /* these two are called with a mutex held :-( */ rc = begin_synchronized_op(sc, pi, HOLD_LOCK, "t4multi"); if (rc) return (rc); if (ifp->if_drv_flags & IFF_DRV_RUNNING) rc = update_mac_settings(ifp, XGMAC_MCADDRS); end_synchronized_op(sc, LOCK_HELD); break; case SIOCSIFCAP: rc = begin_synchronized_op(sc, pi, SLEEP_OK | INTR_OK, "t4cap"); if (rc) return (rc); mask = ifr->ifr_reqcap ^ ifp->if_capenable; if (mask & IFCAP_TXCSUM) { ifp->if_capenable ^= IFCAP_TXCSUM; ifp->if_hwassist ^= (CSUM_TCP | CSUM_UDP | CSUM_IP); if (IFCAP_TSO4 & ifp->if_capenable && !(IFCAP_TXCSUM & ifp->if_capenable)) { ifp->if_capenable &= ~IFCAP_TSO4; if_printf(ifp, "tso4 disabled due to -txcsum.\n"); } } if (mask & IFCAP_TXCSUM_IPV6) { ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; ifp->if_hwassist ^= (CSUM_UDP_IPV6 | CSUM_TCP_IPV6); if (IFCAP_TSO6 & ifp->if_capenable && !(IFCAP_TXCSUM_IPV6 & ifp->if_capenable)) { ifp->if_capenable &= ~IFCAP_TSO6; if_printf(ifp, "tso6 disabled due to -txcsum6.\n"); } } if (mask & IFCAP_RXCSUM) ifp->if_capenable ^= IFCAP_RXCSUM; if (mask & IFCAP_RXCSUM_IPV6) ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; /* * Note that we leave CSUM_TSO alone (it is always set). The * kernel takes both IFCAP_TSOx and CSUM_TSO into account before * sending a TSO request our way, so it's sufficient to toggle * IFCAP_TSOx only. */ if (mask & IFCAP_TSO4) { if (!(IFCAP_TSO4 & ifp->if_capenable) && !(IFCAP_TXCSUM & ifp->if_capenable)) { if_printf(ifp, "enable txcsum first.\n"); rc = EAGAIN; goto fail; } ifp->if_capenable ^= IFCAP_TSO4; } if (mask & IFCAP_TSO6) { if (!(IFCAP_TSO6 & ifp->if_capenable) && !(IFCAP_TXCSUM_IPV6 & ifp->if_capenable)) { if_printf(ifp, "enable txcsum6 first.\n"); rc = EAGAIN; goto fail; } ifp->if_capenable ^= IFCAP_TSO6; } if (mask & IFCAP_LRO) { #if defined(INET) || defined(INET6) int i; struct sge_rxq *rxq; ifp->if_capenable ^= IFCAP_LRO; for_each_rxq(pi, i, rxq) { if (ifp->if_capenable & IFCAP_LRO) rxq->iq.flags |= IQ_LRO_ENABLED; else rxq->iq.flags &= ~IQ_LRO_ENABLED; } #endif } #ifdef TCP_OFFLOAD if (mask & IFCAP_TOE) { int enable = (ifp->if_capenable ^ mask) & IFCAP_TOE; rc = toe_capability(pi, enable); if (rc != 0) goto fail; ifp->if_capenable ^= mask; } #endif if (mask & IFCAP_VLAN_HWTAGGING) { ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING; if (ifp->if_drv_flags & IFF_DRV_RUNNING) rc = update_mac_settings(ifp, XGMAC_VLANEX); } if (mask & IFCAP_VLAN_MTU) { ifp->if_capenable ^= IFCAP_VLAN_MTU; /* Need to find out how to disable auto-mtu-inflation */ } if (mask & IFCAP_VLAN_HWTSO) ifp->if_capenable ^= IFCAP_VLAN_HWTSO; if (mask & IFCAP_VLAN_HWCSUM) ifp->if_capenable ^= IFCAP_VLAN_HWCSUM; #ifdef VLAN_CAPABILITIES VLAN_CAPABILITIES(ifp); #endif fail: end_synchronized_op(sc, 0); break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: ifmedia_ioctl(ifp, ifr, &pi->media, cmd); break; default: rc = ether_ioctl(ifp, cmd, data); } return (rc); } static int cxgbe_transmit(struct ifnet *ifp, struct mbuf *m) { struct port_info *pi = ifp->if_softc; struct adapter *sc = pi->adapter; struct sge_txq *txq = &sc->sge.txq[pi->first_txq]; struct buf_ring *br; int rc; M_ASSERTPKTHDR(m); if (__predict_false(pi->link_cfg.link_ok == 0)) { m_freem(m); return (ENETDOWN); } if (m->m_flags & M_FLOWID) txq += ((m->m_pkthdr.flowid % (pi->ntxq - pi->rsrv_noflowq)) + pi->rsrv_noflowq); br = txq->br; if (TXQ_TRYLOCK(txq) == 0) { struct sge_eq *eq = &txq->eq; /* * It is possible that t4_eth_tx finishes up and releases the * lock between the TRYLOCK above and the drbr_enqueue here. We * need to make sure that this mbuf doesn't just sit there in * the drbr. */ rc = drbr_enqueue(ifp, br, m); if (rc == 0 && callout_pending(&eq->tx_callout) == 0 && !(eq->flags & EQ_DOOMED)) callout_reset(&eq->tx_callout, 1, t4_tx_callout, eq); return (rc); } /* * txq->m is the mbuf that is held up due to a temporary shortage of * resources and it should be put on the wire first. Then what's in * drbr and finally the mbuf that was just passed in to us. * * Return code should indicate the fate of the mbuf that was passed in * this time. */ TXQ_LOCK_ASSERT_OWNED(txq); if (drbr_needs_enqueue(ifp, br) || txq->m) { /* Queued for transmission. */ rc = drbr_enqueue(ifp, br, m); m = txq->m ? txq->m : drbr_dequeue(ifp, br); (void) t4_eth_tx(ifp, txq, m); TXQ_UNLOCK(txq); return (rc); } /* Direct transmission. */ rc = t4_eth_tx(ifp, txq, m); if (rc != 0 && txq->m) rc = 0; /* held, will be transmitted soon (hopefully) */ TXQ_UNLOCK(txq); return (rc); } static void cxgbe_qflush(struct ifnet *ifp) { struct port_info *pi = ifp->if_softc; struct sge_txq *txq; int i; struct mbuf *m; /* queues do not exist if !PORT_INIT_DONE. */ if (pi->flags & PORT_INIT_DONE) { for_each_txq(pi, i, txq) { TXQ_LOCK(txq); m_freem(txq->m); txq->m = NULL; while ((m = buf_ring_dequeue_sc(txq->br)) != NULL) m_freem(m); TXQ_UNLOCK(txq); } } if_qflush(ifp); } static int cxgbe_media_change(struct ifnet *ifp) { struct port_info *pi = ifp->if_softc; device_printf(pi->dev, "%s unimplemented.\n", __func__); return (EOPNOTSUPP); } static void cxgbe_media_status(struct ifnet *ifp, struct ifmediareq *ifmr) { struct port_info *pi = ifp->if_softc; struct ifmedia *media = NULL; struct ifmedia_entry *cur; int speed = pi->link_cfg.speed; int data = (pi->port_type << 8) | pi->mod_type; if (ifp == pi->ifp) media = &pi->media; #ifdef DEV_NETMAP else if (ifp == pi->nm_ifp) media = &pi->nm_media; #endif MPASS(media != NULL); cur = media->ifm_cur; if (cur->ifm_data != data) { build_medialist(pi, media); cur = media->ifm_cur; } ifmr->ifm_status = IFM_AVALID; if (!pi->link_cfg.link_ok) return; ifmr->ifm_status |= IFM_ACTIVE; /* active and current will differ iff current media is autoselect. */ if (IFM_SUBTYPE(cur->ifm_media) != IFM_AUTO) return; ifmr->ifm_active = IFM_ETHER | IFM_FDX; if (speed == SPEED_10000) ifmr->ifm_active |= IFM_10G_T; else if (speed == SPEED_1000) ifmr->ifm_active |= IFM_1000_T; else if (speed == SPEED_100) ifmr->ifm_active |= IFM_100_TX; else if (speed == SPEED_10) ifmr->ifm_active |= IFM_10_T; else KASSERT(0, ("%s: link up but speed unknown (%u)", __func__, speed)); } void t4_fatal_err(struct adapter *sc) { t4_set_reg_field(sc, A_SGE_CONTROL, F_GLOBALENABLE, 0); t4_intr_disable(sc); log(LOG_EMERG, "%s: encountered fatal error, adapter stopped.\n", device_get_nameunit(sc->dev)); } static int map_bars_0_and_4(struct adapter *sc) { sc->regs_rid = PCIR_BAR(0); sc->regs_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY, &sc->regs_rid, RF_ACTIVE); if (sc->regs_res == NULL) { device_printf(sc->dev, "cannot map registers.\n"); return (ENXIO); } sc->bt = rman_get_bustag(sc->regs_res); sc->bh = rman_get_bushandle(sc->regs_res); sc->mmio_len = rman_get_size(sc->regs_res); setbit(&sc->doorbells, DOORBELL_KDB); sc->msix_rid = PCIR_BAR(4); sc->msix_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY, &sc->msix_rid, RF_ACTIVE); if (sc->msix_res == NULL) { device_printf(sc->dev, "cannot map MSI-X BAR.\n"); return (ENXIO); } return (0); } static int map_bar_2(struct adapter *sc) { /* * T4: only iWARP driver uses the userspace doorbells. There is no need * to map it if RDMA is disabled. */ if (is_t4(sc) && sc->rdmacaps == 0) return (0); sc->udbs_rid = PCIR_BAR(2); sc->udbs_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY, &sc->udbs_rid, RF_ACTIVE); if (sc->udbs_res == NULL) { device_printf(sc->dev, "cannot map doorbell BAR.\n"); return (ENXIO); } sc->udbs_base = rman_get_virtual(sc->udbs_res); if (is_t5(sc)) { setbit(&sc->doorbells, DOORBELL_UDB); #if defined(__i386__) || defined(__amd64__) if (t5_write_combine) { int rc; /* * Enable write combining on BAR2. This is the * userspace doorbell BAR and is split into 128B * (UDBS_SEG_SIZE) doorbell regions, each associated * with an egress queue. The first 64B has the doorbell * and the second 64B can be used to submit a tx work * request with an implicit doorbell. */ rc = pmap_change_attr((vm_offset_t)sc->udbs_base, rman_get_size(sc->udbs_res), PAT_WRITE_COMBINING); if (rc == 0) { clrbit(&sc->doorbells, DOORBELL_UDB); setbit(&sc->doorbells, DOORBELL_WCWR); setbit(&sc->doorbells, DOORBELL_UDBWC); } else { device_printf(sc->dev, "couldn't enable write combining: %d\n", rc); } t4_write_reg(sc, A_SGE_STAT_CFG, V_STATSOURCE_T5(7) | V_STATMODE(0)); } #endif } return (0); } static const struct memwin t4_memwin[] = { { MEMWIN0_BASE, MEMWIN0_APERTURE }, { MEMWIN1_BASE, MEMWIN1_APERTURE }, { MEMWIN2_BASE_T4, MEMWIN2_APERTURE_T4 } }; static const struct memwin t5_memwin[] = { { MEMWIN0_BASE, MEMWIN0_APERTURE }, { MEMWIN1_BASE, MEMWIN1_APERTURE }, { MEMWIN2_BASE_T5, MEMWIN2_APERTURE_T5 }, }; static void setup_memwin(struct adapter *sc) { const struct memwin *mw; int i, n; uint32_t bar0; if (is_t4(sc)) { /* * Read low 32b of bar0 indirectly via the hardware backdoor * mechanism. Works from within PCI passthrough environments * too, where rman_get_start() can return a different value. We * need to program the T4 memory window decoders with the actual * addresses that will be coming across the PCIe link. */ bar0 = t4_hw_pci_read_cfg4(sc, PCIR_BAR(0)); bar0 &= (uint32_t) PCIM_BAR_MEM_BASE; mw = &t4_memwin[0]; n = nitems(t4_memwin); } else { /* T5 uses the relative offset inside the PCIe BAR */ bar0 = 0; mw = &t5_memwin[0]; n = nitems(t5_memwin); } for (i = 0; i < n; i++, mw++) { t4_write_reg(sc, PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_BASE_WIN, i), (mw->base + bar0) | V_BIR(0) | V_WINDOW(ilog2(mw->aperture) - 10)); } /* flush */ t4_read_reg(sc, PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_BASE_WIN, 2)); } /* * Verify that the memory range specified by the addr/len pair is valid and lies * entirely within a single region (EDCx or MCx). */ static int validate_mem_range(struct adapter *sc, uint32_t addr, int len) { uint32_t em, addr_len, maddr, mlen; /* Memory can only be accessed in naturally aligned 4 byte units */ if (addr & 3 || len & 3 || len == 0) return (EINVAL); /* Enabled memories */ em = t4_read_reg(sc, A_MA_TARGET_MEM_ENABLE); if (em & F_EDRAM0_ENABLE) { addr_len = t4_read_reg(sc, A_MA_EDRAM0_BAR); maddr = G_EDRAM0_BASE(addr_len) << 20; mlen = G_EDRAM0_SIZE(addr_len) << 20; if (mlen > 0 && addr >= maddr && addr < maddr + mlen && addr + len <= maddr + mlen) return (0); } if (em & F_EDRAM1_ENABLE) { addr_len = t4_read_reg(sc, A_MA_EDRAM1_BAR); maddr = G_EDRAM1_BASE(addr_len) << 20; mlen = G_EDRAM1_SIZE(addr_len) << 20; if (mlen > 0 && addr >= maddr && addr < maddr + mlen && addr + len <= maddr + mlen) return (0); } if (em & F_EXT_MEM_ENABLE) { addr_len = t4_read_reg(sc, A_MA_EXT_MEMORY_BAR); maddr = G_EXT_MEM_BASE(addr_len) << 20; mlen = G_EXT_MEM_SIZE(addr_len) << 20; if (mlen > 0 && addr >= maddr && addr < maddr + mlen && addr + len <= maddr + mlen) return (0); } if (!is_t4(sc) && em & F_EXT_MEM1_ENABLE) { addr_len = t4_read_reg(sc, A_MA_EXT_MEMORY1_BAR); maddr = G_EXT_MEM1_BASE(addr_len) << 20; mlen = G_EXT_MEM1_SIZE(addr_len) << 20; if (mlen > 0 && addr >= maddr && addr < maddr + mlen && addr + len <= maddr + mlen) return (0); } return (EFAULT); } static int fwmtype_to_hwmtype(int mtype) { switch (mtype) { case FW_MEMTYPE_EDC0: return (MEM_EDC0); case FW_MEMTYPE_EDC1: return (MEM_EDC1); case FW_MEMTYPE_EXTMEM: return (MEM_MC0); case FW_MEMTYPE_EXTMEM1: return (MEM_MC1); default: panic("%s: cannot translate fw mtype %d.", __func__, mtype); } } /* * Verify that the memory range specified by the memtype/offset/len pair is * valid and lies entirely within the memtype specified. The global address of * the start of the range is returned in addr. */ static int validate_mt_off_len(struct adapter *sc, int mtype, uint32_t off, int len, uint32_t *addr) { uint32_t em, addr_len, maddr, mlen; /* Memory can only be accessed in naturally aligned 4 byte units */ if (off & 3 || len & 3 || len == 0) return (EINVAL); em = t4_read_reg(sc, A_MA_TARGET_MEM_ENABLE); switch (fwmtype_to_hwmtype(mtype)) { case MEM_EDC0: if (!(em & F_EDRAM0_ENABLE)) return (EINVAL); addr_len = t4_read_reg(sc, A_MA_EDRAM0_BAR); maddr = G_EDRAM0_BASE(addr_len) << 20; mlen = G_EDRAM0_SIZE(addr_len) << 20; break; case MEM_EDC1: if (!(em & F_EDRAM1_ENABLE)) return (EINVAL); addr_len = t4_read_reg(sc, A_MA_EDRAM1_BAR); maddr = G_EDRAM1_BASE(addr_len) << 20; mlen = G_EDRAM1_SIZE(addr_len) << 20; break; case MEM_MC: if (!(em & F_EXT_MEM_ENABLE)) return (EINVAL); addr_len = t4_read_reg(sc, A_MA_EXT_MEMORY_BAR); maddr = G_EXT_MEM_BASE(addr_len) << 20; mlen = G_EXT_MEM_SIZE(addr_len) << 20; break; case MEM_MC1: if (is_t4(sc) || !(em & F_EXT_MEM1_ENABLE)) return (EINVAL); addr_len = t4_read_reg(sc, A_MA_EXT_MEMORY1_BAR); maddr = G_EXT_MEM1_BASE(addr_len) << 20; mlen = G_EXT_MEM1_SIZE(addr_len) << 20; break; default: return (EINVAL); } if (mlen > 0 && off < mlen && off + len <= mlen) { *addr = maddr + off; /* global address */ return (0); } return (EFAULT); } static void memwin_info(struct adapter *sc, int win, uint32_t *base, uint32_t *aperture) { const struct memwin *mw; if (is_t4(sc)) { KASSERT(win >= 0 && win < nitems(t4_memwin), ("%s: incorrect memwin# (%d)", __func__, win)); mw = &t4_memwin[win]; } else { KASSERT(win >= 0 && win < nitems(t5_memwin), ("%s: incorrect memwin# (%d)", __func__, win)); mw = &t5_memwin[win]; } if (base != NULL) *base = mw->base; if (aperture != NULL) *aperture = mw->aperture; } /* * Positions the memory window such that it can be used to access the specified * address in the chip's address space. The return value is the offset of addr * from the start of the window. */ static uint32_t position_memwin(struct adapter *sc, int n, uint32_t addr) { uint32_t start, pf; uint32_t reg; KASSERT(n >= 0 && n <= 3, ("%s: invalid window %d.", __func__, n)); KASSERT((addr & 3) == 0, ("%s: addr (0x%x) is not at a 4B boundary.", __func__, addr)); if (is_t4(sc)) { pf = 0; start = addr & ~0xf; /* start must be 16B aligned */ } else { pf = V_PFNUM(sc->pf); start = addr & ~0x7f; /* start must be 128B aligned */ } reg = PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_OFFSET, n); t4_write_reg(sc, reg, start | pf); t4_read_reg(sc, reg); return (addr - start); } static int cfg_itype_and_nqueues(struct adapter *sc, int n10g, int n1g, struct intrs_and_queues *iaq) { int rc, itype, navail, nrxq10g, nrxq1g, n; int nofldrxq10g = 0, nofldrxq1g = 0; int nnmrxq10g = 0, nnmrxq1g = 0; bzero(iaq, sizeof(*iaq)); iaq->ntxq10g = t4_ntxq10g; iaq->ntxq1g = t4_ntxq1g; iaq->nrxq10g = nrxq10g = t4_nrxq10g; iaq->nrxq1g = nrxq1g = t4_nrxq1g; iaq->rsrv_noflowq = t4_rsrv_noflowq; #ifdef TCP_OFFLOAD if (is_offload(sc)) { iaq->nofldtxq10g = t4_nofldtxq10g; iaq->nofldtxq1g = t4_nofldtxq1g; iaq->nofldrxq10g = nofldrxq10g = t4_nofldrxq10g; iaq->nofldrxq1g = nofldrxq1g = t4_nofldrxq1g; } #endif #ifdef DEV_NETMAP iaq->nnmtxq10g = t4_nnmtxq10g; iaq->nnmtxq1g = t4_nnmtxq1g; iaq->nnmrxq10g = nnmrxq10g = t4_nnmrxq10g; iaq->nnmrxq1g = nnmrxq1g = t4_nnmrxq1g; #endif for (itype = INTR_MSIX; itype; itype >>= 1) { if ((itype & t4_intr_types) == 0) continue; /* not allowed */ if (itype == INTR_MSIX) navail = pci_msix_count(sc->dev); else if (itype == INTR_MSI) navail = pci_msi_count(sc->dev); else navail = 1; restart: if (navail == 0) continue; iaq->intr_type = itype; iaq->intr_flags_10g = 0; iaq->intr_flags_1g = 0; /* * Best option: an interrupt vector for errors, one for the * firmware event queue, and one for every rxq (NIC, TOE, and * netmap). */ iaq->nirq = T4_EXTRA_INTR; iaq->nirq += n10g * (nrxq10g + nofldrxq10g + nnmrxq10g); iaq->nirq += n1g * (nrxq1g + nofldrxq1g + nnmrxq1g); if (iaq->nirq <= navail && (itype != INTR_MSI || powerof2(iaq->nirq))) { iaq->intr_flags_10g = INTR_ALL; iaq->intr_flags_1g = INTR_ALL; goto allocate; } /* * Second best option: a vector for errors, one for the firmware * event queue, and vectors for either all the NIC rx queues or * all the TOE rx queues. The queues that don't get vectors * will forward their interrupts to those that do. * * Note: netmap rx queues cannot be created early and so they * can't be setup to receive forwarded interrupts for others. */ iaq->nirq = T4_EXTRA_INTR; if (nrxq10g >= nofldrxq10g) { iaq->intr_flags_10g = INTR_RXQ; iaq->nirq += n10g * nrxq10g; #ifdef DEV_NETMAP iaq->nnmrxq10g = min(nnmrxq10g, nrxq10g); #endif } else { iaq->intr_flags_10g = INTR_OFLD_RXQ; iaq->nirq += n10g * nofldrxq10g; #ifdef DEV_NETMAP iaq->nnmrxq10g = min(nnmrxq10g, nofldrxq10g); #endif } if (nrxq1g >= nofldrxq1g) { iaq->intr_flags_1g = INTR_RXQ; iaq->nirq += n1g * nrxq1g; #ifdef DEV_NETMAP iaq->nnmrxq1g = min(nnmrxq1g, nrxq1g); #endif } else { iaq->intr_flags_1g = INTR_OFLD_RXQ; iaq->nirq += n1g * nofldrxq1g; #ifdef DEV_NETMAP iaq->nnmrxq1g = min(nnmrxq1g, nofldrxq1g); #endif } if (iaq->nirq <= navail && (itype != INTR_MSI || powerof2(iaq->nirq))) goto allocate; /* * Next best option: an interrupt vector for errors, one for the * firmware event queue, and at least one per port. At this * point we know we'll have to downsize nrxq and/or nofldrxq * and/or nnmrxq to fit what's available to us. */ iaq->nirq = T4_EXTRA_INTR; iaq->nirq += n10g + n1g; if (iaq->nirq <= navail) { int leftover = navail - iaq->nirq; if (n10g > 0) { int target = max(nrxq10g, nofldrxq10g); iaq->intr_flags_10g = nrxq10g >= nofldrxq10g ? INTR_RXQ : INTR_OFLD_RXQ; n = 1; while (n < target && leftover >= n10g) { leftover -= n10g; iaq->nirq += n10g; n++; } iaq->nrxq10g = min(n, nrxq10g); #ifdef TCP_OFFLOAD iaq->nofldrxq10g = min(n, nofldrxq10g); #endif #ifdef DEV_NETMAP iaq->nnmrxq10g = min(n, nnmrxq10g); #endif } if (n1g > 0) { int target = max(nrxq1g, nofldrxq1g); iaq->intr_flags_1g = nrxq1g >= nofldrxq1g ? INTR_RXQ : INTR_OFLD_RXQ; n = 1; while (n < target && leftover >= n1g) { leftover -= n1g; iaq->nirq += n1g; n++; } iaq->nrxq1g = min(n, nrxq1g); #ifdef TCP_OFFLOAD iaq->nofldrxq1g = min(n, nofldrxq1g); #endif #ifdef DEV_NETMAP iaq->nnmrxq1g = min(n, nnmrxq1g); #endif } if (itype != INTR_MSI || powerof2(iaq->nirq)) goto allocate; } /* * Least desirable option: one interrupt vector for everything. */ iaq->nirq = iaq->nrxq10g = iaq->nrxq1g = 1; iaq->intr_flags_10g = iaq->intr_flags_1g = 0; #ifdef TCP_OFFLOAD if (is_offload(sc)) iaq->nofldrxq10g = iaq->nofldrxq1g = 1; #endif #ifdef DEV_NETMAP iaq->nnmrxq10g = iaq->nnmrxq1g = 1; #endif allocate: navail = iaq->nirq; rc = 0; if (itype == INTR_MSIX) rc = pci_alloc_msix(sc->dev, &navail); else if (itype == INTR_MSI) rc = pci_alloc_msi(sc->dev, &navail); if (rc == 0) { if (navail == iaq->nirq) return (0); /* * Didn't get the number requested. Use whatever number * the kernel is willing to allocate (it's in navail). */ device_printf(sc->dev, "fewer vectors than requested, " "type=%d, req=%d, rcvd=%d; will downshift req.\n", itype, iaq->nirq, navail); pci_release_msi(sc->dev); goto restart; } device_printf(sc->dev, "failed to allocate vectors:%d, type=%d, req=%d, rcvd=%d\n", itype, rc, iaq->nirq, navail); } device_printf(sc->dev, "failed to find a usable interrupt type. " "allowed=%d, msi-x=%d, msi=%d, intx=1", t4_intr_types, pci_msix_count(sc->dev), pci_msi_count(sc->dev)); return (ENXIO); } #define FW_VERSION(chip) ( \ V_FW_HDR_FW_VER_MAJOR(chip##FW_VERSION_MAJOR) | \ V_FW_HDR_FW_VER_MINOR(chip##FW_VERSION_MINOR) | \ V_FW_HDR_FW_VER_MICRO(chip##FW_VERSION_MICRO) | \ V_FW_HDR_FW_VER_BUILD(chip##FW_VERSION_BUILD)) #define FW_INTFVER(chip, intf) (chip##FW_HDR_INTFVER_##intf) struct fw_info { uint8_t chip; char *kld_name; char *fw_mod_name; struct fw_hdr fw_hdr; /* XXX: waste of space, need a sparse struct */ } fw_info[] = { { .chip = CHELSIO_T4, .kld_name = "t4fw_cfg", .fw_mod_name = "t4fw", .fw_hdr = { .chip = FW_HDR_CHIP_T4, .fw_ver = htobe32_const(FW_VERSION(T4)), .intfver_nic = FW_INTFVER(T4, NIC), .intfver_vnic = FW_INTFVER(T4, VNIC), .intfver_ofld = FW_INTFVER(T4, OFLD), .intfver_ri = FW_INTFVER(T4, RI), .intfver_iscsipdu = FW_INTFVER(T4, ISCSIPDU), .intfver_iscsi = FW_INTFVER(T4, ISCSI), .intfver_fcoepdu = FW_INTFVER(T4, FCOEPDU), .intfver_fcoe = FW_INTFVER(T4, FCOE), }, }, { .chip = CHELSIO_T5, .kld_name = "t5fw_cfg", .fw_mod_name = "t5fw", .fw_hdr = { .chip = FW_HDR_CHIP_T5, .fw_ver = htobe32_const(FW_VERSION(T5)), .intfver_nic = FW_INTFVER(T5, NIC), .intfver_vnic = FW_INTFVER(T5, VNIC), .intfver_ofld = FW_INTFVER(T5, OFLD), .intfver_ri = FW_INTFVER(T5, RI), .intfver_iscsipdu = FW_INTFVER(T5, ISCSIPDU), .intfver_iscsi = FW_INTFVER(T5, ISCSI), .intfver_fcoepdu = FW_INTFVER(T5, FCOEPDU), .intfver_fcoe = FW_INTFVER(T5, FCOE), }, } }; static struct fw_info * find_fw_info(int chip) { int i; for (i = 0; i < nitems(fw_info); i++) { if (fw_info[i].chip == chip) return (&fw_info[i]); } return (NULL); } /* * Is the given firmware API compatible with the one the driver was compiled * with? */ static int fw_compatible(const struct fw_hdr *hdr1, const struct fw_hdr *hdr2) { /* short circuit if it's the exact same firmware version */ if (hdr1->chip == hdr2->chip && hdr1->fw_ver == hdr2->fw_ver) return (1); /* * XXX: Is this too conservative? Perhaps I should limit this to the * features that are supported in the driver. */ #define SAME_INTF(x) (hdr1->intfver_##x == hdr2->intfver_##x) if (hdr1->chip == hdr2->chip && SAME_INTF(nic) && SAME_INTF(vnic) && SAME_INTF(ofld) && SAME_INTF(ri) && SAME_INTF(iscsipdu) && SAME_INTF(iscsi) && SAME_INTF(fcoepdu) && SAME_INTF(fcoe)) return (1); #undef SAME_INTF return (0); } /* * The firmware in the KLD is usable, but should it be installed? This routine * explains itself in detail if it indicates the KLD firmware should be * installed. */ static int should_install_kld_fw(struct adapter *sc, int card_fw_usable, int k, int c) { const char *reason; if (!card_fw_usable) { reason = "incompatible or unusable"; goto install; } if (k > c) { reason = "older than the version bundled with this driver"; goto install; } if (t4_fw_install == 2 && k != c) { reason = "different than the version bundled with this driver"; goto install; } return (0); install: if (t4_fw_install == 0) { device_printf(sc->dev, "firmware on card (%u.%u.%u.%u) is %s, " "but the driver is prohibited from installing a different " "firmware on the card.\n", G_FW_HDR_FW_VER_MAJOR(c), G_FW_HDR_FW_VER_MINOR(c), G_FW_HDR_FW_VER_MICRO(c), G_FW_HDR_FW_VER_BUILD(c), reason); return (0); } device_printf(sc->dev, "firmware on card (%u.%u.%u.%u) is %s, " "installing firmware %u.%u.%u.%u on card.\n", G_FW_HDR_FW_VER_MAJOR(c), G_FW_HDR_FW_VER_MINOR(c), G_FW_HDR_FW_VER_MICRO(c), G_FW_HDR_FW_VER_BUILD(c), reason, G_FW_HDR_FW_VER_MAJOR(k), G_FW_HDR_FW_VER_MINOR(k), G_FW_HDR_FW_VER_MICRO(k), G_FW_HDR_FW_VER_BUILD(k)); return (1); } /* * Establish contact with the firmware and determine if we are the master driver * or not, and whether we are responsible for chip initialization. */ static int prep_firmware(struct adapter *sc) { const struct firmware *fw = NULL, *default_cfg; int rc, pf, card_fw_usable, kld_fw_usable, need_fw_reset = 1; enum dev_state state; struct fw_info *fw_info; struct fw_hdr *card_fw; /* fw on the card */ const struct fw_hdr *kld_fw; /* fw in the KLD */ const struct fw_hdr *drv_fw; /* fw header the driver was compiled against */ /* Contact firmware. */ rc = t4_fw_hello(sc, sc->mbox, sc->mbox, MASTER_MAY, &state); if (rc < 0 || state == DEV_STATE_ERR) { rc = -rc; device_printf(sc->dev, "failed to connect to the firmware: %d, %d.\n", rc, state); return (rc); } pf = rc; if (pf == sc->mbox) sc->flags |= MASTER_PF; else if (state == DEV_STATE_UNINIT) { /* * We didn't get to be the master so we definitely won't be * configuring the chip. It's a bug if someone else hasn't * configured it already. */ device_printf(sc->dev, "couldn't be master(%d), " "device not already initialized either(%d).\n", rc, state); return (EDOOFUS); } /* This is the firmware whose headers the driver was compiled against */ fw_info = find_fw_info(chip_id(sc)); if (fw_info == NULL) { device_printf(sc->dev, "unable to look up firmware information for chip %d.\n", chip_id(sc)); return (EINVAL); } drv_fw = &fw_info->fw_hdr; /* * The firmware KLD contains many modules. The KLD name is also the * name of the module that contains the default config file. */ default_cfg = firmware_get(fw_info->kld_name); /* Read the header of the firmware on the card */ card_fw = malloc(sizeof(*card_fw), M_CXGBE, M_ZERO | M_WAITOK); rc = -t4_read_flash(sc, FLASH_FW_START, sizeof (*card_fw) / sizeof (uint32_t), (uint32_t *)card_fw, 1); if (rc == 0) card_fw_usable = fw_compatible(drv_fw, (const void*)card_fw); else { device_printf(sc->dev, "Unable to read card's firmware header: %d\n", rc); card_fw_usable = 0; } /* This is the firmware in the KLD */ fw = firmware_get(fw_info->fw_mod_name); if (fw != NULL) { kld_fw = (const void *)fw->data; kld_fw_usable = fw_compatible(drv_fw, kld_fw); } else { kld_fw = NULL; kld_fw_usable = 0; } if (card_fw_usable && card_fw->fw_ver == drv_fw->fw_ver && (!kld_fw_usable || kld_fw->fw_ver == drv_fw->fw_ver)) { /* * Common case: the firmware on the card is an exact match and * the KLD is an exact match too, or the KLD is * absent/incompatible. Note that t4_fw_install = 2 is ignored * here -- use cxgbetool loadfw if you want to reinstall the * same firmware as the one on the card. */ } else if (kld_fw_usable && state == DEV_STATE_UNINIT && should_install_kld_fw(sc, card_fw_usable, be32toh(kld_fw->fw_ver), be32toh(card_fw->fw_ver))) { rc = -t4_fw_upgrade(sc, sc->mbox, fw->data, fw->datasize, 0); if (rc != 0) { device_printf(sc->dev, "failed to install firmware: %d\n", rc); goto done; } /* Installed successfully, update the cached header too. */ memcpy(card_fw, kld_fw, sizeof(*card_fw)); card_fw_usable = 1; need_fw_reset = 0; /* already reset as part of load_fw */ } if (!card_fw_usable) { uint32_t d, c, k; d = ntohl(drv_fw->fw_ver); c = ntohl(card_fw->fw_ver); k = kld_fw ? ntohl(kld_fw->fw_ver) : 0; device_printf(sc->dev, "Cannot find a usable firmware: " "fw_install %d, chip state %d, " "driver compiled with %d.%d.%d.%d, " "card has %d.%d.%d.%d, KLD has %d.%d.%d.%d\n", t4_fw_install, state, G_FW_HDR_FW_VER_MAJOR(d), G_FW_HDR_FW_VER_MINOR(d), G_FW_HDR_FW_VER_MICRO(d), G_FW_HDR_FW_VER_BUILD(d), G_FW_HDR_FW_VER_MAJOR(c), G_FW_HDR_FW_VER_MINOR(c), G_FW_HDR_FW_VER_MICRO(c), G_FW_HDR_FW_VER_BUILD(c), G_FW_HDR_FW_VER_MAJOR(k), G_FW_HDR_FW_VER_MINOR(k), G_FW_HDR_FW_VER_MICRO(k), G_FW_HDR_FW_VER_BUILD(k)); rc = EINVAL; goto done; } /* We're using whatever's on the card and it's known to be good. */ sc->params.fw_vers = ntohl(card_fw->fw_ver); snprintf(sc->fw_version, sizeof(sc->fw_version), "%u.%u.%u.%u", G_FW_HDR_FW_VER_MAJOR(sc->params.fw_vers), G_FW_HDR_FW_VER_MINOR(sc->params.fw_vers), G_FW_HDR_FW_VER_MICRO(sc->params.fw_vers), G_FW_HDR_FW_VER_BUILD(sc->params.fw_vers)); t4_get_tp_version(sc, &sc->params.tp_vers); /* Reset device */ if (need_fw_reset && (rc = -t4_fw_reset(sc, sc->mbox, F_PIORSTMODE | F_PIORST)) != 0) { device_printf(sc->dev, "firmware reset failed: %d.\n", rc); if (rc != ETIMEDOUT && rc != EIO) t4_fw_bye(sc, sc->mbox); goto done; } sc->flags |= FW_OK; rc = get_params__pre_init(sc); if (rc != 0) goto done; /* error message displayed already */ /* Partition adapter resources as specified in the config file. */ if (state == DEV_STATE_UNINIT) { KASSERT(sc->flags & MASTER_PF, ("%s: trying to change chip settings when not master.", __func__)); rc = partition_resources(sc, default_cfg, fw_info->kld_name); if (rc != 0) goto done; /* error message displayed already */ t4_tweak_chip_settings(sc); /* get basic stuff going */ rc = -t4_fw_initialize(sc, sc->mbox); if (rc != 0) { device_printf(sc->dev, "fw init failed: %d.\n", rc); goto done; } } else { snprintf(sc->cfg_file, sizeof(sc->cfg_file), "pf%d", pf); sc->cfcsum = 0; } done: free(card_fw, M_CXGBE); if (fw != NULL) firmware_put(fw, FIRMWARE_UNLOAD); if (default_cfg != NULL) firmware_put(default_cfg, FIRMWARE_UNLOAD); return (rc); } #define FW_PARAM_DEV(param) \ (V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) | \ V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_##param)) #define FW_PARAM_PFVF(param) \ (V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_PFVF) | \ V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_PFVF_##param)) /* * Partition chip resources for use between various PFs, VFs, etc. */ static int partition_resources(struct adapter *sc, const struct firmware *default_cfg, const char *name_prefix) { const struct firmware *cfg = NULL; int rc = 0; struct fw_caps_config_cmd caps; uint32_t mtype, moff, finicsum, cfcsum; /* * Figure out what configuration file to use. Pick the default config * file for the card if the user hasn't specified one explicitly. */ snprintf(sc->cfg_file, sizeof(sc->cfg_file), "%s", t4_cfg_file); if (strncmp(t4_cfg_file, DEFAULT_CF, sizeof(t4_cfg_file)) == 0) { /* Card specific overrides go here. */ if (pci_get_device(sc->dev) == 0x440a) snprintf(sc->cfg_file, sizeof(sc->cfg_file), UWIRE_CF); if (is_fpga(sc)) snprintf(sc->cfg_file, sizeof(sc->cfg_file), FPGA_CF); } /* * We need to load another module if the profile is anything except * "default" or "flash". */ if (strncmp(sc->cfg_file, DEFAULT_CF, sizeof(sc->cfg_file)) != 0 && strncmp(sc->cfg_file, FLASH_CF, sizeof(sc->cfg_file)) != 0) { char s[32]; snprintf(s, sizeof(s), "%s_%s", name_prefix, sc->cfg_file); cfg = firmware_get(s); if (cfg == NULL) { if (default_cfg != NULL) { device_printf(sc->dev, "unable to load module \"%s\" for " "configuration profile \"%s\", will use " "the default config file instead.\n", s, sc->cfg_file); snprintf(sc->cfg_file, sizeof(sc->cfg_file), "%s", DEFAULT_CF); } else { device_printf(sc->dev, "unable to load module \"%s\" for " "configuration profile \"%s\", will use " "the config file on the card's flash " "instead.\n", s, sc->cfg_file); snprintf(sc->cfg_file, sizeof(sc->cfg_file), "%s", FLASH_CF); } } } if (strncmp(sc->cfg_file, DEFAULT_CF, sizeof(sc->cfg_file)) == 0 && default_cfg == NULL) { device_printf(sc->dev, "default config file not available, will use the config " "file on the card's flash instead.\n"); snprintf(sc->cfg_file, sizeof(sc->cfg_file), "%s", FLASH_CF); } if (strncmp(sc->cfg_file, FLASH_CF, sizeof(sc->cfg_file)) != 0) { u_int cflen, i, n; const uint32_t *cfdata; uint32_t param, val, addr, off, mw_base, mw_aperture; KASSERT(cfg != NULL || default_cfg != NULL, ("%s: no config to upload", __func__)); /* * Ask the firmware where it wants us to upload the config file. */ param = FW_PARAM_DEV(CF); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, ¶m, &val); if (rc != 0) { /* No support for config file? Shouldn't happen. */ device_printf(sc->dev, "failed to query config file location: %d.\n", rc); goto done; } mtype = G_FW_PARAMS_PARAM_Y(val); moff = G_FW_PARAMS_PARAM_Z(val) << 16; /* * XXX: sheer laziness. We deliberately added 4 bytes of * useless stuffing/comments at the end of the config file so * it's ok to simply throw away the last remaining bytes when * the config file is not an exact multiple of 4. This also * helps with the validate_mt_off_len check. */ if (cfg != NULL) { cflen = cfg->datasize & ~3; cfdata = cfg->data; } else { cflen = default_cfg->datasize & ~3; cfdata = default_cfg->data; } if (cflen > FLASH_CFG_MAX_SIZE) { device_printf(sc->dev, "config file too long (%d, max allowed is %d). " "Will try to use the config on the card, if any.\n", cflen, FLASH_CFG_MAX_SIZE); goto use_config_on_flash; } rc = validate_mt_off_len(sc, mtype, moff, cflen, &addr); if (rc != 0) { device_printf(sc->dev, "%s: addr (%d/0x%x) or len %d is not valid: %d. " "Will try to use the config on the card, if any.\n", __func__, mtype, moff, cflen, rc); goto use_config_on_flash; } memwin_info(sc, 2, &mw_base, &mw_aperture); while (cflen) { off = position_memwin(sc, 2, addr); n = min(cflen, mw_aperture - off); for (i = 0; i < n; i += 4) t4_write_reg(sc, mw_base + off + i, *cfdata++); cflen -= n; addr += n; } } else { use_config_on_flash: mtype = FW_MEMTYPE_FLASH; moff = t4_flash_cfg_addr(sc); } bzero(&caps, sizeof(caps)); caps.op_to_write = htobe32(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_READ); caps.cfvalid_to_len16 = htobe32(F_FW_CAPS_CONFIG_CMD_CFVALID | V_FW_CAPS_CONFIG_CMD_MEMTYPE_CF(mtype) | V_FW_CAPS_CONFIG_CMD_MEMADDR64K_CF(moff >> 16) | FW_LEN16(caps)); rc = -t4_wr_mbox(sc, sc->mbox, &caps, sizeof(caps), &caps); if (rc != 0) { device_printf(sc->dev, "failed to pre-process config file: %d " "(mtype %d, moff 0x%x).\n", rc, mtype, moff); goto done; } finicsum = be32toh(caps.finicsum); cfcsum = be32toh(caps.cfcsum); if (finicsum != cfcsum) { device_printf(sc->dev, "WARNING: config file checksum mismatch: %08x %08x\n", finicsum, cfcsum); } sc->cfcsum = cfcsum; #define LIMIT_CAPS(x) do { \ caps.x &= htobe16(t4_##x##_allowed); \ } while (0) /* * Let the firmware know what features will (not) be used so it can tune * things accordingly. */ LIMIT_CAPS(linkcaps); LIMIT_CAPS(niccaps); LIMIT_CAPS(toecaps); LIMIT_CAPS(rdmacaps); LIMIT_CAPS(iscsicaps); LIMIT_CAPS(fcoecaps); #undef LIMIT_CAPS caps.op_to_write = htobe32(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_WRITE); caps.cfvalid_to_len16 = htobe32(FW_LEN16(caps)); rc = -t4_wr_mbox(sc, sc->mbox, &caps, sizeof(caps), NULL); if (rc != 0) { device_printf(sc->dev, "failed to process config file: %d.\n", rc); } done: if (cfg != NULL) firmware_put(cfg, FIRMWARE_UNLOAD); return (rc); } /* * Retrieve parameters that are needed (or nice to have) very early. */ static int get_params__pre_init(struct adapter *sc) { int rc; uint32_t param[2], val[2]; struct fw_devlog_cmd cmd; struct devlog_params *dlog = &sc->params.devlog; param[0] = FW_PARAM_DEV(PORTVEC); param[1] = FW_PARAM_DEV(CCLK); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 2, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query parameters (pre_init): %d.\n", rc); return (rc); } sc->params.portvec = val[0]; sc->params.nports = bitcount32(val[0]); sc->params.vpd.cclk = val[1]; /* Read device log parameters. */ bzero(&cmd, sizeof(cmd)); cmd.op_to_write = htobe32(V_FW_CMD_OP(FW_DEVLOG_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_READ); cmd.retval_len16 = htobe32(FW_LEN16(cmd)); rc = -t4_wr_mbox(sc, sc->mbox, &cmd, sizeof(cmd), &cmd); if (rc != 0) { device_printf(sc->dev, "failed to get devlog parameters: %d.\n", rc); bzero(dlog, sizeof (*dlog)); rc = 0; /* devlog isn't critical for device operation */ } else { val[0] = be32toh(cmd.memtype_devlog_memaddr16_devlog); dlog->memtype = G_FW_DEVLOG_CMD_MEMTYPE_DEVLOG(val[0]); dlog->start = G_FW_DEVLOG_CMD_MEMADDR16_DEVLOG(val[0]) << 4; dlog->size = be32toh(cmd.memsize_devlog); } return (rc); } /* * Retrieve various parameters that are of interest to the driver. The device * has been initialized by the firmware at this point. */ static int get_params__post_init(struct adapter *sc) { int rc; uint32_t param[7], val[7]; struct fw_caps_config_cmd caps; param[0] = FW_PARAM_PFVF(IQFLINT_START); param[1] = FW_PARAM_PFVF(EQ_START); param[2] = FW_PARAM_PFVF(FILTER_START); param[3] = FW_PARAM_PFVF(FILTER_END); param[4] = FW_PARAM_PFVF(L2T_START); param[5] = FW_PARAM_PFVF(L2T_END); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 6, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query parameters (post_init): %d.\n", rc); return (rc); } sc->sge.iq_start = val[0]; sc->sge.eq_start = val[1]; sc->tids.ftid_base = val[2]; sc->tids.nftids = val[3] - val[2] + 1; sc->params.ftid_min = val[2]; sc->params.ftid_max = val[3]; sc->vres.l2t.start = val[4]; sc->vres.l2t.size = val[5] - val[4] + 1; KASSERT(sc->vres.l2t.size <= L2T_SIZE, ("%s: L2 table size (%u) larger than expected (%u)", __func__, sc->vres.l2t.size, L2T_SIZE)); /* get capabilites */ bzero(&caps, sizeof(caps)); caps.op_to_write = htobe32(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_READ); caps.cfvalid_to_len16 = htobe32(FW_LEN16(caps)); rc = -t4_wr_mbox(sc, sc->mbox, &caps, sizeof(caps), &caps); if (rc != 0) { device_printf(sc->dev, "failed to get card capabilities: %d.\n", rc); return (rc); } #define READ_CAPS(x) do { \ sc->x = htobe16(caps.x); \ } while (0) READ_CAPS(linkcaps); READ_CAPS(niccaps); READ_CAPS(toecaps); READ_CAPS(rdmacaps); READ_CAPS(iscsicaps); READ_CAPS(fcoecaps); if (sc->niccaps & FW_CAPS_CONFIG_NIC_ETHOFLD) { param[0] = FW_PARAM_PFVF(ETHOFLD_START); param[1] = FW_PARAM_PFVF(ETHOFLD_END); param[2] = FW_PARAM_DEV(FLOWC_BUFFIFO_SZ); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 3, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query NIC parameters: %d.\n", rc); return (rc); } sc->tids.etid_base = val[0]; sc->params.etid_min = val[0]; sc->tids.netids = val[1] - val[0] + 1; sc->params.netids = sc->tids.netids; sc->params.eo_wr_cred = val[2]; sc->params.ethoffload = 1; } if (sc->toecaps) { /* query offload-related parameters */ param[0] = FW_PARAM_DEV(NTID); param[1] = FW_PARAM_PFVF(SERVER_START); param[2] = FW_PARAM_PFVF(SERVER_END); param[3] = FW_PARAM_PFVF(TDDP_START); param[4] = FW_PARAM_PFVF(TDDP_END); param[5] = FW_PARAM_DEV(FLOWC_BUFFIFO_SZ); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 6, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query TOE parameters: %d.\n", rc); return (rc); } sc->tids.ntids = val[0]; sc->tids.natids = min(sc->tids.ntids / 2, MAX_ATIDS); sc->tids.stid_base = val[1]; sc->tids.nstids = val[2] - val[1] + 1; sc->vres.ddp.start = val[3]; sc->vres.ddp.size = val[4] - val[3] + 1; sc->params.ofldq_wr_cred = val[5]; sc->params.offload = 1; } if (sc->rdmacaps) { param[0] = FW_PARAM_PFVF(STAG_START); param[1] = FW_PARAM_PFVF(STAG_END); param[2] = FW_PARAM_PFVF(RQ_START); param[3] = FW_PARAM_PFVF(RQ_END); param[4] = FW_PARAM_PFVF(PBL_START); param[5] = FW_PARAM_PFVF(PBL_END); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 6, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query RDMA parameters(1): %d.\n", rc); return (rc); } sc->vres.stag.start = val[0]; sc->vres.stag.size = val[1] - val[0] + 1; sc->vres.rq.start = val[2]; sc->vres.rq.size = val[3] - val[2] + 1; sc->vres.pbl.start = val[4]; sc->vres.pbl.size = val[5] - val[4] + 1; param[0] = FW_PARAM_PFVF(SQRQ_START); param[1] = FW_PARAM_PFVF(SQRQ_END); param[2] = FW_PARAM_PFVF(CQ_START); param[3] = FW_PARAM_PFVF(CQ_END); param[4] = FW_PARAM_PFVF(OCQ_START); param[5] = FW_PARAM_PFVF(OCQ_END); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 6, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query RDMA parameters(2): %d.\n", rc); return (rc); } sc->vres.qp.start = val[0]; sc->vres.qp.size = val[1] - val[0] + 1; sc->vres.cq.start = val[2]; sc->vres.cq.size = val[3] - val[2] + 1; sc->vres.ocq.start = val[4]; sc->vres.ocq.size = val[5] - val[4] + 1; } if (sc->iscsicaps) { param[0] = FW_PARAM_PFVF(ISCSI_START); param[1] = FW_PARAM_PFVF(ISCSI_END); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 2, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query iSCSI parameters: %d.\n", rc); return (rc); } sc->vres.iscsi.start = val[0]; sc->vres.iscsi.size = val[1] - val[0] + 1; } /* * We've got the params we wanted to query via the firmware. Now grab * some others directly from the chip. */ rc = t4_read_chip_settings(sc); return (rc); } static int set_params__post_init(struct adapter *sc) { uint32_t param, val; /* ask for encapsulated CPLs */ param = FW_PARAM_PFVF(CPLFW4MSG_ENCAP); val = 1; (void)t4_set_params(sc, sc->mbox, sc->pf, 0, 1, ¶m, &val); return (0); } #undef FW_PARAM_PFVF #undef FW_PARAM_DEV static void t4_set_desc(struct adapter *sc) { char buf[128]; struct adapter_params *p = &sc->params; snprintf(buf, sizeof(buf), "Chelsio %s %sNIC (rev %d), S/N:%s, " "P/N:%s, E/C:%s", p->vpd.id, is_offload(sc) ? "R" : "", chip_rev(sc), p->vpd.sn, p->vpd.pn, p->vpd.ec); device_set_desc_copy(sc->dev, buf); } static void build_medialist(struct port_info *pi, struct ifmedia *media) { int data, m; PORT_LOCK(pi); ifmedia_removeall(media); m = IFM_ETHER | IFM_FDX; data = (pi->port_type << 8) | pi->mod_type; switch(pi->port_type) { case FW_PORT_TYPE_BT_XFI: ifmedia_add(media, m | IFM_10G_T, data, NULL); break; case FW_PORT_TYPE_BT_XAUI: ifmedia_add(media, m | IFM_10G_T, data, NULL); /* fall through */ case FW_PORT_TYPE_BT_SGMII: ifmedia_add(media, m | IFM_1000_T, data, NULL); ifmedia_add(media, m | IFM_100_TX, data, NULL); ifmedia_add(media, IFM_ETHER | IFM_AUTO, data, NULL); ifmedia_set(media, IFM_ETHER | IFM_AUTO); break; case FW_PORT_TYPE_CX4: ifmedia_add(media, m | IFM_10G_CX4, data, NULL); ifmedia_set(media, m | IFM_10G_CX4); break; case FW_PORT_TYPE_QSFP_10G: case FW_PORT_TYPE_SFP: case FW_PORT_TYPE_FIBER_XFI: case FW_PORT_TYPE_FIBER_XAUI: switch (pi->mod_type) { case FW_PORT_MOD_TYPE_LR: ifmedia_add(media, m | IFM_10G_LR, data, NULL); ifmedia_set(media, m | IFM_10G_LR); break; case FW_PORT_MOD_TYPE_SR: ifmedia_add(media, m | IFM_10G_SR, data, NULL); ifmedia_set(media, m | IFM_10G_SR); break; case FW_PORT_MOD_TYPE_LRM: ifmedia_add(media, m | IFM_10G_LRM, data, NULL); ifmedia_set(media, m | IFM_10G_LRM); break; case FW_PORT_MOD_TYPE_TWINAX_PASSIVE: case FW_PORT_MOD_TYPE_TWINAX_ACTIVE: ifmedia_add(media, m | IFM_10G_TWINAX, data, NULL); ifmedia_set(media, m | IFM_10G_TWINAX); break; case FW_PORT_MOD_TYPE_NONE: m &= ~IFM_FDX; ifmedia_add(media, m | IFM_NONE, data, NULL); ifmedia_set(media, m | IFM_NONE); break; case FW_PORT_MOD_TYPE_NA: case FW_PORT_MOD_TYPE_ER: default: device_printf(pi->dev, "unknown port_type (%d), mod_type (%d)\n", pi->port_type, pi->mod_type); ifmedia_add(media, m | IFM_UNKNOWN, data, NULL); ifmedia_set(media, m | IFM_UNKNOWN); break; } break; case FW_PORT_TYPE_QSFP: switch (pi->mod_type) { case FW_PORT_MOD_TYPE_LR: ifmedia_add(media, m | IFM_40G_LR4, data, NULL); ifmedia_set(media, m | IFM_40G_LR4); break; case FW_PORT_MOD_TYPE_SR: ifmedia_add(media, m | IFM_40G_SR4, data, NULL); ifmedia_set(media, m | IFM_40G_SR4); break; case FW_PORT_MOD_TYPE_TWINAX_PASSIVE: case FW_PORT_MOD_TYPE_TWINAX_ACTIVE: ifmedia_add(media, m | IFM_40G_CR4, data, NULL); ifmedia_set(media, m | IFM_40G_CR4); break; case FW_PORT_MOD_TYPE_NONE: m &= ~IFM_FDX; ifmedia_add(media, m | IFM_NONE, data, NULL); ifmedia_set(media, m | IFM_NONE); break; default: device_printf(pi->dev, "unknown port_type (%d), mod_type (%d)\n", pi->port_type, pi->mod_type); ifmedia_add(media, m | IFM_UNKNOWN, data, NULL); ifmedia_set(media, m | IFM_UNKNOWN); break; } break; default: device_printf(pi->dev, "unknown port_type (%d), mod_type (%d)\n", pi->port_type, pi->mod_type); ifmedia_add(media, m | IFM_UNKNOWN, data, NULL); ifmedia_set(media, m | IFM_UNKNOWN); break; } PORT_UNLOCK(pi); } #define FW_MAC_EXACT_CHUNK 7 /* * Program the port's XGMAC based on parameters in ifnet. The caller also * indicates which parameters should be programmed (the rest are left alone). */ int update_mac_settings(struct ifnet *ifp, int flags) { int rc = 0; struct port_info *pi = ifp->if_softc; struct adapter *sc = pi->adapter; int mtu = -1, promisc = -1, allmulti = -1, vlanex = -1; uint16_t viid = 0xffff; int16_t *xact_addr_filt = NULL; ASSERT_SYNCHRONIZED_OP(sc); KASSERT(flags, ("%s: not told what to update.", __func__)); if (ifp == pi->ifp) { viid = pi->viid; xact_addr_filt = &pi->xact_addr_filt; } #ifdef DEV_NETMAP else if (ifp == pi->nm_ifp) { viid = pi->nm_viid; xact_addr_filt = &pi->nm_xact_addr_filt; } #endif if (flags & XGMAC_MTU) mtu = ifp->if_mtu; if (flags & XGMAC_PROMISC) promisc = ifp->if_flags & IFF_PROMISC ? 1 : 0; if (flags & XGMAC_ALLMULTI) allmulti = ifp->if_flags & IFF_ALLMULTI ? 1 : 0; if (flags & XGMAC_VLANEX) vlanex = ifp->if_capenable & IFCAP_VLAN_HWTAGGING ? 1 : 0; if (flags & (XGMAC_MTU|XGMAC_PROMISC|XGMAC_ALLMULTI|XGMAC_VLANEX)) { rc = -t4_set_rxmode(sc, sc->mbox, viid, mtu, promisc, allmulti, 1, vlanex, false); if (rc) { if_printf(ifp, "set_rxmode (%x) failed: %d\n", flags, rc); return (rc); } } if (flags & XGMAC_UCADDR) { uint8_t ucaddr[ETHER_ADDR_LEN]; bcopy(IF_LLADDR(ifp), ucaddr, sizeof(ucaddr)); rc = t4_change_mac(sc, sc->mbox, viid, *xact_addr_filt, ucaddr, true, true); if (rc < 0) { rc = -rc; if_printf(ifp, "change_mac failed: %d\n", rc); return (rc); } else { *xact_addr_filt = rc; rc = 0; } } if (flags & XGMAC_MCADDRS) { const uint8_t *mcaddr[FW_MAC_EXACT_CHUNK]; int del = 1; uint64_t hash = 0; struct ifmultiaddr *ifma; int i = 0, j; if_maddr_rlock(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; mcaddr[i++] = LLADDR((struct sockaddr_dl *)ifma->ifma_addr); if (i == FW_MAC_EXACT_CHUNK) { rc = t4_alloc_mac_filt(sc, sc->mbox, viid, del, i, mcaddr, NULL, &hash, 0); if (rc < 0) { rc = -rc; for (j = 0; j < i; j++) { if_printf(ifp, "failed to add mc address" " %02x:%02x:%02x:" "%02x:%02x:%02x rc=%d\n", mcaddr[j][0], mcaddr[j][1], mcaddr[j][2], mcaddr[j][3], mcaddr[j][4], mcaddr[j][5], rc); } goto mcfail; } del = 0; i = 0; } } if (i > 0) { rc = t4_alloc_mac_filt(sc, sc->mbox, viid, del, i, mcaddr, NULL, &hash, 0); if (rc < 0) { rc = -rc; for (j = 0; j < i; j++) { if_printf(ifp, "failed to add mc address" " %02x:%02x:%02x:" "%02x:%02x:%02x rc=%d\n", mcaddr[j][0], mcaddr[j][1], mcaddr[j][2], mcaddr[j][3], mcaddr[j][4], mcaddr[j][5], rc); } goto mcfail; } } rc = -t4_set_addr_hash(sc, sc->mbox, viid, 0, hash, 0); if (rc != 0) if_printf(ifp, "failed to set mc address hash: %d", rc); mcfail: if_maddr_runlock(ifp); } return (rc); } int begin_synchronized_op(struct adapter *sc, struct port_info *pi, int flags, char *wmesg) { int rc, pri; #ifdef WITNESS /* the caller thinks it's ok to sleep, but is it really? */ if (flags & SLEEP_OK) pause("t4slptst", 1); #endif if (INTR_OK) pri = PCATCH; else pri = 0; ADAPTER_LOCK(sc); for (;;) { if (pi && IS_DOOMED(pi)) { rc = ENXIO; goto done; } if (!IS_BUSY(sc)) { rc = 0; break; } if (!(flags & SLEEP_OK)) { rc = EBUSY; goto done; } if (mtx_sleep(&sc->flags, &sc->sc_lock, pri, wmesg, 0)) { rc = EINTR; goto done; } } KASSERT(!IS_BUSY(sc), ("%s: controller busy.", __func__)); SET_BUSY(sc); #ifdef INVARIANTS sc->last_op = wmesg; sc->last_op_thr = curthread; #endif done: if (!(flags & HOLD_LOCK) || rc) ADAPTER_UNLOCK(sc); return (rc); } void end_synchronized_op(struct adapter *sc, int flags) { if (flags & LOCK_HELD) ADAPTER_LOCK_ASSERT_OWNED(sc); else ADAPTER_LOCK(sc); KASSERT(IS_BUSY(sc), ("%s: controller not busy.", __func__)); CLR_BUSY(sc); wakeup(&sc->flags); ADAPTER_UNLOCK(sc); } static int cxgbe_init_synchronized(struct port_info *pi) { struct adapter *sc = pi->adapter; struct ifnet *ifp = pi->ifp; int rc = 0; ASSERT_SYNCHRONIZED_OP(sc); if (isset(&sc->open_device_map, pi->port_id)) { KASSERT(ifp->if_drv_flags & IFF_DRV_RUNNING, ("mismatch between open_device_map and if_drv_flags")); return (0); /* already running */ } if (!(sc->flags & FULL_INIT_DONE) && ((rc = adapter_full_init(sc)) != 0)) return (rc); /* error message displayed already */ if (!(pi->flags & PORT_INIT_DONE) && ((rc = port_full_init(pi)) != 0)) return (rc); /* error message displayed already */ rc = update_mac_settings(ifp, XGMAC_ALL); if (rc) goto done; /* error message displayed already */ rc = -t4_enable_vi(sc, sc->mbox, pi->viid, true, true); if (rc != 0) { if_printf(ifp, "enable_vi failed: %d\n", rc); goto done; } /* * The first iq of the first port to come up is used for tracing. */ if (sc->traceq < 0) { sc->traceq = sc->sge.rxq[pi->first_rxq].iq.abs_id; t4_write_reg(sc, is_t4(sc) ? A_MPS_TRC_RSS_CONTROL : A_MPS_T5_TRC_RSS_CONTROL, V_RSSCONTROL(pi->tx_chan) | V_QUEUENUMBER(sc->traceq)); pi->flags |= HAS_TRACEQ; } /* all ok */ setbit(&sc->open_device_map, pi->port_id); PORT_LOCK(pi); ifp->if_drv_flags |= IFF_DRV_RUNNING; PORT_UNLOCK(pi); callout_reset(&pi->tick, hz, cxgbe_tick, pi); done: if (rc != 0) cxgbe_uninit_synchronized(pi); return (rc); } /* * Idempotent. */ static int cxgbe_uninit_synchronized(struct port_info *pi) { struct adapter *sc = pi->adapter; struct ifnet *ifp = pi->ifp; int rc; ASSERT_SYNCHRONIZED_OP(sc); /* * Disable the VI so that all its data in either direction is discarded * by the MPS. Leave everything else (the queues, interrupts, and 1Hz * tick) intact as the TP can deliver negative advice or data that it's * holding in its RAM (for an offloaded connection) even after the VI is * disabled. */ rc = -t4_enable_vi(sc, sc->mbox, pi->viid, false, false); if (rc) { if_printf(ifp, "disable_vi failed: %d\n", rc); return (rc); } clrbit(&sc->open_device_map, pi->port_id); PORT_LOCK(pi); ifp->if_drv_flags &= ~IFF_DRV_RUNNING; PORT_UNLOCK(pi); pi->link_cfg.link_ok = 0; pi->link_cfg.speed = 0; pi->linkdnrc = -1; t4_os_link_changed(sc, pi->port_id, 0, -1); return (0); } /* * It is ok for this function to fail midway and return right away. t4_detach * will walk the entire sc->irq list and clean up whatever is valid. */ static int setup_intr_handlers(struct adapter *sc) { int rc, rid, p, q; char s[8]; struct irq *irq; struct port_info *pi; struct sge_rxq *rxq; #ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; #endif #ifdef DEV_NETMAP struct sge_nm_rxq *nm_rxq; #endif /* * Setup interrupts. */ irq = &sc->irq[0]; rid = sc->intr_type == INTR_INTX ? 0 : 1; if (sc->intr_count == 1) return (t4_alloc_irq(sc, irq, rid, t4_intr_all, sc, "all")); /* Multiple interrupts. */ KASSERT(sc->intr_count >= T4_EXTRA_INTR + sc->params.nports, ("%s: too few intr.", __func__)); /* The first one is always error intr */ rc = t4_alloc_irq(sc, irq, rid, t4_intr_err, sc, "err"); if (rc != 0) return (rc); irq++; rid++; /* The second one is always the firmware event queue */ rc = t4_alloc_irq(sc, irq, rid, t4_intr_evt, &sc->sge.fwq, "evt"); if (rc != 0) return (rc); irq++; rid++; for_each_port(sc, p) { pi = sc->port[p]; if (pi->flags & INTR_RXQ) { for_each_rxq(pi, q, rxq) { snprintf(s, sizeof(s), "%d.%d", p, q); rc = t4_alloc_irq(sc, irq, rid, t4_intr, rxq, s); if (rc != 0) return (rc); irq++; rid++; } } #ifdef TCP_OFFLOAD if (pi->flags & INTR_OFLD_RXQ) { for_each_ofld_rxq(pi, q, ofld_rxq) { snprintf(s, sizeof(s), "%d,%d", p, q); rc = t4_alloc_irq(sc, irq, rid, t4_intr, ofld_rxq, s); if (rc != 0) return (rc); irq++; rid++; } } #endif #ifdef DEV_NETMAP if (pi->flags & INTR_NM_RXQ) { for_each_nm_rxq(pi, q, nm_rxq) { snprintf(s, sizeof(s), "%d-%d", p, q); rc = t4_alloc_irq(sc, irq, rid, t4_nm_intr, nm_rxq, s); if (rc != 0) return (rc); irq++; rid++; } } #endif } MPASS(irq == &sc->irq[sc->intr_count]); return (0); } int adapter_full_init(struct adapter *sc) { int rc, i; ADAPTER_LOCK_ASSERT_NOTOWNED(sc); KASSERT((sc->flags & FULL_INIT_DONE) == 0, ("%s: FULL_INIT_DONE already", __func__)); /* * queues that belong to the adapter (not any particular port). */ rc = t4_setup_adapter_queues(sc); if (rc != 0) goto done; for (i = 0; i < nitems(sc->tq); i++) { sc->tq[i] = taskqueue_create("t4 taskq", M_NOWAIT, taskqueue_thread_enqueue, &sc->tq[i]); if (sc->tq[i] == NULL) { device_printf(sc->dev, "failed to allocate task queue %d\n", i); rc = ENOMEM; goto done; } taskqueue_start_threads(&sc->tq[i], 1, PI_NET, "%s tq%d", device_get_nameunit(sc->dev), i); } t4_intr_enable(sc); sc->flags |= FULL_INIT_DONE; done: if (rc != 0) adapter_full_uninit(sc); return (rc); } int adapter_full_uninit(struct adapter *sc) { int i; ADAPTER_LOCK_ASSERT_NOTOWNED(sc); t4_teardown_adapter_queues(sc); for (i = 0; i < nitems(sc->tq) && sc->tq[i]; i++) { taskqueue_free(sc->tq[i]); sc->tq[i] = NULL; } sc->flags &= ~FULL_INIT_DONE; return (0); } int port_full_init(struct port_info *pi) { struct adapter *sc = pi->adapter; struct ifnet *ifp = pi->ifp; uint16_t *rss; struct sge_rxq *rxq; int rc, i, j; ASSERT_SYNCHRONIZED_OP(sc); KASSERT((pi->flags & PORT_INIT_DONE) == 0, ("%s: PORT_INIT_DONE already", __func__)); sysctl_ctx_init(&pi->ctx); pi->flags |= PORT_SYSCTL_CTX; /* * Allocate tx/rx/fl queues for this port. */ rc = t4_setup_port_queues(pi); if (rc != 0) goto done; /* error message displayed already */ /* * Setup RSS for this port. Save a copy of the RSS table for later use. */ rss = malloc(pi->rss_size * sizeof (*rss), M_CXGBE, M_ZERO | M_WAITOK); for (i = 0; i < pi->rss_size;) { for_each_rxq(pi, j, rxq) { rss[i++] = rxq->iq.abs_id; if (i == pi->rss_size) break; } } rc = -t4_config_rss_range(sc, sc->mbox, pi->viid, 0, pi->rss_size, rss, pi->rss_size); if (rc != 0) { if_printf(ifp, "rss_config failed: %d\n", rc); goto done; } pi->rss = rss; pi->flags |= PORT_INIT_DONE; done: if (rc != 0) port_full_uninit(pi); return (rc); } /* * Idempotent. */ int port_full_uninit(struct port_info *pi) { struct adapter *sc = pi->adapter; int i; struct sge_rxq *rxq; struct sge_txq *txq; #ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; struct sge_wrq *ofld_txq; #endif if (pi->flags & PORT_INIT_DONE) { /* Need to quiesce queues. XXX: ctrl queues? */ for_each_txq(pi, i, txq) { quiesce_eq(sc, &txq->eq); } #ifdef TCP_OFFLOAD for_each_ofld_txq(pi, i, ofld_txq) { quiesce_eq(sc, &ofld_txq->eq); } #endif for_each_rxq(pi, i, rxq) { quiesce_iq(sc, &rxq->iq); quiesce_fl(sc, &rxq->fl); } #ifdef TCP_OFFLOAD for_each_ofld_rxq(pi, i, ofld_rxq) { quiesce_iq(sc, &ofld_rxq->iq); quiesce_fl(sc, &ofld_rxq->fl); } #endif free(pi->rss, M_CXGBE); } t4_teardown_port_queues(pi); pi->flags &= ~PORT_INIT_DONE; return (0); } static void quiesce_eq(struct adapter *sc, struct sge_eq *eq) { EQ_LOCK(eq); eq->flags |= EQ_DOOMED; /* * Wait for the response to a credit flush if one's * pending. */ while (eq->flags & EQ_CRFLUSHED) mtx_sleep(eq, &eq->eq_lock, 0, "crflush", 0); EQ_UNLOCK(eq); callout_drain(&eq->tx_callout); /* XXX: iffy */ pause("callout", 10); /* Still iffy */ taskqueue_drain(sc->tq[eq->tx_chan], &eq->tx_task); } static void quiesce_iq(struct adapter *sc, struct sge_iq *iq) { (void) sc; /* unused */ /* Synchronize with the interrupt handler */ while (!atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_DISABLED)) pause("iqfree", 1); } static void quiesce_fl(struct adapter *sc, struct sge_fl *fl) { mtx_lock(&sc->sfl_lock); FL_LOCK(fl); fl->flags |= FL_DOOMED; FL_UNLOCK(fl); mtx_unlock(&sc->sfl_lock); callout_drain(&sc->sfl_callout); KASSERT((fl->flags & FL_STARVING) == 0, ("%s: still starving", __func__)); } static int t4_alloc_irq(struct adapter *sc, struct irq *irq, int rid, driver_intr_t *handler, void *arg, char *name) { int rc; irq->rid = rid; irq->res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ, &irq->rid, RF_SHAREABLE | RF_ACTIVE); if (irq->res == NULL) { device_printf(sc->dev, "failed to allocate IRQ for rid %d, name %s.\n", rid, name); return (ENOMEM); } rc = bus_setup_intr(sc->dev, irq->res, INTR_MPSAFE | INTR_TYPE_NET, NULL, handler, arg, &irq->tag); if (rc != 0) { device_printf(sc->dev, "failed to setup interrupt for rid %d, name %s: %d\n", rid, name, rc); } else if (name) bus_describe_intr(sc->dev, irq->res, irq->tag, name); return (rc); } static int t4_free_irq(struct adapter *sc, struct irq *irq) { if (irq->tag) bus_teardown_intr(sc->dev, irq->res, irq->tag); if (irq->res) bus_release_resource(sc->dev, SYS_RES_IRQ, irq->rid, irq->res); bzero(irq, sizeof(*irq)); return (0); } static void reg_block_dump(struct adapter *sc, uint8_t *buf, unsigned int start, unsigned int end) { uint32_t *p = (uint32_t *)(buf + start); for ( ; start <= end; start += sizeof(uint32_t)) *p++ = t4_read_reg(sc, start); } static void t4_get_regs(struct adapter *sc, struct t4_regdump *regs, uint8_t *buf) { int i, n; const unsigned int *reg_ranges; static const unsigned int t4_reg_ranges[] = { 0x1008, 0x1108, 0x1180, 0x11b4, 0x11fc, 0x123c, 0x1300, 0x173c, 0x1800, 0x18fc, 0x3000, 0x30d8, 0x30e0, 0x5924, 0x5960, 0x59d4, 0x5a00, 0x5af8, 0x6000, 0x6098, 0x6100, 0x6150, 0x6200, 0x6208, 0x6240, 0x6248, 0x6280, 0x6338, 0x6370, 0x638c, 0x6400, 0x643c, 0x6500, 0x6524, 0x6a00, 0x6a38, 0x6a60, 0x6a78, 0x6b00, 0x6b84, 0x6bf0, 0x6c84, 0x6cf0, 0x6d84, 0x6df0, 0x6e84, 0x6ef0, 0x6f84, 0x6ff0, 0x7084, 0x70f0, 0x7184, 0x71f0, 0x7284, 0x72f0, 0x7384, 0x73f0, 0x7450, 0x7500, 0x7530, 0x7600, 0x761c, 0x7680, 0x76cc, 0x7700, 0x7798, 0x77c0, 0x77fc, 0x7900, 0x79fc, 0x7b00, 0x7c38, 0x7d00, 0x7efc, 0x8dc0, 0x8e1c, 0x8e30, 0x8e78, 0x8ea0, 0x8f6c, 0x8fc0, 0x9074, 0x90fc, 0x90fc, 0x9400, 0x9458, 0x9600, 0x96bc, 0x9800, 0x9808, 0x9820, 0x983c, 0x9850, 0x9864, 0x9c00, 0x9c6c, 0x9c80, 0x9cec, 0x9d00, 0x9d6c, 0x9d80, 0x9dec, 0x9e00, 0x9e6c, 0x9e80, 0x9eec, 0x9f00, 0x9f6c, 0x9f80, 0x9fec, 0xd004, 0xd03c, 0xdfc0, 0xdfe0, 0xe000, 0xea7c, 0xf000, 0x11110, 0x11118, 0x11190, 0x19040, 0x1906c, 0x19078, 0x19080, 0x1908c, 0x19124, 0x19150, 0x191b0, 0x191d0, 0x191e8, 0x19238, 0x1924c, 0x193f8, 0x19474, 0x19490, 0x194f8, 0x19800, 0x19f30, 0x1a000, 0x1a06c, 0x1a0b0, 0x1a120, 0x1a128, 0x1a138, 0x1a190, 0x1a1c4, 0x1a1fc, 0x1a1fc, 0x1e040, 0x1e04c, 0x1e284, 0x1e28c, 0x1e2c0, 0x1e2c0, 0x1e2e0, 0x1e2e0, 0x1e300, 0x1e384, 0x1e3c0, 0x1e3c8, 0x1e440, 0x1e44c, 0x1e684, 0x1e68c, 0x1e6c0, 0x1e6c0, 0x1e6e0, 0x1e6e0, 0x1e700, 0x1e784, 0x1e7c0, 0x1e7c8, 0x1e840, 0x1e84c, 0x1ea84, 0x1ea8c, 0x1eac0, 0x1eac0, 0x1eae0, 0x1eae0, 0x1eb00, 0x1eb84, 0x1ebc0, 0x1ebc8, 0x1ec40, 0x1ec4c, 0x1ee84, 0x1ee8c, 0x1eec0, 0x1eec0, 0x1eee0, 0x1eee0, 0x1ef00, 0x1ef84, 0x1efc0, 0x1efc8, 0x1f040, 0x1f04c, 0x1f284, 0x1f28c, 0x1f2c0, 0x1f2c0, 0x1f2e0, 0x1f2e0, 0x1f300, 0x1f384, 0x1f3c0, 0x1f3c8, 0x1f440, 0x1f44c, 0x1f684, 0x1f68c, 0x1f6c0, 0x1f6c0, 0x1f6e0, 0x1f6e0, 0x1f700, 0x1f784, 0x1f7c0, 0x1f7c8, 0x1f840, 0x1f84c, 0x1fa84, 0x1fa8c, 0x1fac0, 0x1fac0, 0x1fae0, 0x1fae0, 0x1fb00, 0x1fb84, 0x1fbc0, 0x1fbc8, 0x1fc40, 0x1fc4c, 0x1fe84, 0x1fe8c, 0x1fec0, 0x1fec0, 0x1fee0, 0x1fee0, 0x1ff00, 0x1ff84, 0x1ffc0, 0x1ffc8, 0x20000, 0x2002c, 0x20100, 0x2013c, 0x20190, 0x201c8, 0x20200, 0x20318, 0x20400, 0x20528, 0x20540, 0x20614, 0x21000, 0x21040, 0x2104c, 0x21060, 0x210c0, 0x210ec, 0x21200, 0x21268, 0x21270, 0x21284, 0x212fc, 0x21388, 0x21400, 0x21404, 0x21500, 0x21518, 0x2152c, 0x2153c, 0x21550, 0x21554, 0x21600, 0x21600, 0x21608, 0x21628, 0x21630, 0x2163c, 0x21700, 0x2171c, 0x21780, 0x2178c, 0x21800, 0x21c38, 0x21c80, 0x21d7c, 0x21e00, 0x21e04, 0x22000, 0x2202c, 0x22100, 0x2213c, 0x22190, 0x221c8, 0x22200, 0x22318, 0x22400, 0x22528, 0x22540, 0x22614, 0x23000, 0x23040, 0x2304c, 0x23060, 0x230c0, 0x230ec, 0x23200, 0x23268, 0x23270, 0x23284, 0x232fc, 0x23388, 0x23400, 0x23404, 0x23500, 0x23518, 0x2352c, 0x2353c, 0x23550, 0x23554, 0x23600, 0x23600, 0x23608, 0x23628, 0x23630, 0x2363c, 0x23700, 0x2371c, 0x23780, 0x2378c, 0x23800, 0x23c38, 0x23c80, 0x23d7c, 0x23e00, 0x23e04, 0x24000, 0x2402c, 0x24100, 0x2413c, 0x24190, 0x241c8, 0x24200, 0x24318, 0x24400, 0x24528, 0x24540, 0x24614, 0x25000, 0x25040, 0x2504c, 0x25060, 0x250c0, 0x250ec, 0x25200, 0x25268, 0x25270, 0x25284, 0x252fc, 0x25388, 0x25400, 0x25404, 0x25500, 0x25518, 0x2552c, 0x2553c, 0x25550, 0x25554, 0x25600, 0x25600, 0x25608, 0x25628, 0x25630, 0x2563c, 0x25700, 0x2571c, 0x25780, 0x2578c, 0x25800, 0x25c38, 0x25c80, 0x25d7c, 0x25e00, 0x25e04, 0x26000, 0x2602c, 0x26100, 0x2613c, 0x26190, 0x261c8, 0x26200, 0x26318, 0x26400, 0x26528, 0x26540, 0x26614, 0x27000, 0x27040, 0x2704c, 0x27060, 0x270c0, 0x270ec, 0x27200, 0x27268, 0x27270, 0x27284, 0x272fc, 0x27388, 0x27400, 0x27404, 0x27500, 0x27518, 0x2752c, 0x2753c, 0x27550, 0x27554, 0x27600, 0x27600, 0x27608, 0x27628, 0x27630, 0x2763c, 0x27700, 0x2771c, 0x27780, 0x2778c, 0x27800, 0x27c38, 0x27c80, 0x27d7c, 0x27e00, 0x27e04 }; static const unsigned int t5_reg_ranges[] = { 0x1008, 0x1148, 0x1180, 0x11b4, 0x11fc, 0x123c, 0x1280, 0x173c, 0x1800, 0x18fc, 0x3000, 0x3028, 0x3060, 0x30d8, 0x30e0, 0x30fc, 0x3140, 0x357c, 0x35a8, 0x35cc, 0x35ec, 0x35ec, 0x3600, 0x5624, 0x56cc, 0x575c, 0x580c, 0x5814, 0x5890, 0x58bc, 0x5940, 0x59dc, 0x59fc, 0x5a18, 0x5a60, 0x5a9c, 0x5b94, 0x5bfc, 0x6000, 0x6040, 0x6058, 0x614c, 0x7700, 0x7798, 0x77c0, 0x78fc, 0x7b00, 0x7c54, 0x7d00, 0x7efc, 0x8dc0, 0x8de0, 0x8df8, 0x8e84, 0x8ea0, 0x8f84, 0x8fc0, 0x90f8, 0x9400, 0x9470, 0x9600, 0x96f4, 0x9800, 0x9808, 0x9820, 0x983c, 0x9850, 0x9864, 0x9c00, 0x9c6c, 0x9c80, 0x9cec, 0x9d00, 0x9d6c, 0x9d80, 0x9dec, 0x9e00, 0x9e6c, 0x9e80, 0x9eec, 0x9f00, 0x9f6c, 0x9f80, 0xa020, 0xd004, 0xd03c, 0xdfc0, 0xdfe0, 0xe000, 0x11088, 0x1109c, 0x11110, 0x11118, 0x1117c, 0x11190, 0x11204, 0x19040, 0x1906c, 0x19078, 0x19080, 0x1908c, 0x19124, 0x19150, 0x191b0, 0x191d0, 0x191e8, 0x19238, 0x19290, 0x193f8, 0x19474, 0x19490, 0x194cc, 0x194f0, 0x194f8, 0x19c00, 0x19c60, 0x19c94, 0x19e10, 0x19e50, 0x19f34, 0x19f40, 0x19f50, 0x19f90, 0x19fe4, 0x1a000, 0x1a06c, 0x1a0b0, 0x1a120, 0x1a128, 0x1a138, 0x1a190, 0x1a1c4, 0x1a1fc, 0x1a1fc, 0x1e008, 0x1e00c, 0x1e040, 0x1e04c, 0x1e284, 0x1e290, 0x1e2c0, 0x1e2c0, 0x1e2e0, 0x1e2e0, 0x1e300, 0x1e384, 0x1e3c0, 0x1e3c8, 0x1e408, 0x1e40c, 0x1e440, 0x1e44c, 0x1e684, 0x1e690, 0x1e6c0, 0x1e6c0, 0x1e6e0, 0x1e6e0, 0x1e700, 0x1e784, 0x1e7c0, 0x1e7c8, 0x1e808, 0x1e80c, 0x1e840, 0x1e84c, 0x1ea84, 0x1ea90, 0x1eac0, 0x1eac0, 0x1eae0, 0x1eae0, 0x1eb00, 0x1eb84, 0x1ebc0, 0x1ebc8, 0x1ec08, 0x1ec0c, 0x1ec40, 0x1ec4c, 0x1ee84, 0x1ee90, 0x1eec0, 0x1eec0, 0x1eee0, 0x1eee0, 0x1ef00, 0x1ef84, 0x1efc0, 0x1efc8, 0x1f008, 0x1f00c, 0x1f040, 0x1f04c, 0x1f284, 0x1f290, 0x1f2c0, 0x1f2c0, 0x1f2e0, 0x1f2e0, 0x1f300, 0x1f384, 0x1f3c0, 0x1f3c8, 0x1f408, 0x1f40c, 0x1f440, 0x1f44c, 0x1f684, 0x1f690, 0x1f6c0, 0x1f6c0, 0x1f6e0, 0x1f6e0, 0x1f700, 0x1f784, 0x1f7c0, 0x1f7c8, 0x1f808, 0x1f80c, 0x1f840, 0x1f84c, 0x1fa84, 0x1fa90, 0x1fac0, 0x1fac0, 0x1fae0, 0x1fae0, 0x1fb00, 0x1fb84, 0x1fbc0, 0x1fbc8, 0x1fc08, 0x1fc0c, 0x1fc40, 0x1fc4c, 0x1fe84, 0x1fe90, 0x1fec0, 0x1fec0, 0x1fee0, 0x1fee0, 0x1ff00, 0x1ff84, 0x1ffc0, 0x1ffc8, 0x30000, 0x30030, 0x30100, 0x30144, 0x30190, 0x301d0, 0x30200, 0x30318, 0x30400, 0x3052c, 0x30540, 0x3061c, 0x30800, 0x30834, 0x308c0, 0x30908, 0x30910, 0x309ac, 0x30a00, 0x30a2c, 0x30a44, 0x30a50, 0x30a74, 0x30c24, 0x30d00, 0x30d00, 0x30d08, 0x30d14, 0x30d1c, 0x30d20, 0x30d3c, 0x30d50, 0x31200, 0x3120c, 0x31220, 0x31220, 0x31240, 0x31240, 0x31600, 0x3160c, 0x31a00, 0x31a1c, 0x31e00, 0x31e20, 0x31e38, 0x31e3c, 0x31e80, 0x31e80, 0x31e88, 0x31ea8, 0x31eb0, 0x31eb4, 0x31ec8, 0x31ed4, 0x31fb8, 0x32004, 0x32200, 0x32200, 0x32208, 0x32240, 0x32248, 0x32280, 0x32288, 0x322c0, 0x322c8, 0x322fc, 0x32600, 0x32630, 0x32a00, 0x32abc, 0x32b00, 0x32b70, 0x33000, 0x33048, 0x33060, 0x3309c, 0x330f0, 0x33148, 0x33160, 0x3319c, 0x331f0, 0x332e4, 0x332f8, 0x333e4, 0x333f8, 0x33448, 0x33460, 0x3349c, 0x334f0, 0x33548, 0x33560, 0x3359c, 0x335f0, 0x336e4, 0x336f8, 0x337e4, 0x337f8, 0x337fc, 0x33814, 0x33814, 0x3382c, 0x3382c, 0x33880, 0x3388c, 0x338e8, 0x338ec, 0x33900, 0x33948, 0x33960, 0x3399c, 0x339f0, 0x33ae4, 0x33af8, 0x33b10, 0x33b28, 0x33b28, 0x33b3c, 0x33b50, 0x33bf0, 0x33c10, 0x33c28, 0x33c28, 0x33c3c, 0x33c50, 0x33cf0, 0x33cfc, 0x34000, 0x34030, 0x34100, 0x34144, 0x34190, 0x341d0, 0x34200, 0x34318, 0x34400, 0x3452c, 0x34540, 0x3461c, 0x34800, 0x34834, 0x348c0, 0x34908, 0x34910, 0x349ac, 0x34a00, 0x34a2c, 0x34a44, 0x34a50, 0x34a74, 0x34c24, 0x34d00, 0x34d00, 0x34d08, 0x34d14, 0x34d1c, 0x34d20, 0x34d3c, 0x34d50, 0x35200, 0x3520c, 0x35220, 0x35220, 0x35240, 0x35240, 0x35600, 0x3560c, 0x35a00, 0x35a1c, 0x35e00, 0x35e20, 0x35e38, 0x35e3c, 0x35e80, 0x35e80, 0x35e88, 0x35ea8, 0x35eb0, 0x35eb4, 0x35ec8, 0x35ed4, 0x35fb8, 0x36004, 0x36200, 0x36200, 0x36208, 0x36240, 0x36248, 0x36280, 0x36288, 0x362c0, 0x362c8, 0x362fc, 0x36600, 0x36630, 0x36a00, 0x36abc, 0x36b00, 0x36b70, 0x37000, 0x37048, 0x37060, 0x3709c, 0x370f0, 0x37148, 0x37160, 0x3719c, 0x371f0, 0x372e4, 0x372f8, 0x373e4, 0x373f8, 0x37448, 0x37460, 0x3749c, 0x374f0, 0x37548, 0x37560, 0x3759c, 0x375f0, 0x376e4, 0x376f8, 0x377e4, 0x377f8, 0x377fc, 0x37814, 0x37814, 0x3782c, 0x3782c, 0x37880, 0x3788c, 0x378e8, 0x378ec, 0x37900, 0x37948, 0x37960, 0x3799c, 0x379f0, 0x37ae4, 0x37af8, 0x37b10, 0x37b28, 0x37b28, 0x37b3c, 0x37b50, 0x37bf0, 0x37c10, 0x37c28, 0x37c28, 0x37c3c, 0x37c50, 0x37cf0, 0x37cfc, 0x38000, 0x38030, 0x38100, 0x38144, 0x38190, 0x381d0, 0x38200, 0x38318, 0x38400, 0x3852c, 0x38540, 0x3861c, 0x38800, 0x38834, 0x388c0, 0x38908, 0x38910, 0x389ac, 0x38a00, 0x38a2c, 0x38a44, 0x38a50, 0x38a74, 0x38c24, 0x38d00, 0x38d00, 0x38d08, 0x38d14, 0x38d1c, 0x38d20, 0x38d3c, 0x38d50, 0x39200, 0x3920c, 0x39220, 0x39220, 0x39240, 0x39240, 0x39600, 0x3960c, 0x39a00, 0x39a1c, 0x39e00, 0x39e20, 0x39e38, 0x39e3c, 0x39e80, 0x39e80, 0x39e88, 0x39ea8, 0x39eb0, 0x39eb4, 0x39ec8, 0x39ed4, 0x39fb8, 0x3a004, 0x3a200, 0x3a200, 0x3a208, 0x3a240, 0x3a248, 0x3a280, 0x3a288, 0x3a2c0, 0x3a2c8, 0x3a2fc, 0x3a600, 0x3a630, 0x3aa00, 0x3aabc, 0x3ab00, 0x3ab70, 0x3b000, 0x3b048, 0x3b060, 0x3b09c, 0x3b0f0, 0x3b148, 0x3b160, 0x3b19c, 0x3b1f0, 0x3b2e4, 0x3b2f8, 0x3b3e4, 0x3b3f8, 0x3b448, 0x3b460, 0x3b49c, 0x3b4f0, 0x3b548, 0x3b560, 0x3b59c, 0x3b5f0, 0x3b6e4, 0x3b6f8, 0x3b7e4, 0x3b7f8, 0x3b7fc, 0x3b814, 0x3b814, 0x3b82c, 0x3b82c, 0x3b880, 0x3b88c, 0x3b8e8, 0x3b8ec, 0x3b900, 0x3b948, 0x3b960, 0x3b99c, 0x3b9f0, 0x3bae4, 0x3baf8, 0x3bb10, 0x3bb28, 0x3bb28, 0x3bb3c, 0x3bb50, 0x3bbf0, 0x3bc10, 0x3bc28, 0x3bc28, 0x3bc3c, 0x3bc50, 0x3bcf0, 0x3bcfc, 0x3c000, 0x3c030, 0x3c100, 0x3c144, 0x3c190, 0x3c1d0, 0x3c200, 0x3c318, 0x3c400, 0x3c52c, 0x3c540, 0x3c61c, 0x3c800, 0x3c834, 0x3c8c0, 0x3c908, 0x3c910, 0x3c9ac, 0x3ca00, 0x3ca2c, 0x3ca44, 0x3ca50, 0x3ca74, 0x3cc24, 0x3cd00, 0x3cd00, 0x3cd08, 0x3cd14, 0x3cd1c, 0x3cd20, 0x3cd3c, 0x3cd50, 0x3d200, 0x3d20c, 0x3d220, 0x3d220, 0x3d240, 0x3d240, 0x3d600, 0x3d60c, 0x3da00, 0x3da1c, 0x3de00, 0x3de20, 0x3de38, 0x3de3c, 0x3de80, 0x3de80, 0x3de88, 0x3dea8, 0x3deb0, 0x3deb4, 0x3dec8, 0x3ded4, 0x3dfb8, 0x3e004, 0x3e200, 0x3e200, 0x3e208, 0x3e240, 0x3e248, 0x3e280, 0x3e288, 0x3e2c0, 0x3e2c8, 0x3e2fc, 0x3e600, 0x3e630, 0x3ea00, 0x3eabc, 0x3eb00, 0x3eb70, 0x3f000, 0x3f048, 0x3f060, 0x3f09c, 0x3f0f0, 0x3f148, 0x3f160, 0x3f19c, 0x3f1f0, 0x3f2e4, 0x3f2f8, 0x3f3e4, 0x3f3f8, 0x3f448, 0x3f460, 0x3f49c, 0x3f4f0, 0x3f548, 0x3f560, 0x3f59c, 0x3f5f0, 0x3f6e4, 0x3f6f8, 0x3f7e4, 0x3f7f8, 0x3f7fc, 0x3f814, 0x3f814, 0x3f82c, 0x3f82c, 0x3f880, 0x3f88c, 0x3f8e8, 0x3f8ec, 0x3f900, 0x3f948, 0x3f960, 0x3f99c, 0x3f9f0, 0x3fae4, 0x3faf8, 0x3fb10, 0x3fb28, 0x3fb28, 0x3fb3c, 0x3fb50, 0x3fbf0, 0x3fc10, 0x3fc28, 0x3fc28, 0x3fc3c, 0x3fc50, 0x3fcf0, 0x3fcfc, 0x40000, 0x4000c, 0x40040, 0x40068, 0x4007c, 0x40144, 0x40180, 0x4018c, 0x40200, 0x40298, 0x402ac, 0x4033c, 0x403f8, 0x403fc, 0x41304, 0x413c4, 0x41400, 0x4141c, 0x41480, 0x414d0, 0x44000, 0x44078, 0x440c0, 0x44278, 0x442c0, 0x44478, 0x444c0, 0x44678, 0x446c0, 0x44878, 0x448c0, 0x449fc, 0x45000, 0x45068, 0x45080, 0x45084, 0x450a0, 0x450b0, 0x45200, 0x45268, 0x45280, 0x45284, 0x452a0, 0x452b0, 0x460c0, 0x460e4, 0x47000, 0x4708c, 0x47200, 0x47250, 0x47400, 0x47420, 0x47600, 0x47618, 0x47800, 0x47814, 0x48000, 0x4800c, 0x48040, 0x48068, 0x4807c, 0x48144, 0x48180, 0x4818c, 0x48200, 0x48298, 0x482ac, 0x4833c, 0x483f8, 0x483fc, 0x49304, 0x493c4, 0x49400, 0x4941c, 0x49480, 0x494d0, 0x4c000, 0x4c078, 0x4c0c0, 0x4c278, 0x4c2c0, 0x4c478, 0x4c4c0, 0x4c678, 0x4c6c0, 0x4c878, 0x4c8c0, 0x4c9fc, 0x4d000, 0x4d068, 0x4d080, 0x4d084, 0x4d0a0, 0x4d0b0, 0x4d200, 0x4d268, 0x4d280, 0x4d284, 0x4d2a0, 0x4d2b0, 0x4e0c0, 0x4e0e4, 0x4f000, 0x4f08c, 0x4f200, 0x4f250, 0x4f400, 0x4f420, 0x4f600, 0x4f618, 0x4f800, 0x4f814, 0x50000, 0x500cc, 0x50400, 0x50400, 0x50800, 0x508cc, 0x50c00, 0x50c00, 0x51000, 0x5101c, 0x51300, 0x51308, }; if (is_t4(sc)) { reg_ranges = &t4_reg_ranges[0]; n = nitems(t4_reg_ranges); } else { reg_ranges = &t5_reg_ranges[0]; n = nitems(t5_reg_ranges); } regs->version = chip_id(sc) | chip_rev(sc) << 10; for (i = 0; i < n; i += 2) reg_block_dump(sc, buf, reg_ranges[i], reg_ranges[i + 1]); } static void cxgbe_tick(void *arg) { struct port_info *pi = arg; struct adapter *sc = pi->adapter; struct ifnet *ifp = pi->ifp; struct sge_txq *txq; int i, drops; struct port_stats *s = &pi->stats; PORT_LOCK(pi); if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) { PORT_UNLOCK(pi); return; /* without scheduling another callout */ } t4_get_port_stats(sc, pi->tx_chan, s); ifp->if_opackets = s->tx_frames - s->tx_pause; ifp->if_ipackets = s->rx_frames - s->rx_pause; ifp->if_obytes = s->tx_octets - s->tx_pause * 64; ifp->if_ibytes = s->rx_octets - s->rx_pause * 64; ifp->if_omcasts = s->tx_mcast_frames - s->tx_pause; ifp->if_imcasts = s->rx_mcast_frames - s->rx_pause; ifp->if_iqdrops = s->rx_ovflow0 + s->rx_ovflow1 + s->rx_ovflow2 + s->rx_ovflow3 + s->rx_trunc0 + s->rx_trunc1 + s->rx_trunc2 + s->rx_trunc3; for (i = 0; i < 4; i++) { if (pi->rx_chan_map & (1 << i)) { uint32_t v; /* * XXX: indirect reads from the same ADDR/DATA pair can * race with each other. */ t4_read_indirect(sc, A_TP_MIB_INDEX, A_TP_MIB_DATA, &v, 1, A_TP_MIB_TNL_CNG_DROP_0 + i); ifp->if_iqdrops += v; } } drops = s->tx_drop; for_each_txq(pi, i, txq) drops += txq->br->br_drops; ifp->if_oqdrops = drops; ifp->if_oerrors = s->tx_error_frames; ifp->if_ierrors = s->rx_jabber + s->rx_runt + s->rx_too_long + s->rx_fcs_err + s->rx_len_err; callout_schedule(&pi->tick, hz); PORT_UNLOCK(pi); } static void cxgbe_vlan_config(void *arg, struct ifnet *ifp, uint16_t vid) { struct ifnet *vlan; if (arg != ifp || ifp->if_type != IFT_ETHER) return; vlan = VLAN_DEVAT(ifp, vid); VLAN_SETCOOKIE(vlan, ifp); } static int cpl_not_handled(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { #ifdef INVARIANTS panic("%s: opcode 0x%02x on iq %p with payload %p", __func__, rss->opcode, iq, m); #else log(LOG_ERR, "%s: opcode 0x%02x on iq %p with payload %p\n", __func__, rss->opcode, iq, m); m_freem(m); #endif return (EDOOFUS); } int t4_register_cpl_handler(struct adapter *sc, int opcode, cpl_handler_t h) { uintptr_t *loc, new; if (opcode >= nitems(sc->cpl_handler)) return (EINVAL); new = h ? (uintptr_t)h : (uintptr_t)cpl_not_handled; loc = (uintptr_t *) &sc->cpl_handler[opcode]; atomic_store_rel_ptr(loc, new); return (0); } static int an_not_handled(struct sge_iq *iq, const struct rsp_ctrl *ctrl) { #ifdef INVARIANTS panic("%s: async notification on iq %p (ctrl %p)", __func__, iq, ctrl); #else log(LOG_ERR, "%s: async notification on iq %p (ctrl %p)\n", __func__, iq, ctrl); #endif return (EDOOFUS); } int t4_register_an_handler(struct adapter *sc, an_handler_t h) { uintptr_t *loc, new; new = h ? (uintptr_t)h : (uintptr_t)an_not_handled; loc = (uintptr_t *) &sc->an_handler; atomic_store_rel_ptr(loc, new); return (0); } static int fw_msg_not_handled(struct adapter *sc, const __be64 *rpl) { const struct cpl_fw6_msg *cpl = __containerof(rpl, struct cpl_fw6_msg, data[0]); #ifdef INVARIANTS panic("%s: fw_msg type %d", __func__, cpl->type); #else log(LOG_ERR, "%s: fw_msg type %d\n", __func__, cpl->type); #endif return (EDOOFUS); } int t4_register_fw_msg_handler(struct adapter *sc, int type, fw_msg_handler_t h) { uintptr_t *loc, new; if (type >= nitems(sc->fw_msg_handler)) return (EINVAL); /* * These are dispatched by the handler for FW{4|6}_CPL_MSG using the CPL * handler dispatch table. Reject any attempt to install a handler for * this subtype. */ if (type == FW_TYPE_RSSCPL || type == FW6_TYPE_RSSCPL) return (EINVAL); new = h ? (uintptr_t)h : (uintptr_t)fw_msg_not_handled; loc = (uintptr_t *) &sc->fw_msg_handler[type]; atomic_store_rel_ptr(loc, new); return (0); } static int t4_sysctls(struct adapter *sc) { struct sysctl_ctx_list *ctx; struct sysctl_oid *oid; struct sysctl_oid_list *children, *c0; static char *caps[] = { "\20\1PPP\2QFC\3DCBX", /* caps[0] linkcaps */ "\20\1NIC\2VM\3IDS\4UM\5UM_ISGL" /* caps[1] niccaps */ "\6HASHFILTER\7ETHOFLD", "\20\1TOE", /* caps[2] toecaps */ "\20\1RDDP\2RDMAC", /* caps[3] rdmacaps */ "\20\1INITIATOR_PDU\2TARGET_PDU" /* caps[4] iscsicaps */ "\3INITIATOR_CNXOFLD\4TARGET_CNXOFLD" "\5INITIATOR_SSNOFLD\6TARGET_SSNOFLD", "\20\1INITIATOR\2TARGET\3CTRL_OFLD" /* caps[5] fcoecaps */ "\4PO_INITIAOR\5PO_TARGET" }; static char *doorbells = {"\20\1UDB\2WCWR\3UDBWC\4KDB"}; ctx = device_get_sysctl_ctx(sc->dev); /* * dev.t4nex.X. */ oid = device_get_sysctl_tree(sc->dev); c0 = children = SYSCTL_CHILDREN(oid); sc->sc_do_rxcopy = 1; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "do_rx_copy", CTLFLAG_RW, &sc->sc_do_rxcopy, 1, "Do RX copy of small frames"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nports", CTLFLAG_RD, NULL, sc->params.nports, "# of ports"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "hw_revision", CTLFLAG_RD, NULL, chip_rev(sc), "chip hardware revision"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "firmware_version", CTLFLAG_RD, &sc->fw_version, 0, "firmware version"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "cf", CTLFLAG_RD, &sc->cfg_file, 0, "configuration file"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cfcsum", CTLFLAG_RD, NULL, sc->cfcsum, "config file checksum"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "doorbells", CTLTYPE_STRING | CTLFLAG_RD, doorbells, sc->doorbells, sysctl_bitfield, "A", "available doorbells"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "linkcaps", CTLTYPE_STRING | CTLFLAG_RD, caps[0], sc->linkcaps, sysctl_bitfield, "A", "available link capabilities"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "niccaps", CTLTYPE_STRING | CTLFLAG_RD, caps[1], sc->niccaps, sysctl_bitfield, "A", "available NIC capabilities"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "toecaps", CTLTYPE_STRING | CTLFLAG_RD, caps[2], sc->toecaps, sysctl_bitfield, "A", "available TCP offload capabilities"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rdmacaps", CTLTYPE_STRING | CTLFLAG_RD, caps[3], sc->rdmacaps, sysctl_bitfield, "A", "available RDMA capabilities"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "iscsicaps", CTLTYPE_STRING | CTLFLAG_RD, caps[4], sc->iscsicaps, sysctl_bitfield, "A", "available iSCSI capabilities"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "fcoecaps", CTLTYPE_STRING | CTLFLAG_RD, caps[5], sc->fcoecaps, sysctl_bitfield, "A", "available FCoE capabilities"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "core_clock", CTLFLAG_RD, NULL, sc->params.vpd.cclk, "core clock frequency (in KHz)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_timers", CTLTYPE_STRING | CTLFLAG_RD, sc->sge.timer_val, sizeof(sc->sge.timer_val), sysctl_int_array, "A", "interrupt holdoff timer values (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_pkt_counts", CTLTYPE_STRING | CTLFLAG_RD, sc->sge.counter_val, sizeof(sc->sge.counter_val), sysctl_int_array, "A", "interrupt holdoff packet counter values"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nfilters", CTLFLAG_RD, NULL, sc->tids.nftids, "number of filters"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "temperature", CTLTYPE_INT | CTLFLAG_RD, sc, 0, sysctl_temperature, "I", "chip temperature (in Celsius)"); t4_sge_sysctls(sc, ctx, children); sc->lro_timeout = 100; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "lro_timeout", CTLFLAG_RW, &sc->lro_timeout, 0, "lro inactive-flush timeout (in us)"); #ifdef SBUF_DRAIN /* * dev.t4nex.X.misc. Marked CTLFLAG_SKIP to avoid information overload. */ oid = SYSCTL_ADD_NODE(ctx, c0, OID_AUTO, "misc", CTLFLAG_RD | CTLFLAG_SKIP, NULL, "logs and miscellaneous information"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cctrl", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_cctrl, "A", "congestion control"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_tp0", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_cim_ibq_obq, "A", "CIM IBQ 0 (TP0)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_tp1", CTLTYPE_STRING | CTLFLAG_RD, sc, 1, sysctl_cim_ibq_obq, "A", "CIM IBQ 1 (TP1)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_ulp", CTLTYPE_STRING | CTLFLAG_RD, sc, 2, sysctl_cim_ibq_obq, "A", "CIM IBQ 2 (ULP)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_sge0", CTLTYPE_STRING | CTLFLAG_RD, sc, 3, sysctl_cim_ibq_obq, "A", "CIM IBQ 3 (SGE0)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_sge1", CTLTYPE_STRING | CTLFLAG_RD, sc, 4, sysctl_cim_ibq_obq, "A", "CIM IBQ 4 (SGE1)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_ncsi", CTLTYPE_STRING | CTLFLAG_RD, sc, 5, sysctl_cim_ibq_obq, "A", "CIM IBQ 5 (NCSI)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_la", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_cim_la, "A", "CIM logic analyzer"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ma_la", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_cim_ma_la, "A", "CIM MA logic analyzer"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ulp0", CTLTYPE_STRING | CTLFLAG_RD, sc, 0 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 0 (ULP0)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ulp1", CTLTYPE_STRING | CTLFLAG_RD, sc, 1 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 1 (ULP1)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ulp2", CTLTYPE_STRING | CTLFLAG_RD, sc, 2 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 2 (ULP2)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ulp3", CTLTYPE_STRING | CTLFLAG_RD, sc, 3 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 3 (ULP3)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_sge", CTLTYPE_STRING | CTLFLAG_RD, sc, 4 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 4 (SGE)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ncsi", CTLTYPE_STRING | CTLFLAG_RD, sc, 5 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 5 (NCSI)"); if (is_t5(sc)) { SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_sge0_rx", CTLTYPE_STRING | CTLFLAG_RD, sc, 6 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 6 (SGE0-RX)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_sge1_rx", CTLTYPE_STRING | CTLFLAG_RD, sc, 7 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 7 (SGE1-RX)"); } SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_pif_la", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_cim_pif_la, "A", "CIM PIF logic analyzer"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_qcfg", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_cim_qcfg, "A", "CIM queue configuration"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cpl_stats", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_cpl_stats, "A", "CPL statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "ddp_stats", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_ddp_stats, "A", "DDP statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "devlog", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_devlog, "A", "firmware's device log"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "fcoe_stats", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_fcoe_stats, "A", "FCoE statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "hw_sched", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_hw_sched, "A", "hardware scheduler "); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "l2t", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_l2t, "A", "hardware L2 table"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "lb_stats", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_lb_stats, "A", "loopback statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "meminfo", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_meminfo, "A", "memory regions"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "mps_tcam", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_mps_tcam, "A", "MPS TCAM entries"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "path_mtus", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_path_mtus, "A", "path MTUs"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "pm_stats", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_pm_stats, "A", "PM statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rdma_stats", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_rdma_stats, "A", "RDMA statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tcp_stats", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_tcp_stats, "A", "TCP statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tids", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_tids, "A", "TID information"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tp_err_stats", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_tp_err_stats, "A", "TP error statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tp_la", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_tp_la, "A", "TP logic analyzer"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tx_rate", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_tx_rate, "A", "Tx rate"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "ulprx_la", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_ulprx_la, "A", "ULPRX logic analyzer"); if (is_t5(sc)) { SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "wcwr_stats", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_wcwr_stats, "A", "write combined work requests"); } #endif #ifdef TCP_OFFLOAD if (is_offload(sc)) { /* * dev.t4nex.X.toe. */ oid = SYSCTL_ADD_NODE(ctx, c0, OID_AUTO, "toe", CTLFLAG_RD, NULL, "TOE parameters"); children = SYSCTL_CHILDREN(oid); sc->tt.sndbuf = 256 * 1024; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "sndbuf", CTLFLAG_RW, &sc->tt.sndbuf, 0, "max hardware send buffer size"); sc->tt.ddp = 0; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "ddp", CTLFLAG_RW, &sc->tt.ddp, 0, "DDP allowed"); sc->tt.indsz = G_INDICATESIZE(t4_read_reg(sc, A_TP_PARA_REG5)); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "indsz", CTLFLAG_RW, &sc->tt.indsz, 0, "DDP max indicate size allowed"); sc->tt.ddp_thres = G_RXCOALESCESIZE(t4_read_reg(sc, A_TP_PARA_REG2)); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "ddp_thres", CTLFLAG_RW, &sc->tt.ddp_thres, 0, "DDP threshold"); sc->tt.rx_coalesce = 1; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_coalesce", CTLFLAG_RW, &sc->tt.rx_coalesce, 0, "receive coalescing"); } #endif return (0); } static int cxgbe_sysctls(struct port_info *pi) { struct sysctl_ctx_list *ctx; struct sysctl_oid *oid; struct sysctl_oid_list *children; struct adapter *sc = pi->adapter; ctx = device_get_sysctl_ctx(pi->dev); /* * dev.cxgbe.X. */ oid = device_get_sysctl_tree(pi->dev); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "linkdnrc", CTLTYPE_STRING | CTLFLAG_RD, pi, 0, sysctl_linkdnrc, "A", "reason why link is down"); if (pi->port_type == FW_PORT_TYPE_BT_XAUI) { SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "temperature", CTLTYPE_INT | CTLFLAG_RD, pi, 0, sysctl_btphy, "I", "PHY temperature (in Celsius)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "fw_version", CTLTYPE_INT | CTLFLAG_RD, pi, 1, sysctl_btphy, "I", "PHY firmware version"); } SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nrxq", CTLFLAG_RD, &pi->nrxq, 0, "# of rx queues"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "ntxq", CTLFLAG_RD, &pi->ntxq, 0, "# of tx queues"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_rxq", CTLFLAG_RD, &pi->first_rxq, 0, "index of first rx queue"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_txq", CTLFLAG_RD, &pi->first_txq, 0, "index of first tx queue"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rsrv_noflowq", CTLTYPE_INT | CTLFLAG_RW, pi, 0, sysctl_noflowq, "IU", "Reserve queue 0 for non-flowid packets"); #ifdef TCP_OFFLOAD if (is_offload(sc)) { SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nofldrxq", CTLFLAG_RD, &pi->nofldrxq, 0, "# of rx queues for offloaded TCP connections"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nofldtxq", CTLFLAG_RD, &pi->nofldtxq, 0, "# of tx queues for offloaded TCP connections"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_ofld_rxq", CTLFLAG_RD, &pi->first_ofld_rxq, 0, "index of first TOE rx queue"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_ofld_txq", CTLFLAG_RD, &pi->first_ofld_txq, 0, "index of first TOE tx queue"); } #endif #ifdef DEV_NETMAP SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nnmrxq", CTLFLAG_RD, &pi->nnmrxq, 0, "# of rx queues for netmap"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nnmtxq", CTLFLAG_RD, &pi->nnmtxq, 0, "# of tx queues for netmap"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_nm_rxq", CTLFLAG_RD, &pi->first_nm_rxq, 0, "index of first netmap rx queue"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_nm_txq", CTLFLAG_RD, &pi->first_nm_txq, 0, "index of first netmap tx queue"); #endif SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_tmr_idx", CTLTYPE_INT | CTLFLAG_RW, pi, 0, sysctl_holdoff_tmr_idx, "I", "holdoff timer index"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_pktc_idx", CTLTYPE_INT | CTLFLAG_RW, pi, 0, sysctl_holdoff_pktc_idx, "I", "holdoff packet counter index"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "qsize_rxq", CTLTYPE_INT | CTLFLAG_RW, pi, 0, sysctl_qsize_rxq, "I", "rx queue size"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "qsize_txq", CTLTYPE_INT | CTLFLAG_RW, pi, 0, sysctl_qsize_txq, "I", "tx queue size"); + SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "pause_settings", + CTLTYPE_STRING | CTLFLAG_RW, pi, PAUSE_TX, sysctl_pause_settings, + "A", "PAUSE settings (bit 0 = rx_pause, bit 1 = tx_pause)"); + /* * dev.cxgbe.X.stats. */ oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "stats", CTLFLAG_RD, NULL, "port statistics"); children = SYSCTL_CHILDREN(oid); #define SYSCTL_ADD_T4_REG64(pi, name, desc, reg) \ SYSCTL_ADD_OID(ctx, children, OID_AUTO, name, \ CTLTYPE_U64 | CTLFLAG_RD, sc, reg, \ sysctl_handle_t4_reg64, "QU", desc) SYSCTL_ADD_T4_REG64(pi, "tx_octets", "# of octets in good frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_BYTES_L)); SYSCTL_ADD_T4_REG64(pi, "tx_frames", "total # of good frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_FRAMES_L)); SYSCTL_ADD_T4_REG64(pi, "tx_bcast_frames", "# of broadcast frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_BCAST_L)); SYSCTL_ADD_T4_REG64(pi, "tx_mcast_frames", "# of multicast frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_MCAST_L)); SYSCTL_ADD_T4_REG64(pi, "tx_ucast_frames", "# of unicast frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_UCAST_L)); SYSCTL_ADD_T4_REG64(pi, "tx_error_frames", "# of error frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_ERROR_L)); SYSCTL_ADD_T4_REG64(pi, "tx_frames_64", "# of tx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_64B_L)); SYSCTL_ADD_T4_REG64(pi, "tx_frames_65_127", "# of tx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_65B_127B_L)); SYSCTL_ADD_T4_REG64(pi, "tx_frames_128_255", "# of tx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_128B_255B_L)); SYSCTL_ADD_T4_REG64(pi, "tx_frames_256_511", "# of tx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_256B_511B_L)); SYSCTL_ADD_T4_REG64(pi, "tx_frames_512_1023", "# of tx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_512B_1023B_L)); SYSCTL_ADD_T4_REG64(pi, "tx_frames_1024_1518", "# of tx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_1024B_1518B_L)); SYSCTL_ADD_T4_REG64(pi, "tx_frames_1519_max", "# of tx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_1519B_MAX_L)); SYSCTL_ADD_T4_REG64(pi, "tx_drop", "# of dropped tx frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_DROP_L)); SYSCTL_ADD_T4_REG64(pi, "tx_pause", "# of pause frames transmitted", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PAUSE_L)); SYSCTL_ADD_T4_REG64(pi, "tx_ppp0", "# of PPP prio 0 frames transmitted", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP0_L)); SYSCTL_ADD_T4_REG64(pi, "tx_ppp1", "# of PPP prio 1 frames transmitted", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP1_L)); SYSCTL_ADD_T4_REG64(pi, "tx_ppp2", "# of PPP prio 2 frames transmitted", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP2_L)); SYSCTL_ADD_T4_REG64(pi, "tx_ppp3", "# of PPP prio 3 frames transmitted", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP3_L)); SYSCTL_ADD_T4_REG64(pi, "tx_ppp4", "# of PPP prio 4 frames transmitted", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP4_L)); SYSCTL_ADD_T4_REG64(pi, "tx_ppp5", "# of PPP prio 5 frames transmitted", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP5_L)); SYSCTL_ADD_T4_REG64(pi, "tx_ppp6", "# of PPP prio 6 frames transmitted", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP6_L)); SYSCTL_ADD_T4_REG64(pi, "tx_ppp7", "# of PPP prio 7 frames transmitted", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP7_L)); SYSCTL_ADD_T4_REG64(pi, "rx_octets", "# of octets in good frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_BYTES_L)); SYSCTL_ADD_T4_REG64(pi, "rx_frames", "total # of good frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_FRAMES_L)); SYSCTL_ADD_T4_REG64(pi, "rx_bcast_frames", "# of broadcast frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_BCAST_L)); SYSCTL_ADD_T4_REG64(pi, "rx_mcast_frames", "# of multicast frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_MCAST_L)); SYSCTL_ADD_T4_REG64(pi, "rx_ucast_frames", "# of unicast frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_UCAST_L)); SYSCTL_ADD_T4_REG64(pi, "rx_too_long", "# of frames exceeding MTU", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_MTU_ERROR_L)); SYSCTL_ADD_T4_REG64(pi, "rx_jabber", "# of jabber frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_MTU_CRC_ERROR_L)); SYSCTL_ADD_T4_REG64(pi, "rx_fcs_err", "# of frames received with bad FCS", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_CRC_ERROR_L)); SYSCTL_ADD_T4_REG64(pi, "rx_len_err", "# of frames received with length error", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_LEN_ERROR_L)); SYSCTL_ADD_T4_REG64(pi, "rx_symbol_err", "symbol errors", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_SYM_ERROR_L)); SYSCTL_ADD_T4_REG64(pi, "rx_runt", "# of short frames received", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_LESS_64B_L)); SYSCTL_ADD_T4_REG64(pi, "rx_frames_64", "# of rx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_64B_L)); SYSCTL_ADD_T4_REG64(pi, "rx_frames_65_127", "# of rx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_65B_127B_L)); SYSCTL_ADD_T4_REG64(pi, "rx_frames_128_255", "# of rx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_128B_255B_L)); SYSCTL_ADD_T4_REG64(pi, "rx_frames_256_511", "# of rx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_256B_511B_L)); SYSCTL_ADD_T4_REG64(pi, "rx_frames_512_1023", "# of rx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_512B_1023B_L)); SYSCTL_ADD_T4_REG64(pi, "rx_frames_1024_1518", "# of rx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_1024B_1518B_L)); SYSCTL_ADD_T4_REG64(pi, "rx_frames_1519_max", "# of rx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_1519B_MAX_L)); SYSCTL_ADD_T4_REG64(pi, "rx_pause", "# of pause frames received", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PAUSE_L)); SYSCTL_ADD_T4_REG64(pi, "rx_ppp0", "# of PPP prio 0 frames received", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP0_L)); SYSCTL_ADD_T4_REG64(pi, "rx_ppp1", "# of PPP prio 1 frames received", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP1_L)); SYSCTL_ADD_T4_REG64(pi, "rx_ppp2", "# of PPP prio 2 frames received", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP2_L)); SYSCTL_ADD_T4_REG64(pi, "rx_ppp3", "# of PPP prio 3 frames received", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP3_L)); SYSCTL_ADD_T4_REG64(pi, "rx_ppp4", "# of PPP prio 4 frames received", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP4_L)); SYSCTL_ADD_T4_REG64(pi, "rx_ppp5", "# of PPP prio 5 frames received", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP5_L)); SYSCTL_ADD_T4_REG64(pi, "rx_ppp6", "# of PPP prio 6 frames received", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP6_L)); SYSCTL_ADD_T4_REG64(pi, "rx_ppp7", "# of PPP prio 7 frames received", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP7_L)); #undef SYSCTL_ADD_T4_REG64 #define SYSCTL_ADD_T4_PORTSTAT(name, desc) \ SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, #name, CTLFLAG_RD, \ &pi->stats.name, desc) /* We get these from port_stats and they may be stale by upto 1s */ SYSCTL_ADD_T4_PORTSTAT(rx_ovflow0, "# drops due to buffer-group 0 overflows"); SYSCTL_ADD_T4_PORTSTAT(rx_ovflow1, "# drops due to buffer-group 1 overflows"); SYSCTL_ADD_T4_PORTSTAT(rx_ovflow2, "# drops due to buffer-group 2 overflows"); SYSCTL_ADD_T4_PORTSTAT(rx_ovflow3, "# drops due to buffer-group 3 overflows"); SYSCTL_ADD_T4_PORTSTAT(rx_trunc0, "# of buffer-group 0 truncated packets"); SYSCTL_ADD_T4_PORTSTAT(rx_trunc1, "# of buffer-group 1 truncated packets"); SYSCTL_ADD_T4_PORTSTAT(rx_trunc2, "# of buffer-group 2 truncated packets"); SYSCTL_ADD_T4_PORTSTAT(rx_trunc3, "# of buffer-group 3 truncated packets"); #undef SYSCTL_ADD_T4_PORTSTAT return (0); } static int sysctl_int_array(SYSCTL_HANDLER_ARGS) { int rc, *i; struct sbuf sb; sbuf_new(&sb, NULL, 32, SBUF_AUTOEXTEND); for (i = arg1; arg2; arg2 -= sizeof(int), i++) sbuf_printf(&sb, "%d ", *i); sbuf_trim(&sb); sbuf_finish(&sb); rc = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req); sbuf_delete(&sb); return (rc); } static int sysctl_bitfield(SYSCTL_HANDLER_ARGS) { int rc; struct sbuf *sb; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return(rc); sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); if (sb == NULL) return (ENOMEM); sbuf_printf(sb, "%b", (int)arg2, (char *)arg1); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_btphy(SYSCTL_HANDLER_ARGS) { struct port_info *pi = arg1; int op = arg2; struct adapter *sc = pi->adapter; u_int v; int rc; rc = begin_synchronized_op(sc, pi, SLEEP_OK | INTR_OK, "t4btt"); if (rc) return (rc); /* XXX: magic numbers */ rc = -t4_mdio_rd(sc, sc->mbox, pi->mdio_addr, 0x1e, op ? 0x20 : 0xc820, &v); end_synchronized_op(sc, 0); if (rc) return (rc); if (op == 0) v /= 256; rc = sysctl_handle_int(oidp, &v, 0, req); return (rc); } static int sysctl_noflowq(SYSCTL_HANDLER_ARGS) { struct port_info *pi = arg1; int rc, val; val = pi->rsrv_noflowq; rc = sysctl_handle_int(oidp, &val, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if ((val >= 1) && (pi->ntxq > 1)) pi->rsrv_noflowq = 1; else pi->rsrv_noflowq = 0; return (rc); } static int sysctl_holdoff_tmr_idx(SYSCTL_HANDLER_ARGS) { struct port_info *pi = arg1; struct adapter *sc = pi->adapter; int idx, rc, i; struct sge_rxq *rxq; #ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; #endif uint8_t v; idx = pi->tmr_idx; rc = sysctl_handle_int(oidp, &idx, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (idx < 0 || idx >= SGE_NTIMERS) return (EINVAL); rc = begin_synchronized_op(sc, pi, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4tmr"); if (rc) return (rc); v = V_QINTR_TIMER_IDX(idx) | V_QINTR_CNT_EN(pi->pktc_idx != -1); for_each_rxq(pi, i, rxq) { #ifdef atomic_store_rel_8 atomic_store_rel_8(&rxq->iq.intr_params, v); #else rxq->iq.intr_params = v; #endif } #ifdef TCP_OFFLOAD for_each_ofld_rxq(pi, i, ofld_rxq) { #ifdef atomic_store_rel_8 atomic_store_rel_8(&ofld_rxq->iq.intr_params, v); #else ofld_rxq->iq.intr_params = v; #endif } #endif pi->tmr_idx = idx; end_synchronized_op(sc, LOCK_HELD); return (0); } static int sysctl_holdoff_pktc_idx(SYSCTL_HANDLER_ARGS) { struct port_info *pi = arg1; struct adapter *sc = pi->adapter; int idx, rc; idx = pi->pktc_idx; rc = sysctl_handle_int(oidp, &idx, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (idx < -1 || idx >= SGE_NCOUNTERS) return (EINVAL); rc = begin_synchronized_op(sc, pi, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4pktc"); if (rc) return (rc); if (pi->flags & PORT_INIT_DONE) rc = EBUSY; /* cannot be changed once the queues are created */ else pi->pktc_idx = idx; end_synchronized_op(sc, LOCK_HELD); return (rc); } static int sysctl_qsize_rxq(SYSCTL_HANDLER_ARGS) { struct port_info *pi = arg1; struct adapter *sc = pi->adapter; int qsize, rc; qsize = pi->qsize_rxq; rc = sysctl_handle_int(oidp, &qsize, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (qsize < 128 || (qsize & 7)) return (EINVAL); rc = begin_synchronized_op(sc, pi, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4rxqs"); if (rc) return (rc); if (pi->flags & PORT_INIT_DONE) rc = EBUSY; /* cannot be changed once the queues are created */ else pi->qsize_rxq = qsize; end_synchronized_op(sc, LOCK_HELD); return (rc); } static int sysctl_qsize_txq(SYSCTL_HANDLER_ARGS) { struct port_info *pi = arg1; struct adapter *sc = pi->adapter; int qsize, rc; qsize = pi->qsize_txq; rc = sysctl_handle_int(oidp, &qsize, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); /* bufring size must be powerof2 */ if (qsize < 128 || !powerof2(qsize)) return (EINVAL); rc = begin_synchronized_op(sc, pi, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4txqs"); if (rc) return (rc); if (pi->flags & PORT_INIT_DONE) rc = EBUSY; /* cannot be changed once the queues are created */ else pi->qsize_txq = qsize; end_synchronized_op(sc, LOCK_HELD); + return (rc); +} + +static int +sysctl_pause_settings(SYSCTL_HANDLER_ARGS) +{ + struct port_info *pi = arg1; + struct adapter *sc = pi->adapter; + struct link_config *lc = &pi->link_cfg; + int rc; + + if (req->newptr == NULL) { + struct sbuf *sb; + static char *bits = "\20\1PAUSE_RX\2PAUSE_TX"; + + rc = sysctl_wire_old_buffer(req, 0); + if (rc != 0) + return(rc); + + sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); + if (sb == NULL) + return (ENOMEM); + + sbuf_printf(sb, "%b", lc->fc & (PAUSE_TX | PAUSE_RX), bits); + rc = sbuf_finish(sb); + sbuf_delete(sb); + } else { + char s[2]; + int n; + + s[0] = '0' + (lc->requested_fc & (PAUSE_TX | PAUSE_RX)); + s[1] = 0; + + rc = sysctl_handle_string(oidp, s, sizeof(s), req); + if (rc != 0) + return(rc); + + if (s[1] != 0) + return (EINVAL); + if (s[0] < '0' || s[0] > '9') + return (EINVAL); /* not a number */ + n = s[0] - '0'; + if (n & ~(PAUSE_TX | PAUSE_RX)) + return (EINVAL); /* some other bit is set too */ + + rc = begin_synchronized_op(sc, pi, SLEEP_OK | INTR_OK, "t4PAUSE"); + if (rc) + return (rc); + if ((lc->requested_fc & (PAUSE_TX | PAUSE_RX)) != n) { + int link_ok = lc->link_ok; + + lc->requested_fc &= ~(PAUSE_TX | PAUSE_RX); + lc->requested_fc |= n; + rc = -t4_link_start(sc, sc->mbox, pi->tx_chan, lc); + lc->link_ok = link_ok; /* restore */ + } + end_synchronized_op(sc, 0); + } + return (rc); } static int sysctl_handle_t4_reg64(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; int reg = arg2; uint64_t val; val = t4_read_reg64(sc, reg); return (sysctl_handle_64(oidp, &val, 0, req)); } static int sysctl_temperature(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; int rc, t; uint32_t param, val; rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4temp"); if (rc) return (rc); param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) | V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_DIAG) | V_FW_PARAMS_PARAM_Y(FW_PARAM_DEV_DIAG_TMP); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, ¶m, &val); end_synchronized_op(sc, 0); if (rc) return (rc); /* unknown is returned as 0 but we display -1 in that case */ t = val == 0 ? -1 : val; rc = sysctl_handle_int(oidp, &t, 0, req); return (rc); } #ifdef SBUF_DRAIN static int sysctl_cctrl(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i; uint16_t incr[NMTUS][NCCTRL_WIN]; static const char *dec_fac[] = { "0.5", "0.5625", "0.625", "0.6875", "0.75", "0.8125", "0.875", "0.9375" }; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); t4_read_cong_tbl(sc, incr); for (i = 0; i < NCCTRL_WIN; ++i) { sbuf_printf(sb, "%2d: %4u %4u %4u %4u %4u %4u %4u %4u\n", i, incr[0][i], incr[1][i], incr[2][i], incr[3][i], incr[4][i], incr[5][i], incr[6][i], incr[7][i]); sbuf_printf(sb, "%8u %4u %4u %4u %4u %4u %4u %4u %5u %s\n", incr[8][i], incr[9][i], incr[10][i], incr[11][i], incr[12][i], incr[13][i], incr[14][i], incr[15][i], sc->params.a_wnd[i], dec_fac[sc->params.b_wnd[i]]); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static const char *qname[CIM_NUM_IBQ + CIM_NUM_OBQ_T5] = { "TP0", "TP1", "ULP", "SGE0", "SGE1", "NC-SI", /* ibq's */ "ULP0", "ULP1", "ULP2", "ULP3", "SGE", "NC-SI", /* obq's */ "SGE0-RX", "SGE1-RX" /* additional obq's (T5 onwards) */ }; static int sysctl_cim_ibq_obq(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i, n, qid = arg2; uint32_t *buf, *p; char *qtype; u_int cim_num_obq = is_t4(sc) ? CIM_NUM_OBQ : CIM_NUM_OBQ_T5; KASSERT(qid >= 0 && qid < CIM_NUM_IBQ + cim_num_obq, ("%s: bad qid %d\n", __func__, qid)); if (qid < CIM_NUM_IBQ) { /* inbound queue */ qtype = "IBQ"; n = 4 * CIM_IBQ_SIZE; buf = malloc(n * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK); rc = t4_read_cim_ibq(sc, qid, buf, n); } else { /* outbound queue */ qtype = "OBQ"; qid -= CIM_NUM_IBQ; n = 4 * cim_num_obq * CIM_OBQ_SIZE; buf = malloc(n * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK); rc = t4_read_cim_obq(sc, qid, buf, n); } if (rc < 0) { rc = -rc; goto done; } n = rc * sizeof(uint32_t); /* rc has # of words actually read */ rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) goto done; sb = sbuf_new_for_sysctl(NULL, NULL, PAGE_SIZE, req); if (sb == NULL) { rc = ENOMEM; goto done; } sbuf_printf(sb, "%s%d %s", qtype , qid, qname[arg2]); for (i = 0, p = buf; i < n; i += 16, p += 4) sbuf_printf(sb, "\n%#06x: %08x %08x %08x %08x", i, p[0], p[1], p[2], p[3]); rc = sbuf_finish(sb); sbuf_delete(sb); done: free(buf, M_CXGBE); return (rc); } static int sysctl_cim_la(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; u_int cfg; struct sbuf *sb; uint32_t *buf, *p; int rc; rc = -t4_cim_read(sc, A_UP_UP_DBG_LA_CFG, 1, &cfg); if (rc != 0) return (rc); rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); buf = malloc(sc->params.cim_la_size * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK); rc = -t4_cim_read_la(sc, buf, NULL); if (rc != 0) goto done; sbuf_printf(sb, "Status Data PC%s", cfg & F_UPDBGLACAPTPCONLY ? "" : " LS0Stat LS0Addr LS0Data"); KASSERT((sc->params.cim_la_size & 7) == 0, ("%s: p will walk off the end of buf", __func__)); for (p = buf; p < &buf[sc->params.cim_la_size]; p += 8) { if (cfg & F_UPDBGLACAPTPCONLY) { sbuf_printf(sb, "\n %02x %08x %08x", p[5] & 0xff, p[6], p[7]); sbuf_printf(sb, "\n %02x %02x%06x %02x%06x", (p[3] >> 8) & 0xff, p[3] & 0xff, p[4] >> 8, p[4] & 0xff, p[5] >> 8); sbuf_printf(sb, "\n %02x %x%07x %x%07x", (p[0] >> 4) & 0xff, p[0] & 0xf, p[1] >> 4, p[1] & 0xf, p[2] >> 4); } else { sbuf_printf(sb, "\n %02x %x%07x %x%07x %08x %08x " "%08x%08x%08x%08x", (p[0] >> 4) & 0xff, p[0] & 0xf, p[1] >> 4, p[1] & 0xf, p[2] >> 4, p[2] & 0xf, p[3], p[4], p[5], p[6], p[7]); } } rc = sbuf_finish(sb); sbuf_delete(sb); done: free(buf, M_CXGBE); return (rc); } static int sysctl_cim_ma_la(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; u_int i; struct sbuf *sb; uint32_t *buf, *p; int rc; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); buf = malloc(2 * CIM_MALA_SIZE * 5 * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK); t4_cim_read_ma_la(sc, buf, buf + 5 * CIM_MALA_SIZE); p = buf; for (i = 0; i < CIM_MALA_SIZE; i++, p += 5) { sbuf_printf(sb, "\n%02x%08x%08x%08x%08x", p[4], p[3], p[2], p[1], p[0]); } sbuf_printf(sb, "\n\nCnt ID Tag UE Data RDY VLD"); for (i = 0; i < CIM_MALA_SIZE; i++, p += 5) { sbuf_printf(sb, "\n%3u %2u %x %u %08x%08x %u %u", (p[2] >> 10) & 0xff, (p[2] >> 7) & 7, (p[2] >> 3) & 0xf, (p[2] >> 2) & 1, (p[1] >> 2) | ((p[2] & 3) << 30), (p[0] >> 2) | ((p[1] & 3) << 30), (p[0] >> 1) & 1, p[0] & 1); } rc = sbuf_finish(sb); sbuf_delete(sb); free(buf, M_CXGBE); return (rc); } static int sysctl_cim_pif_la(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; u_int i; struct sbuf *sb; uint32_t *buf, *p; int rc; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); buf = malloc(2 * CIM_PIFLA_SIZE * 6 * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK); t4_cim_read_pif_la(sc, buf, buf + 6 * CIM_PIFLA_SIZE, NULL, NULL); p = buf; sbuf_printf(sb, "Cntl ID DataBE Addr Data"); for (i = 0; i < CIM_MALA_SIZE; i++, p += 6) { sbuf_printf(sb, "\n %02x %02x %04x %08x %08x%08x%08x%08x", (p[5] >> 22) & 0xff, (p[5] >> 16) & 0x3f, p[5] & 0xffff, p[4], p[3], p[2], p[1], p[0]); } sbuf_printf(sb, "\n\nCntl ID Data"); for (i = 0; i < CIM_MALA_SIZE; i++, p += 6) { sbuf_printf(sb, "\n %02x %02x %08x%08x%08x%08x", (p[4] >> 6) & 0xff, p[4] & 0x3f, p[3], p[2], p[1], p[0]); } rc = sbuf_finish(sb); sbuf_delete(sb); free(buf, M_CXGBE); return (rc); } static int sysctl_cim_qcfg(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i; uint16_t base[CIM_NUM_IBQ + CIM_NUM_OBQ_T5]; uint16_t size[CIM_NUM_IBQ + CIM_NUM_OBQ_T5]; uint16_t thres[CIM_NUM_IBQ]; uint32_t obq_wr[2 * CIM_NUM_OBQ_T5], *wr = obq_wr; uint32_t stat[4 * (CIM_NUM_IBQ + CIM_NUM_OBQ_T5)], *p = stat; u_int cim_num_obq, ibq_rdaddr, obq_rdaddr, nq; if (is_t4(sc)) { cim_num_obq = CIM_NUM_OBQ; ibq_rdaddr = A_UP_IBQ_0_RDADDR; obq_rdaddr = A_UP_OBQ_0_REALADDR; } else { cim_num_obq = CIM_NUM_OBQ_T5; ibq_rdaddr = A_UP_IBQ_0_SHADOW_RDADDR; obq_rdaddr = A_UP_OBQ_0_SHADOW_REALADDR; } nq = CIM_NUM_IBQ + cim_num_obq; rc = -t4_cim_read(sc, ibq_rdaddr, 4 * nq, stat); if (rc == 0) rc = -t4_cim_read(sc, obq_rdaddr, 2 * cim_num_obq, obq_wr); if (rc != 0) return (rc); t4_read_cimq_cfg(sc, base, size, thres); rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, PAGE_SIZE, req); if (sb == NULL) return (ENOMEM); sbuf_printf(sb, "Queue Base Size Thres RdPtr WrPtr SOP EOP Avail"); for (i = 0; i < CIM_NUM_IBQ; i++, p += 4) sbuf_printf(sb, "\n%7s %5x %5u %5u %6x %4x %4u %4u %5u", qname[i], base[i], size[i], thres[i], G_IBQRDADDR(p[0]), G_IBQWRADDR(p[1]), G_QUESOPCNT(p[3]), G_QUEEOPCNT(p[3]), G_QUEREMFLITS(p[2]) * 16); for ( ; i < nq; i++, p += 4, wr += 2) sbuf_printf(sb, "\n%7s %5x %5u %12x %4x %4u %4u %5u", qname[i], base[i], size[i], G_QUERDADDR(p[0]) & 0x3fff, wr[0] - base[i], G_QUESOPCNT(p[3]), G_QUEEOPCNT(p[3]), G_QUEREMFLITS(p[2]) * 16); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_cpl_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tp_cpl_stats stats; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); t4_tp_get_cpl_stats(sc, &stats); sbuf_printf(sb, " channel 0 channel 1 channel 2 " "channel 3\n"); sbuf_printf(sb, "CPL requests: %10u %10u %10u %10u\n", stats.req[0], stats.req[1], stats.req[2], stats.req[3]); sbuf_printf(sb, "CPL responses: %10u %10u %10u %10u", stats.rsp[0], stats.rsp[1], stats.rsp[2], stats.rsp[3]); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_ddp_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tp_usm_stats stats; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return(rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); t4_get_usm_stats(sc, &stats); sbuf_printf(sb, "Frames: %u\n", stats.frames); sbuf_printf(sb, "Octets: %ju\n", stats.octets); sbuf_printf(sb, "Drops: %u", stats.drops); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } const char *devlog_level_strings[] = { [FW_DEVLOG_LEVEL_EMERG] = "EMERG", [FW_DEVLOG_LEVEL_CRIT] = "CRIT", [FW_DEVLOG_LEVEL_ERR] = "ERR", [FW_DEVLOG_LEVEL_NOTICE] = "NOTICE", [FW_DEVLOG_LEVEL_INFO] = "INFO", [FW_DEVLOG_LEVEL_DEBUG] = "DEBUG" }; const char *devlog_facility_strings[] = { [FW_DEVLOG_FACILITY_CORE] = "CORE", [FW_DEVLOG_FACILITY_CF] = "CF", [FW_DEVLOG_FACILITY_SCHED] = "SCHED", [FW_DEVLOG_FACILITY_TIMER] = "TIMER", [FW_DEVLOG_FACILITY_RES] = "RES", [FW_DEVLOG_FACILITY_HW] = "HW", [FW_DEVLOG_FACILITY_FLR] = "FLR", [FW_DEVLOG_FACILITY_DMAQ] = "DMAQ", [FW_DEVLOG_FACILITY_PHY] = "PHY", [FW_DEVLOG_FACILITY_MAC] = "MAC", [FW_DEVLOG_FACILITY_PORT] = "PORT", [FW_DEVLOG_FACILITY_VI] = "VI", [FW_DEVLOG_FACILITY_FILTER] = "FILTER", [FW_DEVLOG_FACILITY_ACL] = "ACL", [FW_DEVLOG_FACILITY_TM] = "TM", [FW_DEVLOG_FACILITY_QFC] = "QFC", [FW_DEVLOG_FACILITY_DCB] = "DCB", [FW_DEVLOG_FACILITY_ETH] = "ETH", [FW_DEVLOG_FACILITY_OFLD] = "OFLD", [FW_DEVLOG_FACILITY_RI] = "RI", [FW_DEVLOG_FACILITY_ISCSI] = "ISCSI", [FW_DEVLOG_FACILITY_FCOE] = "FCOE", [FW_DEVLOG_FACILITY_FOISCSI] = "FOISCSI", [FW_DEVLOG_FACILITY_FOFCOE] = "FOFCOE" }; static int sysctl_devlog(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct devlog_params *dparams = &sc->params.devlog; struct fw_devlog_e *buf, *e; int i, j, rc, nentries, first = 0, m; struct sbuf *sb; uint64_t ftstamp = UINT64_MAX; if (dparams->start == 0) { dparams->memtype = FW_MEMTYPE_EDC0; dparams->start = 0x84000; dparams->size = 32768; } nentries = dparams->size / sizeof(struct fw_devlog_e); buf = malloc(dparams->size, M_CXGBE, M_NOWAIT); if (buf == NULL) return (ENOMEM); m = fwmtype_to_hwmtype(dparams->memtype); rc = -t4_mem_read(sc, m, dparams->start, dparams->size, (void *)buf); if (rc != 0) goto done; for (i = 0; i < nentries; i++) { e = &buf[i]; if (e->timestamp == 0) break; /* end */ e->timestamp = be64toh(e->timestamp); e->seqno = be32toh(e->seqno); for (j = 0; j < 8; j++) e->params[j] = be32toh(e->params[j]); if (e->timestamp < ftstamp) { ftstamp = e->timestamp; first = i; } } if (buf[first].timestamp == 0) goto done; /* nothing in the log */ rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) goto done; sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) { rc = ENOMEM; goto done; } sbuf_printf(sb, "%10s %15s %8s %8s %s\n", "Seq#", "Tstamp", "Level", "Facility", "Message"); i = first; do { e = &buf[i]; if (e->timestamp == 0) break; /* end */ sbuf_printf(sb, "%10d %15ju %8s %8s ", e->seqno, e->timestamp, (e->level < nitems(devlog_level_strings) ? devlog_level_strings[e->level] : "UNKNOWN"), (e->facility < nitems(devlog_facility_strings) ? devlog_facility_strings[e->facility] : "UNKNOWN")); sbuf_printf(sb, e->fmt, e->params[0], e->params[1], e->params[2], e->params[3], e->params[4], e->params[5], e->params[6], e->params[7]); if (++i == nentries) i = 0; } while (i != first); rc = sbuf_finish(sb); sbuf_delete(sb); done: free(buf, M_CXGBE); return (rc); } static int sysctl_fcoe_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tp_fcoe_stats stats[4]; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); t4_get_fcoe_stats(sc, 0, &stats[0]); t4_get_fcoe_stats(sc, 1, &stats[1]); t4_get_fcoe_stats(sc, 2, &stats[2]); t4_get_fcoe_stats(sc, 3, &stats[3]); sbuf_printf(sb, " channel 0 channel 1 " "channel 2 channel 3\n"); sbuf_printf(sb, "octetsDDP: %16ju %16ju %16ju %16ju\n", stats[0].octetsDDP, stats[1].octetsDDP, stats[2].octetsDDP, stats[3].octetsDDP); sbuf_printf(sb, "framesDDP: %16u %16u %16u %16u\n", stats[0].framesDDP, stats[1].framesDDP, stats[2].framesDDP, stats[3].framesDDP); sbuf_printf(sb, "framesDrop: %16u %16u %16u %16u", stats[0].framesDrop, stats[1].framesDrop, stats[2].framesDrop, stats[3].framesDrop); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_hw_sched(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i; unsigned int map, kbps, ipg, mode; unsigned int pace_tab[NTX_SCHED]; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); map = t4_read_reg(sc, A_TP_TX_MOD_QUEUE_REQ_MAP); mode = G_TIMERMODE(t4_read_reg(sc, A_TP_MOD_CONFIG)); t4_read_pace_tbl(sc, pace_tab); sbuf_printf(sb, "Scheduler Mode Channel Rate (Kbps) " "Class IPG (0.1 ns) Flow IPG (us)"); for (i = 0; i < NTX_SCHED; ++i, map >>= 2) { t4_get_tx_sched(sc, i, &kbps, &ipg); sbuf_printf(sb, "\n %u %-5s %u ", i, (mode & (1 << i)) ? "flow" : "class", map & 3); if (kbps) sbuf_printf(sb, "%9u ", kbps); else sbuf_printf(sb, " disabled "); if (ipg) sbuf_printf(sb, "%13u ", ipg); else sbuf_printf(sb, " disabled "); if (pace_tab[i]) sbuf_printf(sb, "%10u", pace_tab[i]); else sbuf_printf(sb, " disabled"); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_lb_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i, j; uint64_t *p0, *p1; struct lb_port_stats s[2]; static const char *stat_name[] = { "OctetsOK:", "FramesOK:", "BcastFrames:", "McastFrames:", "UcastFrames:", "ErrorFrames:", "Frames64:", "Frames65To127:", "Frames128To255:", "Frames256To511:", "Frames512To1023:", "Frames1024To1518:", "Frames1519ToMax:", "FramesDropped:", "BG0FramesDropped:", "BG1FramesDropped:", "BG2FramesDropped:", "BG3FramesDropped:", "BG0FramesTrunc:", "BG1FramesTrunc:", "BG2FramesTrunc:", "BG3FramesTrunc:" }; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); memset(s, 0, sizeof(s)); for (i = 0; i < 4; i += 2) { t4_get_lb_stats(sc, i, &s[0]); t4_get_lb_stats(sc, i + 1, &s[1]); p0 = &s[0].octets; p1 = &s[1].octets; sbuf_printf(sb, "%s Loopback %u" " Loopback %u", i == 0 ? "" : "\n", i, i + 1); for (j = 0; j < nitems(stat_name); j++) sbuf_printf(sb, "\n%-17s %20ju %20ju", stat_name[j], *p0++, *p1++); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_linkdnrc(SYSCTL_HANDLER_ARGS) { int rc = 0; struct port_info *pi = arg1; struct sbuf *sb; static const char *linkdnreasons[] = { "non-specific", "remote fault", "autoneg failed", "reserved3", "PHY overheated", "unknown", "rx los", "reserved7" }; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return(rc); sb = sbuf_new_for_sysctl(NULL, NULL, 64, req); if (sb == NULL) return (ENOMEM); if (pi->linkdnrc < 0) sbuf_printf(sb, "n/a"); else if (pi->linkdnrc < nitems(linkdnreasons)) sbuf_printf(sb, "%s", linkdnreasons[pi->linkdnrc]); else sbuf_printf(sb, "%d", pi->linkdnrc); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } struct mem_desc { unsigned int base; unsigned int limit; unsigned int idx; }; static int mem_desc_cmp(const void *a, const void *b) { return ((const struct mem_desc *)a)->base - ((const struct mem_desc *)b)->base; } static void mem_region_show(struct sbuf *sb, const char *name, unsigned int from, unsigned int to) { unsigned int size; size = to - from + 1; if (size == 0) return; /* XXX: need humanize_number(3) in libkern for a more readable 'size' */ sbuf_printf(sb, "%-15s %#x-%#x [%u]\n", name, from, to, size); } static int sysctl_meminfo(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i, n; uint32_t lo, hi, used, alloc; static const char *memory[] = {"EDC0:", "EDC1:", "MC:", "MC0:", "MC1:"}; static const char *region[] = { "DBQ contexts:", "IMSG contexts:", "FLM cache:", "TCBs:", "Pstructs:", "Timers:", "Rx FL:", "Tx FL:", "Pstruct FL:", "Tx payload:", "Rx payload:", "LE hash:", "iSCSI region:", "TDDP region:", "TPT region:", "STAG region:", "RQ region:", "RQUDP region:", "PBL region:", "TXPBL region:", "DBVFIFO region:", "ULPRX state:", "ULPTX state:", "On-chip queues:" }; struct mem_desc avail[4]; struct mem_desc mem[nitems(region) + 3]; /* up to 3 holes */ struct mem_desc *md = mem; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); for (i = 0; i < nitems(mem); i++) { mem[i].limit = 0; mem[i].idx = i; } /* Find and sort the populated memory ranges */ i = 0; lo = t4_read_reg(sc, A_MA_TARGET_MEM_ENABLE); if (lo & F_EDRAM0_ENABLE) { hi = t4_read_reg(sc, A_MA_EDRAM0_BAR); avail[i].base = G_EDRAM0_BASE(hi) << 20; avail[i].limit = avail[i].base + (G_EDRAM0_SIZE(hi) << 20); avail[i].idx = 0; i++; } if (lo & F_EDRAM1_ENABLE) { hi = t4_read_reg(sc, A_MA_EDRAM1_BAR); avail[i].base = G_EDRAM1_BASE(hi) << 20; avail[i].limit = avail[i].base + (G_EDRAM1_SIZE(hi) << 20); avail[i].idx = 1; i++; } if (lo & F_EXT_MEM_ENABLE) { hi = t4_read_reg(sc, A_MA_EXT_MEMORY_BAR); avail[i].base = G_EXT_MEM_BASE(hi) << 20; avail[i].limit = avail[i].base + (G_EXT_MEM_SIZE(hi) << 20); avail[i].idx = is_t4(sc) ? 2 : 3; /* Call it MC for T4 */ i++; } if (!is_t4(sc) && lo & F_EXT_MEM1_ENABLE) { hi = t4_read_reg(sc, A_MA_EXT_MEMORY1_BAR); avail[i].base = G_EXT_MEM1_BASE(hi) << 20; avail[i].limit = avail[i].base + (G_EXT_MEM1_SIZE(hi) << 20); avail[i].idx = 4; i++; } if (!i) /* no memory available */ return 0; qsort(avail, i, sizeof(struct mem_desc), mem_desc_cmp); (md++)->base = t4_read_reg(sc, A_SGE_DBQ_CTXT_BADDR); (md++)->base = t4_read_reg(sc, A_SGE_IMSG_CTXT_BADDR); (md++)->base = t4_read_reg(sc, A_SGE_FLM_CACHE_BADDR); (md++)->base = t4_read_reg(sc, A_TP_CMM_TCB_BASE); (md++)->base = t4_read_reg(sc, A_TP_CMM_MM_BASE); (md++)->base = t4_read_reg(sc, A_TP_CMM_TIMER_BASE); (md++)->base = t4_read_reg(sc, A_TP_CMM_MM_RX_FLST_BASE); (md++)->base = t4_read_reg(sc, A_TP_CMM_MM_TX_FLST_BASE); (md++)->base = t4_read_reg(sc, A_TP_CMM_MM_PS_FLST_BASE); /* the next few have explicit upper bounds */ md->base = t4_read_reg(sc, A_TP_PMM_TX_BASE); md->limit = md->base - 1 + t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE) * G_PMTXMAXPAGE(t4_read_reg(sc, A_TP_PMM_TX_MAX_PAGE)); md++; md->base = t4_read_reg(sc, A_TP_PMM_RX_BASE); md->limit = md->base - 1 + t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE) * G_PMRXMAXPAGE(t4_read_reg(sc, A_TP_PMM_RX_MAX_PAGE)); md++; if (t4_read_reg(sc, A_LE_DB_CONFIG) & F_HASHEN) { hi = t4_read_reg(sc, A_LE_DB_TID_HASHBASE) / 4; md->base = t4_read_reg(sc, A_LE_DB_HASH_TID_BASE); md->limit = (sc->tids.ntids - hi) * 16 + md->base - 1; } else { md->base = 0; md->idx = nitems(region); /* hide it */ } md++; #define ulp_region(reg) \ md->base = t4_read_reg(sc, A_ULP_ ## reg ## _LLIMIT);\ (md++)->limit = t4_read_reg(sc, A_ULP_ ## reg ## _ULIMIT) ulp_region(RX_ISCSI); ulp_region(RX_TDDP); ulp_region(TX_TPT); ulp_region(RX_STAG); ulp_region(RX_RQ); ulp_region(RX_RQUDP); ulp_region(RX_PBL); ulp_region(TX_PBL); #undef ulp_region md->base = 0; md->idx = nitems(region); if (!is_t4(sc) && t4_read_reg(sc, A_SGE_CONTROL2) & F_VFIFO_ENABLE) { md->base = G_BASEADDR(t4_read_reg(sc, A_SGE_DBVFIFO_BADDR)); md->limit = md->base + (G_DBVFIFO_SIZE((t4_read_reg(sc, A_SGE_DBVFIFO_SIZE))) << 2) - 1; } md++; md->base = t4_read_reg(sc, A_ULP_RX_CTX_BASE); md->limit = md->base + sc->tids.ntids - 1; md++; md->base = t4_read_reg(sc, A_ULP_TX_ERR_TABLE_BASE); md->limit = md->base + sc->tids.ntids - 1; md++; md->base = sc->vres.ocq.start; if (sc->vres.ocq.size) md->limit = md->base + sc->vres.ocq.size - 1; else md->idx = nitems(region); /* hide it */ md++; /* add any address-space holes, there can be up to 3 */ for (n = 0; n < i - 1; n++) if (avail[n].limit < avail[n + 1].base) (md++)->base = avail[n].limit; if (avail[n].limit) (md++)->base = avail[n].limit; n = md - mem; qsort(mem, n, sizeof(struct mem_desc), mem_desc_cmp); for (lo = 0; lo < i; lo++) mem_region_show(sb, memory[avail[lo].idx], avail[lo].base, avail[lo].limit - 1); sbuf_printf(sb, "\n"); for (i = 0; i < n; i++) { if (mem[i].idx >= nitems(region)) continue; /* skip holes */ if (!mem[i].limit) mem[i].limit = i < n - 1 ? mem[i + 1].base - 1 : ~0; mem_region_show(sb, region[mem[i].idx], mem[i].base, mem[i].limit); } sbuf_printf(sb, "\n"); lo = t4_read_reg(sc, A_CIM_SDRAM_BASE_ADDR); hi = t4_read_reg(sc, A_CIM_SDRAM_ADDR_SIZE) + lo - 1; mem_region_show(sb, "uP RAM:", lo, hi); lo = t4_read_reg(sc, A_CIM_EXTMEM2_BASE_ADDR); hi = t4_read_reg(sc, A_CIM_EXTMEM2_ADDR_SIZE) + lo - 1; mem_region_show(sb, "uP Extmem2:", lo, hi); lo = t4_read_reg(sc, A_TP_PMM_RX_MAX_PAGE); sbuf_printf(sb, "\n%u Rx pages of size %uKiB for %u channels\n", G_PMRXMAXPAGE(lo), t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE) >> 10, (lo & F_PMRXNUMCHN) ? 2 : 1); lo = t4_read_reg(sc, A_TP_PMM_TX_MAX_PAGE); hi = t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE); sbuf_printf(sb, "%u Tx pages of size %u%ciB for %u channels\n", G_PMTXMAXPAGE(lo), hi >= (1 << 20) ? (hi >> 20) : (hi >> 10), hi >= (1 << 20) ? 'M' : 'K', 1 << G_PMTXNUMCHN(lo)); sbuf_printf(sb, "%u p-structs\n", t4_read_reg(sc, A_TP_CMM_MM_MAX_PSTRUCT)); for (i = 0; i < 4; i++) { lo = t4_read_reg(sc, A_MPS_RX_PG_RSV0 + i * 4); if (is_t4(sc)) { used = G_USED(lo); alloc = G_ALLOC(lo); } else { used = G_T5_USED(lo); alloc = G_T5_ALLOC(lo); } sbuf_printf(sb, "\nPort %d using %u pages out of %u allocated", i, used, alloc); } for (i = 0; i < 4; i++) { lo = t4_read_reg(sc, A_MPS_RX_PG_RSV4 + i * 4); if (is_t4(sc)) { used = G_USED(lo); alloc = G_ALLOC(lo); } else { used = G_T5_USED(lo); alloc = G_T5_ALLOC(lo); } sbuf_printf(sb, "\nLoopback %d using %u pages out of %u allocated", i, used, alloc); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static inline void tcamxy2valmask(uint64_t x, uint64_t y, uint8_t *addr, uint64_t *mask) { *mask = x | y; y = htobe64(y); memcpy(addr, (char *)&y + 2, ETHER_ADDR_LEN); } static int sysctl_mps_tcam(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i, n; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); sbuf_printf(sb, "Idx Ethernet address Mask Vld Ports PF" " VF Replication P0 P1 P2 P3 ML"); n = is_t4(sc) ? NUM_MPS_CLS_SRAM_L_INSTANCES : NUM_MPS_T5_CLS_SRAM_L_INSTANCES; for (i = 0; i < n; i++) { uint64_t tcamx, tcamy, mask; uint32_t cls_lo, cls_hi; uint8_t addr[ETHER_ADDR_LEN]; tcamy = t4_read_reg64(sc, MPS_CLS_TCAM_Y_L(i)); tcamx = t4_read_reg64(sc, MPS_CLS_TCAM_X_L(i)); cls_lo = t4_read_reg(sc, MPS_CLS_SRAM_L(i)); cls_hi = t4_read_reg(sc, MPS_CLS_SRAM_H(i)); if (tcamx & tcamy) continue; tcamxy2valmask(tcamx, tcamy, addr, &mask); sbuf_printf(sb, "\n%3u %02x:%02x:%02x:%02x:%02x:%02x %012jx" " %c %#x%4u%4d", i, addr[0], addr[1], addr[2], addr[3], addr[4], addr[5], (uintmax_t)mask, (cls_lo & F_SRAM_VLD) ? 'Y' : 'N', G_PORTMAP(cls_hi), G_PF(cls_lo), (cls_lo & F_VF_VALID) ? G_VF(cls_lo) : -1); if (cls_lo & F_REPLICATE) { struct fw_ldst_cmd ldst_cmd; memset(&ldst_cmd, 0, sizeof(ldst_cmd)); ldst_cmd.op_to_addrspace = htobe32(V_FW_CMD_OP(FW_LDST_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_READ | V_FW_LDST_CMD_ADDRSPACE(FW_LDST_ADDRSPC_MPS)); ldst_cmd.cycles_to_len16 = htobe32(FW_LEN16(ldst_cmd)); ldst_cmd.u.mps.fid_ctl = htobe16(V_FW_LDST_CMD_FID(FW_LDST_MPS_RPLC) | V_FW_LDST_CMD_CTL(i)); rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4mps"); if (rc) break; rc = -t4_wr_mbox(sc, sc->mbox, &ldst_cmd, sizeof(ldst_cmd), &ldst_cmd); end_synchronized_op(sc, 0); if (rc != 0) { sbuf_printf(sb, " ------------ error %3u ------------", rc); rc = 0; } else { sbuf_printf(sb, " %08x %08x %08x %08x", be32toh(ldst_cmd.u.mps.rplc127_96), be32toh(ldst_cmd.u.mps.rplc95_64), be32toh(ldst_cmd.u.mps.rplc63_32), be32toh(ldst_cmd.u.mps.rplc31_0)); } } else sbuf_printf(sb, "%36s", ""); sbuf_printf(sb, "%4u%3u%3u%3u %#3x", G_SRAM_PRIO0(cls_lo), G_SRAM_PRIO1(cls_lo), G_SRAM_PRIO2(cls_lo), G_SRAM_PRIO3(cls_lo), (cls_lo >> S_MULTILISTEN0) & 0xf); } if (rc) (void) sbuf_finish(sb); else rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_path_mtus(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; uint16_t mtus[NMTUS]; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); t4_read_mtu_tbl(sc, mtus, NULL); sbuf_printf(sb, "%u %u %u %u %u %u %u %u %u %u %u %u %u %u %u %u", mtus[0], mtus[1], mtus[2], mtus[3], mtus[4], mtus[5], mtus[6], mtus[7], mtus[8], mtus[9], mtus[10], mtus[11], mtus[12], mtus[13], mtus[14], mtus[15]); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_pm_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i; uint32_t cnt[PM_NSTATS]; uint64_t cyc[PM_NSTATS]; static const char *rx_stats[] = { "Read:", "Write bypass:", "Write mem:", "Flush:" }; static const char *tx_stats[] = { "Read:", "Write bypass:", "Write mem:", "Bypass + mem:" }; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); t4_pmtx_get_stats(sc, cnt, cyc); sbuf_printf(sb, " Tx pcmds Tx bytes"); for (i = 0; i < ARRAY_SIZE(tx_stats); i++) sbuf_printf(sb, "\n%-13s %10u %20ju", tx_stats[i], cnt[i], cyc[i]); t4_pmrx_get_stats(sc, cnt, cyc); sbuf_printf(sb, "\n Rx pcmds Rx bytes"); for (i = 0; i < ARRAY_SIZE(rx_stats); i++) sbuf_printf(sb, "\n%-13s %10u %20ju", rx_stats[i], cnt[i], cyc[i]); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_rdma_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tp_rdma_stats stats; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); t4_tp_get_rdma_stats(sc, &stats); sbuf_printf(sb, "NoRQEModDefferals: %u\n", stats.rqe_dfr_mod); sbuf_printf(sb, "NoRQEPktDefferals: %u", stats.rqe_dfr_pkt); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_tcp_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tp_tcp_stats v4, v6; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); t4_tp_get_tcp_stats(sc, &v4, &v6); sbuf_printf(sb, " IP IPv6\n"); sbuf_printf(sb, "OutRsts: %20u %20u\n", v4.tcpOutRsts, v6.tcpOutRsts); sbuf_printf(sb, "InSegs: %20ju %20ju\n", v4.tcpInSegs, v6.tcpInSegs); sbuf_printf(sb, "OutSegs: %20ju %20ju\n", v4.tcpOutSegs, v6.tcpOutSegs); sbuf_printf(sb, "RetransSegs: %20ju %20ju", v4.tcpRetransSegs, v6.tcpRetransSegs); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_tids(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tid_info *t = &sc->tids; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); if (t->natids) { sbuf_printf(sb, "ATID range: 0-%u, in use: %u\n", t->natids - 1, t->atids_in_use); } if (t->ntids) { if (t4_read_reg(sc, A_LE_DB_CONFIG) & F_HASHEN) { uint32_t b = t4_read_reg(sc, A_LE_DB_SERVER_INDEX) / 4; if (b) { sbuf_printf(sb, "TID range: 0-%u, %u-%u", b - 1, t4_read_reg(sc, A_LE_DB_TID_HASHBASE) / 4, t->ntids - 1); } else { sbuf_printf(sb, "TID range: %u-%u", t4_read_reg(sc, A_LE_DB_TID_HASHBASE) / 4, t->ntids - 1); } } else sbuf_printf(sb, "TID range: 0-%u", t->ntids - 1); sbuf_printf(sb, ", in use: %u\n", atomic_load_acq_int(&t->tids_in_use)); } if (t->nstids) { sbuf_printf(sb, "STID range: %u-%u, in use: %u\n", t->stid_base, t->stid_base + t->nstids - 1, t->stids_in_use); } if (t->nftids) { sbuf_printf(sb, "FTID range: %u-%u\n", t->ftid_base, t->ftid_base + t->nftids - 1); } if (t->netids) { sbuf_printf(sb, "ETID range: %u-%u\n", t->etid_base, t->etid_base + t->netids - 1); } sbuf_printf(sb, "HW TID usage: %u IP users, %u IPv6 users", t4_read_reg(sc, A_LE_DB_ACT_CNT_IPV4), t4_read_reg(sc, A_LE_DB_ACT_CNT_IPV6)); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_tp_err_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tp_err_stats stats; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); t4_tp_get_err_stats(sc, &stats); sbuf_printf(sb, " channel 0 channel 1 channel 2 " "channel 3\n"); sbuf_printf(sb, "macInErrs: %10u %10u %10u %10u\n", stats.macInErrs[0], stats.macInErrs[1], stats.macInErrs[2], stats.macInErrs[3]); sbuf_printf(sb, "hdrInErrs: %10u %10u %10u %10u\n", stats.hdrInErrs[0], stats.hdrInErrs[1], stats.hdrInErrs[2], stats.hdrInErrs[3]); sbuf_printf(sb, "tcpInErrs: %10u %10u %10u %10u\n", stats.tcpInErrs[0], stats.tcpInErrs[1], stats.tcpInErrs[2], stats.tcpInErrs[3]); sbuf_printf(sb, "tcp6InErrs: %10u %10u %10u %10u\n", stats.tcp6InErrs[0], stats.tcp6InErrs[1], stats.tcp6InErrs[2], stats.tcp6InErrs[3]); sbuf_printf(sb, "tnlCongDrops: %10u %10u %10u %10u\n", stats.tnlCongDrops[0], stats.tnlCongDrops[1], stats.tnlCongDrops[2], stats.tnlCongDrops[3]); sbuf_printf(sb, "tnlTxDrops: %10u %10u %10u %10u\n", stats.tnlTxDrops[0], stats.tnlTxDrops[1], stats.tnlTxDrops[2], stats.tnlTxDrops[3]); sbuf_printf(sb, "ofldVlanDrops: %10u %10u %10u %10u\n", stats.ofldVlanDrops[0], stats.ofldVlanDrops[1], stats.ofldVlanDrops[2], stats.ofldVlanDrops[3]); sbuf_printf(sb, "ofldChanDrops: %10u %10u %10u %10u\n\n", stats.ofldChanDrops[0], stats.ofldChanDrops[1], stats.ofldChanDrops[2], stats.ofldChanDrops[3]); sbuf_printf(sb, "ofldNoNeigh: %u\nofldCongDefer: %u", stats.ofldNoNeigh, stats.ofldCongDefer); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } struct field_desc { const char *name; u_int start; u_int width; }; static void field_desc_show(struct sbuf *sb, uint64_t v, const struct field_desc *f) { char buf[32]; int line_size = 0; while (f->name) { uint64_t mask = (1ULL << f->width) - 1; int len = snprintf(buf, sizeof(buf), "%s: %ju", f->name, ((uintmax_t)v >> f->start) & mask); if (line_size + len >= 79) { line_size = 8; sbuf_printf(sb, "\n "); } sbuf_printf(sb, "%s ", buf); line_size += len + 1; f++; } sbuf_printf(sb, "\n"); } static struct field_desc tp_la0[] = { { "RcfOpCodeOut", 60, 4 }, { "State", 56, 4 }, { "WcfState", 52, 4 }, { "RcfOpcSrcOut", 50, 2 }, { "CRxError", 49, 1 }, { "ERxError", 48, 1 }, { "SanityFailed", 47, 1 }, { "SpuriousMsg", 46, 1 }, { "FlushInputMsg", 45, 1 }, { "FlushInputCpl", 44, 1 }, { "RssUpBit", 43, 1 }, { "RssFilterHit", 42, 1 }, { "Tid", 32, 10 }, { "InitTcb", 31, 1 }, { "LineNumber", 24, 7 }, { "Emsg", 23, 1 }, { "EdataOut", 22, 1 }, { "Cmsg", 21, 1 }, { "CdataOut", 20, 1 }, { "EreadPdu", 19, 1 }, { "CreadPdu", 18, 1 }, { "TunnelPkt", 17, 1 }, { "RcfPeerFin", 16, 1 }, { "RcfReasonOut", 12, 4 }, { "TxCchannel", 10, 2 }, { "RcfTxChannel", 8, 2 }, { "RxEchannel", 6, 2 }, { "RcfRxChannel", 5, 1 }, { "RcfDataOutSrdy", 4, 1 }, { "RxDvld", 3, 1 }, { "RxOoDvld", 2, 1 }, { "RxCongestion", 1, 1 }, { "TxCongestion", 0, 1 }, { NULL } }; static struct field_desc tp_la1[] = { { "CplCmdIn", 56, 8 }, { "CplCmdOut", 48, 8 }, { "ESynOut", 47, 1 }, { "EAckOut", 46, 1 }, { "EFinOut", 45, 1 }, { "ERstOut", 44, 1 }, { "SynIn", 43, 1 }, { "AckIn", 42, 1 }, { "FinIn", 41, 1 }, { "RstIn", 40, 1 }, { "DataIn", 39, 1 }, { "DataInVld", 38, 1 }, { "PadIn", 37, 1 }, { "RxBufEmpty", 36, 1 }, { "RxDdp", 35, 1 }, { "RxFbCongestion", 34, 1 }, { "TxFbCongestion", 33, 1 }, { "TxPktSumSrdy", 32, 1 }, { "RcfUlpType", 28, 4 }, { "Eread", 27, 1 }, { "Ebypass", 26, 1 }, { "Esave", 25, 1 }, { "Static0", 24, 1 }, { "Cread", 23, 1 }, { "Cbypass", 22, 1 }, { "Csave", 21, 1 }, { "CPktOut", 20, 1 }, { "RxPagePoolFull", 18, 2 }, { "RxLpbkPkt", 17, 1 }, { "TxLpbkPkt", 16, 1 }, { "RxVfValid", 15, 1 }, { "SynLearned", 14, 1 }, { "SetDelEntry", 13, 1 }, { "SetInvEntry", 12, 1 }, { "CpcmdDvld", 11, 1 }, { "CpcmdSave", 10, 1 }, { "RxPstructsFull", 8, 2 }, { "EpcmdDvld", 7, 1 }, { "EpcmdFlush", 6, 1 }, { "EpcmdTrimPrefix", 5, 1 }, { "EpcmdTrimPostfix", 4, 1 }, { "ERssIp4Pkt", 3, 1 }, { "ERssIp6Pkt", 2, 1 }, { "ERssTcpUdpPkt", 1, 1 }, { "ERssFceFipPkt", 0, 1 }, { NULL } }; static struct field_desc tp_la2[] = { { "CplCmdIn", 56, 8 }, { "MpsVfVld", 55, 1 }, { "MpsPf", 52, 3 }, { "MpsVf", 44, 8 }, { "SynIn", 43, 1 }, { "AckIn", 42, 1 }, { "FinIn", 41, 1 }, { "RstIn", 40, 1 }, { "DataIn", 39, 1 }, { "DataInVld", 38, 1 }, { "PadIn", 37, 1 }, { "RxBufEmpty", 36, 1 }, { "RxDdp", 35, 1 }, { "RxFbCongestion", 34, 1 }, { "TxFbCongestion", 33, 1 }, { "TxPktSumSrdy", 32, 1 }, { "RcfUlpType", 28, 4 }, { "Eread", 27, 1 }, { "Ebypass", 26, 1 }, { "Esave", 25, 1 }, { "Static0", 24, 1 }, { "Cread", 23, 1 }, { "Cbypass", 22, 1 }, { "Csave", 21, 1 }, { "CPktOut", 20, 1 }, { "RxPagePoolFull", 18, 2 }, { "RxLpbkPkt", 17, 1 }, { "TxLpbkPkt", 16, 1 }, { "RxVfValid", 15, 1 }, { "SynLearned", 14, 1 }, { "SetDelEntry", 13, 1 }, { "SetInvEntry", 12, 1 }, { "CpcmdDvld", 11, 1 }, { "CpcmdSave", 10, 1 }, { "RxPstructsFull", 8, 2 }, { "EpcmdDvld", 7, 1 }, { "EpcmdFlush", 6, 1 }, { "EpcmdTrimPrefix", 5, 1 }, { "EpcmdTrimPostfix", 4, 1 }, { "ERssIp4Pkt", 3, 1 }, { "ERssIp6Pkt", 2, 1 }, { "ERssTcpUdpPkt", 1, 1 }, { "ERssFceFipPkt", 0, 1 }, { NULL } }; static void tp_la_show(struct sbuf *sb, uint64_t *p, int idx) { field_desc_show(sb, *p, tp_la0); } static void tp_la_show2(struct sbuf *sb, uint64_t *p, int idx) { if (idx) sbuf_printf(sb, "\n"); field_desc_show(sb, p[0], tp_la0); if (idx < (TPLA_SIZE / 2 - 1) || p[1] != ~0ULL) field_desc_show(sb, p[1], tp_la0); } static void tp_la_show3(struct sbuf *sb, uint64_t *p, int idx) { if (idx) sbuf_printf(sb, "\n"); field_desc_show(sb, p[0], tp_la0); if (idx < (TPLA_SIZE / 2 - 1) || p[1] != ~0ULL) field_desc_show(sb, p[1], (p[0] & (1 << 17)) ? tp_la2 : tp_la1); } static int sysctl_tp_la(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; uint64_t *buf, *p; int rc; u_int i, inc; void (*show_func)(struct sbuf *, uint64_t *, int); rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); buf = malloc(TPLA_SIZE * sizeof(uint64_t), M_CXGBE, M_ZERO | M_WAITOK); t4_tp_read_la(sc, buf, NULL); p = buf; switch (G_DBGLAMODE(t4_read_reg(sc, A_TP_DBG_LA_CONFIG))) { case 2: inc = 2; show_func = tp_la_show2; break; case 3: inc = 2; show_func = tp_la_show3; break; default: inc = 1; show_func = tp_la_show; } for (i = 0; i < TPLA_SIZE / inc; i++, p += inc) (*show_func)(sb, p, i); rc = sbuf_finish(sb); sbuf_delete(sb); free(buf, M_CXGBE); return (rc); } static int sysctl_tx_rate(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; u64 nrate[NCHAN], orate[NCHAN]; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); t4_get_chan_txrate(sc, nrate, orate); sbuf_printf(sb, " channel 0 channel 1 channel 2 " "channel 3\n"); sbuf_printf(sb, "NIC B/s: %10ju %10ju %10ju %10ju\n", nrate[0], nrate[1], nrate[2], nrate[3]); sbuf_printf(sb, "Offload B/s: %10ju %10ju %10ju %10ju", orate[0], orate[1], orate[2], orate[3]); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_ulprx_la(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; uint32_t *buf, *p; int rc, i; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); buf = malloc(ULPRX_LA_SIZE * 8 * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK); t4_ulprx_read_la(sc, buf); p = buf; sbuf_printf(sb, " Pcmd Type Message" " Data"); for (i = 0; i < ULPRX_LA_SIZE; i++, p += 8) { sbuf_printf(sb, "\n%08x%08x %4x %08x %08x%08x%08x%08x", p[1], p[0], p[2], p[3], p[7], p[6], p[5], p[4]); } rc = sbuf_finish(sb); sbuf_delete(sb); free(buf, M_CXGBE); return (rc); } static int sysctl_wcwr_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, v; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); v = t4_read_reg(sc, A_SGE_STAT_CFG); if (G_STATSOURCE_T5(v) == 7) { if (G_STATMODE(v) == 0) { sbuf_printf(sb, "total %d, incomplete %d", t4_read_reg(sc, A_SGE_STAT_TOTAL), t4_read_reg(sc, A_SGE_STAT_MATCH)); } else if (G_STATMODE(v) == 1) { sbuf_printf(sb, "total %d, data overflow %d", t4_read_reg(sc, A_SGE_STAT_TOTAL), t4_read_reg(sc, A_SGE_STAT_MATCH)); } } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } #endif static inline void txq_start(struct ifnet *ifp, struct sge_txq *txq) { struct buf_ring *br; struct mbuf *m; TXQ_LOCK_ASSERT_OWNED(txq); br = txq->br; m = txq->m ? txq->m : drbr_dequeue(ifp, br); if (m) t4_eth_tx(ifp, txq, m); } void t4_tx_callout(void *arg) { struct sge_eq *eq = arg; struct adapter *sc; if (EQ_TRYLOCK(eq) == 0) goto reschedule; if (eq->flags & EQ_STALLED && !can_resume_tx(eq)) { EQ_UNLOCK(eq); reschedule: if (__predict_true(!(eq->flags && EQ_DOOMED))) callout_schedule(&eq->tx_callout, 1); return; } EQ_LOCK_ASSERT_OWNED(eq); if (__predict_true((eq->flags & EQ_DOOMED) == 0)) { if ((eq->flags & EQ_TYPEMASK) == EQ_ETH) { struct sge_txq *txq = arg; struct port_info *pi = txq->ifp->if_softc; sc = pi->adapter; } else { struct sge_wrq *wrq = arg; sc = wrq->adapter; } taskqueue_enqueue(sc->tq[eq->tx_chan], &eq->tx_task); } EQ_UNLOCK(eq); } void t4_tx_task(void *arg, int count) { struct sge_eq *eq = arg; EQ_LOCK(eq); if ((eq->flags & EQ_TYPEMASK) == EQ_ETH) { struct sge_txq *txq = arg; txq_start(txq->ifp, txq); } else { struct sge_wrq *wrq = arg; t4_wrq_tx_locked(wrq->adapter, wrq, NULL); } EQ_UNLOCK(eq); } static uint32_t fconf_to_mode(uint32_t fconf) { uint32_t mode; mode = T4_FILTER_IPv4 | T4_FILTER_IPv6 | T4_FILTER_IP_SADDR | T4_FILTER_IP_DADDR | T4_FILTER_IP_SPORT | T4_FILTER_IP_DPORT; if (fconf & F_FRAGMENTATION) mode |= T4_FILTER_IP_FRAGMENT; if (fconf & F_MPSHITTYPE) mode |= T4_FILTER_MPS_HIT_TYPE; if (fconf & F_MACMATCH) mode |= T4_FILTER_MAC_IDX; if (fconf & F_ETHERTYPE) mode |= T4_FILTER_ETH_TYPE; if (fconf & F_PROTOCOL) mode |= T4_FILTER_IP_PROTO; if (fconf & F_TOS) mode |= T4_FILTER_IP_TOS; if (fconf & F_VLAN) mode |= T4_FILTER_VLAN; if (fconf & F_VNIC_ID) mode |= T4_FILTER_VNIC; if (fconf & F_PORT) mode |= T4_FILTER_PORT; if (fconf & F_FCOE) mode |= T4_FILTER_FCoE; return (mode); } static uint32_t mode_to_fconf(uint32_t mode) { uint32_t fconf = 0; if (mode & T4_FILTER_IP_FRAGMENT) fconf |= F_FRAGMENTATION; if (mode & T4_FILTER_MPS_HIT_TYPE) fconf |= F_MPSHITTYPE; if (mode & T4_FILTER_MAC_IDX) fconf |= F_MACMATCH; if (mode & T4_FILTER_ETH_TYPE) fconf |= F_ETHERTYPE; if (mode & T4_FILTER_IP_PROTO) fconf |= F_PROTOCOL; if (mode & T4_FILTER_IP_TOS) fconf |= F_TOS; if (mode & T4_FILTER_VLAN) fconf |= F_VLAN; if (mode & T4_FILTER_VNIC) fconf |= F_VNIC_ID; if (mode & T4_FILTER_PORT) fconf |= F_PORT; if (mode & T4_FILTER_FCoE) fconf |= F_FCOE; return (fconf); } static uint32_t fspec_to_fconf(struct t4_filter_specification *fs) { uint32_t fconf = 0; if (fs->val.frag || fs->mask.frag) fconf |= F_FRAGMENTATION; if (fs->val.matchtype || fs->mask.matchtype) fconf |= F_MPSHITTYPE; if (fs->val.macidx || fs->mask.macidx) fconf |= F_MACMATCH; if (fs->val.ethtype || fs->mask.ethtype) fconf |= F_ETHERTYPE; if (fs->val.proto || fs->mask.proto) fconf |= F_PROTOCOL; if (fs->val.tos || fs->mask.tos) fconf |= F_TOS; if (fs->val.vlan_vld || fs->mask.vlan_vld) fconf |= F_VLAN; if (fs->val.vnic_vld || fs->mask.vnic_vld) fconf |= F_VNIC_ID; if (fs->val.iport || fs->mask.iport) fconf |= F_PORT; if (fs->val.fcoe || fs->mask.fcoe) fconf |= F_FCOE; return (fconf); } static int get_filter_mode(struct adapter *sc, uint32_t *mode) { int rc; uint32_t fconf; rc = begin_synchronized_op(sc, NULL, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4getfm"); if (rc) return (rc); t4_read_indirect(sc, A_TP_PIO_ADDR, A_TP_PIO_DATA, &fconf, 1, A_TP_VLAN_PRI_MAP); if (sc->params.tp.vlan_pri_map != fconf) { log(LOG_WARNING, "%s: cached filter mode out of sync %x %x.\n", device_get_nameunit(sc->dev), sc->params.tp.vlan_pri_map, fconf); sc->params.tp.vlan_pri_map = fconf; } *mode = fconf_to_mode(sc->params.tp.vlan_pri_map); end_synchronized_op(sc, LOCK_HELD); return (0); } static int set_filter_mode(struct adapter *sc, uint32_t mode) { uint32_t fconf; int rc; fconf = mode_to_fconf(mode); rc = begin_synchronized_op(sc, NULL, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4setfm"); if (rc) return (rc); if (sc->tids.ftids_in_use > 0) { rc = EBUSY; goto done; } #ifdef TCP_OFFLOAD if (sc->offload_map) { rc = EBUSY; goto done; } #endif #ifdef notyet rc = -t4_set_filter_mode(sc, fconf); if (rc == 0) sc->filter_mode = fconf; #else rc = ENOTSUP; #endif done: end_synchronized_op(sc, LOCK_HELD); return (rc); } static inline uint64_t get_filter_hits(struct adapter *sc, uint32_t fid) { uint32_t mw_base, off, tcb_base = t4_read_reg(sc, A_TP_CMM_TCB_BASE); uint64_t hits; memwin_info(sc, 0, &mw_base, NULL); off = position_memwin(sc, 0, tcb_base + (fid + sc->tids.ftid_base) * TCB_SIZE); if (is_t4(sc)) { hits = t4_read_reg64(sc, mw_base + off + 16); hits = be64toh(hits); } else { hits = t4_read_reg(sc, mw_base + off + 24); hits = be32toh(hits); } return (hits); } static int get_filter(struct adapter *sc, struct t4_filter *t) { int i, rc, nfilters = sc->tids.nftids; struct filter_entry *f; rc = begin_synchronized_op(sc, NULL, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4getf"); if (rc) return (rc); if (sc->tids.ftids_in_use == 0 || sc->tids.ftid_tab == NULL || t->idx >= nfilters) { t->idx = 0xffffffff; goto done; } f = &sc->tids.ftid_tab[t->idx]; for (i = t->idx; i < nfilters; i++, f++) { if (f->valid) { t->idx = i; t->l2tidx = f->l2t ? f->l2t->idx : 0; t->smtidx = f->smtidx; if (f->fs.hitcnts) t->hits = get_filter_hits(sc, t->idx); else t->hits = UINT64_MAX; t->fs = f->fs; goto done; } } t->idx = 0xffffffff; done: end_synchronized_op(sc, LOCK_HELD); return (0); } static int set_filter(struct adapter *sc, struct t4_filter *t) { unsigned int nfilters, nports; struct filter_entry *f; int i, rc; rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4setf"); if (rc) return (rc); nfilters = sc->tids.nftids; nports = sc->params.nports; if (nfilters == 0) { rc = ENOTSUP; goto done; } if (!(sc->flags & FULL_INIT_DONE)) { rc = EAGAIN; goto done; } if (t->idx >= nfilters) { rc = EINVAL; goto done; } /* Validate against the global filter mode */ if ((sc->params.tp.vlan_pri_map | fspec_to_fconf(&t->fs)) != sc->params.tp.vlan_pri_map) { rc = E2BIG; goto done; } if (t->fs.action == FILTER_SWITCH && t->fs.eport >= nports) { rc = EINVAL; goto done; } if (t->fs.val.iport >= nports) { rc = EINVAL; goto done; } /* Can't specify an iq if not steering to it */ if (!t->fs.dirsteer && t->fs.iq) { rc = EINVAL; goto done; } /* IPv6 filter idx must be 4 aligned */ if (t->fs.type == 1 && ((t->idx & 0x3) || t->idx + 4 >= nfilters)) { rc = EINVAL; goto done; } if (sc->tids.ftid_tab == NULL) { KASSERT(sc->tids.ftids_in_use == 0, ("%s: no memory allocated but filters_in_use > 0", __func__)); sc->tids.ftid_tab = malloc(sizeof (struct filter_entry) * nfilters, M_CXGBE, M_NOWAIT | M_ZERO); if (sc->tids.ftid_tab == NULL) { rc = ENOMEM; goto done; } mtx_init(&sc->tids.ftid_lock, "T4 filters", 0, MTX_DEF); } for (i = 0; i < 4; i++) { f = &sc->tids.ftid_tab[t->idx + i]; if (f->pending || f->valid) { rc = EBUSY; goto done; } if (f->locked) { rc = EPERM; goto done; } if (t->fs.type == 0) break; } f = &sc->tids.ftid_tab[t->idx]; f->fs = t->fs; rc = set_filter_wr(sc, t->idx); done: end_synchronized_op(sc, 0); if (rc == 0) { mtx_lock(&sc->tids.ftid_lock); for (;;) { if (f->pending == 0) { rc = f->valid ? 0 : EIO; break; } if (mtx_sleep(&sc->tids.ftid_tab, &sc->tids.ftid_lock, PCATCH, "t4setfw", 0)) { rc = EINPROGRESS; break; } } mtx_unlock(&sc->tids.ftid_lock); } return (rc); } static int del_filter(struct adapter *sc, struct t4_filter *t) { unsigned int nfilters; struct filter_entry *f; int rc; rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4delf"); if (rc) return (rc); nfilters = sc->tids.nftids; if (nfilters == 0) { rc = ENOTSUP; goto done; } if (sc->tids.ftid_tab == NULL || sc->tids.ftids_in_use == 0 || t->idx >= nfilters) { rc = EINVAL; goto done; } if (!(sc->flags & FULL_INIT_DONE)) { rc = EAGAIN; goto done; } f = &sc->tids.ftid_tab[t->idx]; if (f->pending) { rc = EBUSY; goto done; } if (f->locked) { rc = EPERM; goto done; } if (f->valid) { t->fs = f->fs; /* extra info for the caller */ rc = del_filter_wr(sc, t->idx); } done: end_synchronized_op(sc, 0); if (rc == 0) { mtx_lock(&sc->tids.ftid_lock); for (;;) { if (f->pending == 0) { rc = f->valid ? EIO : 0; break; } if (mtx_sleep(&sc->tids.ftid_tab, &sc->tids.ftid_lock, PCATCH, "t4delfw", 0)) { rc = EINPROGRESS; break; } } mtx_unlock(&sc->tids.ftid_lock); } return (rc); } static void clear_filter(struct filter_entry *f) { if (f->l2t) t4_l2t_release(f->l2t); bzero(f, sizeof (*f)); } static int set_filter_wr(struct adapter *sc, int fidx) { struct filter_entry *f = &sc->tids.ftid_tab[fidx]; struct wrqe *wr; struct fw_filter_wr *fwr; unsigned int ftid; ASSERT_SYNCHRONIZED_OP(sc); if (f->fs.newdmac || f->fs.newvlan) { /* This filter needs an L2T entry; allocate one. */ f->l2t = t4_l2t_alloc_switching(sc->l2t); if (f->l2t == NULL) return (EAGAIN); if (t4_l2t_set_switching(sc, f->l2t, f->fs.vlan, f->fs.eport, f->fs.dmac)) { t4_l2t_release(f->l2t); f->l2t = NULL; return (ENOMEM); } } ftid = sc->tids.ftid_base + fidx; wr = alloc_wrqe(sizeof(*fwr), &sc->sge.mgmtq); if (wr == NULL) return (ENOMEM); fwr = wrtod(wr); bzero(fwr, sizeof (*fwr)); fwr->op_pkd = htobe32(V_FW_WR_OP(FW_FILTER_WR)); fwr->len16_pkd = htobe32(FW_LEN16(*fwr)); fwr->tid_to_iq = htobe32(V_FW_FILTER_WR_TID(ftid) | V_FW_FILTER_WR_RQTYPE(f->fs.type) | V_FW_FILTER_WR_NOREPLY(0) | V_FW_FILTER_WR_IQ(f->fs.iq)); fwr->del_filter_to_l2tix = htobe32(V_FW_FILTER_WR_RPTTID(f->fs.rpttid) | V_FW_FILTER_WR_DROP(f->fs.action == FILTER_DROP) | V_FW_FILTER_WR_DIRSTEER(f->fs.dirsteer) | V_FW_FILTER_WR_MASKHASH(f->fs.maskhash) | V_FW_FILTER_WR_DIRSTEERHASH(f->fs.dirsteerhash) | V_FW_FILTER_WR_LPBK(f->fs.action == FILTER_SWITCH) | V_FW_FILTER_WR_DMAC(f->fs.newdmac) | V_FW_FILTER_WR_SMAC(f->fs.newsmac) | V_FW_FILTER_WR_INSVLAN(f->fs.newvlan == VLAN_INSERT || f->fs.newvlan == VLAN_REWRITE) | V_FW_FILTER_WR_RMVLAN(f->fs.newvlan == VLAN_REMOVE || f->fs.newvlan == VLAN_REWRITE) | V_FW_FILTER_WR_HITCNTS(f->fs.hitcnts) | V_FW_FILTER_WR_TXCHAN(f->fs.eport) | V_FW_FILTER_WR_PRIO(f->fs.prio) | V_FW_FILTER_WR_L2TIX(f->l2t ? f->l2t->idx : 0)); fwr->ethtype = htobe16(f->fs.val.ethtype); fwr->ethtypem = htobe16(f->fs.mask.ethtype); fwr->frag_to_ovlan_vldm = (V_FW_FILTER_WR_FRAG(f->fs.val.frag) | V_FW_FILTER_WR_FRAGM(f->fs.mask.frag) | V_FW_FILTER_WR_IVLAN_VLD(f->fs.val.vlan_vld) | V_FW_FILTER_WR_OVLAN_VLD(f->fs.val.vnic_vld) | V_FW_FILTER_WR_IVLAN_VLDM(f->fs.mask.vlan_vld) | V_FW_FILTER_WR_OVLAN_VLDM(f->fs.mask.vnic_vld)); fwr->smac_sel = 0; fwr->rx_chan_rx_rpl_iq = htobe16(V_FW_FILTER_WR_RX_CHAN(0) | V_FW_FILTER_WR_RX_RPL_IQ(sc->sge.fwq.abs_id)); fwr->maci_to_matchtypem = htobe32(V_FW_FILTER_WR_MACI(f->fs.val.macidx) | V_FW_FILTER_WR_MACIM(f->fs.mask.macidx) | V_FW_FILTER_WR_FCOE(f->fs.val.fcoe) | V_FW_FILTER_WR_FCOEM(f->fs.mask.fcoe) | V_FW_FILTER_WR_PORT(f->fs.val.iport) | V_FW_FILTER_WR_PORTM(f->fs.mask.iport) | V_FW_FILTER_WR_MATCHTYPE(f->fs.val.matchtype) | V_FW_FILTER_WR_MATCHTYPEM(f->fs.mask.matchtype)); fwr->ptcl = f->fs.val.proto; fwr->ptclm = f->fs.mask.proto; fwr->ttyp = f->fs.val.tos; fwr->ttypm = f->fs.mask.tos; fwr->ivlan = htobe16(f->fs.val.vlan); fwr->ivlanm = htobe16(f->fs.mask.vlan); fwr->ovlan = htobe16(f->fs.val.vnic); fwr->ovlanm = htobe16(f->fs.mask.vnic); bcopy(f->fs.val.dip, fwr->lip, sizeof (fwr->lip)); bcopy(f->fs.mask.dip, fwr->lipm, sizeof (fwr->lipm)); bcopy(f->fs.val.sip, fwr->fip, sizeof (fwr->fip)); bcopy(f->fs.mask.sip, fwr->fipm, sizeof (fwr->fipm)); fwr->lp = htobe16(f->fs.val.dport); fwr->lpm = htobe16(f->fs.mask.dport); fwr->fp = htobe16(f->fs.val.sport); fwr->fpm = htobe16(f->fs.mask.sport); if (f->fs.newsmac) bcopy(f->fs.smac, fwr->sma, sizeof (fwr->sma)); f->pending = 1; sc->tids.ftids_in_use++; t4_wrq_tx(sc, wr); return (0); } static int del_filter_wr(struct adapter *sc, int fidx) { struct filter_entry *f = &sc->tids.ftid_tab[fidx]; struct wrqe *wr; struct fw_filter_wr *fwr; unsigned int ftid; ftid = sc->tids.ftid_base + fidx; wr = alloc_wrqe(sizeof(*fwr), &sc->sge.mgmtq); if (wr == NULL) return (ENOMEM); fwr = wrtod(wr); bzero(fwr, sizeof (*fwr)); t4_mk_filtdelwr(ftid, fwr, sc->sge.fwq.abs_id); f->pending = 1; t4_wrq_tx(sc, wr); return (0); } int t4_filter_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_set_tcb_rpl *rpl = (const void *)(rss + 1); unsigned int idx = GET_TID(rpl); unsigned int rc; struct filter_entry *f; KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__, rss->opcode)); if (is_ftid(sc, idx)) { idx -= sc->tids.ftid_base; f = &sc->tids.ftid_tab[idx]; rc = G_COOKIE(rpl->cookie); mtx_lock(&sc->tids.ftid_lock); if (rc == FW_FILTER_WR_FLT_ADDED) { KASSERT(f->pending, ("%s: filter[%u] isn't pending.", __func__, idx)); f->smtidx = (be64toh(rpl->oldval) >> 24) & 0xff; f->pending = 0; /* asynchronous setup completed */ f->valid = 1; } else { if (rc != FW_FILTER_WR_FLT_DELETED) { /* Add or delete failed, display an error */ log(LOG_ERR, "filter %u setup failed with error %u\n", idx, rc); } clear_filter(f); sc->tids.ftids_in_use--; } wakeup(&sc->tids.ftid_tab); mtx_unlock(&sc->tids.ftid_lock); } return (0); } static int get_sge_context(struct adapter *sc, struct t4_sge_context *cntxt) { int rc; if (cntxt->cid > M_CTXTQID) return (EINVAL); if (cntxt->mem_id != CTXT_EGRESS && cntxt->mem_id != CTXT_INGRESS && cntxt->mem_id != CTXT_FLM && cntxt->mem_id != CTXT_CNM) return (EINVAL); rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4ctxt"); if (rc) return (rc); if (sc->flags & FW_OK) { rc = -t4_sge_ctxt_rd(sc, sc->mbox, cntxt->cid, cntxt->mem_id, &cntxt->data[0]); if (rc == 0) goto done; } /* * Read via firmware failed or wasn't even attempted. Read directly via * the backdoor. */ rc = -t4_sge_ctxt_rd_bd(sc, cntxt->cid, cntxt->mem_id, &cntxt->data[0]); done: end_synchronized_op(sc, 0); return (rc); } static int load_fw(struct adapter *sc, struct t4_data *fw) { int rc; uint8_t *fw_data; rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4ldfw"); if (rc) return (rc); if (sc->flags & FULL_INIT_DONE) { rc = EBUSY; goto done; } fw_data = malloc(fw->len, M_CXGBE, M_WAITOK); if (fw_data == NULL) { rc = ENOMEM; goto done; } rc = copyin(fw->data, fw_data, fw->len); if (rc == 0) rc = -t4_load_fw(sc, fw_data, fw->len); free(fw_data, M_CXGBE); done: end_synchronized_op(sc, 0); return (rc); } static int read_card_mem(struct adapter *sc, int win, struct t4_mem_range *mr) { uint32_t addr, off, remaining, i, n; uint32_t *buf, *b; uint32_t mw_base, mw_aperture; int rc; uint8_t *dst; rc = validate_mem_range(sc, mr->addr, mr->len); if (rc != 0) return (rc); memwin_info(sc, win, &mw_base, &mw_aperture); buf = b = malloc(min(mr->len, mw_aperture), M_CXGBE, M_WAITOK); addr = mr->addr; remaining = mr->len; dst = (void *)mr->data; while (remaining) { off = position_memwin(sc, win, addr); /* number of bytes that we'll copy in the inner loop */ n = min(remaining, mw_aperture - off); for (i = 0; i < n; i += 4) *b++ = t4_read_reg(sc, mw_base + off + i); rc = copyout(buf, dst, n); if (rc != 0) break; b = buf; dst += n; remaining -= n; addr += n; } free(buf, M_CXGBE); return (rc); } static int read_i2c(struct adapter *sc, struct t4_i2c_data *i2cd) { int rc; if (i2cd->len == 0 || i2cd->port_id >= sc->params.nports) return (EINVAL); if (i2cd->len > sizeof(i2cd->data)) return (EFBIG); rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4i2crd"); if (rc) return (rc); rc = -t4_i2c_rd(sc, sc->mbox, i2cd->port_id, i2cd->dev_addr, i2cd->offset, i2cd->len, &i2cd->data[0]); end_synchronized_op(sc, 0); return (rc); } static int in_range(int val, int lo, int hi) { return (val < 0 || (val <= hi && val >= lo)); } static int set_sched_class(struct adapter *sc, struct t4_sched_params *p) { int fw_subcmd, fw_type, rc; rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4setsc"); if (rc) return (rc); if (!(sc->flags & FULL_INIT_DONE)) { rc = EAGAIN; goto done; } /* * Translate the cxgbetool parameters into T4 firmware parameters. (The * sub-command and type are in common locations.) */ if (p->subcmd == SCHED_CLASS_SUBCMD_CONFIG) fw_subcmd = FW_SCHED_SC_CONFIG; else if (p->subcmd == SCHED_CLASS_SUBCMD_PARAMS) fw_subcmd = FW_SCHED_SC_PARAMS; else { rc = EINVAL; goto done; } if (p->type == SCHED_CLASS_TYPE_PACKET) fw_type = FW_SCHED_TYPE_PKTSCHED; else { rc = EINVAL; goto done; } if (fw_subcmd == FW_SCHED_SC_CONFIG) { /* Vet our parameters ..*/ if (p->u.config.minmax < 0) { rc = EINVAL; goto done; } /* And pass the request to the firmware ...*/ rc = -t4_sched_config(sc, fw_type, p->u.config.minmax, 1); goto done; } if (fw_subcmd == FW_SCHED_SC_PARAMS) { int fw_level; int fw_mode; int fw_rateunit; int fw_ratemode; if (p->u.params.level == SCHED_CLASS_LEVEL_CL_RL) fw_level = FW_SCHED_PARAMS_LEVEL_CL_RL; else if (p->u.params.level == SCHED_CLASS_LEVEL_CL_WRR) fw_level = FW_SCHED_PARAMS_LEVEL_CL_WRR; else if (p->u.params.level == SCHED_CLASS_LEVEL_CH_RL) fw_level = FW_SCHED_PARAMS_LEVEL_CH_RL; else { rc = EINVAL; goto done; } if (p->u.params.mode == SCHED_CLASS_MODE_CLASS) fw_mode = FW_SCHED_PARAMS_MODE_CLASS; else if (p->u.params.mode == SCHED_CLASS_MODE_FLOW) fw_mode = FW_SCHED_PARAMS_MODE_FLOW; else { rc = EINVAL; goto done; } if (p->u.params.rateunit == SCHED_CLASS_RATEUNIT_BITS) fw_rateunit = FW_SCHED_PARAMS_UNIT_BITRATE; else if (p->u.params.rateunit == SCHED_CLASS_RATEUNIT_PKTS) fw_rateunit = FW_SCHED_PARAMS_UNIT_PKTRATE; else { rc = EINVAL; goto done; } if (p->u.params.ratemode == SCHED_CLASS_RATEMODE_REL) fw_ratemode = FW_SCHED_PARAMS_RATE_REL; else if (p->u.params.ratemode == SCHED_CLASS_RATEMODE_ABS) fw_ratemode = FW_SCHED_PARAMS_RATE_ABS; else { rc = EINVAL; goto done; } /* Vet our parameters ... */ if (!in_range(p->u.params.channel, 0, 3) || !in_range(p->u.params.cl, 0, is_t4(sc) ? 15 : 16) || !in_range(p->u.params.minrate, 0, 10000000) || !in_range(p->u.params.maxrate, 0, 10000000) || !in_range(p->u.params.weight, 0, 100)) { rc = ERANGE; goto done; } /* * Translate any unset parameters into the firmware's * nomenclature and/or fail the call if the parameters * are required ... */ if (p->u.params.rateunit < 0 || p->u.params.ratemode < 0 || p->u.params.channel < 0 || p->u.params.cl < 0) { rc = EINVAL; goto done; } if (p->u.params.minrate < 0) p->u.params.minrate = 0; if (p->u.params.maxrate < 0) { if (p->u.params.level == SCHED_CLASS_LEVEL_CL_RL || p->u.params.level == SCHED_CLASS_LEVEL_CH_RL) { rc = EINVAL; goto done; } else p->u.params.maxrate = 0; } if (p->u.params.weight < 0) { if (p->u.params.level == SCHED_CLASS_LEVEL_CL_WRR) { rc = EINVAL; goto done; } else p->u.params.weight = 0; } if (p->u.params.pktsize < 0) { if (p->u.params.level == SCHED_CLASS_LEVEL_CL_RL || p->u.params.level == SCHED_CLASS_LEVEL_CH_RL) { rc = EINVAL; goto done; } else p->u.params.pktsize = 0; } /* See what the firmware thinks of the request ... */ rc = -t4_sched_params(sc, fw_type, fw_level, fw_mode, fw_rateunit, fw_ratemode, p->u.params.channel, p->u.params.cl, p->u.params.minrate, p->u.params.maxrate, p->u.params.weight, p->u.params.pktsize, 1); goto done; } rc = EINVAL; done: end_synchronized_op(sc, 0); return (rc); } static int set_sched_queue(struct adapter *sc, struct t4_sched_queue *p) { struct port_info *pi = NULL; struct sge_txq *txq; uint32_t fw_mnem, fw_queue, fw_class; int i, rc; rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4setsq"); if (rc) return (rc); if (!(sc->flags & FULL_INIT_DONE)) { rc = EAGAIN; goto done; } if (p->port >= sc->params.nports) { rc = EINVAL; goto done; } pi = sc->port[p->port]; if (!in_range(p->queue, 0, pi->ntxq - 1) || !in_range(p->cl, 0, 7)) { rc = EINVAL; goto done; } /* * Create a template for the FW_PARAMS_CMD mnemonic and value (TX * Scheduling Class in this case). */ fw_mnem = (V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) | V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_EQ_SCHEDCLASS_ETH)); fw_class = p->cl < 0 ? 0xffffffff : p->cl; /* * If op.queue is non-negative, then we're only changing the scheduling * on a single specified TX queue. */ if (p->queue >= 0) { txq = &sc->sge.txq[pi->first_txq + p->queue]; fw_queue = (fw_mnem | V_FW_PARAMS_PARAM_YZ(txq->eq.cntxt_id)); rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &fw_queue, &fw_class); goto done; } /* * Change the scheduling on all the TX queues for the * interface. */ for_each_txq(pi, i, txq) { fw_queue = (fw_mnem | V_FW_PARAMS_PARAM_YZ(txq->eq.cntxt_id)); rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &fw_queue, &fw_class); if (rc) goto done; } rc = 0; done: end_synchronized_op(sc, 0); return (rc); } int t4_os_find_pci_capability(struct adapter *sc, int cap) { int i; return (pci_find_cap(sc->dev, cap, &i) == 0 ? i : 0); } int t4_os_pci_save_state(struct adapter *sc) { device_t dev; struct pci_devinfo *dinfo; dev = sc->dev; dinfo = device_get_ivars(dev); pci_cfg_save(dev, dinfo, 0); return (0); } int t4_os_pci_restore_state(struct adapter *sc) { device_t dev; struct pci_devinfo *dinfo; dev = sc->dev; dinfo = device_get_ivars(dev); pci_cfg_restore(dev, dinfo); return (0); } void t4_os_portmod_changed(const struct adapter *sc, int idx) { struct port_info *pi = sc->port[idx]; static const char *mod_str[] = { NULL, "LR", "SR", "ER", "TWINAX", "active TWINAX", "LRM" }; if (pi->mod_type == FW_PORT_MOD_TYPE_NONE) if_printf(pi->ifp, "transceiver unplugged.\n"); else if (pi->mod_type == FW_PORT_MOD_TYPE_UNKNOWN) if_printf(pi->ifp, "unknown transceiver inserted.\n"); else if (pi->mod_type == FW_PORT_MOD_TYPE_NOTSUPPORTED) if_printf(pi->ifp, "unsupported transceiver inserted.\n"); else if (pi->mod_type > 0 && pi->mod_type < nitems(mod_str)) { if_printf(pi->ifp, "%s transceiver inserted.\n", mod_str[pi->mod_type]); } else { if_printf(pi->ifp, "transceiver (type %d) inserted.\n", pi->mod_type); } } void t4_os_link_changed(struct adapter *sc, int idx, int link_stat, int reason) { struct port_info *pi = sc->port[idx]; struct ifnet *ifp = pi->ifp; if (link_stat) { pi->linkdnrc = -1; ifp->if_baudrate = IF_Mbps(pi->link_cfg.speed); if_link_state_change(ifp, LINK_STATE_UP); } else { if (reason >= 0) pi->linkdnrc = reason; if_link_state_change(ifp, LINK_STATE_DOWN); } } void t4_iterate(void (*func)(struct adapter *, void *), void *arg) { struct adapter *sc; sx_slock(&t4_list_lock); SLIST_FOREACH(sc, &t4_list, link) { /* * func should not make any assumptions about what state sc is * in - the only guarantee is that sc->sc_lock is a valid lock. */ func(sc, arg); } sx_sunlock(&t4_list_lock); } static int t4_open(struct cdev *dev, int flags, int type, struct thread *td) { return (0); } static int t4_close(struct cdev *dev, int flags, int type, struct thread *td) { return (0); } static int t4_ioctl(struct cdev *dev, unsigned long cmd, caddr_t data, int fflag, struct thread *td) { int rc; struct adapter *sc = dev->si_drv1; rc = priv_check(td, PRIV_DRIVER); if (rc != 0) return (rc); switch (cmd) { case CHELSIO_T4_GETREG: { struct t4_reg *edata = (struct t4_reg *)data; if ((edata->addr & 0x3) != 0 || edata->addr >= sc->mmio_len) return (EFAULT); if (edata->size == 4) edata->val = t4_read_reg(sc, edata->addr); else if (edata->size == 8) edata->val = t4_read_reg64(sc, edata->addr); else return (EINVAL); break; } case CHELSIO_T4_SETREG: { struct t4_reg *edata = (struct t4_reg *)data; if ((edata->addr & 0x3) != 0 || edata->addr >= sc->mmio_len) return (EFAULT); if (edata->size == 4) { if (edata->val & 0xffffffff00000000) return (EINVAL); t4_write_reg(sc, edata->addr, (uint32_t) edata->val); } else if (edata->size == 8) t4_write_reg64(sc, edata->addr, edata->val); else return (EINVAL); break; } case CHELSIO_T4_REGDUMP: { struct t4_regdump *regs = (struct t4_regdump *)data; int reglen = is_t4(sc) ? T4_REGDUMP_SIZE : T5_REGDUMP_SIZE; uint8_t *buf; if (regs->len < reglen) { regs->len = reglen; /* hint to the caller */ return (ENOBUFS); } regs->len = reglen; buf = malloc(reglen, M_CXGBE, M_WAITOK | M_ZERO); t4_get_regs(sc, regs, buf); rc = copyout(buf, regs->data, reglen); free(buf, M_CXGBE); break; } case CHELSIO_T4_GET_FILTER_MODE: rc = get_filter_mode(sc, (uint32_t *)data); break; case CHELSIO_T4_SET_FILTER_MODE: rc = set_filter_mode(sc, *(uint32_t *)data); break; case CHELSIO_T4_GET_FILTER: rc = get_filter(sc, (struct t4_filter *)data); break; case CHELSIO_T4_SET_FILTER: rc = set_filter(sc, (struct t4_filter *)data); break; case CHELSIO_T4_DEL_FILTER: rc = del_filter(sc, (struct t4_filter *)data); break; case CHELSIO_T4_GET_SGE_CONTEXT: rc = get_sge_context(sc, (struct t4_sge_context *)data); break; case CHELSIO_T4_LOAD_FW: rc = load_fw(sc, (struct t4_data *)data); break; case CHELSIO_T4_GET_MEM: rc = read_card_mem(sc, 2, (struct t4_mem_range *)data); break; case CHELSIO_T4_GET_I2C: rc = read_i2c(sc, (struct t4_i2c_data *)data); break; case CHELSIO_T4_CLEAR_STATS: { int i; u_int port_id = *(uint32_t *)data; struct port_info *pi; if (port_id >= sc->params.nports) return (EINVAL); pi = sc->port[port_id]; /* MAC stats */ t4_clr_port_stats(sc, pi->tx_chan); if (pi->flags & PORT_INIT_DONE) { struct sge_rxq *rxq; struct sge_txq *txq; struct sge_wrq *wrq; for_each_rxq(pi, i, rxq) { #if defined(INET) || defined(INET6) rxq->lro.lro_queued = 0; rxq->lro.lro_flushed = 0; #endif rxq->rxcsum = 0; rxq->vlan_extraction = 0; } for_each_txq(pi, i, txq) { txq->txcsum = 0; txq->tso_wrs = 0; txq->vlan_insertion = 0; txq->imm_wrs = 0; txq->sgl_wrs = 0; txq->txpkt_wrs = 0; txq->txpkts_wrs = 0; txq->txpkts_pkts = 0; txq->br->br_drops = 0; txq->no_dmamap = 0; txq->no_desc = 0; } #ifdef TCP_OFFLOAD /* nothing to clear for each ofld_rxq */ for_each_ofld_txq(pi, i, wrq) { wrq->tx_wrs = 0; wrq->no_desc = 0; } #endif wrq = &sc->sge.ctrlq[pi->port_id]; wrq->tx_wrs = 0; wrq->no_desc = 0; } break; } case CHELSIO_T4_SCHED_CLASS: rc = set_sched_class(sc, (struct t4_sched_params *)data); break; case CHELSIO_T4_SCHED_QUEUE: rc = set_sched_queue(sc, (struct t4_sched_queue *)data); break; case CHELSIO_T4_GET_TRACER: rc = t4_get_tracer(sc, (struct t4_tracer *)data); break; case CHELSIO_T4_SET_TRACER: rc = t4_set_tracer(sc, (struct t4_tracer *)data); break; default: rc = EINVAL; } return (rc); } #ifdef TCP_OFFLOAD void t4_iscsi_init(struct ifnet *ifp, unsigned int tag_mask, const unsigned int *pgsz_order) { struct port_info *pi = ifp->if_softc; struct adapter *sc = pi->adapter; t4_write_reg(sc, A_ULP_RX_ISCSI_TAGMASK, tag_mask); t4_write_reg(sc, A_ULP_RX_ISCSI_PSZ, V_HPZ0(pgsz_order[0]) | V_HPZ1(pgsz_order[1]) | V_HPZ2(pgsz_order[2]) | V_HPZ3(pgsz_order[3])); } static int toe_capability(struct port_info *pi, int enable) { int rc; struct adapter *sc = pi->adapter; ASSERT_SYNCHRONIZED_OP(sc); if (!is_offload(sc)) return (ENODEV); if (enable) { if (!(sc->flags & FULL_INIT_DONE)) { rc = cxgbe_init_synchronized(pi); if (rc) return (rc); } if (isset(&sc->offload_map, pi->port_id)) return (0); if (!(sc->flags & TOM_INIT_DONE)) { rc = t4_activate_uld(sc, ULD_TOM); if (rc == EAGAIN) { log(LOG_WARNING, "You must kldload t4_tom.ko before trying " "to enable TOE on a cxgbe interface.\n"); } if (rc != 0) return (rc); KASSERT(sc->tom_softc != NULL, ("%s: TOM activated but softc NULL", __func__)); KASSERT(sc->flags & TOM_INIT_DONE, ("%s: TOM activated but flag not set", __func__)); } setbit(&sc->offload_map, pi->port_id); } else { if (!isset(&sc->offload_map, pi->port_id)) return (0); KASSERT(sc->flags & TOM_INIT_DONE, ("%s: TOM never initialized?", __func__)); clrbit(&sc->offload_map, pi->port_id); } return (0); } /* * Add an upper layer driver to the global list. */ int t4_register_uld(struct uld_info *ui) { int rc = 0; struct uld_info *u; sx_xlock(&t4_uld_list_lock); SLIST_FOREACH(u, &t4_uld_list, link) { if (u->uld_id == ui->uld_id) { rc = EEXIST; goto done; } } SLIST_INSERT_HEAD(&t4_uld_list, ui, link); ui->refcount = 0; done: sx_xunlock(&t4_uld_list_lock); return (rc); } int t4_unregister_uld(struct uld_info *ui) { int rc = EINVAL; struct uld_info *u; sx_xlock(&t4_uld_list_lock); SLIST_FOREACH(u, &t4_uld_list, link) { if (u == ui) { if (ui->refcount > 0) { rc = EBUSY; goto done; } SLIST_REMOVE(&t4_uld_list, ui, uld_info, link); rc = 0; goto done; } } done: sx_xunlock(&t4_uld_list_lock); return (rc); } int t4_activate_uld(struct adapter *sc, int id) { int rc = EAGAIN; struct uld_info *ui; ASSERT_SYNCHRONIZED_OP(sc); sx_slock(&t4_uld_list_lock); SLIST_FOREACH(ui, &t4_uld_list, link) { if (ui->uld_id == id) { rc = ui->activate(sc); if (rc == 0) ui->refcount++; goto done; } } done: sx_sunlock(&t4_uld_list_lock); return (rc); } int t4_deactivate_uld(struct adapter *sc, int id) { int rc = EINVAL; struct uld_info *ui; ASSERT_SYNCHRONIZED_OP(sc); sx_slock(&t4_uld_list_lock); SLIST_FOREACH(ui, &t4_uld_list, link) { if (ui->uld_id == id) { rc = ui->deactivate(sc); if (rc == 0) ui->refcount--; goto done; } } done: sx_sunlock(&t4_uld_list_lock); return (rc); } #endif /* * Come up with reasonable defaults for some of the tunables, provided they're * not set by the user (in which case we'll use the values as is). */ static void tweak_tunables(void) { int nc = mp_ncpus; /* our snapshot of the number of CPUs */ if (t4_ntxq10g < 1) t4_ntxq10g = min(nc, NTXQ_10G); if (t4_ntxq1g < 1) t4_ntxq1g = min(nc, NTXQ_1G); if (t4_nrxq10g < 1) t4_nrxq10g = min(nc, NRXQ_10G); if (t4_nrxq1g < 1) t4_nrxq1g = min(nc, NRXQ_1G); #ifdef TCP_OFFLOAD if (t4_nofldtxq10g < 1) t4_nofldtxq10g = min(nc, NOFLDTXQ_10G); if (t4_nofldtxq1g < 1) t4_nofldtxq1g = min(nc, NOFLDTXQ_1G); if (t4_nofldrxq10g < 1) t4_nofldrxq10g = min(nc, NOFLDRXQ_10G); if (t4_nofldrxq1g < 1) t4_nofldrxq1g = min(nc, NOFLDRXQ_1G); if (t4_toecaps_allowed == -1) t4_toecaps_allowed = FW_CAPS_CONFIG_TOE; #else if (t4_toecaps_allowed == -1) t4_toecaps_allowed = 0; #endif #ifdef DEV_NETMAP if (t4_nnmtxq10g < 1) t4_nnmtxq10g = min(nc, NNMTXQ_10G); if (t4_nnmtxq1g < 1) t4_nnmtxq1g = min(nc, NNMTXQ_1G); if (t4_nnmrxq10g < 1) t4_nnmrxq10g = min(nc, NNMRXQ_10G); if (t4_nnmrxq1g < 1) t4_nnmrxq1g = min(nc, NNMRXQ_1G); #endif if (t4_tmr_idx_10g < 0 || t4_tmr_idx_10g >= SGE_NTIMERS) t4_tmr_idx_10g = TMR_IDX_10G; if (t4_pktc_idx_10g < -1 || t4_pktc_idx_10g >= SGE_NCOUNTERS) t4_pktc_idx_10g = PKTC_IDX_10G; if (t4_tmr_idx_1g < 0 || t4_tmr_idx_1g >= SGE_NTIMERS) t4_tmr_idx_1g = TMR_IDX_1G; if (t4_pktc_idx_1g < -1 || t4_pktc_idx_1g >= SGE_NCOUNTERS) t4_pktc_idx_1g = PKTC_IDX_1G; if (t4_qsize_txq < 128) t4_qsize_txq = 128; if (t4_qsize_rxq < 128) t4_qsize_rxq = 128; while (t4_qsize_rxq & 7) t4_qsize_rxq++; t4_intr_types &= INTR_MSIX | INTR_MSI | INTR_INTX; } static struct sx mlu; /* mod load unload */ SX_SYSINIT(cxgbe_mlu, &mlu, "cxgbe mod load/unload"); static int mod_event(module_t mod, int cmd, void *arg) { int rc = 0; static int loaded = 0; switch (cmd) { case MOD_LOAD: sx_xlock(&mlu); if (loaded++ == 0) { t4_sge_modload(); sx_init(&t4_list_lock, "T4/T5 adapters"); SLIST_INIT(&t4_list); #ifdef TCP_OFFLOAD sx_init(&t4_uld_list_lock, "T4/T5 ULDs"); SLIST_INIT(&t4_uld_list); #endif t4_tracer_modload(); tweak_tunables(); } sx_xunlock(&mlu); break; case MOD_UNLOAD: sx_xlock(&mlu); if (--loaded == 0) { int tries; sx_slock(&t4_list_lock); if (!SLIST_EMPTY(&t4_list)) { rc = EBUSY; sx_sunlock(&t4_list_lock); goto done_unload; } #ifdef TCP_OFFLOAD sx_slock(&t4_uld_list_lock); if (!SLIST_EMPTY(&t4_uld_list)) { rc = EBUSY; sx_sunlock(&t4_uld_list_lock); sx_sunlock(&t4_list_lock); goto done_unload; } #endif tries = 0; while (tries++ < 5 && t4_sge_extfree_refs() != 0) { uprintf("%ju clusters with custom free routine " "still is use.\n", t4_sge_extfree_refs()); pause("t4unload", 2 * hz); } #ifdef TCP_OFFLOAD sx_sunlock(&t4_uld_list_lock); #endif sx_sunlock(&t4_list_lock); if (t4_sge_extfree_refs() == 0) { t4_tracer_modunload(); #ifdef TCP_OFFLOAD sx_destroy(&t4_uld_list_lock); #endif sx_destroy(&t4_list_lock); t4_sge_modunload(); loaded = 0; } else { rc = EBUSY; loaded++; /* undo earlier decrement */ } } done_unload: sx_xunlock(&mlu); break; } return (rc); } static devclass_t t4_devclass, t5_devclass; static devclass_t cxgbe_devclass, cxl_devclass; DRIVER_MODULE(t4nex, pci, t4_driver, t4_devclass, mod_event, 0); MODULE_VERSION(t4nex, 1); MODULE_DEPEND(t4nex, firmware, 1, 1, 1); DRIVER_MODULE(t5nex, pci, t5_driver, t5_devclass, mod_event, 0); MODULE_VERSION(t5nex, 1); MODULE_DEPEND(t5nex, firmware, 1, 1, 1); DRIVER_MODULE(cxgbe, t4nex, cxgbe_driver, cxgbe_devclass, 0, 0); MODULE_VERSION(cxgbe, 1); DRIVER_MODULE(cxl, t5nex, cxl_driver, cxl_devclass, 0, 0); MODULE_VERSION(cxl, 1); Index: user/ae/inet6/sys/geom/geom_map.c =================================================================== --- user/ae/inet6/sys/geom/geom_map.c (revision 271452) +++ user/ae/inet6/sys/geom/geom_map.c (revision 271453) @@ -1,394 +1,394 @@ /*- * Copyright (c) 2010-2011 Aleksandr Rybalko * based on geom_redboot.c * Copyright (c) 2009 Sam Leffler, Errno Consulting * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any * redistribution must be conditioned upon including a substantially * similar Disclaimer requirement for further binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGES. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define MAP_CLASS_NAME "MAP" #define MAP_MAXSLICE 64 #define MAP_MAX_MARKER_LEN 64 struct g_map_softc { off_t offset[MAP_MAXSLICE]; /* offset in flash */ off_t size[MAP_MAXSLICE]; /* image size in bytes */ off_t entry[MAP_MAXSLICE]; off_t dsize[MAP_MAXSLICE]; uint8_t readonly[MAP_MAXSLICE]; g_access_t *parent_access; }; static int g_map_access(struct g_provider *pp, int dread, int dwrite, int dexcl) { struct g_geom *gp; struct g_slicer *gsp; struct g_map_softc *sc; gp = pp->geom; gsp = gp->softc; sc = gsp->softc; if (dwrite > 0 && sc->readonly[pp->index]) return (EPERM); return (sc->parent_access(pp, dread, dwrite, dexcl)); } static int g_map_start(struct bio *bp) { struct g_provider *pp; struct g_geom *gp; struct g_map_softc *sc; struct g_slicer *gsp; int idx; pp = bp->bio_to; idx = pp->index; gp = pp->geom; gsp = gp->softc; sc = gsp->softc; if (bp->bio_cmd == BIO_GETATTR) { if (g_handleattr_int(bp, MAP_CLASS_NAME "::entry", sc->entry[idx])) { return (1); } if (g_handleattr_int(bp, MAP_CLASS_NAME "::dsize", sc->dsize[idx])) { return (1); } } return (0); } static void g_map_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp __unused, struct g_provider *pp) { struct g_map_softc *sc; struct g_slicer *gsp; gsp = gp->softc; sc = gsp->softc; g_slice_dumpconf(sb, indent, gp, cp, pp); if (pp != NULL) { if (indent == NULL) { sbuf_printf(sb, " entry %jd", (intmax_t)sc->entry[pp->index]); sbuf_printf(sb, " dsize %jd", (intmax_t)sc->dsize[pp->index]); } else { sbuf_printf(sb, "%s%jd\n", indent, (intmax_t)sc->entry[pp->index]); sbuf_printf(sb, "%s%jd\n", indent, (intmax_t)sc->dsize[pp->index]); } } } static int find_marker(struct g_consumer *cp, const char *line, off_t *offset) { off_t search_start, search_offset, search_step; size_t sectorsize; uint8_t *buf; char *op, key[MAP_MAX_MARKER_LEN], search_key[MAP_MAX_MARKER_LEN]; int ret, c; /* Try convert to numeric first */ *offset = strtouq(line, &op, 0); if (*op == '\0') return (0); bzero(search_key, MAP_MAX_MARKER_LEN); sectorsize = cp->provider->sectorsize; ret = sscanf(line, "search:%qi:%qi:%63c", &search_start, &search_step, search_key); if (ret < 3) return (1); if (bootverbose) { - printf("MAP: search key \"%s\" from 0x%jx, step 0x%jx\n", - search_key, (intmax_t)search_start, (intmax_t)search_step); + printf("MAP: search %s for key \"%s\" from 0x%jx, step 0x%jx\n", + cp->geom->name, search_key, (intmax_t)search_start, (intmax_t)search_step); } /* error if search_key is empty */ if (strlen(search_key) < 1) return (1); /* sscanf successful, and we start marker search */ for (search_offset = search_start; search_offset < cp->provider->mediasize; search_offset += search_step) { g_topology_unlock(); buf = g_read_data(cp, rounddown(search_offset, sectorsize), roundup(strlen(search_key), sectorsize), NULL); g_topology_lock(); /* Wildcard, replace '.' with byte from data */ /* TODO: add support wildcard escape '\.' */ strncpy(key, search_key, MAP_MAX_MARKER_LEN); for (c = 0; c < MAP_MAX_MARKER_LEN && key[c]; c++) { if (key[c] == '.') { key[c] = ((char *)(buf + (search_offset % sectorsize)))[c]; } } if (buf != NULL && strncmp(buf + search_offset % sectorsize, key, strlen(search_key)) == 0) { g_free(buf); /* Marker found, so return their offset */ *offset = search_offset; return (0); } g_free(buf); } /* Marker not found */ return (1); } static int g_map_parse_part(struct g_class *mp, struct g_provider *pp, struct g_consumer *cp, struct g_geom *gp, struct g_map_softc *sc, int i) { const char *value, *name; char *op; off_t start, end, offset, size, dsize; int readonly, ret; /* hint.map.0.at="cfid0" - bind to cfid0 media */ if (resource_string_value("map", i, "at", &value) != 0) return (1); /* Check if this correct provider */ if (strcmp(pp->name, value) != 0) return (1); /* * hint.map.0.name="uboot" - name of partition, will be available * as "/dev/map/uboot" */ if (resource_string_value("map", i, "name", &name) != 0) { if (bootverbose) printf("MAP: hint.map.%d has no name\n", i); return (1); } /* * hint.map.0.start="0x00010000" - partition start at 0x00010000 * or hint.map.0.start="search:0x00010000:0x200:marker text" - * search for text "marker text", begin at 0x10000, step 0x200 * until we found marker or end of media reached */ if (resource_string_value("map", i, "start", &value) != 0) { if (bootverbose) printf("MAP: \"%s\" has no start value\n", name); return (1); } if (find_marker(cp, value, &start) != 0) { if (bootverbose) { printf("MAP: \"%s\" can't parse/use start value\n", name); } return (1); } /* like "start" */ if (resource_string_value("map", i, "end", &value) != 0) { if (bootverbose) printf("MAP: \"%s\" has no end value\n", name); return (1); } if (find_marker(cp, value, &end) != 0) { if (bootverbose) { printf("MAP: \"%s\" can't parse/use start value\n", name); } return (1); } /* variable readonly optional, disable write access */ if (resource_int_value("map", i, "readonly", &readonly) != 0) readonly = 0; /* offset of partition data, from partition begin */ if (resource_string_value("map", i, "offset", &value) == 0) { offset = strtouq(value, &op, 0); if (*op != '\0') { if (bootverbose) { printf("MAP: \"%s\" can't parse offset\n", name); } return (1); } } else { offset = 0; } /* partition data size */ if (resource_string_value("map", i, "dsize", &value) == 0) { dsize = strtouq(value, &op, 0); if (*op != '\0') { if (bootverbose) { printf("MAP: \"%s\" can't parse dsize\n", name); } return (1); } } else { dsize = 0; } size = end - start; if (dsize == 0) dsize = size - offset; /* end is 0 or size is 0, No MAP - so next */ if (end < start) { if (bootverbose) { printf("MAP: \"%s\", \"end\" less than " "\"start\"\n", name); } return (1); } if (offset + dsize > size) { if (bootverbose) { printf("MAP: \"%s\", \"dsize\" bigger than " "partition - offset\n", name); } return (1); } ret = g_slice_config(gp, i, G_SLICE_CONFIG_SET, start + offset, dsize, cp->provider->sectorsize, "map/%s", name); if (ret != 0) { if (bootverbose) { printf("MAP: g_slice_config returns %d for \"%s\"\n", ret, name); } return (1); } if (bootverbose) { - printf("MAP: %jxx%jx, data=%jxx%jx " + printf("MAP: %s: %jxx%jx, data=%jxx%jx " "\"/dev/map/%s\"\n", - (intmax_t)start, (intmax_t)size, (intmax_t)offset, + cp->geom->name, (intmax_t)start, (intmax_t)size, (intmax_t)offset, (intmax_t)dsize, name); } sc->offset[i] = start; sc->size[i] = size; sc->entry[i] = offset; sc->dsize[i] = dsize; sc->readonly[i] = readonly ? 1 : 0; return (0); } static struct g_geom * g_map_taste(struct g_class *mp, struct g_provider *pp, int insist __unused) { struct g_map_softc *sc; struct g_consumer *cp; struct g_geom *gp; int i; g_trace(G_T_TOPOLOGY, "map_taste(%s,%s)", mp->name, pp->name); g_topology_assert(); if (strcmp(pp->geom->class->name, MAP_CLASS_NAME) == 0) return (NULL); gp = g_slice_new(mp, MAP_MAXSLICE, pp, &cp, &sc, sizeof(*sc), g_map_start); if (gp == NULL) return (NULL); /* interpose our access method */ sc->parent_access = gp->access; gp->access = g_map_access; for (i = 0; i < MAP_MAXSLICE; i++) g_map_parse_part(mp, pp, cp, gp, sc, i); g_access(cp, -1, 0, 0); if (LIST_EMPTY(&gp->provider)) { if (bootverbose) printf("MAP: No valid partition found at %s\n", pp->name); g_slice_spoiled(cp); return (NULL); } return (gp); } static void g_map_config(struct gctl_req *req, struct g_class *mp, const char *verb) { struct g_geom *gp; g_topology_assert(); gp = gctl_get_geom(req, mp, "geom"); if (gp == NULL) return; gctl_error(req, "Unknown verb"); } static struct g_class g_map_class = { .name = MAP_CLASS_NAME, .version = G_VERSION, .taste = g_map_taste, .dumpconf = g_map_dumpconf, .ctlreq = g_map_config, }; DECLARE_GEOM_CLASS(g_map_class, g_map); Index: user/ae/inet6/sys/net/if.c =================================================================== --- user/ae/inet6/sys/net/if.c (revision 271452) +++ user/ae/inet6/sys/net/if.c (revision 271453) @@ -1,4098 +1,4086 @@ /*- * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if.c 8.5 (Berkeley) 1/9/95 * $FreeBSD$ */ #include "opt_compat.h" #include "opt_inet6.h" #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #include #include #ifdef INET #include #endif /* INET */ #ifdef INET6 #include #include #include #endif /* INET6 */ #endif /* INET || INET6 */ #include #ifdef COMPAT_FREEBSD32 #include #include #endif struct ifindex_entry { struct ifnet *ife_ifnet; }; SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW, 0, "Link layers"); SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW, 0, "Generic link-management"); SYSCTL_INT(_net_link, OID_AUTO, ifqmaxlen, CTLFLAG_RDTUN, &ifqmaxlen, 0, "max send queue size"); /* Log link state change events */ static int log_link_state_change = 1; SYSCTL_INT(_net_link, OID_AUTO, log_link_state_change, CTLFLAG_RW, &log_link_state_change, 0, "log interface link state change events"); /* Interface description */ static unsigned int ifdescr_maxlen = 1024; SYSCTL_UINT(_net, OID_AUTO, ifdescr_maxlen, CTLFLAG_RW, &ifdescr_maxlen, 0, "administrative maximum length for interface description"); static MALLOC_DEFINE(M_IFDESCR, "ifdescr", "ifnet descriptions"); /* global sx for non-critical path ifdescr */ static struct sx ifdescr_sx; SX_SYSINIT(ifdescr_sx, &ifdescr_sx, "ifnet descr"); void (*bridge_linkstate_p)(struct ifnet *ifp); void (*ng_ether_link_state_p)(struct ifnet *ifp, int state); void (*lagg_linkstate_p)(struct ifnet *ifp, int state); /* These are external hooks for CARP. */ void (*carp_linkstate_p)(struct ifnet *ifp); void (*carp_demote_adj_p)(int, char *); int (*carp_master_p)(struct ifaddr *); #if defined(INET) || defined(INET6) int (*carp_forus_p)(struct ifnet *ifp, u_char *dhost); int (*carp_output_p)(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa); int (*carp_ioctl_p)(struct ifreq *, u_long, struct thread *); int (*carp_attach_p)(struct ifaddr *, int); void (*carp_detach_p)(struct ifaddr *); #endif #ifdef INET int (*carp_iamatch_p)(struct ifaddr *, uint8_t **); #endif #ifdef INET6 struct ifaddr *(*carp_iamatch6_p)(struct ifnet *ifp, struct in6_addr *taddr6); caddr_t (*carp_macmatch6_p)(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr); #endif struct mbuf *(*tbr_dequeue_ptr)(struct ifaltq *, int) = NULL; /* * XXX: Style; these should be sorted alphabetically, and unprototyped * static functions should be prototyped. Currently they are sorted by * declaration order. */ static void if_attachdomain(void *); static void if_attachdomain1(struct ifnet *); static int ifconf(u_long, caddr_t); static void if_freemulti(struct ifmultiaddr *); static void if_init(void *); static void if_grow(void); static void if_route(struct ifnet *, int flag, int fam); static int if_setflag(struct ifnet *, int, int, int *, int); static int if_transmit(struct ifnet *ifp, struct mbuf *m); static void if_unroute(struct ifnet *, int flag, int fam); static void link_rtrequest(int, struct rtentry *, struct rt_addrinfo *); static int if_rtdel(struct radix_node *, void *); static int ifhwioctl(u_long, struct ifnet *, caddr_t, struct thread *); static int if_delmulti_locked(struct ifnet *, struct ifmultiaddr *, int); static void do_link_state_change(void *, int); static int if_getgroup(struct ifgroupreq *, struct ifnet *); static int if_getgroupmembers(struct ifgroupreq *); static void if_delgroups(struct ifnet *); static void if_attach_internal(struct ifnet *, int); static void if_detach_internal(struct ifnet *, int); #ifdef INET6 /* * XXX: declare here to avoid to include many inet6 related files.. * should be more generalized? */ extern void nd6_setmtu(struct ifnet *); #endif VNET_DEFINE(int, if_index); int ifqmaxlen = IFQ_MAXLEN; VNET_DEFINE(struct ifnethead, ifnet); /* depend on static init XXX */ VNET_DEFINE(struct ifgrouphead, ifg_head); static VNET_DEFINE(int, if_indexlim) = 8; /* Table of ifnet by index. */ VNET_DEFINE(struct ifindex_entry *, ifindex_table); #define V_if_indexlim VNET(if_indexlim) #define V_ifindex_table VNET(ifindex_table) /* * The global network interface list (V_ifnet) and related state (such as * if_index, if_indexlim, and ifindex_table) are protected by an sxlock and * an rwlock. Either may be acquired shared to stablize the list, but both * must be acquired writable to modify the list. This model allows us to * both stablize the interface list during interrupt thread processing, but * also to stablize it over long-running ioctls, without introducing priority * inversions and deadlocks. */ struct rwlock ifnet_rwlock; struct sx ifnet_sxlock; /* * The allocation of network interfaces is a rather non-atomic affair; we * need to select an index before we are ready to expose the interface for * use, so will use this pointer value to indicate reservation. */ #define IFNET_HOLD (void *)(uintptr_t)(-1) static if_com_alloc_t *if_com_alloc[256]; static if_com_free_t *if_com_free[256]; static MALLOC_DEFINE(M_IFNET, "ifnet", "interface internals"); MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address"); MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address"); struct ifnet * ifnet_byindex_locked(u_short idx) { if (idx > V_if_index) return (NULL); if (V_ifindex_table[idx].ife_ifnet == IFNET_HOLD) return (NULL); return (V_ifindex_table[idx].ife_ifnet); } struct ifnet * ifnet_byindex(u_short idx) { struct ifnet *ifp; IFNET_RLOCK_NOSLEEP(); ifp = ifnet_byindex_locked(idx); IFNET_RUNLOCK_NOSLEEP(); return (ifp); } struct ifnet * ifnet_byindex_ref(u_short idx) { struct ifnet *ifp; IFNET_RLOCK_NOSLEEP(); ifp = ifnet_byindex_locked(idx); if (ifp == NULL || (ifp->if_flags & IFF_DYING)) { IFNET_RUNLOCK_NOSLEEP(); return (NULL); } if_ref(ifp); IFNET_RUNLOCK_NOSLEEP(); return (ifp); } /* * Allocate an ifindex array entry; return 0 on success or an error on * failure. */ static int ifindex_alloc_locked(u_short *idxp) { u_short idx; IFNET_WLOCK_ASSERT(); retry: /* * Try to find an empty slot below V_if_index. If we fail, take the * next slot. */ for (idx = 1; idx <= V_if_index; idx++) { if (V_ifindex_table[idx].ife_ifnet == NULL) break; } /* Catch if_index overflow. */ if (idx >= V_if_indexlim) { if_grow(); goto retry; } if (idx > V_if_index) V_if_index = idx; *idxp = idx; return (0); } static void ifindex_free_locked(u_short idx) { IFNET_WLOCK_ASSERT(); V_ifindex_table[idx].ife_ifnet = NULL; while (V_if_index > 0 && V_ifindex_table[V_if_index].ife_ifnet == NULL) V_if_index--; } static void ifindex_free(u_short idx) { IFNET_WLOCK(); ifindex_free_locked(idx); IFNET_WUNLOCK(); } static void ifnet_setbyindex_locked(u_short idx, struct ifnet *ifp) { IFNET_WLOCK_ASSERT(); V_ifindex_table[idx].ife_ifnet = ifp; } static void ifnet_setbyindex(u_short idx, struct ifnet *ifp) { IFNET_WLOCK(); ifnet_setbyindex_locked(idx, ifp); IFNET_WUNLOCK(); } struct ifaddr * ifaddr_byindex(u_short idx) { struct ifaddr *ifa; IFNET_RLOCK_NOSLEEP(); ifa = ifnet_byindex_locked(idx)->if_addr; if (ifa != NULL) ifa_ref(ifa); IFNET_RUNLOCK_NOSLEEP(); return (ifa); } /* * Network interface utility routines. * * Routines with ifa_ifwith* names take sockaddr *'s as * parameters. */ static void vnet_if_init(const void *unused __unused) { TAILQ_INIT(&V_ifnet); TAILQ_INIT(&V_ifg_head); IFNET_WLOCK(); if_grow(); /* create initial table */ IFNET_WUNLOCK(); vnet_if_clone_init(); } VNET_SYSINIT(vnet_if_init, SI_SUB_INIT_IF, SI_ORDER_SECOND, vnet_if_init, NULL); /* ARGSUSED*/ static void if_init(void *dummy __unused) { IFNET_LOCK_INIT(); if_clone_init(); } SYSINIT(interfaces, SI_SUB_INIT_IF, SI_ORDER_FIRST, if_init, NULL); #ifdef VIMAGE static void vnet_if_uninit(const void *unused __unused) { VNET_ASSERT(TAILQ_EMPTY(&V_ifnet), ("%s:%d tailq &V_ifnet=%p " "not empty", __func__, __LINE__, &V_ifnet)); VNET_ASSERT(TAILQ_EMPTY(&V_ifg_head), ("%s:%d tailq &V_ifg_head=%p " "not empty", __func__, __LINE__, &V_ifg_head)); free((caddr_t)V_ifindex_table, M_IFNET); } VNET_SYSUNINIT(vnet_if_uninit, SI_SUB_INIT_IF, SI_ORDER_FIRST, vnet_if_uninit, NULL); #endif static void if_grow(void) { int oldlim; u_int n; struct ifindex_entry *e; IFNET_WLOCK_ASSERT(); oldlim = V_if_indexlim; IFNET_WUNLOCK(); n = (oldlim << 1) * sizeof(*e); e = malloc(n, M_IFNET, M_WAITOK | M_ZERO); IFNET_WLOCK(); if (V_if_indexlim != oldlim) { free(e, M_IFNET); return; } if (V_ifindex_table != NULL) { memcpy((caddr_t)e, (caddr_t)V_ifindex_table, n/2); free((caddr_t)V_ifindex_table, M_IFNET); } V_if_indexlim <<= 1; V_ifindex_table = e; } /* * Allocate a struct ifnet and an index for an interface. A layer 2 * common structure will also be allocated if an allocation routine is * registered for the passed type. */ struct ifnet * if_alloc(u_char type) { struct ifnet *ifp; u_short idx; ifp = malloc(sizeof(struct ifnet), M_IFNET, M_WAITOK|M_ZERO); IFNET_WLOCK(); if (ifindex_alloc_locked(&idx) != 0) { IFNET_WUNLOCK(); free(ifp, M_IFNET); return (NULL); } ifnet_setbyindex_locked(idx, IFNET_HOLD); IFNET_WUNLOCK(); ifp->if_index = idx; ifp->if_type = type; ifp->if_alloctype = type; if (if_com_alloc[type] != NULL) { ifp->if_l2com = if_com_alloc[type](type, ifp); if (ifp->if_l2com == NULL) { free(ifp, M_IFNET); ifindex_free(idx); return (NULL); } } IF_ADDR_LOCK_INIT(ifp); TASK_INIT(&ifp->if_linktask, 0, do_link_state_change, ifp); ifp->if_afdata_initialized = 0; IF_AFDATA_LOCK_INIT(ifp); TAILQ_INIT(&ifp->if_addrhead); TAILQ_INIT(&ifp->if_multiaddrs); TAILQ_INIT(&ifp->if_groups); #ifdef MAC mac_ifnet_init(ifp); #endif ifq_init(&ifp->if_snd, ifp); refcount_init(&ifp->if_refcount, 1); /* Index reference. */ ifnet_setbyindex(ifp->if_index, ifp); return (ifp); } /* * Do the actual work of freeing a struct ifnet, and layer 2 common * structure. This call is made when the last reference to an * interface is released. */ static void if_free_internal(struct ifnet *ifp) { KASSERT((ifp->if_flags & IFF_DYING), ("if_free_internal: interface not dying")); if (if_com_free[ifp->if_alloctype] != NULL) if_com_free[ifp->if_alloctype](ifp->if_l2com, ifp->if_alloctype); #ifdef MAC mac_ifnet_destroy(ifp); #endif /* MAC */ if (ifp->if_description != NULL) free(ifp->if_description, M_IFDESCR); IF_AFDATA_DESTROY(ifp); IF_ADDR_LOCK_DESTROY(ifp); ifq_delete(&ifp->if_snd); free(ifp, M_IFNET); } /* * Deregister an interface and free the associated storage. */ void if_free(struct ifnet *ifp) { ifp->if_flags |= IFF_DYING; /* XXX: Locking */ CURVNET_SET_QUIET(ifp->if_vnet); IFNET_WLOCK(); KASSERT(ifp == ifnet_byindex_locked(ifp->if_index), ("%s: freeing unallocated ifnet", ifp->if_xname)); ifindex_free_locked(ifp->if_index); IFNET_WUNLOCK(); if (refcount_release(&ifp->if_refcount)) if_free_internal(ifp); CURVNET_RESTORE(); } /* * Interfaces to keep an ifnet type-stable despite the possibility of the * driver calling if_free(). If there are additional references, we defer * freeing the underlying data structure. */ void if_ref(struct ifnet *ifp) { /* We don't assert the ifnet list lock here, but arguably should. */ refcount_acquire(&ifp->if_refcount); } void if_rele(struct ifnet *ifp) { if (!refcount_release(&ifp->if_refcount)) return; if_free_internal(ifp); } void ifq_init(struct ifaltq *ifq, struct ifnet *ifp) { mtx_init(&ifq->ifq_mtx, ifp->if_xname, "if send queue", MTX_DEF); if (ifq->ifq_maxlen == 0) ifq->ifq_maxlen = ifqmaxlen; ifq->altq_type = 0; ifq->altq_disc = NULL; ifq->altq_flags &= ALTQF_CANTCHANGE; ifq->altq_tbr = NULL; ifq->altq_ifp = ifp; } void ifq_delete(struct ifaltq *ifq) { mtx_destroy(&ifq->ifq_mtx); } /* * Perform generic interface initalization tasks and attach the interface * to the list of "active" interfaces. If vmove flag is set on entry * to if_attach_internal(), perform only a limited subset of initialization * tasks, given that we are moving from one vnet to another an ifnet which * has already been fully initialized. * * XXX: * - The decision to return void and thus require this function to * succeed is questionable. * - We should probably do more sanity checking. For instance we don't * do anything to insure if_xname is unique or non-empty. */ void if_attach(struct ifnet *ifp) { if_attach_internal(ifp, 0); } static void if_attach_internal(struct ifnet *ifp, int vmove) { unsigned socksize, ifasize; int namelen, masklen; struct sockaddr_dl *sdl; struct ifaddr *ifa; if (ifp->if_index == 0 || ifp != ifnet_byindex(ifp->if_index)) panic ("%s: BUG: if_attach called without if_alloc'd input()\n", ifp->if_xname); #ifdef VIMAGE ifp->if_vnet = curvnet; if (ifp->if_home_vnet == NULL) ifp->if_home_vnet = curvnet; #endif if_addgroup(ifp, IFG_ALL); getmicrotime(&ifp->if_lastchange); ifp->if_epoch = time_uptime; KASSERT((ifp->if_transmit == NULL && ifp->if_qflush == NULL) || (ifp->if_transmit != NULL && ifp->if_qflush != NULL), ("transmit and qflush must both either be set or both be NULL")); if (ifp->if_transmit == NULL) { ifp->if_transmit = if_transmit; ifp->if_qflush = if_qflush; } if (ifp->if_get_counter == NULL) ifp->if_get_counter = if_get_counter_compat; if (!vmove) { #ifdef MAC mac_ifnet_create(ifp); #endif /* * Create a Link Level name for this device. */ namelen = strlen(ifp->if_xname); /* * Always save enough space for any possiable name so we * can do a rename in place later. */ masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + IFNAMSIZ; socksize = masklen + ifp->if_addrlen; if (socksize < sizeof(*sdl)) socksize = sizeof(*sdl); socksize = roundup2(socksize, sizeof(long)); ifasize = sizeof(*ifa) + 2 * socksize; ifa = ifa_alloc(ifasize, M_WAITOK); sdl = (struct sockaddr_dl *)(ifa + 1); sdl->sdl_len = socksize; sdl->sdl_family = AF_LINK; bcopy(ifp->if_xname, sdl->sdl_data, namelen); sdl->sdl_nlen = namelen; sdl->sdl_index = ifp->if_index; sdl->sdl_type = ifp->if_type; ifp->if_addr = ifa; ifa->ifa_ifp = ifp; ifa->ifa_rtrequest = link_rtrequest; ifa->ifa_addr = (struct sockaddr *)sdl; sdl = (struct sockaddr_dl *)(socksize + (caddr_t)sdl); ifa->ifa_netmask = (struct sockaddr *)sdl; sdl->sdl_len = masklen; while (namelen != 0) sdl->sdl_data[--namelen] = 0xff; TAILQ_INSERT_HEAD(&ifp->if_addrhead, ifa, ifa_link); /* Reliably crash if used uninitialized. */ ifp->if_broadcastaddr = NULL; #if defined(INET) || defined(INET6) /* Initialize to max value. */ if (ifp->if_hw_tsomax == 0) ifp->if_hw_tsomax = min(IP_MAXPACKET, 32 * MCLBYTES - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN)); KASSERT(ifp->if_hw_tsomax <= IP_MAXPACKET && ifp->if_hw_tsomax >= IP_MAXPACKET / 8, ("%s: tsomax outside of range", __func__)); #endif } #ifdef VIMAGE else { /* * Update the interface index in the link layer address * of the interface. */ for (ifa = ifp->if_addr; ifa != NULL; ifa = TAILQ_NEXT(ifa, ifa_link)) { if (ifa->ifa_addr->sa_family == AF_LINK) { sdl = (struct sockaddr_dl *)ifa->ifa_addr; sdl->sdl_index = ifp->if_index; } } } #endif IFNET_WLOCK(); TAILQ_INSERT_TAIL(&V_ifnet, ifp, if_link); #ifdef VIMAGE curvnet->vnet_ifcnt++; #endif IFNET_WUNLOCK(); if (domain_init_status >= 2) if_attachdomain1(ifp); EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL); /* Announce the interface. */ rt_ifannouncemsg(ifp, IFAN_ARRIVAL); } static void if_attachdomain(void *dummy) { struct ifnet *ifp; TAILQ_FOREACH(ifp, &V_ifnet, if_link) if_attachdomain1(ifp); } SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_SECOND, if_attachdomain, NULL); static void if_attachdomain1(struct ifnet *ifp) { struct domain *dp; /* * Since dp->dom_ifattach calls malloc() with M_WAITOK, we * cannot lock ifp->if_afdata initialization, entirely. */ if (IF_AFDATA_TRYLOCK(ifp) == 0) return; if (ifp->if_afdata_initialized >= domain_init_status) { IF_AFDATA_UNLOCK(ifp); log(LOG_WARNING, "%s called more than once on %s\n", __func__, ifp->if_xname); return; } ifp->if_afdata_initialized = domain_init_status; IF_AFDATA_UNLOCK(ifp); /* address family dependent data region */ bzero(ifp->if_afdata, sizeof(ifp->if_afdata)); for (dp = domains; dp; dp = dp->dom_next) { if (dp->dom_ifattach) ifp->if_afdata[dp->dom_family] = (*dp->dom_ifattach)(ifp); } } /* * Remove any unicast or broadcast network addresses from an interface. */ void if_purgeaddrs(struct ifnet *ifp) { struct ifaddr *ifa, *next; TAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, next) { if (ifa->ifa_addr->sa_family == AF_LINK) continue; #ifdef INET /* XXX: Ugly!! ad hoc just for INET */ if (ifa->ifa_addr->sa_family == AF_INET) { struct ifaliasreq ifr; bzero(&ifr, sizeof(ifr)); ifr.ifra_addr = *ifa->ifa_addr; if (ifa->ifa_dstaddr) ifr.ifra_broadaddr = *ifa->ifa_dstaddr; if (in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp, NULL) == 0) continue; } #endif /* INET */ #ifdef INET6 if (ifa->ifa_addr->sa_family == AF_INET6) { in6_purgeaddr(ifa); /* ifp_addrhead is already updated */ continue; } #endif /* INET6 */ TAILQ_REMOVE(&ifp->if_addrhead, ifa, ifa_link); ifa_free(ifa); } } /* * Remove any multicast network addresses from an interface when an ifnet * is going away. */ static void if_purgemaddrs(struct ifnet *ifp) { struct ifmultiaddr *ifma; struct ifmultiaddr *next; IF_ADDR_WLOCK(ifp); TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) if_delmulti_locked(ifp, ifma, 1); IF_ADDR_WUNLOCK(ifp); } /* * Detach an interface, removing it from the list of "active" interfaces. * If vmove flag is set on entry to if_detach_internal(), perform only a * limited subset of cleanup tasks, given that we are moving an ifnet from * one vnet to another, where it must be fully operational. * * XXXRW: There are some significant questions about event ordering, and * how to prevent things from starting to use the interface during detach. */ void if_detach(struct ifnet *ifp) { CURVNET_SET_QUIET(ifp->if_vnet); if_detach_internal(ifp, 0); CURVNET_RESTORE(); } static void if_detach_internal(struct ifnet *ifp, int vmove) { struct ifaddr *ifa; struct radix_node_head *rnh; int i, j; struct domain *dp; struct ifnet *iter; int found = 0; IFNET_WLOCK(); TAILQ_FOREACH(iter, &V_ifnet, if_link) if (iter == ifp) { TAILQ_REMOVE(&V_ifnet, ifp, if_link); found = 1; break; } #ifdef VIMAGE if (found) curvnet->vnet_ifcnt--; #endif IFNET_WUNLOCK(); if (!found) { if (vmove) panic("%s: ifp=%p not on the ifnet tailq %p", __func__, ifp, &V_ifnet); else return; /* XXX this should panic as well? */ } /* * Remove/wait for pending events. */ taskqueue_drain(taskqueue_swi, &ifp->if_linktask); /* * Remove routes and flush queues. */ if_down(ifp); #ifdef ALTQ if (ALTQ_IS_ENABLED(&ifp->if_snd)) altq_disable(&ifp->if_snd); if (ALTQ_IS_ATTACHED(&ifp->if_snd)) altq_detach(&ifp->if_snd); #endif if_purgeaddrs(ifp); #ifdef INET in_ifdetach(ifp); #endif #ifdef INET6 /* * Remove all IPv6 kernel structs related to ifp. This should be done * before removing routing entries below, since IPv6 interface direct * routes are expected to be removed by the IPv6-specific kernel API. * Otherwise, the kernel will detect some inconsistency and bark it. */ in6_ifdetach(ifp); #endif if_purgemaddrs(ifp); /* Announce that the interface is gone. */ rt_ifannouncemsg(ifp, IFAN_DEPARTURE); EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL); if (!vmove) { /* * Prevent further calls into the device driver via ifnet. */ if_dead(ifp); /* * Remove link ifaddr pointer and maybe decrement if_index. * Clean up all addresses. */ ifp->if_addr = NULL; /* We can now free link ifaddr. */ if (!TAILQ_EMPTY(&ifp->if_addrhead)) { ifa = TAILQ_FIRST(&ifp->if_addrhead); TAILQ_REMOVE(&ifp->if_addrhead, ifa, ifa_link); ifa_free(ifa); } } /* * Delete all remaining routes using this interface * Unfortuneatly the only way to do this is to slog through * the entire routing table looking for routes which point * to this interface...oh well... */ for (i = 1; i <= AF_MAX; i++) { for (j = 0; j < rt_numfibs; j++) { rnh = rt_tables_get_rnh(j, i); if (rnh == NULL) continue; RADIX_NODE_HEAD_LOCK(rnh); (void) rnh->rnh_walktree(rnh, if_rtdel, ifp); RADIX_NODE_HEAD_UNLOCK(rnh); } } if_delgroups(ifp); /* * We cannot hold the lock over dom_ifdetach calls as they might * sleep, for example trying to drain a callout, thus open up the * theoretical race with re-attaching. */ IF_AFDATA_LOCK(ifp); i = ifp->if_afdata_initialized; ifp->if_afdata_initialized = 0; IF_AFDATA_UNLOCK(ifp); for (dp = domains; i > 0 && dp; dp = dp->dom_next) { if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family]) (*dp->dom_ifdetach)(ifp, ifp->if_afdata[dp->dom_family]); } } #ifdef VIMAGE /* * if_vmove() performs a limited version of if_detach() in current * vnet and if_attach()es the ifnet to the vnet specified as 2nd arg. * An attempt is made to shrink if_index in current vnet, find an * unused if_index in target vnet and calls if_grow() if necessary, * and finally find an unused if_xname for the target vnet. */ void if_vmove(struct ifnet *ifp, struct vnet *new_vnet) { u_short idx; /* * Detach from current vnet, but preserve LLADDR info, do not * mark as dead etc. so that the ifnet can be reattached later. */ if_detach_internal(ifp, 1); /* * Unlink the ifnet from ifindex_table[] in current vnet, and shrink * the if_index for that vnet if possible. * * NOTE: IFNET_WLOCK/IFNET_WUNLOCK() are assumed to be unvirtualized, * or we'd lock on one vnet and unlock on another. */ IFNET_WLOCK(); ifindex_free_locked(ifp->if_index); IFNET_WUNLOCK(); /* * Perform interface-specific reassignment tasks, if provided by * the driver. */ if (ifp->if_reassign != NULL) ifp->if_reassign(ifp, new_vnet, NULL); /* * Switch to the context of the target vnet. */ CURVNET_SET_QUIET(new_vnet); IFNET_WLOCK(); if (ifindex_alloc_locked(&idx) != 0) { IFNET_WUNLOCK(); panic("if_index overflow"); } ifp->if_index = idx; ifnet_setbyindex_locked(ifp->if_index, ifp); IFNET_WUNLOCK(); if_attach_internal(ifp, 1); CURVNET_RESTORE(); } /* * Move an ifnet to or from another child prison/vnet, specified by the jail id. */ static int if_vmove_loan(struct thread *td, struct ifnet *ifp, char *ifname, int jid) { struct prison *pr; struct ifnet *difp; /* Try to find the prison within our visibility. */ sx_slock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, jid); sx_sunlock(&allprison_lock); if (pr == NULL) return (ENXIO); prison_hold_locked(pr); mtx_unlock(&pr->pr_mtx); /* Do not try to move the iface from and to the same prison. */ if (pr->pr_vnet == ifp->if_vnet) { prison_free(pr); return (EEXIST); } /* Make sure the named iface does not exists in the dst. prison/vnet. */ /* XXX Lock interfaces to avoid races. */ CURVNET_SET_QUIET(pr->pr_vnet); difp = ifunit(ifname); CURVNET_RESTORE(); if (difp != NULL) { prison_free(pr); return (EEXIST); } /* Move the interface into the child jail/vnet. */ if_vmove(ifp, pr->pr_vnet); /* Report the new if_xname back to the userland. */ sprintf(ifname, "%s", ifp->if_xname); prison_free(pr); return (0); } static int if_vmove_reclaim(struct thread *td, char *ifname, int jid) { struct prison *pr; struct vnet *vnet_dst; struct ifnet *ifp; /* Try to find the prison within our visibility. */ sx_slock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, jid); sx_sunlock(&allprison_lock); if (pr == NULL) return (ENXIO); prison_hold_locked(pr); mtx_unlock(&pr->pr_mtx); /* Make sure the named iface exists in the source prison/vnet. */ CURVNET_SET(pr->pr_vnet); ifp = ifunit(ifname); /* XXX Lock to avoid races. */ if (ifp == NULL) { CURVNET_RESTORE(); prison_free(pr); return (ENXIO); } /* Do not try to move the iface from and to the same prison. */ vnet_dst = TD_TO_VNET(td); if (vnet_dst == ifp->if_vnet) { CURVNET_RESTORE(); prison_free(pr); return (EEXIST); } /* Get interface back from child jail/vnet. */ if_vmove(ifp, vnet_dst); CURVNET_RESTORE(); /* Report the new if_xname back to the userland. */ sprintf(ifname, "%s", ifp->if_xname); prison_free(pr); return (0); } #endif /* VIMAGE */ /* * Add a group to an interface */ int if_addgroup(struct ifnet *ifp, const char *groupname) { struct ifg_list *ifgl; struct ifg_group *ifg = NULL; struct ifg_member *ifgm; int new = 0; if (groupname[0] && groupname[strlen(groupname) - 1] >= '0' && groupname[strlen(groupname) - 1] <= '9') return (EINVAL); IFNET_WLOCK(); TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) { IFNET_WUNLOCK(); return (EEXIST); } if ((ifgl = (struct ifg_list *)malloc(sizeof(struct ifg_list), M_TEMP, M_NOWAIT)) == NULL) { IFNET_WUNLOCK(); return (ENOMEM); } if ((ifgm = (struct ifg_member *)malloc(sizeof(struct ifg_member), M_TEMP, M_NOWAIT)) == NULL) { free(ifgl, M_TEMP); IFNET_WUNLOCK(); return (ENOMEM); } TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) if (!strcmp(ifg->ifg_group, groupname)) break; if (ifg == NULL) { if ((ifg = (struct ifg_group *)malloc(sizeof(struct ifg_group), M_TEMP, M_NOWAIT)) == NULL) { free(ifgl, M_TEMP); free(ifgm, M_TEMP); IFNET_WUNLOCK(); return (ENOMEM); } strlcpy(ifg->ifg_group, groupname, sizeof(ifg->ifg_group)); ifg->ifg_refcnt = 0; TAILQ_INIT(&ifg->ifg_members); TAILQ_INSERT_TAIL(&V_ifg_head, ifg, ifg_next); new = 1; } ifg->ifg_refcnt++; ifgl->ifgl_group = ifg; ifgm->ifgm_ifp = ifp; IF_ADDR_WLOCK(ifp); TAILQ_INSERT_TAIL(&ifg->ifg_members, ifgm, ifgm_next); TAILQ_INSERT_TAIL(&ifp->if_groups, ifgl, ifgl_next); IF_ADDR_WUNLOCK(ifp); IFNET_WUNLOCK(); if (new) EVENTHANDLER_INVOKE(group_attach_event, ifg); EVENTHANDLER_INVOKE(group_change_event, groupname); return (0); } /* * Remove a group from an interface */ int if_delgroup(struct ifnet *ifp, const char *groupname) { struct ifg_list *ifgl; struct ifg_member *ifgm; IFNET_WLOCK(); TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) break; if (ifgl == NULL) { IFNET_WUNLOCK(); return (ENOENT); } IF_ADDR_WLOCK(ifp); TAILQ_REMOVE(&ifp->if_groups, ifgl, ifgl_next); IF_ADDR_WUNLOCK(ifp); TAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next) if (ifgm->ifgm_ifp == ifp) break; if (ifgm != NULL) { TAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifgm_next); free(ifgm, M_TEMP); } if (--ifgl->ifgl_group->ifg_refcnt == 0) { TAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_next); IFNET_WUNLOCK(); EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group); free(ifgl->ifgl_group, M_TEMP); } else IFNET_WUNLOCK(); free(ifgl, M_TEMP); EVENTHANDLER_INVOKE(group_change_event, groupname); return (0); } /* * Remove an interface from all groups */ static void if_delgroups(struct ifnet *ifp) { struct ifg_list *ifgl; struct ifg_member *ifgm; char groupname[IFNAMSIZ]; IFNET_WLOCK(); while (!TAILQ_EMPTY(&ifp->if_groups)) { ifgl = TAILQ_FIRST(&ifp->if_groups); strlcpy(groupname, ifgl->ifgl_group->ifg_group, IFNAMSIZ); IF_ADDR_WLOCK(ifp); TAILQ_REMOVE(&ifp->if_groups, ifgl, ifgl_next); IF_ADDR_WUNLOCK(ifp); TAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next) if (ifgm->ifgm_ifp == ifp) break; if (ifgm != NULL) { TAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifgm_next); free(ifgm, M_TEMP); } if (--ifgl->ifgl_group->ifg_refcnt == 0) { TAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_next); IFNET_WUNLOCK(); EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group); free(ifgl->ifgl_group, M_TEMP); } else IFNET_WUNLOCK(); free(ifgl, M_TEMP); EVENTHANDLER_INVOKE(group_change_event, groupname); IFNET_WLOCK(); } IFNET_WUNLOCK(); } /* * Stores all groups from an interface in memory pointed * to by data */ static int if_getgroup(struct ifgroupreq *data, struct ifnet *ifp) { int len, error; struct ifg_list *ifgl; struct ifg_req ifgrq, *ifgp; struct ifgroupreq *ifgr = data; if (ifgr->ifgr_len == 0) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) ifgr->ifgr_len += sizeof(struct ifg_req); IF_ADDR_RUNLOCK(ifp); return (0); } len = ifgr->ifgr_len; ifgp = ifgr->ifgr_groups; /* XXX: wire */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) { if (len < sizeof(ifgrq)) { IF_ADDR_RUNLOCK(ifp); return (EINVAL); } bzero(&ifgrq, sizeof ifgrq); strlcpy(ifgrq.ifgrq_group, ifgl->ifgl_group->ifg_group, sizeof(ifgrq.ifgrq_group)); if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) { IF_ADDR_RUNLOCK(ifp); return (error); } len -= sizeof(ifgrq); ifgp++; } IF_ADDR_RUNLOCK(ifp); return (0); } /* * Stores all members of a group in memory pointed to by data */ static int if_getgroupmembers(struct ifgroupreq *data) { struct ifgroupreq *ifgr = data; struct ifg_group *ifg; struct ifg_member *ifgm; struct ifg_req ifgrq, *ifgp; int len, error; IFNET_RLOCK(); TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) if (!strcmp(ifg->ifg_group, ifgr->ifgr_name)) break; if (ifg == NULL) { IFNET_RUNLOCK(); return (ENOENT); } if (ifgr->ifgr_len == 0) { TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) ifgr->ifgr_len += sizeof(ifgrq); IFNET_RUNLOCK(); return (0); } len = ifgr->ifgr_len; ifgp = ifgr->ifgr_groups; TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) { if (len < sizeof(ifgrq)) { IFNET_RUNLOCK(); return (EINVAL); } bzero(&ifgrq, sizeof ifgrq); strlcpy(ifgrq.ifgrq_member, ifgm->ifgm_ifp->if_xname, sizeof(ifgrq.ifgrq_member)); if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) { IFNET_RUNLOCK(); return (error); } len -= sizeof(ifgrq); ifgp++; } IFNET_RUNLOCK(); return (0); } /* * Delete Routes for a Network Interface * * Called for each routing entry via the rnh->rnh_walktree() call above * to delete all route entries referencing a detaching network interface. * * Arguments: * rn pointer to node in the routing table * arg argument passed to rnh->rnh_walktree() - detaching interface * * Returns: * 0 successful * errno failed - reason indicated * */ static int if_rtdel(struct radix_node *rn, void *arg) { struct rtentry *rt = (struct rtentry *)rn; struct ifnet *ifp = arg; int err; if (rt->rt_ifp == ifp) { /* * Protect (sorta) against walktree recursion problems * with cloned routes */ if ((rt->rt_flags & RTF_UP) == 0) return (0); err = rtrequest_fib(RTM_DELETE, rt_key(rt), rt->rt_gateway, rt_mask(rt), rt->rt_flags|RTF_RNH_LOCKED|RTF_PINNED, (struct rtentry **) NULL, rt->rt_fibnum); if (err) { log(LOG_WARNING, "if_rtdel: error %d\n", err); } } return (0); } /* * Return counter values from old racy non-pcpu counters. */ uint64_t if_get_counter_compat(struct ifnet *ifp, ifnet_counter cnt) { switch (cnt) { case IFCOUNTER_IPACKETS: return (ifp->if_ipackets); case IFCOUNTER_IERRORS: return (ifp->if_ierrors); case IFCOUNTER_OPACKETS: return (ifp->if_opackets); case IFCOUNTER_OERRORS: return (ifp->if_oerrors); case IFCOUNTER_COLLISIONS: return (ifp->if_collisions); case IFCOUNTER_IBYTES: return (ifp->if_ibytes); case IFCOUNTER_OBYTES: return (ifp->if_obytes); case IFCOUNTER_IMCASTS: return (ifp->if_imcasts); case IFCOUNTER_OMCASTS: return (ifp->if_omcasts); case IFCOUNTER_IQDROPS: return (ifp->if_iqdrops); case IFCOUNTER_OQDROPS: return (ifp->if_oqdrops); case IFCOUNTER_NOPROTO: return (ifp->if_noproto); } panic("%s: unknown counter %d", __func__, cnt); } /* * Copy data from ifnet to userland API structure if_data. */ void if_data_copy(struct ifnet *ifp, struct if_data *ifd) { ifd->ifi_type = ifp->if_type; ifd->ifi_physical = 0; ifd->ifi_addrlen = ifp->if_addrlen; ifd->ifi_hdrlen = ifp->if_hdrlen; ifd->ifi_link_state = ifp->if_link_state; ifd->ifi_vhid = 0; ifd->ifi_datalen = sizeof(struct if_data); ifd->ifi_mtu = ifp->if_mtu; ifd->ifi_metric = ifp->if_metric; ifd->ifi_baudrate = ifp->if_baudrate; ifd->ifi_hwassist = ifp->if_hwassist; ifd->ifi_epoch = ifp->if_epoch; ifd->ifi_lastchange = ifp->if_lastchange; ifd->ifi_ipackets = ifp->if_get_counter(ifp, IFCOUNTER_IPACKETS); ifd->ifi_ierrors = ifp->if_get_counter(ifp, IFCOUNTER_IERRORS); ifd->ifi_opackets = ifp->if_get_counter(ifp, IFCOUNTER_OPACKETS); ifd->ifi_oerrors = ifp->if_get_counter(ifp, IFCOUNTER_OERRORS); ifd->ifi_collisions = ifp->if_get_counter(ifp, IFCOUNTER_COLLISIONS); ifd->ifi_ibytes = ifp->if_get_counter(ifp, IFCOUNTER_IBYTES); ifd->ifi_obytes = ifp->if_get_counter(ifp, IFCOUNTER_OBYTES); ifd->ifi_imcasts = ifp->if_get_counter(ifp, IFCOUNTER_IMCASTS); ifd->ifi_omcasts = ifp->if_get_counter(ifp, IFCOUNTER_OMCASTS); ifd->ifi_iqdrops = ifp->if_get_counter(ifp, IFCOUNTER_IQDROPS); ifd->ifi_oqdrops = ifp->if_get_counter(ifp, IFCOUNTER_OQDROPS); ifd->ifi_noproto = ifp->if_get_counter(ifp, IFCOUNTER_NOPROTO); } /* * Wrapper functions for struct ifnet address list locking macros. These are * used by kernel modules to avoid encoding programming interface or binary * interface assumptions that may be violated when kernel-internal locking * approaches change. */ void if_addr_rlock(struct ifnet *ifp) { IF_ADDR_RLOCK(ifp); } void if_addr_runlock(struct ifnet *ifp) { IF_ADDR_RUNLOCK(ifp); } void if_maddr_rlock(if_t ifp) { IF_ADDR_RLOCK((struct ifnet *)ifp); } void if_maddr_runlock(if_t ifp) { IF_ADDR_RUNLOCK((struct ifnet *)ifp); } /* * Initialization, destruction and refcounting functions for ifaddrs. */ struct ifaddr * ifa_alloc(size_t size, int flags) { struct ifaddr *ifa; KASSERT(size >= sizeof(struct ifaddr), ("%s: invalid size %zu", __func__, size)); ifa = malloc(size, M_IFADDR, M_ZERO | flags); if (ifa == NULL) return (NULL); if ((ifa->ifa_opackets = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_ipackets = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_obytes = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_ibytes = counter_u64_alloc(flags)) == NULL) goto fail; refcount_init(&ifa->ifa_refcnt, 1); return (ifa); fail: /* free(NULL) is okay */ counter_u64_free(ifa->ifa_opackets); counter_u64_free(ifa->ifa_ipackets); counter_u64_free(ifa->ifa_obytes); counter_u64_free(ifa->ifa_ibytes); free(ifa, M_IFADDR); return (NULL); } void ifa_ref(struct ifaddr *ifa) { refcount_acquire(&ifa->ifa_refcnt); } void ifa_free(struct ifaddr *ifa) { if (refcount_release(&ifa->ifa_refcnt)) { counter_u64_free(ifa->ifa_opackets); counter_u64_free(ifa->ifa_ipackets); counter_u64_free(ifa->ifa_obytes); counter_u64_free(ifa->ifa_ibytes); free(ifa, M_IFADDR); } } int ifa_add_loopback_route(struct ifaddr *ifa, struct sockaddr *ia) { int error = 0; struct rtentry *rt = NULL; struct rt_addrinfo info; static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK}; bzero(&info, sizeof(info)); info.rti_ifp = V_loif; info.rti_flags = ifa->ifa_flags | RTF_HOST | RTF_STATIC; info.rti_info[RTAX_DST] = ia; info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&null_sdl; error = rtrequest1_fib(RTM_ADD, &info, &rt, ifa->ifa_ifp->if_fib); if (error == 0 && rt != NULL) { RT_LOCK(rt); ((struct sockaddr_dl *)rt->rt_gateway)->sdl_type = ifa->ifa_ifp->if_type; ((struct sockaddr_dl *)rt->rt_gateway)->sdl_index = ifa->ifa_ifp->if_index; RT_REMREF(rt); RT_UNLOCK(rt); } else if (error != 0) log(LOG_DEBUG, "%s: insertion failed: %u\n", __func__, error); return (error); } int ifa_del_loopback_route(struct ifaddr *ifa, struct sockaddr *ia) { int error = 0; struct rt_addrinfo info; struct sockaddr_dl null_sdl; bzero(&null_sdl, sizeof(null_sdl)); null_sdl.sdl_len = sizeof(null_sdl); null_sdl.sdl_family = AF_LINK; null_sdl.sdl_type = ifa->ifa_ifp->if_type; null_sdl.sdl_index = ifa->ifa_ifp->if_index; bzero(&info, sizeof(info)); info.rti_flags = ifa->ifa_flags | RTF_HOST | RTF_STATIC; info.rti_info[RTAX_DST] = ia; info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&null_sdl; error = rtrequest1_fib(RTM_DELETE, &info, NULL, ifa->ifa_ifp->if_fib); if (error != 0) log(LOG_DEBUG, "%s: deletion failed: %u\n", __func__, error); return (error); } int ifa_switch_loopback_route(struct ifaddr *ifa, struct sockaddr *sa, int fib) { struct rtentry *rt; rt = rtalloc1_fib(sa, 0, 0, fib); if (rt == NULL) { log(LOG_DEBUG, "%s: fail", __func__); return (EHOSTUNREACH); } ((struct sockaddr_dl *)rt->rt_gateway)->sdl_type = ifa->ifa_ifp->if_type; ((struct sockaddr_dl *)rt->rt_gateway)->sdl_index = ifa->ifa_ifp->if_index; RTFREE_LOCKED(rt); return (0); } /* * XXX: Because sockaddr_dl has deeper structure than the sockaddr * structs used to represent other address families, it is necessary * to perform a different comparison. */ #define sa_dl_equal(a1, a2) \ ((((struct sockaddr_dl *)(a1))->sdl_len == \ ((struct sockaddr_dl *)(a2))->sdl_len) && \ (bcmp(LLADDR((struct sockaddr_dl *)(a1)), \ LLADDR((struct sockaddr_dl *)(a2)), \ ((struct sockaddr_dl *)(a1))->sdl_alen) == 0)) /* * Locate an interface based on a complete address. */ /*ARGSUSED*/ static struct ifaddr * ifa_ifwithaddr_internal(struct sockaddr *addr, int getref) { struct ifnet *ifp; struct ifaddr *ifa; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if (sa_equal(addr, ifa->ifa_addr)) { if (getref) ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); goto done; } /* IP6 doesn't have broadcast */ if ((ifp->if_flags & IFF_BROADCAST) && ifa->ifa_broadaddr && ifa->ifa_broadaddr->sa_len != 0 && sa_equal(ifa->ifa_broadaddr, addr)) { if (getref) ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); goto done; } } IF_ADDR_RUNLOCK(ifp); } ifa = NULL; done: IFNET_RUNLOCK_NOSLEEP(); return (ifa); } struct ifaddr * ifa_ifwithaddr(struct sockaddr *addr) { return (ifa_ifwithaddr_internal(addr, 1)); } int ifa_ifwithaddr_check(struct sockaddr *addr) { return (ifa_ifwithaddr_internal(addr, 0) != NULL); } /* * Locate an interface based on the broadcast address. */ /* ARGSUSED */ struct ifaddr * -ifa_ifwithbroadaddr(struct sockaddr *addr) +ifa_ifwithbroadaddr(struct sockaddr *addr, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) + continue; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if ((ifp->if_flags & IFF_BROADCAST) && ifa->ifa_broadaddr && ifa->ifa_broadaddr->sa_len != 0 && sa_equal(ifa->ifa_broadaddr, addr)) { ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); goto done; } } IF_ADDR_RUNLOCK(ifp); } ifa = NULL; done: IFNET_RUNLOCK_NOSLEEP(); return (ifa); } /* * Locate the point to point interface with a given destination address. */ /*ARGSUSED*/ struct ifaddr * -ifa_ifwithdstaddr_fib(struct sockaddr *addr, int fibnum) +ifa_ifwithdstaddr(struct sockaddr *addr, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((ifp->if_flags & IFF_POINTOPOINT) == 0) continue; if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if (ifa->ifa_dstaddr != NULL && sa_equal(addr, ifa->ifa_dstaddr)) { ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); goto done; } } IF_ADDR_RUNLOCK(ifp); } ifa = NULL; done: IFNET_RUNLOCK_NOSLEEP(); return (ifa); } -struct ifaddr * -ifa_ifwithdstaddr(struct sockaddr *addr) -{ - - return (ifa_ifwithdstaddr_fib(addr, RT_ALL_FIBS)); -} - /* * Find an interface on a specific network. If many, choice * is most specific found. */ struct ifaddr * -ifa_ifwithnet_fib(struct sockaddr *addr, int ignore_ptp, int fibnum) +ifa_ifwithnet(struct sockaddr *addr, int ignore_ptp, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; struct ifaddr *ifa_maybe = NULL; u_int af = addr->sa_family; char *addr_data = addr->sa_data, *cplim; /* * AF_LINK addresses can be looked up directly by their index number, * so do that if we can. */ if (af == AF_LINK) { struct sockaddr_dl *sdl = (struct sockaddr_dl *)addr; if (sdl->sdl_index && sdl->sdl_index <= V_if_index) return (ifaddr_byindex(sdl->sdl_index)); } #ifdef INET6 if (af == AF_INET6) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)addr; if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr) && sin6->sin6_scope_id != 0) { ifp = in6_getlinkifnet(sin6->sin6_scope_id); if (ifp != NULL) return ((struct ifaddr *) in6ifa_ifpforlinklocal(ifp, 0)); } } #endif /* * Scan though each interface, looking for ones that have addresses * in this address family and the requested fib. Maintain a reference * on ifa_maybe once we find one, as we release the IF_ADDR_RLOCK() that * kept it stable when we move onto the next interface. */ IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { char *cp, *cp2, *cp3; if (ifa->ifa_addr->sa_family != af) next: continue; if (af == AF_INET && ifp->if_flags & IFF_POINTOPOINT && !ignore_ptp) { /* * This is a bit broken as it doesn't * take into account that the remote end may * be a single node in the network we are * looking for. * The trouble is that we don't know the * netmask for the remote end. */ if (ifa->ifa_dstaddr != NULL && sa_equal(addr, ifa->ifa_dstaddr)) { ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); goto done; } } else { /* * Scan all the bits in the ifa's address. * If a bit dissagrees with what we are * looking for, mask it with the netmask * to see if it really matters. * (A byte at a time) */ if (ifa->ifa_netmask == 0) continue; cp = addr_data; cp2 = ifa->ifa_addr->sa_data; cp3 = ifa->ifa_netmask->sa_data; cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask; while (cp3 < cplim) if ((*cp++ ^ *cp2++) & *cp3++) goto next; /* next address! */ /* * If the netmask of what we just found * is more specific than what we had before * (if we had one), or if the virtual status * of new prefix is better than of the old one, * then remember the new one before continuing * to search for an even better one. */ if (ifa_maybe == NULL || ifa_preferred(ifa_maybe, ifa) || rn_refines((caddr_t)ifa->ifa_netmask, (caddr_t)ifa_maybe->ifa_netmask)) { if (ifa_maybe != NULL) ifa_free(ifa_maybe); ifa_maybe = ifa; ifa_ref(ifa_maybe); } } } IF_ADDR_RUNLOCK(ifp); } ifa = ifa_maybe; ifa_maybe = NULL; done: IFNET_RUNLOCK_NOSLEEP(); if (ifa_maybe != NULL) ifa_free(ifa_maybe); return (ifa); -} - -struct ifaddr * -ifa_ifwithnet(struct sockaddr *addr, int ignore_ptp) -{ - - return (ifa_ifwithnet_fib(addr, ignore_ptp, RT_ALL_FIBS)); } /* * Find an interface address specific to an interface best matching * a given address. */ struct ifaddr * ifaof_ifpforaddr(struct sockaddr *addr, struct ifnet *ifp) { struct ifaddr *ifa; char *cp, *cp2, *cp3; char *cplim; struct ifaddr *ifa_maybe = NULL; u_int af = addr->sa_family; if (af >= AF_MAX) return (NULL); IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != af) continue; if (ifa_maybe == NULL) ifa_maybe = ifa; if (ifa->ifa_netmask == 0) { if (sa_equal(addr, ifa->ifa_addr) || (ifa->ifa_dstaddr && sa_equal(addr, ifa->ifa_dstaddr))) goto done; continue; } if (ifp->if_flags & IFF_POINTOPOINT) { if (sa_equal(addr, ifa->ifa_dstaddr)) goto done; } else { cp = addr->sa_data; cp2 = ifa->ifa_addr->sa_data; cp3 = ifa->ifa_netmask->sa_data; cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask; for (; cp3 < cplim; cp3++) if ((*cp++ ^ *cp2++) & *cp3) break; if (cp3 == cplim) goto done; } } ifa = ifa_maybe; done: if (ifa != NULL) ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); return (ifa); } /* * See whether new ifa is better than current one: * 1) A non-virtual one is preferred over virtual. * 2) A virtual in master state preferred over any other state. * * Used in several address selecting functions. */ int ifa_preferred(struct ifaddr *cur, struct ifaddr *next) { return (cur->ifa_carp && (!next->ifa_carp || ((*carp_master_p)(next) && !(*carp_master_p)(cur)))); } #include /* * Default action when installing a route with a Link Level gateway. * Lookup an appropriate real ifa to point to. * This should be moved to /sys/net/link.c eventually. */ static void link_rtrequest(int cmd, struct rtentry *rt, struct rt_addrinfo *info) { struct ifaddr *ifa, *oifa; struct sockaddr *dst; struct ifnet *ifp; RT_LOCK_ASSERT(rt); if (cmd != RTM_ADD || ((ifa = rt->rt_ifa) == 0) || ((ifp = ifa->ifa_ifp) == 0) || ((dst = rt_key(rt)) == 0)) return; ifa = ifaof_ifpforaddr(dst, ifp); if (ifa) { oifa = rt->rt_ifa; rt->rt_ifa = ifa; ifa_free(oifa); if (ifa->ifa_rtrequest && ifa->ifa_rtrequest != link_rtrequest) ifa->ifa_rtrequest(cmd, rt, info); } } struct sockaddr_dl * link_alloc_sdl(size_t size, int flags) { return (malloc(size, M_TEMP, flags)); } void link_free_sdl(struct sockaddr *sa) { free(sa, M_TEMP); } /* * Fills in given sdl with interface basic info. * Returns pointer to filled sdl. */ struct sockaddr_dl * link_init_sdl(struct ifnet *ifp, struct sockaddr *paddr, u_char iftype) { struct sockaddr_dl *sdl; sdl = (struct sockaddr_dl *)paddr; memset(sdl, 0, sizeof(struct sockaddr_dl)); sdl->sdl_len = sizeof(struct sockaddr_dl); sdl->sdl_family = AF_LINK; sdl->sdl_index = ifp->if_index; sdl->sdl_type = iftype; return (sdl); } /* * Mark an interface down and notify protocols of * the transition. */ static void if_unroute(struct ifnet *ifp, int flag, int fam) { struct ifaddr *ifa; KASSERT(flag == IFF_UP, ("if_unroute: flag != IFF_UP")); ifp->if_flags &= ~flag; getmicrotime(&ifp->if_lastchange); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family)) pfctlinput(PRC_IFDOWN, ifa->ifa_addr); ifp->if_qflush(ifp); if (ifp->if_carp) (*carp_linkstate_p)(ifp); rt_ifmsg(ifp); } /* * Mark an interface up and notify protocols of * the transition. */ static void if_route(struct ifnet *ifp, int flag, int fam) { struct ifaddr *ifa; KASSERT(flag == IFF_UP, ("if_route: flag != IFF_UP")); ifp->if_flags |= flag; getmicrotime(&ifp->if_lastchange); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family)) pfctlinput(PRC_IFUP, ifa->ifa_addr); if (ifp->if_carp) (*carp_linkstate_p)(ifp); rt_ifmsg(ifp); #ifdef INET6 in6_if_up(ifp); #endif } void (*vlan_link_state_p)(struct ifnet *); /* XXX: private from if_vlan */ void (*vlan_trunk_cap_p)(struct ifnet *); /* XXX: private from if_vlan */ struct ifnet *(*vlan_trunkdev_p)(struct ifnet *); struct ifnet *(*vlan_devat_p)(struct ifnet *, uint16_t); int (*vlan_tag_p)(struct ifnet *, uint16_t *); int (*vlan_setcookie_p)(struct ifnet *, void *); void *(*vlan_cookie_p)(struct ifnet *); /* * Handle a change in the interface link state. To avoid LORs * between driver lock and upper layer locks, as well as possible * recursions, we post event to taskqueue, and all job * is done in static do_link_state_change(). */ void if_link_state_change(struct ifnet *ifp, int link_state) { /* Return if state hasn't changed. */ if (ifp->if_link_state == link_state) return; ifp->if_link_state = link_state; taskqueue_enqueue(taskqueue_swi, &ifp->if_linktask); } static void do_link_state_change(void *arg, int pending) { struct ifnet *ifp = (struct ifnet *)arg; int link_state = ifp->if_link_state; CURVNET_SET(ifp->if_vnet); /* Notify that the link state has changed. */ rt_ifmsg(ifp); if (ifp->if_vlantrunk != NULL) (*vlan_link_state_p)(ifp); if ((ifp->if_type == IFT_ETHER || ifp->if_type == IFT_L2VLAN) && IFP2AC(ifp)->ac_netgraph != NULL) (*ng_ether_link_state_p)(ifp, link_state); if (ifp->if_carp) (*carp_linkstate_p)(ifp); if (ifp->if_bridge) (*bridge_linkstate_p)(ifp); if (ifp->if_lagg) (*lagg_linkstate_p)(ifp, link_state); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, (link_state == LINK_STATE_UP) ? "LINK_UP" : "LINK_DOWN", NULL); if (pending > 1) if_printf(ifp, "%d link states coalesced\n", pending); if (log_link_state_change) log(LOG_NOTICE, "%s: link state changed to %s\n", ifp->if_xname, (link_state == LINK_STATE_UP) ? "UP" : "DOWN" ); EVENTHANDLER_INVOKE(ifnet_link_event, ifp, ifp->if_link_state); CURVNET_RESTORE(); } /* * Mark an interface down and notify protocols of * the transition. */ void if_down(struct ifnet *ifp) { if_unroute(ifp, IFF_UP, AF_UNSPEC); } /* * Mark an interface up and notify protocols of * the transition. */ void if_up(struct ifnet *ifp) { if_route(ifp, IFF_UP, AF_UNSPEC); } /* * Flush an interface queue. */ void if_qflush(struct ifnet *ifp) { struct mbuf *m, *n; struct ifaltq *ifq; ifq = &ifp->if_snd; IFQ_LOCK(ifq); #ifdef ALTQ if (ALTQ_IS_ENABLED(ifq)) ALTQ_PURGE(ifq); #endif n = ifq->ifq_head; while ((m = n) != 0) { n = m->m_nextpkt; m_freem(m); } ifq->ifq_head = 0; ifq->ifq_tail = 0; ifq->ifq_len = 0; IFQ_UNLOCK(ifq); } /* * Map interface name to interface structure pointer, with or without * returning a reference. */ struct ifnet * ifunit_ref(const char *name) { struct ifnet *ifp; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0 && !(ifp->if_flags & IFF_DYING)) break; } if (ifp != NULL) if_ref(ifp); IFNET_RUNLOCK_NOSLEEP(); return (ifp); } struct ifnet * ifunit(const char *name) { struct ifnet *ifp; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0) break; } IFNET_RUNLOCK_NOSLEEP(); return (ifp); } /* * Hardware specific interface ioctls. */ static int ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) { struct ifreq *ifr; int error = 0; int new_flags, temp_flags; size_t namelen, onamelen; size_t descrlen; char *descrbuf, *odescrbuf; char new_name[IFNAMSIZ]; struct ifaddr *ifa; struct sockaddr_dl *sdl; ifr = (struct ifreq *)data; switch (cmd) { case SIOCGIFINDEX: ifr->ifr_index = ifp->if_index; break; case SIOCGIFFLAGS: temp_flags = ifp->if_flags | ifp->if_drv_flags; ifr->ifr_flags = temp_flags & 0xffff; ifr->ifr_flagshigh = temp_flags >> 16; break; case SIOCGIFCAP: ifr->ifr_reqcap = ifp->if_capabilities; ifr->ifr_curcap = ifp->if_capenable; break; #ifdef MAC case SIOCGIFMAC: error = mac_ifnet_ioctl_get(td->td_ucred, ifr, ifp); break; #endif case SIOCGIFMETRIC: ifr->ifr_metric = ifp->if_metric; break; case SIOCGIFMTU: ifr->ifr_mtu = ifp->if_mtu; break; case SIOCGIFPHYS: /* XXXGL: did this ever worked? */ ifr->ifr_phys = 0; break; case SIOCGIFDESCR: error = 0; sx_slock(&ifdescr_sx); if (ifp->if_description == NULL) error = ENOMSG; else { /* space for terminating nul */ descrlen = strlen(ifp->if_description) + 1; if (ifr->ifr_buffer.length < descrlen) ifr->ifr_buffer.buffer = NULL; else error = copyout(ifp->if_description, ifr->ifr_buffer.buffer, descrlen); ifr->ifr_buffer.length = descrlen; } sx_sunlock(&ifdescr_sx); break; case SIOCSIFDESCR: error = priv_check(td, PRIV_NET_SETIFDESCR); if (error) return (error); /* * Copy only (length-1) bytes to make sure that * if_description is always nul terminated. The * length parameter is supposed to count the * terminating nul in. */ if (ifr->ifr_buffer.length > ifdescr_maxlen) return (ENAMETOOLONG); else if (ifr->ifr_buffer.length == 0) descrbuf = NULL; else { descrbuf = malloc(ifr->ifr_buffer.length, M_IFDESCR, M_WAITOK | M_ZERO); error = copyin(ifr->ifr_buffer.buffer, descrbuf, ifr->ifr_buffer.length - 1); if (error) { free(descrbuf, M_IFDESCR); break; } } sx_xlock(&ifdescr_sx); odescrbuf = ifp->if_description; ifp->if_description = descrbuf; sx_xunlock(&ifdescr_sx); getmicrotime(&ifp->if_lastchange); free(odescrbuf, M_IFDESCR); break; case SIOCGIFFIB: ifr->ifr_fib = ifp->if_fib; break; case SIOCSIFFIB: error = priv_check(td, PRIV_NET_SETIFFIB); if (error) return (error); if (ifr->ifr_fib >= rt_numfibs) return (EINVAL); ifp->if_fib = ifr->ifr_fib; break; case SIOCSIFFLAGS: error = priv_check(td, PRIV_NET_SETIFFLAGS); if (error) return (error); /* * Currently, no driver owned flags pass the IFF_CANTCHANGE * check, so we don't need special handling here yet. */ new_flags = (ifr->ifr_flags & 0xffff) | (ifr->ifr_flagshigh << 16); if (ifp->if_flags & IFF_UP && (new_flags & IFF_UP) == 0) { if_down(ifp); } else if (new_flags & IFF_UP && (ifp->if_flags & IFF_UP) == 0) { if_up(ifp); } /* See if permanently promiscuous mode bit is about to flip */ if ((ifp->if_flags ^ new_flags) & IFF_PPROMISC) { if (new_flags & IFF_PPROMISC) ifp->if_flags |= IFF_PROMISC; else if (ifp->if_pcount == 0) ifp->if_flags &= ~IFF_PROMISC; log(LOG_INFO, "%s: permanently promiscuous mode %s\n", ifp->if_xname, (new_flags & IFF_PPROMISC) ? "enabled" : "disabled"); } ifp->if_flags = (ifp->if_flags & IFF_CANTCHANGE) | (new_flags &~ IFF_CANTCHANGE); if (ifp->if_ioctl) { (void) (*ifp->if_ioctl)(ifp, cmd, data); } getmicrotime(&ifp->if_lastchange); break; case SIOCSIFCAP: error = priv_check(td, PRIV_NET_SETIFCAP); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); if (ifr->ifr_reqcap & ~ifp->if_capabilities) return (EINVAL); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) getmicrotime(&ifp->if_lastchange); break; #ifdef MAC case SIOCSIFMAC: error = mac_ifnet_ioctl_set(td->td_ucred, ifr, ifp); break; #endif case SIOCSIFNAME: error = priv_check(td, PRIV_NET_SETIFNAME); if (error) return (error); error = copyinstr(ifr->ifr_data, new_name, IFNAMSIZ, NULL); if (error != 0) return (error); if (new_name[0] == '\0') return (EINVAL); if (ifunit(new_name) != NULL) return (EEXIST); /* * XXX: Locking. Nothing else seems to lock if_flags, * and there are numerous other races with the * ifunit() checks not being atomic with namespace * changes (renames, vmoves, if_attach, etc). */ ifp->if_flags |= IFF_RENAMING; /* Announce the departure of the interface. */ rt_ifannouncemsg(ifp, IFAN_DEPARTURE); EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); log(LOG_INFO, "%s: changing name to '%s'\n", ifp->if_xname, new_name); IF_ADDR_WLOCK(ifp); strlcpy(ifp->if_xname, new_name, sizeof(ifp->if_xname)); ifa = ifp->if_addr; sdl = (struct sockaddr_dl *)ifa->ifa_addr; namelen = strlen(new_name); onamelen = sdl->sdl_nlen; /* * Move the address if needed. This is safe because we * allocate space for a name of length IFNAMSIZ when we * create this in if_attach(). */ if (namelen != onamelen) { bcopy(sdl->sdl_data + onamelen, sdl->sdl_data + namelen, sdl->sdl_alen); } bcopy(new_name, sdl->sdl_data, namelen); sdl->sdl_nlen = namelen; sdl = (struct sockaddr_dl *)ifa->ifa_netmask; bzero(sdl->sdl_data, onamelen); while (namelen != 0) sdl->sdl_data[--namelen] = 0xff; IF_ADDR_WUNLOCK(ifp); EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp); /* Announce the return of the interface. */ rt_ifannouncemsg(ifp, IFAN_ARRIVAL); ifp->if_flags &= ~IFF_RENAMING; break; #ifdef VIMAGE case SIOCSIFVNET: error = priv_check(td, PRIV_NET_SETIFVNET); if (error) return (error); error = if_vmove_loan(td, ifp, ifr->ifr_name, ifr->ifr_jid); break; #endif case SIOCSIFMETRIC: error = priv_check(td, PRIV_NET_SETIFMETRIC); if (error) return (error); ifp->if_metric = ifr->ifr_metric; getmicrotime(&ifp->if_lastchange); break; case SIOCSIFPHYS: error = priv_check(td, PRIV_NET_SETIFPHYS); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCSIFMTU: { u_long oldmtu = ifp->if_mtu; error = priv_check(td, PRIV_NET_SETIFMTU); if (error) return (error); if (ifr->ifr_mtu < IF_MINMTU || ifr->ifr_mtu > IF_MAXMTU) return (EINVAL); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) { getmicrotime(&ifp->if_lastchange); rt_ifmsg(ifp); } /* * If the link MTU changed, do network layer specific procedure. */ if (ifp->if_mtu != oldmtu) { #ifdef INET6 nd6_setmtu(ifp); #endif } break; } case SIOCADDMULTI: case SIOCDELMULTI: if (cmd == SIOCADDMULTI) error = priv_check(td, PRIV_NET_ADDMULTI); else error = priv_check(td, PRIV_NET_DELMULTI); if (error) return (error); /* Don't allow group membership on non-multicast interfaces. */ if ((ifp->if_flags & IFF_MULTICAST) == 0) return (EOPNOTSUPP); /* Don't let users screw up protocols' entries. */ if (ifr->ifr_addr.sa_family != AF_LINK) return (EINVAL); if (cmd == SIOCADDMULTI) { struct ifmultiaddr *ifma; /* * Userland is only permitted to join groups once * via the if_addmulti() KPI, because it cannot hold * struct ifmultiaddr * between calls. It may also * lose a race while we check if the membership * already exists. */ IF_ADDR_RLOCK(ifp); ifma = if_findmulti(ifp, &ifr->ifr_addr); IF_ADDR_RUNLOCK(ifp); if (ifma != NULL) error = EADDRINUSE; else error = if_addmulti(ifp, &ifr->ifr_addr, &ifma); } else { error = if_delmulti(ifp, &ifr->ifr_addr); } if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCSIFPHYADDR: case SIOCDIFPHYADDR: #ifdef INET6 case SIOCSIFPHYADDR_IN6: #endif case SIOCSIFMEDIA: case SIOCSIFGENERIC: error = priv_check(td, PRIV_NET_HWIOCTL); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCGIFSTATUS: case SIOCGIFPSRCADDR: case SIOCGIFPDSTADDR: case SIOCGIFMEDIA: case SIOCGIFGENERIC: if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); break; case SIOCSIFLLADDR: error = priv_check(td, PRIV_NET_SETLLADDR); if (error) return (error); error = if_setlladdr(ifp, ifr->ifr_addr.sa_data, ifr->ifr_addr.sa_len); EVENTHANDLER_INVOKE(iflladdr_event, ifp); break; case SIOCAIFGROUP: { struct ifgroupreq *ifgr = (struct ifgroupreq *)ifr; error = priv_check(td, PRIV_NET_ADDIFGROUP); if (error) return (error); if ((error = if_addgroup(ifp, ifgr->ifgr_group))) return (error); break; } case SIOCGIFGROUP: if ((error = if_getgroup((struct ifgroupreq *)ifr, ifp))) return (error); break; case SIOCDIFGROUP: { struct ifgroupreq *ifgr = (struct ifgroupreq *)ifr; error = priv_check(td, PRIV_NET_DELIFGROUP); if (error) return (error); if ((error = if_delgroup(ifp, ifgr->ifgr_group))) return (error); break; } default: error = ENOIOCTL; break; } return (error); } #ifdef COMPAT_FREEBSD32 struct ifconf32 { int32_t ifc_len; union { uint32_t ifcu_buf; uint32_t ifcu_req; } ifc_ifcu; }; #define SIOCGIFCONF32 _IOWR('i', 36, struct ifconf32) #endif /* * Interface ioctls. */ int ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td) { struct ifnet *ifp; struct ifreq *ifr; int error; int oif_flags; CURVNET_SET(so->so_vnet); switch (cmd) { case SIOCGIFCONF: error = ifconf(cmd, data); CURVNET_RESTORE(); return (error); #ifdef COMPAT_FREEBSD32 case SIOCGIFCONF32: { struct ifconf32 *ifc32; struct ifconf ifc; ifc32 = (struct ifconf32 *)data; ifc.ifc_len = ifc32->ifc_len; ifc.ifc_buf = PTRIN(ifc32->ifc_buf); error = ifconf(SIOCGIFCONF, (void *)&ifc); CURVNET_RESTORE(); if (error == 0) ifc32->ifc_len = ifc.ifc_len; return (error); } #endif } ifr = (struct ifreq *)data; switch (cmd) { #ifdef VIMAGE case SIOCSIFRVNET: error = priv_check(td, PRIV_NET_SETIFVNET); if (error == 0) error = if_vmove_reclaim(td, ifr->ifr_name, ifr->ifr_jid); CURVNET_RESTORE(); return (error); #endif case SIOCIFCREATE: case SIOCIFCREATE2: error = priv_check(td, PRIV_NET_IFCREATE); if (error == 0) error = if_clone_create(ifr->ifr_name, sizeof(ifr->ifr_name), cmd == SIOCIFCREATE2 ? ifr->ifr_data : NULL); CURVNET_RESTORE(); return (error); case SIOCIFDESTROY: error = priv_check(td, PRIV_NET_IFDESTROY); if (error == 0) error = if_clone_destroy(ifr->ifr_name); CURVNET_RESTORE(); return (error); case SIOCIFGCLONERS: error = if_clone_list((struct if_clonereq *)data); CURVNET_RESTORE(); return (error); case SIOCGIFGMEMB: error = if_getgroupmembers((struct ifgroupreq *)data); CURVNET_RESTORE(); return (error); #if defined(INET) || defined(INET6) case SIOCSVH: case SIOCGVH: if (carp_ioctl_p == NULL) error = EPROTONOSUPPORT; else error = (*carp_ioctl_p)(ifr, cmd, td); CURVNET_RESTORE(); return (error); #endif } ifp = ifunit_ref(ifr->ifr_name); if (ifp == NULL) { CURVNET_RESTORE(); return (ENXIO); } error = ifhwioctl(cmd, ifp, data, td); if (error != ENOIOCTL) { if_rele(ifp); CURVNET_RESTORE(); return (error); } oif_flags = ifp->if_flags; if (so->so_proto == NULL) { if_rele(ifp); CURVNET_RESTORE(); return (EOPNOTSUPP); } /* * Pass the request on to the socket control method, and if the * latter returns EOPNOTSUPP, directly to the interface. * * Make an exception for the legacy SIOCSIF* requests. Drivers * trust SIOCSIFADDR et al to come from an already privileged * layer, and do not perform any credentials checks or input * validation. */ error = ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, data, ifp, td)); if (error == EOPNOTSUPP && ifp != NULL && ifp->if_ioctl != NULL && cmd != SIOCSIFADDR && cmd != SIOCSIFBRDADDR && cmd != SIOCSIFDSTADDR && cmd != SIOCSIFNETMASK) error = (*ifp->if_ioctl)(ifp, cmd, data); if ((oif_flags ^ ifp->if_flags) & IFF_UP) { #ifdef INET6 if (ifp->if_flags & IFF_UP) in6_if_up(ifp); #endif } if_rele(ifp); CURVNET_RESTORE(); return (error); } /* * The code common to handling reference counted flags, * e.g., in ifpromisc() and if_allmulti(). * The "pflag" argument can specify a permanent mode flag to check, * such as IFF_PPROMISC for promiscuous mode; should be 0 if none. * * Only to be used on stack-owned flags, not driver-owned flags. */ static int if_setflag(struct ifnet *ifp, int flag, int pflag, int *refcount, int onswitch) { struct ifreq ifr; int error; int oldflags, oldcount; /* Sanity checks to catch programming errors */ KASSERT((flag & (IFF_DRV_OACTIVE|IFF_DRV_RUNNING)) == 0, ("%s: setting driver-owned flag %d", __func__, flag)); if (onswitch) KASSERT(*refcount >= 0, ("%s: increment negative refcount %d for flag %d", __func__, *refcount, flag)); else KASSERT(*refcount > 0, ("%s: decrement non-positive refcount %d for flag %d", __func__, *refcount, flag)); /* In case this mode is permanent, just touch refcount */ if (ifp->if_flags & pflag) { *refcount += onswitch ? 1 : -1; return (0); } /* Save ifnet parameters for if_ioctl() may fail */ oldcount = *refcount; oldflags = ifp->if_flags; /* * See if we aren't the only and touching refcount is enough. * Actually toggle interface flag if we are the first or last. */ if (onswitch) { if ((*refcount)++) return (0); ifp->if_flags |= flag; } else { if (--(*refcount)) return (0); ifp->if_flags &= ~flag; } /* Call down the driver since we've changed interface flags */ if (ifp->if_ioctl == NULL) { error = EOPNOTSUPP; goto recover; } ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; error = (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); if (error) goto recover; /* Notify userland that interface flags have changed */ rt_ifmsg(ifp); return (0); recover: /* Recover after driver error */ *refcount = oldcount; ifp->if_flags = oldflags; return (error); } /* * Set/clear promiscuous mode on interface ifp based on the truth value * of pswitch. The calls are reference counted so that only the first * "on" request actually has an effect, as does the final "off" request. * Results are undefined if the "off" and "on" requests are not matched. */ int ifpromisc(struct ifnet *ifp, int pswitch) { int error; int oldflags = ifp->if_flags; error = if_setflag(ifp, IFF_PROMISC, IFF_PPROMISC, &ifp->if_pcount, pswitch); /* If promiscuous mode status has changed, log a message */ if (error == 0 && ((ifp->if_flags ^ oldflags) & IFF_PROMISC)) log(LOG_INFO, "%s: promiscuous mode %s\n", ifp->if_xname, (ifp->if_flags & IFF_PROMISC) ? "enabled" : "disabled"); return (error); } /* * Return interface configuration * of system. List may be used * in later ioctl's (above) to get * other information. */ /*ARGSUSED*/ static int ifconf(u_long cmd, caddr_t data) { struct ifconf *ifc = (struct ifconf *)data; struct ifnet *ifp; struct ifaddr *ifa; struct ifreq ifr; struct sbuf *sb; int error, full = 0, valid_len, max_len; /* Limit initial buffer size to MAXPHYS to avoid DoS from userspace. */ max_len = MAXPHYS - 1; /* Prevent hostile input from being able to crash the system */ if (ifc->ifc_len <= 0) return (EINVAL); again: if (ifc->ifc_len <= max_len) { max_len = ifc->ifc_len; full = 1; } sb = sbuf_new(NULL, NULL, max_len + 1, SBUF_FIXEDLEN); max_len = 0; valid_len = 0; IFNET_RLOCK(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { int addrs; /* * Zero the ifr_name buffer to make sure we don't * disclose the contents of the stack. */ memset(ifr.ifr_name, 0, sizeof(ifr.ifr_name)); if (strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name)) >= sizeof(ifr.ifr_name)) { sbuf_delete(sb); IFNET_RUNLOCK(); return (ENAMETOOLONG); } addrs = 0; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa = ifa->ifa_addr; if (prison_if(curthread->td_ucred, sa) != 0) continue; addrs++; if (sa->sa_len <= sizeof(*sa)) { ifr.ifr_addr = *sa; sbuf_bcat(sb, &ifr, sizeof(ifr)); max_len += sizeof(ifr); } else { sbuf_bcat(sb, &ifr, offsetof(struct ifreq, ifr_addr)); max_len += offsetof(struct ifreq, ifr_addr); sbuf_bcat(sb, sa, sa->sa_len); max_len += sa->sa_len; } if (sbuf_error(sb) == 0) valid_len = sbuf_len(sb); } IF_ADDR_RUNLOCK(ifp); if (addrs == 0) { bzero((caddr_t)&ifr.ifr_addr, sizeof(ifr.ifr_addr)); sbuf_bcat(sb, &ifr, sizeof(ifr)); max_len += sizeof(ifr); if (sbuf_error(sb) == 0) valid_len = sbuf_len(sb); } } IFNET_RUNLOCK(); /* * If we didn't allocate enough space (uncommon), try again. If * we have already allocated as much space as we are allowed, * return what we've got. */ if (valid_len != max_len && !full) { sbuf_delete(sb); goto again; } ifc->ifc_len = valid_len; sbuf_finish(sb); error = copyout(sbuf_data(sb), ifc->ifc_req, ifc->ifc_len); sbuf_delete(sb); return (error); } /* * Just like ifpromisc(), but for all-multicast-reception mode. */ int if_allmulti(struct ifnet *ifp, int onswitch) { return (if_setflag(ifp, IFF_ALLMULTI, 0, &ifp->if_amcount, onswitch)); } struct ifmultiaddr * if_findmulti(struct ifnet *ifp, struct sockaddr *sa) { struct ifmultiaddr *ifma; IF_ADDR_LOCK_ASSERT(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (sa->sa_family == AF_LINK) { if (sa_dl_equal(ifma->ifma_addr, sa)) break; } else { if (sa_equal(ifma->ifma_addr, sa)) break; } } return ifma; } /* * Allocate a new ifmultiaddr and initialize based on passed arguments. We * make copies of passed sockaddrs. The ifmultiaddr will not be added to * the ifnet multicast address list here, so the caller must do that and * other setup work (such as notifying the device driver). The reference * count is initialized to 1. */ static struct ifmultiaddr * if_allocmulti(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr *llsa, int mflags) { struct ifmultiaddr *ifma; struct sockaddr *dupsa; ifma = malloc(sizeof *ifma, M_IFMADDR, mflags | M_ZERO); if (ifma == NULL) return (NULL); dupsa = malloc(sa->sa_len, M_IFMADDR, mflags); if (dupsa == NULL) { free(ifma, M_IFMADDR); return (NULL); } bcopy(sa, dupsa, sa->sa_len); ifma->ifma_addr = dupsa; ifma->ifma_ifp = ifp; ifma->ifma_refcount = 1; ifma->ifma_protospec = NULL; if (llsa == NULL) { ifma->ifma_lladdr = NULL; return (ifma); } dupsa = malloc(llsa->sa_len, M_IFMADDR, mflags); if (dupsa == NULL) { free(ifma->ifma_addr, M_IFMADDR); free(ifma, M_IFMADDR); return (NULL); } bcopy(llsa, dupsa, llsa->sa_len); ifma->ifma_lladdr = dupsa; return (ifma); } /* * if_freemulti: free ifmultiaddr structure and possibly attached related * addresses. The caller is responsible for implementing reference * counting, notifying the driver, handling routing messages, and releasing * any dependent link layer state. */ static void if_freemulti(struct ifmultiaddr *ifma) { KASSERT(ifma->ifma_refcount == 0, ("if_freemulti: refcount %d", ifma->ifma_refcount)); if (ifma->ifma_lladdr != NULL) free(ifma->ifma_lladdr, M_IFMADDR); free(ifma->ifma_addr, M_IFMADDR); free(ifma, M_IFMADDR); } /* * Register an additional multicast address with a network interface. * * - If the address is already present, bump the reference count on the * address and return. * - If the address is not link-layer, look up a link layer address. * - Allocate address structures for one or both addresses, and attach to the * multicast address list on the interface. If automatically adding a link * layer address, the protocol address will own a reference to the link * layer address, to be freed when it is freed. * - Notify the network device driver of an addition to the multicast address * list. * * 'sa' points to caller-owned memory with the desired multicast address. * * 'retifma' will be used to return a pointer to the resulting multicast * address reference, if desired. */ int if_addmulti(struct ifnet *ifp, struct sockaddr *sa, struct ifmultiaddr **retifma) { struct ifmultiaddr *ifma, *ll_ifma; struct sockaddr *llsa; struct sockaddr_dl sdl; int error; /* * If the address is already present, return a new reference to it; * otherwise, allocate storage and set up a new address. */ IF_ADDR_WLOCK(ifp); ifma = if_findmulti(ifp, sa); if (ifma != NULL) { ifma->ifma_refcount++; if (retifma != NULL) *retifma = ifma; IF_ADDR_WUNLOCK(ifp); return (0); } /* * The address isn't already present; resolve the protocol address * into a link layer address, and then look that up, bump its * refcount or allocate an ifma for that also. * Most link layer resolving functions returns address data which * fits inside default sockaddr_dl structure. However callback * can allocate another sockaddr structure, in that case we need to * free it later. */ llsa = NULL; ll_ifma = NULL; if (ifp->if_resolvemulti != NULL) { /* Provide called function with buffer size information */ sdl.sdl_len = sizeof(sdl); llsa = (struct sockaddr *)&sdl; error = ifp->if_resolvemulti(ifp, &llsa, sa); if (error) goto unlock_out; } /* * Allocate the new address. Don't hook it up yet, as we may also * need to allocate a link layer multicast address. */ ifma = if_allocmulti(ifp, sa, llsa, M_NOWAIT); if (ifma == NULL) { error = ENOMEM; goto free_llsa_out; } /* * If a link layer address is found, we'll need to see if it's * already present in the address list, or allocate is as well. * When this block finishes, the link layer address will be on the * list. */ if (llsa != NULL) { ll_ifma = if_findmulti(ifp, llsa); if (ll_ifma == NULL) { ll_ifma = if_allocmulti(ifp, llsa, NULL, M_NOWAIT); if (ll_ifma == NULL) { --ifma->ifma_refcount; if_freemulti(ifma); error = ENOMEM; goto free_llsa_out; } TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ll_ifma, ifma_link); } else ll_ifma->ifma_refcount++; ifma->ifma_llifma = ll_ifma; } /* * We now have a new multicast address, ifma, and possibly a new or * referenced link layer address. Add the primary address to the * ifnet address list. */ TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link); if (retifma != NULL) *retifma = ifma; /* * Must generate the message while holding the lock so that 'ifma' * pointer is still valid. */ rt_newmaddrmsg(RTM_NEWMADDR, ifma); IF_ADDR_WUNLOCK(ifp); /* * We are certain we have added something, so call down to the * interface to let them know about it. */ if (ifp->if_ioctl != NULL) { (void) (*ifp->if_ioctl)(ifp, SIOCADDMULTI, 0); } if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl)) link_free_sdl(llsa); return (0); free_llsa_out: if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl)) link_free_sdl(llsa); unlock_out: IF_ADDR_WUNLOCK(ifp); return (error); } /* * Delete a multicast group membership by network-layer group address. * * Returns ENOENT if the entry could not be found. If ifp no longer * exists, results are undefined. This entry point should only be used * from subsystems which do appropriate locking to hold ifp for the * duration of the call. * Network-layer protocol domains must use if_delmulti_ifma(). */ int if_delmulti(struct ifnet *ifp, struct sockaddr *sa) { struct ifmultiaddr *ifma; int lastref; #ifdef INVARIANTS struct ifnet *oifp; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(oifp, &V_ifnet, if_link) if (ifp == oifp) break; if (ifp != oifp) ifp = NULL; IFNET_RUNLOCK_NOSLEEP(); KASSERT(ifp != NULL, ("%s: ifnet went away", __func__)); #endif if (ifp == NULL) return (ENOENT); IF_ADDR_WLOCK(ifp); lastref = 0; ifma = if_findmulti(ifp, sa); if (ifma != NULL) lastref = if_delmulti_locked(ifp, ifma, 0); IF_ADDR_WUNLOCK(ifp); if (ifma == NULL) return (ENOENT); if (lastref && ifp->if_ioctl != NULL) { (void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0); } return (0); } /* * Delete all multicast group membership for an interface. * Should be used to quickly flush all multicast filters. */ void if_delallmulti(struct ifnet *ifp) { struct ifmultiaddr *ifma; struct ifmultiaddr *next; IF_ADDR_WLOCK(ifp); TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) if_delmulti_locked(ifp, ifma, 0); IF_ADDR_WUNLOCK(ifp); } /* * Delete a multicast group membership by group membership pointer. * Network-layer protocol domains must use this routine. * * It is safe to call this routine if the ifp disappeared. */ void if_delmulti_ifma(struct ifmultiaddr *ifma) { struct ifnet *ifp; int lastref; ifp = ifma->ifma_ifp; #ifdef DIAGNOSTIC if (ifp == NULL) { printf("%s: ifma_ifp seems to be detached\n", __func__); } else { struct ifnet *oifp; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(oifp, &V_ifnet, if_link) if (ifp == oifp) break; if (ifp != oifp) { printf("%s: ifnet %p disappeared\n", __func__, ifp); ifp = NULL; } IFNET_RUNLOCK_NOSLEEP(); } #endif /* * If and only if the ifnet instance exists: Acquire the address lock. */ if (ifp != NULL) IF_ADDR_WLOCK(ifp); lastref = if_delmulti_locked(ifp, ifma, 0); if (ifp != NULL) { /* * If and only if the ifnet instance exists: * Release the address lock. * If the group was left: update the hardware hash filter. */ IF_ADDR_WUNLOCK(ifp); if (lastref && ifp->if_ioctl != NULL) { (void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0); } } } /* * Perform deletion of network-layer and/or link-layer multicast address. * * Return 0 if the reference count was decremented. * Return 1 if the final reference was released, indicating that the * hardware hash filter should be reprogrammed. */ static int if_delmulti_locked(struct ifnet *ifp, struct ifmultiaddr *ifma, int detaching) { struct ifmultiaddr *ll_ifma; if (ifp != NULL && ifma->ifma_ifp != NULL) { KASSERT(ifma->ifma_ifp == ifp, ("%s: inconsistent ifp %p", __func__, ifp)); IF_ADDR_WLOCK_ASSERT(ifp); } ifp = ifma->ifma_ifp; /* * If the ifnet is detaching, null out references to ifnet, * so that upper protocol layers will notice, and not attempt * to obtain locks for an ifnet which no longer exists. The * routing socket announcement must happen before the ifnet * instance is detached from the system. */ if (detaching) { #ifdef DIAGNOSTIC printf("%s: detaching ifnet instance %p\n", __func__, ifp); #endif /* * ifp may already be nulled out if we are being reentered * to delete the ll_ifma. */ if (ifp != NULL) { rt_newmaddrmsg(RTM_DELMADDR, ifma); ifma->ifma_ifp = NULL; } } if (--ifma->ifma_refcount > 0) return 0; /* * If this ifma is a network-layer ifma, a link-layer ifma may * have been associated with it. Release it first if so. */ ll_ifma = ifma->ifma_llifma; if (ll_ifma != NULL) { KASSERT(ifma->ifma_lladdr != NULL, ("%s: llifma w/o lladdr", __func__)); if (detaching) ll_ifma->ifma_ifp = NULL; /* XXX */ if (--ll_ifma->ifma_refcount == 0) { if (ifp != NULL) { TAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifma_link); } if_freemulti(ll_ifma); } } if (ifp != NULL) TAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifma_link); if_freemulti(ifma); /* * The last reference to this instance of struct ifmultiaddr * was released; the hardware should be notified of this change. */ return 1; } /* * Set the link layer address on an interface. * * At this time we only support certain types of interfaces, * and we don't allow the length of the address to change. */ int if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len) { struct sockaddr_dl *sdl; struct ifaddr *ifa; struct ifreq ifr; IF_ADDR_RLOCK(ifp); ifa = ifp->if_addr; if (ifa == NULL) { IF_ADDR_RUNLOCK(ifp); return (EINVAL); } ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); sdl = (struct sockaddr_dl *)ifa->ifa_addr; if (sdl == NULL) { ifa_free(ifa); return (EINVAL); } if (len != sdl->sdl_alen) { /* don't allow length to change */ ifa_free(ifa); return (EINVAL); } switch (ifp->if_type) { case IFT_ETHER: case IFT_FDDI: case IFT_XETHER: case IFT_ISO88025: case IFT_L2VLAN: case IFT_BRIDGE: case IFT_ARCNET: case IFT_IEEE8023ADLAG: case IFT_IEEE80211: bcopy(lladdr, LLADDR(sdl), len); ifa_free(ifa); break; default: ifa_free(ifa); return (ENODEV); } /* * If the interface is already up, we need * to re-init it in order to reprogram its * address filter. */ if ((ifp->if_flags & IFF_UP) != 0) { if (ifp->if_ioctl) { ifp->if_flags &= ~IFF_UP; ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); ifp->if_flags |= IFF_UP; ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); } #ifdef INET /* * Also send gratuitous ARPs to notify other nodes about * the address change. */ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family == AF_INET) arp_ifinit(ifp, ifa); } #endif } return (0); } /* * The name argument must be a pointer to storage which will last as * long as the interface does. For physical devices, the result of * device_get_name(dev) is a good choice and for pseudo-devices a * static string works well. */ void if_initname(struct ifnet *ifp, const char *name, int unit) { ifp->if_dname = name; ifp->if_dunit = unit; if (unit != IF_DUNIT_NONE) snprintf(ifp->if_xname, IFNAMSIZ, "%s%d", name, unit); else strlcpy(ifp->if_xname, name, IFNAMSIZ); } int if_printf(struct ifnet *ifp, const char * fmt, ...) { va_list ap; int retval; retval = printf("%s: ", ifp->if_xname); va_start(ap, fmt); retval += vprintf(fmt, ap); va_end(ap); return (retval); } void if_start(struct ifnet *ifp) { (*(ifp)->if_start)(ifp); } /* * Backwards compatibility interface for drivers * that have not implemented it */ static int if_transmit(struct ifnet *ifp, struct mbuf *m) { int error; IFQ_HANDOFF(ifp, m, error); return (error); } int if_handoff(struct ifqueue *ifq, struct mbuf *m, struct ifnet *ifp, int adjust) { int active = 0; IF_LOCK(ifq); if (_IF_QFULL(ifq)) { _IF_DROP(ifq); IF_UNLOCK(ifq); m_freem(m); return (0); } if (ifp != NULL) { ifp->if_obytes += m->m_pkthdr.len + adjust; if (m->m_flags & (M_BCAST|M_MCAST)) ifp->if_omcasts++; active = ifp->if_drv_flags & IFF_DRV_OACTIVE; } _IF_ENQUEUE(ifq, m); IF_UNLOCK(ifq); if (ifp != NULL && !active) (*(ifp)->if_start)(ifp); return (1); } void if_register_com_alloc(u_char type, if_com_alloc_t *a, if_com_free_t *f) { KASSERT(if_com_alloc[type] == NULL, ("if_register_com_alloc: %d already registered", type)); KASSERT(if_com_free[type] == NULL, ("if_register_com_alloc: %d free already registered", type)); if_com_alloc[type] = a; if_com_free[type] = f; } void if_deregister_com_alloc(u_char type) { KASSERT(if_com_alloc[type] != NULL, ("if_deregister_com_alloc: %d not registered", type)); KASSERT(if_com_free[type] != NULL, ("if_deregister_com_alloc: %d free not registered", type)); if_com_alloc[type] = NULL; if_com_free[type] = NULL; } /* API for driver access to network stack owned ifnet.*/ uint64_t if_setbaudrate(struct ifnet *ifp, uint64_t baudrate) { uint64_t oldbrate; oldbrate = ifp->if_baudrate; ifp->if_baudrate = baudrate; return (oldbrate); } uint64_t if_getbaudrate(if_t ifp) { return (((struct ifnet *)ifp)->if_baudrate); } int if_setcapabilities(if_t ifp, int capabilities) { ((struct ifnet *)ifp)->if_capabilities = capabilities; return (0); } int if_setcapabilitiesbit(if_t ifp, int setbit, int clearbit) { ((struct ifnet *)ifp)->if_capabilities |= setbit; ((struct ifnet *)ifp)->if_capabilities &= ~clearbit; return (0); } int if_getcapabilities(if_t ifp) { return ((struct ifnet *)ifp)->if_capabilities; } int if_setcapenable(if_t ifp, int capabilities) { ((struct ifnet *)ifp)->if_capenable = capabilities; return (0); } int if_setcapenablebit(if_t ifp, int setcap, int clearcap) { if(setcap) ((struct ifnet *)ifp)->if_capenable |= setcap; if(clearcap) ((struct ifnet *)ifp)->if_capenable &= ~clearcap; return (0); } const char * if_getdname(if_t ifp) { return ((struct ifnet *)ifp)->if_dname; } int if_togglecapenable(if_t ifp, int togglecap) { ((struct ifnet *)ifp)->if_capenable ^= togglecap; return (0); } int if_getcapenable(if_t ifp) { return ((struct ifnet *)ifp)->if_capenable; } /* * This is largely undesirable because it ties ifnet to a device, but does * provide flexiblity for an embedded product vendor. Should be used with * the understanding that it violates the interface boundaries, and should be * a last resort only. */ int if_setdev(if_t ifp, void *dev) { return (0); } int if_setdrvflagbits(if_t ifp, int set_flags, int clear_flags) { ((struct ifnet *)ifp)->if_drv_flags |= set_flags; ((struct ifnet *)ifp)->if_drv_flags &= ~clear_flags; return (0); } int if_getdrvflags(if_t ifp) { return ((struct ifnet *)ifp)->if_drv_flags; } int if_setdrvflags(if_t ifp, int flags) { ((struct ifnet *)ifp)->if_drv_flags = flags; return (0); } int if_setflags(if_t ifp, int flags) { ((struct ifnet *)ifp)->if_flags = flags; return (0); } int if_setflagbits(if_t ifp, int set, int clear) { ((struct ifnet *)ifp)->if_flags |= set; ((struct ifnet *)ifp)->if_flags &= ~clear; return (0); } int if_getflags(if_t ifp) { return ((struct ifnet *)ifp)->if_flags; } int if_clearhwassist(if_t ifp) { ((struct ifnet *)ifp)->if_hwassist = 0; return (0); } int if_sethwassistbits(if_t ifp, int toset, int toclear) { ((struct ifnet *)ifp)->if_hwassist |= toset; ((struct ifnet *)ifp)->if_hwassist &= ~toclear; return (0); } int if_sethwassist(if_t ifp, int hwassist_bit) { ((struct ifnet *)ifp)->if_hwassist = hwassist_bit; return (0); } int if_gethwassist(if_t ifp) { return ((struct ifnet *)ifp)->if_hwassist; } int if_setmtu(if_t ifp, int mtu) { ((struct ifnet *)ifp)->if_mtu = mtu; return (0); } int if_getmtu(if_t ifp) { return ((struct ifnet *)ifp)->if_mtu; } int if_setsoftc(if_t ifp, void *softc) { ((struct ifnet *)ifp)->if_softc = softc; return (0); } void * if_getsoftc(if_t ifp) { return ((struct ifnet *)ifp)->if_softc; } void if_setrcvif(struct mbuf *m, if_t ifp) { m->m_pkthdr.rcvif = (struct ifnet *)ifp; } void if_setvtag(struct mbuf *m, uint16_t tag) { m->m_pkthdr.ether_vtag = tag; } uint16_t if_getvtag(struct mbuf *m) { return (m->m_pkthdr.ether_vtag); } /* Statistics */ int if_incipackets(if_t ifp, int pkts) { ((struct ifnet *)ifp)->if_ipackets += pkts; return (0); } int if_incopackets(if_t ifp, int pkts) { ((struct ifnet *)ifp)->if_opackets += pkts; return (0); } int if_incierrors(if_t ifp, int ierrors) { ((struct ifnet *)ifp)->if_ierrors += ierrors; return (0); } int if_setierrors(if_t ifp, int ierrors) { ((struct ifnet *)ifp)->if_ierrors = ierrors; return (0); } int if_setoerrors(if_t ifp, int oerrors) { ((struct ifnet *)ifp)->if_oerrors = oerrors; return (0); } int if_incoerrors(if_t ifp, int oerrors) { ((struct ifnet *)ifp)->if_oerrors += oerrors; return (0); } int if_inciqdrops(if_t ifp, int val) { ((struct ifnet *)ifp)->if_iqdrops += val; return (0); } int if_setcollisions(if_t ifp, int collisions) { ((struct ifnet *)ifp)->if_collisions = collisions; return (0); } int if_inccollisions(if_t ifp, int collisions) { ((struct ifnet *)ifp)->if_collisions += collisions; return (0); } int if_setipackets(if_t ifp, int pkts) { ((struct ifnet *)ifp)->if_ipackets = pkts; return (0); } int if_setopackets(if_t ifp, int pkts) { ((struct ifnet *)ifp)->if_opackets = pkts; return (0); } int if_incobytes(if_t ifp, int bytes) { ((struct ifnet *)ifp)->if_obytes += bytes; return (0); } int if_setibytes(if_t ifp, int bytes) { ((struct ifnet *)ifp)->if_ibytes = bytes; return (0); } int if_setobytes(if_t ifp, int bytes) { ((struct ifnet *)ifp)->if_obytes = bytes; return (0); } int if_sendq_empty(if_t ifp) { return IFQ_DRV_IS_EMPTY(&((struct ifnet *)ifp)->if_snd); } int if_getiqdrops(if_t ifp) { return ((struct ifnet *)ifp)->if_iqdrops; } int if_incimcasts(if_t ifp, int mcast) { ((struct ifnet *)ifp)->if_imcasts += mcast; return (0); } int if_incomcasts(if_t ifp, int mcast) { ((struct ifnet *)ifp)->if_omcasts += mcast; return (0); } int if_setimcasts(if_t ifp, int mcast) { ((struct ifnet *)ifp)->if_imcasts = mcast; return (0); } struct ifaddr * if_getifaddr(if_t ifp) { return ((struct ifnet *)ifp)->if_addr; } int if_getamcount(if_t ifp) { return ((struct ifnet *)ifp)->if_amcount; } int if_setsendqready(if_t ifp) { IFQ_SET_READY(&((struct ifnet *)ifp)->if_snd); return (0); } int if_setsendqlen(if_t ifp, int tx_desc_count) { IFQ_SET_MAXLEN(&((struct ifnet *)ifp)->if_snd, tx_desc_count); ((struct ifnet *)ifp)->if_snd.ifq_drv_maxlen = tx_desc_count; return (0); } int if_vlantrunkinuse(if_t ifp) { return ((struct ifnet *)ifp)->if_vlantrunk != NULL?1:0; } int if_input(if_t ifp, struct mbuf* sendmp) { (*((struct ifnet *)ifp)->if_input)((struct ifnet *)ifp, sendmp); return (0); } /* XXX */ #ifndef ETH_ADDR_LEN #define ETH_ADDR_LEN 6 #endif int if_setupmultiaddr(if_t ifp, void *mta, int *cnt, int max) { struct ifmultiaddr *ifma; uint8_t *lmta = (uint8_t *)mta; int mcnt = 0; TAILQ_FOREACH(ifma, &((struct ifnet *)ifp)->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; if (mcnt == max) break; bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr), &lmta[mcnt * ETH_ADDR_LEN], ETH_ADDR_LEN); mcnt++; } *cnt = mcnt; return (0); } int if_multiaddr_array(if_t ifp, void *mta, int *cnt, int max) { int error; if_maddr_rlock(ifp); error = if_setupmultiaddr(ifp, mta, cnt, max); if_maddr_runlock(ifp); return (error); } int if_multiaddr_count(if_t ifp, int max) { struct ifmultiaddr *ifma; int count; count = 0; if_maddr_rlock(ifp); TAILQ_FOREACH(ifma, &((struct ifnet *)ifp)->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; count++; if (count == max) break; } if_maddr_runlock(ifp); return (count); } struct mbuf * if_dequeue(if_t ifp) { struct mbuf *m; IFQ_DRV_DEQUEUE(&((struct ifnet *)ifp)->if_snd, m); return (m); } int if_sendq_prepend(if_t ifp, struct mbuf *m) { IFQ_DRV_PREPEND(&((struct ifnet *)ifp)->if_snd, m); return (0); } int if_setifheaderlen(if_t ifp, int len) { ((struct ifnet *)ifp)->if_hdrlen = len; return (0); } caddr_t if_getlladdr(if_t ifp) { return (IF_LLADDR((struct ifnet *)ifp)); } void * if_gethandle(u_char type) { return (if_alloc(type)); } void if_bpfmtap(if_t ifh, struct mbuf *m) { struct ifnet *ifp = (struct ifnet *)ifh; BPF_MTAP(ifp, m); } void if_etherbpfmtap(if_t ifh, struct mbuf *m) { struct ifnet *ifp = (struct ifnet *)ifh; ETHER_BPF_MTAP(ifp, m); } void if_vlancap(if_t ifh) { struct ifnet *ifp = (struct ifnet *)ifh; VLAN_CAPABILITIES(ifp); } void if_setinitfn(if_t ifp, void (*init_fn)(void *)) { ((struct ifnet *)ifp)->if_init = init_fn; } void if_setioctlfn(if_t ifp, int (*ioctl_fn)(if_t, u_long, caddr_t)) { ((struct ifnet *)ifp)->if_ioctl = (void *)ioctl_fn; } void if_setstartfn(if_t ifp, void (*start_fn)(if_t)) { ((struct ifnet *)ifp)->if_start = (void *)start_fn; } void if_settransmitfn(if_t ifp, if_transmit_fn_t start_fn) { ((struct ifnet *)ifp)->if_transmit = start_fn; } void if_setqflushfn(if_t ifp, if_qflush_fn_t flush_fn) { ((struct ifnet *)ifp)->if_qflush = flush_fn; } /* Revisit these - These are inline functions originally. */ int drbr_inuse_drv(if_t ifh, struct buf_ring *br) { return drbr_inuse_drv(ifh, br); } struct mbuf* drbr_dequeue_drv(if_t ifh, struct buf_ring *br) { return drbr_dequeue(ifh, br); } int drbr_needs_enqueue_drv(if_t ifh, struct buf_ring *br) { return drbr_needs_enqueue(ifh, br); } int drbr_enqueue_drv(if_t ifh, struct buf_ring *br, struct mbuf *m) { return drbr_enqueue(ifh, br, m); } Index: user/ae/inet6/sys/net/if_var.h =================================================================== --- user/ae/inet6/sys/net/if_var.h (revision 271452) +++ user/ae/inet6/sys/net/if_var.h (revision 271453) @@ -1,622 +1,619 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * From: @(#)if.h 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ #ifndef _NET_IF_VAR_H_ #define _NET_IF_VAR_H_ /* * Structures defining a network interface, providing a packet * transport mechanism (ala level 0 of the PUP protocols). * * Each interface accepts output datagrams of a specified maximum * length, and provides higher level routines with input datagrams * received from its medium. * * Output occurs when the routine if_output is called, with three parameters: * (*ifp->if_output)(ifp, m, dst, rt) * Here m is the mbuf chain to be sent and dst is the destination address. * The output routine encapsulates the supplied datagram if necessary, * and then transmits it on its medium. * * On input, each interface unwraps the data received by it, and either * places it on the input queue of an internetwork datagram routine * and posts the associated software interrupt, or passes the datagram to a raw * packet input routine. * * Routines exist for locating interfaces by their addresses * or for locating an interface on a certain network, as well as more general * routing and gateway routines maintaining information used to locate * interfaces. These routines live in the files if.c and route.c */ struct rtentry; /* ifa_rtrequest */ struct rt_addrinfo; /* ifa_rtrequest */ struct socket; struct carp_if; struct carp_softc; struct ifvlantrunk; struct route; /* if_output */ struct vnet; struct ifmedia; struct netmap_adapter; #ifdef _KERNEL #include /* ifqueue only? */ #include #include #endif /* _KERNEL */ #include #include /* XXX */ #include /* struct ifqueue */ #include /* XXX */ #include /* XXX */ #include /* if_link_task */ #define IF_DUNIT_NONE -1 #include TAILQ_HEAD(ifnethead, ifnet); /* we use TAILQs so that the order of */ TAILQ_HEAD(ifaddrhead, ifaddr); /* instantiation is preserved in the list */ TAILQ_HEAD(ifmultihead, ifmultiaddr); TAILQ_HEAD(ifgrouphead, ifg_group); #ifdef _KERNEL VNET_DECLARE(struct pfil_head, link_pfil_hook); /* packet filter hooks */ #define V_link_pfil_hook VNET(link_pfil_hook) #endif /* _KERNEL */ typedef enum { IFCOUNTER_IPACKETS = 1, IFCOUNTER_IERRORS, IFCOUNTER_OPACKETS, IFCOUNTER_OERRORS, IFCOUNTER_COLLISIONS, IFCOUNTER_IBYTES, IFCOUNTER_OBYTES, IFCOUNTER_IMCASTS, IFCOUNTER_OMCASTS, IFCOUNTER_IQDROPS, IFCOUNTER_OQDROPS, IFCOUNTER_NOPROTO, } ifnet_counter; typedef struct ifnet * if_t; typedef void (*if_start_fn_t)(if_t); typedef int (*if_ioctl_fn_t)(if_t, u_long, caddr_t); typedef void (*if_init_fn_t)(void *); typedef void (*if_qflush_fn_t)(if_t); typedef int (*if_transmit_fn_t)(if_t, struct mbuf *); typedef uint64_t (*if_get_counter_t)(if_t, ifnet_counter); /* * Structure defining a network interface. * * Size ILP32: 592 (approx) * LP64: 1048 (approx) */ struct ifnet { /* General book keeping of interface lists. */ TAILQ_ENTRY(ifnet) if_link; /* all struct ifnets are chained */ LIST_ENTRY(ifnet) if_clones; /* interfaces of a cloner */ TAILQ_HEAD(, ifg_list) if_groups; /* linked list of groups per if */ /* protected by if_addr_lock */ u_char if_alloctype; /* if_type at time of allocation */ /* Driver and protocol specific information that remains stable. */ void *if_softc; /* pointer to driver state */ void *if_llsoftc; /* link layer softc */ void *if_l2com; /* pointer to protocol bits */ const char *if_dname; /* driver name */ int if_dunit; /* unit or IF_DUNIT_NONE */ u_short if_index; /* numeric abbreviation for this if */ short if_index_reserved; /* spare space to grow if_index */ char if_xname[IFNAMSIZ]; /* external name (name + unit) */ char *if_description; /* interface description */ /* Variable fields that are touched by the stack and drivers. */ int if_flags; /* up/down, broadcast, etc. */ int if_drv_flags; /* driver-managed status flags */ int if_capabilities; /* interface features & capabilities */ int if_capenable; /* enabled features & capabilities */ void *if_linkmib; /* link-type-specific MIB data */ size_t if_linkmiblen; /* length of above data */ u_int if_refcount; /* reference count */ /* These fields are shared with struct if_data. */ uint8_t if_type; /* ethernet, tokenring, etc */ uint8_t if_addrlen; /* media address length */ uint8_t if_hdrlen; /* media header length */ uint8_t if_link_state; /* current link state */ uint32_t if_mtu; /* maximum transmission unit */ uint32_t if_metric; /* routing metric (external only) */ uint64_t if_baudrate; /* linespeed */ uint64_t if_hwassist; /* HW offload capabilities, see IFCAP */ time_t if_epoch; /* uptime at attach or stat reset */ struct timeval if_lastchange; /* time of last administrative change */ struct ifaltq if_snd; /* output queue (includes altq) */ struct task if_linktask; /* task for link change events */ /* Addresses of different protocol families assigned to this if. */ struct rwlock if_addr_lock; /* lock to protect address lists */ /* * if_addrhead is the list of all addresses associated to * an interface. * Some code in the kernel assumes that first element * of the list has type AF_LINK, and contains sockaddr_dl * addresses which store the link-level address and the name * of the interface. * However, access to the AF_LINK address through this * field is deprecated. Use if_addr or ifaddr_byindex() instead. */ struct ifaddrhead if_addrhead; /* linked list of addresses per if */ struct ifmultihead if_multiaddrs; /* multicast addresses configured */ int if_amcount; /* number of all-multicast requests */ struct ifaddr *if_addr; /* pointer to link-level address */ const u_int8_t *if_broadcastaddr; /* linklevel broadcast bytestring */ struct rwlock if_afdata_lock; void *if_afdata[AF_MAX]; int if_afdata_initialized; /* Additional features hung off the interface. */ u_int if_fib; /* interface FIB */ struct vnet *if_vnet; /* pointer to network stack instance */ struct vnet *if_home_vnet; /* where this ifnet originates from */ struct ifvlantrunk *if_vlantrunk; /* pointer to 802.1q data */ struct bpf_if *if_bpf; /* packet filter structure */ int if_pcount; /* number of promiscuous listeners */ void *if_bridge; /* bridge glue */ void *if_lagg; /* lagg glue */ void *if_pf_kif; /* pf glue */ struct carp_if *if_carp; /* carp interface structure */ struct label *if_label; /* interface MAC label */ struct netmap_adapter *if_netmap; /* netmap(4) softc */ /* Various procedures of the layer2 encapsulation and drivers. */ int (*if_output) /* output routine (enqueue) */ (struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); void (*if_input) /* input routine (from h/w driver) */ (struct ifnet *, struct mbuf *); if_start_fn_t if_start; /* initiate output routine */ if_ioctl_fn_t if_ioctl; /* ioctl routine */ if_init_fn_t if_init; /* Init routine */ int (*if_resolvemulti) /* validate/resolve multicast */ (struct ifnet *, struct sockaddr **, struct sockaddr *); if_qflush_fn_t if_qflush; /* flush any queue */ if_transmit_fn_t if_transmit; /* initiate output routine */ void (*if_reassign) /* reassign to vnet routine */ (struct ifnet *, struct vnet *, char *); if_get_counter_t if_get_counter; /* get counter values */ /* Stuff that's only temporary and doesn't belong here. */ u_int if_hw_tsomax; /* tso burst length limit, the minimum * is (IP_MAXPACKET / 8). * XXXAO: Have to find a better place * for it eventually. */ /* * Old, racy and expensive statistics, should not be used in * new drivers. */ uint64_t if_ipackets; /* packets received on interface */ uint64_t if_ierrors; /* input errors on interface */ uint64_t if_opackets; /* packets sent on interface */ uint64_t if_oerrors; /* output errors on interface */ uint64_t if_collisions; /* collisions on csma interfaces */ uint64_t if_ibytes; /* total number of octets received */ uint64_t if_obytes; /* total number of octets sent */ uint64_t if_imcasts; /* packets received via multicast */ uint64_t if_omcasts; /* packets sent via multicast */ uint64_t if_iqdrops; /* dropped on input */ uint64_t if_oqdrops; /* dropped on output */ uint64_t if_noproto; /* destined for unsupported protocol */ /* * Spare fields to be added before branching a stable branch, so * that structure can be enhanced without changing the kernel * binary interface. */ }; #include /* XXXAO: temporary unconditional include */ /* for compatibility with other BSDs */ #define if_addrlist if_addrhead #define if_list if_link #define if_name(ifp) ((ifp)->if_xname) /* * Locks for address lists on the network interface. */ #define IF_ADDR_LOCK_INIT(if) rw_init(&(if)->if_addr_lock, "if_addr_lock") #define IF_ADDR_LOCK_DESTROY(if) rw_destroy(&(if)->if_addr_lock) #define IF_ADDR_WLOCK(if) rw_wlock(&(if)->if_addr_lock) #define IF_ADDR_WUNLOCK(if) rw_wunlock(&(if)->if_addr_lock) #define IF_ADDR_RLOCK(if) rw_rlock(&(if)->if_addr_lock) #define IF_ADDR_RUNLOCK(if) rw_runlock(&(if)->if_addr_lock) #define IF_ADDR_LOCK_ASSERT(if) rw_assert(&(if)->if_addr_lock, RA_LOCKED) #define IF_ADDR_WLOCK_ASSERT(if) rw_assert(&(if)->if_addr_lock, RA_WLOCKED) /* * Function variations on locking macros intended to be used by loadable * kernel modules in order to divorce them from the internals of address list * locking. */ void if_addr_rlock(struct ifnet *ifp); /* if_addrhead */ void if_addr_runlock(struct ifnet *ifp); /* if_addrhead */ void if_maddr_rlock(if_t ifp); /* if_multiaddrs */ void if_maddr_runlock(if_t ifp); /* if_multiaddrs */ #ifdef _KERNEL #ifdef _SYS_EVENTHANDLER_H_ /* interface link layer address change event */ typedef void (*iflladdr_event_handler_t)(void *, struct ifnet *); EVENTHANDLER_DECLARE(iflladdr_event, iflladdr_event_handler_t); /* interface address change event */ typedef void (*ifaddr_event_handler_t)(void *, struct ifnet *); EVENTHANDLER_DECLARE(ifaddr_event, ifaddr_event_handler_t); /* new interface arrival event */ typedef void (*ifnet_arrival_event_handler_t)(void *, struct ifnet *); EVENTHANDLER_DECLARE(ifnet_arrival_event, ifnet_arrival_event_handler_t); /* interface departure event */ typedef void (*ifnet_departure_event_handler_t)(void *, struct ifnet *); EVENTHANDLER_DECLARE(ifnet_departure_event, ifnet_departure_event_handler_t); /* Interface link state change event */ typedef void (*ifnet_link_event_handler_t)(void *, struct ifnet *, int); EVENTHANDLER_DECLARE(ifnet_link_event, ifnet_link_event_handler_t); #endif /* _SYS_EVENTHANDLER_H_ */ /* * interface groups */ struct ifg_group { char ifg_group[IFNAMSIZ]; u_int ifg_refcnt; void *ifg_pf_kif; TAILQ_HEAD(, ifg_member) ifg_members; TAILQ_ENTRY(ifg_group) ifg_next; }; struct ifg_member { TAILQ_ENTRY(ifg_member) ifgm_next; struct ifnet *ifgm_ifp; }; struct ifg_list { struct ifg_group *ifgl_group; TAILQ_ENTRY(ifg_list) ifgl_next; }; #ifdef _SYS_EVENTHANDLER_H_ /* group attach event */ typedef void (*group_attach_event_handler_t)(void *, struct ifg_group *); EVENTHANDLER_DECLARE(group_attach_event, group_attach_event_handler_t); /* group detach event */ typedef void (*group_detach_event_handler_t)(void *, struct ifg_group *); EVENTHANDLER_DECLARE(group_detach_event, group_detach_event_handler_t); /* group change event */ typedef void (*group_change_event_handler_t)(void *, const char *); EVENTHANDLER_DECLARE(group_change_event, group_change_event_handler_t); #endif /* _SYS_EVENTHANDLER_H_ */ #define IF_AFDATA_LOCK_INIT(ifp) \ rw_init(&(ifp)->if_afdata_lock, "if_afdata") #define IF_AFDATA_WLOCK(ifp) rw_wlock(&(ifp)->if_afdata_lock) #define IF_AFDATA_RLOCK(ifp) rw_rlock(&(ifp)->if_afdata_lock) #define IF_AFDATA_WUNLOCK(ifp) rw_wunlock(&(ifp)->if_afdata_lock) #define IF_AFDATA_RUNLOCK(ifp) rw_runlock(&(ifp)->if_afdata_lock) #define IF_AFDATA_LOCK(ifp) IF_AFDATA_WLOCK(ifp) #define IF_AFDATA_UNLOCK(ifp) IF_AFDATA_WUNLOCK(ifp) #define IF_AFDATA_TRYLOCK(ifp) rw_try_wlock(&(ifp)->if_afdata_lock) #define IF_AFDATA_DESTROY(ifp) rw_destroy(&(ifp)->if_afdata_lock) #define IF_AFDATA_LOCK_ASSERT(ifp) rw_assert(&(ifp)->if_afdata_lock, RA_LOCKED) #define IF_AFDATA_RLOCK_ASSERT(ifp) rw_assert(&(ifp)->if_afdata_lock, RA_RLOCKED) #define IF_AFDATA_WLOCK_ASSERT(ifp) rw_assert(&(ifp)->if_afdata_lock, RA_WLOCKED) #define IF_AFDATA_UNLOCK_ASSERT(ifp) rw_assert(&(ifp)->if_afdata_lock, RA_UNLOCKED) /* * 72 was chosen below because it is the size of a TCP/IP * header (40) + the minimum mss (32). */ #define IF_MINMTU 72 #define IF_MAXMTU 65535 #define TOEDEV(ifp) ((ifp)->if_llsoftc) #endif /* _KERNEL */ /* * The ifaddr structure contains information about one address * of an interface. They are maintained by the different address families, * are allocated and attached when an address is set, and are linked * together so all addresses for an interface can be located. * * NOTE: a 'struct ifaddr' is always at the beginning of a larger * chunk of malloc'ed memory, where we store the three addresses * (ifa_addr, ifa_dstaddr and ifa_netmask) referenced here. */ #if defined(_KERNEL) || defined(_WANT_IFADDR) struct ifaddr { struct sockaddr *ifa_addr; /* address of interface */ struct sockaddr *ifa_dstaddr; /* other end of p-to-p link */ #define ifa_broadaddr ifa_dstaddr /* broadcast address interface */ struct sockaddr *ifa_netmask; /* used to determine subnet */ struct ifnet *ifa_ifp; /* back-pointer to interface */ struct carp_softc *ifa_carp; /* pointer to CARP data */ TAILQ_ENTRY(ifaddr) ifa_link; /* queue macro glue */ void (*ifa_rtrequest) /* check or clean routes (+ or -)'d */ (int, struct rtentry *, struct rt_addrinfo *); u_short ifa_flags; /* mostly rt_flags for cloning */ u_int ifa_refcnt; /* references to this structure */ counter_u64_t ifa_ipackets; counter_u64_t ifa_opackets; counter_u64_t ifa_ibytes; counter_u64_t ifa_obytes; }; #endif #ifdef _KERNEL #define IFA_ROUTE RTF_UP /* route installed */ #define IFA_RTSELF RTF_HOST /* loopback route to self installed */ /* For compatibility with other BSDs. SCTP uses it. */ #define ifa_list ifa_link struct ifaddr * ifa_alloc(size_t size, int flags); void ifa_free(struct ifaddr *ifa); void ifa_ref(struct ifaddr *ifa); #endif /* _KERNEL */ /* * Multicast address structure. This is analogous to the ifaddr * structure except that it keeps track of multicast addresses. */ struct ifmultiaddr { TAILQ_ENTRY(ifmultiaddr) ifma_link; /* queue macro glue */ struct sockaddr *ifma_addr; /* address this membership is for */ struct sockaddr *ifma_lladdr; /* link-layer translation, if any */ struct ifnet *ifma_ifp; /* back-pointer to interface */ u_int ifma_refcount; /* reference count */ void *ifma_protospec; /* protocol-specific state, if any */ struct ifmultiaddr *ifma_llifma; /* pointer to ifma for ifma_lladdr */ }; #ifdef _KERNEL extern struct rwlock ifnet_rwlock; extern struct sx ifnet_sxlock; #define IFNET_LOCK_INIT() do { \ rw_init_flags(&ifnet_rwlock, "ifnet_rw", RW_RECURSE); \ sx_init_flags(&ifnet_sxlock, "ifnet_sx", SX_RECURSE); \ } while(0) #define IFNET_WLOCK() do { \ sx_xlock(&ifnet_sxlock); \ rw_wlock(&ifnet_rwlock); \ } while (0) #define IFNET_WUNLOCK() do { \ rw_wunlock(&ifnet_rwlock); \ sx_xunlock(&ifnet_sxlock); \ } while (0) /* * To assert the ifnet lock, you must know not only whether it's for read or * write, but also whether it was acquired with sleep support or not. */ #define IFNET_RLOCK_ASSERT() sx_assert(&ifnet_sxlock, SA_SLOCKED) #define IFNET_RLOCK_NOSLEEP_ASSERT() rw_assert(&ifnet_rwlock, RA_RLOCKED) #define IFNET_WLOCK_ASSERT() do { \ sx_assert(&ifnet_sxlock, SA_XLOCKED); \ rw_assert(&ifnet_rwlock, RA_WLOCKED); \ } while (0) #define IFNET_RLOCK() sx_slock(&ifnet_sxlock) #define IFNET_RLOCK_NOSLEEP() rw_rlock(&ifnet_rwlock) #define IFNET_RUNLOCK() sx_sunlock(&ifnet_sxlock) #define IFNET_RUNLOCK_NOSLEEP() rw_runlock(&ifnet_rwlock) /* * Look up an ifnet given its index; the _ref variant also acquires a * reference that must be freed using if_rele(). It is almost always a bug * to call ifnet_byindex() instead if ifnet_byindex_ref(). */ struct ifnet *ifnet_byindex(u_short idx); struct ifnet *ifnet_byindex_locked(u_short idx); struct ifnet *ifnet_byindex_ref(u_short idx); /* * Given the index, ifaddr_byindex() returns the one and only * link-level ifaddr for the interface. You are not supposed to use * it to traverse the list of addresses associated to the interface. */ struct ifaddr *ifaddr_byindex(u_short idx); VNET_DECLARE(struct ifnethead, ifnet); VNET_DECLARE(struct ifgrouphead, ifg_head); VNET_DECLARE(int, if_index); VNET_DECLARE(struct ifnet *, loif); /* first loopback interface */ #define V_ifnet VNET(ifnet) #define V_ifg_head VNET(ifg_head) #define V_if_index VNET(if_index) #define V_loif VNET(loif) int if_addgroup(struct ifnet *, const char *); int if_delgroup(struct ifnet *, const char *); int if_addmulti(struct ifnet *, struct sockaddr *, struct ifmultiaddr **); int if_allmulti(struct ifnet *, int); struct ifnet* if_alloc(u_char); void if_attach(struct ifnet *); void if_dead(struct ifnet *); int if_delmulti(struct ifnet *, struct sockaddr *); void if_delmulti_ifma(struct ifmultiaddr *); void if_detach(struct ifnet *); void if_vmove(struct ifnet *, struct vnet *); void if_purgeaddrs(struct ifnet *); void if_delallmulti(struct ifnet *); void if_down(struct ifnet *); struct ifmultiaddr * if_findmulti(struct ifnet *, struct sockaddr *); void if_free(struct ifnet *); void if_initname(struct ifnet *, const char *, int); void if_link_state_change(struct ifnet *, int); int if_printf(struct ifnet *, const char *, ...) __printflike(2, 3); void if_ref(struct ifnet *); void if_rele(struct ifnet *); int if_setlladdr(struct ifnet *, const u_char *, int); void if_up(struct ifnet *); int ifioctl(struct socket *, u_long, caddr_t, struct thread *); int ifpromisc(struct ifnet *, int); struct ifnet *ifunit(const char *); struct ifnet *ifunit_ref(const char *); int ifa_add_loopback_route(struct ifaddr *, struct sockaddr *); int ifa_del_loopback_route(struct ifaddr *, struct sockaddr *); int ifa_switch_loopback_route(struct ifaddr *, struct sockaddr *, int fib); struct ifaddr *ifa_ifwithaddr(struct sockaddr *); int ifa_ifwithaddr_check(struct sockaddr *); -struct ifaddr *ifa_ifwithbroadaddr(struct sockaddr *); -struct ifaddr *ifa_ifwithdstaddr(struct sockaddr *); -struct ifaddr *ifa_ifwithdstaddr_fib(struct sockaddr *, int); -struct ifaddr *ifa_ifwithnet(struct sockaddr *, int); -struct ifaddr *ifa_ifwithnet_fib(struct sockaddr *, int, int); -struct ifaddr *ifa_ifwithroute(int, struct sockaddr *, struct sockaddr *); -struct ifaddr *ifa_ifwithroute_fib(int, struct sockaddr *, struct sockaddr *, u_int); +struct ifaddr *ifa_ifwithbroadaddr(struct sockaddr *, int); +struct ifaddr *ifa_ifwithdstaddr(struct sockaddr *, int); +struct ifaddr *ifa_ifwithnet(struct sockaddr *, int, int); +struct ifaddr *ifa_ifwithroute(int, struct sockaddr *, struct sockaddr *, u_int); struct ifaddr *ifaof_ifpforaddr(struct sockaddr *, struct ifnet *); int ifa_preferred(struct ifaddr *, struct ifaddr *); int if_simloop(struct ifnet *ifp, struct mbuf *m, int af, int hlen); typedef void *if_com_alloc_t(u_char type, struct ifnet *ifp); typedef void if_com_free_t(void *com, u_char type); void if_register_com_alloc(u_char type, if_com_alloc_t *a, if_com_free_t *f); void if_deregister_com_alloc(u_char type); void if_data_copy(struct ifnet *, struct if_data *); uint64_t if_get_counter_compat(struct ifnet *, ifnet_counter); #define IF_LLADDR(ifp) \ LLADDR((struct sockaddr_dl *)((ifp)->if_addr->ifa_addr)) uint64_t if_setbaudrate(if_t ifp, uint64_t baudrate); uint64_t if_getbaudrate(if_t ifp); int if_setcapabilities(if_t ifp, int capabilities); int if_setcapabilitiesbit(if_t ifp, int setbit, int clearbit); int if_getcapabilities(if_t ifp); int if_togglecapenable(if_t ifp, int togglecap); int if_setcapenable(if_t ifp, int capenable); int if_setcapenablebit(if_t ifp, int setcap, int clearcap); int if_getcapenable(if_t ifp); const char *if_getdname(if_t ifp); int if_setdev(if_t ifp, void *dev); int if_setdrvflagbits(if_t ifp, int if_setflags, int clear_flags); int if_getdrvflags(if_t ifp); int if_setdrvflags(if_t ifp, int flags); int if_clearhwassist(if_t ifp); int if_sethwassistbits(if_t ifp, int toset, int toclear); int if_sethwassist(if_t ifp, int hwassist_bit); int if_gethwassist(if_t ifp); int if_setsoftc(if_t ifp, void *softc); void *if_getsoftc(if_t ifp); int if_setflags(if_t ifp, int flags); int if_setmtu(if_t ifp, int mtu); int if_getmtu(if_t ifp); int if_setflagbits(if_t ifp, int set, int clear); int if_getflags(if_t ifp); int if_sendq_empty(if_t ifp); int if_setsendqready(if_t ifp); int if_setsendqlen(if_t ifp, int tx_desc_count); int if_input(if_t ifp, struct mbuf* sendmp); int if_sendq_prepend(if_t ifp, struct mbuf *m); struct mbuf *if_dequeue(if_t ifp); int if_setifheaderlen(if_t ifp, int len); void if_setrcvif(struct mbuf *m, if_t ifp); void if_setvtag(struct mbuf *m, u_int16_t tag); u_int16_t if_getvtag(struct mbuf *m); int if_vlantrunkinuse(if_t ifp); caddr_t if_getlladdr(if_t ifp); void *if_gethandle(u_char); void if_bpfmtap(if_t ifp, struct mbuf *m); void if_etherbpfmtap(if_t ifp, struct mbuf *m); void if_vlancap(if_t ifp); int if_setupmultiaddr(if_t ifp, void *mta, int *cnt, int max); int if_multiaddr_array(if_t ifp, void *mta, int *cnt, int max); int if_multiaddr_count(if_t ifp, int max); int if_getamcount(if_t ifp); struct ifaddr * if_getifaddr(if_t ifp); /* Statistics */ int if_incipackets(if_t ifp, int pkt); int if_incopackets(if_t ifp, int pkts); int if_incierrors(if_t ifp, int ierrors); int if_incoerrors(if_t ifp, int oerrors); int if_inciqdrops(if_t ifp, int val); int if_setierrors(if_t ifp, int ierrors); int if_setoerrors(if_t ifp, int oerrors); int if_setcollisions(if_t ifp, int collisions); int if_inccollisions(if_t ifp, int collisions); int if_incobytes(if_t ifp, int bytes); int if_getiqdrops(if_t ifp); int if_incimcasts(if_t ifp, int imcasts); int if_incomcasts(if_t ifp, int imcasts); int if_setipackets(if_t ifp, int pkts); int if_setopackets(if_t ifp, int pkts); int if_setibytes(if_t ifp, int bytes); int if_setobytes(if_t ifp, int bytes); int if_setimcasts(if_t ifp, int pkts); /* Functions */ void if_setinitfn(if_t ifp, void (*)(void *)); void if_setioctlfn(if_t ifp, int (*)(if_t, u_long, caddr_t)); void if_setstartfn(if_t ifp, void (*)(if_t)); void if_settransmitfn(if_t ifp, if_transmit_fn_t); void if_setqflushfn(if_t ifp, if_qflush_fn_t); /* Revisit the below. These are inline functions originally */ int drbr_inuse_drv(if_t ifp, struct buf_ring *br); struct mbuf* drbr_dequeue_drv(if_t ifp, struct buf_ring *br); int drbr_needs_enqueue_drv(if_t ifp, struct buf_ring *br); int drbr_enqueue_drv(if_t ifp, struct buf_ring *br, struct mbuf *m); #endif /* _KERNEL */ #endif /* !_NET_IF_VAR_H_ */ Index: user/ae/inet6/sys/net/route.c =================================================================== --- user/ae/inet6/sys/net/route.c (revision 271452) +++ user/ae/inet6/sys/net/route.c (revision 271453) @@ -1,1934 +1,1924 @@ /*- * Copyright (c) 1980, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)route.c 8.3.1.1 (Berkeley) 2/23/95 * $FreeBSD$ */ /************************************************************************ * Note: In this file a 'fib' is a "forwarding information base" * * Which is the new name for an in kernel routing (next hop) table. * ***********************************************************************/ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_route.h" #include "opt_sctp.h" #include "opt_mrouting.h" #include "opt_mpath.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RADIX_MPATH #include #endif #include #include #include #define RT_MAXFIBS UINT16_MAX /* Kernel config default option. */ #ifdef ROUTETABLES #if ROUTETABLES <= 0 #error "ROUTETABLES defined too low" #endif #if ROUTETABLES > RT_MAXFIBS #error "ROUTETABLES defined too big" #endif #define RT_NUMFIBS ROUTETABLES #endif /* ROUTETABLES */ /* Initialize to default if not otherwise set. */ #ifndef RT_NUMFIBS #define RT_NUMFIBS 1 #endif #if defined(INET) || defined(INET6) #ifdef SCTP extern void sctp_addr_change(struct ifaddr *ifa, int cmd); #endif /* SCTP */ #endif /* This is read-only.. */ u_int rt_numfibs = RT_NUMFIBS; SYSCTL_UINT(_net, OID_AUTO, fibs, CTLFLAG_RDTUN, &rt_numfibs, 0, ""); /* * By default add routes to all fibs for new interfaces. * Once this is set to 0 then only allocate routes on interface * changes for the FIB of the caller when adding a new set of addresses * to an interface. XXX this is a shotgun aproach to a problem that needs * a more fine grained solution.. that will come. * XXX also has the problems getting the FIB from curthread which will not * always work given the fib can be overridden and prefixes can be added * from the network stack context. */ u_int rt_add_addr_allfibs = 1; SYSCTL_UINT(_net, OID_AUTO, add_addr_allfibs, CTLFLAG_RWTUN, &rt_add_addr_allfibs, 0, ""); VNET_DEFINE(struct rtstat, rtstat); #define V_rtstat VNET(rtstat) VNET_DEFINE(struct radix_node_head *, rt_tables); #define V_rt_tables VNET(rt_tables) VNET_DEFINE(int, rttrash); /* routes not in table but not freed */ #define V_rttrash VNET(rttrash) /* * Convert a 'struct radix_node *' to a 'struct rtentry *'. * The operation can be done safely (in this code) because a * 'struct rtentry' starts with two 'struct radix_node''s, the first * one representing leaf nodes in the routing tree, which is * what the code in radix.c passes us as a 'struct radix_node'. * * But because there are a lot of assumptions in this conversion, * do not cast explicitly, but always use the macro below. */ #define RNTORT(p) ((struct rtentry *)(p)) static VNET_DEFINE(uma_zone_t, rtzone); /* Routing table UMA zone. */ #define V_rtzone VNET(rtzone) static int rtrequest1_fib_change(struct radix_node_head *, struct rt_addrinfo *, struct rtentry **, u_int); static void rt_setmetrics(const struct rt_addrinfo *, struct rtentry *); /* * handler for net.my_fibnum */ static int sysctl_my_fibnum(SYSCTL_HANDLER_ARGS) { int fibnum; int error; fibnum = curthread->td_proc->p_fibnum; error = sysctl_handle_int(oidp, &fibnum, 0, req); return (error); } SYSCTL_PROC(_net, OID_AUTO, my_fibnum, CTLTYPE_INT|CTLFLAG_RD, NULL, 0, &sysctl_my_fibnum, "I", "default FIB of caller"); static __inline struct radix_node_head ** rt_tables_get_rnh_ptr(int table, int fam) { struct radix_node_head **rnh; KASSERT(table >= 0 && table < rt_numfibs, ("%s: table out of bounds.", __func__)); KASSERT(fam >= 0 && fam < (AF_MAX+1), ("%s: fam out of bounds.", __func__)); /* rnh is [fib=0][af=0]. */ rnh = (struct radix_node_head **)V_rt_tables; /* Get the offset to the requested table and fam. */ rnh += table * (AF_MAX+1) + fam; return (rnh); } struct radix_node_head * rt_tables_get_rnh(int table, int fam) { return (*rt_tables_get_rnh_ptr(table, fam)); } /* * route initialization must occur before ip6_init2(), which happenas at * SI_ORDER_MIDDLE. */ static void route_init(void) { /* whack the tunable ints into line. */ if (rt_numfibs > RT_MAXFIBS) rt_numfibs = RT_MAXFIBS; if (rt_numfibs == 0) rt_numfibs = 1; } SYSINIT(route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, route_init, 0); static int rtentry_zinit(void *mem, int size, int how) { struct rtentry *rt = mem; rt->rt_pksent = counter_u64_alloc(how); if (rt->rt_pksent == NULL) return (ENOMEM); RT_LOCK_INIT(rt); return (0); } static void rtentry_zfini(void *mem, int size) { struct rtentry *rt = mem; RT_LOCK_DESTROY(rt); counter_u64_free(rt->rt_pksent); } static int rtentry_ctor(void *mem, int size, void *arg, int how) { struct rtentry *rt = mem; bzero(rt, offsetof(struct rtentry, rt_endzero)); counter_u64_zero(rt->rt_pksent); return (0); } static void rtentry_dtor(void *mem, int size, void *arg) { struct rtentry *rt = mem; RT_UNLOCK_COND(rt); } static void vnet_route_init(const void *unused __unused) { struct domain *dom; struct radix_node_head **rnh; int table; int fam; V_rt_tables = malloc(rt_numfibs * (AF_MAX+1) * sizeof(struct radix_node_head *), M_RTABLE, M_WAITOK|M_ZERO); V_rtzone = uma_zcreate("rtentry", sizeof(struct rtentry), rtentry_ctor, rtentry_dtor, rtentry_zinit, rtentry_zfini, UMA_ALIGN_PTR, 0); for (dom = domains; dom; dom = dom->dom_next) { if (dom->dom_rtattach == NULL) continue; for (table = 0; table < rt_numfibs; table++) { fam = dom->dom_family; if (table != 0 && fam != AF_INET6 && fam != AF_INET) break; /* * XXX MRT rtattach will be also called from * vfs_export.c but the offset will be 0 (only for * AF_INET and AF_INET6 which don't need it anyhow). */ rnh = rt_tables_get_rnh_ptr(table, fam); if (rnh == NULL) panic("%s: rnh NULL", __func__); dom->dom_rtattach((void **)rnh, dom->dom_rtoffset); } } } VNET_SYSINIT(vnet_route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, vnet_route_init, 0); #ifdef VIMAGE static void vnet_route_uninit(const void *unused __unused) { int table; int fam; struct domain *dom; struct radix_node_head **rnh; for (dom = domains; dom; dom = dom->dom_next) { if (dom->dom_rtdetach == NULL) continue; for (table = 0; table < rt_numfibs; table++) { fam = dom->dom_family; if (table != 0 && fam != AF_INET6 && fam != AF_INET) break; rnh = rt_tables_get_rnh_ptr(table, fam); if (rnh == NULL) panic("%s: rnh NULL", __func__); dom->dom_rtdetach((void **)rnh, dom->dom_rtoffset); } } free(V_rt_tables, M_RTABLE); uma_zdestroy(V_rtzone); } VNET_SYSUNINIT(vnet_route_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, vnet_route_uninit, 0); #endif #ifndef _SYS_SYSPROTO_H_ struct setfib_args { int fibnum; }; #endif int sys_setfib(struct thread *td, struct setfib_args *uap) { if (uap->fibnum < 0 || uap->fibnum >= rt_numfibs) return EINVAL; td->td_proc->p_fibnum = uap->fibnum; return (0); } /* * Packet routing routines. */ void rtalloc(struct route *ro) { rtalloc_ign_fib(ro, 0UL, RT_DEFAULT_FIB); } void rtalloc_fib(struct route *ro, u_int fibnum) { rtalloc_ign_fib(ro, 0UL, fibnum); } void rtalloc_ign(struct route *ro, u_long ignore) { struct rtentry *rt; if ((rt = ro->ro_rt) != NULL) { if (rt->rt_ifp != NULL && rt->rt_flags & RTF_UP) return; RTFREE(rt); ro->ro_rt = NULL; } ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, ignore, RT_DEFAULT_FIB); if (ro->ro_rt) RT_UNLOCK(ro->ro_rt); } void rtalloc_ign_fib(struct route *ro, u_long ignore, u_int fibnum) { struct rtentry *rt; if ((rt = ro->ro_rt) != NULL) { if (rt->rt_ifp != NULL && rt->rt_flags & RTF_UP) return; RTFREE(rt); ro->ro_rt = NULL; } ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, ignore, fibnum); if (ro->ro_rt) RT_UNLOCK(ro->ro_rt); } /* * Look up the route that matches the address given * Or, at least try.. Create a cloned route if needed. * * The returned route, if any, is locked. */ struct rtentry * rtalloc1(struct sockaddr *dst, int report, u_long ignflags) { return (rtalloc1_fib(dst, report, ignflags, RT_DEFAULT_FIB)); } struct rtentry * rtalloc1_fib(struct sockaddr *dst, int report, u_long ignflags, u_int fibnum) { struct radix_node_head *rnh; struct radix_node *rn; struct rtentry *newrt; struct rt_addrinfo info; int err = 0, msgtype = RTM_MISS; int needlock; KASSERT((fibnum < rt_numfibs), ("rtalloc1_fib: bad fibnum")); rnh = rt_tables_get_rnh(fibnum, dst->sa_family); newrt = NULL; if (rnh == NULL) goto miss; /* * Look up the address in the table for that Address Family */ needlock = !(ignflags & RTF_RNH_LOCKED); if (needlock) RADIX_NODE_HEAD_RLOCK(rnh); #ifdef INVARIANTS else RADIX_NODE_HEAD_LOCK_ASSERT(rnh); #endif rn = rnh->rnh_matchaddr(dst, rnh); if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) { newrt = RNTORT(rn); RT_LOCK(newrt); RT_ADDREF(newrt); if (needlock) RADIX_NODE_HEAD_RUNLOCK(rnh); goto done; } else if (needlock) RADIX_NODE_HEAD_RUNLOCK(rnh); /* * Either we hit the root or couldn't find any match, * Which basically means * "caint get there frm here" */ miss: V_rtstat.rts_unreach++; if (report) { /* * If required, report the failure to the supervising * Authorities. * For a delete, this is not an error. (report == 0) */ bzero(&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; rt_missmsg_fib(msgtype, &info, 0, err, fibnum); } done: if (newrt) RT_LOCK_ASSERT(newrt); return (newrt); } /* * Remove a reference count from an rtentry. * If the count gets low enough, take it out of the routing table */ void rtfree(struct rtentry *rt) { struct radix_node_head *rnh; KASSERT(rt != NULL,("%s: NULL rt", __func__)); rnh = rt_tables_get_rnh(rt->rt_fibnum, rt_key(rt)->sa_family); KASSERT(rnh != NULL,("%s: NULL rnh", __func__)); RT_LOCK_ASSERT(rt); /* * The callers should use RTFREE_LOCKED() or RTFREE(), so * we should come here exactly with the last reference. */ RT_REMREF(rt); if (rt->rt_refcnt > 0) { log(LOG_DEBUG, "%s: %p has %d refs\n", __func__, rt, rt->rt_refcnt); goto done; } /* * On last reference give the "close method" a chance * to cleanup private state. This also permits (for * IPv4 and IPv6) a chance to decide if the routing table * entry should be purged immediately or at a later time. * When an immediate purge is to happen the close routine * typically calls rtexpunge which clears the RTF_UP flag * on the entry so that the code below reclaims the storage. */ if (rt->rt_refcnt == 0 && rnh->rnh_close) rnh->rnh_close((struct radix_node *)rt, rnh); /* * If we are no longer "up" (and ref == 0) * then we can free the resources associated * with the route. */ if ((rt->rt_flags & RTF_UP) == 0) { if (rt->rt_nodes->rn_flags & (RNF_ACTIVE | RNF_ROOT)) panic("rtfree 2"); /* * the rtentry must have been removed from the routing table * so it is represented in rttrash.. remove that now. */ V_rttrash--; #ifdef DIAGNOSTIC if (rt->rt_refcnt < 0) { printf("rtfree: %p not freed (neg refs)\n", rt); goto done; } #endif /* * release references on items we hold them on.. * e.g other routes and ifaddrs. */ if (rt->rt_ifa) ifa_free(rt->rt_ifa); /* * The key is separatly alloc'd so free it (see rt_setgate()). * This also frees the gateway, as they are always malloc'd * together. */ Free(rt_key(rt)); /* * and the rtentry itself of course */ uma_zfree(V_rtzone, rt); return; } done: RT_UNLOCK(rt); } /* * Force a routing table entry to the specified * destination to go through the given gateway. * Normally called as a result of a routing redirect * message from the network layer. */ void rtredirect(struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *netmask, int flags, struct sockaddr *src) { rtredirect_fib(dst, gateway, netmask, flags, src, RT_DEFAULT_FIB); } void rtredirect_fib(struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *netmask, int flags, struct sockaddr *src, u_int fibnum) { struct rtentry *rt, *rt0 = NULL; int error = 0; short *stat = NULL; struct rt_addrinfo info; struct ifaddr *ifa; struct radix_node_head *rnh; ifa = NULL; rnh = rt_tables_get_rnh(fibnum, dst->sa_family); if (rnh == NULL) { error = EAFNOSUPPORT; goto out; } /* verify the gateway is directly reachable */ - if ((ifa = ifa_ifwithnet_fib(gateway, 0, fibnum)) == NULL) { + if ((ifa = ifa_ifwithnet(gateway, 0, fibnum)) == NULL) { error = ENETUNREACH; goto out; } rt = rtalloc1_fib(dst, 0, 0UL, fibnum); /* NB: rt is locked */ /* * If the redirect isn't from our current router for this dst, * it's either old or wrong. If it redirects us to ourselves, * we have a routing loop, perhaps as a result of an interface * going down recently. */ if (!(flags & RTF_DONE) && rt && (!sa_equal(src, rt->rt_gateway) || rt->rt_ifa != ifa)) error = EINVAL; else if (ifa_ifwithaddr_check(gateway)) error = EHOSTUNREACH; if (error) goto done; /* * Create a new entry if we just got back a wildcard entry * or the lookup failed. This is necessary for hosts * which use routing redirects generated by smart gateways * to dynamically build the routing tables. */ if (rt == NULL || (rt_mask(rt) && rt_mask(rt)->sa_len < 2)) goto create; /* * Don't listen to the redirect if it's * for a route to an interface. */ if (rt->rt_flags & RTF_GATEWAY) { if (((rt->rt_flags & RTF_HOST) == 0) && (flags & RTF_HOST)) { /* * Changing from route to net => route to host. * Create new route, rather than smashing route to net. */ create: rt0 = rt; rt = NULL; flags |= RTF_GATEWAY | RTF_DYNAMIC; bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_GATEWAY] = gateway; info.rti_info[RTAX_NETMASK] = netmask; info.rti_ifa = ifa; info.rti_flags = flags; if (rt0 != NULL) RT_UNLOCK(rt0); /* drop lock to avoid LOR with RNH */ error = rtrequest1_fib(RTM_ADD, &info, &rt, fibnum); if (rt != NULL) { RT_LOCK(rt); if (rt0 != NULL) EVENTHANDLER_INVOKE(route_redirect_event, rt0, rt, dst); flags = rt->rt_flags; } if (rt0 != NULL) RTFREE(rt0); stat = &V_rtstat.rts_dynamic; } else { struct rtentry *gwrt; /* * Smash the current notion of the gateway to * this destination. Should check about netmask!!! */ rt->rt_flags |= RTF_MODIFIED; flags |= RTF_MODIFIED; stat = &V_rtstat.rts_newgateway; /* * add the key and gateway (in one malloc'd chunk). */ RT_UNLOCK(rt); RADIX_NODE_HEAD_LOCK(rnh); RT_LOCK(rt); rt_setgate(rt, rt_key(rt), gateway); gwrt = rtalloc1(gateway, 1, RTF_RNH_LOCKED); RADIX_NODE_HEAD_UNLOCK(rnh); EVENTHANDLER_INVOKE(route_redirect_event, rt, gwrt, dst); RTFREE_LOCKED(gwrt); } } else error = EHOSTUNREACH; done: if (rt) RTFREE_LOCKED(rt); out: if (error) V_rtstat.rts_badredirect++; else if (stat != NULL) (*stat)++; bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_GATEWAY] = gateway; info.rti_info[RTAX_NETMASK] = netmask; info.rti_info[RTAX_AUTHOR] = src; rt_missmsg_fib(RTM_REDIRECT, &info, flags, error, fibnum); if (ifa != NULL) ifa_free(ifa); } int rtioctl(u_long req, caddr_t data) { return (rtioctl_fib(req, data, RT_DEFAULT_FIB)); } /* * Routing table ioctl interface. */ int rtioctl_fib(u_long req, caddr_t data, u_int fibnum) { /* * If more ioctl commands are added here, make sure the proper * super-user checks are being performed because it is possible for * prison-root to make it this far if raw sockets have been enabled * in jails. */ #ifdef INET /* Multicast goop, grrr... */ return mrt_ioctl ? mrt_ioctl(req, data, fibnum) : EOPNOTSUPP; #else /* INET */ return ENXIO; #endif /* INET */ } -/* - * For both ifa_ifwithroute() routines, 'ifa' is returned referenced. - */ struct ifaddr * -ifa_ifwithroute(int flags, struct sockaddr *dst, struct sockaddr *gateway) -{ - - return (ifa_ifwithroute_fib(flags, dst, gateway, RT_DEFAULT_FIB)); -} - -struct ifaddr * -ifa_ifwithroute_fib(int flags, struct sockaddr *dst, struct sockaddr *gateway, +ifa_ifwithroute(int flags, struct sockaddr *dst, struct sockaddr *gateway, u_int fibnum) { struct ifaddr *ifa; int not_found = 0; if ((flags & RTF_GATEWAY) == 0) { /* * If we are adding a route to an interface, * and the interface is a pt to pt link * we should search for the destination * as our clue to the interface. Otherwise * we can use the local address. */ ifa = NULL; if (flags & RTF_HOST) - ifa = ifa_ifwithdstaddr_fib(dst, fibnum); + ifa = ifa_ifwithdstaddr(dst, fibnum); if (ifa == NULL) ifa = ifa_ifwithaddr(gateway); } else { /* * If we are adding a route to a remote net * or host, the gateway may still be on the * other end of a pt to pt link. */ - ifa = ifa_ifwithdstaddr_fib(gateway, fibnum); + ifa = ifa_ifwithdstaddr(gateway, fibnum); } if (ifa == NULL) - ifa = ifa_ifwithnet_fib(gateway, 0, fibnum); + ifa = ifa_ifwithnet(gateway, 0, fibnum); if (ifa == NULL) { struct rtentry *rt = rtalloc1_fib(gateway, 0, RTF_RNH_LOCKED, fibnum); if (rt == NULL) return (NULL); /* * dismiss a gateway that is reachable only * through the default router */ switch (gateway->sa_family) { case AF_INET: if (satosin(rt_key(rt))->sin_addr.s_addr == INADDR_ANY) not_found = 1; break; case AF_INET6: if (IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(rt))->sin6_addr)) not_found = 1; break; default: break; } if (!not_found && rt->rt_ifa != NULL) { ifa = rt->rt_ifa; ifa_ref(ifa); } RT_REMREF(rt); RT_UNLOCK(rt); if (not_found || ifa == NULL) return (NULL); } if (ifa->ifa_addr->sa_family != dst->sa_family) { struct ifaddr *oifa = ifa; ifa = ifaof_ifpforaddr(dst, ifa->ifa_ifp); if (ifa == NULL) ifa = oifa; else ifa_free(oifa); } return (ifa); } /* * Do appropriate manipulations of a routing tree given * all the bits of info needed */ int rtrequest(int req, struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *netmask, int flags, struct rtentry **ret_nrt) { return (rtrequest_fib(req, dst, gateway, netmask, flags, ret_nrt, RT_DEFAULT_FIB)); } int rtrequest_fib(int req, struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *netmask, int flags, struct rtentry **ret_nrt, u_int fibnum) { struct rt_addrinfo info; if (dst->sa_len == 0) return(EINVAL); bzero((caddr_t)&info, sizeof(info)); info.rti_flags = flags; info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_GATEWAY] = gateway; info.rti_info[RTAX_NETMASK] = netmask; return rtrequest1_fib(req, &info, ret_nrt, fibnum); } /* * These (questionable) definitions of apparent local variables apply * to the next two functions. XXXXXX!!! */ #define dst info->rti_info[RTAX_DST] #define gateway info->rti_info[RTAX_GATEWAY] #define netmask info->rti_info[RTAX_NETMASK] #define ifaaddr info->rti_info[RTAX_IFA] #define ifpaddr info->rti_info[RTAX_IFP] #define flags info->rti_flags int rt_getifa(struct rt_addrinfo *info) { return (rt_getifa_fib(info, RT_DEFAULT_FIB)); } /* * Look up rt_addrinfo for a specific fib. Note that if rti_ifa is defined, * it will be referenced so the caller must free it. */ int rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum) { struct ifaddr *ifa; int error = 0; /* * ifp may be specified by sockaddr_dl * when protocol address is ambiguous. */ if (info->rti_ifp == NULL && ifpaddr != NULL && ifpaddr->sa_family == AF_LINK && - (ifa = ifa_ifwithnet_fib(ifpaddr, 0, fibnum)) != NULL) { + (ifa = ifa_ifwithnet(ifpaddr, 0, fibnum)) != NULL) { info->rti_ifp = ifa->ifa_ifp; ifa_free(ifa); } if (info->rti_ifa == NULL && ifaaddr != NULL) info->rti_ifa = ifa_ifwithaddr(ifaaddr); if (info->rti_ifa == NULL) { struct sockaddr *sa; sa = ifaaddr != NULL ? ifaaddr : (gateway != NULL ? gateway : dst); if (sa != NULL && info->rti_ifp != NULL) info->rti_ifa = ifaof_ifpforaddr(sa, info->rti_ifp); else if (dst != NULL && gateway != NULL) - info->rti_ifa = ifa_ifwithroute_fib(flags, dst, gateway, + info->rti_ifa = ifa_ifwithroute(flags, dst, gateway, fibnum); else if (sa != NULL) - info->rti_ifa = ifa_ifwithroute_fib(flags, sa, sa, + info->rti_ifa = ifa_ifwithroute(flags, sa, sa, fibnum); } if ((ifa = info->rti_ifa) != NULL) { if (info->rti_ifp == NULL) info->rti_ifp = ifa->ifa_ifp; } else error = ENETUNREACH; return (error); } /* * Expunges references to a route that's about to be reclaimed. * The route must be locked. */ int rt_expunge(struct radix_node_head *rnh, struct rtentry *rt) { #if !defined(RADIX_MPATH) struct radix_node *rn; #else struct rt_addrinfo info; int fib; struct rtentry *rt0; #endif struct ifaddr *ifa; int error = 0; RT_LOCK_ASSERT(rt); RADIX_NODE_HEAD_LOCK_ASSERT(rnh); #ifdef RADIX_MPATH fib = rt->rt_fibnum; bzero(&info, sizeof(info)); info.rti_ifp = rt->rt_ifp; info.rti_flags = RTF_RNH_LOCKED; info.rti_info[RTAX_DST] = rt_key(rt); info.rti_info[RTAX_GATEWAY] = rt->rt_ifa->ifa_addr; RT_UNLOCK(rt); error = rtrequest1_fib(RTM_DELETE, &info, &rt0, fib); if (error == 0 && rt0 != NULL) { rt = rt0; RT_LOCK(rt); } else if (error != 0) { RT_LOCK(rt); return (error); } #else /* * Remove the item from the tree; it should be there, * but when callers invoke us blindly it may not (sigh). */ rn = rnh->rnh_deladdr(rt_key(rt), rt_mask(rt), rnh); if (rn == NULL) { error = ESRCH; goto bad; } KASSERT((rn->rn_flags & (RNF_ACTIVE | RNF_ROOT)) == 0, ("unexpected flags 0x%x", rn->rn_flags)); KASSERT(rt == RNTORT(rn), ("lookup mismatch, rt %p rn %p", rt, rn)); #endif /* RADIX_MPATH */ rt->rt_flags &= ~RTF_UP; /* * Give the protocol a chance to keep things in sync. */ if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest) { struct rt_addrinfo info; bzero((caddr_t)&info, sizeof(info)); info.rti_flags = rt->rt_flags; info.rti_info[RTAX_DST] = rt_key(rt); info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; info.rti_info[RTAX_NETMASK] = rt_mask(rt); ifa->ifa_rtrequest(RTM_DELETE, rt, &info); } /* * one more rtentry floating around that is not * linked to the routing table. */ V_rttrash++; #if !defined(RADIX_MPATH) bad: #endif return (error); } #if 0 int p_sockaddr(char *buf, int buflen, struct sockaddr *s); int rt_print(char *buf, int buflen, struct rtentry *rt); int p_sockaddr(char *buf, int buflen, struct sockaddr *s) { void *paddr = NULL; switch (s->sa_family) { case AF_INET: paddr = &((struct sockaddr_in *)s)->sin_addr; break; case AF_INET6: paddr = &((struct sockaddr_in6 *)s)->sin6_addr; break; } if (paddr == NULL) return (0); if (inet_ntop(s->sa_family, paddr, buf, buflen) == NULL) return (0); return (strlen(buf)); } int rt_print(char *buf, int buflen, struct rtentry *rt) { struct sockaddr *addr, *mask; int i = 0; addr = rt_key(rt); mask = rt_mask(rt); i = p_sockaddr(buf, buflen, addr); if (!(rt->rt_flags & RTF_HOST)) { buf[i++] = '/'; i += p_sockaddr(buf + i, buflen - i, mask); } if (rt->rt_flags & RTF_GATEWAY) { buf[i++] = '>'; i += p_sockaddr(buf + i, buflen - i, rt->rt_gateway); } return (i); } #endif #ifdef RADIX_MPATH static int rn_mpath_update(int req, struct rt_addrinfo *info, struct radix_node_head *rnh, struct rtentry **ret_nrt) { /* * if we got multipath routes, we require users to specify * a matching RTAX_GATEWAY. */ struct rtentry *rt, *rto = NULL; struct radix_node *rn; int error = 0; rn = rnh->rnh_lookup(dst, netmask, rnh); if (rn == NULL) return (ESRCH); rto = rt = RNTORT(rn); rt = rt_mpath_matchgate(rt, gateway); if (rt == NULL) return (ESRCH); /* * this is the first entry in the chain */ if (rto == rt) { rn = rn_mpath_next((struct radix_node *)rt); /* * there is another entry, now it's active */ if (rn) { rto = RNTORT(rn); RT_LOCK(rto); rto->rt_flags |= RTF_UP; RT_UNLOCK(rto); } else if (rt->rt_flags & RTF_GATEWAY) { /* * For gateway routes, we need to * make sure that we we are deleting * the correct gateway. * rt_mpath_matchgate() does not * check the case when there is only * one route in the chain. */ if (gateway && (rt->rt_gateway->sa_len != gateway->sa_len || memcmp(rt->rt_gateway, gateway, gateway->sa_len))) error = ESRCH; else { /* * remove from tree before returning it * to the caller */ rn = rnh->rnh_deladdr(dst, netmask, rnh); KASSERT(rt == RNTORT(rn), ("radix node disappeared")); goto gwdelete; } } /* * use the normal delete code to remove * the first entry */ if (req != RTM_DELETE) goto nondelete; error = ENOENT; goto done; } /* * if the entry is 2nd and on up */ if ((req == RTM_DELETE) && !rt_mpath_deldup(rto, rt)) panic ("rtrequest1: rt_mpath_deldup"); gwdelete: RT_LOCK(rt); RT_ADDREF(rt); if (req == RTM_DELETE) { rt->rt_flags &= ~RTF_UP; /* * One more rtentry floating around that is not * linked to the routing table. rttrash will be decremented * when RTFREE(rt) is eventually called. */ V_rttrash++; } nondelete: if (req != RTM_DELETE) panic("unrecognized request %d", req); /* * If the caller wants it, then it can have it, * but it's up to it to free the rtentry as we won't be * doing it. */ if (ret_nrt) { *ret_nrt = rt; RT_UNLOCK(rt); } else RTFREE_LOCKED(rt); done: return (error); } #endif int rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt, u_int fibnum) { int error = 0, needlock = 0; struct rtentry *rt; #ifdef FLOWTABLE struct rtentry *rt0; #endif struct radix_node *rn; struct radix_node_head *rnh; struct ifaddr *ifa; struct sockaddr *ndst; struct sockaddr_storage mdst; #define senderr(x) { error = x ; goto bad; } KASSERT((fibnum < rt_numfibs), ("rtrequest1_fib: bad fibnum")); switch (dst->sa_family) { case AF_INET6: case AF_INET: /* We support multiple FIBs. */ break; default: fibnum = RT_DEFAULT_FIB; break; } /* * Find the correct routing tree to use for this Address Family */ rnh = rt_tables_get_rnh(fibnum, dst->sa_family); if (rnh == NULL) return (EAFNOSUPPORT); needlock = ((flags & RTF_RNH_LOCKED) == 0); flags &= ~RTF_RNH_LOCKED; if (needlock) RADIX_NODE_HEAD_LOCK(rnh); else RADIX_NODE_HEAD_LOCK_ASSERT(rnh); /* * If we are adding a host route then we don't want to put * a netmask in the tree, nor do we want to clone it. */ if (flags & RTF_HOST) netmask = NULL; switch (req) { case RTM_DELETE: if (netmask) { rt_maskedcopy(dst, (struct sockaddr *)&mdst, netmask); dst = (struct sockaddr *)&mdst; } #ifdef RADIX_MPATH if (rn_mpath_capable(rnh)) { error = rn_mpath_update(req, info, rnh, ret_nrt); /* * "bad" holds true for the success case * as well */ if (error != ENOENT) goto bad; error = 0; } #endif if ((flags & RTF_PINNED) == 0) { /* Check if target route can be deleted */ rt = (struct rtentry *)rnh->rnh_lookup(dst, netmask, rnh); if ((rt != NULL) && (rt->rt_flags & RTF_PINNED)) senderr(EADDRINUSE); } /* * Remove the item from the tree and return it. * Complain if it is not there and do no more processing. */ rn = rnh->rnh_deladdr(dst, netmask, rnh); if (rn == NULL) senderr(ESRCH); if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT)) panic ("rtrequest delete"); rt = RNTORT(rn); RT_LOCK(rt); RT_ADDREF(rt); rt->rt_flags &= ~RTF_UP; /* * give the protocol a chance to keep things in sync. */ if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest) ifa->ifa_rtrequest(RTM_DELETE, rt, info); /* * One more rtentry floating around that is not * linked to the routing table. rttrash will be decremented * when RTFREE(rt) is eventually called. */ V_rttrash++; /* * If the caller wants it, then it can have it, * but it's up to it to free the rtentry as we won't be * doing it. */ if (ret_nrt) { *ret_nrt = rt; RT_UNLOCK(rt); } else RTFREE_LOCKED(rt); break; case RTM_RESOLVE: /* * resolve was only used for route cloning * here for compat */ break; case RTM_ADD: if ((flags & RTF_GATEWAY) && !gateway) senderr(EINVAL); if (dst && gateway && (dst->sa_family != gateway->sa_family) && (gateway->sa_family != AF_UNSPEC) && (gateway->sa_family != AF_LINK)) senderr(EINVAL); if (info->rti_ifa == NULL) { error = rt_getifa_fib(info, fibnum); if (error) senderr(error); } else ifa_ref(info->rti_ifa); ifa = info->rti_ifa; rt = uma_zalloc(V_rtzone, M_NOWAIT); if (rt == NULL) { ifa_free(ifa); senderr(ENOBUFS); } rt->rt_flags = RTF_UP | flags; rt->rt_fibnum = fibnum; /* * Add the gateway. Possibly re-malloc-ing the storage for it. */ RT_LOCK(rt); if ((error = rt_setgate(rt, dst, gateway)) != 0) { ifa_free(ifa); uma_zfree(V_rtzone, rt); senderr(error); } /* * point to the (possibly newly malloc'd) dest address. */ ndst = (struct sockaddr *)rt_key(rt); /* * make sure it contains the value we want (masked if needed). */ if (netmask) { rt_maskedcopy(dst, ndst, netmask); } else bcopy(dst, ndst, dst->sa_len); /* * We use the ifa reference returned by rt_getifa_fib(). * This moved from below so that rnh->rnh_addaddr() can * examine the ifa and ifa->ifa_ifp if it so desires. */ rt->rt_ifa = ifa; rt->rt_ifp = ifa->ifa_ifp; rt->rt_weight = 1; #ifdef RADIX_MPATH /* do not permit exactly the same dst/mask/gw pair */ if (rn_mpath_capable(rnh) && rt_mpath_conflict(rnh, rt, netmask)) { ifa_free(rt->rt_ifa); Free(rt_key(rt)); uma_zfree(V_rtzone, rt); senderr(EEXIST); } #endif #ifdef FLOWTABLE rt0 = NULL; /* "flow-table" only supports IPv6 and IPv4 at the moment. */ switch (dst->sa_family) { #ifdef INET6 case AF_INET6: #endif #ifdef INET case AF_INET: #endif #if defined(INET6) || defined(INET) rn = rnh->rnh_matchaddr(dst, rnh); if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) { struct sockaddr *mask; u_char *m, *n; int len; /* * compare mask to see if the new route is * more specific than the existing one */ rt0 = RNTORT(rn); RT_LOCK(rt0); RT_ADDREF(rt0); RT_UNLOCK(rt0); /* * A host route is already present, so * leave the flow-table entries as is. */ if (rt0->rt_flags & RTF_HOST) { RTFREE(rt0); rt0 = NULL; } else if (!(flags & RTF_HOST) && netmask) { mask = rt_mask(rt0); len = mask->sa_len; m = (u_char *)mask; n = (u_char *)netmask; while (len-- > 0) { if (*n != *m) break; n++; m++; } if (len == 0 || (*n < *m)) { RTFREE(rt0); rt0 = NULL; } } } #endif/* INET6 || INET */ } #endif /* FLOWTABLE */ /* XXX mtu manipulation will be done in rnh_addaddr -- itojun */ rn = rnh->rnh_addaddr(ndst, netmask, rnh, rt->rt_nodes); /* * If it still failed to go into the tree, * then un-make it (this should be a function) */ if (rn == NULL) { ifa_free(rt->rt_ifa); Free(rt_key(rt)); uma_zfree(V_rtzone, rt); #ifdef FLOWTABLE if (rt0 != NULL) RTFREE(rt0); #endif senderr(EEXIST); } #ifdef FLOWTABLE else if (rt0 != NULL) { flowtable_route_flush(dst->sa_family, rt0); RTFREE(rt0); } #endif /* * If this protocol has something to add to this then * allow it to do that as well. */ if (ifa->ifa_rtrequest) ifa->ifa_rtrequest(req, rt, info); rt_setmetrics(info, rt); /* * actually return a resultant rtentry and * give the caller a single reference. */ if (ret_nrt) { *ret_nrt = rt; RT_ADDREF(rt); } RT_UNLOCK(rt); break; case RTM_CHANGE: error = rtrequest1_fib_change(rnh, info, ret_nrt, fibnum); break; default: error = EOPNOTSUPP; } bad: if (needlock) RADIX_NODE_HEAD_UNLOCK(rnh); return (error); #undef senderr } #undef dst #undef gateway #undef netmask #undef ifaaddr #undef ifpaddr #undef flags static int rtrequest1_fib_change(struct radix_node_head *rnh, struct rt_addrinfo *info, struct rtentry **ret_nrt, u_int fibnum) { struct rtentry *rt = NULL; int error = 0; int free_ifa = 0; rt = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST], info->rti_info[RTAX_NETMASK], rnh); if (rt == NULL) return (ESRCH); #ifdef RADIX_MPATH /* * If we got multipath routes, * we require users to specify a matching RTAX_GATEWAY. */ if (rn_mpath_capable(rnh)) { rt = rt_mpath_matchgate(rt, info->rti_info[RTAX_GATEWAY]); if (rt == NULL) return (ESRCH); } #endif RT_LOCK(rt); /* * New gateway could require new ifaddr, ifp; * flags may also be different; ifp may be specified * by ll sockaddr when protocol address is ambiguous */ if (((rt->rt_flags & RTF_GATEWAY) && info->rti_info[RTAX_GATEWAY] != NULL) || info->rti_info[RTAX_IFP] != NULL || (info->rti_info[RTAX_IFA] != NULL && !sa_equal(info->rti_info[RTAX_IFA], rt->rt_ifa->ifa_addr))) { error = rt_getifa_fib(info, fibnum); if (info->rti_ifa != NULL) free_ifa = 1; if (error != 0) goto bad; } /* Check if outgoing interface has changed */ if (info->rti_ifa != NULL && info->rti_ifa != rt->rt_ifa && rt->rt_ifa != NULL && rt->rt_ifa->ifa_rtrequest != NULL) { rt->rt_ifa->ifa_rtrequest(RTM_DELETE, rt, info); ifa_free(rt->rt_ifa); } /* Update gateway address */ if (info->rti_info[RTAX_GATEWAY] != NULL) { error = rt_setgate(rt, rt_key(rt), info->rti_info[RTAX_GATEWAY]); if (error != 0) goto bad; rt->rt_flags &= ~RTF_GATEWAY; rt->rt_flags |= (RTF_GATEWAY & info->rti_flags); } if (info->rti_ifa != NULL && info->rti_ifa != rt->rt_ifa) { ifa_ref(info->rti_ifa); rt->rt_ifa = info->rti_ifa; rt->rt_ifp = info->rti_ifp; } /* Allow some flags to be toggled on change. */ rt->rt_flags &= ~RTF_FMASK; rt->rt_flags |= info->rti_flags & RTF_FMASK; if (rt->rt_ifa && rt->rt_ifa->ifa_rtrequest != NULL) rt->rt_ifa->ifa_rtrequest(RTM_ADD, rt, info); rt_setmetrics(info, rt); if (ret_nrt) { *ret_nrt = rt; RT_ADDREF(rt); } bad: RT_UNLOCK(rt); if (free_ifa != 0) ifa_free(info->rti_ifa); return (error); } static void rt_setmetrics(const struct rt_addrinfo *info, struct rtentry *rt) { if (info->rti_mflags & RTV_MTU) rt->rt_mtu = info->rti_rmx->rmx_mtu; if (info->rti_mflags & RTV_WEIGHT) rt->rt_weight = info->rti_rmx->rmx_weight; /* Kernel -> userland timebase conversion. */ if (info->rti_mflags & RTV_EXPIRE) rt->rt_expire = info->rti_rmx->rmx_expire ? info->rti_rmx->rmx_expire - time_second + time_uptime : 0; } int rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate) { /* XXX dst may be overwritten, can we move this to below */ int dlen = SA_SIZE(dst), glen = SA_SIZE(gate); #ifdef INVARIANTS struct radix_node_head *rnh; rnh = rt_tables_get_rnh(rt->rt_fibnum, dst->sa_family); #endif RT_LOCK_ASSERT(rt); RADIX_NODE_HEAD_LOCK_ASSERT(rnh); /* * Prepare to store the gateway in rt->rt_gateway. * Both dst and gateway are stored one after the other in the same * malloc'd chunk. If we have room, we can reuse the old buffer, * rt_gateway already points to the right place. * Otherwise, malloc a new block and update the 'dst' address. */ if (rt->rt_gateway == NULL || glen > SA_SIZE(rt->rt_gateway)) { caddr_t new; R_Malloc(new, caddr_t, dlen + glen); if (new == NULL) return ENOBUFS; /* * XXX note, we copy from *dst and not *rt_key(rt) because * rt_setgate() can be called to initialize a newly * allocated route entry, in which case rt_key(rt) == NULL * (and also rt->rt_gateway == NULL). * Free()/free() handle a NULL argument just fine. */ bcopy(dst, new, dlen); Free(rt_key(rt)); /* free old block, if any */ rt_key(rt) = (struct sockaddr *)new; rt->rt_gateway = (struct sockaddr *)(new + dlen); } /* * Copy the new gateway value into the memory chunk. */ bcopy(gate, rt->rt_gateway, glen); return (0); } void rt_maskedcopy(struct sockaddr *src, struct sockaddr *dst, struct sockaddr *netmask) { u_char *cp1 = (u_char *)src; u_char *cp2 = (u_char *)dst; u_char *cp3 = (u_char *)netmask; u_char *cplim = cp2 + *cp3; u_char *cplim2 = cp2 + *cp1; *cp2++ = *cp1++; *cp2++ = *cp1++; /* copies sa_len & sa_family */ cp3 += 2; if (cplim > cplim2) cplim = cplim2; while (cp2 < cplim) *cp2++ = *cp1++ & *cp3++; if (cp2 < cplim2) bzero((caddr_t)cp2, (unsigned)(cplim2 - cp2)); } /* * Set up a routing table entry, normally * for an interface. */ #define _SOCKADDR_TMPSIZE 128 /* Not too big.. kernel stack size is limited */ static inline int rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum) { struct sockaddr *dst; struct sockaddr *netmask; struct rtentry *rt = NULL; struct rt_addrinfo info; int error = 0; int startfib, endfib; char tempbuf[_SOCKADDR_TMPSIZE]; int didwork = 0; int a_failure = 0; static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK}; struct radix_node_head *rnh; if (flags & RTF_HOST) { dst = ifa->ifa_dstaddr; netmask = NULL; } else { dst = ifa->ifa_addr; netmask = ifa->ifa_netmask; } if (dst->sa_len == 0) return(EINVAL); switch (dst->sa_family) { case AF_INET6: case AF_INET: /* We support multiple FIBs. */ break; default: fibnum = RT_DEFAULT_FIB; break; } if (fibnum == RT_ALL_FIBS) { if (rt_add_addr_allfibs == 0 && cmd == (int)RTM_ADD) { startfib = endfib = ifa->ifa_ifp->if_fib; } else { startfib = 0; endfib = rt_numfibs - 1; } } else { KASSERT((fibnum < rt_numfibs), ("rtinit1: bad fibnum")); startfib = fibnum; endfib = fibnum; } /* * If it's a delete, check that if it exists, * it's on the correct interface or we might scrub * a route to another ifa which would * be confusing at best and possibly worse. */ if (cmd == RTM_DELETE) { /* * It's a delete, so it should already exist.. * If it's a net, mask off the host bits * (Assuming we have a mask) * XXX this is kinda inet specific.. */ if (netmask != NULL) { rt_maskedcopy(dst, (struct sockaddr *)tempbuf, netmask); dst = (struct sockaddr *)tempbuf; } } /* * Now go through all the requested tables (fibs) and do the * requested action. Realistically, this will either be fib 0 * for protocols that don't do multiple tables or all the * tables for those that do. */ for ( fibnum = startfib; fibnum <= endfib; fibnum++) { if (cmd == RTM_DELETE) { struct radix_node *rn; /* * Look up an rtentry that is in the routing tree and * contains the correct info. */ rnh = rt_tables_get_rnh(fibnum, dst->sa_family); if (rnh == NULL) /* this table doesn't exist but others might */ continue; RADIX_NODE_HEAD_RLOCK(rnh); rn = rnh->rnh_lookup(dst, netmask, rnh); #ifdef RADIX_MPATH if (rn_mpath_capable(rnh)) { if (rn == NULL) error = ESRCH; else { rt = RNTORT(rn); /* * for interface route the * rt->rt_gateway is sockaddr_intf * for cloning ARP entries, so * rt_mpath_matchgate must use the * interface address */ rt = rt_mpath_matchgate(rt, ifa->ifa_addr); if (rt == NULL) error = ESRCH; } } #endif error = (rn == NULL || (rn->rn_flags & RNF_ROOT) || RNTORT(rn)->rt_ifa != ifa); RADIX_NODE_HEAD_RUNLOCK(rnh); if (error) { /* this is only an error if bad on ALL tables */ continue; } } /* * Do the actual request */ bzero((caddr_t)&info, sizeof(info)); info.rti_ifa = ifa; info.rti_flags = flags | (ifa->ifa_flags & ~IFA_RTSELF) | RTF_PINNED; info.rti_info[RTAX_DST] = dst; /* * doing this for compatibility reasons */ if (cmd == RTM_ADD) info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&null_sdl; else info.rti_info[RTAX_GATEWAY] = ifa->ifa_addr; info.rti_info[RTAX_NETMASK] = netmask; error = rtrequest1_fib(cmd, &info, &rt, fibnum); if ((error == EEXIST) && (cmd == RTM_ADD)) { /* * Interface route addition failed. * Atomically delete current prefix generating * RTM_DELETE message, and retry adding * interface prefix. */ rnh = rt_tables_get_rnh(fibnum, dst->sa_family); RADIX_NODE_HEAD_LOCK(rnh); /* Delete old prefix */ info.rti_ifa = NULL; info.rti_flags = RTF_RNH_LOCKED; error = rtrequest1_fib(RTM_DELETE, &info, NULL, fibnum); if (error == 0) { info.rti_ifa = ifa; info.rti_flags = flags | RTF_RNH_LOCKED | (ifa->ifa_flags & ~IFA_RTSELF) | RTF_PINNED; error = rtrequest1_fib(cmd, &info, &rt, fibnum); } RADIX_NODE_HEAD_UNLOCK(rnh); } if (error == 0 && rt != NULL) { /* * notify any listening routing agents of the change */ RT_LOCK(rt); #ifdef RADIX_MPATH /* * in case address alias finds the first address * e.g. ifconfig bge0 192.0.2.246/24 * e.g. ifconfig bge0 192.0.2.247/24 * the address set in the route is 192.0.2.246 * so we need to replace it with 192.0.2.247 */ if (memcmp(rt->rt_ifa->ifa_addr, ifa->ifa_addr, ifa->ifa_addr->sa_len)) { ifa_free(rt->rt_ifa); ifa_ref(ifa); rt->rt_ifp = ifa->ifa_ifp; rt->rt_ifa = ifa; } #endif /* * doing this for compatibility reasons */ if (cmd == RTM_ADD) { ((struct sockaddr_dl *)rt->rt_gateway)->sdl_type = rt->rt_ifp->if_type; ((struct sockaddr_dl *)rt->rt_gateway)->sdl_index = rt->rt_ifp->if_index; } RT_ADDREF(rt); RT_UNLOCK(rt); rt_newaddrmsg_fib(cmd, ifa, error, rt, fibnum); RT_LOCK(rt); RT_REMREF(rt); if (cmd == RTM_DELETE) { /* * If we are deleting, and we found an entry, * then it's been removed from the tree.. * now throw it away. */ RTFREE_LOCKED(rt); } else { if (cmd == RTM_ADD) { /* * We just wanted to add it.. * we don't actually need a reference. */ RT_REMREF(rt); } RT_UNLOCK(rt); } didwork = 1; } if (error) a_failure = error; } if (cmd == RTM_DELETE) { if (didwork) { error = 0; } else { /* we only give an error if it wasn't in any table */ error = ((flags & RTF_HOST) ? EHOSTUNREACH : ENETUNREACH); } } else { if (a_failure) { /* return an error if any of them failed */ error = a_failure; } } return (error); } /* * Set up a routing table entry, normally * for an interface. */ int rtinit(struct ifaddr *ifa, int cmd, int flags) { struct sockaddr *dst; int fib = RT_DEFAULT_FIB; if (flags & RTF_HOST) { dst = ifa->ifa_dstaddr; } else { dst = ifa->ifa_addr; } switch (dst->sa_family) { case AF_INET6: case AF_INET: /* We do support multiple FIBs. */ fib = RT_ALL_FIBS; break; } return (rtinit1(ifa, cmd, flags, fib)); } /* * Announce interface address arrival/withdraw * Returns 0 on success. */ int rt_addrmsg(int cmd, struct ifaddr *ifa, int fibnum) { KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE, ("unexpected cmd %d", cmd)); KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs), ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs)); #if defined(INET) || defined(INET6) #ifdef SCTP /* * notify the SCTP stack * this will only get called when an address is added/deleted * XXX pass the ifaddr struct instead if ifa->ifa_addr... */ sctp_addr_change(ifa, cmd); #endif /* SCTP */ #endif return (rtsock_addrmsg(cmd, ifa, fibnum)); } /* * Announce route addition/removal. * Users of this function MUST validate input data BEFORE calling. * However we have to be able to handle invalid data: * if some userland app sends us "invalid" route message (invalid mask, * no dst, wrong address families, etc...) we need to pass it back * to app (and any other rtsock consumers) with rtm_errno field set to * non-zero value. * Returns 0 on success. */ int rt_routemsg(int cmd, struct ifnet *ifp, int error, struct rtentry *rt, int fibnum) { KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE, ("unexpected cmd %d", cmd)); KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs), ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs)); KASSERT(rt_key(rt) != NULL, (":%s: rt_key must be supplied", __func__)); return (rtsock_routemsg(cmd, ifp, error, rt, fibnum)); } void rt_newaddrmsg(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt) { rt_newaddrmsg_fib(cmd, ifa, error, rt, RT_ALL_FIBS); } /* * This is called to generate messages from the routing socket * indicating a network interface has had addresses associated with it. */ void rt_newaddrmsg_fib(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt, int fibnum) { KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE, ("unexpected cmd %u", cmd)); KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs), ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs)); if (cmd == RTM_ADD) { rt_addrmsg(cmd, ifa, fibnum); if (rt != NULL) rt_routemsg(cmd, ifa->ifa_ifp, error, rt, fibnum); } else { if (rt != NULL) rt_routemsg(cmd, ifa->ifa_ifp, error, rt, fibnum); rt_addrmsg(cmd, ifa, fibnum); } } Index: user/ae/inet6/sys/net/rtsock.c =================================================================== --- user/ae/inet6/sys/net/rtsock.c (revision 271452) +++ user/ae/inet6/sys/net/rtsock.c (revision 271453) @@ -1,1934 +1,1935 @@ /*- * Copyright (c) 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)rtsock.c 8.7 (Berkeley) 10/12/95 * $FreeBSD$ */ #include "opt_compat.h" #include "opt_mpath.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #endif #ifdef COMPAT_FREEBSD32 #include #include struct if_msghdr32 { uint16_t ifm_msglen; uint8_t ifm_version; uint8_t ifm_type; int32_t ifm_addrs; int32_t ifm_flags; uint16_t ifm_index; struct if_data ifm_data; }; struct if_msghdrl32 { uint16_t ifm_msglen; uint8_t ifm_version; uint8_t ifm_type; int32_t ifm_addrs; int32_t ifm_flags; uint16_t ifm_index; uint16_t _ifm_spare1; uint16_t ifm_len; uint16_t ifm_data_off; struct if_data ifm_data; }; struct ifa_msghdrl32 { uint16_t ifam_msglen; uint8_t ifam_version; uint8_t ifam_type; int32_t ifam_addrs; int32_t ifam_flags; uint16_t ifam_index; uint16_t _ifam_spare1; uint16_t ifam_len; uint16_t ifam_data_off; int32_t ifam_metric; struct if_data ifam_data; }; #endif /* COMPAT_FREEBSD32 */ MALLOC_DEFINE(M_RTABLE, "routetbl", "routing tables"); /* NB: these are not modified */ static struct sockaddr route_src = { 2, PF_ROUTE, }; static struct sockaddr sa_zero = { sizeof(sa_zero), AF_INET, }; /* These are external hooks for CARP. */ int (*carp_get_vhid_p)(struct ifaddr *); /* * Used by rtsock/raw_input callback code to decide whether to filter the update * notification to a socket bound to a particular FIB. */ #define RTS_FILTER_FIB M_PROTO8 typedef struct { int ip_count; /* attached w/ AF_INET */ int ip6_count; /* attached w/ AF_INET6 */ int any_count; /* total attached */ } route_cb_t; static VNET_DEFINE(route_cb_t, route_cb); #define V_route_cb VNET(route_cb) struct mtx rtsock_mtx; MTX_SYSINIT(rtsock, &rtsock_mtx, "rtsock route_cb lock", MTX_DEF); #define RTSOCK_LOCK() mtx_lock(&rtsock_mtx) #define RTSOCK_UNLOCK() mtx_unlock(&rtsock_mtx) #define RTSOCK_LOCK_ASSERT() mtx_assert(&rtsock_mtx, MA_OWNED) static SYSCTL_NODE(_net, OID_AUTO, route, CTLFLAG_RD, 0, ""); struct walkarg { int w_tmemsize; int w_op, w_arg; caddr_t w_tmem; struct sysctl_req *w_req; }; static void rts_input(struct mbuf *m); static struct mbuf *rtsock_msg_mbuf(int type, struct rt_addrinfo *rtinfo); static int rtsock_msg_buffer(int type, struct rt_addrinfo *rtinfo, struct walkarg *w, int *plen); static int rt_xaddrs(caddr_t cp, caddr_t cplim, struct rt_addrinfo *rtinfo); static int sysctl_dumpentry(struct radix_node *rn, void *vw); static int sysctl_iflist(int af, struct walkarg *w); static int sysctl_ifmalist(int af, struct walkarg *w); static int route_output(struct mbuf *m, struct socket *so, ...); static void rt_getmetrics(const struct rtentry *rt, struct rt_metrics *out); static void rt_dispatch(struct mbuf *, sa_family_t); static struct sockaddr *rtsock_fix_netmask(struct sockaddr *dst, struct sockaddr *smask, struct sockaddr_storage *dmask); static struct netisr_handler rtsock_nh = { .nh_name = "rtsock", .nh_handler = rts_input, .nh_proto = NETISR_ROUTE, .nh_policy = NETISR_POLICY_SOURCE, }; static int sysctl_route_netisr_maxqlen(SYSCTL_HANDLER_ARGS) { int error, qlimit; netisr_getqlimit(&rtsock_nh, &qlimit); error = sysctl_handle_int(oidp, &qlimit, 0, req); if (error || !req->newptr) return (error); if (qlimit < 1) return (EINVAL); return (netisr_setqlimit(&rtsock_nh, qlimit)); } SYSCTL_PROC(_net_route, OID_AUTO, netisr_maxqlen, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_route_netisr_maxqlen, "I", "maximum routing socket dispatch queue length"); static void rts_init(void) { int tmp; if (TUNABLE_INT_FETCH("net.route.netisr_maxqlen", &tmp)) rtsock_nh.nh_qlimit = tmp; netisr_register(&rtsock_nh); } SYSINIT(rtsock, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rts_init, 0); static int raw_input_rts_cb(struct mbuf *m, struct sockproto *proto, struct sockaddr *src, struct rawcb *rp) { int fibnum; KASSERT(m != NULL, ("%s: m is NULL", __func__)); KASSERT(proto != NULL, ("%s: proto is NULL", __func__)); KASSERT(rp != NULL, ("%s: rp is NULL", __func__)); /* No filtering requested. */ if ((m->m_flags & RTS_FILTER_FIB) == 0) return (0); /* Check if it is a rts and the fib matches the one of the socket. */ fibnum = M_GETFIB(m); if (proto->sp_family != PF_ROUTE || rp->rcb_socket == NULL || rp->rcb_socket->so_fibnum == fibnum) return (0); /* Filtering requested and no match, the socket shall be skipped. */ return (1); } static void rts_input(struct mbuf *m) { struct sockproto route_proto; unsigned short *family; struct m_tag *tag; route_proto.sp_family = PF_ROUTE; tag = m_tag_find(m, PACKET_TAG_RTSOCKFAM, NULL); if (tag != NULL) { family = (unsigned short *)(tag + 1); route_proto.sp_protocol = *family; m_tag_delete(m, tag); } else route_proto.sp_protocol = 0; raw_input_ext(m, &route_proto, &route_src, raw_input_rts_cb); } /* * It really doesn't make any sense at all for this code to share much * with raw_usrreq.c, since its functionality is so restricted. XXX */ static void rts_abort(struct socket *so) { raw_usrreqs.pru_abort(so); } static void rts_close(struct socket *so) { raw_usrreqs.pru_close(so); } /* pru_accept is EOPNOTSUPP */ static int rts_attach(struct socket *so, int proto, struct thread *td) { struct rawcb *rp; int error; KASSERT(so->so_pcb == NULL, ("rts_attach: so_pcb != NULL")); /* XXX */ rp = malloc(sizeof *rp, M_PCB, M_WAITOK | M_ZERO); if (rp == NULL) return ENOBUFS; so->so_pcb = (caddr_t)rp; so->so_fibnum = td->td_proc->p_fibnum; error = raw_attach(so, proto); rp = sotorawcb(so); if (error) { so->so_pcb = NULL; free(rp, M_PCB); return error; } RTSOCK_LOCK(); switch(rp->rcb_proto.sp_protocol) { case AF_INET: V_route_cb.ip_count++; break; case AF_INET6: V_route_cb.ip6_count++; break; } V_route_cb.any_count++; RTSOCK_UNLOCK(); soisconnected(so); so->so_options |= SO_USELOOPBACK; return 0; } static int rts_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { return (raw_usrreqs.pru_bind(so, nam, td)); /* xxx just EINVAL */ } static int rts_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { return (raw_usrreqs.pru_connect(so, nam, td)); /* XXX just EINVAL */ } /* pru_connect2 is EOPNOTSUPP */ /* pru_control is EOPNOTSUPP */ static void rts_detach(struct socket *so) { struct rawcb *rp = sotorawcb(so); KASSERT(rp != NULL, ("rts_detach: rp == NULL")); RTSOCK_LOCK(); switch(rp->rcb_proto.sp_protocol) { case AF_INET: V_route_cb.ip_count--; break; case AF_INET6: V_route_cb.ip6_count--; break; } V_route_cb.any_count--; RTSOCK_UNLOCK(); raw_usrreqs.pru_detach(so); } static int rts_disconnect(struct socket *so) { return (raw_usrreqs.pru_disconnect(so)); } /* pru_listen is EOPNOTSUPP */ static int rts_peeraddr(struct socket *so, struct sockaddr **nam) { return (raw_usrreqs.pru_peeraddr(so, nam)); } /* pru_rcvd is EOPNOTSUPP */ /* pru_rcvoob is EOPNOTSUPP */ static int rts_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { return (raw_usrreqs.pru_send(so, flags, m, nam, control, td)); } /* pru_sense is null */ static int rts_shutdown(struct socket *so) { return (raw_usrreqs.pru_shutdown(so)); } static int rts_sockaddr(struct socket *so, struct sockaddr **nam) { return (raw_usrreqs.pru_sockaddr(so, nam)); } static struct pr_usrreqs route_usrreqs = { .pru_abort = rts_abort, .pru_attach = rts_attach, .pru_bind = rts_bind, .pru_connect = rts_connect, .pru_detach = rts_detach, .pru_disconnect = rts_disconnect, .pru_peeraddr = rts_peeraddr, .pru_send = rts_send, .pru_shutdown = rts_shutdown, .pru_sockaddr = rts_sockaddr, .pru_close = rts_close, }; #ifndef _SOCKADDR_UNION_DEFINED #define _SOCKADDR_UNION_DEFINED /* * The union of all possible address formats we handle. */ union sockaddr_union { struct sockaddr sa; struct sockaddr_in sin; struct sockaddr_in6 sin6; }; #endif /* _SOCKADDR_UNION_DEFINED */ static int rtm_get_jailed(struct rt_addrinfo *info, struct ifnet *ifp, struct rtentry *rt, union sockaddr_union *saun, struct ucred *cred) { /* First, see if the returned address is part of the jail. */ if (prison_if(cred, rt->rt_ifa->ifa_addr) == 0) { info->rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr; return (0); } switch (info->rti_info[RTAX_DST]->sa_family) { #ifdef INET case AF_INET: { struct in_addr ia; struct ifaddr *ifa; int found; found = 0; /* * Try to find an address on the given outgoing interface * that belongs to the jail. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa; sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; ia = ((struct sockaddr_in *)sa)->sin_addr; if (prison_check_ip4(cred, &ia) == 0) { found = 1; break; } } IF_ADDR_RUNLOCK(ifp); if (!found) { /* * As a last resort return the 'default' jail address. */ ia = ((struct sockaddr_in *)rt->rt_ifa->ifa_addr)-> sin_addr; if (prison_get_ip4(cred, &ia) != 0) return (ESRCH); } bzero(&saun->sin, sizeof(struct sockaddr_in)); saun->sin.sin_len = sizeof(struct sockaddr_in); saun->sin.sin_family = AF_INET; saun->sin.sin_addr.s_addr = ia.s_addr; info->rti_info[RTAX_IFA] = (struct sockaddr *)&saun->sin; break; } #endif #ifdef INET6 case AF_INET6: { struct ifaddr *ifa; int found; found = 0; /* * Try to find an address on the given outgoing interface * that belongs to the jail. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (prison_if(cred, ifa->ifa_addr) == 0) { found = 1; break; } } IF_ADDR_RUNLOCK(ifp); bzero(&saun->sin6, sizeof(struct sockaddr_in6)); saun->sin6.sin6_len = sizeof(struct sockaddr_in6); saun->sin6.sin6_family = AF_INET6; if (!found) { /* * As a last resort return the 'default' jail address. */ if (prison_get_ip6(cred, &saun->sin6) != 0) return (ESRCH); } else { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)ifa->ifa_addr; saun->sin6.sin6_addr = sin6->sin6_addr; saun->sin6.sin6_scope_id = sin6->sin6_scope_id; } info->rti_info[RTAX_IFA] = (struct sockaddr *)&saun->sin6; break; } #endif default: return (ESRCH); } return (0); } #ifdef INET6 static int in6_rt_handle_lla(struct rt_msghdr **ortm, struct rt_addrinfo *info) { struct sockaddr_dl sdl; struct sockaddr_in6 *sin6; struct rt_msghdr *rtm = *ortm; struct llentry *lle; struct ifnet *ifp; int i; if (rtm->rtm_type != RTM_GET) return (EOPNOTSUPP); sin6 = (struct sockaddr_in6 *)info->rti_info[RTAX_DST]; if (sin6->sin6_scope_id == 0) return (EADDRNOTAVAIL); ifp = in6_getlinkifnet(sin6->sin6_scope_id); if (ifp == NULL) return (ESRCH); /* Clear all sockaddr pointers except DST */ for (i = RTAX_GATEWAY; i < RTAX_MAX; i++) info->rti_info[i] = NULL; IF_AFDATA_RLOCK(ifp); lle = lla_lookup(LLTABLE6(ifp), 0, info->rti_info[RTAX_DST]); IF_AFDATA_RUNLOCK(ifp); if (lle != NULL) { bzero(&sdl, sizeof(sdl)); info->rti_info[RTAX_GATEWAY] = (struct sockaddr *)&sdl; sdl.sdl_family = AF_LINK; sdl.sdl_len = sizeof(sdl); sdl.sdl_alen = ifp->if_addrlen; sdl.sdl_index = ifp->if_index; sdl.sdl_type = ifp->if_type; bcopy(&lle->ll_addr, LLADDR(&sdl), ifp->if_addrlen); if (lle->la_flags & LLE_PUB) rtm->rtm_flags |= RTF_ANNOUNCE; if (lle->la_flags & LLE_STATIC) { rtm->rtm_flags |= RTF_STATIC; rtm->rtm_rmx.rmx_expire = 0; } else rtm->rtm_rmx.rmx_expire = lle->la_expire; LLE_RUNLOCK(lle); } else info->rti_info[RTAX_GATEWAY] = ifp->if_addr->ifa_addr; rtm->rtm_flags |= RTF_UP | RTF_HOST; rtm->rtm_index = ifp->if_index; if (rtm->rtm_addrs & (RTA_IFA | RTA_IFP)) info->rti_info[RTAX_IFP] = ifp->if_addr->ifa_addr; i = rt_msg2(rtm->rtm_type, info, NULL, NULL); if (i > rtm->rtm_msglen) { R_Malloc(rtm, struct rt_msghdr *, i); if (rtm == NULL) return (ENOBUFS); bcopy(*ortm, rtm, i); Free(*ortm); *ortm = rtm; } rt_msg2(rtm->rtm_type, info, (caddr_t)rtm, NULL); rtm->rtm_addrs = info->rti_addrs; return (0); } #endif /*ARGSUSED*/ static int route_output(struct mbuf *m, struct socket *so, ...) { struct rt_msghdr *rtm = NULL; struct rtentry *rt = NULL; struct radix_node_head *rnh; struct rt_addrinfo info; int alloc_len = 0, len, error = 0, fibnum; struct ifnet *ifp = NULL; union sockaddr_union saun; sa_family_t saf = AF_UNSPEC; struct rawcb *rp = NULL; struct walkarg w; fibnum = so->so_fibnum; #define senderr(e) { error = e; goto flush;} if (m == NULL || ((m->m_len < sizeof(long)) && (m = m_pullup(m, sizeof(long))) == NULL)) return (ENOBUFS); if ((m->m_flags & M_PKTHDR) == 0) panic("route_output"); len = m->m_pkthdr.len; if (len < sizeof(*rtm) || len != mtod(m, struct rt_msghdr *)->rtm_msglen) senderr(EINVAL); /* * Most of current messages are in range 200-240 bytes, * minimize possible re-allocation on reply using larger size * buffer aligned on 1k boundaty. */ alloc_len = roundup2(len, 1024); if ((rtm = malloc(alloc_len, M_TEMP, M_NOWAIT)) == NULL) senderr(ENOBUFS); m_copydata(m, 0, len, (caddr_t)rtm); bzero(&info, sizeof(info)); bzero(&w, sizeof(w)); if (rtm->rtm_version != RTM_VERSION) { /* Do not touch message since format is unknown */ free(rtm, M_TEMP); rtm = NULL; senderr(EPROTONOSUPPORT); } /* * Starting from here, it is possible * to alter original message and insert * caller PID and error value. */ rtm->rtm_pid = curproc->p_pid; info.rti_addrs = rtm->rtm_addrs; info.rti_mflags = rtm->rtm_inits; info.rti_rmx = &rtm->rtm_rmx; /* * rt_xaddrs() performs s6_addr[2] := sin6_scope_id for AF_INET6 * link-local address because rtrequest requires addresses with * embedded scope id. */ if (rt_xaddrs((caddr_t)(rtm + 1), len + (caddr_t)rtm, &info)) senderr(EINVAL); info.rti_flags = rtm->rtm_flags; if (info.rti_info[RTAX_DST] == NULL || info.rti_info[RTAX_DST]->sa_family >= AF_MAX || (info.rti_info[RTAX_GATEWAY] != NULL && info.rti_info[RTAX_GATEWAY]->sa_family >= AF_MAX)) senderr(EINVAL); saf = info.rti_info[RTAX_DST]->sa_family; /* * Verify that the caller has the appropriate privilege; RTM_GET * is the only operation the non-superuser is allowed. */ if (rtm->rtm_type != RTM_GET) { error = priv_check(curthread, PRIV_NET_ROUTE); if (error) senderr(error); } /* * The given gateway address may be an interface address. * For example, issuing a "route change" command on a route * entry that was created from a tunnel, and the gateway * address given is the local end point. In this case the * RTF_GATEWAY flag must be cleared or the destination will * not be reachable even though there is no error message. */ if (info.rti_info[RTAX_GATEWAY] != NULL && info.rti_info[RTAX_GATEWAY]->sa_family != AF_LINK) { struct route gw_ro; bzero(&gw_ro, sizeof(gw_ro)); gw_ro.ro_dst = *info.rti_info[RTAX_GATEWAY]; rtalloc_ign_fib(&gw_ro, 0, fibnum); /* * A host route through the loopback interface is * installed for each interface adddress. In pre 8.0 * releases the interface address of a PPP link type * is not reachable locally. This behavior is fixed as * part of the new L2/L3 redesign and rewrite work. The * signature of this interface address route is the * AF_LINK sa_family type of the rt_gateway, and the * rt_ifp has the IFF_LOOPBACK flag set. */ if (gw_ro.ro_rt != NULL && gw_ro.ro_rt->rt_gateway->sa_family == AF_LINK && gw_ro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) { info.rti_flags &= ~RTF_GATEWAY; info.rti_flags |= RTF_GWFLAG_COMPAT; } if (gw_ro.ro_rt != NULL) RTFREE(gw_ro.ro_rt); } switch (rtm->rtm_type) { struct rtentry *saved_nrt; case RTM_ADD: case RTM_CHANGE: if (info.rti_info[RTAX_GATEWAY] == NULL) senderr(EINVAL); saved_nrt = NULL; /* support for new ARP code */ if (info.rti_info[RTAX_GATEWAY]->sa_family == AF_LINK && (rtm->rtm_flags & RTF_LLDATA) != 0) { error = lla_rt_output(rtm, &info); break; } error = rtrequest1_fib(rtm->rtm_type, &info, &saved_nrt, fibnum); if (error == 0 && saved_nrt != NULL) { RT_LOCK(saved_nrt); rtm->rtm_index = saved_nrt->rt_ifp->if_index; RT_REMREF(saved_nrt); RT_UNLOCK(saved_nrt); } break; case RTM_DELETE: saved_nrt = NULL; /* support for new ARP code */ if (info.rti_info[RTAX_GATEWAY] && (info.rti_info[RTAX_GATEWAY]->sa_family == AF_LINK) && (rtm->rtm_flags & RTF_LLDATA) != 0) { error = lla_rt_output(rtm, &info); break; } error = rtrequest1_fib(RTM_DELETE, &info, &saved_nrt, fibnum); if (error == 0) { RT_LOCK(saved_nrt); rt = saved_nrt; goto report; } break; case RTM_GET: #ifdef INET6 if (info.rti_info[RTAX_DST]->sa_family == AF_INET6) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)info.rti_info[RTAX_DST]; if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) { error = in6_rt_handle_lla(&rtm, &info); break; } } #endif rnh = rt_tables_get_rnh(fibnum, saf); if (rnh == NULL) senderr(EAFNOSUPPORT); RADIX_NODE_HEAD_RLOCK(rnh); if (info.rti_info[RTAX_NETMASK] == NULL && rtm->rtm_type == RTM_GET) { /* * Provide logest prefix match for * address lookup (no mask). * 'route -n get addr' */ rt = (struct rtentry *) rnh->rnh_matchaddr( info.rti_info[RTAX_DST], rnh); } else rt = (struct rtentry *) rnh->rnh_lookup( info.rti_info[RTAX_DST], info.rti_info[RTAX_NETMASK], rnh); if (rt == NULL) { RADIX_NODE_HEAD_RUNLOCK(rnh); senderr(ESRCH); } #ifdef RADIX_MPATH /* * for RTM_CHANGE/LOCK, if we got multipath routes, * we require users to specify a matching RTAX_GATEWAY. * * for RTM_GET, gate is optional even with multipath. * if gate == NULL the first match is returned. * (no need to call rt_mpath_matchgate if gate == NULL) */ if (rn_mpath_capable(rnh) && (rtm->rtm_type != RTM_GET || info.rti_info[RTAX_GATEWAY])) { rt = rt_mpath_matchgate(rt, info.rti_info[RTAX_GATEWAY]); if (!rt) { RADIX_NODE_HEAD_RUNLOCK(rnh); senderr(ESRCH); } } #endif /* * If performing proxied L2 entry insertion, and * the actual PPP host entry is found, perform * another search to retrieve the prefix route of * the local end point of the PPP link. */ if (rtm->rtm_flags & RTF_ANNOUNCE) { struct sockaddr laddr; if (rt->rt_ifp != NULL && rt->rt_ifp->if_type == IFT_PROPVIRTUAL) { struct ifaddr *ifa; - ifa = ifa_ifwithnet(info.rti_info[RTAX_DST], 1); + ifa = ifa_ifwithnet(info.rti_info[RTAX_DST], 1, + RT_ALL_FIBS); if (ifa != NULL) rt_maskedcopy(ifa->ifa_addr, &laddr, ifa->ifa_netmask); } else rt_maskedcopy(rt->rt_ifa->ifa_addr, &laddr, rt->rt_ifa->ifa_netmask); /* * refactor rt and no lock operation necessary */ rt = (struct rtentry *)rnh->rnh_matchaddr(&laddr, rnh); if (rt == NULL) { RADIX_NODE_HEAD_RUNLOCK(rnh); senderr(ESRCH); } } RT_LOCK(rt); RT_ADDREF(rt); RADIX_NODE_HEAD_RUNLOCK(rnh); report: RT_LOCK_ASSERT(rt); if ((rt->rt_flags & RTF_HOST) == 0 ? jailed_without_vnet(curthread->td_ucred) : prison_if(curthread->td_ucred, rt_key(rt)) != 0) { RT_UNLOCK(rt); senderr(ESRCH); } info.rti_info[RTAX_DST] = rt_key(rt); info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(rt_key(rt), rt_mask(rt), &ss); info.rti_info[RTAX_GENMASK] = 0; if (rtm->rtm_addrs & (RTA_IFP | RTA_IFA)) { ifp = rt->rt_ifp; if (ifp) { info.rti_info[RTAX_IFP] = ifp->if_addr->ifa_addr; error = rtm_get_jailed(&info, ifp, rt, &saun, curthread->td_ucred); if (error != 0) { RT_UNLOCK(rt); senderr(error); } if (ifp->if_flags & IFF_POINTOPOINT) info.rti_info[RTAX_BRD] = rt->rt_ifa->ifa_dstaddr; rtm->rtm_index = ifp->if_index; } else { info.rti_info[RTAX_IFP] = NULL; info.rti_info[RTAX_IFA] = NULL; } } else if ((ifp = rt->rt_ifp) != NULL) { rtm->rtm_index = ifp->if_index; } /* Check if we need to realloc storage */ rtsock_msg_buffer(rtm->rtm_type, &info, NULL, &len); if (len > alloc_len) { struct rt_msghdr *new_rtm; new_rtm = malloc(len, M_TEMP, M_NOWAIT); if (new_rtm == NULL) { RT_UNLOCK(rt); senderr(ENOBUFS); } bcopy(rtm, new_rtm, rtm->rtm_msglen); free(rtm, M_TEMP); rtm = new_rtm; alloc_len = len; } w.w_tmem = (caddr_t)rtm; w.w_tmemsize = alloc_len; rtsock_msg_buffer(rtm->rtm_type, &info, &w, &len); if (rt->rt_flags & RTF_GWFLAG_COMPAT) rtm->rtm_flags = RTF_GATEWAY | (rt->rt_flags & ~RTF_GWFLAG_COMPAT); else rtm->rtm_flags = rt->rt_flags; rt_getmetrics(rt, &rtm->rtm_rmx); rtm->rtm_addrs = info.rti_addrs; RT_UNLOCK(rt); break; default: senderr(EOPNOTSUPP); } flush: if (rt != NULL) RTFREE(rt); /* * Check to see if we don't want our own messages. */ if ((so->so_options & SO_USELOOPBACK) == 0) { if (V_route_cb.any_count <= 1) { if (rtm != NULL) free(rtm, M_TEMP); m_freem(m); return (error); } /* There is another listener, so construct message */ rp = sotorawcb(so); } if (rtm != NULL) { if (error != 0) rtm->rtm_errno = error; else rtm->rtm_flags |= RTF_DONE; m_copyback(m, 0, rtm->rtm_msglen, (caddr_t)rtm); if (m->m_pkthdr.len < rtm->rtm_msglen) { m_freem(m); m = NULL; } else if (m->m_pkthdr.len > rtm->rtm_msglen) m_adj(m, rtm->rtm_msglen - m->m_pkthdr.len); free(rtm, M_TEMP); } if (m != NULL) { M_SETFIB(m, fibnum); m->m_flags |= RTS_FILTER_FIB; if (rp) { /* * XXX insure we don't get a copy by * invalidating our protocol */ unsigned short family = rp->rcb_proto.sp_family; rp->rcb_proto.sp_family = 0; rt_dispatch(m, saf); rp->rcb_proto.sp_family = family; } else rt_dispatch(m, saf); } return (error); } static void rt_getmetrics(const struct rtentry *rt, struct rt_metrics *out) { bzero(out, sizeof(*out)); out->rmx_mtu = rt->rt_mtu; out->rmx_weight = rt->rt_weight; out->rmx_pksent = counter_u64_fetch(rt->rt_pksent); /* Kernel -> userland timebase conversion. */ out->rmx_expire = rt->rt_expire ? rt->rt_expire - time_uptime + time_second : 0; } /* * Extract the addresses of the passed sockaddrs. * Do a little sanity checking so as to avoid bad memory references. * This data is derived straight from userland. */ static int rt_xaddrs(caddr_t cp, caddr_t cplim, struct rt_addrinfo *rtinfo) { struct sockaddr *sa; int i; for (i = 0; i < RTAX_MAX && cp < cplim; i++) { if ((rtinfo->rti_addrs & (1 << i)) == 0) continue; sa = (struct sockaddr *)cp; /* * It won't fit. */ if (cp + sa->sa_len > cplim) return (EINVAL); /* * there are no more.. quit now * If there are more bits, they are in error. * I've seen this. route(1) can evidently generate these. * This causes kernel to core dump. * for compatibility, If we see this, point to a safe address. */ if (sa->sa_len == 0) { rtinfo->rti_info[i] = &sa_zero; return (0); /* should be EINVAL but for compat */ } /* accept it */ #ifdef INET6 /* * XXX: some software use embedded scope ids. * We remove id from address and initialize sin6_scope_id * instead. */ if (sa->sa_family == AF_INET6) sa6_recoverscope((struct sockaddr_in6 *)sa); #endif rtinfo->rti_info[i] = sa; cp += SA_SIZE(sa); } return (0); } /* * Fill in @dmask with valid netmask leaving original @smask * intact. Mostly used with radix netmasks. */ static struct sockaddr * rtsock_fix_netmask(struct sockaddr *dst, struct sockaddr *smask, struct sockaddr_storage *dmask) { if (dst == NULL || smask == NULL) return (NULL); memset(dmask, 0, dst->sa_len); memcpy(dmask, smask, smask->sa_len); dmask->ss_len = dst->sa_len; dmask->ss_family = dst->sa_family; return ((struct sockaddr *)dmask); } /* * Writes information related to @rtinfo object to newly-allocated mbuf. * Assumes MCLBYTES is enough to construct any message. * Used for OS notifications of vaious events (if/ifa announces,etc) * * Returns allocated mbuf or NULL on failure. */ static struct mbuf * rtsock_msg_mbuf(int type, struct rt_addrinfo *rtinfo) { struct rt_msghdr *rtm; struct mbuf *m; struct sockaddr *sa; int len, i; switch (type) { case RTM_DELADDR: case RTM_NEWADDR: len = sizeof(struct ifa_msghdr); break; case RTM_DELMADDR: case RTM_NEWMADDR: len = sizeof(struct ifma_msghdr); break; case RTM_IFINFO: len = sizeof(struct if_msghdr); break; case RTM_IFANNOUNCE: case RTM_IEEE80211: len = sizeof(struct if_announcemsghdr); break; default: len = sizeof(struct rt_msghdr); } /* XXXGL: can we use MJUMPAGESIZE cluster here? */ KASSERT(len <= MCLBYTES, ("%s: message too big", __func__)); if (len > MHLEN) m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); else m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (m); m->m_pkthdr.len = m->m_len = len; rtm = mtod(m, struct rt_msghdr *); bzero((caddr_t)rtm, len); for (i = 0; i < RTAX_MAX; i++) { if ((sa = rtinfo->rti_info[i]) == NULL) continue; rtinfo->rti_addrs |= (1 << i); m_copyback(m, len, SA_SIZE(sa), (caddr_t)sa); len += SA_SIZE(sa); } if (m->m_pkthdr.len != len) { m_freem(m); return (NULL); } rtm->rtm_msglen = len; rtm->rtm_version = RTM_VERSION; rtm->rtm_type = type; return (m); } /* * Writes information related to @rtinfo object to preallocated buffer. * Stores needed size in @plen. If @w is NULL, calculates size without * writing. * Used for sysctl dumps and rtsock answers (RTM_DEL/RTM_GET) generation. * * Returns 0 on success. * */ static int rtsock_msg_buffer(int type, struct rt_addrinfo *rtinfo, struct walkarg *w, int *plen) { caddr_t cp = NULL; struct rt_msghdr *rtm = NULL; int len, i, second_time = 0; switch (type) { case RTM_DELADDR: case RTM_NEWADDR: if (w != NULL && w->w_op == NET_RT_IFLISTL) { #ifdef COMPAT_FREEBSD32 if (w->w_req->flags & SCTL_MASK32) len = sizeof(struct ifa_msghdrl32); else #endif len = sizeof(struct ifa_msghdrl); } else len = sizeof(struct ifa_msghdr); break; case RTM_IFINFO: #ifdef COMPAT_FREEBSD32 if (w != NULL && w->w_req->flags & SCTL_MASK32) { if (w->w_op == NET_RT_IFLISTL) len = sizeof(struct if_msghdrl32); else len = sizeof(struct if_msghdr32); break; } #endif if (w != NULL && w->w_op == NET_RT_IFLISTL) len = sizeof(struct if_msghdrl); else len = sizeof(struct if_msghdr); break; case RTM_NEWMADDR: len = sizeof(struct ifma_msghdr); break; default: len = sizeof(struct rt_msghdr); } if (w != NULL) { rtm = (struct rt_msghdr *)w->w_tmem; buflen = w->w_tmemsize - len; cp = (caddr_t)w->w_tmem + len; } rtinfo->rti_addrs = 0; for (i = 0; i < RTAX_MAX; i++) { struct sockaddr *sa; if ((sa = rtinfo->rti_info[i]) == NULL) continue; rtinfo->rti_addrs |= (1 << i); if (cp != NULL && buflen >= SA_SIZE(sa)) { bcopy((caddr_t)sa, cp, SA_SIZE(sa)); cp += SA_SIZE(sa); buflen -= SA_SIZE(sa); } else if (cp != NULL) { /* * Buffer too small. Count needed size * and return with error. */ cp = NULL; } len += SA_SIZE(sa); } if (cp != NULL) { if (buflen < ALIGN(len) - len) cp = NULL; else buflen -= ALIGN(len) - len; } len = ALIGN(len); if (cp != NULL) { /* fill header iff buffer is large enough */ rtm->rtm_version = RTM_VERSION; rtm->rtm_type = type; rtm->rtm_msglen = len; } *plen = len; if (w != NULL && cp == NULL) return (ENOBUFS); return (0); } /* * This routine is called to generate a message from the routing * socket indicating that a redirect has occured, a routing lookup * has failed, or that a protocol has detected timeouts to a particular * destination. */ void rt_missmsg_fib(int type, struct rt_addrinfo *rtinfo, int flags, int error, int fibnum) { struct rt_msghdr *rtm; struct mbuf *m; struct sockaddr *sa = rtinfo->rti_info[RTAX_DST]; if (V_route_cb.any_count == 0) return; m = rtsock_msg_mbuf(type, rtinfo); if (m == NULL) return; if (fibnum != RT_ALL_FIBS) { KASSERT(fibnum >= 0 && fibnum < rt_numfibs, ("%s: fibnum out " "of range 0 <= %d < %d", __func__, fibnum, rt_numfibs)); M_SETFIB(m, fibnum); m->m_flags |= RTS_FILTER_FIB; } rtm = mtod(m, struct rt_msghdr *); rtm->rtm_flags = RTF_DONE | flags; rtm->rtm_errno = error; rtm->rtm_addrs = rtinfo->rti_addrs; rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC); } void rt_missmsg(int type, struct rt_addrinfo *rtinfo, int flags, int error) { rt_missmsg_fib(type, rtinfo, flags, error, RT_ALL_FIBS); } /* * This routine is called to generate a message from the routing * socket indicating that the status of a network interface has changed. */ void rt_ifmsg(struct ifnet *ifp) { struct if_msghdr *ifm; struct mbuf *m; struct rt_addrinfo info; if (V_route_cb.any_count == 0) return; bzero((caddr_t)&info, sizeof(info)); m = rtsock_msg_mbuf(RTM_IFINFO, &info); if (m == NULL) return; ifm = mtod(m, struct if_msghdr *); ifm->ifm_index = ifp->if_index; ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags; if_data_copy(ifp, &ifm->ifm_data); ifm->ifm_addrs = 0; rt_dispatch(m, AF_UNSPEC); } /* * Announce interface address arrival/withdraw. * Please do not call directly, use rt_addrmsg(). * Assume input data to be valid. * Returns 0 on success. */ int rtsock_addrmsg(int cmd, struct ifaddr *ifa, int fibnum) { struct rt_addrinfo info; struct sockaddr *sa; int ncmd; struct mbuf *m; struct ifa_msghdr *ifam; struct ifnet *ifp = ifa->ifa_ifp; struct sockaddr_storage ss; if (V_route_cb.any_count == 0) return (0); ncmd = cmd == RTM_ADD ? RTM_NEWADDR : RTM_DELADDR; bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_IFA] = sa = ifa->ifa_addr; info.rti_info[RTAX_IFP] = ifp->if_addr->ifa_addr; info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask( info.rti_info[RTAX_IFP], ifa->ifa_netmask, &ss); info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr; if ((m = rtsock_msg_mbuf(ncmd, &info)) == NULL) return (ENOBUFS); ifam = mtod(m, struct ifa_msghdr *); ifam->ifam_index = ifp->if_index; ifam->ifam_metric = ifa->ifa_ifp->if_metric; ifam->ifam_flags = ifa->ifa_flags; ifam->ifam_addrs = info.rti_addrs; if (fibnum != RT_ALL_FIBS) { M_SETFIB(m, fibnum); m->m_flags |= RTS_FILTER_FIB; } rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC); return (0); } /* * Announce route addition/removal. * Please do not call directly, use rt_routemsg(). * Note that @rt data MAY be inconsistent/invalid: * if some userland app sends us "invalid" route message (invalid mask, * no dst, wrong address families, etc...) we need to pass it back * to app (and any other rtsock consumers) with rtm_errno field set to * non-zero value. * * Returns 0 on success. */ int rtsock_routemsg(int cmd, struct ifnet *ifp, int error, struct rtentry *rt, int fibnum) { struct rt_addrinfo info; struct sockaddr *sa; struct mbuf *m; struct rt_msghdr *rtm; struct sockaddr_storage ss; if (V_route_cb.any_count == 0) return (0); bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = sa = rt_key(rt); info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(sa, rt_mask(rt), &ss); info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; if ((m = rtsock_msg_mbuf(cmd, &info)) == NULL) return (ENOBUFS); rtm = mtod(m, struct rt_msghdr *); rtm->rtm_index = ifp->if_index; rtm->rtm_flags |= rt->rt_flags; rtm->rtm_errno = error; rtm->rtm_addrs = info.rti_addrs; if (fibnum != RT_ALL_FIBS) { M_SETFIB(m, fibnum); m->m_flags |= RTS_FILTER_FIB; } rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC); return (0); } /* * This is the analogue to the rt_newaddrmsg which performs the same * function but for multicast group memberhips. This is easier since * there is no route state to worry about. */ void rt_newmaddrmsg(int cmd, struct ifmultiaddr *ifma) { struct rt_addrinfo info; struct mbuf *m = NULL; struct ifnet *ifp = ifma->ifma_ifp; struct ifma_msghdr *ifmam; if (V_route_cb.any_count == 0) return; bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_IFA] = ifma->ifma_addr; info.rti_info[RTAX_IFP] = ifp ? ifp->if_addr->ifa_addr : NULL; /* * If a link-layer address is present, present it as a ``gateway'' * (similarly to how ARP entries, e.g., are presented). */ info.rti_info[RTAX_GATEWAY] = ifma->ifma_lladdr; m = rtsock_msg_mbuf(cmd, &info); if (m == NULL) return; ifmam = mtod(m, struct ifma_msghdr *); KASSERT(ifp != NULL, ("%s: link-layer multicast address w/o ifp\n", __func__)); ifmam->ifmam_index = ifp->if_index; ifmam->ifmam_addrs = info.rti_addrs; rt_dispatch(m, ifma->ifma_addr ? ifma->ifma_addr->sa_family : AF_UNSPEC); } static struct mbuf * rt_makeifannouncemsg(struct ifnet *ifp, int type, int what, struct rt_addrinfo *info) { struct if_announcemsghdr *ifan; struct mbuf *m; if (V_route_cb.any_count == 0) return NULL; bzero((caddr_t)info, sizeof(*info)); m = rtsock_msg_mbuf(type, info); if (m != NULL) { ifan = mtod(m, struct if_announcemsghdr *); ifan->ifan_index = ifp->if_index; strlcpy(ifan->ifan_name, ifp->if_xname, sizeof(ifan->ifan_name)); ifan->ifan_what = what; } return m; } /* * This is called to generate routing socket messages indicating * IEEE80211 wireless events. * XXX we piggyback on the RTM_IFANNOUNCE msg format in a clumsy way. */ void rt_ieee80211msg(struct ifnet *ifp, int what, void *data, size_t data_len) { struct mbuf *m; struct rt_addrinfo info; m = rt_makeifannouncemsg(ifp, RTM_IEEE80211, what, &info); if (m != NULL) { /* * Append the ieee80211 data. Try to stick it in the * mbuf containing the ifannounce msg; otherwise allocate * a new mbuf and append. * * NB: we assume m is a single mbuf. */ if (data_len > M_TRAILINGSPACE(m)) { struct mbuf *n = m_get(M_NOWAIT, MT_DATA); if (n == NULL) { m_freem(m); return; } bcopy(data, mtod(n, void *), data_len); n->m_len = data_len; m->m_next = n; } else if (data_len > 0) { bcopy(data, mtod(m, u_int8_t *) + m->m_len, data_len); m->m_len += data_len; } if (m->m_flags & M_PKTHDR) m->m_pkthdr.len += data_len; mtod(m, struct if_announcemsghdr *)->ifan_msglen += data_len; rt_dispatch(m, AF_UNSPEC); } } /* * This is called to generate routing socket messages indicating * network interface arrival and departure. */ void rt_ifannouncemsg(struct ifnet *ifp, int what) { struct mbuf *m; struct rt_addrinfo info; m = rt_makeifannouncemsg(ifp, RTM_IFANNOUNCE, what, &info); if (m != NULL) rt_dispatch(m, AF_UNSPEC); } static void rt_dispatch(struct mbuf *m, sa_family_t saf) { struct m_tag *tag; /* * Preserve the family from the sockaddr, if any, in an m_tag for * use when injecting the mbuf into the routing socket buffer from * the netisr. */ if (saf != AF_UNSPEC) { tag = m_tag_get(PACKET_TAG_RTSOCKFAM, sizeof(unsigned short), M_NOWAIT); if (tag == NULL) { m_freem(m); return; } *(unsigned short *)(tag + 1) = saf; m_tag_prepend(m, tag); } #ifdef VIMAGE if (V_loif) m->m_pkthdr.rcvif = V_loif; else { m_freem(m); return; } #endif netisr_queue(NETISR_ROUTE, m); /* mbuf is free'd on failure. */ } /* * This is used in dumping the kernel table via sysctl(). */ static int sysctl_dumpentry(struct radix_node *rn, void *vw) { struct walkarg *w = vw; struct rtentry *rt = (struct rtentry *)rn; int error = 0, size; struct rt_addrinfo info; struct sockaddr_storage ss; if (w->w_op == NET_RT_FLAGS && !(rt->rt_flags & w->w_arg)) return 0; if ((rt->rt_flags & RTF_HOST) == 0 ? jailed_without_vnet(w->w_req->td->td_ucred) : prison_if(w->w_req->td->td_ucred, rt_key(rt)) != 0) return (0); bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = rt_key(rt); info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(rt_key(rt), rt_mask(rt), &ss); info.rti_info[RTAX_GENMASK] = 0; if (rt->rt_ifp) { info.rti_info[RTAX_IFP] = rt->rt_ifp->if_addr->ifa_addr; info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr; if (rt->rt_ifp->if_flags & IFF_POINTOPOINT) info.rti_info[RTAX_BRD] = rt->rt_ifa->ifa_dstaddr; } if ((error = rtsock_msg_buffer(RTM_GET, &info, w, &size)) != 0) return (error); if (w->w_req && w->w_tmem) { struct rt_msghdr *rtm = (struct rt_msghdr *)w->w_tmem; if (rt->rt_flags & RTF_GWFLAG_COMPAT) rtm->rtm_flags = RTF_GATEWAY | (rt->rt_flags & ~RTF_GWFLAG_COMPAT); else rtm->rtm_flags = rt->rt_flags; rt_getmetrics(rt, &rtm->rtm_rmx); rtm->rtm_index = rt->rt_ifp->if_index; rtm->rtm_errno = rtm->rtm_pid = rtm->rtm_seq = 0; rtm->rtm_addrs = info.rti_addrs; error = SYSCTL_OUT(w->w_req, (caddr_t)rtm, size); return (error); } return (error); } static int sysctl_iflist_ifml(struct ifnet *ifp, struct rt_addrinfo *info, struct walkarg *w, int len) { struct if_msghdrl *ifm; struct if_data *ifd; ifm = (struct if_msghdrl *)w->w_tmem; #ifdef COMPAT_FREEBSD32 if (w->w_req->flags & SCTL_MASK32) { struct if_msghdrl32 *ifm32; ifm32 = (struct if_msghdrl32 *)ifm; ifm32->ifm_addrs = info->rti_addrs; ifm32->ifm_flags = ifp->if_flags | ifp->if_drv_flags; ifm32->ifm_index = ifp->if_index; ifm32->_ifm_spare1 = 0; ifm32->ifm_len = sizeof(*ifm32); ifm32->ifm_data_off = offsetof(struct if_msghdrl32, ifm_data); ifd = &ifm32->ifm_data; } else #endif { ifm->ifm_addrs = info->rti_addrs; ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags; ifm->ifm_index = ifp->if_index; ifm->_ifm_spare1 = 0; ifm->ifm_len = sizeof(*ifm); ifm->ifm_data_off = offsetof(struct if_msghdrl, ifm_data); ifd = &ifm->ifm_data; } if_data_copy(ifp, ifd); /* Some drivers still use ifqueue(9), add its stats. */ ifd->ifi_oqdrops += ifp->if_snd.ifq_drops; return (SYSCTL_OUT(w->w_req, (caddr_t)ifm, len)); } static int sysctl_iflist_ifm(struct ifnet *ifp, struct rt_addrinfo *info, struct walkarg *w, int len) { struct if_msghdr *ifm; struct if_data *ifd; ifm = (struct if_msghdr *)w->w_tmem; #ifdef COMPAT_FREEBSD32 if (w->w_req->flags & SCTL_MASK32) { struct if_msghdr32 *ifm32; ifm32 = (struct if_msghdr32 *)ifm; ifm32->ifm_addrs = info->rti_addrs; ifm32->ifm_flags = ifp->if_flags | ifp->if_drv_flags; ifm32->ifm_index = ifp->if_index; ifd = &ifm32->ifm_data; } else #endif { ifm->ifm_addrs = info->rti_addrs; ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags; ifm->ifm_index = ifp->if_index; ifd = &ifm->ifm_data; } if_data_copy(ifp, ifd); /* Some drivers still use ifqueue(9), add its stats. */ ifd->ifi_oqdrops += ifp->if_snd.ifq_drops; return (SYSCTL_OUT(w->w_req, (caddr_t)ifm, len)); } static int sysctl_iflist_ifaml(struct ifaddr *ifa, struct rt_addrinfo *info, struct walkarg *w, int len) { struct ifa_msghdrl *ifam; struct if_data *ifd; ifam = (struct ifa_msghdrl *)w->w_tmem; #ifdef COMPAT_FREEBSD32 if (w->w_req->flags & SCTL_MASK32) { struct ifa_msghdrl32 *ifam32; ifam32 = (struct ifa_msghdrl32 *)ifam; ifam32->ifam_addrs = info->rti_addrs; ifam32->ifam_flags = ifa->ifa_flags; ifam32->ifam_index = ifa->ifa_ifp->if_index; ifam32->_ifam_spare1 = 0; ifam32->ifam_len = sizeof(*ifam32); ifam32->ifam_data_off = offsetof(struct ifa_msghdrl32, ifam_data); ifam32->ifam_metric = ifa->ifa_ifp->if_metric; ifd = &ifam32->ifam_data; } else #endif { ifam->ifam_addrs = info->rti_addrs; ifam->ifam_flags = ifa->ifa_flags; ifam->ifam_index = ifa->ifa_ifp->if_index; ifam->_ifam_spare1 = 0; ifam->ifam_len = sizeof(*ifam); ifam->ifam_data_off = offsetof(struct ifa_msghdrl, ifam_data); ifam->ifam_metric = ifa->ifa_ifp->if_metric; ifd = &ifam->ifam_data; } bzero(ifd, sizeof(*ifd)); ifd->ifi_datalen = sizeof(struct if_data); ifd->ifi_ipackets = counter_u64_fetch(ifa->ifa_ipackets); ifd->ifi_opackets = counter_u64_fetch(ifa->ifa_opackets); ifd->ifi_ibytes = counter_u64_fetch(ifa->ifa_ibytes); ifd->ifi_obytes = counter_u64_fetch(ifa->ifa_obytes); /* Fixup if_data carp(4) vhid. */ if (carp_get_vhid_p != NULL) ifd->ifi_vhid = (*carp_get_vhid_p)(ifa); return (SYSCTL_OUT(w->w_req, w->w_tmem, len)); } static int sysctl_iflist_ifam(struct ifaddr *ifa, struct rt_addrinfo *info, struct walkarg *w, int len) { struct ifa_msghdr *ifam; ifam = (struct ifa_msghdr *)w->w_tmem; ifam->ifam_addrs = info->rti_addrs; ifam->ifam_flags = ifa->ifa_flags; ifam->ifam_index = ifa->ifa_ifp->if_index; ifam->ifam_metric = ifa->ifa_ifp->if_metric; return (SYSCTL_OUT(w->w_req, w->w_tmem, len)); } static int sysctl_iflist(int af, struct walkarg *w) { struct ifnet *ifp; struct ifaddr *ifa; struct rt_addrinfo info; int len, error = 0; struct sockaddr_storage ss; bzero((caddr_t)&info, sizeof(info)); IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (w->w_arg && w->w_arg != ifp->if_index) continue; IF_ADDR_RLOCK(ifp); ifa = ifp->if_addr; info.rti_info[RTAX_IFP] = ifa->ifa_addr; error = rtsock_msg_buffer(RTM_IFINFO, &info, w, &len); if (error != 0) goto done; info.rti_info[RTAX_IFP] = NULL; if (w->w_req && w->w_tmem) { if (w->w_op == NET_RT_IFLISTL) error = sysctl_iflist_ifml(ifp, &info, w, len); else error = sysctl_iflist_ifm(ifp, &info, w, len); if (error) goto done; } while ((ifa = TAILQ_NEXT(ifa, ifa_link)) != NULL) { if (af && af != ifa->ifa_addr->sa_family) continue; if (prison_if(w->w_req->td->td_ucred, ifa->ifa_addr) != 0) continue; info.rti_info[RTAX_IFA] = ifa->ifa_addr; info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask( ifa->ifa_addr, ifa->ifa_netmask, &ss); info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr; error = rtsock_msg_buffer(RTM_NEWADDR, &info, w, &len); if (error != 0) goto done; if (w->w_req && w->w_tmem) { if (w->w_op == NET_RT_IFLISTL) error = sysctl_iflist_ifaml(ifa, &info, w, len); else error = sysctl_iflist_ifam(ifa, &info, w, len); if (error) goto done; } } IF_ADDR_RUNLOCK(ifp); info.rti_info[RTAX_IFA] = NULL; info.rti_info[RTAX_NETMASK] = NULL; info.rti_info[RTAX_BRD] = NULL; } done: if (ifp != NULL) IF_ADDR_RUNLOCK(ifp); IFNET_RUNLOCK_NOSLEEP(); return (error); } static int sysctl_ifmalist(int af, struct walkarg *w) { struct ifnet *ifp; struct ifmultiaddr *ifma; struct rt_addrinfo info; int len, error = 0; struct ifaddr *ifa; bzero((caddr_t)&info, sizeof(info)); IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (w->w_arg && w->w_arg != ifp->if_index) continue; ifa = ifp->if_addr; info.rti_info[RTAX_IFP] = ifa ? ifa->ifa_addr : NULL; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (af && af != ifma->ifma_addr->sa_family) continue; if (prison_if(w->w_req->td->td_ucred, ifma->ifma_addr) != 0) continue; info.rti_info[RTAX_IFA] = ifma->ifma_addr; info.rti_info[RTAX_GATEWAY] = (ifma->ifma_addr->sa_family != AF_LINK) ? ifma->ifma_lladdr : NULL; error = rtsock_msg_buffer(RTM_NEWMADDR, &info, w, &len); if (error != 0) goto done; if (w->w_req && w->w_tmem) { struct ifma_msghdr *ifmam; ifmam = (struct ifma_msghdr *)w->w_tmem; ifmam->ifmam_index = ifma->ifma_ifp->if_index; ifmam->ifmam_flags = 0; ifmam->ifmam_addrs = info.rti_addrs; error = SYSCTL_OUT(w->w_req, w->w_tmem, len); if (error) { IF_ADDR_RUNLOCK(ifp); goto done; } } } IF_ADDR_RUNLOCK(ifp); } done: IFNET_RUNLOCK_NOSLEEP(); return (error); } static int sysctl_rtsock(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; struct radix_node_head *rnh = NULL; /* silence compiler. */ int i, lim, error = EINVAL; int fib = 0; u_char af; struct walkarg w; name ++; namelen--; if (req->newptr) return (EPERM); if (name[1] == NET_RT_DUMP) { if (namelen == 3) fib = req->td->td_proc->p_fibnum; else if (namelen == 4) fib = (name[3] == RT_ALL_FIBS) ? req->td->td_proc->p_fibnum : name[3]; else return ((namelen < 3) ? EISDIR : ENOTDIR); if (fib < 0 || fib >= rt_numfibs) return (EINVAL); } else if (namelen != 3) return ((namelen < 3) ? EISDIR : ENOTDIR); af = name[0]; if (af > AF_MAX) return (EINVAL); bzero(&w, sizeof(w)); w.w_op = name[1]; w.w_arg = name[2]; w.w_req = req; error = sysctl_wire_old_buffer(req, 0); if (error) return (error); /* * Allocate reply buffer in advance. * All rtsock messages has maximum length of u_short. */ w.w_tmemsize = 65536; w.w_tmem = malloc(w.w_tmemsize, M_TEMP, M_WAITOK); switch (w.w_op) { case NET_RT_DUMP: case NET_RT_FLAGS: if (af == 0) { /* dump all tables */ i = 1; lim = AF_MAX; } else /* dump only one table */ i = lim = af; /* * take care of llinfo entries, the caller must * specify an AF */ if (w.w_op == NET_RT_FLAGS && (w.w_arg == 0 || w.w_arg & RTF_LLINFO)) { if (af != 0) error = lltable_sysctl_dumparp(af, w.w_req); else error = EINVAL; break; } /* * take care of routing entries */ for (error = 0; error == 0 && i <= lim; i++) { rnh = rt_tables_get_rnh(fib, i); if (rnh != NULL) { RADIX_NODE_HEAD_RLOCK(rnh); error = rnh->rnh_walktree(rnh, sysctl_dumpentry, &w); RADIX_NODE_HEAD_RUNLOCK(rnh); } else if (af != 0) error = EAFNOSUPPORT; } break; case NET_RT_IFLIST: case NET_RT_IFLISTL: error = sysctl_iflist(af, &w); break; case NET_RT_IFMALIST: error = sysctl_ifmalist(af, &w); break; } free(w.w_tmem, M_TEMP); return (error); } static SYSCTL_NODE(_net, PF_ROUTE, routetable, CTLFLAG_RD, sysctl_rtsock, ""); /* * Definitions of protocols supported in the ROUTE domain. */ static struct domain routedomain; /* or at least forward */ static struct protosw routesw[] = { { .pr_type = SOCK_RAW, .pr_domain = &routedomain, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_output = route_output, .pr_ctlinput = raw_ctlinput, .pr_init = raw_init, .pr_usrreqs = &route_usrreqs } }; static struct domain routedomain = { .dom_family = PF_ROUTE, .dom_name = "route", .dom_protosw = routesw, .dom_protoswNPROTOSW = &routesw[sizeof(routesw)/sizeof(routesw[0])] }; VNET_DOMAIN_SET(route); Index: user/ae/inet6/sys/netinet/in_pcb.c =================================================================== --- user/ae/inet6/sys/netinet/in_pcb.c (revision 271452) +++ user/ae/inet6/sys/netinet/in_pcb.c (revision 271453) @@ -1,2630 +1,2633 @@ /*- * Copyright (c) 1982, 1986, 1991, 1993, 1995 * The Regents of the University of California. * Copyright (c) 2007-2009 Robert N. M. Watson * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_pcb.c 8.4 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_ipsec.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_pcbgroup.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #include #include #include #include #endif #ifdef INET #include #endif #ifdef INET6 #include #include #include #include #endif /* INET6 */ #ifdef IPSEC #include #include #endif /* IPSEC */ #include static struct callout ipport_tick_callout; /* * These configure the range of local port addresses assigned to * "unspecified" outgoing connections/packets/whatever. */ VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1; /* 1023 */ VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART; /* 600 */ VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST; /* 10000 */ VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST; /* 65535 */ VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO; /* 49152 */ VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO; /* 65535 */ /* * Reserved ports accessible only to root. There are significant * security considerations that must be accounted for when changing these, * but the security benefits can be great. Please be careful. */ VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1; /* 1023 */ VNET_DEFINE(int, ipport_reservedlow); /* Variables dealing with random ephemeral port allocation. */ VNET_DEFINE(int, ipport_randomized) = 1; /* user controlled via sysctl */ VNET_DEFINE(int, ipport_randomcps) = 10; /* user controlled via sysctl */ VNET_DEFINE(int, ipport_randomtime) = 45; /* user controlled via sysctl */ VNET_DEFINE(int, ipport_stoprandom); /* toggled by ipport_tick */ VNET_DEFINE(int, ipport_tcpallocs); static VNET_DEFINE(int, ipport_tcplastcount); #define V_ipport_tcplastcount VNET(ipport_tcplastcount) static void in_pcbremlists(struct inpcb *inp); #ifdef INET static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, struct ifnet *ifp); #define RANGECHK(var, min, max) \ if ((var) < (min)) { (var) = (min); } \ else if ((var) > (max)) { (var) = (max); } static int sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS) { int error; error = sysctl_handle_int(oidp, arg1, arg2, req); if (error == 0) { RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1); RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1); RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX); } return (error); } #undef RANGECHK static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports"); SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, first, CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, last, CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh, CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedhigh), 0, ""); SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow, CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, ""); SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, randomized, CTLFLAG_RW, &VNET_NAME(ipport_randomized), 0, "Enable random port allocation"); SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, randomcps, CTLFLAG_RW, &VNET_NAME(ipport_randomcps), 0, "Maximum number of random port " "allocations before switching to a sequental one"); SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, randomtime, CTLFLAG_RW, &VNET_NAME(ipport_randomtime), 0, "Minimum time to keep sequental port " "allocation before switching to a random one"); #endif /* INET */ /* * in_pcb.c: manage the Protocol Control Blocks. * * NOTE: It is assumed that most of these functions will be called with * the pcbinfo lock held, and often, the inpcb lock held, as these utility * functions often modify hash chains or addresses in pcbs. */ /* * Initialize an inpcbinfo -- we should be able to reduce the number of * arguments in time. */ void in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name, struct inpcbhead *listhead, int hash_nelements, int porthash_nelements, char *inpcbzone_name, uma_init inpcbzone_init, uma_fini inpcbzone_fini, uint32_t inpcbzone_flags, u_int hashfields) { INP_INFO_LOCK_INIT(pcbinfo, name); INP_HASH_LOCK_INIT(pcbinfo, "pcbinfohash"); /* XXXRW: argument? */ #ifdef VIMAGE pcbinfo->ipi_vnet = curvnet; #endif pcbinfo->ipi_listhead = listhead; LIST_INIT(pcbinfo->ipi_listhead); pcbinfo->ipi_count = 0; pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB, &pcbinfo->ipi_hashmask); pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB, &pcbinfo->ipi_porthashmask); #ifdef PCBGROUP in_pcbgroup_init(pcbinfo, hashfields, hash_nelements); #endif pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb), NULL, NULL, inpcbzone_init, inpcbzone_fini, UMA_ALIGN_PTR, inpcbzone_flags); uma_zone_set_max(pcbinfo->ipi_zone, maxsockets); uma_zone_set_warning(pcbinfo->ipi_zone, "kern.ipc.maxsockets limit reached"); } /* * Destroy an inpcbinfo. */ void in_pcbinfo_destroy(struct inpcbinfo *pcbinfo) { KASSERT(pcbinfo->ipi_count == 0, ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count)); hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask); hashdestroy(pcbinfo->ipi_porthashbase, M_PCB, pcbinfo->ipi_porthashmask); #ifdef PCBGROUP in_pcbgroup_destroy(pcbinfo); #endif uma_zdestroy(pcbinfo->ipi_zone); INP_HASH_LOCK_DESTROY(pcbinfo); INP_INFO_LOCK_DESTROY(pcbinfo); } /* * Allocate a PCB and associate it with the socket. * On success return with the PCB locked. */ int in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) { struct inpcb *inp; int error; INP_INFO_WLOCK_ASSERT(pcbinfo); error = 0; inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT); if (inp == NULL) return (ENOBUFS); bzero(inp, inp_zero_size); inp->inp_pcbinfo = pcbinfo; inp->inp_socket = so; inp->inp_cred = crhold(so->so_cred); inp->inp_inc.inc_fibnum = so->so_fibnum; #ifdef MAC error = mac_inpcb_init(inp, M_NOWAIT); if (error != 0) goto out; mac_inpcb_create(so, inp); #endif #ifdef IPSEC error = ipsec_init_policy(so, &inp->inp_sp); if (error != 0) { #ifdef MAC mac_inpcb_destroy(inp); #endif goto out; } #endif /*IPSEC*/ #ifdef INET6 if (INP_SOCKAF(so) == AF_INET6) { inp->inp_vflag |= INP_IPV6PROTO; if (V_ip6_v6only) inp->inp_flags |= IN6P_IPV6_V6ONLY; } #endif LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list); pcbinfo->ipi_count++; so->so_pcb = (caddr_t)inp; #ifdef INET6 if (V_ip6_auto_flowlabel) inp->inp_flags |= IN6P_AUTOFLOWLABEL; #endif INP_WLOCK(inp); inp->inp_gencnt = ++pcbinfo->ipi_gencnt; refcount_init(&inp->inp_refcount, 1); /* Reference from inpcbinfo */ #if defined(IPSEC) || defined(MAC) out: if (error != 0) { crfree(inp->inp_cred); uma_zfree(pcbinfo->ipi_zone, inp); } #endif return (error); } #ifdef INET int in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) { int anonport, error; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) return (EINVAL); anonport = nam == NULL || ((struct sockaddr_in *)nam)->sin_port == 0; error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr, &inp->inp_lport, cred); if (error) return (error); if (in_pcbinshash(inp) != 0) { inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; return (EAGAIN); } if (anonport) inp->inp_flags |= INP_ANONPORT; return (0); } #endif /* * Select a local port (number) to use. */ #if defined(INET) || defined(INET6) int in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp, struct ucred *cred, int lookupflags) { struct inpcbinfo *pcbinfo; struct inpcb *tmpinp; unsigned short *lastport; int count, dorandom, error; u_short aux, first, last, lport; #ifdef INET struct in_addr laddr; #endif pcbinfo = inp->inp_pcbinfo; /* * Because no actual state changes occur here, a global write lock on * the pcbinfo isn't required. */ INP_LOCK_ASSERT(inp); INP_HASH_LOCK_ASSERT(pcbinfo); if (inp->inp_flags & INP_HIGHPORT) { first = V_ipport_hifirstauto; /* sysctl */ last = V_ipport_hilastauto; lastport = &pcbinfo->ipi_lasthi; } else if (inp->inp_flags & INP_LOWPORT) { error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0); if (error) return (error); first = V_ipport_lowfirstauto; /* 1023 */ last = V_ipport_lowlastauto; /* 600 */ lastport = &pcbinfo->ipi_lastlow; } else { first = V_ipport_firstauto; /* sysctl */ last = V_ipport_lastauto; lastport = &pcbinfo->ipi_lastport; } /* * For UDP(-Lite), use random port allocation as long as the user * allows it. For TCP (and as of yet unknown) connections, * use random port allocation only if the user allows it AND * ipport_tick() allows it. */ if (V_ipport_randomized && (!V_ipport_stoprandom || pcbinfo == &V_udbinfo || pcbinfo == &V_ulitecbinfo)) dorandom = 1; else dorandom = 0; /* * It makes no sense to do random port allocation if * we have the only port available. */ if (first == last) dorandom = 0; /* Make sure to not include UDP(-Lite) packets in the count. */ if (pcbinfo != &V_udbinfo || pcbinfo != &V_ulitecbinfo) V_ipport_tcpallocs++; /* * Instead of having two loops further down counting up or down * make sure that first is always <= last and go with only one * code path implementing all logic. */ if (first > last) { aux = first; first = last; last = aux; } #ifdef INET /* Make the compiler happy. */ laddr.s_addr = 0; if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) { KASSERT(laddrp != NULL, ("%s: laddrp NULL for v4 inp %p", __func__, inp)); laddr = *laddrp; } #endif tmpinp = NULL; /* Make compiler happy. */ lport = *lportp; if (dorandom) *lastport = first + (arc4random() % (last - first)); count = last - first; do { if (count-- < 0) /* completely used? */ return (EADDRNOTAVAIL); ++*lastport; if (*lastport < first || *lastport > last) *lastport = first; lport = htons(*lastport); #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) tmpinp = in6_pcblookup_local(pcbinfo, &inp->in6p_laddr, inp->in6p_zoneid, lport, lookupflags, cred); #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET tmpinp = in_pcblookup_local(pcbinfo, laddr, lport, lookupflags, cred); #endif } while (tmpinp != NULL); #ifdef INET if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) laddrp->s_addr = laddr.s_addr; #endif *lportp = lport; return (0); } /* * Return cached socket options. */ short inp_so_options(const struct inpcb *inp) { short so_options; so_options = 0; if ((inp->inp_flags2 & INP_REUSEPORT) != 0) so_options |= SO_REUSEPORT; if ((inp->inp_flags2 & INP_REUSEADDR) != 0) so_options |= SO_REUSEADDR; return (so_options); } #endif /* INET || INET6 */ /* * Check if a new BINDMULTI socket is allowed to be created. * * ni points to the new inp. * oi points to the exisitng inp. * * This checks whether the existing inp also has BINDMULTI and * whether the credentials match. */ int in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi) { /* Check permissions match */ if ((ni->inp_flags2 & INP_BINDMULTI) && (ni->inp_cred->cr_uid != oi->inp_cred->cr_uid)) return (0); /* Check the existing inp has BINDMULTI set */ if ((ni->inp_flags2 & INP_BINDMULTI) && ((oi->inp_flags2 & INP_BINDMULTI) == 0)) return (0); /* * We're okay - either INP_BINDMULTI isn't set on ni, or * it is and it matches the checks. */ return (1); } #ifdef INET /* * Set up a bind operation on a PCB, performing port allocation * as required, but do not actually modify the PCB. Callers can * either complete the bind by setting inp_laddr/inp_lport and * calling in_pcbinshash(), or they can just use the resulting * port and address to authorise the sending of a once-off packet. * * On error, the values of *laddrp and *lportp are not changed. */ int in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, u_short *lportp, struct ucred *cred) { struct socket *so = inp->inp_socket; struct sockaddr_in *sin; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct in_addr laddr; u_short lport = 0; int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT); int error; /* * No state changes, so read locks are sufficient here. */ INP_LOCK_ASSERT(inp); INP_HASH_LOCK_ASSERT(pcbinfo); if (TAILQ_EMPTY(&V_in_ifaddrhead)) /* XXX broken! */ return (EADDRNOTAVAIL); laddr.s_addr = *laddrp; if (nam != NULL && laddr.s_addr != INADDR_ANY) return (EINVAL); if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0) lookupflags = INPLOOKUP_WILDCARD; if (nam == NULL) { if ((error = prison_local_ip4(cred, &laddr)) != 0) return (error); } else { sin = (struct sockaddr_in *)nam; if (nam->sa_len != sizeof (*sin)) return (EINVAL); #ifdef notdef /* * We should check the family, but old programs * incorrectly fail to initialize it. */ if (sin->sin_family != AF_INET) return (EAFNOSUPPORT); #endif error = prison_local_ip4(cred, &sin->sin_addr); if (error) return (error); if (sin->sin_port != *lportp) { /* Don't allow the port to change. */ if (*lportp != 0) return (EINVAL); lport = sin->sin_port; } /* NB: lport is left as 0 if the port isn't being changed. */ if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { /* * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; * allow complete duplication of binding if * SO_REUSEPORT is set, or if SO_REUSEADDR is set * and a multicast address is bound on both * new and duplicated sockets. */ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0) reuseport = SO_REUSEADDR|SO_REUSEPORT; } else if (sin->sin_addr.s_addr != INADDR_ANY) { sin->sin_port = 0; /* yech... */ bzero(&sin->sin_zero, sizeof(sin->sin_zero)); /* * Is the address a local IP address? * If INP_BINDANY is set, then the socket may be bound * to any endpoint address, local or not. */ if ((inp->inp_flags & INP_BINDANY) == 0 && ifa_ifwithaddr_check((struct sockaddr *)sin) == 0) return (EADDRNOTAVAIL); } laddr = sin->sin_addr; if (lport) { struct inpcb *t; struct tcptw *tw; /* GROSS */ if (ntohs(lport) <= V_ipport_reservedhigh && ntohs(lport) >= V_ipport_reservedlow && priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0)) return (EACCES); if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) && priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT, 0) != 0) { t = in_pcblookup_local(pcbinfo, sin->sin_addr, lport, INPLOOKUP_WILDCARD, cred); /* * XXX * This entire block sorely needs a rewrite. */ if (t && ((inp->inp_flags2 & INP_BINDMULTI) == 0) && ((t->inp_flags & INP_TIMEWAIT) == 0) && (so->so_type != SOCK_STREAM || ntohl(t->inp_faddr.s_addr) == INADDR_ANY) && (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || ntohl(t->inp_laddr.s_addr) != INADDR_ANY || (t->inp_flags2 & INP_REUSEPORT) == 0) && (inp->inp_cred->cr_uid != t->inp_cred->cr_uid)) return (EADDRINUSE); /* * If the socket is a BINDMULTI socket, then * the credentials need to match and the * original socket also has to have been bound * with BINDMULTI. */ if (t && (! in_pcbbind_check_bindmulti(inp, t))) return (EADDRINUSE); } t = in_pcblookup_local(pcbinfo, sin->sin_addr, lport, lookupflags, cred); if (t && (t->inp_flags & INP_TIMEWAIT)) { /* * XXXRW: If an incpb has had its timewait * state recycled, we treat the address as * being in use (for now). This is better * than a panic, but not desirable. */ tw = intotw(t); if (tw == NULL || (reuseport & tw->tw_so_options) == 0) return (EADDRINUSE); } else if (t && ((inp->inp_flags2 & INP_BINDMULTI) == 0) && (reuseport & inp_so_options(t)) == 0) { #ifdef INET6 if (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || ntohl(t->inp_laddr.s_addr) != INADDR_ANY || (inp->inp_vflag & INP_IPV6PROTO) == 0 || (t->inp_vflag & INP_IPV6PROTO) == 0) #endif return (EADDRINUSE); if (t && (! in_pcbbind_check_bindmulti(inp, t))) return (EADDRINUSE); } } } if (*lportp != 0) lport = *lportp; if (lport == 0) { error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags); if (error != 0) return (error); } *laddrp = laddr.s_addr; *lportp = lport; return (0); } /* * Connect from a socket to a specified address. * Both address and port must be specified in argument sin. * If don't have a local address for this socket yet, * then pick one. */ int in_pcbconnect_mbuf(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred, struct mbuf *m) { u_short lport, fport; in_addr_t laddr, faddr; int anonport, error; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); lport = inp->inp_lport; laddr = inp->inp_laddr.s_addr; anonport = (lport == 0); error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport, NULL, cred); if (error) return (error); /* Do the initial binding of the local address if required. */ if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) { inp->inp_lport = lport; inp->inp_laddr.s_addr = laddr; if (in_pcbinshash(inp) != 0) { inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; return (EAGAIN); } } /* Commit the remaining changes. */ inp->inp_lport = lport; inp->inp_laddr.s_addr = laddr; inp->inp_faddr.s_addr = faddr; inp->inp_fport = fport; in_pcbrehash_mbuf(inp, m); if (anonport) inp->inp_flags |= INP_ANONPORT; return (0); } int in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) { return (in_pcbconnect_mbuf(inp, nam, cred, NULL)); } /* * Do proper source address selection on an unbound socket in case * of connect. Take jails into account as well. */ int in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, struct ucred *cred) { struct ifaddr *ifa; struct sockaddr *sa; struct sockaddr_in *sin; struct route sro; int error; KASSERT(laddr != NULL, ("%s: laddr NULL", __func__)); /* * Bypass source address selection and use the primary jail IP * if requested. */ if (cred != NULL && !prison_saddrsel_ip4(cred, laddr)) return (0); error = 0; bzero(&sro, sizeof(sro)); sin = (struct sockaddr_in *)&sro.ro_dst; sin->sin_family = AF_INET; sin->sin_len = sizeof(struct sockaddr_in); sin->sin_addr.s_addr = faddr->s_addr; /* * If route is known our src addr is taken from the i/f, * else punt. * * Find out route to destination. */ if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0) in_rtalloc_ign(&sro, 0, inp->inp_inc.inc_fibnum); /* * If we found a route, use the address corresponding to * the outgoing interface. * * Otherwise assume faddr is reachable on a directly connected * network and try to find a corresponding interface to take * the source address from. */ if (sro.ro_rt == NULL || sro.ro_rt->rt_ifp == NULL) { struct in_ifaddr *ia; struct ifnet *ifp; - ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin)); + ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin, + RT_ALL_FIBS)); if (ia == NULL) - ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0)); + ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0, + RT_ALL_FIBS)); if (ia == NULL) { error = ENETUNREACH; goto done; } if (cred == NULL || !prison_flag(cred, PR_IP4)) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; ifa_free(&ia->ia_ifa); goto done; } ifp = ia->ia_ifp; ifa_free(&ia->ia_ifa); ia = NULL; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; sin = (struct sockaddr_in *)sa; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)ifa; break; } } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; IF_ADDR_RUNLOCK(ifp); goto done; } IF_ADDR_RUNLOCK(ifp); /* 3. As a last resort return the 'default' jail address. */ error = prison_get_ip4(cred, laddr); goto done; } /* * If the outgoing interface on the route found is not * a loopback interface, use the address from that interface. * In case of jails do those three steps: * 1. check if the interface address belongs to the jail. If so use it. * 2. check if we have any address on the outgoing interface * belonging to this jail. If so use it. * 3. as a last resort return the 'default' jail address. */ if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0) { struct in_ifaddr *ia; struct ifnet *ifp; /* If not jailed, use the default returned. */ if (cred == NULL || !prison_flag(cred, PR_IP4)) { ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa; laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* Jailed. */ /* 1. Check if the iface address belongs to the jail. */ sin = (struct sockaddr_in *)sro.ro_rt->rt_ifa->ifa_addr; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa; laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* * 2. Check if we have any address on the outgoing interface * belonging to this jail. */ ia = NULL; ifp = sro.ro_rt->rt_ifp; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; sin = (struct sockaddr_in *)sa; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)ifa; break; } } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; IF_ADDR_RUNLOCK(ifp); goto done; } IF_ADDR_RUNLOCK(ifp); /* 3. As a last resort return the 'default' jail address. */ error = prison_get_ip4(cred, laddr); goto done; } /* * The outgoing interface is marked with 'loopback net', so a route * to ourselves is here. * Try to find the interface of the destination address and then * take the address from there. That interface is not necessarily * a loopback interface. * In case of jails, check that it is an address of the jail * and if we cannot find, fall back to the 'default' jail address. */ if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) != 0) { struct sockaddr_in sain; struct in_ifaddr *ia; bzero(&sain, sizeof(struct sockaddr_in)); sain.sin_family = AF_INET; sain.sin_len = sizeof(struct sockaddr_in); sain.sin_addr.s_addr = faddr->s_addr; - ia = ifatoia(ifa_ifwithdstaddr(sintosa(&sain))); + ia = ifatoia(ifa_ifwithdstaddr(sintosa(&sain), RT_ALL_FIBS)); if (ia == NULL) - ia = ifatoia(ifa_ifwithnet(sintosa(&sain), 0)); + ia = ifatoia(ifa_ifwithnet(sintosa(&sain), 0, + RT_ALL_FIBS)); if (ia == NULL) ia = ifatoia(ifa_ifwithaddr(sintosa(&sain))); if (cred == NULL || !prison_flag(cred, PR_IP4)) { if (ia == NULL) { error = ENETUNREACH; goto done; } laddr->s_addr = ia->ia_addr.sin_addr.s_addr; ifa_free(&ia->ia_ifa); goto done; } /* Jailed. */ if (ia != NULL) { struct ifnet *ifp; ifp = ia->ia_ifp; ifa_free(&ia->ia_ifa); ia = NULL; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; sin = (struct sockaddr_in *)sa; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)ifa; break; } } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; IF_ADDR_RUNLOCK(ifp); goto done; } IF_ADDR_RUNLOCK(ifp); } /* 3. As a last resort return the 'default' jail address. */ error = prison_get_ip4(cred, laddr); goto done; } done: if (sro.ro_rt != NULL) RTFREE(sro.ro_rt); return (error); } /* * Set up for a connect from a socket to the specified address. * On entry, *laddrp and *lportp should contain the current local * address and port for the PCB; these are updated to the values * that should be placed in inp_laddr and inp_lport to complete * the connect. * * On success, *faddrp and *fportp will be set to the remote address * and port. These are not updated in the error case. * * If the operation fails because the connection already exists, * *oinpp will be set to the PCB of that connection so that the * caller can decide to override it. In all other cases, *oinpp * is set to NULL. */ int in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp, struct inpcb **oinpp, struct ucred *cred) { struct sockaddr_in *sin = (struct sockaddr_in *)nam; struct in_ifaddr *ia; struct inpcb *oinp; struct in_addr laddr, faddr; u_short lport, fport; int error; /* * Because a global state change doesn't actually occur here, a read * lock is sufficient. */ INP_LOCK_ASSERT(inp); INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo); if (oinpp != NULL) *oinpp = NULL; if (nam->sa_len != sizeof (*sin)) return (EINVAL); if (sin->sin_family != AF_INET) return (EAFNOSUPPORT); if (sin->sin_port == 0) return (EADDRNOTAVAIL); laddr.s_addr = *laddrp; lport = *lportp; faddr = sin->sin_addr; fport = sin->sin_port; if (!TAILQ_EMPTY(&V_in_ifaddrhead)) { /* * If the destination address is INADDR_ANY, * use the primary local address. * If the supplied address is INADDR_BROADCAST, * and the primary interface supports broadcast, * choose the broadcast address for that interface. */ if (faddr.s_addr == INADDR_ANY) { IN_IFADDR_RLOCK(); faddr = IA_SIN(TAILQ_FIRST(&V_in_ifaddrhead))->sin_addr; IN_IFADDR_RUNLOCK(); if (cred != NULL && (error = prison_get_ip4(cred, &faddr)) != 0) return (error); } else if (faddr.s_addr == (u_long)INADDR_BROADCAST) { IN_IFADDR_RLOCK(); if (TAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags & IFF_BROADCAST) faddr = satosin(&TAILQ_FIRST( &V_in_ifaddrhead)->ia_broadaddr)->sin_addr; IN_IFADDR_RUNLOCK(); } } if (laddr.s_addr == INADDR_ANY) { error = in_pcbladdr(inp, &faddr, &laddr, cred); /* * If the destination address is multicast and an outgoing * interface has been set as a multicast option, prefer the * address of that interface as our source address. */ if (IN_MULTICAST(ntohl(faddr.s_addr)) && inp->inp_moptions != NULL) { struct ip_moptions *imo; struct ifnet *ifp; imo = inp->inp_moptions; if (imo->imo_multicast_ifp != NULL) { ifp = imo->imo_multicast_ifp; IN_IFADDR_RLOCK(); TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if ((ia->ia_ifp == ifp) && (cred == NULL || prison_check_ip4(cred, &ia->ia_addr.sin_addr) == 0)) break; } if (ia == NULL) error = EADDRNOTAVAIL; else { laddr = ia->ia_addr.sin_addr; error = 0; } IN_IFADDR_RUNLOCK(); } } if (error) return (error); } oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr, fport, laddr, lport, 0, NULL); if (oinp != NULL) { if (oinpp != NULL) *oinpp = oinp; return (EADDRINUSE); } if (lport == 0) { error = in_pcbbind_setup(inp, NULL, &laddr.s_addr, &lport, cred); if (error) return (error); } *laddrp = laddr.s_addr; *lportp = lport; *faddrp = faddr.s_addr; *fportp = fport; return (0); } void in_pcbdisconnect(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); inp->inp_faddr.s_addr = INADDR_ANY; inp->inp_fport = 0; in_pcbrehash(inp); } #endif /* INET */ /* * in_pcbdetach() is responsibe for disassociating a socket from an inpcb. * For most protocols, this will be invoked immediately prior to calling * in_pcbfree(). However, with TCP the inpcb may significantly outlive the * socket, in which case in_pcbfree() is deferred. */ void in_pcbdetach(struct inpcb *inp) { KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__)); inp->inp_socket->so_pcb = NULL; inp->inp_socket = NULL; } /* * in_pcbref() bumps the reference count on an inpcb in order to maintain * stability of an inpcb pointer despite the inpcb lock being released. This * is used in TCP when the inpcbinfo lock needs to be acquired or upgraded, * but where the inpcb lock may already held, or when acquiring a reference * via a pcbgroup. * * in_pcbref() should be used only to provide brief memory stability, and * must always be followed by a call to INP_WLOCK() and in_pcbrele() to * garbage collect the inpcb if it has been in_pcbfree()'d from another * context. Until in_pcbrele() has returned that the inpcb is still valid, * lock and rele are the *only* safe operations that may be performed on the * inpcb. * * While the inpcb will not be freed, releasing the inpcb lock means that the * connection's state may change, so the caller should be careful to * revalidate any cached state on reacquiring the lock. Drop the reference * using in_pcbrele(). */ void in_pcbref(struct inpcb *inp) { KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); refcount_acquire(&inp->inp_refcount); } /* * Drop a refcount on an inpcb elevated using in_pcbref(); because a call to * in_pcbfree() may have been made between in_pcbref() and in_pcbrele(), we * return a flag indicating whether or not the inpcb remains valid. If it is * valid, we return with the inpcb lock held. * * Notice that, unlike in_pcbref(), the inpcb lock must be held to drop a * reference on an inpcb. Historically more work was done here (actually, in * in_pcbfree_internal()) but has been moved to in_pcbfree() to avoid the * need for the pcbinfo lock in in_pcbrele(). Deferring the free is entirely * about memory stability (and continued use of the write lock). */ int in_pcbrele_rlocked(struct inpcb *inp) { struct inpcbinfo *pcbinfo; KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); INP_RLOCK_ASSERT(inp); if (refcount_release(&inp->inp_refcount) == 0) { /* * If the inpcb has been freed, let the caller know, even if * this isn't the last reference. */ if (inp->inp_flags2 & INP_FREED) { INP_RUNLOCK(inp); return (1); } return (0); } KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); INP_RUNLOCK(inp); pcbinfo = inp->inp_pcbinfo; uma_zfree(pcbinfo->ipi_zone, inp); return (1); } int in_pcbrele_wlocked(struct inpcb *inp) { struct inpcbinfo *pcbinfo; KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); INP_WLOCK_ASSERT(inp); if (refcount_release(&inp->inp_refcount) == 0) return (0); KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); INP_WUNLOCK(inp); pcbinfo = inp->inp_pcbinfo; uma_zfree(pcbinfo->ipi_zone, inp); return (1); } /* * Temporary wrapper. */ int in_pcbrele(struct inpcb *inp) { return (in_pcbrele_wlocked(inp)); } /* * Unconditionally schedule an inpcb to be freed by decrementing its * reference count, which should occur only after the inpcb has been detached * from its socket. If another thread holds a temporary reference (acquired * using in_pcbref()) then the free is deferred until that reference is * released using in_pcbrele(), but the inpcb is still unlocked. Almost all * work, including removal from global lists, is done in this context, where * the pcbinfo lock is held. */ void in_pcbfree(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); INP_INFO_WLOCK_ASSERT(pcbinfo); INP_WLOCK_ASSERT(inp); /* XXXRW: Do as much as possible here. */ #ifdef IPSEC if (inp->inp_sp != NULL) ipsec_delete_pcbpolicy(inp); #endif inp->inp_gencnt = ++pcbinfo->ipi_gencnt; in_pcbremlists(inp); #ifdef INET6 if (inp->inp_vflag & INP_IPV6PROTO) { ip6_freepcbopts(inp->in6p_outputopts); if (inp->in6p_moptions != NULL) ip6_freemoptions(inp->in6p_moptions); } #endif if (inp->inp_options) (void)m_free(inp->inp_options); #ifdef INET if (inp->inp_moptions != NULL) inp_freemoptions(inp->inp_moptions); #endif inp->inp_vflag = 0; inp->inp_flags2 |= INP_FREED; crfree(inp->inp_cred); #ifdef MAC mac_inpcb_destroy(inp); #endif if (!in_pcbrele_wlocked(inp)) INP_WUNLOCK(inp); } /* * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and * port reservation, and preventing it from being returned by inpcb lookups. * * It is used by TCP to mark an inpcb as unused and avoid future packet * delivery or event notification when a socket remains open but TCP has * closed. This might occur as a result of a shutdown()-initiated TCP close * or a RST on the wire, and allows the port binding to be reused while still * maintaining the invariant that so_pcb always points to a valid inpcb until * in_pcbdetach(). * * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by * in_pcbnotifyall() and in_pcbpurgeif0()? */ void in_pcbdrop(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); /* * XXXRW: Possibly we should protect the setting of INP_DROPPED with * the hash lock...? */ inp->inp_flags |= INP_DROPPED; if (inp->inp_flags & INP_INHASHLIST) { struct inpcbport *phd = inp->inp_phd; INP_HASH_WLOCK(inp->inp_pcbinfo); LIST_REMOVE(inp, inp_hash); LIST_REMOVE(inp, inp_portlist); if (LIST_FIRST(&phd->phd_pcblist) == NULL) { LIST_REMOVE(phd, phd_hash); free(phd, M_PCB); } INP_HASH_WUNLOCK(inp->inp_pcbinfo); inp->inp_flags &= ~INP_INHASHLIST; #ifdef PCBGROUP in_pcbgroup_remove(inp); #endif } } #ifdef INET /* * Common routines to return the socket addresses associated with inpcbs. */ struct sockaddr * in_sockaddr(in_port_t port, struct in_addr *addr_p) { struct sockaddr_in *sin; sin = malloc(sizeof *sin, M_SONAME, M_WAITOK | M_ZERO); sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = *addr_p; sin->sin_port = port; return (struct sockaddr *)sin; } int in_getsockaddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; struct in_addr addr; in_port_t port; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL")); INP_RLOCK(inp); port = inp->inp_lport; addr = inp->inp_laddr; INP_RUNLOCK(inp); *nam = in_sockaddr(port, &addr); return 0; } int in_getpeeraddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; struct in_addr addr; in_port_t port; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL")); INP_RLOCK(inp); port = inp->inp_fport; addr = inp->inp_faddr; INP_RUNLOCK(inp); *nam = in_sockaddr(port, &addr); return 0; } void in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno, struct inpcb *(*notify)(struct inpcb *, int)) { struct inpcb *inp, *inp_temp; INP_INFO_WLOCK(pcbinfo); LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) { INP_WLOCK(inp); #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) { INP_WUNLOCK(inp); continue; } #endif if (inp->inp_faddr.s_addr != faddr.s_addr || inp->inp_socket == NULL) { INP_WUNLOCK(inp); continue; } if ((*notify)(inp, errno)) INP_WUNLOCK(inp); } INP_INFO_WUNLOCK(pcbinfo); } void in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) { struct inpcb *inp; struct ip_moptions *imo; int i, gap; INP_INFO_RLOCK(pcbinfo); LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) { INP_WLOCK(inp); imo = inp->inp_moptions; if ((inp->inp_vflag & INP_IPV4) && imo != NULL) { /* * Unselect the outgoing interface if it is being * detached. */ if (imo->imo_multicast_ifp == ifp) imo->imo_multicast_ifp = NULL; /* * Drop multicast group membership if we joined * through the interface being detached. */ for (i = 0, gap = 0; i < imo->imo_num_memberships; i++) { if (imo->imo_membership[i]->inm_ifp == ifp) { in_delmulti(imo->imo_membership[i]); gap++; } else if (gap != 0) imo->imo_membership[i - gap] = imo->imo_membership[i]; } imo->imo_num_memberships -= gap; } INP_WUNLOCK(inp); } INP_INFO_RUNLOCK(pcbinfo); } /* * Lookup a PCB based on the local address and port. Caller must hold the * hash lock. No inpcb locks or references are acquired. */ #define INP_LOOKUP_MAPPED_PCB_COST 3 struct inpcb * in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, u_short lport, int lookupflags, struct ucred *cred) { struct inpcb *inp; #ifdef INET6 int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST; #else int matchwild = 3; #endif int wildcard; KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); INP_HASH_LOCK_ASSERT(pcbinfo); if ((lookupflags & INPLOOKUP_WILDCARD) == 0) { struct inpcbhead *head; /* * Look for an unconnected (wildcard foreign addr) PCB that * matches the local address and port we're looking for. */ head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->ipi_hashmask)]; LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr == INADDR_ANY && inp->inp_laddr.s_addr == laddr.s_addr && inp->inp_lport == lport) { /* * Found? */ if (cred == NULL || prison_equal_ip4(cred->cr_prison, inp->inp_cred->cr_prison)) return (inp); } } /* * Not found. */ return (NULL); } else { struct inpcbporthead *porthash; struct inpcbport *phd; struct inpcb *match = NULL; /* * Best fit PCB lookup. * * First see if this local port is in use by looking on the * port hash list. */ porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport, pcbinfo->ipi_porthashmask)]; LIST_FOREACH(phd, porthash, phd_hash) { if (phd->phd_port == lport) break; } if (phd != NULL) { /* * Port is in use by one or more PCBs. Look for best * fit. */ LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { wildcard = 0; if (cred != NULL && !prison_equal_ip4(inp->inp_cred->cr_prison, cred->cr_prison)) continue; #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; /* * We never select the PCB that has * INP_IPV6 flag and is bound to :: if * we have another PCB which is bound * to 0.0.0.0. If a PCB has the * INP_IPV6 flag, then we set its cost * higher than IPv4 only PCBs. * * Note that the case only happens * when a socket is bound to ::, under * the condition that the use of the * mapped address is allowed. */ if ((inp->inp_vflag & INP_IPV6) != 0) wildcard += INP_LOOKUP_MAPPED_PCB_COST; #endif if (inp->inp_faddr.s_addr != INADDR_ANY) wildcard++; if (inp->inp_laddr.s_addr != INADDR_ANY) { if (laddr.s_addr == INADDR_ANY) wildcard++; else if (inp->inp_laddr.s_addr != laddr.s_addr) continue; } else { if (laddr.s_addr != INADDR_ANY) wildcard++; } if (wildcard < matchwild) { match = inp; matchwild = wildcard; if (matchwild == 0) break; } } } return (match); } } #undef INP_LOOKUP_MAPPED_PCB_COST #ifdef PCBGROUP /* * Lookup PCB in hash list, using pcbgroup tables. */ static struct inpcb * in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, struct ifnet *ifp) { struct inpcbhead *head; struct inpcb *inp, *tmpinp; u_short fport = fport_arg, lport = lport_arg; /* * First look for an exact match. */ tmpinp = NULL; INP_GROUP_LOCK(pcbgroup); head = &pcbgroup->ipg_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, pcbgroup->ipg_hashmask)]; LIST_FOREACH(inp, head, inp_pcbgrouphash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr == faddr.s_addr && inp->inp_laddr.s_addr == laddr.s_addr && inp->inp_fport == fport && inp->inp_lport == lport) { /* * XXX We should be able to directly return * the inp here, without any checks. * Well unless both bound with SO_REUSEPORT? */ if (prison_flag(inp->inp_cred, PR_IP4)) goto found; if (tmpinp == NULL) tmpinp = inp; } } if (tmpinp != NULL) { inp = tmpinp; goto found; } #ifdef RSS /* * For incoming connections, we may wish to do a wildcard * match for an RSS-local socket. */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; #ifdef INET6 struct inpcb *local_wild_mapped = NULL; #endif struct inpcb *jail_wild = NULL; struct inpcbhead *head; int injail; /* * Order of socket selection - we always prefer jails. * 1. jailed, non-wild. * 2. jailed, wild. * 3. non-jailed, non-wild. * 4. non-jailed, wild. */ head = &pcbgroup->ipg_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbgroup->ipg_hashmask)]; LIST_FOREACH(inp, head, inp_pcbgrouphash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport) continue; /* XXX inp locking */ if (ifp && ifp->if_type == IFT_FAITH && (inp->inp_flags & INP_FAITH) == 0) continue; injail = prison_flag(inp->inp_cred, PR_IP4); if (injail) { if (prison_check_ip4(inp->inp_cred, &laddr) != 0) continue; } else { if (local_exact != NULL) continue; } if (inp->inp_laddr.s_addr == laddr.s_addr) { if (injail) goto found; else local_exact = inp; } else if (inp->inp_laddr.s_addr == INADDR_ANY) { #ifdef INET6 /* XXX inp locking, NULL check */ if (inp->inp_vflag & INP_IPV6PROTO) local_wild_mapped = inp; else #endif if (injail) jail_wild = inp; else local_wild = inp; } } /* LIST_FOREACH */ inp = jail_wild; if (inp == NULL) inp = local_exact; if (inp == NULL) inp = local_wild; #ifdef INET6 if (inp == NULL) inp = local_wild_mapped; #endif if (inp != NULL) goto found; } #endif /* * Then look for a wildcard match, if requested. */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; #ifdef INET6 struct inpcb *local_wild_mapped = NULL; #endif struct inpcb *jail_wild = NULL; struct inpcbhead *head; int injail; /* * Order of socket selection - we always prefer jails. * 1. jailed, non-wild. * 2. jailed, wild. * 3. non-jailed, non-wild. * 4. non-jailed, wild. */ head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->ipi_wildmask)]; LIST_FOREACH(inp, head, inp_pcbgroup_wild) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport) continue; /* XXX inp locking */ if (ifp && ifp->if_type == IFT_FAITH && (inp->inp_flags & INP_FAITH) == 0) continue; injail = prison_flag(inp->inp_cred, PR_IP4); if (injail) { if (prison_check_ip4(inp->inp_cred, &laddr) != 0) continue; } else { if (local_exact != NULL) continue; } if (inp->inp_laddr.s_addr == laddr.s_addr) { if (injail) goto found; else local_exact = inp; } else if (inp->inp_laddr.s_addr == INADDR_ANY) { #ifdef INET6 /* XXX inp locking, NULL check */ if (inp->inp_vflag & INP_IPV6PROTO) local_wild_mapped = inp; else #endif if (injail) jail_wild = inp; else local_wild = inp; } } /* LIST_FOREACH */ inp = jail_wild; if (inp == NULL) inp = local_exact; if (inp == NULL) inp = local_wild; #ifdef INET6 if (inp == NULL) inp = local_wild_mapped; #endif if (inp != NULL) goto found; } /* if (lookupflags & INPLOOKUP_WILDCARD) */ INP_GROUP_UNLOCK(pcbgroup); return (NULL); found: in_pcbref(inp); INP_GROUP_UNLOCK(pcbgroup); if (lookupflags & INPLOOKUP_WLOCKPCB) { INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) return (NULL); } else if (lookupflags & INPLOOKUP_RLOCKPCB) { INP_RLOCK(inp); if (in_pcbrele_rlocked(inp)) return (NULL); } else panic("%s: locking bug", __func__); return (inp); } #endif /* PCBGROUP */ /* * Lookup PCB in hash list, using pcbinfo tables. This variation assumes * that the caller has locked the hash list, and will not perform any further * locking or reference operations on either the hash list or the connection. */ static struct inpcb * in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, struct ifnet *ifp) { struct inpcbhead *head; struct inpcb *inp, *tmpinp; u_short fport = fport_arg, lport = lport_arg; KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); INP_HASH_LOCK_ASSERT(pcbinfo); /* * First look for an exact match. */ tmpinp = NULL; head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, pcbinfo->ipi_hashmask)]; LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr == faddr.s_addr && inp->inp_laddr.s_addr == laddr.s_addr && inp->inp_fport == fport && inp->inp_lport == lport) { /* * XXX We should be able to directly return * the inp here, without any checks. * Well unless both bound with SO_REUSEPORT? */ if (prison_flag(inp->inp_cred, PR_IP4)) return (inp); if (tmpinp == NULL) tmpinp = inp; } } if (tmpinp != NULL) return (tmpinp); /* * Then look for a wildcard match, if requested. */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; #ifdef INET6 struct inpcb *local_wild_mapped = NULL; #endif struct inpcb *jail_wild = NULL; int injail; /* * Order of socket selection - we always prefer jails. * 1. jailed, non-wild. * 2. jailed, wild. * 3. non-jailed, non-wild. * 4. non-jailed, wild. */ head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->ipi_hashmask)]; LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport) continue; /* XXX inp locking */ if (ifp && ifp->if_type == IFT_FAITH && (inp->inp_flags & INP_FAITH) == 0) continue; injail = prison_flag(inp->inp_cred, PR_IP4); if (injail) { if (prison_check_ip4(inp->inp_cred, &laddr) != 0) continue; } else { if (local_exact != NULL) continue; } if (inp->inp_laddr.s_addr == laddr.s_addr) { if (injail) return (inp); else local_exact = inp; } else if (inp->inp_laddr.s_addr == INADDR_ANY) { #ifdef INET6 /* XXX inp locking, NULL check */ if (inp->inp_vflag & INP_IPV6PROTO) local_wild_mapped = inp; else #endif if (injail) jail_wild = inp; else local_wild = inp; } } /* LIST_FOREACH */ if (jail_wild != NULL) return (jail_wild); if (local_exact != NULL) return (local_exact); if (local_wild != NULL) return (local_wild); #ifdef INET6 if (local_wild_mapped != NULL) return (local_wild_mapped); #endif } /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */ return (NULL); } /* * Lookup PCB in hash list, using pcbinfo tables. This variation locks the * hash list lock, and will return the inpcb locked (i.e., requires * INPLOOKUP_LOCKPCB). */ static struct inpcb * in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp) { struct inpcb *inp; INP_HASH_RLOCK(pcbinfo); inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp); if (inp != NULL) { in_pcbref(inp); INP_HASH_RUNLOCK(pcbinfo); if (lookupflags & INPLOOKUP_WLOCKPCB) { INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) return (NULL); } else if (lookupflags & INPLOOKUP_RLOCKPCB) { INP_RLOCK(inp); if (in_pcbrele_rlocked(inp)) return (NULL); } else panic("%s: locking bug", __func__); } else INP_HASH_RUNLOCK(pcbinfo); return (inp); } /* * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf * from which a pre-calculated hash value may be extracted. * * Possibly more of this logic should be in in_pcbgroup.c. */ struct inpcb * in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp) { #if defined(PCBGROUP) && !defined(RSS) struct inpcbgroup *pcbgroup; #endif KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); /* * When not using RSS, use connection groups in preference to the * reservation table when looking up 4-tuples. When using RSS, just * use the reservation table, due to the cost of the Toeplitz hash * in software. * * XXXRW: This policy belongs in the pcbgroup code, as in principle * we could be doing RSS with a non-Toeplitz hash that is affordable * in software. */ #if defined(PCBGROUP) && !defined(RSS) if (in_pcbgroup_enabled(pcbinfo)) { pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, fport); return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); } #endif return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, lookupflags, ifp)); } struct inpcb * in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp, struct mbuf *m) { #ifdef PCBGROUP struct inpcbgroup *pcbgroup; #endif KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); #ifdef PCBGROUP /* * If we can use a hardware-generated hash to look up the connection * group, use that connection group to find the inpcb. Otherwise * fall back on a software hash -- or the reservation table if we're * using RSS. * * XXXRW: As above, that policy belongs in the pcbgroup code. */ if (in_pcbgroup_enabled(pcbinfo) && !(M_HASHTYPE_TEST(m, M_HASHTYPE_NONE))) { pcbgroup = in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m), m->m_pkthdr.flowid); if (pcbgroup != NULL) return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); #ifndef RSS pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, fport); return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); #endif } #endif return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, lookupflags, ifp)); } #endif /* INET */ /* * Insert PCB onto various hash lists. */ static int in_pcbinshash_internal(struct inpcb *inp, int do_pcbgroup_update) { struct inpcbhead *pcbhash; struct inpcbporthead *pcbporthash; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbport *phd; u_int32_t hashkey_faddr; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); KASSERT((inp->inp_flags & INP_INHASHLIST) == 0, ("in_pcbinshash: INP_INHASHLIST")); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr); else #endif hashkey_faddr = inp->inp_faddr.s_addr; pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; pcbporthash = &pcbinfo->ipi_porthashbase[ INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)]; /* * Go through port list and look for a head for this lport. */ LIST_FOREACH(phd, pcbporthash, phd_hash) { if (phd->phd_port == inp->inp_lport) break; } /* * If none exists, malloc one and tack it on. */ if (phd == NULL) { phd = malloc(sizeof(struct inpcbport), M_PCB, M_NOWAIT); if (phd == NULL) { return (ENOBUFS); /* XXX */ } phd->phd_port = inp->inp_lport; LIST_INIT(&phd->phd_pcblist); LIST_INSERT_HEAD(pcbporthash, phd, phd_hash); } inp->inp_phd = phd; LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); LIST_INSERT_HEAD(pcbhash, inp, inp_hash); inp->inp_flags |= INP_INHASHLIST; #ifdef PCBGROUP if (do_pcbgroup_update) in_pcbgroup_update(inp); #endif return (0); } /* * For now, there are two public interfaces to insert an inpcb into the hash * lists -- one that does update pcbgroups, and one that doesn't. The latter * is used only in the TCP syncache, where in_pcbinshash is called before the * full 4-tuple is set for the inpcb, and we don't want to install in the * pcbgroup until later. * * XXXRW: This seems like a misfeature. in_pcbinshash should always update * connection groups, and partially initialised inpcbs should not be exposed * to either reservation hash tables or pcbgroups. */ int in_pcbinshash(struct inpcb *inp) { return (in_pcbinshash_internal(inp, 1)); } int in_pcbinshash_nopcbgroup(struct inpcb *inp) { return (in_pcbinshash_internal(inp, 0)); } /* * Move PCB to the proper hash bucket when { faddr, fport } have been * changed. NOTE: This does not handle the case of the lport changing (the * hashed port list would have to be updated as well), so the lport must * not change after in_pcbinshash() has been called. */ void in_pcbrehash_mbuf(struct inpcb *inp, struct mbuf *m) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbhead *head; u_int32_t hashkey_faddr; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); KASSERT(inp->inp_flags & INP_INHASHLIST, ("in_pcbrehash: !INP_INHASHLIST")); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr); else #endif hashkey_faddr = inp->inp_faddr.s_addr; head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; LIST_REMOVE(inp, inp_hash); LIST_INSERT_HEAD(head, inp, inp_hash); #ifdef PCBGROUP if (m != NULL) in_pcbgroup_update_mbuf(inp, m); else in_pcbgroup_update(inp); #endif } void in_pcbrehash(struct inpcb *inp) { in_pcbrehash_mbuf(inp, NULL); } /* * Remove PCB from various lists. */ static void in_pcbremlists(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; INP_INFO_WLOCK_ASSERT(pcbinfo); INP_WLOCK_ASSERT(inp); inp->inp_gencnt = ++pcbinfo->ipi_gencnt; if (inp->inp_flags & INP_INHASHLIST) { struct inpcbport *phd = inp->inp_phd; INP_HASH_WLOCK(pcbinfo); LIST_REMOVE(inp, inp_hash); LIST_REMOVE(inp, inp_portlist); if (LIST_FIRST(&phd->phd_pcblist) == NULL) { LIST_REMOVE(phd, phd_hash); free(phd, M_PCB); } INP_HASH_WUNLOCK(pcbinfo); inp->inp_flags &= ~INP_INHASHLIST; } LIST_REMOVE(inp, inp_list); pcbinfo->ipi_count--; #ifdef PCBGROUP in_pcbgroup_remove(inp); #endif } /* * A set label operation has occurred at the socket layer, propagate the * label change into the in_pcb for the socket. */ void in_pcbsosetlabel(struct socket *so) { #ifdef MAC struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL")); INP_WLOCK(inp); SOCK_LOCK(so); mac_inpcb_sosetlabel(so, inp); SOCK_UNLOCK(so); INP_WUNLOCK(inp); #endif } /* * ipport_tick runs once per second, determining if random port allocation * should be continued. If more than ipport_randomcps ports have been * allocated in the last second, then we return to sequential port * allocation. We return to random allocation only once we drop below * ipport_randomcps for at least ipport_randomtime seconds. */ static void ipport_tick(void *xtp) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); /* XXX appease INVARIANTS here */ if (V_ipport_tcpallocs <= V_ipport_tcplastcount + V_ipport_randomcps) { if (V_ipport_stoprandom > 0) V_ipport_stoprandom--; } else V_ipport_stoprandom = V_ipport_randomtime; V_ipport_tcplastcount = V_ipport_tcpallocs; CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL); } static void ip_fini(void *xtp) { callout_stop(&ipport_tick_callout); } /* * The ipport_callout should start running at about the time we attach the * inet or inet6 domains. */ static void ipport_tick_init(const void *unused __unused) { /* Start ipport_tick. */ callout_init(&ipport_tick_callout, CALLOUT_MPSAFE); callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL); EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL, SHUTDOWN_PRI_DEFAULT); } SYSINIT(ipport_tick_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ipport_tick_init, NULL); void inp_wlock(struct inpcb *inp) { INP_WLOCK(inp); } void inp_wunlock(struct inpcb *inp) { INP_WUNLOCK(inp); } void inp_rlock(struct inpcb *inp) { INP_RLOCK(inp); } void inp_runlock(struct inpcb *inp) { INP_RUNLOCK(inp); } #ifdef INVARIANTS void inp_lock_assert(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); } void inp_unlock_assert(struct inpcb *inp) { INP_UNLOCK_ASSERT(inp); } #endif void inp_apply_all(void (*func)(struct inpcb *, void *), void *arg) { struct inpcb *inp; INP_INFO_RLOCK(&V_tcbinfo); LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) { INP_WLOCK(inp); func(inp, arg); INP_WUNLOCK(inp); } INP_INFO_RUNLOCK(&V_tcbinfo); } struct socket * inp_inpcbtosocket(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); return (inp->inp_socket); } struct tcpcb * inp_inpcbtotcpcb(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); return ((struct tcpcb *)inp->inp_ppcb); } int inp_ip_tos_get(const struct inpcb *inp) { return (inp->inp_ip_tos); } void inp_ip_tos_set(struct inpcb *inp, int val) { inp->inp_ip_tos = val; } void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, uint32_t *faddr, uint16_t *fp) { INP_LOCK_ASSERT(inp); *laddr = inp->inp_laddr.s_addr; *faddr = inp->inp_faddr.s_addr; *lp = inp->inp_lport; *fp = inp->inp_fport; } struct inpcb * so_sotoinpcb(struct socket *so) { return (sotoinpcb(so)); } struct tcpcb * so_sototcpcb(struct socket *so) { return (sototcpcb(so)); } #ifdef DDB static void db_print_indent(int indent) { int i; for (i = 0; i < indent; i++) db_printf(" "); } static void db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent) { char faddr_str[48], laddr_str[48]; db_print_indent(indent); db_printf("%s at %p\n", name, inc); indent += 2; #ifdef INET6 if (inc->inc_flags & INC_ISIPV6) { /* IPv6. */ ip6_sprintf(laddr_str, &inc->inc6_laddr); ip6_sprintf(faddr_str, &inc->inc6_faddr); } else #endif { /* IPv4. */ inet_ntoa_r(inc->inc_laddr, laddr_str); inet_ntoa_r(inc->inc_faddr, faddr_str); } db_print_indent(indent); db_printf("inc_laddr %s inc_lport %u\n", laddr_str, ntohs(inc->inc_lport)); db_print_indent(indent); db_printf("inc_faddr %s inc_fport %u\n", faddr_str, ntohs(inc->inc_fport)); } static void db_print_inpflags(int inp_flags) { int comma; comma = 0; if (inp_flags & INP_RECVOPTS) { db_printf("%sINP_RECVOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVRETOPTS) { db_printf("%sINP_RECVRETOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVDSTADDR) { db_printf("%sINP_RECVDSTADDR", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_HDRINCL) { db_printf("%sINP_HDRINCL", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_HIGHPORT) { db_printf("%sINP_HIGHPORT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_LOWPORT) { db_printf("%sINP_LOWPORT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_ANONPORT) { db_printf("%sINP_ANONPORT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVIF) { db_printf("%sINP_RECVIF", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_MTUDISC) { db_printf("%sINP_MTUDISC", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_FAITH) { db_printf("%sINP_FAITH", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVTTL) { db_printf("%sINP_RECVTTL", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_DONTFRAG) { db_printf("%sINP_DONTFRAG", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVTOS) { db_printf("%sINP_RECVTOS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_IPV6_V6ONLY) { db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_PKTINFO) { db_printf("%sIN6P_PKTINFO", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_HOPLIMIT) { db_printf("%sIN6P_HOPLIMIT", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_HOPOPTS) { db_printf("%sIN6P_HOPOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_DSTOPTS) { db_printf("%sIN6P_DSTOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RTHDR) { db_printf("%sIN6P_RTHDR", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RTHDRDSTOPTS) { db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_TCLASS) { db_printf("%sIN6P_TCLASS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_AUTOFLOWLABEL) { db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_TIMEWAIT) { db_printf("%sINP_TIMEWAIT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_ONESBCAST) { db_printf("%sINP_ONESBCAST", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_DROPPED) { db_printf("%sINP_DROPPED", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_SOCKREF) { db_printf("%sINP_SOCKREF", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RFC2292) { db_printf("%sIN6P_RFC2292", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_MTU) { db_printf("IN6P_MTU%s", comma ? ", " : ""); comma = 1; } } static void db_print_inpvflag(u_char inp_vflag) { int comma; comma = 0; if (inp_vflag & INP_IPV4) { db_printf("%sINP_IPV4", comma ? ", " : ""); comma = 1; } if (inp_vflag & INP_IPV6) { db_printf("%sINP_IPV6", comma ? ", " : ""); comma = 1; } if (inp_vflag & INP_IPV6PROTO) { db_printf("%sINP_IPV6PROTO", comma ? ", " : ""); comma = 1; } } static void db_print_inpcb(struct inpcb *inp, const char *name, int indent) { db_print_indent(indent); db_printf("%s at %p\n", name, inp); indent += 2; db_print_indent(indent); db_printf("inp_flow: 0x%x\n", inp->inp_flow); db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent); db_print_indent(indent); db_printf("inp_ppcb: %p inp_pcbinfo: %p inp_socket: %p\n", inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket); db_print_indent(indent); db_printf("inp_label: %p inp_flags: 0x%x (", inp->inp_label, inp->inp_flags); db_print_inpflags(inp->inp_flags); db_printf(")\n"); db_print_indent(indent); db_printf("inp_sp: %p inp_vflag: 0x%x (", inp->inp_sp, inp->inp_vflag); db_print_inpvflag(inp->inp_vflag); db_printf(")\n"); db_print_indent(indent); db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n", inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl); db_print_indent(indent); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) { db_printf("in6p_options: %p in6p_outputopts: %p " "in6p_moptions: %p\n", inp->in6p_options, inp->in6p_outputopts, inp->in6p_moptions); db_printf("in6p_icmp6filt: %p in6p_cksum %d " "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum, inp->in6p_hops); } else #endif { db_printf("inp_ip_tos: %d inp_ip_options: %p " "inp_ip_moptions: %p\n", inp->inp_ip_tos, inp->inp_options, inp->inp_moptions); } db_print_indent(indent); db_printf("inp_phd: %p inp_gencnt: %ju\n", inp->inp_phd, (uintmax_t)inp->inp_gencnt); } DB_SHOW_COMMAND(inpcb, db_show_inpcb) { struct inpcb *inp; if (!have_addr) { db_printf("usage: show inpcb \n"); return; } inp = (struct inpcb *)addr; db_print_inpcb(inp, "inpcb", 0); } #endif /* DDB */ Index: user/ae/inet6/sys/netinet/ip_options.c =================================================================== --- user/ae/inet6/sys/netinet/ip_options.c (revision 271452) +++ user/ae/inet6/sys/netinet/ip_options.c (revision 271453) @@ -1,737 +1,740 @@ /* * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. * Copyright (c) 2005 Andre Oppermann, Internet Business Solutions AG. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ipstealth.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int ip_dosourceroute = 0; SYSCTL_INT(_net_inet_ip, IPCTL_SOURCEROUTE, sourceroute, CTLFLAG_RW, &ip_dosourceroute, 0, "Enable forwarding source routed IP packets"); static int ip_acceptsourceroute = 0; SYSCTL_INT(_net_inet_ip, IPCTL_ACCEPTSOURCEROUTE, accept_sourceroute, CTLFLAG_RW, &ip_acceptsourceroute, 0, "Enable accepting source routed IP packets"); int ip_doopts = 1; /* 0 = ignore, 1 = process, 2 = reject */ SYSCTL_INT(_net_inet_ip, OID_AUTO, process_options, CTLFLAG_RW, &ip_doopts, 0, "Enable IP options processing ([LS]SRR, RR, TS)"); static void save_rte(struct mbuf *m, u_char *, struct in_addr); /* * Do option processing on a datagram, possibly discarding it if bad options * are encountered, or forwarding it if source-routed. * * The pass argument is used when operating in the IPSTEALTH mode to tell * what options to process: [LS]SRR (pass 0) or the others (pass 1). The * reason for as many as two passes is that when doing IPSTEALTH, non-routing * options should be processed only if the packet is for us. * * Returns 1 if packet has been forwarded/freed, 0 if the packet should be * processed further. */ int ip_dooptions(struct mbuf *m, int pass) { struct ip *ip = mtod(m, struct ip *); u_char *cp; struct in_ifaddr *ia; int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0; struct in_addr *sin, dst; uint32_t ntime; struct sockaddr_in ipaddr = { sizeof(ipaddr), AF_INET }; /* Ignore or reject packets with IP options. */ if (ip_doopts == 0) return 0; else if (ip_doopts == 2) { type = ICMP_UNREACH; code = ICMP_UNREACH_FILTER_PROHIB; goto bad; } dst = ip->ip_dst; cp = (u_char *)(ip + 1); cnt = (ip->ip_hl << 2) - sizeof (struct ip); for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) optlen = 1; else { if (cnt < IPOPT_OLEN + sizeof(*cp)) { code = &cp[IPOPT_OLEN] - (u_char *)ip; goto bad; } optlen = cp[IPOPT_OLEN]; if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) { code = &cp[IPOPT_OLEN] - (u_char *)ip; goto bad; } } switch (opt) { default: break; /* * Source routing with record. Find interface with current * destination address. If none on this machine then drop if * strictly routed, or do nothing if loosely routed. Record * interface address and bring up next address component. If * strictly routed make sure next address is on directly * accessible net. */ case IPOPT_LSRR: case IPOPT_SSRR: #ifdef IPSTEALTH if (V_ipstealth && pass > 0) break; #endif if (optlen < IPOPT_OFFSET + sizeof(*cp)) { code = &cp[IPOPT_OLEN] - (u_char *)ip; goto bad; } if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } ipaddr.sin_addr = ip->ip_dst; if (ifa_ifwithaddr_check((struct sockaddr *)&ipaddr) == 0) { if (opt == IPOPT_SSRR) { type = ICMP_UNREACH; code = ICMP_UNREACH_SRCFAIL; goto bad; } if (!ip_dosourceroute) goto nosourcerouting; /* * Loose routing, and not at next destination * yet; nothing to do except forward. */ break; } off--; /* 0 origin */ if (off > optlen - (int)sizeof(struct in_addr)) { /* * End of source route. Should be for us. */ if (!ip_acceptsourceroute) goto nosourcerouting; save_rte(m, cp, ip->ip_src); break; } #ifdef IPSTEALTH if (V_ipstealth) goto dropit; #endif if (!ip_dosourceroute) { if (V_ipforwarding) { char buf[16]; /* aaa.bbb.ccc.ddd\0 */ /* * Acting as a router, so generate * ICMP */ nosourcerouting: strcpy(buf, inet_ntoa(ip->ip_dst)); log(LOG_WARNING, "attempted source route from %s to %s\n", inet_ntoa(ip->ip_src), buf); type = ICMP_UNREACH; code = ICMP_UNREACH_SRCFAIL; goto bad; } else { /* * Not acting as a router, so * silently drop. */ #ifdef IPSTEALTH dropit: #endif IPSTAT_INC(ips_cantforward); m_freem(m); return (1); } } /* * locate outgoing interface */ (void)memcpy(&ipaddr.sin_addr, cp + off, sizeof(ipaddr.sin_addr)); if (opt == IPOPT_SSRR) { #define INA struct in_ifaddr * #define SA struct sockaddr * - if ((ia = (INA)ifa_ifwithdstaddr((SA)&ipaddr)) == NULL) - ia = (INA)ifa_ifwithnet((SA)&ipaddr, 0); + ia = (INA)ifa_ifwithdstaddr((SA)&ipaddr, + RT_ALL_FIBS); + if (ia == NULL) + ia = (INA)ifa_ifwithnet((SA)&ipaddr, 0, + RT_ALL_FIBS); } else /* XXX MRT 0 for routing */ ia = ip_rtaddr(ipaddr.sin_addr, M_GETFIB(m)); if (ia == NULL) { type = ICMP_UNREACH; code = ICMP_UNREACH_SRCFAIL; goto bad; } ip->ip_dst = ipaddr.sin_addr; (void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr), sizeof(struct in_addr)); ifa_free(&ia->ia_ifa); cp[IPOPT_OFFSET] += sizeof(struct in_addr); /* * Let ip_intr's mcast routing check handle mcast pkts */ forward = !IN_MULTICAST(ntohl(ip->ip_dst.s_addr)); break; case IPOPT_RR: #ifdef IPSTEALTH if (V_ipstealth && pass == 0) break; #endif if (optlen < IPOPT_OFFSET + sizeof(*cp)) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } /* * If no space remains, ignore. */ off--; /* 0 origin */ if (off > optlen - (int)sizeof(struct in_addr)) break; (void)memcpy(&ipaddr.sin_addr, &ip->ip_dst, sizeof(ipaddr.sin_addr)); /* * Locate outgoing interface; if we're the * destination, use the incoming interface (should be * same). */ if ((ia = (INA)ifa_ifwithaddr((SA)&ipaddr)) == NULL && (ia = ip_rtaddr(ipaddr.sin_addr, M_GETFIB(m))) == NULL) { type = ICMP_UNREACH; code = ICMP_UNREACH_HOST; goto bad; } (void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr), sizeof(struct in_addr)); ifa_free(&ia->ia_ifa); cp[IPOPT_OFFSET] += sizeof(struct in_addr); break; case IPOPT_TS: #ifdef IPSTEALTH if (V_ipstealth && pass == 0) break; #endif code = cp - (u_char *)ip; if (optlen < 4 || optlen > 40) { code = &cp[IPOPT_OLEN] - (u_char *)ip; goto bad; } if ((off = cp[IPOPT_OFFSET]) < 5) { code = &cp[IPOPT_OLEN] - (u_char *)ip; goto bad; } if (off > optlen - (int)sizeof(int32_t)) { cp[IPOPT_OFFSET + 1] += (1 << 4); if ((cp[IPOPT_OFFSET + 1] & 0xf0) == 0) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } break; } off--; /* 0 origin */ sin = (struct in_addr *)(cp + off); switch (cp[IPOPT_OFFSET + 1] & 0x0f) { case IPOPT_TS_TSONLY: break; case IPOPT_TS_TSANDADDR: if (off + sizeof(uint32_t) + sizeof(struct in_addr) > optlen) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } ipaddr.sin_addr = dst; ia = (INA)ifaof_ifpforaddr((SA)&ipaddr, m->m_pkthdr.rcvif); if (ia == NULL) continue; (void)memcpy(sin, &IA_SIN(ia)->sin_addr, sizeof(struct in_addr)); ifa_free(&ia->ia_ifa); cp[IPOPT_OFFSET] += sizeof(struct in_addr); off += sizeof(struct in_addr); break; case IPOPT_TS_PRESPEC: if (off + sizeof(uint32_t) + sizeof(struct in_addr) > optlen) { code = &cp[IPOPT_OFFSET] - (u_char *)ip; goto bad; } (void)memcpy(&ipaddr.sin_addr, sin, sizeof(struct in_addr)); if (ifa_ifwithaddr_check((SA)&ipaddr) == 0) continue; cp[IPOPT_OFFSET] += sizeof(struct in_addr); off += sizeof(struct in_addr); break; default: code = &cp[IPOPT_OFFSET + 1] - (u_char *)ip; goto bad; } ntime = iptime(); (void)memcpy(cp + off, &ntime, sizeof(uint32_t)); cp[IPOPT_OFFSET] += sizeof(uint32_t); } } if (forward && V_ipforwarding) { ip_forward(m, 1); return (1); } return (0); bad: icmp_error(m, type, code, 0, 0); IPSTAT_INC(ips_badoptions); return (1); } /* * Save incoming source route for use in replies, to be picked up later by * ip_srcroute if the receiver is interested. */ static void save_rte(struct mbuf *m, u_char *option, struct in_addr dst) { unsigned olen; struct ipopt_tag *opts; opts = (struct ipopt_tag *)m_tag_get(PACKET_TAG_IPOPTIONS, sizeof(struct ipopt_tag), M_NOWAIT); if (opts == NULL) return; olen = option[IPOPT_OLEN]; if (olen > sizeof(opts->ip_srcrt) - (1 + sizeof(dst))) { m_tag_free((struct m_tag *)opts); return; } bcopy(option, opts->ip_srcrt.srcopt, olen); opts->ip_nhops = (olen - IPOPT_OFFSET - 1) / sizeof(struct in_addr); opts->ip_srcrt.dst = dst; m_tag_prepend(m, (struct m_tag *)opts); } /* * Retrieve incoming source route for use in replies, in the same form used * by setsockopt. The first hop is placed before the options, will be * removed later. */ struct mbuf * ip_srcroute(struct mbuf *m0) { struct in_addr *p, *q; struct mbuf *m; struct ipopt_tag *opts; opts = (struct ipopt_tag *)m_tag_find(m0, PACKET_TAG_IPOPTIONS, NULL); if (opts == NULL) return (NULL); if (opts->ip_nhops == 0) return (NULL); m = m_get(M_NOWAIT, MT_DATA); if (m == NULL) return (NULL); #define OPTSIZ (sizeof(opts->ip_srcrt.nop) + sizeof(opts->ip_srcrt.srcopt)) /* length is (nhops+1)*sizeof(addr) + sizeof(nop + srcrt header) */ m->m_len = opts->ip_nhops * sizeof(struct in_addr) + sizeof(struct in_addr) + OPTSIZ; /* * First, save first hop for return route. */ p = &(opts->ip_srcrt.route[opts->ip_nhops - 1]); *(mtod(m, struct in_addr *)) = *p--; /* * Copy option fields and padding (nop) to mbuf. */ opts->ip_srcrt.nop = IPOPT_NOP; opts->ip_srcrt.srcopt[IPOPT_OFFSET] = IPOPT_MINOFF; (void)memcpy(mtod(m, caddr_t) + sizeof(struct in_addr), &(opts->ip_srcrt.nop), OPTSIZ); q = (struct in_addr *)(mtod(m, caddr_t) + sizeof(struct in_addr) + OPTSIZ); #undef OPTSIZ /* * Record return path as an IP source route, reversing the path * (pointers are now aligned). */ while (p >= opts->ip_srcrt.route) { *q++ = *p--; } /* * Last hop goes to final destination. */ *q = opts->ip_srcrt.dst; m_tag_delete(m0, (struct m_tag *)opts); return (m); } /* * Strip out IP options, at higher level protocol in the kernel. */ void ip_stripoptions(struct mbuf *m) { struct ip *ip = mtod(m, struct ip *); int olen; olen = (ip->ip_hl << 2) - sizeof(struct ip); m->m_len -= olen; if (m->m_flags & M_PKTHDR) m->m_pkthdr.len -= olen; ip->ip_len = htons(ntohs(ip->ip_len) - olen); ip->ip_hl = sizeof(struct ip) >> 2; bcopy((char *)ip + sizeof(struct ip) + olen, (ip + 1), (size_t )(m->m_len - sizeof(struct ip))); } /* * Insert IP options into preformed packet. Adjust IP destination as * required for IP source routing, as indicated by a non-zero in_addr at the * start of the options. * * XXX This routine assumes that the packet has no options in place. */ struct mbuf * ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen) { struct ipoption *p = mtod(opt, struct ipoption *); struct mbuf *n; struct ip *ip = mtod(m, struct ip *); unsigned optlen; optlen = opt->m_len - sizeof(p->ipopt_dst); if (optlen + ntohs(ip->ip_len) > IP_MAXPACKET) { *phlen = 0; return (m); /* XXX should fail */ } if (p->ipopt_dst.s_addr) ip->ip_dst = p->ipopt_dst; if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) { n = m_gethdr(M_NOWAIT, MT_DATA); if (n == NULL) { *phlen = 0; return (m); } m_move_pkthdr(n, m); n->m_pkthdr.rcvif = NULL; n->m_pkthdr.len += optlen; m->m_len -= sizeof(struct ip); m->m_data += sizeof(struct ip); n->m_next = m; m = n; m->m_len = optlen + sizeof(struct ip); m->m_data += max_linkhdr; bcopy(ip, mtod(m, void *), sizeof(struct ip)); } else { m->m_data -= optlen; m->m_len += optlen; m->m_pkthdr.len += optlen; bcopy(ip, mtod(m, void *), sizeof(struct ip)); } ip = mtod(m, struct ip *); bcopy(p->ipopt_list, ip + 1, optlen); *phlen = sizeof(struct ip) + optlen; ip->ip_v = IPVERSION; ip->ip_hl = *phlen >> 2; ip->ip_len = htons(ntohs(ip->ip_len) + optlen); return (m); } /* * Copy options from ip to jp, omitting those not copied during * fragmentation. */ int ip_optcopy(struct ip *ip, struct ip *jp) { u_char *cp, *dp; int opt, optlen, cnt; cp = (u_char *)(ip + 1); dp = (u_char *)(jp + 1); cnt = (ip->ip_hl << 2) - sizeof (struct ip); for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[0]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) { /* Preserve for IP mcast tunnel's LSRR alignment. */ *dp++ = IPOPT_NOP; optlen = 1; continue; } KASSERT(cnt >= IPOPT_OLEN + sizeof(*cp), ("ip_optcopy: malformed ipv4 option")); optlen = cp[IPOPT_OLEN]; KASSERT(optlen >= IPOPT_OLEN + sizeof(*cp) && optlen <= cnt, ("ip_optcopy: malformed ipv4 option")); /* Bogus lengths should have been caught by ip_dooptions. */ if (optlen > cnt) optlen = cnt; if (IPOPT_COPIED(opt)) { bcopy(cp, dp, optlen); dp += optlen; } } for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++) *dp++ = IPOPT_EOL; return (optlen); } /* * Set up IP options in pcb for insertion in output packets. Store in mbuf * with pointer in pcbopt, adding pseudo-option with destination address if * source routed. */ int ip_pcbopts(struct inpcb *inp, int optname, struct mbuf *m) { int cnt, optlen; u_char *cp; struct mbuf **pcbopt; u_char opt; INP_WLOCK_ASSERT(inp); pcbopt = &inp->inp_options; /* turn off any old options */ if (*pcbopt) (void)m_free(*pcbopt); *pcbopt = 0; if (m == NULL || m->m_len == 0) { /* * Only turning off any previous options. */ if (m != NULL) (void)m_free(m); return (0); } if (m->m_len % sizeof(int32_t)) goto bad; /* * IP first-hop destination address will be stored before actual * options; move other options back and clear it when none present. */ if (m->m_data + m->m_len + sizeof(struct in_addr) >= &m->m_dat[MLEN]) goto bad; cnt = m->m_len; m->m_len += sizeof(struct in_addr); cp = mtod(m, u_char *) + sizeof(struct in_addr); bcopy(mtod(m, void *), cp, (unsigned)cnt); bzero(mtod(m, void *), sizeof(struct in_addr)); for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) optlen = 1; else { if (cnt < IPOPT_OLEN + sizeof(*cp)) goto bad; optlen = cp[IPOPT_OLEN]; if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) goto bad; } switch (opt) { default: break; case IPOPT_LSRR: case IPOPT_SSRR: /* * User process specifies route as: * * ->A->B->C->D * * D must be our final destination (but we can't * check that since we may not have connected yet). * A is first hop destination, which doesn't appear * in actual IP option, but is stored before the * options. */ /* XXX-BZ PRIV_NETINET_SETHDROPTS? */ if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr)) goto bad; m->m_len -= sizeof(struct in_addr); cnt -= sizeof(struct in_addr); optlen -= sizeof(struct in_addr); cp[IPOPT_OLEN] = optlen; /* * Move first hop before start of options. */ bcopy((caddr_t)&cp[IPOPT_OFFSET+1], mtod(m, caddr_t), sizeof(struct in_addr)); /* * Then copy rest of options back * to close up the deleted entry. */ bcopy((&cp[IPOPT_OFFSET+1] + sizeof(struct in_addr)), &cp[IPOPT_OFFSET+1], (unsigned)cnt - (IPOPT_MINOFF - 1)); break; } } if (m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr)) goto bad; *pcbopt = m; return (0); bad: (void)m_free(m); return (EINVAL); } /* * Check for the presence of the IP Router Alert option [RFC2113] * in the header of an IPv4 datagram. * * This call is not intended for use from the forwarding path; it is here * so that protocol domains may check for the presence of the option. * Given how FreeBSD's IPv4 stack is currently structured, the Router Alert * option does not have much relevance to the implementation, though this * may change in future. * Router alert options SHOULD be passed if running in IPSTEALTH mode and * we are not the endpoint. * Length checks on individual options should already have been peformed * by ip_dooptions() therefore they are folded under INVARIANTS here. * * Return zero if not present or options are invalid, non-zero if present. */ int ip_checkrouteralert(struct mbuf *m) { struct ip *ip = mtod(m, struct ip *); u_char *cp; int opt, optlen, cnt, found_ra; found_ra = 0; cp = (u_char *)(ip + 1); cnt = (ip->ip_hl << 2) - sizeof (struct ip); for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) optlen = 1; else { #ifdef INVARIANTS if (cnt < IPOPT_OLEN + sizeof(*cp)) break; #endif optlen = cp[IPOPT_OLEN]; #ifdef INVARIANTS if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) break; #endif } switch (opt) { case IPOPT_RA: #ifdef INVARIANTS if (optlen != IPOPT_OFFSET + sizeof(uint16_t) || (*((uint16_t *)&cp[IPOPT_OFFSET]) != 0)) break; else #endif found_ra = 1; break; default: break; } } return (found_ra); } Index: user/ae/inet6/sys/netinet/ip_output.c =================================================================== --- user/ae/inet6/sys/netinet/ip_output.c (revision 271452) +++ user/ae/inet6/sys/netinet/ip_output.c (revision 271453) @@ -1,1416 +1,1420 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_ipfw.h" #include "opt_ipsec.h" #include "opt_mbuf_stress_test.h" #include "opt_mpath.h" #include "opt_route.h" #include "opt_sctp.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RADIX_MPATH #include #endif #include #include #include #include #include #include #include #include #include #include #ifdef SCTP #include #include #endif #ifdef IPSEC #include #include #endif /* IPSEC*/ #include #include VNET_DEFINE(u_short, ip_id); #ifdef MBUF_STRESS_TEST static int mbuf_frag_size = 0; SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW, &mbuf_frag_size, 0, "Fragment outgoing mbufs to this size"); #endif static void ip_mloopback (struct ifnet *, struct mbuf *, struct sockaddr_in *, int); extern int in_mcast_loop; extern struct protosw inetsw[]; /* * IP output. The packet in mbuf chain m contains a skeletal IP * header (with len, off, ttl, proto, tos, src, dst). * The mbuf chain containing the packet will be freed. * The mbuf opt, if present, will not be freed. * If route ro is present and has ro_rt initialized, route lookup would be * skipped and ro->ro_rt would be used. If ro is present but ro->ro_rt is NULL, * then result of route lookup is stored in ro->ro_rt. * * In the IP forwarding case, the packet will arrive with options already * inserted, so must have a NULL opt pointer. */ int ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, struct ip_moptions *imo, struct inpcb *inp) { struct ip *ip; struct ifnet *ifp = NULL; /* keep compiler happy */ struct mbuf *m0; int hlen = sizeof (struct ip); int mtu; int error = 0; struct sockaddr_in *dst; const struct sockaddr_in *gw; struct in_ifaddr *ia; int isbroadcast; uint16_t ip_len, ip_off; struct route iproute; struct rtentry *rte; /* cache for ro->ro_rt */ struct in_addr odst; struct m_tag *fwd_tag = NULL; int have_ia_ref; #ifdef IPSEC int no_route_but_check_spd = 0; #endif M_ASSERTPKTHDR(m); if (inp != NULL) { INP_LOCK_ASSERT(inp); M_SETFIB(m, inp->inp_inc.inc_fibnum); if (((flags & IP_NODEFAULTFLOWID) == 0) && inp->inp_flags & (INP_HW_FLOWID|INP_SW_FLOWID)) { m->m_pkthdr.flowid = inp->inp_flowid; M_HASHTYPE_SET(m, inp->inp_flowtype); m->m_flags |= M_FLOWID; } } if (ro == NULL) { ro = &iproute; bzero(ro, sizeof (*ro)); } #ifdef FLOWTABLE if (ro->ro_rt == NULL) (void )flowtable_lookup(AF_INET, m, ro); #endif if (opt) { int len = 0; m = ip_insertoptions(m, opt, &len); if (len != 0) hlen = len; /* ip->ip_hl is updated above */ } ip = mtod(m, struct ip *); ip_len = ntohs(ip->ip_len); ip_off = ntohs(ip->ip_off); /* * Fill in IP header. If we are not allowing fragmentation, * then the ip_id field is meaningless, but we don't set it * to zero. Doing so causes various problems when devices along * the path (routers, load balancers, firewalls, etc.) illegally * disable DF on our packet. Note that a 16-bit counter * will wrap around in less than 10 seconds at 100 Mbit/s on a * medium with MTU 1500. See Steven M. Bellovin, "A Technique * for Counting NATted Hosts", Proc. IMW'02, available at * . */ if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) { ip->ip_v = IPVERSION; ip->ip_hl = hlen >> 2; ip->ip_id = ip_newid(); IPSTAT_INC(ips_localout); } else { /* Header already set, fetch hlen from there */ hlen = ip->ip_hl << 2; } /* * dst/gw handling: * * dst can be rewritten but always points to &ro->ro_dst. * gw is readonly but can point either to dst OR rt_gateway, * therefore we need restore gw if we're redoing lookup. */ gw = dst = (struct sockaddr_in *)&ro->ro_dst; again: ia = NULL; have_ia_ref = 0; /* * If there is a cached route, check that it is to the same * destination and is still up. If not, free it and try again. * The address family should also be checked in case of sharing * the cache with IPv6. */ rte = ro->ro_rt; if (rte && ((rte->rt_flags & RTF_UP) == 0 || rte->rt_ifp == NULL || !RT_LINK_IS_UP(rte->rt_ifp) || dst->sin_family != AF_INET || dst->sin_addr.s_addr != ip->ip_dst.s_addr)) { RO_RTFREE(ro); ro->ro_lle = NULL; rte = NULL; gw = dst; } if (rte == NULL && fwd_tag == NULL) { bzero(dst, sizeof(*dst)); dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = ip->ip_dst; } /* * If routing to interface only, short circuit routing lookup. * The use of an all-ones broadcast address implies this; an * interface is specified by the broadcast address of an interface, * or the destination address of a ptp interface. */ if (flags & IP_SENDONES) { - if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst)))) == NULL && - (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL) { + if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst), + RT_ALL_FIBS))) == NULL && + (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst), + RT_ALL_FIBS))) == NULL) { IPSTAT_INC(ips_noroute); error = ENETUNREACH; goto bad; } have_ia_ref = 1; ip->ip_dst.s_addr = INADDR_BROADCAST; dst->sin_addr = ip->ip_dst; ifp = ia->ia_ifp; ip->ip_ttl = 1; isbroadcast = 1; } else if (flags & IP_ROUTETOIF) { - if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL && - (ia = ifatoia(ifa_ifwithnet(sintosa(dst), 0))) == NULL) { + if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst), + RT_ALL_FIBS))) == NULL && + (ia = ifatoia(ifa_ifwithnet(sintosa(dst), 0, + RT_ALL_FIBS))) == NULL) { IPSTAT_INC(ips_noroute); error = ENETUNREACH; goto bad; } have_ia_ref = 1; ifp = ia->ia_ifp; ip->ip_ttl = 1; isbroadcast = in_broadcast(dst->sin_addr, ifp); } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && imo != NULL && imo->imo_multicast_ifp != NULL) { /* * Bypass the normal routing lookup for multicast * packets if the interface is specified. */ ifp = imo->imo_multicast_ifp; IFP_TO_IA(ifp, ia); if (ia) have_ia_ref = 1; isbroadcast = 0; /* fool gcc */ } else { /* * We want to do any cloning requested by the link layer, * as this is probably required in all cases for correct * operation (as it is for ARP). */ if (rte == NULL) { #ifdef RADIX_MPATH rtalloc_mpath_fib(ro, ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr), inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m)); #else in_rtalloc_ign(ro, 0, inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m)); #endif rte = ro->ro_rt; } if (rte == NULL || rte->rt_ifp == NULL || !RT_LINK_IS_UP(rte->rt_ifp)) { #ifdef IPSEC /* * There is no route for this packet, but it is * possible that a matching SPD entry exists. */ no_route_but_check_spd = 1; mtu = 0; /* Silence GCC warning. */ goto sendit; #endif IPSTAT_INC(ips_noroute); error = EHOSTUNREACH; goto bad; } ia = ifatoia(rte->rt_ifa); ifp = rte->rt_ifp; counter_u64_add(rte->rt_pksent, 1); if (rte->rt_flags & RTF_GATEWAY) gw = (struct sockaddr_in *)rte->rt_gateway; if (rte->rt_flags & RTF_HOST) isbroadcast = (rte->rt_flags & RTF_BROADCAST); else isbroadcast = in_broadcast(gw->sin_addr, ifp); } /* * Calculate MTU. If we have a route that is up, use that, * otherwise use the interface's MTU. */ if (rte != NULL && (rte->rt_flags & (RTF_UP|RTF_HOST))) { /* * This case can happen if the user changed the MTU * of an interface after enabling IP on it. Because * most netifs don't keep track of routes pointing to * them, there is no way for one to update all its * routes when the MTU is changed. */ if (rte->rt_mtu > ifp->if_mtu) rte->rt_mtu = ifp->if_mtu; mtu = rte->rt_mtu; } else { mtu = ifp->if_mtu; } /* Catch a possible divide by zero later. */ KASSERT(mtu > 0, ("%s: mtu %d <= 0, rte=%p (rt_flags=0x%08x) ifp=%p", __func__, mtu, rte, (rte != NULL) ? rte->rt_flags : 0, ifp)); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { m->m_flags |= M_MCAST; /* * IP destination address is multicast. Make sure "gw" * still points to the address in "ro". (It may have been * changed to point to a gateway address, above.) */ gw = dst; /* * See if the caller provided any multicast options */ if (imo != NULL) { ip->ip_ttl = imo->imo_multicast_ttl; if (imo->imo_multicast_vif != -1) ip->ip_src.s_addr = ip_mcast_src ? ip_mcast_src(imo->imo_multicast_vif) : INADDR_ANY; } else ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; /* * Confirm that the outgoing interface supports multicast. */ if ((imo == NULL) || (imo->imo_multicast_vif == -1)) { if ((ifp->if_flags & IFF_MULTICAST) == 0) { IPSTAT_INC(ips_noroute); error = ENETUNREACH; goto bad; } } /* * If source address not specified yet, use address * of outgoing interface. */ if (ip->ip_src.s_addr == INADDR_ANY) { /* Interface may have no addresses. */ if (ia != NULL) ip->ip_src = IA_SIN(ia)->sin_addr; } if ((imo == NULL && in_mcast_loop) || (imo && imo->imo_multicast_loop)) { /* * Loop back multicast datagram if not expressly * forbidden to do so, even if we are not a member * of the group; ip_input() will filter it later, * thus deferring a hash lookup and mutex acquisition * at the expense of a cheap copy using m_copym(). */ ip_mloopback(ifp, m, dst, hlen); } else { /* * If we are acting as a multicast router, perform * multicast forwarding as if the packet had just * arrived on the interface to which we are about * to send. The multicast forwarding function * recursively calls this function, using the * IP_FORWARDING flag to prevent infinite recursion. * * Multicasts that are looped back by ip_mloopback(), * above, will be forwarded by the ip_input() routine, * if necessary. */ if (V_ip_mrouter && (flags & IP_FORWARDING) == 0) { /* * If rsvp daemon is not running, do not * set ip_moptions. This ensures that the packet * is multicast and not just sent down one link * as prescribed by rsvpd. */ if (!V_rsvp_on) imo = NULL; if (ip_mforward && ip_mforward(ip, ifp, m, imo) != 0) { m_freem(m); goto done; } } } /* * Multicasts with a time-to-live of zero may be looped- * back, above, but must not be transmitted on a network. * Also, multicasts addressed to the loopback interface * are not sent -- the above call to ip_mloopback() will * loop back a copy. ip_input() will drop the copy if * this host does not belong to the destination group on * the loopback interface. */ if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) { m_freem(m); goto done; } goto sendit; } /* * If the source address is not specified yet, use the address * of the outoing interface. */ if (ip->ip_src.s_addr == INADDR_ANY) { /* Interface may have no addresses. */ if (ia != NULL) { ip->ip_src = IA_SIN(ia)->sin_addr; } } /* * Both in the SMP world, pre-emption world if_transmit() world, * the following code doesn't really function as intended any further. * * + There can and will be multiple CPUs running this code path * in parallel, and we do no lock holding when checking the * queue depth; * + And since other threads can be running concurrently, even if * we do pass this check, another thread may queue some frames * before this thread does and it will end up partially or fully * failing to send anyway; * + if_transmit() based drivers don't necessarily set ifq_len * at all. * * This should be replaced with a method of pushing an entire list * of fragment frames to the driver and have the driver decide * whether it can queue or not queue the entire set. */ #if 0 /* * Verify that we have any chance at all of being able to queue the * packet or packet fragments, unless ALTQ is enabled on the given * interface in which case packetdrop should be done by queueing. */ n = ip_len / mtu + 1; /* how many fragments ? */ if ( #ifdef ALTQ (!ALTQ_IS_ENABLED(&ifp->if_snd)) && #endif /* ALTQ */ (ifp->if_snd.ifq_len + n) >= ifp->if_snd.ifq_maxlen ) { error = ENOBUFS; IPSTAT_INC(ips_odropped); ifp->if_snd.ifq_drops += n; goto bad; } #endif /* * Look for broadcast address and * verify user is allowed to send * such a packet. */ if (isbroadcast) { if ((ifp->if_flags & IFF_BROADCAST) == 0) { error = EADDRNOTAVAIL; goto bad; } if ((flags & IP_ALLOWBROADCAST) == 0) { error = EACCES; goto bad; } /* don't allow broadcast messages to be fragmented */ if (ip_len > mtu) { error = EMSGSIZE; goto bad; } m->m_flags |= M_BCAST; } else { m->m_flags &= ~M_BCAST; } sendit: #ifdef IPSEC switch(ip_ipsec_output(&m, inp, &flags, &error)) { case 1: goto bad; case -1: goto done; case 0: default: break; /* Continue with packet processing. */ } /* * Check if there was a route for this packet; return error if not. */ if (no_route_but_check_spd) { IPSTAT_INC(ips_noroute); error = EHOSTUNREACH; goto bad; } /* Update variables that are affected by ipsec4_output(). */ ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; #endif /* IPSEC */ /* Jump over all PFIL processing if hooks are not active. */ if (!PFIL_HOOKED(&V_inet_pfil_hook)) goto passout; /* Run through list of hooks for output packets. */ odst.s_addr = ip->ip_dst.s_addr; error = pfil_run_hooks(&V_inet_pfil_hook, &m, ifp, PFIL_OUT, inp); if (error != 0 || m == NULL) goto done; ip = mtod(m, struct ip *); /* See if destination IP address was changed by packet filter. */ if (odst.s_addr != ip->ip_dst.s_addr) { m->m_flags |= M_SKIP_FIREWALL; /* If destination is now ourself drop to ip_input(). */ if (in_localip(ip->ip_dst)) { m->m_flags |= M_FASTFWD_OURS; if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID; #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP) m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; #endif error = netisr_queue(NETISR_IP, m); goto done; } else { if (have_ia_ref) ifa_free(&ia->ia_ifa); goto again; /* Redo the routing table lookup. */ } } /* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */ if (m->m_flags & M_FASTFWD_OURS) { if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP) m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; #endif m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID; error = netisr_queue(NETISR_IP, m); goto done; } /* Or forward to some other address? */ if ((m->m_flags & M_IP_NEXTHOP) && (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) { bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in)); m->m_flags |= M_SKIP_FIREWALL; m->m_flags &= ~M_IP_NEXTHOP; m_tag_delete(m, fwd_tag); if (have_ia_ref) ifa_free(&ia->ia_ifa); goto again; } passout: /* 127/8 must not appear on wire - RFC1122. */ if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { if ((ifp->if_flags & IFF_LOOPBACK) == 0) { IPSTAT_INC(ips_badaddr); error = EADDRNOTAVAIL; goto bad; } } m->m_pkthdr.csum_flags |= CSUM_IP; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) { in_delayed_cksum(m); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) { sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2)); m->m_pkthdr.csum_flags &= ~CSUM_SCTP; } #endif /* * If small enough for interface, or the interface will take * care of the fragmentation for us, we can just send directly. */ if (ip_len <= mtu || (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0) { ip->ip_sum = 0; if (m->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) { ip->ip_sum = in_cksum(m, hlen); m->m_pkthdr.csum_flags &= ~CSUM_IP; } /* * Record statistics for this interface address. * With CSUM_TSO the byte/packet count will be slightly * incorrect because we count the IP+TCP headers only * once instead of for every generated packet. */ if (!(flags & IP_FORWARDING) && ia) { if (m->m_pkthdr.csum_flags & CSUM_TSO) counter_u64_add(ia->ia_ifa.ifa_opackets, m->m_pkthdr.len / m->m_pkthdr.tso_segsz); else counter_u64_add(ia->ia_ifa.ifa_opackets, 1); counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len); } #ifdef MBUF_STRESS_TEST if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size) m = m_fragment(m, M_NOWAIT, mbuf_frag_size); #endif /* * Reset layer specific mbuf flags * to avoid confusing lower layers. */ m_clrprotoflags(m); IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL); error = (*ifp->if_output)(ifp, m, (const struct sockaddr *)gw, ro); goto done; } /* Balk when DF bit is set or the interface didn't support TSO. */ if ((ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) { error = EMSGSIZE; IPSTAT_INC(ips_cantfrag); goto bad; } /* * Too large for interface; fragment if possible. If successful, * on return, m will point to a list of packets to be sent. */ error = ip_fragment(ip, &m, mtu, ifp->if_hwassist); if (error) goto bad; for (; m; m = m0) { m0 = m->m_nextpkt; m->m_nextpkt = 0; if (error == 0) { /* Record statistics for this interface address. */ if (ia != NULL) { counter_u64_add(ia->ia_ifa.ifa_opackets, 1); counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len); } /* * Reset layer specific mbuf flags * to avoid confusing upper layers. */ m_clrprotoflags(m); IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL); error = (*ifp->if_output)(ifp, m, (const struct sockaddr *)gw, ro); } else m_freem(m); } if (error == 0) IPSTAT_INC(ips_fragmented); done: if (ro == &iproute) RO_RTFREE(ro); if (have_ia_ref) ifa_free(&ia->ia_ifa); return (error); bad: m_freem(m); goto done; } /* * Create a chain of fragments which fit the given mtu. m_frag points to the * mbuf to be fragmented; on return it points to the chain with the fragments. * Return 0 if no error. If error, m_frag may contain a partially built * chain of fragments that should be freed by the caller. * * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist) */ int ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, u_long if_hwassist_flags) { int error = 0; int hlen = ip->ip_hl << 2; int len = (mtu - hlen) & ~7; /* size of payload in each fragment */ int off; struct mbuf *m0 = *m_frag; /* the original packet */ int firstlen; struct mbuf **mnext; int nfrags; uint16_t ip_len, ip_off; ip_len = ntohs(ip->ip_len); ip_off = ntohs(ip->ip_off); if (ip_off & IP_DF) { /* Fragmentation not allowed */ IPSTAT_INC(ips_cantfrag); return EMSGSIZE; } /* * Must be able to put at least 8 bytes per fragment. */ if (len < 8) return EMSGSIZE; /* * If the interface will not calculate checksums on * fragmented packets, then do it here. */ if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(m0); m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } #ifdef SCTP if (m0->m_pkthdr.csum_flags & CSUM_SCTP) { sctp_delayed_cksum(m0, hlen); m0->m_pkthdr.csum_flags &= ~CSUM_SCTP; } #endif if (len > PAGE_SIZE) { /* * Fragment large datagrams such that each segment * contains a multiple of PAGE_SIZE amount of data, * plus headers. This enables a receiver to perform * page-flipping zero-copy optimizations. * * XXX When does this help given that sender and receiver * could have different page sizes, and also mtu could * be less than the receiver's page size ? */ int newlen; struct mbuf *m; for (m = m0, off = 0; m && (off+m->m_len) <= mtu; m = m->m_next) off += m->m_len; /* * firstlen (off - hlen) must be aligned on an * 8-byte boundary */ if (off < hlen) goto smart_frag_failure; off = ((off - hlen) & ~7) + hlen; newlen = (~PAGE_MASK) & mtu; if ((newlen + sizeof (struct ip)) > mtu) { /* we failed, go back the default */ smart_frag_failure: newlen = len; off = hlen + len; } len = newlen; } else { off = hlen + len; } firstlen = off - hlen; mnext = &m0->m_nextpkt; /* pointer to next packet */ /* * Loop through length of segment after first fragment, * make new header and copy data of each part and link onto chain. * Here, m0 is the original packet, m is the fragment being created. * The fragments are linked off the m_nextpkt of the original * packet, which after processing serves as the first fragment. */ for (nfrags = 1; off < ip_len; off += len, nfrags++) { struct ip *mhip; /* ip header on the fragment */ struct mbuf *m; int mhlen = sizeof (struct ip); m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; IPSTAT_INC(ips_odropped); goto done; } m->m_flags |= (m0->m_flags & M_MCAST); /* * In the first mbuf, leave room for the link header, then * copy the original IP header including options. The payload * goes into an additional mbuf chain returned by m_copym(). */ m->m_data += max_linkhdr; mhip = mtod(m, struct ip *); *mhip = *ip; if (hlen > sizeof (struct ip)) { mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip); mhip->ip_v = IPVERSION; mhip->ip_hl = mhlen >> 2; } m->m_len = mhlen; /* XXX do we need to add ip_off below ? */ mhip->ip_off = ((off - hlen) >> 3) + ip_off; if (off + len >= ip_len) len = ip_len - off; else mhip->ip_off |= IP_MF; mhip->ip_len = htons((u_short)(len + mhlen)); m->m_next = m_copym(m0, off, len, M_NOWAIT); if (m->m_next == NULL) { /* copy failed */ m_free(m); error = ENOBUFS; /* ??? */ IPSTAT_INC(ips_odropped); goto done; } m->m_pkthdr.len = mhlen + len; m->m_pkthdr.rcvif = NULL; #ifdef MAC mac_netinet_fragment(m0, m); #endif m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags; mhip->ip_off = htons(mhip->ip_off); mhip->ip_sum = 0; if (m->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) { mhip->ip_sum = in_cksum(m, mhlen); m->m_pkthdr.csum_flags &= ~CSUM_IP; } *mnext = m; mnext = &m->m_nextpkt; } IPSTAT_ADD(ips_ofragments, nfrags); /* * Update first fragment by trimming what's been copied out * and updating header. */ m_adj(m0, hlen + firstlen - ip_len); m0->m_pkthdr.len = hlen + firstlen; ip->ip_len = htons((u_short)m0->m_pkthdr.len); ip->ip_off = htons(ip_off | IP_MF); ip->ip_sum = 0; if (m0->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) { ip->ip_sum = in_cksum(m0, hlen); m0->m_pkthdr.csum_flags &= ~CSUM_IP; } done: *m_frag = m0; return error; } void in_delayed_cksum(struct mbuf *m) { struct ip *ip; uint16_t csum, offset, ip_len; ip = mtod(m, struct ip *); offset = ip->ip_hl << 2 ; ip_len = ntohs(ip->ip_len); csum = in_cksum_skip(m, ip_len, offset); if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0) csum = 0xffff; offset += m->m_pkthdr.csum_data; /* checksum offset */ /* find the mbuf in the chain where the checksum starts*/ while ((m != NULL) && (offset >= m->m_len)) { offset -= m->m_len; m = m->m_next; } KASSERT(m != NULL, ("in_delayed_cksum: checksum outside mbuf chain.")); KASSERT(offset + sizeof(u_short) <= m->m_len, ("in_delayed_cksum: checksum split between mbufs.")); *(u_short *)(m->m_data + offset) = csum; } /* * IP socket option processing. */ int ip_ctloutput(struct socket *so, struct sockopt *sopt) { struct inpcb *inp = sotoinpcb(so); int error, optval; #ifdef RSS uint32_t rss_bucket; int retval; #endif error = optval = 0; if (sopt->sopt_level != IPPROTO_IP) { error = EINVAL; if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_dir == SOPT_SET) { switch (sopt->sopt_name) { case SO_REUSEADDR: INP_WLOCK(inp); if ((so->so_options & SO_REUSEADDR) != 0) inp->inp_flags2 |= INP_REUSEADDR; else inp->inp_flags2 &= ~INP_REUSEADDR; INP_WUNLOCK(inp); error = 0; break; case SO_REUSEPORT: INP_WLOCK(inp); if ((so->so_options & SO_REUSEPORT) != 0) inp->inp_flags2 |= INP_REUSEPORT; else inp->inp_flags2 &= ~INP_REUSEPORT; INP_WUNLOCK(inp); error = 0; break; case SO_SETFIB: INP_WLOCK(inp); inp->inp_inc.inc_fibnum = so->so_fibnum; INP_WUNLOCK(inp); error = 0; break; default: break; } } return (error); } switch (sopt->sopt_dir) { case SOPT_SET: switch (sopt->sopt_name) { case IP_OPTIONS: #ifdef notyet case IP_RETOPTS: #endif { struct mbuf *m; if (sopt->sopt_valsize > MLEN) { error = EMSGSIZE; break; } m = m_get(sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; break; } m->m_len = sopt->sopt_valsize; error = sooptcopyin(sopt, mtod(m, char *), m->m_len, m->m_len); if (error) { m_free(m); break; } INP_WLOCK(inp); error = ip_pcbopts(inp, sopt->sopt_name, m); INP_WUNLOCK(inp); return (error); } case IP_BINDANY: if (sopt->sopt_td != NULL) { error = priv_check(sopt->sopt_td, PRIV_NETINET_BINDANY); if (error) break; } /* FALLTHROUGH */ case IP_BINDMULTI: #ifdef RSS case IP_RSS_LISTEN_BUCKET: #endif case IP_TOS: case IP_TTL: case IP_MINTTL: case IP_RECVOPTS: case IP_RECVRETOPTS: case IP_RECVDSTADDR: case IP_RECVTTL: case IP_RECVIF: case IP_FAITH: case IP_ONESBCAST: case IP_DONTFRAG: case IP_RECVTOS: case IP_RECVFLOWID: #ifdef RSS case IP_RECVRSSBUCKETID: #endif error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (sopt->sopt_name) { case IP_TOS: inp->inp_ip_tos = optval; break; case IP_TTL: inp->inp_ip_ttl = optval; break; case IP_MINTTL: if (optval >= 0 && optval <= MAXTTL) inp->inp_ip_minttl = optval; else error = EINVAL; break; #define OPTSET(bit) do { \ INP_WLOCK(inp); \ if (optval) \ inp->inp_flags |= bit; \ else \ inp->inp_flags &= ~bit; \ INP_WUNLOCK(inp); \ } while (0) #define OPTSET2(bit, val) do { \ INP_WLOCK(inp); \ if (val) \ inp->inp_flags2 |= bit; \ else \ inp->inp_flags2 &= ~bit; \ INP_WUNLOCK(inp); \ } while (0) case IP_RECVOPTS: OPTSET(INP_RECVOPTS); break; case IP_RECVRETOPTS: OPTSET(INP_RECVRETOPTS); break; case IP_RECVDSTADDR: OPTSET(INP_RECVDSTADDR); break; case IP_RECVTTL: OPTSET(INP_RECVTTL); break; case IP_RECVIF: OPTSET(INP_RECVIF); break; case IP_FAITH: OPTSET(INP_FAITH); break; case IP_ONESBCAST: OPTSET(INP_ONESBCAST); break; case IP_DONTFRAG: OPTSET(INP_DONTFRAG); break; case IP_BINDANY: OPTSET(INP_BINDANY); break; case IP_RECVTOS: OPTSET(INP_RECVTOS); break; case IP_BINDMULTI: OPTSET2(INP_BINDMULTI, optval); break; case IP_RECVFLOWID: OPTSET2(INP_RECVFLOWID, optval); break; #ifdef RSS case IP_RSS_LISTEN_BUCKET: if ((optval >= 0) && (optval < rss_getnumbuckets())) { inp->inp_rss_listen_bucket = optval; OPTSET2(INP_RSS_BUCKET_SET, 1); } else { error = EINVAL; } break; case IP_RECVRSSBUCKETID: OPTSET2(INP_RECVRSSBUCKETID, optval); break; #endif } break; #undef OPTSET #undef OPTSET2 /* * Multicast socket options are processed by the in_mcast * module. */ case IP_MULTICAST_IF: case IP_MULTICAST_VIF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: case IP_ADD_MEMBERSHIP: case IP_DROP_MEMBERSHIP: case IP_ADD_SOURCE_MEMBERSHIP: case IP_DROP_SOURCE_MEMBERSHIP: case IP_BLOCK_SOURCE: case IP_UNBLOCK_SOURCE: case IP_MSFILTER: case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = inp_setmoptions(inp, sopt); break; case IP_PORTRANGE: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; INP_WLOCK(inp); switch (optval) { case IP_PORTRANGE_DEFAULT: inp->inp_flags &= ~(INP_LOWPORT); inp->inp_flags &= ~(INP_HIGHPORT); break; case IP_PORTRANGE_HIGH: inp->inp_flags &= ~(INP_LOWPORT); inp->inp_flags |= INP_HIGHPORT; break; case IP_PORTRANGE_LOW: inp->inp_flags &= ~(INP_HIGHPORT); inp->inp_flags |= INP_LOWPORT; break; default: error = EINVAL; break; } INP_WUNLOCK(inp); break; #ifdef IPSEC case IP_IPSEC_POLICY: { caddr_t req; struct mbuf *m; if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ break; if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ break; req = mtod(m, caddr_t); error = ipsec_set_policy(inp, sopt->sopt_name, req, m->m_len, (sopt->sopt_td != NULL) ? sopt->sopt_td->td_ucred : NULL); m_freem(m); break; } #endif /* IPSEC */ default: error = ENOPROTOOPT; break; } break; case SOPT_GET: switch (sopt->sopt_name) { case IP_OPTIONS: case IP_RETOPTS: if (inp->inp_options) error = sooptcopyout(sopt, mtod(inp->inp_options, char *), inp->inp_options->m_len); else sopt->sopt_valsize = 0; break; case IP_TOS: case IP_TTL: case IP_MINTTL: case IP_RECVOPTS: case IP_RECVRETOPTS: case IP_RECVDSTADDR: case IP_RECVTTL: case IP_RECVIF: case IP_PORTRANGE: case IP_FAITH: case IP_ONESBCAST: case IP_DONTFRAG: case IP_BINDANY: case IP_RECVTOS: case IP_BINDMULTI: case IP_FLOWID: case IP_FLOWTYPE: case IP_RECVFLOWID: #ifdef RSS case IP_RSSBUCKETID: case IP_RECVRSSBUCKETID: #endif switch (sopt->sopt_name) { case IP_TOS: optval = inp->inp_ip_tos; break; case IP_TTL: optval = inp->inp_ip_ttl; break; case IP_MINTTL: optval = inp->inp_ip_minttl; break; #define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0) #define OPTBIT2(bit) (inp->inp_flags2 & bit ? 1 : 0) case IP_RECVOPTS: optval = OPTBIT(INP_RECVOPTS); break; case IP_RECVRETOPTS: optval = OPTBIT(INP_RECVRETOPTS); break; case IP_RECVDSTADDR: optval = OPTBIT(INP_RECVDSTADDR); break; case IP_RECVTTL: optval = OPTBIT(INP_RECVTTL); break; case IP_RECVIF: optval = OPTBIT(INP_RECVIF); break; case IP_PORTRANGE: if (inp->inp_flags & INP_HIGHPORT) optval = IP_PORTRANGE_HIGH; else if (inp->inp_flags & INP_LOWPORT) optval = IP_PORTRANGE_LOW; else optval = 0; break; case IP_FAITH: optval = OPTBIT(INP_FAITH); break; case IP_ONESBCAST: optval = OPTBIT(INP_ONESBCAST); break; case IP_DONTFRAG: optval = OPTBIT(INP_DONTFRAG); break; case IP_BINDANY: optval = OPTBIT(INP_BINDANY); break; case IP_RECVTOS: optval = OPTBIT(INP_RECVTOS); break; case IP_FLOWID: optval = inp->inp_flowid; break; case IP_FLOWTYPE: optval = inp->inp_flowtype; break; case IP_RECVFLOWID: optval = OPTBIT2(INP_RECVFLOWID); break; #ifdef RSS case IP_RSSBUCKETID: retval = rss_hash2bucket(inp->inp_flowid, inp->inp_flowtype, &rss_bucket); if (retval == 0) optval = rss_bucket; else error = EINVAL; break; case IP_RECVRSSBUCKETID: optval = OPTBIT2(INP_RECVRSSBUCKETID); break; #endif case IP_BINDMULTI: optval = OPTBIT2(INP_BINDMULTI); break; } error = sooptcopyout(sopt, &optval, sizeof optval); break; /* * Multicast socket options are processed by the in_mcast * module. */ case IP_MULTICAST_IF: case IP_MULTICAST_VIF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: case IP_MSFILTER: error = inp_getmoptions(inp, sopt); break; #ifdef IPSEC case IP_IPSEC_POLICY: { struct mbuf *m = NULL; caddr_t req = NULL; size_t len = 0; if (m != 0) { req = mtod(m, caddr_t); len = m->m_len; } error = ipsec_get_policy(sotoinpcb(so), req, len, &m); if (error == 0) error = soopt_mcopyout(sopt, m); /* XXX */ if (error == 0) m_freem(m); break; } #endif /* IPSEC */ default: error = ENOPROTOOPT; break; } break; } return (error); } /* * Routine called from ip_output() to loop back a copy of an IP multicast * packet to the input queue of a specified interface. Note that this * calls the output routine of the loopback "driver", but with an interface * pointer that might NOT be a loopback interface -- evil, but easier than * replicating that code here. */ static void ip_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in *dst, int hlen) { register struct ip *ip; struct mbuf *copym; /* * Make a deep copy of the packet because we're going to * modify the pack in order to generate checksums. */ copym = m_dup(m, M_NOWAIT); if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen)) copym = m_pullup(copym, hlen); if (copym != NULL) { /* If needed, compute the checksum and mark it as valid. */ if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(copym); copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; copym->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; copym->m_pkthdr.csum_data = 0xffff; } /* * We don't bother to fragment if the IP length is greater * than the interface's MTU. Can this possibly matter? */ ip = mtod(copym, struct ip *); ip->ip_sum = 0; ip->ip_sum = in_cksum(copym, hlen); #if 1 /* XXX */ if (dst->sin_family != AF_INET) { printf("ip_mloopback: bad address family %d\n", dst->sin_family); dst->sin_family = AF_INET; } #endif if_simloop(ifp, copym, dst->sin_family, 0); } } Index: user/ae/inet6/sys/sys/param.h =================================================================== --- user/ae/inet6/sys/sys/param.h (revision 271452) +++ user/ae/inet6/sys/sys/param.h (revision 271453) @@ -1,347 +1,347 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)param.h 8.3 (Berkeley) 4/4/95 * $FreeBSD$ */ #ifndef _SYS_PARAM_H_ #define _SYS_PARAM_H_ #include #define BSD 199506 /* System version (year & month). */ #define BSD4_3 1 #define BSD4_4 1 /* * __FreeBSD_version numbers are documented in the Porter's Handbook. * If you bump the version for any reason, you should update the documentation * there. * Currently this lives here in the doc/ repository: * * head/en_US.ISO8859-1/books/porters-handbook/book.xml * * scheme is: Rxx * 'R' is in the range 0 to 4 if this is a release branch or * x.0-CURRENT before RELENG_*_0 is created, otherwise 'R' is * in the range 5 to 9. */ #undef __FreeBSD_version -#define __FreeBSD_version 1100031 /* Master, propagated to newvers */ +#define __FreeBSD_version 1100032 /* Master, propagated to newvers */ /* * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD, * which by definition is always true on FreeBSD. This macro is also defined * on other systems that use the kernel of FreeBSD, such as GNU/kFreeBSD. * * It is tempting to use this macro in userland code when we want to enable * kernel-specific routines, and in fact it's fine to do this in code that * is part of FreeBSD itself. However, be aware that as presence of this * macro is still not widespread (e.g. older FreeBSD versions, 3rd party * compilers, etc), it is STRONGLY DISCOURAGED to check for this macro in * external applications without also checking for __FreeBSD__ as an * alternative. */ #undef __FreeBSD_kernel__ #define __FreeBSD_kernel__ #ifdef _KERNEL #define P_OSREL_SIGWAIT 700000 #define P_OSREL_SIGSEGV 700004 #define P_OSREL_MAP_ANON 800104 #define P_OSREL_MAJOR(x) ((x) / 100000) #endif #ifndef LOCORE #include #endif /* * Machine-independent constants (some used in following include files). * Redefined constants are from POSIX 1003.1 limits file. * * MAXCOMLEN should be >= sizeof(ac_comm) (see ) */ #include #define MAXCOMLEN 19 /* max command name remembered */ #define MAXINTERP PATH_MAX /* max interpreter file name length */ #define MAXLOGNAME 33 /* max login name length (incl. NUL) */ #define MAXUPRC CHILD_MAX /* max simultaneous processes */ #define NCARGS ARG_MAX /* max bytes for an exec function */ #define NGROUPS (NGROUPS_MAX+1) /* max number groups */ #define NOFILE OPEN_MAX /* max open files per process */ #define NOGROUP 65535 /* marker for empty group set member */ #define MAXHOSTNAMELEN 256 /* max hostname size */ #define SPECNAMELEN 63 /* max length of devicename */ /* More types and definitions used throughout the kernel. */ #ifdef _KERNEL #include #include #ifndef LOCORE #include #include #endif #ifndef FALSE #define FALSE 0 #endif #ifndef TRUE #define TRUE 1 #endif #endif #ifndef _KERNEL /* Signals. */ #include #endif /* Machine type dependent parameters. */ #include #ifndef _KERNEL #include #endif #ifndef DEV_BSHIFT #define DEV_BSHIFT 9 /* log2(DEV_BSIZE) */ #endif #define DEV_BSIZE (1<>PAGE_SHIFT) #endif /* * btodb() is messy and perhaps slow because `bytes' may be an off_t. We * want to shift an unsigned type to avoid sign extension and we don't * want to widen `bytes' unnecessarily. Assume that the result fits in * a daddr_t. */ #ifndef btodb #define btodb(bytes) /* calculates (bytes / DEV_BSIZE) */ \ (sizeof (bytes) > sizeof(long) \ ? (daddr_t)((unsigned long long)(bytes) >> DEV_BSHIFT) \ : (daddr_t)((unsigned long)(bytes) >> DEV_BSHIFT)) #endif #ifndef dbtob #define dbtob(db) /* calculates (db * DEV_BSIZE) */ \ ((off_t)(db) << DEV_BSHIFT) #endif #define PRIMASK 0x0ff #define PCATCH 0x100 /* OR'd with pri for tsleep to check signals */ #define PDROP 0x200 /* OR'd with pri to stop re-entry of interlock mutex */ #define NZERO 0 /* default "nice" */ #define NBBY 8 /* number of bits in a byte */ #define NBPW sizeof(int) /* number of bytes per word (integer) */ #define CMASK 022 /* default file mask: S_IWGRP|S_IWOTH */ #define NODEV (dev_t)(-1) /* non-existent device */ /* * File system parameters and macros. * * MAXBSIZE - Filesystems are made out of blocks of at most MAXBSIZE bytes * per block. MAXBSIZE may be made larger without effecting * any existing filesystems as long as it does not exceed MAXPHYS, * and may be made smaller at the risk of not being able to use * filesystems which require a block size exceeding MAXBSIZE. * * BKVASIZE - Nominal buffer space per buffer, in bytes. BKVASIZE is the * minimum KVM memory reservation the kernel is willing to make. * Filesystems can of course request smaller chunks. Actual * backing memory uses a chunk size of a page (PAGE_SIZE). * * If you make BKVASIZE too small you risk seriously fragmenting * the buffer KVM map which may slow things down a bit. If you * make it too big the kernel will not be able to optimally use * the KVM memory reserved for the buffer cache and will wind * up with too-few buffers. * * The default is 16384, roughly 2x the block size used by a * normal UFS filesystem. */ #define MAXBSIZE 65536 /* must be power of 2 */ #define BKVASIZE 16384 /* must be power of 2 */ #define BKVAMASK (BKVASIZE-1) /* * MAXPATHLEN defines the longest permissible path length after expanding * symbolic links. It is used to allocate a temporary buffer from the buffer * pool in which to do the name expansion, hence should be a power of two, * and must be less than or equal to MAXBSIZE. MAXSYMLINKS defines the * maximum number of symbolic links that may be expanded in a path name. * It should be set high enough to allow all legitimate uses, but halt * infinite loops reasonably quickly. */ #define MAXPATHLEN PATH_MAX #define MAXSYMLINKS 32 /* Bit map related macros. */ #define setbit(a,i) (((unsigned char *)(a))[(i)/NBBY] |= 1<<((i)%NBBY)) #define clrbit(a,i) (((unsigned char *)(a))[(i)/NBBY] &= ~(1<<((i)%NBBY))) #define isset(a,i) \ (((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY))) #define isclr(a,i) \ ((((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY))) == 0) /* Macros for counting and rounding. */ #ifndef howmany #define howmany(x, y) (((x)+((y)-1))/(y)) #endif #define nitems(x) (sizeof((x)) / sizeof((x)[0])) #define rounddown(x, y) (((x)/(y))*(y)) #define rounddown2(x, y) ((x)&(~((y)-1))) /* if y is power of two */ #define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) /* to any y */ #define roundup2(x, y) (((x)+((y)-1))&(~((y)-1))) /* if y is powers of two */ #define powerof2(x) ((((x)-1)&(x))==0) /* Macros for min/max. */ #define MIN(a,b) (((a)<(b))?(a):(b)) #define MAX(a,b) (((a)>(b))?(a):(b)) #ifdef _KERNEL /* * Basic byte order function prototypes for non-inline functions. */ #ifndef LOCORE #ifndef _BYTEORDER_PROTOTYPED #define _BYTEORDER_PROTOTYPED __BEGIN_DECLS __uint32_t htonl(__uint32_t); __uint16_t htons(__uint16_t); __uint32_t ntohl(__uint32_t); __uint16_t ntohs(__uint16_t); __END_DECLS #endif #endif #ifndef lint #ifndef _BYTEORDER_FUNC_DEFINED #define _BYTEORDER_FUNC_DEFINED #define htonl(x) __htonl(x) #define htons(x) __htons(x) #define ntohl(x) __ntohl(x) #define ntohs(x) __ntohs(x) #endif /* !_BYTEORDER_FUNC_DEFINED */ #endif /* lint */ #endif /* _KERNEL */ /* * Scale factor for scaled integers used to count %cpu time and load avgs. * * The number of CPU `tick's that map to a unique `%age' can be expressed * by the formula (1 / (2 ^ (FSHIFT - 11))). The maximum load average that * can be calculated (assuming 32 bits) can be closely approximated using * the formula (2 ^ (2 * (16 - FSHIFT))) for (FSHIFT < 15). * * For the scheduler to maintain a 1:1 mapping of CPU `tick' to `%age', * FSHIFT must be at least 11; this gives us a maximum load avg of ~1024. */ #define FSHIFT 11 /* bits to right of fixed binary point */ #define FSCALE (1<> (PAGE_SHIFT - DEV_BSHIFT)) #define ctodb(db) /* calculates pages to devblks */ \ ((db) << (PAGE_SHIFT - DEV_BSHIFT)) /* * Old spelling of __containerof(). */ #define member2struct(s, m, x) \ ((struct s *)(void *)((char *)(x) - offsetof(struct s, m))) /* * Access a variable length array that has been declared as a fixed * length array. */ #define __PAST_END(array, offset) (((__typeof__(*(array)) *)(array))[offset]) #endif /* _SYS_PARAM_H_ */ Index: user/ae/inet6/sys =================================================================== --- user/ae/inet6/sys (revision 271452) +++ user/ae/inet6/sys (revision 271453) Property changes on: user/ae/inet6/sys ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/sys:r271428-271452 Index: user/ae/inet6/usr.bin/mkimg/bsd.c =================================================================== --- user/ae/inet6/usr.bin/mkimg/bsd.c (revision 271452) +++ user/ae/inet6/usr.bin/mkimg/bsd.c (revision 271453) @@ -1,135 +1,141 @@ /*- * Copyright (c) 2014 Juniper Networks, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include "image.h" #include "mkimg.h" #include "scheme.h" #ifndef FS_NANDFS #define FS_NANDFS 30 #endif static struct mkimg_alias bsd_aliases[] = { { ALIAS_FREEBSD_NANDFS, ALIAS_INT2TYPE(FS_NANDFS) }, { ALIAS_FREEBSD_SWAP, ALIAS_INT2TYPE(FS_SWAP) }, { ALIAS_FREEBSD_UFS, ALIAS_INT2TYPE(FS_BSDFFS) }, { ALIAS_FREEBSD_VINUM, ALIAS_INT2TYPE(FS_VINUM) }, { ALIAS_FREEBSD_ZFS, ALIAS_INT2TYPE(FS_ZFS) }, { ALIAS_NONE, 0 } }; static u_int bsd_metadata(u_int where) { u_int secs; secs = BBSIZE / secsz; return ((where == SCHEME_META_IMG_START) ? secs : 0); } static int bsd_write(lba_t imgsz, void *bootcode) { u_char *buf, *p; struct disklabel *d; struct partition *dp; struct part *part; - int error, n; + int bsdparts, error, n; uint16_t checksum; buf = malloc(BBSIZE); if (buf == NULL) return (ENOMEM); if (bootcode != NULL) { memcpy(buf, bootcode, BBSIZE); - memset(buf + secsz, 0, secsz); + memset(buf + secsz, 0, sizeof(struct disklabel)); } else memset(buf, 0, BBSIZE); + bsdparts = nparts + 1; /* Account for c partition */ + if (bsdparts < MAXPARTITIONS) + bsdparts = MAXPARTITIONS; imgsz = (lba_t)ncyls * nheads * nsecs; error = image_set_size(imgsz); if (error) { free(buf); return (error); } d = (void *)(buf + secsz); le32enc(&d->d_magic, DISKMAGIC); le32enc(&d->d_secsize, secsz); le32enc(&d->d_nsectors, nsecs); le32enc(&d->d_ntracks, nheads); le32enc(&d->d_ncylinders, ncyls); le32enc(&d->d_secpercyl, nsecs * nheads); le32enc(&d->d_secperunit, imgsz); le16enc(&d->d_rpm, 3600); le32enc(&d->d_magic2, DISKMAGIC); - le16enc(&d->d_npartitions, (8 > nparts + 1) ? 8 : nparts + 1); + le16enc(&d->d_npartitions, bsdparts); le32enc(&d->d_bbsize, BBSIZE); dp = &d->d_partitions[RAW_PART]; le32enc(&dp->p_size, imgsz); STAILQ_FOREACH(part, &partlist, link) { n = part->index + ((part->index >= RAW_PART) ? 1 : 0); dp = &d->d_partitions[n]; le32enc(&dp->p_size, part->size); le32enc(&dp->p_offset, part->block); + le32enc(&dp->p_fsize, 0); dp->p_fstype = ALIAS_TYPE2INT(part->type); + dp->p_frag = 0; + le16enc(&dp->p_cpg, 0); } - dp = &d->d_partitions[nparts + 1]; + dp = &d->d_partitions[bsdparts]; checksum = 0; - for (p = buf; p < (u_char *)dp; p += 2) + for (p = (void *)d; p < (u_char *)dp; p += 2) checksum ^= le16dec(p); le16enc(&d->d_checksum, checksum); error = image_write(0, buf, BBSIZE / secsz); free(buf); return (error); } static struct mkimg_scheme bsd_scheme = { .name = "bsd", .description = "BSD disk label", .aliases = bsd_aliases, .metadata = bsd_metadata, .write = bsd_write, .nparts = 19, .bootcode = BBSIZE, .maxsecsz = 512 }; SCHEME_DEFINE(bsd_scheme); Index: user/ae/inet6/usr.bin/mkimg =================================================================== --- user/ae/inet6/usr.bin/mkimg (revision 271452) +++ user/ae/inet6/usr.bin/mkimg (revision 271453) Property changes on: user/ae/inet6/usr.bin/mkimg ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/usr.bin/mkimg:r271428-271452 Index: user/ae/inet6/usr.bin/rctl/rctl.8 =================================================================== --- user/ae/inet6/usr.bin/rctl/rctl.8 (revision 271452) +++ user/ae/inet6/usr.bin/rctl/rctl.8 (revision 271453) @@ -1,276 +1,276 @@ .\"- .\" Copyright (c) 2009 Edward Tomasz Napierala .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR THE VOICES IN HIS HEAD BE .\" LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR .\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF .\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS .\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN .\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) .\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE .\" POSSIBILITY OF SUCH DAMAGE. .\" .\" $FreeBSD$ .\" -.Dd February 16, 2014 +.Dd September 11, 2014 .Dt RCTL 8 .Os .Sh NAME .Nm rctl .Nd display and update resource limits database .Sh SYNOPSIS .Nm .Op Fl h .Op Fl n .Op Ar filter .Nm .Fl a .Op Ar rule .Nm .Fl l .Op Fl h .Op Fl n .Op Ar filter .Nm .Fl r .Op Ar filter .Nm .Fl u .Op Fl h .Op Ar filter .Pp .Nm requires the kernel to be compiled with: .Bd -ragged -offset indent .Cd "options RACCT" .Cd "options RCTL" .Ed .Sh DESCRIPTION When called without options, the .Nm command writes currently defined RCTL rules to standard output. .Pp If a .Ar filter argument is specified, only rules matching the filter are displayed. The options are as follows: .Bl -tag -width indent .It Fl a Ar rule Add .Ar rule to the RCTL database. .It Fl l Ar filter Display rules applicable to the process defined by .Ar filter . Note that this is different from showing the rules when called without any options, as it shows not just the rules with subject equal to that of process, but also rules for the user, jail, and login class applicable to the process. .It Fl r Ar filter Remove rules matching .Ar filter from the RCTL database. .It Fl u Ar filter Display resource usage for a subject .Po .Sy process , .Sy user , .Sy loginclass or .Sy jail .Pc matching the .Ar filter . .It Fl h "Human-readable" output. Use unit suffixes: Byte, Kilobyte, Megabyte, Gigabyte, Terabyte and Petabyte. .It Fl n Display user IDs numerically rather than converting them to a user name. .El .Pp Modifying rules affects all currently running and future processes matching the rule. .Sh RULE SYNTAX Syntax for a rule is subject:subject-id:resource:action=amount/per. .Pp .Bl -tag -width "subject-id" -compact -offset indent .It subject defines the kind of entity the rule applies to. It can be either .Sy process , .Sy user , .Sy loginclass , or .Sy jail . .It subject-id identifies the .Em subject . It can be a process ID, user name, numerical user ID, login class name from .Xr login.conf 5 , or jail name. .It resource identifies the resource the rule controls. See the .Sx RESOURCES section below for details. .It action defines what will happen when a process exceeds the allowed .Em amount . See the .Sx ACTIONS section below for details. .It amount defines how much of the resource a process can use before the defined .Em action triggers. Resources which limit bytes may use prefixes from .Xr expand_number 3 . .It per defines what entity the .Em amount gets accounted for. For example, rule "loginclass:users:vmem:deny=100M/process" means that each process of any user belonging to login class "users" may allocate up to 100MB of virtual memory. Rule "loginclass:users:vmem:deny=100M/user" would mean that for each user belonging to the login class "users", the sum of virtual memory allocated by all the processes of that user will not exceed 100MB. Rule "loginclass:users:vmem:deny=100M/loginclass" would mean that the sum of virtual memory allocated by all processes of all users belonging to that login class will not exceed 100MB. .El .Pp A valid rule has all those fields specified, except for .Em per , which defaults to the value of .Em subject . .Pp A filter is a rule for which one of more fields other than .Em per is left empty. For example, a filter that matches every rule could be written as ":::=/", or, in short, ":". A filter that matches all the login classes would be "loginclass:". A filter that matches all defined rules for .Sy maxproc resource would be "::maxproc". .Sh SUBJECTS .Bl -column -offset 3n "pseudoterminals" ".Sy username or numerical User ID" .It Em subject Ta Em subject-id .It Sy process Ta numerical Process ID .It Sy user Ta user name or numerical User ID .It Sy loginclass Ta login class from .Xr login.conf 5 .It Sy jail Ta jail name .El .Sh RESOURCES .Bl -column -offset 3n "pseudoterminals" .It Em resource .It Sy cputime Ta "CPU time, in seconds" .It Sy datasize Ta "data size, in bytes" .It Sy stacksize Ta "stack size, in bytes" .It Sy coredumpsize Ta "core dump size, in bytes" .It Sy memoryuse Ta "resident set size, in bytes" .It Sy memorylocked Ta "locked memory, in bytes" .It Sy maxproc Ta "number of processes" .It Sy openfiles Ta "file descriptor table size" .It Sy vmemoryuse Ta "address space limit, in bytes" .It Sy pseudoterminals Ta "number of PTYs" .It Sy swapuse Ta "swap usage, in bytes" .It Sy nthr Ta "number of threads" .It Sy msgqqueued Ta "number of queued SysV messages" .It Sy msgqsize Ta "SysV message queue size, in bytes" .It Sy nmsgq Ta "number of SysV message queues" .It Sy nsem Ta "number of SysV semaphores" .It Sy nsemop Ta "number of SysV semaphores modified in a single semop(2) call" .It Sy nshm Ta "number of SysV shared memory segments" .It Sy shmsize Ta "SysV shared memory size, in bytes" .It Sy wallclock Ta "wallclock time, in seconds" .It Sy pcpu Ta "%CPU, in percents of a single CPU core" .El .Sh ACTIONS .Bl -column -offset 3n "pseudoterminals" .It Em action .It Sy deny Ta deny the allocation; not supported for -.Sy cpu +.Sy cputime and .Sy wallclock .It Sy log Ta "log a warning to the console" .It Sy devctl Ta "send notification to" .Xr devd 8 using .Sy system = "RCTL", .Sy subsystem = "rule", .Sy type = "matched" .It sig* e.g. .Sy sigterm ; send a signal to the offending process. See .Xr signal 3 for a list of supported signals .El .Pp Not all actions are supported for all resources. Attempting to add a rule with an action not supported by a given resource will result in error. .Sh EXIT STATUS .Ex -std .Sh EXAMPLES Prevent user "joe" from allocating more than 1GB of virtual memory: .Dl Nm Fl a Ar user:joe:vmemoryuse:deny=1g .Pp Remove all RCTL rules: .Dl Nm Fl r Ar \&: .Pp Display resource usage information for jail named "www": .Dl Nm Fl hu Ar jail:www .Pp Display all the rules applicable to process with PID 512: .Dl Nm Fl l Ar process:512 .Pp Display all rules: .Dl Nm .Pp Display all rules matching user "joe": .Dl Nm Ar user:joe .Pp Display all rules matching login classes: .Dl Nm Ar loginclass: .Sh SEE ALSO .Xr rctl.conf 5 .Sh HISTORY The .Nm command appeared in .Fx 9.0 . .Sh AUTHORS .An -nosplit The .Nm was developed by .An Edward Tomasz Napierala Aq Mt trasz@FreeBSD.org under sponsorship from the FreeBSD Foundation. .Sh BUGS Limiting .Sy memoryuse may kill the machine due to thrashing. Index: user/ae/inet6/usr.sbin/bhyve/block_if.c =================================================================== --- user/ae/inet6/usr.sbin/bhyve/block_if.c (revision 271452) +++ user/ae/inet6/usr.sbin/bhyve/block_if.c (revision 271453) @@ -1,474 +1,475 @@ /*- * Copyright (c) 2013 Peter Grehan * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "block_if.h" #define BLOCKIF_SIG 0xb109b109 #define BLOCKIF_MAXREQ 32 enum blockop { BOP_READ, BOP_WRITE, BOP_FLUSH, BOP_CANCEL }; enum blockstat { BST_FREE, BST_INUSE }; struct blockif_elem { TAILQ_ENTRY(blockif_elem) be_link; struct blockif_req *be_req; enum blockop be_op; enum blockstat be_status; }; struct blockif_ctxt { int bc_magic; int bc_fd; int bc_rdonly; off_t bc_size; int bc_sectsz; pthread_t bc_btid; pthread_mutex_t bc_mtx; pthread_cond_t bc_cond; int bc_closing; /* Request elements and free/inuse queues */ TAILQ_HEAD(, blockif_elem) bc_freeq; TAILQ_HEAD(, blockif_elem) bc_inuseq; u_int bc_req_count; struct blockif_elem bc_reqs[BLOCKIF_MAXREQ]; }; static int blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq, enum blockop op) { struct blockif_elem *be; assert(bc->bc_req_count < BLOCKIF_MAXREQ); be = TAILQ_FIRST(&bc->bc_freeq); assert(be != NULL); assert(be->be_status == BST_FREE); TAILQ_REMOVE(&bc->bc_freeq, be, be_link); be->be_status = BST_INUSE; be->be_req = breq; be->be_op = op; TAILQ_INSERT_TAIL(&bc->bc_inuseq, be, be_link); bc->bc_req_count++; return (0); } static int blockif_dequeue(struct blockif_ctxt *bc, struct blockif_elem *el) { struct blockif_elem *be; if (bc->bc_req_count == 0) return (ENOENT); be = TAILQ_FIRST(&bc->bc_inuseq); assert(be != NULL); assert(be->be_status == BST_INUSE); *el = *be; TAILQ_REMOVE(&bc->bc_inuseq, be, be_link); be->be_status = BST_FREE; be->be_req = NULL; TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link); bc->bc_req_count--; return (0); } static void blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be) { struct blockif_req *br; int err; br = be->be_req; err = 0; switch (be->be_op) { case BOP_READ: if (preadv(bc->bc_fd, br->br_iov, br->br_iovcnt, br->br_offset) < 0) err = errno; break; case BOP_WRITE: if (bc->bc_rdonly) err = EROFS; else if (pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt, br->br_offset) < 0) err = errno; break; case BOP_FLUSH: break; case BOP_CANCEL: err = EINTR; break; default: err = EINVAL; break; } (*br->br_callback)(br, err); } static void * blockif_thr(void *arg) { struct blockif_ctxt *bc; struct blockif_elem req; bc = arg; for (;;) { pthread_mutex_lock(&bc->bc_mtx); while (!blockif_dequeue(bc, &req)) { pthread_mutex_unlock(&bc->bc_mtx); blockif_proc(bc, &req); pthread_mutex_lock(&bc->bc_mtx); } pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx); pthread_mutex_unlock(&bc->bc_mtx); /* * Check ctxt status here to see if exit requested */ if (bc->bc_closing) pthread_exit(NULL); } /* Not reached */ return (NULL); } struct blockif_ctxt * blockif_open(const char *optstr, const char *ident) { char tname[MAXCOMLEN + 1]; char *nopt, *xopts; struct blockif_ctxt *bc; struct stat sbuf; off_t size; int extra, fd, i, sectsz; int nocache, sync, ro; nocache = 0; sync = 0; ro = 0; /* * The first element in the optstring is always a pathname. * Optional elements follow */ nopt = strdup(optstr); for (xopts = strtok(nopt, ","); xopts != NULL; xopts = strtok(NULL, ",")) { if (!strcmp(xopts, "nocache")) nocache = 1; else if (!strcmp(xopts, "sync")) sync = 1; else if (!strcmp(xopts, "ro")) ro = 1; } extra = 0; if (nocache) extra |= O_DIRECT; if (sync) extra |= O_SYNC; fd = open(nopt, (ro ? O_RDONLY : O_RDWR) | extra); if (fd < 0 && !ro) { /* Attempt a r/w fail with a r/o open */ fd = open(nopt, O_RDONLY | extra); ro = 1; } if (fd < 0) { perror("Could not open backing file"); return (NULL); } if (fstat(fd, &sbuf) < 0) { perror("Could not stat backing file"); close(fd); return (NULL); } /* * Deal with raw devices */ size = sbuf.st_size; sectsz = DEV_BSIZE; if (S_ISCHR(sbuf.st_mode)) { if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 || ioctl(fd, DIOCGSECTORSIZE, §sz)) { perror("Could not fetch dev blk/sector size"); close(fd); return (NULL); } assert(size != 0); assert(sectsz != 0); } bc = calloc(1, sizeof(struct blockif_ctxt)); if (bc == NULL) { close(fd); return (NULL); } bc->bc_magic = BLOCKIF_SIG; bc->bc_fd = fd; + bc->bc_rdonly = ro; bc->bc_size = size; bc->bc_sectsz = sectsz; pthread_mutex_init(&bc->bc_mtx, NULL); pthread_cond_init(&bc->bc_cond, NULL); TAILQ_INIT(&bc->bc_freeq); TAILQ_INIT(&bc->bc_inuseq); bc->bc_req_count = 0; for (i = 0; i < BLOCKIF_MAXREQ; i++) { bc->bc_reqs[i].be_status = BST_FREE; TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link); } pthread_create(&bc->bc_btid, NULL, blockif_thr, bc); snprintf(tname, sizeof(tname), "blk-%s", ident); pthread_set_name_np(bc->bc_btid, tname); return (bc); } static int blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq, enum blockop op) { int err; err = 0; pthread_mutex_lock(&bc->bc_mtx); if (bc->bc_req_count < BLOCKIF_MAXREQ) { /* * Enqueue and inform the block i/o thread * that there is work available */ blockif_enqueue(bc, breq, op); pthread_cond_signal(&bc->bc_cond); } else { /* * Callers are not allowed to enqueue more than * the specified blockif queue limit. Return an * error to indicate that the queue length has been * exceeded. */ err = E2BIG; } pthread_mutex_unlock(&bc->bc_mtx); return (err); } int blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq) { assert(bc->bc_magic == BLOCKIF_SIG); return (blockif_request(bc, breq, BOP_READ)); } int blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq) { assert(bc->bc_magic == BLOCKIF_SIG); return (blockif_request(bc, breq, BOP_WRITE)); } int blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq) { assert(bc->bc_magic == BLOCKIF_SIG); return (blockif_request(bc, breq, BOP_FLUSH)); } int blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq) { assert(bc->bc_magic == BLOCKIF_SIG); return (blockif_request(bc, breq, BOP_CANCEL)); } int blockif_close(struct blockif_ctxt *bc) { void *jval; int err; err = 0; assert(bc->bc_magic == BLOCKIF_SIG); /* * Stop the block i/o thread */ bc->bc_closing = 1; pthread_cond_signal(&bc->bc_cond); pthread_join(bc->bc_btid, &jval); /* XXX Cancel queued i/o's ??? */ /* * Release resources */ bc->bc_magic = 0; close(bc->bc_fd); free(bc); return (0); } /* * Return virtual C/H/S values for a given block. Use the algorithm * outlined in the VHD specification to calculate values. */ void blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s) { off_t sectors; /* total sectors of the block dev */ off_t hcyl; /* cylinders times heads */ uint16_t secpt; /* sectors per track */ uint8_t heads; assert(bc->bc_magic == BLOCKIF_SIG); sectors = bc->bc_size / bc->bc_sectsz; /* Clamp the size to the largest possible with CHS */ if (sectors > 65535UL*16*255) sectors = 65535UL*16*255; if (sectors >= 65536UL*16*63) { secpt = 255; heads = 16; hcyl = sectors / secpt; } else { secpt = 17; hcyl = sectors / secpt; heads = (hcyl + 1023) / 1024; if (heads < 4) heads = 4; if (hcyl >= (heads * 1024) || heads > 16) { secpt = 31; heads = 16; hcyl = sectors / secpt; } if (hcyl >= (heads * 1024)) { secpt = 63; heads = 16; hcyl = sectors / secpt; } } *c = hcyl / heads; *h = heads; *s = secpt; } /* * Accessors */ off_t blockif_size(struct blockif_ctxt *bc) { assert(bc->bc_magic == BLOCKIF_SIG); return (bc->bc_size); } int blockif_sectsz(struct blockif_ctxt *bc) { assert(bc->bc_magic == BLOCKIF_SIG); return (bc->bc_sectsz); } int blockif_queuesz(struct blockif_ctxt *bc) { assert(bc->bc_magic == BLOCKIF_SIG); return (BLOCKIF_MAXREQ); } int blockif_is_ro(struct blockif_ctxt *bc) { assert(bc->bc_magic == BLOCKIF_SIG); return (bc->bc_rdonly); } Index: user/ae/inet6/usr.sbin/bhyve =================================================================== --- user/ae/inet6/usr.sbin/bhyve (revision 271452) +++ user/ae/inet6/usr.sbin/bhyve (revision 271453) Property changes on: user/ae/inet6/usr.sbin/bhyve ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/usr.sbin/bhyve:r271428-271452 Index: user/ae/inet6/usr.sbin/ctld/ctl.conf.5 =================================================================== --- user/ae/inet6/usr.sbin/ctld/ctl.conf.5 (revision 271452) +++ user/ae/inet6/usr.sbin/ctld/ctl.conf.5 (revision 271453) @@ -1,290 +1,357 @@ .\" Copyright (c) 2012 The FreeBSD Foundation .\" All rights reserved. .\" .\" This software was developed by Edward Tomasz Napierala under sponsorship .\" from the FreeBSD Foundation. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" $FreeBSD$ .\" -.Dd September 5, 2014 +.Dd September 11, 2014 .Dt CTL.CONF 5 .Os .Sh NAME .Nm ctl.conf .Nd CAM Target Layer / iSCSI target daemon configuration file .Sh DESCRIPTION The .Nm configuration file is used by the .Xr ctld 8 daemon. Lines starting with .Ql # are interpreted as comments. The general syntax of the .Nm file is: .Bd -literal -offset indent -pidfile +.No pidfile Ar path -auth-group { - chap - ... +.No auth-group Ar name No { +.Dl chap Ar user Ar secret +.Dl ... } -portal-group { - listen
- listen-iser
- discovery-auth-group - ... +.No portal-group Ar name No { +.Dl listen Ar address +.Dl listen-iser Ar address +.Dl discovery-auth-group Ar name +.Dl ... } -target { - auth-group - portal-group - lun { - path - } - ... +.No target Ar name { +.Dl auth-group Ar name +.Dl portal-group Ar name +.Dl lun Ar number No { +.Dl path Ar path +.Dl } +.Dl ... } .Ed -.Ss global level -The following statements are available at the global level: +.Ss Global Context .Bl -tag -width indent -.It Ic auth-group Aq Ar name -Opens an auth-group section, defining an authentication group, +.It Ic auth-group Ar name +Create an +.Sy auth-group +configuration context, which can then be assigned to any number of targets. -.It Ic debug Aq Ar level -Specifies debug level. +.It Ic debug Ar level +The debug verbosity level. The default is 0. -.It Ic maxproc Aq Ar number -Specifies limit for concurrently running child processes handling +.It Ic maxproc Ar number +The limit for concurrently running child processes handling incoming connections. The default is 30. -Setting it to 0 disables the limit. -.It Ic pidfile Aq Ar path -Specifies path to pidfile. +A setting of 0 disables the limit. +.It Ic pidfile Ar path +The path to the pidfile. The default is .Pa /var/run/ctld.pid . -.It Ic portal-group Aq Ar name -Opens a portal-group section, defining a portal group, +.It Ic portal-group Ar name +Create a +.Sy portal-group +configuration context, which can then be assigned to any number of targets. -.It Ic target Aq Ar name -Opens a target configuration section. -.It Ic timeout Aq Ar seconds -Specifies timeout for login session, after which the connection +.It Ic target Ar name +Create a +.Sy target +configuration context, which can contain one or more +.Sy lun +contexts. +.It Ic timeout Ar seconds +The timeout for login sessions, after which the connection will be forcibly terminated. The default is 60. -Setting it to 0 disables the timeout. +A setting of 0 disables the timeout. .El -.Ss auth-group level -The following statements are available at the auth-group level: +.Ss auth-group Context .Bl -tag -width indent -.It Ic auth-type Ao Ar type Ac -Specifies authentication type. -Type can be either "none", "deny", "chap", or "chap-mutual". + +.It Ic auth-type Ar type +Sets the authentication type. +Type can be either +.Qq Ar none , +.Qq Ar deny , +.Qq Ar chap , +or +.Qq Ar chap-mutual . In most cases it is not necessary to set the type using this clause; -it is usually used to disable authentication for a given auth-group. -.It Ic chap Ao Ar user Ac Aq Ar secret -Specifies CHAP authentication credentials. -.It Ic chap-mutual Ao Ar user Ac Ao Ar secret Ac Ao Ar mutualuser Ac Aq Ar mutualsecret -Specifies mutual CHAP authentication credentials. -Note that for any auth-group, configuration may contain either chap, -or chap-mutual entries; it is an error to mix them. -.It Ic initiator-name Ao Ar initiator-name Ac -Specifies iSCSI initiator name. +it is usually used to disable authentication for a given +.Sy auth-group . +.It Ic chap Ar user Ar secret +A set of CHAP authentication credentials. +Note that for any +.Sy auth-group , +the configuration may only contain either +.Sy chap +or +.Sy chap-mutual +entries; it is an error to mix them. +.It Ic chap-mutual Ar user Ar secret Ar mutualuser Ar mutualsecret +A set of mutual CHAP authentication credentials. +Note that for any +.Sy auth-group , +the configuration may only contain either +.Sy chap +or +.Sy chap-mutual +entries; it is an error to mix them. +.It Ic initiator-name Ar initiator-name +An iSCSI initiator name. +Only initiators with a name matching one of the defined +names will be allowed to connect. If not defined, there will be no restrictions based on initiator name. -Otherwise, only initiators with names matching one of defined -ones will be allowed to connect. -.It Ic initiator-portal Ao Ar address Ac Ao Ar / prefixlen Ac -Specifies the iSCSI initiator portal: an IPv4 or IPv6 address, optionally -followed by slash and prefix length. +.It Ic initiator-portal Ar address Ns Op / Ns Ar prefixlen +An iSCSI initiator portal: an IPv4 or IPv6 address, optionally +followed by a literal slash and a prefix length. +Only initiators with an address matching one of the defined +addresses will be allowed to connect. If not defined, there will be no restrictions based on initiator address. -Otherwise, only initiators with addresses matching one of defined -ones will be allowed to connect. .El -.Ss portal-group level -The following statements are available at the portal-group level: +.Ss portal-group Context .Bl -tag -width indent -.It Ic discovery-auth-group Aq Ar name -Assigns previously defined authentication group to the portal group, +.It Ic discovery-auth-group Ar name +Assign a previously defined authentication group to the portal group, to be used for target discovery. By default, portal groups that do not specify their own auth settings, -using clauses such as "chap" or "initiator-name", are assigned -predefined auth-group "default", which denies discovery. -Another predefined auth-group, "no-authentication", may be used +using clauses such as +.Sy chap +or +.Sy initiator-name , +are assigned +predefined +.Sy auth-group +.Qq Ar default , +which denies discovery. +Another predefined +.Sy auth-group , +.Qq Ar no-authentication , +may be used to permit discovery without authentication. -.It Ic listen Aq Ar address -Specifies IPv4 or IPv6 address and port to listen on for incoming connections. -.It Ic listen-iser Aq Ar address -Specifies IPv4 or IPv6 address and port to listen on for incoming connections +.It Ic listen Ar address +An IPv4 or IPv6 address and port to listen on for incoming connections. +.It Ic listen-iser Ar address +An IPv4 or IPv6 address and port to listen on for incoming connections using iSER (iSCSI over RDMA) protocol. .El -.Ss target level: -The following statements are available at the target level: +.Ss target Context .Bl -tag -width indent -.It Ic alias Aq Ar text -Assigns human-readable description to the target. +.It Ic alias Ar text +Assign a human-readable description to the target. There is no default. -.It Ic auth-group Aq Ar name -Assigns previously defined authentication group to the target. +.It Ic auth-group Ar name +Assign a previously defined authentication group to the target. By default, targets that do not specify their own auth settings, -using clauses such as "chap" or "initiator-name", are assigned -predefined auth-group "default", which denies all access. -Another predefined auth-group, "no-authentication", may be used to permit access +using clauses such as +.Sy chap +or +.Sy initiator-name , +are assigned +predefined +.Sy auth-group +.Qq Ar default , +which denies all access. +Another predefined +.Sy auth-group , +.Qq Ar no-authentication , +may be used to permit access without authentication. -.It Ic auth-type Ao Ar type Ac -Specifies authentication type. -Type can be either "none", "deny", "chap", or "chap-mutual". +Note that targets must only use one of +.Sy auth-group , chap , No or Sy chap-mutual ; +it is a configuration error to mix multiple types in one target. +.It Ic auth-type Ar type +Sets the authentication type. +Type can be either +.Qq Ar none , +.Qq Ar deny , +.Qq Ar chap , +or +.Qq Ar chap-mutual . In most cases it is not necessary to set the type using this clause; -it is usually used to disable authentication for a given target. -This clause is mutually exclusive with auth-group; one cannot use +it is usually used to disable authentication for a given +.Sy target . +This clause is mutually exclusive with +.Sy auth-group ; +one cannot use both in a single target. -.It Ic chap Ao Ar user Ac Aq Ar secret -Specifies CHAP authentication credentials. -Note that targets must use either auth-group, or chap, -or chap-mutual clauses; it is a configuration error to mix them in one target. -.It Ic chap-mutual Ao Ar user Ac Ao Ar secret Ac Ao Ar mutualuser Ac Aq Ar mutualsecret -Specifies mutual CHAP authentication credentials. -Note that targets must use either auth-group, chap, or -chap-mutual clauses; it is a configuration error to mix them in one target. -.It Ic initiator-name Ao Ar initiator-name Ac -Specifies iSCSI initiator name. +.It Ic chap Ar user Ar secret +A set of CHAP authentication credentials. +Note that targets must only use one of +.Sy auth-group , chap , No or Sy chap-mutual ; +it is a configuration error to mix multiple types in one target. +.It Ic chap-mutual Ar user Ar secret Ar mutualuser Ar mutualsecret +A set of mutual CHAP authentication credentials. +Note that targets must only use one of +.Sy auth-group , chap , No or Sy chap-mutual ; +it is a configuration error to mix multiple types in one target. +.It Ic initiator-name Ar initiator-name +An iSCSI initiator name. +Only initiators with a name matching one of the defined +names will be allowed to connect. If not defined, there will be no restrictions based on initiator name. -Otherwise, only initiators with names matching one of defined -ones will be allowed to connect. -This clause is mutually exclusive with auth-group; one cannot use +This clause is mutually exclusive with +.Sy auth-group ; +one cannot use both in a single target. -.It Ic initiator-portal Ao Ar address Ac Ao Ar / prefixlen Ac -Specifies the iSCSI initiator portal: an IPv4 or IPv6 address, optionally -followed by slash and prefix length. +.It Ic initiator-portal Ar address Ns Op / Ns Ar prefixlen +An iSCSI initiator portal: an IPv4 or IPv6 address, optionally +followed by a literal slash and a prefix length. +Only initiators with an address matching one of the defined +addresses will be allowed to connect. If not defined, there will be no restrictions based on initiator address. -Otherwise, only initiators with addresses matching one of defined -ones will be allowed to connect. -This clause is mutually exclusive with auth-group; one cannot use +This clause is mutually exclusive with +.Sy auth-group ; +one cannot use both in a single target. -.It Ic portal-group Aq Ar name -Assigns previously defined portal group to the target. -Default portal group is "default", which makes the target available +.It Ic portal-group Ar name +Assign a previously defined portal group to the target. +The default portal group is +.Qq Ar default , +which makes the target available on TCP port 3260 on all configured IPv4 and IPv6 addresses. -.It Ic lun Aq Ar number -Opens a lun configuration section, defining LUN exported by a target. +.It Ic lun Ar number +Create a +.Sy lun +configuration context, defining a LUN exported by the parent target. .El -.Ss lun level -The following statements are available at the lun level: +.Ss lun Context .Bl -tag -width indent -.It Ic backend Ao Ar block | Ar ramdisk Ac -Specifies the CTL backend to use for a given LUN. +.It Ic backend Ar block No | Ar ramdisk +The CTL backend to use for a given LUN. Valid choices are -.Dq block +.Qq Ar block and -.Dq ramdisk ; +.Qq Ar ramdisk ; block is used for LUNs backed by files or disk device nodes; ramdisk is a bitsink device, used mostly for testing. The default backend is block. -.It Ic blocksize Aq Ar size -Specifies blocksize visible to the initiator. +.It Ic blocksize Ar size +The blocksize visible to the initiator. The default blocksize is 512. -.It Ic device-id Aq Ar string -Specifies SCSI Device Identification string presented to the initiator. -.It Ic option Ao Ar name Ac Aq Ar value -Specifies CTL-specific options passed to the kernel. -.It Ic path Aq Ar path -Specifies path to file or device node used to back the LUN. -.It Ic serial Aq Ar string -Specifies SCSI serial number presented to the initiator. -.It Ic size Aq Ar size -Specifies LUN size, in bytes. +.It Ic device-id Ar string +The SCSI Device Identification string presented to the initiator. +.It Ic option Ar name Ar value +The CTL-specific options passed to the kernel. +All CTL-specific options are documented in the +.Sx OPTIONS +section of +.Xr ctladm 8 +.It Ic path Ar path +The path to the file or device node used to back the LUN. +.It Ic serial Ar string +The SCSI serial number presented to the initiator. +.It Ic size Ar size +The LUN size, in bytes. .El .Sh FILES .Bl -tag -width ".Pa /etc/ctl.conf" -compact .It Pa /etc/ctl.conf The default location of the .Xr ctld 8 configuration file. .El .Sh EXAMPLES .Bd -literal pidfile /var/run/ctld.pid auth-group example2 { chap-mutual "user" "secret" "mutualuser" "mutualsecret" chap-mutual "user2" "secret2" "mutualuser" "mutualsecret" initiator-portal 192.168.1.1/16 } portal-group example2 { discovery-auth-group no-authentication listen 127.0.0.1 listen 0.0.0.0:3261 listen [::]:3261 listen [fe80::be:ef] } target iqn.2012-06.com.example:target0 { alias "Example target" auth-group no-authentication lun 0 { path /dev/zvol/example_0 blocksize 4096 size 4G } } target iqn.2012-06.com.example:target3 { chap chapuser chapsecret lun 0 { path /dev/zvol/example_3 } } target iqn.2012-06.com.example:target2 { auth-group example2 portal-group example2 lun 0 { path /dev/zvol/example2_0 } lun 1 { path /dev/zvol/example2_1 option foo bar } } .Ed .Sh SEE ALSO .Xr ctl 4 , .Xr ctladm 8 , .Xr ctld 8 .Sh AUTHORS The .Nm configuration file functionality for .Xr ctld 8 was developed by .An Edward Tomasz Napierala Aq Mt trasz@FreeBSD.org under sponsorship from the FreeBSD Foundation. Index: user/ae/inet6/usr.sbin/iscsid/login.c =================================================================== --- user/ae/inet6/usr.sbin/iscsid/login.c (revision 271452) +++ user/ae/inet6/usr.sbin/iscsid/login.c (revision 271453) @@ -1,959 +1,981 @@ /*- * Copyright (c) 2012 The FreeBSD Foundation * All rights reserved. * * This software was developed by Edward Tomasz Napierala under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include "iscsid.h" #include "iscsi_proto.h" static int login_nsg(const struct pdu *response) { struct iscsi_bhs_login_response *bhslr; bhslr = (struct iscsi_bhs_login_response *)response->pdu_bhs; return (bhslr->bhslr_flags & 0x03); } static void login_set_nsg(struct pdu *request, int nsg) { struct iscsi_bhs_login_request *bhslr; assert(nsg == BHSLR_STAGE_SECURITY_NEGOTIATION || nsg == BHSLR_STAGE_OPERATIONAL_NEGOTIATION || nsg == BHSLR_STAGE_FULL_FEATURE_PHASE); bhslr = (struct iscsi_bhs_login_request *)request->pdu_bhs; bhslr->bhslr_flags &= 0xFC; bhslr->bhslr_flags |= nsg; } static void login_set_csg(struct pdu *request, int csg) { struct iscsi_bhs_login_request *bhslr; assert(csg == BHSLR_STAGE_SECURITY_NEGOTIATION || csg == BHSLR_STAGE_OPERATIONAL_NEGOTIATION || csg == BHSLR_STAGE_FULL_FEATURE_PHASE); bhslr = (struct iscsi_bhs_login_request *)request->pdu_bhs; bhslr->bhslr_flags &= 0xF3; bhslr->bhslr_flags |= csg << 2; } static const char * login_target_error_str(int class, int detail) { static char msg[128]; /* * RFC 3270, 10.13.5. Status-Class and Status-Detail */ switch (class) { case 0x01: switch (detail) { case 0x01: return ("Target moved temporarily"); case 0x02: return ("Target moved permanently"); default: snprintf(msg, sizeof(msg), "unknown redirection; " "Status-Class 0x%x, Status-Detail 0x%x", class, detail); return (msg); } case 0x02: switch (detail) { case 0x00: return ("Initiator error"); case 0x01: return ("Authentication failure"); case 0x02: return ("Authorization failure"); case 0x03: return ("Not found"); case 0x04: return ("Target removed"); case 0x05: return ("Unsupported version"); case 0x06: return ("Too many connections"); case 0x07: return ("Missing parameter"); case 0x08: return ("Can't include in session"); case 0x09: return ("Session type not supported"); case 0x0a: return ("Session does not exist"); case 0x0b: return ("Invalid during login"); default: snprintf(msg, sizeof(msg), "unknown initiator error; " "Status-Class 0x%x, Status-Detail 0x%x", class, detail); return (msg); } case 0x03: switch (detail) { case 0x00: return ("Target error"); case 0x01: return ("Service unavailable"); case 0x02: return ("Out of resources"); default: snprintf(msg, sizeof(msg), "unknown target error; " "Status-Class 0x%x, Status-Detail 0x%x", class, detail); return (msg); } default: snprintf(msg, sizeof(msg), "unknown error; " "Status-Class 0x%x, Status-Detail 0x%x", class, detail); return (msg); } } static void kernel_modify(const struct connection *conn, const char *target_address) { struct iscsi_session_modify ism; int error; memset(&ism, 0, sizeof(ism)); ism.ism_session_id = conn->conn_session_id; memcpy(&ism.ism_conf, &conn->conn_conf, sizeof(ism.ism_conf)); strlcpy(ism.ism_conf.isc_target_addr, target_address, sizeof(ism.ism_conf.isc_target)); error = ioctl(conn->conn_iscsi_fd, ISCSISMODIFY, &ism); if (error != 0) { log_err(1, "failed to redirect to %s: ISCSISMODIFY", target_address); } } /* * XXX: The way it works is suboptimal; what should happen is described * in draft-gilligan-iscsi-fault-tolerance-00. That, however, would * be much more complicated: we would need to keep "dependencies" * for sessions, so that, in case described in draft and using draft * terminology, we would have three sessions: one for discovery, * one for initial target portal, and one for redirect portal. * This would allow us to "backtrack" on connection failure, * as described in draft. */ static void login_handle_redirection(struct connection *conn, struct pdu *response) { struct iscsi_bhs_login_response *bhslr; struct keys *response_keys; const char *target_address; bhslr = (struct iscsi_bhs_login_response *)response->pdu_bhs; assert (bhslr->bhslr_status_class == 1); response_keys = keys_new(); keys_load(response_keys, response); target_address = keys_find(response_keys, "TargetAddress"); if (target_address == NULL) log_errx(1, "received redirection without TargetAddress"); if (target_address[0] == '\0') log_errx(1, "received redirection with empty TargetAddress"); if (strlen(target_address) >= sizeof(conn->conn_conf.isc_target_addr) - 1) log_errx(1, "received TargetAddress is too long"); log_debugx("received redirection to \"%s\"", target_address); kernel_modify(conn, target_address); keys_delete(response_keys); } static struct pdu * login_receive(struct connection *conn) { struct pdu *response; struct iscsi_bhs_login_response *bhslr; const char *errorstr; static bool initial = true; response = pdu_new(conn); pdu_receive(response); if (response->pdu_bhs->bhs_opcode != ISCSI_BHS_OPCODE_LOGIN_RESPONSE) { log_errx(1, "protocol error: received invalid opcode 0x%x", response->pdu_bhs->bhs_opcode); } bhslr = (struct iscsi_bhs_login_response *)response->pdu_bhs; /* * XXX: Implement the C flag some day. */ if ((bhslr->bhslr_flags & BHSLR_FLAGS_CONTINUE) != 0) log_errx(1, "received Login PDU with unsupported \"C\" flag"); if (bhslr->bhslr_version_max != 0x00) log_errx(1, "received Login PDU with unsupported " "Version-max 0x%x", bhslr->bhslr_version_max); if (bhslr->bhslr_version_active != 0x00) log_errx(1, "received Login PDU with unsupported " "Version-active 0x%x", bhslr->bhslr_version_active); if (bhslr->bhslr_status_class == 1) { login_handle_redirection(conn, response); log_debugx("redirection handled; exiting"); exit(0); } if (bhslr->bhslr_status_class != 0) { errorstr = login_target_error_str(bhslr->bhslr_status_class, bhslr->bhslr_status_detail); fail(conn, errorstr); log_errx(1, "target returned error: %s", errorstr); } if (initial == false && ntohl(bhslr->bhslr_statsn) != conn->conn_statsn + 1) { /* * It's a warning, not an error, to work around what seems * to be bug in NetBSD iSCSI target. */ log_warnx("received Login PDU with wrong StatSN: " "is %d, should be %d", ntohl(bhslr->bhslr_statsn), conn->conn_statsn + 1); } conn->conn_tsih = ntohs(bhslr->bhslr_tsih); conn->conn_statsn = ntohl(bhslr->bhslr_statsn); initial = false; return (response); } static struct pdu * login_new_request(struct connection *conn, int csg) { struct pdu *request; struct iscsi_bhs_login_request *bhslr; int nsg; request = pdu_new(conn); bhslr = (struct iscsi_bhs_login_request *)request->pdu_bhs; bhslr->bhslr_opcode = ISCSI_BHS_OPCODE_LOGIN_REQUEST | ISCSI_BHS_OPCODE_IMMEDIATE; bhslr->bhslr_flags = BHSLR_FLAGS_TRANSIT; switch (csg) { case BHSLR_STAGE_SECURITY_NEGOTIATION: nsg = BHSLR_STAGE_OPERATIONAL_NEGOTIATION; break; case BHSLR_STAGE_OPERATIONAL_NEGOTIATION: nsg = BHSLR_STAGE_FULL_FEATURE_PHASE; break; default: assert(!"invalid csg"); log_errx(1, "invalid csg %d", csg); } login_set_csg(request, csg); login_set_nsg(request, nsg); memcpy(bhslr->bhslr_isid, &conn->conn_isid, sizeof(bhslr->bhslr_isid)); bhslr->bhslr_tsih = htons(conn->conn_tsih); bhslr->bhslr_initiator_task_tag = 0; bhslr->bhslr_cmdsn = 0; bhslr->bhslr_expstatsn = htonl(conn->conn_statsn + 1); return (request); } static int login_list_prefers(const char *list, const char *choice1, const char *choice2) { char *tofree, *str, *token; tofree = str = checked_strdup(list); while ((token = strsep(&str, ",")) != NULL) { if (strcmp(token, choice1) == 0) { free(tofree); return (1); } if (strcmp(token, choice2) == 0) { free(tofree); return (2); } } free(tofree); return (-1); } static int login_hex2int(const char hex) { switch (hex) { case '0': return (0x00); case '1': return (0x01); case '2': return (0x02); case '3': return (0x03); case '4': return (0x04); case '5': return (0x05); case '6': return (0x06); case '7': return (0x07); case '8': return (0x08); case '9': return (0x09); case 'a': case 'A': return (0x0a); case 'b': case 'B': return (0x0b); case 'c': case 'C': return (0x0c); case 'd': case 'D': return (0x0d); case 'e': case 'E': return (0x0e); case 'f': case 'F': return (0x0f); default: return (-1); } } /* * XXX: Review this _carefully_. */ static int login_hex2bin(const char *hex, char **binp, size_t *bin_lenp) { int i, hex_len, nibble; bool lo = true; /* As opposed to 'hi'. */ char *bin; size_t bin_off, bin_len; if (strncasecmp(hex, "0x", strlen("0x")) != 0) { log_warnx("malformed variable, should start with \"0x\""); return (-1); } hex += strlen("0x"); hex_len = strlen(hex); if (hex_len < 1) { log_warnx("malformed variable; doesn't contain anything " "but \"0x\""); return (-1); } bin_len = hex_len / 2 + hex_len % 2; bin = calloc(bin_len, 1); if (bin == NULL) log_err(1, "calloc"); bin_off = bin_len - 1; for (i = hex_len - 1; i >= 0; i--) { nibble = login_hex2int(hex[i]); if (nibble < 0) { log_warnx("malformed variable, invalid char \"%c\"", hex[i]); return (-1); } assert(bin_off < bin_len); if (lo) { bin[bin_off] = nibble; lo = false; } else { bin[bin_off] |= nibble << 4; bin_off--; lo = true; } } *binp = bin; *bin_lenp = bin_len; return (0); } static char * login_bin2hex(const char *bin, size_t bin_len) { unsigned char *hex, *tmp, ch; size_t hex_len; size_t i; hex_len = bin_len * 2 + 3; /* +2 for "0x", +1 for '\0'. */ hex = malloc(hex_len); if (hex == NULL) log_err(1, "malloc"); tmp = hex; tmp += sprintf(tmp, "0x"); for (i = 0; i < bin_len; i++) { ch = bin[i]; tmp += sprintf(tmp, "%02x", ch); } return (hex); } static void login_compute_md5(const char id, const char *secret, const void *challenge, size_t challenge_len, void *response, size_t response_len) { MD5_CTX ctx; int rv; assert(response_len == MD5_DIGEST_LENGTH); MD5_Init(&ctx); MD5_Update(&ctx, &id, sizeof(id)); MD5_Update(&ctx, secret, strlen(secret)); MD5_Update(&ctx, challenge, challenge_len); rv = MD5_Final(response, &ctx); if (rv != 1) log_errx(1, "MD5_Final"); } static void login_negotiate_key(struct connection *conn, const char *name, const char *value) { int which, tmp; if (strcmp(name, "TargetAlias") == 0) { strlcpy(conn->conn_target_alias, value, sizeof(conn->conn_target_alias)); } else if (strcmp(value, "Irrelevant") == 0) { /* Ignore. */ } else if (strcmp(name, "HeaderDigest") == 0) { which = login_list_prefers(value, "CRC32C", "None"); switch (which) { case 1: log_debugx("target prefers CRC32C " "for header digest; we'll use it"); conn->conn_header_digest = CONN_DIGEST_CRC32C; break; case 2: log_debugx("target prefers not to do " "header digest; we'll comply"); break; default: log_warnx("target sent unrecognized " "HeaderDigest value \"%s\"; will use None", value); break; } } else if (strcmp(name, "DataDigest") == 0) { which = login_list_prefers(value, "CRC32C", "None"); switch (which) { case 1: log_debugx("target prefers CRC32C " "for data digest; we'll use it"); conn->conn_data_digest = CONN_DIGEST_CRC32C; break; case 2: log_debugx("target prefers not to do " "data digest; we'll comply"); break; default: log_warnx("target sent unrecognized " "DataDigest value \"%s\"; will use None", value); break; } } else if (strcmp(name, "MaxConnections") == 0) { /* Ignore. */ } else if (strcmp(name, "InitialR2T") == 0) { if (strcmp(value, "Yes") == 0) conn->conn_initial_r2t = true; else conn->conn_initial_r2t = false; } else if (strcmp(name, "ImmediateData") == 0) { if (strcmp(value, "Yes") == 0) conn->conn_immediate_data = true; else conn->conn_immediate_data = false; } else if (strcmp(name, "MaxRecvDataSegmentLength") == 0) { tmp = strtoul(value, NULL, 10); if (tmp <= 0) log_errx(1, "received invalid " "MaxRecvDataSegmentLength"); conn->conn_max_data_segment_length = tmp; } else if (strcmp(name, "MaxBurstLength") == 0) { if (conn->conn_immediate_data) { tmp = strtoul(value, NULL, 10); if (tmp <= 0) log_errx(1, "received invalid MaxBurstLength"); conn->conn_max_burst_length = tmp; } } else if (strcmp(name, "FirstBurstLength") == 0) { tmp = strtoul(value, NULL, 10); if (tmp <= 0) log_errx(1, "received invalid FirstBurstLength"); conn->conn_first_burst_length = tmp; } else if (strcmp(name, "DefaultTime2Wait") == 0) { /* Ignore */ } else if (strcmp(name, "DefaultTime2Retain") == 0) { /* Ignore */ } else if (strcmp(name, "MaxOutstandingR2T") == 0) { /* Ignore */ } else if (strcmp(name, "DataPDUInOrder") == 0) { /* Ignore */ } else if (strcmp(name, "DataSequenceInOrder") == 0) { /* Ignore */ } else if (strcmp(name, "ErrorRecoveryLevel") == 0) { /* Ignore */ } else if (strcmp(name, "OFMarker") == 0) { /* Ignore */ } else if (strcmp(name, "IFMarker") == 0) { /* Ignore */ } else if (strcmp(name, "TargetPortalGroupTag") == 0) { /* Ignore */ } else { log_debugx("unknown key \"%s\"; ignoring", name); } } static void login_negotiate(struct connection *conn) { struct pdu *request, *response; struct keys *request_keys, *response_keys; struct iscsi_bhs_login_response *bhslr; - int i; + int i, nrequests = 0; log_debugx("beginning operational parameter negotiation"); request = login_new_request(conn, BHSLR_STAGE_OPERATIONAL_NEGOTIATION); request_keys = keys_new(); /* * The following keys are irrelevant for discovery sessions. */ if (conn->conn_conf.isc_discovery == 0) { if (conn->conn_conf.isc_header_digest != 0) keys_add(request_keys, "HeaderDigest", "CRC32C"); else keys_add(request_keys, "HeaderDigest", "None"); if (conn->conn_conf.isc_data_digest != 0) keys_add(request_keys, "DataDigest", "CRC32C"); else keys_add(request_keys, "DataDigest", "None"); keys_add(request_keys, "ImmediateData", "Yes"); keys_add_int(request_keys, "MaxBurstLength", ISCSI_MAX_DATA_SEGMENT_LENGTH); keys_add_int(request_keys, "FirstBurstLength", ISCSI_MAX_DATA_SEGMENT_LENGTH); keys_add(request_keys, "InitialR2T", "Yes"); } else { keys_add(request_keys, "HeaderDigest", "None"); keys_add(request_keys, "DataDigest", "None"); } keys_add_int(request_keys, "MaxRecvDataSegmentLength", ISCSI_MAX_DATA_SEGMENT_LENGTH); keys_add(request_keys, "DefaultTime2Wait", "0"); keys_add(request_keys, "DefaultTime2Retain", "0"); keys_add(request_keys, "ErrorRecoveryLevel", "0"); keys_add(request_keys, "MaxOutstandingR2T", "1"); keys_save(request_keys, request); keys_delete(request_keys); request_keys = NULL; pdu_send(request); pdu_delete(request); request = NULL; response = login_receive(conn); response_keys = keys_new(); keys_load(response_keys, response); for (i = 0; i < KEYS_MAX; i++) { if (response_keys->keys_names[i] == NULL) break; login_negotiate_key(conn, response_keys->keys_names[i], response_keys->keys_values[i]); } - bhslr = (struct iscsi_bhs_login_response *)response->pdu_bhs; - if ((bhslr->bhslr_flags & BHSLR_FLAGS_TRANSIT) == 0) - log_warnx("received final login response " - "without the \"T\" flag"); - else if (login_nsg(response) != BHSLR_STAGE_FULL_FEATURE_PHASE) + keys_delete(response_keys); + response_keys = NULL; + + for (;;) { + bhslr = (struct iscsi_bhs_login_response *)response->pdu_bhs; + if ((bhslr->bhslr_flags & BHSLR_FLAGS_TRANSIT) != 0) + break; + + nrequests++; + if (nrequests > 5) { + log_warnx("received login response " + "without the \"T\" flag too many times; giving up"); + break; + } + + log_debugx("received login response " + "without the \"T\" flag; sending another request"); + + pdu_delete(response); + + request = login_new_request(conn, + BHSLR_STAGE_OPERATIONAL_NEGOTIATION); + pdu_send(request); + pdu_delete(request); + + response = login_receive(conn); + } + + if (login_nsg(response) != BHSLR_STAGE_FULL_FEATURE_PHASE) log_warnx("received final login response with wrong NSG 0x%x", login_nsg(response)); + pdu_delete(response); log_debugx("operational parameter negotiation done; " "transitioning to Full Feature phase"); - - keys_delete(response_keys); - pdu_delete(response); } static void login_send_chap_a(struct connection *conn) { struct pdu *request; struct keys *request_keys; request = login_new_request(conn, BHSLR_STAGE_SECURITY_NEGOTIATION); request_keys = keys_new(); keys_add(request_keys, "CHAP_A", "5"); keys_save(request_keys, request); keys_delete(request_keys); pdu_send(request); pdu_delete(request); } static void login_send_chap_r(struct pdu *response) { struct connection *conn; struct pdu *request; struct keys *request_keys, *response_keys; const char *chap_a, *chap_c, *chap_i; char *chap_r, *challenge, response_bin[MD5_DIGEST_LENGTH]; size_t challenge_len; int error, rv; unsigned char id; char *mutual_chap_c, mutual_chap_i[4]; /* * As in the rest of the initiator, 'request' means * 'initiator -> target', and 'response' means 'target -> initiator', * * So, here the 'response' from the target is the packet that contains * CHAP challenge; our CHAP response goes into 'request'. */ conn = response->pdu_connection; response_keys = keys_new(); keys_load(response_keys, response); /* * First, compute the response. */ chap_a = keys_find(response_keys, "CHAP_A"); if (chap_a == NULL) log_errx(1, "received CHAP packet without CHAP_A"); chap_c = keys_find(response_keys, "CHAP_C"); if (chap_c == NULL) log_errx(1, "received CHAP packet without CHAP_C"); chap_i = keys_find(response_keys, "CHAP_I"); if (chap_i == NULL) log_errx(1, "received CHAP packet without CHAP_I"); if (strcmp(chap_a, "5") != 0) log_errx(1, "received CHAP packet " "with unsupported CHAP_A \"%s\"", chap_a); id = strtoul(chap_i, NULL, 10); error = login_hex2bin(chap_c, &challenge, &challenge_len); if (error != 0) log_errx(1, "received CHAP packet with malformed CHAP_C"); login_compute_md5(id, conn->conn_conf.isc_secret, challenge, challenge_len, response_bin, sizeof(response_bin)); free(challenge); chap_r = login_bin2hex(response_bin, sizeof(response_bin)); keys_delete(response_keys); request = login_new_request(conn, BHSLR_STAGE_SECURITY_NEGOTIATION); request_keys = keys_new(); keys_add(request_keys, "CHAP_N", conn->conn_conf.isc_user); keys_add(request_keys, "CHAP_R", chap_r); free(chap_r); /* * If we want mutual authentication, we're expected to send * our CHAP_I/CHAP_C now. */ if (conn->conn_conf.isc_mutual_user[0] != '\0') { log_debugx("requesting mutual authentication; " "binary challenge size is %zd bytes", sizeof(conn->conn_mutual_challenge)); rv = RAND_bytes(conn->conn_mutual_challenge, sizeof(conn->conn_mutual_challenge)); if (rv != 1) { log_errx(1, "RAND_bytes failed: %s", ERR_error_string(ERR_get_error(), NULL)); } rv = RAND_bytes(&conn->conn_mutual_id, sizeof(conn->conn_mutual_id)); if (rv != 1) { log_errx(1, "RAND_bytes failed: %s", ERR_error_string(ERR_get_error(), NULL)); } mutual_chap_c = login_bin2hex(conn->conn_mutual_challenge, sizeof(conn->conn_mutual_challenge)); snprintf(mutual_chap_i, sizeof(mutual_chap_i), "%d", conn->conn_mutual_id); keys_add(request_keys, "CHAP_I", mutual_chap_i); keys_add(request_keys, "CHAP_C", mutual_chap_c); free(mutual_chap_c); } keys_save(request_keys, request); keys_delete(request_keys); pdu_send(request); pdu_delete(request); } static void login_verify_mutual(const struct pdu *response) { struct connection *conn; struct keys *response_keys; const char *chap_n, *chap_r; char *response_bin, expected_response_bin[MD5_DIGEST_LENGTH]; size_t response_bin_len; int error; conn = response->pdu_connection; response_keys = keys_new(); keys_load(response_keys, response); chap_n = keys_find(response_keys, "CHAP_N"); if (chap_n == NULL) log_errx(1, "received CHAP Response PDU without CHAP_N"); chap_r = keys_find(response_keys, "CHAP_R"); if (chap_r == NULL) log_errx(1, "received CHAP Response PDU without CHAP_R"); error = login_hex2bin(chap_r, &response_bin, &response_bin_len); if (error != 0) log_errx(1, "received CHAP Response PDU with malformed CHAP_R"); if (strcmp(chap_n, conn->conn_conf.isc_mutual_user) != 0) { fail(conn, "Mutual CHAP failed"); log_errx(1, "mutual CHAP authentication failed: wrong user"); } login_compute_md5(conn->conn_mutual_id, conn->conn_conf.isc_mutual_secret, conn->conn_mutual_challenge, sizeof(conn->conn_mutual_challenge), expected_response_bin, sizeof(expected_response_bin)); if (memcmp(response_bin, expected_response_bin, sizeof(expected_response_bin)) != 0) { fail(conn, "Mutual CHAP failed"); log_errx(1, "mutual CHAP authentication failed: wrong secret"); } keys_delete(response_keys); free(response_bin); log_debugx("mutual CHAP authentication succeeded"); } static void login_chap(struct connection *conn) { struct pdu *response; log_debugx("beginning CHAP authentication; sending CHAP_A"); login_send_chap_a(conn); log_debugx("waiting for CHAP_A/CHAP_C/CHAP_I"); response = login_receive(conn); log_debugx("sending CHAP_N/CHAP_R"); login_send_chap_r(response); pdu_delete(response); /* * XXX: Make sure this is not susceptible to MITM. */ log_debugx("waiting for CHAP result"); response = login_receive(conn); if (conn->conn_conf.isc_mutual_user[0] != '\0') login_verify_mutual(response); pdu_delete(response); log_debugx("CHAP authentication done"); } void login(struct connection *conn) { struct pdu *request, *response; struct keys *request_keys, *response_keys; struct iscsi_bhs_login_response *bhslr2; const char *auth_method; int i; log_debugx("beginning Login phase; sending Login PDU"); request = login_new_request(conn, BHSLR_STAGE_SECURITY_NEGOTIATION); request_keys = keys_new(); if (conn->conn_conf.isc_mutual_user[0] != '\0') { keys_add(request_keys, "AuthMethod", "CHAP"); } else if (conn->conn_conf.isc_user[0] != '\0') { /* * Give target a chance to skip authentication if it * doesn't feel like it. * * None is first, CHAP second; this is to work around * what seems to be LIO (Linux target) bug: otherwise, * if target is configured with no authentication, * and we are configured to authenticate, the target * will erroneously respond with AuthMethod=CHAP * instead of AuthMethod=None, and will subsequently * fail the connection. This usually happens with * Discovery sessions, which default to no authentication. */ keys_add(request_keys, "AuthMethod", "None,CHAP"); } else { keys_add(request_keys, "AuthMethod", "None"); } keys_add(request_keys, "InitiatorName", conn->conn_conf.isc_initiator); if (conn->conn_conf.isc_initiator_alias[0] != '\0') { keys_add(request_keys, "InitiatorAlias", conn->conn_conf.isc_initiator_alias); } if (conn->conn_conf.isc_discovery == 0) { keys_add(request_keys, "SessionType", "Normal"); keys_add(request_keys, "TargetName", conn->conn_conf.isc_target); } else { keys_add(request_keys, "SessionType", "Discovery"); } keys_save(request_keys, request); keys_delete(request_keys); pdu_send(request); pdu_delete(request); response = login_receive(conn); response_keys = keys_new(); keys_load(response_keys, response); for (i = 0; i < KEYS_MAX; i++) { if (response_keys->keys_names[i] == NULL) break; /* * Not interested in AuthMethod at this point; we only need * to parse things such as TargetAlias. * * XXX: This is somewhat ugly. We should have a way to apply * all the keys to the session and use that by default * instead of discarding them. */ if (strcmp(response_keys->keys_names[i], "AuthMethod") == 0) continue; login_negotiate_key(conn, response_keys->keys_names[i], response_keys->keys_values[i]); } bhslr2 = (struct iscsi_bhs_login_response *)response->pdu_bhs; if ((bhslr2->bhslr_flags & BHSLR_FLAGS_TRANSIT) != 0 && login_nsg(response) == BHSLR_STAGE_OPERATIONAL_NEGOTIATION) { if (conn->conn_conf.isc_mutual_user[0] != '\0') { log_errx(1, "target requested transition " "to operational parameter negotiation, " "but we require mutual CHAP"); } log_debugx("target requested transition " "to operational parameter negotiation"); keys_delete(response_keys); pdu_delete(response); login_negotiate(conn); return; } auth_method = keys_find(response_keys, "AuthMethod"); if (auth_method == NULL) log_errx(1, "received response without AuthMethod"); if (strcmp(auth_method, "None") == 0) { if (conn->conn_conf.isc_mutual_user[0] != '\0') { log_errx(1, "target does not require authantication, " "but we require mutual CHAP"); } log_debugx("target does not require authentication"); keys_delete(response_keys); pdu_delete(response); login_negotiate(conn); return; } if (strcmp(auth_method, "CHAP") != 0) { fail(conn, "Unsupported AuthMethod"); log_errx(1, "received response " "with unsupported AuthMethod \"%s\"", auth_method); } if (conn->conn_conf.isc_user[0] == '\0' || conn->conn_conf.isc_secret[0] == '\0') { fail(conn, "Authentication required"); log_errx(1, "target requests CHAP authentication, but we don't " "have user and secret"); } keys_delete(response_keys); response_keys = NULL; pdu_delete(response); response = NULL; login_chap(conn); login_negotiate(conn); } Index: user/ae/inet6 =================================================================== --- user/ae/inet6 (revision 271452) +++ user/ae/inet6 (revision 271453) Property changes on: user/ae/inet6 ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head:r271428-271452