Index: head/lib/libefivar/efi-osdep.h
===================================================================
--- head/lib/libefivar/efi-osdep.h	(revision 343754)
+++ head/lib/libefivar/efi-osdep.h	(revision 343755)
@@ -1,111 +1,110 @@
/*-
 * Copyright (c) 2017 Netflix, Inc.
- * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#ifndef _EFI_OSDEP_H_
#define _EFI_OSDEP_H_

/*
 * Defines to adjust the types that EDK2 uses for FreeBSD so we can
 * use the code and headers mostly unchanged. The headers are all
 * imported into one directory, to avoid case issues with filenames,
 * and included from there. The actual code is heavily modified since
 * it has too many annoying dependencies that are difficult to satisfy.
 */

#include
#include
#include
#include

typedef int8_t INT8;
typedef int16_t INT16;
typedef int32_t INT32;
typedef int64_t INT64;
typedef intptr_t INTN;
typedef uint8_t UINT8;
typedef uint16_t UINT16;
typedef uint32_t UINT32;
typedef uint64_t UINT64;
typedef uintptr_t UINTN;
//typedef uintptr_t EFI_PHYSICAL_ADDRESS;
//typedef uint32_t EFI_IPv4_ADDRESS;
//typedef uint8_t EFI_MAC_ADDRESS[6];
//typedef uint8_t EFI_IPv6_ADDRESS[16];
typedef uint8_t CHAR8;
typedef uint16_t CHAR16;
typedef UINT8 BOOLEAN;
typedef void VOID;
//typedef uuid_t GUID;
//typedef uuid_t EFI_GUID;

/* We can't actually call this stuff, so snip out API syntactic sugar */
#define INTERFACE_DECL(x)
#define EFIAPI
#define IN
#define OUT
#define CONST const
#define OPTIONAL

//#define TRUE 1
//#define FALSE 0

/*
 * EDK2 has fine definitions for these, so let it define them.
 */
#undef NULL
#undef EFI_PAGE_SIZE
#undef EFI_PAGE_MASK

/*
 * Note: the EDK2 code assumes #pragma pack works, and PACKED is a
 * workaround for some old toolchain issues for EDK2 that aren't
 * relevant to FreeBSD.
 */
#define PACKED

/*
 * Since we're not compiling for UEFI boot time (which uses MS ABI
 * conventions), tell EDK2 to define VA_START correctly. For the boot
 * loader, this likely needs to be different.
 */
#define NO_MSABI_VA_FUNCS 1
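/*
 * Illustration (not part of this commit): the type mappings above can be
 * sanity-checked against <stdint.h> with C11 static assertions. A minimal
 * sketch, assuming a C11 compiler:
 */
#if 0
_Static_assert(sizeof(UINT64) == 8, "UINT64 must be 8 bytes");
_Static_assert(sizeof(CHAR16) == 2, "CHAR16 is UCS-2, 2 bytes");
_Static_assert(sizeof(UINTN) == sizeof(void *), "UINTN is pointer-sized");
#endif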
/*
 * Finally, we need to define, in EDK2 terms, the processor we are
 * compiling for.
 */
#if defined(__i386__)
#define MDE_CPU_IA32
#elif defined(__amd64__)
#define MDE_CPU_X64
#elif defined(__arm__)
#define MDE_CPU_ARM
#elif defined(__aarch64__)
#define MDE_CPU_AARCH64
#endif
/* FreeBSD doesn't have/use MDE_CPU_EBC or MDE_CPU_IPF (ia64) */

#endif /* _EFI_OSDEP_H_ */
Index: head/lib/libefivar/efivar-dp-format.c
===================================================================
--- head/lib/libefivar/efivar-dp-format.c	(revision 343754)
+++ head/lib/libefivar/efivar-dp-format.c	(revision 343755)
@@ -1,2474 +1,2473 @@
/*-
 * Copyright (c) 2017 Netflix, Inc.
- * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Routines to format EFI_DEVICE_PATHs from the UEFI standard. Much of
 * this file is taken from EDK2 and rototilled.
 */

#include
__FBSDID("$FreeBSD$");

#include
#include
#include

#include "efichar.h"
#include "efi-osdep.h"
#include "efivar-dp.h"
#include "uefi-dplib.h"

/*
 * This is a lie, but since we have converted everything
 * from wide to narrow, it's the right lie now.
 */
#define UnicodeSPrint snprintf

/*
 * Taken from MdePkg/Library/UefiDevicePathLib/DevicePathToText.c
 * hash a11928f3310518ab1c6fd34e8d0fdbb72de9602c 2017-Mar-01
 * heavily modified:
 *	wide strings converted to narrow
 *	low-level printing code redone for narrow strings
 *	routines made static
 *	%s -> %S in spots (where it is still UCS-2)
 *	%a (ascii) -> %s
 *	%g -> %36s hack to print guid (see above for caveat)
 *	some tidying up of const and deconsting. It's evil, but const
 *	poisoning the whole file was too much.
 */

/** @file
  DevicePathToText protocol as defined in the UEFI 2.0 specification.

(C) Copyright 2015 Hewlett-Packard Development Company, L.P.
Copyright (c) 2013 - 2015, Intel Corporation. All rights reserved.
This program and the accompanying materials are licensed and made
available under the terms and conditions of the BSD License which
accompanies this distribution. The full text of the license may be
found at http://opensource.org/licenses/bsd-license.php

THE PROGRAM IS DISTRIBUTED UNDER THE BSD LICENSE ON AN "AS IS" BASIS,
WITHOUT WARRANTIES OR REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED.

**/

// #include "UefiDevicePathLib.h"

/**
  Concatenates a formatted unicode string to allocated pool. The caller must
  free the resulting buffer.

  @param Str   Tracks the allocated pool, size in use, and amount of pool allocated.
  @param Fmt   The format string.
  @param ...   Variable arguments based on the format string.

  @return Allocated buffer with the formatted string printed in it.
          The caller must free the allocated buffer. The buffer allocation is
          not packed.

**/
static char *
EFIAPI
UefiDevicePathLibCatPrint (
  IN OUT POOL_PRINT *Str,
  IN const char *Fmt,
  ...
  )
{
  UINTN Count;
  VA_LIST Args;

  VA_START (Args, Fmt);
  Count = vsnprintf(NULL, 0, Fmt, Args);
  VA_END(Args);

  if ((Str->Count + (Count + 1)) > Str->Capacity) {
    Str->Capacity = (Str->Count + (Count + 1) * 2);
    Str->Str = reallocf(Str->Str, Str->Capacity);
    ASSERT (Str->Str != NULL);
  }
  VA_START (Args, Fmt);
  vsnprintf(Str->Str + Str->Count, Str->Capacity - Str->Count, Fmt, Args);
  Str->Count += Count;
  VA_END (Args);
  return Str->Str;
}

/**
  Converts a PCI device path structure to its string representative.

  @param Str             The string representative of input device.
  @param DevPath         The input device path structure.
  @param DisplayOnly     If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used.
  @param AllowShortcuts  If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable.

**/
static VOID
DevPathToTextPci (
  IN OUT POOL_PRINT *Str,
  IN VOID *DevPath,
  IN BOOLEAN DisplayOnly,
  IN BOOLEAN AllowShortcuts
  )
{
  PCI_DEVICE_PATH *Pci;

  Pci = DevPath;
  UefiDevicePathLibCatPrint (Str, "Pci(0x%x,0x%x)", Pci->Device, Pci->Function);
}

/**
  Converts a PC Card device path structure to its string representative.

  @param Str             The string representative of input device.
  @param DevPath         The input device path structure.
  @param DisplayOnly     If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used.
  @param AllowShortcuts  If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable.

**/
static VOID
DevPathToTextPccard (
  IN OUT POOL_PRINT *Str,
  IN VOID *DevPath,
  IN BOOLEAN DisplayOnly,
  IN BOOLEAN AllowShortcuts
  )
{
  PCCARD_DEVICE_PATH *Pccard;

  Pccard = DevPath;
  UefiDevicePathLibCatPrint (Str, "PcCard(0x%x)", Pccard->FunctionNumber);
}
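/*
 * Illustration (not part of this commit): the two-pass vsnprintf sizing
 * pattern that UefiDevicePathLibCatPrint relies on, reduced to a standalone
 * sketch. The pool_print struct here is a stand-in with the same three
 * fields the real code touches; reallocf() is FreeBSD libc.
 */
#if 0
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

struct pool_print {
	char	*str;		/* accumulated narrow string */
	size_t	 count;		/* bytes in use, not counting NUL */
	size_t	 capacity;	/* bytes allocated */
};

static char *
cat_print(struct pool_print *p, const char *fmt, ...)
{
	va_list ap;
	int n;

	/* First pass: measure the formatted length without writing. */
	va_start(ap, fmt);
	n = vsnprintf(NULL, 0, fmt, ap);
	va_end(ap);

	/* Grow the buffer; reallocf() frees the old block on failure. */
	if (p->count + n + 1 > p->capacity) {
		p->capacity = p->count + (n + 1) * 2;
		p->str = reallocf(p->str, p->capacity);
		if (p->str == NULL)
			abort();
	}

	/* Second pass: append for real, including the NUL terminator. */
	va_start(ap, fmt);
	vsnprintf(p->str + p->count, p->capacity - p->count, fmt, ap);
	va_end(ap);
	p->count += n;
	return (p->str);
}
#endif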
/**
  Converts a Memory Map device path structure to its string representative.

  @param Str             The string representative of input device.
  @param DevPath         The input device path structure.
  @param DisplayOnly     If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used.
  @param AllowShortcuts  If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable.

**/
static VOID
DevPathToTextMemMap (
  IN OUT POOL_PRINT *Str,
  IN VOID *DevPath,
  IN BOOLEAN DisplayOnly,
  IN BOOLEAN AllowShortcuts
  )
{
  MEMMAP_DEVICE_PATH *MemMap;

  MemMap = DevPath;
  UefiDevicePathLibCatPrint (
    Str,
    "MemoryMapped(0x%x,0x%lx,0x%lx)",
    MemMap->MemoryType,
    MemMap->StartingAddress,
    MemMap->EndingAddress
    );
}

/**
  Converts a Vendor device path structure to its string representative.

  @param Str             The string representative of input device.
  @param DevPath         The input device path structure.
  @param DisplayOnly     If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used.
  @param AllowShortcuts  If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable.

**/
static VOID
DevPathToTextVendor (
  IN OUT POOL_PRINT *Str,
  IN VOID *DevPath,
  IN BOOLEAN DisplayOnly,
  IN BOOLEAN AllowShortcuts
  )
{
  VENDOR_DEVICE_PATH *Vendor;
  const char *Type;
  UINTN Index;
  UINTN DataLength;
  UINT32 FlowControlMap;
  UINT16 Info;

  Vendor = (VENDOR_DEVICE_PATH *) DevPath;
  switch (DevicePathType (&Vendor->Header)) {
  case HARDWARE_DEVICE_PATH:
    Type = "Hw";
    break;
  case MESSAGING_DEVICE_PATH:
    Type = "Msg";
    if (AllowShortcuts) {
      if (CompareGuid (&Vendor->Guid, &gEfiPcAnsiGuid)) {
        UefiDevicePathLibCatPrint (Str, "VenPcAnsi()");
        return ;
      } else if (CompareGuid (&Vendor->Guid, &gEfiVT100Guid)) {
        UefiDevicePathLibCatPrint (Str, "VenVt100()");
        return ;
      } else if (CompareGuid (&Vendor->Guid, &gEfiVT100PlusGuid)) {
        UefiDevicePathLibCatPrint (Str, "VenVt100Plus()");
        return ;
      } else if (CompareGuid (&Vendor->Guid, &gEfiVTUTF8Guid)) {
        UefiDevicePathLibCatPrint (Str, "VenUtf8()");
        return ;
      } else if (CompareGuid (&Vendor->Guid, &gEfiUartDevicePathGuid)) {
        FlowControlMap = (((UART_FLOW_CONTROL_DEVICE_PATH *) Vendor)->FlowControlMap);
        switch (FlowControlMap & 0x00000003) {
        case 0:
          UefiDevicePathLibCatPrint (Str, "UartFlowCtrl(%s)", "None");
          break;
        case 1:
          UefiDevicePathLibCatPrint (Str, "UartFlowCtrl(%s)", "Hardware");
          break;
        case 2:
          UefiDevicePathLibCatPrint (Str, "UartFlowCtrl(%s)", "XonXoff");
          break;
        default:
          break;
        }
        return ;
      } else if (CompareGuid (&Vendor->Guid, &gEfiSasDevicePathGuid)) {
        UefiDevicePathLibCatPrint (
          Str,
          "SAS(0x%lx,0x%lx,0x%x,",
          ((SAS_DEVICE_PATH *) Vendor)->SasAddress,
          ((SAS_DEVICE_PATH *) Vendor)->Lun,
          ((SAS_DEVICE_PATH *) Vendor)->RelativeTargetPort
          );
        Info = (((SAS_DEVICE_PATH *) Vendor)->DeviceTopology);
        if (((Info & 0x0f) == 0) && ((Info & BIT7) == 0)) {
          UefiDevicePathLibCatPrint (Str, "NoTopology,0,0,0,");
        } else if (((Info & 0x0f) <= 2) && ((Info & BIT7) == 0)) {
          UefiDevicePathLibCatPrint (
            Str,
            "%s,%s,%s,",
            ((Info & BIT4) != 0) ? "SATA" : "SAS",
            ((Info & BIT5) != 0) ? "External" : "Internal",
"Expanded" : "Direct" ); if ((Info & 0x0f) == 1) { UefiDevicePathLibCatPrint (Str, "0,"); } else { // // Value 0x0 thru 0xFF -> Drive 1 thru Drive 256 // UefiDevicePathLibCatPrint (Str, "0x%x,", ((Info >> 8) & 0xff) + 1); } } else { UefiDevicePathLibCatPrint (Str, "0x%x,0,0,0,", Info); } UefiDevicePathLibCatPrint (Str, "0x%x)", ((SAS_DEVICE_PATH *) Vendor)->Reserved); return ; } else if (CompareGuid (&Vendor->Guid, &gEfiDebugPortProtocolGuid)) { UefiDevicePathLibCatPrint (Str, "DebugPort()"); return ; } } break; case MEDIA_DEVICE_PATH: Type = "Media"; break; default: Type = "?"; break; } DataLength = DevicePathNodeLength (&Vendor->Header) - sizeof (VENDOR_DEVICE_PATH); UefiDevicePathLibCatPrint (Str, "Ven%s(%36s", Type, G(&Vendor->Guid)); if (DataLength != 0) { UefiDevicePathLibCatPrint (Str, ","); for (Index = 0; Index < DataLength; Index++) { UefiDevicePathLibCatPrint (Str, "%02x", ((VENDOR_DEVICE_PATH_WITH_DATA *) Vendor)->VendorDefinedData[Index]); } } UefiDevicePathLibCatPrint (Str, ")"); } /** Converts a Controller device path structure to its string representative. @param Str The string representative of input device. @param DevPath The input device path structure. @param DisplayOnly If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used. @param AllowShortcuts If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable. **/ static VOID DevPathToTextController ( IN OUT POOL_PRINT *Str, IN VOID *DevPath, IN BOOLEAN DisplayOnly, IN BOOLEAN AllowShortcuts ) { CONTROLLER_DEVICE_PATH *Controller; Controller = DevPath; UefiDevicePathLibCatPrint ( Str, "Ctrl(0x%x)", Controller->ControllerNumber ); } /** Converts a BMC device path structure to its string representative. @param Str The string representative of input device. @param DevPath The input device path structure. @param DisplayOnly If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used. @param AllowShortcuts If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable. **/ static VOID DevPathToTextBmc ( IN OUT POOL_PRINT *Str, IN VOID *DevPath, IN BOOLEAN DisplayOnly, IN BOOLEAN AllowShortcuts ) { BMC_DEVICE_PATH *Bmc; Bmc = DevPath; UefiDevicePathLibCatPrint ( Str, "BMC(0x%x,0x%lx)", Bmc->InterfaceType, ReadUnaligned64 ((&Bmc->BaseAddress)) ); } /** Converts a ACPI device path structure to its string representative. @param Str The string representative of input device. @param DevPath The input device path structure. @param DisplayOnly If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used. @param AllowShortcuts If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable. 
/**
  Converts an ACPI device path structure to its string representative.

  @param Str             The string representative of input device.
  @param DevPath         The input device path structure.
  @param DisplayOnly     If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used.
  @param AllowShortcuts  If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable.

**/
static VOID
DevPathToTextAcpi (
  IN OUT POOL_PRINT *Str,
  IN VOID *DevPath,
  IN BOOLEAN DisplayOnly,
  IN BOOLEAN AllowShortcuts
  )
{
  ACPI_HID_DEVICE_PATH *Acpi;

  Acpi = DevPath;
  if ((Acpi->HID & PNP_EISA_ID_MASK) == PNP_EISA_ID_CONST) {
    switch (EISA_ID_TO_NUM (Acpi->HID)) {
    case 0x0a03:
      UefiDevicePathLibCatPrint (Str, "PciRoot(0x%x)", Acpi->UID);
      break;
    case 0x0a08:
      UefiDevicePathLibCatPrint (Str, "PcieRoot(0x%x)", Acpi->UID);
      break;
    case 0x0604:
      UefiDevicePathLibCatPrint (Str, "Floppy(0x%x)", Acpi->UID);
      break;
    case 0x0301:
      UefiDevicePathLibCatPrint (Str, "Keyboard(0x%x)", Acpi->UID);
      break;
    case 0x0501:
      UefiDevicePathLibCatPrint (Str, "Serial(0x%x)", Acpi->UID);
      break;
    case 0x0401:
      UefiDevicePathLibCatPrint (Str, "ParallelPort(0x%x)", Acpi->UID);
      break;
    default:
      UefiDevicePathLibCatPrint (Str, "Acpi(PNP%04x,0x%x)", EISA_ID_TO_NUM (Acpi->HID), Acpi->UID);
      break;
    }
  } else {
    UefiDevicePathLibCatPrint (Str, "Acpi(0x%08x,0x%x)", Acpi->HID, Acpi->UID);
  }
}

/**
  Converts an ACPI extended HID device path structure to its string representative.

  @param Str             The string representative of input device.
  @param DevPath         The input device path structure.
  @param DisplayOnly     If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used.
  @param AllowShortcuts  If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable.

**/
static VOID
DevPathToTextAcpiEx (
  IN OUT POOL_PRINT *Str,
  IN VOID *DevPath,
  IN BOOLEAN DisplayOnly,
  IN BOOLEAN AllowShortcuts
  )
{
  ACPI_EXTENDED_HID_DEVICE_PATH *AcpiEx;
  CHAR8 *HIDStr;
  CHAR8 *UIDStr;
  CHAR8 *CIDStr;
  char HIDText[11];
  char CIDText[11];

  AcpiEx = DevPath;
  HIDStr = (CHAR8 *) (((UINT8 *) AcpiEx) + sizeof (ACPI_EXTENDED_HID_DEVICE_PATH));
  UIDStr = HIDStr + AsciiStrLen (HIDStr) + 1;
  CIDStr = UIDStr + AsciiStrLen (UIDStr) + 1;

  //
  // Converts EISA identification to string.
  //
  UnicodeSPrint (
    HIDText,
    sizeof (HIDText),
    "%c%c%c%04X",
    ((AcpiEx->HID >> 10) & 0x1f) + 'A' - 1,
    ((AcpiEx->HID >> 5) & 0x1f) + 'A' - 1,
    ((AcpiEx->HID >> 0) & 0x1f) + 'A' - 1,
    (AcpiEx->HID >> 16) & 0xFFFF
    );
  UnicodeSPrint (
    CIDText,
    sizeof (CIDText),
    "%c%c%c%04X",
    ((AcpiEx->CID >> 10) & 0x1f) + 'A' - 1,
    ((AcpiEx->CID >> 5) & 0x1f) + 'A' - 1,
    ((AcpiEx->CID >> 0) & 0x1f) + 'A' - 1,
    (AcpiEx->CID >> 16) & 0xFFFF
    );

  if ((*HIDStr == '\0') && (*CIDStr == '\0') && (AcpiEx->UID == 0)) {
    //
    // use AcpiExp()
    //
    UefiDevicePathLibCatPrint (
      Str,
      "AcpiExp(%s,%s,%s)",
      HIDText,
      CIDText,
      UIDStr
      );
  } else {
    if (AllowShortcuts) {
      //
      // display only
      //
      if (AcpiEx->HID == 0) {
        UefiDevicePathLibCatPrint (Str, "AcpiEx(%s,", HIDStr);
      } else {
        UefiDevicePathLibCatPrint (Str, "AcpiEx(%s,", HIDText);
      }

      if (AcpiEx->UID == 0) {
        UefiDevicePathLibCatPrint (Str, "%s,", UIDStr);
      } else {
        UefiDevicePathLibCatPrint (Str, "0x%x,", AcpiEx->UID);
      }

      if (AcpiEx->CID == 0) {
        UefiDevicePathLibCatPrint (Str, "%s)", CIDStr);
      } else {
        UefiDevicePathLibCatPrint (Str, "%s)", CIDText);
      }
    } else {
      UefiDevicePathLibCatPrint (
        Str,
        "AcpiEx(%s,%s,0x%x,%s,%s,%s)",
        HIDText,
        CIDText,
        AcpiEx->UID,
        HIDStr,
        CIDStr,
        UIDStr
        );
    }
  }
}
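/*
 * Illustration (not part of this commit): decoding a compressed EISA ID
 * as done above. The low 16 bits hold three 5-bit letters ('A' encoded
 * as 1), the high 16 bits the product number, so 0x0a0341d0 (PNP0A03 in
 * this layout) renders the string that feeds the PciRoot() case.
 * Standalone sketch; the helper name is made up.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

static void
print_eisa_id(uint32_t hid)		/* e.g. 0x0a0341d0 */
{
	printf("%c%c%c%04X\n",
	    (int)(((hid >> 10) & 0x1f) + 'A' - 1),
	    (int)(((hid >> 5) & 0x1f) + 'A' - 1),
	    (int)(((hid >> 0) & 0x1f) + 'A' - 1),
	    (hid >> 16) & 0xffff);	/* prints "PNP0A03" */
}
#endif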
/**
  Converts an ACPI address device path structure to its string representative.

  @param Str             The string representative of input device.
  @param DevPath         The input device path structure.
  @param DisplayOnly     If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used.
  @param AllowShortcuts  If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable.

**/
static VOID
DevPathToTextAcpiAdr (
  IN OUT POOL_PRINT *Str,
  IN VOID *DevPath,
  IN BOOLEAN DisplayOnly,
  IN BOOLEAN AllowShortcuts
  )
{
  ACPI_ADR_DEVICE_PATH *AcpiAdr;
  UINT32 *Addr;
  UINT16 Index;
  UINT16 Length;
  UINT16 AdditionalAdrCount;

  AcpiAdr = DevPath;
  Length = (UINT16) DevicePathNodeLength ((EFI_DEVICE_PATH_PROTOCOL *) AcpiAdr);
  AdditionalAdrCount = (UINT16) ((Length - 8) / 4);

  UefiDevicePathLibCatPrint (Str, "AcpiAdr(0x%x", AcpiAdr->ADR);
  Addr = &AcpiAdr->ADR + 1;
  for (Index = 0; Index < AdditionalAdrCount; Index++) {
    UefiDevicePathLibCatPrint (Str, ",0x%x", Addr[Index]);
  }

  UefiDevicePathLibCatPrint (Str, ")");
}

/**
  Converts an ATAPI device path structure to its string representative.

  @param Str             The string representative of input device.
  @param DevPath         The input device path structure.
  @param DisplayOnly     If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used.
  @param AllowShortcuts  If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable.

**/
static VOID
DevPathToTextAtapi (
  IN OUT POOL_PRINT *Str,
  IN VOID *DevPath,
  IN BOOLEAN DisplayOnly,
  IN BOOLEAN AllowShortcuts
  )
{
  ATAPI_DEVICE_PATH *Atapi;

  Atapi = DevPath;
  if (DisplayOnly) {
    UefiDevicePathLibCatPrint (Str, "Ata(0x%x)", Atapi->Lun);
  } else {
    UefiDevicePathLibCatPrint (
      Str,
      "Ata(%s,%s,0x%x)",
      (Atapi->PrimarySecondary == 1) ? "Secondary" : "Primary",
      (Atapi->SlaveMaster == 1) ? "Slave" : "Master",
      Atapi->Lun
      );
  }
}

/**
  Converts a SCSI device path structure to its string representative.

  @param Str             The string representative of input device.
  @param DevPath         The input device path structure.
  @param DisplayOnly     If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used.
  @param AllowShortcuts  If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable.

**/
static VOID
DevPathToTextScsi (
  IN OUT POOL_PRINT *Str,
  IN VOID *DevPath,
  IN BOOLEAN DisplayOnly,
  IN BOOLEAN AllowShortcuts
  )
{
  SCSI_DEVICE_PATH *Scsi;

  Scsi = DevPath;
  UefiDevicePathLibCatPrint (Str, "Scsi(0x%x,0x%x)", Scsi->Pun, Scsi->Lun);
}

/**
  Converts a Fibre device path structure to its string representative.

  @param Str             The string representative of input device.
  @param DevPath         The input device path structure.
  @param DisplayOnly     If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used.
  @param AllowShortcuts  If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable.

**/
static VOID
DevPathToTextFibre (
  IN OUT POOL_PRINT *Str,
  IN VOID *DevPath,
  IN BOOLEAN DisplayOnly,
  IN BOOLEAN AllowShortcuts
  )
{
  FIBRECHANNEL_DEVICE_PATH *Fibre;

  Fibre = DevPath;
  UefiDevicePathLibCatPrint (Str, "Fibre(0x%lx,0x%lx)", Fibre->WWN, Fibre->Lun);
}
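/*
 * Illustration (not part of this commit): the node-length arithmetic in
 * DevPathToTextAcpiAdr. An AcpiAdr node is a 4-byte device path header
 * plus one mandatory 4-byte _ADR (8 bytes total); every additional 4
 * bytes of node length is one more _ADR value. Standalone sketch; the
 * helper name is made up.
 */
#if 0
#include <stdint.h>

static unsigned
additional_adr_count(uint16_t node_length)
{
	/* e.g. a 16-byte node carries the first _ADR plus 2 more */
	return ((node_length - 8) / 4);
}
#endif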
/**
  Converts a FibreEx device path structure to its string representative.

  @param Str             The string representative of input device.
  @param DevPath         The input device path structure.
  @param DisplayOnly     If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used.
  @param AllowShortcuts  If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable.

**/
static VOID
DevPathToTextFibreEx (
  IN OUT POOL_PRINT *Str,
  IN VOID *DevPath,
  IN BOOLEAN DisplayOnly,
  IN BOOLEAN AllowShortcuts
  )
{
  FIBRECHANNELEX_DEVICE_PATH *FibreEx;
  UINTN Index;

  FibreEx = DevPath;
  UefiDevicePathLibCatPrint (Str, "FibreEx(0x");
  for (Index = 0; Index < sizeof (FibreEx->WWN) / sizeof (FibreEx->WWN[0]); Index++) {
    UefiDevicePathLibCatPrint (Str, "%02x", FibreEx->WWN[Index]);
  }

  UefiDevicePathLibCatPrint (Str, ",0x");
  for (Index = 0; Index < sizeof (FibreEx->Lun) / sizeof (FibreEx->Lun[0]); Index++) {
    UefiDevicePathLibCatPrint (Str, "%02x", FibreEx->Lun[Index]);
  }

  UefiDevicePathLibCatPrint (Str, ")");
}

/**
  Converts a SAS Ex device path structure to its string representative.

  @param Str             The string representative of input device.
  @param DevPath         The input device path structure.
  @param DisplayOnly     If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used.
  @param AllowShortcuts  If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable.

**/
static VOID
DevPathToTextSasEx (
  IN OUT POOL_PRINT *Str,
  IN VOID *DevPath,
  IN BOOLEAN DisplayOnly,
  IN BOOLEAN AllowShortcuts
  )
{
  SASEX_DEVICE_PATH *SasEx;
  UINTN Index;

  SasEx = DevPath;
  UefiDevicePathLibCatPrint (Str, "SasEx(0x");

  for (Index = 0; Index < sizeof (SasEx->SasAddress) / sizeof (SasEx->SasAddress[0]); Index++) {
    UefiDevicePathLibCatPrint (Str, "%02x", SasEx->SasAddress[Index]);
  }

  UefiDevicePathLibCatPrint (Str, ",0x");
  for (Index = 0; Index < sizeof (SasEx->Lun) / sizeof (SasEx->Lun[0]); Index++) {
    UefiDevicePathLibCatPrint (Str, "%02x", SasEx->Lun[Index]);
  }

  UefiDevicePathLibCatPrint (Str, ",0x%x,", SasEx->RelativeTargetPort);

  if (((SasEx->DeviceTopology & 0x0f) == 0) && ((SasEx->DeviceTopology & BIT7) == 0)) {
    UefiDevicePathLibCatPrint (Str, "NoTopology,0,0,0");
  } else if (((SasEx->DeviceTopology & 0x0f) <= 2) && ((SasEx->DeviceTopology & BIT7) == 0)) {
    UefiDevicePathLibCatPrint (
      Str,
      "%s,%s,%s,",
      ((SasEx->DeviceTopology & BIT4) != 0) ? "SATA" : "SAS",
      ((SasEx->DeviceTopology & BIT5) != 0) ? "External" : "Internal",
      ((SasEx->DeviceTopology & BIT6) != 0) ? "Expanded" : "Direct"
      );
    if ((SasEx->DeviceTopology & 0x0f) == 1) {
      UefiDevicePathLibCatPrint (Str, "0");
    } else {
      //
      // Value 0x0 thru 0xFF -> Drive 1 thru Drive 256
      //
      UefiDevicePathLibCatPrint (Str, "0x%x", ((SasEx->DeviceTopology >> 8) & 0xff) + 1);
    }
  } else {
    UefiDevicePathLibCatPrint (Str, "0x%x,0,0,0", SasEx->DeviceTopology);
  }

  UefiDevicePathLibCatPrint (Str, ")");
  return ;
}
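/*
 * Illustration (not part of this commit): the sizeof-based byte dump used
 * for the WWN and LUN arrays above, standalone. The UEFI text form prints
 * the 8 bytes in array order with no separators. The helper name is made
 * up.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

static void
print_wwn(const uint8_t wwn[8])
{
	size_t i;

	printf("0x");
	for (i = 0; i < sizeof(wwn[0]) * 8; i++)
		printf("%02x", wwn[i]);
	putchar('\n');
}
#endif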
/**
  Converts an NVM Express Namespace device path structure to its string representative.

  @param Str             The string representative of input device.
  @param DevPath         The input device path structure.
  @param DisplayOnly     If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used.
  @param AllowShortcuts  If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable.

**/
static VOID
DevPathToTextNVMe (
  IN OUT POOL_PRINT *Str,
  IN VOID *DevPath,
  IN BOOLEAN DisplayOnly,
  IN BOOLEAN AllowShortcuts
  )
{
  NVME_NAMESPACE_DEVICE_PATH *Nvme;
  UINT8 *Uuid;

  Nvme = DevPath;
  Uuid = (UINT8 *) &Nvme->NamespaceUuid;
  UefiDevicePathLibCatPrint (
    Str,
    "NVMe(0x%x,%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x)",
    Nvme->NamespaceId,
    Uuid[7], Uuid[6], Uuid[5], Uuid[4],
    Uuid[3], Uuid[2], Uuid[1], Uuid[0]
    );
}

/**
  Converts a UFS device path structure to its string representative.

  @param Str             The string representative of input device.
  @param DevPath         The input device path structure.
  @param DisplayOnly     If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used.
  @param AllowShortcuts  If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable.

**/
static VOID
DevPathToTextUfs (
  IN OUT POOL_PRINT *Str,
  IN VOID *DevPath,
  IN BOOLEAN DisplayOnly,
  IN BOOLEAN AllowShortcuts
  )
{
  UFS_DEVICE_PATH *Ufs;

  Ufs = DevPath;
  UefiDevicePathLibCatPrint (Str, "UFS(0x%x,0x%x)", Ufs->Pun, Ufs->Lun);
}

/**
  Converts an SD (Secure Digital) device path structure to its string representative.

  @param Str             The string representative of input device.
  @param DevPath         The input device path structure.
  @param DisplayOnly     If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used.
  @param AllowShortcuts  If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable.

**/
static VOID
DevPathToTextSd (
  IN OUT POOL_PRINT *Str,
  IN VOID *DevPath,
  IN BOOLEAN DisplayOnly,
  IN BOOLEAN AllowShortcuts
  )
{
  SD_DEVICE_PATH *Sd;

  Sd = DevPath;
  UefiDevicePathLibCatPrint (
    Str,
    "SD(0x%x)",
    Sd->SlotNumber
    );
}

/**
  Converts an eMMC (Embedded MMC) device path structure to its string representative.

  @param Str             The string representative of input device.
  @param DevPath         The input device path structure.
  @param DisplayOnly     If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used.
  @param AllowShortcuts  If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable.

**/
static VOID
DevPathToTextEmmc (
  IN OUT POOL_PRINT *Str,
  IN VOID *DevPath,
  IN BOOLEAN DisplayOnly,
  IN BOOLEAN AllowShortcuts
  )
{
  EMMC_DEVICE_PATH *Emmc;

  Emmc = DevPath;
  UefiDevicePathLibCatPrint (
    Str,
    "eMMC(0x%x)",
    Emmc->SlotNumber
    );
}
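/*
 * Illustration (not part of this commit): the NVMe namespace EUI-64 is
 * stored as a 64-bit integer, and the code above prints its bytes high to
 * low (Uuid[7]..Uuid[0]) to render the EUI-64 in its usual order. A
 * standalone equivalent, assuming a little-endian host; the helper name
 * is made up.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

static void
print_eui64(uint64_t eui)
{
	const uint8_t *b = (const uint8_t *)&eui;
	int i;

	for (i = 7; i >= 0; i--)
		printf("%02x%s", b[i], i ? "-" : "\n");
}
#endif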
/**
  Converts a 1394 device path structure to its string representative.

  @param Str             The string representative of input device.
  @param DevPath         The input device path structure.
  @param DisplayOnly     If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used.
  @param AllowShortcuts  If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable.

**/
static VOID
DevPathToText1394 (
  IN OUT POOL_PRINT *Str,
  IN VOID *DevPath,
  IN BOOLEAN DisplayOnly,
  IN BOOLEAN AllowShortcuts
  )
{
  F1394_DEVICE_PATH *F1394DevPath;

  F1394DevPath = DevPath;
  //
  // Guid has format of IEEE-EUI64
  //
  UefiDevicePathLibCatPrint (Str, "I1394(%016lx)", F1394DevPath->Guid);
}

/**
  Converts a USB device path structure to its string representative.

  @param Str             The string representative of input device.
  @param DevPath         The input device path structure.
  @param DisplayOnly     If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used.
  @param AllowShortcuts  If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable.

**/
static VOID
DevPathToTextUsb (
  IN OUT POOL_PRINT *Str,
  IN VOID *DevPath,
  IN BOOLEAN DisplayOnly,
  IN BOOLEAN AllowShortcuts
  )
{
  USB_DEVICE_PATH *Usb;

  Usb = DevPath;
  UefiDevicePathLibCatPrint (Str, "USB(0x%x,0x%x)", Usb->ParentPortNumber, Usb->InterfaceNumber);
}

/**
  Converts a USB WWID device path structure to its string representative.

  @param Str             The string representative of input device.
  @param DevPath         The input device path structure.
  @param DisplayOnly     If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used.
  @param AllowShortcuts  If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable.

**/
static VOID
DevPathToTextUsbWWID (
  IN OUT POOL_PRINT *Str,
  IN VOID *DevPath,
  IN BOOLEAN DisplayOnly,
  IN BOOLEAN AllowShortcuts
  )
{
  USB_WWID_DEVICE_PATH *UsbWWId;
  CHAR16 *SerialNumberStr;
  CHAR16 *NewStr;
  UINT16 Length;

  UsbWWId = DevPath;

  SerialNumberStr = (CHAR16 *) (UsbWWId + 1);
  Length = (UINT16) ((DevicePathNodeLength ((EFI_DEVICE_PATH_PROTOCOL *) UsbWWId) - sizeof (USB_WWID_DEVICE_PATH)) / sizeof (CHAR16));
  if (SerialNumberStr [Length - 1] != 0) {
    //
    // In case there is no NULL terminator in SerialNumber, create a new copy
    // with a NULL terminator appended.
    //
    NewStr = AllocateCopyPool ((Length + 1) * sizeof (CHAR16), SerialNumberStr);
    ASSERT (NewStr != NULL);
    NewStr [Length] = 0;
    SerialNumberStr = NewStr;
  }

  UefiDevicePathLibCatPrint (
    Str,
    "UsbWwid(0x%x,0x%x,0x%x,\"%S\")",
    UsbWWId->VendorId,
    UsbWWId->ProductId,
    UsbWWId->InterfaceNumber,
    SerialNumberStr
    );
}

/**
  Converts a Logical Unit device path structure to its string representative.

  @param Str             The string representative of input device.
  @param DevPath         The input device path structure.
  @param DisplayOnly     If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used.
  @param AllowShortcuts  If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable.

**/
static VOID
DevPathToTextLogicalUnit (
  IN OUT POOL_PRINT *Str,
  IN VOID *DevPath,
  IN BOOLEAN DisplayOnly,
  IN BOOLEAN AllowShortcuts
  )
{
  DEVICE_LOGICAL_UNIT_DEVICE_PATH *LogicalUnit;

  LogicalUnit = DevPath;
  UefiDevicePathLibCatPrint (Str, "Unit(0x%x)", LogicalUnit->Lun);
}
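/*
 * Illustration (not part of this commit): the serial-number string in a
 * UsbWwid node fills the node to its end and need not be NUL-terminated,
 * hence the copy-and-terminate dance above. The same defensive pattern
 * for a narrow string, as a standalone sketch; names are made up.
 */
#if 0
#include <stdlib.h>
#include <string.h>

static char *
terminated_copy(const char *payload, size_t len)
{
	char *copy;

	copy = malloc(len + 1);
	if (copy != NULL) {
		memcpy(copy, payload, len);
		copy[len] = '\0';	/* always safe to print afterward */
	}
	return (copy);
}
#endif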
/**
  Converts a USB class device path structure to its string representative.

  @param Str             The string representative of input device.
  @param DevPath         The input device path structure.
  @param DisplayOnly     If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used.
  @param AllowShortcuts  If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable.

**/
static VOID
DevPathToTextUsbClass (
  IN OUT POOL_PRINT *Str,
  IN VOID *DevPath,
  IN BOOLEAN DisplayOnly,
  IN BOOLEAN AllowShortcuts
  )
{
  USB_CLASS_DEVICE_PATH *UsbClass;
  BOOLEAN IsKnownSubClass;

  UsbClass = DevPath;

  IsKnownSubClass = TRUE;
  switch (UsbClass->DeviceClass) {
  case USB_CLASS_AUDIO:
    UefiDevicePathLibCatPrint (Str, "UsbAudio");
    break;
  case USB_CLASS_CDCCONTROL:
    UefiDevicePathLibCatPrint (Str, "UsbCDCControl");
    break;
  case USB_CLASS_HID:
    UefiDevicePathLibCatPrint (Str, "UsbHID");
    break;
  case USB_CLASS_IMAGE:
    UefiDevicePathLibCatPrint (Str, "UsbImage");
    break;
  case USB_CLASS_PRINTER:
    UefiDevicePathLibCatPrint (Str, "UsbPrinter");
    break;
  case USB_CLASS_MASS_STORAGE:
    UefiDevicePathLibCatPrint (Str, "UsbMassStorage");
    break;
  case USB_CLASS_HUB:
    UefiDevicePathLibCatPrint (Str, "UsbHub");
    break;
  case USB_CLASS_CDCDATA:
    UefiDevicePathLibCatPrint (Str, "UsbCDCData");
    break;
  case USB_CLASS_SMART_CARD:
    UefiDevicePathLibCatPrint (Str, "UsbSmartCard");
    break;
  case USB_CLASS_VIDEO:
    UefiDevicePathLibCatPrint (Str, "UsbVideo");
    break;
  case USB_CLASS_DIAGNOSTIC:
    UefiDevicePathLibCatPrint (Str, "UsbDiagnostic");
    break;
  case USB_CLASS_WIRELESS:
    UefiDevicePathLibCatPrint (Str, "UsbWireless");
    break;
  default:
    IsKnownSubClass = FALSE;
    break;
  }

  if (IsKnownSubClass) {
    UefiDevicePathLibCatPrint (
      Str,
      "(0x%x,0x%x,0x%x,0x%x)",
      UsbClass->VendorId,
      UsbClass->ProductId,
      UsbClass->DeviceSubClass,
      UsbClass->DeviceProtocol
      );
    return;
  }

  if (UsbClass->DeviceClass == USB_CLASS_RESERVE) {
    if (UsbClass->DeviceSubClass == USB_SUBCLASS_FW_UPDATE) {
      UefiDevicePathLibCatPrint (
        Str,
        "UsbDeviceFirmwareUpdate(0x%x,0x%x,0x%x)",
        UsbClass->VendorId,
        UsbClass->ProductId,
        UsbClass->DeviceProtocol
        );
      return;
    } else if (UsbClass->DeviceSubClass == USB_SUBCLASS_IRDA_BRIDGE) {
      UefiDevicePathLibCatPrint (
        Str,
        "UsbIrdaBridge(0x%x,0x%x,0x%x)",
        UsbClass->VendorId,
        UsbClass->ProductId,
        UsbClass->DeviceProtocol
        );
      return;
    } else if (UsbClass->DeviceSubClass == USB_SUBCLASS_TEST) {
      UefiDevicePathLibCatPrint (
        Str,
        "UsbTestAndMeasurement(0x%x,0x%x,0x%x)",
        UsbClass->VendorId,
        UsbClass->ProductId,
        UsbClass->DeviceProtocol
        );
      return;
    }
  }

  UefiDevicePathLibCatPrint (
    Str,
    "UsbClass(0x%x,0x%x,0x%x,0x%x,0x%x)",
    UsbClass->VendorId,
    UsbClass->ProductId,
    UsbClass->DeviceClass,
    UsbClass->DeviceSubClass,
    UsbClass->DeviceProtocol
    );
}
/**
  Converts a SATA device path structure to its string representative.

  @param Str             The string representative of input device.
  @param DevPath         The input device path structure.
  @param DisplayOnly     If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used.
  @param AllowShortcuts  If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable.

**/
static VOID
DevPathToTextSata (
  IN OUT POOL_PRINT *Str,
  IN VOID *DevPath,
  IN BOOLEAN DisplayOnly,
  IN BOOLEAN AllowShortcuts
  )
{
  SATA_DEVICE_PATH *Sata;

  Sata = DevPath;
  UefiDevicePathLibCatPrint (
    Str,
    "Sata(0x%x,0x%x,0x%x)",
    Sata->HBAPortNumber,
    Sata->PortMultiplierPortNumber,
    Sata->Lun
    );
}

/**
  Converts an I2O device path structure to its string representative.

  @param Str             The string representative of input device.
  @param DevPath         The input device path structure.
  @param DisplayOnly     If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used.
  @param AllowShortcuts  If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable.

**/
static VOID
DevPathToTextI2O (
  IN OUT POOL_PRINT *Str,
  IN VOID *DevPath,
  IN BOOLEAN DisplayOnly,
  IN BOOLEAN AllowShortcuts
  )
{
  I2O_DEVICE_PATH *I2ODevPath;

  I2ODevPath = DevPath;
  UefiDevicePathLibCatPrint (Str, "I2O(0x%x)", I2ODevPath->Tid);
}

/**
  Converts a MAC address device path structure to its string representative.

  @param Str             The string representative of input device.
  @param DevPath         The input device path structure.
  @param DisplayOnly     If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used.
  @param AllowShortcuts  If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable.

**/
static VOID
DevPathToTextMacAddr (
  IN OUT POOL_PRINT *Str,
  IN VOID *DevPath,
  IN BOOLEAN DisplayOnly,
  IN BOOLEAN AllowShortcuts
  )
{
  MAC_ADDR_DEVICE_PATH *MacDevPath;
  UINTN HwAddressSize;
  UINTN Index;

  MacDevPath = DevPath;

  HwAddressSize = sizeof (EFI_MAC_ADDRESS);
  if (MacDevPath->IfType == 0x01 || MacDevPath->IfType == 0x00) {
    HwAddressSize = 6;
  }

  UefiDevicePathLibCatPrint (Str, "MAC(");

  for (Index = 0; Index < HwAddressSize; Index++) {
    UefiDevicePathLibCatPrint (Str, "%02x", MacDevPath->MacAddress.Addr[Index]);
  }

  UefiDevicePathLibCatPrint (Str, ",0x%x)", MacDevPath->IfType);
}

/**
  Converts network protocol string to its text representation.

  @param Str       The string representative of input device.
  @param Protocol  The network protocol ID.

**/
static VOID
CatNetworkProtocol (
  IN OUT POOL_PRINT *Str,
  IN UINT16 Protocol
  )
{
  if (Protocol == RFC_1700_TCP_PROTOCOL) {
    UefiDevicePathLibCatPrint (Str, "TCP");
  } else if (Protocol == RFC_1700_UDP_PROTOCOL) {
    UefiDevicePathLibCatPrint (Str, "UDP");
  } else {
    UefiDevicePathLibCatPrint (Str, "0x%x", Protocol);
  }
}

/**
  Converts IP v4 address to its text representation.

  @param Str      The string representative of input device.
  @param Address  The IP v4 address.

**/
static VOID
CatIPv4Address (
  IN OUT POOL_PRINT *Str,
  IN EFI_IPv4_ADDRESS *Address
  )
{
  UefiDevicePathLibCatPrint (Str, "%d.%d.%d.%d", Address->Addr[0], Address->Addr[1], Address->Addr[2], Address->Addr[3]);
}

/**
  Converts IP v6 address to its text representation.

  @param Str      The string representative of input device.
  @param Address  The IP v6 address.

**/
static VOID
CatIPv6Address (
  IN OUT POOL_PRINT *Str,
  IN EFI_IPv6_ADDRESS *Address
  )
{
  UefiDevicePathLibCatPrint (
    Str,
    "%02x%02x:%02x%02x:%02x%02x:%02x%02x:%02x%02x:%02x%02x:%02x%02x:%02x%02x",
    Address->Addr[0], Address->Addr[1],
    Address->Addr[2], Address->Addr[3],
    Address->Addr[4], Address->Addr[5],
    Address->Addr[6], Address->Addr[7],
    Address->Addr[8], Address->Addr[9],
    Address->Addr[10], Address->Addr[11],
    Address->Addr[12], Address->Addr[13],
    Address->Addr[14], Address->Addr[15]
    );
}

/**
  Converts an IPv4 device path structure to its string representative.

  @param Str             The string representative of input device.
  @param DevPath         The input device path structure.
  @param DisplayOnly     If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used.
  @param AllowShortcuts  If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable.

**/
static VOID
DevPathToTextIPv4 (
  IN OUT POOL_PRINT *Str,
  IN VOID *DevPath,
  IN BOOLEAN DisplayOnly,
  IN BOOLEAN AllowShortcuts
  )
{
  IPv4_DEVICE_PATH *IPDevPath;

  IPDevPath = DevPath;
  UefiDevicePathLibCatPrint (Str, "IPv4(");
  CatIPv4Address (Str, &IPDevPath->RemoteIpAddress);

  if (DisplayOnly) {
    UefiDevicePathLibCatPrint (Str, ")");
    return ;
  }

  UefiDevicePathLibCatPrint (Str, ",");
  CatNetworkProtocol (Str, IPDevPath->Protocol);

  UefiDevicePathLibCatPrint (Str, ",%s,", IPDevPath->StaticIpAddress ? "Static" : "DHCP");
  CatIPv4Address (Str, &IPDevPath->LocalIpAddress);
  if (DevicePathNodeLength (IPDevPath) == sizeof (IPv4_DEVICE_PATH)) {
    UefiDevicePathLibCatPrint (Str, ",");
    CatIPv4Address (Str, &IPDevPath->GatewayIpAddress);
    UefiDevicePathLibCatPrint (Str, ",");
    CatIPv4Address (Str, &IPDevPath->SubnetMask);
  }

  UefiDevicePathLibCatPrint (Str, ")");
}

/**
  Converts an IPv6 device path structure to its string representative.

  @param Str             The string representative of input device.
  @param DevPath         The input device path structure.
  @param DisplayOnly     If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used.
  @param AllowShortcuts  If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable.

**/
static VOID
DevPathToTextIPv6 (
  IN OUT POOL_PRINT *Str,
  IN VOID *DevPath,
  IN BOOLEAN DisplayOnly,
  IN BOOLEAN AllowShortcuts
  )
{
  IPv6_DEVICE_PATH *IPDevPath;

  IPDevPath = DevPath;
  UefiDevicePathLibCatPrint (Str, "IPv6(");
  CatIPv6Address (Str, &IPDevPath->RemoteIpAddress);
  if (DisplayOnly) {
    UefiDevicePathLibCatPrint (Str, ")");
    return ;
  }

  UefiDevicePathLibCatPrint (Str, ",");
  CatNetworkProtocol (Str, IPDevPath->Protocol);

  switch (IPDevPath->IpAddressOrigin) {
  case 0:
    UefiDevicePathLibCatPrint (Str, ",Static,");
    break;
  case 1:
    UefiDevicePathLibCatPrint (Str, ",StatelessAutoConfigure,");
    break;
  default:
    UefiDevicePathLibCatPrint (Str, ",StatefulAutoConfigure,");
    break;
  }

  CatIPv6Address (Str, &IPDevPath->LocalIpAddress);

  if (DevicePathNodeLength (IPDevPath) == sizeof (IPv6_DEVICE_PATH)) {
    UefiDevicePathLibCatPrint (Str, ",0x%x,", IPDevPath->PrefixLength);
    CatIPv6Address (Str, &IPDevPath->GatewayIpAddress);
  }

  UefiDevicePathLibCatPrint (Str, ")");
}
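/*
 * Illustration (not part of this commit): older IPv4 nodes predate the
 * gateway and subnet-mask fields, so the code above only prints them when
 * the node is the full modern size. A standalone sketch of the same
 * check, assuming the UEFI spec sizes (19 bytes without the extra fields,
 * 27 bytes with them); the helper name is made up.
 */
#if 0
#include <stdint.h>

static int
ipv4_node_has_gateway(uint16_t node_length)
{
	return (node_length == 27);	/* 19-byte nodes lack gateway/mask */
}
#endif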
/**
  Converts an InfiniBand device path structure to its string representative.

  @param Str             The string representative of input device.
  @param DevPath         The input device path structure.
  @param DisplayOnly     If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used.
  @param AllowShortcuts  If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable.

**/
static VOID
DevPathToTextInfiniBand (
  IN OUT POOL_PRINT *Str,
  IN VOID *DevPath,
  IN BOOLEAN DisplayOnly,
  IN BOOLEAN AllowShortcuts
  )
{
  INFINIBAND_DEVICE_PATH *InfiniBand;

  InfiniBand = DevPath;
  UefiDevicePathLibCatPrint (
    Str,
    "Infiniband(0x%x,%36s,0x%lx,0x%lx,0x%lx)",
    InfiniBand->ResourceFlags,
    G(InfiniBand->PortGid),
    InfiniBand->ServiceId,
    InfiniBand->TargetPortId,
    InfiniBand->DeviceId
    );
}

/**
  Converts a UART device path structure to its string representative.

  @param Str             The string representative of input device.
  @param DevPath         The input device path structure.
  @param DisplayOnly     If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used.
  @param AllowShortcuts  If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable.

**/
static VOID
DevPathToTextUart (
  IN OUT POOL_PRINT *Str,
  IN VOID *DevPath,
  IN BOOLEAN DisplayOnly,
  IN BOOLEAN AllowShortcuts
  )
{
  UART_DEVICE_PATH *Uart;
  CHAR8 Parity;

  Uart = DevPath;
  switch (Uart->Parity) {
  case 0:
    Parity = 'D';
    break;
  case 1:
    Parity = 'N';
    break;
  case 2:
    Parity = 'E';
    break;
  case 3:
    Parity = 'O';
    break;
  case 4:
    Parity = 'M';
    break;
  case 5:
    Parity = 'S';
    break;
  default:
    Parity = 'x';
    break;
  }

  if (Uart->BaudRate == 0) {
    UefiDevicePathLibCatPrint (Str, "Uart(DEFAULT,");
  } else {
    UefiDevicePathLibCatPrint (Str, "Uart(%ld,", Uart->BaudRate);
  }

  if (Uart->DataBits == 0) {
    UefiDevicePathLibCatPrint (Str, "DEFAULT,");
  } else {
    UefiDevicePathLibCatPrint (Str, "%d,", Uart->DataBits);
  }

  UefiDevicePathLibCatPrint (Str, "%c,", Parity);

  switch (Uart->StopBits) {
  case 0:
    UefiDevicePathLibCatPrint (Str, "D)");
    break;
  case 1:
    UefiDevicePathLibCatPrint (Str, "1)");
    break;
  case 2:
    UefiDevicePathLibCatPrint (Str, "1.5)");
    break;
  case 3:
    UefiDevicePathLibCatPrint (Str, "2)");
    break;
  default:
    UefiDevicePathLibCatPrint (Str, "x)");
    break;
  }
}

/**
  Converts an iSCSI device path structure to its string representative.

  @param Str             The string representative of input device.
  @param DevPath         The input device path structure.
  @param DisplayOnly     If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used.
  @param AllowShortcuts  If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable.

**/
static VOID
DevPathToTextiSCSI (
  IN OUT POOL_PRINT *Str,
  IN VOID *DevPath,
  IN BOOLEAN DisplayOnly,
  IN BOOLEAN AllowShortcuts
  )
{
  ISCSI_DEVICE_PATH_WITH_NAME *ISCSIDevPath;
  UINT16 Options;

  ISCSIDevPath = DevPath;
  UefiDevicePathLibCatPrint (
    Str,
    "iSCSI(%s,0x%x,0x%lx,",
    ISCSIDevPath->TargetName,
    ISCSIDevPath->TargetPortalGroupTag,
    ISCSIDevPath->Lun
    );

  Options = ISCSIDevPath->LoginOption;
  UefiDevicePathLibCatPrint (Str, "%s,", (((Options >> 1) & 0x0001) != 0) ? "CRC32C" : "None");
"CRC32C" : "None"); if (((Options >> 11) & 0x0001) != 0) { UefiDevicePathLibCatPrint (Str, "%s,", "None"); } else if (((Options >> 12) & 0x0001) != 0) { UefiDevicePathLibCatPrint (Str, "%s,", "CHAP_UNI"); } else { UefiDevicePathLibCatPrint (Str, "%s,", "CHAP_BI"); } UefiDevicePathLibCatPrint (Str, "%s)", (ISCSIDevPath->NetworkProtocol == 0) ? "TCP" : "reserved"); } /** Converts a VLAN device path structure to its string representative. @param Str The string representative of input device. @param DevPath The input device path structure. @param DisplayOnly If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used. @param AllowShortcuts If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable. **/ static VOID DevPathToTextVlan ( IN OUT POOL_PRINT *Str, IN VOID *DevPath, IN BOOLEAN DisplayOnly, IN BOOLEAN AllowShortcuts ) { VLAN_DEVICE_PATH *Vlan; Vlan = DevPath; UefiDevicePathLibCatPrint (Str, "Vlan(%d)", Vlan->VlanId); } /** Converts a Bluetooth device path structure to its string representative. @param Str The string representative of input device. @param DevPath The input device path structure. @param DisplayOnly If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used. @param AllowShortcuts If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable. **/ static VOID DevPathToTextBluetooth ( IN OUT POOL_PRINT *Str, IN VOID *DevPath, IN BOOLEAN DisplayOnly, IN BOOLEAN AllowShortcuts ) { BLUETOOTH_DEVICE_PATH *Bluetooth; Bluetooth = DevPath; UefiDevicePathLibCatPrint ( Str, "Bluetooth(%02x%02x%02x%02x%02x%02x)", Bluetooth->BD_ADDR.Address[5], Bluetooth->BD_ADDR.Address[4], Bluetooth->BD_ADDR.Address[3], Bluetooth->BD_ADDR.Address[2], Bluetooth->BD_ADDR.Address[1], Bluetooth->BD_ADDR.Address[0] ); } /** Converts a Wi-Fi device path structure to its string representative. @param Str The string representative of input device. @param DevPath The input device path structure. @param DisplayOnly If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used. @param AllowShortcuts If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable. **/ static VOID DevPathToTextWiFi ( IN OUT POOL_PRINT *Str, IN VOID *DevPath, IN BOOLEAN DisplayOnly, IN BOOLEAN AllowShortcuts ) { WIFI_DEVICE_PATH *WiFi; UINT8 SSId[33]; WiFi = DevPath; SSId[32] = '\0'; CopyMem (SSId, WiFi->SSId, 32); UefiDevicePathLibCatPrint (Str, "Wi-Fi(%s)", SSId); } /** Converts a URI device path structure to its string representative. @param Str The string representative of input device. @param DevPath The input device path structure. @param DisplayOnly If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used. @param AllowShortcuts If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable. 
/**
  Converts a URI device path structure to its string representative.

  @param Str             The string representative of input device.
  @param DevPath         The input device path structure.
  @param DisplayOnly     If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used.
  @param AllowShortcuts  If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable.

**/
static VOID
DevPathToTextUri (
  IN OUT POOL_PRINT *Str,
  IN VOID *DevPath,
  IN BOOLEAN DisplayOnly,
  IN BOOLEAN AllowShortcuts
  )
{
  URI_DEVICE_PATH *Uri;
  UINTN UriLength;
  CHAR8 *UriStr;

  //
  // Uri in the device path may not be null terminated.
  //
  Uri = DevPath;
  UriLength = DevicePathNodeLength (Uri) - sizeof (URI_DEVICE_PATH);
  UriStr = AllocatePool (UriLength + 1);
  ASSERT (UriStr != NULL);

  CopyMem (UriStr, Uri->Uri, UriLength);
  UriStr[UriLength] = '\0';
  UefiDevicePathLibCatPrint (Str, "Uri(%s)", UriStr);
  FreePool (UriStr);
}

/**
  Converts a Hard drive device path structure to its string representative.

  @param Str             The string representative of input device.
  @param DevPath         The input device path structure.
  @param DisplayOnly     If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used.
  @param AllowShortcuts  If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable.

**/
static VOID
DevPathToTextHardDrive (
  IN OUT POOL_PRINT *Str,
  IN VOID *DevPath,
  IN BOOLEAN DisplayOnly,
  IN BOOLEAN AllowShortcuts
  )
{
  HARDDRIVE_DEVICE_PATH *Hd;

  Hd = DevPath;
  switch (Hd->SignatureType) {
  case SIGNATURE_TYPE_MBR:
    UefiDevicePathLibCatPrint (
      Str,
      "HD(%d,%s,0x%08x,",
      Hd->PartitionNumber,
      "MBR",
//      *((UINT32 *) (&(Hd->Signature[0])))
      le32dec(&(Hd->Signature[0]))
      );
    break;
  case SIGNATURE_TYPE_GUID:
    UefiDevicePathLibCatPrint (
      Str,
      "HD(%d,%s,%36s,",
      Hd->PartitionNumber,
      "GPT",
      G(&(Hd->Signature[0]))
      );
    break;
  default:
    UefiDevicePathLibCatPrint (
      Str,
      "HD(%d,%d,0,",
      Hd->PartitionNumber,
      Hd->SignatureType
      );
    break;
  }

  UefiDevicePathLibCatPrint (Str, "0x%lx,0x%lx)", Hd->PartitionStart, Hd->PartitionSize);
}

/**
  Converts a CDROM device path structure to its string representative.

  @param Str             The string representative of input device.
  @param DevPath         The input device path structure.
  @param DisplayOnly     If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used.
  @param AllowShortcuts  If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable.

**/
static VOID
DevPathToTextCDROM (
  IN OUT POOL_PRINT *Str,
  IN VOID *DevPath,
  IN BOOLEAN DisplayOnly,
  IN BOOLEAN AllowShortcuts
  )
{
  CDROM_DEVICE_PATH *Cd;

  Cd = DevPath;
  if (DisplayOnly) {
    UefiDevicePathLibCatPrint (Str, "CDROM(0x%x)", Cd->BootEntry);
    return ;
  }

  UefiDevicePathLibCatPrint (Str, "CDROM(0x%x,0x%lx,0x%lx)", Cd->BootEntry, Cd->PartitionStart, Cd->PartitionSize);
}
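/*
 * Illustration (not part of this commit): why le32dec() replaces the
 * commented-out cast in DevPathToTextHardDrive above. Signature[] is a
 * byte array with no alignment guarantee, and the on-disk MBR signature
 * is little-endian; byte-wise assembly handles both concerns on any
 * host. le32dec() comes from <sys/endian.h> on FreeBSD; the standalone
 * equivalent below is a sketch.
 */
#if 0
#include <stdint.h>

static uint32_t
decode_le32(const uint8_t *p)
{
	/* safe on any alignment and any host endianness */
	return ((uint32_t)p[0] | ((uint32_t)p[1] << 8) |
	    ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24));
}
#endif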
/**
  Converts a File device path structure to its string representative.

  @param Str             The string representative of input device.
  @param DevPath         The input device path structure.
  @param DisplayOnly     If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used.
  @param AllowShortcuts  If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable.

**/
static VOID
DevPathToTextFilePath (
  IN OUT POOL_PRINT *Str,
  IN VOID *DevPath,
  IN BOOLEAN DisplayOnly,
  IN BOOLEAN AllowShortcuts
  )
{
  FILEPATH_DEVICE_PATH *Fp;
  char *name = NULL;

  Fp = DevPath;
  ucs2_to_utf8(Fp->PathName, &name);
  UefiDevicePathLibCatPrint (Str, "File(%s)", name);
  free(name);
}

/**
  Converts a Media protocol device path structure to its string representative.

  @param Str             The string representative of input device.
  @param DevPath         The input device path structure.
  @param DisplayOnly     If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used.
  @param AllowShortcuts  If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable.

**/
static VOID
DevPathToTextMediaProtocol (
  IN OUT POOL_PRINT *Str,
  IN VOID *DevPath,
  IN BOOLEAN DisplayOnly,
  IN BOOLEAN AllowShortcuts
  )
{
  MEDIA_PROTOCOL_DEVICE_PATH *MediaProt;

  MediaProt = DevPath;
  UefiDevicePathLibCatPrint (Str, "Media(%36s)", G(&MediaProt->Protocol));
}

/**
  Converts a Firmware Volume device path structure to its string representative.

  @param Str             The string representative of input device.
  @param DevPath         The input device path structure.
  @param DisplayOnly     If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used.
  @param AllowShortcuts  If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable.

**/
static VOID
DevPathToTextFv (
  IN OUT POOL_PRINT *Str,
  IN VOID *DevPath,
  IN BOOLEAN DisplayOnly,
  IN BOOLEAN AllowShortcuts
  )
{
  MEDIA_FW_VOL_DEVICE_PATH *Fv;

  Fv = DevPath;
  UefiDevicePathLibCatPrint (Str, "Fv(%36s)", G(&Fv->FvName));
}

/**
  Converts a Firmware Volume File device path structure to its string representative.

  @param Str             The string representative of input device.
  @param DevPath         The input device path structure.
  @param DisplayOnly     If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used.
  @param AllowShortcuts  If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable.

**/
static VOID
DevPathToTextFvFile (
  IN OUT POOL_PRINT *Str,
  IN VOID *DevPath,
  IN BOOLEAN DisplayOnly,
  IN BOOLEAN AllowShortcuts
  )
{
  MEDIA_FW_VOL_FILEPATH_DEVICE_PATH *FvFile;

  FvFile = DevPath;
  UefiDevicePathLibCatPrint (Str, "FvFile(%36s)", G(&FvFile->FvFileName));
}
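/*
 * Illustration (not part of this commit): the FilePath node stores its
 * path as UCS-2, and DevPathToTextFilePath above uses the efichar.h
 * helper ucs2_to_utf8() to obtain an allocated narrow string that the
 * caller frees. A hypothetical call site, assuming the allocate-on-call
 * behavior implied above:
 */
#if 0
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static void
show_path(const uint16_t *ucs2_path)
{
	char *utf8 = NULL;

	ucs2_to_utf8(ucs2_path, &utf8);	/* allocates the UTF-8 buffer */
	printf("File(%s)\n", utf8);
	free(utf8);
}
#endif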
/**
  Converts a Relative Offset device path structure to its string representative.

  @param Str             The string representative of input device.
  @param DevPath         The input device path structure.
  @param DisplayOnly     If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used.
  @param AllowShortcuts  If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable.

**/
static VOID
DevPathRelativeOffsetRange (
  IN OUT POOL_PRINT *Str,
  IN VOID *DevPath,
  IN BOOLEAN DisplayOnly,
  IN BOOLEAN AllowShortcuts
  )
{
  MEDIA_RELATIVE_OFFSET_RANGE_DEVICE_PATH *Offset;

  Offset = DevPath;
  UefiDevicePathLibCatPrint (
    Str,
    "Offset(0x%lx,0x%lx)",
    Offset->StartingOffset,
    Offset->EndingOffset
    );
}

/**
  Converts a Ram Disk device path structure to its string representative.

  @param Str             The string representative of input device.
  @param DevPath         The input device path structure.
  @param DisplayOnly     If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used.
  @param AllowShortcuts  If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable.

**/
static VOID
DevPathToTextRamDisk (
  IN OUT POOL_PRINT *Str,
  IN VOID *DevPath,
  IN BOOLEAN DisplayOnly,
  IN BOOLEAN AllowShortcuts
  )
{
  MEDIA_RAM_DISK_DEVICE_PATH *RamDisk;

  RamDisk = DevPath;

  if (CompareGuid (&RamDisk->TypeGuid, &gEfiVirtualDiskGuid)) {
    UefiDevicePathLibCatPrint (
      Str,
      "VirtualDisk(0x%lx,0x%lx,%d)",
      LShiftU64 ((UINT64)RamDisk->StartingAddr[1], 32) | RamDisk->StartingAddr[0],
      LShiftU64 ((UINT64)RamDisk->EndingAddr[1], 32) | RamDisk->EndingAddr[0],
      RamDisk->Instance
      );
  } else if (CompareGuid (&RamDisk->TypeGuid, &gEfiVirtualCdGuid)) {
    UefiDevicePathLibCatPrint (
      Str,
      "VirtualCD(0x%lx,0x%lx,%d)",
      LShiftU64 ((UINT64)RamDisk->StartingAddr[1], 32) | RamDisk->StartingAddr[0],
      LShiftU64 ((UINT64)RamDisk->EndingAddr[1], 32) | RamDisk->EndingAddr[0],
      RamDisk->Instance
      );
  } else if (CompareGuid (&RamDisk->TypeGuid, &gEfiPersistentVirtualDiskGuid)) {
    UefiDevicePathLibCatPrint (
      Str,
      "PersistentVirtualDisk(0x%lx,0x%lx,%d)",
      LShiftU64 ((UINT64)RamDisk->StartingAddr[1], 32) | RamDisk->StartingAddr[0],
      LShiftU64 ((UINT64)RamDisk->EndingAddr[1], 32) | RamDisk->EndingAddr[0],
      RamDisk->Instance
      );
  } else if (CompareGuid (&RamDisk->TypeGuid, &gEfiPersistentVirtualCdGuid)) {
    UefiDevicePathLibCatPrint (
      Str,
      "PersistentVirtualCD(0x%lx,0x%lx,%d)",
      LShiftU64 ((UINT64)RamDisk->StartingAddr[1], 32) | RamDisk->StartingAddr[0],
      LShiftU64 ((UINT64)RamDisk->EndingAddr[1], 32) | RamDisk->EndingAddr[0],
      RamDisk->Instance
      );
  } else {
    UefiDevicePathLibCatPrint (
      Str,
      "RamDisk(0x%lx,0x%lx,%d,%36s)",
      LShiftU64 ((UINT64)RamDisk->StartingAddr[1], 32) | RamDisk->StartingAddr[0],
      LShiftU64 ((UINT64)RamDisk->EndingAddr[1], 32) | RamDisk->EndingAddr[0],
      RamDisk->Instance,
      G(&RamDisk->TypeGuid)
      );
  }
}
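/*
 * Illustration (not part of this commit): RamDisk start/end addresses are
 * stored as two UINT32 halves, and the LShiftU64 expressions above simply
 * reassemble a 64-bit value. A standalone equivalent; the helper name is
 * made up.
 */
#if 0
#include <stdint.h>

static uint64_t
join32(uint32_t lo, uint32_t hi)
{
	return (((uint64_t)hi << 32) | lo);
}
#endif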
**/ static VOID DevPathToTextBBS ( IN OUT POOL_PRINT *Str, IN VOID *DevPath, IN BOOLEAN DisplayOnly, IN BOOLEAN AllowShortcuts ) { BBS_BBS_DEVICE_PATH *Bbs; const char *Type; Bbs = DevPath; switch (Bbs->DeviceType) { case BBS_TYPE_FLOPPY: Type = "Floppy"; break; case BBS_TYPE_HARDDRIVE: Type = "HD"; break; case BBS_TYPE_CDROM: Type = "CDROM"; break; case BBS_TYPE_PCMCIA: Type = "PCMCIA"; break; case BBS_TYPE_USB: Type = "USB"; break; case BBS_TYPE_EMBEDDED_NETWORK: Type = "Network"; break; default: Type = NULL; break; } if (Type != NULL) { UefiDevicePathLibCatPrint (Str, "BBS(%s,%s", Type, Bbs->String); } else { UefiDevicePathLibCatPrint (Str, "BBS(0x%x,%s", Bbs->DeviceType, Bbs->String); } if (DisplayOnly) { UefiDevicePathLibCatPrint (Str, ")"); return ; } UefiDevicePathLibCatPrint (Str, ",0x%x)", Bbs->StatusFlag); } /** Converts an End-of-Device-Path structure to its string representative. @param Str The string representative of input device. @param DevPath The input device path structure. @param DisplayOnly If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used. @param AllowShortcuts If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable. **/ static VOID DevPathToTextEndInstance ( IN OUT POOL_PRINT *Str, IN VOID *DevPath, IN BOOLEAN DisplayOnly, IN BOOLEAN AllowShortcuts ) { UefiDevicePathLibCatPrint (Str, ","); } GLOBAL_REMOVE_IF_UNREFERENCED const DEVICE_PATH_TO_TEXT_GENERIC_TABLE mUefiDevicePathLibToTextTableGeneric[] = { {HARDWARE_DEVICE_PATH, "HardwarePath" }, {ACPI_DEVICE_PATH, "AcpiPath" }, {MESSAGING_DEVICE_PATH, "Msg" }, {MEDIA_DEVICE_PATH, "MediaPath" }, {BBS_DEVICE_PATH, "BbsPath" }, {0, NULL} }; /** Converts an unknown device path structure to its string representative. @param Str The string representative of input device. @param DevPath The input device path structure. @param DisplayOnly If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used. @param AllowShortcuts If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable. 
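Illustrative example (a made-up node): a 6-byte node of type MESSAGING_DEVICE_PATH (3) with an unrecognized subtype 200 and payload bytes 0xAA 0xBB renders as "Msg(200,aabb)" via the table below, while a node whose type is not in the table renders in the "Path(type,subtype,...)" form instead.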
**/ static VOID DevPathToTextNodeGeneric ( IN OUT POOL_PRINT *Str, IN VOID *DevPath, IN BOOLEAN DisplayOnly, IN BOOLEAN AllowShortcuts ) { EFI_DEVICE_PATH_PROTOCOL *Node; UINTN Index; Node = DevPath; for (Index = 0; mUefiDevicePathLibToTextTableGeneric[Index].Text != NULL; Index++) { if (DevicePathType (Node) == mUefiDevicePathLibToTextTableGeneric[Index].Type) { break; } } if (mUefiDevicePathLibToTextTableGeneric[Index].Text == NULL) { // // It's a node whose type cannot be recognized // UefiDevicePathLibCatPrint (Str, "Path(%d,%d", DevicePathType (Node), DevicePathSubType (Node)); } else { // // It's a node whose type can be recognized // UefiDevicePathLibCatPrint (Str, "%s(%d", mUefiDevicePathLibToTextTableGeneric[Index].Text, DevicePathSubType (Node)); } Index = sizeof (EFI_DEVICE_PATH_PROTOCOL); if (Index < DevicePathNodeLength (Node)) { UefiDevicePathLibCatPrint (Str, ","); for (; Index < DevicePathNodeLength (Node); Index++) { UefiDevicePathLibCatPrint (Str, "%02x", ((UINT8 *) Node)[Index]); } } UefiDevicePathLibCatPrint (Str, ")"); } static const DEVICE_PATH_TO_TEXT_TABLE mUefiDevicePathLibToTextTable[] = { {HARDWARE_DEVICE_PATH, HW_PCI_DP, DevPathToTextPci }, {HARDWARE_DEVICE_PATH, HW_PCCARD_DP, DevPathToTextPccard }, {HARDWARE_DEVICE_PATH, HW_MEMMAP_DP, DevPathToTextMemMap }, {HARDWARE_DEVICE_PATH, HW_VENDOR_DP, DevPathToTextVendor }, {HARDWARE_DEVICE_PATH, HW_CONTROLLER_DP, DevPathToTextController }, {HARDWARE_DEVICE_PATH, HW_BMC_DP, DevPathToTextBmc }, {ACPI_DEVICE_PATH, ACPI_DP, DevPathToTextAcpi }, {ACPI_DEVICE_PATH, ACPI_EXTENDED_DP, DevPathToTextAcpiEx }, {ACPI_DEVICE_PATH, ACPI_ADR_DP, DevPathToTextAcpiAdr }, {MESSAGING_DEVICE_PATH, MSG_ATAPI_DP, DevPathToTextAtapi }, {MESSAGING_DEVICE_PATH, MSG_SCSI_DP, DevPathToTextScsi }, {MESSAGING_DEVICE_PATH, MSG_FIBRECHANNEL_DP, DevPathToTextFibre }, {MESSAGING_DEVICE_PATH, MSG_FIBRECHANNELEX_DP, DevPathToTextFibreEx }, {MESSAGING_DEVICE_PATH, MSG_SASEX_DP, DevPathToTextSasEx }, {MESSAGING_DEVICE_PATH, MSG_NVME_NAMESPACE_DP, DevPathToTextNVMe }, {MESSAGING_DEVICE_PATH, MSG_UFS_DP, DevPathToTextUfs }, {MESSAGING_DEVICE_PATH, MSG_SD_DP, DevPathToTextSd }, {MESSAGING_DEVICE_PATH, MSG_EMMC_DP, DevPathToTextEmmc }, {MESSAGING_DEVICE_PATH, MSG_1394_DP, DevPathToText1394 }, {MESSAGING_DEVICE_PATH, MSG_USB_DP, DevPathToTextUsb }, {MESSAGING_DEVICE_PATH, MSG_USB_WWID_DP, DevPathToTextUsbWWID }, {MESSAGING_DEVICE_PATH, MSG_DEVICE_LOGICAL_UNIT_DP, DevPathToTextLogicalUnit }, {MESSAGING_DEVICE_PATH, MSG_USB_CLASS_DP, DevPathToTextUsbClass }, {MESSAGING_DEVICE_PATH, MSG_SATA_DP, DevPathToTextSata }, {MESSAGING_DEVICE_PATH, MSG_I2O_DP, DevPathToTextI2O }, {MESSAGING_DEVICE_PATH, MSG_MAC_ADDR_DP, DevPathToTextMacAddr }, {MESSAGING_DEVICE_PATH, MSG_IPv4_DP, DevPathToTextIPv4 }, {MESSAGING_DEVICE_PATH, MSG_IPv6_DP, DevPathToTextIPv6 }, {MESSAGING_DEVICE_PATH, MSG_INFINIBAND_DP, DevPathToTextInfiniBand }, {MESSAGING_DEVICE_PATH, MSG_UART_DP, DevPathToTextUart }, {MESSAGING_DEVICE_PATH, MSG_VENDOR_DP, DevPathToTextVendor }, {MESSAGING_DEVICE_PATH, MSG_ISCSI_DP, DevPathToTextiSCSI }, {MESSAGING_DEVICE_PATH, MSG_VLAN_DP, DevPathToTextVlan }, {MESSAGING_DEVICE_PATH, MSG_URI_DP, DevPathToTextUri }, {MESSAGING_DEVICE_PATH, MSG_BLUETOOTH_DP, DevPathToTextBluetooth }, {MESSAGING_DEVICE_PATH, MSG_WIFI_DP, DevPathToTextWiFi }, {MEDIA_DEVICE_PATH, MEDIA_HARDDRIVE_DP, DevPathToTextHardDrive }, {MEDIA_DEVICE_PATH, MEDIA_CDROM_DP, DevPathToTextCDROM }, {MEDIA_DEVICE_PATH, MEDIA_VENDOR_DP, DevPathToTextVendor }, {MEDIA_DEVICE_PATH, MEDIA_PROTOCOL_DP, 
DevPathToTextMediaProtocol }, {MEDIA_DEVICE_PATH, MEDIA_FILEPATH_DP, DevPathToTextFilePath }, {MEDIA_DEVICE_PATH, MEDIA_PIWG_FW_VOL_DP, DevPathToTextFv }, {MEDIA_DEVICE_PATH, MEDIA_PIWG_FW_FILE_DP, DevPathToTextFvFile }, {MEDIA_DEVICE_PATH, MEDIA_RELATIVE_OFFSET_RANGE_DP, DevPathRelativeOffsetRange }, {MEDIA_DEVICE_PATH, MEDIA_RAM_DISK_DP, DevPathToTextRamDisk }, {BBS_DEVICE_PATH, BBS_BBS_DP, DevPathToTextBBS }, {END_DEVICE_PATH_TYPE, END_INSTANCE_DEVICE_PATH_SUBTYPE, DevPathToTextEndInstance }, {0, 0, NULL} }; /** Converts a device node to its string representation. @param DeviceNode A Pointer to the device node to be converted. @param DisplayOnly If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used. @param AllowShortcuts If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable. @return A pointer to the allocated text representation of the device node or NULL if DeviceNode is NULL or there was insufficient memory. **/ static char * EFIAPI UefiDevicePathLibConvertDeviceNodeToText ( IN CONST EFI_DEVICE_PATH_PROTOCOL *DeviceNode, IN BOOLEAN DisplayOnly, IN BOOLEAN AllowShortcuts ) { POOL_PRINT Str; UINTN Index; DEVICE_PATH_TO_TEXT ToText; EFI_DEVICE_PATH_PROTOCOL *Node; if (DeviceNode == NULL) { return NULL; } ZeroMem (&Str, sizeof (Str)); // // Process the device path node // If not found, use a generic function // Node = __DECONST(EFI_DEVICE_PATH_PROTOCOL *, DeviceNode); ToText = DevPathToTextNodeGeneric; for (Index = 0; mUefiDevicePathLibToTextTable[Index].Function != NULL; Index++) { if (DevicePathType (DeviceNode) == mUefiDevicePathLibToTextTable[Index].Type && DevicePathSubType (DeviceNode) == mUefiDevicePathLibToTextTable[Index].SubType ) { ToText = mUefiDevicePathLibToTextTable[Index].Function; break; } } // // Print this node // ToText (&Str, (VOID *) Node, DisplayOnly, AllowShortcuts); ASSERT (Str.Str != NULL); return Str.Str; } /** Converts a device path to its text representation. @param DevicePath A Pointer to the device to be converted. @param DisplayOnly If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used. @param AllowShortcuts If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable. @return A pointer to the allocated text representation of the device path or NULL if DeviceNode is NULL or there was insufficient memory. 
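Nodes are joined with '/' as the path is walked, and an end-of-instance node contributes a ',' instead, so a two-instance path renders along the lines of "Fv(guid)/FvFile(guid),Fv(guid)" (GUID values elided here). A minimal caller sketch for the public wrapper further below, assuming dp and max describe a valid device path and the buffer size is arbitrary: char buf[256]; ssize_t len = efidp_format_device_path(buf, sizeof(buf), dp, max);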
**/ static char * EFIAPI UefiDevicePathLibConvertDevicePathToText ( IN CONST EFI_DEVICE_PATH_PROTOCOL *DevicePath, IN BOOLEAN DisplayOnly, IN BOOLEAN AllowShortcuts ) { POOL_PRINT Str; EFI_DEVICE_PATH_PROTOCOL *Node; EFI_DEVICE_PATH_PROTOCOL *AlignedNode; UINTN Index; DEVICE_PATH_TO_TEXT ToText; if (DevicePath == NULL) { return NULL; } ZeroMem (&Str, sizeof (Str)); // // Process each device path node // Node = __DECONST(EFI_DEVICE_PATH_PROTOCOL *, DevicePath); while (!IsDevicePathEnd (Node)) { // // Find the handler to dump this device path node // If not found, use a generic function // ToText = DevPathToTextNodeGeneric; for (Index = 0; mUefiDevicePathLibToTextTable[Index].Function != NULL; Index += 1) { if (DevicePathType (Node) == mUefiDevicePathLibToTextTable[Index].Type && DevicePathSubType (Node) == mUefiDevicePathLibToTextTable[Index].SubType ) { ToText = mUefiDevicePathLibToTextTable[Index].Function; break; } } // // Put a path separator in if needed // if ((Str.Count != 0) && (ToText != DevPathToTextEndInstance)) { if (Str.Str[Str.Count] != ',') { UefiDevicePathLibCatPrint (&Str, "/"); } } AlignedNode = AllocateCopyPool (DevicePathNodeLength (Node), Node); // // Print this node of the device path // ToText (&Str, AlignedNode, DisplayOnly, AllowShortcuts); FreePool (AlignedNode); // // Next device path node // Node = NextDevicePathNode (Node); } if (Str.Str == NULL) { return AllocateZeroPool (sizeof (CHAR16)); } else { return Str.Str; } } ssize_t efidp_format_device_path(char *buf, size_t len, const_efidp dp, ssize_t max) { char *str; ssize_t retval; /* * Basic sanity check on the device path. */ if (!IsDevicePathValid((CONST EFI_DEVICE_PATH_PROTOCOL *) dp, max)) { *buf = '\0'; return 0; } str = UefiDevicePathLibConvertDevicePathToText ( __DECONST(EFI_DEVICE_PATH_PROTOCOL *, dp), FALSE, TRUE); if (str == NULL) return -1; strlcpy(buf, str, len); retval = strlen(str); free(str); return retval; } ssize_t efidp_format_device_path_node(char *buf, size_t len, const_efidp dp) { char *str; ssize_t retval; str = UefiDevicePathLibConvertDeviceNodeToText ( __DECONST(EFI_DEVICE_PATH_PROTOCOL *, dp), FALSE, TRUE); if (str == NULL) return -1; strlcpy(buf, str, len); retval = strlen(str); free(str); return retval; } size_t efidp_size(const_efidp dp) { return GetDevicePathSize(__DECONST(EFI_DEVICE_PATH_PROTOCOL *, dp)); } char * efidp_extract_file_path(const_efidp dp) { const FILEPATH_DEVICE_PATH *fp; char *name = NULL; fp = (const void *)dp; ucs2_to_utf8(fp->PathName, &name); return name; } Index: head/lib/libefivar/efivar-dp-parse.c =================================================================== --- head/lib/libefivar/efivar-dp-parse.c (revision 343754) +++ head/lib/libefivar/efivar-dp-parse.c (revision 343755) @@ -1,3712 +1,3711 @@ /*- * Copyright (c) 2017 Netflix, Inc. - * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Routines to parse EFI_DEVICE_PATHs from the UEFI standard. Much of * this file is taken from EDK2 and rototilled. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include "efichar.h" #include "efi-osdep.h" #include "efivar-dp.h" #include "uefi-dplib.h" /* XXX STUBS -- this stuff doesn't work yet */ #define StrToIpv4Address(str, unk, ipv4ptr, unk2) #define StrToIpv6Address(str, unk, ipv6ptr, unk2) /* * OK. Now this is evil. Can't typedef it again. Sure beats changing them all. * We do it all as narrow characters, since wchar_t can't be used on * FreeBSD and CHAR16 strings generally aren't a good fit. Since this parsing * doesn't need Unicode for anything, this works out well. */ #define CHAR16 char /* * Taken from MdePkg/Library/UefiDevicePathLib/DevicePathFromText.c * hash a11928f3310518ab1c6fd34e8d0fdbb72de9602c 2017-Mar-01 */ /** @file DevicePathFromText protocol as defined in the UEFI 2.0 specification. Copyright (c) 2013 - 2017, Intel Corporation. All rights reserved.
This program and the accompanying materials are licensed and made available under the terms and conditions of the BSD License which accompanies this distribution. The full text of the license may be found at http://opensource.org/licenses/bsd-license.php THE PROGRAM IS DISTRIBUTED UNDER THE BSD LICENSE ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED. **/ // #include "UefiDevicePathLib.h" /** Duplicates a string. @param Src Source string. @return The duplicated string. **/ static CHAR16 * UefiDevicePathLibStrDuplicate ( IN CONST CHAR16 *Src ) { return AllocateCopyPool (StrSize (Src), Src); } /** Gets the parameter in a pair of parentheses following the given node name. For example, given the "Pci(0,1)" and NodeName "Pci", it returns "0,1". @param Str Device Path Text. @param NodeName Name of the node. @return Parameter text for the node. **/ static CHAR16 * GetParamByNodeName ( IN CHAR16 *Str, IN const CHAR16 *NodeName ) { CHAR16 *ParamStr; CHAR16 *StrPointer; UINTN NodeNameLength; UINTN ParameterLength; // // Check whether the node name matches // NodeNameLength = StrLen (NodeName); if (StrnCmp (Str, NodeName, NodeNameLength) != 0) { return NULL; } ParamStr = Str + NodeNameLength; if (!IS_LEFT_PARENTH (*ParamStr)) { return NULL; } // // Skip the found '(' and find first occurrence of ')' // ParamStr++; ParameterLength = 0; StrPointer = ParamStr; while (!IS_NULL (*StrPointer)) { if (IS_RIGHT_PARENTH (*StrPointer)) { break; } StrPointer++; ParameterLength++; } if (IS_NULL (*StrPointer)) { // // ')' not found // return NULL; } ParamStr = AllocateCopyPool ((ParameterLength + 1) * sizeof (CHAR16), ParamStr); if (ParamStr == NULL) { return NULL; } // // Terminate the parameter string // ParamStr[ParameterLength] = '\0'; return ParamStr; } /** Gets the current sub-string from a string list; before returning, the list head is moved to the next sub-string. The sub-string is separated by the specified character. For example, if the separator is ',' and the string list is "2,0,3", it returns "2" and the remaining list moves to "0,3" @param List A string list separated by the specified separator @param Separator The separator character @return A pointer to the current sub-string **/ static CHAR16 * SplitStr ( IN OUT CHAR16 **List, IN CHAR16 Separator ) { CHAR16 *Str; CHAR16 *ReturnStr; Str = *List; ReturnStr = Str; if (IS_NULL (*Str)) { return ReturnStr; } // // Find first occurrence of the separator // while (!IS_NULL (*Str)) { if (*Str == Separator) { break; } Str++; } if (*Str == Separator) { // // Find a sub-string, terminate it // *Str = '\0'; Str++; } // // Move to next sub-string // *List = Str; return ReturnStr; } /** Gets the next parameter string from the list. @param List A string list separated by the specified separator @return A pointer to the current sub-string **/ static CHAR16 * GetNextParamStr ( IN OUT CHAR16 **List ) { // // The separator is comma // return SplitStr (List, ','); } /** Gets one device node from the entire device path text.
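For example, given "Pci(0,1)/Ata(0,0,0),Sd(0)" (a made-up path), successive calls return "Pci(0,1)" (IsInstanceEnd FALSE), "Ata(0,0,0)" (IsInstanceEnd TRUE, because a ',' outside parentheses ends the instance), and then "Sd(0)".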
@param DevicePath On input, the current Device Path node; on output, the next device path node @param IsInstanceEnd This node is the end of a device path instance @return A device node text or NULL if no more device nodes are available **/ static CHAR16 * GetNextDeviceNodeStr ( IN OUT CHAR16 **DevicePath, OUT BOOLEAN *IsInstanceEnd ) { CHAR16 *Str; CHAR16 *ReturnStr; UINTN ParenthesesStack; Str = *DevicePath; if (IS_NULL (*Str)) { return NULL; } // // Skip the leading '/', '(', ')' and ',' // while (!IS_NULL (*Str)) { if (!IS_SLASH (*Str) && !IS_COMMA (*Str) && !IS_LEFT_PARENTH (*Str) && !IS_RIGHT_PARENTH (*Str)) { break; } Str++; } ReturnStr = Str; // // Scan for the separator of this device node, '/' or ',' // ParenthesesStack = 0; while (!IS_NULL (*Str)) { if ((IS_COMMA (*Str) || IS_SLASH (*Str)) && (ParenthesesStack == 0)) { break; } if (IS_LEFT_PARENTH (*Str)) { ParenthesesStack++; } else if (IS_RIGHT_PARENTH (*Str)) { ParenthesesStack--; } Str++; } if (ParenthesesStack != 0) { // // The '(' doesn't pair with ')', invalid device path text // return NULL; } if (IS_COMMA (*Str)) { *IsInstanceEnd = TRUE; *Str = '\0'; Str++; } else { *IsInstanceEnd = FALSE; if (!IS_NULL (*Str)) { *Str = '\0'; Str++; } } *DevicePath = Str; return ReturnStr; } #ifndef __FreeBSD__ /** Return whether the integer string is a hex string. @param Str The integer string @retval TRUE Hex string @retval FALSE Decimal string **/ static BOOLEAN IsHexStr ( IN CHAR16 *Str ) { // // skip preceding white space // while ((*Str != 0) && *Str == ' ') { Str ++; } // // skip preceding zeros // while ((*Str != 0) && *Str == '0') { Str ++; } return (BOOLEAN) (*Str == 'x' || *Str == 'X'); } /** Convert integer string to uint. @param Str The integer string. If it starts with "0x" or "0X", it's hexadecimal. @return A UINTN value represented by Str **/ static UINTN Strtoi ( IN CHAR16 *Str ) { if (IsHexStr (Str)) { return StrHexToUintn (Str); } else { return StrDecimalToUintn (Str); } } /** Convert integer string to 64 bit data. @param Str The integer string. If it starts with "0x" or "0X", it's hexadecimal. @param Data A pointer to the UINT64 value represented by Str **/ static VOID Strtoi64 ( IN CHAR16 *Str, OUT UINT64 *Data ) { if (IsHexStr (Str)) { *Data = StrHexToUint64 (Str); } else { *Data = StrDecimalToUint64 (Str); } } #endif /** Converts a Unicode string to ASCII string. @param Str The equivalent Unicode string @param AsciiStr On input, it points to destination ASCII string buffer; on output, it points to the position just past the converted string's terminator **/ static VOID StrToAscii ( IN CHAR16 *Str, IN OUT CHAR8 **AsciiStr ) { CHAR8 *Dest; Dest = *AsciiStr; while (!IS_NULL (*Str)) { *(Dest++) = (CHAR8) *(Str++); } *Dest = 0; // // Return the string next to it // *AsciiStr = Dest + 1; } /** Converts a generic text device path node to device path structure. @param Type The type of the device path node. @param TextDeviceNode The input text device path node. @return A pointer to device path structure.
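Illustrative example (the numbers are arbitrary, and the "Path" keyword mirrors the generic form the formatter emits): "Path(1,5,AABB)" has its type split off first, so this routine sees subtype "5" and data "AABB" and builds a 6-byte node of type 1, subtype 5, with payload bytes 0xAA 0xBB.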
**/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextGenericPath ( IN UINT8 Type, IN CHAR16 *TextDeviceNode ) { EFI_DEVICE_PATH_PROTOCOL *Node; CHAR16 *SubtypeStr; CHAR16 *DataStr; UINTN DataLength; SubtypeStr = GetNextParamStr (&TextDeviceNode); DataStr = GetNextParamStr (&TextDeviceNode); if (DataStr == NULL) { DataLength = 0; } else { DataLength = StrLen (DataStr) / 2; } Node = CreateDeviceNode ( Type, (UINT8) Strtoi (SubtypeStr), (UINT16) (sizeof (EFI_DEVICE_PATH_PROTOCOL) + DataLength) ); StrHexToBytes (DataStr, DataLength * 2, (UINT8 *) (Node + 1), DataLength); return Node; } /** Converts a generic text device path node to device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextPath ( IN CHAR16 *TextDeviceNode ) { CHAR16 *TypeStr; TypeStr = GetNextParamStr (&TextDeviceNode); return DevPathFromTextGenericPath ((UINT8) Strtoi (TypeStr), TextDeviceNode); } /** Converts a generic hardware text device path node to Hardware device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to Hardware device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextHardwarePath ( IN CHAR16 *TextDeviceNode ) { return DevPathFromTextGenericPath (HARDWARE_DEVICE_PATH, TextDeviceNode); } /** Converts a text device path node to Hardware PCI device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to Hardware PCI device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextPci ( IN CHAR16 *TextDeviceNode ) { CHAR16 *FunctionStr; CHAR16 *DeviceStr; PCI_DEVICE_PATH *Pci; DeviceStr = GetNextParamStr (&TextDeviceNode); FunctionStr = GetNextParamStr (&TextDeviceNode); Pci = (PCI_DEVICE_PATH *) CreateDeviceNode ( HARDWARE_DEVICE_PATH, HW_PCI_DP, (UINT16) sizeof (PCI_DEVICE_PATH) ); Pci->Function = (UINT8) Strtoi (FunctionStr); Pci->Device = (UINT8) Strtoi (DeviceStr); return (EFI_DEVICE_PATH_PROTOCOL *) Pci; } /** Converts a text device path node to Hardware PC card device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to Hardware PC card device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextPcCard ( IN CHAR16 *TextDeviceNode ) { CHAR16 *FunctionNumberStr; PCCARD_DEVICE_PATH *Pccard; FunctionNumberStr = GetNextParamStr (&TextDeviceNode); Pccard = (PCCARD_DEVICE_PATH *) CreateDeviceNode ( HARDWARE_DEVICE_PATH, HW_PCCARD_DP, (UINT16) sizeof (PCCARD_DEVICE_PATH) ); Pccard->FunctionNumber = (UINT8) Strtoi (FunctionNumberStr); return (EFI_DEVICE_PATH_PROTOCOL *) Pccard; } /** Converts a text device path node to Hardware memory map device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to Hardware memory map device path structure. 
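Illustrative example (the keyword and values here are assumed from the UEFI text form): "MemoryMapped(0xB,0x80000000,0x80000FFF)" would yield MemoryType 11 (EfiMemoryMappedIO) with the given starting and ending addresses.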
**/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextMemoryMapped ( IN CHAR16 *TextDeviceNode ) { CHAR16 *MemoryTypeStr; CHAR16 *StartingAddressStr; CHAR16 *EndingAddressStr; MEMMAP_DEVICE_PATH *MemMap; MemoryTypeStr = GetNextParamStr (&TextDeviceNode); StartingAddressStr = GetNextParamStr (&TextDeviceNode); EndingAddressStr = GetNextParamStr (&TextDeviceNode); MemMap = (MEMMAP_DEVICE_PATH *) CreateDeviceNode ( HARDWARE_DEVICE_PATH, HW_MEMMAP_DP, (UINT16) sizeof (MEMMAP_DEVICE_PATH) ); MemMap->MemoryType = (UINT32) Strtoi (MemoryTypeStr); Strtoi64 (StartingAddressStr, &MemMap->StartingAddress); Strtoi64 (EndingAddressStr, &MemMap->EndingAddress); return (EFI_DEVICE_PATH_PROTOCOL *) MemMap; } /** Converts a text device path node to Vendor device path structure based on the input Type and SubType. @param TextDeviceNode The input Text device path node. @param Type The type of device path node. @param SubType The subtype of device path node. @return A pointer to the newly-created Vendor device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * ConvertFromTextVendor ( IN CHAR16 *TextDeviceNode, IN UINT8 Type, IN UINT8 SubType ) { CHAR16 *GuidStr; CHAR16 *DataStr; UINTN Length; VENDOR_DEVICE_PATH *Vendor; GuidStr = GetNextParamStr (&TextDeviceNode); DataStr = GetNextParamStr (&TextDeviceNode); Length = StrLen (DataStr); // // Two hex characters make up 1 buffer byte // Length = (Length + 1) / 2; Vendor = (VENDOR_DEVICE_PATH *) CreateDeviceNode ( Type, SubType, (UINT16) (sizeof (VENDOR_DEVICE_PATH) + Length) ); StrToGuid (GuidStr, &Vendor->Guid); StrHexToBytes (DataStr, Length * 2, (UINT8 *) (Vendor + 1), Length); return (EFI_DEVICE_PATH_PROTOCOL *) Vendor; } /** Converts a text device path node to Vendor Hardware device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created Vendor Hardware device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextVenHw ( IN CHAR16 *TextDeviceNode ) { return ConvertFromTextVendor ( TextDeviceNode, HARDWARE_DEVICE_PATH, HW_VENDOR_DP ); } /** Converts a text device path node to Hardware Controller device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created Hardware Controller device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextCtrl ( IN CHAR16 *TextDeviceNode ) { CHAR16 *ControllerStr; CONTROLLER_DEVICE_PATH *Controller; ControllerStr = GetNextParamStr (&TextDeviceNode); Controller = (CONTROLLER_DEVICE_PATH *) CreateDeviceNode ( HARDWARE_DEVICE_PATH, HW_CONTROLLER_DP, (UINT16) sizeof (CONTROLLER_DEVICE_PATH) ); Controller->ControllerNumber = (UINT32) Strtoi (ControllerStr); return (EFI_DEVICE_PATH_PROTOCOL *) Controller; } /** Converts a text device path node to BMC device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created BMC device path structure. 
**/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextBmc ( IN CHAR16 *TextDeviceNode ) { CHAR16 *InterfaceTypeStr; CHAR16 *BaseAddressStr; BMC_DEVICE_PATH *BmcDp; InterfaceTypeStr = GetNextParamStr (&TextDeviceNode); BaseAddressStr = GetNextParamStr (&TextDeviceNode); BmcDp = (BMC_DEVICE_PATH *) CreateDeviceNode ( HARDWARE_DEVICE_PATH, HW_BMC_DP, (UINT16) sizeof (BMC_DEVICE_PATH) ); BmcDp->InterfaceType = (UINT8) Strtoi (InterfaceTypeStr); WriteUnaligned64 ( (UINT64 *) (&BmcDp->BaseAddress), StrHexToUint64 (BaseAddressStr) ); return (EFI_DEVICE_PATH_PROTOCOL *) BmcDp; } /** Converts a generic ACPI text device path node to ACPI device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to ACPI device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextAcpiPath ( IN CHAR16 *TextDeviceNode ) { return DevPathFromTextGenericPath (ACPI_DEVICE_PATH, TextDeviceNode); } /** Converts a string to EisaId. @param Text The input string. @return UINT32 EISA ID. **/ static UINT32 EisaIdFromText ( IN CHAR16 *Text ) { return (((Text[0] - 'A' + 1) & 0x1f) << 10) + (((Text[1] - 'A' + 1) & 0x1f) << 5) + (((Text[2] - 'A' + 1) & 0x1f) << 0) + (UINT32) (StrHexToUintn (&Text[3]) << 16) ; } /** Converts a text device path node to ACPI HID device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created ACPI HID device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextAcpi ( IN CHAR16 *TextDeviceNode ) { CHAR16 *HIDStr; CHAR16 *UIDStr; ACPI_HID_DEVICE_PATH *Acpi; HIDStr = GetNextParamStr (&TextDeviceNode); UIDStr = GetNextParamStr (&TextDeviceNode); Acpi = (ACPI_HID_DEVICE_PATH *) CreateDeviceNode ( ACPI_DEVICE_PATH, ACPI_DP, (UINT16) sizeof (ACPI_HID_DEVICE_PATH) ); Acpi->HID = EisaIdFromText (HIDStr); Acpi->UID = (UINT32) Strtoi (UIDStr); return (EFI_DEVICE_PATH_PROTOCOL *) Acpi; } /** Converts a text device path node to ACPI HID device path structure. @param TextDeviceNode The input Text device path node. @param PnPId The input plug and play identification. @return A pointer to the newly-created ACPI HID device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * ConvertFromTextAcpi ( IN CHAR16 *TextDeviceNode, IN UINT32 PnPId ) { CHAR16 *UIDStr; ACPI_HID_DEVICE_PATH *Acpi; UIDStr = GetNextParamStr (&TextDeviceNode); Acpi = (ACPI_HID_DEVICE_PATH *) CreateDeviceNode ( ACPI_DEVICE_PATH, ACPI_DP, (UINT16) sizeof (ACPI_HID_DEVICE_PATH) ); Acpi->HID = EFI_PNP_ID (PnPId); Acpi->UID = (UINT32) Strtoi (UIDStr); return (EFI_DEVICE_PATH_PROTOCOL *) Acpi; } /** Converts a text device path node to PCI root device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created PCI root device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextPciRoot ( IN CHAR16 *TextDeviceNode ) { return ConvertFromTextAcpi (TextDeviceNode, 0x0a03); } /** Converts a text device path node to PCIE root device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created PCIE root device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextPcieRoot ( IN CHAR16 *TextDeviceNode ) { return ConvertFromTextAcpi (TextDeviceNode, 0x0a08); } /** Converts a text device path node to Floppy device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created Floppy device path structure. 
**/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextFloppy ( IN CHAR16 *TextDeviceNode ) { return ConvertFromTextAcpi (TextDeviceNode, 0x0604); } /** Converts a text device path node to Keyboard device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created Keyboard device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextKeyboard ( IN CHAR16 *TextDeviceNode ) { return ConvertFromTextAcpi (TextDeviceNode, 0x0301); } /** Converts a text device path node to Serial device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created Serial device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextSerial ( IN CHAR16 *TextDeviceNode ) { return ConvertFromTextAcpi (TextDeviceNode, 0x0501); } /** Converts a text device path node to Parallel Port device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created Parallel Port device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextParallelPort ( IN CHAR16 *TextDeviceNode ) { return ConvertFromTextAcpi (TextDeviceNode, 0x0401); } /** Converts a text device path node to ACPI extension device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created ACPI extension device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextAcpiEx ( IN CHAR16 *TextDeviceNode ) { CHAR16 *HIDStr; CHAR16 *CIDStr; CHAR16 *UIDStr; CHAR16 *HIDSTRStr; CHAR16 *CIDSTRStr; CHAR16 *UIDSTRStr; CHAR8 *AsciiStr; UINT16 Length; ACPI_EXTENDED_HID_DEVICE_PATH *AcpiEx; HIDStr = GetNextParamStr (&TextDeviceNode); CIDStr = GetNextParamStr (&TextDeviceNode); UIDStr = GetNextParamStr (&TextDeviceNode); HIDSTRStr = GetNextParamStr (&TextDeviceNode); CIDSTRStr = GetNextParamStr (&TextDeviceNode); UIDSTRStr = GetNextParamStr (&TextDeviceNode); Length = (UINT16) (sizeof (ACPI_EXTENDED_HID_DEVICE_PATH) + StrLen (HIDSTRStr) + 1); Length = (UINT16) (Length + StrLen (UIDSTRStr) + 1); Length = (UINT16) (Length + StrLen (CIDSTRStr) + 1); AcpiEx = (ACPI_EXTENDED_HID_DEVICE_PATH *) CreateDeviceNode ( ACPI_DEVICE_PATH, ACPI_EXTENDED_DP, Length ); AcpiEx->HID = EisaIdFromText (HIDStr); AcpiEx->CID = EisaIdFromText (CIDStr); AcpiEx->UID = (UINT32) Strtoi (UIDStr); AsciiStr = (CHAR8 *) ((UINT8 *)AcpiEx + sizeof (ACPI_EXTENDED_HID_DEVICE_PATH)); StrToAscii (HIDSTRStr, &AsciiStr); StrToAscii (UIDSTRStr, &AsciiStr); StrToAscii (CIDSTRStr, &AsciiStr); return (EFI_DEVICE_PATH_PROTOCOL *) AcpiEx; } /** Converts a text device path node to ACPI extension device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created ACPI extension device path structure. 
**/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextAcpiExp ( IN CHAR16 *TextDeviceNode ) { CHAR16 *HIDStr; CHAR16 *CIDStr; CHAR16 *UIDSTRStr; CHAR8 *AsciiStr; UINT16 Length; ACPI_EXTENDED_HID_DEVICE_PATH *AcpiEx; HIDStr = GetNextParamStr (&TextDeviceNode); CIDStr = GetNextParamStr (&TextDeviceNode); UIDSTRStr = GetNextParamStr (&TextDeviceNode); Length = (UINT16) (sizeof (ACPI_EXTENDED_HID_DEVICE_PATH) + StrLen (UIDSTRStr) + 3); AcpiEx = (ACPI_EXTENDED_HID_DEVICE_PATH *) CreateDeviceNode ( ACPI_DEVICE_PATH, ACPI_EXTENDED_DP, Length ); AcpiEx->HID = EisaIdFromText (HIDStr); AcpiEx->CID = EisaIdFromText (CIDStr); AcpiEx->UID = 0; AsciiStr = (CHAR8 *) ((UINT8 *)AcpiEx + sizeof (ACPI_EXTENDED_HID_DEVICE_PATH)); // // HID string is NULL // *AsciiStr = '\0'; // // Convert UID string // AsciiStr++; StrToAscii (UIDSTRStr, &AsciiStr); // // CID string is NULL // *AsciiStr = '\0'; return (EFI_DEVICE_PATH_PROTOCOL *) AcpiEx; } /** Converts a text device path node to ACPI _ADR device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created ACPI _ADR device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextAcpiAdr ( IN CHAR16 *TextDeviceNode ) { CHAR16 *DisplayDeviceStr; ACPI_ADR_DEVICE_PATH *AcpiAdr; UINTN Index; UINTN Length; AcpiAdr = (ACPI_ADR_DEVICE_PATH *) CreateDeviceNode ( ACPI_DEVICE_PATH, ACPI_ADR_DP, (UINT16) sizeof (ACPI_ADR_DEVICE_PATH) ); ASSERT (AcpiAdr != NULL); for (Index = 0; ; Index++) { DisplayDeviceStr = GetNextParamStr (&TextDeviceNode); if (IS_NULL (*DisplayDeviceStr)) { break; } if (Index > 0) { Length = DevicePathNodeLength (AcpiAdr); AcpiAdr = ReallocatePool ( Length, Length + sizeof (UINT32), AcpiAdr ); ASSERT (AcpiAdr != NULL); SetDevicePathNodeLength (AcpiAdr, Length + sizeof (UINT32)); } (&AcpiAdr->ADR)[Index] = (UINT32) Strtoi (DisplayDeviceStr); } return (EFI_DEVICE_PATH_PROTOCOL *) AcpiAdr; } /** Converts a generic messaging text device path node to messaging device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to messaging device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextMsg ( IN CHAR16 *TextDeviceNode ) { return DevPathFromTextGenericPath (MESSAGING_DEVICE_PATH, TextDeviceNode); } /** Converts a text device path node to ATAPI device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created ATAPI device path structure.
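Illustrative example: "Ata(Primary,Master,0x1)" selects PrimarySecondary 0, SlaveMaster 0, and Lun 1; raw integers are also accepted in place of the keywords and are parsed with Strtoi.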
**/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextAta ( IN CHAR16 *TextDeviceNode ) { CHAR16 *PrimarySecondaryStr; CHAR16 *SlaveMasterStr; CHAR16 *LunStr; ATAPI_DEVICE_PATH *Atapi; Atapi = (ATAPI_DEVICE_PATH *) CreateDeviceNode ( MESSAGING_DEVICE_PATH, MSG_ATAPI_DP, (UINT16) sizeof (ATAPI_DEVICE_PATH) ); PrimarySecondaryStr = GetNextParamStr (&TextDeviceNode); SlaveMasterStr = GetNextParamStr (&TextDeviceNode); LunStr = GetNextParamStr (&TextDeviceNode); if (StrCmp (PrimarySecondaryStr, "Primary") == 0) { Atapi->PrimarySecondary = 0; } else if (StrCmp (PrimarySecondaryStr, "Secondary") == 0) { Atapi->PrimarySecondary = 1; } else { Atapi->PrimarySecondary = (UINT8) Strtoi (PrimarySecondaryStr); } if (StrCmp (SlaveMasterStr, "Master") == 0) { Atapi->SlaveMaster = 0; } else if (StrCmp (SlaveMasterStr, "Slave") == 0) { Atapi->SlaveMaster = 1; } else { Atapi->SlaveMaster = (UINT8) Strtoi (SlaveMasterStr); } Atapi->Lun = (UINT16) Strtoi (LunStr); return (EFI_DEVICE_PATH_PROTOCOL *) Atapi; } /** Converts a text device path node to SCSI device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created SCSI device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextScsi ( IN CHAR16 *TextDeviceNode ) { CHAR16 *PunStr; CHAR16 *LunStr; SCSI_DEVICE_PATH *Scsi; PunStr = GetNextParamStr (&TextDeviceNode); LunStr = GetNextParamStr (&TextDeviceNode); Scsi = (SCSI_DEVICE_PATH *) CreateDeviceNode ( MESSAGING_DEVICE_PATH, MSG_SCSI_DP, (UINT16) sizeof (SCSI_DEVICE_PATH) ); Scsi->Pun = (UINT16) Strtoi (PunStr); Scsi->Lun = (UINT16) Strtoi (LunStr); return (EFI_DEVICE_PATH_PROTOCOL *) Scsi; } /** Converts a text device path node to Fibre device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created Fibre device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextFibre ( IN CHAR16 *TextDeviceNode ) { CHAR16 *WWNStr; CHAR16 *LunStr; FIBRECHANNEL_DEVICE_PATH *Fibre; WWNStr = GetNextParamStr (&TextDeviceNode); LunStr = GetNextParamStr (&TextDeviceNode); Fibre = (FIBRECHANNEL_DEVICE_PATH *) CreateDeviceNode ( MESSAGING_DEVICE_PATH, MSG_FIBRECHANNEL_DP, (UINT16) sizeof (FIBRECHANNEL_DEVICE_PATH) ); Fibre->Reserved = 0; Strtoi64 (WWNStr, &Fibre->WWN); Strtoi64 (LunStr, &Fibre->Lun); return (EFI_DEVICE_PATH_PROTOCOL *) Fibre; } /** Converts a text device path node to FibreEx device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created FibreEx device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextFibreEx ( IN CHAR16 *TextDeviceNode ) { CHAR16 *WWNStr; CHAR16 *LunStr; FIBRECHANNELEX_DEVICE_PATH *FibreEx; WWNStr = GetNextParamStr (&TextDeviceNode); LunStr = GetNextParamStr (&TextDeviceNode); FibreEx = (FIBRECHANNELEX_DEVICE_PATH *) CreateDeviceNode ( MESSAGING_DEVICE_PATH, MSG_FIBRECHANNELEX_DP, (UINT16) sizeof (FIBRECHANNELEX_DEVICE_PATH) ); FibreEx->Reserved = 0; Strtoi64 (WWNStr, (UINT64 *) (&FibreEx->WWN)); Strtoi64 (LunStr, (UINT64 *) (&FibreEx->Lun)); *(UINT64 *) (&FibreEx->WWN) = SwapBytes64 (*(UINT64 *) (&FibreEx->WWN)); *(UINT64 *) (&FibreEx->Lun) = SwapBytes64 (*(UINT64 *) (&FibreEx->Lun)); return (EFI_DEVICE_PATH_PROTOCOL *) FibreEx; } /** Converts a text device path node to 1394 device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created 1394 device path structure. 
**/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromText1394 ( IN CHAR16 *TextDeviceNode ) { CHAR16 *GuidStr; F1394_DEVICE_PATH *F1394DevPath; GuidStr = GetNextParamStr (&TextDeviceNode); F1394DevPath = (F1394_DEVICE_PATH *) CreateDeviceNode ( MESSAGING_DEVICE_PATH, MSG_1394_DP, (UINT16) sizeof (F1394_DEVICE_PATH) ); F1394DevPath->Reserved = 0; F1394DevPath->Guid = StrHexToUint64 (GuidStr); return (EFI_DEVICE_PATH_PROTOCOL *) F1394DevPath; } /** Converts a text device path node to USB device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created USB device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextUsb ( IN CHAR16 *TextDeviceNode ) { CHAR16 *PortStr; CHAR16 *InterfaceStr; USB_DEVICE_PATH *Usb; PortStr = GetNextParamStr (&TextDeviceNode); InterfaceStr = GetNextParamStr (&TextDeviceNode); Usb = (USB_DEVICE_PATH *) CreateDeviceNode ( MESSAGING_DEVICE_PATH, MSG_USB_DP, (UINT16) sizeof (USB_DEVICE_PATH) ); Usb->ParentPortNumber = (UINT8) Strtoi (PortStr); Usb->InterfaceNumber = (UINT8) Strtoi (InterfaceStr); return (EFI_DEVICE_PATH_PROTOCOL *) Usb; } /** Converts a text device path node to I2O device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created I2O device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextI2O ( IN CHAR16 *TextDeviceNode ) { CHAR16 *TIDStr; I2O_DEVICE_PATH *I2ODevPath; TIDStr = GetNextParamStr (&TextDeviceNode); I2ODevPath = (I2O_DEVICE_PATH *) CreateDeviceNode ( MESSAGING_DEVICE_PATH, MSG_I2O_DP, (UINT16) sizeof (I2O_DEVICE_PATH) ); I2ODevPath->Tid = (UINT32) Strtoi (TIDStr); return (EFI_DEVICE_PATH_PROTOCOL *) I2ODevPath; } /** Converts a text device path node to InfiniBand device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created InfiniBand device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextInfiniband ( IN CHAR16 *TextDeviceNode ) { CHAR16 *FlagsStr; CHAR16 *GuidStr; CHAR16 *SidStr; CHAR16 *TidStr; CHAR16 *DidStr; INFINIBAND_DEVICE_PATH *InfiniBand; FlagsStr = GetNextParamStr (&TextDeviceNode); GuidStr = GetNextParamStr (&TextDeviceNode); SidStr = GetNextParamStr (&TextDeviceNode); TidStr = GetNextParamStr (&TextDeviceNode); DidStr = GetNextParamStr (&TextDeviceNode); InfiniBand = (INFINIBAND_DEVICE_PATH *) CreateDeviceNode ( MESSAGING_DEVICE_PATH, MSG_INFINIBAND_DP, (UINT16) sizeof (INFINIBAND_DEVICE_PATH) ); InfiniBand->ResourceFlags = (UINT32) Strtoi (FlagsStr); StrToGuid (GuidStr, (EFI_GUID *) InfiniBand->PortGid); Strtoi64 (SidStr, &InfiniBand->ServiceId); Strtoi64 (TidStr, &InfiniBand->TargetPortId); Strtoi64 (DidStr, &InfiniBand->DeviceId); return (EFI_DEVICE_PATH_PROTOCOL *) InfiniBand; } /** Converts a text device path node to Vendor-Defined Messaging device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created Vendor-Defined Messaging device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextVenMsg ( IN CHAR16 *TextDeviceNode ) { return ConvertFromTextVendor ( TextDeviceNode, MESSAGING_DEVICE_PATH, MSG_VENDOR_DP ); } /** Converts a text device path node to Vendor defined PC-ANSI device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created Vendor defined PC-ANSI device path structure.
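The terminal-type shortcuts that follow take no parameters; illustratively, "VenPcAnsi()" simply yields a vendor messaging node carrying gEfiPcAnsiGuid, and the VT100, VT100 Plus, and UTF8 variants behave the same way with their respective GUIDs.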
**/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextVenPcAnsi ( IN CHAR16 *TextDeviceNode ) { VENDOR_DEVICE_PATH *Vendor; Vendor = (VENDOR_DEVICE_PATH *) CreateDeviceNode ( MESSAGING_DEVICE_PATH, MSG_VENDOR_DP, (UINT16) sizeof (VENDOR_DEVICE_PATH)); CopyGuid (&Vendor->Guid, &gEfiPcAnsiGuid); return (EFI_DEVICE_PATH_PROTOCOL *) Vendor; } /** Converts a text device path node to Vendor defined VT100 device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created Vendor defined VT100 device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextVenVt100 ( IN CHAR16 *TextDeviceNode ) { VENDOR_DEVICE_PATH *Vendor; Vendor = (VENDOR_DEVICE_PATH *) CreateDeviceNode ( MESSAGING_DEVICE_PATH, MSG_VENDOR_DP, (UINT16) sizeof (VENDOR_DEVICE_PATH)); CopyGuid (&Vendor->Guid, &gEfiVT100Guid); return (EFI_DEVICE_PATH_PROTOCOL *) Vendor; } /** Converts a text device path node to Vendor defined VT100 Plus device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created Vendor defined VT100 Plus device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextVenVt100Plus ( IN CHAR16 *TextDeviceNode ) { VENDOR_DEVICE_PATH *Vendor; Vendor = (VENDOR_DEVICE_PATH *) CreateDeviceNode ( MESSAGING_DEVICE_PATH, MSG_VENDOR_DP, (UINT16) sizeof (VENDOR_DEVICE_PATH)); CopyGuid (&Vendor->Guid, &gEfiVT100PlusGuid); return (EFI_DEVICE_PATH_PROTOCOL *) Vendor; } /** Converts a text device path node to Vendor defined UTF8 device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created Vendor defined UTF8 device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextVenUtf8 ( IN CHAR16 *TextDeviceNode ) { VENDOR_DEVICE_PATH *Vendor; Vendor = (VENDOR_DEVICE_PATH *) CreateDeviceNode ( MESSAGING_DEVICE_PATH, MSG_VENDOR_DP, (UINT16) sizeof (VENDOR_DEVICE_PATH)); CopyGuid (&Vendor->Guid, &gEfiVTUTF8Guid); return (EFI_DEVICE_PATH_PROTOCOL *) Vendor; } /** Converts a text device path node to UART Flow Control device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created UART Flow Control device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextUartFlowCtrl ( IN CHAR16 *TextDeviceNode ) { CHAR16 *ValueStr; UART_FLOW_CONTROL_DEVICE_PATH *UartFlowControl; ValueStr = GetNextParamStr (&TextDeviceNode); UartFlowControl = (UART_FLOW_CONTROL_DEVICE_PATH *) CreateDeviceNode ( MESSAGING_DEVICE_PATH, MSG_VENDOR_DP, (UINT16) sizeof (UART_FLOW_CONTROL_DEVICE_PATH) ); CopyGuid (&UartFlowControl->Guid, &gEfiUartDevicePathGuid); if (StrCmp (ValueStr, "XonXoff") == 0) { UartFlowControl->FlowControlMap = 2; } else if (StrCmp (ValueStr, "Hardware") == 0) { UartFlowControl->FlowControlMap = 1; } else { UartFlowControl->FlowControlMap = 0; } return (EFI_DEVICE_PATH_PROTOCOL *) UartFlowControl; } /** Converts a text device path node to Serial Attached SCSI device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created Serial Attached SCSI device path structure. 
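Worked example of the topology encoding below (the SAS address and LUN are made up): "SAS(0x31000004CF13F6BD,0x0,0x1,SAS,Internal,Direct,0x0,0x0)" yields DeviceTopology 0x1, since drive bay 0 gives the base value 0x1 and the SAS/Internal/Direct keywords contribute no further bits.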
**/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextSAS ( IN CHAR16 *TextDeviceNode ) { CHAR16 *AddressStr; CHAR16 *LunStr; CHAR16 *RTPStr; CHAR16 *SASSATAStr; CHAR16 *LocationStr; CHAR16 *ConnectStr; CHAR16 *DriveBayStr; CHAR16 *ReservedStr; UINT16 Info; UINT16 Uint16; SAS_DEVICE_PATH *Sas; AddressStr = GetNextParamStr (&TextDeviceNode); LunStr = GetNextParamStr (&TextDeviceNode); RTPStr = GetNextParamStr (&TextDeviceNode); SASSATAStr = GetNextParamStr (&TextDeviceNode); LocationStr = GetNextParamStr (&TextDeviceNode); ConnectStr = GetNextParamStr (&TextDeviceNode); DriveBayStr = GetNextParamStr (&TextDeviceNode); ReservedStr = GetNextParamStr (&TextDeviceNode); Sas = (SAS_DEVICE_PATH *) CreateDeviceNode ( MESSAGING_DEVICE_PATH, MSG_VENDOR_DP, (UINT16) sizeof (SAS_DEVICE_PATH) ); CopyGuid (&Sas->Guid, &gEfiSasDevicePathGuid); Strtoi64 (AddressStr, &Sas->SasAddress); Strtoi64 (LunStr, &Sas->Lun); Sas->RelativeTargetPort = (UINT16) Strtoi (RTPStr); if (StrCmp (SASSATAStr, "NoTopology") == 0) { Info = 0x0; } else if ((StrCmp (SASSATAStr, "SATA") == 0) || (StrCmp (SASSATAStr, "SAS") == 0)) { Uint16 = (UINT16) Strtoi (DriveBayStr); if (Uint16 == 0) { Info = 0x1; } else { Info = (UINT16) (0x2 | ((Uint16 - 1) << 8)); } if (StrCmp (SASSATAStr, "SATA") == 0) { Info |= BIT4; } // // Location is an integer between 0 and 1 or else // the keyword Internal (0) or External (1). // if (StrCmp (LocationStr, "External") == 0) { Uint16 = 1; } else if (StrCmp (LocationStr, "Internal") == 0) { Uint16 = 0; } else { Uint16 = ((UINT16) Strtoi (LocationStr) & BIT0); } Info |= (Uint16 << 5); // // Connect is an integer between 0 and 3 or else // the keyword Direct (0) or Expanded (1). // if (StrCmp (ConnectStr, "Expanded") == 0) { Uint16 = 1; } else if (StrCmp (ConnectStr, "Direct") == 0) { Uint16 = 0; } else { Uint16 = ((UINT16) Strtoi (ConnectStr) & (BIT0 | BIT1)); } Info |= (Uint16 << 6); } else { Info = (UINT16) Strtoi (SASSATAStr); } Sas->DeviceTopology = Info; Sas->Reserved = (UINT32) Strtoi (ReservedStr); return (EFI_DEVICE_PATH_PROTOCOL *) Sas; } /** Converts a text device path node to Serial Attached SCSI Ex device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created Serial Attached SCSI Ex device path structure. 
**/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextSasEx ( IN CHAR16 *TextDeviceNode ) { CHAR16 *AddressStr; CHAR16 *LunStr; CHAR16 *RTPStr; CHAR16 *SASSATAStr; CHAR16 *LocationStr; CHAR16 *ConnectStr; CHAR16 *DriveBayStr; UINT16 Info; UINT16 Uint16; UINT64 SasAddress; UINT64 Lun; SASEX_DEVICE_PATH *SasEx; AddressStr = GetNextParamStr (&TextDeviceNode); LunStr = GetNextParamStr (&TextDeviceNode); RTPStr = GetNextParamStr (&TextDeviceNode); SASSATAStr = GetNextParamStr (&TextDeviceNode); LocationStr = GetNextParamStr (&TextDeviceNode); ConnectStr = GetNextParamStr (&TextDeviceNode); DriveBayStr = GetNextParamStr (&TextDeviceNode); SasEx = (SASEX_DEVICE_PATH *) CreateDeviceNode ( MESSAGING_DEVICE_PATH, MSG_SASEX_DP, (UINT16) sizeof (SASEX_DEVICE_PATH) ); Strtoi64 (AddressStr, &SasAddress); Strtoi64 (LunStr, &Lun); WriteUnaligned64 ((UINT64 *) &SasEx->SasAddress, SwapBytes64 (SasAddress)); WriteUnaligned64 ((UINT64 *) &SasEx->Lun, SwapBytes64 (Lun)); SasEx->RelativeTargetPort = (UINT16) Strtoi (RTPStr); if (StrCmp (SASSATAStr, "NoTopology") == 0) { Info = 0x0; } else if ((StrCmp (SASSATAStr, "SATA") == 0) || (StrCmp (SASSATAStr, "SAS") == 0)) { Uint16 = (UINT16) Strtoi (DriveBayStr); if (Uint16 == 0) { Info = 0x1; } else { Info = (UINT16) (0x2 | ((Uint16 - 1) << 8)); } if (StrCmp (SASSATAStr, "SATA") == 0) { Info |= BIT4; } // // Location is an integer between 0 and 1 or else // the keyword Internal (0) or External (1). // if (StrCmp (LocationStr, "External") == 0) { Uint16 = 1; } else if (StrCmp (LocationStr, "Internal") == 0) { Uint16 = 0; } else { Uint16 = ((UINT16) Strtoi (LocationStr) & BIT0); } Info |= (Uint16 << 5); // // Connect is an integer between 0 and 3 or else // the keyword Direct (0) or Expanded (1). // if (StrCmp (ConnectStr, "Expanded") == 0) { Uint16 = 1; } else if (StrCmp (ConnectStr, "Direct") == 0) { Uint16 = 0; } else { Uint16 = ((UINT16) Strtoi (ConnectStr) & (BIT0 | BIT1)); } Info |= (Uint16 << 6); } else { Info = (UINT16) Strtoi (SASSATAStr); } SasEx->DeviceTopology = Info; return (EFI_DEVICE_PATH_PROTOCOL *) SasEx; } /** Converts a text device path node to NVM Express Namespace device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created NVM Express Namespace device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextNVMe ( IN CHAR16 *TextDeviceNode ) { CHAR16 *NamespaceIdStr; CHAR16 *NamespaceUuidStr; NVME_NAMESPACE_DEVICE_PATH *Nvme; UINT8 *Uuid; UINTN Index; NamespaceIdStr = GetNextParamStr (&TextDeviceNode); NamespaceUuidStr = GetNextParamStr (&TextDeviceNode); Nvme = (NVME_NAMESPACE_DEVICE_PATH *) CreateDeviceNode ( MESSAGING_DEVICE_PATH, MSG_NVME_NAMESPACE_DP, (UINT16) sizeof (NVME_NAMESPACE_DEVICE_PATH) ); Nvme->NamespaceId = (UINT32) Strtoi (NamespaceIdStr); Uuid = (UINT8 *) &Nvme->NamespaceUuid; Index = sizeof (Nvme->NamespaceUuid) / sizeof (UINT8); while (Index-- != 0) { Uuid[Index] = (UINT8) StrHexToUintn (SplitStr (&NamespaceUuidStr, '-')); } return (EFI_DEVICE_PATH_PROTOCOL *) Nvme; } /** Converts a text device path node to UFS device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created UFS device path structure. 
**/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextUfs ( IN CHAR16 *TextDeviceNode ) { CHAR16 *PunStr; CHAR16 *LunStr; UFS_DEVICE_PATH *Ufs; PunStr = GetNextParamStr (&TextDeviceNode); LunStr = GetNextParamStr (&TextDeviceNode); Ufs = (UFS_DEVICE_PATH *) CreateDeviceNode ( MESSAGING_DEVICE_PATH, MSG_UFS_DP, (UINT16) sizeof (UFS_DEVICE_PATH) ); Ufs->Pun = (UINT8) Strtoi (PunStr); Ufs->Lun = (UINT8) Strtoi (LunStr); return (EFI_DEVICE_PATH_PROTOCOL *) Ufs; } /** Converts a text device path node to SD (Secure Digital) device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created SD device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextSd ( IN CHAR16 *TextDeviceNode ) { CHAR16 *SlotNumberStr; SD_DEVICE_PATH *Sd; SlotNumberStr = GetNextParamStr (&TextDeviceNode); Sd = (SD_DEVICE_PATH *) CreateDeviceNode ( MESSAGING_DEVICE_PATH, MSG_SD_DP, (UINT16) sizeof (SD_DEVICE_PATH) ); Sd->SlotNumber = (UINT8) Strtoi (SlotNumberStr); return (EFI_DEVICE_PATH_PROTOCOL *) Sd; } /** Converts a text device path node to EMMC (Embedded MMC) device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created EMMC device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextEmmc ( IN CHAR16 *TextDeviceNode ) { CHAR16 *SlotNumberStr; EMMC_DEVICE_PATH *Emmc; SlotNumberStr = GetNextParamStr (&TextDeviceNode); Emmc = (EMMC_DEVICE_PATH *) CreateDeviceNode ( MESSAGING_DEVICE_PATH, MSG_EMMC_DP, (UINT16) sizeof (EMMC_DEVICE_PATH) ); Emmc->SlotNumber = (UINT8) Strtoi (SlotNumberStr); return (EFI_DEVICE_PATH_PROTOCOL *) Emmc; } /** Converts a text device path node to Debug Port device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created Debug Port device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextDebugPort ( IN CHAR16 *TextDeviceNode ) { VENDOR_DEFINED_MESSAGING_DEVICE_PATH *Vend; Vend = (VENDOR_DEFINED_MESSAGING_DEVICE_PATH *) CreateDeviceNode ( MESSAGING_DEVICE_PATH, MSG_VENDOR_DP, (UINT16) sizeof (VENDOR_DEFINED_MESSAGING_DEVICE_PATH) ); CopyGuid (&Vend->Guid, &gEfiDebugPortProtocolGuid); return (EFI_DEVICE_PATH_PROTOCOL *) Vend; } /** Converts a text device path node to MAC device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created MAC device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextMAC ( IN CHAR16 *TextDeviceNode ) { CHAR16 *AddressStr; CHAR16 *IfTypeStr; UINTN Length; MAC_ADDR_DEVICE_PATH *MACDevPath; AddressStr = GetNextParamStr (&TextDeviceNode); IfTypeStr = GetNextParamStr (&TextDeviceNode); MACDevPath = (MAC_ADDR_DEVICE_PATH *) CreateDeviceNode ( MESSAGING_DEVICE_PATH, MSG_MAC_ADDR_DP, (UINT16) sizeof (MAC_ADDR_DEVICE_PATH) ); MACDevPath->IfType = (UINT8) Strtoi (IfTypeStr); Length = sizeof (EFI_MAC_ADDRESS); StrHexToBytes (AddressStr, Length * 2, MACDevPath->MacAddress.Addr, Length); return (EFI_DEVICE_PATH_PROTOCOL *) MACDevPath; } /** Converts a text format to the network protocol ID. @param Text String of protocol field. @return Network protocol ID . **/ static UINTN NetworkProtocolFromText ( IN CHAR16 *Text ) { if (StrCmp (Text, "UDP") == 0) { return RFC_1700_UDP_PROTOCOL; } if (StrCmp (Text, "TCP") == 0) { return RFC_1700_TCP_PROTOCOL; } return Strtoi (Text); } /** Converts a text device path node to IPV4 device path structure. @param TextDeviceNode The input Text device path node. 
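Note that on FreeBSD the StrToIpv4Address and StrToIpv6Address calls used here and in the IPv6 routine are stubbed out to nothing (see the XXX STUBS defines near the top of this file), so the address fields of the resulting nodes are not actually filled in from the text yet.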
@return A pointer to the newly-created IPV4 device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextIPv4 ( IN CHAR16 *TextDeviceNode ) { CHAR16 *RemoteIPStr; CHAR16 *ProtocolStr; CHAR16 *TypeStr; CHAR16 *LocalIPStr; CHAR16 *GatewayIPStr; CHAR16 *SubnetMaskStr; IPv4_DEVICE_PATH *IPv4; RemoteIPStr = GetNextParamStr (&TextDeviceNode); ProtocolStr = GetNextParamStr (&TextDeviceNode); TypeStr = GetNextParamStr (&TextDeviceNode); LocalIPStr = GetNextParamStr (&TextDeviceNode); GatewayIPStr = GetNextParamStr (&TextDeviceNode); SubnetMaskStr = GetNextParamStr (&TextDeviceNode); IPv4 = (IPv4_DEVICE_PATH *) CreateDeviceNode ( MESSAGING_DEVICE_PATH, MSG_IPv4_DP, (UINT16) sizeof (IPv4_DEVICE_PATH) ); StrToIpv4Address (RemoteIPStr, NULL, &IPv4->RemoteIpAddress, NULL); IPv4->Protocol = (UINT16) NetworkProtocolFromText (ProtocolStr); if (StrCmp (TypeStr, "Static") == 0) { IPv4->StaticIpAddress = TRUE; } else { IPv4->StaticIpAddress = FALSE; } StrToIpv4Address (LocalIPStr, NULL, &IPv4->LocalIpAddress, NULL); if (!IS_NULL (*GatewayIPStr) && !IS_NULL (*SubnetMaskStr)) { StrToIpv4Address (GatewayIPStr, NULL, &IPv4->GatewayIpAddress, NULL); StrToIpv4Address (SubnetMaskStr, NULL, &IPv4->SubnetMask, NULL); } else { ZeroMem (&IPv4->GatewayIpAddress, sizeof (IPv4->GatewayIpAddress)); ZeroMem (&IPv4->SubnetMask, sizeof (IPv4->SubnetMask)); } IPv4->LocalPort = 0; IPv4->RemotePort = 0; return (EFI_DEVICE_PATH_PROTOCOL *) IPv4; } /** Converts a text device path node to IPV6 device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created IPV6 device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextIPv6 ( IN CHAR16 *TextDeviceNode ) { CHAR16 *RemoteIPStr; CHAR16 *ProtocolStr; CHAR16 *TypeStr; CHAR16 *LocalIPStr; CHAR16 *GatewayIPStr; CHAR16 *PrefixLengthStr; IPv6_DEVICE_PATH *IPv6; RemoteIPStr = GetNextParamStr (&TextDeviceNode); ProtocolStr = GetNextParamStr (&TextDeviceNode); TypeStr = GetNextParamStr (&TextDeviceNode); LocalIPStr = GetNextParamStr (&TextDeviceNode); PrefixLengthStr = GetNextParamStr (&TextDeviceNode); GatewayIPStr = GetNextParamStr (&TextDeviceNode); IPv6 = (IPv6_DEVICE_PATH *) CreateDeviceNode ( MESSAGING_DEVICE_PATH, MSG_IPv6_DP, (UINT16) sizeof (IPv6_DEVICE_PATH) ); StrToIpv6Address (RemoteIPStr, NULL, &IPv6->RemoteIpAddress, NULL); IPv6->Protocol = (UINT16) NetworkProtocolFromText (ProtocolStr); if (StrCmp (TypeStr, "Static") == 0) { IPv6->IpAddressOrigin = 0; } else if (StrCmp (TypeStr, "StatelessAutoConfigure") == 0) { IPv6->IpAddressOrigin = 1; } else { IPv6->IpAddressOrigin = 2; } StrToIpv6Address (LocalIPStr, NULL, &IPv6->LocalIpAddress, NULL); if (!IS_NULL (*GatewayIPStr) && !IS_NULL (*PrefixLengthStr)) { StrToIpv6Address (GatewayIPStr, NULL, &IPv6->GatewayIpAddress, NULL); IPv6->PrefixLength = (UINT8) Strtoi (PrefixLengthStr); } else { ZeroMem (&IPv6->GatewayIpAddress, sizeof (IPv6->GatewayIpAddress)); IPv6->PrefixLength = 0; } IPv6->LocalPort = 0; IPv6->RemotePort = 0; return (EFI_DEVICE_PATH_PROTOCOL *) IPv6; } /** Converts a text device path node to UART device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created UART device path structure. 
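Illustrative example: "Uart(115200,8,N,1)" sets BaudRate 115200, DataBits 8, Parity 1 (the 'N' no-parity keyword), and StopBits 1; a baud rate given as "DEFAULT" is likewise taken as 115200.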
**/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextUart ( IN CHAR16 *TextDeviceNode ) { CHAR16 *BaudStr; CHAR16 *DataBitsStr; CHAR16 *ParityStr; CHAR16 *StopBitsStr; UART_DEVICE_PATH *Uart; BaudStr = GetNextParamStr (&TextDeviceNode); DataBitsStr = GetNextParamStr (&TextDeviceNode); ParityStr = GetNextParamStr (&TextDeviceNode); StopBitsStr = GetNextParamStr (&TextDeviceNode); Uart = (UART_DEVICE_PATH *) CreateDeviceNode ( MESSAGING_DEVICE_PATH, MSG_UART_DP, (UINT16) sizeof (UART_DEVICE_PATH) ); if (StrCmp (BaudStr, "DEFAULT") == 0) { Uart->BaudRate = 115200; } else { Strtoi64 (BaudStr, &Uart->BaudRate); } Uart->DataBits = (UINT8) ((StrCmp (DataBitsStr, "DEFAULT") == 0) ? 8 : Strtoi (DataBitsStr)); switch (*ParityStr) { case 'D': Uart->Parity = 0; break; case 'N': Uart->Parity = 1; break; case 'E': Uart->Parity = 2; break; case 'O': Uart->Parity = 3; break; case 'M': Uart->Parity = 4; break; case 'S': Uart->Parity = 5; break; default: Uart->Parity = (UINT8) Strtoi (ParityStr); break; } if (StrCmp (StopBitsStr, "D") == 0) { Uart->StopBits = (UINT8) 0; } else if (StrCmp (StopBitsStr, "1") == 0) { Uart->StopBits = (UINT8) 1; } else if (StrCmp (StopBitsStr, "1.5") == 0) { Uart->StopBits = (UINT8) 2; } else if (StrCmp (StopBitsStr, "2") == 0) { Uart->StopBits = (UINT8) 3; } else { Uart->StopBits = (UINT8) Strtoi (StopBitsStr); } return (EFI_DEVICE_PATH_PROTOCOL *) Uart; } /** Converts a text device path node to USB class device path structure. @param TextDeviceNode The input Text device path node. @param UsbClassText A pointer to USB_CLASS_TEXT structure to be integrated to USB Class Text. @return A pointer to the newly-created USB class device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * ConvertFromTextUsbClass ( IN CHAR16 *TextDeviceNode, IN USB_CLASS_TEXT *UsbClassText ) { CHAR16 *VIDStr; CHAR16 *PIDStr; CHAR16 *ClassStr; CHAR16 *SubClassStr; CHAR16 *ProtocolStr; USB_CLASS_DEVICE_PATH *UsbClass; UsbClass = (USB_CLASS_DEVICE_PATH *) CreateDeviceNode ( MESSAGING_DEVICE_PATH, MSG_USB_CLASS_DP, (UINT16) sizeof (USB_CLASS_DEVICE_PATH) ); VIDStr = GetNextParamStr (&TextDeviceNode); PIDStr = GetNextParamStr (&TextDeviceNode); if (UsbClassText->ClassExist) { ClassStr = GetNextParamStr (&TextDeviceNode); UsbClass->DeviceClass = (UINT8) Strtoi (ClassStr); } else { UsbClass->DeviceClass = UsbClassText->Class; } if (UsbClassText->SubClassExist) { SubClassStr = GetNextParamStr (&TextDeviceNode); UsbClass->DeviceSubClass = (UINT8) Strtoi (SubClassStr); } else { UsbClass->DeviceSubClass = UsbClassText->SubClass; } ProtocolStr = GetNextParamStr (&TextDeviceNode); UsbClass->VendorId = (UINT16) Strtoi (VIDStr); UsbClass->ProductId = (UINT16) Strtoi (PIDStr); UsbClass->DeviceProtocol = (UINT8) Strtoi (ProtocolStr); return (EFI_DEVICE_PATH_PROTOCOL *) UsbClass; } /** Converts a text device path node to USB class device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created USB class device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextUsbClass ( IN CHAR16 *TextDeviceNode ) { USB_CLASS_TEXT UsbClassText; UsbClassText.ClassExist = TRUE; UsbClassText.SubClassExist = TRUE; return ConvertFromTextUsbClass (TextDeviceNode, &UsbClassText); } /** Converts a text device path node to USB audio device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created USB audio device path structure. 
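Illustrative example (made-up vendor and product IDs): "UsbAudio(0x8086,0x1234,0x1,0x0)" fixes DeviceClass at USB_CLASS_AUDIO while the vendor ID, product ID, subclass, and protocol are read from the text, since ClassExist is FALSE and SubClassExist is TRUE here.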
**/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextUsbAudio ( IN CHAR16 *TextDeviceNode ) { USB_CLASS_TEXT UsbClassText; UsbClassText.ClassExist = FALSE; UsbClassText.Class = USB_CLASS_AUDIO; UsbClassText.SubClassExist = TRUE; return ConvertFromTextUsbClass (TextDeviceNode, &UsbClassText); } /** Converts a text device path node to USB CDC Control device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created USB CDC Control device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextUsbCDCControl ( IN CHAR16 *TextDeviceNode ) { USB_CLASS_TEXT UsbClassText; UsbClassText.ClassExist = FALSE; UsbClassText.Class = USB_CLASS_CDCCONTROL; UsbClassText.SubClassExist = TRUE; return ConvertFromTextUsbClass (TextDeviceNode, &UsbClassText); } /** Converts a text device path node to USB HID device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created USB HID device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextUsbHID ( IN CHAR16 *TextDeviceNode ) { USB_CLASS_TEXT UsbClassText; UsbClassText.ClassExist = FALSE; UsbClassText.Class = USB_CLASS_HID; UsbClassText.SubClassExist = TRUE; return ConvertFromTextUsbClass (TextDeviceNode, &UsbClassText); } /** Converts a text device path node to USB Image device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created USB Image device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextUsbImage ( IN CHAR16 *TextDeviceNode ) { USB_CLASS_TEXT UsbClassText; UsbClassText.ClassExist = FALSE; UsbClassText.Class = USB_CLASS_IMAGE; UsbClassText.SubClassExist = TRUE; return ConvertFromTextUsbClass (TextDeviceNode, &UsbClassText); } /** Converts a text device path node to USB Print device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created USB Print device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextUsbPrinter ( IN CHAR16 *TextDeviceNode ) { USB_CLASS_TEXT UsbClassText; UsbClassText.ClassExist = FALSE; UsbClassText.Class = USB_CLASS_PRINTER; UsbClassText.SubClassExist = TRUE; return ConvertFromTextUsbClass (TextDeviceNode, &UsbClassText); } /** Converts a text device path node to USB mass storage device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created USB mass storage device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextUsbMassStorage ( IN CHAR16 *TextDeviceNode ) { USB_CLASS_TEXT UsbClassText; UsbClassText.ClassExist = FALSE; UsbClassText.Class = USB_CLASS_MASS_STORAGE; UsbClassText.SubClassExist = TRUE; return ConvertFromTextUsbClass (TextDeviceNode, &UsbClassText); } /** Converts a text device path node to USB HUB device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created USB HUB device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextUsbHub ( IN CHAR16 *TextDeviceNode ) { USB_CLASS_TEXT UsbClassText; UsbClassText.ClassExist = FALSE; UsbClassText.Class = USB_CLASS_HUB; UsbClassText.SubClassExist = TRUE; return ConvertFromTextUsbClass (TextDeviceNode, &UsbClassText); } /** Converts a text device path node to USB CDC data device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created USB CDC data device path structure. 
**/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextUsbCDCData ( IN CHAR16 *TextDeviceNode ) { USB_CLASS_TEXT UsbClassText; UsbClassText.ClassExist = FALSE; UsbClassText.Class = USB_CLASS_CDCDATA; UsbClassText.SubClassExist = TRUE; return ConvertFromTextUsbClass (TextDeviceNode, &UsbClassText); } /** Converts a text device path node to USB smart card device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created USB smart card device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextUsbSmartCard ( IN CHAR16 *TextDeviceNode ) { USB_CLASS_TEXT UsbClassText; UsbClassText.ClassExist = FALSE; UsbClassText.Class = USB_CLASS_SMART_CARD; UsbClassText.SubClassExist = TRUE; return ConvertFromTextUsbClass (TextDeviceNode, &UsbClassText); } /** Converts a text device path node to USB video device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created USB video device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextUsbVideo ( IN CHAR16 *TextDeviceNode ) { USB_CLASS_TEXT UsbClassText; UsbClassText.ClassExist = FALSE; UsbClassText.Class = USB_CLASS_VIDEO; UsbClassText.SubClassExist = TRUE; return ConvertFromTextUsbClass (TextDeviceNode, &UsbClassText); } /** Converts a text device path node to USB diagnostic device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created USB diagnostic device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextUsbDiagnostic ( IN CHAR16 *TextDeviceNode ) { USB_CLASS_TEXT UsbClassText; UsbClassText.ClassExist = FALSE; UsbClassText.Class = USB_CLASS_DIAGNOSTIC; UsbClassText.SubClassExist = TRUE; return ConvertFromTextUsbClass (TextDeviceNode, &UsbClassText); } /** Converts a text device path node to USB wireless device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created USB wireless device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextUsbWireless ( IN CHAR16 *TextDeviceNode ) { USB_CLASS_TEXT UsbClassText; UsbClassText.ClassExist = FALSE; UsbClassText.Class = USB_CLASS_WIRELESS; UsbClassText.SubClassExist = TRUE; return ConvertFromTextUsbClass (TextDeviceNode, &UsbClassText); } /** Converts a text device path node to USB device firmware update device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created USB device firmware update device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextUsbDeviceFirmwareUpdate ( IN CHAR16 *TextDeviceNode ) { USB_CLASS_TEXT UsbClassText; UsbClassText.ClassExist = FALSE; UsbClassText.Class = USB_CLASS_RESERVE; UsbClassText.SubClassExist = FALSE; UsbClassText.SubClass = USB_SUBCLASS_FW_UPDATE; return ConvertFromTextUsbClass (TextDeviceNode, &UsbClassText); } /** Converts a text device path node to USB IRDA bridge device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created USB IRDA bridge device path structure. 
**/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextUsbIrdaBridge ( IN CHAR16 *TextDeviceNode ) { USB_CLASS_TEXT UsbClassText; UsbClassText.ClassExist = FALSE; UsbClassText.Class = USB_CLASS_RESERVE; UsbClassText.SubClassExist = FALSE; UsbClassText.SubClass = USB_SUBCLASS_IRDA_BRIDGE; return ConvertFromTextUsbClass (TextDeviceNode, &UsbClassText); } /** Converts a text device path node to USB test and measurement device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created USB test and measurement device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextUsbTestAndMeasurement ( IN CHAR16 *TextDeviceNode ) { USB_CLASS_TEXT UsbClassText; UsbClassText.ClassExist = FALSE; UsbClassText.Class = USB_CLASS_RESERVE; UsbClassText.SubClassExist = FALSE; UsbClassText.SubClass = USB_SUBCLASS_TEST; return ConvertFromTextUsbClass (TextDeviceNode, &UsbClassText); } /** Converts a text device path node to USB WWID device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created USB WWID device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextUsbWwid ( IN CHAR16 *TextDeviceNode ) { CHAR16 *VIDStr; CHAR16 *PIDStr; CHAR16 *InterfaceNumStr; CHAR16 *SerialNumberStr; USB_WWID_DEVICE_PATH *UsbWwid; UINTN SerialNumberStrLen; VIDStr = GetNextParamStr (&TextDeviceNode); PIDStr = GetNextParamStr (&TextDeviceNode); InterfaceNumStr = GetNextParamStr (&TextDeviceNode); SerialNumberStr = GetNextParamStr (&TextDeviceNode); SerialNumberStrLen = StrLen (SerialNumberStr); if (SerialNumberStrLen >= 2 && SerialNumberStr[0] == '\"' && SerialNumberStr[SerialNumberStrLen - 1] == '\"' ) { SerialNumberStr[SerialNumberStrLen - 1] = '\0'; SerialNumberStr++; SerialNumberStrLen -= 2; } UsbWwid = (USB_WWID_DEVICE_PATH *) CreateDeviceNode ( MESSAGING_DEVICE_PATH, MSG_USB_WWID_DP, (UINT16) (sizeof (USB_WWID_DEVICE_PATH) + SerialNumberStrLen * sizeof (CHAR16)) ); UsbWwid->VendorId = (UINT16) Strtoi (VIDStr); UsbWwid->ProductId = (UINT16) Strtoi (PIDStr); UsbWwid->InterfaceNumber = (UINT16) Strtoi (InterfaceNumStr); // // There is no memory allocated in UsbWwid for the '\0' in SerialNumberStr. // Therefore, the '\0' will not be copied. // CopyMem ( (UINT8 *) UsbWwid + sizeof (USB_WWID_DEVICE_PATH), SerialNumberStr, SerialNumberStrLen * sizeof (CHAR16) ); return (EFI_DEVICE_PATH_PROTOCOL *) UsbWwid; } /** Converts a text device path node to Logical Unit device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created Logical Unit device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextUnit ( IN CHAR16 *TextDeviceNode ) { CHAR16 *LunStr; DEVICE_LOGICAL_UNIT_DEVICE_PATH *LogicalUnit; LunStr = GetNextParamStr (&TextDeviceNode); LogicalUnit = (DEVICE_LOGICAL_UNIT_DEVICE_PATH *) CreateDeviceNode ( MESSAGING_DEVICE_PATH, MSG_DEVICE_LOGICAL_UNIT_DP, (UINT16) sizeof (DEVICE_LOGICAL_UNIT_DEVICE_PATH) ); LogicalUnit->Lun = (UINT8) Strtoi (LunStr); return (EFI_DEVICE_PATH_PROTOCOL *) LogicalUnit; } /** Converts a text device path node to iSCSI device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created iSCSI device path structure.
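**/
/*
 * Illustrative sketch (editor's addition, not part of the original source):
 * DevPathFromTextUsbWwid() above strips an optional pair of double quotes
 * around the serial number before appending it, unterminated, to the
 * variable-length node.  IDs and serial are placeholders.
 */
#if 0	/* example only; not compiled into the library */
static ssize_t
example_usb_wwid(uint8_t *buf, size_t max)
{
	/* vendor-id, product-id, interface, serial-number */
	char txt[] = "UsbWwid(0x1234,0x5678,0,\"SN000042\")";

	return (efidp_parse_device_path(txt, (efidp)buf, max));
}
#endif
/**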
**/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextiSCSI ( IN CHAR16 *TextDeviceNode ) { UINT16 Options; CHAR16 *NameStr; CHAR16 *PortalGroupStr; CHAR16 *LunStr; CHAR16 *HeaderDigestStr; CHAR16 *DataDigestStr; CHAR16 *AuthenticationStr; CHAR16 *ProtocolStr; CHAR8 *AsciiStr; ISCSI_DEVICE_PATH_WITH_NAME *ISCSIDevPath; NameStr = GetNextParamStr (&TextDeviceNode); PortalGroupStr = GetNextParamStr (&TextDeviceNode); LunStr = GetNextParamStr (&TextDeviceNode); HeaderDigestStr = GetNextParamStr (&TextDeviceNode); DataDigestStr = GetNextParamStr (&TextDeviceNode); AuthenticationStr = GetNextParamStr (&TextDeviceNode); ProtocolStr = GetNextParamStr (&TextDeviceNode); ISCSIDevPath = (ISCSI_DEVICE_PATH_WITH_NAME *) CreateDeviceNode ( MESSAGING_DEVICE_PATH, MSG_ISCSI_DP, (UINT16) (sizeof (ISCSI_DEVICE_PATH_WITH_NAME) + StrLen (NameStr)) ); AsciiStr = ISCSIDevPath->TargetName; StrToAscii (NameStr, &AsciiStr); ISCSIDevPath->TargetPortalGroupTag = (UINT16) Strtoi (PortalGroupStr); Strtoi64 (LunStr, &ISCSIDevPath->Lun); Options = 0x0000; if (StrCmp (HeaderDigestStr, "CRC32C") == 0) { Options |= 0x0002; } if (StrCmp (DataDigestStr, "CRC32C") == 0) { Options |= 0x0008; } if (StrCmp (AuthenticationStr, "None") == 0) { Options |= 0x0800; } if (StrCmp (AuthenticationStr, "CHAP_UNI") == 0) { Options |= 0x1000; } ISCSIDevPath->LoginOption = (UINT16) Options; ISCSIDevPath->NetworkProtocol = (UINT16) StrCmp (ProtocolStr, "TCP"); return (EFI_DEVICE_PATH_PROTOCOL *) ISCSIDevPath; } /** Converts a text device path node to VLAN device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created VLAN device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextVlan ( IN CHAR16 *TextDeviceNode ) { CHAR16 *VlanStr; VLAN_DEVICE_PATH *Vlan; VlanStr = GetNextParamStr (&TextDeviceNode); Vlan = (VLAN_DEVICE_PATH *) CreateDeviceNode ( MESSAGING_DEVICE_PATH, MSG_VLAN_DP, (UINT16) sizeof (VLAN_DEVICE_PATH) ); Vlan->VlanId = (UINT16) Strtoi (VlanStr); return (EFI_DEVICE_PATH_PROTOCOL *) Vlan; } /** Converts a text device path node to Bluetooth device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created Bluetooth device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextBluetooth ( IN CHAR16 *TextDeviceNode ) { CHAR16 *BluetoothStr; CHAR16 *Walker; CHAR16 *TempNumBuffer; UINTN TempBufferSize; INT32 Index; BLUETOOTH_DEVICE_PATH *BluetoothDp; BluetoothStr = GetNextParamStr (&TextDeviceNode); BluetoothDp = (BLUETOOTH_DEVICE_PATH *) CreateDeviceNode ( MESSAGING_DEVICE_PATH, MSG_BLUETOOTH_DP, (UINT16) sizeof (BLUETOOTH_DEVICE_PATH) ); Index = sizeof (BLUETOOTH_ADDRESS) - 1; Walker = BluetoothStr; while (!IS_NULL(*Walker) && Index >= 0) { TempBufferSize = 2 * sizeof(CHAR16) + StrSize("0x"); TempNumBuffer = AllocateZeroPool (TempBufferSize); if (TempNumBuffer == NULL) { break; } StrCpyS (TempNumBuffer, TempBufferSize / sizeof (CHAR16), "0x"); StrnCatS (TempNumBuffer, TempBufferSize / sizeof (CHAR16), Walker, 2); BluetoothDp->BD_ADDR.Address[Index] = (UINT8)Strtoi (TempNumBuffer); FreePool (TempNumBuffer); Walker += 2; Index--; } return (EFI_DEVICE_PATH_PROTOCOL *) BluetoothDp; } /** Converts a text device path node to Wi-Fi device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created Wi-Fi device path structure. 
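**/
/*
 * Illustrative sketch (editor's addition, not part of the original source):
 * the keyword-to-bit mapping DevPathFromTextiSCSI() above applies to
 * LoginOption: HeaderDigest "CRC32C" -> 0x0002, DataDigest "CRC32C" ->
 * 0x0008, Authentication "None" -> 0x0800, "CHAP_UNI" -> 0x1000.  The
 * target name and LUN below are placeholders.
 */
#if 0	/* example only; not compiled into the library */
static ssize_t
example_iscsi(uint8_t *buf, size_t max)
{
	/* name, portal group, LUN, header digest, data digest, auth, protocol */
	char txt[] = "iSCSI(iqn.2017-01.org.example:target0,1,0x0,"
	    "CRC32C,CRC32C,None,TCP)";

	return (efidp_parse_device_path(txt, (efidp)buf, max));
}
#endif
/**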
**/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextWiFi ( IN CHAR16 *TextDeviceNode ) { CHAR16 *SSIdStr; CHAR8 AsciiStr[33]; UINTN DataLen; WIFI_DEVICE_PATH *WiFiDp; SSIdStr = GetNextParamStr (&TextDeviceNode); WiFiDp = (WIFI_DEVICE_PATH *) CreateDeviceNode ( MESSAGING_DEVICE_PATH, MSG_WIFI_DP, (UINT16) sizeof (WIFI_DEVICE_PATH) ); if (NULL != SSIdStr) { DataLen = StrLen (SSIdStr); if (StrLen (SSIdStr) > 32) { SSIdStr[32] = '\0'; DataLen = 32; } UnicodeStrToAsciiStrS (SSIdStr, AsciiStr, sizeof (AsciiStr)); CopyMem (WiFiDp->SSId, AsciiStr, DataLen); } return (EFI_DEVICE_PATH_PROTOCOL *) WiFiDp; } /** Converts a text device path node to URI device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created URI device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextUri ( IN CHAR16 *TextDeviceNode ) { CHAR16 *UriStr; UINTN UriLength; URI_DEVICE_PATH *Uri; UriStr = GetNextParamStr (&TextDeviceNode); UriLength = StrnLenS (UriStr, MAX_UINT16 - sizeof (URI_DEVICE_PATH)); Uri = (URI_DEVICE_PATH *) CreateDeviceNode ( MESSAGING_DEVICE_PATH, MSG_URI_DP, (UINT16) (sizeof (URI_DEVICE_PATH) + UriLength) ); while (UriLength-- != 0) { Uri->Uri[UriLength] = (CHAR8) UriStr[UriLength]; } return (EFI_DEVICE_PATH_PROTOCOL *) Uri; } /** Converts a media text device path node to media device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to media device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextMediaPath ( IN CHAR16 *TextDeviceNode ) { return DevPathFromTextGenericPath (MEDIA_DEVICE_PATH, TextDeviceNode); } /** Converts a text device path node to HD device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created HD device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextHD ( IN CHAR16 *TextDeviceNode ) { CHAR16 *PartitionStr; CHAR16 *TypeStr; CHAR16 *SignatureStr; CHAR16 *StartStr; CHAR16 *SizeStr; UINT32 Signature32; HARDDRIVE_DEVICE_PATH *Hd; PartitionStr = GetNextParamStr (&TextDeviceNode); TypeStr = GetNextParamStr (&TextDeviceNode); SignatureStr = GetNextParamStr (&TextDeviceNode); StartStr = GetNextParamStr (&TextDeviceNode); SizeStr = GetNextParamStr (&TextDeviceNode); Hd = (HARDDRIVE_DEVICE_PATH *) CreateDeviceNode ( MEDIA_DEVICE_PATH, MEDIA_HARDDRIVE_DP, (UINT16) sizeof (HARDDRIVE_DEVICE_PATH) ); Hd->PartitionNumber = (UINT32) Strtoi (PartitionStr); ZeroMem (Hd->Signature, 16); Hd->MBRType = (UINT8) 0; if (StrCmp (TypeStr, "MBR") == 0) { Hd->SignatureType = SIGNATURE_TYPE_MBR; Hd->MBRType = 0x01; Signature32 = (UINT32) Strtoi (SignatureStr); CopyMem (Hd->Signature, &Signature32, sizeof (UINT32)); } else if (StrCmp (TypeStr, "GPT") == 0) { Hd->SignatureType = SIGNATURE_TYPE_GUID; Hd->MBRType = 0x02; StrToGuid (SignatureStr, (EFI_GUID *) Hd->Signature); } else { Hd->SignatureType = (UINT8) Strtoi (TypeStr); } Strtoi64 (StartStr, &Hd->PartitionStart); Strtoi64 (SizeStr, &Hd->PartitionSize); return (EFI_DEVICE_PATH_PROTOCOL *) Hd; } /** Converts a text device path node to CDROM device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created CDROM device path structure. 
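**/
/*
 * Illustrative sketch (editor's addition, not part of the original source):
 * the two signature forms DevPathFromTextHD() above accepts.  An MBR
 * signature is a 32-bit integer; a GPT signature is the unique partition
 * GUID.  All values here are placeholders.
 */
#if 0	/* example only; not compiled into the library */
static void
example_hd_nodes(uint8_t *buf, size_t max)
{
	/* partition, type, signature, start LBA, size in sectors */
	char mbr[] = "HD(1,MBR,0x12345678,0x3F,0x100000)";
	char gpt[] = "HD(1,GPT,12345678-1234-1234-1234-123456789abc,"
	    "0x800,0x100000)";

	(void)efidp_parse_device_path(mbr, (efidp)buf, max);
	(void)efidp_parse_device_path(gpt, (efidp)buf, max);
}
#endif
/**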
**/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextCDROM ( IN CHAR16 *TextDeviceNode ) { CHAR16 *EntryStr; CHAR16 *StartStr; CHAR16 *SizeStr; CDROM_DEVICE_PATH *CDROMDevPath; EntryStr = GetNextParamStr (&TextDeviceNode); StartStr = GetNextParamStr (&TextDeviceNode); SizeStr = GetNextParamStr (&TextDeviceNode); CDROMDevPath = (CDROM_DEVICE_PATH *) CreateDeviceNode ( MEDIA_DEVICE_PATH, MEDIA_CDROM_DP, (UINT16) sizeof (CDROM_DEVICE_PATH) ); CDROMDevPath->BootEntry = (UINT32) Strtoi (EntryStr); Strtoi64 (StartStr, &CDROMDevPath->PartitionStart); Strtoi64 (SizeStr, &CDROMDevPath->PartitionSize); return (EFI_DEVICE_PATH_PROTOCOL *) CDROMDevPath; } /** Converts a text device path node to Vendor-defined media device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created Vendor-defined media device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextVenMedia ( IN CHAR16 *TextDeviceNode ) { return ConvertFromTextVendor ( TextDeviceNode, MEDIA_DEVICE_PATH, MEDIA_VENDOR_DP ); } /** Converts a text device path node to File device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created File device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextFilePath ( IN CHAR16 *TextDeviceNode ) { FILEPATH_DEVICE_PATH *File; #ifndef __FreeBSD__ File = (FILEPATH_DEVICE_PATH *) CreateDeviceNode ( MEDIA_DEVICE_PATH, MEDIA_FILEPATH_DP, (UINT16) (sizeof (FILEPATH_DEVICE_PATH) + StrLen (TextDeviceNode) * 2) ); StrCpyS (File->PathName, StrLen (TextDeviceNode) + 1, TextDeviceNode); #else size_t len = (sizeof (FILEPATH_DEVICE_PATH) + StrLen (TextDeviceNode) * 2); efi_char * v; File = (FILEPATH_DEVICE_PATH *) CreateDeviceNode ( MEDIA_DEVICE_PATH, MEDIA_FILEPATH_DP, (UINT16)len ); v = File->PathName; utf8_to_ucs2(TextDeviceNode, &v, &len); #endif return (EFI_DEVICE_PATH_PROTOCOL *) File; } /** Converts a text device path node to Media protocol device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created Media protocol device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextMedia ( IN CHAR16 *TextDeviceNode ) { CHAR16 *GuidStr; MEDIA_PROTOCOL_DEVICE_PATH *Media; GuidStr = GetNextParamStr (&TextDeviceNode); Media = (MEDIA_PROTOCOL_DEVICE_PATH *) CreateDeviceNode ( MEDIA_DEVICE_PATH, MEDIA_PROTOCOL_DP, (UINT16) sizeof (MEDIA_PROTOCOL_DEVICE_PATH) ); StrToGuid (GuidStr, &Media->Protocol); return (EFI_DEVICE_PATH_PROTOCOL *) Media; } /** Converts a text device path node to firmware volume device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created firmware volume device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextFv ( IN CHAR16 *TextDeviceNode ) { CHAR16 *GuidStr; MEDIA_FW_VOL_DEVICE_PATH *Fv; GuidStr = GetNextParamStr (&TextDeviceNode); Fv = (MEDIA_FW_VOL_DEVICE_PATH *) CreateDeviceNode ( MEDIA_DEVICE_PATH, MEDIA_PIWG_FW_VOL_DP, (UINT16) sizeof (MEDIA_FW_VOL_DEVICE_PATH) ); StrToGuid (GuidStr, &Fv->FvName); return (EFI_DEVICE_PATH_PROTOCOL *) Fv; } /** Converts a text device path node to firmware file device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created firmware file device path structure. 
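**/
/*
 * Illustrative sketch (editor's addition, not part of the original source):
 * a File() node as consumed by DevPathFromTextFilePath() above.  On FreeBSD
 * the text arrives as UTF-8 and is converted to UCS-2 by utf8_to_ucs2(), so
 * the node is sized for the worst case of one CHAR16 per input character.
 * The path is a placeholder.
 */
#if 0	/* example only; not compiled into the library */
static ssize_t
example_file_node(uint8_t *buf, size_t max)
{
	char txt[] = "File(\\EFI\\BOOT\\BOOTX64.EFI)";

	return (efidp_parse_device_path(txt, (efidp)buf, max));
}
#endif
/**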
**/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextFvFile ( IN CHAR16 *TextDeviceNode ) { CHAR16 *GuidStr; MEDIA_FW_VOL_FILEPATH_DEVICE_PATH *FvFile; GuidStr = GetNextParamStr (&TextDeviceNode); FvFile = (MEDIA_FW_VOL_FILEPATH_DEVICE_PATH *) CreateDeviceNode ( MEDIA_DEVICE_PATH, MEDIA_PIWG_FW_FILE_DP, (UINT16) sizeof (MEDIA_FW_VOL_FILEPATH_DEVICE_PATH) ); StrToGuid (GuidStr, &FvFile->FvFileName); return (EFI_DEVICE_PATH_PROTOCOL *) FvFile; } /** Converts a text device path node to text relative offset device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created Text device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextRelativeOffsetRange ( IN CHAR16 *TextDeviceNode ) { CHAR16 *StartingOffsetStr; CHAR16 *EndingOffsetStr; MEDIA_RELATIVE_OFFSET_RANGE_DEVICE_PATH *Offset; StartingOffsetStr = GetNextParamStr (&TextDeviceNode); EndingOffsetStr = GetNextParamStr (&TextDeviceNode); Offset = (MEDIA_RELATIVE_OFFSET_RANGE_DEVICE_PATH *) CreateDeviceNode ( MEDIA_DEVICE_PATH, MEDIA_RELATIVE_OFFSET_RANGE_DP, (UINT16) sizeof (MEDIA_RELATIVE_OFFSET_RANGE_DEVICE_PATH) ); Strtoi64 (StartingOffsetStr, &Offset->StartingOffset); Strtoi64 (EndingOffsetStr, &Offset->EndingOffset); return (EFI_DEVICE_PATH_PROTOCOL *) Offset; } /** Converts a text device path node to text ram disk device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created Text device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextRamDisk ( IN CHAR16 *TextDeviceNode ) { CHAR16 *StartingAddrStr; CHAR16 *EndingAddrStr; CHAR16 *TypeGuidStr; CHAR16 *InstanceStr; MEDIA_RAM_DISK_DEVICE_PATH *RamDisk; UINT64 StartingAddr; UINT64 EndingAddr; StartingAddrStr = GetNextParamStr (&TextDeviceNode); EndingAddrStr = GetNextParamStr (&TextDeviceNode); InstanceStr = GetNextParamStr (&TextDeviceNode); TypeGuidStr = GetNextParamStr (&TextDeviceNode); RamDisk = (MEDIA_RAM_DISK_DEVICE_PATH *) CreateDeviceNode ( MEDIA_DEVICE_PATH, MEDIA_RAM_DISK_DP, (UINT16) sizeof (MEDIA_RAM_DISK_DEVICE_PATH) ); Strtoi64 (StartingAddrStr, &StartingAddr); WriteUnaligned64 ((UINT64 *) &(RamDisk->StartingAddr[0]), StartingAddr); Strtoi64 (EndingAddrStr, &EndingAddr); WriteUnaligned64 ((UINT64 *) &(RamDisk->EndingAddr[0]), EndingAddr); RamDisk->Instance = (UINT16) Strtoi (InstanceStr); StrToGuid (TypeGuidStr, &RamDisk->TypeGuid); return (EFI_DEVICE_PATH_PROTOCOL *) RamDisk; } /** Converts a text device path node to text virtual disk device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created Text device path structure. 
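**/
/*
 * Illustrative sketch (editor's addition, not part of the original source):
 * a RamDisk() node as consumed by DevPathFromTextRamDisk() above.  The
 * starting/ending addresses land in byte arrays with no alignment
 * guarantee, which is why the code stores them with WriteUnaligned64().
 * GUID and addresses are placeholders.
 */
#if 0	/* example only; not compiled into the library */
static ssize_t
example_ramdisk(uint8_t *buf, size_t max)
{
	/* start address, end address, instance, type GUID */
	char txt[] = "RamDisk(0x100000,0x1FFFFF,0,"
	    "12345678-1234-1234-1234-123456789abc)";

	return (efidp_parse_device_path(txt, (efidp)buf, max));
}
#endif
/**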
**/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextVirtualDisk ( IN CHAR16 *TextDeviceNode ) { CHAR16 *StartingAddrStr; CHAR16 *EndingAddrStr; CHAR16 *InstanceStr; MEDIA_RAM_DISK_DEVICE_PATH *RamDisk; UINT64 StartingAddr; UINT64 EndingAddr; StartingAddrStr = GetNextParamStr (&TextDeviceNode); EndingAddrStr = GetNextParamStr (&TextDeviceNode); InstanceStr = GetNextParamStr (&TextDeviceNode); RamDisk = (MEDIA_RAM_DISK_DEVICE_PATH *) CreateDeviceNode ( MEDIA_DEVICE_PATH, MEDIA_RAM_DISK_DP, (UINT16) sizeof (MEDIA_RAM_DISK_DEVICE_PATH) ); Strtoi64 (StartingAddrStr, &StartingAddr); WriteUnaligned64 ((UINT64 *) &(RamDisk->StartingAddr[0]), StartingAddr); Strtoi64 (EndingAddrStr, &EndingAddr); WriteUnaligned64 ((UINT64 *) &(RamDisk->EndingAddr[0]), EndingAddr); RamDisk->Instance = (UINT16) Strtoi (InstanceStr); CopyGuid (&RamDisk->TypeGuid, &gEfiVirtualDiskGuid); return (EFI_DEVICE_PATH_PROTOCOL *) RamDisk; } /** Converts a text device path node to text virtual cd device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created Text device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextVirtualCd ( IN CHAR16 *TextDeviceNode ) { CHAR16 *StartingAddrStr; CHAR16 *EndingAddrStr; CHAR16 *InstanceStr; MEDIA_RAM_DISK_DEVICE_PATH *RamDisk; UINT64 StartingAddr; UINT64 EndingAddr; StartingAddrStr = GetNextParamStr (&TextDeviceNode); EndingAddrStr = GetNextParamStr (&TextDeviceNode); InstanceStr = GetNextParamStr (&TextDeviceNode); RamDisk = (MEDIA_RAM_DISK_DEVICE_PATH *) CreateDeviceNode ( MEDIA_DEVICE_PATH, MEDIA_RAM_DISK_DP, (UINT16) sizeof (MEDIA_RAM_DISK_DEVICE_PATH) ); Strtoi64 (StartingAddrStr, &StartingAddr); WriteUnaligned64 ((UINT64 *) &(RamDisk->StartingAddr[0]), StartingAddr); Strtoi64 (EndingAddrStr, &EndingAddr); WriteUnaligned64 ((UINT64 *) &(RamDisk->EndingAddr[0]), EndingAddr); RamDisk->Instance = (UINT16) Strtoi (InstanceStr); CopyGuid (&RamDisk->TypeGuid, &gEfiVirtualCdGuid); return (EFI_DEVICE_PATH_PROTOCOL *) RamDisk; } /** Converts a text device path node to text persistent virtual disk device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created Text device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextPersistentVirtualDisk ( IN CHAR16 *TextDeviceNode ) { CHAR16 *StartingAddrStr; CHAR16 *EndingAddrStr; CHAR16 *InstanceStr; MEDIA_RAM_DISK_DEVICE_PATH *RamDisk; UINT64 StartingAddr; UINT64 EndingAddr; StartingAddrStr = GetNextParamStr (&TextDeviceNode); EndingAddrStr = GetNextParamStr (&TextDeviceNode); InstanceStr = GetNextParamStr (&TextDeviceNode); RamDisk = (MEDIA_RAM_DISK_DEVICE_PATH *) CreateDeviceNode ( MEDIA_DEVICE_PATH, MEDIA_RAM_DISK_DP, (UINT16) sizeof (MEDIA_RAM_DISK_DEVICE_PATH) ); Strtoi64 (StartingAddrStr, &StartingAddr); WriteUnaligned64 ((UINT64 *) &(RamDisk->StartingAddr[0]), StartingAddr); Strtoi64 (EndingAddrStr, &EndingAddr); WriteUnaligned64 ((UINT64 *) &(RamDisk->EndingAddr[0]), EndingAddr); RamDisk->Instance = (UINT16) Strtoi (InstanceStr); CopyGuid (&RamDisk->TypeGuid, &gEfiPersistentVirtualDiskGuid); return (EFI_DEVICE_PATH_PROTOCOL *) RamDisk; } /** Converts a text device path node to text persistent virtual cd device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created Text device path structure. 
**/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextPersistentVirtualCd ( IN CHAR16 *TextDeviceNode ) { CHAR16 *StartingAddrStr; CHAR16 *EndingAddrStr; CHAR16 *InstanceStr; MEDIA_RAM_DISK_DEVICE_PATH *RamDisk; UINT64 StartingAddr; UINT64 EndingAddr; StartingAddrStr = GetNextParamStr (&TextDeviceNode); EndingAddrStr = GetNextParamStr (&TextDeviceNode); InstanceStr = GetNextParamStr (&TextDeviceNode); RamDisk = (MEDIA_RAM_DISK_DEVICE_PATH *) CreateDeviceNode ( MEDIA_DEVICE_PATH, MEDIA_RAM_DISK_DP, (UINT16) sizeof (MEDIA_RAM_DISK_DEVICE_PATH) ); Strtoi64 (StartingAddrStr, &StartingAddr); WriteUnaligned64 ((UINT64 *) &(RamDisk->StartingAddr[0]), StartingAddr); Strtoi64 (EndingAddrStr, &EndingAddr); WriteUnaligned64 ((UINT64 *) &(RamDisk->EndingAddr[0]), EndingAddr); RamDisk->Instance = (UINT16) Strtoi (InstanceStr); CopyGuid (&RamDisk->TypeGuid, &gEfiPersistentVirtualCdGuid); return (EFI_DEVICE_PATH_PROTOCOL *) RamDisk; } /** Converts a BBS text device path node to BBS device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to BBS device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextBbsPath ( IN CHAR16 *TextDeviceNode ) { return DevPathFromTextGenericPath (BBS_DEVICE_PATH, TextDeviceNode); } /** Converts a text device path node to BIOS Boot Specification device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created BIOS Boot Specification device path structure. **/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextBBS ( IN CHAR16 *TextDeviceNode ) { CHAR16 *TypeStr; CHAR16 *IdStr; CHAR16 *FlagsStr; CHAR8 *AsciiStr; BBS_BBS_DEVICE_PATH *Bbs; TypeStr = GetNextParamStr (&TextDeviceNode); IdStr = GetNextParamStr (&TextDeviceNode); FlagsStr = GetNextParamStr (&TextDeviceNode); Bbs = (BBS_BBS_DEVICE_PATH *) CreateDeviceNode ( BBS_DEVICE_PATH, BBS_BBS_DP, (UINT16) (sizeof (BBS_BBS_DEVICE_PATH) + StrLen (IdStr)) ); if (StrCmp (TypeStr, "Floppy") == 0) { Bbs->DeviceType = BBS_TYPE_FLOPPY; } else if (StrCmp (TypeStr, "HD") == 0) { Bbs->DeviceType = BBS_TYPE_HARDDRIVE; } else if (StrCmp (TypeStr, "CDROM") == 0) { Bbs->DeviceType = BBS_TYPE_CDROM; } else if (StrCmp (TypeStr, "PCMCIA") == 0) { Bbs->DeviceType = BBS_TYPE_PCMCIA; } else if (StrCmp (TypeStr, "USB") == 0) { Bbs->DeviceType = BBS_TYPE_USB; } else if (StrCmp (TypeStr, "Network") == 0) { Bbs->DeviceType = BBS_TYPE_EMBEDDED_NETWORK; } else { Bbs->DeviceType = (UINT16) Strtoi (TypeStr); } AsciiStr = Bbs->String; StrToAscii (IdStr, &AsciiStr); Bbs->StatusFlag = (UINT16) Strtoi (FlagsStr); return (EFI_DEVICE_PATH_PROTOCOL *) Bbs; } /** Converts a text device path node to SATA device path structure. @param TextDeviceNode The input Text device path node. @return A pointer to the newly-created SATA device path structure. 
**/ static EFI_DEVICE_PATH_PROTOCOL * DevPathFromTextSata ( IN CHAR16 *TextDeviceNode ) { SATA_DEVICE_PATH *Sata; CHAR16 *Param1; CHAR16 *Param2; CHAR16 *Param3; Param1 = GetNextParamStr (&TextDeviceNode); Param2 = GetNextParamStr (&TextDeviceNode); Param3 = GetNextParamStr (&TextDeviceNode); Sata = (SATA_DEVICE_PATH *) CreateDeviceNode ( MESSAGING_DEVICE_PATH, MSG_SATA_DP, (UINT16) sizeof (SATA_DEVICE_PATH) ); Sata->HBAPortNumber = (UINT16) Strtoi (Param1); Sata->PortMultiplierPortNumber = (UINT16) Strtoi (Param2); Sata->Lun = (UINT16) Strtoi (Param3); return (EFI_DEVICE_PATH_PROTOCOL *) Sata; } GLOBAL_REMOVE_IF_UNREFERENCED DEVICE_PATH_FROM_TEXT_TABLE mUefiDevicePathLibDevPathFromTextTable[] = { {"Path", DevPathFromTextPath }, {"HardwarePath", DevPathFromTextHardwarePath }, {"Pci", DevPathFromTextPci }, {"PcCard", DevPathFromTextPcCard }, {"MemoryMapped", DevPathFromTextMemoryMapped }, {"VenHw", DevPathFromTextVenHw }, {"Ctrl", DevPathFromTextCtrl }, {"BMC", DevPathFromTextBmc }, {"AcpiPath", DevPathFromTextAcpiPath }, {"Acpi", DevPathFromTextAcpi }, {"PciRoot", DevPathFromTextPciRoot }, {"PcieRoot", DevPathFromTextPcieRoot }, {"Floppy", DevPathFromTextFloppy }, {"Keyboard", DevPathFromTextKeyboard }, {"Serial", DevPathFromTextSerial }, {"ParallelPort", DevPathFromTextParallelPort }, {"AcpiEx", DevPathFromTextAcpiEx }, {"AcpiExp", DevPathFromTextAcpiExp }, {"AcpiAdr", DevPathFromTextAcpiAdr }, {"Msg", DevPathFromTextMsg }, {"Ata", DevPathFromTextAta }, {"Scsi", DevPathFromTextScsi }, {"Fibre", DevPathFromTextFibre }, {"FibreEx", DevPathFromTextFibreEx }, {"I1394", DevPathFromText1394 }, {"USB", DevPathFromTextUsb }, {"I2O", DevPathFromTextI2O }, {"Infiniband", DevPathFromTextInfiniband }, {"VenMsg", DevPathFromTextVenMsg }, {"VenPcAnsi", DevPathFromTextVenPcAnsi }, {"VenVt100", DevPathFromTextVenVt100 }, {"VenVt100Plus", DevPathFromTextVenVt100Plus }, {"VenUtf8", DevPathFromTextVenUtf8 }, {"UartFlowCtrl", DevPathFromTextUartFlowCtrl }, {"SAS", DevPathFromTextSAS }, {"SasEx", DevPathFromTextSasEx }, {"NVMe", DevPathFromTextNVMe }, {"UFS", DevPathFromTextUfs }, {"SD", DevPathFromTextSd }, {"eMMC", DevPathFromTextEmmc }, {"DebugPort", DevPathFromTextDebugPort }, {"MAC", DevPathFromTextMAC }, {"IPv4", DevPathFromTextIPv4 }, {"IPv6", DevPathFromTextIPv6 }, {"Uart", DevPathFromTextUart }, {"UsbClass", DevPathFromTextUsbClass }, {"UsbAudio", DevPathFromTextUsbAudio }, {"UsbCDCControl", DevPathFromTextUsbCDCControl }, {"UsbHID", DevPathFromTextUsbHID }, {"UsbImage", DevPathFromTextUsbImage }, {"UsbPrinter", DevPathFromTextUsbPrinter }, {"UsbMassStorage", DevPathFromTextUsbMassStorage }, {"UsbHub", DevPathFromTextUsbHub }, {"UsbCDCData", DevPathFromTextUsbCDCData }, {"UsbSmartCard", DevPathFromTextUsbSmartCard }, {"UsbVideo", DevPathFromTextUsbVideo }, {"UsbDiagnostic", DevPathFromTextUsbDiagnostic }, {"UsbWireless", DevPathFromTextUsbWireless }, {"UsbDeviceFirmwareUpdate", DevPathFromTextUsbDeviceFirmwareUpdate }, {"UsbIrdaBridge", DevPathFromTextUsbIrdaBridge }, {"UsbTestAndMeasurement", DevPathFromTextUsbTestAndMeasurement }, {"UsbWwid", DevPathFromTextUsbWwid }, {"Unit", DevPathFromTextUnit }, {"iSCSI", DevPathFromTextiSCSI }, {"Vlan", DevPathFromTextVlan }, {"Uri", DevPathFromTextUri }, {"Bluetooth", DevPathFromTextBluetooth }, {"Wi-Fi", DevPathFromTextWiFi }, {"MediaPath", DevPathFromTextMediaPath }, {"HD", DevPathFromTextHD }, {"CDROM", DevPathFromTextCDROM }, {"VenMedia", DevPathFromTextVenMedia }, {"Media", DevPathFromTextMedia }, {"Fv", DevPathFromTextFv }, {"FvFile", 
DevPathFromTextFvFile }, {"File", DevPathFromTextFilePath }, {"Offset", DevPathFromTextRelativeOffsetRange }, {"RamDisk", DevPathFromTextRamDisk }, {"VirtualDisk", DevPathFromTextVirtualDisk }, {"VirtualCD", DevPathFromTextVirtualCd }, {"PersistentVirtualDisk", DevPathFromTextPersistentVirtualDisk }, {"PersistentVirtualCD", DevPathFromTextPersistentVirtualCd }, {"BbsPath", DevPathFromTextBbsPath }, {"BBS", DevPathFromTextBBS }, {"Sata", DevPathFromTextSata }, {NULL, NULL} }; /** Convert text to the binary representation of a device node. @param TextDeviceNode TextDeviceNode points to the text representation of a device node. Conversion starts with the first character and continues until the first non-device node character. @return A pointer to the EFI device node or NULL if TextDeviceNode is NULL or there was insufficient memory or text unsupported. **/ static EFI_DEVICE_PATH_PROTOCOL * EFIAPI UefiDevicePathLibConvertTextToDeviceNode ( IN CONST CHAR16 *TextDeviceNode ) { DEVICE_PATH_FROM_TEXT FromText; CHAR16 *ParamStr; EFI_DEVICE_PATH_PROTOCOL *DeviceNode; CHAR16 *DeviceNodeStr; UINTN Index; if ((TextDeviceNode == NULL) || (IS_NULL (*TextDeviceNode))) { return NULL; } ParamStr = NULL; FromText = NULL; DeviceNodeStr = UefiDevicePathLibStrDuplicate (TextDeviceNode); ASSERT (DeviceNodeStr != NULL); for (Index = 0; mUefiDevicePathLibDevPathFromTextTable[Index].Function != NULL; Index++) { ParamStr = GetParamByNodeName (DeviceNodeStr, mUefiDevicePathLibDevPathFromTextTable[Index].DevicePathNodeText); if (ParamStr != NULL) { FromText = mUefiDevicePathLibDevPathFromTextTable[Index].Function; break; } } if (FromText == NULL) { // // A file path // FromText = DevPathFromTextFilePath; DeviceNode = FromText (DeviceNodeStr); } else { DeviceNode = FromText (ParamStr); FreePool (ParamStr); } FreePool (DeviceNodeStr); return DeviceNode; } /** Convert text to the binary representation of a device path. @param TextDevicePath TextDevicePath points to the text representation of a device path. Conversion starts with the first character and continues until the first non-device node character. @return A pointer to the allocated device path or NULL if TextDevicePath is NULL or there was insufficient memory.
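**/
/*
 * Illustrative sketch (editor's addition, not part of the original source):
 * a full multi-node path as handled by
 * UefiDevicePathLibConvertTextToDeviceNode() above.  Each '/'-separated
 * node is matched by name against mUefiDevicePathLibDevPathFromTextTable;
 * anything unrecognized (here the trailing \EFI\... component) falls back
 * to DevPathFromTextFilePath().  The GUID is a placeholder.
 */
#if 0	/* example only; not compiled into the library */
static ssize_t
example_full_path(uint8_t *buf, size_t max)
{
	char txt[] = "PciRoot(0x0)/Pci(0x2,0x0)/HD(1,GPT,"
	    "12345678-1234-1234-1234-123456789abc,0x800,0x100000)"
	    "/\\EFI\\BOOT\\BOOTX64.EFI";

	return (efidp_parse_device_path(txt, (efidp)buf, max));
}
#endif
/**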
**/ static EFI_DEVICE_PATH_PROTOCOL * EFIAPI UefiDevicePathLibConvertTextToDevicePath ( IN CONST CHAR16 *TextDevicePath ) { EFI_DEVICE_PATH_PROTOCOL *DeviceNode; EFI_DEVICE_PATH_PROTOCOL *NewDevicePath; CHAR16 *DevicePathStr; CHAR16 *Str; CHAR16 *DeviceNodeStr; BOOLEAN IsInstanceEnd; EFI_DEVICE_PATH_PROTOCOL *DevicePath; if ((TextDevicePath == NULL) || (IS_NULL (*TextDevicePath))) { return NULL; } DevicePath = (EFI_DEVICE_PATH_PROTOCOL *) AllocatePool (END_DEVICE_PATH_LENGTH); ASSERT (DevicePath != NULL); SetDevicePathEndNode (DevicePath); DevicePathStr = UefiDevicePathLibStrDuplicate (TextDevicePath); Str = DevicePathStr; while ((DeviceNodeStr = GetNextDeviceNodeStr (&Str, &IsInstanceEnd)) != NULL) { DeviceNode = UefiDevicePathLibConvertTextToDeviceNode (DeviceNodeStr); NewDevicePath = AppendDevicePathNode (DevicePath, DeviceNode); FreePool (DevicePath); FreePool (DeviceNode); DevicePath = NewDevicePath; if (IsInstanceEnd) { DeviceNode = (EFI_DEVICE_PATH_PROTOCOL *) AllocatePool (END_DEVICE_PATH_LENGTH); ASSERT (DeviceNode != NULL); SetDevicePathEndNode (DeviceNode); // Fix from https://bugzilla.tianocore.org/show_bug.cgi?id=419 DeviceNode->SubType = END_INSTANCE_DEVICE_PATH_SUBTYPE; NewDevicePath = AppendDevicePathNode (DevicePath, DeviceNode); FreePool (DevicePath); FreePool (DeviceNode); DevicePath = NewDevicePath; } } FreePool (DevicePathStr); return DevicePath; } ssize_t efidp_parse_device_path(char *path, efidp out, size_t max) { EFI_DEVICE_PATH_PROTOCOL *dp; UINTN len; dp = UefiDevicePathLibConvertTextToDevicePath (path); if (dp == NULL) return -1; len = GetDevicePathSize(dp); if (len > max) { free(dp); return -1; } memcpy(out, dp, len); free(dp); return len; } Index: head/lib/libefivar/efivar-dp-xlate.c =================================================================== --- head/lib/libefivar/efivar-dp-xlate.c (revision 343754) +++ head/lib/libefivar/efivar-dp-xlate.c (revision 343755) @@ -1,721 +1,720 @@ /*- * Copyright (c) 2017 Netflix, Inc. - * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #undef MAX #undef MIN #include #include #include #include #include #include #include #include "efichar.h" #include "efi-osdep.h" #include "efivar-dp.h" #include "uefi-dplib.h" #define MAX_DP_SANITY 4096 /* Biggest device path in bytes */ #define MAX_DP_TEXT_LEN 4096 /* Longest string rep of dp */ #define G_PART "PART" #define G_LABEL "LABEL" #define G_DISK "DISK" static const char * geom_pp_attr(struct gmesh *mesh, struct gprovider *pp, const char *attr) { struct gconfig *conf; LIST_FOREACH(conf, &pp->lg_config, lg_config) { if (strcmp(conf->lg_name, attr) != 0) continue; return (conf->lg_val); } return (NULL); } static struct gprovider * find_provider_by_efimedia(struct gmesh *mesh, const char *efimedia) { struct gclass *classp; struct ggeom *gp; struct gprovider *pp; const char *val; /* * Find the partition class so we can search it... */ LIST_FOREACH(classp, &mesh->lg_class, lg_class) { if (strcasecmp(classp->lg_name, G_PART) == 0) break; } if (classp == NULL) return (NULL); /* * Each geom will have a number of providers; search each * one of them for the efimedia that matches. */ /* XXX just used gpart class since I know it's the only one, but maybe I should search all classes */ LIST_FOREACH(gp, &classp->lg_geom, lg_geom) { LIST_FOREACH(pp, &gp->lg_provider, lg_provider) { val = geom_pp_attr(mesh, pp, "efimedia"); if (val == NULL) continue; if (strcasecmp(efimedia, val) == 0) return (pp); } } return (NULL); } static struct gprovider * find_provider_by_name(struct gmesh *mesh, const char *name) { struct gclass *classp; struct ggeom *gp; struct gprovider *pp; LIST_FOREACH(classp, &mesh->lg_class, lg_class) { LIST_FOREACH(gp, &classp->lg_geom, lg_geom) { LIST_FOREACH(pp, &gp->lg_provider, lg_provider) { if (strcmp(pp->lg_name, name) == 0) return (pp); } } } return (NULL); } static int efi_hd_to_unix(struct gmesh *mesh, const_efidp dp, char **dev, char **relpath, char **abspath) { int rv = 0, n, i; const_efidp media, file, walker; size_t len, mntlen; char buf[MAX_DP_TEXT_LEN]; char *pwalk; struct gprovider *pp, *provider; struct gconsumer *cp; struct statfs *mnt; walker = media = dp; /* * Now, we can either have a filepath node next, or the end. * Otherwise, it's an error. */ walker = (const_efidp)NextDevicePathNode(walker); if ((uintptr_t)walker - (uintptr_t)dp > MAX_DP_SANITY) return (EINVAL); if (DevicePathType(walker) == MEDIA_DEVICE_PATH && DevicePathSubType(walker) == MEDIA_FILEPATH_DP) file = walker; else if (DevicePathType(walker) == END_DEVICE_PATH_TYPE) file = NULL; else return (EINVAL); /* * Format this node. We're going to look for it as an efimedia * attribute of some geom node. Once we find that node, we use it * as the device it comes from, at least provisionally. */ len = efidp_format_device_path_node(buf, sizeof(buf), media); if (len > sizeof(buf)) return (EINVAL); pp = find_provider_by_efimedia(mesh, buf); if (pp == NULL) { rv = ENOENT; goto errout; } *dev = strdup(pp->lg_name); if (*dev == NULL) { rv = ENOMEM; goto errout; } /* * No file specified, just return the device. Don't even look * for a mountpoint. XXX Sane? */ if (file == NULL) goto errout; /* * Now extract the relative path. The next node in the device path should * be a filesystem node. If not, we have issues.
*/ *relpath = efidp_extract_file_path(file); if (*relpath == NULL) { rv = ENOMEM; goto errout; } for (pwalk = *relpath; *pwalk; pwalk++) if (*pwalk == '\\') *pwalk = '/'; /* * To find the absolute path, we have to look for where we're mounted. * We only look a little hard, since looking too hard can come up with * false positives (imagine a graid, one of whose devices is *dev). */ n = getfsstat(NULL, 0, MNT_NOWAIT); if (n < 0) { rv = errno; goto errout; } n++; mntlen = sizeof(struct statfs) * n; mnt = malloc(mntlen); n = getfsstat(mnt, mntlen, MNT_NOWAIT); if (n < 0) { rv = errno; goto errout; } provider = pp; for (i = 0; i < n; i++) { /* * Skip all pseudo filesystems. This also skips the real filesystem * of ZFS. There's no EFI designator for ZFS in the standard, so * we'll need to invent one, but its decoding will be handled in * a separate function. */ if (mnt[i].f_mntfromname[0] != '/') continue; /* * First see if it is directly attached */ if (strcmp(provider->lg_name, mnt[i].f_mntfromname + 5) == 0) break; /* * Next see if it is attached via one of the physical disk's * labels. */ LIST_FOREACH(cp, &provider->lg_consumers, lg_consumer) { pp = cp->lg_provider; if (strcmp(pp->lg_geom->lg_class->lg_name, G_LABEL) != 0) continue; if (strcmp(g_device_path(pp->lg_name), mnt[i].f_mntfromname) == 0) goto break2; } /* Not the one, try the next mount point */ } break2: /* * No mountpoint found, no absolute path possible */ if (i >= n) goto errout; /* * Construct absolute path and we're finally done. */ if (strcmp(mnt[i].f_mntonname, "/") == 0) asprintf(abspath, "/%s", *relpath); else asprintf(abspath, "%s/%s", mnt[i].f_mntonname, *relpath); errout: if (rv != 0) { free(*dev); *dev = NULL; free(*relpath); *relpath = NULL; } return (rv); } /* * Translate the passed in device_path to a unix path via the following * algorithm. * * If dp, dev or path is NULL, return EDOOFUS. XXX wise? * * Set *path = NULL; *dev = NULL; * * Walk through the device_path until we find a media device path. * Return EINVAL if not found. Return EINVAL if walking dp would * land us more than sanity size away from the start (4k). * * If we find a media descriptor, we search through the geom mesh to see if we * can find a matching node. If no match is found in the mesh, * return ENXIO. * * Once we find a matching node, we search to see if there is a filesystem * mounted on it. If we find nothing, then search each of the devices that are * mounted to see if we can work up the geom tree to find the matching node. If * we still can't find anything, *dev = sprintf("/dev/%s", provider_name * of the original node we found), but return ENOTBLK. * * Record the dev of the mountpoint in *dev. * * Once we find something, check to see if the next node in the device path is * the end of list. If so, return the mountpoint. * * If the next node isn't a File path node, return EFTYPE. * * Extract the path from the File path node(s). Translate any \ file separators * to /. Append the result to the mount point. Copy the resulting path into * *path. Stat that path. If it is not found, return the error from stat. * * Finally, check to make sure the resulting path is still on the same * device. If not, return ENODEV. * * Otherwise return 0. * * The dev or full path that's returned is malloced, so needs to be freed when * the caller is done with it. Unlike many other functions, we can return data * with an error code, so pay attention.
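*/
/*
 * Illustrative sketch (editor's addition, not part of the original source):
 * the calling convention the comment above describes, via a hypothetical
 * helper.  Note that, as warned there, output strings may come back
 * together with a non-zero error code, and abspath can legitimately be
 * NULL when no mountpoint was found.
 */
#if 0	/* example only; not compiled into the library */
static void
example_dp_to_unix(const_efidp dp)
{
	char *dev, *relpath, *abspath;

	if (efivar_device_path_to_unix_path(dp, &dev, &relpath,
	    &abspath) == 0) {
		printf("dev=%s rel=%s abs=%s\n", dev,
		    relpath != NULL ? relpath : "(none)",
		    abspath != NULL ? abspath : "(none)");
		free(dev);
		free(relpath);
		free(abspath);
	}
}
#endif
/*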
*/ int efivar_device_path_to_unix_path(const_efidp dp, char **dev, char **relpath, char **abspath) { const_efidp walker; struct gmesh mesh; int rv = 0; /* * Sanity check args, fail early */ if (dp == NULL || dev == NULL || relpath == NULL || abspath == NULL) return (EDOOFUS); *dev = NULL; *relpath = NULL; *abspath = NULL; /* * Find the first media device path we can. If we go too far, * assume the passed in device path is bogus. If we hit the end * then we didn't find a media device path, so signal that error. */ walker = dp; while (DevicePathType(walker) != MEDIA_DEVICE_PATH && DevicePathType(walker) != END_DEVICE_PATH_TYPE) { walker = (const_efidp)NextDevicePathNode(walker); if ((uintptr_t)walker - (uintptr_t)dp > MAX_DP_SANITY) return (EINVAL); } if (DevicePathType(walker) != MEDIA_DEVICE_PATH) return (EINVAL); /* * There are several types of media paths. We're only interested in the * hard disk path, as it's really the only relevant one to booting. The * CD path just might also be relevant, and would be easy to add, but * isn't supported. A file path too is relevant, but at this stage, it's * premature because we're trying to translate a specification for a device * and path on that device into a unix path, or at the very least, a * geom device : path-on-device. * * Also, ZFS throws a bit of a monkey wrench in here since it doesn't have * a device path type (it creates a new virtual device out of one or more * storage devices). * * For all of them, we'll need to know the geoms, so allocate / free the * geom mesh here since it's safer than doing it in each sub-function * which may have many error exits. */ if (geom_gettree(&mesh)) return (ENOMEM); rv = EINVAL; if (DevicePathSubType(walker) == MEDIA_HARDDRIVE_DP) rv = efi_hd_to_unix(&mesh, walker, dev, relpath, abspath); #ifdef notyet else if (is_cdrom_device(walker)) rv = efi_cdrom_to_unix(&mesh, walker, dev, relpath, abspath); else if (is_floppy_device(walker)) rv = efi_floppy_to_unix(&mesh, walker, dev, relpath, abspath); else if (is_zpool_device(walker)) rv = efi_zpool_to_unix(&mesh, walker, dev, relpath, abspath); #endif geom_deletetree(&mesh); return (rv); } /* * Construct the EFI path to a current unix path as follows. * * The path may be of one of three forms: * 1) /path/to/file -- full path to a file. The file need not be present, * but /path/to must be. It must reside on a local filesystem * mounted on a GPT or MBR partition. * 2) //path/to/file -- Shorthand for 'On the EFI partition, \path\to\file' * where 'The EFI Partition' is a partition whose type is 'efi' * on the same disk that / is mounted from. If there are multiple * or no 'efi' partitions on that disk, or / isn't on a disk that * we can trace back to a physical device, an error will result * 3) [/dev/]geom-name:/path/to/file -- Use the specified partition * (and it must be a GPT or MBR partition) with the specified * path. The latter is not authenticated. * All path forms translate any \ characters to / before further processing. * When a file path node is created, all / characters are translated back * to \. * * For paths of the first form: * find where the filesystem is mounted (either the file directly, or * its parent directory). * translate any logical device name (e.g. a label) to a physical one * If not possible, return ENXIO * If the physical path is unsupported (e.g. not on a GPT or MBR disk), * return ENXIO * Create a media device path node. * append the relative path from the mountpoint to the media device node * as a file path.
* * For paths matching the second form: * find the EFI partition corresponding to the root filesystem. * If none found, return ENXIO * Create a media device path node for the found partition * Append a File Path to the end for the rest of the file. * * For paths of the third form: * Translate the geom-name passed in into a physical partition * name. * Return ENXIO if the translation fails * Make a media device path for it * append the part after the : as a File path node. */ static char * path_to_file_dp(const char *relpath) { char *rv; asprintf(&rv, "File(%s)", relpath); return rv; } static char * find_geom_efi_on_root(struct gmesh *mesh) { struct statfs buf; const char *dev; struct gprovider *pp; // struct ggeom *disk; struct gconsumer *cp; /* * Find /'s geom. Assume it's mounted on /dev/ and filter out all the * filesystems that aren't. */ if (statfs("/", &buf) != 0) return (NULL); dev = buf.f_mntfromname; if (*dev != '/' || strncmp(dev, _PATH_DEV, sizeof(_PATH_DEV) - 1) != 0) return (NULL); dev += sizeof(_PATH_DEV) -1; pp = find_provider_by_name(mesh, dev); if (pp == NULL) return (NULL); /* * If the provider is a LABEL, find its outer PART class, if any. We * only operate on partitions. */ if (strcmp(pp->lg_geom->lg_class->lg_name, G_LABEL) == 0) { LIST_FOREACH(cp, &pp->lg_consumers, lg_consumer) { if (strcmp(cp->lg_provider->lg_geom->lg_class->lg_name, G_PART) == 0) { pp = cp->lg_provider; break; } } } if (strcmp(pp->lg_geom->lg_class->lg_name, G_PART) != 0) return (NULL); #if 0 /* This doesn't work because we can't get the data to walk UP the tree it seems */ /* * Now that we've found the PART that we have mounted as root, find the * first efi typed partition that's a peer, if any. */ LIST_FOREACH(cp, &pp->lg_consumers, lg_consumer) { if (strcmp(cp->lg_provider->lg_geom->lg_class->lg_name, G_DISK) == 0) { disk = cp->lg_provider->lg_geom; break; } } if (disk == NULL) /* This is very bad -- old nested partitions -- no support? */ return (NULL); #endif #if 0 /* This doesn't work because we can't get the data to walk UP the tree it seems */ /* * With the disk provider, we can look for its consumers to see if any are the proper type. */ LIST_FOREACH(pp, &disk->lg_consumer, lg_consumer) { type = geom_pp_attr(mesh, pp, "type"); if (type == NULL) continue; if (strcmp(type, "efi") != 0) continue; efimedia = geom_pp_attr(mesh, pp, "efimedia"); if (efimedia == NULL) return (NULL); return strdup(efimedia); } #endif return (NULL); } static char * find_geom_efimedia(struct gmesh *mesh, const char *dev) { struct gprovider *pp; const char *efimedia; pp = find_provider_by_name(mesh, dev); if (pp == NULL) return (NULL); efimedia = geom_pp_attr(mesh, pp, "efimedia"); if (efimedia == NULL) return (NULL); return strdup(efimedia); } static int build_dp(const char *efimedia, const char *relpath, efidp *dp) { char *fp, *dptxt = NULL, *cp, *rp; int rv = 0; efidp out = NULL; size_t len; rp = strdup(relpath); for (cp = rp; *cp; cp++) if (*cp == '/') *cp = '\\'; fp = path_to_file_dp(rp); free(rp); if (fp == NULL) { rv = ENOMEM; goto errout; } asprintf(&dptxt, "%s/%s", efimedia, fp); out = malloc(8192); len = efidp_parse_device_path(dptxt, out, 8192); if (len > 8192) { rv = ENOMEM; goto errout; } if (len == 0) { rv = EINVAL; goto errout; } *dp = out; errout: if (rv) { free(out); } free(dptxt); free(fp); return rv; } /* Handles //path/to/file */ /* * Which means: find the disk that has /. Then look for an EFI partition * and use that for the efimedia and /path/to/file as relative to that.
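*/
/*
 * Illustrative sketch (editor's addition, not part of the original source):
 * driving the public entry point with the first of the three accepted input
 * forms described above, via a hypothetical helper.  The path is a
 * placeholder; on success *dp is malloced and must be freed by the caller.
 */
#if 0	/* example only; not compiled into the library */
static void
example_unix_to_dp(void)
{
	efidp dp = NULL;
	char buf[MAX_DP_TEXT_LEN];

	if (efivar_unix_path_to_device_path("/boot/loader.efi", &dp) == 0) {
		efidp_format_device_path(buf, sizeof(buf), dp,
		    efidp_size(dp));
		printf("%s\n", buf);
		free(dp);
	}
}
#endif
/*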
* Not sure how ZFS will work here since we can't easily make the leap * to the geom from the zpool. */ static int efipart_to_dp(struct gmesh *mesh, char *path, efidp *dp) { char *efimedia = NULL; int rv; efimedia = find_geom_efi_on_root(mesh); #ifdef notyet if (efimedia == NULL) efimedia = find_efi_on_zfsroot(dev); #endif if (efimedia == NULL) { rv = ENOENT; goto errout; } rv = build_dp(efimedia, path + 1, dp); errout: free(efimedia); return rv; } /* Handles [/dev/]geom:[/]path/to/file */ /* Handles zfs-dataset:[/]path/to/file (this may include / ) */ static int dev_path_to_dp(struct gmesh *mesh, char *path, efidp *dp) { char *relpath, *dev, *efimedia = NULL; int rv = 0; relpath = strchr(path, ':'); assert(relpath != NULL); *relpath++ = '\0'; dev = path; if (strncmp(dev, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0) dev += sizeof(_PATH_DEV) -1; efimedia = find_geom_efimedia(mesh, dev); #ifdef notyet if (efimedia == NULL) find_zfs_efi_media(dev); #endif if (efimedia == NULL) { rv = ENOENT; goto errout; } rv = build_dp(efimedia, relpath, dp); errout: free(efimedia); return rv; } /* Handles /path/to/file */ static int path_to_dp(struct gmesh *mesh, char *path, efidp *dp) { struct statfs buf; char *rp = NULL, *ep, *dev, *efimedia = NULL; int rv = 0; rp = realpath(path, NULL); if (rp == NULL) { rv = errno; goto errout; } if (statfs(rp, &buf) != 0) { rv = errno; goto errout; } dev = buf.f_mntfromname; if (strncmp(dev, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0) dev += sizeof(_PATH_DEV) -1; ep = rp + strlen(buf.f_mntonname); efimedia = find_geom_efimedia(mesh, dev); #ifdef notyet if (efimedia == NULL) find_zfs_efi_media(dev); #endif if (efimedia == NULL) { rv = ENOENT; goto errout; } rv = build_dp(efimedia, ep, dp); errout: free(efimedia); free(rp); if (rv != 0) { free(*dp); *dp = NULL; } return (rv); } int efivar_unix_path_to_device_path(const char *path, efidp *dp) { char *modpath = NULL, *cp; int rv = ENOMEM; struct gmesh mesh; /* * Fail early for clearly bogus things */ if (path == NULL || dp == NULL) return (EDOOFUS); /* * We'll need the geom mesh to grovel through it to find the * efimedia attribute for any devices we find. Grab it here * and release it to simplify the error paths out of the * subordinate functions */ if (geom_gettree(&mesh)) return (errno); /* * Convert all \ to /. We'll convert them back again when * we encode the file. Boot loaders are expected to cope. */ modpath = strdup(path); if (modpath == NULL) goto out; for (cp = modpath; *cp; cp++) if (*cp == '\\') *cp = '/'; if (modpath[0] == '/' && modpath[1] == '/') /* Handle //foo/bar/baz */ rv = efipart_to_dp(&mesh, modpath, dp); else if (strchr(modpath, ':')) /* Handle dev:/bar/baz */ rv = dev_path_to_dp(&mesh, modpath, dp); else /* Handle /a/b/c */ rv = path_to_dp(&mesh, modpath, dp); out: geom_deletetree(&mesh); free(modpath); return (rv); } Index: head/lib/libefivar/efivar-dp.h =================================================================== --- head/lib/libefivar/efivar-dp.h (revision 343754) +++ head/lib/libefivar/efivar-dp.h (revision 343755) @@ -1,72 +1,71 @@ /*- * Copyright (c) 2017 Netflix, Inc. - * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2.
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _EFIVAR_DP_H_ #define _EFIVAR_DP_H_ /* * "Linux compatible" efivar-dp.h header. At the moment, it's really a * very thin, minimal interface. */ /* * Generic EFI_DEVICE_PATH, spelled the Linux way. We use this * interface to the outside world and type-pun to the EFI EDK2 code * we use to implement it. */ typedef struct { uint8_t type; uint8_t subtype; uint16_t length; } __packed efidp_header; /* NB: Linux has shadow types for all dp type */ typedef union { efidp_header header; } efidp_data; typedef efidp_data *efidp; typedef const efidp_data *const_efidp; /** format a device path into UEFI standard conforming output. * * NB: FreeBSD's implementation is taken from EDK2, while Linux's * was hand-rolled. There may be differences as a result. */ ssize_t efidp_format_device_path(char *buf, size_t len, const_efidp dp, ssize_t max); ssize_t efidp_format_device_path_node(char *buf, size_t len, const_efidp dp); ssize_t efidp_parse_device_path(char *path, efidp out, size_t max); char * efidp_extract_file_path(const_efidp dp); size_t efidp_size(const_efidp); int efivar_device_path_to_unix_path(const_efidp dp, char **dev, char **relpath, char **abspath); int efivar_unix_path_to_device_path(const char *path, efidp *dp); #endif /* _EFIVAR_DP_H_ */ Index: head/lib/libefivar/efivar.3 =================================================================== --- head/lib/libefivar/efivar.3 (revision 343754) +++ head/lib/libefivar/efivar.3 (revision 343755) @@ -1,122 +1,123 @@ +.\" .\" Copyright 2016 Netflix, Inc. .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" $FreeBSD$ .\" .Dd October 25, 2018 .Dt LIBEFIVAR 3 .Os .Sh NAME .Nm libefivar .Nd EFI Non Volatile Variable Support .Sh SYNOPSIS .In efivar.h .Ft int .Fn efi_append_variable "efi_guid_t guid" "const char *name" "void *data" "size_t data_size" "uint32_t attributes" .Ft int .Fn efi_del_variable "efi_guid_t guid" "const char *name" .Ft int .Fn efi_get_variable "efi_guid_t guid" "const char *name" "void **data" "ssize_t *data_size" "uint32_t *attributes" .Ft int .Fn efi_get_variable_attributes "efi_guid_t guid" "const char *name" "uint32_t *attributes" .Ft int .Fn efi_get_variable_size "efi_guid_t guid" "const char *name" "size_t *size" .Ft int .Fn efi_get_next_variable_name "efi_guid_t **guid" "char **name" .Ft int .Fn efi_guid_to_name "efi_guid_t *guid" "char **name" .Ft int .Fn efi_guid_to_symbol "efi_guid_t *guid" "char **symbol" .Ft int .Fn efi_guid_to_str "const efi_guid_t *guid" "char **sp" .Ft int .Fn efi_name_to_guid "const char *name" "efi_guid_t *guid" .Ft int .Fn efi_set_variable "efi_guid_t guid" "const char *name" "void *data" "size_t data_size" "uint32_t attributes" .Ft int .Fn efi_str_to_guid "const char *s" "efi_guid_t *guid" .Ft int .Fn efi_variables_supported "void" .Sh DESCRIPTION The .Nm library implements access to EFI Variables via the EFI Runtime Services. All .Vt "char *" strings are converted to 16-bit UTF strings before passing them to EFI. .Pp .Fn efi_variables_supported returns non-zero if the current machine supports setting of EFI firmware variables and the kernel support for doing so is present. Otherwise zero is returned. .Pp .Fn efi_del_variable deletes the EFI variable selected by .Va guid and .Va name . .Pp The following functions have not been implemented yet: .Bl -dash -offset indent -compact .It .Fn efi_append_variable .It .Fn efi_get_next_variable_name .It .Fn efi_get_variable .It .Fn efi_get_variable_attributes .It .Fn efi_get_variable_size .It .Fn efi_guid_to_name .It .Fn efi_guid_to_str .It .Fn efi_guid_to_symbol .It .Fn efi_name_to_guid .It .Fn efi_set_variable .It .Fn efi_str_to_guid .El .Sh SEE ALSO .Xr efidev 4 .Sh HISTORY The .Nm library first appeared in .Fx 12.0 . .Sh AUTHORS .An -nosplit This software was originally written by .An Warner Losh . .Sh BUGS No facilities exist to process the strings as native UTF. This is a limitation in the Linux .Nm library interface. Index: head/lib/libefivar/efivar.c =================================================================== --- head/lib/libefivar/efivar.c (revision 343754) +++ head/lib/libefivar/efivar.c (revision 343755) @@ -1,400 +1,399 @@ /*- * Copyright (c) 2016 Netflix, Inc. - * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include "efichar.h" static int efi_fd = -2; #define Z { 0, 0, 0, 0, 0, { 0 } } const efi_guid_t efi_guid_empty = Z; static struct uuid_table guid_tbl [] = { { "00000000-0000-0000-0000-000000000000", "zero", Z }, { "093e0fae-a6c4-4f50-9f1b-d41e2b89c19a", "sha512", Z }, { "0abba7dc-e516-4167-bbf5-4d9d1c739416", "redhat", Z }, { "0b6e5233-a65c-44c9-9407-d9ab83bfc8bd", "sha224", Z }, { "126a762d-5758-4fca-8531-201a7f57f850", "lenovo_boot_menu", Z }, { "3bd2a492-96c0-4079-b420-fcf98ef103ed", "x509_sha256", Z }, { "3c5766e8-269c-4e34-aa14-ed776e85b3b6", "rsa2048", Z }, { "3CC24E96-22C7-41D8-8863-8E39DCDCC2CF", "lenovo", Z }, { "3f7e615b-0d45-4f80-88dc-26b234958560", "lenovo_diag", Z }, { "446dbf63-2502-4cda-bcfa-2465d2b0fe9d", "x509_sha512", Z }, { "4aafd29d-68df-49ee-8aa9-347d375665a7", "pkcs7_cert", Z }, { "605dab50-e046-4300-abb6-3dd810dd8b23", "shim", Z }, { "665d3f60-ad3e-4cad-8e26-db46eee9f1b5", "lenovo_rescue", Z }, { "67f8444f-8743-48f1-a328-1eaab8736080", "rsa2048_sha1", Z }, { "7076876e-80c2-4ee6-aad2-28b349a6865b", "x509_sha384", Z }, { "721c8b66-426c-4e86-8e99-3457c46ab0b9", "lenovo_setup", Z }, { "77fa9abd-0359-4d32-bd60-28f4e78f784b", "microsoft", Z }, { "7FACC7B6-127F-4E9C-9C5D-080F98994345", "lenovo_2", Z }, { "826ca512-cf10-4ac9-b187-be01496631bd", "sha1", Z }, { "82988420-7467-4490-9059-feb448dd1963", "lenovo_me_config", Z }, { "8be4df61-93ca-11d2-aa0d-00e098032b8c", "global", Z }, { "a5c059a1-94e4-4aa7-87b5-ab155c2bf072", "x509_cert", Z }, { "a7717414-c616-4977-9420-844712a735bf", "rsa2048_sha256_cert", Z }, { "a7d8d9a6-6ab0-4aeb-ad9d-163e59a7a380", "lenovo_diag_splash", Z }, { "ade9e48f-9cb8-98e6-31af-b4e6009e2fe3", "redhat_2", Z }, { "bc7838d2-0f82-4d60-8316-c068ee79d25b", "lenovo_msg", Z }, { "c1c41626-504c-4092-aca9-41f936934328", "sha256", Z }, { "c57ad6b7-0515-40a8-9d21-551652854e37", "shell", Z }, { "d719b2cb-3d3a-4596-a3bc-dad00e67656f", "security", Z }, { "e2b36190-879b-4a3d-ad8d-f2e7bba32784", "rsa2048_sha256", Z }, { "ff3e5307-9fd0-48c9-85f1-8ad56c701e01", "sha384", Z }, { "f46ee6f4-4785-43a3-923d-7f786c3c8479", "lenovo_startup_interrupt", Z }, { "ffffffff-ffff-ffff-ffff-ffffffffffff", "zzignore-this-guid", Z }, }; #undef Z static void efi_guid_tbl_compile(void) { size_t i; uint32_t status; static int done = 0; if (done) return; for (i = 0; i < nitems(guid_tbl); i++) { uuid_from_string(guid_tbl[i].uuid_str, &guid_tbl[i].guid, &status); /* all f's is a bad version, so ignore that error */ if (status != uuid_s_ok 
&& status != uuid_s_bad_version) fprintf(stderr, "Can't convert %s to a uuid for %s: %d\n", guid_tbl[i].uuid_str, guid_tbl[i].name, (int)status); } done = 1; } int efi_known_guid(struct uuid_table **tbl) { *tbl = guid_tbl; return (nitems(guid_tbl)); } static int efi_open_dev(void) { if (efi_fd == -2) efi_fd = open("/dev/efi", O_RDWR); if (efi_fd < 0) efi_fd = -1; else efi_guid_tbl_compile(); return (efi_fd); } static void efi_var_reset(struct efi_var_ioc *var) { var->name = NULL; var->namesize = 0; memset(&var->vendor, 0, sizeof(var->vendor)); var->attrib = 0; var->data = NULL; var->datasize = 0; } static int rv_to_linux_rv(int rv) { if (rv == 0) rv = 1; else rv = -errno; return (rv); } int efi_append_variable(efi_guid_t guid, const char *name, uint8_t *data, size_t data_size, uint32_t attributes) { return efi_set_variable(guid, name, data, data_size, attributes | EFI_VARIABLE_APPEND_WRITE); } int efi_del_variable(efi_guid_t guid, const char *name) { /* data_size of 0 deletes the variable */ return efi_set_variable(guid, name, NULL, 0, 0); } int efi_get_variable(efi_guid_t guid, const char *name, uint8_t **data, size_t *data_size, uint32_t *attributes) { struct efi_var_ioc var; int rv; static uint8_t buf[1024*32]; if (efi_open_dev() == -1) return -1; efi_var_reset(&var); rv = utf8_to_ucs2(name, &var.name, &var.namesize); if (rv != 0) goto errout; var.vendor = guid; var.data = buf; var.datasize = sizeof(buf); rv = ioctl(efi_fd, EFIIOC_VAR_GET, &var); if (data_size != NULL) *data_size = var.datasize; if (data != NULL) *data = buf; if (attributes != NULL) *attributes = var.attrib; errout: free(var.name); return rv_to_linux_rv(rv); } int efi_get_variable_attributes(efi_guid_t guid, const char *name, uint32_t *attributes) { /* Make sure this construct works -- I think it will fail */ return efi_get_variable(guid, name, NULL, NULL, attributes); } int efi_get_variable_size(efi_guid_t guid, const char *name, size_t *size) { /* XXX check to make sure this matches the linux value */ *size = 0; return efi_get_variable(guid, name, NULL, size, NULL); } int efi_get_next_variable_name(efi_guid_t **guid, char **name) { struct efi_var_ioc var; int rv; static efi_char *buf; static size_t buflen = 256 * sizeof(efi_char); static efi_guid_t retguid; size_t size; if (efi_open_dev() == -1) return -1; /* * Always allocate enough for an extra NUL on the end, but don't tell * the IOCTL about it so we can NUL terminate the name before converting * it to UTF8. */ if (buf == NULL) buf = malloc(buflen + sizeof(efi_char)); again: efi_var_reset(&var); var.name = buf; var.namesize = buflen; if (*name == NULL) { *buf = 0; /* GUID zeroed in var_reset */ } else { rv = utf8_to_ucs2(*name, &var.name, &size); if (rv != 0) goto errout; var.vendor = **guid; } rv = ioctl(efi_fd, EFIIOC_VAR_NEXT, &var); if (rv == 0 && var.name == NULL) { /* * Variable name not long enough, so allocate more space for the * name and try again. As above, mind the NUL we add. */ void *new = realloc(buf, var.namesize + sizeof(efi_char)); if (new == NULL) { rv = -1; errno = ENOMEM; goto done; } buflen = var.namesize; buf = new; goto again; } if (rv == 0) { free(*name); /* Free last name, to avoid leaking */ *name = NULL; /* Force ucs2_to_utf8 to malloc new space */ var.name[var.namesize / sizeof(efi_char)] = 0; /* EFI doesn't NUL terminate */ rv = ucs2_to_utf8(var.name, name); if (rv != 0) goto errout; retguid = var.vendor; *guid = &retguid; } errout: /* XXX The linux interface expects name to be a static buffer -- fix or leak memory? 
*/ /* XXX for the moment, we free just before we'd leak, but still leak last one */ done: if (rv != 0 && errno == ENOENT) { errno = 0; free(*name); /* Free last name, to avoid leaking */ return 0; } return (rv_to_linux_rv(rv)); } int efi_guid_cmp(const efi_guid_t *guid1, const efi_guid_t *guid2) { uint32_t status; return uuid_compare(guid1, guid2, &status); } int efi_guid_is_zero(const efi_guid_t *guid) { uint32_t status; return uuid_is_nil(guid, &status); } int efi_guid_to_name(efi_guid_t *guid, char **name) { size_t i; uint32_t status; efi_guid_tbl_compile(); for (i = 0; i < nitems(guid_tbl); i++) { if (uuid_equal(guid, &guid_tbl[i].guid, &status)) { *name = strdup(guid_tbl[i].name); return (0); } } return (efi_guid_to_str(guid, name)); } int efi_guid_to_symbol(efi_guid_t *guid __unused, char **symbol __unused) { /* * Unsure what this is used for, efibootmgr doesn't use it. * Leave unimplemented for now. */ return -1; } int efi_guid_to_str(const efi_guid_t *guid, char **sp) { uint32_t status; /* knows efi_guid_t is a typedef of uuid_t */ uuid_to_string(guid, sp, &status); return (status == uuid_s_ok ? 0 : -1); } int efi_name_to_guid(const char *name, efi_guid_t *guid) { size_t i; efi_guid_tbl_compile(); for (i = 0; i < nitems(guid_tbl); i++) { if (strcmp(name, guid_tbl[i].name) == 0) { *guid = guid_tbl[i].guid; return (0); } } return (efi_str_to_guid(name, guid)); } int efi_set_variable(efi_guid_t guid, const char *name, uint8_t *data, size_t data_size, uint32_t attributes) { struct efi_var_ioc var; int rv; if (efi_open_dev() == -1) return -1; efi_var_reset(&var); rv = utf8_to_ucs2(name, &var.name, &var.namesize); if (rv != 0) goto errout; var.vendor = guid; var.data = data; var.datasize = data_size; var.attrib = attributes; rv = ioctl(efi_fd, EFIIOC_VAR_SET, &var); errout: free(var.name); return rv; } int efi_str_to_guid(const char *s, efi_guid_t *guid) { uint32_t status; /* knows efi_guid_t is a typedef of uuid_t */ uuid_from_string(s, guid, &status); return (status == uuid_s_ok ? 0 : -1); } int efi_variables_supported(void) { return efi_open_dev() != -1; } Index: head/lib/libefivar/efivar.h =================================================================== --- head/lib/libefivar/efivar.h (revision 343754) +++ head/lib/libefivar/efivar.h (revision 343755) @@ -1,102 +1,101 @@ /*- * Copyright (c) 2016 Netflix, Inc. - * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
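A minimal usage sketch for the efi_* interface implemented above (illustrative, not part of the library: it assumes a UEFI-booted machine with efidev(4) available, and uses "BootCurrent" under the global GUID purely as a well-known example variable):

#include <stdio.h>
#include <efivar.h>

int
main(void)
{
	efi_guid_t global = EFI_GLOBAL_GUID;
	uint8_t *data;
	size_t size;
	uint32_t attrs;

	if (!efi_variables_supported())
		return (1);
	/* Linux-style return convention: > 0 on success, -errno on failure */
	if (efi_get_variable(global, "BootCurrent", &data, &size, &attrs) < 0)
		return (1);
	printf("BootCurrent: %zu bytes, attributes %#x\n", size, attrs);
	/* data points into a static buffer inside the library; do not free it */
	return (0);
}
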
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _EFIVAR_H_ #define _EFIVAR_H_ #include #include #include #include /* Should these be elsewhere? */ #define EFI_VARIABLE_NON_VOLATILE 0x00000001 #define EFI_VARIABLE_BOOTSERVICE_ACCESS 0x00000002 #define EFI_VARIABLE_RUNTIME_ACCESS 0x00000004 #define EFI_VARIABLE_HARDWARE_ERROR_RECORD 0x00000008 #define EFI_VARIABLE_AUTHENTICATED_WRITE_ACCESS 0x00000010 #define EFI_VARIABLE_TIME_BASED_AUTHENTICATED_WRITE_ACCESS \ 0x00000020 #define EFI_VARIABLE_APPEND_WRITE 0x00000040 #if 0 /* todo */ #define EFI_VARIABLE_HAS_AUTH_HEADER #define EFI_VARIABLE_HAS_SIGNATURE #endif #ifndef _EFIVAR_EFI_GUID_T_DEF #define _EFIVAR_EFI_GUID_T_DEF typedef uuid_t efi_guid_t; #endif #if BYTE_ORDER == LITTLE_ENDIAN #define EFI_GUID(a, b, c, d, e0, e1, e2, e3, e4, e5) \ ((efi_guid_t) {(a), (b), (c), (d) >> 8, (d) & 0xff, \ { (e0), (e1), (e2), (e3), (e4), (e5) }}) #else #define EFI_GUID(a, b, c, d, e0, e1, e2, e3, e4, e5) \ ((efi_guid_t) {(a), (b), (c), (d) & 0xff, (d) >> 8, \ { (e0), (e1), (e2), (e3), (e4), (e5) }}) #endif #define EFI_GLOBAL_GUID EFI_GUID(0x8be4df61, 0x93ca, 0x11d2, 0xaa0d, \ 0x00, 0xe0, 0x98, 0x03, 0x2b, 0x8c) int efi_append_variable(efi_guid_t guid, const char *name, uint8_t *data, size_t data_size, uint32_t attributes); int efi_del_variable(efi_guid_t guid, const char *name); int efi_get_variable(efi_guid_t guid, const char *name, uint8_t **data, size_t *data_size, uint32_t *attributes); int efi_get_variable_attributes(efi_guid_t guid, const char *name, uint32_t *attributes); int efi_get_variable_size(efi_guid_t guid, const char *name, size_t *size); int efi_get_next_variable_name(efi_guid_t **guid, char **name); int efi_guid_cmp(const efi_guid_t *guid1, const efi_guid_t *guid2); int efi_guid_is_zero(const efi_guid_t *guid1); int efi_guid_to_name(efi_guid_t *guid, char **name); int efi_guid_to_symbol(efi_guid_t *guid, char **symbol); int efi_guid_to_str(const efi_guid_t *guid, char **sp); int efi_name_to_guid(const char *name, efi_guid_t *guid); int efi_set_variable(efi_guid_t guid, const char *name, uint8_t *data, size_t data_size, uint32_t attributes); int efi_str_to_guid(const char *s, efi_guid_t *guid); int efi_variables_supported(void); /* FreeBSD extensions */ struct uuid_table { const char *uuid_str; const char *name; efi_guid_t guid; }; int efi_known_guid(struct uuid_table **); extern const efi_guid_t efi_guid_empty; #endif /* _EFIVAR_H_ */ Index: head/lib/libefivar/uefi-dplib.h =================================================================== --- head/lib/libefivar/uefi-dplib.h (revision 343754) +++ head/lib/libefivar/uefi-dplib.h (revision 343755) @@ -1,635 +1,634 @@ /*- * Copyright (c) 2017 Netflix, Inc. - * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Taken from MdePkg/Library/UefiDevicePathLib/UefiDevicePathLib.h * hash a11928f3310518ab1c6fd34e8d0fdbb72de9602c 2017-Mar-01 */ /** @file Definition for Device Path library. Copyright (c) 2013 - 2015, Intel Corporation. All rights reserved.
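Before the EDK2-derived definitions continue, a minimal sketch of the device-path half of the interface declared in efivar-dp.h above (illustrative only: it assumes the file lives on a geom-backed filesystem whose partition carries an efimedia attribute, and the 4096-byte buffer is an arbitrary choice for the example):

#include <stdio.h>
#include <stdlib.h>
#include <efivar-dp.h>

static void
print_dp_for(const char *path)
{
	efidp dp = NULL;
	char buf[4096];

	/* returns 0 on success, an errno value on failure */
	if (efivar_unix_path_to_device_path(path, &dp) != 0) {
		fprintf(stderr, "no device path for %s\n", path);
		return;
	}
	if (efidp_format_device_path(buf, sizeof(buf), dp,
	    (ssize_t)efidp_size(dp)) >= 0)
		printf("%s -> %s\n", path, buf);
	free(dp);	/* the library allocates the path with malloc() */
}
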
This program and the accompanying materials are licensed and made available under the terms and conditions of the BSD License which accompanies this distribution. The full text of the license may be found at http://opensource.org/licenses/bsd-license.php THE PROGRAM IS DISTRIBUTED UNDER THE BSD LICENSE ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED. **/ #ifndef _UEFI_DEVICE_PATH_LIB_H_ #define _UEFI_DEVICE_PATH_LIB_H_ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define IS_COMMA(a) ((a) == ',') #define IS_HYPHEN(a) ((a) == '-') #define IS_DOT(a) ((a) == '.') #define IS_LEFT_PARENTH(a) ((a) == '(') #define IS_RIGHT_PARENTH(a) ((a) == ')') #define IS_SLASH(a) ((a) == '/') #define IS_NULL(a) ((a) == '\0') // // Private Data structure // typedef struct { char *Str; UINTN Count; UINTN Capacity; } POOL_PRINT; typedef EFI_DEVICE_PATH_PROTOCOL * (*DEVICE_PATH_FROM_TEXT) ( IN char *Str ); typedef VOID (*DEVICE_PATH_TO_TEXT) ( IN OUT POOL_PRINT *Str, IN VOID *DevicePath, IN BOOLEAN DisplayOnly, IN BOOLEAN AllowShortcuts ); typedef struct { UINT8 Type; UINT8 SubType; DEVICE_PATH_TO_TEXT Function; } DEVICE_PATH_TO_TEXT_TABLE; typedef struct { UINT8 Type; const char *Text; } DEVICE_PATH_TO_TEXT_GENERIC_TABLE; typedef struct { const char *DevicePathNodeText; DEVICE_PATH_FROM_TEXT Function; } DEVICE_PATH_FROM_TEXT_TABLE; typedef struct { BOOLEAN ClassExist; UINT8 Class; BOOLEAN SubClassExist; UINT8 SubClass; } USB_CLASS_TEXT; #define USB_CLASS_AUDIO 1 #define USB_CLASS_CDCCONTROL 2 #define USB_CLASS_HID 3 #define USB_CLASS_IMAGE 6 #define USB_CLASS_PRINTER 7 #define USB_CLASS_MASS_STORAGE 8 #define USB_CLASS_HUB 9 #define USB_CLASS_CDCDATA 10 #define USB_CLASS_SMART_CARD 11 #define USB_CLASS_VIDEO 14 #define USB_CLASS_DIAGNOSTIC 220 #define USB_CLASS_WIRELESS 224 #define USB_CLASS_RESERVE 254 #define USB_SUBCLASS_FW_UPDATE 1 #define USB_SUBCLASS_IRDA_BRIDGE 2 #define USB_SUBCLASS_TEST 3 #define RFC_1700_UDP_PROTOCOL 17 #define RFC_1700_TCP_PROTOCOL 6 #pragma pack(1) typedef struct { EFI_DEVICE_PATH_PROTOCOL Header; EFI_GUID Guid; UINT8 VendorDefinedData[1]; } VENDOR_DEFINED_HARDWARE_DEVICE_PATH; typedef struct { EFI_DEVICE_PATH_PROTOCOL Header; EFI_GUID Guid; UINT8 VendorDefinedData[1]; } VENDOR_DEFINED_MESSAGING_DEVICE_PATH; typedef struct { EFI_DEVICE_PATH_PROTOCOL Header; EFI_GUID Guid; UINT8 VendorDefinedData[1]; } VENDOR_DEFINED_MEDIA_DEVICE_PATH; typedef struct { EFI_DEVICE_PATH_PROTOCOL Header; UINT32 Hid; UINT32 Uid; UINT32 Cid; CHAR8 HidUidCidStr[3]; } ACPI_EXTENDED_HID_DEVICE_PATH_WITH_STR; typedef struct { EFI_DEVICE_PATH_PROTOCOL Header; UINT16 NetworkProtocol; UINT16 LoginOption; UINT64 Lun; UINT16 TargetPortalGroupTag; CHAR8 TargetName[1]; } ISCSI_DEVICE_PATH_WITH_NAME; typedef struct { EFI_DEVICE_PATH_PROTOCOL Header; EFI_GUID Guid; UINT8 VendorDefinedData[1]; } VENDOR_DEVICE_PATH_WITH_DATA; #pragma pack() #ifdef FreeBSD /* Remove these on FreeBSD */ /** Returns the size of a device path in bytes. This function returns the size, in bytes, of the device path data structure specified by DevicePath including the end of device path node. If DevicePath is NULL or invalid, then 0 is returned. @param DevicePath A pointer to a device path data structure. @retval 0 If DevicePath is NULL or invalid. @retval Others The size of a device path in bytes. 
**/ UINTN EFIAPI UefiDevicePathLibGetDevicePathSize ( IN CONST EFI_DEVICE_PATH_PROTOCOL *DevicePath ); /** Creates a new copy of an existing device path. This function allocates space for a new copy of the device path specified by DevicePath. If DevicePath is NULL, then NULL is returned. If the memory is successfully allocated, then the contents of DevicePath are copied to the newly allocated buffer, and a pointer to that buffer is returned. Otherwise, NULL is returned. The memory for the new device path is allocated from EFI boot services memory. It is the responsibility of the caller to free the memory allocated. @param DevicePath A pointer to a device path data structure. @retval NULL DevicePath is NULL or invalid. @retval Others A pointer to the duplicated device path. **/ EFI_DEVICE_PATH_PROTOCOL * EFIAPI UefiDevicePathLibDuplicateDevicePath ( IN CONST EFI_DEVICE_PATH_PROTOCOL *DevicePath ); /** Creates a new device path by appending a second device path to a first device path. This function creates a new device path by appending a copy of SecondDevicePath to a copy of FirstDevicePath in a newly allocated buffer. Only the end-of-device-path device node from SecondDevicePath is retained. The newly created device path is returned. If FirstDevicePath is NULL, then it is ignored, and a duplicate of SecondDevicePath is returned. If SecondDevicePath is NULL, then it is ignored, and a duplicate of FirstDevicePath is returned. If both FirstDevicePath and SecondDevicePath are NULL, then a copy of an end-of-device-path is returned. If there is not enough memory for the newly allocated buffer, then NULL is returned. The memory for the new device path is allocated from EFI boot services memory. It is the responsibility of the caller to free the memory allocated. @param FirstDevicePath A pointer to a device path data structure. @param SecondDevicePath A pointer to a device path data structure. @retval NULL If there is not enough memory for the newly allocated buffer. @retval NULL If FirstDevicePath or SecondDevicePath is invalid. @retval Others A pointer to the new device path if success. Or a copy an end-of-device-path if both FirstDevicePath and SecondDevicePath are NULL. **/ EFI_DEVICE_PATH_PROTOCOL * EFIAPI UefiDevicePathLibAppendDevicePath ( IN CONST EFI_DEVICE_PATH_PROTOCOL *FirstDevicePath, OPTIONAL IN CONST EFI_DEVICE_PATH_PROTOCOL *SecondDevicePath OPTIONAL ); /** Creates a new path by appending the device node to the device path. This function creates a new device path by appending a copy of the device node specified by DevicePathNode to a copy of the device path specified by DevicePath in an allocated buffer. The end-of-device-path device node is moved after the end of the appended device node. If DevicePathNode is NULL then a copy of DevicePath is returned. If DevicePath is NULL then a copy of DevicePathNode, followed by an end-of-device path device node is returned. If both DevicePathNode and DevicePath are NULL then a copy of an end-of-device-path device node is returned. If there is not enough memory to allocate space for the new device path, then NULL is returned. The memory is allocated from EFI boot services memory. It is the responsibility of the caller to free the memory allocated. @param DevicePath A pointer to a device path data structure. @param DevicePathNode A pointer to a single device path node. @retval NULL If there is not enough memory for the new device path. @retval Others A pointer to the new device path if success. 
A copy of DevicePathNode followed by an end-of-device-path node if both FirstDevicePath and SecondDevicePath are NULL. A copy of an end-of-device-path node if both FirstDevicePath and SecondDevicePath are NULL. **/ EFI_DEVICE_PATH_PROTOCOL * EFIAPI UefiDevicePathLibAppendDevicePathNode ( IN CONST EFI_DEVICE_PATH_PROTOCOL *DevicePath, OPTIONAL IN CONST EFI_DEVICE_PATH_PROTOCOL *DevicePathNode OPTIONAL ); /** Creates a new device path by appending the specified device path instance to the specified device path. This function creates a new device path by appending a copy of the device path instance specified by DevicePathInstance to a copy of the device path specified by DevicePath in a allocated buffer. The end-of-device-path device node is moved after the end of the appended device path instance and a new end-of-device-path-instance node is inserted between. If DevicePath is NULL, then a copy if DevicePathInstance is returned. If DevicePathInstance is NULL, then NULL is returned. If DevicePath or DevicePathInstance is invalid, then NULL is returned. If there is not enough memory to allocate space for the new device path, then NULL is returned. The memory is allocated from EFI boot services memory. It is the responsibility of the caller to free the memory allocated. @param DevicePath A pointer to a device path data structure. @param DevicePathInstance A pointer to a device path instance. @return A pointer to the new device path. **/ EFI_DEVICE_PATH_PROTOCOL * EFIAPI UefiDevicePathLibAppendDevicePathInstance ( IN CONST EFI_DEVICE_PATH_PROTOCOL *DevicePath, OPTIONAL IN CONST EFI_DEVICE_PATH_PROTOCOL *DevicePathInstance OPTIONAL ); /** Creates a copy of the current device path instance and returns a pointer to the next device path instance. This function creates a copy of the current device path instance. It also updates DevicePath to point to the next device path instance in the device path (or NULL if no more) and updates Size to hold the size of the device path instance copy. If DevicePath is NULL, then NULL is returned. If DevicePath points to a invalid device path, then NULL is returned. If there is not enough memory to allocate space for the new device path, then NULL is returned. The memory is allocated from EFI boot services memory. It is the responsibility of the caller to free the memory allocated. If Size is NULL, then ASSERT(). @param DevicePath On input, this holds the pointer to the current device path instance. On output, this holds the pointer to the next device path instance or NULL if there are no more device path instances in the device path pointer to a device path data structure. @param Size On output, this holds the size of the device path instance, in bytes or zero, if DevicePath is NULL. @return A pointer to the current device path instance. **/ EFI_DEVICE_PATH_PROTOCOL * EFIAPI UefiDevicePathLibGetNextDevicePathInstance ( IN OUT EFI_DEVICE_PATH_PROTOCOL **DevicePath, OUT UINTN *Size ); /** Creates a device node. This function creates a new device node in a newly allocated buffer of size NodeLength and initializes the device path node header with NodeType and NodeSubType. The new device path node is returned. If NodeLength is smaller than a device path header, then NULL is returned. If there is not enough memory to allocate space for the new device path, then NULL is returned. The memory is allocated from EFI boot services memory. It is the responsibility of the caller to free the memory allocated. @param NodeType The device node type for the new device node. 
@param NodeSubType The device node sub-type for the new device node. @param NodeLength The length of the new device node. @return The new device path. **/ EFI_DEVICE_PATH_PROTOCOL * EFIAPI UefiDevicePathLibCreateDeviceNode ( IN UINT8 NodeType, IN UINT8 NodeSubType, IN UINT16 NodeLength ); /** Determines if a device path is single or multi-instance. This function returns TRUE if the device path specified by DevicePath is multi-instance. Otherwise, FALSE is returned. If DevicePath is NULL or invalid, then FALSE is returned. @param DevicePath A pointer to a device path data structure. @retval TRUE DevicePath is multi-instance. @retval FALSE DevicePath is not multi-instance, or DevicePath is NULL or invalid. **/ BOOLEAN EFIAPI UefiDevicePathLibIsDevicePathMultiInstance ( IN CONST EFI_DEVICE_PATH_PROTOCOL *DevicePath ); /** Converts a device path to its text representation. @param DevicePath A Pointer to the device to be converted. @param DisplayOnly If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used. @param AllowShortcuts If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable. @return A pointer to the allocated text representation of the device path or NULL if DeviceNode is NULL or there was insufficient memory. **/ CHAR16 * EFIAPI UefiDevicePathLibConvertDevicePathToText ( IN CONST EFI_DEVICE_PATH_PROTOCOL *DevicePath, IN BOOLEAN DisplayOnly, IN BOOLEAN AllowShortcuts ); /** Converts a device node to its string representation. @param DeviceNode A Pointer to the device node to be converted. @param DisplayOnly If DisplayOnly is TRUE, then the shorter text representation of the display node is used, where applicable. If DisplayOnly is FALSE, then the longer text representation of the display node is used. @param AllowShortcuts If AllowShortcuts is TRUE, then the shortcut forms of text representation for a device node can be used, where applicable. @return A pointer to the allocated text representation of the device node or NULL if DeviceNode is NULL or there was insufficient memory. **/ CHAR16 * EFIAPI UefiDevicePathLibConvertDeviceNodeToText ( IN CONST EFI_DEVICE_PATH_PROTOCOL *DeviceNode, IN BOOLEAN DisplayOnly, IN BOOLEAN AllowShortcuts ); /** Convert text to the binary representation of a device node. @param TextDeviceNode TextDeviceNode points to the text representation of a device node. Conversion starts with the first character and continues until the first non-device node character. @return A pointer to the EFI device node or NULL if TextDeviceNode is NULL or there was insufficient memory or text unsupported. **/ EFI_DEVICE_PATH_PROTOCOL * EFIAPI UefiDevicePathLibConvertTextToDeviceNode ( IN CONST CHAR16 *TextDeviceNode ); /** Convert text to the binary representation of a device path. @param TextDevicePath TextDevicePath points to the text representation of a device path. Conversion starts with the first character and continues until the first non-device node character. @return A pointer to the allocated device path or NULL if TextDeviceNode is NULL or there was insufficient memory. **/ EFI_DEVICE_PATH_PROTOCOL * EFIAPI UefiDevicePathLibConvertTextToDevicePath ( IN CONST CHAR16 *TextDevicePath ); #else /* * Small FreeBSD shim layer. Fast and lose hacks to make this code work with FreeBSD. 
*/ #include #define _PCD_GET_MODE_32_PcdMaximumDevicePathNodeCount 1000 #define MAX_UINTN UINTPTR_MAX #define AllocatePool(x) malloc(x) #define AllocateZeroPool(x) calloc(1,x) #define AsciiStrLen(s) strlen(s) #define CopyGuid(dst, src) memcpy(dst, src, sizeof(uuid_t)) #define CopyMem(d, s, l) memcpy(d, s, l) #define FreePool(x) free(x) #define LShiftU64(x, s) ((x) << s) #define ReadUnaligned64(x) le64dec(x) #define ReallocatePool(old, new, ptr) realloc(ptr, new) /* * Quirky StrCmp returns 0 if equal, 1 if not. This is what the code * expects, though that expectation is likely a bug (it casts the * return value). EDK2's StrCmp returns values just like C's strcmp, * but the parse code casts this to a UINTN, which is bogus. This * definition papers over that bogusness to do the right thing. If * iSCSI protocol string processing is ever fixed, we can remove this * bletcherous kludge. */ #define StrCmp(a, b) (strcmp(a, b) != 0) #define StrCpyS(d, l, s) strcpy(d, s) #define StrHexToUint64(x) strtoll(x, NULL, 16) #define StrHexToUintn(x) strtoll(x, NULL, 16) #define StrLen(x) strlen(x) #define StrSize(x) (strlen(x) + 1) #define StrnCatS(d, l, s, len) strncat(d, s, len) #define StrnCmp(a, b, n) strncmp(a, b, n) #define StrnLenS(str, max) strlen(str) #define Strtoi(x) strtol(x, NULL, 0) #define Strtoi64(x, y) *(long long *)y = strtoll(x, NULL, 0) #define SwapBytes64(u64) bswap64(u64) #define UnicodeStrToAsciiStrS(src, dest, len) strlcpy(dest, src, len) #define ZeroMem(p,l) memset(p, 0, l) #undef ASSERT #define ASSERT(x) /* * Define AllocateCopyPool and others so that we "forget" about the * previous non-static definition since we want these to be static * inlines. */ #define AllocateCopyPool AllocateCopyPoolFreeBSD #define CompareGuid CompareGuidFreeBSD #define StrHexToBytes StrHexToBytesFreeBSD #define StrToGuid StrToGuidFreeBSD #define WriteUnaligned64 WriteUnaligned64FreeBSD static inline void * AllocateCopyPool(size_t l, const void *p) { void *rv; rv = malloc(l); if (rv == NULL) return NULL; memcpy(rv, p, l); return (rv); } static inline BOOLEAN CompareGuid (const GUID *g1, const GUID *g2) { uint32_t ignored_status; return (uuid_compare((const uuid_t *)g1, (const uuid_t *)g2, &ignored_status) == 0); } static inline int StrHexToBytes(const char *str, size_t len, uint8_t *buf, size_t buflen) { size_t i; char hex[3]; /* * Sanity check preconditions. */ if (buflen != len / 2 || (len % 2) == 1) return 1; for (i = 0; i < len; i += 2) { if (!isxdigit(str[i]) || !isxdigit(str[i + 1])) return 1; hex[0] = str[i]; hex[1] = str[i + 1]; hex[2] = '\0'; buf[i / 2] = strtol(hex, NULL, 16); } return 0; } static inline void StrToGuid(const char *str, GUID *guid) { uint32_t status; uuid_from_string(str, (uuid_t *)guid, &status); } static inline void WriteUnaligned64(void *ptr, uint64_t val) { memcpy(ptr, &val, sizeof(val)); } /* * Hack to allow converting %g to %s in printfs. Hack because * it's single entry, uses a static buffer, etc. Sufficient for * the day for this file though. If you ever have to convert * two %g's in one format, punt. Did I mention this was super lame? * Not to mention its name.... Also, the error GUID is horrific. */
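To make the static-buffer caveat above concrete: two guid_str() conversions passed to one format string would both end up pointing at the last GUID formatted. A hedged sketch of the safe alternative, since uuid_to_string(3) allocates a fresh string per call:

#include <stdio.h>
#include <stdlib.h>
#include <uuid.h>

static void
print_two_guids(const uuid_t *g1, const uuid_t *g2)
{
	char *s1 = NULL, *s2 = NULL;
	uint32_t status;

	/* each call allocates its own string, so the two can coexist */
	uuid_to_string(g1, &s1, &status);
	uuid_to_string(g2, &s2, &status);
	printf("%s %s\n", s1 != NULL ? s1 : "?", s2 != NULL ? s2 : "?");
	free(s1);
	free(s2);
}
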
static inline const char * guid_str(const GUID *g) { static char buf[36 + 1]; char *str = NULL; uint32_t ignored_status; uuid_to_string((const uuid_t *)g, &str, &ignored_status); if (str != NULL) strlcpy(buf, str, sizeof(buf)); else strlcpy(buf, "groot-cannot-decode-guid-groot-smash", sizeof(buf)); /* ^^^^^^^ 36 characters ^^^^^^^ */ free(str); return buf; } #define G(x) guid_str((const GUID *)(const void *)x) #endif #undef GLOBAL_REMOVE_IF_UNREFERENCED #define GLOBAL_REMOVE_IF_UNREFERENCED static #endif Index: head/lib/libefivar/uefi-dputil.c =================================================================== --- head/lib/libefivar/uefi-dputil.c (revision 343754) +++ head/lib/libefivar/uefi-dputil.c (revision 343755) @@ -1,636 +1,635 @@ /*- * Copyright (c) 2017 Netflix, Inc. - * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Routines to format EFI_DEVICE_PATHs from the UEFI standard. Much of * this file is taken from EDK2 and rototilled. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include "efi-osdep.h" #include "uefi-dplib.h" /* XXX maybe I should include the entire DevicePathUtilities.c and ifdef out what we don't use */ /* * Taken from MdePkg/Library/UefiDevicePathLib/DevicePathUtilities.c * hash a11928f3310518ab1c6fd34e8d0fdbb72de9602c 2017-Mar-01 */ /** @file Device Path services. The thing to remember is device paths are built out of nodes. The device path is terminated by an end node that is length sizeof(EFI_DEVICE_PATH_PROTOCOL). That would be why there is sizeof(EFI_DEVICE_PATH_PROTOCOL) all over this file. The only place where multi-instance device paths are supported is in environment variables. Multi-instance device paths should never be placed on a Handle. Copyright (c) 2006 - 2016, Intel Corporation. All rights reserved.
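The end-node rule described just above is easy to state in code. A small illustrative check (a sketch, not from the file itself): an empty device path is nothing but the end node whose template is defined just below, so its total size is END_DEVICE_PATH_LENGTH (4 bytes).

#include <assert.h>

static void
check_empty_path(void)
{
	/* same initializer as mUefiDevicePathLibEndDevicePath below */
	EFI_DEVICE_PATH_PROTOCOL end = {
		END_DEVICE_PATH_TYPE,
		END_ENTIRE_DEVICE_PATH_SUBTYPE,
		{ END_DEVICE_PATH_LENGTH, 0 }
	};

	assert(IsDevicePathEnd(&end));
	assert(GetDevicePathSize(&end) == END_DEVICE_PATH_LENGTH);
}
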
This program and the accompanying materials are licensed and made available under the terms and conditions of the BSD License which accompanies this distribution. The full text of the license may be found at http://opensource.org/licenses/bsd-license.php. THE PROGRAM IS DISTRIBUTED UNDER THE BSD LICENSE ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED. **/ // // Template for an end-of-device path node. // static CONST EFI_DEVICE_PATH_PROTOCOL mUefiDevicePathLibEndDevicePath = { END_DEVICE_PATH_TYPE, END_ENTIRE_DEVICE_PATH_SUBTYPE, { END_DEVICE_PATH_LENGTH, 0 } }; /** Returns the size of a device path in bytes. This function returns the size, in bytes, of the device path data structure specified by DevicePath including the end of device path node. If DevicePath is NULL or invalid, then 0 is returned. @param DevicePath A pointer to a device path data structure. @retval 0 If DevicePath is NULL or invalid. @retval Others The size of a device path in bytes. **/ UINTN EFIAPI GetDevicePathSize ( IN CONST EFI_DEVICE_PATH_PROTOCOL *DevicePath ) { CONST EFI_DEVICE_PATH_PROTOCOL *Start; if (DevicePath == NULL) { return 0; } if (!IsDevicePathValid (DevicePath, 0)) { return 0; } // // Search for the end of the device path structure // Start = DevicePath; while (!IsDevicePathEnd (DevicePath)) { DevicePath = NextDevicePathNode (DevicePath); } // // Compute the size and add back in the size of the end device path structure // return ((UINTN) DevicePath - (UINTN) Start) + DevicePathNodeLength (DevicePath); } /** Determine whether a given device path is valid. If DevicePath is NULL, then ASSERT(). @param DevicePath A pointer to a device path data structure. @param MaxSize The maximum size of the device path data structure. @retval TRUE DevicePath is valid. @retval FALSE The length of any node in the DevicePath is less than sizeof (EFI_DEVICE_PATH_PROTOCOL). @retval FALSE If MaxSize is not zero, the size of the DevicePath exceeds MaxSize. @retval FALSE If PcdMaximumDevicePathNodeCount is not zero, the node count of the DevicePath exceeds PcdMaximumDevicePathNodeCount. **/ BOOLEAN EFIAPI IsDevicePathValid ( IN CONST EFI_DEVICE_PATH_PROTOCOL *DevicePath, IN UINTN MaxSize ) { UINTN Count; UINTN Size; UINTN NodeLength; ASSERT (DevicePath != NULL); if (MaxSize == 0) { MaxSize = MAX_UINTN; } // // Validate the input size big enough to touch the first node. // if (MaxSize < sizeof (EFI_DEVICE_PATH_PROTOCOL)) { return FALSE; } for (Count = 0, Size = 0; !IsDevicePathEnd (DevicePath); DevicePath = NextDevicePathNode (DevicePath)) { NodeLength = DevicePathNodeLength (DevicePath); if (NodeLength < sizeof (EFI_DEVICE_PATH_PROTOCOL)) { return FALSE; } if (NodeLength > MAX_UINTN - Size) { return FALSE; } Size += NodeLength; // // Validate next node before touch it. // if (Size > MaxSize - END_DEVICE_PATH_LENGTH ) { return FALSE; } if (PcdGet32 (PcdMaximumDevicePathNodeCount) > 0) { Count++; if (Count >= PcdGet32 (PcdMaximumDevicePathNodeCount)) { return FALSE; } } } // // Only return TRUE when the End Device Path node is valid. // return (BOOLEAN) (DevicePathNodeLength (DevicePath) == END_DEVICE_PATH_LENGTH); } /** Returns the Type field of a device path node. Returns the Type field of the device path node specified by Node. If Node is NULL, then ASSERT(). @param Node A pointer to a device path node data structure. @return The Type field of the device path node specified by Node. 
**/ UINT8 EFIAPI DevicePathType ( IN CONST VOID *Node ) { ASSERT (Node != NULL); return ((const EFI_DEVICE_PATH_PROTOCOL *)(Node))->Type; } /** Returns the SubType field of a device path node. Returns the SubType field of the device path node specified by Node. If Node is NULL, then ASSERT(). @param Node A pointer to a device path node data structure. @return The SubType field of the device path node specified by Node. **/ UINT8 EFIAPI DevicePathSubType ( IN CONST VOID *Node ) { ASSERT (Node != NULL); return ((const EFI_DEVICE_PATH_PROTOCOL *)(Node))->SubType; } /** Returns the 16-bit Length field of a device path node. Returns the 16-bit Length field of the device path node specified by Node. Node is not required to be aligned on a 16-bit boundary, so it is recommended that a function such as ReadUnaligned16() be used to extract the contents of the Length field. If Node is NULL, then ASSERT(). @param Node A pointer to a device path node data structure. @return The 16-bit Length field of the device path node specified by Node. **/ UINTN EFIAPI DevicePathNodeLength ( IN CONST VOID *Node ) { ASSERT (Node != NULL); return ((const EFI_DEVICE_PATH_PROTOCOL *)Node)->Length[0] | (((const EFI_DEVICE_PATH_PROTOCOL *)Node)->Length[1] << 8); } /** Returns a pointer to the next node in a device path. Returns a pointer to the device path node that follows the device path node specified by Node. If Node is NULL, then ASSERT(). @param Node A pointer to a device path node data structure. @return a pointer to the device path node that follows the device path node specified by Node. **/ EFI_DEVICE_PATH_PROTOCOL * EFIAPI NextDevicePathNode ( IN CONST VOID *Node ) { ASSERT (Node != NULL); return ((EFI_DEVICE_PATH_PROTOCOL *)(__DECONST(UINT8 *, Node) + DevicePathNodeLength(Node))); } /** Determines if a device path node is an end node of a device path. This includes nodes that are the end of a device path instance and nodes that are the end of an entire device path. Determines if the device path node specified by Node is an end node of a device path. This includes nodes that are the end of a device path instance and nodes that are the end of an entire device path. If Node represents an end node of a device path, then TRUE is returned. Otherwise, FALSE is returned. If Node is NULL, then ASSERT(). @param Node A pointer to a device path node data structure. @retval TRUE The device path node specified by Node is an end node of a device path. @retval FALSE The device path node specified by Node is not an end node of a device path. **/ BOOLEAN EFIAPI IsDevicePathEndType ( IN CONST VOID *Node ) { ASSERT (Node != NULL); return (BOOLEAN) (DevicePathType (Node) == END_DEVICE_PATH_TYPE); } /** Determines if a device path node is an end node of an entire device path. Determines if a device path node specified by Node is an end node of an entire device path. If Node represents the end of an entire device path, then TRUE is returned. Otherwise, FALSE is returned. If Node is NULL, then ASSERT(). @param Node A pointer to a device path node data structure. @retval TRUE The device path node specified by Node is the end of an entire device path. @retval FALSE The device path node specified by Node is not the end of an entire device path. **/ BOOLEAN EFIAPI IsDevicePathEnd ( IN CONST VOID *Node ) { ASSERT (Node != NULL); return (BOOLEAN) (IsDevicePathEndType (Node) && DevicePathSubType(Node) == END_ENTIRE_DEVICE_PATH_SUBTYPE); } /** Fills in all the fields of a device path node that is the end of an entire device path. 
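The accessors above are all it takes to walk a path node by node. A minimal sketch in the same style (it assumes a well-formed, end-terminated path, which IsDevicePathValid() can establish first):

static UINTN
CountDevicePathNodes (
  IN CONST EFI_DEVICE_PATH_PROTOCOL  *DevicePath
  )
{
  UINTN Count = 0;

  /* step over each node until the end-of-path node is reached */
  while (!IsDevicePathEnd (DevicePath)) {
    Count++;
    DevicePath = NextDevicePathNode (DevicePath);
  }
  return Count;
}
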
Fills in all the fields of a device path node specified by Node so Node represents the end of an entire device path. The Type field of Node is set to END_DEVICE_PATH_TYPE, the SubType field of Node is set to END_ENTIRE_DEVICE_PATH_SUBTYPE, and the Length field of Node is set to END_DEVICE_PATH_LENGTH. Node is not required to be aligned on a 16-bit boundary, so it is recommended that a function such as WriteUnaligned16() be used to set the contents of the Length field. If Node is NULL, then ASSERT(). @param Node A pointer to a device path node data structure. **/ VOID EFIAPI SetDevicePathEndNode ( OUT VOID *Node ) { ASSERT (Node != NULL); memcpy (Node, &mUefiDevicePathLibEndDevicePath, sizeof (mUefiDevicePathLibEndDevicePath)); } /** Sets the length, in bytes, of a device path node. Sets the length of the device path node specified by Node to the value specified by NodeLength. NodeLength is returned. Node is not required to be aligned on a 16-bit boundary, so it is recommended that a function such as WriteUnaligned16() be used to set the contents of the Length field. If Node is NULL, then ASSERT(). If NodeLength >= SIZE_64KB, then ASSERT(). If NodeLength < sizeof (EFI_DEVICE_PATH_PROTOCOL), then ASSERT(). @param Node A pointer to a device path node data structure. @param Length The length, in bytes, of the device path node. @return Length **/ UINT16 EFIAPI SetDevicePathNodeLength ( IN OUT VOID *Node, IN UINTN Length ) { ASSERT (Node != NULL); ASSERT ((Length >= sizeof (EFI_DEVICE_PATH_PROTOCOL)) && (Length < SIZE_64KB)); // return WriteUnaligned16 ((UINT16 *)&((EFI_DEVICE_PATH_PROTOCOL *)(Node))->Length[0], (UINT16)(Length)); le16enc(&((EFI_DEVICE_PATH_PROTOCOL *)(Node))->Length[0], (UINT16)(Length)); return Length; } /** Creates a device node. This function creates a new device node in a newly allocated buffer of size NodeLength and initializes the device path node header with NodeType and NodeSubType. The new device path node is returned. If NodeLength is smaller than a device path header, then NULL is returned. If there is not enough memory to allocate space for the new device path, then NULL is returned. The memory is allocated from EFI boot services memory. It is the responsibility of the caller to free the memory allocated. @param NodeType The device node type for the new device node. @param NodeSubType The device node sub-type for the new device node. @param NodeLength The length of the new device node. @return The new device path. **/ EFI_DEVICE_PATH_PROTOCOL * EFIAPI CreateDeviceNode ( IN UINT8 NodeType, IN UINT8 NodeSubType, IN UINT16 NodeLength ) { EFI_DEVICE_PATH_PROTOCOL *DevicePath; if (NodeLength < sizeof (EFI_DEVICE_PATH_PROTOCOL)) { // // NodeLength is less than the size of the header. // return NULL; } DevicePath = AllocateZeroPool (NodeLength); if (DevicePath != NULL) { DevicePath->Type = NodeType; DevicePath->SubType = NodeSubType; SetDevicePathNodeLength (DevicePath, NodeLength); } return DevicePath; } /** Creates a new copy of an existing device path. This function allocates space for a new copy of the device path specified by DevicePath. If DevicePath is NULL, then NULL is returned. If the memory is successfully allocated, then the contents of DevicePath are copied to the newly allocated buffer, and a pointer to that buffer is returned. Otherwise, NULL is returned. The memory for the new device path is allocated from EFI boot services memory. It is the responsibility of the caller to free the memory allocated. 
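The byte-wise Length handling above (read as Length[0] | Length[1] << 8 in DevicePathNodeLength(), written with le16enc() in SetDevicePathNodeLength()) is plain little-endian 16-bit encoding with no alignment assumption. A small sketch of the equivalence:

#include <assert.h>
#include <sys/endian.h>

static void
check_length_encoding(void)
{
	UINT8 raw[2];

	le16enc(raw, 0x1234);	/* stores { 0x34, 0x12 } */
	assert((raw[0] | (raw[1] << 8)) == 0x1234);
}
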
@param DevicePath A pointer to a device path data structure. @retval NULL DevicePath is NULL or invalid. @retval Others A pointer to the duplicated device path. **/ EFI_DEVICE_PATH_PROTOCOL * EFIAPI DuplicateDevicePath ( IN CONST EFI_DEVICE_PATH_PROTOCOL *DevicePath ) { UINTN Size; // // Compute the size // Size = GetDevicePathSize (DevicePath); if (Size == 0) { return NULL; } // // Allocate space for duplicate device path // return AllocateCopyPool (Size, DevicePath); } /** Creates a new device path by appending a second device path to a first device path. This function creates a new device path by appending a copy of SecondDevicePath to a copy of FirstDevicePath in a newly allocated buffer. Only the end-of-device-path device node from SecondDevicePath is retained. The newly created device path is returned. If FirstDevicePath is NULL, then it is ignored, and a duplicate of SecondDevicePath is returned. If SecondDevicePath is NULL, then it is ignored, and a duplicate of FirstDevicePath is returned. If both FirstDevicePath and SecondDevicePath are NULL, then a copy of an end-of-device-path is returned. If there is not enough memory for the newly allocated buffer, then NULL is returned. The memory for the new device path is allocated from EFI boot services memory. It is the responsibility of the caller to free the memory allocated. @param FirstDevicePath A pointer to a device path data structure. @param SecondDevicePath A pointer to a device path data structure. @retval NULL If there is not enough memory for the newly allocated buffer. @retval NULL If FirstDevicePath or SecondDevicePath is invalid. @retval Others A pointer to the new device path if success. Or a copy an end-of-device-path if both FirstDevicePath and SecondDevicePath are NULL. **/ EFI_DEVICE_PATH_PROTOCOL * EFIAPI AppendDevicePath ( IN CONST EFI_DEVICE_PATH_PROTOCOL *FirstDevicePath, OPTIONAL IN CONST EFI_DEVICE_PATH_PROTOCOL *SecondDevicePath OPTIONAL ) { UINTN Size; UINTN Size1; UINTN Size2; EFI_DEVICE_PATH_PROTOCOL *NewDevicePath; EFI_DEVICE_PATH_PROTOCOL *DevicePath2; // // If there's only 1 path, just duplicate it. // if (FirstDevicePath == NULL) { return DuplicateDevicePath ((SecondDevicePath != NULL) ? SecondDevicePath : &mUefiDevicePathLibEndDevicePath); } if (SecondDevicePath == NULL) { return DuplicateDevicePath (FirstDevicePath); } if (!IsDevicePathValid (FirstDevicePath, 0) || !IsDevicePathValid (SecondDevicePath, 0)) { return NULL; } // // Allocate space for the combined device path. It only has one end node of // length EFI_DEVICE_PATH_PROTOCOL. // Size1 = GetDevicePathSize (FirstDevicePath); Size2 = GetDevicePathSize (SecondDevicePath); Size = Size1 + Size2 - END_DEVICE_PATH_LENGTH; NewDevicePath = AllocatePool (Size); if (NewDevicePath != NULL) { NewDevicePath = CopyMem (NewDevicePath, FirstDevicePath, Size1); // // Over write FirstDevicePath EndNode and do the copy // DevicePath2 = (EFI_DEVICE_PATH_PROTOCOL *) ((CHAR8 *) NewDevicePath + (Size1 - END_DEVICE_PATH_LENGTH)); CopyMem (DevicePath2, SecondDevicePath, Size2); } return NewDevicePath; } /** Creates a new path by appending the device node to the device path. This function creates a new device path by appending a copy of the device node specified by DevicePathNode to a copy of the device path specified by DevicePath in an allocated buffer. The end-of-device-path device node is moved after the end of the appended device node. If DevicePathNode is NULL then a copy of DevicePath is returned. 
If DevicePath is NULL then a copy of DevicePathNode, followed by an end-of-device path device node is returned. If both DevicePathNode and DevicePath are NULL then a copy of an end-of-device-path device node is returned. If there is not enough memory to allocate space for the new device path, then NULL is returned. The memory is allocated from EFI boot services memory. It is the responsibility of the caller to free the memory allocated. @param DevicePath A pointer to a device path data structure. @param DevicePathNode A pointer to a single device path node. @retval NULL If there is not enough memory for the new device path. @retval Others A pointer to the new device path if success. A copy of DevicePathNode followed by an end-of-device-path node if both FirstDevicePath and SecondDevicePath are NULL. A copy of an end-of-device-path node if both FirstDevicePath and SecondDevicePath are NULL. **/ EFI_DEVICE_PATH_PROTOCOL * EFIAPI AppendDevicePathNode ( IN CONST EFI_DEVICE_PATH_PROTOCOL *DevicePath, OPTIONAL IN CONST EFI_DEVICE_PATH_PROTOCOL *DevicePathNode OPTIONAL ) { EFI_DEVICE_PATH_PROTOCOL *TempDevicePath; EFI_DEVICE_PATH_PROTOCOL *NextNode; EFI_DEVICE_PATH_PROTOCOL *NewDevicePath; UINTN NodeLength; if (DevicePathNode == NULL) { return DuplicateDevicePath ((DevicePath != NULL) ? DevicePath : &mUefiDevicePathLibEndDevicePath); } // // Build a Node that has a terminator on it // NodeLength = DevicePathNodeLength (DevicePathNode); TempDevicePath = AllocatePool (NodeLength + END_DEVICE_PATH_LENGTH); if (TempDevicePath == NULL) { return NULL; } TempDevicePath = CopyMem (TempDevicePath, DevicePathNode, NodeLength); // // Add and end device path node to convert Node to device path // NextNode = NextDevicePathNode (TempDevicePath); SetDevicePathEndNode (NextNode); // // Append device paths // NewDevicePath = AppendDevicePath (DevicePath, TempDevicePath); FreePool (TempDevicePath); return NewDevicePath; } Index: head/sbin/devmatch/devmatch.8 =================================================================== --- head/sbin/devmatch/devmatch.8 (revision 343754) +++ head/sbin/devmatch/devmatch.8 (revision 343755) @@ -1,98 +1,96 @@ .\" -.\" Copyright (c) 2017 Netflix, Inc -.\" -.\" All rights reserved. +.\" Copyright (c) 2017 Netflix, Inc. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR .\" IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES .\" OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. .\" IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, .\" INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT .\" NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, .\" DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY .\" THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT .\" (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF .\" THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
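Tying the device-path utilities above together, a hedged sketch (HARDWARE_DEVICE_PATH and HW_VENDOR_DP come from the imported EDK2 DevicePath.h; the 20-byte length, a node header plus a GUID, is an illustrative choice):

#include <assert.h>

static void
check_append(void)
{
	EFI_DEVICE_PATH_PROTOCOL *node, *path;

	/* a bare, unterminated 20-byte vendor node */
	node = CreateDeviceNode(HARDWARE_DEVICE_PATH, HW_VENDOR_DP, 20);
	assert(node != NULL);
	/* AppendDevicePathNode() terminates it: 20 bytes + 4-byte end node */
	path = AppendDevicePathNode(NULL, node);
	assert(path != NULL && GetDevicePathSize(path) == 24);
	FreePool(node);
	FreePool(path);
}
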
.\" .\" $FreeBSD$ .\" .Dd February 16, 2018 .Dt DEVMATCH 8 .Os .Sh NAME .Nm devmatch .Nd print information about unattached devices .Sh SYNOPSIS .Nm .Op Fl adhpuv .Op Fl -all .Op Fl -dump .Op Fl -hints Ar file .Op Fl -nomatch Ar event .Op Fl -unbound .Op Fl -verbose .Sh DESCRIPTION The .Nm utility, without any arguments, prints all the kernel modules it has found for all the unattached, enabled devices in the system. .Bl -tag -width 20m .It Fl a Fl -all Include all devices, not just the ones that are unattached. .It Fl d Fl -dump Produce a human readable dump of the .Pa linker.hints file. .It Fl h Fl -hints Ar file Use the named .Ar file instead of .Pa linker.hints guessed from the current module load path. .It Fl p Fl -nomatch Ar event Parse and use a standard NOMATCH event from .Xr devd 8 for matching instead of searching the device tree. .It Fl u Fl -unbound Attempt to produce a list of those drivers with PNP info whose driver tables with that PNP info can't be found. .It Fl v Fl -verbose Produce more verbose output. .El .Sh SEE ALSO .Xr devinfo 8 , .Xr MODULE_PNP_INFO 9 .Sh BUGS The kernel has hints in it, but we exclude it from the list of modules to suggest for unmatched devices. We exclude it when suggesting drivers, but include it when looking for unbound devices or producing a full dump of .Pa linker.hints . This can be confusing. .Pp Some modules are hard links in .Pa /boot/kernel and will be reported twice. .Pp The PNP string's attributes are evaluated once per PNP entry on that bus rather than once. .Pp The term PNP is overloaded in FreeBSD. It means, generically, the identifying data the bus provides about a device. While this include old ISA PNP identifiers, it also includes the logical equivalent in USB, PCI, and others. .Pp Many drivers currently lack proper PNP table decorations and need to be updated. .Sh AUTHORS .An Warner Losh Aq Mt imp@FreeBSD.org Index: head/sbin/devmatch/devmatch.c =================================================================== --- head/sbin/devmatch/devmatch.c (revision 343754) +++ head/sbin/devmatch/devmatch.c (revision 343755) @@ -1,596 +1,595 @@ /*- - * Copyright (c) 2017 Netflix, Inc - * All rights reserved. + * Copyright (c) 2017 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* options descriptor */ static struct option longopts[] = { { "all", no_argument, NULL, 'a' }, { "dump", no_argument, NULL, 'd' }, { "hints", required_argument, NULL, 'h' }, { "nomatch", required_argument, NULL, 'p' }, { "unbound", no_argument, NULL, 'u' }, { "verbose", no_argument, NULL, 'v' }, { NULL, 0, NULL, 0 } }; #define DEVMATCH_MAX_HITS 256 static int all_flag; static int dump_flag; static char *linker_hints; static char *nomatch_str; static int unbound_flag; static int verbose_flag; static void *hints; static void *hints_end; static struct devinfo_dev *root; static void * read_hints(const char *fn, size_t *len) { void *h; int fd; struct stat sb; fd = open(fn, O_RDONLY); if (fd < 0) { if (errno == ENOENT) return NULL; err(1, "Can't open %s for reading", fn); } if (fstat(fd, &sb) != 0) err(1, "Can't fstat %s", fn); h = malloc(sb.st_size); if (h == NULL) err(1, "not enough space to read hints file of %ju bytes", (uintmax_t)sb.st_size); if (read(fd, h, sb.st_size) != sb.st_size) err(1, "Can't read in %ju bytes from %s", (uintmax_t)sb.st_size, fn); close(fd); *len = sb.st_size; return h; } static void read_linker_hints(void) { char fn[MAXPATHLEN]; char *modpath, *p, *q; size_t buflen, len; if (linker_hints == NULL) { if (sysctlbyname("kern.module_path", NULL, &buflen, NULL, 0) < 0) errx(1, "Can't find kernel module path."); modpath = malloc(buflen); if (modpath == NULL) err(1, "Can't get memory for modpath."); if (sysctlbyname("kern.module_path", modpath, &buflen, NULL, 0) < 0) errx(1, "Can't find kernel module path."); p = modpath; while ((q = strsep(&p, ";")) != NULL) { snprintf(fn, sizeof(fn), "%s/linker.hints", q); hints = read_hints(fn, &len); if (hints == NULL) continue; break; } if (q == NULL) errx(1, "Can't read linker hints file."); } else { hints = read_hints(linker_hints, &len); if (hints == NULL) err(1, "Can't open %s for reading", linker_hints); } if (*(int *)(intptr_t)hints != LINKER_HINTS_VERSION) { warnx("Linker hints version %d doesn't match expected %d.", *(int *)(intptr_t)hints, LINKER_HINTS_VERSION); free(hints); hints = NULL; } if (hints != NULL) hints_end = (void *)((intptr_t)hints + (intptr_t)len); } static int getint(void **ptr) { int *p = *ptr; int rv; p = (int *)roundup2((intptr_t)p, sizeof(int)); rv = *p++; *ptr = p; return rv; } static void getstr(void **ptr, char *val) { int *p = *ptr; char *c = (char *)p; int len = *(uint8_t *)c; memcpy(val, c + 1, len); val[len] = 0; c += len + 1; *ptr = (void *)c; } static int pnpval_as_int(const char *val, const char *pnpinfo) { int rv; char key[256]; char *cp; if (pnpinfo == NULL) return -1; cp = strchr(val, ';'); key[0] = ' '; if (cp == NULL) strlcpy(key + 1, val, sizeof(key) - 1); else { memcpy(key + 1, val, cp - val); key[cp - val + 1] = '\0'; } strlcat(key, "=", sizeof(key)); if (strncmp(key + 1, pnpinfo, strlen(key + 1)) == 0) rv = strtol(pnpinfo + strlen(key + 1), NULL, 0); else { cp = strstr(pnpinfo, key); if (cp == NULL) rv = -1; else rv = strtol(cp + strlen(key), NULL, 0); } return rv; } static void quoted_strcpy(char *dst, const char *src) { char q = ' '; if (*src == '\'' || *src == '"') q = *src++; while (*src && *src != q) *dst++ = *src++; // XXX backtick quoting *dst++ = '\0'; // XXX overflow } static char * pnpval_as_str(const char *val, const char *pnpinfo) { static char retval[256]; char key[256]; char *cp; if (pnpinfo == NULL) { *retval =
'\0'; return retval; } cp = strchr(val, ';'); key[0] = ' '; if (cp == NULL) strlcpy(key + 1, val, sizeof(key) - 1); else { memcpy(key + 1, val, cp - val); key[cp - val + 1] = '\0'; } strlcat(key, "=", sizeof(key)); if (strncmp(key + 1, pnpinfo, strlen(key + 1)) == 0) quoted_strcpy(retval, pnpinfo + strlen(key + 1)); else { cp = strstr(pnpinfo, key); if (cp == NULL) strcpy(retval, "MISSING"); else quoted_strcpy(retval, cp + strlen(key)); } return retval; } static void search_hints(const char *bus, const char *dev, const char *pnpinfo) { char val1[256], val2[256]; int ival, len, ents, i, notme, mask, bit, v, found; void *ptr, *walker; char *lastmod = NULL, *cp, *s; walker = hints; getint(&walker); found = 0; if (verbose_flag) printf("Searching bus %s dev %s for pnpinfo %s\n", bus, dev, pnpinfo); while (walker < hints_end) { len = getint(&walker); ival = getint(&walker); ptr = walker; switch (ival) { case MDT_VERSION: getstr(&ptr, val1); ival = getint(&ptr); getstr(&ptr, val2); if (dump_flag || verbose_flag) printf("Version: if %s.%d kmod %s\n", val1, ival, val2); break; case MDT_MODULE: getstr(&ptr, val1); getstr(&ptr, val2); if (lastmod) free(lastmod); lastmod = strdup(val2); if (dump_flag || verbose_flag) printf("Module %s in %s\n", val1, val2); break; case MDT_PNP_INFO: if (!dump_flag && !unbound_flag && lastmod && strcmp(lastmod, "kernel") == 0) break; getstr(&ptr, val1); getstr(&ptr, val2); ents = getint(&ptr); if (dump_flag || verbose_flag) printf("PNP info for bus %s format %s %d entries (%s)\n", val1, val2, ents, lastmod); if (strcmp(val1, "usb") == 0) { if (verbose_flag) printf("Treating usb as uhub -- bug in source table still?\n"); strcpy(val1, "uhub"); } if (bus && strcmp(val1, bus) != 0) { if (verbose_flag) printf("Skipped because table for bus %s, looking for %s\n", val1, bus); break; } for (i = 0; i < ents; i++) { if (verbose_flag) printf("---------- Entry %d ----------\n", i); if (dump_flag) printf(" "); cp = val2; notme = 0; mask = -1; bit = -1; do { switch (*cp) { /* All integer fields */ case 'I': case 'J': case 'G': case 'L': case 'M': ival = getint(&ptr); if (dump_flag) { printf("%#x:", ival); break; } if (bit >= 0 && ((1 << bit) & mask) == 0) break; v = pnpval_as_int(cp + 2, pnpinfo); if (verbose_flag) printf("Matching %s (%c) table=%#x tomatch=%#x\n", cp + 2, *cp, v, ival); switch (*cp) { case 'J': if (ival == -1) break; /*FALLTHROUGH*/ case 'I': if (v != ival) notme++; break; case 'G': if (v < ival) notme++; break; case 'L': if (v > ival) notme++; break; case 'M': mask = ival; break; } break; /* String fields */ case 'D': case 'Z': getstr(&ptr, val1); if (dump_flag) { printf("'%s':", val1); break; } if (*cp == 'D') break; s = pnpval_as_str(cp + 2, pnpinfo); if (strcmp(s, val1) != 0) notme++; break; /* Key override fields, required to be last in the string */ case 'T': /* * This is imperfect and only does one key and will be redone * to be more general for multiple keys. Currently, nothing * does that. */ if (dump_flag) /* No per-row data stored */ break; if (cp[strlen(cp) - 1] == ';') /* Skip required ; at end */ cp[strlen(cp) - 1] = '\0'; /* in case it's not there */ if ((s = strstr(pnpinfo, cp + 2)) == NULL) notme++; else if (s > pnpinfo && s[-1] != ' ') notme++; break; default: fprintf(stderr, "Unknown field type %c:\n", *cp); break; } bit++; cp = strchr(cp, ';'); if (cp) cp++; } while (cp && *cp); if (dump_flag) printf("\n"); else if (!notme) { if (!unbound_flag) { if (all_flag) printf("%s: %s\n", *dev ? 
dev : "unattached", lastmod); else printf("%s\n", lastmod); if (verbose_flag) printf("Matches --- %s ---\n", lastmod); } found++; } } break; default: if (dump_flag) printf("Unknown Type %d len %d\n", ival, len); break; } walker = (void *)(len - sizeof(int) + (intptr_t)walker); } if (unbound_flag && found == 0 && *pnpinfo) { if (verbose_flag) printf("------------------------- "); printf("%s on %s pnpinfo %s", *dev ? dev : "unattached", bus, pnpinfo); if (verbose_flag) printf(" -------------------------"); printf("\n"); } free(lastmod); } static int find_unmatched(struct devinfo_dev *dev, void *arg) { struct devinfo_dev *parent; char *bus, *p; do { if (!all_flag && dev->dd_name[0] != '\0') break; if (!(dev->dd_flags & DF_ENABLED)) break; if (dev->dd_flags & DF_ATTACHED_ONCE) break; parent = devinfo_handle_to_device(dev->dd_parent); bus = strdup(parent->dd_name); p = bus + strlen(bus) - 1; while (p >= bus && isdigit(*p)) p--; *++p = '\0'; if (verbose_flag) printf("Searching %s %s bus at %s for pnpinfo %s\n", dev->dd_name, bus, dev->dd_location, dev->dd_pnpinfo); search_hints(bus, dev->dd_name, dev->dd_pnpinfo); free(bus); } while (0); return (devinfo_foreach_device_child(dev, find_unmatched, arg)); } struct exact_info { const char *bus; const char *loc; struct devinfo_dev *dev; }; /* * Look for the exact location specified by the nomatch event. The * loc and pnpinfo run together to get the string we're looking for, * so we have to synthesize the same thing that subr_bus.c is * generating in devnomatch/devaddq to do the string comparison. */ static int find_exact_dev(struct devinfo_dev *dev, void *arg) { struct devinfo_dev *parent; char *loc; struct exact_info *info; info = arg; do { if (info->dev != NULL) break; if (!(dev->dd_flags & DF_ENABLED)) break; parent = devinfo_handle_to_device(dev->dd_parent); if (strcmp(info->bus, parent->dd_name) != 0) break; asprintf(&loc, "%s %s", parent->dd_pnpinfo, parent->dd_location); if (strcmp(loc, info->loc) == 0) info->dev = dev; free(loc); } while (0); return (devinfo_foreach_device_child(dev, find_exact_dev, arg)); } static void find_nomatch(char *nomatch) { char *bus, *pnpinfo, *tmp, *busnameunit; struct exact_info info; /* * Find our bus name. It will include the unit number. We have to search * backwards to avoid false positive for any PNP string that has ' on ' * in them, which would come earlier in the string. Like if there were * an 'Old Bard' ethernet card made by 'Stratford on Avon Hardware' or * something silly like that. */ tmp = nomatch + strlen(nomatch) - 4; while (tmp > nomatch && strncmp(tmp, " on ", 4) != 0) tmp--; if (tmp == nomatch) errx(1, "No bus found in nomatch string: '%s'", nomatch); bus = tmp + 4; *tmp = '\0'; busnameunit = strdup(bus); if (busnameunit == NULL) errx(1, "Can't allocate memory for strings"); tmp = bus + strlen(bus) - 1; while (tmp > bus && isdigit(*tmp)) tmp--; *++tmp = '\0'; /* * Note: the NOMATCH events place both the bus location as well as the * pnp info after the 'at' and we don't know where one stops and the * other begins, so we pass the whole thing to our search routine. */ if (*nomatch == '?') nomatch++; if (strncmp(nomatch, " at ", 4) != 0) errx(1, "Malformed NOMATCH string: '%s'", nomatch); pnpinfo = nomatch + 4; /* * See if we can find the devinfo_dev for this device. If we * can, and it's been attached before, we should filter it out * so that a kldunload foo doesn't cause an immediate reload. 
*/ info.loc = pnpinfo; info.bus = busnameunit; info.dev = NULL; devinfo_foreach_device_child(root, find_exact_dev, (void *)&info); if (info.dev != NULL && info.dev->dd_flags & DF_ATTACHED_ONCE) exit(0); search_hints(bus, "", pnpinfo); exit(0); } static void usage(void) { errx(1, "devmatch [-aduv] [-p nomatch] [-h linker-hints]"); } int main(int argc, char **argv) { int ch; while ((ch = getopt_long(argc, argv, "adh:p:uv", longopts, NULL)) != -1) { switch (ch) { case 'a': all_flag++; break; case 'd': dump_flag++; break; case 'h': linker_hints = optarg; break; case 'p': nomatch_str = optarg; break; case 'u': unbound_flag++; break; case 'v': verbose_flag++; break; default: usage(); } } argc -= optind; argv += optind; if (argc >= 1) usage(); read_linker_hints(); if (dump_flag) { search_hints(NULL, NULL, NULL); exit(0); } if (devinfo_init()) err(1, "devinfo_init"); if ((root = devinfo_handle_to_device(DEVINFO_ROOT_DEVICE)) == NULL) errx(1, "can't find root device"); if (nomatch_str != NULL) find_nomatch(nomatch_str); else devinfo_foreach_device_child(root, find_unmatched, (void *)0); devinfo_free(); } Index: head/sbin/nvmecontrol/modules/wdc/wdc.c =================================================================== --- head/sbin/nvmecontrol/modules/wdc/wdc.c (revision 343754) +++ head/sbin/nvmecontrol/modules/wdc/wdc.c (revision 343755) @@ -1,597 +1,596 @@ /*- - * Copyright (c) 2017 Netflix, Inc - * All rights reserved. + * Copyright (c) 2017 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include "nvmecontrol.h" #define WDC_USAGE \ "wdc (cap-diag)\n" NVME_CMD_DECLARE(wdc, struct nvme_function); #define WDC_NVME_TOC_SIZE 8 #define WDC_NVME_CAP_DIAG_OPCODE 0xe6 #define WDC_NVME_CAP_DIAG_CMD 0x0000 static void wdc_cap_diag(const struct nvme_function *nf, int argc, char *argv[]); #define WDC_CAP_DIAG_USAGE "wdc cap-diag [-o path-template]\n" NVME_COMMAND(wdc, cap-diag, wdc_cap_diag, WDC_CAP_DIAG_USAGE); static void wdc_append_serial_name(int fd, char *buf, size_t len, const char *suffix) { struct nvme_controller_data cdata; char sn[NVME_SERIAL_NUMBER_LENGTH + 1]; char *walker; len -= strlen(buf); buf += strlen(buf); read_controller_data(fd, &cdata); memcpy(sn, cdata.sn, NVME_SERIAL_NUMBER_LENGTH); walker = sn + NVME_SERIAL_NUMBER_LENGTH - 1; while (walker > sn && *walker == ' ') walker--; *++walker = '\0'; snprintf(buf, len, "%s%s.bin", sn, suffix); } static void wdc_get_data(int fd, uint32_t opcode, uint32_t len, uint32_t off, uint32_t cmd, uint8_t *buffer, size_t buflen) { struct nvme_pt_command pt; memset(&pt, 0, sizeof(pt)); pt.cmd.opc = opcode; pt.cmd.cdw10 = htole32(len / sizeof(uint32_t)); /* - 1 like all the others ??? */ pt.cmd.cdw11 = htole32(off / sizeof(uint32_t)); pt.cmd.cdw12 = htole32(cmd); pt.buf = buffer; pt.len = buflen; pt.is_read = 1; // printf("opcode %#x cdw10(len) %#x cdw11(offset?) %#x cdw12(cmd/sub) %#x buflen %zd\n", // (int)opcode, (int)cdw10, (int)cdw11, (int)cdw12, buflen); if (ioctl(fd, NVME_PASSTHROUGH_CMD, &pt) < 0) err(1, "wdc_get_data request failed"); if (nvme_completion_is_error(&pt.cpl)) errx(1, "wdc_get_data request returned error"); } static void wdc_do_dump(int fd, char *tmpl, const char *suffix, uint32_t opcode, uint32_t cmd, int len_off) { int first; int fd2; uint8_t *buf; uint32_t len, offset; size_t resid; wdc_append_serial_name(fd, tmpl, MAXPATHLEN, suffix); /* XXX overwrite protection? */ fd2 = open(tmpl, O_WRONLY | O_CREAT | O_TRUNC, 0644); if (fd2 < 0) err(1, "open %s", tmpl); buf = aligned_alloc(PAGE_SIZE, NVME_MAX_XFER_SIZE); if (buf == NULL) errx(1, "Can't get buffer to read dump"); offset = 0; len = NVME_MAX_XFER_SIZE; first = 1; do { resid = len > NVME_MAX_XFER_SIZE ? NVME_MAX_XFER_SIZE : len; wdc_get_data(fd, opcode, resid, offset, cmd, buf, resid); if (first) { len = be32dec(buf + len_off); if (len == 0) errx(1, "No data for %s", suffix); if (memcmp("E6LG", buf, 4) != 0) printf("Expected header of E6LG, found '%4.4s' instead\n", buf); printf("Dumping %d bytes of version %d.%d log to %s\n", len, buf[8], buf[9], tmpl); /* * Adjust amount to dump if total dump < 1MB, * though it likely doesn't matter to the WDC * analysis tools. */ if (resid > len) resid = len; first = 0; } if (write(fd2, buf, resid) != (ssize_t)resid) err(1, "write"); offset += resid; len -= resid; } while (len > 0); free(buf); close(fd2); } static void wdc_cap_diag(const struct nvme_function *nf, int argc, char *argv[]) { char path_tmpl[MAXPATHLEN]; int ch, fd; path_tmpl[0] = '\0'; while ((ch = getopt(argc, argv, "o:")) != -1) { switch ((char)ch) { case 'o': strlcpy(path_tmpl, optarg, MAXPATHLEN); break; default: usage(nf); } } /* Check that a controller was specified. 
*/ if (optind >= argc) usage(nf); open_dev(argv[optind], &fd, 1, 1); wdc_do_dump(fd, path_tmpl, "cap_diag", WDC_NVME_CAP_DIAG_OPCODE, WDC_NVME_CAP_DIAG_CMD, 4); close(fd); exit(0); } static void wdc(const struct nvme_function *nf __unused, int argc, char *argv[]) { DISPATCH(argc, argv, wdc); } /* * HGST's 0xc1 page. This is a grab bag of additional data. Please see * https://www.hgst.com/sites/default/files/resources/US_SN150_ProdManual.pdf * https://www.hgst.com/sites/default/files/resources/US_SN100_ProdManual.pdf * Appendix A for details */ typedef void (*subprint_fn_t)(void *buf, uint16_t subtype, uint8_t res, uint32_t size); struct subpage_print { uint16_t key; subprint_fn_t fn; }; static void print_hgst_info_write_errors(void *buf, uint16_t subtype, uint8_t res, uint32_t size); static void print_hgst_info_read_errors(void *buf, uint16_t subtype, uint8_t res, uint32_t size); static void print_hgst_info_verify_errors(void *buf, uint16_t subtype, uint8_t res, uint32_t size); static void print_hgst_info_self_test(void *buf, uint16_t subtype, uint8_t res, uint32_t size); static void print_hgst_info_background_scan(void *buf, uint16_t subtype, uint8_t res, uint32_t size); static void print_hgst_info_erase_errors(void *buf, uint16_t subtype, uint8_t res, uint32_t size); static void print_hgst_info_erase_counts(void *buf, uint16_t subtype, uint8_t res, uint32_t size); static void print_hgst_info_temp_history(void *buf, uint16_t subtype, uint8_t res, uint32_t size); static void print_hgst_info_ssd_perf(void *buf, uint16_t subtype, uint8_t res, uint32_t size); static void print_hgst_info_firmware_load(void *buf, uint16_t subtype, uint8_t res, uint32_t size); static struct subpage_print hgst_subpage[] = { { 0x02, print_hgst_info_write_errors }, { 0x03, print_hgst_info_read_errors }, { 0x05, print_hgst_info_verify_errors }, { 0x10, print_hgst_info_self_test }, { 0x15, print_hgst_info_background_scan }, { 0x30, print_hgst_info_erase_errors }, { 0x31, print_hgst_info_erase_counts }, { 0x32, print_hgst_info_temp_history }, { 0x37, print_hgst_info_ssd_perf }, { 0x38, print_hgst_info_firmware_load }, }; /* Print a subpage that is basically just key value pairs */ static void print_hgst_info_subpage_gen(void *buf, uint16_t subtype __unused, uint32_t size, const struct kv_name *kv, size_t kv_count) { uint8_t *wsp, *esp; uint16_t ptype; uint8_t plen; uint64_t param; int i; wsp = buf; esp = wsp + size; while (wsp < esp) { ptype = le16dec(wsp); wsp += 2; wsp++; /* Flags, just ignore */ plen = *wsp++; param = 0; for (i = 0; i < plen; i++) param |= (uint64_t)*wsp++ << (i * 8); printf(" %-30s: %jd\n", kv_lookup(kv, kv_count, ptype), (uintmax_t)param); } } static void print_hgst_info_write_errors(void *buf, uint16_t subtype, uint8_t res __unused, uint32_t size) { static struct kv_name kv[] = { { 0x0000, "Corrected Without Delay" }, { 0x0001, "Corrected Maybe Delayed" }, { 0x0002, "Re-Writes" }, { 0x0003, "Errors Corrected" }, { 0x0004, "Correct Algorithm Used" }, { 0x0005, "Bytes Processed" }, { 0x0006, "Uncorrected Errors" }, { 0x8000, "Flash Write Commands" }, { 0x8001, "HGST Special" }, }; printf("Write Errors Subpage:\n"); print_hgst_info_subpage_gen(buf, subtype, size, kv, nitems(kv)); } static void print_hgst_info_read_errors(void *buf, uint16_t subtype, uint8_t res __unused, uint32_t size) { static struct kv_name kv[] = { { 0x0000, "Corrected Without Delay" }, { 0x0001, "Corrected Maybe Delayed" }, { 0x0002, "Re-Reads" }, { 0x0003, "Errors Corrected" }, { 0x0004, "Correct Algorithm Used" }, { 0x0005, 
"Bytes Processed" }, { 0x0006, "Uncorrected Errors" }, { 0x8000, "Flash Read Commands" }, { 0x8001, "XOR Recovered" }, { 0x8002, "Total Corrected Bits" }, }; printf("Read Errors Subpage:\n"); print_hgst_info_subpage_gen(buf, subtype, size, kv, nitems(kv)); } static void print_hgst_info_verify_errors(void *buf, uint16_t subtype, uint8_t res __unused, uint32_t size) { static struct kv_name kv[] = { { 0x0000, "Corrected Without Delay" }, { 0x0001, "Corrected Maybe Delayed" }, { 0x0002, "Re-Reads" }, { 0x0003, "Errors Corrected" }, { 0x0004, "Correct Algorithm Used" }, { 0x0005, "Bytes Processed" }, { 0x0006, "Uncorrected Errors" }, { 0x8000, "Commands Processed" }, }; printf("Verify Errors Subpage:\n"); print_hgst_info_subpage_gen(buf, subtype, size, kv, nitems(kv)); } static void print_hgst_info_self_test(void *buf, uint16_t subtype __unused, uint8_t res __unused, uint32_t size) { size_t i; uint8_t *walker = buf; uint16_t code, hrs; uint32_t lba; printf("Self Test Subpage:\n"); for (i = 0; i < size / 20; i++) { /* Each entry is 20 bytes */ code = le16dec(walker); walker += 2; walker++; /* Ignore fixed flags */ if (*walker == 0) /* Last entry is zero length */ break; if (*walker++ != 0x10) { printf("Bad length for self test report\n"); return; } printf(" %-30s: %d\n", "Recent Test", code); printf(" %-28s: %#x\n", "Self-Test Results", *walker & 0xf); printf(" %-28s: %#x\n", "Self-Test Code", (*walker >> 5) & 0x7); walker++; printf(" %-28s: %#x\n", "Self-Test Number", *walker++); hrs = le16dec(walker); walker += 2; lba = le32dec(walker); walker += 4; printf(" %-28s: %u\n", "Total Power On Hrs", hrs); printf(" %-28s: %#jx (%jd)\n", "LBA", (uintmax_t)lba, (uintmax_t)lba); printf(" %-28s: %#x\n", "Sense Key", *walker++ & 0xf); printf(" %-28s: %#x\n", "Additional Sense Code", *walker++); printf(" %-28s: %#x\n", "Additional Sense Qualifier", *walker++); printf(" %-28s: %#x\n", "Vendor Specific Detail", *walker++); } } static void print_hgst_info_background_scan(void *buf, uint16_t subtype __unused, uint8_t res __unused, uint32_t size) { uint8_t *walker = buf; uint8_t status; uint16_t code, nscan, progress; uint32_t pom, nand; printf("Background Media Scan Subpage:\n"); /* Decode the header */ code = le16dec(walker); walker += 2; walker++; /* Ignore fixed flags */ if (*walker++ != 0x10) { printf("Bad length for background scan header\n"); return; } if (code != 0) { printf("Expceted code 0, found code %#x\n", code); return; } pom = le32dec(walker); walker += 4; walker++; /* Reserved */ status = *walker++; nscan = le16dec(walker); walker += 2; progress = le16dec(walker); walker += 2; walker += 6; /* Reserved */ printf(" %-30s: %d\n", "Power On Minutes", pom); printf(" %-30s: %x (%s)\n", "BMS Status", status, status == 0 ? "idle" : (status == 1 ? "active" : (status == 8 ? "suspended" : "unknown"))); printf(" %-30s: %d\n", "Number of BMS", nscan); printf(" %-30s: %d\n", "Progress Current BMS", progress); /* Report retirements */ if (walker - (uint8_t *)buf != 20) { printf("Coding error, offset not 20\n"); return; } size -= 20; printf(" %-30s: %d\n", "BMS retirements", size / 0x18); while (size > 0) { code = le16dec(walker); walker += 2; walker++; if (*walker++ != 0x14) { printf("Bad length parameter\n"); return; } pom = le32dec(walker); walker += 4; /* * Spec sheet says the following are hard coded, if true, just * print the NAND retirement. 
*/ if (walker[0] == 0x41 && walker[1] == 0x0b && walker[2] == 0x01 && walker[3] == 0x00 && walker[4] == 0x00 && walker[5] == 0x00 && walker[6] == 0x00 && walker[7] == 0x00) { walker += 8; walker += 4; /* Skip reserved */ nand = le32dec(walker); walker += 4; printf(" %-30s: %d\n", "Retirement number", code); printf(" %-28s: %#x\n", "NAND (C/T)BBBPPP", nand); } else { printf("Parameter %#x entry corrupt\n", code); walker += 16; } size -= 0x18; /* Each retirement entry is 0x18 bytes */ } } static void print_hgst_info_erase_errors(void *buf, uint16_t subtype __unused, uint8_t res __unused, uint32_t size) { static struct kv_name kv[] = { { 0x0000, "Corrected Without Delay" }, { 0x0001, "Corrected Maybe Delayed" }, { 0x0002, "Re-Erase" }, { 0x0003, "Errors Corrected" }, { 0x0004, "Correct Algorithm Used" }, { 0x0005, "Bytes Processed" }, { 0x0006, "Uncorrected Errors" }, { 0x8000, "Flash Erase Commands" }, { 0x8001, "Mfg Defect Count" }, { 0x8002, "Grown Defect Count" }, { 0x8003, "Erase Count -- User" }, { 0x8004, "Erase Count -- System" }, }; printf("Erase Errors Subpage:\n"); print_hgst_info_subpage_gen(buf, subtype, size, kv, nitems(kv)); } static void print_hgst_info_erase_counts(void *buf, uint16_t subtype, uint8_t res __unused, uint32_t size) { /* My drive doesn't export this -- so not coding up */ printf("XXX: Erase counts subpage: %p, %#x %d\n", buf, subtype, size); } static void print_hgst_info_temp_history(void *buf, uint16_t subtype __unused, uint8_t res __unused, uint32_t size __unused) { uint8_t *walker = buf; uint32_t min; printf("Temperature History:\n"); printf(" %-30s: %d C\n", "Current Temperature", *walker++); printf(" %-30s: %d C\n", "Reference Temperature", *walker++); printf(" %-30s: %d C\n", "Maximum Temperature", *walker++); printf(" %-30s: %d C\n", "Minimum Temperature", *walker++); min = le32dec(walker); walker += 4; printf(" %-30s: %d:%02d:00\n", "Max Temperature Time", min / 60, min % 60); min = le32dec(walker); walker += 4; printf(" %-30s: %d:%02d:00\n", "Over Temperature Duration", min / 60, min % 60); min = le32dec(walker); walker += 4; printf(" %-30s: %d:%02d:00\n", "Min Temperature Time", min / 60, min % 60); } static void print_hgst_info_ssd_perf(void *buf, uint16_t subtype __unused, uint8_t res, uint32_t size __unused) { uint8_t *walker = buf; uint64_t val; printf("SSD Performance Subpage Type %d:\n", res); val = le64dec(walker); walker += 8; printf(" %-30s: %ju\n", "Host Read Commands", val); val = le64dec(walker); walker += 8; printf(" %-30s: %ju\n", "Host Read Blocks", val); val = le64dec(walker); walker += 8; printf(" %-30s: %ju\n", "Host Cache Read Hits Commands", val); val = le64dec(walker); walker += 8; printf(" %-30s: %ju\n", "Host Cache Read Hits Blocks", val); val = le64dec(walker); walker += 8; printf(" %-30s: %ju\n", "Host Read Commands Stalled", val); val = le64dec(walker); walker += 8; printf(" %-30s: %ju\n", "Host Write Commands", val); val = le64dec(walker); walker += 8; printf(" %-30s: %ju\n", "Host Write Blocks", val); val = le64dec(walker); walker += 8; printf(" %-30s: %ju\n", "Host Write Odd Start Commands", val); val = le64dec(walker); walker += 8; printf(" %-30s: %ju\n", "Host Write Odd End Commands", val); val = le64dec(walker); walker += 8; printf(" %-30s: %ju\n", "Host Write Commands Stalled", val); val = le64dec(walker); walker += 8; printf(" %-30s: %ju\n", "NAND Read Commands", val); val = le64dec(walker); walker += 8; printf(" %-30s: %ju\n", "NAND Read Blocks", val); val = le64dec(walker); walker += 8; printf(" %-30s: %ju\n", "NAND Write Commands", val); val = le64dec(walker); walker += 
8; printf(" %-30s: %ju\n", "NAND Write Blocks", val); val = le64dec(walker); walker += 8; printf(" %-30s: %ju\n", "NAND Read Before Writes", val); } static void print_hgst_info_firmware_load(void *buf, uint16_t subtype __unused, uint8_t res __unused, uint32_t size __unused) { uint8_t *walker = buf; printf("Firmware Load Subpage:\n"); printf(" %-30s: %d\n", "Firmware Downloads", le32dec(walker)); } static void kv_indirect(void *buf, uint32_t subtype, uint8_t res, uint32_t size, struct subpage_print *sp, size_t nsp) { size_t i; for (i = 0; i < nsp; i++, sp++) { if (sp->key == subtype) { sp->fn(buf, subtype, res, size); return; } } printf("No handler for page type %x\n", subtype); } static void print_hgst_info_log(const struct nvme_controller_data *cdata __unused, void *buf, uint32_t size __unused) { uint8_t *walker, *end, *subpage; int pages; uint16_t len; uint8_t subtype, res; printf("HGST Extra Info Log\n"); printf("===================\n"); walker = buf; pages = *walker++; walker++; len = le16dec(walker); walker += 2; end = walker + len; /* Length is exclusive of this header */ while (walker < end) { subpage = walker + 4; subtype = *walker++ & 0x3f; /* subtype */ res = *walker++; /* Reserved */ len = le16dec(walker); walker += len + 2; /* Length, not incl header */ if (walker > end) { printf("Ooops! Off the end of the list\n"); break; } kv_indirect(subpage, subtype, res, len, hgst_subpage, nitems(hgst_subpage)); } } NVME_LOGPAGE(hgst_info, HGST_INFO_LOG, "hgst", "Detailed Health/SMART", print_hgst_info_log, DEFAULT_SIZE); NVME_LOGPAGE(wdc_info, HGST_INFO_LOG, "wdc", "Detailed Health/SMART", print_hgst_info_log, DEFAULT_SIZE); NVME_COMMAND(top, wdc, wdc, WDC_USAGE); Index: head/sbin/nvmecontrol/nc_util.c =================================================================== --- head/sbin/nvmecontrol/nc_util.c (revision 343754) +++ head/sbin/nvmecontrol/nc_util.c (revision 343755) @@ -1,59 +1,58 @@ /*- - * Copyright (c) 2017 Netflix, Inc - * All rights reserved. + * Copyright (c) 2017 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include "nvmecontrol.h" char * uint128_to_str(uint128_t u, char *buf, size_t buflen) { char *end = buf + buflen - 1; *end-- = '\0'; if (u == 0) *end-- = '0'; while (u && end >= buf) { *end-- = u % 10 + '0'; u /= 10; } end++; if (u != 0) return NULL; return end; } /* "Missing" from endian.h */ uint64_t le48dec(const void *pp) { uint8_t const *p = (uint8_t const *)pp; return (((uint64_t)le16dec(p + 4) << 32) | le32dec(p)); } Index: head/sbin/nvmecontrol/ns.c =================================================================== --- head/sbin/nvmecontrol/ns.c (revision 343754) +++ head/sbin/nvmecontrol/ns.c (revision 343755) @@ -1,446 +1,446 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * - * Copyright (c) 2017 Netflix, Inc + * Copyright (c) 2017 Netflix, Inc. * Copyright (C) 2018 Alexander Motin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification, immediately at the beginning of the file. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include "nvmecontrol.h" NVME_CMD_DECLARE(ns, struct nvme_function); #define NS_USAGE \ "ns (create|delete|attach|detach)\n" /* handles NVME_OPC_NAMESPACE_MANAGEMENT and ATTACHMENT admin cmds */ #define NSCREATE_USAGE \ "ns create -s size [-c cap] [-f fmt] [-m mset] [-n nmic] [-p pi] [-l pil] nvmeN\n" #define NSDELETE_USAGE \ "ns delete -n nsid nvmeN\n" #define NSATTACH_USAGE \ "ns attach -n nsid [-c ctrlrid] nvmeN\n" #define NSDETACH_USAGE \ "ns detach -n nsid [-c ctrlrid] nvmeN\n" static void nscreate(const struct nvme_function *nf, int argc, char *argv[]); static void nsdelete(const struct nvme_function *nf, int argc, char *argv[]); static void nsattach(const struct nvme_function *nf, int argc, char *argv[]); static void nsdetach(const struct nvme_function *nf, int argc, char *argv[]); NVME_COMMAND(ns, create, nscreate, NSCREATE_USAGE); NVME_COMMAND(ns, delete, nsdelete, NSDELETE_USAGE); NVME_COMMAND(ns, attach, nsattach, NSATTACH_USAGE); NVME_COMMAND(ns, detach, nsdetach, NSDETACH_USAGE); struct ns_result_str { uint16_t res; const char * str; }; static struct ns_result_str ns_result[] = { { 0x2, "Invalid Field"}, { 0xa, "Invalid Format"}, { 0xb, "Invalid Namespace or format"}, { 0x15, "Namespace insufficient capacity"}, { 0x16, "Namespace ID unavailable"}, { 0x18, "Namespace already attached"}, { 0x19, "Namespace is private"}, { 0x1a, "Namespace is not attached"}, { 0x1b, "Thin provisioning not supported"}, { 0x1c, "Controller list invalid"}, { 0xFFFF, "Unknown"} }; static const char * get_res_str(uint16_t res) { struct ns_result_str *t = ns_result; while (t->res != 0xFFFF) { if (t->res == res) return (t->str); t++; } return t->str; } /* * NS MGMT Command specific status values: * 0xa = Invalid Format * 0x15 = Namespace insufficient capacity * 0x16 = Namespace ID unavailable (number of namespaces exceeded) * 0x1b = Thin provisioning not supported */ static void nscreate(const struct nvme_function *nf, int argc, char *argv[]) { struct nvme_pt_command pt; struct nvme_controller_data cd; struct nvme_namespace_data nsdata; int64_t nsze = -1, cap = -1; int ch, fd, result, lbaf = 0, mset = 0, nmic = -1, pi = 0, pil = 0; if (optind >= argc) usage(nf); while ((ch = getopt(argc, argv, "s:c:f:m:n:p:l:")) != -1) { switch (ch) { case 's': nsze = strtol(optarg, (char **)NULL, 0); break; case 'c': cap = strtol(optarg, (char **)NULL, 0); break; case 'f': lbaf = strtol(optarg, (char **)NULL, 0); break; case 'm': mset = strtol(optarg, NULL, 0); break; case 'n': nmic = strtol(optarg, NULL, 0); break; case 'p': pi = strtol(optarg, NULL, 0); break; case 'l': pil = strtol(optarg, NULL, 0); break; default: usage(nf); } } if (optind >= argc) usage(nf); if (cap == -1) cap = nsze; if (nsze == -1 || cap == -1) usage(nf); open_dev(argv[optind], &fd, 1, 1); read_controller_data(fd, &cd); /* Check that controller can execute this command. */ if (((cd.oacs >> NVME_CTRLR_DATA_OACS_NSMGMT_SHIFT) & NVME_CTRLR_DATA_OACS_NSMGMT_MASK) == 0) errx(1, "controller does not support namespace management"); /* Allow namespaces sharing if Multi-Path I/O is supported. */ if (nmic == -1) { nmic = cd.mic ? 
(NVME_NS_DATA_NMIC_MAY_BE_SHARED_MASK << NVME_NS_DATA_NMIC_MAY_BE_SHARED_SHIFT) : 0; } memset(&nsdata, 0, sizeof(nsdata)); nsdata.nsze = (uint64_t)nsze; nsdata.ncap = (uint64_t)cap; nsdata.flbas = ((lbaf & NVME_NS_DATA_FLBAS_FORMAT_MASK) << NVME_NS_DATA_FLBAS_FORMAT_SHIFT) | ((mset & NVME_NS_DATA_FLBAS_EXTENDED_MASK) << NVME_NS_DATA_FLBAS_EXTENDED_SHIFT); nsdata.dps = ((pi & NVME_NS_DATA_DPS_MD_START_MASK) << NVME_NS_DATA_DPS_MD_START_SHIFT) | ((pil & NVME_NS_DATA_DPS_PIT_MASK) << NVME_NS_DATA_DPS_PIT_SHIFT); nsdata.nmic = nmic; nvme_namespace_data_swapbytes(&nsdata); memset(&pt, 0, sizeof(pt)); pt.cmd.opc = NVME_OPC_NAMESPACE_MANAGEMENT; pt.cmd.cdw10 = 0; /* create */ pt.buf = &nsdata; pt.len = sizeof(struct nvme_namespace_data); pt.is_read = 0; /* passthrough writes data to ctrlr */ if ((result = ioctl(fd, NVME_PASSTHROUGH_CMD, &pt)) < 0) errx(1, "ioctl request to %s failed: %d", argv[optind], result); if (nvme_completion_is_error(&pt.cpl)) { errx(1, "namespace creation failed: %s", get_res_str((pt.cpl.status >> NVME_STATUS_SC_SHIFT) & NVME_STATUS_SC_MASK)); } printf("namespace %d created\n", pt.cpl.cdw0); exit(0); } static void nsdelete(const struct nvme_function *nf, int argc, char *argv[]) { struct nvme_pt_command pt; struct nvme_controller_data cd; int ch, fd, result, nsid = -2; char buf[2]; if (optind >= argc) usage(nf); while ((ch = getopt(argc, argv, "n:")) != -1) { switch ((char)ch) { case 'n': nsid = strtol(optarg, (char **)NULL, 0); break; default: usage(nf); } } if (optind >= argc || nsid == -2) usage(nf); open_dev(argv[optind], &fd, 1, 1); read_controller_data(fd, &cd); /* Check that controller can execute this command. */ if (((cd.oacs >> NVME_CTRLR_DATA_OACS_NSMGMT_SHIFT) & NVME_CTRLR_DATA_OACS_NSMGMT_MASK) == 0) errx(1, "controller does not support namespace management"); memset(&pt, 0, sizeof(pt)); pt.cmd.opc = NVME_OPC_NAMESPACE_MANAGEMENT; pt.cmd.cdw10 = 1; /* delete */ pt.buf = buf; pt.len = sizeof(buf); pt.is_read = 1; pt.cmd.nsid = (uint32_t)nsid; if ((result = ioctl(fd, NVME_PASSTHROUGH_CMD, &pt)) < 0) errx(1, "ioctl request to %s failed: %d", argv[optind], result); if (nvme_completion_is_error(&pt.cpl)) { errx(1, "namespace deletion failed: %s", get_res_str((pt.cpl.status >> NVME_STATUS_SC_SHIFT) & NVME_STATUS_SC_MASK)); } printf("namespace %d deleted\n", nsid); exit(0); } /* * Attach and Detach use Dword 10, and a controller list (section 4.9) * This struct is 4096 bytes in size. * 0h = attach * 1h = detach * * Result values for both attach/detach: * * Completion 18h = Already attached * 19h = NS is private and already attached to a controller * 1Ah = Not attached, request could not be completed * 1Ch = Controller list invalid. * * 0x2 (Invalid Field) can occur if ctrlrid does not exist in the system. */ static void nsattach(const struct nvme_function *nf, int argc, char *argv[]) { struct nvme_pt_command pt; struct nvme_controller_data cd; int ctrlrid = -2; int fd, ch, result, nsid = -1; uint16_t clist[2048]; if (optind >= argc) usage(nf); while ((ch = getopt(argc, argv, "n:c:")) != -1) { switch (ch) { case 'n': nsid = strtol(optarg, (char **)NULL, 0); break; case 'c': ctrlrid = strtol(optarg, (char **)NULL, 0); break; default: usage(nf); } } if (optind >= argc) usage(nf); if (nsid == -1) usage(nf); open_dev(argv[optind], &fd, 1, 1); read_controller_data(fd, &cd); /* Check that controller can execute this command. 
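 *
 * A note on the controller list used below (the 4096-byte structure
 * described above): clist is 2048 little-endian 16-bit words, where
 * entry 0 holds the number of valid entries and entries 1..n hold
 * the controller IDs. When no -c argument is given, an Identify
 * command with CNS 0x13 instead asks the drive to fill in the full
 * controller list.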
*/ if (((cd.oacs >> NVME_CTRLR_DATA_OACS_NSMGMT_SHIFT) & NVME_CTRLR_DATA_OACS_NSMGMT_MASK) == 0) errx(1, "controller does not support namespace management"); if (ctrlrid == -1) { /* Get full list of controllers to attach to. */ memset(&pt, 0, sizeof(pt)); pt.cmd.opc = NVME_OPC_IDENTIFY; pt.cmd.cdw10 = htole32(0x13); pt.buf = clist; pt.len = sizeof(clist); pt.is_read = 1; if (ioctl(fd, NVME_PASSTHROUGH_CMD, &pt) < 0) err(1, "identify request failed"); if (nvme_completion_is_error(&pt.cpl)) errx(1, "identify request returned error"); } else { /* By default attach to this controller. */ if (ctrlrid == -2) ctrlrid = cd.ctrlr_id; memset(&clist, 0, sizeof(clist)); clist[0] = htole16(1); clist[1] = htole16(ctrlrid); } memset(&pt, 0, sizeof(pt)); pt.cmd.opc = NVME_OPC_NAMESPACE_ATTACHMENT; pt.cmd.cdw10 = 0; /* attach */ pt.cmd.nsid = (uint32_t)nsid; pt.buf = &clist; pt.len = sizeof(clist); if ((result = ioctl(fd, NVME_PASSTHROUGH_CMD, &pt)) < 0) errx(1, "ioctl request to %s failed: %d", argv[optind], result); if (nvme_completion_is_error(&pt.cpl)) { errx(1, "namespace attach failed: %s", get_res_str((pt.cpl.status >> NVME_STATUS_SC_SHIFT) & NVME_STATUS_SC_MASK)); } printf("namespace %d attached\n", nsid); exit(0); } static void nsdetach(const struct nvme_function *nf, int argc, char *argv[]) { struct nvme_pt_command pt; struct nvme_controller_data cd; int ctrlrid = -2; int fd, ch, result, nsid = -1; uint16_t clist[2048]; if (optind >= argc) usage(nf); while ((ch = getopt(argc, argv, "n:c:")) != -1) { switch (ch) { case 'n': nsid = strtol(optarg, (char **)NULL, 0); break; case 'c': ctrlrid = strtol(optarg, (char **)NULL, 0); break; default: usage(nf); } } if (optind >= argc) usage(nf); if (nsid == -1) usage(nf); open_dev(argv[optind], &fd, 1, 1); read_controller_data(fd, &cd); /* Check that controller can execute this command. */ if (((cd.oacs >> NVME_CTRLR_DATA_OACS_NSMGMT_SHIFT) & NVME_CTRLR_DATA_OACS_NSMGMT_MASK) == 0) errx(1, "controller does not support namespace management"); if (ctrlrid == -1) { /* Get the list of controllers this namespace is attached to. */ memset(&pt, 0, sizeof(pt)); pt.cmd.opc = NVME_OPC_IDENTIFY; pt.cmd.nsid = htole32(nsid); pt.cmd.cdw10 = htole32(0x12); pt.buf = clist; pt.len = sizeof(clist); pt.is_read = 1; if (ioctl(fd, NVME_PASSTHROUGH_CMD, &pt) < 0) err(1, "identify request failed"); if (nvme_completion_is_error(&pt.cpl)) errx(1, "identify request returned error"); if (clist[0] == 0) { ctrlrid = cd.ctrlr_id; memset(&clist, 0, sizeof(clist)); clist[0] = htole16(1); clist[1] = htole16(ctrlrid); } } else { /* By default detach from this controller. 
*/ if (ctrlrid == -2) ctrlrid = cd.ctrlr_id; memset(&clist, 0, sizeof(clist)); clist[0] = htole16(1); clist[1] = htole16(ctrlrid); } memset(&pt, 0, sizeof(pt)); pt.cmd.opc = NVME_OPC_NAMESPACE_ATTACHMENT; pt.cmd.cdw10 = 1; /* detach */ pt.cmd.nsid = (uint32_t)nsid; pt.buf = &clist; pt.len = sizeof(clist); if ((result = ioctl(fd, NVME_PASSTHROUGH_CMD, &pt)) < 0) errx(1, "ioctl request to %s failed: %d", argv[optind], result); if (nvme_completion_is_error(&pt.cpl)) { errx(1, "namespace detach failed: %s", get_res_str((pt.cpl.status >> NVME_STATUS_SC_SHIFT) & NVME_STATUS_SC_MASK)); } printf("namespace %d detached\n", nsid); exit(0); } static void ns(const struct nvme_function *nf __unused, int argc, char *argv[]) { DISPATCH(argc, argv, ns); } NVME_COMMAND(top, ns, ns, NS_USAGE); Index: head/sbin/nvmecontrol/nvmecontrol_ext.h =================================================================== --- head/sbin/nvmecontrol/nvmecontrol_ext.h (revision 343754) +++ head/sbin/nvmecontrol/nvmecontrol_ext.h (revision 343755) @@ -1,30 +1,30 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * - * Copyright (C) 2018 Netflix + * Copyright (C) 2018 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ void nvme_print_controller(struct nvme_controller_data *cdata); Index: head/sbin/nvmecontrol/power.c =================================================================== --- head/sbin/nvmecontrol/power.c (revision 343754) +++ head/sbin/nvmecontrol/power.c (revision 343755) @@ -1,192 +1,191 @@ /*- - * Copyright (c) 2016 Netflix, Inc - * All rights reserved. + * Copyright (c) 2016 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include "nvmecontrol.h" _Static_assert(sizeof(struct nvme_power_state) == 256 / NBBY, "nvme_power_state size wrong"); #define POWER_USAGE \ "power [-l] [-p new-state [-w workload-hint]]\n" static void power_list_one(int i, struct nvme_power_state *nps) { int mpower, apower, ipower; uint8_t mps, nops, aps, apw; mps = (nps->mps_nops >> NVME_PWR_ST_MPS_SHIFT) & NVME_PWR_ST_MPS_MASK; nops = (nps->mps_nops >> NVME_PWR_ST_NOPS_SHIFT) & NVME_PWR_ST_NOPS_MASK; apw = (nps->apw_aps >> NVME_PWR_ST_APW_SHIFT) & NVME_PWR_ST_APW_MASK; aps = (nps->apw_aps >> NVME_PWR_ST_APS_SHIFT) & NVME_PWR_ST_APS_MASK; mpower = nps->mp; if (mps == 0) mpower *= 100; ipower = nps->idlp; if (nps->ips == 1) ipower *= 100; apower = nps->actp; if (aps == 1) apower *= 100; printf("%2d: %2d.%04dW%c %3d.%03dms %3d.%03dms %2d %2d %2d %2d %2d.%04dW %2d.%04dW %d\n", i, mpower / 10000, mpower % 10000, nops ? '*' : ' ', nps->enlat / 1000, nps->enlat % 1000, nps->exlat / 1000, nps->exlat % 1000, nps->rrt, nps->rrl, nps->rwt, nps->rwl, ipower / 10000, ipower % 10000, apower / 10000, apower % 10000, apw); } static void power_list(struct nvme_controller_data *cdata) { int i; printf("\nPower States Supported: %d\n\n", cdata->npss + 1); printf(" # Max pwr Enter Lat Exit Lat RT RL WT WL Idle Pwr Act Pwr Workload\n"); printf("-- -------- --------- --------- -- -- -- -- -------- -------- --\n"); for (i = 0; i <= cdata->npss; i++) power_list_one(i, &cdata->power_state[i]); } static void power_set(int fd, int power_val, int workload, int perm) { struct nvme_pt_command pt; uint32_t p; p = perm ? 
(1u << 31) : 0; memset(&pt, 0, sizeof(pt)); pt.cmd.opc = NVME_OPC_SET_FEATURES; pt.cmd.cdw10 = htole32(NVME_FEAT_POWER_MANAGEMENT | p); pt.cmd.cdw11 = htole32(power_val | (workload << 5)); if (ioctl(fd, NVME_PASSTHROUGH_CMD, &pt) < 0) err(1, "set feature power mgmt request failed"); if (nvme_completion_is_error(&pt.cpl)) errx(1, "set feature power mgmt request returned error"); } static void power_show(int fd) { struct nvme_pt_command pt; memset(&pt, 0, sizeof(pt)); pt.cmd.opc = NVME_OPC_GET_FEATURES; pt.cmd.cdw10 = htole32(NVME_FEAT_POWER_MANAGEMENT); if (ioctl(fd, NVME_PASSTHROUGH_CMD, &pt) < 0) err(1, "get feature power mgmt request failed"); if (nvme_completion_is_error(&pt.cpl)) errx(1, "get feature power mgmt request returned error"); printf("Current Power Mode is %d\n", pt.cpl.cdw0); } static void power(const struct nvme_function *nf, int argc, char *argv[]) { struct nvme_controller_data cdata; int ch, listflag = 0, powerflag = 0, power_val = 0, fd; int workload = 0; char *end; while ((ch = getopt(argc, argv, "lp:w:")) != -1) { switch ((char)ch) { case 'l': listflag = 1; break; case 'p': powerflag = 1; power_val = strtol(optarg, &end, 0); if (*end != '\0') { fprintf(stderr, "Invalid power state number: %s\n", optarg); usage(nf); } break; case 'w': workload = strtol(optarg, &end, 0); if (*end != '\0') { fprintf(stderr, "Invalid workload hint: %s\n", optarg); usage(nf); } break; default: usage(nf); } } /* Check that a controller was specified. */ if (optind >= argc) usage(nf); if (listflag && powerflag) { fprintf(stderr, "Can't set power and list power states\n"); usage(nf); } open_dev(argv[optind], &fd, 1, 1); read_controller_data(fd, &cdata); if (listflag) { power_list(&cdata); goto out; } if (powerflag) { power_set(fd, power_val, workload, 0); goto out; } power_show(fd); out: close(fd); exit(0); } NVME_COMMAND(top, power, power, POWER_USAGE); Index: head/share/man/man4/nda.4 =================================================================== --- head/share/man/man4/nda.4 (revision 343754) +++ head/share/man/man4/nda.4 (revision 343755) @@ -1,87 +1,87 @@ -.\" Copyright (c) 2017 Netflix, Inc -.\" All rights reserved. +.\" +.\" Copyright (c) 2017 Netflix, Inc. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE.
.\" .\" $FreeBSD$ .\" .Dd December 20, 2017 .Dt NDA 4 .Os .Sh NAME .Nm nda .Nd NVMe Direct Access device driver .Sh SYNOPSIS .Cd device nvme .Cd device scbus .Sh DESCRIPTION The .Nm driver provides support for direct access devices, implementing the .Tn NVMe command protocol, that are attached to the system through a host adapter supported by the CAM subsystem. .Sh SYSCTL VARIABLES The following variables are available as both .Xr sysctl 8 variables and .Xr loader 8 tunables: .Bl -tag -width 12 .It Va kern.cam.nda.sort_io_queue .Pp This variable determines whether the software queued entries are sorted in LBA order or not. Sorting is almost always a waste of time. The default is to not sort. .El .Pp The following report per-device settings, and are read-only unless otherwise indicated. Replace .Va N with the device unit number. .Bl -tag -width 12 .It Va kern.cam.nda.N.rotating .Pp This variable reports whether the storage volume is spinning or flash. It's value is hard coded to 0 indicating flash. .It Va kern.cam.nda.N.unmapped_io This variable reports whether the .Nm driver accepts unmapped I/O for this unit. .Sh FILES .Bl -tag -width ".Pa /dev/nda*" -compact .It Pa /dev/nda* NVMe storage device nodes .El .Sh SEE ALSO .Xr nvme 4 , .Xr nvd 4 .Sh HISTORY The .Nm driver first appeared in .Fx 12.0 . .Sh AUTHORS .An Warner Losh Aq Mt imp@FreeBSD.org Index: head/share/man/man9/kern_testfrwk.9 =================================================================== --- head/share/man/man9/kern_testfrwk.9 (revision 343754) +++ head/share/man/man9/kern_testfrwk.9 (revision 343755) @@ -1,194 +1,193 @@ .\" -.\" Copyright (c) 2015 Netflix Inc. -.\" All rights reserved. +.\" Copyright (c) 2015 Netflix, Inc. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE DEVELOPERS ``AS IS'' AND ANY EXPRESS OR .\" IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES .\" OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. .\" IN NO EVENT SHALL THE DEVELOPERS BE LIABLE FOR ANY DIRECT, INDIRECT, .\" INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT .\" NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, .\" DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY .\" THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT .\" (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF .\" THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .\" .\" $FreeBSD$ .\" .Dd November 12, 2015 .Dt KERN_TESTFRWK 9 .Os .Sh NAME .Nm kern_testfrwk .Nd A kernel testing framework .Sh SYNOPSIS kld_load kern_testfrwk .Sh DESCRIPTION .\" This whole section is not written in manual page style and should be ripped .\" out and replaced. -CEM So what is this sys/tests directory in the kernel all about? .Pp Have you ever wanted to test a part of the FreeBSD kernel in some way and you had no real way from user-land to make what you want to occur happen? 
Say an error path or situation where locking occurs in a particular manner that happens only once in a blue moon? .Pp If so, then the kernel test framework is just what you are looking for. It is designed to help you create the situation you want. .Pp There are two components to the system: the test framework and your test. This document will describe both components and use the test submitted with the initial commit of this code to discuss the test .Xr ( callout_test 4 ) . All of the tests become kernel loadable modules. The test you write should have a dependency on the test framework. That way it will be loaded automatically with your test. For example, you can see how to do this in the bottom of callout_test.c in .Pa sys/tests/callout_test/callout_test.c . .Pp The framework itself is in .Pa sys/tests/framework/kern_testfrwk.c . Its job is to manage the tests that are loaded. (More than one can be loaded.) The idea is pretty simple; you load the test framework and then load your test. .Pp When your test loads, you register your tests with the kernel test framework. You do that through a call to .Fn kern_testframework_register . Usually this is done at the module load event as shown below: .Bd -literal -offset indent switch (type) { case MOD_LOAD: err = kern_testframework_register("callout_test", run_callout_test); .Ed .Pp Here the test is "callout_test" and it is registered to run the function .Fn run_callout_test passing it a .Fa struct kern_test *ptr . The .Vt kern_test structure is defined in .Pa kern_testfrwk.h . .Bd -literal -offset indent struct kern_test { char name[TEST_NAME_LEN]; int num_threads; /* Fill in how many threads you want */ int tot_threads_running; /* Private to framework */ uint8_t test_options[TEST_OPTION_SPACE]; }; .Ed .Pp The user sends this structure down via a sysctl to start your test. He or she places the same name you registered ("callout_test" in our example) in the .Va name field. The user can also set the number of threads to run with .Va num_threads . .Pp The framework will start the requested number of kernel threads, all running your test at the same time. The user does not specify anything in .Va tot_threads_running ; it is private to the framework. As the framework calls each of your tests, it will set the .Va tot_threads_running to the index of the thread that your call is made from. For example, if the user sets .Va num_threads to 2, then the function .Fn run_callout_test will be called once with .Va tot_threads_running set to 0, and a second time with .Va tot_threads_running set to 1. .Pp The .Va test_options field is a test-specific set of information that is an opaque blob. It is passed in from user space and has a maximum size of 256 bytes. You can pass arbitrary test input in the space. In the case of callout_test we reshape that to: .Bd -literal -offset indent struct callout_test { int number_of_callouts; int test_number; }; .Ed .Pp So the first lines of .Fn run_callout_test do the following to get at the user-specific data: .\" This is a bad example and violates strict aliasing. It should be replaced. .Bd -literal -offset indent struct callout_test *u; size_t sz; int i; struct callout_run *rn; int index = test->tot_threads_running; u = (struct callout_test *)test->test_options; .Ed .Pp That way it can access: .Va u->test_number (there are two types of tests provided with this test) and .Va u->number_of_callouts (how many simultaneous callouts to run). .Pp Your test can do anything with these bytes.
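.Pp
Putting the user-space side together, the following is a minimal
sketch of a program that starts this test.
The sysctl node name
.Dq kern.testfrwk.runtest
and the header path used here are assumptions for illustration;
check the framework source for the names it actually provides.
.Bd -literal -offset indent
#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/kern_testfrwk.h>	/* assumed path; defines struct kern_test */
#include <err.h>
#include <string.h>

/* Mirrors the test-private layout shown above. */
struct callout_test {
	int number_of_callouts;
	int test_number;
};

int
main(void)
{
	struct kern_test kt;
	struct callout_test ct;

	memset(&kt, 0, sizeof(kt));
	strlcpy(kt.name, "callout_test", sizeof(kt.name));
	kt.num_threads = 1;

	ct.number_of_callouts = 8;
	ct.test_number = 0;
	memcpy(kt.test_options, &ct, sizeof(ct));

	if (sysctlbyname("kern.testfrwk.runtest", NULL, NULL,
	    &kt, sizeof(kt)) == -1)
		err(1, "could not start callout_test");
	return (0);
}
.Ed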
So the callout_test in question wants to create a situation where multiple callouts, .Va number_of_callouts of them, are all run, and then it tries to cancel each callout with the new .Fn callout_async_drain function. Each thread does this by acquiring the lock in question and then starting each of the callouts. It then waits for all of the callouts to go off (the executor spin-waits). This forces the situation where the callouts have expired and are all waiting on the lock that the executor holds. After the callouts are all blocked, the executor calls .Fn callout_async_drain on each callout and releases the lock. .Pp .\" callout_test(4) specific documentation should probably be moved to its own .\" page. After all the callouts are done, a total status is printed showing the results via .Xr printf 9 . The human tester can run .Xr dmesg 8 to see the results. In this case it is expected that if you are running test 0, all the callouts expire on the same CPU, so only one .Fn callout_async_drain call would have been made. The number of zero_returns should match the number of drain calls that were made, i.e., 1. The one_returns should be the remainder of the callouts. If the test number was 1, the callouts were spread across all CPUs. The number of zero_returns will again match the number of drain calls made, which matches the number of CPUs that were put in use. .Pp More than one thread can be used with this test, though in the example case it is probably not necessary. .Pp You should not need to change the framework. Just add tests and register them after loading. .Sh AUTHORS The kernel test framework was written by .An Randall Stewart Aq Mt rrs@FreeBSD.org with help from .An John Mark Gurney Aq Mt jmg@FreeBSD.org . Index: head/stand/efi/libefi/efienv.c =================================================================== --- head/stand/efi/libefi/efienv.c (revision 343754) +++ head/stand/efi/libefi/efienv.c (revision 343755) @@ -1,87 +1,86 @@ /*- * Copyright (c) 2018 Netflix, Inc. - * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include static EFI_GUID FreeBSDBootVarGUID = FREEBSD_BOOT_VAR_GUID; static EFI_GUID GlobalBootVarGUID = EFI_GLOBAL_VARIABLE; EFI_STATUS efi_getenv(EFI_GUID *g, const char *v, void *data, size_t *len) { size_t ul; CHAR16 *uv; UINT32 attr; UINTN dl; EFI_STATUS rv; uv = NULL; if (utf8_to_ucs2(v, &uv, &ul) != 0) return (EFI_OUT_OF_RESOURCES); dl = *len; rv = RS->GetVariable(uv, g, &attr, &dl, data); if (rv == EFI_SUCCESS) *len = dl; free(uv); return (rv); } EFI_STATUS efi_global_getenv(const char *v, void *data, size_t *len) { return (efi_getenv(&GlobalBootVarGUID, v, data, len)); } EFI_STATUS efi_freebsd_getenv(const char *v, void *data, size_t *len) { return (efi_getenv(&FreeBSDBootVarGUID, v, data, len)); } EFI_STATUS efi_setenv_freebsd_wcs(const char *varname, CHAR16 *valstr) { CHAR16 *var = NULL; size_t len; EFI_STATUS rv; if (utf8_to_ucs2(varname, &var, &len) != 0) return (EFI_OUT_OF_RESOURCES); rv = RS->SetVariable(var, &FreeBSDBootVarGUID, EFI_VARIABLE_BOOTSERVICE_ACCESS | EFI_VARIABLE_RUNTIME_ACCESS, (ucs2len(valstr) + 1) * sizeof(efi_char), valstr); free(var); return (rv); } Index: head/stand/efi/libefi/env.c =================================================================== --- head/stand/efi/libefi/env.c (revision 343754) +++ head/stand/efi/libefi/env.c (revision 343755) @@ -1,1003 +1,1003 @@ /* - * Copyright (c) 2015 Netflix, Inc. All Rights Reserved. + * Copyright (c) 2015 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include /* Partition GUIDS */ #include #include #include #include #include #include #include #include #include "bootstrap.h" /* * About ENABLE_UPDATES * * The UEFI variables are identified only by GUID and name; there is no * way to (auto)detect the type of the value, so we need to process the * variables case by case, as we learn about them. * * While showing the variable name and the value is safe, we must not store * random values nor allow removing (random) variables.
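 * For example, BootOrder is an array of UINT16 boot option numbers;
 * blindly storing a text string there could leave the machine unbootable.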
* * Since we do have stub code to set/unset the variables, I do want to keep * it to make the future development a bit easier, but the updates are disabled * by default till: * a) the validation and data translation to values is properly implemented * b) We have established which variables we do allow to be updated. * Therefore the set/unset code is included only for developers aid. */ static struct efi_uuid_mapping { const char *efi_guid_name; EFI_GUID efi_guid; } efi_uuid_mapping[] = { { .efi_guid_name = "global", .efi_guid = EFI_GLOBAL_VARIABLE }, { .efi_guid_name = "freebsd", .efi_guid = FREEBSD_BOOT_VAR_GUID }, /* EFI Systab entry names. */ { .efi_guid_name = "MPS Table", .efi_guid = MPS_TABLE_GUID }, { .efi_guid_name = "ACPI Table", .efi_guid = ACPI_TABLE_GUID }, { .efi_guid_name = "ACPI 2.0 Table", .efi_guid = ACPI_20_TABLE_GUID }, { .efi_guid_name = "SMBIOS Table", .efi_guid = SMBIOS_TABLE_GUID }, { .efi_guid_name = "SMBIOS3 Table", .efi_guid = SMBIOS3_TABLE_GUID }, { .efi_guid_name = "DXE Table", .efi_guid = DXE_SERVICES_TABLE_GUID }, { .efi_guid_name = "HOB List Table", .efi_guid = HOB_LIST_TABLE_GUID }, { .efi_guid_name = EFI_MEMORY_TYPE_INFORMATION_VARIABLE_NAME, .efi_guid = EFI_MEMORY_TYPE_INFORMATION_GUID }, { .efi_guid_name = "Debug Image Info Table", .efi_guid = DEBUG_IMAGE_INFO_TABLE_GUID }, { .efi_guid_name = "FDT Table", .efi_guid = FDT_TABLE_GUID }, /* * Protocol names for debug purposes. * Can be removed along with lsefi command. */ { .efi_guid_name = "device path", .efi_guid = DEVICE_PATH_PROTOCOL }, { .efi_guid_name = "block io", .efi_guid = BLOCK_IO_PROTOCOL }, { .efi_guid_name = "disk io", .efi_guid = DISK_IO_PROTOCOL }, { .efi_guid_name = "disk info", .efi_guid = EFI_DISK_INFO_PROTOCOL_GUID }, { .efi_guid_name = "simple fs", .efi_guid = SIMPLE_FILE_SYSTEM_PROTOCOL }, { .efi_guid_name = "load file", .efi_guid = LOAD_FILE_PROTOCOL }, { .efi_guid_name = "device io", .efi_guid = DEVICE_IO_PROTOCOL }, { .efi_guid_name = "unicode collation", .efi_guid = UNICODE_COLLATION_PROTOCOL }, { .efi_guid_name = "unicode collation2", .efi_guid = EFI_UNICODE_COLLATION2_PROTOCOL_GUID }, { .efi_guid_name = "simple network", .efi_guid = EFI_SIMPLE_NETWORK_PROTOCOL }, { .efi_guid_name = "simple text output", .efi_guid = SIMPLE_TEXT_OUTPUT_PROTOCOL }, { .efi_guid_name = "simple text input", .efi_guid = SIMPLE_TEXT_INPUT_PROTOCOL }, { .efi_guid_name = "simple text ex input", .efi_guid = EFI_SIMPLE_TEXT_INPUT_EX_PROTOCOL_GUID }, { .efi_guid_name = "console control", .efi_guid = EFI_CONSOLE_CONTROL_PROTOCOL_GUID }, { .efi_guid_name = "stdin", .efi_guid = EFI_CONSOLE_IN_DEVICE_GUID }, { .efi_guid_name = "stdout", .efi_guid = EFI_CONSOLE_OUT_DEVICE_GUID }, { .efi_guid_name = "stderr", .efi_guid = EFI_STANDARD_ERROR_DEVICE_GUID }, { .efi_guid_name = "GOP", .efi_guid = EFI_GRAPHICS_OUTPUT_PROTOCOL_GUID }, { .efi_guid_name = "UGA draw", .efi_guid = EFI_UGA_DRAW_PROTOCOL_GUID }, { .efi_guid_name = "PXE base code", .efi_guid = EFI_PXE_BASE_CODE_PROTOCOL }, { .efi_guid_name = "PXE base code callback", .efi_guid = EFI_PXE_BASE_CODE_CALLBACK_PROTOCOL }, { .efi_guid_name = "serial io", .efi_guid = SERIAL_IO_PROTOCOL }, { .efi_guid_name = "loaded image", .efi_guid = LOADED_IMAGE_PROTOCOL }, { .efi_guid_name = "loaded image device path", .efi_guid = EFI_LOADED_IMAGE_DEVICE_PATH_PROTOCOL_GUID }, { .efi_guid_name = "ISA io", .efi_guid = EFI_ISA_IO_PROTOCOL_GUID }, { .efi_guid_name = "IDE controller init", .efi_guid = EFI_IDE_CONTROLLER_INIT_PROTOCOL_GUID }, { .efi_guid_name = "ISA ACPI", .efi_guid = 
EFI_ISA_ACPI_PROTOCOL_GUID }, { .efi_guid_name = "PCI", .efi_guid = EFI_PCI_IO_PROTOCOL_GUID }, { .efi_guid_name = "PCI root", .efi_guid = EFI_PCI_ROOT_IO_GUID }, { .efi_guid_name = "PCI enumeration", .efi_guid = EFI_PCI_ENUMERATION_COMPLETE_GUID }, { .efi_guid_name = "Driver diagnostics", .efi_guid = EFI_DRIVER_DIAGNOSTICS_PROTOCOL_GUID }, { .efi_guid_name = "Driver diagnostics2", .efi_guid = EFI_DRIVER_DIAGNOSTICS2_PROTOCOL_GUID }, { .efi_guid_name = "simple pointer", .efi_guid = EFI_SIMPLE_POINTER_PROTOCOL_GUID }, { .efi_guid_name = "absolute pointer", .efi_guid = EFI_ABSOLUTE_POINTER_PROTOCOL_GUID }, { .efi_guid_name = "VLAN config", .efi_guid = EFI_VLAN_CONFIG_PROTOCOL_GUID }, { .efi_guid_name = "ARP service binding", .efi_guid = EFI_ARP_SERVICE_BINDING_PROTOCOL_GUID }, { .efi_guid_name = "ARP", .efi_guid = EFI_ARP_PROTOCOL_GUID }, { .efi_guid_name = "IPv4 service binding", .efi_guid = EFI_IP4_SERVICE_BINDING_PROTOCOL }, { .efi_guid_name = "IPv4", .efi_guid = EFI_IP4_PROTOCOL }, { .efi_guid_name = "IPv4 config", .efi_guid = EFI_IP4_CONFIG_PROTOCOL_GUID }, { .efi_guid_name = "IPv6 service binding", .efi_guid = EFI_IP6_SERVICE_BINDING_PROTOCOL }, { .efi_guid_name = "IPv6", .efi_guid = EFI_IP6_PROTOCOL }, { .efi_guid_name = "IPv6 config", .efi_guid = EFI_IP6_CONFIG_PROTOCOL_GUID }, { .efi_guid_name = "UDPv4", .efi_guid = EFI_UDP4_PROTOCOL }, { .efi_guid_name = "UDPv4 service binding", .efi_guid = EFI_UDP4_SERVICE_BINDING_PROTOCOL }, { .efi_guid_name = "UDPv6", .efi_guid = EFI_UDP6_PROTOCOL }, { .efi_guid_name = "UDPv6 service binding", .efi_guid = EFI_UDP6_SERVICE_BINDING_PROTOCOL }, { .efi_guid_name = "TCPv4", .efi_guid = EFI_TCP4_PROTOCOL }, { .efi_guid_name = "TCPv4 service binding", .efi_guid = EFI_TCP4_SERVICE_BINDING_PROTOCOL }, { .efi_guid_name = "TCPv6", .efi_guid = EFI_TCP6_PROTOCOL }, { .efi_guid_name = "TCPv6 service binding", .efi_guid = EFI_TCP6_SERVICE_BINDING_PROTOCOL }, { .efi_guid_name = "EFI System partition", .efi_guid = EFI_PART_TYPE_EFI_SYSTEM_PART_GUID }, { .efi_guid_name = "MBR legacy", .efi_guid = EFI_PART_TYPE_LEGACY_MBR_GUID }, { .efi_guid_name = "device tree", .efi_guid = EFI_DEVICE_TREE_GUID }, { .efi_guid_name = "USB io", .efi_guid = EFI_USB_IO_PROTOCOL_GUID }, { .efi_guid_name = "USB2 HC", .efi_guid = EFI_USB2_HC_PROTOCOL_GUID }, { .efi_guid_name = "component name", .efi_guid = EFI_COMPONENT_NAME_PROTOCOL_GUID }, { .efi_guid_name = "component name2", .efi_guid = EFI_COMPONENT_NAME2_PROTOCOL_GUID }, { .efi_guid_name = "driver binding", .efi_guid = EFI_DRIVER_BINDING_PROTOCOL_GUID }, { .efi_guid_name = "driver configuration", .efi_guid = EFI_DRIVER_CONFIGURATION_PROTOCOL_GUID }, { .efi_guid_name = "driver configuration2", .efi_guid = EFI_DRIVER_CONFIGURATION2_PROTOCOL_GUID }, { .efi_guid_name = "decompress", .efi_guid = EFI_DECOMPRESS_PROTOCOL_GUID }, { .efi_guid_name = "ebc interpreter", .efi_guid = EFI_EBC_INTERPRETER_PROTOCOL_GUID }, { .efi_guid_name = "network interface identifier", .efi_guid = EFI_NETWORK_INTERFACE_IDENTIFIER_PROTOCOL }, { .efi_guid_name = "network interface identifier_31", .efi_guid = EFI_NETWORK_INTERFACE_IDENTIFIER_PROTOCOL_31 }, { .efi_guid_name = "managed network service binding", .efi_guid = EFI_MANAGED_NETWORK_SERVICE_BINDING_PROTOCOL_GUID }, { .efi_guid_name = "managed network", .efi_guid = EFI_MANAGED_NETWORK_PROTOCOL_GUID }, { .efi_guid_name = "form browser", .efi_guid = EFI_FORM_BROWSER2_PROTOCOL_GUID }, { .efi_guid_name = "HII config routing", .efi_guid = EFI_HII_CONFIG_ROUTING_PROTOCOL_GUID }, { .efi_guid_name = "HII 
database", .efi_guid = EFI_HII_DATABASE_PROTOCOL_GUID }, { .efi_guid_name = "HII string", .efi_guid = EFI_HII_STRING_PROTOCOL_GUID }, { .efi_guid_name = "HII image", .efi_guid = EFI_HII_IMAGE_PROTOCOL_GUID }, { .efi_guid_name = "HII font", .efi_guid = EFI_HII_FONT_PROTOCOL_GUID }, { .efi_guid_name = "HII config", .efi_guid = EFI_HII_CONFIGURATION_ACCESS_PROTOCOL_GUID }, { .efi_guid_name = "MTFTP4 service binding", .efi_guid = EFI_MTFTP4_SERVICE_BINDING_PROTOCOL_GUID }, { .efi_guid_name = "MTFTP4", .efi_guid = EFI_MTFTP4_PROTOCOL_GUID }, { .efi_guid_name = "MTFTP6 service binding", .efi_guid = EFI_MTFTP6_SERVICE_BINDING_PROTOCOL_GUID }, { .efi_guid_name = "MTFTP6", .efi_guid = EFI_MTFTP6_PROTOCOL_GUID }, { .efi_guid_name = "DHCP4 service binding", .efi_guid = EFI_DHCP4_SERVICE_BINDING_PROTOCOL_GUID }, { .efi_guid_name = "DHCP4", .efi_guid = EFI_DHCP4_PROTOCOL_GUID }, { .efi_guid_name = "DHCP6 service binding", .efi_guid = EFI_DHCP6_SERVICE_BINDING_PROTOCOL_GUID }, { .efi_guid_name = "DHCP6", .efi_guid = EFI_DHCP6_PROTOCOL_GUID }, { .efi_guid_name = "SCSI io", .efi_guid = EFI_SCSI_IO_PROTOCOL_GUID }, { .efi_guid_name = "SCSI pass thru", .efi_guid = EFI_SCSI_PASS_THRU_PROTOCOL_GUID }, { .efi_guid_name = "SCSI pass thru ext", .efi_guid = EFI_EXT_SCSI_PASS_THRU_PROTOCOL_GUID }, { .efi_guid_name = "Capsule arch", .efi_guid = EFI_CAPSULE_ARCH_PROTOCOL_GUID }, { .efi_guid_name = "monotonic counter arch", .efi_guid = EFI_MONOTONIC_COUNTER_ARCH_PROTOCOL_GUID }, { .efi_guid_name = "realtime clock arch", .efi_guid = EFI_REALTIME_CLOCK_ARCH_PROTOCOL_GUID }, { .efi_guid_name = "variable arch", .efi_guid = EFI_VARIABLE_ARCH_PROTOCOL_GUID }, { .efi_guid_name = "variable write arch", .efi_guid = EFI_VARIABLE_WRITE_ARCH_PROTOCOL_GUID }, { .efi_guid_name = "watchdog timer arch", .efi_guid = EFI_WATCHDOG_TIMER_ARCH_PROTOCOL_GUID }, { .efi_guid_name = "ACPI support", .efi_guid = EFI_ACPI_SUPPORT_PROTOCOL_GUID }, { .efi_guid_name = "BDS arch", .efi_guid = EFI_BDS_ARCH_PROTOCOL_GUID }, { .efi_guid_name = "metronome arch", .efi_guid = EFI_METRONOME_ARCH_PROTOCOL_GUID }, { .efi_guid_name = "timer arch", .efi_guid = EFI_TIMER_ARCH_PROTOCOL_GUID }, { .efi_guid_name = "DPC", .efi_guid = EFI_DPC_PROTOCOL_GUID }, { .efi_guid_name = "print2", .efi_guid = EFI_PRINT2_PROTOCOL_GUID }, { .efi_guid_name = "device path to text", .efi_guid = EFI_DEVICE_PATH_TO_TEXT_PROTOCOL_GUID }, { .efi_guid_name = "reset arch", .efi_guid = EFI_RESET_ARCH_PROTOCOL_GUID }, { .efi_guid_name = "CPU arch", .efi_guid = EFI_CPU_ARCH_PROTOCOL_GUID }, { .efi_guid_name = "CPU IO2", .efi_guid = EFI_CPU_IO2_PROTOCOL_GUID }, { .efi_guid_name = "Legacy 8259", .efi_guid = EFI_LEGACY_8259_PROTOCOL_GUID }, { .efi_guid_name = "Security arch", .efi_guid = EFI_SECURITY_ARCH_PROTOCOL_GUID }, { .efi_guid_name = "Security2 arch", .efi_guid = EFI_SECURITY2_ARCH_PROTOCOL_GUID }, { .efi_guid_name = "Runtime arch", .efi_guid = EFI_RUNTIME_ARCH_PROTOCOL_GUID }, { .efi_guid_name = "status code runtime", .efi_guid = EFI_STATUS_CODE_RUNTIME_PROTOCOL_GUID }, { .efi_guid_name = "data hub", .efi_guid = EFI_DATA_HUB_PROTOCOL_GUID }, { .efi_guid_name = "PCD", .efi_guid = PCD_PROTOCOL_GUID }, { .efi_guid_name = "EFI PCD", .efi_guid = EFI_PCD_PROTOCOL_GUID }, { .efi_guid_name = "firmware volume block", .efi_guid = EFI_FIRMWARE_VOLUME_BLOCK_PROTOCOL_GUID }, { .efi_guid_name = "firmware volume2", .efi_guid = EFI_FIRMWARE_VOLUME2_PROTOCOL_GUID }, { .efi_guid_name = "firmware volume dispatch", .efi_guid = EFI_FIRMWARE_VOLUME_DISPATCH_PROTOCOL_GUID }, { .efi_guid_name = "lzma 
compress", .efi_guid = LZMA_COMPRESS_GUID }, { .efi_guid_name = "MP services", .efi_guid = EFI_MP_SERVICES_PROTOCOL_GUID }, { .efi_guid_name = MTC_VARIABLE_NAME, .efi_guid = MTC_VENDOR_GUID }, { .efi_guid_name = "RTC", .efi_guid = { 0x378D7B65, 0x8DA9, 0x4773, { 0xB6, 0xE4, 0xA4, 0x78, 0x26, 0xA8, 0x33, 0xE1} } }, { .efi_guid_name = "Active EDID", .efi_guid = EFI_EDID_ACTIVE_PROTOCOL_GUID }, { .efi_guid_name = "Discovered EDID", .efi_guid = EFI_EDID_DISCOVERED_PROTOCOL_GUID } }; bool efi_guid_to_str(const EFI_GUID *guid, char **sp) { uint32_t status; uuid_to_string((const uuid_t *)guid, sp, &status); return (status == uuid_s_ok ? true : false); } bool efi_str_to_guid(const char *s, EFI_GUID *guid) { uint32_t status; uuid_from_string(s, (uuid_t *)guid, &status); return (status == uuid_s_ok ? true : false); } bool efi_name_to_guid(const char *name, EFI_GUID *guid) { uint32_t i; for (i = 0; i < nitems(efi_uuid_mapping); i++) { if (strcasecmp(name, efi_uuid_mapping[i].efi_guid_name) == 0) { *guid = efi_uuid_mapping[i].efi_guid; return (true); } } return (efi_str_to_guid(name, guid)); } bool efi_guid_to_name(EFI_GUID *guid, char **name) { uint32_t i; int rv; for (i = 0; i < nitems(efi_uuid_mapping); i++) { rv = uuid_equal((uuid_t *)guid, (uuid_t *)&efi_uuid_mapping[i].efi_guid, NULL); if (rv != 0) { *name = strdup(efi_uuid_mapping[i].efi_guid_name); if (*name == NULL) return (false); return (true); } } return (efi_guid_to_str(guid, name)); } void efi_init_environment(void) { char var[128]; snprintf(var, sizeof(var), "%d.%02d", ST->Hdr.Revision >> 16, ST->Hdr.Revision & 0xffff); env_setenv("efi-version", EV_VOLATILE, var, env_noset, env_nounset); } COMMAND_SET(efishow, "efi-show", "print some or all EFI variables", command_efi_show); static int efi_print_other_value(uint8_t *data, UINTN datasz) { UINTN i; bool is_ascii = true; printf(" = "); for (i = 0; i < datasz - 1; i++) { /* * Quick hack to see if this ascii-ish string is printable * range plus tab, cr and lf. */ if ((data[i] < 32 || data[i] > 126) && data[i] != 9 && data[i] != 10 && data[i] != 13) { is_ascii = false; break; } } if (data[datasz - 1] != '\0') is_ascii = false; if (is_ascii == true) { printf("%s", data); if (pager_output("\n")) return (CMD_WARN); } else { if (pager_output("\n")) return (CMD_WARN); /* * Dump hex bytes grouped by 4. */ for (i = 0; i < datasz; i++) { printf("%02x ", data[i]); if ((i + 1) % 4 == 0) printf(" "); if ((i + 1) % 20 == 0) { if (pager_output("\n")) return (CMD_WARN); } } if (pager_output("\n")) return (CMD_WARN); } return (CMD_OK); } /* This appears to be some sort of UEFI shell alias table. 
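 * (That is, the variable under GUID 47c7b227-c42a-11d2-8e57-00a0c969723b,
 * which efi_print_var() below matches by its string form.)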
*/ static int efi_print_shell_str(const CHAR16 *varnamearg __unused, uint8_t *data, UINTN datasz __unused) { printf(" = %S", (CHAR16 *)data); if (pager_output("\n")) return (CMD_WARN); return (CMD_OK); } const char * efi_memory_type(EFI_MEMORY_TYPE type) { const char *types[] = { "Reserved", "LoaderCode", "LoaderData", "BootServicesCode", "BootServicesData", "RuntimeServicesCode", "RuntimeServicesData", "ConventionalMemory", "UnusableMemory", "ACPIReclaimMemory", "ACPIMemoryNVS", "MemoryMappedIO", "MemoryMappedIOPortSpace", "PalCode", "PersistentMemory" }; switch (type) { case EfiReservedMemoryType: case EfiLoaderCode: case EfiLoaderData: case EfiBootServicesCode: case EfiBootServicesData: case EfiRuntimeServicesCode: case EfiRuntimeServicesData: case EfiConventionalMemory: case EfiUnusableMemory: case EfiACPIReclaimMemory: case EfiACPIMemoryNVS: case EfiMemoryMappedIO: case EfiMemoryMappedIOPortSpace: case EfiPalCode: case EfiPersistentMemory: return (types[type]); default: return ("Unknown"); } } /* Print memory type table. */ static int efi_print_mem_type(const CHAR16 *varnamearg __unused, uint8_t *data, UINTN datasz) { int i, n; EFI_MEMORY_TYPE_INFORMATION *ti; ti = (EFI_MEMORY_TYPE_INFORMATION *)data; if (pager_output(" = \n")) return (CMD_WARN); n = datasz / sizeof (EFI_MEMORY_TYPE_INFORMATION); for (i = 0; i < n && ti[i].NumberOfPages != 0; i++) { printf("\t%23s pages: %u", efi_memory_type(ti[i].Type), ti[i].NumberOfPages); if (pager_output("\n")) return (CMD_WARN); } return (CMD_OK); } /* * Print FreeBSD variables. * We have LoaderPath and LoaderDev as CHAR16 strings. */ static int efi_print_freebsd(const CHAR16 *varnamearg, uint8_t *data, UINTN datasz __unused) { int rv = -1; char *var = NULL; if (ucs2_to_utf8(varnamearg, &var) != 0) return (CMD_ERROR); if (strcmp("LoaderPath", var) == 0 || strcmp("LoaderDev", var) == 0) { printf(" = "); printf("%S", (CHAR16 *)data); if (pager_output("\n")) rv = CMD_WARN; else rv = CMD_OK; } free(var); return (rv); } /* Print global variables. */ static int efi_print_global(const CHAR16 *varnamearg, uint8_t *data, UINTN datasz) { int rv = -1; char *var = NULL; if (ucs2_to_utf8(varnamearg, &var) != 0) return (CMD_ERROR); if (strcmp("AuditMode", var) == 0) { printf(" = "); printf("0x%x", *data); /* 8-bit int */ goto done; } if (strcmp("BootOptionSupport", var) == 0) { printf(" = "); printf("0x%x", *((uint32_t *)data)); /* UINT32 */ goto done; } if (strcmp("BootCurrent", var) == 0 || strcmp("BootNext", var) == 0 || strcmp("Timeout", var) == 0) { printf(" = "); printf("%u", *((uint16_t *)data)); /* UINT16 */ goto done; } if (strcmp("BootOrder", var) == 0 || strcmp("DriverOrder", var) == 0) { UINTN i; UINT16 *u16 = (UINT16 *)data; printf(" ="); for (i = 0; i < datasz / sizeof (UINT16); i++) printf(" %u", u16[i]); goto done; } if (strncmp("Boot", var, 4) == 0 || strncmp("Driver", var, 5) == 0 || strncmp("SysPrep", var, 7) == 0 || strncmp("OsRecovery", var, 10) == 0) { UINT16 filepathlistlen; CHAR16 *text; int desclen; EFI_DEVICE_PATH *dp; data += sizeof(UINT32); filepathlistlen = *(uint16_t *)data; data += sizeof (UINT16); text = (CHAR16 *)data; for (desclen = 0; text[desclen] != 0; desclen++) ; if (desclen != 0) { /* Add terminating zero and we have CHAR16. */ desclen = (desclen + 1) * 2; } printf(" = "); printf("%S", text); if (filepathlistlen != 0) { /* Output pathname from new line. 
*/ if (pager_output("\n")) { rv = CMD_WARN; goto done; } dp = malloc(filepathlistlen); if (dp == NULL) goto done; memcpy(dp, data + desclen, filepathlistlen); text = efi_devpath_name(dp); if (text != NULL) { printf("\t%S", text); efi_free_devpath_name(text); } free(dp); } goto done; } if (strcmp("ConIn", var) == 0 || strcmp("ConInDev", var) == 0 || strcmp("ConOut", var) == 0 || strcmp("ConOutDev", var) == 0 || strcmp("ErrOut", var) == 0 || strcmp("ErrOutDev", var) == 0) { CHAR16 *text; printf(" = "); text = efi_devpath_name((EFI_DEVICE_PATH *)data); if (text != NULL) { printf("%S", text); efi_free_devpath_name(text); } goto done; } if (strcmp("PlatformLang", var) == 0 || strcmp("PlatformLangCodes", var) == 0 || strcmp("LangCodes", var) == 0 || strcmp("Lang", var) == 0) { printf(" = "); printf("%s", data); /* ASCII string */ goto done; } /* * Feature bitmap from firmware to OS. * Older UEFI provides UINT32, newer UINT64. */ if (strcmp("OsIndicationsSupported", var) == 0) { printf(" = "); if (datasz == 4) printf("0x%x", *((uint32_t *)data)); else printf("0x%jx", *((uint64_t *)data)); goto done; } /* Fallback for anything else. */ rv = efi_print_other_value(data, datasz); done: if (rv == -1) { if (pager_output("\n")) rv = CMD_WARN; else rv = CMD_OK; } free(var); return (rv); } static void efi_print_var_attr(UINT32 attr) { bool comma = false; if (attr & EFI_VARIABLE_NON_VOLATILE) { printf("NV"); comma = true; } if (attr & EFI_VARIABLE_BOOTSERVICE_ACCESS) { if (comma == true) printf(","); printf("BS"); comma = true; } if (attr & EFI_VARIABLE_RUNTIME_ACCESS) { if (comma == true) printf(","); printf("RS"); comma = true; } if (attr & EFI_VARIABLE_HARDWARE_ERROR_RECORD) { if (comma == true) printf(","); printf("HR"); comma = true; } if (attr & EFI_VARIABLE_TIME_BASED_AUTHENTICATED_WRITE_ACCESS) { if (comma == true) printf(","); printf("AT"); comma = true; } } static int efi_print_var(CHAR16 *varnamearg, EFI_GUID *matchguid, int lflag) { UINTN datasz; EFI_STATUS status; UINT32 attr; char *str; uint8_t *data; int rv = CMD_OK; str = NULL; datasz = 0; status = RS->GetVariable(varnamearg, matchguid, &attr, &datasz, NULL); if (status != EFI_BUFFER_TOO_SMALL) { printf("Can't get the variable: error %#lx\n", EFI_ERROR_CODE(status)); return (CMD_ERROR); } data = malloc(datasz); if (data == NULL) { printf("Out of memory\n"); return (CMD_ERROR); } status = RS->GetVariable(varnamearg, matchguid, &attr, &datasz, data); if (status != EFI_SUCCESS) { printf("Can't get the variable: error %#lx\n", EFI_ERROR_CODE(status)); free(data); return (CMD_ERROR); } if (efi_guid_to_name(matchguid, &str) == false) { rv = CMD_ERROR; goto done; } printf("%s ", str); efi_print_var_attr(attr); printf(" %S", varnamearg); if (lflag == 0) { if (strcmp(str, "global") == 0) rv = efi_print_global(varnamearg, data, datasz); else if (strcmp(str, "freebsd") == 0) rv = efi_print_freebsd(varnamearg, data, datasz); else if (strcmp(str, EFI_MEMORY_TYPE_INFORMATION_VARIABLE_NAME) == 0) rv = efi_print_mem_type(varnamearg, data, datasz); else if (strcmp(str, "47c7b227-c42a-11d2-8e57-00a0c969723b") == 0) rv = efi_print_shell_str(varnamearg, data, datasz); else if (strcmp(str, MTC_VARIABLE_NAME) == 0) { printf(" = "); printf("%u", *((uint32_t *)data)); /* UINT32 */ rv = CMD_OK; if (pager_output("\n")) rv = CMD_WARN; } else rv = efi_print_other_value(data, datasz); } else if (pager_output("\n")) rv = CMD_WARN; done: free(str); free(data); return (rv); } static int command_efi_show(int argc, char *argv[]) { /* * efi-show [-a] * print all the env * 
efi-show -g UUID * print all the env vars tagged with UUID * efi-show -v var * search all the env vars and print the ones matching var * efi-show -g UUID -v var * efi-show UUID var * print all the env vars that match UUID and var */ /* NB: We assume EFI_GUID is the same as uuid_t */ int aflag = 0, gflag = 0, lflag = 0, vflag = 0; int ch, rv; unsigned i; EFI_STATUS status; EFI_GUID varguid = ZERO_GUID; EFI_GUID matchguid = ZERO_GUID; CHAR16 *varname; CHAR16 *newnm; CHAR16 varnamearg[128]; UINTN varalloc; UINTN varsz; optind = 1; optreset = 1; opterr = 1; while ((ch = getopt(argc, argv, "ag:lv:")) != -1) { switch (ch) { case 'a': aflag = 1; break; case 'g': gflag = 1; if (efi_name_to_guid(optarg, &matchguid) == false) { printf("uuid %s could not be parsed\n", optarg); return (CMD_ERROR); } break; case 'l': lflag = 1; break; case 'v': vflag = 1; if (strlen(optarg) >= nitems(varnamearg)) { printf("Variable %s is longer than %zu " "characters\n", optarg, nitems(varnamearg) - 1); return (CMD_ERROR); } cpy8to16(optarg, varnamearg, nitems(varnamearg)); break; default: return (CMD_ERROR); } } if (argc == 1) /* default is -a */ aflag = 1; if (aflag && (gflag || vflag)) { printf("-a isn't compatible with -g or -v\n"); return (CMD_ERROR); } if (aflag && optind < argc) { printf("-a doesn't take any args\n"); return (CMD_ERROR); } argc -= optind; argv += optind; pager_open(); if (vflag && gflag) { rv = efi_print_var(varnamearg, &matchguid, lflag); if (rv == CMD_WARN) rv = CMD_OK; pager_close(); return (rv); } if (argc == 2) { optarg = argv[0]; if (strlen(optarg) >= nitems(varnamearg)) { printf("Variable %s is longer than %zu characters\n", optarg, nitems(varnamearg) - 1); pager_close(); return (CMD_ERROR); } for (i = 0; i < strlen(optarg); i++) varnamearg[i] = optarg[i]; varnamearg[i] = 0; optarg = argv[1]; if (efi_name_to_guid(optarg, &matchguid) == false) { printf("uuid %s could not be parsed\n", optarg); pager_close(); return (CMD_ERROR); } rv = efi_print_var(varnamearg, &matchguid, lflag); if (rv == CMD_WARN) rv = CMD_OK; pager_close(); return (rv); } if (argc > 0) { printf("Too many args: %d\n", argc); pager_close(); return (CMD_ERROR); } /* * Initiate the search -- note the standard takes pains * to specify that the initial call must be a pointer to a NULL * character.
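 * Each subsequent call must pass back the name and GUID returned by the
 * previous call. EFI_BUFFER_TOO_SMALL means the name buffer must grow to
 * the size returned in varsz, and EFI_NOT_FOUND marks the end of the
 * enumeration; the loop below handles both cases.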
*/ varalloc = 1024; varname = malloc(varalloc); if (varname == NULL) { printf("Can't allocate memory to get variables\n"); pager_close(); return (CMD_ERROR); } varname[0] = 0; while (1) { varsz = varalloc; status = RS->GetNextVariableName(&varsz, varname, &varguid); if (status == EFI_BUFFER_TOO_SMALL) { varalloc = varsz; newnm = realloc(varname, varalloc); if (newnm == NULL) { printf("Can't allocate memory to get " "variables\n"); rv = CMD_ERROR; break; } varname = newnm; continue; /* Try again with bigger buffer */ } if (status == EFI_NOT_FOUND) { rv = CMD_OK; break; } if (status != EFI_SUCCESS) { rv = CMD_ERROR; break; } if (aflag) { rv = efi_print_var(varname, &varguid, lflag); if (rv != CMD_OK) { if (rv == CMD_WARN) rv = CMD_OK; break; } continue; } if (vflag) { if (wcscmp(varnamearg, varname) == 0) { rv = efi_print_var(varname, &varguid, lflag); if (rv != CMD_OK) { if (rv == CMD_WARN) rv = CMD_OK; break; } continue; } } if (gflag) { rv = uuid_equal((uuid_t *)&varguid, (uuid_t *)&matchguid, NULL); if (rv != 0) { rv = efi_print_var(varname, &varguid, lflag); if (rv != CMD_OK) { if (rv == CMD_WARN) rv = CMD_OK; break; } continue; } } } free(varname); pager_close(); return (rv); } COMMAND_SET(efiset, "efi-set", "set EFI variables", command_efi_set); static int command_efi_set(int argc, char *argv[]) { char *uuid, *var, *val; CHAR16 wvar[128]; EFI_GUID guid; #if defined(ENABLE_UPDATES) EFI_STATUS err; #endif if (argc != 4) { printf("efi-set uuid var new-value\n"); return (CMD_ERROR); } uuid = argv[1]; var = argv[2]; val = argv[3]; if (efi_name_to_guid(uuid, &guid) == false) { printf("Invalid uuid %s\n", uuid); return (CMD_ERROR); } cpy8to16(var, wvar, nitems(wvar)); #if defined(ENABLE_UPDATES) err = RS->SetVariable(wvar, &guid, EFI_VARIABLE_NON_VOLATILE | EFI_VARIABLE_RUNTIME_ACCESS | EFI_VARIABLE_BOOTSERVICE_ACCESS, strlen(val) + 1, val); if (EFI_ERROR(err)) { printf("Failed to set variable: error %lu\n", EFI_ERROR_CODE(err)); return (CMD_ERROR); } #else printf("would set %s %s = %s\n", uuid, var, val); #endif return (CMD_OK); } COMMAND_SET(efiunset, "efi-unset", "delete / unset EFI variables", command_efi_unset); static int command_efi_unset(int argc, char *argv[]) { char *uuid, *var; CHAR16 wvar[128]; EFI_GUID guid; #if defined(ENABLE_UPDATES) EFI_STATUS err; #endif if (argc != 3) { printf("efi-unset uuid var\n"); return (CMD_ERROR); } uuid = argv[1]; var = argv[2]; if (efi_name_to_guid(uuid, &guid) == false) { printf("Invalid uuid %s\n", uuid); return (CMD_ERROR); } cpy8to16(var, wvar, nitems(wvar)); #if defined(ENABLE_UPDATES) err = RS->SetVariable(wvar, &guid, 0, 0, NULL); if (EFI_ERROR(err)) { printf("Failed to unset variable: error %lu\n", EFI_ERROR_CODE(err)); return (CMD_ERROR); } #else printf("would unset %s %s \n", uuid, var); #endif return (CMD_OK); } Index: head/stand/efi/libefi/wchar.c =================================================================== --- head/stand/efi/libefi/wchar.c (revision 343754) +++ head/stand/efi/libefi/wchar.c (revision 343755) @@ -1,73 +1,73 @@ /*- - * Copyright 2016 Netflix, Inc. All Rights Reserved. + * Copyright 2016 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include /* * CHAR16 related functions moved from loader. * Perhaps we should move those to libstand after all, but they are * needed only by UEFI. */ int wcscmp(CHAR16 *a, CHAR16 *b) { while (*a && *b && *a == *b) { a++; b++; } return (*a - *b); } /* * cpy8to16 copies a traditional C string into a CHAR16 string and * 0 terminates it. len is the number of CHAR16 characters that fit * in *dst, including the terminator; callers pass the element count * of their buffer (nitems). */ void cpy8to16(const char *src, CHAR16 *dst, size_t len) { if (len == 0) return; while (len > 1 && *src) { /* leave room for the terminating 0 */ *dst++ = *src++; len--; } *dst = (CHAR16)0; } /* * cpy16to8 copies a CHAR16 string into a traditional C string. * NB: dst is 0 terminated only when src fits in fewer than len * characters. */ void cpy16to8(const CHAR16 *src, char *dst, size_t len) { size_t i; for (i = 0; i < len && src[i]; i++) dst[i] = (char)src[i]; if (i < len) dst[i] = '\0'; } Index: head/stand/efi/loader/main.c =================================================================== --- head/stand/efi/loader/main.c (revision 343754) +++ head/stand/efi/loader/main.c (revision 343755) @@ -1,1414 +1,1415 @@ /*- * Copyright (c) 2008-2010 Rui Paulo * Copyright (c) 2006 Marcel Moolenaar - * Copyright (c) 2018 Netflix, Inc * All rights reserved. * + * Copyright (c) 2018 Netflix, Inc. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef EFI_ZFS_BOOT #include #include "efizfs.h" #endif #include "loader_efi.h" struct arch_switch archsw; /* MI/MD interface boundary */ EFI_GUID acpi = ACPI_TABLE_GUID; EFI_GUID acpi20 = ACPI_20_TABLE_GUID; EFI_GUID devid = DEVICE_PATH_PROTOCOL; EFI_GUID imgid = LOADED_IMAGE_PROTOCOL; EFI_GUID mps = MPS_TABLE_GUID; EFI_GUID netid = EFI_SIMPLE_NETWORK_PROTOCOL; EFI_GUID smbios = SMBIOS_TABLE_GUID; EFI_GUID smbios3 = SMBIOS3_TABLE_GUID; EFI_GUID dxe = DXE_SERVICES_TABLE_GUID; EFI_GUID hoblist = HOB_LIST_TABLE_GUID; EFI_GUID lzmadecomp = LZMA_DECOMPRESSION_GUID; EFI_GUID mpcore = ARM_MP_CORE_INFO_TABLE_GUID; EFI_GUID esrt = ESRT_TABLE_GUID; EFI_GUID memtype = MEMORY_TYPE_INFORMATION_TABLE_GUID; EFI_GUID debugimg = DEBUG_IMAGE_INFO_TABLE_GUID; EFI_GUID fdtdtb = FDT_TABLE_GUID; EFI_GUID inputid = SIMPLE_TEXT_INPUT_PROTOCOL; /* * Number of seconds to wait for a keystroke before exiting with failure * in the event no currdev is found. -2 means always break, -1 means * never break, 0 means poll once and then reboot, > 0 means wait for * that many seconds. "fail_timeout" can be set in the environment as * well. */ static int fail_timeout = 5; /* * Current boot variable */ UINT16 boot_current; static bool has_keyboard(void) { EFI_STATUS status; EFI_DEVICE_PATH *path; EFI_HANDLE *hin, *hin_end, *walker; UINTN sz; bool retval = false; /* * Find all the handles that support the SIMPLE_TEXT_INPUT_PROTOCOL and * do the typical dance to get the right-sized buffer. */ sz = 0; hin = NULL; status = BS->LocateHandle(ByProtocol, &inputid, 0, &sz, 0); if (status == EFI_BUFFER_TOO_SMALL) { hin = (EFI_HANDLE *)malloc(sz); status = BS->LocateHandle(ByProtocol, &inputid, 0, &sz, hin); if (EFI_ERROR(status)) free(hin); } if (EFI_ERROR(status)) return retval; /* * Look at each of the handles. If it supports the device path protocol, * use it to get the device path for this handle. Then see if that * device path matches either the USB device path for keyboards or the * legacy device path for keyboards. */ hin_end = &hin[sz / sizeof(*hin)]; for (walker = hin; walker < hin_end; walker++) { status = BS->HandleProtocol(*walker, &devid, (VOID **)&path); if (EFI_ERROR(status)) continue; while (!IsDevicePathEnd(path)) { /* * Check for the ACPI keyboard node. All PNP3xx nodes * are keyboards of different flavors. Note: It is * unclear whether there's always a keyboard node when * there's a keyboard controller, or if there's only one * when a keyboard is detected at boot. */ if (DevicePathType(path) == ACPI_DEVICE_PATH && (DevicePathSubType(path) == ACPI_DP || DevicePathSubType(path) == ACPI_EXTENDED_DP)) { ACPI_HID_DEVICE_PATH *acpi; acpi = (ACPI_HID_DEVICE_PATH *)(void *)path; if ((EISA_ID_TO_NUM(acpi->HID) & 0xff00) == 0x300 && (acpi->HID & 0xffff) == PNP_EISA_ID_CONST) { retval = true; goto out; } /* * Check for USB keyboard node, if present. Unlike a * PS/2 keyboard, these definitely only appear when * connected to the system.
*/ } else if (DevicePathType(path) == MESSAGING_DEVICE_PATH && DevicePathSubType(path) == MSG_USB_CLASS_DP) { USB_CLASS_DEVICE_PATH *usb; usb = (USB_CLASS_DEVICE_PATH *)(void *)path; if (usb->DeviceClass == 3 && /* HID */ usb->DeviceSubClass == 1 && /* Boot devices */ usb->DeviceProtocol == 1) { /* Boot keyboards */ retval = true; goto out; } } path = NextDevicePathNode(path); } } out: free(hin); return retval; } static void set_currdev(const char *devname) { env_setenv("currdev", EV_VOLATILE, devname, efi_setcurrdev, env_nounset); env_setenv("loaddev", EV_VOLATILE, devname, env_noset, env_nounset); } static void set_currdev_devdesc(struct devdesc *currdev) { const char *devname; devname = efi_fmtdev(currdev); printf("Setting currdev to %s\n", devname); set_currdev(devname); } static void set_currdev_devsw(struct devsw *dev, int unit) { struct devdesc currdev; currdev.d_dev = dev; currdev.d_unit = unit; set_currdev_devdesc(&currdev); } static void set_currdev_pdinfo(pdinfo_t *dp) { /* * Disks are special: they have partitions. If the parent * pointer is non-null, we're a partition, not a full disk, * and we need to adjust currdev appropriately. */ if (dp->pd_devsw->dv_type == DEVT_DISK) { struct disk_devdesc currdev; currdev.dd.d_dev = dp->pd_devsw; if (dp->pd_parent == NULL) { currdev.dd.d_unit = dp->pd_unit; currdev.d_slice = -1; currdev.d_partition = -1; } else { currdev.dd.d_unit = dp->pd_parent->pd_unit; currdev.d_slice = dp->pd_unit; currdev.d_partition = 255; /* Assumes GPT */ } set_currdev_devdesc((struct devdesc *)&currdev); } else { set_currdev_devsw(dp->pd_devsw, dp->pd_unit); } } static bool sanity_check_currdev(void) { struct stat st; return (stat("/boot/defaults/loader.conf", &st) == 0 || stat("/boot/kernel/kernel", &st) == 0); } #ifdef EFI_ZFS_BOOT static bool probe_zfs_currdev(uint64_t guid) { char *devname; struct zfs_devdesc currdev; currdev.dd.d_dev = &zfs_dev; currdev.dd.d_unit = 0; currdev.pool_guid = guid; currdev.root_guid = 0; set_currdev_devdesc((struct devdesc *)&currdev); devname = efi_fmtdev(&currdev); init_zfs_bootenv(devname); return (sanity_check_currdev()); } #endif static bool try_as_currdev(pdinfo_t *hd, pdinfo_t *pp) { uint64_t guid; #ifdef EFI_ZFS_BOOT /* * If there's a zpool on this device, try it as a ZFS * filesystem, which has a somewhat different setup than all * other types of fs due to imperfect loader integration. * This all stems from ZFS being both a device (zpool) and * a filesystem, plus the boot env feature. */ if (efizfs_get_guid_by_handle(pp->pd_handle, &guid)) return (probe_zfs_currdev(guid)); #endif /* * All other filesystems just need the pdinfo * initialized in the standard way. */ set_currdev_pdinfo(pp); return (sanity_check_currdev()); } /* * Sometimes we get filenames that are all upper case * and/or have backslashes in them. Filter all this out * if it looks like we need to do so. */ static void fix_dosisms(char *p) { while (*p) { if (isupper(*p)) *p = tolower(*p); else if (*p == '\\') *p = '/'; p++; } } #define SIZE(dp, edp) (size_t)((intptr_t)(void *)edp - (intptr_t)(void *)dp) enum { BOOT_INFO_OK = 0, BAD_CHOICE = 1, NOT_SPECIFIC = 2 }; static int match_boot_info(EFI_LOADED_IMAGE *img __unused, char *boot_info, size_t bisz) { uint32_t attr; uint16_t fplen; size_t len; char *walker, *ep; EFI_DEVICE_PATH *dp, *edp, *first_dp, *last_dp; pdinfo_t *pp; CHAR16 *descr; char *kernel = NULL; FILEPATH_DEVICE_PATH *fp; struct stat st; CHAR16 *text; /* * FreeBSD encodes its boot loading path into the boot loader * BootXXXX variable.
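 * A Boot#### load option is laid out (per the UEFI specification) as:
 *	UINT32 Attributes;
 *	UINT16 FilePathListLength;
 *	CHAR16 Description[];		(NUL-terminated)
 *	EFI_DEVICE_PATH FilePathList[];
 * which is exactly the structure the parsing code below unpacks.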
We look for the last one in the path * and use that to load the kernel. However, if we only find * one DEVICE_PATH, then there's nothing specific and we should * fall back. * * In an ideal world, we'd look at the image handle we were * passed, match up with the loader we are and then return the * next one in the path. This would be most flexible and cover * many chain booting scenarios where you need to use this * boot loader to get to the next boot loader. However, that * doesn't work. We rarely have the path to the image booted * (just the device) so we can't count on that. So, we do the * next best thing: we look through the device path(s) passed * in the BootXXXX variable. If there's only one, we return * NOT_SPECIFIC. Otherwise, we look at the last one and try to * load that. If we can, we return BOOT_INFO_OK. Otherwise we * return BAD_CHOICE for the caller to sort out. */ if (bisz < sizeof(attr) + sizeof(fplen) + sizeof(CHAR16)) return NOT_SPECIFIC; walker = boot_info; ep = walker + bisz; memcpy(&attr, walker, sizeof(attr)); walker += sizeof(attr); memcpy(&fplen, walker, sizeof(fplen)); walker += sizeof(fplen); descr = (CHAR16 *)(intptr_t)walker; len = ucs2len(descr); walker += (len + 1) * sizeof(CHAR16); last_dp = first_dp = dp = (EFI_DEVICE_PATH *)walker; edp = (EFI_DEVICE_PATH *)(walker + fplen); if ((char *)edp > ep) return NOT_SPECIFIC; while (dp < edp && SIZE(dp, edp) > sizeof(EFI_DEVICE_PATH)) { text = efi_devpath_name(dp); if (text != NULL) { printf(" BootInfo Path: %S\n", text); efi_free_devpath_name(text); } last_dp = dp; dp = (EFI_DEVICE_PATH *)((char *)dp + efi_devpath_length(dp)); } /* * If there's only one item in the list, then nothing was * specified. Similarly, if the last path doesn't have a media * path in it, it's one of the various VenHw() nodes which are * basically opaque to us. Don't count those as something * specific. */ if (last_dp == first_dp) { printf("Ignoring Boot%04x: Only one DP found\n", boot_current); return NOT_SPECIFIC; } if (efi_devpath_to_media_path(last_dp) == NULL) { printf("Ignoring Boot%04x: No Media Path\n", boot_current); return NOT_SPECIFIC; } /* * OK. At this point we either have a good path or a bad one. * Let's check. */ pp = efiblk_get_pdinfo_by_device_path(last_dp); if (pp == NULL) { printf("Ignoring Boot%04x: Device Path not found\n", boot_current); return BAD_CHOICE; } set_currdev_pdinfo(pp); if (!sanity_check_currdev()) { printf("Ignoring Boot%04x: sanity check failed\n", boot_current); return BAD_CHOICE; } /* * OK. We've found a device that matches; next we need to check the last * component of the path. If it's a file, then we set the default kernel * to that. Otherwise, just use this as the default root. * * Reminder: we're running very early, before we've parsed the defaults * file, so we may need to have a hack override.
*/ dp = efi_devpath_last_node(last_dp); if (DevicePathType(dp) != MEDIA_DEVICE_PATH || DevicePathSubType(dp) != MEDIA_FILEPATH_DP) { printf("Using Boot%04x for root partition\n", boot_current); return (BOOT_INFO_OK); /* use currdir, default kernel */ } fp = (FILEPATH_DEVICE_PATH *)dp; ucs2_to_utf8(fp->PathName, &kernel); if (kernel == NULL) { printf("Not using Boot%04x: can't decode kernel\n", boot_current); return (BAD_CHOICE); } if (*kernel == '\\' || isupper(*kernel)) fix_dosisms(kernel); if (stat(kernel, &st) != 0) { free(kernel); printf("Not using Boot%04x: can't find %s\n", boot_current, kernel); return (BAD_CHOICE); } setenv("kernel", kernel, 1); text = efi_devpath_name(last_dp); if (text) { printf("Using Boot%04x %S + %s\n", boot_current, text, kernel); efi_free_devpath_name(text); } free(kernel); /* NB: free only after the last use, to avoid use after free */ return (BOOT_INFO_OK); } /* * Look at the passed-in boot_info, if any. If we find it, then we need * to see if we can find ourselves in the boot chain. If we can, and * there's another specified thing to boot next, assume that the file * is loaded from / and use that for the root filesystem. If we can't * find the specified thing, we must fail the boot. If we're last on * the list, then we fall back to looking for the first available / * candidate (ZFS, if there's a bootable zpool, otherwise a UFS * partition that has either /boot/defaults/loader.conf on it or * /boot/kernel/kernel, the default kernel) that we can use. * * We always fail if we can't find the right thing. However, as * a concession to buggy UEFI implementations, like u-boot, if * we have determined that the host is violating the UEFI boot * manager protocol, we'll signal the rest of the program that * a drop to the OK boot loader prompt is possible. */ static int find_currdev(EFI_LOADED_IMAGE *img, bool do_bootmgr, bool is_last, char *boot_info, size_t boot_info_sz) { pdinfo_t *dp, *pp; EFI_DEVICE_PATH *devpath, *copy; EFI_HANDLE h; CHAR16 *text; struct devsw *dev; int unit; uint64_t extra; int rv; char *rootdev; /* * First choice: if rootdev is already set, use that, even if * it's wrong. */ rootdev = getenv("rootdev"); if (rootdev != NULL) { printf("Setting currdev to configured rootdev %s\n", rootdev); set_currdev(rootdev); return (0); } /* * Second choice: If we can find our image's boot_info, and there's * a follow-on boot image in that boot_info, use that. In this * case root will be the partition specified in that image and * we'll load the kernel specified by the file path. Should there * not be a filepath, we use the default. This filepath overrides * loader.conf. */ if (do_bootmgr) { rv = match_boot_info(img, boot_info, boot_info_sz); switch (rv) { case BOOT_INFO_OK: /* We found it */ return (0); case BAD_CHOICE: /* specified file not found -> error */ /* XXX do we want to have an escape hatch for last in boot order? */ return (ENOENT); } /* Nothing specified, try normal match */ } #ifdef EFI_ZFS_BOOT /* * Did efi_zfs_probe() detect the boot pool? If so, use the zpool * it found, if it's sane. ZFS is the only thing that looks for * disks and pools to boot. This may change in the future, however, * if we allow specifying which pool to boot from via UEFI variables * rather than the bootenv stuff that FreeBSD uses today. */ if (pool_guid != 0) { printf("Trying ZFS pool\n"); if (probe_zfs_currdev(pool_guid)) return (0); } #endif /* EFI_ZFS_BOOT */ /* * Try to find the block device by its handle based on the * image we're booting. If we can't find a sane partition, * search all the other partitions of the disk.
We do not * search other disks because it's a violation of the UEFI * boot protocol to do so. We fail and let UEFI go on to * the next candidate. */ dp = efiblk_get_pdinfo_by_handle(img->DeviceHandle); if (dp != NULL) { text = efi_devpath_name(dp->pd_devpath); if (text != NULL) { printf("Trying ESP: %S\n", text); efi_free_devpath_name(text); } set_currdev_pdinfo(dp); if (sanity_check_currdev()) return (0); if (dp->pd_parent != NULL) { pdinfo_t *espdp = dp; dp = dp->pd_parent; STAILQ_FOREACH(pp, &dp->pd_part, pd_link) { /* Already tried the ESP */ if (espdp == pp) continue; /* * Roll up the ZFS special case * for those partitions that have * zpools on them. */ text = efi_devpath_name(pp->pd_devpath); if (text != NULL) { printf("Trying: %S\n", text); efi_free_devpath_name(text); } if (try_as_currdev(dp, pp)) return (0); } } } /* * Try the device handle from our loaded image first. If that * fails, use the device path from the loaded image and see if * any of the nodes in that path match one of the enumerated * handles. Currently, this handle list is only for netboot. */ if (efi_handle_lookup(img->DeviceHandle, &dev, &unit, &extra) == 0) { set_currdev_devsw(dev, unit); if (sanity_check_currdev()) return (0); } copy = NULL; devpath = efi_lookup_image_devpath(IH); while (devpath != NULL) { h = efi_devpath_handle(devpath); if (h == NULL) break; free(copy); copy = NULL; if (efi_handle_lookup(h, &dev, &unit, &extra) == 0) { set_currdev_devsw(dev, unit); if (sanity_check_currdev()) return (0); } devpath = efi_lookup_devpath(h); if (devpath != NULL) { copy = efi_devpath_trim(devpath); devpath = copy; } } free(copy); return (ENOENT); } static bool interactive_interrupt(const char *msg) { time_t now, then, last; last = 0; now = then = getsecs(); printf("%s\n", msg); if (fail_timeout == -2) /* Always break to OK */ return (true); if (fail_timeout == -1) /* Never break to OK */ return (false); do { if (last != now) { printf("press any key to interrupt reboot in %d seconds\r", fail_timeout - (int)(now - then)); last = now; } /* XXX no pause or timeout wait for char */ if (ischar()) return (true); now = getsecs(); } while (now - then < fail_timeout); return (false); } static int parse_args(int argc, CHAR16 *argv[]) { int i, howto; char var[128]; /* * Parse the args to set the console settings, etc. * boot1.efi passes these in, if it can read /boot.config or /boot/config, * or iPXE may be set up to pass these in. Or the optional argument in the * boot environment was used to pass these arguments in (in which case * neither /boot.config nor /boot/config is consulted). * * Loop through the args, and for each one that contains an '=' that is * not the first character, add it to the environment. This allows * loader and kernel env vars to be passed on the command line. Convert * args from UCS-2 to ASCII (16 to 8 bit) as they are copied (though this * method is flawed for non-ASCII characters). */ howto = 0; for (i = 1; i < argc; i++) { cpy16to8(argv[i], var, sizeof(var)); howto |= boot_parse_arg(var); } return (howto); } static void setenv_int(const char *key, int val) { char buf[20]; snprintf(buf, sizeof(buf), "%d", val); setenv(key, buf, 1); } /* * Parse ConOut (the list of active consoles) and see if we can find a * serial port and/or a video port. It would be nice to also walk the * ACPI namespace to map the UID for the serial port to an I/O port; the * latter is especially hard.
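 * For example (hypothetical entries, shown in device-path text form),
 * a ConOut describing a serial console plus a video console might read:
 *	Acpi(PNP0501,0x0)/Uart(115200,8,N,1)
 *	PciRoot(0x0)/Pci(0x2,0x0)/AcpiAdr(0x80010100)
 * The walker below keys off the PNP0501 ACPI node (serial port UID), the
 * Uart() node (speed), and the AcpiAdr() node (video).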
*/ static int parse_uefi_con_out(void) { int how, rv; int vid_seen = 0, com_seen = 0, seen = 0; size_t sz; char buf[4096], *ep; EFI_DEVICE_PATH *node; ACPI_HID_DEVICE_PATH *acpi; UART_DEVICE_PATH *uart; bool pci_pending; how = 0; sz = sizeof(buf); rv = efi_global_getenv("ConOut", buf, &sz); if (rv != EFI_SUCCESS) goto out; ep = buf + sz; node = (EFI_DEVICE_PATH *)buf; while ((char *)node < ep) { pci_pending = false; if (DevicePathType(node) == ACPI_DEVICE_PATH && DevicePathSubType(node) == ACPI_DP) { /* Check for Serial node */ acpi = (void *)node; if (EISA_ID_TO_NUM(acpi->HID) == 0x501) { setenv_int("efi_8250_uid", acpi->UID); com_seen = ++seen; } } else if (DevicePathType(node) == MESSAGING_DEVICE_PATH && DevicePathSubType(node) == MSG_UART_DP) { uart = (void *)node; setenv_int("efi_com_speed", uart->BaudRate); } else if (DevicePathType(node) == ACPI_DEVICE_PATH && DevicePathSubType(node) == ACPI_ADR_DP) { /* Check for AcpiAdr() Node for video */ vid_seen = ++seen; } else if (DevicePathType(node) == HARDWARE_DEVICE_PATH && DevicePathSubType(node) == HW_PCI_DP) { /* * Note, vmware fusion has a funky console device * PciRoot(0x0)/Pci(0xf,0x0) * which we can only detect at the end since we also * have to cope with: * PciRoot(0x0)/Pci(0x1f,0x0)/Serial(0x1) * so only match it if it's last. */ pci_pending = true; } node = NextDevicePathNode(node); /* Skip the end node */ } if (pci_pending && vid_seen == 0) vid_seen = ++seen; /* * Truth table for RB_MULTIPLE | RB_SERIAL * Value Result * 0 Use only video console * RB_SERIAL Use only serial console * RB_MULTIPLE Use both video and serial console * (but video is primary so gets rc messages) * both Use both video and serial console * (but serial is primary so gets rc messages) * * Try to honor this as best we can. If only one of serial / video * found, then use that. Otherwise, use the first one we found. * This also implies if we found nothing, default to video. */ how = 0; if (vid_seen && com_seen) { how |= RB_MULTIPLE; if (com_seen < vid_seen) how |= RB_SERIAL; } else if (com_seen) how |= RB_SERIAL; out: return (how); } EFI_STATUS main(int argc, CHAR16 *argv[]) { EFI_GUID *guid; int howto, i, uhowto; UINTN k; bool has_kbd, is_last; char *s; EFI_DEVICE_PATH *imgpath; CHAR16 *text; EFI_STATUS rv; size_t sz, bosz = 0, bisz = 0; UINT16 boot_order[100]; char boot_info[4096]; EFI_LOADED_IMAGE *img; char buf[32]; bool uefi_boot_mgr; archsw.arch_autoload = efi_autoload; archsw.arch_getdev = efi_getdev; archsw.arch_copyin = efi_copyin; archsw.arch_copyout = efi_copyout; archsw.arch_readin = efi_readin; #ifdef EFI_ZFS_BOOT /* Note this needs to be set before ZFS init. */ archsw.arch_zfs_probe = efi_zfs_probe; #endif /* Get our loaded image protocol interface structure. */ BS->HandleProtocol(IH, &imgid, (VOID**)&img); #ifdef EFI_ZFS_BOOT /* Tell ZFS probe code where we booted from */ efizfs_set_preferred(img->DeviceHandle); #endif /* Init the time source */ efi_time_init(); has_kbd = has_keyboard(); /* * XXX Chicken-and-egg problem; we want to have console output * early, but some console attributes may depend on reading from * eg. the boot device, which we can't do yet. We can use * printf() etc. once this is done. */ setenv("console", "efi", 1); cons_probe(); /* * Initialise the block cache. Set the upper limit. */ bcache_init(32768, 512); howto = parse_args(argc, argv); if (!has_kbd && (howto & RB_PROBE)) howto |= RB_SERIAL | RB_MULTIPLE; howto &= ~RB_PROBE; uhowto = parse_uefi_con_out(); /* * We now have two notions of console. 
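 * One comes from the command line and boot.config via parse_args()
 * (howto); the other comes from the firmware's ConOut variable via
 * parse_uefi_con_out() (uhowto).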
howto should be viewed as * overrides. If console is already set, don't set it again. */ #define VIDEO_ONLY 0 #define SERIAL_ONLY RB_SERIAL #define VID_SER_BOTH RB_MULTIPLE #define SER_VID_BOTH (RB_SERIAL | RB_MULTIPLE) #define CON_MASK (RB_SERIAL | RB_MULTIPLE) if (strcmp(getenv("console"), "efi") == 0) { if ((howto & CON_MASK) == 0) { /* No override, uhowto is controlling and efi cons is perfect */ howto = howto | (uhowto & CON_MASK); setenv("console", "efi", 1); } else if ((howto & CON_MASK) == (uhowto & CON_MASK)) { /* override matches what UEFI told us, efi console is perfect */ setenv("console", "efi", 1); } else if ((uhowto & (CON_MASK)) != 0) { /* * We detected a serial console on ConOut. All possible * overrides include serial. We can't really override what efi * gives us, so we use it knowing it's the best choice. */ setenv("console", "efi", 1); } else { /* * We detected some kind of serial in the override, but ConOut * has no serial, so we have to sort out which case it really is. */ switch (howto & CON_MASK) { case SERIAL_ONLY: setenv("console", "comconsole", 1); break; case VID_SER_BOTH: setenv("console", "efi comconsole", 1); break; case SER_VID_BOTH: setenv("console", "comconsole efi", 1); break; /* case VIDEO_ONLY can't happen -- it's the first if above */ } } } /* * howto is set now how we want to export the flags to the kernel, so * set the env based on it. */ boot_howto_to_env(howto); if (efi_copy_init()) { printf("failed to allocate staging area\n"); return (EFI_BUFFER_TOO_SMALL); } if ((s = getenv("fail_timeout")) != NULL) fail_timeout = strtol(s, NULL, 10); /* * Scan the BLOCK IO MEDIA handles then * march through the device switch probing for things. */ i = efipart_inithandles(); if (i != 0 && i != ENOENT) { printf("efipart_inithandles failed with ERRNO %d, expect " "failures\n", i); } for (i = 0; devsw[i] != NULL; i++) if (devsw[i]->dv_init != NULL) (devsw[i]->dv_init)(); printf("%s\n", bootprog_info); printf(" Command line arguments:"); for (i = 0; i < argc; i++) printf(" %S", argv[i]); printf("\n"); printf(" EFI version: %d.%02d\n", ST->Hdr.Revision >> 16, ST->Hdr.Revision & 0xffff); printf(" EFI Firmware: %S (rev %d.%02d)\n", ST->FirmwareVendor, ST->FirmwareRevision >> 16, ST->FirmwareRevision & 0xffff); printf(" Console: %s (%#x)\n", getenv("console"), howto); /* Determine the devpath of our image so we can prefer it. */ text = efi_devpath_name(img->FilePath); if (text != NULL) { printf(" Load Path: %S\n", text); efi_setenv_freebsd_wcs("LoaderPath", text); efi_free_devpath_name(text); } rv = BS->HandleProtocol(img->DeviceHandle, &devid, (void **)&imgpath); if (rv == EFI_SUCCESS) { text = efi_devpath_name(imgpath); if (text != NULL) { printf(" Load Device: %S\n", text); efi_setenv_freebsd_wcs("LoaderDev", text); efi_free_devpath_name(text); } } uefi_boot_mgr = true; boot_current = 0; sz = sizeof(boot_current); rv = efi_global_getenv("BootCurrent", &boot_current, &sz); if (rv == EFI_SUCCESS) printf(" BootCurrent: %04x\n", boot_current); else { boot_current = 0xffff; uefi_boot_mgr = false; } sz = sizeof(boot_order); rv = efi_global_getenv("BootOrder", &boot_order, &sz); if (rv == EFI_SUCCESS) { printf(" BootOrder:"); for (i = 0; i < sz / sizeof(boot_order[0]); i++) printf(" %04x%s", boot_order[i], boot_order[i] == boot_current ? "[*]" : ""); printf("\n"); is_last = boot_order[(sz / sizeof(boot_order[0])) - 1] == boot_current; bosz = sz; } else if (uefi_boot_mgr) { /* * u-boot doesn't set BootOrder, but otherwise participates in the * boot manager protocol. 
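 * (We get here when BootCurrent was readable but BootOrder was not.)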
So we fake it here and don't consider it * a failure. */ bosz = sizeof(boot_order[0]); boot_order[0] = boot_current; is_last = true; } else is_last = false; /* * Next, find the boot info structure the UEFI boot manager is * supposed to set up. We need this so we can walk through it to * find where we are in the booting process and what to try to * boot next. */ if (uefi_boot_mgr) { snprintf(buf, sizeof(buf), "Boot%04X", boot_current); sz = sizeof(boot_info); rv = efi_global_getenv(buf, &boot_info, &sz); if (rv == EFI_SUCCESS) bisz = sz; else uefi_boot_mgr = false; } /* * Disable the watchdog timer. By default the boot manager sets * the timer to 5 minutes before invoking a boot option. If we * want to return to the boot manager, we have to disable the * watchdog timer and since we're an interactive program, we don't * want to wait until the user types "quit". The timer may have * fired by then. We don't care if this fails. It does not prevent * normal functioning in any way... */ BS->SetWatchdogTimer(0, 0, 0, NULL); /* * Try to find a good currdev based on the image that was booted. * It might be desirable here to have a short pause to allow falling * through to the boot loader instead of returning instantly to follow * the boot protocol and also allow an escape hatch for users wishing * to try something different. */ if (find_currdev(img, uefi_boot_mgr, is_last, boot_info, bisz) != 0) if (!interactive_interrupt("Failed to find bootable partition")) return (EFI_NOT_FOUND); efi_init_environment(); #if !defined(__arm__) for (k = 0; k < ST->NumberOfTableEntries; k++) { guid = &ST->ConfigurationTable[k].VendorGuid; if (!memcmp(guid, &smbios, sizeof(EFI_GUID))) { char buf[40]; snprintf(buf, sizeof(buf), "%p", ST->ConfigurationTable[k].VendorTable); setenv("hint.smbios.0.mem", buf, 1); smbios_detect(ST->ConfigurationTable[k].VendorTable); break; } } #endif interact(); /* doesn't return */ return (EFI_SUCCESS); /* keep compiler happy */ } COMMAND_SET(poweroff, "poweroff", "power off the system", command_poweroff); static int command_poweroff(int argc __unused, char *argv[] __unused) { int i; for (i = 0; devsw[i] != NULL; ++i) if (devsw[i]->dv_cleanup != NULL) (devsw[i]->dv_cleanup)(); RS->ResetSystem(EfiResetShutdown, EFI_SUCCESS, 0, NULL); /* NOTREACHED */ return (CMD_ERROR); } COMMAND_SET(reboot, "reboot", "reboot the system", command_reboot); static int command_reboot(int argc, char *argv[]) { int i; for (i = 0; devsw[i] != NULL; ++i) if (devsw[i]->dv_cleanup != NULL) (devsw[i]->dv_cleanup)(); RS->ResetSystem(EfiResetCold, EFI_SUCCESS, 0, NULL); /* NOTREACHED */ return (CMD_ERROR); } COMMAND_SET(quit, "quit", "exit the loader", command_quit); static int command_quit(int argc, char *argv[]) { exit(0); return (CMD_OK); } COMMAND_SET(memmap, "memmap", "print memory map", command_memmap); static int command_memmap(int argc __unused, char *argv[] __unused) { UINTN sz; EFI_MEMORY_DESCRIPTOR *map, *p; UINTN key, dsz; UINT32 dver; EFI_STATUS status; int i, ndesc; char line[80]; sz = 0; status = BS->GetMemoryMap(&sz, 0, &key, &dsz, &dver); if (status != EFI_BUFFER_TOO_SMALL) { printf("Can't determine memory map size\n"); return (CMD_ERROR); } map = malloc(sz); status = BS->GetMemoryMap(&sz, map, &key, &dsz, &dver); if (EFI_ERROR(status)) { printf("Can't read memory map\n"); return (CMD_ERROR); } ndesc = sz / dsz; snprintf(line, sizeof(line), "%23s %12s %12s %8s %4s\n", "Type", "Physical", "Virtual", "#Pages", "Attr"); pager_open(); if (pager_output(line)) { pager_close(); return (CMD_OK); } for (i = 0, p = map; i < ndesc;
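/* * Sketch of the stride math (mine; the real macro may differ in spelling): * NextMemoryDescriptor() must advance by the firmware-reported dsz, roughly * (EFI_MEMORY_DESCRIPTOR *)((UINT8 *)p + dsz). Stepping by * sizeof(EFI_MEMORY_DESCRIPTOR) instead would misparse maps from firmware * that pads its descriptors. */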
i++, p = NextMemoryDescriptor(p, dsz)) { snprintf(line, sizeof(line), "%23s %012jx %012jx %08jx ", efi_memory_type(p->Type), (uintmax_t)p->PhysicalStart, (uintmax_t)p->VirtualStart, (uintmax_t)p->NumberOfPages); if (pager_output(line)) break; if (p->Attribute & EFI_MEMORY_UC) printf("UC "); if (p->Attribute & EFI_MEMORY_WC) printf("WC "); if (p->Attribute & EFI_MEMORY_WT) printf("WT "); if (p->Attribute & EFI_MEMORY_WB) printf("WB "); if (p->Attribute & EFI_MEMORY_UCE) printf("UCE "); if (p->Attribute & EFI_MEMORY_WP) printf("WP "); if (p->Attribute & EFI_MEMORY_RP) printf("RP "); if (p->Attribute & EFI_MEMORY_XP) printf("XP "); if (p->Attribute & EFI_MEMORY_NV) printf("NV "); if (p->Attribute & EFI_MEMORY_MORE_RELIABLE) printf("MR "); if (p->Attribute & EFI_MEMORY_RO) printf("RO "); if (pager_output("\n")) break; } pager_close(); return (CMD_OK); } COMMAND_SET(configuration, "configuration", "print configuration tables", command_configuration); static int command_configuration(int argc, char *argv[]) { UINTN i; char *name; printf("NumberOfTableEntries=%lu\n", (unsigned long)ST->NumberOfTableEntries); for (i = 0; i < ST->NumberOfTableEntries; i++) { EFI_GUID *guid; printf(" "); guid = &ST->ConfigurationTable[i].VendorGuid; if (efi_guid_to_name(guid, &name) == true) { printf("%s", name); free(name); } else { printf("Error while translating UUID to name"); } printf(" at %p\n", ST->ConfigurationTable[i].VendorTable); } return (CMD_OK); } COMMAND_SET(mode, "mode", "change or display EFI text modes", command_mode); static int command_mode(int argc, char *argv[]) { UINTN cols, rows; unsigned int mode; int i; char *cp; char rowenv[8]; EFI_STATUS status; SIMPLE_TEXT_OUTPUT_INTERFACE *conout; extern void HO(void); conout = ST->ConOut; if (argc > 1) { mode = strtol(argv[1], &cp, 0); if (cp[0] != '\0') { printf("Invalid mode\n"); return (CMD_ERROR); } status = conout->QueryMode(conout, mode, &cols, &rows); if (EFI_ERROR(status)) { printf("invalid mode %d\n", mode); return (CMD_ERROR); } status = conout->SetMode(conout, mode); if (EFI_ERROR(status)) { printf("couldn't set mode %d\n", mode); return (CMD_ERROR); } sprintf(rowenv, "%u", (unsigned)rows); setenv("LINES", rowenv, 1); HO(); /* set cursor */ return (CMD_OK); } printf("Current mode: %d\n", conout->Mode->Mode); for (i = 0; i <= conout->Mode->MaxMode; i++) { status = conout->QueryMode(conout, i, &cols, &rows); if (EFI_ERROR(status)) continue; printf("Mode %d: %u columns, %u rows\n", i, (unsigned)cols, (unsigned)rows); } if (i != 0) printf("Select a mode with the command \"mode <number>\"\n"); return (CMD_OK); } COMMAND_SET(lsefi, "lsefi", "list EFI handles", command_lsefi); static int command_lsefi(int argc __unused, char *argv[] __unused) { char *name; EFI_HANDLE *buffer = NULL; EFI_HANDLE handle; UINTN bufsz = 0, i, j; EFI_STATUS status; int ret; status = BS->LocateHandle(AllHandles, NULL, NULL, &bufsz, buffer); if (status != EFI_BUFFER_TOO_SMALL) { snprintf(command_errbuf, sizeof (command_errbuf), "unexpected error: %lld", (long long)status); return (CMD_ERROR); } if ((buffer = malloc(bufsz)) == NULL) { sprintf(command_errbuf, "out of memory"); return (CMD_ERROR); } status = BS->LocateHandle(AllHandles, NULL, NULL, &bufsz, buffer); if (EFI_ERROR(status)) { free(buffer); snprintf(command_errbuf, sizeof (command_errbuf), "LocateHandle() error: %lld", (long long)status); return (CMD_ERROR); } pager_open(); for (i = 0; i < (bufsz / sizeof (EFI_HANDLE)); i++) { UINTN nproto = 0; EFI_GUID **protocols = NULL; ret = 0; handle = buffer[i]; printf("Handle %p", handle); if
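/* * The sizing idiom used above, sketched: the first LocateHandle() call is * made with bufsz == 0 and fails with EFI_BUFFER_TOO_SMALL after writing the * required byte count into bufsz; only then do we malloc(bufsz) and repeat * the call to fetch the real handle array. */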
(pager_output("\n")) break; /* device path */ status = BS->ProtocolsPerHandle(handle, &protocols, &nproto); if (EFI_ERROR(status)) { snprintf(command_errbuf, sizeof (command_errbuf), "ProtocolsPerHandle() error: %lld", (long long)status); continue; } for (j = 0; j < nproto; j++) { if (efi_guid_to_name(protocols[j], &name) == true) { printf(" %s", name); free(name); } else { printf("Error while translating UUID to name"); } if ((ret = pager_output("\n")) != 0) break; } BS->FreePool(protocols); if (ret != 0) break; } pager_close(); free(buffer); return (CMD_OK); } #ifdef LOADER_FDT_SUPPORT extern int command_fdt_internal(int argc, char *argv[]); /* * Since proper fdt command handling function is defined in fdt_loader_cmd.c, * and declaring it as extern is in contradiction with COMMAND_SET() macro * (which uses static pointer), we're defining wrapper function, which * calls the proper fdt handling routine. */ static int command_fdt(int argc, char *argv[]) { return (command_fdt_internal(argc, argv)); } COMMAND_SET(fdt, "fdt", "flattened device tree handling", command_fdt); #endif /* * Chain load another efi loader. */ static int command_chain(int argc, char *argv[]) { EFI_GUID LoadedImageGUID = LOADED_IMAGE_PROTOCOL; EFI_HANDLE loaderhandle; EFI_LOADED_IMAGE *loaded_image; EFI_STATUS status; struct stat st; struct devdesc *dev; char *name, *path; void *buf; int fd; if (argc < 2) { command_errmsg = "wrong number of arguments"; return (CMD_ERROR); } name = argv[1]; if ((fd = open(name, O_RDONLY)) < 0) { command_errmsg = "no such file"; return (CMD_ERROR); } if (fstat(fd, &st) < 0) { command_errmsg = "stat failed"; close(fd); return (CMD_ERROR); } status = BS->AllocatePool(EfiLoaderCode, (UINTN)st.st_size, &buf); if (status != EFI_SUCCESS) { command_errmsg = "failed to allocate buffer"; close(fd); return (CMD_ERROR); } if (read(fd, buf, st.st_size) != st.st_size) { command_errmsg = "error while reading the file"; (void)BS->FreePool(buf); close(fd); return (CMD_ERROR); } close(fd); status = BS->LoadImage(FALSE, IH, NULL, buf, st.st_size, &loaderhandle); (void)BS->FreePool(buf); if (status != EFI_SUCCESS) { command_errmsg = "LoadImage failed"; return (CMD_ERROR); } status = BS->HandleProtocol(loaderhandle, &LoadedImageGUID, (void **)&loaded_image); if (argc > 2) { int i, len = 0; CHAR16 *argp; for (i = 2; i < argc; i++) len += strlen(argv[i]) + 1; len *= sizeof (*argp); loaded_image->LoadOptions = argp = malloc (len); loaded_image->LoadOptionsSize = len; for (i = 2; i < argc; i++) { char *ptr = argv[i]; while (*ptr) *(argp++) = *(ptr++); *(argp++) = ' '; } *(--argp) = 0; } if (efi_getdev((void **)&dev, name, (const char **)&path) == 0) { #ifdef EFI_ZFS_BOOT struct zfs_devdesc *z_dev; #endif struct disk_devdesc *d_dev; pdinfo_t *hd, *pd; switch (dev->d_dev->dv_type) { #ifdef EFI_ZFS_BOOT case DEVT_ZFS: z_dev = (struct zfs_devdesc *)dev; loaded_image->DeviceHandle = efizfs_get_handle_by_guid(z_dev->pool_guid); break; #endif case DEVT_NET: loaded_image->DeviceHandle = efi_find_handle(dev->d_dev, dev->d_unit); break; default: hd = efiblk_get_pdinfo(dev); if (STAILQ_EMPTY(&hd->pd_part)) { loaded_image->DeviceHandle = hd->pd_handle; break; } d_dev = (struct disk_devdesc *)dev; STAILQ_FOREACH(pd, &hd->pd_part, pd_link) { /* * d_partition should be 255 */ if (pd->pd_unit == (uint32_t)d_dev->d_slice) { loaded_image->DeviceHandle = pd->pd_handle; break; } } break; } } dev_cleanup(); status = BS->StartImage(loaderhandle, NULL, NULL); if (status != EFI_SUCCESS) { command_errmsg = "StartImage failed";
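/* * Best-effort cleanup on failure (a note on the intent, not new behavior): * drop the UCS-2 LoadOptions built above, then ask the firmware to discard * the still-resident image before we report CMD_ERROR. */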
free(loaded_image->LoadOptions); loaded_image->LoadOptions = NULL; status = BS->UnloadImage(loaded_image); return (CMD_ERROR); } return (CMD_ERROR); /* not reached */ } COMMAND_SET(chain, "chain", "chain load file", command_chain); Index: head/stand/forth/efi.4th =================================================================== --- head/stand/forth/efi.4th (revision 343754) +++ head/stand/forth/efi.4th (revision 343755) @@ -1,39 +1,39 @@ -\ Copyright (c) 2016 Netflix, Inc -\ All rights reserved. +\ +\ Copyright (c) 2016 Netflix, Inc. \ \ Redistribution and use in source and binary forms, with or without \ modification, are permitted provided that the following conditions \ are met: \ 1. Redistributions of source code must retain the above copyright \ notice, this list of conditions and the following disclaimer. \ 2. Redistributions in binary form must reproduce the above copyright \ notice, this list of conditions and the following disclaimer in the \ documentation and/or other materials provided with the distribution. \ \ THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND \ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE \ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE \ ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE \ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL \ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS \ OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) \ HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT \ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY \ OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF \ SUCH DAMAGE. \ \ $FreeBSD$ only forth definitions : efiboot? ( -- flag ) s" efi-version" getenv -1 <> dup if swap drop ( c-addr flag -- flag ) then ; : maybe-efi-resizecons efiboot? if s" efi-autoresizecons" evaluate then ; Index: head/stand/i386/libi386/biospci.c =================================================================== --- head/stand/i386/libi386/biospci.c (revision 343754) +++ head/stand/i386/libi386/biospci.c (revision 343755) @@ -1,405 +1,406 @@ /*- * Copyright (c) 1998 Michael Smith - * Copyright (c) 2016 Netflix, Inc * All rights reserved. + * + * Copyright (c) 2016 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * PnP enumerator using the PCI BIOS. */ #include #include #include #include #include #include "libi386.h" /* * Stupid PCI BIOS interface doesn't let you simply enumerate everything * that's there, instead you have to ask it if it has something. * * So we have to scan by class code, subclass code and sometimes programming * interface. */ struct pci_progif { int pi_code; const char *pi_name; }; static struct pci_progif progif_null[] = { {0x0, NULL}, {-1, NULL} }; static struct pci_progif progif_display[] = { {0x0, "VGA"}, {0x1, "8514"}, {-1, NULL} }; static struct pci_progif progif_ide[] = { {0x00, NULL}, {0x01, NULL}, {0x02, NULL}, {0x03, NULL}, {0x04, NULL}, {0x05, NULL}, {0x06, NULL}, {0x07, NULL}, {0x08, NULL}, {0x09, NULL}, {0x0a, NULL}, {0x0b, NULL}, {0x0c, NULL}, {0x0d, NULL}, {0x0e, NULL}, {0x0f, NULL}, {0x80, NULL}, {0x81, NULL}, {0x82, NULL}, {0x83, NULL}, {0x84, NULL}, {0x85, NULL}, {0x86, NULL}, {0x87, NULL}, {0x88, NULL}, {0x89, NULL}, {0x8a, NULL}, {0x8b, NULL}, {0x8c, NULL}, {0x8d, NULL}, {0x8e, NULL}, {0x8f, NULL}, {-1, NULL} }; static struct pci_progif progif_serial[] = { {0x0, "8250"}, {0x1, "16450"}, {0x2, "16550"}, {-1, NULL} }; static struct pci_progif progif_parallel[] = { {0x0, "Standard"}, {0x1, "Bidirectional"}, {0x2, "ECP"}, {-1, NULL} }; static struct pci_progif progif_firewire[] = { {0x10, "OHCI"}, {-1, NULL} }; struct pci_subclass { int ps_subclass; const char *ps_name; struct pci_progif *ps_progif; /* if set, use for programming interface value(s) */ }; static struct pci_subclass subclass_old[] = { {0x0, "Old non-VGA", progif_null}, {0x1, "Old VGA", progif_null}, {-1, NULL, NULL} }; static struct pci_subclass subclass_mass[] = { {0x0, "SCSI", progif_null}, {0x1, "IDE", progif_ide}, {0x2, "Floppy disk", progif_null}, {0x3, "IPI", progif_null}, {0x4, "RAID", progif_null}, {0x80, "mass storage", progif_null}, {-1, NULL, NULL} }; static struct pci_subclass subclass_net[] = { {0x0, "Ethernet", progif_null}, {0x1, "Token ring", progif_null}, {0x2, "FDDI", progif_null}, {0x3, "ATM", progif_null}, {0x80, "network", progif_null}, {-1, NULL, NULL} }; static struct pci_subclass subclass_display[] = { {0x0, NULL, progif_display}, {0x1, "XGA", progif_null}, {0x80, "other", progif_null}, {-1, NULL, NULL} }; static struct pci_subclass subclass_comms[] = { {0x0, "serial", progif_serial}, {0x1, "parallel", progif_parallel}, {0x80, "communications", progif_null}, {-1, NULL, NULL} }; static struct pci_subclass subclass_serial[] = { {0x0, "FireWire", progif_firewire}, {0x1, "ACCESS.bus", progif_null}, {0x2, "SSA", progif_null}, {0x3, "USB", progif_null}, {0x4, "Fibrechannel", progif_null}, {-1, NULL, NULL} }; static struct pci_class { int pc_class; const char *pc_name; struct pci_subclass *pc_subclass; } pci_classes[] = { {0x0, "device", subclass_old}, {0x1, "controller", subclass_mass}, {0x2, "controller", subclass_net}, {0x3, "display", subclass_display}, {0x7, "controller", subclass_comms}, {0xc, "controller", subclass_serial}, {-1, 
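/* each table ends in a {-1, ...} sentinel so the nested scan loops in biospci_enumerate() below can walk it without a separate length field */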
NULL, NULL} }; static void biospci_enumerate(void); static void biospci_addinfo(int devid, struct pci_class *pc, struct pci_subclass *psc, struct pci_progif *ppi); struct pnphandler biospcihandler = { "PCI BIOS", biospci_enumerate }; static int biospci_version; #define PCI_BIOS_PRESENT 0xb101 #define FIND_PCI_DEVICE 0xb102 #define FIND_PCI_CLASS_CODE 0xb103 #define GENERATE_SPECIAL_CYCLE 0xb106 #define READ_CONFIG_BYTE 0xb108 #define READ_CONFIG_WORD 0xb109 #define READ_CONFIG_DWORD 0xb10a #define WRITE_CONFIG_BYTE 0xb10b #define WRITE_CONFIG_WORD 0xb10c #define WRITE_CONFIG_DWORD 0xb10d #define GET_IRQ_ROUTING_OPTIONS 0xb10e #define SET_PCI_IRQ 0xb10f #define PCI_INT 0x1a #define PCI_SIGNATURE 0x20494350 /* AKA "PCI " */ void biospci_detect(void) { uint16_t version, hwcap, maxbus; char buf[24]; /* Find the PCI BIOS */ v86.ctl = V86_FLAGS; v86.addr = PCI_INT; v86.eax = PCI_BIOS_PRESENT; v86.edi = 0x0; v86int(); /* Check for OK response */ if (V86_CY(v86.efl) || ((v86.eax & 0xff00) != 0) || (v86.edx != PCI_SIGNATURE)) return; version = v86.ebx & 0xffff; hwcap = v86.eax & 0xff; maxbus = v86.ecx & 0xff; #if 0 printf("PCI BIOS %d.%d%s%s maxbus %d\n", bcd2bin((version >> 8) & 0xf), bcd2bin(version & 0xf), (hwcap & 1) ? " config1" : "", (hwcap & 2) ? " config2" : "", maxbus); #endif sprintf(buf, "%d", bcd2bin((version >> 8) & 0xf)); setenv("pcibios.major", buf, 1); sprintf(buf, "%d", bcd2bin(version & 0xf)); setenv("pcibios.minor", buf, 1); sprintf(buf, "%d", !!(hwcap & 1)); setenv("pcibios.config1", buf, 1); sprintf(buf, "%d", !!(hwcap & 2)); setenv("pcibios.config2", buf, 1); sprintf(buf, "%d", maxbus); setenv("pcibios.maxbus", buf, 1); biospci_version = bcd2bin((version >> 8) & 0xf) * 10 + bcd2bin(version & 0xf); } static void biospci_enumerate(void) { int device_index, err; uint32_t locator, devid; struct pci_class *pc; struct pci_subclass *psc; struct pci_progif *ppi; /* Iterate over known classes */ for (pc = pci_classes; pc->pc_class >= 0; pc++) { /* Iterate over subclasses */ for (psc = pc->pc_subclass; psc->ps_subclass >= 0; psc++) { /* Iterate over programming interfaces */ for (ppi = psc->ps_progif; ppi->pi_code >= 0; ppi++) { /* Scan for matches */ for (device_index = 0; ; device_index++) { /* Look for a match */ err = biospci_find_devclass((pc->pc_class << 16) + (psc->ps_subclass << 8) + ppi->pi_code, device_index, &locator); if (err != 0) break; /* Read the device identifier from the nominated device */ err = biospci_read_config(locator, 0, BIOSPCI_32BITS, &devid); if (err != 0) break; /* We have the device ID, create a PnP object and save everything */ biospci_addinfo(devid, pc, psc, ppi); } } } } } static void biospci_addinfo(int devid, struct pci_class *pc, struct pci_subclass *psc, struct pci_progif *ppi) { struct pnpinfo *pi; char desc[80]; /* build the description */ desc[0] = 0; if (ppi->pi_name != NULL) { strcat(desc, ppi->pi_name); strcat(desc, " "); } if (psc->ps_name != NULL) { strcat(desc, psc->ps_name); strcat(desc, " "); } if (pc->pc_name != NULL) strcat(desc, pc->pc_name); pi = pnp_allocinfo(); pi->pi_desc = strdup(desc); sprintf(desc,"0x%08x", devid); pnp_addident(pi, desc); pnp_addinfo(pi); } int biospci_find_devclass(uint32_t class, int index, uint32_t *locator) { v86.ctl = V86_FLAGS; v86.addr = PCI_INT; v86.eax = FIND_PCI_CLASS_CODE; v86.ecx = class; v86.esi = index; v86int(); /* error */ if (V86_CY(v86.efl) || (v86.eax & 0xff00)) return (-1); *locator = v86.ebx; return (0); } static int biospci_find_device(uint32_t devid, int index, uint32_t *locator) { v86.ctl 
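/* * Worked example of the encodings (mine, for illustration): a 16550 UART on * bus 0, device 0x1f, function 0 has class 0x07, subclass 0x00, progif 0x02, * so FIND_PCI_CLASS_CODE is called with ECX = 0x070002 and returns the * locator (0 << 8) | (0x1f << 3) | 0 = 0xf8 in BX, matching what * biospci_locator() below computes. */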
= V86_FLAGS; v86.addr = PCI_INT; v86.eax = FIND_PCI_DEVICE; v86.edx = devid & 0xffff; /* EDX - Vendor ID */ v86.ecx = (devid >> 16) & 0xffff; /* ECX - Device ID */ v86.esi = index; v86int(); /* error */ if (V86_CY(v86.efl) || (v86.eax & 0xff00)) return (-1); *locator = v86.ebx; return (0); } /* * Configuration space access methods. * width = 0(byte), 1(word) or 2(dword). */ int biospci_write_config(uint32_t locator, int offset, int width, uint32_t val) { v86.ctl = V86_FLAGS; v86.addr = PCI_INT; v86.eax = WRITE_CONFIG_BYTE + width; v86.ebx = locator; v86.edi = offset; v86.ecx = val; v86int(); /* error */ if (V86_CY(v86.efl) || (v86.eax & 0xff00)) return (-1); return(0); } int biospci_read_config(uint32_t locator, int offset, int width, uint32_t *val) { v86.ctl = V86_FLAGS; v86.addr = PCI_INT; v86.eax = READ_CONFIG_BYTE + width; v86.ebx = locator; v86.edi = offset; v86int(); /* error */ if (V86_CY(v86.efl) || (v86.eax & 0xff00)) return (-1); *val = v86.ecx; return (0); } uint32_t biospci_locator(int8_t bus, uint8_t device, uint8_t function) { return ((bus << 8) | ((device & 0x1f) << 3) | (function & 0x7)); } Index: head/stand/libsa/abort.c =================================================================== --- head/stand/libsa/abort.c (revision 343754) +++ head/stand/libsa/abort.c (revision 343755) @@ -1,35 +1,35 @@ /* - * Copyright (c) 2018 Netflix. All Rights Reserved. + * Copyright (c) 2018 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include void abort(void) { panic("Bootloader aborted by abort"); } Index: head/stand/libsa/xlocale_private.h =================================================================== --- head/stand/libsa/xlocale_private.h (revision 343754) +++ head/stand/libsa/xlocale_private.h (revision 343755) @@ -1,36 +1,36 @@ /*- - * Copyright (c) 2018 Netflix + * Copyright (c) 2018 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef STAND_XLOCALE_PRIVATE_H #define STAND_XLOCALE_PRIVATE_H 1 typedef int locale_t; #define FIX_LOCALE(x) #define isspace_l(c, l) isspace(c) #define __get_locale() 0 #endif /* STAND_XLOCALE_PRIVATE_H */ Index: head/sys/cam/nvme/nvme_all.c =================================================================== --- head/sys/cam/nvme/nvme_all.c (revision 343754) +++ head/sys/cam/nvme/nvme_all.c (revision 343755) @@ -1,163 +1,163 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * - * Copyright (c) 2015 Netflix, Inc + * Copyright (c) 2015 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification, immediately at the beginning of the file. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #ifdef _KERNEL #include "opt_scsi.h" #include #include #include #include #include #else #include #include #include #include #ifndef min #define min(a,b) (((a)<(b))?(a):(b)) #endif #endif #include #include #include #include #include #include #include #ifdef _KERNEL #include #include #include #include #endif void nvme_ns_cmd(struct ccb_nvmeio *nvmeio, uint8_t cmd, uint32_t nsid, uint32_t cdw10, uint32_t cdw11, uint32_t cdw12, uint32_t cdw13, uint32_t cdw14, uint32_t cdw15) { bzero(&nvmeio->cmd, sizeof(struct nvme_command)); nvmeio->cmd.opc = cmd; nvmeio->cmd.nsid = htole32(nsid); nvmeio->cmd.cdw10 = htole32(cdw10); nvmeio->cmd.cdw11 = htole32(cdw11); nvmeio->cmd.cdw12 = htole32(cdw12); nvmeio->cmd.cdw13 = htole32(cdw13); nvmeio->cmd.cdw14 = htole32(cdw14); nvmeio->cmd.cdw15 = htole32(cdw15); } int nvme_identify_match(caddr_t identbuffer, caddr_t table_entry) { return 0; } void nvme_print_ident(const struct nvme_controller_data *cdata, const struct nvme_namespace_data *data, struct sbuf *sb) { sbuf_printf(sb, "<"); cam_strvis_sbuf(sb, cdata->mn, sizeof(cdata->mn), 0); sbuf_printf(sb, " "); cam_strvis_sbuf(sb, cdata->fr, sizeof(cdata->fr), 0); sbuf_printf(sb, " "); cam_strvis_sbuf(sb, cdata->sn, sizeof(cdata->sn), 0); sbuf_printf(sb, ">\n"); } /* XXX need to do nvme admin opcodes too, but those aren't used yet by nda */ static const char * nvme_opc2str[] = { "FLUSH", "WRITE", "READ", "RSVD-3", "WRITE_UNCORRECTABLE", "COMPARE", "RSVD-6", "RSVD-7", "DATASET_MANAGEMENT" }; const char * nvme_op_string(const struct nvme_command *cmd) { if (cmd->opc >= nitems(nvme_opc2str)) return "UNKNOWN"; return nvme_opc2str[cmd->opc]; } const char * nvme_cmd_string(const struct nvme_command *cmd, char *cmd_string, size_t len) { /* * cid, rsvd areas and mptr not printed, since they are used * only internally by the SIM. */ snprintf(cmd_string, len, "opc=%x fuse=%x nsid=%x prp1=%llx prp2=%llx cdw=%x %x %x %x %x %x", cmd->opc, cmd->fuse, cmd->nsid, (unsigned long long)cmd->prp1, (unsigned long long)cmd->prp2, cmd->cdw10, cmd->cdw11, cmd->cdw12, cmd->cdw13, cmd->cdw14, cmd->cdw15); return cmd_string; } const void * nvme_get_identify_cntrl(struct cam_periph *periph) { struct cam_ed *device; device = periph->path->device; return device->nvme_cdata; } const void * nvme_get_identify_ns(struct cam_periph *periph) { struct cam_ed *device; device = periph->path->device; return device->nvme_data; } Index: head/sys/cam/nvme/nvme_all.h =================================================================== --- head/sys/cam/nvme/nvme_all.h (revision 343754) +++ head/sys/cam/nvme/nvme_all.h (revision 343755) @@ -1,50 +1,50 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * - * Copyright (c) 2015 Netflix, Inc + * Copyright (c) 2015 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification, immediately at the beginning of the file. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef CAM_NVME_NVME_ALL_H #define CAM_NVME_NVME_ALL_H 1 #include struct ccb_nvmeio; void nvme_ns_cmd(struct ccb_nvmeio *nvmeio, uint8_t cmd, uint32_t nsid, uint32_t cdw10, uint32_t cdw11, uint32_t cdw12, uint32_t cdw13, uint32_t cdw14, uint32_t cdw15); int nvme_identify_match(caddr_t identbuffer, caddr_t table_entry); struct sbuf; void nvme_print_ident(const struct nvme_controller_data *, const struct nvme_namespace_data *, struct sbuf *); const char *nvme_op_string(const struct nvme_command *); const char *nvme_cmd_string(const struct nvme_command *, char *, size_t); const void *nvme_get_identify_cntrl(struct cam_periph *); const void *nvme_get_identify_ns(struct cam_periph *); #endif /* CAM_NVME_NVME_ALL_H */ Index: head/sys/cam/nvme/nvme_da.c =================================================================== --- head/sys/cam/nvme/nvme_da.c (revision 343754) +++ head/sys/cam/nvme/nvme_da.c (revision 343755) @@ -1,1228 +1,1228 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * - * Copyright (c) 2015 Netflix, Inc + * Copyright (c) 2015 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification, immediately at the beginning of the file. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* * Derived from ata_da.c: * Copyright (c) 2009 Alexander Motin */ #include __FBSDID("$FreeBSD$"); #include #ifdef _KERNEL #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #endif /* _KERNEL */ #ifndef _KERNEL #include #include #endif /* _KERNEL */ #include #include #include #include #include #include #include typedef enum { NDA_STATE_NORMAL } nda_state; typedef enum { NDA_FLAG_OPEN = 0x0001, NDA_FLAG_DIRTY = 0x0002, NDA_FLAG_SCTX_INIT = 0x0004, } nda_flags; typedef enum { NDA_Q_4K = 0x01, NDA_Q_NONE = 0x00, } nda_quirks; #define NDA_Q_BIT_STRING \ "\020" \ "\001Bit 0" typedef enum { NDA_CCB_BUFFER_IO = 0x01, NDA_CCB_DUMP = 0x02, NDA_CCB_TRIM = 0x03, NDA_CCB_TYPE_MASK = 0x0F, } nda_ccb_state; /* Offsets into our private area for storing information */ #define ccb_state ccb_h.ppriv_field0 #define ccb_bp ccb_h.ppriv_ptr1 /* For NDA_CCB_BUFFER_IO */ #define ccb_trim ccb_h.ppriv_ptr1 /* For NDA_CCB_TRIM */ struct nda_softc { struct cam_iosched_softc *cam_iosched; int outstanding_cmds; /* Number of active commands */ int refcount; /* Active xpt_action() calls */ nda_state state; nda_flags flags; nda_quirks quirks; int unmappedio; quad_t deletes; uint32_t nsid; /* Namespace ID for this nda device */ struct disk *disk; struct task sysctl_task; struct sysctl_ctx_list sysctl_ctx; struct sysctl_oid *sysctl_tree; uint64_t trim_count; uint64_t trim_ranges; uint64_t trim_lbas; #ifdef CAM_TEST_FAILURE int force_read_error; int force_write_error; int periodic_read_error; int periodic_read_count; #endif #ifdef CAM_IO_STATS struct sysctl_ctx_list sysctl_stats_ctx; struct sysctl_oid *sysctl_stats_tree; u_int timeouts; u_int errors; u_int invalidations; #endif }; struct nda_trim_request { union { struct nvme_dsm_range dsm; uint8_t data[NVME_MAX_DSM_TRIM]; }; TAILQ_HEAD(, bio) bps; }; /* Need quirk table */ static disk_strategy_t ndastrategy; static dumper_t ndadump; static periph_init_t ndainit; static void ndaasync(void *callback_arg, u_int32_t code, struct cam_path *path, void *arg); static void ndasysctlinit(void *context, int pending); static periph_ctor_t ndaregister; static periph_dtor_t ndacleanup; static periph_start_t ndastart; static periph_oninv_t ndaoninvalidate; static void ndadone(struct cam_periph *periph, union ccb *done_ccb); static int ndaerror(union ccb *ccb, u_int32_t cam_flags, u_int32_t sense_flags); static void ndashutdown(void *arg, int howto); static void ndasuspend(void *arg); #ifndef NDA_DEFAULT_SEND_ORDERED #define NDA_DEFAULT_SEND_ORDERED 1 #endif #ifndef NDA_DEFAULT_TIMEOUT #define NDA_DEFAULT_TIMEOUT 30 /* Timeout in seconds */ #endif #ifndef NDA_DEFAULT_RETRY #define NDA_DEFAULT_RETRY 4 #endif #ifndef NDA_MAX_TRIM_ENTRIES #define NDA_MAX_TRIM_ENTRIES (NVME_MAX_DSM_TRIM / sizeof(struct nvme_dsm_range))/* Number of DSM trims to use, max 256 */ #endif static SYSCTL_NODE(_kern_cam, OID_AUTO, nda, CTLFLAG_RD, 0, "CAM Direct Access Disk driver"); //static int nda_retry_count = NDA_DEFAULT_RETRY; static int nda_send_ordered = NDA_DEFAULT_SEND_ORDERED; static int nda_default_timeout = NDA_DEFAULT_TIMEOUT; static int nda_max_trim_entries = NDA_MAX_TRIM_ENTRIES; SYSCTL_INT(_kern_cam_nda, OID_AUTO, max_trim, CTLFLAG_RDTUN, &nda_max_trim_entries, NDA_MAX_TRIM_ENTRIES, "Maximum number of BIO_DELETE to send down as a DSM TRIM."); /* * All NVMe media is non-rotational, so all nvme device instances * share this to implement the sysctl. 
*/ static int nda_rotating_media = 0; static struct periph_driver ndadriver = { ndainit, "nda", TAILQ_HEAD_INITIALIZER(ndadriver.units), /* generation */ 0 }; PERIPHDRIVER_DECLARE(nda, ndadriver); static MALLOC_DEFINE(M_NVMEDA, "nvme_da", "nvme_da buffers"); /* * nice wrappers. Maybe these belong in nvme_all.c instead of * here, but this is the only place that uses these. Should * we ever grow another NVME periph, we should move them * all there wholesale. */ static void nda_nvme_flush(struct nda_softc *softc, struct ccb_nvmeio *nvmeio) { cam_fill_nvmeio(nvmeio, 0, /* retries */ ndadone, /* cbfcnp */ CAM_DIR_NONE, /* flags */ NULL, /* data_ptr */ 0, /* dxfer_len */ nda_default_timeout * 1000); /* timeout 30s */ nvme_ns_flush_cmd(&nvmeio->cmd, softc->nsid); } static void nda_nvme_trim(struct nda_softc *softc, struct ccb_nvmeio *nvmeio, void *payload, uint32_t num_ranges) { cam_fill_nvmeio(nvmeio, 0, /* retries */ ndadone, /* cbfcnp */ CAM_DIR_OUT, /* flags */ payload, /* data_ptr */ num_ranges * sizeof(struct nvme_dsm_range), /* dxfer_len */ nda_default_timeout * 1000); /* timeout 30s */ nvme_ns_trim_cmd(&nvmeio->cmd, softc->nsid, num_ranges); } static void nda_nvme_write(struct nda_softc *softc, struct ccb_nvmeio *nvmeio, void *payload, uint64_t lba, uint32_t len, uint32_t count) { cam_fill_nvmeio(nvmeio, 0, /* retries */ ndadone, /* cbfcnp */ CAM_DIR_OUT, /* flags */ payload, /* data_ptr */ len, /* dxfer_len */ nda_default_timeout * 1000); /* timeout 30s */ nvme_ns_write_cmd(&nvmeio->cmd, softc->nsid, lba, count); } static void nda_nvme_rw_bio(struct nda_softc *softc, struct ccb_nvmeio *nvmeio, struct bio *bp, uint32_t rwcmd) { int flags = rwcmd == NVME_OPC_READ ? CAM_DIR_IN : CAM_DIR_OUT; void *payload; uint64_t lba; uint32_t count; if (bp->bio_flags & BIO_UNMAPPED) { flags |= CAM_DATA_BIO; payload = bp; } else { payload = bp->bio_data; } lba = bp->bio_pblkno; count = bp->bio_bcount / softc->disk->d_sectorsize; cam_fill_nvmeio(nvmeio, 0, /* retries */ ndadone, /* cbfcnp */ flags, /* flags */ payload, /* data_ptr */ bp->bio_bcount, /* dxfer_len */ nda_default_timeout * 1000); /* timeout 30s */ nvme_ns_rw_cmd(&nvmeio->cmd, rwcmd, softc->nsid, lba, count); } static int ndaopen(struct disk *dp) { struct cam_periph *periph; struct nda_softc *softc; int error; periph = (struct cam_periph *)dp->d_drv1; if (cam_periph_acquire(periph) != 0) { return(ENXIO); } cam_periph_lock(periph); if ((error = cam_periph_hold(periph, PRIBIO|PCATCH)) != 0) { cam_periph_unlock(periph); cam_periph_release(periph); return (error); } CAM_DEBUG(periph->path, CAM_DEBUG_TRACE | CAM_DEBUG_PERIPH, ("ndaopen\n")); softc = (struct nda_softc *)periph->softc; softc->flags |= NDA_FLAG_OPEN; cam_periph_unhold(periph); cam_periph_unlock(periph); return (0); } static int ndaclose(struct disk *dp) { struct cam_periph *periph; struct nda_softc *softc; union ccb *ccb; int error; periph = (struct cam_periph *)dp->d_drv1; softc = (struct nda_softc *)periph->softc; cam_periph_lock(periph); CAM_DEBUG(periph->path, CAM_DEBUG_TRACE | CAM_DEBUG_PERIPH, ("ndaclose\n")); if ((softc->flags & NDA_FLAG_DIRTY) != 0 && (periph->flags & CAM_PERIPH_INVALID) == 0 && cam_periph_hold(periph, PRIBIO) == 0) { ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL); nda_nvme_flush(softc, &ccb->nvmeio); error = cam_periph_runccb(ccb, ndaerror, /*cam_flags*/0, /*sense_flags*/0, softc->disk->d_devstat); if (error != 0) xpt_print(periph->path, "Synchronize cache failed\n"); else softc->flags &= ~NDA_FLAG_DIRTY; xpt_release_ccb(ccb); cam_periph_unhold(periph); 
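/* * Context (hedged, from the NVMe spec rather than this file): the flush * issued above is an NVMe FLUSH (opcode 0x00) against softc->nsid, and it is * only issued when we dirtied the namespace while it was open. */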
} softc->flags &= ~NDA_FLAG_OPEN; while (softc->refcount != 0) cam_periph_sleep(periph, &softc->refcount, PRIBIO, "ndaclose", 1); KASSERT(softc->outstanding_cmds == 0, ("nda %d outstanding commands", softc->outstanding_cmds)); cam_periph_unlock(periph); cam_periph_release(periph); return (0); } static void ndaschedule(struct cam_periph *periph) { struct nda_softc *softc = (struct nda_softc *)periph->softc; if (softc->state != NDA_STATE_NORMAL) return; cam_iosched_schedule(softc->cam_iosched, periph); } /* * Actually translate the requested transfer into one the physical driver * can understand. The transfer is described by a buf and will include * only one physical transfer. */ static void ndastrategy(struct bio *bp) { struct cam_periph *periph; struct nda_softc *softc; periph = (struct cam_periph *)bp->bio_disk->d_drv1; softc = (struct nda_softc *)periph->softc; cam_periph_lock(periph); CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("ndastrategy(%p)\n", bp)); /* * If the device has been made invalid, error out */ if ((periph->flags & CAM_PERIPH_INVALID) != 0) { cam_periph_unlock(periph); biofinish(bp, NULL, ENXIO); return; } if (bp->bio_cmd == BIO_DELETE) softc->deletes++; /* * Place it in the queue of disk activities for this disk */ cam_iosched_queue_work(softc->cam_iosched, bp); /* * Schedule ourselves for performing the work. */ ndaschedule(periph); cam_periph_unlock(periph); return; } static int ndadump(void *arg, void *virtual, vm_offset_t physical, off_t offset, size_t length) { struct cam_periph *periph; struct nda_softc *softc; u_int secsize; struct ccb_nvmeio nvmeio; struct disk *dp; uint64_t lba; uint32_t count; int error = 0; dp = arg; periph = dp->d_drv1; softc = (struct nda_softc *)periph->softc; secsize = softc->disk->d_sectorsize; lba = offset / secsize; count = length / secsize; if ((periph->flags & CAM_PERIPH_INVALID) != 0) return (ENXIO); /* xpt_get_ccb returns a zero'd allocation for the ccb, mimic that here */ memset(&nvmeio, 0, sizeof(nvmeio)); if (length > 0) { xpt_setup_ccb(&nvmeio.ccb_h, periph->path, CAM_PRIORITY_NORMAL); nvmeio.ccb_state = NDA_CCB_DUMP; nda_nvme_write(softc, &nvmeio, virtual, lba, length, count); error = cam_periph_runccb((union ccb *)&nvmeio, cam_periph_error, 0, SF_NO_RECOVERY | SF_NO_RETRY, NULL); if (error != 0) printf("Aborting dump due to I/O error %d.\n", error); return (error); } /* Flush */ xpt_setup_ccb(&nvmeio.ccb_h, periph->path, CAM_PRIORITY_NORMAL); nvmeio.ccb_state = NDA_CCB_DUMP; nda_nvme_flush(softc, &nvmeio); error = cam_periph_runccb((union ccb *)&nvmeio, cam_periph_error, 0, SF_NO_RECOVERY | SF_NO_RETRY, NULL); if (error != 0) xpt_print(periph->path, "flush cmd failed\n"); return (error); } static void ndainit(void) { cam_status status; /* * Install a global async callback. This callback will * receive async callbacks like "new device found". */ status = xpt_register_async(AC_FOUND_DEVICE, ndaasync, NULL, NULL); if (status != CAM_REQ_CMP) { printf("nda: Failed to attach master async callback " "due to status 0x%x!\n", status); } else if (nda_send_ordered) { /* Register our event handlers */ if ((EVENTHANDLER_REGISTER(power_suspend, ndasuspend, NULL, EVENTHANDLER_PRI_LAST)) == NULL) printf("ndainit: power event registration failed!\n"); if ((EVENTHANDLER_REGISTER(shutdown_post_sync, ndashutdown, NULL, SHUTDOWN_PRI_DEFAULT)) == NULL) printf("ndainit: shutdown event registration failed!\n"); } } /* * Callback from GEOM, called when it has finished cleaning up its * resources. 
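(It releases the reference that cam_periph_acquire() takes just before disk_create() in ndaregister() below.)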
*/ static void ndadiskgonecb(struct disk *dp) { struct cam_periph *periph; periph = (struct cam_periph *)dp->d_drv1; cam_periph_release(periph); } static void ndaoninvalidate(struct cam_periph *periph) { struct nda_softc *softc; softc = (struct nda_softc *)periph->softc; /* * De-register any async callbacks. */ xpt_register_async(0, ndaasync, periph, periph->path); #ifdef CAM_IO_STATS softc->invalidations++; #endif /* * Return all queued I/O with ENXIO. * XXX Handle any transactions queued to the card * with XPT_ABORT_CCB. */ cam_iosched_flush(softc->cam_iosched, NULL, ENXIO); disk_gone(softc->disk); } static void ndacleanup(struct cam_periph *periph) { struct nda_softc *softc; softc = (struct nda_softc *)periph->softc; cam_periph_unlock(periph); cam_iosched_fini(softc->cam_iosched); /* * If we can't free the sysctl tree, oh well... */ if ((softc->flags & NDA_FLAG_SCTX_INIT) != 0) { #ifdef CAM_IO_STATS if (sysctl_ctx_free(&softc->sysctl_stats_ctx) != 0) xpt_print(periph->path, "can't remove sysctl stats context\n"); #endif if (sysctl_ctx_free(&softc->sysctl_ctx) != 0) xpt_print(periph->path, "can't remove sysctl context\n"); } disk_destroy(softc->disk); free(softc, M_DEVBUF); cam_periph_lock(periph); } static void ndaasync(void *callback_arg, u_int32_t code, struct cam_path *path, void *arg) { struct cam_periph *periph; periph = (struct cam_periph *)callback_arg; switch (code) { case AC_FOUND_DEVICE: { struct ccb_getdev *cgd; cam_status status; cgd = (struct ccb_getdev *)arg; if (cgd == NULL) break; if (cgd->protocol != PROTO_NVME) break; /* * Allocate a peripheral instance for * this device and start the probe * process. */ status = cam_periph_alloc(ndaregister, ndaoninvalidate, ndacleanup, ndastart, "nda", CAM_PERIPH_BIO, path, ndaasync, AC_FOUND_DEVICE, cgd); if (status != CAM_REQ_CMP && status != CAM_REQ_INPROG) printf("ndaasync: Unable to attach to new device " "due to status 0x%x\n", status); break; } case AC_ADVINFO_CHANGED: { uintptr_t buftype; buftype = (uintptr_t)arg; if (buftype == CDAI_TYPE_PHYS_PATH) { struct nda_softc *softc; softc = periph->softc; disk_attr_changed(softc->disk, "GEOM::physpath", M_NOWAIT); } break; } case AC_LOST_DEVICE: default: cam_periph_async(periph, code, path, arg); break; } } static void ndasysctlinit(void *context, int pending) { struct cam_periph *periph; struct nda_softc *softc; char tmpstr[32], tmpstr2[16]; periph = (struct cam_periph *)context; /* periph was held for us when this task was enqueued */ if ((periph->flags & CAM_PERIPH_INVALID) != 0) { cam_periph_release(periph); return; } softc = (struct nda_softc *)periph->softc; snprintf(tmpstr, sizeof(tmpstr), "CAM NDA unit %d", periph->unit_number); snprintf(tmpstr2, sizeof(tmpstr2), "%d", periph->unit_number); sysctl_ctx_init(&softc->sysctl_ctx); softc->flags |= NDA_FLAG_SCTX_INIT; softc->sysctl_tree = SYSCTL_ADD_NODE_WITH_LABEL(&softc->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_kern_cam_nda), OID_AUTO, tmpstr2, CTLFLAG_RD, 0, tmpstr, "device_index"); if (softc->sysctl_tree == NULL) { printf("ndasysctlinit: unable to allocate sysctl tree\n"); cam_periph_release(periph); return; } SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "unmapped_io", CTLFLAG_RD, &softc->unmappedio, 0, "Unmapped I/O leaf"); SYSCTL_ADD_QUAD(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "deletes", CTLFLAG_RD, &softc->deletes, "Number of BIO_DELETE requests"); SYSCTL_ADD_UQUAD(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "trim_count", CTLFLAG_RD, 
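/* surfaces as e.g. kern.cam.nda.0.trim_count; the unit number forms the middle node */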
&softc->trim_count, "Total number of unmap/dsm commands sent"); SYSCTL_ADD_UQUAD(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "trim_ranges", CTLFLAG_RD, &softc->trim_ranges, "Total number of ranges in unmap/dsm commands"); SYSCTL_ADD_UQUAD(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "trim_lbas", CTLFLAG_RD, &softc->trim_lbas, "Total lbas in the unmap/dsm commands sent"); SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "rotating", CTLFLAG_RD, &nda_rotating_media, 1, "Rotating media"); #ifdef CAM_IO_STATS softc->sysctl_stats_tree = SYSCTL_ADD_NODE(&softc->sysctl_stats_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "stats", CTLFLAG_RD, 0, "Statistics"); if (softc->sysctl_stats_tree == NULL) { printf("ndasysctlinit: unable to allocate sysctl tree for stats\n"); cam_periph_release(periph); return; } SYSCTL_ADD_INT(&softc->sysctl_stats_ctx, SYSCTL_CHILDREN(softc->sysctl_stats_tree), OID_AUTO, "timeouts", CTLFLAG_RD, &softc->timeouts, 0, "Device timeouts reported by the SIM"); SYSCTL_ADD_INT(&softc->sysctl_stats_ctx, SYSCTL_CHILDREN(softc->sysctl_stats_tree), OID_AUTO, "errors", CTLFLAG_RD, &softc->errors, 0, "Transport errors reported by the SIM."); SYSCTL_ADD_INT(&softc->sysctl_stats_ctx, SYSCTL_CHILDREN(softc->sysctl_stats_tree), OID_AUTO, "pack_invalidations", CTLFLAG_RD, &softc->invalidations, 0, "Device pack invalidations."); #endif #ifdef CAM_TEST_FAILURE SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "invalidate", CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, periph, 0, cam_periph_invalidate_sysctl, "I", "Write 1 to invalidate the drive immediately"); #endif cam_iosched_sysctl_init(softc->cam_iosched, &softc->sysctl_ctx, softc->sysctl_tree); cam_periph_release(periph); } static int ndagetattr(struct bio *bp) { int ret; struct cam_periph *periph; periph = (struct cam_periph *)bp->bio_disk->d_drv1; cam_periph_lock(periph); ret = xpt_getattr(bp->bio_data, bp->bio_length, bp->bio_attribute, periph->path); cam_periph_unlock(periph); if (ret == 0) bp->bio_completed = bp->bio_length; return ret; } static cam_status ndaregister(struct cam_periph *periph, void *arg) { struct nda_softc *softc; struct disk *disk; struct ccb_pathinq cpi; const struct nvme_namespace_data *nsd; const struct nvme_controller_data *cd; char announce_buf[80]; uint8_t flbas_fmt, lbads, vwc_present; u_int maxio; int quirks; nsd = nvme_get_identify_ns(periph); cd = nvme_get_identify_cntrl(periph); softc = (struct nda_softc *)malloc(sizeof(*softc), M_DEVBUF, M_NOWAIT | M_ZERO); if (softc == NULL) { printf("ndaregister: Unable to probe new device. " "Unable to allocate softc\n"); return(CAM_REQ_CMP_ERR); } if (cam_iosched_init(&softc->cam_iosched, periph) != 0) { printf("ndaregister: Unable to probe new device. 
" "Unable to allocate iosched memory\n"); free(softc, M_DEVBUF); return(CAM_REQ_CMP_ERR); } /* ident_data parsing */ periph->softc = softc; softc->quirks = NDA_Q_NONE; xpt_path_inq(&cpi, periph->path); TASK_INIT(&softc->sysctl_task, 0, ndasysctlinit, periph); /* * The name space ID is the lun, save it for later I/O */ softc->nsid = (uint32_t)xpt_path_lun_id(periph->path); /* * Register this media as a disk */ (void)cam_periph_hold(periph, PRIBIO); cam_periph_unlock(periph); snprintf(announce_buf, sizeof(announce_buf), "kern.cam.nda.%d.quirks", periph->unit_number); quirks = softc->quirks; TUNABLE_INT_FETCH(announce_buf, &quirks); softc->quirks = quirks; cam_iosched_set_sort_queue(softc->cam_iosched, 0); softc->disk = disk = disk_alloc(); strlcpy(softc->disk->d_descr, cd->mn, MIN(sizeof(softc->disk->d_descr), sizeof(cd->mn))); strlcpy(softc->disk->d_ident, cd->sn, MIN(sizeof(softc->disk->d_ident), sizeof(cd->sn))); disk->d_rotation_rate = DISK_RR_NON_ROTATING; disk->d_open = ndaopen; disk->d_close = ndaclose; disk->d_strategy = ndastrategy; disk->d_getattr = ndagetattr; disk->d_dump = ndadump; disk->d_gone = ndadiskgonecb; disk->d_name = "nda"; disk->d_drv1 = periph; disk->d_unit = periph->unit_number; maxio = cpi.maxio; /* Honor max I/O size of SIM */ if (maxio == 0) maxio = DFLTPHYS; /* traditional default */ else if (maxio > MAXPHYS) maxio = MAXPHYS; /* for safety */ disk->d_maxsize = maxio; flbas_fmt = (nsd->flbas >> NVME_NS_DATA_FLBAS_FORMAT_SHIFT) & NVME_NS_DATA_FLBAS_FORMAT_MASK; lbads = (nsd->lbaf[flbas_fmt] >> NVME_NS_DATA_LBAF_LBADS_SHIFT) & NVME_NS_DATA_LBAF_LBADS_MASK; disk->d_sectorsize = 1 << lbads; disk->d_mediasize = (off_t)(disk->d_sectorsize * nsd->nsze); disk->d_delmaxsize = disk->d_mediasize; disk->d_flags = DISKFLAG_DIRECT_COMPLETION; if (nvme_ctrlr_has_dataset_mgmt(cd)) disk->d_flags |= DISKFLAG_CANDELETE; vwc_present = (cd->vwc >> NVME_CTRLR_DATA_VWC_PRESENT_SHIFT) & NVME_CTRLR_DATA_VWC_PRESENT_MASK; if (vwc_present) disk->d_flags |= DISKFLAG_CANFLUSHCACHE; if ((cpi.hba_misc & PIM_UNMAPPED) != 0) { disk->d_flags |= DISKFLAG_UNMAPPED_BIO; softc->unmappedio = 1; } /* * d_ident and d_descr are both far bigger than the length of either * the serial or model number strings. */ nvme_strvis(disk->d_descr, cd->mn, sizeof(disk->d_descr), NVME_MODEL_NUMBER_LENGTH); nvme_strvis(disk->d_ident, cd->sn, sizeof(disk->d_ident), NVME_SERIAL_NUMBER_LENGTH); disk->d_hba_vendor = cpi.hba_vendor; disk->d_hba_device = cpi.hba_device; disk->d_hba_subvendor = cpi.hba_subvendor; disk->d_hba_subdevice = cpi.hba_subdevice; disk->d_stripesize = disk->d_sectorsize; disk->d_stripeoffset = 0; disk->d_devstat = devstat_new_entry(periph->periph_name, periph->unit_number, disk->d_sectorsize, DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT | XPORT_DEVSTAT_TYPE(cpi.transport), DEVSTAT_PRIORITY_DISK); /* * Add alias for older nvd drives to ease transition. */ /* disk_add_alias(disk, "nvd"); Have reports of this causing problems */ /* * Acquire a reference to the periph before we register with GEOM. * We'll release this reference once GEOM calls us back (via * ndadiskgonecb()) telling us that our provider has been freed. 
*/ if (cam_periph_acquire(periph) != 0) { xpt_print(periph->path, "%s: lost periph during " "registration!\n", __func__); cam_periph_lock(periph); return (CAM_REQ_CMP_ERR); } disk_create(softc->disk, DISK_VERSION); cam_periph_lock(periph); cam_periph_unhold(periph); snprintf(announce_buf, sizeof(announce_buf), "%juMB (%ju %u byte sectors)", (uintmax_t)((uintmax_t)disk->d_mediasize / (1024*1024)), (uintmax_t)disk->d_mediasize / disk->d_sectorsize, disk->d_sectorsize); xpt_announce_periph(periph, announce_buf); xpt_announce_quirks(periph, softc->quirks, NDA_Q_BIT_STRING); /* * Create our sysctl variables, now that we know * we have successfully attached. */ if (cam_periph_acquire(periph) == 0) taskqueue_enqueue(taskqueue_thread, &softc->sysctl_task); /* * Register for device going away and info about the drive * changing (though with NVMe, it can't) */ xpt_register_async(AC_LOST_DEVICE | AC_ADVINFO_CHANGED, ndaasync, periph, periph->path); softc->state = NDA_STATE_NORMAL; return(CAM_REQ_CMP); } static void ndastart(struct cam_periph *periph, union ccb *start_ccb) { struct nda_softc *softc = (struct nda_softc *)periph->softc; struct ccb_nvmeio *nvmeio = &start_ccb->nvmeio; CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("ndastart\n")); switch (softc->state) { case NDA_STATE_NORMAL: { struct bio *bp; bp = cam_iosched_next_bio(softc->cam_iosched); CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("ndastart: bio %p\n", bp)); if (bp == NULL) { xpt_release_ccb(start_ccb); break; } switch (bp->bio_cmd) { case BIO_WRITE: softc->flags |= NDA_FLAG_DIRTY; /* FALLTHROUGH */ case BIO_READ: { #ifdef CAM_TEST_FAILURE int fail = 0; /* * Support the failure ioctls. If the command is a * read, and there are pending forced read errors, or * if a write and pending write errors, then fail this * operation with EIO. This is useful for testing * purposes. Also, support having every Nth read fail. * * This is a rather blunt tool. */ if (bp->bio_cmd == BIO_READ) { if (softc->force_read_error) { softc->force_read_error--; fail = 1; } if (softc->periodic_read_error > 0) { if (++softc->periodic_read_count >= softc->periodic_read_error) { softc->periodic_read_count = 0; fail = 1; } } } else { if (softc->force_write_error) { softc->force_write_error--; fail = 1; } } if (fail) { biofinish(bp, NULL, EIO); xpt_release_ccb(start_ccb); ndaschedule(periph); return; } #endif KASSERT((bp->bio_flags & BIO_UNMAPPED) == 0 || round_page(bp->bio_bcount + bp->bio_ma_offset) / PAGE_SIZE == bp->bio_ma_n, ("Short bio %p", bp)); nda_nvme_rw_bio(softc, &start_ccb->nvmeio, bp, bp->bio_cmd == BIO_READ ? 
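/* NVMe I/O opcodes per the spec: NVME_OPC_WRITE is 0x01, NVME_OPC_READ is 0x02 */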
NVME_OPC_READ : NVME_OPC_WRITE); break; } case BIO_DELETE: { struct nvme_dsm_range *dsm_range, *dsm_end; struct nda_trim_request *trim; struct bio *bp1; int ents; uint32_t totalcount = 0, ranges = 0; trim = malloc(sizeof(*trim), M_NVMEDA, M_ZERO | M_NOWAIT); if (trim == NULL) { biofinish(bp, NULL, ENOMEM); xpt_release_ccb(start_ccb); ndaschedule(periph); return; } TAILQ_INIT(&trim->bps); bp1 = bp; ents = sizeof(trim->data) / sizeof(struct nvme_dsm_range); ents = min(ents, nda_max_trim_entries); dsm_range = &trim->dsm; dsm_end = dsm_range + ents; do { TAILQ_INSERT_TAIL(&trim->bps, bp1, bio_queue); dsm_range->length = htole32(bp1->bio_bcount / softc->disk->d_sectorsize); dsm_range->starting_lba = htole64(bp1->bio_offset / softc->disk->d_sectorsize); ranges++; totalcount += dsm_range->length; dsm_range++; if (dsm_range >= dsm_end) break; bp1 = cam_iosched_next_trim(softc->cam_iosched); /* XXX -- Could collapse adjacent ranges, but we don't for now */ /* XXX -- Could limit based on total payload size */ } while (bp1 != NULL); start_ccb->ccb_trim = trim; nda_nvme_trim(softc, &start_ccb->nvmeio, &trim->dsm, dsm_range - &trim->dsm); start_ccb->ccb_state = NDA_CCB_TRIM; softc->trim_count++; softc->trim_ranges += ranges; softc->trim_lbas += totalcount; /* * Note: We can have multiple TRIMs in flight, so we don't call * cam_iosched_submit_trim(softc->cam_iosched); * since that forces the I/O scheduler to only schedule one at a time. * On NVMe drives, this is a performance disaster. */ goto out; } case BIO_FLUSH: nda_nvme_flush(softc, nvmeio); break; } start_ccb->ccb_state = NDA_CCB_BUFFER_IO; start_ccb->ccb_bp = bp; out: start_ccb->ccb_h.flags |= CAM_UNLOCKED; softc->outstanding_cmds++; softc->refcount++; /* For submission only */ cam_periph_unlock(periph); xpt_action(start_ccb); cam_periph_lock(periph); softc->refcount--; /* Submission done */ /* May have more work to do, so ensure we stay scheduled */ ndaschedule(periph); break; } } } static void ndadone(struct cam_periph *periph, union ccb *done_ccb) { struct nda_softc *softc; struct ccb_nvmeio *nvmeio = &done_ccb->nvmeio; struct cam_path *path; int state; softc = (struct nda_softc *)periph->softc; path = done_ccb->ccb_h.path; CAM_DEBUG(path, CAM_DEBUG_TRACE, ("ndadone\n")); state = nvmeio->ccb_state & NDA_CCB_TYPE_MASK; switch (state) { case NDA_CCB_BUFFER_IO: case NDA_CCB_TRIM: { int error; cam_periph_lock(periph); if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { error = ndaerror(done_ccb, 0, 0); if (error == ERESTART) { /* A retry was scheduled, so just return. */ cam_periph_unlock(periph); return; } if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) cam_release_devq(path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); } else { if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) panic("REQ_CMP with QFRZN"); error = 0; } if (state == NDA_CCB_BUFFER_IO) { struct bio *bp; bp = (struct bio *)done_ccb->ccb_bp; bp->bio_error = error; if (error != 0) { bp->bio_resid = bp->bio_bcount; bp->bio_flags |= BIO_ERROR; } else { bp->bio_resid = 0; } softc->outstanding_cmds--; /* * We need to call cam_iosched before we call biodone so that we * don't measure any activity that happens in the completion * routine, which in the case of sendfile can be quite * extensive. 
*/ cam_iosched_bio_complete(softc->cam_iosched, bp, done_ccb); xpt_release_ccb(done_ccb); ndaschedule(periph); cam_periph_unlock(periph); biodone(bp); } else { /* state == NDA_CCB_TRIM */ struct nda_trim_request *trim; struct bio *bp1, *bp2; TAILQ_HEAD(, bio) queue; trim = nvmeio->ccb_trim; TAILQ_INIT(&queue); TAILQ_CONCAT(&queue, &trim->bps, bio_queue); free(trim, M_NVMEDA); /* * Since we can have multiple trims in flight, we don't * need to call this here. * cam_iosched_trim_done(softc->cam_iosched); */ /* * Tell the I/O scheduler that we're finishing the I/O * so we can keep the books. For the first one we pass in the CCB, * which has the timing information; for the rest we pass in NULL * so we can keep proper counts. */ bp1 = TAILQ_FIRST(&queue); cam_iosched_bio_complete(softc->cam_iosched, bp1, done_ccb); xpt_release_ccb(done_ccb); softc->outstanding_cmds--; ndaschedule(periph); cam_periph_unlock(periph); while ((bp2 = TAILQ_FIRST(&queue)) != NULL) { TAILQ_REMOVE(&queue, bp2, bio_queue); bp2->bio_error = error; if (error != 0) { bp2->bio_flags |= BIO_ERROR; bp2->bio_resid = bp1->bio_bcount; } else bp2->bio_resid = 0; if (bp1 != bp2) cam_iosched_bio_complete(softc->cam_iosched, bp2, NULL); biodone(bp2); } } return; } case NDA_CCB_DUMP: /* No-op. We're polling */ return; default: break; } xpt_release_ccb(done_ccb); } static int ndaerror(union ccb *ccb, u_int32_t cam_flags, u_int32_t sense_flags) { struct nda_softc *softc; struct cam_periph *periph; periph = xpt_path_periph(ccb->ccb_h.path); softc = (struct nda_softc *)periph->softc; switch (ccb->ccb_h.status & CAM_STATUS_MASK) { case CAM_CMD_TIMEOUT: #ifdef CAM_IO_STATS softc->timeouts++; #endif break; case CAM_REQ_ABORTED: case CAM_REQ_CMP_ERR: case CAM_REQ_TERMIO: case CAM_UNREC_HBA_ERROR: case CAM_DATA_RUN_ERR: case CAM_ATA_STATUS_ERROR: #ifdef CAM_IO_STATS softc->errors++; #endif break; default: break; } return(cam_periph_error(ccb, cam_flags, sense_flags)); } /* * Step through all NDA peripheral drivers, and if the device is still open, * sync the disk cache to physical media. */ static void ndaflush(void) { struct cam_periph *periph; struct nda_softc *softc; union ccb *ccb; int error; CAM_PERIPH_FOREACH(periph, &ndadriver) { softc = (struct nda_softc *)periph->softc; if (SCHEDULER_STOPPED()) { /* * If we panicked with the lock held or the periph is not * open, do not recurse. Otherwise, call ndadump since * that avoids the sleep that cam_periph_getccb does if no * CCBs are available. */ if (!cam_periph_owned(periph) && (softc->flags & NDA_FLAG_OPEN)) { ndadump(softc->disk, NULL, 0, 0, 0); } continue; } /* * We only sync the cache if the drive is still open */ cam_periph_lock(periph); if ((softc->flags & NDA_FLAG_OPEN) == 0) { cam_periph_unlock(periph); continue; } ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL); nda_nvme_flush(softc, &ccb->nvmeio); error = cam_periph_runccb(ccb, ndaerror, /*cam_flags*/0, /*sense_flags*/ SF_NO_RECOVERY | SF_NO_RETRY, softc->disk->d_devstat); if (error != 0) xpt_print(periph->path, "Synchronize cache failed\n"); xpt_release_ccb(ccb); cam_periph_unlock(periph); } } static void ndashutdown(void *arg, int howto) { ndaflush(); } static void ndasuspend(void *arg) { ndaflush(); } Index: head/sys/crypto/aesni/aesencdec.h =================================================================== --- head/sys/crypto/aesni/aesencdec.h (revision 343754) +++ head/sys/crypto/aesni/aesencdec.h (revision 343755) @@ -1,146 +1,147 @@ /*- * Copyright 2013 John-Mark Gurney - * Copyright 2015 Netflix, Inc. * All rights reserved.
+ * + * Copyright 2015 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ * */ #ifndef _AESENCDEC_H_ #define _AESENCDEC_H_ #include #include static inline void aesni_enc8(int rounds, const __m128i *keysched, __m128i a, __m128i b, __m128i c, __m128i d, __m128i e, __m128i f, __m128i g, __m128i h, __m128i out[8]) { int i; a ^= keysched[0]; b ^= keysched[0]; c ^= keysched[0]; d ^= keysched[0]; e ^= keysched[0]; f ^= keysched[0]; g ^= keysched[0]; h ^= keysched[0]; for (i = 0; i < rounds; i++) { a = _mm_aesenc_si128(a, keysched[i + 1]); b = _mm_aesenc_si128(b, keysched[i + 1]); c = _mm_aesenc_si128(c, keysched[i + 1]); d = _mm_aesenc_si128(d, keysched[i + 1]); e = _mm_aesenc_si128(e, keysched[i + 1]); f = _mm_aesenc_si128(f, keysched[i + 1]); g = _mm_aesenc_si128(g, keysched[i + 1]); h = _mm_aesenc_si128(h, keysched[i + 1]); } out[0] = _mm_aesenclast_si128(a, keysched[i + 1]); out[1] = _mm_aesenclast_si128(b, keysched[i + 1]); out[2] = _mm_aesenclast_si128(c, keysched[i + 1]); out[3] = _mm_aesenclast_si128(d, keysched[i + 1]); out[4] = _mm_aesenclast_si128(e, keysched[i + 1]); out[5] = _mm_aesenclast_si128(f, keysched[i + 1]); out[6] = _mm_aesenclast_si128(g, keysched[i + 1]); out[7] = _mm_aesenclast_si128(h, keysched[i + 1]); } static inline void aesni_dec8(int rounds, const __m128i *keysched, __m128i a, __m128i b, __m128i c, __m128i d, __m128i e, __m128i f, __m128i g, __m128i h, __m128i out[8]) { int i; a ^= keysched[0]; b ^= keysched[0]; c ^= keysched[0]; d ^= keysched[0]; e ^= keysched[0]; f ^= keysched[0]; g ^= keysched[0]; h ^= keysched[0]; for (i = 0; i < rounds; i++) { a = _mm_aesdec_si128(a, keysched[i + 1]); b = _mm_aesdec_si128(b, keysched[i + 1]); c = _mm_aesdec_si128(c, keysched[i + 1]); d = _mm_aesdec_si128(d, keysched[i + 1]); e = _mm_aesdec_si128(e, keysched[i + 1]); f = _mm_aesdec_si128(f, keysched[i + 1]); g = _mm_aesdec_si128(g, keysched[i + 1]); h = _mm_aesdec_si128(h, keysched[i + 1]); } out[0] = _mm_aesdeclast_si128(a, keysched[i + 1]); out[1] = _mm_aesdeclast_si128(b, keysched[i + 1]); out[2] = _mm_aesdeclast_si128(c, keysched[i + 1]); out[3] = _mm_aesdeclast_si128(d, keysched[i + 1]); out[4] = _mm_aesdeclast_si128(e, keysched[i + 1]); out[5] = _mm_aesdeclast_si128(f, keysched[i + 1]); out[6] = _mm_aesdeclast_si128(g, keysched[i + 1]); out[7] = 
_mm_aesdeclast_si128(h, keysched[i + 1]); } /* rounds is passed in as rounds - 1 */ static inline __m128i aesni_enc(int rounds, const __m128i *keysched, const __m128i from) { __m128i tmp; int i; tmp = from ^ keysched[0]; for (i = 1; i < rounds; i += 2) { tmp = _mm_aesenc_si128(tmp, keysched[i]); tmp = _mm_aesenc_si128(tmp, keysched[i + 1]); } tmp = _mm_aesenc_si128(tmp, keysched[rounds]); return _mm_aesenclast_si128(tmp, keysched[rounds + 1]); } static inline __m128i aesni_dec(int rounds, const __m128i *keysched, const __m128i from) { __m128i tmp; int i; tmp = from ^ keysched[0]; for (i = 1; i < rounds; i += 2) { tmp = _mm_aesdec_si128(tmp, keysched[i]); tmp = _mm_aesdec_si128(tmp, keysched[i + 1]); } tmp = _mm_aesdec_si128(tmp, keysched[rounds]); return _mm_aesdeclast_si128(tmp, keysched[rounds + 1]); } #endif /* _AESENCDEC_H_ */ Index: head/sys/dev/efidev/efidev.c =================================================================== --- head/sys/dev/efidev/efidev.c (revision 343754) +++ head/sys/dev/efidev/efidev.c (revision 343755) @@ -1,221 +1,220 @@ /*- * Copyright (c) 2016 Netflix, Inc. - * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include static d_ioctl_t efidev_ioctl; static struct cdevsw efi_cdevsw = { .d_name = "efi", .d_version = D_VERSION, .d_ioctl = efidev_ioctl, }; static int efidev_ioctl(struct cdev *dev __unused, u_long cmd, caddr_t addr, int flags __unused, struct thread *td __unused) { int error; switch (cmd) { case EFIIOC_GET_TABLE: { struct efi_get_table_ioc *egtioc = (struct efi_get_table_ioc *)addr; error = efi_get_table(&egtioc->uuid, &egtioc->ptr); break; } case EFIIOC_GET_TIME: { struct efi_tm *tm = (struct efi_tm *)addr; error = efi_get_time(tm); break; } case EFIIOC_SET_TIME: { struct efi_tm *tm = (struct efi_tm *)addr; error = efi_set_time(tm); break; } case EFIIOC_VAR_GET: { struct efi_var_ioc *ev = (struct efi_var_ioc *)addr; void *data; efi_char *name; data = malloc(ev->datasize, M_TEMP, M_WAITOK); name = malloc(ev->namesize, M_TEMP, M_WAITOK); error = copyin(ev->name, name, ev->namesize); if (error) goto vg_out; if (name[ev->namesize / sizeof(efi_char) - 1] != 0) { error = EINVAL; goto vg_out; } error = efi_var_get(name, &ev->vendor, &ev->attrib, &ev->datasize, data); if (error == 0) { error = copyout(data, ev->data, ev->datasize); } else if (error == EOVERFLOW) { /* * Pass back the size we really need, but * convert the error to 0 so the copyout * happens. datasize was updated in the * efi_var_get call. */ ev->data = NULL; error = 0; } vg_out: free(data, M_TEMP); free(name, M_TEMP); break; } case EFIIOC_VAR_NEXT: { struct efi_var_ioc *ev = (struct efi_var_ioc *)addr; efi_char *name; name = malloc(ev->namesize, M_TEMP, M_WAITOK); error = copyin(ev->name, name, ev->namesize); if (error) goto vn_out; /* Note: namesize is the buffer size, not the string length */ error = efi_var_nextname(&ev->namesize, name, &ev->vendor); if (error == 0) { error = copyout(name, ev->name, ev->namesize); } else if (error == EOVERFLOW) { ev->name = NULL; error = 0; } vn_out: free(name, M_TEMP); break; } case EFIIOC_VAR_SET: { struct efi_var_ioc *ev = (struct efi_var_ioc *)addr; void *data = NULL; efi_char *name; /* datasize == 0 -> delete (more or less) */ if (ev->datasize > 0) data = malloc(ev->datasize, M_TEMP, M_WAITOK); name = malloc(ev->namesize, M_TEMP, M_WAITOK); if (ev->datasize) { error = copyin(ev->data, data, ev->datasize); if (error) goto vs_out; } error = copyin(ev->name, name, ev->namesize); if (error) goto vs_out; if (name[ev->namesize / sizeof(efi_char) - 1] != 0) { error = EINVAL; goto vs_out; } error = efi_var_set(name, &ev->vendor, ev->attrib, ev->datasize, data); vs_out: free(data, M_TEMP); free(name, M_TEMP); break; } default: error = ENOTTY; break; } return (error); } static struct cdev *efidev; static int efidev_modevents(module_t m, int event, void *arg __unused) { struct make_dev_args mda; int error; switch (event) { case MOD_LOAD: /* * If we have no efi environment, then don't create the device.
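* (For example, a machine booted via legacy BIOS exposes no EFI runtime services, so efi_rt_ok() fails and /dev/efi never appears.)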
*/ if (efi_rt_ok() != 0) return (0); make_dev_args_init(&mda); mda.mda_flags = MAKEDEV_WAITOK | MAKEDEV_CHECKNAME; mda.mda_devsw = &efi_cdevsw; mda.mda_uid = UID_ROOT; mda.mda_gid = GID_WHEEL; mda.mda_mode = 0700; error = make_dev_s(&mda, &efidev, "efi"); return (error); case MOD_UNLOAD: if (efidev != NULL) destroy_dev(efidev); efidev = NULL; return (0); case MOD_SHUTDOWN: return (0); default: return (EOPNOTSUPP); } } static moduledata_t efidev_moddata = { .name = "efidev", .evhand = efidev_modevents, .priv = NULL, }; DECLARE_MODULE(efidev, efidev_moddata, SI_SUB_DRIVERS, SI_ORDER_ANY); MODULE_VERSION(efidev, 1); MODULE_DEPEND(efidev, efirt, 1, 1, 1); Index: head/sys/dev/nvme/nvme_sim.c =================================================================== --- head/sys/dev/nvme/nvme_sim.c (revision 343754) +++ head/sys/dev/nvme/nvme_sim.c (revision 343755) @@ -1,379 +1,379 @@ /*- - * Copyright (c) 2016 Netflix, Inc + * Copyright (c) 2016 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification, immediately at the beginning of the file. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "nvme_private.h" #define ccb_accb_ptr spriv_ptr0 #define ccb_ctrlr_ptr spriv_ptr1 static void nvme_sim_action(struct cam_sim *sim, union ccb *ccb); static void nvme_sim_poll(struct cam_sim *sim); #define sim2softc(sim) ((struct nvme_sim_softc *)cam_sim_softc(sim)) #define sim2ctrlr(sim) (sim2softc(sim)->s_ctrlr) struct nvme_sim_softc { struct nvme_controller *s_ctrlr; struct cam_sim *s_sim; struct cam_path *s_path; }; static void nvme_sim_nvmeio_done(void *ccb_arg, const struct nvme_completion *cpl) { union ccb *ccb = (union ccb *)ccb_arg; /* * Let the periph know the completion, and let it sort out what * it means. Make our best guess, though for the status code. 
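* As a sketch of that guess: completions that nvme_completion_is_error() flags are finished as CAM_REQ_CMP_ERR via xpt_done(), while successful ones complete as CAM_REQ_CMP through the cheaper xpt_done_direct() path.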
*/ memcpy(&ccb->nvmeio.cpl, cpl, sizeof(*cpl)); ccb->ccb_h.status &= ~CAM_SIM_QUEUED; if (nvme_completion_is_error(cpl)) { ccb->ccb_h.status = CAM_REQ_CMP_ERR; xpt_done(ccb); } else { ccb->ccb_h.status = CAM_REQ_CMP; xpt_done_direct(ccb); } } static void nvme_sim_nvmeio(struct cam_sim *sim, union ccb *ccb) { struct ccb_nvmeio *nvmeio = &ccb->nvmeio; struct nvme_request *req; void *payload; uint32_t size; struct nvme_controller *ctrlr; ctrlr = sim2ctrlr(sim); payload = nvmeio->data_ptr; size = nvmeio->dxfer_len; /* SG LIST ??? */ if ((nvmeio->ccb_h.flags & CAM_DATA_MASK) == CAM_DATA_BIO) req = nvme_allocate_request_bio((struct bio *)payload, nvme_sim_nvmeio_done, ccb); else if ((nvmeio->ccb_h.flags & CAM_DATA_SG) == CAM_DATA_SG) req = nvme_allocate_request_ccb(ccb, nvme_sim_nvmeio_done, ccb); else if (payload == NULL) req = nvme_allocate_request_null(nvme_sim_nvmeio_done, ccb); else req = nvme_allocate_request_vaddr(payload, size, nvme_sim_nvmeio_done, ccb); if (req == NULL) { nvmeio->ccb_h.status = CAM_RESRC_UNAVAIL; xpt_done(ccb); return; } ccb->ccb_h.status |= CAM_SIM_QUEUED; memcpy(&req->cmd, &ccb->nvmeio.cmd, sizeof(ccb->nvmeio.cmd)); if (ccb->ccb_h.func_code == XPT_NVME_IO) nvme_ctrlr_submit_io_request(ctrlr, req); else nvme_ctrlr_submit_admin_request(ctrlr, req); } static uint32_t nvme_link_kBps(struct nvme_controller *ctrlr) { uint32_t speed, lanes, link[] = { 1, 250000, 500000, 985000, 1970000 }; uint32_t status; status = pcie_read_config(ctrlr->dev, PCIER_LINK_STA, 2); speed = status & PCIEM_LINK_STA_SPEED; lanes = (status & PCIEM_LINK_STA_WIDTH) >> 4; /* * Failsafe on the link speed indicator. If it is insane, report the number of * lanes as the speed. Not 100% accurate, but it may be diagnostic. */ if (speed >= nitems(link)) speed = 0; return link[speed] * lanes; } static void nvme_sim_action(struct cam_sim *sim, union ccb *ccb) { struct nvme_controller *ctrlr; CAM_DEBUG(ccb->ccb_h.path, CAM_DEBUG_TRACE, ("nvme_sim_action: func= %#x\n", ccb->ccb_h.func_code)); ctrlr = sim2ctrlr(sim); switch (ccb->ccb_h.func_code) { case XPT_CALC_GEOMETRY: /* Calculate Geometry Totally nuts ? XXX */ /* * Only meaningful for old-school SCSI disks since only the SCSI * da driver generates them. Reject any that slip through. */ /*FALLTHROUGH*/ case XPT_ABORT: /* Abort the specified CCB */ ccb->ccb_h.status = CAM_REQ_INVALID; break; case XPT_SET_TRAN_SETTINGS: /* * NVMe doesn't really have different transfer settings, but * other parts of CAM think failure here is a big deal. */ ccb->ccb_h.status = CAM_REQ_CMP; break; case XPT_PATH_INQ: /* Path routing inquiry */ { struct ccb_pathinq *cpi = &ccb->cpi; device_t dev = ctrlr->dev; /* * NVMe may have multiple LUNs on the same path. The current generation * of NVMe devices supports only a single namespace. Multiple-namespace * drives are coming, but it's unclear how we should report * them up the stack. */ cpi->version_num = 1; cpi->hba_inquiry = 0; cpi->target_sprt = 0; cpi->hba_misc = PIM_UNMAPPED | PIM_NOSCAN; cpi->hba_eng_cnt = 0; cpi->max_target = 0; cpi->max_lun = ctrlr->cdata.nn; cpi->maxio = ctrlr->max_xfer_size; cpi->initiator_id = 0; cpi->bus_id = cam_sim_bus(sim); cpi->base_transfer_speed = nvme_link_kBps(ctrlr); strlcpy(cpi->sim_vid, "FreeBSD", SIM_IDLEN); strlcpy(cpi->hba_vid, "NVMe", HBA_IDLEN); strlcpy(cpi->dev_name, cam_sim_name(sim), DEV_IDLEN); cpi->unit_number = cam_sim_unit(sim); cpi->transport = XPORT_NVME; /* XXX XPORT_PCIE ?
*/ cpi->transport_version = nvme_mmio_read_4(ctrlr, vs); cpi->protocol = PROTO_NVME; cpi->protocol_version = nvme_mmio_read_4(ctrlr, vs); cpi->xport_specific.nvme.nsid = xpt_path_lun_id(ccb->ccb_h.path); cpi->xport_specific.nvme.domain = pci_get_domain(dev); cpi->xport_specific.nvme.bus = pci_get_bus(dev); cpi->xport_specific.nvme.slot = pci_get_slot(dev); cpi->xport_specific.nvme.function = pci_get_function(dev); cpi->xport_specific.nvme.extra = 0; cpi->ccb_h.status = CAM_REQ_CMP; break; } case XPT_GET_TRAN_SETTINGS: /* Get transport settings */ { struct ccb_trans_settings *cts; struct ccb_trans_settings_nvme *nvmep; struct ccb_trans_settings_nvme *nvmex; device_t dev; uint32_t status, caps; dev = ctrlr->dev; cts = &ccb->cts; nvmex = &cts->xport_specific.nvme; nvmep = &cts->proto_specific.nvme; status = pcie_read_config(dev, PCIER_LINK_STA, 2); caps = pcie_read_config(dev, PCIER_LINK_CAP, 2); nvmex->valid = CTS_NVME_VALID_SPEC | CTS_NVME_VALID_LINK; nvmex->spec = nvme_mmio_read_4(ctrlr, vs); nvmex->speed = status & PCIEM_LINK_STA_SPEED; nvmex->lanes = (status & PCIEM_LINK_STA_WIDTH) >> 4; nvmex->max_speed = caps & PCIEM_LINK_CAP_MAX_SPEED; nvmex->max_lanes = (caps & PCIEM_LINK_CAP_MAX_WIDTH) >> 4; /* XXX these should be something else maybe ? */ nvmep->valid = 1; nvmep->spec = nvmex->spec; cts->transport = XPORT_NVME; cts->protocol = PROTO_NVME; cts->ccb_h.status = CAM_REQ_CMP; break; } case XPT_TERM_IO: /* Terminate the I/O process */ /* * Every driver handles this, but nothing generates it. Assume * it's OK to just say 'that worked'. */ /*FALLTHROUGH*/ case XPT_RESET_DEV: /* Bus Device Reset the specified device */ case XPT_RESET_BUS: /* Reset the specified bus */ /* * NVMe doesn't really support physically resetting the bus. It's part * of the bus scanning dance, so return success to tell the process to * proceed.
*/ ccb->ccb_h.status = CAM_REQ_CMP; break; case XPT_NVME_IO: /* Execute the requested I/O operation */ case XPT_NVME_ADMIN: /* or Admin operation */ nvme_sim_nvmeio(sim, ccb); return; /* no done */ default: ccb->ccb_h.status = CAM_REQ_INVALID; break; } xpt_done(ccb); } static void nvme_sim_poll(struct cam_sim *sim) { nvme_ctrlr_poll(sim2ctrlr(sim)); } static void * nvme_sim_new_controller(struct nvme_controller *ctrlr) { struct nvme_sim_softc *sc; struct cam_devq *devq; int max_trans; max_trans = ctrlr->max_hw_pend_io; devq = cam_simq_alloc(max_trans); if (devq == NULL) return (NULL); sc = malloc(sizeof(*sc), M_NVME, M_ZERO | M_WAITOK); sc->s_ctrlr = ctrlr; sc->s_sim = cam_sim_alloc(nvme_sim_action, nvme_sim_poll, "nvme", sc, device_get_unit(ctrlr->dev), NULL, max_trans, max_trans, devq); if (sc->s_sim == NULL) { printf("Failed to allocate a sim\n"); cam_simq_free(devq); goto err1; } if (xpt_bus_register(sc->s_sim, ctrlr->dev, 0) != CAM_SUCCESS) { printf("Failed to create a bus\n"); goto err2; } if (xpt_create_path(&sc->s_path, /*periph*/NULL, cam_sim_path(sc->s_sim), CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD) != CAM_REQ_CMP) { printf("Failed to create a path\n"); goto err3; } return (sc); err3: xpt_bus_deregister(cam_sim_path(sc->s_sim)); err2: cam_sim_free(sc->s_sim, /*free_devq*/TRUE); err1: free(sc, M_NVME); return (NULL); } static void * nvme_sim_new_ns(struct nvme_namespace *ns, void *sc_arg) { struct nvme_sim_softc *sc = sc_arg; union ccb *ccb; ccb = xpt_alloc_ccb_nowait(); if (ccb == NULL) { printf("unable to alloc CCB for rescan\n"); return (NULL); } if (xpt_create_path(&ccb->ccb_h.path, /*periph*/NULL, cam_sim_path(sc->s_sim), 0, ns->id) != CAM_REQ_CMP) { printf("unable to create path for rescan\n"); xpt_free_ccb(ccb); return (NULL); } xpt_rescan(ccb); return (ns); } static void nvme_sim_controller_fail(void *ctrlr_arg) { struct nvme_sim_softc *sc = ctrlr_arg; xpt_async(AC_LOST_DEVICE, sc->s_path, NULL); xpt_free_path(sc->s_path); xpt_bus_deregister(cam_sim_path(sc->s_sim)); cam_sim_free(sc->s_sim, /*free_devq*/TRUE); free(sc, M_NVME); } struct nvme_consumer *consumer_cookie; static void nvme_sim_init(void) { if (nvme_use_nvd) return; consumer_cookie = nvme_register_consumer(nvme_sim_new_ns, nvme_sim_new_controller, NULL, nvme_sim_controller_fail); } SYSINIT(nvme_sim_register, SI_SUB_DRIVERS, SI_ORDER_ANY, nvme_sim_init, NULL); static void nvme_sim_uninit(void) { if (nvme_use_nvd) return; /* XXX Cleanup */ nvme_unregister_consumer(consumer_cookie); } SYSUNINIT(nvme_sim_unregister, SI_SUB_DRIVERS, SI_ORDER_ANY, nvme_sim_uninit, NULL); Index: head/sys/dev/tcp_log/tcp_log_dev.c =================================================================== --- head/sys/dev/tcp_log/tcp_log_dev.c (revision 343754) +++ head/sys/dev/tcp_log/tcp_log_dev.c (revision 343755) @@ -1,521 +1,520 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * - * Copyright (c) 2016-2017 - * Netflix Inc. All rights reserved. + * Copyright (c) 2016-2017 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef TCPLOG_DEBUG_COUNTERS extern counter_u64_t tcp_log_que_read; extern counter_u64_t tcp_log_que_freed; #endif static struct cdev *tcp_log_dev; static struct selinfo tcp_log_sel; static struct log_queueh tcp_log_dev_queue_head = STAILQ_HEAD_INITIALIZER(tcp_log_dev_queue_head); static struct log_infoh tcp_log_dev_reader_head = STAILQ_HEAD_INITIALIZER(tcp_log_dev_reader_head); MALLOC_DEFINE(M_TCPLOGDEV, "tcp_log_dev", "TCP log device data structures"); static int tcp_log_dev_listeners = 0; static struct mtx tcp_log_dev_queue_lock; #define TCP_LOG_DEV_QUEUE_LOCK() mtx_lock(&tcp_log_dev_queue_lock) #define TCP_LOG_DEV_QUEUE_UNLOCK() mtx_unlock(&tcp_log_dev_queue_lock) #define TCP_LOG_DEV_QUEUE_LOCK_ASSERT() mtx_assert(&tcp_log_dev_queue_lock, MA_OWNED) #define TCP_LOG_DEV_QUEUE_UNLOCK_ASSERT() mtx_assert(&tcp_log_dev_queue_lock, MA_NOTOWNED) #define TCP_LOG_DEV_QUEUE_REF(tldq) refcount_acquire(&((tldq)->tldq_refcnt)) #define TCP_LOG_DEV_QUEUE_UNREF(tldq) refcount_release(&((tldq)->tldq_refcnt)) static void tcp_log_dev_clear_refcount(struct tcp_log_dev_queue *entry); static void tcp_log_dev_clear_cdevpriv(void *data); static int tcp_log_dev_open(struct cdev *dev __unused, int flags, int devtype __unused, struct thread *td __unused); static int tcp_log_dev_write(struct cdev *dev __unused, struct uio *uio __unused, int flags __unused); static int tcp_log_dev_read(struct cdev *dev __unused, struct uio *uio, int flags __unused); static int tcp_log_dev_ioctl(struct cdev *dev __unused, u_long cmd, caddr_t data, int fflag __unused, struct thread *td __unused); static int tcp_log_dev_poll(struct cdev *dev __unused, int events, struct thread *td); enum tcp_log_dev_queue_lock_state { QUEUE_UNLOCKED = 0, QUEUE_LOCKED, }; static struct cdevsw tcp_log_cdevsw = { .d_version = D_VERSION, .d_read = tcp_log_dev_read, .d_open = tcp_log_dev_open, .d_write = tcp_log_dev_write, .d_poll = tcp_log_dev_poll, .d_ioctl = tcp_log_dev_ioctl, #ifdef NOTYET .d_mmap = tcp_log_dev_mmap, #endif .d_name = "tcp_log", }; static __inline void tcp_log_dev_queue_validate_lock(int lockstate) { #ifdef INVARIANTS switch (lockstate) { case QUEUE_LOCKED: TCP_LOG_DEV_QUEUE_LOCK_ASSERT(); break; case QUEUE_UNLOCKED: TCP_LOG_DEV_QUEUE_UNLOCK_ASSERT(); break; default: kassert_panic("%s:%d: unknown queue lock state", __func__, __LINE__); } #endif } /* * Clear the refcount. If appropriate, it will remove the entry from the * queue and call the destructor. * * This must be called with the queue lock held. 
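* A typical (illustrative) call pattern: TCP_LOG_DEV_QUEUE_LOCK(); tcp_log_dev_clear_refcount(entry); TCP_LOG_DEV_QUEUE_UNLOCK();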
*/ static void tcp_log_dev_clear_refcount(struct tcp_log_dev_queue *entry) { KASSERT(entry != NULL, ("%s: called with NULL entry", __func__)); TCP_LOG_DEV_QUEUE_LOCK_ASSERT(); if (TCP_LOG_DEV_QUEUE_UNREF(entry)) { #ifdef TCPLOG_DEBUG_COUNTERS counter_u64_add(tcp_log_que_freed, 1); #endif /* Remove the entry from the queue and call the destructor. */ STAILQ_REMOVE(&tcp_log_dev_queue_head, entry, tcp_log_dev_queue, tldq_queue); (*entry->tldq_dtor)(entry); } } static void tcp_log_dev_clear_cdevpriv(void *data) { struct tcp_log_dev_info *priv; struct tcp_log_dev_queue *entry, *entry_tmp; priv = (struct tcp_log_dev_info *)data; if (priv == NULL) return; /* * Lock the queue and drop our references. We hold references to all * the entries starting with tldi_head (or, if tldi_head == NULL, all * entries in the queue). * * Because we don't want anyone adding additional things to the queue * while we are doing this, we lock the queue. */ TCP_LOG_DEV_QUEUE_LOCK(); if (priv->tldi_head != NULL) { entry = priv->tldi_head; STAILQ_FOREACH_FROM_SAFE(entry, &tcp_log_dev_queue_head, tldq_queue, entry_tmp) { tcp_log_dev_clear_refcount(entry); } } tcp_log_dev_listeners--; KASSERT(tcp_log_dev_listeners >= 0, ("%s: tcp_log_dev_listeners is unexpectedly negative", __func__)); STAILQ_REMOVE(&tcp_log_dev_reader_head, priv, tcp_log_dev_info, tldi_list); TCP_LOG_DEV_QUEUE_LOCK_ASSERT(); TCP_LOG_DEV_QUEUE_UNLOCK(); free(priv, M_TCPLOGDEV); } static int tcp_log_dev_open(struct cdev *dev __unused, int flags, int devtype __unused, struct thread *td __unused) { struct tcp_log_dev_info *priv; struct tcp_log_dev_queue *entry; int rv; /* * Ideally, we shouldn't see these because of file system * permissions. */ if (flags & (FWRITE | FEXEC | FAPPEND | O_TRUNC)) return (ENODEV); /* Allocate space to hold information about where we are. */ priv = malloc(sizeof(struct tcp_log_dev_info), M_TCPLOGDEV, M_ZERO | M_WAITOK); /* Stash the private data away. */ rv = devfs_set_cdevpriv((void *)priv, tcp_log_dev_clear_cdevpriv); if (!rv) { /* * Increase the listener count, add this reader to the list, and * take references on all current queues. */ TCP_LOG_DEV_QUEUE_LOCK(); tcp_log_dev_listeners++; STAILQ_INSERT_HEAD(&tcp_log_dev_reader_head, priv, tldi_list); priv->tldi_head = STAILQ_FIRST(&tcp_log_dev_queue_head); if (priv->tldi_head != NULL) priv->tldi_cur = priv->tldi_head->tldq_buf; STAILQ_FOREACH(entry, &tcp_log_dev_queue_head, tldq_queue) TCP_LOG_DEV_QUEUE_REF(entry); TCP_LOG_DEV_QUEUE_UNLOCK(); } else { /* Free the entry.
*/ free(priv, M_TCPLOGDEV); } return (rv); } static int tcp_log_dev_write(struct cdev *dev __unused, struct uio *uio __unused, int flags __unused) { return (ENODEV); } static __inline void tcp_log_dev_rotate_bufs(struct tcp_log_dev_info *priv, int *lockstate) { struct tcp_log_dev_queue *entry; KASSERT(priv->tldi_head != NULL, ("%s:%d: priv->tldi_head unexpectedly NULL", __func__, __LINE__)); KASSERT(priv->tldi_head->tldq_buf == priv->tldi_cur, ("%s:%d: buffer mismatch (%p vs %p)", __func__, __LINE__, priv->tldi_head->tldq_buf, priv->tldi_cur)); tcp_log_dev_queue_validate_lock(*lockstate); if (*lockstate == QUEUE_UNLOCKED) { TCP_LOG_DEV_QUEUE_LOCK(); *lockstate = QUEUE_LOCKED; } entry = priv->tldi_head; priv->tldi_head = STAILQ_NEXT(entry, tldq_queue); tcp_log_dev_clear_refcount(entry); priv->tldi_cur = NULL; } static int tcp_log_dev_read(struct cdev *dev __unused, struct uio *uio, int flags) { struct tcp_log_common_header *buf; struct tcp_log_dev_info *priv; struct tcp_log_dev_queue *entry; ssize_t len; int lockstate, rv; /* Get our private info. */ rv = devfs_get_cdevpriv((void **)&priv); if (rv) return (rv); lockstate = QUEUE_UNLOCKED; /* Do we need to get a new buffer? */ while (priv->tldi_cur == NULL || priv->tldi_cur->tlch_length <= priv->tldi_off) { /* Did we somehow forget to rotate? */ KASSERT(priv->tldi_cur == NULL, ("%s:%d: tldi_cur is unexpectedly non-NULL", __func__, __LINE__)); if (priv->tldi_cur != NULL) tcp_log_dev_rotate_bufs(priv, &lockstate); /* * Before we start looking at tldi_head, we need a lock on the * queue to make sure tldi_head stays stable. */ if (lockstate == QUEUE_UNLOCKED) { TCP_LOG_DEV_QUEUE_LOCK(); lockstate = QUEUE_LOCKED; } /* We need the next buffer. Do we have one? */ if (priv->tldi_head == NULL && (flags & FNONBLOCK)) { rv = EAGAIN; goto done; } if (priv->tldi_head == NULL) { /* Sleep and wait for more things we can read. */ rv = mtx_sleep(&tcp_log_dev_listeners, &tcp_log_dev_queue_lock, PCATCH, "tcplogdev", 0); if (rv) goto done; if (priv->tldi_head == NULL) continue; } /* * We have an entry to read. We want to try to create a * buffer, if one doesn't already exist. */ entry = priv->tldi_head; if (entry->tldq_buf == NULL) { TCP_LOG_DEV_QUEUE_LOCK_ASSERT(); buf = (*entry->tldq_xform)(entry); if (buf == NULL) { rv = EBUSY; goto done; } entry->tldq_buf = buf; } priv->tldi_cur = entry->tldq_buf; priv->tldi_off = 0; } /* Copy what we can from this buffer to the output buffer. */ if (uio->uio_resid > 0) { /* Drop locks so we can take page faults. */ if (lockstate == QUEUE_LOCKED) TCP_LOG_DEV_QUEUE_UNLOCK(); lockstate = QUEUE_UNLOCKED; KASSERT(priv->tldi_cur != NULL, ("%s: priv->tldi_cur is unexpectedly NULL", __func__)); /* Copy as much as we can to this uio. */ len = priv->tldi_cur->tlch_length - priv->tldi_off; if (len > uio->uio_resid) len = uio->uio_resid; rv = uiomove(((uint8_t *)priv->tldi_cur) + priv->tldi_off, len, uio); if (rv != 0) goto done; priv->tldi_off += len; #ifdef TCPLOG_DEBUG_COUNTERS counter_u64_add(tcp_log_que_read, len); #endif } /* Are we done with this buffer? If so, find the next one. 
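* (That is, once tldi_off has consumed tlch_length bytes, tcp_log_dev_rotate_bufs() advances tldi_head to the next queue entry and drops our reference on the one just read.)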
*/ if (priv->tldi_off >= priv->tldi_cur->tlch_length) { KASSERT(priv->tldi_off == priv->tldi_cur->tlch_length, ("%s: offset (%ju) exceeds length (%ju)", __func__, (uintmax_t)priv->tldi_off, (uintmax_t)priv->tldi_cur->tlch_length)); tcp_log_dev_rotate_bufs(priv, &lockstate); } done: tcp_log_dev_queue_validate_lock(lockstate); if (lockstate == QUEUE_LOCKED) TCP_LOG_DEV_QUEUE_UNLOCK(); return (rv); } static int tcp_log_dev_ioctl(struct cdev *dev __unused, u_long cmd, caddr_t data, int fflag __unused, struct thread *td __unused) { struct tcp_log_dev_info *priv; int rv; /* Get our private info. */ rv = devfs_get_cdevpriv((void **)&priv); if (rv) return (rv); /* * Set things. Here, we are most concerned about the non-blocking I/O * flag. */ rv = 0; switch (cmd) { case FIONBIO: break; case FIOASYNC: if (*(int *)data != 0) rv = EINVAL; break; default: rv = ENOIOCTL; } return (rv); } static int tcp_log_dev_poll(struct cdev *dev __unused, int events, struct thread *td) { struct tcp_log_dev_info *priv; int revents; /* * Get our private info. If this fails, claim that all events are * ready. That should prod the user to do something that will * make the error evident to them. */ if (devfs_get_cdevpriv((void **)&priv)) return (events); revents = 0; if (events & (POLLIN | POLLRDNORM)) { /* * We can (probably) read right now if we are partway through * a buffer or if we are just about to start a buffer. * Because we are going to read tldi_head, we should acquire * a read lock on the queue. */ TCP_LOG_DEV_QUEUE_LOCK(); if ((priv->tldi_head != NULL && priv->tldi_cur == NULL) || (priv->tldi_cur != NULL && priv->tldi_off < priv->tldi_cur->tlch_length)) revents = events & (POLLIN | POLLRDNORM); else selrecord(td, &tcp_log_sel); TCP_LOG_DEV_QUEUE_UNLOCK(); } else { /* * It only makes sense to poll for reading. So, again, prod the * user to do something that will make the error of their ways * apparent. */ revents = events; } return (revents); } int tcp_log_dev_add_log(struct tcp_log_dev_queue *entry) { struct tcp_log_dev_info *priv; int rv; bool wakeup_needed; KASSERT(entry->tldq_buf != NULL || entry->tldq_xform != NULL, ("%s: Called with both tldq_buf and tldq_xform set to NULL", __func__)); KASSERT(entry->tldq_dtor != NULL, ("%s: Called with tldq_dtor set to NULL", __func__)); /* Get a lock on the queue. */ TCP_LOG_DEV_QUEUE_LOCK(); /* If no one is listening, tell the caller to free the resources. */ if (tcp_log_dev_listeners == 0) { rv = ENXIO; goto done; } /* Add this to the end of the tailq. */ STAILQ_INSERT_TAIL(&tcp_log_dev_queue_head, entry, tldq_queue); /* Add references for all current listeners. */ refcount_init(&entry->tldq_refcnt, tcp_log_dev_listeners); /* * If any listener is currently stuck on NULL, that means they are * waiting. Point their head to this new entry. */ wakeup_needed = false; STAILQ_FOREACH(priv, &tcp_log_dev_reader_head, tldi_list) if (priv->tldi_head == NULL) { priv->tldi_head = entry; wakeup_needed = true; } if (wakeup_needed) { selwakeup(&tcp_log_sel); wakeup(&tcp_log_dev_listeners); } rv = 0; done: TCP_LOG_DEV_QUEUE_LOCK_ASSERT(); TCP_LOG_DEV_QUEUE_UNLOCK(); return (rv); } static int tcp_log_dev_modevent(module_t mod __unused, int type, void *data __unused) { /* TODO: Support intelligent unloading. 
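* As written, only MOD_LOAD is handled; an unload request falls into the default case and is refused with EOPNOTSUPP.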
*/ switch (type) { case MOD_LOAD: if (bootverbose) printf("tcp_log: tcp_log device\n"); memset(&tcp_log_sel, 0, sizeof(tcp_log_sel)); memset(&tcp_log_dev_queue_lock, 0, sizeof(struct mtx)); mtx_init(&tcp_log_dev_queue_lock, "tcp_log dev", "tcp_log device queues", MTX_DEF); tcp_log_dev = make_dev_credf(MAKEDEV_ETERNAL_KLD, &tcp_log_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0400, "tcp_log"); break; default: return (EOPNOTSUPP); } return (0); } DEV_MODULE(tcp_log_dev, tcp_log_dev_modevent, NULL); MODULE_VERSION(tcp_log_dev, 1); Index: head/sys/dev/tcp_log/tcp_log_dev.h =================================================================== --- head/sys/dev/tcp_log/tcp_log_dev.h (revision 343754) +++ head/sys/dev/tcp_log/tcp_log_dev.h (revision 343755) @@ -1,89 +1,88 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * - * Copyright (c) 2016 - * Netflix Inc. All rights reserved. + * Copyright (c) 2016 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef __tcp_log_dev_h__ #define __tcp_log_dev_h__ /* * This is the common header for data streamed from the log device. All * blocks of data need to start with this header. */ struct tcp_log_common_header { uint32_t tlch_version; /* Version is specific to type. */ uint32_t tlch_type; /* Type of entry(ies) that follow. */ uint64_t tlch_length; /* Total length, including header. */ } __packed; #define TCP_LOG_DEV_TYPE_BBR 1 /* black box recorder */ #ifdef _KERNEL /* * This is a queue entry. All queue entries need to start with this structure * so the common code can cast them to this structure; however, other modules * are free to include additional data after this structure. * * The elements are explained here: * tldq_queue: used by the common code to maintain this entry's position in the * queue. * tldq_buf: should be NULL, or a pointer to a chunk of data. The data must be * as long as the common header indicates. * tldq_xform: If tldq_buf is NULL, the code will call this to create the * tldq_buf object. The function should *not* directly modify tldq_buf, * but should return the buffer (which must meet the restrictions * indicated for tldq_buf). * tldq_dtor: This function is called to free the queue entry. If tldq_buf is * not NULL, the dtor function must free that, too.
* tldq_refcnt: used by the common code to indicate how many readers still need * this data. */ struct tcp_log_dev_queue { STAILQ_ENTRY(tcp_log_dev_queue) tldq_queue; struct tcp_log_common_header *tldq_buf; struct tcp_log_common_header *(*tldq_xform)(struct tcp_log_dev_queue *entry); void (*tldq_dtor)(struct tcp_log_dev_queue *entry); volatile u_int tldq_refcnt; }; STAILQ_HEAD(log_queueh, tcp_log_dev_queue); struct tcp_log_dev_info { STAILQ_ENTRY(tcp_log_dev_info) tldi_list; struct tcp_log_dev_queue *tldi_head; struct tcp_log_common_header *tldi_cur; off_t tldi_off; }; STAILQ_HEAD(log_infoh, tcp_log_dev_info); #ifdef TCP_BLACKBOX MALLOC_DECLARE(M_TCPLOGDEV); int tcp_log_dev_add_log(struct tcp_log_dev_queue *entry); #endif /* TCP_BLACKBOX */ #endif /* _KERNEL */ #endif /* !__tcp_log_dev_h__ */ Index: head/sys/kern/subr_boot.c =================================================================== --- head/sys/kern/subr_boot.c (revision 343754) +++ head/sys/kern/subr_boot.c (revision 343755) @@ -1,223 +1,223 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1998 Michael Smith * All Rights Reserved. * Copyright (c) 1998 Robert Nordier * All Rights Reserved. * Copyright (c) 2009, Oleksandr Tymoshenko * All rights reserved. * Copyright (c) 2014 Roger Pau Monné * All Rights Reserved. * Copyright (c) 2018 Kyle Evans - * Copyright (c) 2018 Netflix + * Copyright (c) 2018 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); /* Note: This is compiled in both the kernel and boot loader contexts */ #include #ifdef _KERNEL #include #else #include #endif #include #include #ifdef _KERNEL #define SETENV(k, v) kern_setenv(k, v) #define GETENV(k) kern_getenv(k) #define FREE(v) freeenv(v) #else /* Boot loader */ #define SETENV(k, v) setenv(k, v, 1) #define GETENV(k) getenv(k) #define FREE(v) #endif static struct { const char *ev; int mask; } howto_names[] = { { "boot_askname", RB_ASKNAME}, { "boot_cdrom", RB_CDROM}, { "boot_ddb", RB_KDB}, { "boot_dfltroot", RB_DFLTROOT}, { "boot_gdb", RB_GDB}, { "boot_multicons", RB_MULTIPLE}, { "boot_mute", RB_MUTE}, { "boot_pause", RB_PAUSE}, { "boot_serial", RB_SERIAL}, { "boot_single", RB_SINGLE}, { "boot_verbose", RB_VERBOSE}, { NULL, 0} }; /* * In the boot environment, we often parse a command line and have to throw away * its contents. As we do so, we set environment variables that correspond to * the flags we encounter. Later, to get a howto mask, we grovel through these * to reconstruct it. This also allows users in their loader.conf to set them * and have the kernel see them. */ /** * @brief convert the env vars in howto_names into a howto mask */ int boot_env_to_howto(void) { int i, howto; char *val; for (howto = 0, i = 0; howto_names[i].ev != NULL; i++) { val = GETENV(howto_names[i].ev); if (val != NULL && strcasecmp(val, "no") != 0) howto |= howto_names[i].mask; FREE(val); } return (howto); } /** * @brief Set env vars from howto_names based on howto passed in */ void boot_howto_to_env(int howto) { int i; for (i = 0; howto_names[i].ev != NULL; i++) if (howto & howto_names[i].mask) SETENV(howto_names[i].ev, "YES"); } /** * @brief Helper routine to parse a single arg and return its mask * * Parse all the - options to create a mask (or a serial speed in the * case of -S). If the arg doesn't start with '-' assume it's an env * variable and set that instead. 
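* For example (illustrative values): boot_parse_arg("-sv") returns RB_SINGLE | RB_VERBOSE, while boot_parse_arg("vfs.root.mountfrom=ufs:/dev/ada0s1a") sets that variable in the environment and returns 0.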
*/ int boot_parse_arg(char *v) { char *n; int howto; #if 0 /* Need to see if this is better or worse than the meat of the #else */ static const char howto_switches[] = "aCdrgDmphsv"; static int howto_masks[] = { RB_ASKNAME, RB_CDROM, RB_KDB, RB_DFLTROOT, RB_GDB, RB_MULTIPLE, RB_MUTE, RB_PAUSE, RB_SERIAL, RB_SINGLE, RB_VERBOSE }; opts = strchr(kargs, '-'); while (opts != NULL) { while (*(++opts) != '\0') { sw = strchr(howto_switches, *opts); if (sw == NULL) break; howto |= howto_masks[sw - howto_switches]; } opts = strchr(opts, '-'); } #else howto = 0; if (*v == '-') { while (*v != '\0') { v++; switch (*v) { case 'a': howto |= RB_ASKNAME; break; case 'C': howto |= RB_CDROM; break; case 'd': howto |= RB_KDB; break; case 'D': howto |= RB_MULTIPLE; break; case 'm': howto |= RB_MUTE; break; case 'g': howto |= RB_GDB; break; case 'h': howto |= RB_SERIAL; break; case 'p': howto |= RB_PAUSE; break; case 'P': howto |= RB_PROBE; break; case 'r': howto |= RB_DFLTROOT; break; case 's': howto |= RB_SINGLE; break; case 'S': SETENV("comconsole_speed", v + 1); v += strlen(v); break; case 'v': howto |= RB_VERBOSE; break; } } } else { n = strsep(&v, "="); if (v == NULL) SETENV(n, "1"); else SETENV(n, v); } #endif return (howto); } /** * @brief break up the command line into args, and pass to boot_parse_arg */ int boot_parse_cmdline_delim(char *cmdline, const char *delim) { char *v; int howto; howto = 0; while ((v = strsep(&cmdline, delim)) != NULL) { if (*v == '\0') continue; howto |= boot_parse_arg(v); } return (howto); } /** * @brief Simplified interface for common 'space separated' args */ int boot_parse_cmdline(char *cmdline) { return (boot_parse_cmdline_delim(cmdline, " \n")); } /** * @brief Pass a vector of strings to boot_parse_arg */ int boot_parse_args(int argc, char *argv[]) { int i, howto; howto = 0; for (i = 1; i < argc; i++) howto |= boot_parse_arg(argv[i]); return (howto); } Index: head/sys/netinet/tcp_hpts.c =================================================================== --- head/sys/netinet/tcp_hpts.c (revision 343754) +++ head/sys/netinet/tcp_hpts.c (revision 343755) @@ -1,1902 +1,1902 @@ /*- - * Copyright (c) 2016-2018 Netflix Inc. + * Copyright (c) 2016-2018 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
* */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_tcpdebug.h" /** * Some notes about usage. * * The tcp_hpts system is designed to provide a high precision timer * system for tcp. Its main purpose is to provide a mechanism for * pacing packets out onto the wire. It can be used in two ways * by a given TCP stack (and those two methods can be used simultaneously). * * First, and probably the main thing it's used by Rack and BBR for, it can * be used to call tcp_output() of a transport stack at some time in the future. * The normal way this is done is that tcp_output() of the stack schedules * itself to be called again by calling tcp_hpts_insert(tcpcb, slot). The * slot is the time from now that the stack wants to be called but it * must be converted to tcp_hpts's notion of slot. This is done with * one of the macros HPTS_MS_TO_SLOTS or HPTS_USEC_TO_SLOTS. So a typical * call from the tcp_output() routine might look like: * * tcp_hpts_insert(tp, HPTS_USEC_TO_SLOTS(550)); * * The above would schedule tcp_output() to be called in 550 microseconds. * Note that if using this mechanism the stack will want to add near * its top a check to prevent unwanted calls (from user land or the * arrival of incoming ack's). So it would add something like: * * if (inp->inp_in_hpts) * return; * * to prevent output processing until the time allotted has gone by. * Of course this is a bare bones example and the stack will probably * have more considerations than just the above. * * Now the tcp_hpts system will call tcp_output in one of two forms; * it will first check to see if the stack has defined a * tfb_tcp_output_wtime() function. If so, that is the routine it * will call; if that function is not defined, then it will call the * tfb_tcp_output() function. The only difference between these * two calls is that the former passes the time in to the function * so the function does not have to access the time (which tcp_hpts * already has). What these functions do is of course totally up * to the individual tcp stack. * * Now the second function (actually two functions I guess :D) * the tcp_hpts system provides is the ability to either abort * a connection (later) or process input on a connection. * Why would you want to do this? To keep processor locality. * * So in order to use the input redirection function the * stack changes its tcp_do_segment() routine to, instead * of processing the data, call the function: * * tcp_queue_pkt_to_input() * * You will note that the arguments to this function look * a lot like tcp_do_segment()'s arguments. This function * will assure that the tcp_hpts system will * call the function tfb_tcp_hpts_do_segment() from the * correct CPU. Note that multiple calls can get pushed * into the tcp_hpts system; this will be indicated by * the next to last argument to tfb_tcp_hpts_do_segment() * (nxt_pkt). If nxt_pkt is a 1, then another packet is * coming. If nxt_pkt is a 0, then this is the last call * that the tcp_hpts system has available for the tcp stack. * * The other point of the input system is to be able to safely * drop a tcp connection without worrying about the recursive * locking that may be occurring on the INP_WLOCK. So if * a stack wants to drop a connection it calls: * * tcp_set_inp_to_drop(tp, ETIMEDOUT) * * to schedule the tcp_hpts system to call * * tcp_drop(tp, drop_reason) * * at a future point. This is quite handy to prevent locking * issues when dropping connections.
* */ #include #include #include #include #include #include #include #include #include /* for proc0 declaration */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TCPSTATES /* for logging */ #include #include #include #include #include /* required for icmp_var.h */ #include /* for ICMP_BANDLIM */ #include #include #include #include #include #include #include #include #include #include #include #include #ifdef tcpdebug #include #endif /* tcpdebug */ #ifdef tcp_offload #include #endif #include "opt_rss.h" MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts"); #ifdef RSS #include #include static int tcp_bind_threads = 1; #else static int tcp_bind_threads = 0; #endif TUNABLE_INT("net.inet.tcp.bind_hptss", &tcp_bind_threads); static uint32_t tcp_hpts_logging_size = DEFAULT_HPTS_LOG; TUNABLE_INT("net.inet.tcp.hpts_logging_sz", &tcp_hpts_logging_size); static struct tcp_hptsi tcp_pace; static void tcp_wakehpts(struct tcp_hpts_entry *p); static void tcp_wakeinput(struct tcp_hpts_entry *p); static void tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv); static void tcp_hptsi(struct tcp_hpts_entry *hpts, struct timeval *ctick); static void tcp_hpts_thread(void *ctx); static void tcp_init_hptsi(void *st); int32_t tcp_min_hptsi_time = DEFAULT_MIN_SLEEP; static int32_t tcp_hpts_callout_skip_swi = 0; SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW, 0, "TCP Hpts controls"); #define timersub(tvp, uvp, vvp) \ do { \ (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec; \ (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec; \ if ((vvp)->tv_usec < 0) { \ (vvp)->tv_sec--; \ (vvp)->tv_usec += 1000000; \ } \ } while (0) static int32_t logging_on = 0; static int32_t hpts_sleep_max = (NUM_OF_HPTSI_SLOTS - 2); static int32_t tcp_hpts_precision = 120; SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, precision, CTLFLAG_RW, &tcp_hpts_precision, 120, "Value for PRE() precision of callout"); SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, logging, CTLFLAG_RW, &logging_on, 0, "Turn on logging if compiled in"); counter_u64_t hpts_loops; SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, loops, CTLFLAG_RD, &hpts_loops, "Number of times hpts had to loop to catch up"); counter_u64_t back_tosleep; SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, no_tcbsfound, CTLFLAG_RD, &back_tosleep, "Number of times hpts found no tcbs"); static int32_t in_newts_every_tcb = 0; SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, in_tsperpcb, CTLFLAG_RW, &in_newts_every_tcb, 0, "Do we have a new cts every tcb we process for input"); static int32_t in_ts_percision = 0; SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, in_tspercision, CTLFLAG_RW, &in_ts_percision, 0, "Do we use percise timestamp for clients on input"); static int32_t out_newts_every_tcb = 0; SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tsperpcb, CTLFLAG_RW, &out_newts_every_tcb, 0, "Do we have a new cts every tcb we process for output"); static int32_t out_ts_percision = 0; SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tspercision, CTLFLAG_RW, &out_ts_percision, 0, "Do we use a percise timestamp for every output cts"); SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, maxsleep, CTLFLAG_RW, &hpts_sleep_max, 0, "The maximum time the hpts will sleep <1 - 254>"); SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, minsleep, CTLFLAG_RW, &tcp_min_hptsi_time, 0, "The minimum time the hpts must sleep before processing more slots"); SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, skip_swi, CTLFLAG_RW, &tcp_hpts_callout_skip_swi, 0, "Do we have the 
callout call directly to the hpts?"); static void __tcp_hpts_log_it(struct tcp_hpts_entry *hpts, struct inpcb *inp, int event, uint32_t slot, uint32_t ticknow, int32_t line) { struct hpts_log *pl; HPTS_MTX_ASSERT(hpts); if (hpts->p_log == NULL) return; pl = &hpts->p_log[hpts->p_log_at]; hpts->p_log_at++; if (hpts->p_log_at >= hpts->p_logsize) { hpts->p_log_at = 0; hpts->p_log_wrapped = 1; } pl->inp = inp; if (inp) { pl->t_paceslot = inp->inp_hptsslot; pl->t_hptsreq = inp->inp_hpts_request; pl->p_onhpts = inp->inp_in_hpts; pl->p_oninput = inp->inp_in_input; } else { pl->t_paceslot = 0; pl->t_hptsreq = 0; pl->p_onhpts = 0; pl->p_oninput = 0; } pl->is_notempty = 1; pl->event = event; pl->line = line; pl->cts = tcp_get_usecs(NULL); pl->p_curtick = hpts->p_curtick; pl->p_prevtick = hpts->p_prevtick; pl->p_on_queue_cnt = hpts->p_on_queue_cnt; pl->ticknow = ticknow; pl->slot_req = slot; pl->p_nxt_slot = hpts->p_nxt_slot; pl->p_cur_slot = hpts->p_cur_slot; pl->p_hpts_sleep_time = hpts->p_hpts_sleep_time; pl->p_flags = (hpts->p_cpu & 0x7f); pl->p_flags <<= 7; pl->p_flags |= (hpts->p_num & 0x7f); pl->p_flags <<= 2; if (hpts->p_hpts_active) { pl->p_flags |= HPTS_HPTS_ACTIVE; } } #define tcp_hpts_log_it(a, b, c, d, e) __tcp_hpts_log_it(a, b, c, d, e, __LINE__) static void hpts_timeout_swi(void *arg) { struct tcp_hpts_entry *hpts; hpts = (struct tcp_hpts_entry *)arg; swi_sched(hpts->ie_cookie, 0); } static void hpts_timeout_dir(void *arg) { tcp_hpts_thread(arg); } static inline void hpts_sane_pace_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int clear) { #ifdef INVARIANTS if (mtx_owned(&hpts->p_mtx) == 0) { /* We don't own the mutex? */ panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp); } if (hpts->p_cpu != inp->inp_hpts_cpu) { /* It is not the right cpu/mutex? */ panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp); } if (inp->inp_in_hpts == 0) { /* We are not on the hpts? */ panic("%s: hpts:%p inp:%p not on the hpts?", __FUNCTION__, hpts, inp); } if (TAILQ_EMPTY(head) && (hpts->p_on_queue_cnt != 0)) { /* We should not be empty with a queue count */ panic("%s hpts:%p hpts bucket empty but cnt:%d", __FUNCTION__, hpts, hpts->p_on_queue_cnt); } #endif TAILQ_REMOVE(head, inp, inp_hpts); hpts->p_on_queue_cnt--; if (hpts->p_on_queue_cnt < 0) { /* Count should not go negative .. */ #ifdef INVARIANTS panic("Hpts goes negative inp:%p hpts:%p", inp, hpts); #endif hpts->p_on_queue_cnt = 0; } if (clear) { inp->inp_hpts_request = 0; inp->inp_in_hpts = 0; } } static inline void hpts_sane_pace_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int line, int noref) { #ifdef INVARIANTS if (mtx_owned(&hpts->p_mtx) == 0) { /* We don't own the mutex? */ panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp); } if (hpts->p_cpu != inp->inp_hpts_cpu) { /* It is not the right cpu/mutex? */ panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp); } if ((noref == 0) && (inp->inp_in_hpts == 1)) { /* We are already on the hpts? */ panic("%s: hpts:%p inp:%p already on the hpts?", __FUNCTION__, hpts, inp); } #endif TAILQ_INSERT_TAIL(head, inp, inp_hpts); inp->inp_in_hpts = 1; hpts->p_on_queue_cnt++; if (noref == 0) { in_pcbref(inp); } } static inline void hpts_sane_input_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, int clear) { #ifdef INVARIANTS if (mtx_owned(&hpts->p_mtx) == 0) { /* We don't own the mutex? 
*/ panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp); } if (hpts->p_cpu != inp->inp_input_cpu) { /* It is not the right cpu/mutex? */ panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp); } if (inp->inp_in_input == 0) { /* We are not on the input hpts? */ panic("%s: hpts:%p inp:%p not on the input hpts?", __FUNCTION__, hpts, inp); } #endif TAILQ_REMOVE(&hpts->p_input, inp, inp_input); hpts->p_on_inqueue_cnt--; if (hpts->p_on_inqueue_cnt < 0) { #ifdef INVARIANTS panic("Hpts in goes negative inp:%p hpts:%p", inp, hpts); #endif hpts->p_on_inqueue_cnt = 0; } #ifdef INVARIANTS if (TAILQ_EMPTY(&hpts->p_input) && (hpts->p_on_inqueue_cnt != 0)) { /* We should not be empty with a queue count */ panic("%s hpts:%p in_hpts input empty but cnt:%d", __FUNCTION__, hpts, hpts->p_on_inqueue_cnt); } #endif if (clear) inp->inp_in_input = 0; } static inline void hpts_sane_input_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, int line) { #ifdef INVARIANTS if (mtx_owned(&hpts->p_mtx) == 0) { /* We don't own the mutex? */ panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp); } if (hpts->p_cpu != inp->inp_input_cpu) { /* It is not the right cpu/mutex? */ panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp); } if (inp->inp_in_input == 1) { /* We are already on the input hpts? */ panic("%s: hpts:%p inp:%p already on the input hpts?", __FUNCTION__, hpts, inp); } #endif TAILQ_INSERT_TAIL(&hpts->p_input, inp, inp_input); inp->inp_in_input = 1; hpts->p_on_inqueue_cnt++; in_pcbref(inp); } static int sysctl_tcp_hpts_log(SYSCTL_HANDLER_ARGS) { struct tcp_hpts_entry *hpts; size_t sz; int32_t logging_was, i; int32_t error = 0; /* * HACK: Turn off logging so no locks are required this really needs * a memory barrier :) */ logging_was = logging_on; logging_on = 0; if (!req->oldptr) { /* How much? 
*/ sz = 0; for (i = 0; i < tcp_pace.rp_num_hptss; i++) { hpts = tcp_pace.rp_ent[i]; if (hpts->p_log == NULL) continue; sz += (sizeof(struct hpts_log) * hpts->p_logsize); } error = SYSCTL_OUT(req, 0, sz); } else { for (i = 0; i < tcp_pace.rp_num_hptss; i++) { hpts = tcp_pace.rp_ent[i]; if (hpts->p_log == NULL) continue; if (hpts->p_log_wrapped) sz = (sizeof(struct hpts_log) * hpts->p_logsize); else sz = (sizeof(struct hpts_log) * hpts->p_log_at); error = SYSCTL_OUT(req, hpts->p_log, sz); } } logging_on = logging_was; return (error); } SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, log, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_tcp_hpts_log, "A", "tcp hptsi log"); static void tcp_wakehpts(struct tcp_hpts_entry *hpts) { HPTS_MTX_ASSERT(hpts); swi_sched(hpts->ie_cookie, 0); if (hpts->p_hpts_active == 2) { /* Rare sleeping on an ENOBUF */ wakeup_one(hpts); } } static void tcp_wakeinput(struct tcp_hpts_entry *hpts) { HPTS_MTX_ASSERT(hpts); swi_sched(hpts->ie_cookie, 0); if (hpts->p_hpts_active == 2) { /* Rare sleeping on an ENOBUF */ wakeup_one(hpts); } } struct tcp_hpts_entry * tcp_cur_hpts(struct inpcb *inp) { int32_t hpts_num; struct tcp_hpts_entry *hpts; hpts_num = inp->inp_hpts_cpu; hpts = tcp_pace.rp_ent[hpts_num]; return (hpts); } struct tcp_hpts_entry * tcp_hpts_lock(struct inpcb *inp) { struct tcp_hpts_entry *hpts; int32_t hpts_num; again: hpts_num = inp->inp_hpts_cpu; hpts = tcp_pace.rp_ent[hpts_num]; #ifdef INVARIANTS if (mtx_owned(&hpts->p_mtx)) { panic("Hpts:%p owns mtx prior-to lock line:%d", hpts, __LINE__); } #endif mtx_lock(&hpts->p_mtx); if (hpts_num != inp->inp_hpts_cpu) { mtx_unlock(&hpts->p_mtx); goto again; } return (hpts); } struct tcp_hpts_entry * tcp_input_lock(struct inpcb *inp) { struct tcp_hpts_entry *hpts; int32_t hpts_num; again: hpts_num = inp->inp_input_cpu; hpts = tcp_pace.rp_ent[hpts_num]; #ifdef INVARIANTS if (mtx_owned(&hpts->p_mtx)) { panic("Hpts:%p owns mtx prior-to lock line:%d", hpts, __LINE__); } #endif mtx_lock(&hpts->p_mtx); if (hpts_num != inp->inp_input_cpu) { mtx_unlock(&hpts->p_mtx); goto again; } return (hpts); } static void tcp_remove_hpts_ref(struct inpcb *inp, struct tcp_hpts_entry *hpts, int line) { int32_t add_freed; if (inp->inp_flags2 & INP_FREED) { /* * Need to play a special trick so that in_pcbrele_wlocked * does not return 1 when it really should have returned 0. */ add_freed = 1; inp->inp_flags2 &= ~INP_FREED; } else { add_freed = 0; } #ifndef INP_REF_DEBUG if (in_pcbrele_wlocked(inp)) { /* * This should not happen. We have the inpcb referred to by * the main socket (why we are called) and the hpts. It * should always return 0. */ panic("inpcb:%p release ret 1", inp); } #else if (__in_pcbrele_wlocked(inp, line)) { /* * This should not happen. We have the inpcb referred to by * the main socket (why we are called) and the hpts. It * should always return 0.
*/ panic("inpcb:%p release ret 1", inp); } #endif if (add_freed) { inp->inp_flags2 |= INP_FREED; } } static void tcp_hpts_remove_locked_output(struct tcp_hpts_entry *hpts, struct inpcb *inp, int32_t flags, int32_t line) { if (inp->inp_in_hpts) { hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], 1); tcp_remove_hpts_ref(inp, hpts, line); } } static void tcp_hpts_remove_locked_input(struct tcp_hpts_entry *hpts, struct inpcb *inp, int32_t flags, int32_t line) { HPTS_MTX_ASSERT(hpts); if (inp->inp_in_input) { hpts_sane_input_remove(hpts, inp, 1); tcp_remove_hpts_ref(inp, hpts, line); } } /* * Called normally with the INP_LOCKED but it * does not matter, the hpts lock is the key * but the lock order allows us to hold the * INP lock and then get the hpts lock. * * Valid values in the flags are * HPTS_REMOVE_OUTPUT - remove from the output of the hpts. * HPTS_REMOVE_INPUT - remove from the input of the hpts. * Note that you can or both values together and get two * actions. */ void __tcp_hpts_remove(struct inpcb *inp, int32_t flags, int32_t line) { struct tcp_hpts_entry *hpts; INP_WLOCK_ASSERT(inp); if (flags & HPTS_REMOVE_OUTPUT) { hpts = tcp_hpts_lock(inp); tcp_hpts_remove_locked_output(hpts, inp, flags, line); mtx_unlock(&hpts->p_mtx); } if (flags & HPTS_REMOVE_INPUT) { hpts = tcp_input_lock(inp); tcp_hpts_remove_locked_input(hpts, inp, flags, line); mtx_unlock(&hpts->p_mtx); } } static inline int hpts_tick(struct tcp_hpts_entry *hpts, int32_t plus) { return ((hpts->p_prevtick + plus) % NUM_OF_HPTSI_SLOTS); } static int tcp_queue_to_hpts_immediate_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line, int32_t noref) { int32_t need_wake = 0; uint32_t ticknow = 0; HPTS_MTX_ASSERT(hpts); if (inp->inp_in_hpts == 0) { /* Ok we need to set it on the hpts in the current slot */ if (hpts->p_hpts_active == 0) { /* A sleeping hpts we want in next slot to run */ if (logging_on) { tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_SLEEPER, 0, hpts_tick(hpts, 1)); } inp->inp_hptsslot = hpts_tick(hpts, 1); inp->inp_hpts_request = 0; if (logging_on) { tcp_hpts_log_it(hpts, inp, HPTSLOG_SLEEP_BEFORE, 1, ticknow); } need_wake = 1; } else if ((void *)inp == hpts->p_inp) { /* * We can't allow you to go into the same slot we * are in. We must put you out. */ inp->inp_hptsslot = hpts->p_nxt_slot; } else inp->inp_hptsslot = hpts->p_cur_slot; hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref); inp->inp_hpts_request = 0; if (logging_on) { tcp_hpts_log_it(hpts, inp, HPTSLOG_IMMEDIATE, 0, 0); } if (need_wake) { /* * Activate the hpts if it is sleeping and its * timeout is not 1. 
*/ if (logging_on) { tcp_hpts_log_it(hpts, inp, HPTSLOG_WAKEUP_HPTS, 0, ticknow); } hpts->p_direct_wake = 1; tcp_wakehpts(hpts); } } return (need_wake); } int __tcp_queue_to_hpts_immediate(struct inpcb *inp, int32_t line) { int32_t ret; struct tcp_hpts_entry *hpts; INP_WLOCK_ASSERT(inp); hpts = tcp_hpts_lock(inp); ret = tcp_queue_to_hpts_immediate_locked(inp, hpts, line, 0); mtx_unlock(&hpts->p_mtx); return (ret); } static void tcp_hpts_insert_locked(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t slot, uint32_t cts, int32_t line, struct hpts_diag *diag, int32_t noref) { int32_t need_new_to = 0; int32_t need_wakeup = 0; uint32_t largest_slot; uint32_t ticknow = 0; uint32_t slot_calc; HPTS_MTX_ASSERT(hpts); if (diag) { memset(diag, 0, sizeof(struct hpts_diag)); diag->p_hpts_active = hpts->p_hpts_active; diag->p_nxt_slot = hpts->p_nxt_slot; diag->p_cur_slot = hpts->p_cur_slot; diag->slot_req = slot; } if ((inp->inp_in_hpts == 0) || noref) { inp->inp_hpts_request = slot; if (slot == 0) { /* Immediate */ tcp_queue_to_hpts_immediate_locked(inp, hpts, line, noref); return; } if (hpts->p_hpts_active) { /* * It's slot - 1, since nxt_slot is the next tick that * will go off while the hpts is awake */ if (logging_on) { tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_NORMAL, slot, 0); } /* * We want to make sure that we don't place an inp in * the range of p_cur_slot <-> p_nxt_slot. If we * take from p_nxt_slot to the end, plus p_cur_slot, * and then take away 2, we will know the max number * of slots we can use. */ if (hpts->p_nxt_slot > hpts->p_cur_slot) { /* * Non-wrap case: nxt_slot <-> cur_slot we * don't want to land in. So the diff gives * us what is taken away from the number of * slots. */ largest_slot = NUM_OF_HPTSI_SLOTS - (hpts->p_nxt_slot - hpts->p_cur_slot); } else if (hpts->p_nxt_slot == hpts->p_cur_slot) { largest_slot = NUM_OF_HPTSI_SLOTS - 2; } else { /* * Wrap case, so the diff gives us the number * of slots that we can land in. */ largest_slot = hpts->p_cur_slot - hpts->p_nxt_slot; } /* * We take away two so we never have a problem (20 * usecs) out of 1024000 usecs */ largest_slot -= 2; if (inp->inp_hpts_request > largest_slot) { /* * Restrict max jump of slots and remember * the leftover */ slot = largest_slot; inp->inp_hpts_request -= largest_slot; } else { /* This one will run when we hit it */ inp->inp_hpts_request = 0; } if (hpts->p_nxt_slot == hpts->p_cur_slot) slot_calc = (hpts->p_nxt_slot + slot) % NUM_OF_HPTSI_SLOTS; else slot_calc = (hpts->p_nxt_slot + slot - 1) % NUM_OF_HPTSI_SLOTS; if (slot_calc == hpts->p_cur_slot) { #ifdef INVARIANTS /* TSNH */ panic("Hpts:%p impossible slot calculation slot_calc:%u slot:%u largest:%u\n", hpts, slot_calc, slot, largest_slot); #endif if (slot_calc) slot_calc--; else slot_calc = NUM_OF_HPTSI_SLOTS - 1; } inp->inp_hptsslot = slot_calc; if (diag) { diag->inp_hptsslot = inp->inp_hptsslot; } } else { /* * The hpts is sleeping; we need to figure out where * it will wake up at and if we need to reschedule * its time-out. */ uint32_t have_slept, yet_to_sleep; uint32_t slot_now; struct timeval tv; ticknow = tcp_gethptstick(&tv); slot_now = ticknow % NUM_OF_HPTSI_SLOTS; /* * The user wants to be inserted at (slot_now + * slot) % NUM_OF_HPTSI_SLOTS, so let's set that up.
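 * (Worked example of the wheel arithmetic, with illustrative numbers
 * only: if slot_now = 102398 and the caller asked for slot = 5, the
 * target is (102398 + 5) % 102400 = 3, i.e. the request wraps off the
 * end of the wheel and lands in slot 3, which is 5 ticks -- 50 usec at
 * 10 usec per slot -- in the future.)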
*/ largest_slot = NUM_OF_HPTSI_SLOTS - 2; if (inp->inp_hpts_request > largest_slot) { /* Adjust the residual in inp_hpts_request */ slot = largest_slot; inp->inp_hpts_request -= largest_slot; } else { /* No residual; it all fits */ inp->inp_hpts_request = 0; } inp->inp_hptsslot = (slot_now + slot) % NUM_OF_HPTSI_SLOTS; if (diag) { diag->slot_now = slot_now; diag->inp_hptsslot = inp->inp_hptsslot; diag->p_on_min_sleep = hpts->p_on_min_sleep; } if (logging_on) { tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_SLEEPER, slot, ticknow); } /* Now, do we need to restart the hpts's timer? */ if (TSTMP_GT(ticknow, hpts->p_curtick)) have_slept = ticknow - hpts->p_curtick; else have_slept = 0; if (have_slept < hpts->p_hpts_sleep_time) { /* This should be what happens */ yet_to_sleep = hpts->p_hpts_sleep_time - have_slept; } else { /* We are over-due */ yet_to_sleep = 0; need_wakeup = 1; } if (diag) { diag->have_slept = have_slept; diag->yet_to_sleep = yet_to_sleep; diag->hpts_sleep_time = hpts->p_hpts_sleep_time; } if ((hpts->p_on_min_sleep == 0) && (yet_to_sleep > slot)) { /* * We need to reschedule the hpts's time-out. */ hpts->p_hpts_sleep_time = slot; need_new_to = slot * HPTS_TICKS_PER_USEC; } } hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref); if (logging_on) { tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERTED, slot, ticknow); } /* * Now, how far out is the hpts sleeping? If active is 1, it's * up and ticking and we do nothing; otherwise we may need to * reschedule its callout if need_new_to is set from above. */ if (need_wakeup) { if (logging_on) { tcp_hpts_log_it(hpts, inp, HPTSLOG_RESCHEDULE, 1, 0); } hpts->p_direct_wake = 1; tcp_wakehpts(hpts); if (diag) { diag->need_new_to = 0; diag->co_ret = 0xffff0000; } } else if (need_new_to) { int32_t co_ret; struct timeval tv; sbintime_t sb; tv.tv_sec = 0; tv.tv_usec = 0; while (need_new_to > HPTS_USEC_IN_SEC) { tv.tv_sec++; need_new_to -= HPTS_USEC_IN_SEC; } tv.tv_usec = need_new_to; sb = tvtosbt(tv); if (tcp_hpts_callout_skip_swi == 0) { co_ret = callout_reset_sbt_on(&hpts->co, sb, 0, hpts_timeout_swi, hpts, hpts->p_cpu, (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))); } else { co_ret = callout_reset_sbt_on(&hpts->co, sb, 0, hpts_timeout_dir, hpts, hpts->p_cpu, C_PREL(tcp_hpts_precision)); } if (diag) { diag->need_new_to = need_new_to; diag->co_ret = co_ret; } } } else { #ifdef INVARIANTS panic("Hpts:%p tp:%p already on hpts and add?", hpts, inp); #endif } } uint32_t tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag) { struct tcp_hpts_entry *hpts; uint32_t slot_on, cts; struct timeval tv; /* * We now return the next-slot the hpts will be on, beyond its * current run (if up) or where it was when it stopped if it is * sleeping. */ INP_WLOCK_ASSERT(inp); hpts = tcp_hpts_lock(inp); if (in_ts_percision) microuptime(&tv); else getmicrouptime(&tv); cts = tcp_tv_to_usectick(&tv); tcp_hpts_insert_locked(hpts, inp, slot, cts, line, diag, 0); slot_on = hpts->p_nxt_slot; mtx_unlock(&hpts->p_mtx); return (slot_on); } uint32_t __tcp_hpts_insert(struct inpcb *inp, uint32_t slot, int32_t line) { return (tcp_hpts_insert_diag(inp, slot, line, NULL)); } int __tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line) { int32_t retval = 0; HPTS_MTX_ASSERT(hpts); if (inp->inp_in_input == 0) { /* Ok, we need to set it on the hpts in the current slot */ hpts_sane_input_insert(hpts, inp, line); retval = 1; if (hpts->p_hpts_active == 0) { /* * Activate the hpts if it is sleeping.
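 * For reference, the return-value convention used here, inferred from
 * the assignments below and in the else-branch: 0 = already queued and
 * the hpts was awake, nothing to do; 1 = newly queued, hpts already
 * awake; 2 = newly queued and we woke a sleeping hpts; 4 = already
 * queued but we still had to wake a sleeping hpts.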
*/ if (logging_on) { tcp_hpts_log_it(hpts, inp, HPTSLOG_WAKEUP_INPUT, 0, 0); } retval = 2; hpts->p_direct_wake = 1; tcp_wakeinput(hpts); } } else if (hpts->p_hpts_active == 0) { retval = 4; hpts->p_direct_wake = 1; tcp_wakeinput(hpts); } return (retval); } void tcp_queue_pkt_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, int32_t tlen, int32_t drop_hdrlen, uint8_t iptos) { /* Set up the packet for input first */ INP_WLOCK_ASSERT(tp->t_inpcb); m->m_pkthdr.pace_thoff = (uint16_t) ((caddr_t)th - mtod(m, caddr_t)); m->m_pkthdr.pace_tlen = (uint16_t) tlen; m->m_pkthdr.pace_drphdrlen = drop_hdrlen; m->m_pkthdr.pace_tos = iptos; m->m_pkthdr.pace_lock = (curthread->td_epochnest != 0); if (tp->t_in_pkt == NULL) { tp->t_in_pkt = m; tp->t_tail_pkt = m; } else { tp->t_tail_pkt->m_nextpkt = m; tp->t_tail_pkt = m; } } int32_t __tcp_queue_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, int32_t line) { struct tcp_hpts_entry *hpts; int32_t ret; tcp_queue_pkt_to_input(tp, m, th, tlen, drop_hdrlen, iptos); hpts = tcp_input_lock(tp->t_inpcb); ret = __tcp_queue_to_input_locked(tp->t_inpcb, hpts, line); mtx_unlock(&hpts->p_mtx); return (ret); } void __tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason, int32_t line) { struct tcp_hpts_entry *hpts; struct tcpcb *tp; tp = intotcpcb(inp); hpts = tcp_input_lock(tp->t_inpcb); if (inp->inp_in_input == 0) { /* Ok, we need to set it on the hpts in the current slot */ hpts_sane_input_insert(hpts, inp, line); if (hpts->p_hpts_active == 0) { /* * Activate the hpts if it is sleeping. */ hpts->p_direct_wake = 1; tcp_wakeinput(hpts); } } else if (hpts->p_hpts_active == 0) { hpts->p_direct_wake = 1; tcp_wakeinput(hpts); } inp->inp_hpts_drop_reas = reason; mtx_unlock(&hpts->p_mtx); } static uint16_t hpts_random_cpu(struct inpcb *inp) { /* * No flow type set; distribute the load randomly. */ uint16_t cpuid; uint32_t ran; /* * If one has been set, use it, i.e. we want both in and out on the * same hpts. */ if (inp->inp_input_cpu_set) { return (inp->inp_input_cpu); } else if (inp->inp_hpts_cpu_set) { return (inp->inp_hpts_cpu); } /* Nothing set; use a random number */ ran = arc4random(); cpuid = (ran & 0xffff) % mp_ncpus; return (cpuid); } static uint16_t hpts_cpuid(struct inpcb *inp) { u_int cpuid; /* * If one has been set, use it, i.e. we want both in and out on the * same hpts. */ if (inp->inp_input_cpu_set) { return (inp->inp_input_cpu); } else if (inp->inp_hpts_cpu_set) { return (inp->inp_hpts_cpu); } /* If one is set the other must be the same */ #ifdef RSS cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype); if (cpuid == NETISR_CPUID_NONE) return (hpts_random_cpu(inp)); else return (cpuid); #else /* * We don't have a flowid -> cpuid mapping, so cheat and just map * unknown cpuids to curcpu. Not the best, but apparently better * than defaulting to swi 0. */ if (inp->inp_flowtype != M_HASHTYPE_NONE) { cpuid = inp->inp_flowid % mp_ncpus; return (cpuid); } cpuid = hpts_random_cpu(inp); return (cpuid); #endif } /* * Do NOT try to optimize the processing of inp's * by first pulling off all the inp's into a temporary * list (e.g. TAILQ_CONCAT). If you do that, the subtle * interactions of switching CPU's will kill you because of * problems in the linked list manipulation. Basically * you would switch cpu's with the hpts mutex locked * but then while you were processing one of the inp's * some other one that you switched will get a new * packet on the different CPU. It will insert it * on the new hpts's input list.
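 * (A concrete failure sequence: hpts A takes its whole input list
 * private and drops its mutex; connection X from that batch migrates
 * to hpts B's CPU; a new segment for X arrives and hpts B links X onto
 * its own p_input -- through the very same inp_input TAILQ entry that
 * A's private list is still threaded on. One inp, one list entry, two
 * lists.)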
Creating a temporary * link in the inp will not fix it either, since * the other hpts will be doing the same thing and * you will both end up using the temporary link. * * You will die in an ASSERT for tailq corruption if you * run INVARIANTS, or you will die horribly without * INVARIANTS in some unknown way with a corrupt linked * list. */ static void tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv) { struct mbuf *m, *n; struct tcpcb *tp; struct inpcb *inp; uint16_t drop_reason; int16_t set_cpu; uint32_t did_prefetch = 0; int32_t ti_locked = TI_UNLOCKED; struct epoch_tracker et; HPTS_MTX_ASSERT(hpts); while ((inp = TAILQ_FIRST(&hpts->p_input)) != NULL) { HPTS_MTX_ASSERT(hpts); hpts_sane_input_remove(hpts, inp, 0); if (inp->inp_input_cpu_set == 0) { set_cpu = 1; } else { set_cpu = 0; } hpts->p_inp = inp; drop_reason = inp->inp_hpts_drop_reas; inp->inp_in_input = 0; mtx_unlock(&hpts->p_mtx); CURVNET_SET(inp->inp_vnet); if (drop_reason) { INP_INFO_RLOCK_ET(&V_tcbinfo, et); ti_locked = TI_RLOCKED; } else { ti_locked = TI_UNLOCKED; } INP_WLOCK(inp); if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) || (inp->inp_flags2 & INP_FREED)) { out: hpts->p_inp = NULL; if (ti_locked == TI_RLOCKED) { INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); } if (in_pcbrele_wlocked(inp) == 0) { INP_WUNLOCK(inp); } ti_locked = TI_UNLOCKED; CURVNET_RESTORE(); mtx_lock(&hpts->p_mtx); continue; } tp = intotcpcb(inp); if ((tp == NULL) || (tp->t_inpcb == NULL)) { goto out; } if (drop_reason) { /* This tcb is being destroyed for drop_reason */ m = tp->t_in_pkt; if (m) n = m->m_nextpkt; else n = NULL; tp->t_in_pkt = NULL; while (m) { m_freem(m); m = n; if (m) n = m->m_nextpkt; } tp = tcp_drop(tp, drop_reason); INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); if (tp == NULL) { INP_WLOCK(inp); } if (in_pcbrele_wlocked(inp) == 0) INP_WUNLOCK(inp); CURVNET_RESTORE(); mtx_lock(&hpts->p_mtx); continue; } if (set_cpu) { /* * Set up so the next time we will move to the right * CPU. This should be a rare event. It will * sometimes happen when we are the client side * (usually not the server). Somehow tcp_output() * gets called before the tcp_do_segment() sets the * initial state. This means the r_cpu and r_hpts_cpu * are 0. We get on the hpts, and then tcp_input() * gets called setting up the r_cpu to the correct * value. The hpts goes off and sees the mis-match.
* We simply correct it here and the CPU will switch * to the new hpts next time the tcb gets added to * the hpts (not this time) :-) */ tcp_set_hpts(inp); } m = tp->t_in_pkt; n = NULL; if (m != NULL && (m->m_pkthdr.pace_lock == TI_RLOCKED || tp->t_state != TCPS_ESTABLISHED)) { ti_locked = TI_RLOCKED; INP_INFO_RLOCK_ET(&V_tcbinfo, et); m = tp->t_in_pkt; } if (in_newts_every_tcb) { if (in_ts_percision) microuptime(tv); else getmicrouptime(tv); } if (tp->t_fb_ptr != NULL) { kern_prefetch(tp->t_fb_ptr, &did_prefetch); did_prefetch = 1; } /* Any input work to do? If so, do it first */ if ((m != NULL) && (m == tp->t_in_pkt)) { struct tcphdr *th; int32_t tlen, drop_hdrlen, nxt_pkt; uint8_t iptos; n = m->m_nextpkt; tp->t_in_pkt = tp->t_tail_pkt = NULL; while (m) { th = (struct tcphdr *)(mtod(m, caddr_t)+m->m_pkthdr.pace_thoff); tlen = m->m_pkthdr.pace_tlen; drop_hdrlen = m->m_pkthdr.pace_drphdrlen; iptos = m->m_pkthdr.pace_tos; m->m_nextpkt = NULL; if (n) nxt_pkt = 1; else nxt_pkt = 0; inp->inp_input_calls = 1; if (tp->t_fb->tfb_tcp_hpts_do_segment) { /* Use the hpts specific do_segment */ (*tp->t_fb->tfb_tcp_hpts_do_segment) (m, th, inp->inp_socket, tp, drop_hdrlen, tlen, iptos, nxt_pkt, tv); } else { /* Use the default do_segment */ (*tp->t_fb->tfb_tcp_do_segment) (m, th, inp->inp_socket, tp, drop_hdrlen, tlen, iptos); } if (ti_locked == TI_RLOCKED) INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); /* * Do segment returns unlocked; we need the * lock again, and we also need some kasserts * here. */ INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo); INP_UNLOCK_ASSERT(inp); m = n; if (m) n = m->m_nextpkt; if (m != NULL && m->m_pkthdr.pace_lock == TI_RLOCKED) { INP_INFO_RLOCK_ET(&V_tcbinfo, et); ti_locked = TI_RLOCKED; } else ti_locked = TI_UNLOCKED; INP_WLOCK(inp); /* * Since we have an opening here, we must * re-check if the tcb went away while we * were getting the lock(s). */ if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) || (inp->inp_flags2 & INP_FREED)) { while (m) { m_freem(m); m = n; if (m) n = m->m_nextpkt; } goto out; } /* * Now that we hold the INP lock, check if * we need to upgrade our lock.
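 * The working rule, broadly: segments for a connection that is not yet
 * in ESTABLISHED may drive pcb-level state changes, so they are run
 * under the tcbinfo read lock, while steady-state ESTABLISHED traffic
 * can be processed with just the INP lock. Hence this re-check after
 * every lock reacquisition.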
*/ if (ti_locked == TI_UNLOCKED && (tp->t_state != TCPS_ESTABLISHED)) { ti_locked = TI_RLOCKED; INP_INFO_RLOCK_ET(&V_tcbinfo, et); } } /** end while(m) */ } /** end if ((m != NULL) && (m == tp->t_in_pkt)) */ if (in_pcbrele_wlocked(inp) == 0) INP_WUNLOCK(inp); if (ti_locked == TI_RLOCKED) INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo); INP_UNLOCK_ASSERT(inp); ti_locked = TI_UNLOCKED; mtx_lock(&hpts->p_mtx); hpts->p_inp = NULL; CURVNET_RESTORE(); } } static int tcp_hpts_est_run(struct tcp_hpts_entry *hpts) { int32_t ticks_to_run; if (hpts->p_prevtick && (SEQ_GT(hpts->p_curtick, hpts->p_prevtick))) { ticks_to_run = hpts->p_curtick - hpts->p_prevtick; if (ticks_to_run >= (NUM_OF_HPTSI_SLOTS - 1)) { ticks_to_run = NUM_OF_HPTSI_SLOTS - 2; } } else { if (hpts->p_prevtick == hpts->p_curtick) { /* This happens when we get woken up right away */ return (-1); } ticks_to_run = 1; } /* Set in where we will be when we catch up */ hpts->p_nxt_slot = (hpts->p_cur_slot + ticks_to_run) % NUM_OF_HPTSI_SLOTS; if (hpts->p_nxt_slot == hpts->p_cur_slot) { panic("Impossible math -- hpts:%p p_nxt_slot:%d p_cur_slot:%d ticks_to_run:%d", hpts, hpts->p_nxt_slot, hpts->p_cur_slot, ticks_to_run); } return (ticks_to_run); } static void tcp_hptsi(struct tcp_hpts_entry *hpts, struct timeval *ctick) { struct tcpcb *tp; struct inpcb *inp = NULL, *ninp; struct timeval tv; int32_t ticks_to_run, i, error, tick_now, interum_tick; int32_t paced_cnt = 0; int32_t did_prefetch = 0; int32_t prefetch_ninp = 0; int32_t prefetch_tp = 0; uint32_t cts; int16_t set_cpu; HPTS_MTX_ASSERT(hpts); hpts->p_curtick = tcp_tv_to_hptstick(ctick); cts = tcp_tv_to_usectick(ctick); memcpy(&tv, ctick, sizeof(struct timeval)); hpts->p_cur_slot = hpts_tick(hpts, 1); /* Figure out if we had missed ticks */ again: HPTS_MTX_ASSERT(hpts); ticks_to_run = tcp_hpts_est_run(hpts); if (!TAILQ_EMPTY(&hpts->p_input)) { tcp_input_data(hpts, &tv); } #ifdef INVARIANTS if (TAILQ_EMPTY(&hpts->p_input) && (hpts->p_on_inqueue_cnt != 0)) { panic("tp:%p in_hpts input empty but cnt:%d", hpts, hpts->p_on_inqueue_cnt); } #endif HPTS_MTX_ASSERT(hpts); /* Reset the ticks to run and time if we need to */ interum_tick = tcp_gethptstick(&tv); if (interum_tick != hpts->p_curtick) { /* Save off the new time we execute to */ *ctick = tv; hpts->p_curtick = interum_tick; cts = tcp_tv_to_usectick(&tv); hpts->p_cur_slot = hpts_tick(hpts, 1); ticks_to_run = tcp_hpts_est_run(hpts); } if (ticks_to_run == -1) { goto no_run; } if (logging_on) { tcp_hpts_log_it(hpts, inp, HPTSLOG_SETTORUN, ticks_to_run, 0); } if (hpts->p_on_queue_cnt == 0) { goto no_one; } HPTS_MTX_ASSERT(hpts); for (i = 0; i < ticks_to_run; i++) { /* * Calculate our delay; if there are no extra ticks, there * was not any */ hpts->p_delayed_by = (ticks_to_run - (i + 1)) * HPTS_TICKS_PER_USEC; HPTS_MTX_ASSERT(hpts); while ((inp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_cur_slot])) != NULL) { /* For debugging */ if (logging_on) { tcp_hpts_log_it(hpts, inp, HPTSLOG_HPTSI, ticks_to_run, i); } hpts->p_inp = inp; paced_cnt++; if (hpts->p_cur_slot != inp->inp_hptsslot) { panic("Hpts:%p inp:%p slot mis-aligned %u vs %u", hpts, inp, hpts->p_cur_slot, inp->inp_hptsslot); } /* Now pull it */ if (inp->inp_hpts_cpu_set == 0) { set_cpu = 1; } else { set_cpu = 0; } hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[hpts->p_cur_slot], 0); if ((ninp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_cur_slot])) != NULL) { /* We prefetch the next inp if possible */ kern_prefetch(ninp, &prefetch_ninp); prefetch_ninp = 1; } if
(inp->inp_hpts_request) { /* * This guy is deferred out further in time * than our wheel had on it. Push him back * on the wheel. */ int32_t remaining_slots; remaining_slots = ticks_to_run - (i + 1); if (inp->inp_hpts_request > remaining_slots) { /* * Keep INVARIANTS happy by clearing * the flag */ tcp_hpts_insert_locked(hpts, inp, inp->inp_hpts_request, cts, __LINE__, NULL, 1); hpts->p_inp = NULL; continue; } inp->inp_hpts_request = 0; } /* * We clear the hpts flag here after dealing with * remaining slots. This way anyone looking with the * TCB lock will see it's on the hpts until just * before we unlock. */ inp->inp_in_hpts = 0; mtx_unlock(&hpts->p_mtx); INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) { mtx_lock(&hpts->p_mtx); if (logging_on) tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 1); hpts->p_inp = NULL; continue; } if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { out_now: #ifdef INVARIANTS if (mtx_owned(&hpts->p_mtx)) { panic("Hpts:%p owns mtx prior-to lock line:%d", hpts, __LINE__); } #endif INP_WUNLOCK(inp); mtx_lock(&hpts->p_mtx); if (logging_on) tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 3); hpts->p_inp = NULL; continue; } tp = intotcpcb(inp); if ((tp == NULL) || (tp->t_inpcb == NULL)) { goto out_now; } if (set_cpu) { /* * Set up so the next time we will move to * the right CPU. This should be a rare * event. It will sometimes happen when we * are the client side (usually not the * server). Somehow tcp_output() gets called * before the tcp_do_segment() sets the * initial state. This means the r_cpu and * r_hpts_cpu are 0. We get on the hpts, and * then tcp_input() gets called setting up * the r_cpu to the correct value. The hpts * goes off and sees the mis-match. We * simply correct it here and the CPU will * switch to the new hpts next time the tcb * gets added to the hpts (not this one) * :-) */ tcp_set_hpts(inp); } if (out_newts_every_tcb) { struct timeval sv; if (out_ts_percision) microuptime(&sv); else getmicrouptime(&sv); cts = tcp_tv_to_usectick(&sv); } CURVNET_SET(inp->inp_vnet); /* * There is a hole here; we get the refcnt on the * inp so it will still be preserved, but to make * sure we can get the INP we need to hold the p_mtx * above while we pull out the tp/inp. As long as * fini gets the lock first, we are assured of having * a sane INP we can lock and test. */ #ifdef INVARIANTS if (mtx_owned(&hpts->p_mtx)) { panic("Hpts:%p owns mtx before tcp-output:%d", hpts, __LINE__); } #endif if (tp->t_fb_ptr != NULL) { kern_prefetch(tp->t_fb_ptr, &did_prefetch); did_prefetch = 1; } inp->inp_hpts_calls = 1; if (tp->t_fb->tfb_tcp_output_wtime != NULL) { error = (*tp->t_fb->tfb_tcp_output_wtime) (tp, &tv); } else { error = tp->t_fb->tfb_tcp_output(tp); } if (ninp && ninp->inp_ppcb) { /* * If we have a nxt inp, see if we can * prefetch its ppcb. Note this may seem * "risky" since we have no locks (other * than the previous inp) and there is no * assurance that ninp was not pulled while * we were processing inp and freed. If this * occurred it could mean that either: * * a) It's NULL (which is fine, we won't go * here); b) it's valid (which is cool, we * will prefetch it); c) the inp got * freed back to the slab which was * reallocated. Then the piece of memory was * re-used and something else (not an * address) is in inp_ppcb. If that occurs * we don't crash, but take a TLB shootdown * performance hit (same as if it was NULL * and we tried to pre-fetch it). * * Considering that the likelihood of this is * quite rare, we will take a risk on doing * this.
If performance drops after testing * we can always take this out. NB: the * kern_prefetch on amd64 actually has * protection against a bad address now via * the DMAP_() tests. This will prevent the * TLB hit, and instead it will just * cause us to load the cache with a useless * address (to us). */ kern_prefetch(ninp->inp_ppcb, &prefetch_tp); prefetch_tp = 1; } INP_WUNLOCK(inp); INP_UNLOCK_ASSERT(inp); CURVNET_RESTORE(); #ifdef INVARIANTS if (mtx_owned(&hpts->p_mtx)) { panic("Hpts:%p owns mtx prior-to lock line:%d", hpts, __LINE__); } #endif mtx_lock(&hpts->p_mtx); if (logging_on) tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 4); hpts->p_inp = NULL; } HPTS_MTX_ASSERT(hpts); hpts->p_inp = NULL; hpts->p_cur_slot++; if (hpts->p_cur_slot >= NUM_OF_HPTSI_SLOTS) { hpts->p_cur_slot = 0; } } no_one: HPTS_MTX_ASSERT(hpts); hpts->p_prevtick = hpts->p_curtick; hpts->p_delayed_by = 0; /* * Check to see if we took an excess amount of time and need to run * more ticks (if we did not hit eno-bufs). */ /* Re-run any input that may be there */ (void)tcp_gethptstick(&tv); if (!TAILQ_EMPTY(&hpts->p_input)) { tcp_input_data(hpts, &tv); } #ifdef INVARIANTS if (TAILQ_EMPTY(&hpts->p_input) && (hpts->p_on_inqueue_cnt != 0)) { panic("tp:%p in_hpts input empty but cnt:%d", hpts, hpts->p_on_inqueue_cnt); } #endif tick_now = tcp_gethptstick(&tv); if (SEQ_GT(tick_now, hpts->p_prevtick)) { struct timeval res; /* Did we really spend a full tick or more in here? */ timersub(&tv, ctick, &res); if (res.tv_sec || (res.tv_usec >= HPTS_TICKS_PER_USEC)) { counter_u64_add(hpts_loops, 1); if (logging_on) { tcp_hpts_log_it(hpts, inp, HPTSLOG_TOLONG, (uint32_t) res.tv_usec, tick_now); } *ctick = res; hpts->p_curtick = tick_now; goto again; } } no_run: { uint32_t t = 0, i, fnd = 0; if (hpts->p_on_queue_cnt) { /* * Find the next slot that is occupied and use that to * be the sleep time.
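 * Worked example of the scan below, illustrative numbers only: with
 * p_nxt_slot = 10 and the first occupied slot at 17, the loop checks
 * t = 10, 11, ..., 17 and breaks with i = 8 (p_nxt_slot itself counts
 * as one tick away), so p_hpts_sleep_time becomes 8 ticks -- roughly
 * 80 usec at 10 usec per tick.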
*/ for (i = 1, t = hpts->p_nxt_slot; i < NUM_OF_HPTSI_SLOTS; i++) { if (TAILQ_EMPTY(&hpts->p_hptss[t]) == 0) { fnd = 1; break; } t = (t + 1) % NUM_OF_HPTSI_SLOTS; } if (fnd) { hpts->p_hpts_sleep_time = i; } else { counter_u64_add(back_tosleep, 1); #ifdef INVARIANTS panic("Hpts:%p cnt:%d but none found", hpts, hpts->p_on_queue_cnt); #endif hpts->p_on_queue_cnt = 0; goto non_found; } t++; } else { /* No one on the wheel; sleep for all but 2 slots */ non_found: if (hpts_sleep_max == 0) hpts_sleep_max = 1; hpts->p_hpts_sleep_time = min((NUM_OF_HPTSI_SLOTS - 2), hpts_sleep_max); t = 0; } if (logging_on) { tcp_hpts_log_it(hpts, inp, HPTSLOG_SLEEPSET, t, (hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC)); } } } void __tcp_set_hpts(struct inpcb *inp, int32_t line) { struct tcp_hpts_entry *hpts; INP_WLOCK_ASSERT(inp); hpts = tcp_hpts_lock(inp); if ((inp->inp_in_hpts == 0) && (inp->inp_hpts_cpu_set == 0)) { inp->inp_hpts_cpu = hpts_cpuid(inp); inp->inp_hpts_cpu_set = 1; } mtx_unlock(&hpts->p_mtx); hpts = tcp_input_lock(inp); if ((inp->inp_input_cpu_set == 0) && (inp->inp_in_input == 0)) { inp->inp_input_cpu = hpts_cpuid(inp); inp->inp_input_cpu_set = 1; } mtx_unlock(&hpts->p_mtx); } uint16_t tcp_hpts_delayedby(struct inpcb *inp) { return (tcp_pace.rp_ent[inp->inp_hpts_cpu]->p_delayed_by); } static void tcp_hpts_thread(void *ctx) { struct tcp_hpts_entry *hpts; struct timeval tv; sbintime_t sb; hpts = (struct tcp_hpts_entry *)ctx; mtx_lock(&hpts->p_mtx); if (hpts->p_direct_wake) { /* Signaled by input */ if (logging_on) tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 1, 1); callout_stop(&hpts->co); } else { /* Timed out */ if (callout_pending(&hpts->co) || !callout_active(&hpts->co)) { if (logging_on) tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 2, 2); mtx_unlock(&hpts->p_mtx); return; } callout_deactivate(&hpts->co); if (logging_on) tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 3, 3); } hpts->p_hpts_active = 1; (void)tcp_gethptstick(&tv); tcp_hptsi(hpts, &tv); HPTS_MTX_ASSERT(hpts); tv.tv_sec = 0; tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC; if (tcp_min_hptsi_time && (tv.tv_usec < tcp_min_hptsi_time)) { tv.tv_usec = tcp_min_hptsi_time; hpts->p_on_min_sleep = 1; } else { /* Clear the min sleep flag */ hpts->p_on_min_sleep = 0; } hpts->p_hpts_active = 0; sb = tvtosbt(tv); if (tcp_hpts_callout_skip_swi == 0) { callout_reset_sbt_on(&hpts->co, sb, 0, hpts_timeout_swi, hpts, hpts->p_cpu, (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))); } else { callout_reset_sbt_on(&hpts->co, sb, 0, hpts_timeout_dir, hpts, hpts->p_cpu, C_PREL(tcp_hpts_precision)); } hpts->p_direct_wake = 0; mtx_unlock(&hpts->p_mtx); } #undef timersub static void tcp_init_hptsi(void *st) { int32_t i, j, error, bound = 0, created = 0; size_t sz, asz; struct timeval tv; sbintime_t sb; struct tcp_hpts_entry *hpts; char unit[16]; uint32_t ncpus = mp_ncpus ? mp_ncpus : MAXCPU; tcp_pace.rp_proc = NULL; tcp_pace.rp_num_hptss = ncpus; hpts_loops = counter_u64_alloc(M_WAITOK); back_tosleep = counter_u64_alloc(M_WAITOK); sz = (tcp_pace.rp_num_hptss * sizeof(struct tcp_hpts_entry *)); tcp_pace.rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO); asz = sizeof(struct hptsh) * NUM_OF_HPTSI_SLOTS; for (i = 0; i < tcp_pace.rp_num_hptss; i++) { tcp_pace.rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry), M_TCPHPTS, M_WAITOK | M_ZERO); tcp_pace.rp_ent[i]->p_hptss = malloc(asz, M_TCPHPTS, M_WAITOK); hpts = tcp_pace.rp_ent[i]; /* * Init all the hpts structures that are not specifically * zero'd by the allocations.
Also let's attach them to the * appropriate sysctl block as well. */ mtx_init(&hpts->p_mtx, "tcp_hpts_lck", "hpts", MTX_DEF | MTX_DUPOK); TAILQ_INIT(&hpts->p_input); for (j = 0; j < NUM_OF_HPTSI_SLOTS; j++) { TAILQ_INIT(&hpts->p_hptss[j]); } sysctl_ctx_init(&hpts->hpts_ctx); sprintf(unit, "%d", i); hpts->hpts_root = SYSCTL_ADD_NODE(&hpts->hpts_ctx, SYSCTL_STATIC_CHILDREN(_net_inet_tcp_hpts), OID_AUTO, unit, CTLFLAG_RW, 0, ""); SYSCTL_ADD_INT(&hpts->hpts_ctx, SYSCTL_CHILDREN(hpts->hpts_root), OID_AUTO, "in_qcnt", CTLFLAG_RD, &hpts->p_on_inqueue_cnt, 0, "Count TCB's awaiting input processing"); SYSCTL_ADD_INT(&hpts->hpts_ctx, SYSCTL_CHILDREN(hpts->hpts_root), OID_AUTO, "out_qcnt", CTLFLAG_RD, &hpts->p_on_queue_cnt, 0, "Count TCB's awaiting output processing"); SYSCTL_ADD_UINT(&hpts->hpts_ctx, SYSCTL_CHILDREN(hpts->hpts_root), OID_AUTO, "active", CTLFLAG_RD, &hpts->p_hpts_active, 0, "Is the hpts active"); SYSCTL_ADD_UINT(&hpts->hpts_ctx, SYSCTL_CHILDREN(hpts->hpts_root), OID_AUTO, "curslot", CTLFLAG_RD, &hpts->p_cur_slot, 0, "What the current slot is if active"); SYSCTL_ADD_UINT(&hpts->hpts_ctx, SYSCTL_CHILDREN(hpts->hpts_root), OID_AUTO, "curtick", CTLFLAG_RD, &hpts->p_curtick, 0, "What the current tick is if active"); SYSCTL_ADD_UINT(&hpts->hpts_ctx, SYSCTL_CHILDREN(hpts->hpts_root), OID_AUTO, "logsize", CTLFLAG_RD, &hpts->p_logsize, 0, "Hpts logging buffer size"); hpts->p_hpts_sleep_time = NUM_OF_HPTSI_SLOTS - 2; hpts->p_num = i; hpts->p_prevtick = hpts->p_curtick = tcp_gethptstick(&tv); hpts->p_prevtick -= 1; hpts->p_prevtick %= NUM_OF_HPTSI_SLOTS; hpts->p_cpu = 0xffff; hpts->p_nxt_slot = 1; hpts->p_logsize = tcp_hpts_logging_size; if (hpts->p_logsize) { sz = (sizeof(struct hpts_log) * hpts->p_logsize); hpts->p_log = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO); } callout_init(&hpts->co, 1); } /* * Now let's start ithreads to handle the hptss. */ CPU_FOREACH(i) { hpts = tcp_pace.rp_ent[i]; hpts->p_cpu = i; error = swi_add(&hpts->ie, "hpts", tcp_hpts_thread, (void *)hpts, SWI_NET, INTR_MPSAFE, &hpts->ie_cookie); if (error) { panic("Can't add hpts:%p i:%d err:%d", hpts, i, error); } created++; if (tcp_bind_threads) { if (intr_event_bind(hpts->ie, i) == 0) bound++; } tv.tv_sec = 0; tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC; sb = tvtosbt(tv); if (tcp_hpts_callout_skip_swi == 0) { callout_reset_sbt_on(&hpts->co, sb, 0, hpts_timeout_swi, hpts, hpts->p_cpu, (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))); } else { callout_reset_sbt_on(&hpts->co, sb, 0, hpts_timeout_dir, hpts, hpts->p_cpu, C_PREL(tcp_hpts_precision)); } } printf("TCP Hpts created %d swi interrupt threads and bound %d\n", created, bound); return; } SYSINIT(tcphptsi, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, tcp_init_hptsi, NULL); MODULE_VERSION(tcphpts, 1); Index: head/sys/netinet/tcp_hpts.h =================================================================== --- head/sys/netinet/tcp_hpts.h (revision 343754) +++ head/sys/netinet/tcp_hpts.h (revision 343755) @@ -1,304 +1,304 @@ /*- - * Copyright (c) 2016-2018 Netflix Inc. + * Copyright (c) 2016-2018 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2.
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef __tcp_hpts_h__ #define __tcp_hpts_h__ /* * The hpts uses a 102400-slot wheel. The wheel * defines the time in 10 usec increments (102400 x 10). * This gives a range of 10usec - 1024ms to place * an entry within. If the user requests more than * 1.024 seconds, a remainder is attached and the hpts * when seeing the remainder will re-insert the * inpcb forward in time from where it is until * the remainder is zero. */ #define NUM_OF_HPTSI_SLOTS 102400 TAILQ_HEAD(hptsh, inpcb); /* Number of useconds in a hpts tick */ #define HPTS_TICKS_PER_USEC 10 #define HPTS_MS_TO_SLOTS(x) (x * 100) #define HPTS_USEC_TO_SLOTS(x) ((x+9) /10) #define HPTS_USEC_IN_SEC 1000000 #define HPTS_MSEC_IN_SEC 1000 #define HPTS_USEC_IN_MSEC 1000 #define DEFAULT_HPTS_LOG 3072 /* * Log flags consist of * 7f 7f 1 1 bits * p_cpu | p_num | INPUT_ACTIVE | HPTS_ACTIVE * * So, for example, cpu 10, number 10, with * input active would show up as: * p_flags = 0001010 0001010 1 0 * * p_flags = 0x142a */ #define HPTS_HPTS_ACTIVE 0x01 #define HPTS_INPUT_ACTIVE 0x02 #define HPTSLOG_IMMEDIATE 1 #define HPTSLOG_INSERT_NORMAL 2 #define HPTSLOG_INSERT_SLEEPER 3 #define HPTSLOG_SLEEP_AFTER 4 #define HPTSLOG_SLEEP_BEFORE 5 #define HPTSLOG_INSERTED 6 #define HPTSLOG_WAKEUP_HPTS 7 #define HPTSLOG_SETTORUN 8 #define HPTSLOG_HPTSI 9 #define HPTSLOG_TOLONG 10 #define HPTSLOG_AWAKENS 11 #define HPTSLOG_TIMESOUT 12 #define HPTSLOG_SLEEPSET 13 #define HPTSLOG_WAKEUP_INPUT 14 #define HPTSLOG_RESCHEDULE 15 #define HPTSLOG_AWAKE 16 #define HPTSLOG_INP_DONE 17 struct hpts_log { struct inpcb *inp; int32_t event; uint32_t cts; int32_t line; uint32_t ticknow; uint32_t t_paceslot; uint32_t t_hptsreq; uint32_t p_curtick; uint32_t p_prevtick; uint32_t slot_req; uint32_t p_on_queue_cnt; uint32_t p_nxt_slot; uint32_t p_cur_slot; uint32_t p_hpts_sleep_time; uint16_t p_flags; uint8_t p_onhpts; uint8_t p_oninput; uint8_t is_notempty; }; struct hpts_diag { uint32_t p_hpts_active; uint32_t p_nxt_slot; uint32_t p_cur_slot; uint32_t slot_req; uint32_t inp_hptsslot; uint32_t slot_now; uint32_t have_slept; uint32_t hpts_sleep_time; uint32_t yet_to_sleep; uint32_t need_new_to; int32_t co_ret; uint8_t p_on_min_sleep; }; #ifdef _KERNEL /* Each hpts has its own p_mtx which is used for locking */ struct tcp_hpts_entry { /* Cache line 0x00 */ struct mtx p_mtx; /* Mutex for hpts */ uint32_t p_hpts_active; /* Flag that says hpts is awake */ uint32_t p_curtick; /* Current tick in 10 us the hpts is at */ uint32_t p_prevtick; /* Previous tick in 10 us the hpts ran */ uint32_t p_cur_slot; /* Current slot in
wheel hpts is draining */ uint32_t p_nxt_slot; /* The next slot outside the current range of * slots that the hpts is running on. */ int32_t p_on_queue_cnt; /* Count on queue in this hpts */ uint32_t enobuf_cnt; uint16_t p_log_at; uint8_t p_direct_wake :1, /* boolean */ p_log_wrapped :1, /* boolean */ p_on_min_sleep:1; /* boolean */ uint8_t p_fill; /* Cache line 0x40 */ void *p_inp; struct hptsh p_input; /* For the tcp-input runner */ /* Hptsi wheel */ struct hptsh *p_hptss; struct hpts_log *p_log; uint32_t p_logsize; int32_t p_on_inqueue_cnt; /* Count on input queue in this hpts */ uint32_t hit_no_enobuf; uint32_t p_dyn_adjust; uint32_t p_hpts_sleep_time; /* Current sleep interval having a max * of 255ms */ uint32_t p_delayed_by; /* How much were we delayed by */ /* Cache line 0x80 */ struct sysctl_ctx_list hpts_ctx; struct sysctl_oid *hpts_root; struct intr_event *ie; void *ie_cookie; uint16_t p_num; /* The hpts number, one per cpu */ uint16_t p_cpu; /* The hpts CPU */ /* There is extra space in here */ /* Cache line 0x100 */ struct callout co __aligned(CACHE_LINE_SIZE); } __aligned(CACHE_LINE_SIZE); struct tcp_hptsi { struct proc *rp_proc; /* Process structure for hpts */ struct tcp_hpts_entry **rp_ent; /* Array of hptss */ uint32_t rp_num_hptss; /* Number of hpts threads */ }; #endif #define HPTS_REMOVE_INPUT 0x01 #define HPTS_REMOVE_OUTPUT 0x02 #define HPTS_REMOVE_ALL (HPTS_REMOVE_INPUT | HPTS_REMOVE_OUTPUT) /* * When using the hpts, a TCP stack must make sure * that once an INP_DROPPED flag is applied to an INP, * it does not expect tcp_output() to ever be * called by the hpts. The hpts will *not* call * any output (or input) functions on a TCB that * is in the DROPPED state. * * This implies final ACK's and RST's that might * be sent when a TCB is still around must be * sent from a routine like tcp_respond(). */ #define DEFAULT_MIN_SLEEP 250 /* How many usecs is the default for hpts sleep; * this determines the min granularity of the * hpts. If 0, granularity is 10 useconds at * the cost of more CPU (context switching). */ #ifdef _KERNEL #define HPTS_MTX_ASSERT(hpts) mtx_assert(&(hpts)->p_mtx, MA_OWNED) struct tcp_hpts_entry *tcp_hpts_lock(struct inpcb *inp); struct tcp_hpts_entry *tcp_input_lock(struct inpcb *inp); int __tcp_queue_to_hpts_immediate(struct inpcb *inp, int32_t line); #define tcp_queue_to_hpts_immediate(a) __tcp_queue_to_hpts_immediate(a, __LINE__) struct tcp_hpts_entry *tcp_cur_hpts(struct inpcb *inp); #define tcp_hpts_remove(a, b) __tcp_hpts_remove(a, b, __LINE__) void __tcp_hpts_remove(struct inpcb *inp, int32_t flags, int32_t line); /* * To insert a TCB on the hpts you *must* be holding the * INP_WLOCK(). The hpts insert code will then acquire * the hpts's lock and insert the TCB on the requested * slot, possibly waking up the hpts if you are requesting * a time earlier than what the hpts is sleeping to (if * the hpts is sleeping). You may check the inp->inp_in_hpts * flag without the hpts lock. The hpts is the only one * that will clear this flag holding only the hpts lock. This * means that in your tcp_output() routine, when you test for * it to be 1 (so you won't call output), it may be transitioning * to 0 (by the hpts). That will be fine, since that will just * mean an extra call to tcp_output, which will most likely find * that the call you executed (when the mis-match occurred) has * put the TCB back on the hpts, and it will simply return. If your * call did not add it back to the hpts then you will either * over-send or the cwnd will block you from sending more.
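 *
 * A minimal caller-side sketch of the insert contract above (the
 * pace_delay_usec variable is illustrative only, not part of the API):
 *
 *	INP_WLOCK(inp);
 *	if (inp->inp_in_hpts == 0)
 *		(void)tcp_hpts_insert(inp,
 *		    HPTS_USEC_TO_SLOTS(pace_delay_usec));
 *	INP_WUNLOCK(inp);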
* * Note you should also be holding the INP_WLOCK() when you * call the remove from the hpts as well. Though usually * you are either doing this from a timer, where you need * that INP_WLOCK(), or from destroying your TCB, where again * you should already have the INP_WLOCK(). */ uint32_t __tcp_hpts_insert(struct inpcb *inp, uint32_t slot, int32_t line); #define tcp_hpts_insert(a, b) __tcp_hpts_insert(a, b, __LINE__) uint32_t tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag); int __tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line); #define tcp_queue_to_input_locked(a, b) __tcp_queue_to_input_locked(a, b, __LINE__) void tcp_queue_pkt_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, int32_t tlen, int32_t drop_hdrlen, uint8_t iptos); int __tcp_queue_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, int32_t line); #define tcp_queue_to_input(a, b, c, d, e, f, g) __tcp_queue_to_input(a, b, c, d, e, f, g, __LINE__) uint16_t tcp_hpts_delayedby(struct inpcb *inp); void __tcp_set_hpts(struct inpcb *inp, int32_t line); #define tcp_set_hpts(a) __tcp_set_hpts(a, __LINE__) void __tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason, int32_t line); #define tcp_set_inp_to_drop(a, b) __tcp_set_inp_to_drop(a, b, __LINE__) extern int32_t tcp_min_hptsi_time; static __inline uint32_t tcp_tv_to_hptstick(struct timeval *sv) { return ((sv->tv_sec * 100000) + (sv->tv_usec / 10)); } static __inline uint32_t tcp_gethptstick(struct timeval *sv) { struct timeval tv; if (sv == NULL) sv = &tv; microuptime(sv); return (tcp_tv_to_hptstick(sv)); } static __inline uint32_t tcp_tv_to_usectick(struct timeval *sv) { return ((uint32_t) ((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec)); } static __inline uint32_t tcp_tv_to_mssectick(struct timeval *sv) { return ((uint32_t) ((sv->tv_sec * HPTS_MSEC_IN_SEC) + (sv->tv_usec/HPTS_USEC_IN_MSEC))); } static __inline void tcp_hpts_unlock(struct tcp_hpts_entry *hpts) { mtx_unlock(&hpts->p_mtx); } static __inline uint32_t tcp_get_usecs(struct timeval *tv) { struct timeval tvd; if (tv == NULL) tv = &tvd; microuptime(tv); return (tcp_tv_to_usectick(tv)); } #endif /* _KERNEL */ #endif /* __tcp_hpts_h__ */ Index: head/sys/netinet/tcp_log_buf.c =================================================================== --- head/sys/netinet/tcp_log_buf.c (revision 343754) +++ head/sys/netinet/tcp_log_buf.c (revision 343755) @@ -1,2435 +1,2434 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * - * Copyright (c) 2016-2018 - * Netflix Inc. All rights reserved. + * Copyright (c) 2016-2018 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Default expiry time */ #define TCP_LOG_EXPIRE_TIME ((sbintime_t)60 * SBT_1S) /* Max interval at which to run the expiry timer */ #define TCP_LOG_EXPIRE_INTVL ((sbintime_t)5 * SBT_1S) bool tcp_log_verbose; static uma_zone_t tcp_log_bucket_zone, tcp_log_node_zone, tcp_log_zone; static int tcp_log_session_limit = TCP_LOG_BUF_DEFAULT_SESSION_LIMIT; static uint32_t tcp_log_version = TCP_LOG_BUF_VER; RB_HEAD(tcp_log_id_tree, tcp_log_id_bucket); static struct tcp_log_id_tree tcp_log_id_head; static STAILQ_HEAD(, tcp_log_id_node) tcp_log_expireq_head = STAILQ_HEAD_INITIALIZER(tcp_log_expireq_head); static struct mtx tcp_log_expireq_mtx; static struct callout tcp_log_expireq_callout; static u_long tcp_log_auto_ratio = 0; static volatile u_long tcp_log_auto_ratio_cur = 0; static uint32_t tcp_log_auto_mode = TCP_LOG_STATE_TAIL; static bool tcp_log_auto_all = false; RB_PROTOTYPE_STATIC(tcp_log_id_tree, tcp_log_id_bucket, tlb_rb, tcp_log_id_cmp) SYSCTL_NODE(_net_inet_tcp, OID_AUTO, bb, CTLFLAG_RW, 0, "TCP Black Box controls"); SYSCTL_BOOL(_net_inet_tcp_bb, OID_AUTO, log_verbose, CTLFLAG_RW, &tcp_log_verbose, 0, "Force verbose logging for TCP traces"); SYSCTL_INT(_net_inet_tcp_bb, OID_AUTO, log_session_limit, CTLFLAG_RW, &tcp_log_session_limit, 0, "Maximum number of events maintained for each TCP session"); SYSCTL_UMA_MAX(_net_inet_tcp_bb, OID_AUTO, log_global_limit, CTLFLAG_RW, &tcp_log_zone, "Maximum number of events maintained for all TCP sessions"); SYSCTL_UMA_CUR(_net_inet_tcp_bb, OID_AUTO, log_global_entries, CTLFLAG_RD, &tcp_log_zone, "Current number of events maintained for all TCP sessions"); SYSCTL_UMA_MAX(_net_inet_tcp_bb, OID_AUTO, log_id_limit, CTLFLAG_RW, &tcp_log_bucket_zone, "Maximum number of log IDs"); SYSCTL_UMA_CUR(_net_inet_tcp_bb, OID_AUTO, log_id_entries, CTLFLAG_RD, &tcp_log_bucket_zone, "Current number of log IDs"); SYSCTL_UMA_MAX(_net_inet_tcp_bb, OID_AUTO, log_id_tcpcb_limit, CTLFLAG_RW, &tcp_log_node_zone, "Maximum number of tcpcbs with log IDs"); SYSCTL_UMA_CUR(_net_inet_tcp_bb, OID_AUTO, log_id_tcpcb_entries, CTLFLAG_RD, &tcp_log_node_zone, "Current number of tcpcbs with log IDs"); SYSCTL_U32(_net_inet_tcp_bb, OID_AUTO, log_version, CTLFLAG_RD, &tcp_log_version, 0, "Version of log formats exported"); SYSCTL_ULONG(_net_inet_tcp_bb, OID_AUTO, log_auto_ratio, CTLFLAG_RW, &tcp_log_auto_ratio, 0, "Do auto capturing for 1 out of N sessions"); SYSCTL_U32(_net_inet_tcp_bb, OID_AUTO, log_auto_mode, CTLFLAG_RW, &tcp_log_auto_mode, TCP_LOG_STATE_HEAD_AUTO, "Logging mode for auto-selected sessions (default is TCP_LOG_STATE_HEAD_AUTO)"); SYSCTL_BOOL(_net_inet_tcp_bb, OID_AUTO, log_auto_all, CTLFLAG_RW, &tcp_log_auto_all, false, "Auto-select from all sessions (rather than just those with IDs)"); #ifdef TCPLOG_DEBUG_COUNTERS counter_u64_t 
tcp_log_queued; counter_u64_t tcp_log_que_fail1; counter_u64_t tcp_log_que_fail2; counter_u64_t tcp_log_que_fail3; counter_u64_t tcp_log_que_fail4; counter_u64_t tcp_log_que_fail5; counter_u64_t tcp_log_que_copyout; counter_u64_t tcp_log_que_read; counter_u64_t tcp_log_que_freed; SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, queued, CTLFLAG_RD, &tcp_log_queued, "Number of entries queued"); SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail1, CTLFLAG_RD, &tcp_log_que_fail1, "Number of entries queued but fail 1"); SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail2, CTLFLAG_RD, &tcp_log_que_fail2, "Number of entries queued but fail 2"); SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail3, CTLFLAG_RD, &tcp_log_que_fail3, "Number of entries queued but fail 3"); SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail4, CTLFLAG_RD, &tcp_log_que_fail4, "Number of entries queued but fail 4"); SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail5, CTLFLAG_RD, &tcp_log_que_fail5, "Number of entries queued but fail 5"); SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, copyout, CTLFLAG_RD, &tcp_log_que_copyout, "Number of entries copied out"); SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, read, CTLFLAG_RD, &tcp_log_que_read, "Number of entries read from the queue"); SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, freed, CTLFLAG_RD, &tcp_log_que_freed, "Number of entries freed after reading"); #endif #ifdef INVARIANTS #define TCPLOG_DEBUG_RINGBUF #endif struct tcp_log_mem { STAILQ_ENTRY(tcp_log_mem) tlm_queue; struct tcp_log_buffer tlm_buf; struct tcp_log_verbose tlm_v; #ifdef TCPLOG_DEBUG_RINGBUF volatile int tlm_refcnt; #endif }; /* 60 bytes for the header, + 16 bytes for padding */ static uint8_t zerobuf[76]; /* * Lock order: * 1. TCPID_TREE * 2. TCPID_BUCKET * 3. INP * * Rules: * A. You need a lock on the Tree to add/remove buckets. * B. You need a lock on the bucket to add/remove nodes from the bucket. * C. To change information in a node, you need the INP lock if the tln_closed * field is false. Otherwise, you need the bucket lock. (Note that the * tln_closed field can change at any point, so you need to recheck the * entry after acquiring the INP lock.) * D. To remove a node from the bucket, you must have that entry locked, * according to the criteria of Rule C. Also, the node must not be on * the expiry queue. * E. The exception to C is the expiry queue fields, which are locked by * the TCPLOG_EXPIREQ lock. * * Buckets have a reference count. Each node is a reference. Further, * other callers may add reference counts to keep a bucket from disappearing. * You can add a reference as long as you own a lock sufficient to keep the * bucket from disappearing. For example, a common use is: * a. Have a locked INP, but need to lock the TCPID_BUCKET. * b. Add a refcount on the bucket. (Safe because the INP lock prevents * the TCPID_BUCKET from going away.) * c. Drop the INP lock. * d. Acquire a lock on the TCPID_BUCKET. * e. Acquire a lock on the INP. * f. Drop the refcount on the bucket. * (At this point, the bucket may disappear.) * * Expire queue lock: * You can acquire this with either the bucket or INP lock. Don't reverse it. * When the expire code has committed to freeing a node, it resets the expiry * time to SBT_MAX. That is the signal to everyone else that they should * leave that node alone.
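 *
 * The a-f recipe above, written out as a sketch (the re-validation and
 * error handling a real caller needs are elided; tcp_log_set_id()
 * below follows essentially this shape):
 *
 *	INP_WLOCK(inp);
 *	tlb = tp->t_lib;
 *	TCPID_BUCKET_REF(tlb);		-- b: the INP lock pins the bucket
 *	INP_WUNLOCK(inp);		-- c
 *	TCPID_BUCKET_LOCK(tlb);		-- d
 *	INP_WLOCK(inp);			-- e: recheck inp_flags afterwards
 *	TCPID_BUCKET_UNREF(tlb);	-- f: the bucket may now disappear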
*/ static struct rwlock tcp_id_tree_lock; #define TCPID_TREE_WLOCK() rw_wlock(&tcp_id_tree_lock) #define TCPID_TREE_RLOCK() rw_rlock(&tcp_id_tree_lock) #define TCPID_TREE_UPGRADE() rw_try_upgrade(&tcp_id_tree_lock) #define TCPID_TREE_WUNLOCK() rw_wunlock(&tcp_id_tree_lock) #define TCPID_TREE_RUNLOCK() rw_runlock(&tcp_id_tree_lock) #define TCPID_TREE_WLOCK_ASSERT() rw_assert(&tcp_id_tree_lock, RA_WLOCKED) #define TCPID_TREE_RLOCK_ASSERT() rw_assert(&tcp_id_tree_lock, RA_RLOCKED) #define TCPID_TREE_UNLOCK_ASSERT() rw_assert(&tcp_id_tree_lock, RA_UNLOCKED) #define TCPID_BUCKET_LOCK_INIT(tlb) mtx_init(&((tlb)->tlb_mtx), "tcp log id bucket", NULL, MTX_DEF) #define TCPID_BUCKET_LOCK_DESTROY(tlb) mtx_destroy(&((tlb)->tlb_mtx)) #define TCPID_BUCKET_LOCK(tlb) mtx_lock(&((tlb)->tlb_mtx)) #define TCPID_BUCKET_UNLOCK(tlb) mtx_unlock(&((tlb)->tlb_mtx)) #define TCPID_BUCKET_LOCK_ASSERT(tlb) mtx_assert(&((tlb)->tlb_mtx), MA_OWNED) #define TCPID_BUCKET_UNLOCK_ASSERT(tlb) mtx_assert(&((tlb)->tlb_mtx), MA_NOTOWNED) #define TCPID_BUCKET_REF(tlb) refcount_acquire(&((tlb)->tlb_refcnt)) #define TCPID_BUCKET_UNREF(tlb) refcount_release(&((tlb)->tlb_refcnt)) #define TCPLOG_EXPIREQ_LOCK() mtx_lock(&tcp_log_expireq_mtx) #define TCPLOG_EXPIREQ_UNLOCK() mtx_unlock(&tcp_log_expireq_mtx) SLIST_HEAD(tcp_log_id_head, tcp_log_id_node); struct tcp_log_id_bucket { /* * tlb_id must be first. This lets us use strcmp on * (struct tcp_log_id_bucket *) and (char *) interchangeably. */ char tlb_id[TCP_LOG_ID_LEN]; RB_ENTRY(tcp_log_id_bucket) tlb_rb; struct tcp_log_id_head tlb_head; struct mtx tlb_mtx; volatile u_int tlb_refcnt; }; struct tcp_log_id_node { SLIST_ENTRY(tcp_log_id_node) tln_list; STAILQ_ENTRY(tcp_log_id_node) tln_expireq; /* Locked by the expireq lock */ sbintime_t tln_expiretime; /* Locked by the expireq lock */ /* * If INP is NULL, that means the connection has closed. We've * saved the connection endpoint information and the log entries * in the tln_ie and tln_entries members. We've also saved a pointer * to the enclosing bucket here. If INP is not NULL, the information is * in the PCB and not here. */ struct inpcb *tln_inp; struct tcpcb *tln_tp; struct tcp_log_id_bucket *tln_bucket; struct in_endpoints tln_ie; struct tcp_log_stailq tln_entries; int tln_count; volatile int tln_closed; uint8_t tln_af; }; enum tree_lock_state { TREE_UNLOCKED = 0, TREE_RLOCKED, TREE_WLOCKED, }; /* Do we want to select this session for auto-logging? */ static __inline bool tcp_log_selectauto(void) { /* * If we are doing auto-capturing, figure out whether we will capture * this session. 
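 * Worked example: with net.inet.tcp.bb.log_auto_ratio set to 100, the
 * atomic fetch-and-add below returns the pre-increment count, so the
 * sessions seeing values 0, 100, 200, ... -- the 1st, 101st, 201st, ...
 * to arrive -- are selected: 1 in every 100 overall.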
*/ if (tcp_log_auto_ratio && (atomic_fetchadd_long(&tcp_log_auto_ratio_cur, 1) % tcp_log_auto_ratio) == 0) return (true); return (false); } static __inline int tcp_log_id_cmp(struct tcp_log_id_bucket *a, struct tcp_log_id_bucket *b) { KASSERT(a != NULL, ("tcp_log_id_cmp: argument a is unexpectedly NULL")); KASSERT(b != NULL, ("tcp_log_id_cmp: argument b is unexpectedly NULL")); return strncmp(a->tlb_id, b->tlb_id, TCP_LOG_ID_LEN); } RB_GENERATE_STATIC(tcp_log_id_tree, tcp_log_id_bucket, tlb_rb, tcp_log_id_cmp) static __inline void tcp_log_id_validate_tree_lock(int tree_locked) { #ifdef INVARIANTS switch (tree_locked) { case TREE_WLOCKED: TCPID_TREE_WLOCK_ASSERT(); break; case TREE_RLOCKED: TCPID_TREE_RLOCK_ASSERT(); break; case TREE_UNLOCKED: TCPID_TREE_UNLOCK_ASSERT(); break; default: kassert_panic("%s:%d: unknown tree lock state", __func__, __LINE__); } #endif } static __inline void tcp_log_remove_bucket(struct tcp_log_id_bucket *tlb) { TCPID_TREE_WLOCK_ASSERT(); KASSERT(SLIST_EMPTY(&tlb->tlb_head), ("%s: Attempt to remove non-empty bucket", __func__)); if (RB_REMOVE(tcp_log_id_tree, &tcp_log_id_head, tlb) == NULL) { #ifdef INVARIANTS kassert_panic("%s:%d: error removing element from tree", __func__, __LINE__); #endif } TCPID_BUCKET_LOCK_DESTROY(tlb); uma_zfree(tcp_log_bucket_zone, tlb); } /* * Call with a referenced and locked bucket. * Will return true if the bucket was freed; otherwise, false. * tlb: The bucket to unreference. * tree_locked: A pointer to the state of the tree lock. If the tree lock * state changes, the function will update it. * inp: If not NULL and the function needs to drop the inp lock to relock the * tree, it will do so. (The caller must ensure inp will not become invalid, * probably by holding a reference to it.) */ static bool tcp_log_unref_bucket(struct tcp_log_id_bucket *tlb, int *tree_locked, struct inpcb *inp) { KASSERT(tlb != NULL, ("%s: called with NULL tlb", __func__)); KASSERT(tree_locked != NULL, ("%s: called with NULL tree_locked", __func__)); tcp_log_id_validate_tree_lock(*tree_locked); /* * Did we hold the last reference on the tlb? If so, we may need * to free it. (Note that we can realistically only execute the * loop twice: once without a write lock and once with a write * lock.) */ while (TCPID_BUCKET_UNREF(tlb)) { /* * We need a write lock on the tree to free this. * If we can upgrade the tree lock, this is "easy". If we * can't upgrade the tree lock, we need to do this the * "hard" way: unwind all our locks and relock everything. * In the meantime, anything could have changed. We even * need to validate that we still need to free the bucket. */ if (*tree_locked == TREE_RLOCKED && TCPID_TREE_UPGRADE()) *tree_locked = TREE_WLOCKED; else if (*tree_locked != TREE_WLOCKED) { TCPID_BUCKET_REF(tlb); if (inp != NULL) INP_WUNLOCK(inp); TCPID_BUCKET_UNLOCK(tlb); if (*tree_locked == TREE_RLOCKED) TCPID_TREE_RUNLOCK(); TCPID_TREE_WLOCK(); *tree_locked = TREE_WLOCKED; TCPID_BUCKET_LOCK(tlb); if (inp != NULL) INP_WLOCK(inp); continue; } /* * We have an empty bucket and a write lock on the tree. * Remove the empty bucket. */ tcp_log_remove_bucket(tlb); return (true); } return (false); } /* * Call with a locked bucket. This function will release the lock on the * bucket before returning. * * The caller is responsible for freeing the tp->t_lin/tln node! * * Note: one of tp or both tlb and tln must be supplied. * * inp: A pointer to the inp. If the function needs to drop the inp lock to * acquire the tree write lock, it will do so. 
(The caller must ensure inp * will not become invalid, probably by holding a reference to it.) * tp: A pointer to the tcpcb. (optional; if specified, tlb and tln are ignored) * tlb: A pointer to the bucket. (optional; ignored if tp is specified) * tln: A pointer to the node. (optional; ignored if tp is specified) * tree_locked: A pointer to the state of the tree lock. If the tree lock * state changes, the function will update it. * * Will return true if the INP lock was reacquired; otherwise, false. */ static bool tcp_log_remove_id_node(struct inpcb *inp, struct tcpcb *tp, struct tcp_log_id_bucket *tlb, struct tcp_log_id_node *tln, int *tree_locked) { int orig_tree_locked; KASSERT(tp != NULL || (tlb != NULL && tln != NULL), ("%s: called with tp=%p, tlb=%p, tln=%p", __func__, tp, tlb, tln)); KASSERT(tree_locked != NULL, ("%s: called with NULL tree_locked", __func__)); if (tp != NULL) { tlb = tp->t_lib; tln = tp->t_lin; KASSERT(tlb != NULL, ("%s: unexpectedly NULL tlb", __func__)); KASSERT(tln != NULL, ("%s: unexpectedly NULL tln", __func__)); } tcp_log_id_validate_tree_lock(*tree_locked); TCPID_BUCKET_LOCK_ASSERT(tlb); /* * Remove the node, clear the log bucket and node from the TCPCB, and * decrement the bucket refcount. In the process, if this is the * last reference, the bucket will be freed. */ SLIST_REMOVE(&tlb->tlb_head, tln, tcp_log_id_node, tln_list); if (tp != NULL) { tp->t_lib = NULL; tp->t_lin = NULL; } orig_tree_locked = *tree_locked; if (!tcp_log_unref_bucket(tlb, tree_locked, inp)) TCPID_BUCKET_UNLOCK(tlb); return (*tree_locked != orig_tree_locked); } #define RECHECK_INP_CLEAN(cleanup) do { \ if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { \ rv = ECONNRESET; \ cleanup; \ goto done; \ } \ tp = intotcpcb(inp); \ } while (0) #define RECHECK_INP() RECHECK_INP_CLEAN(/* noop */) static void tcp_log_grow_tlb(char *tlb_id, struct tcpcb *tp) { INP_WLOCK_ASSERT(tp->t_inpcb); #ifdef NETFLIX if (V_tcp_perconn_stats_enable == 2 && tp->t_stats == NULL) (void)tcp_stats_sample_rollthedice(tp, tlb_id, strlen(tlb_id)); #endif } /* * Set the TCP log ID for a TCPCB. * Called with INPCB locked. Returns with it unlocked. */ int tcp_log_set_id(struct tcpcb *tp, char *id) { struct tcp_log_id_bucket *tlb, *tmp_tlb; struct tcp_log_id_node *tln; struct inpcb *inp; int tree_locked, rv; bool bucket_locked; tlb = NULL; tln = NULL; inp = tp->t_inpcb; tree_locked = TREE_UNLOCKED; bucket_locked = false; restart: INP_WLOCK_ASSERT(inp); /* See if the ID is unchanged. */ if ((tp->t_lib != NULL && !strcmp(tp->t_lib->tlb_id, id)) || (tp->t_lib == NULL && *id == 0)) { rv = 0; goto done; } /* * If the TCPCB had a previous ID, we need to extricate it from * the previous list. * * Drop the TCPCB lock and lock the tree and the bucket. * Because this is called in the socket context, we (theoretically) * don't need to worry about the INPCB completely going away * while we are gone. */ if (tp->t_lib != NULL) { tlb = tp->t_lib; TCPID_BUCKET_REF(tlb); INP_WUNLOCK(inp); if (tree_locked == TREE_UNLOCKED) { TCPID_TREE_RLOCK(); tree_locked = TREE_RLOCKED; } TCPID_BUCKET_LOCK(tlb); bucket_locked = true; INP_WLOCK(inp); /* * Unreference the bucket. If our bucket went away, it is no * longer locked or valid. */ if (tcp_log_unref_bucket(tlb, &tree_locked, inp)) { bucket_locked = false; tlb = NULL; } /* Validate the INP. */ RECHECK_INP(); /* * Evaluate whether the bucket changed while we were unlocked. * * Possible scenarios here: * 1. Bucket is unchanged and the same one we started with. * 2. 
The TCPCB no longer has a bucket and our bucket was * freed. * 3. The TCPCB has a new bucket, whether or not ours was freed. * 4. The TCPCB no longer has a bucket and our bucket was * not freed. * * In cases 2-4, we will start over. In case 1, we will * proceed here to remove the bucket. */ if (tlb == NULL || tp->t_lib != tlb) { KASSERT(bucket_locked || tlb == NULL, ("%s: bucket_locked (%d) and tlb (%p) are " "inconsistent", __func__, bucket_locked, tlb)); if (bucket_locked) { TCPID_BUCKET_UNLOCK(tlb); bucket_locked = false; tlb = NULL; } goto restart; } /* * Store the (struct tcp_log_id_node) for reuse. Then, remove * it from the bucket. In the process, we may end up relocking. * If so, we need to validate that the INP is still valid, and * the TCPCB entries match what we expect. * * We will clear tlb and change the bucket_locked state just * before calling tcp_log_remove_id_node(), since that function * will unlock the bucket. */ if (tln != NULL) uma_zfree(tcp_log_node_zone, tln); tln = tp->t_lin; tlb = NULL; bucket_locked = false; if (tcp_log_remove_id_node(inp, tp, NULL, NULL, &tree_locked)) { RECHECK_INP(); /* * If the TCPCB moved to a new bucket while we had * dropped the lock, restart. */ if (tp->t_lib != NULL || tp->t_lin != NULL) goto restart; } /* * Yay! We successfully removed the TCPCB from its old * bucket. Phew! * * On to bigger and better things... */ } /* At this point, the TCPCB should not be in any bucket. */ KASSERT(tp->t_lib == NULL, ("%s: tp->t_lib is not NULL", __func__)); /* * If the new ID is not empty, we need to now assign this TCPCB to a * new bucket. */ if (*id) { /* Get a new tln, if we don't already have one to reuse. */ if (tln == NULL) { tln = uma_zalloc(tcp_log_node_zone, M_NOWAIT | M_ZERO); if (tln == NULL) { rv = ENOBUFS; goto done; } tln->tln_inp = inp; tln->tln_tp = tp; } /* * Drop the INP lock for a bit. We don't need it, and dropping * it prevents lock order reversals. */ INP_WUNLOCK(inp); /* Make sure we have at least a read lock on the tree. */ tcp_log_id_validate_tree_lock(tree_locked); if (tree_locked == TREE_UNLOCKED) { TCPID_TREE_RLOCK(); tree_locked = TREE_RLOCKED; } refind: /* * Remember that we constructed (struct tcp_log_id_bucket) so * we can safely cast the id to it for the purposes of finding. */ KASSERT(tlb == NULL, ("%s:%d tlb unexpectedly non-NULL", __func__, __LINE__)); tmp_tlb = RB_FIND(tcp_log_id_tree, &tcp_log_id_head, (struct tcp_log_id_bucket *) id); /* * If we didn't find a matching bucket, we need to add a new * one. This requires a write lock. But, of course, we will * need to recheck some things when we re-acquire the lock. */ if (tmp_tlb == NULL && tree_locked != TREE_WLOCKED) { tree_locked = TREE_WLOCKED; if (!TCPID_TREE_UPGRADE()) { TCPID_TREE_RUNLOCK(); TCPID_TREE_WLOCK(); /* * The tree may have changed while we were * unlocked. */ goto refind; } } /* If we need to add a new bucket, do it now. */ if (tmp_tlb == NULL) { /* Allocate new bucket. */ tlb = uma_zalloc(tcp_log_bucket_zone, M_NOWAIT); if (tlb == NULL) { rv = ENOBUFS; goto done_noinp; } /* * Copy the ID to the bucket. * NB: Don't use strlcpy() unless you are sure * we've always validated NULL termination. * * TODO: When I'm done writing this, see if we * have correctly validated NULL termination and * can use strlcpy(). :-) */ strncpy(tlb->tlb_id, id, TCP_LOG_ID_LEN - 1); tlb->tlb_id[TCP_LOG_ID_LEN - 1] = '\0'; /* * Take the refcount for the first node and go ahead * and lock this.
Note that we zero the tlb_mtx * structure, since 0xdeadc0de flips the right bits * for the code to think that this mutex has already * been initialized. :-( */ SLIST_INIT(&tlb->tlb_head); refcount_init(&tlb->tlb_refcnt, 1); memset(&tlb->tlb_mtx, 0, sizeof(struct mtx)); TCPID_BUCKET_LOCK_INIT(tlb); TCPID_BUCKET_LOCK(tlb); bucket_locked = true; #define FREE_NEW_TLB() do { \ TCPID_BUCKET_LOCK_DESTROY(tlb); \ uma_zfree(tcp_log_bucket_zone, tlb); \ bucket_locked = false; \ tlb = NULL; \ } while (0) /* * Relock the INP and make sure we are still * unassigned. */ INP_WLOCK(inp); RECHECK_INP_CLEAN(FREE_NEW_TLB()); if (tp->t_lib != NULL) { FREE_NEW_TLB(); goto restart; } /* Add the new bucket to the tree. */ tmp_tlb = RB_INSERT(tcp_log_id_tree, &tcp_log_id_head, tlb); KASSERT(tmp_tlb == NULL, ("%s: Unexpected conflicting bucket (%p) while " "adding new bucket (%p)", __func__, tmp_tlb, tlb)); /* * If we found a conflicting bucket, free the new * one we made and fall through to use the existing * bucket. */ if (tmp_tlb != NULL) { FREE_NEW_TLB(); INP_WUNLOCK(inp); } #undef FREE_NEW_TLB } /* If we found an existing bucket, use it. */ if (tmp_tlb != NULL) { tlb = tmp_tlb; TCPID_BUCKET_LOCK(tlb); bucket_locked = true; /* * Relock the INP and make sure we are still * unassigned. */ INP_UNLOCK_ASSERT(inp); INP_WLOCK(inp); RECHECK_INP(); if (tp->t_lib != NULL) { TCPID_BUCKET_UNLOCK(tlb); tlb = NULL; goto restart; } /* Take a reference on the bucket. */ TCPID_BUCKET_REF(tlb); } tcp_log_grow_tlb(tlb->tlb_id, tp); /* Add the new node to the list. */ SLIST_INSERT_HEAD(&tlb->tlb_head, tln, tln_list); tp->t_lib = tlb; tp->t_lin = tln; tln = NULL; } rv = 0; done: /* Unlock things, as needed, and return. */ INP_WUNLOCK(inp); done_noinp: INP_UNLOCK_ASSERT(inp); if (bucket_locked) { TCPID_BUCKET_LOCK_ASSERT(tlb); TCPID_BUCKET_UNLOCK(tlb); } else if (tlb != NULL) TCPID_BUCKET_UNLOCK_ASSERT(tlb); if (tree_locked == TREE_WLOCKED) { TCPID_TREE_WLOCK_ASSERT(); TCPID_TREE_WUNLOCK(); } else if (tree_locked == TREE_RLOCKED) { TCPID_TREE_RLOCK_ASSERT(); TCPID_TREE_RUNLOCK(); } else TCPID_TREE_UNLOCK_ASSERT(); if (tln != NULL) uma_zfree(tcp_log_node_zone, tln); return (rv); } /* * Get the TCP log ID for a TCPCB. * Called with INPCB locked. * 'buf' must point to a buffer that is at least TCP_LOG_ID_LEN bytes long. * Returns number of bytes copied. */ size_t tcp_log_get_id(struct tcpcb *tp, char *buf) { size_t len; INP_LOCK_ASSERT(tp->t_inpcb); if (tp->t_lib != NULL) { len = strlcpy(buf, tp->t_lib->tlb_id, TCP_LOG_ID_LEN); KASSERT(len < TCP_LOG_ID_LEN, ("%s:%d: tp->t_lib->tlb_id too long (%zu)", __func__, __LINE__, len)); } else { *buf = '\0'; len = 0; } return (len); } /* * Get number of connections with the same log ID. * Log ID is taken from given TCPCB. * Called with INPCB locked. */ u_int tcp_log_get_id_cnt(struct tcpcb *tp) { INP_WLOCK_ASSERT(tp->t_inpcb); return ((tp->t_lib == NULL) ? 0 : tp->t_lib->tlb_refcnt); } #ifdef TCPLOG_DEBUG_RINGBUF /* * Functions/macros to increment/decrement reference count for a log * entry. This should catch when we do a double-free/double-remove or * a double-add. 
*/ static inline void _tcp_log_entry_refcnt_add(struct tcp_log_mem *log_entry, const char *func, int line) { int refcnt; refcnt = atomic_fetchadd_int(&log_entry->tlm_refcnt, 1); if (refcnt != 0) panic("%s:%d: log_entry(%p)->tlm_refcnt is %d (expected 0)", func, line, log_entry, refcnt); } #define tcp_log_entry_refcnt_add(l) \ _tcp_log_entry_refcnt_add((l), __func__, __LINE__) static inline void _tcp_log_entry_refcnt_rem(struct tcp_log_mem *log_entry, const char *func, int line) { int refcnt; refcnt = atomic_fetchadd_int(&log_entry->tlm_refcnt, -1); if (refcnt != 1) panic("%s:%d: log_entry(%p)->tlm_refcnt is %d (expected 1)", func, line, log_entry, refcnt); } #define tcp_log_entry_refcnt_rem(l) \ _tcp_log_entry_refcnt_rem((l), __func__, __LINE__) #else /* !TCPLOG_DEBUG_RINGBUF */ #define tcp_log_entry_refcnt_add(l) #define tcp_log_entry_refcnt_rem(l) #endif /* * Cleanup after removing a log entry, but only decrement the count if we * are running INVARIANTS. */ static inline void tcp_log_free_log_common(struct tcp_log_mem *log_entry, int *count __unused) { uma_zfree(tcp_log_zone, log_entry); #ifdef INVARIANTS (*count)--; KASSERT(*count >= 0, ("%s: count unexpectedly negative", __func__)); #endif } static void tcp_log_free_entries(struct tcp_log_stailq *head, int *count) { struct tcp_log_mem *log_entry; /* Free the entries. */ while ((log_entry = STAILQ_FIRST(head)) != NULL) { STAILQ_REMOVE_HEAD(head, tlm_queue); tcp_log_entry_refcnt_rem(log_entry); tcp_log_free_log_common(log_entry, count); } } /* Cleanup after removing a log entry. */ static inline void tcp_log_remove_log_cleanup(struct tcpcb *tp, struct tcp_log_mem *log_entry) { uma_zfree(tcp_log_zone, log_entry); tp->t_lognum--; KASSERT(tp->t_lognum >= 0, ("%s: tp->t_lognum unexpectedly negative", __func__)); } /* Remove a log entry from the head of a list. */ static inline void tcp_log_remove_log_head(struct tcpcb *tp, struct tcp_log_mem *log_entry) { KASSERT(log_entry == STAILQ_FIRST(&tp->t_logs), ("%s: attempt to remove non-HEAD log entry", __func__)); STAILQ_REMOVE_HEAD(&tp->t_logs, tlm_queue); tcp_log_entry_refcnt_rem(log_entry); tcp_log_remove_log_cleanup(tp, log_entry); } #ifdef TCPLOG_DEBUG_RINGBUF /* * Initialize the log entry's reference count, which we want to * survive allocations. */ static int tcp_log_zone_init(void *mem, int size, int flags __unused) { struct tcp_log_mem *tlm; KASSERT(size >= sizeof(struct tcp_log_mem), ("%s: unexpectedly short (%d) allocation", __func__, size)); tlm = (struct tcp_log_mem *)mem; tlm->tlm_refcnt = 0; return (0); } /* * Double check that the refcnt is zero on allocation and return. */ static int tcp_log_zone_ctor(void *mem, int size, void *args __unused, int flags __unused) { struct tcp_log_mem *tlm; KASSERT(size >= sizeof(struct tcp_log_mem), ("%s: unexpectedly short (%d) allocation", __func__, size)); tlm = (struct tcp_log_mem *)mem; if (tlm->tlm_refcnt != 0) panic("%s:%d: tlm(%p)->tlm_refcnt is %d (expected 0)", __func__, __LINE__, tlm, tlm->tlm_refcnt); return (0); } static void tcp_log_zone_dtor(void *mem, int size, void *args __unused) { struct tcp_log_mem *tlm; KASSERT(size >= sizeof(struct tcp_log_mem), ("%s: unexpectedly short (%d) allocation", __func__, size)); tlm = (struct tcp_log_mem *)mem; if (tlm->tlm_refcnt != 0) panic("%s:%d: tlm(%p)->tlm_refcnt is %d (expected 0)", __func__, __LINE__, tlm, tlm->tlm_refcnt); } #endif /* TCPLOG_DEBUG_RINGBUF */ /* Do global initialization. 
*/ void tcp_log_init(void) { tcp_log_zone = uma_zcreate("tcp_log", sizeof(struct tcp_log_mem), #ifdef TCPLOG_DEBUG_RINGBUF tcp_log_zone_ctor, tcp_log_zone_dtor, tcp_log_zone_init, #else NULL, NULL, NULL, #endif NULL, UMA_ALIGN_PTR, 0); (void)uma_zone_set_max(tcp_log_zone, TCP_LOG_BUF_DEFAULT_GLOBAL_LIMIT); tcp_log_bucket_zone = uma_zcreate("tcp_log_bucket", sizeof(struct tcp_log_id_bucket), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); tcp_log_node_zone = uma_zcreate("tcp_log_node", sizeof(struct tcp_log_id_node), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); #ifdef TCPLOG_DEBUG_COUNTERS tcp_log_queued = counter_u64_alloc(M_WAITOK); tcp_log_que_fail1 = counter_u64_alloc(M_WAITOK); tcp_log_que_fail2 = counter_u64_alloc(M_WAITOK); tcp_log_que_fail3 = counter_u64_alloc(M_WAITOK); tcp_log_que_fail4 = counter_u64_alloc(M_WAITOK); tcp_log_que_fail5 = counter_u64_alloc(M_WAITOK); tcp_log_que_copyout = counter_u64_alloc(M_WAITOK); tcp_log_que_read = counter_u64_alloc(M_WAITOK); tcp_log_que_freed = counter_u64_alloc(M_WAITOK); #endif rw_init_flags(&tcp_id_tree_lock, "TCP ID tree", RW_NEW); mtx_init(&tcp_log_expireq_mtx, "TCP log expireq", NULL, MTX_DEF); callout_init(&tcp_log_expireq_callout, 1); } /* Do per-TCPCB initialization. */ void tcp_log_tcpcbinit(struct tcpcb *tp) { /* A new TCPCB should start out zero-initialized. */ STAILQ_INIT(&tp->t_logs); /* * If we are doing auto-capturing, figure out whether we will capture * this session. */ if (tcp_log_selectauto()) { tp->t_logstate = tcp_log_auto_mode; tp->t_flags2 |= TF2_LOG_AUTO; } } /* Remove entries */ static void tcp_log_expire(void *unused __unused) { struct tcp_log_id_bucket *tlb; struct tcp_log_id_node *tln; sbintime_t expiry_limit; int tree_locked; TCPLOG_EXPIREQ_LOCK(); if (callout_pending(&tcp_log_expireq_callout)) { /* Callout was reset. */ TCPLOG_EXPIREQ_UNLOCK(); return; } /* * Process entries until we reach one that expires too far in the * future. Look one second in the future. */ expiry_limit = getsbinuptime() + SBT_1S; tree_locked = TREE_UNLOCKED; while ((tln = STAILQ_FIRST(&tcp_log_expireq_head)) != NULL && tln->tln_expiretime <= expiry_limit) { if (!callout_active(&tcp_log_expireq_callout)) { /* * Callout was stopped. I guess we should * just quit at this point. */ TCPLOG_EXPIREQ_UNLOCK(); return; } /* * Remove the node from the head of the list and unlock * the list. Change the expiry time to SBT_MAX as a signal * to other threads that we now own this. */ STAILQ_REMOVE_HEAD(&tcp_log_expireq_head, tln_expireq); tln->tln_expiretime = SBT_MAX; TCPLOG_EXPIREQ_UNLOCK(); /* * Remove the node from the bucket. */ tlb = tln->tln_bucket; TCPID_BUCKET_LOCK(tlb); if (tcp_log_remove_id_node(NULL, NULL, tlb, tln, &tree_locked)) { tcp_log_id_validate_tree_lock(tree_locked); if (tree_locked == TREE_WLOCKED) TCPID_TREE_WUNLOCK(); else TCPID_TREE_RUNLOCK(); tree_locked = TREE_UNLOCKED; } /* Drop the INP reference. */ INP_WLOCK(tln->tln_inp); if (!in_pcbrele_wlocked(tln->tln_inp)) INP_WUNLOCK(tln->tln_inp); /* Free the log records. */ tcp_log_free_entries(&tln->tln_entries, &tln->tln_count); /* Free the node. */ uma_zfree(tcp_log_node_zone, tln); /* Relock the expiry queue. */ TCPLOG_EXPIREQ_LOCK(); } /* * We've expired all the entries we can. Do we need to reschedule * ourselves? */ callout_deactivate(&tcp_log_expireq_callout); if (tln != NULL) { /* * Get max(now + TCP_LOG_EXPIRE_INTVL, tln->tln_expiretime) and * set the next callout to that. (This helps ensure we generally * run the callout no more often than desired.) 
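* For instance, assuming TCP_LOG_EXPIRE_INTVL were 5 seconds (a value chosen * purely for illustration): with now = 100s, an entry expiring at 102s * reschedules the callout for 105s, while an entry expiring at 110s * reschedules it for 110s.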
*/ expiry_limit = getsbinuptime() + TCP_LOG_EXPIRE_INTVL; if (expiry_limit < tln->tln_expiretime) expiry_limit = tln->tln_expiretime; callout_reset_sbt(&tcp_log_expireq_callout, expiry_limit, SBT_1S, tcp_log_expire, NULL, C_ABSOLUTE); } /* We're done. */ TCPLOG_EXPIREQ_UNLOCK(); return; } /* * Move log data from the TCPCB to a new node. This will reset the TCPCB log * entries and log count; however, it will not touch other things from the * TCPCB (e.g. t_lin, t_lib). * * NOTE: Must hold a lock on the INP. */ static void tcp_log_move_tp_to_node(struct tcpcb *tp, struct tcp_log_id_node *tln) { INP_WLOCK_ASSERT(tp->t_inpcb); tln->tln_ie = tp->t_inpcb->inp_inc.inc_ie; if (tp->t_inpcb->inp_inc.inc_flags & INC_ISIPV6) tln->tln_af = AF_INET6; else tln->tln_af = AF_INET; tln->tln_entries = tp->t_logs; tln->tln_count = tp->t_lognum; tln->tln_bucket = tp->t_lib; /* Clear information from the PCB. */ STAILQ_INIT(&tp->t_logs); tp->t_lognum = 0; } /* Do per-TCPCB cleanup */ void tcp_log_tcpcbfini(struct tcpcb *tp) { struct tcp_log_id_node *tln, *tln_first; struct tcp_log_mem *log_entry; sbintime_t callouttime; INP_WLOCK_ASSERT(tp->t_inpcb); /* * If we were gathering packets to be automatically dumped, try to do * it now. If this succeeds, the log information in the TCPCB will be * cleared. Otherwise, we'll handle the log information as we do * for other states. */ switch(tp->t_logstate) { case TCP_LOG_STATE_HEAD_AUTO: (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from head", M_NOWAIT, false); break; case TCP_LOG_STATE_TAIL_AUTO: (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from tail", M_NOWAIT, false); break; case TCP_LOG_STATE_CONTINUAL: (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from continual", M_NOWAIT, false); break; } /* * There are two ways we could keep logs: per-socket or per-ID. If * we are tracking logs with an ID, then the logs survive the * destruction of the TCPCB. * * If the TCPCB is associated with an ID node, move the logs from the * TCPCB to the ID node. In theory, this is safe, for reasons which I * will now explain for my own benefit when I next need to figure out * this code. :-) * * We own the INP lock. Therefore, no one else can change the contents * of this node (Rule C). Further, no one can remove this node from * the bucket while we hold the lock (Rule D). Basically, no one can * mess with this node. That leaves two states in which we could be: * * 1. Another thread is currently waiting to acquire the INP lock, with * plans to do something with this node. When we drop the INP lock, * they will have a chance to do that. They will recheck the * tln_closed field (see note to Rule C) and then acquire the * bucket lock before proceeding further. * * 2. Another thread will try to acquire a lock at some point in the * future. If they try to acquire a lock before we set the * tln_closed field, they will follow state #1. If they try to * acquire a lock after we set the tln_closed field, they will be * able to make changes to the node, at will, following Rule C. * * Therefore, we currently own this node and can make any changes * we want. But, as soon as we set the tln_closed field to true, we * have effectively dropped our lock on the node. (For this reason, we * also need to make sure our writes are ordered correctly. An atomic * operation with "release" semantics should be sufficient.) */ if (tp->t_lin != NULL) { /* Copy the relevant information to the log entry. 
*/ tln = tp->t_lin; KASSERT(tln->tln_inp == tp->t_inpcb, ("%s: Mismatched inp (tln->tln_inp=%p, tp->t_inpcb=%p)", __func__, tln->tln_inp, tp->t_inpcb)); tcp_log_move_tp_to_node(tp, tln); /* Clear information from the PCB. */ tp->t_lin = NULL; tp->t_lib = NULL; /* * Take a reference on the INP. This ensures that the INP * remains valid while the node is on the expiry queue. This * ensures the INP is valid for other threads that may be * racing to lock this node when we move it to the expire * queue. */ in_pcbref(tp->t_inpcb); /* * Store the entry on the expiry list. The exact behavior * depends on whether we have entries to keep. If so, we * put the entry at the tail of the list and expire in * TCP_LOG_EXPIRE_TIME. Otherwise, we expire "now" and put * the entry at the head of the list. (Handling the cleanup * via the expiry timer lets us avoid locking messy-ness here.) */ tln->tln_expiretime = getsbinuptime(); TCPLOG_EXPIREQ_LOCK(); if (tln->tln_count) { tln->tln_expiretime += TCP_LOG_EXPIRE_TIME; if (STAILQ_EMPTY(&tcp_log_expireq_head) && !callout_active(&tcp_log_expireq_callout)) { /* * We are adding the first entry and a callout * is not currently scheduled; therefore, we * need to schedule one. */ callout_reset_sbt(&tcp_log_expireq_callout, tln->tln_expiretime, SBT_1S, tcp_log_expire, NULL, C_ABSOLUTE); } STAILQ_INSERT_TAIL(&tcp_log_expireq_head, tln, tln_expireq); } else { callouttime = tln->tln_expiretime + TCP_LOG_EXPIRE_INTVL; tln_first = STAILQ_FIRST(&tcp_log_expireq_head); if ((tln_first == NULL || callouttime < tln_first->tln_expiretime) && (callout_pending(&tcp_log_expireq_callout) || !callout_active(&tcp_log_expireq_callout))) { /* * The list is empty, or we want to run the * expire code before the first entry's timer * fires. Also, we are in a case where a callout * is not actively running. We want to reset * the callout to occur sooner. */ callout_reset_sbt(&tcp_log_expireq_callout, callouttime, SBT_1S, tcp_log_expire, NULL, C_ABSOLUTE); } /* * Insert to the head, or just after the head, as * appropriate. (This might result in small * mis-orderings as a bunch of "expire now" entries * gather at the start of the list, but that should * not produce big problems, since the expire timer * will walk through all of them.) */ if (tln_first == NULL || tln->tln_expiretime < tln_first->tln_expiretime) STAILQ_INSERT_HEAD(&tcp_log_expireq_head, tln, tln_expireq); else STAILQ_INSERT_AFTER(&tcp_log_expireq_head, tln_first, tln, tln_expireq); } TCPLOG_EXPIREQ_UNLOCK(); /* * We are done messing with the tln. After this point, we * can't touch it. (Note that the "release" semantics should * be included with the TCPLOG_EXPIREQ_UNLOCK() call above. * Therefore, they should be unnecessary here. However, it * seems like a good idea to include them anyway, since we * really are releasing a lock here.) */ atomic_store_rel_int(&tln->tln_closed, 1); } else { /* Remove log entries. */ while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL) tcp_log_remove_log_head(tp, log_entry); KASSERT(tp->t_lognum == 0, ("%s: After freeing entries, tp->t_lognum=%d (expected 0)", __func__, tp->t_lognum)); } /* * Change the log state to off (just in case anything tries to sneak * in a last-minute log). */ tp->t_logstate = TCP_LOG_STATE_OFF; } /* * This logs an event for a TCP socket. Normally, this is called via * TCP_LOG_EVENT or TCP_LOG_EVENT_VERBOSE. See the documentation for * TCP_LOG_EVENT(). 
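* As a concrete example, tcp_log_flowend() at the end of this file records * an event with no TCP header and no stack-specific information: * * TCP_LOG_EVENT(tp, NULL, &so->so_rcv, &so->so_snd, * TCP_LOG_FLOWEND, 0, 0, NULL, false);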
*/ struct tcp_log_buffer * tcp_log_event_(struct tcpcb *tp, struct tcphdr *th, struct sockbuf *rxbuf, struct sockbuf *txbuf, uint8_t eventid, int errornum, uint32_t len, union tcp_log_stackspecific *stackinfo, int th_hostorder, const char *output_caller, const char *func, int line, const struct timeval *itv) { struct tcp_log_mem *log_entry; struct tcp_log_buffer *log_buf; int attempt_count = 0; struct tcp_log_verbose *log_verbose; uint32_t logsn; KASSERT((func == NULL && line == 0) || (func != NULL && line > 0), ("%s called with inconsistent func (%p) and line (%d) arguments", __func__, func, line)); INP_WLOCK_ASSERT(tp->t_inpcb); KASSERT(tp->t_logstate == TCP_LOG_STATE_HEAD || tp->t_logstate == TCP_LOG_STATE_TAIL || tp->t_logstate == TCP_LOG_STATE_CONTINUAL || tp->t_logstate == TCP_LOG_STATE_HEAD_AUTO || tp->t_logstate == TCP_LOG_STATE_TAIL_AUTO, ("%s called with unexpected tp->t_logstate (%d)", __func__, tp->t_logstate)); /* * Get the serial number. We do this early so it will * increment even if we end up skipping the log entry for some * reason. */ logsn = tp->t_logsn++; /* * Can we get a new log entry? If so, increment the lognum counter * here. */ retry: if (tp->t_lognum < tcp_log_session_limit) { if ((log_entry = uma_zalloc(tcp_log_zone, M_NOWAIT)) != NULL) tp->t_lognum++; } else log_entry = NULL; /* Do we need to try to reuse? */ if (log_entry == NULL) { /* * Sacrifice auto-logged sessions without a log ID if * tcp_log_auto_all is false. (If they don't have a log * ID by now, it is probable that either they won't get one * or we are resource-constrained.) */ if (tp->t_lib == NULL && (tp->t_flags2 & TF2_LOG_AUTO) && !tcp_log_auto_all) { if (tcp_log_state_change(tp, TCP_LOG_STATE_CLEAR)) { #ifdef INVARIANTS panic("%s:%d: tcp_log_state_change() failed " "to set tp %p to TCP_LOG_STATE_CLEAR", __func__, __LINE__, tp); #endif tp->t_logstate = TCP_LOG_STATE_OFF; } return (NULL); } /* * If we are in TCP_LOG_STATE_HEAD_AUTO state, try to dump * the buffers. If successful, deactivate tracing. Otherwise, * leave it active so we will retry. */ if (tp->t_logstate == TCP_LOG_STATE_HEAD_AUTO && !tcp_log_dump_tp_logbuf(tp, "auto-dumped from head", M_NOWAIT, false)) { tp->t_logstate = TCP_LOG_STATE_OFF; return(NULL); } else if ((tp->t_logstate == TCP_LOG_STATE_CONTINUAL) && !tcp_log_dump_tp_logbuf(tp, "auto-dumped from continual", M_NOWAIT, false)) { if (attempt_count == 0) { attempt_count++; goto retry; } #ifdef TCPLOG_DEBUG_COUNTERS counter_u64_add(tcp_log_que_fail4, 1); #endif return(NULL); } else if (tp->t_logstate == TCP_LOG_STATE_HEAD_AUTO) return(NULL); /* If in HEAD state, just deactivate the tracing and return. */ if (tp->t_logstate == TCP_LOG_STATE_HEAD) { tp->t_logstate = TCP_LOG_STATE_OFF; return(NULL); } /* * Get a buffer to reuse. If that fails, just give up. * (We can't log anything without a buffer in which to * put it.) * * Note that we don't change the t_lognum counter * here. Because we are re-using the buffer, the total * number won't change. */ if ((log_entry = STAILQ_FIRST(&tp->t_logs)) == NULL) return(NULL); STAILQ_REMOVE_HEAD(&tp->t_logs, tlm_queue); tcp_log_entry_refcnt_rem(log_entry); } KASSERT(log_entry != NULL, ("%s: log_entry unexpectedly NULL", __func__)); /* Extract the log buffer and verbose buffer pointers. */ log_buf = &log_entry->tlm_buf; log_verbose = &log_entry->tlm_v; /* Basic entries. 
*/ if (itv == NULL) getmicrouptime(&log_buf->tlb_tv); else memcpy(&log_buf->tlb_tv, itv, sizeof(struct timeval)); log_buf->tlb_ticks = ticks; log_buf->tlb_sn = logsn; log_buf->tlb_stackid = tp->t_fb->tfb_id; log_buf->tlb_eventid = eventid; log_buf->tlb_eventflags = 0; log_buf->tlb_errno = errornum; /* Socket buffers */ if (rxbuf != NULL) { log_buf->tlb_eventflags |= TLB_FLAG_RXBUF; log_buf->tlb_rxbuf.tls_sb_acc = rxbuf->sb_acc; log_buf->tlb_rxbuf.tls_sb_ccc = rxbuf->sb_ccc; log_buf->tlb_rxbuf.tls_sb_spare = 0; } if (txbuf != NULL) { log_buf->tlb_eventflags |= TLB_FLAG_TXBUF; log_buf->tlb_txbuf.tls_sb_acc = txbuf->sb_acc; log_buf->tlb_txbuf.tls_sb_ccc = txbuf->sb_ccc; log_buf->tlb_txbuf.tls_sb_spare = 0; } /* Copy values from tp to the log entry. */ #define COPY_STAT(f) log_buf->tlb_ ## f = tp->f #define COPY_STAT_T(f) log_buf->tlb_ ## f = tp->t_ ## f COPY_STAT_T(state); COPY_STAT_T(starttime); COPY_STAT(iss); COPY_STAT_T(flags); COPY_STAT(snd_una); COPY_STAT(snd_max); COPY_STAT(snd_cwnd); COPY_STAT(snd_nxt); COPY_STAT(snd_recover); COPY_STAT(snd_wnd); COPY_STAT(snd_ssthresh); COPY_STAT_T(srtt); COPY_STAT_T(rttvar); COPY_STAT(rcv_up); COPY_STAT(rcv_adv); COPY_STAT(rcv_nxt); COPY_STAT(sack_newdata); COPY_STAT(rcv_wnd); COPY_STAT_T(dupacks); COPY_STAT_T(segqlen); COPY_STAT(snd_numholes); COPY_STAT(snd_scale); COPY_STAT(rcv_scale); #undef COPY_STAT #undef COPY_STAT_T log_buf->tlb_flex1 = 0; log_buf->tlb_flex2 = 0; /* Copy stack-specific info. */ if (stackinfo != NULL) { memcpy(&log_buf->tlb_stackinfo, stackinfo, sizeof(log_buf->tlb_stackinfo)); log_buf->tlb_eventflags |= TLB_FLAG_STACKINFO; } /* The packet */ log_buf->tlb_len = len; if (th) { int optlen; log_buf->tlb_eventflags |= TLB_FLAG_HDR; log_buf->tlb_th = *th; if (th_hostorder) tcp_fields_to_net(&log_buf->tlb_th); optlen = (th->th_off << 2) - sizeof (struct tcphdr); if (optlen > 0) memcpy(log_buf->tlb_opts, th + 1, optlen); } /* Verbose information */ if (func != NULL) { log_buf->tlb_eventflags |= TLB_FLAG_VERBOSE; if (output_caller != NULL) strlcpy(log_verbose->tlv_snd_frm, output_caller, TCP_FUNC_LEN); else *log_verbose->tlv_snd_frm = 0; strlcpy(log_verbose->tlv_trace_func, func, TCP_FUNC_LEN); log_verbose->tlv_trace_line = line; } /* Insert the new log at the tail. */ STAILQ_INSERT_TAIL(&tp->t_logs, log_entry, tlm_queue); tcp_log_entry_refcnt_add(log_entry); return (log_buf); } /* * Change the logging state for a TCPCB. Returns 0 on success or an * error code on failure. */ int tcp_log_state_change(struct tcpcb *tp, int state) { struct tcp_log_mem *log_entry; INP_WLOCK_ASSERT(tp->t_inpcb); switch(state) { case TCP_LOG_STATE_CLEAR: while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL) tcp_log_remove_log_head(tp, log_entry); /* Fall through */ case TCP_LOG_STATE_OFF: tp->t_logstate = TCP_LOG_STATE_OFF; break; case TCP_LOG_STATE_TAIL: case TCP_LOG_STATE_HEAD: case TCP_LOG_STATE_CONTINUAL: case TCP_LOG_STATE_HEAD_AUTO: case TCP_LOG_STATE_TAIL_AUTO: tp->t_logstate = state; break; default: return (EINVAL); } tp->t_flags2 &= ~(TF2_LOG_AUTO); return (0); } /* If tcp_drain() is called, flush half the log entries. */ void tcp_log_drain(struct tcpcb *tp) { struct tcp_log_mem *log_entry, *next; int target, skip; INP_WLOCK_ASSERT(tp->t_inpcb); if ((target = tp->t_lognum / 2) == 0) return; /* * If we are logging the "head" packets, we want to discard * from the tail of the queue. Otherwise, we want to discard * from the head. 
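* For example (illustrative numbers only): with tp->t_lognum == 100, target * is 50. When capturing the head, we keep the oldest 50 entries and free * everything queued after them; otherwise, we free the oldest 50 entries and * keep the newest.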
if (tp->t_logstate == TCP_LOG_STATE_HEAD || tp->t_logstate == TCP_LOG_STATE_HEAD_AUTO) { skip = tp->t_lognum - target; STAILQ_FOREACH(log_entry, &tp->t_logs, tlm_queue) if (!--skip) break; KASSERT(log_entry != NULL, ("%s: skipped through all entries!", __func__)); if (log_entry == NULL) return; while ((next = STAILQ_NEXT(log_entry, tlm_queue)) != NULL) { STAILQ_REMOVE_AFTER(&tp->t_logs, log_entry, tlm_queue); tcp_log_entry_refcnt_rem(next); tcp_log_remove_log_cleanup(tp, next); #ifdef INVARIANTS target--; #endif } KASSERT(target == 0, ("%s: After removing from tail, target was %d", __func__, target)); } else if (tp->t_logstate == TCP_LOG_STATE_CONTINUAL) { (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from continual", M_NOWAIT, false); } else { while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL && target--) tcp_log_remove_log_head(tp, log_entry); KASSERT(target <= 0, ("%s: After removing from head, target was %d", __func__, target)); KASSERT(tp->t_lognum > 0, ("%s: After removing from head, tp->t_lognum was %d", __func__, tp->t_lognum)); KASSERT(log_entry != NULL, ("%s: After removing from head, the tailq was empty", __func__)); } } static inline int tcp_log_copyout(struct sockopt *sopt, void *src, void *dst, size_t len) { if (sopt->sopt_td != NULL) return (copyout(src, dst, len)); bcopy(src, dst, len); return (0); } static int tcp_log_logs_to_buf(struct sockopt *sopt, struct tcp_log_stailq *log_tailqp, struct tcp_log_buffer **end, int count) { struct tcp_log_buffer *out_entry; struct tcp_log_mem *log_entry; size_t entrysize; int error; #ifdef INVARIANTS int orig_count = count; #endif /* Copy the data out. */ error = 0; out_entry = (struct tcp_log_buffer *) sopt->sopt_val; STAILQ_FOREACH(log_entry, log_tailqp, tlm_queue) { count--; KASSERT(count >= 0, ("%s:%d: Exceeded expected count (%d) processing list %p", __func__, __LINE__, orig_count, log_tailqp)); #ifdef TCPLOG_DEBUG_COUNTERS counter_u64_add(tcp_log_que_copyout, 1); #endif /* * Skip copying out the header if it isn't present. * Instead, copy out zeros (to ensure we don't leak info). * TODO: Make sure we truly do zero everything we don't * explicitly set. */ if (log_entry->tlm_buf.tlb_eventflags & TLB_FLAG_HDR) entrysize = sizeof(struct tcp_log_buffer); else entrysize = offsetof(struct tcp_log_buffer, tlb_th); error = tcp_log_copyout(sopt, &log_entry->tlm_buf, out_entry, entrysize); if (error) break; if (!(log_entry->tlm_buf.tlb_eventflags & TLB_FLAG_HDR)) { error = tcp_log_copyout(sopt, zerobuf, ((uint8_t *)out_entry) + entrysize, sizeof(struct tcp_log_buffer) - entrysize); } /* * Copy out the verbose bit, if needed. Either way, * increment the output pointer the correct amount. */ if (log_entry->tlm_buf.tlb_eventflags & TLB_FLAG_VERBOSE) { error = tcp_log_copyout(sopt, &log_entry->tlm_v, out_entry->tlb_verbose, sizeof(struct tcp_log_verbose)); if (error) break; out_entry = (struct tcp_log_buffer *) (((uint8_t *) (out_entry + 1)) + sizeof(struct tcp_log_verbose)); } else out_entry++; } *end = out_entry; KASSERT(error || count == 0, ("%s:%d: Less than expected count (%d) processing list %p" " (%d remain)", __func__, __LINE__, orig_count, log_tailqp, count)); return (error); } /* * Copy out the buffer. Note that we do incremental copying, so * sooptcopyout() won't work. However, the goal is to produce the same * end result as if we copied in the entire user buffer, updated it, * and then used sooptcopyout() to copy it out.
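* * From userspace, this is reached via getsockopt(2). A minimal sketch, * assuming the TCP_LOGBUF socket option from netinet/tcp.h (error handling * omitted), might look like: * * len = 0; * getsockopt(s, IPPROTO_TCP, TCP_LOGBUF, NULL, &len); (poll the size) * buf = malloc(len); * getsockopt(s, IPPROTO_TCP, TCP_LOGBUF, buf, &len);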
* * NOTE: This should be called with a write lock on the PCB; however, * the function will drop it after it extracts the data from the TCPCB. */ int tcp_log_getlogbuf(struct sockopt *sopt, struct tcpcb *tp) { struct tcp_log_stailq log_tailq; struct tcp_log_mem *log_entry, *log_next; struct tcp_log_buffer *out_entry; struct inpcb *inp; size_t outsize, entrysize; int error, outnum; INP_WLOCK_ASSERT(tp->t_inpcb); inp = tp->t_inpcb; /* * Determine which log entries will fit in the buffer. As an * optimization, skip this if all the entries will clearly fit * in the buffer. (However, get an exact size if we are using * INVARIANTS.) */ #ifndef INVARIANTS if (sopt->sopt_valsize / (sizeof(struct tcp_log_buffer) + sizeof(struct tcp_log_verbose)) >= tp->t_lognum) { log_entry = STAILQ_LAST(&tp->t_logs, tcp_log_mem, tlm_queue); log_next = NULL; outsize = 0; outnum = tp->t_lognum; } else { #endif outsize = outnum = 0; log_entry = NULL; STAILQ_FOREACH(log_next, &tp->t_logs, tlm_queue) { entrysize = sizeof(struct tcp_log_buffer); if (log_next->tlm_buf.tlb_eventflags & TLB_FLAG_VERBOSE) entrysize += sizeof(struct tcp_log_verbose); if ((sopt->sopt_valsize - outsize) < entrysize) break; outsize += entrysize; outnum++; log_entry = log_next; } KASSERT(outsize <= sopt->sopt_valsize, ("%s: calculated output size (%zu) greater than available " "space (%zu)", __func__, outsize, sopt->sopt_valsize)); #ifndef INVARIANTS } #endif /* * Copy traditional sooptcopyout() behavior: if sopt->sopt_val * is NULL, silently skip the copy. However, in this case, we * will leave the list alone and return. Functionally, this * gives userspace a way to poll for an approximate buffer * size they will need to get the log entries. */ if (sopt->sopt_val == NULL) { INP_WUNLOCK(inp); if (outsize == 0) { outsize = outnum * (sizeof(struct tcp_log_buffer) + sizeof(struct tcp_log_verbose)); } if (sopt->sopt_valsize > outsize) sopt->sopt_valsize = outsize; return (0); } /* * Break apart the list. We'll save the ones we want to copy * out locally and remove them from the TCPCB list. We can * then drop the INPCB lock while we do the copyout. * * There are roughly three cases: * 1. There was nothing to copy out. That's easy: drop the * lock and return. * 2. We are copying out the entire list. Again, that's easy: * move the whole list. * 3. We are copying out a partial list. That's harder. We * need to update the list book-keeping entries. */ if (log_entry != NULL && log_next == NULL) { /* Move entire list. */ KASSERT(outnum == tp->t_lognum, ("%s:%d: outnum (%d) should match tp->t_lognum (%d)", __func__, __LINE__, outnum, tp->t_lognum)); log_tailq = tp->t_logs; tp->t_lognum = 0; STAILQ_INIT(&tp->t_logs); } else if (log_entry != NULL) { /* Move partial list. */ KASSERT(outnum < tp->t_lognum, ("%s:%d: outnum (%d) not less than tp->t_lognum (%d)", __func__, __LINE__, outnum, tp->t_lognum)); STAILQ_FIRST(&log_tailq) = STAILQ_FIRST(&tp->t_logs); STAILQ_FIRST(&tp->t_logs) = STAILQ_NEXT(log_entry, tlm_queue); KASSERT(STAILQ_NEXT(log_entry, tlm_queue) != NULL, ("%s:%d: tp->t_logs is unexpectedly shorter than expected " "(tp: %p, log_tailq: %p, outnum: %d, tp->t_lognum: %d)", __func__, __LINE__, tp, &log_tailq, outnum, tp->t_lognum)); STAILQ_NEXT(log_entry, tlm_queue) = NULL; log_tailq.stqh_last = &STAILQ_NEXT(log_entry, tlm_queue); tp->t_lognum -= outnum; } else STAILQ_INIT(&log_tailq); /* Drop the PCB lock. */ INP_WUNLOCK(inp); /* Copy the data out.
*/ error = tcp_log_logs_to_buf(sopt, &log_tailq, &out_entry, outnum); if (error) { /* Restore list */ INP_WLOCK(inp); if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0) { tp = intotcpcb(inp); /* Merge the two lists. */ STAILQ_CONCAT(&log_tailq, &tp->t_logs); tp->t_logs = log_tailq; tp->t_lognum += outnum; } INP_WUNLOCK(inp); } else { /* Sanity check entries */ KASSERT(((caddr_t)out_entry - (caddr_t)sopt->sopt_val) == outsize, ("%s: Actual output size (%zu) != " "calculated output size (%zu)", __func__, (size_t)((caddr_t)out_entry - (caddr_t)sopt->sopt_val), outsize)); /* Free the entries we just copied out. */ STAILQ_FOREACH_SAFE(log_entry, &log_tailq, tlm_queue, log_next) { tcp_log_entry_refcnt_rem(log_entry); uma_zfree(tcp_log_zone, log_entry); } } sopt->sopt_valsize = (size_t)((caddr_t)out_entry - (caddr_t)sopt->sopt_val); return (error); } static void tcp_log_free_queue(struct tcp_log_dev_queue *param) { struct tcp_log_dev_log_queue *entry; KASSERT(param != NULL, ("%s: called with NULL param", __func__)); if (param == NULL) return; entry = (struct tcp_log_dev_log_queue *)param; /* Free the entries. */ tcp_log_free_entries(&entry->tldl_entries, &entry->tldl_count); /* Free the buffer, if it is allocated. */ if (entry->tldl_common.tldq_buf != NULL) free(entry->tldl_common.tldq_buf, M_TCPLOGDEV); /* Free the queue entry. */ free(entry, M_TCPLOGDEV); } static struct tcp_log_common_header * tcp_log_expandlogbuf(struct tcp_log_dev_queue *param) { struct tcp_log_dev_log_queue *entry; struct tcp_log_header *hdr; uint8_t *end; struct sockopt sopt; int error; entry = (struct tcp_log_dev_log_queue *)param; /* Take a worst-case guess at space needs. */ sopt.sopt_valsize = sizeof(struct tcp_log_header) + entry->tldl_count * (sizeof(struct tcp_log_buffer) + sizeof(struct tcp_log_verbose)); hdr = malloc(sopt.sopt_valsize, M_TCPLOGDEV, M_NOWAIT); if (hdr == NULL) { #ifdef TCPLOG_DEBUG_COUNTERS counter_u64_add(tcp_log_que_fail5, entry->tldl_count); #endif return (NULL); } sopt.sopt_val = hdr + 1; sopt.sopt_valsize -= sizeof(struct tcp_log_header); sopt.sopt_td = NULL; error = tcp_log_logs_to_buf(&sopt, &entry->tldl_entries, (struct tcp_log_buffer **)&end, entry->tldl_count); if (error) { free(hdr, M_TCPLOGDEV); return (NULL); } /* Free the entries. */ tcp_log_free_entries(&entry->tldl_entries, &entry->tldl_count); entry->tldl_count = 0; memset(hdr, 0, sizeof(struct tcp_log_header)); hdr->tlh_version = TCP_LOG_BUF_VER; hdr->tlh_type = TCP_LOG_DEV_TYPE_BBR; hdr->tlh_length = end - (uint8_t *)hdr; hdr->tlh_ie = entry->tldl_ie; hdr->tlh_af = entry->tldl_af; getboottime(&hdr->tlh_offset); strlcpy(hdr->tlh_id, entry->tldl_id, TCP_LOG_ID_LEN); strlcpy(hdr->tlh_reason, entry->tldl_reason, TCP_LOG_REASON_LEN); return ((struct tcp_log_common_header *)hdr); } /* * Queue the tcpcb's log buffer for transmission via the log buffer facility. * * NOTE: This should be called with a write lock on the PCB. * * how should be M_WAITOK or M_NOWAIT. If M_WAITOK, the function will drop * and reacquire the INP lock if it needs to do so. * * If force is false, this will only dump auto-logged sessions if * tcp_log_auto_all is true or if there is a log ID defined for the session. */ int tcp_log_dump_tp_logbuf(struct tcpcb *tp, char *reason, int how, bool force) { struct tcp_log_dev_log_queue *entry; struct inpcb *inp; #ifdef TCPLOG_DEBUG_COUNTERS int num_entries; #endif inp = tp->t_inpcb; INP_WLOCK_ASSERT(inp); /* If there are no log entries, there is nothing to do. 
*/ if (tp->t_lognum == 0) return (0); /* Check for a log ID. */ if (tp->t_lib == NULL && (tp->t_flags2 & TF2_LOG_AUTO) && !tcp_log_auto_all && !force) { struct tcp_log_mem *log_entry; /* * We needed a log ID and none was found. Free the log entries * and return success. Also, cancel further logging. If the * session doesn't have a log ID by now, we'll assume it isn't * going to get one. */ while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL) tcp_log_remove_log_head(tp, log_entry); KASSERT(tp->t_lognum == 0, ("%s: After freeing entries, tp->t_lognum=%d (expected 0)", __func__, tp->t_lognum)); tp->t_logstate = TCP_LOG_STATE_OFF; return (0); } /* * Allocate memory. If we must wait, we'll need to drop the locks * and reacquire them (and do all the related business that goes * along with that). */ entry = malloc(sizeof(struct tcp_log_dev_log_queue), M_TCPLOGDEV, M_NOWAIT); if (entry == NULL && (how & M_NOWAIT)) { #ifdef TCPLOG_DEBUG_COUNTERS counter_u64_add(tcp_log_que_fail3, 1); #endif return (ENOBUFS); } if (entry == NULL) { INP_WUNLOCK(inp); entry = malloc(sizeof(struct tcp_log_dev_log_queue), M_TCPLOGDEV, M_WAITOK); INP_WLOCK(inp); /* * Note that this check is slightly overly-restrictive in * that the TCB can survive either of these events. * However, there is currently not a good way to ensure * that is the case. So, if we hit this M_WAITOK path, we * may end up dropping some entries. That seems like a * small price to pay for safety. */ if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { free(entry, M_TCPLOGDEV); #ifdef TCPLOG_DEBUG_COUNTERS counter_u64_add(tcp_log_que_fail2, 1); #endif return (ECONNRESET); } tp = intotcpcb(inp); if (tp->t_lognum == 0) { free(entry, M_TCPLOGDEV); return (0); } } /* Fill in the unique parts of the queue entry. */ if (tp->t_lib != NULL) strlcpy(entry->tldl_id, tp->t_lib->tlb_id, TCP_LOG_ID_LEN); else strlcpy(entry->tldl_id, "UNKNOWN", TCP_LOG_ID_LEN); if (reason != NULL) strlcpy(entry->tldl_reason, reason, TCP_LOG_REASON_LEN); else strlcpy(entry->tldl_reason, "UNKNOWN", TCP_LOG_REASON_LEN); entry->tldl_ie = inp->inp_inc.inc_ie; if (inp->inp_inc.inc_flags & INC_ISIPV6) entry->tldl_af = AF_INET6; else entry->tldl_af = AF_INET; entry->tldl_entries = tp->t_logs; entry->tldl_count = tp->t_lognum; /* Fill in the common parts of the queue entry. */ entry->tldl_common.tldq_buf = NULL; entry->tldl_common.tldq_xform = tcp_log_expandlogbuf; entry->tldl_common.tldq_dtor = tcp_log_free_queue; /* Clear the log data from the TCPCB. */ #ifdef TCPLOG_DEBUG_COUNTERS num_entries = tp->t_lognum; #endif tp->t_lognum = 0; STAILQ_INIT(&tp->t_logs); /* Add the entry. If no one is listening, free the entry. */ if (tcp_log_dev_add_log((struct tcp_log_dev_queue *)entry)) { tcp_log_free_queue((struct tcp_log_dev_queue *)entry); #ifdef TCPLOG_DEBUG_COUNTERS counter_u64_add(tcp_log_que_fail1, num_entries); } else { counter_u64_add(tcp_log_queued, num_entries); #endif } return (0); } /* * Queue the log_id_node's log buffers for transmission via the log buffer * facility. * * NOTE: This should be called with the bucket locked and referenced. * * how should be M_WAITOK or M_NOWAIT. If M_WAITOK, the function will drop * and reacquire the bucket lock if it needs to do so. (The caller must * ensure that the tln is no longer on any lists so no one else will mess * with this while the lock is dropped!)
static int tcp_log_dump_node_logbuf(struct tcp_log_id_node *tln, char *reason, int how) { struct tcp_log_dev_log_queue *entry; struct tcp_log_id_bucket *tlb; tlb = tln->tln_bucket; TCPID_BUCKET_LOCK_ASSERT(tlb); KASSERT(tlb->tlb_refcnt > 0, ("%s:%d: Called with unreferenced bucket (tln=%p, tlb=%p)", __func__, __LINE__, tln, tlb)); KASSERT(tln->tln_closed, ("%s:%d: Called for node with tln_closed==false (tln=%p)", __func__, __LINE__, tln)); /* If there are no log entries, there is nothing to do. */ if (tln->tln_count == 0) return (0); /* * Allocate memory. If we must wait, we'll need to drop the locks * and reacquire them (and do all the related business that goes * along with that). */ entry = malloc(sizeof(struct tcp_log_dev_log_queue), M_TCPLOGDEV, M_NOWAIT); if (entry == NULL && (how & M_NOWAIT)) return (ENOBUFS); if (entry == NULL) { TCPID_BUCKET_UNLOCK(tlb); entry = malloc(sizeof(struct tcp_log_dev_log_queue), M_TCPLOGDEV, M_WAITOK); TCPID_BUCKET_LOCK(tlb); } /* Fill in the common parts of the queue entry. */ entry->tldl_common.tldq_buf = NULL; entry->tldl_common.tldq_xform = tcp_log_expandlogbuf; entry->tldl_common.tldq_dtor = tcp_log_free_queue; /* Fill in the unique parts of the queue entry. */ strlcpy(entry->tldl_id, tlb->tlb_id, TCP_LOG_ID_LEN); if (reason != NULL) strlcpy(entry->tldl_reason, reason, TCP_LOG_REASON_LEN); else strlcpy(entry->tldl_reason, "UNKNOWN", TCP_LOG_REASON_LEN); entry->tldl_ie = tln->tln_ie; entry->tldl_entries = tln->tln_entries; entry->tldl_count = tln->tln_count; entry->tldl_af = tln->tln_af; /* Add the entry. If no one is listening, free the entry. */ if (tcp_log_dev_add_log((struct tcp_log_dev_queue *)entry)) tcp_log_free_queue((struct tcp_log_dev_queue *)entry); return (0); } /* * Queue the log buffers for all sessions in a bucket for transmission via * the log buffer facility. * * NOTE: This should be called with a locked bucket; however, the function * will drop the lock. */ #define LOCAL_SAVE 10 static void tcp_log_dumpbucketlogs(struct tcp_log_id_bucket *tlb, char *reason) { struct tcp_log_id_node local_entries[LOCAL_SAVE]; struct inpcb *inp; struct tcpcb *tp; struct tcp_log_id_node *cur_tln, *prev_tln, *tmp_tln; int i, num_local_entries, tree_locked; bool expireq_locked; TCPID_BUCKET_LOCK_ASSERT(tlb); /* * Take a reference on the bucket to keep it from disappearing until * we are done. */ TCPID_BUCKET_REF(tlb); /* * We'll try to create these without dropping locks. However, we * might very well need to drop locks to get memory. If that's the * case, we'll save up to 10 on the stack, and sacrifice the rest. * (Otherwise, we need to worry about finding our place again in a * potentially changed list. It just doesn't seem worth the trouble * to do that.) */ expireq_locked = false; num_local_entries = 0; prev_tln = NULL; tree_locked = TREE_UNLOCKED; SLIST_FOREACH_SAFE(cur_tln, &tlb->tlb_head, tln_list, tmp_tln) { /* * If this isn't associated with a TCPCB, we can pull it off * the list now. We need to be careful that the expire timer * hasn't already taken ownership (tln_expiretime == SBT_MAX). * If so, we let the expire timer code free the data. */ if (cur_tln->tln_closed) { no_inp: /* * Get the expireq lock so we can get a consistent * read of tln_expiretime and so we can remove this * from the expireq. */ if (!expireq_locked) { TCPLOG_EXPIREQ_LOCK(); expireq_locked = true; } /* * We ignore entries with tln_expiretime == SBT_MAX. * The expire timer code already owns those.
KASSERT(cur_tln->tln_expiretime > (sbintime_t) 0, ("%s:%d: node on the expire queue without positive " "expire time", __func__, __LINE__)); if (cur_tln->tln_expiretime == SBT_MAX) { prev_tln = cur_tln; continue; } /* Remove the entry from the expireq. */ STAILQ_REMOVE(&tcp_log_expireq_head, cur_tln, tcp_log_id_node, tln_expireq); /* Remove the entry from the bucket. */ if (prev_tln != NULL) SLIST_REMOVE_AFTER(prev_tln, tln_list); else SLIST_REMOVE_HEAD(&tlb->tlb_head, tln_list); /* * Drop the INP and bucket reference counts. Due to * lock-ordering rules, we need to drop the expire * queue lock. */ TCPLOG_EXPIREQ_UNLOCK(); expireq_locked = false; /* Drop the INP reference. */ INP_WLOCK(cur_tln->tln_inp); if (!in_pcbrele_wlocked(cur_tln->tln_inp)) INP_WUNLOCK(cur_tln->tln_inp); if (tcp_log_unref_bucket(tlb, &tree_locked, NULL)) { #ifdef INVARIANTS panic("%s: Bucket refcount unexpectedly 0.", __func__); #endif /* * Recover as best we can: free the entry we * own. */ tcp_log_free_entries(&cur_tln->tln_entries, &cur_tln->tln_count); uma_zfree(tcp_log_node_zone, cur_tln); goto done; } if (tcp_log_dump_node_logbuf(cur_tln, reason, M_NOWAIT)) { /* * If we have space, save the entries locally. * Otherwise, free them. */ if (num_local_entries < LOCAL_SAVE) { local_entries[num_local_entries] = *cur_tln; num_local_entries++; } else { tcp_log_free_entries( &cur_tln->tln_entries, &cur_tln->tln_count); } } /* No matter what, we are done with the node now. */ uma_zfree(tcp_log_node_zone, cur_tln); /* * Because we removed this entry from the list, prev_tln * (which tracks the previous entry still on the tlb * list) remains unchanged. */ continue; } /* * If we get to this point, the session data is still held in * the TCPCB. So, we need to pull the data out of that. * * We will need to drop the expireq lock so we can lock the INP. * We can then try to extract the data the "easy" way. If that * fails, we'll save the log entries for later. */ if (expireq_locked) { TCPLOG_EXPIREQ_UNLOCK(); expireq_locked = false; } /* Lock the INP and then re-check the state. */ inp = cur_tln->tln_inp; INP_WLOCK(inp); /* * If we caught this while it was transitioning, the data * might have moved from the TCPCB to the tln (signified by * setting tln_closed to true). If so, treat this like an * inactive connection. */ if (cur_tln->tln_closed) { /* * It looks like we may have caught this connection * while it was transitioning from active to inactive. * Treat this like an inactive connection. */ INP_WUNLOCK(inp); goto no_inp; } /* * Try to dump the data from the tp without dropping the lock. * If this fails, try to save off the data locally. */ tp = cur_tln->tln_tp; if (tcp_log_dump_tp_logbuf(tp, reason, M_NOWAIT, true) && num_local_entries < LOCAL_SAVE) { tcp_log_move_tp_to_node(tp, &local_entries[num_local_entries]); local_entries[num_local_entries].tln_closed = 1; KASSERT(local_entries[num_local_entries].tln_bucket == tlb, ("%s: %d: bucket mismatch for node %p", __func__, __LINE__, cur_tln)); num_local_entries++; } INP_WUNLOCK(inp); /* * We are going to leave the current tln on the list. It will * become the previous tln. */ prev_tln = cur_tln; } /* Drop our locks, if any.
*/ KASSERT(tree_locked == TREE_UNLOCKED, ("%s: %d: tree unexpectedly locked", __func__, __LINE__)); switch (tree_locked) { case TREE_WLOCKED: TCPID_TREE_WUNLOCK(); tree_locked = TREE_UNLOCKED; break; case TREE_RLOCKED: TCPID_TREE_RUNLOCK(); tree_locked = TREE_UNLOCKED; break; } if (expireq_locked) { TCPLOG_EXPIREQ_UNLOCK(); expireq_locked = false; } /* * Try again for any saved entries. tcp_log_dump_node_logbuf() is * guaranteed to free the log entries within the node. And, since * the node itself is on our stack, we don't need to free it. */ for (i = 0; i < num_local_entries; i++) tcp_log_dump_node_logbuf(&local_entries[i], reason, M_WAITOK); /* Drop our reference. */ if (!tcp_log_unref_bucket(tlb, &tree_locked, NULL)) TCPID_BUCKET_UNLOCK(tlb); done: /* Drop our locks, if any. */ switch (tree_locked) { case TREE_WLOCKED: TCPID_TREE_WUNLOCK(); break; case TREE_RLOCKED: TCPID_TREE_RUNLOCK(); break; } if (expireq_locked) TCPLOG_EXPIREQ_UNLOCK(); } #undef LOCAL_SAVE /* * Queue the log buffers for all sessions in a bucket for transmissions via * the log buffer facility. * * NOTE: This should be called with a locked INP; however, the function * will drop the lock. */ void tcp_log_dump_tp_bucket_logbufs(struct tcpcb *tp, char *reason) { struct tcp_log_id_bucket *tlb; int tree_locked; /* Figure out our bucket and lock it. */ INP_WLOCK_ASSERT(tp->t_inpcb); tlb = tp->t_lib; if (tlb == NULL) { /* * No bucket; treat this like a request to dump a single * session's traces. */ (void)tcp_log_dump_tp_logbuf(tp, reason, M_WAITOK, true); INP_WUNLOCK(tp->t_inpcb); return; } TCPID_BUCKET_REF(tlb); INP_WUNLOCK(tp->t_inpcb); TCPID_BUCKET_LOCK(tlb); /* If we are the last reference, we have nothing more to do here. */ tree_locked = TREE_UNLOCKED; if (tcp_log_unref_bucket(tlb, &tree_locked, NULL)) { switch (tree_locked) { case TREE_WLOCKED: TCPID_TREE_WUNLOCK(); break; case TREE_RLOCKED: TCPID_TREE_RUNLOCK(); break; } return; } /* Turn this over to tcp_log_dumpbucketlogs() to finish the work. */ tcp_log_dumpbucketlogs(tlb, reason); } /* * Mark the end of a flow with the current stack. A stack can add * stack-specific info to this trace event by overriding this * function (see bbr_log_flowend() for example). */ void tcp_log_flowend(struct tcpcb *tp) { if (tp->t_logstate != TCP_LOG_STATE_OFF) { struct socket *so = tp->t_inpcb->inp_socket; TCP_LOG_EVENT(tp, NULL, &so->so_rcv, &so->so_snd, TCP_LOG_FLOWEND, 0, 0, NULL, false); } } Index: head/sys/netinet/tcp_log_buf.h =================================================================== --- head/sys/netinet/tcp_log_buf.h (revision 343754) +++ head/sys/netinet/tcp_log_buf.h (revision 343755) @@ -1,370 +1,369 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * - * Copyright (c) 2016-2018 - * Netflix Inc. All rights reserved. + * Copyright (c) 2016-2018 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef __tcp_log_buf_h__ #define __tcp_log_buf_h__ #define TCP_LOG_REASON_LEN 32 #define TCP_LOG_BUF_VER (6) /* * Because the (struct tcp_log_buffer) includes 8-byte uint64_t's, it requires * 8-byte alignment to work properly on all platforms. Therefore, we will * enforce 8-byte alignment for all the structures that may appear by * themselves (instead of being embedded in another structure) in a data * stream. */ #define ALIGN_TCP_LOG __aligned(8) /* Information about the socketbuffer state. */ struct tcp_log_sockbuf { uint32_t tls_sb_acc; /* available chars (sb->sb_acc) */ uint32_t tls_sb_ccc; /* claimed chars (sb->sb_ccc) */ uint32_t tls_sb_spare; /* spare */ }; /* Optional, verbose information that may be appended to an event log. */ struct tcp_log_verbose { #define TCP_FUNC_LEN 32 char tlv_snd_frm[TCP_FUNC_LEN]; /* tcp_output() caller */ char tlv_trace_func[TCP_FUNC_LEN]; /* Function that generated trace */ uint32_t tlv_trace_line; /* Line number that generated trace */ uint8_t _pad[4]; } ALIGN_TCP_LOG; /* Internal RACK state variables. */ struct tcp_log_rack { uint32_t tlr_rack_rtt; /* rc_rack_rtt */ uint8_t tlr_state; /* Internal RACK state */ uint8_t _pad[3]; /* Padding */ }; struct tcp_log_bbr { uint64_t cur_del_rate; uint64_t delRate; uint64_t rttProp; uint64_t bw_inuse; uint32_t inflight; uint32_t applimited; uint32_t delivered; uint32_t timeStamp; uint32_t epoch; uint32_t lt_epoch; uint32_t pkts_out; uint32_t flex1; uint32_t flex2; uint32_t flex3; uint32_t flex4; uint32_t flex5; uint32_t flex6; uint32_t lost; uint16_t pacing_gain; uint16_t cwnd_gain; uint16_t flex7; uint8_t bbr_state; uint8_t bbr_substate; uint8_t inhpts; uint8_t ininput; uint8_t use_lt_bw; uint8_t flex8; uint32_t pkt_epoch; }; /* Per-stack stack-specific info. 
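 * The union below carries either RACK (u_rack) or BBR (u_bbr) state
 * for a record; readers can tell it is present when
 * TLB_FLAG_STACKINFO is set in tlb_eventflags.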
*/ union tcp_log_stackspecific { struct tcp_log_rack u_rack; struct tcp_log_bbr u_bbr; }; struct tcp_log_buffer { /* Event basics */ struct timeval tlb_tv; /* Timestamp of trace */ uint32_t tlb_ticks; /* Timestamp of trace */ uint32_t tlb_sn; /* Serial number */ uint8_t tlb_stackid; /* Stack ID */ uint8_t tlb_eventid; /* Event ID */ uint16_t tlb_eventflags; /* Flags for the record */ #define TLB_FLAG_RXBUF 0x0001 /* Includes receive buffer info */ #define TLB_FLAG_TXBUF 0x0002 /* Includes send buffer info */ #define TLB_FLAG_HDR 0x0004 /* Includes a TCP header */ #define TLB_FLAG_VERBOSE 0x0008 /* Includes function/line numbers */ #define TLB_FLAG_STACKINFO 0x0010 /* Includes stack-specific info */ int tlb_errno; /* Event error (if any) */ /* Internal session state */ struct tcp_log_sockbuf tlb_rxbuf; /* Receive buffer */ struct tcp_log_sockbuf tlb_txbuf; /* Send buffer */ int tlb_state; /* TCPCB t_state */ uint32_t tlb_starttime; /* TCPCB t_starttime */ uint32_t tlb_iss; /* TCPCB iss */ uint32_t tlb_flags; /* TCPCB flags */ uint32_t tlb_snd_una; /* TCPCB snd_una */ uint32_t tlb_snd_max; /* TCPCB snd_max */ uint32_t tlb_snd_cwnd; /* TCPCB snd_cwnd */ uint32_t tlb_snd_nxt; /* TCPCB snd_nxt */ uint32_t tlb_snd_recover;/* TCPCB snd_recover */ uint32_t tlb_snd_wnd; /* TCPCB snd_wnd */ uint32_t tlb_snd_ssthresh; /* TCPCB snd_ssthresh */ uint32_t tlb_srtt; /* TCPCB t_srtt */ uint32_t tlb_rttvar; /* TCPCB t_rttvar */ uint32_t tlb_rcv_up; /* TCPCB rcv_up */ uint32_t tlb_rcv_adv; /* TCPCB rcv_adv */ uint32_t tlb_rcv_nxt; /* TCPCB rcv_nxt */ tcp_seq tlb_sack_newdata; /* TCPCB sack_newdata */ uint32_t tlb_rcv_wnd; /* TCPCB rcv_wnd */ uint32_t tlb_dupacks; /* TCPCB t_dupacks */ int tlb_segqlen; /* TCPCB segqlen */ int tlb_snd_numholes; /* TCPCB snd_numholes */ uint32_t tlb_flex1; /* Event specific information */ uint32_t tlb_flex2; /* Event specific information */ uint8_t tlb_snd_scale:4, /* TCPCB snd_scale */ tlb_rcv_scale:4; /* TCPCB rcv_scale */ uint8_t _pad[3]; /* Padding */ /* Per-stack info */ union tcp_log_stackspecific tlb_stackinfo; #define tlb_rack tlb_stackinfo.u_rack /* The packet */ uint32_t tlb_len; /* The packet's data length */ struct tcphdr tlb_th; /* The TCP header */ uint8_t tlb_opts[TCP_MAXOLEN]; /* The TCP options */ /* Verbose information (optional) */ struct tcp_log_verbose tlb_verbose[0]; } ALIGN_TCP_LOG; enum tcp_log_events { TCP_LOG_IN = 1, /* Incoming packet 1 */ TCP_LOG_OUT, /* Transmit (without other event) 2 */ TCP_LOG_RTO, /* Retransmit timeout 3 */ TCP_LOG_TF_ACK, /* Transmit due to TF_ACK 4 */ TCP_LOG_BAD_RETRAN, /* Detected bad retransmission 5 */ TCP_LOG_PRR, /* Doing PRR 6 */ TCP_LOG_REORDER, /* Detected reorder 7 */ TCP_LOG_PACER, /* Pacer sending a packet 8 */ BBR_LOG_BBRUPD, /* We updated BBR info 9 */ BBR_LOG_BBRSND, /* We did a slot calculation and sending is done 10 */ BBR_LOG_ACKCLEAR, /* An ack clears all outstanding 11 */ BBR_LOG_INQUEUE, /* The tcb had a packet input to it 12 */ BBR_LOG_TIMERSTAR, /* Start a timer 13 */ BBR_LOG_TIMERCANC, /* Cancel a timer 14 */ BBR_LOG_ENTREC, /* Entered recovery 15 */ BBR_LOG_EXITREC, /* Exited recovery 16 */ BBR_LOG_CWND, /* Cwnd change 17 */ BBR_LOG_BWSAMP, /* LT B/W sample has been made 18 */ BBR_LOG_MSGSIZE, /* We received an EMSGSIZE error 19 */ BBR_LOG_BBRRTT, /* BBR RTT is updated 20 */ BBR_LOG_JUSTRET, /* We just returned out of output 21 */ BBR_LOG_STATE, /* A BBR state change occurred 22 */ BBR_LOG_PKT_EPOCH, /* A BBR packet epoch occurred 23 */ BBR_LOG_PERSIST, /* BBR changed to/from persists 24 */
TCP_LOG_FLOWEND, /* End of a flow 25 */ BBR_LOG_RTO, /* BBR's timeout includes BBR info 26 */ BBR_LOG_DOSEG_DONE, /* pacer do_segment completes 27 */ BBR_LOG_EXIT_GAIN, /* pacer do_segment completes 28 */ BBR_LOG_THRESH_CALC, /* Doing threshold calculation 29 */ BBR_LOG_EXTRACWNDGAIN, /* Removed 30 */ TCP_LOG_USERSEND, /* User level sends data 31 */ UNUSED_32, /* Unused 32 */ UNUSED_33, /* Unused 33 */ BBR_LOG_TIME_EPOCH, /* A time-based epoch occurred 34 */ BBR_LOG_TO_PROCESS, /* A timeout (to) was processed 35 */ BBR_LOG_BBRTSO, /* TSO update 36 */ BBR_LOG_PACERDIAG, /* Pacer diag insert 37 */ BBR_LOG_LOWGAIN, /* Low gain accounting 38 */ BBR_LOG_PROGRESS, /* Progress timer event 39 */ TCP_LOG_SOCKET_OPT, /* A socket option is set 40 */ BBR_LOG_TIMERPREP, /* A BBR var to debug out TLP issues 41 */ BBR_LOG_ENOBUF_JMP, /* We had an enobuf jump 42 */ BBR_LOG_PACING_CALC, /* calc the pacing time 43 */ BBR_LOG_RTT_SHRINKS, /* We had a log reduction of rttProp 44 */ BBR_LOG_BW_RED_EV, /* B/W reduction events 45 */ BBR_LOG_REDUCE, /* old bbr log reduce for 4.1 and earlier 46 */ TCP_LOG_RTT, /* An rtt (in useconds) is being sampled and applied to the srtt algo 47 */ BBR_LOG_SETTINGS_CHG, /* Settings changed for loss response 48 */ BBR_LOG_SRTT_GAIN_EVENT, /* SRTT gaining 49 */ TCP_LOG_REASS, /* Reassembly buffer logging 50 */ TCP_LOG_END /* End (keep at end) 51 */ }; enum tcp_log_states { TCP_LOG_STATE_CLEAR = -1, /* Deactivate and clear tracing */ TCP_LOG_STATE_OFF = 0, /* Pause */ TCP_LOG_STATE_TAIL=1, /* Keep the trailing events */ TCP_LOG_STATE_HEAD=2, /* Keep the leading events */ TCP_LOG_STATE_HEAD_AUTO=3, /* Keep the leading events, and automatically dump them to the device */ TCP_LOG_STATE_CONTINUAL=4, /* Continually dump the data when full */ TCP_LOG_STATE_TAIL_AUTO=5, /* Keep the trailing events, and automatically dump them when the session ends */ }; /* Use this if we don't know whether the operation succeeded. */ #define ERRNO_UNK (-1) /* * If the user included dev/tcp_log/tcp_log_dev.h, then include our private * headers. Otherwise, there is no reason to pollute all the files with an * additional include. * * This structure is aligned to an 8-byte boundary to match the alignment * requirements of (struct tcp_log_buffer). */ #ifdef __tcp_log_dev_h__ struct tcp_log_header { struct tcp_log_common_header tlh_common; #define tlh_version tlh_common.tlch_version #define tlh_type tlh_common.tlch_type #define tlh_length tlh_common.tlch_length struct in_endpoints tlh_ie; struct timeval tlh_offset; /* Uptime -> UTC offset */ char tlh_id[TCP_LOG_ID_LEN]; char tlh_reason[TCP_LOG_REASON_LEN]; uint8_t tlh_af; uint8_t _pad[7]; } ALIGN_TCP_LOG; #ifdef _KERNEL struct tcp_log_dev_log_queue { struct tcp_log_dev_queue tldl_common; char tldl_id[TCP_LOG_ID_LEN]; char tldl_reason[TCP_LOG_REASON_LEN]; struct in_endpoints tldl_ie; struct tcp_log_stailq tldl_entries; int tldl_count; uint8_t tldl_af; }; #endif /* _KERNEL */ #endif /* __tcp_log_dev_h__ */ #ifdef _KERNEL #define TCP_LOG_BUF_DEFAULT_SESSION_LIMIT 10000 #define TCP_LOG_BUF_DEFAULT_GLOBAL_LIMIT 1000000 /* * TCP_LOG_EVENT_VERBOSE: The same as TCP_LOG_EVENT, except it always * tries to record verbose information.
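 *
 * Illustrative use only (hypothetical local variables; this example is
 * not part of the original header, but mirrors the RACK logging
 * helpers in tcp_stacks/rack.c):
 *
 *	union tcp_log_stackspecific log;
 *	struct timeval tv;
 *
 *	memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 *	log.u_bbr.flex1 = slot;
 *	log.u_bbr.timeStamp = tcp_get_usecs(&tv);
 *	TCP_LOG_EVENT_VERBOSE(tp, NULL, &so->so_rcv, &so->so_snd,
 *	    BBR_LOG_BBRSND, 0, 0, &log, false, &tv);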
*/ #define TCP_LOG_EVENT_VERBOSE(tp, th, rxbuf, txbuf, eventid, errornum, len, stackinfo, th_hostorder, tv) \ do { \ if (tp->t_logstate != TCP_LOG_STATE_OFF) \ tcp_log_event_(tp, th, rxbuf, txbuf, eventid, \ errornum, len, stackinfo, th_hostorder, \ tp->t_output_caller, __func__, __LINE__, tv); \ } while (0) /* * TCP_LOG_EVENT: This is a macro so we can capture function/line * information when needed. * * Prototype: * TCP_LOG_EVENT(struct tcpcb *tp, struct tcphdr *th, struct sockbuf *rxbuf, * struct sockbuf *txbuf, uint8_t eventid, int errornum, * union tcp_log_stackspecific *stackinfo) * * tp is mandatory and must be write locked. * th is optional; if present, it will appear in the record. * rxbuf and txbuf are optional; if present, they will appear in the record. * eventid is mandatory. * errornum is mandatory (it indicates the success or failure of the * operation associated with the event). * len indicates the length of the packet. If no packet, use 0. * stackinfo is optional; if present, it will appear in the record. */ #ifdef TCP_LOG_FORCEVERBOSE #define TCP_LOG_EVENT TCP_LOG_EVENT_VERBOSE #else #define TCP_LOG_EVENT(tp, th, rxbuf, txbuf, eventid, errornum, len, stackinfo, th_hostorder) \ do { \ if (tcp_log_verbose) \ TCP_LOG_EVENT_VERBOSE(tp, th, rxbuf, txbuf, \ eventid, errornum, len, stackinfo, \ th_hostorder, NULL); \ else if (tp->t_logstate != TCP_LOG_STATE_OFF) \ tcp_log_event_(tp, th, rxbuf, txbuf, eventid, \ errornum, len, stackinfo, th_hostorder, \ NULL, NULL, 0, NULL); \ } while (0) #endif /* TCP_LOG_FORCEVERBOSE */ #define TCP_LOG_EVENTP(tp, th, rxbuf, txbuf, eventid, errornum, len, stackinfo, th_hostorder, tv) \ do { \ if (tp->t_logstate != TCP_LOG_STATE_OFF) \ tcp_log_event_(tp, th, rxbuf, txbuf, eventid, \ errornum, len, stackinfo, th_hostorder, \ NULL, NULL, 0, tv); \ } while (0) #ifdef TCP_BLACKBOX extern bool tcp_log_verbose; void tcp_log_drain(struct tcpcb *tp); int tcp_log_dump_tp_logbuf(struct tcpcb *tp, char *reason, int how, bool force); void tcp_log_dump_tp_bucket_logbufs(struct tcpcb *tp, char *reason); struct tcp_log_buffer *tcp_log_event_(struct tcpcb *tp, struct tcphdr *th, struct sockbuf *rxbuf, struct sockbuf *txbuf, uint8_t eventid, int errornum, uint32_t len, union tcp_log_stackspecific *stackinfo, int th_hostorder, const char *output_caller, const char *func, int line, const struct timeval *tv); size_t tcp_log_get_id(struct tcpcb *tp, char *buf); u_int tcp_log_get_id_cnt(struct tcpcb *tp); int tcp_log_getlogbuf(struct sockopt *sopt, struct tcpcb *tp); void tcp_log_init(void); int tcp_log_set_id(struct tcpcb *tp, char *id); int tcp_log_state_change(struct tcpcb *tp, int state); void tcp_log_tcpcbinit(struct tcpcb *tp); void tcp_log_tcpcbfini(struct tcpcb *tp); void tcp_log_flowend(struct tcpcb *tp); #else /* !TCP_BLACKBOX */ #define tcp_log_verbose (false) static inline struct tcp_log_buffer * tcp_log_event_(struct tcpcb *tp, struct tcphdr *th, struct sockbuf *rxbuf, struct sockbuf *txbuf, uint8_t eventid, int errornum, uint32_t len, union tcp_log_stackspecific *stackinfo, int th_hostorder, const char *output_caller, const char *func, int line, const struct timeval *tv) { return (NULL); } #endif /* TCP_BLACKBOX */ #endif /* _KERNEL */ #endif /* __tcp_log_buf_h__ */ Index: head/sys/netinet/tcp_stacks/rack.c =================================================================== --- head/sys/netinet/tcp_stacks/rack.c (revision 343754) +++ head/sys/netinet/tcp_stacks/rack.c (revision 343755) @@ -1,9157 +1,9156 @@ /*- - * Copyright (c) 2016-2018 - * Netflix 
Inc. All rights reserved. + * Copyright (c) 2016-2018 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_tcpdebug.h" #include #include #include #ifdef TCP_HHOOK #include #endif #include #include #include #include #include #include /* for proc0 declaration */ #include #include #include #include #ifdef NETFLIX_STATS #include #endif #include #include #include #include #include #include #include #include #define TCPSTATES /* for logging */ #include #include #include #include #include /* required for icmp_var.h */ #include /* for ICMP_BANDLIM */ #include #include #include #include #include #define TCPOUTFLAGS #include #include #include #include #include #include #include #include #ifdef NETFLIX_CWV #include #endif #include #ifdef TCPDEBUG #include #endif /* TCPDEBUG */ #ifdef TCP_OFFLOAD #include #endif #ifdef INET6 #include #endif #include #if defined(IPSEC) || defined(IPSEC_SUPPORT) #include #include #endif /* IPSEC */ #include #include #include #ifdef MAC #include #endif #include "sack_filter.h" #include "tcp_rack.h" #include "rack_bbr_common.h" uma_zone_t rack_zone; uma_zone_t rack_pcb_zone; #ifndef TICKS2SBT #define TICKS2SBT(__t) (tick_sbt * ((sbintime_t)(__t))) #endif struct sysctl_ctx_list rack_sysctl_ctx; struct sysctl_oid *rack_sysctl_root; #define CUM_ACKED 1 #define SACKED 2 /* * The RACK module incorporates a number of * TCP ideas that have been put out into the IETF * over the last few years: * - Matt Mathis's Rate Halving which slowly drops * the congestion window so that the ack clock can * be maintained during a recovery. * - Yuchung Cheng's RACK TCP (for which it is named) that * will stop us using the number of dup acks and instead * use time as the gauge of when we retransmit. * - Reorder Detection of RFC4737 and the Tail-Loss probe draft * of Dukkipati et al. * RACK depends on SACK, so if an endpoint arrives that * cannot do SACK the state machine below will shuttle the * connection back to using the "default" TCP stack that is * in FreeBSD. * * To implement RACK the original TCP stack was first decomposed * into a functional state machine with individual states * for each of the possible TCP connection states.
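 * (For example, rack_do_established(), rack_do_syn_sent(),
 * rack_do_fin_wait_1() and friends, whose prototypes appear below,
 * each implement one of those states.)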
The do_segment * function's role in life is to mandate that the connection supports SACK * initially and then assure that the RACK state matches the connection * state before calling the state's do_segment function. Each * state is simplified due to the fact that the original do_segment * has been decomposed and we *know* what state we are in (no * switches on the state) and all tests for SACK are gone. This * greatly simplifies what each state does. * * TCP output is also overwritten with a new version since it * must maintain the new rack scoreboard. * */ static int32_t rack_precache = 1; static int32_t rack_tlp_thresh = 1; static int32_t rack_reorder_thresh = 2; static int32_t rack_reorder_fade = 60000; /* 0 - never fade, def 60,000 * - 60 seconds */ static int32_t rack_pkt_delay = 1; static int32_t rack_inc_var = 0; /* For TLP */ static int32_t rack_reduce_largest_on_idle = 0; static int32_t rack_min_pace_time = 0; static int32_t rack_min_pace_time_seg_req=6; static int32_t rack_early_recovery = 1; static int32_t rack_early_recovery_max_seg = 6; static int32_t rack_send_a_lot_in_prr = 1; static int32_t rack_min_to = 1; /* Number of ms minimum timeout */ static int32_t rack_tlp_in_recovery = 1; /* Can we do TLP in recovery? */ static int32_t rack_verbose_logging = 0; static int32_t rack_ignore_data_after_close = 1; /* * Currently regular tcp has a rto_min of 30ms * the backoff goes 12 times so that ends up * being a total of 122.850 seconds before a * connection is killed. */ static int32_t rack_tlp_min = 10; static int32_t rack_rto_min = 30; /* 30ms same as main freebsd */ static int32_t rack_rto_max = 30000; /* 30 seconds */ static const int32_t rack_free_cache = 2; static int32_t rack_hptsi_segments = 40; static int32_t rack_rate_sample_method = USE_RTT_LOW; static int32_t rack_pace_every_seg = 1; static int32_t rack_delayed_ack_time = 200; /* 200ms */ static int32_t rack_slot_reduction = 4; static int32_t rack_lower_cwnd_at_tlp = 0; static int32_t rack_use_proportional_reduce = 0; static int32_t rack_proportional_rate = 10; static int32_t rack_tlp_max_resend = 2; static int32_t rack_limited_retran = 0; static int32_t rack_always_send_oldest = 0; static int32_t rack_sack_block_limit = 128; static int32_t rack_use_sack_filter = 1; static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE; /* Rack specific counters */ counter_u64_t rack_badfr; counter_u64_t rack_badfr_bytes; counter_u64_t rack_rtm_prr_retran; counter_u64_t rack_rtm_prr_newdata; counter_u64_t rack_timestamp_mismatch; counter_u64_t rack_reorder_seen; counter_u64_t rack_paced_segments; counter_u64_t rack_unpaced_segments; counter_u64_t rack_saw_enobuf; counter_u64_t rack_saw_enetunreach; /* Tail loss probe counters */ counter_u64_t rack_tlp_tot; counter_u64_t rack_tlp_newdata; counter_u64_t rack_tlp_retran; counter_u64_t rack_tlp_retran_bytes; counter_u64_t rack_tlp_retran_fail; counter_u64_t rack_to_tot; counter_u64_t rack_to_arm_rack; counter_u64_t rack_to_arm_tlp; counter_u64_t rack_to_alloc; counter_u64_t rack_to_alloc_hard; counter_u64_t rack_to_alloc_emerg; counter_u64_t rack_sack_proc_all; counter_u64_t rack_sack_proc_short; counter_u64_t rack_sack_proc_restart; counter_u64_t rack_runt_sacks; counter_u64_t rack_used_tlpmethod; counter_u64_t rack_used_tlpmethod2; counter_u64_t rack_enter_tlp_calc; counter_u64_t rack_input_idle_reduces; counter_u64_t rack_tlp_does_nada; /* Temp CPU counters */ counter_u64_t rack_find_high; counter_u64_t rack_progress_drops; counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE]; counter_u64_t
rack_opts_arry[RACK_OPTS_SIZE]; static void rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line); static int rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val); static int rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); static void rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, uint16_t nsegs, uint16_t type, int32_t recovery); static struct rack_sendmap *rack_alloc(struct tcp_rack *rack); static struct rack_sendmap * rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused); static void rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type); static void rack_counter_destroy(void); static int rack_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp); static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how); static void rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos); static void rack_dtor(void *mem, int32_t size, void *arg); static void rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm, uint32_t t, uint32_t cts); static struct rack_sendmap * rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm); static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack); static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm); static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged); static int rack_get_sockopt(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack); static int32_t rack_handoff_ok(struct tcpcb *tp); static int32_t rack_init(struct tcpcb *tp); static void rack_init_sysctls(void); static void rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th); static void rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts, uint8_t pass, struct rack_sendmap *hintrsm); static void rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm); static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num); static int32_t rack_output(struct tcpcb *tp); static void rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos, int32_t nxt_pkt, struct timeval *tv); static uint32_t rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts); static void rack_post_recovery(struct tcpcb *tp, struct tcphdr *th); static void rack_remxt_tmr(struct tcpcb *tp); static int rack_set_sockopt(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack); static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack); static int32_t rack_stopall(struct tcpcb *tp); static void rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta); static int32_t rack_timer_active(struct tcpcb *tp, uint32_t timer_type); static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line); static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type); static uint32_t 
rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp); static void rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t ts); static int rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type); static int32_t tcp_addrack(module_t mod, int32_t type, void *data); static void rack_challenge_ack(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ret_val); static int rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); static int rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); static void rack_do_drop(struct mbuf *m, struct tcpcb *tp); static void rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val); static void rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t rstreason, int32_t tlen); static int rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); static int rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t nxt_pkt); static int rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); static int rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); static int rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); static int rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); static int rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); static int rack_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val); static int rack_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp); struct rack_sendmap * tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused); static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt); static void tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th); static int rack_ts_check(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t tlen, int32_t thflags, int32_t * ret_val); int32_t rack_clear_counter=0; static int sysctl_rack_clear(SYSCTL_HANDLER_ARGS) { uint32_t stat; int32_t error; error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t)); if (error || req->newptr == NULL) 
return error; error = SYSCTL_IN(req, &stat, sizeof(uint32_t)); if (error) return (error); if (stat == 1) { #ifdef INVARIANTS printf("Clearing RACK counters\n"); #endif counter_u64_zero(rack_badfr); counter_u64_zero(rack_badfr_bytes); counter_u64_zero(rack_rtm_prr_retran); counter_u64_zero(rack_rtm_prr_newdata); counter_u64_zero(rack_timestamp_mismatch); counter_u64_zero(rack_reorder_seen); counter_u64_zero(rack_tlp_tot); counter_u64_zero(rack_tlp_newdata); counter_u64_zero(rack_tlp_retran); counter_u64_zero(rack_tlp_retran_bytes); counter_u64_zero(rack_tlp_retran_fail); counter_u64_zero(rack_to_tot); counter_u64_zero(rack_to_arm_rack); counter_u64_zero(rack_to_arm_tlp); counter_u64_zero(rack_paced_segments); counter_u64_zero(rack_unpaced_segments); counter_u64_zero(rack_saw_enobuf); counter_u64_zero(rack_saw_enetunreach); counter_u64_zero(rack_to_alloc_hard); counter_u64_zero(rack_to_alloc_emerg); counter_u64_zero(rack_sack_proc_all); counter_u64_zero(rack_sack_proc_short); counter_u64_zero(rack_sack_proc_restart); counter_u64_zero(rack_to_alloc); counter_u64_zero(rack_find_high); counter_u64_zero(rack_runt_sacks); counter_u64_zero(rack_used_tlpmethod); counter_u64_zero(rack_used_tlpmethod2); counter_u64_zero(rack_enter_tlp_calc); counter_u64_zero(rack_progress_drops); counter_u64_zero(rack_tlp_does_nada); } rack_clear_counter = 0; return (0); } static void rack_init_sysctls() { SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "rate_sample_method", CTLFLAG_RW, &rack_rate_sample_method , USE_RTT_LOW, "What method should we use for rate sampling 0=high, 1=low "); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "data_after_close", CTLFLAG_RW, &rack_ignore_data_after_close, 0, "Do we hold off sending a RST until all pending data is ack'd"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "tlpmethod", CTLFLAG_RW, &rack_tlp_threshold_use, TLP_USE_TWO_ONE, "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "min_pace_time", CTLFLAG_RW, &rack_min_pace_time, 0, "Should we enforce a minimum pace time of 1ms"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "min_pace_segs", CTLFLAG_RW, &rack_min_pace_time_seg_req, 6, "How many segments have to be in the len to enforce min-pace-time"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "idle_reduce_high", CTLFLAG_RW, &rack_reduce_largest_on_idle, 0, "Should we reduce the largest cwnd seen to IW on idle reduction"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "bb_verbose", CTLFLAG_RW, &rack_verbose_logging, 0, "Should RACK black box logging be verbose"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "sackfiltering", CTLFLAG_RW, &rack_use_sack_filter, 1, "Do we use sack filtering?"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "delayed_ack", CTLFLAG_RW, &rack_delayed_ack_time, 200, "Delayed ack time (200ms)"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "tlpminto", CTLFLAG_RW, &rack_tlp_min, 10, "TLP minimum timeout per the specification (10ms)"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "precache", CTLFLAG_RW, &rack_precache, 0, "Where should we precache the mcopy (0 is not at all)"); SYSCTL_ADD_S32(&rack_sysctl_ctx, 
SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "sblklimit", CTLFLAG_RW, &rack_sack_block_limit, 128, "When do we start paying attention to small sack blocks"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "send_oldest", CTLFLAG_RW, &rack_always_send_oldest, 1, "Should we always send the oldest TLP and RACK-TLP"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "rack_tlp_in_recovery", CTLFLAG_RW, &rack_tlp_in_recovery, 1, "Can we do a TLP during recovery?"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "rack_tlimit", CTLFLAG_RW, &rack_limited_retran, 0, "How many times can a rack timeout drive out sends"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "minrto", CTLFLAG_RW, &rack_rto_min, 0, "Minimum RTO in ms -- set with caution below 1000 due to TLP"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "maxrto", CTLFLAG_RW, &rack_rto_max, 0, "Maximum RTO in ms -- should be at least as large as min_rto"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "tlp_retry", CTLFLAG_RW, &rack_tlp_max_resend, 2, "How many times does TLP retry a single segment or multiple with no ACK"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "recovery_loss_prop", CTLFLAG_RW, &rack_use_proportional_reduce, 0, "Should we proportionally reduce cwnd based on the number of losses"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "recovery_prop", CTLFLAG_RW, &rack_proportional_rate, 10, "What percent reduction per loss"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW, &rack_lower_cwnd_at_tlp, 0, "When a TLP completes a retran should we enter recovery?"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "hptsi_reduces", CTLFLAG_RW, &rack_slot_reduction, 4, "When setting a slot should we reduce by divisor"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "hptsi_every_seg", CTLFLAG_RW, &rack_pace_every_seg, 1, "Should we pace out every segment hptsi"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "hptsi_seg_max", CTLFLAG_RW, &rack_hptsi_segments, 6, "Should we pace out only a limited size of segments"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "prr_sendalot", CTLFLAG_RW, &rack_send_a_lot_in_prr, 1, "Send a lot in prr"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "minto", CTLFLAG_RW, &rack_min_to, 1, "Minimum rack timeout in milliseconds"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "earlyrecoveryseg", CTLFLAG_RW, &rack_early_recovery_max_seg, 6, "Max segments in early recovery"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "earlyrecovery", CTLFLAG_RW, &rack_early_recovery, 1, "Do we do early recovery with rack"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "reorder_thresh", CTLFLAG_RW, &rack_reorder_thresh, 2, "What factor for rack will be added when seeing reordering (shift right)"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW, &rack_tlp_thresh, 1, "What divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
OID_AUTO, "reorder_fade", CTLFLAG_RW, &rack_reorder_fade, 0, "Does reorder detection fade, if so how many ms (0 means never)"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "pktdelay", CTLFLAG_RW, &rack_pkt_delay, 1, "Extra RACK time (in ms) besides reordering thresh"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "inc_var", CTLFLAG_RW, &rack_inc_var, 0, "Should rack add to the TLP timer the variance in rtt calculation"); rack_badfr = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "badfr", CTLFLAG_RD, &rack_badfr, "Total number of bad FRs"); rack_badfr_bytes = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "badfr_bytes", CTLFLAG_RD, &rack_badfr_bytes, "Total number of bad FR bytes"); rack_rtm_prr_retran = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "prrsndret", CTLFLAG_RD, &rack_rtm_prr_retran, "Total number of prr based retransmits"); rack_rtm_prr_newdata = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "prrsndnew", CTLFLAG_RD, &rack_rtm_prr_newdata, "Total number of prr based new transmits"); rack_timestamp_mismatch = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "tsnf", CTLFLAG_RD, &rack_timestamp_mismatch, "Total number of times we could not find the reported timestamp"); rack_find_high = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "findhigh", CTLFLAG_RD, &rack_find_high, "Total number of FINs causing find-high"); rack_reorder_seen = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "reordering", CTLFLAG_RD, &rack_reorder_seen, "Total number of times we added delay due to reordering"); rack_tlp_tot = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "tlp_to_total", CTLFLAG_RD, &rack_tlp_tot, "Total number of tail loss probe expirations"); rack_tlp_newdata = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "tlp_new", CTLFLAG_RD, &rack_tlp_newdata, "Total number of tail loss probe sending new data"); rack_tlp_retran = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "tlp_retran", CTLFLAG_RD, &rack_tlp_retran, "Total number of tail loss probe sending retransmitted data"); rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD, &rack_tlp_retran_bytes, "Total bytes of tail loss probe sending retransmitted data"); rack_tlp_retran_fail = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "tlp_retran_fail", CTLFLAG_RD, &rack_tlp_retran_fail, "Total number of tail loss probe sending retransmitted data that failed (wait for t3)"); rack_to_tot = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "rack_to_tot", CTLFLAG_RD, &rack_to_tot, "Total number of times the rack timeout expired"); rack_to_arm_rack = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "arm_rack", CTLFLAG_RD, &rack_to_arm_rack, "Total number of times the rack timer was armed"); rack_to_arm_tlp = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "arm_tlp", CTLFLAG_RD, &rack_to_arm_tlp, "Total number of times the tlp timer was armed"); rack_paced_segments = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "paced", CTLFLAG_RD, &rack_paced_segments, "Total number of times a segment send caused hptsi"); rack_unpaced_segments = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "unpaced", CTLFLAG_RD, &rack_unpaced_segments, "Total number of times a segment did not cause hptsi"); rack_saw_enobuf = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "saw_enobufs", CTLFLAG_RD, &rack_saw_enobuf, "Total number of times we saw an ENOBUFS error"); rack_saw_enetunreach = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "saw_enetunreach", CTLFLAG_RD, &rack_saw_enetunreach, "Total number of times we saw an ENETUNREACH error"); rack_to_alloc = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "allocs", CTLFLAG_RD, &rack_to_alloc, "Total allocations of tracking structures"); rack_to_alloc_hard = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "allochard", CTLFLAG_RD, &rack_to_alloc_hard, "Total allocations done with sleeping the hard way"); rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "allocemerg", CTLFLAG_RD, &rack_to_alloc_emerg, "Total allocations done from the emergency cache"); rack_sack_proc_all = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "sack_long", CTLFLAG_RD, &rack_sack_proc_all, "Total times we had to walk whole list for sack processing"); rack_sack_proc_restart = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "sack_restart", CTLFLAG_RD, &rack_sack_proc_restart, "Total times we had to walk whole list due to a restart"); rack_sack_proc_short = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "sack_short", CTLFLAG_RD, &rack_sack_proc_short, "Total times we took shortcut for sack processing"); rack_enter_tlp_calc = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "tlp_calc_entered", CTLFLAG_RD, &rack_enter_tlp_calc, "Total times we called calc-tlp"); rack_used_tlpmethod = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "hit_tlp_method", CTLFLAG_RD, &rack_used_tlpmethod, "Total number of times we used the TLP method"); rack_used_tlpmethod2 = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "hit_tlp_method2", CTLFLAG_RD, &rack_used_tlpmethod2, "Total number of times we used TLP method 2"); rack_runt_sacks = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "runtsacks", CTLFLAG_RD, &rack_runt_sacks, "Total number of runt sacks"); rack_progress_drops = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "prog_drops", CTLFLAG_RD, &rack_progress_drops, "Total number of progress drops"); rack_input_idle_reduces = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD, &rack_input_idle_reduces, "Total number of idle reductions on input"); rack_tlp_does_nada = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "tlp_nada", CTLFLAG_RD, &rack_tlp_does_nada, "Total number of nada tlp calls"); COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK); SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "outsize", CTLFLAG_RD, rack_out_size, TCP_MSS_ACCT_SIZE, "MSS send sizes"); COUNTER_ARRAY_ALLOC(rack_opts_arry, RACK_OPTS_SIZE, M_WAITOK); SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "opts", CTLFLAG_RD, rack_opts_arry, RACK_OPTS_SIZE, "RACK Option Stats"); SYSCTL_ADD_PROC(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "clear", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters"); } static inline int32_t rack_progress_timeout_check(struct tcpcb *tp) { if (tp->t_maxunacktime && tp->t_acktime && TSTMP_GT(ticks, tp->t_acktime)) { if ((ticks - tp->t_acktime) >= tp->t_maxunacktime) { /* * There is an assumption that the caller * will drop the connection so we will * increment the counters here. 
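 *
 * A caller is expected to do something like the following on a
 * nonzero return (a sketch only; the in-tree callers differ in
 * detail):
 *
 *	if (rack_progress_timeout_check(tp)) {
 *		tp = tcp_drop(tp, ETIMEDOUT);
 *		...
 *	}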
*/ struct tcp_rack *rack; rack = (struct tcp_rack *)tp->t_fb_ptr; counter_u64_add(rack_progress_drops, 1); #ifdef NETFLIX_STATS TCPSTAT_INC(tcps_progdrops); #endif rack_log_progress_event(rack, tp, ticks, PROGRESS_DROP, __LINE__); return (1); } } return (0); } static void rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which) { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.flex1 = TICKS_2_MSEC(rack->rc_tp->t_srtt >> TCP_RTT_SHIFT); log.u_bbr.flex2 = to; log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags; log.u_bbr.flex4 = slot; log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot; log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; log.u_bbr.flex8 = which; log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; TCP_LOG_EVENT(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_TIMERSTAR, 0, 0, &log, false); } } static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num) { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; log.u_bbr.flex8 = to_num; log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt; log.u_bbr.flex2 = rack->rc_rack_rtt; TCP_LOG_EVENT(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_RTO, 0, 0, &log, false); } } static void rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, int32_t t, uint32_t o_srtt, uint32_t o_var) { if (tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; log.u_bbr.flex1 = t; log.u_bbr.flex2 = o_srtt; log.u_bbr.flex3 = o_var; log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest; log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest; log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_rtt_cnt; log.u_bbr.rttProp = rack->r_ctl.rack_rs.rs_rtt_tot; log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method; TCP_LOG_EVENT(tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_BBRRTT, 0, 0, &log, false); } } static void rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt) { /* * Log the rtt sample we are * applying to the srtt algorithm in * useconds. 
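 * The sample arrives here in milliseconds and is scaled to
 * microseconds (rtt * 1000) just before being recorded under the
 * TCP_LOG_RTT event.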
*/ if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; struct timeval tv; /* Convert our ms to a microsecond */ log.u_bbr.flex1 = rtt * 1000; log.u_bbr.timeStamp = tcp_get_usecs(&tv); TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, TCP_LOG_RTT, 0, 0, &log, false, &tv); } } static inline void rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line) { if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) { union tcp_log_stackspecific log; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; log.u_bbr.flex1 = line; log.u_bbr.flex2 = tick; log.u_bbr.flex3 = tp->t_maxunacktime; log.u_bbr.flex4 = tp->t_acktime; log.u_bbr.flex8 = event; TCP_LOG_EVENT(tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_PROGRESS, 0, 0, &log, false); } } static void rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts) { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; log.u_bbr.flex1 = slot; log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags); log.u_bbr.flex8 = rack->rc_in_persist; TCP_LOG_EVENT(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_BBRSND, 0, 0, &log, false); } } static void rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out) { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; log.u_bbr.flex1 = did_out; log.u_bbr.flex2 = nxt_pkt; log.u_bbr.flex3 = way_out; log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; log.u_bbr.flex7 = rack->r_wanted_output; log.u_bbr.flex8 = rack->rc_in_persist; TCP_LOG_EVENT(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_DOSEG_DONE, 0, 0, &log, false); } } static void rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot, uint8_t hpts_calling) { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; log.u_bbr.flex1 = slot; log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags; log.u_bbr.flex7 = hpts_calling; log.u_bbr.flex8 = rack->rc_in_persist; TCP_LOG_EVENT(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_JUSTRET, 0, tlen, &log, false); } } static void rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line) { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; log.u_bbr.flex1 = line; log.u_bbr.flex2 = 0; log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags; log.u_bbr.flex4 = 0; log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; log.u_bbr.flex8 = hpts_removed; TCP_LOG_EVENT(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_TIMERCANC, 0, 0, &log, false); } } static void rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t timers) { if (rack->rc_tp->t_logstate 
!= TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.flex1 = timers; log.u_bbr.flex2 = ret; log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp; log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; log.u_bbr.flex5 = cts; TCP_LOG_EVENT(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_TO_PROCESS, 0, 0, &log, false); } } static void rack_counter_destroy() { counter_u64_free(rack_badfr); counter_u64_free(rack_badfr_bytes); counter_u64_free(rack_rtm_prr_retran); counter_u64_free(rack_rtm_prr_newdata); counter_u64_free(rack_timestamp_mismatch); counter_u64_free(rack_reorder_seen); counter_u64_free(rack_tlp_tot); counter_u64_free(rack_tlp_newdata); counter_u64_free(rack_tlp_retran); counter_u64_free(rack_tlp_retran_bytes); counter_u64_free(rack_tlp_retran_fail); counter_u64_free(rack_to_tot); counter_u64_free(rack_to_arm_rack); counter_u64_free(rack_to_arm_tlp); counter_u64_free(rack_paced_segments); counter_u64_free(rack_unpaced_segments); counter_u64_free(rack_saw_enobuf); counter_u64_free(rack_saw_enetunreach); counter_u64_free(rack_to_alloc_hard); counter_u64_free(rack_to_alloc_emerg); counter_u64_free(rack_sack_proc_all); counter_u64_free(rack_sack_proc_short); counter_u64_free(rack_sack_proc_restart); counter_u64_free(rack_to_alloc); counter_u64_free(rack_find_high); counter_u64_free(rack_runt_sacks); counter_u64_free(rack_enter_tlp_calc); counter_u64_free(rack_used_tlpmethod); counter_u64_free(rack_used_tlpmethod2); counter_u64_free(rack_progress_drops); counter_u64_free(rack_input_idle_reduces); counter_u64_free(rack_tlp_does_nada); COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE); COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE); } static struct rack_sendmap * rack_alloc(struct tcp_rack *rack) { struct rack_sendmap *rsm; counter_u64_add(rack_to_alloc, 1); rack->r_ctl.rc_num_maps_alloced++; rsm = uma_zalloc(rack_zone, M_NOWAIT); if (rsm) { return (rsm); } if (rack->rc_free_cnt) { counter_u64_add(rack_to_alloc_emerg, 1); rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_next); rack->rc_free_cnt--; return (rsm); } return (NULL); } static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm) { rack->r_ctl.rc_num_maps_alloced--; if (rack->r_ctl.rc_tlpsend == rsm) rack->r_ctl.rc_tlpsend = NULL; if (rack->r_ctl.rc_next == rsm) rack->r_ctl.rc_next = NULL; if (rack->r_ctl.rc_sacklast == rsm) rack->r_ctl.rc_sacklast = NULL; if (rack->rc_free_cnt < rack_free_cache) { memset(rsm, 0, sizeof(struct rack_sendmap)); TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_next); rack->rc_free_cnt++; return; } uma_zfree(rack_zone, rsm); } /* * CC wrapper hook functions */ static void rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, uint16_t nsegs, uint16_t type, int32_t recovery) { #ifdef NETFLIX_STATS int32_t gput; #endif #ifdef NETFLIX_CWV u_long old_cwnd = tp->snd_cwnd; #endif INP_WLOCK_ASSERT(tp->t_inpcb); tp->ccv->nsegs = nsegs; tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th); if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) { uint32_t max; max = rack->r_ctl.rc_early_recovery_segs * tp->t_maxseg; if (tp->ccv->bytes_this_ack > max) { tp->ccv->bytes_this_ack = max; } } if (tp->snd_cwnd <= tp->snd_wnd) tp->ccv->flags |= CCF_CWND_LIMITED; else tp->ccv->flags &= ~CCF_CWND_LIMITED; if (type == CC_ACK) { #ifdef NETFLIX_STATS stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF, ((int32_t) tp->snd_cwnd) - tp->snd_wnd); if ((tp->t_flags & 
TF_GPUTINPROG) && SEQ_GEQ(th->th_ack, tp->gput_ack)) { gput = (((int64_t) (th->th_ack - tp->gput_seq)) << 3) / max(1, tcp_ts_getticks() - tp->gput_ts); stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT, gput); /* * XXXLAS: This is a temporary hack, and should be * chained off VOI_TCP_GPUT when stats(9) grows an * API to deal with chained VOIs. */ if (tp->t_stats_gput_prev > 0) stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_GPUT_ND, ((gput - tp->t_stats_gput_prev) * 100) / tp->t_stats_gput_prev); tp->t_flags &= ~TF_GPUTINPROG; tp->t_stats_gput_prev = gput; #ifdef NETFLIX_CWV if (tp->t_maxpeakrate) { /* * We update t_peakrate_thr. This gives us roughly * one update per round trip time. */ tcp_update_peakrate_thr(tp); } #endif } #endif if (tp->snd_cwnd > tp->snd_ssthresh) { tp->t_bytes_acked += min(tp->ccv->bytes_this_ack, nsegs * V_tcp_abc_l_var * tp->t_maxseg); if (tp->t_bytes_acked >= tp->snd_cwnd) { tp->t_bytes_acked -= tp->snd_cwnd; tp->ccv->flags |= CCF_ABC_SENTAWND; } } else { tp->ccv->flags &= ~CCF_ABC_SENTAWND; tp->t_bytes_acked = 0; } } if (CC_ALGO(tp)->ack_received != NULL) { /* XXXLAS: Find a way to live without this */ tp->ccv->curack = th->th_ack; CC_ALGO(tp)->ack_received(tp->ccv, type); } #ifdef NETFLIX_STATS stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, tp->snd_cwnd); #endif if (rack->r_ctl.rc_rack_largest_cwnd < tp->snd_cwnd) { rack->r_ctl.rc_rack_largest_cwnd = tp->snd_cwnd; } #ifdef NETFLIX_CWV if (tp->cwv_enabled) { /* * Per RFC 7661: The behaviour in the non-validated phase is * specified as: o A sender determines whether to increase * the cwnd based upon whether it is cwnd-limited (see * Section 4.5.3): * A sender that is cwnd-limited MAY use * the standard TCP method to increase cwnd (i.e., the * standard method permits a TCP sender that fully utilises * the cwnd to increase the cwnd each time it receives an * ACK). * A sender that is not cwnd-limited MUST NOT * increase the cwnd when ACK packets are received in this * phase (i.e., needs to avoid growing the cwnd when it has * not recently sent using the current size of cwnd). */ if ((tp->snd_cwnd > old_cwnd) && (tp->cwv_cwnd_valid == 0) && (!(tp->ccv->flags & CCF_CWND_LIMITED))) { tp->snd_cwnd = old_cwnd; } /* Try to update pipeAck and NCWV state */ if (TCPS_HAVEESTABLISHED(tp->t_state) && !IN_RECOVERY(tp->t_flags)) { uint32_t data = sbavail(&(tp->t_inpcb->inp_socket->so_snd)); tcp_newcwv_update_pipeack(tp, data); } } /* we enforce max peak rate if it is set. */ if (tp->t_peakrate_thr && tp->snd_cwnd > tp->t_peakrate_thr) { tp->snd_cwnd = tp->t_peakrate_thr; } #endif } static void tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th) { struct tcp_rack *rack; rack = (struct tcp_rack *)tp->t_fb_ptr; INP_WLOCK_ASSERT(tp->t_inpcb); if (rack->r_ctl.rc_prr_sndcnt > 0) rack->r_wanted_output++; } static void rack_post_recovery(struct tcpcb *tp, struct tcphdr *th) { struct tcp_rack *rack; INP_WLOCK_ASSERT(tp->t_inpcb); rack = (struct tcp_rack *)tp->t_fb_ptr; if (CC_ALGO(tp)->post_recovery != NULL) { tp->ccv->curack = th->th_ack; CC_ALGO(tp)->post_recovery(tp->ccv); } /* * Here we can in theory adjust cwnd to be based on the number of * losses in the window (rack->r_ctl.rc_loss_count). This is done * based on the rack_use_proportional flag. 
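 * For example (illustrative numbers): with rc_prop_rate = 10
 * (percent per loss) and rc_loss_count = 3, reduce = 30, so cwnd is
 * cut by 30%; the reduction is capped at 50% below.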
*/ if (rack->r_ctl.rc_prop_reduce && rack->r_ctl.rc_prop_rate) { int32_t reduce; reduce = (rack->r_ctl.rc_loss_count * rack->r_ctl.rc_prop_rate); if (reduce > 50) { reduce = 50; } tp->snd_cwnd -= ((reduce * tp->snd_cwnd) / 100); } else { if (tp->snd_cwnd > tp->snd_ssthresh) { /* Drop us down to the ssthresh (1/2 cwnd at loss) */ tp->snd_cwnd = tp->snd_ssthresh; } } if (rack->r_ctl.rc_prr_sndcnt > 0) { /* Suck the next prr cnt back into cwnd */ tp->snd_cwnd += rack->r_ctl.rc_prr_sndcnt; rack->r_ctl.rc_prr_sndcnt = 0; } EXIT_RECOVERY(tp->t_flags); #ifdef NETFLIX_CWV if (tp->cwv_enabled) { if ((tp->cwv_cwnd_valid == 0) && (tp->snd_cwv.in_recovery)) tcp_newcwv_end_recovery(tp); } #endif } static void rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) { struct tcp_rack *rack; INP_WLOCK_ASSERT(tp->t_inpcb); rack = (struct tcp_rack *)tp->t_fb_ptr; switch (type) { case CC_NDUPACK: /* rack->r_ctl.rc_ssthresh_set = 1;*/ if (!IN_FASTRECOVERY(tp->t_flags)) { rack->r_ctl.rc_tlp_rtx_out = 0; rack->r_ctl.rc_prr_delivered = 0; rack->r_ctl.rc_prr_out = 0; rack->r_ctl.rc_loss_count = 0; rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una; tp->snd_recover = tp->snd_max; if (tp->t_flags & TF_ECN_PERMIT) tp->t_flags |= TF_ECN_SND_CWR; } break; case CC_ECN: if (!IN_CONGRECOVERY(tp->t_flags)) { TCPSTAT_INC(tcps_ecn_rcwnd); tp->snd_recover = tp->snd_max; if (tp->t_flags & TF_ECN_PERMIT) tp->t_flags |= TF_ECN_SND_CWR; } break; case CC_RTO: tp->t_dupacks = 0; tp->t_bytes_acked = 0; EXIT_RECOVERY(tp->t_flags); tp->snd_ssthresh = max(2, min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg) * tp->t_maxseg; tp->snd_cwnd = tp->t_maxseg; break; case CC_RTO_ERR: TCPSTAT_INC(tcps_sndrexmitbad); /* RTO was unnecessary, so reset everything. */ tp->snd_cwnd = tp->snd_cwnd_prev; tp->snd_ssthresh = tp->snd_ssthresh_prev; tp->snd_recover = tp->snd_recover_prev; if (tp->t_flags & TF_WASFRECOVERY) ENTER_FASTRECOVERY(tp->t_flags); if (tp->t_flags & TF_WASCRECOVERY) ENTER_CONGRECOVERY(tp->t_flags); tp->snd_nxt = tp->snd_max; tp->t_badrxtwin = 0; break; } if (CC_ALGO(tp)->cong_signal != NULL) { if (th != NULL) tp->ccv->curack = th->th_ack; CC_ALGO(tp)->cong_signal(tp->ccv, type); } #ifdef NETFLIX_CWV if (tp->cwv_enabled) { if (tp->snd_cwv.in_recovery == 0 && IN_RECOVERY(tp->t_flags)) { tcp_newcwv_enter_recovery(tp); } if (type == CC_RTO) { tcp_newcwv_reset(tp); } } #endif } static inline void rack_cc_after_idle(struct tcpcb *tp, int reduce_largest) { uint32_t i_cwnd; INP_WLOCK_ASSERT(tp->t_inpcb); #ifdef NETFLIX_STATS TCPSTAT_INC(tcps_idle_restarts); if (tp->t_state == TCPS_ESTABLISHED) TCPSTAT_INC(tcps_idle_estrestarts); #endif if (CC_ALGO(tp)->after_idle != NULL) CC_ALGO(tp)->after_idle(tp->ccv); if (tp->snd_cwnd == 1) i_cwnd = tp->t_maxseg; /* SYN(-ACK) lost */ else i_cwnd = tcp_compute_initwnd(tcp_maxseg(tp)); if (reduce_largest) { /* * Do we reduce the largest cwnd to make * rack play nice on restart hptsi wise? */ if (((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rack_largest_cwnd > i_cwnd) ((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rack_largest_cwnd = i_cwnd; } /* * Being idle is no different than the initial window. If the cc * clamps it down below the initial window, raise it to the initial * window. */ if (tp->snd_cwnd < i_cwnd) { tp->snd_cwnd = i_cwnd; } } /* * Indicate whether this ack should be delayed. We can delay the ack if * the following conditions are met: * - There is no delayed ack timer in progress. * - Our last ack wasn't a 0-sized window.
We never want to delay * the ack that opens up a 0-sized window. * - LRO wasn't used for this segment. We make sure by checking that the * segment size is not larger than the MSS. * - Delayed acks are enabled or this is a half-synchronized T/TCP * connection. */ #define DELAY_ACK(tp, tlen) \ (((tp->t_flags & TF_RXWIN0SENT) == 0) && \ ((tp->t_flags & TF_DELACK) == 0) && \ (tlen <= tp->t_maxseg) && \ (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN))) static inline void rack_calc_rwin(struct socket *so, struct tcpcb *tp) { int32_t win; /* * Calculate amount of space in receive window, and then do TCP * input processing. Receive window is amount of space in rcv queue, * but not less than advertised window. */ win = sbspace(&so->so_rcv); if (win < 0) win = 0; tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); } static void rack_do_drop(struct mbuf *m, struct tcpcb *tp) { /* * Drop space held by incoming segment and return. */ if (tp != NULL) INP_WUNLOCK(tp->t_inpcb); if (m) m_freem(m); } static void rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t rstreason, int32_t tlen) { if (tp != NULL) { tcp_dropwithreset(m, th, tp, tlen, rstreason); INP_WUNLOCK(tp->t_inpcb); } else tcp_dropwithreset(m, th, NULL, tlen, rstreason); } /* * The value in ret_val informs the caller * if we dropped the tcb (and lock) or not. * 1 = we dropped it, 0 = the TCB is still locked * and valid. */ static void rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val) { /* * Generate an ACK dropping incoming segment if it occupies sequence * space, where the ACK reflects our state. * * We can now skip the test for the RST flag since all paths to this * code happen after packets containing RST have been dropped. * * In the SYN-RECEIVED state, don't send an ACK unless the segment * we received passes the SYN-RECEIVED ACK test. If it fails send a * RST. This breaks the loop in the "LAND" DoS attack, and also * prevents an ACK storm between two listening ports that have been * sent forged SYN segments, each with the source address of the * other. */ struct tcp_rack *rack; if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && (SEQ_GT(tp->snd_una, th->th_ack) || SEQ_GT(th->th_ack, tp->snd_max))) { *ret_val = 1; rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return; } else *ret_val = 0; rack = (struct tcp_rack *)tp->t_fb_ptr; rack->r_wanted_output++; tp->t_flags |= TF_ACKNOW; if (m) m_freem(m); } static int rack_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp) { /* * RFC5961 Section 3.2 * * - RST drops connection only if SEG.SEQ == RCV.NXT. - If RST is in * window, we send challenge ACK. * * Note: to take into account delayed ACKs, we should test against * last_ack_sent instead of rcv_nxt. Note 2: we handle special case * of closed window, not covered by the RFC. */ int dropped = 0; if ((SEQ_GEQ(th->th_seq, (tp->last_ack_sent - 1)) && SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) { INP_INFO_RLOCK_ASSERT(&V_tcbinfo); KASSERT(tp->t_state != TCPS_SYN_SENT, ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p", __func__, th, tp)); if (V_tcp_insecure_rst || (tp->last_ack_sent == th->th_seq) || (tp->rcv_nxt == th->th_seq) || ((tp->last_ack_sent - 1) == th->th_seq)) { TCPSTAT_INC(tcps_drops); /* Drop the connection. 
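(We only get here when V_tcp_insecure_rst is set, or when the RST's sequence exactly matches last_ack_sent, rcv_nxt or last_ack_sent - 1 per the RFC 5961 checks above; otherwise the else branch sends a challenge ACK instead.)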
*/ switch (tp->t_state) { case TCPS_SYN_RECEIVED: so->so_error = ECONNREFUSED; goto close; case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: case TCPS_CLOSING: case TCPS_LAST_ACK: so->so_error = ECONNRESET; close: tcp_state_change(tp, TCPS_CLOSED); /* FALLTHROUGH */ default: tp = tcp_close(tp); } dropped = 1; rack_do_drop(m, tp); } else { TCPSTAT_INC(tcps_badrst); /* Send challenge ACK. */ tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, tp->snd_nxt, TH_ACK); tp->last_ack_sent = tp->rcv_nxt; } } else { m_freem(m); } return (dropped); } /* * The value in ret_val informs the caller * if we dropped the tcb (and lock) or not. * 1 = we dropped it, 0 = the TCB is still locked * and valid. */ static void rack_challenge_ack(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ret_val) { INP_INFO_RLOCK_ASSERT(&V_tcbinfo); TCPSTAT_INC(tcps_badsyn); if (V_tcp_insecure_syn && SEQ_GEQ(th->th_seq, tp->last_ack_sent) && SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { tp = tcp_drop(tp, ECONNRESET); *ret_val = 1; rack_do_drop(m, tp); } else { /* Send challenge ACK. */ tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, tp->snd_nxt, TH_ACK); tp->last_ack_sent = tp->rcv_nxt; m = NULL; *ret_val = 0; rack_do_drop(m, NULL); } } /* * rack_ts_check returns 1 when you should not proceed. It places * in ret_val what should be returned (1/0) by the caller. The 1 indicates * that the TCB is unlocked and probably dropped. The 0 indicates the * TCB is still valid and locked. */ static int rack_ts_check(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t tlen, int32_t thflags, int32_t * ret_val) { /* Check to see if ts_recent is over 24 days old. */ if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) { /* * Invalidate ts_recent. If this segment updates ts_recent, * the age will be reset later and ts_recent will get a * valid value. If it does not, setting ts_recent to zero * will at least satisfy the requirement that zero be placed * in the timestamp echo reply when ts_recent isn't valid. * The age isn't reset until we get a valid ts_recent * because we don't want out-of-order segments to be dropped * when ts_recent is old. */ tp->ts_recent = 0; } else { TCPSTAT_INC(tcps_rcvduppack); TCPSTAT_ADD(tcps_rcvdupbyte, tlen); TCPSTAT_INC(tcps_pawsdrop); *ret_val = 0; if (tlen) { rack_do_dropafterack(m, tp, th, thflags, tlen, ret_val); } else { rack_do_drop(m, NULL); } return (1); } return (0); } /* * rack_drop_checks returns 1 when you should not proceed. It places * in ret_val what should be returned (1/0) by the caller. The 1 indicates * that the TCB is unlocked and probably dropped. The 0 indicates the * TCB is still valid and locked. */ static int rack_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val) { int32_t todrop; int32_t thflags; int32_t tlen; thflags = *thf; tlen = *tlenp; todrop = tp->rcv_nxt - th->th_seq; if (todrop > 0) { if (thflags & TH_SYN) { thflags &= ~TH_SYN; th->th_seq++; if (th->th_urp > 1) th->th_urp--; else thflags &= ~TH_URG; todrop--; } /* * Following if statement from Stevens, vol. 2, p. 960. */ if (todrop > tlen || (todrop == tlen && (thflags & TH_FIN) == 0)) { /* * Any valid FIN must be to the left of the window. * At this point the FIN must be a duplicate or out * of sequence; drop it. */ thflags &= ~TH_FIN; /* * Send an ACK to resynchronize and drop any data. * But keep on processing for RST or ACK.
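* (Illustrative, hypothetical numbers: rcv_nxt = 1000, th_seq = 900, * tlen = 50 gives todrop = 100 > tlen, so the entire segment is a * duplicate; we set TF_ACKNOW and count it in tcps_rcvduppack.)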
*/ tp->t_flags |= TF_ACKNOW; todrop = tlen; TCPSTAT_INC(tcps_rcvduppack); TCPSTAT_ADD(tcps_rcvdupbyte, todrop); } else { TCPSTAT_INC(tcps_rcvpartduppack); TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop); } *drop_hdrlen += todrop; /* drop from the top afterwards */ th->th_seq += todrop; tlen -= todrop; if (th->th_urp > todrop) th->th_urp -= todrop; else { thflags &= ~TH_URG; th->th_urp = 0; } } /* * If segment ends after window, drop trailing data (and PUSH and * FIN); if nothing left, just ACK. */ todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); if (todrop > 0) { TCPSTAT_INC(tcps_rcvpackafterwin); if (todrop >= tlen) { TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen); /* * If window is closed can only take segments at * window edge, and have to drop data and PUSH from * incoming segments. Continue processing, but * remember to ack. Otherwise, drop segment and * ack. */ if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { tp->t_flags |= TF_ACKNOW; TCPSTAT_INC(tcps_rcvwinprobe); } else { rack_do_dropafterack(m, tp, th, thflags, tlen, ret_val); return (1); } } else TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); m_adj(m, -todrop); tlen -= todrop; thflags &= ~(TH_PUSH | TH_FIN); } *thf = thflags; *tlenp = tlen; return (0); } static struct rack_sendmap * rack_find_lowest_rsm(struct tcp_rack *rack) { struct rack_sendmap *rsm; /* * Walk the time-order transmitted list looking for an rsm that is * not acked. This will be the one that was sent the longest time * ago that is still outstanding. */ TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) { if (rsm->r_flags & RACK_ACKED) { continue; } goto finish; } finish: return (rsm); } static struct rack_sendmap * rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm) { struct rack_sendmap *prsm; /* * Walk the sequence order list backward until we arrive at the * highest seq not acked. In theory when this is called it * should be the last segment (which it was not). */ counter_u64_add(rack_find_high, 1); prsm = rsm; TAILQ_FOREACH_REVERSE_FROM(prsm, &rack->r_ctl.rc_map, rack_head, r_next) { if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) { continue; } return (prsm); } return (NULL); } static uint32_t rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts) { int32_t lro; uint32_t thresh; /* * lro is the flag we use to determine if we have seen reordering. * If it gets set we have seen reordering. The reorder logic * works in one of two ways: * * If reorder-fade is configured, then we track the last time we saw * re-ordering occur. If we reach the point where enough time has * passed, we no longer consider reordering to be occurring. * * Or if reorder-fade is 0, then once we see reordering we consider * the connection to always be subject to reordering and just set lro * to 1. * * In the end if lro is non-zero we add the extra time for * reordering in. */ if (srtt == 0) srtt = 1; if (rack->r_ctl.rc_reorder_ts) { if (rack->r_ctl.rc_reorder_fade) { if (SEQ_GEQ(cts, rack->r_ctl.rc_reorder_ts)) { lro = cts - rack->r_ctl.rc_reorder_ts; if (lro == 0) { /* * No time has passed since the last * reorder, mark it as reordering. */ lro = 1; } } else { /* Negative time?
*/ lro = 0; } if (lro > rack->r_ctl.rc_reorder_fade) { /* Turn off reordering seen too */ rack->r_ctl.rc_reorder_ts = 0; lro = 0; } } else { /* Reordering does not fade */ lro = 1; } } else { lro = 0; } thresh = srtt + rack->r_ctl.rc_pkt_delay; if (lro) { /* It must be set, if not you get 1/4 rtt */ if (rack->r_ctl.rc_reorder_shift) thresh += (srtt >> rack->r_ctl.rc_reorder_shift); else thresh += (srtt >> 2); } else { thresh += 1; } /* We don't let the rack timeout be above an RTO */ if (thresh > TICKS_2_MSEC(rack->rc_tp->t_rxtcur)) { thresh = TICKS_2_MSEC(rack->rc_tp->t_rxtcur); } /* And we don't want it above the RTO max either */ if (thresh > rack_rto_max) { thresh = rack_rto_max; } return (thresh); } static uint32_t rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t srtt) { struct rack_sendmap *prsm; uint32_t thresh, len; int maxseg; if (srtt == 0) srtt = 1; if (rack->r_ctl.rc_tlp_threshold) thresh = srtt + (srtt / rack->r_ctl.rc_tlp_threshold); else thresh = (srtt * 2); /* Get the previously sent packet, if any */ maxseg = tcp_maxseg(tp); counter_u64_add(rack_enter_tlp_calc, 1); len = rsm->r_end - rsm->r_start; if (rack->rack_tlp_threshold_use == TLP_USE_ID) { /* Exactly like the ID */ if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= maxseg) { uint32_t alt_thresh; /* * Compensate for delayed-ack with the d-ack time. */ counter_u64_add(rack_used_tlpmethod, 1); alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; if (alt_thresh > thresh) thresh = alt_thresh; } } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) { /* 2.1 behavior */ prsm = TAILQ_PREV(rsm, rack_head, r_tnext); if (prsm && (len <= maxseg)) { /* * Two packets outstanding, thresh should be (2*srtt) + * possible inter-packet delay (if any). */ uint32_t inter_gap = 0; int idx, nidx; counter_u64_add(rack_used_tlpmethod, 1); idx = rsm->r_rtr_cnt - 1; nidx = prsm->r_rtr_cnt - 1; if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], prsm->r_tim_lastsent[nidx])) { /* Yes it was sent later (or at the same time) */ inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx]; } thresh += inter_gap; } else if (len <= maxseg) { /* * Possibly compensate for delayed-ack. */ uint32_t alt_thresh; counter_u64_add(rack_used_tlpmethod2, 1); alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; if (alt_thresh > thresh) thresh = alt_thresh; } } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) { /* 2.2 behavior */ if (len <= maxseg) { uint32_t alt_thresh; /* * Compensate for delayed-ack with the d-ack time. */ counter_u64_add(rack_used_tlpmethod, 1); alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; if (alt_thresh > thresh) thresh = alt_thresh; } } /* Not above an RTO */ if (thresh > TICKS_2_MSEC(tp->t_rxtcur)) { thresh = TICKS_2_MSEC(tp->t_rxtcur); } /* Not above an RTO max */ if (thresh > rack_rto_max) { thresh = rack_rto_max; } /* Apply user supplied min TLP */ if (thresh < rack_tlp_min) { thresh = rack_tlp_min; } return (thresh); } static struct rack_sendmap * rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused) { /* * Check to see that we don't need to fall into recovery. We will * need to do so if our oldest transmit is past the time we should * have had an ack.
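* In other words, with thresh from rack_calc_thresh_rack() above * (roughly srtt + rc_pkt_delay, plus srtt >> rc_reorder_shift when * reordering has been seen), we enter recovery once * (tsused - r_tim_lastsent[idx]) >= thresh for the oldest * outstanding rsm.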
*/ struct tcp_rack *rack; struct rack_sendmap *rsm; int32_t idx; uint32_t srtt_cur, srtt, thresh; rack = (struct tcp_rack *)tp->t_fb_ptr; if (TAILQ_EMPTY(&rack->r_ctl.rc_map)) { return (NULL); } srtt_cur = tp->t_srtt >> TCP_RTT_SHIFT; srtt = TICKS_2_MSEC(srtt_cur); if (rack->rc_rack_rtt && (srtt > rack->rc_rack_rtt)) srtt = rack->rc_rack_rtt; rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); if (rsm == NULL) return (NULL); if (rsm->r_flags & RACK_ACKED) { rsm = rack_find_lowest_rsm(rack); if (rsm == NULL) return (NULL); } idx = rsm->r_rtr_cnt - 1; thresh = rack_calc_thresh_rack(rack, srtt, tsused); if (tsused < rsm->r_tim_lastsent[idx]) { return (NULL); } if ((tsused - rsm->r_tim_lastsent[idx]) < thresh) { return (NULL); } /* Ok, if we reach here we are overdue */ rack->r_ctl.rc_rsm_start = rsm->r_start; rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; rack_cong_signal(tp, NULL, CC_NDUPACK); return (rsm); } static uint32_t rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack) { int32_t t; int32_t tt; uint32_t ret_val; t = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT) + ((tp->t_rttvar * 4) >> TCP_RTT_SHIFT)); TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], tcp_persmin, tcp_persmax); if (tp->t_rxtshift < TCP_MAXRXTSHIFT) tp->t_rxtshift++; rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT; ret_val = (uint32_t)tt; return (ret_val); } static uint32_t rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) { /* * Start the FR timer, we do this based on getting the first one in * the rc_tmap. Note that if it's NULL we must stop the timer. In all * events we need to stop the running timer (if it's running) before * starting the new one. */ uint32_t thresh, exp, to, srtt, time_since_sent; uint32_t srtt_cur; int32_t idx; int32_t is_tlp_timer = 0; struct rack_sendmap *rsm; if (rack->t_timers_stopped) { /* All timers have been stopped, none are to run */ return (0); } if (rack->rc_in_persist) { /* We can't start any timer in persists */ return (rack_get_persists_timer_val(tp, rack)); } if (tp->t_state < TCPS_ESTABLISHED) goto activate_rxt; rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); if (rsm == NULL) { /* Nothing on the send map */ activate_rxt: if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) { rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT; to = TICKS_2_MSEC(tp->t_rxtcur); if (to == 0) to = 1; return (to); } return (0); } if (rsm->r_flags & RACK_ACKED) { rsm = rack_find_lowest_rsm(rack); if (rsm == NULL) { /* No lowest? */ goto activate_rxt; } } /* Convert from ms to usecs */ if (rsm->r_flags & RACK_SACK_PASSED) { if ((tp->t_flags & TF_SENTFIN) && ((tp->snd_max - tp->snd_una) == 1) && (rsm->r_flags & RACK_HAS_FIN)) { /* * We don't start a rack timer if all we have is a * FIN outstanding. */ goto activate_rxt; } if (tp->t_srtt) { srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT); srtt = TICKS_2_MSEC(srtt_cur); } else srtt = RACK_INITIAL_RTO; thresh = rack_calc_thresh_rack(rack, srtt, cts); idx = rsm->r_rtr_cnt - 1; exp = rsm->r_tim_lastsent[idx] + thresh; if (SEQ_GEQ(exp, cts)) { to = exp - cts; if (to < rack->r_ctl.rc_min_to) { to = rack->r_ctl.rc_min_to; } } else { to = rack->r_ctl.rc_min_to; } } else { /* Ok we need to do a TLP not RACK */ if ((rack->rc_tlp_in_progress != 0) || (rack->r_ctl.rc_tlp_rtx_out != 0)) { /* * The previous send was a TLP or a tlp_rtx is in * process. */ goto activate_rxt; } rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); if (rsm == NULL) { /* We found no rsm to TLP with.
*/ goto activate_rxt; } if (rsm->r_flags & RACK_HAS_FIN) { /* If it's a FIN we don't do TLP */ rsm = NULL; goto activate_rxt; } idx = rsm->r_rtr_cnt - 1; if (TSTMP_GT(cts, rsm->r_tim_lastsent[idx])) time_since_sent = cts - rsm->r_tim_lastsent[idx]; else time_since_sent = 0; is_tlp_timer = 1; if (tp->t_srtt) { srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT); srtt = TICKS_2_MSEC(srtt_cur); } else srtt = RACK_INITIAL_RTO; thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt); if (thresh > time_since_sent) to = thresh - time_since_sent; else to = rack->r_ctl.rc_min_to; if (to > TCPTV_REXMTMAX) { /* * If the TLP time works out to larger than the max * RTO, let's not do TLP; just RTO. */ goto activate_rxt; } if (rsm->r_start != rack->r_ctl.rc_last_tlp_seq) { /* * The tail is no longer the last one I did a probe * on */ rack->r_ctl.rc_tlp_seg_send_cnt = 0; rack->r_ctl.rc_last_tlp_seq = rsm->r_start; } } if (is_tlp_timer == 0) { rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK; } else { if ((rack->r_ctl.rc_tlp_send_cnt > rack_tlp_max_resend) || (rack->r_ctl.rc_tlp_seg_send_cnt > rack_tlp_max_resend)) { /* * We have exceeded how many times we can retransmit on * the current TLP timer, switch to the RTO timer. */ goto activate_rxt; } else { rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP; } } if (to == 0) to = 1; return (to); } static void rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) { if (rack->rc_in_persist == 0) { if (((tp->t_flags & TF_SENTFIN) == 0) && (tp->snd_max - tp->snd_una) >= sbavail(&rack->rc_inp->inp_socket->so_snd)) /* Must need to send more data to enter persist */ return; rack->r_ctl.rc_went_idle_time = cts; rack_timer_cancel(tp, rack, cts, __LINE__); tp->t_rxtshift = 0; rack->rc_in_persist = 1; } } static void rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack) { if (rack->rc_inp->inp_in_hpts) { tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); rack->r_ctl.rc_hpts_flags = 0; } rack->rc_in_persist = 0; rack->r_ctl.rc_went_idle_time = 0; tp->t_flags &= ~TF_FORCEDATA; tp->t_rxtshift = 0; } static void rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, int32_t line, int32_t slot, uint32_t tot_len_this_send, int32_t frm_out_sbavail) { struct inpcb *inp; uint32_t delayed_ack = 0; uint32_t hpts_timeout; uint8_t stopped; uint32_t left = 0; inp = tp->t_inpcb; if (inp->inp_in_hpts) { /* A previous call is already set up */ return; } if (tp->t_state == TCPS_CLOSED) { return; } stopped = rack->rc_tmr_stopped; if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) { left = rack->r_ctl.rc_timer_exp - cts; } rack->r_ctl.rc_timer_exp = 0; if (rack->rc_inp->inp_in_hpts == 0) { rack->r_ctl.rc_hpts_flags = 0; } if (slot) { /* We are hptsi too */ rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT; } else if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { /* * We are still left on the hpts; when the timeout fires * it will be for output. */ if (TSTMP_GT(cts, rack->r_ctl.rc_last_output_to)) slot = cts - rack->r_ctl.rc_last_output_to; else slot = 1; } if ((tp->snd_wnd == 0) && TCPS_HAVEESTABLISHED(tp->t_state)) { /* No send window.. we must enter persist */ rack_enter_persist(tp, rack, cts); } else if ((frm_out_sbavail && (frm_out_sbavail > (tp->snd_max - tp->snd_una)) && (tp->snd_wnd < tp->t_maxseg)) && TCPS_HAVEESTABLISHED(tp->t_state)) { /* * If we have no window or we can't send a segment (and have * data to send..
we cheat here and frm_out_sbavail is * passed in with the sbavail(sb) only from bbr_output) and * we are established, then we must enter persist (if not * already in persist). */ rack_enter_persist(tp, rack, cts); } hpts_timeout = rack_timer_start(tp, rack, cts); if (tp->t_flags & TF_DELACK) { delayed_ack = TICKS_2_MSEC(tcp_delacktime); rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK; } if (delayed_ack && ((hpts_timeout == 0) || (delayed_ack < hpts_timeout))) hpts_timeout = delayed_ack; else rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; /* * If no timers are going to run and we will fall off the hptsi * wheel, we resort to a keep-alive timer if it's configured. */ if ((hpts_timeout == 0) && (slot == 0)) { if ((tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && (tp->t_state <= TCPS_CLOSING)) { /* * Ok we have no timer (persists, rack, tlp, rxt or * del-ack), we don't have segments being paced. So * all that is left is the keepalive timer. */ if (TCPS_HAVEESTABLISHED(tp->t_state)) { /* Get the established keep-alive time */ hpts_timeout = TP_KEEPIDLE(tp); } else { /* Get the initial setup keep-alive time */ hpts_timeout = TP_KEEPINIT(tp); } rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP; } } if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) == (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) { /* * RACK, TLP, persists and RXT timers all are restartable * based on actions input .. i.e. we received a packet (ack * or sack) and that changes things (rw, or snd_una etc). * Thus we can restart them with a new value. For * keep-alive, delayed_ack we keep track of what was left * and restart the timer with a smaller value. */ if (left < hpts_timeout) hpts_timeout = left; } if (hpts_timeout) { /* * Hack alert: for now we can't time-out over 2,147,483 * seconds (a bit more than 596 hours), which is probably ok * :). */ if (hpts_timeout > 0x7ffffffe) hpts_timeout = 0x7ffffffe; rack->r_ctl.rc_timer_exp = cts + hpts_timeout; } if (slot) { rack->r_ctl.rc_last_output_to = cts + slot; if ((hpts_timeout == 0) || (hpts_timeout > slot)) { if (rack->rc_inp->inp_in_hpts == 0) tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(slot)); rack_log_to_start(rack, cts, hpts_timeout, slot, 1); } else { /* * Arrange for the hpts to kick back in after the * t-o if the t-o does not cause a send. */ if (rack->rc_inp->inp_in_hpts == 0) tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout)); rack_log_to_start(rack, cts, hpts_timeout, slot, 0); } } else if (hpts_timeout) { if (rack->rc_inp->inp_in_hpts == 0) tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout)); rack_log_to_start(rack, cts, hpts_timeout, slot, 0); } else { /* No timer starting */ #ifdef INVARIANTS if (SEQ_GT(tp->snd_max, tp->snd_una)) { panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?", tp, rack, tot_len_this_send, cts, slot, hpts_timeout); } #endif } rack->rc_tmr_stopped = 0; if (slot) rack_log_type_bbrsnd(rack, tot_len_this_send, slot, cts); } /* * RACK Timer, here we simply do logging and housekeeping. * The normal rack_output() function will call the * appropriate thing to check if we need to do a RACK retransmit. * We return 1, saying don't proceed with rack_output only * when all timers have been stopped (destroyed PCB?). */ static int rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) { /* * This timer simply provides an internal trigger to send out data. * The check_recovery_mode call will see if there are needed * retransmissions, if so we will enter fast-recovery.
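* (When it does, the code below guarantees at least one MSS of PRR * send credit, rc_prr_sndcnt, so that rack_output() can emit one * segment.)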
The output * call may or may not do the same thing depending on sysctl * settings. */ struct rack_sendmap *rsm; int32_t recovery; if (tp->t_timers->tt_flags & TT_STOPPED) { return (1); } if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { /* It's not time yet */ return (0); } rack_log_to_event(rack, RACK_TO_FRM_RACK); recovery = IN_RECOVERY(tp->t_flags); counter_u64_add(rack_to_tot, 1); if (rack->r_state && (rack->r_state != tp->t_state)) rack_set_state(tp, rack); rsm = rack_check_recovery_mode(tp, cts); if (rsm) { uint32_t rtt; rtt = rack->rc_rack_rtt; if (rtt == 0) rtt = 1; if ((recovery == 0) && (rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg)) { /* * The rack-timeout that enters us into recovery * will force out one MSS and set us up so that we * can do one more send in 2*rtt (transitioning the * rack timeout into a rack-tlp). */ rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; } else if ((rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg) && ((rsm->r_end - rsm->r_start) > rack->r_ctl.rc_prr_sndcnt)) { /* * When a rack timer goes, we have to send at * least one segment. They will be paced a min of 1ms * apart via the next rack timer (or further * if the rack timer dictates it). */ rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; } } else { /* This is a case that should happen rarely, if ever */ counter_u64_add(rack_tlp_does_nada, 1); #ifdef TCP_BLACKBOX tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); #endif rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_tmap); } rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK; return (0); } /* * TLP Timer, here we simply set up what segment we want to * have the TLP expire on; the normal rack_output() will then * send it out. * * We return 1, saying don't proceed with rack_output only * when all timers have been stopped (destroyed PCB?). */ static int rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) { /* * Tail Loss Probe. */ struct rack_sendmap *rsm = NULL; struct socket *so; uint32_t amm, old_prr_snd = 0; uint32_t out, avail; if (tp->t_timers->tt_flags & TT_STOPPED) { return (1); } if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { /* It's not time yet */ return (0); } if (rack_progress_timeout_check(tp)) { tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); return (1); } /* * A TLP timer has expired. We have been idle for 2 rtts. So we now * need to figure out how to force a full MSS segment out. */ rack_log_to_event(rack, RACK_TO_FRM_TLP); counter_u64_add(rack_tlp_tot, 1); if (rack->r_state && (rack->r_state != tp->t_state)) rack_set_state(tp, rack); so = tp->t_inpcb->inp_socket; avail = sbavail(&so->so_snd); out = tp->snd_max - tp->snd_una; rack->rc_timer_up = 1; /* * If we are in recovery we can jazz out a segment if new data is * present simply by setting rc_prr_sndcnt to a segment.
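* (Illustrative, hypothetical numbers: avail = 20000 bytes buffered * and out = 18000 in flight leaves amm = 2000; if that is at least * one MSS, or TF_NODELAY is set, we probe with new data, otherwise * we fall through to need_retran and re-send the tail.)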
*/ if ((avail > out) && ((rack_always_send_oldest == 0) || (TAILQ_EMPTY(&rack->r_ctl.rc_tmap)))) { /* New data is available */ amm = avail - out; if (amm > tp->t_maxseg) { amm = tp->t_maxseg; } else if ((amm < tp->t_maxseg) && ((tp->t_flags & TF_NODELAY) == 0)) { /* not enough to fill an MTU and no-delay is off */ goto need_retran; } if (IN_RECOVERY(tp->t_flags)) { /* Unlikely */ old_prr_snd = rack->r_ctl.rc_prr_sndcnt; if (out + amm <= tp->snd_wnd) rack->r_ctl.rc_prr_sndcnt = amm; else goto need_retran; } else { /* Set the send-new override */ if (out + amm <= tp->snd_wnd) rack->r_ctl.rc_tlp_new_data = amm; else goto need_retran; } rack->r_ctl.rc_tlp_seg_send_cnt = 0; rack->r_ctl.rc_last_tlp_seq = tp->snd_max; rack->r_ctl.rc_tlpsend = NULL; counter_u64_add(rack_tlp_newdata, 1); goto send; } need_retran: /* * Ok we need to arrange the last un-acked segment to be re-sent, or * optionally the first un-acked segment. */ if (rack_always_send_oldest) rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); else { rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next); if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) { rsm = rack_find_high_nonack(rack, rsm); } } if (rsm == NULL) { counter_u64_add(rack_tlp_does_nada, 1); #ifdef TCP_BLACKBOX tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); #endif goto out; } if ((rsm->r_end - rsm->r_start) > tp->t_maxseg) { /* * We need to split the last segment in two. */ int32_t idx; struct rack_sendmap *nrsm; nrsm = rack_alloc(rack); if (nrsm == NULL) { /* * No memory to split, we will just exit and punt * off to the RXT timer. */ counter_u64_add(rack_tlp_does_nada, 1); goto out; } nrsm->r_start = (rsm->r_end - tp->t_maxseg); nrsm->r_end = rsm->r_end; nrsm->r_rtr_cnt = rsm->r_rtr_cnt; nrsm->r_flags = rsm->r_flags; nrsm->r_sndcnt = rsm->r_sndcnt; nrsm->r_rtr_bytes = 0; rsm->r_end = nrsm->r_start; for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; } TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); if (rsm->r_in_tmap) { TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); nrsm->r_in_tmap = 1; } rsm->r_flags &= (~RACK_HAS_FIN); rsm = nrsm; } rack->r_ctl.rc_tlpsend = rsm; rack->r_ctl.rc_tlp_rtx_out = 1; if (rsm->r_start == rack->r_ctl.rc_last_tlp_seq) { rack->r_ctl.rc_tlp_seg_send_cnt++; tp->t_rxtshift++; } else { rack->r_ctl.rc_last_tlp_seq = rsm->r_start; rack->r_ctl.rc_tlp_seg_send_cnt = 1; } send: rack->r_ctl.rc_tlp_send_cnt++; if (rack->r_ctl.rc_tlp_send_cnt > rack_tlp_max_resend) { /* * Can't [re]transmit a segment we have sent the max * number of times without hearing from the peer. We * need the retransmit timer to take over. */ restore: rack->r_ctl.rc_tlpsend = NULL; if (rsm) rsm->r_flags &= ~RACK_TLP; rack->r_ctl.rc_prr_sndcnt = old_prr_snd; counter_u64_add(rack_tlp_retran_fail, 1); goto out; } else if (rsm) { rsm->r_flags |= RACK_TLP; } if (rsm && (rsm->r_start == rack->r_ctl.rc_last_tlp_seq) && (rack->r_ctl.rc_tlp_seg_send_cnt > rack_tlp_max_resend)) { /* * We don't want to send a single segment more times than * the max either. */ goto restore; } rack->r_timer_override = 1; rack->r_tlp_running = 1; rack->rc_tlp_in_progress = 1; rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; return (0); out: rack->rc_timer_up = 0; rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; return (0); } /* * Delayed ack Timer, here we simply need to set up the * ACK_NOW flag and remove the DELACK flag. From there * the output routine will send the ack out.
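* (That is, rack_timeout_delack() only converts TF_DELACK into * TF_ACKNOW; no packet is built here.)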
* * We only return 1, saying don't proceed, if all timers * are stopped (destroyed PCB?). */ static int rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) { if (tp->t_timers->tt_flags & TT_STOPPED) { return (1); } rack_log_to_event(rack, RACK_TO_FRM_DELACK); tp->t_flags &= ~TF_DELACK; tp->t_flags |= TF_ACKNOW; TCPSTAT_INC(tcps_delack); rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; return (0); } /* * Persists timer, here we simply need to set up the * FORCE-DATA flag; the output routine will send * the one byte probe. * * We only return 1, saying don't proceed, if all timers * are stopped (destroyed PCB?). */ static int rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) { struct inpcb *inp; int32_t retval = 0; inp = tp->t_inpcb; if (tp->t_timers->tt_flags & TT_STOPPED) { return (1); } if (rack->rc_in_persist == 0) return (0); if (rack_progress_timeout_check(tp)) { tcp_set_inp_to_drop(inp, ETIMEDOUT); return (1); } KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); /* * Persistence timer into zero window. Force a byte to be output, if * possible. */ TCPSTAT_INC(tcps_persisttimeo); /* * Hack: if the peer is dead/unreachable, we do not time out if the * window is closed. After a full backoff, drop the connection if * the idle time (no responses to probes) reaches the maximum * backoff that we would use if retransmitting. */ if (tp->t_rxtshift == TCP_MAXRXTSHIFT && (ticks - tp->t_rcvtime >= tcp_maxpersistidle || ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { TCPSTAT_INC(tcps_persistdrop); retval = 1; tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); goto out; } if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) && tp->snd_una == tp->snd_max) rack_exit_persist(tp, rack); rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT; /* * If the user has closed the socket then drop a persisting * connection after a much reduced timeout. */ if (tp->t_state > TCPS_CLOSE_WAIT && (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { retval = 1; TCPSTAT_INC(tcps_persistdrop); tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); goto out; } tp->t_flags |= TF_FORCEDATA; out: rack_log_to_event(rack, RACK_TO_FRM_PERSIST); return (retval); } /* * If a keepalive goes off, we had no other timers * happening. We always return 1 here since this * routine either drops the connection or sends * out a segment to force a response. */ static int rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) { struct tcptemp *t_template; struct inpcb *inp; if (tp->t_timers->tt_flags & TT_STOPPED) { return (1); } rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP; inp = tp->t_inpcb; rack_log_to_event(rack, RACK_TO_FRM_KEEP); /* * Keep-alive timer went off; send something or drop connection if * idle for too long. */ TCPSTAT_INC(tcps_keeptimeo); if (tp->t_state < TCPS_ESTABLISHED) goto dropit; if ((tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && tp->t_state <= TCPS_CLOSING) { if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) goto dropit; /* * Send a packet designed to force a response if the peer is * up and reachable: either an ACK if the connection is * still alive, or an RST if the peer has closed the * connection due to timeout or reboot. Using sequence * number tp->snd_una-1 causes the transmitted zero-length * segment to lie outside the receive window; by the * protocol spec, this requires the correspondent TCP to * respond.
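* (A live peer answers this out-of-window, zero-length probe with an * ACK carrying its current state; a dead peer stays silent and the * idle checks above eventually drop the connection.)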
*/ TCPSTAT_INC(tcps_keepprobe); t_template = tcpip_maketemplate(inp); if (t_template) { tcp_respond(tp, t_template->tt_ipgen, &t_template->tt_t, (struct mbuf *)NULL, tp->rcv_nxt, tp->snd_una - 1, 0); free(t_template, M_TEMP); } } rack_start_hpts_timer(rack, tp, cts, __LINE__, 0, 0, 0); return (1); dropit: TCPSTAT_INC(tcps_keepdrops); tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); return (1); } /* * Retransmit helper function, clear up all the ack * flags and take care of important bookkeeping. */ static void rack_remxt_tmr(struct tcpcb *tp) { /* * The retransmit timer went off, all sack'd blocks must be * un-acked. */ struct rack_sendmap *rsm, *trsm = NULL; struct tcp_rack *rack; int32_t cnt = 0; rack = (struct tcp_rack *)tp->t_fb_ptr; rack_timer_cancel(tp, rack, tcp_ts_getticks(), __LINE__); rack_log_to_event(rack, RACK_TO_FRM_TMR); if (rack->r_state && (rack->r_state != tp->t_state)) rack_set_state(tp, rack); /* * Ideally we would like to be able to * mark SACK-PASS on anything not acked here. * However, if we do that we would burst out * all that data 1ms apart. This would be unwise, * so for now we will just let the normal rxt timer * and tlp timer take care of it. */ TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) { if (rsm->r_flags & RACK_ACKED) { cnt++; rsm->r_sndcnt = 0; if (rsm->r_in_tmap == 0) { /* We must re-add it back to the tlist */ if (trsm == NULL) { TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); } else { TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext); } rsm->r_in_tmap = 1; trsm = rsm; } } rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS); } /* Clear the count (we just un-acked them) */ rack->r_ctl.rc_sacked = 0; /* Clear the tlp rtx mark */ rack->r_ctl.rc_tlp_rtx_out = 0; rack->r_ctl.rc_tlp_seg_send_cnt = 0; rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_map); /* Setup so we send one segment */ if (rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg) rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; rack->r_timer_override = 1; } /* * Re-transmit timeout! If we drop the PCB we will return 1, otherwise * we will set up to retransmit the lowest seq number outstanding. */ static int rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) { int32_t rexmt; struct inpcb *inp; int32_t retval = 0; inp = tp->t_inpcb; if (tp->t_timers->tt_flags & TT_STOPPED) { return (1); } if (rack_progress_timeout_check(tp)) { tcp_set_inp_to_drop(inp, ETIMEDOUT); return (1); } rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT; if (TCPS_HAVEESTABLISHED(tp->t_state) && (tp->snd_una == tp->snd_max)) { /* Nothing outstanding .. nothing to do */ return (0); } /* * Retransmission timer went off. Message has not been acked within * retransmit interval. Back off to a longer retransmit interval * and retransmit one segment. */ if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { tp->t_rxtshift = TCP_MAXRXTSHIFT; TCPSTAT_INC(tcps_timeoutdrop); retval = 1; tcp_set_inp_to_drop(rack->rc_inp, (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT)); goto out; } rack_remxt_tmr(tp); if (tp->t_state == TCPS_SYN_SENT) { /* * If the SYN was retransmitted, indicate CWND to be limited * to 1 segment in cc_conn_init(). */ tp->snd_cwnd = 1; } else if (tp->t_rxtshift == 1) { /* * first retransmit; record ssthresh and cwnd so they can be * recovered if this turns out to be a "bad" retransmit. A * retransmit is considered "bad" if an ACK for this segment * is received within RTT/2 interval; the assumption here is * that the ACK was already in flight.
See "On Estimating * End-to-End Network Path Properties" by Allman and Paxson * for more details. */ tp->snd_cwnd_prev = tp->snd_cwnd; tp->snd_ssthresh_prev = tp->snd_ssthresh; tp->snd_recover_prev = tp->snd_recover; if (IN_FASTRECOVERY(tp->t_flags)) tp->t_flags |= TF_WASFRECOVERY; else tp->t_flags &= ~TF_WASFRECOVERY; if (IN_CONGRECOVERY(tp->t_flags)) tp->t_flags |= TF_WASCRECOVERY; else tp->t_flags &= ~TF_WASCRECOVERY; tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); tp->t_flags |= TF_PREVVALID; } else tp->t_flags &= ~TF_PREVVALID; TCPSTAT_INC(tcps_rexmttimeo); if ((tp->t_state == TCPS_SYN_SENT) || (tp->t_state == TCPS_SYN_RECEIVED)) rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_syn_backoff[tp->t_rxtshift]); else rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; TCPT_RANGESET(tp->t_rxtcur, rexmt, max(MSEC_2_TICKS(rack_rto_min), rexmt), MSEC_2_TICKS(rack_rto_max)); /* * We enter the path for PLMTUD if connection is established or, if * connection is FIN_WAIT_1 status, reason for the last is that if * amount of data we send is very small, we could send it in couple * of packets and process straight to FIN. In that case we won't * catch ESTABLISHED state. */ if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED)) || (tp->t_state == TCPS_FIN_WAIT_1))) { #ifdef INET6 int32_t isipv6; #endif /* * Idea here is that at each stage of mtu probe (usually, * 1448 -> 1188 -> 524) should be given 2 chances to recover * before further clamping down. 'tp->t_rxtshift % 2 == 0' * should take care of that. */ if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) == (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) && (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 && tp->t_rxtshift % 2 == 0)) { /* * Enter Path MTU Black-hole Detection mechanism: - * Disable Path MTU Discovery (IP "DF" bit). - * Reduce MTU to lower value than what we negotiated * with peer. */ if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) { /* Record that we may have found a black hole. */ tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; /* Keep track of previous MSS. */ tp->t_pmtud_saved_maxseg = tp->t_maxseg; } /* * Reduce the MSS to blackhole value or to the * default in an attempt to retransmit. */ #ifdef INET6 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0; if (isipv6 && tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) { /* Use the sysctl tuneable blackhole MSS. */ tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss; TCPSTAT_INC(tcps_pmtud_blackhole_activated); } else if (isipv6) { /* Use the default MSS. */ tp->t_maxseg = V_tcp_v6mssdflt; /* * Disable Path MTU Discovery when we switch * to minmss. */ tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) { /* Use the sysctl tuneable blackhole MSS. */ tp->t_maxseg = V_tcp_pmtud_blackhole_mss; TCPSTAT_INC(tcps_pmtud_blackhole_activated); } else { /* Use the default MSS. */ tp->t_maxseg = V_tcp_mssdflt; /* * Disable Path MTU Discovery when we switch * to minmss. */ tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); } #endif } else { /* * If further retransmissions are still unsuccessful * with a lowered MTU, maybe this isn't a blackhole * and we restore the previous MSS and blackhole * detection flags. The limit '6' is determined by * giving each probe stage (1448, 1188, 524) 2 * chances to recover. 
*/ if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && (tp->t_rxtshift >= 6)) { tp->t_flags2 |= TF2_PLPMTU_PMTUD; tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; tp->t_maxseg = tp->t_pmtud_saved_maxseg; TCPSTAT_INC(tcps_pmtud_blackhole_failed); } } } /* * Disable RFC1323 and SACK if we haven't got any response to our * third SYN to work around some broken terminal servers (most of * which have hopefully been retired) that have bad VJ header * compression code which trashes TCP segments containing * unknown-to-them TCP options. */ if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && (tp->t_rxtshift == 3)) tp->t_flags &= ~(TF_REQ_SCALE | TF_REQ_TSTMP | TF_SACK_PERMIT); /* * If we backed off this far, our srtt estimate is probably bogus. * Clobber it so we'll take the next rtt measurement as our srtt; * move the current srtt into rttvar to keep the current retransmit * times until then. */ if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { #ifdef INET6 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) in6_losing(tp->t_inpcb); else #endif in_losing(tp->t_inpcb); tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); tp->t_srtt = 0; } if (rack_use_sack_filter) sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); tp->snd_recover = tp->snd_max; tp->t_flags |= TF_ACKNOW; tp->t_rtttime = 0; rack_cong_signal(tp, NULL, CC_RTO); out: return (retval); } static int rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t hpts_calling) { int32_t ret = 0; int32_t timers = (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK); if (timers == 0) { return (0); } if (tp->t_state == TCPS_LISTEN) { /* no timers on listen sockets */ if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) return (0); return (1); } if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { uint32_t left; if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { ret = -1; rack_log_to_processing(rack, cts, ret, 0); return (0); } if (hpts_calling == 0) { ret = -2; rack_log_to_processing(rack, cts, ret, 0); return (0); } /* * Ok, our timer went off early and we are not paced; false * alarm, go back to sleep. */ ret = -3; left = rack->r_ctl.rc_timer_exp - cts; tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(left)); rack_log_to_processing(rack, cts, ret, left); rack->rc_last_pto_set = 0; return (1); } rack->rc_tmr_stopped = 0; rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK; if (timers & PACE_TMR_DELACK) { ret = rack_timeout_delack(tp, rack, cts); } else if (timers & PACE_TMR_RACK) { ret = rack_timeout_rack(tp, rack, cts); } else if (timers & PACE_TMR_TLP) { ret = rack_timeout_tlp(tp, rack, cts); } else if (timers & PACE_TMR_RXT) { ret = rack_timeout_rxt(tp, rack, cts); } else if (timers & PACE_TMR_PERSIT) { ret = rack_timeout_persist(tp, rack, cts); } else if (timers & PACE_TMR_KEEP) { ret = rack_timeout_keepalive(tp, rack, cts); } rack_log_to_processing(rack, cts, ret, timers); return (ret); } static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line) { uint8_t hpts_removed = 0; if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) { tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); hpts_removed = 1; } if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; if (rack->rc_inp->inp_in_hpts && ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) { /* * Canceling timers when we have no output being * paced. We also must remove ourselves from the * hpts.
*/ tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); hpts_removed = 1; } rack_log_to_cancel(rack, hpts_removed, line); rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK); } } static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type) { return; } static int rack_stopall(struct tcpcb *tp) { struct tcp_rack *rack; rack = (struct tcp_rack *)tp->t_fb_ptr; rack->t_timers_stopped = 1; return (0); } static void rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta) { return; } static int rack_timer_active(struct tcpcb *tp, uint32_t timer_type) { return (0); } static void rack_stop_all_timers(struct tcpcb *tp) { struct tcp_rack *rack; /* * Assure no timers are running. */ if (tcp_timer_active(tp, TT_PERSIST)) { /* We enter in persists, set the flag appropriately */ rack = (struct tcp_rack *)tp->t_fb_ptr; rack->rc_in_persist = 1; } tcp_timer_suspend(tp, TT_PERSIST); tcp_timer_suspend(tp, TT_REXMT); tcp_timer_suspend(tp, TT_KEEP); tcp_timer_suspend(tp, TT_DELACK); } static void rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t ts) { int32_t idx; rsm->r_rtr_cnt++; rsm->r_sndcnt++; if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) { rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS; rsm->r_flags |= RACK_OVERMAX; } if ((rsm->r_rtr_cnt > 1) && (rack->r_tlp_running == 0)) { rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start); rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start); } idx = rsm->r_rtr_cnt - 1; rsm->r_tim_lastsent[idx] = ts; if (rsm->r_flags & RACK_ACKED) { /* Probably MTU discovery messing with us */ rsm->r_flags &= ~RACK_ACKED; rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); } if (rsm->r_in_tmap) { TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); } TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 1; if (rsm->r_flags & RACK_SACK_PASSED) { /* We have retransmitted due to the SACK pass */ rsm->r_flags &= ~RACK_SACK_PASSED; rsm->r_flags |= RACK_WAS_SACKPASS; } /* Update memory for next rtr */ rack->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next); } static uint32_t rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp) { /* * We (re-)transmitted starting at rsm->r_start for some length * (possibly less than r_end). */ struct rack_sendmap *nrsm; uint32_t c_end; int32_t len; int32_t idx; len = *lenp; c_end = rsm->r_start + len; if (SEQ_GEQ(c_end, rsm->r_end)) { /* * We retransmitted the whole piece or more than the whole, * slopping into the next rsm. */ rack_update_rsm(tp, rack, rsm, ts); if (c_end == rsm->r_end) { *lenp = 0; return (0); } else { int32_t act_len; /* Hangs over the end, return what's left */ act_len = rsm->r_end - rsm->r_start; *lenp = (len - act_len); return (rsm->r_end); } /* We don't get out of this block. */ } /* * Here we retransmitted less than the whole thing which means we * have to split this into what was transmitted and what was not. */ nrsm = rack_alloc(rack); if (nrsm == NULL) { /* * We can't get memory, so let's not proceed. */ *lenp = 0; return (0); } /* * So here we are going to take the original rsm and make it what we * retransmitted. nrsm will be the tail portion we did not * retransmit. For example say the chunk was 1, 11 (10 bytes). And * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to * 1, 6 and the new piece will be 6, 11.
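* (Invariant after the split: rsm covers [r_start, c_end) -- the part * just retransmitted -- and nrsm covers [c_end, old r_end); both keep * the same retransmit history, and nrsm inherits rsm's place in the * tmap.)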
*/ nrsm->r_start = c_end; nrsm->r_end = rsm->r_end; nrsm->r_rtr_cnt = rsm->r_rtr_cnt; nrsm->r_flags = rsm->r_flags; nrsm->r_sndcnt = rsm->r_sndcnt; nrsm->r_rtr_bytes = 0; rsm->r_end = c_end; for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; } TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); if (rsm->r_in_tmap) { TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); nrsm->r_in_tmap = 1; } rsm->r_flags &= (~RACK_HAS_FIN); rack_update_rsm(tp, rack, rsm, ts); *lenp = 0; return (0); } static void rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts, uint8_t pass, struct rack_sendmap *hintrsm) { struct tcp_rack *rack; struct rack_sendmap *rsm, *nrsm; register uint32_t snd_max, snd_una; int32_t idx; /* * Add to the RACK log of packets in flight or retransmitted. If * there is a TS option we will use the TS echoed, if not we will * grab a TS. * * Retransmissions will increment the count and move the ts to its * proper place. Note that if options do not include TS's then we * won't be able to effectively use the ACK for an RTT on a * retransmission. * * Notes about r_start and r_end. Let's consider a send starting at * sequence 1 for 10 bytes. In such an example the r_start would be * 1 (starting sequence) but the r_end would be r_start+len i.e. 11. * This means that r_end is actually the first sequence for the next * slot (11). * */ /* * If err is set what do we do XXXrrs? should we not add the thing? * -- i.e. return if err != 0 or should we pretend we sent it? -- * i.e. proceed with add ** do this for now. */ INP_WLOCK_ASSERT(tp->t_inpcb); if (err) /* * We don't log errors -- we could but snd_max does not * advance in this case either. */ return; if (th_flags & TH_RST) { /* * We don't log resets and we return immediately from * sending */ return; } rack = (struct tcp_rack *)tp->t_fb_ptr; snd_una = tp->snd_una; if (SEQ_LEQ((seq_out + len), snd_una)) { /* Are we sending an old segment to induce an ack (keep-alive)? */ return; } if (SEQ_LT(seq_out, snd_una)) { /* huh? should we panic? */ uint32_t end; end = seq_out + len; seq_out = snd_una; len = end - seq_out; } snd_max = tp->snd_max; if (th_flags & (TH_SYN | TH_FIN)) { /* * The call to rack_log_output is made before bumping * snd_max. This means we can record one extra byte on a SYN * or FIN if seq_out is adding more on and a FIN is present * (and we are not resending). */ if (th_flags & TH_SYN) len++; if (th_flags & TH_FIN) len++; if (SEQ_LT(snd_max, tp->snd_nxt)) { /* * The add/update has not been done for the FIN/SYN * yet. */ snd_max = tp->snd_nxt; } } if (len == 0) { /* We don't log zero window probes */ return; } rack->r_ctl.rc_time_last_sent = ts; if (IN_RECOVERY(tp->t_flags)) { rack->r_ctl.rc_prr_out += len; } /* First question is it a retransmission? */ if (seq_out == snd_max) { again: rsm = rack_alloc(rack); if (rsm == NULL) { /* * Hmm out of memory and the tcb got destroyed while * we tried to wait.
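* (Under INVARIANTS we panic below; production kernels simply skip * logging this send.)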
*/ #ifdef INVARIANTS panic("Out of memory when we should not be rack:%p", rack); #endif return; } if (th_flags & TH_FIN) { rsm->r_flags = RACK_HAS_FIN; } else { rsm->r_flags = 0; } rsm->r_tim_lastsent[0] = ts; rsm->r_rtr_cnt = 1; rsm->r_rtr_bytes = 0; if (th_flags & TH_SYN) { /* The data space is one beyond snd_una */ rsm->r_start = seq_out + 1; rsm->r_end = rsm->r_start + (len - 1); } else { /* Normal case */ rsm->r_start = seq_out; rsm->r_end = rsm->r_start + len; } rsm->r_sndcnt = 0; TAILQ_INSERT_TAIL(&rack->r_ctl.rc_map, rsm, r_next); TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 1; return; } /* * If we reach here it's a retransmission and we need to find it. */ more: if (hintrsm && (hintrsm->r_start == seq_out)) { rsm = hintrsm; hintrsm = NULL; } else if (rack->r_ctl.rc_next) { /* We have a hint from a previous run */ rsm = rack->r_ctl.rc_next; } else { /* No hints sorry */ rsm = NULL; } if ((rsm) && (rsm->r_start == seq_out)) { /* * We used rc_next or hintrsm to retransmit, hopefully the * likely case. */ seq_out = rack_update_entry(tp, rack, rsm, ts, &len); if (len == 0) { return; } else { goto more; } } /* Ok, it was not the last pointer; go through it the hard way. */ TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) { if (rsm->r_start == seq_out) { seq_out = rack_update_entry(tp, rack, rsm, ts, &len); rack->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next); if (len == 0) { return; } else { continue; } } if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) { /* Transmitted within this piece */ /* * Ok we must split off the front and then let the * update do the rest */ nrsm = rack_alloc(rack); if (nrsm == NULL) { #ifdef INVARIANTS panic("Ran out of memory that was preallocated? rack:%p", rack); #endif rack_update_rsm(tp, rack, rsm, ts); return; } /* * copy rsm to nrsm and then trim the front of rsm * to not include this part. */ nrsm->r_start = seq_out; nrsm->r_end = rsm->r_end; nrsm->r_rtr_cnt = rsm->r_rtr_cnt; nrsm->r_flags = rsm->r_flags; nrsm->r_sndcnt = rsm->r_sndcnt; nrsm->r_rtr_bytes = 0; for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; } rsm->r_end = nrsm->r_start; TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); if (rsm->r_in_tmap) { TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); nrsm->r_in_tmap = 1; } rsm->r_flags &= (~RACK_HAS_FIN); seq_out = rack_update_entry(tp, rack, nrsm, ts, &len); if (len == 0) { return; } } } /* * Hmm, not found in the map; did they retransmit both old and on * into the new? */ if (seq_out == tp->snd_max) { goto again; } else if (SEQ_LT(seq_out, tp->snd_max)) { #ifdef INVARIANTS printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n", seq_out, len, tp->snd_una, tp->snd_max); printf("Starting Dump of all rack entries\n"); TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) { printf("rsm:%p start:%u end:%u\n", rsm, rsm->r_start, rsm->r_end); } printf("Dump complete\n"); panic("seq_out not found rack:%p tp:%p", rack, tp); #endif } else { #ifdef INVARIANTS /* * Hmm beyond sndmax? (only if we are using the new rtt-pack * flag) */ panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p", seq_out, len, tp->snd_max, tp); #endif } } /* * Record one of the RTT updates from an ack into * our sample structure.
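* We track the lowest and highest RTT seen plus a running total and * count, so the commit step can apply whichever rc_rate_sample_method * (USE_RTT_LOW, USE_RTT_HIGH or USE_RTT_AVG) is configured.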
*/ static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt) { if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt)) { rack->r_ctl.rack_rs.rs_rtt_lowest = rtt; } if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || (rack->r_ctl.rack_rs.rs_rtt_highest < rtt)) { rack->r_ctl.rack_rs.rs_rtt_highest = rtt; } rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID; rack->r_ctl.rack_rs.rs_rtt_tot += rtt; rack->r_ctl.rack_rs.rs_rtt_cnt++; } /* * Collect new round-trip time estimate * and update averages and current timeout. */ static void tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp) { int32_t delta; uint32_t o_srtt, o_var; int32_t rtt; if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) /* No valid sample */ return; if (rack->r_ctl.rc_rate_sample_method == USE_RTT_LOW) { /* We are to use the lowest RTT seen in a single ack */ rtt = rack->r_ctl.rack_rs.rs_rtt_lowest; } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_HIGH) { /* We are to use the highest RTT seen in a single ack */ rtt = rack->r_ctl.rack_rs.rs_rtt_highest; } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_AVG) { /* We are to use the average RTT seen in a single ack */ rtt = (int32_t)(rack->r_ctl.rack_rs.rs_rtt_tot / (uint64_t)rack->r_ctl.rack_rs.rs_rtt_cnt); } else { #ifdef INVARIANTS panic("Unknown rtt variant %d", rack->r_ctl.rc_rate_sample_method); #endif return; } if (rtt == 0) rtt = 1; rack_log_rtt_sample(rack, rtt); o_srtt = tp->t_srtt; o_var = tp->t_rttvar; rack = (struct tcp_rack *)tp->t_fb_ptr; if (tp->t_srtt != 0) { /* * srtt is stored as fixed point with 5 bits after the * binary point (i.e., scaled by 32). The following magic is * equivalent to the smoothing algorithm in rfc793 with an * alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed point). * Adjust rtt to origin 0. */ delta = ((rtt - 1) << TCP_DELTA_SHIFT) - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); tp->t_srtt += delta; if (tp->t_srtt <= 0) tp->t_srtt = 1; /* * We accumulate a smoothed rtt variance (actually, a * smoothed mean difference), then set the retransmit timer * to smoothed rtt + 4 times the smoothed variance. rttvar * is stored as fixed point with 4 bits after the binary * point (scaled by 16). The following is equivalent to * rfc793 smoothing with an alpha of .75 (rttvar = * rttvar*3/4 + |delta| / 4). This replaces rfc793's * wired-in beta. */ if (delta < 0) delta = -delta; delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); tp->t_rttvar += delta; if (tp->t_rttvar <= 0) tp->t_rttvar = 1; if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } else { /* * No rtt measurement yet - use the unsmoothed rtt. Set the * variance to half the rtt (so our first retransmit happens * at 3*rtt). */ tp->t_srtt = rtt << TCP_RTT_SHIFT; tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } TCPSTAT_INC(tcps_rttupdated); rack_log_rtt_upd(tp, rack, rtt, o_srtt, o_var); tp->t_rttupdated++; #ifdef NETFLIX_STATS stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt)); #endif tp->t_rxtshift = 0; /* * the retransmit should happen at rtt + 4 * rttvar. Because of the * way we do the smoothing, srtt and rttvar will each average +1/2 * tick of bias. When we compute the retransmit timer, we want 1/2 * tick of rounding and 1 extra tick because of +-1/2 tick * uncertainty in the firing of the timer. The bias will give us * exactly the 1.5 tick we need.
But, because the bias is * statistical, we have to test that we don't drop below the minimum * feasible timer (which is 2 ticks). */ TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), max(MSEC_2_TICKS(rack_rto_min), rtt + 2), MSEC_2_TICKS(rack_rto_max)); tp->t_softerror = 0; } static void rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm, uint32_t t, uint32_t cts) { /* * For this RSM, we acknowledged the data from a previous * transmission, not the last one we made. This means we did a false * retransmit. */ struct tcp_rack *rack; if (rsm->r_flags & RACK_HAS_FIN) { /* * The FIN is often sent multiple times when we * have everything outstanding ack'd. We ignore this case * since it's over now. */ return; } if (rsm->r_flags & RACK_TLP) { /* * We expect TLP's to have this occur. */ return; } rack = (struct tcp_rack *)tp->t_fb_ptr; /* should we undo cc changes and exit recovery? */ if (IN_RECOVERY(tp->t_flags)) { if (rack->r_ctl.rc_rsm_start == rsm->r_start) { /* * Undo what we ratcheted down and exit recovery if * possible */ EXIT_RECOVERY(tp->t_flags); tp->snd_recover = tp->snd_una; if (rack->r_ctl.rc_cwnd_at > tp->snd_cwnd) tp->snd_cwnd = rack->r_ctl.rc_cwnd_at; if (rack->r_ctl.rc_ssthresh_at > tp->snd_ssthresh) tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at; } } if (rsm->r_flags & RACK_WAS_SACKPASS) { /* * We retransmitted based on a sack and the earlier * retransmission ack'd it - re-ordering is occurring. */ counter_u64_add(rack_reorder_seen, 1); rack->r_ctl.rc_reorder_ts = cts; } counter_u64_add(rack_badfr, 1); counter_u64_add(rack_badfr_bytes, (rsm->r_end - rsm->r_start)); } static int rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type) { int32_t i; uint32_t t; if (rsm->r_flags & RACK_ACKED) /* Already done */ return (0); if ((rsm->r_rtr_cnt == 1) || ((ack_type == CUM_ACKED) && (to->to_flags & TOF_TS) && (to->to_tsecr) && (rsm->r_tim_lastsent[rsm->r_rtr_cnt - 1] == to->to_tsecr)) ) { /* * We will only find a matching timestamp if it's cum-acked. * But if it's only one retransmission it's for-sure matching * :-) */ t = cts - rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; if ((int)t <= 0) t = 1; if (!tp->t_rttlow || tp->t_rttlow > t) tp->t_rttlow = t; if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { rack->r_ctl.rc_rack_min_rtt = t; if (rack->r_ctl.rc_rack_min_rtt == 0) { rack->r_ctl.rc_rack_min_rtt = 1; } } tcp_rack_xmit_timer(rack, TCP_TS_TO_TICKS(t) + 1); if ((rsm->r_flags & RACK_TLP) && (!IN_RECOVERY(tp->t_flags))) { /* Segment was a TLP and our retrans matched */ if (rack->r_ctl.rc_tlp_cwnd_reduce) { rack->r_ctl.rc_rsm_start = tp->snd_max; rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; rack_cong_signal(tp, NULL, CC_NDUPACK); /* * When we enter recovery we need to assure * we send one packet. */ rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; } else rack->r_ctl.rc_tlp_rtx_out = 0; } if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { /* New more recent rack_tmit_time */ rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; rack->rc_rack_rtt = t; } return (1); } /* * We clear the soft/rxtshift since we got an ack. * There is no assurance we will call the commit() function * so we need to clear these to avoid incorrect handling.
*/ tp->t_rxtshift = 0; tp->t_softerror = 0; if ((to->to_flags & TOF_TS) && (ack_type == CUM_ACKED) && (to->to_tsecr) && ((rsm->r_flags & (RACK_DEFERRED | RACK_OVERMAX)) == 0)) { /* * Now which timestamp does it match? In this block the ACK * must be coming from a previous transmission. */ for (i = 0; i < rsm->r_rtr_cnt; i++) { if (rsm->r_tim_lastsent[i] == to->to_tsecr) { t = cts - rsm->r_tim_lastsent[i]; if ((int)t <= 0) t = 1; if ((i + 1) < rsm->r_rtr_cnt) { /* Likely */ rack_earlier_retran(tp, rsm, t, cts); } if (!tp->t_rttlow || tp->t_rttlow > t) tp->t_rttlow = t; if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { rack->r_ctl.rc_rack_min_rtt = t; if (rack->r_ctl.rc_rack_min_rtt == 0) { rack->r_ctl.rc_rack_min_rtt = 1; } } /* * Note the following calls to * tcp_rack_xmit_timer() are being commented * out for now. They give us no more accuracy * and often lead to a wrong choice. We have * enough samples that have not been * retransmitted. I leave the commented out * code in here in case in the future we * decide to add it back (though I can't foresee * doing that). That way we will easily see * where they need to be placed. */ if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { /* New more recent rack_tmit_time */ rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; rack->rc_rack_rtt = t; } return (1); } } goto ts_not_found; } else { /* * Ok, it's a SACK block that we retransmitted, or a Windows * machine without timestamps. We can tell nothing from the * time-stamp since it's not there or the time the peer last * received a segment that moved forward its cum-ack point. */ ts_not_found: i = rsm->r_rtr_cnt - 1; t = cts - rsm->r_tim_lastsent[i]; if ((int)t <= 0) t = 1; if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { /* * We retransmitted and the ack came back in less * than the smallest rtt we have observed. We most * likely did an improper retransmit as outlined in * 4.2 Step 3 point 2 in the rack-draft. */ i = rsm->r_rtr_cnt - 2; t = cts - rsm->r_tim_lastsent[i]; rack_earlier_retran(tp, rsm, t, cts); } else if (rack->r_ctl.rc_rack_min_rtt) { /* * We retransmitted it and the retransmit did the * job. */ if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { rack->r_ctl.rc_rack_min_rtt = t; if (rack->r_ctl.rc_rack_min_rtt == 0) { rack->r_ctl.rc_rack_min_rtt = 1; } } if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[i])) { /* New more recent rack_tmit_time */ rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[i]; rack->rc_rack_rtt = t; } return (1); } } return (0); } /* * Mark the SACK_PASSED flag on all entries prior to rsm send wise. */ static void rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm) { struct rack_sendmap *nrsm; uint32_t ts; int32_t idx; idx = rsm->r_rtr_cnt - 1; ts = rsm->r_tim_lastsent[idx]; nrsm = rsm; TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap, rack_head, r_tnext) { if (nrsm == rsm) { /* Skip the original segment; it is acked */ continue; } if (nrsm->r_flags & RACK_ACKED) { /* Skip ack'd segments */ continue; } idx = nrsm->r_rtr_cnt - 1; if (ts == nrsm->r_tim_lastsent[idx]) { /* * For this case let's use the seq no; if we sent in a * big block (TSO) we would have a bunch of segments * sent at the same time. * * We would only get a report if its SEQ is earlier. * If we have done multiple retransmits the times * would not be equal.
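* E.g. (hypothetical numbers): a TSO send can leave entries [100, 1460), * [1460, 2920) and [2920, 4380) all stamped with the same r_tim_lastsent; if * the last of those is sacked, only the entries whose r_start is earlier may * be marked as passed, which is exactly the SEQ_LT() test applied below.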
*/ if (SEQ_LT(nrsm->r_start, rsm->r_start)) { nrsm->r_flags |= RACK_SACK_PASSED; nrsm->r_flags &= ~RACK_WAS_SACKPASS; } } else { /* * Here they were sent at different times, not a big * block. Since we transmitted this one later and * see it sack'd then this must also be missing (or * we would have gotten a sack block for it) */ nrsm->r_flags |= RACK_SACK_PASSED; nrsm->r_flags &= ~RACK_WAS_SACKPASS; } } } static uint32_t rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts) { int32_t idx; int32_t times = 0; uint32_t start, end, changed = 0; struct rack_sendmap *rsm, *nrsm; int32_t used_ref = 1; start = sack->start; end = sack->end; rsm = *prsm; if (rsm && SEQ_LT(start, rsm->r_start)) { TAILQ_FOREACH_REVERSE_FROM(rsm, &rack->r_ctl.rc_map, rack_head, r_next) { if (SEQ_GEQ(start, rsm->r_start) && SEQ_LT(start, rsm->r_end)) { goto do_rest_ofb; } } } if (rsm == NULL) { start_at_beginning: rsm = NULL; used_ref = 0; } /* First let's locate the block where this guy is */ TAILQ_FOREACH_FROM(rsm, &rack->r_ctl.rc_map, r_next) { if (SEQ_GEQ(start, rsm->r_start) && SEQ_LT(start, rsm->r_end)) { break; } } do_rest_ofb: if (rsm == NULL) { /* * This happens when we get duplicate sack blocks with the * same end. For example SACK 4: 100 SACK 3: 100 The sort * will not change their location so we would just start at * the end of the first one and get lost. */ if (tp->t_flags & TF_SENTFIN) { /* * Check to see if we have not logged the FIN that * went out. */ nrsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next); if (nrsm && (nrsm->r_end + 1) == tp->snd_max) { /* * Ok we did not get the FIN logged. */ nrsm->r_end++; rsm = nrsm; goto do_rest_ofb; } } if (times == 1) { #ifdef INVARIANTS panic("tp:%p rack:%p sack:%p to:%p prsm:%p", tp, rack, sack, to, prsm); #else goto out; #endif } times++; counter_u64_add(rack_sack_proc_restart, 1); goto start_at_beginning; } /* Ok we have an ACK for some piece of rsm */ if (rsm->r_start != start) { /* * Need to split this into two pieces: the before and after. */ nrsm = rack_alloc(rack); if (nrsm == NULL) { /* * failed XXXrrs what can we do but lose the sack * info? */ goto out; } nrsm->r_start = start; nrsm->r_rtr_bytes = 0; nrsm->r_end = rsm->r_end; nrsm->r_rtr_cnt = rsm->r_rtr_cnt; nrsm->r_flags = rsm->r_flags; nrsm->r_sndcnt = rsm->r_sndcnt; for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; } rsm->r_end = nrsm->r_start; TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); if (rsm->r_in_tmap) { TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); nrsm->r_in_tmap = 1; } rsm->r_flags &= (~RACK_HAS_FIN); rsm = nrsm; } if (SEQ_GEQ(end, rsm->r_end)) { /* * The end of this block is either beyond this guy or right * at this guy. */ if ((rsm->r_flags & RACK_ACKED) == 0) { rack_update_rtt(tp, rack, rsm, to, cts, SACKED); changed += (rsm->r_end - rsm->r_start); rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); rack_log_sack_passed(tp, rack, rsm); /* Is reordering occurring?
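* (An entry that already carries RACK_SACK_PASSED and is only now being * sacked means something sent after it was reported first; that is the * reordering signature the counter bump and rc_reorder_ts update below record.)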
*/ if (rsm->r_flags & RACK_SACK_PASSED) { counter_u64_add(rack_reorder_seen, 1); rack->r_ctl.rc_reorder_ts = cts; } rsm->r_flags |= RACK_ACKED; rsm->r_flags &= ~RACK_TLP; if (rsm->r_in_tmap) { TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 0; } } if (end == rsm->r_end) { /* This block only - done */ goto out; } /* There is more not covered by this rsm; move on */ start = rsm->r_end; nrsm = TAILQ_NEXT(rsm, r_next); rsm = nrsm; times = 0; goto do_rest_ofb; } /* Ok we need to split off this one at the tail */ nrsm = rack_alloc(rack); if (nrsm == NULL) { /* failed rrs what can we do but lose the sack info? */ goto out; } /* Clone it */ nrsm->r_start = end; nrsm->r_end = rsm->r_end; nrsm->r_rtr_bytes = 0; nrsm->r_rtr_cnt = rsm->r_rtr_cnt; nrsm->r_flags = rsm->r_flags; nrsm->r_sndcnt = rsm->r_sndcnt; for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; } /* The sack block does not cover this guy fully */ rsm->r_flags &= (~RACK_HAS_FIN); rsm->r_end = end; TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); if (rsm->r_in_tmap) { TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); nrsm->r_in_tmap = 1; } if (rsm->r_flags & RACK_ACKED) { /* Been here done that */ goto out; } rack_update_rtt(tp, rack, rsm, to, cts, SACKED); changed += (rsm->r_end - rsm->r_start); rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); rack_log_sack_passed(tp, rack, rsm); /* Is reordering occurring? */ if (rsm->r_flags & RACK_SACK_PASSED) { counter_u64_add(rack_reorder_seen, 1); rack->r_ctl.rc_reorder_ts = cts; } rsm->r_flags |= RACK_ACKED; rsm->r_flags &= ~RACK_TLP; if (rsm->r_in_tmap) { TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 0; } out: if (used_ref == 0) { counter_u64_add(rack_sack_proc_all, 1); } else { counter_u64_add(rack_sack_proc_short, 1); } /* Save off where we last were */ if (rsm) rack->r_ctl.rc_sacklast = TAILQ_NEXT(rsm, r_next); else rack->r_ctl.rc_sacklast = NULL; *prsm = rsm; return (changed); } static void inline rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack) { struct rack_sendmap *tmap; tmap = NULL; while (rsm && (rsm->r_flags & RACK_ACKED)) { /* It's no longer sacked; mark it so */ rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); #ifdef INVARIANTS if (rsm->r_in_tmap) { panic("rack:%p rsm:%p flags:0x%x in tmap?", rack, rsm, rsm->r_flags); } #endif rsm->r_flags &= ~(RACK_ACKED|RACK_SACK_PASSED|RACK_WAS_SACKPASS); /* Rebuild it into our tmap */ if (tmap == NULL) { TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); tmap = rsm; } else { TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, tmap, rsm, r_tnext); tmap = rsm; } tmap->r_in_tmap = 1; rsm = TAILQ_NEXT(rsm, r_next); } /* * Now let's possibly clear the sack filter so we start * recognizing sacks that cover this area.
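* (The filter normally suppresses SACK blocks it has already processed; after * a reneg those old blocks become legitimate input again, hence the reset up * to th_ack below whenever rack_use_sack_filter is enabled.)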
*/ if (rack_use_sack_filter) sack_filter_clear(&rack->r_ctl.rack_sf, th_ack); } static void rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) { uint32_t changed, last_seq, entered_recovery = 0; struct tcp_rack *rack; struct rack_sendmap *rsm; struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1]; register uint32_t th_ack; int32_t i, j, k, num_sack_blks = 0; uint32_t cts, acked, ack_point, sack_changed = 0; INP_WLOCK_ASSERT(tp->t_inpcb); if (th->th_flags & TH_RST) { /* We don't log resets */ return; } rack = (struct tcp_rack *)tp->t_fb_ptr; cts = tcp_ts_getticks(); rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); changed = 0; th_ack = th->th_ack; if (SEQ_GT(th_ack, tp->snd_una)) { rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__); tp->t_acktime = ticks; } if (rsm && SEQ_GT(th_ack, rsm->r_start)) changed = th_ack - rsm->r_start; if (changed) { /* * The ACK point is advancing to th_ack, we must drop off * the packets in the rack log and calculate any eligible * RTTs. */ rack->r_wanted_output++; more: rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); if (rsm == NULL) { if ((th_ack - 1) == tp->iss) { /* * For the SYN incoming case we will not * have called tcp_output for the sending of * the SYN, so there will be no map. All * other cases should probably be a panic. */ goto proc_sack; } if (tp->t_flags & TF_SENTFIN) { /* if we sent a FIN we will not have a map */ goto proc_sack; } #ifdef INVARIANTS panic("No rack map tp:%p for th:%p state:%d rack:%p snd_una:%u snd_max:%u snd_nxt:%u chg:%d\n", tp, th, tp->t_state, rack, tp->snd_una, tp->snd_max, tp->snd_nxt, changed); #endif goto proc_sack; } if (SEQ_LT(th_ack, rsm->r_start)) { /* Huh, the map is missing this */ #ifdef INVARIANTS printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n", rsm->r_start, th_ack, tp->t_state, rack->r_state); #endif goto proc_sack; } rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED); /* Now do we consume the whole thing? */ if (SEQ_GEQ(th_ack, rsm->r_end)) { /* It's all consumed. */ uint32_t left; rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; rsm->r_rtr_bytes = 0; TAILQ_REMOVE(&rack->r_ctl.rc_map, rsm, r_next); if (rsm->r_in_tmap) { TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 0; } if (rack->r_ctl.rc_next == rsm) { /* scoot along the marker */ rack->r_ctl.rc_next = TAILQ_FIRST(&rack->r_ctl.rc_map); } if (rsm->r_flags & RACK_ACKED) { /* * It was acked on the scoreboard -- remove * it from total */ rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); } else if (rsm->r_flags & RACK_SACK_PASSED) { /* * There are acked segments ACKED on the * scoreboard further up. We are seeing * reordering. */ counter_u64_add(rack_reorder_seen, 1); rsm->r_flags |= RACK_ACKED; rack->r_ctl.rc_reorder_ts = cts; } left = th_ack - rsm->r_end; if (rsm->r_rtr_cnt > 1) { /* * Technically we should make r_rtr_cnt be * monotonically increasing and just mod it to * the timestamp it is replacing... that way * we would have the last 3 retransmits. Now * rc_loss_count will be wrong if we * retransmit something more than 2 times in * recovery :( */ rack->r_ctl.rc_loss_count += (rsm->r_rtr_cnt - 1); } /* Free back to zone */ rack_free(rack, rsm); if (left) { goto more; } goto proc_sack; } if (rsm->r_flags & RACK_ACKED) { /* * It was acked on the scoreboard -- remove it from * total for the part being cum-acked.
*/ rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start); } rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; rsm->r_rtr_bytes = 0; rsm->r_start = th_ack; } proc_sack: /* Check for reneging */ rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) { /* * The peer has moved snd_una up to * the edge of this send, i.e. one * that it had previously acked. The only * way that can be true is if the peer threw * away data (space issues) that it had * previously sacked (else it would have * given us snd_una up to rsm->r_end). * We need to undo the acked markings here. * * Note we have to look to make sure th_ack is * our rsm->r_start in case we get an old ack * where th_ack is behind snd_una. */ rack_peer_reneges(rack, rsm, th->th_ack); } if ((to->to_flags & TOF_SACK) == 0) { /* We are done; nothing left to log */ goto out; } rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next); if (rsm) { last_seq = rsm->r_end; } else { last_seq = tp->snd_max; } /* Sack block processing */ if (SEQ_GT(th_ack, tp->snd_una)) ack_point = th_ack; else ack_point = tp->snd_una; for (i = 0; i < to->to_nsacks; i++) { bcopy((to->to_sacks + i * TCPOLEN_SACK), &sack, sizeof(sack)); sack.start = ntohl(sack.start); sack.end = ntohl(sack.end); if (SEQ_GT(sack.end, sack.start) && SEQ_GT(sack.start, ack_point) && SEQ_LT(sack.start, tp->snd_max) && SEQ_GT(sack.end, ack_point) && SEQ_LEQ(sack.end, tp->snd_max)) { if ((rack->r_ctl.rc_num_maps_alloced > rack_sack_block_limit) && (SEQ_LT(sack.end, last_seq)) && ((sack.end - sack.start) < (tp->t_maxseg / 8))) { /* * Not the last piece and it's smaller than * 1/8th of a MSS. We ignore this. */ counter_u64_add(rack_runt_sacks, 1); continue; } sack_blocks[num_sack_blks] = sack; num_sack_blks++; #ifdef NETFLIX_STATS } else if (SEQ_LEQ(sack.start, th_ack) && SEQ_LEQ(sack.end, th_ack)) { /* * It's a D-SACK block. */ tcp_record_dsack(sack.start, sack.end); #endif } } if (num_sack_blks == 0) goto out; /* * Sort the SACK blocks so we can update the rack scoreboard with * just one pass. */ if (rack_use_sack_filter) { num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks, num_sack_blks, th->th_ack); } if (num_sack_blks < 2) { goto do_sack_work; } /* Sort the sacks */ for (i = 0; i < num_sack_blks; i++) { for (j = i + 1; j < num_sack_blks; j++) { if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) { sack = sack_blocks[i]; sack_blocks[i] = sack_blocks[j]; sack_blocks[j] = sack; } } } /* * Now are any of the sack block ends the same (yes some * implementations send these)? */ again: if (num_sack_blks > 1) { for (i = 0; i < num_sack_blks; i++) { for (j = i + 1; j < num_sack_blks; j++) { if (sack_blocks[i].end == sack_blocks[j].end) { /* * Ok, these two have the same end; we * want the smallest end, and then * throw away the larger and start * again.
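* Worked example with invented blocks: after the sort, [150, 200) at slot i * and [100, 200) at slot j share the same end, so slot i's start is lowered * to 100, the remaining blocks are shifted down over slot j, num_sack_blks * drops by one, and the scan restarts from the top.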
*/ if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) { /* * The second block covers * more area; use that */ sack_blocks[i].start = sack_blocks[j].start; } /* * Now collapse out the dup-sack and * lower the count */ for (k = (j + 1); k < num_sack_blks; k++) { sack_blocks[j].start = sack_blocks[k].start; sack_blocks[j].end = sack_blocks[k].end; j++; } num_sack_blks--; goto again; } } } } do_sack_work: rsm = rack->r_ctl.rc_sacklast; for (i = 0; i < num_sack_blks; i++) { acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts); if (acked) { rack->r_wanted_output++; changed += acked; sack_changed += acked; } } out: if (changed) { /* Something changed; cancel the rack timer */ rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); } if ((sack_changed) && (!IN_RECOVERY(tp->t_flags))) { /* * Ok we have a high probability that we need to go into * recovery since we have data sack'd */ struct rack_sendmap *rsm; uint32_t tsused; tsused = tcp_ts_getticks(); rsm = tcp_rack_output(tp, rack, tsused); if (rsm) { /* Enter recovery */ rack->r_ctl.rc_rsm_start = rsm->r_start; rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; entered_recovery = 1; rack_cong_signal(tp, NULL, CC_NDUPACK); /* * When we enter recovery we need to assure we send * one packet. */ rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; rack->r_timer_override = 1; } } if (IN_RECOVERY(tp->t_flags) && (entered_recovery == 0)) { /* Deal with changed and PRR here (in recovery only) */ uint32_t pipe, snd_una; rack->r_ctl.rc_prr_delivered += changed; /* Compute prr_sndcnt */ if (SEQ_GT(tp->snd_una, th_ack)) { snd_una = tp->snd_una; } else { snd_una = th_ack; } pipe = ((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt; if (pipe > tp->snd_ssthresh) { long sndcnt; sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh; if (rack->r_ctl.rc_prr_recovery_fs > 0) sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs; else { rack->r_ctl.rc_prr_sndcnt = 0; sndcnt = 0; } sndcnt++; if (sndcnt > (long)rack->r_ctl.rc_prr_out) sndcnt -= rack->r_ctl.rc_prr_out; else sndcnt = 0; rack->r_ctl.rc_prr_sndcnt = sndcnt; } else { uint32_t limit; if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out) limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out); else limit = 0; if (changed > limit) limit = changed; limit += tp->t_maxseg; if (tp->snd_ssthresh > pipe) { rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit); } else { rack->r_ctl.rc_prr_sndcnt = min(0, limit); } } if (rack->r_ctl.rc_prr_sndcnt >= tp->t_maxseg) { rack->r_timer_override = 1; } } } /* * Return value of 1: we do not need to call rack_process_data(). * Return value of 0: rack_process_data() can be called. * For ret_val, if it's 0 the TCP is locked; if it's non-zero * it's unlocked and probably unsafe to touch the TCB.
*/ static int rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val) { int32_t ourfinisacked = 0; int32_t nsegs, acked_amount; int32_t acked; struct mbuf *mfree; struct tcp_rack *rack; int32_t recovery = 0; rack = (struct tcp_rack *)tp->t_fb_ptr; if (SEQ_GT(th->th_ack, tp->snd_max)) { rack_do_dropafterack(m, tp, th, thflags, tlen, ret_val); return (1); } if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) { rack_log_ack(tp, to, th); } if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { /* * Old ack, behind (or duplicate to) the last one rcv'd * Note: should mark that reordering is occurring! We should also * look for sack blocks arriving e.g. ack 1, 4-4 then ack 1, * 3-3, 4-4 would be reordering. As well as ack 1, 3-3 ack 3 */ return (0); } /* * If we reach this point, ACK is not a duplicate, i.e., it ACKs * something we sent. */ if (tp->t_flags & TF_NEEDSYN) { /* * T/TCP: Connection was half-synchronized, and our SYN has * been ACK'd (so connection is now fully synchronized). Go * to non-starred state, increment snd_una for ACK of SYN, * and check if we can do window scaling. */ tp->t_flags &= ~TF_NEEDSYN; tp->snd_una++; /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == (TF_RCVD_SCALE | TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; /* Send window already scaled. */ } } nsegs = max(1, m->m_pkthdr.lro_nsegs); INP_WLOCK_ASSERT(tp->t_inpcb); acked = BYTES_THIS_ACK(tp, th); TCPSTAT_ADD(tcps_rcvackpack, nsegs); TCPSTAT_ADD(tcps_rcvackbyte, acked); /* * If we just performed our first retransmit, and the ACK arrives * within our recovery window, then it was a mistake to do the * retransmit in the first place. Recover our original cwnd and * ssthresh, and proceed to transmit where we left off. */ if (tp->t_flags & TF_PREVVALID) { tp->t_flags &= ~TF_PREVVALID; if (tp->t_rxtshift == 1 && (int)(ticks - tp->t_badrxtwin) < 0) rack_cong_signal(tp, th, CC_RTO_ERR); } /* * If we have a timestamp reply, update smoothed round trip time. If * no timestamp is present but transmit timer is running and timed * sequence number was acked, update smoothed round trip time. Since * we now have an rtt measurement, cancel the timer backoff (cf., * Phil Karn's retransmit alg.). Recompute the initial retransmit * timer. * * Some boxes send broken timestamp replies during the SYN+ACK * phase, ignore timestamps of 0 or we could calculate a huge RTT * and blow up the retransmit timer. */ /* * If all outstanding data is acked, stop retransmit timer and * remember to restart (more output or persist). If there is more * data to be acked, restart retransmit timer, using current * (possibly backed-off) value. */ if (th->th_ack == tp->snd_max) { rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); rack->r_wanted_output++; } /* * If no data (only SYN) was ACK'd, skip rest of ACK processing. */ if (acked == 0) { if (ofia) *ofia = ourfinisacked; return (0); } if (rack->r_ctl.rc_early_recovery) { if (IN_FASTRECOVERY(tp->t_flags)) { if (SEQ_LT(th->th_ack, tp->snd_recover)) { tcp_rack_partialack(tp, th); } else { rack_post_recovery(tp, th); recovery = 1; } } } /* * Let the congestion control algorithm update congestion control * related information. This typically means increasing the * congestion window.
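* (Which algorithm runs depends on whatever cc module the connection uses; * with the stock NewReno-style modules, for example, cwnd typically grows by * roughly one t_maxseg per ack in slow start and by about one t_maxseg per * RTT in congestion avoidance. Those numbers describe the textbook behavior, * not something this file enforces.)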
*/ rack_ack_received(tp, rack, th, nsegs, CC_ACK, recovery); SOCKBUF_LOCK(&so->so_snd); acked_amount = min(acked, (int)sbavail(&so->so_snd)); tp->snd_wnd -= acked_amount; mfree = sbcut_locked(&so->so_snd, acked_amount); if ((sbused(&so->so_snd) == 0) && (acked > acked_amount) && (tp->t_state >= TCPS_FIN_WAIT_1)) { ourfinisacked = 1; } /* NB: sowwakeup_locked() does an implicit unlock. */ sowwakeup_locked(so); m_freem(mfree); if (rack->r_ctl.rc_early_recovery == 0) { if (IN_FASTRECOVERY(tp->t_flags)) { if (SEQ_LT(th->th_ack, tp->snd_recover)) { tcp_rack_partialack(tp, th); } else { rack_post_recovery(tp, th); } } } tp->snd_una = th->th_ack; if (SEQ_GT(tp->snd_una, tp->snd_recover)) tp->snd_recover = tp->snd_una; if (SEQ_LT(tp->snd_nxt, tp->snd_una)) { tp->snd_nxt = tp->snd_una; } if (tp->snd_una == tp->snd_max) { /* Nothing left outstanding */ rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); tp->t_acktime = 0; rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); /* Set need output so persist might get set */ rack->r_wanted_output++; if (rack_use_sack_filter) sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); if ((tp->t_state >= TCPS_FIN_WAIT_1) && (sbavail(&so->so_snd) == 0) && (tp->t_flags2 & TF2_DROP_AF_DATA)) { /* * The socket was gone and the * peer sent data, time to * reset him. */ *ret_val = 1; tp = tcp_close(tp); rack_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen); return (1); } } if (ofia) *ofia = ourfinisacked; return (0); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCP is still * locked. */ static int rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) { /* * Update window information. Don't look at window if no ACK: TAC's * send garbage on first SYN. */ int32_t nsegs; int32_t tfo_syn; struct tcp_rack *rack; rack = (struct tcp_rack *)tp->t_fb_ptr; INP_WLOCK_ASSERT(tp->t_inpcb); nsegs = max(1, m->m_pkthdr.lro_nsegs); if ((thflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { /* keep track of pure window updates */ if (tlen == 0 && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) TCPSTAT_INC(tcps_rcvwinupd); tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; tp->snd_wl2 = th->th_ack; if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; rack->r_wanted_output++; } else if (thflags & TH_ACK) { if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) { tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; tp->snd_wl2 = th->th_ack; } } /* Was persist timer active and now we have window space? */ if ((rack->rc_in_persist != 0) && tp->snd_wnd) { rack_exit_persist(tp, rack); tp->snd_nxt = tp->snd_max; /* Make sure we output to start the timer */ rack->r_wanted_output++; } if (tp->t_flags2 & TF2_DROP_AF_DATA) { m_freem(m); return (0); } /* * Process segments with URG. */ if ((thflags & TH_URG) && th->th_urp && TCPS_HAVERCVDFIN(tp->t_state) == 0) { /* * This is a kludge, but if we receive and accept random * urgent pointers, we'll crash in soreceive. It's hard to * imagine someone actually wanting to send this much urgent * data. 
*/ SOCKBUF_LOCK(&so->so_rcv); if (th->th_urp + sbavail(&so->so_rcv) > sb_max) { th->th_urp = 0; /* XXX */ thflags &= ~TH_URG; /* XXX */ SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ goto dodata; /* XXX */ } /* * If this segment advances the known urgent pointer, then * mark the data stream. This should not happen in * CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since a * FIN has been received from the remote side. In these * states we ignore the URG. * * According to RFC961 (Assigned Protocols), the urgent * pointer points to the last octet of urgent data. We * continue, however, to consider it to indicate the first * octet of data past the urgent section as the original * spec states (in one of two places). */ if (SEQ_GT(th->th_seq + th->th_urp, tp->rcv_up)) { tp->rcv_up = th->th_seq + th->th_urp; so->so_oobmark = sbavail(&so->so_rcv) + (tp->rcv_up - tp->rcv_nxt) - 1; if (so->so_oobmark == 0) so->so_rcv.sb_state |= SBS_RCVATMARK; sohasoutofband(so); tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); } SOCKBUF_UNLOCK(&so->so_rcv); /* * Remove out-of-band data so it doesn't get presented to the user. * This can happen independent of advancing the URG pointer, * but if two URG's are pending at once, some out-of-band * data may creep in... ick. */ if (th->th_urp <= (uint32_t) tlen && !(so->so_options & SO_OOBINLINE)) { /* hdr drop is delayed */ tcp_pulloutofband(so, th, m, drop_hdrlen); } } else { /* * If no out of band data is expected, pull receive urgent * pointer along with the receive window. */ if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) tp->rcv_up = tp->rcv_nxt; } dodata: /* XXX */ INP_WLOCK_ASSERT(tp->t_inpcb); /* * Process the segment text, merging it into the TCP sequencing * queue, and arranging for acknowledgment of receipt if necessary. * This process logically involves adjusting tp->rcv_wnd as data is * presented to the user (this happens in tcp_usrreq.c, case * PRU_RCVD). If a FIN has already been received on this connection * then we just ignore the text. */ tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) && IS_FASTOPEN(tp->t_flags)); if ((tlen || (thflags & TH_FIN) || tfo_syn) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { tcp_seq save_start = th->th_seq; m_adj(m, drop_hdrlen); /* delayed header drop */ /* * Insert segment which includes th into TCP reassembly * queue with control block tp. Set thflags to whether * reassembly now includes a segment with FIN. This handles * the common case inline (segment is the next to be * received on an established connection, and the queue is * empty), avoiding linkage into and removal from the queue * and repetition of various conversions. Set DELACK for * segments received in order, but ack immediately when * segments are out of order (so fast retransmit can work). */ if (th->th_seq == tp->rcv_nxt && SEGQ_EMPTY(tp) && (TCPS_HAVEESTABLISHED(tp->t_state) || tfo_syn)) { if (DELAY_ACK(tp, tlen) || tfo_syn) { rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); tp->t_flags |= TF_DELACK; } else { rack->r_wanted_output++; tp->t_flags |= TF_ACKNOW; } tp->rcv_nxt += tlen; thflags = th->th_flags & TH_FIN; TCPSTAT_ADD(tcps_rcvpack, nsegs); TCPSTAT_ADD(tcps_rcvbyte, tlen); SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) m_freem(m); else sbappendstream_locked(&so->so_rcv, m, 0); /* NB: sorwakeup_locked() does an implicit unlock. */ sorwakeup_locked(so); } else { /* * XXX: Due to the header drop above "th" is * theoretically invalid by now. Fortunately * m_adj() doesn't actually free any mbufs when * trimming from the head.
*/ thflags = tcp_reass(tp, th, &save_start, &tlen, m); tp->t_flags |= TF_ACKNOW; } if (tlen > 0) tcp_update_sack_list(tp, save_start, save_start + tlen); } else { m_freem(m); thflags &= ~TH_FIN; } /* * If FIN is received ACK the FIN and let the user know that the * connection is closing. */ if (thflags & TH_FIN) { if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { socantrcvmore(so); /* * If connection is half-synchronized (ie NEEDSYN * flag on) then delay ACK, so it may be piggybacked * when SYN is sent. Otherwise, since we received a * FIN then no more input can be expected, send ACK * now. */ if (tp->t_flags & TF_NEEDSYN) { rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); tp->t_flags |= TF_DELACK; } else { tp->t_flags |= TF_ACKNOW; } tp->rcv_nxt++; } switch (tp->t_state) { /* * In SYN_RECEIVED and ESTABLISHED STATES enter the * CLOSE_WAIT state. */ case TCPS_SYN_RECEIVED: tp->t_starttime = ticks; /* FALLTHROUGH */ case TCPS_ESTABLISHED: rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); tcp_state_change(tp, TCPS_CLOSE_WAIT); break; /* * If still in FIN_WAIT_1 STATE FIN has not been * acked so enter the CLOSING state. */ case TCPS_FIN_WAIT_1: rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); tcp_state_change(tp, TCPS_CLOSING); break; /* * In FIN_WAIT_2 state enter the TIME_WAIT state, * starting the time-wait timer, turning off the * other standard timers. */ case TCPS_FIN_WAIT_2: rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); INP_INFO_RLOCK_ASSERT(&V_tcbinfo); tcp_twstart(tp); return (1); } } /* * Return any desired output. */ if ((tp->t_flags & TF_ACKNOW) || (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) { rack->r_wanted_output++; } INP_WLOCK_ASSERT(tp->t_inpcb); return (0); } /* * Here nothing is really faster, its just that we * have broken out the fast-data path also just like * the fast-ack. */ static int rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t nxt_pkt) { int32_t nsegs; int32_t newsize = 0; /* automatic sockbuf scaling */ struct tcp_rack *rack; #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. */ u_char tcp_saveipgen[IP6_HDR_LEN]; struct tcphdr tcp_savetcp; short ostate = 0; #endif /* * If last ACK falls within this segment's sequence numbers, record * the timestamp. NOTE that the test is modified according to the * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). */ if (__predict_false(th->th_seq != tp->rcv_nxt)) { return (0); } if (__predict_false(tp->snd_nxt != tp->snd_max)) { return (0); } if (tiwin && tiwin != tp->snd_wnd) { return (0); } if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) { return (0); } if (__predict_false((to->to_flags & TOF_TS) && (TSTMP_LT(to->to_tsval, tp->ts_recent)))) { return (0); } if (__predict_false((th->th_ack != tp->snd_una))) { return (0); } if (__predict_false(tlen > sbspace(&so->so_rcv))) { return (0); } if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } rack = (struct tcp_rack *)tp->t_fb_ptr; /* * This is a pure, in-sequence data packet with nothing on the * reassembly queue and we have enough buffer space to take it. 
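* Every early return above rejected a violation of exactly that claim: * th_seq == rcv_nxt, snd_nxt == snd_max, an unchanged (or zero) window, no * NEEDSYN/NEEDFIN, a timestamp that is not older than ts_recent, th_ack == * snd_una, and enough sbspace() in so_rcv to hold tlen.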
*/ nsegs = max(1, m->m_pkthdr.lro_nsegs); /* Clean receiver SACK report if present */ if (tp->rcv_numsacks) tcp_clean_sackreport(tp); TCPSTAT_INC(tcps_preddat); tp->rcv_nxt += tlen; /* * Pull snd_wl1 up to prevent seq wrap relative to th_seq. */ tp->snd_wl1 = th->th_seq; /* * Pull rcv_up up to prevent seq wrap relative to rcv_nxt. */ tp->rcv_up = tp->rcv_nxt; TCPSTAT_ADD(tcps_rcvpack, nsegs); TCPSTAT_ADD(tcps_rcvbyte, tlen); #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif newsize = tcp_autorcvbuf(m, th, so, tp, tlen); /* Add data to socket buffer. */ SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { m_freem(m); } else { /* * Set new socket buffer size. Give up when limit is * reached. */ if (newsize) if (!sbreserve_locked(&so->so_rcv, newsize, so, NULL)) so->so_rcv.sb_flags &= ~SB_AUTOSIZE; m_adj(m, drop_hdrlen); /* delayed header drop */ sbappendstream_locked(&so->so_rcv, m, 0); rack_calc_rwin(so, tp); } /* NB: sorwakeup_locked() does an implicit unlock. */ sorwakeup_locked(so); if (DELAY_ACK(tp, tlen)) { rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); tp->t_flags |= TF_DELACK; } else { tp->t_flags |= TF_ACKNOW; rack->r_wanted_output++; } if ((tp->snd_una == tp->snd_max) && rack_use_sack_filter) sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); return (1); } /* * This subfunction is used to try to highly optimize the * fast path. We again allow window updates that are * in sequence to remain in the fast-path. We also add * in the __predict's to attempt to help the compiler. * Note that if we return a 0, then we can *not* process * it and the caller should push the packet into the * slow-path. */ static int rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t nxt_pkt, uint32_t cts) { int32_t acked; int32_t nsegs; #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. */ u_char tcp_saveipgen[IP6_HDR_LEN]; struct tcphdr tcp_savetcp; short ostate = 0; #endif struct tcp_rack *rack; if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { /* Old ack, behind (or duplicate to) the last one rcv'd */ return (0); } if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) { /* Above what we have sent? */ return (0); } if (__predict_false(tp->snd_nxt != tp->snd_max)) { /* We are retransmitting */ return (0); } if (__predict_false(tiwin == 0)) { /* zero window */ return (0); } if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) { /* We need a SYN or a FIN, unlikely.. */ return (0); } if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) { /* Timestamp is behind .. old ack with seq wrap? */ return (0); } if (__predict_false(IN_RECOVERY(tp->t_flags))) { /* Still recovering */ return (0); } rack = (struct tcp_rack *)tp->t_fb_ptr; if (rack->r_ctl.rc_sacked) { /* We have sack holes on our scoreboard */ return (0); } /* Ok if we reach here, we can process a fast-ack */ nsegs = max(1, m->m_pkthdr.lro_nsegs); rack_log_ack(tp, to, th); /* Did the window get updated? */ if (tiwin != tp->snd_wnd) { tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; } if ((rack->rc_in_persist != 0) && (tp->snd_wnd >= tp->t_maxseg)) { rack_exit_persist(tp, rack); } /* * If last ACK falls within this segment's sequence numbers, record * the timestamp. 
NOTE that the test is modified according to the * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). */ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } /* * This is a pure ack for outstanding data. */ TCPSTAT_INC(tcps_predack); /* * "bad retransmit" recovery. */ if (tp->t_flags & TF_PREVVALID) { tp->t_flags &= ~TF_PREVVALID; if (tp->t_rxtshift == 1 && (int)(ticks - tp->t_badrxtwin) < 0) rack_cong_signal(tp, th, CC_RTO_ERR); } /* * Recalculate the transmit timer / rtt. * * Some boxes send broken timestamp replies during the SYN+ACK * phase, ignore timestamps of 0 or we could calculate a huge RTT * and blow up the retransmit timer. */ acked = BYTES_THIS_ACK(tp, th); #ifdef TCP_HHOOK /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ hhook_run_tcp_est_in(tp, th, to); #endif TCPSTAT_ADD(tcps_rcvackpack, nsegs); TCPSTAT_ADD(tcps_rcvackbyte, acked); sbdrop(&so->so_snd, acked); /* * Let the congestion control algorithm update congestion control * related information. This typically means increasing the * congestion window. */ rack_ack_received(tp, rack, th, nsegs, CC_ACK, 0); tp->snd_una = th->th_ack; /* * Pull snd_wl2 up to prevent seq wrap relative to th_ack. */ tp->snd_wl2 = th->th_ack; tp->t_dupacks = 0; m_freem(m); /* ND6_HINT(tp); *//* Some progress has been made. */ /* * If all outstanding data are acked, stop retransmit timer, * otherwise restart timer using current (possibly backed-off) * value. If process is waiting for space, wakeup/selwakeup/signal. * If data are ready to send, let tcp_output decide between more * output or persist. */ #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif if (tp->snd_una == tp->snd_max) { rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); tp->t_acktime = 0; rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); } /* Wake up the socket if we have room to write more */ sowwakeup(so); if (sbavail(&so->so_snd)) { rack->r_wanted_output++; } return (1); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCP is still * locked. */ static int rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) { int32_t ret_val = 0; int32_t todrop; int32_t ourfinisacked = 0; rack_calc_rwin(so, tp); /* * If the state is SYN_SENT: if seg contains an ACK, but not for our * SYN, drop the input. if seg contains a RST, then drop the * connection. if seg does not contain SYN, then drop it. Otherwise * this is an acceptable SYN segment initialize tp->rcv_nxt and * tp->irs if seg contains ack then advance tp->snd_una if seg * contains an ECE and ECN support is enabled, the stream is ECN * capable. 
if SYN has been acked change to ESTABLISHED else * SYN_RCVD state arrange for segment to be acked (eventually) * continue processing rest of data/controls, beginning with URG */ if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) { TCP_PROBE5(connect__refused, NULL, tp, mtod(m, const char *), tp, th); tp = tcp_drop(tp, ECONNREFUSED); rack_do_drop(m, tp); return (1); } if (thflags & TH_RST) { rack_do_drop(m, tp); return (1); } if (!(thflags & TH_SYN)) { rack_do_drop(m, tp); return (1); } tp->irs = th->th_seq; tcp_rcvseqinit(tp); if (thflags & TH_ACK) { int tfo_partial = 0; TCPSTAT_INC(tcps_connects); soisconnected(so); #ifdef MAC mac_socketpeer_set_from_mbuf(m, so); #endif /* Do window scaling on this connection? */ if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == (TF_RCVD_SCALE | TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; } tp->rcv_adv += min(tp->rcv_wnd, TCP_MAXWIN << tp->rcv_scale); /* * If not all the data that was sent in the TFO SYN * has been acked, resend the remainder right away. */ if (IS_FASTOPEN(tp->t_flags) && (tp->snd_una != tp->snd_max)) { tp->snd_nxt = th->th_ack; tfo_partial = 1; } /* * If there's data, delay ACK; if there's also a FIN ACKNOW * will be turned on later. */ if (DELAY_ACK(tp, tlen) && tlen != 0 && (tfo_partial == 0)) { rack_timer_cancel(tp, (struct tcp_rack *)tp->t_fb_ptr, ((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rcvtime, __LINE__); tp->t_flags |= TF_DELACK; } else { ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++; tp->t_flags |= TF_ACKNOW; } if (((thflags & (TH_CWR | TH_ECE)) == TH_ECE) && V_tcp_do_ecn) { tp->t_flags |= TF_ECN_PERMIT; TCPSTAT_INC(tcps_ecn_shs); } if (SEQ_GT(th->th_ack, tp->snd_una)) { /* * We advance snd_una for the * fast open case. If th_ack is * acknowledging data beyond * snd_una we can't just call * ack-processing since the * data stream in our send-map * will start at snd_una + 1 (one * beyond the SYN). If its just * equal we don't need to do that * and there is no send_map. */ tp->snd_una++; } /* * Received in SYN_SENT[*] state. Transitions: * SYN_SENT --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tcp_state_change(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; thflags &= ~TH_SYN; } else { tcp_state_change(tp, TCPS_ESTABLISHED); TCP_PROBE5(connect__established, NULL, tp, mtod(m, const char *), tp, th); cc_conn_init(tp); } } else { /* * Received initial SYN in SYN-SENT[*] state => simultaneous * open. If segment contains CC option and there is a * cached CC, apply TAO test. If it succeeds, connection is * * half-synchronized. Otherwise, do 3-way handshake: * SYN-SENT -> SYN-RECEIVED SYN-SENT* -> SYN-RECEIVED* If * there was no CC option, clear cached CC value. */ tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); tcp_state_change(tp, TCPS_SYN_RECEIVED); } INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); /* * Advance th->th_seq to correspond to first data byte. If data, * trim to stay within window, dropping FIN if necessary. */ th->th_seq++; if (tlen > tp->rcv_wnd) { todrop = tlen - tp->rcv_wnd; m_adj(m, -todrop); tlen = tp->rcv_wnd; thflags &= ~TH_FIN; TCPSTAT_INC(tcps_rcvpackafterwin); TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); } tp->snd_wl1 = th->th_seq - 1; tp->rcv_up = th->th_seq; /* * Client side of transaction: already sent SYN and data. 
If the * remote host used T/TCP to validate the SYN, our data will be * ACK'd; if so, enter normal data segment processing in the middle * of step 5, ack processing. Otherwise, goto step 6. */ if (thflags & TH_ACK) { if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) return (ret_val); /* We may have changed to FIN_WAIT_1 above */ if (tp->t_state == TCPS_FIN_WAIT_1) { /* * In FIN_WAIT_1 STATE in addition to the processing * for the ESTABLISHED state if our FIN is now * acknowledged then enter FIN_WAIT_2. */ if (ourfinisacked) { /* * If we can't receive any more data, then * closing user can proceed. Starting the * timer is contrary to the specification, * but if we don't get a FIN we'll hang * forever. * * XXXjl: we should release the tp also, and * use a compressed state. */ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { soisdisconnected(so); tcp_timer_activate(tp, TT_2MSL, (tcp_fast_finwait2_recycle ? tcp_finwait2_timeout : TP_MAXIDLE(tp))); } tcp_state_change(tp, TCPS_FIN_WAIT_2); } } } return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCP is still * locked. */ static int rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) { int32_t ret_val = 0; int32_t ourfinisacked = 0; rack_calc_rwin(so, tp); if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))) { rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } if (IS_FASTOPEN(tp->t_flags)) { /* * When a TFO connection is in SYN_RECEIVED, the * only valid packets are the initial SYN, a * retransmit/copy of the initial SYN (possibly with * a subset of the original data), a valid ACK, a * FIN, or a RST. */ if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } else if (thflags & TH_SYN) { /* non-initial SYN is ignored */ struct tcp_rack *rack; rack = (struct tcp_rack *)tp->t_fb_ptr; if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) || (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) || (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) { rack_do_drop(m, NULL); return (0); } } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) { rack_do_drop(m, NULL); return (0); } } if (thflags & TH_RST) return (rack_process_rst(m, th, so, tp)); /* * RFC 1323 PAWS: If we have a timestamp reply on this segment and * it's less than ts_recent, drop it. */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } /* * In the SYN-RECEIVED state, validate that the packet belongs to * this connection before trimming the data to fit the receive * window. Check the sequence number versus IRS since we know the * sequence numbers haven't wrapped. This is a partial fix for the * "LAND" DoS attack. */ if (SEQ_LT(th->th_seq, tp->irs)) { rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* * If last ACK falls within this segment's sequence numbers, record * its timestamp. NOTE: 1) That the test incorporates suggestions * from the latest proposal of the tcplw@cray.com list (Braden * 1993/04/26). 
2) That updating only on newer timestamps interferes * with our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. 3) That we * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + * SEG.Len, This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated Vol. 2 * p.869. In such cases, we can still calculate the RTT correctly * when RCV.NXT == Last.ACK.Sent. */ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } tp->snd_wnd = tiwin; /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag * is on (half-synchronized state), then queue data for later * processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (IS_FASTOPEN(tp->t_flags)) { cc_conn_init(tp); } return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } TCPSTAT_INC(tcps_connects); soisconnected(so); /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == (TF_RCVD_SCALE | TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; } /* * Make transitions: SYN-RECEIVED -> ESTABLISHED SYN-RECEIVED* -> * FIN-WAIT-1 */ tp->t_starttime = ticks; if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) { tcp_fastopen_decrement_counter(tp->t_tfo_pending); tp->t_tfo_pending = NULL; /* * Account for the ACK of our SYN prior to * regular ACK processing below. */ tp->snd_una++; } if (tp->t_flags & TF_NEEDFIN) { tcp_state_change(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; } else { tcp_state_change(tp, TCPS_ESTABLISHED); TCP_PROBE5(accept__established, NULL, tp, mtod(m, const char *), tp, th); /* * TFO connections call cc_conn_init() during SYN * processing. Calling it again here for such connections * is not harmless as it would undo the snd_cwnd reduction * that occurs when a TFO SYN|ACK is retransmitted. */ if (!IS_FASTOPEN(tp->t_flags)) cc_conn_init(tp); } /* * If segment contains data or ACK, will call tcp_reass() later; if * not, do so now to pass queued data to user. */ if (tlen == 0 && (thflags & TH_FIN) == 0) (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0, (struct mbuf *)0); tp->snd_wl1 = th->th_seq - 1; if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { return (ret_val); } if (tp->t_state == TCPS_FIN_WAIT_1) { /* We could have gone to FIN_WAIT_1 (or EST) above */ /* * In FIN_WAIT_1 STATE in addition to the processing for the * ESTABLISHED state if our FIN is now acknowledged then * enter FIN_WAIT_2. */ if (ourfinisacked) { /* * If we can't receive any more data, then closing * user can proceed. Starting the timer is contrary * to the specification, but if we don't get a FIN * we'll hang forever. * * XXXjl: we should release the tp also, and use a * compressed state. */ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { soisdisconnected(so); tcp_timer_activate(tp, TT_2MSL, (tcp_fast_finwait2_recycle ? tcp_finwait2_timeout : TP_MAXIDLE(tp))); } tcp_state_change(tp, TCPS_FIN_WAIT_2); } } return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCP is still * locked.
*/ static int rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) { int32_t ret_val = 0; /* * Header prediction: check for the two common cases of a * uni-directional data xfer. If the packet has no control flags, * is in-sequence, the window didn't change and we're not * retransmitting, it's a candidate. If the length is zero and the * ack moved forward, we're the sender side of the xfer. Just free * the data acked & wake any higher level process that was blocked * waiting for space. If the length is non-zero and the ack didn't * move, we're the receiver side. If we're getting packets in-order * (the reassembly queue is empty), add the data to the socket * buffer and note that we need a delayed ack. Make sure that the * hidden state-flags are also off. Since we check for * TCPS_ESTABLISHED first, it can only be TF_NEEDSYN. */ if (__predict_true(((to->to_flags & TOF_SACK) == 0)) && __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_URG | TH_ACK)) == TH_ACK) && __predict_true(SEGQ_EMPTY(tp)) && __predict_true(th->th_seq == tp->rcv_nxt)) { struct tcp_rack *rack; rack = (struct tcp_rack *)tp->t_fb_ptr; if (tlen == 0) { if (rack_fastack(m, th, so, tp, to, drop_hdrlen, tlen, tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime)) { return (0); } } else { if (rack_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen, tiwin, nxt_pkt)) { return (0); } } } rack_calc_rwin(so, tp); if (thflags & TH_RST) return (rack_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. */ if (thflags & TH_SYN) { rack_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment and * it's less than ts_recent, drop it. */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* * If last ACK falls within this segment's sequence numbers, record * its timestamp. NOTE: 1) That the test incorporates suggestions * from the latest proposal of the tcplw@cray.com list (Braden * 1993/04/26). 2) That updating only on newer timestamps interferes * with our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. 3) That we * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + * SEG.Len, This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated Vol. 2 * p.869. In such cases, we can still calculate the RTT correctly * when RCV.NXT == Last.ACK.Sent. */ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag * is on (half-synchronized state), then queue data for later * processing; else drop segment and return.
*/ if ((thflags & TH_ACK) == 0) { if (tp->t_flags & TF_NEEDSYN) { return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); return (ret_val); } else { rack_do_drop(m, NULL); return (0); } } /* * Ack processing. */ if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { return (ret_val); } if (sbavail(&so->so_snd)) { if (rack_progress_timeout_check(tp)) { tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } /* State changes only happen in rack_process_data() */ return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCP is still * locked. */ static int rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) { int32_t ret_val = 0; rack_calc_rwin(so, tp); if (thflags & TH_RST) return (rack_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. */ if (thflags & TH_SYN) { rack_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment and * it's less than ts_recent, drop it. */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* * If last ACK falls within this segment's sequence numbers, record * its timestamp. NOTE: 1) That the test incorporates suggestions * from the latest proposal of the tcplw@cray.com list (Braden * 1993/04/26). 2) That updating only on newer timestamps interferes * with our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. 3) That we * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + * SEG.Len, This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated Vol. 2 * p.869. In such cases, we can still calculate the RTT correctly * when RCV.NXT == Last.ACK.Sent. */ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag * is on (half-synchronized state), then queue data for later * processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_flags & TF_NEEDSYN) { return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); return (ret_val); } else { rack_do_drop(m, NULL); return (0); } } /* * Ack processing. 
*/ if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { return (ret_val); } if (sbavail(&so->so_snd)) { if (rack_progress_timeout_check(tp)) { tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } static int rack_check_data_after_close(struct mbuf *m, struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so) { struct tcp_rack *rack; INP_INFO_RLOCK_ASSERT(&V_tcbinfo); rack = (struct tcp_rack *)tp->t_fb_ptr; if (rack->rc_allow_data_af_clo == 0) { close_now: tp = tcp_close(tp); TCPSTAT_INC(tcps_rcvafterclose); rack_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen)); return (1); } if (sbavail(&so->so_snd) == 0) goto close_now; /* Ok we allow data that is ignored and a followup reset */ tp->rcv_nxt = th->th_seq + *tlen; tp->t_flags2 |= TF2_DROP_AF_DATA; rack->r_wanted_output = 1; *tlen = 0; return (0); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCP is still * locked. */ static int rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) { int32_t ret_val = 0; int32_t ourfinisacked = 0; rack_calc_rwin(so, tp); if (thflags & TH_RST) return (rack_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. */ if (thflags & TH_SYN) { rack_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment and * it's less than ts_recent, drop it. */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* * If new data are received on a connection after the user processes * are gone, then RST the other end. */ if ((so->so_state & SS_NOFDREF) && tlen) { if (rack_check_data_after_close(m, tp, &tlen, th, so)) return (1); } /* * If last ACK falls within this segment's sequence numbers, record * its timestamp. NOTE: 1) That the test incorporates suggestions * from the latest proposal of the tcplw@cray.com list (Braden * 1993/04/26). 2) That updating only on newer timestamps interferes * with our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. 3) That we * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + * SEG.Len, This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated Vol. 2 * p.869. In such cases, we can still calculate the RTT correctly * when RCV.NXT == Last.ACK.Sent. */ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag * is on (half-synchronized state), then queue data for later * processing; else drop segment and return. 
*/ if ((thflags & TH_ACK) == 0) { if (tp->t_flags & TF_NEEDSYN) { return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); return (ret_val); } else { rack_do_drop(m, NULL); return (0); } } /* * Ack processing. */ if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { return (ret_val); } if (ourfinisacked) { /* * If we can't receive any more data, then closing user can * proceed. Starting the timer is contrary to the * specification, but if we don't get a FIN we'll hang * forever. * * XXXjl: we should release the tp also, and use a * compressed state. */ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { soisdisconnected(so); tcp_timer_activate(tp, TT_2MSL, (tcp_fast_finwait2_recycle ? tcp_finwait2_timeout : TP_MAXIDLE(tp))); } tcp_state_change(tp, TCPS_FIN_WAIT_2); } if (sbavail(&so->so_snd)) { if (rack_progress_timeout_check(tp)) { tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCP is still * locked. */ static int rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) { int32_t ret_val = 0; int32_t ourfinisacked = 0; rack_calc_rwin(so, tp); if (thflags & TH_RST) return (rack_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. */ if (thflags & TH_SYN) { rack_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment and * it's less than ts_recent, drop it. */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* * If new data are received on a connection after the user processes * are gone, then RST the other end. */ if ((so->so_state & SS_NOFDREF) && tlen) { if (rack_check_data_after_close(m, tp, &tlen, th, so)) return (1); } /* * If last ACK falls within this segment's sequence numbers, record * its timestamp. NOTE: 1) That the test incorporates suggestions * from the latest proposal of the tcplw@cray.com list (Braden * 1993/04/26). 2) That updating only on newer timestamps interferes * with our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. 3) That we * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + * SEG.Len, This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated Vol. 2 * p.869. In such cases, we can still calculate the RTT correctly * when RCV.NXT == Last.ACK.Sent. 
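* Worked example (hypothetical numbers): with last_ack_sent = 1000, a * segment with th_seq = 995 and tlen = 5 satisfies 995 <= 1000 and * 1000 <= 995 + 5, so ts_recent is refreshed from it even though the * segment starts before RCV.NXT.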
*/ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag * is on (half-synchronized state), then queue data for later * processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_flags & TF_NEEDSYN) { return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); return (ret_val); } else { rack_do_drop(m, NULL); return (0); } } /* * Ack processing. */ if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { return (ret_val); } if (ourfinisacked) { INP_INFO_RLOCK_ASSERT(&V_tcbinfo); tcp_twstart(tp); m_freem(m); return (1); } if (sbavail(&so->so_snd)) { if (rack_progress_timeout_check(tp)) { tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCP is still * locked. */ static int rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) { int32_t ret_val = 0; int32_t ourfinisacked = 0; rack_calc_rwin(so, tp); if (thflags & TH_RST) return (rack_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. */ if (thflags & TH_SYN) { rack_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment and * it's less than ts_recent, drop it. */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* * If new data are received on a connection after the user processes * are gone, then RST the other end. */ if ((so->so_state & SS_NOFDREF) && tlen) { if (rack_check_data_after_close(m, tp, &tlen, th, so)) return (1); } /* * If last ACK falls within this segment's sequence numbers, record * its timestamp. NOTE: 1) That the test incorporates suggestions * from the latest proposal of the tcplw@cray.com list (Braden * 1993/04/26). 2) That updating only on newer timestamps interferes * with our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. 3) That we * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + * SEG.Len, This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated Vol. 2 * p.869. In such cases, we can still calculate the RTT correctly * when RCV.NXT == Last.ACK.Sent. 
*/ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag * is on (half-synchronized state), then queue data for later * processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_flags & TF_NEEDSYN) { return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); return (ret_val); } else { rack_do_drop(m, NULL); return (0); } } /* * case TCPS_LAST_ACK: Ack processing. */ if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { return (ret_val); } if (ourfinisacked) { INP_INFO_RLOCK_ASSERT(&V_tcbinfo); tp = tcp_close(tp); rack_do_drop(m, tp); return (1); } if (sbavail(&so->so_snd)) { if (rack_progress_timeout_check(tp)) { tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCP is still * locked. */ static int rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) { int32_t ret_val = 0; int32_t ourfinisacked = 0; rack_calc_rwin(so, tp); /* Reset receive buffer auto scaling when not in bulk receive mode. */ if (thflags & TH_RST) return (rack_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. */ if (thflags & TH_SYN) { rack_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment and * it's less than ts_recent, drop it. */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* * If new data are received on a connection after the user processes * are gone, then RST the other end. */ if ((so->so_state & SS_NOFDREF) && tlen) { if (rack_check_data_after_close(m, tp, &tlen, th, so)) return (1); } /* * If last ACK falls within this segment's sequence numbers, record * its timestamp. NOTE: 1) That the test incorporates suggestions * from the latest proposal of the tcplw@cray.com list (Braden * 1993/04/26). 2) That updating only on newer timestamps interferes * with our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. 3) That we * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + * SEG.Len, This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated Vol. 2 * p.869. In such cases, we can still calculate the RTT correctly * when RCV.NXT == Last.ACK.Sent. 
*/ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag * is on (half-synchronized state), then queue data for later * processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_flags & TF_NEEDSYN) { return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); return (ret_val); } else { rack_do_drop(m, NULL); return (0); } } /* * Ack processing. */ if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { return (ret_val); } if (sbavail(&so->so_snd)) { if (rack_progress_timeout_check(tp)) { tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } static void inline rack_clear_rate_sample(struct tcp_rack *rack) { rack->r_ctl.rack_rs.rs_flags = RACK_RTT_EMPTY; rack->r_ctl.rack_rs.rs_rtt_cnt = 0; rack->r_ctl.rack_rs.rs_rtt_tot = 0; } static int rack_init(struct tcpcb *tp) { struct tcp_rack *rack = NULL; tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT); if (tp->t_fb_ptr == NULL) { /* * We need to allocate memory but can't. The INP and INP_INFO * locks are held and they are recursive (this happens during * setup), so a scheme to drop the locks fails. :( */ return (ENOMEM); } memset(tp->t_fb_ptr, 0, sizeof(struct tcp_rack)); rack = (struct tcp_rack *)tp->t_fb_ptr; TAILQ_INIT(&rack->r_ctl.rc_map); TAILQ_INIT(&rack->r_ctl.rc_free); TAILQ_INIT(&rack->r_ctl.rc_tmap); rack->rc_tp = tp; if (tp->t_inpcb) { rack->rc_inp = tp->t_inpcb; } /* Probably not needed but let's be sure */ rack_clear_rate_sample(rack); rack->r_cpu = 0; rack->r_ctl.rc_reorder_fade = rack_reorder_fade; rack->rc_allow_data_af_clo = rack_ignore_data_after_close; rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh; rack->rc_pace_reduce = rack_slot_reduction; if (V_tcp_delack_enabled) tp->t_delayed_ack = 1; else tp->t_delayed_ack = 0; rack->rc_pace_max_segs = rack_hptsi_segments; rack->r_ctl.rc_early_recovery_segs = rack_early_recovery_max_seg; rack->r_ctl.rc_reorder_shift = rack_reorder_thresh; rack->r_ctl.rc_pkt_delay = rack_pkt_delay; rack->r_ctl.rc_prop_reduce = rack_use_proportional_reduce; rack->r_idle_reduce_largest = rack_reduce_largest_on_idle; rack->r_enforce_min_pace = rack_min_pace_time; rack->r_min_pace_seg_thresh = rack_min_pace_time_seg_req; rack->r_ctl.rc_prop_rate = rack_proportional_rate; rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp; rack->r_ctl.rc_early_recovery = rack_early_recovery; rack->rc_always_pace = rack_pace_every_seg; rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method; rack->rack_tlp_threshold_use = rack_tlp_threshold_use; rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr; rack->r_ctl.rc_min_to = rack_min_to; rack->r_ctl.rc_prr_inc_var = rack_inc_var; rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); if (tp->snd_una != tp->snd_max) { /* Create a send map for the current outstanding data */ struct rack_sendmap *rsm; rsm = rack_alloc(rack); if (rsm == NULL) { uma_zfree(rack_pcb_zone, tp->t_fb_ptr); tp->t_fb_ptr = NULL; return (ENOMEM); } rsm->r_flags = RACK_OVERMAX; rsm->r_tim_lastsent[0] =
tcp_ts_getticks(); rsm->r_rtr_cnt = 1; rsm->r_rtr_bytes = 0; rsm->r_start = tp->snd_una; rsm->r_end = tp->snd_max; rsm->r_sndcnt = 0; TAILQ_INSERT_TAIL(&rack->r_ctl.rc_map, rsm, r_next); TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 1; } return (0); } static int rack_handoff_ok(struct tcpcb *tp) { if ((tp->t_state == TCPS_CLOSED) || (tp->t_state == TCPS_LISTEN)) { /* Sure no problem though it may not stick */ return (0); } if ((tp->t_state == TCPS_SYN_SENT) || (tp->t_state == TCPS_SYN_RECEIVED)) { /* * We really don't know; you have to get to ESTAB or beyond * to tell. */ return (EAGAIN); } if (tp->t_flags & TF_SACK_PERMIT) { return (0); } /* * If we reach here we don't do SACK on this connection so we can * never do rack. */ return (EINVAL); } static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged) { if (tp->t_fb_ptr) { struct tcp_rack *rack; struct rack_sendmap *rsm; rack = (struct tcp_rack *)tp->t_fb_ptr; #ifdef TCP_BLACKBOX tcp_log_flowend(tp); #endif rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); while (rsm) { TAILQ_REMOVE(&rack->r_ctl.rc_map, rsm, r_next); uma_zfree(rack_zone, rsm); rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); } rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); while (rsm) { TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_next); uma_zfree(rack_zone, rsm); rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); } rack->rc_free_cnt = 0; uma_zfree(rack_pcb_zone, tp->t_fb_ptr); tp->t_fb_ptr = NULL; } } static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack) { switch (tp->t_state) { case TCPS_SYN_SENT: rack->r_state = TCPS_SYN_SENT; rack->r_substate = rack_do_syn_sent; break; case TCPS_SYN_RECEIVED: rack->r_state = TCPS_SYN_RECEIVED; rack->r_substate = rack_do_syn_recv; break; case TCPS_ESTABLISHED: rack->r_state = TCPS_ESTABLISHED; rack->r_substate = rack_do_established; break; case TCPS_CLOSE_WAIT: rack->r_state = TCPS_CLOSE_WAIT; rack->r_substate = rack_do_close_wait; break; case TCPS_FIN_WAIT_1: rack->r_state = TCPS_FIN_WAIT_1; rack->r_substate = rack_do_fin_wait_1; break; case TCPS_CLOSING: rack->r_state = TCPS_CLOSING; rack->r_substate = rack_do_closing; break; case TCPS_LAST_ACK: rack->r_state = TCPS_LAST_ACK; rack->r_substate = rack_do_lastack; break; case TCPS_FIN_WAIT_2: rack->r_state = TCPS_FIN_WAIT_2; rack->r_substate = rack_do_fin_wait_2; break; case TCPS_LISTEN: case TCPS_CLOSED: case TCPS_TIME_WAIT: default: #ifdef INVARIANTS panic("tcp tp:%p state:%d sees impossible state?", tp, tp->t_state); #endif break; }; } static void rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb) { /* * We received an ack, and then did not * call send or were bounced out because the * hpts was running. Now a timer is up as well; is * it the right timer? */ struct rack_sendmap *rsm; int tmr_up; tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT)) return; rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) && (tmr_up == PACE_TMR_RXT)) { /* Should be an RXT */ return; } if (rsm == NULL) { /* Nothing outstanding? */ if (tp->t_flags & TF_DELACK) { if (tmr_up == PACE_TMR_DELACK) /* We are supposed to have delayed ack up and we do */ return; } else if (sbavail(&tp->t_inpcb->inp_socket->so_snd) && (tmr_up == PACE_TMR_RXT)) { /* * if we hit enobufs then we would expect the possibility * of nothing outstanding and the RXT up (and the hptsi timer).
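* For instance (hypothetical scenario), a send that failed with * ENOBUFS can leave data queued in the socket buffer with nothing in * flight; the retransmit timer is then the expected way to get going * again, so an armed RXT is accepted as correct here.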
*/ return; } else if (((tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && (tp->t_state <= TCPS_CLOSING)) && (tmr_up == PACE_TMR_KEEP) && (tp->snd_max == tp->snd_una)) { /* We should have keep alive up and we do */ return; } } if (rsm && (rsm->r_flags & RACK_SACK_PASSED)) { if ((tp->t_flags & TF_SENTFIN) && ((tp->snd_max - tp->snd_una) == 1) && (rsm->r_flags & RACK_HAS_FIN)) { /* needs to be a RXT */ if (tmr_up == PACE_TMR_RXT) return; } else if (tmr_up == PACE_TMR_RACK) return; } else if (SEQ_GT(tp->snd_max,tp->snd_una) && ((tmr_up == PACE_TMR_TLP) || (tmr_up == PACE_TMR_RXT))) { /* * Either a TLP or RXT is fine if no sack-passed * is in place and data is outstanding. */ return; } else if (tmr_up == PACE_TMR_DELACK) { /* * If the delayed ack was going to go off * before the rtx/tlp/rack timer were going to * expire, then that would be the timer in control. * Note we don't check the time here trusting the * code is correct. */ return; } /* * Ok the timer originally started is not what we want now. * We will force the hpts to be stopped if any, and restart * with the slot set to what was in the saved slot. */ rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); } static void rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos, int32_t nxt_pkt, struct timeval *tv) { int32_t thflags, retval, did_out = 0; int32_t way_out = 0; uint32_t cts; uint32_t tiwin; struct tcpopt to; struct tcp_rack *rack; struct rack_sendmap *rsm; int32_t prev_state = 0; cts = tcp_tv_to_mssectick(tv); rack = (struct tcp_rack *)tp->t_fb_ptr; kern_prefetch(rack, &prev_state); prev_state = 0; thflags = th->th_flags; /* * If this is either a state-changing packet or current state isn't * established, we require a read lock on tcbinfo. Otherwise, we * allow the tcbinfo to be in either locked or unlocked, as the * caller may have unnecessarily acquired a lock due to a race. */ if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || tp->t_state != TCPS_ESTABLISHED) { INP_INFO_RLOCK_ASSERT(&V_tcbinfo); } INP_WLOCK_ASSERT(tp->t_inpcb); KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", __func__)); KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", __func__)); { union tcp_log_stackspecific log; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0, tlen, &log, true); } if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) { way_out = 4; goto done_with_input; } /* * If a segment with the ACK-bit set arrives in the SYN-SENT state * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9. */ if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return; } /* * Segment received on connection. Reset idle time and keep-alive * timer. XXX: This should be done after segment validation to * ignore broken/spoofed segs. 
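* For illustration (hypothetical numbers): with t_rxtcur = 300 ticks * and the connection idle for ticks - t_rcvtime = 500 ticks with * nothing outstanding (snd_max == snd_una), the t_idle_reduce check * below calls rack_cc_after_idle() to shrink the congestion window * before the next burst.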
*/ if (tp->t_idle_reduce && (tp->snd_max == tp->snd_una)) { #ifdef NETFLIX_CWV if ((tp->cwv_enabled) && ((tp->cwv_cwnd_valid == 0) && TCPS_HAVEESTABLISHED(tp->t_state) && (tp->snd_cwnd > tp->snd_cwv.init_cwnd))) { tcp_newcwv_nvp_closedown(tp); } else #endif if ((ticks - tp->t_rcvtime) >= tp->t_rxtcur) { counter_u64_add(rack_input_idle_reduces, 1); rack_cc_after_idle(tp, (rack->r_idle_reduce_largest ? 1 :0)); } } rack->r_ctl.rc_rcvtime = cts; tp->t_rcvtime = ticks; #ifdef NETFLIX_CWV if (tp->cwv_enabled) { if ((tp->cwv_cwnd_valid == 0) && TCPS_HAVEESTABLISHED(tp->t_state) && (tp->snd_cwnd > tp->snd_cwv.init_cwnd)) tcp_newcwv_nvp_closedown(tp); } #endif /* * Unscale the window into a 32-bit value. For the SYN_SENT state * the scale is zero. */ tiwin = th->th_win << tp->snd_scale; #ifdef NETFLIX_STATS stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin); #endif /* * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move * this to occur after we've validated the segment. */ if (tp->t_flags & TF_ECN_PERMIT) { if (thflags & TH_CWR) tp->t_flags &= ~TF_ECN_SND_ECE; switch (iptos & IPTOS_ECN_MASK) { case IPTOS_ECN_CE: tp->t_flags |= TF_ECN_SND_ECE; TCPSTAT_INC(tcps_ecn_ce); break; case IPTOS_ECN_ECT0: TCPSTAT_INC(tcps_ecn_ect0); break; case IPTOS_ECN_ECT1: TCPSTAT_INC(tcps_ecn_ect1); break; } /* Congestion experienced. */ if (thflags & TH_ECE) { rack_cong_signal(tp, th, CC_ECN); } } /* * Parse options on any incoming segment. */ tcp_dooptions(&to, (u_char *)(th + 1), (th->th_off << 2) - sizeof(struct tcphdr), (thflags & TH_SYN) ? TO_SYN : 0); /* * If echoed timestamp is later than the current time, fall back to * non RFC1323 RTT calculation. Normalize timestamp if syncookies * were used when this connection was established. */ if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { to.to_tsecr -= tp->ts_offset; if (TSTMP_GT(to.to_tsecr, cts)) to.to_tsecr = 0; } /* * If it's the first time in, we need to take care of options and * verify we can do SACK for rack! */ if (rack->r_state == 0) { /* Should be init'd by rack_init() */ KASSERT(rack->rc_inp != NULL, ("%s: rack->rc_inp unexpectedly NULL", __func__)); if (rack->rc_inp == NULL) { rack->rc_inp = tp->t_inpcb; } /* * Process options only when we get SYN/ACK back. The SYN * case for incoming connections is handled in tcp_syncache. * According to RFC1323 the window field in a SYN (i.e., a <SYN> * or <SYN,ACK>) segment itself is never scaled. XXX * this is traditional behavior, may need to be cleaned up. */ rack->r_cpu = inp_to_cpuid(tp->t_inpcb); if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { if ((to.to_flags & TOF_SCALE) && (tp->t_flags & TF_REQ_SCALE)) { tp->t_flags |= TF_RCVD_SCALE; tp->snd_scale = to.to_wscale; } /* * Initial send window. It will be updated with the * next incoming segment to the scaled value. */ tp->snd_wnd = th->th_win; if (to.to_flags & TOF_TS) { tp->t_flags |= TF_RCVD_TSTMP; tp->ts_recent = to.to_tsval; tp->ts_recent_age = cts; } if (to.to_flags & TOF_MSS) tcp_mss(tp, to.to_mss); if ((tp->t_flags & TF_SACK_PERMIT) && (to.to_flags & TOF_SACKPERM) == 0) tp->t_flags &= ~TF_SACK_PERMIT; if (IS_FASTOPEN(tp->t_flags)) { if (to.to_flags & TOF_FASTOPEN) { uint16_t mss; if (to.to_flags & TOF_MSS) mss = to.to_mss; else if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) mss = TCP6_MSS; else mss = TCP_MSS; tcp_fastopen_update_cache(tp, mss, to.to_tfo_len, to.to_tfo_cookie); } else tcp_fastopen_disable_path(tp); } } /* * At this point we are at the initial call. Here we decide * if we are doing RACK or not.
We do this by seeing if * TF_SACK_PERMIT is set, if not rack is *not* possible and * we switch to the default code. */ if ((tp->t_flags & TF_SACK_PERMIT) == 0) { tcp_switch_back_to_default(tp); (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen, tlen, iptos); return; } /* Set the flag */ rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; tcp_set_hpts(tp->t_inpcb); rack_stop_all_timers(tp); sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack); } /* * This is the one exception case where we set the rack state * always. All other times (timers etc) we must have a rack-state * set (so we assure we have done the checks above for SACK). */ if (rack->r_state != tp->t_state) rack_set_state(tp, rack); if (SEQ_GT(th->th_ack, tp->snd_una) && (rsm = TAILQ_FIRST(&rack->r_ctl.rc_map)) != NULL) kern_prefetch(rsm, &prev_state); prev_state = rack->r_state; rack->r_ctl.rc_tlp_send_cnt = 0; rack_clear_rate_sample(rack); retval = (*rack->r_substate) (m, th, so, tp, &to, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt); #ifdef INVARIANTS if ((retval == 0) && (tp->t_inpcb == NULL)) { panic("retval:%d tp:%p t_inpcb:NULL state:%d", retval, tp, prev_state); } #endif if (retval == 0) { /* * If retval is 1 the tcb is unlocked and most likely the tp * is gone. */ INP_WLOCK_ASSERT(tp->t_inpcb); tcp_rack_xmit_timer_commit(rack, tp); if (((tp->snd_max - tp->snd_una) > tp->snd_wnd) && (rack->rc_in_persist == 0)){ /* * The peer shrunk its window on us to the point * where we have sent too much. The only thing * we can do here is stop any timers and * enter persist. We most likely lost the last * bytes we sent but oh well, we will have to * retransmit them after the peer is caught up. */ if (rack->rc_inp->inp_in_hpts) tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); rack_timer_cancel(tp, rack, cts, __LINE__); rack_enter_persist(tp, rack, cts); rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); way_out = 3; goto done_with_input; } if (nxt_pkt == 0) { if (rack->r_wanted_output != 0) { did_out = 1; (void)tp->t_fb->tfb_tcp_output(tp); } rack_start_hpts_timer(rack, tp, cts, __LINE__, 0, 0, 0); } if (((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) && (SEQ_GT(tp->snd_max, tp->snd_una) || (tp->t_flags & TF_DELACK) || ((tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && (tp->t_state <= TCPS_CLOSING)))) { /* We could not send (probably in the hpts but stopped the timer earlier)? */ if ((tp->snd_max == tp->snd_una) && ((tp->t_flags & TF_DELACK) == 0) && (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { /* keep alive not needed if we are hptsi output yet */ ; } else { if (rack->rc_inp->inp_in_hpts) tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); } way_out = 1; } else { /* Do we have the correct timer running? 
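* rack_timer_audit() below cross-checks the armed timer against the * connection state: persist while in persist, RXT when nothing is * outstanding (or before ESTABLISHED), delayed ACK when TF_DELACK is * set, keepalive when idle, RACK when a sack-passed rsm exists, and * TLP/RXT when data is simply outstanding; any mismatch is cancelled * and re-armed.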
*/ rack_timer_audit(tp, rack, &so->so_snd); way_out = 2; } done_with_input: rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out); if (did_out) rack->r_wanted_output = 0; #ifdef INVARIANTS if (tp->t_inpcb == NULL) { panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d", did_out, retval, tp, prev_state); } #endif INP_WUNLOCK(tp->t_inpcb); } } void rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos) { struct timeval tv; #ifdef RSS struct tcp_function_block *tfb; struct tcp_rack *rack; struct epoch_tracker et; rack = (struct tcp_rack *)tp->t_fb_ptr; if (rack->r_state == 0) { /* * Initial input (ACK to SYN-ACK etc), let's go ahead and get * it processed */ INP_INFO_RLOCK_ET(&V_tcbinfo, et); tcp_get_usecs(&tv); rack_hpts_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, 0, &tv); INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); return; } tcp_queue_to_input(tp, m, th, tlen, drop_hdrlen, iptos); INP_WUNLOCK(tp->t_inpcb); #else tcp_get_usecs(&tv); rack_hpts_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, 0, &tv); #endif } struct rack_sendmap * tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused) { struct rack_sendmap *rsm = NULL; int32_t idx; uint32_t srtt_cur, srtt = 0, thresh = 0, ts_low = 0; /* Return the next guy to be re-transmitted */ if (TAILQ_EMPTY(&rack->r_ctl.rc_map)) { return (NULL); } if (tp->t_flags & TF_SENTFIN) { /* retran the end FIN? */ return (NULL); } /* ok, let's look at this one */ rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) { goto check_it; } rsm = rack_find_lowest_rsm(rack); if (rsm == NULL) { return (NULL); } check_it: srtt_cur = tp->t_srtt >> TCP_RTT_SHIFT; srtt = TICKS_2_MSEC(srtt_cur); if (rack->rc_rack_rtt && (srtt > rack->rc_rack_rtt)) srtt = rack->rc_rack_rtt; if (rsm->r_flags & RACK_ACKED) { return (NULL); } if ((rsm->r_flags & RACK_SACK_PASSED) == 0) { /* It's not yet ready */ return (NULL); } idx = rsm->r_rtr_cnt - 1; ts_low = rsm->r_tim_lastsent[idx]; thresh = rack_calc_thresh_rack(rack, srtt, tsused); if (tsused <= ts_low) { return (NULL); } if ((tsused - ts_low) >= thresh) { return (rsm); } return (NULL); } static int rack_output(struct tcpcb *tp) { struct socket *so; uint32_t recwin, sendwin; uint32_t sb_offset; int32_t len, flags, error = 0; struct mbuf *m; struct mbuf *mb; uint32_t if_hw_tsomaxsegcount = 0; uint32_t if_hw_tsomaxsegsize; long tot_len_this_send = 0; struct ip *ip = NULL; #ifdef TCPDEBUG struct ipovly *ipov = NULL; #endif struct udphdr *udp = NULL; struct tcp_rack *rack; struct tcphdr *th; uint8_t pass = 0; uint8_t wanted_cookie = 0; u_char opt[TCP_MAXOLEN]; unsigned ipoptlen, optlen, hdrlen, ulen=0; uint32_t rack_seq; #if defined(IPSEC) || defined(IPSEC_SUPPORT) unsigned ipsec_optlen = 0; #endif int32_t idle, sendalot; int32_t sub_from_prr = 0; volatile int32_t sack_rxmit; struct rack_sendmap *rsm = NULL; int32_t tso, mtu, would_have_fin = 0; struct tcpopt to; int32_t slot = 0; uint32_t cts; uint8_t hpts_calling, doing_tlp = 0; int32_t do_a_prefetch; int32_t prefetch_rsm = 0; int32_t prefetch_so_done = 0; struct tcp_log_buffer *lgb = NULL; struct inpcb *inp; struct sockbuf *sb; #ifdef INET6 struct ip6_hdr *ip6 = NULL; int32_t isipv6; #endif /* setup and take the cache hits here */ rack = (struct tcp_rack *)tp->t_fb_ptr; inp = rack->rc_inp; so = inp->inp_socket; sb = &so->so_snd; kern_prefetch(sb, &do_a_prefetch); do_a_prefetch = 1; INP_WLOCK_ASSERT(inp); #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) return
(tcp_offload_output(tp)); #endif #ifdef INET6 if (rack->r_state) { /* Use the cache line loaded if possible */ isipv6 = rack->r_is_v6; } else { isipv6 = (inp->inp_vflag & INP_IPV6) != 0; } #endif cts = tcp_ts_getticks(); if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) && inp->inp_in_hpts) { /* * We are on the hpts for some timer but not hptsi output. * Remove from the hpts unconditionally. */ rack_timer_cancel(tp, rack, cts, __LINE__); } /* Mark that we have called rack_output(). */ if ((rack->r_timer_override) || (tp->t_flags & TF_FORCEDATA) || (tp->t_state < TCPS_ESTABLISHED)) { if (tp->t_inpcb->inp_in_hpts) tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT); } else if (tp->t_inpcb->inp_in_hpts) { /* * On the hpts you can't pass even if ACKNOW is on, we will * when the hpts fires. */ counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1); return (0); } hpts_calling = inp->inp_hpts_calls; inp->inp_hpts_calls = 0; if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { if (rack_process_timers(tp, rack, cts, hpts_calling)) { counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1); return (0); } } rack->r_wanted_output = 0; rack->r_timer_override = 0; /* * For TFO connections in SYN_SENT or SYN_RECEIVED, * only allow the initial SYN or SYN|ACK and those sent * by the retransmit timer. */ if (IS_FASTOPEN(tp->t_flags) && ((tp->t_state == TCPS_SYN_RECEIVED) || (tp->t_state == TCPS_SYN_SENT)) && SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */ (tp->t_rxtshift == 0)) /* not a retransmit */ return (0); /* * Determine length of data that should be transmitted, and flags * that will be used. If there is some data or critical controls * (SYN, RST) to send, then transmit; otherwise, investigate * further. */ idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); #ifdef NETFLIX_CWV if (tp->cwv_enabled) { if ((tp->cwv_cwnd_valid == 0) && TCPS_HAVEESTABLISHED(tp->t_state) && (tp->snd_cwnd > tp->snd_cwv.init_cwnd)) tcp_newcwv_nvp_closedown(tp); } else #endif if (tp->t_idle_reduce) { if (idle && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) rack_cc_after_idle(tp, (rack->r_idle_reduce_largest ? 1 :0)); } tp->t_flags &= ~TF_LASTIDLE; if (idle) { if (tp->t_flags & TF_MORETOCOME) { tp->t_flags |= TF_LASTIDLE; idle = 0; } } again: /* * If we've recently taken a timeout, snd_max will be greater than * snd_nxt. There may be SACK information that allows us to avoid * resending already delivered data. Adjust snd_nxt accordingly. */ sendalot = 0; cts = tcp_ts_getticks(); tso = 0; mtu = 0; sb_offset = tp->snd_max - tp->snd_una; sendwin = min(tp->snd_wnd, tp->snd_cwnd); flags = tcp_outflags[tp->t_state]; /* * Send any SACK-generated retransmissions. If we're explicitly * trying to send out new data (when sendalot is 1), bypass this * function. If we retransmit in fast recovery mode, decrement * snd_cwnd, since we're replacing a (future) new transmission with * a retransmission now, and we previously incremented snd_cwnd in * tcp_input(). */ /* * Still in sack recovery , reset rxmit flag to zero. 
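* Before any of that, the loop below tops up the rc_free cache so a * later rack_sendmap split cannot fail mid-send; e.g. with the * (hypothetical) setting rack_free_cache = 2, up to two spare entries * are pre-allocated per pass, and if allocation fails the send is * deferred, retrying in a millisecond when called from the hpts.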
*/ while (rack->rc_free_cnt < rack_free_cache) { rsm = rack_alloc(rack); if (rsm == NULL) { if (inp->inp_hpts_calls) /* Retry in a ms */ slot = 1; goto just_return_nolock; } TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_next); rack->rc_free_cnt++; rsm = NULL; } if (inp->inp_hpts_calls) inp->inp_hpts_calls = 0; sack_rxmit = 0; len = 0; rsm = NULL; if (flags & TH_RST) { SOCKBUF_LOCK(sb); goto send; } if (rack->r_ctl.rc_tlpsend) { /* Tail loss probe */ long cwin; long tlen; doing_tlp = 1; rsm = rack->r_ctl.rc_tlpsend; rack->r_ctl.rc_tlpsend = NULL; sack_rxmit = 1; tlen = rsm->r_end - rsm->r_start; if (tlen > tp->t_maxseg) tlen = tp->t_maxseg; KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", __func__, __LINE__, rsm->r_start, tp->snd_una, tp, rack, rsm)); sb_offset = rsm->r_start - tp->snd_una; cwin = min(tp->snd_wnd, tlen); len = cwin; } else if (rack->r_ctl.rc_resend) { /* Retransmit timer */ rsm = rack->r_ctl.rc_resend; rack->r_ctl.rc_resend = NULL; len = rsm->r_end - rsm->r_start; sack_rxmit = 1; sendalot = 0; KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", __func__, __LINE__, rsm->r_start, tp->snd_una, tp, rack, rsm)); sb_offset = rsm->r_start - tp->snd_una; if (len >= tp->t_maxseg) { len = tp->t_maxseg; } } else if ((rack->rc_in_persist == 0) && ((rsm = tcp_rack_output(tp, rack, cts)) != NULL)) { long tlen; if ((!IN_RECOVERY(tp->t_flags)) && ((tp->t_flags & (TF_WASFRECOVERY | TF_WASCRECOVERY)) == 0)) { /* Enter recovery if not induced by a time-out */ rack->r_ctl.rc_rsm_start = rsm->r_start; rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; rack_cong_signal(tp, NULL, CC_NDUPACK); /* * When we enter recovery we need to assure we send * one packet. */ rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; } #ifdef INVARIANTS if (SEQ_LT(rsm->r_start, tp->snd_una)) { panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n", tp, rack, rsm, rsm->r_start, tp->snd_una); } #endif tlen = rsm->r_end - rsm->r_start; KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", __func__, __LINE__, rsm->r_start, tp->snd_una, tp, rack, rsm)); sb_offset = rsm->r_start - tp->snd_una; if (tlen > rack->r_ctl.rc_prr_sndcnt) { len = rack->r_ctl.rc_prr_sndcnt; } else { len = tlen; } if (len >= tp->t_maxseg) { sendalot = 1; len = tp->t_maxseg; } else { sendalot = 0; if ((rack->rc_timer_up == 0) && (len < tlen)) { /* * If its not a timer don't send a partial * segment. */ len = 0; goto just_return_nolock; } } if (len > 0) { sub_from_prr = 1; sack_rxmit = 1; TCPSTAT_INC(tcps_sack_rexmits); TCPSTAT_ADD(tcps_sack_rexmit_bytes, min(len, tp->t_maxseg)); counter_u64_add(rack_rtm_prr_retran, 1); } } if (rsm && (rsm->r_flags & RACK_HAS_FIN)) { /* we are retransmitting the fin */ len--; if (len) { /* * When retransmitting data do *not* include the * FIN. This could happen from a TLP probe. */ flags &= ~TH_FIN; } } #ifdef INVARIANTS /* For debugging */ rack->r_ctl.rc_rsm_at_retran = rsm; #endif /* * Get standard flags, and add SYN or FIN if requested by 'hidden' * state flags. */ if (tp->t_flags & TF_NEEDFIN) flags |= TH_FIN; if (tp->t_flags & TF_NEEDSYN) flags |= TH_SYN; if ((sack_rxmit == 0) && (prefetch_rsm == 0)) { void *end_rsm; end_rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); if (end_rsm) kern_prefetch(end_rsm, &prefetch_rsm); prefetch_rsm = 1; } SOCKBUF_LOCK(sb); /* * If in persist timeout with window of 0, send 1 byte. 
Otherwise, * if window is small but nonzero and time TF_SENTFIN expired, we * will send what we can and go to transmit state. */ if (tp->t_flags & TF_FORCEDATA) { if (sendwin == 0) { /* * If we still have some data to send, then clear * the FIN bit. Usually this would happen below * when it realizes that we aren't sending all the * data. However, if we have exactly 1 byte of * unsent data, then it won't clear the FIN bit * below, and if we are in persist state, we wind up * sending the packet without recording that we sent * the FIN bit. * * We can't just blindly clear the FIN bit, because * if we don't have any more data to send then the * probe will be the FIN itself. */ if (sb_offset < sbused(sb)) flags &= ~TH_FIN; sendwin = 1; } else { if (rack->rc_in_persist) rack_exit_persist(tp, rack); /* * If we are dropping persist mode then we need to * correct snd_nxt/snd_max and off. */ tp->snd_nxt = tp->snd_max; sb_offset = tp->snd_nxt - tp->snd_una; } } /* * If snd_nxt == snd_max and we have transmitted a FIN, the * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a * negative length. This can also occur when TCP opens up its * congestion window while receiving additional duplicate acks after * fast-retransmit because TCP will reset snd_nxt to snd_max after * the fast-retransmit. * * In the normal retransmit-FIN-only case, however, snd_nxt will be * set to snd_una, the sb_offset will be 0, and the length may wind * up 0. * * If sack_rxmit is true we are retransmitting from the scoreboard * in which case len is already set. */ if (sack_rxmit == 0) { uint32_t avail; avail = sbavail(sb); if (SEQ_GT(tp->snd_nxt, tp->snd_una) && avail) sb_offset = tp->snd_nxt - tp->snd_una; else sb_offset = 0; if (IN_RECOVERY(tp->t_flags) == 0) { if (rack->r_ctl.rc_tlp_new_data) { /* TLP is forcing out new data */ if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) { rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset); } if (rack->r_ctl.rc_tlp_new_data > tp->snd_wnd) len = tp->snd_wnd; else len = rack->r_ctl.rc_tlp_new_data; rack->r_ctl.rc_tlp_new_data = 0; doing_tlp = 1; } else { if (sendwin > avail) { /* use the available */ if (avail > sb_offset) { len = (int32_t)(avail - sb_offset); } else { len = 0; } } else { if (sendwin > sb_offset) { len = (int32_t)(sendwin - sb_offset); } else { len = 0; } } } } else { uint32_t outstanding; /* * We are inside of a SACK recovery episode and are * sending new data, having retransmitted all the * data possible so far in the scoreboard. */ outstanding = tp->snd_max - tp->snd_una; if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) len = 0; else if (avail > sb_offset) len = avail - sb_offset; else len = 0; if (len > 0) { if (len > rack->r_ctl.rc_prr_sndcnt) len = rack->r_ctl.rc_prr_sndcnt; if (len > 0) { sub_from_prr = 1; counter_u64_add(rack_rtm_prr_newdata, 1); } } if (len > tp->t_maxseg) { /* * We should never send more than a MSS when * retransmitting or sending new data in prr * mode unless the override flag is on. Most * likely the PRR algorithm is not going to * let us send a lot as well :-) */ if (rack->r_ctl.rc_prr_sendalot == 0) len = tp->t_maxseg; } else if (len < tp->t_maxseg) { /* * Do we send any? The idea here is if the * send empty's the socket buffer we want to * do it. However if not then lets just wait * for our prr_sndcnt to get bigger. 
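* Worked example (hypothetical numbers): with sbavail(sb) = 10000, * sb_offset = 7000 and rc_prr_sndcnt = 1000, leftinsb = 3000 > 1000, * so this partial send would still leave data behind and len is * zeroed until PRR grants at least a full segment or the send can * empty the buffer.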
*/ long leftinsb; leftinsb = sbavail(sb) - sb_offset; if (leftinsb > len) { /* This send does not empty the sb */ len = 0; } } } } if (prefetch_so_done == 0) { kern_prefetch(so, &prefetch_so_done); prefetch_so_done = 1; } /* * Lop off SYN bit if it has already been sent. However, if this is * SYN-SENT state and if segment contains data and if we don't know * that foreign host supports TAO, suppress sending segment. */ if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una) && ((sack_rxmit == 0) && (tp->t_rxtshift == 0))) { if (tp->t_state != TCPS_SYN_RECEIVED) flags &= ~TH_SYN; /* * When sending additional segments following a TFO SYN|ACK, * do not include the SYN bit. */ if (IS_FASTOPEN(tp->t_flags) && (tp->t_state == TCPS_SYN_RECEIVED)) flags &= ~TH_SYN; sb_offset--, len++; } /* * Be careful not to send data and/or FIN on SYN segments. This * measure is needed to prevent interoperability problems with not * fully conformant TCP implementations. */ if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) { len = 0; flags &= ~TH_FIN; } /* * On TFO sockets, ensure no data is sent in the following cases: * * - When retransmitting SYN|ACK on a passively-created socket * * - When retransmitting SYN on an actively created socket * * - When sending a zero-length cookie (cookie request) on an * actively created socket * * - When the socket is in the CLOSED state (RST is being sent) */ if (IS_FASTOPEN(tp->t_flags) && (((flags & TH_SYN) && (tp->t_rxtshift > 0)) || ((tp->t_state == TCPS_SYN_SENT) && (tp->t_tfo_client_cookie_len == 0)) || (flags & TH_RST))) { sack_rxmit = 0; len = 0; } /* Without fast-open there should never be data sent on a SYN */ if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags))) len = 0; if (len <= 0) { /* * If FIN has been sent but not acked, but we haven't been * called to retransmit, len will be < 0. Otherwise, window * shrank after we sent into it. If window shrank to 0, * cancel pending retransmit, pull snd_nxt back to (closed) * window, and set the persist timer if it isn't already * going. If the window didn't close completely, just wait * for an ACK. * * We also do a general check here to ensure that we will * set the persist timer when we have data to send, but a * 0-byte window. This makes sure the persist timer is set * even if the packet hits one of the "goto send" lines * below. */ len = 0; if ((tp->snd_wnd == 0) && (TCPS_HAVEESTABLISHED(tp->t_state)) && (sb_offset < (int)sbavail(sb))) { tp->snd_nxt = tp->snd_una; rack_enter_persist(tp, rack, cts); } } /* len will be >= 0 after this point. */ KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); tcp_sndbuf_autoscale(tp, so, sendwin); /* * Decide if we can use TCP Segmentation Offloading (if supported by * hardware). * * TSO may only be used if we are in a pure bulk sending state. The * presence of TCP-MD5, SACK retransmits, SACK advertizements and IP * options prevent using TSO. With TSO the TCP header is the same * (except for the sequence number) for all generated packets. This * makes it impossible to transmit any options which vary per * generated segment or packet. * * IPv4 handling has a clear separation of ip options and ip header * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does * the right thing below to provide length of just ip options and thus * checking for ipoptlen is enough to decide if ip options are present. 
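* Concretely, the test below turns TSO on only when all of these * hold: TF_TSO negotiated with V_tcp_do_tso enabled, len > t_maxseg, * no UDP tunneling port, no TCP-MD5 signature, no SACK blocks or SACK * retransmit in play, and ipoptlen == 0 (which, when IPsec is * compiled in, also covers ipsec_optlen).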
*/ #ifdef INET6 if (isipv6) ipoptlen = ip6_optlen(tp->t_inpcb); else #endif if (tp->t_inpcb->inp_options) ipoptlen = tp->t_inpcb->inp_options->m_len - offsetof(struct ipoption, ipopt_list); else ipoptlen = 0; #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * Pre-calculate here as we save another lookup into the darknesses * of IPsec that way and can actually decide if TSO is ok. */ #ifdef INET6 if (isipv6 && IPSEC_ENABLED(ipv6)) ipsec_optlen = IPSEC_HDRSIZE(ipv6, tp->t_inpcb); #ifdef INET else #endif #endif /* INET6 */ #ifdef INET if (IPSEC_ENABLED(ipv4)) ipsec_optlen = IPSEC_HDRSIZE(ipv4, tp->t_inpcb); #endif /* INET */ #endif #if defined(IPSEC) || defined(IPSEC_SUPPORT) ipoptlen += ipsec_optlen; #endif if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg && (tp->t_port == 0) && ((tp->t_flags & TF_SIGNATURE) == 0) && tp->rcv_numsacks == 0 && sack_rxmit == 0 && ipoptlen == 0) tso = 1; { uint32_t outstanding; outstanding = tp->snd_max - tp->snd_una; if (tp->t_flags & TF_SENTFIN) { /* * If we sent a fin, snd_max is 1 higher than * snd_una */ outstanding--; } if (outstanding > 0) { /* * This is sub-optimal. We only send a stand alone * FIN on its own segment. */ if (flags & TH_FIN) { flags &= ~TH_FIN; would_have_fin = 1; } } else if (sack_rxmit) { if ((rsm->r_flags & RACK_HAS_FIN) == 0) flags &= ~TH_FIN; } else { if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + sbused(sb))) flags &= ~TH_FIN; } } recwin = sbspace(&so->so_rcv); /* * Sender silly window avoidance. We transmit under the following * conditions when len is non-zero: * * - We have a full segment (or more with TSO) - This is the last * buffer in a write()/send() and we are either idle or running * NODELAY - we've timed out (e.g. persist timer) - we have more * then 1/2 the maximum send window's worth of data (receiver may be * limited the window size) - we need to retransmit */ if (len) { if (len >= tp->t_maxseg) { pass = 1; goto send; } /* * NOTE! on localhost connections an 'ack' from the remote * end may occur synchronously with the output and cause us * to flush a buffer queued with moretocome. XXX * */ if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ (idle || (tp->t_flags & TF_NODELAY)) && ((uint32_t)len + (uint32_t)sb_offset >= sbavail(&so->so_snd)) && (tp->t_flags & TF_NOPUSH) == 0) { pass = 2; goto send; } if (tp->t_flags & TF_FORCEDATA) { /* typ. timeout case */ pass = 3; goto send; } if ((tp->snd_una == tp->snd_max) && len) { /* Nothing outstanding */ goto send; } if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) { pass = 4; goto send; } if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* retransmit case */ pass = 5; goto send; } if (sack_rxmit) { pass = 6; goto send; } } /* * Sending of standalone window updates. * * Window updates are important when we close our window due to a * full socket buffer and are opening it again after the application * reads data from it. Once the window has opened again and the * remote end starts to send again the ACK clock takes over and * provides the most current window information. * * We must avoid the silly window syndrome whereas every read from * the receive buffer, no matter how small, causes a window update * to be sent. We also should avoid sending a flurry of window * updates when the socket buffer had queued a lot of data and the * application is doing small reads. * * Prevent a flurry of pointless window updates by only sending an * update when we can increase the advertized window by more than * 1/4th of the socket buffer capacity. 
When the buffer is getting * full or is very small be more aggressive and send an update * whenever we can increase by two mss sized segments. In all other * situations the ACK's to new incoming data will carry further * window increases. * * Don't send an independent window update if a delayed ACK is * pending (it will get piggy-backed on it) or the remote side * already has done a half-close and won't send more data. Skip * this if the connection is in T/TCP half-open state. */ if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) && !(tp->t_flags & TF_DELACK) && !TCPS_HAVERCVDFIN(tp->t_state)) { /* * "adv" is the amount we could increase the window, taking * into account that we are limited by TCP_MAXWIN << * tp->rcv_scale. */ int32_t adv; int oldwin; adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale); if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) { oldwin = (tp->rcv_adv - tp->rcv_nxt); adv -= oldwin; } else oldwin = 0; /* * If the new window size ends up being the same as the old * size when it is scaled, then don't force a window update. */ if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale) goto dontupdate; if (adv >= (int32_t)(2 * tp->t_maxseg) && (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) || recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) || so->so_rcv.sb_hiwat <= 8 * tp->t_maxseg)) { pass = 7; goto send; } if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat) goto send; } dontupdate: /* * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW * is also a catch-all for the retransmit timer timeout case. */ if (tp->t_flags & TF_ACKNOW) { pass = 8; goto send; } if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) { pass = 9; goto send; } if (SEQ_GT(tp->snd_up, tp->snd_una)) { pass = 10; goto send; } /* * If our state indicates that FIN should be sent and we have not * yet done so, then we need to send. */ if ((flags & TH_FIN) && (tp->snd_nxt == tp->snd_una)) { pass = 11; goto send; } /* * No reason to send a segment, just return. */ just_return: SOCKBUF_UNLOCK(sb); just_return_nolock: if (tot_len_this_send == 0) counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1); rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, tot_len_this_send, 1); rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling); tp->t_flags &= ~TF_FORCEDATA; return (0); send: if (doing_tlp == 0) { /* * Data not a TLP, and its not the rxt firing. If it is the * rxt firing, we want to leave the tlp_in_progress flag on * so we don't send another TLP. It has to be a rack timer * or normal send (response to acked data) to clear the tlp * in progress flag. */ rack->rc_tlp_in_progress = 0; } SOCKBUF_LOCK_ASSERT(sb); if (len > 0) { if (len >= tp->t_maxseg) tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT; else tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT; } /* * Before ESTABLISHED, force sending of initial options unless TCP * set not to do any options. NOTE: we assume that the IP/TCP header * plus TCP options always fit in a single mbuf, leaving room for a * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr) * + optlen <= MCLBYTES */ optlen = 0; #ifdef INET6 if (isipv6) hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); else #endif hdrlen = sizeof(struct tcpiphdr); /* * Compute options for segment. We only have to care about SYN and * established connection segments. Options for SYN-ACK segments * are handled in TCP syncache. */ to.to_flags = 0; if ((tp->t_flags & TF_NOOPT) == 0) { /* Maximum segment size. 
*/ if (flags & TH_SYN) { tp->snd_nxt = tp->iss; to.to_mss = tcp_mssopt(&inp->inp_inc); #ifdef NETFLIX_TCPOUDP if (tp->t_port) to.to_mss -= V_tcp_udp_tunneling_overhead; #endif to.to_flags |= TOF_MSS; /* * On SYN or SYN|ACK transmits on TFO connections, * only include the TFO option if it is not a * retransmit, as the presence of the TFO option may * have caused the original SYN or SYN|ACK to have * been dropped by a middlebox. */ if (IS_FASTOPEN(tp->t_flags) && (tp->t_rxtshift == 0)) { if (tp->t_state == TCPS_SYN_RECEIVED) { to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN; to.to_tfo_cookie = (u_int8_t *)&tp->t_tfo_cookie.server; to.to_flags |= TOF_FASTOPEN; wanted_cookie = 1; } else if (tp->t_state == TCPS_SYN_SENT) { to.to_tfo_len = tp->t_tfo_client_cookie_len; to.to_tfo_cookie = tp->t_tfo_cookie.client; to.to_flags |= TOF_FASTOPEN; wanted_cookie = 1; /* * If we wind up having more data to * send with the SYN than can fit in * one segment, don't send any more * until the SYN|ACK comes back from * the other end. */ sendalot = 0; } } } /* Window scaling. */ if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { to.to_wscale = tp->request_r_scale; to.to_flags |= TOF_SCALE; } /* Timestamps. */ if ((tp->t_flags & TF_RCVD_TSTMP) || ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { to.to_tsval = cts + tp->ts_offset; to.to_tsecr = tp->ts_recent; to.to_flags |= TOF_TS; } /* Set receive buffer autosizing timestamp. */ if (tp->rfbuf_ts == 0 && (so->so_rcv.sb_flags & SB_AUTOSIZE)) tp->rfbuf_ts = tcp_ts_getticks(); /* Selective ACK's. */ if (flags & TH_SYN) to.to_flags |= TOF_SACKPERM; else if (TCPS_HAVEESTABLISHED(tp->t_state) && tp->rcv_numsacks > 0) { to.to_flags |= TOF_SACK; to.to_nsacks = tp->rcv_numsacks; to.to_sacks = (u_char *)tp->sackblks; } #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) /* TCP-MD5 (RFC2385). */ if (tp->t_flags & TF_SIGNATURE) to.to_flags |= TOF_SIGNATURE; #endif /* TCP_SIGNATURE */ /* Processing the options. */ hdrlen += optlen = tcp_addoptions(&to, opt); /* * If we wanted a TFO option to be added, but it was unable * to fit, ensure no data is sent. */ if (IS_FASTOPEN(tp->t_flags) && wanted_cookie && !(to.to_flags & TOF_FASTOPEN)) len = 0; } #ifdef NETFLIX_TCPOUDP if (tp->t_port) { if (V_tcp_udp_tunneling_port == 0) { /* The port was removed?? */ SOCKBUF_UNLOCK(&so->so_snd); return (EHOSTUNREACH); } hdrlen += sizeof(struct udphdr); } #endif ipoptlen = 0; #if defined(IPSEC) || defined(IPSEC_SUPPORT) ipoptlen += ipsec_optlen; #endif /* * Adjust data length if insertion of options will bump the packet * length beyond the t_maxseg length. Clear the FIN bit because we * cut off the tail of the segment. 
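* Worked example (hypothetical numbers): with t_maxseg = 1460, * optlen = 12 (timestamps) and no IP options, a non-TSO send is * trimmed to len = 1460 - 12 - 0 = 1448 bytes and sendalot is set so * the remainder follows immediately.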
*/ if (len + optlen + ipoptlen > tp->t_maxseg) { if (flags & TH_FIN) { would_have_fin = 1; flags &= ~TH_FIN; } if (tso) { uint32_t if_hw_tsomax; uint32_t moff; int32_t max_len; /* extract TSO information */ if_hw_tsomax = tp->t_tsomax; if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; KASSERT(ipoptlen == 0, ("%s: TSO can't do IP options", __func__)); /* * Check if we should limit by maximum payload * length: */ if (if_hw_tsomax != 0) { /* compute maximum TSO length */ max_len = (if_hw_tsomax - hdrlen - max_linkhdr); if (max_len <= 0) { len = 0; } else if (len > max_len) { sendalot = 1; len = max_len; } } /* * Prevent the last segment from being fractional * unless the send sockbuf can be emptied: */ max_len = (tp->t_maxseg - optlen); if ((sb_offset + len) < sbavail(sb)) { moff = len % (u_int)max_len; if (moff != 0) { len -= moff; sendalot = 1; } } /* * In case there are too many small fragments don't * use TSO: */ if (len <= max_len) { len = max_len; sendalot = 1; tso = 0; } /* * Send the FIN in a separate segment after the bulk * sending is done. We don't trust the TSO * implementations to clear the FIN flag on all but * the last segment. */ if (tp->t_flags & TF_NEEDFIN) sendalot = 1; } else { len = tp->t_maxseg - optlen - ipoptlen; sendalot = 1; } } else tso = 0; KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET, ("%s: len > IP_MAXPACKET", __func__)); #ifdef DIAGNOSTIC #ifdef INET6 if (max_linkhdr + hdrlen > MCLBYTES) #else if (max_linkhdr + hdrlen > MHLEN) #endif panic("tcphdr too big"); #endif /* * This KASSERT is here to catch edge cases at a well defined place. * Before, those had triggered (random) panic conditions further * down. */ KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); if ((len == 0) && (flags & TH_FIN) && (sbused(sb))) { /* * We have outstanding data, don't send a fin by itself! */ goto just_return; } /* * Grab a header mbuf, attaching a copy of data to be transmitted, * and initialize the header from the template for sends on this * connection. */ if (len) { uint32_t max_val; uint32_t moff; if (rack->rc_pace_max_segs) max_val = rack->rc_pace_max_segs * tp->t_maxseg; else max_val = len; /* * We allow a limit on sending with hptsi. */ if (len > max_val) { len = max_val; } #ifdef INET6 if (MHLEN < hdrlen + max_linkhdr) m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); else #endif m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { SOCKBUF_UNLOCK(sb); error = ENOBUFS; sack_rxmit = 0; goto out; } m->m_data += max_linkhdr; m->m_len = hdrlen; /* * Start the m_copy functions from the closest mbuf to the * sb_offset in the socket buffer chain. */ mb = sbsndptr_noadv(sb, sb_offset, &moff); if (len <= MHLEN - hdrlen - max_linkhdr) { m_copydata(mb, moff, (int)len, mtod(m, caddr_t)+hdrlen); if (SEQ_LT(tp->snd_nxt, tp->snd_max)) sbsndptr_adv(sb, mb, len); m->m_len += len; } else { struct sockbuf *msb; if (SEQ_LT(tp->snd_nxt, tp->snd_max)) msb = NULL; else msb = sb; m->m_next = tcp_m_copym(mb, moff, &len, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb); if (len <= (tp->t_maxseg - optlen)) { /* * Must have run out of mbufs for the copy; * shorten it to no longer need tso. Let's * not put on sendalot since we are low on * mbufs.
*/ tso = 0; } if (m->m_next == NULL) { SOCKBUF_UNLOCK(sb); (void)m_free(m); error = ENOBUFS; sack_rxmit = 0; goto out; } } if ((tp->t_flags & TF_FORCEDATA) && len == 1) { TCPSTAT_INC(tcps_sndprobe); #ifdef NETFLIX_STATS if (SEQ_LT(tp->snd_nxt, tp->snd_max)) stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, len); else stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, len); #endif } else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { if (rsm && (rsm->r_flags & RACK_TLP)) { /* * TLP should not count in retran count, but * in its own bin */ counter_u64_add(rack_tlp_retran, 1); counter_u64_add(rack_tlp_retran_bytes, len); } else { tp->t_sndrexmitpack++; TCPSTAT_INC(tcps_sndrexmitpack); TCPSTAT_ADD(tcps_sndrexmitbyte, len); } #ifdef NETFLIX_STATS stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, len); #endif } else { TCPSTAT_INC(tcps_sndpack); TCPSTAT_ADD(tcps_sndbyte, len); #ifdef NETFLIX_STATS stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, len); #endif } /* * If we're sending everything we've got, set PUSH. (This * will keep happy those implementations which only give * data to the user when a buffer fills or a PUSH comes in.) */ if (sb_offset + len == sbused(sb) && sbused(sb) && !(flags & TH_SYN)) flags |= TH_PUSH; /* * Are we doing hptsi, if so we must calculate the slot. We * only do hptsi in ESTABLISHED and with no RESET being * sent where we have data to send. */ if (((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_CLOSE_WAIT) || ((tp->t_state == TCPS_FIN_WAIT_1) && ((tp->t_flags & TF_SENTFIN) == 0) && ((flags & TH_FIN) == 0))) && ((flags & TH_RST) == 0) && (rack->rc_always_pace)) { /* * We use the most optimistic possible cwnd/srtt for * sending calculations. This will make our * calculation anticipate getting more through * quicker then possible. But thats ok we don't want * the peer to have a gap in data sending. */ uint32_t srtt, cwnd, tr_perms = 0; if (rack->r_ctl.rc_rack_min_rtt) srtt = rack->r_ctl.rc_rack_min_rtt; else srtt = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT)); if (rack->r_ctl.rc_rack_largest_cwnd) cwnd = rack->r_ctl.rc_rack_largest_cwnd; else cwnd = tp->snd_cwnd; tr_perms = cwnd / srtt; if (tr_perms == 0) { tr_perms = tp->t_maxseg; } tot_len_this_send += len; /* * Calculate how long this will take to drain, if * the calculation comes out to zero, thats ok we * will use send_a_lot to possibly spin around for * more increasing tot_len_this_send to the point * that its going to require a pace, or we hit the * cwnd. Which in that case we are just waiting for * a ACK. */ slot = tot_len_this_send / tr_perms; /* Now do we reduce the time so we don't run dry? 
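 */

/*
 * The pacing arithmetic above as a standalone sketch, with the units
 * spelled out (bytes and milliseconds, matching rc_rack_min_rtt).
 * tr_perms is the bytes-per-millisecond budget the optimistic
 * cwnd/srtt pair allows; the slot is how many milliseconds this burst
 * consumes of that budget.  For example, cwnd = 64000 bytes over
 * srtt = 20 ms gives tr_perms = 3200 bytes/ms, so a 16000 byte burst
 * earns a 5 ms slot.  The srtt == 0 guard is an addition for this
 * standalone version.
 */
static uint32_t
pace_slot_ms(uint32_t tot_len, uint32_t cwnd, uint32_t srtt_ms,
    uint32_t maxseg)
{
	uint32_t tr_perms;

	tr_perms = (srtt_ms != 0) ? (cwnd / srtt_ms) : 0;
	if (tr_perms == 0)
		tr_perms = maxseg;	/* floor: one segment per ms */
	return (tot_len / tr_perms);
}

/*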
*/ if (slot && rack->rc_pace_reduce) { int32_t reduce; reduce = (slot / rack->rc_pace_reduce); if (reduce < slot) { slot -= reduce; } else slot = 0; } if (rack->r_enforce_min_pace && (slot == 0) && (tot_len_this_send >= (rack->r_min_pace_seg_thresh * tp->t_maxseg))) { /* We are enforcing a minimum pace time of 1ms */ slot = rack->r_enforce_min_pace; } } SOCKBUF_UNLOCK(sb); } else { SOCKBUF_UNLOCK(sb); if (tp->t_flags & TF_ACKNOW) TCPSTAT_INC(tcps_sndacks); else if (flags & (TH_SYN | TH_FIN | TH_RST)) TCPSTAT_INC(tcps_sndctrl); else if (SEQ_GT(tp->snd_up, tp->snd_una)) TCPSTAT_INC(tcps_sndurg); else TCPSTAT_INC(tcps_sndwinup); m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; sack_rxmit = 0; goto out; } #ifdef INET6 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && MHLEN >= hdrlen) { M_ALIGN(m, hdrlen); } else #endif m->m_data += max_linkhdr; m->m_len = hdrlen; } SOCKBUF_UNLOCK_ASSERT(sb); m->m_pkthdr.rcvif = (struct ifnet *)0; #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif #ifdef INET6 if (isipv6) { ip6 = mtod(m, struct ip6_hdr *); #ifdef NETFLIX_TCPOUDP if (tp->t_port) { udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr)); udp->uh_sport = htons(V_tcp_udp_tunneling_port); udp->uh_dport = tp->t_port; ulen = hdrlen + len - sizeof(struct ip6_hdr); udp->uh_ulen = htons(ulen); th = (struct tcphdr *)(udp + 1); } else #endif th = (struct tcphdr *)(ip6 + 1); tcpip_fillheaders(inp, ip6, th); } else #endif /* INET6 */ { ip = mtod(m, struct ip *); #ifdef TCPDEBUG ipov = (struct ipovly *)ip; #endif #ifdef NETFLIX_TCPOUDP if (tp->t_port) { udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip)); udp->uh_sport = htons(V_tcp_udp_tunneling_port); udp->uh_dport = tp->t_port; ulen = hdrlen + len - sizeof(struct ip); udp->uh_ulen = htons(ulen); th = (struct tcphdr *)(udp + 1); } else #endif th = (struct tcphdr *)(ip + 1); tcpip_fillheaders(inp, ip, th); } /* * Fill in fields, remembering maximum advertised window for use in * delaying messages about window sizes. If resending a FIN, be sure * not to use a new sequence number. */ if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && tp->snd_nxt == tp->snd_max) tp->snd_nxt--; /* * If we are starting a connection, send ECN setup SYN packet. If we * are on a retransmit, we may resend those bits a number of times * as per RFC 3168. */ if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn == 1) { if (tp->t_rxtshift >= 1) { if (tp->t_rxtshift <= V_tcp_ecn_maxretries) flags |= TH_ECE | TH_CWR; } else flags |= TH_ECE | TH_CWR; } if (tp->t_state == TCPS_ESTABLISHED && (tp->t_flags & TF_ECN_PERMIT)) { /* * If the peer has ECN, mark data packets with ECN capable * transmission (ECT). Ignore pure ack packets, * retransmissions and window probes. */ if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) && !((tp->t_flags & TF_FORCEDATA) && len == 1)) { #ifdef INET6 if (isipv6) ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); else #endif ip->ip_tos |= IPTOS_ECN_ECT0; TCPSTAT_INC(tcps_ecn_ect0); } /* * Reply with proper ECN notifications. */ if (tp->t_flags & TF_ECN_SND_CWR) { flags |= TH_CWR; tp->t_flags &= ~TF_ECN_SND_CWR; } if (tp->t_flags & TF_ECN_SND_ECE) flags |= TH_ECE; } /* * If we are doing retransmissions, then snd_nxt will not reflect * the first unsent octet. For ACK only packets, we do not want the * sequence number of the retransmitted packet, we want the sequence * number of the next unsent octet. So, if there is no data (and no * SYN or FIN), use snd_max instead of snd_nxt when filling in * ti_seq. 
But if we are in persist state, snd_max might reflect * one byte beyond the right edge of the window, so use snd_nxt in * that case, since we know we aren't doing a retransmission. * (retransmit and persist are mutually exclusive...) */ if (sack_rxmit == 0) { if (len || (flags & (TH_SYN | TH_FIN)) || rack->rc_in_persist) { th->th_seq = htonl(tp->snd_nxt); rack_seq = tp->snd_nxt; } else if (flags & TH_RST) { /* * For a Reset send the last cum ack in sequence * (this like any other choice may still generate a * challenge ack, if a ack-update packet is in * flight). */ th->th_seq = htonl(tp->snd_una); rack_seq = tp->snd_una; } else { th->th_seq = htonl(tp->snd_max); rack_seq = tp->snd_max; } } else { th->th_seq = htonl(rsm->r_start); rack_seq = rsm->r_start; } th->th_ack = htonl(tp->rcv_nxt); if (optlen) { bcopy(opt, th + 1, optlen); th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; } th->th_flags = flags; /* * Calculate receive window. Don't shrink window, but avoid silly * window syndrome. * If a RST segment is sent, advertise a window of zero. */ if (flags & TH_RST) { recwin = 0; } else { if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && recwin < (long)tp->t_maxseg) recwin = 0; if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) recwin = (long)(tp->rcv_adv - tp->rcv_nxt); if (recwin > (long)TCP_MAXWIN << tp->rcv_scale) recwin = (long)TCP_MAXWIN << tp->rcv_scale; } /* * According to RFC1323 the window field in a SYN (i.e., a or * ) segment itself is never scaled. The case is * handled in syncache. */ if (flags & TH_SYN) th->th_win = htons((u_short) (min(sbspace(&so->so_rcv), TCP_MAXWIN))); else th->th_win = htons((u_short)(recwin >> tp->rcv_scale)); /* * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0 * window. This may cause the remote transmitter to stall. This * flag tells soreceive() to disable delayed acknowledgements when * draining the buffer. This can occur if the receiver is * attempting to read more data than can be buffered prior to * transmitting on the connection. */ if (th->th_win == 0) { tp->t_sndzerowin++; tp->t_flags |= TF_RXWIN0SENT; } else tp->t_flags &= ~TF_RXWIN0SENT; if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); th->th_flags |= TH_URG; } else /* * If no urgent pointer to send, then we pull the urgent * pointer to the left edge of the send window so that it * doesn't drift into the send window on sequence number * wraparound. */ tp->snd_up = tp->snd_una; /* drag it along */ #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (to.to_flags & TOF_SIGNATURE) { /* * Calculate MD5 signature and put it into the place * determined before. * NOTE: since TCP options buffer doesn't point into * mbuf's data, calculate offset and use it. */ if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th, (u_char *)(th + 1) + (to.to_signature - opt)) != 0) { /* * Do not send segment if the calculation of MD5 * digest has failed. */ goto out; } } #endif /* * Put TCP length in extended header, and then checksum extended * header and data. */ m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ #ifdef INET6 if (isipv6) { /* * ip6_plen is not need to be filled now, and will be filled * in ip6_output. 
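 */

/*
 * A sketch of the pseudo-header seed the offload paths below store in
 * th_sum/uh_sum.  Shown over host-order fields for clarity; the real
 * code computes it in network byte order via in_pseudo() and
 * in6_cksum_pseudo().  The result is deliberately NOT inverted: the
 * NIC folds the TCP/UDP payload into this seed and inverts only the
 * final sum.
 */
static uint16_t
pseudo_hdr_seed(uint32_t src, uint32_t dst, uint8_t proto, uint16_t len)
{
	uint32_t sum;

	sum = (src >> 16) + (src & 0xffff) +
	    (dst >> 16) + (dst & 0xffff) +
	    proto + len;
	while (sum >> 16)	/* end-around carry fold to 16 bits */
		sum = (sum & 0xffff) + (sum >> 16);
	return ((uint16_t)sum);
}

/*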
*/ if (tp->t_port) { m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); th->th_sum = htons(0); } else { m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 0); } } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET { if (tp->t_port) { m->m_pkthdr.csum_flags = CSUM_UDP; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); udp->uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); th->th_sum = htons(0); } else { m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + IPPROTO_TCP + len + optlen)); } /* IP version must be set here for ipv4/ipv6 checking later */ KASSERT(ip->ip_v == IPVERSION, ("%s: IP version incorrect: %d", __func__, ip->ip_v)); } #endif /* * Enable TSO and specify the size of the segments. The TCP pseudo * header checksum is always provided. XXX: Fixme: This is currently * not the case for IPv6. */ if (tso) { KASSERT(len > tp->t_maxseg - optlen, ("%s: len <= tso_segsz", __func__)); m->m_pkthdr.csum_flags |= CSUM_TSO; m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; } #if defined(IPSEC) || defined(IPSEC_SUPPORT) KASSERT(len + hdrlen + ipoptlen - ipsec_optlen == m_length(m, NULL), ("%s: mbuf chain shorter than expected: %d + %u + %u - %u != %u", __func__, len, hdrlen, ipoptlen, ipsec_optlen, m_length(m, NULL))); #else KASSERT(len + hdrlen + ipoptlen == m_length(m, NULL), ("%s: mbuf chain shorter than expected: %d + %u + %u != %u", __func__, len, hdrlen, ipoptlen, m_length(m, NULL))); #endif #ifdef TCP_HHOOK /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */ hhook_run_tcp_est_out(tp, th, &to, len, tso); #endif #ifdef TCPDEBUG /* * Trace. */ if (so->so_options & SO_DEBUG) { u_short save = 0; #ifdef INET6 if (!isipv6) #endif { save = ipov->ih_len; ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + * (th->th_off << 2) */ ); } tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0); #ifdef INET6 if (!isipv6) #endif ipov->ih_len = save; } #endif /* TCPDEBUG */ /* We're getting ready to send; log now. */ if (tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; if (rsm || sack_rxmit) { log.u_bbr.flex8 = 1; } else { log.u_bbr.flex8 = 0; } lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK, len, &log, false, NULL, NULL, 0, NULL); } else lgb = NULL; /* * Fill in IP length and desired time to live and send to IP level. * There should be a better way to handle ttl and tos; we could keep * them in the template, but need a way to checksum without them. */ /* * m->m_pkthdr.len should have been set before cksum calcuration, * because in6_cksum() need it. */ #ifdef INET6 if (isipv6) { /* * we separately set hoplimit for every segment, since the * user might want to change the value via setsockopt. Also, * desired default hop limit might be changed via Neighbor * Discovery. */ ip6->ip6_hlim = in6_selecthlim(inp, NULL); /* * Set the packet size here for the benefit of DTrace * probes. 
ip6_output() will set it properly; it's supposed * to include the option header lengths as well. */ ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) tp->t_flags2 |= TF2_PLPMTU_PMTUD; else tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; if (tp->t_state == TCPS_SYN_SENT) TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th); TCP_PROBE5(send, NULL, tp, ip6, tp, th); /* TODO: IPv6 IP6TOS_ECT bit on */ error = ip6_output(m, tp->t_inpcb->in6p_outputopts, &inp->inp_route6, ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), NULL, NULL, inp); if (error == EMSGSIZE && inp->inp_route6.ro_rt != NULL) mtu = inp->inp_route6.ro_rt->rt_mtu; } #endif /* INET6 */ #if defined(INET) && defined(INET6) else #endif #ifdef INET { ip->ip_len = htons(m->m_pkthdr.len); #ifdef INET6 if (inp->inp_vflag & INP_IPV6PROTO) ip->ip_ttl = in6_selecthlim(inp, NULL); #endif /* INET6 */ /* * If we do path MTU discovery, then we set DF on every * packet. This might not be the best thing to do according * to RFC3390 Section 2. However the tcp hostcache migitates * the problem so it affects only the first tcp connection * with a host. * * NB: Don't set DF on small MTU/MSS to have a safe * fallback. */ if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { tp->t_flags2 |= TF2_PLPMTU_PMTUD; if (tp->t_port == 0 || len < V_tcp_minmss) { ip->ip_off |= htons(IP_DF); } } else { tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; } if (tp->t_state == TCPS_SYN_SENT) TCP_PROBE5(connect__request, NULL, tp, ip, tp, th); TCP_PROBE5(send, NULL, tp, ip, tp, th); error = ip_output(m, tp->t_inpcb->inp_options, &inp->inp_route, ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0, inp); if (error == EMSGSIZE && inp->inp_route.ro_rt != NULL) mtu = inp->inp_route.ro_rt->rt_mtu; } #endif /* INET */ out: if (lgb) { lgb->tlb_errno = error; lgb = NULL; } /* * In transmit state, time the transmission and arrange for the * retransmit. In persist state, just set snd_max. */ if (error == 0) { if (len == 0) counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1); else if (len == 1) { counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1); } else if (len > 1) { int idx; idx = (len / tp->t_maxseg) + 3; if (idx >= TCP_MSS_ACCT_ATIMER) counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); else counter_u64_add(rack_out_size[idx], 1); } } if (sub_from_prr && (error == 0)) { rack->r_ctl.rc_prr_sndcnt -= len; } sub_from_prr = 0; rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, cts, pass, rsm); if ((tp->t_flags & TF_FORCEDATA) == 0 || (rack->rc_in_persist == 0)) { tcp_seq startseq = tp->snd_nxt; /* * Advance snd_nxt over sequence space of this segment. */ if (error) /* We don't log or do anything with errors */ goto timer; if (flags & (TH_SYN | TH_FIN)) { if (flags & TH_SYN) tp->snd_nxt++; if (flags & TH_FIN) { tp->snd_nxt++; tp->t_flags |= TF_SENTFIN; } } /* In the ENOBUFS case we do *not* update snd_max */ if (sack_rxmit) goto timer; tp->snd_nxt += len; if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { if (tp->snd_una == tp->snd_max) { /* * Update the time we just added data since * none was outstanding. */ rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); tp->t_acktime = ticks; } tp->snd_max = tp->snd_nxt; /* * Time this transmission if not a retransmission and * not currently timing anything. * This is only relevant in case of switching back to * the base stack. 
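 */

/*
 * The one-sample-at-a-time RTT timing described in the comment above
 * and implemented just below, pulled out as a self-contained sketch
 * (hypothetical names; the stack itself uses t_rtttime/t_rtseq, and
 * SEQ_GT_X restates the tcp_seq.h comparison).  Arm the clock only
 * when nothing is being timed, remember the starting sequence, and
 * sample when that sequence is cumulatively acked.
 */
#define	SEQ_GT_X(a, b)	((int32_t)((a) - (b)) > 0)

struct rtt_clock {
	uint32_t start_ticks;	/* 0 means "not timing" */
	uint32_t seq;		/* first sequence of the timed send */
};

static void
rtt_arm(struct rtt_clock *rc, uint32_t now_ticks, uint32_t startseq)
{
	if (rc->start_ticks == 0) {
		rc->start_ticks = now_ticks;
		rc->seq = startseq;
	}
}

static int	/* returns 1 when a sample was taken */
rtt_sample(struct rtt_clock *rc, uint32_t now_ticks, uint32_t th_ack,
    uint32_t *rtt_ticks)
{
	if (rc->start_ticks != 0 && SEQ_GT_X(th_ack, rc->seq)) {
		*rtt_ticks = now_ticks - rc->start_ticks;
		rc->start_ticks = 0;	/* ready for the next sample */
		return (1);
	}
	return (0);
}

/*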
*/ if (tp->t_rtttime == 0) { tp->t_rtttime = ticks; tp->t_rtseq = startseq; TCPSTAT_INC(tcps_segstimed); } #ifdef NETFLIX_STATS if (!(tp->t_flags & TF_GPUTINPROG) && len) { tp->t_flags |= TF_GPUTINPROG; tp->gput_seq = startseq; tp->gput_ack = startseq + ulmin(sbavail(sb) - sb_offset, sendwin); tp->gput_ts = tcp_ts_getticks(); } #endif } /* * Set retransmit timer if not currently set, and not doing * a pure ack or a keep-alive probe. Initial value for * retransmit timer is smoothed round-trip time + 2 * * round-trip time variance. Initialize shift counter which * is used for backoff of retransmit time. */ timer: if ((tp->snd_wnd == 0) && TCPS_HAVEESTABLISHED(tp->t_state)) { /* * If the persists timer was set above (right before * the goto send), and still needs to be on. Lets * make sure all is canceled. If the persist timer * is not running, we want to get it up. */ if (rack->rc_in_persist == 0) { rack_enter_persist(tp, rack, cts); } } } else { /* * Persist case, update snd_max but since we are in persist * mode (no window) we do not update snd_nxt. */ int32_t xlen = len; if (error) goto nomore; if (flags & TH_SYN) ++xlen; if (flags & TH_FIN) { ++xlen; tp->t_flags |= TF_SENTFIN; } /* In the ENOBUFS case we do *not* update snd_max */ if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) { if (tp->snd_una == tp->snd_max) { /* * Update the time we just added data since * none was outstanding. */ rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); tp->t_acktime = ticks; } tp->snd_max = tp->snd_nxt + len; } } nomore: if (error) { SOCKBUF_UNLOCK_ASSERT(sb); /* Check gotos. */ /* * Failures do not advance the seq counter above. For the * case of ENOBUFS we will fall out and retry in 1ms with * the hpts. Everything else will just have to retransmit * with the timer. * * In any case, we do not want to loop around for another * send without a good reason. */ sendalot = 0; switch (error) { case EPERM: tp->t_flags &= ~TF_FORCEDATA; tp->t_softerror = error; return (error); case ENOBUFS: if (slot == 0) { /* * Pace us right away to retry in a some * time */ slot = 1 + rack->rc_enobuf; if (rack->rc_enobuf < 255) rack->rc_enobuf++; if (slot > (rack->rc_rack_rtt / 2)) { slot = rack->rc_rack_rtt / 2; } if (slot < 10) slot = 10; } counter_u64_add(rack_saw_enobuf, 1); error = 0; goto enobufs; case EMSGSIZE: /* * For some reason the interface we used initially * to send segments changed to another or lowered * its MTU. If TSO was active we either got an * interface without TSO capabilits or TSO was * turned off. If we obtained mtu from ip_output() * then update it and try again. */ if (tso) tp->t_flags &= ~TF_TSO; if (mtu != 0) { tcp_mss_update(tp, -1, mtu, NULL, NULL); goto again; } slot = 10; rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, 0, 1); tp->t_flags &= ~TF_FORCEDATA; return (error); case ENETUNREACH: counter_u64_add(rack_saw_enetunreach, 1); case EHOSTDOWN: case EHOSTUNREACH: case ENETDOWN: if (TCPS_HAVERCVDSYN(tp->t_state)) { tp->t_softerror = error; } /* FALLTHROUGH */ default: slot = 10; rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, 0, 1); tp->t_flags &= ~TF_FORCEDATA; return (error); } } else { rack->rc_enobuf = 0; } TCPSTAT_INC(tcps_sndtotal); /* * Data sent (as far as we can tell). If this advertises a larger * window than any other segment, then remember the size of the * advertised window. Any pending ACK has now been sent. 
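 */

/*
 * The first statement below in isolation: the stack remembers the
 * highest right edge it has ever advertised (rcv_adv) and only moves
 * it forward, so a later, smaller recwin never looks like a shrinking
 * window to the peer.  A minimal sketch, reusing the SEQ_GT_X macro
 * from the earlier aside:
 */
static void
remember_advertised_edge(uint32_t *rcv_adv, uint32_t rcv_nxt,
    uint32_t recwin)
{
	if (recwin > 0 && SEQ_GT_X(rcv_nxt + recwin, *rcv_adv))
		*rcv_adv = rcv_nxt + recwin;
}

/*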
*/ if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) tp->rcv_adv = tp->rcv_nxt + recwin; tp->last_ack_sent = tp->rcv_nxt; tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); enobufs: rack->r_tlp_running = 0; if ((flags & TH_RST) || (would_have_fin == 1)) { /* * We don't send again after a RST. We also do *not* send * again if we would have had a find, but now have * outstanding data. */ slot = 0; sendalot = 0; } if (slot) { /* set the rack tcb into the slot N */ counter_u64_add(rack_paced_segments, 1); } else if (sendalot) { if (len) counter_u64_add(rack_unpaced_segments, 1); sack_rxmit = 0; tp->t_flags &= ~TF_FORCEDATA; goto again; } else if (len) { counter_u64_add(rack_unpaced_segments, 1); } tp->t_flags &= ~TF_FORCEDATA; rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, tot_len_this_send, 1); return (error); } /* * rack_ctloutput() must drop the inpcb lock before performing copyin on * socket option arguments. When it re-acquires the lock after the copy, it * has to revalidate that the connection is still valid for the socket * option. */ static int rack_set_sockopt(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack) { int32_t error = 0, optval; switch (sopt->sopt_name) { case TCP_RACK_PROP_RATE: case TCP_RACK_PROP: case TCP_RACK_TLP_REDUCE: case TCP_RACK_EARLY_RECOV: case TCP_RACK_PACE_ALWAYS: case TCP_DELACK: case TCP_RACK_PACE_REDUCE: case TCP_RACK_PACE_MAX_SEG: case TCP_RACK_PRR_SENDALOT: case TCP_RACK_MIN_TO: case TCP_RACK_EARLY_SEG: case TCP_RACK_REORD_THRESH: case TCP_RACK_REORD_FADE: case TCP_RACK_TLP_THRESH: case TCP_RACK_PKT_DELAY: case TCP_RACK_TLP_USE: case TCP_RACK_TLP_INC_VAR: case TCP_RACK_IDLE_REDUCE_HIGH: case TCP_RACK_MIN_PACE: case TCP_RACK_MIN_PACE_SEG: case TCP_BBR_RACK_RTT_USE: case TCP_DATA_AFTER_CLOSE: break; default: return (tcp_default_ctloutput(so, sopt, inp, tp)); break; } INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error) return (error); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_WUNLOCK(inp); return (ECONNRESET); } tp = intotcpcb(inp); rack = (struct tcp_rack *)tp->t_fb_ptr; switch (sopt->sopt_name) { case TCP_RACK_PROP_RATE: if ((optval <= 0) || (optval >= 100)) { error = EINVAL; break; } RACK_OPTS_INC(tcp_rack_prop_rate); rack->r_ctl.rc_prop_rate = optval; break; case TCP_RACK_TLP_USE: if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) { error = EINVAL; break; } RACK_OPTS_INC(tcp_tlp_use); rack->rack_tlp_threshold_use = optval; break; case TCP_RACK_PROP: /* RACK proportional rate reduction (bool) */ RACK_OPTS_INC(tcp_rack_prop); rack->r_ctl.rc_prop_reduce = optval; break; case TCP_RACK_TLP_REDUCE: /* RACK TLP cwnd reduction (bool) */ RACK_OPTS_INC(tcp_rack_tlp_reduce); rack->r_ctl.rc_tlp_cwnd_reduce = optval; break; case TCP_RACK_EARLY_RECOV: /* Should recovery happen early (bool) */ RACK_OPTS_INC(tcp_rack_early_recov); rack->r_ctl.rc_early_recovery = optval; break; case TCP_RACK_PACE_ALWAYS: /* Use the always pace method (bool) */ RACK_OPTS_INC(tcp_rack_pace_always); if (optval > 0) rack->rc_always_pace = 1; else rack->rc_always_pace = 0; break; case TCP_RACK_PACE_REDUCE: /* RACK Hptsi reduction factor (divisor) */ RACK_OPTS_INC(tcp_rack_pace_reduce); if (optval) /* Must be non-zero */ rack->rc_pace_reduce = optval; else error = EINVAL; break; case TCP_RACK_PACE_MAX_SEG: /* Max segments in a pace */ RACK_OPTS_INC(tcp_rack_max_seg); rack->rc_pace_max_segs = optval; break; case TCP_RACK_PRR_SENDALOT: /* Allow PRR to send 
more than one seg */ RACK_OPTS_INC(tcp_rack_prr_sendalot); rack->r_ctl.rc_prr_sendalot = optval; break; case TCP_RACK_MIN_TO: /* Minimum time between rack t-o's in ms */ RACK_OPTS_INC(tcp_rack_min_to); rack->r_ctl.rc_min_to = optval; break; case TCP_RACK_EARLY_SEG: /* If early recovery max segments */ RACK_OPTS_INC(tcp_rack_early_seg); rack->r_ctl.rc_early_recovery_segs = optval; break; case TCP_RACK_REORD_THRESH: /* RACK reorder threshold (shift amount) */ RACK_OPTS_INC(tcp_rack_reord_thresh); if ((optval > 0) && (optval < 31)) rack->r_ctl.rc_reorder_shift = optval; else error = EINVAL; break; case TCP_RACK_REORD_FADE: /* Does reordering fade after ms time */ RACK_OPTS_INC(tcp_rack_reord_fade); rack->r_ctl.rc_reorder_fade = optval; break; case TCP_RACK_TLP_THRESH: /* RACK TLP theshold i.e. srtt+(srtt/N) */ RACK_OPTS_INC(tcp_rack_tlp_thresh); if (optval) rack->r_ctl.rc_tlp_threshold = optval; else error = EINVAL; break; case TCP_RACK_PKT_DELAY: /* RACK added ms i.e. rack-rtt + reord + N */ RACK_OPTS_INC(tcp_rack_pkt_delay); rack->r_ctl.rc_pkt_delay = optval; break; case TCP_RACK_TLP_INC_VAR: /* Does TLP include rtt variance in t-o */ RACK_OPTS_INC(tcp_rack_tlp_inc_var); rack->r_ctl.rc_prr_inc_var = optval; break; case TCP_RACK_IDLE_REDUCE_HIGH: RACK_OPTS_INC(tcp_rack_idle_reduce_high); if (optval) rack->r_idle_reduce_largest = 1; else rack->r_idle_reduce_largest = 0; break; case TCP_DELACK: if (optval == 0) tp->t_delayed_ack = 0; else tp->t_delayed_ack = 1; if (tp->t_flags & TF_DELACK) { tp->t_flags &= ~TF_DELACK; tp->t_flags |= TF_ACKNOW; rack_output(tp); } break; case TCP_RACK_MIN_PACE: RACK_OPTS_INC(tcp_rack_min_pace); if (optval > 3) rack->r_enforce_min_pace = 3; else rack->r_enforce_min_pace = optval; break; case TCP_RACK_MIN_PACE_SEG: RACK_OPTS_INC(tcp_rack_min_pace_seg); if (optval >= 16) rack->r_min_pace_seg_thresh = 15; else rack->r_min_pace_seg_thresh = optval; break; case TCP_BBR_RACK_RTT_USE: if ((optval != USE_RTT_HIGH) && (optval != USE_RTT_LOW) && (optval != USE_RTT_AVG)) error = EINVAL; else rack->r_ctl.rc_rate_sample_method = optval; break; case TCP_DATA_AFTER_CLOSE: if (optval) rack->rc_allow_data_af_clo = 1; else rack->rc_allow_data_af_clo = 0; break; default: return (tcp_default_ctloutput(so, sopt, inp, tp)); break; } #ifdef NETFLIX_STATS tcp_log_socket_option(tp, sopt->sopt_name, optval, error); #endif INP_WUNLOCK(inp); return (error); } static int rack_get_sockopt(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack) { int32_t error, optval; /* * Because all our options are either boolean or an int, we can just * pull everything into optval and then unlock and copy. If we ever * add a option that is not a int, then this will have quite an * impact to this routine. 
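 */

/*
 * The lock/snapshot/unlock/copy pattern this comment describes, in
 * miniature and in userland terms (a pthread mutex stands in for the
 * inpcb lock; all names hypothetical).  The point is that every field
 * is read under the lock into a local, and the copy to the caller
 * happens with no lock held, exactly as sooptcopyout() is called below
 * only after INP_WUNLOCK().
 */
#include <pthread.h>

static int
getopt_int_snapshot(pthread_mutex_t *lk, const int *field, int *uval)
{
	int snapshot;

	pthread_mutex_lock(lk);
	snapshot = *field;	/* everything needed, read under the lock */
	pthread_mutex_unlock(lk);
	*uval = snapshot;	/* the "copyout" with no lock held */
	return (0);
}

/*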
*/ switch (sopt->sopt_name) { case TCP_RACK_PROP_RATE: optval = rack->r_ctl.rc_prop_rate; break; case TCP_RACK_PROP: /* RACK proportional rate reduction (bool) */ optval = rack->r_ctl.rc_prop_reduce; break; case TCP_RACK_TLP_REDUCE: /* RACK TLP cwnd reduction (bool) */ optval = rack->r_ctl.rc_tlp_cwnd_reduce; break; case TCP_RACK_EARLY_RECOV: /* Should recovery happen early (bool) */ optval = rack->r_ctl.rc_early_recovery; break; case TCP_RACK_PACE_REDUCE: /* RACK Hptsi reduction factor (divisor) */ optval = rack->rc_pace_reduce; break; case TCP_RACK_PACE_MAX_SEG: /* Max segments in a pace */ optval = rack->rc_pace_max_segs; break; case TCP_RACK_PACE_ALWAYS: /* Use the always pace method */ optval = rack->rc_always_pace; break; case TCP_RACK_PRR_SENDALOT: /* Allow PRR to send more than one seg */ optval = rack->r_ctl.rc_prr_sendalot; break; case TCP_RACK_MIN_TO: /* Minimum time between rack t-o's in ms */ optval = rack->r_ctl.rc_min_to; break; case TCP_RACK_EARLY_SEG: /* If early recovery max segments */ optval = rack->r_ctl.rc_early_recovery_segs; break; case TCP_RACK_REORD_THRESH: /* RACK reorder threshold (shift amount) */ optval = rack->r_ctl.rc_reorder_shift; break; case TCP_RACK_REORD_FADE: /* Does reordering fade after ms time */ optval = rack->r_ctl.rc_reorder_fade; break; case TCP_RACK_TLP_THRESH: /* RACK TLP theshold i.e. srtt+(srtt/N) */ optval = rack->r_ctl.rc_tlp_threshold; break; case TCP_RACK_PKT_DELAY: /* RACK added ms i.e. rack-rtt + reord + N */ optval = rack->r_ctl.rc_pkt_delay; break; case TCP_RACK_TLP_USE: optval = rack->rack_tlp_threshold_use; break; case TCP_RACK_TLP_INC_VAR: /* Does TLP include rtt variance in t-o */ optval = rack->r_ctl.rc_prr_inc_var; break; case TCP_RACK_IDLE_REDUCE_HIGH: optval = rack->r_idle_reduce_largest; break; case TCP_RACK_MIN_PACE: optval = rack->r_enforce_min_pace; break; case TCP_RACK_MIN_PACE_SEG: optval = rack->r_min_pace_seg_thresh; break; case TCP_BBR_RACK_RTT_USE: optval = rack->r_ctl.rc_rate_sample_method; break; case TCP_DELACK: optval = tp->t_delayed_ack; break; case TCP_DATA_AFTER_CLOSE: optval = rack->rc_allow_data_af_clo; break; default: return (tcp_default_ctloutput(so, sopt, inp, tp)); break; } INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); return (error); } static int rack_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp) { int32_t error = EINVAL; struct tcp_rack *rack; rack = (struct tcp_rack *)tp->t_fb_ptr; if (rack == NULL) { /* Huh? 
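 */

/*
 * Sketch of the direction dispatch rack_ctloutput() performs right
 * below: a single entry point fans out to a set/get pair.  The
 * function-table shape mirrors how the whole stack is wired into
 * tcp_function_block further down; names here are hypothetical.
 */
struct opt_ops {
	int	(*set)(void *obj, int name, int value);
	int	(*get)(void *obj, int name, int *value);
};

static int
opt_dispatch(const struct opt_ops *ops, void *obj, int is_set, int name,
    int *value)
{
	return (is_set ? ops->set(obj, name, *value) :
	    ops->get(obj, name, value));
}

/*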
*/ goto out; } if (sopt->sopt_dir == SOPT_SET) { return (rack_set_sockopt(so, sopt, inp, tp, rack)); } else if (sopt->sopt_dir == SOPT_GET) { return (rack_get_sockopt(so, sopt, inp, tp, rack)); } out: INP_WUNLOCK(inp); return (error); } struct tcp_function_block __tcp_rack = { .tfb_tcp_block_name = __XSTRING(STACKNAME), .tfb_tcp_output = rack_output, .tfb_tcp_do_segment = rack_do_segment, .tfb_tcp_hpts_do_segment = rack_hpts_do_segment, .tfb_tcp_ctloutput = rack_ctloutput, .tfb_tcp_fb_init = rack_init, .tfb_tcp_fb_fini = rack_fini, .tfb_tcp_timer_stop_all = rack_stopall, .tfb_tcp_timer_activate = rack_timer_activate, .tfb_tcp_timer_active = rack_timer_active, .tfb_tcp_timer_stop = rack_timer_stop, .tfb_tcp_rexmit_tmr = rack_remxt_tmr, .tfb_tcp_handoff_ok = rack_handoff_ok }; static const char *rack_stack_names[] = { __XSTRING(STACKNAME), #ifdef STACKALIAS __XSTRING(STACKALIAS), #endif }; static int rack_ctor(void *mem, int32_t size, void *arg, int32_t how) { memset(mem, 0, size); return (0); } static void rack_dtor(void *mem, int32_t size, void *arg) { } static bool rack_mod_inited = false; static int tcp_addrack(module_t mod, int32_t type, void *data) { int32_t err = 0; int num_stacks; switch (type) { case MOD_LOAD: rack_zone = uma_zcreate(__XSTRING(MODNAME) "_map", sizeof(struct rack_sendmap), rack_ctor, rack_dtor, NULL, NULL, UMA_ALIGN_PTR, 0); rack_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb", sizeof(struct tcp_rack), rack_ctor, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); sysctl_ctx_init(&rack_sysctl_ctx); rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx, SYSCTL_STATIC_CHILDREN(_net_inet_tcp), OID_AUTO, __XSTRING(STACKNAME), CTLFLAG_RW, 0, ""); if (rack_sysctl_root == NULL) { printf("Failed to add sysctl node\n"); err = EFAULT; goto free_uma; } rack_init_sysctls(); num_stacks = nitems(rack_stack_names); err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK, rack_stack_names, &num_stacks); if (err) { printf("Failed to register %s stack name for " "%s module\n", rack_stack_names[num_stacks], __XSTRING(MODNAME)); sysctl_ctx_free(&rack_sysctl_ctx); free_uma: uma_zdestroy(rack_zone); uma_zdestroy(rack_pcb_zone); rack_counter_destroy(); printf("Failed to register rack module -- err:%d\n", err); return (err); } rack_mod_inited = true; break; case MOD_QUIESCE: err = deregister_tcp_functions(&__tcp_rack, true, false); break; case MOD_UNLOAD: err = deregister_tcp_functions(&__tcp_rack, false, true); if (err == EBUSY) break; if (rack_mod_inited) { uma_zdestroy(rack_zone); uma_zdestroy(rack_pcb_zone); sysctl_ctx_free(&rack_sysctl_ctx); rack_counter_destroy(); rack_mod_inited = false; } err = 0; break; default: return (EOPNOTSUPP); } return (err); } static moduledata_t tcp_rack = { .name = __XSTRING(MODNAME), .evhand = tcp_addrack, .priv = 0 }; MODULE_VERSION(MODNAME, 1); DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1); Index: head/sys/netinet/tcp_stacks/rack_bbr_common.h =================================================================== --- head/sys/netinet/tcp_stacks/rack_bbr_common.h (revision 343754) +++ head/sys/netinet/tcp_stacks/rack_bbr_common.h (revision 343755) @@ -1,70 +1,68 @@ #ifndef __pacer_timer_h__ #define __pacer_timer_h__ /*- - * Copyright (c) 2017 - * Netflix Inc. - * All rights reserved. + * Copyright (c) 2017 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * __FBSDID("$FreeBSD$"); */ /* Common defines and such used by both RACK and BBR */ /* Special values for mss accounting array */ #define TCP_MSS_ACCT_JUSTRET 0 #define TCP_MSS_ACCT_SNDACK 1 #define TCP_MSS_ACCT_PERSIST 2 #define TCP_MSS_ACCT_ATIMER 60 #define TCP_MSS_ACCT_INPACE 61 #define TCP_MSS_ACCT_LATE 62 #define TCP_MSS_SMALL_SIZE_OFF 63 /* Point where small sizes enter */ #define TCP_MSS_ACCT_SIZE 70 #define TCP_MSS_SMALL_MAX_SIZE_DIV (TCP_MSS_ACCT_SIZE - TCP_MSS_SMALL_SIZE_OFF) /* Magic flags to tell whats cooking on the pacing wheel */ #define PACE_PKT_OUTPUT 0x01 /* Output Packets being paced */ #define PACE_TMR_RACK 0x02 /* RACK timer running */ #define PACE_TMR_TLP 0x04 /* TLP timer running */ #define PACE_TMR_RXT 0x08 /* Retransmit timer running */ #define PACE_TMR_PERSIT 0x10 /* Persists timer running */ #define PACE_TMR_KEEP 0x20 /* Keep alive timer running */ #define PACE_TMR_DELACK 0x40 /* Delayed ack timer running */ #define PACE_TMR_MASK (PACE_TMR_KEEP|PACE_TMR_PERSIT|PACE_TMR_RXT|PACE_TMR_TLP|PACE_TMR_RACK|PACE_TMR_DELACK) /* Magic flags for tracing progress events */ #define PROGRESS_DROP 1 #define PROGRESS_UPDATE 2 #define PROGRESS_CLEAR 3 #define PROGRESS_START 4 /* RTT sample methods */ #define USE_RTT_HIGH 0 #define USE_RTT_LOW 1 #define USE_RTT_AVG 2 #ifdef _KERNEL /* We have only 7 bits in rack so assert its true */ CTASSERT((PACE_TMR_MASK & 0x80) == 0); #endif #endif Index: head/sys/netinet/tcp_stacks/sack_filter.c =================================================================== --- head/sys/netinet/tcp_stacks/sack_filter.c (revision 343754) +++ head/sys/netinet/tcp_stacks/sack_filter.c (revision 343755) @@ -1,706 +1,704 @@ /*- - * Copyright (c) 2017 - * Netflix Inc. - * All rights reserved. + * Copyright (c) 2017 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #ifndef _KERNEL #include #include #include #include #include #include #include #endif #include "sack_filter.h" /* * The sack filter is used to filter out sacks * that have already been processed. The idea * is pretty simple really, consider two sacks * * SACK 1 * cum-ack A * sack B - C * SACK 2 * cum-ack A * sack D - E * sack B - C * * The previous sack information (B-C) is repeated * in SACK 2. If the receiver gets SACK 1 and then * SACK 2 then any work associated with B-C has already * been completed. This only matters where we may have * (as in bbr or rack) cases where we walk a linked list. * * Now the utility tries to keep everything in a single * cache line. This means that it's not perfect and * it could be that sacks arrive so large that a * "remembered" processed sack falls off the list and * so gets re-processed. That's ok, it just means we * did some extra work. We could of course take more * cache line hits by expanding the size of this * structure, but then that would cost more. */ #ifndef _KERNEL int detailed_dump = 0; uint64_t cnt_skipped_oldsack = 0; uint64_t cnt_used_oldsack = 0; int highest_used=0; int over_written=0; int empty_avail=0; int no_collapse = 0; FILE *out = NULL; FILE *in = NULL; #endif #define sack_blk_used(sf, i) ((1 << i) & sf->sf_bits) #define sack_blk_set(sf, i) ((1 << i) | sf->sf_bits) #define sack_blk_clr(sf, i) (~(1 << i) & sf->sf_bits) #ifndef _KERNEL static #endif void sack_filter_clear(struct sack_filter *sf, tcp_seq seq) { sf->sf_ack = seq; sf->sf_bits = 0; sf->sf_cur = 0; sf->sf_used = 0; } /* * Given a previous sack filter block, filter out * any entries where the cum-ack moves over them * fully or partially. */ static void sack_filter_prune(struct sack_filter *sf, tcp_seq th_ack) { int32_t i; /* start with the oldest */ for (i = 0; i < SACK_FILTER_BLOCKS; i++) { if (sack_blk_used(sf, i)) { if (SEQ_GT(th_ack, sf->sf_blks[i].end)) { /* This block is consumed */ sf->sf_bits = sack_blk_clr(sf, i); sf->sf_used--; } else if (SEQ_GT(th_ack, sf->sf_blks[i].start)) { /* Some of it is acked */ sf->sf_blks[i].start = th_ack; /* We could in theory break here, but * there are some broken implementations * that send multiple blocks. We want * to catch them all with similar seq's. */ } } } sf->sf_ack = th_ack; } /* * Return true if you find that * the sackblock b is on the scoreboard. * Update it along the way * if part of it is on the board.
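 */

/*
 * A userland-style sketch of the board tests used below (in the kernel
 * tcp_seq and the SEQ_* macros come from tcp_seq.h; hypothetical _X
 * variants are restated here so the sketch stands alone).  All
 * comparisons are on 32-bit sequence numbers, so they must use signed
 * differences to stay correct across the 2^32 wrap.
 */
typedef uint32_t tcp_seq_x;	/* hypothetical stand-in for tcp_seq */
#define	SEQ_LEQ_X(a, b)	((int32_t)((a) - (b)) <= 0)
#define	SEQ_GEQ_X(a, b)	((int32_t)((a) - (b)) >= 0)

struct blk_x { tcp_seq_x start, end; };	/* half-open [start, end) */

/* "Rule 1" below in isolation: the board block swallows the new one. */
static int
blk_contains(const struct blk_x *board, const struct blk_x *b)
{
	return (SEQ_LEQ_X(board->start, b->start) &&
	    SEQ_GEQ_X(board->end, b->end));
}

/*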
*/ static int32_t is_sack_on_board(struct sack_filter *sf, struct sackblk *b) { int32_t i, cnt; for (i = sf->sf_cur, cnt=0; cnt < SACK_FILTER_BLOCKS; cnt++) { if (sack_blk_used(sf, i)) { if (SEQ_LT(b->start, sf->sf_ack)) { /* Behind cum-ack update */ b->start = sf->sf_ack; } if (SEQ_LT(b->end, sf->sf_ack)) { /* End back behind too */ b->end = sf->sf_ack; } if (b->start == b->end) return(1); /* Jonathans Rule 1 */ if (SEQ_LEQ(sf->sf_blks[i].start, b->start) && SEQ_GEQ(sf->sf_blks[i].end, b->end)) { /** * Our board has this entirely in * whole or in part: * * board |-------------| * sack |-------------| * * board |-------------| * sack |----| * */ return(1); } /* Jonathans Rule 2 */ if(SEQ_LT(sf->sf_blks[i].end, b->start)) { /** * Not near each other: * * board |---| * sack |---| */ goto nxt_blk; } /* Jonathans Rule 3 */ if (SEQ_GT(sf->sf_blks[i].start, b->end)) { /** * Not near each other: * * board |---| * sack |---| */ goto nxt_blk; } if (SEQ_LEQ(sf->sf_blks[i].start, b->start)) { /** * The board block partial meets: * * board |--------| * sack |----------| * * board |--------| * sack |--------------| * * up with this one (we have part of it). * 1) Update the board block to the new end * and * 2) Update the start of this block to my end. */ b->start = sf->sf_blks[i].end; sf->sf_blks[i].end = b->end; goto nxt_blk; } if (SEQ_GEQ(sf->sf_blks[i].end, b->end)) { /** * The board block partial meets: * * board |--------| * sack |----------| * * board |----| * sack |----------| * 1) Update the board block to the new start * and * 2) Update the start of this block to my end. */ b->end = sf->sf_blks[i].start; sf->sf_blks[i].start = b->start; goto nxt_blk; } } nxt_blk: i++; i %= SACK_FILTER_BLOCKS; } /* Did we totally consume it in pieces? */ if (b->start != b->end) return(0); else return(1); } static int32_t sack_filter_old(struct sack_filter *sf, struct sackblk *in, int numblks) { int32_t num, i; struct sackblk blkboard[TCP_MAX_SACK]; /* * An old sack has arrived. It may contain data * we do not have. We might not have it since * we could have had a lost ack we might have the * entire thing on our current board. We want to prune * off anything we have. With this function though we * won't add to the board. */ for( i = 0, num = 0; isf_blks[i], &sf->sf_blks[idx], sizeof(struct sackblk)); sf->sf_bits = sack_blk_clr(sf, idx); sf->sf_bits = sack_blk_set(sf, i); return; } i++; i %= SACK_FILTER_BLOCKS; } } static int32_t sack_filter_new(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_seq th_ack) { struct sackblk blkboard[TCP_MAX_SACK]; int32_t num, i; /* * First lets trim the old and possibly * throw any away we have. */ for(i=0, num=0; i=0; i--) { if (is_sack_on_board(sf, &blkboard[i])) continue; /* Add this guy its not listed */ sf->sf_cur++; sf->sf_cur %= SACK_FILTER_BLOCKS; if ((sack_blk_used(sf, sf->sf_cur)) && (sf->sf_used < SACK_FILTER_BLOCKS)) { sack_move_to_empty(sf, sf->sf_cur); } #ifndef _KERNEL if (sack_blk_used(sf, sf->sf_cur)) { over_written++; if (sf->sf_used < SACK_FILTER_BLOCKS) empty_avail++; } #endif memcpy(&sf->sf_blks[sf->sf_cur], &in[i], sizeof(struct sackblk)); if (sack_blk_used(sf, sf->sf_cur) == 0) { sf->sf_used++; #ifndef _KERNEL if (sf->sf_used > highest_used) highest_used = sf->sf_used; #endif sf->sf_bits = sack_blk_set(sf, sf->sf_cur); } } return(numblks); } /* * Given a sack block on the board (the skip index) see if * any other used entries overlap or meet, if so return the index. 
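 */

/*
 * The overlap test below, reduced to its core: two blocks overlap or
 * meet unless one lies strictly beyond the other.  This reuses the
 * blk_x sketch types from the aside above; with end being one past the
 * last sequence, a->end == b->start counts as "meeting", which is
 * exactly what makes two blocks collapsible into one.
 */
#define	SEQ_LT_X(a, b)	((int32_t)((a) - (b)) < 0)
#define	SEQ_GT_X(a, b)	((int32_t)((a) - (b)) > 0)

static int
blk_overlaps_or_meets(const struct blk_x *a, const struct blk_x *b)
{
	return (!(SEQ_LT_X(a->end, b->start) || SEQ_GT_X(a->start, b->end)));
}

/*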
*/ static int32_t sack_blocks_overlap_or_meet(struct sack_filter *sf, struct sackblk *sb, uint32_t skip) { int32_t i; for(i=0; isf_blks[i].end, sb->start) && SEQ_LEQ(sf->sf_blks[i].end, sb->end) && SEQ_LEQ(sf->sf_blks[i].start, sb->start)) { /** * The two board blocks meet: * * board1 |--------| * board2 |----------| * * board1 |--------| * board2 |--------------| * * board1 |--------| * board2 |--------| */ return(i); } if (SEQ_LEQ(sf->sf_blks[i].start, sb->end) && SEQ_GEQ(sf->sf_blks[i].start, sb->start) && SEQ_GEQ(sf->sf_blks[i].end, sb->end)) { /** * The board block partial meets: * * board |--------| * sack |----------| * * board |----| * sack |----------| * 1) Update the board block to the new start * and * 2) Update the start of this block to my end. */ return(i); } } return (-1); } /* * Collapse entry src into entry into * and free up the src entry afterwards. */ static void sack_collapse(struct sack_filter *sf, int32_t src, int32_t into) { if (SEQ_LT(sf->sf_blks[src].start, sf->sf_blks[into].start)) { /* src has a lower starting point */ sf->sf_blks[into].start = sf->sf_blks[src].start; } if (SEQ_GT(sf->sf_blks[src].end, sf->sf_blks[into].end)) { /* src has a higher ending point */ sf->sf_blks[into].end = sf->sf_blks[src].end; } sf->sf_bits = sack_blk_clr(sf, src); sf->sf_used--; } static void sack_board_collapse(struct sack_filter *sf) { int32_t i, j, i_d, j_d; for(i=0; isf_blks[i], i); if (j == -1) { /* No overlap */ continue; } /* * Ok j and i overlap with each other, collapse the * one out furthest away from the current position. */ if (sf->sf_cur > i) i_d = sf->sf_cur - i; else i_d = i - sf->sf_cur; if (sf->sf_cur > j) j_d = sf->sf_cur - j; else j_d = j - sf->sf_cur; if (j_d > i_d) { sack_collapse(sf, j, i); } else sack_collapse(sf, i, j); } } #ifndef _KERNEL static #endif int sack_filter_blks(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_seq th_ack) { int32_t i, ret; if (numblks > TCP_MAX_SACK) { panic("sf:%p sb:%p Impossible number of sack blocks %d > 4\n", sf, in, numblks); return(numblks); } if ((sf->sf_used == 0) && numblks) { /* * We are brand new add the blocks in * reverse order. Note we can see more * than one in new, since ack's could be lost. 
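 */

/*
 * What sack_collapse() above keeps when two board blocks touch: the
 * union of the two intervals, chosen with modular comparisons.  A
 * sketch over the blk_x types from the earlier aside:
 */
static void
blk_merge(struct blk_x *into, const struct blk_x *src)
{
	if (SEQ_LT_X(src->start, into->start))
		into->start = src->start;	/* earlier left edge wins */
	if (SEQ_GT_X(src->end, into->end))
		into->end = src->end;		/* later right edge wins */
}

/*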
*/ sf->sf_ack = th_ack; for(i=(numblks-1), sf->sf_cur=0; i >= 0; i--) { memcpy(&sf->sf_blks[sf->sf_cur], &in[i], sizeof(struct sackblk)); sf->sf_bits = sack_blk_set(sf, sf->sf_cur); sf->sf_cur++; sf->sf_cur %= SACK_FILTER_BLOCKS; sf->sf_used++; #ifndef _KERNEL if (sf->sf_used > highest_used) highest_used = sf->sf_used; #endif } if (sf->sf_cur) sf->sf_cur--; return(numblks); } if (SEQ_GT(th_ack, sf->sf_ack)) { sack_filter_prune(sf, th_ack); } if (numblks) { if (SEQ_GEQ(th_ack, sf->sf_ack)) { ret = sack_filter_new(sf, in, numblks, th_ack); } else { ret = sack_filter_old(sf, in, numblks); } } else ret = 0; #ifndef _KERNEL if ((sf->sf_used > 1) && (no_collapse == 0)) sack_board_collapse(sf); #else if (sf->sf_used > 1) sack_board_collapse(sf); #endif return (ret); } #ifndef _KERNEL uint64_t saved=0; uint64_t tot_sack_blks=0; static void sack_filter_dump(FILE *out, struct sack_filter *sf) { int i; fprintf(out, " sf_ack:%u sf_bits:0x%x c:%d used:%d\n", sf->sf_ack, sf->sf_bits, sf->sf_cur, sf->sf_used); for(i=0; isf_blks[i].start, sf->sf_blks[i].end); } } } int main(int argc, char **argv) { char buffer[512]; struct sackblk blks[TCP_MAX_SACK]; FILE *err; tcp_seq th_ack, snd_una; struct sack_filter sf; int32_t numblks,i; int snd_una_set=0; double a, b, c; int invalid_sack_print = 0; uint32_t chg_remembered=0; uint32_t sack_chg=0; char line_buf[10][256]; int line_buf_at=0; in = stdin; out = stdout; while ((i = getopt(argc, argv, "ndIi:o:?h")) != -1) { switch (i) { case 'n': no_collapse = 1; break; case 'd': detailed_dump = 1; break; case'I': invalid_sack_print = 1; break; case 'i': in = fopen(optarg, "r"); if (in == NULL) { fprintf(stderr, "Fatal error can't open %s for input\n", optarg); exit(-1); } break; case 'o': out = fopen(optarg, "w"); if (out == NULL) { fprintf(stderr, "Fatal error can't open %s for output\n", optarg); exit(-1); } break; default: case '?': case 'h': fprintf(stderr, "Use %s [ -i infile -o outfile -I]\n", argv[0]); return(0); break; }; } sack_filter_clear(&sf, 0); memset(buffer, 0, sizeof(buffer)); memset(blks, 0, sizeof(blks)); numblks = 0; fprintf(out, "************************************\n"); while (fgets(buffer, sizeof(buffer), in) != NULL) { sprintf(line_buf[line_buf_at], "%s", buffer); line_buf_at++; if (strncmp(buffer, "QUIT", 4) == 0) { break; } else if (strncmp(buffer, "DONE", 4) == 0) { int nn, ii; if (numblks) { uint32_t szof, tot_chg; for(ii=0; ii chg_remembered)){ fprintf(out,"***WARNING WILL RODGERS DANGER!! 
sack_chg:%u last:%u\n", sack_chg, chg_remembered ); } sack_chg = chg_remembered = 0; } else if (strncmp(buffer, "RXT", 3) == 0) { sack_filter_clear(&sf, snd_una); } else if (strncmp(buffer, "ACK:", 4) == 0) { th_ack = strtoul(&buffer[4], NULL, 0); if (snd_una_set == 0) { snd_una = th_ack; snd_una_set = 1; } else if (SEQ_GT(th_ack, snd_una)) { snd_una = th_ack; } } else if (strncmp(buffer, "EXIT", 4) == 0) { sack_filter_clear(&sf, snd_una); sack_chg = chg_remembered = 0; } else if (strncmp(buffer, "SACK:", 5) == 0) { char *end=NULL; uint32_t start; uint32_t endv; start = strtoul(&buffer[5], &end, 0); if (end) { endv = strtoul(&end[1], NULL, 0); } else { fprintf(out, "--Sack invalid skip 0 start:%u : ??\n", start); continue; } if (SEQ_LT(endv, start)) { fprintf(out, "--Sack invalid skip 1 endv:%u < start:%u\n", endv, start); continue; } if (numblks == TCP_MAX_SACK) { fprintf(out, "--Exceeded max %d\n", numblks); exit(0); } blks[numblks].start = start; blks[numblks].end = endv; numblks++; } memset(buffer, 0, sizeof(buffer)); } if (in != stdin) { fclose(in); } if (out != stdout) { fclose(out); } a = saved * 100.0; b = tot_sack_blks * 1.0; if (b > 0.0) c = a/b; else c = 0.0; if (out != stdout) err = stdout; else err = stderr; fprintf(err, "Saved %lu sack blocks out of %lu (%2.3f%%) old_skip:%lu old_usd:%lu high_cnt:%d ow:%d ea:%d\n", saved, tot_sack_blks, c, cnt_skipped_oldsack, cnt_used_oldsack, highest_used, over_written, empty_avail); return(0); } #endif Index: head/sys/netinet/tcp_stacks/sack_filter.h =================================================================== --- head/sys/netinet/tcp_stacks/sack_filter.h (revision 343754) +++ head/sys/netinet/tcp_stacks/sack_filter.h (revision 343755) @@ -1,58 +1,56 @@ #ifndef __sack_filter_h__ #define __sack_filter_h__ /*- - * Copyright (c) 2017 - * Netflix Inc. - * All rights reserved. + * Copyright (c) 2017 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * __FBSDID("$FreeBSD$"); */ /* * Seven entry's is carefully choosen to * fit in one cache line. We can easily * change this to 15 (but it gets very * little extra filtering). To change it * to be larger than 15 would require either * sf_bits becoming a uint32_t and then you * could go to 31.. or change it to a full * bitstring.. 
It is really doubtful you * will get much benefit beyond 7; in testing * the additional filtering was very small. */ #define SACK_FILTER_BLOCKS 7 struct sack_filter { tcp_seq sf_ack; uint16_t sf_bits; uint8_t sf_cur; uint8_t sf_used; struct sackblk sf_blks[SACK_FILTER_BLOCKS]; }; #ifdef _KERNEL void sack_filter_clear(struct sack_filter *sf, tcp_seq seq); int sack_filter_blks(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_seq th_ack); #endif #endif Index: head/sys/netinet/tcp_stacks/tcp_rack.h =================================================================== --- head/sys/netinet/tcp_stacks/tcp_rack.h (revision 343754) +++ head/sys/netinet/tcp_stacks/tcp_rack.h (revision 343755) @@ -1,321 +1,320 @@ /*- - * Copyright (c) 2016 - * Netflix Inc. All rights reserved. + * Copyright (c) 2016 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
* * $FreeBSD$ */ #ifndef _NETINET_TCP_RACK_H_ #define _NETINET_TCP_RACK_H_ #define RACK_ACKED 0x0001/* The remote endpoint acked this */ #define RACK_TO_MIXED 0x0002/* A timeout occured that mixed the send order */ #define RACK_DEFERRED 0x0004/* We can't use this for RTT calc */ #define RACK_OVERMAX 0x0008/* We have more retran's then we can fit */ #define RACK_SACK_PASSED 0x0010/* A sack was done above this block */ #define RACK_WAS_SACKPASS 0x0020/* We retransmitted due to SACK pass */ #define RACK_HAS_FIN 0x0040/* segment is sent with fin */ #define RACK_TLP 0x0080/* segment sent as tail-loss-probe */ #define RACK_NUM_OF_RETRANS 3 #define RACK_INITIAL_RTO 1000 /* 1 second in milli seconds */ struct rack_sendmap { TAILQ_ENTRY(rack_sendmap) r_next; /* seq number arrayed next */ TAILQ_ENTRY(rack_sendmap) r_tnext; /* Time of transmit based next */ uint32_t r_tim_lastsent[RACK_NUM_OF_RETRANS]; uint32_t r_start; /* Sequence number of the segment */ uint32_t r_end; /* End seq, this is 1 beyond actually */ uint32_t r_rtr_bytes; /* How many bytes have been retransmitted */ uint16_t r_rtr_cnt; /* Retran count, index this -1 to get time * sent */ uint8_t r_flags; /* Flags as defined above */ uint8_t r_sndcnt; /* Retran count, not limited by * RACK_NUM_OF_RETRANS */ uint8_t r_in_tmap; /* Flag to see if its in the r_tnext array */ uint8_t r_resv[3]; }; TAILQ_HEAD(rack_head, rack_sendmap); /* * We use the rate sample structure to * assist in single sack/ack rate and rtt * calculation. In the future we will expand * this in BBR to do forward rate sample * b/w estimation. */ #define RACK_RTT_EMPTY 0x00000001 /* Nothing yet stored in RTT's */ #define RACK_RTT_VALID 0x00000002 /* We have at least one valid RTT */ struct rack_rtt_sample { uint32_t rs_flags; uint32_t rs_rtt_lowest; uint32_t rs_rtt_highest; uint32_t rs_rtt_cnt; uint64_t rs_rtt_tot; }; #define RACK_LOG_TYPE_ACK 0x01 #define RACK_LOG_TYPE_OUT 0x02 #define RACK_LOG_TYPE_TO 0x03 #define RACK_LOG_TYPE_ALLOC 0x04 #define RACK_LOG_TYPE_FREE 0x05 struct rack_log { union { struct rack_sendmap *rsm; /* For alloc/free */ uint64_t sb_acc;/* For out/ack or t-o */ }; uint32_t th_seq; uint32_t th_ack; uint32_t snd_una; uint32_t snd_nxt; /* th_win for TYPE_ACK */ uint32_t snd_max; uint32_t blk_start[4]; uint32_t blk_end[4]; uint8_t type; uint8_t n_sackblks; uint16_t len; /* Timeout T3=1, TLP=2, RACK=3 */ }; /* * Magic numbers for logging timeout events if the * logging is enabled. 
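 */

/*
 * A small aside on the map entries above: r_start/r_end form a
 * half-open interval [start, end), so the segment length is a plain
 * modular subtraction and back-to-back entries share a boundary
 * sequence without overlapping.  A minimal illustrative helper:
 */
static inline uint32_t
rsm_seg_len(uint32_t r_start, uint32_t r_end)
{
	return (r_end - r_start);	/* correct even across 2^32 wrap */
}

/*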
*/ #define RACK_TO_FRM_TMR 1 #define RACK_TO_FRM_TLP 2 #define RACK_TO_FRM_RACK 3 #define RACK_TO_FRM_KEEP 4 #define RACK_TO_FRM_PERSIST 5 #define RACK_TO_FRM_DELACK 6 struct rack_opts_stats { uint64_t tcp_rack_prop_rate; uint64_t tcp_rack_prop; uint64_t tcp_rack_tlp_reduce; uint64_t tcp_rack_early_recov; uint64_t tcp_rack_pace_always; uint64_t tcp_rack_pace_reduce; uint64_t tcp_rack_max_seg; uint64_t tcp_rack_prr_sendalot; uint64_t tcp_rack_min_to; uint64_t tcp_rack_early_seg; uint64_t tcp_rack_reord_thresh; uint64_t tcp_rack_reord_fade; uint64_t tcp_rack_tlp_thresh; uint64_t tcp_rack_pkt_delay; uint64_t tcp_rack_tlp_inc_var; uint64_t tcp_tlp_use; uint64_t tcp_rack_idle_reduce; uint64_t tcp_rack_idle_reduce_high; uint64_t rack_no_timer_in_hpts; uint64_t tcp_rack_min_pace_seg; uint64_t tcp_rack_min_pace; }; #define TLP_USE_ID 1 /* Internet draft behavior */ #define TLP_USE_TWO_ONE 2 /* Use 2.1 behavior */ #define TLP_USE_TWO_TWO 3 /* Use 2.2 behavior */ #ifdef _KERNEL #define RACK_OPTS_SIZE (sizeof(struct rack_opts_stats)/sizeof(uint64_t)) extern counter_u64_t rack_opts_arry[RACK_OPTS_SIZE]; #define RACK_OPTS_ADD(name, amm) counter_u64_add(rack_opts_arry[(offsetof(struct rack_opts_stats, name)/sizeof(uint64_t))], (amm)) #define RACK_OPTS_INC(name) RACK_OPTS_ADD(name, 1) #endif /* * As we get each SACK we wade through the * rc_map and mark off what is acked. * We also increment rc_sacked as well. * * We also pay attention to missing entries * based on the time and possibly mark them * for retransmit. If we do and we are not already * in recovery we enter recovery. In doing * so we claer prr_delivered/holes_rxt and prr_sent_dur_rec. * We also setup rc_next/rc_snd_nxt/rc_send_end so * we will know where to send from. When not in * recovery rc_next will be NULL and rc_snd_nxt should * equal snd_max. * * Whenever we retransmit from recovery we increment * rc_holes_rxt as we retran a block and mark it as retransmitted * with the time it was sent. During non-recovery sending we * add to our map and note the time down of any send expanding * the rc_map at the tail and moving rc_snd_nxt up with snd_max. * * In recovery during SACK/ACK processing if a chunk has * been retransmitted and it is now acked, we decrement rc_holes_rxt. * When we retransmit from the scoreboard we use * rc_next and rc_snd_nxt/rc_send_end to help us * find what needs to be retran. * * To calculate pipe we simply take (snd_max - snd_una) + rc_holes_rxt * This gets us the effect of RFC6675 pipe, counting twice for * bytes retransmitted. */ #define TT_RACK_FR_TMR 0x2000 /* * Locking for the rack control block. * a) Locked by INP_WLOCK * b) Locked by the hpts-mutex * */ struct rack_control { /* Second cache line 0x40 from tcp_rack */ struct rack_head rc_map;/* List of all segments Lock(a) */ struct rack_head rc_tmap; /* List in transmit order Lock(a) */ struct rack_sendmap *rc_tlpsend; /* Remembered place for * tlp_sending Lock(a) */ struct rack_sendmap *rc_resend; /* something we have been asked to * resend */ uint32_t rc_hpts_flags; uint32_t rc_timer_exp; /* If a timer ticks of expiry */ uint32_t rc_rack_min_rtt; /* lowest RTT seen Lock(a) */ uint32_t rc_rack_largest_cwnd; /* Largest CWND we have seen Lock(a) */ /* Third Cache line 0x80 */ struct rack_head rc_free; /* Allocation array */ uint32_t rc_time_last_sent; /* Time we last sent some data and * logged it Lock(a). 
*/ uint32_t rc_reorder_ts; /* Last time we saw reordering Lock(a) */ uint32_t rc_tlp_new_data; /* we need to send new-data on a TLP * Lock(a) */ uint32_t rc_prr_out; /* bytes sent during recovery Lock(a) */ uint32_t rc_prr_recovery_fs; /* recovery fs point Lock(a) */ uint32_t rc_prr_sndcnt; /* Prr sndcnt Lock(a) */ uint32_t rc_sacked; /* Tot sacked on scoreboard Lock(a) */ uint32_t rc_last_tlp_seq; /* Last tlp sequence Lock(a) */ uint32_t rc_prr_delivered; /* during recovery prr var Lock(a) */ uint16_t rc_tlp_send_cnt; /* Number of TLP sends we have done * since peer spoke to us Lock(a) */ uint16_t rc_tlp_seg_send_cnt; /* Number of times we have TLP sent * rc_last_tlp_seq Lock(a) */ uint32_t rc_loss_count; /* During recovery how many segments were lost * Lock(a) */ uint32_t rc_reorder_fade; /* Socket option value Lock(a) */ /* Fourth cache line 0xc0 */ /* Times */ uint32_t rc_rack_tmit_time; /* Rack transmit time Lock(a) */ uint32_t rc_holes_rxt; /* Tot retransmitted from scoreboard Lock(a) */ /* Variables to track bad retransmits and recover */ uint32_t rc_rsm_start; /* RSM seq number we retransmitted Lock(a) */ uint32_t rc_cwnd_at; /* cwnd at the retransmit Lock(a) */ uint32_t rc_ssthresh_at; /* ssthresh at the retransmit Lock(a) */ uint32_t rc_num_maps_alloced; /* Number of map blocks (sacks) we * have allocated */ uint32_t rc_rcvtime; /* When we last received data */ uint32_t rc_notused; uint32_t rc_last_output_to; uint32_t rc_went_idle_time; struct rack_sendmap *rc_sacklast; /* sack remembered place * Lock(a) */ struct rack_sendmap *rc_next; /* remembered place where we next * retransmit at Lock(a) */ struct rack_sendmap *rc_rsm_at_retran; /* Debug variable kept for * cache line alignment * Lock(a) */ /* Cache line split 0x100 */ struct sack_filter rack_sf; /* Cache line split 0x140 */ /* Flags for various things */ struct rack_rtt_sample rack_rs; uint32_t rc_tlp_threshold; /* Socket option value Lock(a) */ uint16_t rc_early_recovery_segs; /* Socket option value Lock(a) */ uint16_t rc_reorder_shift; /* Socket option value Lock(a) */ uint16_t rc_pkt_delay; /* Socket option value Lock(a) */ uint8_t rc_prop_rate; /* Socket option value Lock(a) */ uint8_t rc_prop_reduce; /* Socket option value Lock(a) */ uint8_t rc_tlp_cwnd_reduce; /* Socket option value Lock(a) */ uint8_t rc_early_recovery; /* Socket option value Lock(a) */ uint8_t rc_prr_sendalot; /* Socket option value Lock(a) */ uint8_t rc_min_to; /* Socket option value Lock(a) */ uint8_t rc_prr_inc_var; /* Socket option value Lock(a) */ uint8_t rc_tlp_rtx_out; /* This is TLPRtxOut in the draft */ uint8_t rc_rate_sample_method; }; #ifdef _KERNEL struct tcp_rack { /* First cache line 0x00 */ TAILQ_ENTRY(tcp_rack) r_hpts; /* hptsi queue next Lock(b) */ int32_t(*r_substate) (struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, struct tcpopt *, int32_t, int32_t, uint32_t, int, int); /* Lock(a) */ struct tcpcb *rc_tp; /* The tcpcb Lock(a) */ struct inpcb *rc_inp; /* The inpcb Lock(a) */ uint32_t rc_free_cnt; /* Number of free entries on the rc_free list * Lock(a) */ uint32_t rc_rack_rtt; /* RACK-RTT Lock(a) */ uint16_t r_wanted_output; /* Output routine wanted to be called */ uint16_t r_cpu; /* CPU that the INP is running on Lock(a) */ uint16_t rc_pace_max_segs; /* Socket option value Lock(a) */ uint16_t rc_pace_reduce; /* Socket option value Lock(a) */ uint8_t r_state; /* Current rack state Lock(a) */ uint8_t rc_tmr_stopped : 7, t_timers_stopped : 1; uint8_t rc_enobuf; /* count of enobufs on connection provides * backoff Lock(a) */
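/* Small per-connection flags and counters, packed into bit fields: */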
uint8_t r_timer_override : 1, /* hpts override Lock(a) */ r_tlp_running : 1, /* Running from a TLP timeout Lock(a) */ r_is_v6 : 1, /* V6 pcb Lock(a) */ rc_in_persist : 1, rc_last_pto_set : 1, /* XXX not used */ rc_tlp_in_progress : 1, rc_always_pace : 1, /* Socket option value Lock(a) */ rc_timer_up : 1; /* The rack timer is up flag Lock(a) */ uint8_t r_idle_reduce_largest : 1, r_enforce_min_pace : 2, r_min_pace_seg_thresh : 5; uint8_t rack_tlp_threshold_use; uint8_t rc_allow_data_af_clo : 1, delayed_ack : 1, rc_avail : 6; uint8_t r_resv[2]; /* Fill to cache line boundary */ /* Cache line 2 0x40 */ struct rack_control r_ctl; } __aligned(CACHE_LINE_SIZE); #endif #endif Index: head/sys/sys/boot.h =================================================================== --- head/sys/sys/boot.h (revision 343754) +++ head/sys/sys/boot.h (revision 343755) @@ -1,42 +1,42 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * - * Copyright (c) 2018 Netflix * Copyright (c) 2014 Roger Pau Monné * All rights reserved. + * Copyright (c) 2018 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SYS_BOOT_H_ #define _SYS_BOOT_H_ int boot_env_to_howto(void); void boot_howto_to_env(int howto); int boot_parse_arg(char *v); int boot_parse_cmdline_delim(char *cmdline, const char *delim); int boot_parse_cmdline(char *cmdline); int boot_parse_args(int argc, char *argv[]); #endif /* !_SYS_BOOT_H_ */ Index: head/sys/sys/efiio.h =================================================================== --- head/sys/sys/efiio.h (revision 343754) +++ head/sys/sys/efiio.h (revision 343755) @@ -1,59 +1,58 @@ /*- * Copyright (c) 2016 Netflix, Inc. - * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution.
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SYS_EFIIO_H_ #define _SYS_EFIIO_H_ #include #include #include struct efi_get_table_ioc { struct uuid uuid; /* UUID to look up */ void *ptr; /* Pointer to table in KVA space */ }; struct efi_var_ioc { efi_char *name; /* User pointer to name, in wide chars */ size_t namesize; /* Number of wide characters in name */ struct uuid vendor; /* Vendor's UUID for variable */ uint32_t attrib; /* Attributes */ void *data; /* User pointer to the data */ size_t datasize; /* Number of *bytes* in the data */ }; #define EFIIOC_GET_TABLE _IOWR('E', 1, struct efi_get_table_ioc) #define EFIIOC_GET_TIME _IOR('E', 2, struct efi_tm) #define EFIIOC_SET_TIME _IOW('E', 3, struct efi_tm) #define EFIIOC_VAR_GET _IOWR('E', 4, struct efi_var_ioc) #define EFIIOC_VAR_NEXT _IOWR('E', 5, struct efi_var_ioc) #define EFIIOC_VAR_SET _IOWR('E', 6, struct efi_var_ioc) #endif /* _SYS_EFIIO_H_ */ Index: head/sys/sys/kern_prefetch.h =================================================================== --- head/sys/sys/kern_prefetch.h (revision 343754) +++ head/sys/sys/kern_prefetch.h (revision 343755) @@ -1,42 +1,42 @@ /*- - * Copyright (c) 2016-2018 Netflix Inc. + * Copyright (c) 2016-2018 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #ifndef __kern_prefetch_h__ #define __kern_prefetch_h__ #ifdef _KERNEL static __inline void kern_prefetch(const volatile void *addr, void* before) { #if defined(__amd64__) __asm __volatile("prefetcht1 (%1)":"=rm"(*((int32_t *)before)):"r"(addr):); #else /* __builtin_prefetch(addr);*/ #endif } #endif /* _KERNEL */ #endif /* __kern_prefetch_h__ */ Index: head/sys/tests/callout_test/callout_test.c =================================================================== --- head/sys/tests/callout_test/callout_test.c (revision 343754) +++ head/sys/tests/callout_test/callout_test.c (revision 343755) @@ -1,283 +1,284 @@ /*- - * Copyright (c) 2015 Netflix Inc. All rights reserved. + * Copyright (c) 2015 Netflix, Inc. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_CALLTMP, "Temp callout Memory", "CalloutTest"); struct callout_run { struct mtx lock; struct callout *co_array; int co_test; int co_number_callouts; int co_return_npa; int co_completed; int callout_waiting; int drain_calls; int cnt_zero; int cnt_one; int index; }; static struct callout_run *comaster[MAXCPU]; uint64_t callout_total = 0; static void execute_the_co_test(struct callout_run *rn); static void co_saydone(void *arg) { struct callout_run *rn; rn = (struct callout_run *)arg; printf("The callout test is now complete for thread %d\n", rn->index); printf("number_callouts:%d\n", rn->co_number_callouts); printf("Callouts that bailed (Not PENDING or ACTIVE cleared):%d\n", rn->co_return_npa); printf("Callouts that completed:%d\n", rn->co_completed); printf("Drain calls:%d\n", rn->drain_calls); printf("Zero returns:%d non-zero:%d\n", rn->cnt_zero, rn->cnt_one); } static void drainit(void *arg) { struct callout_run *rn; rn = (struct callout_run *)arg; mtx_lock(&rn->lock); rn->drain_calls++; mtx_unlock(&rn->lock); } static void test_callout(void *arg) { struct callout_run *rn; int cpu; critical_enter(); cpu = curcpu; critical_exit(); rn = (struct callout_run *)arg; atomic_add_int(&rn->callout_waiting, 1); mtx_lock(&rn->lock); if (callout_pending(&rn->co_array[cpu]) || !callout_active(&rn->co_array[cpu])) { rn->co_return_npa++; atomic_subtract_int(&rn->callout_waiting, 1); mtx_unlock(&rn->lock); return; } callout_deactivate(&rn->co_array[cpu]); rn->co_completed++; mtx_unlock(&rn->lock); atomic_subtract_int(&rn->callout_waiting, 1); } void execute_the_co_test(struct callout_run *rn) { int i, ret, cpu; uint32_t tk_s, tk_e, tk_d; mtx_lock(&rn->lock); rn->callout_waiting = 0; for (i = 0; i < rn->co_number_callouts; i++) { if (rn->co_test == 1) { /* start all on spread out cpu's */ cpu = i % mp_ncpus; callout_reset_sbt_on(&rn->co_array[i], 3, 0, test_callout, rn, cpu, 0); } else { /* Start all on the same CPU */ callout_reset_sbt_on(&rn->co_array[i], 3, 0, test_callout, rn, rn->index, 0); } } tk_s = ticks; while (rn->callout_waiting != rn->co_number_callouts) { cpu_spinwait(); tk_e = ticks; tk_d = tk_e - tk_s; if (tk_d > 100) { break; } } /* OK everyone is waiting and we have the lock */ for (i = 0; i < rn->co_number_callouts; i++) { ret = callout_async_drain(&rn->co_array[i], drainit); if (ret) { rn->cnt_one++; } else { rn->cnt_zero++; } } rn->callout_waiting -= rn->cnt_one; mtx_unlock(&rn->lock); /* Now wait until all are done */ tk_s = ticks; while (rn->callout_waiting > 0) { cpu_spinwait(); tk_e = ticks; tk_d = tk_e - tk_s; if (tk_d > 100) { break; } } co_saydone((void *)rn); } static void run_callout_test(struct kern_test *test) { struct callout_test *u; size_t sz; int i; struct callout_run *rn; int index = test->tot_threads_running; u = (struct callout_test *)test->test_options; if (comaster[index] == NULL) { rn = comaster[index] = malloc(sizeof(struct callout_run), M_CALLTMP, M_WAITOK); memset(comaster[index], 0, sizeof(struct callout_run)); mtx_init(&rn->lock, "callouttest", NULL, MTX_DUPOK); rn->index = index; } else { rn = comaster[index]; rn->co_number_callouts = rn->co_return_npa = 0; rn->co_completed = rn->callout_waiting = 0; rn->drain_calls = rn->cnt_zero = rn->cnt_one = 0; if (rn->co_array) { free(rn->co_array, M_CALLTMP); 
rn->co_array = NULL; } } rn->co_number_callouts = u->number_of_callouts; rn->co_test = u->test_number; sz = sizeof(struct callout) * rn->co_number_callouts; rn->co_array = malloc(sz, M_CALLTMP, M_WAITOK); for (i = 0; i < rn->co_number_callouts; i++) { callout_init(&rn->co_array[i], CALLOUT_MPSAFE); } execute_the_co_test(rn); } int callout_test_is_loaded = 0; static void cocleanup(void) { int i; for (i = 0; i < MAXCPU; i++) { if (comaster[i]) { if (comaster[i]->co_array) { free(comaster[i]->co_array, M_CALLTMP); comaster[i]->co_array = NULL; } free(comaster[i], M_CALLTMP); comaster[i] = NULL; } } } static int callout_test_modevent(module_t mod, int type, void *data) { int err = 0; switch (type) { case MOD_LOAD: err = kern_testframework_register("callout_test", run_callout_test); if (err) { printf("Can't load callout_test err:%d returned\n", err); } else { memset(comaster, 0, sizeof(comaster)); callout_test_is_loaded = 1; } break; case MOD_QUIESCE: err = kern_testframework_deregister("callout_test"); if (err == 0) { callout_test_is_loaded = 0; cocleanup(); } break; case MOD_UNLOAD: if (callout_test_is_loaded) { err = kern_testframework_deregister("callout_test"); if (err == 0) { cocleanup(); callout_test_is_loaded = 0; } } break; default: return (EOPNOTSUPP); } return (err); } static moduledata_t callout_test_mod = { .name = "callout_test", .evhand = callout_test_modevent, .priv = 0 }; MODULE_DEPEND(callout_test, kern_testframework, 1, 1, 1); DECLARE_MODULE(callout_test, callout_test_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); Index: head/sys/tests/callout_test.h =================================================================== --- head/sys/tests/callout_test.h (revision 343754) +++ head/sys/tests/callout_test.h (revision 343755) @@ -1,34 +1,33 @@ #ifndef __callout_test_h__ #define __callout_test_h__ /*- - * Copyright (c) 2015 - * Netflix Incorporated, All rights reserved. + * Copyright (c) 2015 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*__FBSDID("$FreeBSD$"); * */ struct callout_test { int number_of_callouts; int test_number; }; #endif Index: head/sys/tests/framework/kern_testfrwk.c =================================================================== --- head/sys/tests/framework/kern_testfrwk.c (revision 343754) +++ head/sys/tests/framework/kern_testfrwk.c (revision 343755) @@ -1,341 +1,340 @@ /*- - * Copyright (c) 2015 - * Netflix Incorporated, All rights reserved. + * Copyright (c) 2015 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif struct kern_test_list { TAILQ_ENTRY(kern_test_list) next; char name[TEST_NAME_LEN]; kerntfunc func; }; TAILQ_HEAD(ktestlist, kern_test_list); struct kern_test_entry { TAILQ_ENTRY(kern_test_entry) next; struct kern_test_list *kt_e; struct kern_test kt_data; }; TAILQ_HEAD(ktestqueue, kern_test_entry); MALLOC_DEFINE(M_KTFRWK, "kern_tfrwk", "Kernel Test Framework"); struct kern_totfrwk { struct taskqueue *kfrwk_tq; struct task kfrwk_que; struct ktestlist kfrwk_testlist; struct ktestqueue kfrwk_testq; struct mtx kfrwk_mtx; int kfrwk_waiting; }; struct kern_totfrwk kfrwk; static int ktest_frwk_inited = 0; #define KTFRWK_MUTEX_INIT() mtx_init(&kfrwk.kfrwk_mtx, "kern_test_frwk", "tfrwk", MTX_DEF) #define KTFRWK_DESTROY() mtx_destroy(&kfrwk.kfrwk_mtx) #define KTFRWK_LOCK() mtx_lock(&kfrwk.kfrwk_mtx) #define KTFRWK_UNLOCK() mtx_unlock(&kfrwk.kfrwk_mtx) static void kfrwk_task(void *context, int pending) { struct kern_totfrwk *tf; struct kern_test_entry *wk; int free_mem = 0; struct kern_test kt_data; kerntfunc ktf; memset(&kt_data, 0, sizeof(kt_data)); ktf = NULL; tf = (struct kern_totfrwk *)context; KTFRWK_LOCK(); wk = TAILQ_FIRST(&tf->kfrwk_testq); if (wk) { wk->kt_data.tot_threads_running--; tf->kfrwk_waiting--; memcpy(&kt_data, &wk->kt_data, sizeof(kt_data)); if (wk->kt_data.tot_threads_running == 0) { TAILQ_REMOVE(&tf->kfrwk_testq, wk, next); free_mem = 1; } else { /* Wake one of my colleagues up to help too */ taskqueue_enqueue(tf->kfrwk_tq, &tf->kfrwk_que); } if (wk->kt_e) { ktf = wk->kt_e->func; } } KTFRWK_UNLOCK(); if (wk && free_mem) { free(wk, M_KTFRWK); } /* Execute the test */ if (ktf)
{ (*ktf) (&kt_data); } /* We are done */ atomic_add_int(&tf->kfrwk_waiting, 1); } static int kerntest_frwk_init(void) { u_int ncpus = mp_ncpus ? mp_ncpus : MAXCPU; KTFRWK_MUTEX_INIT(); TAILQ_INIT(&kfrwk.kfrwk_testq); TAILQ_INIT(&kfrwk.kfrwk_testlist); /* Now let's start up a number of tasks to do the work */ TASK_INIT(&kfrwk.kfrwk_que, 0, kfrwk_task, &kfrwk); kfrwk.kfrwk_tq = taskqueue_create_fast("sbtls_task", M_NOWAIT, taskqueue_thread_enqueue, &kfrwk.kfrwk_tq); if (kfrwk.kfrwk_tq == NULL) { printf("Can't start taskqueue for Kernel Test Framework\n"); panic("Taskqueue init fails for kfrwk"); } taskqueue_start_threads(&kfrwk.kfrwk_tq, ncpus, PI_NET, "[kt_frwk task]"); kfrwk.kfrwk_waiting = ncpus; ktest_frwk_inited = 1; return (0); } static int kerntest_frwk_fini(void) { KTFRWK_LOCK(); if (!TAILQ_EMPTY(&kfrwk.kfrwk_testlist)) { /* Still modules registered */ KTFRWK_UNLOCK(); return (EBUSY); } ktest_frwk_inited = 0; KTFRWK_UNLOCK(); taskqueue_free(kfrwk.kfrwk_tq); /* Ok, let's destroy the mutex on the way out */ KTFRWK_DESTROY(); return (0); } static int kerntest_execute(SYSCTL_HANDLER_ARGS); SYSCTL_NODE(_kern, OID_AUTO, testfrwk, CTLFLAG_RW, 0, "Kernel Test Framework"); SYSCTL_PROC(_kern_testfrwk, OID_AUTO, runtest, (CTLTYPE_STRUCT | CTLFLAG_RW), 0, 0, kerntest_execute, "IU", "Execute a kernel test"); int kerntest_execute(SYSCTL_HANDLER_ARGS) { struct kern_test kt; struct kern_test_list *li, *te = NULL; struct kern_test_entry *kte = NULL; int error = 0; if (ktest_frwk_inited == 0) { return (ENOENT); } /* Find the entry if possible */ error = SYSCTL_IN(req, &kt, sizeof(struct kern_test)); if (error) { return (error); } if (kt.num_threads <= 0) { return (EINVAL); } /* Grab some memory */ kte = malloc(sizeof(struct kern_test_entry), M_KTFRWK, M_WAITOK); if (kte == NULL) { error = ENOMEM; goto out; } KTFRWK_LOCK(); TAILQ_FOREACH(li, &kfrwk.kfrwk_testlist, next) { if (strcmp(li->name, kt.name) == 0) { te = li; break; } } if (te == NULL) { printf("Can't find the test %s\n", kt.name); error = ENOENT; free(kte, M_KTFRWK); goto out; } /* Ok, we have a test item to run, can we? */ if (!TAILQ_EMPTY(&kfrwk.kfrwk_testq)) { /* We don't know if there are enough threads */ error = EAGAIN; free(kte, M_KTFRWK); goto out; } if (kfrwk.kfrwk_waiting < kt.num_threads) { error = E2BIG; free(kte, M_KTFRWK); goto out; } kt.tot_threads_running = kt.num_threads; /* Ok, it looks like we can do it, let's get an entry */ kte->kt_e = li; memcpy(&kte->kt_data, &kt, sizeof(kt)); TAILQ_INSERT_TAIL(&kfrwk.kfrwk_testq, kte, next); taskqueue_enqueue(kfrwk.kfrwk_tq, &kfrwk.kfrwk_que); out: KTFRWK_UNLOCK(); return (error); } int kern_testframework_register(const char *name, kerntfunc func) { int error = 0; struct kern_test_list *li, *te = NULL; int len; len = strlen(name); if (len >= TEST_NAME_LEN) { return (E2BIG); } te = malloc(sizeof(struct kern_test_list), M_KTFRWK, M_WAITOK); if (te == NULL) { error = ENOMEM; goto out; } KTFRWK_LOCK(); /* First, does it already exist? */ TAILQ_FOREACH(li, &kfrwk.kfrwk_testlist, next) { if (strcmp(li->name, name) == 0) { error = EALREADY; free(te, M_KTFRWK); goto out; } } /* Ok, we can do it, let's add it to the list */ te->func = func; strcpy(te->name, name); TAILQ_INSERT_TAIL(&kfrwk.kfrwk_testlist, te, next); out: KTFRWK_UNLOCK(); return (error); } int kern_testframework_deregister(const char *name) { struct kern_test_list *li, *te = NULL; u_int ncpus = mp_ncpus ? mp_ncpus : MAXCPU; int error = 0; KTFRWK_LOCK(); /* First, does it already exist?
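* (A name that was never registered is not treated as an error; the lookup below finds nothing and deregistration simply returns 0.)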
*/ TAILQ_FOREACH(li, &kfrwk.kfrwk_testlist, next) { if (strcmp(li->name, name) == 0) { te = li; break; } } if (te == NULL) { /* It is not registered so no problem */ goto out; } if (ncpus != kfrwk.kfrwk_waiting) { /* We are busy executing something -- can't unload */ error = EBUSY; goto out; } if (!TAILQ_EMPTY(&kfrwk.kfrwk_testq)) { /* Something still to execute */ error = EBUSY; goto out; } /* Ok we can remove the dude safely */ TAILQ_REMOVE(&kfrwk.kfrwk_testlist, te, next); memset(te, 0, sizeof(struct kern_test_list)); free(te, M_KTFRWK); out: KTFRWK_UNLOCK(); return (error); } static int kerntest_mod_init(module_t mod, int type, void *data) { int err; switch (type) { case MOD_LOAD: err = kerntest_frwk_init(); break; case MOD_QUIESCE: KTFRWK_LOCK(); if (TAILQ_EMPTY(&kfrwk.kfrwk_testlist)) { err = 0; } else { err = EBUSY; } KTFRWK_UNLOCK(); break; case MOD_UNLOAD: err = kerntest_frwk_fini(); break; default: return (EOPNOTSUPP); } return (err); } static moduledata_t kern_test_framework = { .name = "kernel_testfrwk", .evhand = kerntest_mod_init, .priv = 0 }; MODULE_VERSION(kern_testframework, 1); DECLARE_MODULE(kern_testframework, kern_test_framework, SI_SUB_PSEUDO, SI_ORDER_ANY); Index: head/sys/tests/kern_testfrwk.h =================================================================== --- head/sys/tests/kern_testfrwk.h (revision 343754) +++ head/sys/tests/kern_testfrwk.h (revision 343755) @@ -1,49 +1,48 @@ /*- - * Copyright (c) 2015 - * Netflix Incorporated, All rights reserved. + * Copyright (c) 2015 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*__FBSDID("$FreeBSD$"); * */ #ifndef _SYS_KERN_TESTFRWKT_H_ #define _SYS_KERN_TESTFRWKT_H_ #define TEST_NAME_LEN 32 #define TEST_OPTION_SPACE 256 struct kern_test { char name[TEST_NAME_LEN]; int num_threads; /* Fill in how many threads you want */ int tot_threads_running; /* For framework */ uint8_t test_options[TEST_OPTION_SPACE]; }; typedef void (*kerntfunc)(struct kern_test *); #ifdef _KERNEL int kern_testframework_register(const char *name, kerntfunc); int kern_testframework_deregister(const char *name); #endif #endif Index: head/usr.sbin/efibootmgr/efibootmgr.8 =================================================================== --- head/usr.sbin/efibootmgr/efibootmgr.8 (revision 343754) +++ head/usr.sbin/efibootmgr/efibootmgr.8 (revision 343755) @@ -1,154 +1,154 @@ -.\" Copyright (c) 2017-2018 Netflix, Inc -.\" All rights reserved. +.\" +.\" Copyright (c) 2017-2018 Netflix, Inc. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" $FreeBSD$ .\" .Dd December 28, 2018 .Dt EFIBOOTMGR 8 .Os .Sh NAME .Nm efibootmgr .Nd manipulate the EFI Boot Manager .Sh SYNOPSIS .Op Fl aAnNB .Op Fl b Ar bootnum .Op Fl t Ar timeout .Op Fl T .Op Fl o Ar bootorder .Op Fl v .Op Fl c l Ar loader [ Fl k Ar kernel ] [ Fl L Ar label ] [ Fl -dry-run ] .Sh "DESCRIPTION" .Nm manipulates how UEFI Boot Managers boot the system. Methods of booting can be created and destroyed. Boot methods can be activated or deactivated. The order of boot methods tried can be changed. Temporary boot methods can override the usual booting methods. .Pp The UEFI standard defines how hosts may control what is used to bootstrap the system. Each method is encapsulated within a persistent UEFI variable, stored by the UEFI BIOS of the form .Va BootXXXX . These variables are numbered, describe where to load the bootstrap program from, and whether or not the method is active. The boot order of these methods is controlled by another variable .Va BootOrder . The currently booting method is communicated using .Va BootCurrent . A global timeout can also be set. .Pp .Nm requires that the kernel efirt module be loaded to get and set these non-volatile variables. 
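.Pp For example, the module can be loaded with: .Pp .Dl kldload efirt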
.Pp The following options are available: .Bl -tag -width 28m .It Fl c Fl -create Create a new Boot Variable .It Fl l -loader Ar loader The path to and name of the loader. .It Fl k -kernel Ar kernel The path to and name of the kernel. .It Fl b -bootnum Ar bootnum When creating or modifying an entry, use bootnum as the index. When creating a new entry, fail if it already exists. .It Fl L -label Ar label An optional description for the entry. .It Fl D -dry-run Process but do not change any variables. .It Fl B -delete Delete the given bootnum boot entry. .It Fl a -activate Activate the given bootnum boot entry, or the new entry when used with -c. .It Fl A -deactivate Deactivate the given bootnum boot entry. .It Fl n -bootnext Set bootnum boot entry as the BootNext variable. .It Fl N -delete-bootnext Delete the BootNext optional variable. .It Fl o -bootorder Ar bootorder Set BootOrder variable to the given comma delimited set of bootnums. The numbers are in hex to match BootXXXX, but may omit leading zeros. .It Fl t -set-timeout Ar timeout Set the bootmenu timeout value. .It Fl T -del-timeout Delete the BootTimeout variable. .It Fl v -verbose Display the device path of boot entries in the output. .El .Pp .Sh Examples .Pp To display the current Boot related variables in the system: .Pp .Dl efibootmgr [-v] .Pp This will display the optional BootNext bootnum, BootCurrent, or currently booted bootnum, followed by the optional Timeout value, any BootOrder that may be set, followed finally by all currently defined Boot variables, active or not. The verbose flag will augment this output with the disk partition uuids, size/offset and device-path of the variable. .Pp The .Nm program can be used to create new EFI boot variables. To create a new boot var pointing to an installation with its EFI partition mounted under /mnt, the given loader and a label "FreeBSD-11": .Pp .Dl efibootmgr -c -l /mnt/EFI/freebsd/loader.efi -L FreeBSD-11 .Pp This will result in the next available bootnum being assigned to a new UEFI boot variable, and given the label "FreeBSD-11" such as: .Pp .Dl Boot0009 FreeBSD-11 .Pp Note newly created boot entries are created inactive. The active state is denoted by an '*' following the BootXXXX name in the output. They are also inserted into the first position of current BootOrder variable if it exists. They must first be set to active before being considered available to attempt booting from, else they are ignored. .Pp .Dl efibootmgr -B -b 0009 .Pp Will delete the given boot entry Boot0009 .Pp To set a given newly created boot entry active use: .Pp .Dl efibootmgr -a -b 0009 .Pp To set a given boot entry to be used as the BootNext variable, irrespective of its active state, use: .Pp .Dl efibootmgr -n -b 0009 .Pp To set the BootOrder for the next reboot use: .Pp .Dl efibootmgr -o 0009,0003,... .Pp .Sh SEE ALSO .Xr efivar 8 , .Xr uefi 8 , .Xr gpart 8 Index: head/usr.sbin/efibootmgr/efibootmgr.c =================================================================== --- head/usr.sbin/efibootmgr/efibootmgr.c (revision 343754) +++ head/usr.sbin/efibootmgr/efibootmgr.c (revision 343755) @@ -1,943 +1,942 @@ /*- * Copyright (c) 2017-2018 Netflix, Inc. - * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef LOAD_OPTION_ACTIVE #define LOAD_OPTION_ACTIVE 0x00000001 #endif #ifndef LOAD_OPTION_CATEGORY_BOOT #define LOAD_OPTION_CATEGORY_BOOT 0x00000000 #endif #define BAD_LENGTH ((size_t)-1) typedef struct _bmgr_opts { char *env; char *loader; char *label; char *kernel; char *name; char *order; int bootnum; bool copy; bool create; bool delete; bool delete_bootnext; bool del_timeout; bool dry_run; bool has_bootnum; bool once; int cp_src; bool set_active; bool set_bootnext; bool set_inactive; bool set_timeout; int timeout; bool verbose; } bmgr_opts_t; static struct option lopts[] = { {"activate", no_argument, NULL, 'a'}, {"bootnext", no_argument, NULL, 'n'}, /* set bootnext */ {"bootnum", required_argument, NULL, 'b'}, {"bootorder", required_argument, NULL, 'o'}, /* set order */ {"copy", required_argument, NULL, 'C'}, /* Copy boot method */ {"create", no_argument, NULL, 'c'}, {"deactivate", no_argument, NULL, 'A'}, {"del-timeout", no_argument, NULL, 'T'}, {"delete", no_argument, NULL, 'B'}, {"delete-bootnext", no_argument, NULL, 'N'}, {"dry-run", no_argument, NULL, 'D'}, {"env", required_argument, NULL, 'e'}, {"help", no_argument, NULL, 'h'}, {"kernel", required_argument, NULL, 'k'}, {"label", required_argument, NULL, 'L'}, {"loader", required_argument, NULL, 'l'}, {"once", no_argument, NULL, 'O'}, {"set-timeout", required_argument, NULL, 't'}, {"verbose", no_argument, NULL, 'v'}, { NULL, 0, NULL, 0} }; /* global efibootmgr opts */ static bmgr_opts_t opts; static LIST_HEAD(efivars_head, entry) efivars = LIST_HEAD_INITIALIZER(efivars); struct entry { efi_guid_t guid; uint32_t attrs; uint8_t *data; size_t size; char *name; char *label; int idx; int flags; #define SEEN 1 LIST_ENTRY(entry) entries; }; #define MAX_DP_LEN 4096 #define MAX_LOADOPT_LEN 8192 static char * mangle_loader(char *loader) { char *c; for (c = loader; *c; c++) if (*c == '/') *c = '\\'; return loader; } #define COMMON_ATTRS EFI_VARIABLE_NON_VOLATILE | \ EFI_VARIABLE_BOOTSERVICE_ACCESS | \ EFI_VARIABLE_RUNTIME_ACCESS /* * We use the global GUID and common var attrs, and * find it better to just delete and re-create a var.
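* COMMON_ATTRS above marks a variable non-volatile and accessible from both boot services and the running OS; set_bootvar() below passes it for every variable written.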
*/ static int set_bootvar(const char *name, uint8_t *data, size_t size) { return efi_set_variable(EFI_GLOBAL_GUID, name, data, size, COMMON_ATTRS); } #define USAGE \ " [-aAnB -b bootnum] [-N] [-t timeout] [-T] [-o bootorder] [-O] [--verbose] [--help]\n\ [-c -l loader [-k kernel] [-L label] [--dry-run] [-b bootnum]]" #define CREATE_USAGE \ " efibootmgr -c -l loader [-k kernel] [-L label] [--dry-run] [-b bootnum] [-a]" #define ORDER_USAGE \ " efibootmgr -o bootvarnum1,bootvarnum2,..." #define TIMEOUT_USAGE \ " efibootmgr -t seconds" #define DELETE_USAGE \ " efibootmgr -B -b bootnum" #define ACTIVE_USAGE \ " efibootmgr [-a | -A] -b bootnum" #define BOOTNEXT_USAGE \ " efibootmgr [-n | -N] -b bootnum" static void parse_args(int argc, char *argv[]) { int ch; while ((ch = getopt_long(argc, argv, "AaBb:C:cDe:hk:L:l:NnOo:Tt:v", lopts, NULL)) != -1) { switch (ch) { case 'A': opts.set_inactive = true; break; case 'a': opts.set_active = true; break; case 'b': opts.has_bootnum = true; opts.bootnum = strtoul(optarg, NULL, 16); break; case 'B': opts.delete = true; break; case 'C': opts.copy = true; opts.cp_src = strtoul(optarg, NULL, 16); case 'c': opts.create = true; break; case 'D': /* should be remove dups XXX */ opts.dry_run = true; break; case 'e': free(opts.env); opts.env = strdup(optarg); break; case 'h': default: errx(1, "%s", USAGE); break; case 'k': free(opts.kernel); opts.kernel = strdup(optarg); break; case 'L': free(opts.label); opts.label = strdup(optarg); break; case 'l': free(opts.loader); opts.loader = strdup(optarg); opts.loader = mangle_loader(opts.loader); break; case 'N': opts.delete_bootnext = true; break; case 'n': opts.set_bootnext = true; break; case 'O': opts.once = true; break; case 'o': free(opts.order); opts.order = strdup(optarg); break; case 'T': opts.del_timeout = true; break; case 't': opts.set_timeout = true; opts.timeout = strtoul(optarg, NULL, 10); break; case 'v': opts.verbose = true; break; } } if (opts.create) { if (!opts.loader) errx(1, "%s",CREATE_USAGE); return; } if (opts.order && !(opts.order)) errx(1, "%s", ORDER_USAGE); if ((opts.set_inactive || opts.set_active) && !opts.has_bootnum) errx(1, "%s", ACTIVE_USAGE); if (opts.delete && !opts.has_bootnum) errx(1, "%s", DELETE_USAGE); if (opts.set_bootnext && !opts.has_bootnum) errx(1, "%s", BOOTNEXT_USAGE); } static void read_vars(void) { efi_guid_t *guid; char *next_name = NULL; int ret = 0; struct entry *nent; LIST_INIT(&efivars); while ((ret = efi_get_next_variable_name(&guid, &next_name)) > 0) { /* * Only pay attention to EFI:BootXXXX variables to get the list. 
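* A boot variable name is "Boot" followed by exactly four hex digits (e.g. Boot0009) under EFI_GLOBAL_GUID; anything else is skipped by the checks below.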
*/ if (efi_guid_cmp(guid, &EFI_GLOBAL_GUID) != 0 || strlen(next_name) != 8 || strncmp(next_name, "Boot", 4) != 0 || !isxdigit(next_name[4]) || !isxdigit(next_name[5]) || !isxdigit(next_name[6]) || !isxdigit(next_name[7])) continue; nent = malloc(sizeof(struct entry)); nent->name = strdup(next_name); ret = efi_get_variable(*guid, next_name, &nent->data, &nent->size, &nent->attrs); if (ret < 0) err(1, "efi_get_variable"); nent->guid = *guid; nent->idx = strtoul(&next_name[4], NULL, 16); LIST_INSERT_HEAD(&efivars, nent, entries); } } static void set_boot_order(char *order) { uint16_t *new_data; size_t size; char *next, *cp; int cnt; int i; cp = order; cnt = 1; while (*cp) { if (*cp++ == ',') cnt++; } size = sizeof(uint16_t) * cnt; new_data = malloc(size); i = 0; cp = strdup(order); while ((next = strsep(&cp, ",")) != NULL) { new_data[i] = strtoul(next, NULL, 16); if (new_data[i] == 0 && errno == EINVAL) { warnx("can't parse %s as a number", next); errx(1, "%s", ORDER_USAGE); } i++; } free(cp); if (set_bootvar("BootOrder", (uint8_t*)new_data, size) < 0) err(1, "Unable to set BootOrder to %s", order); free(new_data); } static void handle_activity(int bootnum, bool active) { uint32_t attrs, load_attrs; uint8_t *data; size_t size; char *name; asprintf(&name, "%s%04X", "Boot", bootnum); if (name == NULL) err(1, "asprintf"); if (efi_get_variable(EFI_GLOBAL_GUID, name, &data, &size, &attrs) < 0) err(1, "No such bootvar %s\n", name); load_attrs = le32dec(data); if (active) load_attrs |= LOAD_OPTION_ACTIVE; else load_attrs &= ~LOAD_OPTION_ACTIVE; le32enc(data, load_attrs); if (set_bootvar(name, data, size) < 0) err(1, "handle activity efi_set_variable"); } /* * add boot var to boot order. * called by create boot var. There is no option * to add one independent of create. * * Note: we currently don't support where it goes * so it goes on the front, inactive. * use -o 2,3,7 etc to affect order, -a to activate. */ static void add_to_boot_order(char *bootvar) { size_t size; uint32_t attrs; uint16_t val; uint8_t *data, *new; val = strtoul(&bootvar[4], NULL, 16); if (efi_get_variable(EFI_GLOBAL_GUID, "BootOrder", &data, &size, &attrs) < 0) { if (errno == ENOENT) { /* create it and set this bootvar to active */ size = 0; data = NULL; } else err(1, "efi_get_variable BootOrder"); } /* * We have BootOrder with the current order * so grow the array by one, add the value * and write the new variable value. */ size += sizeof(uint16_t); new = malloc(size); if (!new) err(1, "malloc"); le16enc(new, val); if (size > sizeof(uint16_t)) memcpy(new + sizeof(uint16_t), data, size - sizeof(uint16_t)); if (set_bootvar("BootOrder", new, size) < 0) err(1, "set_bootvar"); free(new); } static void remove_from_order(uint16_t bootnum) { uint32_t attrs; size_t size, i, j; uint8_t *new, *data; if (efi_get_variable(EFI_GLOBAL_GUID, "BootOrder", &data, &size, &attrs) < 0) return; new = malloc(size); if (new == NULL) err(1, "malloc"); for (j = i = 0; i < size; i += sizeof(uint16_t)) { if (le16dec(data + i) == bootnum) continue; memcpy(new + j, data + i, sizeof(uint16_t)); j += sizeof(uint16_t); } if (i == j) warnx("Boot variable %04x not in BootOrder", bootnum); else if (set_bootvar("BootOrder", new, j) < 0) err(1, "Unable to update BootOrder with new value"); free(new); } static void delete_bootvar(int bootnum) { char *name; int defer = 0; /* * Try to delete the boot variable and remove it * from the boot order. We always do both actions * to make it easy to clean up from oopses.
*/ if (bootnum < 0 || bootnum > 0xffff) errx(1, "Bad boot variable %#x", bootnum); asprintf(&name, "%s%04X", "Boot", bootnum); if (name == NULL) err(1, "asprintf"); printf("Removing boot variable '%s'\n", name); if (efi_del_variable(EFI_GLOBAL_GUID, name) < 0) { defer = 1; warn("cannot delete variable %s", name); } printf("Removing 0x%x from BootOrder\n", bootnum); remove_from_order(bootnum); free(name); if (defer) exit(defer); } static void del_bootnext(void) { if (efi_del_variable(EFI_GLOBAL_GUID, "BootNext") < 0) err(1, "efi_del_variable"); } static void handle_bootnext(uint16_t bootnum) { uint16_t num; le16enc(&num, bootnum); if (set_bootvar("BootNext", (uint8_t*)&num, sizeof(uint16_t)) < 0) err(1, "set_bootvar"); } static int compare(const void *a, const void *b) { uint16_t c; uint16_t d; memcpy(&c, a, sizeof(uint16_t)); memcpy(&d, b, sizeof(uint16_t)); if (c < d) return -1; if (c == d) return 0; return 1; } static char * make_next_boot_var_name(void) { struct entry *v; uint16_t *vals, next_free = 0; char *name; int cnt = 0; int i; LIST_FOREACH(v, &efivars, entries) { cnt++; } vals = malloc(sizeof(uint16_t) * cnt); if (!vals) return NULL; i = 0; LIST_FOREACH(v, &efivars, entries) { vals[i++] = v->idx; } qsort(vals, cnt, sizeof(uint16_t), compare); /* if the hole is at the beginning, just return zero */ if (vals[0] > 0) { next_free = 0; } else { /* now just run the list looking for the first hole */ for (i = 0; i < cnt - 1 && next_free == 0; i++) if (vals[i] + 1 != vals[i + 1]) next_free = vals[i] + 1; if (next_free == 0) next_free = vals[cnt - 1] + 1; /* In theory we could have used all 65k slots -- what to do? */ } free(vals); asprintf(&name, "%s%04X", "Boot", next_free); if (name == NULL) err(1, "asprintf"); return name; } static char * make_boot_var_name(uint16_t bootnum) { struct entry *v; char *name; LIST_FOREACH(v, &efivars, entries) { if (v->idx == bootnum) return NULL; } asprintf(&name, "%s%04X", "Boot", bootnum); if (name == NULL) err(1, "asprintf"); return name; } static size_t create_loadopt(uint8_t *buf, size_t bufmax, uint32_t attributes, efidp dp, size_t dp_size, const char *description, const uint8_t *optional_data, size_t optional_data_size) { efi_char *bbuf = NULL; uint8_t *pos = buf; size_t desc_len = 0; size_t len; if (optional_data == NULL && optional_data_size != 0) return BAD_LENGTH; if (dp == NULL && dp_size != 0) return BAD_LENGTH; /* * Compute the length to make sure the passed in buffer is long enough.
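* The buffer is laid out as a UEFI EFI_LOAD_OPTION: a 32-bit Attributes word, a 16-bit FilePathListLength, the NUL-terminated UCS-2 Description, the device path(s), then any OptionalData.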
*/ utf8_to_ucs2(description, &bbuf, &desc_len); len = sizeof(uint32_t) + sizeof(uint16_t) + desc_len + dp_size + optional_data_size; if (len > bufmax) { free(bbuf); return BAD_LENGTH; } le32enc(pos, attributes); pos += sizeof (attributes); le16enc(pos, dp_size); pos += sizeof (uint16_t); memcpy(pos, bbuf, desc_len); /* NB: desc_len includes trailing NUL */ pos += desc_len; free(bbuf); memcpy(pos, dp, dp_size); pos += dp_size; if (optional_data && optional_data_size > 0) { memcpy(pos, optional_data, optional_data_size); pos += optional_data_size; } return pos - buf; } static int make_boot_var(const char *label, const char *loader, const char *kernel, const char *env, bool dry_run, int bootnum, bool activate) { struct entry *new_ent; uint32_t load_attrs = 0; uint8_t *load_opt_buf; size_t lopt_size, llen, klen; efidp dp, loaderdp, kerneldp; char *bootvar = NULL; int ret; assert(label != NULL); if (bootnum == -1) bootvar = make_next_boot_var_name(); else bootvar = make_boot_var_name((uint16_t)bootnum); if (bootvar == NULL) err(1, "bootvar creation"); if (loader == NULL) errx(1, "Must specify boot loader"); if (efivar_unix_path_to_device_path(loader, &loaderdp) != 0) err(1, "Cannot translate unix loader path '%s' to UEFI", loader); if (kernel != NULL) { if (efivar_unix_path_to_device_path(kernel, &kerneldp) != 0) err(1, "Cannot translate unix kernel path '%s' to UEFI", kernel); } else { kerneldp = NULL; } llen = efidp_size(loaderdp); if (llen > MAX_DP_LEN) errx(1, "Loader path too long."); klen = efidp_size(kerneldp); if (klen > MAX_DP_LEN) errx(1, "Kernel path too long."); dp = malloc(llen + klen); if (dp == NULL) errx(1, "Can't allocate memory for new device paths"); memcpy(dp, loaderdp, llen); if (kerneldp != NULL) memcpy((char *)dp + llen, kerneldp, klen); /* don't make the new bootvar active by default, use the -a option later */ load_attrs = LOAD_OPTION_CATEGORY_BOOT; if (activate) load_attrs |= LOAD_OPTION_ACTIVE; load_opt_buf = malloc(MAX_LOADOPT_LEN); if (load_opt_buf == NULL) err(1, "malloc"); lopt_size = create_loadopt(load_opt_buf, MAX_LOADOPT_LEN, load_attrs, dp, llen + klen, label, env, env ?
strlen(env) + 1 : 0); if (lopt_size == BAD_LENGTH) errx(1, "Can't create loadopt"); ret = 0; if (!dry_run) { ret = efi_set_variable(EFI_GLOBAL_GUID, bootvar, (uint8_t*)load_opt_buf, lopt_size, COMMON_ATTRS); } if (ret) err(1, "efi_set_variable"); add_to_boot_order(bootvar); /* first, still not active */ new_ent = malloc(sizeof(struct entry)); if (new_ent == NULL) err(1, "malloc"); memset(new_ent, 0, sizeof(struct entry)); new_ent->name = bootvar; new_ent->guid = EFI_GLOBAL_GUID; LIST_INSERT_HEAD(&efivars, new_ent, entries); free(load_opt_buf); free(dp); return 0; } static void print_loadopt_str(uint8_t *data, size_t datalen) { char *dev, *relpath, *abspath; uint32_t attr; uint16_t fplen; efi_char *descr; uint8_t *ep = data + datalen; uint8_t *walker = data; efidp dp, edp; char buf[1024]; int len; int rv; int indent; if (datalen < sizeof(attr) + sizeof(fplen) + sizeof(efi_char)) return; // First 4 bytes are attribute flags attr = le32dec(walker); walker += sizeof(attr); // Next two bytes are length of the file paths fplen = le16dec(walker); walker += sizeof(fplen); // Next we have a 0 terminated UCS2 string that we know to be aligned descr = (efi_char *)(intptr_t)(void *)walker; len = ucs2len(descr); // XXX need to sanity check that len < (datalen - (ep - walker) / 2) walker += (len + 1) * sizeof(efi_char); if (walker > ep) return; // Now we have fplen bytes worth of file path stuff dp = (efidp)walker; walker += fplen; if (walker > ep) return; edp = (efidp)walker; /* * Everything left is the binary option args * opt = walker; * optlen = ep - walker; */ indent = 1; while (dp < edp) { efidp_format_device_path(buf, sizeof(buf), dp, (intptr_t)(void *)edp - (intptr_t)(void *)dp); printf("%*s%s\n", indent, "", buf); indent = 10 + len + 1; rv = efivar_device_path_to_unix_path(dp, &dev, &relpath, &abspath); if (rv == 0) { printf("%*s%s:%s %s\n", indent + 4, "", dev, relpath, abspath); free(dev); free(relpath); free(abspath); } dp = (efidp)((char *)dp + efidp_size(dp)); } } static char * get_descr(uint8_t *data) { uint8_t *pos = data; efi_char *desc; int len; char *buf; int i = 0; pos += sizeof(uint32_t) + sizeof(uint16_t); desc = (efi_char*)(intptr_t)(void *)pos; len = ucs2len(desc); buf = malloc(len + 1); memset(buf, 0, len + 1); while (desc[i]) { buf[i] = desc[i]; i++; } return (char*)buf; } static bool print_boot_var(const char *name, bool verbose, bool curboot) { size_t size; uint32_t load_attrs; uint8_t *data; int ret; char *d; ret = efi_get_variable(EFI_GLOBAL_GUID, name, &data, &size, NULL); if (ret < 0) return false; load_attrs = le32dec(data); d = get_descr(data); printf("%c%s%c %s", curboot ? '+' : ' ', name, ((load_attrs & LOAD_OPTION_ACTIVE) ? '*': ' '), d); free(d); if (verbose) print_loadopt_str(data, size); else printf("\n"); return true; } /* Cmd epilogue, or just the default with no args.
* The order is [bootnext] bootcurrent, timeout, order, and the bootvars [-v] */ static int print_boot_vars(bool verbose) { /* * just read and print the current values * as a command epilogue */ struct entry *v; uint8_t *data; size_t size; uint32_t attrs; int ret, bolen; uint16_t *boot_order = NULL, current; ret = efi_get_variable(EFI_GLOBAL_GUID, "BootNext", &data, &size, &attrs); if (ret > 0) { printf("BootNext : %04x\n", le16dec(data)); } ret = efi_get_variable(EFI_GLOBAL_GUID, "BootCurrent", &data, &size,&attrs); current = le16dec(data); printf("BootCurrent: %04x\n", current); ret = efi_get_variable(EFI_GLOBAL_GUID, "Timeout", &data, &size, &attrs); if (ret > 0) { printf("Timeout : %d seconds\n", le16dec(data)); } if (efi_get_variable(EFI_GLOBAL_GUID, "BootOrder", &data, &size, &attrs) > 0) { if (size % 2 == 1) warn("Bad BootOrder variable: odd length %d", (int)size); boot_order = malloc(size); bolen = size / 2; printf("BootOrder : "); for (size_t i = 0; i < size; i += 2) { boot_order[i / 2] = le16dec(data + i); printf("%04X%s", boot_order[i / 2], i == size - 2 ? "\n" : ", "); } } if (boot_order == NULL) { /* * now we want to fetch 'em all fresh again * which possibly includes a newly created bootvar */ LIST_FOREACH(v, &efivars, entries) { print_boot_var(v->name, verbose, v->idx == current); } } else { LIST_FOREACH(v, &efivars, entries) { v->flags = 0; } for (int i = 0; i < bolen; i++) { char buffer[10]; snprintf(buffer, sizeof(buffer), "Boot%04X", boot_order[i]); if (!print_boot_var(buffer, verbose, boot_order[i] == current)) printf("%s: MISSING!\n", buffer); LIST_FOREACH(v, &efivars, entries) { if (v->idx == boot_order[i]) { v->flags |= SEEN; break; } } } if (verbose) { printf("\n\nUnreferenced Variables:\n"); LIST_FOREACH(v, &efivars, entries) { if (v->flags == 0) print_boot_var(v->name, verbose, v->idx == current); } } } return 0; } static void delete_timeout(void) { efi_del_variable(EFI_GLOBAL_GUID,"Timeout"); } static void handle_timeout(int to) { uint16_t timeout; le16enc(&timeout, to); if (set_bootvar("Timeout", (uint8_t *)&timeout, sizeof(timeout)) < 0) errx(1, "Can't set Timeout for booting."); } int main(int argc, char *argv[]) { if (!efi_variables_supported()) errx(1, "efi variables not supported on this system. root? kldload efirt?"); memset(&opts, 0, sizeof (bmgr_opts_t)); parse_args(argc, argv); read_vars(); if (opts.create) /* * side effect, adds to boot order, but not yet active. */ make_boot_var(opts.label ? opts.label : "", opts.loader, opts.kernel, opts.env, opts.dry_run, opts.has_bootnum ? opts.bootnum : -1, opts.set_active); else if (opts.set_active || opts.set_inactive ) handle_activity(opts.bootnum, opts.set_active); else if (opts.order != NULL) set_boot_order(opts.order); /* create a new bootorder with opts.order */ else if (opts.set_bootnext) handle_bootnext(opts.bootnum); else if (opts.delete_bootnext) del_bootnext(); else if (opts.delete) delete_bootvar(opts.bootnum); else if (opts.del_timeout) delete_timeout(); else if (opts.set_timeout) handle_timeout(opts.timeout); print_boot_vars(opts.verbose); } Index: head/usr.sbin/efidp/efidp.8 =================================================================== --- head/usr.sbin/efidp/efidp.8 (revision 343754) +++ head/usr.sbin/efidp/efidp.8 (revision 343755) @@ -1,81 +1,81 @@ -.\" Copyright (c) 2017 Netflix, Inc -.\" All rights reserved. +.\" +.\" Copyright (c) 2017 Netflix, Inc. 
.\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" $FreeBSD$ .\" .Dd December 1, 2017 .Dt EFIDP 8 .Os .Sh NAME .Nm efidp .Nd UEFI Device Path manipulation .Sh SYNOPSIS .Nm .Op Fl fp .Op Fl -parse .Op Fl -format .Sh DESCRIPTION This program converts .Dq Unified Extensible Firmware Interface .Pq UEFI Device Paths, as defined in the UEFI standard, to and from binary form. Binary and textual forms are defined in Chapter 9 of the UEFI Specification. .Pp .Bl -tag -width 20m .It Fl f Fl -format Formats a binary UEFI Device Path into its canonical UTF-8 textual form. A binary Device Path can be no longer than 65536 bytes. The textual form must fit into 65536 bytes. Multiple binary device paths may be specified. .It Fl p Fl -parse Parses a UEFI Device Path UTF-8 specification and outputs the binary Device Path form. Only one device path is parsed, even if there are multiple present in the input. Leading white space is ignored. The resulting binary Device Path can be no longer than 65536 bytes. Multiple lines may be specified. Each one will be translated. .It Fl e Fl --to-efi Translate a Unix file path to an EFI Device Path. The output is the textual representation of the EFI Device Path. .It Fl u Fl --to-unix Translate an EFI device path to a Unix file path. The input is the textual representation of the EFI Device Path. .El .Sh SEE ALSO Appendix A of the UEFI specification has the format for GUIDs. All GUIDs .Dq Globally Unique Identifiers have the format described in RFC 4122. .Pp The Unified Extensible Firmware Interface Specification is available from .Pa www.uefi.org . .Sh HISTORY The .Nm utility first appeared in .Fx 11.1 . Index: head/usr.sbin/efidp/efidp.c =================================================================== --- head/usr.sbin/efidp/efidp.c (revision 343754) +++ head/usr.sbin/efidp/efidp.c (revision 343755) @@ -1,252 +1,251 @@ /*- * Copyright (c) 2016 Netflix, Inc. - * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #define MAXSIZE 65536 /* Everything will be smaller than this, usually 1000x smaller */ /* options descriptor */ static struct option longopts[] = { { "to-unix", no_argument, NULL, 'u' }, { "to-efi", no_argument, NULL, 'e' }, { "format", no_argument, NULL, 'f' }, { "parse", no_argument, NULL, 'p' }, { NULL, 0, NULL, 0 } }; static int flag_format, flag_parse, flag_unix, flag_efi; static void usage(void) { errx(1, "efidp [-efpu]"); } static ssize_t read_file(int fd, void **rv) { uint8_t *retval; size_t len; off_t off; ssize_t red; len = MAXSIZE; off = 0; retval = malloc(len); do { red = read(fd, retval + off, len - off); if (red == 0) break; off += red; if (off == (off_t)len) break; } while (1); *rv = retval; return off; } static void parse_args(int argc, char **argv) { int ch; while ((ch = getopt_long(argc, argv, "efpu", longopts, NULL)) != -1) { switch (ch) { case 'e': flag_efi++; break; case 'f': flag_format++; break; case 'p': flag_parse++; break; case 'u': flag_unix++; break; default: usage(); } } argc -= optind; argv += optind; if (argc >= 1) usage(); if (flag_parse + flag_format + flag_efi + flag_unix != 1) { warnx("Can only use one of -e (--to-efi), -f (--format), " "-p (--parse) and -u (--to-unix)"); usage(); } } static char * trim(char *s) { char *t; while (isspace((unsigned char)*s)) s++; t = s + strlen(s) - 1; while (t > s && isspace((unsigned char)*t)) *t-- = '\0'; return s; } static void unix_to_efi(void) { char buffer[MAXSIZE]; char efi[MAXSIZE]; efidp dp; char *walker; int rv; dp = NULL; while (fgets(buffer, sizeof(buffer), stdin)) { walker = trim(buffer); free(dp); dp = NULL; rv = efivar_unix_path_to_device_path(walker, &dp); if (rv != 0 || dp == NULL) { errno = rv; warn("Can't convert '%s' to efi", walker); continue; } if (efidp_format_device_path(efi, sizeof(efi), dp, efidp_size(dp)) < 0) { warnx("Can't format dp for '%s'", walker); continue; } printf("%s\n", efi); } free(dp); } static void efi_to_unix(void) { char buffer[MAXSIZE]; char dpbuf[MAXSIZE]; efidp dp; size_t dplen; char *walker, *dev, *relpath, *abspath; int rv; dp = (efidp)dpbuf; while (fgets(buffer, sizeof(buffer), stdin)) { walker = trim(buffer); dplen = efidp_parse_device_path(walker, dp, sizeof(dpbuf)); rv = efivar_device_path_to_unix_path(dp, &dev, &relpath, &abspath); if (rv == 0) printf("%s:%s %s\n", dev, relpath, abspath); else { errno = rv; warn("Can't convert '%s' to unix", walker); } } } static void format(void) { char buffer[MAXSIZE]; ssize_t fmtlen; ssize_t len; void *data; size_t
dplen; const_efidp dp; len = read_file(STDIN_FILENO, &data); if (len == -1) err(1, "read"); dp = (const_efidp)data; while (len > 0) { dplen = efidp_size(dp); fmtlen = efidp_format_device_path(buffer, sizeof(buffer), dp, dplen); if (fmtlen > 0) printf("%s\n", buffer); len -= dplen; dp = (const_efidp)((const char *)dp + dplen); } free(data); } static void parse(void) { char buffer[MAXSIZE]; efidp dp; ssize_t dplen; char *walker; dplen = MAXSIZE; dp = malloc(dplen); if (dp == NULL) errx(1, "Can't allocate memory."); while (fgets(buffer, sizeof(buffer), stdin)) { walker= trim(buffer); dplen = efidp_parse_device_path(walker, dp, dplen); if (dplen == -1) errx(1, "Can't parse %s", walker); write(STDOUT_FILENO, dp, dplen); } free(dp); } int main(int argc, char **argv) { parse_args(argc, argv); if (flag_unix) efi_to_unix(); else if (flag_efi) unix_to_efi(); else if (flag_format) format(); else if (flag_parse) parse(); } Index: head/usr.sbin/efivar/efiutil.c =================================================================== --- head/usr.sbin/efivar/efiutil.c (revision 343754) +++ head/usr.sbin/efivar/efiutil.c (revision 343755) @@ -1,180 +1,179 @@ /*- * Copyright (c) 2017 Netflix, Inc. - * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include "efiutil.h" #include "efichar.h" #include /* * Dump the data as ASCII data, which is a pretty * printed form */ void asciidump(uint8_t *data, size_t datalen) { size_t i; int len; len = 0; for (i = 0; i < datalen; i++) { if (isprint(data[i])) { len++; if (len > 80) { len = 0; printf("\n"); } printf("%c", data[i]); } else { len +=3; if (len > 80) { len = 0; printf("\n"); } printf("%%%02x", data[i]); } } printf("\n"); } void utf8dump(uint8_t *data, size_t datalen) { char *utf8 = NULL; efi_char *ucs2; /* * NUL terminate the string. Not all strings need it, but some * do and an extra NUL won't change what's printed. 
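* (The allocation below reserves datalen plus one extra efi_char; the * terminating 0 is stored at index datalen / sizeof(efi_char), which rounds * down, so it always lands inside the buffer even for odd datalen.)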
*/ ucs2 = malloc(datalen + sizeof(efi_char)); memcpy(ucs2, data, datalen); ucs2[datalen / sizeof(efi_char)] = 0; ucs2_to_utf8(ucs2, &utf8); printf("%s\n", utf8); free(utf8); free(ucs2); } void hexdump(uint8_t *data, size_t datalen) { size_t i; for (i = 0; i < datalen; i++) { if (i % 16 == 0) { if (i != 0) printf("\n"); printf("%04x: ", (int)i); } printf("%02x ", data[i]); } printf("\n"); } void bindump(uint8_t *data, size_t datalen) { write(1, data, datalen); } #define LOAD_OPTION_ACTIVE 1 #define SIZE(dp, edp) (size_t)((intptr_t)(void *)edp - (intptr_t)(void *)dp) void efi_print_load_option(uint8_t *data, size_t datalen, int Aflag, int bflag, int uflag) { uint8_t *ep = data + datalen; uint8_t *walker = data; uint32_t attr; uint16_t fplen; efi_char *descr; efidp dp, edp; char *str = NULL; char buf[1024]; int len; void *opt; int optlen; if (datalen < sizeof(attr) + sizeof(fplen) + sizeof(efi_char)) return; // First 4 bytes are attribute flags attr = le32dec(walker); walker += sizeof(attr); // Next two bytes are length of the file paths fplen = le16dec(walker); walker += sizeof(fplen); // Next we have a 0 terminated UCS2 string that we know to be aligned descr = (efi_char *)(intptr_t)(void *)walker; len = ucs2len(descr); // XXX need to sanity check that len < (datalen - (ep - walker) / 2) walker += (len + 1) * sizeof(efi_char); if (walker > ep) return; // Now we have fplen bytes worth of file path stuff dp = (efidp)walker; walker += fplen; if (walker > ep) return; edp = (efidp)walker; // Everything left is the binary option args opt = walker; optlen = ep - walker; // We got to here, everything is good printf("%c ", attr & LOAD_OPTION_ACTIVE ? '*' : ' '); ucs2_to_utf8(descr, &str); printf("%s", str); free(str); while (dp < edp && SIZE(dp, edp) > sizeof(efidp_header)) { efidp_format_device_path(buf, sizeof(buf), dp, SIZE(dp, edp)); dp = (efidp)((char *)dp + efidp_size(dp)); printf(" %s\n", buf); } if (optlen == 0) return; printf("Options: "); if (Aflag) asciidump(opt, optlen); else if (bflag) bindump(opt, optlen); else if (uflag) utf8dump(opt, optlen); else hexdump(opt, optlen); } Index: head/usr.sbin/efivar/efiutil.h =================================================================== --- head/usr.sbin/efivar/efiutil.h (revision 343754) +++ head/usr.sbin/efivar/efiutil.h (revision 343755) @@ -1,38 +1,37 @@ /*- * Copyright (c) 2017 Netflix, Inc. - * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * different routines to dump data. */ void asciidump(uint8_t *data, size_t datalen); void bindump(uint8_t *data, size_t datalen); void efi_print_load_option(uint8_t *, size_t, int, int, int); void hexdump(uint8_t *data, size_t datalen); void utf8dump(uint8_t *data, size_t datalen); Index: head/usr.sbin/efivar/efivar.8 =================================================================== --- head/usr.sbin/efivar/efivar.8 (revision 343754) +++ head/usr.sbin/efivar/efivar.8 (revision 343755) @@ -1,196 +1,196 @@ -.\" Copyright (c) 2017 Netflix, Inc -.\" All rights reserved. +.\" +.\" Copyright (c) 2017 Netflix, Inc. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" $FreeBSD$ .\" .Dd May 22, 2017 .Dt EFIVAR 8 .Os .Sh NAME .Nm efivar .Nd UEFI environment variable interaction .Sh SYNOPSIS .Nm .Op Fl abdDHlLNpRtuw .Op Fl n Ar name .Op Fl f Ar file .Op Fl -append .Op Fl -ascii .Op Fl -attributes .Op Fl -binary .Op Fl -delete .Op Fl -device-path .Op Fl -fromfile Ar file .Op Fl -guid .Op Fl -hex .Op Fl -list-guids .Op Fl -list .Op Fl -name Ar name .Op Fl -no-name .Op Fl -print .Op Fl -print-decimal .Op Fl -raw-guid .Op Fl -utf8 .Op Fl -write .Sh DESCRIPTION This program manages .Dq Unified Extensible Firmware Interface .Pq UEFI environment variables. UEFI variables have three parts: a namespace, a name, and a value. The namespace is a GUID that is self-assigned by the group defining the variables. The name is a Unicode name for the variable. The value is binary data. All Unicode data is presented to the user as UTF-8. .Pp The following options are available: .Bl -tag -width 20m .It Fl n Ar name Fl -name Ar name Specify the name of the variable to operate on.
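For example, .Dq 8be4df61-93ca-11d2-aa0d-00e098032b8c-BootOrder names the global boot order variable.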
The .Ar name argument is the GUID of the variable, followed by a dash, followed by the UEFI variable name. The GUID may be in numeric format, or may be one of the well known symbolic names (see .Fl -list-guids for a complete list). .It Fl f Ar file Fl -fromfile Ar file When writing or appending to a variable, take the data for the variable's value from .Ar file instead of from the command line. This flag implies .Fl -write unless the .Fl -append or .Fl -print flags are given. This behavior is not well understood and is currently unimplemented for writes. When .Fl -print is specified, the contents of the file are used as the value to print using any other specified flags. This is used primarily for testing purposes for more complicated variable decoding. .It Fl a Fl -append Append the specified value to the UEFI variable rather than replacing it. .It Fl t Ar attr Fl -attributes Ar attr Specify, in hexadecimal, the attributes for this variable. See section 7.2 (GetVariable subsection, Related Definitions) of the UEFI Specification for hex values to use. .It Fl A Fl -ascii Display the variable data as modified ASCII: all printable characters are printed, while unprintable characters are rendered as a two-digit hexadecimal number preceded by a % character. .It Fl b Fl -binary Display the variable data as binary data. Usually used with the .Fl N or .Fl -no-name flag. Useful in scripts. .It Fl D Fl -delete Delete the specified variable. May not be used with either the .Fl -write or the .Fl -append flags. No .Ar value may be specified. .It Fl d Fl -device Fl -device-path Interpret the variables printed as UEFI device paths and print the UEFI standard string representation. .It Fl g Fl -guid Convert GUIDs to names when they are known (that is, when they appear in the .Fl -list-guids output). .It Fl H Fl -hex List variable data as a hex dump. .It Fl L Fl -list-guids Lists the well known GUIDs. The names listed here may be used in place of the numeric GUID values. These names will replace the numeric GUID values unless the .Fl -raw-guid flag is specified. .It Fl l Fl -list List all the variables. If the .Fl -print flag is also specified, their values will be displayed. .It Fl N Fl -no-name Do not display the variable name. .It Fl p Fl -print Print the value of the variable. .It Fl R Fl -raw-guid Do not substitute well known names for GUID numeric values in output. .It Fl u Fl -utf8 Treat the value of the variable as UCS-2, convert it to UTF-8, and print the result. .It Fl w Fl -write Write (replace) the variable specified with the value specified from standard input. No command line option to do this is available since UEFI variables are binary structures rather than strings. .Xr echo 1 .Fl n can be used to specify simple strings. .It Ar name Display the .Ar name environment variable. .El .Sh COMPATIBILITY The .Nm program is intended to be compatible (strict superset) with a program of the same name included in the Red Hat libefivar package, but that program's .Fl d and .Fl -print-decimal flags are not implemented and never will be. .Pp In this implementation, the .Fl d flag is short for .Fl -device-path . .Sh SEE ALSO Appendix A of the UEFI specification has the format for GUIDs. All GUIDs .Dq Globally Unique Identifiers have the format described in RFC 4122. .Sh HISTORY The .Nm utility first appeared in .Fx 11.1 .
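As a quick illustration of the libefivar API that efivar(8) and efibootmgr wrap, the following minimal sketch reads the global BootOrder variable and decodes its 16-bit little-endian entries the same way print_boot_vars() does above. It is not part of this change; it assumes EFI_GLOBAL_GUID and efi_get_variable() come from <efivar.h> as in the sources here, that the program is linked with -lefivar, and that it runs with enough privilege to reach the EFI runtime.

/*
 * Illustrative sketch only (not part of this change): read the global
 * BootOrder variable with libefivar and decode its little-endian
 * 16-bit entries, mirroring print_boot_vars() in efibootmgr above.
 */
#include <sys/endian.h>
#include <err.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <efivar.h>

int
main(void)
{
	uint8_t *data;
	size_t size, i;
	uint32_t attrs;

	if (efi_get_variable(EFI_GLOBAL_GUID, "BootOrder", &data, &size,
	    &attrs) <= 0)
		err(1, "reading BootOrder");
	for (i = 0; i + 1 < size; i += 2)	/* two bytes per entry */
		printf("Boot%04X\n", le16dec(data + i));
	free(data);
	return (0);
}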
Index: head/usr.sbin/efivar/efivar.c =================================================================== --- head/usr.sbin/efivar/efivar.c (revision 343754) +++ head/usr.sbin/efivar/efivar.c (revision 343755) @@ -1,386 +1,385 @@ /*- * Copyright (c) 2016 Netflix, Inc. - * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include "efiutil.h" #include "efichar.h" /* options descriptor */ static struct option longopts[] = { { "append", no_argument, NULL, 'a' }, { "ascii", no_argument, NULL, 'A' }, { "attributes", required_argument, NULL, 't' }, { "binary", no_argument, NULL, 'b' }, { "delete", no_argument, NULL, 'D' }, { "device", no_argument, NULL, 'd' }, { "device-path", no_argument, NULL, 'd' }, { "fromfile", required_argument, NULL, 'f' }, { "guid", no_argument, NULL, 'g' }, { "hex", no_argument, NULL, 'H' }, { "list-guids", no_argument, NULL, 'L' }, { "list", no_argument, NULL, 'l' }, { "load-option", no_argument, NULL, 'O' }, { "name", required_argument, NULL, 'n' }, { "no-name", no_argument, NULL, 'N' }, { "print", no_argument, NULL, 'p' }, { "print-decimal", no_argument, NULL, 'd' }, { "raw-guid", no_argument, NULL, 'R' }, { "utf8", no_argument, NULL, 'u' }, { "write", no_argument, NULL, 'w' }, { NULL, 0, NULL, 0 } }; static int aflag, Aflag, bflag, dflag, Dflag, gflag, Hflag, Nflag, lflag, Lflag, Rflag, wflag, pflag, uflag, load_opt_flag; static char *varname; static char *fromfile; static u_long attrib = EFI_VARIABLE_NON_VOLATILE | EFI_VARIABLE_BOOTSERVICE_ACCESS | EFI_VARIABLE_RUNTIME_ACCESS; static void usage(void) { errx(1, "efivar [-abdDHlLNpRtuw] [-n name] [-f file] [--append] [--ascii]\n" "\t[--attributes] [--binary] [--delete] [--fromfile file] [--hex]\n" "\t[--list-guids] [--list] [--load-option] [--name name] [--no-name]\n" "\t[--print] [--print-decimal] [--raw-guid] [--utf8] [--write]\n" "\tname[=value]"); } static void breakdown_name(char *name, efi_guid_t *guid, char **vname) { char *cp; cp = strrchr(name, '-'); if (cp == NULL) errx(1, "Invalid name: %s", name); *vname = cp + 1; *cp = '\0'; if (efi_name_to_guid(name, guid) < 0) errx(1, "Invalid guid %s", name); } static uint8_t * get_value(char *val, size_t *datalen) { static 
char buffer[16*1024]; if (val != NULL) { *datalen = strlen(val); return ((uint8_t *)val); } /* Read from stdin */ *datalen = sizeof(buffer); *datalen = read(0, buffer, *datalen); return ((uint8_t *)buffer); } static void append_variable(char *name, char *val) { char *vname; efi_guid_t guid; size_t datalen; uint8_t *data; breakdown_name(name, &guid, &vname); data = get_value(val, &datalen); if (efi_append_variable(guid, vname, data, datalen, attrib) < 0) err(1, "efi_append_variable"); } static void delete_variable(char *name) { char *vname; efi_guid_t guid; breakdown_name(name, &guid, &vname); if (efi_del_variable(guid, vname) < 0) err(1, "efi_del_variable"); } static void write_variable(char *name, char *val) { char *vname; efi_guid_t guid; size_t datalen; uint8_t *data; breakdown_name(name, &guid, &vname); data = get_value(val, &datalen); if (efi_set_variable(guid, vname, data, datalen, attrib) < 0) err(1, "efi_set_variable"); } static void devpath_dump(uint8_t *data, size_t datalen) { char buffer[1024]; efidp_format_device_path(buffer, sizeof(buffer), (const_efidp)data, datalen); if (!Nflag) printf(": "); printf("%s\n", buffer); } static void pretty_guid(efi_guid_t *guid, char **gname) { char *pretty = NULL; if (gflag) efi_guid_to_name(guid, &pretty); if (pretty == NULL) efi_guid_to_str(guid, gname); else *gname = pretty; } static void print_var(efi_guid_t *guid, char *name) { uint32_t att; uint8_t *data; size_t datalen; char *gname = NULL; int rv; if (guid) pretty_guid(guid, &gname); if (pflag || fromfile) { if (fromfile) { int fd; fd = open(fromfile, O_RDONLY); if (fd < 0) err(1, "open %s", fromfile); data = malloc(64 * 1024); if (data == NULL) err(1, "malloc"); datalen = read(fd, data, 64 * 1024); if (datalen <= 0) err(1, "read"); close(fd); } else { rv = efi_get_variable(*guid, name, &data, &datalen, &att); if (rv < 0) err(1, "fetching %s-%s", gname, name); } if (!Nflag) printf("%s-%s\n", gname, name); if (load_opt_flag) efi_print_load_option(data, datalen, Aflag, bflag, uflag); else if (Aflag) asciidump(data, datalen); else if (uflag) utf8dump(data, datalen); else if (bflag) bindump(data, datalen); else if (dflag) devpath_dump(data, datalen); else hexdump(data, datalen); } else { printf("%s-%s", gname, name); } free(gname); if (!Nflag) printf("\n"); } static void print_variable(char *name) { char *vname; efi_guid_t guid; breakdown_name(name, &guid, &vname); print_var(&guid, vname); } static void print_variables(void) { int rv; char *name = NULL; efi_guid_t *guid = NULL; while ((rv = efi_get_next_variable_name(&guid, &name)) > 0) print_var(guid, name); if (rv < 0) err(1, "Error listing names"); } static void print_known_guid(void) { struct uuid_table *tbl; int i, n; n = efi_known_guid(&tbl); for (i = 0; i < n; i++) printf("%s %s\n", tbl[i].uuid_str, tbl[i].name); } static void parse_args(int argc, char **argv) { int ch, i; while ((ch = getopt_long(argc, argv, "aAbdDf:gHlLNn:OpRt:uw", longopts, NULL)) != -1) { switch (ch) { case 'a': aflag++; break; case 'A': Aflag++; break; case 'b': bflag++; break; case 'd': dflag++; break; case 'D': Dflag++; break; case 'g': gflag++; break; case 'H': Hflag++; break; case 'l': lflag++; break; case 'L': Lflag++; break; case 'n': varname = optarg; break; case 'N': Nflag++; break; case 'O': load_opt_flag++; break; case 'p': pflag++; break; case 'R': Rflag++; break; case 't': attrib = strtoul(optarg, NULL, 16); break; case 'u': uflag++; break; case 'w': wflag++; break; case 'f': free(fromfile); fromfile = strdup(optarg); break; case 0: errx(1, "unknown 
or unimplemented option\n"); break; default: usage(); } } argc -= optind; argv += optind; if (argc == 1) varname = argv[0]; if (aflag + Dflag + wflag > 1) { warnx("Can only use one of -a (--append), " "-D (--delete) and -w (--write)"); usage(); } if (aflag + Dflag + wflag > 0 && varname == NULL) { warnx("Must specify a variable for -a (--append), " "-D (--delete) or -w (--write)"); usage(); } if (aflag) append_variable(varname, NULL); else if (Dflag) delete_variable(varname); else if (wflag) write_variable(varname, NULL); else if (Lflag) print_known_guid(); else if (fromfile) { Nflag = 1; print_var(NULL, NULL); } else if (varname) { pflag++; print_variable(varname); } else if (argc > 0) { pflag++; for (i = 0; i < argc; i++) print_variable(argv[i]); } else print_variables(); } int main(int argc, char **argv) { parse_args(argc, argv); } Index: head/usr.sbin/mpsutil/mps_cmd.c =================================================================== --- head/usr.sbin/mpsutil/mps_cmd.c (revision 343754) +++ head/usr.sbin/mpsutil/mps_cmd.c (revision 343755) @@ -1,730 +1,729 @@ /*- * Copyright (c) 2015 Baptiste Daroussin * * Copyright (c) 2015 Netflix, Inc. - * All rights reserved. * Written by: Scott Long * * Copyright (c) 2008 Yahoo!, Inc. * All rights reserved. * Written by: John Baldwin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __RCSID("$FreeBSD$"); #include #include #include #if 0 #include #else #include "mps_ioctl.h" #include "mpr_ioctl.h" #endif #include #include #include #include #include #include #include #include #include "mpsutil.h" #ifndef USE_MPT_IOCTLS #define USE_MPT_IOCTLS #endif static const char *mps_ioc_status_codes[] = { "Success", /* 0x0000 */ "Invalid function", "Busy", "Invalid scatter-gather list", "Internal error", "Reserved", "Insufficient resources", "Invalid field", "Invalid state", /* 0x0008 */ "Operation state not supported", NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 0x0010 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 0x0018 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, "Invalid configuration action", /* 0x0020 */ "Invalid configuration type", "Invalid configuration page", "Invalid configuration data", "No configuration defaults", "Unable to commit configuration change", NULL, NULL, NULL, /* 0x0028 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 0x0030 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 0x0038 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, "Recovered SCSI error", /* 0x0040 */ "Invalid SCSI bus", "Invalid SCSI target ID", "SCSI device not there", "SCSI data overrun", "SCSI data underrun", "SCSI I/O error", "SCSI protocol error", "SCSI task terminated", /* 0x0048 */ "SCSI residual mismatch", "SCSI task management failed", "SCSI I/O controller terminated", "SCSI external controller terminated", "EEDP guard error", "EEDP reference tag error", "EEDP application tag error", NULL, /* 0x0050 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 0x0058 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, "SCSI target priority I/O", /* 0x0060 */ "Invalid SCSI target port", "Invalid SCSI target I/O index", "SCSI target aborted", "No connection retryable", "No connection", "FC aborted", "Invalid FC receive ID", "FC did invalid", /* 0x0068 */ "FC node logged out", "Transfer count mismatch", "STS data not set", "FC exchange canceled", "Data offset error", "Too much write data", "IU too short", "ACK NAK timeout", /* 0x0070 */ "NAK received", NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 0x0078 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, "LAN device not found", /* 0x0080 */ "LAN device failure", "LAN transmit error", "LAN transmit aborted", "LAN receive error", "LAN receive aborted", "LAN partial packet", "LAN canceled", NULL, /* 0x0088 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, "SAS SMP request failed", /* 0x0090 */ "SAS SMP data overrun", NULL, NULL, NULL, NULL, NULL, NULL, "Inband aborted", /* 0x0098 */ "No inband connection", NULL, NULL, NULL, NULL, NULL, NULL, "Diagnostic released", /* 0x00A0 */ }; struct mprs_pass_thru { uint64_t PtrRequest; uint64_t PtrReply; uint64_t PtrData; uint32_t RequestSize; uint32_t ReplySize; uint32_t DataSize; uint32_t DataDirection; uint64_t PtrDataOut; uint32_t DataOutSize; uint32_t Timeout; }; struct mprs_btdh_mapping { uint16_t TargetID; uint16_t Bus; uint16_t DevHandle; uint16_t Reserved; }; const char * mps_ioc_status(U16 IOCStatus) { static char buffer[16]; IOCStatus &= MPI2_IOCSTATUS_MASK; if (IOCStatus < sizeof(mps_ioc_status_codes) / sizeof(char *) && mps_ioc_status_codes[IOCStatus] != NULL) return (mps_ioc_status_codes[IOCStatus]); snprintf(buffer, sizeof(buffer), "Status: 0x%04x", IOCStatus); return (buffer); } #ifdef USE_MPT_IOCTLS int mps_map_btdh(int fd, uint16_t *devhandle, uint16_t *bus, uint16_t *target) { int error; struct mprs_btdh_mapping map; map.Bus = *bus; map.TargetID = *target; map.DevHandle = *devhandle; 
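/* * MPTIOCTL_BTDH_MAPPING is a bidirectional lookup: the caller fills in * either the bus/target pair or the device handle and sets the unknown * fields to 0xffff (as show_devices() does), and the driver rewrites the * structure with the completed mapping, which is copied back out below. */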
if ((error = ioctl(fd, MPTIOCTL_BTDH_MAPPING, &map)) != 0) { error = errno; warn("Failed to map bus/target/device"); return (error); } *bus = map.Bus; *target = map.TargetID; *devhandle = map.DevHandle; return (0); } int mps_read_config_page_header(int fd, U8 PageType, U8 PageNumber, U32 PageAddress, MPI2_CONFIG_PAGE_HEADER *header, U16 *IOCStatus) { MPI2_CONFIG_REQUEST req; MPI2_CONFIG_REPLY reply; bzero(&req, sizeof(req)); req.Function = MPI2_FUNCTION_CONFIG; req.Action = MPI2_CONFIG_ACTION_PAGE_HEADER; req.Header.PageType = PageType; req.Header.PageNumber = PageNumber; req.PageAddress = PageAddress; if (mps_pass_command(fd, &req, sizeof(req), &reply, sizeof(reply), NULL, 0, NULL, 0, 30)) return (errno); if (!IOC_STATUS_SUCCESS(reply.IOCStatus)) { if (IOCStatus != NULL) *IOCStatus = reply.IOCStatus; return (EIO); } if (header == NULL) return (EINVAL); *header = reply.Header; return (0); } int mps_read_ext_config_page_header(int fd, U8 ExtPageType, U8 PageNumber, U32 PageAddress, MPI2_CONFIG_PAGE_HEADER *header, U16 *ExtPageLength, U16 *IOCStatus) { MPI2_CONFIG_REQUEST req; MPI2_CONFIG_REPLY reply; bzero(&req, sizeof(req)); req.Function = MPI2_FUNCTION_CONFIG; req.Action = MPI2_CONFIG_ACTION_PAGE_HEADER; req.Header.PageType = MPI2_CONFIG_PAGETYPE_EXTENDED; req.ExtPageType = ExtPageType; req.Header.PageNumber = PageNumber; req.PageAddress = PageAddress; if (mps_pass_command(fd, &req, sizeof(req), &reply, sizeof(reply), NULL, 0, NULL, 0, 30)) return (errno); if (!IOC_STATUS_SUCCESS(reply.IOCStatus)) { if (IOCStatus != NULL) *IOCStatus = reply.IOCStatus; return (EIO); } if ((header == NULL) || (ExtPageLength == NULL)) return (EINVAL); *header = reply.Header; *ExtPageLength = reply.ExtPageLength; return (0); } void * mps_read_config_page(int fd, U8 PageType, U8 PageNumber, U32 PageAddress, U16 *IOCStatus) { MPI2_CONFIG_REQUEST req; MPI2_CONFIG_PAGE_HEADER header; MPI2_CONFIG_REPLY reply; void *buf; int error, len; bzero(&header, sizeof(header)); error = mps_read_config_page_header(fd, PageType, PageNumber, PageAddress, &header, IOCStatus); if (error) { errno = error; return (NULL); } bzero(&req, sizeof(req)); req.Function = MPI2_FUNCTION_CONFIG; req.Action = MPI2_CONFIG_ACTION_PAGE_READ_CURRENT; req.PageAddress = PageAddress; req.Header = header; if (req.Header.PageLength == 0) req.Header.PageLength = 4; len = req.Header.PageLength * 4; buf = malloc(len); if (mps_pass_command(fd, &req, sizeof(req), &reply, sizeof(reply), buf, len, NULL, 0, 30)) { error = errno; free(buf); errno = error; return (NULL); } if (!IOC_STATUS_SUCCESS(reply.IOCStatus)) { if (IOCStatus != NULL) *IOCStatus = reply.IOCStatus; else warnx("Reading config page failed: 0x%x %s", reply.IOCStatus, mps_ioc_status(reply.IOCStatus)); free(buf); errno = EIO; return (NULL); } return (buf); } void * mps_read_extended_config_page(int fd, U8 ExtPageType, U8 PageVersion, U8 PageNumber, U32 PageAddress, U16 *IOCStatus) { MPI2_CONFIG_REQUEST req; MPI2_CONFIG_PAGE_HEADER header; MPI2_CONFIG_REPLY reply; U16 pagelen; void *buf; int error, len; if (IOCStatus != NULL) *IOCStatus = MPI2_IOCSTATUS_SUCCESS; bzero(&header, sizeof(header)); error = mps_read_ext_config_page_header(fd, ExtPageType, PageNumber, PageAddress, &header, &pagelen, IOCStatus); if (error) { errno = error; return (NULL); } bzero(&req, sizeof(req)); req.Function = MPI2_FUNCTION_CONFIG; req.Action = MPI2_CONFIG_ACTION_PAGE_READ_CURRENT; req.PageAddress = PageAddress; req.Header = header; if (pagelen == 0) pagelen = 4; req.ExtPageLength = pagelen; req.ExtPageType = 
ExtPageType; len = pagelen * 4; buf = malloc(len); if (mps_pass_command(fd, &req, sizeof(req), &reply, sizeof(reply), buf, len, NULL, 0, 30)) { error = errno; free(buf); errno = error; return (NULL); } if (!IOC_STATUS_SUCCESS(reply.IOCStatus)) { if (IOCStatus != NULL) *IOCStatus = reply.IOCStatus; else warnx("Reading extended config page failed: %s", mps_ioc_status(reply.IOCStatus)); free(buf); errno = EIO; return (NULL); } return (buf); } int mps_firmware_send(int fd, unsigned char *fw, uint32_t len, bool bios) { MPI2_FW_DOWNLOAD_REQUEST req; MPI2_FW_DOWNLOAD_REPLY reply; bzero(&req, sizeof(req)); bzero(&reply, sizeof(reply)); req.Function = MPI2_FUNCTION_FW_DOWNLOAD; req.ImageType = bios ? MPI2_FW_DOWNLOAD_ITYPE_BIOS : MPI2_FW_DOWNLOAD_ITYPE_FW; req.TotalImageSize = len; req.MsgFlags = MPI2_FW_DOWNLOAD_MSGFLGS_LAST_SEGMENT; if (mps_user_command(fd, &req, sizeof(req), &reply, sizeof(reply), fw, len, 0)) { return (-1); } return (0); } int mps_firmware_get(int fd, unsigned char **firmware, bool bios) { MPI2_FW_UPLOAD_REQUEST req; MPI2_FW_UPLOAD_REPLY reply; int size; *firmware = NULL; bzero(&req, sizeof(req)); bzero(&reply, sizeof(reply)); req.Function = MPI2_FUNCTION_FW_UPLOAD; req.ImageType = bios ? MPI2_FW_DOWNLOAD_ITYPE_BIOS : MPI2_FW_DOWNLOAD_ITYPE_FW; if (mps_user_command(fd, &req, sizeof(req), &reply, sizeof(reply), NULL, 0, 0)) { return (-1); } if (reply.ActualImageSize == 0) { return (-1); } size = reply.ActualImageSize; *firmware = calloc(size, sizeof(unsigned char)); if (*firmware == NULL) { warn("calloc"); return (-1); } if (mps_user_command(fd, &req, sizeof(req), &reply, sizeof(reply), *firmware, size, 0)) { free(*firmware); return (-1); } return (size); } #else int mps_read_config_page_header(int fd, U8 PageType, U8 PageNumber, U32 PageAddress, MPI2_CONFIG_PAGE_HEADER *header, U16 *IOCStatus) { struct mps_cfg_page_req req; if (IOCStatus != NULL) *IOCStatus = MPI2_IOCSTATUS_SUCCESS; if (header == NULL) return (EINVAL); bzero(&req, sizeof(req)); req.header.PageType = PageType; req.header.PageNumber = PageNumber; req.page_address = PageAddress; if (ioctl(fd, MPSIO_READ_CFG_HEADER, &req) < 0) return (errno); if (!IOC_STATUS_SUCCESS(req.ioc_status)) { if (IOCStatus != NULL) *IOCStatus = req.ioc_status; return (EIO); } bcopy(&req.header, header, sizeof(*header)); return (0); } void * mps_read_config_page(int fd, U8 PageType, U8 PageNumber, U32 PageAddress, U16 *IOCStatus) { struct mps_cfg_page_req req; void *buf; int error; error = mps_read_config_page_header(fd, PageType, PageNumber, PageAddress, &req.header, IOCStatus); if (error) { errno = error; return (NULL); } if (req.header.PageLength == 0) req.header.PageLength = 4; req.len = req.header.PageLength * 4; buf = malloc(req.len); req.buf = buf; bcopy(&req.header, buf, sizeof(req.header)); if (ioctl(fd, MPSIO_READ_CFG_PAGE, &req) < 0) { error = errno; free(buf); errno = error; return (NULL); } if (!IOC_STATUS_SUCCESS(req.ioc_status)) { if (IOCStatus != NULL) *IOCStatus = req.ioc_status; else warnx("Reading config page failed: 0x%x %s", req.ioc_status, mps_ioc_status(req.ioc_status)); free(buf); errno = EIO; return (NULL); } return (buf); } void * mps_read_extended_config_page(int fd, U8 ExtPageType, U8 PageVersion, U8 PageNumber, U32 PageAddress, U16 *IOCStatus) { struct mps_ext_cfg_page_req req; void *buf; int error; if (IOCStatus != NULL) *IOCStatus = MPI2_IOCSTATUS_SUCCESS; bzero(&req, sizeof(req)); req.header.PageVersion = PageVersion; req.header.PageNumber = PageNumber; req.header.ExtPageType = ExtPageType; req.page_address = 
PageAddress; if (ioctl(fd, MPSIO_READ_EXT_CFG_HEADER, &req) < 0) return (NULL); if (!IOC_STATUS_SUCCESS(req.ioc_status)) { if (IOCStatus != NULL) *IOCStatus = req.ioc_status; else warnx("Reading extended config page header failed: %s", mps_ioc_status(req.ioc_status)); errno = EIO; return (NULL); } req.len = req.header.ExtPageLength * 4; buf = malloc(req.len); req.buf = buf; bcopy(&req.header, buf, sizeof(req.header)); if (ioctl(fd, MPSIO_READ_EXT_CFG_PAGE, &req) < 0) { error = errno; free(buf); errno = error; return (NULL); } if (!IOC_STATUS_SUCCESS(req.ioc_status)) { if (IOCStatus != NULL) *IOCStatus = req.ioc_status; else warnx("Reading extended config page failed: %s", mps_ioc_status(req.ioc_status)); free(buf); errno = EIO; return (NULL); } return (buf); } #endif int mps_open(int unit) { char path[MAXPATHLEN]; snprintf(path, sizeof(path), "/dev/mp%s%d", is_mps ? "s": "r", unit); return (open(path, O_RDWR)); } int mps_user_command(int fd, void *req, uint32_t req_len, void *reply, uint32_t reply_len, void *buffer, int len, uint32_t flags) { struct mps_usr_command cmd; bzero(&cmd, sizeof(struct mps_usr_command)); cmd.req = req; cmd.req_len = req_len; cmd.rpl = reply; cmd.rpl_len = reply_len; cmd.buf = buffer; cmd.len = len; cmd.flags = flags; if (ioctl(fd, is_mps ? MPSIO_MPS_COMMAND : MPRIO_MPR_COMMAND, &cmd) < 0) return (errno); return (0); } int mps_pass_command(int fd, void *req, uint32_t req_len, void *reply, uint32_t reply_len, void *data_in, uint32_t datain_len, void *data_out, uint32_t dataout_len, uint32_t timeout) { struct mprs_pass_thru pass; pass.PtrRequest = (uint64_t)(uintptr_t)req; pass.PtrReply = (uint64_t)(uintptr_t)reply; pass.PtrData = (uint64_t)(uintptr_t)data_in; pass.PtrDataOut = (uint64_t)(uintptr_t)data_out; pass.RequestSize = req_len; pass.ReplySize = reply_len; pass.DataSize = datain_len; pass.DataOutSize = dataout_len; if (datain_len && dataout_len) { if (is_mps) { pass.DataDirection = MPS_PASS_THRU_DIRECTION_BOTH; } else { pass.DataDirection = MPR_PASS_THRU_DIRECTION_BOTH; } } else if (datain_len) { if (is_mps) { pass.DataDirection = MPS_PASS_THRU_DIRECTION_READ; } else { pass.DataDirection = MPR_PASS_THRU_DIRECTION_READ; } } else if (dataout_len) { if (is_mps) { pass.DataDirection = MPS_PASS_THRU_DIRECTION_WRITE; } else { pass.DataDirection = MPR_PASS_THRU_DIRECTION_WRITE; } } else { if (is_mps) { pass.DataDirection = MPS_PASS_THRU_DIRECTION_NONE; } else { pass.DataDirection = MPR_PASS_THRU_DIRECTION_NONE; } } pass.Timeout = timeout; if (ioctl(fd, MPTIOCTL_PASS_THRU, &pass) < 0) return (errno); return (0); } MPI2_IOC_FACTS_REPLY * mps_get_iocfacts(int fd) { MPI2_IOC_FACTS_REPLY *facts; MPI2_IOC_FACTS_REQUEST req; int error; facts = malloc(sizeof(MPI2_IOC_FACTS_REPLY)); if (facts == NULL) { errno = ENOMEM; return (NULL); } bzero(&req, sizeof(MPI2_IOC_FACTS_REQUEST)); req.Function = MPI2_FUNCTION_IOC_FACTS; #if 1 error = mps_pass_command(fd, &req, sizeof(MPI2_IOC_FACTS_REQUEST), facts, sizeof(MPI2_IOC_FACTS_REPLY), NULL, 0, NULL, 0, 10); #else error = mps_user_command(fd, &req, sizeof(MPI2_IOC_FACTS_REQUEST), facts, sizeof(MPI2_IOC_FACTS_REPLY), NULL, 0, 0); #endif if (error) { free(facts); return (NULL); } if (!IOC_STATUS_SUCCESS(facts->IOCStatus)) { free(facts); errno = EINVAL; return (NULL); } return (facts); } Index: head/usr.sbin/mpsutil/mps_debug.c =================================================================== --- head/usr.sbin/mpsutil/mps_debug.c (revision 343754) +++ head/usr.sbin/mpsutil/mps_debug.c (revision 343755) @@ -1,190 +1,189 @@ /*- * 
Copyright (c) 2018 Netflix, Inc. - * All rights reserved. * Written by: Scott Long * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __RCSID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include "mpsutil.h" MPS_TABLE(top, debug); struct mps_dumpreq_hdr { uint32_t smid; uint32_t state; uint32_t numframes; uint32_t deschi; uint32_t desclo; }; static int find_sgl(char *); static void print_sgl(char *, int, int); #define MPS_FRAME_LEN 128 static int debug_dumpreqs(int ac, char **av) { struct mps_dumpreq_hdr *hdr; char *buf, sysctlbuf[128]; size_t len; int numframes, error, offset; len = 0; buf = NULL; snprintf(sysctlbuf, sizeof(sysctlbuf), "dev.%s.%d.dump_reqs", is_mps ? 
"mps" : "mpr", mps_unit); error = sysctlbyname(sysctlbuf, NULL, &len, NULL, 0); if (error) return (error); if (len == 0) return (0); buf = malloc(len); if (buf == NULL) return (ENOMEM); error = sysctlbyname(sysctlbuf, buf, &len, NULL, 0); if (error) { printf("len= %zd, error= %d errno= %d\n", len, error, errno); return (error); } while (len >= MPS_FRAME_LEN) { hdr = (struct mps_dumpreq_hdr *)buf; numframes = hdr->numframes; printf("SMID= %d state= %#x numframes= %d desc.hi= %#08x " "desc.lo= %#08x\n", hdr->smid, hdr->state, hdr->numframes, hdr->deschi, hdr->desclo); buf += sizeof(struct mps_dumpreq_hdr); len -= sizeof(struct mps_dumpreq_hdr); if ((offset = find_sgl(buf)) != -1) print_sgl(buf, offset, numframes); buf += MPS_FRAME_LEN * numframes; len -= MPS_FRAME_LEN * numframes; } return (error); } static int find_sgl(char *buf) { MPI2_REQUEST_HEADER *req; MPI2_SCSI_IO_REQUEST *scsi; int offset = 0; req = (MPI2_REQUEST_HEADER *)buf; switch (req->Function) { case MPI2_FUNCTION_SCSI_IO_REQUEST: scsi = (MPI2_SCSI_IO_REQUEST *)buf; offset = scsi->SGLOffset0; break; default: offset = -1; } return (offset); } #define SGL_FLAGS "\10LastElement\7EndOfBuffer\4Local\3Host2IOC\2Addr64\1EndOfList" static void print_sgl(char *buf, int offset, int numframes) { MPI2_SGE_SIMPLE64 *sge; MPI2_SGE_CHAIN_UNION *sgc; MPI2_REQUEST_HEADER *req; u_int i = 0, flags; char *frame, tmpbuf[128]; req = (MPI2_REQUEST_HEADER *)buf; frame = (char *)buf; sge = (MPI2_SGE_SIMPLE64 *)&frame[offset * 4]; printf("SGL for command\n"); hexdump(frame, MPS_FRAME_LEN, NULL, 0); while (frame != NULL) { flags = sge->FlagsLength >> MPI2_SGE_FLAGS_SHIFT; bzero(tmpbuf, sizeof(tmpbuf)); mps_parse_flags(flags, SGL_FLAGS, tmpbuf, sizeof(tmpbuf)); printf("seg%d flags=%x %s len= 0x%06x addr=0x%016jx\n", i, flags, tmpbuf, sge->FlagsLength & 0xffffff, mps_to_u64(&sge->Address)); if (flags & (MPI2_SGE_FLAGS_END_OF_LIST | MPI2_SGE_FLAGS_END_OF_BUFFER)) break; sge++; i++; if (flags & MPI2_SGE_FLAGS_LAST_ELEMENT) { sgc = (MPI2_SGE_CHAIN_UNION *)sge; if ((sgc->Flags & MPI2_SGE_FLAGS_CHAIN_ELEMENT) == 0) { printf("Invalid chain element\n"); break; } bzero(tmpbuf, sizeof(tmpbuf)); mps_parse_flags(sgc->Flags, SGL_FLAGS, tmpbuf, sizeof(tmpbuf)); if (sgc->Flags & MPI2_SGE_FLAGS_64_BIT_ADDRESSING) printf("chain64 flags=0x%x %s len=0x%x " "Offset=0x%x addr=0x%016jx\n", sgc->Flags, tmpbuf, sgc->Length, sgc->NextChainOffset, mps_to_u64(&sgc->u.Address64)); else printf("chain32 flags=0x%x %s len=0x%x " "Offset=0x%x addr=0x%08x\n", sgc->Flags, tmpbuf, sgc->Length, sgc->NextChainOffset, sgc->u.Address32); if (--numframes <= 0) break; frame += MPS_FRAME_LEN; sge = (MPI2_SGE_SIMPLE64 *)frame; hexdump(frame, MPS_FRAME_LEN, NULL, 0); } } } MPS_COMMAND(debug, dumpreqs, debug_dumpreqs, "", "Dump the active request queue") Index: head/usr.sbin/mpsutil/mps_show.c =================================================================== --- head/usr.sbin/mpsutil/mps_show.c (revision 343754) +++ head/usr.sbin/mpsutil/mps_show.c (revision 343755) @@ -1,811 +1,810 @@ /*- * Copyright (c) 2015 Netflix, Inc. - * All rights reserved. * Written by: Scott Long * * Copyright (c) 2008 Yahoo!, Inc. * All rights reserved. * Written by: John Baldwin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __RCSID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include "mpsutil.h" static char * get_device_speed(uint8_t rate); static char * get_device_type(uint32_t di); static int show_all(int ac, char **av); static int show_devices(int ac, char **av); static int show_enclosures(int ac, char **av); static int show_expanders(int ac, char **av); MPS_TABLE(top, show); #define STANDALONE_STATE "ONLINE" static int show_adapter(int ac, char **av) { MPI2_CONFIG_PAGE_SASIOUNIT_0 *sas0; MPI2_CONFIG_PAGE_SASIOUNIT_1 *sas1; MPI2_SAS_IO_UNIT0_PHY_DATA *phy0; MPI2_SAS_IO_UNIT1_PHY_DATA *phy1; MPI2_CONFIG_PAGE_MAN_0 *man0; MPI2_CONFIG_PAGE_BIOS_3 *bios3; MPI2_IOC_FACTS_REPLY *facts; U16 IOCStatus; char *speed, *minspeed, *maxspeed, *isdisabled, *type; char devhandle[5], ctrlhandle[5]; int error, fd, v, i; if (ac != 1) { warnx("show adapter: extra arguments"); return (EINVAL); } fd = mps_open(mps_unit); if (fd < 0) { error = errno; warn("mps_open"); return (error); } man0 = mps_read_man_page(fd, 0, NULL); if (man0 == NULL) { error = errno; warn("Failed to get controller info"); return (error); } if (man0->Header.PageLength < sizeof(*man0) / 4) { warnx("Invalid controller info"); return (EINVAL); } printf("mp%s%d Adapter:\n", is_mps ? "s": "r", mps_unit); printf(" Board Name: %.16s\n", man0->BoardName); printf(" Board Assembly: %.16s\n", man0->BoardAssembly); printf(" Chip Name: %.16s\n", man0->ChipName); printf(" Chip Revision: %.16s\n", man0->ChipRevision); free(man0); bios3 = mps_read_config_page(fd, MPI2_CONFIG_PAGETYPE_BIOS, 3, 0, NULL); if (bios3 == NULL) { error = errno; warn("Failed to get BIOS page 3 info"); return (error); } v = bios3->BiosVersion; printf(" BIOS Revision: %d.%02d.%02d.%02d\n", ((v & 0xff000000) >> 24), ((v &0xff0000) >> 16), ((v & 0xff00) >> 8), (v & 0xff)); free(bios3); if ((facts = mps_get_iocfacts(fd)) == NULL) { printf("could not get controller IOCFacts\n"); close(fd); return (errno); } v = facts->FWVersion.Word; printf("Firmware Revision: %d.%02d.%02d.%02d\n", ((v & 0xff000000) >> 24), ((v &0xff0000) >> 16), ((v & 0xff00) >> 8), (v & 0xff)); printf(" Integrated RAID: %s\n", (facts->IOCCapabilities & MPI2_IOCFACTS_CAPABILITY_INTEGRATED_RAID) ? 
"yes" : "no"); free(facts); fd = mps_open(mps_unit); if (fd < 0) { error = errno; warn("mps_open"); return (error); } sas0 = mps_read_extended_config_page(fd, MPI2_CONFIG_EXTPAGETYPE_SAS_IO_UNIT, MPI2_SASIOUNITPAGE0_PAGEVERSION, 0, 0, &IOCStatus); if (sas0 == NULL) { error = errno; warn("Error retrieving SAS IO Unit page %d", IOCStatus); free(sas0); close(fd); return (error); } sas1 = mps_read_extended_config_page(fd, MPI2_CONFIG_EXTPAGETYPE_SAS_IO_UNIT, MPI2_SASIOUNITPAGE1_PAGEVERSION, 1, 0, &IOCStatus); if (sas1 == NULL) { error = errno; warn("Error retrieving SAS IO Unit page %d", IOCStatus); free(sas0); close(fd); return (error); } printf("\n"); printf("%-8s%-12s%-11s%-10s%-8s%-7s%-7s%s\n", "PhyNum", "CtlrHandle", "DevHandle", "Disabled", "Speed", "Min", "Max", "Device"); for (i = 0; i < sas0->NumPhys; i++) { phy0 = &sas0->PhyData[i]; phy1 = &sas1->PhyData[i]; if (phy0->PortFlags & MPI2_SASIOUNIT0_PORTFLAGS_DISCOVERY_IN_PROGRESS) { printf("Discovery still in progress\n"); continue; } if (phy0->PhyFlags & MPI2_SASIOUNIT0_PHYFLAGS_PHY_DISABLED) isdisabled = "Y"; else isdisabled = "N"; minspeed = get_device_speed(phy1->MaxMinLinkRate); maxspeed = get_device_speed(phy1->MaxMinLinkRate >> 4); type = get_device_type(phy0->ControllerPhyDeviceInfo); if (phy0->AttachedDevHandle != 0) { snprintf(devhandle, 5, "%04x", phy0->AttachedDevHandle); snprintf(ctrlhandle, 5, "%04x", phy0->ControllerDevHandle); speed = get_device_speed(phy0->NegotiatedLinkRate); } else { snprintf(devhandle, 5, " "); snprintf(ctrlhandle, 5, " "); speed = " "; } printf("%-8d%-12s%-11s%-10s%-8s%-7s%-7s%s\n", i, ctrlhandle, devhandle, isdisabled, speed, minspeed, maxspeed, type); } free(sas0); free(sas1); printf("\n"); close(fd); return (0); } MPS_COMMAND(show, adapter, show_adapter, "", "display controller information") static int show_iocfacts(int ac, char **av) { MPI2_IOC_FACTS_REPLY *facts; char tmpbuf[128]; int error, fd; fd = mps_open(mps_unit); if (fd < 0) { error = errno; warn("mps_open"); return (error); } if ((facts = mps_get_iocfacts(fd)) == NULL) { printf("could not get controller IOCFacts\n"); close(fd); return (errno); } #define IOCCAP "\3ScsiTaskFull" "\4DiagTrace" "\5SnapBuf" "\6ExtBuf" \ "\7EEDP" "\10BiDirTarg" "\11Multicast" "\14TransRetry" "\15IR" \ "\16EventReplay" "\17RaidAccel" "\20MSIXIndex" "\21HostDisc" \ "\22FastPath" "\23RDPQArray" "\24AtomicReqDesc" "\25PCIeSRIOV" bzero(tmpbuf, sizeof(tmpbuf)); mps_parse_flags(facts->IOCCapabilities, IOCCAP, tmpbuf, sizeof(tmpbuf)); printf(" MsgVersion: %02d.%02d\n", facts->MsgVersion >> 8, facts->MsgVersion & 0xff); printf(" MsgLength: %d\n", facts->MsgLength); printf(" Function: 0x%x\n", facts->Function); printf(" HeaderVersion: %02d,%02d\n", facts->HeaderVersion >> 8, facts->HeaderVersion & 0xff); printf(" IOCNumber: %d\n", facts->IOCNumber); printf(" MsgFlags: 0x%x\n", facts->MsgFlags); printf(" VP_ID: %d\n", facts->VP_ID); printf(" VF_ID: %d\n", facts->VF_ID); printf(" IOCExceptions: %d\n", facts->IOCExceptions); printf(" IOCStatus: %d\n", facts->IOCStatus); printf(" IOCLogInfo: 0x%x\n", facts->IOCLogInfo); printf(" MaxChainDepth: %d\n", facts->MaxChainDepth); printf(" WhoInit: 0x%x\n", facts->WhoInit); printf(" NumberOfPorts: %d\n", facts->NumberOfPorts); printf(" MaxMSIxVectors: %d\n", facts->MaxMSIxVectors); printf(" RequestCredit: %d\n", facts->RequestCredit); printf(" ProductID: 0x%x\n", facts->ProductID); printf(" IOCCapabilities: 0x%x %s\n", facts->IOCCapabilities, tmpbuf); printf(" FWVersion: 0x%08x\n", facts->FWVersion.Word); printf(" 
IOCRequestFrameSize: %d\n", facts->IOCRequestFrameSize); printf(" MaxInitiators: %d\n", facts->MaxInitiators); printf(" MaxTargets: %d\n", facts->MaxTargets); printf(" MaxSasExpanders: %d\n", facts->MaxSasExpanders); printf(" MaxEnclosures: %d\n", facts->MaxEnclosures); bzero(tmpbuf, sizeof(tmpbuf)); mps_parse_flags(facts->ProtocolFlags, "\4NvmeDevices\2ScsiTarget\1ScsiInitiator", tmpbuf, sizeof(tmpbuf)); printf(" ProtocolFlags: 0x%x %s\n", facts->ProtocolFlags, tmpbuf); printf(" HighPriorityCredit: %d\n", facts->HighPriorityCredit); printf("MaxRepDescPostQDepth: %d\n", facts->MaxReplyDescriptorPostQueueDepth); printf(" ReplyFrameSize: %d\n", facts->ReplyFrameSize); printf(" MaxVolumes: %d\n", facts->MaxVolumes); printf(" MaxDevHandle: %d\n", facts->MaxDevHandle); printf("MaxPersistentEntries: %d\n", facts->MaxPersistentEntries); printf(" MinDevHandle: %d\n", facts->MinDevHandle); free(facts); return (0); } MPS_COMMAND(show, iocfacts, show_iocfacts, "", "Show IOC Facts Message"); static int show_adapters(int ac, char **av) { MPI2_CONFIG_PAGE_MAN_0 *man0; MPI2_IOC_FACTS_REPLY *facts; int unit, fd, error; printf("Device Name\t Chip Name Board Name Firmware\n"); for (unit = 0; unit < MPS_MAX_UNIT; unit++) { fd = mps_open(unit); if (fd < 0) continue; facts = mps_get_iocfacts(fd); if (facts == NULL) { error = errno; warn("Failed to get controller iocfacts"); close(fd); return (error); } man0 = mps_read_man_page(fd, 0, NULL); if (man0 == NULL) { error = errno; warn("Failed to get controller info"); close(fd); free(facts); return (error); } if (man0->Header.PageLength < sizeof(*man0) / 4) { warnx("Invalid controller info"); close(fd); free(man0); free(facts); return (EINVAL); } printf("/dev/mp%s%d\t%16s %16s %08x\n", is_mps ? "s": "r", unit, man0->ChipName, man0->BoardName, facts->FWVersion.Word); free(man0); free(facts); close(fd); } return (0); } MPS_COMMAND(show, adapters, show_adapters, "", "Show a summary of all adapters"); static char * get_device_type(uint32_t di) { if (di & 0x4000) return ("SEP Target "); if (di & 0x2000) return ("ATAPI Target "); if (di & 0x400) return ("SAS Target "); if (di & 0x200) return ("STP Target "); if (di & 0x100) return ("SMP Target "); if (di & 0x80) return ("SATA Target "); if (di & 0x70) return ("SAS Initiator "); if (di & 0x8) return ("SATA Initiator"); if ((di & 0x7) == 0) return ("No Device "); return ("Unknown Device"); } static char * get_enc_type(uint32_t flags, int *issep) { char *type; *issep = 0; switch (flags & 0xf) { case 0x01: type = "Direct Attached SES-2"; *issep = 1; break; case 0x02: type = "Direct Attached SGPIO"; break; case 0x03: type = "Expander SGPIO"; break; case 0x04: type = "External SES-2"; *issep = 1; break; case 0x05: type = "Direct Attached GPIO"; break; case 0x0: default: return ("Unknown"); } return (type); } static char * mps_device_speed[] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "1.5", "3.0", "6.0", "12 " }; static char * get_device_speed(uint8_t rate) { char *speed; rate &= 0xf; if (rate >= sizeof(mps_device_speed) / sizeof(mps_device_speed[0])) return ("Unk"); if ((speed = mps_device_speed[rate]) == NULL) return ("???"); return (speed); } static char * mps_page_name[] = { "IO Unit", "IOC", "BIOS", NULL, NULL, NULL, NULL, NULL, "RAID Volume", "Manufacturing", "RAID Physical Disk", NULL, NULL, NULL, NULL, NULL, "SAS IO Unit", "SAS Expander", "SAS Device", "SAS PHY", "Log", "Enclosure", "RAID Configuration", "Driver Persistent Mapping", "SAS Port", "Ethernet Port", "Extended Manufacturing" }; static char * get_page_name(u_int page) { char *name;
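/* Page types past the end of the table, or with no name assigned, are reported as "Unknown". */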
if (page >= sizeof(mps_page_name) / sizeof(mps_page_name[0])) return ("Unknown"); if ((name = mps_page_name[page]) == NULL) return ("Unknown"); return (name); } static int show_all(int ac, char **av) { int error; printf("Adapter:\n"); error = show_adapter(ac, av); printf("Devices:\n"); error = show_devices(ac, av); printf("Enclosures:\n"); error = show_enclosures(ac, av); printf("Expanders:\n"); error = show_expanders(ac, av); return (error); } MPS_COMMAND(show, all, show_all, "", "Show all devices"); static int show_devices(int ac, char **av) { MPI2_CONFIG_PAGE_SASIOUNIT_0 *sas0; MPI2_SAS_IO_UNIT0_PHY_DATA *phydata; MPI2_CONFIG_PAGE_SAS_DEV_0 *device; MPI2_CONFIG_PAGE_EXPANDER_1 *exp1; uint16_t IOCStatus, handle, bus, target; char *type, *speed, enchandle[5], slot[3], bt[8]; char buf[256]; int fd, error, nphys; fd = mps_open(mps_unit); if (fd < 0) { error = errno; warn("mps_open"); return (error); } sas0 = mps_read_extended_config_page(fd, MPI2_CONFIG_EXTPAGETYPE_SAS_IO_UNIT, MPI2_SASIOUNITPAGE0_PAGEVERSION, 0, 0, &IOCStatus); if (sas0 == NULL) { error = errno; warn("Error retrieving SAS IO Unit page %d", IOCStatus); return (error); } nphys = sas0->NumPhys; printf("B____%-5s%-17s%-8s%-10s%-14s%-6s%-5s%-6s%s\n", "T", "SAS Address", "Handle", "Parent", "Device", "Speed", "Enc", "Slot", "Wdt"); handle = 0xffff; while (1) { device = mps_read_extended_config_page(fd, MPI2_CONFIG_EXTPAGETYPE_SAS_DEVICE, MPI2_SASDEVICE0_PAGEVERSION, 0, MPI2_SAS_DEVICE_PGAD_FORM_GET_NEXT_HANDLE | handle, &IOCStatus); if (device == NULL) { if (IOCStatus == MPI2_IOCSTATUS_CONFIG_INVALID_PAGE) break; error = errno; warn("Error retrieving device page"); close(fd); return (error); } handle = device->DevHandle; if (device->ParentDevHandle == 0x0) { free(device); continue; } bus = 0xffff; target = 0xffff; error = mps_map_btdh(fd, &handle, &bus, &target); if (error) { free(device); continue; } if ((bus == 0xffff) || (target == 0xffff)) snprintf(bt, sizeof(bt), " "); else snprintf(bt, sizeof(bt), "%02d %02d", bus, target); type = get_device_type(device->DeviceInfo); if (device->PhyNum < nphys) { phydata = &sas0->PhyData[device->PhyNum]; speed = get_device_speed(phydata->NegotiatedLinkRate); } else if (device->ParentDevHandle > 0) { exp1 = mps_read_extended_config_page(fd, MPI2_CONFIG_EXTPAGETYPE_SAS_EXPANDER, MPI2_SASEXPANDER1_PAGEVERSION, 1, MPI2_SAS_EXPAND_PGAD_FORM_HNDL_PHY_NUM | (device->PhyNum << MPI2_SAS_EXPAND_PGAD_PHYNUM_SHIFT) | device->ParentDevHandle, &IOCStatus); if (exp1 == NULL) { if (IOCStatus != MPI2_IOCSTATUS_CONFIG_INVALID_PAGE) { error = errno; warn("Error retrieving expander page 1: 0x%x", IOCStatus); close(fd); free(device); return (error); } speed = " "; } else { speed = get_device_speed(exp1->NegotiatedLinkRate); free(exp1); } } else speed = " "; if (device->EnclosureHandle != 0) { snprintf(enchandle, 5, "%04x", device->EnclosureHandle); snprintf(slot, 3, "%02d", device->Slot); } else { snprintf(enchandle, 5, " "); snprintf(slot, 3, " "); } printf("%-10s", bt); snprintf(buf, sizeof(buf), "%08x%08x", device->SASAddress.High, device->SASAddress.Low); printf("%-17s", buf); snprintf(buf, sizeof(buf), "%04x", device->DevHandle); printf("%-8s", buf); snprintf(buf, sizeof(buf), "%04x", device->ParentDevHandle); printf("%-10s", buf); printf("%-14s%-6s%-5s%-6s%d\n", type, speed, enchandle, slot, device->MaxPortConnections); free(device); } printf("\n"); free(sas0); close(fd); return (0); } MPS_COMMAND(show, devices, show_devices, "", "Show attached devices"); static int show_enclosures(int ac, char **av) {
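/*
 * Like show_devices() above, the walk below iterates firmware pages by
 * handle rather than by index: start at handle 0xffff, request
 * GET_NEXT_HANDLE, and stop once the IOC answers
 * MPI2_IOCSTATUS_CONFIG_INVALID_PAGE. A sketch of the loop shape shared
 * by the show_* commands in this file (error handling elided, names as
 * used here):
 *
 *	handle = 0xffff;
 *	while (1) {
 *		pg = mps_read_extended_config_page(fd, exttype, pagevers,
 *		    0, form_get_next_handle | handle, &IOCStatus);
 *		if (pg == NULL) {
 *			if (IOCStatus == MPI2_IOCSTATUS_CONFIG_INVALID_PAGE)
 *				break;
 *			... report errno and bail ...
 *		}
 *		handle = pg->EnclosureHandle;
 *		... print the fields of interest ...
 *		free(pg);
 *	}
 */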
MPI2_CONFIG_PAGE_SAS_ENCLOSURE_0 *enc; char *type, sepstr[5]; uint16_t IOCStatus, handle; int fd, error, issep; fd = mps_open(mps_unit); if (fd < 0) { error = errno; warn("mps_open"); return (error); } printf("Slots Logical ID SEPHandle EncHandle Type\n"); handle = 0xffff; while (1) { enc = mps_read_extended_config_page(fd, MPI2_CONFIG_EXTPAGETYPE_ENCLOSURE, MPI2_SASENCLOSURE0_PAGEVERSION, 0, MPI2_SAS_ENCLOS_PGAD_FORM_GET_NEXT_HANDLE | handle, &IOCStatus); if (enc == NULL) { if (IOCStatus == MPI2_IOCSTATUS_CONFIG_INVALID_PAGE) break; error = errno; warn("Error retrieving enclosure page"); close(fd); return (error); } type = get_enc_type(enc->Flags, &issep); if (issep == 0) snprintf(sepstr, 5, " "); else snprintf(sepstr, 5, "%04x", enc->SEPDevHandle); printf(" %.2d %08x%08x %s %04x %s\n", enc->NumSlots, enc->EnclosureLogicalID.High, enc->EnclosureLogicalID.Low, sepstr, enc->EnclosureHandle, type); handle = enc->EnclosureHandle; free(enc); } printf("\n"); close(fd); return (0); } MPS_COMMAND(show, enclosures, show_enclosures, "", "Show attached enclosures"); static int show_expanders(int ac, char **av) { MPI2_CONFIG_PAGE_EXPANDER_0 *exp0; MPI2_CONFIG_PAGE_EXPANDER_1 *exp1; uint16_t IOCStatus, handle; char enchandle[5], parent[5], rphy[3], rhandle[5]; char *speed, *min, *max, *type; int fd, error, nphys, i; fd = mps_open(mps_unit); if (fd < 0) { error = errno; warn("mps_open"); return (error); } printf("NumPhys SAS Address DevHandle Parent EncHandle SAS Level\n"); handle = 0xffff; while (1) { exp0 = mps_read_extended_config_page(fd, MPI2_CONFIG_EXTPAGETYPE_SAS_EXPANDER, MPI2_SASEXPANDER0_PAGEVERSION, 0, MPI2_SAS_EXPAND_PGAD_FORM_GET_NEXT_HNDL | handle, &IOCStatus); if (exp0 == NULL) { if (IOCStatus == MPI2_IOCSTATUS_CONFIG_INVALID_PAGE) break; error = errno; warn("Error retrieving expander page 0"); close(fd); return (error); } nphys = exp0->NumPhys; handle = exp0->DevHandle; if (exp0->EnclosureHandle == 0x00) snprintf(enchandle, 5, " "); else snprintf(enchandle, 5, "%04x", exp0->EnclosureHandle); if (exp0->ParentDevHandle == 0x0) snprintf(parent, 5, " "); else snprintf(parent, 5, "%04x", exp0->ParentDevHandle); printf(" %02d %08x%08x %04x %s %s %d\n", exp0->NumPhys, exp0->SASAddress.High, exp0->SASAddress.Low, exp0->DevHandle, parent, enchandle, exp0->SASLevel); printf("\n"); printf(" Phy RemotePhy DevHandle Speed Min Max Device\n"); for (i = 0; i < nphys; i++) { exp1 = mps_read_extended_config_page(fd, MPI2_CONFIG_EXTPAGETYPE_SAS_EXPANDER, MPI2_SASEXPANDER1_PAGEVERSION, 1, MPI2_SAS_EXPAND_PGAD_FORM_HNDL_PHY_NUM | (i << MPI2_SAS_EXPAND_PGAD_PHYNUM_SHIFT) | exp0->DevHandle, &IOCStatus); if (exp1 == NULL) { if (IOCStatus != MPI2_IOCSTATUS_CONFIG_INVALID_PAGE) warn("Error retrieving expander pg 1"); continue; } type = get_device_type(exp1->AttachedDeviceInfo); if ((exp1->AttachedDeviceInfo & 0x7) == 0) { speed = " "; snprintf(rphy, 3, " "); snprintf(rhandle, 5, " "); } else { speed = get_device_speed( exp1->NegotiatedLinkRate); snprintf(rphy, 3, "%02d", exp1->AttachedPhyIdentifier); snprintf(rhandle, 5, "%04x", exp1->AttachedDevHandle); } min = get_device_speed(exp1->HwLinkRate); max = get_device_speed(exp1->HwLinkRate >> 4); printf(" %02d %s %s %s %s %s %s\n", exp1->Phy, rphy, rhandle, speed, min, max, type); free(exp1); } free(exp0); } printf("\n"); close(fd); return (0); } MPS_COMMAND(show, expanders, show_expanders, "", "Show attached expanders"); static int show_cfgpage(int ac, char **av) { MPI2_CONFIG_PAGE_HEADER *hdr; MPI2_CONFIG_EXTENDED_PAGE_HEADER *ehdr; void *data; uint32_t addr;
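/*
 * In show_cfgpage() below, page numbers 0x10 and up select extended
 * config pages, whose length and type live in an extended header;
 * standard pages keep both in the basic header. Lengths are counted
 * in 4-byte words in either case, hence the "* 4". A sketch of the
 * split, using the MPI2 header types declared above:
 *
 *	if (page >= 0x10) {
 *		len  = ehdr->ExtPageLength * 4;
 *		page = ehdr->ExtPageType;
 *	} else {
 *		len  = hdr->PageLength * 4;
 *		page = hdr->PageType & 0xf;
 *	}
 */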
uint16_t IOCStatus; uint8_t page, num; int fd, error, len, attrs; char *pgname, *pgattr; fd = mps_open(mps_unit); if (fd < 0) { error = errno; warn("mps_open"); return (error); } addr = 0; num = 0; page = 0; switch (ac) { case 4: addr = (uint32_t)strtoul(av[3], NULL, 0); case 3: num = (uint8_t)strtoul(av[2], NULL, 0); case 2: page = (uint8_t)strtoul(av[1], NULL, 0); break; default: errno = EINVAL; warn("cfgpage: not enough arguments"); return (EINVAL); } if (page >= 0x10) data = mps_read_extended_config_page(fd, page, 0, num, addr, &IOCStatus); else data = mps_read_config_page(fd, page, num, addr, &IOCStatus); if (data == NULL) { error = errno; warn("Error retrieving cfg page: %s", mps_ioc_status(IOCStatus)); return (error); } if (page >= 0x10) { ehdr = data; len = ehdr->ExtPageLength * 4; page = ehdr->ExtPageType; attrs = ehdr->PageType >> 4; } else { hdr = data; len = hdr->PageLength * 4; page = hdr->PageType & 0xf; attrs = hdr->PageType >> 4; } pgname = get_page_name(page); if (attrs == 0) pgattr = "Read-only"; else if (attrs == 1) pgattr = "Read-Write"; else if (attrs == 2) pgattr = "Read-Write Persistent"; else pgattr = "Unknown Page Attribute"; printf("Page 0x%x: %s %d, %s\n", page, pgname, num, pgattr); hexdump(data, len, NULL, HD_REVERSED | 4); free(data); close(fd); return (0); } MPS_COMMAND(show, cfgpage, show_cfgpage, "page [num] [addr]", "Display config page"); Index: head/usr.sbin/mpsutil/mpsutil.c =================================================================== --- head/usr.sbin/mpsutil/mpsutil.c (revision 343754) +++ head/usr.sbin/mpsutil/mpsutil.c (revision 343755) @@ -1,237 +1,236 @@ /*- * Copyright (c) 2015 Netflix, Inc. - * All rights reserved. * Written by: Scott Long * * Copyright (c) 2008 Yahoo!, Inc. * All rights reserved. * Written by: John Baldwin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
*/ #include __RCSID("$FreeBSD$"); #include #include #include #include #include #include #include #include "mpsutil.h" SET_DECLARE(MPS_DATASET(top), struct mpsutil_command); SET_DECLARE(MPS_DATASET(usage), struct mpsutil_usage); int mps_unit; int is_mps; static void usage(void) { struct mpsutil_usage **cmd; const char *args, *desc; fprintf(stderr, "usage: %s [-u unit] ...\n\n", getprogname()); fprintf(stderr, "Commands include:\n"); SET_FOREACH(cmd, MPS_DATASET(usage)) { if (*cmd == NULL) { fprintf(stderr, "\n"); } else { (*cmd)->handler(&args, &desc); if (strncmp((*cmd)->set, "top", 3) == 0) fprintf(stderr, "%s %-30s\t%s\n", (*cmd)->name, args, desc); else fprintf(stderr, "%s %s %-30s\t%s\n", (*cmd)->set, (*cmd)->name, args, desc); } } exit(1); } static int version(int ac, char **av) { printf("%s: version %s", getprogname(), MPSUTIL_VERSION); #ifdef DEBUG printf(" (DEBUG)"); #endif printf("\n"); return (0); } MPS_COMMAND(top, version, version, "", "version") int main(int ac, char **av) { struct mpsutil_command **cmd; int ch; is_mps = !strcmp(getprogname(), "mpsutil"); while ((ch = getopt(ac, av, "u:h?")) != -1) { switch (ch) { case 'u': mps_unit = atoi(optarg); break; case 'h': case '?': usage(); return (1); } } av += optind; ac -= optind; /* getopt() eats av[0], so we can't use mps_table_handler() directly. */ if (ac == 0) { usage(); return (1); } SET_FOREACH(cmd, MPS_DATASET(top)) { if (strcmp((*cmd)->name, av[0]) == 0) { if ((*cmd)->handler(ac, av)) return (1); else return (0); } } warnx("Unknown command %s.", av[0]); return (1); } int mps_table_handler(struct mpsutil_command **start, struct mpsutil_command **end, int ac, char **av) { struct mpsutil_command **cmd; if (ac < 2) { warnx("The %s command requires a sub-command.", av[0]); return (EINVAL); } for (cmd = start; cmd < end; cmd++) { if (strcmp((*cmd)->name, av[1]) == 0) return ((*cmd)->handler(ac - 1, av + 1)); } warnx("%s is not a valid sub-command of %s.", av[1], av[0]); return (ENOENT); } void hexdump(const void *ptr, int length, const char *hdr, int flags) { int i, j, k; int cols; const unsigned char *cp; char delim; if ((flags & HD_DELIM_MASK) != 0) delim = (flags & HD_DELIM_MASK) >> 8; else delim = ' '; if ((flags & HD_COLUMN_MASK) != 0) cols = flags & HD_COLUMN_MASK; else cols = 16; cp = ptr; for (i = 0; i < length; i+= cols) { if (hdr != NULL) printf("%s", hdr); if ((flags & HD_OMIT_COUNT) == 0) printf("%04x ", i); if ((flags & HD_OMIT_HEX) == 0) { for (j = 0; j < cols; j++) { if (flags & HD_REVERSED) k = i + (cols - 1 - j); else k = i + j; if (k < length) printf("%c%02x", delim, cp[k]); else printf(" "); } } if ((flags & HD_OMIT_CHARS) == 0) { printf(" |"); for (j = 0; j < cols; j++) { if (flags & HD_REVERSED) k = i + (cols - 1 - j); else k = i + j; if (k >= length) printf(" "); else if (cp[k] >= ' ' && cp[k] <= '~') printf("%c", cp[k]); else printf("."); } printf("|"); } printf("\n"); } } #define PCHAR(c) { if (retval < tmpsz) { *outbuf++ = (c); retval++; } } int mps_parse_flags(uintmax_t num, const char *q, char *outbuf, int tmpsz) { int n, tmp, retval = 0; if (num == 0) return (retval); /* %b conversion flag format. */ tmp = retval; while (*q) { n = *q++; if (num & (1 << (n - 1))) { PCHAR(retval != tmp ?
',' : '<'); for (; (n = *q) > ' '; ++q) PCHAR(n); } else for (; *q > ' '; ++q) continue; } if (retval != tmp) PCHAR('>'); return (retval); } Index: head/usr.sbin/pmcstat/pmcpl_annotate_cg.c =================================================================== --- head/usr.sbin/pmcstat/pmcpl_annotate_cg.c (revision 343754) +++ head/usr.sbin/pmcstat/pmcpl_annotate_cg.c (revision 343755) @@ -1,129 +1,131 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2005-2007, Joseph Koshy * Copyright (c) 2007 The FreeBSD Foundation - * Copyright (c) 2014, Adrian Chadd, Netflix Inc. * All rights reserved. + * + * Copyright (c) 2014 Netflix, Inc. + * Written by: Adrian Chadd * * Portions of this software were developed by A. Joseph Koshy under * sponsorship from the FreeBSD Foundation and Google, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Transform a hwpmc(4) log into human readable form, and into * gprof(1) compatible profiles. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "pmcstat.h" #include "pmcstat_log.h" #include "pmcpl_annotate_cg.h" /* * Record a callchain. */ void pmcpl_annotate_cg_process(struct pmcstat_process *pp, struct pmcstat_pmcrecord *pmcr, uint32_t nsamples, uintfptr_t *cc, int usermode, uint32_t cpu) { struct pmcstat_pcmap *map; struct pmcstat_symbol *sym; uintfptr_t newpc; struct pmcstat_image *image; int i; char filename[PATH_MAX], funcname[PATH_MAX]; unsigned sline; (void) pmcr; (void) nsamples; (void) usermode; (void) cpu; for (i = 0; i < (int) nsamples; i++) { map = NULL; sym = NULL; image = NULL; filename[0] = '\0'; funcname[0] = '\0'; sline = 0; map = pmcstat_process_find_map(usermode ? 
pp : pmcstat_kernproc, cc[i]); if (map != NULL) { assert(cc[i] >= map->ppm_lowpc && cc[i] < map->ppm_highpc); image = map->ppm_image; newpc = cc[i] - (map->ppm_lowpc + (image->pi_vaddr - image->pi_start)); sym = pmcstat_symbol_search(image, newpc); } if (map != NULL && image != NULL && sym != NULL) { (void) pmcstat_image_addr2line(image, cc[i], filename, sizeof(filename), &sline, funcname, sizeof(funcname)); } if (map != NULL && sym != NULL) { fprintf(args.pa_graphfile, "%p %s %s:%d\n", (void *)cc[i], funcname, filename, sline); } else { fprintf(args.pa_graphfile, "%p ??:0\n", (void *) cc[i]); } } fprintf(args.pa_graphfile, "--\n"); } Index: head/usr.sbin/pmcstat/pmcpl_annotate_cg.h =================================================================== --- head/usr.sbin/pmcstat/pmcpl_annotate_cg.h (revision 343754) +++ head/usr.sbin/pmcstat/pmcpl_annotate_cg.h (revision 343755) @@ -1,44 +1,46 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2005-2007, Joseph Koshy * Copyright (c) 2007 The FreeBSD Foundation - * Copyright (c) 2014, Adrian Chadd, Netflix Inc. * All rights reserved. + * + * Copyright (c) 2014 Netflix, Inc. + * Written by: Adrian Chadd * * Portions of this software were developed by A. Joseph Koshy under * sponsorship from the FreeBSD Foundation and Google, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _PMCSTAT_PL_ANNOTATE_CG_H_ #define _PMCSTAT_PL_ANNOTATE_CG_H_ /* Function prototypes */ void pmcpl_annotate_cg_process( struct pmcstat_process *pp, struct pmcstat_pmcrecord *pmcr, uint32_t nsamples, uintfptr_t *cc, int usermode, uint32_t cpu); #endif /* _PMCSTAT_PL_ANNOTATE_CG_H_ */ Index: head/usr.sbin/pmcstudy/eval_expr.c =================================================================== --- head/usr.sbin/pmcstudy/eval_expr.c (revision 343754) +++ head/usr.sbin/pmcstudy/eval_expr.c (revision 343755) @@ -1,717 +1,716 @@ /*- - * Copyright (c) 2015 Netflix Inc. - * All rights reserved. + * Copyright (c) 2015 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * in this position and unchanged. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #include "eval_expr.h" __FBSDID("$FreeBSD$"); static struct expression * alloc_and_hook_expr(struct expression **exp_p, struct expression **last_p) { struct expression *ex, *at; ex = malloc(sizeof(struct expression)); if (ex == NULL) { printf("Out of memory in exp allocation\n"); exit(-2); } memset(ex, 0, sizeof(struct expression)); if (*exp_p == NULL) { *exp_p = ex; } at = *last_p; if (at == NULL) { /* First one, it's the last */ *last_p = ex; } else { /* Chain it to the end and update last */ at->next = ex; ex->prev = at; *last_p = ex; } return (ex); } static int validate_expr(struct expression *exp, int val1_is_set, int op_is_set, int val2_is_set, int *op_cnt) { int val1, op, val2; int open_cnt; val1 = op = val2 = 0; if (val1_is_set) { val1 = 1; } if (op_is_set) { op = 1; } if (val2_is_set) { val2 = 1; } open_cnt = *op_cnt; if (exp == NULL) { /* End of the road */ if (val1 && op && val2 && (open_cnt == 0)) { return(0); } else { return(1); } } switch(exp->type) { case TYPE_OP_PLUS: case TYPE_OP_MINUS: case TYPE_OP_MULT: case TYPE_OP_DIVIDE: if (val1 && op && val2) { /* We are at x + y + * collapse back to val/op */ val1 = 1; op = 1; val2 = 0; } else if ((op == 0) && (val1)) { op = 1; } else { printf("Op but no val1 set\n"); return(-1); } break; case TYPE_PARN_OPEN: if (exp->next == NULL) { printf("NULL after open paren\n"); exit(-1); } if ((exp->next->type == TYPE_OP_PLUS) || (exp->next->type == TYPE_OP_MINUS) || (exp->next->type == TYPE_OP_DIVIDE) || (exp->next->type == TYPE_OP_MULT)) { printf("'( OP' -- not allowed\n"); return(-1); } if (val1 && (op == 0)) { printf("'Val (' -- not allowed\n"); return(-1); } if (val1 && op && val2) { printf("'Val OP Val (' -- not allowed\n"); return(-1); } open_cnt++; *op_cnt = open_cnt; if (val1) { if (validate_expr(exp->next, 0, 0, 0, op_cnt) == 0) { val2 = 1; } else { return(-1); } } else { return(validate_expr(exp->next, 0, 0, 0, op_cnt)); } break; case TYPE_PARN_CLOSE: open_cnt--; *op_cnt = open_cnt; if (val1 && op && val2) { return(0); } else { printf("Found close paren and not complete\n"); return(-1); } break; case TYPE_VALUE_CON: case TYPE_VALUE_PMC: if (val1 == 0) { val1 = 1; } else if (val1 && op) { val2 = 1; } else { printf("val1 set, val2 about to be set, op empty\n"); return(-1); } break; default: printf("unknown type %d\n", exp->type); exit(-5); break; } return(validate_expr(exp->next, val1, op, val2, op_cnt)); } void
print_exp(struct expression *exp) { if (exp == NULL) { printf("\n"); return; } switch(exp->type) { case TYPE_OP_PLUS: printf(" + "); break; case TYPE_OP_MINUS: printf(" - "); break; case TYPE_OP_MULT: printf(" * "); break; case TYPE_OP_DIVIDE: printf(" / "); break; case TYPE_PARN_OPEN: printf(" ( "); break; case TYPE_PARN_CLOSE: printf(" ) "); break; case TYPE_VALUE_CON: printf("%f", exp->value); break; case TYPE_VALUE_PMC: printf("%s", exp->name); break; default: printf("Unknown op %d\n", exp->type); break; } print_exp(exp->next); } static void walk_back_and_insert_paren(struct expression **beg, struct expression *frm) { struct expression *at, *ex; /* Set up our new open paren */ ex = malloc(sizeof(struct expression)); if (ex == NULL) { printf("Out of memory in exp allocation\n"); exit(-2); } memset(ex, 0, sizeof(struct expression)); ex->type = TYPE_PARN_OPEN; /* Now let's place it */ at = frm->prev; if (at == *beg) { /* We are inserting at the head of the list */ in_beg: ex->next = at; at->prev = ex; *beg = ex; return; } else if ((at->type == TYPE_VALUE_CON) || (at->type == TYPE_VALUE_PMC)) { /* Simple case: we have a value in the previous position */ in_mid: ex->prev = at->prev; ex->prev->next = ex; ex->next = at; at->prev = ex; return; } else if (at->type == TYPE_PARN_CLOSE) { /* Skip through until we reach beg or all ( closes */ int par_cnt=1; at = at->prev; while(par_cnt) { if (at->type == TYPE_PARN_CLOSE) { par_cnt++; } else if (at->type == TYPE_PARN_OPEN) { par_cnt--; if (par_cnt == 0) { break; } } at = at->prev; } if (at == *beg) { /* At beginning we insert */ goto in_beg; } else { goto in_mid; } } else { printf("%s:Unexpected type:%d?\n", __FUNCTION__, at->type); exit(-1); } } static void walk_fwd_and_insert_paren(struct expression *frm, struct expression **added) { struct expression *at, *ex; /* Set up our new close paren */ ex = malloc(sizeof(struct expression)); if (ex == NULL) { printf("Out of memory in exp allocation\n"); exit(-2); } memset(ex, 0, sizeof(struct expression)); ex->type = TYPE_PARN_CLOSE; *added = ex; /* Now let's place it */ at = frm->next; if ((at->type == TYPE_VALUE_CON) || (at->type == TYPE_VALUE_PMC)) { /* Simple case: we have a value in the next position */ insertit: ex->next = at->next; ex->prev = at; at->next = ex; return; } else if (at->type == TYPE_PARN_OPEN) { int par_cnt=1; at = at->next; while(par_cnt) { if (at->type == TYPE_PARN_OPEN) { par_cnt++; } else if (at->type == TYPE_PARN_CLOSE) { par_cnt--; if (par_cnt == 0) { break; } } at = at->next; } goto insertit; } else { printf("%s:Unexpected type:%d?\n", __FUNCTION__, at->type); exit(-1); } } static void add_precendence(struct expression **beg, struct expression *start, struct expression *end) { /* * Between start and end add () around any * or /. This * is quite tricky since if there is a () set inside the * list we need to skip over everything in the ()'s, treating * it all as just a value.
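 * For example (illustrative), given the flattened list 1 + 2 * 3 - 4,
 * each * or / found at paren depth zero is bracketed by
 * walk_back_and_insert_paren()/walk_fwd_and_insert_paren(), yielding
 * 1 + ( 2 * 3 ) - 4; a sub-list such as ( 5 + 6 ) is skipped as a
 * single value via the open_cnt depth counter.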
*/ struct expression *at, *newone; int open_cnt; at = start; open_cnt = 0; while(at != end) { if (at->type == TYPE_PARN_OPEN) { open_cnt++; } if (at->type == TYPE_PARN_CLOSE) { open_cnt--; } if (open_cnt == 0) { if ((at->type == TYPE_OP_MULT) || (at->type == TYPE_OP_DIVIDE)) { walk_back_and_insert_paren(beg, at); walk_fwd_and_insert_paren(at, &newone); at = newone->next; continue; } } at = at->next; } } static void set_math_precidence(struct expression **beg, struct expression *exp, struct expression **stopped) { struct expression *at, *start, *end; int cnt_lower, cnt_upper; /* * Walk through and set up math precedence: to get proper * precedence we insert () around the * and / operators, over + and -. */ end = NULL; start = at = exp; cnt_lower = cnt_upper = 0; while(at) { if (at->type == TYPE_PARN_CLOSE) { /* Done with that paren */ if (stopped) { *stopped = at; } if (cnt_lower && cnt_upper) { /* We have a mixed set ... add precedence between start/end */ add_precendence(beg, start, end); } return; } if (at->type == TYPE_PARN_OPEN) { set_math_precidence(beg, at->next, &end); at = end; continue; } else if ((at->type == TYPE_OP_PLUS) || (at->type == TYPE_OP_MINUS)) { cnt_lower++; } else if ((at->type == TYPE_OP_DIVIDE) || (at->type == TYPE_OP_MULT)) { cnt_upper++; } at = at->next; } if (cnt_lower && cnt_upper) { add_precendence(beg, start, NULL); } } extern char **valid_pmcs; extern int valid_pmc_cnt; static void pmc_name_set(struct expression *at) { int i, idx, fnd; if (at->name[0] == '%') { /* Special number after % gives index */ idx = strtol(&at->name[1], NULL, 0); if (idx >= valid_pmc_cnt) { printf("Unknown PMC %s -- largest we have is %%%d -- can't run your expression\n", at->name, valid_pmc_cnt); exit(-1); } strcpy(at->name, valid_pmcs[idx]); } else { for(i=0, fnd=0; i<valid_pmc_cnt; i++) { if (strcmp(valid_pmcs[i], at->name) == 0) { fnd = 1; break; } } if (!fnd) { printf("PMC %s does not exist on this machine -- can't run your expression\n", at->name); exit(-1); } } } struct expression * parse_expression(char *str) { struct expression *exp=NULL, *last=NULL, *at; int open_par, close_par; int op_cnt=0; size_t siz, i, x; /* * Walk through a string expression and convert * it to a linked list of actions. We do this by: * a) Counting the open/close parens; there must * be a matching number. * b) If we have balanced parens then create a linked list * of the operators, then we validate that expression further. * c) Validating that we have: * val OP val * val OP ( * inside every paren you have a: * val OP val * val OP ( * d) A final step (set_math_precidence()) inserts the mathematical * precedence parens, so that instead of the plain left to right * evaluation of 1 + 2 * 3 we * would translate it into 1 + ( 2 * 3).
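 * As a concrete sketch of the forms validate_expr() accepts
 * (illustrative):
 *	2 * 3			val OP val
 *	1 + ( 2 * 3 )		val OP ( val OP val )
 *	%176 / %150		PMC abbreviations act as vals
 * where val is a constant or a PMC name and OP is one of + - * /,
 * with every open paren balanced by a close.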
*/ open_par = close_par = 0; siz = strlen(str); /* No trailing newline please */ if (str[(siz-1)] == '\n') { str[(siz-1)] = 0; siz--; } for(i=0; i<siz; i++) { if (str[i] == '(') { at = alloc_and_hook_expr(&exp, &last); at->type = TYPE_PARN_OPEN; } else if (str[i] == ')') { at = alloc_and_hook_expr(&exp, &last); at->type = TYPE_PARN_CLOSE; } else if (str[i] == ' ') { /* Extra blank */ continue; } else if (str[i] == '\t') { /* Extra tab */ continue; } else if (str[i] == '+') { at = alloc_and_hook_expr(&exp, &last); at->type = TYPE_OP_PLUS; } else if (str[i] == '-') { at = alloc_and_hook_expr(&exp, &last); at->type = TYPE_OP_MINUS; } else if (str[i] == '/') { at = alloc_and_hook_expr(&exp, &last); at->type = TYPE_OP_DIVIDE; } else if (str[i] == '*') { at = alloc_and_hook_expr(&exp, &last); at->type = TYPE_OP_MULT; } else { /* It's a value or PMC constant */ at = alloc_and_hook_expr(&exp, &last); if (isdigit(str[i]) || (str[i] == '.')) { at->type = TYPE_VALUE_CON; } else { at->type = TYPE_VALUE_PMC; } x = 0; while ((str[i] != ' ') && (str[i] != '\t') && (str[i] != 0) && (str[i] != ')') && (str[i] != '(')) { /* We collect the constant until a space or tab */ at->name[x] = str[i]; i++; x++; if (x >= (sizeof(at->name)-1)) { printf("Value/Constant too long %d max:%d\n", (int)x, (int)(sizeof(at->name)-1)); exit(-3); } } if (str[i] != 0) { /* Need to back up and see the last char since * the for will increment the loop. */ i--; } /* Now we have pulled the string, set it up */ if (at->type == TYPE_VALUE_CON) { at->state = STATE_FILLED; at->value = strtod(at->name, NULL); } else { pmc_name_set(at); } } } /* Now let's validate it's a workable expression */ if (validate_expr(exp, 0, 0, 0, &op_cnt)) { printf("Invalid expression\n"); exit(-4); } set_math_precidence(&exp, exp, NULL); return (exp); } static struct expression * gather_exp_to_paren_close(struct expression *exp, double *val_fill) { /* * I have been given ( ???
* so I could see either * ( * or * Val Op * */ struct expression *lastproc; double val; if (exp->type == TYPE_PARN_OPEN) { lastproc = gather_exp_to_paren_close(exp->next, &val); *val_fill = val; } else { *val_fill = run_expr(exp, 0, &lastproc); } return(lastproc); } double run_expr(struct expression *exp, int initial_call, struct expression **lastone) { /* * We expect to find either * a) A Open Paren * or * b) Val-> Op -> Val * or * c) Val-> Op -> Open Paren */ double val1, val2, res; struct expression *op, *other_half, *rest; if (exp->type == TYPE_PARN_OPEN) { op = gather_exp_to_paren_close(exp->next, &val1); } else if(exp->type == TYPE_VALUE_CON) { val1 = exp->value; op = exp->next; } else if (exp->type == TYPE_VALUE_PMC) { val1 = exp->value; op = exp->next; } else { printf("Illegal value in %s huh?\n", __FUNCTION__); exit(-1); } if (op == NULL) { return (val1); } more_to_do: other_half = op->next; if (other_half->type == TYPE_PARN_OPEN) { rest = gather_exp_to_paren_close(other_half->next, &val2); } else if(other_half->type == TYPE_VALUE_CON) { val2 = other_half->value; rest = other_half->next; } else if (other_half->type == TYPE_VALUE_PMC) { val2 = other_half->value; rest = other_half->next; } else { printf("Illegal2 value in %s huh?\n", __FUNCTION__); exit(-1); } switch(op->type) { case TYPE_OP_PLUS: res = val1 + val2; break; case TYPE_OP_MINUS: res = val1 - val2; break; case TYPE_OP_MULT: res = val1 * val2; break; case TYPE_OP_DIVIDE: if (val2 != 0.0) res = val1 / val2; else { printf("Division by zero averted\n"); res = 1.0; } break; default: printf("Op is not an operator -- its %d\n", op->type); exit(-1); break; } if (rest == NULL) { if (lastone) { *lastone = NULL; } return (res); } if ((rest->type == TYPE_PARN_CLOSE) && (initial_call == 0)) { if (lastone) { *lastone = rest->next; } return(res); } /* There is more, as in * a + b + c * where we just did a + b * so now it becomes val1 is set to res and * we need to proceed with the rest of it. */ val1 = res; op = rest; if ((op->type != TYPE_OP_PLUS) && (op->type != TYPE_OP_MULT) && (op->type != TYPE_OP_MINUS) && (op->type != TYPE_OP_DIVIDE)) { printf("%s ending on type:%d not an op??\n", __FUNCTION__, op->type); return(res); } if (op) goto more_to_do; return (res); } #ifdef STAND_ALONE_TESTING static double calc_expr(struct expression *exp) { struct expression *at; double xx; /* First clear PMC's setting */ for(at = exp; at != NULL; at = at->next) { if (at->type == TYPE_VALUE_PMC) { at->state = STATE_UNSET; } } /* Now for all pmc's make up values ... here is where I would pull them */ for(at = exp; at != NULL; at = at->next) { if (at->type == TYPE_VALUE_PMC) { at->value = (random() * 1.0); at->state = STATE_FILLED; if (at->value == 0.0) { /* So we don't have div by 0 */ at->value = 1.0; } } } /* Now let's calculate the expression */ print_exp(exp); xx = run_expr(exp, 1, NULL); printf("Answer is %f\n", xx); return(xx); } int main(int argc, char **argv) { struct expression *exp; if (argc < 2) { printf("Use %s expression\n", argv[0]); return(-1); } exp = parse_expression(argv[1]); printf("Now the calc\n"); calc_expr(exp); return(0); } #endif Index: head/usr.sbin/pmcstudy/eval_expr.h =================================================================== --- head/usr.sbin/pmcstudy/eval_expr.h (revision 343754) +++ head/usr.sbin/pmcstudy/eval_expr.h (revision 343755) @@ -1,58 +1,57 @@ #ifndef __eval_expr_h__ #define __eval_expr_h__ /*- - * Copyright (c) 2015 Netflix Inc. - * All rights reserved. + * Copyright (c) 2015 Netflix, Inc.
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ __FBSDID("$FreeBSD$"); enum exptype { TYPE_OP_PLUS, TYPE_OP_MINUS, TYPE_OP_MULT, TYPE_OP_DIVIDE, TYPE_PARN_OPEN, TYPE_PARN_CLOSE, TYPE_VALUE_CON, TYPE_VALUE_PMC }; #define STATE_UNSET 0 /* We have no setting yet in value */ #define STATE_FILLED 1 /* We have filled in value */ struct expression { struct expression *next; /* Next in expression. */ struct expression *prev; /* Prev in expression. */ double value; /* If there is a value to set */ enum exptype type; /* What is it */ uint8_t state; /* Current state if value type */ char name[252]; /* If a PMC, what's the name; con value otherwise */ }; struct expression *parse_expression(char *str); double run_expr(struct expression *exp, int initial_call, struct expression **lastone); void print_exp(struct expression *exp); #endif Index: head/usr.sbin/pmcstudy/pmcstudy.8 =================================================================== --- head/usr.sbin/pmcstudy/pmcstudy.8 (revision 343754) +++ head/usr.sbin/pmcstudy/pmcstudy.8 (revision 343755) @@ -1,145 +1,145 @@ -.\" Copyright (c) 2015 -.\" Netflix Inc. +.\" +.\" Copyright (c) 2015 Netflix, Inc. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" $FreeBSD$ .\" .Dd March 26, 2015 .Dt PMCSTUDY 8 .Os .Sh NAME .Nm pmcstudy .Nd perform various studies on a system's overall PMCs .Sh SYNOPSIS .Nm .Oo Fl i Ar inputfile | Fl A | Fl T | Fl v | Fl m Ar max | Fl e Ar exp-name | Fl E Ar your-expr | Fl L | Fl h | Fl H Oc .Nm .Fl i Ar inputfile .Nm .Fl v .Nm .Fl m Ar max .Nm .Fl e Ar exp-name .Nm .Fl E Ar your-expr .Nm .Fl h .Nm .Fl H .Nm .Fl T .Sh DESCRIPTION The .Nm program is designed to run various tests against your system's performance. There are roughly 20-22 canned tests that set up specific PMCs and then run various formulas on the output information. These formulas can be found in Intel documentation "Using Intel Vtune amplifier xe on NNN Generation Intel Core Processors". The NNN is either 2nd, 3rd, 4th or 5th generation, i.e., Sandy Bridge, Ivy Bridge, Haswell or Broadwell. Currently the program only works on these four Intel processor types. .Sh OPTIONS The following options are available: .Bl -tag -width indent .It Fl i Ar filename If this option is supplied, instead of running a .Xr pmcstat 8 command to collect the current running information, the filename will be read in as input. .It Fl H This option will display the complete list of canned formulas that can be run, including the names that can be passed to the .Fl e option. .It Fl e Ar name Execute the canned test .Ar name on the running kernel. .It Fl h If you add this option to the .Fl e option, the test will not execute but will instead print a small description of the test that would run. .It Fl T This option will execute a test of every PMC to validate that they are working on your system. If a PMC does not show up in this test, chances are the kernel .Xr hwpmc 4 driver needs updating with new PMC information. .It Fl m Ar num This option can restrict the number of one-second samples that will be collected by your system when running a test (it bounds the time the test will run). Without this option, the test will run for 1024 seconds or until the user types ctrl-c. .It Fl v The verbose option adds debugging output to the command. .It Fl E Ar expression This option can be used by those that have their own ideas on what formulas they want to run. The expression given to the .Fl E option is a "formula". The formula can name the PMCs directly, or you can use an abbreviation of the form %NNN. To find out the abbreviations on your system you may use the .Fl L option. An example of a formula of your own might be .Fl E "FP_ASSIST.ANY / INST_RETIRED.ANY_P" or using the abbreviations on a Haswell machine you would type .Fl E " %176 / %150". You must have spaces between each entry and you may use parentheses to prioritize the operators. Addition (+), subtraction (-), division (/) and multiplication (*) are supported. You may also introduce constant numbers. For example, you can do a standard efficiency test like .Fl E "UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD_P)". .It Fl L This option will list all known PMCs and their abbreviation (%NNN). .It Fl A Run all canned tests.
.El .Sh SEE ALSO .Xr pmc 3 , .Xr pmclog 3 , .Xr hwpmc 4 , .Xr pmcstat 8 .Sh HISTORY The .Nm utility first appeared in .Fx 11.0. .Sh AUTHORS .An Randall Stewart Aq Mt rrs@FreeBSD.org Index: head/usr.sbin/pmcstudy/pmcstudy.c =================================================================== --- head/usr.sbin/pmcstudy/pmcstudy.c (revision 343754) +++ head/usr.sbin/pmcstudy/pmcstudy.c (revision 343755) @@ -1,2950 +1,2949 @@ /*- - * Copyright (c) 2014, 2015 Netflix Inc. - * All rights reserved. + * Copyright (c) 2014-2015 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include "eval_expr.h" __FBSDID("$FreeBSD$"); static int max_pmc_counters = 1; static int run_all = 0; #define MAX_COUNTER_SLOTS 1024 #define MAX_NLEN 64 #define MAX_CPU 64 static int verbose = 0; extern char **environ; extern struct expression *master_exp; struct expression *master_exp=NULL; #define PMC_INITIAL_ALLOC 512 extern char **valid_pmcs; char **valid_pmcs = NULL; extern int valid_pmc_cnt; int valid_pmc_cnt=0; extern int pmc_allocated_cnt; int pmc_allocated_cnt=0; /* * The following are two variants of popen and pclose, with * the caveat that they get you the PID so that you * can supply it to pclose, so it can send a SIGTERM * to the process. */ static FILE * my_popen(const char *command, const char *dir, pid_t *p_pid) { FILE *io_out, *io_in; int pdesin[2], pdesout[2]; char *argv[4]; pid_t pid; char cmd[4]; char cmd2[1024]; char arg1[4]; if ((strcmp(dir, "r") != 0) && (strcmp(dir, "w") != 0)) { errno = EINVAL; return(NULL); } if (pipe(pdesin) < 0) return (NULL); if (pipe(pdesout) < 0) { (void)close(pdesin[0]); (void)close(pdesin[1]); return (NULL); } strcpy(cmd, "sh"); strcpy(arg1, "-c"); strcpy(cmd2, command); argv[0] = cmd; argv[1] = arg1; argv[2] = cmd2; argv[3] = NULL; switch (pid = fork()) { case -1: /* Error. */ (void)close(pdesin[0]); (void)close(pdesin[1]); (void)close(pdesout[0]); (void)close(pdesout[1]); return (NULL); /* NOTREACHED */ case 0: /* Child. */
/* Close out un-used sides */ (void)close(pdesin[1]); (void)close(pdesout[0]); /* Now prepare the stdin of the process */ close(0); (void)dup(pdesin[0]); (void)close(pdesin[0]); /* Now prepare the stdout of the process */ close(1); (void)dup(pdesout[1]); /* And let's do stderr just in case */ close(2); (void)dup(pdesout[1]); (void)close(pdesout[1]); /* Now run it */ execve("/bin/sh", argv, environ); exit(127); /* NOTREACHED */ } /* Parent; assume fdopen can't fail. */ /* Store the pid */ *p_pid = pid; if (strcmp(dir, "r") != 0) { io_out = fdopen(pdesin[1], "w"); (void)close(pdesin[0]); (void)close(pdesout[0]); (void)close(pdesout[1]); return(io_out); } else { /* Prepare the input stream */ io_in = fdopen(pdesout[0], "r"); (void)close(pdesout[1]); (void)close(pdesin[0]); (void)close(pdesin[1]); return (io_in); } } /* * my_pclose -- * Close the stream returned by my_popen and reap the child: send it * a SIGTERM using the stored PID, then wait for it to exit. */ static void my_pclose(FILE *io, pid_t the_pid) { int pstat; pid_t pid; /* * Close the stream, then make sure the child is gone. */ (void)fclose(io); /* Die if you are not dead! */ kill(the_pid, SIGTERM); do { pid = wait4(the_pid, &pstat, 0, (struct rusage *)0); } while (pid == -1 && errno == EINTR); } struct counters { struct counters *next_cpu; char counter_name[MAX_NLEN]; /* Name of counter */ int cpu; /* CPU we are on */ int pos; /* Index we are filling to. */ uint64_t vals[MAX_COUNTER_SLOTS]; /* Last MAX_COUNTER_SLOTS entries */ uint64_t sum; /* Summary of entries */ }; extern struct counters *glob_cpu[MAX_CPU]; struct counters *glob_cpu[MAX_CPU]; extern struct counters *cnts; struct counters *cnts=NULL; extern int ncnts; int ncnts=0; extern int (*expression)(struct counters *, int); int (*expression)(struct counters *, int); static const char *threshold=NULL; static const char *command; struct cpu_entry { const char *name; const char *thresh; const char *command; int (*func)(struct counters *, int); int counters_required; }; struct cpu_type { char cputype[32]; int number; struct cpu_entry *ents; void (*explain)(const char *name); }; extern struct cpu_type the_cpu; struct cpu_type the_cpu; static void explain_name_sb(const char *name) { const char *mythresh; if (strcmp(name, "allocstall1") == 0) { printf("Examine PARTIAL_RAT_STALLS.SLOW_LEA_WINDOW / CPU_CLK_UNHALTED.THREAD_P\n"); mythresh = "thresh > .05"; } else if (strcmp(name, "allocstall2") == 0) { printf("Examine PARTIAL_RAT_STALLS.FLAGS_MERGE_UOP_CYCLES/CPU_CLK_UNHALTED.THREAD_P\n"); mythresh = "thresh > .05"; } else if (strcmp(name, "br_miss") == 0) { printf("Examine (20 * BR_MISP_RETIRED.ALL_BRANCHES)/CPU_CLK_UNHALTED.THREAD_P\n"); mythresh = "thresh >= .2"; } else if (strcmp(name, "splitload") == 0) { printf("Examine (MEM_UOPS_RETIRED.SPLIT_LOADS * 5) / CPU_CLK_UNHALTED.THREAD_P\n"); mythresh = "thresh >= .1"; } else if (strcmp(name, "splitstore") == 0) { printf("Examine MEM_UOPS_RETIRED.SPLIT_STORES / MEM_UOPS_RETIRED.ALL_STORES\n"); mythresh = "thresh >= .01"; } else if (strcmp(name, "contested") == 0) { printf("Examine (MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM * 60) / CPU_CLK_UNHALTED.THREAD_P\n"); mythresh = "thresh >= .05"; } else if (strcmp(name, "blockstorefwd") == 0) { printf("Examine (LD_BLOCKS_STORE_FORWARD * 13) / CPU_CLK_UNHALTED.THREAD_P\n"); mythresh = "thresh >= .05"; } else if (strcmp(name, "cache2") == 0) { printf("Examine ((MEM_LOAD_RETIRED.L3_HIT * 26) + \n"); printf(" (MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT * 43) + \n");
printf(" (MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM * 60)) / CPU_CLK_UNHALTED.THREAD_P\n"); printf("**Note we have it labeled MEM_LOAD_UOPS_RETIRED.LLC_HIT not MEM_LOAD_RETIRED.L3_HIT\n"); mythresh = "thresh >= .2"; } else if (strcmp(name, "cache1") == 0) { printf("Examine (MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS * 180) / CPU_CLK_UNHALTED.THREAD_P\n"); mythresh = "thresh >= .2"; } else if (strcmp(name, "dtlbmissload") == 0) { printf("Examine (((DTLB_LOAD_MISSES.STLB_HIT * 7) + DTLB_LOAD_MISSES.WALK_DURATION)\n"); printf(" / CPU_CLK_UNHALTED.THREAD_P)\n"); mythresh = "thresh >= .1"; } else if (strcmp(name, "frontendstall") == 0) { printf("Examine IDQ_UOPS_NOT_DELIVERED.CORE / (CPU_CLK_UNHALTED.THREAD_P * 4)\n"); mythresh = "thresh >= .15"; } else if (strcmp(name, "clears") == 0) { printf("Examine ((MACHINE_CLEARS.MEMORY_ORDERING + \n"); printf(" MACHINE_CLEARS.SMC + \n"); printf(" MACHINE_CLEARS.MASKMOV ) * 100 ) / CPU_CLK_UNHALTED.THREAD_P\n"); mythresh = "thresh >= .02"; } else if (strcmp(name, "microassist") == 0) { printf("Examine IDQ.MS_CYCLES / (CPU_CLK_UNHALTED.THREAD_P * 4)\n"); printf("***We use IDQ.MS_UOPS,cmask=1 to get cycles\n"); mythresh = "thresh >= .05"; } else if (strcmp(name, "aliasing_4k") == 0) { printf("Examine (LD_BLOCKS_PARTIAL.ADDRESS_ALIAS * 5) / CPU_CLK_UNHALTED.THREAD_P\n"); mythresh = "thresh >= .1"; } else if (strcmp(name, "fpassist") == 0) { printf("Examine FP_ASSIST.ANY/INST_RETIRED.ANY_P\n"); mythresh = "look for an excessive value"; } else if (strcmp(name, "otherassistavx") == 0) { printf("Examine (OTHER_ASSISTS.AVX_TO_SSE * 75)/CPU_CLK_UNHALTED.THREAD_P\n"); mythresh = "look for an excessive value"; } else if (strcmp(name, "otherassistsse") == 0) { printf("Examine (OTHER_ASSISTS.SSE_TO_AVX * 75)/CPU_CLK_UNHALTED.THREAD_P\n"); mythresh = "look for an excessive value"; } else if (strcmp(name, "eff1") == 0) { printf("Examine (UOPS_RETIRED.RETIRE_SLOTS)/(4 *CPU_CLK_UNHALTED.THREAD_P)\n"); mythresh = "thresh < .9"; } else if (strcmp(name, "eff2") == 0) { printf("Examine CPU_CLK_UNHALTED.THREAD_P/INST_RETIRED.ANY_P\n"); mythresh = "thresh > 1.0"; } else if (strcmp(name, "dtlbmissstore") == 0) { printf("Examine (((DTLB_STORE_MISSES.STLB_HIT * 7) + DTLB_STORE_MISSES.WALK_DURATION)\n"); printf(" / CPU_CLK_UNHALTED.THREAD_P)\n"); mythresh = "thresh >= .05"; } else { printf("Unknown name:%s\n", name); mythresh = "unknown entry"; } printf("If the value printed is %s we may have the ability to improve performance\n", mythresh); } static void explain_name_ib(const char *name) { const char *mythresh; if (strcmp(name, "br_miss") == 0) { printf("Examine ((BR_MISP_RETIRED.ALL_BRANCHES /(BR_MISP_RETIRED.ALL_BRANCHES +\n"); printf(" MACHINE_CLEAR.COUNT) * ((UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES)\n"); printf("/ (4 * CPU_CLK_UNHALTED.THREAD))))\n"); mythresh = "thresh >= .2"; } else if (strcmp(name, "eff1") == 0) { printf("Examine (UOPS_RETIRED.RETIRE_SLOTS)/(4 *CPU_CLK_UNHALTED.THREAD_P)\n"); mythresh = "thresh < .9"; } else if (strcmp(name, "eff2") == 0) { printf("Examine CPU_CLK_UNHALTED.THREAD_P/INST_RETIRED.ANY_P\n"); mythresh = "thresh > 1.0"; } else if (strcmp(name, "cache1") == 0) { printf("Examine (MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM * 180) / CPU_CLK_UNHALTED.THREAD_P\n"); mythresh = "thresh >= .2"; } else if (strcmp(name, "cache2") == 0) { printf("Examine MEM_LOAD_UOPS_RETIRED.LLC_HIT / CPU_CLK_UNHALTED.THREAD_P\n"); mythresh = "thresh >= .2"; } else if (strcmp(name, "itlbmiss") == 0) { printf("Examine ITLB_MISSES.WALK_DURATION / CPU_CLK_UNHALTED.THREAD_P\n");
CPU_CLK_UNHALTED.THREAD_P\n"); mythresh = "thresh > .05"; } else if (strcmp(name, "icachemiss") == 0) { printf("Examine (ICACHE.IFETCH_STALL - ITLB_MISSES.WALK_DURATION)/ CPU_CLK_UNHALTED.THREAD_P\n"); mythresh = "thresh > .05"; } else if (strcmp(name, "lcpstall") == 0) { printf("Examine ILD_STALL.LCP/CPU_CLK_UNHALTED.THREAD_P\n"); mythresh = "thresh > .05"; } else if (strcmp(name, "datashare") == 0) { printf("Examine (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT * 43)/CPU_CLK_UNHALTED.THREAD_P\n"); mythresh = "thresh > .05"; } else if (strcmp(name, "blockstorefwd") == 0) { printf("Examine (LD_BLOCKS_STORE_FORWARD * 13) / CPU_CLK_UNHALTED.THREAD_P\n"); mythresh = "thresh >= .05"; } else if (strcmp(name, "splitload") == 0) { printf("Examine ((L1D_PEND_MISS.PENDING / MEM_LOAD_UOPS_RETIRED.L1_MISS) *\n"); printf(" LD_BLOCKS.NO_SR)/CPU_CLK_UNHALTED.THREAD_P\n"); mythresh = "thresh >= .1"; } else if (strcmp(name, "splitstore") == 0) { printf("Examine MEM_UOPS_RETIRED.SPLIT_STORES / MEM_UOPS_RETIRED.ALL_STORES\n"); mythresh = "thresh >= .01"; } else if (strcmp(name, "aliasing_4k") == 0) { printf("Examine (LD_BLOCKS_PARTIAL.ADDRESS_ALIAS * 5) / CPU_CLK_UNHALTED.THREAD_P\n"); mythresh = "thresh >= .1"; } else if (strcmp(name, "dtlbmissload") == 0) { printf("Examine (((DTLB_LOAD_MISSES.STLB_HIT * 7) + DTLB_LOAD_MISSES.WALK_DURATION)\n"); printf(" / CPU_CLK_UNHALTED.THREAD_P)\n"); mythresh = "thresh >= .1"; } else if (strcmp(name, "dtlbmissstore") == 0) { printf("Examine (((DTLB_STORE_MISSES.STLB_HIT * 7) + DTLB_STORE_MISSES.WALK_DURATION)\n"); printf(" / CPU_CLK_UNHALTED.THREAD_P)\n"); mythresh = "thresh >= .05"; } else if (strcmp(name, "contested") == 0) { printf("Examine (MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM * 60) / CPU_CLK_UNHALTED.THREAD_P\n"); mythresh = "thresh >= .05"; } else if (strcmp(name, "clears") == 0) { printf("Examine ((MACHINE_CLEARS.MEMORY_ORDERING + \n"); printf(" MACHINE_CLEARS.SMC + \n"); printf(" MACHINE_CLEARS.MASKMOV ) * 100 ) / CPU_CLK_UNHALTED.THREAD_P\n"); mythresh = "thresh >= .02"; } else if (strcmp(name, "microassist") == 0) { printf("Examine IDQ.MS_CYCLES / (4 * CPU_CLK_UNHALTED.THREAD_P)\n"); printf("***We use IDQ.MS_UOPS,cmask=1 to get cycles\n"); mythresh = "thresh >= .05"; } else if (strcmp(name, "fpassist") == 0) { printf("Examine FP_ASSIST.ANY/INST_RETIRED.ANY_P\n"); mythresh = "look for a excessive value"; } else if (strcmp(name, "otherassistavx") == 0) { printf("Examine (OTHER_ASSISTS.AVX_TO_SSE * 75)/CPU_CLK_UNHALTED.THREAD_P\n"); mythresh = "look for a excessive value"; } else if (strcmp(name, "otherassistsse") == 0) { printf("Examine (OTHER_ASSISTS.SSE_TO_AVX * 75)/CPU_CLK_UNHALTED.THREAD_P\n"); mythresh = "look for a excessive value"; } else { printf("Unknown name:%s\n", name); mythresh = "unknown entry"; } printf("If the value printed is %s we may have the ability to improve performance\n", mythresh); } static void explain_name_has(const char *name) { const char *mythresh; if (strcmp(name, "eff1") == 0) { printf("Examine (UOPS_RETIRED.RETIRE_SLOTS)/(4 *CPU_CLK_UNHALTED.THREAD_P)\n"); mythresh = "thresh < .75"; } else if (strcmp(name, "eff2") == 0) { printf("Examine CPU_CLK_UNHALTED.THREAD_P/INST_RETIRED.ANY_P\n"); mythresh = "thresh > 1.0"; } else if (strcmp(name, "itlbmiss") == 0) { printf("Examine ITLB_MISSES.WALK_DURATION / CPU_CLK_UNHALTED.THREAD_P\n"); mythresh = "thresh > .05"; } else if (strcmp(name, "icachemiss") == 0) { printf("Examine (36 * ICACHE.MISSES)/ CPU_CLK_UNHALTED.THREAD_P\n"); mythresh = "thresh > .05"; } else if (strcmp(name, "lcpstall") 
printf("Examine ILD_STALL.LCP/CPU_CLK_UNHALTED.THREAD_P\n"); mythresh = "thresh > .05"; } else if (strcmp(name, "cache1") == 0) { printf("Examine (MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM * 180) / CPU_CLK_UNHALTED.THREAD_P\n"); mythresh = "thresh >= .2"; } else if (strcmp(name, "cache2") == 0) { printf("Examine ((MEM_LOAD_UOPS_RETIRED.LLC_HIT * 36) + \n"); printf(" (MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT * 72) + \n"); printf(" (MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM * 84))\n"); printf(" / CPU_CLK_UNHALTED.THREAD_P\n"); mythresh = "thresh >= .2"; } else if (strcmp(name, "contested") == 0) { printf("Examine (MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM * 84) / CPU_CLK_UNHALTED.THREAD_P\n"); mythresh = "thresh >= .05"; } else if (strcmp(name, "datashare") == 0) { printf("Examine (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT * 72)/CPU_CLK_UNHALTED.THREAD_P\n"); mythresh = "thresh > .05"; } else if (strcmp(name, "blockstorefwd") == 0) { printf("Examine (LD_BLOCKS_STORE_FORWARD * 13) / CPU_CLK_UNHALTED.THREAD_P\n"); mythresh = "thresh >= .05"; } else if (strcmp(name, "splitload") == 0) { printf("Examine (MEM_UOPS_RETIRED.SPLIT_LOADS * 5) / CPU_CLK_UNHALTED.THREAD_P\n"); mythresh = "thresh >= .1"; } else if (strcmp(name, "splitstore") == 0) { printf("Examine MEM_UOPS_RETIRED.SPLIT_STORES / MEM_UOPS_RETIRED.ALL_STORES\n"); mythresh = "thresh >= .01"; } else if (strcmp(name, "aliasing_4k") == 0) { printf("Examine (LD_BLOCKS_PARTIAL.ADDRESS_ALIAS * 5) / CPU_CLK_UNHALTED.THREAD_P\n"); mythresh = "thresh >= .1"; } else if (strcmp(name, "dtlbmissload") == 0) { printf("Examine (((DTLB_LOAD_MISSES.STLB_HIT * 7) + DTLB_LOAD_MISSES.WALK_DURATION)\n"); printf(" / CPU_CLK_UNHALTED.THREAD_P)\n"); mythresh = "thresh >= .1"; } else if (strcmp(name, "br_miss") == 0) { printf("Examine (20 * BR_MISP_RETIRED.ALL_BRANCHES)/CPU_CLK_UNHALTED.THREAD\n"); mythresh = "thresh >= .2"; } else if (strcmp(name, "clears") == 0) { printf("Examine ((MACHINE_CLEARS.MEMORY_ORDERING + \n"); printf(" MACHINE_CLEARS.SMC + \n"); printf(" MACHINE_CLEARS.MASKMOV ) * 100 ) / CPU_CLK_UNHALTED.THREAD_P\n"); mythresh = "thresh >= .02"; } else if (strcmp(name, "microassist") == 0) { printf("Examine IDQ.MS_CYCLES / (4 * CPU_CLK_UNHALTED.THREAD_P)\n"); printf("***We use IDQ.MS_UOPS,cmask=1 to get cycles\n"); mythresh = "thresh >= .05"; } else if (strcmp(name, "fpassist") == 0) { printf("Examine FP_ASSIST.ANY/INST_RETIRED.ANY_P\n"); mythresh = "look for an excessive value"; } else if (strcmp(name, "otherassistavx") == 0) { printf("Examine (OTHER_ASSISTS.AVX_TO_SSE * 75)/CPU_CLK_UNHALTED.THREAD_P\n"); mythresh = "look for an excessive value"; } else if (strcmp(name, "otherassistsse") == 0) { printf("Examine (OTHER_ASSISTS.SSE_TO_AVX * 75)/CPU_CLK_UNHALTED.THREAD_P\n"); mythresh = "look for an excessive value"; } else { printf("Unknown name:%s\n", name); mythresh = "unknown entry"; } printf("If the value printed is %s we may have the ability to improve performance\n", mythresh); } static struct counters * find_counter(struct counters *base, const char *name) { struct counters *at; int len; at = base; len = strlen(name); while(at) { if (strncmp(at->counter_name, name, len) == 0) { return(at); } at = at->next_cpu; } printf("Can't find counter %s\n", name); printf("We have:\n"); at = base; while(at) { printf("- %s\n", at->counter_name); at = at->next_cpu; } exit(-1); } static int allocstall1(struct counters *cpu, int pos) { /* 1 - PARTIAL_RAT_STALLS.SLOW_LEA_WINDOW/CPU_CLK_UNHALTED.THREAD_P (thresh > .05)*/ int ret; struct counters *partial; struct
counters *unhalt; double un, par, res; unhalt = find_counter(cpu, "CPU_CLK_UNHALTED.THREAD_P"); partial = find_counter(cpu, "PARTIAL_RAT_STALLS.SLOW_LEA_WINDOW"); if (pos != -1) { par = partial->vals[pos] * 1.0; un = unhalt->vals[pos] * 1.0; } else { par = partial->sum * 1.0; un = unhalt->sum * 1.0; } res = par/un; ret = printf("%1.3f", res); return(ret); } static int allocstall2(struct counters *cpu, int pos) { /* 2 - PARTIAL_RAT_STALLS.FLAGS_MERGE_UOP_CYCLES/CPU_CLK_UNHALTED.THREAD_P (thresh >.05) */ int ret; struct counters *partial; struct counters *unhalt; double un, par, res; unhalt = find_counter(cpu, "CPU_CLK_UNHALTED.THREAD_P"); partial = find_counter(cpu, "PARTIAL_RAT_STALLS.FLAGS_MERGE_UOP"); if (pos != -1) { par = partial->vals[pos] * 1.0; un = unhalt->vals[pos] * 1.0; } else { par = partial->sum * 1.0; un = unhalt->sum * 1.0; } res = par/un; ret = printf("%1.3f", res); return(ret); } static int br_mispredict(struct counters *cpu, int pos) { struct counters *brctr; struct counters *unhalt; int ret; /* 3 - (20 * BR_MISP_RETIRED.ALL_BRANCHES)/CPU_CLK_UNHALTED.THREAD_P (thresh >= .2) */ double br, un, con, res; con = 20.0; unhalt = find_counter(cpu, "CPU_CLK_UNHALTED.THREAD_P"); brctr = find_counter(cpu, "BR_MISP_RETIRED.ALL_BRANCHES"); if (pos != -1) { br = brctr->vals[pos] * 1.0; un = unhalt->vals[pos] * 1.0; } else { br = brctr->sum * 1.0; un = unhalt->sum * 1.0; } res = (con * br)/un; ret = printf("%1.3f", res); return(ret); } static int br_mispredictib(struct counters *cpu, int pos) { struct counters *brctr; struct counters *unhalt; struct counters *clear, *clear2, *clear3; struct counters *uops; struct counters *recv; struct counters *iss; /* "pmcstat -s CPU_CLK_UNHALTED.THREAD_P -s BR_MISP_RETIRED.ALL_BRANCHES -s MACHINE_CLEARS.MEMORY_ORDERING -s MACHINE_CLEARS.SMC -s MACHINE_CLEARS.MASKMOV -s UOPS_ISSUED.ANY -s UOPS_RETIRED.RETIRE_SLOTS -s INT_MISC.RECOVERY_CYCLES -w 1",*/ int ret; /* * (BR_MISP_RETIRED.ALL_BRANCHES / * (BR_MISP_RETIRED.ALL_BRANCHES + * MACHINE_CLEAR.COUNT) * * ((UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES) / (4 * CPU_CLK_UNHALTED.THREAD))) * */ double br, cl, cl2, cl3, uo, re, un, con, res, is; con = 4.0; unhalt = find_counter(cpu, "CPU_CLK_UNHALTED.THREAD_P"); brctr = find_counter(cpu, "BR_MISP_RETIRED.ALL_BRANCHES"); clear = find_counter(cpu, "MACHINE_CLEARS.MEMORY_ORDERING"); clear2 = find_counter(cpu, "MACHINE_CLEARS.SMC"); clear3 = find_counter(cpu, "MACHINE_CLEARS.MASKMOV"); uops = find_counter(cpu, "UOPS_RETIRED.RETIRE_SLOTS"); iss = find_counter(cpu, "UOPS_ISSUED.ANY"); recv = find_counter(cpu, "INT_MISC.RECOVERY_CYCLES"); if (pos != -1) { br = brctr->vals[pos] * 1.0; cl = clear->vals[pos] * 1.0; cl2 = clear2->vals[pos] * 1.0; cl3 = clear3->vals[pos] * 1.0; uo = uops->vals[pos] * 1.0; re = recv->vals[pos] * 1.0; is = iss->vals[pos] * 1.0; un = unhalt->vals[pos] * 1.0; } else { br = brctr->sum * 1.0; cl = clear->sum * 1.0; cl2 = clear2->sum * 1.0; cl3 = clear3->sum * 1.0; uo = uops->sum * 1.0; re = recv->sum * 1.0; is = iss->sum * 1.0; un = unhalt->sum * 1.0; } res = (br/(br + cl + cl2 + cl3) * ((is - uo + con * re) / (con * un))); ret = printf("%1.3f", res); return(ret); } static int br_mispredict_broad(struct counters *cpu, int pos) { struct counters *brctr; struct counters *unhalt; struct counters *clear; struct counters *uops; struct counters *uops_ret; struct counters *recv; int ret; double br, cl, uo, uo_r, re, con, un, res; con = 4.0; unhalt = find_counter(cpu, "CPU_CLK_UNHALTED.THREAD_P"); brctr = 
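/*
 * This Broadwell variant mirrors the Ivy Bridge formula above, with
 * MACHINE_CLEARS.CYCLES standing in for the three individual clear
 * counters. Written out, the quantity being approximated is
 *
 *	BR_MISP / (BR_MISP + CLEARS) *
 *	    (UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS +
 *	     4 * INT_MISC.RECOVERY_CYCLES) / (4 * CPU_CLK_UNHALTED.THREAD)
 *
 * i.e. the top-down "bad speculation" slot fraction, scaled by the
 * share of the speculation damage attributable to branch mispredicts.
 */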
find_counter(cpu, "BR_MISP_RETIRED.ALL_BRANCHES"); clear = find_counter(cpu, "MACHINE_CLEARS.CYCLES"); uops = find_counter(cpu, "UOPS_ISSUED.ANY"); uops_ret = find_counter(cpu, "UOPS_RETIRED.RETIRE_SLOTS"); recv = find_counter(cpu, "INT_MISC.RECOVERY_CYCLES"); if (pos != -1) { un = unhalt->vals[pos] * 1.0; br = brctr->vals[pos] * 1.0; cl = clear->vals[pos] * 1.0; uo = uops->vals[pos] * 1.0; uo_r = uops_ret->vals[pos] * 1.0; re = recv->vals[pos] * 1.0; } else { un = unhalt->sum * 1.0; br = brctr->sum * 1.0; cl = clear->sum * 1.0; uo = uops->sum * 1.0; uo_r = uops_ret->sum * 1.0; re = recv->sum * 1.0; } res = br / (br + cl) * (uo - uo_r + con * re) / (un * con); ret = printf("%1.3f", res); return(ret); } static int splitloadib(struct counters *cpu, int pos) { int ret; struct counters *mem; struct counters *l1d, *ldblock; struct counters *unhalt; double un, memd, res, l1, ldb; /* * ((L1D_PEND_MISS.PENDING / MEM_LOAD_UOPS_RETIRED.L1_MISS) * LD_BLOCKS.NO_SR) / CPU_CLK_UNHALTED.THREAD_P * "pmcstat -s CPU_CLK_UNHALTED.THREAD_P -s L1D_PEND_MISS.PENDING -s MEM_LOAD_UOPS_RETIRED.L1_MISS -s LD_BLOCKS.NO_SR -w 1", */ unhalt = find_counter(cpu, "CPU_CLK_UNHALTED.THREAD_P"); mem = find_counter(cpu, "MEM_LOAD_UOPS_RETIRED.L1_MISS"); l1d = find_counter(cpu, "L1D_PEND_MISS.PENDING"); ldblock = find_counter(cpu, "LD_BLOCKS.NO_SR"); if (pos != -1) { memd = mem->vals[pos] * 1.0; l1 = l1d->vals[pos] * 1.0; ldb = ldblock->vals[pos] * 1.0; un = unhalt->vals[pos] * 1.0; } else { memd = mem->sum * 1.0; l1 = l1d->sum * 1.0; ldb = ldblock->sum * 1.0; un = unhalt->sum * 1.0; } res = ((l1 / memd) * ldb)/un; ret = printf("%1.3f", res); return(ret); } static int splitload(struct counters *cpu, int pos) { int ret; struct counters *mem; struct counters *unhalt; double con, un, memd, res; /* 4 - (MEM_UOPS_RETIRED.SPLIT_LOADS * 5) / CPU_CLK_UNHALTED.THREAD_P (thresh >= .1)*/ con = 5.0; unhalt = find_counter(cpu, "CPU_CLK_UNHALTED.THREAD_P"); mem = find_counter(cpu, "MEM_UOPS_RETIRED.SPLIT_LOADS"); if (pos != -1) { memd = mem->vals[pos] * 1.0; un = unhalt->vals[pos] * 1.0; } else { memd = mem->sum * 1.0; un = unhalt->sum * 1.0; } res = (memd * con)/un; ret = printf("%1.3f", res); return(ret); } static int splitload_sb(struct counters *cpu, int pos) { int ret; struct counters *mem; struct counters *unhalt; double con, un, memd, res; /* 4 - (MEM_UOP_RETIRED.SPLIT_LOADS * 5) / CPU_CLK_UNHALTED.THREAD_P (thresh >= .1)*/ con = 5.0; unhalt = find_counter(cpu, "CPU_CLK_UNHALTED.THREAD_P"); mem = find_counter(cpu, "MEM_UOP_RETIRED.SPLIT_LOADS"); if (pos != -1) { memd = mem->vals[pos] * 1.0; un = unhalt->vals[pos] * 1.0; } else { memd = mem->sum * 1.0; un = unhalt->sum * 1.0; } res = (memd * con)/un; ret = printf("%1.3f", res); return(ret); } static int splitstore_sb(struct counters *cpu, int pos) { /* 5 - MEM_UOP_RETIRED.SPLIT_STORES / MEM_UOP_RETIRED.ALL_STORES (thresh > 0.01) */ int ret; struct counters *mem_split; struct counters *mem_stores; double memsplit, memstore, res; mem_split = find_counter(cpu, "MEM_UOP_RETIRED.SPLIT_STORES"); mem_stores = find_counter(cpu, "MEM_UOP_RETIRED.ALL_STORES"); if (pos != -1) { memsplit = mem_split->vals[pos] * 1.0; memstore = mem_stores->vals[pos] * 1.0; } else { memsplit = mem_split->sum * 1.0; memstore = mem_stores->sum * 1.0; } res = memsplit/memstore; ret = printf("%1.3f", res); return(ret); } static int splitstore(struct counters *cpu, int pos) { /* 5 - MEM_UOPS_RETIRED.SPLIT_STORES / MEM_UOPS_RETIRED.ALL_STORES (thresh > 0.01) */ int ret; struct counters *mem_split; struct counters 
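/*
 * Background for the split metrics: a split load or store is a memory
 * uop whose data crosses a cache-line boundary and must be performed
 * as two accesses. The load-side formulas charge an assumed average
 * penalty per split (the * 5 constant above); the store-side metric is
 * simply the splitting fraction of all stores. Hypothetical example:
 * 1e6 split stores out of 2e8 total stores gives 0.005, under the .01
 * threshold.
 */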
*mem_stores; double memsplit, memstore, res; mem_split = find_counter(cpu, "MEM_UOPS_RETIRED.SPLIT_STORES"); mem_stores = find_counter(cpu, "MEM_UOPS_RETIRED.ALL_STORES"); if (pos != -1) { memsplit = mem_split->vals[pos] * 1.0; memstore = mem_stores->vals[pos] * 1.0; } else { memsplit = mem_split->sum * 1.0; memstore = mem_stores->sum * 1.0; } res = memsplit/memstore; ret = printf("%1.3f", res); return(ret); } static int contested(struct counters *cpu, int pos) { /* 6 - (MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM * 60) / CPU_CLK_UNHALTED.THREAD_P (thresh >.05) */ int ret; struct counters *mem; struct counters *unhalt; double con, un, memd, res; con = 60.0; unhalt = find_counter(cpu, "CPU_CLK_UNHALTED.THREAD_P"); mem = find_counter(cpu, "MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM"); if (pos != -1) { memd = mem->vals[pos] * 1.0; un = unhalt->vals[pos] * 1.0; } else { memd = mem->sum * 1.0; un = unhalt->sum * 1.0; } res = (memd * con)/un; ret = printf("%1.3f", res); return(ret); } static int contested_has(struct counters *cpu, int pos) { /* 6 - (MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM * 84) / CPU_CLK_UNHALTED.THREAD_P (thresh >.05) */ int ret; struct counters *mem; struct counters *unhalt; double con, un, memd, res; con = 84.0; unhalt = find_counter(cpu, "CPU_CLK_UNHALTED.THREAD_P"); mem = find_counter(cpu, "MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM"); if (pos != -1) { memd = mem->vals[pos] * 1.0; un = unhalt->vals[pos] * 1.0; } else { memd = mem->sum * 1.0; un = unhalt->sum * 1.0; } res = (memd * con)/un; ret = printf("%1.3f", res); return(ret); } static int contestedbroad(struct counters *cpu, int pos) { /* 6 - (MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM * 84) / CPU_CLK_UNHALTED.THREAD_P (thresh >.05) */ int ret; struct counters *mem; struct counters *mem2; struct counters *unhalt; double con, un, memd, memtoo, res; con = 84.0; unhalt = find_counter(cpu, "CPU_CLK_UNHALTED.THREAD_P"); mem = find_counter(cpu, "MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM"); mem2 = find_counter(cpu,"MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS"); if (pos != -1) { memd = mem->vals[pos] * 1.0; memtoo = mem2->vals[pos] * 1.0; un = unhalt->vals[pos] * 1.0; } else { memd = mem->sum * 1.0; memtoo = mem2->sum * 1.0; un = unhalt->sum * 1.0; } res = ((memd * con) + memtoo)/un; ret = printf("%1.3f", res); return(ret); } static int blockstoreforward(struct counters *cpu, int pos) { /* 7 - (LD_BLOCKS_STORE_FORWARD * 13) / CPU_CLK_UNHALTED.THREAD_P (thresh >= .05)*/ int ret; struct counters *ldb; struct counters *unhalt; double con, un, ld, res; con = 13.0; unhalt = find_counter(cpu, "CPU_CLK_UNHALTED.THREAD_P"); ldb = find_counter(cpu, "LD_BLOCKS_STORE_FORWARD"); if (pos != -1) { ld = ldb->vals[pos] * 1.0; un = unhalt->vals[pos] * 1.0; } else { ld = ldb->sum * 1.0; un = unhalt->sum * 1.0; } res = (ld * con)/un; ret = printf("%1.3f", res); return(ret); } static int cache2(struct counters *cpu, int pos) { /* ** Suspect *** * 8 - ((MEM_LOAD_RETIRED.L3_HIT * 26) + (MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT * 43) + * (MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM * 60)) / CPU_CLK_UNHALTED.THREAD_P (thresh >.2) */ int ret; struct counters *mem1, *mem2, *mem3; struct counters *unhalt; double con1, con2, con3, un, me_1, me_2, me_3, res; con1 = 26.0; con2 = 43.0; con3 = 60.0; unhalt = find_counter(cpu, "CPU_CLK_UNHALTED.THREAD_P"); /* Call for MEM_LOAD_RETIRED.L3_HIT possibly MEM_LOAD_UOPS_RETIRED.LLC_HIT ?*/ mem1 = find_counter(cpu, "MEM_LOAD_UOPS_RETIRED.LLC_HIT"); mem2 = find_counter(cpu, "MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT"); mem3 = find_counter(cpu, 
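/*
 * XSNP_HITM counts loads whose LLC lookup snooped another core and hit
 * a Modified line there -- genuinely contested, written-and-shared
 * data. The multipliers are rough per-event costs, in core cycles, of
 * that cross-core transfer, so each metric estimates the fraction of
 * cycles lost to contention. For reference, the latency constants used
 * across the cache2 and contested variants are: SNB 26 (LLC hit),
 * 43 (XSNP_HIT), 60 (XSNP_HITM); IB 29 (LLC hit); HSW 36 (LLC hit),
 * 72 (XSNP_HIT), 84 (XSNP_HITM); BDW 36 (L3 hit).
 */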
"MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM"); if (pos != -1) { me_1 = mem1->vals[pos] * 1.0; me_2 = mem2->vals[pos] * 1.0; me_3 = mem3->vals[pos] * 1.0; un = unhalt->vals[pos] * 1.0; } else { me_1 = mem1->sum * 1.0; me_2 = mem2->sum * 1.0; me_3 = mem3->sum * 1.0; un = unhalt->sum * 1.0; } res = ((me_1 * con1) + (me_2 * con2) + (me_3 * con3))/un; ret = printf("%1.3f", res); return(ret); } static int datasharing(struct counters *cpu, int pos) { /* * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT * 43)/ CPU_CLK_UNHALTED.THREAD_P (thresh >.2) */ int ret; struct counters *mem; struct counters *unhalt; double con, res, me, un; con = 43.0; unhalt = find_counter(cpu, "CPU_CLK_UNHALTED.THREAD_P"); mem = find_counter(cpu, "MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT"); if (pos != -1) { me = mem->vals[pos] * 1.0; un = unhalt->vals[pos] * 1.0; } else { me = mem->sum * 1.0; un = unhalt->sum * 1.0; } res = (me * con)/un; ret = printf("%1.3f", res); return(ret); } static int datasharing_has(struct counters *cpu, int pos) { /* * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT * 43)/ CPU_CLK_UNHALTED.THREAD_P (thresh >.2) */ int ret; struct counters *mem; struct counters *unhalt; double con, res, me, un; con = 72.0; unhalt = find_counter(cpu, "CPU_CLK_UNHALTED.THREAD_P"); mem = find_counter(cpu, "MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT"); if (pos != -1) { me = mem->vals[pos] * 1.0; un = unhalt->vals[pos] * 1.0; } else { me = mem->sum * 1.0; un = unhalt->sum * 1.0; } res = (me * con)/un; ret = printf("%1.3f", res); return(ret); } static int cache2ib(struct counters *cpu, int pos) { /* * (29 * MEM_LOAD_UOPS_RETIRED.LLC_HIT / CPU_CLK_UNHALTED.THREAD_P (thresh >.2) */ int ret; struct counters *mem; struct counters *unhalt; double con, un, me, res; con = 29.0; unhalt = find_counter(cpu, "CPU_CLK_UNHALTED.THREAD_P"); mem = find_counter(cpu, "MEM_LOAD_UOPS_RETIRED.LLC_HIT"); if (pos != -1) { me = mem->vals[pos] * 1.0; un = unhalt->vals[pos] * 1.0; } else { me = mem->sum * 1.0; un = unhalt->sum * 1.0; } res = (con * me)/un; ret = printf("%1.3f", res); return(ret); } static int cache2has(struct counters *cpu, int pos) { /* * Examine ((MEM_LOAD_UOPS_RETIRED.LLC_HIT * 36) + \ * (MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT * 72) + * (MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM * 84)) * / CPU_CLK_UNHALTED.THREAD_P */ int ret; struct counters *mem1, *mem2, *mem3; struct counters *unhalt; double con1, con2, con3, un, me1, me2, me3, res; con1 = 36.0; con2 = 72.0; con3 = 84.0; unhalt = find_counter(cpu, "CPU_CLK_UNHALTED.THREAD_P"); mem1 = find_counter(cpu, "MEM_LOAD_UOPS_RETIRED.LLC_HIT"); mem2 = find_counter(cpu, "MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT"); mem3 = find_counter(cpu, "MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM"); if (pos != -1) { me1 = mem1->vals[pos] * 1.0; me2 = mem2->vals[pos] * 1.0; me3 = mem3->vals[pos] * 1.0; un = unhalt->vals[pos] * 1.0; } else { me1 = mem1->sum * 1.0; me2 = mem2->sum * 1.0; me3 = mem3->sum * 1.0; un = unhalt->sum * 1.0; } res = ((me1 * con1) + (me2 * con2) + (me3 * con3))/un; ret = printf("%1.3f", res); return(ret); } static int cache2broad(struct counters *cpu, int pos) { /* * (29 * MEM_LOAD_UOPS_RETIRED.LLC_HIT / CPU_CLK_UNHALTED.THREAD_P (thresh >.2) */ int ret; struct counters *mem; struct counters *unhalt; double con, un, me, res; con = 36.0; unhalt = find_counter(cpu, "CPU_CLK_UNHALTED.THREAD_P"); mem = find_counter(cpu, "MEM_LOAD_UOPS_RETIRED.L3_HIT"); if (pos != -1) { me = mem->vals[pos] * 1.0; un = unhalt->vals[pos] * 1.0; } else { me = mem->sum * 1.0; un = unhalt->sum * 1.0; } res = (con * me)/un; ret = 
printf("%1.3f", res); return(ret); } static int cache1(struct counters *cpu, int pos) { /* 9 - (MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS * 180) / CPU_CLK_UNHALTED.THREAD_P (thresh >= .2) */ int ret; struct counters *mem; struct counters *unhalt; double con, un, me, res; con = 180.0; unhalt = find_counter(cpu, "CPU_CLK_UNHALTED.THREAD_P"); mem = find_counter(cpu, "MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS"); if (pos != -1) { me = mem->vals[pos] * 1.0; un = unhalt->vals[pos] * 1.0; } else { me = mem->sum * 1.0; un = unhalt->sum * 1.0; } res = (me * con)/un; ret = printf("%1.3f", res); return(ret); } static int cache1ib(struct counters *cpu, int pos) { /* 9 - (MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM * 180) / CPU_CLK_UNHALTED.THREAD_P (thresh >= .2) */ int ret; struct counters *mem; struct counters *unhalt; double con, un, me, res; con = 180.0; unhalt = find_counter(cpu, "CPU_CLK_UNHALTED.THREAD_P"); mem = find_counter(cpu, "MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM"); if (pos != -1) { me = mem->vals[pos] * 1.0; un = unhalt->vals[pos] * 1.0; } else { me = mem->sum * 1.0; un = unhalt->sum * 1.0; } res = (me * con)/un; ret = printf("%1.3f", res); return(ret); } static int cache1broad(struct counters *cpu, int pos) { /* 9 - (MEM_LOAD_UOPS_RETIRED.L3_MISS * 180) / CPU_CLK_UNHALTED.THREAD_P (thresh >= .2) */ int ret; struct counters *mem; struct counters *unhalt; double con, un, me, res; con = 180.0; unhalt = find_counter(cpu, "CPU_CLK_UNHALTED.THREAD_P"); mem = find_counter(cpu, "MEM_LOAD_UOPS_RETIRED.L3_MISS"); if (pos != -1) { me = mem->vals[pos] * 1.0; un = unhalt->vals[pos] * 1.0; } else { me = mem->sum * 1.0; un = unhalt->sum * 1.0; } res = (me * con)/un; ret = printf("%1.3f", res); return(ret); } static int dtlb_missload(struct counters *cpu, int pos) { /* 10 - ((DTLB_LOAD_MISSES.STLB_HIT * 7) + DTLB_LOAD_MISSES.WALK_DURATION) / CPU_CLK_UNHALTED.THREAD_P (t >=.1) */ int ret; struct counters *dtlb_m, *dtlb_d; struct counters *unhalt; double con, un, d1, d2, res; con = 7.0; unhalt = find_counter(cpu, "CPU_CLK_UNHALTED.THREAD_P"); dtlb_m = find_counter(cpu, "DTLB_LOAD_MISSES.STLB_HIT"); dtlb_d = find_counter(cpu, "DTLB_LOAD_MISSES.WALK_DURATION"); if (pos != -1) { d1 = dtlb_m->vals[pos] * 1.0; d2 = dtlb_d->vals[pos] * 1.0; un = unhalt->vals[pos] * 1.0; } else { d1 = dtlb_m->sum * 1.0; d2 = dtlb_d->sum * 1.0; un = unhalt->sum * 1.0; } res = ((d1 * con) + d2)/un; ret = printf("%1.3f", res); return(ret); } static int dtlb_missstore(struct counters *cpu, int pos) { /* * ((DTLB_STORE_MISSES.STLB_HIT * 7) + DTLB_STORE_MISSES.WALK_DURATION) / * CPU_CLK_UNHALTED.THREAD_P (t >= .1) */ int ret; struct counters *dtsb_m, *dtsb_d; struct counters *unhalt; double con, un, d1, d2, res; con = 7.0; unhalt = find_counter(cpu, "CPU_CLK_UNHALTED.THREAD_P"); dtsb_m = find_counter(cpu, "DTLB_STORE_MISSES.STLB_HIT"); dtsb_d = find_counter(cpu, "DTLB_STORE_MISSES.WALK_DURATION"); if (pos != -1) { d1 = dtsb_m->vals[pos] * 1.0; d2 = dtsb_d->vals[pos] * 1.0; un = unhalt->vals[pos] * 1.0; } else { d1 = dtsb_m->sum * 1.0; d2 = dtsb_d->sum * 1.0; un = unhalt->sum * 1.0; } res = ((d1 * con) + d2)/un; ret = printf("%1.3f", res); return(ret); } static int itlb_miss(struct counters *cpu, int pos) { /* ITLB_MISSES.WALK_DURATION / CPU_CLK_UNHALTED.THREAD_P IB */ int ret; struct counters *itlb; struct counters *unhalt; double un, d1, res; unhalt = find_counter(cpu, "CPU_CLK_UNHALTED.THREAD_P"); itlb = find_counter(cpu, "ITLB_MISSES.WALK_DURATION"); if (pos != -1) { d1 = itlb->vals[pos] * 1.0; un = unhalt->vals[pos] * 1.0; } else { d1 = 
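/*
 * TLB-metric recap: STLB_HIT counts first-level TLB misses that were
 * satisfied by the second-level TLB (charged ~7 cycles each in the
 * dtlbmiss formulas above), while WALK_DURATION already accumulates
 * the cycles spent in hardware page walks, so it is added in unscaled.
 * Hypothetical example: (1e6 * 7 + 5e7) / 1e9 = 0.057, just over the
 * .05 store-side threshold.
 */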
itlb->sum * 1.0; un = unhalt->sum * 1.0; } res = d1/un; ret = printf("%1.3f", res); return(ret); } static int itlb_miss_broad(struct counters *cpu, int pos) { /* (7 * ITLB_MISSES.STLB_HIT_4K + ITLB_MISSES.WALK_DURATION) / CPU_CLK_UNTHREAD_P */ int ret; struct counters *itlb; struct counters *unhalt; struct counters *four_k; double un, d1, res, k; unhalt = find_counter(cpu, "CPU_CLK_UNHALTED.THREAD_P"); itlb = find_counter(cpu, "ITLB_MISSES.WALK_DURATION"); four_k = find_counter(cpu, "ITLB_MISSES.STLB_HIT_4K"); if (pos != -1) { d1 = itlb->vals[pos] * 1.0; un = unhalt->vals[pos] * 1.0; k = four_k->vals[pos] * 1.0; } else { d1 = itlb->sum * 1.0; un = unhalt->sum * 1.0; k = four_k->sum * 1.0; } res = (7.0 * k + d1)/un; ret = printf("%1.3f", res); return(ret); } static int icache_miss(struct counters *cpu, int pos) { /* (ICACHE.IFETCH_STALL - ITLB_MISSES.WALK_DURATION) / CPU_CLK_UNHALTED.THREAD_P IB */ int ret; struct counters *itlb, *icache; struct counters *unhalt; double un, d1, ic, res; unhalt = find_counter(cpu, "CPU_CLK_UNHALTED.THREAD_P"); itlb = find_counter(cpu, "ITLB_MISSES.WALK_DURATION"); icache = find_counter(cpu, "ICACHE.IFETCH_STALL"); if (pos != -1) { d1 = itlb->vals[pos] * 1.0; ic = icache->vals[pos] * 1.0; un = unhalt->vals[pos] * 1.0; } else { d1 = itlb->sum * 1.0; ic = icache->sum * 1.0; un = unhalt->sum * 1.0; } res = (ic-d1)/un; ret = printf("%1.3f", res); return(ret); } static int icache_miss_has(struct counters *cpu, int pos) { /* (36 * ICACHE.MISSES) / CPU_CLK_UNHALTED.THREAD_P */ int ret; struct counters *icache; struct counters *unhalt; double un, con, ic, res; unhalt = find_counter(cpu, "CPU_CLK_UNHALTED.THREAD_P"); icache = find_counter(cpu, "ICACHE.MISSES"); con = 36.0; if (pos != -1) { ic = icache->vals[pos] * 1.0; un = unhalt->vals[pos] * 1.0; } else { ic = icache->sum * 1.0; un = unhalt->sum * 1.0; } res = (con * ic)/un; ret = printf("%1.3f", res); return(ret); } static int lcp_stall(struct counters *cpu, int pos) { /* ILD_STALL.LCP/CPU_CLK_UNHALTED.THREAD_P IB */ int ret; struct counters *ild; struct counters *unhalt; double un, d1, res; unhalt = find_counter(cpu, "CPU_CLK_UNHALTED.THREAD_P"); ild = find_counter(cpu, "ILD_STALL.LCP"); if (pos != -1) { d1 = ild->vals[pos] * 1.0; un = unhalt->vals[pos] * 1.0; } else { d1 = ild->sum * 1.0; un = unhalt->sum * 1.0; } res = d1/un; ret = printf("%1.3f", res); return(ret); } static int frontendstall(struct counters *cpu, int pos) { /* 12 - IDQ_UOPS_NOT_DELIVERED.CORE / (CPU_CLK_UNHALTED.THREAD_P * 4) (thresh >= .15) */ int ret; struct counters *idq; struct counters *unhalt; double con, un, id, res; con = 4.0; unhalt = find_counter(cpu, "CPU_CLK_UNHALTED.THREAD_P"); idq = find_counter(cpu, "IDQ_UOPS_NOT_DELIVERED.CORE"); if (pos != -1) { id = idq->vals[pos] * 1.0; un = unhalt->vals[pos] * 1.0; } else { id = idq->sum * 1.0; un = unhalt->sum * 1.0; } res = id/(un * con); ret = printf("%1.3f", res); return(ret); } static int clears(struct counters *cpu, int pos) { /* 13 - ((MACHINE_CLEARS.MEMORY_ORDERING + MACHINE_CLEARS.SMC + MACHINE_CLEARS.MASKMOV ) * 100 ) * / CPU_CLK_UNHALTED.THREAD_P (thresh >= .02)*/ int ret; struct counters *clr1, *clr2, *clr3; struct counters *unhalt; double con, un, cl1, cl2, cl3, res; con = 100.0; unhalt = find_counter(cpu, "CPU_CLK_UNHALTED.THREAD_P"); clr1 = find_counter(cpu, "MACHINE_CLEARS.MEMORY_ORDERING"); clr2 = find_counter(cpu, "MACHINE_CLEARS.SMC"); clr3 = find_counter(cpu, "MACHINE_CLEARS.MASKMOV"); if (pos != -1) { cl1 = clr1->vals[pos] * 1.0; cl2 = clr2->vals[pos] * 1.0; cl3 = 
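/*
 * Each machine clear flushes the pipeline; the * 100 in the clears
 * formula is an assumed average flush cost in cycles. With
 * hypothetical numbers, 3e5 total clears in 1e9 cycles gives
 * (3e5 * 100) / 1e9 = 0.03, over the .02 threshold.
 */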
clr3->vals[pos] * 1.0; un = unhalt->vals[pos] * 1.0; } else { cl1 = clr1->sum * 1.0; cl2 = clr2->sum * 1.0; cl3 = clr3->sum * 1.0; un = unhalt->sum * 1.0; } res = ((cl1 + cl2 + cl3) * con)/un; ret = printf("%1.3f", res); return(ret); } static int clears_broad(struct counters *cpu, int pos) { int ret; struct counters *clr1, *clr2, *clr3, *cyc; struct counters *unhalt; double con, un, cl1, cl2, cl3, cy, res; con = 100.0; unhalt = find_counter(cpu, "CPU_CLK_UNHALTED.THREAD_P"); clr1 = find_counter(cpu, "MACHINE_CLEARS.MEMORY_ORDERING"); clr2 = find_counter(cpu, "MACHINE_CLEARS.SMC"); clr3 = find_counter(cpu, "MACHINE_CLEARS.MASKMOV"); cyc = find_counter(cpu, "MACHINE_CLEARS.CYCLES"); if (pos != -1) { cl1 = clr1->vals[pos] * 1.0; cl2 = clr2->vals[pos] * 1.0; cl3 = clr3->vals[pos] * 1.0; cy = cyc->vals[pos] * 1.0; un = unhalt->vals[pos] * 1.0; } else { cl1 = clr1->sum * 1.0; cl2 = clr2->sum * 1.0; cl3 = clr3->sum * 1.0; cy = cyc->sum * 1.0; un = unhalt->sum * 1.0; } /* Formula not listed but extrapolated to add the cy ?? */ res = ((cl1 + cl2 + cl3 + cy) * con)/un; ret = printf("%1.3f", res); return(ret); } static int microassist(struct counters *cpu, int pos) { /* 14 - IDQ.MS_CYCLES / (4 * CPU_CLK_UNHALTED.THREAD_P) (thresh > .05) */ int ret; struct counters *idq; struct counters *unhalt; double un, id, res, con; con = 4.0; unhalt = find_counter(cpu, "CPU_CLK_UNHALTED.THREAD_P"); idq = find_counter(cpu, "IDQ.MS_UOPS"); if (pos != -1) { id = idq->vals[pos] * 1.0; un = unhalt->vals[pos] * 1.0; } else { id = idq->sum * 1.0; un = unhalt->sum * 1.0; } res = id/(un * con); ret = printf("%1.3f", res); return(ret); } static int microassist_broad(struct counters *cpu, int pos) { int ret; struct counters *idq; struct counters *unhalt; struct counters *uopiss; struct counters *uopret; double un, id, res, con, uoi, uor; con = 4.0; unhalt = find_counter(cpu, "CPU_CLK_UNHALTED.THREAD_P"); idq = find_counter(cpu, "IDQ.MS_UOPS"); uopiss = find_counter(cpu, "UOPS_ISSUED.ANY"); uopret = find_counter(cpu, "UOPS_RETIRED.RETIRE_SLOTS"); if (pos != -1) { id = idq->vals[pos] * 1.0; un = unhalt->vals[pos] * 1.0; uoi = uopiss->vals[pos] * 1.0; uor = uopret->vals[pos] * 1.0; } else { id = idq->sum * 1.0; un = unhalt->sum * 1.0; uoi = uopiss->sum * 1.0; uor = uopret->sum * 1.0; } res = (uor/uoi) * (id/(un * con)); ret = printf("%1.3f", res); return(ret); } static int aliasing(struct counters *cpu, int pos) { /* 15 - (LD_BLOCKS_PARTIAL.ADDRESS_ALIAS * 5) / CPU_CLK_UNHALTED.THREAD_P (thresh > .1) */ int ret; struct counters *ld; struct counters *unhalt; double un, lds, con, res; con = 5.0; unhalt = find_counter(cpu, "CPU_CLK_UNHALTED.THREAD_P"); ld = find_counter(cpu, "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS"); if (pos != -1) { lds = ld->vals[pos] * 1.0; un = unhalt->vals[pos] * 1.0; } else { lds = ld->sum * 1.0; un = unhalt->sum * 1.0; } res = (lds * con)/un; ret = printf("%1.3f", res); return(ret); } static int aliasing_broad(struct counters *cpu, int pos) { /* 15 - (LD_BLOCKS_PARTIAL.ADDRESS_ALIAS * 7) / CPU_CLK_UNHALTED.THREAD_P (thresh > .1) */ int ret; struct counters *ld; struct counters *unhalt; double un, lds, con, res; con = 7.0; unhalt = find_counter(cpu, "CPU_CLK_UNHALTED.THREAD_P"); ld = find_counter(cpu, "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS"); if (pos != -1) { lds = ld->vals[pos] * 1.0; un = unhalt->vals[pos] * 1.0; } else { lds = ld->sum * 1.0; un = unhalt->sum * 1.0; } res = (lds * con)/un; ret = printf("%1.3f", res); return(ret); } static int fpassists(struct counters *cpu, int pos) { /* 16 - 
FP_ASSIST.ANY/INST_RETIRED.ANY_P */ int ret; struct counters *fp; struct counters *inst; double un, fpd, res; inst = find_counter(cpu, "INST_RETIRED.ANY_P"); fp = find_counter(cpu, "FP_ASSIST.ANY"); if (pos != -1) { fpd = fp->vals[pos] * 1.0; un = inst->vals[pos] * 1.0; } else { fpd = fp->sum * 1.0; un = inst->sum * 1.0; } res = fpd/un; ret = printf("%1.3f", res); return(ret); } static int otherassistavx(struct counters *cpu, int pos) { /* 17 - (OTHER_ASSISTS.AVX_TO_SSE * 75)/CPU_CLK_UNHALTED.THREAD_P thresh .1*/ int ret; struct counters *oth; struct counters *unhalt; double un, ot, con, res; con = 75.0; unhalt = find_counter(cpu, "CPU_CLK_UNHALTED.THREAD_P"); oth = find_counter(cpu, "OTHER_ASSISTS.AVX_TO_SSE"); if (pos != -1) { ot = oth->vals[pos] * 1.0; un = unhalt->vals[pos] * 1.0; } else { ot = oth->sum * 1.0; un = unhalt->sum * 1.0; } res = (ot * con)/un; ret = printf("%1.3f", res); return(ret); } static int otherassistsse(struct counters *cpu, int pos) { int ret; struct counters *oth; struct counters *unhalt; double un, ot, con, res; /* 18 (OTHER_ASSISTS.SSE_TO_AVX * 75)/CPU_CLK_UNHALTED.THREAD_P thresh .1*/ con = 75.0; unhalt = find_counter(cpu, "CPU_CLK_UNHALTED.THREAD_P"); oth = find_counter(cpu, "OTHER_ASSISTS.SSE_TO_AVX"); if (pos != -1) { ot = oth->vals[pos] * 1.0; un = unhalt->vals[pos] * 1.0; } else { ot = oth->sum * 1.0; un = unhalt->sum * 1.0; } res = (ot * con)/un; ret = printf("%1.3f", res); return(ret); } static int efficiency1(struct counters *cpu, int pos) { int ret; struct counters *uops; struct counters *unhalt; double un, ot, con, res; /* 19 (UOPS_RETIRED.RETIRE_SLOTS/(4*CPU_CLK_UNHALTED.THREAD_P) look if thresh < .9*/ con = 4.0; unhalt = find_counter(cpu, "CPU_CLK_UNHALTED.THREAD_P"); uops = find_counter(cpu, "UOPS_RETIRED.RETIRE_SLOTS"); if (pos != -1) { ot = uops->vals[pos] * 1.0; un = unhalt->vals[pos] * 1.0; } else { ot = uops->sum * 1.0; un = unhalt->sum * 1.0; } res = ot/(con * un); ret = printf("%1.3f", res); return(ret); } static int efficiency2(struct counters *cpu, int pos) { int ret; struct counters *uops; struct counters *unhalt; double un, ot, res; /* 20 - CPU_CLK_UNHALTED.THREAD_P/INST_RETIRED.ANY_P good if > 1. 
(comp factor)*/ unhalt = find_counter(cpu, "CPU_CLK_UNHALTED.THREAD_P"); uops = find_counter(cpu, "INST_RETIRED.ANY_P"); if (pos != -1) { ot = uops->vals[pos] * 1.0; un = unhalt->vals[pos] * 1.0; } else { ot = uops->sum * 1.0; un = unhalt->sum * 1.0; } res = un/ot; ret = printf("%1.3f", res); return(ret); } #define SANDY_BRIDGE_COUNT 20 static struct cpu_entry sandy_bridge[SANDY_BRIDGE_COUNT] = { /*01*/ { "allocstall1", "thresh > .05", "pmcstat -s CPU_CLK_UNHALTED.THREAD_P -s PARTIAL_RAT_STALLS.SLOW_LEA_WINDOW -w 1", allocstall1, 2 }, /* -- not defined for SB right (partial-rat_stalls) 02*/ { "allocstall2", "thresh > .05", "pmcstat -s CPU_CLK_UNHALTED.THREAD_P -s PARTIAL_RAT_STALLS.FLAGS_MERGE_UOP -w 1", allocstall2, 2 }, /*03*/ { "br_miss", "thresh >= .2", "pmcstat -s CPU_CLK_UNHALTED.THREAD_P -s BR_MISP_RETIRED.ALL_BRANCHES -w 1", br_mispredict, 2 }, /*04*/ { "splitload", "thresh >= .1", "pmcstat -s CPU_CLK_UNHALTED.THREAD_P -s MEM_UOP_RETIRED.SPLIT_LOADS -w 1", splitload_sb, 2 }, /* 05*/ { "splitstore", "thresh >= .01", "pmcstat -s MEM_UOP_RETIRED.SPLIT_STORES -s MEM_UOP_RETIRED.ALL_STORES -w 1", splitstore_sb, 2 }, /*06*/ { "contested", "thresh >= .05", "pmcstat -s MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM -s CPU_CLK_UNHALTED.THREAD_P -w 1", contested, 2 }, /*07*/ { "blockstorefwd", "thresh >= .05", "pmcstat -s LD_BLOCKS_STORE_FORWARD -s CPU_CLK_UNHALTED.THREAD_P -w 1", blockstoreforward, 2 }, /*08*/ { "cache2", "thresh >= .2", "pmcstat -s MEM_LOAD_UOPS_RETIRED.LLC_HIT -s MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT -s MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM -s CPU_CLK_UNHALTED.THREAD_P -w 1", cache2, 4 }, /*09*/ { "cache1", "thresh >= .2", "pmcstat -s MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS -s CPU_CLK_UNHALTED.THREAD_P -w 1", cache1, 2 }, /*10*/ { "dtlbmissload", "thresh >= .1", "pmcstat -s DTLB_LOAD_MISSES.STLB_HIT -s DTLB_LOAD_MISSES.WALK_DURATION -s CPU_CLK_UNHALTED.THREAD_P -w 1", dtlb_missload, 3 }, /*11*/ { "dtlbmissstore", "thresh >= .05", "pmcstat -s DTLB_STORE_MISSES.STLB_HIT -s DTLB_STORE_MISSES.WALK_DURATION -s CPU_CLK_UNHALTED.THREAD_P -w 1", dtlb_missstore, 3 }, /*12*/ { "frontendstall", "thresh >= .15", "pmcstat -s IDQ_UOPS_NOT_DELIVERED.CORE -s CPU_CLK_UNHALTED.THREAD_P -w 1", frontendstall, 2 }, /*13*/ { "clears", "thresh >= .02", "pmcstat -s MACHINE_CLEARS.MEMORY_ORDERING -s MACHINE_CLEARS.SMC -s MACHINE_CLEARS.MASKMOV -s CPU_CLK_UNHALTED.THREAD_P -w 1", clears, 4 }, /*14*/ { "microassist", "thresh >= .05", "pmcstat -s IDQ.MS_UOPS,cmask=1 -s CPU_CLK_UNHALTED.THREAD_P -w 1", microassist, 2 }, /*15*/ { "aliasing_4k", "thresh >= .1", "pmcstat -s LD_BLOCKS_PARTIAL.ADDRESS_ALIAS -s CPU_CLK_UNHALTED.THREAD_P -w 1", aliasing, 2 }, /*16*/ { "fpassist", "look for an excessive value", "pmcstat -s FP_ASSIST.ANY -s INST_RETIRED.ANY_P -w 1", fpassists, 2 }, /*17*/ { "otherassistavx", "look for an excessive value", "pmcstat -s OTHER_ASSISTS.AVX_TO_SSE -s CPU_CLK_UNHALTED.THREAD_P -w 1", otherassistavx, 2}, /*18*/ { "otherassistsse", "look for an excessive value", "pmcstat -s OTHER_ASSISTS.SSE_TO_AVX -s CPU_CLK_UNHALTED.THREAD_P -w 1", otherassistsse, 2 }, /*19*/ { "eff1", "thresh < .9", "pmcstat -s UOPS_RETIRED.RETIRE_SLOTS -s CPU_CLK_UNHALTED.THREAD_P -w 1", efficiency1, 2 }, /*20*/ { "eff2", "thresh > 1.0", "pmcstat -s INST_RETIRED.ANY_P -s CPU_CLK_UNHALTED.THREAD_P -w 1", efficiency2, 2 }, }; #define IVY_BRIDGE_COUNT 21 static struct cpu_entry ivy_bridge[IVY_BRIDGE_COUNT] = { /*1*/ { "eff1", "thresh < .75", "pmcstat -s UOPS_RETIRED.RETIRE_SLOTS -s CPU_CLK_UNHALTED.THREAD_P -w 1", efficiency1, 
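/*
 * (Each cpu_entry is: test name, threshold text, canned pmcstat
 * command, evaluation function, and counters_required -- the number of
 * hardware counters the command needs, which set_expression() checks
 * against max_pmc_counters before a test runs. End-to-end, as a
 * sketch: -e <testname> installs an entry via set_expression(),
 * process_file() then popen()s the pmcstat command or reads a captured
 * file, builds the per-CPU counter lists, and invokes the evaluation
 * function once per sample and once for the totals, pos == -1.)
 */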
2 }, /*2*/ { "eff2", "thresh > 1.0", "pmcstat -s INST_RETIRED.ANY_P -s CPU_CLK_UNHALTED.THREAD_P -w 1", efficiency2, 2 }, /*3*/ { "itlbmiss", "thresh > .05", "pmcstat -s ITLB_MISSES.WALK_DURATION -s CPU_CLK_UNHALTED.THREAD_P -w 1", itlb_miss, 2 }, /*4*/ { "icachemiss", "thresh > .05", "pmcstat -s ICACHE.IFETCH_STALL -s ITLB_MISSES.WALK_DURATION -s CPU_CLK_UNHALTED.THREAD_P -w 1", icache_miss, 3 }, /*5*/ { "lcpstall", "thresh > .05", "pmcstat -s ILD_STALL.LCP -s CPU_CLK_UNHALTED.THREAD_P -w 1", lcp_stall, 2 }, /*6*/ { "cache1", "thresh >= .2", "pmcstat -s MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM -s CPU_CLK_UNHALTED.THREAD_P -w 1", cache1ib, 2 }, /*7*/ { "cache2", "thresh >= .2", "pmcstat -s MEM_LOAD_UOPS_RETIRED.LLC_HIT -s CPU_CLK_UNHALTED.THREAD_P -w 1", cache2ib, 2 }, /*8*/ { "contested", "thresh >= .05", "pmcstat -s MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM -s CPU_CLK_UNHALTED.THREAD_P -w 1", contested, 2 }, /*9*/ { "datashare", "thresh >= .05", "pmcstat -s MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT -s CPU_CLK_UNHALTED.THREAD_P -w 1", datasharing, 2 }, /*10*/ { "blockstorefwd", "thresh >= .05", "pmcstat -s LD_BLOCKS_STORE_FORWARD -s CPU_CLK_UNHALTED.THREAD_P -w 1", blockstoreforward, 2 }, /*11*/ { "splitload", "thresh >= .1", "pmcstat -s CPU_CLK_UNHALTED.THREAD_P -s L1D_PEND_MISS.PENDING -s MEM_LOAD_UOPS_RETIRED.L1_MISS -s LD_BLOCKS.NO_SR -w 1", splitloadib, 4 }, /*12*/ { "splitstore", "thresh >= .01", "pmcstat -s MEM_UOPS_RETIRED.SPLIT_STORES -s MEM_UOPS_RETIRED.ALL_STORES -w 1", splitstore, 2 }, /*13*/ { "aliasing_4k", "thresh >= .1", "pmcstat -s LD_BLOCKS_PARTIAL.ADDRESS_ALIAS -s CPU_CLK_UNHALTED.THREAD_P -w 1", aliasing, 2 }, /*14*/ { "dtlbmissload", "thresh >= .1", "pmcstat -s DTLB_LOAD_MISSES.STLB_HIT -s DTLB_LOAD_MISSES.WALK_DURATION -s CPU_CLK_UNHALTED.THREAD_P -w 1", dtlb_missload, 3 }, /*15*/ { "dtlbmissstore", "thresh >= .05", "pmcstat -s DTLB_STORE_MISSES.STLB_HIT -s DTLB_STORE_MISSES.WALK_DURATION -s CPU_CLK_UNHALTED.THREAD_P -w 1", dtlb_missstore, 3 }, /*16*/ { "br_miss", "thresh >= .2", "pmcstat -s CPU_CLK_UNHALTED.THREAD_P -s BR_MISP_RETIRED.ALL_BRANCHES -s MACHINE_CLEARS.MEMORY_ORDERING -s MACHINE_CLEARS.SMC -s MACHINE_CLEARS.MASKMOV -s UOPS_ISSUED.ANY -s UOPS_RETIRED.RETIRE_SLOTS -s INT_MISC.RECOVERY_CYCLES -w 1", br_mispredictib, 8 }, /*17*/ { "clears", "thresh >= .02", "pmcstat -s MACHINE_CLEARS.MEMORY_ORDERING -s MACHINE_CLEARS.SMC -s MACHINE_CLEARS.MASKMOV -s CPU_CLK_UNHALTED.THREAD_P -w 1", clears, 4 }, /*18*/ { "microassist", "thresh >= .05", "pmcstat -s IDQ.MS_UOPS,cmask=1 -s CPU_CLK_UNHALTED.THREAD_P -w 1", microassist, 2 }, /*19*/ { "fpassist", "look for an excessive value", "pmcstat -s FP_ASSIST.ANY -s INST_RETIRED.ANY_P -w 1", fpassists, 2 }, /*20*/ { "otherassistavx", "look for an excessive value", "pmcstat -s OTHER_ASSISTS.AVX_TO_SSE -s CPU_CLK_UNHALTED.THREAD_P -w 1", otherassistavx, 2 }, /*21*/ { "otherassistsse", "look for an excessive value", "pmcstat -s OTHER_ASSISTS.SSE_TO_AVX -s CPU_CLK_UNHALTED.THREAD_P -w 1", otherassistsse, 2 }, }; #define HASWELL_COUNT 20 static struct cpu_entry haswell[HASWELL_COUNT] = { /*1*/ { "eff1", "thresh < .75", "pmcstat -s UOPS_RETIRED.RETIRE_SLOTS -s CPU_CLK_UNHALTED.THREAD_P -w 1", efficiency1, 2 }, /*2*/ { "eff2", "thresh > 1.0", "pmcstat -s INST_RETIRED.ANY_P -s CPU_CLK_UNHALTED.THREAD_P -w 1", efficiency2, 2 }, /*3*/ { "itlbmiss", "thresh > .05", "pmcstat -s ITLB_MISSES.WALK_DURATION -s CPU_CLK_UNHALTED.THREAD_P -w 1", itlb_miss, 2 }, /*4*/ { "icachemiss", "thresh > .05", "pmcstat -s ICACHE.MISSES -s 
CPU_CLK_UNHALTED.THREAD_P -w 1", icache_miss_has, 2 }, /*5*/ { "lcpstall", "thresh > .05", "pmcstat -s ILD_STALL.LCP -s CPU_CLK_UNHALTED.THREAD_P -w 1", lcp_stall, 2 }, /*6*/ { "cache1", "thresh >= .2", "pmcstat -s MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM -s CPU_CLK_UNHALTED.THREAD_P -w 1", cache1ib, 2 }, /*7*/ { "cache2", "thresh >= .2", "pmcstat -s MEM_LOAD_UOPS_RETIRED.LLC_HIT -s MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT -s MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM -s CPU_CLK_UNHALTED.THREAD_P -w 1", cache2has, 4 }, /*8*/ { "contested", "thresh >= .05", "pmcstat -s MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM -s CPU_CLK_UNHALTED.THREAD_P -w 1", contested_has, 2 }, /*9*/ { "datashare", "thresh >= .05", "pmcstat -s MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT -s CPU_CLK_UNHALTED.THREAD_P -w 1", datasharing_has, 2 }, /*10*/ { "blockstorefwd", "thresh >= .05", "pmcstat -s LD_BLOCKS_STORE_FORWARD -s CPU_CLK_UNHALTED.THREAD_P -w 1", blockstoreforward, 2 }, /*11*/ { "splitload", "thresh >= .1", "pmcstat -s CPU_CLK_UNHALTED.THREAD_P -s MEM_UOPS_RETIRED.SPLIT_LOADS -w 1", splitload, 2 }, /*12*/ { "splitstore", "thresh >= .01", "pmcstat -s MEM_UOPS_RETIRED.SPLIT_STORES -s MEM_UOPS_RETIRED.ALL_STORES -w 1", splitstore, 2 }, /*13*/ { "aliasing_4k", "thresh >= .1", "pmcstat -s LD_BLOCKS_PARTIAL.ADDRESS_ALIAS -s CPU_CLK_UNHALTED.THREAD_P -w 1", aliasing, 2 }, /*14*/ { "dtlbmissload", "thresh >= .1", "pmcstat -s DTLB_LOAD_MISSES.STLB_HIT -s DTLB_LOAD_MISSES.WALK_DURATION -s CPU_CLK_UNHALTED.THREAD_P -w 1", dtlb_missload, 3 }, /*15*/ { "br_miss", "thresh >= .2", "pmcstat -s CPU_CLK_UNHALTED.THREAD_P -s BR_MISP_RETIRED.ALL_BRANCHES -w 1", br_mispredict, 2 }, /*16*/ { "clears", "thresh >= .02", "pmcstat -s MACHINE_CLEARS.MEMORY_ORDERING -s MACHINE_CLEARS.SMC -s MACHINE_CLEARS.MASKMOV -s CPU_CLK_UNHALTED.THREAD_P -w 1", clears, 4 }, /*17*/ { "microassist", "thresh >= .05", "pmcstat -s IDQ.MS_UOPS,cmask=1 -s CPU_CLK_UNHALTED.THREAD_P -w 1", microassist, 2 }, /*18*/ { "fpassist", "look for an excessive value", "pmcstat -s FP_ASSIST.ANY -s INST_RETIRED.ANY_P -w 1", fpassists, 2 }, /*19*/ { "otherassistavx", "look for an excessive value", "pmcstat -s OTHER_ASSISTS.AVX_TO_SSE -s CPU_CLK_UNHALTED.THREAD_P -w 1", otherassistavx, 2 }, /*20*/ { "otherassistsse", "look for an excessive value", "pmcstat -s OTHER_ASSISTS.SSE_TO_AVX -s CPU_CLK_UNHALTED.THREAD_P -w 1", otherassistsse, 2 }, }; static void explain_name_broad(const char *name) { const char *mythresh; if (strcmp(name, "eff1") == 0) { printf("Examine (UOPS_RETIRED.RETIRE_SLOTS)/(4 *CPU_CLK_UNHALTED.THREAD_P)\n"); mythresh = "thresh < .75"; } else if (strcmp(name, "eff2") == 0) { printf("Examine CPU_CLK_UNHALTED.THREAD_P/INST_RETIRED.ANY_P\n"); mythresh = "thresh > 1.0"; } else if (strcmp(name, "itlbmiss") == 0) { printf("Examine (7 * ITLB_MISSES.STLB_HIT_4K + ITLB_MISSES.WALK_DURATION)/ CPU_CLK_UNHALTED.THREAD_P\n"); mythresh = "thresh > .05"; } else if (strcmp(name, "icachemiss") == 0) { printf("Examine ( 36.0 * ICACHE.MISSES)/ CPU_CLK_UNHALTED.THREAD_P ??? may not be right \n"); mythresh = "thresh > .05";
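/*
 * The canned command strings in these tables run pmcstat in
 * system-wide counting mode ("-s EVENT ... -w 1", one line of counts
 * per second). The column headers pmcstat emits have the form
 * "s/NN/EVENT-NAME", which is what process_header() later parses to
 * bind each column to a CPU number and a counter name.
 */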
} else if (strcmp(name, "lcpstall") == 0) { printf("Examine ILD_STALL.LCP/CPU_CLK_UNHALTED.THREAD_P\n"); mythresh = "thresh > .05"; } else if (strcmp(name, "cache1") == 0) { printf("Examine (MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM * 180) / CPU_CLK_UNHALTED.THREAD_P\n"); mythresh = "thresh >= .1"; } else if (strcmp(name, "cache2") == 0) { printf("Examine (36.0 * MEM_LOAD_UOPS_RETIRED.L3_HIT / CPU_CLK_UNHALTED.THREAD_P)\n"); mythresh = "thresh >= .2"; } else if (strcmp(name, "contested") == 0) { printf("Examine ((MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM * 84) + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS)/ CPU_CLK_UNHALTED.THREAD_P\n"); mythresh = "thresh >= .05"; } else if (strcmp(name, "datashare") == 0) { printf("Examine (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT * 72)/CPU_CLK_UNHALTED.THREAD_P\n"); mythresh = "thresh > .05"; } else if (strcmp(name, "blockstorefwd") == 0) { printf("Examine (LD_BLOCKS_STORE_FORWARD * 13) / CPU_CLK_UNHALTED.THREAD_P\n"); mythresh = "thresh >= .05"; } else if (strcmp(name, "aliasing_4k") == 0) { printf("Examine (LD_BLOCKS_PARTIAL.ADDRESS_ALIAS * 7) / CPU_CLK_UNHALTED.THREAD_P\n"); mythresh = "thresh >= .1"; } else if (strcmp(name, "dtlbmissload") == 0) { printf("Examine (((DTLB_LOAD_MISSES.STLB_HIT * 7) + DTLB_LOAD_MISSES.WALK_DURATION)\n"); printf(" / CPU_CLK_UNHALTED.THREAD_P)\n"); mythresh = "thresh >= .1"; } else if (strcmp(name, "br_miss") == 0) { printf("Examine BR_MISP_RETIRED.ALL_BRANCHES_PS / (BR_MISP_RETIRED.ALL_BRANCHES_PS + MACHINE_CLEARS.COUNT) *\n"); printf(" (UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES) /\n"); printf("(CPU_CLK_UNHALTED.THREAD * 4)\n"); mythresh = "thresh >= .2"; } else if (strcmp(name, "clears") == 0) { printf("Examine ((MACHINE_CLEARS.MEMORY_ORDERING + \n"); printf(" MACHINE_CLEARS.SMC + \n"); printf(" MACHINE_CLEARS.MASKMOV ) * 100 ) / CPU_CLK_UNHALTED.THREAD_P\n"); mythresh = "thresh >= .02"; } else if (strcmp(name, "fpassist") == 0) { printf("Examine FP_ASSIST.ANY/INST_RETIRED.ANY_P\n"); mythresh = "look for an excessive value"; } else if (strcmp(name, "otherassistavx") == 0) { printf("Examine (OTHER_ASSISTS.AVX_TO_SSE * 75)/CPU_CLK_UNHALTED.THREAD_P\n"); mythresh = "look for an excessive value"; } else if (strcmp(name, "microassist") == 0) { printf("Examine (UOPS_RETIRED.RETIRE_SLOTS/UOPS_ISSUED.ANY) * (IDQ.MS_CYCLES / (4 * CPU_CLK_UNHALTED.THREAD_P))\n"); printf("***We use IDQ.MS_UOPS,cmask=1 to get cycles\n"); mythresh = "thresh >= .05"; } else { printf("Unknown name:%s\n", name); mythresh = "unknown entry"; } printf("If the value printed is %s we may have the ability to improve performance\n", mythresh); }
#define BROADWELL_COUNT 17 static struct cpu_entry broadwell[BROADWELL_COUNT] = { /*1*/ { "eff1", "thresh < .75", "pmcstat -s UOPS_RETIRED.RETIRE_SLOTS -s CPU_CLK_UNHALTED.THREAD_P -w 1", efficiency1, 2 }, /*2*/ { "eff2", "thresh > 1.0", "pmcstat -s INST_RETIRED.ANY_P -s CPU_CLK_UNHALTED.THREAD_P -w 1", efficiency2, 2 }, /*3*/ { "itlbmiss", "thresh > .05", "pmcstat -s ITLB_MISSES.WALK_DURATION -s CPU_CLK_UNHALTED.THREAD_P -s ITLB_MISSES.STLB_HIT_4K -w 1", itlb_miss_broad, 3 }, /*4*/ { "icachemiss", "thresh > .05", "pmcstat -s ICACHE.MISSES -s CPU_CLK_UNHALTED.THREAD_P -w 1", icache_miss_has, 2 }, /*5*/ { "lcpstall", "thresh > .05", "pmcstat -s ILD_STALL.LCP -s CPU_CLK_UNHALTED.THREAD_P -w 1", lcp_stall, 2 }, /*6*/ { "cache1", "thresh >= .1", "pmcstat -s MEM_LOAD_UOPS_RETIRED.L3_MISS -s CPU_CLK_UNHALTED.THREAD_P -w 1", cache1broad, 2 }, /*7*/ { "cache2", "thresh >= .2", "pmcstat -s MEM_LOAD_UOPS_RETIRED.L3_HIT -s CPU_CLK_UNHALTED.THREAD_P -w 1", cache2broad, 2 }, /*8*/ { "contested", "thresh >= .05", "pmcstat -s MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM -s CPU_CLK_UNHALTED.THREAD_P -s MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS -w 1", contestedbroad, 2 }, /*9*/ { "datashare", "thresh >= .05", "pmcstat -s MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT -s CPU_CLK_UNHALTED.THREAD_P -w 1", datasharing_has, 2 }, /*10*/ { "blockstorefwd", "thresh >= .05", "pmcstat -s LD_BLOCKS_STORE_FORWARD -s CPU_CLK_UNHALTED.THREAD_P -w 1", blockstoreforward, 2 }, /*11*/ { "aliasing_4k", "thresh >= .1", "pmcstat -s LD_BLOCKS_PARTIAL.ADDRESS_ALIAS -s CPU_CLK_UNHALTED.THREAD_P -w 1", aliasing_broad, 2 }, /*12*/ { "dtlbmissload", "thresh >= .1", "pmcstat -s DTLB_LOAD_MISSES.STLB_HIT_4K -s DTLB_LOAD_MISSES.WALK_DURATION -s CPU_CLK_UNHALTED.THREAD_P -w 1", dtlb_missload, 3 }, /*13*/ { "br_miss", "thresh >= .2", "pmcstat -s CPU_CLK_UNHALTED.THREAD_P -s BR_MISP_RETIRED.ALL_BRANCHES -s MACHINE_CLEARS.CYCLES -s UOPS_ISSUED.ANY -s UOPS_RETIRED.RETIRE_SLOTS -s INT_MISC.RECOVERY_CYCLES -w 1", br_mispredict_broad, 7 }, /*14*/ { "clears", "thresh >= .02", "pmcstat -s MACHINE_CLEARS.CYCLES -s MACHINE_CLEARS.MEMORY_ORDERING -s MACHINE_CLEARS.SMC -s MACHINE_CLEARS.MASKMOV -s CPU_CLK_UNHALTED.THREAD_P -w 1", clears_broad, 5 }, /*15*/ { "fpassist", "look for an excessive value", "pmcstat -s FP_ASSIST.ANY -s INST_RETIRED.ANY_P -w 1", fpassists, 2 }, /*16*/ { "otherassistavx", "look for an excessive value", "pmcstat -s OTHER_ASSISTS.AVX_TO_SSE -s CPU_CLK_UNHALTED.THREAD_P -w 1", otherassistavx, 2 }, /*17*/ { "microassist", "thresh >= .2", "pmcstat -s IDQ.MS_UOPS,cmask=1 -s CPU_CLK_UNHALTED.THREAD_P -s UOPS_ISSUED.ANY -s UOPS_RETIRED.RETIRE_SLOTS -w 1", microassist_broad, 4 }, }; static void set_sandybridge(void) { strcpy(the_cpu.cputype, "SandyBridge PMC"); the_cpu.number = SANDY_BRIDGE_COUNT; the_cpu.ents = sandy_bridge; the_cpu.explain = explain_name_sb; } static void set_ivybridge(void) { strcpy(the_cpu.cputype, "IvyBridge PMC"); the_cpu.number = IVY_BRIDGE_COUNT; the_cpu.ents = ivy_bridge; the_cpu.explain = explain_name_ib; } static void set_haswell(void) { strcpy(the_cpu.cputype, "HASWELL PMC"); the_cpu.number = HASWELL_COUNT; the_cpu.ents = haswell; the_cpu.explain = explain_name_has; } static void set_broadwell(void) { strcpy(the_cpu.cputype, "BROADWELL PMC"); the_cpu.number = BROADWELL_COUNT; the_cpu.ents = broadwell; the_cpu.explain = explain_name_broad; } static int set_expression(const char *name) { int found = 0, i; for(i=0 ; i< the_cpu.number; i++) { if (strcmp(name, the_cpu.ents[i].name) == 0) { found = 1; expression = the_cpu.ents[i].func; command = the_cpu.ents[i].command; threshold = the_cpu.ents[i].thresh; if (the_cpu.ents[i].counters_required > max_pmc_counters) { printf("Test %s requires that the CPU have %d counters and this CPU has only %d\n", the_cpu.ents[i].name, the_cpu.ents[i].counters_required, max_pmc_counters); printf("Sorry, this test cannot be run\n"); if (run_all == 0) { exit(-1); } else { return(-1); } } break; } } if (!found) { printf("For CPU type %s we have no expression:%s\n", the_cpu.cputype, name); exit(-1); } return(0); } static int validate_expression(char *name) { int i, found; found = 0; for(i=0 ; i< the_cpu.number; i++) { if (strcmp(name, the_cpu.ents[i].name) == 0) { found = 1; break; } } if (!found) { return(-1); } return (0); } static void do_expression(struct counters *cpu, int pos) { if (expression == NULL) return; (*expression)(cpu, pos); 
} static void process_header(int idx, char *p) { struct counters *up; int i, len, nlen; /* * Given header element idx, at p in * form 's/NN/nameof' * process the entry to pull out the name and * the CPU number. */ if (strncmp(p, "s/", 2)) { printf("Check -- invalid header no s/ in %s\n", p); return; } up = &cnts[idx]; up->cpu = strtol(&p[2], NULL, 10); len = strlen(p); for (i=2; icounter_name, &p[(i+1)]); } else { strncpy(up->counter_name, &p[(i+1)], (MAX_NLEN-1)); } } } } static void build_counters_from_header(FILE *io) { char buffer[8192], *p; int i, len, cnt; size_t mlen; /* We have a new start, lets * setup our headers and cpus. */ if (fgets(buffer, sizeof(buffer), io) == NULL) { printf("First line can't be read from file err:%d\n", errno); return; } /* * Ok output is an array of counters. Once * we start to read the values in we must * put them in there slot to match there CPU and * counter being updated. We create a mass array * of the counters, filling in the CPU and * counter name. */ /* How many do we get? */ len = strlen(buffer); for (i=0, cnt=0; inext_cpu) { /* Already laced in */ continue; } lace_cpu = cpat->cpu; if (lace_cpu >= MAX_CPU) { printf("CPU %d to big\n", lace_cpu); continue; } if (glob_cpu[lace_cpu] == NULL) { glob_cpu[lace_cpu] = cpat; } else { /* Already processed this cpu */ continue; } /* Ok look forward for cpu->cpu and link in */ for(j=(i+1); jnext_cpu) { continue; } if (at->cpu == lace_cpu) { /* Found one */ cpat->next_cpu = at; cpat = at; } } } } static void process_file(char *filename) { FILE *io; int i; int line_at, not_done; pid_t pid_of_command=0; if (filename == NULL) { io = my_popen(command, "r", &pid_of_command); } else { io = fopen(filename, "r"); if (io == NULL) { printf("Can't process file %s err:%d\n", filename, errno); return; } } build_counters_from_header(io); if (cnts == NULL) { /* Nothing we can do */ printf("Nothing to do -- no counters built\n"); if (io) { fclose(io); } return; } lace_cpus_together(); print_header(); if (verbose) { for (i=0; i= max_to_collect) { not_done = 0; } if (filename == NULL) { int cnt; /* For the ones we dynamically open we print now */ for(i=0, cnt=0; i> 12) | ((eax & 0xF0) >> 4)); printf("CPU model is 0x%x id:0x%lx\n", model, eax); switch (eax & 0xF00) { case 0x500: /* Pentium family processors */ printf("Intel Pentium P5\n"); goto not_supported; break; case 0x600: /* Pentium Pro, Celeron, Pentium II & III */ switch (model) { case 0x1: printf("Intel Pentium P6\n"); goto not_supported; break; case 0x3: case 0x5: printf("Intel PII\n"); goto not_supported; break; case 0x6: case 0x16: printf("Intel CL\n"); goto not_supported; break; case 0x7: case 0x8: case 0xA: case 0xB: printf("Intel PIII\n"); goto not_supported; break; case 0x9: case 0xD: printf("Intel PM\n"); goto not_supported; break; case 0xE: printf("Intel CORE\n"); goto not_supported; break; case 0xF: printf("Intel CORE2\n"); goto not_supported; break; case 0x17: printf("Intel CORE2EXTREME\n"); goto not_supported; break; case 0x1C: /* Per Intel document 320047-002. */ printf("Intel ATOM\n"); goto not_supported; break; case 0x1A: case 0x1E: /* * Per Intel document 253669-032 9/2009, * pages A-2 and A-57 */ case 0x1F: /* * Per Intel document 253669-032 9/2009, * pages A-2 and A-57 */ printf("Intel COREI7\n"); goto not_supported; break; case 0x2E: printf("Intel NEHALEM\n"); goto not_supported; break; case 0x25: /* Per Intel document 253669-033US 12/2009. */ case 0x2C: /* Per Intel document 253669-033US 12/2009. 
*/ printf("Intel WESTMERE\n"); goto not_supported; break; case 0x2F: /* Westmere-EX, seen in wild */ printf("Intel WESTMERE\n"); goto not_supported; break; case 0x2A: /* Per Intel document 253669-039US 05/2011. */ printf("Intel SANDYBRIDGE\n"); set_sandybridge(); break; case 0x2D: /* Per Intel document 253669-044US 08/2012. */ printf("Intel SANDYBRIDGE_XEON\n"); set_sandybridge(); break; case 0x3A: /* Per Intel document 253669-043US 05/2012. */ printf("Intel IVYBRIDGE\n"); set_ivybridge(); break; case 0x3E: /* Per Intel document 325462-045US 01/2013. */ printf("Intel IVYBRIDGE_XEON\n"); set_ivybridge(); break; case 0x3F: /* Per Intel document 325462-045US 09/2014. */ printf("Intel HASWELL (Xeon)\n"); set_haswell(); break; case 0x3C: /* Per Intel document 325462-045US 01/2013. */ case 0x45: case 0x46: printf("Intel HASWELL\n"); set_haswell(); break; case 0x4e: case 0x5e: printf("Intel SKY-LAKE\n"); goto not_supported; break; case 0x3D: case 0x47: printf("Intel BROADWELL\n"); set_broadwell(); break; case 0x4f: case 0x56: printf("Intel BROADWEL (Xeon)\n"); set_broadwell(); break; case 0x4D: /* Per Intel document 330061-001 01/2014. */ printf("Intel ATOM_SILVERMONT\n"); goto not_supported; break; default: printf("Intel model 0x%x is not known -- sorry\n", model); goto not_supported; break; } break; case 0xF00: /* P4 */ printf("Intel unknown model %d\n", model); goto not_supported; break; } do_cpuid(0xa, 0, reg); max_pmc_counters = (reg[3] & 0x0000000f) + 1; printf("We have %d PMC counters to work with\n", max_pmc_counters); /* Ok lets load the list of all known PMC's */ io = my_popen("/usr/sbin/pmccontrol -L", "r", &pid_of_command); if (valid_pmcs == NULL) { /* Likely */ pmc_allocated_cnt = PMC_INITIAL_ALLOC; sz = sizeof(char *) * pmc_allocated_cnt; valid_pmcs = malloc(sz); if (valid_pmcs == NULL) { printf("No memory allocation fails at startup?\n"); exit(-1); } memset(valid_pmcs, 0, sz); } while (fgets(linebuf, sizeof(linebuf), io) != NULL) { if (linebuf[0] != '\t') { /* sometimes headers ;-) */ continue; } len = strlen(linebuf); if (linebuf[(len-1)] == '\n') { /* Likely */ linebuf[(len-1)] = 0; } str = &linebuf[1]; len = strlen(str) + 1; valid_pmcs[valid_pmc_cnt] = malloc(len); if (valid_pmcs[valid_pmc_cnt] == NULL) { printf("No memory2 allocation fails at startup?\n"); exit(-1); } memset(valid_pmcs[valid_pmc_cnt], 0, len); strcpy(valid_pmcs[valid_pmc_cnt], str); valid_pmc_cnt++; if (valid_pmc_cnt >= pmc_allocated_cnt) { /* Got to expand -- unlikely */ char **more; sz = sizeof(char *) * (pmc_allocated_cnt * 2); more = malloc(sz); if (more == NULL) { printf("No memory3 allocation fails at startup?\n"); exit(-1); } memset(more, 0, sz); memcpy(more, valid_pmcs, sz); pmc_allocated_cnt *= 2; free(valid_pmcs); valid_pmcs = more; } } my_pclose(io, pid_of_command); return; not_supported: printf("Not supported\n"); exit(-1); } static void explain_all(void) { int i; printf("For CPU's of type %s the following expressions are available:\n",the_cpu.cputype); printf("-------------------------------------------------------------\n"); for(i=0; itype == TYPE_VALUE_PMC) { cnt_pmc++; } at = at->next; } if (cnt_pmc == 0) { printf("No PMC's in your expression -- nothing to do!!\n"); exit(0); } mal = cnt_pmc * sizeof(char *); vars = malloc(mal); if (vars == NULL) { printf("No memory\n"); exit(-1); } memset(vars, 0, mal); at = exp; while (at) { if (at->type == TYPE_VALUE_PMC) { if(add_it_to(vars, alloced_pmcs, at->name)) { alloced_pmcs++; } } at = at->next; } /* Now we have a unique list in vars so create our 
command */ mal = 23; /* "/usr/sbin/pmcstat -w 1" + \0 */ for(i=0; itype == TYPE_VALUE_PMC) { var = find_counter(cpu, at->name); if (var == NULL) { printf("%s:Can't find counter %s?\n", __FUNCTION__, at->name); exit(-1); } if (pos != -1) { at->value = var->vals[pos] * 1.0; } else { at->value = var->sum * 1.0; } } at = at->next; } res = run_expr(master_exp, 1, NULL); ret = printf("%1.3f", res); return(ret); } static void set_manual_exp(struct expression *exp) { expression = user_expr; command = build_command_for_exp(exp); threshold = "User defined threshold"; } static void run_tests(void) { int i, lenout; printf("Running tests on %d PMC's this may take some time\n", valid_pmc_cnt); printf("------------------------------------------------------------------------\n"); for(i=0; i MAX_COUNTER_SLOTS) { /* You can't collect more than max in array */ max_to_collect = MAX_COUNTER_SLOTS; } break; case 'v': verbose++; break; case 'h': help_only = 1; break; case 'i': filename = optarg; break; case '?': default: use: printf("Use %s [ -i inputfile -v -m max_to_collect -e expr -E -h -? -H]\n", argv[0]); printf("-i inputfile -- use source as inputfile not stdin (if stdin collect)\n"); printf("-v -- verbose dump debug type things -- you don't want this\n"); printf("-m N -- maximum to collect is N measurments\n"); printf("-e expr-name -- Do expression expr-name\n"); printf("-E 'your expression' -- Do your expression\n"); printf("-h -- Don't do the expression I put in -e xxx just explain what it does and exit\n"); printf("-H -- Don't run anything, just explain all canned expressions\n"); printf("-T -- Test all PMC's defined by this processor\n"); printf("-A -- Run all canned tests\n"); return(0); break; } } if ((run_all == 0) && (name == NULL) && (filename == NULL) && (test_mode == 0) && (master_exp == NULL)) { printf("Without setting an expression we cannot dynamically gather information\n"); printf("you must supply a filename (and you probably want verbosity)\n"); goto use; } if (run_all && max_to_collect > 10) { max_to_collect = 3; } if (test_mode) { run_tests(); return(0); } printf("*********************************\n"); if ((master_exp == NULL) && name) { (*the_cpu.explain)(name); } else if (master_exp) { printf("Examine your expression "); print_exp(master_exp); printf("User defined threshold\n"); } if (help_only) { return(0); } if (run_all) { more: name = the_cpu.ents[test_at].name; printf("***Test %s (threshold %s)****\n", name, the_cpu.ents[test_at].thresh); test_at++; if (set_expression(name) == -1) { if (test_at >= the_cpu.number) { goto done; } else goto more; } } process_file(filename); if (verbose >= 2) { for (i=0; i 1) { for(i=0, cnt=0; i