Index: head/sys/amd64/conf/GENERIC =================================================================== --- head/sys/amd64/conf/GENERIC (revision 308154) +++ head/sys/amd64/conf/GENERIC (revision 308155) @@ -1,369 +1,371 @@ # # GENERIC -- Generic kernel configuration file for FreeBSD/amd64 # # For more information on this file, please read the config(5) manual page, # and/or the handbook section on Kernel Configuration Files: # # http://www.FreeBSD.org/doc/en_US.ISO8859-1/books/handbook/kernelconfig-config.html # # The handbook is also available locally in /usr/share/doc/handbook # if you've installed the doc distribution, otherwise always see the # FreeBSD World Wide Web server (http://www.FreeBSD.org/) for the # latest information. # # An exhaustive list of options and more detailed explanations of the # device lines is also present in the ../../conf/NOTES and NOTES files. # If you are in doubt as to the purpose or necessity of a line, check first # in NOTES. # # $FreeBSD$ cpu HAMMER ident GENERIC makeoptions DEBUG=-g # Build kernel with gdb(1) debug symbols makeoptions WITH_CTF=1 # Run ctfconvert(1) for DTrace support options SCHED_ULE # ULE scheduler options PREEMPTION # Enable kernel thread preemption options INET # InterNETworking options INET6 # IPv6 communications protocols options IPSEC # IP (v4/v6) security options TCP_OFFLOAD # TCP offload options TCP_HHOOK # hhook(9) framework for TCP options SCTP # Stream Control Transmission Protocol options FFS # Berkeley Fast Filesystem options SOFTUPDATES # Enable FFS soft updates support options UFS_ACL # Support for access control lists options UFS_DIRHASH # Improve performance on big directories options UFS_GJOURNAL # Enable gjournal-based UFS journaling options QUOTA # Enable disk quotas for UFS options MD_ROOT # MD is a potential root device options NFSCL # Network Filesystem Client options NFSD # Network Filesystem Server options NFSLOCKD # Network Lock Manager options NFS_ROOT # NFS usable as /, requires 
NFSCL options MSDOSFS # MSDOS Filesystem options CD9660 # ISO 9660 Filesystem options PROCFS # Process filesystem (requires PSEUDOFS) options PSEUDOFS # Pseudo-filesystem framework options GEOM_PART_GPT # GUID Partition Tables. options GEOM_RAID # Soft RAID functionality. options GEOM_LABEL # Provides labelization options COMPAT_FREEBSD32 # Compatible with i386 binaries options COMPAT_FREEBSD4 # Compatible with FreeBSD4 options COMPAT_FREEBSD5 # Compatible with FreeBSD5 options COMPAT_FREEBSD6 # Compatible with FreeBSD6 options COMPAT_FREEBSD7 # Compatible with FreeBSD7 options COMPAT_FREEBSD9 # Compatible with FreeBSD9 options COMPAT_FREEBSD10 # Compatible with FreeBSD10 options SCSI_DELAY=5000 # Delay (in ms) before probing SCSI options KTRACE # ktrace(1) support options STACK # stack(9) support options SYSVSHM # SYSV-style shared memory options SYSVMSG # SYSV-style message queues options SYSVSEM # SYSV-style semaphores options _KPOSIX_PRIORITY_SCHEDULING # POSIX P1003_1B real-time extensions options PRINTF_BUFR_SIZE=128 # Prevent printf output being interspersed. options KBD_INSTALL_CDEV # install a CDEV entry in /dev options HWPMC_HOOKS # Necessary kernel hooks for hwpmc(4) options AUDIT # Security event auditing options CAPABILITY_MODE # Capsicum capability mode options CAPABILITIES # Capsicum capabilities options MAC # TrustedBSD MAC Framework options KDTRACE_FRAME # Ensure frames are compiled in options KDTRACE_HOOKS # Kernel DTrace hooks options DDB_CTF # Kernel ELF linker loads CTF data options INCLUDE_CONFIG_FILE # Include this file in kernel options RACCT # Resource accounting framework options RACCT_DEFAULT_TO_DISABLED # Set kern.racct.enable=0 by default options RCTL # Resource limits # Debugging support. Always need this: options KDB # Enable kernel debugger support. options KDB_TRACE # Print a stack trace for a panic. # For full debugger support use (turn off in stable branch): +options BUF_TRACKING # Track buffer history options DDB # Support DDB. 
+options FULL_BUF_TRACKING # Track more buffer history options GDB # Support remote GDB. options DEADLKRES # Enable the deadlock resolver options INVARIANTS # Enable calls of extra sanity checking options INVARIANT_SUPPORT # Extra sanity checks of internal structures, required by INVARIANTS options WITNESS # Enable checks to detect deadlocks and cycles options WITNESS_SKIPSPIN # Don't run witness on spinlocks for speed options MALLOC_DEBUG_MAXZONES=8 # Separate malloc(9) zones # Make an SMP-capable kernel by default options SMP # Symmetric MultiProcessor Kernel options DEVICE_NUMA # I/O Device Affinity # CPU frequency control device cpufreq # Bus support. device acpi options ACPI_DMAR device pci options PCI_HP # PCI-Express native HotPlug options PCI_IOV # PCI SR-IOV support # Floppy drives device fdc # ATA controllers device ahci # AHCI-compatible SATA controllers device ata # Legacy ATA/SATA controllers device mvs # Marvell 88SX50XX/88SX60XX/88SX70XX/SoC SATA device siis # SiliconImage SiI3124/SiI3132/SiI3531 SATA # SCSI Controllers device ahc # AHA2940 and onboard AIC7xxx devices options AHC_REG_PRETTY_PRINT # Print register bitfields in debug # output. Adds ~128k to driver. device ahd # AHA39320/29320 and onboard AIC79xx devices options AHD_REG_PRETTY_PRINT # Print register bitfields in debug # output. Adds ~215k to driver. device esp # AMD Am53C974 (Tekram DC-390(T)) device hptiop # Highpoint RocketRaid 3xxx series device isp # Qlogic family #device ispfw # Firmware for QLogic HBAs- normally a module device mpt # LSI-Logic MPT-Fusion device mps # LSI-Logic MPT-Fusion 2 device mpr # LSI-Logic MPT-Fusion 3 #device ncr # NCR/Symbios Logic device sym # NCR/Symbios Logic (newer chipsets + those of `ncr') device trm # Tekram DC395U/UW/F DC315U adapters device adv # Advansys SCSI adapters device adw # Advansys wide SCSI adapters device aic # Adaptec 15[012]x SCSI adapters, AIC-6[23]60. 
device bt # Buslogic/Mylex MultiMaster SCSI adapters device isci # Intel C600 SAS controller # ATA/SCSI peripherals device scbus # SCSI bus (required for ATA/SCSI) device ch # SCSI media changers device da # Direct Access (disks) device sa # Sequential Access (tape etc) device cd # CD device pass # Passthrough device (direct ATA/SCSI access) device ses # Enclosure Services (SES and SAF-TE) #device ctl # CAM Target Layer # RAID controllers interfaced to the SCSI subsystem device amr # AMI MegaRAID device arcmsr # Areca SATA II RAID device ciss # Compaq Smart RAID 5* device dpt # DPT Smartcache III, IV - See NOTES for options device hptmv # Highpoint RocketRAID 182x device hptnr # Highpoint DC7280, R750 device hptrr # Highpoint RocketRAID 17xx, 22xx, 23xx, 25xx device hpt27xx # Highpoint RocketRAID 27xx device iir # Intel Integrated RAID device ips # IBM (Adaptec) ServeRAID device mly # Mylex AcceleRAID/eXtremeRAID device twa # 3ware 9000 series PATA/SATA RAID device tws # LSI 3ware 9750 SATA+SAS 6Gb/s RAID controller # RAID controllers device aac # Adaptec FSA RAID device aacp # SCSI passthrough for aac (requires CAM) device aacraid # Adaptec by PMC RAID device ida # Compaq Smart RAID device mfi # LSI MegaRAID SAS device mlx # Mylex DAC960 family device mrsas # LSI/Avago MegaRAID SAS/SATA, 6Gb/s and 12Gb/s device pmspcv # PMC-Sierra SAS/SATA Controller driver #XXX pointer/int warnings #device pst # Promise Supertrak SX6000 device twe # 3ware ATA RAID # NVM Express (NVMe) support device nvme # base NVMe driver device nvd # expose NVMe namespaces as disks, depends on nvme # atkbdc0 controls both the keyboard and the PS/2 mouse device atkbdc # AT keyboard controller device atkbd # AT keyboard device psm # PS/2 mouse device kbdmux # keyboard multiplexer device vga # VGA video card driver options VESA # Add support for VESA BIOS Extensions (VBE) device splash # Splash screen and screen saver support # syscons is the default console driver, resembling an SCO console 
device sc options SC_PIXEL_MODE # add support for the raster text mode # vt is the new video console driver device vt device vt_vga device vt_efifb device agp # support several AGP chipsets # PCCARD (PCMCIA) support # PCMCIA and cardbus bridge support device cbb # cardbus (yenta) bridge device pccard # PC Card (16-bit) bus device cardbus # CardBus (32-bit) bus # Serial (COM) ports device uart # Generic UART driver # Parallel port device ppc device ppbus # Parallel port bus (required) device lpt # Printer device ppi # Parallel port interface device #device vpo # Requires scbus and da device puc # Multi I/O cards and multi-channel UARTs # PCI Ethernet NICs. device bxe # Broadcom NetXtreme II BCM5771X/BCM578XX 10GbE device de # DEC/Intel DC21x4x (``Tulip'') device em # Intel PRO/1000 Gigabit Ethernet Family device igb # Intel PRO/1000 PCIE Server Gigabit Family device ix # Intel PRO/10GbE PCIE PF Ethernet device ixv # Intel PRO/10GbE PCIE VF Ethernet device ixl # Intel XL710 40Gbe PCIE Ethernet device ixlv # Intel XL710 40Gbe VF PCIE Ethernet device le # AMD Am7900 LANCE and Am79C9xx PCnet device ti # Alteon Networks Tigon I/II gigabit Ethernet device txp # 3Com 3cR990 (``Typhoon'') device vx # 3Com 3c590, 3c595 (``Vortex'') # PCI Ethernet NICs that use the common MII bus controller code. # NOTE: Be sure to keep the 'device miibus' line in order to use these NICs! 
device miibus # MII bus support device ae # Attansic/Atheros L2 FastEthernet device age # Attansic/Atheros L1 Gigabit Ethernet device alc # Atheros AR8131/AR8132 Ethernet device ale # Atheros AR8121/AR8113/AR8114 Ethernet device bce # Broadcom BCM5706/BCM5708 Gigabit Ethernet device bfe # Broadcom BCM440x 10/100 Ethernet device bge # Broadcom BCM570xx Gigabit Ethernet device cas # Sun Cassini/Cassini+ and NS DP83065 Saturn device dc # DEC/Intel 21143 and various workalikes device et # Agere ET1310 10/100/Gigabit Ethernet device fxp # Intel EtherExpress PRO/100B (82557, 82558) device gem # Sun GEM/Sun ERI/Apple GMAC device hme # Sun HME (Happy Meal Ethernet) device jme # JMicron JMC250 Gigabit/JMC260 Fast Ethernet device lge # Level 1 LXT1001 gigabit Ethernet device msk # Marvell/SysKonnect Yukon II Gigabit Ethernet device nfe # nVidia nForce MCP on-board Ethernet device nge # NatSemi DP83820 gigabit Ethernet device pcn # AMD Am79C97x PCI 10/100 (precedence over 'le') device re # RealTek 8139C+/8169/8169S/8110S device rl # RealTek 8129/8139 device sf # Adaptec AIC-6915 (``Starfire'') device sge # Silicon Integrated Systems SiS190/191 device sis # Silicon Integrated Systems SiS 900/SiS 7016 device sk # SysKonnect SK-984x & SK-982x gigabit Ethernet device ste # Sundance ST201 (D-Link DFE-550TX) device stge # Sundance/Tamarack TC9021 gigabit Ethernet device tl # Texas Instruments ThunderLAN device tx # SMC EtherPower II (83c170 ``EPIC'') device vge # VIA VT612x gigabit Ethernet device vr # VIA Rhine, Rhine II device wb # Winbond W89C840F device xl # 3Com 3c90x (``Boomerang'', ``Cyclone'') # Wireless NIC cards device wlan # 802.11 support options IEEE80211_DEBUG # enable debug msgs options IEEE80211_AMPDU_AGE # age frames in AMPDU reorder q's options IEEE80211_SUPPORT_MESH # enable 802.11s draft support device wlan_wep # 802.11 WEP support device wlan_ccmp # 802.11 CCMP support device wlan_tkip # 802.11 TKIP support device wlan_amrr # AMRR transmit rate control 
algorithm device an # Aironet 4500/4800 802.11 wireless NICs. device ath # Atheros NICs device ath_pci # Atheros pci/cardbus glue device ath_hal # pci/cardbus chip support options AH_SUPPORT_AR5416 # enable AR5416 tx/rx descriptors options AH_AR5416_INTERRUPT_MITIGATION # AR5416 interrupt mitigation options ATH_ENABLE_11N # Enable 802.11n support for AR5416 and later device ath_rate_sample # SampleRate tx rate control for ath #device bwi # Broadcom BCM430x/BCM431x wireless NICs. #device bwn # Broadcom BCM43xx wireless NICs. device ipw # Intel 2100 wireless NICs. device iwi # Intel 2200BG/2225BG/2915ABG wireless NICs. device iwn # Intel 4965/1000/5000/6000 wireless NICs. device malo # Marvell Libertas wireless NICs. device mwl # Marvell 88W8363 802.11n wireless NICs. device ral # Ralink Technology RT2500 wireless NICs. device wi # WaveLAN/Intersil/Symbol 802.11 wireless NICs. device wpi # Intel 3945ABG wireless NICs. # Pseudo devices. device loop # Network loopback device random # Entropy device device padlock_rng # VIA Padlock RNG device rdrand_rng # Intel Bull Mountain RNG device ether # Ethernet support device vlan # 802.1Q VLAN support device tun # Packet tunnel. device md # Memory "disks" device gif # IPv6 and IPv4 tunneling device firmware # firmware assist module # The `bpf' device enables the Berkeley Packet Filter. # Be aware of the administrative consequences of enabling this! # Note that 'bpf' is required for DHCP. 
device bpf # Berkeley packet filter # USB support options USB_DEBUG # enable debug msgs device uhci # UHCI PCI->USB interface device ohci # OHCI PCI->USB interface device ehci # EHCI PCI->USB interface (USB 2.0) device xhci # XHCI PCI->USB interface (USB 3.0) device usb # USB Bus (required) device ukbd # Keyboard device umass # Disks/Mass storage - Requires scbus and da # Sound support device sound # Generic sound driver (required) device snd_cmi # CMedia CMI8338/CMI8738 device snd_csa # Crystal Semiconductor CS461x/428x device snd_emu10kx # Creative SoundBlaster Live! and Audigy device snd_es137x # Ensoniq AudioPCI ES137x device snd_hda # Intel High Definition Audio device snd_ich # Intel, NVidia and other ICH AC'97 Audio device snd_via8233 # VIA VT8233x Audio # MMC/SD device mmc # MMC/SD bus device mmcsd # MMC/SD memory card device sdhci # Generic PCI SD Host Controller # VirtIO support device virtio # Generic VirtIO bus (required) device virtio_pci # VirtIO PCI device device vtnet # VirtIO Ethernet device device virtio_blk # VirtIO Block device device virtio_scsi # VirtIO SCSI device device virtio_balloon # VirtIO Memory Balloon device # HyperV drivers and enhancement support device hyperv # HyperV drivers # Xen HVM Guest Optimizations # NOTE: XENHVM depends on xenpci. They must be added or removed together. 
options XENHVM # Xen HVM kernel infrastructure device xenpci # Xen HVM Hypervisor services driver # VMware support device vmx # VMware VMXNET3 Ethernet # Netmap provides direct access to TX/RX rings on supported NICs device netmap # netmap(4) support # The crypto framework is required by IPSEC device crypto # Required by IPSEC Index: head/sys/amd64/conf/GENERIC-NODEBUG =================================================================== --- head/sys/amd64/conf/GENERIC-NODEBUG (revision 308154) +++ head/sys/amd64/conf/GENERIC-NODEBUG (revision 308155) @@ -1,38 +1,40 @@ # # GENERIC-NODEBUG -- WITNESS and INVARIANTS free kernel configuration file # for FreeBSD/amd64 # # This configuration file removes several debugging options, including # WITNESS and INVARIANTS checking, which are known to have significant # performance impact on running systems. When benchmarking new features # this kernel should be used instead of the standard GENERIC. # This kernel configuration should never appear outside of the HEAD # of the FreeBSD tree. # # For more information on this file, please read the config(5) manual page, # and/or the handbook section on Kernel Configuration Files: # # http://www.FreeBSD.org/doc/en_US.ISO8859-1/books/handbook/kernelconfig-config.html # # The handbook is also available locally in /usr/share/doc/handbook # if you've installed the doc distribution, otherwise always see the # FreeBSD World Wide Web server (http://www.FreeBSD.org/) for the # latest information. # # An exhaustive list of options and more detailed explanations of the # device lines is also present in the ../../conf/NOTES and NOTES files. # If you are in doubt as to the purpose or necessity of a line, check first # in NOTES. 
# # $FreeBSD$ include GENERIC ident GENERIC-NODEBUG nooptions INVARIANTS nooptions INVARIANT_SUPPORT nooptions WITNESS nooptions WITNESS_SKIPSPIN +nooptions BUF_TRACKING nooptions DEADLKRES +nooptions FULL_BUF_TRACKING Index: head/sys/cam/cam_ccb.h =================================================================== --- head/sys/cam/cam_ccb.h (revision 308154) +++ head/sys/cam/cam_ccb.h (revision 308155) @@ -1,1435 +1,1441 @@ /*- * Data structures and definitions for CAM Control Blocks (CCBs). * * Copyright (c) 1997, 1998 Justin T. Gibbs. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification, immediately at the beginning of the file. * 2. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #ifndef _CAM_CAM_CCB_H #define _CAM_CAM_CCB_H 1 #include #include #include #include #ifndef _KERNEL #include #endif #include #include #include #include /* General allocation length definitions for CCB structures */ #define IOCDBLEN CAM_MAX_CDBLEN /* Space for CDB bytes/pointer */ #define VUHBALEN 14 /* Vendor Unique HBA length */ #define SIM_IDLEN 16 /* ASCII string len for SIM ID */ #define HBA_IDLEN 16 /* ASCII string len for HBA ID */ #define DEV_IDLEN 16 /* ASCII string len for device names */ #define CCB_PERIPH_PRIV_SIZE 2 /* size of peripheral private area */ #define CCB_SIM_PRIV_SIZE 2 /* size of sim private area */ /* Struct definitions for CAM control blocks */ /* Common CCB header */ /* CAM CCB flags */ typedef enum { CAM_CDB_POINTER = 0x00000001,/* The CDB field is a pointer */ CAM_QUEUE_ENABLE = 0x00000002,/* SIM queue actions are enabled */ CAM_CDB_LINKED = 0x00000004,/* CCB contains a linked CDB */ CAM_NEGOTIATE = 0x00000008,/* * Perform transport negotiation * with this command. 
*/ CAM_DATA_ISPHYS = 0x00000010,/* Data type with physical addrs */ CAM_DIS_AUTOSENSE = 0x00000020,/* Disable autosense feature */ CAM_DIR_BOTH = 0x00000000,/* Data direction (00:IN/OUT) */ CAM_DIR_IN = 0x00000040,/* Data direction (01:DATA IN) */ CAM_DIR_OUT = 0x00000080,/* Data direction (10:DATA OUT) */ CAM_DIR_NONE = 0x000000C0,/* Data direction (11:no data) */ CAM_DIR_MASK = 0x000000C0,/* Data direction Mask */ CAM_DATA_VADDR = 0x00000000,/* Data type (000:Virtual) */ CAM_DATA_PADDR = 0x00000010,/* Data type (001:Physical) */ CAM_DATA_SG = 0x00040000,/* Data type (010:sglist) */ CAM_DATA_SG_PADDR = 0x00040010,/* Data type (011:sglist phys) */ CAM_DATA_BIO = 0x00200000,/* Data type (100:bio) */ CAM_DATA_MASK = 0x00240010,/* Data type mask */ CAM_SOFT_RST_OP = 0x00000100,/* Use Soft reset alternative */ CAM_ENG_SYNC = 0x00000200,/* Flush resid bytes on complete */ CAM_DEV_QFRZDIS = 0x00000400,/* Disable DEV Q freezing */ CAM_DEV_QFREEZE = 0x00000800,/* Freeze DEV Q on execution */ CAM_HIGH_POWER = 0x00001000,/* Command takes a lot of power */ CAM_SENSE_PTR = 0x00002000,/* Sense data is a pointer */ CAM_SENSE_PHYS = 0x00004000,/* Sense pointer is physical addr*/ CAM_TAG_ACTION_VALID = 0x00008000,/* Use the tag action in this ccb*/ CAM_PASS_ERR_RECOVER = 0x00010000,/* Pass driver does err. 
recovery*/ CAM_DIS_DISCONNECT = 0x00020000,/* Disable disconnect */ CAM_MSG_BUF_PHYS = 0x00080000,/* Message buffer ptr is physical*/ CAM_SNS_BUF_PHYS = 0x00100000,/* Autosense data ptr is physical*/ CAM_CDB_PHYS = 0x00400000,/* CDB pointer is physical */ CAM_ENG_SGLIST = 0x00800000,/* SG list is for the HBA engine */ /* Phase cognizant mode flags */ CAM_DIS_AUTOSRP = 0x01000000,/* Disable autosave/restore ptrs */ CAM_DIS_AUTODISC = 0x02000000,/* Disable auto disconnect */ CAM_TGT_CCB_AVAIL = 0x04000000,/* Target CCB available */ CAM_TGT_PHASE_MODE = 0x08000000,/* The SIM runs in phase mode */ CAM_MSGB_VALID = 0x10000000,/* Message buffer valid */ CAM_STATUS_VALID = 0x20000000,/* Status buffer valid */ CAM_DATAB_VALID = 0x40000000,/* Data buffer valid */ /* Host target Mode flags */ CAM_SEND_SENSE = 0x08000000,/* Send sense data with status */ CAM_TERM_IO = 0x10000000,/* Terminate I/O Message sup. */ CAM_DISCONNECT = 0x20000000,/* Disconnects are mandatory */ CAM_SEND_STATUS = 0x40000000,/* Send status after data phase */ CAM_UNLOCKED = 0x80000000 /* Call callback without lock. 
*/ } ccb_flags; typedef enum { CAM_USER_DATA_ADDR = 0x00000002,/* Userspace data pointers */ CAM_SG_FORMAT_IOVEC = 0x00000004,/* iovec instead of busdma S/G*/ CAM_UNMAPPED_BUF = 0x00000008 /* use unmapped I/O */ } ccb_xflags; /* XPT Opcodes for xpt_action */ typedef enum { /* Function code flags are bits greater than 0xff */ XPT_FC_QUEUED = 0x100, /* Non-immediate function code */ XPT_FC_USER_CCB = 0x200, XPT_FC_XPT_ONLY = 0x400, /* Only for the transport layer device */ XPT_FC_DEV_QUEUED = 0x800 | XPT_FC_QUEUED, /* Passes through the device queues */ /* Common function commands: 0x00->0x0F */ XPT_NOOP = 0x00, /* Execute Nothing */ XPT_SCSI_IO = 0x01 | XPT_FC_DEV_QUEUED, /* Execute the requested I/O operation */ XPT_GDEV_TYPE = 0x02, /* Get type information for specified device */ XPT_GDEVLIST = 0x03, /* Get a list of peripheral devices */ XPT_PATH_INQ = 0x04, /* Path routing inquiry */ XPT_REL_SIMQ = 0x05, /* Release a frozen device queue */ XPT_SASYNC_CB = 0x06, /* Set Asynchronous Callback Parameters */ XPT_SDEV_TYPE = 0x07, /* Set device type information */ XPT_SCAN_BUS = 0x08 | XPT_FC_QUEUED | XPT_FC_USER_CCB | XPT_FC_XPT_ONLY, /* (Re)Scan the SCSI Bus */ XPT_DEV_MATCH = 0x09 | XPT_FC_XPT_ONLY, /* Get EDT entries matching the given pattern */ XPT_DEBUG = 0x0a, /* Turn on debugging for a bus, target or lun */ XPT_PATH_STATS = 0x0b, /* Path statistics (error counts, etc.) */ XPT_GDEV_STATS = 0x0c, /* Device statistics (error counts, etc.) 
*/ XPT_DEV_ADVINFO = 0x0e, /* Get/Set Device advanced information */ XPT_ASYNC = 0x0f | XPT_FC_QUEUED | XPT_FC_USER_CCB | XPT_FC_XPT_ONLY, /* Asynchronous event */ /* SCSI Control Functions: 0x10->0x1F */ XPT_ABORT = 0x10, /* Abort the specified CCB */ XPT_RESET_BUS = 0x11 | XPT_FC_XPT_ONLY, /* Reset the specified SCSI bus */ XPT_RESET_DEV = 0x12 | XPT_FC_DEV_QUEUED, /* Bus Device Reset the specified SCSI device */ XPT_TERM_IO = 0x13, /* Terminate the I/O process */ XPT_SCAN_LUN = 0x14 | XPT_FC_QUEUED | XPT_FC_USER_CCB | XPT_FC_XPT_ONLY, /* Scan Logical Unit */ XPT_GET_TRAN_SETTINGS = 0x15, /* * Get default/user transfer settings * for the target */ XPT_SET_TRAN_SETTINGS = 0x16, /* * Set transfer rate/width * negotiation settings */ XPT_CALC_GEOMETRY = 0x17, /* * Calculate the geometry parameters for * a device given the sector size and * volume size. */ XPT_ATA_IO = 0x18 | XPT_FC_DEV_QUEUED, /* Execute the requested ATA I/O operation */ XPT_GET_SIM_KNOB_OLD = 0x18, /* Compat only */ XPT_SET_SIM_KNOB = 0x19, /* * Set SIM specific knob values. */ XPT_GET_SIM_KNOB = 0x1a, /* * Get SIM specific knob values. 
*/ XPT_SMP_IO = 0x1b | XPT_FC_DEV_QUEUED, /* Serial Management Protocol */ XPT_NVME_IO = 0x1c | XPT_FC_DEV_QUEUED, /* Execute the requested NVMe I/O operation */ XPT_MMCSD_IO = 0x1d | XPT_FC_DEV_QUEUED, /* Placeholder for MMC / SD / SDIO I/O stuff */ XPT_SCAN_TGT = 0x1E | XPT_FC_QUEUED | XPT_FC_USER_CCB | XPT_FC_XPT_ONLY, /* Scan Target */ /* HBA engine commands 0x20->0x2F */ XPT_ENG_INQ = 0x20 | XPT_FC_XPT_ONLY, /* HBA engine feature inquiry */ XPT_ENG_EXEC = 0x21 | XPT_FC_DEV_QUEUED, /* HBA execute engine request */ /* Target mode commands: 0x30->0x3F */ XPT_EN_LUN = 0x30, /* Enable LUN as a target */ XPT_TARGET_IO = 0x31 | XPT_FC_DEV_QUEUED, /* Execute target I/O request */ XPT_ACCEPT_TARGET_IO = 0x32 | XPT_FC_QUEUED | XPT_FC_USER_CCB, /* Accept Host Target Mode CDB */ XPT_CONT_TARGET_IO = 0x33 | XPT_FC_DEV_QUEUED, /* Continue Host Target I/O Connection */ XPT_IMMED_NOTIFY = 0x34 | XPT_FC_QUEUED | XPT_FC_USER_CCB, /* Notify Host Target driver of event (obsolete) */ XPT_NOTIFY_ACK = 0x35, /* Acknowledgement of event (obsolete) */ XPT_IMMEDIATE_NOTIFY = 0x36 | XPT_FC_QUEUED | XPT_FC_USER_CCB, /* Notify Host Target driver of event */ XPT_NOTIFY_ACKNOWLEDGE = 0x37 | XPT_FC_QUEUED | XPT_FC_USER_CCB, /* Acknowledgement of event */ XPT_REPROBE_LUN = 0x38 | XPT_FC_QUEUED | XPT_FC_USER_CCB, /* Query device capacity and notify GEOM */ /* Vendor Unique codes: 0x80->0x8F */ XPT_VUNIQUE = 0x80 } xpt_opcode; #define XPT_FC_GROUP_MASK 0xF0 #define XPT_FC_GROUP(op) ((op) & XPT_FC_GROUP_MASK) #define XPT_FC_GROUP_COMMON 0x00 #define XPT_FC_GROUP_SCSI_CONTROL 0x10 #define XPT_FC_GROUP_HBA_ENGINE 0x20 #define XPT_FC_GROUP_TMODE 0x30 #define XPT_FC_GROUP_VENDOR_UNIQUE 0x80 #define XPT_FC_IS_DEV_QUEUED(ccb) \ (((ccb)->ccb_h.func_code & XPT_FC_DEV_QUEUED) == XPT_FC_DEV_QUEUED) #define XPT_FC_IS_QUEUED(ccb) \ (((ccb)->ccb_h.func_code & XPT_FC_QUEUED) != 0) typedef enum { PROTO_UNKNOWN, PROTO_UNSPECIFIED, PROTO_SCSI, /* Small Computer System Interface */ PROTO_ATA, /* AT Attachment 
*/ PROTO_ATAPI, /* AT Attachment Packetized Interface */ PROTO_SATAPM, /* SATA Port Multiplier */ PROTO_SEMB, /* SATA Enclosure Management Bridge */ PROTO_NVME, /* NVME */ } cam_proto; typedef enum { XPORT_UNKNOWN, XPORT_UNSPECIFIED, XPORT_SPI, /* SCSI Parallel Interface */ XPORT_FC, /* Fiber Channel */ XPORT_SSA, /* Serial Storage Architecture */ XPORT_USB, /* Universal Serial Bus */ XPORT_PPB, /* Parallel Port Bus */ XPORT_ATA, /* AT Attachment */ XPORT_SAS, /* Serial Attached SCSI */ XPORT_SATA, /* Serial AT Attachment */ XPORT_ISCSI, /* iSCSI */ XPORT_SRP, /* SCSI RDMA Protocol */ XPORT_NVME, /* NVMe over PCIe */ } cam_xport; #define XPORT_IS_NVME(t) ((t) == XPORT_NVME) #define XPORT_IS_ATA(t) ((t) == XPORT_ATA || (t) == XPORT_SATA) #define XPORT_IS_SCSI(t) ((t) != XPORT_UNKNOWN && \ (t) != XPORT_UNSPECIFIED && \ !XPORT_IS_ATA(t) && !XPORT_IS_NVME(t)) #define XPORT_DEVSTAT_TYPE(t) (XPORT_IS_ATA(t) ? DEVSTAT_TYPE_IF_IDE : \ XPORT_IS_SCSI(t) ? DEVSTAT_TYPE_IF_SCSI : \ DEVSTAT_TYPE_IF_OTHER) #define PROTO_VERSION_UNKNOWN (UINT_MAX - 1) #define PROTO_VERSION_UNSPECIFIED UINT_MAX #define XPORT_VERSION_UNKNOWN (UINT_MAX - 1) #define XPORT_VERSION_UNSPECIFIED UINT_MAX typedef union { LIST_ENTRY(ccb_hdr) le; SLIST_ENTRY(ccb_hdr) sle; TAILQ_ENTRY(ccb_hdr) tqe; STAILQ_ENTRY(ccb_hdr) stqe; } camq_entry; typedef union { void *ptr; u_long field; u_int8_t bytes[sizeof(uintptr_t)]; } ccb_priv_entry; typedef union { ccb_priv_entry entries[CCB_PERIPH_PRIV_SIZE]; u_int8_t bytes[CCB_PERIPH_PRIV_SIZE * sizeof(ccb_priv_entry)]; } ccb_ppriv_area; typedef union { ccb_priv_entry entries[CCB_SIM_PRIV_SIZE]; u_int8_t bytes[CCB_SIM_PRIV_SIZE * sizeof(ccb_priv_entry)]; } ccb_spriv_area; typedef struct { struct timeval *etime; uintptr_t sim_data; uintptr_t periph_data; } ccb_qos_area; struct ccb_hdr { cam_pinfo pinfo; /* Info for priority scheduling */ camq_entry xpt_links; /* For chaining in the XPT layer */ camq_entry sim_links; /* For chaining in the SIM layer */ camq_entry 
periph_links; /* For chaining in the type driver */ u_int32_t retry_count; void (*cbfcnp)(struct cam_periph *, union ccb *); /* Callback on completion function */ xpt_opcode func_code; /* XPT function code */ u_int32_t status; /* Status returned by CAM subsystem */ struct cam_path *path; /* Compiled path for this ccb */ path_id_t path_id; /* Path ID for the request */ target_id_t target_id; /* Target device ID */ lun_id_t target_lun; /* Target LUN number */ u_int32_t flags; /* ccb_flags */ u_int32_t xflags; /* Extended flags */ ccb_ppriv_area periph_priv; ccb_spriv_area sim_priv; ccb_qos_area qos; u_int32_t timeout; /* Hard timeout value in mseconds */ struct timeval softtimeout; /* Soft timeout value in sec + usec */ }; /* Get Device Information CCB */ struct ccb_getdev { struct ccb_hdr ccb_h; cam_proto protocol; struct scsi_inquiry_data inq_data; struct ata_params ident_data; u_int8_t serial_num[252]; u_int8_t inq_flags; u_int8_t serial_num_len; const struct nvme_controller_data *nvme_cdata; const struct nvme_namespace_data *nvme_data; }; /* Device Statistics CCB */ struct ccb_getdevstats { struct ccb_hdr ccb_h; int dev_openings; /* Space left for more work on device*/ int dev_active; /* Transactions running on the device */ int allocated; /* CCBs allocated for the device */ int queued; /* CCBs queued to be sent to the device */ int held; /* * CCBs held by peripheral drivers * for this device */ int maxtags; /* * Boundary conditions for number of * tagged operations */ int mintags; struct timeval last_reset; /* Time of last bus reset/loop init */ }; typedef enum { CAM_GDEVLIST_LAST_DEVICE, CAM_GDEVLIST_LIST_CHANGED, CAM_GDEVLIST_MORE_DEVS, CAM_GDEVLIST_ERROR } ccb_getdevlist_status_e; struct ccb_getdevlist { struct ccb_hdr ccb_h; char periph_name[DEV_IDLEN]; u_int32_t unit_number; unsigned int generation; u_int32_t index; ccb_getdevlist_status_e status; }; typedef enum { PERIPH_MATCH_NONE = 0x000, PERIPH_MATCH_PATH = 0x001, PERIPH_MATCH_TARGET = 0x002, 
PERIPH_MATCH_LUN = 0x004, PERIPH_MATCH_NAME = 0x008, PERIPH_MATCH_UNIT = 0x010, PERIPH_MATCH_ANY = 0x01f } periph_pattern_flags; struct periph_match_pattern { char periph_name[DEV_IDLEN]; u_int32_t unit_number; path_id_t path_id; target_id_t target_id; lun_id_t target_lun; periph_pattern_flags flags; }; typedef enum { DEV_MATCH_NONE = 0x000, DEV_MATCH_PATH = 0x001, DEV_MATCH_TARGET = 0x002, DEV_MATCH_LUN = 0x004, DEV_MATCH_INQUIRY = 0x008, DEV_MATCH_DEVID = 0x010, DEV_MATCH_ANY = 0x00f } dev_pattern_flags; struct device_id_match_pattern { uint8_t id_len; uint8_t id[256]; }; struct device_match_pattern { path_id_t path_id; target_id_t target_id; lun_id_t target_lun; dev_pattern_flags flags; union { struct scsi_static_inquiry_pattern inq_pat; struct device_id_match_pattern devid_pat; } data; }; typedef enum { BUS_MATCH_NONE = 0x000, BUS_MATCH_PATH = 0x001, BUS_MATCH_NAME = 0x002, BUS_MATCH_UNIT = 0x004, BUS_MATCH_BUS_ID = 0x008, BUS_MATCH_ANY = 0x00f } bus_pattern_flags; struct bus_match_pattern { path_id_t path_id; char dev_name[DEV_IDLEN]; u_int32_t unit_number; u_int32_t bus_id; bus_pattern_flags flags; }; union match_pattern { struct periph_match_pattern periph_pattern; struct device_match_pattern device_pattern; struct bus_match_pattern bus_pattern; }; typedef enum { DEV_MATCH_PERIPH, DEV_MATCH_DEVICE, DEV_MATCH_BUS } dev_match_type; struct dev_match_pattern { dev_match_type type; union match_pattern pattern; }; struct periph_match_result { char periph_name[DEV_IDLEN]; u_int32_t unit_number; path_id_t path_id; target_id_t target_id; lun_id_t target_lun; }; typedef enum { DEV_RESULT_NOFLAG = 0x00, DEV_RESULT_UNCONFIGURED = 0x01 } dev_result_flags; struct device_match_result { path_id_t path_id; target_id_t target_id; lun_id_t target_lun; cam_proto protocol; struct scsi_inquiry_data inq_data; struct ata_params ident_data; dev_result_flags flags; }; struct bus_match_result { path_id_t path_id; char dev_name[DEV_IDLEN]; u_int32_t unit_number; u_int32_t bus_id; }; 
union match_result { struct periph_match_result periph_result; struct device_match_result device_result; struct bus_match_result bus_result; }; struct dev_match_result { dev_match_type type; union match_result result; }; typedef enum { CAM_DEV_MATCH_LAST, CAM_DEV_MATCH_MORE, CAM_DEV_MATCH_LIST_CHANGED, CAM_DEV_MATCH_SIZE_ERROR, CAM_DEV_MATCH_ERROR } ccb_dev_match_status; typedef enum { CAM_DEV_POS_NONE = 0x000, CAM_DEV_POS_BUS = 0x001, CAM_DEV_POS_TARGET = 0x002, CAM_DEV_POS_DEVICE = 0x004, CAM_DEV_POS_PERIPH = 0x008, CAM_DEV_POS_PDPTR = 0x010, CAM_DEV_POS_TYPEMASK = 0xf00, CAM_DEV_POS_EDT = 0x100, CAM_DEV_POS_PDRV = 0x200 } dev_pos_type; struct ccb_dm_cookie { void *bus; void *target; void *device; void *periph; void *pdrv; }; struct ccb_dev_position { u_int generations[4]; #define CAM_BUS_GENERATION 0x00 #define CAM_TARGET_GENERATION 0x01 #define CAM_DEV_GENERATION 0x02 #define CAM_PERIPH_GENERATION 0x03 dev_pos_type position_type; struct ccb_dm_cookie cookie; }; struct ccb_dev_match { struct ccb_hdr ccb_h; ccb_dev_match_status status; u_int32_t num_patterns; u_int32_t pattern_buf_len; struct dev_match_pattern *patterns; u_int32_t num_matches; u_int32_t match_buf_len; struct dev_match_result *matches; struct ccb_dev_position pos; }; /* * Definitions for the path inquiry CCB fields. */ #define CAM_VERSION 0x19 /* Hex value for current version */ typedef enum { PI_MDP_ABLE = 0x80, /* Supports MDP message */ PI_WIDE_32 = 0x40, /* Supports 32 bit wide SCSI */ PI_WIDE_16 = 0x20, /* Supports 16 bit wide SCSI */ PI_SDTR_ABLE = 0x10, /* Supports SDTR message */ PI_LINKED_CDB = 0x08, /* Supports linked CDBs */ PI_SATAPM = 0x04, /* Supports SATA PM */ PI_TAG_ABLE = 0x02, /* Supports tag queue messages */ PI_SOFT_RST = 0x01 /* Supports soft reset alternative */ } pi_inqflag; typedef enum { PIT_PROCESSOR = 0x80, /* Target mode processor mode */ PIT_PHASE = 0x40, /* Target mode phase cog. 
mode */ PIT_DISCONNECT = 0x20, /* Disconnects supported in target mode */ PIT_TERM_IO = 0x10, /* Terminate I/O message supported in TM */ PIT_GRP_6 = 0x08, /* Group 6 commands supported */ PIT_GRP_7 = 0x04 /* Group 7 commands supported */ } pi_tmflag; typedef enum { PIM_ATA_EXT = 0x200,/* ATA requests can understand ata_ext requests */ PIM_EXTLUNS = 0x100,/* 64bit extended LUNs supported */ PIM_SCANHILO = 0x80, /* Bus scans from high ID to low ID */ PIM_NOREMOVE = 0x40, /* Removeable devices not included in scan */ PIM_NOINITIATOR = 0x20, /* Initiator role not supported. */ PIM_NOBUSRESET = 0x10, /* User has disabled initial BUS RESET */ PIM_NO_6_BYTE = 0x08, /* Do not send 6-byte commands */ PIM_SEQSCAN = 0x04, /* Do bus scans sequentially, not in parallel */ PIM_UNMAPPED = 0x02, PIM_NOSCAN = 0x01 /* SIM does its own scanning */ } pi_miscflag; /* Path Inquiry CCB */ struct ccb_pathinq_settings_spi { u_int8_t ppr_options; }; struct ccb_pathinq_settings_fc { u_int64_t wwnn; /* world wide node name */ u_int64_t wwpn; /* world wide port name */ u_int32_t port; /* 24 bit port id, if known */ u_int32_t bitrate; /* Mbps */ }; struct ccb_pathinq_settings_sas { u_int32_t bitrate; /* Mbps */ }; struct ccb_pathinq_settings_nvme { uint16_t nsid; /* Namespace ID for this path */ }; #define PATHINQ_SETTINGS_SIZE 128 struct ccb_pathinq { struct ccb_hdr ccb_h; u_int8_t version_num; /* Version number for the SIM/HBA */ u_int8_t hba_inquiry; /* Mimic of INQ byte 7 for the HBA */ u_int16_t target_sprt; /* Flags for target mode support */ u_int32_t hba_misc; /* Misc HBA features */ u_int16_t hba_eng_cnt; /* HBA engine count */ /* Vendor Unique capabilities */ u_int8_t vuhba_flags[VUHBALEN]; u_int32_t max_target; /* Maximum supported Target */ u_int32_t max_lun; /* Maximum supported Lun */ u_int32_t async_flags; /* Installed Async handlers */ path_id_t hpath_id; /* Highest Path ID in the subsystem */ target_id_t initiator_id; /* ID of the HBA on the SCSI bus */ char 
sim_vid[SIM_IDLEN]; /* Vendor ID of the SIM */ char hba_vid[HBA_IDLEN]; /* Vendor ID of the HBA */ char dev_name[DEV_IDLEN];/* Device name for SIM */ u_int32_t unit_number; /* Unit number for SIM */ u_int32_t bus_id; /* Bus ID for SIM */ u_int32_t base_transfer_speed;/* Base bus speed in KB/sec */ cam_proto protocol; u_int protocol_version; cam_xport transport; u_int transport_version; union { struct ccb_pathinq_settings_spi spi; struct ccb_pathinq_settings_fc fc; struct ccb_pathinq_settings_sas sas; struct ccb_pathinq_settings_nvme nvme; char ccb_pathinq_settings_opaque[PATHINQ_SETTINGS_SIZE]; } xport_specific; u_int maxio; /* Max supported I/O size, in bytes. */ u_int16_t hba_vendor; /* HBA vendor ID */ u_int16_t hba_device; /* HBA device ID */ u_int16_t hba_subvendor; /* HBA subvendor ID */ u_int16_t hba_subdevice; /* HBA subdevice ID */ }; /* Path Statistics CCB */ struct ccb_pathstats { struct ccb_hdr ccb_h; struct timeval last_reset; /* Time of last bus reset/loop init */ }; typedef enum { SMP_FLAG_NONE = 0x00, SMP_FLAG_REQ_SG = 0x01, SMP_FLAG_RSP_SG = 0x02 } ccb_smp_pass_flags; /* * Serial Management Protocol CCB * XXX Currently the semantics for this CCB are that it is executed either * by the addressed device, or that device's parent (i.e. an expander for * any device on an expander) if the addressed device doesn't support SMP. * Later, once we have the ability to probe SMP-only devices and put them * in CAM's topology, the CCB will only be executed by the addressed device * if possible. 
*/
struct ccb_smpio {
	struct ccb_hdr ccb_h;
	uint8_t *smp_request;		/* Request buffer; cam_fill_smpio() asserts non-NULL in kernel builds */
	int smp_request_len;		/* Request length; cam_fill_smpio() asserts non-zero in kernel builds */
	uint16_t smp_request_sglist_cnt; /* NOTE(review): presumably the S/G entry count used with SMP_FLAG_REQ_SG -- confirm */
	uint8_t *smp_response;		/* Response buffer; cam_fill_smpio() asserts non-NULL in kernel builds */
	int smp_response_len;		/* Response length; cam_fill_smpio() asserts non-zero in kernel builds */
	uint16_t smp_response_sglist_cnt; /* NOTE(review): presumably the S/G entry count used with SMP_FLAG_RSP_SG -- confirm */
	ccb_smp_pass_flags flags;	/* SMP_FLAG_* values */
};

/*
 * Sense information, either referenced through a pointer or stored inline.
 * NOTE(review): nothing in this part of the header shows which flag selects
 * the active member -- confirm against the users of sense_t.
 */
typedef union {
	u_int8_t *sense_ptr;	/*
				 * Pointer to storage
				 * for sense information
				 */
	/* Storage Area for sense information */
	struct scsi_sense_data sense_buf;
} sense_t;

/*
 * A CDB, either referenced through a pointer or stored inline.
 * scsiio_cdb_ptr() selects cdb_ptr when CAM_CDB_POINTER is set in the CCB
 * header flags and cdb_bytes otherwise.
 */
typedef union {
	u_int8_t *cdb_ptr;	/* Pointer to the CDB bytes to send */
	/* Area for the CDB send */
	u_int8_t cdb_bytes[IOCDBLEN];
} cdb_t;

/*
 * SCSI I/O Request CCB used for the XPT_SCSI_IO and XPT_CONT_TARGET_IO
 * function codes.
 *
 * cam_fill_csio() and cam_fill_ctio() are the inline helpers in this header
 * that populate it for those two function codes respectively.
 */
struct ccb_scsiio {
	struct ccb_hdr ccb_h;
	union ccb *next_ccb;	/* Ptr for next CCB for action */
	u_int8_t *req_map;	/* Ptr to mapping info */
	u_int8_t *data_ptr;	/* Ptr to the data buf/SG list */
	u_int32_t dxfer_len;	/* Data transfer length */
	/* Autosense storage */
	struct scsi_sense_data sense_data;
	u_int8_t sense_len;	/* Number of bytes to autosense */
	u_int8_t cdb_len;	/* Number of bytes for the CDB */
	u_int16_t sglist_cnt;	/* Number of SG list entries */
	u_int8_t scsi_status;	/* Returned SCSI status */
	u_int8_t sense_resid;	/* Autosense resid length: 2's comp */
	u_int32_t resid;	/* Transfer residual length: 2's comp */
	cdb_t cdb_io;		/* Union for CDB bytes/pointer */
	u_int8_t *msg_ptr;	/* Pointer to the message buffer */
	u_int16_t msg_len;	/* Number of bytes for the Message */
	u_int8_t tag_action;	/* What to do for tag queueing */
	/*
	 * The tag action should be either the define below (to send a
	 * non-tagged transaction) or one of the defined scsi tag messages
	 * from scsi_message.h.
	 */
#define CAM_TAG_ACTION_NONE 0x00
	u_int tag_id;		/* tag id from initiator (target mode) */
	u_int init_id;		/* initiator id of who selected */
	/*
	 * The member below exists only when the kernel is built with
	 * BUF_TRACKING or FULL_BUF_TRACKING, so the size of this structure
	 * depends on those options.  cam_fill_csio() resets it to NULL;
	 * cam_fill_ctio() does not touch it.
	 */
+#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
+	struct bio *bio;	/* Associated bio */
+#endif
};

/*
 * Return the address of the CDB bytes for a SCSI I/O CCB: the external
 * pointer when CAM_CDB_POINTER is set in ccb_h.flags, otherwise the inline
 * cdb_bytes array embedded in the CCB.
 */
static __inline uint8_t *
scsiio_cdb_ptr(struct ccb_scsiio *ccb)
{
	return ((ccb->ccb_h.flags & CAM_CDB_POINTER) ? ccb->cdb_io.cdb_ptr : ccb->cdb_io.cdb_bytes);
}

/*
 * ATA I/O Request CCB used for the XPT_ATA_IO function code.
 */
struct ccb_ataio {
	struct ccb_hdr ccb_h;
	union ccb *next_ccb;	/* Ptr for next CCB for action */
	struct ata_cmd cmd;	/* ATA command register set */
	struct ata_res res;	/* ATA result register set */
	u_int8_t *data_ptr;	/* Ptr to the data buf/SG list */
	u_int32_t dxfer_len;	/* Data transfer length */
	u_int32_t resid;	/* Transfer residual length: 2's comp */
	u_int8_t ata_flags;	/* Flags for the rest of the buffer */
#define ATA_FLAG_AUX 0x1
	uint32_t aux;		/* NOTE(review): presumably only meaningful when ATA_FLAG_AUX is set in ata_flags -- confirm */
	uint32_t unused;
};

/*
 * NOTE(review): judging by its name and the "target mode" field comments,
 * this is the CCB used to accept an incoming target-mode command -- confirm.
 */
struct ccb_accept_tio {
	struct ccb_hdr ccb_h;
	cdb_t cdb_io;		/* Union for CDB bytes/pointer */
	u_int8_t cdb_len;	/* Number of bytes for the CDB */
	u_int8_t tag_action;	/* What to do for tag queueing */
	u_int8_t sense_len;	/* Number of bytes of Sense Data */
	u_int tag_id;		/* tag id from initiator (target mode) */
	u_int init_id;		/* initiator id of who selected */
	struct scsi_sense_data sense_data;
};

/* Release SIM Queue */
struct ccb_relsim {
	struct ccb_hdr ccb_h;
	u_int32_t release_flags;	/* Bitwise OR of the RELSIM_* values below */
#define RELSIM_ADJUST_OPENINGS 0x01
#define RELSIM_RELEASE_AFTER_TIMEOUT 0x02
#define RELSIM_RELEASE_AFTER_CMDCMPLT 0x04
#define RELSIM_RELEASE_AFTER_QEMPTY 0x08
	u_int32_t openings;
	u_int32_t release_timeout;	/* Abstract argument. */
	u_int32_t qfrozen_cnt;
};
/* * NVMe I/O Request CCB used for the XPT_NVME_IO function code. 
*/ struct ccb_nvmeio { struct ccb_hdr ccb_h; union ccb *next_ccb; /* Ptr for next CCB for action */ struct nvme_command cmd; /* NVME command, per NVME standard */ struct nvme_completion cpl; /* NVME completion, per NVME standard */ uint8_t *data_ptr; /* Ptr to the data buf/SG list */ uint32_t dxfer_len; /* Data transfer length */ uint32_t resid; /* Transfer residual length: 2's comp unused ?*/ }; /* * Definitions for the asynchronous callback CCB fields. */ typedef enum { AC_UNIT_ATTENTION = 0x4000,/* Device reported UNIT ATTENTION */ AC_ADVINFO_CHANGED = 0x2000,/* Advance info might have changes */ AC_CONTRACT = 0x1000,/* A contractual callback */ AC_GETDEV_CHANGED = 0x800,/* Getdev info might have changed */ AC_INQ_CHANGED = 0x400,/* Inquiry info might have changed */ AC_TRANSFER_NEG = 0x200,/* New transfer settings in effect */ AC_LOST_DEVICE = 0x100,/* A device went away */ AC_FOUND_DEVICE = 0x080,/* A new device was found */ AC_PATH_DEREGISTERED = 0x040,/* A path has de-registered */ AC_PATH_REGISTERED = 0x020,/* A new path has been registered */ AC_SENT_BDR = 0x010,/* A BDR message was sent to target */ AC_SCSI_AEN = 0x008,/* A SCSI AEN has been received */ AC_UNSOL_RESEL = 0x002,/* Unsolicited reselection occurred */ AC_BUS_RESET = 0x001 /* A SCSI bus reset occurred */ } ac_code; typedef void ac_callback_t (void *softc, u_int32_t code, struct cam_path *path, void *args); /* * Generic Asynchronous callbacks. * * Generic arguments passed bac which are then interpreted between a per-system * contract number. 
*/ #define AC_CONTRACT_DATA_MAX (128 - sizeof (u_int64_t)) struct ac_contract { u_int64_t contract_number; u_int8_t contract_data[AC_CONTRACT_DATA_MAX]; }; #define AC_CONTRACT_DEV_CHG 1 struct ac_device_changed { u_int64_t wwpn; u_int32_t port; target_id_t target; u_int8_t arrived; }; /* Set Asynchronous Callback CCB */ struct ccb_setasync { struct ccb_hdr ccb_h; u_int32_t event_enable; /* Async Event enables */ ac_callback_t *callback; void *callback_arg; }; /* Set Device Type CCB */ struct ccb_setdev { struct ccb_hdr ccb_h; u_int8_t dev_type; /* Value for dev type field in EDT */ }; /* SCSI Control Functions */ /* Abort XPT request CCB */ struct ccb_abort { struct ccb_hdr ccb_h; union ccb *abort_ccb; /* Pointer to CCB to abort */ }; /* Reset SCSI Bus CCB */ struct ccb_resetbus { struct ccb_hdr ccb_h; }; /* Reset SCSI Device CCB */ struct ccb_resetdev { struct ccb_hdr ccb_h; }; /* Terminate I/O Process Request CCB */ struct ccb_termio { struct ccb_hdr ccb_h; union ccb *termio_ccb; /* Pointer to CCB to terminate */ }; typedef enum { CTS_TYPE_CURRENT_SETTINGS, CTS_TYPE_USER_SETTINGS } cts_type; struct ccb_trans_settings_scsi { u_int valid; /* Which fields to honor */ #define CTS_SCSI_VALID_TQ 0x01 u_int flags; #define CTS_SCSI_FLAGS_TAG_ENB 0x01 }; struct ccb_trans_settings_ata { u_int valid; /* Which fields to honor */ #define CTS_ATA_VALID_TQ 0x01 u_int flags; #define CTS_ATA_FLAGS_TAG_ENB 0x01 }; struct ccb_trans_settings_spi { u_int valid; /* Which fields to honor */ #define CTS_SPI_VALID_SYNC_RATE 0x01 #define CTS_SPI_VALID_SYNC_OFFSET 0x02 #define CTS_SPI_VALID_BUS_WIDTH 0x04 #define CTS_SPI_VALID_DISC 0x08 #define CTS_SPI_VALID_PPR_OPTIONS 0x10 u_int flags; #define CTS_SPI_FLAGS_DISC_ENB 0x01 u_int sync_period; u_int sync_offset; u_int bus_width; u_int ppr_options; }; struct ccb_trans_settings_fc { u_int valid; /* Which fields to honor */ #define CTS_FC_VALID_WWNN 0x8000 #define CTS_FC_VALID_WWPN 0x4000 #define CTS_FC_VALID_PORT 0x2000 #define 
CTS_FC_VALID_SPEED 0x1000 u_int64_t wwnn; /* world wide node name */ u_int64_t wwpn; /* world wide port name */ u_int32_t port; /* 24 bit port id, if known */ u_int32_t bitrate; /* Mbps */ }; struct ccb_trans_settings_sas { u_int valid; /* Which fields to honor */ #define CTS_SAS_VALID_SPEED 0x1000 u_int32_t bitrate; /* Mbps */ }; struct ccb_trans_settings_pata { u_int valid; /* Which fields to honor */ #define CTS_ATA_VALID_MODE 0x01 #define CTS_ATA_VALID_BYTECOUNT 0x02 #define CTS_ATA_VALID_ATAPI 0x20 #define CTS_ATA_VALID_CAPS 0x40 int mode; /* Mode */ u_int bytecount; /* Length of PIO transaction */ u_int atapi; /* Length of ATAPI CDB */ u_int caps; /* Device and host SATA caps. */ #define CTS_ATA_CAPS_H 0x0000ffff #define CTS_ATA_CAPS_H_DMA48 0x00000001 /* 48-bit DMA */ #define CTS_ATA_CAPS_D 0xffff0000 }; struct ccb_trans_settings_sata { u_int valid; /* Which fields to honor */ #define CTS_SATA_VALID_MODE 0x01 #define CTS_SATA_VALID_BYTECOUNT 0x02 #define CTS_SATA_VALID_REVISION 0x04 #define CTS_SATA_VALID_PM 0x08 #define CTS_SATA_VALID_TAGS 0x10 #define CTS_SATA_VALID_ATAPI 0x20 #define CTS_SATA_VALID_CAPS 0x40 int mode; /* Legacy PATA mode */ u_int bytecount; /* Length of PIO transaction */ int revision; /* SATA revision */ u_int pm_present; /* PM is present (XPT->SIM) */ u_int tags; /* Number of allowed tags */ u_int atapi; /* Length of ATAPI CDB */ u_int caps; /* Device and host SATA caps. */ #define CTS_SATA_CAPS_H 0x0000ffff #define CTS_SATA_CAPS_H_PMREQ 0x00000001 #define CTS_SATA_CAPS_H_APST 0x00000002 #define CTS_SATA_CAPS_H_DMAAA 0x00000010 /* Auto-activation */ #define CTS_SATA_CAPS_H_AN 0x00000020 /* Async. 
notification */ #define CTS_SATA_CAPS_D 0xffff0000 #define CTS_SATA_CAPS_D_PMREQ 0x00010000 #define CTS_SATA_CAPS_D_APST 0x00020000 }; struct ccb_trans_settings_nvme { u_int valid; /* Which fields to honor */ #define CTS_NVME_VALID_SPEC 0x01 #define CTS_NVME_VALID_CAPS 0x02 u_int spec_major; /* Major version of spec supported */ u_int spec_minor; /* Minor verison of spec supported */ u_int spec_tiny; /* Tiny version of spec supported */ u_int max_xfer; /* Max transfer size (0 -> unlimited */ u_int caps; }; /* Get/Set transfer rate/width/disconnection/tag queueing settings */ struct ccb_trans_settings { struct ccb_hdr ccb_h; cts_type type; /* Current or User settings */ cam_proto protocol; u_int protocol_version; cam_xport transport; u_int transport_version; union { u_int valid; /* Which fields to honor */ struct ccb_trans_settings_ata ata; struct ccb_trans_settings_scsi scsi; struct ccb_trans_settings_nvme nvme; } proto_specific; union { u_int valid; /* Which fields to honor */ struct ccb_trans_settings_spi spi; struct ccb_trans_settings_fc fc; struct ccb_trans_settings_sas sas; struct ccb_trans_settings_pata ata; struct ccb_trans_settings_sata sata; struct ccb_trans_settings_nvme nvme; } xport_specific; }; /* * Calculate the geometry parameters for a device * give the block size and volume size in blocks. 
*/ struct ccb_calc_geometry { struct ccb_hdr ccb_h; u_int32_t block_size; u_int64_t volume_size; u_int32_t cylinders; u_int8_t heads; u_int8_t secs_per_track; }; /* * Set or get SIM (and transport) specific knobs */ #define KNOB_VALID_ADDRESS 0x1 #define KNOB_VALID_ROLE 0x2 #define KNOB_ROLE_NONE 0x0 #define KNOB_ROLE_INITIATOR 0x1 #define KNOB_ROLE_TARGET 0x2 #define KNOB_ROLE_BOTH 0x3 struct ccb_sim_knob_settings_spi { u_int valid; u_int initiator_id; u_int role; }; struct ccb_sim_knob_settings_fc { u_int valid; u_int64_t wwnn; /* world wide node name */ u_int64_t wwpn; /* world wide port name */ u_int role; }; struct ccb_sim_knob_settings_sas { u_int valid; u_int64_t wwnn; /* world wide node name */ u_int role; }; #define KNOB_SETTINGS_SIZE 128 struct ccb_sim_knob { struct ccb_hdr ccb_h; union { u_int valid; /* Which fields to honor */ struct ccb_sim_knob_settings_spi spi; struct ccb_sim_knob_settings_fc fc; struct ccb_sim_knob_settings_sas sas; char pad[KNOB_SETTINGS_SIZE]; } xport_specific; }; /* * Rescan the given bus, or bus/target/lun */ struct ccb_rescan { struct ccb_hdr ccb_h; cam_flags flags; }; /* * Turn on debugging for the given bus, bus/target, or bus/target/lun. */ struct ccb_debug { struct ccb_hdr ccb_h; cam_debug_flags flags; }; /* Target mode structures. 
*/ struct ccb_en_lun { struct ccb_hdr ccb_h; u_int16_t grp6_len; /* Group 6 VU CDB length */ u_int16_t grp7_len; /* Group 7 VU CDB length */ u_int8_t enable; }; /* old, barely used immediate notify, binary compatibility */ struct ccb_immed_notify { struct ccb_hdr ccb_h; struct scsi_sense_data sense_data; u_int8_t sense_len; /* Number of bytes in sense buffer */ u_int8_t initiator_id; /* Id of initiator that selected */ u_int8_t message_args[7]; /* Message Arguments */ }; struct ccb_notify_ack { struct ccb_hdr ccb_h; u_int16_t seq_id; /* Sequence identifier */ u_int8_t event; /* Event flags */ }; struct ccb_immediate_notify { struct ccb_hdr ccb_h; u_int tag_id; /* Tag for immediate notify */ u_int seq_id; /* Tag for target of notify */ u_int initiator_id; /* Initiator Identifier */ u_int arg; /* Function specific */ }; struct ccb_notify_acknowledge { struct ccb_hdr ccb_h; u_int tag_id; /* Tag for immediate notify */ u_int seq_id; /* Tar for target of notify */ u_int initiator_id; /* Initiator Identifier */ u_int arg; /* Response information */ /* * Lower byte of arg is one of RESPONSE CODE values defined below * (subset of response codes from SPL-4 and FCP-4 specifications), * upper 3 bytes is code-specific ADDITIONAL RESPONSE INFORMATION. */ #define CAM_RSP_TMF_COMPLETE 0x00 #define CAM_RSP_TMF_REJECTED 0x04 #define CAM_RSP_TMF_FAILED 0x05 #define CAM_RSP_TMF_SUCCEEDED 0x08 #define CAM_RSP_TMF_INCORRECT_LUN 0x09 }; /* HBA engine structures. 
*/ typedef enum { EIT_BUFFER, /* Engine type: buffer memory */ EIT_LOSSLESS, /* Engine type: lossless compression */ EIT_LOSSY, /* Engine type: lossy compression */ EIT_ENCRYPT /* Engine type: encryption */ } ei_type; typedef enum { EAD_VUNIQUE, /* Engine algorithm ID: vendor unique */ EAD_LZ1V1, /* Engine algorithm ID: LZ1 var.1 */ EAD_LZ2V1, /* Engine algorithm ID: LZ2 var.1 */ EAD_LZ2V2 /* Engine algorithm ID: LZ2 var.2 */ } ei_algo; struct ccb_eng_inq { struct ccb_hdr ccb_h; u_int16_t eng_num; /* The engine number for this inquiry */ ei_type eng_type; /* Returned engine type */ ei_algo eng_algo; /* Returned engine algorithm type */ u_int32_t eng_memeory; /* Returned engine memory size */ }; struct ccb_eng_exec { /* This structure must match SCSIIO size */ struct ccb_hdr ccb_h; u_int8_t *pdrv_ptr; /* Ptr used by the peripheral driver */ u_int8_t *req_map; /* Ptr for mapping info on the req. */ u_int8_t *data_ptr; /* Pointer to the data buf/SG list */ u_int32_t dxfer_len; /* Data transfer length */ u_int8_t *engdata_ptr; /* Pointer to the engine buffer data */ u_int16_t sglist_cnt; /* Num of scatter gather list entries */ u_int32_t dmax_len; /* Destination data maximum length */ u_int32_t dest_len; /* Destination data length */ int32_t src_resid; /* Source residual length: 2's comp */ u_int32_t timeout; /* Timeout value */ u_int16_t eng_num; /* Engine number for this request */ u_int16_t vu_flags; /* Vendor Unique flags */ }; /* * Definitions for the timeout field in the SCSI I/O CCB. */ #define CAM_TIME_DEFAULT 0x00000000 /* Use SIM default value */ #define CAM_TIME_INFINITY 0xFFFFFFFF /* Infinite timeout */ #define CAM_SUCCESS 0 /* For signaling general success */ #define CAM_FAILURE 1 /* For signaling general failure */ #define CAM_FALSE 0 #define CAM_TRUE 1 #define XPT_CCB_INVALID -1 /* for signaling a bad CCB to free */ /* * CCB for working with advanced device information. This operates in a fashion * similar to XPT_GDEV_TYPE. 
Specify the target in ccb_h, the buffer * type requested, and provide a buffer size/buffer to write to. If the * buffer is too small, provsiz will be larger than bufsiz. */ struct ccb_dev_advinfo { struct ccb_hdr ccb_h; uint32_t flags; #define CDAI_FLAG_NONE 0x0 /* No flags set */ #define CDAI_FLAG_STORE 0x1 /* If set, action becomes store */ uint32_t buftype; /* IN: Type of data being requested */ /* NB: buftype is interpreted on a per-transport basis */ #define CDAI_TYPE_SCSI_DEVID 1 #define CDAI_TYPE_SERIAL_NUM 2 #define CDAI_TYPE_PHYS_PATH 3 #define CDAI_TYPE_RCAPLONG 4 #define CDAI_TYPE_EXT_INQ 5 off_t bufsiz; /* IN: Size of external buffer */ #define CAM_SCSI_DEVID_MAXLEN 65536 /* length in buffer is an uint16_t */ off_t provsiz; /* OUT: Size required/used */ uint8_t *buf; /* IN/OUT: Buffer for requested data */ }; /* * CCB for sending async events */ struct ccb_async { struct ccb_hdr ccb_h; uint32_t async_code; off_t async_arg_size; void *async_arg_ptr; }; /* * Union of all CCB types for kernel space allocation. This union should * never be used for manipulating CCBs - its only use is for the allocation * and deallocation of raw CCB space and is the return type of xpt_ccb_alloc * and the argument to xpt_ccb_free. 
*/ union ccb { struct ccb_hdr ccb_h; /* For convenience */ struct ccb_scsiio csio; struct ccb_getdev cgd; struct ccb_getdevlist cgdl; struct ccb_pathinq cpi; struct ccb_relsim crs; struct ccb_setasync csa; struct ccb_setdev csd; struct ccb_pathstats cpis; struct ccb_getdevstats cgds; struct ccb_dev_match cdm; struct ccb_trans_settings cts; struct ccb_calc_geometry ccg; struct ccb_sim_knob knob; struct ccb_abort cab; struct ccb_resetbus crb; struct ccb_resetdev crd; struct ccb_termio tio; struct ccb_accept_tio atio; struct ccb_scsiio ctio; struct ccb_en_lun cel; struct ccb_immed_notify cin; struct ccb_notify_ack cna; struct ccb_immediate_notify cin1; struct ccb_notify_acknowledge cna2; struct ccb_eng_inq cei; struct ccb_eng_exec cee; struct ccb_smpio smpio; struct ccb_rescan crcn; struct ccb_debug cdbg; struct ccb_ataio ataio; struct ccb_dev_advinfo cdai; struct ccb_async casync; struct ccb_nvmeio nvmeio; }; #define CCB_CLEAR_ALL_EXCEPT_HDR(ccbp) \ bzero((char *)(ccbp) + sizeof((ccbp)->ccb_h), \ sizeof(*(ccbp)) - sizeof((ccbp)->ccb_h)) __BEGIN_DECLS static __inline void cam_fill_csio(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int32_t flags, u_int8_t tag_action, u_int8_t *data_ptr, u_int32_t dxfer_len, u_int8_t sense_len, u_int8_t cdb_len, u_int32_t timeout); static __inline void cam_fill_nvmeio(struct ccb_nvmeio *nvmeio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int32_t flags, u_int8_t *data_ptr, u_int32_t dxfer_len, u_int32_t timeout); static __inline void cam_fill_ctio(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int32_t flags, u_int tag_action, u_int tag_id, u_int init_id, u_int scsi_status, u_int8_t *data_ptr, u_int32_t dxfer_len, u_int32_t timeout); static __inline void cam_fill_ataio(struct ccb_ataio *ataio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int32_t flags, u_int tag_action, u_int8_t *data_ptr, 
u_int32_t dxfer_len, u_int32_t timeout);

static __inline void
cam_fill_smpio(struct ccb_smpio *smpio, uint32_t retries,
    void (*cbfcnp)(struct cam_periph *, union ccb *), uint32_t flags,
    uint8_t *smp_request, int smp_request_len,
    uint8_t *smp_response, int smp_response_len,
    uint32_t timeout);

/*
 * Fill in a SCSI I/O CCB for the XPT_SCSI_IO function code.
 *
 * Writes the header fields func_code, flags, xflags (cleared), retry_count,
 * cbfcnp and timeout, plus data_ptr, dxfer_len, sense_len, cdb_len and
 * tag_action.  No other member of *csio is written here; in particular the
 * CDB itself (cdb_io) must be set up by the caller.
 */
static __inline void
cam_fill_csio(struct ccb_scsiio *csio, u_int32_t retries,
    void (*cbfcnp)(struct cam_periph *, union ccb *),
    u_int32_t flags, u_int8_t tag_action,
    u_int8_t *data_ptr, u_int32_t dxfer_len,
    u_int8_t sense_len, u_int8_t cdb_len,
    u_int32_t timeout)
{
	csio->ccb_h.func_code = XPT_SCSI_IO;
	csio->ccb_h.flags = flags;
	csio->ccb_h.xflags = 0;
	csio->ccb_h.retry_count = retries;
	csio->ccb_h.cbfcnp = cbfcnp;
	csio->ccb_h.timeout = timeout;
	csio->data_ptr = data_ptr;
	csio->dxfer_len = dxfer_len;
	csio->sense_len = sense_len;
	csio->cdb_len = cdb_len;
	csio->tag_action = tag_action;
	/*
	 * With buffer tracking compiled in, start with no bio attached; a
	 * caller that wants the request tracked sets csio->bio afterwards.
	 */
+#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
+	csio->bio = NULL;
+#endif
}

/*
 * Fill in a SCSI I/O CCB for the XPT_CONT_TARGET_IO function code.
 *
 * Writes the same header fields as cam_fill_csio() (xflags cleared), plus
 * data_ptr, dxfer_len, scsi_status, tag_action, tag_id and init_id.
 * tag_action and scsi_status arrive as u_int but are stored into the
 * u_int8_t members of struct ccb_scsiio.  Unlike cam_fill_csio(), this
 * helper does not write sense_len, cdb_len or the conditional bio member.
 */
static __inline void
cam_fill_ctio(struct ccb_scsiio *csio, u_int32_t retries,
    void (*cbfcnp)(struct cam_periph *, union ccb *),
    u_int32_t flags, u_int tag_action, u_int tag_id,
    u_int init_id, u_int scsi_status, u_int8_t *data_ptr,
    u_int32_t dxfer_len, u_int32_t timeout)
{
	csio->ccb_h.func_code = XPT_CONT_TARGET_IO;
	csio->ccb_h.flags = flags;
	csio->ccb_h.xflags = 0;
	csio->ccb_h.retry_count = retries;
	csio->ccb_h.cbfcnp = cbfcnp;
	csio->ccb_h.timeout = timeout;
	csio->data_ptr = data_ptr;
	csio->dxfer_len = dxfer_len;
	csio->scsi_status = scsi_status;
	csio->tag_action = tag_action;
	csio->tag_id = tag_id;
	csio->init_id = init_id;
}

/*
 * Fill in an ATA I/O CCB for the XPT_ATA_IO function code.
 *
 * Writes the header fields func_code, flags, retry_count, cbfcnp and
 * timeout, plus data_ptr and dxfer_len, and clears ata_flags.  The
 * tag_action argument is accepted but unused.  ccb_h.xflags is not written
 * here (the SCSI fillers above do clear it).
 */
static __inline void
cam_fill_ataio(struct ccb_ataio *ataio, u_int32_t retries,
    void (*cbfcnp)(struct cam_periph *, union ccb *),
    u_int32_t flags, u_int tag_action __unused,
    u_int8_t *data_ptr, u_int32_t dxfer_len,
    u_int32_t timeout)
{
	ataio->ccb_h.func_code = XPT_ATA_IO;
	ataio->ccb_h.flags = flags;
	ataio->ccb_h.retry_count = retries;
	ataio->ccb_h.cbfcnp = cbfcnp;
	ataio->ccb_h.timeout = timeout;
	ataio->data_ptr = data_ptr;
	ataio->dxfer_len = dxfer_len;
	ataio->ata_flags = 0;
}

/*
 * Fill in an SMP I/O CCB for the XPT_SMP_IO function code.
 *
 * In kernel builds this asserts that the direction bits in flags equal
 * CAM_DIR_BOTH, that both buffers are non-NULL and that both lengths are
 * non-zero.  It then writes the header fields func_code, flags,
 * retry_count, cbfcnp and timeout and the request/response pointers and
 * lengths.  The sglist counts and the SMP flags member are not written.
 */
static __inline void
cam_fill_smpio(struct ccb_smpio *smpio, uint32_t retries,
    void (*cbfcnp)(struct cam_periph *, union ccb *), uint32_t flags,
    uint8_t *smp_request, int smp_request_len,
    uint8_t *smp_response, int smp_response_len,
    uint32_t timeout)
{
#ifdef _KERNEL
	KASSERT((flags & CAM_DIR_MASK) == CAM_DIR_BOTH, ("direction != CAM_DIR_BOTH"));
	KASSERT((smp_request != NULL) && (smp_response != NULL), ("need valid request and response buffers"));
	KASSERT((smp_request_len != 0) && (smp_response_len != 0), ("need non-zero request and response lengths"));
#endif /*_KERNEL*/
	smpio->ccb_h.func_code = XPT_SMP_IO;
	smpio->ccb_h.flags = flags;
	smpio->ccb_h.retry_count = retries;
	smpio->ccb_h.cbfcnp = cbfcnp;
	smpio->ccb_h.timeout = timeout;
	smpio->smp_request = smp_request;
	smpio->smp_request_len = smp_request_len;
	smpio->smp_response = smp_response;
	smpio->smp_response_len = smp_response_len;
}

/*
 * Replace the status code held in the CAM_STATUS_MASK bits of
 * ccb_h.status with "status", leaving the bits outside the mask as they
 * were.  "status" is OR-ed in unmasked, so callers are expected to pass a
 * value that fits inside CAM_STATUS_MASK.
 */
static __inline void
cam_set_ccbstatus(union ccb *ccb, cam_status status)
{
	ccb->ccb_h.status &= ~CAM_STATUS_MASK;
	ccb->ccb_h.status |= status;
}

/* Return only the CAM_STATUS_MASK bits of ccb_h.status. */
static __inline cam_status
cam_ccb_status(union ccb *ccb)
{
	return ((cam_status)(ccb->ccb_h.status & CAM_STATUS_MASK));
}

void cam_calc_geometry(struct ccb_calc_geometry *ccg, int extended);

/*
 * Fill in an NVMe I/O CCB for the XPT_NVME_IO function code.
 *
 * Writes the header fields func_code, flags, retry_count, cbfcnp and
 * timeout, plus data_ptr and dxfer_len.  The NVMe command (cmd) is not
 * touched and must be set up by the caller.
 */
static __inline void
cam_fill_nvmeio(struct ccb_nvmeio *nvmeio, u_int32_t retries,
    void (*cbfcnp)(struct cam_periph *, union ccb *),
    u_int32_t flags, u_int8_t *data_ptr, u_int32_t dxfer_len,
    u_int32_t timeout)
{
	nvmeio->ccb_h.func_code = XPT_NVME_IO;
	nvmeio->ccb_h.flags = flags;
	nvmeio->ccb_h.retry_count = retries;
	nvmeio->ccb_h.cbfcnp = cbfcnp;
	nvmeio->ccb_h.timeout = timeout;
	nvmeio->data_ptr = data_ptr;
	nvmeio->dxfer_len = dxfer_len;
}
__END_DECLS
#endif /* _CAM_CAM_CCB_H */
Index: head/sys/cam/cam_periph.c
===================================================================
--- head/sys/cam/cam_periph.c (revision 308154)
+++ head/sys/cam/cam_periph.c (revision
308155) @@ -1,1944 +1,1949 @@ /*- * Common functions for CAM "type" (peripheral) drivers. * * Copyright (c) 1997, 1998 Justin T. Gibbs. * Copyright (c) 1997, 1998, 1999, 2000 Kenneth D. Merry. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification, immediately at the beginning of the file. * 2. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/
#include __FBSDID("$FreeBSD$");
#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include

static u_int camperiphnextunit(struct periph_driver *p_drv, u_int newunit, int wired, path_id_t pathid, target_id_t target, lun_id_t lun);
static u_int camperiphunit(struct periph_driver *p_drv, path_id_t pathid, target_id_t target, lun_id_t lun);
static void camperiphdone(struct cam_periph *periph, union ccb *done_ccb);
static void camperiphfree(struct cam_periph *periph);
static int camperiphscsistatuserror(union ccb *ccb, union ccb **orig_ccb, cam_flags camflags, u_int32_t sense_flags, int *openings, u_int32_t *relsim_flags, u_int32_t *timeout, u_int32_t *action, const char **action_string);
static int camperiphscsisenseerror(union ccb *ccb, union ccb **orig_ccb, cam_flags camflags, u_int32_t sense_flags, int *openings, u_int32_t *relsim_flags, u_int32_t *timeout, u_int32_t *action, const char **action_string);
static void cam_periph_devctl_notify(union ccb *ccb);

/* Number of entries in periph_drivers, not counting the NULL terminator. */
static int nperiph_drivers;
/*
 * Highest level passed to periphdriver_init() so far.  The code below
 * treats 1 as "early drivers may initialize" and anything above 1 as
 * "all drivers may initialize".
 */
static int initialized = 0;
/* NULL-terminated array of registered peripheral drivers. */
struct periph_driver **periph_drivers;

static MALLOC_DEFINE(M_CAMPERIPH, "CAM periph", "CAM peripheral buffers");

/* Loader tunables; NOTE(review): units look like milliseconds -- confirm at the point of use. */
static int periph_selto_delay = 1000;
TUNABLE_INT("kern.cam.periph_selto_delay", &periph_selto_delay);
static int periph_noresrc_delay = 500;
TUNABLE_INT("kern.cam.periph_noresrc_delay", &periph_noresrc_delay);
static int periph_busy_delay = 500;
TUNABLE_INT("kern.cam.periph_busy_delay", &periph_busy_delay);

/*
 * Append a peripheral driver to the global periph_drivers array.
 *
 * "data" is the struct periph_driver to register.  A new array with room
 * for the existing entries, the new driver and a NULL terminator is
 * allocated before the bus lock is taken (presumably because an M_WAITOK
 * allocation may sleep); the driver count is then re-checked under the
 * lock and the whole step is retried if it changed in between.  The old
 * array is freed after the lock is dropped.  Finally the driver's init
 * routine is called right away if initialization for its class has already
 * happened.
 */
void
periphdriver_register(void *data)
{
	struct periph_driver *drv = (struct periph_driver *)data;
	struct periph_driver **newdrivers, **old;
	int ndrivers;

again:
	/* +2: one slot for the new driver, one for the NULL terminator. */
	ndrivers = nperiph_drivers + 2;
	newdrivers = malloc(sizeof(*newdrivers) * ndrivers, M_CAMPERIPH, M_WAITOK);
	xpt_lock_buses();
	if (ndrivers != nperiph_drivers + 2) {
		/*
		 * Lost race against itself; go around.
		 */
		xpt_unlock_buses();
		free(newdrivers, M_CAMPERIPH);
		goto again;
	}
	if (periph_drivers)
		bcopy(periph_drivers, newdrivers, sizeof(*newdrivers) * nperiph_drivers);
	newdrivers[nperiph_drivers] = drv;
	newdrivers[nperiph_drivers + 1] = NULL;
	old = periph_drivers;
	periph_drivers = newdrivers;
	nperiph_drivers++;
	xpt_unlock_buses();
	if (old)
		free(old, M_CAMPERIPH);
	/* If driver marked as early or it is late now, initialize it. */
	if (((drv->flags & CAM_PERIPH_DRV_EARLY) != 0 && initialized > 0) || initialized > 1)
		(*drv->init)();
}

/*
 * Run the init routines of the registered drivers that belong to the
 * current initialization level.
 *
 * "initialized" is raised to "level" if that is higher.  Each driver is
 * classed as 1 when it has CAM_PERIPH_DRV_EARLY set and 2 otherwise, and
 * its init routine is called when that class equals "initialized" (not
 * "level", so a call with a lower level than a previous one re-selects the
 * previously reached class).  The array is walked without a NULL check on
 * periph_drivers itself, so at least one driver must have been registered.
 */
void
periphdriver_init(int level)
{
	int i, early;

	initialized = max(initialized, level);
	for (i = 0; periph_drivers[i] != NULL; i++) {
		early = (periph_drivers[i]->flags & CAM_PERIPH_DRV_EARLY) ? 1 : 2;
		if (early == initialized)
			(*periph_drivers[i]->init)();
	}
}

/*
 * Allocate and register a peripheral instance of driver "name" on "path".
 *
 * periph_oninvalidate, periph_dtor and periph_start are stored in the new
 * periph; periph_ctor is called with (periph, arg) as the last setup step.
 * ac_callback and code are only used when an invalidated periph of the
 * same name already exists on the path, in which case they are saved as a
 * deferred callback and CAM_REQ_INPROG is returned.
 *
 * Returns CAM_REQ_CMP on success (the status of periph_ctor), or
 * CAM_REQ_INVALID / CAM_RESRC_UNAVAIL / the failing step's status
 * otherwise; partially completed setup is unwound according to init_level.
 */
cam_status
cam_periph_alloc(periph_ctor_t *periph_ctor,
    periph_oninv_t *periph_oninvalidate,
    periph_dtor_t *periph_dtor, periph_start_t *periph_start,
    char *name, cam_periph_type type, struct cam_path *path,
    ac_callback_t *ac_callback, ac_code code, void *arg)
{
	struct periph_driver **p_drv;
	struct cam_sim *sim;
	struct cam_periph *periph;
	struct cam_periph *cur_periph;
	path_id_t path_id;
	target_id_t target_id;
	lun_id_t lun_id;
	cam_status status;
	u_int init_level;

	/* Counts completed setup steps; drives the cleanup switch at "failure". */
	init_level = 0;
/* * Handle Hot-Plug scenarios. If there is already a peripheral * of our type assigned to this path, we are likely waiting for * final close on an old, invalidated, peripheral. If this is * the case, queue up a deferred call to the peripheral's async * handler. If it looks like a mistaken re-allocation, complain. 
*/
	if ((periph = cam_periph_find(path, name)) != NULL) {
		if ((periph->flags & CAM_PERIPH_INVALID) != 0 && (periph->flags & CAM_PERIPH_NEW_DEV_FOUND) == 0) {
			/*
			 * Old instance is invalidated and nobody has queued a
			 * replacement yet: remember the async callback and
			 * code on the existing periph.
			 */
			periph->flags |= CAM_PERIPH_NEW_DEV_FOUND;
			periph->deferred_callback = ac_callback;
			periph->deferred_ac = code;
			return (CAM_REQ_INPROG);
		} else {
			printf("cam_periph_alloc: attempt to re-allocate "
			    "valid device %s%d rejected flags %#x "
			    "refcount %d\n", periph->periph_name, periph->unit_number, periph->flags, periph->refcount);
		}
		return (CAM_REQ_INVALID);
	}

	periph = (struct cam_periph *)malloc(sizeof(*periph), M_CAMPERIPH, M_NOWAIT|M_ZERO);
	if (periph == NULL)
		return (CAM_RESRC_UNAVAIL);

	/* Level 1: the periph structure is allocated. */
	init_level++;

	/* Capture the addressing of the caller's path before "path" is reused below. */
	sim = xpt_path_sim(path);
	path_id = xpt_path_path_id(path);
	target_id = xpt_path_target_id(path);
	lun_id = xpt_path_lun_id(path);
	periph->periph_start = periph_start;
	periph->periph_dtor = periph_dtor;
	periph->periph_oninval = periph_oninvalidate;
	periph->type = type;
	periph->periph_name = name;
	periph->scheduled_priority = CAM_PRIORITY_NONE;
	periph->immediate_priority = CAM_PRIORITY_NONE;
	periph->refcount = 1;	/* Dropped by invalidation. */
	periph->sim = sim;
	SLIST_INIT(&periph->ccb_list);
	/*
	 * Create a path of our own for this periph; the "path" parameter is
	 * overwritten with the new path on success.
	 */
	status = xpt_create_path(&path, periph, path_id, target_id, lun_id);
	if (status != CAM_REQ_CMP)
		goto failure;
	periph->path = path;

	/* Look the driver up by name under the bus lock. */
	xpt_lock_buses();
	for (p_drv = periph_drivers; *p_drv != NULL; p_drv++) {
		if (strcmp((*p_drv)->driver_name, name) == 0)
			break;
	}
	if (*p_drv == NULL) {
		/* Unknown driver name: undo the path and allocation by hand. */
		printf("cam_periph_alloc: invalid periph name '%s'\n", name);
		xpt_unlock_buses();
		xpt_free_path(periph->path);
		free(periph, M_CAMPERIPH);
		return (CAM_REQ_INVALID);
	}
	periph->unit_number = camperiphunit(*p_drv, path_id, target_id, lun_id);
	/*
	 * Insert in front of the first unit with an equal or larger unit
	 * number (equal is asserted not to happen), or at the tail.  The
	 * driver's generation count is bumped only on the tail insert.
	 */
	cur_periph = TAILQ_FIRST(&(*p_drv)->units);
	while (cur_periph != NULL && cur_periph->unit_number < periph->unit_number)
		cur_periph = TAILQ_NEXT(cur_periph, unit_links);
	if (cur_periph != NULL) {
		KASSERT(cur_periph->unit_number != periph->unit_number, ("duplicate units on periph list"));
		TAILQ_INSERT_BEFORE(cur_periph, periph, unit_links);
	} else {
		TAILQ_INSERT_TAIL(&(*p_drv)->units, periph, unit_links);
		(*p_drv)->generation++;
	}
	xpt_unlock_buses();

	/* Level 2: path created and periph linked on the driver's unit list. */
	init_level++;

	status = xpt_add_periph(periph);
	if (status != CAM_REQ_CMP)
		goto failure;

	/* Level 3: periph added via xpt_add_periph(). */
	init_level++;
	CAM_DEBUG(periph->path, CAM_DEBUG_INFO, ("Periph created\n"));

	status = periph_ctor(periph, arg);

	/* Level 4: the driver's constructor succeeded. */
	if (status == CAM_REQ_CMP)
		init_level++;

failure:
	/*
	 * Reached on success as well as on error: undo exactly the steps
	 * that completed, newest first, by falling through the cases.
	 */
	switch (init_level) {
	case 4:
		/* Initialized successfully */
		break;
	case 3:
		CAM_DEBUG(periph->path, CAM_DEBUG_INFO, ("Periph destroyed\n"));
		xpt_remove_periph(periph);
		/* FALLTHROUGH */
	case 2:
		xpt_lock_buses();
		TAILQ_REMOVE(&(*p_drv)->units, periph, unit_links);
		xpt_unlock_buses();
		xpt_free_path(periph->path);
		/* FALLTHROUGH */
	case 1:
		free(periph, M_CAMPERIPH);
		/* FALLTHROUGH */
	case 0:
		/* No cleanup to perform. */
		break;
	default:
		panic("%s: Unknown init level", __func__);
	}
	return(status);
}
/* * Find a peripheral structure with the specified path, target, lun, * and (optionally) type. If the name is NULL, this function will return * the first peripheral driver that matches the specified path. 
*/ struct cam_periph * cam_periph_find(struct cam_path *path, char *name) { struct periph_driver **p_drv; struct cam_periph *periph; xpt_lock_buses(); for (p_drv = periph_drivers; *p_drv != NULL; p_drv++) { if (name != NULL && (strcmp((*p_drv)->driver_name, name) != 0)) continue; TAILQ_FOREACH(periph, &(*p_drv)->units, unit_links) { if (xpt_path_comp(periph->path, path) == 0) { xpt_unlock_buses(); cam_periph_assert(periph, MA_OWNED); return(periph); } } if (name != NULL) { xpt_unlock_buses(); return(NULL); } } xpt_unlock_buses(); return(NULL); } /* * Find peripheral driver instances attached to the specified path. */ int cam_periph_list(struct cam_path *path, struct sbuf *sb) { struct sbuf local_sb; struct periph_driver **p_drv; struct cam_periph *periph; int count; int sbuf_alloc_len; sbuf_alloc_len = 16; retry: sbuf_new(&local_sb, NULL, sbuf_alloc_len, SBUF_FIXEDLEN); count = 0; xpt_lock_buses(); for (p_drv = periph_drivers; *p_drv != NULL; p_drv++) { TAILQ_FOREACH(periph, &(*p_drv)->units, unit_links) { if (xpt_path_comp(periph->path, path) != 0) continue; if (sbuf_len(&local_sb) != 0) sbuf_cat(&local_sb, ","); sbuf_printf(&local_sb, "%s%d", periph->periph_name, periph->unit_number); if (sbuf_error(&local_sb) == ENOMEM) { sbuf_alloc_len *= 2; xpt_unlock_buses(); sbuf_delete(&local_sb); goto retry; } count++; } } xpt_unlock_buses(); sbuf_finish(&local_sb); sbuf_cpy(sb, sbuf_data(&local_sb)); sbuf_delete(&local_sb); return (count); } cam_status cam_periph_acquire(struct cam_periph *periph) { cam_status status; status = CAM_REQ_CMP_ERR; if (periph == NULL) return (status); xpt_lock_buses(); if ((periph->flags & CAM_PERIPH_INVALID) == 0) { periph->refcount++; status = CAM_REQ_CMP; } xpt_unlock_buses(); return (status); } void cam_periph_doacquire(struct cam_periph *periph) { xpt_lock_buses(); KASSERT(periph->refcount >= 1, ("cam_periph_doacquire() with refcount == %d", periph->refcount)); periph->refcount++; xpt_unlock_buses(); } void 
cam_periph_release_locked_buses(struct cam_periph *periph) { cam_periph_assert(periph, MA_OWNED); KASSERT(periph->refcount >= 1, ("periph->refcount >= 1")); if (--periph->refcount == 0) camperiphfree(periph); } void cam_periph_release_locked(struct cam_periph *periph) { if (periph == NULL) return; xpt_lock_buses(); cam_periph_release_locked_buses(periph); xpt_unlock_buses(); } void cam_periph_release(struct cam_periph *periph) { struct mtx *mtx; if (periph == NULL) return; cam_periph_assert(periph, MA_NOTOWNED); mtx = cam_periph_mtx(periph); mtx_lock(mtx); cam_periph_release_locked(periph); mtx_unlock(mtx); } int cam_periph_hold(struct cam_periph *periph, int priority) { int error; /* * Increment the reference count on the peripheral * while we wait for our lock attempt to succeed * to ensure the peripheral doesn't disappear out * from user us while we sleep. */ if (cam_periph_acquire(periph) != CAM_REQ_CMP) return (ENXIO); cam_periph_assert(periph, MA_OWNED); while ((periph->flags & CAM_PERIPH_LOCKED) != 0) { periph->flags |= CAM_PERIPH_LOCK_WANTED; if ((error = cam_periph_sleep(periph, periph, priority, "caplck", 0)) != 0) { cam_periph_release_locked(periph); return (error); } if (periph->flags & CAM_PERIPH_INVALID) { cam_periph_release_locked(periph); return (ENXIO); } } periph->flags |= CAM_PERIPH_LOCKED; return (0); } void cam_periph_unhold(struct cam_periph *periph) { cam_periph_assert(periph, MA_OWNED); periph->flags &= ~CAM_PERIPH_LOCKED; if ((periph->flags & CAM_PERIPH_LOCK_WANTED) != 0) { periph->flags &= ~CAM_PERIPH_LOCK_WANTED; wakeup(periph); } cam_periph_release_locked(periph); } /* * Look for the next unit number that is not currently in use for this * peripheral type starting at "newunit". Also exclude unit numbers that * are reserved by for future "hardwiring" unless we already know that this * is a potential wired device. 
Only assume that the device is "wired" the
 * first time through the loop since after that we'll be looking at unit
 * numbers that did not match a wiring entry.
 *
 * "pathid", "target" and "lun" are only used in the diagnostic that is
 * printed when a wired unit number turns out to be in use already.
 * Returns the unit number that was selected.
 */
static u_int
camperiphnextunit(struct periph_driver *p_drv, u_int newunit, int wired,
		  path_id_t pathid, target_id_t target, lun_id_t lun)
{
	struct	cam_periph *periph;
	char	*periph_name;
	int	i, val, dunit, r;
	const char *dname, *strval;

	periph_name = p_drv->driver_name;
	for (;;newunit++) {

		/* Search this driver's unit list for a unit numbered newunit. */
		for (periph = TAILQ_FIRST(&p_drv->units);
		     periph != NULL && periph->unit_number != newunit;
		     periph = TAILQ_NEXT(periph, unit_links))
			;

		if (periph != NULL && periph->unit_number == newunit) {
			/*
			 * The number is taken.  Complain if it was a wired
			 * request, then fall back to an unwired search
			 * starting with the next number.
			 */
			if (wired != 0) {
				xpt_print(periph->path, "Duplicate Wired "
				    "Device entry!\n");
				xpt_print(periph->path, "Second device (%s "
				    "device at scbus%d target %d lun %d) will "
				    "not be wired\n", periph_name, pathid,
				    target, lun);
				wired = 0;
			}
			continue;
		}
		/* A free number that was wired to this device: take it. */
		if (wired)
			break;

		/*
		 * Don't match entries like "da 4" as a wired down
		 * device, but do match entries like "da 4 target 5"
		 * or even "da 4 scbus 1".
		 */
		i = 0;
		dname = periph_name;
		for (;;) {
			r = resource_find_dev(&i, dname, &dunit, NULL, NULL);
			if (r != 0)
				break;
			/* if no "target" and no specific scbus, skip */
			if (resource_int_value(dname, dunit, "target", &val) &&
			    (resource_string_value(dname, dunit, "at",&strval)||
			     strcmp(strval, "scbus") == 0))
				continue;
			/* newunit is reserved by this hint; try the next one. */
			if (newunit == dunit)
				break;
		}
		/* All hints were scanned without a match: newunit is usable. */
		if (r != 0)
			break;
	}
	return (newunit);
}

/*
 * Choose a unit number for a new instance of driver "p_drv" at the given
 * bus (pathid), target and lun.
 *
 * The device hints for the driver name are scanned first.  A hint entry
 * matches when every one of the "at", "target" and "lun" values it
 * provides agrees with the arguments and it provides at least one of
 * them; its unit number then becomes the starting point.  The final
 * choice is made by camperiphnextunit().
 */
static u_int
camperiphunit(struct periph_driver *p_drv, path_id_t pathid,
	      target_id_t target, lun_id_t lun)
{
	u_int	unit;
	int	wired, i, val, dunit;
	const char *dname, *strval;
	char	pathbuf[32], *periph_name;

	periph_name = p_drv->driver_name;
	/* "at" hints are compared against the string "scbus<pathid>". */
	snprintf(pathbuf, sizeof(pathbuf), "scbus%d", pathid);
	unit = 0;
	i = 0;
	dname = periph_name;
	/* "wired" counts the hint values matched by the current entry. */
	for (wired = 0; resource_find_dev(&i, dname, &dunit, NULL, NULL) == 0;
	     wired = 0) {
		if (resource_string_value(dname, dunit, "at", &strval) == 0) {
			if (strcmp(strval, pathbuf) != 0)
				continue;
			wired++;
		}
		if (resource_int_value(dname, dunit, "target", &val) == 0) {
			if (val != target)
				continue;
			wired++;
		}
		if (resource_int_value(dname, dunit, "lun", &val) == 0) {
			if (val != lun)
				continue;
			wired++;
		}
		if (wired != 0) {
			unit = dunit;
			break;
		}
	}

	/*
	 * Either start from 0 looking for the next unit or from
	 * the unit number given in the resource config.  This way,
	 * if we have wildcard matches, we don't return the same
	 * unit number twice.
	 */
	unit = camperiphnextunit(p_drv, unit, wired, pathid, target, lun);

	return (unit);
}

/*
 * Mark "periph" invalid, run its oninvalidate hook if it has one, and
 * drop one reference with cam_periph_release_locked().
 */
void
cam_periph_invalidate(struct cam_periph *periph)
{

	cam_periph_assert(periph, MA_OWNED);
	/*
	 * Only the first invalidation of a peripheral does any work;
	 * later calls return immediately.
	 */
	if ((periph->flags & CAM_PERIPH_INVALID) != 0)
		return;

	CAM_DEBUG(periph->path, CAM_DEBUG_INFO, ("Periph invalidated\n"));
	if ((periph->flags & CAM_PERIPH_ANNOUNCED) && !rebooting)
		xpt_denounce_periph(periph);
	periph->flags |= CAM_PERIPH_INVALID;
	periph->flags &= ~CAM_PERIPH_NEW_DEV_FOUND;
	if (periph->periph_oninval != NULL)
		periph->periph_oninval(periph);
	cam_periph_release_locked(periph);
}

/*
 * Tear down and free "periph".
 *
 * The function drops the topology lock with xpt_unlock_buses() part way
 * through and takes it again with xpt_lock_buses() just before returning.
 * If the peripheral carries CAM_PERIPH_NEW_DEV_FOUND, the deferred async
 * callback recorded in it is invoked before the path and the structure
 * are freed.
 */
static void
camperiphfree(struct cam_periph *periph)
{
	struct periph_driver **p_drv;

	cam_periph_assert(periph, MA_OWNED);
	KASSERT(periph->periph_allocating == 0, ("%s%d: freed while allocating",
	    periph->periph_name, periph->unit_number));
	/* Find the driver this peripheral belongs to, by name. */
	for (p_drv = periph_drivers; *p_drv != NULL; p_drv++) {
		if (strcmp((*p_drv)->driver_name, periph->periph_name) == 0)
			break;
	}
	if (*p_drv == NULL) {
		printf("camperiphfree: attempt to free non-existant periph\n");
		return;
	}

	/*
	 * We need to set this flag before dropping the topology lock, to
	 * let anyone who is traversing the list know that this peripheral
	 * is about to be freed, and there will be no more reference count
	 * checks.
	 */
	periph->flags |= CAM_PERIPH_FREE;

	/*
	 * The peripheral destructor semantics dictate calling with only the
	 * SIM mutex held.  Since it might sleep, it should not be called
	 * with the topology lock held.
	 */
	xpt_unlock_buses();

	/*
	 * We need to call the peripheral destructor prior to removing the
	 * peripheral from the list.  Otherwise, we risk running into a
	 * scenario where the peripheral unit number may get reused
	 * (because it has been removed from the list), but some resources
	 * used by the peripheral are still hanging around.  In particular,
	 * the devfs nodes used by some peripherals like the pass(4) driver
	 * aren't fully cleaned up until the destructor is run.  If the
	 * unit number is reused before the devfs instance is fully gone,
	 * devfs will panic.
	 */
	if (periph->periph_dtor != NULL)
		periph->periph_dtor(periph);

	/*
	 * The peripheral list is protected by the topology lock.
	 */
	xpt_lock_buses();

	TAILQ_REMOVE(&(*p_drv)->units, periph, unit_links);
	(*p_drv)->generation++;

	xpt_remove_periph(periph);

	xpt_unlock_buses();
	if ((periph->flags & CAM_PERIPH_ANNOUNCED) && !rebooting)
		xpt_print(periph->path, "Periph destroyed\n");
	else
		CAM_DEBUG(periph->path, CAM_DEBUG_INFO, ("Periph destroyed\n"));

	/*
	 * A new device was reported for this path while the peripheral was
	 * invalid; replay the deferred async callback now.
	 */
	if (periph->flags & CAM_PERIPH_NEW_DEV_FOUND) {
		union ccb ccb;
		void *arg;

		/*
		 * NOTE(review): func_code is assigned before xpt_setup_ccb()
		 * is called on the same header -- confirm that
		 * xpt_setup_ccb() does not reset it.
		 */
		switch (periph->deferred_ac) {
		case AC_FOUND_DEVICE:
			ccb.ccb_h.func_code = XPT_GDEV_TYPE;
			xpt_setup_ccb(&ccb.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
			xpt_action(&ccb);
			arg = &ccb;
			break;
		case AC_PATH_REGISTERED:
			ccb.ccb_h.func_code = XPT_PATH_INQ;
			xpt_setup_ccb(&ccb.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
			xpt_action(&ccb);
			arg = &ccb;
			break;
		default:
			arg = NULL;
			break;
		}
		periph->deferred_callback(NULL, periph->deferred_ac,
					  periph->path, arg);
	}
	xpt_free_path(periph->path);
	free(periph, M_CAMPERIPH);
	/* Return with the topology lock held again. */
	xpt_lock_buses();
}

/*
 * Map user virtual pointers into kernel virtual address space, so we can
 * access the memory.  This is now a generic function that centralizes most
 * of the sanity checks on the data flags, if any.
 * This also only works for up to MAXPHYS memory.  Since we use
 * buffers to map stuff in and out, we're limited to the buffer size.
*/ int cam_periph_mapmem(union ccb *ccb, struct cam_periph_map_info *mapinfo, u_int maxmap) { int numbufs, i, j; int flags[CAM_PERIPH_MAXMAPS]; u_int8_t **data_ptrs[CAM_PERIPH_MAXMAPS]; u_int32_t lengths[CAM_PERIPH_MAXMAPS]; u_int32_t dirs[CAM_PERIPH_MAXMAPS]; if (maxmap == 0) maxmap = DFLTPHYS; /* traditional default */ else if (maxmap > MAXPHYS) maxmap = MAXPHYS; /* for safety */ switch(ccb->ccb_h.func_code) { case XPT_DEV_MATCH: if (ccb->cdm.match_buf_len == 0) { printf("cam_periph_mapmem: invalid match buffer " "length 0\n"); return(EINVAL); } if (ccb->cdm.pattern_buf_len > 0) { data_ptrs[0] = (u_int8_t **)&ccb->cdm.patterns; lengths[0] = ccb->cdm.pattern_buf_len; dirs[0] = CAM_DIR_OUT; data_ptrs[1] = (u_int8_t **)&ccb->cdm.matches; lengths[1] = ccb->cdm.match_buf_len; dirs[1] = CAM_DIR_IN; numbufs = 2; } else { data_ptrs[0] = (u_int8_t **)&ccb->cdm.matches; lengths[0] = ccb->cdm.match_buf_len; dirs[0] = CAM_DIR_IN; numbufs = 1; } /* * This request will not go to the hardware, no reason * to be so strict. vmapbuf() is able to map up to MAXPHYS. 
*/ maxmap = MAXPHYS; break; case XPT_SCSI_IO: case XPT_CONT_TARGET_IO: if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE) return(0); if ((ccb->ccb_h.flags & CAM_DATA_MASK) != CAM_DATA_VADDR) return (EINVAL); data_ptrs[0] = &ccb->csio.data_ptr; lengths[0] = ccb->csio.dxfer_len; dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK; numbufs = 1; break; case XPT_ATA_IO: if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE) return(0); if ((ccb->ccb_h.flags & CAM_DATA_MASK) != CAM_DATA_VADDR) return (EINVAL); data_ptrs[0] = &ccb->ataio.data_ptr; lengths[0] = ccb->ataio.dxfer_len; dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK; numbufs = 1; break; case XPT_SMP_IO: data_ptrs[0] = &ccb->smpio.smp_request; lengths[0] = ccb->smpio.smp_request_len; dirs[0] = CAM_DIR_OUT; data_ptrs[1] = &ccb->smpio.smp_response; lengths[1] = ccb->smpio.smp_response_len; dirs[1] = CAM_DIR_IN; numbufs = 2; break; case XPT_DEV_ADVINFO: if (ccb->cdai.bufsiz == 0) return (0); data_ptrs[0] = (uint8_t **)&ccb->cdai.buf; lengths[0] = ccb->cdai.bufsiz; dirs[0] = CAM_DIR_IN; numbufs = 1; /* * This request will not go to the hardware, no reason * to be so strict. vmapbuf() is able to map up to MAXPHYS. */ maxmap = MAXPHYS; break; default: return(EINVAL); break; /* NOTREACHED */ } /* * Check the transfer length and permissions first, so we don't * have to unmap any previously mapped buffers. */ for (i = 0; i < numbufs; i++) { flags[i] = 0; /* * The userland data pointer passed in may not be page * aligned. vmapbuf() truncates the address to a page * boundary, so if the address isn't page aligned, we'll * need enough space for the given transfer length, plus * whatever extra space is necessary to make it to the page * boundary. 
*/ if ((lengths[i] + (((vm_offset_t)(*data_ptrs[i])) & PAGE_MASK)) > maxmap){ printf("cam_periph_mapmem: attempt to map %lu bytes, " "which is greater than %lu\n", (long)(lengths[i] + (((vm_offset_t)(*data_ptrs[i])) & PAGE_MASK)), (u_long)maxmap); return(E2BIG); } if (dirs[i] & CAM_DIR_OUT) { flags[i] = BIO_WRITE; } if (dirs[i] & CAM_DIR_IN) { flags[i] = BIO_READ; } } /* * This keeps the kernel stack of current thread from getting * swapped. In low-memory situations where the kernel stack might * otherwise get swapped out, this holds it and allows the thread * to make progress and release the kernel mapped pages sooner. * * XXX KDM should I use P_NOSWAP instead? */ PHOLD(curproc); for (i = 0; i < numbufs; i++) { /* * Get the buffer. */ mapinfo->bp[i] = getpbuf(NULL); /* put our pointer in the data slot */ mapinfo->bp[i]->b_data = *data_ptrs[i]; /* save the user's data address */ mapinfo->bp[i]->b_caller1 = *data_ptrs[i]; /* set the transfer length, we know it's < MAXPHYS */ mapinfo->bp[i]->b_bufsize = lengths[i]; /* set the direction */ mapinfo->bp[i]->b_iocmd = flags[i]; /* * Map the buffer into kernel memory. * * Note that useracc() alone is not a sufficient test. * vmapbuf() can still fail due to a smaller file mapped * into a larger area of VM, or if userland races against * vmapbuf() after the useracc() check. */ if (vmapbuf(mapinfo->bp[i], 1) < 0) { for (j = 0; j < i; ++j) { *data_ptrs[j] = mapinfo->bp[j]->b_caller1; vunmapbuf(mapinfo->bp[j]); relpbuf(mapinfo->bp[j], NULL); } relpbuf(mapinfo->bp[i], NULL); PRELE(curproc); return(EACCES); } /* set our pointer to the new mapped area */ *data_ptrs[i] = mapinfo->bp[i]->b_data; mapinfo->num_bufs_used++; } /* * Now that we've gotten this far, change ownership to the kernel * of the buffers so that we don't run afoul of returning to user * space with locks (on the buffer) held. 
*/ for (i = 0; i < numbufs; i++) { BUF_KERNPROC(mapinfo->bp[i]); } return(0); } /* * Unmap memory segments mapped into kernel virtual address space by * cam_periph_mapmem(). */ void cam_periph_unmapmem(union ccb *ccb, struct cam_periph_map_info *mapinfo) { int numbufs, i; u_int8_t **data_ptrs[CAM_PERIPH_MAXMAPS]; if (mapinfo->num_bufs_used <= 0) { /* nothing to free and the process wasn't held. */ return; } switch (ccb->ccb_h.func_code) { case XPT_DEV_MATCH: numbufs = min(mapinfo->num_bufs_used, 2); if (numbufs == 1) { data_ptrs[0] = (u_int8_t **)&ccb->cdm.matches; } else { data_ptrs[0] = (u_int8_t **)&ccb->cdm.patterns; data_ptrs[1] = (u_int8_t **)&ccb->cdm.matches; } break; case XPT_SCSI_IO: case XPT_CONT_TARGET_IO: data_ptrs[0] = &ccb->csio.data_ptr; numbufs = min(mapinfo->num_bufs_used, 1); break; case XPT_ATA_IO: data_ptrs[0] = &ccb->ataio.data_ptr; numbufs = min(mapinfo->num_bufs_used, 1); break; case XPT_SMP_IO: numbufs = min(mapinfo->num_bufs_used, 2); data_ptrs[0] = &ccb->smpio.smp_request; data_ptrs[1] = &ccb->smpio.smp_response; break; case XPT_DEV_ADVINFO: numbufs = min(mapinfo->num_bufs_used, 1); data_ptrs[0] = (uint8_t **)&ccb->cdai.buf; break; default: /* allow ourselves to be swapped once again */ PRELE(curproc); return; break; /* NOTREACHED */ } for (i = 0; i < numbufs; i++) { /* Set the user's pointer back to the original value */ *data_ptrs[i] = mapinfo->bp[i]->b_caller1; /* unmap the buffer */ vunmapbuf(mapinfo->bp[i]); /* release the buffer */ relpbuf(mapinfo->bp[i], NULL); } /* allow ourselves to be swapped once again */ PRELE(curproc); } int cam_periph_ioctl(struct cam_periph *periph, u_long cmd, caddr_t addr, int (*error_routine)(union ccb *ccb, cam_flags camflags, u_int32_t sense_flags)) { union ccb *ccb; int error; int found; error = found = 0; switch(cmd){ case CAMGETPASSTHRU: ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL); xpt_setup_ccb(&ccb->ccb_h, ccb->ccb_h.path, CAM_PRIORITY_NORMAL); ccb->ccb_h.func_code = XPT_GDEVLIST; /* * 
Basically, the point of this is that we go through * getting the list of devices, until we find a passthrough * device. In the current version of the CAM code, the * only way to determine what type of device we're dealing * with is by its name. */ while (found == 0) { ccb->cgdl.index = 0; ccb->cgdl.status = CAM_GDEVLIST_MORE_DEVS; while (ccb->cgdl.status == CAM_GDEVLIST_MORE_DEVS) { /* we want the next device in the list */ xpt_action(ccb); if (strncmp(ccb->cgdl.periph_name, "pass", 4) == 0){ found = 1; break; } } if ((ccb->cgdl.status == CAM_GDEVLIST_LAST_DEVICE) && (found == 0)) { ccb->cgdl.periph_name[0] = '\0'; ccb->cgdl.unit_number = 0; break; } } /* copy the result back out */ bcopy(ccb, addr, sizeof(union ccb)); /* and release the ccb */ xpt_release_ccb(ccb); break; default: error = ENOTTY; break; } return(error); } static void cam_periph_done_panic(struct cam_periph *periph, union ccb *done_ccb) { panic("%s: already done with ccb %p", __func__, done_ccb); } static void cam_periph_done(struct cam_periph *periph, union ccb *done_ccb) { /* Caller will release the CCB */ xpt_path_assert(done_ccb->ccb_h.path, MA_OWNED); done_ccb->ccb_h.cbfcnp = cam_periph_done_panic; wakeup(&done_ccb->ccb_h.cbfcnp); } static void cam_periph_ccbwait(union ccb *ccb) { if ((ccb->ccb_h.func_code & XPT_FC_QUEUED) != 0) { while (ccb->ccb_h.cbfcnp != cam_periph_done_panic) xpt_path_sleep(ccb->ccb_h.path, &ccb->ccb_h.cbfcnp, PRIBIO, "cbwait", 0); } KASSERT(ccb->ccb_h.pinfo.index == CAM_UNQUEUED_INDEX && (ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_INPROG, ("%s: proceeding with incomplete ccb: ccb=%p, func_code=%#x, " "status=%#x, index=%d", __func__, ccb, ccb->ccb_h.func_code, ccb->ccb_h.status, ccb->ccb_h.pinfo.index)); } int cam_periph_runccb(union ccb *ccb, int (*error_routine)(union ccb *ccb, cam_flags camflags, u_int32_t sense_flags), cam_flags camflags, u_int32_t sense_flags, struct devstat *ds) { struct bintime *starttime; struct bintime ltime; int error; starttime = NULL; 
xpt_path_assert(ccb->ccb_h.path, MA_OWNED); KASSERT((ccb->ccb_h.flags & CAM_UNLOCKED) == 0, ("%s: ccb=%p, func_code=%#x, flags=%#x", __func__, ccb, ccb->ccb_h.func_code, ccb->ccb_h.flags)); /* * If the user has supplied a stats structure, and if we understand * this particular type of ccb, record the transaction start. */ if ((ds != NULL) && (ccb->ccb_h.func_code == XPT_SCSI_IO || ccb->ccb_h.func_code == XPT_ATA_IO)) { starttime = <ime; binuptime(starttime); devstat_start_transaction(ds, starttime); } ccb->ccb_h.cbfcnp = cam_periph_done; xpt_action(ccb); do { cam_periph_ccbwait(ccb); if ((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) error = 0; else if (error_routine != NULL) { ccb->ccb_h.cbfcnp = cam_periph_done; error = (*error_routine)(ccb, camflags, sense_flags); } else error = 0; } while (error == ERESTART); if ((ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) { cam_release_devq(ccb->ccb_h.path, /* relsim_flags */0, /* openings */0, /* timeout */0, /* getcount_only */ FALSE); ccb->ccb_h.status &= ~CAM_DEV_QFRZN; } if (ds != NULL) { if (ccb->ccb_h.func_code == XPT_SCSI_IO) { devstat_end_transaction(ds, ccb->csio.dxfer_len - ccb->csio.resid, ccb->csio.tag_action & 0x3, ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE) ? DEVSTAT_NO_DATA : (ccb->ccb_h.flags & CAM_DIR_OUT) ? DEVSTAT_WRITE : DEVSTAT_READ, NULL, starttime); } else if (ccb->ccb_h.func_code == XPT_ATA_IO) { devstat_end_transaction(ds, ccb->ataio.dxfer_len - ccb->ataio.resid, 0, /* Not used in ATA */ ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE) ? DEVSTAT_NO_DATA : (ccb->ccb_h.flags & CAM_DIR_OUT) ? 
DEVSTAT_WRITE : DEVSTAT_READ, NULL, starttime); } } return(error); } void cam_freeze_devq(struct cam_path *path) { struct ccb_hdr ccb_h; CAM_DEBUG(path, CAM_DEBUG_TRACE, ("cam_freeze_devq\n")); xpt_setup_ccb(&ccb_h, path, /*priority*/1); ccb_h.func_code = XPT_NOOP; ccb_h.flags = CAM_DEV_QFREEZE; xpt_action((union ccb *)&ccb_h); } u_int32_t cam_release_devq(struct cam_path *path, u_int32_t relsim_flags, u_int32_t openings, u_int32_t arg, int getcount_only) { struct ccb_relsim crs; CAM_DEBUG(path, CAM_DEBUG_TRACE, ("cam_release_devq(%u, %u, %u, %d)\n", relsim_flags, openings, arg, getcount_only)); xpt_setup_ccb(&crs.ccb_h, path, CAM_PRIORITY_NORMAL); crs.ccb_h.func_code = XPT_REL_SIMQ; crs.ccb_h.flags = getcount_only ? CAM_DEV_QFREEZE : 0; crs.release_flags = relsim_flags; crs.openings = openings; crs.release_timeout = arg; xpt_action((union ccb *)&crs); return (crs.qfrozen_cnt); } #define saved_ccb_ptr ppriv_ptr0 static void camperiphdone(struct cam_periph *periph, union ccb *done_ccb) { union ccb *saved_ccb; cam_status status; struct scsi_start_stop_unit *scsi_cmd; int error_code, sense_key, asc, ascq; scsi_cmd = (struct scsi_start_stop_unit *) &done_ccb->csio.cdb_io.cdb_bytes; status = done_ccb->ccb_h.status; if ((status & CAM_STATUS_MASK) != CAM_REQ_CMP) { if (scsi_extract_sense_ccb(done_ccb, &error_code, &sense_key, &asc, &ascq)) { /* * If the error is "invalid field in CDB", * and the load/eject flag is set, turn the * flag off and try again. This is just in * case the drive in question barfs on the * load eject flag. The CAM code should set * the load/eject flag by default for * removable media. 
*/ if ((scsi_cmd->opcode == START_STOP_UNIT) && ((scsi_cmd->how & SSS_LOEJ) != 0) && (asc == 0x24) && (ascq == 0x00)) { scsi_cmd->how &= ~SSS_LOEJ; if (status & CAM_DEV_QFRZN) { cam_release_devq(done_ccb->ccb_h.path, 0, 0, 0, 0); done_ccb->ccb_h.status &= ~CAM_DEV_QFRZN; } xpt_action(done_ccb); goto out; } } if (cam_periph_error(done_ccb, 0, SF_RETRY_UA | SF_NO_PRINT, NULL) == ERESTART) goto out; if (done_ccb->ccb_h.status & CAM_DEV_QFRZN) { cam_release_devq(done_ccb->ccb_h.path, 0, 0, 0, 0); done_ccb->ccb_h.status &= ~CAM_DEV_QFRZN; } } else { /* * If we have successfully taken a device from the not * ready to ready state, re-scan the device and re-get * the inquiry information. Many devices (mostly disks) * don't properly report their inquiry information unless * they are spun up. */ if (scsi_cmd->opcode == START_STOP_UNIT) xpt_async(AC_INQ_CHANGED, done_ccb->ccb_h.path, NULL); } /* * Perform the final retry with the original CCB so that final * error processing is performed by the owner of the CCB. */ saved_ccb = (union ccb *)done_ccb->ccb_h.saved_ccb_ptr; bcopy(saved_ccb, done_ccb, sizeof(*done_ccb)); xpt_free_ccb(saved_ccb); if (done_ccb->ccb_h.cbfcnp != camperiphdone) periph->flags &= ~CAM_PERIPH_RECOVERY_INPROG; xpt_action(done_ccb); out: /* Drop freeze taken due to CAM_DEV_QFREEZE flag set. */ cam_release_devq(done_ccb->ccb_h.path, 0, 0, 0, 0); } /* * Generic Async Event handler. Peripheral drivers usually * filter out the events that require personal attention, * and leave the rest to this function. 
*/
/*
 * Only AC_LOST_DEVICE is acted upon here: the peripheral is invalidated.
 * All other event codes are ignored; "path" and "arg" are not used.
 */
void
cam_periph_async(struct cam_periph *periph, u_int32_t code,
		 struct cam_path *path, void *arg)
{
	switch (code) {
	case AC_LOST_DEVICE:
		cam_periph_invalidate(periph);
		break; 
	default:
		break;
	}
}

/*
 * Fetch the device statistics of "periph" with XPT_GDEV_STATS and pass
 * the last_reset time found there, together with "bus_settle", to
 * cam_periph_freeze_after_event().
 */
void
cam_periph_bus_settle(struct cam_periph *periph, u_int bus_settle)
{
	struct ccb_getdevstats cgds;

	xpt_setup_ccb(&cgds.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
	cgds.ccb_h.func_code = XPT_GDEV_STATS;
	xpt_action((union ccb *)&cgds);
	cam_periph_freeze_after_event(periph, &cgds.last_reset, bus_settle);
}

/*
 * If less than "duration_ms" milliseconds have passed since "event_time",
 * freeze the device queue of "periph" and arrange for it to be released
 * after the remaining time.  Nothing is done when "event_time" is not set
 * or the duration has already elapsed.
 */
void
cam_periph_freeze_after_event(struct cam_periph *periph,
			      struct timeval* event_time, u_int duration_ms)
{
	struct timeval delta;
	struct timeval duration_tv;

	if (!timevalisset(event_time))
		return;

	/* delta = time elapsed since the event. */
	microtime(&delta);
	timevalsub(&delta, event_time);
	duration_tv.tv_sec = duration_ms / 1000;
	duration_tv.tv_usec = (duration_ms % 1000) * 1000;
	if (timevalcmp(&delta, &duration_tv, <)) {
		/* Convert the remaining time back to milliseconds. */
		timevalsub(&duration_tv, &delta);

		duration_ms = duration_tv.tv_sec * 1000;
		duration_ms += duration_tv.tv_usec / 1000;
		cam_freeze_devq(periph->path); 
		cam_release_devq(periph->path,
				RELSIM_RELEASE_AFTER_TIMEOUT,
				/*reduction*/0,
				/*timeout*/duration_ms,
				/*getcount_only*/0);
	}

}

/*
 * Translate the SCSI status byte of a failed CCB into an errno value.
 *
 * Returns 0 for the "good" statuses, ERESTART when the command should be
 * retried, and EIO otherwise.  CHECK CONDITION and COMMAND TERMINATED are
 * delegated to camperiphscsisenseerror().  The queue release parameters
 * for the caller are passed back through "openings", "relsim_flags" and
 * "timeout"; "action" and "action_string" are updated as well.
 */
static int
camperiphscsistatuserror(union ccb *ccb, union ccb **orig_ccb,
    cam_flags camflags, u_int32_t sense_flags,
    int *openings, u_int32_t *relsim_flags,
    u_int32_t *timeout, u_int32_t *action, const char **action_string)
{
	int error;

	switch (ccb->csio.scsi_status) {
	case SCSI_STATUS_OK:
	case SCSI_STATUS_COND_MET:
	case SCSI_STATUS_INTERMED:
	case SCSI_STATUS_INTERMED_COND_MET:
		error = 0;
		break;
	case SCSI_STATUS_CMD_TERMINATED:
	case SCSI_STATUS_CHECK_COND:
		error = camperiphscsisenseerror(ccb, orig_ccb,
					        camflags,
					        sense_flags,
					        openings,
					        relsim_flags,
					        timeout,
					        action,
					        action_string);
		break;
	case SCSI_STATUS_QUEUE_FULL:
	{
		/* no decrement */
		struct ccb_getdevstats cgds;

		/*
		 * First off, find out what the current
		 * transaction counts are.
		 */
		xpt_setup_ccb(&cgds.ccb_h,
			      ccb->ccb_h.path,
			      CAM_PRIORITY_NORMAL);
		cgds.ccb_h.func_code = XPT_GDEV_STATS;
		xpt_action((union ccb *)&cgds);

		/*
		 * If we were the only transaction active, treat
		 * the QUEUE FULL as if it were a BUSY condition.
		 */
		if (cgds.dev_active != 0) {
			int total_openings;

			/*
		 	 * Reduce the number of openings to
			 * be 1 less than the amount it took
			 * to get a queue full bounded by the
			 * minimum allowed tag count for this
			 * device.
		 	 */
			total_openings = cgds.dev_active + cgds.dev_openings;
			*openings = cgds.dev_active;
			if (*openings < cgds.mintags)
				*openings = cgds.mintags;
			if (*openings < total_openings)
				*relsim_flags = RELSIM_ADJUST_OPENINGS;
			else {
				/*
				 * Some devices report queue full for
				 * temporary resource shortages.  For
				 * this reason, we allow a minimum
				 * tag count to be entered via a
				 * quirk entry to prevent the queue
				 * count on these devices from falling
				 * to a pessimistically low value.  We
				 * still wait for the next successful
				 * completion, however, before queueing
				 * more transactions to the device.
				 */
				*relsim_flags = RELSIM_RELEASE_AFTER_CMDCMPLT;
			}
			*timeout = 0;
			error = ERESTART;
			*action &= ~SSQ_PRINT_SENSE;
			break;
		}
		/* FALLTHROUGH */
	}
	case SCSI_STATUS_BUSY:
		/*
		 * Restart the queue after either another
		 * command completes or a 1 second timeout.
		 *
		 * Because of the short-circuit below, retry_count is
		 * only decremented when SF_RETRY_BUSY is not set.
		 */
	 	if ((sense_flags & SF_RETRY_BUSY) != 0 ||
		    (ccb->ccb_h.retry_count--) > 0) {
			error = ERESTART;
			*relsim_flags = RELSIM_RELEASE_AFTER_TIMEOUT
				      | RELSIM_RELEASE_AFTER_CMDCMPLT;
			*timeout = 1000;
		} else {
			error = EIO;
		}
		break;
	case SCSI_STATUS_RESERV_CONFLICT:
	default:
		error = EIO;
		break;
	}
	return (error);
}

/*
 * Decide how to react to the sense data of a failed SCSI CCB.
 *
 * The action is chosen by scsi_error_action().  For the SS_START and
 * SS_TUR actions "ccb" is turned into a recovery command (START STOP UNIT
 * or TEST UNIT READY) that completes through camperiphdone(); a copy of
 * the original CCB is saved and also handed back through "orig".
 *
 * Returns 0 when no error is to be reported, ERESTART when the CCB is to
 * be requeued, or the error code encoded in the chosen action.
 */
static int
camperiphscsisenseerror(union ccb *ccb, union ccb **orig,
    cam_flags camflags, u_int32_t sense_flags,
    int *openings, u_int32_t *relsim_flags,
    u_int32_t *timeout, u_int32_t *action, const char **action_string)
{
	struct cam_periph *periph;
	union ccb *orig_ccb = ccb;
	int error, recoveryccb;

/*
 * NOTE(review): the leading '+' characters on the next five lines appear
 * to be unified-diff addition markers carried by this file rather than C
 * syntax; they are left untouched.
 */
+#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
+	if (ccb->ccb_h.func_code == XPT_SCSI_IO && ccb->csio.bio != NULL)
+		biotrack(ccb->csio.bio, __func__);
+#endif
+
	periph = xpt_path_periph(ccb->ccb_h.path);
	/* Is this CCB itself a recovery command issued from here? */
	recoveryccb = (ccb->ccb_h.cbfcnp == camperiphdone);
	if ((periph->flags & CAM_PERIPH_RECOVERY_INPROG) && !recoveryccb) {
		/*
		 * If error recovery is already in progress, don't attempt
		 * to process this error, but requeue it unconditionally
		 * and attempt to process it once error recovery has
		 * completed.  This failed command is probably related to
		 * the error that caused the currently active error recovery
		 * action so our  current recovery efforts should also
		 * address this command.  Be aware that the error recovery
		 * code assumes that only one recovery action is in progress
		 * on a particular peripheral instance at any given time
		 * (e.g. only one saved CCB for error recovery) so it is
		 * imperative that we don't violate this assumption.
		 */
		error = ERESTART;
		*action &= ~SSQ_PRINT_SENSE;
	} else {
		scsi_sense_action err_action;
		struct ccb_getdev cgd;

		/*
		 * Grab the inquiry data for this device.
		 */
		xpt_setup_ccb(&cgd.ccb_h, ccb->ccb_h.path, CAM_PRIORITY_NORMAL);
		cgd.ccb_h.func_code = XPT_GDEV_TYPE;
		xpt_action((union ccb *)&cgd);

		err_action = scsi_error_action(&ccb->csio, &cgd.inq_data,
		    sense_flags);
		error = err_action & SS_ERRMASK;

		/*
		 * Do not autostart sequential access devices
		 * to avoid unexpected tape loading.
		 */
		if ((err_action & SS_MASK) == SS_START &&
		    SID_TYPE(&cgd.inq_data) == T_SEQUENTIAL) {
			*action_string = "Will not autostart a "
			    "sequential access device";
			goto sense_error_done;
		}

		/*
		 * Avoid recovery recursion if recovery action is the same.
		 */
		if ((err_action & SS_MASK) >= SS_START && recoveryccb) {
			if (((err_action & SS_MASK) == SS_START &&
			     ccb->csio.cdb_io.cdb_bytes[0] == START_STOP_UNIT) ||
			    ((err_action & SS_MASK) == SS_TUR &&
			     (ccb->csio.cdb_io.cdb_bytes[0] == TEST_UNIT_READY))) {
				err_action = SS_RETRY|SSQ_DECREMENT_COUNT|EIO;
				*relsim_flags = RELSIM_RELEASE_AFTER_TIMEOUT;
				*timeout = 500;
			}
		}

		/*
		 * If the recovery action will consume a retry,
		 * make sure we actually have retries available.
		 */
		if ((err_action & SSQ_DECREMENT_COUNT) != 0) {
		 	if (ccb->ccb_h.retry_count > 0 &&
			    (periph->flags & CAM_PERIPH_INVALID) == 0)
		 		ccb->ccb_h.retry_count--;
			else {
				*action_string = "Retries exhausted";
				goto sense_error_done;
			}
		}

		if ((err_action & SS_MASK) >= SS_START) {
			/*
			 * Do common portions of commands that
			 * use recovery CCBs.
			 */
			orig_ccb = xpt_alloc_ccb_nowait();
			if (orig_ccb == NULL) {
				*action_string = "Can't allocate recovery CCB";
				goto sense_error_done;
			}
			/*
			 * Clear freeze flag for original request here, as
			 * this freeze will be dropped as part of ERESTART.
			 */
			ccb->ccb_h.status &= ~CAM_DEV_QFRZN;
			bcopy(ccb, orig_ccb, sizeof(*orig_ccb));
		}

		switch (err_action & SS_MASK) {
		case SS_NOP:
			*action_string = "No recovery action needed";
			error = 0;
			break;
		case SS_RETRY:
			*action_string = "Retrying command (per sense data)";
			error = ERESTART;
			break;
		case SS_FAIL:
			/* "error" keeps the code taken from err_action. */
			*action_string = "Unretryable error";
			break;
		case SS_START:
		{
			int le;

			/*
			 * Send a start unit command to the device, and
			 * then retry the command.
			 */
			*action_string = "Attempting to start unit";
			periph->flags |= CAM_PERIPH_RECOVERY_INPROG;

			/*
			 * Check for removable media and set
			 * load/eject flag appropriately.
			 */
			if (SID_IS_REMOVABLE(&cgd.inq_data))
				le = TRUE;
			else
				le = FALSE;

			scsi_start_stop(&ccb->csio,
					/*retries*/1,
					camperiphdone,
					MSG_SIMPLE_Q_TAG,
					/*start*/TRUE,
					/*load/eject*/le,
					/*immediate*/FALSE,
					SSD_FULL_SIZE,
					/*timeout*/50000);
			break;
		}
		case SS_TUR:
		{
			/*
			 * Send a Test Unit Ready to the device.
			 * If the 'many' flag is set, we send 120
			 * test unit ready commands, one every half 
			 * second.  Otherwise, we just send one TUR.
			 * We only want to do this if the retry 
			 * count has not been exhausted.
			 */
			int retries;

			if ((err_action & SSQ_MANY) != 0) {
				*action_string = "Polling device for readiness";
				retries = 120;
			} else {
				*action_string = "Testing device for readiness";
				retries = 1;
			}
			periph->flags |= CAM_PERIPH_RECOVERY_INPROG;
			scsi_test_unit_ready(&ccb->csio,
					     retries,
					     camperiphdone,
					     MSG_SIMPLE_Q_TAG,
					     SSD_FULL_SIZE,
					     /*timeout*/5000);

			/*
			 * Accomplish our 500ms delay by deferring
			 * the release of our device queue appropriately.
			 */
			*relsim_flags = RELSIM_RELEASE_AFTER_TIMEOUT;
			*timeout = 500;
			break;
		}
		default:
			panic("Unhandled error action %x", err_action);
		}
		
		if ((err_action & SS_MASK) >= SS_START) {
			/*
			 * Drop the priority, so that the recovery
			 * CCB is the first to execute.  Freeze the queue
			 * after this command is sent so that we can
			 * restore the old csio and have it queued in
			 * the proper order before we release normal 
			 * transactions to the device.
			 */
			ccb->ccb_h.pinfo.priority--;
			ccb->ccb_h.flags |= CAM_DEV_QFREEZE;
			ccb->ccb_h.saved_ccb_ptr = orig_ccb;
			error = ERESTART;
			*orig = orig_ccb;
		}

sense_error_done:
		*action = err_action;
	}
	return (error);
}

/*
 * Generic error handler.  Peripheral drivers usually filter
 * out the errors that they handle in a unique manner, then
 * call this function.
 */
int
cam_periph_error(union ccb *ccb, cam_flags camflags,
		 u_int32_t sense_flags, union ccb *save_ccb)
{
	struct cam_path *newpath;
	union ccb  *orig_ccb, *scan_ccb;
	struct cam_periph *periph;
	const char *action_string;
	cam_status  status;
	int	    frozen, error, openings, devctl_err;
	u_int32_t   action, relsim_flags, timeout;

	action = SSQ_PRINT_SENSE;
	periph = xpt_path_periph(ccb->ccb_h.path);
	action_string = NULL;
	status = ccb->ccb_h.status;
	frozen = (status & CAM_DEV_QFRZN) != 0;
	status &= CAM_STATUS_MASK;
	devctl_err = openings = relsim_flags = timeout = 0;
	orig_ccb = ccb;

	/* Filter the errors that should be reported via devctl */
	switch (ccb->ccb_h.status & CAM_STATUS_MASK) {
	case CAM_CMD_TIMEOUT:
	case CAM_REQ_ABORTED:
	case CAM_REQ_CMP_ERR:
	case CAM_REQ_TERMIO:
	case CAM_UNREC_HBA_ERROR:
	case CAM_DATA_RUN_ERR:
	case CAM_SCSI_STATUS_ERROR:
	case CAM_ATA_STATUS_ERROR:
	case CAM_SMP_STATUS_ERROR:
		devctl_err++;
		break;
	default:
		break;
	}

	switch (status) {
	case CAM_REQ_CMP:
		error = 0;
		action &= ~SSQ_PRINT_SENSE;
		break;
	case CAM_SCSI_STATUS_ERROR:
		error = camperiphscsistatuserror(ccb, &orig_ccb,
		    camflags, sense_flags, &openings, &relsim_flags,
		    &timeout, &action, &action_string);
		break;
	case CAM_AUTOSENSE_FAIL:
		error = EIO;	/* we have to kill the command */
		break;
	case CAM_UA_ABORT:
	case CAM_UA_TERMIO:
	case CAM_MSG_REJECT_REC:
		/* XXX Don't know that these are correct */
		error = EIO;
		break;
	case CAM_SEL_TIMEOUT:
		if ((camflags & CAM_RETRY_SELTO) != 0) {
			if (ccb->ccb_h.retry_count > 0 &&
			    (periph->flags & CAM_PERIPH_INVALID) == 0) {
				ccb->ccb_h.retry_count--;
				error = ERESTART;

				/*
				 * Wait a bit to give the device
				 * time to recover before we try
again. */ relsim_flags = RELSIM_RELEASE_AFTER_TIMEOUT; timeout = periph_selto_delay; break; } action_string = "Retries exhausted"; } /* FALLTHROUGH */ case CAM_DEV_NOT_THERE: error = ENXIO; action = SSQ_LOST; break; case CAM_REQ_INVALID: case CAM_PATH_INVALID: case CAM_NO_HBA: case CAM_PROVIDE_FAIL: case CAM_REQ_TOO_BIG: case CAM_LUN_INVALID: case CAM_TID_INVALID: case CAM_FUNC_NOTAVAIL: error = EINVAL; break; case CAM_SCSI_BUS_RESET: case CAM_BDR_SENT: /* * Commands that repeatedly timeout and cause these * kinds of error recovery actions, should return * CAM_CMD_TIMEOUT, which allows us to safely assume * that this command was an innocent bystander to * these events and should be unconditionally * retried. */ case CAM_REQUEUE_REQ: /* Unconditional requeue if device is still there */ if (periph->flags & CAM_PERIPH_INVALID) { action_string = "Periph was invalidated"; error = EIO; } else if (sense_flags & SF_NO_RETRY) { error = EIO; action_string = "Retry was blocked"; } else { error = ERESTART; action &= ~SSQ_PRINT_SENSE; } break; case CAM_RESRC_UNAVAIL: /* Wait a bit for the resource shortage to abate. */ timeout = periph_noresrc_delay; /* FALLTHROUGH */ case CAM_BUSY: if (timeout == 0) { /* Wait a bit for the busy condition to abate. 
*/ timeout = periph_busy_delay; } relsim_flags = RELSIM_RELEASE_AFTER_TIMEOUT; /* FALLTHROUGH */ case CAM_ATA_STATUS_ERROR: case CAM_REQ_CMP_ERR: case CAM_CMD_TIMEOUT: case CAM_UNEXP_BUSFREE: case CAM_UNCOR_PARITY: case CAM_DATA_RUN_ERR: default: if (periph->flags & CAM_PERIPH_INVALID) { error = EIO; action_string = "Periph was invalidated"; } else if (ccb->ccb_h.retry_count == 0) { error = EIO; action_string = "Retries exhausted"; } else if (sense_flags & SF_NO_RETRY) { error = EIO; action_string = "Retry was blocked"; } else { ccb->ccb_h.retry_count--; error = ERESTART; } break; } if ((sense_flags & SF_PRINT_ALWAYS) || CAM_DEBUGGED(ccb->ccb_h.path, CAM_DEBUG_INFO)) action |= SSQ_PRINT_SENSE; else if (sense_flags & SF_NO_PRINT) action &= ~SSQ_PRINT_SENSE; if ((action & SSQ_PRINT_SENSE) != 0) cam_error_print(orig_ccb, CAM_ESF_ALL, CAM_EPF_ALL); if (error != 0 && (action & SSQ_PRINT_SENSE) != 0) { if (error != ERESTART) { if (action_string == NULL) action_string = "Unretryable error"; xpt_print(ccb->ccb_h.path, "Error %d, %s\n", error, action_string); } else if (action_string != NULL) xpt_print(ccb->ccb_h.path, "%s\n", action_string); else xpt_print(ccb->ccb_h.path, "Retrying command\n"); } if (devctl_err && (error != 0 || (action & SSQ_PRINT_SENSE) != 0)) cam_periph_devctl_notify(orig_ccb); if ((action & SSQ_LOST) != 0) { lun_id_t lun_id; /* * For a selection timeout, we consider all of the LUNs on * the target to be gone. If the status is CAM_DEV_NOT_THERE, * then we only get rid of the device(s) specified by the * path in the original CCB. */ if (status == CAM_SEL_TIMEOUT) lun_id = CAM_LUN_WILDCARD; else lun_id = xpt_path_lun_id(ccb->ccb_h.path); /* Should we do more if we can't create the path?? */ if (xpt_create_path(&newpath, periph, xpt_path_path_id(ccb->ccb_h.path), xpt_path_target_id(ccb->ccb_h.path), lun_id) == CAM_REQ_CMP) { /* * Let peripheral drivers know that this * device has gone away. 
*/ xpt_async(AC_LOST_DEVICE, newpath, NULL); xpt_free_path(newpath); } } /* Broadcast UNIT ATTENTIONs to all periphs. */ if ((action & SSQ_UA) != 0) xpt_async(AC_UNIT_ATTENTION, orig_ccb->ccb_h.path, orig_ccb); /* Rescan target on "Reported LUNs data has changed" */ if ((action & SSQ_RESCAN) != 0) { if (xpt_create_path(&newpath, NULL, xpt_path_path_id(ccb->ccb_h.path), xpt_path_target_id(ccb->ccb_h.path), CAM_LUN_WILDCARD) == CAM_REQ_CMP) { scan_ccb = xpt_alloc_ccb_nowait(); if (scan_ccb != NULL) { scan_ccb->ccb_h.path = newpath; scan_ccb->ccb_h.func_code = XPT_SCAN_TGT; scan_ccb->crcn.flags = 0; xpt_rescan(scan_ccb); } else { xpt_print(newpath, "Can't allocate CCB to rescan target\n"); xpt_free_path(newpath); } } } /* Attempt a retry */ if (error == ERESTART || error == 0) { if (frozen != 0) ccb->ccb_h.status &= ~CAM_DEV_QFRZN; if (error == ERESTART) xpt_action(ccb); if (frozen != 0) cam_release_devq(ccb->ccb_h.path, relsim_flags, openings, timeout, /*getcount_only*/0); } return (error); } #define CAM_PERIPH_DEVD_MSG_SIZE 256 static void cam_periph_devctl_notify(union ccb *ccb) { struct cam_periph *periph; struct ccb_getdev *cgd; struct sbuf sb; int serr, sk, asc, ascq; char *sbmsg, *type; sbmsg = malloc(CAM_PERIPH_DEVD_MSG_SIZE, M_CAMPERIPH, M_NOWAIT); if (sbmsg == NULL) return; sbuf_new(&sb, sbmsg, CAM_PERIPH_DEVD_MSG_SIZE, SBUF_FIXEDLEN); periph = xpt_path_periph(ccb->ccb_h.path); sbuf_printf(&sb, "device=%s%d ", periph->periph_name, periph->unit_number); sbuf_printf(&sb, "serial=\""); if ((cgd = (struct ccb_getdev *)xpt_alloc_ccb_nowait()) != NULL) { xpt_setup_ccb(&cgd->ccb_h, ccb->ccb_h.path, CAM_PRIORITY_NORMAL); cgd->ccb_h.func_code = XPT_GDEV_TYPE; xpt_action((union ccb *)cgd); if (cgd->ccb_h.status == CAM_REQ_CMP) sbuf_bcat(&sb, cgd->serial_num, cgd->serial_num_len); xpt_free_ccb((union ccb *)cgd); } sbuf_printf(&sb, "\" "); sbuf_printf(&sb, "cam_status=\"0x%x\" ", ccb->ccb_h.status); switch (ccb->ccb_h.status & CAM_STATUS_MASK) { case CAM_CMD_TIMEOUT: 
sbuf_printf(&sb, "timeout=%d ", ccb->ccb_h.timeout); type = "timeout"; break; case CAM_SCSI_STATUS_ERROR: sbuf_printf(&sb, "scsi_status=%d ", ccb->csio.scsi_status); if (scsi_extract_sense_ccb(ccb, &serr, &sk, &asc, &ascq)) sbuf_printf(&sb, "scsi_sense=\"%02x %02x %02x %02x\" ", serr, sk, asc, ascq); type = "error"; break; case CAM_ATA_STATUS_ERROR: sbuf_printf(&sb, "RES=\""); ata_res_sbuf(&ccb->ataio.res, &sb); sbuf_printf(&sb, "\" "); type = "error"; break; default: type = "error"; break; } if (ccb->ccb_h.func_code == XPT_SCSI_IO) { sbuf_printf(&sb, "CDB=\""); if ((ccb->ccb_h.flags & CAM_CDB_POINTER) != 0) scsi_cdb_sbuf(ccb->csio.cdb_io.cdb_ptr, &sb); else scsi_cdb_sbuf(ccb->csio.cdb_io.cdb_bytes, &sb); sbuf_printf(&sb, "\" "); } else if (ccb->ccb_h.func_code == XPT_ATA_IO) { sbuf_printf(&sb, "ACB=\""); ata_cmd_sbuf(&ccb->ataio.cmd, &sb); sbuf_printf(&sb, "\" "); } if (sbuf_finish(&sb) == 0) devctl_notify("CAM", "periph", type, sbuf_data(&sb)); sbuf_delete(&sb); free(sbmsg, M_CAMPERIPH); } Index: head/sys/cam/cam_xpt.c =================================================================== --- head/sys/cam/cam_xpt.c (revision 308154) +++ head/sys/cam/cam_xpt.c (revision 308155) @@ -1,5407 +1,5424 @@ /*- * Implementation of the Common Access Method Transport (XPT) layer. * * Copyright (c) 1997, 1998, 1999 Justin T. Gibbs. * Copyright (c) 1997, 1998, 1999 Kenneth D. Merry. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification, immediately at the beginning of the file. * 2. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* geometry translation */ #include /* for xpt_print below */ #include "opt_cam.h" /* * This is the maximum number of high powered commands (e.g. start unit) * that can be outstanding at a particular time. 
*/ #ifndef CAM_MAX_HIGHPOWER #define CAM_MAX_HIGHPOWER 4 #endif /* Datastructures internal to the xpt layer */ MALLOC_DEFINE(M_CAMXPT, "CAM XPT", "CAM XPT buffers"); MALLOC_DEFINE(M_CAMDEV, "CAM DEV", "CAM devices"); MALLOC_DEFINE(M_CAMCCB, "CAM CCB", "CAM CCBs"); MALLOC_DEFINE(M_CAMPATH, "CAM path", "CAM paths"); /* Object for defering XPT actions to a taskqueue */ struct xpt_task { struct task task; void *data1; uintptr_t data2; }; struct xpt_softc { uint32_t xpt_generation; /* number of high powered commands that can go through right now */ struct mtx xpt_highpower_lock; STAILQ_HEAD(highpowerlist, cam_ed) highpowerq; int num_highpower; /* queue for handling async rescan requests. */ TAILQ_HEAD(, ccb_hdr) ccb_scanq; int buses_to_config; int buses_config_done; /* Registered busses */ TAILQ_HEAD(,cam_eb) xpt_busses; u_int bus_generation; struct intr_config_hook *xpt_config_hook; int boot_delay; struct callout boot_callout; struct mtx xpt_topo_lock; struct mtx xpt_lock; struct taskqueue *xpt_taskq; }; typedef enum { DM_RET_COPY = 0x01, DM_RET_FLAG_MASK = 0x0f, DM_RET_NONE = 0x00, DM_RET_STOP = 0x10, DM_RET_DESCEND = 0x20, DM_RET_ERROR = 0x30, DM_RET_ACTION_MASK = 0xf0 } dev_match_ret; typedef enum { XPT_DEPTH_BUS, XPT_DEPTH_TARGET, XPT_DEPTH_DEVICE, XPT_DEPTH_PERIPH } xpt_traverse_depth; struct xpt_traverse_config { xpt_traverse_depth depth; void *tr_func; void *tr_arg; }; typedef int xpt_busfunc_t (struct cam_eb *bus, void *arg); typedef int xpt_targetfunc_t (struct cam_et *target, void *arg); typedef int xpt_devicefunc_t (struct cam_ed *device, void *arg); typedef int xpt_periphfunc_t (struct cam_periph *periph, void *arg); typedef int xpt_pdrvfunc_t (struct periph_driver **pdrv, void *arg); /* Transport layer configuration information */ static struct xpt_softc xsoftc; MTX_SYSINIT(xpt_topo_init, &xsoftc.xpt_topo_lock, "XPT topology lock", MTX_DEF); SYSCTL_INT(_kern_cam, OID_AUTO, boot_delay, CTLFLAG_RDTUN, &xsoftc.boot_delay, 0, "Bus registration wait time"); 
SYSCTL_UINT(_kern_cam, OID_AUTO, xpt_generation, CTLFLAG_RD, &xsoftc.xpt_generation, 0, "CAM peripheral generation count"); struct cam_doneq { struct mtx_padalign cam_doneq_mtx; STAILQ_HEAD(, ccb_hdr) cam_doneq; int cam_doneq_sleep; }; static struct cam_doneq cam_doneqs[MAXCPU]; static int cam_num_doneqs; static struct proc *cam_proc; SYSCTL_INT(_kern_cam, OID_AUTO, num_doneqs, CTLFLAG_RDTUN, &cam_num_doneqs, 0, "Number of completion queues/threads"); struct cam_periph *xpt_periph; static periph_init_t xpt_periph_init; static struct periph_driver xpt_driver = { xpt_periph_init, "xpt", TAILQ_HEAD_INITIALIZER(xpt_driver.units), /* generation */ 0, CAM_PERIPH_DRV_EARLY }; PERIPHDRIVER_DECLARE(xpt, xpt_driver); static d_open_t xptopen; static d_close_t xptclose; static d_ioctl_t xptioctl; static d_ioctl_t xptdoioctl; static struct cdevsw xpt_cdevsw = { .d_version = D_VERSION, .d_flags = 0, .d_open = xptopen, .d_close = xptclose, .d_ioctl = xptioctl, .d_name = "xpt", }; /* Storage for debugging datastructures */ struct cam_path *cam_dpath; u_int32_t cam_dflags = CAM_DEBUG_FLAGS; SYSCTL_UINT(_kern_cam, OID_AUTO, dflags, CTLFLAG_RWTUN, &cam_dflags, 0, "Enabled debug flags"); u_int32_t cam_debug_delay = CAM_DEBUG_DELAY; SYSCTL_UINT(_kern_cam, OID_AUTO, debug_delay, CTLFLAG_RWTUN, &cam_debug_delay, 0, "Delay in us after each debug message"); /* Our boot-time initialization hook */ static int cam_module_event_handler(module_t, int /*modeventtype_t*/, void *); static moduledata_t cam_moduledata = { "cam", cam_module_event_handler, NULL }; static int xpt_init(void *); DECLARE_MODULE(cam, cam_moduledata, SI_SUB_CONFIGURE, SI_ORDER_SECOND); MODULE_VERSION(cam, 1); static void xpt_async_bcast(struct async_list *async_head, u_int32_t async_code, struct cam_path *path, void *async_arg); static path_id_t xptnextfreepathid(void); static path_id_t xptpathid(const char *sim_name, int sim_unit, int sim_bus); static union ccb *xpt_get_ccb(struct cam_periph *periph); static union ccb 
*xpt_get_ccb_nowait(struct cam_periph *periph); static void xpt_run_allocq(struct cam_periph *periph, int sleep); static void xpt_run_allocq_task(void *context, int pending); static void xpt_run_devq(struct cam_devq *devq); static timeout_t xpt_release_devq_timeout; static void xpt_release_simq_timeout(void *arg) __unused; static void xpt_acquire_bus(struct cam_eb *bus); static void xpt_release_bus(struct cam_eb *bus); static uint32_t xpt_freeze_devq_device(struct cam_ed *dev, u_int count); static int xpt_release_devq_device(struct cam_ed *dev, u_int count, int run_queue); static struct cam_et* xpt_alloc_target(struct cam_eb *bus, target_id_t target_id); static void xpt_acquire_target(struct cam_et *target); static void xpt_release_target(struct cam_et *target); static struct cam_eb* xpt_find_bus(path_id_t path_id); static struct cam_et* xpt_find_target(struct cam_eb *bus, target_id_t target_id); static struct cam_ed* xpt_find_device(struct cam_et *target, lun_id_t lun_id); static void xpt_config(void *arg); static int xpt_schedule_dev(struct camq *queue, cam_pinfo *dev_pinfo, u_int32_t new_priority); static xpt_devicefunc_t xptpassannouncefunc; static void xptaction(struct cam_sim *sim, union ccb *work_ccb); static void xptpoll(struct cam_sim *sim); static void camisr_runqueue(void); static void xpt_done_process(struct ccb_hdr *ccb_h); static void xpt_done_td(void *); static dev_match_ret xptbusmatch(struct dev_match_pattern *patterns, u_int num_patterns, struct cam_eb *bus); static dev_match_ret xptdevicematch(struct dev_match_pattern *patterns, u_int num_patterns, struct cam_ed *device); static dev_match_ret xptperiphmatch(struct dev_match_pattern *patterns, u_int num_patterns, struct cam_periph *periph); static xpt_busfunc_t xptedtbusfunc; static xpt_targetfunc_t xptedttargetfunc; static xpt_devicefunc_t xptedtdevicefunc; static xpt_periphfunc_t xptedtperiphfunc; static xpt_pdrvfunc_t xptplistpdrvfunc; static xpt_periphfunc_t xptplistperiphfunc; static int 
xptedtmatch(struct ccb_dev_match *cdm); static int xptperiphlistmatch(struct ccb_dev_match *cdm); static int xptbustraverse(struct cam_eb *start_bus, xpt_busfunc_t *tr_func, void *arg); static int xpttargettraverse(struct cam_eb *bus, struct cam_et *start_target, xpt_targetfunc_t *tr_func, void *arg); static int xptdevicetraverse(struct cam_et *target, struct cam_ed *start_device, xpt_devicefunc_t *tr_func, void *arg); static int xptperiphtraverse(struct cam_ed *device, struct cam_periph *start_periph, xpt_periphfunc_t *tr_func, void *arg); static int xptpdrvtraverse(struct periph_driver **start_pdrv, xpt_pdrvfunc_t *tr_func, void *arg); static int xptpdperiphtraverse(struct periph_driver **pdrv, struct cam_periph *start_periph, xpt_periphfunc_t *tr_func, void *arg); static xpt_busfunc_t xptdefbusfunc; static xpt_targetfunc_t xptdeftargetfunc; static xpt_devicefunc_t xptdefdevicefunc; static xpt_periphfunc_t xptdefperiphfunc; static void xpt_finishconfig_task(void *context, int pending); static void xpt_dev_async_default(u_int32_t async_code, struct cam_eb *bus, struct cam_et *target, struct cam_ed *device, void *async_arg); static struct cam_ed * xpt_alloc_device_default(struct cam_eb *bus, struct cam_et *target, lun_id_t lun_id); static xpt_devicefunc_t xptsetasyncfunc; static xpt_busfunc_t xptsetasyncbusfunc; static cam_status xptregister(struct cam_periph *periph, void *arg); static const char * xpt_action_name(uint32_t action); static __inline int device_is_queued(struct cam_ed *device); static __inline int xpt_schedule_devq(struct cam_devq *devq, struct cam_ed *dev) { int retval; mtx_assert(&devq->send_mtx, MA_OWNED); if ((dev->ccbq.queue.entries > 0) && (dev->ccbq.dev_openings > 0) && (dev->ccbq.queue.qfrozen_cnt == 0)) { /* * The priority of a device waiting for controller * resources is that of the highest priority CCB * enqueued. 
*/ retval = xpt_schedule_dev(&devq->send_queue, &dev->devq_entry, CAMQ_GET_PRIO(&dev->ccbq.queue)); } else { retval = 0; } return (retval); } static __inline int device_is_queued(struct cam_ed *device) { return (device->devq_entry.index != CAM_UNQUEUED_INDEX); } static void xpt_periph_init() { make_dev(&xpt_cdevsw, 0, UID_ROOT, GID_OPERATOR, 0600, "xpt0"); } static int xptopen(struct cdev *dev, int flags, int fmt, struct thread *td) { /* * Only allow read-write access. */ if (((flags & FWRITE) == 0) || ((flags & FREAD) == 0)) return(EPERM); /* * We don't allow nonblocking access. */ if ((flags & O_NONBLOCK) != 0) { printf("%s: can't do nonblocking access\n", devtoname(dev)); return(ENODEV); } return(0); } static int xptclose(struct cdev *dev, int flag, int fmt, struct thread *td) { return(0); } /* * Don't automatically grab the xpt softc lock here even though this is going * through the xpt device. The xpt device is really just a back door for * accessing other devices and SIMs, so the right thing to do is to grab * the appropriate SIM lock once the bus/SIM is located. */ static int xptioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td) { int error; if ((error = xptdoioctl(dev, cmd, addr, flag, td)) == ENOTTY) { error = cam_compat_ioctl(dev, cmd, addr, flag, td, xptdoioctl); } return (error); } static int xptdoioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td) { int error; error = 0; switch(cmd) { /* * For the transport layer CAMIOCOMMAND ioctl, we really only want * to accept CCB types that don't quite make sense to send through a * passthrough driver. XPT_PATH_INQ is an exception to this, as stated * in the CAM spec. 
*/ case CAMIOCOMMAND: { union ccb *ccb; union ccb *inccb; struct cam_eb *bus; inccb = (union ccb *)addr; bus = xpt_find_bus(inccb->ccb_h.path_id); if (bus == NULL) return (EINVAL); switch (inccb->ccb_h.func_code) { case XPT_SCAN_BUS: case XPT_RESET_BUS: if (inccb->ccb_h.target_id != CAM_TARGET_WILDCARD || inccb->ccb_h.target_lun != CAM_LUN_WILDCARD) { xpt_release_bus(bus); return (EINVAL); } break; case XPT_SCAN_TGT: if (inccb->ccb_h.target_id == CAM_TARGET_WILDCARD || inccb->ccb_h.target_lun != CAM_LUN_WILDCARD) { xpt_release_bus(bus); return (EINVAL); } break; default: break; } switch(inccb->ccb_h.func_code) { case XPT_SCAN_BUS: case XPT_RESET_BUS: case XPT_PATH_INQ: case XPT_ENG_INQ: case XPT_SCAN_LUN: case XPT_SCAN_TGT: ccb = xpt_alloc_ccb(); /* * Create a path using the bus, target, and lun the * user passed in. */ if (xpt_create_path(&ccb->ccb_h.path, NULL, inccb->ccb_h.path_id, inccb->ccb_h.target_id, inccb->ccb_h.target_lun) != CAM_REQ_CMP){ error = EINVAL; xpt_free_ccb(ccb); break; } /* Ensure all of our fields are correct */ xpt_setup_ccb(&ccb->ccb_h, ccb->ccb_h.path, inccb->ccb_h.pinfo.priority); xpt_merge_ccb(ccb, inccb); xpt_path_lock(ccb->ccb_h.path); cam_periph_runccb(ccb, NULL, 0, 0, NULL); xpt_path_unlock(ccb->ccb_h.path); bcopy(ccb, inccb, sizeof(union ccb)); xpt_free_path(ccb->ccb_h.path); xpt_free_ccb(ccb); break; case XPT_DEBUG: { union ccb ccb; /* * This is an immediate CCB, so it's okay to * allocate it on the stack. */ /* * Create a path using the bus, target, and lun the * user passed in. 
*/ if (xpt_create_path(&ccb.ccb_h.path, NULL, inccb->ccb_h.path_id, inccb->ccb_h.target_id, inccb->ccb_h.target_lun) != CAM_REQ_CMP){ error = EINVAL; break; } /* Ensure all of our fields are correct */ xpt_setup_ccb(&ccb.ccb_h, ccb.ccb_h.path, inccb->ccb_h.pinfo.priority); xpt_merge_ccb(&ccb, inccb); xpt_action(&ccb); bcopy(&ccb, inccb, sizeof(union ccb)); xpt_free_path(ccb.ccb_h.path); break; } case XPT_DEV_MATCH: { struct cam_periph_map_info mapinfo; struct cam_path *old_path; /* * We can't deal with physical addresses for this * type of transaction. */ if ((inccb->ccb_h.flags & CAM_DATA_MASK) != CAM_DATA_VADDR) { error = EINVAL; break; } /* * Save this in case the caller had it set to * something in particular. */ old_path = inccb->ccb_h.path; /* * We really don't need a path for the matching * code. The path is needed because of the * debugging statements in xpt_action(). They * assume that the CCB has a valid path. */ inccb->ccb_h.path = xpt_periph->path; bzero(&mapinfo, sizeof(mapinfo)); /* * Map the pattern and match buffers into kernel * virtual address space. */ error = cam_periph_mapmem(inccb, &mapinfo, MAXPHYS); if (error) { inccb->ccb_h.path = old_path; break; } /* * This is an immediate CCB, we can send it on directly. */ xpt_action(inccb); /* * Map the buffers back into user space. */ cam_periph_unmapmem(inccb, &mapinfo); inccb->ccb_h.path = old_path; error = 0; break; } default: error = ENOTSUP; break; } xpt_release_bus(bus); break; } /* * This is the getpassthru ioctl. It takes a XPT_GDEVLIST ccb as input, * with the periphal driver name and unit name filled in. The other * fields don't really matter as input. The passthrough driver name * ("pass"), and unit number are passed back in the ccb. The current * device generation number, and the index into the device peripheral * driver list, and the status are also passed back. 
Note that * since we do everything in one pass, unlike the XPT_GDEVLIST ccb, * we never return a status of CAM_GDEVLIST_LIST_CHANGED. It is * (or rather should be) impossible for the device peripheral driver * list to change since we look at the whole thing in one pass, and * we do it with lock protection. * */ case CAMGETPASSTHRU: { union ccb *ccb; struct cam_periph *periph; struct periph_driver **p_drv; char *name; u_int unit; int base_periph_found; ccb = (union ccb *)addr; unit = ccb->cgdl.unit_number; name = ccb->cgdl.periph_name; base_periph_found = 0; /* * Sanity check -- make sure we don't get a null peripheral * driver name. */ if (*ccb->cgdl.periph_name == '\0') { error = EINVAL; break; } /* Keep the list from changing while we traverse it */ xpt_lock_buses(); /* first find our driver in the list of drivers */ for (p_drv = periph_drivers; *p_drv != NULL; p_drv++) if (strcmp((*p_drv)->driver_name, name) == 0) break; if (*p_drv == NULL) { xpt_unlock_buses(); ccb->ccb_h.status = CAM_REQ_CMP_ERR; ccb->cgdl.status = CAM_GDEVLIST_ERROR; *ccb->cgdl.periph_name = '\0'; ccb->cgdl.unit_number = 0; error = ENOENT; break; } /* * Run through every peripheral instance of this driver * and check to see whether it matches the unit passed * in by the user. If it does, get out of the loops and * find the passthrough driver associated with that * peripheral driver. */ for (periph = TAILQ_FIRST(&(*p_drv)->units); periph != NULL; periph = TAILQ_NEXT(periph, unit_links)) { if (periph->unit_number == unit) break; } /* * If we found the peripheral driver that the user passed * in, go through all of the peripheral drivers for that * particular device and look for a passthrough driver. */ if (periph != NULL) { struct cam_ed *device; int i; base_periph_found = 1; device = periph->path->device; for (i = 0, periph = SLIST_FIRST(&device->periphs); periph != NULL; periph = SLIST_NEXT(periph, periph_links), i++) { /* * Check to see whether we have a * passthrough device or not. 
*/ if (strcmp(periph->periph_name, "pass") == 0) { /* * Fill in the getdevlist fields. */ strcpy(ccb->cgdl.periph_name, periph->periph_name); ccb->cgdl.unit_number = periph->unit_number; if (SLIST_NEXT(periph, periph_links)) ccb->cgdl.status = CAM_GDEVLIST_MORE_DEVS; else ccb->cgdl.status = CAM_GDEVLIST_LAST_DEVICE; ccb->cgdl.generation = device->generation; ccb->cgdl.index = i; /* * Fill in some CCB header fields * that the user may want. */ ccb->ccb_h.path_id = periph->path->bus->path_id; ccb->ccb_h.target_id = periph->path->target->target_id; ccb->ccb_h.target_lun = periph->path->device->lun_id; ccb->ccb_h.status = CAM_REQ_CMP; break; } } } /* * If the periph is null here, one of two things has * happened. The first possibility is that we couldn't * find the unit number of the particular peripheral driver * that the user is asking about. e.g. the user asks for * the passthrough driver for "da11". We find the list of * "da" peripherals all right, but there is no unit 11. * The other possibility is that we went through the list * of peripheral drivers attached to the device structure, * but didn't find one with the name "pass". Either way, * we return ENOENT, since we couldn't find something. */ if (periph == NULL) { ccb->ccb_h.status = CAM_REQ_CMP_ERR; ccb->cgdl.status = CAM_GDEVLIST_ERROR; *ccb->cgdl.periph_name = '\0'; ccb->cgdl.unit_number = 0; error = ENOENT; /* * It is unfortunate that this is even necessary, * but there are many, many clueless users out there. * If this is true, the user is looking for the * passthrough driver, but doesn't have one in his * kernel. 
*/ if (base_periph_found == 1) { printf("xptioctl: pass driver is not in the " "kernel\n"); printf("xptioctl: put \"device pass\" in " "your kernel config file\n"); } } xpt_unlock_buses(); break; } default: error = ENOTTY; break; } return(error); } static int cam_module_event_handler(module_t mod, int what, void *arg) { int error; switch (what) { case MOD_LOAD: if ((error = xpt_init(NULL)) != 0) return (error); break; case MOD_UNLOAD: return EBUSY; default: return EOPNOTSUPP; } return 0; } static struct xpt_proto * xpt_proto_find(cam_proto proto) { struct xpt_proto **pp; SET_FOREACH(pp, cam_xpt_proto_set) { if ((*pp)->proto == proto) return *pp; } return NULL; } static void xpt_rescan_done(struct cam_periph *periph, union ccb *done_ccb) { if (done_ccb->ccb_h.ppriv_ptr1 == NULL) { xpt_free_path(done_ccb->ccb_h.path); xpt_free_ccb(done_ccb); } else { done_ccb->ccb_h.cbfcnp = done_ccb->ccb_h.ppriv_ptr1; (*done_ccb->ccb_h.cbfcnp)(periph, done_ccb); } xpt_release_boot(); } /* thread to handle bus rescans */ static void xpt_scanner_thread(void *dummy) { union ccb *ccb; struct cam_path path; xpt_lock_buses(); for (;;) { if (TAILQ_EMPTY(&xsoftc.ccb_scanq)) msleep(&xsoftc.ccb_scanq, &xsoftc.xpt_topo_lock, PRIBIO, "-", 0); if ((ccb = (union ccb *)TAILQ_FIRST(&xsoftc.ccb_scanq)) != NULL) { TAILQ_REMOVE(&xsoftc.ccb_scanq, &ccb->ccb_h, sim_links.tqe); xpt_unlock_buses(); /* * Since lock can be dropped inside and path freed * by completion callback even before return here, * take our own path copy for reference. 
*/ xpt_copy_path(&path, ccb->ccb_h.path); xpt_path_lock(&path); xpt_action(ccb); xpt_path_unlock(&path); xpt_release_path(&path); xpt_lock_buses(); } } } void xpt_rescan(union ccb *ccb) { struct ccb_hdr *hdr; /* Prepare request */ if (ccb->ccb_h.path->target->target_id == CAM_TARGET_WILDCARD && ccb->ccb_h.path->device->lun_id == CAM_LUN_WILDCARD) ccb->ccb_h.func_code = XPT_SCAN_BUS; else if (ccb->ccb_h.path->target->target_id != CAM_TARGET_WILDCARD && ccb->ccb_h.path->device->lun_id == CAM_LUN_WILDCARD) ccb->ccb_h.func_code = XPT_SCAN_TGT; else if (ccb->ccb_h.path->target->target_id != CAM_TARGET_WILDCARD && ccb->ccb_h.path->device->lun_id != CAM_LUN_WILDCARD) ccb->ccb_h.func_code = XPT_SCAN_LUN; else { xpt_print(ccb->ccb_h.path, "illegal scan path\n"); xpt_free_path(ccb->ccb_h.path); xpt_free_ccb(ccb); return; } CAM_DEBUG(ccb->ccb_h.path, CAM_DEBUG_TRACE, ("xpt_rescan: func %#x %s\n", ccb->ccb_h.func_code, xpt_action_name(ccb->ccb_h.func_code))); ccb->ccb_h.ppriv_ptr1 = ccb->ccb_h.cbfcnp; ccb->ccb_h.cbfcnp = xpt_rescan_done; xpt_setup_ccb(&ccb->ccb_h, ccb->ccb_h.path, CAM_PRIORITY_XPT); /* Don't make duplicate entries for the same paths. 
*/ xpt_lock_buses(); if (ccb->ccb_h.ppriv_ptr1 == NULL) { TAILQ_FOREACH(hdr, &xsoftc.ccb_scanq, sim_links.tqe) { if (xpt_path_comp(hdr->path, ccb->ccb_h.path) == 0) { wakeup(&xsoftc.ccb_scanq); xpt_unlock_buses(); xpt_print(ccb->ccb_h.path, "rescan already queued\n"); xpt_free_path(ccb->ccb_h.path); xpt_free_ccb(ccb); return; } } } TAILQ_INSERT_TAIL(&xsoftc.ccb_scanq, &ccb->ccb_h, sim_links.tqe); xsoftc.buses_to_config++; wakeup(&xsoftc.ccb_scanq); xpt_unlock_buses(); } /* Functions accessed by the peripheral drivers */ static int xpt_init(void *dummy) { struct cam_sim *xpt_sim; struct cam_path *path; struct cam_devq *devq; cam_status status; int error, i; TAILQ_INIT(&xsoftc.xpt_busses); TAILQ_INIT(&xsoftc.ccb_scanq); STAILQ_INIT(&xsoftc.highpowerq); xsoftc.num_highpower = CAM_MAX_HIGHPOWER; mtx_init(&xsoftc.xpt_lock, "XPT lock", NULL, MTX_DEF); mtx_init(&xsoftc.xpt_highpower_lock, "XPT highpower lock", NULL, MTX_DEF); xsoftc.xpt_taskq = taskqueue_create("CAM XPT task", M_WAITOK, taskqueue_thread_enqueue, /*context*/&xsoftc.xpt_taskq); #ifdef CAM_BOOT_DELAY /* * Override this value at compile time to assist our users * who don't use loader to boot a kernel. */ xsoftc.boot_delay = CAM_BOOT_DELAY; #endif /* * The xpt layer is, itself, the equivalent of a SIM. * Allow 16 ccbs in the ccb pool for it. This should * give decent parallelism when we probe busses and * perform other XPT functions. 
*/ devq = cam_simq_alloc(16); xpt_sim = cam_sim_alloc(xptaction, xptpoll, "xpt", /*softc*/NULL, /*unit*/0, /*mtx*/&xsoftc.xpt_lock, /*max_dev_transactions*/0, /*max_tagged_dev_transactions*/0, devq); if (xpt_sim == NULL) return (ENOMEM); mtx_lock(&xsoftc.xpt_lock); if ((status = xpt_bus_register(xpt_sim, NULL, 0)) != CAM_SUCCESS) { mtx_unlock(&xsoftc.xpt_lock); printf("xpt_init: xpt_bus_register failed with status %#x," " failing attach\n", status); return (EINVAL); } mtx_unlock(&xsoftc.xpt_lock); /* * Looking at the XPT from the SIM layer, the XPT is * the equivalent of a peripheral driver. Allocate * a peripheral driver entry for us. */ if ((status = xpt_create_path(&path, NULL, CAM_XPT_PATH_ID, CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD)) != CAM_REQ_CMP) { printf("xpt_init: xpt_create_path failed with status %#x," " failing attach\n", status); return (EINVAL); } xpt_path_lock(path); cam_periph_alloc(xptregister, NULL, NULL, NULL, "xpt", CAM_PERIPH_BIO, path, NULL, 0, xpt_sim); xpt_path_unlock(path); xpt_free_path(path); if (cam_num_doneqs < 1) cam_num_doneqs = 1 + mp_ncpus / 6; else if (cam_num_doneqs > MAXCPU) cam_num_doneqs = MAXCPU; for (i = 0; i < cam_num_doneqs; i++) { mtx_init(&cam_doneqs[i].cam_doneq_mtx, "CAM doneq", NULL, MTX_DEF); STAILQ_INIT(&cam_doneqs[i].cam_doneq); error = kproc_kthread_add(xpt_done_td, &cam_doneqs[i], &cam_proc, NULL, 0, 0, "cam", "doneq%d", i); if (error != 0) { cam_num_doneqs = i; break; } } if (cam_num_doneqs < 1) { printf("xpt_init: Cannot init completion queues " "- failing attach\n"); return (ENOMEM); } /* * Register a callback for when interrupts are enabled. 
*/ xsoftc.xpt_config_hook = (struct intr_config_hook *)malloc(sizeof(struct intr_config_hook), M_CAMXPT, M_NOWAIT | M_ZERO); if (xsoftc.xpt_config_hook == NULL) { printf("xpt_init: Cannot malloc config hook " "- failing attach\n"); return (ENOMEM); } xsoftc.xpt_config_hook->ich_func = xpt_config; if (config_intrhook_establish(xsoftc.xpt_config_hook) != 0) { free (xsoftc.xpt_config_hook, M_CAMXPT); printf("xpt_init: config_intrhook_establish failed " "- failing attach\n"); } return (0); } static cam_status xptregister(struct cam_periph *periph, void *arg) { struct cam_sim *xpt_sim; if (periph == NULL) { printf("xptregister: periph was NULL!!\n"); return(CAM_REQ_CMP_ERR); } xpt_sim = (struct cam_sim *)arg; xpt_sim->softc = periph; xpt_periph = periph; periph->softc = NULL; return(CAM_REQ_CMP); } int32_t xpt_add_periph(struct cam_periph *periph) { struct cam_ed *device; int32_t status; TASK_INIT(&periph->periph_run_task, 0, xpt_run_allocq_task, periph); device = periph->path->device; status = CAM_REQ_CMP; if (device != NULL) { mtx_lock(&device->target->bus->eb_mtx); device->generation++; SLIST_INSERT_HEAD(&device->periphs, periph, periph_links); mtx_unlock(&device->target->bus->eb_mtx); atomic_add_32(&xsoftc.xpt_generation, 1); } return (status); } void xpt_remove_periph(struct cam_periph *periph) { struct cam_ed *device; device = periph->path->device; if (device != NULL) { mtx_lock(&device->target->bus->eb_mtx); device->generation++; SLIST_REMOVE(&device->periphs, periph, cam_periph, periph_links); mtx_unlock(&device->target->bus->eb_mtx); atomic_add_32(&xsoftc.xpt_generation, 1); } } void xpt_announce_periph(struct cam_periph *periph, char *announce_string) { struct cam_path *path = periph->path; struct xpt_proto *proto; cam_periph_assert(periph, MA_OWNED); periph->flags |= CAM_PERIPH_ANNOUNCED; printf("%s%d at %s%d bus %d scbus%d target %d lun %jx\n", periph->periph_name, periph->unit_number, path->bus->sim->sim_name, path->bus->sim->unit_number, 
path->bus->sim->bus_id, path->bus->path_id, path->target->target_id, (uintmax_t)path->device->lun_id); printf("%s%d: ", periph->periph_name, periph->unit_number); proto = xpt_proto_find(path->device->protocol); if (proto) proto->ops->announce(path->device); else printf("%s%d: Unknown protocol device %d\n", periph->periph_name, periph->unit_number, path->device->protocol); if (path->device->serial_num_len > 0) { /* Don't wrap the screen - print only the first 60 chars */ printf("%s%d: Serial Number %.60s\n", periph->periph_name, periph->unit_number, path->device->serial_num); } /* Announce transport details. */ path->bus->xport->ops->announce(periph); /* Announce command queueing. */ if (path->device->inq_flags & SID_CmdQue || path->device->flags & CAM_DEV_TAG_AFTER_COUNT) { printf("%s%d: Command Queueing enabled\n", periph->periph_name, periph->unit_number); } /* Announce caller's details if they've passed in. */ if (announce_string != NULL) printf("%s%d: %s\n", periph->periph_name, periph->unit_number, announce_string); } void xpt_announce_quirks(struct cam_periph *periph, int quirks, char *bit_string) { if (quirks != 0) { printf("%s%d: quirks=0x%b\n", periph->periph_name, periph->unit_number, quirks, bit_string); } } void xpt_denounce_periph(struct cam_periph *periph) { struct cam_path *path = periph->path; struct xpt_proto *proto; cam_periph_assert(periph, MA_OWNED); printf("%s%d at %s%d bus %d scbus%d target %d lun %jx\n", periph->periph_name, periph->unit_number, path->bus->sim->sim_name, path->bus->sim->unit_number, path->bus->sim->bus_id, path->bus->path_id, path->target->target_id, (uintmax_t)path->device->lun_id); printf("%s%d: ", periph->periph_name, periph->unit_number); proto = xpt_proto_find(path->device->protocol); if (proto) proto->ops->denounce(path->device); else printf("%s%d: Unknown protocol device %d\n", periph->periph_name, periph->unit_number, path->device->protocol); if (path->device->serial_num_len > 0) printf(" s/n %.60s", 
path->device->serial_num);
	printf(" detached\n");
}

/*
 * Fetch a GEOM attribute of the device behind "path" into "buf".
 *
 * buf, len: caller's output buffer and its capacity in bytes.
 * attr:     "GEOM::ident" (serial number), "GEOM::physpath" (physical path),
 *           "GEOM::lunid" or "GEOM::lunname" (SCSI device ID descriptor).
 * path:     device path; its lock must be held (asserted below).
 *
 * The data is obtained with a synchronous XPT_DEV_ADVINFO CCB into a
 * temporary buffer that is freed before returning.
 *
 * Returns 0 on success, ENOMEM when the temporary buffer cannot be
 * allocated, EFAULT when the value does not fit in "buf", and -1 when the
 * attribute is not recognized, the device provided no data, or no suitable
 * device ID descriptor was found.
 */
int
xpt_getattr(char *buf, size_t len, const char *attr, struct cam_path *path)
{
	int ret = -1, l;
	struct ccb_dev_advinfo cdai;
	struct scsi_vpd_id_descriptor *idd;

	xpt_path_assert(path, MA_OWNED);

	/* Zeroing also leaves cdai.buf NULL for the early "goto out" below. */
	memset(&cdai, 0, sizeof(cdai));
	xpt_setup_ccb(&cdai.ccb_h, path, CAM_PRIORITY_NORMAL);
	cdai.ccb_h.func_code = XPT_DEV_ADVINFO;
	cdai.bufsiz = len;

	/* Map the GEOM attribute name onto an advanced-info buffer type. */
	if (!strcmp(attr, "GEOM::ident"))
		cdai.buftype = CDAI_TYPE_SERIAL_NUM;
	else if (!strcmp(attr, "GEOM::physpath"))
		cdai.buftype = CDAI_TYPE_PHYS_PATH;
	else if (strcmp(attr, "GEOM::lunid") == 0 ||
		 strcmp(attr, "GEOM::lunname") == 0) {
		/* The raw device ID page is fetched whole and parsed below. */
		cdai.buftype = CDAI_TYPE_SCSI_DEVID;
		cdai.bufsiz = CAM_SCSI_DEVID_MAXLEN;
	} else
		goto out;

	cdai.buf = malloc(cdai.bufsiz, M_CAMXPT, M_NOWAIT|M_ZERO);
	if (cdai.buf == NULL) {
		ret = ENOMEM;
		goto out;
	}
	xpt_action((union ccb *)&cdai); /* can only be synchronous */
	if ((cdai.ccb_h.status & CAM_DEV_QFRZN) != 0)
		cam_release_devq(cdai.ccb_h.path, 0, 0, 0, FALSE);
	if (cdai.provsiz == 0)
		goto out;
	if (cdai.buftype == CDAI_TYPE_SCSI_DEVID) {
		/*
		 * Pick a descriptor.  "GEOM::lunid" tries NAA, then EUI-64;
		 * both attributes then fall back to T10 and finally to a
		 * name descriptor.
		 */
		if (strcmp(attr, "GEOM::lunid") == 0) {
			idd = scsi_get_devid((struct scsi_vpd_device_id *)cdai.buf,
			    cdai.provsiz, scsi_devid_is_lun_naa);
			if (idd == NULL)
				idd = scsi_get_devid((struct scsi_vpd_device_id *)cdai.buf,
				    cdai.provsiz, scsi_devid_is_lun_eui64);
		} else
			idd = NULL;
		if (idd == NULL)
			idd = scsi_get_devid((struct scsi_vpd_device_id *)cdai.buf,
			    cdai.provsiz, scsi_devid_is_lun_t10);
		if (idd == NULL)
			idd = scsi_get_devid((struct scsi_vpd_device_id *)cdai.buf,
			    cdai.provsiz, scsi_devid_is_lun_name);
		if (idd == NULL)
			goto out;
		ret = 0;
		if ((idd->proto_codeset & SVPD_ID_CODESET_MASK) == SVPD_ID_CODESET_ASCII) {
			/* ASCII: copy, turning embedded NUL bytes into spaces. */
			if (idd->length < len) {
				for (l = 0; l < idd->length; l++)
					buf[l] = idd->identifier[l] ?
idd->identifier[l] : ' '; buf[l] = 0; } else ret = EFAULT; } else if ((idd->proto_codeset & SVPD_ID_CODESET_MASK) == SVPD_ID_CODESET_UTF8) { l = strnlen(idd->identifier, idd->length); if (l < len) { bcopy(idd->identifier, buf, l); buf[l] = 0; } else ret = EFAULT; } else { if (idd->length * 2 < len) { for (l = 0; l < idd->length; l++) sprintf(buf + l * 2, "%02x", idd->identifier[l]); } else ret = EFAULT; } } else { ret = 0; if (strlcpy(buf, cdai.buf, len) >= len) ret = EFAULT; } out: if (cdai.buf != NULL) free(cdai.buf, M_CAMXPT); return ret; } static dev_match_ret xptbusmatch(struct dev_match_pattern *patterns, u_int num_patterns, struct cam_eb *bus) { dev_match_ret retval; u_int i; retval = DM_RET_NONE; /* * If we aren't given something to match against, that's an error. */ if (bus == NULL) return(DM_RET_ERROR); /* * If there are no match entries, then this bus matches no * matter what. */ if ((patterns == NULL) || (num_patterns == 0)) return(DM_RET_DESCEND | DM_RET_COPY); for (i = 0; i < num_patterns; i++) { struct bus_match_pattern *cur_pattern; /* * If the pattern in question isn't for a bus node, we * aren't interested. However, we do indicate to the * calling routine that we should continue descending the * tree, since the user wants to match against lower-level * EDT elements. */ if (patterns[i].type != DEV_MATCH_BUS) { if ((retval & DM_RET_ACTION_MASK) == DM_RET_NONE) retval |= DM_RET_DESCEND; continue; } cur_pattern = &patterns[i].pattern.bus_pattern; /* * If they want to match any bus node, we give them any * device node. */ if (cur_pattern->flags == BUS_MATCH_ANY) { /* set the copy flag */ retval |= DM_RET_COPY; /* * If we've already decided on an action, go ahead * and return. */ if ((retval & DM_RET_ACTION_MASK) != DM_RET_NONE) return(retval); } /* * Not sure why someone would do this... 
*/
		if (cur_pattern->flags == BUS_MATCH_NONE)
			continue;

		/*
		 * Each field is compared only when its flag is set in the
		 * pattern; any mismatch rejects this pattern.
		 */
		if (((cur_pattern->flags & BUS_MATCH_PATH) != 0)
		 && (cur_pattern->path_id != bus->path_id))
			continue;

		if (((cur_pattern->flags & BUS_MATCH_BUS_ID) != 0)
		 && (cur_pattern->bus_id != bus->sim->bus_id))
			continue;

		if (((cur_pattern->flags & BUS_MATCH_UNIT) != 0)
		 && (cur_pattern->unit_number != bus->sim->unit_number))
			continue;

		if (((cur_pattern->flags & BUS_MATCH_NAME) != 0)
		 && (strncmp(cur_pattern->dev_name, bus->sim->sim_name,
			     DEV_IDLEN) != 0))
			continue;

		/*
		 * If we get to this point, the user definitely wants
		 * information on this bus.  So tell the caller to copy the
		 * data out.
		 */
		retval |= DM_RET_COPY;

		/*
		 * If the return action has been set to descend, then we
		 * know that we've already seen a non-bus matching
		 * expression, therefore we need to further descend the tree.
		 * This won't change by continuing around the loop, so we
		 * go ahead and return.  If we haven't seen a non-bus
		 * matching expression, we keep going around the loop until
		 * we exhaust the matching expressions.  We'll set the stop
		 * flag once we fall out of the loop.
		 */
		if ((retval & DM_RET_ACTION_MASK) == DM_RET_DESCEND)
			return(retval);
	}

	/*
	 * If the return action hasn't been set to descend yet, that means
	 * we haven't seen anything other than bus matching patterns.  So
	 * tell the caller to stop descending the tree -- the user doesn't
	 * want to match against lower level tree elements.
	 */
	if ((retval & DM_RET_ACTION_MASK) == DM_RET_NONE)
		retval |= DM_RET_STOP;

	return(retval);
}

/*
 * Match a single device against any number of match patterns.
 *
 * Returns DM_RET_ERROR when "device" is NULL or a pattern asks for both
 * inquiry and device ID matching.  Otherwise the result is an action
 * (descend/stop) optionally or'ed with DM_RET_COPY when the device should
 * be reported to the caller.
 */
static dev_match_ret
xptdevicematch(struct dev_match_pattern *patterns, u_int num_patterns,
	       struct cam_ed *device)
{
	dev_match_ret retval;
	u_int i;

	retval = DM_RET_NONE;

	/*
	 * If we aren't given something to match against, that's an error.
	 */
	if (device == NULL)
		return(DM_RET_ERROR);

	/*
	 * If there are no match entries, then this device matches no
	 * matter what.
*/
	if ((patterns == NULL) || (num_patterns == 0))
		return(DM_RET_DESCEND | DM_RET_COPY);

	for (i = 0; i < num_patterns; i++) {
		struct device_match_pattern *cur_pattern;
		struct scsi_vpd_device_id *device_id_page;

		/*
		 * If the pattern in question isn't for a device node, we
		 * aren't interested.  A peripheral pattern does, however,
		 * request a descent below this device when no action has
		 * been chosen yet.
		 */
		if (patterns[i].type != DEV_MATCH_DEVICE) {
			if ((patterns[i].type == DEV_MATCH_PERIPH)
			 && ((retval & DM_RET_ACTION_MASK) == DM_RET_NONE))
				retval |= DM_RET_DESCEND;
			continue;
		}

		cur_pattern = &patterns[i].pattern.device_pattern;

		/* Error out if mutually exclusive options are specified. */
		if ((cur_pattern->flags & (DEV_MATCH_INQUIRY|DEV_MATCH_DEVID))
		 == (DEV_MATCH_INQUIRY|DEV_MATCH_DEVID))
			return(DM_RET_ERROR);

		/*
		 * If they want to match any device node, we give them any
		 * device node.
		 */
		if (cur_pattern->flags == DEV_MATCH_ANY)
			goto copy_dev_node;

		/*
		 * Not sure why someone would do this...
		 */
		if (cur_pattern->flags == DEV_MATCH_NONE)
			continue;

		/*
		 * Each field is compared only when its flag is set in the
		 * pattern; any mismatch rejects this pattern.
		 */
		if (((cur_pattern->flags & DEV_MATCH_PATH) != 0)
		 && (cur_pattern->path_id != device->target->bus->path_id))
			continue;

		if (((cur_pattern->flags & DEV_MATCH_TARGET) != 0)
		 && (cur_pattern->target_id != device->target->target_id))
			continue;

		if (((cur_pattern->flags & DEV_MATCH_LUN) != 0)
		 && (cur_pattern->target_lun != device->lun_id))
			continue;

		if (((cur_pattern->flags & DEV_MATCH_INQUIRY) != 0)
		 && (cam_quirkmatch((caddr_t)&device->inq_data,
				    (caddr_t)&cur_pattern->data.inq_pat,
				    1, sizeof(cur_pattern->data.inq_pat),
				    scsi_static_inquiry_match) == NULL))
			continue;

		/*
		 * A device whose stored ID page is shorter than the page
		 * header cannot match a device ID pattern; otherwise the
		 * descriptor list that follows the header is compared.
		 */
		device_id_page = (struct scsi_vpd_device_id *)device->device_id;
		if (((cur_pattern->flags & DEV_MATCH_DEVID) != 0)
		 && (device->device_id_len < SVPD_DEVICE_ID_HDR_LEN
		  || scsi_devid_match((uint8_t *)device_id_page->desc_list,
				      device->device_id_len - SVPD_DEVICE_ID_HDR_LEN,
				      cur_pattern->data.devid_pat.id,
				      cur_pattern->data.devid_pat.id_len) != 0))
			continue;

copy_dev_node:
		/*
		 * If we get to this point, the user definitely wants
		 * information on this device.
So tell the caller to copy
		 * the data out.
		 */
		retval |= DM_RET_COPY;

		/*
		 * If the return action has been set to descend, then we
		 * know that we've already seen a peripheral matching
		 * expression, therefore we need to further descend the tree.
		 * This won't change by continuing around the loop, so we
		 * go ahead and return.  If we haven't seen a peripheral
		 * matching expression, we keep going around the loop until
		 * we exhaust the matching expressions.  We'll set the stop
		 * flag once we fall out of the loop.
		 */
		if ((retval & DM_RET_ACTION_MASK) == DM_RET_DESCEND)
			return(retval);
	}

	/*
	 * If the return action hasn't been set to descend yet, that means
	 * we haven't seen any peripheral matching patterns.  So tell the
	 * caller to stop descending the tree -- the user doesn't want to
	 * match against lower level tree elements.
	 */
	if ((retval & DM_RET_ACTION_MASK) == DM_RET_NONE)
		retval |= DM_RET_STOP;

	return(retval);
}

/*
 * Match a single peripheral against any number of match patterns.
 *
 * Returns DM_RET_ERROR when "periph" is NULL.  Otherwise the action is
 * always DM_RET_STOP, or'ed with DM_RET_COPY when the peripheral should be
 * reported to the caller.
 */
static dev_match_ret
xptperiphmatch(struct dev_match_pattern *patterns, u_int num_patterns,
	       struct cam_periph *periph)
{
	dev_match_ret retval;
	u_int i;

	/*
	 * If we aren't given something to match against, that's an error.
	 */
	if (periph == NULL)
		return(DM_RET_ERROR);

	/*
	 * If there are no match entries, then this peripheral matches no
	 * matter what.
	 */
	if ((patterns == NULL) || (num_patterns == 0))
		return(DM_RET_STOP | DM_RET_COPY);

	/*
	 * There aren't any nodes below a peripheral node, so there's no
	 * reason to descend the tree any further.
	 */
	retval = DM_RET_STOP;

	for (i = 0; i < num_patterns; i++) {
		struct periph_match_pattern *cur_pattern;

		/*
		 * If the pattern in question isn't for a peripheral, we
		 * aren't interested.
		 */
		if (patterns[i].type != DEV_MATCH_PERIPH)
			continue;

		cur_pattern = &patterns[i].pattern.periph_pattern;

		/*
		 * If they want to match on anything, then we will do so.
*/
		if (cur_pattern->flags == PERIPH_MATCH_ANY) {
			/* set the copy flag */
			retval |= DM_RET_COPY;

			/*
			 * We've already set the return action to stop,
			 * since there are no nodes below peripherals in
			 * the tree.
			 */
			return(retval);
		}

		/*
		 * Not sure why someone would do this...
		 */
		if (cur_pattern->flags == PERIPH_MATCH_NONE)
			continue;

		if (((cur_pattern->flags & PERIPH_MATCH_PATH) != 0)
		 && (cur_pattern->path_id != periph->path->bus->path_id))
			continue;

		/*
		 * For the target and lun id's, we have to make sure the
		 * target and lun pointers aren't NULL.  The xpt peripheral
		 * has a wildcard target and device.
		 */
		if (((cur_pattern->flags & PERIPH_MATCH_TARGET) != 0)
		 && ((periph->path->target == NULL)
		 ||(cur_pattern->target_id != periph->path->target->target_id)))
			continue;

		if (((cur_pattern->flags & PERIPH_MATCH_LUN) != 0)
		 && ((periph->path->device == NULL)
		 || (cur_pattern->target_lun != periph->path->device->lun_id)))
			continue;

		if (((cur_pattern->flags & PERIPH_MATCH_UNIT) != 0)
		 && (cur_pattern->unit_number != periph->unit_number))
			continue;

		if (((cur_pattern->flags & PERIPH_MATCH_NAME) != 0)
		 && (strncmp(cur_pattern->periph_name, periph->periph_name,
			     DEV_IDLEN) != 0))
			continue;

		/*
		 * If we get to this point, the user definitely wants
		 * information on this peripheral.  So tell the caller to
		 * copy the data out.
		 */
		retval |= DM_RET_COPY;

		/*
		 * The return action has already been set to stop, since
		 * peripherals don't have any nodes below them in the EDT.
		 */
		return(retval);
	}

	/*
	 * If we get to this point, the peripheral that was passed in
	 * doesn't match any of the patterns.
	 */
	return(retval);
}

/*
 * Bus-level callback of the EDT match traversal.  "arg" is the
 * struct ccb_dev_match being filled in.  Returns 0 to stop the traversal
 * (the status field of the CCB has then been set) and non-zero to continue.
 */
static int
xptedtbusfunc(struct cam_eb *bus, void *arg)
{
	struct ccb_dev_match *cdm;
	struct cam_et *target;
	dev_match_ret retval;

	cdm = (struct ccb_dev_match *)arg;

	/*
	 * If our position is for something deeper in the tree, that means
	 * that we've already seen this node.  So, we keep going down.
*/ if ((cdm->pos.position_type & CAM_DEV_POS_BUS) && (cdm->pos.cookie.bus == bus) && (cdm->pos.position_type & CAM_DEV_POS_TARGET) && (cdm->pos.cookie.target != NULL)) retval = DM_RET_DESCEND; else retval = xptbusmatch(cdm->patterns, cdm->num_patterns, bus); /* * If we got an error, bail out of the search. */ if ((retval & DM_RET_ACTION_MASK) == DM_RET_ERROR) { cdm->status = CAM_DEV_MATCH_ERROR; return(0); } /* * If the copy flag is set, copy this bus out. */ if (retval & DM_RET_COPY) { int spaceleft, j; spaceleft = cdm->match_buf_len - (cdm->num_matches * sizeof(struct dev_match_result)); /* * If we don't have enough space to put in another * match result, save our position and tell the * user there are more devices to check. */ if (spaceleft < sizeof(struct dev_match_result)) { bzero(&cdm->pos, sizeof(cdm->pos)); cdm->pos.position_type = CAM_DEV_POS_EDT | CAM_DEV_POS_BUS; cdm->pos.cookie.bus = bus; cdm->pos.generations[CAM_BUS_GENERATION]= xsoftc.bus_generation; cdm->status = CAM_DEV_MATCH_MORE; return(0); } j = cdm->num_matches; cdm->num_matches++; cdm->matches[j].type = DEV_MATCH_BUS; cdm->matches[j].result.bus_result.path_id = bus->path_id; cdm->matches[j].result.bus_result.bus_id = bus->sim->bus_id; cdm->matches[j].result.bus_result.unit_number = bus->sim->unit_number; strncpy(cdm->matches[j].result.bus_result.dev_name, bus->sim->sim_name, DEV_IDLEN); } /* * If the user is only interested in busses, there's no * reason to descend to the next level in the tree. */ if ((retval & DM_RET_ACTION_MASK) == DM_RET_STOP) return(1); /* * If there is a target generation recorded, check it to * make sure the target list hasn't changed. 
*/
	mtx_lock(&bus->eb_mtx);
	if ((cdm->pos.position_type & CAM_DEV_POS_BUS)
	 && (cdm->pos.cookie.bus == bus)
	 && (cdm->pos.position_type & CAM_DEV_POS_TARGET)
	 && (cdm->pos.cookie.target != NULL)) {
		if ((cdm->pos.generations[CAM_TARGET_GENERATION] !=
		    bus->generation)) {
			mtx_unlock(&bus->eb_mtx);
			cdm->status = CAM_DEV_MATCH_LIST_CHANGED;
			return (0);
		}
		/* Resume at the saved target; take a reference for the walk. */
		target = (struct cam_et *)cdm->pos.cookie.target;
		target->refcount++;
	} else
		target = NULL;
	mtx_unlock(&bus->eb_mtx);

	return (xpttargettraverse(bus, target, xptedttargetfunc, arg));
}

/*
 * Target-level callback of the EDT match traversal.  "arg" is the
 * struct ccb_dev_match being filled in.  Targets themselves are never
 * matched; this only validates a saved position and walks the target's
 * devices.  Returns 0 to stop the traversal and non-zero to continue.
 */
static int
xptedttargetfunc(struct cam_et *target, void *arg)
{
	struct ccb_dev_match *cdm;
	struct cam_eb *bus;
	struct cam_ed *device;

	cdm = (struct ccb_dev_match *)arg;
	bus = target->bus;

	/*
	 * If there is a device list generation recorded, check it to
	 * make sure the device list hasn't changed.
	 */
	mtx_lock(&bus->eb_mtx);
	if ((cdm->pos.position_type & CAM_DEV_POS_BUS)
	 && (cdm->pos.cookie.bus == bus)
	 && (cdm->pos.position_type & CAM_DEV_POS_TARGET)
	 && (cdm->pos.cookie.target == target)
	 && (cdm->pos.position_type & CAM_DEV_POS_DEVICE)
	 && (cdm->pos.cookie.device != NULL)) {
		if (cdm->pos.generations[CAM_DEV_GENERATION] !=
		    target->generation) {
			mtx_unlock(&bus->eb_mtx);
			cdm->status = CAM_DEV_MATCH_LIST_CHANGED;
			return(0);
		}
		/* Resume at the saved device; take a reference for the walk. */
		device = (struct cam_ed *)cdm->pos.cookie.device;
		device->refcount++;
	} else
		device = NULL;
	mtx_unlock(&bus->eb_mtx);

	return (xptdevicetraverse(target, device, xptedtdevicefunc, arg));
}

/*
 * Device-level callback of the EDT match traversal.  "arg" is the
 * struct ccb_dev_match being filled in.  Returns 0 to stop the traversal
 * (the status field of the CCB has then been set) and non-zero to continue.
 */
static int
xptedtdevicefunc(struct cam_ed *device, void *arg)
{
	struct cam_eb *bus;
	struct cam_periph *periph;
	struct ccb_dev_match *cdm;
	dev_match_ret retval;

	cdm = (struct ccb_dev_match *)arg;
	bus = device->target->bus;

	/*
	 * If our position is for something deeper in the tree, that means
	 * that we've already seen this node.  So, we keep going down.
*/
	if ((cdm->pos.position_type & CAM_DEV_POS_DEVICE)
	 && (cdm->pos.cookie.device == device)
	 && (cdm->pos.position_type & CAM_DEV_POS_PERIPH)
	 && (cdm->pos.cookie.periph != NULL))
		retval = DM_RET_DESCEND;
	else
		retval = xptdevicematch(cdm->patterns, cdm->num_patterns,
					device);

	/* A pattern error aborts the whole search. */
	if ((retval & DM_RET_ACTION_MASK) == DM_RET_ERROR) {
		cdm->status = CAM_DEV_MATCH_ERROR;
		return(0);
	}

	/*
	 * If the copy flag is set, copy this device out.
	 */
	if (retval & DM_RET_COPY) {
		int spaceleft, j;

		spaceleft = cdm->match_buf_len - (cdm->num_matches *
			sizeof(struct dev_match_result));

		/*
		 * If we don't have enough space to put in another
		 * match result, save our position and tell the
		 * user there are more devices to check.
		 */
		if (spaceleft < sizeof(struct dev_match_result)) {
			bzero(&cdm->pos, sizeof(cdm->pos));
			cdm->pos.position_type =
				CAM_DEV_POS_EDT | CAM_DEV_POS_BUS |
				CAM_DEV_POS_TARGET | CAM_DEV_POS_DEVICE;

			/*
			 * Record each level's node together with the
			 * generation of the list that contains it, so a
			 * later call can detect list changes.
			 */
			cdm->pos.cookie.bus = device->target->bus;
			cdm->pos.generations[CAM_BUS_GENERATION]=
				xsoftc.bus_generation;
			cdm->pos.cookie.target = device->target;
			cdm->pos.generations[CAM_TARGET_GENERATION] =
				device->target->bus->generation;
			cdm->pos.cookie.device = device;
			cdm->pos.generations[CAM_DEV_GENERATION] =
				device->target->generation;
			cdm->status = CAM_DEV_MATCH_MORE;
			return(0);
		}
		j = cdm->num_matches;
		cdm->num_matches++;
		cdm->matches[j].type = DEV_MATCH_DEVICE;
		cdm->matches[j].result.device_result.path_id =
			device->target->bus->path_id;
		cdm->matches[j].result.device_result.target_id =
			device->target->target_id;
		cdm->matches[j].result.device_result.target_lun =
			device->lun_id;
		cdm->matches[j].result.device_result.protocol =
			device->protocol;
		bcopy(&device->inq_data,
		      &cdm->matches[j].result.device_result.inq_data,
		      sizeof(struct scsi_inquiry_data));
		bcopy(&device->ident_data,
		      &cdm->matches[j].result.device_result.ident_data,
		      sizeof(struct ata_params));

		/* Let the user know whether this device is unconfigured */
		if (device->flags & CAM_DEV_UNCONFIGURED)
			cdm->matches[j].result.device_result.flags
= DEV_RESULT_UNCONFIGURED;
		else
			cdm->matches[j].result.device_result.flags =
				DEV_RESULT_NOFLAG;
	}

	/*
	 * If the user isn't interested in peripherals, don't descend
	 * the tree any further.
	 */
	if ((retval & DM_RET_ACTION_MASK) == DM_RET_STOP)
		return(1);

	/*
	 * If there is a peripheral list generation recorded, make sure
	 * it hasn't changed.
	 */
	xpt_lock_buses();
	mtx_lock(&bus->eb_mtx);
	if ((cdm->pos.position_type & CAM_DEV_POS_BUS)
	 && (cdm->pos.cookie.bus == bus)
	 && (cdm->pos.position_type & CAM_DEV_POS_TARGET)
	 && (cdm->pos.cookie.target == device->target)
	 && (cdm->pos.position_type & CAM_DEV_POS_DEVICE)
	 && (cdm->pos.cookie.device == device)
	 && (cdm->pos.position_type & CAM_DEV_POS_PERIPH)
	 && (cdm->pos.cookie.periph != NULL)) {
		if (cdm->pos.generations[CAM_PERIPH_GENERATION] !=
		    device->generation) {
			mtx_unlock(&bus->eb_mtx);
			xpt_unlock_buses();
			cdm->status = CAM_DEV_MATCH_LIST_CHANGED;
			return(0);
		}
		/* Resume at the saved periph; take a reference for the walk. */
		periph = (struct cam_periph *)cdm->pos.cookie.periph;
		periph->refcount++;
	} else
		periph = NULL;
	mtx_unlock(&bus->eb_mtx);
	xpt_unlock_buses();

	return (xptperiphtraverse(device, periph, xptedtperiphfunc, arg));
}

/*
 * Peripheral-level callback of the EDT match traversal.  "arg" is the
 * struct ccb_dev_match being filled in.  Returns 0 to stop the traversal
 * (the status field of the CCB has then been set) and 1 to continue.
 */
static int
xptedtperiphfunc(struct cam_periph *periph, void *arg)
{
	struct ccb_dev_match *cdm;
	dev_match_ret retval;

	cdm = (struct ccb_dev_match *)arg;

	retval = xptperiphmatch(cdm->patterns, cdm->num_patterns, periph);

	/* A pattern error aborts the whole search. */
	if ((retval & DM_RET_ACTION_MASK) == DM_RET_ERROR) {
		cdm->status = CAM_DEV_MATCH_ERROR;
		return(0);
	}

	/*
	 * If the copy flag is set, copy this peripheral out.
	 */
	if (retval & DM_RET_COPY) {
		int spaceleft, j;

		spaceleft = cdm->match_buf_len - (cdm->num_matches *
			sizeof(struct dev_match_result));

		/*
		 * If we don't have enough space to put in another
		 * match result, save our position and tell the
		 * user there are more devices to check.
*/ if (spaceleft < sizeof(struct dev_match_result)) { bzero(&cdm->pos, sizeof(cdm->pos)); cdm->pos.position_type = CAM_DEV_POS_EDT | CAM_DEV_POS_BUS | CAM_DEV_POS_TARGET | CAM_DEV_POS_DEVICE | CAM_DEV_POS_PERIPH; cdm->pos.cookie.bus = periph->path->bus; cdm->pos.generations[CAM_BUS_GENERATION]= xsoftc.bus_generation; cdm->pos.cookie.target = periph->path->target; cdm->pos.generations[CAM_TARGET_GENERATION] = periph->path->bus->generation; cdm->pos.cookie.device = periph->path->device; cdm->pos.generations[CAM_DEV_GENERATION] = periph->path->target->generation; cdm->pos.cookie.periph = periph; cdm->pos.generations[CAM_PERIPH_GENERATION] = periph->path->device->generation; cdm->status = CAM_DEV_MATCH_MORE; return(0); } j = cdm->num_matches; cdm->num_matches++; cdm->matches[j].type = DEV_MATCH_PERIPH; cdm->matches[j].result.periph_result.path_id = periph->path->bus->path_id; cdm->matches[j].result.periph_result.target_id = periph->path->target->target_id; cdm->matches[j].result.periph_result.target_lun = periph->path->device->lun_id; cdm->matches[j].result.periph_result.unit_number = periph->unit_number; strncpy(cdm->matches[j].result.periph_result.periph_name, periph->periph_name, DEV_IDLEN); } return(1); } static int xptedtmatch(struct ccb_dev_match *cdm) { struct cam_eb *bus; int ret; cdm->num_matches = 0; /* * Check the bus list generation. If it has changed, the user * needs to reset everything and start over. */ xpt_lock_buses(); if ((cdm->pos.position_type & CAM_DEV_POS_BUS) && (cdm->pos.cookie.bus != NULL)) { if (cdm->pos.generations[CAM_BUS_GENERATION] != xsoftc.bus_generation) { xpt_unlock_buses(); cdm->status = CAM_DEV_MATCH_LIST_CHANGED; return(0); } bus = (struct cam_eb *)cdm->pos.cookie.bus; bus->refcount++; } else bus = NULL; xpt_unlock_buses(); ret = xptbustraverse(bus, xptedtbusfunc, cdm); /* * If we get back 0, that means that we had to stop before fully * traversing the EDT. 
It also means that one of the subroutines
	 * has set the status field to the proper value.  If we get back 1,
	 * we've fully traversed the EDT and copied out any matching entries.
	 */
	if (ret == 1)
		cdm->status = CAM_DEV_MATCH_LAST;

	return(ret);
}

/*
 * Peripheral-driver-level callback of the peripheral list match traversal.
 * "arg" is the struct ccb_dev_match being filled in.  Validates a saved
 * position against the driver's generation and walks the driver's
 * peripherals.  Returns 0 to stop the traversal and non-zero to continue.
 */
static int
xptplistpdrvfunc(struct periph_driver **pdrv, void *arg)
{
	struct cam_periph *periph;
	struct ccb_dev_match *cdm;

	cdm = (struct ccb_dev_match *)arg;

	xpt_lock_buses();
	if ((cdm->pos.position_type & CAM_DEV_POS_PDPTR)
	 && (cdm->pos.cookie.pdrv == pdrv)
	 && (cdm->pos.position_type & CAM_DEV_POS_PERIPH)
	 && (cdm->pos.cookie.periph != NULL)) {
		if (cdm->pos.generations[CAM_PERIPH_GENERATION] !=
		    (*pdrv)->generation) {
			xpt_unlock_buses();
			cdm->status = CAM_DEV_MATCH_LIST_CHANGED;
			return(0);
		}
		/* Resume at the saved periph; take a reference for the walk. */
		periph = (struct cam_periph *)cdm->pos.cookie.periph;
		periph->refcount++;
	} else
		periph = NULL;
	xpt_unlock_buses();

	return (xptpdperiphtraverse(pdrv, periph, xptplistperiphfunc, arg));
}

/*
 * Peripheral-level callback of the peripheral list match traversal.
 * "arg" is the struct ccb_dev_match being filled in.  Returns 0 to stop the
 * traversal (the status field of the CCB has then been set) and 1 to
 * continue.
 */
static int
xptplistperiphfunc(struct cam_periph *periph, void *arg)
{
	struct ccb_dev_match *cdm;
	dev_match_ret retval;

	cdm = (struct ccb_dev_match *)arg;

	retval = xptperiphmatch(cdm->patterns, cdm->num_patterns, periph);

	/* A pattern error aborts the whole search. */
	if ((retval & DM_RET_ACTION_MASK) == DM_RET_ERROR) {
		cdm->status = CAM_DEV_MATCH_ERROR;
		return(0);
	}

	/*
	 * If the copy flag is set, copy this peripheral out.
	 */
	if (retval & DM_RET_COPY) {
		int spaceleft, j;

		spaceleft = cdm->match_buf_len - (cdm->num_matches *
			sizeof(struct dev_match_result));

		/*
		 * If we don't have enough space to put in another
		 * match result, save our position and tell the
		 * user there are more devices to check.
		 */
		if (spaceleft < sizeof(struct dev_match_result)) {
			struct periph_driver **pdrv;

			pdrv = NULL;
			bzero(&cdm->pos, sizeof(cdm->pos));

			cdm->pos.position_type =
				CAM_DEV_POS_PDRV | CAM_DEV_POS_PDPTR |
				CAM_DEV_POS_PERIPH;

			/*
			 * This may look a bit non-sensical, but it is
			 * actually quite logical.
There are very few * peripheral drivers, and bloating every peripheral * structure with a pointer back to its parent * peripheral driver linker set entry would cost * more in the long run than doing this quick lookup. */ for (pdrv = periph_drivers; *pdrv != NULL; pdrv++) { if (strcmp((*pdrv)->driver_name, periph->periph_name) == 0) break; } if (*pdrv == NULL) { cdm->status = CAM_DEV_MATCH_ERROR; return(0); } cdm->pos.cookie.pdrv = pdrv; /* * The periph generation slot does double duty, as * does the periph pointer slot. They are used for * both edt and pdrv lookups and positioning. */ cdm->pos.cookie.periph = periph; cdm->pos.generations[CAM_PERIPH_GENERATION] = (*pdrv)->generation; cdm->status = CAM_DEV_MATCH_MORE; return(0); } j = cdm->num_matches; cdm->num_matches++; cdm->matches[j].type = DEV_MATCH_PERIPH; cdm->matches[j].result.periph_result.path_id = periph->path->bus->path_id; /* * The transport layer peripheral doesn't have a target or * lun. */ if (periph->path->target) cdm->matches[j].result.periph_result.target_id = periph->path->target->target_id; else cdm->matches[j].result.periph_result.target_id = CAM_TARGET_WILDCARD; if (periph->path->device) cdm->matches[j].result.periph_result.target_lun = periph->path->device->lun_id; else cdm->matches[j].result.periph_result.target_lun = CAM_LUN_WILDCARD; cdm->matches[j].result.periph_result.unit_number = periph->unit_number; strncpy(cdm->matches[j].result.periph_result.periph_name, periph->periph_name, DEV_IDLEN); } return(1); } static int xptperiphlistmatch(struct ccb_dev_match *cdm) { int ret; cdm->num_matches = 0; /* * At this point in the edt traversal function, we check the bus * list generation to make sure that no busses have been added or * removed since the user last sent a XPT_DEV_MATCH ccb through. 
* For the peripheral driver list traversal function, however, we
	 * don't have to worry about new peripheral driver types coming or
	 * going; they're in a linker set, and therefore can't change
	 * without a recompile.
	 */

	if ((cdm->pos.position_type & CAM_DEV_POS_PDPTR)
	 && (cdm->pos.cookie.pdrv != NULL))
		ret = xptpdrvtraverse(
				(struct periph_driver **)cdm->pos.cookie.pdrv,
				xptplistpdrvfunc, cdm);
	else
		ret = xptpdrvtraverse(NULL, xptplistpdrvfunc, cdm);

	/*
	 * If we get back 0, that means that we had to stop before fully
	 * traversing the peripheral driver tree.  It also means that one of
	 * the subroutines has set the status field to the proper value.  If
	 * we get back 1, we've fully traversed the EDT and copied out any
	 * matching entries.
	 */
	if (ret == 1)
		cdm->status = CAM_DEV_MATCH_LAST;

	return(ret);
}

/*
 * Call tr_func(bus, arg) for each bus, starting at "start_bus" or, when
 * that is NULL, at the first registered bus.
 *
 * A non-NULL start_bus must already carry a reference owned by this walk;
 * it is released here.  The bus being visited is kept referenced across
 * the callback, and the reference on the next bus is taken under the bus
 * list lock before the current one is released.  The walk stops as soon as
 * tr_func returns 0.  Returns the last callback result, or 1 when there
 * was nothing to visit.
 */
static int
xptbustraverse(struct cam_eb *start_bus, xpt_busfunc_t *tr_func, void *arg)
{
	struct cam_eb *bus, *next_bus;
	int retval;

	retval = 1;
	if (start_bus)
		bus = start_bus;
	else {
		xpt_lock_buses();
		bus = TAILQ_FIRST(&xsoftc.xpt_busses);
		if (bus == NULL) {
			xpt_unlock_buses();
			return (retval);
		}
		bus->refcount++;
		xpt_unlock_buses();
	}
	for (; bus != NULL; bus = next_bus) {
		retval = tr_func(bus, arg);
		if (retval == 0) {
			xpt_release_bus(bus);
			break;
		}
		xpt_lock_buses();
		next_bus = TAILQ_NEXT(bus, links);
		if (next_bus)
			next_bus->refcount++;
		xpt_unlock_buses();
		xpt_release_bus(bus);
	}
	return(retval);
}

/*
 * Call tr_func(target, arg) for each target of "bus", starting at
 * "start_target" or, when that is NULL, at the bus's first target.
 *
 * Reference handling mirrors the bus walk: a non-NULL start_target must
 * already be referenced, the next target is referenced under the bus's
 * eb_mtx before the current one is released, and the walk stops as soon as
 * tr_func returns 0.  Returns the last callback result, or 1 when there
 * was nothing to visit.
 */
static int
xpttargettraverse(struct cam_eb *bus, struct cam_et *start_target,
		  xpt_targetfunc_t *tr_func, void *arg)
{
	struct cam_et *target, *next_target;
	int retval;

	retval = 1;
	if (start_target)
		target = start_target;
	else {
		mtx_lock(&bus->eb_mtx);
		target = TAILQ_FIRST(&bus->et_entries);
		if (target == NULL) {
			mtx_unlock(&bus->eb_mtx);
			return (retval);
		}
		target->refcount++;
		mtx_unlock(&bus->eb_mtx);
	}
	for (; target != NULL; target = next_target) {
		retval = tr_func(target, arg);
		if (retval == 0) {
			xpt_release_target(target);
			break;
		}
		mtx_lock(&bus->eb_mtx);
		next_target =
TAILQ_NEXT(target, links);
		if (next_target)
			next_target->refcount++;
		mtx_unlock(&bus->eb_mtx);
		xpt_release_target(target);
	}
	return(retval);
}

/*
 * Call tr_func(device, arg) for each device of "target", starting at
 * "start_device" or, when that is NULL, at the target's first device.
 *
 * The callback runs with the device's device_mtx held.  A non-NULL
 * start_device must already be referenced; the next device is referenced
 * under the bus's eb_mtx before the current one is released.  The walk
 * stops as soon as tr_func returns 0.  Returns the last callback result,
 * or 1 when there was nothing to visit.
 */
static int
xptdevicetraverse(struct cam_et *target, struct cam_ed *start_device,
		  xpt_devicefunc_t *tr_func, void *arg)
{
	struct cam_eb *bus;
	struct cam_ed *device, *next_device;
	int retval;

	retval = 1;
	bus = target->bus;
	if (start_device)
		device = start_device;
	else {
		mtx_lock(&bus->eb_mtx);
		device = TAILQ_FIRST(&target->ed_entries);
		if (device == NULL) {
			mtx_unlock(&bus->eb_mtx);
			return (retval);
		}
		device->refcount++;
		mtx_unlock(&bus->eb_mtx);
	}
	for (; device != NULL; device = next_device) {
		mtx_lock(&device->device_mtx);
		retval = tr_func(device, arg);
		mtx_unlock(&device->device_mtx);
		if (retval == 0) {
			xpt_release_device(device);
			break;
		}
		mtx_lock(&bus->eb_mtx);
		next_device = TAILQ_NEXT(device, links);
		if (next_device)
			next_device->refcount++;
		mtx_unlock(&bus->eb_mtx);
		xpt_release_device(device);
	}
	return(retval);
}

/*
 * Call tr_func(periph, arg) for each peripheral attached to "device",
 * starting at "start_periph" or, when that is NULL, at the first
 * peripheral that is not flagged CAM_PERIPH_FREE.
 *
 * Peripherals flagged CAM_PERIPH_FREE are skipped.  The list is examined
 * with both the bus list lock and the bus's eb_mtx held.  A non-NULL
 * start_periph must already be referenced; visited peripherals are
 * released with cam_periph_release_locked().  The walk stops as soon as
 * tr_func returns 0.  Returns the last callback result, or 1 when there
 * was nothing to visit.
 */
static int
xptperiphtraverse(struct cam_ed *device, struct cam_periph *start_periph,
		  xpt_periphfunc_t *tr_func, void *arg)
{
	struct cam_eb *bus;
	struct cam_periph *periph, *next_periph;
	int retval;

	retval = 1;

	bus = device->target->bus;
	if (start_periph)
		periph = start_periph;
	else {
		xpt_lock_buses();
		mtx_lock(&bus->eb_mtx);
		periph = SLIST_FIRST(&device->periphs);
		while (periph != NULL &&
		    (periph->flags & CAM_PERIPH_FREE) != 0)
			periph = SLIST_NEXT(periph, periph_links);
		if (periph == NULL) {
			mtx_unlock(&bus->eb_mtx);
			xpt_unlock_buses();
			return (retval);
		}
		periph->refcount++;
		mtx_unlock(&bus->eb_mtx);
		xpt_unlock_buses();
	}
	for (; periph != NULL; periph = next_periph) {
		retval = tr_func(periph, arg);
		if (retval == 0) {
			cam_periph_release_locked(periph);
			break;
		}
		xpt_lock_buses();
		mtx_lock(&bus->eb_mtx);
		next_periph = SLIST_NEXT(periph, periph_links);
		while (next_periph != NULL &&
		    (next_periph->flags & CAM_PERIPH_FREE) != 0)
			next_periph = SLIST_NEXT(next_periph, periph_links);
		if (next_periph)
next_periph->refcount++;
		mtx_unlock(&bus->eb_mtx);
		xpt_unlock_buses();
		cam_periph_release_locked(periph);
	}
	return(retval);
}

/*
 * Call tr_func(pdrv, arg) for each entry of the NULL-terminated peripheral
 * driver array, starting at "start_pdrv" or, when that is NULL, at
 * periph_drivers.  Stops as soon as tr_func returns 0.  Returns the last
 * callback result, or 1 when there was nothing to visit.
 */
static int
xptpdrvtraverse(struct periph_driver **start_pdrv,
		xpt_pdrvfunc_t *tr_func, void *arg)
{
	struct periph_driver **pdrv;
	int retval;

	retval = 1;

	/*
	 * We don't traverse the peripheral driver list like we do the
	 * other lists, because it is a linker set, and therefore cannot be
	 * changed during runtime.  If the peripheral driver list is ever
	 * re-done to be something other than a linker set (i.e. it can
	 * change while the system is running), the list traversal should
	 * be modified to work like the other traversal functions.
	 */
	for (pdrv = (start_pdrv ? start_pdrv : periph_drivers);
	     *pdrv != NULL; pdrv++) {
		retval = tr_func(pdrv, arg);

		if (retval == 0)
			return(retval);
	}

	return(retval);
}

/*
 * Call tr_func(periph, arg) for each peripheral instance of the driver
 * "*pdrv", starting at "start_periph" or, when that is NULL, at the first
 * unit that is not flagged CAM_PERIPH_FREE.
 *
 * Peripherals flagged CAM_PERIPH_FREE are skipped.  The callback runs with
 * the peripheral locked.  A non-NULL start_periph must already be
 * referenced; the next peripheral is referenced under the bus list lock
 * before the current one is released.  The walk stops as soon as tr_func
 * returns 0.  Returns the last callback result, or 1 when there was
 * nothing to visit.
 */
static int
xptpdperiphtraverse(struct periph_driver **pdrv,
		    struct cam_periph *start_periph,
		    xpt_periphfunc_t *tr_func, void *arg)
{
	struct cam_periph *periph, *next_periph;
	int retval;

	retval = 1;

	if (start_periph)
		periph = start_periph;
	else {
		xpt_lock_buses();
		periph = TAILQ_FIRST(&(*pdrv)->units);
		while (periph != NULL &&
		    (periph->flags & CAM_PERIPH_FREE) != 0)
			periph = TAILQ_NEXT(periph, unit_links);
		if (periph == NULL) {
			xpt_unlock_buses();
			return (retval);
		}
		periph->refcount++;
		xpt_unlock_buses();
	}
	for (; periph != NULL; periph = next_periph) {
		cam_periph_lock(periph);
		retval = tr_func(periph, arg);
		cam_periph_unlock(periph);
		if (retval == 0) {
			cam_periph_release(periph);
			break;
		}
		xpt_lock_buses();
		next_periph = TAILQ_NEXT(periph, unit_links);
		while (next_periph != NULL &&
		    (next_periph->flags & CAM_PERIPH_FREE) != 0)
			next_periph = TAILQ_NEXT(next_periph, unit_links);
		if (next_periph)
			next_periph->refcount++;
		xpt_unlock_buses();
		cam_periph_release(periph);
	}
	return(retval);
}

/*
 * Default bus-level step of a generic EDT walk.  "arg" is the
 * struct xpt_traverse_config describing the walk: when its depth is the
 * bus level the user's function is called on this bus, otherwise the walk
 * continues into the bus's targets.
 */
static int
xptdefbusfunc(struct cam_eb *bus, void *arg)
{
	struct xpt_traverse_config *tr_config;

	tr_config = (struct xpt_traverse_config *)arg;

	if (tr_config->depth ==
XPT_DEPTH_BUS) { xpt_busfunc_t *tr_func; tr_func = (xpt_busfunc_t *)tr_config->tr_func; return(tr_func(bus, tr_config->tr_arg)); } else return(xpttargettraverse(bus, NULL, xptdeftargetfunc, arg)); } static int xptdeftargetfunc(struct cam_et *target, void *arg) { struct xpt_traverse_config *tr_config; tr_config = (struct xpt_traverse_config *)arg; if (tr_config->depth == XPT_DEPTH_TARGET) { xpt_targetfunc_t *tr_func; tr_func = (xpt_targetfunc_t *)tr_config->tr_func; return(tr_func(target, tr_config->tr_arg)); } else return(xptdevicetraverse(target, NULL, xptdefdevicefunc, arg)); } static int xptdefdevicefunc(struct cam_ed *device, void *arg) { struct xpt_traverse_config *tr_config; tr_config = (struct xpt_traverse_config *)arg; if (tr_config->depth == XPT_DEPTH_DEVICE) { xpt_devicefunc_t *tr_func; tr_func = (xpt_devicefunc_t *)tr_config->tr_func; return(tr_func(device, tr_config->tr_arg)); } else return(xptperiphtraverse(device, NULL, xptdefperiphfunc, arg)); } static int xptdefperiphfunc(struct cam_periph *periph, void *arg) { struct xpt_traverse_config *tr_config; xpt_periphfunc_t *tr_func; tr_config = (struct xpt_traverse_config *)arg; tr_func = (xpt_periphfunc_t *)tr_config->tr_func; /* * Unlike the other default functions, we don't check for depth * here. The peripheral driver level is the last level in the EDT, * so if we're here, we should execute the function in question. */ return(tr_func(periph, tr_config->tr_arg)); } /* * Execute the given function for every bus in the EDT. */ static int xpt_for_all_busses(xpt_busfunc_t *tr_func, void *arg) { struct xpt_traverse_config tr_config; tr_config.depth = XPT_DEPTH_BUS; tr_config.tr_func = tr_func; tr_config.tr_arg = arg; return(xptbustraverse(NULL, xptdefbusfunc, &tr_config)); } /* * Execute the given function for every device in the EDT. 
*/ static int xpt_for_all_devices(xpt_devicefunc_t *tr_func, void *arg) { struct xpt_traverse_config tr_config; tr_config.depth = XPT_DEPTH_DEVICE; tr_config.tr_func = tr_func; tr_config.tr_arg = arg; return(xptbustraverse(NULL, xptdefbusfunc, &tr_config)); } static int xptsetasyncfunc(struct cam_ed *device, void *arg) { struct cam_path path; struct ccb_getdev cgd; struct ccb_setasync *csa = (struct ccb_setasync *)arg; /* * Don't report unconfigured devices (Wildcard devs, * devices only for target mode, device instances * that have been invalidated but are waiting for * their last reference count to be released). */ if ((device->flags & CAM_DEV_UNCONFIGURED) != 0) return (1); xpt_compile_path(&path, NULL, device->target->bus->path_id, device->target->target_id, device->lun_id); xpt_setup_ccb(&cgd.ccb_h, &path, CAM_PRIORITY_NORMAL); cgd.ccb_h.func_code = XPT_GDEV_TYPE; xpt_action((union ccb *)&cgd); csa->callback(csa->callback_arg, AC_FOUND_DEVICE, &path, &cgd); xpt_release_path(&path); return(1); } static int xptsetasyncbusfunc(struct cam_eb *bus, void *arg) { struct cam_path path; struct ccb_pathinq cpi; struct ccb_setasync *csa = (struct ccb_setasync *)arg; xpt_compile_path(&path, /*periph*/NULL, bus->path_id, CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD); xpt_path_lock(&path); xpt_setup_ccb(&cpi.ccb_h, &path, CAM_PRIORITY_NORMAL); cpi.ccb_h.func_code = XPT_PATH_INQ; xpt_action((union ccb *)&cpi); csa->callback(csa->callback_arg, AC_PATH_REGISTERED, &path, &cpi); xpt_path_unlock(&path); xpt_release_path(&path); return(1); } void xpt_action(union ccb *start_ccb) { CAM_DEBUG(start_ccb->ccb_h.path, CAM_DEBUG_TRACE, ("xpt_action: func %#x %s\n", start_ccb->ccb_h.func_code, xpt_action_name(start_ccb->ccb_h.func_code))); start_ccb->ccb_h.status = CAM_REQ_INPROG; (*(start_ccb->ccb_h.path->bus->xport->ops->action))(start_ccb); } void xpt_action_default(union ccb *start_ccb) { struct cam_path *path; struct cam_sim *sim; int lock; path = start_ccb->ccb_h.path; CAM_DEBUG(path, 
CAM_DEBUG_TRACE, ("xpt_action_default: func %#x %s\n", start_ccb->ccb_h.func_code, xpt_action_name(start_ccb->ccb_h.func_code))); switch (start_ccb->ccb_h.func_code) { case XPT_SCSI_IO: { struct cam_ed *device; /* * For the sake of compatibility with SCSI-1 * devices that may not understand the identify * message, we include lun information in the * second byte of all commands. SCSI-1 specifies * that luns are a 3 bit value and reserves only 3 * bits for lun information in the CDB. Later * revisions of the SCSI spec allow for more than 8 * luns, but have deprecated lun information in the * CDB. So, if the lun won't fit, we must omit. * * Also be aware that during initial probing for devices, * the inquiry information is unknown but initialized to 0. * This means that this code will be exercised while probing * devices with an ANSI revision greater than 2. */ device = path->device; if (device->protocol_version <= SCSI_REV_2 && start_ccb->ccb_h.target_lun < 8 && (start_ccb->ccb_h.flags & CAM_CDB_POINTER) == 0) { start_ccb->csio.cdb_io.cdb_bytes[1] |= start_ccb->ccb_h.target_lun << 5; } start_ccb->csio.scsi_status = SCSI_STATUS_OK; } /* FALLTHROUGH */ case XPT_TARGET_IO: case XPT_CONT_TARGET_IO: start_ccb->csio.sense_resid = 0; start_ccb->csio.resid = 0; /* FALLTHROUGH */ case XPT_ATA_IO: if (start_ccb->ccb_h.func_code == XPT_ATA_IO) start_ccb->ataio.resid = 0; /* FALLTHROUGH */ case XPT_NVME_IO: if (start_ccb->ccb_h.func_code == XPT_NVME_IO) start_ccb->nvmeio.resid = 0; /* FALLTHROUGH */ case XPT_RESET_DEV: case XPT_ENG_EXEC: case XPT_SMP_IO: { struct cam_devq *devq; devq = path->bus->sim->devq; mtx_lock(&devq->send_mtx); cam_ccbq_insert_ccb(&path->device->ccbq, start_ccb); if (xpt_schedule_devq(devq, path->device) != 0) xpt_run_devq(devq); mtx_unlock(&devq->send_mtx); break; } case XPT_CALC_GEOMETRY: /* Filter out garbage */ if (start_ccb->ccg.block_size == 0 || start_ccb->ccg.volume_size == 0) { start_ccb->ccg.cylinders = 0; start_ccb->ccg.heads = 0; 
start_ccb->ccg.secs_per_track = 0; start_ccb->ccb_h.status = CAM_REQ_CMP; break; } #if defined(PC98) || defined(__sparc64__) /* * In a PC-98 system, geometry translation depens on * the "real" device geometry obtained from mode page 4. * SCSI geometry translation is performed in the * initialization routine of the SCSI BIOS and the result * stored in host memory. If the translation is available * in host memory, use it. If not, rely on the default * translation the device driver performs. * For sparc64, we may need adjust the geometry of large * disks in order to fit the limitations of the 16-bit * fields of the VTOC8 disk label. */ if (scsi_da_bios_params(&start_ccb->ccg) != 0) { start_ccb->ccb_h.status = CAM_REQ_CMP; break; } #endif goto call_sim; case XPT_ABORT: { union ccb* abort_ccb; abort_ccb = start_ccb->cab.abort_ccb; if (XPT_FC_IS_DEV_QUEUED(abort_ccb)) { struct cam_ed *device; struct cam_devq *devq; device = abort_ccb->ccb_h.path->device; devq = device->sim->devq; mtx_lock(&devq->send_mtx); if (abort_ccb->ccb_h.pinfo.index > 0) { cam_ccbq_remove_ccb(&device->ccbq, abort_ccb); abort_ccb->ccb_h.status = CAM_REQ_ABORTED|CAM_DEV_QFRZN; xpt_freeze_devq_device(device, 1); mtx_unlock(&devq->send_mtx); xpt_done(abort_ccb); start_ccb->ccb_h.status = CAM_REQ_CMP; break; } mtx_unlock(&devq->send_mtx); if (abort_ccb->ccb_h.pinfo.index == CAM_UNQUEUED_INDEX && (abort_ccb->ccb_h.status & CAM_SIM_QUEUED) == 0) { /* * We've caught this ccb en route to * the SIM. Flag it for abort and the * SIM will do so just before starting * real work on the CCB. */ abort_ccb->ccb_h.status = CAM_REQ_ABORTED|CAM_DEV_QFRZN; xpt_freeze_devq(abort_ccb->ccb_h.path, 1); start_ccb->ccb_h.status = CAM_REQ_CMP; break; } } if (XPT_FC_IS_QUEUED(abort_ccb) && (abort_ccb->ccb_h.pinfo.index == CAM_DONEQ_INDEX)) { /* * It's already completed but waiting * for our SWI to get to it. 
*/ start_ccb->ccb_h.status = CAM_UA_ABORT; break; } /* * If we weren't able to take care of the abort request * in the XPT, pass the request down to the SIM for processing. */ } /* FALLTHROUGH */ case XPT_ACCEPT_TARGET_IO: case XPT_EN_LUN: case XPT_IMMED_NOTIFY: case XPT_NOTIFY_ACK: case XPT_RESET_BUS: case XPT_IMMEDIATE_NOTIFY: case XPT_NOTIFY_ACKNOWLEDGE: case XPT_GET_SIM_KNOB_OLD: case XPT_GET_SIM_KNOB: case XPT_SET_SIM_KNOB: case XPT_GET_TRAN_SETTINGS: case XPT_SET_TRAN_SETTINGS: case XPT_PATH_INQ: call_sim: sim = path->bus->sim; lock = (mtx_owned(sim->mtx) == 0); if (lock) CAM_SIM_LOCK(sim); CAM_DEBUG(path, CAM_DEBUG_TRACE, ("sim->sim_action: func=%#x\n", start_ccb->ccb_h.func_code)); (*(sim->sim_action))(sim, start_ccb); CAM_DEBUG(path, CAM_DEBUG_TRACE, ("sim->sim_action: status=%#x\n", start_ccb->ccb_h.status)); if (lock) CAM_SIM_UNLOCK(sim); break; case XPT_PATH_STATS: start_ccb->cpis.last_reset = path->bus->last_reset; start_ccb->ccb_h.status = CAM_REQ_CMP; break; case XPT_GDEV_TYPE: { struct cam_ed *dev; dev = path->device; if ((dev->flags & CAM_DEV_UNCONFIGURED) != 0) { start_ccb->ccb_h.status = CAM_DEV_NOT_THERE; } else { struct ccb_getdev *cgd; cgd = &start_ccb->cgd; cgd->protocol = dev->protocol; cgd->inq_data = dev->inq_data; cgd->ident_data = dev->ident_data; cgd->inq_flags = dev->inq_flags; cgd->nvme_data = dev->nvme_data; cgd->nvme_cdata = dev->nvme_cdata; cgd->ccb_h.status = CAM_REQ_CMP; cgd->serial_num_len = dev->serial_num_len; if ((dev->serial_num_len > 0) && (dev->serial_num != NULL)) bcopy(dev->serial_num, cgd->serial_num, dev->serial_num_len); } break; } case XPT_GDEV_STATS: { struct cam_ed *dev; dev = path->device; if ((dev->flags & CAM_DEV_UNCONFIGURED) != 0) { start_ccb->ccb_h.status = CAM_DEV_NOT_THERE; } else { struct ccb_getdevstats *cgds; struct cam_eb *bus; struct cam_et *tar; struct cam_devq *devq; cgds = &start_ccb->cgds; bus = path->bus; tar = path->target; devq = bus->sim->devq; mtx_lock(&devq->send_mtx); cgds->dev_openings = 
dev->ccbq.dev_openings; cgds->dev_active = dev->ccbq.dev_active; cgds->allocated = dev->ccbq.allocated; cgds->queued = cam_ccbq_pending_ccb_count(&dev->ccbq); cgds->held = cgds->allocated - cgds->dev_active - cgds->queued; cgds->last_reset = tar->last_reset; cgds->maxtags = dev->maxtags; cgds->mintags = dev->mintags; if (timevalcmp(&tar->last_reset, &bus->last_reset, <)) cgds->last_reset = bus->last_reset; mtx_unlock(&devq->send_mtx); cgds->ccb_h.status = CAM_REQ_CMP; } break; } case XPT_GDEVLIST: { struct cam_periph *nperiph; struct periph_list *periph_head; struct ccb_getdevlist *cgdl; u_int i; struct cam_ed *device; int found; found = 0; /* * Don't want anyone mucking with our data. */ device = path->device; periph_head = &device->periphs; cgdl = &start_ccb->cgdl; /* * Check and see if the list has changed since the user * last requested a list member. If so, tell them that the * list has changed, and therefore they need to start over * from the beginning. */ if ((cgdl->index != 0) && (cgdl->generation != device->generation)) { cgdl->status = CAM_GDEVLIST_LIST_CHANGED; break; } /* * Traverse the list of peripherals and attempt to find * the requested peripheral. */ for (nperiph = SLIST_FIRST(periph_head), i = 0; (nperiph != NULL) && (i <= cgdl->index); nperiph = SLIST_NEXT(nperiph, periph_links), i++) { if (i == cgdl->index) { strncpy(cgdl->periph_name, nperiph->periph_name, DEV_IDLEN); cgdl->unit_number = nperiph->unit_number; found = 1; } } if (found == 0) { cgdl->status = CAM_GDEVLIST_ERROR; break; } if (nperiph == NULL) cgdl->status = CAM_GDEVLIST_LAST_DEVICE; else cgdl->status = CAM_GDEVLIST_MORE_DEVS; cgdl->index++; cgdl->generation = device->generation; cgdl->ccb_h.status = CAM_REQ_CMP; break; } case XPT_DEV_MATCH: { dev_pos_type position_type; struct ccb_dev_match *cdm; cdm = &start_ccb->cdm; /* * There are two ways of getting at information in the EDT. * The first way is via the primary EDT tree. 
It starts * with a list of busses, then a list of targets on a bus, * then devices/luns on a target, and then peripherals on a * device/lun. The "other" way is by the peripheral driver * lists. The peripheral driver lists are organized by * peripheral driver. (obviously) So it makes sense to * use the peripheral driver list if the user is looking * for something like "da1", or all "da" devices. If the * user is looking for something on a particular bus/target * or lun, it's generally better to go through the EDT tree. */ if (cdm->pos.position_type != CAM_DEV_POS_NONE) position_type = cdm->pos.position_type; else { u_int i; position_type = CAM_DEV_POS_NONE; for (i = 0; i < cdm->num_patterns; i++) { if ((cdm->patterns[i].type == DEV_MATCH_BUS) ||(cdm->patterns[i].type == DEV_MATCH_DEVICE)){ position_type = CAM_DEV_POS_EDT; break; } } if (cdm->num_patterns == 0) position_type = CAM_DEV_POS_EDT; else if (position_type == CAM_DEV_POS_NONE) position_type = CAM_DEV_POS_PDRV; } switch(position_type & CAM_DEV_POS_TYPEMASK) { case CAM_DEV_POS_EDT: xptedtmatch(cdm); break; case CAM_DEV_POS_PDRV: xptperiphlistmatch(cdm); break; default: cdm->status = CAM_DEV_MATCH_ERROR; break; } if (cdm->status == CAM_DEV_MATCH_ERROR) start_ccb->ccb_h.status = CAM_REQ_CMP_ERR; else start_ccb->ccb_h.status = CAM_REQ_CMP; break; } case XPT_SASYNC_CB: { struct ccb_setasync *csa; struct async_node *cur_entry; struct async_list *async_head; u_int32_t added; csa = &start_ccb->csa; added = csa->event_enable; async_head = &path->device->asyncs; /* * If there is already an entry for us, simply * update it. */ cur_entry = SLIST_FIRST(async_head); while (cur_entry != NULL) { if ((cur_entry->callback_arg == csa->callback_arg) && (cur_entry->callback == csa->callback)) break; cur_entry = SLIST_NEXT(cur_entry, links); } if (cur_entry != NULL) { /* * If the request has no flags set, * remove the entry. 
*/ added &= ~cur_entry->event_enable; if (csa->event_enable == 0) { SLIST_REMOVE(async_head, cur_entry, async_node, links); xpt_release_device(path->device); free(cur_entry, M_CAMXPT); } else { cur_entry->event_enable = csa->event_enable; } csa->event_enable = added; } else { cur_entry = malloc(sizeof(*cur_entry), M_CAMXPT, M_NOWAIT); if (cur_entry == NULL) { csa->ccb_h.status = CAM_RESRC_UNAVAIL; break; } cur_entry->event_enable = csa->event_enable; cur_entry->event_lock = mtx_owned(path->bus->sim->mtx) ? 1 : 0; cur_entry->callback_arg = csa->callback_arg; cur_entry->callback = csa->callback; SLIST_INSERT_HEAD(async_head, cur_entry, links); xpt_acquire_device(path->device); } start_ccb->ccb_h.status = CAM_REQ_CMP; break; } case XPT_REL_SIMQ: { struct ccb_relsim *crs; struct cam_ed *dev; crs = &start_ccb->crs; dev = path->device; if (dev == NULL) { crs->ccb_h.status = CAM_DEV_NOT_THERE; break; } if ((crs->release_flags & RELSIM_ADJUST_OPENINGS) != 0) { /* Don't ever go below one opening */ if (crs->openings > 0) { xpt_dev_ccbq_resize(path, crs->openings); if (bootverbose) { xpt_print(path, "number of openings is now %d\n", crs->openings); } } } mtx_lock(&dev->sim->devq->send_mtx); if ((crs->release_flags & RELSIM_RELEASE_AFTER_TIMEOUT) != 0) { if ((dev->flags & CAM_DEV_REL_TIMEOUT_PENDING) != 0) { /* * Just extend the old timeout and decrement * the freeze count so that a single timeout * is sufficient for releasing the queue. */ start_ccb->ccb_h.flags &= ~CAM_DEV_QFREEZE; callout_stop(&dev->callout); } else { start_ccb->ccb_h.flags |= CAM_DEV_QFREEZE; } callout_reset_sbt(&dev->callout, SBT_1MS * crs->release_timeout, 0, xpt_release_devq_timeout, dev, 0); dev->flags |= CAM_DEV_REL_TIMEOUT_PENDING; } if ((crs->release_flags & RELSIM_RELEASE_AFTER_CMDCMPLT) != 0) { if ((dev->flags & CAM_DEV_REL_ON_COMPLETE) != 0) { /* * Decrement the freeze count so that a single * completion is still sufficient to unfreeze * the queue. 
*/ start_ccb->ccb_h.flags &= ~CAM_DEV_QFREEZE; } else { dev->flags |= CAM_DEV_REL_ON_COMPLETE; start_ccb->ccb_h.flags |= CAM_DEV_QFREEZE; } } if ((crs->release_flags & RELSIM_RELEASE_AFTER_QEMPTY) != 0) { if ((dev->flags & CAM_DEV_REL_ON_QUEUE_EMPTY) != 0 || (dev->ccbq.dev_active == 0)) { start_ccb->ccb_h.flags &= ~CAM_DEV_QFREEZE; } else { dev->flags |= CAM_DEV_REL_ON_QUEUE_EMPTY; start_ccb->ccb_h.flags |= CAM_DEV_QFREEZE; } } mtx_unlock(&dev->sim->devq->send_mtx); if ((start_ccb->ccb_h.flags & CAM_DEV_QFREEZE) == 0) xpt_release_devq(path, /*count*/1, /*run_queue*/TRUE); start_ccb->crs.qfrozen_cnt = dev->ccbq.queue.qfrozen_cnt; start_ccb->ccb_h.status = CAM_REQ_CMP; break; } case XPT_DEBUG: { struct cam_path *oldpath; /* Check that all request bits are supported. */ if (start_ccb->cdbg.flags & ~(CAM_DEBUG_COMPILE)) { start_ccb->ccb_h.status = CAM_FUNC_NOTAVAIL; break; } cam_dflags = CAM_DEBUG_NONE; if (cam_dpath != NULL) { oldpath = cam_dpath; cam_dpath = NULL; xpt_free_path(oldpath); } if (start_ccb->cdbg.flags != CAM_DEBUG_NONE) { if (xpt_create_path(&cam_dpath, NULL, start_ccb->ccb_h.path_id, start_ccb->ccb_h.target_id, start_ccb->ccb_h.target_lun) != CAM_REQ_CMP) { start_ccb->ccb_h.status = CAM_RESRC_UNAVAIL; } else { cam_dflags = start_ccb->cdbg.flags; start_ccb->ccb_h.status = CAM_REQ_CMP; xpt_print(cam_dpath, "debugging flags now %x\n", cam_dflags); } } else start_ccb->ccb_h.status = CAM_REQ_CMP; break; } case XPT_NOOP: if ((start_ccb->ccb_h.flags & CAM_DEV_QFREEZE) != 0) xpt_freeze_devq(path, 1); start_ccb->ccb_h.status = CAM_REQ_CMP; break; case XPT_REPROBE_LUN: xpt_async(AC_INQ_CHANGED, path, NULL); start_ccb->ccb_h.status = CAM_REQ_CMP; xpt_done(start_ccb); break; default: case XPT_SDEV_TYPE: case XPT_TERM_IO: case XPT_ENG_INQ: /* XXX Implement */ xpt_print_path(start_ccb->ccb_h.path); printf("%s: CCB type %#x %s not supported\n", __func__, start_ccb->ccb_h.func_code, xpt_action_name(start_ccb->ccb_h.func_code)); start_ccb->ccb_h.status = 
CAM_PROVIDE_FAIL; if (start_ccb->ccb_h.func_code & XPT_FC_DEV_QUEUED) { xpt_done(start_ccb); } break; } CAM_DEBUG(path, CAM_DEBUG_TRACE, ("xpt_action_default: func= %#x %s status %#x\n", start_ccb->ccb_h.func_code, xpt_action_name(start_ccb->ccb_h.func_code), start_ccb->ccb_h.status)); } void xpt_polled_action(union ccb *start_ccb) { u_int32_t timeout; struct cam_sim *sim; struct cam_devq *devq; struct cam_ed *dev; timeout = start_ccb->ccb_h.timeout * 10; sim = start_ccb->ccb_h.path->bus->sim; devq = sim->devq; dev = start_ccb->ccb_h.path->device; mtx_unlock(&dev->device_mtx); /* * Steal an opening so that no other queued requests * can get it before us while we simulate interrupts. */ mtx_lock(&devq->send_mtx); dev->ccbq.dev_openings--; while((devq->send_openings <= 0 || dev->ccbq.dev_openings < 0) && (--timeout > 0)) { mtx_unlock(&devq->send_mtx); DELAY(100); CAM_SIM_LOCK(sim); (*(sim->sim_poll))(sim); CAM_SIM_UNLOCK(sim); camisr_runqueue(); mtx_lock(&devq->send_mtx); } dev->ccbq.dev_openings++; mtx_unlock(&devq->send_mtx); if (timeout != 0) { xpt_action(start_ccb); while(--timeout > 0) { CAM_SIM_LOCK(sim); (*(sim->sim_poll))(sim); CAM_SIM_UNLOCK(sim); camisr_runqueue(); if ((start_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_INPROG) break; DELAY(100); } if (timeout == 0) { /* * XXX Is it worth adding a sim_timeout entry * point so we can attempt recovery? If * this is only used for dumps, I don't think * it is. */ start_ccb->ccb_h.status = CAM_CMD_TIMEOUT; } } else { start_ccb->ccb_h.status = CAM_RESRC_UNAVAIL; } mtx_lock(&dev->device_mtx); } /* * Schedule a peripheral driver to receive a ccb when its * target device has space for more transactions. 
 */
void
xpt_schedule(struct cam_periph *periph, u_int32_t new_priority)
{

	CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("xpt_schedule\n"));
	cam_periph_assert(periph, MA_OWNED);
	/* Only a numerically lower priority value replaces the current one. */
	if (new_priority < periph->scheduled_priority) {
		periph->scheduled_priority = new_priority;
		xpt_run_allocq(periph, 0);
	}
}


/*
 * Schedule a device to run on a given queue.
 * If the device was inserted as a new entry on the queue,
 * return 1 meaning the device queue should be run. If we
 * were already queued, implying someone else has already
 * started the queue, return 0 so the caller doesn't attempt
 * to run the queue.
 */
static int
xpt_schedule_dev(struct camq *queue, cam_pinfo *pinfo,
		 u_int32_t new_priority)
{
	int retval;
	u_int32_t old_priority;

	CAM_DEBUG_PRINT(CAM_DEBUG_XPT, ("xpt_schedule_dev\n"));

	old_priority = pinfo->priority;

	/*
	 * Are we already queued?
	 */
	if (pinfo->index != CAM_UNQUEUED_INDEX) {
		/* Simply reorder based on new priority */
		if (new_priority < old_priority) {
			camq_change_priority(queue, pinfo->index,
					     new_priority);
			CAM_DEBUG_PRINT(CAM_DEBUG_XPT,
					("changed priority to %d\n",
					 new_priority));
			retval = 1;
		} else
			retval = 0;
	} else {
		/* New entry on the queue */
		if (new_priority < old_priority)
			pinfo->priority = new_priority;

		CAM_DEBUG_PRINT(CAM_DEBUG_XPT,
				("Inserting onto queue\n"));
		pinfo->generation = ++queue->generation;
		camq_insert(queue, pinfo);
		retval = 1;
	}
	return (retval);
}

/*
 * Taskqueue handler queued by xpt_run_allocq(): rerun the allocation
 * queue with sleeping allowed, then drop the reference that was taken
 * when the task was enqueued.
 */
static void
xpt_run_allocq_task(void *context, int pending)
{
	struct cam_periph *periph = context;

	cam_periph_lock(periph);
	periph->flags &= ~CAM_PERIPH_RUN_TASK;
	xpt_run_allocq(periph, 1);
	cam_periph_unlock(periph);
	cam_periph_release(periph);
}

/*
 * Hand CCBs to a peripheral for as long as it has a pending scheduled or
 * immediate request and is allowed another CCB.
 *
 * sleep: non-zero if the CCB allocation may block; when zero and the
 * non-blocking allocation fails, the work is deferred to
 * xpt_run_allocq_task via the XPT taskqueue.
 * The peripheral lock must be held (asserted).
 */
static void
xpt_run_allocq(struct cam_periph *periph, int sleep)
{
	struct cam_ed	*device;
	union ccb	*ccb;
	uint32_t	 prio;

	cam_periph_assert(periph, MA_OWNED);
	/* Guard against re-entry while an allocation pass is in progress. */
	if (periph->periph_allocating)
		return;
	periph->periph_allocating = 1;
	CAM_DEBUG_PRINT(CAM_DEBUG_XPT, ("xpt_run_allocq(%p)\n", periph));
	device = periph->path->device;
	ccb = NULL;
restart:
	while ((prio = min(periph->scheduled_priority,
	    periph->immediate_priority)) != CAM_PRIORITY_NONE &&
	    (periph->periph_allocated - (ccb != NULL ? 1 : 0) <
	     device->ccbq.total_openings || prio <= CAM_PRIORITY_OOB)) {

		if (ccb == NULL &&
		    (ccb = xpt_get_ccb_nowait(periph)) == NULL) {
			if (sleep) {
				/* Re-check the loop condition after blocking. */
				ccb = xpt_get_ccb(periph);
				goto restart;
			}
			if (periph->flags & CAM_PERIPH_RUN_TASK)
				break;
			/* Reference is dropped by xpt_run_allocq_task(). */
			cam_periph_doacquire(periph);
			periph->flags |= CAM_PERIPH_RUN_TASK;
			taskqueue_enqueue(xsoftc.xpt_taskq,
			    &periph->periph_run_task);
			break;
		}
		xpt_setup_ccb(&ccb->ccb_h, periph->path, prio);
		if (prio == periph->immediate_priority) {
			periph->immediate_priority = CAM_PRIORITY_NONE;
			CAM_DEBUG_PRINT(CAM_DEBUG_XPT,
					("waking cam_periph_getccb()\n"));
			SLIST_INSERT_HEAD(&periph->ccb_list, &ccb->ccb_h,
					  periph_links.sle);
			wakeup(&periph->ccb_list);
		} else {
			periph->scheduled_priority = CAM_PRIORITY_NONE;
			CAM_DEBUG_PRINT(CAM_DEBUG_XPT,
					("calling periph_start()\n"));
			periph->periph_start(periph, ccb);
		}
		ccb = NULL;
	}
	/* A CCB obtained by the blocking path may have gone unused. */
	if (ccb != NULL)
		xpt_release_ccb(ccb);
	periph->periph_allocating = 0;
}

/*
 * Dispatch queued CCBs to their SIMs while the send queue has entries
 * and openings.  devq->send_mtx is dropped around each sim_action call
 * and re-taken afterwards; the freeze count is bumped for the duration
 * of the function.
 */
static void
xpt_run_devq(struct cam_devq *devq)
{
	int lock;

	CAM_DEBUG_PRINT(CAM_DEBUG_XPT, ("xpt_run_devq\n"));

	/*
	 * With the count raised here, the "<= 1" test below fails for any
	 * invocation that starts while this one is still running.
	 */
	devq->send_queue.qfrozen_cnt++;
	while ((devq->send_queue.entries > 0)
	    && (devq->send_openings > 0)
	    && (devq->send_queue.qfrozen_cnt <= 1)) {
		struct	cam_ed *device;
		union ccb *work_ccb;
		struct	cam_sim *sim;
		struct xpt_proto *proto;

		device = (struct cam_ed *)camq_remove(&devq->send_queue,
							   CAMQ_HEAD);
		CAM_DEBUG_PRINT(CAM_DEBUG_XPT,
				("running device %p\n", device));

		work_ccb = cam_ccbq_peek_ccb(&device->ccbq, CAMQ_HEAD);
		if (work_ccb == NULL) {
			printf("device on run queue with no ccbs???\n");
			continue;
		}

		if ((work_ccb->ccb_h.flags & CAM_HIGH_POWER) != 0) {

			mtx_lock(&xsoftc.xpt_highpower_lock);
		 	if (xsoftc.num_highpower <= 0) {
				/*
				 * We got a high power command, but we
				 * don't have any available slots.  Freeze
				 * the device queue until we have a slot
				 * available.
				 */
				xpt_freeze_devq_device(device, 1);
				STAILQ_INSERT_TAIL(&xsoftc.highpowerq, device,
						   highpowerq_entry);

				mtx_unlock(&xsoftc.xpt_highpower_lock);
				continue;
			} else {
				/*
				 * Consume a high power slot while
				 * this ccb runs.
				 */
				xsoftc.num_highpower--;
			}
			mtx_unlock(&xsoftc.xpt_highpower_lock);
		}
		cam_ccbq_remove_ccb(&device->ccbq, work_ccb);
		cam_ccbq_send_ccb(&device->ccbq, work_ccb);
		devq->send_openings--;
		devq->send_active++;
		xpt_schedule_devq(devq, device);
		mtx_unlock(&devq->send_mtx);

		if ((work_ccb->ccb_h.flags & CAM_DEV_QFREEZE) != 0) {
			/*
			 * The client wants to freeze the queue
			 * after this CCB is sent.
			 */
			xpt_freeze_devq(work_ccb->ccb_h.path, 1);
		}

		/* In Target mode, the peripheral driver knows best... */
		if (work_ccb->ccb_h.func_code == XPT_SCSI_IO) {
			if ((device->inq_flags & SID_CmdQue) != 0
			 && work_ccb->csio.tag_action != CAM_TAG_ACTION_NONE)
				work_ccb->ccb_h.flags |= CAM_TAG_ACTION_VALID;
			else
				/*
				 * Clear this in case of a retried CCB that
				 * failed due to a rejected tag.
				 */
				work_ccb->ccb_h.flags &= ~CAM_TAG_ACTION_VALID;
		}

		KASSERT(device == work_ccb->ccb_h.path->device,
		    ("device (%p) / path->device (%p) mismatch",
			device, work_ccb->ccb_h.path->device));
		proto = xpt_proto_find(device->protocol);
		if (proto && proto->ops->debug_out)
			proto->ops->debug_out(work_ccb);

		/*
		 * Device queues can be shared among multiple SIM instances
		 * that reside on different busses.  Use the SIM from the
		 * queued device, rather than the one from the calling bus.
		 */
		sim = device->sim;
		/* Take the SIM lock only if the caller does not hold it. */
		lock = (mtx_owned(sim->mtx) == 0);
		if (lock)
			CAM_SIM_LOCK(sim);
		/* XXX uintptr_t is too small on 32-bit platforms. */
		work_ccb->ccb_h.qos.sim_data = sbinuptime();
		(*(sim->sim_action))(sim, work_ccb);
		if (lock)
			CAM_SIM_UNLOCK(sim);
		mtx_lock(&devq->send_mtx);
	}
	devq->send_queue.qfrozen_cnt--;
}

/*
 * This function merges stuff from the slave ccb into the master ccb, while
 * keeping important fields in the master ccb constant.
*/ void xpt_merge_ccb(union ccb *master_ccb, union ccb *slave_ccb) { /* * Pull fields that are valid for peripheral drivers to set * into the master CCB along with the CCB "payload". */ master_ccb->ccb_h.retry_count = slave_ccb->ccb_h.retry_count; master_ccb->ccb_h.func_code = slave_ccb->ccb_h.func_code; master_ccb->ccb_h.timeout = slave_ccb->ccb_h.timeout; master_ccb->ccb_h.flags = slave_ccb->ccb_h.flags; bcopy(&(&slave_ccb->ccb_h)[1], &(&master_ccb->ccb_h)[1], sizeof(union ccb) - sizeof(struct ccb_hdr)); } void xpt_setup_ccb_flags(struct ccb_hdr *ccb_h, struct cam_path *path, u_int32_t priority, u_int32_t flags) { CAM_DEBUG(path, CAM_DEBUG_TRACE, ("xpt_setup_ccb\n")); ccb_h->pinfo.priority = priority; ccb_h->path = path; ccb_h->path_id = path->bus->path_id; if (path->target) ccb_h->target_id = path->target->target_id; else ccb_h->target_id = CAM_TARGET_WILDCARD; if (path->device) { ccb_h->target_lun = path->device->lun_id; ccb_h->pinfo.generation = ++path->device->ccbq.queue.generation; } else { ccb_h->target_lun = CAM_TARGET_WILDCARD; } ccb_h->pinfo.index = CAM_UNQUEUED_INDEX; ccb_h->flags = flags; ccb_h->xflags = 0; } void xpt_setup_ccb(struct ccb_hdr *ccb_h, struct cam_path *path, u_int32_t priority) { xpt_setup_ccb_flags(ccb_h, path, priority, /*flags*/ 0); } /* Path manipulation functions */ cam_status xpt_create_path(struct cam_path **new_path_ptr, struct cam_periph *perph, path_id_t path_id, target_id_t target_id, lun_id_t lun_id) { struct cam_path *path; cam_status status; path = (struct cam_path *)malloc(sizeof(*path), M_CAMPATH, M_NOWAIT); if (path == NULL) { status = CAM_RESRC_UNAVAIL; return(status); } status = xpt_compile_path(path, perph, path_id, target_id, lun_id); if (status != CAM_REQ_CMP) { free(path, M_CAMPATH); path = NULL; } *new_path_ptr = path; return (status); } cam_status xpt_create_path_unlocked(struct cam_path **new_path_ptr, struct cam_periph *periph, path_id_t path_id, target_id_t target_id, lun_id_t lun_id) { return 
(xpt_create_path(new_path_ptr, periph, path_id, target_id, lun_id)); } cam_status xpt_compile_path(struct cam_path *new_path, struct cam_periph *perph, path_id_t path_id, target_id_t target_id, lun_id_t lun_id) { struct cam_eb *bus; struct cam_et *target; struct cam_ed *device; cam_status status; status = CAM_REQ_CMP; /* Completed without error */ target = NULL; /* Wildcarded */ device = NULL; /* Wildcarded */ /* * We will potentially modify the EDT, so block interrupts * that may attempt to create cam paths. */ bus = xpt_find_bus(path_id); if (bus == NULL) { status = CAM_PATH_INVALID; } else { xpt_lock_buses(); mtx_lock(&bus->eb_mtx); target = xpt_find_target(bus, target_id); if (target == NULL) { /* Create one */ struct cam_et *new_target; new_target = xpt_alloc_target(bus, target_id); if (new_target == NULL) { status = CAM_RESRC_UNAVAIL; } else { target = new_target; } } xpt_unlock_buses(); if (target != NULL) { device = xpt_find_device(target, lun_id); if (device == NULL) { /* Create one */ struct cam_ed *new_device; new_device = (*(bus->xport->ops->alloc_device))(bus, target, lun_id); if (new_device == NULL) { status = CAM_RESRC_UNAVAIL; } else { device = new_device; } } } mtx_unlock(&bus->eb_mtx); } /* * Only touch the user's data if we are successful. 
*/ if (status == CAM_REQ_CMP) { new_path->periph = perph; new_path->bus = bus; new_path->target = target; new_path->device = device; CAM_DEBUG(new_path, CAM_DEBUG_TRACE, ("xpt_compile_path\n")); } else { if (device != NULL) xpt_release_device(device); if (target != NULL) xpt_release_target(target); if (bus != NULL) xpt_release_bus(bus); } return (status); } cam_status xpt_clone_path(struct cam_path **new_path_ptr, struct cam_path *path) { struct cam_path *new_path; new_path = (struct cam_path *)malloc(sizeof(*path), M_CAMPATH, M_NOWAIT); if (new_path == NULL) return(CAM_RESRC_UNAVAIL); xpt_copy_path(new_path, path); *new_path_ptr = new_path; return (CAM_REQ_CMP); } void xpt_copy_path(struct cam_path *new_path, struct cam_path *path) { *new_path = *path; if (path->bus != NULL) xpt_acquire_bus(path->bus); if (path->target != NULL) xpt_acquire_target(path->target); if (path->device != NULL) xpt_acquire_device(path->device); } void xpt_release_path(struct cam_path *path) { CAM_DEBUG(path, CAM_DEBUG_TRACE, ("xpt_release_path\n")); if (path->device != NULL) { xpt_release_device(path->device); path->device = NULL; } if (path->target != NULL) { xpt_release_target(path->target); path->target = NULL; } if (path->bus != NULL) { xpt_release_bus(path->bus); path->bus = NULL; } } void xpt_free_path(struct cam_path *path) { CAM_DEBUG(path, CAM_DEBUG_TRACE, ("xpt_free_path\n")); xpt_release_path(path); free(path, M_CAMPATH); } void xpt_path_counts(struct cam_path *path, uint32_t *bus_ref, uint32_t *periph_ref, uint32_t *target_ref, uint32_t *device_ref) { xpt_lock_buses(); if (bus_ref) { if (path->bus) *bus_ref = path->bus->refcount; else *bus_ref = 0; } if (periph_ref) { if (path->periph) *periph_ref = path->periph->refcount; else *periph_ref = 0; } xpt_unlock_buses(); if (target_ref) { if (path->target) *target_ref = path->target->refcount; else *target_ref = 0; } if (device_ref) { if (path->device) *device_ref = path->device->refcount; else *device_ref = 0; } } /* * Return -1 
for failure, 0 for exact match, 1 for match with wildcards * in path1, 2 for match with wildcards in path2. */ int xpt_path_comp(struct cam_path *path1, struct cam_path *path2) { int retval = 0; if (path1->bus != path2->bus) { if (path1->bus->path_id == CAM_BUS_WILDCARD) retval = 1; else if (path2->bus->path_id == CAM_BUS_WILDCARD) retval = 2; else return (-1); } if (path1->target != path2->target) { if (path1->target->target_id == CAM_TARGET_WILDCARD) { if (retval == 0) retval = 1; } else if (path2->target->target_id == CAM_TARGET_WILDCARD) retval = 2; else return (-1); } if (path1->device != path2->device) { if (path1->device->lun_id == CAM_LUN_WILDCARD) { if (retval == 0) retval = 1; } else if (path2->device->lun_id == CAM_LUN_WILDCARD) retval = 2; else return (-1); } return (retval); } int xpt_path_comp_dev(struct cam_path *path, struct cam_ed *dev) { int retval = 0; if (path->bus != dev->target->bus) { if (path->bus->path_id == CAM_BUS_WILDCARD) retval = 1; else if (dev->target->bus->path_id == CAM_BUS_WILDCARD) retval = 2; else return (-1); } if (path->target != dev->target) { if (path->target->target_id == CAM_TARGET_WILDCARD) { if (retval == 0) retval = 1; } else if (dev->target->target_id == CAM_TARGET_WILDCARD) retval = 2; else return (-1); } if (path->device != dev) { if (path->device->lun_id == CAM_LUN_WILDCARD) { if (retval == 0) retval = 1; } else if (dev->lun_id == CAM_LUN_WILDCARD) retval = 2; else return (-1); } return (retval); } void xpt_print_path(struct cam_path *path) { if (path == NULL) printf("(nopath): "); else { if (path->periph != NULL) printf("(%s%d:", path->periph->periph_name, path->periph->unit_number); else printf("(noperiph:"); if (path->bus != NULL) printf("%s%d:%d:", path->bus->sim->sim_name, path->bus->sim->unit_number, path->bus->sim->bus_id); else printf("nobus:"); if (path->target != NULL) printf("%d:", path->target->target_id); else printf("X:"); if (path->device != NULL) printf("%jx): ", (uintmax_t)path->device->lun_id); 
else printf("X): "); } } void xpt_print_device(struct cam_ed *device) { if (device == NULL) printf("(nopath): "); else { printf("(noperiph:%s%d:%d:%d:%jx): ", device->sim->sim_name, device->sim->unit_number, device->sim->bus_id, device->target->target_id, (uintmax_t)device->lun_id); } } void xpt_print(struct cam_path *path, const char *fmt, ...) { va_list ap; xpt_print_path(path); va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); } int xpt_path_string(struct cam_path *path, char *str, size_t str_len) { struct sbuf sb; sbuf_new(&sb, str, str_len, 0); if (path == NULL) sbuf_printf(&sb, "(nopath): "); else { if (path->periph != NULL) sbuf_printf(&sb, "(%s%d:", path->periph->periph_name, path->periph->unit_number); else sbuf_printf(&sb, "(noperiph:"); if (path->bus != NULL) sbuf_printf(&sb, "%s%d:%d:", path->bus->sim->sim_name, path->bus->sim->unit_number, path->bus->sim->bus_id); else sbuf_printf(&sb, "nobus:"); if (path->target != NULL) sbuf_printf(&sb, "%d:", path->target->target_id); else sbuf_printf(&sb, "X:"); if (path->device != NULL) sbuf_printf(&sb, "%jx): ", (uintmax_t)path->device->lun_id); else sbuf_printf(&sb, "X): "); } sbuf_finish(&sb); return(sbuf_len(&sb)); } path_id_t xpt_path_path_id(struct cam_path *path) { return(path->bus->path_id); } target_id_t xpt_path_target_id(struct cam_path *path) { if (path->target != NULL) return (path->target->target_id); else return (CAM_TARGET_WILDCARD); } lun_id_t xpt_path_lun_id(struct cam_path *path) { if (path->device != NULL) return (path->device->lun_id); else return (CAM_LUN_WILDCARD); } struct cam_sim * xpt_path_sim(struct cam_path *path) { return (path->bus->sim); } struct cam_periph* xpt_path_periph(struct cam_path *path) { return (path->periph); } /* * Release a CAM control block for the caller. Remit the cost of the structure * to the device referenced by the path. If the this device had no 'credits' * and peripheral drivers have registered async callbacks for this notification * call them now. 
*/ void xpt_release_ccb(union ccb *free_ccb) { struct cam_ed *device; struct cam_periph *periph; CAM_DEBUG_PRINT(CAM_DEBUG_XPT, ("xpt_release_ccb\n")); xpt_path_assert(free_ccb->ccb_h.path, MA_OWNED); device = free_ccb->ccb_h.path->device; periph = free_ccb->ccb_h.path->periph; xpt_free_ccb(free_ccb); periph->periph_allocated--; cam_ccbq_release_opening(&device->ccbq); xpt_run_allocq(periph, 0); } /* Functions accessed by SIM drivers */ static struct xpt_xport_ops xport_default_ops = { .alloc_device = xpt_alloc_device_default, .action = xpt_action_default, .async = xpt_dev_async_default, }; static struct xpt_xport xport_default = { .xport = XPORT_UNKNOWN, .name = "unknown", .ops = &xport_default_ops, }; CAM_XPT_XPORT(xport_default); /* * A sim structure, listing the SIM entry points and instance * identification info is passed to xpt_bus_register to hook the SIM * into the CAM framework. xpt_bus_register creates a cam_eb entry * for this new bus and places it in the array of busses and assigns * it a path_id. The path_id may be influenced by "hard wiring" * information specified by the user. Once interrupt services are * available, the bus will be probed. 
*/ int32_t xpt_bus_register(struct cam_sim *sim, device_t parent, u_int32_t bus) { struct cam_eb *new_bus; struct cam_eb *old_bus; struct ccb_pathinq cpi; struct cam_path *path; cam_status status; mtx_assert(sim->mtx, MA_OWNED); sim->bus_id = bus; new_bus = (struct cam_eb *)malloc(sizeof(*new_bus), M_CAMXPT, M_NOWAIT|M_ZERO); if (new_bus == NULL) { /* Couldn't satisfy request */ return (CAM_RESRC_UNAVAIL); } mtx_init(&new_bus->eb_mtx, "CAM bus lock", NULL, MTX_DEF); TAILQ_INIT(&new_bus->et_entries); cam_sim_hold(sim); new_bus->sim = sim; timevalclear(&new_bus->last_reset); new_bus->flags = 0; new_bus->refcount = 1; /* Held until a bus_deregister event */ new_bus->generation = 0; xpt_lock_buses(); sim->path_id = new_bus->path_id = xptpathid(sim->sim_name, sim->unit_number, sim->bus_id); old_bus = TAILQ_FIRST(&xsoftc.xpt_busses); while (old_bus != NULL && old_bus->path_id < new_bus->path_id) old_bus = TAILQ_NEXT(old_bus, links); if (old_bus != NULL) TAILQ_INSERT_BEFORE(old_bus, new_bus, links); else TAILQ_INSERT_TAIL(&xsoftc.xpt_busses, new_bus, links); xsoftc.bus_generation++; xpt_unlock_buses(); /* * Set a default transport so that a PATH_INQ can be issued to * the SIM. This will then allow for probing and attaching of * a more appropriate transport. 
*/ new_bus->xport = &xport_default; status = xpt_create_path(&path, /*periph*/NULL, sim->path_id, CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD); if (status != CAM_REQ_CMP) { xpt_release_bus(new_bus); free(path, M_CAMXPT); return (CAM_RESRC_UNAVAIL); } xpt_setup_ccb(&cpi.ccb_h, path, CAM_PRIORITY_NORMAL); cpi.ccb_h.func_code = XPT_PATH_INQ; xpt_action((union ccb *)&cpi); if (cpi.ccb_h.status == CAM_REQ_CMP) { struct xpt_xport **xpt; SET_FOREACH(xpt, cam_xpt_xport_set) { if ((*xpt)->xport == cpi.transport) { new_bus->xport = *xpt; break; } } if (new_bus->xport == NULL) { xpt_print_path(path); printf("No transport found for %d\n", cpi.transport); xpt_release_bus(new_bus); free(path, M_CAMXPT); return (CAM_RESRC_UNAVAIL); } } /* Notify interested parties */ if (sim->path_id != CAM_XPT_PATH_ID) { xpt_async(AC_PATH_REGISTERED, path, &cpi); if ((cpi.hba_misc & PIM_NOSCAN) == 0) { union ccb *scan_ccb; /* Initiate bus rescan. */ scan_ccb = xpt_alloc_ccb_nowait(); if (scan_ccb != NULL) { scan_ccb->ccb_h.path = path; scan_ccb->ccb_h.func_code = XPT_SCAN_BUS; scan_ccb->crcn.flags = 0; xpt_rescan(scan_ccb); } else { xpt_print(path, "Can't allocate CCB to scan bus\n"); xpt_free_path(path); } } else xpt_free_path(path); } else xpt_free_path(path); return (CAM_SUCCESS); } int32_t xpt_bus_deregister(path_id_t pathid) { struct cam_path bus_path; cam_status status; status = xpt_compile_path(&bus_path, NULL, pathid, CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD); if (status != CAM_REQ_CMP) return (status); xpt_async(AC_LOST_DEVICE, &bus_path, NULL); xpt_async(AC_PATH_DEREGISTERED, &bus_path, NULL); /* Release the reference count held while registered. 
*/ xpt_release_bus(bus_path.bus); xpt_release_path(&bus_path); return (CAM_REQ_CMP); } static path_id_t xptnextfreepathid(void) { struct cam_eb *bus; path_id_t pathid; const char *strval; mtx_assert(&xsoftc.xpt_topo_lock, MA_OWNED); pathid = 0; bus = TAILQ_FIRST(&xsoftc.xpt_busses); retry: /* Find an unoccupied pathid */ while (bus != NULL && bus->path_id <= pathid) { if (bus->path_id == pathid) pathid++; bus = TAILQ_NEXT(bus, links); } /* * Ensure that this pathid is not reserved for * a bus that may be registered in the future. */ if (resource_string_value("scbus", pathid, "at", &strval) == 0) { ++pathid; /* Start the search over */ goto retry; } return (pathid); } static path_id_t xptpathid(const char *sim_name, int sim_unit, int sim_bus) { path_id_t pathid; int i, dunit, val; char buf[32]; const char *dname; pathid = CAM_XPT_PATH_ID; snprintf(buf, sizeof(buf), "%s%d", sim_name, sim_unit); if (strcmp(buf, "xpt0") == 0 && sim_bus == 0) return (pathid); i = 0; while ((resource_find_match(&i, &dname, &dunit, "at", buf)) == 0) { if (strcmp(dname, "scbus")) { /* Avoid a bit of foot shooting. */ continue; } if (dunit < 0) /* unwired?! */ continue; if (resource_int_value("scbus", dunit, "bus", &val) == 0) { if (sim_bus == val) { pathid = dunit; break; } } else if (sim_bus == 0) { /* Unspecified matches bus 0 */ pathid = dunit; break; } else { printf("Ambiguous scbus configuration for %s%d " "bus %d, cannot wire down. 
The kernel " "config entry for scbus%d should " "specify a controller bus.\n" "Scbus will be assigned dynamically.\n", sim_name, sim_unit, sim_bus, dunit); break; } } if (pathid == CAM_XPT_PATH_ID) pathid = xptnextfreepathid(); return (pathid); } static const char * xpt_async_string(u_int32_t async_code) { switch (async_code) { case AC_BUS_RESET: return ("AC_BUS_RESET"); case AC_UNSOL_RESEL: return ("AC_UNSOL_RESEL"); case AC_SCSI_AEN: return ("AC_SCSI_AEN"); case AC_SENT_BDR: return ("AC_SENT_BDR"); case AC_PATH_REGISTERED: return ("AC_PATH_REGISTERED"); case AC_PATH_DEREGISTERED: return ("AC_PATH_DEREGISTERED"); case AC_FOUND_DEVICE: return ("AC_FOUND_DEVICE"); case AC_LOST_DEVICE: return ("AC_LOST_DEVICE"); case AC_TRANSFER_NEG: return ("AC_TRANSFER_NEG"); case AC_INQ_CHANGED: return ("AC_INQ_CHANGED"); case AC_GETDEV_CHANGED: return ("AC_GETDEV_CHANGED"); case AC_CONTRACT: return ("AC_CONTRACT"); case AC_ADVINFO_CHANGED: return ("AC_ADVINFO_CHANGED"); case AC_UNIT_ATTENTION: return ("AC_UNIT_ATTENTION"); } return ("AC_UNKNOWN"); } static int xpt_async_size(u_int32_t async_code) { switch (async_code) { case AC_BUS_RESET: return (0); case AC_UNSOL_RESEL: return (0); case AC_SCSI_AEN: return (0); case AC_SENT_BDR: return (0); case AC_PATH_REGISTERED: return (sizeof(struct ccb_pathinq)); case AC_PATH_DEREGISTERED: return (0); case AC_FOUND_DEVICE: return (sizeof(struct ccb_getdev)); case AC_LOST_DEVICE: return (0); case AC_TRANSFER_NEG: return (sizeof(struct ccb_trans_settings)); case AC_INQ_CHANGED: return (0); case AC_GETDEV_CHANGED: return (0); case AC_CONTRACT: return (sizeof(struct ac_contract)); case AC_ADVINFO_CHANGED: return (-1); case AC_UNIT_ATTENTION: return (sizeof(struct ccb_scsiio)); } return (0); } static int xpt_async_process_dev(struct cam_ed *device, void *arg) { union ccb *ccb = arg; struct cam_path *path = ccb->ccb_h.path; void *async_arg = ccb->casync.async_arg_ptr; u_int32_t async_code = ccb->casync.async_code; int relock; if (path->device != 
device && path->device->lun_id != CAM_LUN_WILDCARD && device->lun_id != CAM_LUN_WILDCARD) return (1); /* * The async callback could free the device. * If it is a broadcast async, it doesn't hold * device reference, so take our own reference. */ xpt_acquire_device(device); /* * If async for specific device is to be delivered to * the wildcard client, take the specific device lock. * XXX: We may need a way for client to specify it. */ if ((device->lun_id == CAM_LUN_WILDCARD && path->device->lun_id != CAM_LUN_WILDCARD) || (device->target->target_id == CAM_TARGET_WILDCARD && path->target->target_id != CAM_TARGET_WILDCARD) || (device->target->bus->path_id == CAM_BUS_WILDCARD && path->target->bus->path_id != CAM_BUS_WILDCARD)) { mtx_unlock(&device->device_mtx); xpt_path_lock(path); relock = 1; } else relock = 0; (*(device->target->bus->xport->ops->async))(async_code, device->target->bus, device->target, device, async_arg); xpt_async_bcast(&device->asyncs, async_code, path, async_arg); if (relock) { xpt_path_unlock(path); mtx_lock(&device->device_mtx); } xpt_release_device(device); return (1); } static int xpt_async_process_tgt(struct cam_et *target, void *arg) { union ccb *ccb = arg; struct cam_path *path = ccb->ccb_h.path; if (path->target != target && path->target->target_id != CAM_TARGET_WILDCARD && target->target_id != CAM_TARGET_WILDCARD) return (1); if (ccb->casync.async_code == AC_SENT_BDR) { /* Update our notion of when the last reset occurred */ microtime(&target->last_reset); } return (xptdevicetraverse(target, NULL, xpt_async_process_dev, ccb)); } static void xpt_async_process(struct cam_periph *periph, union ccb *ccb) { struct cam_eb *bus; struct cam_path *path; void *async_arg; u_int32_t async_code; path = ccb->ccb_h.path; async_code = ccb->casync.async_code; async_arg = ccb->casync.async_arg_ptr; CAM_DEBUG(path, CAM_DEBUG_TRACE | CAM_DEBUG_INFO, ("xpt_async(%s)\n", xpt_async_string(async_code))); bus = path->bus; if (async_code == AC_BUS_RESET) { /* Update 
our notion of when the last reset occurred */ microtime(&bus->last_reset); } xpttargettraverse(bus, NULL, xpt_async_process_tgt, ccb); /* * If this wasn't a fully wildcarded async, tell all * clients that want all async events. */ if (bus != xpt_periph->path->bus) { xpt_path_lock(xpt_periph->path); xpt_async_process_dev(xpt_periph->path->device, ccb); xpt_path_unlock(xpt_periph->path); } if (path->device != NULL && path->device->lun_id != CAM_LUN_WILDCARD) xpt_release_devq(path, 1, TRUE); else xpt_release_simq(path->bus->sim, TRUE); if (ccb->casync.async_arg_size > 0) free(async_arg, M_CAMXPT); xpt_free_path(path); xpt_free_ccb(ccb); } static void xpt_async_bcast(struct async_list *async_head, u_int32_t async_code, struct cam_path *path, void *async_arg) { struct async_node *cur_entry; int lock; cur_entry = SLIST_FIRST(async_head); while (cur_entry != NULL) { struct async_node *next_entry; /* * Grab the next list entry before we call the current * entry's callback. This is because the callback function * can delete its async callback entry. 
*/ next_entry = SLIST_NEXT(cur_entry, links); if ((cur_entry->event_enable & async_code) != 0) { lock = cur_entry->event_lock; if (lock) CAM_SIM_LOCK(path->device->sim); cur_entry->callback(cur_entry->callback_arg, async_code, path, async_arg); if (lock) CAM_SIM_UNLOCK(path->device->sim); } cur_entry = next_entry; } } void xpt_async(u_int32_t async_code, struct cam_path *path, void *async_arg) { union ccb *ccb; int size; ccb = xpt_alloc_ccb_nowait(); if (ccb == NULL) { xpt_print(path, "Can't allocate CCB to send %s\n", xpt_async_string(async_code)); return; } if (xpt_clone_path(&ccb->ccb_h.path, path) != CAM_REQ_CMP) { xpt_print(path, "Can't allocate path to send %s\n", xpt_async_string(async_code)); xpt_free_ccb(ccb); return; } ccb->ccb_h.path->periph = NULL; ccb->ccb_h.func_code = XPT_ASYNC; ccb->ccb_h.cbfcnp = xpt_async_process; ccb->ccb_h.flags |= CAM_UNLOCKED; ccb->casync.async_code = async_code; ccb->casync.async_arg_size = 0; size = xpt_async_size(async_code); CAM_DEBUG(ccb->ccb_h.path, CAM_DEBUG_TRACE, ("xpt_async: func %#x %s aync_code %d %s\n", ccb->ccb_h.func_code, xpt_action_name(ccb->ccb_h.func_code), async_code, xpt_async_string(async_code))); if (size > 0 && async_arg != NULL) { ccb->casync.async_arg_ptr = malloc(size, M_CAMXPT, M_NOWAIT); if (ccb->casync.async_arg_ptr == NULL) { xpt_print(path, "Can't allocate argument to send %s\n", xpt_async_string(async_code)); xpt_free_path(ccb->ccb_h.path); xpt_free_ccb(ccb); return; } memcpy(ccb->casync.async_arg_ptr, async_arg, size); ccb->casync.async_arg_size = size; } else if (size < 0) { ccb->casync.async_arg_ptr = async_arg; ccb->casync.async_arg_size = size; } if (path->device != NULL && path->device->lun_id != CAM_LUN_WILDCARD) xpt_freeze_devq(path, 1); else xpt_freeze_simq(path->bus->sim, 1); xpt_done(ccb); } static void xpt_dev_async_default(u_int32_t async_code, struct cam_eb *bus, struct cam_et *target, struct cam_ed *device, void *async_arg) { /* * We only need to handle events for real devices. 
*/ if (target->target_id == CAM_TARGET_WILDCARD || device->lun_id == CAM_LUN_WILDCARD) return; printf("%s called\n", __func__); } static uint32_t xpt_freeze_devq_device(struct cam_ed *dev, u_int count) { struct cam_devq *devq; uint32_t freeze; devq = dev->sim->devq; mtx_assert(&devq->send_mtx, MA_OWNED); CAM_DEBUG_DEV(dev, CAM_DEBUG_TRACE, ("xpt_freeze_devq_device(%d) %u->%u\n", count, dev->ccbq.queue.qfrozen_cnt, dev->ccbq.queue.qfrozen_cnt + count)); freeze = (dev->ccbq.queue.qfrozen_cnt += count); /* Remove frozen device from sendq. */ if (device_is_queued(dev)) camq_remove(&devq->send_queue, dev->devq_entry.index); return (freeze); } u_int32_t xpt_freeze_devq(struct cam_path *path, u_int count) { struct cam_ed *dev = path->device; struct cam_devq *devq; uint32_t freeze; devq = dev->sim->devq; mtx_lock(&devq->send_mtx); CAM_DEBUG(path, CAM_DEBUG_TRACE, ("xpt_freeze_devq(%d)\n", count)); freeze = xpt_freeze_devq_device(dev, count); mtx_unlock(&devq->send_mtx); return (freeze); } u_int32_t xpt_freeze_simq(struct cam_sim *sim, u_int count) { struct cam_devq *devq; uint32_t freeze; devq = sim->devq; mtx_lock(&devq->send_mtx); freeze = (devq->send_queue.qfrozen_cnt += count); mtx_unlock(&devq->send_mtx); return (freeze); } static void xpt_release_devq_timeout(void *arg) { struct cam_ed *dev; struct cam_devq *devq; dev = (struct cam_ed *)arg; CAM_DEBUG_DEV(dev, CAM_DEBUG_TRACE, ("xpt_release_devq_timeout\n")); devq = dev->sim->devq; mtx_assert(&devq->send_mtx, MA_OWNED); if (xpt_release_devq_device(dev, /*count*/1, /*run_queue*/TRUE)) xpt_run_devq(devq); } void xpt_release_devq(struct cam_path *path, u_int count, int run_queue) { struct cam_ed *dev; struct cam_devq *devq; CAM_DEBUG(path, CAM_DEBUG_TRACE, ("xpt_release_devq(%d, %d)\n", count, run_queue)); dev = path->device; devq = dev->sim->devq; mtx_lock(&devq->send_mtx); if (xpt_release_devq_device(dev, count, run_queue)) xpt_run_devq(dev->sim->devq); mtx_unlock(&devq->send_mtx); } static int 
xpt_release_devq_device(struct cam_ed *dev, u_int count, int run_queue) { mtx_assert(&dev->sim->devq->send_mtx, MA_OWNED); CAM_DEBUG_DEV(dev, CAM_DEBUG_TRACE, ("xpt_release_devq_device(%d, %d) %u->%u\n", count, run_queue, dev->ccbq.queue.qfrozen_cnt, dev->ccbq.queue.qfrozen_cnt - count)); if (count > dev->ccbq.queue.qfrozen_cnt) { #ifdef INVARIANTS printf("xpt_release_devq(): requested %u > present %u\n", count, dev->ccbq.queue.qfrozen_cnt); #endif count = dev->ccbq.queue.qfrozen_cnt; } dev->ccbq.queue.qfrozen_cnt -= count; if (dev->ccbq.queue.qfrozen_cnt == 0) { /* * No longer need to wait for a successful * command completion. */ dev->flags &= ~CAM_DEV_REL_ON_COMPLETE; /* * Remove any timeouts that might be scheduled * to release this queue. */ if ((dev->flags & CAM_DEV_REL_TIMEOUT_PENDING) != 0) { callout_stop(&dev->callout); dev->flags &= ~CAM_DEV_REL_TIMEOUT_PENDING; } /* * Now that we are unfrozen schedule the * device so any pending transactions are * run. */ xpt_schedule_devq(dev->sim->devq, dev); } else run_queue = 0; return (run_queue); } void xpt_release_simq(struct cam_sim *sim, int run_queue) { struct cam_devq *devq; devq = sim->devq; mtx_lock(&devq->send_mtx); if (devq->send_queue.qfrozen_cnt <= 0) { #ifdef INVARIANTS printf("xpt_release_simq: requested 1 > present %u\n", devq->send_queue.qfrozen_cnt); #endif } else devq->send_queue.qfrozen_cnt--; if (devq->send_queue.qfrozen_cnt == 0) { /* * If there is a timeout scheduled to release this * sim queue, remove it. The queue frozen count is * already at 0. */ if ((sim->flags & CAM_SIM_REL_TIMEOUT_PENDING) != 0){ callout_stop(&sim->callout); sim->flags &= ~CAM_SIM_REL_TIMEOUT_PENDING; } if (run_queue) { /* * Now that we are unfrozen run the send queue. */ xpt_run_devq(sim->devq); } } mtx_unlock(&devq->send_mtx); } /* * XXX Appears to be unused. 
*/ static void xpt_release_simq_timeout(void *arg) { struct cam_sim *sim; sim = (struct cam_sim *)arg; xpt_release_simq(sim, /* run_queue */ TRUE); } void xpt_done(union ccb *done_ccb) { struct cam_doneq *queue; int run, hash; +#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) + if (done_ccb->ccb_h.func_code == XPT_SCSI_IO && + done_ccb->csio.bio != NULL) + biotrack(done_ccb->csio.bio, __func__); +#endif + CAM_DEBUG(done_ccb->ccb_h.path, CAM_DEBUG_TRACE, ("xpt_done: func= %#x %s status %#x\n", done_ccb->ccb_h.func_code, xpt_action_name(done_ccb->ccb_h.func_code), done_ccb->ccb_h.status)); if ((done_ccb->ccb_h.func_code & XPT_FC_QUEUED) == 0) return; /* Store the time the ccb was in the sim */ done_ccb->ccb_h.qos.sim_data = sbinuptime() - done_ccb->ccb_h.qos.sim_data; hash = (done_ccb->ccb_h.path_id + done_ccb->ccb_h.target_id + done_ccb->ccb_h.target_lun) % cam_num_doneqs; queue = &cam_doneqs[hash]; mtx_lock(&queue->cam_doneq_mtx); run = (queue->cam_doneq_sleep && STAILQ_EMPTY(&queue->cam_doneq)); STAILQ_INSERT_TAIL(&queue->cam_doneq, &done_ccb->ccb_h, sim_links.stqe); done_ccb->ccb_h.pinfo.index = CAM_DONEQ_INDEX; mtx_unlock(&queue->cam_doneq_mtx); if (run) wakeup(&queue->cam_doneq); } void xpt_done_direct(union ccb *done_ccb) { CAM_DEBUG(done_ccb->ccb_h.path, CAM_DEBUG_TRACE, ("xpt_done_direct: status %#x\n", done_ccb->ccb_h.status)); if ((done_ccb->ccb_h.func_code & XPT_FC_QUEUED) == 0) return; /* Store the time the ccb was in the sim */ done_ccb->ccb_h.qos.sim_data = sbinuptime() - done_ccb->ccb_h.qos.sim_data; xpt_done_process(&done_ccb->ccb_h); } union ccb * xpt_alloc_ccb() { union ccb *new_ccb; new_ccb = malloc(sizeof(*new_ccb), M_CAMCCB, M_ZERO|M_WAITOK); return (new_ccb); } union ccb * xpt_alloc_ccb_nowait() { union ccb *new_ccb; new_ccb = malloc(sizeof(*new_ccb), M_CAMCCB, M_ZERO|M_NOWAIT); return (new_ccb); } void xpt_free_ccb(union ccb *free_ccb) { free(free_ccb, M_CAMCCB); } /* Private XPT functions */ /* * Get a CAM control block for the caller. 
Charge the structure to the device * referenced by the path. If we don't have sufficient resources to allocate * more ccbs, we return NULL. */ static union ccb * xpt_get_ccb_nowait(struct cam_periph *periph) { union ccb *new_ccb; new_ccb = malloc(sizeof(*new_ccb), M_CAMCCB, M_ZERO|M_NOWAIT); if (new_ccb == NULL) return (NULL); periph->periph_allocated++; cam_ccbq_take_opening(&periph->path->device->ccbq); return (new_ccb); } static union ccb * xpt_get_ccb(struct cam_periph *periph) { union ccb *new_ccb; cam_periph_unlock(periph); new_ccb = malloc(sizeof(*new_ccb), M_CAMCCB, M_ZERO|M_WAITOK); cam_periph_lock(periph); periph->periph_allocated++; cam_ccbq_take_opening(&periph->path->device->ccbq); return (new_ccb); } union ccb * cam_periph_getccb(struct cam_periph *periph, u_int32_t priority) { struct ccb_hdr *ccb_h; CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("cam_periph_getccb\n")); cam_periph_assert(periph, MA_OWNED); while ((ccb_h = SLIST_FIRST(&periph->ccb_list)) == NULL || ccb_h->pinfo.priority != priority) { if (priority < periph->immediate_priority) { periph->immediate_priority = priority; xpt_run_allocq(periph, 0); } else cam_periph_sleep(periph, &periph->ccb_list, PRIBIO, "cgticb", 0); } SLIST_REMOVE_HEAD(&periph->ccb_list, periph_links.sle); return ((union ccb *)ccb_h); } static void xpt_acquire_bus(struct cam_eb *bus) { xpt_lock_buses(); bus->refcount++; xpt_unlock_buses(); } static void xpt_release_bus(struct cam_eb *bus) { xpt_lock_buses(); KASSERT(bus->refcount >= 1, ("bus->refcount >= 1")); if (--bus->refcount > 0) { xpt_unlock_buses(); return; } TAILQ_REMOVE(&xsoftc.xpt_busses, bus, links); xsoftc.bus_generation++; xpt_unlock_buses(); KASSERT(TAILQ_EMPTY(&bus->et_entries), ("destroying bus, but target list is not empty")); cam_sim_release(bus->sim); mtx_destroy(&bus->eb_mtx); free(bus, M_CAMXPT); } static struct cam_et * xpt_alloc_target(struct cam_eb *bus, target_id_t target_id) { struct cam_et *cur_target, *target; mtx_assert(&xsoftc.xpt_topo_lock, 
MA_OWNED); mtx_assert(&bus->eb_mtx, MA_OWNED); target = (struct cam_et *)malloc(sizeof(*target), M_CAMXPT, M_NOWAIT|M_ZERO); if (target == NULL) return (NULL); TAILQ_INIT(&target->ed_entries); target->bus = bus; target->target_id = target_id; target->refcount = 1; target->generation = 0; target->luns = NULL; mtx_init(&target->luns_mtx, "CAM LUNs lock", NULL, MTX_DEF); timevalclear(&target->last_reset); /* * Hold a reference to our parent bus so it * will not go away before we do. */ bus->refcount++; /* Insertion sort into our bus's target list */ cur_target = TAILQ_FIRST(&bus->et_entries); while (cur_target != NULL && cur_target->target_id < target_id) cur_target = TAILQ_NEXT(cur_target, links); if (cur_target != NULL) { TAILQ_INSERT_BEFORE(cur_target, target, links); } else { TAILQ_INSERT_TAIL(&bus->et_entries, target, links); } bus->generation++; return (target); } static void xpt_acquire_target(struct cam_et *target) { struct cam_eb *bus = target->bus; mtx_lock(&bus->eb_mtx); target->refcount++; mtx_unlock(&bus->eb_mtx); } static void xpt_release_target(struct cam_et *target) { struct cam_eb *bus = target->bus; mtx_lock(&bus->eb_mtx); if (--target->refcount > 0) { mtx_unlock(&bus->eb_mtx); return; } TAILQ_REMOVE(&bus->et_entries, target, links); bus->generation++; mtx_unlock(&bus->eb_mtx); KASSERT(TAILQ_EMPTY(&target->ed_entries), ("destroying target, but device list is not empty")); xpt_release_bus(bus); mtx_destroy(&target->luns_mtx); if (target->luns) free(target->luns, M_CAMXPT); free(target, M_CAMXPT); } static struct cam_ed * xpt_alloc_device_default(struct cam_eb *bus, struct cam_et *target, lun_id_t lun_id) { struct cam_ed *device; device = xpt_alloc_device(bus, target, lun_id); if (device == NULL) return (NULL); device->mintags = 1; device->maxtags = 1; return (device); } static void xpt_destroy_device(void *context, int pending) { struct cam_ed *device = context; mtx_lock(&device->device_mtx); mtx_destroy(&device->device_mtx); free(device, M_CAMDEV); } 
struct cam_ed * xpt_alloc_device(struct cam_eb *bus, struct cam_et *target, lun_id_t lun_id) { struct cam_ed *cur_device, *device; struct cam_devq *devq; cam_status status; mtx_assert(&bus->eb_mtx, MA_OWNED); /* Make space for us in the device queue on our bus */ devq = bus->sim->devq; mtx_lock(&devq->send_mtx); status = cam_devq_resize(devq, devq->send_queue.array_size + 1); mtx_unlock(&devq->send_mtx); if (status != CAM_REQ_CMP) return (NULL); device = (struct cam_ed *)malloc(sizeof(*device), M_CAMDEV, M_NOWAIT|M_ZERO); if (device == NULL) return (NULL); cam_init_pinfo(&device->devq_entry); device->target = target; device->lun_id = lun_id; device->sim = bus->sim; if (cam_ccbq_init(&device->ccbq, bus->sim->max_dev_openings) != 0) { free(device, M_CAMDEV); return (NULL); } SLIST_INIT(&device->asyncs); SLIST_INIT(&device->periphs); device->generation = 0; device->flags = CAM_DEV_UNCONFIGURED; device->tag_delay_count = 0; device->tag_saved_openings = 0; device->refcount = 1; mtx_init(&device->device_mtx, "CAM device lock", NULL, MTX_DEF); callout_init_mtx(&device->callout, &devq->send_mtx, 0); TASK_INIT(&device->device_destroy_task, 0, xpt_destroy_device, device); /* * Hold a reference to our parent bus so it * will not go away before we do. 
*/ target->refcount++; cur_device = TAILQ_FIRST(&target->ed_entries); while (cur_device != NULL && cur_device->lun_id < lun_id) cur_device = TAILQ_NEXT(cur_device, links); if (cur_device != NULL) TAILQ_INSERT_BEFORE(cur_device, device, links); else TAILQ_INSERT_TAIL(&target->ed_entries, device, links); target->generation++; return (device); } void xpt_acquire_device(struct cam_ed *device) { struct cam_eb *bus = device->target->bus; mtx_lock(&bus->eb_mtx); device->refcount++; mtx_unlock(&bus->eb_mtx); } void xpt_release_device(struct cam_ed *device) { struct cam_eb *bus = device->target->bus; struct cam_devq *devq; mtx_lock(&bus->eb_mtx); if (--device->refcount > 0) { mtx_unlock(&bus->eb_mtx); return; } TAILQ_REMOVE(&device->target->ed_entries, device,links); device->target->generation++; mtx_unlock(&bus->eb_mtx); /* Release our slot in the devq */ devq = bus->sim->devq; mtx_lock(&devq->send_mtx); cam_devq_resize(devq, devq->send_queue.array_size - 1); mtx_unlock(&devq->send_mtx); KASSERT(SLIST_EMPTY(&device->periphs), ("destroying device, but periphs list is not empty")); KASSERT(device->devq_entry.index == CAM_UNQUEUED_INDEX, ("destroying device while still queued for ccbs")); if ((device->flags & CAM_DEV_REL_TIMEOUT_PENDING) != 0) callout_stop(&device->callout); xpt_release_target(device->target); cam_ccbq_fini(&device->ccbq); /* * Free allocated memory. free(9) does nothing if the * supplied pointer is NULL, so it is safe to call without * checking. 
*/ free(device->supported_vpds, M_CAMXPT); free(device->device_id, M_CAMXPT); free(device->ext_inq, M_CAMXPT); free(device->physpath, M_CAMXPT); free(device->rcap_buf, M_CAMXPT); free(device->serial_num, M_CAMXPT); taskqueue_enqueue(xsoftc.xpt_taskq, &device->device_destroy_task); } u_int32_t xpt_dev_ccbq_resize(struct cam_path *path, int newopenings) { int result; struct cam_ed *dev; dev = path->device; mtx_lock(&dev->sim->devq->send_mtx); result = cam_ccbq_resize(&dev->ccbq, newopenings); mtx_unlock(&dev->sim->devq->send_mtx); if ((dev->flags & CAM_DEV_TAG_AFTER_COUNT) != 0 || (dev->inq_flags & SID_CmdQue) != 0) dev->tag_saved_openings = newopenings; return (result); } static struct cam_eb * xpt_find_bus(path_id_t path_id) { struct cam_eb *bus; xpt_lock_buses(); for (bus = TAILQ_FIRST(&xsoftc.xpt_busses); bus != NULL; bus = TAILQ_NEXT(bus, links)) { if (bus->path_id == path_id) { bus->refcount++; break; } } xpt_unlock_buses(); return (bus); } static struct cam_et * xpt_find_target(struct cam_eb *bus, target_id_t target_id) { struct cam_et *target; mtx_assert(&bus->eb_mtx, MA_OWNED); for (target = TAILQ_FIRST(&bus->et_entries); target != NULL; target = TAILQ_NEXT(target, links)) { if (target->target_id == target_id) { target->refcount++; break; } } return (target); } static struct cam_ed * xpt_find_device(struct cam_et *target, lun_id_t lun_id) { struct cam_ed *device; mtx_assert(&target->bus->eb_mtx, MA_OWNED); for (device = TAILQ_FIRST(&target->ed_entries); device != NULL; device = TAILQ_NEXT(device, links)) { if (device->lun_id == lun_id) { device->refcount++; break; } } return (device); } void xpt_start_tags(struct cam_path *path) { struct ccb_relsim crs; struct cam_ed *device; struct cam_sim *sim; int newopenings; device = path->device; sim = path->bus->sim; device->flags &= ~CAM_DEV_TAG_AFTER_COUNT; xpt_freeze_devq(path, /*count*/1); device->inq_flags |= SID_CmdQue; if (device->tag_saved_openings != 0) newopenings = device->tag_saved_openings; else 
newopenings = min(device->maxtags, sim->max_tagged_dev_openings); xpt_dev_ccbq_resize(path, newopenings); xpt_async(AC_GETDEV_CHANGED, path, NULL); xpt_setup_ccb(&crs.ccb_h, path, CAM_PRIORITY_NORMAL); crs.ccb_h.func_code = XPT_REL_SIMQ; crs.release_flags = RELSIM_RELEASE_AFTER_QEMPTY; crs.openings = crs.release_timeout = crs.qfrozen_cnt = 0; xpt_action((union ccb *)&crs); } void xpt_stop_tags(struct cam_path *path) { struct ccb_relsim crs; struct cam_ed *device; struct cam_sim *sim; device = path->device; sim = path->bus->sim; device->flags &= ~CAM_DEV_TAG_AFTER_COUNT; device->tag_delay_count = 0; xpt_freeze_devq(path, /*count*/1); device->inq_flags &= ~SID_CmdQue; xpt_dev_ccbq_resize(path, sim->max_dev_openings); xpt_async(AC_GETDEV_CHANGED, path, NULL); xpt_setup_ccb(&crs.ccb_h, path, CAM_PRIORITY_NORMAL); crs.ccb_h.func_code = XPT_REL_SIMQ; crs.release_flags = RELSIM_RELEASE_AFTER_QEMPTY; crs.openings = crs.release_timeout = crs.qfrozen_cnt = 0; xpt_action((union ccb *)&crs); } static void xpt_boot_delay(void *arg) { xpt_release_boot(); } static void xpt_config(void *arg) { /* * Now that interrupts are enabled, go find our devices */ if (taskqueue_start_threads(&xsoftc.xpt_taskq, 1, PRIBIO, "CAM taskq")) printf("xpt_config: failed to create taskqueue thread.\n"); /* Setup debugging path */ if (cam_dflags != CAM_DEBUG_NONE) { if (xpt_create_path(&cam_dpath, NULL, CAM_DEBUG_BUS, CAM_DEBUG_TARGET, CAM_DEBUG_LUN) != CAM_REQ_CMP) { printf("xpt_config: xpt_create_path() failed for debug" " target %d:%d:%d, debugging disabled\n", CAM_DEBUG_BUS, CAM_DEBUG_TARGET, CAM_DEBUG_LUN); cam_dflags = CAM_DEBUG_NONE; } } else cam_dpath = NULL; periphdriver_init(1); xpt_hold_boot(); callout_init(&xsoftc.boot_callout, 1); callout_reset_sbt(&xsoftc.boot_callout, SBT_1MS * xsoftc.boot_delay, 0, xpt_boot_delay, NULL, 0); /* Fire up rescan thread. 
*/ if (kproc_kthread_add(xpt_scanner_thread, NULL, &cam_proc, NULL, 0, 0, "cam", "scanner")) { printf("xpt_config: failed to create rescan thread.\n"); } } void xpt_hold_boot(void) { xpt_lock_buses(); xsoftc.buses_to_config++; xpt_unlock_buses(); } void xpt_release_boot(void) { xpt_lock_buses(); xsoftc.buses_to_config--; if (xsoftc.buses_to_config == 0 && xsoftc.buses_config_done == 0) { struct xpt_task *task; xsoftc.buses_config_done = 1; xpt_unlock_buses(); /* Call manually because we don't have any busses */ task = malloc(sizeof(struct xpt_task), M_CAMXPT, M_NOWAIT); if (task != NULL) { TASK_INIT(&task->task, 0, xpt_finishconfig_task, task); taskqueue_enqueue(taskqueue_thread, &task->task); } } else xpt_unlock_buses(); } /* * If the given device only has one peripheral attached to it, and if that * peripheral is the passthrough driver, announce it. This insures that the * user sees some sort of announcement for every peripheral in their system. */ static int xptpassannouncefunc(struct cam_ed *device, void *arg) { struct cam_periph *periph; int i; for (periph = SLIST_FIRST(&device->periphs), i = 0; periph != NULL; periph = SLIST_NEXT(periph, periph_links), i++); periph = SLIST_FIRST(&device->periphs); if ((i == 1) && (strncmp(periph->periph_name, "pass", 4) == 0)) xpt_announce_periph(periph, NULL); return(1); } static void xpt_finishconfig_task(void *context, int pending) { periphdriver_init(2); /* * Check for devices with no "standard" peripheral driver * attached. For any devices like that, announce the * passthrough driver so the user will see something. */ if (!bootverbose) xpt_for_all_devices(xptpassannouncefunc, NULL); /* Release our hook so that the boot can continue. 
*/ config_intrhook_disestablish(xsoftc.xpt_config_hook); free(xsoftc.xpt_config_hook, M_CAMXPT); xsoftc.xpt_config_hook = NULL; free(context, M_CAMXPT); } cam_status xpt_register_async(int event, ac_callback_t *cbfunc, void *cbarg, struct cam_path *path) { struct ccb_setasync csa; cam_status status; int xptpath = 0; if (path == NULL) { status = xpt_create_path(&path, /*periph*/NULL, CAM_XPT_PATH_ID, CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD); if (status != CAM_REQ_CMP) return (status); xpt_path_lock(path); xptpath = 1; } xpt_setup_ccb(&csa.ccb_h, path, CAM_PRIORITY_NORMAL); csa.ccb_h.func_code = XPT_SASYNC_CB; csa.event_enable = event; csa.callback = cbfunc; csa.callback_arg = cbarg; xpt_action((union ccb *)&csa); status = csa.ccb_h.status; CAM_DEBUG(csa.ccb_h.path, CAM_DEBUG_TRACE, ("xpt_register_async: func %p\n", cbfunc)); if (xptpath) { xpt_path_unlock(path); xpt_free_path(path); } if ((status == CAM_REQ_CMP) && (csa.event_enable & AC_FOUND_DEVICE)) { /* * Get this peripheral up to date with all * the currently existing devices. */ xpt_for_all_devices(xptsetasyncfunc, &csa); } if ((status == CAM_REQ_CMP) && (csa.event_enable & AC_PATH_REGISTERED)) { /* * Get this peripheral up to date with all * the currently existing busses. */ xpt_for_all_busses(xptsetasyncbusfunc, &csa); } return (status); } static void xptaction(struct cam_sim *sim, union ccb *work_ccb) { CAM_DEBUG(work_ccb->ccb_h.path, CAM_DEBUG_TRACE, ("xptaction\n")); switch (work_ccb->ccb_h.func_code) { /* Common cases first */ case XPT_PATH_INQ: /* Path routing inquiry */ { struct ccb_pathinq *cpi; cpi = &work_ccb->cpi; cpi->version_num = 1; /* XXX??? 
*/ cpi->hba_inquiry = 0; cpi->target_sprt = 0; cpi->hba_misc = 0; cpi->hba_eng_cnt = 0; cpi->max_target = 0; cpi->max_lun = 0; cpi->initiator_id = 0; strncpy(cpi->sim_vid, "FreeBSD", SIM_IDLEN); strncpy(cpi->hba_vid, "", HBA_IDLEN); strncpy(cpi->dev_name, sim->sim_name, DEV_IDLEN); cpi->unit_number = sim->unit_number; cpi->bus_id = sim->bus_id; cpi->base_transfer_speed = 0; cpi->protocol = PROTO_UNSPECIFIED; cpi->protocol_version = PROTO_VERSION_UNSPECIFIED; cpi->transport = XPORT_UNSPECIFIED; cpi->transport_version = XPORT_VERSION_UNSPECIFIED; cpi->ccb_h.status = CAM_REQ_CMP; xpt_done(work_ccb); break; } default: work_ccb->ccb_h.status = CAM_REQ_INVALID; xpt_done(work_ccb); break; } } /* * The xpt as a "controller" has no interrupt sources, so polling * is a no-op. */ static void xptpoll(struct cam_sim *sim) { } void xpt_lock_buses(void) { mtx_lock(&xsoftc.xpt_topo_lock); } void xpt_unlock_buses(void) { mtx_unlock(&xsoftc.xpt_topo_lock); } struct mtx * xpt_path_mtx(struct cam_path *path) { return (&path->device->device_mtx); } static void xpt_done_process(struct ccb_hdr *ccb_h) { struct cam_sim *sim; struct cam_devq *devq; struct mtx *mtx = NULL; + +#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) + struct ccb_scsiio *csio; + + if (ccb_h->func_code == XPT_SCSI_IO) { + csio = &((union ccb *)ccb_h)->csio; + if (csio->bio != NULL) + biotrack(csio->bio, __func__); + } +#endif if (ccb_h->flags & CAM_HIGH_POWER) { struct highpowerlist *hphead; struct cam_ed *device; mtx_lock(&xsoftc.xpt_highpower_lock); hphead = &xsoftc.highpowerq; device = STAILQ_FIRST(hphead); /* * Increment the count since this command is done. */ xsoftc.num_highpower++; /* * Any high powered commands queued up? 
*/ if (device != NULL) { STAILQ_REMOVE_HEAD(hphead, highpowerq_entry); mtx_unlock(&xsoftc.xpt_highpower_lock); mtx_lock(&device->sim->devq->send_mtx); xpt_release_devq_device(device, /*count*/1, /*runqueue*/TRUE); mtx_unlock(&device->sim->devq->send_mtx); } else mtx_unlock(&xsoftc.xpt_highpower_lock); } sim = ccb_h->path->bus->sim; if (ccb_h->status & CAM_RELEASE_SIMQ) { xpt_release_simq(sim, /*run_queue*/FALSE); ccb_h->status &= ~CAM_RELEASE_SIMQ; } if ((ccb_h->flags & CAM_DEV_QFRZDIS) && (ccb_h->status & CAM_DEV_QFRZN)) { xpt_release_devq(ccb_h->path, /*count*/1, /*run_queue*/TRUE); ccb_h->status &= ~CAM_DEV_QFRZN; } devq = sim->devq; if ((ccb_h->func_code & XPT_FC_USER_CCB) == 0) { struct cam_ed *dev = ccb_h->path->device; mtx_lock(&devq->send_mtx); devq->send_active--; devq->send_openings++; cam_ccbq_ccb_done(&dev->ccbq, (union ccb *)ccb_h); if (((dev->flags & CAM_DEV_REL_ON_QUEUE_EMPTY) != 0 && (dev->ccbq.dev_active == 0))) { dev->flags &= ~CAM_DEV_REL_ON_QUEUE_EMPTY; xpt_release_devq_device(dev, /*count*/1, /*run_queue*/FALSE); } if (((dev->flags & CAM_DEV_REL_ON_COMPLETE) != 0 && (ccb_h->status&CAM_STATUS_MASK) != CAM_REQUEUE_REQ)) { dev->flags &= ~CAM_DEV_REL_ON_COMPLETE; xpt_release_devq_device(dev, /*count*/1, /*run_queue*/FALSE); } if (!device_is_queued(dev)) (void)xpt_schedule_devq(devq, dev); xpt_run_devq(devq); mtx_unlock(&devq->send_mtx); if ((dev->flags & CAM_DEV_TAG_AFTER_COUNT) != 0) { mtx = xpt_path_mtx(ccb_h->path); mtx_lock(mtx); if ((dev->flags & CAM_DEV_TAG_AFTER_COUNT) != 0 && (--dev->tag_delay_count == 0)) xpt_start_tags(ccb_h->path); } } if ((ccb_h->flags & CAM_UNLOCKED) == 0) { if (mtx == NULL) { mtx = xpt_path_mtx(ccb_h->path); mtx_lock(mtx); } } else { if (mtx != NULL) { mtx_unlock(mtx); mtx = NULL; } } /* Call the peripheral driver's callback */ ccb_h->pinfo.index = CAM_UNQUEUED_INDEX; (*ccb_h->cbfcnp)(ccb_h->path->periph, (union ccb *)ccb_h); if (mtx != NULL) mtx_unlock(mtx); } void xpt_done_td(void *arg) { struct cam_doneq *queue = 
arg; struct ccb_hdr *ccb_h; STAILQ_HEAD(, ccb_hdr) doneq; STAILQ_INIT(&doneq); mtx_lock(&queue->cam_doneq_mtx); while (1) { while (STAILQ_EMPTY(&queue->cam_doneq)) { queue->cam_doneq_sleep = 1; msleep(&queue->cam_doneq, &queue->cam_doneq_mtx, PRIBIO, "-", 0); queue->cam_doneq_sleep = 0; } STAILQ_CONCAT(&doneq, &queue->cam_doneq); mtx_unlock(&queue->cam_doneq_mtx); THREAD_NO_SLEEPING(); while ((ccb_h = STAILQ_FIRST(&doneq)) != NULL) { STAILQ_REMOVE_HEAD(&doneq, sim_links.stqe); xpt_done_process(ccb_h); } THREAD_SLEEPING_OK(); mtx_lock(&queue->cam_doneq_mtx); } } static void camisr_runqueue(void) { struct ccb_hdr *ccb_h; struct cam_doneq *queue; int i; /* Process global queues. */ for (i = 0; i < cam_num_doneqs; i++) { queue = &cam_doneqs[i]; mtx_lock(&queue->cam_doneq_mtx); while ((ccb_h = STAILQ_FIRST(&queue->cam_doneq)) != NULL) { STAILQ_REMOVE_HEAD(&queue->cam_doneq, sim_links.stqe); mtx_unlock(&queue->cam_doneq_mtx); xpt_done_process(ccb_h); mtx_lock(&queue->cam_doneq_mtx); } mtx_unlock(&queue->cam_doneq_mtx); } } struct kv { uint32_t v; const char *name; }; static struct kv map[] = { { XPT_NOOP, "XPT_NOOP" }, { XPT_SCSI_IO, "XPT_SCSI_IO" }, { XPT_GDEV_TYPE, "XPT_GDEV_TYPE" }, { XPT_GDEVLIST, "XPT_GDEVLIST" }, { XPT_PATH_INQ, "XPT_PATH_INQ" }, { XPT_REL_SIMQ, "XPT_REL_SIMQ" }, { XPT_SASYNC_CB, "XPT_SASYNC_CB" }, { XPT_SDEV_TYPE, "XPT_SDEV_TYPE" }, { XPT_SCAN_BUS, "XPT_SCAN_BUS" }, { XPT_DEV_MATCH, "XPT_DEV_MATCH" }, { XPT_DEBUG, "XPT_DEBUG" }, { XPT_PATH_STATS, "XPT_PATH_STATS" }, { XPT_GDEV_STATS, "XPT_GDEV_STATS" }, { XPT_DEV_ADVINFO, "XPT_DEV_ADVINFO" }, { XPT_ASYNC, "XPT_ASYNC" }, { XPT_ABORT, "XPT_ABORT" }, { XPT_RESET_BUS, "XPT_RESET_BUS" }, { XPT_RESET_DEV, "XPT_RESET_DEV" }, { XPT_TERM_IO, "XPT_TERM_IO" }, { XPT_SCAN_LUN, "XPT_SCAN_LUN" }, { XPT_GET_TRAN_SETTINGS, "XPT_GET_TRAN_SETTINGS" }, { XPT_SET_TRAN_SETTINGS, "XPT_SET_TRAN_SETTINGS" }, { XPT_CALC_GEOMETRY, "XPT_CALC_GEOMETRY" }, { XPT_ATA_IO, "XPT_ATA_IO" }, { XPT_GET_SIM_KNOB, "XPT_GET_SIM_KNOB" 
}, { XPT_SET_SIM_KNOB, "XPT_SET_SIM_KNOB" }, { XPT_NVME_IO, "XPT_NVME_IO" }, { XPT_MMCSD_IO, "XPT_MMCSD_IO" }, { XPT_SMP_IO, "XPT_SMP_IO" }, { XPT_SCAN_TGT, "XPT_SCAN_TGT" }, { XPT_ENG_INQ, "XPT_ENG_INQ" }, { XPT_ENG_EXEC, "XPT_ENG_EXEC" }, { XPT_EN_LUN, "XPT_EN_LUN" }, { XPT_TARGET_IO, "XPT_TARGET_IO" }, { XPT_ACCEPT_TARGET_IO, "XPT_ACCEPT_TARGET_IO" }, { XPT_CONT_TARGET_IO, "XPT_CONT_TARGET_IO" }, { XPT_IMMED_NOTIFY, "XPT_IMMED_NOTIFY" }, { XPT_NOTIFY_ACK, "XPT_NOTIFY_ACK" }, { XPT_IMMEDIATE_NOTIFY, "XPT_IMMEDIATE_NOTIFY" }, { XPT_NOTIFY_ACKNOWLEDGE, "XPT_NOTIFY_ACKNOWLEDGE" }, { 0, 0 } }; static const char * xpt_action_name(uint32_t action) { static char buffer[32]; /* Only for unknown messages -- racy */ struct kv *walker = map; while (walker->name != NULL) { if (walker->v == action) return (walker->name); walker++; } snprintf(buffer, sizeof(buffer), "%#x", action); return (buffer); } Index: head/sys/cam/scsi/scsi_da.c =================================================================== --- head/sys/cam/scsi/scsi_da.c (revision 308154) +++ head/sys/cam/scsi/scsi_da.c (revision 308155) @@ -1,5903 +1,5918 @@ /*- * Implementation of SCSI Direct Access Peripheral driver for CAM. * * Copyright (c) 1997 Justin T. Gibbs. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification, immediately at the beginning of the file. * 2. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #ifdef _KERNEL #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #endif /* _KERNEL */ #ifndef _KERNEL #include #include #endif /* _KERNEL */ #include #include #include #include #include #include #include #include #ifdef _KERNEL /* * Note that there are probe ordering dependencies here. The order isn't * controlled by this enumeration, but by explicit state transitions in * dastart() and dadone(). Here are some of the dependencies: * * 1. RC should come first, before RC16, unless there is evidence that RC16 * is supported. * 2. BDC needs to come before any of the ATA probes, or the ZONE probe. * 3. 
The ATA probes should go in this order: * ATA -> LOGDIR -> IDDIR -> SUP -> ATA_ZONE */ typedef enum { DA_STATE_PROBE_RC, DA_STATE_PROBE_RC16, DA_STATE_PROBE_LBP, DA_STATE_PROBE_BLK_LIMITS, DA_STATE_PROBE_BDC, DA_STATE_PROBE_ATA, DA_STATE_PROBE_ATA_LOGDIR, DA_STATE_PROBE_ATA_IDDIR, DA_STATE_PROBE_ATA_SUP, DA_STATE_PROBE_ATA_ZONE, DA_STATE_PROBE_ZONE, DA_STATE_NORMAL } da_state; typedef enum { DA_FLAG_PACK_INVALID = 0x000001, DA_FLAG_NEW_PACK = 0x000002, DA_FLAG_PACK_LOCKED = 0x000004, DA_FLAG_PACK_REMOVABLE = 0x000008, DA_FLAG_NEED_OTAG = 0x000020, DA_FLAG_WAS_OTAG = 0x000040, DA_FLAG_RETRY_UA = 0x000080, DA_FLAG_OPEN = 0x000100, DA_FLAG_SCTX_INIT = 0x000200, DA_FLAG_CAN_RC16 = 0x000400, DA_FLAG_PROBED = 0x000800, DA_FLAG_DIRTY = 0x001000, DA_FLAG_ANNOUNCED = 0x002000, DA_FLAG_CAN_ATA_DMA = 0x004000, DA_FLAG_CAN_ATA_LOG = 0x008000, DA_FLAG_CAN_ATA_IDLOG = 0x010000, DA_FLAG_CAN_ATA_SUPCAP = 0x020000, DA_FLAG_CAN_ATA_ZONE = 0x040000 } da_flags; typedef enum { DA_Q_NONE = 0x00, DA_Q_NO_SYNC_CACHE = 0x01, DA_Q_NO_6_BYTE = 0x02, DA_Q_NO_PREVENT = 0x04, DA_Q_4K = 0x08, DA_Q_NO_RC16 = 0x10, DA_Q_NO_UNMAP = 0x20, DA_Q_RETRY_BUSY = 0x40, DA_Q_SMR_DM = 0x80 } da_quirks; #define DA_Q_BIT_STRING \ "\020" \ "\001NO_SYNC_CACHE" \ "\002NO_6_BYTE" \ "\003NO_PREVENT" \ "\0044K" \ "\005NO_RC16" \ "\006NO_UNMAP" \ "\007RETRY_BUSY" \ "\008SMR_DM" typedef enum { DA_CCB_PROBE_RC = 0x01, DA_CCB_PROBE_RC16 = 0x02, DA_CCB_PROBE_LBP = 0x03, DA_CCB_PROBE_BLK_LIMITS = 0x04, DA_CCB_PROBE_BDC = 0x05, DA_CCB_PROBE_ATA = 0x06, DA_CCB_BUFFER_IO = 0x07, DA_CCB_DUMP = 0x0A, DA_CCB_DELETE = 0x0B, DA_CCB_TUR = 0x0C, DA_CCB_PROBE_ZONE = 0x0D, DA_CCB_PROBE_ATA_LOGDIR = 0x0E, DA_CCB_PROBE_ATA_IDDIR = 0x0F, DA_CCB_PROBE_ATA_SUP = 0x10, DA_CCB_PROBE_ATA_ZONE = 0x11, DA_CCB_TYPE_MASK = 0x1F, DA_CCB_RETRY_UA = 0x20 } da_ccb_state; /* * Order here is important for method choice * * We prefer ATA_TRIM as tests run against a Sandforce 2281 SSD attached to * LSI 2008 (mps) controller (FW: v12, Drv: v14) resulted 
20% quicker deletes
 * using ATA_TRIM than the corresponding UNMAP results for a real world mysql
 * import taking 5mins.
 *
 */
typedef enum {
	DA_DELETE_NONE,
	DA_DELETE_DISABLE,
	DA_DELETE_ATA_TRIM,
	DA_DELETE_UNMAP,
	DA_DELETE_WS16,
	DA_DELETE_WS10,
	DA_DELETE_ZERO,
	/* Range bounds: first and last enumerator of the ATA_TRIM..ZERO run. */
	DA_DELETE_MIN = DA_DELETE_ATA_TRIM,
	DA_DELETE_MAX = DA_DELETE_ZERO
} da_delete_methods;

/*
 * For SCSI, host managed drives show up as a separate device type.  For
 * ATA, host managed drives also have a different device signature.
 * XXX KDM figure out the ATA host managed signature.
 */
typedef enum {
	DA_ZONE_NONE		= 0x00,
	DA_ZONE_DRIVE_MANAGED	= 0x01,
	DA_ZONE_HOST_AWARE	= 0x02,
	DA_ZONE_HOST_MANAGED	= 0x03
} da_zone_mode;

/*
 * We distinguish between these interface cases in addition to the drive type:
 * o ATA drive behind a SCSI translation layer that knows about ZBC/ZAC
 * o ATA drive behind a SCSI translation layer that does not know about
 *   ZBC/ZAC, and so needs to be managed via ATA passthrough.  In this
 *   case, we would need to share the ATA code with the ada(4) driver.
 * o SCSI drive.
*/
typedef enum {
	DA_ZONE_IF_SCSI,
	DA_ZONE_IF_ATA_PASS,
	DA_ZONE_IF_ATA_SAT,
} da_zone_interface;

/*
 * Zone capability/status bits.  The *_SUP bits are collected in
 * DA_ZONE_FLAG_SUP_MASK and the *_SET bits in DA_ZONE_FLAG_SET_MASK.
 */
typedef enum {
	DA_ZONE_FLAG_RZ_SUP		= 0x0001,
	DA_ZONE_FLAG_OPEN_SUP		= 0x0002,
	DA_ZONE_FLAG_CLOSE_SUP		= 0x0004,
	DA_ZONE_FLAG_FINISH_SUP		= 0x0008,
	DA_ZONE_FLAG_RWP_SUP		= 0x0010,
	DA_ZONE_FLAG_SUP_MASK		= (DA_ZONE_FLAG_RZ_SUP |
					   DA_ZONE_FLAG_OPEN_SUP |
					   DA_ZONE_FLAG_CLOSE_SUP |
					   DA_ZONE_FLAG_FINISH_SUP |
					   DA_ZONE_FLAG_RWP_SUP),
	DA_ZONE_FLAG_URSWRZ		= 0x0020,
	DA_ZONE_FLAG_OPT_SEQ_SET	= 0x0040,
	DA_ZONE_FLAG_OPT_NONSEQ_SET	= 0x0080,
	DA_ZONE_FLAG_MAX_SEQ_SET	= 0x0100,
	DA_ZONE_FLAG_SET_MASK		= (DA_ZONE_FLAG_OPT_SEQ_SET |
					   DA_ZONE_FLAG_OPT_NONSEQ_SET |
					   DA_ZONE_FLAG_MAX_SEQ_SET)
} da_zone_flags;

/* Printable description for each of the five *_SUP zone flags. */
static struct da_zone_desc {
	da_zone_flags value;
	const char *desc;
} da_zone_desc_table[] = {
	{DA_ZONE_FLAG_RZ_SUP, "Report Zones" },
	{DA_ZONE_FLAG_OPEN_SUP, "Open" },
	{DA_ZONE_FLAG_CLOSE_SUP, "Close" },
	{DA_ZONE_FLAG_FINISH_SUP, "Finish" },
	{DA_ZONE_FLAG_RWP_SUP, "Reset Write Pointer" },
};

/* Signature shared by the delete (trim/unmap/write-same) CCB builders. */
typedef void da_delete_func_t (struct cam_periph *periph, union ccb *ccb,
			      struct bio *bp);
static da_delete_func_t da_delete_trim;
static da_delete_func_t da_delete_unmap;
static da_delete_func_t da_delete_ws;

/*
 * The three tables below each have seven entries whose positions line up
 * with the da_delete_methods enumerators NONE..ZERO; NONE and DISABLE have
 * no handler.
 * NOTE(review): the handlers are stored as 'const void *' rather than
 * 'da_delete_func_t *' -- confirm how consumers convert them back.
 */
static const void * da_delete_functions[] = {
	NULL,
	NULL,
	da_delete_trim,
	da_delete_unmap,
	da_delete_ws,
	da_delete_ws,
	da_delete_ws
};

static const char *da_delete_method_names[] =
    { "NONE", "DISABLE", "ATA_TRIM", "UNMAP", "WS16", "WS10", "ZERO" };
static const char *da_delete_method_desc[] =
    { "NONE", "DISABLED", "ATA TRIM", "UNMAP", "WRITE SAME(16) with UNMAP",
      "WRITE SAME(10) with UNMAP", "ZERO" };

/* Offsets into our private area for storing information */
#define ccb_state	ppriv_field0
#define ccb_bp		ppriv_ptr1

/* Disk geometry and size parameters. */
struct disk_params {
	u_int8_t  heads;
	u_int32_t cylinders;
	u_int8_t  secs_per_track;
	u_int32_t secsize;	/* Number of bytes/sector */
	u_int64_t sectors;	/* total number sectors */
	u_int     stripesize;
	u_int     stripeoffset;
};

/*
 * UNMAP buffer layout limits: one UNMAP_HEAD_SIZE-byte header followed by
 * up to UNMAP_MAX_RANGES descriptors of UNMAP_RANGE_SIZE bytes each.
 */
#define UNMAP_RANGE_MAX		0xffffffff
#define UNMAP_HEAD_SIZE		8
#define UNMAP_RANGE_SIZE	16
#define UNMAP_MAX_RANGES	2048 /* Protocol Max is 4095 */
#define UNMAP_BUF_SIZE		((UNMAP_MAX_RANGES * UNMAP_RANGE_SIZE) + \
				UNMAP_HEAD_SIZE)

#define WS10_MAX_BLKS		0xffff
#define WS16_MAX_BLKS		0xffffffff
/* Number of TRIM ranges that fit in UNMAP_BUF_SIZE, in whole DSM blocks. */
#define ATA_TRIM_MAX_RANGES	((UNMAP_BUF_SIZE / \
	(ATA_DSM_RANGE_SIZE * ATA_DSM_BLK_SIZE)) * ATA_DSM_BLK_SIZE)

#define DA_WORK_TUR		(1 << 16)

/* Per-instance state of the da(4) peripheral driver. */
struct da_softc {
	struct   cam_iosched_softc *cam_iosched;
	struct	 bio_queue_head delete_run_queue;
	LIST_HEAD(, ccb_hdr) pending_ccbs;
	int	 refcount;		/* Active xpt_action() calls */
	da_state state;
	da_flags flags;
	da_quirks quirks;
	int	 minimum_cmd_size;
	int	 error_inject;
	int	 trim_max_ranges;
	int	 delete_available;	/* Delete methods possibly available */
	da_zone_mode 			zone_mode;
	da_zone_interface		zone_interface;
	da_zone_flags			zone_flags;
	struct ata_gp_log_dir		ata_logdir;
	int				valid_logdir_len;
	struct ata_identify_log_pages	ata_iddir;
	int				valid_iddir_len;
	uint64_t			optimal_seq_zones;
	uint64_t			optimal_nonseq_zones;
	uint64_t			max_seq_zones;
	u_int	 		maxio;
	uint32_t		unmap_max_ranges;
	uint32_t		unmap_max_lba; /* Max LBAs in UNMAP req */
	uint64_t		ws_max_blks;
	da_delete_methods	delete_method_pref;
	da_delete_methods	delete_method;
	da_delete_func_t	*delete_func;
	int			unmappedio;
	int			rotating;
	struct	 disk_params params;
	struct	 disk *disk;
	union	 ccb saved_ccb;
	struct task		sysctl_task;
	struct sysctl_ctx_list	sysctl_ctx;
	struct sysctl_oid	*sysctl_tree;
	struct callout		sendordered_c;
	uint64_t wwpn;
	uint8_t	 unmap_buf[UNMAP_BUF_SIZE];
	struct scsi_read_capacity_data_long rcaplong;
	struct callout		mediapoll_c;
#ifdef CAM_IO_STATS
	struct sysctl_ctx_list	sysctl_stats_ctx;
	struct sysctl_oid	*sysctl_stats_tree;
	u_int	errors;
	u_int	timeouts;
	u_int	invalidations;
#endif
};

/*
 * Set (enable != 0) or clear (enable == 0) bit (1 << delete_method) in
 * softc->delete_available.
 * NOTE(review): this expands to a bare if/else statement and its arguments
 * are not parenthesized, so it is only safe as a stand-alone statement with
 * simple arguments -- confirm against callers before wrapping it in
 * do { } while (0).
 */
#define dadeleteflag(softc, delete_method, enable)			\
	if (enable) {							\
		softc->delete_available |= (1 << delete_method);	\
	} else {							\
		softc->delete_available &= ~(1 << delete_method);	\
	}

/* Pairs a SCSI inquiry match pattern with the quirks to apply on a match. */
struct da_quirk_entry {
	struct scsi_inquiry_pattern inq_pat;
	da_quirks quirks;
};

static const
char quantum[] = "QUANTUM"; static const char microp[] = "MICROP"; static struct da_quirk_entry da_quirk_table[] = { /* SPI, FC devices */ { /* * Fujitsu M2513A MO drives. * Tested devices: M2513A2 firmware versions 1200 & 1300. * (dip switch selects whether T_DIRECT or T_OPTICAL device) * Reported by: W.Scholten */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "FUJITSU", "M2513A", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* See above. */ {T_OPTICAL, SIP_MEDIA_REMOVABLE, "FUJITSU", "M2513A", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * This particular Fujitsu drive doesn't like the * synchronize cache command. * Reported by: Tom Jackson */ {T_DIRECT, SIP_MEDIA_FIXED, "FUJITSU", "M2954*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * This drive doesn't like the synchronize cache command * either. Reported by: Matthew Jacob * in NetBSD PR kern/6027, August 24, 1998. */ {T_DIRECT, SIP_MEDIA_FIXED, microp, "2217*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * This drive doesn't like the synchronize cache command * either. Reported by: Hellmuth Michaelis (hm@kts.org) * (PR 8882). */ {T_DIRECT, SIP_MEDIA_FIXED, microp, "2112*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Doesn't like the synchronize cache command. * Reported by: Blaz Zupan */ {T_DIRECT, SIP_MEDIA_FIXED, "NEC", "D3847*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Doesn't like the synchronize cache command. * Reported by: Blaz Zupan */ {T_DIRECT, SIP_MEDIA_FIXED, quantum, "MAVERICK 540S", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Doesn't like the synchronize cache command. */ {T_DIRECT, SIP_MEDIA_FIXED, quantum, "LPS525S", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Doesn't like the synchronize cache command. * Reported by: walter@pelissero.de */ {T_DIRECT, SIP_MEDIA_FIXED, quantum, "LPS540S", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Doesn't work correctly with 6 byte reads/writes. * Returns illegal request, and points to byte 9 of the * 6-byte CDB. 
* Reported by: Adam McDougall */ {T_DIRECT, SIP_MEDIA_FIXED, quantum, "VIKING 4*", "*"}, /*quirks*/ DA_Q_NO_6_BYTE }, { /* See above. */ {T_DIRECT, SIP_MEDIA_FIXED, quantum, "VIKING 2*", "*"}, /*quirks*/ DA_Q_NO_6_BYTE }, { /* * Doesn't like the synchronize cache command. * Reported by: walter@pelissero.de */ {T_DIRECT, SIP_MEDIA_FIXED, "CONNER", "CP3500*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * The CISS RAID controllers do not support SYNC_CACHE */ {T_DIRECT, SIP_MEDIA_FIXED, "COMPAQ", "RAID*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * The STEC SSDs sometimes hang on UNMAP. */ {T_DIRECT, SIP_MEDIA_FIXED, "STEC", "*", "*"}, /*quirks*/ DA_Q_NO_UNMAP }, { /* * VMware returns BUSY status when storage has transient * connectivity problems, so better wait. */ {T_DIRECT, SIP_MEDIA_FIXED, "VMware*", "*", "*"}, /*quirks*/ DA_Q_RETRY_BUSY }, /* USB mass storage devices supported by umass(4) */ { /* * EXATELECOM (Sigmatel) i-Bead 100/105 USB Flash MP3 Player * PR: kern/51675 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "EXATEL", "i-BEAD10*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Power Quotient Int. 
(PQI) USB flash key * PR: kern/53067 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "Generic*", "USB Flash Disk*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Creative Nomad MUVO mp3 player (USB) * PR: kern/53094 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "CREATIVE", "NOMAD_MUVO", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE|DA_Q_NO_PREVENT }, { /* * Jungsoft NEXDISK USB flash key * PR: kern/54737 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "JUNGSOFT", "NEXDISK*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * FreeDik USB Mini Data Drive * PR: kern/54786 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "FreeDik*", "Mini Data Drive", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Sigmatel USB Flash MP3 Player * PR: kern/57046 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "SigmaTel", "MSCN", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE|DA_Q_NO_PREVENT }, { /* * Neuros USB Digital Audio Computer * PR: kern/63645 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "NEUROS", "dig. audio comp.", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * SEAGRAND NP-900 MP3 Player * PR: kern/64563 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "SEAGRAND", "NP-900*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE|DA_Q_NO_PREVENT }, { /* * iRiver iFP MP3 player (with UMS Firmware) * PR: kern/54881, i386/63941, kern/66124 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "iRiver", "iFP*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Frontier Labs NEX IA+ Digital Audio Player, rev 1.10/0.01 * PR: kern/70158 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "FL" , "Nex*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * ZICPlay USB MP3 Player with FM * PR: kern/75057 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "ACTIONS*" , "USB DISK*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * TEAC USB floppy mechanisms */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "TEAC" , "FD-05*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Kingston DataTraveler II+ USB Pen-Drive. 
* Reported by: Pawel Jakub Dawidek */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "Kingston" , "DataTraveler II+", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * USB DISK Pro PMAP * Reported by: jhs * PR: usb/96381 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, " ", "USB DISK Pro", "PMAP"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Motorola E398 Mobile Phone (TransFlash memory card). * Reported by: Wojciech A. Koszek * PR: usb/89889 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "Motorola" , "Motorola Phone", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Qware BeatZkey! Pro * PR: usb/79164 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "GENERIC", "USB DISK DEVICE", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Time DPA20B 1GB MP3 Player * PR: usb/81846 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "USB2.0*", "(FS) FLASH DISK*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Samsung USB key 128Mb * PR: usb/90081 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "USB-DISK", "FreeDik-FlashUsb", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Kingston DataTraveler 2.0 USB Flash memory. 
* PR: usb/89196 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "Kingston", "DataTraveler 2.0", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Creative MUVO Slim mp3 player (USB) * PR: usb/86131 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "CREATIVE", "MuVo Slim", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE|DA_Q_NO_PREVENT }, { /* * United MP5512 Portable MP3 Player (2-in-1 USB DISK/MP3) * PR: usb/80487 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "Generic*", "MUSIC DISK", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * SanDisk Micro Cruzer 128MB * PR: usb/75970 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "SanDisk" , "Micro Cruzer", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * TOSHIBA TransMemory USB sticks * PR: kern/94660 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "TOSHIBA", "TransMemory", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * PNY USB 3.0 Flash Drives */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "PNY", "USB 3.0 FD*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE | DA_Q_NO_RC16 }, { /* * PNY USB Flash keys * PR: usb/75578, usb/72344, usb/65436 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "*" , "USB DISK*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Genesys 6-in-1 Card Reader * PR: usb/94647 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "Generic*", "STORAGE DEVICE*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Rekam Digital CAMERA * PR: usb/98713 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "CAMERA*", "4MP-9J6*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * iRiver H10 MP3 player * PR: usb/102547 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "iriver", "H10*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * iRiver U10 MP3 player * PR: usb/92306 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "iriver", "U10*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * X-Micro Flash Disk * PR: usb/96901 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "X-Micro", "Flash Disk", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * EasyMP3 EM732X USB 2.0 Flash MP3 Player * PR: usb/96546 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "EM732X", "MP3 Player*", "1.00"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Denver MP3 player * PR: 
usb/107101 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "DENVER", "MP3 PLAYER", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Philips USB Key Audio KEY013 * PR: usb/68412 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "PHILIPS", "Key*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE | DA_Q_NO_PREVENT }, { /* * JNC MP3 Player * PR: usb/94439 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "JNC*" , "MP3 Player*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * SAMSUNG MP0402H * PR: usb/108427 */ {T_DIRECT, SIP_MEDIA_FIXED, "SAMSUNG", "MP0402H", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * I/O Magic USB flash - Giga Bank * PR: usb/108810 */ {T_DIRECT, SIP_MEDIA_FIXED, "GS-Magic", "stor*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * JoyFly 128mb USB Flash Drive * PR: 96133 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "USB 2.0", "Flash Disk*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * ChipsBnk usb stick * PR: 103702 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "ChipsBnk", "USB*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Storcase (Kingston) InfoStation IFS FC2/SATA-R 201A * PR: 129858 */ {T_DIRECT, SIP_MEDIA_FIXED, "IFS", "FC2/SATA-R*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Samsung YP-U3 mp3-player * PR: 125398 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "Samsung", "YP-U3", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { {T_DIRECT, SIP_MEDIA_REMOVABLE, "Netac", "OnlyDisk*", "2000"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Sony Cyber-Shot DSC cameras * PR: usb/137035 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "Sony", "Sony DSC", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE | DA_Q_NO_PREVENT }, { {T_DIRECT, SIP_MEDIA_REMOVABLE, "Kingston", "DataTraveler G3", "1.00"}, /*quirks*/ DA_Q_NO_PREVENT }, { /* At least several Transcent USB sticks lie on RC16. */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "JetFlash", "Transcend*", "*"}, /*quirks*/ DA_Q_NO_RC16 }, { /* * I-O Data USB Flash Disk * PR: usb/211716 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "I-O DATA", "USB Flash Disk*", "*"}, /*quirks*/ DA_Q_NO_RC16 }, /* ATA/SATA devices over SAS/USB/... 
*/ { /* Hitachi Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "Hitachi", "H??????????E3*", "*" }, /*quirks*/DA_Q_4K }, { /* Samsung Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "SAMSUNG HD155UI*", "*" }, /*quirks*/DA_Q_4K }, { /* Samsung Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "SAMSUNG", "HD155UI*", "*" }, /*quirks*/DA_Q_4K }, { /* Samsung Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "SAMSUNG HD204UI*", "*" }, /*quirks*/DA_Q_4K }, { /* Samsung Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "SAMSUNG", "HD204UI*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Barracuda Green Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST????DL*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Barracuda Green Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ST????DL", "*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Barracuda Green Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST???DM*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Barracuda Green Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ST???DM*", "*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Barracuda Green Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST????DM*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Barracuda Green Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ST????DM", "*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9500423AS*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ST950042", "3AS*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9500424AS*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ST950042", "4AS*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus 
Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9640423AS*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ST964042", "3AS*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9640424AS*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ST964042", "4AS*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9750420AS*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ST975042", "0AS*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9750422AS*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ST975042", "2AS*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9750423AS*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ST975042", "3AS*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Thin Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST???LT*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Thin Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ST???LT*", "*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Caviar Green Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD????RS*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Caviar Green Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "??RS*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Caviar Green Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD????RX*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Caviar Green Advanced Format 
(4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "??RX*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Caviar Green Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD??????RS*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Caviar Green Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "????RS*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Caviar Green Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD??????RX*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Caviar Green Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "????RX*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Scorpio Black Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD???PKT*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Scorpio Black Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "?PKT*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Scorpio Black Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD?????PKT*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Scorpio Black Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "???PKT*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Scorpio Blue Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD???PVT*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Scorpio Blue Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "?PVT*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Scorpio Blue Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD?????PVT*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Scorpio Blue Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "???PVT*", "*" }, /*quirks*/DA_Q_4K }, { /* * Olympus FE-210 camera */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "OLYMPUS", "FE210*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * LG UP3S MP3 player */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "LG", "UP3S", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Laser MP3-2GA13 MP3 player */ {T_DIRECT, 
SIP_MEDIA_REMOVABLE, "USB 2.0", "(HS) Flash Disk", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * LaCie external 250GB Hard drive des by Porsche * Submitted by: Ben Stuyts * PR: 121474 */ {T_DIRECT, SIP_MEDIA_FIXED, "SAMSUNG", "HM250JI", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, /* SATA SSDs */ { /* * Corsair Force 2 SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "Corsair CSSD-F*", "*" }, /*quirks*/DA_Q_4K }, { /* * Corsair Force 3 SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "Corsair Force 3*", "*" }, /*quirks*/DA_Q_4K }, { /* * Corsair Neutron GTX SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "Corsair Neutron GTX*", "*" }, /*quirks*/DA_Q_4K }, { /* * Corsair Force GT & GS SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "Corsair Force G*", "*" }, /*quirks*/DA_Q_4K }, { /* * Crucial M4 SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "M4-CT???M4SSD2*", "*" }, /*quirks*/DA_Q_4K }, { /* * Crucial RealSSD C300 SSDs * 4k optimised */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "C300-CTFDDAC???MAG*", "*" }, /*quirks*/DA_Q_4K }, { /* * Intel 320 Series SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "INTEL SSDSA2CW*", "*" }, /*quirks*/DA_Q_4K }, { /* * Intel 330 Series SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "INTEL SSDSC2CT*", "*" }, /*quirks*/DA_Q_4K }, { /* * Intel 510 Series SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "INTEL SSDSC2MH*", "*" }, /*quirks*/DA_Q_4K }, { /* * Intel 520 Series SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "INTEL SSDSC2BW*", "*" }, 
/*quirks*/DA_Q_4K }, { /* * Intel X25-M Series SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "INTEL SSDSA2M*", "*" }, /*quirks*/DA_Q_4K }, { /* * Kingston E100 Series SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "KINGSTON SE100S3*", "*" }, /*quirks*/DA_Q_4K }, { /* * Kingston HyperX 3k SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "KINGSTON SH103S3*", "*" }, /*quirks*/DA_Q_4K }, { /* * Marvell SSDs (entry taken from OpenSolaris) * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "MARVELL SD88SA02*", "*" }, /*quirks*/DA_Q_4K }, { /* * OCZ Agility 2 SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "OCZ-AGILITY2*", "*" }, /*quirks*/DA_Q_4K }, { /* * OCZ Agility 3 SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "OCZ-AGILITY3*", "*" }, /*quirks*/DA_Q_4K }, { /* * OCZ Deneva R Series SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "DENRSTE251M45*", "*" }, /*quirks*/DA_Q_4K }, { /* * OCZ Vertex 2 SSDs (inc pro series) * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "OCZ?VERTEX2*", "*" }, /*quirks*/DA_Q_4K }, { /* * OCZ Vertex 3 SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "OCZ-VERTEX3*", "*" }, /*quirks*/DA_Q_4K }, { /* * OCZ Vertex 4 SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "OCZ-VERTEX4*", "*" }, /*quirks*/DA_Q_4K }, { /* * Samsung 830 Series SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "SAMSUNG SSD 830 Series*", "*" }, /*quirks*/DA_Q_4K }, { /* * 
Samsung 840 SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "Samsung SSD 840*", "*" }, /*quirks*/DA_Q_4K }, { /* * Samsung 850 SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "Samsung SSD 850*", "*" }, /*quirks*/DA_Q_4K }, { /* * Samsung 843T Series SSDs (MZ7WD*) * Samsung PM851 Series SSDs (MZ7TE*) * Samsung PM853T Series SSDs (MZ7GE*) * Samsung SM863 Series SSDs (MZ7KM*) * 4k optimised */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "SAMSUNG MZ7*", "*" }, /*quirks*/DA_Q_4K }, { /* * SuperTalent TeraDrive CT SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "FTM??CT25H*", "*" }, /*quirks*/DA_Q_4K }, { /* * XceedIOPS SATA SSDs * 4k optimised */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "SG9XCS2D*", "*" }, /*quirks*/DA_Q_4K }, { /* * Hama Innostor USB-Stick */ { T_DIRECT, SIP_MEDIA_REMOVABLE, "Innostor", "Innostor*", "*" }, /*quirks*/DA_Q_NO_RC16 }, { /* * Seagate Lamarr 8TB Shingled Magnetic Recording (SMR) * Drive Managed SATA hard drive. This drive doesn't report * in firmware that it is a drive managed SMR drive. 
*/ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST8000AS0002*", "*" }, /*quirks*/DA_Q_SMR_DM }, { /* * MX-ES USB Drive by Mach Xtreme */ { T_DIRECT, SIP_MEDIA_REMOVABLE, "MX", "MXUB3*", "*"}, /*quirks*/DA_Q_NO_RC16 }, }; static disk_strategy_t dastrategy; static dumper_t dadump; static periph_init_t dainit; static void daasync(void *callback_arg, u_int32_t code, struct cam_path *path, void *arg); static void dasysctlinit(void *context, int pending); static int dasysctlsofttimeout(SYSCTL_HANDLER_ARGS); static int dacmdsizesysctl(SYSCTL_HANDLER_ARGS); static int dadeletemethodsysctl(SYSCTL_HANDLER_ARGS); static int dazonemodesysctl(SYSCTL_HANDLER_ARGS); static int dazonesupsysctl(SYSCTL_HANDLER_ARGS); static int dadeletemaxsysctl(SYSCTL_HANDLER_ARGS); static void dadeletemethodset(struct da_softc *softc, da_delete_methods delete_method); static off_t dadeletemaxsize(struct da_softc *softc, da_delete_methods delete_method); static void dadeletemethodchoose(struct da_softc *softc, da_delete_methods default_method); static void daprobedone(struct cam_periph *periph, union ccb *ccb); static periph_ctor_t daregister; static periph_dtor_t dacleanup; static periph_start_t dastart; static periph_oninv_t daoninvalidate; static void dazonedone(struct cam_periph *periph, union ccb *ccb); static void dadone(struct cam_periph *periph, union ccb *done_ccb); static int daerror(union ccb *ccb, u_int32_t cam_flags, u_int32_t sense_flags); static void daprevent(struct cam_periph *periph, int action); static void dareprobe(struct cam_periph *periph); static void dasetgeom(struct cam_periph *periph, uint32_t block_len, uint64_t maxsector, struct scsi_read_capacity_data_long *rcaplong, size_t rcap_size); static timeout_t dasendorderedtag; static void dashutdown(void *arg, int howto); static timeout_t damediapoll; #ifndef DA_DEFAULT_POLL_PERIOD #define DA_DEFAULT_POLL_PERIOD 3 #endif #ifndef DA_DEFAULT_TIMEOUT #define DA_DEFAULT_TIMEOUT 60 /* Timeout in seconds */ #endif #ifndef 
DA_DEFAULT_SOFTTIMEOUT #define DA_DEFAULT_SOFTTIMEOUT 0 #endif #ifndef DA_DEFAULT_RETRY #define DA_DEFAULT_RETRY 4 #endif #ifndef DA_DEFAULT_SEND_ORDERED #define DA_DEFAULT_SEND_ORDERED 1 #endif static int da_poll_period = DA_DEFAULT_POLL_PERIOD; static int da_retry_count = DA_DEFAULT_RETRY; static int da_default_timeout = DA_DEFAULT_TIMEOUT; static sbintime_t da_default_softtimeout = DA_DEFAULT_SOFTTIMEOUT; static int da_send_ordered = DA_DEFAULT_SEND_ORDERED; static SYSCTL_NODE(_kern_cam, OID_AUTO, da, CTLFLAG_RD, 0, "CAM Direct Access Disk driver"); SYSCTL_INT(_kern_cam_da, OID_AUTO, poll_period, CTLFLAG_RWTUN, &da_poll_period, 0, "Media polling period in seconds"); SYSCTL_INT(_kern_cam_da, OID_AUTO, retry_count, CTLFLAG_RWTUN, &da_retry_count, 0, "Normal I/O retry count"); SYSCTL_INT(_kern_cam_da, OID_AUTO, default_timeout, CTLFLAG_RWTUN, &da_default_timeout, 0, "Normal I/O timeout (in seconds)"); SYSCTL_INT(_kern_cam_da, OID_AUTO, send_ordered, CTLFLAG_RWTUN, &da_send_ordered, 0, "Send Ordered Tags"); SYSCTL_PROC(_kern_cam_da, OID_AUTO, default_softtimeout, CTLTYPE_UINT | CTLFLAG_RW, NULL, 0, dasysctlsofttimeout, "I", "Soft I/O timeout (ms)"); TUNABLE_INT64("kern.cam.da.default_softtimeout", &da_default_softtimeout); /* * DA_ORDEREDTAG_INTERVAL determines how often, relative * to the default timeout, we check to see whether an ordered * tagged transaction is appropriate to prevent simple tag * starvation. Since we'd like to ensure that there is at least * 1/2 of the timeout length left for a starved transaction to * complete after we've sent an ordered tag, we must poll at least * four times in every timeout period. This takes care of the worst * case where a starved transaction starts during an interval that * meets the requirement "don't send an ordered tag" test so it takes * us two intervals to determine that a tag must be sent. 
*/
#ifndef DA_ORDEREDTAG_INTERVAL
#define DA_ORDEREDTAG_INTERVAL 4
#endif

/*
 * Peripheral driver descriptor for "da": the init routine, the driver
 * name, the list head for its units and the generation count.
 */
static struct periph_driver dadriver =
{
	dainit, "da",
	TAILQ_HEAD_INITIALIZER(dadriver.units), /* generation */ 0
};

PERIPHDRIVER_DECLARE(da, dadriver);

/*
 * NOTE(review): the softc allocation and free visible elsewhere in this
 * file use M_DEVBUF rather than this malloc type -- confirm whether
 * M_SCSIDA is actually used.
 */
static MALLOC_DEFINE(M_SCSIDA, "scsi_da", "scsi_da buffers");

/*
 * Disk open method (installed as d_open by daregister()).
 *
 * Takes a reference on the periph, holds it, starts a re-probe and then
 * sleeps on &softc->disk->d_mediasize, the channel daprobedone() wakes
 * up.  On success the pack is marked valid and open and the reference
 * taken here is kept (daclose() releases it); on failure the reference
 * is dropped before returning.
 *
 * Returns 0 on success, ENXIO if the periph cannot be acquired or has
 * been invalidated, or the error from cam_periph_hold() or
 * cam_periph_sleep().
 */
static int
daopen(struct disk *dp)
{
	struct cam_periph *periph;
	struct da_softc *softc;
	int error;

	periph = (struct cam_periph *)dp->d_drv1;
	if (cam_periph_acquire(periph) != CAM_REQ_CMP) {
		return (ENXIO);
	}

	cam_periph_lock(periph);
	if ((error = cam_periph_hold(periph, PRIBIO|PCATCH)) != 0) {
		/* Could not get the hold: undo the lock and the reference. */
		cam_periph_unlock(periph);
		cam_periph_release(periph);
		return (error);
	}

	CAM_DEBUG(periph->path, CAM_DEBUG_TRACE | CAM_DEBUG_PERIPH,
	    ("daopen\n"));

	softc = (struct da_softc *)periph->softc;
	dareprobe(periph);

	/* Wait for the disk size update.  */
	error = cam_periph_sleep(periph, &softc->disk->d_mediasize, PRIBIO,
	    "dareprobe", 0);
	if (error != 0)
		xpt_print(periph->path, "unable to retrieve capacity data\n");

	/* An invalidated periph overrides whatever the sleep returned. */
	if (periph->flags & CAM_PERIPH_INVALID)
		error = ENXIO;

	/* Lock removable media in place unless the device is quirked. */
	if (error == 0 && (softc->flags & DA_FLAG_PACK_REMOVABLE) != 0 &&
	    (softc->quirks & DA_Q_NO_PREVENT) == 0)
		daprevent(periph, PR_PREVENT);

	if (error == 0) {
		softc->flags &= ~DA_FLAG_PACK_INVALID;
		softc->flags |= DA_FLAG_OPEN;
	}

	cam_periph_unhold(periph);
	cam_periph_unlock(periph);

	/* Only a successful open keeps the reference acquired above. */
	if (error != 0)
		cam_periph_release(periph);

	return (error);
}

/*
 * Disk close method (installed as d_close by daregister()).
 *
 * If the periph can be held: issues SYNCHRONIZE CACHE when the pack is
 * dirty, valid and the device is not quirked DA_Q_NO_SYNC_CACHE, then
 * allows medium removal for removable, non-quirked devices.  Afterwards
 * clears DA_FLAG_OPEN, waits for softc->refcount to reach zero and
 * releases the periph reference.
 *
 * Always returns 0; the result of the cache flush is stored in 'error'
 * but not otherwise used.
 */
static int
daclose(struct disk *dp)
{
	struct	cam_periph *periph;
	struct	da_softc *softc;
	union	ccb *ccb;
	int error;

	periph = (struct cam_periph *)dp->d_drv1;
	softc = (struct da_softc *)periph->softc;
	cam_periph_lock(periph);
	CAM_DEBUG(periph->path, CAM_DEBUG_TRACE | CAM_DEBUG_PERIPH,
	    ("daclose\n"));

	if (cam_periph_hold(periph, PRIBIO) == 0) {

		/* Flush disk cache. */
		if ((softc->flags & DA_FLAG_DIRTY) != 0 &&
		    (softc->quirks & DA_Q_NO_SYNC_CACHE) == 0 &&
		    (softc->flags & DA_FLAG_PACK_INVALID) == 0) {
			ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL);
			scsi_synchronize_cache(&ccb->csio, /*retries*/1,
			    /*cbfcnp*/dadone, MSG_SIMPLE_Q_TAG,
			    /*begin_lba*/0, /*lb_count*/0, SSD_FULL_SIZE,
			    5 * 60 * 1000);
			error = cam_periph_runccb(ccb, daerror, /*cam_flags*/0,
			    /*sense_flags*/SF_RETRY_UA | SF_QUIET_IR,
			    softc->disk->d_devstat);
			/* The dirty flag is cleared whatever the outcome. */
			softc->flags &= ~DA_FLAG_DIRTY;
			xpt_release_ccb(ccb);
		}

		/* Allow medium removal. */
		if ((softc->flags & DA_FLAG_PACK_REMOVABLE) != 0 &&
		    (softc->quirks & DA_Q_NO_PREVENT) == 0)
			daprevent(periph, PR_ALLOW);

		cam_periph_unhold(periph);
	}

	/*
	 * If we've got removable media, mark the blocksize as
	 * unavailable, since it could change when new media is
	 * inserted.
	 */
	if ((softc->flags & DA_FLAG_PACK_REMOVABLE) != 0)
		softc->disk->d_devstat->flags |= DEVSTAT_BS_UNAVAILABLE;

	softc->flags &= ~DA_FLAG_OPEN;

	/*
	 * Re-check the reference count after every sleep (which is passed a
	 * timeout argument of 1) until it has dropped to zero.
	 */
	while (softc->refcount != 0)
		cam_periph_sleep(periph, &softc->refcount, PRIBIO, "daclose", 1);
	cam_periph_unlock(periph);
	cam_periph_release(periph);
	return (0);
}

/*
 * Ask the I/O scheduler to schedule this periph, but only once the
 * softc is in DA_STATE_NORMAL; in any other state this is a no-op.
 */
static void
daschedule(struct cam_periph *periph)
{
	struct da_softc *softc = (struct da_softc *)periph->softc;

	if (softc->state != DA_STATE_NORMAL)
		return;

	cam_iosched_schedule(softc->cam_iosched, periph);
}

/*
 * Actually translate the requested transfer into one the physical driver
 * can understand.  The transfer is described by a buf and will include
 * only one physical transfer.
*/ static void dastrategy(struct bio *bp) { struct cam_periph *periph; struct da_softc *softc; periph = (struct cam_periph *)bp->bio_disk->d_drv1; softc = (struct da_softc *)periph->softc; cam_periph_lock(periph); /* * If the device has been made invalid, error out */ if ((softc->flags & DA_FLAG_PACK_INVALID)) { cam_periph_unlock(periph); biofinish(bp, NULL, ENXIO); return; } CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("dastrategy(%p)\n", bp)); /* * Zone commands must be ordered, because they can depend on the * effects of previously issued commands, and they may affect * commands after them. */ if (bp->bio_cmd == BIO_ZONE) bp->bio_flags |= BIO_ORDERED; /* * Place it in the queue of disk activities for this disk */ cam_iosched_queue_work(softc->cam_iosched, bp); /* * Schedule ourselves for performing the work. */ daschedule(periph); cam_periph_unlock(periph); return; } static int dadump(void *arg, void *virtual, vm_offset_t physical, off_t offset, size_t length) { struct cam_periph *periph; struct da_softc *softc; u_int secsize; struct ccb_scsiio csio; struct disk *dp; int error = 0; dp = arg; periph = dp->d_drv1; softc = (struct da_softc *)periph->softc; cam_periph_lock(periph); secsize = softc->params.secsize; if ((softc->flags & DA_FLAG_PACK_INVALID) != 0) { cam_periph_unlock(periph); return (ENXIO); } if (length > 0) { xpt_setup_ccb(&csio.ccb_h, periph->path, CAM_PRIORITY_NORMAL); csio.ccb_h.ccb_state = DA_CCB_DUMP; scsi_read_write(&csio, /*retries*/0, dadone, MSG_ORDERED_Q_TAG, /*read*/SCSI_RW_WRITE, /*byte2*/0, /*minimum_cmd_size*/ softc->minimum_cmd_size, offset / secsize, length / secsize, /*data_ptr*/(u_int8_t *) virtual, /*dxfer_len*/length, /*sense_len*/SSD_FULL_SIZE, da_default_timeout * 1000); xpt_polled_action((union ccb *)&csio); error = cam_periph_error((union ccb *)&csio, 0, SF_NO_RECOVERY | SF_NO_RETRY, NULL); if ((csio.ccb_h.status & CAM_DEV_QFRZN) != 0) cam_release_devq(csio.ccb_h.path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, 
/*getcount_only*/0); if (error != 0) printf("Aborting dump due to I/O error.\n"); cam_periph_unlock(periph); return (error); } /* * Sync the disk cache contents to the physical media. */ if ((softc->quirks & DA_Q_NO_SYNC_CACHE) == 0) { xpt_setup_ccb(&csio.ccb_h, periph->path, CAM_PRIORITY_NORMAL); csio.ccb_h.ccb_state = DA_CCB_DUMP; scsi_synchronize_cache(&csio, /*retries*/0, /*cbfcnp*/dadone, MSG_SIMPLE_Q_TAG, /*begin_lba*/0,/* Cover the whole disk */ /*lb_count*/0, SSD_FULL_SIZE, 5 * 1000); xpt_polled_action((union ccb *)&csio); error = cam_periph_error((union ccb *)&csio, 0, SF_NO_RECOVERY | SF_NO_RETRY | SF_QUIET_IR, NULL); if ((csio.ccb_h.status & CAM_DEV_QFRZN) != 0) cam_release_devq(csio.ccb_h.path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); if (error != 0) xpt_print(periph->path, "Synchronize cache failed\n"); } cam_periph_unlock(periph); return (error); } static int dagetattr(struct bio *bp) { int ret; struct cam_periph *periph; periph = (struct cam_periph *)bp->bio_disk->d_drv1; cam_periph_lock(periph); ret = xpt_getattr(bp->bio_data, bp->bio_length, bp->bio_attribute, periph->path); cam_periph_unlock(periph); if (ret == 0) bp->bio_completed = bp->bio_length; return ret; } static void dainit(void) { cam_status status; /* * Install a global async callback. This callback will * receive async callbacks like "new device found". */ status = xpt_register_async(AC_FOUND_DEVICE, daasync, NULL, NULL); if (status != CAM_REQ_CMP) { printf("da: Failed to attach master async callback " "due to status 0x%x!\n", status); } else if (da_send_ordered) { /* Register our shutdown event handler */ if ((EVENTHANDLER_REGISTER(shutdown_post_sync, dashutdown, NULL, SHUTDOWN_PRI_DEFAULT)) == NULL) printf("dainit: shutdown event registration failed!\n"); } } /* * Callback from GEOM, called when it has finished cleaning up its * resources. 
*/ static void dadiskgonecb(struct disk *dp) { struct cam_periph *periph; periph = (struct cam_periph *)dp->d_drv1; cam_periph_release(periph); } static void daoninvalidate(struct cam_periph *periph) { struct da_softc *softc; softc = (struct da_softc *)periph->softc; /* * De-register any async callbacks. */ xpt_register_async(0, daasync, periph, periph->path); softc->flags |= DA_FLAG_PACK_INVALID; #ifdef CAM_IO_STATS softc->invalidations++; #endif /* * Return all queued I/O with ENXIO. * XXX Handle any transactions queued to the card * with XPT_ABORT_CCB. */ cam_iosched_flush(softc->cam_iosched, NULL, ENXIO); /* * Tell GEOM that we've gone away, we'll get a callback when it is * done cleaning up its resources. */ disk_gone(softc->disk); } static void dacleanup(struct cam_periph *periph) { struct da_softc *softc; softc = (struct da_softc *)periph->softc; cam_periph_unlock(periph); cam_iosched_fini(softc->cam_iosched); /* * If we can't free the sysctl tree, oh well... */ if ((softc->flags & DA_FLAG_SCTX_INIT) != 0) { #ifdef CAM_IO_STATS if (sysctl_ctx_free(&softc->sysctl_stats_ctx) != 0) xpt_print(periph->path, "can't remove sysctl stats context\n"); #endif if (sysctl_ctx_free(&softc->sysctl_ctx) != 0) xpt_print(periph->path, "can't remove sysctl context\n"); } callout_drain(&softc->mediapoll_c); disk_destroy(softc->disk); callout_drain(&softc->sendordered_c); free(softc, M_DEVBUF); cam_periph_lock(periph); } static void daasync(void *callback_arg, u_int32_t code, struct cam_path *path, void *arg) { struct cam_periph *periph; struct da_softc *softc; periph = (struct cam_periph *)callback_arg; switch (code) { case AC_FOUND_DEVICE: { struct ccb_getdev *cgd; cam_status status; cgd = (struct ccb_getdev *)arg; if (cgd == NULL) break; if (cgd->protocol != PROTO_SCSI) break; if (SID_QUAL(&cgd->inq_data) != SID_QUAL_LU_CONNECTED) break; if (SID_TYPE(&cgd->inq_data) != T_DIRECT && SID_TYPE(&cgd->inq_data) != T_RBC && SID_TYPE(&cgd->inq_data) != T_OPTICAL && 
SID_TYPE(&cgd->inq_data) != T_ZBC_HM) break; /* * Allocate a peripheral instance for * this device and start the probe * process. */ status = cam_periph_alloc(daregister, daoninvalidate, dacleanup, dastart, "da", CAM_PERIPH_BIO, path, daasync, AC_FOUND_DEVICE, cgd); if (status != CAM_REQ_CMP && status != CAM_REQ_INPROG) printf("daasync: Unable to attach to new device " "due to status 0x%x\n", status); return; } case AC_ADVINFO_CHANGED: { uintptr_t buftype; buftype = (uintptr_t)arg; if (buftype == CDAI_TYPE_PHYS_PATH) { struct da_softc *softc; softc = periph->softc; disk_attr_changed(softc->disk, "GEOM::physpath", M_NOWAIT); } break; } case AC_UNIT_ATTENTION: { union ccb *ccb; int error_code, sense_key, asc, ascq; softc = (struct da_softc *)periph->softc; ccb = (union ccb *)arg; /* * Handle all UNIT ATTENTIONs except our own, * as they will be handled by daerror(). */ if (xpt_path_periph(ccb->ccb_h.path) != periph && scsi_extract_sense_ccb(ccb, &error_code, &sense_key, &asc, &ascq)) { if (asc == 0x2A && ascq == 0x09) { xpt_print(ccb->ccb_h.path, "Capacity data has changed\n"); softc->flags &= ~DA_FLAG_PROBED; dareprobe(periph); } else if (asc == 0x28 && ascq == 0x00) { softc->flags &= ~DA_FLAG_PROBED; disk_media_changed(softc->disk, M_NOWAIT); } else if (asc == 0x3F && ascq == 0x03) { xpt_print(ccb->ccb_h.path, "INQUIRY data has changed\n"); softc->flags &= ~DA_FLAG_PROBED; dareprobe(periph); } } cam_periph_async(periph, code, path, arg); break; } case AC_SCSI_AEN: softc = (struct da_softc *)periph->softc; if (!cam_iosched_has_work_flags(softc->cam_iosched, DA_WORK_TUR)) { if (cam_periph_acquire(periph) == CAM_REQ_CMP) { cam_iosched_set_work_flags(softc->cam_iosched, DA_WORK_TUR); daschedule(periph); } } /* FALLTHROUGH */ case AC_SENT_BDR: case AC_BUS_RESET: { struct ccb_hdr *ccbh; softc = (struct da_softc *)periph->softc; /* * Don't fail on the expected unit attention * that will occur. 
*/ softc->flags |= DA_FLAG_RETRY_UA; LIST_FOREACH(ccbh, &softc->pending_ccbs, periph_links.le) ccbh->ccb_state |= DA_CCB_RETRY_UA; break; } case AC_INQ_CHANGED: softc = (struct da_softc *)periph->softc; softc->flags &= ~DA_FLAG_PROBED; dareprobe(periph); break; default: break; } cam_periph_async(periph, code, path, arg); } static void dasysctlinit(void *context, int pending) { struct cam_periph *periph; struct da_softc *softc; char tmpstr[80], tmpstr2[80]; struct ccb_trans_settings cts; periph = (struct cam_periph *)context; /* * periph was held for us when this task was enqueued */ if (periph->flags & CAM_PERIPH_INVALID) { cam_periph_release(periph); return; } softc = (struct da_softc *)periph->softc; snprintf(tmpstr, sizeof(tmpstr), "CAM DA unit %d", periph->unit_number); snprintf(tmpstr2, sizeof(tmpstr2), "%d", periph->unit_number); sysctl_ctx_init(&softc->sysctl_ctx); softc->flags |= DA_FLAG_SCTX_INIT; softc->sysctl_tree = SYSCTL_ADD_NODE(&softc->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_kern_cam_da), OID_AUTO, tmpstr2, CTLFLAG_RD, 0, tmpstr); if (softc->sysctl_tree == NULL) { printf("dasysctlinit: unable to allocate sysctl tree\n"); cam_periph_release(periph); return; } /* * Now register the sysctl handler, so the user can change the value on * the fly. 
*/ SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "delete_method", CTLTYPE_STRING | CTLFLAG_RWTUN, softc, 0, dadeletemethodsysctl, "A", "BIO_DELETE execution method"); SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "delete_max", CTLTYPE_U64 | CTLFLAG_RW, softc, 0, dadeletemaxsysctl, "Q", "Maximum BIO_DELETE size"); SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "minimum_cmd_size", CTLTYPE_INT | CTLFLAG_RW, &softc->minimum_cmd_size, 0, dacmdsizesysctl, "I", "Minimum CDB size"); SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "zone_mode", CTLTYPE_STRING | CTLFLAG_RD, softc, 0, dazonemodesysctl, "A", "Zone Mode"); SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "zone_support", CTLTYPE_STRING | CTLFLAG_RD, softc, 0, dazonesupsysctl, "A", "Zone Support"); SYSCTL_ADD_UQUAD(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "optimal_seq_zones", CTLFLAG_RD, &softc->optimal_seq_zones, "Optimal Number of Open Sequential Write Preferred Zones"); SYSCTL_ADD_UQUAD(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "optimal_nonseq_zones", CTLFLAG_RD, &softc->optimal_nonseq_zones, "Optimal Number of Non-Sequentially Written Sequential Write " "Preferred Zones"); SYSCTL_ADD_UQUAD(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "max_seq_zones", CTLFLAG_RD, &softc->max_seq_zones, "Maximum Number of Open Sequential Write Required Zones"); SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "error_inject", CTLFLAG_RW, &softc->error_inject, 0, "error_inject leaf"); SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "unmapped_io", CTLFLAG_RD, &softc->unmappedio, 0, "Unmapped I/O leaf"); SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "rotating", CTLFLAG_RD, 
&softc->rotating, 0, "Rotating media"); /* * Add some addressing info. */ memset(&cts, 0, sizeof (cts)); xpt_setup_ccb(&cts.ccb_h, periph->path, CAM_PRIORITY_NONE); cts.ccb_h.func_code = XPT_GET_TRAN_SETTINGS; cts.type = CTS_TYPE_CURRENT_SETTINGS; cam_periph_lock(periph); xpt_action((union ccb *)&cts); cam_periph_unlock(periph); if (cts.ccb_h.status != CAM_REQ_CMP) { cam_periph_release(periph); return; } if (cts.protocol == PROTO_SCSI && cts.transport == XPORT_FC) { struct ccb_trans_settings_fc *fc = &cts.xport_specific.fc; if (fc->valid & CTS_FC_VALID_WWPN) { softc->wwpn = fc->wwpn; SYSCTL_ADD_UQUAD(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "wwpn", CTLFLAG_RD, &softc->wwpn, "World Wide Port Name"); } } #ifdef CAM_IO_STATS /* * Now add some useful stats. * XXX These should live in cam_periph and be common to all periphs */ softc->sysctl_stats_tree = SYSCTL_ADD_NODE(&softc->sysctl_stats_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "stats", CTLFLAG_RD, 0, "Statistics"); SYSCTL_ADD_INT(&softc->sysctl_stats_ctx, SYSCTL_CHILDREN(softc->sysctl_stats_tree), OID_AUTO, "errors", CTLFLAG_RD, &softc->errors, 0, "Transport errors reported by the SIM"); SYSCTL_ADD_INT(&softc->sysctl_stats_ctx, SYSCTL_CHILDREN(softc->sysctl_stats_tree), OID_AUTO, "timeouts", CTLFLAG_RD, &softc->timeouts, 0, "Device timeouts reported by the SIM"); SYSCTL_ADD_INT(&softc->sysctl_stats_ctx, SYSCTL_CHILDREN(softc->sysctl_stats_tree), OID_AUTO, "pack_invalidations", CTLFLAG_RD, &softc->invalidations, 0, "Device pack invalidations"); #endif cam_iosched_sysctl_init(softc->cam_iosched, &softc->sysctl_ctx, softc->sysctl_tree); cam_periph_release(periph); } static int dadeletemaxsysctl(SYSCTL_HANDLER_ARGS) { int error; uint64_t value; struct da_softc *softc; softc = (struct da_softc *)arg1; value = softc->disk->d_delmaxsize; error = sysctl_handle_64(oidp, &value, 0, req); if ((error != 0) || (req->newptr == NULL)) return (error); /* only accept values smaller than the 
calculated value */ if (value > dadeletemaxsize(softc, softc->delete_method)) { return (EINVAL); } softc->disk->d_delmaxsize = value; return (0); } static int dacmdsizesysctl(SYSCTL_HANDLER_ARGS) { int error, value; value = *(int *)arg1; error = sysctl_handle_int(oidp, &value, 0, req); if ((error != 0) || (req->newptr == NULL)) return (error); /* * Acceptable values here are 6, 10, 12 or 16. */ if (value < 6) value = 6; else if ((value > 6) && (value <= 10)) value = 10; else if ((value > 10) && (value <= 12)) value = 12; else if (value > 12) value = 16; *(int *)arg1 = value; return (0); } static int dasysctlsofttimeout(SYSCTL_HANDLER_ARGS) { sbintime_t value; int error; value = da_default_softtimeout / SBT_1MS; error = sysctl_handle_int(oidp, (int *)&value, 0, req); if ((error != 0) || (req->newptr == NULL)) return (error); /* XXX Should clip this to a reasonable level */ if (value > da_default_timeout * 1000) return (EINVAL); da_default_softtimeout = value * SBT_1MS; return (0); } static void dadeletemethodset(struct da_softc *softc, da_delete_methods delete_method) { softc->delete_method = delete_method; softc->disk->d_delmaxsize = dadeletemaxsize(softc, delete_method); softc->delete_func = da_delete_functions[delete_method]; if (softc->delete_method > DA_DELETE_DISABLE) softc->disk->d_flags |= DISKFLAG_CANDELETE; else softc->disk->d_flags &= ~DISKFLAG_CANDELETE; } static off_t dadeletemaxsize(struct da_softc *softc, da_delete_methods delete_method) { off_t sectors; switch(delete_method) { case DA_DELETE_UNMAP: sectors = (off_t)softc->unmap_max_lba; break; case DA_DELETE_ATA_TRIM: sectors = (off_t)ATA_DSM_RANGE_MAX * softc->trim_max_ranges; break; case DA_DELETE_WS16: sectors = omin(softc->ws_max_blks, WS16_MAX_BLKS); break; case DA_DELETE_ZERO: case DA_DELETE_WS10: sectors = omin(softc->ws_max_blks, WS10_MAX_BLKS); break; default: return 0; } return (off_t)softc->params.secsize * omin(sectors, softc->params.sectors); } static void daprobedone(struct cam_periph 
*periph, union ccb *ccb) { struct da_softc *softc; softc = (struct da_softc *)periph->softc; dadeletemethodchoose(softc, DA_DELETE_NONE); if (bootverbose && (softc->flags & DA_FLAG_ANNOUNCED) == 0) { char buf[80]; int i, sep; snprintf(buf, sizeof(buf), "Delete methods: <"); sep = 0; for (i = 0; i <= DA_DELETE_MAX; i++) { if ((softc->delete_available & (1 << i)) == 0 && i != softc->delete_method) continue; if (sep) strlcat(buf, ",", sizeof(buf)); strlcat(buf, da_delete_method_names[i], sizeof(buf)); if (i == softc->delete_method) strlcat(buf, "(*)", sizeof(buf)); sep = 1; } strlcat(buf, ">", sizeof(buf)); printf("%s%d: %s\n", periph->periph_name, periph->unit_number, buf); } /* * Since our peripheral may be invalidated by an error * above or an external event, we must release our CCB * before releasing the probe lock on the peripheral. * The peripheral will only go away once the last lock * is removed, and we need it around for the CCB release * operation. */ xpt_release_ccb(ccb); softc->state = DA_STATE_NORMAL; softc->flags |= DA_FLAG_PROBED; daschedule(periph); wakeup(&softc->disk->d_mediasize); if ((softc->flags & DA_FLAG_ANNOUNCED) == 0) { softc->flags |= DA_FLAG_ANNOUNCED; cam_periph_unhold(periph); } else cam_periph_release_locked(periph); } static void dadeletemethodchoose(struct da_softc *softc, da_delete_methods default_method) { int i, methods; /* If available, prefer the method requested by user. */ i = softc->delete_method_pref; methods = softc->delete_available | (1 << DA_DELETE_DISABLE); if (methods & (1 << i)) { dadeletemethodset(softc, i); return; } /* Use the pre-defined order to choose the best performing delete. */ for (i = DA_DELETE_MIN; i <= DA_DELETE_MAX; i++) { if (i == DA_DELETE_ZERO) continue; if (softc->delete_available & (1 << i)) { dadeletemethodset(softc, i); return; } } /* Fallback to default. 
*/ dadeletemethodset(softc, default_method); } static int dadeletemethodsysctl(SYSCTL_HANDLER_ARGS) { char buf[16]; const char *p; struct da_softc *softc; int i, error, methods, value; softc = (struct da_softc *)arg1; value = softc->delete_method; if (value < 0 || value > DA_DELETE_MAX) p = "UNKNOWN"; else p = da_delete_method_names[value]; strncpy(buf, p, sizeof(buf)); error = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (error != 0 || req->newptr == NULL) return (error); methods = softc->delete_available | (1 << DA_DELETE_DISABLE); for (i = 0; i <= DA_DELETE_MAX; i++) { if (strcmp(buf, da_delete_method_names[i]) == 0) break; } if (i > DA_DELETE_MAX) return (EINVAL); softc->delete_method_pref = i; dadeletemethodchoose(softc, DA_DELETE_NONE); return (0); } static int dazonemodesysctl(SYSCTL_HANDLER_ARGS) { char tmpbuf[40]; struct da_softc *softc; int error; softc = (struct da_softc *)arg1; switch (softc->zone_mode) { case DA_ZONE_DRIVE_MANAGED: snprintf(tmpbuf, sizeof(tmpbuf), "Drive Managed"); break; case DA_ZONE_HOST_AWARE: snprintf(tmpbuf, sizeof(tmpbuf), "Host Aware"); break; case DA_ZONE_HOST_MANAGED: snprintf(tmpbuf, sizeof(tmpbuf), "Host Managed"); break; case DA_ZONE_NONE: default: snprintf(tmpbuf, sizeof(tmpbuf), "Not Zoned"); break; } error = sysctl_handle_string(oidp, tmpbuf, sizeof(tmpbuf), req); return (error); } static int dazonesupsysctl(SYSCTL_HANDLER_ARGS) { char tmpbuf[180]; struct da_softc *softc; struct sbuf sb; int error, first; unsigned int i; softc = (struct da_softc *)arg1; error = 0; first = 1; sbuf_new(&sb, tmpbuf, sizeof(tmpbuf), 0); for (i = 0; i < sizeof(da_zone_desc_table) / sizeof(da_zone_desc_table[0]); i++) { if (softc->zone_flags & da_zone_desc_table[i].value) { if (first == 0) sbuf_printf(&sb, ", "); else first = 0; sbuf_cat(&sb, da_zone_desc_table[i].desc); } } if (first == 1) sbuf_printf(&sb, "None"); sbuf_finish(&sb); error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req); return (error); } static 
cam_status daregister(struct cam_periph *periph, void *arg) { struct da_softc *softc; struct ccb_pathinq cpi; struct ccb_getdev *cgd; char tmpstr[80]; caddr_t match; cgd = (struct ccb_getdev *)arg; if (cgd == NULL) { printf("daregister: no getdev CCB, can't register device\n"); return(CAM_REQ_CMP_ERR); } softc = (struct da_softc *)malloc(sizeof(*softc), M_DEVBUF, M_NOWAIT|M_ZERO); if (softc == NULL) { printf("daregister: Unable to probe new device. " "Unable to allocate softc\n"); return(CAM_REQ_CMP_ERR); } if (cam_iosched_init(&softc->cam_iosched, periph) != 0) { printf("daregister: Unable to probe new device. " "Unable to allocate iosched memory\n"); free(softc, M_DEVBUF); return(CAM_REQ_CMP_ERR); } LIST_INIT(&softc->pending_ccbs); softc->state = DA_STATE_PROBE_RC; bioq_init(&softc->delete_run_queue); if (SID_IS_REMOVABLE(&cgd->inq_data)) softc->flags |= DA_FLAG_PACK_REMOVABLE; softc->unmap_max_ranges = UNMAP_MAX_RANGES; softc->unmap_max_lba = UNMAP_RANGE_MAX; softc->ws_max_blks = WS16_MAX_BLKS; softc->trim_max_ranges = ATA_TRIM_MAX_RANGES; softc->rotating = 1; periph->softc = softc; /* * See if this device has any quirks. 
*/ match = cam_quirkmatch((caddr_t)&cgd->inq_data, (caddr_t)da_quirk_table, nitems(da_quirk_table), sizeof(*da_quirk_table), scsi_inquiry_match); if (match != NULL) softc->quirks = ((struct da_quirk_entry *)match)->quirks; else softc->quirks = DA_Q_NONE; /* Check if the SIM does not want 6 byte commands */ bzero(&cpi, sizeof(cpi)); xpt_setup_ccb(&cpi.ccb_h, periph->path, CAM_PRIORITY_NORMAL); cpi.ccb_h.func_code = XPT_PATH_INQ; xpt_action((union ccb *)&cpi); if (cpi.ccb_h.status == CAM_REQ_CMP && (cpi.hba_misc & PIM_NO_6_BYTE)) softc->quirks |= DA_Q_NO_6_BYTE; if (SID_TYPE(&cgd->inq_data) == T_ZBC_HM) softc->zone_mode = DA_ZONE_HOST_MANAGED; else if (softc->quirks & DA_Q_SMR_DM) softc->zone_mode = DA_ZONE_DRIVE_MANAGED; else softc->zone_mode = DA_ZONE_NONE; if (softc->zone_mode != DA_ZONE_NONE) { if (scsi_vpd_supported_page(periph, SVPD_ATA_INFORMATION)) { if (scsi_vpd_supported_page(periph, SVPD_ZONED_BDC)) softc->zone_interface = DA_ZONE_IF_ATA_SAT; else softc->zone_interface = DA_ZONE_IF_ATA_PASS; } else softc->zone_interface = DA_ZONE_IF_SCSI; } TASK_INIT(&softc->sysctl_task, 0, dasysctlinit, periph); /* * Take an exclusive refcount on the periph while dastart is called * to finish the probe. The reference will be dropped in dadone at * the end of probe. */ (void)cam_periph_hold(periph, PRIBIO); /* * Schedule a periodic event to occasionally send an * ordered tag to a device. */ callout_init_mtx(&softc->sendordered_c, cam_periph_mtx(periph), 0); callout_reset(&softc->sendordered_c, (da_default_timeout * hz) / DA_ORDEREDTAG_INTERVAL, dasendorderedtag, softc); cam_periph_unlock(periph); /* * RBC devices don't have to support READ(6), only READ(10). */ if (softc->quirks & DA_Q_NO_6_BYTE || SID_TYPE(&cgd->inq_data) == T_RBC) softc->minimum_cmd_size = 10; else softc->minimum_cmd_size = 6; /* * Load the user's default, if any. 
*/ snprintf(tmpstr, sizeof(tmpstr), "kern.cam.da.%d.minimum_cmd_size", periph->unit_number); TUNABLE_INT_FETCH(tmpstr, &softc->minimum_cmd_size); /* * 6, 10, 12 and 16 are the currently permissible values. */ if (softc->minimum_cmd_size < 6) softc->minimum_cmd_size = 6; else if ((softc->minimum_cmd_size > 6) && (softc->minimum_cmd_size <= 10)) softc->minimum_cmd_size = 10; else if ((softc->minimum_cmd_size > 10) && (softc->minimum_cmd_size <= 12)) softc->minimum_cmd_size = 12; else if (softc->minimum_cmd_size > 12) softc->minimum_cmd_size = 16; /* Predict whether device may support READ CAPACITY(16). */ if (SID_ANSI_REV(&cgd->inq_data) >= SCSI_REV_SPC3 && (softc->quirks & DA_Q_NO_RC16) == 0) { softc->flags |= DA_FLAG_CAN_RC16; softc->state = DA_STATE_PROBE_RC16; } /* * Register this media as a disk. */ softc->disk = disk_alloc(); softc->disk->d_devstat = devstat_new_entry(periph->periph_name, periph->unit_number, 0, DEVSTAT_BS_UNAVAILABLE, SID_TYPE(&cgd->inq_data) | XPORT_DEVSTAT_TYPE(cpi.transport), DEVSTAT_PRIORITY_DISK); softc->disk->d_open = daopen; softc->disk->d_close = daclose; softc->disk->d_strategy = dastrategy; softc->disk->d_dump = dadump; softc->disk->d_getattr = dagetattr; softc->disk->d_gone = dadiskgonecb; softc->disk->d_name = "da"; softc->disk->d_drv1 = periph; if (cpi.maxio == 0) softc->maxio = DFLTPHYS; /* traditional default */ else if (cpi.maxio > MAXPHYS) softc->maxio = MAXPHYS; /* for safety */ else softc->maxio = cpi.maxio; softc->disk->d_maxsize = softc->maxio; softc->disk->d_unit = periph->unit_number; softc->disk->d_flags = DISKFLAG_DIRECT_COMPLETION | DISKFLAG_CANZONE; if ((softc->quirks & DA_Q_NO_SYNC_CACHE) == 0) softc->disk->d_flags |= DISKFLAG_CANFLUSHCACHE; if ((cpi.hba_misc & PIM_UNMAPPED) != 0) { softc->unmappedio = 1; softc->disk->d_flags |= DISKFLAG_UNMAPPED_BIO; xpt_print(periph->path, "UNMAPPED\n"); } cam_strvis(softc->disk->d_descr, cgd->inq_data.vendor, sizeof(cgd->inq_data.vendor), sizeof(softc->disk->d_descr)); 
strlcat(softc->disk->d_descr, " ", sizeof(softc->disk->d_descr)); cam_strvis(&softc->disk->d_descr[strlen(softc->disk->d_descr)], cgd->inq_data.product, sizeof(cgd->inq_data.product), sizeof(softc->disk->d_descr) - strlen(softc->disk->d_descr)); softc->disk->d_hba_vendor = cpi.hba_vendor; softc->disk->d_hba_device = cpi.hba_device; softc->disk->d_hba_subvendor = cpi.hba_subvendor; softc->disk->d_hba_subdevice = cpi.hba_subdevice; /* * Acquire a reference to the periph before we register with GEOM. * We'll release this reference once GEOM calls us back (via * dadiskgonecb()) telling us that our provider has been freed. */ if (cam_periph_acquire(periph) != CAM_REQ_CMP) { xpt_print(periph->path, "%s: lost periph during " "registration!\n", __func__); cam_periph_lock(periph); return (CAM_REQ_CMP_ERR); } disk_create(softc->disk, DISK_VERSION); cam_periph_lock(periph); /* * Add async callbacks for events of interest. * I don't bother checking if this fails as, * in most cases, the system will function just * fine without them and the only alternative * would be to not attach the device on failure. */ xpt_register_async(AC_SENT_BDR | AC_BUS_RESET | AC_LOST_DEVICE | AC_ADVINFO_CHANGED | AC_SCSI_AEN | AC_UNIT_ATTENTION | AC_INQ_CHANGED, daasync, periph, periph->path); /* * Emit an attribute changed notification just in case * physical path information arrived before our async * event handler was registered, but after anyone attaching * to our disk device polled it. */ disk_attr_changed(softc->disk, "GEOM::physpath", M_NOWAIT); /* * Schedule a periodic media polling events. 
*/ callout_init_mtx(&softc->mediapoll_c, cam_periph_mtx(periph), 0); if ((softc->flags & DA_FLAG_PACK_REMOVABLE) && (cgd->inq_flags & SID_AEN) == 0 && da_poll_period != 0) callout_reset(&softc->mediapoll_c, da_poll_period * hz, damediapoll, periph); xpt_schedule(periph, CAM_PRIORITY_DEV); return(CAM_REQ_CMP); } static int da_zone_bio_to_scsi(int disk_zone_cmd) { switch (disk_zone_cmd) { case DISK_ZONE_OPEN: return ZBC_OUT_SA_OPEN; case DISK_ZONE_CLOSE: return ZBC_OUT_SA_CLOSE; case DISK_ZONE_FINISH: return ZBC_OUT_SA_FINISH; case DISK_ZONE_RWP: return ZBC_OUT_SA_RWP; } return -1; } static int da_zone_cmd(struct cam_periph *periph, union ccb *ccb, struct bio *bp, int *queue_ccb) { struct da_softc *softc; int error; error = 0; if (bp->bio_cmd != BIO_ZONE) { error = EINVAL; goto bailout; } softc = periph->softc; switch (bp->bio_zone.zone_cmd) { case DISK_ZONE_OPEN: case DISK_ZONE_CLOSE: case DISK_ZONE_FINISH: case DISK_ZONE_RWP: { int zone_flags; int zone_sa; uint64_t lba; zone_sa = da_zone_bio_to_scsi(bp->bio_zone.zone_cmd); if (zone_sa == -1) { xpt_print(periph->path, "Cannot translate zone " "cmd %#x to SCSI\n", bp->bio_zone.zone_cmd); error = EINVAL; goto bailout; } zone_flags = 0; lba = bp->bio_zone.zone_params.rwp.id; if (bp->bio_zone.zone_params.rwp.flags & DISK_ZONE_RWP_FLAG_ALL) zone_flags |= ZBC_OUT_ALL; if (softc->zone_interface != DA_ZONE_IF_ATA_PASS) { scsi_zbc_out(&ccb->csio, /*retries*/ da_retry_count, /*cbfcnp*/ dadone, /*tag_action*/ MSG_SIMPLE_Q_TAG, /*service_action*/ zone_sa, /*zone_id*/ lba, /*zone_flags*/ zone_flags, /*data_ptr*/ NULL, /*dxfer_len*/ 0, /*sense_len*/ SSD_FULL_SIZE, /*timeout*/ da_default_timeout * 1000); } else { /* * Note that in this case, even though we can * technically use NCQ, we don't bother for several * reasons: * 1. It hasn't been tested on a SAT layer that * supports it. This is new as of SAT-4. * 2. 
Even when there is a SAT layer that supports * it, that SAT layer will also probably support * ZBC -> ZAC translation, since they are both * in the SAT-4 spec. * 3. Translation will likely be preferable to ATA * passthrough. LSI / Avago at least single * steps ATA passthrough commands in the HBA, * regardless of protocol, so unless that * changes, there is a performance penalty for * doing ATA passthrough no matter whether * you're using NCQ/FPDMA, DMA or PIO. * 4. It requires a 32-byte CDB, which at least at * this point in CAM requires a CDB pointer, which * would require us to allocate an additional bit * of storage separate from the CCB. */ error = scsi_ata_zac_mgmt_out(&ccb->csio, /*retries*/ da_retry_count, /*cbfcnp*/ dadone, /*tag_action*/ MSG_SIMPLE_Q_TAG, /*use_ncq*/ 0, /*zm_action*/ zone_sa, /*zone_id*/ lba, /*zone_flags*/ zone_flags, /*data_ptr*/ NULL, /*dxfer_len*/ 0, /*cdb_storage*/ NULL, /*cdb_storage_len*/ 0, /*sense_len*/ SSD_FULL_SIZE, /*timeout*/ da_default_timeout * 1000); if (error != 0) { error = EINVAL; xpt_print(periph->path, "scsi_ata_zac_mgmt_out() returned an " "error!"); goto bailout; } } *queue_ccb = 1; break; } case DISK_ZONE_REPORT_ZONES: { uint8_t *rz_ptr; uint32_t num_entries, alloc_size; struct disk_zone_report *rep; rep = &bp->bio_zone.zone_params.report; num_entries = rep->entries_allocated; if (num_entries == 0) { xpt_print(periph->path, "No entries allocated for " "Report Zones request\n"); error = EINVAL; goto bailout; } alloc_size = sizeof(struct scsi_report_zones_hdr) + (sizeof(struct scsi_report_zones_desc) * num_entries); alloc_size = min(alloc_size, softc->disk->d_maxsize); rz_ptr = malloc(alloc_size, M_SCSIDA, M_NOWAIT | M_ZERO); if (rz_ptr == NULL) { xpt_print(periph->path, "Unable to allocate memory " "for Report Zones request\n"); error = ENOMEM; goto bailout; } if (softc->zone_interface != DA_ZONE_IF_ATA_PASS) { scsi_zbc_in(&ccb->csio, /*retries*/ da_retry_count, /*cbcfnp*/ dadone, /*tag_action*/ MSG_SIMPLE_Q_TAG, 
/*service_action*/ ZBC_IN_SA_REPORT_ZONES, /*zone_start_lba*/ rep->starting_id, /*zone_options*/ rep->rep_options, /*data_ptr*/ rz_ptr, /*dxfer_len*/ alloc_size, /*sense_len*/ SSD_FULL_SIZE, /*timeout*/ da_default_timeout * 1000); } else { /* * Note that in this case, even though we can * technically use NCQ, we don't bother for several * reasons: * 1. It hasn't been tested on a SAT layer that * supports it. This is new as of SAT-4. * 2. Even when there is a SAT layer that supports * it, that SAT layer will also probably support * ZBC -> ZAC translation, since they are both * in the SAT-4 spec. * 3. Translation will likely be preferable to ATA * passthrough. LSI / Avago at least single * steps ATA passthrough commands in the HBA, * regardless of protocol, so unless that * changes, there is a performance penalty for * doing ATA passthrough no matter whether * you're using NCQ/FPDMA, DMA or PIO. * 4. It requires a 32-byte CDB, which at least at * this point in CAM requires a CDB pointer, which * would require us to allocate an additional bit * of storage separate from the CCB. */ error = scsi_ata_zac_mgmt_in(&ccb->csio, /*retries*/ da_retry_count, /*cbcfnp*/ dadone, /*tag_action*/ MSG_SIMPLE_Q_TAG, /*use_ncq*/ 0, /*zm_action*/ ATA_ZM_REPORT_ZONES, /*zone_id*/ rep->starting_id, /*zone_flags*/ rep->rep_options, /*data_ptr*/ rz_ptr, /*dxfer_len*/ alloc_size, /*cdb_storage*/ NULL, /*cdb_storage_len*/ 0, /*sense_len*/ SSD_FULL_SIZE, /*timeout*/ da_default_timeout * 1000); if (error != 0) { error = EINVAL; xpt_print(periph->path, "scsi_ata_zac_mgmt_in() returned an " "error!"); goto bailout; } } /* * For BIO_ZONE, this isn't normally needed. However, it * is used by devstat_end_transaction_bio() to determine * how much data was transferred. */ /* * XXX KDM we have a problem. But I'm not sure how to fix * it. devstat uses bio_bcount - bio_resid to calculate * the amount of data transferred. 
The GEOM disk code * uses bio_length - bio_resid to calculate the amount of * data in bio_completed. We have different structure * sizes above and below the ada(4) driver. So, if we * use the sizes above, the amount transferred won't be * quite accurate for devstat. If we use different sizes * for bio_bcount and bio_length (above and below * respectively), then the residual needs to match one or * the other. Everything is calculated after the bio * leaves the driver, so changing the values around isn't * really an option. For now, just set the count to the * passed in length. This means that the calculations * above (e.g. bio_completed) will be correct, but the * amount of data reported to devstat will be slightly * under or overstated. */ bp->bio_bcount = bp->bio_length; *queue_ccb = 1; break; } case DISK_ZONE_GET_PARAMS: { struct disk_zone_disk_params *params; params = &bp->bio_zone.zone_params.disk_params; bzero(params, sizeof(*params)); switch (softc->zone_mode) { case DA_ZONE_DRIVE_MANAGED: params->zone_mode = DISK_ZONE_MODE_DRIVE_MANAGED; break; case DA_ZONE_HOST_AWARE: params->zone_mode = DISK_ZONE_MODE_HOST_AWARE; break; case DA_ZONE_HOST_MANAGED: params->zone_mode = DISK_ZONE_MODE_HOST_MANAGED; break; default: case DA_ZONE_NONE: params->zone_mode = DISK_ZONE_MODE_NONE; break; } if (softc->zone_flags & DA_ZONE_FLAG_URSWRZ) params->flags |= DISK_ZONE_DISK_URSWRZ; if (softc->zone_flags & DA_ZONE_FLAG_OPT_SEQ_SET) { params->optimal_seq_zones = softc->optimal_seq_zones; params->flags |= DISK_ZONE_OPT_SEQ_SET; } if (softc->zone_flags & DA_ZONE_FLAG_OPT_NONSEQ_SET) { params->optimal_nonseq_zones = softc->optimal_nonseq_zones; params->flags |= DISK_ZONE_OPT_NONSEQ_SET; } if (softc->zone_flags & DA_ZONE_FLAG_MAX_SEQ_SET) { params->max_seq_zones = softc->max_seq_zones; params->flags |= DISK_ZONE_MAX_SEQ_SET; } if (softc->zone_flags & DA_ZONE_FLAG_RZ_SUP) params->flags |= DISK_ZONE_RZ_SUP; if (softc->zone_flags & DA_ZONE_FLAG_OPEN_SUP) params->flags |= 
DISK_ZONE_OPEN_SUP; if (softc->zone_flags & DA_ZONE_FLAG_CLOSE_SUP) params->flags |= DISK_ZONE_CLOSE_SUP; if (softc->zone_flags & DA_ZONE_FLAG_FINISH_SUP) params->flags |= DISK_ZONE_FINISH_SUP; if (softc->zone_flags & DA_ZONE_FLAG_RWP_SUP) params->flags |= DISK_ZONE_RWP_SUP; break; } default: break; } bailout: return (error); } static void dastart(struct cam_periph *periph, union ccb *start_ccb) { struct da_softc *softc; softc = (struct da_softc *)periph->softc; CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("dastart\n")); skipstate: switch (softc->state) { case DA_STATE_NORMAL: { struct bio *bp; uint8_t tag_code; more: bp = cam_iosched_next_bio(softc->cam_iosched); if (bp == NULL) { if (cam_iosched_has_work_flags(softc->cam_iosched, DA_WORK_TUR)) { cam_iosched_clr_work_flags(softc->cam_iosched, DA_WORK_TUR); scsi_test_unit_ready(&start_ccb->csio, /*retries*/ da_retry_count, dadone, MSG_SIMPLE_Q_TAG, SSD_FULL_SIZE, da_default_timeout * 1000); start_ccb->ccb_h.ccb_bp = NULL; start_ccb->ccb_h.ccb_state = DA_CCB_TUR; xpt_action(start_ccb); } else xpt_release_ccb(start_ccb); break; } if (bp->bio_cmd == BIO_DELETE) { if (softc->delete_func != NULL) { softc->delete_func(periph, start_ccb, bp); goto out; } else { /* Not sure this is possible, but failsafe by lying and saying "sure, done." */ biofinish(bp, NULL, 0); goto more; } } if (cam_iosched_has_work_flags(softc->cam_iosched, DA_WORK_TUR)) { cam_iosched_clr_work_flags(softc->cam_iosched, DA_WORK_TUR); cam_periph_release_locked(periph); /* XXX is this still valid? 
I think so but unverified */ } if ((bp->bio_flags & BIO_ORDERED) != 0 || (softc->flags & DA_FLAG_NEED_OTAG) != 0) { softc->flags &= ~DA_FLAG_NEED_OTAG; softc->flags |= DA_FLAG_WAS_OTAG; tag_code = MSG_ORDERED_Q_TAG; } else { tag_code = MSG_SIMPLE_Q_TAG; } switch (bp->bio_cmd) { case BIO_WRITE: case BIO_READ: { void *data_ptr; int rw_op; + biotrack(bp, __func__); + if (bp->bio_cmd == BIO_WRITE) { softc->flags |= DA_FLAG_DIRTY; rw_op = SCSI_RW_WRITE; } else { rw_op = SCSI_RW_READ; } data_ptr = bp->bio_data; if ((bp->bio_flags & (BIO_UNMAPPED|BIO_VLIST)) != 0) { rw_op |= SCSI_RW_BIO; data_ptr = bp; } scsi_read_write(&start_ccb->csio, /*retries*/da_retry_count, /*cbfcnp*/dadone, /*tag_action*/tag_code, rw_op, /*byte2*/0, softc->minimum_cmd_size, /*lba*/bp->bio_pblkno, /*block_count*/bp->bio_bcount / softc->params.secsize, data_ptr, /*dxfer_len*/ bp->bio_bcount, /*sense_len*/SSD_FULL_SIZE, da_default_timeout * 1000); +#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) + start_ccb->csio.bio = bp; +#endif break; } case BIO_FLUSH: /* * BIO_FLUSH doesn't currently communicate * range data, so we synchronize the cache * over the whole disk. We also force * ordered tag semantics the flush applies * to all previously queued I/O. 
*/ scsi_synchronize_cache(&start_ccb->csio, /*retries*/1, /*cbfcnp*/dadone, MSG_ORDERED_Q_TAG, /*begin_lba*/0, /*lb_count*/0, SSD_FULL_SIZE, da_default_timeout*1000); break; case BIO_ZONE: { int error, queue_ccb; queue_ccb = 0; error = da_zone_cmd(periph, start_ccb, bp,&queue_ccb); if ((error != 0) || (queue_ccb == 0)) { biofinish(bp, NULL, error); xpt_release_ccb(start_ccb); return; } break; } } start_ccb->ccb_h.ccb_state = DA_CCB_BUFFER_IO; start_ccb->ccb_h.flags |= CAM_UNLOCKED; start_ccb->ccb_h.softtimeout = sbttotv(da_default_softtimeout); out: LIST_INSERT_HEAD(&softc->pending_ccbs, &start_ccb->ccb_h, periph_links.le); /* We expect a unit attention from this device */ if ((softc->flags & DA_FLAG_RETRY_UA) != 0) { start_ccb->ccb_h.ccb_state |= DA_CCB_RETRY_UA; softc->flags &= ~DA_FLAG_RETRY_UA; } start_ccb->ccb_h.ccb_bp = bp; softc->refcount++; cam_periph_unlock(periph); xpt_action(start_ccb); cam_periph_lock(periph); softc->refcount--; /* May have more work to do, so ensure we stay scheduled */ daschedule(periph); break; } case DA_STATE_PROBE_RC: { struct scsi_read_capacity_data *rcap; rcap = (struct scsi_read_capacity_data *) malloc(sizeof(*rcap), M_SCSIDA, M_NOWAIT|M_ZERO); if (rcap == NULL) { printf("dastart: Couldn't malloc read_capacity data\n"); /* da_free_periph??? */ break; } scsi_read_capacity(&start_ccb->csio, /*retries*/da_retry_count, dadone, MSG_SIMPLE_Q_TAG, rcap, SSD_FULL_SIZE, /*timeout*/5000); start_ccb->ccb_h.ccb_bp = NULL; start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_RC; xpt_action(start_ccb); break; } case DA_STATE_PROBE_RC16: { struct scsi_read_capacity_data_long *rcaplong; rcaplong = (struct scsi_read_capacity_data_long *) malloc(sizeof(*rcaplong), M_SCSIDA, M_NOWAIT|M_ZERO); if (rcaplong == NULL) { printf("dastart: Couldn't malloc read_capacity data\n"); /* da_free_periph??? 
*/ break; } scsi_read_capacity_16(&start_ccb->csio, /*retries*/ da_retry_count, /*cbfcnp*/ dadone, /*tag_action*/ MSG_SIMPLE_Q_TAG, /*lba*/ 0, /*reladr*/ 0, /*pmi*/ 0, /*rcap_buf*/ (uint8_t *)rcaplong, /*rcap_buf_len*/ sizeof(*rcaplong), /*sense_len*/ SSD_FULL_SIZE, /*timeout*/ da_default_timeout * 1000); start_ccb->ccb_h.ccb_bp = NULL; start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_RC16; xpt_action(start_ccb); break; } case DA_STATE_PROBE_LBP: { struct scsi_vpd_logical_block_prov *lbp; if (!scsi_vpd_supported_page(periph, SVPD_LBP)) { /* * If we get here we don't support any SBC-3 delete * methods with UNMAP as the Logical Block Provisioning * VPD page support is required for devices which * support it according to T10/1799-D Revision 31 * however older revisions of the spec don't mandate * this so we currently don't remove these methods * from the available set. */ softc->state = DA_STATE_PROBE_BLK_LIMITS; goto skipstate; } lbp = (struct scsi_vpd_logical_block_prov *) malloc(sizeof(*lbp), M_SCSIDA, M_NOWAIT|M_ZERO); if (lbp == NULL) { printf("dastart: Couldn't malloc lbp data\n"); /* da_free_periph??? */ break; } scsi_inquiry(&start_ccb->csio, /*retries*/da_retry_count, /*cbfcnp*/dadone, /*tag_action*/MSG_SIMPLE_Q_TAG, /*inq_buf*/(u_int8_t *)lbp, /*inq_len*/sizeof(*lbp), /*evpd*/TRUE, /*page_code*/SVPD_LBP, /*sense_len*/SSD_MIN_SIZE, /*timeout*/da_default_timeout * 1000); start_ccb->ccb_h.ccb_bp = NULL; start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_LBP; xpt_action(start_ccb); break; } case DA_STATE_PROBE_BLK_LIMITS: { struct scsi_vpd_block_limits *block_limits; if (!scsi_vpd_supported_page(periph, SVPD_BLOCK_LIMITS)) { /* Not supported skip to next probe */ softc->state = DA_STATE_PROBE_BDC; goto skipstate; } block_limits = (struct scsi_vpd_block_limits *) malloc(sizeof(*block_limits), M_SCSIDA, M_NOWAIT|M_ZERO); if (block_limits == NULL) { printf("dastart: Couldn't malloc block_limits data\n"); /* da_free_periph??? 
*/ break; } scsi_inquiry(&start_ccb->csio, /*retries*/da_retry_count, /*cbfcnp*/dadone, /*tag_action*/MSG_SIMPLE_Q_TAG, /*inq_buf*/(u_int8_t *)block_limits, /*inq_len*/sizeof(*block_limits), /*evpd*/TRUE, /*page_code*/SVPD_BLOCK_LIMITS, /*sense_len*/SSD_MIN_SIZE, /*timeout*/da_default_timeout * 1000); start_ccb->ccb_h.ccb_bp = NULL; start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_BLK_LIMITS; xpt_action(start_ccb); break; } case DA_STATE_PROBE_BDC: { struct scsi_vpd_block_characteristics *bdc; if (!scsi_vpd_supported_page(periph, SVPD_BDC)) { softc->state = DA_STATE_PROBE_ATA; goto skipstate; } bdc = (struct scsi_vpd_block_characteristics *) malloc(sizeof(*bdc), M_SCSIDA, M_NOWAIT|M_ZERO); if (bdc == NULL) { printf("dastart: Couldn't malloc bdc data\n"); /* da_free_periph??? */ break; } scsi_inquiry(&start_ccb->csio, /*retries*/da_retry_count, /*cbfcnp*/dadone, /*tag_action*/MSG_SIMPLE_Q_TAG, /*inq_buf*/(u_int8_t *)bdc, /*inq_len*/sizeof(*bdc), /*evpd*/TRUE, /*page_code*/SVPD_BDC, /*sense_len*/SSD_MIN_SIZE, /*timeout*/da_default_timeout * 1000); start_ccb->ccb_h.ccb_bp = NULL; start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_BDC; xpt_action(start_ccb); break; } case DA_STATE_PROBE_ATA: { struct ata_params *ata_params; if (!scsi_vpd_supported_page(periph, SVPD_ATA_INFORMATION)) { if ((softc->zone_mode == DA_ZONE_HOST_AWARE) || (softc->zone_mode == DA_ZONE_HOST_MANAGED)) { /* * Note that if the ATA VPD page isn't * supported, we aren't talking to an ATA * device anyway. Support for that VPD * page is mandatory for SCSI to ATA (SAT) * translation layers. */ softc->state = DA_STATE_PROBE_ZONE; goto skipstate; } daprobedone(periph, start_ccb); break; } ata_params = (struct ata_params*) malloc(sizeof(*ata_params), M_SCSIDA,M_NOWAIT|M_ZERO); if (ata_params == NULL) { xpt_print(periph->path, "Couldn't malloc ata_params " "data\n"); /* da_free_periph??? 
*/ break; } scsi_ata_identify(&start_ccb->csio, /*retries*/da_retry_count, /*cbfcnp*/dadone, /*tag_action*/MSG_SIMPLE_Q_TAG, /*data_ptr*/(u_int8_t *)ata_params, /*dxfer_len*/sizeof(*ata_params), /*sense_len*/SSD_FULL_SIZE, /*timeout*/da_default_timeout * 1000); start_ccb->ccb_h.ccb_bp = NULL; start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_ATA; xpt_action(start_ccb); break; } case DA_STATE_PROBE_ATA_LOGDIR: { struct ata_gp_log_dir *log_dir; int retval; retval = 0; if ((softc->flags & DA_FLAG_CAN_ATA_LOG) == 0) { /* * If we don't have log support, not much point in * trying to probe zone support. */ daprobedone(periph, start_ccb); break; } /* * If we have an ATA device (the SCSI ATA Information VPD * page should be present and the ATA identify should have * succeeded) and it supports logs, ask for the log directory. */ log_dir = malloc(sizeof(*log_dir), M_SCSIDA, M_NOWAIT|M_ZERO); if (log_dir == NULL) { xpt_print(periph->path, "Couldn't malloc log_dir " "data\n"); daprobedone(periph, start_ccb); break; } retval = scsi_ata_read_log(&start_ccb->csio, /*retries*/ da_retry_count, /*cbfcnp*/ dadone, /*tag_action*/ MSG_SIMPLE_Q_TAG, /*log_address*/ ATA_LOG_DIRECTORY, /*page_number*/ 0, /*block_count*/ 1, /*protocol*/ softc->flags & DA_FLAG_CAN_ATA_DMA ? AP_PROTO_DMA : AP_PROTO_PIO_IN, /*data_ptr*/ (uint8_t *)log_dir, /*dxfer_len*/ sizeof(*log_dir), /*sense_len*/ SSD_FULL_SIZE, /*timeout*/ da_default_timeout * 1000); if (retval != 0) { xpt_print(periph->path, "scsi_ata_read_log() failed!"); free(log_dir, M_SCSIDA); daprobedone(periph, start_ccb); break; } start_ccb->ccb_h.ccb_bp = NULL; start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_ATA_LOGDIR; xpt_action(start_ccb); break; } case DA_STATE_PROBE_ATA_IDDIR: { struct ata_identify_log_pages *id_dir; int retval; retval = 0; /* * Check here to see whether the Identify Device log is * supported in the directory of logs. If so, continue * with requesting the log of identify device pages. 
*/ if ((softc->flags & DA_FLAG_CAN_ATA_IDLOG) == 0) { daprobedone(periph, start_ccb); break; } id_dir = malloc(sizeof(*id_dir), M_SCSIDA, M_NOWAIT | M_ZERO); if (id_dir == NULL) { xpt_print(periph->path, "Couldn't malloc id_dir " "data\n"); daprobedone(periph, start_ccb); break; } retval = scsi_ata_read_log(&start_ccb->csio, /*retries*/ da_retry_count, /*cbfcnp*/ dadone, /*tag_action*/ MSG_SIMPLE_Q_TAG, /*log_address*/ ATA_IDENTIFY_DATA_LOG, /*page_number*/ ATA_IDL_PAGE_LIST, /*block_count*/ 1, /*protocol*/ softc->flags & DA_FLAG_CAN_ATA_DMA ? AP_PROTO_DMA : AP_PROTO_PIO_IN, /*data_ptr*/ (uint8_t *)id_dir, /*dxfer_len*/ sizeof(*id_dir), /*sense_len*/ SSD_FULL_SIZE, /*timeout*/ da_default_timeout * 1000); if (retval != 0) { xpt_print(periph->path, "scsi_ata_read_log() failed!"); free(id_dir, M_SCSIDA); daprobedone(periph, start_ccb); break; } start_ccb->ccb_h.ccb_bp = NULL; start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_ATA_IDDIR; xpt_action(start_ccb); break; } case DA_STATE_PROBE_ATA_SUP: { struct ata_identify_log_sup_cap *sup_cap; int retval; retval = 0; /* * Check here to see whether the Supported Capabilities log * is in the list of Identify Device logs. */ if ((softc->flags & DA_FLAG_CAN_ATA_SUPCAP) == 0) { daprobedone(periph, start_ccb); break; } sup_cap = malloc(sizeof(*sup_cap), M_SCSIDA, M_NOWAIT|M_ZERO); if (sup_cap == NULL) { xpt_print(periph->path, "Couldn't malloc sup_cap " "data\n"); daprobedone(periph, start_ccb); break; } retval = scsi_ata_read_log(&start_ccb->csio, /*retries*/ da_retry_count, /*cbfcnp*/ dadone, /*tag_action*/ MSG_SIMPLE_Q_TAG, /*log_address*/ ATA_IDENTIFY_DATA_LOG, /*page_number*/ ATA_IDL_SUP_CAP, /*block_count*/ 1, /*protocol*/ softc->flags & DA_FLAG_CAN_ATA_DMA ? 
AP_PROTO_DMA : AP_PROTO_PIO_IN, /*data_ptr*/ (uint8_t *)sup_cap, /*dxfer_len*/ sizeof(*sup_cap), /*sense_len*/ SSD_FULL_SIZE, /*timeout*/ da_default_timeout * 1000); if (retval != 0) { xpt_print(periph->path, "scsi_ata_read_log() failed!"); free(sup_cap, M_SCSIDA); daprobedone(periph, start_ccb); break; } start_ccb->ccb_h.ccb_bp = NULL; start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_ATA_SUP; xpt_action(start_ccb); break; } case DA_STATE_PROBE_ATA_ZONE: { struct ata_zoned_info_log *ata_zone; int retval; retval = 0; /* * Check here to see whether the zoned device information * page is supported. If so, continue on to request it. * If not, skip to DA_STATE_PROBE_LOG or done. */ if ((softc->flags & DA_FLAG_CAN_ATA_ZONE) == 0) { daprobedone(periph, start_ccb); break; } ata_zone = malloc(sizeof(*ata_zone), M_SCSIDA, M_NOWAIT|M_ZERO); if (ata_zone == NULL) { xpt_print(periph->path, "Couldn't malloc ata_zone " "data\n"); daprobedone(periph, start_ccb); break; } retval = scsi_ata_read_log(&start_ccb->csio, /*retries*/ da_retry_count, /*cbfcnp*/ dadone, /*tag_action*/ MSG_SIMPLE_Q_TAG, /*log_address*/ ATA_IDENTIFY_DATA_LOG, /*page_number*/ ATA_IDL_ZDI, /*block_count*/ 1, /*protocol*/ softc->flags & DA_FLAG_CAN_ATA_DMA ? AP_PROTO_DMA : AP_PROTO_PIO_IN, /*data_ptr*/ (uint8_t *)ata_zone, /*dxfer_len*/ sizeof(*ata_zone), /*sense_len*/ SSD_FULL_SIZE, /*timeout*/ da_default_timeout * 1000); if (retval != 0) { xpt_print(periph->path, "scsi_ata_read_log() failed!"); free(ata_zone, M_SCSIDA); daprobedone(periph, start_ccb); break; } start_ccb->ccb_h.ccb_bp = NULL; start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_ATA_ZONE; xpt_action(start_ccb); break; } case DA_STATE_PROBE_ZONE: { struct scsi_vpd_zoned_bdc *bdc; /* * Note that this page will be supported for SCSI protocol * devices that support ZBC (SMR devices), as well as ATA * protocol devices that are behind a SAT (SCSI to ATA * Translation) layer that supports converting ZBC commands * to their ZAC equivalents. 
*/ if (!scsi_vpd_supported_page(periph, SVPD_ZONED_BDC)) { daprobedone(periph, start_ccb); break; } bdc = (struct scsi_vpd_zoned_bdc *) malloc(sizeof(*bdc), M_SCSIDA, M_NOWAIT|M_ZERO); if (bdc == NULL) { xpt_release_ccb(start_ccb); xpt_print(periph->path, "Couldn't malloc zone VPD " "data\n"); break; } scsi_inquiry(&start_ccb->csio, /*retries*/da_retry_count, /*cbfcnp*/dadone, /*tag_action*/MSG_SIMPLE_Q_TAG, /*inq_buf*/(u_int8_t *)bdc, /*inq_len*/sizeof(*bdc), /*evpd*/TRUE, /*page_code*/SVPD_ZONED_BDC, /*sense_len*/SSD_FULL_SIZE, /*timeout*/da_default_timeout * 1000); start_ccb->ccb_h.ccb_bp = NULL; start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_ZONE; xpt_action(start_ccb); break; } } } /* * In each of the methods below, while its the caller's * responsibility to ensure the request will fit into a * single device request, we might have changed the delete * method due to the device incorrectly advertising either * its supported methods or limits. * * To prevent this causing further issues we validate the * against the methods limits, and warn which would * otherwise be unnecessary. */ static void da_delete_unmap(struct cam_periph *periph, union ccb *ccb, struct bio *bp) { struct da_softc *softc = (struct da_softc *)periph->softc;; struct bio *bp1; uint8_t *buf = softc->unmap_buf; uint64_t lba, lastlba = (uint64_t)-1; uint64_t totalcount = 0; uint64_t count; uint32_t lastcount = 0, c; uint32_t off, ranges = 0; /* * Currently this doesn't take the UNMAP * Granularity and Granularity Alignment * fields into account. * * This could result in both unoptimal unmap * requests as as well as UNMAP calls unmapping * fewer LBA's than requested. */ bzero(softc->unmap_buf, sizeof(softc->unmap_buf)); bp1 = bp; do { /* * Note: ada and da are different in how they store the * pending bp's in a trim. ada stores all of them in the * trim_req.bps. da stores all but the first one in the * delete_run_queue. ada then completes all the bps in * its adadone() loop. 
da completes all the bps in the * delete_run_queue in dadone, and relies on the biodone * after to complete. This should be reconciled since there's * no real reason to do it differently. XXX */ if (bp1 != bp) bioq_insert_tail(&softc->delete_run_queue, bp1); lba = bp1->bio_pblkno; count = bp1->bio_bcount / softc->params.secsize; /* Try to extend the previous range. */ if (lba == lastlba) { c = omin(count, UNMAP_RANGE_MAX - lastcount); lastcount += c; off = ((ranges - 1) * UNMAP_RANGE_SIZE) + UNMAP_HEAD_SIZE; scsi_ulto4b(lastcount, &buf[off + 8]); count -= c; lba +=c; totalcount += c; } while (count > 0) { c = omin(count, UNMAP_RANGE_MAX); if (totalcount + c > softc->unmap_max_lba || ranges >= softc->unmap_max_ranges) { xpt_print(periph->path, "%s issuing short delete %ld > %ld" "|| %d >= %d", da_delete_method_desc[softc->delete_method], totalcount + c, softc->unmap_max_lba, ranges, softc->unmap_max_ranges); break; } off = (ranges * UNMAP_RANGE_SIZE) + UNMAP_HEAD_SIZE; scsi_u64to8b(lba, &buf[off + 0]); scsi_ulto4b(c, &buf[off + 8]); lba += c; totalcount += c; ranges++; count -= c; lastcount = c; } lastlba = lba; bp1 = cam_iosched_next_trim(softc->cam_iosched); if (bp1 == NULL) break; if (ranges >= softc->unmap_max_ranges || totalcount + bp1->bio_bcount / softc->params.secsize > softc->unmap_max_lba) { cam_iosched_put_back_trim(softc->cam_iosched, bp1); break; } } while (1); scsi_ulto2b(ranges * 16 + 6, &buf[0]); scsi_ulto2b(ranges * 16, &buf[2]); scsi_unmap(&ccb->csio, /*retries*/da_retry_count, /*cbfcnp*/dadone, /*tag_action*/MSG_SIMPLE_Q_TAG, /*byte2*/0, /*data_ptr*/ buf, /*dxfer_len*/ ranges * 16 + 8, /*sense_len*/SSD_FULL_SIZE, da_default_timeout * 1000); ccb->ccb_h.ccb_state = DA_CCB_DELETE; ccb->ccb_h.flags |= CAM_UNLOCKED; cam_iosched_submit_trim(softc->cam_iosched); } static void da_delete_trim(struct cam_periph *periph, union ccb *ccb, struct bio *bp) { struct da_softc *softc = (struct da_softc *)periph->softc; struct bio *bp1; uint8_t *buf = 
softc->unmap_buf; uint64_t lastlba = (uint64_t)-1; uint64_t count; uint64_t lba; uint32_t lastcount = 0, c, requestcount; int ranges = 0, off, block_count; bzero(softc->unmap_buf, sizeof(softc->unmap_buf)); bp1 = bp; do { if (bp1 != bp)//XXX imp XXX bioq_insert_tail(&softc->delete_run_queue, bp1); lba = bp1->bio_pblkno; count = bp1->bio_bcount / softc->params.secsize; requestcount = count; /* Try to extend the previous range. */ if (lba == lastlba) { c = omin(count, ATA_DSM_RANGE_MAX - lastcount); lastcount += c; off = (ranges - 1) * 8; buf[off + 6] = lastcount & 0xff; buf[off + 7] = (lastcount >> 8) & 0xff; count -= c; lba += c; } while (count > 0) { c = omin(count, ATA_DSM_RANGE_MAX); off = ranges * 8; buf[off + 0] = lba & 0xff; buf[off + 1] = (lba >> 8) & 0xff; buf[off + 2] = (lba >> 16) & 0xff; buf[off + 3] = (lba >> 24) & 0xff; buf[off + 4] = (lba >> 32) & 0xff; buf[off + 5] = (lba >> 40) & 0xff; buf[off + 6] = c & 0xff; buf[off + 7] = (c >> 8) & 0xff; lba += c; ranges++; count -= c; lastcount = c; if (count != 0 && ranges == softc->trim_max_ranges) { xpt_print(periph->path, "%s issuing short delete %ld > %ld\n", da_delete_method_desc[softc->delete_method], requestcount, (softc->trim_max_ranges - ranges) * ATA_DSM_RANGE_MAX); break; } } lastlba = lba; bp1 = cam_iosched_next_trim(softc->cam_iosched); if (bp1 == NULL) break; if (bp1->bio_bcount / softc->params.secsize > (softc->trim_max_ranges - ranges) * ATA_DSM_RANGE_MAX) { cam_iosched_put_back_trim(softc->cam_iosched, bp1); break; } } while (1); block_count = howmany(ranges, ATA_DSM_BLK_RANGES); scsi_ata_trim(&ccb->csio, /*retries*/da_retry_count, /*cbfcnp*/dadone, /*tag_action*/MSG_SIMPLE_Q_TAG, block_count, /*data_ptr*/buf, /*dxfer_len*/block_count * ATA_DSM_BLK_SIZE, /*sense_len*/SSD_FULL_SIZE, da_default_timeout * 1000); ccb->ccb_h.ccb_state = DA_CCB_DELETE; ccb->ccb_h.flags |= CAM_UNLOCKED; cam_iosched_submit_trim(softc->cam_iosched); }

/*
 * Build a WRITE SAME based delete for bp in ccb, folding in any trims that
 * follow it on the I/O scheduler's queue and are LBA-contiguous with the
 * run collected so far.  Every bio after the first is parked on
 * softc->delete_run_queue.  The CCB is marked DA_CCB_DELETE / CAM_UNLOCKED
 * and the trim is reported to the scheduler as submitted.
 *
 * We calculate ws_max_blks here based off d_delmaxsize instead
 * of using softc->ws_max_blks as it is absolute max for the
 * device not the protocol max which may well be lower.
 */
static void
da_delete_ws(struct cam_periph *periph, union ccb *ccb, struct bio *bp)
{
	struct da_softc *softc;
	struct bio *bp1;
	uint64_t ws_max_blks;
	uint64_t lba;
	uint64_t count; /* forward compat with WS32 */

	softc = (struct da_softc *)periph->softc;
	ws_max_blks = softc->disk->d_delmaxsize / softc->params.secsize;
	lba = bp->bio_pblkno;
	count = 0;
	bp1 = bp;
	do {
		if (bp1 != bp)//XXX imp XXX
			bioq_insert_tail(&softc->delete_run_queue, bp1);
		count += bp1->bio_bcount / softc->params.secsize;
		if (count > ws_max_blks) {
			/*
			 * Both values are 64-bit; print them through
			 * uintmax_t/%ju so the message is correct on
			 * platforms where long is narrower than 64 bits.
			 */
			xpt_print(periph->path,
			    "%s issuing short delete %ju > %ju\n",
			    da_delete_method_desc[softc->delete_method],
			    (uintmax_t)count, (uintmax_t)ws_max_blks);
			count = omin(count, ws_max_blks);
			break;
		}
		bp1 = cam_iosched_next_trim(softc->cam_iosched);
		if (bp1 == NULL)
			break;
		/*
		 * Only merge a trim that starts exactly where the current
		 * run ends and still fits under the limit; otherwise hand
		 * it back to the scheduler.
		 */
		if (lba + count != bp1->bio_pblkno ||
		    count + bp1->bio_bcount /
		    softc->params.secsize > ws_max_blks) {
			cam_iosched_put_back_trim(softc->cam_iosched, bp1);
			break;
		}
	} while (1);

	scsi_write_same(&ccb->csio,
			/*retries*/da_retry_count,
			/*cbfcnp*/dadone,
			/*tag_action*/MSG_SIMPLE_Q_TAG,
			/*byte2*/softc->delete_method ==
			    DA_DELETE_ZERO ? 0 : SWS_UNMAP,
			softc->delete_method == DA_DELETE_WS16 ? 16 : 10,
			/*lba*/lba,
			/*block_count*/count,
			/*data_ptr*/ __DECONST(void *, zero_region),
			/*dxfer_len*/ softc->params.secsize,
			/*sense_len*/SSD_FULL_SIZE,
			da_default_timeout * 1000);
	ccb->ccb_h.ccb_state = DA_CCB_DELETE;
	ccb->ccb_h.flags |= CAM_UNLOCKED;
	cam_iosched_submit_trim(softc->cam_iosched);
}

/*
 * Try to work around a command the device rejected.
 *
 * Handles, in order: a failed BIO_DELETE (drop the failing delete method,
 * pick another and requeue the affected bios), an unsupported PREVENT ALLOW
 * MEDIUM REMOVAL, an unsupported SYNCHRONIZE CACHE(10), and finally an
 * unsupported READ(6)/WRITE(6), which is rewritten in place as the 10 byte
 * form and requeued.
 *
 * Returns ERESTART when the CCB was rewritten and requeued, 0 otherwise.
 */
static int
cmd6workaround(union ccb *ccb)
{
	struct scsi_rw_6 cmd6;
	struct scsi_rw_10 *cmd10;
	struct da_softc *softc;
	u_int8_t *cdb;
	struct bio *bp;
	int frozen;

	cdb = ccb->csio.cdb_io.cdb_bytes;
	softc = (struct da_softc *)xpt_path_periph(ccb->ccb_h.path)->softc;

	if (ccb->ccb_h.ccb_state == DA_CCB_DELETE) {
		da_delete_methods old_method = softc->delete_method;

		/*
		 * Typically there are two reasons for failure here
		 * 1. Delete method was detected as supported but isn't
		 * 2. Delete failed due to invalid params e.g. too big
		 *
		 * While we will attempt to choose an alternative delete method
		 * this may result in short deletes if the existing delete
		 * requests from geom are big for the new method chosen.
		 *
		 * This method assumes that the error which triggered this
		 * will not retry the io otherwise a panic will occur
		 */
		dadeleteflag(softc, old_method, 0);
		dadeletemethodchoose(softc, DA_DELETE_DISABLE);
		if (softc->delete_method == DA_DELETE_DISABLE)
			xpt_print(ccb->ccb_h.path,
			    "%s failed, disabling BIO_DELETE\n",
			    da_delete_method_desc[old_method]);
		else
			xpt_print(ccb->ccb_h.path,
			    "%s failed, switching to %s BIO_DELETE\n",
			    da_delete_method_desc[old_method],
			    da_delete_method_desc[softc->delete_method]);

		/*
		 * Give every bio that was part of the failed request back to
		 * the scheduler, then detach the lead bio from the CCB so the
		 * completion path does not finish it.
		 */
		while ((bp = bioq_takefirst(&softc->delete_run_queue)) != NULL)
			cam_iosched_queue_work(softc->cam_iosched, bp);
		cam_iosched_queue_work(softc->cam_iosched,
		    (struct bio *)ccb->ccb_h.ccb_bp);
		ccb->ccb_h.ccb_bp = NULL;
		return (0);
	}

	/* Detect unsupported PREVENT ALLOW MEDIUM REMOVAL. */
	if ((ccb->ccb_h.flags & CAM_CDB_POINTER) == 0 &&
	    (*cdb == PREVENT_ALLOW) &&
	    (softc->quirks & DA_Q_NO_PREVENT) == 0) {
		if (bootverbose)
			xpt_print(ccb->ccb_h.path,
			    "PREVENT ALLOW MEDIUM REMOVAL not supported.\n");
		softc->quirks |= DA_Q_NO_PREVENT;
		return (0);
	}

	/* Detect unsupported SYNCHRONIZE CACHE(10). */
	if ((ccb->ccb_h.flags & CAM_CDB_POINTER) == 0 &&
	    (*cdb == SYNCHRONIZE_CACHE) &&
	    (softc->quirks & DA_Q_NO_SYNC_CACHE) == 0) {
		if (bootverbose)
			xpt_print(ccb->ccb_h.path,
			    "SYNCHRONIZE CACHE(10) not supported.\n");
		softc->quirks |= DA_Q_NO_SYNC_CACHE;
		softc->disk->d_flags &= ~DISKFLAG_CANFLUSHCACHE;
		return (0);
	}

	/* Translation only possible if CDB is an array and cmd is R/W6 */
	if ((ccb->ccb_h.flags & CAM_CDB_POINTER) != 0 ||
	    (*cdb != READ_6 && *cdb != WRITE_6))
		return 0;

	xpt_print(ccb->ccb_h.path, "READ(6)/WRITE(6) not supported, "
	    "increasing minimum_cmd_size to 10.\n");
	softc->minimum_cmd_size = 10;

	/*
	 * Snapshot the 6 byte CDB first: the 10 byte CDB is built in the
	 * same storage.
	 */
	bcopy(cdb, &cmd6, sizeof(struct scsi_rw_6));
	cmd10 = (struct scsi_rw_10 *)cdb;
	cmd10->opcode = (cmd6.opcode == READ_6) ? READ_10 : WRITE_10;
	cmd10->byte2 = 0;
	scsi_ulto4b(scsi_3btoul(cmd6.addr), cmd10->addr);
	cmd10->reserved = 0;
	/*
	 * NOTE(review): in the 6 byte form a transfer length of 0 encodes
	 * 256 blocks, while 0 in the 10 byte form means no transfer;
	 * presumably the CDB builder never emits a zero length here --
	 * confirm.
	 */
	scsi_ulto2b(cmd6.length, cmd10->length);
	cmd10->control = cmd6.control;
	ccb->csio.cdb_len = sizeof(*cmd10);

	/* Requeue request, unfreezing queue if necessary */
	frozen = (ccb->ccb_h.status & CAM_DEV_QFRZN) != 0;
	ccb->ccb_h.status = CAM_REQUEUE_REQ;
	xpt_action(ccb);
	if (frozen) {
		cam_release_devq(ccb->ccb_h.path,
				 /*relsim_flags*/0,
				 /*reduction*/0,
				 /*timeout*/0,
				 /*getcount_only*/0);
	}
	return (ERESTART);
}

/*
 * Completion processing for a BIO_ZONE request.
 *
 * Only DISK_ZONE_REPORT_ZONES needs work: the SCSI/ATA report header and
 * zone descriptors in the CCB's data buffer are converted into the bio's
 * disk_zone_report, and the data buffer is freed afterwards.  The open,
 * close, finish and reset-write-pointer commands need no post-processing.
 * Any other zone command panics.
 */
static void
dazonedone(struct cam_periph *periph, union ccb *ccb)
{
	struct da_softc *softc;
	struct bio *bp;

	softc = periph->softc;
	bp = (struct bio *)ccb->ccb_h.ccb_bp;

	switch (bp->bio_zone.zone_cmd) {
	case DISK_ZONE_OPEN:
	case DISK_ZONE_CLOSE:
	case DISK_ZONE_FINISH:
	case DISK_ZONE_RWP:
		break;
	case DISK_ZONE_REPORT_ZONES: {
		uint32_t avail_len;
		struct disk_zone_report *rep;
		struct scsi_report_zones_hdr *hdr;
		struct scsi_report_zones_desc *desc;
		struct disk_zone_rep_entry *entry;
		uint32_t num_alloced, hdr_len, num_avail;
		uint32_t num_to_fill, i;
		int ata;

		rep = &bp->bio_zone.zone_params.report;
		avail_len = ccb->csio.dxfer_len - ccb->csio.resid;
		/*
		 * Note that bio_resid isn't normally used for zone
		 * commands, but it is used by devstat_end_transaction_bio()
		 * to determine how much data was transferred.  Because
		 * the size of the SCSI/ATA data structures is different
		 * than the size of the BIO interface structures, the
		 * amount of data actually transferred from the drive will
		 * be different than the amount of data transferred to
		 * the user.
		 */
		bp->bio_resid = ccb->csio.resid;
		num_alloced = rep->entries_allocated;
		hdr = (struct scsi_report_zones_hdr *)ccb->csio.data_ptr;
		if (avail_len < sizeof(*hdr)) {
			/*
			 * Is there a better error than EIO here?  We asked
			 * for at least the header, and we got less than
			 * that.
			 */
			bp->bio_error = EIO;
			bp->bio_flags |= BIO_ERROR;
			bp->bio_resid = bp->bio_bcount;
			break;
		}

		/*
		 * With the ATA passthrough interface the multi-byte fields
		 * are decoded little-endian (le*dec); otherwise they are
		 * decoded with the scsi_*bto*() helpers.
		 */
		if (softc->zone_interface == DA_ZONE_IF_ATA_PASS)
			ata = 1;
		else
			ata = 0;

		hdr_len = ata ? le32dec(hdr->length) :
				scsi_4btoul(hdr->length);
		if (hdr_len > 0)
			rep->entries_available = hdr_len / sizeof(*desc);
		else
			rep->entries_available = 0;
		/*
		 * NOTE: using the same values for the BIO version of the
		 * same field as the SCSI/ATA values.  This means we could
		 * get some additional values that aren't defined in bio.h
		 * if more values of the same field are defined later.
		 */
		rep->header.same = hdr->byte4 & SRZ_SAME_MASK;
		rep->header.maximum_lba = ata ? le64dec(hdr->maximum_lba) :
					  scsi_8btou64(hdr->maximum_lba);
		/*
		 * If the drive reports no entries that match the query,
		 * we're done.
		 */
		if (hdr_len == 0) {
			rep->entries_filled = 0;
			break;
		}

		/*
		 * Descriptors usable = the smaller of what actually arrived
		 * after the header and what the header says exists.
		 */
		num_avail = min((avail_len - sizeof(*hdr)) / sizeof(*desc),
				hdr_len / sizeof(*desc));
		/*
		 * If the drive didn't return any data, then we're done.
		 */
		if (num_avail == 0) {
			rep->entries_filled = 0;
			break;
		}

		num_to_fill = min(num_avail, rep->entries_allocated);
		/*
		 * If the user didn't allocate any entries for us to fill,
		 * we're done.
		 */
		if (num_to_fill == 0) {
			rep->entries_filled = 0;
			break;
		}

		for (i = 0, desc = &hdr->desc_list[0], entry=&rep->entries[0];
		     i < num_to_fill; i++, desc++, entry++) {
			/*
			 * NOTE: we're mapping the values here directly
			 * from the SCSI/ATA bit definitions to the bio.h
			 * definitions.  There is also a warning in
			 * disk_zone.h, but the impact is that if
			 * additional values are added in the SCSI/ATA
			 * specs these will be visible to consumers of
			 * this interface.
			 */
			entry->zone_type = desc->zone_type & SRZ_TYPE_MASK;
			entry->zone_condition =
			    (desc->zone_flags & SRZ_ZONE_COND_MASK) >>
			    SRZ_ZONE_COND_SHIFT;
			/*
			 * NOTE(review): |= keeps whatever bits zone_flags
			 * already holds; presumably the entry array arrives
			 * zeroed -- confirm against the caller.
			 */
			entry->zone_flags |= desc->zone_flags &
			    (SRZ_ZONE_NON_SEQ|SRZ_ZONE_RESET);
			entry->zone_length =
			    ata ? le64dec(desc->zone_length) :
				  scsi_8btou64(desc->zone_length);
			entry->zone_start_lba =
			    ata ? le64dec(desc->zone_start_lba) :
				  scsi_8btou64(desc->zone_start_lba);
			entry->write_pointer_lba =
			    ata ? le64dec(desc->write_pointer_lba) :
				  scsi_8btou64(desc->write_pointer_lba);
		}
		rep->entries_filled = num_to_fill;
		break;
	}
	case DISK_ZONE_GET_PARAMS:
	default:
		/*
		 * In theory we should not get a GET_PARAMS bio, since it
		 * should be handled without queueing the command to the
		 * drive.
		 */
		panic("%s: Invalid zone command %d", __func__,
		    bp->bio_zone.zone_cmd);
		break;
	}

	/* The report buffer is released here on every REPORT_ZONES exit. */
	if (bp->bio_zone.zone_cmd == DISK_ZONE_REPORT_ZONES)
		free(ccb->csio.data_ptr, M_SCSIDA);
}

static void dadone(struct cam_periph *periph, union ccb *done_ccb) { struct da_softc *softc; struct ccb_scsiio *csio; u_int32_t priority; da_ccb_state state; softc = (struct da_softc *)periph->softc; priority = done_ccb->ccb_h.pinfo.priority; CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("dadone\n")); csio = &done_ccb->csio; +#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) + if (csio->bio != NULL) + biotrack(csio->bio, __func__); +#endif state = csio->ccb_h.ccb_state & DA_CCB_TYPE_MASK; switch (state) { case DA_CCB_BUFFER_IO: case DA_CCB_DELETE: { struct bio *bp, *bp1; cam_periph_lock(periph); bp = (struct bio *)done_ccb->ccb_h.ccb_bp; if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { int error; int sf; if ((csio->ccb_h.ccb_state & DA_CCB_RETRY_UA) != 0) sf = SF_RETRY_UA; else sf = 0; error = daerror(done_ccb, CAM_RETRY_SELTO, sf); if (error == ERESTART) { /* * A retry was scheduled, so * just return. */ cam_periph_unlock(periph); return; } bp = (struct bio *)done_ccb->ccb_h.ccb_bp; if (error != 0) { int queued_error; /* * return all queued I/O with EIO, so that * the client can retry these I/Os in the * proper order should it attempt to recover. */ queued_error = EIO; if (error == ENXIO && (softc->flags & DA_FLAG_PACK_INVALID)== 0) { /* * Catastrophic error. Mark our pack as * invalid. */ /* * XXX See if this is really a media * XXX change first? 
*/ xpt_print(periph->path, "Invalidating pack\n"); softc->flags |= DA_FLAG_PACK_INVALID; #ifdef CAM_IO_STATS softc->invalidations++; #endif queued_error = ENXIO; } cam_iosched_flush(softc->cam_iosched, NULL, queued_error); if (bp != NULL) { bp->bio_error = error; bp->bio_resid = bp->bio_bcount; bp->bio_flags |= BIO_ERROR; } } else if (bp != NULL) { if (state == DA_CCB_DELETE) bp->bio_resid = 0; else bp->bio_resid = csio->resid; bp->bio_error = 0; if (bp->bio_resid != 0) bp->bio_flags |= BIO_ERROR; } if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) cam_release_devq(done_ccb->ccb_h.path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); } else if (bp != NULL) { if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) panic("REQ_CMP with QFRZN"); if (bp->bio_cmd == BIO_ZONE) dazonedone(periph, done_ccb); else if (state == DA_CCB_DELETE) bp->bio_resid = 0; else bp->bio_resid = csio->resid; if ((csio->resid > 0) && (bp->bio_cmd != BIO_ZONE)) bp->bio_flags |= BIO_ERROR; if (softc->error_inject != 0) { bp->bio_error = softc->error_inject; bp->bio_resid = bp->bio_bcount; bp->bio_flags |= BIO_ERROR; softc->error_inject = 0; } } + biotrack(bp, __func__); LIST_REMOVE(&done_ccb->ccb_h, periph_links.le); if (LIST_EMPTY(&softc->pending_ccbs)) softc->flags |= DA_FLAG_WAS_OTAG; cam_iosched_bio_complete(softc->cam_iosched, bp, done_ccb); xpt_release_ccb(done_ccb); if (state == DA_CCB_DELETE) { TAILQ_HEAD(, bio) queue; TAILQ_INIT(&queue); TAILQ_CONCAT(&queue, &softc->delete_run_queue.queue, bio_queue); softc->delete_run_queue.insert_point = NULL; /* * Normally, the xpt_release_ccb() above would make sure * that when we have more work to do, that work would * get kicked off. However, we specifically keep * delete_running set to 0 before the call above to * allow other I/O to progress when many BIO_DELETE * requests are pushed down. We set delete_running to 0 * and call daschedule again so that we don't stall if * there are no other I/Os pending apart from BIO_DELETEs. 
*/ cam_iosched_trim_done(softc->cam_iosched); daschedule(periph); cam_periph_unlock(periph); while ((bp1 = TAILQ_FIRST(&queue)) != NULL) { TAILQ_REMOVE(&queue, bp1, bio_queue); bp1->bio_error = bp->bio_error; if (bp->bio_flags & BIO_ERROR) { bp1->bio_flags |= BIO_ERROR; bp1->bio_resid = bp1->bio_bcount; } else bp1->bio_resid = 0; biodone(bp1); } } else { daschedule(periph); cam_periph_unlock(periph); } if (bp != NULL) biodone(bp); return; } case DA_CCB_PROBE_RC: case DA_CCB_PROBE_RC16: { struct scsi_read_capacity_data *rdcap; struct scsi_read_capacity_data_long *rcaplong; char announce_buf[80]; int lbp; lbp = 0; rdcap = NULL; rcaplong = NULL; if (state == DA_CCB_PROBE_RC) rdcap =(struct scsi_read_capacity_data *)csio->data_ptr; else rcaplong = (struct scsi_read_capacity_data_long *) csio->data_ptr; if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) { struct disk_params *dp; uint32_t block_size; uint64_t maxsector; u_int lalba; /* Lowest aligned LBA. */ if (state == DA_CCB_PROBE_RC) { block_size = scsi_4btoul(rdcap->length); maxsector = scsi_4btoul(rdcap->addr); lalba = 0; /* * According to SBC-2, if the standard 10 * byte READ CAPACITY command returns 2^32, * we should issue the 16 byte version of * the command, since the device in question * has more sectors than can be represented * with the short version of the command. */ if (maxsector == 0xffffffff) { free(rdcap, M_SCSIDA); xpt_release_ccb(done_ccb); softc->state = DA_STATE_PROBE_RC16; xpt_schedule(periph, priority); return; } } else { block_size = scsi_4btoul(rcaplong->length); maxsector = scsi_8btou64(rcaplong->addr); lalba = scsi_2btoul(rcaplong->lalba_lbp); } /* * Because GEOM code just will panic us if we * give them an 'illegal' value we'll avoid that * here. 
*/ if (block_size == 0) { block_size = 512; if (maxsector == 0) maxsector = -1; } if (block_size >= MAXPHYS) { xpt_print(periph->path, "unsupportable block size %ju\n", (uintmax_t) block_size); announce_buf[0] = '\0'; cam_periph_invalidate(periph); } else { /* * We pass rcaplong into dasetgeom(), * because it will only use it if it is * non-NULL. */ dasetgeom(periph, block_size, maxsector, rcaplong, sizeof(*rcaplong)); lbp = (lalba & SRC16_LBPME_A); dp = &softc->params; snprintf(announce_buf, sizeof(announce_buf), "%juMB (%ju %u byte sectors)", ((uintmax_t)dp->secsize * dp->sectors) / (1024 * 1024), (uintmax_t)dp->sectors, dp->secsize); } } else { int error; announce_buf[0] = '\0'; /* * Retry any UNIT ATTENTION type errors. They * are expected at boot. */ error = daerror(done_ccb, CAM_RETRY_SELTO, SF_RETRY_UA|SF_NO_PRINT); if (error == ERESTART) { /* * A retry was scheuled, so * just return. */ return; } else if (error != 0) { int asc, ascq; int sense_key, error_code; int have_sense; cam_status status; struct ccb_getdev cgd; /* Don't wedge this device's queue */ status = done_ccb->ccb_h.status; if ((status & CAM_DEV_QFRZN) != 0) cam_release_devq(done_ccb->ccb_h.path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); xpt_setup_ccb(&cgd.ccb_h, done_ccb->ccb_h.path, CAM_PRIORITY_NORMAL); cgd.ccb_h.func_code = XPT_GDEV_TYPE; xpt_action((union ccb *)&cgd); if (scsi_extract_sense_ccb(done_ccb, &error_code, &sense_key, &asc, &ascq)) have_sense = TRUE; else have_sense = FALSE; /* * If we tried READ CAPACITY(16) and failed, * fallback to READ CAPACITY(10). 
*/ if ((state == DA_CCB_PROBE_RC16) && (softc->flags & DA_FLAG_CAN_RC16) && (((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_INVALID) || ((have_sense) && (error_code == SSD_CURRENT_ERROR) && (sense_key == SSD_KEY_ILLEGAL_REQUEST)))) { softc->flags &= ~DA_FLAG_CAN_RC16; free(rdcap, M_SCSIDA); xpt_release_ccb(done_ccb); softc->state = DA_STATE_PROBE_RC; xpt_schedule(periph, priority); return; } /* * Attach to anything that claims to be a * direct access or optical disk device, * as long as it doesn't return a "Logical * unit not supported" (0x25) error. */ if ((have_sense) && (asc != 0x25) && (error_code == SSD_CURRENT_ERROR)) { const char *sense_key_desc; const char *asc_desc; dasetgeom(periph, 512, -1, NULL, 0); scsi_sense_desc(sense_key, asc, ascq, &cgd.inq_data, &sense_key_desc, &asc_desc); snprintf(announce_buf, sizeof(announce_buf), "Attempt to query device " "size failed: %s, %s", sense_key_desc, asc_desc); } else { if (have_sense) scsi_sense_print( &done_ccb->csio); else { xpt_print(periph->path, "got CAM status %#x\n", done_ccb->ccb_h.status); } xpt_print(periph->path, "fatal error, " "failed to attach to device\n"); /* * Free up resources. */ cam_periph_invalidate(periph); } } } free(csio->data_ptr, M_SCSIDA); if (announce_buf[0] != '\0' && ((softc->flags & DA_FLAG_ANNOUNCED) == 0)) { /* * Create our sysctl variables, now that we know * we have successfully attached. */ /* increase the refcount */ if (cam_periph_acquire(periph) == CAM_REQ_CMP) { taskqueue_enqueue(taskqueue_thread, &softc->sysctl_task); xpt_announce_periph(periph, announce_buf); xpt_announce_quirks(periph, softc->quirks, DA_Q_BIT_STRING); } else { xpt_print(periph->path, "fatal error, " "could not acquire reference count\n"); } } /* We already probed the device. */ if (softc->flags & DA_FLAG_PROBED) { daprobedone(periph, done_ccb); return; } /* Ensure re-probe doesn't see old delete. 
*/ softc->delete_available = 0; dadeleteflag(softc, DA_DELETE_ZERO, 1); if (lbp && (softc->quirks & DA_Q_NO_UNMAP) == 0) { /* * Based on older SBC-3 spec revisions * any of the UNMAP methods "may" be * available via LBP given this flag so * we flag all of them as available and * then remove those which further * probes confirm aren't available * later. * * We could also check readcap(16) p_type * flag to exclude one or more invalid * write same (X) types here */ dadeleteflag(softc, DA_DELETE_WS16, 1); dadeleteflag(softc, DA_DELETE_WS10, 1); dadeleteflag(softc, DA_DELETE_UNMAP, 1); xpt_release_ccb(done_ccb); softc->state = DA_STATE_PROBE_LBP; xpt_schedule(periph, priority); return; } xpt_release_ccb(done_ccb); softc->state = DA_STATE_PROBE_BDC; xpt_schedule(periph, priority); return; } case DA_CCB_PROBE_LBP: { struct scsi_vpd_logical_block_prov *lbp; lbp = (struct scsi_vpd_logical_block_prov *)csio->data_ptr; if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) { /* * T10/1799-D Revision 31 states at least one of these * must be supported but we don't currently enforce this. 
*/ dadeleteflag(softc, DA_DELETE_WS16, (lbp->flags & SVPD_LBP_WS16)); dadeleteflag(softc, DA_DELETE_WS10, (lbp->flags & SVPD_LBP_WS10)); dadeleteflag(softc, DA_DELETE_UNMAP, (lbp->flags & SVPD_LBP_UNMAP)); } else { int error; error = daerror(done_ccb, CAM_RETRY_SELTO, SF_RETRY_UA|SF_NO_PRINT); if (error == ERESTART) return; else if (error != 0) { if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) { /* Don't wedge this device's queue */ cam_release_devq(done_ccb->ccb_h.path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); } /* * Failure indicates we don't support any SBC-3 * delete methods with UNMAP */ } } free(lbp, M_SCSIDA); xpt_release_ccb(done_ccb); softc->state = DA_STATE_PROBE_BLK_LIMITS; xpt_schedule(periph, priority); return; } case DA_CCB_PROBE_BLK_LIMITS: { struct scsi_vpd_block_limits *block_limits; block_limits = (struct scsi_vpd_block_limits *)csio->data_ptr; if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) { uint32_t max_txfer_len = scsi_4btoul( block_limits->max_txfer_len); uint32_t max_unmap_lba_cnt = scsi_4btoul( block_limits->max_unmap_lba_cnt); uint32_t max_unmap_blk_cnt = scsi_4btoul( block_limits->max_unmap_blk_cnt); uint64_t ws_max_blks = scsi_8btou64( block_limits->max_write_same_length); if (max_txfer_len != 0) { softc->disk->d_maxsize = MIN(softc->maxio, (off_t)max_txfer_len * softc->params.secsize); } /* * We should already support UNMAP but we check lba * and block count to be sure */ if (max_unmap_lba_cnt != 0x00L && max_unmap_blk_cnt != 0x00L) { softc->unmap_max_lba = max_unmap_lba_cnt; softc->unmap_max_ranges = min(max_unmap_blk_cnt, UNMAP_MAX_RANGES); } else { /* * Unexpected UNMAP limits which means the * device doesn't actually support UNMAP */ dadeleteflag(softc, DA_DELETE_UNMAP, 0); } if (ws_max_blks != 0x00L) softc->ws_max_blks = ws_max_blks; } else { int error; error = daerror(done_ccb, CAM_RETRY_SELTO, SF_RETRY_UA|SF_NO_PRINT); if (error == ERESTART) return; else if (error != 0) { if 
((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) { /* Don't wedge this device's queue */ cam_release_devq(done_ccb->ccb_h.path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); } /* * Failure here doesn't mean UNMAP is not * supported as this is an optional page. */ softc->unmap_max_lba = 1; softc->unmap_max_ranges = 1; } } free(block_limits, M_SCSIDA); xpt_release_ccb(done_ccb); softc->state = DA_STATE_PROBE_BDC; xpt_schedule(periph, priority); return; } case DA_CCB_PROBE_BDC: { struct scsi_vpd_block_device_characteristics *bdc; bdc = (struct scsi_vpd_block_device_characteristics *) csio->data_ptr; if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) { uint32_t valid_len; /* * Disable queue sorting for non-rotational media * by default. */ u_int16_t old_rate = softc->disk->d_rotation_rate; valid_len = csio->dxfer_len - csio->resid; if (SBDC_IS_PRESENT(bdc, valid_len, medium_rotation_rate)) { softc->disk->d_rotation_rate = scsi_2btoul(bdc->medium_rotation_rate); if (softc->disk->d_rotation_rate == SVPD_BDC_RATE_NON_ROTATING) { cam_iosched_set_sort_queue( softc->cam_iosched, 0); softc->rotating = 0; } if (softc->disk->d_rotation_rate != old_rate) { disk_attr_changed(softc->disk, "GEOM::rotation_rate", M_NOWAIT); } } if ((SBDC_IS_PRESENT(bdc, valid_len, flags)) && (softc->zone_mode == DA_ZONE_NONE)) { int ata_proto; if (scsi_vpd_supported_page(periph, SVPD_ATA_INFORMATION)) ata_proto = 1; else ata_proto = 0; /* * The Zoned field will only be set for * Drive Managed and Host Aware drives. If * they are Host Managed, the device type * in the standard INQUIRY data should be * set to T_ZBC_HM (0x14). */ if ((bdc->flags & SVPD_ZBC_MASK) == SVPD_HAW_ZBC) { softc->zone_mode = DA_ZONE_HOST_AWARE; softc->zone_interface = (ata_proto) ? DA_ZONE_IF_ATA_SAT : DA_ZONE_IF_SCSI; } else if ((bdc->flags & SVPD_ZBC_MASK) == SVPD_DM_ZBC) { softc->zone_mode =DA_ZONE_DRIVE_MANAGED; softc->zone_interface = (ata_proto) ? 
DA_ZONE_IF_ATA_SAT : DA_ZONE_IF_SCSI; } else if ((bdc->flags & SVPD_ZBC_MASK) != SVPD_ZBC_NR) { xpt_print(periph->path, "Unknown zoned " "type %#x", bdc->flags & SVPD_ZBC_MASK); } } } else { int error; error = daerror(done_ccb, CAM_RETRY_SELTO, SF_RETRY_UA|SF_NO_PRINT); if (error == ERESTART) return; else if (error != 0) { if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) { /* Don't wedge this device's queue */ cam_release_devq(done_ccb->ccb_h.path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); } } } free(bdc, M_SCSIDA); xpt_release_ccb(done_ccb); softc->state = DA_STATE_PROBE_ATA; xpt_schedule(periph, priority); return; } case DA_CCB_PROBE_ATA: { int i; struct ata_params *ata_params; int continue_probe; int error; int16_t *ptr; ata_params = (struct ata_params *)csio->data_ptr; ptr = (uint16_t *)ata_params; continue_probe = 0; error = 0; if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) { uint16_t old_rate; for (i = 0; i < sizeof(*ata_params) / 2; i++) ptr[i] = le16toh(ptr[i]); if (ata_params->support_dsm & ATA_SUPPORT_DSM_TRIM && (softc->quirks & DA_Q_NO_UNMAP) == 0) { dadeleteflag(softc, DA_DELETE_ATA_TRIM, 1); if (ata_params->max_dsm_blocks != 0) softc->trim_max_ranges = min( softc->trim_max_ranges, ata_params->max_dsm_blocks * ATA_DSM_BLK_RANGES); } /* * Disable queue sorting for non-rotational media * by default. 
*/ old_rate = softc->disk->d_rotation_rate; softc->disk->d_rotation_rate = ata_params->media_rotation_rate; if (softc->disk->d_rotation_rate == ATA_RATE_NON_ROTATING) { cam_iosched_set_sort_queue(softc->cam_iosched, 0); softc->rotating = 0; } if (softc->disk->d_rotation_rate != old_rate) { disk_attr_changed(softc->disk, "GEOM::rotation_rate", M_NOWAIT); } if (ata_params->capabilities1 & ATA_SUPPORT_DMA) softc->flags |= DA_FLAG_CAN_ATA_DMA; if (ata_params->support.extension & ATA_SUPPORT_GENLOG) softc->flags |= DA_FLAG_CAN_ATA_LOG; /* * At this point, if we have a SATA host aware drive, * we communicate via ATA passthrough unless the * SAT layer supports ZBC -> ZAC translation. In * that case, */ /* * XXX KDM figure out how to detect a host managed * SATA drive. */ if (softc->zone_mode == DA_ZONE_NONE) { /* * Note that we don't override the zone * mode or interface if it has already been * set. This is because it has either been * set as a quirk, or when we probed the * SCSI Block Device Characteristics page, * the zoned field was set. The latter * means that the SAT layer supports ZBC to * ZAC translation, and we would prefer to * use that if it is available. 
*/ if ((ata_params->support3 & ATA_SUPPORT_ZONE_MASK) == ATA_SUPPORT_ZONE_HOST_AWARE) { softc->zone_mode = DA_ZONE_HOST_AWARE; softc->zone_interface = DA_ZONE_IF_ATA_PASS; } else if ((ata_params->support3 & ATA_SUPPORT_ZONE_MASK) == ATA_SUPPORT_ZONE_DEV_MANAGED) { softc->zone_mode =DA_ZONE_DRIVE_MANAGED; softc->zone_interface = DA_ZONE_IF_ATA_PASS; } } } else { error = daerror(done_ccb, CAM_RETRY_SELTO, SF_RETRY_UA|SF_NO_PRINT); if (error == ERESTART) return; else if (error != 0) { if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) { /* Don't wedge this device's queue */ cam_release_devq(done_ccb->ccb_h.path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); } } } free(ata_params, M_SCSIDA); if ((softc->zone_mode == DA_ZONE_HOST_AWARE) || (softc->zone_mode == DA_ZONE_HOST_MANAGED)) { /* * If the ATA IDENTIFY failed, we could be talking * to a SCSI drive, although that seems unlikely, * since the drive did report that it supported the * ATA Information VPD page. If the ATA IDENTIFY * succeeded, and the SAT layer doesn't support * ZBC -> ZAC translation, continue on to get the * directory of ATA logs, and complete the rest of * the ZAC probe. If the SAT layer does support * ZBC -> ZAC translation, we want to use that, * and we'll probe the SCSI Zoned Block Device * Characteristics VPD page next. 
*/ if ((error == 0) && (softc->flags & DA_FLAG_CAN_ATA_LOG) && (softc->zone_interface == DA_ZONE_IF_ATA_PASS)) softc->state = DA_STATE_PROBE_ATA_LOGDIR; else softc->state = DA_STATE_PROBE_ZONE; continue_probe = 1; } if (continue_probe != 0) { xpt_release_ccb(done_ccb); xpt_schedule(periph, priority); return; } else daprobedone(periph, done_ccb); return; } case DA_CCB_PROBE_ATA_LOGDIR: { int error; if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) { error = 0; softc->valid_logdir_len = 0; bzero(&softc->ata_logdir, sizeof(softc->ata_logdir)); softc->valid_logdir_len = csio->dxfer_len - csio->resid; if (softc->valid_logdir_len > 0) bcopy(csio->data_ptr, &softc->ata_logdir, min(softc->valid_logdir_len, sizeof(softc->ata_logdir))); /* * Figure out whether the Identify Device log is * supported. The General Purpose log directory * has a header, and lists the number of pages * available for each GP log identified by the * offset into the list. */ if ((softc->valid_logdir_len >= ((ATA_IDENTIFY_DATA_LOG + 1) * sizeof(uint16_t))) && (le16dec(softc->ata_logdir.header) == ATA_GP_LOG_DIR_VERSION) && (le16dec(&softc->ata_logdir.num_pages[ (ATA_IDENTIFY_DATA_LOG * sizeof(uint16_t)) - sizeof(uint16_t)]) > 0)){ softc->flags |= DA_FLAG_CAN_ATA_IDLOG; } else { softc->flags &= ~DA_FLAG_CAN_ATA_IDLOG; } } else { error = daerror(done_ccb, CAM_RETRY_SELTO, SF_RETRY_UA|SF_NO_PRINT); if (error == ERESTART) return; else if (error != 0) { /* * If we can't get the ATA log directory, * then ATA logs are effectively not * supported even if the bit is set in the * identify data. 
*/ softc->flags &= ~(DA_FLAG_CAN_ATA_LOG | DA_FLAG_CAN_ATA_IDLOG); if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) { /* Don't wedge this device's queue */ cam_release_devq(done_ccb->ccb_h.path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); } } } free(csio->data_ptr, M_SCSIDA); if ((error == 0) && (softc->flags & DA_FLAG_CAN_ATA_IDLOG)) { softc->state = DA_STATE_PROBE_ATA_IDDIR; xpt_release_ccb(done_ccb); xpt_schedule(periph, priority); return; } daprobedone(periph, done_ccb); return; } case DA_CCB_PROBE_ATA_IDDIR: { int error; if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) { off_t entries_offset, max_entries; error = 0; softc->valid_iddir_len = 0; bzero(&softc->ata_iddir, sizeof(softc->ata_iddir)); softc->flags &= ~(DA_FLAG_CAN_ATA_SUPCAP | DA_FLAG_CAN_ATA_ZONE); softc->valid_iddir_len = csio->dxfer_len - csio->resid; if (softc->valid_iddir_len > 0) bcopy(csio->data_ptr, &softc->ata_iddir, min(softc->valid_iddir_len, sizeof(softc->ata_iddir))); entries_offset = __offsetof(struct ata_identify_log_pages,entries); max_entries = softc->valid_iddir_len - entries_offset; if ((softc->valid_iddir_len > (entries_offset + 1)) && (le64dec(softc->ata_iddir.header) == ATA_IDLOG_REVISION) && (softc->ata_iddir.entry_count > 0)) { int num_entries, i; num_entries = softc->ata_iddir.entry_count; num_entries = min(num_entries, softc->valid_iddir_len - entries_offset); for (i = 0; i < num_entries && i < max_entries; i++) { if (softc->ata_iddir.entries[i] == ATA_IDL_SUP_CAP) softc->flags |= DA_FLAG_CAN_ATA_SUPCAP; else if (softc->ata_iddir.entries[i]== ATA_IDL_ZDI) softc->flags |= DA_FLAG_CAN_ATA_ZONE; if ((softc->flags & DA_FLAG_CAN_ATA_SUPCAP) && (softc->flags & DA_FLAG_CAN_ATA_ZONE)) break; } } } else { error = daerror(done_ccb, CAM_RETRY_SELTO, SF_RETRY_UA|SF_NO_PRINT); if (error == ERESTART) return; else if (error != 0) { /* * If we can't get the ATA Identify Data log * directory, then it effectively isn't * supported even if the ATA Log 
directory * a non-zero number of pages present for * this log. */ softc->flags &= ~DA_FLAG_CAN_ATA_IDLOG; if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) { /* Don't wedge this device's queue */ cam_release_devq(done_ccb->ccb_h.path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); } } } free(csio->data_ptr, M_SCSIDA); if ((error == 0) && (softc->flags & DA_FLAG_CAN_ATA_SUPCAP)) { softc->state = DA_STATE_PROBE_ATA_SUP; xpt_release_ccb(done_ccb); xpt_schedule(periph, priority); return; } daprobedone(periph, done_ccb); return; } case DA_CCB_PROBE_ATA_SUP: { int error; if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) { uint32_t valid_len; size_t needed_size; struct ata_identify_log_sup_cap *sup_cap; error = 0; sup_cap = (struct ata_identify_log_sup_cap *) csio->data_ptr; valid_len = csio->dxfer_len - csio->resid; needed_size = __offsetof(struct ata_identify_log_sup_cap, sup_zac_cap) + 1 + sizeof(sup_cap->sup_zac_cap); if (valid_len >= needed_size) { uint64_t zoned, zac_cap; zoned = le64dec(sup_cap->zoned_cap); if (zoned & ATA_ZONED_VALID) { /* * This should have already been * set, because this is also in the * ATA identify data. */ if ((zoned & ATA_ZONED_MASK) == ATA_SUPPORT_ZONE_HOST_AWARE) softc->zone_mode = DA_ZONE_HOST_AWARE; else if ((zoned & ATA_ZONED_MASK) == ATA_SUPPORT_ZONE_DEV_MANAGED) softc->zone_mode = DA_ZONE_DRIVE_MANAGED; } zac_cap = le64dec(sup_cap->sup_zac_cap); if (zac_cap & ATA_SUP_ZAC_CAP_VALID) { if (zac_cap & ATA_REPORT_ZONES_SUP) softc->zone_flags |= DA_ZONE_FLAG_RZ_SUP; if (zac_cap & ATA_ND_OPEN_ZONE_SUP) softc->zone_flags |= DA_ZONE_FLAG_OPEN_SUP; if (zac_cap & ATA_ND_CLOSE_ZONE_SUP) softc->zone_flags |= DA_ZONE_FLAG_CLOSE_SUP; if (zac_cap & ATA_ND_FINISH_ZONE_SUP) softc->zone_flags |= DA_ZONE_FLAG_FINISH_SUP; if (zac_cap & ATA_ND_RWP_SUP) softc->zone_flags |= DA_ZONE_FLAG_RWP_SUP; } else { /* * This field was introduced in * ACS-4, r08 on April 28th, 2015. 
* If the drive firmware was written * to an earlier spec, it won't have * the field. So, assume all * commands are supported. */ softc->zone_flags |= DA_ZONE_FLAG_SUP_MASK; } } } else { error = daerror(done_ccb, CAM_RETRY_SELTO, SF_RETRY_UA|SF_NO_PRINT); if (error == ERESTART) return; else if (error != 0) { /* * If we can't get the ATA Identify Data * Supported Capabilities page, clear the * flag... */ softc->flags &= ~DA_FLAG_CAN_ATA_SUPCAP; /* * And clear zone capabilities. */ softc->zone_flags &= ~DA_ZONE_FLAG_SUP_MASK; if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) { /* Don't wedge this device's queue */ cam_release_devq(done_ccb->ccb_h.path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); } } } free(csio->data_ptr, M_SCSIDA); if ((error == 0) && (softc->flags & DA_FLAG_CAN_ATA_ZONE)) { softc->state = DA_STATE_PROBE_ATA_ZONE; xpt_release_ccb(done_ccb); xpt_schedule(periph, priority); return; } daprobedone(periph, done_ccb); return; } case DA_CCB_PROBE_ATA_ZONE: { int error; if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) { struct ata_zoned_info_log *zi_log; uint32_t valid_len; size_t needed_size; zi_log = (struct ata_zoned_info_log *)csio->data_ptr; valid_len = csio->dxfer_len - csio->resid; needed_size = __offsetof(struct ata_zoned_info_log, version_info) + 1 + sizeof(zi_log->version_info); if (valid_len >= needed_size) { uint64_t tmpvar; tmpvar = le64dec(zi_log->zoned_cap); if (tmpvar & ATA_ZDI_CAP_VALID) { if (tmpvar & ATA_ZDI_CAP_URSWRZ) softc->zone_flags |= DA_ZONE_FLAG_URSWRZ; else softc->zone_flags &= ~DA_ZONE_FLAG_URSWRZ; } tmpvar = le64dec(zi_log->optimal_seq_zones); if (tmpvar & ATA_ZDI_OPT_SEQ_VALID) { softc->zone_flags |= DA_ZONE_FLAG_OPT_SEQ_SET; softc->optimal_seq_zones = (tmpvar & ATA_ZDI_OPT_SEQ_MASK); } else { softc->zone_flags &= ~DA_ZONE_FLAG_OPT_SEQ_SET; softc->optimal_seq_zones = 0; } tmpvar =le64dec(zi_log->optimal_nonseq_zones); if (tmpvar & ATA_ZDI_OPT_NS_VALID) { softc->zone_flags |= 
DA_ZONE_FLAG_OPT_NONSEQ_SET; softc->optimal_nonseq_zones = (tmpvar & ATA_ZDI_OPT_NS_MASK); } else { softc->zone_flags &= ~DA_ZONE_FLAG_OPT_NONSEQ_SET; softc->optimal_nonseq_zones = 0; } tmpvar = le64dec(zi_log->max_seq_req_zones); if (tmpvar & ATA_ZDI_MAX_SEQ_VALID) { softc->zone_flags |= DA_ZONE_FLAG_MAX_SEQ_SET; softc->max_seq_zones = (tmpvar & ATA_ZDI_MAX_SEQ_MASK); } else { softc->zone_flags &= ~DA_ZONE_FLAG_MAX_SEQ_SET; softc->max_seq_zones = 0; } } } else { error = daerror(done_ccb, CAM_RETRY_SELTO, SF_RETRY_UA|SF_NO_PRINT); if (error == ERESTART) return; else if (error != 0) { softc->flags &= ~DA_FLAG_CAN_ATA_ZONE; softc->flags &= ~DA_ZONE_FLAG_SET_MASK; if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) { /* Don't wedge this device's queue */ cam_release_devq(done_ccb->ccb_h.path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); } } } free(csio->data_ptr, M_SCSIDA); daprobedone(periph, done_ccb); return; } case DA_CCB_PROBE_ZONE: { int error; if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) { uint32_t valid_len; size_t needed_len; struct scsi_vpd_zoned_bdc *zoned_bdc; error = 0; zoned_bdc = (struct scsi_vpd_zoned_bdc *) csio->data_ptr; valid_len = csio->dxfer_len - csio->resid; needed_len = __offsetof(struct scsi_vpd_zoned_bdc, max_seq_req_zones) + 1 + sizeof(zoned_bdc->max_seq_req_zones); if ((valid_len >= needed_len) && (scsi_2btoul(zoned_bdc->page_length) >= SVPD_ZBDC_PL)) { if (zoned_bdc->flags & SVPD_ZBDC_URSWRZ) softc->zone_flags |= DA_ZONE_FLAG_URSWRZ; else softc->zone_flags &= ~DA_ZONE_FLAG_URSWRZ; softc->optimal_seq_zones = scsi_4btoul(zoned_bdc->optimal_seq_zones); softc->zone_flags |= DA_ZONE_FLAG_OPT_SEQ_SET; softc->optimal_nonseq_zones = scsi_4btoul( zoned_bdc->optimal_nonseq_zones); softc->zone_flags |= DA_ZONE_FLAG_OPT_NONSEQ_SET; softc->max_seq_zones = scsi_4btoul(zoned_bdc->max_seq_req_zones); softc->zone_flags |= DA_ZONE_FLAG_MAX_SEQ_SET; } /* * All of the zone commands are mandatory for SCSI * devices. 
* * XXX KDM this is valid as of September 2015. * Re-check this assumption once the SAT spec is * updated to support SCSI ZBC to ATA ZAC mapping. * Since ATA allows zone commands to be reported * as supported or not, this may not necessarily * be true for an ATA device behind a SAT (SCSI to * ATA Translation) layer. */ softc->zone_flags |= DA_ZONE_FLAG_SUP_MASK; } else { error = daerror(done_ccb, CAM_RETRY_SELTO, SF_RETRY_UA|SF_NO_PRINT); if (error == ERESTART) return; else if (error != 0) { if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) { /* Don't wedge this device's queue */ cam_release_devq(done_ccb->ccb_h.path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); } } } daprobedone(periph, done_ccb); return; } case DA_CCB_DUMP: /* No-op. We're polling */ return; case DA_CCB_TUR: { if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { if (daerror(done_ccb, CAM_RETRY_SELTO, SF_RETRY_UA | SF_NO_RECOVERY | SF_NO_PRINT) == ERESTART) return; if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) cam_release_devq(done_ccb->ccb_h.path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); } xpt_release_ccb(done_ccb); cam_periph_release_locked(periph); return; } default: break; } xpt_release_ccb(done_ccb); } static void dareprobe(struct cam_periph *periph) { struct da_softc *softc; cam_status status; softc = (struct da_softc *)periph->softc; /* Probe in progress; don't interfere. 
*/ if (softc->state != DA_STATE_NORMAL) return; status = cam_periph_acquire(periph); KASSERT(status == CAM_REQ_CMP, ("dareprobe: cam_periph_acquire failed")); if (softc->flags & DA_FLAG_CAN_RC16) softc->state = DA_STATE_PROBE_RC16; else softc->state = DA_STATE_PROBE_RC; xpt_schedule(periph, CAM_PRIORITY_DEV); } static int daerror(union ccb *ccb, u_int32_t cam_flags, u_int32_t sense_flags) { struct da_softc *softc; struct cam_periph *periph; int error, error_code, sense_key, asc, ascq; + +#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) + if (ccb->csio.bio != NULL) + biotrack(ccb->csio.bio, __func__); +#endif periph = xpt_path_periph(ccb->ccb_h.path); softc = (struct da_softc *)periph->softc; /* * Automatically detect devices that do not support * READ(6)/WRITE(6) and upgrade to using 10 byte cdbs. */ error = 0; if ((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_INVALID) { error = cmd6workaround(ccb); } else if (scsi_extract_sense_ccb(ccb, &error_code, &sense_key, &asc, &ascq)) { if (sense_key == SSD_KEY_ILLEGAL_REQUEST) error = cmd6workaround(ccb); /* * If the target replied with CAPACITY DATA HAS CHANGED UA, * query the capacity and notify upper layers. 
*/ else if (sense_key == SSD_KEY_UNIT_ATTENTION && asc == 0x2A && ascq == 0x09) { xpt_print(periph->path, "Capacity data has changed\n"); softc->flags &= ~DA_FLAG_PROBED; dareprobe(periph); sense_flags |= SF_NO_PRINT; } else if (sense_key == SSD_KEY_UNIT_ATTENTION && asc == 0x28 && ascq == 0x00) { softc->flags &= ~DA_FLAG_PROBED; disk_media_changed(softc->disk, M_NOWAIT); } else if (sense_key == SSD_KEY_UNIT_ATTENTION && asc == 0x3F && ascq == 0x03) { xpt_print(periph->path, "INQUIRY data has changed\n"); softc->flags &= ~DA_FLAG_PROBED; dareprobe(periph); sense_flags |= SF_NO_PRINT; } else if (sense_key == SSD_KEY_NOT_READY && asc == 0x3a && (softc->flags & DA_FLAG_PACK_INVALID) == 0) { softc->flags |= DA_FLAG_PACK_INVALID; disk_media_gone(softc->disk, M_NOWAIT); } } if (error == ERESTART) return (ERESTART); #ifdef CAM_IO_STATS switch (ccb->ccb_h.status & CAM_STATUS_MASK) { case CAM_CMD_TIMEOUT: softc->timeouts++; break; case CAM_REQ_ABORTED: case CAM_REQ_CMP_ERR: case CAM_REQ_TERMIO: case CAM_UNREC_HBA_ERROR: case CAM_DATA_RUN_ERR: softc->errors++; break; default: break; } #endif /* * XXX * Until we have a better way of doing pack validation, * don't treat UAs as errors. 
*/ sense_flags |= SF_RETRY_UA; if (softc->quirks & DA_Q_RETRY_BUSY) sense_flags |= SF_RETRY_BUSY; return(cam_periph_error(ccb, cam_flags, sense_flags, &softc->saved_ccb)); } static void damediapoll(void *arg) { struct cam_periph *periph = arg; struct da_softc *softc = periph->softc; if (!cam_iosched_has_work_flags(softc->cam_iosched, DA_WORK_TUR) && LIST_EMPTY(&softc->pending_ccbs)) { if (cam_periph_acquire(periph) == CAM_REQ_CMP) { cam_iosched_set_work_flags(softc->cam_iosched, DA_WORK_TUR); daschedule(periph); } } /* Queue us up again */ if (da_poll_period != 0) callout_schedule(&softc->mediapoll_c, da_poll_period * hz); } static void daprevent(struct cam_periph *periph, int action) { struct da_softc *softc; union ccb *ccb; int error; softc = (struct da_softc *)periph->softc; if (((action == PR_ALLOW) && (softc->flags & DA_FLAG_PACK_LOCKED) == 0) || ((action == PR_PREVENT) && (softc->flags & DA_FLAG_PACK_LOCKED) != 0)) { return; } ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL); scsi_prevent(&ccb->csio, /*retries*/1, /*cbcfp*/dadone, MSG_SIMPLE_Q_TAG, action, SSD_FULL_SIZE, 5000); error = cam_periph_runccb(ccb, daerror, CAM_RETRY_SELTO, SF_RETRY_UA | SF_NO_PRINT, softc->disk->d_devstat); if (error == 0) { if (action == PR_ALLOW) softc->flags &= ~DA_FLAG_PACK_LOCKED; else softc->flags |= DA_FLAG_PACK_LOCKED; } xpt_release_ccb(ccb); } static void dasetgeom(struct cam_periph *periph, uint32_t block_len, uint64_t maxsector, struct scsi_read_capacity_data_long *rcaplong, size_t rcap_len) { struct ccb_calc_geometry ccg; struct da_softc *softc; struct disk_params *dp; u_int lbppbe, lalba; int error; softc = (struct da_softc *)periph->softc; dp = &softc->params; dp->secsize = block_len; dp->sectors = maxsector + 1; if (rcaplong != NULL) { lbppbe = rcaplong->prot_lbppbe & SRC16_LBPPBE; lalba = scsi_2btoul(rcaplong->lalba_lbp); lalba &= SRC16_LALBA_A; } else { lbppbe = 0; lalba = 0; } if (lbppbe > 0) { dp->stripesize = block_len << lbppbe; dp->stripeoffset = 
(dp->stripesize - block_len * lalba) % dp->stripesize; } else if (softc->quirks & DA_Q_4K) { dp->stripesize = 4096; dp->stripeoffset = 0; } else { dp->stripesize = 0; dp->stripeoffset = 0; } /* * Have the controller provide us with a geometry * for this disk. The only time the geometry * matters is when we boot and the controller * is the only one knowledgeable enough to come * up with something that will make this a bootable * device. */ xpt_setup_ccb(&ccg.ccb_h, periph->path, CAM_PRIORITY_NORMAL); ccg.ccb_h.func_code = XPT_CALC_GEOMETRY; ccg.block_size = dp->secsize; ccg.volume_size = dp->sectors; ccg.heads = 0; ccg.secs_per_track = 0; ccg.cylinders = 0; xpt_action((union ccb*)&ccg); if ((ccg.ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { /* * We don't know what went wrong here- but just pick * a geometry so we don't have nasty things like divide * by zero. */ dp->heads = 255; dp->secs_per_track = 255; dp->cylinders = dp->sectors / (255 * 255); if (dp->cylinders == 0) { dp->cylinders = 1; } } else { dp->heads = ccg.heads; dp->secs_per_track = ccg.secs_per_track; dp->cylinders = ccg.cylinders; } /* * If the user supplied a read capacity buffer, and if it is * different than the previous buffer, update the data in the EDT. * If it's the same, we don't bother. This avoids sending an * update every time someone opens this device. 
*/ if ((rcaplong != NULL) && (bcmp(rcaplong, &softc->rcaplong, min(sizeof(softc->rcaplong), rcap_len)) != 0)) { struct ccb_dev_advinfo cdai; xpt_setup_ccb(&cdai.ccb_h, periph->path, CAM_PRIORITY_NORMAL); cdai.ccb_h.func_code = XPT_DEV_ADVINFO; cdai.buftype = CDAI_TYPE_RCAPLONG; cdai.flags = CDAI_FLAG_STORE; cdai.bufsiz = rcap_len; cdai.buf = (uint8_t *)rcaplong; xpt_action((union ccb *)&cdai); if ((cdai.ccb_h.status & CAM_DEV_QFRZN) != 0) cam_release_devq(cdai.ccb_h.path, 0, 0, 0, FALSE); if (cdai.ccb_h.status != CAM_REQ_CMP) { xpt_print(periph->path, "%s: failed to set read " "capacity advinfo\n", __func__); /* Use cam_error_print() to decode the status */ cam_error_print((union ccb *)&cdai, CAM_ESF_CAM_STATUS, CAM_EPF_ALL); } else { bcopy(rcaplong, &softc->rcaplong, min(sizeof(softc->rcaplong), rcap_len)); } } softc->disk->d_sectorsize = softc->params.secsize; softc->disk->d_mediasize = softc->params.secsize * (off_t)softc->params.sectors; softc->disk->d_stripesize = softc->params.stripesize; softc->disk->d_stripeoffset = softc->params.stripeoffset; /* XXX: these are not actually "firmware" values, so they may be wrong */ softc->disk->d_fwsectors = softc->params.secs_per_track; softc->disk->d_fwheads = softc->params.heads; softc->disk->d_devstat->block_size = softc->params.secsize; softc->disk->d_devstat->flags &= ~DEVSTAT_BS_UNAVAILABLE; error = disk_resize(softc->disk, M_NOWAIT); if (error != 0) xpt_print(periph->path, "disk_resize(9) failed, error = %d\n", error); } static void dasendorderedtag(void *arg) { struct da_softc *softc = arg; if (da_send_ordered) { if (!LIST_EMPTY(&softc->pending_ccbs)) { if ((softc->flags & DA_FLAG_WAS_OTAG) == 0) softc->flags |= DA_FLAG_NEED_OTAG; softc->flags &= ~DA_FLAG_WAS_OTAG; } } /* Queue us up again */ callout_reset(&softc->sendordered_c, (da_default_timeout * hz) / DA_ORDEREDTAG_INTERVAL, dasendorderedtag, softc); } /* * Step through all DA peripheral drivers, and if the device is still open, * sync the disk cache to 
physical media. */ static void dashutdown(void * arg, int howto) { struct cam_periph *periph; struct da_softc *softc; union ccb *ccb; int error; CAM_PERIPH_FOREACH(periph, &dadriver) { softc = (struct da_softc *)periph->softc; if (SCHEDULER_STOPPED()) { /* If we paniced with the lock held, do not recurse. */ if (!cam_periph_owned(periph) && (softc->flags & DA_FLAG_OPEN)) { dadump(softc->disk, NULL, 0, 0, 0); } continue; } cam_periph_lock(periph); /* * We only sync the cache if the drive is still open, and * if the drive is capable of it.. */ if (((softc->flags & DA_FLAG_OPEN) == 0) || (softc->quirks & DA_Q_NO_SYNC_CACHE)) { cam_periph_unlock(periph); continue; } ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL); scsi_synchronize_cache(&ccb->csio, /*retries*/0, /*cbfcnp*/dadone, MSG_SIMPLE_Q_TAG, /*begin_lba*/0, /* whole disk */ /*lb_count*/0, SSD_FULL_SIZE, 60 * 60 * 1000); error = cam_periph_runccb(ccb, daerror, /*cam_flags*/0, /*sense_flags*/ SF_NO_RECOVERY | SF_NO_RETRY | SF_QUIET_IR, softc->disk->d_devstat); if (error != 0) xpt_print(periph->path, "Synchronize cache failed\n"); xpt_release_ccb(ccb); cam_periph_unlock(periph); } } #else /* !_KERNEL */ /* * XXX These are only left out of the kernel build to silence warnings. If, * for some reason these functions are used in the kernel, the ifdefs should * be moved so they are included both in the kernel and userland. */ void scsi_format_unit(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t byte2, u_int16_t ileave, u_int8_t *data_ptr, u_int32_t dxfer_len, u_int8_t sense_len, u_int32_t timeout) { struct scsi_format_unit *scsi_cmd; scsi_cmd = (struct scsi_format_unit *)&csio->cdb_io.cdb_bytes; scsi_cmd->opcode = FORMAT_UNIT; scsi_cmd->byte2 = byte2; scsi_ulto2b(ileave, scsi_cmd->interleave); cam_fill_csio(csio, retries, cbfcnp, /*flags*/ (dxfer_len > 0) ? 
CAM_DIR_OUT : CAM_DIR_NONE, tag_action, data_ptr, dxfer_len, sense_len, sizeof(*scsi_cmd), timeout); } void scsi_read_defects(struct ccb_scsiio *csio, uint32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, uint8_t list_format, uint32_t addr_desc_index, uint8_t *data_ptr, uint32_t dxfer_len, int minimum_cmd_size, uint8_t sense_len, uint32_t timeout) { uint8_t cdb_len; /* * These conditions allow using the 10 byte command. Otherwise we * need to use the 12 byte command. */ if ((minimum_cmd_size <= 10) && (addr_desc_index == 0) && (dxfer_len <= SRDD10_MAX_LENGTH)) { struct scsi_read_defect_data_10 *cdb10; cdb10 = (struct scsi_read_defect_data_10 *) &csio->cdb_io.cdb_bytes; cdb_len = sizeof(*cdb10); bzero(cdb10, cdb_len); cdb10->opcode = READ_DEFECT_DATA_10; cdb10->format = list_format; scsi_ulto2b(dxfer_len, cdb10->alloc_length); } else { struct scsi_read_defect_data_12 *cdb12; cdb12 = (struct scsi_read_defect_data_12 *) &csio->cdb_io.cdb_bytes; cdb_len = sizeof(*cdb12); bzero(cdb12, cdb_len); cdb12->opcode = READ_DEFECT_DATA_12; cdb12->format = list_format; scsi_ulto4b(dxfer_len, cdb12->alloc_length); scsi_ulto4b(addr_desc_index, cdb12->address_descriptor_index); } cam_fill_csio(csio, retries, cbfcnp, /*flags*/ CAM_DIR_IN, tag_action, data_ptr, dxfer_len, sense_len, cdb_len, timeout); } void scsi_sanitize(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t byte2, u_int16_t control, u_int8_t *data_ptr, u_int32_t dxfer_len, u_int8_t sense_len, u_int32_t timeout) { struct scsi_sanitize *scsi_cmd; scsi_cmd = (struct scsi_sanitize *)&csio->cdb_io.cdb_bytes; scsi_cmd->opcode = SANITIZE; scsi_cmd->byte2 = byte2; scsi_cmd->control = control; scsi_ulto2b(dxfer_len, scsi_cmd->length); cam_fill_csio(csio, retries, cbfcnp, /*flags*/ (dxfer_len > 0) ? 
CAM_DIR_OUT : CAM_DIR_NONE, tag_action, data_ptr, dxfer_len, sense_len, sizeof(*scsi_cmd), timeout); } #endif /* _KERNEL */ void scsi_zbc_out(struct ccb_scsiio *csio, uint32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, uint8_t service_action, uint64_t zone_id, uint8_t zone_flags, uint8_t *data_ptr, uint32_t dxfer_len, uint8_t sense_len, uint32_t timeout) { struct scsi_zbc_out *scsi_cmd; scsi_cmd = (struct scsi_zbc_out *)&csio->cdb_io.cdb_bytes; scsi_cmd->opcode = ZBC_OUT; scsi_cmd->service_action = service_action; scsi_u64to8b(zone_id, scsi_cmd->zone_id); scsi_cmd->zone_flags = zone_flags; cam_fill_csio(csio, retries, cbfcnp, /*flags*/ (dxfer_len > 0) ? CAM_DIR_OUT : CAM_DIR_NONE, tag_action, data_ptr, dxfer_len, sense_len, sizeof(*scsi_cmd), timeout); } void scsi_zbc_in(struct ccb_scsiio *csio, uint32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, uint8_t service_action, uint64_t zone_start_lba, uint8_t zone_options, uint8_t *data_ptr, uint32_t dxfer_len, uint8_t sense_len, uint32_t timeout) { struct scsi_zbc_in *scsi_cmd; scsi_cmd = (struct scsi_zbc_in *)&csio->cdb_io.cdb_bytes; scsi_cmd->opcode = ZBC_IN; scsi_cmd->service_action = service_action; scsi_u64to8b(zone_start_lba, scsi_cmd->zone_start_lba); scsi_cmd->zone_options = zone_options; cam_fill_csio(csio, retries, cbfcnp, /*flags*/ (dxfer_len > 0) ? 
CAM_DIR_IN : CAM_DIR_NONE, tag_action, data_ptr, dxfer_len, sense_len, sizeof(*scsi_cmd), timeout); } int scsi_ata_zac_mgmt_out(struct ccb_scsiio *csio, uint32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, int use_ncq, uint8_t zm_action, uint64_t zone_id, uint8_t zone_flags, uint8_t *data_ptr, uint32_t dxfer_len, uint8_t *cdb_storage, size_t cdb_storage_len, uint8_t sense_len, uint32_t timeout) { uint8_t command_out, protocol, ata_flags; uint16_t features_out; uint32_t sectors_out, auxiliary; int retval; retval = 0; if (use_ncq == 0) { command_out = ATA_ZAC_MANAGEMENT_OUT; features_out = (zm_action & 0xf) | (zone_flags << 8); ata_flags = AP_FLAG_BYT_BLOK_BLOCKS; if (dxfer_len == 0) { protocol = AP_PROTO_NON_DATA; ata_flags |= AP_FLAG_TLEN_NO_DATA; sectors_out = 0; } else { protocol = AP_PROTO_DMA; ata_flags |= AP_FLAG_TLEN_SECT_CNT | AP_FLAG_TDIR_TO_DEV; sectors_out = ((dxfer_len >> 9) & 0xffff); } auxiliary = 0; } else { ata_flags = AP_FLAG_BYT_BLOK_BLOCKS; if (dxfer_len == 0) { command_out = ATA_NCQ_NON_DATA; features_out = ATA_NCQ_ZAC_MGMT_OUT; /* * We're assuming the SCSI to ATA translation layer * will set the NCQ tag number in the tag field. * That isn't clear from the SAT-4 spec (as of rev 05). */ sectors_out = 0; ata_flags |= AP_FLAG_TLEN_NO_DATA; } else { command_out = ATA_SEND_FPDMA_QUEUED; /* * Note that we're defaulting to normal priority, * and assuming that the SCSI to ATA translation * layer will insert the NCQ tag number in the tag * field. That isn't clear in the SAT-4 spec (as * of rev 05). */ sectors_out = ATA_SFPDMA_ZAC_MGMT_OUT << 8; ata_flags |= AP_FLAG_TLEN_FEAT | AP_FLAG_TDIR_TO_DEV; /* * For SEND FPDMA QUEUED, the transfer length is * encoded in the FEATURE register, and 0 means * that 65536 512 byte blocks are to be tranferred. * In practice, it seems unlikely that we'll see * a transfer that large, and it may confuse the * the SAT layer, because generally that means that * 0 bytes should be transferred. 
*/ if (dxfer_len == (65536 * 512)) { features_out = 0; } else if (dxfer_len <= (65535 * 512)) { features_out = ((dxfer_len >> 9) & 0xffff); } else { /* The transfer is too big. */ retval = 1; goto bailout; } } auxiliary = (zm_action & 0xf) | (zone_flags << 8); protocol = AP_PROTO_FPDMA; } protocol |= AP_EXTEND; retval = scsi_ata_pass(csio, retries, cbfcnp, /*flags*/ (dxfer_len > 0) ? CAM_DIR_OUT : CAM_DIR_NONE, tag_action, /*protocol*/ protocol, /*ata_flags*/ ata_flags, /*features*/ features_out, /*sector_count*/ sectors_out, /*lba*/ zone_id, /*command*/ command_out, /*device*/ 0, /*icc*/ 0, /*auxiliary*/ auxiliary, /*control*/ 0, /*data_ptr*/ data_ptr, /*dxfer_len*/ dxfer_len, /*cdb_storage*/ cdb_storage, /*cdb_storage_len*/ cdb_storage_len, /*minimum_cmd_size*/ 0, /*sense_len*/ SSD_FULL_SIZE, /*timeout*/ timeout); bailout: return (retval); } int scsi_ata_zac_mgmt_in(struct ccb_scsiio *csio, uint32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, int use_ncq, uint8_t zm_action, uint64_t zone_id, uint8_t zone_flags, uint8_t *data_ptr, uint32_t dxfer_len, uint8_t *cdb_storage, size_t cdb_storage_len, uint8_t sense_len, uint32_t timeout) { uint8_t command_out, protocol; uint16_t features_out, sectors_out; uint32_t auxiliary; int ata_flags; int retval; retval = 0; ata_flags = AP_FLAG_TDIR_FROM_DEV | AP_FLAG_BYT_BLOK_BLOCKS; if (use_ncq == 0) { command_out = ATA_ZAC_MANAGEMENT_IN; /* XXX KDM put a macro here */ features_out = (zm_action & 0xf) | (zone_flags << 8); sectors_out = dxfer_len >> 9; /* XXX KDM macro */ protocol = AP_PROTO_DMA; ata_flags |= AP_FLAG_TLEN_SECT_CNT; auxiliary = 0; } else { ata_flags |= AP_FLAG_TLEN_FEAT; command_out = ATA_RECV_FPDMA_QUEUED; sectors_out = ATA_RFPDMA_ZAC_MGMT_IN << 8; /* * For RECEIVE FPDMA QUEUED, the transfer length is * encoded in the FEATURE register, and 0 means * that 65536 512 byte blocks are to be tranferred. 
* In practice, it seems unlikely that we'll see * a transfer that large, and it may confuse the * the SAT layer, because generally that means that * 0 bytes should be transferred. */ if (dxfer_len == (65536 * 512)) { features_out = 0; } else if (dxfer_len <= (65535 * 512)) { features_out = ((dxfer_len >> 9) & 0xffff); } else { /* The transfer is too big. */ retval = 1; goto bailout; } auxiliary = (zm_action & 0xf) | (zone_flags << 8), protocol = AP_PROTO_FPDMA; } protocol |= AP_EXTEND; retval = scsi_ata_pass(csio, retries, cbfcnp, /*flags*/ CAM_DIR_IN, tag_action, /*protocol*/ protocol, /*ata_flags*/ ata_flags, /*features*/ features_out, /*sector_count*/ sectors_out, /*lba*/ zone_id, /*command*/ command_out, /*device*/ 0, /*icc*/ 0, /*auxiliary*/ auxiliary, /*control*/ 0, /*data_ptr*/ data_ptr, /*dxfer_len*/ (dxfer_len >> 9) * 512, /* XXX KDM */ /*cdb_storage*/ cdb_storage, /*cdb_storage_len*/ cdb_storage_len, /*minimum_cmd_size*/ 0, /*sense_len*/ SSD_FULL_SIZE, /*timeout*/ timeout); bailout: return (retval); } Index: head/sys/conf/options =================================================================== --- head/sys/conf/options (revision 308154) +++ head/sys/conf/options (revision 308155) @@ -1,996 +1,998 @@ # $FreeBSD$ # # On the handling of kernel options # # All kernel options should be listed in NOTES, with suitable # descriptions. Negative options (options that make some code not # compile) should be commented out; LINT (generated from NOTES) should # compile as much code as possible. Try to structure option-using # code so that a single option only switch code on, or only switch # code off, to make it possible to have a full compile-test. If # necessary, you can check for COMPILING_LINT to get maximum code # coverage. # # All new options shall also be listed in either "conf/options" or # "conf/options.". 
Options that affect a single source-file # <xxx>.[c|s] should be directed into "opt_<xxx>.h", while options # that affect multiple files should either go in "opt_global.h" if # this is a kernel-wide option (used just about everywhere), or in # "opt_<option-name-in-lower-case>.h" if it affects only some files. # Note that the effect of listing only an option without a # header-file-name in conf/options (and cousins) is that the last # convention is followed. # # This handling scheme is not yet fully implemented. # # # Format of this file: # Option name filename # # If filename is missing, the default is # opt_<name-of-option-in-lower-case>.h AAC_DEBUG opt_aac.h AACRAID_DEBUG opt_aacraid.h AHC_ALLOW_MEMIO opt_aic7xxx.h AHC_TMODE_ENABLE opt_aic7xxx.h AHC_DUMP_EEPROM opt_aic7xxx.h AHC_DEBUG opt_aic7xxx.h AHC_DEBUG_OPTS opt_aic7xxx.h AHC_REG_PRETTY_PRINT opt_aic7xxx.h AHD_DEBUG opt_aic79xx.h AHD_DEBUG_OPTS opt_aic79xx.h AHD_TMODE_ENABLE opt_aic79xx.h AHD_REG_PRETTY_PRINT opt_aic79xx.h ADW_ALLOW_MEMIO opt_adw.h TWA_DEBUG opt_twa.h TWA_FLASH_FIRMWARE opt_twa.h # Debugging options. ALT_BREAK_TO_DEBUGGER opt_kdb.h BREAK_TO_DEBUGGER opt_kdb.h +BUF_TRACKING opt_global.h DDB DDB_BUFR_SIZE opt_ddb.h DDB_CAPTURE_DEFAULTBUFSIZE opt_ddb.h DDB_CAPTURE_MAXBUFSIZE opt_ddb.h DDB_CTF opt_ddb.h DDB_NUMSYM opt_ddb.h +FULL_BUF_TRACKING opt_global.h GDB KDB opt_global.h KDB_TRACE opt_kdb.h KDB_UNATTENDED opt_kdb.h KLD_DEBUG opt_kld.h SYSCTL_DEBUG opt_sysctl.h EARLY_PRINTF opt_global.h TEXTDUMP_PREFERRED opt_ddb.h TEXTDUMP_VERBOSE opt_ddb.h NUM_CORE_FILES opt_global.h # Miscellaneous options.
ADAPTIVE_LOCKMGRS ALQ ALTERA_SDCARD_FAST_SIM opt_altera_sdcard.h ATSE_CFI_HACK opt_cfi.h AUDIT opt_global.h BOOTHOWTO opt_global.h BOOTVERBOSE opt_global.h CALLOUT_PROFILING CAPABILITIES opt_capsicum.h CAPABILITY_MODE opt_capsicum.h COMPAT_43 opt_compat.h COMPAT_43TTY opt_compat.h COMPAT_FREEBSD4 opt_compat.h COMPAT_FREEBSD5 opt_compat.h COMPAT_FREEBSD6 opt_compat.h COMPAT_FREEBSD7 opt_compat.h COMPAT_FREEBSD9 opt_compat.h COMPAT_FREEBSD10 opt_compat.h COMPAT_CLOUDABI32 opt_dontuse.h COMPAT_CLOUDABI64 opt_dontuse.h COMPAT_LINUXKPI opt_compat.h COMPILING_LINT opt_global.h CY_PCI_FASTINTR DEADLKRES opt_watchdog.h DEVICE_NUMA EXT_RESOURCES opt_global.h DIRECTIO FILEMON opt_dontuse.h FFCLOCK FULL_PREEMPTION opt_sched.h GZIO opt_gzio.h IMAGACT_BINMISC opt_dontuse.h IPI_PREEMPTION opt_sched.h GEOM_AES opt_geom.h GEOM_BDE opt_geom.h GEOM_BSD opt_geom.h GEOM_CACHE opt_geom.h GEOM_CONCAT opt_geom.h GEOM_ELI opt_geom.h GEOM_FOX opt_geom.h GEOM_GATE opt_geom.h GEOM_JOURNAL opt_geom.h GEOM_LABEL opt_geom.h GEOM_LABEL_GPT opt_geom.h GEOM_LINUX_LVM opt_geom.h GEOM_MAP opt_geom.h GEOM_MBR opt_geom.h GEOM_MIRROR opt_geom.h GEOM_MOUNTVER opt_geom.h GEOM_MULTIPATH opt_geom.h GEOM_NOP opt_geom.h GEOM_PART_APM opt_geom.h GEOM_PART_BSD opt_geom.h GEOM_PART_BSD64 opt_geom.h GEOM_PART_EBR opt_geom.h GEOM_PART_EBR_COMPAT opt_geom.h GEOM_PART_GPT opt_geom.h GEOM_PART_LDM opt_geom.h GEOM_PART_MBR opt_geom.h GEOM_PART_PC98 opt_geom.h GEOM_PART_VTOC8 opt_geom.h GEOM_PC98 opt_geom.h GEOM_RAID opt_geom.h GEOM_RAID3 opt_geom.h GEOM_SHSEC opt_geom.h GEOM_STRIPE opt_geom.h GEOM_SUNLABEL opt_geom.h GEOM_UZIP opt_geom.h GEOM_UZIP_DEBUG opt_geom.h GEOM_VINUM opt_geom.h GEOM_VIRSTOR opt_geom.h GEOM_VOL opt_geom.h GEOM_ZERO opt_geom.h IFLIB opt_iflib.h KDTRACE_HOOKS opt_global.h KDTRACE_FRAME opt_kdtrace.h KN_HASHSIZE opt_kqueue.h KSTACK_MAX_PAGES KSTACK_PAGES KSTACK_USAGE_PROF KTRACE KTRACE_REQUEST_POOL opt_ktrace.h LIBICONV MAC opt_global.h MAC_BIBA opt_dontuse.h MAC_BSDEXTENDED opt_dontuse.h 
MAC_IFOFF opt_dontuse.h MAC_LOMAC opt_dontuse.h MAC_MLS opt_dontuse.h MAC_NONE opt_dontuse.h MAC_PARTITION opt_dontuse.h MAC_PORTACL opt_dontuse.h MAC_SEEOTHERUIDS opt_dontuse.h MAC_STATIC opt_mac.h MAC_STUB opt_dontuse.h MAC_TEST opt_dontuse.h MD_ROOT opt_md.h MD_ROOT_FSTYPE opt_md.h MD_ROOT_SIZE opt_md.h MFI_DEBUG opt_mfi.h MFI_DECODE_LOG opt_mfi.h MPROF_BUFFERS opt_mprof.h MPROF_HASH_SIZE opt_mprof.h NEW_PCIB opt_global.h NO_ADAPTIVE_MUTEXES opt_adaptive_mutexes.h NO_ADAPTIVE_RWLOCKS NO_ADAPTIVE_SX NO_EVENTTIMERS opt_timer.h NO_SYSCTL_DESCR opt_global.h NSWBUF_MIN opt_swap.h MBUF_PACKET_ZONE_DISABLE opt_global.h PANIC_REBOOT_WAIT_TIME opt_panic.h PCI_HP opt_pci.h PCI_IOV opt_global.h PPC_DEBUG opt_ppc.h PPC_PROBE_CHIPSET opt_ppc.h PPS_SYNC opt_ntp.h PREEMPTION opt_sched.h QUOTA SCHED_4BSD opt_sched.h SCHED_STATS opt_sched.h SCHED_ULE opt_sched.h SLEEPQUEUE_PROFILING SLHCI_DEBUG opt_slhci.h SPX_HACK STACK opt_stack.h SUIDDIR MSGMNB opt_sysvipc.h MSGMNI opt_sysvipc.h MSGSEG opt_sysvipc.h MSGSSZ opt_sysvipc.h MSGTQL opt_sysvipc.h SEMMNI opt_sysvipc.h SEMMNS opt_sysvipc.h SEMMNU opt_sysvipc.h SEMMSL opt_sysvipc.h SEMOPM opt_sysvipc.h SEMUME opt_sysvipc.h SHMALL opt_sysvipc.h SHMMAX opt_sysvipc.h SHMMAXPGS opt_sysvipc.h SHMMIN opt_sysvipc.h SHMMNI opt_sysvipc.h SHMSEG opt_sysvipc.h SYSVMSG opt_sysvipc.h SYSVSEM opt_sysvipc.h SYSVSHM opt_sysvipc.h SW_WATCHDOG opt_watchdog.h TURNSTILE_PROFILING UMTX_PROFILING VERBOSE_SYSINIT # POSIX kernel options P1003_1B_MQUEUE opt_posix.h P1003_1B_SEMAPHORES opt_posix.h _KPOSIX_PRIORITY_SCHEDULING opt_posix.h # Do we want the config file compiled into the kernel? INCLUDE_CONFIG_FILE opt_config.h # Options for static filesystems. These should only be used at config # time, since the corresponding lkms cannot work if there are any static # dependencies. Unusability is enforced by hiding the defines for the # options in a never-included header. 
AUTOFS opt_dontuse.h CD9660 opt_dontuse.h EXT2FS opt_dontuse.h FDESCFS opt_dontuse.h FFS opt_dontuse.h FUSE opt_dontuse.h MSDOSFS opt_dontuse.h NANDFS opt_dontuse.h NULLFS opt_dontuse.h PROCFS opt_dontuse.h PSEUDOFS opt_dontuse.h SMBFS opt_dontuse.h TMPFS opt_dontuse.h UDF opt_dontuse.h UNIONFS opt_dontuse.h ZFS opt_dontuse.h # Pseudofs debugging PSEUDOFS_TRACE opt_pseudofs.h # In-kernel GSS-API KGSSAPI opt_kgssapi.h KGSSAPI_DEBUG opt_kgssapi.h # These static filesystems have one slightly bogus static dependency in # sys/i386/i386/autoconf.c. If any of these filesystems are # statically compiled into the kernel, code for mounting them as root # filesystems will be enabled - but look below. # NFSCL - client # NFSD - server NFSCL opt_nfs.h NFSD opt_nfs.h # filesystems and libiconv bridge CD9660_ICONV opt_dontuse.h MSDOSFS_ICONV opt_dontuse.h UDF_ICONV opt_dontuse.h # If you are following the conditions in the copyright, # you can enable soft-updates which will speed up a lot of things # and make the system safer from crashes at the same time. # otherwise a STUB module will be compiled in. SOFTUPDATES opt_ffs.h # On small, embedded systems, it can be useful to turn off support for # snapshots. It saves about 30-40k for a feature that would be lightly # used, if it is used at all. NO_FFS_SNAPSHOT opt_ffs.h # Enabling this option turns on support for Access Control Lists in UFS, # which can be used to support high security configurations. Depends on # UFS_EXTATTR. UFS_ACL opt_ufs.h # Enabling this option turns on support for extended attributes in UFS-based # filesystems, which can be used to support high security configurations # as well as new filesystem features. UFS_EXTATTR opt_ufs.h UFS_EXTATTR_AUTOSTART opt_ufs.h # Enable fast hash lookups for large directories on UFS-based filesystems. UFS_DIRHASH opt_ufs.h # Enable gjournal-based UFS journal. UFS_GJOURNAL opt_ufs.h # The below sentence is not in English, and neither is this one.
# We plan to remove the static dependences above, with a # _ROOT option to control if it usable as root. This list # allows these options to be present in config files already (though # they won't make any difference yet). NFS_ROOT opt_nfsroot.h # SMB/CIFS requester NETSMB opt_netsmb.h # Options used only in subr_param.c. HZ opt_param.h MAXFILES opt_param.h NBUF opt_param.h NSFBUFS opt_param.h VM_BCACHE_SIZE_MAX opt_param.h VM_SWZONE_SIZE_MAX opt_param.h MAXUSERS DFLDSIZ opt_param.h MAXDSIZ opt_param.h MAXSSIZ opt_param.h # Generic SCSI options. CAM_MAX_HIGHPOWER opt_cam.h CAMDEBUG opt_cam.h CAM_DEBUG_COMPILE opt_cam.h CAM_DEBUG_DELAY opt_cam.h CAM_DEBUG_BUS opt_cam.h CAM_DEBUG_TARGET opt_cam.h CAM_DEBUG_LUN opt_cam.h CAM_DEBUG_FLAGS opt_cam.h CAM_BOOT_DELAY opt_cam.h CAM_IOSCHED_DYNAMIC opt_cam.h SCSI_DELAY opt_scsi.h SCSI_NO_SENSE_STRINGS opt_scsi.h SCSI_NO_OP_STRINGS opt_scsi.h # Options used only in cam/ata/ata_da.c ADA_TEST_FAILURE opt_ada.h ATA_STATIC_ID opt_ada.h # Options used only in cam/scsi/scsi_cd.c CHANGER_MIN_BUSY_SECONDS opt_cd.h CHANGER_MAX_BUSY_SECONDS opt_cd.h # Options used only in cam/scsi/scsi_sa.c. SA_IO_TIMEOUT opt_sa.h SA_SPACE_TIMEOUT opt_sa.h SA_REWIND_TIMEOUT opt_sa.h SA_ERASE_TIMEOUT opt_sa.h SA_1FM_AT_EOD opt_sa.h # Options used only in cam/scsi/scsi_pt.c SCSI_PT_DEFAULT_TIMEOUT opt_pt.h # Options used only in cam/scsi/scsi_ses.c SES_ENABLE_PASSTHROUGH opt_ses.h # Options used in dev/sym/ (Symbios SCSI driver). 
SYM_SETUP_LP_PROBE_MAP opt_sym.h #-Low Priority Probe Map (bits) # Allows the ncr to take precedence # 1 (1<<0) -> 810a, 860 # 2 (1<<1) -> 825a, 875, 885, 895 # 4 (1<<2) -> 895a, 896, 1510d SYM_SETUP_SCSI_DIFF opt_sym.h #-HVD support for 825a, 875, 885 # disabled:0 (default), enabled:1 SYM_SETUP_PCI_PARITY opt_sym.h #-PCI parity checking # disabled:0, enabled:1 (default) SYM_SETUP_MAX_LUN opt_sym.h #-Number of LUNs supported # default:8, range:[1..64] # Options used only in dev/ncr/* SCSI_NCR_DEBUG opt_ncr.h SCSI_NCR_MAX_SYNC opt_ncr.h SCSI_NCR_MAX_WIDE opt_ncr.h SCSI_NCR_MYADDR opt_ncr.h # Options used only in dev/isp/* ISP_TARGET_MODE opt_isp.h ISP_FW_CRASH_DUMP opt_isp.h ISP_DEFAULT_ROLES opt_isp.h ISP_INTERNAL_TARGET opt_isp.h # Options used only in dev/iscsi ISCSI_INITIATOR_DEBUG opt_iscsi_initiator.h # Net stuff. ACCEPT_FILTER_DATA ACCEPT_FILTER_DNS ACCEPT_FILTER_HTTP ALTQ opt_global.h ALTQ_CBQ opt_altq.h ALTQ_CDNR opt_altq.h ALTQ_CODEL opt_altq.h ALTQ_DEBUG opt_altq.h ALTQ_HFSC opt_altq.h ALTQ_FAIRQ opt_altq.h ALTQ_NOPCC opt_altq.h ALTQ_PRIQ opt_altq.h ALTQ_RED opt_altq.h ALTQ_RIO opt_altq.h BOOTP opt_bootp.h BOOTP_BLOCKSIZE opt_bootp.h BOOTP_COMPAT opt_bootp.h BOOTP_NFSROOT opt_bootp.h BOOTP_NFSV3 opt_bootp.h BOOTP_WIRED_TO opt_bootp.h DEVICE_POLLING DUMMYNET opt_ipdn.h INET opt_inet.h INET6 opt_inet6.h IPDIVERT IPFILTER opt_ipfilter.h IPFILTER_DEFAULT_BLOCK opt_ipfilter.h IPFILTER_LOG opt_ipfilter.h IPFILTER_LOOKUP opt_ipfilter.h IPFIREWALL opt_ipfw.h IPFIREWALL_DEFAULT_TO_ACCEPT opt_ipfw.h IPFIREWALL_NAT opt_ipfw.h IPFIREWALL_NAT64 opt_ipfw.h IPFIREWALL_NAT64_DIRECT_OUTPUT opt_ipfw.h IPFIREWALL_NPTV6 opt_ipfw.h IPFIREWALL_VERBOSE opt_ipfw.h IPFIREWALL_VERBOSE_LIMIT opt_ipfw.h IPSEC opt_ipsec.h IPSEC_DEBUG opt_ipsec.h IPSEC_NAT_T opt_ipsec.h IPSTEALTH KRPC LIBALIAS LIBMBPOOL LIBMCHAIN MBUF_PROFILING MBUF_STRESS_TEST MROUTING opt_mrouting.h NFSLOCKD PCBGROUP opt_pcbgroup.h PF_DEFAULT_TO_DROP opt_pf.h RADIX_MPATH opt_mpath.h ROUTETABLES opt_route.h RSS 
opt_rss.h SLIP_IFF_OPTS opt_slip.h TCPDEBUG TCPPCAP opt_global.h SIFTR TCP_HHOOK opt_inet.h TCP_OFFLOAD opt_inet.h # Enable code to dispatch TCP offloading TCP_RFC7413 opt_inet.h TCP_RFC7413_MAX_KEYS opt_inet.h TCP_SIGNATURE opt_inet.h VLAN_ARRAY opt_vlan.h XBONEHACK FLOWTABLE opt_route.h FLOWTABLE_HASH_ALL opt_route.h # # SCTP # SCTP opt_sctp.h SCTP_DEBUG opt_sctp.h # Enable debug printfs SCTP_WITH_NO_CSUM opt_sctp.h # Use this at your peril SCTP_LOCK_LOGGING opt_sctp.h # Log to KTR lock activity SCTP_MBUF_LOGGING opt_sctp.h # Log to KTR general mbuf aloc/free SCTP_MBCNT_LOGGING opt_sctp.h # Log to KTR mbcnt activity SCTP_PACKET_LOGGING opt_sctp.h # Log to a packet buffer last N packets SCTP_LTRACE_CHUNKS opt_sctp.h # Log to KTR chunks processed SCTP_LTRACE_ERRORS opt_sctp.h # Log to KTR error returns. SCTP_USE_PERCPU_STAT opt_sctp.h # Use per cpu stats. SCTP_MCORE_INPUT opt_sctp.h # Have multiple input threads for input mbufs SCTP_LOCAL_TRACE_BUF opt_sctp.h # Use tracebuffer exported via sysctl SCTP_DETAILED_STR_STATS opt_sctp.h # Use per PR-SCTP policy stream stats # # # # Netgraph(4). Use option NETGRAPH to enable the base netgraph code. # Each netgraph node type can be either be compiled into the kernel # or loaded dynamically. To get the former, include the corresponding # option below. Each type has its own man page, e.g. ng_async(4). 
NETGRAPH NETGRAPH_DEBUG opt_netgraph.h NETGRAPH_ASYNC opt_netgraph.h NETGRAPH_ATMLLC opt_netgraph.h NETGRAPH_ATM_ATMPIF opt_netgraph.h NETGRAPH_BLUETOOTH opt_netgraph.h NETGRAPH_BLUETOOTH_BT3C opt_netgraph.h NETGRAPH_BLUETOOTH_H4 opt_netgraph.h NETGRAPH_BLUETOOTH_HCI opt_netgraph.h NETGRAPH_BLUETOOTH_L2CAP opt_netgraph.h NETGRAPH_BLUETOOTH_SOCKET opt_netgraph.h NETGRAPH_BLUETOOTH_UBT opt_netgraph.h NETGRAPH_BLUETOOTH_UBTBCMFW opt_netgraph.h NETGRAPH_BPF opt_netgraph.h NETGRAPH_BRIDGE opt_netgraph.h NETGRAPH_CAR opt_netgraph.h NETGRAPH_CISCO opt_netgraph.h NETGRAPH_DEFLATE opt_netgraph.h NETGRAPH_DEVICE opt_netgraph.h NETGRAPH_ECHO opt_netgraph.h NETGRAPH_EIFACE opt_netgraph.h NETGRAPH_ETHER opt_netgraph.h NETGRAPH_ETHER_ECHO opt_netgraph.h NETGRAPH_FEC opt_netgraph.h NETGRAPH_FRAME_RELAY opt_netgraph.h NETGRAPH_GIF opt_netgraph.h NETGRAPH_GIF_DEMUX opt_netgraph.h NETGRAPH_HOLE opt_netgraph.h NETGRAPH_IFACE opt_netgraph.h NETGRAPH_IP_INPUT opt_netgraph.h NETGRAPH_IPFW opt_netgraph.h NETGRAPH_KSOCKET opt_netgraph.h NETGRAPH_L2TP opt_netgraph.h NETGRAPH_LMI opt_netgraph.h # MPPC compression requires proprietary files (not included) NETGRAPH_MPPC_COMPRESSION opt_netgraph.h NETGRAPH_MPPC_ENCRYPTION opt_netgraph.h NETGRAPH_NAT opt_netgraph.h NETGRAPH_NETFLOW opt_netgraph.h NETGRAPH_ONE2MANY opt_netgraph.h NETGRAPH_PATCH opt_netgraph.h NETGRAPH_PIPE opt_netgraph.h NETGRAPH_PPP opt_netgraph.h NETGRAPH_PPPOE opt_netgraph.h NETGRAPH_PPTPGRE opt_netgraph.h NETGRAPH_PRED1 opt_netgraph.h NETGRAPH_RFC1490 opt_netgraph.h NETGRAPH_SOCKET opt_netgraph.h NETGRAPH_SPLIT opt_netgraph.h NETGRAPH_SPPP opt_netgraph.h NETGRAPH_TAG opt_netgraph.h NETGRAPH_TCPMSS opt_netgraph.h NETGRAPH_TEE opt_netgraph.h NETGRAPH_TTY opt_netgraph.h NETGRAPH_UI opt_netgraph.h NETGRAPH_VJC opt_netgraph.h NETGRAPH_VLAN opt_netgraph.h # NgATM options NGATM_ATM opt_netgraph.h NGATM_ATMBASE opt_netgraph.h NGATM_SSCOP opt_netgraph.h NGATM_SSCFU opt_netgraph.h NGATM_UNI opt_netgraph.h NGATM_CCATM opt_netgraph.h # 
DRM options DRM_DEBUG opt_drm.h TI_SF_BUF_JUMBO opt_ti.h TI_JUMBO_HDRSPLIT opt_ti.h # XXX Conflict: # of devices vs network protocol (Native ATM). # This makes "atm.h" unusable. NATM # DPT driver debug flags DPT_MEASURE_PERFORMANCE opt_dpt.h DPT_RESET_HBA opt_dpt.h # Misc debug flags. Most of these should probably be replaced with # 'DEBUG', and then let people recompile just the interesting modules # with 'make CC="cc -DDEBUG"'. CLUSTERDEBUG opt_debug_cluster.h DEBUG_1284 opt_ppb_1284.h VP0_DEBUG opt_vpo.h LPT_DEBUG opt_lpt.h PLIP_DEBUG opt_plip.h LOCKF_DEBUG opt_debug_lockf.h SI_DEBUG opt_debug_si.h IFMEDIA_DEBUG opt_ifmedia.h # Fb options FB_DEBUG opt_fb.h FB_INSTALL_CDEV opt_fb.h # ppbus related options PERIPH_1284 opt_ppb_1284.h DONTPROBE_1284 opt_ppb_1284.h # smbus related options ENABLE_ALART opt_intpm.h # These cause changes all over the kernel BLKDEV_IOSIZE opt_global.h BURN_BRIDGES opt_global.h DEBUG opt_global.h DEBUG_LOCKS opt_global.h DEBUG_VFS_LOCKS opt_global.h DFLTPHYS opt_global.h DIAGNOSTIC opt_global.h INVARIANT_SUPPORT opt_global.h INVARIANTS opt_global.h MAXCPU opt_global.h MAXMEMDOM opt_global.h MAXPHYS opt_global.h MCLSHIFT opt_global.h MUTEX_NOINLINE opt_global.h LOCK_PROFILING opt_global.h LOCK_PROFILING_FAST opt_global.h MSIZE opt_global.h REGRESSION opt_global.h RWLOCK_NOINLINE opt_global.h SX_NOINLINE opt_global.h VFS_BIO_DEBUG opt_global.h # These are VM related options VM_KMEM_SIZE opt_vm.h VM_KMEM_SIZE_SCALE opt_vm.h VM_KMEM_SIZE_MAX opt_vm.h VM_NRESERVLEVEL opt_vm.h VM_NUMA_ALLOC opt_vm.h VM_LEVEL_0_ORDER opt_vm.h NO_SWAPPING opt_vm.h MALLOC_MAKE_FAILURES opt_vm.h MALLOC_PROFILE opt_vm.h MALLOC_DEBUG_MAXZONES opt_vm.h # The MemGuard replacement allocator used for tamper-after-free detection DEBUG_MEMGUARD opt_vm.h # The RedZone malloc(9) protection DEBUG_REDZONE opt_vm.h # Standard SMP options EARLY_AP_STARTUP opt_global.h SMP opt_global.h # Size of the kernel message buffer MSGBUF_SIZE opt_msgbuf.h # NFS options NFS_MINATTRTIMO 
opt_nfs.h NFS_MAXATTRTIMO opt_nfs.h NFS_MINDIRATTRTIMO opt_nfs.h NFS_MAXDIRATTRTIMO opt_nfs.h NFS_DEBUG opt_nfs.h # For the Bt848/Bt848A/Bt849/Bt878/Bt879 driver OVERRIDE_CARD opt_bktr.h OVERRIDE_TUNER opt_bktr.h OVERRIDE_DBX opt_bktr.h OVERRIDE_MSP opt_bktr.h BROOKTREE_SYSTEM_DEFAULT opt_bktr.h BROOKTREE_ALLOC_PAGES opt_bktr.h BKTR_OVERRIDE_CARD opt_bktr.h BKTR_OVERRIDE_TUNER opt_bktr.h BKTR_OVERRIDE_DBX opt_bktr.h BKTR_OVERRIDE_MSP opt_bktr.h BKTR_SYSTEM_DEFAULT opt_bktr.h BKTR_ALLOC_PAGES opt_bktr.h BKTR_USE_PLL opt_bktr.h BKTR_GPIO_ACCESS opt_bktr.h BKTR_NO_MSP_RESET opt_bktr.h BKTR_430_FX_MODE opt_bktr.h BKTR_SIS_VIA_MODE opt_bktr.h BKTR_USE_FREEBSD_SMBUS opt_bktr.h BKTR_NEW_MSP34XX_DRIVER opt_bktr.h # Options for uart(4) UART_PPS_ON_CTS opt_uart.h UART_POLL_FREQ opt_uart.h UART_DEV_TOLERANCE_PCT opt_uart.h # options for bus/device framework BUS_DEBUG opt_bus.h # options for USB support USB_DEBUG opt_usb.h USB_HOST_ALIGN opt_usb.h USB_REQ_DEBUG opt_usb.h USB_TEMPLATE opt_usb.h USB_VERBOSE opt_usb.h USB_DMA_SINGLE_ALLOC opt_usb.h USB_EHCI_BIG_ENDIAN_DESC opt_usb.h U3G_DEBUG opt_u3g.h UKBD_DFLT_KEYMAP opt_ukbd.h UPLCOM_INTR_INTERVAL opt_uplcom.h UVSCOM_DEFAULT_OPKTSIZE opt_uvscom.h UVSCOM_INTR_INTERVAL opt_uvscom.h # options for the Realtek rtwn driver RTWN_DEBUG opt_rtwn.h RTWN_WITHOUT_UCODE opt_rtwn.h # Embedded system options INIT_PATH ROOTDEVNAME FDC_DEBUG opt_fdc.h PCFCLOCK_VERBOSE opt_pcfclock.h PCFCLOCK_MAX_RETRIES opt_pcfclock.h KTR opt_global.h KTR_ALQ opt_ktr.h KTR_MASK opt_ktr.h KTR_CPUMASK opt_ktr.h KTR_COMPILE opt_global.h KTR_BOOT_ENTRIES opt_global.h KTR_ENTRIES opt_global.h KTR_VERBOSE opt_ktr.h WITNESS opt_global.h WITNESS_KDB opt_witness.h WITNESS_NO_VNODE opt_witness.h WITNESS_SKIPSPIN opt_witness.h WITNESS_COUNT opt_witness.h OPENSOLARIS_WITNESS opt_global.h # options for ACPI support ACPI_DEBUG opt_acpi.h ACPI_MAX_TASKS opt_acpi.h ACPI_MAX_THREADS opt_acpi.h ACPI_DMAR opt_acpi.h DEV_ACPI opt_acpi.h # ISA support DEV_ISA opt_isa.h ISAPNP 
opt_isa.h # various 'device presence' options. DEV_BPF opt_bpf.h DEV_CARP opt_carp.h DEV_MCA opt_mca.h DEV_NETMAP opt_global.h DEV_PCI opt_pci.h DEV_PF opt_pf.h DEV_PFLOG opt_pf.h DEV_PFSYNC opt_pf.h DEV_RANDOM opt_global.h DEV_SPLASH opt_splash.h DEV_VLAN opt_vlan.h # EISA support DEV_EISA opt_eisa.h EISA_SLOTS opt_eisa.h # ed driver ED_HPP opt_ed.h ED_3C503 opt_ed.h ED_SIC opt_ed.h # bce driver BCE_DEBUG opt_bce.h BCE_NVRAM_WRITE_SUPPORT opt_bce.h SOCKBUF_DEBUG opt_global.h # options for ubsec driver UBSEC_DEBUG opt_ubsec.h UBSEC_RNDTEST opt_ubsec.h UBSEC_NO_RNG opt_ubsec.h # options for hifn driver HIFN_DEBUG opt_hifn.h HIFN_RNDTEST opt_hifn.h # options for safenet driver SAFE_DEBUG opt_safe.h SAFE_NO_RNG opt_safe.h SAFE_RNDTEST opt_safe.h # syscons/vt options MAXCONS opt_syscons.h SC_ALT_MOUSE_IMAGE opt_syscons.h SC_CUT_SPACES2TABS opt_syscons.h SC_CUT_SEPCHARS opt_syscons.h SC_DEBUG_LEVEL opt_syscons.h SC_DFLT_FONT opt_syscons.h SC_DISABLE_KDBKEY opt_syscons.h SC_DISABLE_REBOOT opt_syscons.h SC_HISTORY_SIZE opt_syscons.h SC_KERNEL_CONS_ATTR opt_syscons.h SC_KERNEL_CONS_REV_ATTR opt_syscons.h SC_MOUSE_CHAR opt_syscons.h SC_NO_CUTPASTE opt_syscons.h SC_NO_FONT_LOADING opt_syscons.h SC_NO_HISTORY opt_syscons.h SC_NO_MODE_CHANGE opt_syscons.h SC_NO_SUSPEND_VTYSWITCH opt_syscons.h SC_NO_SYSMOUSE opt_syscons.h SC_NORM_ATTR opt_syscons.h SC_NORM_REV_ATTR opt_syscons.h SC_PIXEL_MODE opt_syscons.h SC_RENDER_DEBUG opt_syscons.h SC_TWOBUTTON_MOUSE opt_syscons.h VT_ALT_TO_ESC_HACK opt_syscons.h VT_FB_DEFAULT_WIDTH opt_syscons.h VT_FB_DEFAULT_HEIGHT opt_syscons.h VT_MAXWINDOWS opt_syscons.h VT_TWOBUTTON_MOUSE opt_syscons.h DEV_SC opt_syscons.h DEV_VT opt_syscons.h # teken terminal emulator options TEKEN_CONS25 opt_teken.h TEKEN_UTF8 opt_teken.h TERMINAL_KERN_ATTR opt_teken.h TERMINAL_NORM_ATTR opt_teken.h # options for printf PRINTF_BUFR_SIZE opt_printf.h # kbd options KBD_DISABLE_KEYMAP_LOAD opt_kbd.h KBD_INSTALL_CDEV opt_kbd.h KBD_MAXRETRY opt_kbd.h KBD_MAXWAIT opt_kbd.h 
KBD_RESETDELAY opt_kbd.h KBDIO_DEBUG opt_kbd.h KBDMUX_DFLT_KEYMAP opt_kbdmux.h # options for the Atheros driver ATH_DEBUG opt_ath.h ATH_TXBUF opt_ath.h ATH_RXBUF opt_ath.h ATH_DIAGAPI opt_ath.h ATH_TX99_DIAG opt_ath.h ATH_ENABLE_11N opt_ath.h ATH_ENABLE_DFS opt_ath.h ATH_EEPROM_FIRMWARE opt_ath.h ATH_ENABLE_RADIOTAP_VENDOR_EXT opt_ath.h ATH_DEBUG_ALQ opt_ath.h ATH_KTR_INTR_DEBUG opt_ath.h # options for the Atheros hal AH_SUPPORT_AR5416 opt_ah.h # XXX For now, this breaks non-AR9130 chipsets, so only use it # XXX when actually targeting AR9130. AH_SUPPORT_AR9130 opt_ah.h # This is required for AR933x SoC support AH_SUPPORT_AR9330 opt_ah.h AH_SUPPORT_AR9340 opt_ah.h AH_SUPPORT_QCA9530 opt_ah.h AH_SUPPORT_QCA9550 opt_ah.h AH_DEBUG opt_ah.h AH_ASSERT opt_ah.h AH_DEBUG_ALQ opt_ah.h AH_REGOPS_FUNC opt_ah.h AH_WRITE_REGDOMAIN opt_ah.h AH_DEBUG_COUNTRY opt_ah.h AH_WRITE_EEPROM opt_ah.h AH_PRIVATE_DIAG opt_ah.h AH_NEED_DESC_SWAP opt_ah.h AH_USE_INIPDGAIN opt_ah.h AH_MAXCHAN opt_ah.h AH_RXCFG_SDMAMW_4BYTES opt_ah.h AH_INTERRUPT_DEBUGGING opt_ah.h # AR5416 and later interrupt mitigation # XXX do not use this for AR9130 AH_AR5416_INTERRUPT_MITIGATION opt_ah.h # options for the Broadcom BCM43xx driver (bwi) BWI_DEBUG opt_bwi.h BWI_DEBUG_VERBOSE opt_bwi.h # options for the Brodacom BCM43xx driver (bwn) BWN_DEBUG opt_bwn.h BWN_GPL_PHY opt_bwn.h # Options for the SIBA driver SIBA_DEBUG opt_siba.h # options for the Marvell 8335 wireless driver MALO_DEBUG opt_malo.h MALO_TXBUF opt_malo.h MALO_RXBUF opt_malo.h # options for the Marvell wireless driver MWL_DEBUG opt_mwl.h MWL_TXBUF opt_mwl.h MWL_RXBUF opt_mwl.h MWL_DIAGAPI opt_mwl.h MWL_AGGR_SIZE opt_mwl.h MWL_TX_NODROP opt_mwl.h # Options for the Intel 802.11ac wireless driver IWM_DEBUG opt_iwm.h # Options for the Intel 802.11n wireless driver IWN_DEBUG opt_iwn.h # Options for the Intel 3945ABG wireless driver WPI_DEBUG opt_wpi.h # dcons options DCONS_BUF_SIZE opt_dcons.h DCONS_POLL_HZ opt_dcons.h DCONS_FORCE_CONSOLE opt_dcons.h 
DCONS_FORCE_GDB opt_dcons.h # HWPMC options HWPMC_DEBUG opt_global.h HWPMC_HOOKS HWPMC_MIPS_BACKTRACE opt_hwpmc_hooks.h # XBOX options for FreeBSD/i386, but some files are MI XBOX opt_xbox.h # Interrupt filtering INTR_FILTER # 802.11 support layer IEEE80211_DEBUG opt_wlan.h IEEE80211_DEBUG_REFCNT opt_wlan.h IEEE80211_AMPDU_AGE opt_wlan.h IEEE80211_SUPPORT_MESH opt_wlan.h IEEE80211_SUPPORT_SUPERG opt_wlan.h IEEE80211_SUPPORT_TDMA opt_wlan.h IEEE80211_ALQ opt_wlan.h IEEE80211_DFS_DEBUG opt_wlan.h # 802.11 TDMA support TDMA_SLOTLEN_DEFAULT opt_tdma.h TDMA_SLOTCNT_DEFAULT opt_tdma.h TDMA_BINTVAL_DEFAULT opt_tdma.h TDMA_TXRATE_11B_DEFAULT opt_tdma.h TDMA_TXRATE_11G_DEFAULT opt_tdma.h TDMA_TXRATE_11A_DEFAULT opt_tdma.h TDMA_TXRATE_TURBO_DEFAULT opt_tdma.h TDMA_TXRATE_HALF_DEFAULT opt_tdma.h TDMA_TXRATE_QUARTER_DEFAULT opt_tdma.h TDMA_TXRATE_11NA_DEFAULT opt_tdma.h TDMA_TXRATE_11NG_DEFAULT opt_tdma.h # VideoMode PICKMODE_DEBUG opt_videomode.h # Network stack virtualization options VIMAGE opt_global.h VNET_DEBUG opt_global.h # Common Flash Interface (CFI) options CFI_SUPPORT_STRATAFLASH opt_cfi.h CFI_ARMEDANDDANGEROUS opt_cfi.h CFI_HARDWAREBYTESWAP opt_cfi.h # Sound options SND_DEBUG opt_snd.h SND_DIAGNOSTIC opt_snd.h SND_FEEDER_MULTIFORMAT opt_snd.h SND_FEEDER_FULL_MULTIFORMAT opt_snd.h SND_FEEDER_RATE_HP opt_snd.h SND_PCM_64 opt_snd.h SND_OLDSTEREO opt_snd.h X86BIOS # Flattened device tree options FDT opt_platform.h FDT_DTB_STATIC opt_platform.h # OFED Infiniband stack OFED opt_ofed.h OFED_DEBUG_INIT opt_ofed.h SDP opt_ofed.h SDP_DEBUG opt_ofed.h IPOIB opt_ofed.h IPOIB_DEBUG opt_ofed.h IPOIB_CM opt_ofed.h # Resource Accounting RACCT opt_global.h RACCT_DEFAULT_TO_DISABLED opt_global.h # Resource Limits RCTL opt_global.h # Random number generator(s) # Which CSPRNG hash we get. # If Yarrow is not chosen, Fortuna is selected. RANDOM_YARROW opt_global.h # With this, no entropy processor is loaded, but the entropy # harvesting infrastructure is present. 
This means an entropy # processor may be loaded as a module. RANDOM_LOADABLE opt_global.h # This turns on high-rate and potentially expensive harvesting in # the uma slab allocator. RANDOM_ENABLE_UMA opt_global.h # Intel em(4) driver EM_MULTIQUEUE opt_em.h # BHND(4) driver BHND_LOGLEVEL opt_global.h # GPIO and child devices GPIO_SPI_DEBUG opt_gpio.h # evdev protocol support EVDEV_SUPPORT opt_evdev.h EVDEV_DEBUG opt_evdev.h UINPUT_DEBUG opt_evdev.h Index: head/sys/dev/mps/mps_sas.c =================================================================== --- head/sys/dev/mps/mps_sas.c (revision 308154) +++ head/sys/dev/mps/mps_sas.c (revision 308155) @@ -1,3712 +1,3721 @@ /*- * Copyright (c) 2009 Yahoo! Inc. * Copyright (c) 2011-2015 LSI Corp. * Copyright (c) 2013-2015 Avago Technologies * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Avago Technologies (LSI) MPT-Fusion Host Adapter FreeBSD * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); /* Communications core for Avago Technologies (LSI) MPT2 */ /* TODO Move headers to mpsvar */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if __FreeBSD_version >= 900026 #include #endif #include #include #include #include #include #include #include #include #include #include #include #define MPSSAS_DISCOVERY_TIMEOUT 20 #define MPSSAS_MAX_DISCOVERY_TIMEOUTS 10 /* 200 seconds */ /* * static array to check SCSI OpCode for EEDP protection bits */ #define PRO_R MPI2_SCSIIO_EEDPFLAGS_CHECK_REMOVE_OP #define PRO_W MPI2_SCSIIO_EEDPFLAGS_INSERT_OP #define PRO_V MPI2_SCSIIO_EEDPFLAGS_INSERT_OP static uint8_t op_code_prot[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, PRO_R, 0, PRO_W, 0, 0, 0, PRO_W, PRO_V, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, PRO_W, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, PRO_R, 0, PRO_W, 0, 0, 0, PRO_W, PRO_V, 0, 0, 0, PRO_W, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, PRO_R, 0, PRO_W, 0, 0, 0, PRO_W, PRO_V, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; MALLOC_DEFINE(M_MPSSAS, "MPSSAS", "MPS SAS memory"); static void mpssas_remove_device(struct mps_softc *, struct mps_command *); static void mpssas_remove_complete(struct mps_softc *, struct mps_command *); static void mpssas_action(struct cam_sim *sim, union ccb *ccb); static void mpssas_poll(struct cam_sim *sim); static int mpssas_send_abort(struct mps_softc *sc, struct mps_command *tm, struct mps_command *cm); static void mpssas_scsiio_timeout(void *data); static void mpssas_abort_complete(struct mps_softc *sc, struct mps_command *cm); static void mpssas_direct_drive_io(struct mpssas_softc *sassc, struct mps_command *cm, union ccb *ccb); static void mpssas_action_scsiio(struct mpssas_softc *, union ccb *); static void mpssas_scsiio_complete(struct mps_softc *, struct mps_command *); static void mpssas_action_resetdev(struct mpssas_softc *, union ccb *); #if __FreeBSD_version >= 900026 static void mpssas_smpio_complete(struct mps_softc *sc, struct mps_command *cm); static void mpssas_send_smpcmd(struct mpssas_softc *sassc, union ccb *ccb, uint64_t sasaddr); static void mpssas_action_smpio(struct mpssas_softc *sassc, union ccb *ccb); #endif //FreeBSD_version >= 900026 static void mpssas_resetdev_complete(struct mps_softc *, struct mps_command *); static void mpssas_async(void *callback_arg, uint32_t code, struct cam_path *path, void *arg); #if (__FreeBSD_version < 901503) || \ ((__FreeBSD_version >= 1000000) && (__FreeBSD_version < 1000006)) static void mpssas_check_eedp(struct mps_softc *sc, struct cam_path *path, struct ccb_getdev *cgd); static void mpssas_read_cap_done(struct cam_periph *periph, union ccb *done_ccb); #endif static int 
mpssas_send_portenable(struct mps_softc *sc); static void mpssas_portenable_complete(struct mps_softc *sc, struct mps_command *cm); struct mpssas_target * mpssas_find_target_by_handle(struct mpssas_softc *sassc, int start, uint16_t handle) { struct mpssas_target *target; int i; for (i = start; i < sassc->maxtargets; i++) { target = &sassc->targets[i]; if (target->handle == handle) return (target); } return (NULL); } /* we need to freeze the simq during attach and diag reset, to avoid failing * commands before device handles have been found by discovery. Since * discovery involves reading config pages and possibly sending commands, * discovery actions may continue even after we receive the end of discovery * event, so refcount discovery actions instead of assuming we can unfreeze * the simq when we get the event. */ void mpssas_startup_increment(struct mpssas_softc *sassc) { MPS_FUNCTRACE(sassc->sc); if ((sassc->flags & MPSSAS_IN_STARTUP) != 0) { if (sassc->startup_refcount++ == 0) { /* just starting, freeze the simq */ mps_dprint(sassc->sc, MPS_INIT, "%s freezing simq\n", __func__); #if __FreeBSD_version >= 1000039 xpt_hold_boot(); #endif xpt_freeze_simq(sassc->sim, 1); } mps_dprint(sassc->sc, MPS_INIT, "%s refcount %u\n", __func__, sassc->startup_refcount); } } void mpssas_release_simq_reinit(struct mpssas_softc *sassc) { if (sassc->flags & MPSSAS_QUEUE_FROZEN) { sassc->flags &= ~MPSSAS_QUEUE_FROZEN; xpt_release_simq(sassc->sim, 1); mps_dprint(sassc->sc, MPS_INFO, "Unfreezing SIM queue\n"); } } void mpssas_startup_decrement(struct mpssas_softc *sassc) { MPS_FUNCTRACE(sassc->sc); if ((sassc->flags & MPSSAS_IN_STARTUP) != 0) { if (--sassc->startup_refcount == 0) { /* finished all discovery-related actions, release * the simq and rescan for the latest topology. 
*/ mps_dprint(sassc->sc, MPS_INIT, "%s releasing simq\n", __func__); sassc->flags &= ~MPSSAS_IN_STARTUP; xpt_release_simq(sassc->sim, 1); #if __FreeBSD_version >= 1000039 xpt_release_boot(); #else mpssas_rescan_target(sassc->sc, NULL); #endif } mps_dprint(sassc->sc, MPS_INIT, "%s refcount %u\n", __func__, sassc->startup_refcount); } } /* The firmware requires us to stop sending commands when we're doing task * management, so refcount the TMs and keep the simq frozen when any are in * use. */ struct mps_command * mpssas_alloc_tm(struct mps_softc *sc) { struct mps_command *tm; tm = mps_alloc_high_priority_command(sc); return tm; } void mpssas_free_tm(struct mps_softc *sc, struct mps_command *tm) { int target_id = 0xFFFFFFFF; if (tm == NULL) return; /* * For TM's the devq is frozen for the device. Unfreeze it here and * free the resources used for freezing the devq. Must clear the * INRESET flag as well or scsi I/O will not work. */ if (tm->cm_targ != NULL) { tm->cm_targ->flags &= ~MPSSAS_TARGET_INRESET; target_id = tm->cm_targ->tid; } if (tm->cm_ccb) { mps_dprint(sc, MPS_INFO, "Unfreezing devq for target ID %d\n", target_id); xpt_release_devq(tm->cm_ccb->ccb_h.path, 1, TRUE); xpt_free_path(tm->cm_ccb->ccb_h.path); xpt_free_ccb(tm->cm_ccb); } mps_free_high_priority_command(sc, tm); } void mpssas_rescan_target(struct mps_softc *sc, struct mpssas_target *targ) { struct mpssas_softc *sassc = sc->sassc; path_id_t pathid; target_id_t targetid; union ccb *ccb; MPS_FUNCTRACE(sc); pathid = cam_sim_path(sassc->sim); if (targ == NULL) targetid = CAM_TARGET_WILDCARD; else targetid = targ - sassc->targets; /* * Allocate a CCB and schedule a rescan. 
*/ ccb = xpt_alloc_ccb_nowait(); if (ccb == NULL) { mps_dprint(sc, MPS_ERROR, "unable to alloc CCB for rescan\n"); return; } if (xpt_create_path(&ccb->ccb_h.path, NULL, pathid, targetid, CAM_LUN_WILDCARD) != CAM_REQ_CMP) { mps_dprint(sc, MPS_ERROR, "unable to create path for rescan\n"); xpt_free_ccb(ccb); return; } if (targetid == CAM_TARGET_WILDCARD) ccb->ccb_h.func_code = XPT_SCAN_BUS; else ccb->ccb_h.func_code = XPT_SCAN_TGT; mps_dprint(sc, MPS_TRACE, "%s targetid %u\n", __func__, targetid); xpt_rescan(ccb); } static void mpssas_log_command(struct mps_command *cm, u_int level, const char *fmt, ...) { struct sbuf sb; va_list ap; char str[192]; char path_str[64]; if (cm == NULL) return; /* No need to be in here if debugging isn't enabled */ if ((cm->cm_sc->mps_debug & level) == 0) return; sbuf_new(&sb, str, sizeof(str), 0); va_start(ap, fmt); if (cm->cm_ccb != NULL) { xpt_path_string(cm->cm_ccb->csio.ccb_h.path, path_str, sizeof(path_str)); sbuf_cat(&sb, path_str); if (cm->cm_ccb->ccb_h.func_code == XPT_SCSI_IO) { scsi_command_string(&cm->cm_ccb->csio, &sb); sbuf_printf(&sb, "length %d ", cm->cm_ccb->csio.dxfer_len); } } else { sbuf_printf(&sb, "(noperiph:%s%d:%u:%u:%u): ", cam_sim_name(cm->cm_sc->sassc->sim), cam_sim_unit(cm->cm_sc->sassc->sim), cam_sim_bus(cm->cm_sc->sassc->sim), cm->cm_targ ? cm->cm_targ->tid : 0xFFFFFFFF, cm->cm_lun); } sbuf_printf(&sb, "SMID %u ", cm->cm_desc.Default.SMID); sbuf_vprintf(&sb, fmt, ap); sbuf_finish(&sb); mps_dprint_field(cm->cm_sc, level, "%s", sbuf_data(&sb)); va_end(ap); } static void mpssas_remove_volume(struct mps_softc *sc, struct mps_command *tm) { MPI2_SCSI_TASK_MANAGE_REPLY *reply; struct mpssas_target *targ; uint16_t handle; MPS_FUNCTRACE(sc); reply = (MPI2_SCSI_TASK_MANAGE_REPLY *)tm->cm_reply; handle = (uint16_t)(uintptr_t)tm->cm_complete_data; targ = tm->cm_targ; if (reply == NULL) { /* XXX retry the remove after the diag reset completes? 
*/ mps_dprint(sc, MPS_FAULT, "%s NULL reply resetting device 0x%04x\n", __func__, handle); mpssas_free_tm(sc, tm); return; } if ((le16toh(reply->IOCStatus) & MPI2_IOCSTATUS_MASK) != MPI2_IOCSTATUS_SUCCESS) { mps_dprint(sc, MPS_ERROR, "IOCStatus = 0x%x while resetting device 0x%x\n", le16toh(reply->IOCStatus), handle); } mps_dprint(sc, MPS_XINFO, "Reset aborted %u commands\n", reply->TerminationCount); mps_free_reply(sc, tm->cm_reply_data); tm->cm_reply = NULL; /* Ensures the reply won't get re-freed */ mps_dprint(sc, MPS_XINFO, "clearing target %u handle 0x%04x\n", targ->tid, handle); /* * Don't clear target if remove fails because things will get confusing. * Leave the devname and sasaddr intact so that we know to avoid reusing * this target id if possible, and so we can assign the same target id * to this device if it comes back in the future. */ if ((le16toh(reply->IOCStatus) & MPI2_IOCSTATUS_MASK) == MPI2_IOCSTATUS_SUCCESS) { targ = tm->cm_targ; targ->handle = 0x0; targ->encl_handle = 0x0; targ->encl_slot = 0x0; targ->exp_dev_handle = 0x0; targ->phy_num = 0x0; targ->linkrate = 0x0; targ->devinfo = 0x0; targ->flags = 0x0; } mpssas_free_tm(sc, tm); } /* * No Need to call "MPI2_SAS_OP_REMOVE_DEVICE" For Volume removal. * Otherwise Volume Delete is same as Bare Drive Removal. */ void mpssas_prepare_volume_remove(struct mpssas_softc *sassc, uint16_t handle) { MPI2_SCSI_TASK_MANAGE_REQUEST *req; struct mps_softc *sc; struct mps_command *cm; struct mpssas_target *targ = NULL; MPS_FUNCTRACE(sassc->sc); sc = sassc->sc; #ifdef WD_SUPPORT /* * If this is a WD controller, determine if the disk should be exposed * to the OS or not. If disk should be exposed, return from this * function without doing anything. */ if (sc->WD_available && (sc->WD_hide_expose == MPS_WD_EXPOSE_ALWAYS)) { return; } #endif //WD_SUPPORT targ = mpssas_find_target_by_handle(sassc, 0, handle); if (targ == NULL) { /* FIXME: what is the action? */ /* We don't know about this device? 
*/ mps_dprint(sc, MPS_ERROR, "%s %d : invalid handle 0x%x \n", __func__,__LINE__, handle); return; } targ->flags |= MPSSAS_TARGET_INREMOVAL; cm = mpssas_alloc_tm(sc); if (cm == NULL) { mps_dprint(sc, MPS_ERROR, "%s: command alloc failure\n", __func__); return; } mpssas_rescan_target(sc, targ); req = (MPI2_SCSI_TASK_MANAGE_REQUEST *)cm->cm_req; req->DevHandle = targ->handle; req->Function = MPI2_FUNCTION_SCSI_TASK_MGMT; req->TaskType = MPI2_SCSITASKMGMT_TASKTYPE_TARGET_RESET; /* SAS Hard Link Reset / SATA Link Reset */ req->MsgFlags = MPI2_SCSITASKMGMT_MSGFLAGS_LINK_RESET; cm->cm_targ = targ; cm->cm_data = NULL; cm->cm_desc.HighPriority.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_HIGH_PRIORITY; cm->cm_complete = mpssas_remove_volume; cm->cm_complete_data = (void *)(uintptr_t)handle; mps_dprint(sc, MPS_INFO, "%s: Sending reset for target ID %d\n", __func__, targ->tid); mpssas_prepare_for_tm(sc, cm, targ, CAM_LUN_WILDCARD); mps_map_command(sc, cm); } /* * The MPT2 firmware performs debounce on the link to avoid transient link * errors and false removals. When it does decide that link has been lost * and a device need to go away, it expects that the host will perform a * target reset and then an op remove. The reset has the side-effect of * aborting any outstanding requests for the device, which is required for * the op-remove to succeed. It's not clear if the host should check for * the device coming back alive after the reset. */ void mpssas_prepare_remove(struct mpssas_softc *sassc, uint16_t handle) { MPI2_SCSI_TASK_MANAGE_REQUEST *req; struct mps_softc *sc; struct mps_command *cm; struct mpssas_target *targ = NULL; MPS_FUNCTRACE(sassc->sc); sc = sassc->sc; targ = mpssas_find_target_by_handle(sassc, 0, handle); if (targ == NULL) { /* FIXME: what is the action? */ /* We don't know about this device? 
*/ mps_dprint(sc, MPS_ERROR, "%s : invalid handle 0x%x \n", __func__, handle); return; } targ->flags |= MPSSAS_TARGET_INREMOVAL; cm = mpssas_alloc_tm(sc); if (cm == NULL) { mps_dprint(sc, MPS_ERROR, "%s: command alloc failure\n", __func__); return; } mpssas_rescan_target(sc, targ); req = (MPI2_SCSI_TASK_MANAGE_REQUEST *)cm->cm_req; memset(req, 0, sizeof(*req)); req->DevHandle = htole16(targ->handle); req->Function = MPI2_FUNCTION_SCSI_TASK_MGMT; req->TaskType = MPI2_SCSITASKMGMT_TASKTYPE_TARGET_RESET; /* SAS Hard Link Reset / SATA Link Reset */ req->MsgFlags = MPI2_SCSITASKMGMT_MSGFLAGS_LINK_RESET; cm->cm_targ = targ; cm->cm_data = NULL; cm->cm_desc.HighPriority.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_HIGH_PRIORITY; cm->cm_complete = mpssas_remove_device; cm->cm_complete_data = (void *)(uintptr_t)handle; mps_dprint(sc, MPS_INFO, "%s: Sending reset for target ID %d\n", __func__, targ->tid); mpssas_prepare_for_tm(sc, cm, targ, CAM_LUN_WILDCARD); mps_map_command(sc, cm); } static void mpssas_remove_device(struct mps_softc *sc, struct mps_command *tm) { MPI2_SCSI_TASK_MANAGE_REPLY *reply; MPI2_SAS_IOUNIT_CONTROL_REQUEST *req; struct mpssas_target *targ; struct mps_command *next_cm; uint16_t handle; MPS_FUNCTRACE(sc); reply = (MPI2_SCSI_TASK_MANAGE_REPLY *)tm->cm_reply; handle = (uint16_t)(uintptr_t)tm->cm_complete_data; targ = tm->cm_targ; /* * Currently there should be no way we can hit this case. It only * happens when we have a failure to allocate chain frames, and * task management commands don't have S/G lists. */ if ((tm->cm_flags & MPS_CM_FLAGS_ERROR_MASK) != 0) { mps_dprint(sc, MPS_ERROR, "%s: cm_flags = %#x for remove of handle %#04x! " "This should not happen!\n", __func__, tm->cm_flags, handle); } if (reply == NULL) { /* XXX retry the remove after the diag reset completes? 
*/ mps_dprint(sc, MPS_FAULT, "%s NULL reply resetting device 0x%04x\n", __func__, handle); mpssas_free_tm(sc, tm); return; } if ((le16toh(reply->IOCStatus) & MPI2_IOCSTATUS_MASK) != MPI2_IOCSTATUS_SUCCESS) { mps_dprint(sc, MPS_ERROR, "IOCStatus = 0x%x while resetting device 0x%x\n", le16toh(reply->IOCStatus), handle); } mps_dprint(sc, MPS_XINFO, "Reset aborted %u commands\n", le32toh(reply->TerminationCount)); mps_free_reply(sc, tm->cm_reply_data); tm->cm_reply = NULL; /* Ensures the reply won't get re-freed */ /* Reuse the existing command */ req = (MPI2_SAS_IOUNIT_CONTROL_REQUEST *)tm->cm_req; memset(req, 0, sizeof(*req)); req->Function = MPI2_FUNCTION_SAS_IO_UNIT_CONTROL; req->Operation = MPI2_SAS_OP_REMOVE_DEVICE; req->DevHandle = htole16(handle); tm->cm_data = NULL; tm->cm_desc.Default.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE; tm->cm_complete = mpssas_remove_complete; tm->cm_complete_data = (void *)(uintptr_t)handle; mps_map_command(sc, tm); mps_dprint(sc, MPS_XINFO, "clearing target %u handle 0x%04x\n", targ->tid, handle); TAILQ_FOREACH_SAFE(tm, &targ->commands, cm_link, next_cm) { union ccb *ccb; mps_dprint(sc, MPS_XINFO, "Completing missed command %p\n", tm); ccb = tm->cm_complete_data; mpssas_set_ccbstatus(ccb, CAM_DEV_NOT_THERE); mpssas_scsiio_complete(sc, tm); } } static void mpssas_remove_complete(struct mps_softc *sc, struct mps_command *tm) { MPI2_SAS_IOUNIT_CONTROL_REPLY *reply; uint16_t handle; struct mpssas_target *targ; struct mpssas_lun *lun; MPS_FUNCTRACE(sc); reply = (MPI2_SAS_IOUNIT_CONTROL_REPLY *)tm->cm_reply; handle = (uint16_t)(uintptr_t)tm->cm_complete_data; /* * Currently there should be no way we can hit this case. It only * happens when we have a failure to allocate chain frames, and * task management commands don't have S/G lists. */ if ((tm->cm_flags & MPS_CM_FLAGS_ERROR_MASK) != 0) { mps_dprint(sc, MPS_XINFO, "%s: cm_flags = %#x for remove of handle %#04x! 
" "This should not happen!\n", __func__, tm->cm_flags, handle); mpssas_free_tm(sc, tm); return; } if (reply == NULL) { /* most likely a chip reset */ mps_dprint(sc, MPS_FAULT, "%s NULL reply removing device 0x%04x\n", __func__, handle); mpssas_free_tm(sc, tm); return; } mps_dprint(sc, MPS_XINFO, "%s on handle 0x%04x, IOCStatus= 0x%x\n", __func__, handle, le16toh(reply->IOCStatus)); /* * Don't clear target if remove fails because things will get confusing. * Leave the devname and sasaddr intact so that we know to avoid reusing * this target id if possible, and so we can assign the same target id * to this device if it comes back in the future. */ if ((le16toh(reply->IOCStatus) & MPI2_IOCSTATUS_MASK) == MPI2_IOCSTATUS_SUCCESS) { targ = tm->cm_targ; targ->handle = 0x0; targ->encl_handle = 0x0; targ->encl_slot = 0x0; targ->exp_dev_handle = 0x0; targ->phy_num = 0x0; targ->linkrate = 0x0; targ->devinfo = 0x0; targ->flags = 0x0; while(!SLIST_EMPTY(&targ->luns)) { lun = SLIST_FIRST(&targ->luns); SLIST_REMOVE_HEAD(&targ->luns, lun_link); free(lun, M_MPT2); } } mpssas_free_tm(sc, tm); } static int mpssas_register_events(struct mps_softc *sc) { u32 events[MPI2_EVENT_NOTIFY_EVENTMASK_WORDS]; bzero(events, 16); setbit(events, MPI2_EVENT_SAS_DEVICE_STATUS_CHANGE); setbit(events, MPI2_EVENT_SAS_DISCOVERY); setbit(events, MPI2_EVENT_SAS_BROADCAST_PRIMITIVE); setbit(events, MPI2_EVENT_SAS_INIT_DEVICE_STATUS_CHANGE); setbit(events, MPI2_EVENT_SAS_INIT_TABLE_OVERFLOW); setbit(events, MPI2_EVENT_SAS_TOPOLOGY_CHANGE_LIST); setbit(events, MPI2_EVENT_SAS_ENCL_DEVICE_STATUS_CHANGE); setbit(events, MPI2_EVENT_IR_CONFIGURATION_CHANGE_LIST); setbit(events, MPI2_EVENT_IR_VOLUME); setbit(events, MPI2_EVENT_IR_PHYSICAL_DISK); setbit(events, MPI2_EVENT_IR_OPERATION_STATUS); setbit(events, MPI2_EVENT_LOG_ENTRY_ADDED); mps_register_events(sc, events, mpssas_evt_handler, NULL, &sc->sassc->mpssas_eh); return (0); } int mps_attach_sas(struct mps_softc *sc) { struct mpssas_softc *sassc; cam_status 
status; int unit, error = 0; MPS_FUNCTRACE(sc); sassc = malloc(sizeof(struct mpssas_softc), M_MPT2, M_WAITOK|M_ZERO); if(!sassc) { device_printf(sc->mps_dev, "Cannot allocate memory %s %d\n", __func__, __LINE__); return (ENOMEM); } /* * XXX MaxTargets could change during a reinit. Since we don't * resize the targets[] array during such an event, cache the value * of MaxTargets here so that we don't get into trouble later. This * should move into the reinit logic. */ sassc->maxtargets = sc->facts->MaxTargets; sassc->targets = malloc(sizeof(struct mpssas_target) * sassc->maxtargets, M_MPT2, M_WAITOK|M_ZERO); if(!sassc->targets) { device_printf(sc->mps_dev, "Cannot allocate memory %s %d\n", __func__, __LINE__); free(sassc, M_MPT2); return (ENOMEM); } sc->sassc = sassc; sassc->sc = sc; if ((sassc->devq = cam_simq_alloc(sc->num_reqs)) == NULL) { mps_dprint(sc, MPS_ERROR, "Cannot allocate SIMQ\n"); error = ENOMEM; goto out; } unit = device_get_unit(sc->mps_dev); sassc->sim = cam_sim_alloc(mpssas_action, mpssas_poll, "mps", sassc, unit, &sc->mps_mtx, sc->num_reqs, sc->num_reqs, sassc->devq); if (sassc->sim == NULL) { mps_dprint(sc, MPS_ERROR, "Cannot allocate SIM\n"); error = EINVAL; goto out; } TAILQ_INIT(&sassc->ev_queue); /* Initialize taskqueue for Event Handling */ TASK_INIT(&sassc->ev_task, 0, mpssas_firmware_event_work, sc); sassc->ev_tq = taskqueue_create("mps_taskq", M_NOWAIT | M_ZERO, taskqueue_thread_enqueue, &sassc->ev_tq); taskqueue_start_threads(&sassc->ev_tq, 1, PRIBIO, "%s taskq", device_get_nameunit(sc->mps_dev)); mps_lock(sc); /* * XXX There should be a bus for every port on the adapter, but since * we're just going to fake the topology for now, we'll pretend that * everything is just a target on a single bus. */ if ((error = xpt_bus_register(sassc->sim, sc->mps_dev, 0)) != 0) { mps_dprint(sc, MPS_ERROR, "Error %d registering SCSI bus\n", error); mps_unlock(sc); goto out; } /* * Assume that discovery events will start right away. 
* * Hold off boot until discovery is complete. */ sassc->flags |= MPSSAS_IN_STARTUP | MPSSAS_IN_DISCOVERY; sc->sassc->startup_refcount = 0; mpssas_startup_increment(sassc); callout_init(&sassc->discovery_callout, 1 /*mpsafe*/); /* * Register for async events so we can determine the EEDP * capabilities of devices. */ status = xpt_create_path(&sassc->path, /*periph*/NULL, cam_sim_path(sc->sassc->sim), CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD); if (status != CAM_REQ_CMP) { mps_printf(sc, "Error %#x creating sim path\n", status); sassc->path = NULL; } else { int event; #if (__FreeBSD_version >= 1000006) || \ ((__FreeBSD_version >= 901503) && (__FreeBSD_version < 1000000)) event = AC_ADVINFO_CHANGED; #else event = AC_FOUND_DEVICE; #endif status = xpt_register_async(event, mpssas_async, sc, sassc->path); if (status != CAM_REQ_CMP) { mps_dprint(sc, MPS_ERROR, "Error %#x registering async handler for " "AC_ADVINFO_CHANGED events\n", status); xpt_free_path(sassc->path); sassc->path = NULL; } } if (status != CAM_REQ_CMP) { /* * EEDP use is the exception, not the rule. * Warn the user, but do not fail to attach. */ mps_printf(sc, "EEDP capabilities disabled.\n"); } mps_unlock(sc); mpssas_register_events(sc); out: if (error) mps_detach_sas(sc); return (error); } int mps_detach_sas(struct mps_softc *sc) { struct mpssas_softc *sassc; struct mpssas_lun *lun, *lun_tmp; struct mpssas_target *targ; int i; MPS_FUNCTRACE(sc); if (sc->sassc == NULL) return (0); sassc = sc->sassc; mps_deregister_events(sc, sassc->mpssas_eh); /* * Drain and free the event handling taskqueue with the lock * unheld so that any parallel processing tasks drain properly * without deadlocking. */ if (sassc->ev_tq != NULL) taskqueue_free(sassc->ev_tq); /* Make sure CAM doesn't wedge if we had to bail out early. 
*/ mps_lock(sc); /* Deregister our async handler */ if (sassc->path != NULL) { xpt_register_async(0, mpssas_async, sc, sassc->path); xpt_free_path(sassc->path); sassc->path = NULL; } if (sassc->flags & MPSSAS_IN_STARTUP) xpt_release_simq(sassc->sim, 1); if (sassc->sim != NULL) { xpt_bus_deregister(cam_sim_path(sassc->sim)); cam_sim_free(sassc->sim, FALSE); } mps_unlock(sc); if (sassc->devq != NULL) cam_simq_free(sassc->devq); for(i=0; i< sassc->maxtargets ;i++) { targ = &sassc->targets[i]; SLIST_FOREACH_SAFE(lun, &targ->luns, lun_link, lun_tmp) { free(lun, M_MPT2); } } free(sassc->targets, M_MPT2); free(sassc, M_MPT2); sc->sassc = NULL; return (0); } void mpssas_discovery_end(struct mpssas_softc *sassc) { struct mps_softc *sc = sassc->sc; MPS_FUNCTRACE(sc); if (sassc->flags & MPSSAS_DISCOVERY_TIMEOUT_PENDING) callout_stop(&sassc->discovery_callout); } static void mpssas_action(struct cam_sim *sim, union ccb *ccb) { struct mpssas_softc *sassc; sassc = cam_sim_softc(sim); MPS_FUNCTRACE(sassc->sc); mps_dprint(sassc->sc, MPS_TRACE, "ccb func_code 0x%x\n", ccb->ccb_h.func_code); mtx_assert(&sassc->sc->mps_mtx, MA_OWNED); switch (ccb->ccb_h.func_code) { case XPT_PATH_INQ: { struct ccb_pathinq *cpi = &ccb->cpi; struct mps_softc *sc = sassc->sc; uint8_t sges_per_frame; cpi->version_num = 1; cpi->hba_inquiry = PI_SDTR_ABLE|PI_TAG_ABLE|PI_WIDE_16; cpi->target_sprt = 0; #if __FreeBSD_version >= 1000039 cpi->hba_misc = PIM_NOBUSRESET | PIM_UNMAPPED | PIM_NOSCAN; #else cpi->hba_misc = PIM_NOBUSRESET | PIM_UNMAPPED; #endif cpi->hba_eng_cnt = 0; cpi->max_target = sassc->maxtargets - 1; cpi->max_lun = 255; cpi->initiator_id = sassc->maxtargets - 1; strncpy(cpi->sim_vid, "FreeBSD", SIM_IDLEN); strncpy(cpi->hba_vid, "Avago Tech (LSI)", HBA_IDLEN); strncpy(cpi->dev_name, cam_sim_name(sim), DEV_IDLEN); cpi->unit_number = cam_sim_unit(sim); cpi->bus_id = cam_sim_bus(sim); cpi->base_transfer_speed = 150000; cpi->transport = XPORT_SAS; cpi->transport_version = 0; cpi->protocol = 
PROTO_SCSI; cpi->protocol_version = SCSI_REV_SPC; /* * Max IO Size is Page Size * the following: * ((SGEs per frame - 1 for chain element) * * Max Chain Depth) + 1 for no chain needed in last frame * * If user suggests a Max IO size to use, use the smaller of the * user's value and the calculated value as long as the user's * value is larger than 0. The user's value is in pages. */ sges_per_frame = ((sc->facts->IOCRequestFrameSize * 4) / sizeof(MPI2_SGE_SIMPLE64)) - 1; cpi->maxio = (sges_per_frame * sc->facts->MaxChainDepth) + 1; cpi->maxio *= PAGE_SIZE; if ((sc->max_io_pages > 0) && (sc->max_io_pages * PAGE_SIZE < cpi->maxio)) cpi->maxio = sc->max_io_pages * PAGE_SIZE; mpssas_set_ccbstatus(ccb, CAM_REQ_CMP); break; } case XPT_GET_TRAN_SETTINGS: { struct ccb_trans_settings *cts; struct ccb_trans_settings_sas *sas; struct ccb_trans_settings_scsi *scsi; struct mpssas_target *targ; cts = &ccb->cts; sas = &cts->xport_specific.sas; scsi = &cts->proto_specific.scsi; KASSERT(cts->ccb_h.target_id < sassc->maxtargets, ("Target %d out of bounds in XPT_GET_TRANS_SETTINGS\n", cts->ccb_h.target_id)); targ = &sassc->targets[cts->ccb_h.target_id]; if (targ->handle == 0x0) { mpssas_set_ccbstatus(ccb, CAM_DEV_NOT_THERE); break; } cts->protocol_version = SCSI_REV_SPC2; cts->transport = XPORT_SAS; cts->transport_version = 0; sas->valid = CTS_SAS_VALID_SPEED; switch (targ->linkrate) { case 0x08: sas->bitrate = 150000; break; case 0x09: sas->bitrate = 300000; break; case 0x0a: sas->bitrate = 600000; break; default: sas->valid = 0; } cts->protocol = PROTO_SCSI; scsi->valid = CTS_SCSI_VALID_TQ; scsi->flags = CTS_SCSI_FLAGS_TAG_ENB; mpssas_set_ccbstatus(ccb, CAM_REQ_CMP); break; } case XPT_CALC_GEOMETRY: cam_calc_geometry(&ccb->ccg, /*extended*/1); mpssas_set_ccbstatus(ccb, CAM_REQ_CMP); break; case XPT_RESET_DEV: mps_dprint(sassc->sc, MPS_XINFO, "mpssas_action XPT_RESET_DEV\n"); mpssas_action_resetdev(sassc, ccb); return; case XPT_RESET_BUS: case XPT_ABORT: case XPT_TERM_IO: 
mps_dprint(sassc->sc, MPS_XINFO, "mpssas_action faking success for abort or reset\n"); mpssas_set_ccbstatus(ccb, CAM_REQ_CMP); break; case XPT_SCSI_IO: mpssas_action_scsiio(sassc, ccb); return; #if __FreeBSD_version >= 900026 case XPT_SMP_IO: mpssas_action_smpio(sassc, ccb); return; #endif default: mpssas_set_ccbstatus(ccb, CAM_FUNC_NOTAVAIL); break; } xpt_done(ccb); } static void mpssas_announce_reset(struct mps_softc *sc, uint32_t ac_code, target_id_t target_id, lun_id_t lun_id) { path_id_t path_id = cam_sim_path(sc->sassc->sim); struct cam_path *path; mps_dprint(sc, MPS_XINFO, "%s code %x target %d lun %jx\n", __func__, ac_code, target_id, (uintmax_t)lun_id); if (xpt_create_path(&path, NULL, path_id, target_id, lun_id) != CAM_REQ_CMP) { mps_dprint(sc, MPS_ERROR, "unable to create path for reset " "notification\n"); return; } xpt_async(ac_code, path, NULL); xpt_free_path(path); } static void mpssas_complete_all_commands(struct mps_softc *sc) { struct mps_command *cm; int i; int completed; MPS_FUNCTRACE(sc); mtx_assert(&sc->mps_mtx, MA_OWNED); /* complete all commands with a NULL reply */ for (i = 1; i < sc->num_reqs; i++) { cm = &sc->commands[i]; cm->cm_reply = NULL; completed = 0; if (cm->cm_flags & MPS_CM_FLAGS_POLLED) cm->cm_flags |= MPS_CM_FLAGS_COMPLETE; if (cm->cm_complete != NULL) { mpssas_log_command(cm, MPS_RECOVERY, "completing cm %p state %x ccb %p for diag reset\n", cm, cm->cm_state, cm->cm_ccb); cm->cm_complete(sc, cm); completed = 1; } if (cm->cm_flags & MPS_CM_FLAGS_WAKEUP) { mpssas_log_command(cm, MPS_RECOVERY, "waking up cm %p state %x ccb %p for diag reset\n", cm, cm->cm_state, cm->cm_ccb); wakeup(cm); completed = 1; } if (cm->cm_sc->io_cmds_active != 0) { cm->cm_sc->io_cmds_active--; } else { mps_dprint(cm->cm_sc, MPS_INFO, "Warning: " "io_cmds_active is out of sync - resynching to " "0\n"); } if ((completed == 0) && (cm->cm_state != MPS_CM_STATE_FREE)) { /* this should never happen, but if it does, log */ mpssas_log_command(cm, MPS_RECOVERY, 
"cm %p state %x flags 0x%x ccb %p during diag " "reset\n", cm, cm->cm_state, cm->cm_flags, cm->cm_ccb); } } } void mpssas_handle_reinit(struct mps_softc *sc) { int i; /* Go back into startup mode and freeze the simq, so that CAM * doesn't send any commands until after we've rediscovered all * targets and found the proper device handles for them. * * After the reset, portenable will trigger discovery, and after all * discovery-related activities have finished, the simq will be * released. */ mps_dprint(sc, MPS_INIT, "%s startup\n", __func__); sc->sassc->flags |= MPSSAS_IN_STARTUP; sc->sassc->flags |= MPSSAS_IN_DISCOVERY; mpssas_startup_increment(sc->sassc); /* notify CAM of a bus reset */ mpssas_announce_reset(sc, AC_BUS_RESET, CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD); /* complete and cleanup after all outstanding commands */ mpssas_complete_all_commands(sc); mps_dprint(sc, MPS_INIT, "%s startup %u after command completion\n", __func__, sc->sassc->startup_refcount); /* zero all the target handles, since they may change after the * reset, and we have to rediscover all the targets and use the new * handles. 
*/
	for (i = 0; i < sc->sassc->maxtargets; i++) {
		if (sc->sassc->targets[i].outstanding != 0)
			mps_dprint(sc, MPS_INIT, "target %u outstanding %u\n",
			    i, sc->sassc->targets[i].outstanding);
		sc->sassc->targets[i].handle = 0x0;
		sc->sassc->targets[i].exp_dev_handle = 0x0;
		sc->sassc->targets[i].outstanding = 0;
		sc->sassc->targets[i].flags = MPSSAS_TARGET_INDIAGRESET;
	}
}

/*
 * Callout handler for a task management command that did not complete in
 * time.  It is armed with callout_reset() by mpssas_send_reset() and
 * mpssas_send_abort(), which pass the task management command as 'data'.
 *
 * The softc mutex must be held on entry (asserted below).  The timeout is
 * logged and the controller is handed to mps_reinit().
 */
static void
mpssas_tm_timeout(void *data)
{
	struct mps_command *tm = data;
	struct mps_softc *sc = tm->cm_sc;

	mtx_assert(&sc->mps_mtx, MA_OWNED);

	mpssas_log_command(tm, MPS_INFO|MPS_RECOVERY,
	    "task mgmt %p timed out\n", tm);
	mps_reinit(sc);
}

/*
 * Completion handler installed by mpssas_send_reset() for a logical unit
 * reset.  Stops the task management timeout callout, then inspects the
 * command flags and the reply before deciding how recovery continues.
 */
static void
mpssas_logical_unit_reset_complete(struct mps_softc *sc, struct mps_command *tm)
{
	MPI2_SCSI_TASK_MANAGE_REPLY *reply;
	MPI2_SCSI_TASK_MANAGE_REQUEST *req;
	unsigned int cm_count = 0;
	struct mps_command *cm;
	struct mpssas_target *targ;

	/* The command came back, so the timeout must not fire any more. */
	callout_stop(&tm->cm_callout);

	req = (MPI2_SCSI_TASK_MANAGE_REQUEST *)tm->cm_req;
	reply = (MPI2_SCSI_TASK_MANAGE_REPLY *)tm->cm_reply;
	targ = tm->cm_targ;

	/*
	 * Currently there should be no way we can hit this case.  It only
	 * happens when we have a failure to allocate chain frames, and
	 * task management commands don't have S/G lists.
	 * XXXSL So should it be an assertion?
	 */
	if ((tm->cm_flags & MPS_CM_FLAGS_ERROR_MASK) != 0) {
		mps_dprint(sc, MPS_ERROR, "%s: cm_flags = %#x for LUN reset! "
		    "This should not happen!\n", __func__, tm->cm_flags);
		mpssas_free_tm(sc, tm);
		return;
	}

	if (reply == NULL) {
		mpssas_log_command(tm, MPS_RECOVERY,
		    "NULL reset reply for tm %p\n", tm);
		if ((sc->mps_flags & MPS_FLAGS_DIAGRESET) != 0) {
			/* this completion was due to a reset, just cleanup */
			targ->tm = NULL;
			mpssas_free_tm(sc, tm);
		}
		else {
			/* we should have gotten a reply. */
			mps_reinit(sc);
		}
		return;
	}

	mpssas_log_command(tm, MPS_RECOVERY,
	    "logical unit reset status 0x%x code 0x%x count %u\n",
	    le16toh(reply->IOCStatus), le32toh(reply->ResponseCode),
	    le32toh(reply->TerminationCount));

	/* See if there are any outstanding commands for this LUN.
* This could be made more efficient by using a per-LU data * structure of some sort. */ TAILQ_FOREACH(cm, &targ->commands, cm_link) { if (cm->cm_lun == tm->cm_lun) cm_count++; } if (cm_count == 0) { mpssas_log_command(tm, MPS_RECOVERY|MPS_INFO, "logical unit %u finished recovery after reset\n", tm->cm_lun, tm); mpssas_announce_reset(sc, AC_SENT_BDR, tm->cm_targ->tid, tm->cm_lun); /* we've finished recovery for this logical unit. check and * see if some other logical unit has a timedout command * that needs to be processed. */ cm = TAILQ_FIRST(&targ->timedout_commands); if (cm) { mpssas_send_abort(sc, tm, cm); } else { targ->tm = NULL; mpssas_free_tm(sc, tm); } } else { /* if we still have commands for this LUN, the reset * effectively failed, regardless of the status reported. * Escalate to a target reset. */ mpssas_log_command(tm, MPS_RECOVERY, "logical unit reset complete for tm %p, but still have %u command(s)\n", tm, cm_count); mpssas_send_reset(sc, tm, MPI2_SCSITASKMGMT_TASKTYPE_TARGET_RESET); } } static void mpssas_target_reset_complete(struct mps_softc *sc, struct mps_command *tm) { MPI2_SCSI_TASK_MANAGE_REPLY *reply; MPI2_SCSI_TASK_MANAGE_REQUEST *req; struct mpssas_target *targ; callout_stop(&tm->cm_callout); req = (MPI2_SCSI_TASK_MANAGE_REQUEST *)tm->cm_req; reply = (MPI2_SCSI_TASK_MANAGE_REPLY *)tm->cm_reply; targ = tm->cm_targ; /* * Currently there should be no way we can hit this case. It only * happens when we have a failure to allocate chain frames, and * task management commands don't have S/G lists. */ if ((tm->cm_flags & MPS_CM_FLAGS_ERROR_MASK) != 0) { mps_dprint(sc, MPS_ERROR,"%s: cm_flags = %#x for target reset! 
" "This should not happen!\n", __func__, tm->cm_flags); mpssas_free_tm(sc, tm); return; } if (reply == NULL) { mpssas_log_command(tm, MPS_RECOVERY, "NULL reset reply for tm %p\n", tm); if ((sc->mps_flags & MPS_FLAGS_DIAGRESET) != 0) { /* this completion was due to a reset, just cleanup */ targ->tm = NULL; mpssas_free_tm(sc, tm); } else { /* we should have gotten a reply. */ mps_reinit(sc); } return; } mpssas_log_command(tm, MPS_RECOVERY, "target reset status 0x%x code 0x%x count %u\n", le16toh(reply->IOCStatus), le32toh(reply->ResponseCode), le32toh(reply->TerminationCount)); if (targ->outstanding == 0) { /* we've finished recovery for this target and all * of its logical units. */ mpssas_log_command(tm, MPS_RECOVERY|MPS_INFO, "recovery finished after target reset\n"); mpssas_announce_reset(sc, AC_SENT_BDR, tm->cm_targ->tid, CAM_LUN_WILDCARD); targ->tm = NULL; mpssas_free_tm(sc, tm); } else { /* after a target reset, if this target still has * outstanding commands, the reset effectively failed, * regardless of the status reported. escalate. 
*/ mpssas_log_command(tm, MPS_RECOVERY, "target reset complete for tm %p, but still have %u command(s)\n", tm, targ->outstanding); mps_reinit(sc); } } #define MPS_RESET_TIMEOUT 30 int mpssas_send_reset(struct mps_softc *sc, struct mps_command *tm, uint8_t type) { MPI2_SCSI_TASK_MANAGE_REQUEST *req; struct mpssas_target *target; int err; target = tm->cm_targ; if (target->handle == 0) { mps_dprint(sc, MPS_ERROR,"%s null devhandle for target_id %d\n", __func__, target->tid); return -1; } req = (MPI2_SCSI_TASK_MANAGE_REQUEST *)tm->cm_req; req->DevHandle = htole16(target->handle); req->Function = MPI2_FUNCTION_SCSI_TASK_MGMT; req->TaskType = type; if (type == MPI2_SCSITASKMGMT_TASKTYPE_LOGICAL_UNIT_RESET) { /* XXX Need to handle invalid LUNs */ MPS_SET_LUN(req->LUN, tm->cm_lun); tm->cm_targ->logical_unit_resets++; mpssas_log_command(tm, MPS_RECOVERY|MPS_INFO, "sending logical unit reset\n"); tm->cm_complete = mpssas_logical_unit_reset_complete; mpssas_prepare_for_tm(sc, tm, target, tm->cm_lun); } else if (type == MPI2_SCSITASKMGMT_TASKTYPE_TARGET_RESET) { /* * Target reset method = * SAS Hard Link Reset / SATA Link Reset */ req->MsgFlags = MPI2_SCSITASKMGMT_MSGFLAGS_LINK_RESET; tm->cm_targ->target_resets++; mpssas_log_command(tm, MPS_RECOVERY|MPS_INFO, "sending target reset\n"); tm->cm_complete = mpssas_target_reset_complete; mpssas_prepare_for_tm(sc, tm, target, CAM_LUN_WILDCARD); } else { mps_dprint(sc, MPS_ERROR, "unexpected reset type 0x%x\n", type); return -1; } tm->cm_data = NULL; tm->cm_desc.HighPriority.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_HIGH_PRIORITY; tm->cm_complete_data = (void *)tm; callout_reset(&tm->cm_callout, MPS_RESET_TIMEOUT * hz, mpssas_tm_timeout, tm); err = mps_map_command(sc, tm); if (err) mpssas_log_command(tm, MPS_RECOVERY, "error %d sending reset type %u\n", err, type); return err; } static void mpssas_abort_complete(struct mps_softc *sc, struct mps_command *tm) { struct mps_command *cm; MPI2_SCSI_TASK_MANAGE_REPLY *reply; 
MPI2_SCSI_TASK_MANAGE_REQUEST *req; struct mpssas_target *targ; callout_stop(&tm->cm_callout); req = (MPI2_SCSI_TASK_MANAGE_REQUEST *)tm->cm_req; reply = (MPI2_SCSI_TASK_MANAGE_REPLY *)tm->cm_reply; targ = tm->cm_targ; /* * Currently there should be no way we can hit this case. It only * happens when we have a failure to allocate chain frames, and * task management commands don't have S/G lists. */ if ((tm->cm_flags & MPS_CM_FLAGS_ERROR_MASK) != 0) { mpssas_log_command(tm, MPS_RECOVERY, "cm_flags = %#x for abort %p TaskMID %u!\n", tm->cm_flags, tm, le16toh(req->TaskMID)); mpssas_free_tm(sc, tm); return; } if (reply == NULL) { mpssas_log_command(tm, MPS_RECOVERY, "NULL abort reply for tm %p TaskMID %u\n", tm, le16toh(req->TaskMID)); if ((sc->mps_flags & MPS_FLAGS_DIAGRESET) != 0) { /* this completion was due to a reset, just cleanup */ targ->tm = NULL; mpssas_free_tm(sc, tm); } else { /* we should have gotten a reply. */ mps_reinit(sc); } return; } mpssas_log_command(tm, MPS_RECOVERY, "abort TaskMID %u status 0x%x code 0x%x count %u\n", le16toh(req->TaskMID), le16toh(reply->IOCStatus), le32toh(reply->ResponseCode), le32toh(reply->TerminationCount)); cm = TAILQ_FIRST(&tm->cm_targ->timedout_commands); if (cm == NULL) { /* if there are no more timedout commands, we're done with * error recovery for this target. */ mpssas_log_command(tm, MPS_RECOVERY, "finished recovery after aborting TaskMID %u\n", le16toh(req->TaskMID)); targ->tm = NULL; mpssas_free_tm(sc, tm); } else if (le16toh(req->TaskMID) != cm->cm_desc.Default.SMID) { /* abort success, but we have more timedout commands to abort */ mpssas_log_command(tm, MPS_RECOVERY, "continuing recovery after aborting TaskMID %u\n", le16toh(req->TaskMID)); mpssas_send_abort(sc, tm, cm); } else { /* we didn't get a command completion, so the abort * failed as far as we're concerned. escalate. 
*/ mpssas_log_command(tm, MPS_RECOVERY, "abort failed for TaskMID %u tm %p\n", le16toh(req->TaskMID), tm); mpssas_send_reset(sc, tm, MPI2_SCSITASKMGMT_TASKTYPE_LOGICAL_UNIT_RESET); } } #define MPS_ABORT_TIMEOUT 5 static int mpssas_send_abort(struct mps_softc *sc, struct mps_command *tm, struct mps_command *cm) { MPI2_SCSI_TASK_MANAGE_REQUEST *req; struct mpssas_target *targ; int err; targ = cm->cm_targ; if (targ->handle == 0) { mps_dprint(sc, MPS_ERROR,"%s null devhandle for target_id %d\n", __func__, cm->cm_ccb->ccb_h.target_id); return -1; } mpssas_log_command(tm, MPS_RECOVERY|MPS_INFO, "Aborting command %p\n", cm); req = (MPI2_SCSI_TASK_MANAGE_REQUEST *)tm->cm_req; req->DevHandle = htole16(targ->handle); req->Function = MPI2_FUNCTION_SCSI_TASK_MGMT; req->TaskType = MPI2_SCSITASKMGMT_TASKTYPE_ABORT_TASK; /* XXX Need to handle invalid LUNs */ MPS_SET_LUN(req->LUN, cm->cm_ccb->ccb_h.target_lun); req->TaskMID = htole16(cm->cm_desc.Default.SMID); tm->cm_data = NULL; tm->cm_desc.HighPriority.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_HIGH_PRIORITY; tm->cm_complete = mpssas_abort_complete; tm->cm_complete_data = (void *)tm; tm->cm_targ = cm->cm_targ; tm->cm_lun = cm->cm_lun; callout_reset(&tm->cm_callout, MPS_ABORT_TIMEOUT * hz, mpssas_tm_timeout, tm); targ->aborts++; mps_dprint(sc, MPS_INFO, "Sending reset from %s for target ID %d\n", __func__, targ->tid); mpssas_prepare_for_tm(sc, tm, targ, tm->cm_lun); err = mps_map_command(sc, tm); if (err) mpssas_log_command(tm, MPS_RECOVERY, "error %d sending abort for cm %p SMID %u\n", err, cm, req->TaskMID); return err; } static void mpssas_scsiio_timeout(void *data) { struct mps_softc *sc; struct mps_command *cm; struct mpssas_target *targ; cm = (struct mps_command *)data; sc = cm->cm_sc; MPS_FUNCTRACE(sc); mtx_assert(&sc->mps_mtx, MA_OWNED); mps_dprint(sc, MPS_XINFO, "Timeout checking cm %p\n", sc); /* * Run the interrupt handler to make sure it's not pending. 
This * isn't perfect because the command could have already completed * and been re-used, though this is unlikely. */ mps_intr_locked(sc); if (cm->cm_state == MPS_CM_STATE_FREE) { mpssas_log_command(cm, MPS_XINFO, "SCSI command %p almost timed out\n", cm); return; } if (cm->cm_ccb == NULL) { mps_dprint(sc, MPS_ERROR, "command timeout with NULL ccb\n"); return; } mpssas_log_command(cm, MPS_INFO, "command timeout cm %p ccb %p\n", cm, cm->cm_ccb); targ = cm->cm_targ; targ->timeouts++; /* XXX first, check the firmware state, to see if it's still * operational. if not, do a diag reset. */ mpssas_set_ccbstatus(cm->cm_ccb, CAM_CMD_TIMEOUT); cm->cm_state = MPS_CM_STATE_TIMEDOUT; TAILQ_INSERT_TAIL(&targ->timedout_commands, cm, cm_recovery); if (targ->tm != NULL) { /* target already in recovery, just queue up another * timedout command to be processed later. */ mps_dprint(sc, MPS_RECOVERY, "queued timedout cm %p for processing by tm %p\n", cm, targ->tm); } else if ((targ->tm = mpssas_alloc_tm(sc)) != NULL) { mps_dprint(sc, MPS_RECOVERY, "timedout cm %p allocated tm %p\n", cm, targ->tm); /* start recovery by aborting the first timedout command */ mpssas_send_abort(sc, targ->tm, cm); } else { /* XXX queue this target up for recovery once a TM becomes * available. The firmware only has a limited number of * HighPriority credits for the high priority requests used * for task management, and we ran out. * * Isilon: don't worry about this for now, since we have * more credits than disks in an enclosure, and limit * ourselves to one TM per target for recovery. 
*/ mps_dprint(sc, MPS_RECOVERY, "timedout cm %p failed to allocate a tm\n", cm); } } static void mpssas_action_scsiio(struct mpssas_softc *sassc, union ccb *ccb) { MPI2_SCSI_IO_REQUEST *req; struct ccb_scsiio *csio; struct mps_softc *sc; struct mpssas_target *targ; struct mpssas_lun *lun; struct mps_command *cm; uint8_t i, lba_byte, *ref_tag_addr; uint16_t eedp_flags; uint32_t mpi_control; sc = sassc->sc; MPS_FUNCTRACE(sc); mtx_assert(&sc->mps_mtx, MA_OWNED); csio = &ccb->csio; KASSERT(csio->ccb_h.target_id < sassc->maxtargets, ("Target %d out of bounds in XPT_SCSI_IO\n", csio->ccb_h.target_id)); targ = &sassc->targets[csio->ccb_h.target_id]; mps_dprint(sc, MPS_TRACE, "ccb %p target flag %x\n", ccb, targ->flags); if (targ->handle == 0x0) { mps_dprint(sc, MPS_ERROR, "%s NULL handle for target %u\n", __func__, csio->ccb_h.target_id); mpssas_set_ccbstatus(ccb, CAM_DEV_NOT_THERE); xpt_done(ccb); return; } if (targ->flags & MPS_TARGET_FLAGS_RAID_COMPONENT) { mps_dprint(sc, MPS_ERROR, "%s Raid component no SCSI IO " "supported %u\n", __func__, csio->ccb_h.target_id); mpssas_set_ccbstatus(ccb, CAM_DEV_NOT_THERE); xpt_done(ccb); return; } /* * Sometimes, it is possible to get a command that is not "In * Progress" and was actually aborted by the upper layer. Check for * this here and complete the command without error. */ if (mpssas_get_ccbstatus(ccb) != CAM_REQ_INPROG) { mps_dprint(sc, MPS_TRACE, "%s Command is not in progress for " "target %u\n", __func__, csio->ccb_h.target_id); xpt_done(ccb); return; } /* * If devinfo is 0 this will be a volume. In that case don't tell CAM * that the volume has timed out. We want volumes to be enumerated * until they are deleted/removed, not just failed. 
*/ if (targ->flags & MPSSAS_TARGET_INREMOVAL) { if (targ->devinfo == 0) mpssas_set_ccbstatus(ccb, CAM_REQ_CMP); else mpssas_set_ccbstatus(ccb, CAM_SEL_TIMEOUT); xpt_done(ccb); return; } if ((sc->mps_flags & MPS_FLAGS_SHUTDOWN) != 0) { mps_dprint(sc, MPS_INFO, "%s shutting down\n", __func__); mpssas_set_ccbstatus(ccb, CAM_DEV_NOT_THERE); xpt_done(ccb); return; } /* * If target has a reset in progress, freeze the devq and return. The * devq will be released when the TM reset is finished. */ if (targ->flags & MPSSAS_TARGET_INRESET) { ccb->ccb_h.status = CAM_BUSY | CAM_DEV_QFRZN; mps_dprint(sc, MPS_INFO, "%s: Freezing devq for target ID %d\n", __func__, targ->tid); xpt_freeze_devq(ccb->ccb_h.path, 1); xpt_done(ccb); return; } cm = mps_alloc_command(sc); if (cm == NULL || (sc->mps_flags & MPS_FLAGS_DIAGRESET)) { if (cm != NULL) { mps_free_command(sc, cm); } if ((sassc->flags & MPSSAS_QUEUE_FROZEN) == 0) { xpt_freeze_simq(sassc->sim, 1); sassc->flags |= MPSSAS_QUEUE_FROZEN; } ccb->ccb_h.status &= ~CAM_SIM_QUEUED; ccb->ccb_h.status |= CAM_REQUEUE_REQ; xpt_done(ccb); return; } req = (MPI2_SCSI_IO_REQUEST *)cm->cm_req; bzero(req, sizeof(*req)); req->DevHandle = htole16(targ->handle); req->Function = MPI2_FUNCTION_SCSI_IO_REQUEST; req->MsgFlags = 0; req->SenseBufferLowAddress = htole32(cm->cm_sense_busaddr); req->SenseBufferLength = MPS_SENSE_LEN; req->SGLFlags = 0; req->ChainOffset = 0; req->SGLOffset0 = 24; /* 32bit word offset to the SGL */ req->SGLOffset1= 0; req->SGLOffset2= 0; req->SGLOffset3= 0; req->SkipCount = 0; req->DataLength = htole32(csio->dxfer_len); req->BidirectionalDataLength = 0; req->IoFlags = htole16(csio->cdb_len); req->EEDPFlags = 0; /* Note: BiDirectional transfers are not supported */ switch (csio->ccb_h.flags & CAM_DIR_MASK) { case CAM_DIR_IN: mpi_control = MPI2_SCSIIO_CONTROL_READ; cm->cm_flags |= MPS_CM_FLAGS_DATAIN; break; case CAM_DIR_OUT: mpi_control = MPI2_SCSIIO_CONTROL_WRITE; cm->cm_flags |= MPS_CM_FLAGS_DATAOUT; break; case CAM_DIR_NONE: 
default: mpi_control = MPI2_SCSIIO_CONTROL_NODATATRANSFER; break; } if (csio->cdb_len == 32) mpi_control |= 4 << MPI2_SCSIIO_CONTROL_ADDCDBLEN_SHIFT; /* * It looks like the hardware doesn't require an explicit tag * number for each transaction. SAM Task Management not supported * at the moment. */ switch (csio->tag_action) { case MSG_HEAD_OF_Q_TAG: mpi_control |= MPI2_SCSIIO_CONTROL_HEADOFQ; break; case MSG_ORDERED_Q_TAG: mpi_control |= MPI2_SCSIIO_CONTROL_ORDEREDQ; break; case MSG_ACA_TASK: mpi_control |= MPI2_SCSIIO_CONTROL_ACAQ; break; case CAM_TAG_ACTION_NONE: case MSG_SIMPLE_Q_TAG: default: mpi_control |= MPI2_SCSIIO_CONTROL_SIMPLEQ; break; } mpi_control |= sc->mapping_table[csio->ccb_h.target_id].TLR_bits; req->Control = htole32(mpi_control); if (MPS_SET_LUN(req->LUN, csio->ccb_h.target_lun) != 0) { mps_free_command(sc, cm); mpssas_set_ccbstatus(ccb, CAM_LUN_INVALID); xpt_done(ccb); return; } if (csio->ccb_h.flags & CAM_CDB_POINTER) bcopy(csio->cdb_io.cdb_ptr, &req->CDB.CDB32[0], csio->cdb_len); else bcopy(csio->cdb_io.cdb_bytes, &req->CDB.CDB32[0],csio->cdb_len); req->IoFlags = htole16(csio->cdb_len); /* * Check if EEDP is supported and enabled. If it is then check if the * SCSI opcode could be using EEDP. If so, make sure the LUN exists and * is formatted for EEDP support. If all of this is true, set CDB up * for EEDP transfer. */ eedp_flags = op_code_prot[req->CDB.CDB32[0]]; if (sc->eedp_enabled && eedp_flags) { SLIST_FOREACH(lun, &targ->luns, lun_link) { if (lun->lun_id == csio->ccb_h.target_lun) { break; } } if ((lun != NULL) && (lun->eedp_formatted)) { req->EEDPBlockSize = htole16(lun->eedp_block_size); eedp_flags |= (MPI2_SCSIIO_EEDPFLAGS_INC_PRI_REFTAG | MPI2_SCSIIO_EEDPFLAGS_CHECK_REFTAG | MPI2_SCSIIO_EEDPFLAGS_CHECK_GUARD); req->EEDPFlags = htole16(eedp_flags); /* * If CDB less than 32, fill in Primary Ref Tag with * low 4 bytes of LBA. If CDB is 32, tag stuff is * already there. Also, set protection bit. 
FreeBSD * currently does not support CDBs bigger than 16, but * the code doesn't hurt, and will be here for the * future. */ if (csio->cdb_len != 32) { lba_byte = (csio->cdb_len == 16) ? 6 : 2; ref_tag_addr = (uint8_t *)&req->CDB.EEDP32. PrimaryReferenceTag; for (i = 0; i < 4; i++) { *ref_tag_addr = req->CDB.CDB32[lba_byte + i]; ref_tag_addr++; } req->CDB.EEDP32.PrimaryReferenceTag = htole32(req->CDB.EEDP32.PrimaryReferenceTag); req->CDB.EEDP32.PrimaryApplicationTagMask = 0xFFFF; req->CDB.CDB32[1] = (req->CDB.CDB32[1] & 0x1F) | 0x20; } else { eedp_flags |= MPI2_SCSIIO_EEDPFLAGS_INC_PRI_APPTAG; req->EEDPFlags = htole16(eedp_flags); req->CDB.CDB32[10] = (req->CDB.CDB32[10] & 0x1F) | 0x20; } } } cm->cm_length = csio->dxfer_len; if (cm->cm_length != 0) { cm->cm_data = ccb; cm->cm_flags |= MPS_CM_FLAGS_USE_CCB; } else { cm->cm_data = NULL; } cm->cm_sge = &req->SGL; cm->cm_sglsize = (32 - 24) * 4; cm->cm_desc.SCSIIO.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_SCSI_IO; cm->cm_desc.SCSIIO.DevHandle = htole16(targ->handle); cm->cm_complete = mpssas_scsiio_complete; cm->cm_complete_data = ccb; cm->cm_targ = targ; cm->cm_lun = csio->ccb_h.target_lun; cm->cm_ccb = ccb; /* * If HBA is a WD and the command is not for a retry, try to build a * direct I/O message. If failed, or the command is for a retry, send * the I/O to the IR volume itself. 
*/ if (sc->WD_valid_config) { if (ccb->ccb_h.sim_priv.entries[0].field == MPS_WD_RETRY) { mpssas_direct_drive_io(sassc, cm, ccb); } else { mpssas_set_ccbstatus(ccb, CAM_REQ_INPROG); } } +#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) + if (csio->bio != NULL) + biotrack(csio->bio, __func__); +#endif callout_reset_sbt(&cm->cm_callout, SBT_1MS * ccb->ccb_h.timeout, 0, mpssas_scsiio_timeout, cm, 0); targ->issued++; targ->outstanding++; TAILQ_INSERT_TAIL(&targ->commands, cm, cm_link); ccb->ccb_h.status |= CAM_SIM_QUEUED; mpssas_log_command(cm, MPS_XINFO, "%s cm %p ccb %p outstanding %u\n", __func__, cm, ccb, targ->outstanding); mps_map_command(sc, cm); return; } static void mps_response_code(struct mps_softc *sc, u8 response_code) { char *desc; switch (response_code) { case MPI2_SCSITASKMGMT_RSP_TM_COMPLETE: desc = "task management request completed"; break; case MPI2_SCSITASKMGMT_RSP_INVALID_FRAME: desc = "invalid frame"; break; case MPI2_SCSITASKMGMT_RSP_TM_NOT_SUPPORTED: desc = "task management request not supported"; break; case MPI2_SCSITASKMGMT_RSP_TM_FAILED: desc = "task management request failed"; break; case MPI2_SCSITASKMGMT_RSP_TM_SUCCEEDED: desc = "task management request succeeded"; break; case MPI2_SCSITASKMGMT_RSP_TM_INVALID_LUN: desc = "invalid lun"; break; case 0xA: desc = "overlapped tag attempted"; break; case MPI2_SCSITASKMGMT_RSP_IO_QUEUED_ON_IOC: desc = "task queued, however not sent to target"; break; default: desc = "unknown"; break; } mps_dprint(sc, MPS_XINFO, "response_code(0x%01x): %s\n", response_code, desc); } /** * mps_sc_failed_io_info - translated non-succesfull SCSI_IO request */ static void mps_sc_failed_io_info(struct mps_softc *sc, struct ccb_scsiio *csio, Mpi2SCSIIOReply_t *mpi_reply) { u32 response_info; u8 *response_bytes; u16 ioc_status = le16toh(mpi_reply->IOCStatus) & MPI2_IOCSTATUS_MASK; u8 scsi_state = mpi_reply->SCSIState; u8 scsi_status = mpi_reply->SCSIStatus; char *desc_ioc_state = NULL; char *desc_scsi_status = 
NULL; char *desc_scsi_state = sc->tmp_string; u32 log_info = le32toh(mpi_reply->IOCLogInfo); if (log_info == 0x31170000) return; switch (ioc_status) { case MPI2_IOCSTATUS_SUCCESS: desc_ioc_state = "success"; break; case MPI2_IOCSTATUS_INVALID_FUNCTION: desc_ioc_state = "invalid function"; break; case MPI2_IOCSTATUS_SCSI_RECOVERED_ERROR: desc_ioc_state = "scsi recovered error"; break; case MPI2_IOCSTATUS_SCSI_INVALID_DEVHANDLE: desc_ioc_state = "scsi invalid dev handle"; break; case MPI2_IOCSTATUS_SCSI_DEVICE_NOT_THERE: desc_ioc_state = "scsi device not there"; break; case MPI2_IOCSTATUS_SCSI_DATA_OVERRUN: desc_ioc_state = "scsi data overrun"; break; case MPI2_IOCSTATUS_SCSI_DATA_UNDERRUN: desc_ioc_state = "scsi data underrun"; break; case MPI2_IOCSTATUS_SCSI_IO_DATA_ERROR: desc_ioc_state = "scsi io data error"; break; case MPI2_IOCSTATUS_SCSI_PROTOCOL_ERROR: desc_ioc_state = "scsi protocol error"; break; case MPI2_IOCSTATUS_SCSI_TASK_TERMINATED: desc_ioc_state = "scsi task terminated"; break; case MPI2_IOCSTATUS_SCSI_RESIDUAL_MISMATCH: desc_ioc_state = "scsi residual mismatch"; break; case MPI2_IOCSTATUS_SCSI_TASK_MGMT_FAILED: desc_ioc_state = "scsi task mgmt failed"; break; case MPI2_IOCSTATUS_SCSI_IOC_TERMINATED: desc_ioc_state = "scsi ioc terminated"; break; case MPI2_IOCSTATUS_SCSI_EXT_TERMINATED: desc_ioc_state = "scsi ext terminated"; break; case MPI2_IOCSTATUS_EEDP_GUARD_ERROR: desc_ioc_state = "eedp guard error"; break; case MPI2_IOCSTATUS_EEDP_REF_TAG_ERROR: desc_ioc_state = "eedp ref tag error"; break; case MPI2_IOCSTATUS_EEDP_APP_TAG_ERROR: desc_ioc_state = "eedp app tag error"; break; default: desc_ioc_state = "unknown"; break; } switch (scsi_status) { case MPI2_SCSI_STATUS_GOOD: desc_scsi_status = "good"; break; case MPI2_SCSI_STATUS_CHECK_CONDITION: desc_scsi_status = "check condition"; break; case MPI2_SCSI_STATUS_CONDITION_MET: desc_scsi_status = "condition met"; break; case MPI2_SCSI_STATUS_BUSY: desc_scsi_status = "busy"; break; case 
MPI2_SCSI_STATUS_INTERMEDIATE: desc_scsi_status = "intermediate"; break; case MPI2_SCSI_STATUS_INTERMEDIATE_CONDMET: desc_scsi_status = "intermediate condmet"; break; case MPI2_SCSI_STATUS_RESERVATION_CONFLICT: desc_scsi_status = "reservation conflict"; break; case MPI2_SCSI_STATUS_COMMAND_TERMINATED: desc_scsi_status = "command terminated"; break; case MPI2_SCSI_STATUS_TASK_SET_FULL: desc_scsi_status = "task set full"; break; case MPI2_SCSI_STATUS_ACA_ACTIVE: desc_scsi_status = "aca active"; break; case MPI2_SCSI_STATUS_TASK_ABORTED: desc_scsi_status = "task aborted"; break; default: desc_scsi_status = "unknown"; break; } desc_scsi_state[0] = '\0'; if (!scsi_state) desc_scsi_state = " "; if (scsi_state & MPI2_SCSI_STATE_RESPONSE_INFO_VALID) strcat(desc_scsi_state, "response info "); if (scsi_state & MPI2_SCSI_STATE_TERMINATED) strcat(desc_scsi_state, "state terminated "); if (scsi_state & MPI2_SCSI_STATE_NO_SCSI_STATUS) strcat(desc_scsi_state, "no status "); if (scsi_state & MPI2_SCSI_STATE_AUTOSENSE_FAILED) strcat(desc_scsi_state, "autosense failed "); if (scsi_state & MPI2_SCSI_STATE_AUTOSENSE_VALID) strcat(desc_scsi_state, "autosense valid "); mps_dprint(sc, MPS_XINFO, "\thandle(0x%04x), ioc_status(%s)(0x%04x)\n", le16toh(mpi_reply->DevHandle), desc_ioc_state, ioc_status); /* We can add more detail about underflow data here * TO-DO * */ mps_dprint(sc, MPS_XINFO, "\tscsi_status(%s)(0x%02x), " "scsi_state(%s)(0x%02x)\n", desc_scsi_status, scsi_status, desc_scsi_state, scsi_state); if (sc->mps_debug & MPS_XINFO && scsi_state & MPI2_SCSI_STATE_AUTOSENSE_VALID) { mps_dprint(sc, MPS_XINFO, "-> Sense Buffer Data : Start :\n"); scsi_sense_print(csio); mps_dprint(sc, MPS_XINFO, "-> Sense Buffer Data : End :\n"); } if (scsi_state & MPI2_SCSI_STATE_RESPONSE_INFO_VALID) { response_info = le32toh(mpi_reply->ResponseInfo); response_bytes = (u8 *)&response_info; mps_response_code(sc,response_bytes[0]); } } static void mpssas_scsiio_complete(struct mps_softc *sc, struct 
mps_command *cm) { MPI2_SCSI_IO_REPLY *rep; union ccb *ccb; struct ccb_scsiio *csio; struct mpssas_softc *sassc; struct scsi_vpd_supported_page_list *vpd_list = NULL; u8 *TLR_bits, TLR_on; int dir = 0, i; u16 alloc_len; struct mpssas_target *target; target_id_t target_id; MPS_FUNCTRACE(sc); mps_dprint(sc, MPS_TRACE, "cm %p SMID %u ccb %p reply %p outstanding %u\n", cm, cm->cm_desc.Default.SMID, cm->cm_ccb, cm->cm_reply, cm->cm_targ->outstanding); callout_stop(&cm->cm_callout); mtx_assert(&sc->mps_mtx, MA_OWNED); sassc = sc->sassc; ccb = cm->cm_complete_data; csio = &ccb->csio; target_id = csio->ccb_h.target_id; rep = (MPI2_SCSI_IO_REPLY *)cm->cm_reply; /* * XXX KDM if the chain allocation fails, does it matter if we do * the sync and unload here? It is simpler to do it in every case, * assuming it doesn't cause problems. */ if (cm->cm_data != NULL) { if (cm->cm_flags & MPS_CM_FLAGS_DATAIN) dir = BUS_DMASYNC_POSTREAD; else if (cm->cm_flags & MPS_CM_FLAGS_DATAOUT) dir = BUS_DMASYNC_POSTWRITE; bus_dmamap_sync(sc->buffer_dmat, cm->cm_dmamap, dir); bus_dmamap_unload(sc->buffer_dmat, cm->cm_dmamap); } cm->cm_targ->completed++; cm->cm_targ->outstanding--; TAILQ_REMOVE(&cm->cm_targ->commands, cm, cm_link); ccb->ccb_h.status &= ~(CAM_STATUS_MASK | CAM_SIM_QUEUED); + +#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) + if (ccb->csio.bio != NULL) + biotrack(ccb->csio.bio, __func__); +#endif if (cm->cm_state == MPS_CM_STATE_TIMEDOUT) { TAILQ_REMOVE(&cm->cm_targ->timedout_commands, cm, cm_recovery); if (cm->cm_reply != NULL) mpssas_log_command(cm, MPS_RECOVERY, "completed timedout cm %p ccb %p during recovery " "ioc %x scsi %x state %x xfer %u\n", cm, cm->cm_ccb, le16toh(rep->IOCStatus), rep->SCSIStatus, rep->SCSIState, le32toh(rep->TransferCount)); else mpssas_log_command(cm, MPS_RECOVERY, "completed timedout cm %p ccb %p during recovery\n", cm, cm->cm_ccb); } else if (cm->cm_targ->tm != NULL) { if (cm->cm_reply != NULL) mpssas_log_command(cm, MPS_RECOVERY, "completed cm 
%p ccb %p during recovery " "ioc %x scsi %x state %x xfer %u\n", cm, cm->cm_ccb, le16toh(rep->IOCStatus), rep->SCSIStatus, rep->SCSIState, le32toh(rep->TransferCount)); else mpssas_log_command(cm, MPS_RECOVERY, "completed cm %p ccb %p during recovery\n", cm, cm->cm_ccb); } else if ((sc->mps_flags & MPS_FLAGS_DIAGRESET) != 0) { mpssas_log_command(cm, MPS_RECOVERY, "reset completed cm %p ccb %p\n", cm, cm->cm_ccb); } if ((cm->cm_flags & MPS_CM_FLAGS_ERROR_MASK) != 0) { /* * We ran into an error after we tried to map the command, * so we're getting a callback without queueing the command * to the hardware. So we set the status here, and it will * be retained below. We'll go through the "fast path", * because there can be no reply when we haven't actually * gone out to the hardware. */ mpssas_set_ccbstatus(ccb, CAM_REQUEUE_REQ); /* * Currently the only error included in the mask is * MPS_CM_FLAGS_CHAIN_FAILED, which means we're out of * chain frames. We need to freeze the queue until we get * a command that completed without this error, which will * hopefully have some chain frames attached that we can * use. If we wanted to get smarter about it, we would * only unfreeze the queue in this condition when we're * sure that we're getting some chain frames back. That's * probably unnecessary. */ if ((sassc->flags & MPSSAS_QUEUE_FROZEN) == 0) { xpt_freeze_simq(sassc->sim, 1); sassc->flags |= MPSSAS_QUEUE_FROZEN; mps_dprint(sc, MPS_XINFO, "Error sending command, " "freezing SIM queue\n"); } } /* * If this is a Start Stop Unit command and it was issued by the driver * during shutdown, decrement the refcount to account for all of the * commands that were sent. All SSU commands should be completed before * shutdown completes, meaning SSU_refcount will be 0 after SSU_started * is TRUE. 
*/ if (sc->SSU_started && (csio->cdb_io.cdb_bytes[0] == START_STOP_UNIT)) { mps_dprint(sc, MPS_INFO, "Decrementing SSU count.\n"); sc->SSU_refcount--; } /* Take the fast path to completion */ if (cm->cm_reply == NULL) { if (mpssas_get_ccbstatus(ccb) == CAM_REQ_INPROG) { if ((sc->mps_flags & MPS_FLAGS_DIAGRESET) != 0) mpssas_set_ccbstatus(ccb, CAM_SCSI_BUS_RESET); else { mpssas_set_ccbstatus(ccb, CAM_REQ_CMP); ccb->csio.scsi_status = SCSI_STATUS_OK; } if (sassc->flags & MPSSAS_QUEUE_FROZEN) { ccb->ccb_h.status |= CAM_RELEASE_SIMQ; sassc->flags &= ~MPSSAS_QUEUE_FROZEN; mps_dprint(sc, MPS_XINFO, "Unfreezing SIM queue\n"); } } /* * There are two scenarios where the status won't be * CAM_REQ_CMP. The first is if MPS_CM_FLAGS_ERROR_MASK is * set, the second is in the MPS_FLAGS_DIAGRESET above. */ if (mpssas_get_ccbstatus(ccb) != CAM_REQ_CMP) { /* * Freeze the dev queue so that commands are * executed in the correct order after error * recovery. */ ccb->ccb_h.status |= CAM_DEV_QFRZN; xpt_freeze_devq(ccb->ccb_h.path, /*count*/ 1); } mps_free_command(sc, cm); xpt_done(ccb); return; } mpssas_log_command(cm, MPS_XINFO, "ioc %x scsi %x state %x xfer %u\n", le16toh(rep->IOCStatus), rep->SCSIStatus, rep->SCSIState, le32toh(rep->TransferCount)); /* * If this is a Direct Drive I/O, reissue the I/O to the original IR * Volume if an error occurred (normal I/O retry). Use the original * CCB, but set a flag that this will be a retry so that it's sent to * the original volume. Free the command but reuse the CCB. 
*/ if (cm->cm_flags & MPS_CM_FLAGS_DD_IO) { mps_free_command(sc, cm); ccb->ccb_h.sim_priv.entries[0].field = MPS_WD_RETRY; mpssas_action_scsiio(sassc, ccb); return; } else ccb->ccb_h.sim_priv.entries[0].field = 0; switch (le16toh(rep->IOCStatus) & MPI2_IOCSTATUS_MASK) { case MPI2_IOCSTATUS_SCSI_DATA_UNDERRUN: csio->resid = cm->cm_length - le32toh(rep->TransferCount); /* FALLTHROUGH */ case MPI2_IOCSTATUS_SUCCESS: case MPI2_IOCSTATUS_SCSI_RECOVERED_ERROR: if ((le16toh(rep->IOCStatus) & MPI2_IOCSTATUS_MASK) == MPI2_IOCSTATUS_SCSI_RECOVERED_ERROR) mpssas_log_command(cm, MPS_XINFO, "recovered error\n"); /* Completion failed at the transport level. */ if (rep->SCSIState & (MPI2_SCSI_STATE_NO_SCSI_STATUS | MPI2_SCSI_STATE_TERMINATED)) { mpssas_set_ccbstatus(ccb, CAM_REQ_CMP_ERR); break; } /* In a modern packetized environment, an autosense failure * implies that there's not much else that can be done to * recover the command. */ if (rep->SCSIState & MPI2_SCSI_STATE_AUTOSENSE_FAILED) { mpssas_set_ccbstatus(ccb, CAM_AUTOSENSE_FAIL); break; } /* * CAM doesn't care about SAS Response Info data, but if this is * the state check if TLR should be done. If not, clear the * TLR_bits for the target. */ if ((rep->SCSIState & MPI2_SCSI_STATE_RESPONSE_INFO_VALID) && ((le32toh(rep->ResponseInfo) & MPI2_SCSI_RI_MASK_REASONCODE) == MPS_SCSI_RI_INVALID_FRAME)) { sc->mapping_table[target_id].TLR_bits = (u8)MPI2_SCSIIO_CONTROL_NO_TLR; } /* * Intentionally override the normal SCSI status reporting * for these two cases. These are likely to happen in a * multi-initiator environment, and we want to make sure that * CAM retries these commands rather than fail them. 
*/ if ((rep->SCSIStatus == MPI2_SCSI_STATUS_COMMAND_TERMINATED) || (rep->SCSIStatus == MPI2_SCSI_STATUS_TASK_ABORTED)) { mpssas_set_ccbstatus(ccb, CAM_REQ_ABORTED); break; } /* Handle normal status and sense */ csio->scsi_status = rep->SCSIStatus; if (rep->SCSIStatus == MPI2_SCSI_STATUS_GOOD) mpssas_set_ccbstatus(ccb, CAM_REQ_CMP); else mpssas_set_ccbstatus(ccb, CAM_SCSI_STATUS_ERROR); if (rep->SCSIState & MPI2_SCSI_STATE_AUTOSENSE_VALID) { int sense_len, returned_sense_len; returned_sense_len = min(le32toh(rep->SenseCount), sizeof(struct scsi_sense_data)); if (returned_sense_len < ccb->csio.sense_len) ccb->csio.sense_resid = ccb->csio.sense_len - returned_sense_len; else ccb->csio.sense_resid = 0; sense_len = min(returned_sense_len, ccb->csio.sense_len - ccb->csio.sense_resid); bzero(&ccb->csio.sense_data, sizeof(ccb->csio.sense_data)); bcopy(cm->cm_sense, &ccb->csio.sense_data, sense_len); ccb->ccb_h.status |= CAM_AUTOSNS_VALID; } /* * Check if this is an INQUIRY command. If it's a VPD inquiry, * and it's page code 0 (Supported Page List), and there is * inquiry data, and this is for a sequential access device, and * the device is an SSP target, and TLR is supported by the * controller, turn the TLR_bits value ON if page 0x90 is * supported. 
*/ if ((csio->cdb_io.cdb_bytes[0] == INQUIRY) && (csio->cdb_io.cdb_bytes[1] & SI_EVPD) && (csio->cdb_io.cdb_bytes[2] == SVPD_SUPPORTED_PAGE_LIST) && ((csio->ccb_h.flags & CAM_DATA_MASK) == CAM_DATA_VADDR) && (csio->data_ptr != NULL) && ((csio->data_ptr[0] & 0x1f) == T_SEQUENTIAL) && (sc->control_TLR) && (sc->mapping_table[target_id].device_info & MPI2_SAS_DEVICE_INFO_SSP_TARGET)) { vpd_list = (struct scsi_vpd_supported_page_list *) csio->data_ptr; TLR_bits = &sc->mapping_table[target_id].TLR_bits; *TLR_bits = (u8)MPI2_SCSIIO_CONTROL_NO_TLR; TLR_on = (u8)MPI2_SCSIIO_CONTROL_TLR_ON; alloc_len = ((u16)csio->cdb_io.cdb_bytes[3] << 8) + csio->cdb_io.cdb_bytes[4]; alloc_len -= csio->resid; for (i = 0; i < MIN(vpd_list->length, alloc_len); i++) { if (vpd_list->list[i] == 0x90) { *TLR_bits = TLR_on; break; } } } /* * If this is a SATA direct-access end device, mark it so that * a SCSI StartStopUnit command will be sent to it when the * driver is being shutdown. */ if ((csio->cdb_io.cdb_bytes[0] == INQUIRY) && ((csio->data_ptr[0] & 0x1f) == T_DIRECT) && (sc->mapping_table[target_id].device_info & MPI2_SAS_DEVICE_INFO_SATA_DEVICE) && ((sc->mapping_table[target_id].device_info & MPI2_SAS_DEVICE_INFO_MASK_DEVICE_TYPE) == MPI2_SAS_DEVICE_INFO_END_DEVICE)) { target = &sassc->targets[target_id]; target->supports_SSU = TRUE; mps_dprint(sc, MPS_XINFO, "Target %d supports SSU\n", target_id); } break; case MPI2_IOCSTATUS_SCSI_INVALID_DEVHANDLE: case MPI2_IOCSTATUS_SCSI_DEVICE_NOT_THERE: /* * If devinfo is 0 this will be a volume. In that case don't * tell CAM that the volume is not there. We want volumes to * be enumerated until they are deleted/removed, not just * failed. 
*/ if (cm->cm_targ->devinfo == 0) mpssas_set_ccbstatus(ccb, CAM_REQ_CMP); else mpssas_set_ccbstatus(ccb, CAM_DEV_NOT_THERE); break; case MPI2_IOCSTATUS_INVALID_SGL: mps_print_scsiio_cmd(sc, cm); mpssas_set_ccbstatus(ccb, CAM_UNREC_HBA_ERROR); break; case MPI2_IOCSTATUS_SCSI_TASK_TERMINATED: /* * This is one of the responses that comes back when an I/O * has been aborted. If it is because of a timeout that we * initiated, just set the status to CAM_CMD_TIMEOUT. * Otherwise set it to CAM_REQ_ABORTED. The effect on the * command is the same (it gets retried, subject to the * retry counter), the only difference is what gets printed * on the console. */ if (cm->cm_state == MPS_CM_STATE_TIMEDOUT) mpssas_set_ccbstatus(ccb, CAM_CMD_TIMEOUT); else mpssas_set_ccbstatus(ccb, CAM_REQ_ABORTED); break; case MPI2_IOCSTATUS_SCSI_DATA_OVERRUN: /* resid is ignored for this condition */ csio->resid = 0; mpssas_set_ccbstatus(ccb, CAM_DATA_RUN_ERR); break; case MPI2_IOCSTATUS_SCSI_IOC_TERMINATED: case MPI2_IOCSTATUS_SCSI_EXT_TERMINATED: /* * These can sometimes be transient transport-related * errors, and sometimes persistent drive-related errors. * We used to retry these without decrementing the retry * count by returning CAM_REQUEUE_REQ. Unfortunately, if * we hit a persistent drive problem that returns one of * these error codes, we would retry indefinitely. So, * return CAM_REQ_CMP_ERROR so that we decrement the retry * count and avoid infinite retries. We're taking the * potential risk of flagging false failures in the event * of a topology-related error (e.g. a SAS expander problem * causes a command addressed to a drive to fail), but * avoiding getting into an infinite retry loop. 
*/ mpssas_set_ccbstatus(ccb, CAM_REQ_CMP_ERR); mpssas_log_command(cm, MPS_INFO, "terminated ioc %x scsi %x state %x xfer %u\n", le16toh(rep->IOCStatus), rep->SCSIStatus, rep->SCSIState, le32toh(rep->TransferCount)); break; case MPI2_IOCSTATUS_INVALID_FUNCTION: case MPI2_IOCSTATUS_INTERNAL_ERROR: case MPI2_IOCSTATUS_INVALID_VPID: case MPI2_IOCSTATUS_INVALID_FIELD: case MPI2_IOCSTATUS_INVALID_STATE: case MPI2_IOCSTATUS_OP_STATE_NOT_SUPPORTED: case MPI2_IOCSTATUS_SCSI_IO_DATA_ERROR: case MPI2_IOCSTATUS_SCSI_PROTOCOL_ERROR: case MPI2_IOCSTATUS_SCSI_RESIDUAL_MISMATCH: case MPI2_IOCSTATUS_SCSI_TASK_MGMT_FAILED: default: mpssas_log_command(cm, MPS_XINFO, "completed ioc %x scsi %x state %x xfer %u\n", le16toh(rep->IOCStatus), rep->SCSIStatus, rep->SCSIState, le32toh(rep->TransferCount)); csio->resid = cm->cm_length; mpssas_set_ccbstatus(ccb, CAM_REQ_CMP_ERR); break; } mps_sc_failed_io_info(sc,csio,rep); if (sassc->flags & MPSSAS_QUEUE_FROZEN) { ccb->ccb_h.status |= CAM_RELEASE_SIMQ; sassc->flags &= ~MPSSAS_QUEUE_FROZEN; mps_dprint(sc, MPS_XINFO, "Command completed, " "unfreezing SIM queue\n"); } if (mpssas_get_ccbstatus(ccb) != CAM_REQ_CMP) { ccb->ccb_h.status |= CAM_DEV_QFRZN; xpt_freeze_devq(ccb->ccb_h.path, /*count*/ 1); } mps_free_command(sc, cm); xpt_done(ccb); } /* All Request reached here are Endian safe */ static void mpssas_direct_drive_io(struct mpssas_softc *sassc, struct mps_command *cm, union ccb *ccb) { pMpi2SCSIIORequest_t pIO_req; struct mps_softc *sc = sassc->sc; uint64_t virtLBA; uint32_t physLBA, stripe_offset, stripe_unit; uint32_t io_size, column; uint8_t *ptrLBA, lba_idx, physLBA_byte, *CDB; /* * If this is a valid SCSI command (Read6, Read10, Read16, Write6, * Write10, or Write16), build a direct I/O message. Otherwise, the I/O * will be sent to the IR volume itself. Since Read6 and Write6 are a * bit different than the 10/16 CDBs, handle them separately. 
*/ pIO_req = (pMpi2SCSIIORequest_t)cm->cm_req; CDB = pIO_req->CDB.CDB32; /* * Handle 6 byte CDBs. */ if ((pIO_req->DevHandle == sc->DD_dev_handle) && ((CDB[0] == READ_6) || (CDB[0] == WRITE_6))) { /* * Get the transfer size in blocks. */ io_size = (cm->cm_length >> sc->DD_block_exponent); /* * Get virtual LBA given in the CDB. */ virtLBA = ((uint64_t)(CDB[1] & 0x1F) << 16) | ((uint64_t)CDB[2] << 8) | (uint64_t)CDB[3]; /* * Check that LBA range for I/O does not exceed volume's * MaxLBA. */ if ((virtLBA + (uint64_t)io_size - 1) <= sc->DD_max_lba) { /* * Check if the I/O crosses a stripe boundary. If not, * translate the virtual LBA to a physical LBA and set * the DevHandle for the PhysDisk to be used. If it * does cross a boundary, do normal I/O. To get the * right DevHandle to use, get the map number for the * column, then use that map number to look up the * DevHandle of the PhysDisk. */ stripe_offset = (uint32_t)virtLBA & (sc->DD_stripe_size - 1); if ((stripe_offset + io_size) <= sc->DD_stripe_size) { physLBA = (uint32_t)virtLBA >> sc->DD_stripe_exponent; stripe_unit = physLBA / sc->DD_num_phys_disks; column = physLBA % sc->DD_num_phys_disks; pIO_req->DevHandle = htole16(sc->DD_column_map[column].dev_handle); /* ???? Is this endian safe*/ cm->cm_desc.SCSIIO.DevHandle = pIO_req->DevHandle; physLBA = (stripe_unit << sc->DD_stripe_exponent) + stripe_offset; ptrLBA = &pIO_req->CDB.CDB32[1]; physLBA_byte = (uint8_t)(physLBA >> 16); *ptrLBA = physLBA_byte; ptrLBA = &pIO_req->CDB.CDB32[2]; physLBA_byte = (uint8_t)(physLBA >> 8); *ptrLBA = physLBA_byte; ptrLBA = &pIO_req->CDB.CDB32[3]; physLBA_byte = (uint8_t)physLBA; *ptrLBA = physLBA_byte; /* * Set flag that Direct Drive I/O is * being done. */ cm->cm_flags |= MPS_CM_FLAGS_DD_IO; } } return; } /* * Handle 10, 12 or 16 byte CDBs. 
*/ if ((pIO_req->DevHandle == sc->DD_dev_handle) && ((CDB[0] == READ_10) || (CDB[0] == WRITE_10) || (CDB[0] == READ_16) || (CDB[0] == WRITE_16) || (CDB[0] == READ_12) || (CDB[0] == WRITE_12))) { /* * For 16-byte CDB's, verify that the upper 4 bytes of the CDB * are 0. If not, this is accessing beyond 2TB so handle it in * the else section. 10-byte and 12-byte CDB's are OK. * FreeBSD sends very rare 12 byte READ/WRITE, but driver is * ready to accept 12byte CDB for Direct IOs. */ if ((CDB[0] == READ_10 || CDB[0] == WRITE_10) || (CDB[0] == READ_12 || CDB[0] == WRITE_12) || !(CDB[2] | CDB[3] | CDB[4] | CDB[5])) { /* * Get the transfer size in blocks. */ io_size = (cm->cm_length >> sc->DD_block_exponent); /* * Get virtual LBA. Point to correct lower 4 bytes of * LBA in the CDB depending on command. */ lba_idx = ((CDB[0] == READ_12) || (CDB[0] == WRITE_12) || (CDB[0] == READ_10) || (CDB[0] == WRITE_10))? 2 : 6; virtLBA = ((uint64_t)CDB[lba_idx] << 24) | ((uint64_t)CDB[lba_idx + 1] << 16) | ((uint64_t)CDB[lba_idx + 2] << 8) | (uint64_t)CDB[lba_idx + 3]; /* * Check that LBA range for I/O does not exceed volume's * MaxLBA. */ if ((virtLBA + (uint64_t)io_size - 1) <= sc->DD_max_lba) { /* * Check if the I/O crosses a stripe boundary. * If not, translate the virtual LBA to a * physical LBA and set the DevHandle for the * PhysDisk to be used. If it does cross a * boundary, do normal I/O. To get the right * DevHandle to use, get the map number for the * column, then use that map number to look up * the DevHandle of the PhysDisk. */ stripe_offset = (uint32_t)virtLBA & (sc->DD_stripe_size - 1); if ((stripe_offset + io_size) <= sc->DD_stripe_size) { physLBA = (uint32_t)virtLBA >> sc->DD_stripe_exponent; stripe_unit = physLBA / sc->DD_num_phys_disks; column = physLBA % sc->DD_num_phys_disks; pIO_req->DevHandle = htole16(sc->DD_column_map[column]. 
dev_handle); cm->cm_desc.SCSIIO.DevHandle = pIO_req->DevHandle; physLBA = (stripe_unit << sc->DD_stripe_exponent) + stripe_offset; ptrLBA = &pIO_req->CDB.CDB32[lba_idx]; physLBA_byte = (uint8_t)(physLBA >> 24); *ptrLBA = physLBA_byte; ptrLBA = &pIO_req->CDB.CDB32[lba_idx + 1]; physLBA_byte = (uint8_t)(physLBA >> 16); *ptrLBA = physLBA_byte; ptrLBA = &pIO_req->CDB.CDB32[lba_idx + 2]; physLBA_byte = (uint8_t)(physLBA >> 8); *ptrLBA = physLBA_byte; ptrLBA = &pIO_req->CDB.CDB32[lba_idx + 3]; physLBA_byte = (uint8_t)physLBA; *ptrLBA = physLBA_byte; /* * Set flag that Direct Drive I/O is * being done. */ cm->cm_flags |= MPS_CM_FLAGS_DD_IO; } } } else { /* * 16-byte CDB and the upper 4 bytes of the CDB are not * 0. Get the transfer size in blocks. */ io_size = (cm->cm_length >> sc->DD_block_exponent); /* * Get virtual LBA. */ virtLBA = ((uint64_t)CDB[2] << 54) | ((uint64_t)CDB[3] << 48) | ((uint64_t)CDB[4] << 40) | ((uint64_t)CDB[5] << 32) | ((uint64_t)CDB[6] << 24) | ((uint64_t)CDB[7] << 16) | ((uint64_t)CDB[8] << 8) | (uint64_t)CDB[9]; /* * Check that LBA range for I/O does not exceed volume's * MaxLBA. */ if ((virtLBA + (uint64_t)io_size - 1) <= sc->DD_max_lba) { /* * Check if the I/O crosses a stripe boundary. * If not, translate the virtual LBA to a * physical LBA and set the DevHandle for the * PhysDisk to be used. If it does cross a * boundary, do normal I/O. To get the right * DevHandle to use, get the map number for the * column, then use that map number to look up * the DevHandle of the PhysDisk. */ stripe_offset = (uint32_t)virtLBA & (sc->DD_stripe_size - 1); if ((stripe_offset + io_size) <= sc->DD_stripe_size) { physLBA = (uint32_t)(virtLBA >> sc->DD_stripe_exponent); stripe_unit = physLBA / sc->DD_num_phys_disks; column = physLBA % sc->DD_num_phys_disks; pIO_req->DevHandle = htole16(sc->DD_column_map[column]. 
dev_handle); cm->cm_desc.SCSIIO.DevHandle = pIO_req->DevHandle; physLBA = (stripe_unit << sc->DD_stripe_exponent) + stripe_offset; /* * Set upper 4 bytes of LBA to 0. We * assume that the phys disks are less * than 2 TB's in size. Then, set the * lower 4 bytes. */ pIO_req->CDB.CDB32[2] = 0; pIO_req->CDB.CDB32[3] = 0; pIO_req->CDB.CDB32[4] = 0; pIO_req->CDB.CDB32[5] = 0; ptrLBA = &pIO_req->CDB.CDB32[6]; physLBA_byte = (uint8_t)(physLBA >> 24); *ptrLBA = physLBA_byte; ptrLBA = &pIO_req->CDB.CDB32[7]; physLBA_byte = (uint8_t)(physLBA >> 16); *ptrLBA = physLBA_byte; ptrLBA = &pIO_req->CDB.CDB32[8]; physLBA_byte = (uint8_t)(physLBA >> 8); *ptrLBA = physLBA_byte; ptrLBA = &pIO_req->CDB.CDB32[9]; physLBA_byte = (uint8_t)physLBA; *ptrLBA = physLBA_byte; /* * Set flag that Direct Drive I/O is * being done. */ cm->cm_flags |= MPS_CM_FLAGS_DD_IO; } } } } } #if __FreeBSD_version >= 900026 static void mpssas_smpio_complete(struct mps_softc *sc, struct mps_command *cm) { MPI2_SMP_PASSTHROUGH_REPLY *rpl; MPI2_SMP_PASSTHROUGH_REQUEST *req; uint64_t sasaddr; union ccb *ccb; ccb = cm->cm_complete_data; /* * Currently there should be no way we can hit this case. It only * happens when we have a failure to allocate chain frames, and SMP * commands require two S/G elements only. That should be handled * in the standard request size. 
*/
	if ((cm->cm_flags & MPS_CM_FLAGS_ERROR_MASK) != 0) {
		mps_dprint(sc, MPS_ERROR,"%s: cm_flags = %#x on SMP request!\n",
			   __func__, cm->cm_flags);
		mpssas_set_ccbstatus(ccb, CAM_REQ_CMP_ERR);
		goto bailout;
        }

	rpl = (MPI2_SMP_PASSTHROUGH_REPLY *)cm->cm_reply;
	if (rpl == NULL) {
		mps_dprint(sc, MPS_ERROR, "%s: NULL cm_reply!\n", __func__);
		mpssas_set_ccbstatus(ccb, CAM_REQ_CMP_ERR);
		goto bailout;
	}

	/* Rebuild the 64-bit SAS address from the request, for logging only. */
	req = (MPI2_SMP_PASSTHROUGH_REQUEST *)cm->cm_req;
	sasaddr = le32toh(req->SASAddress.Low);
	sasaddr |= ((uint64_t)(le32toh(req->SASAddress.High))) << 32;

	if ((le16toh(rpl->IOCStatus) & MPI2_IOCSTATUS_MASK) !=
	    MPI2_IOCSTATUS_SUCCESS ||
	    rpl->SASStatus != MPI2_SASSTATUS_SUCCESS) {
		mps_dprint(sc, MPS_XINFO, "%s: IOCStatus %04x SASStatus %02x\n",
		    __func__, le16toh(rpl->IOCStatus), rpl->SASStatus);
		mpssas_set_ccbstatus(ccb, CAM_REQ_CMP_ERR);
		goto bailout;
	}

	mps_dprint(sc, MPS_XINFO, "%s: SMP request to SAS address "
		   "%#jx completed successfully\n", __func__,
		   (uintmax_t)sasaddr);

	/* Byte 2 of the SMP response frame is compared to SMP_FR_ACCEPTED. */
	if (ccb->smpio.smp_response[2] == SMP_FR_ACCEPTED)
		mpssas_set_ccbstatus(ccb, CAM_REQ_CMP);
	else
		mpssas_set_ccbstatus(ccb, CAM_SMP_STATUS_ERROR);

bailout:
	/*
	 * We sync in both directions because we had DMAs in the S/G list
	 * in both directions.
	 */
	bus_dmamap_sync(sc->buffer_dmat, cm->cm_dmamap,
			BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
	bus_dmamap_unload(sc->buffer_dmat, cm->cm_dmamap);
	mps_free_command(sc, cm);
	xpt_done(ccb);
}

/*
 * Build and queue an SMP passthrough request for the given CCB, addressed
 * to the SAS address in sasaddr.  The request and response buffers of the
 * CCB are described through a two-element uio so that one mapping call
 * covers both.  On any failure the CCB is completed here with an error
 * status; otherwise completion happens in mpssas_smpio_complete().
 */
static void
mpssas_send_smpcmd(struct mpssas_softc *sassc, union ccb *ccb, uint64_t sasaddr)
{
	struct mps_command *cm;
	uint8_t *request, *response;
	MPI2_SMP_PASSTHROUGH_REQUEST *req;
	struct mps_softc *sc;
	int error;

	sc = sassc->sc;
	error = 0;

	/*
	 * XXX We don't yet support physical addresses here.
	 */
	switch ((ccb->ccb_h.flags & CAM_DATA_MASK)) {
	case CAM_DATA_PADDR:
	case CAM_DATA_SG_PADDR:
		mps_dprint(sc, MPS_ERROR,
			   "%s: physical addresses not supported\n", __func__);
		mpssas_set_ccbstatus(ccb, CAM_REQ_INVALID);
		xpt_done(ccb);
		return;
	case CAM_DATA_SG:
		/*
		 * The chip does not support more than one buffer for the
		 * request or response.
		 */
	 	if ((ccb->smpio.smp_request_sglist_cnt > 1)
		  || (ccb->smpio.smp_response_sglist_cnt > 1)) {
			mps_dprint(sc, MPS_ERROR,
				   "%s: multiple request or response "
				   "buffer segments not supported for SMP\n",
				   __func__);
			mpssas_set_ccbstatus(ccb, CAM_REQ_INVALID);
			xpt_done(ccb);
			return;
		}

		/*
		 * The CAM_SCATTER_VALID flag was originally implemented
		 * for the XPT_SCSI_IO CCB, which only has one data pointer.
		 * We have two.  So, just take that flag to mean that we
		 * might have S/G lists, and look at the S/G segment count
		 * to figure out whether that is the case for each individual
		 * buffer.
		 */
		if (ccb->smpio.smp_request_sglist_cnt != 0) {
			bus_dma_segment_t *req_sg;

			req_sg = (bus_dma_segment_t *)ccb->smpio.smp_request;
			request = (uint8_t *)(uintptr_t)req_sg[0].ds_addr;
		} else
			request = ccb->smpio.smp_request;

		if (ccb->smpio.smp_response_sglist_cnt != 0) {
			bus_dma_segment_t *rsp_sg;

			rsp_sg = (bus_dma_segment_t *)ccb->smpio.smp_response;
			response = (uint8_t *)(uintptr_t)rsp_sg[0].ds_addr;
		} else
			response = ccb->smpio.smp_response;
		break;
	case CAM_DATA_VADDR:
		request = ccb->smpio.smp_request;
		response = ccb->smpio.smp_response;
		break;
	default:
		mpssas_set_ccbstatus(ccb, CAM_REQ_INVALID);
		xpt_done(ccb);
		return;
	}

	cm = mps_alloc_command(sc);
	if (cm == NULL) {
		mps_dprint(sc, MPS_ERROR,
		    "%s: cannot allocate command\n", __func__);
		mpssas_set_ccbstatus(ccb, CAM_RESRC_UNAVAIL);
		xpt_done(ccb);
		return;
	}

	req = (MPI2_SMP_PASSTHROUGH_REQUEST *)cm->cm_req;
	bzero(req, sizeof(*req));
	req->Function = MPI2_FUNCTION_SMP_PASSTHROUGH;

	/* Allow the chip to use any route to this SAS address. */
	req->PhysicalPort = 0xff;

	req->RequestDataLength = htole16(ccb->smpio.smp_request_len);
	req->SGLFlags =
	    MPI2_SGLFLAGS_SYSTEM_ADDRESS_SPACE | MPI2_SGLFLAGS_SGL_TYPE_MPI;

	mps_dprint(sc, MPS_XINFO, "%s: sending SMP request to SAS "
	    "address %#jx\n", __func__, (uintmax_t)sasaddr);

	mpi_init_sge(cm, req, &req->SGL);

	/*
	 * Set up a uio to pass into mps_map_command().  This allows us to
	 * do one map command, and one busdma call in there.
	 */
	cm->cm_uio.uio_iov = cm->cm_iovec;
	cm->cm_uio.uio_iovcnt = 2;
	cm->cm_uio.uio_segflg = UIO_SYSSPACE;

	/*
	 * The read/write flag isn't used by busdma, but set it just in
	 * case.  This isn't exactly accurate, either, since we're going in
	 * both directions.
	 */
	cm->cm_uio.uio_rw = UIO_WRITE;

	cm->cm_iovec[0].iov_base = request;
	cm->cm_iovec[0].iov_len = le16toh(req->RequestDataLength);
	cm->cm_iovec[1].iov_base = response;
	cm->cm_iovec[1].iov_len = ccb->smpio.smp_response_len;

	cm->cm_uio.uio_resid = cm->cm_iovec[0].iov_len +
			       cm->cm_iovec[1].iov_len;

	/*
	 * Trigger a warning message in mps_data_cb() for the user if we
	 * wind up exceeding two S/G segments.  The chip expects one
	 * segment for the request and another for the response.
	 */
	cm->cm_max_segs = 2;

	cm->cm_desc.Default.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE;
	cm->cm_complete = mpssas_smpio_complete;
	cm->cm_complete_data = ccb;

	/*
	 * Tell the mapping code that we're using a uio, and that this is
	 * an SMP passthrough request.  There is a little special-case
	 * logic there (in mps_data_cb()) to handle the bidirectional
	 * transfer.
	 */
	cm->cm_flags |= MPS_CM_FLAGS_USE_UIO | MPS_CM_FLAGS_SMP_PASS |
			MPS_CM_FLAGS_DATAIN | MPS_CM_FLAGS_DATAOUT;

	/* The chip data format is little endian. */
	req->SASAddress.High = htole32(sasaddr >> 32);
	req->SASAddress.Low = htole32(sasaddr);

	/*
	 * XXX Note that we don't have a timeout/abort mechanism here.
	 * From the manual, it looks like task management requests only
	 * work for SCSI IO and SATA passthrough requests.  We may need to
	 * have a mechanism to retry requests in the event of a chip reset
	 * at least.  Hopefully the chip will ensure that any errors short
	 * of that are relayed back to the driver.
	 */
	error = mps_map_command(sc, cm);
	if ((error != 0) && (error != EINPROGRESS)) {
		mps_dprint(sc, MPS_ERROR,
			   "%s: error %d returned from mps_map_command()\n",
			   __func__, error);
		goto bailout_error;
	}

	return;

bailout_error:
	mps_free_command(sc, cm);
	mpssas_set_ccbstatus(ccb, CAM_RESRC_UNAVAIL);
	xpt_done(ccb);
	return;

}

/*
 * Handle an SMP I/O CCB: work out which SAS address the SMP request should
 * be sent to (the target itself if it has an embedded SMP target, otherwise
 * its parent) and pass the CCB on to mpssas_send_smpcmd().  The CCB is
 * completed with an error status if no usable address can be found.
 */
static void
mpssas_action_smpio(struct mpssas_softc *sassc, union ccb *ccb)
{
	struct mps_softc *sc;
	struct mpssas_target *targ;
	uint64_t sasaddr = 0;

	sc = sassc->sc;

	/*
	 * Make sure the target exists.
	 */
	KASSERT(ccb->ccb_h.target_id < sassc->maxtargets,
	    ("Target %d out of bounds in XPT_SMP_IO\n", ccb->ccb_h.target_id));
	targ = &sassc->targets[ccb->ccb_h.target_id];
	if (targ->handle == 0x0) {
		mps_dprint(sc, MPS_ERROR,
			   "%s: target %d does not exist!\n", __func__,
			   ccb->ccb_h.target_id);
		mpssas_set_ccbstatus(ccb, CAM_SEL_TIMEOUT);
		xpt_done(ccb);
		return;
	}

	/*
	 * If this device has an embedded SMP target, we'll talk to it
	 * directly.  Otherwise, we need to figure out what the expander's
	 * address is.
	 */
	if ((targ->devinfo & MPI2_SAS_DEVICE_INFO_SMP_TARGET) != 0)
		sasaddr = targ->sasaddr;

	/*
	 * If we don't have a SAS address for the expander yet, try
	 * grabbing it from the page 0x83 information cached in the
	 * transport layer for this target.  LSI expanders report the
	 * expander SAS address as the port-associated SAS address in
	 * Inquiry VPD page 0x83.  Maxim expanders don't report it in page
	 * 0x83.
	 *
	 * XXX KDM disable this for now, but leave it commented out so that
	 * it is obvious that this is another possible way to get the SAS
	 * address.
	 *
	 * The parent handle method below is a little more reliable, and
	 * the other benefit is that it works for devices other than SES
	 * devices.  So you can send a SMP request to a da(4) device and it
	 * will get routed to the expander that device is attached to.
* (Assuming the da(4) device doesn't contain an SMP target...) */ #if 0 if (sasaddr == 0) sasaddr = xpt_path_sas_addr(ccb->ccb_h.path); #endif /* * If we still don't have a SAS address for the expander, look for * the parent device of this device, which is probably the expander. */ if (sasaddr == 0) { #ifdef OLD_MPS_PROBE struct mpssas_target *parent_target; #endif if (targ->parent_handle == 0x0) { mps_dprint(sc, MPS_ERROR, "%s: handle %d does not have a valid " "parent handle!\n", __func__, targ->handle); mpssas_set_ccbstatus(ccb, CAM_DEV_NOT_THERE); goto bailout; } #ifdef OLD_MPS_PROBE parent_target = mpssas_find_target_by_handle(sassc, 0, targ->parent_handle); if (parent_target == NULL) { mps_dprint(sc, MPS_ERROR, "%s: handle %d does not have a valid " "parent target!\n", __func__, targ->handle); mpssas_set_ccbstatus(ccb, CAM_DEV_NOT_THERE); goto bailout; } if ((parent_target->devinfo & MPI2_SAS_DEVICE_INFO_SMP_TARGET) == 0) { mps_dprint(sc, MPS_ERROR, "%s: handle %d parent %d does not " "have an SMP target!\n", __func__, targ->handle, parent_target->handle); mpssas_set_ccbstatus(ccb, CAM_DEV_NOT_THERE); goto bailout; } sasaddr = parent_target->sasaddr; #else /* OLD_MPS_PROBE */ if ((targ->parent_devinfo & MPI2_SAS_DEVICE_INFO_SMP_TARGET) == 0) { mps_dprint(sc, MPS_ERROR, "%s: handle %d parent %d does not " "have an SMP target!\n", __func__, targ->handle, targ->parent_handle); mpssas_set_ccbstatus(ccb, CAM_DEV_NOT_THERE); goto bailout; } if (targ->parent_sasaddr == 0x0) { mps_dprint(sc, MPS_ERROR, "%s: handle %d parent handle %d does " "not have a valid SAS address!\n", __func__, targ->handle, targ->parent_handle); mpssas_set_ccbstatus(ccb, CAM_DEV_NOT_THERE); goto bailout; } sasaddr = targ->parent_sasaddr; #endif /* OLD_MPS_PROBE */ } if (sasaddr == 0) { mps_dprint(sc, MPS_INFO, "%s: unable to find SAS address for handle %d\n", __func__, targ->handle); mpssas_set_ccbstatus(ccb, CAM_DEV_NOT_THERE); goto bailout; } mpssas_send_smpcmd(sassc, ccb, sasaddr); return; 
bailout: xpt_done(ccb); } #endif //__FreeBSD_version >= 900026 static void mpssas_action_resetdev(struct mpssas_softc *sassc, union ccb *ccb) { MPI2_SCSI_TASK_MANAGE_REQUEST *req; struct mps_softc *sc; struct mps_command *tm; struct mpssas_target *targ; MPS_FUNCTRACE(sassc->sc); mtx_assert(&sassc->sc->mps_mtx, MA_OWNED); KASSERT(ccb->ccb_h.target_id < sassc->maxtargets, ("Target %d out of bounds in XPT_RESET_DEV\n", ccb->ccb_h.target_id)); sc = sassc->sc; tm = mps_alloc_command(sc); if (tm == NULL) { mps_dprint(sc, MPS_ERROR, "command alloc failure in mpssas_action_resetdev\n"); mpssas_set_ccbstatus(ccb, CAM_RESRC_UNAVAIL); xpt_done(ccb); return; } targ = &sassc->targets[ccb->ccb_h.target_id]; req = (MPI2_SCSI_TASK_MANAGE_REQUEST *)tm->cm_req; req->DevHandle = htole16(targ->handle); req->Function = MPI2_FUNCTION_SCSI_TASK_MGMT; req->TaskType = MPI2_SCSITASKMGMT_TASKTYPE_TARGET_RESET; /* SAS Hard Link Reset / SATA Link Reset */ req->MsgFlags = MPI2_SCSITASKMGMT_MSGFLAGS_LINK_RESET; tm->cm_data = NULL; tm->cm_desc.HighPriority.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_HIGH_PRIORITY; tm->cm_complete = mpssas_resetdev_complete; tm->cm_complete_data = ccb; tm->cm_targ = targ; targ->flags |= MPSSAS_TARGET_INRESET; mps_map_command(sc, tm); } static void mpssas_resetdev_complete(struct mps_softc *sc, struct mps_command *tm) { MPI2_SCSI_TASK_MANAGE_REPLY *resp; union ccb *ccb; MPS_FUNCTRACE(sc); mtx_assert(&sc->mps_mtx, MA_OWNED); resp = (MPI2_SCSI_TASK_MANAGE_REPLY *)tm->cm_reply; ccb = tm->cm_complete_data; /* * Currently there should be no way we can hit this case. It only * happens when we have a failure to allocate chain frames, and * task management commands don't have S/G lists. */ if ((tm->cm_flags & MPS_CM_FLAGS_ERROR_MASK) != 0) { MPI2_SCSI_TASK_MANAGE_REQUEST *req; req = (MPI2_SCSI_TASK_MANAGE_REQUEST *)tm->cm_req; mps_dprint(sc, MPS_ERROR, "%s: cm_flags = %#x for reset of handle %#04x! 
" "This should not happen!\n", __func__, tm->cm_flags, req->DevHandle); mpssas_set_ccbstatus(ccb, CAM_REQ_CMP_ERR); goto bailout; } mps_dprint(sc, MPS_XINFO, "%s: IOCStatus = 0x%x ResponseCode = 0x%x\n", __func__, le16toh(resp->IOCStatus), le32toh(resp->ResponseCode)); if (le32toh(resp->ResponseCode) == MPI2_SCSITASKMGMT_RSP_TM_COMPLETE) { mpssas_set_ccbstatus(ccb, CAM_REQ_CMP); mpssas_announce_reset(sc, AC_SENT_BDR, tm->cm_targ->tid, CAM_LUN_WILDCARD); } else mpssas_set_ccbstatus(ccb, CAM_REQ_CMP_ERR); bailout: mpssas_free_tm(sc, tm); xpt_done(ccb); } static void mpssas_poll(struct cam_sim *sim) { struct mpssas_softc *sassc; sassc = cam_sim_softc(sim); if (sassc->sc->mps_debug & MPS_TRACE) { /* frequent debug messages during a panic just slow * everything down too much. */ mps_printf(sassc->sc, "%s clearing MPS_TRACE\n", __func__); sassc->sc->mps_debug &= ~MPS_TRACE; } mps_intr_locked(sassc->sc); } static void mpssas_async(void *callback_arg, uint32_t code, struct cam_path *path, void *arg) { struct mps_softc *sc; sc = (struct mps_softc *)callback_arg; switch (code) { #if (__FreeBSD_version >= 1000006) || \ ((__FreeBSD_version >= 901503) && (__FreeBSD_version < 1000000)) case AC_ADVINFO_CHANGED: { struct mpssas_target *target; struct mpssas_softc *sassc; struct scsi_read_capacity_data_long rcap_buf; struct ccb_dev_advinfo cdai; struct mpssas_lun *lun; lun_id_t lunid; int found_lun; uintptr_t buftype; buftype = (uintptr_t)arg; found_lun = 0; sassc = sc->sassc; /* * We're only interested in read capacity data changes. */ if (buftype != CDAI_TYPE_RCAPLONG) break; /* * We should have a handle for this, but check to make sure. 
*/ KASSERT(xpt_path_target_id(path) < sassc->maxtargets, ("Target %d out of bounds in mpssas_async\n", xpt_path_target_id(path))); target = &sassc->targets[xpt_path_target_id(path)]; if (target->handle == 0) break; lunid = xpt_path_lun_id(path); SLIST_FOREACH(lun, &target->luns, lun_link) { if (lun->lun_id == lunid) { found_lun = 1; break; } } if (found_lun == 0) { lun = malloc(sizeof(struct mpssas_lun), M_MPT2, M_NOWAIT | M_ZERO); if (lun == NULL) { mps_dprint(sc, MPS_ERROR, "Unable to alloc " "LUN for EEDP support.\n"); break; } lun->lun_id = lunid; SLIST_INSERT_HEAD(&target->luns, lun, lun_link); } bzero(&rcap_buf, sizeof(rcap_buf)); xpt_setup_ccb(&cdai.ccb_h, path, CAM_PRIORITY_NORMAL); cdai.ccb_h.func_code = XPT_DEV_ADVINFO; cdai.ccb_h.flags = CAM_DIR_IN; cdai.buftype = CDAI_TYPE_RCAPLONG; #if (__FreeBSD_version >= 1100061) || \ ((__FreeBSD_version >= 1001510) && (__FreeBSD_version < 1100000)) cdai.flags = CDAI_FLAG_NONE; #else cdai.flags = 0; #endif cdai.bufsiz = sizeof(rcap_buf); cdai.buf = (uint8_t *)&rcap_buf; xpt_action((union ccb *)&cdai); if ((cdai.ccb_h.status & CAM_DEV_QFRZN) != 0) cam_release_devq(cdai.ccb_h.path, 0, 0, 0, FALSE); if ((mpssas_get_ccbstatus((union ccb *)&cdai) == CAM_REQ_CMP) && (rcap_buf.prot & SRC16_PROT_EN)) { lun->eedp_formatted = TRUE; lun->eedp_block_size = scsi_4btoul(rcap_buf.length); } else { lun->eedp_formatted = FALSE; lun->eedp_block_size = 0; } break; } #else case AC_FOUND_DEVICE: { struct ccb_getdev *cgd; cgd = arg; mpssas_check_eedp(sc, path, cgd); break; } #endif default: break; } } #if (__FreeBSD_version < 901503) || \ ((__FreeBSD_version >= 1000000) && (__FreeBSD_version < 1000006)) static void mpssas_check_eedp(struct mps_softc *sc, struct cam_path *path, struct ccb_getdev *cgd) { struct mpssas_softc *sassc = sc->sassc; struct ccb_scsiio *csio; struct scsi_read_capacity_16 *scsi_cmd; struct scsi_read_capacity_eedp *rcap_buf; path_id_t pathid; target_id_t targetid; lun_id_t lunid; union ccb *ccb; struct cam_path 
*local_path; struct mpssas_target *target; struct mpssas_lun *lun; uint8_t found_lun; char path_str[64]; sassc = sc->sassc; pathid = cam_sim_path(sassc->sim); targetid = xpt_path_target_id(path); lunid = xpt_path_lun_id(path); KASSERT(targetid < sassc->maxtargets, ("Target %d out of bounds in mpssas_check_eedp\n", targetid)); target = &sassc->targets[targetid]; if (target->handle == 0x0) return; /* * Determine if the device is EEDP capable. * * If this flag is set in the inquiry data, * the device supports protection information, * and must support the 16 byte read * capacity command, otherwise continue without * sending read cap 16 */ if ((cgd->inq_data.spc3_flags & SPC3_SID_PROTECT) == 0) return; /* * Issue a READ CAPACITY 16 command. This info * is used to determine if the LUN is formatted * for EEDP support. */ ccb = xpt_alloc_ccb_nowait(); if (ccb == NULL) { mps_dprint(sc, MPS_ERROR, "Unable to alloc CCB " "for EEDP support.\n"); return; } if (xpt_create_path(&local_path, xpt_periph, pathid, targetid, lunid) != CAM_REQ_CMP) { mps_dprint(sc, MPS_ERROR, "Unable to create " "path for EEDP support\n"); xpt_free_ccb(ccb); return; } /* * If LUN is already in list, don't create a new * one. */ found_lun = FALSE; SLIST_FOREACH(lun, &target->luns, lun_link) { if (lun->lun_id == lunid) { found_lun = TRUE; break; } } if (!found_lun) { lun = malloc(sizeof(struct mpssas_lun), M_MPT2, M_NOWAIT | M_ZERO); if (lun == NULL) { mps_dprint(sc, MPS_ERROR, "Unable to alloc LUN for EEDP support.\n"); xpt_free_path(local_path); xpt_free_ccb(ccb); return; } lun->lun_id = lunid; SLIST_INSERT_HEAD(&target->luns, lun, lun_link); } xpt_path_string(local_path, path_str, sizeof(path_str)); mps_dprint(sc, MPS_INFO, "Sending read cap: path %s handle %d\n", path_str, target->handle); /* * Issue a READ CAPACITY 16 command for the LUN. * The mpssas_read_cap_done function will load * the read cap info into the LUN struct. 
*/ rcap_buf = malloc(sizeof(struct scsi_read_capacity_eedp), M_MPT2, M_NOWAIT | M_ZERO); if (rcap_buf == NULL) { mps_dprint(sc, MPS_FAULT, "Unable to alloc read capacity buffer for EEDP support.\n"); xpt_free_path(ccb->ccb_h.path); xpt_free_ccb(ccb); return; } xpt_setup_ccb(&ccb->ccb_h, local_path, CAM_PRIORITY_XPT); csio = &ccb->csio; csio->ccb_h.func_code = XPT_SCSI_IO; csio->ccb_h.flags = CAM_DIR_IN; csio->ccb_h.retry_count = 4; csio->ccb_h.cbfcnp = mpssas_read_cap_done; csio->ccb_h.timeout = 60000; csio->data_ptr = (uint8_t *)rcap_buf; csio->dxfer_len = sizeof(struct scsi_read_capacity_eedp); csio->sense_len = MPS_SENSE_LEN; csio->cdb_len = sizeof(*scsi_cmd); csio->tag_action = MSG_SIMPLE_Q_TAG; scsi_cmd = (struct scsi_read_capacity_16 *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = 0x9E; scsi_cmd->service_action = SRC16_SERVICE_ACTION; ((uint8_t *)scsi_cmd)[13] = sizeof(struct scsi_read_capacity_eedp); ccb->ccb_h.ppriv_ptr1 = sassc; xpt_action(ccb); } static void mpssas_read_cap_done(struct cam_periph *periph, union ccb *done_ccb) { struct mpssas_softc *sassc; struct mpssas_target *target; struct mpssas_lun *lun; struct scsi_read_capacity_eedp *rcap_buf; if (done_ccb == NULL) return; /* Driver need to release devq, it Scsi command is * generated by driver internally. * Currently there is a single place where driver * calls scsi command internally. In future if driver * calls more scsi command internally, it needs to release * devq internally, since those command will not go back to * cam_periph. */ if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) ) { done_ccb->ccb_h.status &= ~CAM_DEV_QFRZN; xpt_release_devq(done_ccb->ccb_h.path, /*count*/ 1, /*run_queue*/TRUE); } rcap_buf = (struct scsi_read_capacity_eedp *)done_ccb->csio.data_ptr; /* * Get the LUN ID for the path and look it up in the LUN list for the * target. 
*/ sassc = (struct mpssas_softc *)done_ccb->ccb_h.ppriv_ptr1; KASSERT(done_ccb->ccb_h.target_id < sassc->maxtargets, ("Target %d out of bounds in mpssas_read_cap_done\n", done_ccb->ccb_h.target_id)); target = &sassc->targets[done_ccb->ccb_h.target_id]; SLIST_FOREACH(lun, &target->luns, lun_link) { if (lun->lun_id != done_ccb->ccb_h.target_lun) continue; /* * Got the LUN in the target's LUN list. Fill it in * with EEDP info. If the READ CAP 16 command had some * SCSI error (common if command is not supported), mark * the lun as not supporting EEDP and set the block size * to 0. */ if ((mpssas_get_ccbstatus(done_ccb) != CAM_REQ_CMP) || (done_ccb->csio.scsi_status != SCSI_STATUS_OK)) { lun->eedp_formatted = FALSE; lun->eedp_block_size = 0; break; } if (rcap_buf->protect & 0x01) { mps_dprint(sassc->sc, MPS_INFO, "LUN %d for " "target ID %d is formatted for EEDP " "support.\n", done_ccb->ccb_h.target_lun, done_ccb->ccb_h.target_id); lun->eedp_formatted = TRUE; lun->eedp_block_size = scsi_4btoul(rcap_buf->length); } break; } // Finished with this CCB and path. free(rcap_buf, M_MPT2); xpt_free_path(done_ccb->ccb_h.path); xpt_free_ccb(done_ccb); } #endif /* (__FreeBSD_version < 901503) || \ ((__FreeBSD_version >= 1000000) && (__FreeBSD_version < 1000006)) */ void mpssas_prepare_for_tm(struct mps_softc *sc, struct mps_command *tm, struct mpssas_target *target, lun_id_t lun_id) { union ccb *ccb; path_id_t path_id; /* * Set the INRESET flag for this target so that no I/O will be sent to * the target until the reset has completed. If an I/O request does * happen, the devq will be frozen. The CCB holds the path which is * used to release the devq. The devq is released and the CCB is freed * when the TM completes. 
*/ ccb = xpt_alloc_ccb_nowait(); if (ccb) { path_id = cam_sim_path(sc->sassc->sim); if (xpt_create_path(&ccb->ccb_h.path, xpt_periph, path_id, target->tid, lun_id) != CAM_REQ_CMP) { xpt_free_ccb(ccb); } else { tm->cm_ccb = ccb; tm->cm_targ = target; target->flags |= MPSSAS_TARGET_INRESET; } } } int mpssas_startup(struct mps_softc *sc) { /* * Send the port enable message and set the wait_for_port_enable flag. * This flag helps to keep the simq frozen until all discovery events * are processed. */ sc->wait_for_port_enable = 1; mpssas_send_portenable(sc); return (0); } static int mpssas_send_portenable(struct mps_softc *sc) { MPI2_PORT_ENABLE_REQUEST *request; struct mps_command *cm; MPS_FUNCTRACE(sc); if ((cm = mps_alloc_command(sc)) == NULL) return (EBUSY); request = (MPI2_PORT_ENABLE_REQUEST *)cm->cm_req; request->Function = MPI2_FUNCTION_PORT_ENABLE; request->MsgFlags = 0; request->VP_ID = 0; cm->cm_desc.Default.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE; cm->cm_complete = mpssas_portenable_complete; cm->cm_data = NULL; cm->cm_sge = NULL; mps_map_command(sc, cm); mps_dprint(sc, MPS_XINFO, "mps_send_portenable finished cm %p req %p complete %p\n", cm, cm->cm_req, cm->cm_complete); return (0); } static void mpssas_portenable_complete(struct mps_softc *sc, struct mps_command *cm) { MPI2_PORT_ENABLE_REPLY *reply; struct mpssas_softc *sassc; MPS_FUNCTRACE(sc); sassc = sc->sassc; /* * Currently there should be no way we can hit this case. It only * happens when we have a failure to allocate chain frames, and * port enable commands don't have S/G lists. */ if ((cm->cm_flags & MPS_CM_FLAGS_ERROR_MASK) != 0) { mps_dprint(sc, MPS_ERROR, "%s: cm_flags = %#x for port enable! 
" "This should not happen!\n", __func__, cm->cm_flags); } reply = (MPI2_PORT_ENABLE_REPLY *)cm->cm_reply; if (reply == NULL) mps_dprint(sc, MPS_FAULT, "Portenable NULL reply\n"); else if (le16toh(reply->IOCStatus & MPI2_IOCSTATUS_MASK) != MPI2_IOCSTATUS_SUCCESS) mps_dprint(sc, MPS_FAULT, "Portenable failed\n"); mps_free_command(sc, cm); if (sc->mps_ich.ich_arg != NULL) { mps_dprint(sc, MPS_XINFO, "disestablish config intrhook\n"); config_intrhook_disestablish(&sc->mps_ich); sc->mps_ich.ich_arg = NULL; } /* * Get WarpDrive info after discovery is complete but before the scan * starts. At this point, all devices are ready to be exposed to the * OS. If devices should be hidden instead, take them out of the * 'targets' array before the scan. The devinfo for a disk will have * some info and a volume's will be 0. Use that to remove disks. */ mps_wd_config_pages(sc); /* * Done waiting for port enable to complete. Decrement the refcount. * If refcount is 0, discovery is complete and a rescan of the bus can * take place. Since the simq was explicitly frozen before port * enable, it must be explicitly released here to keep the * freeze/release count in sync. */ sc->wait_for_port_enable = 0; sc->port_enable_complete = 1; wakeup(&sc->port_enable_complete); mpssas_startup_decrement(sassc); } int mpssas_check_id(struct mpssas_softc *sassc, int id) { struct mps_softc *sc = sassc->sc; char *ids; char *name; ids = &sc->exclude_ids[0]; while((name = strsep(&ids, ",")) != NULL) { if (name[0] == '\0') continue; if (strtol(name, NULL, 0) == (long)id) return (1); } return (0); } void mpssas_realloc_targets(struct mps_softc *sc, int maxtargets) { struct mpssas_softc *sassc; struct mpssas_lun *lun, *lun_tmp; struct mpssas_target *targ; int i; sassc = sc->sassc; /* * The number of targets is based on IOC Facts, so free all of * the allocated LUNs for each target and then the target buffer * itself. 
*/ for (i=0; i< maxtargets; i++) { targ = &sassc->targets[i]; SLIST_FOREACH_SAFE(lun, &targ->luns, lun_link, lun_tmp) { free(lun, M_MPT2); } } free(sassc->targets, M_MPT2); sassc->targets = malloc(sizeof(struct mpssas_target) * maxtargets, M_MPT2, M_WAITOK|M_ZERO); if (!sassc->targets) { panic("%s failed to alloc targets with error %d\n", __func__, ENOMEM); } } Index: head/sys/geom/geom_dev.c =================================================================== --- head/sys/geom/geom_dev.c (revision 308154) +++ head/sys/geom/geom_dev.c (revision 308155) @@ -1,764 +1,765 @@ /*- * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct g_dev_softc { struct mtx sc_mtx; struct cdev *sc_dev; struct cdev *sc_alias; int sc_open; int sc_active; }; static d_open_t g_dev_open; static d_close_t g_dev_close; static d_strategy_t g_dev_strategy; static d_ioctl_t g_dev_ioctl; static struct cdevsw g_dev_cdevsw = { .d_version = D_VERSION, .d_open = g_dev_open, .d_close = g_dev_close, .d_read = physread, .d_write = physwrite, .d_ioctl = g_dev_ioctl, .d_strategy = g_dev_strategy, .d_name = "g_dev", .d_flags = D_DISK | D_TRACKCLOSE, }; static g_init_t g_dev_init; static g_fini_t g_dev_fini; static g_taste_t g_dev_taste; static g_orphan_t g_dev_orphan; static g_attrchanged_t g_dev_attrchanged; static struct g_class g_dev_class = { .name = "DEV", .version = G_VERSION, .init = g_dev_init, .fini = g_dev_fini, .taste = g_dev_taste, .orphan = g_dev_orphan, .attrchanged = g_dev_attrchanged }; /* * We target 262144 (8 x 32768) sectors by default as this significantly * increases the throughput on commonly used SSD's with a marginal * increase in non-interruptible request latency. 
*/ static uint64_t g_dev_del_max_sectors = 262144; SYSCTL_DECL(_kern_geom); SYSCTL_NODE(_kern_geom, OID_AUTO, dev, CTLFLAG_RW, 0, "GEOM_DEV stuff"); SYSCTL_QUAD(_kern_geom_dev, OID_AUTO, delete_max_sectors, CTLFLAG_RW, &g_dev_del_max_sectors, 0, "Maximum number of sectors in a single " "delete request sent to the provider. Larger requests are chunked " "so they can be interrupted. (0 = disable chunking)"); static char *dumpdev = NULL; static void g_dev_init(struct g_class *mp) { dumpdev = kern_getenv("dumpdev"); } static void g_dev_fini(struct g_class *mp) { freeenv(dumpdev); dumpdev = NULL; } static int g_dev_setdumpdev(struct cdev *dev, struct thread *td) { struct g_kerneldump kd; struct g_consumer *cp; int error, len; if (dev == NULL) return (set_dumper(NULL, NULL, td)); cp = dev->si_drv2; len = sizeof(kd); kd.offset = 0; kd.length = OFF_MAX; error = g_io_getattr("GEOM::kerneldump", cp, &len, &kd); if (error == 0) { error = set_dumper(&kd.di, devtoname(dev), td); if (error == 0) dev->si_flags |= SI_DUMPDEV; } return (error); } static int init_dumpdev(struct cdev *dev) { struct g_consumer *cp; const char *devprefix = "/dev/", *devname; int error; size_t len; if (dumpdev == NULL) return (0); len = strlen(devprefix); devname = devtoname(dev); if (strcmp(devname, dumpdev) != 0 && (strncmp(dumpdev, devprefix, len) != 0 || strcmp(devname, dumpdev + len) != 0)) return (0); cp = (struct g_consumer *)dev->si_drv2; error = g_access(cp, 1, 0, 0); if (error != 0) return (error); error = g_dev_setdumpdev(dev, curthread); if (error == 0) { freeenv(dumpdev); dumpdev = NULL; } (void)g_access(cp, -1, 0, 0); return (error); } static void g_dev_destroy(void *arg, int flags __unused) { struct g_consumer *cp; struct g_geom *gp; struct g_dev_softc *sc; char buf[SPECNAMELEN + 6]; g_topology_assert(); cp = arg; gp = cp->geom; sc = cp->private; g_trace(G_T_TOPOLOGY, "g_dev_destroy(%p(%s))", cp, gp->name); snprintf(buf, sizeof(buf), "cdev=%s", gp->name); devctl_notify_f("GEOM", "DEV", 
"DESTROY", buf, M_WAITOK); if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) g_access(cp, -cp->acr, -cp->acw, -cp->ace); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); mtx_destroy(&sc->sc_mtx); g_free(sc); } void g_dev_print(void) { struct g_geom *gp; char const *p = ""; LIST_FOREACH(gp, &g_dev_class.geom, geom) { printf("%s%s", p, gp->name); p = " "; } printf("\n"); } static void g_dev_set_physpath(struct g_consumer *cp) { struct g_dev_softc *sc; char *physpath; int error, physpath_len; if (g_access(cp, 1, 0, 0) != 0) return; sc = cp->private; physpath_len = MAXPATHLEN; physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO); error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath); g_access(cp, -1, 0, 0); if (error == 0 && strlen(physpath) != 0) { struct cdev *dev, *old_alias_dev; struct cdev **alias_devp; dev = sc->sc_dev; old_alias_dev = sc->sc_alias; alias_devp = (struct cdev **)&sc->sc_alias; make_dev_physpath_alias(MAKEDEV_WAITOK, alias_devp, dev, old_alias_dev, physpath); } else if (sc->sc_alias) { destroy_dev((struct cdev *)sc->sc_alias); sc->sc_alias = NULL; } g_free(physpath); } static void g_dev_set_media(struct g_consumer *cp) { struct g_dev_softc *sc; struct cdev *dev; char buf[SPECNAMELEN + 6]; sc = cp->private; dev = sc->sc_dev; snprintf(buf, sizeof(buf), "cdev=%s", dev->si_name); devctl_notify_f("DEVFS", "CDEV", "MEDIACHANGE", buf, M_WAITOK); devctl_notify_f("GEOM", "DEV", "MEDIACHANGE", buf, M_WAITOK); dev = sc->sc_alias; if (dev != NULL) { snprintf(buf, sizeof(buf), "cdev=%s", dev->si_name); devctl_notify_f("DEVFS", "CDEV", "MEDIACHANGE", buf, M_WAITOK); devctl_notify_f("GEOM", "DEV", "MEDIACHANGE", buf, M_WAITOK); } } static void g_dev_attrchanged(struct g_consumer *cp, const char *attr) { if (strcmp(attr, "GEOM::media") == 0) { g_dev_set_media(cp); return; } if (strcmp(attr, "GEOM::physpath") == 0) { g_dev_set_physpath(cp); return; } } struct g_provider * g_dev_getprovider(struct cdev *dev) { struct g_consumer *cp; 
g_topology_assert(); if (dev == NULL) return (NULL); if (dev->si_devsw != &g_dev_cdevsw) return (NULL); cp = dev->si_drv2; return (cp->provider); } static struct g_geom * g_dev_taste(struct g_class *mp, struct g_provider *pp, int insist __unused) { struct g_geom *gp; struct g_consumer *cp; struct g_dev_softc *sc; int error; struct cdev *dev; char buf[SPECNAMELEN + 6]; g_trace(G_T_TOPOLOGY, "dev_taste(%s,%s)", mp->name, pp->name); g_topology_assert(); gp = g_new_geomf(mp, "%s", pp->name); sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO); mtx_init(&sc->sc_mtx, "g_dev", NULL, MTX_DEF); cp = g_new_consumer(gp); cp->private = sc; cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; error = g_attach(cp, pp); KASSERT(error == 0, ("g_dev_taste(%s) failed to g_attach, err=%d", pp->name, error)); error = make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK, &dev, &g_dev_cdevsw, NULL, UID_ROOT, GID_OPERATOR, 0640, "%s", gp->name); if (error != 0) { printf("%s: make_dev_p() failed (gp->name=%s, error=%d)\n", __func__, gp->name, error); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); mtx_destroy(&sc->sc_mtx); g_free(sc); return (NULL); } dev->si_flags |= SI_UNMAPPED; sc->sc_dev = dev; dev->si_iosize_max = MAXPHYS; dev->si_drv2 = cp; error = init_dumpdev(dev); if (error != 0) printf("%s: init_dumpdev() failed (gp->name=%s, error=%d)\n", __func__, gp->name, error); g_dev_attrchanged(cp, "GEOM::physpath"); snprintf(buf, sizeof(buf), "cdev=%s", gp->name); devctl_notify_f("GEOM", "DEV", "CREATE", buf, M_WAITOK); return (gp); } static int g_dev_open(struct cdev *dev, int flags, int fmt, struct thread *td) { struct g_consumer *cp; struct g_dev_softc *sc; int error, r, w, e; cp = dev->si_drv2; if (cp == NULL) return (ENXIO); /* g_dev_taste() not done yet */ g_trace(G_T_ACCESS, "g_dev_open(%s, %d, %d, %p)", cp->geom->name, flags, fmt, td); r = flags & FREAD ? 1 : 0; w = flags & FWRITE ? 1 : 0; #ifdef notyet e = flags & O_EXCL ? 
1 : 0; #else e = 0; #endif /* * This happens on attempt to open a device node with O_EXEC. */ if (r + w + e == 0) return (EINVAL); if (w) { /* * When running in very secure mode, do not allow * opens for writing of any disks. */ error = securelevel_ge(td->td_ucred, 2); if (error) return (error); } g_topology_lock(); error = g_access(cp, r, w, e); g_topology_unlock(); if (error == 0) { sc = cp->private; mtx_lock(&sc->sc_mtx); if (sc->sc_open == 0 && sc->sc_active != 0) wakeup(&sc->sc_active); sc->sc_open += r + w + e; mtx_unlock(&sc->sc_mtx); } return (error); } static int g_dev_close(struct cdev *dev, int flags, int fmt, struct thread *td) { struct g_consumer *cp; struct g_dev_softc *sc; int error, r, w, e; cp = dev->si_drv2; if (cp == NULL) return (ENXIO); g_trace(G_T_ACCESS, "g_dev_close(%s, %d, %d, %p)", cp->geom->name, flags, fmt, td); r = flags & FREAD ? -1 : 0; w = flags & FWRITE ? -1 : 0; #ifdef notyet e = flags & O_EXCL ? -1 : 0; #else e = 0; #endif /* * The vgonel(9) - caused by eg. forced unmount of devfs - calls * VOP_CLOSE(9) on devfs vnode without any FREAD or FWRITE flags, * which would result in zero deltas, which in turn would cause * panic in g_access(9). * * Note that we cannot zero the counters (ie. do "r = cp->acr" * etc) instead, because the consumer might be opened in another * devfs instance. */ if (r + w + e == 0) return (EINVAL); sc = cp->private; mtx_lock(&sc->sc_mtx); sc->sc_open += r + w + e; while (sc->sc_open == 0 && sc->sc_active != 0) msleep(&sc->sc_active, &sc->sc_mtx, 0, "PRIBIO", 0); mtx_unlock(&sc->sc_mtx); g_topology_lock(); error = g_access(cp, r, w, e); g_topology_unlock(); return (error); } /* * XXX: Until we have unmessed the ioctl situation, there is a race against * XXX: a concurrent orphanization. We cannot close it by holding topology * XXX: since that would prevent us from doing our job, and stalling events * XXX: will break (actually: stall) the BSD disklabel hacks. 
*/
/*
 * ioctl entry point for GEOM device nodes.
 *
 * The consumer attached to the cdev is taken from dev->si_drv2.  The disk
 * ioctls listed in the switch below are answered here, either straight
 * from the provider's fields or by issuing a GEOM request (getattr, flush,
 * delete, zone command) through the consumer.  Any other command is passed
 * to the provider geom's own ioctl method when it has one; otherwise
 * ENOIOCTL is returned.
 *
 * Returns 0 on success or an errno value.
 */
static int
g_dev_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td)
{
	struct g_consumer *cp;
	struct g_provider *pp;
	off_t offset, length, chunk;
	int i, error;

	cp = dev->si_drv2;
	pp = cp->provider;

	error = 0;
	KASSERT(cp->acr || cp->acw,
	    ("Consumer with zero access count in g_dev_ioctl"));

	/*
	 * Parameter length taken from the command word; used below as the
	 * buffer size for attribute queries and the provider name copy.
	 */
	i = IOCPARM_LEN(cmd);
	switch (cmd) {
	case DIOCGSECTORSIZE:
		/* A zero sector size is reported as ENOENT. */
		*(u_int *)data = cp->provider->sectorsize;
		if (*(u_int *)data == 0)
			error = ENOENT;
		break;
	case DIOCGMEDIASIZE:
		/* A zero media size is reported as ENOENT. */
		*(off_t *)data = cp->provider->mediasize;
		if (*(off_t *)data == 0)
			error = ENOENT;
		break;
	case DIOCGFWSECTORS:
		error = g_io_getattr("GEOM::fwsectors", cp, &i, data);
		if (error == 0 && *(u_int *)data == 0)
			error = ENOENT;
		break;
	case DIOCGFWHEADS:
		error = g_io_getattr("GEOM::fwheads", cp, &i, data);
		if (error == 0 && *(u_int *)data == 0)
			error = ENOENT;
		break;
	case DIOCGFRONTSTUFF:
		error = g_io_getattr("GEOM::frontstuff", cp, &i, data);
		break;
	case DIOCSKERNELDUMP:
		/* Zero clears the dump device; non-zero selects this one. */
		if (*(u_int *)data == 0)
			error = g_dev_setdumpdev(NULL, td);
		else
			error = g_dev_setdumpdev(dev, td);
		break;
	case DIOCGFLUSH:
		error = g_io_flush(cp);
		break;
	case DIOCGDELETE:
		/* data points at two off_t values: offset, then length. */
		offset = ((off_t *)data)[0];
		length = ((off_t *)data)[1];
		/* Both must be sector aligned and the length positive. */
		if ((offset % cp->provider->sectorsize) != 0 ||
		    (length % cp->provider->sectorsize) != 0 || length <= 0) {
			printf("%s: offset=%jd length=%jd\n", __func__, offset,
			    length);
			error = EINVAL;
			break;
		}
		while (length > 0) {
			chunk = length;
			/*
			 * Cap each request at g_dev_del_max_sectors sectors
			 * unless chunking is disabled (limit of 0).
			 */
			if (g_dev_del_max_sectors != 0 && chunk >
			    g_dev_del_max_sectors * cp->provider->sectorsize) {
				chunk = g_dev_del_max_sectors *
				    cp->provider->sectorsize;
			}
			error = g_delete_data(cp, offset, chunk);
			length -= chunk;
			offset += chunk;
			if (error)
				break;
			/*
			 * Since the request size can be large, the service
			 * time can likewise be large.  We make this ioctl
			 * interruptible by checking for signals for each bio.
			 */
			if (SIGPENDING(td))
				break;
		}
		break;
	case DIOCGIDENT:
		error = g_io_getattr("GEOM::ident", cp, &i, data);
		break;
	case DIOCGPROVIDERNAME:
		/*
		 * NOTE(review): this is the only case that checks the
		 * provider pointer for NULL; the others dereference
		 * cp->provider directly -- confirm whether it can be NULL
		 * here.
		 */
		if (pp == NULL)
			return (ENOENT);
		strlcpy(data, pp->name, i);
		break;
	case DIOCGSTRIPESIZE:
		*(off_t *)data = cp->provider->stripesize;
		break;
	case DIOCGSTRIPEOFFSET:
		*(off_t *)data = cp->provider->stripeoffset;
		break;
	case DIOCGPHYSPATH:
		/* An empty physical path is reported as ENOENT. */
		error = g_io_getattr("GEOM::physpath", cp, &i, data);
		if (error == 0 && *(char *)data == '\0')
			error = ENOENT;
		break;
	case DIOCGATTR: {
		struct diocgattr_arg *arg = (struct diocgattr_arg *)data;

		/* Reject a length larger than the value buffer. */
		if (arg->len > sizeof(arg->value)) {
			error = EINVAL;
			break;
		}
		error = g_io_getattr(arg->name, cp, &arg->len, &arg->value);
		break;
	}
	case DIOCZONECMD: {
		struct disk_zone_args *zone_args =(struct disk_zone_args *)data;
		struct disk_zone_rep_entry *new_entries, *old_entries;
		struct disk_zone_report *rep;
		size_t alloc_size;

		old_entries = NULL;
		new_entries = NULL;
		rep = NULL;
		alloc_size = 0;

		/*
		 * For a zone report, substitute a kernel buffer for the
		 * caller's entries pointer while the command runs; the
		 * results are copied out and the pointer restored below.
		 */
		if (zone_args->zone_cmd == DISK_ZONE_REPORT_ZONES) {

			rep = &zone_args->zone_params.report;
			alloc_size = rep->entries_allocated *
			    sizeof(struct disk_zone_rep_entry);
			if (alloc_size != 0)
				new_entries = g_malloc(alloc_size,
				    M_WAITOK| M_ZERO);
			old_entries = rep->entries;
			rep->entries = new_entries;
		}
		error = g_io_zonecmd(zone_args, cp);
		if ((zone_args->zone_cmd == DISK_ZONE_REPORT_ZONES)
		 && (alloc_size != 0)
		 && (error == 0)) {
			error = copyout(new_entries, old_entries, alloc_size);
		}
		/* Put the caller's original entries pointer back. */
		if ((old_entries != NULL)
		 && (rep != NULL))
			rep->entries = old_entries;

		if (new_entries != NULL)
			g_free(new_entries);
		break;
	}
	default:
		/* Not handled here: defer to the provider geom, if it can. */
		if (cp->provider->geom->ioctl != NULL) {
			error = cp->provider->geom->ioctl(cp->provider, cmd, data, fflag, td);
		} else {
			error = ENOIOCTL;
		}
	}

	return (error);
}

static void
g_dev_done(struct bio *bp2)
{
	struct g_consumer *cp;
	struct g_dev_softc *sc;
	struct bio *bp;
	int destroy;

	cp = bp2->bio_from;
	sc = cp->private;
	bp = bp2->bio_parent;
	bp->bio_error = bp2->bio_error;
	bp->bio_completed = bp2->bio_completed;
bp->bio_resid = bp->bio_length - bp2->bio_completed; if (bp2->bio_cmd == BIO_ZONE) bcopy(&bp2->bio_zone, &bp->bio_zone, sizeof(bp->bio_zone)); if (bp2->bio_error != 0) { g_trace(G_T_BIO, "g_dev_done(%p) had error %d", bp2, bp2->bio_error); bp->bio_flags |= BIO_ERROR; } else { g_trace(G_T_BIO, "g_dev_done(%p/%p) resid %ld completed %jd", bp2, bp, bp2->bio_resid, (intmax_t)bp2->bio_completed); } g_destroy_bio(bp2); destroy = 0; mtx_lock(&sc->sc_mtx); if ((--sc->sc_active) == 0) { if (sc->sc_open == 0) wakeup(&sc->sc_active); if (sc->sc_dev == NULL) destroy = 1; } mtx_unlock(&sc->sc_mtx); if (destroy) g_post_event(g_dev_destroy, cp, M_NOWAIT, NULL); biodone(bp); } static void g_dev_strategy(struct bio *bp) { struct g_consumer *cp; struct bio *bp2; struct cdev *dev; struct g_dev_softc *sc; KASSERT(bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE || bp->bio_cmd == BIO_FLUSH || bp->bio_cmd == BIO_ZONE, ("Wrong bio_cmd bio=%p cmd=%d", bp, bp->bio_cmd)); dev = bp->bio_dev; cp = dev->si_drv2; sc = cp->private; KASSERT(cp->acr || cp->acw, ("Consumer with zero access count in g_dev_strategy")); + biotrack(bp, __func__); #ifdef INVARIANTS if ((bp->bio_offset % cp->provider->sectorsize) != 0 || (bp->bio_bcount % cp->provider->sectorsize) != 0) { bp->bio_resid = bp->bio_bcount; biofinish(bp, NULL, EINVAL); return; } #endif mtx_lock(&sc->sc_mtx); KASSERT(sc->sc_open > 0, ("Closed device in g_dev_strategy")); sc->sc_active++; mtx_unlock(&sc->sc_mtx); for (;;) { /* * XXX: This is not an ideal solution, but I believe it to * XXX: deadlock safely, all things considered. 
*/ bp2 = g_clone_bio(bp); if (bp2 != NULL) break; pause("gdstrat", hz / 10); } KASSERT(bp2 != NULL, ("XXX: ENOMEM in a bad place")); bp2->bio_done = g_dev_done; g_trace(G_T_BIO, "g_dev_strategy(%p/%p) offset %jd length %jd data %p cmd %d", bp, bp2, (intmax_t)bp->bio_offset, (intmax_t)bp2->bio_length, bp2->bio_data, bp2->bio_cmd); g_io_request(bp2, cp); KASSERT(cp->acr || cp->acw, ("g_dev_strategy raced with g_dev_close and lost")); } /* * g_dev_callback() * * Called by devfs when asynchronous device destruction is completed. * - Mark that we have no attached device any more. * - If there are no outstanding requests, schedule geom destruction. * Otherwise destruction will be scheduled later by g_dev_done(). */ static void g_dev_callback(void *arg) { struct g_consumer *cp; struct g_dev_softc *sc; int destroy; cp = arg; sc = cp->private; g_trace(G_T_TOPOLOGY, "g_dev_callback(%p(%s))", cp, cp->geom->name); mtx_lock(&sc->sc_mtx); sc->sc_dev = NULL; sc->sc_alias = NULL; destroy = (sc->sc_active == 0); mtx_unlock(&sc->sc_mtx); if (destroy) g_post_event(g_dev_destroy, cp, M_WAITOK, NULL); } /* * g_dev_orphan() * * Called from below when the provider orphaned us. * - Clear any dump settings. * - Request asynchronous device destruction to prevent any more requests * from coming in. The provider is already marked with an error, so * anything which comes in the interim will be returned immediately. 
*/ static void g_dev_orphan(struct g_consumer *cp) { struct cdev *dev; struct g_dev_softc *sc; g_topology_assert(); sc = cp->private; dev = sc->sc_dev; g_trace(G_T_TOPOLOGY, "g_dev_orphan(%p(%s))", cp, cp->geom->name); /* Reset any dump-area set on this device */ if (dev->si_flags & SI_DUMPDEV) (void)set_dumper(NULL, NULL, curthread); /* Destroy the struct cdev *so we get no more requests */ destroy_dev_sched_cb(dev, g_dev_callback, cp); } DECLARE_GEOM_CLASS(g_dev_class, g_dev); Index: head/sys/geom/geom_disk.c =================================================================== --- head/sys/geom/geom_disk.c (revision 308154) +++ head/sys/geom/geom_disk.c (revision 308155) @@ -1,1010 +1,1012 @@ /*- * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_geom.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct g_disk_softc { struct mtx done_mtx; struct disk *dp; struct sysctl_ctx_list sysctl_ctx; struct sysctl_oid *sysctl_tree; char led[64]; uint32_t state; struct mtx start_mtx; }; static g_access_t g_disk_access; static g_start_t g_disk_start; static g_ioctl_t g_disk_ioctl; static g_dumpconf_t g_disk_dumpconf; static g_provgone_t g_disk_providergone; static struct g_class g_disk_class = { .name = G_DISK_CLASS_NAME, .version = G_VERSION, .start = g_disk_start, .access = g_disk_access, .ioctl = g_disk_ioctl, .providergone = g_disk_providergone, .dumpconf = g_disk_dumpconf, }; SYSCTL_DECL(_kern_geom); static SYSCTL_NODE(_kern_geom, OID_AUTO, disk, CTLFLAG_RW, 0, "GEOM_DISK stuff"); DECLARE_GEOM_CLASS(g_disk_class, g_disk); static int g_disk_access(struct g_provider *pp, int r, int w, int e) { struct disk *dp; struct g_disk_softc *sc; int error; g_trace(G_T_ACCESS, "g_disk_access(%s, %d, %d, %d)", pp->name, r, w, e); g_topology_assert(); sc = pp->private; if (sc == NULL || (dp = sc->dp) == NULL || dp->d_destroyed) { /* * Allow decreasing access count even if disk is not * available anymore. 
*/ if (r <= 0 && w <= 0 && e <= 0) return (0); return (ENXIO); } r += pp->acr; w += pp->acw; e += pp->ace; error = 0; if ((pp->acr + pp->acw + pp->ace) == 0 && (r + w + e) > 0) { if (dp->d_open != NULL) { error = dp->d_open(dp); if (bootverbose && error != 0) printf("Opened disk %s -> %d\n", pp->name, error); if (error != 0) return (error); } pp->sectorsize = dp->d_sectorsize; if (dp->d_maxsize == 0) { printf("WARNING: Disk drive %s%d has no d_maxsize\n", dp->d_name, dp->d_unit); dp->d_maxsize = DFLTPHYS; } if (dp->d_delmaxsize == 0) { if (bootverbose && dp->d_flags & DISKFLAG_CANDELETE) { printf("WARNING: Disk drive %s%d has no " "d_delmaxsize\n", dp->d_name, dp->d_unit); } dp->d_delmaxsize = dp->d_maxsize; } pp->stripeoffset = dp->d_stripeoffset; pp->stripesize = dp->d_stripesize; dp->d_flags |= DISKFLAG_OPEN; /* * Do not invoke resize event when initial size was zero. * Some disks report its size only after first opening. */ if (pp->mediasize == 0) pp->mediasize = dp->d_mediasize; else g_resize_provider(pp, dp->d_mediasize); } else if ((pp->acr + pp->acw + pp->ace) > 0 && (r + w + e) == 0) { if (dp->d_close != NULL) { error = dp->d_close(dp); if (error != 0) printf("Closed disk %s -> %d\n", pp->name, error); } sc->state = G_STATE_ACTIVE; if (sc->led[0] != 0) led_set(sc->led, "0"); dp->d_flags &= ~DISKFLAG_OPEN; } return (error); } static void g_disk_kerneldump(struct bio *bp, struct disk *dp) { struct g_kerneldump *gkd; struct g_geom *gp; gkd = (struct g_kerneldump*)bp->bio_data; gp = bp->bio_to->geom; g_trace(G_T_TOPOLOGY, "g_disk_kerneldump(%s, %jd, %jd)", gp->name, (intmax_t)gkd->offset, (intmax_t)gkd->length); if (dp->d_dump == NULL) { g_io_deliver(bp, ENODEV); return; } gkd->di.dumper = dp->d_dump; gkd->di.priv = dp; gkd->di.blocksize = dp->d_sectorsize; gkd->di.maxiosize = dp->d_maxsize; gkd->di.mediaoffset = gkd->offset; if ((gkd->offset + gkd->length) > dp->d_mediasize) gkd->length = dp->d_mediasize - gkd->offset; gkd->di.mediasize = gkd->length; 
g_io_deliver(bp, 0); } static void g_disk_setstate(struct bio *bp, struct g_disk_softc *sc) { const char *cmd; memcpy(&sc->state, bp->bio_data, sizeof(sc->state)); if (sc->led[0] != 0) { switch (sc->state) { case G_STATE_FAILED: cmd = "1"; break; case G_STATE_REBUILD: cmd = "f5"; break; case G_STATE_RESYNC: cmd = "f1"; break; default: cmd = "0"; break; } led_set(sc->led, cmd); } g_io_deliver(bp, 0); } static void g_disk_done(struct bio *bp) { struct bintime now; struct bio *bp2; struct g_disk_softc *sc; /* See "notes" for why we need a mutex here */ /* XXX: will witness accept a mix of Giant/unGiant drivers here ? */ bp2 = bp->bio_parent; sc = bp2->bio_to->private; bp->bio_completed = bp->bio_length - bp->bio_resid; binuptime(&now); mtx_lock(&sc->done_mtx); if (bp2->bio_error == 0) bp2->bio_error = bp->bio_error; bp2->bio_completed += bp->bio_completed; switch (bp->bio_cmd) { case BIO_ZONE: bcopy(&bp->bio_zone, &bp2->bio_zone, sizeof(bp->bio_zone)); /*FALLTHROUGH*/ case BIO_READ: case BIO_WRITE: case BIO_DELETE: case BIO_FLUSH: devstat_end_transaction_bio_bt(sc->dp->d_devstat, bp, &now); break; default: break; } bp2->bio_inbed++; if (bp2->bio_children == bp2->bio_inbed) { mtx_unlock(&sc->done_mtx); bp2->bio_resid = bp2->bio_bcount - bp2->bio_completed; g_io_deliver(bp2, bp2->bio_error); } else mtx_unlock(&sc->done_mtx); g_destroy_bio(bp); } static int g_disk_ioctl(struct g_provider *pp, u_long cmd, void * data, int fflag, struct thread *td) { struct disk *dp; struct g_disk_softc *sc; int error; sc = pp->private; dp = sc->dp; if (dp->d_ioctl == NULL) return (ENOIOCTL); error = dp->d_ioctl(dp, cmd, data, fflag, td); return (error); } static off_t g_disk_maxsize(struct disk *dp, struct bio *bp) { if (bp->bio_cmd == BIO_DELETE) return (dp->d_delmaxsize); return (dp->d_maxsize); } static int g_disk_maxsegs(struct disk *dp, struct bio *bp) { return ((g_disk_maxsize(dp, bp) / PAGE_SIZE) + 1); } static void g_disk_advance(struct disk *dp, struct bio *bp, off_t off) { 
bp->bio_offset += off; bp->bio_length -= off; if ((bp->bio_flags & BIO_VLIST) != 0) { bus_dma_segment_t *seg, *end; seg = (bus_dma_segment_t *)bp->bio_data; end = (bus_dma_segment_t *)bp->bio_data + bp->bio_ma_n; off += bp->bio_ma_offset; while (off >= seg->ds_len) { KASSERT((seg != end), ("vlist request runs off the end")); off -= seg->ds_len; seg++; } bp->bio_ma_offset = off; bp->bio_ma_n = end - seg; bp->bio_data = (void *)seg; } else if ((bp->bio_flags & BIO_UNMAPPED) != 0) { bp->bio_ma += off / PAGE_SIZE; bp->bio_ma_offset += off; bp->bio_ma_offset %= PAGE_SIZE; bp->bio_ma_n -= off / PAGE_SIZE; } else { bp->bio_data += off; } } static void g_disk_seg_limit(bus_dma_segment_t *seg, off_t *poffset, off_t *plength, int *ppages) { uintptr_t seg_page_base; uintptr_t seg_page_end; off_t offset; off_t length; int seg_pages; offset = *poffset; length = *plength; if (length > seg->ds_len - offset) length = seg->ds_len - offset; seg_page_base = trunc_page(seg->ds_addr + offset); seg_page_end = round_page(seg->ds_addr + offset + length); seg_pages = (seg_page_end - seg_page_base) >> PAGE_SHIFT; if (seg_pages > *ppages) { seg_pages = *ppages; length = (seg_page_base + (seg_pages << PAGE_SHIFT)) - (seg->ds_addr + offset); } *poffset = 0; *plength -= length; *ppages -= seg_pages; } static off_t g_disk_vlist_limit(struct disk *dp, struct bio *bp, bus_dma_segment_t **pendseg) { bus_dma_segment_t *seg, *end; off_t residual; off_t offset; int pages; seg = (bus_dma_segment_t *)bp->bio_data; end = (bus_dma_segment_t *)bp->bio_data + bp->bio_ma_n; residual = bp->bio_length; offset = bp->bio_ma_offset; pages = g_disk_maxsegs(dp, bp); while (residual != 0 && pages != 0) { KASSERT((seg != end), ("vlist limit runs off the end")); g_disk_seg_limit(seg, &offset, &residual, &pages); seg++; } if (pendseg != NULL) *pendseg = seg; return (residual); } static bool g_disk_limit(struct disk *dp, struct bio *bp) { bool limited = false; off_t maxsz; maxsz = g_disk_maxsize(dp, bp); /* * XXX: If we 
have a stripesize we should really use it here. * Care should be taken in the delete case if this is done * as deletes can be very sensitive to size given how they * are processed. */ if (bp->bio_length > maxsz) { bp->bio_length = maxsz; limited = true; } if ((bp->bio_flags & BIO_VLIST) != 0) { bus_dma_segment_t *firstseg, *endseg; off_t residual; firstseg = (bus_dma_segment_t*)bp->bio_data; residual = g_disk_vlist_limit(dp, bp, &endseg); if (residual != 0) { bp->bio_ma_n = endseg - firstseg; bp->bio_length -= residual; limited = true; } } else if ((bp->bio_flags & BIO_UNMAPPED) != 0) { bp->bio_ma_n = howmany(bp->bio_ma_offset + bp->bio_length, PAGE_SIZE); } return (limited); } static void g_disk_start(struct bio *bp) { struct bio *bp2, *bp3; struct disk *dp; struct g_disk_softc *sc; int error; off_t off; + biotrack(bp, __func__); + sc = bp->bio_to->private; if (sc == NULL || (dp = sc->dp) == NULL || dp->d_destroyed) { g_io_deliver(bp, ENXIO); return; } error = EJUSTRETURN; switch(bp->bio_cmd) { case BIO_DELETE: if (!(dp->d_flags & DISKFLAG_CANDELETE)) { error = EOPNOTSUPP; break; } /* fall-through */ case BIO_READ: case BIO_WRITE: KASSERT((dp->d_flags & DISKFLAG_UNMAPPED_BIO) != 0 || (bp->bio_flags & BIO_UNMAPPED) == 0, ("unmapped bio not supported by disk %s", dp->d_name)); off = 0; bp3 = NULL; bp2 = g_clone_bio(bp); if (bp2 == NULL) { error = ENOMEM; break; } for (;;) { if (g_disk_limit(dp, bp2)) { off += bp2->bio_length; /* * To avoid a race, we need to grab the next bio * before we schedule this one. See "notes". 
*/ bp3 = g_clone_bio(bp); if (bp3 == NULL) bp->bio_error = ENOMEM; } bp2->bio_done = g_disk_done; bp2->bio_pblkno = bp2->bio_offset / dp->d_sectorsize; bp2->bio_bcount = bp2->bio_length; bp2->bio_disk = dp; mtx_lock(&sc->start_mtx); devstat_start_transaction_bio(dp->d_devstat, bp2); mtx_unlock(&sc->start_mtx); dp->d_strategy(bp2); if (bp3 == NULL) break; bp2 = bp3; bp3 = NULL; g_disk_advance(dp, bp2, off); } break; case BIO_GETATTR: /* Give the driver a chance to override */ if (dp->d_getattr != NULL) { if (bp->bio_disk == NULL) bp->bio_disk = dp; error = dp->d_getattr(bp); if (error != -1) break; error = EJUSTRETURN; } if (g_handleattr_int(bp, "GEOM::candelete", (dp->d_flags & DISKFLAG_CANDELETE) != 0)) break; else if (g_handleattr_int(bp, "GEOM::fwsectors", dp->d_fwsectors)) break; else if (g_handleattr_int(bp, "GEOM::fwheads", dp->d_fwheads)) break; else if (g_handleattr_off_t(bp, "GEOM::frontstuff", 0)) break; else if (g_handleattr_str(bp, "GEOM::ident", dp->d_ident)) break; else if (g_handleattr_uint16_t(bp, "GEOM::hba_vendor", dp->d_hba_vendor)) break; else if (g_handleattr_uint16_t(bp, "GEOM::hba_device", dp->d_hba_device)) break; else if (g_handleattr_uint16_t(bp, "GEOM::hba_subvendor", dp->d_hba_subvendor)) break; else if (g_handleattr_uint16_t(bp, "GEOM::hba_subdevice", dp->d_hba_subdevice)) break; else if (!strcmp(bp->bio_attribute, "GEOM::kerneldump")) g_disk_kerneldump(bp, dp); else if (!strcmp(bp->bio_attribute, "GEOM::setstate")) g_disk_setstate(bp, sc); else if (g_handleattr_uint16_t(bp, "GEOM::rotation_rate", dp->d_rotation_rate)) break; else error = ENOIOCTL; break; case BIO_FLUSH: g_trace(G_T_BIO, "g_disk_flushcache(%s)", bp->bio_to->name); if (!(dp->d_flags & DISKFLAG_CANFLUSHCACHE)) { error = EOPNOTSUPP; break; } /*FALLTHROUGH*/ case BIO_ZONE: if (bp->bio_cmd == BIO_ZONE) { if (!(dp->d_flags & DISKFLAG_CANZONE)) { error = EOPNOTSUPP; break; } g_trace(G_T_BIO, "g_disk_zone(%s)", bp->bio_to->name); } bp2 = g_clone_bio(bp); if (bp2 == NULL) { 
g_io_deliver(bp, ENOMEM); return; } bp2->bio_done = g_disk_done; bp2->bio_disk = dp; mtx_lock(&sc->start_mtx); devstat_start_transaction_bio(dp->d_devstat, bp2); mtx_unlock(&sc->start_mtx); dp->d_strategy(bp2); break; default: error = EOPNOTSUPP; break; } if (error != EJUSTRETURN) g_io_deliver(bp, error); return; } static void g_disk_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct bio *bp; struct disk *dp; struct g_disk_softc *sc; char *buf; int res = 0; sc = gp->softc; if (sc == NULL || (dp = sc->dp) == NULL) return; if (indent == NULL) { sbuf_printf(sb, " hd %u", dp->d_fwheads); sbuf_printf(sb, " sc %u", dp->d_fwsectors); return; } if (pp != NULL) { sbuf_printf(sb, "%s%u\n", indent, dp->d_fwheads); sbuf_printf(sb, "%s%u\n", indent, dp->d_fwsectors); /* * "rotationrate" is a little complicated, because the value * returned by the drive might not be the RPM; 0 and 1 are * special cases, and there's also a valid range. */ sbuf_printf(sb, "%s", indent); if (dp->d_rotation_rate == 0) /* Old drives don't */ sbuf_printf(sb, "unknown"); /* report RPM. */ else if (dp->d_rotation_rate == 1) /* Since 0 is used */ sbuf_printf(sb, "0"); /* above, SSDs use 1. */ else if ((dp->d_rotation_rate >= 0x041) && (dp->d_rotation_rate <= 0xfffe)) sbuf_printf(sb, "%u", dp->d_rotation_rate); else sbuf_printf(sb, "invalid"); sbuf_printf(sb, "\n"); if (dp->d_getattr != NULL) { buf = g_malloc(DISK_IDENT_SIZE, M_WAITOK); bp = g_alloc_bio(); bp->bio_disk = dp; bp->bio_attribute = "GEOM::ident"; bp->bio_length = DISK_IDENT_SIZE; bp->bio_data = buf; res = dp->d_getattr(bp); sbuf_printf(sb, "%s", indent); g_conf_printf_escaped(sb, "%s", res == 0 ? 
buf: dp->d_ident); sbuf_printf(sb, "\n"); bp->bio_attribute = "GEOM::lunid"; bp->bio_length = DISK_IDENT_SIZE; bp->bio_data = buf; if (dp->d_getattr(bp) == 0) { sbuf_printf(sb, "%s", indent); g_conf_printf_escaped(sb, "%s", buf); sbuf_printf(sb, "\n"); } bp->bio_attribute = "GEOM::lunname"; bp->bio_length = DISK_IDENT_SIZE; bp->bio_data = buf; if (dp->d_getattr(bp) == 0) { sbuf_printf(sb, "%s", indent); g_conf_printf_escaped(sb, "%s", buf); sbuf_printf(sb, "\n"); } g_destroy_bio(bp); g_free(buf); } else { sbuf_printf(sb, "%s", indent); g_conf_printf_escaped(sb, "%s", dp->d_ident); sbuf_printf(sb, "\n"); } sbuf_printf(sb, "%s", indent); g_conf_printf_escaped(sb, "%s", dp->d_descr); sbuf_printf(sb, "\n"); } } static void g_disk_resize(void *ptr, int flag) { struct disk *dp; struct g_geom *gp; struct g_provider *pp; if (flag == EV_CANCEL) return; g_topology_assert(); dp = ptr; gp = dp->d_geom; if (dp->d_destroyed || gp == NULL) return; LIST_FOREACH(pp, &gp->provider, provider) { if (pp->sectorsize != 0 && pp->sectorsize != dp->d_sectorsize) g_wither_provider(pp, ENXIO); else g_resize_provider(pp, dp->d_mediasize); } } static void g_disk_create(void *arg, int flag) { struct g_geom *gp; struct g_provider *pp; struct disk *dp; struct g_disk_softc *sc; char tmpstr[80]; if (flag == EV_CANCEL) return; g_topology_assert(); dp = arg; mtx_pool_lock(mtxpool_sleep, dp); dp->d_init_level = DISK_INIT_START; /* * If the disk has already gone away, we can just stop here and * call the user's callback to tell him we've cleaned things up. 
*/ if (dp->d_goneflag != 0) { mtx_pool_unlock(mtxpool_sleep, dp); if (dp->d_gone != NULL) dp->d_gone(dp); return; } mtx_pool_unlock(mtxpool_sleep, dp); sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO); mtx_init(&sc->start_mtx, "g_disk_start", NULL, MTX_DEF); mtx_init(&sc->done_mtx, "g_disk_done", NULL, MTX_DEF); sc->dp = dp; gp = g_new_geomf(&g_disk_class, "%s%d", dp->d_name, dp->d_unit); gp->softc = sc; pp = g_new_providerf(gp, "%s", gp->name); devstat_remove_entry(pp->stat); pp->stat = NULL; dp->d_devstat->id = pp; pp->mediasize = dp->d_mediasize; pp->sectorsize = dp->d_sectorsize; pp->stripeoffset = dp->d_stripeoffset; pp->stripesize = dp->d_stripesize; if ((dp->d_flags & DISKFLAG_UNMAPPED_BIO) != 0) pp->flags |= G_PF_ACCEPT_UNMAPPED; if ((dp->d_flags & DISKFLAG_DIRECT_COMPLETION) != 0) pp->flags |= G_PF_DIRECT_SEND; pp->flags |= G_PF_DIRECT_RECEIVE; if (bootverbose) printf("GEOM: new disk %s\n", gp->name); sysctl_ctx_init(&sc->sysctl_ctx); snprintf(tmpstr, sizeof(tmpstr), "GEOM disk %s", gp->name); sc->sysctl_tree = SYSCTL_ADD_NODE(&sc->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_kern_geom_disk), OID_AUTO, gp->name, CTLFLAG_RD, 0, tmpstr); if (sc->sysctl_tree != NULL) { SYSCTL_ADD_STRING(&sc->sysctl_ctx, SYSCTL_CHILDREN(sc->sysctl_tree), OID_AUTO, "led", CTLFLAG_RWTUN, sc->led, sizeof(sc->led), "LED name"); } pp->private = sc; dp->d_geom = gp; g_error_provider(pp, 0); mtx_pool_lock(mtxpool_sleep, dp); dp->d_init_level = DISK_INIT_DONE; /* * If the disk has gone away at this stage, start the withering * process for it. */ if (dp->d_goneflag != 0) { mtx_pool_unlock(mtxpool_sleep, dp); g_wither_provider(pp, ENXIO); return; } mtx_pool_unlock(mtxpool_sleep, dp); } /* * We get this callback after all of the consumers have gone away, and just * before the provider is freed. If the disk driver provided a d_gone * callback, let them know that it is okay to free resources -- they won't * be getting any more accesses from GEOM. 
*/ static void g_disk_providergone(struct g_provider *pp) { struct disk *dp; struct g_disk_softc *sc; sc = (struct g_disk_softc *)pp->private; dp = sc->dp; if (dp != NULL && dp->d_gone != NULL) dp->d_gone(dp); if (sc->sysctl_tree != NULL) { sysctl_ctx_free(&sc->sysctl_ctx); sc->sysctl_tree = NULL; } if (sc->led[0] != 0) { led_set(sc->led, "0"); sc->led[0] = 0; } pp->private = NULL; pp->geom->softc = NULL; mtx_destroy(&sc->done_mtx); mtx_destroy(&sc->start_mtx); g_free(sc); } static void g_disk_destroy(void *ptr, int flag) { struct disk *dp; struct g_geom *gp; struct g_disk_softc *sc; g_topology_assert(); dp = ptr; gp = dp->d_geom; if (gp != NULL) { sc = gp->softc; if (sc != NULL) sc->dp = NULL; dp->d_geom = NULL; g_wither_geom(gp, ENXIO); } g_free(dp); } /* * We only allow printable characters in disk ident, * the rest is converted to 'x'. */ static void g_disk_ident_adjust(char *ident, size_t size) { char *p, tmp[4], newid[DISK_IDENT_SIZE]; newid[0] = '\0'; for (p = ident; *p != '\0'; p++) { if (isprint(*p)) { tmp[0] = *p; tmp[1] = '\0'; } else { snprintf(tmp, sizeof(tmp), "x%02hhx", *(unsigned char *)p); } if (strlcat(newid, tmp, sizeof(newid)) >= sizeof(newid)) break; } bzero(ident, size); strlcpy(ident, newid, size); } struct disk * disk_alloc(void) { return (g_malloc(sizeof(struct disk), M_WAITOK | M_ZERO)); } void disk_create(struct disk *dp, int version) { if (version != DISK_VERSION) { printf("WARNING: Attempt to add disk %s%d %s", dp->d_name, dp->d_unit, " using incompatible ABI version of disk(9)\n"); printf("WARNING: Ignoring disk %s%d\n", dp->d_name, dp->d_unit); return; } if (dp->d_flags & DISKFLAG_RESERVED) { printf("WARNING: Attempt to add non-MPSAFE disk %s%d\n", dp->d_name, dp->d_unit); printf("WARNING: Ignoring disk %s%d\n", dp->d_name, dp->d_unit); return; } KASSERT(dp->d_strategy != NULL, ("disk_create need d_strategy")); KASSERT(dp->d_name != NULL, ("disk_create need d_name")); KASSERT(*dp->d_name != 0, ("disk_create need d_name")); 
KASSERT(strlen(dp->d_name) < SPECNAMELEN - 4, ("disk name too long")); if (dp->d_devstat == NULL) dp->d_devstat = devstat_new_entry(dp->d_name, dp->d_unit, dp->d_sectorsize, DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX); dp->d_geom = NULL; dp->d_init_level = DISK_INIT_NONE; g_disk_ident_adjust(dp->d_ident, sizeof(dp->d_ident)); g_post_event(g_disk_create, dp, M_WAITOK, dp, NULL); } void disk_destroy(struct disk *dp) { g_cancel_event(dp); dp->d_destroyed = 1; if (dp->d_devstat != NULL) devstat_remove_entry(dp->d_devstat); g_post_event(g_disk_destroy, dp, M_WAITOK, NULL); } void disk_gone(struct disk *dp) { struct g_geom *gp; struct g_provider *pp; mtx_pool_lock(mtxpool_sleep, dp); dp->d_goneflag = 1; /* * If we're still in the process of creating this disk (the * g_disk_create() function is still queued, or is in * progress), the init level will not yet be DISK_INIT_DONE. * * If that is the case, g_disk_create() will see d_goneflag * and take care of cleaning things up. * * If the disk has already been created, we default to * withering the provider as usual below. * * If the caller has not set a d_gone() callback, he will * not be any worse off by returning here, because the geom * has not been fully setup in any case. 
*/ if (dp->d_init_level < DISK_INIT_DONE) { mtx_pool_unlock(mtxpool_sleep, dp); return; } mtx_pool_unlock(mtxpool_sleep, dp); gp = dp->d_geom; if (gp != NULL) { pp = LIST_FIRST(&gp->provider); if (pp != NULL) { KASSERT(LIST_NEXT(pp, provider) == NULL, ("geom %p has more than one provider", gp)); g_wither_provider(pp, ENXIO); } } } void disk_attr_changed(struct disk *dp, const char *attr, int flag) { struct g_geom *gp; struct g_provider *pp; char devnamebuf[128]; gp = dp->d_geom; if (gp != NULL) LIST_FOREACH(pp, &gp->provider, provider) (void)g_attr_changed(pp, attr, flag); snprintf(devnamebuf, sizeof(devnamebuf), "devname=%s%d", dp->d_name, dp->d_unit); devctl_notify("GEOM", "disk", attr, devnamebuf); } void disk_media_changed(struct disk *dp, int flag) { struct g_geom *gp; struct g_provider *pp; gp = dp->d_geom; if (gp != NULL) { pp = LIST_FIRST(&gp->provider); if (pp != NULL) { KASSERT(LIST_NEXT(pp, provider) == NULL, ("geom %p has more than one provider", gp)); g_media_changed(pp, flag); } } } void disk_media_gone(struct disk *dp, int flag) { struct g_geom *gp; struct g_provider *pp; gp = dp->d_geom; if (gp != NULL) { pp = LIST_FIRST(&gp->provider); if (pp != NULL) { KASSERT(LIST_NEXT(pp, provider) == NULL, ("geom %p has more than one provider", gp)); g_media_gone(pp, flag); } } } int disk_resize(struct disk *dp, int flag) { if (dp->d_destroyed || dp->d_geom == NULL) return (0); return (g_post_event(g_disk_resize, dp, flag, NULL)); } static void g_kern_disks(void *p, int flag __unused) { struct sbuf *sb; struct g_geom *gp; char *sp; sb = p; sp = ""; g_topology_assert(); LIST_FOREACH(gp, &g_disk_class.geom, geom) { sbuf_printf(sb, "%s%s", sp, gp->name); sp = " "; } sbuf_finish(sb); } static int sysctl_disks(SYSCTL_HANDLER_ARGS) { int error; struct sbuf *sb; sb = sbuf_new_auto(); g_waitfor_event(g_kern_disks, sb, M_WAITOK, NULL); error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); return error; } SYSCTL_PROC(_kern, OID_AUTO, disks, 
CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_disks, "A", "names of available disks"); Index: head/sys/geom/geom_io.c =================================================================== --- head/sys/geom/geom_io.c (revision 308154) +++ head/sys/geom/geom_io.c (revision 308155) @@ -1,1045 +1,1057 @@ /*- * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * Copyright (c) 2013 The FreeBSD Foundation * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int g_io_transient_map_bio(struct bio *bp); static struct g_bioq g_bio_run_down; static struct g_bioq g_bio_run_up; /* * Pace is a hint that we've had some trouble recently allocating * bios, so we should back off trying to send I/O down the stack * a bit to let the problem resolve. When pacing, we also turn * off direct dispatch to also reduce memory pressure from I/Os * there, at the expxense of some added latency while the memory * pressures exist. See g_io_schedule_down() for more details * and limitations. */ static volatile u_int pace; static uma_zone_t biozone; /* * The head of the list of classifiers used in g_io_request. * Use g_register_classifier() and g_unregister_classifier() * to add/remove entries to the list. * Classifiers are invoked in registration order. 
*/ static TAILQ_HEAD(g_classifier_tailq, g_classifier_hook) g_classifier_tailq = TAILQ_HEAD_INITIALIZER(g_classifier_tailq); #include static void g_bioq_lock(struct g_bioq *bq) { mtx_lock(&bq->bio_queue_lock); } static void g_bioq_unlock(struct g_bioq *bq) { mtx_unlock(&bq->bio_queue_lock); } #if 0 static void g_bioq_destroy(struct g_bioq *bq) { mtx_destroy(&bq->bio_queue_lock); } #endif static void g_bioq_init(struct g_bioq *bq) { TAILQ_INIT(&bq->bio_queue); mtx_init(&bq->bio_queue_lock, "bio queue", NULL, MTX_DEF); } static struct bio * g_bioq_first(struct g_bioq *bq) { struct bio *bp; bp = TAILQ_FIRST(&bq->bio_queue); if (bp != NULL) { KASSERT((bp->bio_flags & BIO_ONQUEUE), ("Bio not on queue bp=%p target %p", bp, bq)); bp->bio_flags &= ~BIO_ONQUEUE; TAILQ_REMOVE(&bq->bio_queue, bp, bio_queue); bq->bio_queue_length--; } return (bp); } struct bio * g_new_bio(void) { struct bio *bp; bp = uma_zalloc(biozone, M_NOWAIT | M_ZERO); #ifdef KTR if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) { struct stack st; CTR1(KTR_GEOM, "g_new_bio(): %p", bp); stack_save(&st); CTRSTACK(KTR_GEOM, &st, 3, 0); } #endif return (bp); } struct bio * g_alloc_bio(void) { struct bio *bp; bp = uma_zalloc(biozone, M_WAITOK | M_ZERO); #ifdef KTR if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) { struct stack st; CTR1(KTR_GEOM, "g_alloc_bio(): %p", bp); stack_save(&st); CTRSTACK(KTR_GEOM, &st, 3, 0); } #endif return (bp); } void g_destroy_bio(struct bio *bp) { #ifdef KTR if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) { struct stack st; CTR1(KTR_GEOM, "g_destroy_bio(): %p", bp); stack_save(&st); CTRSTACK(KTR_GEOM, &st, 3, 0); } #endif uma_zfree(biozone, bp); } struct bio * g_clone_bio(struct bio *bp) { struct bio *bp2; bp2 = uma_zalloc(biozone, M_NOWAIT | M_ZERO); if (bp2 != NULL) { bp2->bio_parent = bp; bp2->bio_cmd = bp->bio_cmd; /* * BIO_ORDERED flag may be used by disk drivers to enforce * ordering restrictions, so this flag needs to be cloned. 
* BIO_UNMAPPED and BIO_VLIST should be inherited, to properly * indicate which way the buffer is passed. * Other bio flags are not suitable for cloning. */ bp2->bio_flags = bp->bio_flags & (BIO_ORDERED | BIO_UNMAPPED | BIO_VLIST); bp2->bio_length = bp->bio_length; bp2->bio_offset = bp->bio_offset; bp2->bio_data = bp->bio_data; bp2->bio_ma = bp->bio_ma; bp2->bio_ma_n = bp->bio_ma_n; bp2->bio_ma_offset = bp->bio_ma_offset; bp2->bio_attribute = bp->bio_attribute; if (bp->bio_cmd == BIO_ZONE) bcopy(&bp->bio_zone, &bp2->bio_zone, sizeof(bp->bio_zone)); /* Inherit classification info from the parent */ bp2->bio_classifier1 = bp->bio_classifier1; bp2->bio_classifier2 = bp->bio_classifier2; +#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) + bp2->bio_track_bp = bp->bio_track_bp; +#endif bp->bio_children++; } #ifdef KTR if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) { struct stack st; CTR2(KTR_GEOM, "g_clone_bio(%p): %p", bp, bp2); stack_save(&st); CTRSTACK(KTR_GEOM, &st, 3, 0); } #endif return(bp2); } struct bio * g_duplicate_bio(struct bio *bp) { struct bio *bp2; bp2 = uma_zalloc(biozone, M_WAITOK | M_ZERO); bp2->bio_flags = bp->bio_flags & (BIO_UNMAPPED | BIO_VLIST); bp2->bio_parent = bp; bp2->bio_cmd = bp->bio_cmd; bp2->bio_length = bp->bio_length; bp2->bio_offset = bp->bio_offset; bp2->bio_data = bp->bio_data; bp2->bio_ma = bp->bio_ma; bp2->bio_ma_n = bp->bio_ma_n; bp2->bio_ma_offset = bp->bio_ma_offset; bp2->bio_attribute = bp->bio_attribute; bp->bio_children++; #ifdef KTR if ((KTR_COMPILE & KTR_GEOM) && (ktr_mask & KTR_GEOM)) { struct stack st; CTR2(KTR_GEOM, "g_duplicate_bio(%p): %p", bp, bp2); stack_save(&st); CTRSTACK(KTR_GEOM, &st, 3, 0); } #endif return(bp2); } void g_reset_bio(struct bio *bp) { bzero(bp, sizeof(*bp)); } void g_io_init() { g_bioq_init(&g_bio_run_down); g_bioq_init(&g_bio_run_up); biozone = uma_zcreate("g_bio", sizeof (struct bio), NULL, NULL, NULL, NULL, 0, 0); } int g_io_getattr(const char *attr, struct g_consumer *cp, int *len, 
void *ptr) { struct bio *bp; int error; g_trace(G_T_BIO, "bio_getattr(%s)", attr); bp = g_alloc_bio(); bp->bio_cmd = BIO_GETATTR; bp->bio_done = NULL; bp->bio_attribute = attr; bp->bio_length = *len; bp->bio_data = ptr; g_io_request(bp, cp); error = biowait(bp, "ggetattr"); *len = bp->bio_completed; g_destroy_bio(bp); return (error); } int g_io_zonecmd(struct disk_zone_args *zone_args, struct g_consumer *cp) { struct bio *bp; int error; g_trace(G_T_BIO, "bio_zone(%d)", zone_args->zone_cmd); bp = g_alloc_bio(); bp->bio_cmd = BIO_ZONE; bp->bio_done = NULL; /* * XXX KDM need to handle report zone data. */ bcopy(zone_args, &bp->bio_zone, sizeof(*zone_args)); if (zone_args->zone_cmd == DISK_ZONE_REPORT_ZONES) bp->bio_length = zone_args->zone_params.report.entries_allocated * sizeof(struct disk_zone_rep_entry); else bp->bio_length = 0; g_io_request(bp, cp); error = biowait(bp, "gzone"); bcopy(&bp->bio_zone, zone_args, sizeof(*zone_args)); g_destroy_bio(bp); return (error); } int g_io_flush(struct g_consumer *cp) { struct bio *bp; int error; g_trace(G_T_BIO, "bio_flush(%s)", cp->provider->name); bp = g_alloc_bio(); bp->bio_cmd = BIO_FLUSH; bp->bio_flags |= BIO_ORDERED; bp->bio_done = NULL; bp->bio_attribute = NULL; bp->bio_offset = cp->provider->mediasize; bp->bio_length = 0; bp->bio_data = NULL; g_io_request(bp, cp); error = biowait(bp, "gflush"); g_destroy_bio(bp); return (error); } static int g_io_check(struct bio *bp) { struct g_consumer *cp; struct g_provider *pp; off_t excess; int error; + biotrack(bp, __func__); + cp = bp->bio_from; pp = bp->bio_to; /* Fail if access counters dont allow the operation */ switch(bp->bio_cmd) { case BIO_READ: case BIO_GETATTR: if (cp->acr == 0) return (EPERM); break; case BIO_WRITE: case BIO_DELETE: case BIO_FLUSH: if (cp->acw == 0) return (EPERM); break; case BIO_ZONE: if ((bp->bio_zone.zone_cmd == DISK_ZONE_REPORT_ZONES) || (bp->bio_zone.zone_cmd == DISK_ZONE_GET_PARAMS)) { if (cp->acr == 0) return (EPERM); } else if (cp->acw == 0) 
return (EPERM); break; default: return (EPERM); } /* if provider is marked for error, don't disturb. */ if (pp->error) return (pp->error); if (cp->flags & G_CF_ORPHAN) return (ENXIO); switch(bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: /* Zero sectorsize or mediasize is probably a lack of media. */ if (pp->sectorsize == 0 || pp->mediasize == 0) return (ENXIO); /* Reject I/O not on sector boundary */ if (bp->bio_offset % pp->sectorsize) return (EINVAL); /* Reject I/O not integral sector long */ if (bp->bio_length % pp->sectorsize) return (EINVAL); /* Reject requests before or past the end of media. */ if (bp->bio_offset < 0) return (EIO); if (bp->bio_offset > pp->mediasize) return (EIO); /* Truncate requests to the end of providers media. */ excess = bp->bio_offset + bp->bio_length; if (excess > bp->bio_to->mediasize) { KASSERT((bp->bio_flags & BIO_UNMAPPED) == 0 || round_page(bp->bio_ma_offset + bp->bio_length) / PAGE_SIZE == bp->bio_ma_n, ("excess bio %p too short", bp)); excess -= bp->bio_to->mediasize; bp->bio_length -= excess; if ((bp->bio_flags & BIO_UNMAPPED) != 0) { bp->bio_ma_n = round_page(bp->bio_ma_offset + bp->bio_length) / PAGE_SIZE; } if (excess > 0) CTR3(KTR_GEOM, "g_down truncated bio " "%p provider %s by %d", bp, bp->bio_to->name, excess); } /* Deliver zero length transfers right here. */ if (bp->bio_length == 0) { CTR2(KTR_GEOM, "g_down terminated 0-length " "bp %p provider %s", bp, bp->bio_to->name); return (0); } if ((bp->bio_flags & BIO_UNMAPPED) != 0 && (bp->bio_to->flags & G_PF_ACCEPT_UNMAPPED) == 0 && (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE)) { if ((error = g_io_transient_map_bio(bp)) >= 0) return (error); } break; default: break; } return (EJUSTRETURN); } /* * bio classification support. * * g_register_classifier() and g_unregister_classifier() * are used to add/remove a classifier from the list. * The list is protected using the g_bio_run_down lock, * because the classifiers are called in this path. 
* * g_io_request() passes bio's that are not already classified * (i.e. those with bio_classifier1 == NULL) to g_run_classifiers(). * Classifiers can store their result in the two fields * bio_classifier1 and bio_classifier2. * A classifier that updates one of the fields should * return a non-zero value. * If no classifier updates the field, g_run_classifiers() sets * bio_classifier1 = BIO_NOTCLASSIFIED to avoid further calls. */ int g_register_classifier(struct g_classifier_hook *hook) { g_bioq_lock(&g_bio_run_down); TAILQ_INSERT_TAIL(&g_classifier_tailq, hook, link); g_bioq_unlock(&g_bio_run_down); return (0); } void g_unregister_classifier(struct g_classifier_hook *hook) { struct g_classifier_hook *entry; g_bioq_lock(&g_bio_run_down); TAILQ_FOREACH(entry, &g_classifier_tailq, link) { if (entry == hook) { TAILQ_REMOVE(&g_classifier_tailq, hook, link); break; } } g_bioq_unlock(&g_bio_run_down); } static void g_run_classifiers(struct bio *bp) { struct g_classifier_hook *hook; int classified = 0; + biotrack(bp, __func__); + TAILQ_FOREACH(hook, &g_classifier_tailq, link) classified |= hook->func(hook->arg, bp); if (!classified) bp->bio_classifier1 = BIO_NOTCLASSIFIED; } void g_io_request(struct bio *bp, struct g_consumer *cp) { struct g_provider *pp; struct mtx *mtxp; int direct, error, first; uint8_t cmd; + biotrack(bp, __func__); + KASSERT(cp != NULL, ("NULL cp in g_io_request")); KASSERT(bp != NULL, ("NULL bp in g_io_request")); pp = cp->provider; KASSERT(pp != NULL, ("consumer not attached in g_io_request")); #ifdef DIAGNOSTIC KASSERT(bp->bio_driver1 == NULL, ("bio_driver1 used by the consumer (geom %s)", cp->geom->name)); KASSERT(bp->bio_driver2 == NULL, ("bio_driver2 used by the consumer (geom %s)", cp->geom->name)); KASSERT(bp->bio_pflags == 0, ("bio_pflags used by the consumer (geom %s)", cp->geom->name)); /* * Remember consumer's private fields, so we can detect if they were * modified by the provider. 
*/ bp->_bio_caller1 = bp->bio_caller1; bp->_bio_caller2 = bp->bio_caller2; bp->_bio_cflags = bp->bio_cflags; #endif cmd = bp->bio_cmd; if (cmd == BIO_READ || cmd == BIO_WRITE || cmd == BIO_GETATTR) { KASSERT(bp->bio_data != NULL, ("NULL bp->data in g_io_request(cmd=%hu)", bp->bio_cmd)); } if (cmd == BIO_DELETE || cmd == BIO_FLUSH) { KASSERT(bp->bio_data == NULL, ("non-NULL bp->data in g_io_request(cmd=%hu)", bp->bio_cmd)); } if (cmd == BIO_READ || cmd == BIO_WRITE || cmd == BIO_DELETE) { KASSERT(bp->bio_offset % cp->provider->sectorsize == 0, ("wrong offset %jd for sectorsize %u", bp->bio_offset, cp->provider->sectorsize)); KASSERT(bp->bio_length % cp->provider->sectorsize == 0, ("wrong length %jd for sectorsize %u", bp->bio_length, cp->provider->sectorsize)); } g_trace(G_T_BIO, "bio_request(%p) from %p(%s) to %p(%s) cmd %d", bp, cp, cp->geom->name, pp, pp->name, bp->bio_cmd); bp->bio_from = cp; bp->bio_to = pp; bp->bio_error = 0; bp->bio_completed = 0; KASSERT(!(bp->bio_flags & BIO_ONQUEUE), ("Bio already on queue bp=%p", bp)); if ((g_collectstats & G_STATS_CONSUMERS) != 0 || ((g_collectstats & G_STATS_PROVIDERS) != 0 && pp->stat != NULL)) binuptime(&bp->bio_t0); else getbinuptime(&bp->bio_t0); #ifdef GET_STACK_USAGE direct = (cp->flags & G_CF_DIRECT_SEND) != 0 && (pp->flags & G_PF_DIRECT_RECEIVE) != 0 && !g_is_geom_thread(curthread) && ((pp->flags & G_PF_ACCEPT_UNMAPPED) != 0 || (bp->bio_flags & BIO_UNMAPPED) == 0 || THREAD_CAN_SLEEP()) && pace == 0; if (direct) { /* Block direct execution if less then half of stack left. */ size_t st, su; GET_STACK_USAGE(st, su); if (su * 2 > st) direct = 0; } #else direct = 0; #endif if (!TAILQ_EMPTY(&g_classifier_tailq) && !bp->bio_classifier1) { g_bioq_lock(&g_bio_run_down); g_run_classifiers(bp); g_bioq_unlock(&g_bio_run_down); } /* * The statistics collection is lockless, as such, but we * can not update one instance of the statistics from more * than one thread at a time, so grab the lock first. 
*/ mtxp = mtx_pool_find(mtxpool_sleep, pp); mtx_lock(mtxp); if (g_collectstats & G_STATS_PROVIDERS) devstat_start_transaction(pp->stat, &bp->bio_t0); if (g_collectstats & G_STATS_CONSUMERS) devstat_start_transaction(cp->stat, &bp->bio_t0); pp->nstart++; cp->nstart++; mtx_unlock(mtxp); if (direct) { error = g_io_check(bp); if (error >= 0) { CTR3(KTR_GEOM, "g_io_request g_io_check on bp %p " "provider %s returned %d", bp, bp->bio_to->name, error); g_io_deliver(bp, error); return; } bp->bio_to->geom->start(bp); } else { g_bioq_lock(&g_bio_run_down); first = TAILQ_EMPTY(&g_bio_run_down.bio_queue); TAILQ_INSERT_TAIL(&g_bio_run_down.bio_queue, bp, bio_queue); bp->bio_flags |= BIO_ONQUEUE; g_bio_run_down.bio_queue_length++; g_bioq_unlock(&g_bio_run_down); /* Pass it on down. */ if (first) wakeup(&g_wait_down); } } void g_io_deliver(struct bio *bp, int error) { struct bintime now; struct g_consumer *cp; struct g_provider *pp; struct mtx *mtxp; int direct, first; + biotrack(bp, __func__); + KASSERT(bp != NULL, ("NULL bp in g_io_deliver")); pp = bp->bio_to; KASSERT(pp != NULL, ("NULL bio_to in g_io_deliver")); cp = bp->bio_from; if (cp == NULL) { bp->bio_error = error; bp->bio_done(bp); return; } KASSERT(cp != NULL, ("NULL bio_from in g_io_deliver")); KASSERT(cp->geom != NULL, ("NULL bio_from->geom in g_io_deliver")); #ifdef DIAGNOSTIC /* * Some classes - GJournal in particular - can modify bio's * private fields while the bio is in transit; G_GEOM_VOLATILE_BIO * flag means it's an expected behaviour for that particular geom. 
*/ if ((cp->geom->flags & G_GEOM_VOLATILE_BIO) == 0) { KASSERT(bp->bio_caller1 == bp->_bio_caller1, ("bio_caller1 used by the provider %s", pp->name)); KASSERT(bp->bio_caller2 == bp->_bio_caller2, ("bio_caller2 used by the provider %s", pp->name)); KASSERT(bp->bio_cflags == bp->_bio_cflags, ("bio_cflags used by the provider %s", pp->name)); } #endif KASSERT(bp->bio_completed >= 0, ("bio_completed can't be less than 0")); KASSERT(bp->bio_completed <= bp->bio_length, ("bio_completed can't be greater than bio_length")); g_trace(G_T_BIO, "g_io_deliver(%p) from %p(%s) to %p(%s) cmd %d error %d off %jd len %jd", bp, cp, cp->geom->name, pp, pp->name, bp->bio_cmd, error, (intmax_t)bp->bio_offset, (intmax_t)bp->bio_length); KASSERT(!(bp->bio_flags & BIO_ONQUEUE), ("Bio already on queue bp=%p", bp)); /* * XXX: next two doesn't belong here */ bp->bio_bcount = bp->bio_length; bp->bio_resid = bp->bio_bcount - bp->bio_completed; #ifdef GET_STACK_USAGE direct = (pp->flags & G_PF_DIRECT_SEND) && (cp->flags & G_CF_DIRECT_RECEIVE) && !g_is_geom_thread(curthread); if (direct) { /* Block direct execution if less then half of stack left. */ size_t st, su; GET_STACK_USAGE(st, su); if (su * 2 > st) direct = 0; } #else direct = 0; #endif /* * The statistics collection is lockless, as such, but we * can not update one instance of the statistics from more * than one thread at a time, so grab the lock first. 
*/ if ((g_collectstats & G_STATS_CONSUMERS) != 0 || ((g_collectstats & G_STATS_PROVIDERS) != 0 && pp->stat != NULL)) binuptime(&now); mtxp = mtx_pool_find(mtxpool_sleep, cp); mtx_lock(mtxp); if (g_collectstats & G_STATS_PROVIDERS) devstat_end_transaction_bio_bt(pp->stat, bp, &now); if (g_collectstats & G_STATS_CONSUMERS) devstat_end_transaction_bio_bt(cp->stat, bp, &now); cp->nend++; pp->nend++; mtx_unlock(mtxp); if (error != ENOMEM) { bp->bio_error = error; if (direct) { biodone(bp); } else { g_bioq_lock(&g_bio_run_up); first = TAILQ_EMPTY(&g_bio_run_up.bio_queue); TAILQ_INSERT_TAIL(&g_bio_run_up.bio_queue, bp, bio_queue); bp->bio_flags |= BIO_ONQUEUE; g_bio_run_up.bio_queue_length++; g_bioq_unlock(&g_bio_run_up); if (first) wakeup(&g_wait_up); } return; } if (bootverbose) printf("ENOMEM %p on %p(%s)\n", bp, pp, pp->name); bp->bio_children = 0; bp->bio_inbed = 0; bp->bio_driver1 = NULL; bp->bio_driver2 = NULL; bp->bio_pflags = 0; g_io_request(bp, cp); pace = 1; return; } SYSCTL_DECL(_kern_geom); static long transient_maps; SYSCTL_LONG(_kern_geom, OID_AUTO, transient_maps, CTLFLAG_RD, &transient_maps, 0, "Total count of the transient mapping requests"); u_int transient_map_retries = 10; SYSCTL_UINT(_kern_geom, OID_AUTO, transient_map_retries, CTLFLAG_RW, &transient_map_retries, 0, "Max count of retries used before giving up on creating transient map"); int transient_map_hard_failures; SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_hard_failures, CTLFLAG_RD, &transient_map_hard_failures, 0, "Failures to establish the transient mapping due to retry attempts " "exhausted"); int transient_map_soft_failures; SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_soft_failures, CTLFLAG_RD, &transient_map_soft_failures, 0, "Count of retried failures to establish the transient mapping"); int inflight_transient_maps; SYSCTL_INT(_kern_geom, OID_AUTO, inflight_transient_maps, CTLFLAG_RD, &inflight_transient_maps, 0, "Current count of the active transient maps"); static int 
g_io_transient_map_bio(struct bio *bp) { vm_offset_t addr; long size; u_int retried; KASSERT(unmapped_buf_allowed, ("unmapped disabled")); size = round_page(bp->bio_ma_offset + bp->bio_length); KASSERT(size / PAGE_SIZE == bp->bio_ma_n, ("Bio too short %p", bp)); addr = 0; retried = 0; atomic_add_long(&transient_maps, 1); retry: if (vmem_alloc(transient_arena, size, M_BESTFIT | M_NOWAIT, &addr)) { if (transient_map_retries != 0 && retried >= transient_map_retries) { CTR2(KTR_GEOM, "g_down cannot map bp %p provider %s", bp, bp->bio_to->name); atomic_add_int(&transient_map_hard_failures, 1); return (EDEADLK/* XXXKIB */); } else { /* * Naive attempt to quisce the I/O to get more * in-flight requests completed and defragment * the transient_arena. */ CTR3(KTR_GEOM, "g_down retrymap bp %p provider %s r %d", bp, bp->bio_to->name, retried); pause("g_d_tra", hz / 10); retried++; atomic_add_int(&transient_map_soft_failures, 1); goto retry; } } atomic_add_int(&inflight_transient_maps, 1); pmap_qenter((vm_offset_t)addr, bp->bio_ma, OFF_TO_IDX(size)); bp->bio_data = (caddr_t)addr + bp->bio_ma_offset; bp->bio_flags |= BIO_TRANSIENT_MAPPING; bp->bio_flags &= ~BIO_UNMAPPED; return (EJUSTRETURN); } void g_io_schedule_down(struct thread *tp __unused) { struct bio *bp; int error; for(;;) { g_bioq_lock(&g_bio_run_down); bp = g_bioq_first(&g_bio_run_down); if (bp == NULL) { CTR0(KTR_GEOM, "g_down going to sleep"); msleep(&g_wait_down, &g_bio_run_down.bio_queue_lock, PRIBIO | PDROP, "-", 0); continue; } CTR0(KTR_GEOM, "g_down has work to do"); g_bioq_unlock(&g_bio_run_down); + biotrack(bp, __func__); if (pace != 0) { /* * There has been at least one memory allocation * failure since the last I/O completed. Pause 1ms to * give the system a chance to free up memory. We only * do this once because a large number of allocations * can fail in the direct dispatch case and there's no * relationship between the number of these failures and * the length of the outage. 
If there's still an outage, * we'll pause again and again until it's * resolved. Older versions paused longer and once per * allocation failure. This was OK for a single threaded * g_down, but with direct dispatch would lead to max of * 10 IOPs for minutes at a time when transient memory * issues prevented allocation for a batch of requests * from the upper layers. * * XXX This pacing is really lame. It needs to be solved * by other methods. This is OK only because the worst * case scenario is so rare. In the worst case scenario * all memory is tied up waiting for I/O to complete * which can never happen since we can't allocate bios * for that I/O. */ CTR0(KTR_GEOM, "g_down pacing self"); pause("g_down", min(hz/1000, 1)); pace = 0; } CTR2(KTR_GEOM, "g_down processing bp %p provider %s", bp, bp->bio_to->name); error = g_io_check(bp); if (error >= 0) { CTR3(KTR_GEOM, "g_down g_io_check on bp %p provider " "%s returned %d", bp, bp->bio_to->name, error); g_io_deliver(bp, error); continue; } THREAD_NO_SLEEPING(); CTR4(KTR_GEOM, "g_down starting bp %p provider %s off %ld " "len %ld", bp, bp->bio_to->name, bp->bio_offset, bp->bio_length); bp->bio_to->geom->start(bp); THREAD_SLEEPING_OK(); } } void g_io_schedule_up(struct thread *tp __unused) { struct bio *bp; for(;;) { g_bioq_lock(&g_bio_run_up); bp = g_bioq_first(&g_bio_run_up); if (bp == NULL) { CTR0(KTR_GEOM, "g_up going to sleep"); msleep(&g_wait_up, &g_bio_run_up.bio_queue_lock, PRIBIO | PDROP, "-", 0); continue; } g_bioq_unlock(&g_bio_run_up); THREAD_NO_SLEEPING(); CTR4(KTR_GEOM, "g_up biodone bp %p provider %s off " "%jd len %ld", bp, bp->bio_to->name, bp->bio_offset, bp->bio_length); biodone(bp); THREAD_SLEEPING_OK(); } } void * g_read_data(struct g_consumer *cp, off_t offset, off_t length, int *error) { struct bio *bp; void *ptr; int errorc; KASSERT(length > 0 && length >= cp->provider->sectorsize && length <= MAXPHYS, ("g_read_data(): invalid length %jd", (intmax_t)length)); bp = g_alloc_bio(); bp->bio_cmd = 
BIO_READ; bp->bio_done = NULL; bp->bio_offset = offset; bp->bio_length = length; ptr = g_malloc(length, M_WAITOK); bp->bio_data = ptr; g_io_request(bp, cp); errorc = biowait(bp, "gread"); if (error != NULL) *error = errorc; g_destroy_bio(bp); if (errorc) { g_free(ptr); ptr = NULL; } return (ptr); } int g_write_data(struct g_consumer *cp, off_t offset, void *ptr, off_t length) { struct bio *bp; int error; KASSERT(length > 0 && length >= cp->provider->sectorsize && length <= MAXPHYS, ("g_write_data(): invalid length %jd", (intmax_t)length)); bp = g_alloc_bio(); bp->bio_cmd = BIO_WRITE; bp->bio_done = NULL; bp->bio_offset = offset; bp->bio_length = length; bp->bio_data = ptr; g_io_request(bp, cp); error = biowait(bp, "gwrite"); g_destroy_bio(bp); return (error); } int g_delete_data(struct g_consumer *cp, off_t offset, off_t length) { struct bio *bp; int error; KASSERT(length > 0 && length >= cp->provider->sectorsize, ("g_delete_data(): invalid length %jd", (intmax_t)length)); bp = g_alloc_bio(); bp->bio_cmd = BIO_DELETE; bp->bio_done = NULL; bp->bio_offset = offset; bp->bio_length = length; bp->bio_data = NULL; g_io_request(bp, cp); error = biowait(bp, "gdelete"); g_destroy_bio(bp); return (error); } void g_print_bio(struct bio *bp) { const char *pname, *cmd = NULL; if (bp->bio_to != NULL) pname = bp->bio_to->name; else pname = "[unknown]"; switch (bp->bio_cmd) { case BIO_GETATTR: cmd = "GETATTR"; printf("%s[%s(attr=%s)]", pname, cmd, bp->bio_attribute); return; case BIO_FLUSH: cmd = "FLUSH"; printf("%s[%s]", pname, cmd); return; case BIO_ZONE: { char *subcmd = NULL; cmd = "ZONE"; switch (bp->bio_zone.zone_cmd) { case DISK_ZONE_OPEN: subcmd = "OPEN"; break; case DISK_ZONE_CLOSE: subcmd = "CLOSE"; break; case DISK_ZONE_FINISH: subcmd = "FINISH"; break; case DISK_ZONE_RWP: subcmd = "RWP"; break; case DISK_ZONE_REPORT_ZONES: subcmd = "REPORT ZONES"; break; case DISK_ZONE_GET_PARAMS: subcmd = "GET PARAMS"; break; default: subcmd = "UNKNOWN"; break; } printf("%s[%s,%s]", 
pname, cmd, subcmd); return; } case BIO_READ: cmd = "READ"; break; case BIO_WRITE: cmd = "WRITE"; break; case BIO_DELETE: cmd = "DELETE"; break; default: cmd = "UNKNOWN"; printf("%s[%s()]", pname, cmd); return; } printf("%s[%s(offset=%jd, length=%jd)]", pname, cmd, (intmax_t)bp->bio_offset, (intmax_t)bp->bio_length); } Index: head/sys/geom/geom_subr.c =================================================================== --- head/sys/geom/geom_subr.c (revision 308154) +++ head/sys/geom/geom_subr.c (revision 308155) @@ -1,1541 +1,1545 @@ /*- * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #ifdef KDB #include #endif struct class_list_head g_classes = LIST_HEAD_INITIALIZER(g_classes); static struct g_tailq_head geoms = TAILQ_HEAD_INITIALIZER(geoms); char *g_wait_event, *g_wait_up, *g_wait_down, *g_wait_sim; struct g_hh00 { struct g_class *mp; struct g_provider *pp; off_t size; int error; int post; }; /* * This event offers a new class a chance to taste all preexisting providers. */ static void g_load_class(void *arg, int flag) { struct g_hh00 *hh; struct g_class *mp2, *mp; struct g_geom *gp; struct g_provider *pp; g_topology_assert(); if (flag == EV_CANCEL) /* XXX: can't happen ? 
*/ return; if (g_shutdown) return; hh = arg; mp = hh->mp; hh->error = 0; if (hh->post) { g_free(hh); hh = NULL; } g_trace(G_T_TOPOLOGY, "g_load_class(%s)", mp->name); KASSERT(mp->name != NULL && *mp->name != '\0', ("GEOM class has no name")); LIST_FOREACH(mp2, &g_classes, class) { if (mp2 == mp) { printf("The GEOM class %s is already loaded.\n", mp2->name); if (hh != NULL) hh->error = EEXIST; return; } else if (strcmp(mp2->name, mp->name) == 0) { printf("A GEOM class %s is already loaded.\n", mp2->name); if (hh != NULL) hh->error = EEXIST; return; } } LIST_INIT(&mp->geom); LIST_INSERT_HEAD(&g_classes, mp, class); if (mp->init != NULL) mp->init(mp); if (mp->taste == NULL) return; LIST_FOREACH(mp2, &g_classes, class) { if (mp == mp2) continue; LIST_FOREACH(gp, &mp2->geom, geom) { LIST_FOREACH(pp, &gp->provider, provider) { mp->taste(mp, pp, 0); g_topology_assert(); } } } } static int g_unload_class(struct g_class *mp) { struct g_geom *gp; struct g_provider *pp; struct g_consumer *cp; int error; g_topology_lock(); g_trace(G_T_TOPOLOGY, "g_unload_class(%s)", mp->name); retry: G_VALID_CLASS(mp); LIST_FOREACH(gp, &mp->geom, geom) { /* We refuse to unload if anything is open */ LIST_FOREACH(pp, &gp->provider, provider) if (pp->acr || pp->acw || pp->ace) { g_topology_unlock(); return (EBUSY); } LIST_FOREACH(cp, &gp->consumer, consumer) if (cp->acr || cp->acw || cp->ace) { g_topology_unlock(); return (EBUSY); } /* If the geom is withering, wait for it to finish. */ if (gp->flags & G_GEOM_WITHER) { g_topology_sleep(mp, 1); goto retry; } } /* * We allow unloading if we have no geoms, or a class * method we can use to get rid of them. */ if (!LIST_EMPTY(&mp->geom) && mp->destroy_geom == NULL) { g_topology_unlock(); return (EOPNOTSUPP); } /* Bar new entries */ mp->taste = NULL; mp->config = NULL; LIST_FOREACH(gp, &mp->geom, geom) { error = mp->destroy_geom(NULL, mp, gp); if (error != 0) { g_topology_unlock(); return (error); } } /* Wait for withering to finish. 
*/ for (;;) { gp = LIST_FIRST(&mp->geom); if (gp == NULL) break; KASSERT(gp->flags & G_GEOM_WITHER, ("Non-withering geom in class %s", mp->name)); g_topology_sleep(mp, 1); } G_VALID_CLASS(mp); if (mp->fini != NULL) mp->fini(mp); LIST_REMOVE(mp, class); g_topology_unlock(); return (0); } int g_modevent(module_t mod, int type, void *data) { struct g_hh00 *hh; int error; static int g_ignition; struct g_class *mp; mp = data; if (mp->version != G_VERSION) { printf("GEOM class %s has Wrong version %x\n", mp->name, mp->version); return (EINVAL); } if (!g_ignition) { g_ignition++; g_init(); } error = EOPNOTSUPP; switch (type) { case MOD_LOAD: g_trace(G_T_TOPOLOGY, "g_modevent(%s, LOAD)", mp->name); hh = g_malloc(sizeof *hh, M_WAITOK | M_ZERO); hh->mp = mp; /* * Once the system is not cold, MOD_LOAD calls will be * from the userland and the g_event thread will be able * to acknowledge their completion. */ if (cold) { hh->post = 1; error = g_post_event(g_load_class, hh, M_WAITOK, NULL); } else { error = g_waitfor_event(g_load_class, hh, M_WAITOK, NULL); if (error == 0) error = hh->error; g_free(hh); } break; case MOD_UNLOAD: g_trace(G_T_TOPOLOGY, "g_modevent(%s, UNLOAD)", mp->name); error = g_unload_class(mp); if (error == 0) { KASSERT(LIST_EMPTY(&mp->geom), ("Unloaded class (%s) still has geom", mp->name)); } break; } return (error); } static void g_retaste_event(void *arg, int flag) { struct g_class *mp, *mp2; struct g_geom *gp; struct g_hh00 *hh; struct g_provider *pp; struct g_consumer *cp; g_topology_assert(); if (flag == EV_CANCEL) /* XXX: can't happen ? 
*/ return; if (g_shutdown || g_notaste) return; hh = arg; mp = hh->mp; hh->error = 0; if (hh->post) { g_free(hh); hh = NULL; } g_trace(G_T_TOPOLOGY, "g_retaste(%s)", mp->name); LIST_FOREACH(mp2, &g_classes, class) { LIST_FOREACH(gp, &mp2->geom, geom) { LIST_FOREACH(pp, &gp->provider, provider) { if (pp->acr || pp->acw || pp->ace) continue; LIST_FOREACH(cp, &pp->consumers, consumers) { if (cp->geom->class == mp && (cp->flags & G_CF_ORPHAN) == 0) break; } if (cp != NULL) { cp->flags |= G_CF_ORPHAN; g_wither_geom(cp->geom, ENXIO); } mp->taste(mp, pp, 0); g_topology_assert(); } } } } int g_retaste(struct g_class *mp) { struct g_hh00 *hh; int error; if (mp->taste == NULL) return (EINVAL); hh = g_malloc(sizeof *hh, M_WAITOK | M_ZERO); hh->mp = mp; if (cold) { hh->post = 1; error = g_post_event(g_retaste_event, hh, M_WAITOK, NULL); } else { error = g_waitfor_event(g_retaste_event, hh, M_WAITOK, NULL); if (error == 0) error = hh->error; g_free(hh); } return (error); } struct g_geom * g_new_geomf(struct g_class *mp, const char *fmt, ...) 
{ struct g_geom *gp; va_list ap; struct sbuf *sb; g_topology_assert(); G_VALID_CLASS(mp); sb = sbuf_new_auto(); va_start(ap, fmt); sbuf_vprintf(sb, fmt, ap); va_end(ap); sbuf_finish(sb); gp = g_malloc(sizeof *gp, M_WAITOK | M_ZERO); gp->name = g_malloc(sbuf_len(sb) + 1, M_WAITOK | M_ZERO); gp->class = mp; gp->rank = 1; LIST_INIT(&gp->consumer); LIST_INIT(&gp->provider); LIST_INSERT_HEAD(&mp->geom, gp, geom); TAILQ_INSERT_HEAD(&geoms, gp, geoms); strcpy(gp->name, sbuf_data(sb)); sbuf_delete(sb); /* Fill in defaults from class */ gp->start = mp->start; gp->spoiled = mp->spoiled; gp->attrchanged = mp->attrchanged; gp->providergone = mp->providergone; gp->dumpconf = mp->dumpconf; gp->access = mp->access; gp->orphan = mp->orphan; gp->ioctl = mp->ioctl; gp->resize = mp->resize; return (gp); } void g_destroy_geom(struct g_geom *gp) { g_topology_assert(); G_VALID_GEOM(gp); g_trace(G_T_TOPOLOGY, "g_destroy_geom(%p(%s))", gp, gp->name); KASSERT(LIST_EMPTY(&gp->consumer), ("g_destroy_geom(%s) with consumer(s) [%p]", gp->name, LIST_FIRST(&gp->consumer))); KASSERT(LIST_EMPTY(&gp->provider), ("g_destroy_geom(%s) with provider(s) [%p]", gp->name, LIST_FIRST(&gp->provider))); g_cancel_event(gp); LIST_REMOVE(gp, geom); TAILQ_REMOVE(&geoms, gp, geoms); g_free(gp->name); g_free(gp); } /* * This function is called (repeatedly) until the geom has withered away. */ void g_wither_geom(struct g_geom *gp, int error) { struct g_provider *pp; g_topology_assert(); G_VALID_GEOM(gp); g_trace(G_T_TOPOLOGY, "g_wither_geom(%p(%s))", gp, gp->name); if (!(gp->flags & G_GEOM_WITHER)) { gp->flags |= G_GEOM_WITHER; LIST_FOREACH(pp, &gp->provider, provider) if (!(pp->flags & G_PF_ORPHAN)) g_orphan_provider(pp, error); } g_do_wither(); } /* * Convenience function to destroy a particular provider. 
*/ void g_wither_provider(struct g_provider *pp, int error) { pp->flags |= G_PF_WITHER; if (!(pp->flags & G_PF_ORPHAN)) g_orphan_provider(pp, error); } /* * This function is called (repeatedly) until the has withered away. */ void g_wither_geom_close(struct g_geom *gp, int error) { struct g_consumer *cp; g_topology_assert(); G_VALID_GEOM(gp); g_trace(G_T_TOPOLOGY, "g_wither_geom_close(%p(%s))", gp, gp->name); LIST_FOREACH(cp, &gp->consumer, consumer) if (cp->acr || cp->acw || cp->ace) g_access(cp, -cp->acr, -cp->acw, -cp->ace); g_wither_geom(gp, error); } /* * This function is called (repeatedly) until we cant wash away more * withered bits at present. */ void g_wither_washer() { struct g_class *mp; struct g_geom *gp, *gp2; struct g_provider *pp, *pp2; struct g_consumer *cp, *cp2; g_topology_assert(); LIST_FOREACH(mp, &g_classes, class) { LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { LIST_FOREACH_SAFE(pp, &gp->provider, provider, pp2) { if (!(pp->flags & G_PF_WITHER)) continue; if (LIST_EMPTY(&pp->consumers)) g_destroy_provider(pp); } if (!(gp->flags & G_GEOM_WITHER)) continue; LIST_FOREACH_SAFE(pp, &gp->provider, provider, pp2) { if (LIST_EMPTY(&pp->consumers)) g_destroy_provider(pp); } LIST_FOREACH_SAFE(cp, &gp->consumer, consumer, cp2) { if (cp->acr || cp->acw || cp->ace) continue; if (cp->provider != NULL) g_detach(cp); g_destroy_consumer(cp); } if (LIST_EMPTY(&gp->provider) && LIST_EMPTY(&gp->consumer)) g_destroy_geom(gp); } } } struct g_consumer * g_new_consumer(struct g_geom *gp) { struct g_consumer *cp; g_topology_assert(); G_VALID_GEOM(gp); KASSERT(!(gp->flags & G_GEOM_WITHER), ("g_new_consumer on WITHERing geom(%s) (class %s)", gp->name, gp->class->name)); KASSERT(gp->orphan != NULL, ("g_new_consumer on geom(%s) (class %s) without orphan", gp->name, gp->class->name)); cp = g_malloc(sizeof *cp, M_WAITOK | M_ZERO); cp->geom = gp; cp->stat = devstat_new_entry(cp, -1, 0, DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX); 
LIST_INSERT_HEAD(&gp->consumer, cp, consumer); return(cp); } void g_destroy_consumer(struct g_consumer *cp) { struct g_geom *gp; g_topology_assert(); G_VALID_CONSUMER(cp); g_trace(G_T_TOPOLOGY, "g_destroy_consumer(%p)", cp); KASSERT (cp->provider == NULL, ("g_destroy_consumer but attached")); KASSERT (cp->acr == 0, ("g_destroy_consumer with acr")); KASSERT (cp->acw == 0, ("g_destroy_consumer with acw")); KASSERT (cp->ace == 0, ("g_destroy_consumer with ace")); g_cancel_event(cp); gp = cp->geom; LIST_REMOVE(cp, consumer); devstat_remove_entry(cp->stat); g_free(cp); if (gp->flags & G_GEOM_WITHER) g_do_wither(); } static void g_new_provider_event(void *arg, int flag) { struct g_class *mp; struct g_provider *pp; struct g_consumer *cp, *next_cp; g_topology_assert(); if (flag == EV_CANCEL) return; if (g_shutdown) return; pp = arg; G_VALID_PROVIDER(pp); KASSERT(!(pp->flags & G_PF_WITHER), ("g_new_provider_event but withered")); LIST_FOREACH_SAFE(cp, &pp->consumers, consumers, next_cp) { if ((cp->flags & G_CF_ORPHAN) == 0 && cp->geom->attrchanged != NULL) cp->geom->attrchanged(cp, "GEOM::media"); } if (g_notaste) return; LIST_FOREACH(mp, &g_classes, class) { if (mp->taste == NULL) continue; LIST_FOREACH(cp, &pp->consumers, consumers) if (cp->geom->class == mp && (cp->flags & G_CF_ORPHAN) == 0) break; if (cp != NULL) continue; mp->taste(mp, pp, 0); g_topology_assert(); } } struct g_provider * g_new_providerf(struct g_geom *gp, const char *fmt, ...) 
{ struct g_provider *pp; struct sbuf *sb; va_list ap; g_topology_assert(); G_VALID_GEOM(gp); KASSERT(gp->access != NULL, ("new provider on geom(%s) without ->access (class %s)", gp->name, gp->class->name)); KASSERT(gp->start != NULL, ("new provider on geom(%s) without ->start (class %s)", gp->name, gp->class->name)); KASSERT(!(gp->flags & G_GEOM_WITHER), ("new provider on WITHERing geom(%s) (class %s)", gp->name, gp->class->name)); sb = sbuf_new_auto(); va_start(ap, fmt); sbuf_vprintf(sb, fmt, ap); va_end(ap); sbuf_finish(sb); pp = g_malloc(sizeof *pp + sbuf_len(sb) + 1, M_WAITOK | M_ZERO); pp->name = (char *)(pp + 1); strcpy(pp->name, sbuf_data(sb)); sbuf_delete(sb); LIST_INIT(&pp->consumers); pp->error = ENXIO; pp->geom = gp; pp->stat = devstat_new_entry(pp, -1, 0, DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX); LIST_INSERT_HEAD(&gp->provider, pp, provider); g_post_event(g_new_provider_event, pp, M_WAITOK, pp, gp, NULL); return (pp); } void g_error_provider(struct g_provider *pp, int error) { /* G_VALID_PROVIDER(pp); We may not have g_topology */ pp->error = error; } static void g_resize_provider_event(void *arg, int flag) { struct g_hh00 *hh; struct g_class *mp; struct g_geom *gp; struct g_provider *pp; struct g_consumer *cp, *cp2; off_t size; g_topology_assert(); if (g_shutdown) return; hh = arg; pp = hh->pp; size = hh->size; g_free(hh); G_VALID_PROVIDER(pp); KASSERT(!(pp->flags & G_PF_WITHER), ("g_resize_provider_event but withered")); g_trace(G_T_TOPOLOGY, "g_resize_provider_event(%p)", pp); LIST_FOREACH_SAFE(cp, &pp->consumers, consumers, cp2) { gp = cp->geom; if (gp->resize == NULL && size < pp->mediasize) { cp->flags |= G_CF_ORPHAN; cp->geom->orphan(cp); } } pp->mediasize = size; LIST_FOREACH_SAFE(cp, &pp->consumers, consumers, cp2) { gp = cp->geom; if ((gp->flags & G_GEOM_WITHER) == 0 && gp->resize != NULL) gp->resize(cp); } /* * After resizing, the previously invalid GEOM class metadata * might become valid. 
This means we should retaste. */ LIST_FOREACH(mp, &g_classes, class) { if (mp->taste == NULL) continue; LIST_FOREACH(cp, &pp->consumers, consumers) if (cp->geom->class == mp && (cp->flags & G_CF_ORPHAN) == 0) break; if (cp != NULL) continue; mp->taste(mp, pp, 0); g_topology_assert(); } } void g_resize_provider(struct g_provider *pp, off_t size) { struct g_hh00 *hh; G_VALID_PROVIDER(pp); if (pp->flags & G_PF_WITHER) return; if (size == pp->mediasize) return; hh = g_malloc(sizeof *hh, M_WAITOK | M_ZERO); hh->pp = pp; hh->size = size; g_post_event(g_resize_provider_event, hh, M_WAITOK, NULL); } #ifndef _PATH_DEV #define _PATH_DEV "/dev/" #endif struct g_provider * g_provider_by_name(char const *arg) { struct g_class *cp; struct g_geom *gp; struct g_provider *pp, *wpp; if (strncmp(arg, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0) arg += sizeof(_PATH_DEV) - 1; wpp = NULL; LIST_FOREACH(cp, &g_classes, class) { LIST_FOREACH(gp, &cp->geom, geom) { LIST_FOREACH(pp, &gp->provider, provider) { if (strcmp(arg, pp->name) != 0) continue; if ((gp->flags & G_GEOM_WITHER) == 0 && (pp->flags & G_PF_WITHER) == 0) return (pp); else wpp = pp; } } } return (wpp); } void g_destroy_provider(struct g_provider *pp) { struct g_geom *gp; g_topology_assert(); G_VALID_PROVIDER(pp); KASSERT(LIST_EMPTY(&pp->consumers), ("g_destroy_provider but attached")); KASSERT (pp->acr == 0, ("g_destroy_provider with acr")); KASSERT (pp->acw == 0, ("g_destroy_provider with acw")); KASSERT (pp->ace == 0, ("g_destroy_provider with ace")); g_cancel_event(pp); LIST_REMOVE(pp, provider); gp = pp->geom; devstat_remove_entry(pp->stat); /* * If a callback was provided, send notification that the provider * is now gone. */ if (gp->providergone != NULL) gp->providergone(pp); g_free(pp); if ((gp->flags & G_GEOM_WITHER)) g_do_wither(); } /* * We keep the "geoms" list sorted by topological order (== increasing * numerical rank) at all times. 
 * When an attach is done, the attaching geom's rank is invalidated
 * and it is moved to the tail of the list.
 * All geoms later in the sequence have their ranks reevaluated in
 * sequence.  If we cannot assign rank to a geom because its
 * prerequisites do not have rank, we move that element to the tail
 * of the sequence with invalid rank as well.
 * At some point we encounter our original geom and if we still fail
 * to assign it a rank, there must be a loop and we fail back to
 * g_attach() which detaches again and calls redo_rank again
 * to fix up the damage.
 * It would be much simpler code-wise to do it recursively, but we
 * can't risk that on the kernel stack.
 *
 * Returns 0 once the walk runs off the end of the list, or ELOOP when
 * "gp" itself is reached and still cannot be given a rank.
 */
static int
redo_rank(struct g_geom *gp)
{
	struct g_consumer *cp;
	struct g_geom *gp1, *gp2;
	int n, m;

	g_topology_assert();
	G_VALID_GEOM(gp);

	/* Invalidate this geom's rank and move it to the tail */
	gp1 = TAILQ_NEXT(gp, geoms);
	if (gp1 != NULL) {
		gp->rank = 0;
		TAILQ_REMOVE(&geoms, gp, geoms);
		TAILQ_INSERT_TAIL(&geoms, gp, geoms);
	} else {
		/* gp has no successor, so the walk below starts at gp itself. */
		gp1 = gp;
	}
	/* re-rank the rest of the sequence */
	for (; gp1 != NULL; gp1 = gp2) {
		gp1->rank = 0;
		/* m is the candidate rank: one more than the highest provider rank seen. */
		m = 1;
		LIST_FOREACH(cp, &gp1->consumer, consumer) {
			if (cp->provider == NULL)
				continue;
			n = cp->provider->geom->rank;
			if (n == 0) {
				/* An attached provider's geom is unranked: gp1 gets no rank. */
				m = 0;
				break;
			} else if (n >= m)
				m = n + 1;
		}
		gp1->rank = m;
		/* Latch the successor before gp1 is possibly moved to the tail. */
		gp2 = TAILQ_NEXT(gp1, geoms);

		/* got a rank, moving on */
		if (m != 0)
			continue;

		/* no rank to original geom means loop */
		if (gp == gp1)
			return (ELOOP);

		/* no rank, put it at the end move on */
		TAILQ_REMOVE(&geoms, gp1, geoms);
		TAILQ_INSERT_TAIL(&geoms, gp1, geoms);
	}
	return (0);
}

/*
 * Attach consumer "cp" to provider "pp" and re-rank the consumer's geom.
 * If redo_rank() fails, the attach is undone (consumer unlinked,
 * cp->provider reset to NULL), the ranks are recomputed once more and
 * the error from the first redo_rank() call is returned.
 */
int
g_attach(struct g_consumer *cp, struct g_provider *pp)
{
	int error;

	g_topology_assert();
	G_VALID_CONSUMER(cp);
	G_VALID_PROVIDER(pp);
	g_trace(G_T_TOPOLOGY, "g_attach(%p, %p)", cp, pp);
	KASSERT(cp->provider == NULL, ("attach but attached"));
	cp->provider = pp;
	LIST_INSERT_HEAD(&pp->consumers, cp, consumers);
	error = redo_rank(cp->geom);
	if (error) {
		LIST_REMOVE(cp, consumers);
		cp->provider = NULL;
		redo_rank(cp->geom);
	}
	return (error);
}

/*
 * Detach consumer "cp" from its provider.  The consumer must hold no
 * access counts and have no requests in flight (nstart == nend); these
 * are asserted.  g_do_wither() is called when the consumer's geom, the
 * provider's geom or the provider itself is flagged as withering, and
 * the consumer's geom is re-ranked afterwards.
 */
void
g_detach(struct g_consumer *cp)
{
	struct g_provider *pp;

	g_topology_assert();
	G_VALID_CONSUMER(cp);
	g_trace(G_T_TOPOLOGY, "g_detach(%p)", cp);
	KASSERT(cp->provider != NULL, ("detach but not attached"));
	KASSERT(cp->acr == 0, ("detach but nonzero acr"));
	KASSERT(cp->acw == 0, ("detach but nonzero acw"));
	KASSERT(cp->ace == 0, ("detach but nonzero ace"));
	KASSERT(cp->nstart == cp->nend,
	    ("detach with active requests"));
	/* Keep the provider pointer: it is still needed for the wither checks. */
	pp = cp->provider;
	LIST_REMOVE(cp, consumers);
	cp->provider = NULL;
	if ((cp->geom->flags & G_GEOM_WITHER) ||
	    (pp->geom->flags & G_GEOM_WITHER) ||
	    (pp->flags & G_PF_WITHER))
		g_do_wither();
	redo_rank(cp->geom);
}

/*
 * g_access()
 *
 * Access-check with delta values.  The question asked is "can consumer
 * "cp" change the access counters by the relative amounts dc[rwe] ?"
 *
 * Returns ENXIO when the consumer has been spoiled and tries to open
 * more, EPERM on a write/exclusive conflict, the provider's own error
 * when it is errored and more access is requested, and otherwise
 * whatever the provider geom's ->access() method returns.  The counters
 * on both the provider and the consumer are only updated when that
 * method returns 0.
 */
int
g_access(struct g_consumer *cp, int dcr, int dcw, int dce)
{
	struct g_provider *pp;
	int pr,pw,pe;
	int error;

	g_topology_assert();
	G_VALID_CONSUMER(cp);
	pp = cp->provider;
	KASSERT(pp != NULL, ("access but not attached"));
	G_VALID_PROVIDER(pp);

	g_trace(G_T_ACCESS, "g_access(%p(%s), %d, %d, %d)",
	    cp, pp->name, dcr, dcw, dce);

	KASSERT(cp->acr + dcr >= 0, ("access resulting in negative acr"));
	KASSERT(cp->acw + dcw >= 0, ("access resulting in negative acw"));
	KASSERT(cp->ace + dce >= 0, ("access resulting in negative ace"));
	KASSERT(dcr != 0 || dcw != 0 || dce != 0, ("NOP access request"));
	KASSERT(pp->geom->access != NULL, ("NULL geom->access"));

	/*
	 * If our class cares about being spoiled, and we have been, we
	 * are probably just ahead of the event telling us that.  Fail
	 * now rather than having to unravel this later.
	 */
	if (cp->geom->spoiled != NULL && (cp->flags & G_CF_SPOILED) &&
	    (dcr > 0 || dcw > 0 || dce > 0))
		return (ENXIO);

	/*
	 * Figure out what counts the provider would have had, if this
	 * consumer had (r0w0e0) at this time.
	 */
	pr = pp->acr - cp->acr;
	pw = pp->acw - cp->acw;
	pe = pp->ace - cp->ace;

	g_trace(G_T_ACCESS,
	    "open delta:[r%dw%de%d] old:[r%dw%de%d] provider:[r%dw%de%d] %p(%s)",
	    dcr, dcw, dce,
	    cp->acr, cp->acw, cp->ace,
	    pp->acr, pp->acw, pp->ace,
	    pp, pp->name);

	/* If foot-shooting is enabled, any open on rank#1 is OK */
	if ((g_debugflags & 16) && pp->geom->rank == 1)
		;
	/* If we try exclusive but already write: fail */
	else if (dce > 0 && pw > 0)
		return (EPERM);
	/* If we try write but already exclusive: fail */
	else if (dcw > 0 && pe > 0)
		return (EPERM);
	/* If we try to open more but provider is error'ed: fail */
	else if ((dcr > 0 || dcw > 0 || dce > 0) && pp->error != 0) {
		printf("%s(%d): provider %s has error\n",
		    __func__, __LINE__, pp->name);
		return (pp->error);
	}

	/* Ok then... */

	error = pp->geom->access(pp, dcr, dcw, dce);
	/* A request that only lowers counts (a close) is asserted never to fail. */
	KASSERT(dcr > 0 || dcw > 0 || dce > 0 || error == 0,
	    ("Geom provider %s::%s dcr=%d dcw=%d dce=%d error=%d failed "
	    "closing ->access()", pp->geom->class->name, pp->name, dcr, dcw,
	    dce, error));
	if (!error) {
		/*
		 * If we open first write, spoil any partner consumers.
		 * If we close last write and provider is not errored,
		 * trigger re-taste.
		 */
		if (pp->acw == 0 && dcw != 0)
			g_spoil(pp, cp);
		else if (pp->acw != 0 && pp->acw == -dcw && pp->error == 0 &&
		    !(pp->geom->flags & G_GEOM_WITHER))
			g_post_event(g_new_provider_event, pp, M_WAITOK,
			    pp, NULL);

		pp->acr += dcr;
		pp->acw += dcw;
		pp->ace += dce;
		cp->acr += dcr;
		cp->acw += dcw;
		cp->ace += dce;
		if (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)
			KASSERT(pp->sectorsize > 0,
			    ("Provider %s lacks sectorsize", pp->name));
		if ((cp->geom->flags & G_GEOM_WITHER) &&
		    cp->acr == 0 && cp->acw == 0 && cp->ace == 0)
			g_do_wither();
	}
	return (error);
}

/* Typed wrapper: hands the int "val" to g_handleattr() by address and size. */
int
g_handleattr_int(struct bio *bp, const char *attribute, int val)
{

	return (g_handleattr(bp, attribute, &val, sizeof val));
}

/* Typed wrapper: hands the uint16_t "val" to g_handleattr() by address and size. */
int
g_handleattr_uint16_t(struct bio *bp, const char *attribute, uint16_t val)
{

	return (g_handleattr(bp, attribute, &val, sizeof val));
}

/* Typed wrapper: hands the off_t "val" to g_handleattr() by address and size. */
int
g_handleattr_off_t(struct bio *bp, const char *attribute, off_t val)
{

	return (g_handleattr(bp, attribute, &val, sizeof val));
}

/* String wrapper: a length of 0 selects the string path in g_handleattr(). */
int
g_handleattr_str(struct bio *bp, const char *attribute, const char *str)
{

	return (g_handleattr(bp, attribute, str, 0));
}

/*
 * Answer the attribute request in "bp" if it asks for "attribute".
 *
 * Returns 0, without touching the bio, when bp->bio_attribute does not
 * match "attribute".  Otherwise the value is copied into bp->bio_data,
 * the bio is delivered with g_io_deliver() and 1 is returned:
 *  - len == 0: "val" is treated as a NUL-terminated string; the buffer
 *    is zeroed first and EFAULT is delivered if the string does not fit
 *    in bio_length bytes.
 *  - len != 0: "val" is copied verbatim, but only when bio_length equals
 *    len exactly; any other size delivers EFAULT.
 * bio_completed is set to bio_length only when no error occurred.
 */
int
g_handleattr(struct bio *bp, const char *attribute, const void *val, int len)
{
	int error = 0;

	if (strcmp(bp->bio_attribute, attribute))
		return (0);
	if (len == 0) {
		bzero(bp->bio_data, bp->bio_length);
		if (strlcpy(bp->bio_data, val, bp->bio_length) >=
		    bp->bio_length) {
			printf("%s: %s bio_length %jd len %zu -> EFAULT\n",
			    __func__, bp->bio_to->name,
			    (intmax_t)bp->bio_length, strlen(val));
			error = EFAULT;
		}
	} else if (bp->bio_length == len) {
		bcopy(val, bp->bio_data, len);
	} else {
		printf("%s: %s bio_length %jd len %d -> EFAULT\n", __func__,
		    bp->bio_to->name, (intmax_t)bp->bio_length, len);
		error = EFAULT;
	}
	if (error == 0)
		bp->bio_completed = bp->bio_length;
	g_io_deliver(bp, error);
	return (1);
}

/*
 * Access method that accepts every request: it validates the provider
 * and returns 0 regardless of the (unused) deltas.
 */
int
g_std_access(struct g_provider *pp,
	int dr __unused, int dw __unused, int de __unused)
{

	g_topology_assert();
	G_VALID_PROVIDER(pp);
        return (0);
}

/*
 * Fold the outcome of bio "bp" into its parent (bp->bio_parent): the
 * first non-zero error is kept, completed byte counts are summed and
 * "bp" is destroyed.  The parent is delivered once its bio_inbed count
 * reaches bio_children.
 */
void
g_std_done(struct bio *bp)
{
	struct bio *bp2;

	bp2 = bp->bio_parent;
	if (bp2->bio_error == 0)
		bp2->bio_error = bp->bio_error;
	bp2->bio_completed += bp->bio_completed;
	g_destroy_bio(bp);
	bp2->bio_inbed++;
	if (bp2->bio_children == bp2->bio_inbed)
		g_io_deliver(bp2, bp2->bio_error);
}

/* XXX: maybe this is only g_slice_spoiled */

/*
 * Spoil handler: flags the consumer as orphaned, detaches it, orphans
 * every provider of the consumer's geom with ENXIO and destroys the
 * consumer.  The geom is destroyed right away when it has no providers
 * and no consumers left; otherwise it is flagged G_GEOM_WITHER.
 */
void
g_std_spoiled(struct g_consumer *cp)
{
	struct g_geom *gp;
	struct g_provider *pp;

	g_topology_assert();
	G_VALID_CONSUMER(cp);
	g_trace(G_T_TOPOLOGY, "g_std_spoiled(%p)", cp);
	cp->flags |= G_CF_ORPHAN;
	g_detach(cp);
	/* Remember the geom: cp is destroyed below. */
	gp = cp->geom;
	LIST_FOREACH(pp, &gp->provider, provider)
		g_orphan_provider(pp, ENXIO);
	g_destroy_consumer(cp);
	if (LIST_EMPTY(&gp->provider) && LIST_EMPTY(&gp->consumer))
		g_destroy_geom(gp);
	else
		gp->flags |= G_GEOM_WITHER;
}

/*
 * Spoiling happens when a provider is opened for writing, but consumers
 * which are configured by in-band data are attached (slicers for instance).
 * Since the write might potentially change the in-band data, such consumers
 * need to re-evaluate their existence after the writing session closes.
 * We do this by (offering to) tear them down when the open for write happens
 * in return for a re-taste when it closes again.
 * Together with the fact that such consumers grab an 'e' bit whenever they
 * are open, regardless of mode, this ends up DTRT.
*/ static void g_spoil_event(void *arg, int flag) { struct g_provider *pp; struct g_consumer *cp, *cp2; g_topology_assert(); if (flag == EV_CANCEL) return; pp = arg; G_VALID_PROVIDER(pp); g_trace(G_T_TOPOLOGY, "%s %p(%s:%s:%s)", __func__, pp, pp->geom->class->name, pp->geom->name, pp->name); for (cp = LIST_FIRST(&pp->consumers); cp != NULL; cp = cp2) { cp2 = LIST_NEXT(cp, consumers); if ((cp->flags & G_CF_SPOILED) == 0) continue; cp->flags &= ~G_CF_SPOILED; if (cp->geom->spoiled == NULL) continue; cp->geom->spoiled(cp); g_topology_assert(); } } void g_spoil(struct g_provider *pp, struct g_consumer *cp) { struct g_consumer *cp2; g_topology_assert(); G_VALID_PROVIDER(pp); G_VALID_CONSUMER(cp); LIST_FOREACH(cp2, &pp->consumers, consumers) { if (cp2 == cp) continue; /* KASSERT(cp2->acr == 0, ("spoiling cp->acr = %d", cp2->acr)); KASSERT(cp2->acw == 0, ("spoiling cp->acw = %d", cp2->acw)); */ KASSERT(cp2->ace == 0, ("spoiling cp->ace = %d", cp2->ace)); cp2->flags |= G_CF_SPOILED; } g_post_event(g_spoil_event, pp, M_WAITOK, pp, NULL); } static void g_media_changed_event(void *arg, int flag) { struct g_provider *pp; int retaste; g_topology_assert(); if (flag == EV_CANCEL) return; pp = arg; G_VALID_PROVIDER(pp); /* * If provider was not open for writing, queue retaste after spoiling. * If it was, retaste will happen automatically on close. 
*/ retaste = (pp->acw == 0 && pp->error == 0 && !(pp->geom->flags & G_GEOM_WITHER)); g_spoil_event(arg, flag); if (retaste) g_post_event(g_new_provider_event, pp, M_WAITOK, pp, NULL); } int g_media_changed(struct g_provider *pp, int flag) { struct g_consumer *cp; LIST_FOREACH(cp, &pp->consumers, consumers) cp->flags |= G_CF_SPOILED; return (g_post_event(g_media_changed_event, pp, flag, pp, NULL)); } int g_media_gone(struct g_provider *pp, int flag) { struct g_consumer *cp; LIST_FOREACH(cp, &pp->consumers, consumers) cp->flags |= G_CF_SPOILED; return (g_post_event(g_spoil_event, pp, flag, pp, NULL)); } int g_getattr__(const char *attr, struct g_consumer *cp, void *var, int len) { int error, i; i = len; error = g_io_getattr(attr, cp, &i, var); if (error) return (error); if (i != len) return (EINVAL); return (0); } static int g_get_device_prefix_len(const char *name) { int len; if (strncmp(name, "ada", 3) == 0) len = 3; else if (strncmp(name, "ad", 2) == 0) len = 2; else return (0); if (name[len] < '0' || name[len] > '9') return (0); do { len++; } while (name[len] >= '0' && name[len] <= '9'); return (len); } int g_compare_names(const char *namea, const char *nameb) { int deva, devb; if (strcmp(namea, nameb) == 0) return (1); deva = g_get_device_prefix_len(namea); if (deva == 0) return (0); devb = g_get_device_prefix_len(nameb); if (devb == 0) return (0); if (strcmp(namea + deva, nameb + devb) == 0) return (1); return (0); } #if defined(DIAGNOSTIC) || defined(DDB) /* * This function walks the mesh and returns a non-zero integer if it * finds the argument pointer is an object. The return value indicates * which type of object it is believed to be. If topology is not locked, * this function is potentially dangerous, but we don't assert that the * topology lock is held when called from debugger. 
*/ int g_valid_obj(void const *ptr) { struct g_class *mp; struct g_geom *gp; struct g_consumer *cp; struct g_provider *pp; #ifdef KDB if (kdb_active == 0) #endif g_topology_assert(); LIST_FOREACH(mp, &g_classes, class) { if (ptr == mp) return (1); LIST_FOREACH(gp, &mp->geom, geom) { if (ptr == gp) return (2); LIST_FOREACH(cp, &gp->consumer, consumer) if (ptr == cp) return (3); LIST_FOREACH(pp, &gp->provider, provider) if (ptr == pp) return (4); } } return(0); } #endif #ifdef DDB #define gprintf(...) do { \ db_printf("%*s", indent, ""); \ db_printf(__VA_ARGS__); \ } while (0) #define gprintln(...) do { \ gprintf(__VA_ARGS__); \ db_printf("\n"); \ } while (0) #define ADDFLAG(obj, flag, sflag) do { \ if ((obj)->flags & (flag)) { \ if (comma) \ strlcat(str, ",", size); \ strlcat(str, (sflag), size); \ comma = 1; \ } \ } while (0) static char * provider_flags_to_string(struct g_provider *pp, char *str, size_t size) { int comma = 0; bzero(str, size); if (pp->flags == 0) { strlcpy(str, "NONE", size); return (str); } ADDFLAG(pp, G_PF_WITHER, "G_PF_WITHER"); ADDFLAG(pp, G_PF_ORPHAN, "G_PF_ORPHAN"); return (str); } static char * geom_flags_to_string(struct g_geom *gp, char *str, size_t size) { int comma = 0; bzero(str, size); if (gp->flags == 0) { strlcpy(str, "NONE", size); return (str); } ADDFLAG(gp, G_GEOM_WITHER, "G_GEOM_WITHER"); return (str); } static void db_show_geom_consumer(int indent, struct g_consumer *cp) { if (indent == 0) { gprintln("consumer: %p", cp); gprintln(" class: %s (%p)", cp->geom->class->name, cp->geom->class); gprintln(" geom: %s (%p)", cp->geom->name, cp->geom); if (cp->provider == NULL) gprintln(" provider: none"); else { gprintln(" provider: %s (%p)", cp->provider->name, cp->provider); } gprintln(" access: r%dw%de%d", cp->acr, cp->acw, cp->ace); gprintln(" flags: 0x%04x", cp->flags); gprintln(" nstart: %u", cp->nstart); gprintln(" nend: %u", cp->nend); } else { gprintf("consumer: %p (%s), access=r%dw%de%d", cp, cp->provider != NULL ? 
cp->provider->name : "none", cp->acr, cp->acw, cp->ace); if (cp->flags) db_printf(", flags=0x%04x", cp->flags); db_printf("\n"); } } static void db_show_geom_provider(int indent, struct g_provider *pp) { struct g_consumer *cp; char flags[64]; if (indent == 0) { gprintln("provider: %s (%p)", pp->name, pp); gprintln(" class: %s (%p)", pp->geom->class->name, pp->geom->class); gprintln(" geom: %s (%p)", pp->geom->name, pp->geom); gprintln(" mediasize: %jd", (intmax_t)pp->mediasize); gprintln(" sectorsize: %u", pp->sectorsize); gprintln(" stripesize: %u", pp->stripesize); gprintln(" stripeoffset: %u", pp->stripeoffset); gprintln(" access: r%dw%de%d", pp->acr, pp->acw, pp->ace); gprintln(" flags: %s (0x%04x)", provider_flags_to_string(pp, flags, sizeof(flags)), pp->flags); gprintln(" error: %d", pp->error); gprintln(" nstart: %u", pp->nstart); gprintln(" nend: %u", pp->nend); if (LIST_EMPTY(&pp->consumers)) gprintln(" consumers: none"); } else { gprintf("provider: %s (%p), access=r%dw%de%d", pp->name, pp, pp->acr, pp->acw, pp->ace); if (pp->flags != 0) { db_printf(", flags=%s (0x%04x)", provider_flags_to_string(pp, flags, sizeof(flags)), pp->flags); } db_printf("\n"); } if (!LIST_EMPTY(&pp->consumers)) { LIST_FOREACH(cp, &pp->consumers, consumers) { db_show_geom_consumer(indent + 2, cp); if (db_pager_quit) break; } } } static void db_show_geom_geom(int indent, struct g_geom *gp) { struct g_provider *pp; struct g_consumer *cp; char flags[64]; if (indent == 0) { gprintln("geom: %s (%p)", gp->name, gp); gprintln(" class: %s (%p)", gp->class->name, gp->class); gprintln(" flags: %s (0x%04x)", geom_flags_to_string(gp, flags, sizeof(flags)), gp->flags); gprintln(" rank: %d", gp->rank); if (LIST_EMPTY(&gp->provider)) gprintln(" providers: none"); if (LIST_EMPTY(&gp->consumer)) gprintln(" consumers: none"); } else { gprintf("geom: %s (%p), rank=%d", gp->name, gp, gp->rank); if (gp->flags != 0) { db_printf(", flags=%s (0x%04x)", geom_flags_to_string(gp, flags, sizeof(flags)), 
gp->flags); } db_printf("\n"); } if (!LIST_EMPTY(&gp->provider)) { LIST_FOREACH(pp, &gp->provider, provider) { db_show_geom_provider(indent + 2, pp); if (db_pager_quit) break; } } if (!LIST_EMPTY(&gp->consumer)) { LIST_FOREACH(cp, &gp->consumer, consumer) { db_show_geom_consumer(indent + 2, cp); if (db_pager_quit) break; } } } static void db_show_geom_class(struct g_class *mp) { struct g_geom *gp; db_printf("class: %s (%p)\n", mp->name, mp); LIST_FOREACH(gp, &mp->geom, geom) { db_show_geom_geom(2, gp); if (db_pager_quit) break; } } /* * Print the GEOM topology or the given object. */ DB_SHOW_COMMAND(geom, db_show_geom) { struct g_class *mp; if (!have_addr) { /* No address given, print the entire topology. */ LIST_FOREACH(mp, &g_classes, class) { db_show_geom_class(mp); db_printf("\n"); if (db_pager_quit) break; } } else { switch (g_valid_obj((void *)addr)) { case 1: db_show_geom_class((struct g_class *)addr); break; case 2: db_show_geom_geom(0, (struct g_geom *)addr); break; case 3: db_show_geom_consumer(0, (struct g_consumer *)addr); break; case 4: db_show_geom_provider(0, (struct g_provider *)addr); break; default: db_printf("Not a GEOM object.\n"); break; } } } static void db_print_bio_cmd(struct bio *bp) { db_printf(" cmd: "); switch (bp->bio_cmd) { case BIO_READ: db_printf("BIO_READ"); break; case BIO_WRITE: db_printf("BIO_WRITE"); break; case BIO_DELETE: db_printf("BIO_DELETE"); break; case BIO_GETATTR: db_printf("BIO_GETATTR"); break; case BIO_FLUSH: db_printf("BIO_FLUSH"); break; case BIO_CMD0: db_printf("BIO_CMD0"); break; case BIO_CMD1: db_printf("BIO_CMD1"); break; case BIO_CMD2: db_printf("BIO_CMD2"); break; case BIO_ZONE: db_printf("BIO_ZONE"); break; default: db_printf("UNKNOWN"); break; } db_printf("\n"); } static void db_print_bio_flags(struct bio *bp) { int comma; comma = 0; db_printf(" flags: "); if (bp->bio_flags & BIO_ERROR) { db_printf("BIO_ERROR"); comma = 1; } if (bp->bio_flags & BIO_DONE) { db_printf("%sBIO_DONE", (comma ? 
", " : "")); comma = 1; } if (bp->bio_flags & BIO_ONQUEUE) db_printf("%sBIO_ONQUEUE", (comma ? ", " : "")); db_printf("\n"); } /* * Print useful information in a BIO */ DB_SHOW_COMMAND(bio, db_show_bio) { struct bio *bp; if (have_addr) { bp = (struct bio *)addr; db_printf("BIO %p\n", bp); db_print_bio_cmd(bp); db_print_bio_flags(bp); db_printf(" cflags: 0x%hx\n", bp->bio_cflags); db_printf(" pflags: 0x%hx\n", bp->bio_pflags); db_printf(" offset: %jd\n", (intmax_t)bp->bio_offset); db_printf(" length: %jd\n", (intmax_t)bp->bio_length); db_printf(" bcount: %ld\n", bp->bio_bcount); db_printf(" resid: %ld\n", bp->bio_resid); db_printf(" completed: %jd\n", (intmax_t)bp->bio_completed); db_printf(" children: %u\n", bp->bio_children); db_printf(" inbed: %u\n", bp->bio_inbed); db_printf(" error: %d\n", bp->bio_error); db_printf(" parent: %p\n", bp->bio_parent); db_printf(" driver1: %p\n", bp->bio_driver1); db_printf(" driver2: %p\n", bp->bio_driver2); db_printf(" caller1: %p\n", bp->bio_caller1); db_printf(" caller2: %p\n", bp->bio_caller2); db_printf(" bio_from: %p\n", bp->bio_from); db_printf(" bio_to: %p\n", bp->bio_to); + +#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) + db_printf(" bio_track_bp: %p\n", bp->bio_track_bp); +#endif } } #undef gprintf #undef gprintln #undef ADDFLAG #endif /* DDB */ Index: head/sys/geom/geom_vfs.c =================================================================== --- head/sys/geom/geom_vfs.c (revision 308154) +++ head/sys/geom/geom_vfs.c (revision 308155) @@ -1,284 +1,288 @@ /*- * Copyright (c) 2004 Poul-Henning Kamp * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include /* * subroutines for use by filesystems. * * XXX: should maybe live somewhere else ? 
*/ #include struct g_vfs_softc { struct mtx sc_mtx; struct bufobj *sc_bo; int sc_active; int sc_orphaned; }; static struct buf_ops __g_vfs_bufops = { .bop_name = "GEOM_VFS", .bop_write = bufwrite, .bop_strategy = g_vfs_strategy, .bop_sync = bufsync, .bop_bdflush = bufbdflush }; struct buf_ops *g_vfs_bufops = &__g_vfs_bufops; static g_orphan_t g_vfs_orphan; static struct g_class g_vfs_class = { .name = "VFS", .version = G_VERSION, .orphan = g_vfs_orphan, }; DECLARE_GEOM_CLASS(g_vfs_class, g_vfs); static void g_vfs_destroy(void *arg, int flags __unused) { struct g_consumer *cp; g_topology_assert(); cp = arg; if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) g_access(cp, -cp->acr, -cp->acw, -cp->ace); g_detach(cp); if (cp->geom->softc == NULL) g_wither_geom(cp->geom, ENXIO); } static void g_vfs_done(struct bio *bip) { struct g_consumer *cp; struct g_vfs_softc *sc; struct buf *bp; int destroy; struct mount *mp; struct vnode *vp; struct cdev *cdevp; /* * Collect statistics on synchronous and asynchronous read * and write counts for disks that have associated filesystems. */ bp = bip->bio_caller2; vp = bp->b_vp; if (vp != NULL) { /* * If not a disk vnode, use its associated mount point * otherwise use the mountpoint associated with the disk. 
*/ VI_LOCK(vp); if (vp->v_type != VCHR || (cdevp = vp->v_rdev) == NULL || cdevp->si_devsw == NULL || (cdevp->si_devsw->d_flags & D_DISK) == 0) mp = vp->v_mount; else mp = cdevp->si_mountpt; if (mp != NULL) { if (bp->b_iocmd == BIO_READ) { if (LK_HOLDER(bp->b_lock.lk_lock) == LK_KERNPROC) mp->mnt_stat.f_asyncreads++; else mp->mnt_stat.f_syncreads++; } else if (bp->b_iocmd == BIO_WRITE) { if (LK_HOLDER(bp->b_lock.lk_lock) == LK_KERNPROC) mp->mnt_stat.f_asyncwrites++; else mp->mnt_stat.f_syncwrites++; } } VI_UNLOCK(vp); } cp = bip->bio_from; sc = cp->geom->softc; if (bip->bio_error) { printf("g_vfs_done():"); g_print_bio(bip); printf("error = %d\n", bip->bio_error); } bp->b_error = bip->bio_error; bp->b_ioflags = bip->bio_flags; if (bip->bio_error) bp->b_ioflags |= BIO_ERROR; bp->b_resid = bp->b_bcount - bip->bio_completed; g_destroy_bio(bip); mtx_lock(&sc->sc_mtx); destroy = ((--sc->sc_active) == 0 && sc->sc_orphaned); mtx_unlock(&sc->sc_mtx); if (destroy) g_post_event(g_vfs_destroy, cp, M_WAITOK, NULL); bufdone(bp); } void g_vfs_strategy(struct bufobj *bo, struct buf *bp) { struct g_vfs_softc *sc; struct g_consumer *cp; struct bio *bip; cp = bo->bo_private; sc = cp->geom->softc; /* * If the provider has orphaned us, just return EXIO. 
*/ mtx_lock(&sc->sc_mtx); if (sc->sc_orphaned) { mtx_unlock(&sc->sc_mtx); bp->b_error = ENXIO; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return; } sc->sc_active++; mtx_unlock(&sc->sc_mtx); bip = g_alloc_bio(); bip->bio_cmd = bp->b_iocmd; bip->bio_offset = bp->b_iooffset; bip->bio_length = bp->b_bcount; bdata2bio(bp, bip); if ((bp->b_flags & B_BARRIER) != 0) { bip->bio_flags |= BIO_ORDERED; bp->b_flags &= ~B_BARRIER; } bip->bio_done = g_vfs_done; bip->bio_caller2 = bp; +#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) + buf_track(bp, __func__); + bip->bio_track_bp = bp; +#endif g_io_request(bip, cp); } static void g_vfs_orphan(struct g_consumer *cp) { struct g_geom *gp; struct g_vfs_softc *sc; int destroy; g_topology_assert(); gp = cp->geom; g_trace(G_T_TOPOLOGY, "g_vfs_orphan(%p(%s))", cp, gp->name); sc = gp->softc; if (sc == NULL) return; mtx_lock(&sc->sc_mtx); sc->sc_orphaned = 1; destroy = (sc->sc_active == 0); mtx_unlock(&sc->sc_mtx); if (destroy) g_vfs_destroy(cp, 0); /* * Do not destroy the geom. Filesystem will do that during unmount. 
*/ } int g_vfs_open(struct vnode *vp, struct g_consumer **cpp, const char *fsname, int wr) { struct g_geom *gp; struct g_provider *pp; struct g_consumer *cp; struct g_vfs_softc *sc; struct bufobj *bo; int error; g_topology_assert(); *cpp = NULL; bo = &vp->v_bufobj; if (bo->bo_private != vp) return (EBUSY); pp = g_dev_getprovider(vp->v_rdev); if (pp == NULL) return (ENOENT); gp = g_new_geomf(&g_vfs_class, "%s.%s", fsname, pp->name); sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO); mtx_init(&sc->sc_mtx, "g_vfs", NULL, MTX_DEF); sc->sc_bo = bo; gp->softc = sc; cp = g_new_consumer(gp); g_attach(cp, pp); error = g_access(cp, 1, wr, wr); if (error) { g_wither_geom(gp, ENXIO); return (error); } vnode_create_vobject(vp, pp->mediasize, curthread); *cpp = cp; cp->private = vp; cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; bo->bo_ops = g_vfs_bufops; bo->bo_private = cp; bo->bo_bsize = pp->sectorsize; return (error); } void g_vfs_close(struct g_consumer *cp) { struct g_geom *gp; struct g_vfs_softc *sc; g_topology_assert(); gp = cp->geom; sc = gp->softc; bufobj_invalbuf(sc->sc_bo, V_SAVE, 0, 0); sc->sc_bo->bo_private = cp->private; gp->softc = NULL; mtx_destroy(&sc->sc_mtx); if (!sc->sc_orphaned || cp->provider == NULL) g_wither_geom_close(gp, ENXIO); g_free(sc); } Index: head/sys/geom/part/g_part.c =================================================================== --- head/sys/geom/part/g_part.c (revision 308154) +++ head/sys/geom/part/g_part.c (revision 308155) @@ -1,2340 +1,2342 @@ /*- * Copyright (c) 2002, 2005-2009 Marcel Moolenaar * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "g_part_if.h" #ifndef _PATH_DEV #define _PATH_DEV "/dev/" #endif static kobj_method_t g_part_null_methods[] = { { 0, 0 } }; static struct g_part_scheme g_part_null_scheme = { "(none)", g_part_null_methods, sizeof(struct g_part_table), }; TAILQ_HEAD(, g_part_scheme) g_part_schemes = TAILQ_HEAD_INITIALIZER(g_part_schemes); struct g_part_alias_list { const char *lexeme; enum g_part_alias alias; } g_part_alias_list[G_PART_ALIAS_COUNT] = { { "apple-boot", G_PART_ALIAS_APPLE_BOOT }, { "apple-core-storage", G_PART_ALIAS_APPLE_CORE_STORAGE }, { "apple-hfs", G_PART_ALIAS_APPLE_HFS }, { "apple-label", G_PART_ALIAS_APPLE_LABEL }, { "apple-raid", G_PART_ALIAS_APPLE_RAID }, { "apple-raid-offline", G_PART_ALIAS_APPLE_RAID_OFFLINE }, { "apple-tv-recovery", G_PART_ALIAS_APPLE_TV_RECOVERY }, { "apple-ufs", G_PART_ALIAS_APPLE_UFS }, { "bios-boot", 
G_PART_ALIAS_BIOS_BOOT }, { "chromeos-firmware", G_PART_ALIAS_CHROMEOS_FIRMWARE }, { "chromeos-kernel", G_PART_ALIAS_CHROMEOS_KERNEL }, { "chromeos-reserved", G_PART_ALIAS_CHROMEOS_RESERVED }, { "chromeos-root", G_PART_ALIAS_CHROMEOS_ROOT }, { "dragonfly-ccd", G_PART_ALIAS_DFBSD_CCD }, { "dragonfly-hammer", G_PART_ALIAS_DFBSD_HAMMER }, { "dragonfly-hammer2", G_PART_ALIAS_DFBSD_HAMMER2 }, { "dragonfly-label32", G_PART_ALIAS_DFBSD }, { "dragonfly-label64", G_PART_ALIAS_DFBSD64 }, { "dragonfly-legacy", G_PART_ALIAS_DFBSD_LEGACY }, { "dragonfly-swap", G_PART_ALIAS_DFBSD_SWAP }, { "dragonfly-ufs", G_PART_ALIAS_DFBSD_UFS }, { "dragonfly-vinum", G_PART_ALIAS_DFBSD_VINUM }, { "ebr", G_PART_ALIAS_EBR }, { "efi", G_PART_ALIAS_EFI }, { "fat16", G_PART_ALIAS_MS_FAT16 }, { "fat32", G_PART_ALIAS_MS_FAT32 }, { "freebsd", G_PART_ALIAS_FREEBSD }, { "freebsd-boot", G_PART_ALIAS_FREEBSD_BOOT }, { "freebsd-nandfs", G_PART_ALIAS_FREEBSD_NANDFS }, { "freebsd-swap", G_PART_ALIAS_FREEBSD_SWAP }, { "freebsd-ufs", G_PART_ALIAS_FREEBSD_UFS }, { "freebsd-vinum", G_PART_ALIAS_FREEBSD_VINUM }, { "freebsd-zfs", G_PART_ALIAS_FREEBSD_ZFS }, { "linux-data", G_PART_ALIAS_LINUX_DATA }, { "linux-lvm", G_PART_ALIAS_LINUX_LVM }, { "linux-raid", G_PART_ALIAS_LINUX_RAID }, { "linux-swap", G_PART_ALIAS_LINUX_SWAP }, { "mbr", G_PART_ALIAS_MBR }, { "ms-basic-data", G_PART_ALIAS_MS_BASIC_DATA }, { "ms-ldm-data", G_PART_ALIAS_MS_LDM_DATA }, { "ms-ldm-metadata", G_PART_ALIAS_MS_LDM_METADATA }, { "ms-recovery", G_PART_ALIAS_MS_RECOVERY }, { "ms-reserved", G_PART_ALIAS_MS_RESERVED }, { "ms-spaces", G_PART_ALIAS_MS_SPACES }, { "netbsd-ccd", G_PART_ALIAS_NETBSD_CCD }, { "netbsd-cgd", G_PART_ALIAS_NETBSD_CGD }, { "netbsd-ffs", G_PART_ALIAS_NETBSD_FFS }, { "netbsd-lfs", G_PART_ALIAS_NETBSD_LFS }, { "netbsd-raid", G_PART_ALIAS_NETBSD_RAID }, { "netbsd-swap", G_PART_ALIAS_NETBSD_SWAP }, { "ntfs", G_PART_ALIAS_MS_NTFS }, { "openbsd-data", G_PART_ALIAS_OPENBSD_DATA }, { "prep-boot", G_PART_ALIAS_PREP_BOOT }, { 
"vmware-reserved", G_PART_ALIAS_VMRESERVED }, { "vmware-vmfs", G_PART_ALIAS_VMFS }, { "vmware-vmkdiag", G_PART_ALIAS_VMKDIAG }, { "vmware-vsanhdr", G_PART_ALIAS_VMVSANHDR }, }; SYSCTL_DECL(_kern_geom); SYSCTL_NODE(_kern_geom, OID_AUTO, part, CTLFLAG_RW, 0, "GEOM_PART stuff"); static u_int check_integrity = 1; SYSCTL_UINT(_kern_geom_part, OID_AUTO, check_integrity, CTLFLAG_RWTUN, &check_integrity, 1, "Enable integrity checking"); /* * The GEOM partitioning class. */ static g_ctl_req_t g_part_ctlreq; static g_ctl_destroy_geom_t g_part_destroy_geom; static g_fini_t g_part_fini; static g_init_t g_part_init; static g_taste_t g_part_taste; static g_access_t g_part_access; static g_dumpconf_t g_part_dumpconf; static g_orphan_t g_part_orphan; static g_spoiled_t g_part_spoiled; static g_start_t g_part_start; static g_resize_t g_part_resize; static g_ioctl_t g_part_ioctl; static struct g_class g_part_class = { .name = "PART", .version = G_VERSION, /* Class methods. */ .ctlreq = g_part_ctlreq, .destroy_geom = g_part_destroy_geom, .fini = g_part_fini, .init = g_part_init, .taste = g_part_taste, /* Geom methods. */ .access = g_part_access, .dumpconf = g_part_dumpconf, .orphan = g_part_orphan, .spoiled = g_part_spoiled, .start = g_part_start, .resize = g_part_resize, .ioctl = g_part_ioctl, }; DECLARE_GEOM_CLASS(g_part_class, g_part); MODULE_VERSION(g_part, 0); /* * Support functions. 
*/ static void g_part_wither(struct g_geom *, int); const char * g_part_alias_name(enum g_part_alias alias) { int i; for (i = 0; i < G_PART_ALIAS_COUNT; i++) { if (g_part_alias_list[i].alias != alias) continue; return (g_part_alias_list[i].lexeme); } return (NULL); } void g_part_geometry_heads(off_t blocks, u_int sectors, off_t *bestchs, u_int *bestheads) { static u_int candidate_heads[] = { 1, 2, 16, 32, 64, 128, 255, 0 }; off_t chs, cylinders; u_int heads; int idx; *bestchs = 0; *bestheads = 0; for (idx = 0; candidate_heads[idx] != 0; idx++) { heads = candidate_heads[idx]; cylinders = blocks / heads / sectors; if (cylinders < heads || cylinders < sectors) break; if (cylinders > 1023) continue; chs = cylinders * heads * sectors; if (chs > *bestchs || (chs == *bestchs && *bestheads == 1)) { *bestchs = chs; *bestheads = heads; } } } static void g_part_geometry(struct g_part_table *table, struct g_consumer *cp, off_t blocks) { static u_int candidate_sectors[] = { 1, 9, 17, 33, 63, 0 }; off_t chs, bestchs; u_int heads, sectors; int idx; if (g_getattr("GEOM::fwsectors", cp, §ors) != 0 || sectors == 0 || g_getattr("GEOM::fwheads", cp, &heads) != 0 || heads == 0) { table->gpt_fixgeom = 0; table->gpt_heads = 0; table->gpt_sectors = 0; bestchs = 0; for (idx = 0; candidate_sectors[idx] != 0; idx++) { sectors = candidate_sectors[idx]; g_part_geometry_heads(blocks, sectors, &chs, &heads); if (chs == 0) continue; /* * Prefer a geometry with sectors > 1, but only if * it doesn't bump down the number of heads to 1. */ if (chs > bestchs || (chs == bestchs && heads > 1 && table->gpt_sectors == 1)) { bestchs = chs; table->gpt_heads = heads; table->gpt_sectors = sectors; } } /* * If we didn't find a geometry at all, then the disk is * too big. This means we can use the maximum number of * heads and sectors. 
*/ if (bestchs == 0) { table->gpt_heads = 255; table->gpt_sectors = 63; } } else { table->gpt_fixgeom = 1; table->gpt_heads = heads; table->gpt_sectors = sectors; } } #define DPRINTF(...) if (bootverbose) { \ printf("GEOM_PART: " __VA_ARGS__); \ } static int g_part_check_integrity(struct g_part_table *table, struct g_consumer *cp) { struct g_part_entry *e1, *e2; struct g_provider *pp; off_t offset; int failed; failed = 0; pp = cp->provider; if (table->gpt_last < table->gpt_first) { DPRINTF("last LBA is below first LBA: %jd < %jd\n", (intmax_t)table->gpt_last, (intmax_t)table->gpt_first); failed++; } if (table->gpt_last > pp->mediasize / pp->sectorsize - 1) { DPRINTF("last LBA extends beyond mediasize: " "%jd > %jd\n", (intmax_t)table->gpt_last, (intmax_t)pp->mediasize / pp->sectorsize - 1); failed++; } LIST_FOREACH(e1, &table->gpt_entry, gpe_entry) { if (e1->gpe_deleted || e1->gpe_internal) continue; if (e1->gpe_start < table->gpt_first) { DPRINTF("partition %d has start offset below first " "LBA: %jd < %jd\n", e1->gpe_index, (intmax_t)e1->gpe_start, (intmax_t)table->gpt_first); failed++; } if (e1->gpe_start > table->gpt_last) { DPRINTF("partition %d has start offset beyond last " "LBA: %jd > %jd\n", e1->gpe_index, (intmax_t)e1->gpe_start, (intmax_t)table->gpt_last); failed++; } if (e1->gpe_end < e1->gpe_start) { DPRINTF("partition %d has end offset below start " "offset: %jd < %jd\n", e1->gpe_index, (intmax_t)e1->gpe_end, (intmax_t)e1->gpe_start); failed++; } if (e1->gpe_end > table->gpt_last) { DPRINTF("partition %d has end offset beyond last " "LBA: %jd > %jd\n", e1->gpe_index, (intmax_t)e1->gpe_end, (intmax_t)table->gpt_last); failed++; } if (pp->stripesize > 0) { offset = e1->gpe_start * pp->sectorsize; if (e1->gpe_offset > offset) offset = e1->gpe_offset; if ((offset + pp->stripeoffset) % pp->stripesize) { DPRINTF("partition %d on (%s, %s) is not " "aligned on %u bytes\n", e1->gpe_index, pp->name, table->gpt_scheme->name, pp->stripesize); /* Don't treat this 
as a critical failure */ } } e2 = e1; while ((e2 = LIST_NEXT(e2, gpe_entry)) != NULL) { if (e2->gpe_deleted || e2->gpe_internal) continue; if (e1->gpe_start >= e2->gpe_start && e1->gpe_start <= e2->gpe_end) { DPRINTF("partition %d has start offset inside " "partition %d: start[%d] %jd >= start[%d] " "%jd <= end[%d] %jd\n", e1->gpe_index, e2->gpe_index, e2->gpe_index, (intmax_t)e2->gpe_start, e1->gpe_index, (intmax_t)e1->gpe_start, e2->gpe_index, (intmax_t)e2->gpe_end); failed++; } if (e1->gpe_end >= e2->gpe_start && e1->gpe_end <= e2->gpe_end) { DPRINTF("partition %d has end offset inside " "partition %d: start[%d] %jd >= end[%d] " "%jd <= end[%d] %jd\n", e1->gpe_index, e2->gpe_index, e2->gpe_index, (intmax_t)e2->gpe_start, e1->gpe_index, (intmax_t)e1->gpe_end, e2->gpe_index, (intmax_t)e2->gpe_end); failed++; } if (e1->gpe_start < e2->gpe_start && e1->gpe_end > e2->gpe_end) { DPRINTF("partition %d contains partition %d: " "start[%d] %jd > start[%d] %jd, end[%d] " "%jd < end[%d] %jd\n", e1->gpe_index, e2->gpe_index, e1->gpe_index, (intmax_t)e1->gpe_start, e2->gpe_index, (intmax_t)e2->gpe_start, e2->gpe_index, (intmax_t)e2->gpe_end, e1->gpe_index, (intmax_t)e1->gpe_end); failed++; } } } if (failed != 0) { printf("GEOM_PART: integrity check failed (%s, %s)\n", pp->name, table->gpt_scheme->name); if (check_integrity != 0) return (EINVAL); table->gpt_corrupt = 1; } return (0); } #undef DPRINTF struct g_part_entry * g_part_new_entry(struct g_part_table *table, int index, quad_t start, quad_t end) { struct g_part_entry *entry, *last; last = NULL; LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) { if (entry->gpe_index == index) break; if (entry->gpe_index > index) { entry = NULL; break; } last = entry; } if (entry == NULL) { entry = g_malloc(table->gpt_scheme->gps_entrysz, M_WAITOK | M_ZERO); entry->gpe_index = index; if (last == NULL) LIST_INSERT_HEAD(&table->gpt_entry, entry, gpe_entry); else LIST_INSERT_AFTER(last, entry, gpe_entry); } else entry->gpe_offset = 0; 
entry->gpe_start = start; entry->gpe_end = end; return (entry); } static void g_part_new_provider(struct g_geom *gp, struct g_part_table *table, struct g_part_entry *entry) { struct g_consumer *cp; struct g_provider *pp; struct sbuf *sb; off_t offset; cp = LIST_FIRST(&gp->consumer); pp = cp->provider; offset = entry->gpe_start * pp->sectorsize; if (entry->gpe_offset < offset) entry->gpe_offset = offset; if (entry->gpe_pp == NULL) { sb = sbuf_new_auto(); G_PART_FULLNAME(table, entry, sb, gp->name); sbuf_finish(sb); entry->gpe_pp = g_new_providerf(gp, "%s", sbuf_data(sb)); sbuf_delete(sb); entry->gpe_pp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE; entry->gpe_pp->private = entry; /* Close the circle. */ } entry->gpe_pp->index = entry->gpe_index - 1; /* index is 1-based. */ entry->gpe_pp->mediasize = (entry->gpe_end - entry->gpe_start + 1) * pp->sectorsize; entry->gpe_pp->mediasize -= entry->gpe_offset - offset; entry->gpe_pp->sectorsize = pp->sectorsize; entry->gpe_pp->stripesize = pp->stripesize; entry->gpe_pp->stripeoffset = pp->stripeoffset + entry->gpe_offset; if (pp->stripesize > 0) entry->gpe_pp->stripeoffset %= pp->stripesize; entry->gpe_pp->flags |= pp->flags & G_PF_ACCEPT_UNMAPPED; g_error_provider(entry->gpe_pp, 0); } static struct g_geom* g_part_find_geom(const char *name) { struct g_geom *gp; LIST_FOREACH(gp, &g_part_class.geom, geom) { if ((gp->flags & G_GEOM_WITHER) == 0 && strcmp(name, gp->name) == 0) break; } return (gp); } static int g_part_parm_geom(struct gctl_req *req, const char *name, struct g_geom **v) { struct g_geom *gp; const char *gname; gname = gctl_get_asciiparam(req, name); if (gname == NULL) return (ENOATTR); if (strncmp(gname, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0) gname += sizeof(_PATH_DEV) - 1; gp = g_part_find_geom(gname); if (gp == NULL) { gctl_error(req, "%d %s '%s'", EINVAL, name, gname); return (EINVAL); } *v = gp; return (0); } static int g_part_parm_provider(struct gctl_req *req, const char *name, struct g_provider **v) { 
struct g_provider *pp; const char *pname; pname = gctl_get_asciiparam(req, name); if (pname == NULL) return (ENOATTR); if (strncmp(pname, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0) pname += sizeof(_PATH_DEV) - 1; pp = g_provider_by_name(pname); if (pp == NULL) { gctl_error(req, "%d %s '%s'", EINVAL, name, pname); return (EINVAL); } *v = pp; return (0); } static int g_part_parm_quad(struct gctl_req *req, const char *name, quad_t *v) { const char *p; char *x; quad_t q; p = gctl_get_asciiparam(req, name); if (p == NULL) return (ENOATTR); q = strtoq(p, &x, 0); if (*x != '\0' || q < 0) { gctl_error(req, "%d %s '%s'", EINVAL, name, p); return (EINVAL); } *v = q; return (0); } static int g_part_parm_scheme(struct gctl_req *req, const char *name, struct g_part_scheme **v) { struct g_part_scheme *s; const char *p; p = gctl_get_asciiparam(req, name); if (p == NULL) return (ENOATTR); TAILQ_FOREACH(s, &g_part_schemes, scheme_list) { if (s == &g_part_null_scheme) continue; if (!strcasecmp(s->name, p)) break; } if (s == NULL) { gctl_error(req, "%d %s '%s'", EINVAL, name, p); return (EINVAL); } *v = s; return (0); } static int g_part_parm_str(struct gctl_req *req, const char *name, const char **v) { const char *p; p = gctl_get_asciiparam(req, name); if (p == NULL) return (ENOATTR); /* An empty label is always valid. 
*/ if (strcmp(name, "label") != 0 && p[0] == '\0') { gctl_error(req, "%d %s '%s'", EINVAL, name, p); return (EINVAL); } *v = p; return (0); } static int g_part_parm_intmax(struct gctl_req *req, const char *name, u_int *v) { const intmax_t *p; int size; p = gctl_get_param(req, name, &size); if (p == NULL) return (ENOATTR); if (size != sizeof(*p) || *p < 0 || *p > INT_MAX) { gctl_error(req, "%d %s '%jd'", EINVAL, name, *p); return (EINVAL); } *v = (u_int)*p; return (0); } static int g_part_parm_uint32(struct gctl_req *req, const char *name, u_int *v) { const uint32_t *p; int size; p = gctl_get_param(req, name, &size); if (p == NULL) return (ENOATTR); if (size != sizeof(*p) || *p > INT_MAX) { gctl_error(req, "%d %s '%u'", EINVAL, name, (unsigned int)*p); return (EINVAL); } *v = (u_int)*p; return (0); } static int g_part_parm_bootcode(struct gctl_req *req, const char *name, const void **v, unsigned int *s) { const void *p; int size; p = gctl_get_param(req, name, &size); if (p == NULL) return (ENOATTR); *v = p; *s = size; return (0); } static int g_part_probe(struct g_geom *gp, struct g_consumer *cp, int depth) { struct g_part_scheme *iter, *scheme; struct g_part_table *table; int pri, probe; table = gp->softc; scheme = (table != NULL) ? table->gpt_scheme : NULL; pri = (scheme != NULL) ? G_PART_PROBE(table, cp) : INT_MIN; if (pri == 0) goto done; if (pri > 0) { /* error */ scheme = NULL; pri = INT_MIN; } TAILQ_FOREACH(iter, &g_part_schemes, scheme_list) { if (iter == &g_part_null_scheme) continue; table = (void *)kobj_create((kobj_class_t)iter, M_GEOM, M_WAITOK); table->gpt_gp = gp; table->gpt_scheme = iter; table->gpt_depth = depth; probe = G_PART_PROBE(table, cp); if (probe <= 0 && probe > pri) { pri = probe; scheme = iter; if (gp->softc != NULL) kobj_delete((kobj_t)gp->softc, M_GEOM); gp->softc = table; if (pri == 0) goto done; } else kobj_delete((kobj_t)table, M_GEOM); } done: return ((scheme == NULL) ? ENXIO : 0); } /* * Control request functions. 
 */

/*
 * "add" verb: create a new partition entry of gpp_size blocks at
 * gpp_start, using gpp_index when it is non-zero or the first free index
 * otherwise.
 *
 * Fails with EINVAL when start, size or index fall outside the table
 * limits, ENOSPC when the range collides with an existing entry or no
 * index is left, EEXIST when the requested index is taken, or with the
 * error returned by the scheme's G_PART_ADD method.  An entry previously
 * marked deleted at the chosen index is recycled rather than allocating
 * a new one.
 */
static int
g_part_ctl_add(struct gctl_req *req, struct g_part_parms *gpp)
{
	struct g_geom *gp;
	struct g_provider *pp;
	struct g_part_entry *delent, *last, *entry;
	struct g_part_table *table;
	struct sbuf *sb;
	quad_t end;
	unsigned int index;
	int error;

	gp = gpp->gpp_geom;
	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name));
	g_topology_assert();

	pp = LIST_FIRST(&gp->consumer)->provider;
	table = gp->softc;
	end = gpp->gpp_start + gpp->gpp_size - 1;

	if (gpp->gpp_start < table->gpt_first ||
	    gpp->gpp_start > table->gpt_last) {
		gctl_error(req, "%d start '%jd'", EINVAL,
		    (intmax_t)gpp->gpp_start);
		return (EINVAL);
	}
	/* end < start also rejects a zero size. */
	if (end < gpp->gpp_start || end > table->gpt_last) {
		gctl_error(req, "%d size '%jd'", EINVAL,
		    (intmax_t)gpp->gpp_size);
		return (EINVAL);
	}
	if (gpp->gpp_index > table->gpt_entries) {
		gctl_error(req, "%d index '%d'", EINVAL, gpp->gpp_index);
		return (EINVAL);
	}

	/*
	 * Single pass over the list: bump `index' past occupied slots,
	 * remember the entry to insert after, note a deleted entry that
	 * can be recycled, and check for overlap with live entries.
	 */
	delent = last = NULL;
	index = (gpp->gpp_index > 0) ? gpp->gpp_index : 1;
	LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) {
		if (entry->gpe_deleted) {
			if (entry->gpe_index == index)
				delent = entry;
			continue;
		}
		if (entry->gpe_index == index)
			index = entry->gpe_index + 1;
		if (entry->gpe_index < index)
			last = entry;
		if (entry->gpe_internal)
			continue;
		if (gpp->gpp_start >= entry->gpe_start &&
		    gpp->gpp_start <= entry->gpe_end) {
			gctl_error(req, "%d start '%jd'", ENOSPC,
			    (intmax_t)gpp->gpp_start);
			return (ENOSPC);
		}
		if (end >= entry->gpe_start && end <= entry->gpe_end) {
			gctl_error(req, "%d end '%jd'", ENOSPC, (intmax_t)end);
			return (ENOSPC);
		}
		if (gpp->gpp_start < entry->gpe_start && end > entry->gpe_end) {
			gctl_error(req, "%d size '%jd'", ENOSPC,
			    (intmax_t)gpp->gpp_size);
			return (ENOSPC);
		}
	}
	/* An explicitly requested index that had to be bumped is in use. */
	if (gpp->gpp_index > 0 && index != gpp->gpp_index) {
		gctl_error(req, "%d index '%d'", EEXIST, gpp->gpp_index);
		return (EEXIST);
	}
	if (index > table->gpt_entries) {
		gctl_error(req, "%d index '%d'", ENOSPC, index);
		return (ENOSPC);
	}

	entry = (delent == NULL) ? g_malloc(table->gpt_scheme->gps_entrysz,
	    M_WAITOK | M_ZERO) : delent;
	entry->gpe_index = index;
	entry->gpe_start = gpp->gpp_start;
	entry->gpe_end = end;
	error = G_PART_ADD(table, entry, gpp);
	if (error) {
		gctl_error(req, "%d", error);
		/* Only free what was allocated here; delent stays listed. */
		if (delent == NULL)
			g_free(entry);
		return (error);
	}
	if (delent == NULL) {
		if (last == NULL)
			LIST_INSERT_HEAD(&table->gpt_entry, entry, gpe_entry);
		else
			LIST_INSERT_AFTER(last, entry, gpe_entry);
		entry->gpe_created = 1;
	} else {
		entry->gpe_deleted = 0;
		entry->gpe_modified = 1;
	}
	g_part_new_provider(gp, table, entry);

	/* Provide feedback if so requested. */
	if (gpp->gpp_parms & G_PART_PARM_OUTPUT) {
		sb = sbuf_new_auto();
		G_PART_FULLNAME(table, entry, sb, gp->name);
		if (pp->stripesize > 0 && entry->gpe_pp->stripeoffset != 0)
			sbuf_printf(sb, " added, but partition is not "
			    "aligned on %u bytes\n", pp->stripesize);
		else
			sbuf_cat(sb, " added\n");
		sbuf_finish(sb);
		gctl_set_param(req, "output", sbuf_data(sb), sbuf_len(sb) + 1);
		sbuf_delete(sb);
	}
	return (0);
}

/*
 * "bootcode" verb: hand the supplied boot code to the scheme.
 *
 * Fails with ENODEV when the scheme declares a boot code size of 0,
 * EFBIG when the supplied code is larger than that size, or with the
 * error returned by G_PART_BOOTCODE.
 */
static int
g_part_ctl_bootcode(struct gctl_req *req, struct g_part_parms *gpp)
{
	struct g_geom *gp;
	struct g_part_table *table;
	struct sbuf *sb;
	int error, sz;

	gp = gpp->gpp_geom;
	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name));
	g_topology_assert();

	table = gp->softc;
	sz = table->gpt_scheme->gps_bootcodesz;
	if (sz == 0) {
		error = ENODEV;
		goto fail;
	}
	if (gpp->gpp_codesize > sz) {
		error = EFBIG;
		goto fail;
	}

	error = G_PART_BOOTCODE(table, gpp);
	if (error)
		goto fail;

	/* Provide feedback if so requested. */
	if (gpp->gpp_parms & G_PART_PARM_OUTPUT) {
		sb = sbuf_new_auto();
		sbuf_printf(sb, "bootcode written to %s\n", gp->name);
		sbuf_finish(sb);
		gctl_set_param(req, "output", sbuf_data(sb), sbuf_len(sb) + 1);
		sbuf_delete(sb);
	}
	return (0);

 fail:
	gctl_error(req, "%d", error);
	return (error);
}

/*
 * "commit" verb: write the in-memory table to the consumer.
 *
 * Requires the table to be opened (EPERM otherwise).  The topology lock
 * is dropped for the duration of the I/O and re-taken on every exit
 * path.  Sectors flagged in gpt_smhead (counted from the start of the
 * media) and gpt_smtail (counted from the end) are overwritten with
 * zeroes first.  For the null scheme the geom is then withered instead
 * of written.  After a successful write, entries marked deleted are
 * freed, the created/modified flags are cleared and the access counts
 * taken for the open are released.
 */
static int
g_part_ctl_commit(struct gctl_req *req, struct g_part_parms *gpp)
{
	struct g_consumer *cp;
	struct g_geom *gp;
	struct g_provider *pp;
	struct g_part_entry *entry, *tmp;
	struct g_part_table *table;
	char *buf;
	int error, i;

	gp = gpp->gpp_geom;
	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name));
	g_topology_assert();

	table = gp->softc;
	if (!table->gpt_opened) {
		gctl_error(req, "%d", EPERM);
		return (EPERM);
	}

	g_topology_unlock();

	cp = LIST_FIRST(&gp->consumer);
	if ((table->gpt_smhead | table->gpt_smtail) != 0) {
		pp = cp->provider;
		/* One zero-filled sector, reused for every scrub write. */
		buf = g_malloc(pp->sectorsize, M_WAITOK | M_ZERO);
		while (table->gpt_smhead != 0) {
			/* Lowest set bit selects the next sector to clear. */
			i = ffs(table->gpt_smhead) - 1;
			error = g_write_data(cp, i * pp->sectorsize, buf,
			    pp->sectorsize);
			if (error) {
				g_free(buf);
				goto fail;
			}
			table->gpt_smhead &= ~(1 << i);
		}
		while (table->gpt_smtail != 0) {
			i = ffs(table->gpt_smtail) - 1;
			error = g_write_data(cp, pp->mediasize - (i + 1) *
			    pp->sectorsize, buf, pp->sectorsize);
			if (error) {
				g_free(buf);
				goto fail;
			}
			table->gpt_smtail &= ~(1 << i);
		}
		g_free(buf);
	}

	if (table->gpt_scheme == &g_part_null_scheme) {
		g_topology_lock();
		g_access(cp, -1, -1, -1);
		g_part_wither(gp, ENXIO);
		return (0);
	}

	error = G_PART_WRITE(table, cp);
	if (error)
		goto fail;

	LIST_FOREACH_SAFE(entry, &table->gpt_entry, gpe_entry, tmp) {
		if (!entry->gpe_deleted) {
			entry->gpe_created = 0;
			entry->gpe_modified = 0;
			continue;
		}
		LIST_REMOVE(entry, gpe_entry);
		g_free(entry);
	}
	table->gpt_created = 0;
	table->gpt_opened = 0;

	g_topology_lock();
	g_access(cp, -1, -1, -1);
	return (0);

fail:
	g_topology_lock();
	gctl_error(req, "%d", error);
	return (error);
}

/*
 * "create" verb: put a new partition table of scheme gpp_scheme on
 * provider gpp_provider.
 *
 * If a g_part geom already exists on the provider it must carry the
 * null scheme (EEXIST otherwise); in that case the geom and its consumer
 * are reused and the null table is replaced.  Otherwise a new geom and
 * consumer are created and opened r1w1e1.  On success gpp is updated so
 * that gpp_geom names the geom.  On failure after the unlock, a freshly
 * created geom is withered, or the previous null table is restored.
 */
static int
g_part_ctl_create(struct gctl_req *req, struct g_part_parms *gpp)
{
	struct g_consumer *cp;
	struct g_geom *gp;
	struct g_provider *pp;
	struct g_part_scheme *scheme;
	struct g_part_table *null, *table;
	struct sbuf *sb;
	int attr, error;

	pp = gpp->gpp_provider;
	scheme = gpp->gpp_scheme;
	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, pp->name));
	g_topology_assert();

	/* Check that there isn't already a g_part geom on the provider. */
	gp = g_part_find_geom(pp->name);
	if (gp != NULL) {
		null = gp->softc;
		if (null->gpt_scheme != &g_part_null_scheme) {
			gctl_error(req, "%d geom '%s'", EEXIST, pp->name);
			return (EEXIST);
		}
	} else
		null = NULL;

	if ((gpp->gpp_parms & G_PART_PARM_ENTRIES) &&
	    (gpp->gpp_entries < scheme->gps_minent ||
	     gpp->gpp_entries > scheme->gps_maxent)) {
		gctl_error(req, "%d entries '%d'", EINVAL, gpp->gpp_entries);
		return (EINVAL);
	}

	if (null == NULL)
		gp = g_new_geomf(&g_part_class, "%s", pp->name);
	gp->softc = kobj_create((kobj_class_t)gpp->gpp_scheme, M_GEOM,
	    M_WAITOK);
	table = gp->softc;
	table->gpt_gp = gp;
	table->gpt_scheme = gpp->gpp_scheme;
	table->gpt_entries = (gpp->gpp_parms & G_PART_PARM_ENTRIES) ?
	    gpp->gpp_entries : scheme->gps_minent;
	LIST_INIT(&table->gpt_entry);
	if (null == NULL) {
		cp = g_new_consumer(gp);
		cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
		error = g_attach(cp, pp);
		if (error == 0)
			error = g_access(cp, 1, 1, 1);
		if (error != 0) {
			g_part_wither(gp, error);
			gctl_error(req, "%d geom '%s'", error, pp->name);
			return (error);
		}
		table->gpt_opened = 1;
	} else {
		/* Inherit the open state and pending scrub masks. */
		cp = LIST_FIRST(&gp->consumer);
		table->gpt_opened = null->gpt_opened;
		table->gpt_smhead = null->gpt_smhead;
		table->gpt_smtail = null->gpt_smtail;
	}

	g_topology_unlock();

	/* Make sure the provider has media. */
	if (pp->mediasize == 0 || pp->sectorsize == 0) {
		error = ENODEV;
		goto fail;
	}

	/* Make sure we can nest and if so, determine our depth. */
	error = g_getattr("PART::isleaf", cp, &attr);
	if (!error && attr) {
		error = ENODEV;
		goto fail;
	}
	error = g_getattr("PART::depth", cp, &attr);
	table->gpt_depth = (!error) ? attr + 1 : 0;

	/*
	 * Synthesize a disk geometry. Some partitioning schemes
	 * depend on it and since some file systems need it even
	 * when the partition scheme doesn't, we do it here in
	 * scheme-independent code.
	 */
	g_part_geometry(table, cp, pp->mediasize / pp->sectorsize);

	error = G_PART_CREATE(table, gpp);
	if (error)
		goto fail;

	g_topology_lock();

	table->gpt_created = 1;
	if (null != NULL)
		kobj_delete((kobj_t)null, M_GEOM);

	/*
	 * Support automatic commit by filling in the gpp_geom
	 * parameter.
	 */
	gpp->gpp_parms |= G_PART_PARM_GEOM;
	gpp->gpp_geom = gp;

	/* Provide feedback if so requested. */
	if (gpp->gpp_parms & G_PART_PARM_OUTPUT) {
		sb = sbuf_new_auto();
		sbuf_printf(sb, "%s created\n", gp->name);
		sbuf_finish(sb);
		gctl_set_param(req, "output", sbuf_data(sb), sbuf_len(sb) + 1);
		sbuf_delete(sb);
	}
	return (0);

fail:
	g_topology_lock();
	if (null == NULL) {
		g_access(cp, -1, -1, -1);
		g_part_wither(gp, error);
	} else {
		/* Drop the new table and put the null table back. */
		kobj_delete((kobj_t)gp->softc, M_GEOM);
		gp->softc = null;
	}
	gctl_error(req, "%d provider", error);
	return (error);
}

/*
 * "delete" verb: remove the live entry with index gpp_index.
 *
 * Fails with ENOENT when no such entry exists and EBUSY when its
 * provider has a non-zero access count.  The provider, if any, is
 * detached from the entry and withered.  An entry that was created
 * since the last commit is freed outright; any other entry is only
 * marked deleted.
 */
static int
g_part_ctl_delete(struct gctl_req *req, struct g_part_parms *gpp)
{
	struct g_geom *gp;
	struct g_provider *pp;
	struct g_part_entry *entry;
	struct g_part_table *table;
	struct sbuf *sb;

	gp = gpp->gpp_geom;
	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name));
	g_topology_assert();

	table = gp->softc;

	LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) {
		if (entry->gpe_deleted || entry->gpe_internal)
			continue;
		if (entry->gpe_index == gpp->gpp_index)
			break;
	}
	if (entry == NULL) {
		gctl_error(req, "%d index '%d'", ENOENT, gpp->gpp_index);
		return (ENOENT);
	}

	pp = entry->gpe_pp;
	if (pp != NULL) {
		if (pp->acr > 0 || pp->acw > 0 || pp->ace > 0) {
			gctl_error(req, "%d", EBUSY);
			return (EBUSY);
		}

		pp->private = NULL;
		entry->gpe_pp = NULL;
	}

	if (pp != NULL)
		g_wither_provider(pp, ENXIO);

	/* Provide feedback if so requested. */
	if (gpp->gpp_parms & G_PART_PARM_OUTPUT) {
		sb = sbuf_new_auto();
		G_PART_FULLNAME(table, entry, sb, gp->name);
		sbuf_cat(sb, " deleted\n");
		sbuf_finish(sb);
		gctl_set_param(req, "output", sbuf_data(sb), sbuf_len(sb) + 1);
		sbuf_delete(sb);
	}

	/* The entry is still needed above for the feedback message. */
	if (entry->gpe_created) {
		LIST_REMOVE(entry, gpe_entry);
		g_free(entry);
	} else {
		entry->gpe_modified = 0;
		entry->gpe_deleted = 1;
	}
	return (0);
}

/*
 * "destroy" verb: remove the partition table and replace it with a null
 * scheme table on the same geom.
 *
 * Without gpp_force any live, non-internal entry makes the request fail
 * with EBUSY.  With gpp_force only entries whose provider has a non-zero
 * access count do, and all entries and their providers are torn down
 * before the scheme's G_PART_DESTROY method runs.  The new null table
 * inherits depth, open state and scrub masks from the old table.
 */
static int
g_part_ctl_destroy(struct gctl_req *req, struct g_part_parms *gpp)
{
	struct g_consumer *cp;
	struct g_geom *gp;
	struct g_provider *pp;
	struct g_part_entry *entry, *tmp;
	struct g_part_table *null, *table;
	struct sbuf *sb;
	int error;

	gp = gpp->gpp_geom;
	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name));
	g_topology_assert();

	table = gp->softc;
	/* Check for busy providers. */
	LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) {
		if (entry->gpe_deleted || entry->gpe_internal)
			continue;
		if (gpp->gpp_force) {
			pp = entry->gpe_pp;
			if (pp == NULL)
				continue;
			if (pp->acr == 0 && pp->acw == 0 && pp->ace == 0)
				continue;
		}
		gctl_error(req, "%d", EBUSY);
		return (EBUSY);
	}

	if (gpp->gpp_force) {
		/* Destroy all providers. */
		LIST_FOREACH_SAFE(entry, &table->gpt_entry, gpe_entry, tmp) {
			pp = entry->gpe_pp;
			if (pp != NULL) {
				pp->private = NULL;
				g_wither_provider(pp, ENXIO);
			}
			LIST_REMOVE(entry, gpe_entry);
			g_free(entry);
		}
	}

	error = G_PART_DESTROY(table, gpp);
	if (error) {
		gctl_error(req, "%d", error);
		return (error);
	}

	gp->softc = kobj_create((kobj_class_t)&g_part_null_scheme, M_GEOM,
	    M_WAITOK);
	null = gp->softc;
	null->gpt_gp = gp;
	null->gpt_scheme = &g_part_null_scheme;
	LIST_INIT(&null->gpt_entry);

	cp = LIST_FIRST(&gp->consumer);
	pp = cp->provider;
	null->gpt_last = pp->mediasize / pp->sectorsize - 1;

	null->gpt_depth = table->gpt_depth;
	null->gpt_opened = table->gpt_opened;
	null->gpt_smhead = table->gpt_smhead;
	null->gpt_smtail = table->gpt_smtail;

	/* Free whatever entries (deleted or internal) are still listed. */
	while ((entry = LIST_FIRST(&table->gpt_entry)) != NULL) {
		LIST_REMOVE(entry, gpe_entry);
		g_free(entry);
	}
	kobj_delete((kobj_t)table, M_GEOM);

	/* Provide feedback if so requested. */
	if (gpp->gpp_parms & G_PART_PARM_OUTPUT) {
		sb = sbuf_new_auto();
		sbuf_printf(sb, "%s destroyed\n", gp->name);
		sbuf_finish(sb);
		gctl_set_param(req, "output", sbuf_data(sb), sbuf_len(sb) + 1);
		sbuf_delete(sb);
	}
	return (0);
}

/*
 * "modify" verb: let the scheme's G_PART_MODIFY method update the live
 * entry with index gpp_index.  Fails with ENOENT when no such entry
 * exists, or with the scheme's error.  An entry that was not created
 * since the last commit is flagged as modified.
 */
static int
g_part_ctl_modify(struct gctl_req *req, struct g_part_parms *gpp)
{
	struct g_geom *gp;
	struct g_part_entry *entry;
	struct g_part_table *table;
	struct sbuf *sb;
	int error;

	gp = gpp->gpp_geom;
	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name));
	g_topology_assert();

	table = gp->softc;

	LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) {
		if (entry->gpe_deleted || entry->gpe_internal)
			continue;
		if (entry->gpe_index == gpp->gpp_index)
			break;
	}
	if (entry == NULL) {
		gctl_error(req, "%d index '%d'", ENOENT, gpp->gpp_index);
		return (ENOENT);
	}

	error = G_PART_MODIFY(table, entry, gpp);
	if (error) {
		gctl_error(req, "%d", error);
		return (error);
	}

	if (!entry->gpe_created)
		entry->gpe_modified = 1;

	/* Provide feedback if so requested. */
	if (gpp->gpp_parms & G_PART_PARM_OUTPUT) {
		sb = sbuf_new_auto();
		G_PART_FULLNAME(table, entry, sb, gp->name);
		sbuf_cat(sb, " modified\n");
		sbuf_finish(sb);
		gctl_set_param(req, "output", sbuf_data(sb), sbuf_len(sb) + 1);
		sbuf_delete(sb);
	}
	return (0);
}

/*
 * "move" verb: not implemented; always fails with ENOSYS.
 */
static int
g_part_ctl_move(struct gctl_req *req, struct g_part_parms *gpp)
{
	gctl_error(req, "%d verb 'move'", ENOSYS);
	return (ENOSYS);
}

/*
 * "recover" verb: when the table is flagged corrupt, run the scheme's
 * G_PART_RECOVER method followed by the integrity check.  A table that
 * is not flagged corrupt is left alone and reported as not needing
 * recovery.
 */
static int
g_part_ctl_recover(struct gctl_req *req, struct g_part_parms *gpp)
{
	struct g_part_table *table;
	struct g_geom *gp;
	struct sbuf *sb;
	int error, recovered;

	gp = gpp->gpp_geom;
	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name));
	g_topology_assert();
	table = gp->softc;
	error = recovered = 0;

	if (table->gpt_corrupt) {
		error = G_PART_RECOVER(table);
		if (error == 0)
			error = g_part_check_integrity(table,
			    LIST_FIRST(&gp->consumer));
		if (error) {
			gctl_error(req, "%d recovering '%s' failed",
			    error, gp->name);
			return (error);
		}
		recovered = 1;
	}
	/* Provide feedback if so requested. */
	if (gpp->gpp_parms & G_PART_PARM_OUTPUT) {
		sb = sbuf_new_auto();
		if (recovered)
			sbuf_printf(sb, "%s recovered\n", gp->name);
		else
			sbuf_printf(sb, "%s recovering is not needed\n",
			    gp->name);
		sbuf_finish(sb);
		gctl_set_param(req, "output", sbuf_data(sb), sbuf_len(sb) + 1);
		sbuf_delete(sb);
	}
	return (0);
}

/*
 * "resize" verb: change the size of the live entry with index gpp_index
 * to gpp_size blocks, keeping its start.
 *
 * Fails with ENOENT when no such entry exists, EINVAL when the size is
 * below 1 or the new end lies beyond gpt_last, ENOSPC when the new range
 * collides with another live entry, and EBUSY when an opened partition
 * would shrink (unless bit 16 of g_debugflags is set).  On success the
 * provider is resized to match the entry.
 */
static int
g_part_ctl_resize(struct gctl_req *req, struct g_part_parms *gpp)
{
	struct g_geom *gp;
	struct g_provider *pp;
	struct g_part_entry *pe, *entry;
	struct g_part_table *table;
	struct sbuf *sb;
	quad_t end;
	int error;
	off_t mediasize;

	gp = gpp->gpp_geom;
	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name));
	g_topology_assert();
	table = gp->softc;

	/* check gpp_index */
	LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) {
		if (entry->gpe_deleted || entry->gpe_internal)
			continue;
		if (entry->gpe_index == gpp->gpp_index)
			break;
	}
	if (entry == NULL) {
		gctl_error(req, "%d index '%d'", ENOENT, gpp->gpp_index);
		return (ENOENT);
	}

	/* check gpp_size */
	end = entry->gpe_start + gpp->gpp_size - 1;
	if (gpp->gpp_size < 1 || end > table->gpt_last) {
		gctl_error(req, "%d size '%jd'", EINVAL,
		    (intmax_t)gpp->gpp_size);
		return (EINVAL);
	}

	/* The start does not move, so only the new end can collide. */
	LIST_FOREACH(pe, &table->gpt_entry, gpe_entry) {
		if (pe->gpe_deleted || pe->gpe_internal || pe == entry)
			continue;
		if (end >= pe->gpe_start && end <= pe->gpe_end) {
			gctl_error(req, "%d end '%jd'", ENOSPC,
			    (intmax_t)end);
			return (ENOSPC);
		}
		if (entry->gpe_start < pe->gpe_start && end > pe->gpe_end) {
			gctl_error(req, "%d size '%jd'", ENOSPC,
			    (intmax_t)gpp->gpp_size);
			return (ENOSPC);
		}
	}

	pp = entry->gpe_pp;
	if ((g_debugflags & 16) == 0 &&
	    (pp->acr > 0 || pp->acw > 0 || pp->ace > 0)) {
		if (entry->gpe_end - entry->gpe_start + 1 > gpp->gpp_size) {
			/* Deny shrinking of an opened partition. */
			gctl_error(req, "%d", EBUSY);
			return (EBUSY);
		}
	}

	error = G_PART_RESIZE(table, entry, gpp);
	if (error) {
		gctl_error(req, "%d%s", error, error != EBUSY ? "":
		    " resizing will lead to unexpected shrinking"
		    " due to alignment");
		return (error);
	}

	if (!entry->gpe_created)
		entry->gpe_modified = 1;

	/* update mediasize of changed provider */
	mediasize = (entry->gpe_end - entry->gpe_start + 1) *
		pp->sectorsize;
	g_resize_provider(pp, mediasize);

	/* Provide feedback if so requested. */
	if (gpp->gpp_parms & G_PART_PARM_OUTPUT) {
		sb = sbuf_new_auto();
		G_PART_FULLNAME(table, entry, sb, gp->name);
		sbuf_cat(sb, " resized\n");
		sbuf_finish(sb);
		gctl_set_param(req, "output", sbuf_data(sb), sbuf_len(sb) + 1);
		sbuf_delete(sb);
	}
	return (0);
}

/*
 * "set"/"unset" verbs: set (set != 0) or clear (set == 0) attribute
 * gpp_attrib through the scheme's G_PART_SETUNSET method.
 *
 * When an index was supplied the attribute applies to that live entry
 * (ENOENT if it does not exist); otherwise the method is called with a
 * NULL entry.
 */
static int
g_part_ctl_setunset(struct gctl_req *req, struct g_part_parms *gpp,
    unsigned int set)
{
	struct g_geom *gp;
	struct g_part_entry *entry;
	struct g_part_table *table;
	struct sbuf *sb;
	int error;

	gp = gpp->gpp_geom;
	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name));
	g_topology_assert();

	table = gp->softc;

	if (gpp->gpp_parms & G_PART_PARM_INDEX) {
		LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) {
			if (entry->gpe_deleted || entry->gpe_internal)
				continue;
			if (entry->gpe_index == gpp->gpp_index)
				break;
		}
		if (entry == NULL) {
			gctl_error(req, "%d index '%d'", ENOENT,
			    gpp->gpp_index);
			return (ENOENT);
		}
	} else
		entry = NULL;

	error = G_PART_SETUNSET(table, entry, gpp->gpp_attrib, set);
	if (error) {
		gctl_error(req, "%d attrib '%s'", error, gpp->gpp_attrib);
		return (error);
	}

	/* Provide feedback if so requested. */
	if (gpp->gpp_parms & G_PART_PARM_OUTPUT) {
		sb = sbuf_new_auto();
		sbuf_printf(sb, "%s %sset on ", gpp->gpp_attrib,
		    (set) ? "" : "un");
		if (entry)
			G_PART_FULLNAME(table, entry, sb, gp->name);
		else
			sbuf_cat(sb, gp->name);
		sbuf_cat(sb, "\n");
		sbuf_finish(sb);
		gctl_set_param(req, "output", sbuf_data(sb), sbuf_len(sb) + 1);
		sbuf_delete(sb);
	}
	return (0);
}

/*
 * "undo" verb: throw away uncommitted changes and re-read the table from
 * the consumer.
 *
 * Requires the table to be opened (EPERM otherwise).  Entries created
 * since the last commit lose their provider and are freed together with
 * entries marked deleted.  When the table itself was created since the
 * last commit, or is the null table, the geom is re-probed; if no scheme
 * matches, the geom is withered and 0 is returned.  The topology lock is
 * dropped around the probe and read and re-taken on every exit path.
 */
static int
g_part_ctl_undo(struct gctl_req *req, struct g_part_parms *gpp)
{
	struct g_consumer *cp;
	struct g_provider *pp;
	struct g_geom *gp;
	struct g_part_entry *entry, *tmp;
	struct g_part_table *table;
	int error, reprobe;

	gp = gpp->gpp_geom;
	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, gp->name));
	g_topology_assert();

	table = gp->softc;
	if (!table->gpt_opened) {
		gctl_error(req, "%d", EPERM);
		return (EPERM);
	}

	cp = LIST_FIRST(&gp->consumer);
	LIST_FOREACH_SAFE(entry, &table->gpt_entry, gpe_entry, tmp) {
		entry->gpe_modified = 0;
		if (entry->gpe_created) {
			pp = entry->gpe_pp;
			if (pp != NULL) {
				pp->private = NULL;
				entry->gpe_pp = NULL;
				g_wither_provider(pp, ENXIO);
			}
			/* Fall into the removal below. */
			entry->gpe_deleted = 1;
		}
		if (entry->gpe_deleted) {
			LIST_REMOVE(entry, gpe_entry);
			g_free(entry);
		}
	}

	g_topology_unlock();

	reprobe = (table->gpt_scheme == &g_part_null_scheme ||
	    table->gpt_created) ? 1 : 0;

	if (reprobe) {
		/* Any remaining non-internal entry blocks the re-probe. */
		LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) {
			if (entry->gpe_internal)
				continue;
			error = EBUSY;
			goto fail;
		}
		while ((entry = LIST_FIRST(&table->gpt_entry)) != NULL) {
			LIST_REMOVE(entry, gpe_entry);
			g_free(entry);
		}
		error = g_part_probe(gp, cp, table->gpt_depth);
		if (error) {
			g_topology_lock();
			g_access(cp, -1, -1, -1);
			g_part_wither(gp, error);
			return (0);
		}
		/* g_part_probe() may have replaced gp->softc. */
		table = gp->softc;

		/*
		 * Synthesize a disk geometry. Some partitioning schemes
		 * depend on it and since some file systems need it even
		 * when the partition scheme doesn't, we do it here in
		 * scheme-independent code.
		 */
		pp = cp->provider;
		g_part_geometry(table, cp, pp->mediasize / pp->sectorsize);
	}

	error = G_PART_READ(table, cp);
	if (error)
		goto fail;
	error = g_part_check_integrity(table, cp);
	if (error)
		goto fail;

	g_topology_lock();
	LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) {
		if (!entry->gpe_internal)
			g_part_new_provider(gp, table, entry);
	}

	table->gpt_opened = 0;
	g_access(cp, -1, -1, -1);
	return (0);

fail:
	g_topology_lock();
	gctl_error(req, "%d", error);
	return (error);
}

/*
 * Tear down a geom: let the scheme destroy its table state, free all
 * entries and the table object (if a table is attached), then wither
 * the geom with `error'.
 */
static void
g_part_wither(struct g_geom *gp, int error)
{
	struct g_part_entry *entry;
	struct g_part_table *table;

	table = gp->softc;
	if (table != NULL) {
		G_PART_DESTROY(table, NULL);
		while ((entry = LIST_FIRST(&table->gpt_entry)) != NULL) {
			LIST_REMOVE(entry, gpe_entry);
			g_free(entry);
		}
		if (gp->softc != NULL) {
			kobj_delete((kobj_t)gp->softc, M_GEOM);
			gp->softc = NULL;
		}
	}
	g_wither_geom(gp, error);
}

/*
 * Class methods.
 */

static void
g_part_ctlreq(struct gctl_req *req, struct g_class *mp, const char *verb)
{
	struct g_part_parms gpp;
	struct g_part_table *table;
	struct gctl_req_arg *ap;
	enum g_part_ctl ctlreq;
	unsigned int i, mparms, oparms, parm;
	int auto_commit, close_on_error;
	int error, modifies;

	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s,%s)", __func__, mp->name, verb));
	g_topology_assert();

	ctlreq = G_PART_CTL_NONE;
	modifies = 1;
	mparms = 0;
	oparms = G_PART_PARM_FLAGS | G_PART_PARM_OUTPUT | G_PART_PARM_VERSION;
	switch (*verb) {
	case 'a':
		if (!strcmp(verb, "add")) {
			ctlreq = G_PART_CTL_ADD;
			mparms |= G_PART_PARM_GEOM | G_PART_PARM_SIZE |
			    G_PART_PARM_START | G_PART_PARM_TYPE;
			oparms |= G_PART_PARM_INDEX | G_PART_PARM_LABEL;
		}
		break;
	case 'b':
		if (!strcmp(verb, "bootcode")) {
			ctlreq = G_PART_CTL_BOOTCODE;
			mparms |= G_PART_PARM_GEOM | G_PART_PARM_BOOTCODE;
		}
		break;
	case 'c':
		if (!strcmp(verb, "commit")) {
			ctlreq = G_PART_CTL_COMMIT;
			mparms |= G_PART_PARM_GEOM;
			modifies = 0;
		} else if (!strcmp(verb, "create")) {
			ctlreq = G_PART_CTL_CREATE;
			mparms |= G_PART_PARM_PROVIDER | G_PART_PARM_SCHEME;
			oparms |=
G_PART_PARM_ENTRIES; } break; case 'd': if (!strcmp(verb, "delete")) { ctlreq = G_PART_CTL_DELETE; mparms |= G_PART_PARM_GEOM | G_PART_PARM_INDEX; } else if (!strcmp(verb, "destroy")) { ctlreq = G_PART_CTL_DESTROY; mparms |= G_PART_PARM_GEOM; oparms |= G_PART_PARM_FORCE; } break; case 'm': if (!strcmp(verb, "modify")) { ctlreq = G_PART_CTL_MODIFY; mparms |= G_PART_PARM_GEOM | G_PART_PARM_INDEX; oparms |= G_PART_PARM_LABEL | G_PART_PARM_TYPE; } else if (!strcmp(verb, "move")) { ctlreq = G_PART_CTL_MOVE; mparms |= G_PART_PARM_GEOM | G_PART_PARM_INDEX; } break; case 'r': if (!strcmp(verb, "recover")) { ctlreq = G_PART_CTL_RECOVER; mparms |= G_PART_PARM_GEOM; } else if (!strcmp(verb, "resize")) { ctlreq = G_PART_CTL_RESIZE; mparms |= G_PART_PARM_GEOM | G_PART_PARM_INDEX | G_PART_PARM_SIZE; } break; case 's': if (!strcmp(verb, "set")) { ctlreq = G_PART_CTL_SET; mparms |= G_PART_PARM_ATTRIB | G_PART_PARM_GEOM; oparms |= G_PART_PARM_INDEX; } break; case 'u': if (!strcmp(verb, "undo")) { ctlreq = G_PART_CTL_UNDO; mparms |= G_PART_PARM_GEOM; modifies = 0; } else if (!strcmp(verb, "unset")) { ctlreq = G_PART_CTL_UNSET; mparms |= G_PART_PARM_ATTRIB | G_PART_PARM_GEOM; oparms |= G_PART_PARM_INDEX; } break; } if (ctlreq == G_PART_CTL_NONE) { gctl_error(req, "%d verb '%s'", EINVAL, verb); return; } bzero(&gpp, sizeof(gpp)); for (i = 0; i < req->narg; i++) { ap = &req->arg[i]; parm = 0; switch (ap->name[0]) { case 'a': if (!strcmp(ap->name, "arg0")) { parm = mparms & (G_PART_PARM_GEOM | G_PART_PARM_PROVIDER); } if (!strcmp(ap->name, "attrib")) parm = G_PART_PARM_ATTRIB; break; case 'b': if (!strcmp(ap->name, "bootcode")) parm = G_PART_PARM_BOOTCODE; break; case 'c': if (!strcmp(ap->name, "class")) continue; break; case 'e': if (!strcmp(ap->name, "entries")) parm = G_PART_PARM_ENTRIES; break; case 'f': if (!strcmp(ap->name, "flags")) parm = G_PART_PARM_FLAGS; else if (!strcmp(ap->name, "force")) parm = G_PART_PARM_FORCE; break; case 'i': if (!strcmp(ap->name, "index")) parm = 
G_PART_PARM_INDEX; break; case 'l': if (!strcmp(ap->name, "label")) parm = G_PART_PARM_LABEL; break; case 'o': if (!strcmp(ap->name, "output")) parm = G_PART_PARM_OUTPUT; break; case 's': if (!strcmp(ap->name, "scheme")) parm = G_PART_PARM_SCHEME; else if (!strcmp(ap->name, "size")) parm = G_PART_PARM_SIZE; else if (!strcmp(ap->name, "start")) parm = G_PART_PARM_START; break; case 't': if (!strcmp(ap->name, "type")) parm = G_PART_PARM_TYPE; break; case 'v': if (!strcmp(ap->name, "verb")) continue; else if (!strcmp(ap->name, "version")) parm = G_PART_PARM_VERSION; break; } if ((parm & (mparms | oparms)) == 0) { gctl_error(req, "%d param '%s'", EINVAL, ap->name); return; } switch (parm) { case G_PART_PARM_ATTRIB: error = g_part_parm_str(req, ap->name, &gpp.gpp_attrib); break; case G_PART_PARM_BOOTCODE: error = g_part_parm_bootcode(req, ap->name, &gpp.gpp_codeptr, &gpp.gpp_codesize); break; case G_PART_PARM_ENTRIES: error = g_part_parm_intmax(req, ap->name, &gpp.gpp_entries); break; case G_PART_PARM_FLAGS: error = g_part_parm_str(req, ap->name, &gpp.gpp_flags); break; case G_PART_PARM_FORCE: error = g_part_parm_uint32(req, ap->name, &gpp.gpp_force); break; case G_PART_PARM_GEOM: error = g_part_parm_geom(req, ap->name, &gpp.gpp_geom); break; case G_PART_PARM_INDEX: error = g_part_parm_intmax(req, ap->name, &gpp.gpp_index); break; case G_PART_PARM_LABEL: error = g_part_parm_str(req, ap->name, &gpp.gpp_label); break; case G_PART_PARM_OUTPUT: error = 0; /* Write-only parameter */ break; case G_PART_PARM_PROVIDER: error = g_part_parm_provider(req, ap->name, &gpp.gpp_provider); break; case G_PART_PARM_SCHEME: error = g_part_parm_scheme(req, ap->name, &gpp.gpp_scheme); break; case G_PART_PARM_SIZE: error = g_part_parm_quad(req, ap->name, &gpp.gpp_size); break; case G_PART_PARM_START: error = g_part_parm_quad(req, ap->name, &gpp.gpp_start); break; case G_PART_PARM_TYPE: error = g_part_parm_str(req, ap->name, &gpp.gpp_type); break; case G_PART_PARM_VERSION: error = 
g_part_parm_uint32(req, ap->name, &gpp.gpp_version); break; default: error = EDOOFUS; gctl_error(req, "%d %s", error, ap->name); break; } if (error != 0) { if (error == ENOATTR) { gctl_error(req, "%d param '%s'", error, ap->name); } return; } gpp.gpp_parms |= parm; } if ((gpp.gpp_parms & mparms) != mparms) { parm = mparms - (gpp.gpp_parms & mparms); gctl_error(req, "%d param '%x'", ENOATTR, parm); return; } /* Obtain permissions if possible/necessary. */ close_on_error = 0; table = NULL; if (modifies && (gpp.gpp_parms & G_PART_PARM_GEOM)) { table = gpp.gpp_geom->softc; if (table != NULL && table->gpt_corrupt && ctlreq != G_PART_CTL_DESTROY && ctlreq != G_PART_CTL_RECOVER) { gctl_error(req, "%d table '%s' is corrupt", EPERM, gpp.gpp_geom->name); return; } if (table != NULL && !table->gpt_opened) { error = g_access(LIST_FIRST(&gpp.gpp_geom->consumer), 1, 1, 1); if (error) { gctl_error(req, "%d geom '%s'", error, gpp.gpp_geom->name); return; } table->gpt_opened = 1; close_on_error = 1; } } /* Allow the scheme to check or modify the parameters. */ if (table != NULL) { error = G_PART_PRECHECK(table, ctlreq, &gpp); if (error) { gctl_error(req, "%d pre-check failed", error); goto out; } } else error = EDOOFUS; /* Prevent bogus uninit. warning. 
*/ switch (ctlreq) { case G_PART_CTL_NONE: panic("%s", __func__); case G_PART_CTL_ADD: error = g_part_ctl_add(req, &gpp); break; case G_PART_CTL_BOOTCODE: error = g_part_ctl_bootcode(req, &gpp); break; case G_PART_CTL_COMMIT: error = g_part_ctl_commit(req, &gpp); break; case G_PART_CTL_CREATE: error = g_part_ctl_create(req, &gpp); break; case G_PART_CTL_DELETE: error = g_part_ctl_delete(req, &gpp); break; case G_PART_CTL_DESTROY: error = g_part_ctl_destroy(req, &gpp); break; case G_PART_CTL_MODIFY: error = g_part_ctl_modify(req, &gpp); break; case G_PART_CTL_MOVE: error = g_part_ctl_move(req, &gpp); break; case G_PART_CTL_RECOVER: error = g_part_ctl_recover(req, &gpp); break; case G_PART_CTL_RESIZE: error = g_part_ctl_resize(req, &gpp); break; case G_PART_CTL_SET: error = g_part_ctl_setunset(req, &gpp, 1); break; case G_PART_CTL_UNDO: error = g_part_ctl_undo(req, &gpp); break; case G_PART_CTL_UNSET: error = g_part_ctl_setunset(req, &gpp, 0); break; } /* Implement automatic commit. */ if (!error) { auto_commit = (modifies && (gpp.gpp_parms & G_PART_PARM_FLAGS) && strchr(gpp.gpp_flags, 'C') != NULL) ? 1 : 0; if (auto_commit) { KASSERT(gpp.gpp_parms & G_PART_PARM_GEOM, ("%s", __func__)); error = g_part_ctl_commit(req, &gpp); } } out: if (error && close_on_error) { g_access(LIST_FIRST(&gpp.gpp_geom->consumer), -1, -1, -1); table->gpt_opened = 0; } } static int g_part_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp) { G_PART_TRACE((G_T_TOPOLOGY, "%s(%s,%s)", __func__, mp->name, gp->name)); g_topology_assert(); g_part_wither(gp, EINVAL); return (0); } static struct g_geom * g_part_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_consumer *cp; struct g_geom *gp; struct g_part_entry *entry; struct g_part_table *table; struct root_hold_token *rht; int attr, depth; int error; G_PART_TRACE((G_T_TOPOLOGY, "%s(%s,%s)", __func__, mp->name, pp->name)); g_topology_assert(); /* Skip providers that are already open for writing. 
*/ if (pp->acw > 0) return (NULL); /* * Create a GEOM with consumer and hook it up to the provider. * With that we become part of the topology. Optain read access * to the provider. */ gp = g_new_geomf(mp, "%s", pp->name); cp = g_new_consumer(gp); cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; error = g_attach(cp, pp); if (error == 0) error = g_access(cp, 1, 0, 0); if (error != 0) { if (cp->provider) g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); return (NULL); } rht = root_mount_hold(mp->name); g_topology_unlock(); /* * Short-circuit the whole probing galore when there's no * media present. */ if (pp->mediasize == 0 || pp->sectorsize == 0) { error = ENODEV; goto fail; } /* Make sure we can nest and if so, determine our depth. */ error = g_getattr("PART::isleaf", cp, &attr); if (!error && attr) { error = ENODEV; goto fail; } error = g_getattr("PART::depth", cp, &attr); depth = (!error) ? attr + 1 : 0; error = g_part_probe(gp, cp, depth); if (error) goto fail; table = gp->softc; /* * Synthesize a disk geometry. Some partitioning schemes * depend on it and since some file systems need it even * when the partitition scheme doesn't, we do it here in * scheme-independent code. */ g_part_geometry(table, cp, pp->mediasize / pp->sectorsize); error = G_PART_READ(table, cp); if (error) goto fail; error = g_part_check_integrity(table, cp); if (error) goto fail; g_topology_lock(); LIST_FOREACH(entry, &table->gpt_entry, gpe_entry) { if (!entry->gpe_internal) g_part_new_provider(gp, table, entry); } root_mount_rel(rht); g_access(cp, -1, 0, 0); return (gp); fail: g_topology_lock(); root_mount_rel(rht); g_access(cp, -1, 0, 0); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); return (NULL); } /* * Geom methods. 
*/

/*
 * Geom access method: forward an access-count change on a partition
 * provider to the geom's first consumer.  The write delta is also added
 * to the exclusive delta, so every writer below is counted as exclusive.
 * Returns whatever g_access() returns.
 */
static int
g_part_access(struct g_provider *pp, int dr, int dw, int de)
{
	struct g_consumer *cp;

	G_PART_TRACE((G_T_ACCESS, "%s(%s,%d,%d,%d)", __func__, pp->name, dr,
	    dw, de));

	cp = LIST_FIRST(&pp->geom->consumer);

	/* We always gain write-exclusive access. */
	return (g_access(cp, dr, dw, dw + de));
}

/*
 * Geom dumpconf method.  Which object is described depends on the
 * arguments:
 *   indent == NULL            - one-line summary for provider pp
 *   cp != NULL                - consumer (nothing is emitted)
 *   pp != NULL                - provider details
 *   otherwise                 - geom (table) details
 * In each case the scheme's G_PART_DUMPCONF is invoked afterwards so it
 * can append its own data.
 *
 * NOTE(review): the format strings in the provider and geom branches
 * print only "<indent><value>\n" with no element names around the
 * values; this looks like markup may have been stripped from this copy
 * of the file - confirm against the upstream source before relying on it.
 */
static void
g_part_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
    struct g_consumer *cp, struct g_provider *pp)
{
	char buf[64];
	struct g_part_entry *entry;
	struct g_part_table *table;

	KASSERT(sb != NULL && gp != NULL, ("%s", __func__));
	table = gp->softc;

	if (indent == NULL) {
		KASSERT(cp == NULL && pp != NULL, ("%s", __func__));
		entry = pp->private;
		if (entry == NULL)
			return;
		sbuf_printf(sb, " i %u o %ju ty %s", entry->gpe_index,
		    (uintmax_t)entry->gpe_offset,
		    G_PART_TYPE(table, entry, buf, sizeof(buf)));
		/*
		 * libdisk compatibility quirk - the scheme dumps the
		 * slicer name and partition type in a way that is
		 * compatible with libdisk. When libdisk is not used
		 * anymore, this should go away.
		 */
		G_PART_DUMPCONF(table, entry, sb, indent);
	} else if (cp != NULL) {	/* Consumer configuration. */
		KASSERT(pp == NULL, ("%s", __func__));
		/* none */
	} else if (pp != NULL) {	/* Provider configuration. */
		entry = pp->private;
		if (entry == NULL)
			return;
		sbuf_printf(sb, "%s%ju\n", indent,
		    (uintmax_t)entry->gpe_start);
		sbuf_printf(sb, "%s%ju\n", indent,
		    (uintmax_t)entry->gpe_end);
		sbuf_printf(sb, "%s%u\n", indent, entry->gpe_index);
		sbuf_printf(sb, "%s%s\n", indent,
		    G_PART_TYPE(table, entry, buf, sizeof(buf)));
		sbuf_printf(sb, "%s%ju\n", indent,
		    (uintmax_t)entry->gpe_offset);
		sbuf_printf(sb, "%s%ju\n", indent,
		    (uintmax_t)pp->mediasize);
		G_PART_DUMPCONF(table, entry, sb, indent);
	} else {			/* Geom configuration. */
		sbuf_printf(sb, "%s%s\n", indent,
		    table->gpt_scheme->name);
		sbuf_printf(sb, "%s%u\n", indent,
		    table->gpt_entries);
		sbuf_printf(sb, "%s%ju\n", indent,
		    (uintmax_t)table->gpt_first);
		sbuf_printf(sb, "%s%ju\n", indent,
		    (uintmax_t)table->gpt_last);
		sbuf_printf(sb, "%s%u\n", indent,
		    table->gpt_sectors);
		sbuf_printf(sb, "%s%u\n", indent,
		    table->gpt_heads);
		sbuf_printf(sb, "%s%s\n", indent,
		    table->gpt_corrupt ? "CORRUPT": "OK");
		sbuf_printf(sb, "%s%s\n", indent,
		    table->gpt_opened ? "true": "false");
		G_PART_DUMPCONF(table, NULL, sb, indent);
	}
}

/*-
 * This start routine is only called for non-trivial requests, all the
 * trivial ones are handled autonomously by the slice code.
 * For requests we handle here, we must call the g_io_deliver() on the
 * bio, and return non-zero to indicate to the slice code that we did so.
 * This code executes in the "DOWN" I/O path, this means:
 *    * No sleeping.
 *    * Don't grab the topology lock.
 *    * Don't call biowait, g_getattr(), g_setattr() or g_read_data()
 *
 * NOTE(review): the text above talks about a start routine, but the
 * function that follows is the ioctl method; the comment appears to be
 * misplaced - confirm and relocate or drop.
 */

/*
 * Geom ioctl method: hand the ioctl straight to the partitioning scheme
 * of the provider's geom and return its result.
 */
static int
g_part_ioctl(struct g_provider *pp, u_long cmd, void *data, int fflag,
    struct thread *td)
{
	struct g_part_table *table;

	table = pp->geom->softc;
	return G_PART_IOCTL(table, pp, cmd, data, fflag, td);
}

/*
 * Geom resize method, invoked for consumer cp.
 *
 * Opens the consumer r1w1e1 if the table is not already opened (and
 * gives up silently if that fails), asks the scheme to resize the
 * table, and announces a successful automatic resize on the console.
 * If the table then fails the integrity check, the access is dropped
 * and the geom is withered with ENXIO.
 */
static void
g_part_resize(struct g_consumer *cp)
{
	struct g_part_table *table;

	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, cp->provider->name));
	g_topology_assert();

	table = cp->geom->softc;
	if (table->gpt_opened == 0) {
		if (g_access(cp, 1, 1, 1) != 0)
			return;
		table->gpt_opened = 1;
	}
	if (G_PART_RESIZE(table, NULL, NULL) == 0)
		printf("GEOM_PART: %s was automatically resized.\n"
		    " Use `gpart commit %s` to save changes or "
		    "`gpart undo %s` to revert them.\n", cp->geom->name,
		    cp->geom->name, cp->geom->name);
	if (g_part_check_integrity(table, cp) != 0) {
		g_access(cp, -1, -1, -1);
		table->gpt_opened = 0;
		g_part_wither(table->gpt_gp, ENXIO);
	}
}

/*
 * Geom orphan method: the provider below has gone away.  Drop the
 * r1w1e1 access held for an opened table, then wither the geom with the
 * provider's error.
 */
static void
g_part_orphan(struct g_consumer *cp)
{
	struct g_provider *pp;
	struct g_part_table *table;

	pp = cp->provider;
	KASSERT(pp != NULL, ("%s", __func__));
	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, pp->name));
	g_topology_assert();

	KASSERT(pp->error != 0, ("%s", __func__));
	table = cp->geom->softc;
	if (table != NULL && table->gpt_opened)
		g_access(cp, -1, -1, -1);
	g_part_wither(cp->geom, pp->error);
}

/*
 * Geom spoiled method: mark the consumer orphaned and wither the geom
 * with ENXIO.
 */
static void
g_part_spoiled(struct g_consumer *cp)
{

	G_PART_TRACE((G_T_TOPOLOGY, "%s(%s)", __func__, cp->provider->name));
	g_topology_assert();

	cp->flags |= G_CF_ORPHAN;
	g_part_wither(cp->geom, ENXIO);
}

/*
 * Geom start method: handle a bio addressed to a partition provider.
 *
 * - Providers without an entry fail with ENXIO.
 * - BIO_DELETE/READ/WRITE: requests starting at or beyond the end of the
 *   partition fail with EIO; otherwise the bio is cloned, its length is
 *   clipped to the partition, its offset is shifted by the entry's
 *   offset, and the clone is sent to the consumer.
 * - BIO_FLUSH: cloned and passed down unchanged.
 * - BIO_GETATTR: geometry, PART::* attributes and (after validation and
 *   offset translation) GEOM::kerneldump are handled; anything not
 *   answered here is cloned and passed down.
 * - Any other command fails with EOPNOTSUPP.
 *
 * NOTE(review): the two lines introduced by '+' below are diff addition
 * markers carried by this rendering of the file, kept as found.
 */
static void
g_part_start(struct bio *bp)
{
	struct bio *bp2;
	struct g_consumer *cp;
	struct g_geom *gp;
	struct g_part_entry *entry;
	struct g_part_table *table;
	struct g_kerneldump *gkd;
	struct g_provider *pp;
	char buf[64];

+	biotrack(bp, __func__);
+
	pp = bp->bio_to;
	gp = pp->geom;
	table = gp->softc;
	cp = LIST_FIRST(&gp->consumer);

	G_PART_TRACE((G_T_BIO, "%s: cmd=%d, provider=%s", __func__, bp->bio_cmd,
	    pp->name));

	entry = pp->private;
	if (entry == NULL) {
		g_io_deliver(bp, ENXIO);
		return;
	}

	switch(bp->bio_cmd) {
	case BIO_DELETE:
	case BIO_READ:
	case BIO_WRITE:
		if (bp->bio_offset >= pp->mediasize) {
			g_io_deliver(bp, EIO);
			return;
		}
		bp2 = g_clone_bio(bp);
		if (bp2 == NULL) {
			g_io_deliver(bp, ENOMEM);
			return;
		}
		/* Clip requests that run past the end of the partition. */
		if (bp2->bio_offset + bp2->bio_length > pp->mediasize)
			bp2->bio_length = pp->mediasize - bp2->bio_offset;
		bp2->bio_done = g_std_done;
		/* Translate partition-relative offset to the underlying provider. */
		bp2->bio_offset += entry->gpe_offset;
		g_io_request(bp2, cp);
		return;
	case BIO_FLUSH:
		break;
	case BIO_GETATTR:
		if (g_handleattr_int(bp, "GEOM::fwheads", table->gpt_heads))
			return;
		if (g_handleattr_int(bp, "GEOM::fwsectors", table->gpt_sectors))
			return;
		if (g_handleattr_int(bp, "PART::isleaf", table->gpt_isleaf))
			return;
		if (g_handleattr_int(bp, "PART::depth", table->gpt_depth))
			return;
		if (g_handleattr_str(bp, "PART::scheme",
		    table->gpt_scheme->name))
			return;
		if (g_handleattr_str(bp, "PART::type",
		    G_PART_TYPE(table, entry, buf, sizeof(buf))))
			return;
		if (!strcmp("GEOM::kerneldump", bp->bio_attribute)) {
			/*
			 * Check that the partition is suitable for kernel
			 * dumps. Typically only swap partitions should be
			 * used. If the request comes from the nested scheme
			 * we allow dumping there as well.
			 */
			if ((bp->bio_from == NULL ||
			    bp->bio_from->geom->class != &g_part_class) &&
			    G_PART_DUMPTO(table, entry) == 0) {
				g_io_deliver(bp, ENODEV);
				printf("GEOM_PART: Partition '%s' not suitable"
				    " for kernel dumps (wrong type?)\n",
				    pp->name);
				return;
			}
			gkd = (struct g_kerneldump *)bp->bio_data;
			if (gkd->offset >= pp->mediasize) {
				g_io_deliver(bp, EIO);
				return;
			}
			/* Clip and translate the dump window like regular I/O. */
			if (gkd->offset + gkd->length > pp->mediasize)
				gkd->length = pp->mediasize - gkd->offset;
			gkd->offset += entry->gpe_offset;
		}
		break;
	default:
		g_io_deliver(bp, EOPNOTSUPP);
		return;
	}

	/* BIO_FLUSH and unanswered BIO_GETATTR requests go down as they are. */
	bp2 = g_clone_bio(bp);
	if (bp2 == NULL) {
		g_io_deliver(bp, ENOMEM);
		return;
	}
	bp2->bio_done = g_std_done;
	g_io_request(bp2, cp);
}

/*
 * Class init method: put the null scheme at the head of the scheme list.
 */
static void
g_part_init(struct g_class *mp)
{

	TAILQ_INSERT_HEAD(&g_part_schemes, &g_part_null_scheme, scheme_list);
}

/*
 * Class fini method: take the null scheme off the scheme list again.
 */
static void
g_part_fini(struct g_class *mp)
{

	TAILQ_REMOVE(&g_part_schemes, &g_part_null_scheme, scheme_list);
}

/*
 * Event handler used to unregister a scheme.
 *
 * arg points to a uintptr_t that carries the scheme pointer in and the
 * resulting error out.  Every geom using the scheme is withered with
 * ENOSYS if none of its providers or consumers has any access counts;
 * if any such geom is still in use the result is EBUSY and the scheme
 * stays registered.  A cancelled event (EV_CANCEL) does nothing.
 */
static void
g_part_unload_event(void *arg, int flag)
{
	struct g_consumer *cp;
	struct g_geom *gp;
	struct g_provider *pp;
	struct g_part_scheme *scheme;
	struct g_part_table *table;
	uintptr_t *xchg;
	int acc, error;

	if (flag == EV_CANCEL)
		return;

	xchg = arg;
	error = 0;
	scheme = (void *)(*xchg);

	g_topology_assert();

	LIST_FOREACH(gp, &g_part_class.geom, geom) {
		table = gp->softc;
		if (table->gpt_scheme != scheme)
			continue;

		/* Sum all access counts; non-zero means the geom is in use. */
		acc = 0;
		LIST_FOREACH(pp, &gp->provider, provider)
			acc += pp->acr + pp->acw + pp->ace;
		LIST_FOREACH(cp, &gp->consumer, consumer)
			acc += cp->acr + cp->acw + cp->ace;

		if (!acc)
			g_part_wither(gp, ENOSYS);
		else
			error = EBUSY;
	}

	if (!error)
		TAILQ_REMOVE(&g_part_schemes, scheme, scheme_list);

	*xchg = error;
}

/*
 * Module event handler shared by the partitioning scheme modules.
 *
 * MOD_LOAD registers the scheme (unless it is already on the list, in
 * which case a message is printed and 0 is still returned) and asks the
 * class to retaste.  MOD_UNLOAD runs g_part_unload_event() through
 * g_waitfor_event() and returns either that call's error or the result
 * the event handler stored in arg.  Other event types yield EOPNOTSUPP.
 */
int
g_part_modevent(module_t mod, int type, struct g_part_scheme *scheme)
{
	struct g_part_scheme *iter;
	uintptr_t arg;
	int error;

	error = 0;
	switch (type) {
	case MOD_LOAD:
		TAILQ_FOREACH(iter, &g_part_schemes, scheme_list) {
			if (scheme == iter) {
				printf("GEOM_PART: scheme %s is already "
				    "registered!\n", scheme->name);
				break;
			}
		}
		/* iter is NULL only if the loop ran off the end: not found. */
		if (iter == NULL) {
			TAILQ_INSERT_TAIL(&g_part_schemes, scheme,
			    scheme_list);
			g_retaste(&g_part_class);
		}
		break;
	case MOD_UNLOAD:
		arg = (uintptr_t)scheme;
		error = g_waitfor_event(g_part_unload_event, &arg, M_WAITOK,
		    NULL);
		if (error == 0)
			error = arg;
		break;
	default:
		error = EOPNOTSUPP;
		break;
	}

	return (error);
}
Index: head/sys/kern/vfs_bio.c
===================================================================
--- head/sys/kern/vfs_bio.c (revision 308154)
+++ head/sys/kern/vfs_bio.c (revision 308155)
@@ -1,4902 +1,4933 @@
/*-
 * Copyright (c) 2004 Poul-Henning Kamp
 * Copyright (c) 1994,1997 John S. Dyson
 * Copyright (c) 2013 The FreeBSD Foundation
 * All rights reserved.
 *
 * Portions of this software were developed by Konstantin Belousov
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * this file contains a new buffer I/O scheme implementing a coherent * VM object and buffer cache scheme. Pains have been taken to make * sure that the performance degradation associated with schemes such * as this is not realized. * * Author: John S. Dyson * Significant help during the development and debugging phases * had been provided by David Greenman, also of the FreeBSD core team. * * see man buf(9) for more info. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_compat.h" #include "opt_swap.h" static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer"); struct bio_ops bioops; /* I/O operation notification */ struct buf_ops buf_ops_bio = { .bop_name = "buf_ops_bio", .bop_write = bufwrite, .bop_strategy = bufstrategy, .bop_sync = bufsync, .bop_bdflush = bufbdflush, }; static struct buf *buf; /* buffer header pool */ extern struct buf *swbuf; /* Swap buffer header pool. 
*/ caddr_t unmapped_buf; /* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */ struct proc *bufdaemonproc; struct proc *bufspacedaemonproc; static int inmem(struct vnode *vp, daddr_t blkno); static void vm_hold_free_pages(struct buf *bp, int newbsize); static void vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to); static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m); static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m); static void vfs_clean_pages_dirty_buf(struct buf *bp); static void vfs_setdirty_locked_object(struct buf *bp); static void vfs_vmio_invalidate(struct buf *bp); static void vfs_vmio_truncate(struct buf *bp, int npages); static void vfs_vmio_extend(struct buf *bp, int npages, int size); static int vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno); static int buf_flush(struct vnode *vp, int); static int buf_recycle(bool); static int buf_scan(bool); static int flushbufqueues(struct vnode *, int, int); static void buf_daemon(void); static void bremfreel(struct buf *bp); static __inline void bd_wakeup(void); static int sysctl_runningspace(SYSCTL_HANDLER_ARGS); static void bufkva_reclaim(vmem_t *, int); static void bufkva_free(struct buf *); static int buf_import(void *, void **, int, int); static void buf_release(void *, void **, int); #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) static int sysctl_bufspace(SYSCTL_HANDLER_ARGS); #endif int vmiodirenable = TRUE; SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0, "Use the VM system for directory writes"); long runningbufspace; SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0, "Amount of presently outstanding async buffer io"); static long bufspace; #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) 
SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD, &bufspace, 0, sysctl_bufspace, "L", "Virtual memory used for buffers"); #else SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0, "Physical memory used for buffers"); #endif static long bufkvaspace; SYSCTL_LONG(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace, 0, "Kernel virtual memory used for buffers"); static long maxbufspace; SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW, &maxbufspace, 0, "Maximum allowed value of bufspace (including metadata)"); static long bufmallocspace; SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0, "Amount of malloced memory for buffers"); static long maxbufmallocspace; SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0, "Maximum amount of malloced memory for buffers"); static long lobufspace; SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RW, &lobufspace, 0, "Minimum amount of buffers we want to have"); long hibufspace; SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RW, &hibufspace, 0, "Maximum allowed value of bufspace (excluding metadata)"); long bufspacethresh; SYSCTL_LONG(_vfs, OID_AUTO, bufspacethresh, CTLFLAG_RW, &bufspacethresh, 0, "Bufspace consumed before waking the daemon to free some"); static int buffreekvacnt; SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0, "Number of times we have freed the KVA space from some buffer"); static int bufdefragcnt; SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0, "Number of times we have had to repeat buffer allocation to defragment"); static long lorunningspace; SYSCTL_PROC(_vfs, OID_AUTO, lorunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE | CTLFLAG_RW, &lorunningspace, 0, sysctl_runningspace, "L", "Minimum preferred space used for in-progress I/O"); static long hirunningspace; SYSCTL_PROC(_vfs, OID_AUTO, hirunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE | CTLFLAG_RW, &hirunningspace, 0, 
sysctl_runningspace, "L", "Maximum amount of space to use for in-progress I/O"); int dirtybufferflushes; SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes, 0, "Number of bdwrite to bawrite conversions to limit dirty buffers"); int bdwriteskip; SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip, 0, "Number of buffers supplied to bdwrite with snapshot deadlock risk"); int altbufferflushes; SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes, 0, "Number of fsync flushes to limit dirty buffers"); static int recursiveflushes; SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW, &recursiveflushes, 0, "Number of flushes skipped due to being recursive"); static int numdirtybuffers; SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0, "Number of buffers that are dirty (has unwritten changes) at the moment"); static int lodirtybuffers; SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0, "How many buffers we want to have free before bufdaemon can sleep"); static int hidirtybuffers; SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0, "When the number of dirty buffers is considered severe"); int dirtybufthresh; SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RW, &dirtybufthresh, 0, "Number of bdwrite to bawrite conversions to clear dirty buffers"); static int numfreebuffers; SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0, "Number of free buffers"); static int lofreebuffers; SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0, "Target number of free buffers"); static int hifreebuffers; SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0, "Threshold for clean buffer recycling"); static int getnewbufcalls; SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0, "Number of calls to getnewbuf"); static int getnewbufrestarts; SYSCTL_INT(_vfs, OID_AUTO, 
getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0, "Number of times getnewbuf has had to restart a buffer acquisition"); static int mappingrestarts; SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0, "Number of times getblk has had to restart a buffer mapping for " "unmapped buffer"); static int numbufallocfails; SYSCTL_INT(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW, &numbufallocfails, 0, "Number of times buffer allocations failed"); static int flushbufqtarget = 100; SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0, "Amount of work to do in flushbufqueues when helping bufdaemon"); static long notbufdflushes; SYSCTL_LONG(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, ¬bufdflushes, 0, "Number of dirty buffer flushes done by the bufdaemon helpers"); static long barrierwrites; SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0, "Number of barrier writes"); SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD, &unmapped_buf_allowed, 0, "Permit the use of the unmapped i/o"); /* * This lock synchronizes access to bd_request. */ static struct mtx_padalign bdlock; /* * This lock protects the runningbufreq and synchronizes runningbufwakeup and * waitrunningbufspace(). */ static struct mtx_padalign rbreqlock; /* * Lock that protects needsbuffer and the sleeps/wakeups surrounding it. */ static struct rwlock_padalign nblock; /* * Lock that protects bdirtywait. */ static struct mtx_padalign bdirtylock; /* * Wakeup point for bufdaemon, as well as indicator of whether it is already * active. Set to 1 when the bufdaemon is already "on" the queue, 0 when it * is idling. */ static int bd_request; /* * Request/wakeup point for the bufspace daemon. */ static int bufspace_request; /* * Request for the buf daemon to write more buffers than is indicated by * lodirtybuf. 
This may be necessary to push out excess dependencies or * defragment the address space where a simple count of the number of dirty * buffers is insufficient to characterize the demand for flushing them. */ static int bd_speedupreq; /* * bogus page -- for I/O to/from partially complete buffers * this is a temporary solution to the problem, but it is not * really that bad. it would be better to split the buffer * for input in the case of buffers partially already in memory, * but the code is intricate enough already. */ vm_page_t bogus_page; /* * Synchronization (sleep/wakeup) variable for active buffer space requests. * Set when wait starts, cleared prior to wakeup(). * Used in runningbufwakeup() and waitrunningbufspace(). */ static int runningbufreq; /* * Synchronization (sleep/wakeup) variable for buffer requests. * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done * by and/or. * Used in numdirtywakeup(), bufspace_wakeup(), bwillwrite(), * getnewbuf(), and getblk(). */ static volatile int needsbuffer; /* * Synchronization for bwillwrite() waiters. */ static int bdirtywait; /* * Definitions for the buffer free lists. */ #define QUEUE_NONE 0 /* on no queue */ #define QUEUE_EMPTY 1 /* empty buffer headers */ #define QUEUE_DIRTY 2 /* B_DELWRI buffers */ #define QUEUE_CLEAN 3 /* non-B_DELWRI buffers */ #define QUEUE_SENTINEL 1024 /* not an queue index, but mark for sentinel */ /* Maximum number of clean buffer queues. */ #define CLEAN_QUEUES 16 /* Configured number of clean queues. */ static int clean_queues; /* Maximum number of buffer queues. */ #define BUFFER_QUEUES (QUEUE_CLEAN + CLEAN_QUEUES) /* Queues for free buffers with various properties */ static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } }; #ifdef INVARIANTS static int bq_len[BUFFER_QUEUES]; #endif /* * Lock for each bufqueue */ static struct mtx_padalign bqlocks[BUFFER_QUEUES]; /* * per-cpu empty buffer cache. 
*/ uma_zone_t buf_zone; /* * Single global constant for BUF_WMESG, to avoid getting multiple references. * buf_wmesg is referred from macros. */ const char *buf_wmesg = BUF_WMESG; static int sysctl_runningspace(SYSCTL_HANDLER_ARGS) { long value; int error; value = *(long *)arg1; error = sysctl_handle_long(oidp, &value, 0, req); if (error != 0 || req->newptr == NULL) return (error); mtx_lock(&rbreqlock); if (arg1 == &hirunningspace) { if (value < lorunningspace) error = EINVAL; else hirunningspace = value; } else { KASSERT(arg1 == &lorunningspace, ("%s: unknown arg1", __func__)); if (value > hirunningspace) error = EINVAL; else lorunningspace = value; } mtx_unlock(&rbreqlock); return (error); } #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) static int sysctl_bufspace(SYSCTL_HANDLER_ARGS) { long lvalue; int ivalue; if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long)) return (sysctl_handle_long(oidp, arg1, arg2, req)); lvalue = *(long *)arg1; if (lvalue > INT_MAX) /* On overflow, still write out a long to trigger ENOMEM. */ return (sysctl_handle_long(oidp, &lvalue, 0, req)); ivalue = lvalue; return (sysctl_handle_int(oidp, &ivalue, 0, req)); } #endif static int bqcleanq(void) { static int nextq; return ((atomic_fetchadd_int(&nextq, 1) % clean_queues) + QUEUE_CLEAN); } static int bqisclean(int qindex) { return (qindex >= QUEUE_CLEAN && qindex < QUEUE_CLEAN + CLEAN_QUEUES); } /* * bqlock: * * Return the appropriate queue lock based on the index. */ static inline struct mtx * bqlock(int qindex) { return (struct mtx *)&bqlocks[qindex]; } /* * bdirtywakeup: * * Wakeup any bwillwrite() waiters. */ static void bdirtywakeup(void) { mtx_lock(&bdirtylock); if (bdirtywait) { bdirtywait = 0; wakeup(&bdirtywait); } mtx_unlock(&bdirtylock); } /* * bdirtysub: * * Decrement the numdirtybuffers count by one and wakeup any * threads blocked in bwillwrite(). 
*/
static void
bdirtysub(void)
{

	/*
	 * The comparison is against the value before the decrement, so the
	 * wakeup is issued only by the caller that finds the count exactly
	 * at the midpoint of the low and high dirty marks.
	 */
	if (atomic_fetchadd_int(&numdirtybuffers, -1) ==
	    (lodirtybuffers + hidirtybuffers) / 2)
		bdirtywakeup();
}

/*
 *	bdirtyadd:
 *
 *	Increment the numdirtybuffers count by one and wakeup the buf
 *	daemon if needed.
 */
static void
bdirtyadd(void)
{

	/*
	 * Only do the wakeup once as we cross the boundary.  The
	 * buf daemon will keep running until the condition clears.
	 */
	if (atomic_fetchadd_int(&numdirtybuffers, 1) ==
	    (lodirtybuffers + hidirtybuffers) / 2)
		bd_wakeup();
}

/*
 *	bufspace_wakeup:
 *
 *	Called when buffer space is potentially available for recovery.
 *	getnewbuf() will block on this flag when it is unable to free
 *	sufficient buffer space.  Buffer space becomes recoverable when
 *	bp's get placed back in the queues.
 */
static void
bufspace_wakeup(void)
{

	/*
	 * If someone is waiting for bufspace, wake them up.
	 *
	 * Since needsbuffer is set prior to doing an additional queue
	 * scan it is safe to check for the flag prior to acquiring the
	 * lock.  The thread that is preparing to scan again before
	 * blocking would discover the buf we released.
	 */
	if (needsbuffer) {
		rw_rlock(&nblock);
		/* Only the caller that flips the flag 1 -> 0 issues the wakeup. */
		if (atomic_cmpset_int(&needsbuffer, 1, 0) == 1)
			wakeup(__DEVOLATILE(void *, &needsbuffer));
		rw_runlock(&nblock);
	}
}

/*
 *	bufspace_daemonwakeup:
 *
 *	Wakeup the daemon responsible for freeing clean bufs.
 */
static void
bufspace_daemonwakeup(void)
{
	rw_rlock(&nblock);
	/* A request that is already pending needs no second wakeup. */
	if (bufspace_request == 0) {
		bufspace_request = 1;
		wakeup(&bufspace_request);
	}
	rw_runlock(&nblock);
}

/*
 *	bufspace_adjust:
 *
 *	Adjust the reported bufspace for a KVA managed buffer, possibly
 *	waking any waiters.
 *
 *	Shrinking wakes threads waiting for space; growing wakes the
 *	bufspace daemon when the total crosses bufspacethresh.  The new
 *	size is recorded in bp->b_bufsize.
 */
static void
bufspace_adjust(struct buf *bp, int bufsize)
{
	long space;
	int diff;

	KASSERT((bp->b_flags & B_MALLOC) == 0,
	    ("bufspace_adjust: malloc buf %p", bp));
	diff = bufsize - bp->b_bufsize;
	if (diff < 0) {
		atomic_subtract_long(&bufspace, -diff);
		bufspace_wakeup();
	} else {
		/* space is the total before this buffer's growth was added. */
		space = atomic_fetchadd_long(&bufspace, diff);
		/* Wake up the daemon on the transition. */
		if (space < bufspacethresh && space + diff >= bufspacethresh)
			bufspace_daemonwakeup();
	}
	bp->b_bufsize = bufsize;
}

/*
 *	bufspace_reserve:
 *
 *	Reserve bufspace before calling allocbuf().  metadata has a
 *	different space limit than data.
 *
 *	Returns 0 on success, or ENOSPC if adding size would exceed the
 *	applicable limit (maxbufspace for metadata, hibufspace otherwise).
 */
static int
bufspace_reserve(int size, bool metadata)
{
	long limit;
	long space;

	if (metadata)
		limit = maxbufspace;
	else
		limit = hibufspace;
	/* Retry until the compare-and-set installs our increment. */
	do {
		space = bufspace;
		if (space + size > limit)
			return (ENOSPC);
	} while (atomic_cmpset_long(&bufspace, space, space + size) == 0);

	/* Wake up the daemon on the transition. */
	if (space < bufspacethresh && space + size >= bufspacethresh)
		bufspace_daemonwakeup();

	return (0);
}

/*
 *	bufspace_release:
 *
 *	Release reserved bufspace after bufspace_adjust() has consumed it.
 */
static void
bufspace_release(int size)
{
	atomic_subtract_long(&bufspace, size);
	bufspace_wakeup();
}

/*
 *	bufspace_wait:
 *
 *	Wait for bufspace, acting as the buf daemon if a locked vnode is
 *	supplied.  needsbuffer must be set in a safe fashion prior to
 *	polling for space.  The operation must be re-tried on return.
 *
 *	Returns immediately when GB_NOWAIT_BD is set in gbflags.  slpflag
 *	and slptimeo are passed on to rw_sleep(); a sleep that returns an
 *	error ends the wait.
 */
static void
bufspace_wait(struct vnode *vp, int gbflags, int slpflag, int slptimeo)
{
	struct thread *td;
	int error, fl, norunbuf;

	if ((gbflags & GB_NOWAIT_BD) != 0)
		return;

	td = curthread;
	rw_wlock(&nblock);
	while (needsbuffer != 0) {
		if (vp != NULL && vp->v_type != VCHR &&
		    (td->td_pflags & TDP_BUFNEED) == 0) {
			rw_wunlock(&nblock);
			/*
			 * getblk() is called with a vnode locked, and
			 * some majority of the dirty buffers may as
			 * well belong to the vnode.  Flushing the
			 * buffers there would make a progress that
			 * cannot be achieved by the buf_daemon, that
			 * cannot lock the vnode.
			 */
			/*
			 * Mask used afterwards to clear TDP_BUFNEED and to
			 * restore TDP_NORUNNINGBUF to its previous state.
			 */
			norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) |
			    (td->td_pflags & TDP_NORUNNINGBUF);

			/*
			 * Play bufdaemon.  The getnewbuf() function
			 * may be called while the thread owns lock
			 * for another dirty buffer for the same
			 * vnode, which makes it impossible to use
			 * VOP_FSYNC() there, due to the buffer lock
			 * recursion.
			 */
			td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
			fl = buf_flush(vp, flushbufqtarget);
			td->td_pflags &= norunbuf;
			rw_wlock(&nblock);
			/* Something was flushed: re-evaluate instead of sleeping. */
			if (fl != 0)
				continue;
			if (needsbuffer == 0)
				break;
		}
		error = rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock,
		    (PRIBIO + 4) | slpflag, "newbuf", slptimeo);
		if (error != 0)
			break;
	}
	rw_wunlock(&nblock);
}

/*
 *	bufspace_daemon:
 *
 *	buffer space management daemon.  Tries to maintain some marginal
 *	amount of free buffer space so that requesting processes neither
 *	block nor work to reclaim buffers.
 */
static void
bufspace_daemon(void)
{
	for (;;) {
		kproc_suspend_check(bufspacedaemonproc);

		/*
		 * Free buffers from the clean queue until we meet our
		 * targets.
		 *
		 * Theory of operation:  The buffer cache is most efficient
		 * when some free buffer headers and space are always
		 * available to getnewbuf().  This daemon attempts to prevent
		 * the excessive blocking and synchronization associated
		 * with shortfall.  It goes through three phases according
		 * to demand:
		 *
		 * 1)	The daemon wakes up voluntarily once per-second
		 *	during idle periods when the counters are below
		 *	the wakeup thresholds (bufspacethresh, lofreebuffers).
		 *
		 * 2)	The daemon wakes up as we cross the thresholds
		 *	ahead of any potential blocking.  This may bounce
		 *	slightly according to the rate of consumption and
		 *	release.
		 *
		 * 3)	The daemon and consumers are starved for working
		 *	clean buffers.  This is the 'bufspace' sleep below
		 *	which will inefficiently trade bufs with bqrelse
		 *	until we return to condition 2.
		 */
		while (bufspace > lobufspace ||
		    numfreebuffers < hifreebuffers) {
			if (buf_recycle(false) != 0) {
				/*
				 * Nothing could be recycled.  Flag the need
				 * first, then scan once more before sleeping
				 * (see the ordering argument in
				 * bufspace_wakeup()).
				 */
				atomic_set_int(&needsbuffer, 1);
				if (buf_recycle(false) != 0) {
					rw_wlock(&nblock);
					/* PDROP: the sleep does not retake nblock. */
					if (needsbuffer)
						rw_sleep(__DEVOLATILE(void *,
						    &needsbuffer), &nblock,
						    PRIBIO|PDROP, "bufspace",
						    hz/10);
					else
						rw_wunlock(&nblock);
				}
			}
			maybe_yield();
		}

		/*
		 * Re-check our limits under the exclusive nblock.
		 */
		rw_wlock(&nblock);
		if (bufspace < bufspacethresh &&
		    numfreebuffers > lofreebuffers) {
			bufspace_request = 0;
			rw_sleep(&bufspace_request, &nblock, PRIBIO|PDROP,
			    "-", hz);
		} else
			rw_wunlock(&nblock);
	}
}

/* Descriptor handed to kproc_start() to launch bufspace_daemon(). */
static struct kproc_desc bufspace_kp = {
	"bufspacedaemon",
	bufspace_daemon,
	&bufspacedaemonproc
};
SYSINIT(bufspacedaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start,
    &bufspace_kp);

/*
 *	bufmallocadjust:
 *
 *	Adjust the reported bufspace for a malloc managed buffer, possibly
 *	waking any waiters.
 *
 *	NOTE(review): the body only updates bufmallocspace and
 *	bp->b_bufsize; no wakeup is issued here - the "waking any waiters"
 *	wording above may be stale.
 */
static void
bufmallocadjust(struct buf *bp, int bufsize)
{
	int diff;

	KASSERT((bp->b_flags & B_MALLOC) != 0,
	    ("bufmallocadjust: non-malloc buf %p", bp));
	diff = bufsize - bp->b_bufsize;
	if (diff < 0)
		atomic_subtract_long(&bufmallocspace, -diff);
	else
		atomic_add_long(&bufmallocspace, diff);
	bp->b_bufsize = bufsize;
}

/*
 *	runningwakeup:
 *
 *	Wake up processes that are waiting on asynchronous writes to fall
 *	below lorunningspace.
 */
static void
runningwakeup(void)
{

	mtx_lock(&rbreqlock);
	if (runningbufreq) {
		runningbufreq = 0;
		wakeup(&runningbufreq);
	}
	mtx_unlock(&rbreqlock);
}

/*
 *	runningbufwakeup:
 *
 *	Decrement the outstanding write count accordingly.
 *
 *	The buffer's b_runningbufspace is subtracted from runningbufspace
 *	and reset to zero; buffers with nothing accounted are ignored.
 */
void
runningbufwakeup(struct buf *bp)
{
	long space, bspace;

	bspace = bp->b_runningbufspace;
	if (bspace == 0)
		return;
	/* space is the total before this buffer's share was removed. */
	space = atomic_fetchadd_long(&runningbufspace, -bspace);
	KASSERT(space >= bspace, ("runningbufspace underflow %ld %ld",
	    space, bspace));
	bp->b_runningbufspace = 0;
	/*
	 * Only acquire the lock and wakeup on the transition from exceeding
	 * the threshold to falling below it.
	 */
	if (space < lorunningspace)
		return;
	if (space - bspace > lorunningspace)
		return;
	runningwakeup();
}

/*
 *	waitrunningbufspace()
 *
 *	runningbufspace is a measure of the amount of I/O currently
 *	running.  This routine is used in async-write situations to
 *	prevent creating huge backups of pending writes to a device.
 *	Only asynchronous writes are governed by this function.
 *
 *	This does NOT turn an async write into a sync write.
It waits
 *	for earlier writes to complete and generally returns before the
 *	caller's write has reached the device.
 */
void
waitrunningbufspace(void)
{

	mtx_lock(&rbreqlock);
	while (runningbufspace > hirunningspace) {
		/* Flag that a sleeper exists before sleeping on its address. */
		runningbufreq = 1;
		msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0);
	}
	mtx_unlock(&rbreqlock);
}

/*
 *	vfs_buf_test_cache:
 *
 *	Called when a buffer is extended.  This function clears the B_CACHE
 *	bit if the newly extended portion of the buffer does not contain
 *	valid data.
 */
static __inline void
vfs_buf_test_cache(struct buf *bp, vm_ooffset_t foff, vm_offset_t off,
    vm_offset_t size, vm_page_t m)
{

	VM_OBJECT_ASSERT_LOCKED(m->object);
	if (bp->b_flags & B_CACHE) {
		/* Offset of the range within page m. */
		int base = (foff + off) & PAGE_MASK;
		if (vm_page_is_valid(m, base, size) == 0)
			bp->b_flags &= ~B_CACHE;
	}
}

/* Wake up the buffer daemon if necessary */
static __inline void
bd_wakeup(void)
{

	mtx_lock(&bdlock);
	if (bd_request == 0) {
		bd_request = 1;
		wakeup(&bd_request);
	}
	mtx_unlock(&bdlock);
}

/*
 * bd_speedup - speedup the buffer cache flushing code
 *
 * Sets both bd_speedupreq and bd_request, and issues a wakeup only if
 * at least one of them was still clear on entry.
 */
void
bd_speedup(void)
{
	int needwake;

	mtx_lock(&bdlock);
	needwake = 0;
	if (bd_speedupreq == 0 || bd_request == 0)
		needwake = 1;
	bd_speedupreq = 1;
	bd_request = 1;
	if (needwake)
		wakeup(&bd_request);
	mtx_unlock(&bdlock);
}

/* Lower bound applied to nswbuf in kern_vfs_bio_buffer_alloc(). */
#ifndef NSWBUF_MIN
#define	NSWBUF_MIN	16
#endif

/*
 * Denominator of the buffer-map fraction handed to the transient bio
 * map: 1/5 on i386, 1/10 elsewhere.
 */
#ifdef __i386__
#define	TRANSIENT_DENOM	5
#else
#define	TRANSIENT_DENOM 10
#endif

/*
 * Calculating buffer cache scaling values and reserve space for buffer
 * headers.  This is called during low level kernel initialization and
 * may be called more than once.  We CANNOT write to the memory area
 * being reserved at this time.
 *
 * v is the start of the area being carved up; the return value is the
 * first address past the swbuf and buf header arrays.
 */
caddr_t
kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est)
{
	int tuned_nbuf;
	long maxbuf, maxbuf_sz, buf_sz, biotmap_sz;

	/*
	 * physmem_est is in pages.  Convert it to kilobytes (assumes
	 * PAGE_SIZE is >= 1K)
	 */
	physmem_est = physmem_est * (PAGE_SIZE / 1024);

	/*
	 * The nominal buffer size (and minimum KVA allocation) is BKVASIZE.
	 * For the first 64MB of ram nominally allocate sufficient buffers to
	 * cover 1/4 of our ram.  Beyond the first 64MB allocate additional
	 * buffers to cover 1/10 of our ram over 64MB.  When auto-sizing
	 * the buffer cache we limit the eventual kva reservation to
	 * maxbcache bytes.
	 *
	 * factor represents the 1/4 x ram conversion.
	 */
	if (nbuf == 0) {
		int factor = 4 * BKVASIZE / 1024;

		nbuf = 50;
		if (physmem_est > 4096)
			nbuf += min((physmem_est - 4096) / factor,
			    65536 / factor);
		if (physmem_est > 65536)
			nbuf += min((physmem_est - 65536) * 2 / (factor * 5),
			    32 * 1024 * 1024 / (factor * 5));

		if (maxbcache && nbuf > maxbcache / BKVASIZE)
			nbuf = maxbcache / BKVASIZE;
		tuned_nbuf = 1;
	} else
		tuned_nbuf = 0;

	/* XXX Avoid unsigned long overflows later on with maxbufspace. */
	maxbuf = (LONG_MAX / 3) / BKVASIZE;
	if (nbuf > maxbuf) {
		/* Only warn when nbuf was preset rather than auto-sized above. */
		if (!tuned_nbuf)
			printf("Warning: nbufs lowered from %d to %ld\n", nbuf,
			    maxbuf);
		nbuf = maxbuf;
	}

	/*
	 * Ideal allocation size for the transient bio submap is 10%
	 * of the maximal space buffer map.  This roughly corresponds
	 * to the amount of the buffer mapped for typical UFS load.
	 *
	 * Clip the buffer map to reserve space for the transient
	 * BIOs, if its extent is bigger than 90% (80% on i386) of the
	 * maximum buffer map extent on the platform.
	 *
	 * The fall-back to the maxbuf in case of maxbcache unset,
	 * allows to not trim the buffer KVA for the architectures
	 * with ample KVA space.
	 */
	if (bio_transient_maxcnt == 0 && unmapped_buf_allowed) {
		maxbuf_sz = maxbcache != 0 ? maxbcache : maxbuf * BKVASIZE;
		buf_sz = (long)nbuf * BKVASIZE;
		if (buf_sz < maxbuf_sz / TRANSIENT_DENOM *
		    (TRANSIENT_DENOM - 1)) {
			/*
			 * There is more KVA than memory.  Do not
			 * adjust buffer map size, and assign the rest
			 * of maxbuf to transient map.
			 */
			biotmap_sz = maxbuf_sz - buf_sz;
		} else {
			/*
			 * Buffer map spans all KVA we could afford on
			 * this platform.  Give 10% (20% on i386) of
			 * the buffer map to the transient bio map.
			 */
			biotmap_sz = buf_sz / TRANSIENT_DENOM;
			buf_sz -= biotmap_sz;
		}
		/* Clamp to INT_MAX before storing the count. */
		if (biotmap_sz / INT_MAX > MAXPHYS)
			bio_transient_maxcnt = INT_MAX;
		else
			bio_transient_maxcnt = biotmap_sz / MAXPHYS;
		/*
		 * Artificially limit to 1024 simultaneous in-flight I/Os
		 * using the transient mapping.
		 */
		if (bio_transient_maxcnt > 1024)
			bio_transient_maxcnt = 1024;
		if (tuned_nbuf)
			nbuf = buf_sz / BKVASIZE;
	}

	/*
	 * swbufs are used as temporary holders for I/O, such as paging I/O.
	 * We have no less than 16 and no more than 256.
	 */
	nswbuf = min(nbuf / 4, 256);
	TUNABLE_INT_FETCH("kern.nswbuf", &nswbuf);
	if (nswbuf < NSWBUF_MIN)
		nswbuf = NSWBUF_MIN;

	/*
	 * Reserve space for the buffer cache buffers
	 */
	swbuf = (void *)v;
	v = (caddr_t)(swbuf + nswbuf);
	buf = (void *)v;
	v = (caddr_t)(buf + nbuf);

	return(v);
}

/* Initialize the buffer subsystem.  Called before use of any buffers. */
void
bufinit(void)
{
	struct buf *bp;
	int i;

	CTASSERT(MAXBCACHEBUF >= MAXBSIZE);
	mtx_init(&bqlocks[QUEUE_DIRTY], "bufq dirty lock", NULL, MTX_DEF);
	mtx_init(&bqlocks[QUEUE_EMPTY], "bufq empty lock", NULL, MTX_DEF);
	for (i = QUEUE_CLEAN; i < QUEUE_CLEAN + CLEAN_QUEUES; i++)
		mtx_init(&bqlocks[i], "bufq clean lock", NULL, MTX_DEF);
	mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
	rw_init(&nblock, "needsbuffer lock");
	mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
	mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF);

	/* next, make a null set of free lists */
	for (i = 0; i < BUFFER_QUEUES; i++)
		TAILQ_INIT(&bufqueues[i]);

	/* Every buffer without KVA of its own points b_data/b_kvabase here. */
	unmapped_buf = (caddr_t)kva_alloc(MAXPHYS);

	/* finally, initialize each buffer header and stick on empty q */
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bzero(bp, sizeof *bp);
		bp->b_flags = B_INVAL;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_qindex = QUEUE_EMPTY;
		bp->b_xflags = 0;
		bp->b_data = bp->b_kvabase = unmapped_buf;
		LIST_INIT(&bp->b_dep);
		BUF_LOCKINIT(bp);
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
#ifdef INVARIANTS
		bq_len[QUEUE_EMPTY]++;
#endif
	}

	/*
	 * maxbufspace is the absolute maximum amount of buffer space we are
	 * allowed to reserve in KVM and in real terms.  The absolute maximum
	 * is nominally used by metadata.  hibufspace is the nominal maximum
	 * used by most other requests.  The differential is required to
	 * ensure that metadata deadlocks don't occur.
	 *
	 * maxbufspace is based on BKVASIZE.  Allocating buffers larger than
	 * this may result in KVM fragmentation which is not handled optimally
	 * by the system. XXX This is less true with vmem.  We could use
	 * PAGE_SIZE.
	 */
	maxbufspace = (long)nbuf * BKVASIZE;
	hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - MAXBCACHEBUF * 10);
	lobufspace = (hibufspace / 20) * 19; /* 95% */
	/* Daemon wakeup threshold: halfway between lobufspace and hibufspace. */
	bufspacethresh = lobufspace + (hibufspace - lobufspace) / 2;

	/*
	 * Note: The 16 MiB upper limit for hirunningspace was chosen
	 * arbitrarily and may need further tuning. It corresponds to
	 * 128 outstanding write IO requests (if IO size is 128 KiB),
	 * which fits with many RAID controllers' tagged queuing limits.
	 * The lower 1 MiB limit is the historical upper limit for
	 * hirunningspace.
	 */
	hirunningspace = lmax(lmin(roundup(hibufspace / 64, MAXBCACHEBUF),
	    16 * 1024 * 1024), 1024 * 1024);
	lorunningspace = roundup((hirunningspace * 2) / 3, MAXBCACHEBUF);

	/*
	 * Limit the amount of malloc memory since it is wired permanently into
	 * the kernel space.  Even though this is accounted for in the buffer
	 * allocation, we don't want the malloced region to grow uncontrolled.
	 * The malloc scheme improves memory utilization significantly on
	 * average (small) directories.
	 */
	maxbufmallocspace = hibufspace / 20;

	/*
	 * Reduce the chance of a deadlock occurring by limiting the number
	 * of delayed-write dirty buffers we allow to stack up.
	 */
	hidirtybuffers = nbuf / 4 + 20;
	dirtybufthresh = hidirtybuffers * 9 / 10;
	numdirtybuffers = 0;
	/*
	 * To support extreme low-memory systems, make sure hidirtybuffers
	 * cannot eat up all available buffer space.  This occurs when our
	 * minimum cannot be met.  We try to size hidirtybuffers to 3/4 our
	 * buffer space assuming BKVASIZE'd buffers.
	 */
	while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) {
		hidirtybuffers >>= 1;
	}
	lodirtybuffers = hidirtybuffers / 2;

	/*
	 * lofreebuffers should be sufficient to avoid stalling waiting on
	 * buf headers under heavy utilization.  The bufs in per-cpu caches
	 * are counted as free but will be unavailable to threads executing
	 * on other cpus.
	 *
	 * hifreebuffers is the free target for the bufspace daemon.  This
	 * should be set appropriately to limit work per-iteration.
	 */
	lofreebuffers = MIN((nbuf / 25) + (20 * mp_ncpus), 128 * mp_ncpus);
	hifreebuffers = (3 * lofreebuffers) / 2;
	numfreebuffers = nbuf;

	bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ |
	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED);

	/* Setup the kva and free list allocators. */
	vmem_set_reclaim(buffer_arena, bufkva_reclaim);
	buf_zone = uma_zcache_create("buf free cache", sizeof(struct buf),
	    NULL, NULL, NULL, NULL, buf_import, buf_release, NULL, 0);

	/*
	 * Size the clean queue according to the amount of buffer space.
	 * One queue per-256mb up to the max.  More queues gives better
	 * concurrency but less accurate LRU.
	 */
	clean_queues = MIN(howmany(maxbufspace, 256*1024*1024), CLEAN_QUEUES);
}

#ifdef INVARIANTS
/*
 * Assert that a buffer expected to be mapped has had b_kvabase and
 * b_data moved off unmapped_buf, and that b_data lies outside the
 * MAXPHYS-sized unmapped_buf range.
 */
static inline void
vfs_buf_check_mapped(struct buf *bp)
{

	KASSERT(bp->b_kvabase != unmapped_buf,
	    ("mapped buf: b_kvabase was not updated %p", bp));
	KASSERT(bp->b_data != unmapped_buf,
	    ("mapped buf: b_data was not updated %p", bp));
	KASSERT(bp->b_data < unmapped_buf || bp->b_data >= unmapped_buf +
	    MAXPHYS, ("b_data + b_offset unmapped %p", bp));
}

/* Assert that an unmapped buffer's b_data still equals unmapped_buf. */
static inline void
vfs_buf_check_unmapped(struct buf *bp)
{

	KASSERT(bp->b_data == unmapped_buf,
	    ("unmapped buf: corrupted b_data %p", bp));
}

#define	BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp)
#define	BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp)
#else
/* Without INVARIANTS the checks expand to empty statements. */
#define	BUF_CHECK_MAPPED(bp) do {} while (0)
#define	BUF_CHECK_UNMAPPED(bp) do {} while (0)
#endif

/*
 * Return 1 if the buffer is not B_INVAL and is locked, or if it has
 * B_DELWRI set without B_INVAL; return 0 otherwise.
 */
static int
isbufbusy(struct buf *bp)
{

	if (((bp->b_flags & B_INVAL) == 0 && BUF_ISLOCKED(bp)) ||
	    ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI))
		return (1);
	return (0);
}

/*
 * Shutdown the system cleanly to prepare for reboot, halt, or power off.
 *
 * show_busybufs > 0 prints one line per buffer that is still busy after
 * the sync loop; > 1 additionally prints the owning vnode.
 */
void
bufshutdown(int show_busybufs)
{
	static int first_buf_printf = 1;
	struct buf *bp;
	int iter, nbusy, pbusy;
#ifndef PREEMPTION
	int subiter;
#endif

	/*
	 * Sync filesystems for shutdown
	 */
	wdog_kern_pat(WD_LASTVAL);
	sys_sync(curthread, NULL);

	/*
	 * With soft updates, some buffers that are
	 * written will be remarked as dirty until other
	 * buffers are written.
	 */
	for (iter = pbusy = 0; iter < 20; iter++) {
		nbusy = 0;
		for (bp = &buf[nbuf]; --bp >= buf; )
			if (isbufbusy(bp))
				nbusy++;
		if (nbusy == 0) {
			if (first_buf_printf)
				printf("All buffers synced.");
			break;
		}
		if (first_buf_printf) {
			printf("Syncing disks, buffers remaining... ");
			first_buf_printf = 0;
		}
		printf("%d ", nbusy);
		/* Progress was made: restart the 20-iteration budget. */
		if (nbusy < pbusy)
			iter = 0;
		pbusy = nbusy;

		wdog_kern_pat(WD_LASTVAL);
		sys_sync(curthread, NULL);

#ifdef PREEMPTION
		/*
		 * Drop Giant and spin for a while to allow
		 * interrupt threads to run.
		 */
		DROP_GIANT();
		DELAY(50000 * iter);
		PICKUP_GIANT();
#else
		/*
		 * Drop Giant and context switch several times to
		 * allow interrupt threads to run.
		 */
		DROP_GIANT();
		for (subiter = 0; subiter < 50 * iter; subiter++) {
			thread_lock(curthread);
			mi_switch(SW_VOL, NULL);
			thread_unlock(curthread);
			DELAY(1000);
		}
		PICKUP_GIANT();
#endif
	}
	printf("\n");
	/*
	 * Count only busy local buffers to prevent forcing
	 * a fsck if we're just a client of a wedged NFS server
	 */
	nbusy = 0;
	for (bp = &buf[nbuf]; --bp >= buf; ) {
		if (isbufbusy(bp)) {
#if 0
/* XXX: This is bogus.  We should probably have a BO_REMOTE flag instead */
			if (bp->b_dev == NULL) {
				TAILQ_REMOVE(&mountlist,
				    bp->b_vp->v_mount, mnt_list);
				continue;
			}
#endif
			nbusy++;
			if (show_busybufs > 0) {
				printf(
	    "%d: buf:%p, vnode:%p, flags:%0x, blkno:%jd, lblkno:%jd, buflock:",
				    nbusy, bp, bp->b_vp, bp->b_flags,
				    (intmax_t)bp->b_blkno,
				    (intmax_t)bp->b_lblkno);
				BUF_LOCKPRINTINFO(bp);
				if (show_busybufs > 1)
					vn_printf(bp->b_vp,
					    "vnode content: ");
			}
		}
	}
	if (nbusy) {
		/*
		 * Failed to sync all blocks. Indicate this and don't
		 * unmount filesystems (thus forcing an fsck on reboot).
		 */
		printf("Giving up on %d buffers\n", nbusy);
		DELAY(5000000);	/* 5 seconds */
	} else {
		if (!first_buf_printf)
			printf("Final sync complete\n");
		/*
		 * Unmount filesystems
		 */
		if (panicstr == NULL)
			vfs_unmountall();
	}
	swapoff_all();
	DELAY(100000);		/* wait for console output to finish */
}

/*
 * Enter the buffer's pages (b_pages, b_npages) at the page-truncated
 * b_data address via pmap_qenter(), then put the intra-page part of
 * b_offset back into b_data.
 */
static void
bpmap_qenter(struct buf *bp)
{

	BUF_CHECK_MAPPED(bp);

	/*
	 * bp->b_data is relative to bp->b_offset, but
	 * bp->b_offset may be offset into the first page.
	 */
	bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data);
	pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages);
	bp->b_data = (caddr_t)((vm_offset_t)bp->b_data |
	    (vm_offset_t)(bp->b_offset & PAGE_MASK));
}

/*
 *	binsfree:
 *
 *	Insert the buffer into the appropriate free list.
*/
static void
binsfree(struct buf *bp, int qindex)
{
	struct mtx *olock, *nlock;

	if (qindex != QUEUE_EMPTY) {
		BUF_ASSERT_XLOCKED(bp);
	}

	/*
	 * Stick to the same clean queue for the lifetime of the buf to
	 * limit locking below.  Otherwise pick one sequentially.
	 */
	if (qindex == QUEUE_CLEAN) {
		if (bqisclean(bp->b_qindex))
			qindex = bp->b_qindex;
		else
			qindex = bqcleanq();
	}

	/*
	 * Handle delayed bremfree() processing.
	 */
	nlock = bqlock(qindex);
	if (bp->b_flags & B_REMFREE) {
		olock = bqlock(bp->b_qindex);
		mtx_lock(olock);
		bremfreel(bp);
		/* Swap locks only when the source and target queues differ. */
		if (olock != nlock) {
			mtx_unlock(olock);
			mtx_lock(nlock);
		}
	} else
		mtx_lock(nlock);

	if (bp->b_qindex != QUEUE_NONE)
		panic("binsfree: free buffer onto another queue???");

	bp->b_qindex = qindex;
	/* B_AGE buffers go to the head of the queue, all others to the tail. */
	if (bp->b_flags & B_AGE)
		TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
	else
		TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
#ifdef INVARIANTS
	bq_len[bp->b_qindex]++;
#endif
	mtx_unlock(nlock);
}

/*
 * buf_free:
 *
 *	Free a buffer to the buf zone once it no longer has valid contents.
 *
 *	Completes any pending delayed queue removal, drops the read and
 *	write credentials, runs buf_deallocate() if b_dep is non-empty,
 *	frees the KVA, unlocks the buffer and hands it to buf_zone.
 */
static void
buf_free(struct buf *bp)
{

	if (bp->b_flags & B_REMFREE)
		bremfreef(bp);
	if (bp->b_vflags & BV_BKGRDINPROG)
		panic("losing buffer 1");
	if (bp->b_rcred != NOCRED) {
		crfree(bp->b_rcred);
		bp->b_rcred = NOCRED;
	}
	if (bp->b_wcred != NOCRED) {
		crfree(bp->b_wcred);
		bp->b_wcred = NOCRED;
	}
	if (!LIST_EMPTY(&bp->b_dep))
		buf_deallocate(bp);
	bufkva_free(bp);
	BUF_UNLOCK(bp);
	uma_zfree(buf_zone, bp);
	atomic_add_int(&numfreebuffers, 1);
	bufspace_wakeup();
}

/*
 * buf_import:
 *
 *	Import bufs into the uma cache from the buf list.  The system still
 *	expects a static array of bufs and much of the synchronization
 *	around bufs assumes type stable storage.  As a result, UMA is used
 *	only as a per-cpu cache of bufs still maintained on a global list.
*/
static int
buf_import(void *arg, void **store, int cnt, int flags)
{
	struct buf *bp;
	int i;

	mtx_lock(&bqlocks[QUEUE_EMPTY]);
	/* Take up to cnt bufs; stop early if the empty queue runs dry. */
	for (i = 0; i < cnt; i++) {
		bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
		if (bp == NULL)
			break;
		bremfreel(bp);
		store[i] = bp;
	}
	mtx_unlock(&bqlocks[QUEUE_EMPTY]);

	/* Number of entries actually placed in store[]. */
	return (i);
}

/*
 * buf_release:
 *
 *	Release bufs from the uma cache back to the buffer queues.
 */
static void
buf_release(void *arg, void **store, int cnt)
{
	int i;

	for (i = 0; i < cnt; i++)
		binsfree(store[i], QUEUE_EMPTY);
}

/*
 * buf_alloc:
 *
 *	Allocate an empty buffer header.
 *
 *	Returns NULL, after waking the bufspace daemon and counting the
 *	failure in numbufallocfails, when buf_zone has nothing to give.
 *	Otherwise the buffer is returned exclusively locked with its
 *	fields reset.
 */
static struct buf *
buf_alloc(void)
{
	struct buf *bp;

	bp = uma_zalloc(buf_zone, M_NOWAIT);
	if (bp == NULL) {
		bufspace_daemonwakeup();
		atomic_add_int(&numbufallocfails, 1);
		return (NULL);
	}

	/*
	 * Wake-up the bufspace daemon on transition.
	 */
	if (atomic_fetchadd_int(&numfreebuffers, -1) == lofreebuffers)
		bufspace_daemonwakeup();

	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
		panic("getnewbuf_empty: Locked buf %p on free queue.", bp);

	KASSERT(bp->b_vp == NULL,
	    ("bp: %p still has vnode %p.", bp, bp->b_vp));
	KASSERT((bp->b_flags & (B_DELWRI | B_NOREUSE)) == 0,
	    ("invalid buffer %p flags %#x", bp, bp->b_flags));
	KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0,
	    ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags));
	KASSERT(bp->b_npages == 0,
	    ("bp: %p still has %d vm pages\n", bp, bp->b_npages));
	KASSERT(bp->b_kvasize == 0, ("bp: %p still has kva\n", bp));
	KASSERT(bp->b_bufsize == 0, ("bp: %p still has bufspace\n", bp));

	bp->b_flags = 0;
	bp->b_ioflags = 0;
	bp->b_xflags = 0;
	bp->b_vflags = 0;
	bp->b_vp = NULL;
	bp->b_blkno = bp->b_lblkno = 0;
	bp->b_offset = NOOFFSET;
	bp->b_iodone = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	bp->b_bcount = 0;
	bp->b_npages = 0;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_bufobj = NULL;
	bp->b_data = bp->b_kvabase = unmapped_buf;
	bp->b_fsprivate1 = NULL;
	bp->b_fsprivate2 = NULL;
	bp->b_fsprivate3 = NULL;
	LIST_INIT(&bp->b_dep);

	return (bp);
}

/*
 *	buf_qrecycle:
 *
 *	Free a buffer from the given bufqueue.  kva controls whether the
 *	freed buf must own some kva resources.  This is used for
 *	defragmenting.
 *
 *	Returns 0 after releasing one buffer with B_INVAL set, or ENOBUFS
 *	when the scan reaches the end of the queue without doing so.
 */
static int
buf_qrecycle(int qindex, bool kva)
{
	struct buf *bp, *nbp;

	if (kva)
		atomic_add_int(&bufdefragcnt, 1);
	nbp = NULL;
	mtx_lock(&bqlocks[qindex]);
	nbp = TAILQ_FIRST(&bufqueues[qindex]);

	/*
	 * Run scan, possibly freeing data and/or kva mappings on the fly
	 * depending.
	 */
	while ((bp = nbp) != NULL) {
		/*
		 * Calculate next bp (we can only use it if we do not
		 * release the bqlock).
		 */
		nbp = TAILQ_NEXT(bp, b_freelist);

		/*
		 * If we are defragging then we need a buffer with
		 * some kva to reclaim.
		 */
		if (kva && bp->b_kvasize == 0)
			continue;

		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
			continue;

		/*
		 * Skip buffers with background writes in progress.
		 */
		if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
			BUF_UNLOCK(bp);
			continue;
		}

		KASSERT(bp->b_qindex == qindex,
		    ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
		/*
		 * NOTE:  nbp is now entirely invalid.  We can only restart
		 * the scan from this point on.
		 */
		bremfreel(bp);
		mtx_unlock(&bqlocks[qindex]);

		/*
		 * Requeue the background write buffer with error and
		 * restart the scan.
		 */
		if ((bp->b_vflags & BV_BKGRDERR) != 0) {
			bqrelse(bp);
			mtx_lock(&bqlocks[qindex]);
			nbp = TAILQ_FIRST(&bufqueues[qindex]);
			continue;
		}
		bp->b_flags |= B_INVAL;
		brelse(bp);
		return (0);
	}
	mtx_unlock(&bqlocks[qindex]);

	return (ENOBUFS);
}

/*
 *	buf_recycle:
 *
 *	Iterate through all clean queues until we find a buf to recycle or
 *	exhaust the search.
 */
static int
buf_recycle(bool kva)
{
	int qindex, first_qindex;

	qindex = first_qindex = bqcleanq();
	do {
		if (buf_qrecycle(qindex, kva) == 0)
			return (0);
		/* Wrap around to the first clean queue after the last one. */
		if (++qindex == QUEUE_CLEAN + clean_queues)
			qindex = QUEUE_CLEAN;
	} while (qindex != first_qindex);

	return (ENOBUFS);
}

/*
 *	buf_scan:
 *
 *	Scan the clean queues looking for a buffer to recycle.  needsbuffer
 *	is set on failure so that the caller may optionally bufspace_wait()
 *	in a race-free fashion.
 */
static int
buf_scan(bool defrag)
{
	int error;

	/*
	 * To avoid heavy synchronization and wakeup races we set
	 * needsbuffer and re-poll before failing.  This ensures that
	 * no frees can be missed between an unsuccessful poll and
	 * going to sleep in a synchronized fashion.
	 */
	if ((error = buf_recycle(defrag)) != 0) {
		atomic_set_int(&needsbuffer, 1);
		bufspace_daemonwakeup();
		error = buf_recycle(defrag);
	}
	if (error == 0)
		atomic_add_int(&getnewbufrestarts, 1);
	return (error);
}

/*
 *	bremfree:
 *
 *	Mark the buffer for removal from the appropriate free list.
 *
 */
void
bremfree(struct buf *bp)
{

	CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
	KASSERT((bp->b_flags & B_REMFREE) == 0,
	    ("bremfree: buffer %p already marked for delayed removal.", bp));
	KASSERT(bp->b_qindex != QUEUE_NONE,
	    ("bremfree: buffer %p not on a queue.", bp));
	BUF_ASSERT_XLOCKED(bp);

	/* Only the flag is set here; no queue is touched in this function. */
	bp->b_flags |= B_REMFREE;
}

/*
 *	bremfreef:
 *
 *	Force an immediate removal from a free list.  Used only in nfs when
 *	it abuses the b_freelist pointer.
*/
void
bremfreef(struct buf *bp)
{
	struct mtx *qlock;

	qlock = bqlock(bp->b_qindex);
	mtx_lock(qlock);
	bremfreel(bp);
	mtx_unlock(qlock);
}

/*
 *	bremfreel:
 *
 *	Removes a buffer from the free list, must be called with the
 *	correct qlock held.
 */
static void
bremfreel(struct buf *bp)
{

	CTR3(KTR_BUF, "bremfreel(%p) vp %p flags %X",
	    bp, bp->b_vp, bp->b_flags);
	KASSERT(bp->b_qindex != QUEUE_NONE,
	    ("bremfreel: buffer %p not on a queue.", bp));
	if (bp->b_qindex != QUEUE_EMPTY) {
		BUF_ASSERT_XLOCKED(bp);
	}
	mtx_assert(bqlock(bp->b_qindex), MA_OWNED);

	TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
#ifdef INVARIANTS
	KASSERT(bq_len[bp->b_qindex] >= 1, ("queue %d underflow",
	    bp->b_qindex));
	bq_len[bp->b_qindex]--;
#endif
	bp->b_qindex = QUEUE_NONE;
	/* Any pending delayed-removal request is satisfied by this removal. */
	bp->b_flags &= ~B_REMFREE;
}

/*
 *	bufkva_free:
 *
 *	Free the kva allocation for a buffer.
 *
 */
static void
bufkva_free(struct buf *bp)
{

#ifdef INVARIANTS
	if (bp->b_kvasize == 0) {
		KASSERT(bp->b_kvabase == unmapped_buf &&
		    bp->b_data == unmapped_buf,
		    ("Leaked KVA space on %p", bp));
	} else if (buf_mapped(bp))
		BUF_CHECK_MAPPED(bp);
	else
		BUF_CHECK_UNMAPPED(bp);
#endif
	/* Nothing to release for a buffer that holds no KVA. */
	if (bp->b_kvasize == 0)
		return;

	vmem_free(buffer_arena, (vm_offset_t)bp->b_kvabase, bp->b_kvasize);
	atomic_subtract_long(&bufkvaspace, bp->b_kvasize);
	atomic_add_int(&buffreekvacnt, 1);
	bp->b_data = bp->b_kvabase = unmapped_buf;
	bp->b_kvasize = 0;
}

/*
 *	bufkva_alloc:
 *
 *	Allocate the buffer KVA and set b_kvasize and b_kvabase.
 *
 *	Any KVA the buffer already holds is freed first.  Returns 0 on
 *	success or the vmem_alloc() error; with GB_UNMAPPED, b_data is left
 *	at unmapped_buf instead of pointing at the new KVA.
 */
static int
bufkva_alloc(struct buf *bp, int maxsize, int gbflags)
{
	vm_offset_t addr;
	int error;

	KASSERT((gbflags & GB_UNMAPPED) == 0 || (gbflags & GB_KVAALLOC) != 0,
	    ("Invalid gbflags 0x%x in %s", gbflags, __func__));

	bufkva_free(bp);

	addr = 0;
	error = vmem_alloc(buffer_arena, maxsize, M_BESTFIT | M_NOWAIT, &addr);
	if (error != 0) {
		/*
		 * Buffer map is too fragmented.  Request the caller
		 * to defragment the map.
		 */
		return (error);
	}
	bp->b_kvabase = (caddr_t)addr;
	bp->b_kvasize = maxsize;
	atomic_add_long(&bufkvaspace, bp->b_kvasize);
	if ((gbflags & GB_UNMAPPED) != 0) {
		bp->b_data = unmapped_buf;
		BUF_CHECK_UNMAPPED(bp);
	} else {
		bp->b_data = bp->b_kvabase;
		BUF_CHECK_MAPPED(bp);
	}
	return (0);
}

/*
 *	bufkva_reclaim:
 *
 *	Reclaim buffer kva by freeing buffers holding kva.  This is a vmem
 *	callback that fires to avoid returning failure.
 */
static void
bufkva_reclaim(vmem_t *vmem, int flags)
{
	int i;

	/* At most five scans; give up at the first one that fails. */
	for (i = 0; i < 5; i++)
		if (buf_scan(true) != 0)
			break;
	return;
}

/*
 * Attempt to initiate asynchronous I/O on read-ahead blocks.  We must
 * clear BIO_ERROR and B_INVAL prior to initiating I/O . If B_CACHE is set,
 * the buffer is valid and we do not have to do anything.
 *
 * rablkno and rabsize are parallel arrays of cnt entries.
 */
void
breada(struct vnode * vp, daddr_t * rablkno, int * rabsize,
    int cnt, struct ucred * cred)
{
	struct buf *rabp;
	int i;

	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
		if (inmem(vp, *rablkno))
			continue;
		rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0);

		if ((rabp->b_flags & B_CACHE) == 0) {
			if (!TD_IS_IDLETHREAD(curthread)) {
#ifdef RACCT
				if (racct_enable) {
					PROC_LOCK(curproc);
					racct_add_buf(curproc, rabp, 0);
					PROC_UNLOCK(curproc);
				}
#endif /* RACCT */
				curthread->td_ru.ru_inblock++;
			}
			rabp->b_flags |= B_ASYNC;
			rabp->b_flags &= ~B_INVAL;
			rabp->b_ioflags &= ~BIO_ERROR;
			rabp->b_iocmd = BIO_READ;
			if (rabp->b_rcred == NOCRED && cred != NOCRED)
				rabp->b_rcred = crhold(cred);
			vfs_busy_pages(rabp, 0);
			BUF_KERNPROC(rabp);
			rabp->b_iooffset = dbtob(rabp->b_blkno);
			bstrategy(rabp);
		} else {
			brelse(rabp);
		}
	}
}

/*
 * Entry point for bread() and breadn() via #defines in sys/buf.h.
 *
 * Get a buffer with the specified data.  Look in the cache first.  We
 * must clear BIO_ERROR and B_INVAL prior to initiating I/O.  If B_CACHE
 * is set, the buffer is valid and we do not have to do anything, see
 * getblk(). Also starts asynchronous I/O on read-ahead blocks.
 *
 * Always return a NULL buffer pointer (in bpp) when returning an error.
*/
int
breadn_flags(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablkno,
    int *rabsize, int cnt, struct ucred *cred, int flags, struct buf **bpp)
{
	struct buf *bp;
	int rv = 0, readwait = 0;

	CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size);
	/*
	 * Can only return NULL if GB_LOCK_NOWAIT flag is specified.
	 */
	*bpp = bp = getblk(vp, blkno, size, 0, 0, flags);
	if (bp == NULL)
		return (EBUSY);

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (!TD_IS_IDLETHREAD(curthread)) {
#ifdef RACCT
			if (racct_enable) {
				PROC_LOCK(curproc);
				racct_add_buf(curproc, bp, 0);
				PROC_UNLOCK(curproc);
			}
#endif /* RACCT */
			curthread->td_ru.ru_inblock++;
		}
		bp->b_iocmd = BIO_READ;
		bp->b_flags &= ~B_INVAL;
		bp->b_ioflags &= ~BIO_ERROR;
		if (bp->b_rcred == NOCRED && cred != NOCRED)
			bp->b_rcred = crhold(cred);
		vfs_busy_pages(bp, 0);
		bp->b_iooffset = dbtob(bp->b_blkno);
		bstrategy(bp);
		++readwait;
	}

	/* Start the read-aheads before waiting on the primary read. */
	breada(vp, rablkno, rabsize, cnt, cred);

	if (readwait) {
		rv = bufwait(bp);
		/* On error the buffer is released and *bpp is cleared. */
		if (rv != 0) {
			brelse(bp);
			*bpp = NULL;
		}
	}
	return (rv);
}

/*
 * Write, release buffer on completion.  (Done by iodone
 * if async).  Do not bother writing anything if the buffer
 * is invalid.
 *
 * Note that we set B_CACHE here, indicating that buffer is
 * fully valid and thus cacheable.  This is true even of NFS
 * now so we set it generally.  This could be set either here
 * or in biodone() since the I/O is synchronous.  We put it
 * here.
*/
/*
 * Review summary of the code below: bufwrite() marks bp clean, charges it
 * to runningbufspace and hands it to bstrategy().  Return values visible
 * here: ENXIO when the bufobj is flagged BO_DEAD, 0 when bp is B_INVAL
 * (released without I/O) or the write is B_ASYNC, otherwise the value
 * returned by bufwait().
 */
int
bufwrite(struct buf *bp)
{
	int oldflags;
	struct vnode *vp;
	long space;
	int vp_md;

	CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
	/* Dead bufobj: invalidate and release the buffer, then fail. */
	if ((bp->b_bufobj->bo_flag & BO_DEAD) != 0) {
		bp->b_flags |= B_INVAL | B_RELBUF;
		bp->b_flags &= ~B_CACHE;
		brelse(bp);
		return (ENXIO);
	}
	/* Invalid buffers are released without issuing any I/O. */
	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return (0);
	}

	if (bp->b_flags & B_BARRIER)
		barrierwrites++;

	/*
	 * Snapshot the flags now; B_ASYNC is tested against this copy
	 * after the buffer has been handed to bstrategy().
	 */
	oldflags = bp->b_flags;

	BUF_ASSERT_HELD(bp);

	KASSERT(!(bp->b_vflags & BV_BKGRDINPROG),
	    ("FFS background buffer should not get here %p", bp));

	/* Remember whether the vnode carries VV_MD; used for throttling below. */
	vp = bp->b_vp;
	if (vp)
		vp_md = vp->v_vflag & VV_MD;
	else
		vp_md = 0;

	/*
	 * Mark the buffer clean.  Increment the bufobj write count
	 * before bundirty() call, to prevent other thread from seeing
	 * empty dirty list and zero counter for writes in progress,
	 * falsely indicating that the bufobj is clean.
	 */
	bufobj_wref(bp->b_bufobj);
	bundirty(bp);

	bp->b_flags &= ~B_DONE;
	bp->b_ioflags &= ~BIO_ERROR;
	bp->b_flags |= B_CACHE;
	bp->b_iocmd = BIO_WRITE;

	vfs_busy_pages(bp, 1);

	/*
	 * Normal bwrites pipeline writes
	 */
	bp->b_runningbufspace = bp->b_bufsize;
	/* 'space' is the global counter's value before this buffer was added. */
	space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace);

	/* Per-process/thread accounting is skipped for the idle thread. */
	if (!TD_IS_IDLETHREAD(curthread)) {
#ifdef RACCT
		if (racct_enable) {
			PROC_LOCK(curproc);
			racct_add_buf(curproc, bp, 1);
			PROC_UNLOCK(curproc);
		}
#endif /* RACCT */
		curthread->td_ru.ru_oublock++;
	}
	if (oldflags & B_ASYNC)
		BUF_KERNPROC(bp);
	bp->b_iooffset = dbtob(bp->b_blkno);
+	buf_track(bp, __func__);
	bstrategy(bp);

	if ((oldflags & B_ASYNC) == 0) {
		/* Synchronous write: wait, release, and report the I/O result. */
		int rtval = bufwait(bp);
		brelse(bp);
		return (rtval);
	} else if (space > hirunningspace) {
		/*
		 * don't allow the async write to saturate the I/O
		 * system.  We will not deadlock here because
		 * we are blocking waiting for I/O that is already in-progress
		 * to complete. We do not block here if it is the update
		 * or syncer daemon trying to clean up as that can lead
		 * to deadlock.
		 */
		if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0 && !vp_md)
			waitrunningbufspace();
	}

	return (0);
}

/*
 * Relieve dirty-buffer pressure on bufobj 'bo' on behalf of the caller
 * that is about to dirty 'bp'.
 *
 * When the dirty count exceeds dirtybufthresh + 10 the whole vnode is
 * fsync'ed with MNT_NOWAIT.  When it merely exceeds dirtybufthresh, the
 * dirty list is scanned for one buffer that can be locked without
 * sleeping, has no background write in progress and no dependencies
 * (buf_countdeps() == 0), and that buffer is written asynchronously.
 */
void
bufbdflush(struct bufobj *bo, struct buf *bp)
{
	struct buf *nbp;

	if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) {
		(void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread);
		altbufferflushes++;
	} else if (bo->bo_dirty.bv_cnt > dirtybufthresh) {
		BO_LOCK(bo);
		/*
		 * Try to find a buffer to flush.
		 */
		TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) {
			if ((nbp->b_vflags & BV_BKGRDINPROG) ||
			    BUF_LOCK(nbp,
				     LK_EXCLUSIVE | LK_NOWAIT, NULL))
				continue;
			if (bp == nbp)
				panic("bdwrite: found ourselves");
			BO_UNLOCK(bo);
			/* Don't countdeps with the bo lock held. */
			if (buf_countdeps(nbp, 0)) {
				/* Retake the bo lock before resuming the scan. */
				BO_LOCK(bo);
				BUF_UNLOCK(nbp);
				continue;
			}
			if (nbp->b_flags & B_CLUSTEROK) {
				vfs_bio_awrite(nbp);
			} else {
				bremfree(nbp);
				bawrite(nbp);
			}
			dirtybufferflushes++;
			break;
		}
		/*
		 * nbp == NULL means the scan ran off the end of the list,
		 * in which case the bo lock is still held.  On the break
		 * path it was already dropped above.
		 */
		if (nbp == NULL)
			BO_UNLOCK(bo);
	}
}

/*
 * Delayed write. (Buffer is marked dirty).  Do not bother writing
 * anything if the buffer is marked invalid.
 *
 * Note that since the buffer must be completely valid, we can safely
 * set B_CACHE.  In fact, we have to set B_CACHE here rather than in
 * biodone() in order to prevent getblk from writing the buffer
 * out synchronously.
 */
void
bdwrite(struct buf *bp)
{
	struct thread *td = curthread;
	struct vnode *vp;
	struct bufobj *bo;

	CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
	KASSERT((bp->b_flags & B_BARRIER) == 0,
	    ("Barrier request in delayed write %p", bp));
	BUF_ASSERT_HELD(bp);

	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return;
	}

	/*
	 * If we have too many dirty buffers, don't create any more.
	 * If we are wildly over our limit, then force a complete
	 * cleanup. Otherwise, just keep the situation from getting
	 * out of control. Note that we have to avoid a recursive
	 * disaster and not try to clean up after our own cleanup!
	 */
	vp = bp->b_vp;
	bo = bp->b_bufobj;
	/*
	 * TDP_INBDFLUSH brackets the flush so that a nested bdwrite() on
	 * this thread takes the else branch and only bumps a counter.
	 */
	if ((td->td_pflags & (TDP_COWINPROGRESS|TDP_INBDFLUSH)) == 0) {
		td->td_pflags |= TDP_INBDFLUSH;
		BO_BDFLUSH(bo, bp);
		td->td_pflags &= ~TDP_INBDFLUSH;
	} else
		recursiveflushes++;

	bdirty(bp);
	/*
	 * Set B_CACHE, indicating that the buffer is fully valid.  This is
	 * true even of NFS now.
	 */
	bp->b_flags |= B_CACHE;

	/*
	 * This bmap keeps the system from needing to do the bmap later,
	 * perhaps when the system is attempting to do a sync.  Since it
	 * is likely that the indirect block -- or whatever other datastructure
	 * that the filesystem needs is still in memory now, it is a good
	 * thing to do this.  Note also, that if the pageout daemon is
	 * requesting a sync -- there might not be enough memory to do
	 * the bmap then...  So, this is important to do.
	 */
	/* b_lblkno == b_blkno is used here as the "not yet mapped" test. */
	if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) {
		VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
	}

+	buf_track(bp, __func__);
+
	/*
	 * Set the *dirty* buffer range based upon the VM system dirty
	 * pages.
	 *
	 * Mark the buffer pages as clean.  We need to do this here to
	 * satisfy the vnode_pager and the pageout daemon, so that it
	 * thinks that the pages have been "cleaned".  Note that since
	 * the pages are in a delayed write buffer -- the VFS layer
	 * "will" see that the pages get written out on the next sync,
	 * or perhaps the cluster will be completed.
	 */
	vfs_clean_pages_dirty_buf(bp);
	bqrelse(bp);

	/*
	 * note: we cannot initiate I/O from a bdwrite even if we wanted to,
	 * due to the softdep code.
	 */
}

/*
 *	bdirty:
 *
 *	Turn buffer into delayed write request.  We must clear BIO_READ and
 *	B_RELBUF, and we must set B_DELWRI.  We reassign the buffer to
 *	itself to properly update it in the dirty/clean lists.  We mark it
 *	B_DONE to ensure that any asynchronization of the buffer properly
 *	clears B_DONE ( else a panic will occur later ).
 *
 *	bdirty() is kinda like bdwrite() - we have to clear B_INVAL which
 *	might have been set pre-getblk().
Unlike bwrite/bdwrite, bdirty()
 *	should only be called if the buffer is known-good.
 *
 *	Since the buffer is not on a queue, we do not update the numfreebuffers
 *	count.
 *
 *	The buffer must be on QUEUE_NONE.
 */
void
bdirty(struct buf *bp)
{

	CTR3(KTR_BUF, "bdirty(%p) vp %p flags %X",
	    bp, bp->b_vp, bp->b_flags);
	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
	KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE,
	    ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex));
	BUF_ASSERT_HELD(bp);
	bp->b_flags &= ~(B_RELBUF);
	bp->b_iocmd = BIO_WRITE;

	/*
	 * Only a clean -> dirty transition moves the buffer between the
	 * bufobj lists and bumps the dirty count; an already-dirty buffer
	 * is left as is.  Note that B_DONE is NOT set here (see the XXX).
	 */
	if ((bp->b_flags & B_DELWRI) == 0) {
		bp->b_flags |= /* XXX B_DONE | */ B_DELWRI;
		reassignbuf(bp);
		bdirtyadd();
	}
}

/*
 *	bundirty:
 *
 *	Clear B_DELWRI for buffer.
 *
 *	Since the buffer is not on a queue, we do not update the numfreebuffers
 *	count.
 *
 *	The buffer must be on QUEUE_NONE.
 */

void
bundirty(struct buf *bp)
{

	CTR3(KTR_BUF, "bundirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
	KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE,
	    ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex));
	BUF_ASSERT_HELD(bp);

	/* Mirror image of bdirty(): act only on a dirty -> clean transition. */
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags &= ~B_DELWRI;
		reassignbuf(bp);
		bdirtysub();
	}
	/*
	 * Since it is now being written, we can clear its deferred write flag.
	 */
	bp->b_flags &= ~B_DEFERRED;
}

/*
 *	bawrite:
 *
 *	Asynchronous write.  Start output on a buffer, but do not wait for
 *	it to complete.  The buffer is released when the output completes.
 *
 *	bwrite() ( or the VOP routine anyway ) is responsible for handling
 *	B_INVAL buffers.  Not us.
 */
void
bawrite(struct buf *bp)
{

	bp->b_flags |= B_ASYNC;
	/* The return value of bwrite() is deliberately discarded. */
	(void) bwrite(bp);
}

/*
 *	babarrierwrite:
 *
 *	Asynchronous barrier write.  Start output on a buffer, but do not
 *	wait for it to complete.  Place a write barrier after this write so
 *	that this buffer and all buffers written before it are committed to
 *	the disk before any buffers written after this write are committed
 *	to the disk.
The buffer is released when the output completes.
 */
void
babarrierwrite(struct buf *bp)
{

	bp->b_flags |= B_ASYNC | B_BARRIER;
	/* The return value of bwrite() is deliberately discarded. */
	(void) bwrite(bp);
}

/*
 *	bbarrierwrite:
 *
 *	Synchronous barrier write.  Start output on a buffer and wait for
 *	it to complete.  Place a write barrier after this write so that
 *	this buffer and all buffers written before it are committed to
 *	the disk before any buffers written after this write are committed
 *	to the disk.  The buffer is released when the output completes.
 */
int
bbarrierwrite(struct buf *bp)
{

	bp->b_flags |= B_BARRIER;
	return (bwrite(bp));
}

/*
 *	bwillwrite:
 *
 *	Called prior to the locking of any vnodes when we are expecting to
 *	write.  We do not want to starve the buffer cache with too many
 *	dirty buffers so we block here.  By blocking prior to the locking
 *	of any vnodes we attempt to avoid the situation where a locked vnode
 *	prevents the various system daemons from flushing related buffers.
 */
void
bwillwrite(void)
{

	/*
	 * Unlocked pre-check first; the condition is re-tested under
	 * bdirtylock before each sleep.
	 */
	if (numdirtybuffers >= hidirtybuffers) {
		mtx_lock(&bdirtylock);
		while (numdirtybuffers >= hidirtybuffers) {
			bdirtywait = 1;
			msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4),
			    "flswai", 0);
		}
		mtx_unlock(&bdirtylock);
	}
}

/*
 * Return true if we have too many dirty buffers.
 */
int
buf_dirty_count_severe(void)
{

	return(numdirtybuffers >= hidirtybuffers);
}

/*
 *	brelse:
 *
 *	Release a busy buffer and, if requested, free its resources.  The
 *	buffer will be stashed in the appropriate bufqueue[] allowing it
 *	to be accessed later as a cache entity or reused for other purposes.
 */
void
brelse(struct buf *bp)
{
	int qindex;

	/*
	 * Many functions erroneously call brelse with a NULL bp under rare
	 * error conditions. Simply return when called with a NULL bp.
	 */
	if (bp == NULL)
		return;
	CTR3(KTR_BUF, "brelse(%p) vp %p flags %X",
	    bp, bp->b_vp, bp->b_flags);
	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
	    ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
	KASSERT((bp->b_flags & B_VMIO) != 0 || (bp->b_flags & B_NOREUSE) == 0,
	    ("brelse: non-VMIO buffer marked NOREUSE"));

	if (BUF_LOCKRECURSED(bp)) {
		/*
		 * Do not process, in particular, do not handle the
		 * B_INVAL/B_RELBUF and do not release to free list.
		 */
		BUF_UNLOCK(bp);
		return;
	}

	/* B_MANAGED buffers are handed to bqrelse() instead. */
	if (bp->b_flags & B_MANAGED) {
		bqrelse(bp);
		return;
	}

	/*
	 * BV_BKGRDERR set without BV_BKGRDINPROG: clear the error flag
	 * under the bufobj lock and mark the buffer dirty again.
	 */
	if ((bp->b_vflags & (BV_BKGRDINPROG | BV_BKGRDERR)) == BV_BKGRDERR) {
		BO_LOCK(bp->b_bufobj);
		bp->b_vflags &= ~BV_BKGRDERR;
		BO_UNLOCK(bp->b_bufobj);
		bdirty(bp);
	}
	if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) &&
	    !(bp->b_flags & B_INVAL)) {
		/*
		 * Failed write, redirty.  Must clear BIO_ERROR to prevent
		 * pages from being scrapped.
		 */
		bp->b_ioflags &= ~BIO_ERROR;
		bdirty(bp);
	} else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) ||
	    (bp->b_ioflags & BIO_ERROR) || (bp->b_bufsize <= 0)) {
		/*
		 * Either a failed read I/O or we were asked to free or not
		 * cache the buffer.
		 */
		bp->b_flags |= B_INVAL;
		if (!LIST_EMPTY(&bp->b_dep))
			buf_deallocate(bp);
		/* Keep the dirty count in step before B_DELWRI is cleared. */
		if (bp->b_flags & B_DELWRI)
			bdirtysub();
		bp->b_flags &= ~(B_DELWRI | B_CACHE);
		if ((bp->b_flags & B_VMIO) == 0) {
			allocbuf(bp, 0);
			if (bp->b_vp)
				brelvp(bp);
		}
	}

	/*
	 * We must clear B_RELBUF if B_DELWRI is set.  If vfs_vmio_truncate()
	 * is called with B_DELWRI set, the underlying pages may wind up
	 * getting freed causing a previous write (bdwrite()) to get 'lost'
	 * because pages associated with a B_DELWRI bp are marked clean.
	 *
	 * We still allow the B_INVAL case to call vfs_vmio_truncate(), even
	 * if B_DELWRI is set.
	 */
	if (bp->b_flags & B_DELWRI)
		bp->b_flags &= ~B_RELBUF;

	/*
	 * VMIO buffer rundown.  It is not very necessary to keep a VMIO buffer
	 * constituted, not even NFS buffers now.  Two flags affect this.  If
	 * B_INVAL, the struct buf is invalidated but the VM object is kept
	 * around ( i.e. so it is trivial to reconstitute the buffer later ).
	 *
	 * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be
	 * invalidated.  BIO_ERROR cannot be set for a failed write unless the
	 * buffer is also B_INVAL because it hits the re-dirtying code above.
	 *
	 * Normally we can do this whether a buffer is B_DELWRI or not.  If
	 * the buffer is an NFS buffer, it is tracking piecemeal writes or
	 * the commit state and we cannot afford to lose the buffer. If the
	 * buffer has a background write in progress, we need to keep it
	 * around to prevent it from being reconstituted and starting a second
	 * background write.
	 */
	/*
	 * The negated clause exempts dirty (B_DELWRI) buffers of non-disk
	 * vnodes whose mount has VFCF_NETWORK set.
	 */
	if ((bp->b_flags & B_VMIO) && (bp->b_flags & B_NOCACHE ||
	    (bp->b_ioflags & BIO_ERROR && bp->b_iocmd == BIO_READ)) &&
	    !(bp->b_vp->v_mount != NULL &&
	    (bp->b_vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 &&
	    !vn_isdisk(bp->b_vp, NULL) && (bp->b_flags & B_DELWRI))) {
		vfs_vmio_invalidate(bp);
		allocbuf(bp, 0);
	}

	/*
	 * Shrink the buffer to zero and detach it from its vnode when it
	 * is invalid or B_RELBUF, or when it is B_NOREUSE and not dirty.
	 */
	if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0 ||
	    (bp->b_flags & (B_DELWRI | B_NOREUSE)) == B_NOREUSE) {
		allocbuf(bp, 0);
		bp->b_flags &= ~B_NOREUSE;
		if (bp->b_vp != NULL)
			brelvp(bp);
	}

	/*
	 * If the buffer has junk contents signal it and eventually
	 * clean up B_DELWRI and disassociate the vnode so that gbincore()
	 * doesn't find it.
	 */
	if (bp->b_bufsize == 0 || (bp->b_ioflags & BIO_ERROR) != 0 ||
	    (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF)) != 0)
		bp->b_flags |= B_INVAL;
	if (bp->b_flags & B_INVAL) {
		if (bp->b_flags & B_DELWRI)
			bundirty(bp);
		if (bp->b_vp)
			brelvp(bp);
	}

+	buf_track(bp, __func__);
+
	/* buffers with no memory */
	if (bp->b_bufsize == 0) {
		/* Returns without reaching the BUF_UNLOCK() at the bottom. */
		buf_free(bp);
		return;
	}
	/* buffers with junk contents */
	if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) ||
	    (bp->b_ioflags & BIO_ERROR)) {
		bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
		if (bp->b_vflags & BV_BKGRDINPROG)
			panic("losing buffer 2");
		qindex = QUEUE_CLEAN;
		bp->b_flags |= B_AGE;
	/* remaining buffers */
	} else if (bp->b_flags & B_DELWRI)
		qindex = QUEUE_DIRTY;
	else
		qindex = QUEUE_CLEAN;

	binsfree(bp, qindex);

	/* These flags are cleared on every release that reaches this point. */
	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT);
	if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
		panic("brelse: not dirty");
	/* unlock */
	BUF_UNLOCK(bp);
	if (qindex == QUEUE_CLEAN)
		bufspace_wakeup();
}

/*
 * Release a buffer back to the appropriate queue but do not try to free
 * it.  The buffer is expected to be used again soon.
 *
 * bqrelse() is used by bdwrite() to requeue a delayed write, and used by
 * biodone() to requeue an async I/O on completion.  It is also used when
 * known good buffers need to be requeued but we think we may need the data
 * again soon.
 *
 * XXX we should be able to leave the B_RELBUF hint set on completion.
*/
void
bqrelse(struct buf *bp)
{
	int qindex;

	CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
	    ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));

	/*
	 * QUEUE_NONE is what the B_MANAGED path leaves in qindex when it
	 * jumps to 'out', so the wakeup test at the end is false for it.
	 */
	qindex = QUEUE_NONE;
	if (BUF_LOCKRECURSED(bp)) {
		/* do not release to free list */
		BUF_UNLOCK(bp);
		return;
	}
	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);

	/* Managed buffers are not requeued via binsfree(); just unlock them. */
	if (bp->b_flags & B_MANAGED) {
		if (bp->b_flags & B_REMFREE)
			bremfreef(bp);
		goto out;
	}

	/* buffers with stale but valid contents */
	if ((bp->b_flags & B_DELWRI) != 0 || (bp->b_vflags & (BV_BKGRDINPROG |
	    BV_BKGRDERR)) == BV_BKGRDERR) {
		/* b_vflags is modified under the bufobj lock. */
		BO_LOCK(bp->b_bufobj);
		bp->b_vflags &= ~BV_BKGRDERR;
		BO_UNLOCK(bp->b_bufobj);
		qindex = QUEUE_DIRTY;
	} else {
		if ((bp->b_flags & B_DELWRI) == 0 &&
		    (bp->b_xflags & BX_VNDIRTY))
			panic("bqrelse: not dirty");
		/* Clean B_NOREUSE buffers are handed to brelse() instead. */
		if ((bp->b_flags & B_NOREUSE) != 0) {
			brelse(bp);
			return;
		}
		qindex = QUEUE_CLEAN;
	}
	binsfree(bp, qindex);

out:
+	buf_track(bp, __func__);
	/* unlock */
	BUF_UNLOCK(bp);
	if (qindex == QUEUE_CLEAN)
		bufspace_wakeup();
}

/*
 * Complete I/O to a VMIO backed page.  Validate the pages as appropriate,
 * restore bogus pages.
*/
static void
vfs_vmio_iodone(struct buf *bp)
{
	vm_ooffset_t foff;
	vm_page_t m;
	vm_object_t obj;
	struct vnode *vp;
	int bogus, i, iosize;

	obj = bp->b_bufobj->bo_object;
	KASSERT(obj->paging_in_progress >= bp->b_npages,
	    ("vfs_vmio_iodone: paging in progress(%d) < b_npages(%d)",
	    obj->paging_in_progress, bp->b_npages));

	vp = bp->b_vp;
	KASSERT(vp->v_holdcnt > 0,
	    ("vfs_vmio_iodone: vnode %p has zero hold count", vp));
	KASSERT(vp->v_object != NULL,
	    ("vfs_vmio_iodone: vnode %p has no vm_object", vp));

	foff = bp->b_offset;
	KASSERT(bp->b_offset != NOOFFSET,
	    ("vfs_vmio_iodone: bp %p has no buffer offset", bp));

	bogus = 0;
	/* Bytes actually transferred: requested count minus the residual. */
	iosize = bp->b_bcount - bp->b_resid;
	VM_OBJECT_WLOCK(obj);
	for (i = 0; i < bp->b_npages; i++) {
		int resid;

		/*
		 * resid = bytes from foff to the next page boundary,
		 * clamped to what is left of the transfer.
		 */
		resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff;
		if (resid > iosize)
			resid = iosize;

		/*
		 * cleanup bogus pages, restoring the originals
		 */
		m = bp->b_pages[i];
		if (m == bogus_page) {
			bogus = 1;
			m = vm_page_lookup(obj, OFF_TO_IDX(foff));
			if (m == NULL)
				panic("biodone: page disappeared!");
			bp->b_pages[i] = m;
		} else if ((bp->b_iocmd == BIO_READ) && resid > 0) {
			/*
			 * In the write case, the valid and clean bits are
			 * already changed correctly ( see bdwrite() ), so we
			 * only need to do this here in the read case.
			 */
			KASSERT((m->dirty & vm_page_bits(foff & PAGE_MASK,
			    resid)) == 0, ("vfs_vmio_iodone: page %p "
			    "has unexpected dirty bits", m));
			vfs_page_set_valid(bp, foff, m);
		}
		KASSERT(OFF_TO_IDX(foff) == m->pindex,
		    ("vfs_vmio_iodone: foff(%jd)/pindex(%ju) mismatch",
		    (intmax_t)foff, (uintmax_t)m->pindex));

		vm_page_sunbusy(m);
		/* Advance to the start of the next page. */
		foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
		iosize -= resid;
	}
	vm_object_pip_wakeupn(obj, bp->b_npages);
	VM_OBJECT_WUNLOCK(obj);
	/*
	 * If any bogus page was swapped back in above and the buffer is
	 * mapped, re-enter the whole page array into the buffer's KVA.
	 */
	if (bogus && buf_mapped(bp)) {
		BUF_CHECK_MAPPED(bp);
		pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
		    bp->b_pages, bp->b_npages);
	}
}

/*
 * Unwire a page held by a buf and place it on the appropriate vm queue.
*/
static void
vfs_vmio_unwire(struct buf *bp, vm_page_t m)
{
	bool freed;

	vm_page_lock(m);
	/* The body runs only when vm_page_unwire() returns non-zero. */
	if (vm_page_unwire(m, PQ_NONE)) {
		/*
		 * Determine if the page should be freed before adding
		 * it to the inactive queue.
		 */
		if (m->valid == 0) {
			/* Fully invalid page: free it unless it is busied. */
			freed = !vm_page_busied(m);
			if (freed)
				vm_page_free(m);
		} else if ((bp->b_flags & B_DIRECT) != 0)
			freed = vm_page_try_to_free(m);
		else
			freed = false;
		if (!freed) {
			/*
			 * If the page is unlikely to be reused, let the
			 * VM know.  Otherwise, maintain LRU page
			 * ordering and put the page at the tail of the
			 * inactive queue.
			 */
			if ((bp->b_flags & B_NOREUSE) != 0)
				vm_page_deactivate_noreuse(m);
			else
				vm_page_deactivate(m);
		}
	}
	vm_page_unlock(m);
}

/*
 * Perform page invalidation when a buffer is released.  The fully invalid
 * pages will be reclaimed later in vfs_vmio_truncate().
 */
static void
vfs_vmio_invalidate(struct buf *bp)
{
	vm_object_t obj;
	vm_page_t m;
	int i, resid, poffset, presid;

	/* Drop the KVA mappings first if the buffer is mapped. */
	if (buf_mapped(bp)) {
		BUF_CHECK_MAPPED(bp);
		pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages);
	} else
		BUF_CHECK_UNMAPPED(bp);
	/*
	 * Get the base offset and length of the buffer.  Note that 
	 * in the VMIO case if the buffer block size is not
	 * page-aligned then b_data pointer may not be page-aligned.
	 * But our b_pages[] array *IS* page aligned.
	 *
	 * block sizes less than DEV_BSIZE (usually 512) are not 
	 * supported due to the page granularity bits (m->valid,
	 * m->dirty, etc...). 
	 *
	 * See man buf(9) for more information
	 */
	obj = bp->b_bufobj->bo_object;
	resid = bp->b_bufsize;
	poffset = bp->b_offset & PAGE_MASK;
	VM_OBJECT_WLOCK(obj);
	for (i = 0; i < bp->b_npages; i++) {
		m = bp->b_pages[i];
		if (m == bogus_page)
			panic("vfs_vmio_invalidate: Unexpected bogus page.");
		bp->b_pages[i] = NULL;

		/* presid = portion of the buffer that lies within this page. */
		presid = resid > (PAGE_SIZE - poffset) ?
		    (PAGE_SIZE - poffset) : resid;
		KASSERT(presid >= 0, ("brelse: extra page"));
		/*
		 * Wait out exclusive busy; the object lock is dropped
		 * around the sleep and retaken before re-testing.
		 */
		while (vm_page_xbusied(m)) {
			vm_page_lock(m);
			VM_OBJECT_WUNLOCK(obj);
			vm_page_busy_sleep(m, "mbncsh", true);
			VM_OBJECT_WLOCK(obj);
		}
		/* Pages that still have wired mappings are not invalidated. */
		if (pmap_page_wired_mappings(m) == 0)
			vm_page_set_invalid(m, poffset, presid);
		vfs_vmio_unwire(bp, m);
		resid -= presid;
		/* Only the first page can start at a non-zero in-page offset. */
		poffset = 0;
	}
	VM_OBJECT_WUNLOCK(obj);
	bp->b_npages = 0;
}

/*
 * Page-granular truncation of an existing VMIO buffer.
 */
static void
vfs_vmio_truncate(struct buf *bp, int desiredpages)
{
	vm_object_t obj;
	vm_page_t m;
	int i;

	if (bp->b_npages == desiredpages)
		return;

	/* Unmap only the tail pages, starting at index 'desiredpages'. */
	if (buf_mapped(bp)) {
		BUF_CHECK_MAPPED(bp);
		pmap_qremove((vm_offset_t)trunc_page((vm_offset_t)bp->b_data) +
		    (desiredpages << PAGE_SHIFT), bp->b_npages - desiredpages);
	} else
		BUF_CHECK_UNMAPPED(bp);
	/* Unlike the sibling routines, a NULL bo_object is tolerated here. */
	obj = bp->b_bufobj->bo_object;
	if (obj != NULL)
		VM_OBJECT_WLOCK(obj);
	for (i = desiredpages; i < bp->b_npages; i++) {
		m = bp->b_pages[i];
		KASSERT(m != bogus_page, ("allocbuf: bogus page found"));
		bp->b_pages[i] = NULL;
		vfs_vmio_unwire(bp, m);
	}
	if (obj != NULL)
		VM_OBJECT_WUNLOCK(obj);
	bp->b_npages = desiredpages;
}

/*
 * Byte granular extension of VMIO buffers.
 */
static void
vfs_vmio_extend(struct buf *bp, int desiredpages, int size)
{
	/*
	 * We are growing the buffer, possibly in a 
	 * byte-granular fashion.
	 */
	vm_object_t obj;
	vm_offset_t toff;
	vm_offset_t tinc;
	vm_page_t m;

	/*
	 * Step 1, bring in the VM pages from the object, allocating
	 * them if necessary.  We must clear B_CACHE if these pages
	 * are not valid for the range covered by the buffer.
	 */
	obj = bp->b_bufobj->bo_object;
	VM_OBJECT_WLOCK(obj);
	while (bp->b_npages < desiredpages) {
		/*
		 * We must allocate system pages since blocking
		 * here could interfere with paging I/O, no
		 * matter which process we are.
		 *
		 * Only exclusive busy can be tested here.
		 * Blocking on shared busy might lead to
		 * deadlocks once allocbuf() is called after
		 * pages are vfs_busy_pages().
		 */
		m = vm_page_grab(obj, OFF_TO_IDX(bp->b_offset) + bp->b_npages,
		    VM_ALLOC_NOBUSY | VM_ALLOC_SYSTEM |
		    VM_ALLOC_WIRED | VM_ALLOC_IGN_SBUSY |
		    VM_ALLOC_COUNT(desiredpages - bp->b_npages));
		if (m->valid == 0)
			bp->b_flags &= ~B_CACHE;
		bp->b_pages[bp->b_npages] = m;
		++bp->b_npages;
	}

	/*
	 * Step 2.  We've loaded the pages into the buffer,
	 * we have to figure out if we can still have B_CACHE
	 * set.  Note that B_CACHE is set according to the
	 * byte-granular range ( bcount and size ), not the
	 * aligned range ( newbsize ).
	 *
	 * The VM test is against m->valid, which is DEV_BSIZE
	 * aligned.  Needless to say, the validity of the data
	 * needs to also be DEV_BSIZE aligned.  Note that this
	 * fails with NFS if the server or some other client
	 * extends the file's EOF.  If our buffer is resized, 
	 * B_CACHE may remain set! XXX
	 */
	toff = bp->b_bcount;
	/* First step runs only to the next page boundary; later steps are whole pages. */
	tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK);
	while ((bp->b_flags & B_CACHE) && toff < size) {
		vm_pindex_t pi;

		if (tinc > (size - toff))
			tinc = size - toff;
		/* Index into b_pages[] of the page containing byte 'toff'. */
		pi = ((bp->b_offset & PAGE_MASK) + toff) >> PAGE_SHIFT;
		m = bp->b_pages[pi];
		vfs_buf_test_cache(bp, bp->b_offset, toff, tinc, m);
		toff += tinc;
		tinc = PAGE_SIZE;
	}
	VM_OBJECT_WUNLOCK(obj);

	/*
	 * Step 3, fixup the KVA pmap.
	 */
	if (buf_mapped(bp))
		bpmap_qenter(bp);
	else
		BUF_CHECK_UNMAPPED(bp);
}

/*
 * Check to see if a block at a particular lbn is available for a clustered
 * write.
*/
/*
 * Returns 1 when the buffer at 'lblkno' is in core, could be locked without
 * sleeping, is a valid B_DELWRI|B_CLUSTEROK buffer of exactly 'size' bytes,
 * and is already mapped to disk block 'blkno'; returns 0 otherwise.  The
 * buffer lock taken for the check is always dropped before returning.
 */
static int
vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno)
{
	struct buf *bpa;
	int match;

	match = 0;

	/* If the buf isn't in core skip it */
	if ((bpa = gbincore(&vp->v_bufobj, lblkno)) == NULL)
		return (0);

	/* If the buf is busy we don't want to wait for it */
	if (BUF_LOCK(bpa, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
		return (0);

	/* Only cluster with valid clusterable delayed write buffers */
	if ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) !=
	    (B_DELWRI | B_CLUSTEROK))
		goto done;

	if (bpa->b_bufsize != size)
		goto done;

	/*
	 * Check to see if it is in the expected place on disk and that the
	 * block has been mapped.
	 */
	if ((bpa->b_blkno != bpa->b_lblkno) && (bpa->b_blkno == blkno))
		match = 1;
done:
	BUF_UNLOCK(bpa);
	return (match);
}

/*
 *	vfs_bio_awrite:
 *
 *	Implement clustered async writes for clearing out B_DELWRI buffers.
 *	This is much better than the old way of writing only one buffer at
 *	a time.  Note that we may not be presented with the buffers in the 
 *	correct order, so we search for the cluster in both directions.
 */
int
vfs_bio_awrite(struct buf *bp)
{
	struct bufobj *bo;
	int i;
	int j;
	daddr_t lblkno = bp->b_lblkno;
	struct vnode *vp = bp->b_vp;
	int ncl;
	int nwritten;
	int size;
	int maxcl;
	int gbflags;

	bo = &vp->v_bufobj;
	/* Propagate the buffer's unmapped state to cluster_wbuild(). */
	gbflags = (bp->b_data == unmapped_buf) ? GB_UNMAPPED : 0;
	/*
	 * right now we support clustered writing only to regular files.  If
	 * we find a clusterable block we could be in the middle of a cluster
	 * rather than at the beginning.
	 */
	if ((vp->v_type == VREG) && 
	    (vp->v_mount != 0) && /* Only on nodes that have the size info */
	    (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {

		size = vp->v_mount->mnt_stat.f_iosize;
		maxcl = MAXPHYS / size;

		BO_RLOCK(bo);
		/* Forward scan: i ends as the count of blocks from bp onward. */
		for (i = 1; i < maxcl; i++)
			if (vfs_bio_clcheck(vp, size, lblkno + i,
			    bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0)
				break;

		/* Backward scan, bounded by maxcl and by logical block 0. */
		for (j = 1; i + j <= maxcl && j <= lblkno; j++) 
			if (vfs_bio_clcheck(vp, size, lblkno - j,
			    bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0)
				break;
		BO_RUNLOCK(bo);
		/* j overshoots by one; after this it counts blocks before bp. */
		--j;
		ncl = i + j;
		/*
		 * this is a possible cluster write
		 */
		if (ncl != 1) {
			/* bp is unlocked here and the write is left to cluster_wbuild(). */
			BUF_UNLOCK(bp);
			nwritten = cluster_wbuild(vp, size, lblkno - j, ncl,
			    gbflags);
			return (nwritten);
		}
	}
	bremfree(bp);
	bp->b_flags |= B_ASYNC;
	/*
	 * default (old) behavior, writing out only one block
	 *
	 * XXX returns b_bufsize instead of b_bcount for nwritten?
	 */
	/* Read the size before bwrite(): bp is not touched afterwards. */
	nwritten = bp->b_bufsize;
	(void) bwrite(bp);

	return (nwritten);
}

/*
 *	getnewbuf_kva:
 *
 *	Allocate KVA for an empty buf header according to gbflags.
 */
static int
getnewbuf_kva(struct buf *bp, int gbflags, int maxsize)
{

	/*
	 * KVA is allocated unless the request is GB_UNMAPPED without
	 * GB_KVAALLOC.  Returns ENOSPC if bufkva_alloc() fails, else 0.
	 */
	if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_UNMAPPED) {
		/*
		 * In order to keep fragmentation sane we only allocate kva
		 * in BKVASIZE chunks.  XXX with vmem we can do page size.
		 */
		maxsize = (maxsize + BKVAMASK) & ~BKVAMASK;

		/* Skip the allocation when the existing KVA is already this size. */
		if (maxsize != bp->b_kvasize &&
		    bufkva_alloc(bp, maxsize, gbflags))
			return (ENOSPC);
	}
	return (0);
}

/*
 *	getnewbuf:
 *
 *	Find and initialize a new buffer header, freeing up existing buffers
 *	in the bufqueues as necessary.  The new buffer is returned locked.
 *
 *	We block if:
 *		We have insufficient buffer headers
 *		We have insufficient buffer space
 *		buffer_arena is too fragmented ( space reservation fails )
 *		If we have to flush dirty buffers ( but we try to avoid this )
 *
 *	The caller is responsible for releasing the reserved bufspace after
 *	allocbuf() is called.
*/
static struct buf *
getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int maxsize, int gbflags)
{
	struct buf *bp;
	bool metadata, reserved;

	bp = NULL;
	KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
	    ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
	if (!unmapped_buf_allowed)
		gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC);

	/*
	 * 'metadata' is passed to bufspace_reserve(); it is true for
	 * vnode-less requests, VV_MD/VV_SYSTEM vnodes and character devices.
	 */
	if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 ||
	    vp->v_type == VCHR)
		metadata = true;
	else
		metadata = false;
	atomic_add_int(&getnewbufcalls, 1);
	reserved = false;
	/*
	 * Each 'continue' jumps to the buf_scan() test at the bottom, so a
	 * failed reservation or header allocation is retried only while
	 * buf_scan(false) returns 0.  The 'break' is taken when KVA
	 * allocation fails.
	 */
	do {
		if (reserved == false &&
		    bufspace_reserve(maxsize, metadata) != 0)
			continue;
		reserved = true;
		if ((bp = buf_alloc()) == NULL)
			continue;
		if (getnewbuf_kva(bp, gbflags, maxsize) == 0)
			return (bp);
		break;
	} while(buf_scan(false) == 0);

	/* Failure path: undo the reservation and give back any header. */
	if (reserved)
		atomic_subtract_long(&bufspace, maxsize);
	if (bp != NULL) {
		bp->b_flags |= B_INVAL;
		brelse(bp);
	}
	bufspace_wait(vp, gbflags, slpflag, slptimeo);

	return (NULL);
}

/*
 *	buf_daemon:
 *
 *	buffer flushing daemon.  Buffers are normally flushed by the
 *	update daemon but if it cannot keep up this process starts to
 *	take the load in an attempt to prevent getnewbuf() from blocking.
 */
static struct kproc_desc buf_kp = {
	"bufdaemon",
	buf_daemon,
	&bufdaemonproc
};
SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp);

/*
 * Flush up to 'target' dirty buffers, first skipping buffers with rollback
 * dependencies and, if that pass flushed nothing, retrying with them allowed.
 *
 * NOTE(review): the return value is the count from the first pass only; the
 * result of the second flushbufqueues() call is discarded, so this returns 0
 * whenever the second pass runs.  Confirm this is what callers expect.
 */
static int
buf_flush(struct vnode *vp, int target)
{
	int flushed;

	flushed = flushbufqueues(vp, target, 0);
	if (flushed == 0) {
		/*
		 * Could not find any buffers without rollback
		 * dependencies, so just write the first one
		 * in the hopes of eventually making progress.
		 */
		/* Per-vnode requests retry with half the target. */
		if (vp != NULL && target > 2)
			target /= 2;
		flushbufqueues(vp, target, 1);
	}
	return (flushed);
}

/*
 * NOTE(review): defined with an empty parameter list rather than (void).
 */
static void
buf_daemon()
{
	int lodirty;

	/*
	 * This process needs to be suspended prior to shutdown sync.
	 */
	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, bufdaemonproc,
	    SHUTDOWN_PRI_LAST);

	/*
	 * This process is allowed to take the buffer cache to the limit
	 */
	curthread->td_pflags |= TDP_NORUNNINGBUF | TDP_BUFNEED;
	mtx_lock(&bdlock);
	/* bdlock is held at the top of every iteration and across each msleep(). */
	for (;;) {
		bd_request = 0;
		mtx_unlock(&bdlock);

		kproc_suspend_check(bufdaemonproc);
		lodirty = lodirtybuffers;
		/* A speed-up request lowers the goal to half the current dirty count. */
		if (bd_speedupreq) {
			lodirty = numdirtybuffers / 2;
			bd_speedupreq = 0;
		}
		/*
		 * Do the flush.  Limit the amount of in-transit I/O we
		 * allow to build up, otherwise we would completely saturate
		 * the I/O system.
		 */
		while (numdirtybuffers > lodirty) {
			if (buf_flush(NULL, numdirtybuffers - lodirty) == 0)
				break;
			kern_yield(PRI_USER);
		}

		/*
		 * Only clear bd_request if we have reached our low water
		 * mark.  The buf_daemon normally waits 1 second and
		 * then incrementally flushes any dirty buffers that have
		 * built up, within reason.
		 *
		 * If we were unable to hit our low water mark and couldn't
		 * find any flushable buffers, we sleep for a short period
		 * to avoid endless loops on unlockable buffers.
		 */
		mtx_lock(&bdlock);
		/* Compared against lodirtybuffers, not the local 'lodirty'. */
		if (numdirtybuffers <= lodirtybuffers) {
			/*
			 * We reached our low water mark, reset the
			 * request and sleep until we are needed again.
			 * The sleep is just so the suspend code works.
			 */
			bd_request = 0;
			/*
			 * Do an extra wakeup in case dirty threshold
			 * changed via sysctl and the explicit transition
			 * out of shortfall was missed.
			 */
			bdirtywakeup();
			if (runningbufspace <= lorunningspace)
				runningwakeup();
			msleep(&bd_request, &bdlock, PVM, "psleep", hz);
		} else {
			/*
			 * We couldn't find any flushable dirty buffers but
			 * still have too many dirty buffers, we
			 * have to sleep and try again.  (rare)
			 */
			msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10);
		}
	}
}

/*
 *	flushbufqueues:
 *
 *	Try to flush a buffer in the dirty queue.  We must be careful to
 *	free up B_INVAL buffers instead of write them, which NFS is
 *	particularly sensitive to.
*/
/* Running total of flushed buffers that had dependencies; exported as vfs.flushwithdeps. */
static int flushwithdeps = 0;
SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps,
    0, "Number of buffers flushed with dependecies that require rollbacks");

/*
 * Parameters as used below:
 *	lvp       - if non-NULL, only buffers of this vnode are considered
 *	target    - the scan stops once this many buffers were flushed
 *	flushdeps - if 0, buffers with dependencies (buf_countdeps()) are
 *	            skipped
 * Returns the number of buffers flushed, which includes B_INVAL buffers
 * that were released rather than written.
 */
static int
flushbufqueues(struct vnode *lvp, int target, int flushdeps)
{
	struct buf *sentinel;
	struct vnode *vp;
	struct mount *mp;
	struct buf *bp;
	int hasdeps;
	int flushed;
	int queue;
	int error;
	bool unlock;

	flushed = 0;
	queue = QUEUE_DIRTY;
	bp = NULL;
	/*
	 * A zeroed dummy buf tagged QUEUE_SENTINEL is linked at the head
	 * of the dirty queue; it is moved past each buffer examined and
	 * serves as the scan cursor.
	 */
	sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO);
	sentinel->b_qindex = QUEUE_SENTINEL;
	mtx_lock(&bqlocks[queue]);
	TAILQ_INSERT_HEAD(&bufqueues[queue], sentinel, b_freelist);
	mtx_unlock(&bqlocks[queue]);
	while (flushed != target) {
		maybe_yield();
		mtx_lock(&bqlocks[queue]);
		bp = TAILQ_NEXT(sentinel, b_freelist);
		if (bp != NULL) {
			/* Step the sentinel over bp. */
			TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
			TAILQ_INSERT_AFTER(&bufqueues[queue], bp, sentinel,
			    b_freelist);
		} else {
			/* Sentinel reached the tail: scan finished. */
			mtx_unlock(&bqlocks[queue]);
			break;
		}
		/*
		 * Skip sentinels inserted by other invocations of the
		 * flushbufqueues(), taking care to not reorder them.
		 *
		 * Only flush the buffers that belong to the
		 * vnode locked by the curthread.
		 */
		if (bp->b_qindex == QUEUE_SENTINEL || (lvp != NULL &&
		    bp->b_vp != lvp)) {
			mtx_unlock(&bqlocks[queue]);
			continue;
		}
		/* Try-lock the buffer while the queue lock still pins it. */
		error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL);
		mtx_unlock(&bqlocks[queue]);
		if (error != 0)
			continue;

		/*
		 * BKGRDINPROG can only be set with the buf and bufobj
		 * locks both held.  We tolerate a race to clear it here.
		 */
		if ((bp->b_vflags & BV_BKGRDINPROG) != 0 ||
		    (bp->b_flags & B_DELWRI) == 0) {
			BUF_UNLOCK(bp);
			continue;
		}
		/* Invalid buffers are released, not written, and count as flushed. */
		if (bp->b_flags & B_INVAL) {
			bremfreef(bp);
			brelse(bp);
			flushed++;
			continue;
		}

		if (!LIST_EMPTY(&bp->b_dep) && buf_countdeps(bp, 0)) {
			if (flushdeps == 0) {
				BUF_UNLOCK(bp);
				continue;
			}
			hasdeps = 1;
		} else
			hasdeps = 0;
		/*
		 * We must hold the lock on a vnode before writing
		 * one of its buffers. Otherwise we may confuse, or
		 * in the case of a snapshot vnode, deadlock the
		 * system.
		 *
		 * The lock order here is the reverse of the normal
		 * of vnode followed by buf lock.  This is ok because
		 * the NOWAIT will prevent deadlock.
		 */
		vp = bp->b_vp;
		if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
			BUF_UNLOCK(bp);
			continue;
		}
		if (lvp == NULL) {
			unlock = true;
			error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT);
		} else {
			/*
			 * Caller-supplied vnode is asserted to be locked;
			 * upgrade it if not already exclusive.  It is never
			 * unlocked here.
			 */
			ASSERT_VOP_LOCKED(vp, "getbuf");
			unlock = false;
			error = VOP_ISLOCKED(vp) == LK_EXCLUSIVE ? 0 :
			    vn_lock(vp, LK_TRYUPGRADE);
		}
		if (error == 0) {
			CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X",
			    bp, bp->b_vp, bp->b_flags);
			/* Only the buf daemon uses the clustering write path. */
			if (curproc == bufdaemonproc) {
				vfs_bio_awrite(bp);
			} else {
				bremfree(bp);
				bwrite(bp);
				notbufdflushes++;
			}
			vn_finished_write(mp);
			if (unlock)
				VOP_UNLOCK(vp, 0);
			flushwithdeps += hasdeps;
			flushed++;

			/*
			 * Sleeping on runningbufspace while holding
			 * vnode lock leads to deadlock.
			 */
			if (curproc == bufdaemonproc &&
			    runningbufspace > hirunningspace)
				waitrunningbufspace();
			continue;
		}
		/* Vnode lock not obtained: back out and move on. */
		vn_finished_write(mp);
		BUF_UNLOCK(bp);
	}
	/* Unlink and free the sentinel before returning. */
	mtx_lock(&bqlocks[queue]);
	TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
	mtx_unlock(&bqlocks[queue]);
	free(sentinel, M_TEMP);
	return (flushed);
}

/*
 * Check to see if a block is currently memory resident.
 */
struct buf *
incore(struct bufobj *bo, daddr_t blkno)
{
	struct buf *bp;

	/* The lookup is done under the bufobj read lock; bp is returned unlocked. */
	BO_RLOCK(bo);
	bp = gbincore(bo, blkno);
	BO_RUNLOCK(bo);
	return (bp);
}

/*
 * Returns true if no I/O is needed to access the
 * associated VM object.  This is like incore except
 * it also hunts around in the VM system for the data.
*/
static int
inmem(struct vnode * vp, daddr_t blkno)
{
	vm_object_t object;
	vm_offset_t pos, step, chunk;
	vm_page_t pg;
	vm_ooffset_t base;
	int resident;

	ASSERT_VOP_LOCKED(vp, "inmem");

	/* A buffer already in the cache answers the question outright. */
	if (incore(&vp->v_bufobj, blkno))
		return (1);
	/* Without a mount there is no block size to work with. */
	if (vp->v_mount == NULL)
		return (0);
	object = vp->v_object;
	if (object == NULL)
		return (0);

	/* Probe in units of min(PAGE_SIZE, filesystem I/O size). */
	chunk = PAGE_SIZE;
	if (chunk > vp->v_mount->mnt_stat.f_iosize)
		chunk = vp->v_mount->mnt_stat.f_iosize;
	base = (vm_ooffset_t)blkno *
	    (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize;

	/*
	 * Walk the block one piece at a time under the object read lock.
	 * The first missing page or invalid range settles the answer.
	 */
	resident = 1;
	VM_OBJECT_RLOCK(object);
	for (pos = 0; pos < vp->v_mount->mnt_stat.f_iosize; pos += step) {
		pg = vm_page_lookup(object, OFF_TO_IDX(base + pos));
		if (pg == NULL) {
			resident = 0;
			break;
		}
		/* Never let a single probe cross a page boundary. */
		step = chunk;
		if (step > PAGE_SIZE - ((pos + base) & PAGE_MASK))
			step = PAGE_SIZE - ((pos + base) & PAGE_MASK);
		if (vm_page_is_valid(pg,
		    (vm_offset_t) ((pos + base) & PAGE_MASK), step) == 0) {
			resident = 0;
			break;
		}
	}
	VM_OBJECT_RUNLOCK(object);
	return (resident);
}

/*
 * Set the dirty range for a buffer based on the status of the dirty
 * bits in the pages comprising the buffer.  The range is limited
 * to the size of the buffer.
 *
 * Tell the VM system that the pages associated with this buffer
 * are clean.  This is used for delayed writes where the data is
 * going to go to disk eventually without additional VM intevention.
 *
 * Note that while we only really need to clean through to b_bcount, we
 * just go ahead and clean through to b_bufsize.
*/
static void
vfs_clean_pages_dirty_buf(struct buf *bp)
{
	vm_ooffset_t foff, noff, eoff;
	vm_page_t m;
	int i;

	/* Nothing to do for non-VMIO or zero-sized buffers. */
	if ((bp->b_flags & B_VMIO) == 0 || bp->b_bufsize == 0)
		return;

	foff = bp->b_offset;
	KASSERT(bp->b_offset != NOOFFSET,
	    ("vfs_clean_pages_dirty_buf: no buffer offset"));

	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
	vfs_drain_busy_pages(bp);
	/* Fold the pages' dirty state into b_dirtyoff/b_dirtyend first. */
	vfs_setdirty_locked_object(bp);
	for (i = 0; i < bp->b_npages; i++) {
		/* noff = start of the next page. */
		noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
		/*
		 * eoff is clamped to the end of the buffer; its only
		 * consumer is the commented-out call below.
		 */
		eoff = noff;
		if (eoff > bp->b_offset + bp->b_bufsize)
			eoff = bp->b_offset + bp->b_bufsize;
		m = bp->b_pages[i];
		vfs_page_set_validclean(bp, foff, m);
		/* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */
		foff = noff;
	}
	VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
}

/*
 * Widen bp's dirty range (b_dirtyoff/b_dirtyend) to cover any pages the VM
 * system reports as dirty.  The VM object must be write-locked by the
 * caller, as asserted below.
 */
static void
vfs_setdirty_locked_object(struct buf *bp)
{
	vm_object_t object;
	int i;

	object = bp->b_bufobj->bo_object;
	VM_OBJECT_ASSERT_WLOCKED(object);

	/*
	 * We qualify the scan for modified pages on whether the
	 * object has been flushed yet.
	 */
	if ((object->flags & OBJ_MIGHTBEDIRTY) != 0) {
		vm_offset_t boffset;
		vm_offset_t eoffset;

		/*
		 * test the pages to see if they have been modified directly
		 * by users through the VM system.
		 */
		for (i = 0; i < bp->b_npages; i++)
			vm_page_test_dirty(bp->b_pages[i]);

		/*
		 * Calculate the encompassing dirty range, boffset and eoffset,
		 * (eoffset - boffset) bytes.
		 */

		/* First dirty page; i == b_npages if there is none. */
		for (i = 0; i < bp->b_npages; i++) {
			if (bp->b_pages[i]->dirty)
				break;
		}
		boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);

		/* Last dirty page; i == -1 if there is none. */
		for (i = bp->b_npages - 1; i >= 0; --i) {
			if (bp->b_pages[i]->dirty) {
				break;
			}
		}
		eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);

		/*
		 * Fit it to the buffer.
		 */

		if (eoffset > bp->b_bcount)
			eoffset = bp->b_bcount;

		/*
		 * If we have a good dirty range, merge with the existing
		 * dirty range.
		 */

		if (boffset < eoffset) {
			if (bp->b_dirtyoff > boffset)
				bp->b_dirtyoff = boffset;
			if (bp->b_dirtyend < eoffset)
				bp->b_dirtyend = eoffset;
		}
	}
}

/*
 * Allocate the KVA mapping for an existing buffer.
* If an unmapped buffer is provided but a mapped buffer is requested, take
 * also care to properly setup mappings between pages and KVA.
 *
 * bp is the buffer to fix up, blkno and size describe the request (used
 * only to size the KVA reservation) and gbflags are the caller's GB_*
 * flags.  The function returns without doing anything when neither a
 * mapping nor a KVA reservation is needed.
 */
static void
bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags)
{
	int bsize, maxsize, need_mapping, need_kva;
	off_t offset;

	/* The buffer has no data mapping but the caller wants one. */
	need_mapping = bp->b_data == unmapped_buf &&
	    (gbflags & GB_UNMAPPED) == 0;
	/* The buffer has no KVA at all but the caller asked for some. */
	need_kva = bp->b_kvabase == unmapped_buf &&
	    bp->b_data == unmapped_buf &&
	    (gbflags & GB_KVAALLOC) != 0;
	if (!need_mapping && !need_kva)
		return;

	BUF_CHECK_UNMAPPED(bp);

	if (need_mapping && bp->b_kvabase != unmapped_buf) {
		/*
		 * Buffer is not mapped, but the KVA was already
		 * reserved at the time of the instantiation.  Use the
		 * allocated space.
		 */
		goto has_addr;
	}

	/*
	 * Calculate the amount of the address space we would reserve
	 * if the buffer was mapped.
	 */
	bsize = vn_isdisk(bp->b_vp, NULL) ? DEV_BSIZE : bp->b_bufobj->bo_bsize;
	KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize"));
	offset = blkno * bsize;
	maxsize = size + (offset & PAGE_MASK);
	maxsize = imax(maxsize, bsize);

	/* Retry the KVA allocation until it succeeds, waiting in between. */
	while (bufkva_alloc(bp, maxsize, gbflags) != 0) {
		if ((gbflags & GB_NOWAIT_BD) != 0) {
			/*
			 * XXXKIB: defragmentation cannot
			 * succeed, not sure what else to do.
			 */
			panic("GB_NOWAIT_BD and GB_UNMAPPED %p", bp);
		}
		atomic_add_int(&mappingrestarts, 1);
		bufspace_wait(bp->b_vp, gbflags, 0, 0);
	}
has_addr:
	if (need_mapping) {
		/* b_offset is handled by bpmap_qenter. */
		bp->b_data = bp->b_kvabase;
		BUF_CHECK_MAPPED(bp);
		bpmap_qenter(bp);
	}
}

/*
 *	getblk:
 *
 *	Get a block given a specified block and offset into a file/device.
 *	The buffer's B_DONE bit will be cleared on return, making it almost
 *	ready for an I/O initiation.  B_INVAL may or may not be set on
 *	return.  The caller should clear B_INVAL prior to initiating a
 *	READ.
 *
 *	For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for
 *	an existing buffer.
 *
 *	For a VMIO buffer, B_CACHE is modified according to the backing VM.
* If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set
 *	and then cleared based on the backing VM.  If the previous buffer is
 *	non-0-sized but invalid, B_CACHE will be cleared.
 *
 *	If getblk() must create a new buffer, the new buffer is returned with
 *	both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which
 *	case it is returned with B_INVAL clear and B_CACHE set based on the
 *	backing VM.
 *
 *	getblk() also forces a bwrite() for any B_DELWRI buffer whose
 *	B_CACHE bit is clear.
 *
 *	What this means, basically, is that the caller should use B_CACHE to
 *	determine whether the buffer is fully valid or not and should clear
 *	B_INVAL prior to issuing a read.  If the caller intends to validate
 *	the buffer by loading its data area with something, the caller needs
 *	to clear B_INVAL.  If the caller does this without issuing an I/O,
 *	the caller should set B_CACHE ( as an optimization ), else the caller
 *	should issue the I/O and biodone() will set B_CACHE if the I/O was
 *	a write attempt or if it was a successful read.  If the caller
 *	intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR
 *	prior to issuing the READ.  biodone() will *not* clear B_INVAL.
 *
 *	Returns NULL when locking an existing buffer times out or is
 *	interrupted, when GB_NOCREAT is set and the buffer is not in-core,
 *	when no free buffers exist and the caller is the idle thread, or
 *	when getnewbuf() fails and slpflag or slptimeo is non-zero.
 */
struct buf *
getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo,
    int flags)
{
	struct buf *bp;
	struct bufobj *bo;
	int bsize, error, maxsize, vmio;
	off_t offset;

	CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size);
	KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
	    ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
	ASSERT_VOP_LOCKED(vp, "getblk");
	if (size > MAXBCACHEBUF)
		panic("getblk: size(%d) > MAXBCACHEBUF(%d)\n", size,
		    MAXBCACHEBUF);
	/* Unmapped requests silently degrade when unmapped I/O is disabled. */
	if (!unmapped_buf_allowed)
		flags &= ~(GB_UNMAPPED | GB_KVAALLOC);

	bo = &vp->v_bufobj;
loop:
	BO_RLOCK(bo);
	bp = gbincore(bo, blkno);
	if (bp != NULL) {
		int lockflags;
		/*
		 * Buffer is in-core.  If the buffer is not busy nor managed,
		 * it must be on a queue.
		 */
		lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK;

		if (flags & GB_LOCK_NOWAIT)
			lockflags |= LK_NOWAIT;

		error = BUF_TIMELOCK(bp, lockflags,
		    BO_LOCKPTR(bo), "getblk", slpflag, slptimeo);

		/*
		 * If we slept and got the lock we have to restart in case
		 * the buffer changed identities.
		 */
		if (error == ENOLCK)
			goto loop;
		/* We timed out or were interrupted. */
		else if (error)
			return (NULL);
		/* If recursed, assume caller knows the rules. */
		else if (BUF_LOCKRECURSED(bp))
			goto end;

		/*
		 * The buffer is locked.  B_CACHE is cleared if the buffer is
		 * invalid.  Otherwise, for a non-VMIO buffer, B_CACHE is set
		 * and for a VMIO buffer B_CACHE is adjusted according to the
		 * backing VM cache.
		 */
		if (bp->b_flags & B_INVAL)
			bp->b_flags &= ~B_CACHE;
		else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0)
			bp->b_flags |= B_CACHE;
		if (bp->b_flags & B_MANAGED)
			MPASS(bp->b_qindex == QUEUE_NONE);
		else
			bremfree(bp);

		/*
		 * check for size inconsistencies for non-VMIO case.
		 */
		if (bp->b_bcount != size) {
			if ((bp->b_flags & B_VMIO) == 0 ||
			    (size > bp->b_kvasize)) {
				/*
				 * The buffer cannot be resized in place:
				 * get rid of it (writing it out first if
				 * needed) and start over.
				 */
				if (bp->b_flags & B_DELWRI) {
					bp->b_flags |= B_NOCACHE;
					bwrite(bp);
				} else {
					if (LIST_EMPTY(&bp->b_dep)) {
						bp->b_flags |= B_RELBUF;
						brelse(bp);
					} else {
						bp->b_flags |= B_NOCACHE;
						bwrite(bp);
					}
				}
				goto loop;
			}
		}

		/*
		 * Handle the case of unmapped buffer which should
		 * become mapped, or the buffer for which KVA
		 * reservation is requested.
		 */
		bp_unmapped_get_kva(bp, blkno, size, flags);

		/*
		 * If the size is inconsistent in the VMIO case, we can resize
		 * the buffer.  This might lead to B_CACHE getting set or
		 * cleared.  If the size has not changed, B_CACHE remains
		 * unchanged from its previous state.
		 */
		allocbuf(bp, size);

		KASSERT(bp->b_offset != NOOFFSET,
		    ("getblk: no buffer offset"));

		/*
		 * A buffer with B_DELWRI set and B_CACHE clear must
		 * be committed before we can return the buffer in
		 * order to prevent the caller from issuing a read
		 * ( due to B_CACHE not being set ) and overwriting
		 * it.
		 *
		 * Most callers, including NFS and FFS, need this to
		 * operate properly either because they assume they
		 * can issue a read if B_CACHE is not set, or because
		 * ( for example ) an uncached B_DELWRI might loop due
		 * to softupdates re-dirtying the buffer.  In the latter
		 * case, B_CACHE is set after the first write completes,
		 * preventing further loops.
		 * NOTE!  b*write() sets B_CACHE.  If we cleared B_CACHE
		 * above while extending the buffer, we cannot allow the
		 * buffer to remain with B_CACHE set after the write
		 * completes or it will represent a corrupt state.  To
		 * deal with this we set B_NOCACHE to scrap the buffer
		 * after the write.
		 *
		 * We might be able to do something fancy, like setting
		 * B_CACHE in bwrite() except if B_DELWRI is already set,
		 * so the below call doesn't set B_CACHE, but that gets real
		 * confusing.  This is much easier.
		 */

		if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
			bp->b_flags |= B_NOCACHE;
			bwrite(bp);
			goto loop;
		}
		bp->b_flags &= ~B_DONE;
	} else {
		/*
		 * Buffer is not in-core, create new buffer.  The buffer
		 * returned by getnewbuf() is locked.  Note that the returned
		 * buffer is also considered valid (not marked B_INVAL).
		 */
		BO_RUNLOCK(bo);
		/*
		 * If the user does not want us to create the buffer, bail out
		 * here.
		 */
		if (flags & GB_NOCREAT)
			return NULL;
		if (numfreebuffers == 0 && TD_IS_IDLETHREAD(curthread))
			return NULL;

		bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize;
		KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize"));
		offset = blkno * bsize;
		vmio = vp->v_object != NULL;
		if (vmio) {
			maxsize = size + (offset & PAGE_MASK);
		} else {
			maxsize = size;
			/* Do not allow non-VMIO notmapped buffers. */
			flags &= ~(GB_UNMAPPED | GB_KVAALLOC);
		}
		maxsize = imax(maxsize, bsize);

		bp = getnewbuf(vp, slpflag, slptimeo, maxsize, flags);
		if (bp == NULL) {
			if (slpflag || slptimeo)
				return NULL;
			/*
			 * XXX This is here until the sleep path is diagnosed
			 * enough to work under very low memory conditions.
			 *
			 * There's an issue on low memory, 4BSD+non-preempt
			 * systems (eg MIPS routers with 32MB RAM) where buffer
			 * exhaustion occurs without sleeping for buffer
			 * reclamation.  This just sticks in a loop and
			 * constantly attempts to allocate a buffer, which
			 * hits exhaustion and tries to wakeup bufdaemon.
			 * This never happens because we never yield.
			 *
			 * The real solution is to identify and fix these cases
			 * so we aren't effectively busy-waiting in a loop
			 * until the reclamation path has cycles to run.
			 */
			kern_yield(PRI_USER);
			goto loop;
		}

		/*
		 * This code is used to make sure that a buffer is not
		 * created while the getnewbuf routine is blocked.
		 * This can be a problem whether the vnode is locked or not.
		 * If the buffer is created out from under us, we have to
		 * throw away the one we just created.
		 *
		 * Note: this must occur before we associate the buffer
		 * with the vp especially considering limitations in
		 * the splay tree implementation when dealing with duplicate
		 * lblkno's.
		 */
		BO_LOCK(bo);
		if (gbincore(bo, blkno)) {
			BO_UNLOCK(bo);
			bp->b_flags |= B_INVAL;
			brelse(bp);
			bufspace_release(maxsize);
			goto loop;
		}

		/*
		 * Insert the buffer into the hash, so that it can
		 * be found by incore.
		 */
		bp->b_blkno = bp->b_lblkno = blkno;
		bp->b_offset = offset;
		bgetvp(vp, bp);
		BO_UNLOCK(bo);

		/*
		 * set B_VMIO bit.  allocbuf() the buffer bigger.  Since the
		 * buffer size starts out as 0, B_CACHE will be set by
		 * allocbuf() for the VMIO case prior to it testing the
		 * backing store for validity.
		 */

		if (vmio) {
			bp->b_flags |= B_VMIO;
			KASSERT(vp->v_object == bp->b_bufobj->bo_object,
			    ("ARGH! different b_bufobj->bo_object %p %p %p\n",
			    bp, vp->v_object, bp->b_bufobj->bo_object));
		} else {
			bp->b_flags &= ~B_VMIO;
			KASSERT(bp->b_bufobj->bo_object == NULL,
			    ("ARGH! has b_bufobj->bo_object %p %p\n",
			    bp, bp->b_bufobj->bo_object));
			BUF_CHECK_MAPPED(bp);
		}

		allocbuf(bp, size);
		bufspace_release(maxsize);
		bp->b_flags &= ~B_DONE;
	}
	CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp);
	BUF_ASSERT_HELD(bp);
end:
+	buf_track(bp, __func__);
	KASSERT(bp->b_bufobj == bo,
	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
	return (bp);
}

/*
 * Get an empty, disassociated buffer of given size.  The buffer is initially
 * set to B_INVAL.
 *
 * The allocation is retried until getnewbuf() succeeds; NULL is returned
 * only when GB_NOWAIT_BD is set in flags and the current thread has
 * TDP_BUFNEED set.
 */
struct buf *
geteblk(int size, int flags)
{
	struct buf *bp;
	int maxsize;

	/* Round the request up to the KVA allocation granularity. */
	maxsize = (size + BKVAMASK) & ~BKVAMASK;
	while ((bp = getnewbuf(NULL, 0, 0, maxsize, flags)) == NULL) {
		if ((flags & GB_NOWAIT_BD) &&
		    (curthread->td_pflags & TDP_BUFNEED) != 0)
			return (NULL);
	}
	allocbuf(bp, size);
	bufspace_release(maxsize);
	bp->b_flags |= B_INVAL;	/* b_dep cleared by getnewbuf() */
	BUF_ASSERT_HELD(bp);
	return (bp);
}

/*
 * Truncate the backing store for a non-vmio buffer.
 */
static void
vfs_nonvmio_truncate(struct buf *bp, int newbsize)
{

	if (bp->b_flags & B_MALLOC) {
		/*
		 * malloced buffers are not shrunk
		 */
		if (newbsize == 0) {
			/* Shrinking to nothing: give the malloc'ed data back. */
			bufmallocadjust(bp, 0);
			free(bp->b_data, M_BIOBUF);
			bp->b_data = bp->b_kvabase;
			bp->b_flags &= ~B_MALLOC;
		}
		return;
	}
	vm_hold_free_pages(bp, newbsize);
	bufspace_adjust(bp, newbsize);
}

/*
 * Extend the backing for a non-VMIO buffer.
 */
static void
vfs_nonvmio_extend(struct buf *bp, int newbsize)
{
	caddr_t origbuf;
	int origbufsize;

	/*
	 * We only use malloced memory on the first allocation.
	 * and revert to page-allocated memory when the buffer
	 * grows.
	 *
	 * There is a potential smp race here that could lead
	 * to bufmallocspace slightly passing the max.  It
	 * is probably extremely rare and not worth worrying
	 * over.
	 */
	if (bp->b_bufsize == 0 && newbsize <= PAGE_SIZE/2 &&
	    bufmallocspace < maxbufmallocspace) {
		bp->b_data = malloc(newbsize, M_BIOBUF, M_WAITOK);
		bp->b_flags |= B_MALLOC;
		bufmallocadjust(bp, newbsize);
		return;
	}

	/*
	 * If the buffer is growing on its other-than-first
	 * allocation then we revert to the page-allocation
	 * scheme.
	 */
	origbuf = NULL;
	origbufsize = 0;
	if (bp->b_flags & B_MALLOC) {
		/* Remember the malloc'ed data so it can be copied over below. */
		origbuf = bp->b_data;
		origbufsize = bp->b_bufsize;
		bp->b_data = bp->b_kvabase;
		bufmallocadjust(bp, 0);
		bp->b_flags &= ~B_MALLOC;
		newbsize = round_page(newbsize);
	}
	vm_hold_load_pages(bp, (vm_offset_t) bp->b_data + bp->b_bufsize,
	    (vm_offset_t) bp->b_data + newbsize);
	if (origbuf != NULL) {
		bcopy(origbuf, bp->b_data, origbufsize);
		free(origbuf, M_BIOBUF);
	}
	bufspace_adjust(bp, newbsize);
}

/*
 * This code constitutes the buffer memory from either anonymous system
 * memory (in the case of non-VMIO operations) or from an associated
 * VM object (in the case of VMIO operations).  This code is able to
 * resize a buffer up or down.
 *
 * Note that this code is tricky, and has many complications to resolve
 * deadlock or inconsistent data situations.  Tread lightly!!!
 * There are B_CACHE and B_DELWRI interactions that must be dealt with by
 * the caller.  Calling this code willy nilly can result in the loss of data.
 *
 * allocbuf() only adjusts B_CACHE for VMIO buffers.  getblk() deals with
 * B_CACHE for the non-VMIO case.
 *
 * Every return path visible here returns 1.
 */
int
allocbuf(struct buf *bp, int size)
{
	int newbsize;

	BUF_ASSERT_HELD(bp);

	if (bp->b_bcount == size)
		return (1);

	if (bp->b_kvasize != 0 && bp->b_kvasize < size)
		panic("allocbuf: buffer too small");

	newbsize = roundup2(size, DEV_BSIZE);
	if ((bp->b_flags & B_VMIO) == 0) {
		if ((bp->b_flags & B_MALLOC) == 0)
			newbsize = round_page(newbsize);
		/*
		 * Just get anonymous memory from the kernel.  Don't
		 * mess with B_CACHE.
		 */
		if (newbsize < bp->b_bufsize)
			vfs_nonvmio_truncate(bp, newbsize);
		else if (newbsize > bp->b_bufsize)
			vfs_nonvmio_extend(bp, newbsize);
	} else {
		int desiredpages;

		desiredpages = (size == 0) ? 0 :
		    num_pages((bp->b_offset & PAGE_MASK) + newbsize);

		if (bp->b_flags & B_MALLOC)
			panic(
		    "allocbuf: VMIO buffer can't be malloced");
		/*
		 * Set B_CACHE initially if buffer is 0 length or will become
		 * 0-length.
		 */
		if (size == 0 || bp->b_bufsize == 0)
			bp->b_flags |= B_CACHE;

		if (newbsize < bp->b_bufsize)
			vfs_vmio_truncate(bp, desiredpages);
		/* XXX This looks as if it should be newbsize > b_bufsize */
		else if (size > bp->b_bcount)
			vfs_vmio_extend(bp, desiredpages, size);
		bufspace_adjust(bp, newbsize);
	}
	bp->b_bcount = size;		/* requested buffer size. */
	return (1);
}

extern int inflight_transient_maps;

/*
 * Complete a bio: tear down its transient mapping if it has one, then
 * either call the bio's bio_done callback or, when there is none, mark
 * the bio BIO_DONE and wake up sleepers on it.
 */
void
biodone(struct bio *bp)
{
	struct mtx *mtxp;
	void (*done)(struct bio *);
	vm_offset_t start, end;

+	biotrack(bp, __func__);
	if ((bp->bio_flags & BIO_TRANSIENT_MAPPING) != 0) {
		/* Undo the transient mapping and return its KVA to the arena. */
		bp->bio_flags &= ~BIO_TRANSIENT_MAPPING;
		bp->bio_flags |= BIO_UNMAPPED;
		start = trunc_page((vm_offset_t)bp->bio_data);
		end = round_page((vm_offset_t)bp->bio_data + bp->bio_length);
		bp->bio_data = unmapped_buf;
		pmap_qremove(start, OFF_TO_IDX(end - start));
		vmem_free(transient_arena, start, end - start);
		atomic_add_int(&inflight_transient_maps, -1);
	}
	done = bp->bio_done;
	if (done == NULL) {
		/* No callback: BIO_DONE is set under the pool mutex. */
		mtxp = mtx_pool_find(mtxpool_sleep, bp);
		mtx_lock(mtxp);
		bp->bio_flags |= BIO_DONE;
		wakeup(bp);
		mtx_unlock(mtxp);
	} else {
		bp->bio_flags |= BIO_DONE;
		done(bp);
	}
}

/*
 * Wait for a BIO to finish.
*/ int biowait(struct bio *bp, const char *wchan) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); while ((bp->bio_flags & BIO_DONE) == 0) msleep(bp, mtxp, PRIBIO, wchan, 0); mtx_unlock(mtxp); if (bp->bio_error != 0) return (bp->bio_error); if (!(bp->bio_flags & BIO_ERROR)) return (0); return (EIO); } void biofinish(struct bio *bp, struct devstat *stat, int error) { if (error) { bp->bio_error = error; bp->bio_flags |= BIO_ERROR; } if (stat != NULL) devstat_end_transaction_bio(stat, bp); biodone(bp); } +#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) +void +biotrack_buf(struct bio *bp, const char *location) +{ + + buf_track(bp->bio_track_bp, location); +} +#endif + /* * bufwait: * * Wait for buffer I/O completion, returning error status. The buffer * is left locked and B_DONE on return. B_EINTR is converted into an EINTR * error and cleared. */ int bufwait(struct buf *bp) { if (bp->b_iocmd == BIO_READ) bwait(bp, PRIBIO, "biord"); else bwait(bp, PRIBIO, "biowr"); if (bp->b_flags & B_EINTR) { bp->b_flags &= ~B_EINTR; return (EINTR); } if (bp->b_ioflags & BIO_ERROR) { return (bp->b_error ? bp->b_error : EIO); } else { return (0); } } /* * bufdone: * * Finish I/O on a buffer, optionally calling a completion function. * This is usually called from an interrupt so process blocking is * not allowed. * * biodone is also responsible for setting B_CACHE in a B_VMIO bp. * In a non-VMIO bp, B_CACHE will be set on the next getblk() * assuming B_INVAL is clear. * * For the VMIO case, we set B_CACHE if the op was a read and no * read error occurred, or if the op was a write. B_CACHE is never * set if the buffer is invalid or otherwise uncacheable. * * biodone does not mess with B_INVAL, allowing the I/O routine or the * initiator to leave B_INVAL set to brelse the buffer out of existence * in the biodone routine. 
*/ void bufdone(struct buf *bp) { struct bufobj *dropobj; void (*biodone)(struct buf *); + buf_track(bp, __func__); CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); dropobj = NULL; KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp)); BUF_ASSERT_HELD(bp); runningbufwakeup(bp); if (bp->b_iocmd == BIO_WRITE) dropobj = bp->b_bufobj; /* call optional completion function if requested */ if (bp->b_iodone != NULL) { biodone = bp->b_iodone; bp->b_iodone = NULL; (*biodone) (bp); if (dropobj) bufobj_wdrop(dropobj); return; } bufdone_finish(bp); if (dropobj) bufobj_wdrop(dropobj); } void bufdone_finish(struct buf *bp) { BUF_ASSERT_HELD(bp); if (!LIST_EMPTY(&bp->b_dep)) buf_complete(bp); if (bp->b_flags & B_VMIO) { /* * Set B_CACHE if the op was a normal read and no error * occurred. B_CACHE is set for writes in the b*write() * routines. */ if (bp->b_iocmd == BIO_READ && !(bp->b_flags & (B_INVAL|B_NOCACHE)) && !(bp->b_ioflags & BIO_ERROR)) bp->b_flags |= B_CACHE; vfs_vmio_iodone(bp); } /* * For asynchronous completions, release the buffer now. The brelse * will do a wakeup there if necessary - so no need to do a wakeup * here in the async case. The sync case always needs to do a wakeup. */ if (bp->b_flags & B_ASYNC) { if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) || (bp->b_ioflags & BIO_ERROR)) brelse(bp); else bqrelse(bp); } else bdone(bp); } /* * This routine is called in lieu of iodone in the case of * incomplete I/O. This keeps the busy status for pages * consistent. 
*/
void
vfs_unbusy_pages(struct buf *bp)
{
	int i;
	vm_object_t obj;
	vm_page_t m;

	runningbufwakeup(bp);
	if (!(bp->b_flags & B_VMIO))
		return;

	obj = bp->b_bufobj->bo_object;
	VM_OBJECT_WLOCK(obj);
	for (i = 0; i < bp->b_npages; i++) {
		m = bp->b_pages[i];
		if (m == bogus_page) {
			/*
			 * Put the real page back in place of the bogus
			 * placeholder and refresh the mapping if the buffer
			 * is mapped.
			 */
			m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i);
			if (!m)
				panic("vfs_unbusy_pages: page missing\n");
			bp->b_pages[i] = m;
			if (buf_mapped(bp)) {
				BUF_CHECK_MAPPED(bp);
				pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
				    bp->b_pages, bp->b_npages);
			} else
				BUF_CHECK_UNMAPPED(bp);
		}
		vm_page_sunbusy(m);
	}
	vm_object_pip_wakeupn(obj, bp->b_npages);
	VM_OBJECT_WUNLOCK(obj);
}

/*
 * vfs_page_set_valid:
 *
 *	Set the valid bits in a page based on the supplied offset.   The
 *	range is restricted to the buffer's size.
 *
 *	This routine is typically called after a read completes.
 */
static void
vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m)
{
	vm_ooffset_t eoff;

	/*
	 * Compute the end offset, eoff, such that [off, eoff) does not span a
	 * page boundary and eoff is not greater than the end of the buffer.
	 * The end of the buffer, in this case, is our file EOF, not the
	 * allocation size of the buffer.
	 */
	eoff = (off + PAGE_SIZE) & ~(vm_ooffset_t)PAGE_MASK;
	if (eoff > bp->b_offset + bp->b_bcount)
		eoff = bp->b_offset + bp->b_bcount;

	/*
	 * Set valid range.  This is typically the entire buffer and thus the
	 * entire page.
	 */
	if (eoff > off)
		vm_page_set_valid_range(m, off & PAGE_MASK, eoff - off);
}

/*
 * vfs_page_set_validclean:
 *
 *	Set the valid bits and clear the dirty bits in a page based on the
 *	supplied offset.   The range is restricted to the buffer's size.
 */
static void
vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m)
{
	vm_ooffset_t soff, eoff;

	/*
	 * Start and end offsets in buffer.  eoff - soff may not cross a
	 * page boundary or cross the end of the buffer.  The end of the
	 * buffer, in this case, is our file EOF, not the allocation size
	 * of the buffer.
	 */
	soff = off;
	eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK;
	if (eoff > bp->b_offset + bp->b_bcount)
		eoff = bp->b_offset + bp->b_bcount;

	/*
	 * Set valid range.  This is typically the entire buffer and thus the
	 * entire page.
	 */
	if (eoff > soff) {
		vm_page_set_validclean(
		    m,
		   (vm_offset_t) (soff & PAGE_MASK),
		   (vm_offset_t) (eoff - soff)
		);
	}
}

/*
 * Ensure that all buffer pages are not exclusive busied.  If any page is
 * exclusive busy, drain it.
 *
 * The caller must hold the object write lock; it is dropped and
 * reacquired around each sleep.
 */
void
vfs_drain_busy_pages(struct buf *bp)
{
	vm_page_t m;
	int i, last_busied;

	VM_OBJECT_ASSERT_WLOCKED(bp->b_bufobj->bo_object);
	last_busied = 0;
	for (i = 0; i < bp->b_npages; i++) {
		m = bp->b_pages[i];
		if (vm_page_xbusied(m)) {
			/*
			 * Shared-busy the pages already checked before
			 * sleeping with the object lock dropped.
			 */
			for (; last_busied < i; last_busied++)
				vm_page_sbusy(bp->b_pages[last_busied]);
			while (vm_page_xbusied(m)) {
				vm_page_lock(m);
				VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
				vm_page_busy_sleep(m, "vbpage", true);
				VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
			}
		}
	}
	/* Drop the shared-busy state taken above. */
	for (i = 0; i < last_busied; i++)
		vm_page_sunbusy(bp->b_pages[i]);
}

/*
 * This routine is called before a device strategy routine.
 * It is used to tell the VM system that paging I/O is in
 * progress, and treat the pages associated with the buffer
 * almost as being exclusive busy.  Also the object paging_in_progress
 * flag is handled to make sure that the object doesn't become
 * inconsistent.
 *
 * Since I/O has not been initiated yet, certain buffer flags
 * such as BIO_ERROR or B_INVAL may be in an inconsistent state
 * and should be ignored.
*/
void
vfs_busy_pages(struct buf *bp, int clear_modify)
{
	int i, bogus;
	vm_object_t obj;
	vm_ooffset_t foff;
	vm_page_t m;

	if (!(bp->b_flags & B_VMIO))
		return;

	obj = bp->b_bufobj->bo_object;
	foff = bp->b_offset;
	KASSERT(bp->b_offset != NOOFFSET,
	    ("vfs_busy_pages: no buffer offset"));
	VM_OBJECT_WLOCK(obj);
	vfs_drain_busy_pages(bp);
	if (bp->b_bufsize != 0)
		vfs_setdirty_locked_object(bp);
	bogus = 0;
	for (i = 0; i < bp->b_npages; i++) {
		m = bp->b_pages[i];

		if ((bp->b_flags & B_CLUSTER) == 0) {
			vm_object_pip_add(obj, 1);
			vm_page_sbusy(m);
		}
		/*
		 * When readying a buffer for a read ( i.e
		 * clear_modify == 0 ), it is important to do
		 * bogus_page replacement for valid pages in
		 * partially instantiated buffers.  Partially
		 * instantiated buffers can, in turn, occur when
		 * reconstituting a buffer from its VM backing store
		 * base.  We only have to do this if B_CACHE is
		 * clear ( which causes the I/O to occur in the
		 * first place ).  The replacement prevents the read
		 * I/O from overwriting potentially dirty VM-backed
		 * pages.  XXX bogus page replacement is, uh, bogus.
		 * It may not work properly with small-block devices.
		 * We need to find a better way.
		 */
		if (clear_modify) {
			pmap_remove_write(m);
			vfs_page_set_validclean(bp, foff, m);
		} else if (m->valid == VM_PAGE_BITS_ALL &&
		    (bp->b_flags & B_CACHE) == 0) {
			bp->b_pages[i] = bogus_page;
			bogus++;
		}
		/* Advance to the start of the next page. */
		foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
	}
	VM_OBJECT_WUNLOCK(obj);
	/* Remap the buffer if any page was replaced by the bogus page. */
	if (bogus && buf_mapped(bp)) {
		BUF_CHECK_MAPPED(bp);
		pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
		    bp->b_pages, bp->b_npages);
	}
}

/*
 *	vfs_bio_set_valid:
 *
 *	Set the range within the buffer to valid.  The range is
 *	relative to the beginning of the buffer, b_offset.  Note that
 *	b_offset itself may be offset from the beginning of the first
 *	page.
 */
void
vfs_bio_set_valid(struct buf *bp, int base, int size)
{
	int i, n;
	vm_page_t m;

	if (!(bp->b_flags & B_VMIO))
		return;

	/*
	 * Fixup base to be relative to beginning of first page.
	 * Set initial n to be the maximum number of bytes in the
	 * first page that can be validated.
	 */
	base += (bp->b_offset & PAGE_MASK);
	n = PAGE_SIZE - (base & PAGE_MASK);

	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
	for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
		m = bp->b_pages[i];
		if (n > size)
			n = size;
		vm_page_set_valid_range(m, base & PAGE_MASK, n);
		base += n;
		size -= n;
		/* Every page after the first starts at its beginning. */
		n = PAGE_SIZE;
	}
	VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
}

/*
 *	vfs_bio_clrbuf:
 *
 *	If the specified buffer is a non-VMIO buffer, clear the entire
 *	buffer.  If the specified buffer is a VMIO buffer, clear and
 *	validate only the previously invalid portions of the buffer.
 *	This routine essentially fakes an I/O, so we need to clear
 *	BIO_ERROR and B_INVAL.
 *
 *	Note that while we only theoretically need to clear through b_bcount,
 *	we go ahead and clear through b_bufsize.
 */
void
vfs_bio_clrbuf(struct buf *bp)
{
	int i, j, mask, sa, ea, slide;

	if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) {
		clrbuf(bp);
		return;
	}
	bp->b_flags &= ~B_INVAL;
	bp->b_ioflags &= ~BIO_ERROR;
	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
	/* Fast path: a single, page-aligned, sub-page buffer. */
	if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
	    (bp->b_offset & PAGE_MASK) == 0) {
		if (bp->b_pages[0] == bogus_page)
			goto unlock;
		/* One valid bit per DEV_BSIZE chunk of the buffer. */
		mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1;
		VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[0]->object);
		if ((bp->b_pages[0]->valid & mask) == mask)
			goto unlock;
		if ((bp->b_pages[0]->valid & mask) == 0) {
			pmap_zero_page_area(bp->b_pages[0], 0, bp->b_bufsize);
			bp->b_pages[0]->valid |= mask;
			goto unlock;
		}
	}
	/* sa and ea are the start and end of the range within each page. */
	sa = bp->b_offset & PAGE_MASK;
	slide = 0;
	for (i = 0; i < bp->b_npages; i++, sa = 0) {
		slide = imin(slide + PAGE_SIZE, bp->b_offset + bp->b_bufsize);
		ea = slide & PAGE_MASK;
		if (ea == 0)
			ea = PAGE_SIZE;
		if (bp->b_pages[i] == bogus_page)
			continue;
		j = sa / DEV_BSIZE;
		mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j;
		VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[i]->object);
		if ((bp->b_pages[i]->valid & mask) == mask)
			continue;
		if ((bp->b_pages[i]->valid & mask) == 0)
			pmap_zero_page_area(bp->b_pages[i], sa, ea - sa);
		else {
			/* Partially valid: zero only the invalid chunks. */
			for (; sa < ea; sa += DEV_BSIZE, j++) {
				if ((bp->b_pages[i]->valid & (1 << j)) == 0) {
					pmap_zero_page_area(bp->b_pages[i],
					    sa, DEV_BSIZE);
				}
			}
		}
		bp->b_pages[i]->valid |= mask;
	}
unlock:
	VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
	bp->b_resid = 0;
}

/*
 * Zero size bytes of the buffer starting at byte offset base.  A mapped
 * buffer is zeroed through b_data; an unmapped one page by page.
 */
void
vfs_bio_bzero_buf(struct buf *bp, int base, int size)
{
	vm_page_t m;
	int i, n;

	if (buf_mapped(bp)) {
		BUF_CHECK_MAPPED(bp);
		bzero(bp->b_data + base, size);
	} else {
		BUF_CHECK_UNMAPPED(bp);
		/* n is the number of bytes to zero in the current page. */
		n = PAGE_SIZE - (base & PAGE_MASK);
		for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
			m = bp->b_pages[i];
			if (n > size)
				n = size;
			pmap_zero_page_area(m, base & PAGE_MASK, n);
			base += n;
			size -= n;
			n = PAGE_SIZE;
		}
	}
}

/*
 * vm_hold_load_pages and vm_hold_free_pages get pages into
 * a buffer's address space.  The pages are anonymous and are
 * not associated with a file object.
 */
static void
vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to)
{
	vm_offset_t pg;
	vm_page_t p;
	int index;

	BUF_CHECK_MAPPED(bp);

	to = round_page(to);
	from = round_page(from);
	index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;

	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
tryagain:
		/*
		 * note: must allocate system pages since blocking here
		 * could interfere with paging I/O, no matter which
		 * process we are.
		 */
		p = vm_page_alloc(NULL, 0, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ |
		    VM_ALLOC_WIRED | VM_ALLOC_COUNT((to - pg) >> PAGE_SHIFT));
		if (p == NULL) {
			/* Wait (VM_WAIT) and retry the allocation. */
			VM_WAIT;
			goto tryagain;
		}
		pmap_qenter(pg, &p, 1);
		bp->b_pages[index] = p;
	}
	bp->b_npages = index;
}

/* Return pages associated with this buf to the vm system */
static void
vm_hold_free_pages(struct buf *bp, int newbsize)
{
	vm_offset_t from;
	vm_page_t p;
	int index, newnpages;

	BUF_CHECK_MAPPED(bp);

	from = round_page((vm_offset_t)bp->b_data + newbsize);
	newnpages = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
	/* Unmap the tail that is going away before freeing its pages. */
	if (bp->b_npages > newnpages)
		pmap_qremove(from, bp->b_npages - newnpages);
	for (index = newnpages; index < bp->b_npages; index++) {
		p = bp->b_pages[index];
		bp->b_pages[index] = NULL;
		if (vm_page_sbusied(p))
			printf("vm_hold_free_pages: blkno: %jd, lblkno: %jd\n",
			    (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno);
		p->wire_count--;
		vm_page_free(p);
		atomic_subtract_int(&vm_cnt.v_wire_count, 1);
	}
	bp->b_npages = newnpages;
}

/*
 * Map an IO request into kernel virtual address space.
 *
 * All requests are (re)mapped into kernel VA space.
 * Notice that we use b_bufsize for the size of the buffer
 * to be mapped.  b_bcount might be modified by the driver.
 *
 * Note that even if the caller determines that the address space should
 * be valid, a race or a smaller-file mapped into a larger space may
 * actually cause vmapbuf() to fail, so all callers of vmapbuf() MUST
 * check the return value.
 *
 * This function only works with pager buffers.
*/ int vmapbuf(struct buf *bp, int mapbuf) { vm_prot_t prot; int pidx; if (bp->b_bufsize < 0) return (-1); prot = VM_PROT_READ; if (bp->b_iocmd == BIO_READ) prot |= VM_PROT_WRITE; /* Less backwards than it looks */ if ((pidx = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, (vm_offset_t)bp->b_data, bp->b_bufsize, prot, bp->b_pages, btoc(MAXPHYS))) < 0) return (-1); bp->b_npages = pidx; bp->b_offset = ((vm_offset_t)bp->b_data) & PAGE_MASK; if (mapbuf || !unmapped_buf_allowed) { pmap_qenter((vm_offset_t)bp->b_kvabase, bp->b_pages, pidx); bp->b_data = bp->b_kvabase + bp->b_offset; } else bp->b_data = unmapped_buf; return(0); } /* * Free the io map PTEs associated with this IO operation. * We also invalidate the TLB entries and restore the original b_addr. * * This function only works with pager buffers. */ void vunmapbuf(struct buf *bp) { int npages; npages = bp->b_npages; if (buf_mapped(bp)) pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages); vm_page_unhold_pages(bp->b_pages, npages); bp->b_data = unmapped_buf; } void bdone(struct buf *bp) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); bp->b_flags |= B_DONE; wakeup(bp); mtx_unlock(mtxp); } void bwait(struct buf *bp, u_char pri, const char *wchan) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); while ((bp->b_flags & B_DONE) == 0) msleep(bp, mtxp, pri, wchan, 0); mtx_unlock(mtxp); } int bufsync(struct bufobj *bo, int waitfor) { return (VOP_FSYNC(bo2vnode(bo), waitfor, curthread)); } void bufstrategy(struct bufobj *bo, struct buf *bp) { int i = 0; struct vnode *vp; vp = bp->b_vp; KASSERT(vp == bo->bo_private, ("Inconsistent vnode bufstrategy")); KASSERT(vp->v_type != VCHR && vp->v_type != VBLK, ("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp)); i = VOP_STRATEGY(vp, bp); KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp)); } void bufobj_wrefl(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); 
ASSERT_BO_WLOCKED(bo); bo->bo_numoutput++; } void bufobj_wref(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); BO_LOCK(bo); bo->bo_numoutput++; BO_UNLOCK(bo); } void bufobj_wdrop(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wdrop")); BO_LOCK(bo); KASSERT(bo->bo_numoutput > 0, ("bufobj_wdrop non-positive count")); if ((--bo->bo_numoutput == 0) && (bo->bo_flag & BO_WWAIT)) { bo->bo_flag &= ~BO_WWAIT; wakeup(&bo->bo_numoutput); } BO_UNLOCK(bo); } int bufobj_wwait(struct bufobj *bo, int slpflag, int timeo) { int error; KASSERT(bo != NULL, ("NULL bo in bufobj_wwait")); ASSERT_BO_WLOCKED(bo); error = 0; while (bo->bo_numoutput) { bo->bo_flag |= BO_WWAIT; error = msleep(&bo->bo_numoutput, BO_LOCKPTR(bo), slpflag | (PRIBIO + 1), "bo_wwait", timeo); if (error) break; } return (error); } /* * Set bio_data or bio_ma for struct bio from the struct buf. */ void bdata2bio(struct buf *bp, struct bio *bip) { if (!buf_mapped(bp)) { KASSERT(unmapped_buf_allowed, ("unmapped")); bip->bio_ma = bp->b_pages; bip->bio_ma_n = bp->b_npages; bip->bio_data = unmapped_buf; bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK; bip->bio_flags |= BIO_UNMAPPED; KASSERT(round_page(bip->bio_ma_offset + bip->bio_length) / PAGE_SIZE == bp->b_npages, ("Buffer %p too short: %d %lld %d", bp, bip->bio_ma_offset, (long long)bip->bio_length, bip->bio_ma_n)); } else { bip->bio_data = bp->b_data; bip->bio_ma = NULL; } } static int buf_pager_relbuf; SYSCTL_INT(_vfs, OID_AUTO, buf_pager_relbuf, CTLFLAG_RWTUN, &buf_pager_relbuf, 0, "Make buffer pager release buffers after reading"); /* * The buffer pager. It uses buffer reads to validate pages. * * In contrast to the generic local pager from vm/vnode_pager.c, this * pager correctly and easily handles volumes where the underlying * device block size is greater than the machine page size. 
The * buffer cache transparently extends the requested page run to be * aligned at the block boundary, and does the necessary bogus page * replacements in the addends to avoid obliterating already valid * pages. * * The only non-trivial issue is that the exclusive busy state for * pages, which is assumed by the vm_pager_getpages() interface, is * incompatible with the VMIO buffer cache's desire to share-busy the * pages. This function performs a trivial downgrade of the pages' * state before reading buffers, and a less trivial upgrade from the * shared-busy to excl-busy state after the read. */ int vfs_bio_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, int *rahead, vbg_get_lblkno_t get_lblkno, vbg_get_blksize_t get_blksize) { vm_page_t m; vm_object_t object; struct buf *bp; daddr_t lbn, lbnp; vm_ooffset_t la, lb, poff, poffe; long bsize; int bo_bs, error, i; bool redo, lpart; object = vp->v_object; la = IDX_TO_OFF(ma[count - 1]->pindex); if (la >= object->un_pager.vnp.vnp_size) return (VM_PAGER_BAD); lpart = la + PAGE_SIZE > object->un_pager.vnp.vnp_size; bo_bs = get_blksize(vp, get_lblkno(vp, IDX_TO_OFF(ma[0]->pindex))); if (rbehind != NULL) { lb = IDX_TO_OFF(ma[0]->pindex); *rbehind = OFF_TO_IDX(lb - rounddown2(lb, bo_bs)); } if (rahead != NULL) { *rahead = OFF_TO_IDX(roundup2(la, bo_bs) - la); if (la + IDX_TO_OFF(*rahead) >= object->un_pager.vnp.vnp_size) { *rahead = OFF_TO_IDX(roundup2(object->un_pager. vnp.vnp_size, PAGE_SIZE) - la); } } VM_OBJECT_WLOCK(object); again: for (i = 0; i < count; i++) vm_page_busy_downgrade(ma[i]); VM_OBJECT_WUNLOCK(object); lbnp = -1; for (i = 0; i < count; i++) { m = ma[i]; /* * Pages are shared busy and the object lock is not * owned, which together allow for the pages' * invalidation. The racy test for validity avoids * useless creation of the buffer for the most typical * case when invalidation is not used in redo or for * parallel read. 
The shared->excl upgrade loop at * the end of the function catches the race in a * reliable way (protected by the object lock). */ if (m->valid == VM_PAGE_BITS_ALL) continue; poff = IDX_TO_OFF(m->pindex); poffe = MIN(poff + PAGE_SIZE, object->un_pager.vnp.vnp_size); for (; poff < poffe; poff += bsize) { lbn = get_lblkno(vp, poff); if (lbn == lbnp) goto next_page; lbnp = lbn; bsize = get_blksize(vp, lbn); error = bread_gb(vp, lbn, bsize, NOCRED, GB_UNMAPPED, &bp); if (error != 0) goto end_pages; if (LIST_EMPTY(&bp->b_dep)) { /* * Invalidation clears m->valid, but * may leave B_CACHE flag if the * buffer existed at the invalidation * time. In this case, recycle the * buffer to do real read on next * bread() after redo. * * Otherwise B_RELBUF is not strictly * necessary, enable to reduce buf * cache pressure. */ if (buf_pager_relbuf || m->valid != VM_PAGE_BITS_ALL) bp->b_flags |= B_RELBUF; bp->b_flags &= ~B_NOCACHE; brelse(bp); } else { bqrelse(bp); } } KASSERT(1 /* racy, enable for debugging */ || m->valid == VM_PAGE_BITS_ALL || i == count - 1, ("buf %d %p invalid", i, m)); if (i == count - 1 && lpart) { VM_OBJECT_WLOCK(object); if (m->valid != 0 && m->valid != VM_PAGE_BITS_ALL) vm_page_zero_invalid(m, TRUE); VM_OBJECT_WUNLOCK(object); } next_page:; } end_pages: VM_OBJECT_WLOCK(object); redo = false; for (i = 0; i < count; i++) { vm_page_sunbusy(ma[i]); ma[i] = vm_page_grab(object, ma[i]->pindex, VM_ALLOC_NORMAL); /* * Since the pages were only sbusy while neither the * buffer nor the object lock was held by us, or * reallocated while vm_page_grab() slept for busy * relinguish, they could have been invalidated. * Recheck the valid bits and re-read as needed. * * Note that the last page is made fully valid in the * read loop, and partial validity for the page at * index count - 1 could mean that the page was * invalidated or removed, so we must restart for * safety as well. 
*/ if (ma[i]->valid != VM_PAGE_BITS_ALL) redo = true; } if (redo && error == 0) goto again; VM_OBJECT_WUNLOCK(object); return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK); } #include "opt_ddb.h" #ifdef DDB #include /* DDB command to show buffer data */ DB_SHOW_COMMAND(buffer, db_show_buffer) { /* get args */ struct buf *bp = (struct buf *)addr; +#ifdef FULL_BUF_TRACKING + uint32_t i, j; +#endif if (!have_addr) { db_printf("usage: show buffer \n"); return; } db_printf("buf at %p\n", bp); db_printf("b_flags = 0x%b, b_xflags=0x%b, b_vflags=0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS, (u_int)bp->b_xflags, PRINT_BUF_XFLAGS, (u_int)bp->b_vflags, PRINT_BUF_VFLAGS); db_printf( "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n" "b_bufobj = (%p), b_data = %p, b_blkno = %jd, b_lblkno = %jd, " "b_dep = %p\n", bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid, bp->b_bufobj, bp->b_data, (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno, bp->b_dep.lh_first); db_printf("b_kvabase = %p, b_kvasize = %d\n", bp->b_kvabase, bp->b_kvasize); if (bp->b_npages) { int i; db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages); for (i = 0; i < bp->b_npages; i++) { vm_page_t m; m = bp->b_pages[i]; if (m != NULL) db_printf("(%p, 0x%lx, 0x%lx)", m->object, (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m)); else db_printf("( ??? 
)"); if ((i + 1) < bp->b_npages) db_printf(","); } db_printf("\n"); } +#if defined(FULL_BUF_TRACKING) + db_printf("b_io_tracking: b_io_tcnt = %u\n", bp->b_io_tcnt); + + i = bp->b_io_tcnt % BUF_TRACKING_SIZE; + for (j = 1; j <= BUF_TRACKING_SIZE; j++) + db_printf(" %2u: %s\n", j, + bp->b_io_tracking[BUF_TRACKING_ENTRY(i - j)]); +#elif defined(BUF_TRACKING) + db_printf("b_io_tracking: %s\n", bp->b_io_tracking); +#endif db_printf(" "); BUF_LOCKPRINTINFO(bp); } DB_SHOW_COMMAND(lockedbufs, lockedbufs) { struct buf *bp; int i; for (i = 0; i < nbuf; i++) { bp = &buf[i]; if (BUF_ISLOCKED(bp)) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); } } } DB_SHOW_COMMAND(vnodebufs, db_show_vnodebufs) { struct vnode *vp; struct buf *bp; if (!have_addr) { db_printf("usage: show vnodebufs \n"); return; } vp = (struct vnode *)addr; db_printf("Clean buffers:\n"); TAILQ_FOREACH(bp, &vp->v_bufobj.bo_clean.bv_hd, b_bobufs) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); } db_printf("Dirty buffers:\n"); TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); } } DB_COMMAND(countfreebufs, db_coundfreebufs) { struct buf *bp; int i, used = 0, nfree = 0; if (have_addr) { db_printf("usage: countfreebufs\n"); return; } for (i = 0; i < nbuf; i++) { bp = &buf[i]; if (bp->b_qindex == QUEUE_EMPTY) nfree++; else used++; } db_printf("Counted %d free, %d used (%d tot)\n", nfree, used, nfree + used); db_printf("numfreebuffers is %d\n", numfreebuffers); } #endif /* DDB */ Index: head/sys/kern/vfs_cluster.c =================================================================== --- head/sys/kern/vfs_cluster.c (revision 308154) +++ head/sys/kern/vfs_cluster.c (revision 308155) @@ -1,1063 +1,1064 @@ /*- * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. * Modifications/enhancements: * Copyright (c) 1995 John S. Dyson. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_debug_cluster.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(CLUSTERDEBUG) static int rcluster= 0; SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, "Debug VFS clustering code"); #endif static MALLOC_DEFINE(M_SEGMENT, "cl_savebuf", "cluster_save buffer"); static struct cluster_save *cluster_collectbufs(struct vnode *vp, struct buf *last_bp, int gbflags); static struct buf *cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn, daddr_t blkno, long size, int run, int gbflags, struct buf *fbp); static void cluster_callback(struct buf *); static int write_behind = 1; SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0, "Cluster write-behind; 0: disable, 1: enable, 2: backed off"); static int read_max = 64; SYSCTL_INT(_vfs, OID_AUTO, read_max, CTLFLAG_RW, &read_max, 0, "Cluster read-ahead max block count"); static int read_min = 1; SYSCTL_INT(_vfs, OID_AUTO, read_min, CTLFLAG_RW, &read_min, 0, "Cluster read min block count"); /* Page expended to mark partially backed buffers */ extern vm_page_t bogus_page; /* * Read data to a buf, including read-ahead if we find this to be beneficial. * cluster_read replaces bread. */ int cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size, struct ucred *cred, long totread, int seqcount, int gbflags, struct buf **bpp) { struct buf *bp, *rbp, *reqbp; struct bufobj *bo; daddr_t blkno, origblkno; int maxra, racluster; int error, ncontig; int i; error = 0; bo = &vp->v_bufobj; if (!unmapped_buf_allowed) gbflags &= ~GB_UNMAPPED; /* * Try to limit the amount of read-ahead by a few * ad-hoc parameters. This needs work!!! 
*/ racluster = vp->v_mount->mnt_iosize_max / size; maxra = seqcount; maxra = min(read_max, maxra); maxra = min(nbuf/8, maxra); if (((u_quad_t)(lblkno + maxra + 1) * size) > filesize) maxra = (filesize / size) - lblkno; /* * get the requested block */ *bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0, gbflags); if (bp == NULL) return (EBUSY); origblkno = lblkno; /* * if it is in the cache, then check to see if the reads have been * sequential. If they have, then try some read-ahead, otherwise * back-off on prospective read-aheads. */ if (bp->b_flags & B_CACHE) { if (!seqcount) { return 0; } else if ((bp->b_flags & B_RAM) == 0) { return 0; } else { bp->b_flags &= ~B_RAM; BO_RLOCK(bo); for (i = 1; i < maxra; i++) { /* * Stop if the buffer does not exist or it * is invalid (about to go away?) */ rbp = gbincore(&vp->v_bufobj, lblkno+i); if (rbp == NULL || (rbp->b_flags & B_INVAL)) break; /* * Set another read-ahead mark so we know * to check again. (If we can lock the * buffer without waiting) */ if ((((i % racluster) == (racluster - 1)) || (i == (maxra - 1))) && (0 == BUF_LOCK(rbp, LK_EXCLUSIVE | LK_NOWAIT, NULL))) { rbp->b_flags |= B_RAM; BUF_UNLOCK(rbp); } } BO_RUNLOCK(bo); if (i >= maxra) { return 0; } lblkno += i; } reqbp = bp = NULL; /* * If it isn't in the cache, then get a chunk from * disk if sequential, otherwise just get the block. */ } else { off_t firstread = bp->b_offset; int nblks; long minread; KASSERT(bp->b_offset != NOOFFSET, ("cluster_read: no buffer offset")); ncontig = 0; /* * Adjust totread if needed */ minread = read_min * size; if (minread > totread) totread = minread; /* * Compute the total number of blocks that we should read * synchronously. */ if (firstread + totread > filesize) totread = filesize - firstread; nblks = howmany(totread, size); if (nblks > racluster) nblks = racluster; /* * Now compute the number of contiguous blocks. 
*/ if (nblks > 1) { error = VOP_BMAP(vp, lblkno, NULL, &blkno, &ncontig, NULL); /* * If this failed to map just do the original block. */ if (error || blkno == -1) ncontig = 0; } /* * If we have contiguous data available do a cluster * otherwise just read the requested block. */ if (ncontig) { /* Account for our first block. */ ncontig = min(ncontig + 1, nblks); if (ncontig < nblks) nblks = ncontig; bp = cluster_rbuild(vp, filesize, lblkno, blkno, size, nblks, gbflags, bp); lblkno += (bp->b_bufsize / size); } else { bp->b_flags |= B_RAM; bp->b_iocmd = BIO_READ; lblkno += 1; } } /* * handle the synchronous read so that it is available ASAP. */ if (bp) { if ((bp->b_flags & B_CLUSTER) == 0) { vfs_busy_pages(bp, 0); } bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; if ((bp->b_flags & B_ASYNC) || bp->b_iodone != NULL) BUF_KERNPROC(bp); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); racct_add_buf(curproc, bp, 0); PROC_UNLOCK(curproc); } #endif /* RACCT */ curthread->td_ru.ru_inblock++; } /* * If we have been doing sequential I/O, then do some read-ahead. */ while (lblkno < (origblkno + maxra)) { error = VOP_BMAP(vp, lblkno, NULL, &blkno, &ncontig, NULL); if (error) break; if (blkno == -1) break; /* * We could throttle ncontig here by maxra but we might as * well read the data if it is contiguous. We're throttled * by racluster anyway. 
*/ if (ncontig) { ncontig = min(ncontig + 1, racluster); rbp = cluster_rbuild(vp, filesize, lblkno, blkno, size, ncontig, gbflags, NULL); lblkno += (rbp->b_bufsize / size); if (rbp->b_flags & B_DELWRI) { bqrelse(rbp); continue; } } else { rbp = getblk(vp, lblkno, size, 0, 0, gbflags); lblkno += 1; if (rbp->b_flags & B_DELWRI) { bqrelse(rbp); continue; } rbp->b_flags |= B_ASYNC | B_RAM; rbp->b_iocmd = BIO_READ; rbp->b_blkno = blkno; } if (rbp->b_flags & B_CACHE) { rbp->b_flags &= ~B_ASYNC; bqrelse(rbp); continue; } if ((rbp->b_flags & B_CLUSTER) == 0) { vfs_busy_pages(rbp, 0); } rbp->b_flags &= ~B_INVAL; rbp->b_ioflags &= ~BIO_ERROR; if ((rbp->b_flags & B_ASYNC) || rbp->b_iodone != NULL) BUF_KERNPROC(rbp); rbp->b_iooffset = dbtob(rbp->b_blkno); bstrategy(rbp); #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); racct_add_buf(curproc, rbp, 0); PROC_UNLOCK(curproc); } #endif /* RACCT */ curthread->td_ru.ru_inblock++; } if (reqbp) { /* * Like bread, always brelse() the buffer when * returning an error. */ error = bufwait(reqbp); if (error != 0) { brelse(reqbp); *bpp = NULL; } } return (error); } /* * If blocks are contiguous on disk, use this to provide clustered * read ahead. We will read as many blocks as possible sequentially * and then parcel them up into logical blocks in the buffer hash table. 
*/ static struct buf * cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn, daddr_t blkno, long size, int run, int gbflags, struct buf *fbp) { struct buf *bp, *tbp; daddr_t bn; off_t off; long tinc, tsize; int i, inc, j, k, toff; KASSERT(size == vp->v_mount->mnt_stat.f_iosize, ("cluster_rbuild: size %ld != f_iosize %jd\n", size, (intmax_t)vp->v_mount->mnt_stat.f_iosize)); /* * avoid a division */ while ((u_quad_t) size * (lbn + run) > filesize) { --run; } if (fbp) { tbp = fbp; tbp->b_iocmd = BIO_READ; } else { tbp = getblk(vp, lbn, size, 0, 0, gbflags); if (tbp->b_flags & B_CACHE) return tbp; tbp->b_flags |= B_ASYNC | B_RAM; tbp->b_iocmd = BIO_READ; } tbp->b_blkno = blkno; if( (tbp->b_flags & B_MALLOC) || ((tbp->b_flags & B_VMIO) == 0) || (run <= 1) ) return tbp; bp = trypbuf(&cluster_pbuf_freecnt); if (bp == NULL) return tbp; /* * We are synthesizing a buffer out of vm_page_t's, but * if the block size is not page aligned then the starting * address may not be either. Inherit the b_data offset * from the original buffer. 
*/ bp->b_flags = B_ASYNC | B_CLUSTER | B_VMIO; if ((gbflags & GB_UNMAPPED) != 0) { bp->b_data = unmapped_buf; } else { bp->b_data = (char *)((vm_offset_t)bp->b_data | ((vm_offset_t)tbp->b_data & PAGE_MASK)); } bp->b_iocmd = BIO_READ; bp->b_iodone = cluster_callback; bp->b_blkno = blkno; bp->b_lblkno = lbn; bp->b_offset = tbp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("cluster_rbuild: no buffer offset")); pbgetvp(vp, bp); TAILQ_INIT(&bp->b_cluster.cluster_head); bp->b_bcount = 0; bp->b_bufsize = 0; bp->b_npages = 0; inc = btodb(size); for (bn = blkno, i = 0; i < run; ++i, bn += inc) { if (i == 0) { VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object); vfs_drain_busy_pages(tbp); vm_object_pip_add(tbp->b_bufobj->bo_object, tbp->b_npages); for (k = 0; k < tbp->b_npages; k++) vm_page_sbusy(tbp->b_pages[k]); VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object); } else { if ((bp->b_npages * PAGE_SIZE) + round_page(size) > vp->v_mount->mnt_iosize_max) { break; } tbp = getblk(vp, lbn + i, size, 0, 0, GB_LOCK_NOWAIT | (gbflags & GB_UNMAPPED)); /* Don't wait around for locked bufs. */ if (tbp == NULL) break; /* * Stop scanning if the buffer is fully valid * (marked B_CACHE), or locked (may be doing a * background write), or if the buffer is not * VMIO backed. The clustering code can only deal * with VMIO-backed buffers. The bo lock is not * required for the BKGRDINPROG check since it * can not be set without the buf lock. */ if ((tbp->b_vflags & BV_BKGRDINPROG) || (tbp->b_flags & B_CACHE) || (tbp->b_flags & B_VMIO) == 0) { bqrelse(tbp); break; } /* * The buffer must be completely invalid in order to * take part in the cluster. If it is partially valid * then we stop. 
*/ off = tbp->b_offset; tsize = size; VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object); for (j = 0; tsize > 0; j++) { toff = off & PAGE_MASK; tinc = tsize; if (toff + tinc > PAGE_SIZE) tinc = PAGE_SIZE - toff; VM_OBJECT_ASSERT_WLOCKED(tbp->b_pages[j]->object); if ((tbp->b_pages[j]->valid & vm_page_bits(toff, tinc)) != 0) break; if (vm_page_xbusied(tbp->b_pages[j])) break; vm_object_pip_add(tbp->b_bufobj->bo_object, 1); vm_page_sbusy(tbp->b_pages[j]); off += tinc; tsize -= tinc; } if (tsize > 0) { clean_sbusy: vm_object_pip_add(tbp->b_bufobj->bo_object, -j); for (k = 0; k < j; k++) vm_page_sunbusy(tbp->b_pages[k]); VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object); bqrelse(tbp); break; } VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object); /* * Set a read-ahead mark as appropriate */ if ((fbp && (i == 1)) || (i == (run - 1))) tbp->b_flags |= B_RAM; /* * Set the buffer up for an async read (XXX should * we do this only if we do not wind up brelse()ing?). * Set the block number if it isn't set, otherwise * if it is make sure it matches the block number we * expect. */ tbp->b_flags |= B_ASYNC; tbp->b_iocmd = BIO_READ; if (tbp->b_blkno == tbp->b_lblkno) { tbp->b_blkno = bn; } else if (tbp->b_blkno != bn) { VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object); goto clean_sbusy; } } /* * XXX fbp from caller may not be B_ASYNC, but we are going * to biodone() it in cluster_callback() anyway */ BUF_KERNPROC(tbp); TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, tbp, b_cluster.cluster_entry); VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object); for (j = 0; j < tbp->b_npages; j += 1) { vm_page_t m; m = tbp->b_pages[j]; if ((bp->b_npages == 0) || (bp->b_pages[bp->b_npages-1] != m)) { bp->b_pages[bp->b_npages] = m; bp->b_npages++; } if (m->valid == VM_PAGE_BITS_ALL) tbp->b_pages[j] = bogus_page; } VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object); /* * Don't inherit tbp->b_bufsize as it may be larger due to * a non-page-aligned size. Instead just aggregate using * 'size'. 
*/ if (tbp->b_bcount != size) printf("warning: tbp->b_bcount wrong %ld vs %ld\n", tbp->b_bcount, size); if (tbp->b_bufsize != size) printf("warning: tbp->b_bufsize wrong %ld vs %ld\n", tbp->b_bufsize, size); bp->b_bcount += size; bp->b_bufsize += size; } /* * Fully valid pages in the cluster are already good and do not need * to be re-read from disk. Replace the page with bogus_page */ VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); for (j = 0; j < bp->b_npages; j++) { VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[j]->object); if (bp->b_pages[j]->valid == VM_PAGE_BITS_ALL) bp->b_pages[j] = bogus_page; } VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); if (bp->b_bufsize > bp->b_kvasize) panic("cluster_rbuild: b_bufsize(%ld) > b_kvasize(%d)\n", bp->b_bufsize, bp->b_kvasize); if (buf_mapped(bp)) { pmap_qenter(trunc_page((vm_offset_t) bp->b_data), (vm_page_t *)bp->b_pages, bp->b_npages); } return (bp); } /* * Cleanup after a clustered read or write. * This is complicated by the fact that any of the buffers might have * extra memory (if there were no empty buffer headers at allocbuf time) * that we will need to shift around. */ static void cluster_callback(bp) struct buf *bp; { struct buf *nbp, *tbp; int error = 0; /* * Must propagate errors to all the components. */ if (bp->b_ioflags & BIO_ERROR) error = bp->b_error; if (buf_mapped(bp)) { pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); } /* * Move memory from the large cluster buffer into the component * buffers and mark IO as done on these. */ for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head); tbp; tbp = nbp) { nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry); if (error) { tbp->b_ioflags |= BIO_ERROR; tbp->b_error = error; } else { tbp->b_dirtyoff = tbp->b_dirtyend = 0; tbp->b_flags &= ~B_INVAL; tbp->b_ioflags &= ~BIO_ERROR; /* * XXX the bdwrite()/bqrelse() issued during * cluster building clears B_RELBUF (see bqrelse() * comment). 
If direct I/O was specified, we have * to restore it here to allow the buffer and VM * to be freed. */ if (tbp->b_flags & B_DIRECT) tbp->b_flags |= B_RELBUF; } bufdone(tbp); } pbrelvp(bp); relpbuf(bp, &cluster_pbuf_freecnt); } /* * cluster_wbuild_wb: * * Implement modified write build for cluster. * * write_behind = 0 write behind disabled * write_behind = 1 write behind normal (default) * write_behind = 2 write behind backed-off */ static __inline int cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len, int gbflags) { int r = 0; switch (write_behind) { case 2: if (start_lbn < len) break; start_lbn -= len; /* FALLTHROUGH */ case 1: r = cluster_wbuild(vp, size, start_lbn, len, gbflags); /* FALLTHROUGH */ default: /* FALLTHROUGH */ break; } return(r); } /* * Do clustered write for FFS. * * Three cases: * 1. Write is not sequential (write asynchronously) * Write is sequential: * 2. beginning of cluster - begin cluster * 3. middle of a cluster - add to cluster * 4. end of a cluster - asynchronously write cluster */ void cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount, int gbflags) { daddr_t lbn; int maxclen, cursize; int lblocksize; int async; if (!unmapped_buf_allowed) gbflags &= ~GB_UNMAPPED; if (vp->v_type == VREG) { async = DOINGASYNC(vp); lblocksize = vp->v_mount->mnt_stat.f_iosize; } else { async = 0; lblocksize = bp->b_bufsize; } lbn = bp->b_lblkno; KASSERT(bp->b_offset != NOOFFSET, ("cluster_write: no buffer offset")); /* Initialize vnode to beginning of file. */ if (lbn == 0) vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 || (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) { maxclen = vp->v_mount->mnt_iosize_max / lblocksize - 1; if (vp->v_clen != 0) { /* * Next block is not sequential. 
* * If we are not writing at end of file, the process * seeked to another point in the file since its last * write, or we have reached our maximum cluster size, * then push the previous cluster. Otherwise try * reallocating to make it sequential. * * Change to algorithm: only push previous cluster if * it was sequential from the point of view of the * seqcount heuristic, otherwise leave the buffer * intact so we can potentially optimize the I/O * later on in the buf_daemon or update daemon * flush. */ cursize = vp->v_lastw - vp->v_cstart + 1; if (((u_quad_t) bp->b_offset + lblocksize) != filesize || lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) { if (!async && seqcount > 0) { cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, cursize, gbflags); } } else { struct buf **bpp, **endbp; struct cluster_save *buflist; buflist = cluster_collectbufs(vp, bp, gbflags); endbp = &buflist->bs_children [buflist->bs_nchildren - 1]; if (VOP_REALLOCBLKS(vp, buflist)) { /* * Failed, push the previous cluster * if *really* writing sequentially * in the logical file (seqcount > 1), * otherwise delay it in the hopes that * the low level disk driver can * optimize the write ordering. */ for (bpp = buflist->bs_children; bpp < endbp; bpp++) brelse(*bpp); free(buflist, M_SEGMENT); if (seqcount > 1) { cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, cursize, gbflags); } } else { /* * Succeeded, keep building cluster. */ for (bpp = buflist->bs_children; bpp <= endbp; bpp++) bdwrite(*bpp); free(buflist, M_SEGMENT); vp->v_lastw = lbn; vp->v_lasta = bp->b_blkno; return; } } } /* * Consider beginning a cluster. If at end of file, make * cluster as large as possible, otherwise find size of * existing cluster. 
*/ if ((vp->v_type == VREG) && ((u_quad_t) bp->b_offset + lblocksize) != filesize && (bp->b_blkno == bp->b_lblkno) && (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) || bp->b_blkno == -1)) { bawrite(bp); vp->v_clen = 0; vp->v_lasta = bp->b_blkno; vp->v_cstart = lbn + 1; vp->v_lastw = lbn; return; } vp->v_clen = maxclen; if (!async && maxclen == 0) { /* I/O not contiguous */ vp->v_cstart = lbn + 1; bawrite(bp); } else { /* Wait for rest of cluster */ vp->v_cstart = lbn; bdwrite(bp); } } else if (lbn == vp->v_cstart + vp->v_clen) { /* * At end of cluster, write it out if seqcount tells us we * are operating sequentially, otherwise let the buf or * update daemon handle it. */ bdwrite(bp); if (seqcount > 1) { cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, vp->v_clen + 1, gbflags); } vp->v_clen = 0; vp->v_cstart = lbn + 1; } else if (vm_page_count_severe()) { /* * We are low on memory, get it going NOW */ bawrite(bp); } else { /* * In the middle of a cluster, so just delay the I/O for now. */ bdwrite(bp); } vp->v_lastw = lbn; vp->v_lasta = bp->b_blkno; } /* * This is an awful lot like cluster_rbuild...wish they could be combined. * The last lbn argument is the current block on which I/O is being * performed. Check to see that it doesn't fall in the middle of * the current block (if last_bp == NULL). */ int cluster_wbuild(struct vnode *vp, long size, daddr_t start_lbn, int len, int gbflags) { struct buf *bp, *tbp; struct bufobj *bo; int i, j; int totalwritten = 0; int dbsize = btodb(size); if (!unmapped_buf_allowed) gbflags &= ~GB_UNMAPPED; bo = &vp->v_bufobj; while (len > 0) { /* * If the buffer is not delayed-write (i.e. dirty), or it * is delayed-write but either locked or inval, it cannot * partake in the clustered write. 
*/ BO_LOCK(bo); if ((tbp = gbincore(&vp->v_bufobj, start_lbn)) == NULL || (tbp->b_vflags & BV_BKGRDINPROG)) { BO_UNLOCK(bo); ++start_lbn; --len; continue; } if (BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, BO_LOCKPTR(bo))) { ++start_lbn; --len; continue; } if ((tbp->b_flags & (B_INVAL | B_DELWRI)) != B_DELWRI) { BUF_UNLOCK(tbp); ++start_lbn; --len; continue; } bremfree(tbp); tbp->b_flags &= ~B_DONE; /* * Extra memory in the buffer, punt on this buffer. * XXX we could handle this in most cases, but we would * have to push the extra memory down to after our max * possible cluster size and then potentially pull it back * up if the cluster was terminated prematurely--too much * hassle. */ if (((tbp->b_flags & (B_CLUSTEROK | B_MALLOC | B_VMIO)) != (B_CLUSTEROK | B_VMIO)) || (tbp->b_bcount != tbp->b_bufsize) || (tbp->b_bcount != size) || (len == 1) || ((bp = (vp->v_vflag & VV_MD) != 0 ? trypbuf(&cluster_pbuf_freecnt) : getpbuf(&cluster_pbuf_freecnt)) == NULL)) { totalwritten += tbp->b_bufsize; bawrite(tbp); ++start_lbn; --len; continue; } /* * We got a pbuf to make the cluster in. * so initialise it. */ TAILQ_INIT(&bp->b_cluster.cluster_head); bp->b_bcount = 0; bp->b_bufsize = 0; bp->b_npages = 0; if (tbp->b_wcred != NOCRED) bp->b_wcred = crhold(tbp->b_wcred); bp->b_blkno = tbp->b_blkno; bp->b_lblkno = tbp->b_lblkno; bp->b_offset = tbp->b_offset; /* * We are synthesizing a buffer out of vm_page_t's, but * if the block size is not page aligned then the starting * address may not be either. Inherit the b_data offset * from the original buffer. 
*/ if ((gbflags & GB_UNMAPPED) == 0 || (tbp->b_flags & B_VMIO) == 0) { bp->b_data = (char *)((vm_offset_t)bp->b_data | ((vm_offset_t)tbp->b_data & PAGE_MASK)); } else { bp->b_data = unmapped_buf; } bp->b_flags |= B_CLUSTER | (tbp->b_flags & (B_VMIO | B_NEEDCOMMIT)); bp->b_iodone = cluster_callback; pbgetvp(vp, bp); /* * From this location in the file, scan forward to see * if there are buffers with adjacent data that need to * be written as well. */ for (i = 0; i < len; ++i, ++start_lbn) { if (i != 0) { /* If not the first buffer */ /* * If the adjacent data is not even in core it * can't need to be written. */ BO_LOCK(bo); if ((tbp = gbincore(bo, start_lbn)) == NULL || (tbp->b_vflags & BV_BKGRDINPROG)) { BO_UNLOCK(bo); break; } /* * If it IS in core, but has different * characteristics, or is locked (which * means it could be undergoing a background * I/O or be in a weird state), then don't * cluster with it. */ if (BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, BO_LOCKPTR(bo))) break; if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK | B_INVAL | B_DELWRI | B_NEEDCOMMIT)) != (B_DELWRI | B_CLUSTEROK | (bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) || tbp->b_wcred != bp->b_wcred) { BUF_UNLOCK(tbp); break; } /* * Check that the combined cluster * would make sense with regard to pages * and would not be too large */ if ((tbp->b_bcount != size) || ((bp->b_blkno + (dbsize * i)) != tbp->b_blkno) || ((tbp->b_npages + bp->b_npages) > (vp->v_mount->mnt_iosize_max / PAGE_SIZE))) { BUF_UNLOCK(tbp); break; } /* * Ok, it's passed all the tests, * so remove it from the free list * and mark it busy. We will use it. */ bremfree(tbp); tbp->b_flags &= ~B_DONE; } /* end of code for non-first buffers only */ /* * If the IO is via the VM then we do some * special VM hackery (yuck). Since the buffer's * block size may not be page-aligned it is possible * for a page to be shared between two buffers. We * have to get rid of the duplication when building * the cluster. 
*/ if (tbp->b_flags & B_VMIO) { vm_page_t m; VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object); if (i == 0) { vfs_drain_busy_pages(tbp); } else { /* if not first buffer */ for (j = 0; j < tbp->b_npages; j += 1) { m = tbp->b_pages[j]; if (vm_page_xbusied(m)) { VM_OBJECT_WUNLOCK( tbp->b_object); bqrelse(tbp); goto finishcluster; } } } for (j = 0; j < tbp->b_npages; j += 1) { m = tbp->b_pages[j]; vm_page_sbusy(m); vm_object_pip_add(m->object, 1); if ((bp->b_npages == 0) || (bp->b_pages[bp->b_npages - 1] != m)) { bp->b_pages[bp->b_npages] = m; bp->b_npages++; } } VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object); } bp->b_bcount += size; bp->b_bufsize += size; /* * If any of the clustered buffers have their * B_BARRIER flag set, transfer that request to * the cluster. */ bp->b_flags |= (tbp->b_flags & B_BARRIER); tbp->b_flags &= ~(B_DONE | B_BARRIER); tbp->b_flags |= B_ASYNC; tbp->b_ioflags &= ~BIO_ERROR; tbp->b_iocmd = BIO_WRITE; bundirty(tbp); reassignbuf(tbp); /* put on clean list */ bufobj_wref(tbp->b_bufobj); BUF_KERNPROC(tbp); + buf_track(tbp, __func__); TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, tbp, b_cluster.cluster_entry); } finishcluster: if (buf_mapped(bp)) { pmap_qenter(trunc_page((vm_offset_t) bp->b_data), (vm_page_t *)bp->b_pages, bp->b_npages); } if (bp->b_bufsize > bp->b_kvasize) panic( "cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n", bp->b_bufsize, bp->b_kvasize); totalwritten += bp->b_bufsize; bp->b_dirtyoff = 0; bp->b_dirtyend = bp->b_bufsize; bawrite(bp); len -= i; } return totalwritten; } /* * Collect together all the buffers in a cluster. * Plus add one additional buffer. 
*/ static struct cluster_save * cluster_collectbufs(struct vnode *vp, struct buf *last_bp, int gbflags) { struct cluster_save *buflist; struct buf *bp; daddr_t lbn; int i, len; /* Number of blocks in the pending cluster, v_cstart through v_lastw inclusive. */ len = vp->v_lastw - vp->v_cstart + 1; /* One allocation holds the cluster_save header followed by len + 1 child pointers (the extra slot is for last_bp). */ buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist), M_SEGMENT, M_WAITOK); buflist->bs_nchildren = 0; buflist->bs_children = (struct buf **) (buflist + 1); for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) { /* NOTE(review): the bread_gb() status is discarded and bp is dereferenced unconditionally below -- confirm bp is always valid after a failed read. */ (void)bread_gb(vp, lbn, last_bp->b_bcount, NOCRED, gbflags, &bp); buflist->bs_children[i] = bp; /* b_blkno still equal to b_lblkno: have VOP_BMAP fill in b_blkno. */ if (bp->b_blkno == bp->b_lblkno) VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); } /* The caller's buffer becomes the final child and gets the same b_blkno treatment. */ buflist->bs_children[i] = bp = last_bp; if (bp->b_blkno == bp->b_lblkno) VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); buflist->bs_nchildren = i + 1; return (buflist); } Index: head/sys/sys/bio.h =================================================================== --- head/sys/sys/bio.h (revision 308154) +++ head/sys/sys/bio.h (revision 308155) @@ -1,160 +1,180 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4.
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)buf.h 8.9 (Berkeley) 3/30/95 * $FreeBSD$ */ #ifndef _SYS_BIO_H_ #define _SYS_BIO_H_ #include #include /* bio_cmd */ #define BIO_READ 0x01 /* Read I/O data */ #define BIO_WRITE 0x02 /* Write I/O data */ #define BIO_DELETE 0x03 /* TRIM or free blocks, i.e. mark as unused */ #define BIO_GETATTR 0x04 /* Get GEOM attributes of object */ #define BIO_FLUSH 0x05 /* Commit outstanding I/O now */ #define BIO_CMD0 0x06 /* Available for local hacks */ #define BIO_CMD1 0x07 /* Available for local hacks */ #define BIO_CMD2 0x08 /* Available for local hacks */ #define BIO_ZONE 0x09 /* Zone command */ /* bio_flags */ #define BIO_ERROR 0x01 /* An error occurred processing this bio. */ #define BIO_DONE 0x02 /* This bio is finished. */ #define BIO_ONQUEUE 0x04 /* This bio is in a queue & not yet taken. */ /* * This bio must be executed after all previous bios in the queue have been * executed, and before any successive bios can be executed. 
*/ #define BIO_ORDERED 0x08 #define BIO_UNMAPPED 0x10 #define BIO_TRANSIENT_MAPPING 0x20 #define BIO_VLIST 0x40 #ifdef _KERNEL struct disk; struct bio; struct vm_map; /* Empty classifier tag, to prevent further classification. */ #define BIO_NOTCLASSIFIED (void *)(~0UL) typedef void bio_task_t(void *); /* * The bio structure describes an I/O operation in the kernel. */ struct bio { uint16_t bio_cmd; /* I/O operation. */ uint16_t bio_flags; /* General flags. */ uint16_t bio_cflags; /* Private use by the consumer. */ uint16_t bio_pflags; /* Private use by the provider. */ struct cdev *bio_dev; /* Device to do I/O on. */ struct disk *bio_disk; /* Valid below geom_disk.c only */ off_t bio_offset; /* Offset into file. */ long bio_bcount; /* Valid bytes in buffer. */ caddr_t bio_data; /* Memory, superblocks, indirect etc. */ struct vm_page **bio_ma; /* Or unmapped. */ int bio_ma_offset; /* Offset in the first page of bio_ma. */ int bio_ma_n; /* Number of pages in bio_ma. */ int bio_error; /* Errno for BIO_ERROR. */ long bio_resid; /* Remaining I/O in bytes. */ void (*bio_done)(struct bio *); void *bio_driver1; /* Private use by the provider. */ void *bio_driver2; /* Private use by the provider. */ void *bio_caller1; /* Private use by the consumer. */ void *bio_caller2; /* Private use by the consumer. */ TAILQ_ENTRY(bio) bio_queue; /* Disksort queue. */ const char *bio_attribute; /* Attribute for BIO_[GS]ETATTR */ struct disk_zone_args bio_zone;/* Used for BIO_ZONE */ struct g_consumer *bio_from; /* GEOM linkage */ struct g_provider *bio_to; /* GEOM linkage */ off_t bio_length; /* Like bio_bcount */ off_t bio_completed; /* Inverse of bio_resid */ u_int bio_children; /* Number of spawned bios */ u_int bio_inbed; /* Children safely home by now */ struct bio *bio_parent; /* Pointer to parent */ struct bintime bio_t0; /* Time request started */ bio_task_t *bio_task; /* Task_queue handler */ void *bio_task_arg; /* Argument to above */ void *bio_classifier1; /* Classifier tag. 
*/ void *bio_classifier2; /* Classifier tag. */ #ifdef DIAGNOSTIC void *_bio_caller1; void *_bio_caller2; uint8_t _bio_cflags; #endif +#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) + struct buf *bio_track_bp; /* Parent buf for tracking */ +#endif /* XXX: these go away when bio chaining is introduced */ daddr_t bio_pblkno; /* physical block number */ }; struct uio; struct devstat; struct bio_queue_head { TAILQ_HEAD(bio_queue, bio) queue; off_t last_offset; struct bio *insert_point; }; extern struct vm_map *bio_transient_map; extern int bio_transient_maxcnt; void biodone(struct bio *bp); void biofinish(struct bio *bp, struct devstat *stat, int error); int biowait(struct bio *bp, const char *wchan); + +#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) /* * biotrack(): note 'location' for this bio. The call is forwarded to * biotrack_buf() only when the bio has a tracked parent buf * (bio_track_bp != NULL); when neither tracking option is defined it is an * empty inline. */ +void biotrack_buf(struct bio *bp, const char *location); + +static __inline void +biotrack(struct bio *bp, const char *location) +{ + + if (bp->bio_track_bp != NULL) + biotrack_buf(bp, location); +} +#else +static __inline void +biotrack(struct bio *bp __unused, const char *location __unused) +{ +} +#endif void bioq_disksort(struct bio_queue_head *ap, struct bio *bp); struct bio *bioq_first(struct bio_queue_head *head); struct bio *bioq_takefirst(struct bio_queue_head *head); void bioq_flush(struct bio_queue_head *head, struct devstat *stp, int error); void bioq_init(struct bio_queue_head *head); void bioq_insert_head(struct bio_queue_head *head, struct bio *bp); void bioq_insert_tail(struct bio_queue_head *head, struct bio *bp); void bioq_remove(struct bio_queue_head *head, struct bio *bp); int physio(struct cdev *dev, struct uio *uio, int ioflag); #define physread physio #define physwrite physio #endif /* _KERNEL */ #endif /* !_SYS_BIO_H_ */ Index: head/sys/sys/buf.h =================================================================== --- head/sys/sys/buf.h (revision 308154) +++ head/sys/sys/buf.h (revision 308155) @@ -1,549 +1,569 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the
University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)buf.h 8.9 (Berkeley) 3/30/95 * $FreeBSD$ */ #ifndef _SYS_BUF_H_ #define _SYS_BUF_H_ #include #include #include #include struct bio; struct buf; struct bufobj; struct mount; struct vnode; struct uio; /* * To avoid including */ LIST_HEAD(workhead, worklist); /* * These are currently used only by the soft dependency code, hence * are stored once in a global variable. If other subsystems wanted * to use these hooks, a pointer to a set of bio_ops could be added * to each buffer. */ extern struct bio_ops { void (*io_start)(struct buf *); void (*io_complete)(struct buf *); void (*io_deallocate)(struct buf *); int (*io_countdeps)(struct buf *, int); } bioops; struct vm_object; struct vm_page; typedef unsigned char b_xflags_t; /* * The buffer header describes an I/O operation in the kernel. * * NOTES: * b_bufsize, b_bcount. b_bufsize is the allocation size of the * buffer, either DEV_BSIZE or PAGE_SIZE aligned. b_bcount is the * originally requested buffer size and can serve as a bounds check * against EOF. For most, but not all uses, b_bcount == b_bufsize. * * b_dirtyoff, b_dirtyend. Buffers support piecemeal, unaligned * ranges of dirty data that need to be written to backing store. * The range is typically clipped at b_bcount ( not b_bufsize ). * * b_resid. Number of bytes remaining in I/O. After an I/O operation * completes, b_resid is usually 0 indicating 100% success. * * All fields are protected by the buffer lock except those marked: * V - Protected by owning bufobj lock * Q - Protected by the buf queue lock * D - Protected by an dependency implementation specific lock */ struct buf { struct bufobj *b_bufobj; long b_bcount; void *b_caller1; caddr_t b_data; int b_error; uint16_t b_iocmd; /* BIO_* bio_cmd from bio.h */ uint16_t b_ioflags; /* BIO_* bio_flags from bio.h */ off_t b_iooffset; long b_resid; void (*b_iodone)(struct buf *); daddr_t b_blkno; /* Underlying physical block number. */ off_t b_offset; /* Offset into file. 
*/ TAILQ_ENTRY(buf) b_bobufs; /* (V) Buffer's associated vnode. */ uint32_t b_vflags; /* (V) BV_* flags */ unsigned short b_qindex; /* (Q) buffer queue index */ uint32_t b_flags; /* B_* flags. */ b_xflags_t b_xflags; /* extra flags */ struct lock b_lock; /* Buffer lock */ long b_bufsize; /* Allocated buffer size. */ int b_runningbufspace; /* when I/O is running, pipelining */ int b_kvasize; /* size of kva for buffer */ int b_dirtyoff; /* Offset in buffer of dirty region. */ int b_dirtyend; /* Offset of end of dirty region. */ caddr_t b_kvabase; /* base kva for buffer */ daddr_t b_lblkno; /* Logical block number. */ struct vnode *b_vp; /* Device vnode. */ struct ucred *b_rcred; /* Read credentials reference. */ struct ucred *b_wcred; /* Write credentials reference. */ union { TAILQ_ENTRY(buf) b_freelist; /* (Q) */ struct { void (*b_pgiodone)(void *, vm_page_t *, int, int); int b_pgbefore; int b_pgafter; }; }; union cluster_info { TAILQ_HEAD(cluster_list_head, buf) cluster_head; TAILQ_ENTRY(buf) cluster_entry; } b_cluster; struct vm_page *b_pages[btoc(MAXPHYS)]; int b_npages; struct workhead b_dep; /* (D) List of filesystem dependencies. */ void *b_fsprivate1; void *b_fsprivate2; void *b_fsprivate3; + +#if defined(FULL_BUF_TRACKING) +#define BUF_TRACKING_SIZE 32 +#define BUF_TRACKING_ENTRY(x) ((x) & (BUF_TRACKING_SIZE - 1)) + const char *b_io_tracking[BUF_TRACKING_SIZE]; + uint32_t b_io_tcnt; +#elif defined(BUF_TRACKING) + const char *b_io_tracking; +#endif }; #define b_object b_bufobj->bo_object /* * These flags are kept in b_flags. * * Notes: * * B_ASYNC VOP calls on bp's are usually async whether or not * B_ASYNC is set, but some subsystems, such as NFS, like * to know what is best for the caller so they can * optimize the I/O. * * B_PAGING Indicates that bp is being used by the paging system or * some paging system and that the bp is not linked into * the b_vp's clean/dirty linked lists or ref counts. * Buffer vp reassignments are illegal in this case. 
* * B_CACHE This may only be set if the buffer is entirely valid. * The situation where B_DELWRI is set and B_CACHE is * clear MUST be committed to disk by getblk() so * B_DELWRI can also be cleared. See the comments for * getblk() in kern/vfs_bio.c. If B_CACHE is clear, * the caller is expected to clear BIO_ERROR and B_INVAL, * set BIO_READ, and initiate an I/O. * * The 'entire buffer' is defined to be the range from * 0 through b_bcount. * * B_MALLOC Request that the buffer be allocated from the malloc * pool, DEV_BSIZE aligned instead of PAGE_SIZE aligned. * * B_CLUSTEROK This flag is typically set for B_DELWRI buffers * by filesystems that allow clustering when the buffer * is fully dirty and indicates that it may be clustered * with other adjacent dirty buffers. Note the clustering * may not be used with the stage 1 data write under NFS * but may be used for the commit rpc portion. * * B_VMIO Indicates that the buffer is tied into an VM object. * The buffer's data is always PAGE_SIZE aligned even * if b_bufsize and b_bcount are not. ( b_bufsize is * always at least DEV_BSIZE aligned, though ). * * B_DIRECT Hint that we should attempt to completely free * the pages underlying the buffer. B_DIRECT is * sticky until the buffer is released and typically * only has an effect when B_RELBUF is also set. * */ #define B_AGE 0x00000001 /* Move to age queue when I/O done. */ #define B_NEEDCOMMIT 0x00000002 /* Append-write in progress. */ #define B_ASYNC 0x00000004 /* Start I/O, do not wait. */ #define B_DIRECT 0x00000008 /* direct I/O flag (pls free vmio) */ #define B_DEFERRED 0x00000010 /* Skipped over for cleaning */ #define B_CACHE 0x00000020 /* Bread found us in the cache. */ #define B_VALIDSUSPWRT 0x00000040 /* Valid write during suspension. */ #define B_DELWRI 0x00000080 /* Delay I/O until buffer reused. */ #define B_00000100 0x00000100 /* Available flag. */ #define B_DONE 0x00000200 /* I/O completed. 
*/ #define B_EINTR 0x00000400 /* I/O was interrupted */ #define B_NOREUSE 0x00000800 /* Contents not reused once released. */ #define B_00001000 0x00001000 /* Available flag. */ #define B_INVAL 0x00002000 /* Does not contain valid info. */ #define B_BARRIER 0x00004000 /* Write this and all preceding first. */ #define B_NOCACHE 0x00008000 /* Do not cache block after use. */ #define B_MALLOC 0x00010000 /* malloced b_data */ #define B_CLUSTEROK 0x00020000 /* May be clustered with adjacent buffers. */ #define B_00040000 0x00040000 /* Available flag. */ #define B_00080000 0x00080000 /* Available flag. */ #define B_00100000 0x00100000 /* Available flag. */ #define B_00200000 0x00200000 /* Available flag. */ #define B_RELBUF 0x00400000 /* Release VMIO buffer. */ #define B_FS_FLAG1 0x00800000 /* Available flag for FS use. */ #define B_NOCOPY 0x01000000 /* Don't copy-on-write this buf. */ #define B_INFREECNT 0x02000000 /* buf is counted in numfreebufs */ #define B_PAGING 0x04000000 /* volatile paging I/O -- bypass VMIO */ #define B_MANAGED 0x08000000 /* Managed by FS. */ #define B_RAM 0x10000000 /* Read ahead mark (flag) */ #define B_VMIO 0x20000000 /* VMIO flag */ #define B_CLUSTER 0x40000000 /* pagein op, so swap() can count it */ #define B_REMFREE 0x80000000 /* Delayed bremfree */ #define PRINT_BUF_FLAGS "\20\40remfree\37cluster\36vmio\35ram\34managed" \ "\33paging\32infreecnt\31nocopy\30b23\27relbuf\26b21\25b20" \ "\24b19\23b18\22clusterok\21malloc\20nocache\17b14\16inval" \ "\15b12\14noreuse\13eintr\12done\11b8\10delwri" \ "\7validsuspwrt\6cache\5deferred\4direct\3async\2needcommit\1age" /* * These flags are kept in b_xflags.
*/ #define BX_VNDIRTY 0x00000001 /* On vnode dirty list */ #define BX_VNCLEAN 0x00000002 /* On vnode clean list */ #define BX_BKGRDWRITE 0x00000010 /* Do writes in background */ #define BX_BKGRDMARKER 0x00000020 /* Mark buffer for splay tree */ #define BX_ALTDATA 0x00000040 /* Holds extended data */ #define PRINT_BUF_XFLAGS "\20\7altdata\6bkgrdmarker\5bkgrdwrite\2clean\1dirty" #define NOOFFSET (-1LL) /* No buffer offset calculated yet */ /* * These flags are kept in b_vflags. */ #define BV_SCANNED 0x00000001 /* VOP_FSYNC funcs mark written bufs */ #define BV_BKGRDINPROG 0x00000002 /* Background write in progress */ #define BV_BKGRDWAIT 0x00000004 /* Background write waiting */ #define BV_BKGRDERR 0x00000008 /* Error from background write */ #define PRINT_BUF_VFLAGS "\20\4bkgrderr\3bkgrdwait\2bkgrdinprog\1scanned" #ifdef _KERNEL /* * Buffer locking */ extern const char *buf_wmesg; /* Default buffer lock message */ #define BUF_WMESG "bufwait" #include /* XXX for curthread */ #include /* * Initialize a lock. */ #define BUF_LOCKINIT(bp) \ lockinit(&(bp)->b_lock, PRIBIO + 4, buf_wmesg, 0, 0) /* * Get a lock, sleeping non-interruptibly until it becomes available. */ #define BUF_LOCK(bp, locktype, interlock) \ _lockmgr_args_rw(&(bp)->b_lock, (locktype), (interlock), \ LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, \ LOCK_FILE, LOCK_LINE) /* * Get a lock, sleeping with the specified interruptibility (catch) and * timeout (timo). */ #define BUF_TIMELOCK(bp, locktype, interlock, wmesg, catch, timo) \ _lockmgr_args_rw(&(bp)->b_lock, (locktype) | LK_TIMELOCK, \ (interlock), (wmesg), (PRIBIO + 4) | (catch), (timo), \ LOCK_FILE, LOCK_LINE) /* * Release a lock. Only the acquiring process may free the lock unless * it has been handed off to biodone.
*/ #define BUF_UNLOCK(bp) do { \ KASSERT(((bp)->b_flags & B_REMFREE) == 0, \ ("BUF_UNLOCK %p while B_REMFREE is still set.", (bp))); \ \ (void)_lockmgr_args(&(bp)->b_lock, LK_RELEASE, NULL, \ LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, \ LOCK_FILE, LOCK_LINE); \ } while (0) /* * Check if a buffer lock is recursed. */ #define BUF_LOCKRECURSED(bp) \ lockmgr_recursed(&(bp)->b_lock) /* * Check if a buffer lock is currently held. */ #define BUF_ISLOCKED(bp) \ lockstatus(&(bp)->b_lock) /* * Free a buffer lock. */ #define BUF_LOCKFREE(bp) \ lockdestroy(&(bp)->b_lock) /* * Print information on a buffer lock. */ #define BUF_LOCKPRINTINFO(bp) \ lockmgr_printinfo(&(bp)->b_lock) /* * Buffer lock assertions. */ #if defined(INVARIANTS) && defined(INVARIANT_SUPPORT) #define BUF_ASSERT_LOCKED(bp) \ _lockmgr_assert(&(bp)->b_lock, KA_LOCKED, LOCK_FILE, LOCK_LINE) #define BUF_ASSERT_SLOCKED(bp) \ _lockmgr_assert(&(bp)->b_lock, KA_SLOCKED, LOCK_FILE, LOCK_LINE) #define BUF_ASSERT_XLOCKED(bp) \ _lockmgr_assert(&(bp)->b_lock, KA_XLOCKED, LOCK_FILE, LOCK_LINE) #define BUF_ASSERT_UNLOCKED(bp) \ _lockmgr_assert(&(bp)->b_lock, KA_UNLOCKED, LOCK_FILE, LOCK_LINE) #define BUF_ASSERT_HELD(bp) #define BUF_ASSERT_UNHELD(bp) #else #define BUF_ASSERT_LOCKED(bp) #define BUF_ASSERT_SLOCKED(bp) #define BUF_ASSERT_XLOCKED(bp) #define BUF_ASSERT_UNLOCKED(bp) #define BUF_ASSERT_HELD(bp) #define BUF_ASSERT_UNHELD(bp) #endif #ifdef _SYS_PROC_H_ /* Avoid #include pollution */ /* * When initiating asynchronous I/O, change ownership of the lock to the * kernel. Once done, the lock may legally be released by biodone. The * original owning process can no longer acquire it recursively, but must * wait until the I/O is completed and the lock has been freed by biodone.
*/ #define BUF_KERNPROC(bp) \ _lockmgr_disown(&(bp)->b_lock, LOCK_FILE, LOCK_LINE) #endif #endif /* _KERNEL */ struct buf_queue_head { TAILQ_HEAD(buf_queue, buf) queue; daddr_t last_pblkno; struct buf *insert_point; struct buf *switch_point; }; /* * This structure describes a clustered I/O. */ struct cluster_save { long bs_bcount; /* Saved b_bcount. */ long bs_bufsize; /* Saved b_bufsize. */ int bs_nchildren; /* Number of associated buffers. */ struct buf **bs_children; /* List of associated buffers. */ }; #ifdef _KERNEL /* * Write bp through its bufobj's bop_write method (BO_WRITE), after asserting * that the bufobj, its ops vector and the method are all set. */ static __inline int bwrite(struct buf *bp) { KASSERT(bp->b_bufobj != NULL, ("bwrite: no bufobj bp=%p", bp)); KASSERT(bp->b_bufobj->bo_ops != NULL, ("bwrite: no bo_ops bp=%p", bp)); KASSERT(bp->b_bufobj->bo_ops->bop_write != NULL, ("bwrite: no bop_write bp=%p", bp)); return (BO_WRITE(bp->b_bufobj, bp)); } /* * Hand bp to its bufobj's bop_strategy method (BO_STRATEGY), with the same * sanity assertions as bwrite(). */ static __inline void bstrategy(struct buf *bp) { KASSERT(bp->b_bufobj != NULL, ("bstrategy: no bufobj bp=%p", bp)); KASSERT(bp->b_bufobj->bo_ops != NULL, ("bstrategy: no bo_ops bp=%p", bp)); KASSERT(bp->b_bufobj->bo_ops->bop_strategy != NULL, ("bstrategy: no bop_strategy bp=%p", bp)); BO_STRATEGY(bp->b_bufobj, bp); } /* * The buf_start/buf_complete/buf_deallocate/buf_countdeps wrappers call the * matching bioops hook only when it is non-NULL; buf_countdeps() returns 0 * when no hook is installed. */ static __inline void buf_start(struct buf *bp) { if (bioops.io_start) (*bioops.io_start)(bp); } static __inline void buf_complete(struct buf *bp) { if (bioops.io_complete) (*bioops.io_complete)(bp); } static __inline void buf_deallocate(struct buf *bp) { if (bioops.io_deallocate) (*bioops.io_deallocate)(bp); } static __inline int buf_countdeps(struct buf *bp, int i) { if (bioops.io_countdeps) return ((*bioops.io_countdeps)(bp, i)); else return (0); +} + /* * Record 'location' as a tracking point for bp. With FULL_BUF_TRACKING it is * stored in the next slot of the b_io_tracking[] ring (index wraps via * BUF_TRACKING_ENTRY); with BUF_TRACKING only the latest value is kept; with * neither option defined the body is empty. */ +static __inline void +buf_track(struct buf *bp, const char *location) +{ + +#if defined(FULL_BUF_TRACKING) + bp->b_io_tracking[BUF_TRACKING_ENTRY(bp->b_io_tcnt++)] = location; +#elif defined(BUF_TRACKING) + bp->b_io_tracking = location; +#endif } #endif /* _KERNEL */ /* * Zero out the buffer's data area.
*/ #define clrbuf(bp) { \ bzero((bp)->b_data, (u_int)(bp)->b_bcount); \ (bp)->b_resid = 0; \ } /* * Flags for getblk's last parameter. */ #define GB_LOCK_NOWAIT 0x0001 /* Fail if we block on a buf lock. */ #define GB_NOCREAT 0x0002 /* Don't create a buf if not found. */ #define GB_NOWAIT_BD 0x0004 /* Do not wait for bufdaemon. */ #define GB_UNMAPPED 0x0008 /* Do not mmap buffer pages. */ #define GB_KVAALLOC 0x0010 /* But allocate KVA. */ #ifdef _KERNEL extern int nbuf; /* The number of buffer headers */ extern long maxswzone; /* Max KVA for swap structures */ extern long maxbcache; /* Max KVA for buffer cache */ extern long runningbufspace; extern long hibufspace; extern int dirtybufthresh; extern int bdwriteskip; extern int dirtybufferflushes; extern int altbufferflushes; extern int nswbuf; /* Number of swap I/O buffer headers. */ extern int cluster_pbuf_freecnt; /* Number of pbufs for clusters */ extern int vnode_pbuf_freecnt; /* Number of pbufs for vnode pager */ extern int vnode_async_pbuf_freecnt; /* Number of pbufs for vnode pager, asynchronous reads */ extern caddr_t unmapped_buf; /* Data address for unmapped buffers. */ static inline int buf_mapped(struct buf *bp) { return (bp->b_data != unmapped_buf); } void runningbufwakeup(struct buf *); void waitrunningbufspace(void); caddr_t kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est); void bufinit(void); void bufshutdown(int); void bdata2bio(struct buf *bp, struct bio *bip); void bwillwrite(void); int buf_dirty_count_severe(void); void bremfree(struct buf *); void bremfreef(struct buf *); /* XXX Force bremfree, only for nfs. 
*/ #define bread(vp, blkno, size, cred, bpp) \ breadn_flags(vp, blkno, size, NULL, NULL, 0, cred, 0, bpp) #define bread_gb(vp, blkno, size, cred, gbflags, bpp) \ breadn_flags(vp, blkno, size, NULL, NULL, 0, cred, \ gbflags, bpp) #define breadn(vp, blkno, size, rablkno, rabsize, cnt, cred, bpp) \ breadn_flags(vp, blkno, size, rablkno, rabsize, cnt, cred, 0, bpp) int breadn_flags(struct vnode *, daddr_t, int, daddr_t *, int *, int, struct ucred *, int, struct buf **); void breada(struct vnode *, daddr_t *, int *, int, struct ucred *); void bdwrite(struct buf *); void bawrite(struct buf *); void babarrierwrite(struct buf *); int bbarrierwrite(struct buf *); void bdirty(struct buf *); void bundirty(struct buf *); void bufstrategy(struct bufobj *, struct buf *); void brelse(struct buf *); void bqrelse(struct buf *); int vfs_bio_awrite(struct buf *); void vfs_drain_busy_pages(struct buf *bp); struct buf * getpbuf(int *); struct buf *incore(struct bufobj *, daddr_t); struct buf *gbincore(struct bufobj *, daddr_t); struct buf *getblk(struct vnode *, daddr_t, int, int, int, int); struct buf *geteblk(int, int); int bufwait(struct buf *); int bufwrite(struct buf *); void bufdone(struct buf *); void bufdone_finish(struct buf *); void bd_speedup(void); int cluster_read(struct vnode *, u_quad_t, daddr_t, long, struct ucred *, long, int, int, struct buf **); int cluster_wbuild(struct vnode *, long, daddr_t, int, int); void cluster_write(struct vnode *, struct buf *, u_quad_t, int, int); void vfs_bio_bzero_buf(struct buf *bp, int base, int size); void vfs_bio_set_valid(struct buf *, int base, int size); void vfs_bio_clrbuf(struct buf *); void vfs_busy_pages(struct buf *, int clear_modify); void vfs_unbusy_pages(struct buf *); int vmapbuf(struct buf *, int); void vunmapbuf(struct buf *); void relpbuf(struct buf *, int *); void brelvp(struct buf *); void bgetvp(struct vnode *, struct buf *); void pbgetbo(struct bufobj *bo, struct buf *bp); void pbgetvp(struct vnode *, struct buf *); 
void pbrelbo(struct buf *); void pbrelvp(struct buf *); int allocbuf(struct buf *bp, int size); void reassignbuf(struct buf *); struct buf *trypbuf(int *); void bwait(struct buf *, u_char, const char *); void bdone(struct buf *); typedef daddr_t (vbg_get_lblkno_t)(struct vnode *, vm_ooffset_t); typedef int (vbg_get_blksize_t)(struct vnode *, daddr_t); int vfs_bio_getpages(struct vnode *vp, struct vm_page **ma, int count, int *rbehind, int *rahead, vbg_get_lblkno_t get_lblkno, vbg_get_blksize_t get_blksize); #endif /* _KERNEL */ #endif /* !_SYS_BUF_H_ */ Index: head/sys/vm/vm_pager.c =================================================================== --- head/sys/vm/vm_pager.c (revision 308154) +++ head/sys/vm/vm_pager.c (revision 308155) @@ -1,560 +1,562 @@ /*- * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_pager.c 8.6 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Paging space routine stubs. Emulates a matchmaker-like interface * for builtin pagers. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include int cluster_pbuf_freecnt = -1; /* unlimited to begin with */ struct buf *swbuf; static int dead_pager_getpages(vm_object_t, vm_page_t *, int, int *, int *); static vm_object_t dead_pager_alloc(void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t, struct ucred *); static void dead_pager_putpages(vm_object_t, vm_page_t *, int, int, int *); static boolean_t dead_pager_haspage(vm_object_t, vm_pindex_t, int *, int *); static void dead_pager_dealloc(vm_object_t); /* * Pager operations installed for OBJT_DEAD objects (see pagertab below): * every request is refused or reported as a no-op. */ /* Page-in always fails. */ static int dead_pager_getpages(vm_object_t obj, vm_page_t *ma, int count, int *rbehind, int *rahead) { return (VM_PAGER_FAIL); } /* No object is ever created. */ static vm_object_t dead_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t off, struct ucred *cred) { return (NULL); } /* Page-out: set rtvals[0..count-1] to VM_PAGER_AGAIN; nothing is written. */ static void dead_pager_putpages(vm_object_t object, vm_page_t *m, int count, int flags, int *rtvals) { int i; for (i = 0; i < count; i++) { rtvals[i] = VM_PAGER_AGAIN; } } /* No page is ever available; the optional prev/next counts are zeroed. The return type now matches the boolean_t prototype above. */ static boolean_t dead_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *prev, int *next) { if (prev) *prev = 0; if (next) *next = 0; return (FALSE); } /* Nothing to release. */ static void dead_pager_dealloc(vm_object_t object) { return; } static struct pagerops deadpagerops = { .pgo_alloc = dead_pager_alloc, .pgo_dealloc = dead_pager_dealloc, .pgo_getpages = dead_pager_getpages, .pgo_putpages = dead_pager_putpages, .pgo_haspage = dead_pager_haspage, }; struct pagerops *pagertab[] = { &defaultpagerops, /* OBJT_DEFAULT */ &swappagerops, /* OBJT_SWAP */ &vnodepagerops, /* OBJT_VNODE */ &devicepagerops, /* OBJT_DEVICE */ &physpagerops, /* OBJT_PHYS */ &deadpagerops, /* OBJT_DEAD */ &sgpagerops, /* OBJT_SG */ &mgtdevicepagerops, /* OBJT_MGTDEVICE */ }; /* * Kernel address space for mapping pages. * Used by pagers where KVAs are needed for IO.
* * XXX needs to be large enough to support the number of pending async * cleaning requests (NPENDINGIO == 64) * the maximum swap cluster size * (MAXPHYS == 64k) if you want to get the most efficiency. */ struct mtx_padalign pbuf_mtx; static TAILQ_HEAD(swqueue, buf) bswlist; static int bswneeded; vm_offset_t swapbkva; /* swap buffers kva */ void vm_pager_init() { struct pagerops **pgops; TAILQ_INIT(&bswlist); /* * Initialize known pagers */ for (pgops = pagertab; pgops < &pagertab[nitems(pagertab)]; pgops++) if ((*pgops)->pgo_init != NULL) (*(*pgops)->pgo_init) (); } void vm_pager_bufferinit() { struct buf *bp; int i; mtx_init(&pbuf_mtx, "pbuf mutex", NULL, MTX_DEF); bp = swbuf; /* * Now set up swap and physical I/O buffer headers. */ for (i = 0; i < nswbuf; i++, bp++) { TAILQ_INSERT_HEAD(&bswlist, bp, b_freelist); BUF_LOCKINIT(bp); LIST_INIT(&bp->b_dep); bp->b_rcred = bp->b_wcred = NOCRED; bp->b_xflags = 0; } cluster_pbuf_freecnt = nswbuf / 2; vnode_pbuf_freecnt = nswbuf / 2 + 1; vnode_async_pbuf_freecnt = nswbuf / 2; } /* * Allocate an instance of a pager of the given type. * Size, protection and offset parameters are passed in for pagers that * need to perform page-level validation (e.g. the device pager). */ vm_object_t vm_pager_allocate(objtype_t type, void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t off, struct ucred *cred) { vm_object_t ret; struct pagerops *ops; ops = pagertab[type]; if (ops) ret = (*ops->pgo_alloc) (handle, size, prot, off, cred); else ret = NULL; return (ret); } /* * The object must be locked. */ void vm_pager_deallocate(object) vm_object_t object; { VM_OBJECT_ASSERT_WLOCKED(object); (*pagertab[object->type]->pgo_dealloc) (object); } static void vm_pager_assert_in(vm_object_t object, vm_page_t *m, int count) { #ifdef INVARIANTS VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(count > 0, ("%s: 0 count", __func__)); /* * All pages must be busied, not mapped, not fully valid, * not dirty and belong to the proper object. 
*/ for (int i = 0 ; i < count; i++) { vm_page_assert_xbusied(m[i]); KASSERT(!pmap_page_is_mapped(m[i]), ("%s: page %p is mapped", __func__, m[i])); KASSERT(m[i]->valid != VM_PAGE_BITS_ALL, ("%s: request for a valid page %p", __func__, m[i])); KASSERT(m[i]->dirty == 0, ("%s: page %p is dirty", __func__, m[i])); KASSERT(m[i]->object == object, ("%s: wrong object %p/%p", __func__, object, m[i]->object)); } #endif } /* * Page in the pages for the object using its associated pager. * The requested page must be fully valid on successful return. */ int vm_pager_get_pages(vm_object_t object, vm_page_t *m, int count, int *rbehind, int *rahead) { #ifdef INVARIANTS vm_pindex_t pindex = m[0]->pindex; #endif int r; vm_pager_assert_in(object, m, count); r = (*pagertab[object->type]->pgo_getpages)(object, m, count, rbehind, rahead); if (r != VM_PAGER_OK) return (r); for (int i = 0; i < count; i++) { /* * If pager has replaced a page, assert that it had * updated the array. */ KASSERT(m[i] == vm_page_lookup(object, pindex++), ("%s: mismatch page %p pindex %ju", __func__, m[i], (uintmax_t )pindex - 1)); /* * Zero out partially filled data. */ if (m[i]->valid != VM_PAGE_BITS_ALL) vm_page_zero_invalid(m[i], TRUE); } return (VM_PAGER_OK); } int vm_pager_get_pages_async(vm_object_t object, vm_page_t *m, int count, int *rbehind, int *rahead, pgo_getpages_iodone_t iodone, void *arg) { vm_pager_assert_in(object, m, count); return ((*pagertab[object->type]->pgo_getpages_async)(object, m, count, rbehind, rahead, iodone, arg)); } /* * vm_pager_put_pages() - inline, see vm/vm_pager.h * vm_pager_has_page() - inline, see vm/vm_pager.h */ /* * Search the specified pager object list for an object with the * specified handle. If an object with the specified handle is found, * increase its reference count and return it. Otherwise, return NULL. * * The pager object list must be locked. 
*/ vm_object_t vm_pager_object_lookup(struct pagerlst *pg_list, void *handle) { vm_object_t object; TAILQ_FOREACH(object, pg_list, pager_object_list) { if (object->handle == handle) { VM_OBJECT_WLOCK(object); if ((object->flags & OBJ_DEAD) == 0) { vm_object_reference_locked(object); VM_OBJECT_WUNLOCK(object); break; } VM_OBJECT_WUNLOCK(object); } } return (object); } /* * initialize a physical buffer */ /* * XXX This probably belongs in vfs_bio.c */ static void initpbuf(struct buf *bp) { KASSERT(bp->b_bufobj == NULL, ("initpbuf with bufobj")); KASSERT(bp->b_vp == NULL, ("initpbuf with vp")); bp->b_rcred = NOCRED; bp->b_wcred = NOCRED; bp->b_qindex = 0; /* On no queue (QUEUE_NONE) */ bp->b_kvabase = (caddr_t) (MAXPHYS * (bp - swbuf)) + swapbkva; bp->b_data = bp->b_kvabase; bp->b_kvasize = MAXPHYS; bp->b_flags = 0; bp->b_xflags = 0; bp->b_ioflags = 0; bp->b_iodone = NULL; bp->b_error = 0; BUF_LOCK(bp, LK_EXCLUSIVE, NULL); + buf_track(bp, __func__); } /* * allocate a physical buffer * * There are a limited number (nswbuf) of physical buffers. We need * to make sure that no single subsystem is able to hog all of them, * so each subsystem implements a counter which is typically initialized * to 1/2 nswbuf. getpbuf() decrements this counter in allocation and * increments it on release, and blocks if the counter hits zero. A * subsystem may initialize the counter to -1 to disable the feature, * but it must still be sure to match up all uses of getpbuf() with * relpbuf() using the same variable. * * NOTE: pfreecnt can be NULL, but this 'feature' will be removed * relatively soon when the rest of the subsystems get smart about it. 
XXX */ struct buf * getpbuf(int *pfreecnt) { struct buf *bp; mtx_lock(&pbuf_mtx); for (;;) { if (pfreecnt) { while (*pfreecnt == 0) { msleep(pfreecnt, &pbuf_mtx, PVM, "wswbuf0", 0); } } /* get a bp from the swap buffer header pool */ if ((bp = TAILQ_FIRST(&bswlist)) != NULL) break; bswneeded = 1; msleep(&bswneeded, &pbuf_mtx, PVM, "wswbuf1", 0); /* loop in case someone else grabbed one */ } TAILQ_REMOVE(&bswlist, bp, b_freelist); if (pfreecnt) --*pfreecnt; mtx_unlock(&pbuf_mtx); initpbuf(bp); return bp; } /* * allocate a physical buffer, if one is available. * * Note that there is no NULL hack here - all subsystems using this * call understand how to use pfreecnt. */ struct buf * trypbuf(int *pfreecnt) { struct buf *bp; mtx_lock(&pbuf_mtx); if (*pfreecnt == 0 || (bp = TAILQ_FIRST(&bswlist)) == NULL) { mtx_unlock(&pbuf_mtx); return NULL; } TAILQ_REMOVE(&bswlist, bp, b_freelist); --*pfreecnt; mtx_unlock(&pbuf_mtx); initpbuf(bp); return bp; } /* * release a physical buffer * * NOTE: pfreecnt can be NULL, but this 'feature' will be removed * relatively soon when the rest of the subsystems get smart about it. XXX */ void relpbuf(struct buf *bp, int *pfreecnt) { if (bp->b_rcred != NOCRED) { crfree(bp->b_rcred); bp->b_rcred = NOCRED; } if (bp->b_wcred != NOCRED) { crfree(bp->b_wcred); bp->b_wcred = NOCRED; } KASSERT(bp->b_vp == NULL, ("relpbuf with vp")); KASSERT(bp->b_bufobj == NULL, ("relpbuf with bufobj")); + buf_track(bp, __func__); BUF_UNLOCK(bp); mtx_lock(&pbuf_mtx); TAILQ_INSERT_HEAD(&bswlist, bp, b_freelist); if (bswneeded) { bswneeded = 0; wakeup(&bswneeded); } if (pfreecnt) { if (++*pfreecnt == 1) wakeup(pfreecnt); } mtx_unlock(&pbuf_mtx); } /* * Associate a p-buffer with a vnode. * * Also sets B_PAGING flag to indicate that vnode is not fully associated * with the buffer. i.e. the bp has not been linked into the vnode or * ref-counted. 
*/ void pbgetvp(struct vnode *vp, struct buf *bp) { KASSERT(bp->b_vp == NULL, ("pbgetvp: not free")); KASSERT(bp->b_bufobj == NULL, ("pbgetvp: not free (bufobj)")); bp->b_vp = vp; bp->b_flags |= B_PAGING; bp->b_bufobj = &vp->v_bufobj; } /* * Associate a p-buffer with a vnode. * * Also sets B_PAGING flag to indicate that vnode is not fully associated * with the buffer. i.e. the bp has not been linked into the vnode or * ref-counted. */ void pbgetbo(struct bufobj *bo, struct buf *bp) { KASSERT(bp->b_vp == NULL, ("pbgetbo: not free (vnode)")); KASSERT(bp->b_bufobj == NULL, ("pbgetbo: not free (bufobj)")); bp->b_flags |= B_PAGING; bp->b_bufobj = bo; } /* * Disassociate a p-buffer from a vnode. */ void pbrelvp(struct buf *bp) { KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL")); KASSERT(bp->b_bufobj != NULL, ("pbrelvp: NULL bufobj")); KASSERT((bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) == 0, ("pbrelvp: pager buf on vnode list.")); bp->b_vp = NULL; bp->b_bufobj = NULL; bp->b_flags &= ~B_PAGING; } /* * Disassociate a p-buffer from a bufobj. */ void pbrelbo(struct buf *bp) { KASSERT(bp->b_vp == NULL, ("pbrelbo: vnode")); KASSERT(bp->b_bufobj != NULL, ("pbrelbo: NULL bufobj")); KASSERT((bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) == 0, ("pbrelbo: pager buf on vnode list.")); bp->b_bufobj = NULL; bp->b_flags &= ~B_PAGING; }