diff --git a/share/man/man4/nvme.4 b/share/man/man4/nvme.4 index db31b6f26b3e..eb9f9b222f51 100644 --- a/share/man/man4/nvme.4 +++ b/share/man/man4/nvme.4 @@ -1,259 +1,267 @@ .\" .\" Copyright (c) 2012-2016 Intel Corporation .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions, and the following disclaimer, .\" without modification. .\" 2. Redistributions in binary form must reproduce at minimum a disclaimer .\" substantially similar to the "NO WARRANTY" disclaimer below .\" ("Disclaimer") and any redistribution must be conditioned upon .\" including a substantially similar Disclaimer requirement for further .\" binary redistribution. .\" .\" NO WARRANTY .\" THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS .\" "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT .\" LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR .\" A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT .\" HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, .\" STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING .\" IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE .\" POSSIBILITY OF SUCH DAMAGES. .\" .\" nvme driver man page. .\" .\" Author: Jim Harris .\" .\" $FreeBSD$ .\" .Dd June 6, 2020 .Dt NVME 4 .Os .Sh NAME .Nm nvme .Nd NVM Express core driver .Sh SYNOPSIS To compile this driver into your kernel, place the following line in your kernel configuration file: .Bd -ragged -offset indent .Cd "device nvme" .Ed .Pp Or, to load the driver as a module at boot, place the following line in .Xr loader.conf 5 : .Bd -literal -offset indent nvme_load="YES" .Ed .Pp Most users will also want to enable .Xr nvd 4 or .Xr nda 4 to expose NVM Express namespaces as disk devices which can be partitioned. Note that in NVM Express terms, a namespace is roughly equivalent to a SCSI LUN. .Sh DESCRIPTION The .Nm driver provides support for NVM Express (NVMe) controllers, such as: .Bl -bullet .It Hardware initialization .It Per-CPU IO queue pairs .It API for registering NVMe namespace consumers such as .Xr nvd 4 or .Xr nda 4 .It API for submitting NVM commands to namespaces .It Ioctls for controller and namespace configuration and management .El .Pp The .Nm driver creates controller device nodes in the format .Pa /dev/nvmeX and namespace device nodes in the format .Pa /dev/nvmeXnsY . Note that the NVM Express specification starts numbering namespaces at 1, not 0, and this driver follows that convention. .Sh CONFIGURATION By default, .Nm will create an I/O queue pair for each CPU, provided enough MSI-X vectors and NVMe queue pairs can be allocated. If not enough vectors or queue pairs are available, nvme(4) will use a smaller number of queue pairs and assign multiple CPUs per queue pair. .Pp To force a single I/O queue pair shared by all CPUs, set the following tunable value in .Xr loader.conf 5 : .Bd -literal -offset indent hw.nvme.per_cpu_io_queues=0 .Ed .Pp To assign more than one CPU per I/O queue pair, thereby reducing the number of MSI-X vectors consumed by the device, set the following tunable value in .Xr loader.conf 5 : .Bd -literal -offset indent hw.nvme.min_cpus_per_ioq=X .Ed .Pp To force legacy interrupts for all .Nm driver instances, set the following tunable value in .Xr loader.conf 5 : .Bd -literal -offset indent hw.nvme.force_intx=1 .Ed .Pp Note that use of INTx implies disabling of per-CPU I/O queue pairs. .Pp To control maximum amount of system RAM in bytes to use as Host Memory Buffer for capable devices, set the following tunable: .Bd -literal -offset indent hw.nvme.hmb_max .Ed .Pp The default value is 5% of physical memory size per device. .Pp The .Xr nvd 4 driver is used to provide a disk driver to the system by default. The .Xr nda 4 driver can also be used instead. The .Xr nvd 4 driver performs better with smaller transactions and few TRIM commands. It sends all commands directly to the drive immediately. The .Xr nda 4 driver performs better with larger transactions and also collapses TRIM commands giving better performance. It can queue commands to the drive; combine .Dv BIO_DELETE commands into a single trip; and use the CAM I/O scheduler to bias one type of operation over another. To select the .Xr nda 4 driver, set the following tunable value in .Xr loader.conf 5 : .Bd -literal -offset indent hw.nvme.use_nvd=0 .Ed .Pp This value may also be set in the kernel config file with .Bd -literal -offset indent .Cd options NVME_USE_NVD=0 .Ed .Pp When there is an error, .Nm prints only the most relevant information about the command by default. To enable dumping of all information about the command, set the following tunable value in .Xr loader.conf 5 : .Bd -literal -offset indent hw.nvme.verbose_cmd_dump=1 .Ed .Pp +Prior versions of the driver reset the card twice on boot. +This proved to be unnecessary and inefficient, so the driver now resets drive +controller only once. +The old behavior may be restored in the kernel config file with +.Bd -literal -offset indent +.Cd options NVME_2X_RESET +.Ed +.Pp .Sh SYSCTL VARIABLES The following controller-level sysctls are currently implemented: .Bl -tag -width indent .It Va dev.nvme.0.num_cpus_per_ioq (R) Number of CPUs associated with each I/O queue pair. .It Va dev.nvme.0.int_coal_time (R/W) Interrupt coalescing timer period in microseconds. Set to 0 to disable. .It Va dev.nvme.0.int_coal_threshold (R/W) Interrupt coalescing threshold in number of command completions. Set to 0 to disable. .El .Pp The following queue pair-level sysctls are currently implemented. Admin queue sysctls take the format of dev.nvme.0.adminq and I/O queue sysctls take the format of dev.nvme.0.ioq0. .Bl -tag -width indent .It Va dev.nvme.0.ioq0.num_entries (R) Number of entries in this queue pair's command and completion queue. .It Va dev.nvme.0.ioq0.num_tr (R) Number of nvme_tracker structures currently allocated for this queue pair. .It Va dev.nvme.0.ioq0.num_prp_list (R) Number of nvme_prp_list structures currently allocated for this queue pair. .It Va dev.nvme.0.ioq0.sq_head (R) Current location of the submission queue head pointer as observed by the driver. The head pointer is incremented by the controller as it takes commands off of the submission queue. .It Va dev.nvme.0.ioq0.sq_tail (R) Current location of the submission queue tail pointer as observed by the driver. The driver increments the tail pointer after writing a command into the submission queue to signal that a new command is ready to be processed. .It Va dev.nvme.0.ioq0.cq_head (R) Current location of the completion queue head pointer as observed by the driver. The driver increments the head pointer after finishing with a completion entry that was posted by the controller. .It Va dev.nvme.0.ioq0.num_cmds (R) Number of commands that have been submitted on this queue pair. .It Va dev.nvme.0.ioq0.dump_debug (W) Writing 1 to this sysctl will dump the full contents of the submission and completion queues to the console. .El .Pp In addition to the typical pci attachment, the .Nm driver supports attaching to a .Xr ahci 4 device. Intel's Rapid Storage Technology (RST) hides the nvme device behind the AHCI device due to limitations in Windows. However, this effectively hides it from the .Fx kernel. To work around this limitation, .Fx detects that the AHCI device supports RST and when it is enabled. See .Xr ahci 4 for more details. .Sh SEE ALSO .Xr nda 4 , .Xr nvd 4 , .Xr pci 4 , .Xr nvmecontrol 8 , .Xr disk 9 .Sh HISTORY The .Nm driver first appeared in .Fx 9.2 . .Sh AUTHORS .An -nosplit The .Nm driver was developed by Intel and originally written by .An Jim Harris Aq Mt jimharris@FreeBSD.org , with contributions from .An Joe Golio at EMC. .Pp This man page was written by .An Jim Harris Aq Mt jimharris@FreeBSD.org . diff --git a/sys/conf/options b/sys/conf/options index 42703b621752..2d99dc8c67db 100644 --- a/sys/conf/options +++ b/sys/conf/options @@ -1,1030 +1,1031 @@ # $FreeBSD$ # # On the handling of kernel options # # All kernel options should be listed in NOTES, with suitable # descriptions. Negative options (options that make some code not # compile) should be commented out; LINT (generated from NOTES) should # compile as much code as possible. Try to structure option-using # code so that a single option only switch code on, or only switch # code off, to make it possible to have a full compile-test. If # necessary, you can check for COMPILING_LINT to get maximum code # coverage. # # All new options shall also be listed in either "conf/options" or # "conf/options.". Options that affect a single source-file # .[c|s] should be directed into "opt_.h", while options # that affect multiple files should either go in "opt_global.h" if # this is a kernel-wide option (used just about everywhere), or in # "opt_.h" if it affects only some files. # Note that the effect of listing only an option without a # header-file-name in conf/options (and cousins) is that the last # convention is followed. # # This handling scheme is not yet fully implemented. # # # Format of this file: # Option name filename # # If filename is missing, the default is # opt_.h AAC_DEBUG opt_aac.h AACRAID_DEBUG opt_aacraid.h AHC_ALLOW_MEMIO opt_aic7xxx.h AHC_TMODE_ENABLE opt_aic7xxx.h AHC_DUMP_EEPROM opt_aic7xxx.h AHC_DEBUG opt_aic7xxx.h AHC_DEBUG_OPTS opt_aic7xxx.h AHC_REG_PRETTY_PRINT opt_aic7xxx.h AHD_DEBUG opt_aic79xx.h AHD_DEBUG_OPTS opt_aic79xx.h AHD_TMODE_ENABLE opt_aic79xx.h AHD_REG_PRETTY_PRINT opt_aic79xx.h TWA_DEBUG opt_twa.h # Debugging options. ALT_BREAK_TO_DEBUGGER opt_kdb.h BREAK_TO_DEBUGGER opt_kdb.h BUF_TRACKING opt_global.h DDB DDB_BUFR_SIZE opt_ddb.h DDB_CAPTURE_DEFAULTBUFSIZE opt_ddb.h DDB_CAPTURE_MAXBUFSIZE opt_ddb.h DDB_CTF opt_ddb.h DDB_NUMSYM opt_ddb.h EARLY_PRINTF opt_global.h FULL_BUF_TRACKING opt_global.h GDB KDB opt_global.h KDB_TRACE opt_kdb.h KDB_UNATTENDED opt_kdb.h KLD_DEBUG opt_kld.h NUM_CORE_FILES opt_global.h QUEUE_MACRO_DEBUG_TRACE opt_global.h QUEUE_MACRO_DEBUG_TRASH opt_global.h SYSCTL_DEBUG opt_sysctl.h TEXTDUMP_PREFERRED opt_ddb.h TEXTDUMP_VERBOSE opt_ddb.h TSLOG opt_global.h TSLOGSIZE opt_global.h # Miscellaneous options. ALQ ALTERA_SDCARD_FAST_SIM opt_altera_sdcard.h ATSE_CFI_HACK opt_cfi.h AUDIT opt_global.h BOOTHOWTO opt_global.h BOOTVERBOSE opt_global.h CALLOUT_PROFILING CAPABILITIES opt_capsicum.h CAPABILITY_MODE opt_capsicum.h COMPAT_43 opt_global.h COMPAT_43TTY opt_global.h COMPAT_FREEBSD4 opt_global.h COMPAT_FREEBSD5 opt_global.h COMPAT_FREEBSD6 opt_global.h COMPAT_FREEBSD7 opt_global.h COMPAT_FREEBSD9 opt_global.h COMPAT_FREEBSD10 opt_global.h COMPAT_FREEBSD11 opt_global.h COMPAT_FREEBSD12 opt_global.h COMPAT_LINUXKPI opt_dontuse.h _COMPAT_LINUX32 opt_compat.h # XXX: make sure opt_compat.h exists COMPILING_LINT opt_global.h CY_PCI_FASTINTR DEADLKRES opt_watchdog.h EXPERIMENTAL opt_global.h EXT_RESOURCES opt_global.h DIRECTIO FILEMON opt_dontuse.h FFCLOCK FULL_PREEMPTION opt_sched.h GZIO opt_gzio.h IMAGACT_BINMISC opt_dontuse.h IPI_PREEMPTION opt_sched.h GEOM_BDE opt_geom.h GEOM_CACHE opt_geom.h GEOM_CONCAT opt_geom.h GEOM_ELI opt_geom.h GEOM_GATE opt_geom.h GEOM_JOURNAL opt_geom.h GEOM_LABEL opt_geom.h GEOM_LABEL_GPT opt_geom.h GEOM_LINUX_LVM opt_geom.h GEOM_MAP opt_geom.h GEOM_MIRROR opt_geom.h GEOM_MOUNTVER opt_geom.h GEOM_MULTIPATH opt_geom.h GEOM_NOP opt_geom.h GEOM_PART_APM opt_geom.h GEOM_PART_BSD opt_geom.h GEOM_PART_BSD64 opt_geom.h GEOM_PART_EBR opt_geom.h GEOM_PART_GPT opt_geom.h GEOM_PART_LDM opt_geom.h GEOM_PART_MBR opt_geom.h GEOM_PART_VTOC8 opt_geom.h GEOM_RAID opt_geom.h GEOM_RAID3 opt_geom.h GEOM_SHSEC opt_geom.h GEOM_STRIPE opt_geom.h GEOM_UZIP opt_geom.h GEOM_UZIP_DEBUG opt_geom.h GEOM_VINUM opt_geom.h GEOM_VIRSTOR opt_geom.h GEOM_ZERO opt_geom.h IFLIB opt_iflib.h KDTRACE_HOOKS opt_global.h KDTRACE_FRAME opt_kdtrace.h KN_HASHSIZE opt_kqueue.h KSTACK_MAX_PAGES KSTACK_PAGES KSTACK_USAGE_PROF KTRACE KTRACE_REQUEST_POOL opt_ktrace.h LIBICONV MAC opt_global.h MAC_BIBA opt_dontuse.h MAC_BSDEXTENDED opt_dontuse.h MAC_IFOFF opt_dontuse.h MAC_LOMAC opt_dontuse.h MAC_MLS opt_dontuse.h MAC_NONE opt_dontuse.h MAC_NTPD opt_dontuse.h MAC_PARTITION opt_dontuse.h MAC_PORTACL opt_dontuse.h MAC_SEEOTHERUIDS opt_dontuse.h MAC_STATIC opt_mac.h MAC_STUB opt_dontuse.h MAC_TEST opt_dontuse.h MAC_VERIEXEC opt_dontuse.h MAC_VERIEXEC_SHA1 opt_dontuse.h MAC_VERIEXEC_SHA256 opt_dontuse.h MAC_VERIEXEC_SHA384 opt_dontuse.h MAC_VERIEXEC_SHA512 opt_dontuse.h MD_ROOT opt_md.h MD_ROOT_FSTYPE opt_md.h MD_ROOT_READONLY opt_md.h MD_ROOT_SIZE opt_md.h MD_ROOT_MEM opt_md.h MFI_DEBUG opt_mfi.h MFI_DECODE_LOG opt_mfi.h MPROF_BUFFERS opt_mprof.h MPROF_HASH_SIZE opt_mprof.h NEW_PCIB opt_global.h NO_ADAPTIVE_MUTEXES opt_adaptive_mutexes.h NO_ADAPTIVE_RWLOCKS NO_ADAPTIVE_SX NO_OBSOLETE_CODE opt_global.h NO_SYSCTL_DESCR opt_global.h NSWBUF_MIN opt_param.h MBUF_PACKET_ZONE_DISABLE opt_global.h PANIC_REBOOT_WAIT_TIME opt_panic.h PCI_HP opt_pci.h PCI_IOV opt_global.h PPC_DEBUG opt_ppc.h PPC_PROBE_CHIPSET opt_ppc.h PPS_SYNC opt_ntp.h PREEMPTION opt_sched.h QUOTA SCHED_4BSD opt_sched.h SCHED_STATS opt_sched.h SCHED_ULE opt_sched.h SLEEPQUEUE_PROFILING SLHCI_DEBUG opt_slhci.h STACK opt_stack.h SUIDDIR MSGMNB opt_sysvipc.h MSGMNI opt_sysvipc.h MSGSEG opt_sysvipc.h MSGSSZ opt_sysvipc.h MSGTQL opt_sysvipc.h SEMMNI opt_sysvipc.h SEMMNS opt_sysvipc.h SEMMNU opt_sysvipc.h SEMMSL opt_sysvipc.h SEMOPM opt_sysvipc.h SEMUME opt_sysvipc.h SHMALL opt_sysvipc.h SHMMAX opt_sysvipc.h SHMMAXPGS opt_sysvipc.h SHMMIN opt_sysvipc.h SHMMNI opt_sysvipc.h SHMSEG opt_sysvipc.h SYSVMSG opt_sysvipc.h SYSVSEM opt_sysvipc.h SYSVSHM opt_sysvipc.h SW_WATCHDOG opt_watchdog.h TCPHPTS opt_inet.h TURNSTILE_PROFILING UMTX_PROFILING UMTX_CHAINS opt_global.h VERBOSE_SYSINIT ZSTDIO opt_zstdio.h # Sanitizers COVERAGE opt_global.h KASAN opt_global.h KCOV KCSAN opt_global.h KMSAN opt_global.h KUBSAN opt_global.h # POSIX kernel options P1003_1B_MQUEUE opt_posix.h P1003_1B_SEMAPHORES opt_posix.h _KPOSIX_PRIORITY_SCHEDULING opt_posix.h # Do we want the config file compiled into the kernel? INCLUDE_CONFIG_FILE opt_config.h # Options for static filesystems. These should only be used at config # time, since the corresponding lkms cannot work if there are any static # dependencies. Unusability is enforced by hiding the defines for the # options in a never-included header. AUTOFS opt_dontuse.h CD9660 opt_dontuse.h EXT2FS opt_dontuse.h FDESCFS opt_dontuse.h FFS opt_dontuse.h FUSEFS opt_dontuse.h MSDOSFS opt_dontuse.h NULLFS opt_dontuse.h PROCFS opt_dontuse.h PSEUDOFS opt_dontuse.h SMBFS opt_dontuse.h TMPFS opt_dontuse.h UDF opt_dontuse.h UNIONFS opt_dontuse.h ZFS opt_dontuse.h # Pseudofs debugging PSEUDOFS_TRACE opt_pseudofs.h # In-kernel GSS-API KGSSAPI opt_kgssapi.h KGSSAPI_DEBUG opt_kgssapi.h # These static filesystems have one slightly bogus static dependency in # sys/i386/i386/autoconf.c. If any of these filesystems are # statically compiled into the kernel, code for mounting them as root # filesystems will be enabled - but look below. # NFSCL - client # NFSD - server NFSCL opt_nfs.h NFSD opt_nfs.h # filesystems and libiconv bridge CD9660_ICONV opt_dontuse.h MSDOSFS_ICONV opt_dontuse.h UDF_ICONV opt_dontuse.h # If you are following the conditions in the copyright, # you can enable soft-updates which will speed up a lot of thigs # and make the system safer from crashes at the same time. # otherwise a STUB module will be compiled in. SOFTUPDATES opt_ffs.h # On small, embedded systems, it can be useful to turn off support for # snapshots. It saves about 30-40k for a feature that would be lightly # used, if it is used at all. NO_FFS_SNAPSHOT opt_ffs.h # Enabling this option turns on support for Access Control Lists in UFS, # which can be used to support high security configurations. Depends on # UFS_EXTATTR. UFS_ACL opt_ufs.h # Enabling this option turns on support for extended attributes in UFS-based # filesystems, which can be used to support high security configurations # as well as new filesystem features. UFS_EXTATTR opt_ufs.h UFS_EXTATTR_AUTOSTART opt_ufs.h # Enable fast hash lookups for large directories on UFS-based filesystems. UFS_DIRHASH opt_ufs.h # Enable gjournal-based UFS journal. UFS_GJOURNAL opt_ufs.h # The below sentence is not in English, and neither is this one. # We plan to remove the static dependences above, with a # _ROOT option to control if it usable as root. This list # allows these options to be present in config files already (though # they won't make any difference yet). NFS_ROOT opt_nfsroot.h # SMB/CIFS requester NETSMB opt_netsmb.h # Enable debugnet(4) networking support. DEBUGNET opt_global.h # Enable netdump(4) client support. NETDUMP opt_global.h # Enable netgdb(4) support. NETGDB opt_global.h # Options used only in subr_param.c. HZ opt_param.h MAXFILES opt_param.h NBUF opt_param.h NSFBUFS opt_param.h VM_BCACHE_SIZE_MAX opt_param.h VM_SWZONE_SIZE_MAX opt_param.h MAXUSERS DFLDSIZ opt_param.h MAXDSIZ opt_param.h MAXSSIZ opt_param.h # Generic SCSI options. CAM_MAX_HIGHPOWER opt_cam.h CAMDEBUG opt_cam.h CAM_DEBUG_COMPILE opt_cam.h CAM_DEBUG_DELAY opt_cam.h CAM_DEBUG_BUS opt_cam.h CAM_DEBUG_TARGET opt_cam.h CAM_DEBUG_LUN opt_cam.h CAM_DEBUG_FLAGS opt_cam.h CAM_BOOT_DELAY opt_cam.h CAM_IOSCHED_DYNAMIC opt_cam.h CAM_IO_STATS opt_cam.h CAM_TEST_FAILURE opt_cam.h SCSI_DELAY opt_scsi.h SCSI_NO_SENSE_STRINGS opt_scsi.h SCSI_NO_OP_STRINGS opt_scsi.h # Options used only in cam/ata/ata_da.c ATA_STATIC_ID opt_ada.h # Options used only in cam/scsi/scsi_cd.c CHANGER_MIN_BUSY_SECONDS opt_cd.h CHANGER_MAX_BUSY_SECONDS opt_cd.h # Options used only in cam/scsi/scsi_da.c DA_TRACK_REFS opt_da.h # Options used only in cam/scsi/scsi_sa.c. SA_IO_TIMEOUT opt_sa.h SA_SPACE_TIMEOUT opt_sa.h SA_REWIND_TIMEOUT opt_sa.h SA_ERASE_TIMEOUT opt_sa.h SA_1FM_AT_EOD opt_sa.h # Options used only in cam/scsi/scsi_pt.c SCSI_PT_DEFAULT_TIMEOUT opt_pt.h # Options used only in cam/scsi/scsi_ses.c SES_ENABLE_PASSTHROUGH opt_ses.h # Options used in dev/sym/ (Symbios SCSI driver). SYM_SETUP_SCSI_DIFF opt_sym.h #-HVD support for 825a, 875, 885 # disabled:0 (default), enabled:1 SYM_SETUP_PCI_PARITY opt_sym.h #-PCI parity checking # disabled:0, enabled:1 (default) SYM_SETUP_MAX_LUN opt_sym.h #-Number of LUNs supported # default:8, range:[1..64] # Options used only in dev/isp/* ISP_TARGET_MODE opt_isp.h ISP_FW_CRASH_DUMP opt_isp.h ISP_DEFAULT_ROLES opt_isp.h ISP_INTERNAL_TARGET opt_isp.h ISP_FCTAPE_OFF opt_isp.h # Options used only in dev/iscsi ISCSI_INITIATOR_DEBUG opt_iscsi_initiator.h # Net stuff. ACCEPT_FILTER_DATA ACCEPT_FILTER_DNS ACCEPT_FILTER_HTTP ALTQ opt_global.h ALTQ_CBQ opt_altq.h ALTQ_CDNR opt_altq.h ALTQ_CODEL opt_altq.h ALTQ_DEBUG opt_altq.h ALTQ_HFSC opt_altq.h ALTQ_FAIRQ opt_altq.h ALTQ_NOPCC opt_altq.h ALTQ_PRIQ opt_altq.h ALTQ_RED opt_altq.h ALTQ_RIO opt_altq.h BOOTP opt_bootp.h BOOTP_BLOCKSIZE opt_bootp.h BOOTP_COMPAT opt_bootp.h BOOTP_NFSROOT opt_bootp.h BOOTP_NFSV3 opt_bootp.h BOOTP_WIRED_TO opt_bootp.h DEVICE_POLLING DUMMYNET opt_ipdn.h RATELIMIT opt_ratelimit.h RATELIMIT_DEBUG opt_ratelimit.h INET opt_inet.h INET6 opt_inet6.h STATS opt_global.h IPDIVERT IPFILTER opt_ipfilter.h IPFILTER_DEFAULT_BLOCK opt_ipfilter.h IPFILTER_LOG opt_ipfilter.h IPFILTER_LOOKUP opt_ipfilter.h IPFIREWALL opt_ipfw.h IPFIREWALL_DEFAULT_TO_ACCEPT opt_ipfw.h IPFIREWALL_NAT opt_ipfw.h IPFIREWALL_NAT64 opt_ipfw.h IPFIREWALL_NPTV6 opt_ipfw.h IPFIREWALL_VERBOSE opt_ipfw.h IPFIREWALL_VERBOSE_LIMIT opt_ipfw.h IPFIREWALL_PMOD opt_ipfw.h IPSEC opt_ipsec.h IPSEC_DEBUG opt_ipsec.h IPSEC_SUPPORT opt_ipsec.h IPSTEALTH KERN_TLS KRPC LIBALIAS LIBMCHAIN MBUF_PROFILING MBUF_STRESS_TEST MROUTING opt_mrouting.h NFSLOCKD PCBGROUP opt_pcbgroup.h PF_DEFAULT_TO_DROP opt_pf.h ROUTE_MPATH opt_route.h ROUTETABLES opt_route.h FIB_ALGO opt_route.h RSS opt_rss.h SLIP_IFF_OPTS opt_slip.h TCPDEBUG TCPPCAP opt_global.h SIFTR TCP_BLACKBOX opt_global.h TCP_HHOOK opt_inet.h TCP_OFFLOAD opt_inet.h # Enable code to dispatch TCP offloading TCP_RFC7413 opt_inet.h TCP_RFC7413_MAX_KEYS opt_inet.h TCP_RFC7413_MAX_PSKS opt_inet.h TCP_SIGNATURE opt_ipsec.h VLAN_ARRAY opt_vlan.h XDR XBONEHACK # # SCTP # SCTP opt_sctp.h SCTP_SUPPORT opt_sctp.h SCTP_DEBUG opt_sctp.h # Enable debug printfs SCTP_LOCK_LOGGING opt_sctp.h # Log to KTR lock activity SCTP_MBUF_LOGGING opt_sctp.h # Log to KTR general mbuf aloc/free SCTP_MBCNT_LOGGING opt_sctp.h # Log to KTR mbcnt activity SCTP_PACKET_LOGGING opt_sctp.h # Log to a packet buffer last N packets SCTP_LTRACE_CHUNKS opt_sctp.h # Log to KTR chunks processed SCTP_LTRACE_ERRORS opt_sctp.h # Log to KTR error returns. SCTP_USE_PERCPU_STAT opt_sctp.h # Use per cpu stats. SCTP_MCORE_INPUT opt_sctp.h # Have multiple input threads for input mbufs SCTP_LOCAL_TRACE_BUF opt_sctp.h # Use tracebuffer exported via sysctl SCTP_DETAILED_STR_STATS opt_sctp.h # Use per PR-SCTP policy stream stats # # # # Netgraph(4). Use option NETGRAPH to enable the base netgraph code. # Each netgraph node type can be either be compiled into the kernel # or loaded dynamically. To get the former, include the corresponding # option below. Each type has its own man page, e.g. ng_async(4). NETGRAPH NETGRAPH_DEBUG opt_netgraph.h NETGRAPH_ASYNC opt_netgraph.h NETGRAPH_ATMLLC opt_netgraph.h NETGRAPH_ATM_ATMPIF opt_netgraph.h NETGRAPH_BLUETOOTH opt_netgraph.h NETGRAPH_BLUETOOTH_BT3C opt_netgraph.h NETGRAPH_BLUETOOTH_H4 opt_netgraph.h NETGRAPH_BLUETOOTH_HCI opt_netgraph.h NETGRAPH_BLUETOOTH_L2CAP opt_netgraph.h NETGRAPH_BLUETOOTH_SOCKET opt_netgraph.h NETGRAPH_BLUETOOTH_UBT opt_netgraph.h NETGRAPH_BLUETOOTH_UBTBCMFW opt_netgraph.h NETGRAPH_BPF opt_netgraph.h NETGRAPH_BRIDGE opt_netgraph.h NETGRAPH_CAR opt_netgraph.h NETGRAPH_CHECKSUM opt_netgraph.h NETGRAPH_CISCO opt_netgraph.h NETGRAPH_DEFLATE opt_netgraph.h NETGRAPH_DEVICE opt_netgraph.h NETGRAPH_ECHO opt_netgraph.h NETGRAPH_EIFACE opt_netgraph.h NETGRAPH_ETHER opt_netgraph.h NETGRAPH_ETHER_ECHO opt_netgraph.h NETGRAPH_FEC opt_netgraph.h NETGRAPH_FRAME_RELAY opt_netgraph.h NETGRAPH_GIF opt_netgraph.h NETGRAPH_GIF_DEMUX opt_netgraph.h NETGRAPH_HOLE opt_netgraph.h NETGRAPH_IFACE opt_netgraph.h NETGRAPH_IP_INPUT opt_netgraph.h NETGRAPH_IPFW opt_netgraph.h NETGRAPH_KSOCKET opt_netgraph.h NETGRAPH_L2TP opt_netgraph.h NETGRAPH_LMI opt_netgraph.h NETGRAPH_MPPC_COMPRESSION opt_netgraph.h NETGRAPH_MPPC_ENCRYPTION opt_netgraph.h NETGRAPH_NAT opt_netgraph.h NETGRAPH_NETFLOW opt_netgraph.h NETGRAPH_ONE2MANY opt_netgraph.h NETGRAPH_PATCH opt_netgraph.h NETGRAPH_PIPE opt_netgraph.h NETGRAPH_PPP opt_netgraph.h NETGRAPH_PPPOE opt_netgraph.h NETGRAPH_PPTPGRE opt_netgraph.h NETGRAPH_PRED1 opt_netgraph.h NETGRAPH_RFC1490 opt_netgraph.h NETGRAPH_SOCKET opt_netgraph.h NETGRAPH_SPLIT opt_netgraph.h NETGRAPH_SPPP opt_netgraph.h NETGRAPH_TAG opt_netgraph.h NETGRAPH_TCPMSS opt_netgraph.h NETGRAPH_TEE opt_netgraph.h NETGRAPH_TTY opt_netgraph.h NETGRAPH_UI opt_netgraph.h NETGRAPH_VJC opt_netgraph.h NETGRAPH_VLAN opt_netgraph.h # NgATM options NGATM_ATM opt_netgraph.h NGATM_ATMBASE opt_netgraph.h NGATM_SSCOP opt_netgraph.h NGATM_SSCFU opt_netgraph.h NGATM_UNI opt_netgraph.h NGATM_CCATM opt_netgraph.h # DRM options DRM_DEBUG opt_drm.h TI_SF_BUF_JUMBO opt_ti.h TI_JUMBO_HDRSPLIT opt_ti.h # Misc debug flags. Most of these should probably be replaced with # 'DEBUG', and then let people recompile just the interesting modules # with 'make CC="cc -DDEBUG"'. DEBUG_1284 opt_ppb_1284.h LPT_DEBUG opt_lpt.h PLIP_DEBUG opt_plip.h LOCKF_DEBUG opt_debug_lockf.h SI_DEBUG opt_debug_si.h IFMEDIA_DEBUG opt_ifmedia.h # Fb options FB_DEBUG opt_fb.h FB_INSTALL_CDEV opt_fb.h # ppbus related options PERIPH_1284 opt_ppb_1284.h DONTPROBE_1284 opt_ppb_1284.h # smbus related options ENABLE_ALART opt_intpm.h # These cause changes all over the kernel BLKDEV_IOSIZE opt_global.h BURN_BRIDGES opt_global.h DEBUG opt_global.h DEBUG_LOCKS opt_global.h DEBUG_VFS_LOCKS opt_global.h DFLTPHYS opt_global.h DIAGNOSTIC opt_global.h INVARIANT_SUPPORT opt_global.h INVARIANTS opt_global.h KASSERT_PANIC_OPTIONAL opt_global.h MAXCPU opt_global.h MAXMEMDOM opt_global.h MAXPHYS opt_maxphys.h MCLSHIFT opt_global.h MUTEX_NOINLINE opt_global.h LOCK_PROFILING opt_global.h LOCK_PROFILING_FAST opt_global.h MSIZE opt_global.h REGRESSION opt_global.h RWLOCK_NOINLINE opt_global.h SX_NOINLINE opt_global.h VFS_BIO_DEBUG opt_global.h # These are VM related options VM_KMEM_SIZE opt_vm.h VM_KMEM_SIZE_SCALE opt_vm.h VM_KMEM_SIZE_MAX opt_vm.h VM_NRESERVLEVEL opt_vm.h VM_LEVEL_0_ORDER opt_vm.h NO_SWAPPING opt_vm.h MALLOC_MAKE_FAILURES opt_vm.h MALLOC_PROFILE opt_vm.h MALLOC_DEBUG_MAXZONES opt_vm.h # The MemGuard replacement allocator used for tamper-after-free detection DEBUG_MEMGUARD opt_vm.h # The RedZone malloc(9) protection DEBUG_REDZONE opt_vm.h # Standard SMP options EARLY_AP_STARTUP opt_global.h SMP opt_global.h NUMA opt_global.h # Size of the kernel message buffer MSGBUF_SIZE opt_msgbuf.h # NFS options NFS_MINATTRTIMO opt_nfs.h NFS_MAXATTRTIMO opt_nfs.h NFS_MINDIRATTRTIMO opt_nfs.h NFS_MAXDIRATTRTIMO opt_nfs.h NFS_DEBUG opt_nfs.h # TMPFS options TMPFS_PAGES_MINRESERVED opt_tmpfs.h # Options for uart(4) UART_PPS_ON_CTS opt_uart.h UART_POLL_FREQ opt_uart.h UART_DEV_TOLERANCE_PCT opt_uart.h # options for bus/device framework BUS_DEBUG opt_bus.h # options for USB support USB_DEBUG opt_usb.h USB_HOST_ALIGN opt_usb.h USB_REQ_DEBUG opt_usb.h USB_TEMPLATE opt_usb.h USB_VERBOSE opt_usb.h USB_DMA_SINGLE_ALLOC opt_usb.h USB_EHCI_BIG_ENDIAN_DESC opt_usb.h U3G_DEBUG opt_u3g.h UKBD_DFLT_KEYMAP opt_ukbd.h UPLCOM_INTR_INTERVAL opt_uplcom.h UVSCOM_DEFAULT_OPKTSIZE opt_uvscom.h UVSCOM_INTR_INTERVAL opt_uvscom.h # options for the Realtek rtwn driver RTWN_DEBUG opt_rtwn.h RTWN_WITHOUT_UCODE opt_rtwn.h # Embedded system options INIT_PATH ROOTDEVNAME FDC_DEBUG opt_fdc.h PCFCLOCK_VERBOSE opt_pcfclock.h PCFCLOCK_MAX_RETRIES opt_pcfclock.h KTR opt_global.h KTR_ALQ opt_ktr.h KTR_MASK opt_ktr.h KTR_CPUMASK opt_ktr.h KTR_COMPILE opt_global.h KTR_BOOT_ENTRIES opt_global.h KTR_ENTRIES opt_global.h KTR_VERBOSE opt_ktr.h WITNESS opt_global.h WITNESS_KDB opt_witness.h WITNESS_NO_VNODE opt_witness.h WITNESS_SKIPSPIN opt_witness.h WITNESS_COUNT opt_witness.h OPENSOLARIS_WITNESS opt_global.h EPOCH_TRACE opt_global.h # options for ACPI support ACPI_DEBUG opt_acpi.h ACPI_MAX_TASKS opt_acpi.h ACPI_MAX_THREADS opt_acpi.h DEV_ACPI opt_acpi.h ACPI_EARLY_EPYC_WAR opt_acpi.h # options for IOMMU support IOMMU opt_iommu.h # ISA support DEV_ISA opt_isa.h ISAPNP opt_isa.h # various 'device presence' options. DEV_BPF opt_bpf.h DEV_CARP opt_carp.h DEV_NETMAP opt_global.h DEV_PCI opt_pci.h DEV_PF opt_pf.h DEV_PFLOG opt_pf.h DEV_PFSYNC opt_pf.h DEV_SPLASH opt_splash.h DEV_VLAN opt_vlan.h # ed driver ED_HPP opt_ed.h ED_3C503 opt_ed.h ED_SIC opt_ed.h # bce driver BCE_DEBUG opt_bce.h BCE_NVRAM_WRITE_SUPPORT opt_bce.h SOCKBUF_DEBUG opt_global.h # options for hifn driver HIFN_DEBUG opt_hifn.h HIFN_RNDTEST opt_hifn.h # options for safenet driver SAFE_DEBUG opt_safe.h SAFE_NO_RNG opt_safe.h SAFE_RNDTEST opt_safe.h # syscons/vt options MAXCONS opt_syscons.h SC_ALT_MOUSE_IMAGE opt_syscons.h SC_CUT_SPACES2TABS opt_syscons.h SC_CUT_SEPCHARS opt_syscons.h SC_DEBUG_LEVEL opt_syscons.h SC_DFLT_FONT opt_syscons.h SC_DFLT_TERM opt_syscons.h SC_DISABLE_KDBKEY opt_syscons.h SC_DISABLE_REBOOT opt_syscons.h SC_HISTORY_SIZE opt_syscons.h SC_KERNEL_CONS_ATTR opt_syscons.h SC_KERNEL_CONS_ATTRS opt_syscons.h SC_KERNEL_CONS_REV_ATTR opt_syscons.h SC_MOUSE_CHAR opt_syscons.h SC_NO_CUTPASTE opt_syscons.h SC_NO_FONT_LOADING opt_syscons.h SC_NO_HISTORY opt_syscons.h SC_NO_MODE_CHANGE opt_syscons.h SC_NO_SUSPEND_VTYSWITCH opt_syscons.h SC_NO_SYSMOUSE opt_syscons.h SC_NO_TERM_DUMB opt_syscons.h SC_NO_TERM_SC opt_syscons.h SC_NO_TERM_TEKEN opt_syscons.h SC_NORM_ATTR opt_syscons.h SC_NORM_REV_ATTR opt_syscons.h SC_PIXEL_MODE opt_syscons.h SC_RENDER_DEBUG opt_syscons.h SC_TWOBUTTON_MOUSE opt_syscons.h VT_ALT_TO_ESC_HACK opt_syscons.h VT_FB_MAX_WIDTH opt_syscons.h VT_FB_MAX_HEIGHT opt_syscons.h VT_MAXWINDOWS opt_syscons.h VT_TWOBUTTON_MOUSE opt_syscons.h DEV_SC opt_syscons.h DEV_VT opt_syscons.h # teken terminal emulator options TEKEN_CONS25 opt_teken.h TEKEN_UTF8 opt_teken.h TERMINAL_KERN_ATTR opt_teken.h TERMINAL_NORM_ATTR opt_teken.h # options for printf PRINTF_BUFR_SIZE opt_printf.h BOOT_TAG opt_printf.h BOOT_TAG_SZ opt_printf.h # kbd options KBD_DISABLE_KEYMAP_LOAD opt_kbd.h KBD_INSTALL_CDEV opt_kbd.h KBD_MAXRETRY opt_kbd.h KBD_MAXWAIT opt_kbd.h KBD_RESETDELAY opt_kbd.h KBDIO_DEBUG opt_kbd.h KBDMUX_DFLT_KEYMAP opt_kbdmux.h # options for the Atheros driver ATH_DEBUG opt_ath.h ATH_TXBUF opt_ath.h ATH_RXBUF opt_ath.h ATH_DIAGAPI opt_ath.h ATH_TX99_DIAG opt_ath.h ATH_ENABLE_11N opt_ath.h ATH_ENABLE_DFS opt_ath.h ATH_EEPROM_FIRMWARE opt_ath.h ATH_ENABLE_RADIOTAP_VENDOR_EXT opt_ath.h ATH_DEBUG_ALQ opt_ath.h ATH_KTR_INTR_DEBUG opt_ath.h # options for the Atheros hal # XXX For now, this breaks non-AR9130 chipsets, so only use it # XXX when actually targeting AR9130. AH_SUPPORT_AR9130 opt_ah.h # This is required for AR933x SoC support AH_SUPPORT_AR9330 opt_ah.h AH_SUPPORT_AR9340 opt_ah.h AH_SUPPORT_QCA9530 opt_ah.h AH_SUPPORT_QCA9550 opt_ah.h AH_DEBUG opt_ah.h AH_ASSERT opt_ah.h AH_DEBUG_ALQ opt_ah.h AH_REGOPS_FUNC opt_ah.h AH_WRITE_REGDOMAIN opt_ah.h AH_DEBUG_COUNTRY opt_ah.h AH_WRITE_EEPROM opt_ah.h AH_PRIVATE_DIAG opt_ah.h AH_NEED_DESC_SWAP opt_ah.h AH_USE_INIPDGAIN opt_ah.h AH_MAXCHAN opt_ah.h AH_RXCFG_SDMAMW_4BYTES opt_ah.h AH_INTERRUPT_DEBUGGING opt_ah.h # AR5416 and later interrupt mitigation # XXX do not use this for AR9130 AH_AR5416_INTERRUPT_MITIGATION opt_ah.h # options for the Altera mSGDMA driver (altera_msgdma) ALTERA_MSGDMA_DESC_STD opt_altera_msgdma.h ALTERA_MSGDMA_DESC_EXT opt_altera_msgdma.h ALTERA_MSGDMA_DESC_PF_STD opt_altera_msgdma.h ALTERA_MSGDMA_DESC_PF_EXT opt_altera_msgdma.h # options for the Broadcom BCM43xx driver (bwi) BWI_DEBUG opt_bwi.h BWI_DEBUG_VERBOSE opt_bwi.h # options for the Brodacom BCM43xx driver (bwn) BWN_DEBUG opt_bwn.h BWN_GPL_PHY opt_bwn.h BWN_USE_SIBA opt_bwn.h # Options for the SIBA driver SIBA_DEBUG opt_siba.h # options for the Marvell 8335 wireless driver MALO_DEBUG opt_malo.h MALO_TXBUF opt_malo.h MALO_RXBUF opt_malo.h # options for the Marvell wireless driver MWL_DEBUG opt_mwl.h MWL_TXBUF opt_mwl.h MWL_RXBUF opt_mwl.h MWL_DIAGAPI opt_mwl.h MWL_AGGR_SIZE opt_mwl.h MWL_TX_NODROP opt_mwl.h # Options for the Marvell NETA driver MVNETA_MULTIQUEUE opt_mvneta.h MVNETA_KTR opt_mvneta.h # Options for the Intel 802.11ac wireless driver IWM_DEBUG opt_iwm.h # Options for the Intel 802.11n wireless driver IWN_DEBUG opt_iwn.h # Options for the Intel 3945ABG wireless driver WPI_DEBUG opt_wpi.h # dcons options DCONS_BUF_SIZE opt_dcons.h DCONS_POLL_HZ opt_dcons.h DCONS_FORCE_CONSOLE opt_dcons.h DCONS_FORCE_GDB opt_dcons.h # HWPMC options HWPMC_DEBUG opt_global.h HWPMC_HOOKS HWPMC_MIPS_BACKTRACE opt_hwpmc_hooks.h # 802.11 support layer IEEE80211_DEBUG opt_wlan.h IEEE80211_DEBUG_REFCNT opt_wlan.h IEEE80211_SUPPORT_MESH opt_wlan.h IEEE80211_SUPPORT_SUPERG opt_wlan.h IEEE80211_SUPPORT_TDMA opt_wlan.h IEEE80211_ALQ opt_wlan.h IEEE80211_DFS_DEBUG opt_wlan.h # 802.11 TDMA support TDMA_SLOTLEN_DEFAULT opt_tdma.h TDMA_SLOTCNT_DEFAULT opt_tdma.h TDMA_BINTVAL_DEFAULT opt_tdma.h TDMA_TXRATE_11B_DEFAULT opt_tdma.h TDMA_TXRATE_11G_DEFAULT opt_tdma.h TDMA_TXRATE_11A_DEFAULT opt_tdma.h TDMA_TXRATE_TURBO_DEFAULT opt_tdma.h TDMA_TXRATE_HALF_DEFAULT opt_tdma.h TDMA_TXRATE_QUARTER_DEFAULT opt_tdma.h TDMA_TXRATE_11NA_DEFAULT opt_tdma.h TDMA_TXRATE_11NG_DEFAULT opt_tdma.h # VideoMode PICKMODE_DEBUG opt_videomode.h # Network stack virtualization options VIMAGE opt_global.h VNET_DEBUG opt_global.h # Common Flash Interface (CFI) options CFI_SUPPORT_STRATAFLASH opt_cfi.h CFI_ARMEDANDDANGEROUS opt_cfi.h CFI_HARDWAREBYTESWAP opt_cfi.h # Sound options SND_DEBUG opt_snd.h SND_DIAGNOSTIC opt_snd.h SND_FEEDER_MULTIFORMAT opt_snd.h SND_FEEDER_FULL_MULTIFORMAT opt_snd.h SND_FEEDER_RATE_HP opt_snd.h SND_PCM_64 opt_snd.h SND_OLDSTEREO opt_snd.h X86BIOS # Flattened device tree options FDT opt_platform.h FDT_DTB_STATIC opt_platform.h # OFED Infiniband stack OFED opt_ofed.h OFED_DEBUG_INIT opt_ofed.h SDP opt_ofed.h SDP_DEBUG opt_ofed.h IPOIB opt_ofed.h IPOIB_DEBUG opt_ofed.h IPOIB_CM opt_ofed.h # Resource Accounting RACCT opt_global.h RACCT_DEFAULT_TO_DISABLED opt_global.h # Resource Limits RCTL opt_global.h # Random number generator(s) # Alternative RNG algorithm. RANDOM_FENESTRASX opt_global.h # With this, no entropy processor is loaded, but the entropy # harvesting infrastructure is present. This means an entropy # processor may be loaded as a module. RANDOM_LOADABLE opt_global.h # This turns on high-rate and potentially expensive harvesting in # the uma slab allocator. RANDOM_ENABLE_UMA opt_global.h RANDOM_ENABLE_ETHER opt_global.h # This options turns TPM into entropy source. TPM_HARVEST opt_tpm.h # BHND(4) driver BHND_LOGLEVEL opt_global.h # GPIO and child devices GPIO_SPI_DEBUG opt_gpio.h # SPI devices SPIGEN_LEGACY_CDEVNAME opt_spi.h # etherswitch(4) driver RTL8366_SOFT_RESET opt_etherswitch.h # evdev protocol support EVDEV_SUPPORT opt_evdev.h EVDEV_DEBUG opt_evdev.h UINPUT_DEBUG opt_evdev.h # Hyper-V network driver HN_DEBUG opt_hn.h # CAM-based MMC stack MMCCAM # Encrypted kernel crash dumps EKCD opt_ekcd.h # NVME options NVME_USE_NVD opt_nvme.h +NVME_2X_RESET opt_nvme.h # amdsbwd options AMDSBWD_DEBUG opt_amdsbwd.h # gcov support GCOV opt_global.h LINDEBUGFS # options for HID support HID_DEBUG opt_hid.h IICHID_DEBUG opt_hid.h IICHID_SAMPLING opt_hid.h HKBD_DFLT_KEYMAP opt_hkbd.h HIDRAW_MAKE_UHID_ALIAS opt_hid.h # kenv options # The early kernel environment (loader environment, config(8)-provided static) # is typically cleared after the dynamic environment comes up to ensure that # we're not inadvertently holding on to 'secret' values in these stale envs. # This option is insecure except in controlled environments where the static # environment's contents are known to be safe. PRESERVE_EARLY_KENV opt_global.h diff --git a/sys/dev/nvme/nvme_ctrlr.c b/sys/dev/nvme/nvme_ctrlr.c index d0be0da39902..ca89e99d2934 100644 --- a/sys/dev/nvme/nvme_ctrlr.c +++ b/sys/dev/nvme/nvme_ctrlr.c @@ -1,1703 +1,1710 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (C) 2012-2016 Intel Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_cam.h" +#include "opt_nvme.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include "nvme_private.h" #define B4_CHK_RDY_DELAY_MS 2300 /* work around controller bug */ static void nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr, struct nvme_async_event_request *aer); static void nvme_ctrlr_barrier(struct nvme_controller *ctrlr, int flags) { bus_barrier(ctrlr->resource, 0, rman_get_size(ctrlr->resource), flags); } static void nvme_ctrlr_devctl_log(struct nvme_controller *ctrlr, const char *type, const char *msg, ...) { struct sbuf sb; va_list ap; int error; if (sbuf_new(&sb, NULL, 0, SBUF_AUTOEXTEND | SBUF_NOWAIT) == NULL) return; sbuf_printf(&sb, "%s: ", device_get_nameunit(ctrlr->dev)); va_start(ap, msg); sbuf_vprintf(&sb, msg, ap); va_end(ap); error = sbuf_finish(&sb); if (error == 0) printf("%s\n", sbuf_data(&sb)); sbuf_clear(&sb); sbuf_printf(&sb, "name=\"%s\" reason=\"", device_get_nameunit(ctrlr->dev)); va_start(ap, msg); sbuf_vprintf(&sb, msg, ap); va_end(ap); sbuf_printf(&sb, "\""); error = sbuf_finish(&sb); if (error == 0) devctl_notify("nvme", "controller", type, sbuf_data(&sb)); sbuf_delete(&sb); } static int nvme_ctrlr_construct_admin_qpair(struct nvme_controller *ctrlr) { struct nvme_qpair *qpair; uint32_t num_entries; int error; qpair = &ctrlr->adminq; qpair->id = 0; qpair->cpu = CPU_FFS(&cpuset_domain[ctrlr->domain]) - 1; qpair->domain = ctrlr->domain; num_entries = NVME_ADMIN_ENTRIES; TUNABLE_INT_FETCH("hw.nvme.admin_entries", &num_entries); /* * If admin_entries was overridden to an invalid value, revert it * back to our default value. */ if (num_entries < NVME_MIN_ADMIN_ENTRIES || num_entries > NVME_MAX_ADMIN_ENTRIES) { nvme_printf(ctrlr, "invalid hw.nvme.admin_entries=%d " "specified\n", num_entries); num_entries = NVME_ADMIN_ENTRIES; } /* * The admin queue's max xfer size is treated differently than the * max I/O xfer size. 16KB is sufficient here - maybe even less? */ error = nvme_qpair_construct(qpair, num_entries, NVME_ADMIN_TRACKERS, ctrlr); return (error); } #define QP(ctrlr, c) ((c) * (ctrlr)->num_io_queues / mp_ncpus) static int nvme_ctrlr_construct_io_qpairs(struct nvme_controller *ctrlr) { struct nvme_qpair *qpair; uint32_t cap_lo; uint16_t mqes; int c, error, i, n; int num_entries, num_trackers, max_entries; /* * NVMe spec sets a hard limit of 64K max entries, but devices may * specify a smaller limit, so we need to check the MQES field in the * capabilities register. We have to cap the number of entries to the * current stride allows for in BAR 0/1, otherwise the remainder entries * are inaccessable. MQES should reflect this, and this is just a * fail-safe. */ max_entries = (rman_get_size(ctrlr->resource) - nvme_mmio_offsetof(doorbell[0])) / (1 << (ctrlr->dstrd + 1)); num_entries = NVME_IO_ENTRIES; TUNABLE_INT_FETCH("hw.nvme.io_entries", &num_entries); cap_lo = nvme_mmio_read_4(ctrlr, cap_lo); mqes = NVME_CAP_LO_MQES(cap_lo); num_entries = min(num_entries, mqes + 1); num_entries = min(num_entries, max_entries); num_trackers = NVME_IO_TRACKERS; TUNABLE_INT_FETCH("hw.nvme.io_trackers", &num_trackers); num_trackers = max(num_trackers, NVME_MIN_IO_TRACKERS); num_trackers = min(num_trackers, NVME_MAX_IO_TRACKERS); /* * No need to have more trackers than entries in the submit queue. Note * also that for a queue size of N, we can only have (N-1) commands * outstanding, hence the "-1" here. */ num_trackers = min(num_trackers, (num_entries-1)); /* * Our best estimate for the maximum number of I/Os that we should * normally have in flight at one time. This should be viewed as a hint, * not a hard limit and will need to be revisited when the upper layers * of the storage system grows multi-queue support. */ ctrlr->max_hw_pend_io = num_trackers * ctrlr->num_io_queues * 3 / 4; ctrlr->ioq = malloc(ctrlr->num_io_queues * sizeof(struct nvme_qpair), M_NVME, M_ZERO | M_WAITOK); for (i = c = n = 0; i < ctrlr->num_io_queues; i++, c += n) { qpair = &ctrlr->ioq[i]; /* * Admin queue has ID=0. IO queues start at ID=1 - * hence the 'i+1' here. */ qpair->id = i + 1; if (ctrlr->num_io_queues > 1) { /* Find number of CPUs served by this queue. */ for (n = 1; QP(ctrlr, c + n) == i; n++) ; /* Shuffle multiple NVMe devices between CPUs. */ qpair->cpu = c + (device_get_unit(ctrlr->dev)+n/2) % n; qpair->domain = pcpu_find(qpair->cpu)->pc_domain; } else { qpair->cpu = CPU_FFS(&cpuset_domain[ctrlr->domain]) - 1; qpair->domain = ctrlr->domain; } /* * For I/O queues, use the controller-wide max_xfer_size * calculated in nvme_attach(). */ error = nvme_qpair_construct(qpair, num_entries, num_trackers, ctrlr); if (error) return (error); /* * Do not bother binding interrupts if we only have one I/O * interrupt thread for this controller. */ if (ctrlr->num_io_queues > 1) bus_bind_intr(ctrlr->dev, qpair->res, qpair->cpu); } return (0); } static void nvme_ctrlr_fail(struct nvme_controller *ctrlr) { int i; ctrlr->is_failed = true; nvme_admin_qpair_disable(&ctrlr->adminq); nvme_qpair_fail(&ctrlr->adminq); if (ctrlr->ioq != NULL) { for (i = 0; i < ctrlr->num_io_queues; i++) { nvme_io_qpair_disable(&ctrlr->ioq[i]); nvme_qpair_fail(&ctrlr->ioq[i]); } } nvme_notify_fail_consumers(ctrlr); } void nvme_ctrlr_post_failed_request(struct nvme_controller *ctrlr, struct nvme_request *req) { mtx_lock(&ctrlr->lock); STAILQ_INSERT_TAIL(&ctrlr->fail_req, req, stailq); mtx_unlock(&ctrlr->lock); if (!ctrlr->is_dying) taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->fail_req_task); } static void nvme_ctrlr_fail_req_task(void *arg, int pending) { struct nvme_controller *ctrlr = arg; struct nvme_request *req; mtx_lock(&ctrlr->lock); while ((req = STAILQ_FIRST(&ctrlr->fail_req)) != NULL) { STAILQ_REMOVE_HEAD(&ctrlr->fail_req, stailq); mtx_unlock(&ctrlr->lock); nvme_qpair_manual_complete_request(req->qpair, req, NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST); mtx_lock(&ctrlr->lock); } mtx_unlock(&ctrlr->lock); } static int nvme_ctrlr_wait_for_ready(struct nvme_controller *ctrlr, int desired_val) { int timeout = ticks + MSEC_2_TICKS(ctrlr->ready_timeout_in_ms); uint32_t csts; while (1) { csts = nvme_mmio_read_4(ctrlr, csts); if (csts == NVME_GONE) /* Hot unplug. */ return (ENXIO); if (((csts >> NVME_CSTS_REG_RDY_SHIFT) & NVME_CSTS_REG_RDY_MASK) == desired_val) break; if (timeout - ticks < 0) { nvme_printf(ctrlr, "controller ready did not become %d " "within %d ms\n", desired_val, ctrlr->ready_timeout_in_ms); return (ENXIO); } pause("nvmerdy", 1); } return (0); } static int nvme_ctrlr_disable(struct nvme_controller *ctrlr) { uint32_t cc; uint32_t csts; uint8_t en, rdy; int err; cc = nvme_mmio_read_4(ctrlr, cc); csts = nvme_mmio_read_4(ctrlr, csts); en = (cc >> NVME_CC_REG_EN_SHIFT) & NVME_CC_REG_EN_MASK; rdy = (csts >> NVME_CSTS_REG_RDY_SHIFT) & NVME_CSTS_REG_RDY_MASK; /* * Per 3.1.5 in NVME 1.3 spec, transitioning CC.EN from 0 to 1 * when CSTS.RDY is 1 or transitioning CC.EN from 1 to 0 when * CSTS.RDY is 0 "has undefined results" So make sure that CSTS.RDY * isn't the desired value. Short circuit if we're already disabled. */ if (en == 0) { /* Wait for RDY == 0 or timeout & fail */ if (rdy == 0) return (0); return (nvme_ctrlr_wait_for_ready(ctrlr, 0)); } if (rdy == 0) { /* EN == 1, wait for RDY == 1 or timeout & fail */ err = nvme_ctrlr_wait_for_ready(ctrlr, 1); if (err != 0) return (err); } cc &= ~NVME_CC_REG_EN_MASK; nvme_mmio_write_4(ctrlr, cc, cc); /* * A few drives have firmware bugs that freeze the drive if we access * the mmio too soon after we disable. */ if (ctrlr->quirks & QUIRK_DELAY_B4_CHK_RDY) pause("nvmeR", MSEC_2_TICKS(B4_CHK_RDY_DELAY_MS)); return (nvme_ctrlr_wait_for_ready(ctrlr, 0)); } static int nvme_ctrlr_enable(struct nvme_controller *ctrlr) { uint32_t cc; uint32_t csts; uint32_t aqa; uint32_t qsize; uint8_t en, rdy; int err; cc = nvme_mmio_read_4(ctrlr, cc); csts = nvme_mmio_read_4(ctrlr, csts); en = (cc >> NVME_CC_REG_EN_SHIFT) & NVME_CC_REG_EN_MASK; rdy = (csts >> NVME_CSTS_REG_RDY_SHIFT) & NVME_CSTS_REG_RDY_MASK; /* * See note in nvme_ctrlr_disable. Short circuit if we're already enabled. */ if (en == 1) { if (rdy == 1) return (0); return (nvme_ctrlr_wait_for_ready(ctrlr, 1)); } /* EN == 0 already wait for RDY == 0 or timeout & fail */ err = nvme_ctrlr_wait_for_ready(ctrlr, 0); if (err != 0) return (err); nvme_mmio_write_8(ctrlr, asq, ctrlr->adminq.cmd_bus_addr); nvme_mmio_write_8(ctrlr, acq, ctrlr->adminq.cpl_bus_addr); /* acqs and asqs are 0-based. */ qsize = ctrlr->adminq.num_entries - 1; aqa = 0; aqa = (qsize & NVME_AQA_REG_ACQS_MASK) << NVME_AQA_REG_ACQS_SHIFT; aqa |= (qsize & NVME_AQA_REG_ASQS_MASK) << NVME_AQA_REG_ASQS_SHIFT; nvme_mmio_write_4(ctrlr, aqa, aqa); /* Initialization values for CC */ cc = 0; cc |= 1 << NVME_CC_REG_EN_SHIFT; cc |= 0 << NVME_CC_REG_CSS_SHIFT; cc |= 0 << NVME_CC_REG_AMS_SHIFT; cc |= 0 << NVME_CC_REG_SHN_SHIFT; cc |= 6 << NVME_CC_REG_IOSQES_SHIFT; /* SQ entry size == 64 == 2^6 */ cc |= 4 << NVME_CC_REG_IOCQES_SHIFT; /* CQ entry size == 16 == 2^4 */ /* This evaluates to 0, which is according to spec. */ cc |= (PAGE_SIZE >> 13) << NVME_CC_REG_MPS_SHIFT; nvme_ctrlr_barrier(ctrlr, BUS_SPACE_BARRIER_WRITE); nvme_mmio_write_4(ctrlr, cc, cc); return (nvme_ctrlr_wait_for_ready(ctrlr, 1)); } static void nvme_ctrlr_disable_qpairs(struct nvme_controller *ctrlr) { int i; nvme_admin_qpair_disable(&ctrlr->adminq); /* * I/O queues are not allocated before the initial HW * reset, so do not try to disable them. Use is_initialized * to determine if this is the initial HW reset. */ if (ctrlr->is_initialized) { for (i = 0; i < ctrlr->num_io_queues; i++) nvme_io_qpair_disable(&ctrlr->ioq[i]); } } static int nvme_ctrlr_hw_reset(struct nvme_controller *ctrlr) { int err; TSENTER(); nvme_ctrlr_disable_qpairs(ctrlr); err = nvme_ctrlr_disable(ctrlr); if (err != 0) return err; err = nvme_ctrlr_enable(ctrlr); TSEXIT(); return (err); } void nvme_ctrlr_reset(struct nvme_controller *ctrlr) { int cmpset; cmpset = atomic_cmpset_32(&ctrlr->is_resetting, 0, 1); if (cmpset == 0 || ctrlr->is_failed) /* * Controller is already resetting or has failed. Return * immediately since there is no need to kick off another * reset in these cases. */ return; if (!ctrlr->is_dying) taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->reset_task); } static int nvme_ctrlr_identify(struct nvme_controller *ctrlr) { struct nvme_completion_poll_status status; status.done = 0; nvme_ctrlr_cmd_identify_controller(ctrlr, &ctrlr->cdata, nvme_completion_poll_cb, &status); nvme_completion_poll(&status); if (nvme_completion_is_error(&status.cpl)) { nvme_printf(ctrlr, "nvme_identify_controller failed!\n"); return (ENXIO); } /* Convert data to host endian */ nvme_controller_data_swapbytes(&ctrlr->cdata); /* * Use MDTS to ensure our default max_xfer_size doesn't exceed what the * controller supports. */ if (ctrlr->cdata.mdts > 0) ctrlr->max_xfer_size = min(ctrlr->max_xfer_size, ctrlr->min_page_size * (1 << (ctrlr->cdata.mdts))); return (0); } static int nvme_ctrlr_set_num_qpairs(struct nvme_controller *ctrlr) { struct nvme_completion_poll_status status; int cq_allocated, sq_allocated; status.done = 0; nvme_ctrlr_cmd_set_num_queues(ctrlr, ctrlr->num_io_queues, nvme_completion_poll_cb, &status); nvme_completion_poll(&status); if (nvme_completion_is_error(&status.cpl)) { nvme_printf(ctrlr, "nvme_ctrlr_set_num_qpairs failed!\n"); return (ENXIO); } /* * Data in cdw0 is 0-based. * Lower 16-bits indicate number of submission queues allocated. * Upper 16-bits indicate number of completion queues allocated. */ sq_allocated = (status.cpl.cdw0 & 0xFFFF) + 1; cq_allocated = (status.cpl.cdw0 >> 16) + 1; /* * Controller may allocate more queues than we requested, * so use the minimum of the number requested and what was * actually allocated. */ ctrlr->num_io_queues = min(ctrlr->num_io_queues, sq_allocated); ctrlr->num_io_queues = min(ctrlr->num_io_queues, cq_allocated); if (ctrlr->num_io_queues > vm_ndomains) ctrlr->num_io_queues -= ctrlr->num_io_queues % vm_ndomains; return (0); } static int nvme_ctrlr_create_qpairs(struct nvme_controller *ctrlr) { struct nvme_completion_poll_status status; struct nvme_qpair *qpair; int i; for (i = 0; i < ctrlr->num_io_queues; i++) { qpair = &ctrlr->ioq[i]; status.done = 0; nvme_ctrlr_cmd_create_io_cq(ctrlr, qpair, nvme_completion_poll_cb, &status); nvme_completion_poll(&status); if (nvme_completion_is_error(&status.cpl)) { nvme_printf(ctrlr, "nvme_create_io_cq failed!\n"); return (ENXIO); } status.done = 0; nvme_ctrlr_cmd_create_io_sq(ctrlr, qpair, nvme_completion_poll_cb, &status); nvme_completion_poll(&status); if (nvme_completion_is_error(&status.cpl)) { nvme_printf(ctrlr, "nvme_create_io_sq failed!\n"); return (ENXIO); } } return (0); } static int nvme_ctrlr_delete_qpairs(struct nvme_controller *ctrlr) { struct nvme_completion_poll_status status; struct nvme_qpair *qpair; for (int i = 0; i < ctrlr->num_io_queues; i++) { qpair = &ctrlr->ioq[i]; status.done = 0; nvme_ctrlr_cmd_delete_io_sq(ctrlr, qpair, nvme_completion_poll_cb, &status); nvme_completion_poll(&status); if (nvme_completion_is_error(&status.cpl)) { nvme_printf(ctrlr, "nvme_destroy_io_sq failed!\n"); return (ENXIO); } status.done = 0; nvme_ctrlr_cmd_delete_io_cq(ctrlr, qpair, nvme_completion_poll_cb, &status); nvme_completion_poll(&status); if (nvme_completion_is_error(&status.cpl)) { nvme_printf(ctrlr, "nvme_destroy_io_cq failed!\n"); return (ENXIO); } } return (0); } static int nvme_ctrlr_construct_namespaces(struct nvme_controller *ctrlr) { struct nvme_namespace *ns; uint32_t i; for (i = 0; i < min(ctrlr->cdata.nn, NVME_MAX_NAMESPACES); i++) { ns = &ctrlr->ns[i]; nvme_ns_construct(ns, i+1, ctrlr); } return (0); } static bool is_log_page_id_valid(uint8_t page_id) { switch (page_id) { case NVME_LOG_ERROR: case NVME_LOG_HEALTH_INFORMATION: case NVME_LOG_FIRMWARE_SLOT: case NVME_LOG_CHANGED_NAMESPACE: case NVME_LOG_COMMAND_EFFECT: case NVME_LOG_RES_NOTIFICATION: case NVME_LOG_SANITIZE_STATUS: return (true); } return (false); } static uint32_t nvme_ctrlr_get_log_page_size(struct nvme_controller *ctrlr, uint8_t page_id) { uint32_t log_page_size; switch (page_id) { case NVME_LOG_ERROR: log_page_size = min( sizeof(struct nvme_error_information_entry) * (ctrlr->cdata.elpe + 1), NVME_MAX_AER_LOG_SIZE); break; case NVME_LOG_HEALTH_INFORMATION: log_page_size = sizeof(struct nvme_health_information_page); break; case NVME_LOG_FIRMWARE_SLOT: log_page_size = sizeof(struct nvme_firmware_page); break; case NVME_LOG_CHANGED_NAMESPACE: log_page_size = sizeof(struct nvme_ns_list); break; case NVME_LOG_COMMAND_EFFECT: log_page_size = sizeof(struct nvme_command_effects_page); break; case NVME_LOG_RES_NOTIFICATION: log_page_size = sizeof(struct nvme_res_notification_page); break; case NVME_LOG_SANITIZE_STATUS: log_page_size = sizeof(struct nvme_sanitize_status_page); break; default: log_page_size = 0; break; } return (log_page_size); } static void nvme_ctrlr_log_critical_warnings(struct nvme_controller *ctrlr, uint8_t state) { if (state & NVME_CRIT_WARN_ST_AVAILABLE_SPARE) nvme_ctrlr_devctl_log(ctrlr, "critical", "available spare space below threshold"); if (state & NVME_CRIT_WARN_ST_TEMPERATURE) nvme_ctrlr_devctl_log(ctrlr, "critical", "temperature above threshold"); if (state & NVME_CRIT_WARN_ST_DEVICE_RELIABILITY) nvme_ctrlr_devctl_log(ctrlr, "critical", "device reliability degraded"); if (state & NVME_CRIT_WARN_ST_READ_ONLY) nvme_ctrlr_devctl_log(ctrlr, "critical", "media placed in read only mode"); if (state & NVME_CRIT_WARN_ST_VOLATILE_MEMORY_BACKUP) nvme_ctrlr_devctl_log(ctrlr, "critical", "volatile memory backup device failed"); if (state & NVME_CRIT_WARN_ST_RESERVED_MASK) nvme_ctrlr_devctl_log(ctrlr, "critical", "unknown critical warning(s): state = 0x%02x", state); } static void nvme_ctrlr_async_event_log_page_cb(void *arg, const struct nvme_completion *cpl) { struct nvme_async_event_request *aer = arg; struct nvme_health_information_page *health_info; struct nvme_ns_list *nsl; struct nvme_error_information_entry *err; int i; /* * If the log page fetch for some reason completed with an error, * don't pass log page data to the consumers. In practice, this case * should never happen. */ if (nvme_completion_is_error(cpl)) nvme_notify_async_consumers(aer->ctrlr, &aer->cpl, aer->log_page_id, NULL, 0); else { /* Convert data to host endian */ switch (aer->log_page_id) { case NVME_LOG_ERROR: err = (struct nvme_error_information_entry *)aer->log_page_buffer; for (i = 0; i < (aer->ctrlr->cdata.elpe + 1); i++) nvme_error_information_entry_swapbytes(err++); break; case NVME_LOG_HEALTH_INFORMATION: nvme_health_information_page_swapbytes( (struct nvme_health_information_page *)aer->log_page_buffer); break; case NVME_LOG_FIRMWARE_SLOT: nvme_firmware_page_swapbytes( (struct nvme_firmware_page *)aer->log_page_buffer); break; case NVME_LOG_CHANGED_NAMESPACE: nvme_ns_list_swapbytes( (struct nvme_ns_list *)aer->log_page_buffer); break; case NVME_LOG_COMMAND_EFFECT: nvme_command_effects_page_swapbytes( (struct nvme_command_effects_page *)aer->log_page_buffer); break; case NVME_LOG_RES_NOTIFICATION: nvme_res_notification_page_swapbytes( (struct nvme_res_notification_page *)aer->log_page_buffer); break; case NVME_LOG_SANITIZE_STATUS: nvme_sanitize_status_page_swapbytes( (struct nvme_sanitize_status_page *)aer->log_page_buffer); break; case INTEL_LOG_TEMP_STATS: intel_log_temp_stats_swapbytes( (struct intel_log_temp_stats *)aer->log_page_buffer); break; default: break; } if (aer->log_page_id == NVME_LOG_HEALTH_INFORMATION) { health_info = (struct nvme_health_information_page *) aer->log_page_buffer; nvme_ctrlr_log_critical_warnings(aer->ctrlr, health_info->critical_warning); /* * Critical warnings reported through the * SMART/health log page are persistent, so * clear the associated bits in the async event * config so that we do not receive repeated * notifications for the same event. */ aer->ctrlr->async_event_config &= ~health_info->critical_warning; nvme_ctrlr_cmd_set_async_event_config(aer->ctrlr, aer->ctrlr->async_event_config, NULL, NULL); } else if (aer->log_page_id == NVME_LOG_CHANGED_NAMESPACE && !nvme_use_nvd) { nsl = (struct nvme_ns_list *)aer->log_page_buffer; for (i = 0; i < nitems(nsl->ns) && nsl->ns[i] != 0; i++) { if (nsl->ns[i] > NVME_MAX_NAMESPACES) break; nvme_notify_ns(aer->ctrlr, nsl->ns[i]); } } /* * Pass the cpl data from the original async event completion, * not the log page fetch. */ nvme_notify_async_consumers(aer->ctrlr, &aer->cpl, aer->log_page_id, aer->log_page_buffer, aer->log_page_size); } /* * Repost another asynchronous event request to replace the one * that just completed. */ nvme_ctrlr_construct_and_submit_aer(aer->ctrlr, aer); } static void nvme_ctrlr_async_event_cb(void *arg, const struct nvme_completion *cpl) { struct nvme_async_event_request *aer = arg; if (nvme_completion_is_error(cpl)) { /* * Do not retry failed async event requests. This avoids * infinite loops where a new async event request is submitted * to replace the one just failed, only to fail again and * perpetuate the loop. */ return; } /* Associated log page is in bits 23:16 of completion entry dw0. */ aer->log_page_id = (cpl->cdw0 & 0xFF0000) >> 16; nvme_printf(aer->ctrlr, "async event occurred (type 0x%x, info 0x%02x," " page 0x%02x)\n", (cpl->cdw0 & 0x07), (cpl->cdw0 & 0xFF00) >> 8, aer->log_page_id); if (is_log_page_id_valid(aer->log_page_id)) { aer->log_page_size = nvme_ctrlr_get_log_page_size(aer->ctrlr, aer->log_page_id); memcpy(&aer->cpl, cpl, sizeof(*cpl)); nvme_ctrlr_cmd_get_log_page(aer->ctrlr, aer->log_page_id, NVME_GLOBAL_NAMESPACE_TAG, aer->log_page_buffer, aer->log_page_size, nvme_ctrlr_async_event_log_page_cb, aer); /* Wait to notify consumers until after log page is fetched. */ } else { nvme_notify_async_consumers(aer->ctrlr, cpl, aer->log_page_id, NULL, 0); /* * Repost another asynchronous event request to replace the one * that just completed. */ nvme_ctrlr_construct_and_submit_aer(aer->ctrlr, aer); } } static void nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr, struct nvme_async_event_request *aer) { struct nvme_request *req; aer->ctrlr = ctrlr; req = nvme_allocate_request_null(nvme_ctrlr_async_event_cb, aer); aer->req = req; /* * Disable timeout here, since asynchronous event requests should by * nature never be timed out. */ req->timeout = false; req->cmd.opc = NVME_OPC_ASYNC_EVENT_REQUEST; nvme_ctrlr_submit_admin_request(ctrlr, req); } static void nvme_ctrlr_configure_aer(struct nvme_controller *ctrlr) { struct nvme_completion_poll_status status; struct nvme_async_event_request *aer; uint32_t i; ctrlr->async_event_config = NVME_CRIT_WARN_ST_AVAILABLE_SPARE | NVME_CRIT_WARN_ST_DEVICE_RELIABILITY | NVME_CRIT_WARN_ST_READ_ONLY | NVME_CRIT_WARN_ST_VOLATILE_MEMORY_BACKUP; if (ctrlr->cdata.ver >= NVME_REV(1, 2)) ctrlr->async_event_config |= NVME_ASYNC_EVENT_NS_ATTRIBUTE | NVME_ASYNC_EVENT_FW_ACTIVATE; status.done = 0; nvme_ctrlr_cmd_get_feature(ctrlr, NVME_FEAT_TEMPERATURE_THRESHOLD, 0, NULL, 0, nvme_completion_poll_cb, &status); nvme_completion_poll(&status); if (nvme_completion_is_error(&status.cpl) || (status.cpl.cdw0 & 0xFFFF) == 0xFFFF || (status.cpl.cdw0 & 0xFFFF) == 0x0000) { nvme_printf(ctrlr, "temperature threshold not supported\n"); } else ctrlr->async_event_config |= NVME_CRIT_WARN_ST_TEMPERATURE; nvme_ctrlr_cmd_set_async_event_config(ctrlr, ctrlr->async_event_config, NULL, NULL); /* aerl is a zero-based value, so we need to add 1 here. */ ctrlr->num_aers = min(NVME_MAX_ASYNC_EVENTS, (ctrlr->cdata.aerl+1)); for (i = 0; i < ctrlr->num_aers; i++) { aer = &ctrlr->aer[i]; nvme_ctrlr_construct_and_submit_aer(ctrlr, aer); } } static void nvme_ctrlr_configure_int_coalescing(struct nvme_controller *ctrlr) { ctrlr->int_coal_time = 0; TUNABLE_INT_FETCH("hw.nvme.int_coal_time", &ctrlr->int_coal_time); ctrlr->int_coal_threshold = 0; TUNABLE_INT_FETCH("hw.nvme.int_coal_threshold", &ctrlr->int_coal_threshold); nvme_ctrlr_cmd_set_interrupt_coalescing(ctrlr, ctrlr->int_coal_time, ctrlr->int_coal_threshold, NULL, NULL); } static void nvme_ctrlr_hmb_free(struct nvme_controller *ctrlr) { struct nvme_hmb_chunk *hmbc; int i; if (ctrlr->hmb_desc_paddr) { bus_dmamap_unload(ctrlr->hmb_desc_tag, ctrlr->hmb_desc_map); bus_dmamem_free(ctrlr->hmb_desc_tag, ctrlr->hmb_desc_vaddr, ctrlr->hmb_desc_map); ctrlr->hmb_desc_paddr = 0; } if (ctrlr->hmb_desc_tag) { bus_dma_tag_destroy(ctrlr->hmb_desc_tag); ctrlr->hmb_desc_tag = NULL; } for (i = 0; i < ctrlr->hmb_nchunks; i++) { hmbc = &ctrlr->hmb_chunks[i]; bus_dmamap_unload(ctrlr->hmb_tag, hmbc->hmbc_map); bus_dmamem_free(ctrlr->hmb_tag, hmbc->hmbc_vaddr, hmbc->hmbc_map); } ctrlr->hmb_nchunks = 0; if (ctrlr->hmb_tag) { bus_dma_tag_destroy(ctrlr->hmb_tag); ctrlr->hmb_tag = NULL; } if (ctrlr->hmb_chunks) { free(ctrlr->hmb_chunks, M_NVME); ctrlr->hmb_chunks = NULL; } } static void nvme_ctrlr_hmb_alloc(struct nvme_controller *ctrlr) { struct nvme_hmb_chunk *hmbc; size_t pref, min, minc, size; int err, i; uint64_t max; /* Limit HMB to 5% of RAM size per device by default. */ max = (uint64_t)physmem * PAGE_SIZE / 20; TUNABLE_UINT64_FETCH("hw.nvme.hmb_max", &max); min = (long long unsigned)ctrlr->cdata.hmmin * 4096; if (max == 0 || max < min) return; pref = MIN((long long unsigned)ctrlr->cdata.hmpre * 4096, max); minc = MAX(ctrlr->cdata.hmminds * 4096, PAGE_SIZE); if (min > 0 && ctrlr->cdata.hmmaxd > 0) minc = MAX(minc, min / ctrlr->cdata.hmmaxd); ctrlr->hmb_chunk = pref; again: ctrlr->hmb_chunk = roundup2(ctrlr->hmb_chunk, PAGE_SIZE); ctrlr->hmb_nchunks = howmany(pref, ctrlr->hmb_chunk); if (ctrlr->cdata.hmmaxd > 0 && ctrlr->hmb_nchunks > ctrlr->cdata.hmmaxd) ctrlr->hmb_nchunks = ctrlr->cdata.hmmaxd; ctrlr->hmb_chunks = malloc(sizeof(struct nvme_hmb_chunk) * ctrlr->hmb_nchunks, M_NVME, M_WAITOK); err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev), PAGE_SIZE, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, ctrlr->hmb_chunk, 1, ctrlr->hmb_chunk, 0, NULL, NULL, &ctrlr->hmb_tag); if (err != 0) { nvme_printf(ctrlr, "HMB tag create failed %d\n", err); nvme_ctrlr_hmb_free(ctrlr); return; } for (i = 0; i < ctrlr->hmb_nchunks; i++) { hmbc = &ctrlr->hmb_chunks[i]; if (bus_dmamem_alloc(ctrlr->hmb_tag, (void **)&hmbc->hmbc_vaddr, BUS_DMA_NOWAIT, &hmbc->hmbc_map)) { nvme_printf(ctrlr, "failed to alloc HMB\n"); break; } if (bus_dmamap_load(ctrlr->hmb_tag, hmbc->hmbc_map, hmbc->hmbc_vaddr, ctrlr->hmb_chunk, nvme_single_map, &hmbc->hmbc_paddr, BUS_DMA_NOWAIT) != 0) { bus_dmamem_free(ctrlr->hmb_tag, hmbc->hmbc_vaddr, hmbc->hmbc_map); nvme_printf(ctrlr, "failed to load HMB\n"); break; } bus_dmamap_sync(ctrlr->hmb_tag, hmbc->hmbc_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); } if (i < ctrlr->hmb_nchunks && i * ctrlr->hmb_chunk < min && ctrlr->hmb_chunk / 2 >= minc) { ctrlr->hmb_nchunks = i; nvme_ctrlr_hmb_free(ctrlr); ctrlr->hmb_chunk /= 2; goto again; } ctrlr->hmb_nchunks = i; if (ctrlr->hmb_nchunks * ctrlr->hmb_chunk < min) { nvme_ctrlr_hmb_free(ctrlr); return; } size = sizeof(struct nvme_hmb_desc) * ctrlr->hmb_nchunks; err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev), 16, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, size, 1, size, 0, NULL, NULL, &ctrlr->hmb_desc_tag); if (err != 0) { nvme_printf(ctrlr, "HMB desc tag create failed %d\n", err); nvme_ctrlr_hmb_free(ctrlr); return; } if (bus_dmamem_alloc(ctrlr->hmb_desc_tag, (void **)&ctrlr->hmb_desc_vaddr, BUS_DMA_WAITOK, &ctrlr->hmb_desc_map)) { nvme_printf(ctrlr, "failed to alloc HMB desc\n"); nvme_ctrlr_hmb_free(ctrlr); return; } if (bus_dmamap_load(ctrlr->hmb_desc_tag, ctrlr->hmb_desc_map, ctrlr->hmb_desc_vaddr, size, nvme_single_map, &ctrlr->hmb_desc_paddr, BUS_DMA_NOWAIT) != 0) { bus_dmamem_free(ctrlr->hmb_desc_tag, ctrlr->hmb_desc_vaddr, ctrlr->hmb_desc_map); nvme_printf(ctrlr, "failed to load HMB desc\n"); nvme_ctrlr_hmb_free(ctrlr); return; } for (i = 0; i < ctrlr->hmb_nchunks; i++) { ctrlr->hmb_desc_vaddr[i].addr = htole64(ctrlr->hmb_chunks[i].hmbc_paddr); ctrlr->hmb_desc_vaddr[i].size = htole32(ctrlr->hmb_chunk / 4096); } bus_dmamap_sync(ctrlr->hmb_desc_tag, ctrlr->hmb_desc_map, BUS_DMASYNC_PREWRITE); nvme_printf(ctrlr, "Allocated %lluMB host memory buffer\n", (long long unsigned)ctrlr->hmb_nchunks * ctrlr->hmb_chunk / 1024 / 1024); } static void nvme_ctrlr_hmb_enable(struct nvme_controller *ctrlr, bool enable, bool memret) { struct nvme_completion_poll_status status; uint32_t cdw11; cdw11 = 0; if (enable) cdw11 |= 1; if (memret) cdw11 |= 2; status.done = 0; nvme_ctrlr_cmd_set_feature(ctrlr, NVME_FEAT_HOST_MEMORY_BUFFER, cdw11, ctrlr->hmb_nchunks * ctrlr->hmb_chunk / 4096, ctrlr->hmb_desc_paddr, ctrlr->hmb_desc_paddr >> 32, ctrlr->hmb_nchunks, NULL, 0, nvme_completion_poll_cb, &status); nvme_completion_poll(&status); if (nvme_completion_is_error(&status.cpl)) nvme_printf(ctrlr, "nvme_ctrlr_hmb_enable failed!\n"); } static void nvme_ctrlr_start(void *ctrlr_arg, bool resetting) { struct nvme_controller *ctrlr = ctrlr_arg; uint32_t old_num_io_queues; int i; TSENTER(); /* * Only reset adminq here when we are restarting the * controller after a reset. During initialization, * we have already submitted admin commands to get * the number of I/O queues supported, so cannot reset * the adminq again here. */ if (resetting) { nvme_qpair_reset(&ctrlr->adminq); nvme_admin_qpair_enable(&ctrlr->adminq); } if (ctrlr->ioq != NULL) { for (i = 0; i < ctrlr->num_io_queues; i++) nvme_qpair_reset(&ctrlr->ioq[i]); } /* * If it was a reset on initialization command timeout, just * return here, letting initialization code fail gracefully. */ if (resetting && !ctrlr->is_initialized) return; if (resetting && nvme_ctrlr_identify(ctrlr) != 0) { nvme_ctrlr_fail(ctrlr); return; } /* * The number of qpairs are determined during controller initialization, * including using NVMe SET_FEATURES/NUMBER_OF_QUEUES to determine the * HW limit. We call SET_FEATURES again here so that it gets called * after any reset for controllers that depend on the driver to * explicit specify how many queues it will use. This value should * never change between resets, so panic if somehow that does happen. */ if (resetting) { old_num_io_queues = ctrlr->num_io_queues; if (nvme_ctrlr_set_num_qpairs(ctrlr) != 0) { nvme_ctrlr_fail(ctrlr); return; } if (old_num_io_queues != ctrlr->num_io_queues) { panic("num_io_queues changed from %u to %u", old_num_io_queues, ctrlr->num_io_queues); } } if (ctrlr->cdata.hmpre > 0 && ctrlr->hmb_nchunks == 0) { nvme_ctrlr_hmb_alloc(ctrlr); if (ctrlr->hmb_nchunks > 0) nvme_ctrlr_hmb_enable(ctrlr, true, false); } else if (ctrlr->hmb_nchunks > 0) nvme_ctrlr_hmb_enable(ctrlr, true, true); if (nvme_ctrlr_create_qpairs(ctrlr) != 0) { nvme_ctrlr_fail(ctrlr); return; } if (nvme_ctrlr_construct_namespaces(ctrlr) != 0) { nvme_ctrlr_fail(ctrlr); return; } nvme_ctrlr_configure_aer(ctrlr); nvme_ctrlr_configure_int_coalescing(ctrlr); for (i = 0; i < ctrlr->num_io_queues; i++) nvme_io_qpair_enable(&ctrlr->ioq[i]); TSEXIT(); } void nvme_ctrlr_start_config_hook(void *arg) { struct nvme_controller *ctrlr = arg; TSENTER(); - /* - * Reset controller twice to ensure we do a transition from cc.en==1 to - * cc.en==0. This is because we don't really know what status the - * controller was left in when boot handed off to OS. Linux doesn't do - * this, however. If we adopt that policy, see also nvme_ctrlr_resume(). - */ if (nvme_ctrlr_hw_reset(ctrlr) != 0) { fail: nvme_ctrlr_fail(ctrlr); config_intrhook_disestablish(&ctrlr->config_hook); return; } +#ifdef NVME_2X_RESET + /* + * Reset controller twice to ensure we do a transition from cc.en==1 to + * cc.en==0. This is because we don't really know what status the + * controller was left in when boot handed off to OS. Linux doesn't do + * this, however, and when the controller is in state cc.en == 0, no + * I/O can happen. + */ if (nvme_ctrlr_hw_reset(ctrlr) != 0) goto fail; +#endif nvme_qpair_reset(&ctrlr->adminq); nvme_admin_qpair_enable(&ctrlr->adminq); if (nvme_ctrlr_identify(ctrlr) == 0 && nvme_ctrlr_set_num_qpairs(ctrlr) == 0 && nvme_ctrlr_construct_io_qpairs(ctrlr) == 0) nvme_ctrlr_start(ctrlr, false); else goto fail; nvme_sysctl_initialize_ctrlr(ctrlr); config_intrhook_disestablish(&ctrlr->config_hook); ctrlr->is_initialized = 1; nvme_notify_new_controller(ctrlr); TSEXIT(); } static void nvme_ctrlr_reset_task(void *arg, int pending) { struct nvme_controller *ctrlr = arg; int status; nvme_ctrlr_devctl_log(ctrlr, "RESET", "resetting controller"); status = nvme_ctrlr_hw_reset(ctrlr); /* * Use pause instead of DELAY, so that we yield to any nvme interrupt * handlers on this CPU that were blocked on a qpair lock. We want * all nvme interrupts completed before proceeding with restarting the * controller. * * XXX - any way to guarantee the interrupt handlers have quiesced? */ pause("nvmereset", hz / 10); if (status == 0) nvme_ctrlr_start(ctrlr, true); else nvme_ctrlr_fail(ctrlr); atomic_cmpset_32(&ctrlr->is_resetting, 1, 0); } /* * Poll all the queues enabled on the device for completion. */ void nvme_ctrlr_poll(struct nvme_controller *ctrlr) { int i; nvme_qpair_process_completions(&ctrlr->adminq); for (i = 0; i < ctrlr->num_io_queues; i++) if (ctrlr->ioq && ctrlr->ioq[i].cpl) nvme_qpair_process_completions(&ctrlr->ioq[i]); } /* * Poll the single-vector interrupt case: num_io_queues will be 1 and * there's only a single vector. While we're polling, we mask further * interrupts in the controller. */ void nvme_ctrlr_shared_handler(void *arg) { struct nvme_controller *ctrlr = arg; nvme_mmio_write_4(ctrlr, intms, 1); nvme_ctrlr_poll(ctrlr); nvme_mmio_write_4(ctrlr, intmc, 1); } static void nvme_pt_done(void *arg, const struct nvme_completion *cpl) { struct nvme_pt_command *pt = arg; struct mtx *mtx = pt->driver_lock; uint16_t status; bzero(&pt->cpl, sizeof(pt->cpl)); pt->cpl.cdw0 = cpl->cdw0; status = cpl->status; status &= ~NVME_STATUS_P_MASK; pt->cpl.status = status; mtx_lock(mtx); pt->driver_lock = NULL; wakeup(pt); mtx_unlock(mtx); } int nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr, struct nvme_pt_command *pt, uint32_t nsid, int is_user_buffer, int is_admin_cmd) { struct nvme_request *req; struct mtx *mtx; struct buf *buf = NULL; int ret = 0; if (pt->len > 0) { if (pt->len > ctrlr->max_xfer_size) { nvme_printf(ctrlr, "pt->len (%d) " "exceeds max_xfer_size (%d)\n", pt->len, ctrlr->max_xfer_size); return EIO; } if (is_user_buffer) { /* * Ensure the user buffer is wired for the duration of * this pass-through command. */ PHOLD(curproc); buf = uma_zalloc(pbuf_zone, M_WAITOK); buf->b_iocmd = pt->is_read ? BIO_READ : BIO_WRITE; if (vmapbuf(buf, pt->buf, pt->len, 1) < 0) { ret = EFAULT; goto err; } req = nvme_allocate_request_vaddr(buf->b_data, pt->len, nvme_pt_done, pt); } else req = nvme_allocate_request_vaddr(pt->buf, pt->len, nvme_pt_done, pt); } else req = nvme_allocate_request_null(nvme_pt_done, pt); /* Assume user space already converted to little-endian */ req->cmd.opc = pt->cmd.opc; req->cmd.fuse = pt->cmd.fuse; req->cmd.rsvd2 = pt->cmd.rsvd2; req->cmd.rsvd3 = pt->cmd.rsvd3; req->cmd.cdw10 = pt->cmd.cdw10; req->cmd.cdw11 = pt->cmd.cdw11; req->cmd.cdw12 = pt->cmd.cdw12; req->cmd.cdw13 = pt->cmd.cdw13; req->cmd.cdw14 = pt->cmd.cdw14; req->cmd.cdw15 = pt->cmd.cdw15; req->cmd.nsid = htole32(nsid); mtx = mtx_pool_find(mtxpool_sleep, pt); pt->driver_lock = mtx; if (is_admin_cmd) nvme_ctrlr_submit_admin_request(ctrlr, req); else nvme_ctrlr_submit_io_request(ctrlr, req); mtx_lock(mtx); while (pt->driver_lock != NULL) mtx_sleep(pt, mtx, PRIBIO, "nvme_pt", 0); mtx_unlock(mtx); err: if (buf != NULL) { uma_zfree(pbuf_zone, buf); PRELE(curproc); } return (ret); } static int nvme_ctrlr_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag, struct thread *td) { struct nvme_controller *ctrlr; struct nvme_pt_command *pt; ctrlr = cdev->si_drv1; switch (cmd) { case NVME_RESET_CONTROLLER: nvme_ctrlr_reset(ctrlr); break; case NVME_PASSTHROUGH_CMD: pt = (struct nvme_pt_command *)arg; return (nvme_ctrlr_passthrough_cmd(ctrlr, pt, le32toh(pt->cmd.nsid), 1 /* is_user_buffer */, 1 /* is_admin_cmd */)); case NVME_GET_NSID: { struct nvme_get_nsid *gnsid = (struct nvme_get_nsid *)arg; strncpy(gnsid->cdev, device_get_nameunit(ctrlr->dev), sizeof(gnsid->cdev)); gnsid->cdev[sizeof(gnsid->cdev) - 1] = '\0'; gnsid->nsid = 0; break; } case NVME_GET_MAX_XFER_SIZE: *(uint64_t *)arg = ctrlr->max_xfer_size; break; default: return (ENOTTY); } return (0); } static struct cdevsw nvme_ctrlr_cdevsw = { .d_version = D_VERSION, .d_flags = 0, .d_ioctl = nvme_ctrlr_ioctl }; int nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev) { struct make_dev_args md_args; uint32_t cap_lo; uint32_t cap_hi; uint32_t to, vs, pmrcap; uint8_t mpsmin; int status, timeout_period; ctrlr->dev = dev; mtx_init(&ctrlr->lock, "nvme ctrlr lock", NULL, MTX_DEF); if (bus_get_domain(dev, &ctrlr->domain) != 0) ctrlr->domain = 0; cap_lo = nvme_mmio_read_4(ctrlr, cap_lo); if (bootverbose) { device_printf(dev, "CapLo: 0x%08x: MQES %u%s%s%s%s, TO %u\n", cap_lo, NVME_CAP_LO_MQES(cap_lo), NVME_CAP_LO_CQR(cap_lo) ? ", CQR" : "", NVME_CAP_LO_AMS(cap_lo) ? ", AMS" : "", (NVME_CAP_LO_AMS(cap_lo) & 0x1) ? " WRRwUPC" : "", (NVME_CAP_LO_AMS(cap_lo) & 0x2) ? " VS" : "", NVME_CAP_LO_TO(cap_lo)); } cap_hi = nvme_mmio_read_4(ctrlr, cap_hi); if (bootverbose) { device_printf(dev, "CapHi: 0x%08x: DSTRD %u%s, CSS %x%s, " "MPSMIN %u, MPSMAX %u%s%s\n", cap_hi, NVME_CAP_HI_DSTRD(cap_hi), NVME_CAP_HI_NSSRS(cap_hi) ? ", NSSRS" : "", NVME_CAP_HI_CSS(cap_hi), NVME_CAP_HI_BPS(cap_hi) ? ", BPS" : "", NVME_CAP_HI_MPSMIN(cap_hi), NVME_CAP_HI_MPSMAX(cap_hi), NVME_CAP_HI_PMRS(cap_hi) ? ", PMRS" : "", NVME_CAP_HI_CMBS(cap_hi) ? ", CMBS" : ""); } if (bootverbose) { vs = nvme_mmio_read_4(ctrlr, vs); device_printf(dev, "Version: 0x%08x: %d.%d\n", vs, NVME_MAJOR(vs), NVME_MINOR(vs)); } if (bootverbose && NVME_CAP_HI_PMRS(cap_hi)) { pmrcap = nvme_mmio_read_4(ctrlr, pmrcap); device_printf(dev, "PMRCap: 0x%08x: BIR %u%s%s, PMRTU %u, " "PMRWBM %x, PMRTO %u%s\n", pmrcap, NVME_PMRCAP_BIR(pmrcap), NVME_PMRCAP_RDS(pmrcap) ? ", RDS" : "", NVME_PMRCAP_WDS(pmrcap) ? ", WDS" : "", NVME_PMRCAP_PMRTU(pmrcap), NVME_PMRCAP_PMRWBM(pmrcap), NVME_PMRCAP_PMRTO(pmrcap), NVME_PMRCAP_CMSS(pmrcap) ? ", CMSS" : ""); } ctrlr->dstrd = NVME_CAP_HI_DSTRD(cap_hi) + 2; mpsmin = NVME_CAP_HI_MPSMIN(cap_hi); ctrlr->min_page_size = 1 << (12 + mpsmin); /* Get ready timeout value from controller, in units of 500ms. */ to = NVME_CAP_LO_TO(cap_lo) + 1; ctrlr->ready_timeout_in_ms = to * 500; timeout_period = NVME_DEFAULT_TIMEOUT_PERIOD; TUNABLE_INT_FETCH("hw.nvme.timeout_period", &timeout_period); timeout_period = min(timeout_period, NVME_MAX_TIMEOUT_PERIOD); timeout_period = max(timeout_period, NVME_MIN_TIMEOUT_PERIOD); ctrlr->timeout_period = timeout_period; nvme_retry_count = NVME_DEFAULT_RETRY_COUNT; TUNABLE_INT_FETCH("hw.nvme.retry_count", &nvme_retry_count); ctrlr->enable_aborts = 0; TUNABLE_INT_FETCH("hw.nvme.enable_aborts", &ctrlr->enable_aborts); ctrlr->max_xfer_size = NVME_MAX_XFER_SIZE; if (nvme_ctrlr_construct_admin_qpair(ctrlr) != 0) return (ENXIO); /* * Create 2 threads for the taskqueue. The reset thread will block when * it detects that the controller has failed until all I/O has been * failed up the stack. The fail_req task needs to be able to run in * this case to finish the request failure for some cases. * * We could partially solve this race by draining the failed requeust * queue before proceding to free the sim, though nothing would stop * new I/O from coming in after we do that drain, but before we reach * cam_sim_free, so this big hammer is used instead. */ ctrlr->taskqueue = taskqueue_create("nvme_taskq", M_WAITOK, taskqueue_thread_enqueue, &ctrlr->taskqueue); taskqueue_start_threads(&ctrlr->taskqueue, 2, PI_DISK, "nvme taskq"); ctrlr->is_resetting = 0; ctrlr->is_initialized = 0; ctrlr->notification_sent = 0; TASK_INIT(&ctrlr->reset_task, 0, nvme_ctrlr_reset_task, ctrlr); TASK_INIT(&ctrlr->fail_req_task, 0, nvme_ctrlr_fail_req_task, ctrlr); STAILQ_INIT(&ctrlr->fail_req); ctrlr->is_failed = false; make_dev_args_init(&md_args); md_args.mda_devsw = &nvme_ctrlr_cdevsw; md_args.mda_uid = UID_ROOT; md_args.mda_gid = GID_WHEEL; md_args.mda_mode = 0600; md_args.mda_unit = device_get_unit(dev); md_args.mda_si_drv1 = (void *)ctrlr; status = make_dev_s(&md_args, &ctrlr->cdev, "nvme%d", device_get_unit(dev)); if (status != 0) return (ENXIO); return (0); } void nvme_ctrlr_destruct(struct nvme_controller *ctrlr, device_t dev) { int gone, i; ctrlr->is_dying = true; if (ctrlr->resource == NULL) goto nores; if (!mtx_initialized(&ctrlr->adminq.lock)) goto noadminq; /* * Check whether it is a hot unplug or a clean driver detach. * If device is not there any more, skip any shutdown commands. */ gone = (nvme_mmio_read_4(ctrlr, csts) == NVME_GONE); if (gone) nvme_ctrlr_fail(ctrlr); else nvme_notify_fail_consumers(ctrlr); for (i = 0; i < NVME_MAX_NAMESPACES; i++) nvme_ns_destruct(&ctrlr->ns[i]); if (ctrlr->cdev) destroy_dev(ctrlr->cdev); if (ctrlr->is_initialized) { if (!gone) { if (ctrlr->hmb_nchunks > 0) nvme_ctrlr_hmb_enable(ctrlr, false, false); nvme_ctrlr_delete_qpairs(ctrlr); } nvme_ctrlr_hmb_free(ctrlr); } if (ctrlr->ioq != NULL) { for (i = 0; i < ctrlr->num_io_queues; i++) nvme_io_qpair_destroy(&ctrlr->ioq[i]); free(ctrlr->ioq, M_NVME); } nvme_admin_qpair_destroy(&ctrlr->adminq); /* * Notify the controller of a shutdown, even though this is due to * a driver unload, not a system shutdown (this path is not invoked * during shutdown). This ensures the controller receives a * shutdown notification in case the system is shutdown before * reloading the driver. */ if (!gone) nvme_ctrlr_shutdown(ctrlr); if (!gone) nvme_ctrlr_disable(ctrlr); noadminq: if (ctrlr->taskqueue) taskqueue_free(ctrlr->taskqueue); if (ctrlr->tag) bus_teardown_intr(ctrlr->dev, ctrlr->res, ctrlr->tag); if (ctrlr->res) bus_release_resource(ctrlr->dev, SYS_RES_IRQ, rman_get_rid(ctrlr->res), ctrlr->res); if (ctrlr->bar4_resource != NULL) { bus_release_resource(dev, SYS_RES_MEMORY, ctrlr->bar4_resource_id, ctrlr->bar4_resource); } bus_release_resource(dev, SYS_RES_MEMORY, ctrlr->resource_id, ctrlr->resource); nores: mtx_destroy(&ctrlr->lock); } void nvme_ctrlr_shutdown(struct nvme_controller *ctrlr) { uint32_t cc; uint32_t csts; int timeout; cc = nvme_mmio_read_4(ctrlr, cc); cc &= ~(NVME_CC_REG_SHN_MASK << NVME_CC_REG_SHN_SHIFT); cc |= NVME_SHN_NORMAL << NVME_CC_REG_SHN_SHIFT; nvme_mmio_write_4(ctrlr, cc, cc); timeout = ticks + (ctrlr->cdata.rtd3e == 0 ? 5 * hz : ((uint64_t)ctrlr->cdata.rtd3e * hz + 999999) / 1000000); while (1) { csts = nvme_mmio_read_4(ctrlr, csts); if (csts == NVME_GONE) /* Hot unplug. */ break; if (NVME_CSTS_GET_SHST(csts) == NVME_SHST_COMPLETE) break; if (timeout - ticks < 0) { nvme_printf(ctrlr, "shutdown timeout\n"); break; } pause("nvmeshut", 1); } } void nvme_ctrlr_submit_admin_request(struct nvme_controller *ctrlr, struct nvme_request *req) { nvme_qpair_submit_request(&ctrlr->adminq, req); } void nvme_ctrlr_submit_io_request(struct nvme_controller *ctrlr, struct nvme_request *req) { struct nvme_qpair *qpair; qpair = &ctrlr->ioq[QP(ctrlr, curcpu)]; nvme_qpair_submit_request(qpair, req); } device_t nvme_ctrlr_get_device(struct nvme_controller *ctrlr) { return (ctrlr->dev); } const struct nvme_controller_data * nvme_ctrlr_get_data(struct nvme_controller *ctrlr) { return (&ctrlr->cdata); } int nvme_ctrlr_suspend(struct nvme_controller *ctrlr) { int to = hz; /* * Can't touch failed controllers, so it's already suspended. */ if (ctrlr->is_failed) return (0); /* * We don't want the reset taskqueue running, since it does similar * things, so prevent it from running after we start. Wait for any reset * that may have been started to complete. The reset process we follow * will ensure that any new I/O will queue and be given to the hardware * after we resume (though there should be none). */ while (atomic_cmpset_32(&ctrlr->is_resetting, 0, 1) == 0 && to-- > 0) pause("nvmesusp", 1); if (to <= 0) { nvme_printf(ctrlr, "Competing reset task didn't finish. Try again later.\n"); return (EWOULDBLOCK); } if (ctrlr->hmb_nchunks > 0) nvme_ctrlr_hmb_enable(ctrlr, false, false); /* * Per Section 7.6.2 of NVMe spec 1.4, to properly suspend, we need to * delete the hardware I/O queues, and then shutdown. This properly * flushes any metadata the drive may have stored so it can survive * having its power removed and prevents the unsafe shutdown count from * incriminating. Once we delete the qpairs, we have to disable them * before shutting down. */ nvme_ctrlr_delete_qpairs(ctrlr); nvme_ctrlr_disable_qpairs(ctrlr); nvme_ctrlr_shutdown(ctrlr); return (0); } int nvme_ctrlr_resume(struct nvme_controller *ctrlr) { /* * Can't touch failed controllers, so nothing to do to resume. */ if (ctrlr->is_failed) return (0); - /* - * Have to reset the hardware twice, just like we do on attach. See - * nmve_attach() for why. - */ if (nvme_ctrlr_hw_reset(ctrlr) != 0) goto fail; +#ifdef NVME_2X_RESET + /* + * Prior to FreeBSD 13.1, FreeBSD's nvme driver reset the hardware twice + * to get it into a known good state. However, the hardware's state is + * good and we don't need to do this for proper functioning. + */ if (nvme_ctrlr_hw_reset(ctrlr) != 0) goto fail; +#endif /* * Now that we've reset the hardware, we can restart the controller. Any * I/O that was pending is requeued. Any admin commands are aborted with * an error. Once we've restarted, take the controller out of reset. */ nvme_ctrlr_start(ctrlr, true); (void)atomic_cmpset_32(&ctrlr->is_resetting, 1, 0); return (0); fail: /* * Since we can't bring the controller out of reset, announce and fail * the controller. However, we have to return success for the resume * itself, due to questionable APIs. */ nvme_printf(ctrlr, "Failed to reset on resume, failing.\n"); nvme_ctrlr_fail(ctrlr); (void)atomic_cmpset_32(&ctrlr->is_resetting, 1, 0); return (0); }