diff --git a/sys/conf/NOTES b/sys/conf/NOTES index 6de8eab73027..55c1798bbffc 100644 --- a/sys/conf/NOTES +++ b/sys/conf/NOTES @@ -1,2866 +1,2867 @@ # # NOTES -- Lines that can be cut/pasted into kernel and hints configs. # # Lines that begin with 'device', 'options', 'machine', 'ident', 'maxusers', # 'makeoptions', 'hints', etc. go into the kernel configuration that you # run config(8) with. # # Lines that begin with 'envvar hint.' should go into your hints file. # See /boot/device.hints and/or the 'hints' config(8) directive. # # Please use ``make LINT'' to create an old-style LINT file if you want to # do kernel test-builds. # # This file contains machine independent kernel configuration notes. For # machine dependent notes, look in /sys/<arch>/conf/NOTES. # # # NOTES conventions and style guide: # # Large block comments should begin and end with a line containing only a # comment character. # # To describe a particular object, a block comment (if it exists) should # come first. Next should come device, options, and hints lines in that # order. All device and option lines must be described by a comment that # doesn't just expand the device or option name. Use only a concise # comment on the same line if possible. Very detailed descriptions of # devices and subsystems belong in man pages. # # A space followed by a tab separates 'options' from an option name. Two # spaces followed by a tab separate 'device' from a device name. Comments # after an option or device should use one space after the comment character. # To comment out a negative option that disables code and thus should not be # enabled for LINT builds, precede 'options' with "#!". # # # This is the ``identification'' of the kernel. Usually this should # be the same as the name of your kernel. # ident LINT # # The `maxusers' parameter controls the static sizing of a number of # internal system tables by a formula defined in subr_param.c. 
# Omitting this parameter or setting it to 0 will cause the system to # auto-size based on physical memory. # maxusers 10 # To statically compile in device wiring instead of /boot/device.hints #hints "LINT.hints" # Default places to look for devices. # Use the following to compile in values accessible to the kernel # through getenv() (or kenv(1) in userland). The format of the file # is 'variable=value', see kenv(1) # #env "LINT.env" # # The `makeoptions' parameter allows variables to be passed to the # generated Makefile in the build area. # # CONF_CFLAGS gives some extra compiler flags that are added to ${CFLAGS} # after most other flags. Here we use it to inhibit use of non-optimal # gcc built-in functions (e.g., memcmp). # # DEBUG happens to be magic. # The following is equivalent to 'config -g KERNELNAME' and creates # 'kernel.debug' compiled with -g debugging as well as a normal # 'kernel'. Use 'make install.debug' to install the debug kernel # but that isn't normally necessary as the debug symbols are not loaded # by the kernel and are not useful there anyway. # # KERNEL can be overridden so that you can change the default name of your # kernel. # # MODULES_OVERRIDE can be used to limit modules built to a specific list. # makeoptions CONF_CFLAGS=-fno-builtin #Don't allow use of memcmp, etc. #makeoptions DEBUG=-g #Build kernel with gdb(1) debug symbols #makeoptions KERNEL=foo #Build kernel "foo" and install "/foo" # Only build ext2fs module plus those parts of the sound system I need. #makeoptions MODULES_OVERRIDE="ext2fs sound/sound sound/driver/maestro3" makeoptions DESTDIR=/tmp # # FreeBSD processes are subject to certain limits to their consumption # of system resources. See getrlimit(2) for more details. Each # resource limit has two values, a "soft" limit and a "hard" limit. # The soft limits can be modified during normal system operation, but # the hard limits are set at boot time. Their default values are # in sys/<ARCH>/include/vmparam.h. 
There are two ways to change them: # # 1. Set the values at kernel build time. The options below are one # way to allow that limit to grow to 1GB. They can be increased # further by changing the parameters: # # 2. In /boot/loader.conf, set the tunables kern.maxswzone, # kern.maxbcache, kern.maxtsiz, kern.dfldsiz, kern.maxdsiz, # kern.dflssiz, kern.maxssiz and kern.sgrowsiz. # # The options in /boot/loader.conf override anything in the kernel # configuration file. See the function init_param1 in # sys/kern/subr_param.c for more details. # options MAXDSIZ=(1024UL*1024*1024) options MAXSSIZ=(128UL*1024*1024) options DFLDSIZ=(1024UL*1024*1024) # # BLKDEV_IOSIZE sets the default block size used in user block # device I/O. Note that this value will be overridden by the label # when specifying a block device from a label with a non-0 # partition blocksize. The default is PAGE_SIZE. # options BLKDEV_IOSIZE=8192 # # MAXPHYS and DFLTPHYS # # These are the maximal and safe 'raw' I/O block device access sizes. # Reads and writes will be split into MAXPHYS chunks for known good # devices and DFLTPHYS for the rest. Some applications have better # performance with larger raw I/O access sizes. Note that certain VM # parameters are derived from these values and making them too large # can make an unbootable kernel. # # The defaults are 64K and 128K respectively. options DFLTPHYS=(64*1024) options MAXPHYS=(128*1024) # This allows you to actually store this configuration file into # the kernel binary itself. See config(8) for more details. # options INCLUDE_CONFIG_FILE # Include this file in kernel # # Compile-time defaults for various boot parameters # options BOOTVERBOSE=1 options BOOTHOWTO=RB_MULTIPLE # # Compile-time defaults for dmesg boot tagging # # Default boot tag; may use 'kern.boot_tag' loader tunable to override. The # current boot's tag is also exposed via the 'kern.boot_tag' sysctl. 
options BOOT_TAG=\"\" # Maximum boot tag size the kernel's static buffer should accommodate. Maximum # size for both BOOT_TAG and the associated tunable. options BOOT_TAG_SZ=32 options GEOM_BDE # Disk encryption. options GEOM_CACHE # Disk cache. options GEOM_CONCAT # Disk concatenation. options GEOM_ELI # Disk encryption. options GEOM_GATE # Userland services. options GEOM_JOURNAL # Journaling. options GEOM_LABEL # Providers labelization. options GEOM_LINUX_LVM # Linux LVM2 volumes options GEOM_MAP # Map based partitioning options GEOM_MIRROR # Disk mirroring. options GEOM_MULTIPATH # Disk multipath options GEOM_NOP # Test class. options GEOM_PART_APM # Apple partitioning options GEOM_PART_BSD # BSD disklabel options GEOM_PART_BSD64 # BSD disklabel64 options GEOM_PART_EBR # Extended Boot Records options GEOM_PART_GPT # GPT partitioning options GEOM_PART_LDM # Logical Disk Manager options GEOM_PART_MBR # MBR partitioning options GEOM_RAID # Soft RAID functionality. options GEOM_RAID3 # RAID3 functionality. options GEOM_SHSEC # Shared secret. options GEOM_STRIPE # Disk striping. options GEOM_UZIP # Read-only compressed disks options GEOM_VINUM # Vinum logical volume manager options GEOM_VIRSTOR # Virtual storage. options GEOM_ZERO # Performance testing helper. # # The root device and filesystem type can be compiled in; # this provides a fallback option if the root device cannot # be correctly guessed by the bootstrap code, or an override if # the RB_DFLTROOT flag (-r) is specified when booting the kernel. # options ROOTDEVNAME=\"ufs:da0s2e\" ##################################################################### # Scheduler options: # # Specifying one of SCHED_4BSD or SCHED_ULE is mandatory. These options # select which scheduler is compiled in. # # SCHED_4BSD is the historical, proven, BSD scheduler. It has a global run # queue and no CPU affinity which makes it suboptimal for SMP. It has very # good interactivity and priority selection. 
# # SCHED_ULE provides significant performance advantages over 4BSD on many # workloads on SMP machines. It supports cpu-affinity, per-cpu runqueues # and scheduler locks. It also has a stronger notion of interactivity # which leads to better responsiveness even on uniprocessor machines. This # is the default scheduler. # # SCHED_STATS is a debugging option which keeps some stats in the sysctl # tree at 'kern.sched.stats' and is useful for debugging scheduling decisions. # options SCHED_4BSD options SCHED_STATS #options SCHED_ULE ##################################################################### # SMP OPTIONS: # # SMP enables building of a Symmetric MultiProcessor Kernel. # Mandatory: options SMP # Symmetric MultiProcessor Kernel # EARLY_AP_STARTUP releases the Application Processors earlier in the # kernel startup process (before devices are probed) rather than at the # end. This is a temporary option for use during the transition from # late to early AP startup. options EARLY_AP_STARTUP # MAXCPU defines the maximum number of CPUs that can boot in the system. # A default value should be already present, for every architecture. options MAXCPU=32 # NUMA enables use of Non-Uniform Memory Access policies in various kernel # subsystems. options NUMA # MAXMEMDOM defines the maximum number of memory domains that can boot in the # system. A default value should already be defined by every architecture. options MAXMEMDOM=2 # ADAPTIVE_MUTEXES changes the behavior of blocking mutexes to spin # if the thread that currently owns the mutex is executing on another # CPU. This behavior is enabled by default, so this option can be used # to disable it. options NO_ADAPTIVE_MUTEXES # ADAPTIVE_RWLOCKS changes the behavior of reader/writer locks to spin # if the thread that currently owns the rwlock is executing on another # CPU. This behavior is enabled by default, so this option can be used # to disable it. 
options NO_ADAPTIVE_RWLOCKS # ADAPTIVE_SX changes the behavior of sx locks to spin if the thread that # currently owns the sx lock is executing on another CPU. # This behavior is enabled by default, so this option can be used to # disable it. options NO_ADAPTIVE_SX # MUTEX_NOINLINE forces mutex operations to call functions to perform each # operation rather than inlining the simple cases. This can be used to # shrink the size of the kernel text segment. Note that this behavior is # already implied by the INVARIANT_SUPPORT, INVARIANTS, KTR, LOCK_PROFILING, # and WITNESS options. options MUTEX_NOINLINE # RWLOCK_NOINLINE forces rwlock operations to call functions to perform each # operation rather than inlining the simple cases. This can be used to # shrink the size of the kernel text segment. Note that this behavior is # already implied by the INVARIANT_SUPPORT, INVARIANTS, KTR, LOCK_PROFILING, # and WITNESS options. options RWLOCK_NOINLINE # SX_NOINLINE forces sx lock operations to call functions to perform each # operation rather than inlining the simple cases. This can be used to # shrink the size of the kernel text segment. Note that this behavior is # already implied by the INVARIANT_SUPPORT, INVARIANTS, KTR, LOCK_PROFILING, # and WITNESS options. options SX_NOINLINE # SMP Debugging Options: # # CALLOUT_PROFILING enables rudimentary profiling of the callwheel data # structure used as backend in callout(9). # PREEMPTION allows the threads that are in the kernel to be preempted by # higher priority [interrupt] threads. It helps with interactivity # and allows interrupt threads to run sooner rather than waiting. # FULL_PREEMPTION instructs the kernel to preempt non-realtime kernel # threads. Its sole use is to expose race conditions and other # bugs during development. Enabling this option will reduce # performance and increase the frequency of kernel panics by # design. If you aren't sure that you need it then you don't. # Relies on the PREEMPTION option. 
DON'T TURN THIS ON. # SLEEPQUEUE_PROFILING enables rudimentary profiling of the hash table # used to hold active sleep queues as well as sleep wait message # frequency. # TURNSTILE_PROFILING enables rudimentary profiling of the hash table # used to hold active lock queues. # UMTX_PROFILING enables rudimentary profiling of the hash table used # to hold active lock queues. # WITNESS enables the witness code which detects deadlocks and cycles # during locking operations. # WITNESS_KDB causes the witness code to drop into the kernel debugger if # a lock hierarchy violation occurs or if locks are held when going to # sleep. # WITNESS_SKIPSPIN disables the witness checks on spin mutexes. options PREEMPTION options FULL_PREEMPTION options WITNESS options WITNESS_KDB options WITNESS_SKIPSPIN # LOCK_PROFILING - Profiling locks. See LOCK_PROFILING(9) for details. options LOCK_PROFILING # Set the number of buffers and the hash size. The hash size MUST be larger # than the number of buffers. Hash size should be prime. options MPROF_BUFFERS="1536" options MPROF_HASH_SIZE="1543" # Profiling for the callout(9) backend. options CALLOUT_PROFILING # Profiling for internal hash tables. options SLEEPQUEUE_PROFILING options TURNSTILE_PROFILING options UMTX_PROFILING # Debugging traces for epoch(9) misuse options EPOCH_TRACE ##################################################################### # COMPATIBILITY OPTIONS # Old tty interface. options COMPAT_43TTY # Note that as a general rule, COMPAT_FREEBSD<n> depends on # COMPAT_FREEBSD<n+1>, COMPAT_FREEBSD<n+2>, etc. 
# Enable FreeBSD4 compatibility syscalls options COMPAT_FREEBSD4 # Enable FreeBSD5 compatibility syscalls options COMPAT_FREEBSD5 # Enable FreeBSD6 compatibility syscalls options COMPAT_FREEBSD6 # Enable FreeBSD7 compatibility syscalls options COMPAT_FREEBSD7 # Enable FreeBSD9 compatibility syscalls options COMPAT_FREEBSD9 # Enable FreeBSD10 compatibility syscalls options COMPAT_FREEBSD10 # Enable FreeBSD11 compatibility syscalls options COMPAT_FREEBSD11 # Enable FreeBSD12 compatibility syscalls options COMPAT_FREEBSD12 # Enable FreeBSD13 compatibility syscalls options COMPAT_FREEBSD13 # Enable Linux Kernel Programming Interface options COMPAT_LINUXKPI # # These three options provide support for System V Interface # Definition-style interprocess communication, in the form of shared # memory, semaphores, and message queues, respectively. # options SYSVSHM options SYSVSEM options SYSVMSG ##################################################################### # DEBUGGING OPTIONS # # Compile with kernel debugger related code. # options KDB # # Print a stack trace of the current thread on the console for a panic. # options KDB_TRACE # # Don't enter the debugger for a panic. Intended for unattended operation # where you may want to enter the debugger from the console, but still want # the machine to recover from a panic. # options KDB_UNATTENDED # # Enable the ddb debugger backend. # options DDB # # Print the numerical value of symbols in addition to the symbolic # representation. # options DDB_NUMSYM # # Enable the remote gdb debugger backend. # options GDB # # Trashes list pointers when they become invalid (i.e., the element is # removed from a list). Relatively inexpensive to enable. # options QUEUE_MACRO_DEBUG_TRASH # # Stores information about the last caller to modify the list object # in the list object. Requires additional memory overhead. 
# #options QUEUE_MACRO_DEBUG_TRACE # # SYSCTL_DEBUG enables a 'sysctl' debug tree that can be used to dump the # contents of the registered sysctl nodes on the console. It is disabled by # default because it generates excessively verbose console output that can # interfere with serial console operation. # options SYSCTL_DEBUG # # Enable textdump by default, this disables kernel core dumps. # options TEXTDUMP_PREFERRED # # Enable extra debug messages while performing textdumps. # options TEXTDUMP_VERBOSE # # NO_SYSCTL_DESCR omits the sysctl node descriptions to save space in the # resulting kernel. options NO_SYSCTL_DESCR # # MALLOC_DEBUG_MAXZONES enables multiple uma zones for malloc(9) # allocations that are smaller than a page. The purpose is to isolate # different malloc types into hash classes, so that any buffer # overruns or use-after-free will usually only affect memory from # malloc types in that hash class. This is purely a debugging tool; # by varying the hash function and tracking which hash class was # corrupted, the intersection of the hash classes from each instance # will point to a single malloc type that is being misused. At this # point inspection or memguard(9) can be used to catch the offending # code. # options MALLOC_DEBUG_MAXZONES=8 # # DEBUG_MEMGUARD builds and enables memguard(9), a replacement allocator # for the kernel used to detect modify-after-free scenarios. See the # memguard(9) man page for more information on usage. # options DEBUG_MEMGUARD # # DEBUG_REDZONE enables buffer underflows and buffer overflows detection for # malloc(9). # options DEBUG_REDZONE # # EARLY_PRINTF enables support for calling a special printf (eprintf) # very early in the kernel (before cn_init() has been called). This # should only be used for debugging purposes early in boot. Normally, # it is not defined. It is commented out here because this feature # isn't generally available. And the required eputc() isn't defined. 
# #options EARLY_PRINTF # # KTRACE enables the system-call tracing facility ktrace(2). To be more # SMP-friendly, KTRACE uses a worker thread to process most trace events # asynchronously to the thread generating the event. This requires a # pre-allocated store of objects representing trace events. The # KTRACE_REQUEST_POOL option specifies the initial size of this store. # The size of the pool can be adjusted both at boottime and runtime via # the kern.ktrace_request_pool tunable and sysctl. # options KTRACE #kernel tracing options KTRACE_REQUEST_POOL=101 # # KTR is a kernel tracing facility imported from BSD/OS. It is # enabled with the KTR option. KTR_ENTRIES defines the number of # entries in the circular trace buffer; it may be an arbitrary number. # KTR_BOOT_ENTRIES defines the number of entries during the early boot, # before malloc(9) is functional. # KTR_COMPILE defines the mask of events to compile into the kernel as # defined by the KTR_* constants in <sys/ktr.h>. KTR_MASK defines the # initial value of the ktr_mask variable which determines at runtime # what events to trace. KTR_CPUMASK determines which CPU's log # events, with bit X corresponding to CPU X. The layout of the string # passed as KTR_CPUMASK must match a series of bitmasks each of them # separated by the "," character (ie: # KTR_CPUMASK=0xAF,0xFFFFFFFFFFFFFFFF). KTR_VERBOSE enables # dumping of KTR events to the console by default. This functionality # can be toggled via the debug.ktr_verbose sysctl and defaults to off # if KTR_VERBOSE is not defined. See ktr(4) and ktrdump(8) for details. # options KTR options KTR_BOOT_ENTRIES=1024 options KTR_ENTRIES=(128*1024) options KTR_COMPILE=(KTR_ALL) options KTR_MASK=KTR_INTR options KTR_CPUMASK=0x3 options KTR_VERBOSE # # ALQ(9) is a facility for the asynchronous queuing of records from the kernel # to a vnode, and is employed by services such as ktr(4) to produce trace # files based on a kernel event stream. 
Records are written asynchronously # in a worker thread. # options ALQ options KTR_ALQ # # The INVARIANTS option is used in a number of source files to enable # extra sanity checking of internal structures. This support is not # enabled by default because of the extra time it would take to check # for these conditions, which can only occur as a result of # programming errors. # options INVARIANTS # # The INVARIANT_SUPPORT option makes us compile in support for # verifying some of the internal structures. It is a prerequisite for # 'INVARIANTS', as enabling 'INVARIANTS' will make these functions be # called. The intent is that you can set 'INVARIANTS' for single # source files (by changing the source file or specifying it on the # command line) if you have 'INVARIANT_SUPPORT' enabled. Also, if you # wish to build a kernel module with 'INVARIANTS', then adding # 'INVARIANT_SUPPORT' to your kernel will provide all the necessary # infrastructure without the added overhead. # options INVARIANT_SUPPORT # # The KASSERT_PANIC_OPTIONAL option allows kasserts to fire without # necessarily inducing a panic. Panic is the default behavior, but # runtime options can configure it either entirely off, or off with a # limit. # options KASSERT_PANIC_OPTIONAL # # The DIAGNOSTIC option is used to enable extra debugging information # and invariants checking. The added checks are too expensive or noisy # for an INVARIANTS kernel and thus are disabled by default. It is # expected that a kernel configured with DIAGNOSTIC will also have the # INVARIANTS option enabled. # options DIAGNOSTIC # # REGRESSION causes optional kernel interfaces necessary only for regression # testing to be enabled. These interfaces may constitute security risks # when enabled, as they permit processes to easily modify aspects of the # run-time environment to reproduce unlikely or unusual (possibly normally # impossible) scenarios. 
# options REGRESSION # # This option lets some drivers co-exist that can't co-exist in a running # system. This is used to be able to compile all kernel code in one go for # quality assurance purposes (like this file, which the option takes its name # from.) # options COMPILING_LINT # # STACK enables the stack(9) facility, allowing the capture of kernel stack # for the purpose of procinfo(1), etc. stack(9) will also be compiled in # automatically if DDB(4) is compiled into the kernel. # options STACK # # The NUM_CORE_FILES option specifies the limit for the number of core # files generated by a particular process, when the core file format # specifier includes the %I pattern. Since we only have 1 character for # the core count in the format string, meaning the range will be 0-9, the # maximum value allowed for this option is 10. # This core file limit can be adjusted at runtime via the debug.ncores # sysctl. # options NUM_CORE_FILES=5 # # The TSLOG option enables timestamped logging of events, especially # function entries/exits, in order to track the time spent by the kernel. # In particular, this is useful when investigating the early boot process, # before it is possible to use more sophisticated tools like DTrace. # The TSLOGSIZE option controls the size of the (preallocated, fixed # length) buffer used for storing these events (default: 262144 records). # The TSLOG_PAGEZERO option enables TSLOG of pmap_zero_page; this must be # enabled separately since it typically generates too many records to be # useful. # # For security reasons the TSLOG option should not be enabled on systems # used in production. # options TSLOG options TSLOGSIZE=262144 ##################################################################### # PERFORMANCE MONITORING OPTIONS # # The hwpmc driver that allows the use of in-CPU performance monitoring # counters for performance monitoring. 
The base kernel needs to be configured # with the 'options' line, while the hwpmc device can be either compiled # in or loaded as a loadable kernel module. # # Additional configuration options may be required on specific architectures, # please see hwpmc(4). device hwpmc # Driver (also a loadable module) options HWPMC_DEBUG options HWPMC_HOOKS # Other necessary kernel hooks ##################################################################### # NETWORKING OPTIONS # # Protocol families # options INET #Internet communications protocols options INET6 #IPv6 communications protocols # # Note if you include INET/INET6 or both options # You *must* define at least one of the congestion control # options or the compile will fail. GENERIC defines # options CC_CUBIC. You may want to specify a default # if multiple congestion controls are compiled in. # The string in default is the name of the # cc module as it would appear in the sysctl for # setting the default. The code defines CUBIC # as default, or the sole cc_module compiled in. # options CC_CDG options CC_CHD options CC_CUBIC options CC_DCTCP options CC_HD options CC_HTCP options CC_NEWRENO options CC_VEGAS options CC_DEFAULT=\"cubic\" options RATELIMIT # TX rate limiting support options ROUTETABLES=2 # allocated fibs up to 65536. default is 1. # but that would be a bad idea as they are large. options TCP_OFFLOAD # TCP offload support. options TCP_RFC7413 # TCP Fast Open options TCPHPTS # In order to enable IPSEC you MUST also add device crypto to # your kernel configuration options IPSEC #IP security (requires device crypto) # Option IPSEC_SUPPORT does not enable IPsec, but makes it possible to # load it as a kernel module. You still MUST add device crypto to your kernel # configuration. options IPSEC_SUPPORT #options IPSEC_DEBUG #debug for IP security # Alternative TCP stacks options TCP_BBR options TCP_RACK # TLS framing and encryption/decryption of data over TCP sockets. 
options KERN_TLS # TLS transmit and receive offload # Netlink kernel/user<>kernel/user messaging interface options NETLINK # # SMB/CIFS requester # NETSMB enables support for SMB protocol, it requires LIBMCHAIN and LIBICONV # options. options NETSMB #SMB/CIFS requester # mchain library. It can be either loaded as KLD or compiled into kernel options LIBMCHAIN # libalias library, performing NAT options LIBALIAS # # SCTP is a NEW transport protocol defined by # RFC2960 updated by RFC3309 and RFC3758.. and # soon to have a new base RFC and many many more # extensions. This release supports all the extensions # including many drafts (most about to become RFC's). # It is the reference implementation of SCTP # and is quite well tested. # # Note YOU MUST have both INET and INET6 defined. # You don't have to enable V6, but SCTP is # dual stacked and so far we have not torn apart # the V6 and V4.. since an association can span # both a V6 and V4 address at the SAME time :-) # # The SCTP_SUPPORT option does not enable SCTP, but provides the necessary # support for loading SCTP as a loadable kernel module. # options SCTP options SCTP_SUPPORT # There are bunches of options: # this one turns on all sorts of # nasty printing that you can # do. It's all controlled by a # bit mask (settable by socket opt and # by sysctl). Including will not cause # logging until you set the bits.. but it # can be quite verbose.. so without this # option we don't do any of the tests for # bits and prints.. which makes the code run # faster.. if you are not debugging don't use. options SCTP_DEBUG # # All the options after that turn on specific types of # logging. You can monitor CWND growth, flight size # and all sorts of things. Go look at the code and # see. I have used this to produce interesting # charts and graphs as well :-> # # I have not yet committed the tools to get and print # the logs, I will do that eventually .. 
before then # if you want them send me an email rrs@freebsd.org # You basically must have ktr(4) enabled for these # and you then set the sysctl to turn on/off various # logging bits. Use ktrdump(8) to pull the log and run # it through a display program.. and graphs and other # things too. # options SCTP_LOCK_LOGGING options SCTP_MBUF_LOGGING options SCTP_MBCNT_LOGGING options SCTP_PACKET_LOGGING options SCTP_LTRACE_CHUNKS options SCTP_LTRACE_ERRORS # OpenFabrics Enterprise Distribution (Infiniband). options OFED options OFED_DEBUG_INIT # Sockets Direct Protocol options SDP options SDP_DEBUG # IP over Infiniband options IPOIB options IPOIB_DEBUG options IPOIB_CM # altq(9). Enable the base part of the hooks with the ALTQ option. # Individual disciplines must be built into the base system and can not be # loaded as modules at this point. ALTQ requires a stable TSC so if yours is # broken or changes with CPU throttling then you must also have the ALTQ_NOPCC # option. options ALTQ options ALTQ_CBQ # Class Based Queueing options ALTQ_RED # Random Early Detection options ALTQ_RIO # RED In/Out options ALTQ_CODEL # CoDel Active Queueing options ALTQ_HFSC # Hierarchical Packet Scheduler options ALTQ_FAIRQ # Fair Packet Scheduler options ALTQ_CDNR # Traffic conditioner options ALTQ_PRIQ # Priority Queueing options ALTQ_NOPCC # Required if the TSC is unusable options ALTQ_DEBUG # netgraph(4). Enable the base netgraph code with the NETGRAPH option. # Individual node types can be enabled with the corresponding option # listed below; however, this is not strictly necessary as netgraph # will automatically load the corresponding KLD module if the node type # is not already compiled into the kernel. Each type below has a # corresponding man page, e.g., ng_async(8). 
options NETGRAPH # netgraph(4) system options NETGRAPH_DEBUG # enable extra debugging, this # affects netgraph(4) and nodes # Node types options NETGRAPH_ASYNC options NETGRAPH_BLUETOOTH # ng_bluetooth(4) options NETGRAPH_BLUETOOTH_HCI # ng_hci(4) options NETGRAPH_BLUETOOTH_L2CAP # ng_l2cap(4) options NETGRAPH_BLUETOOTH_SOCKET # ng_btsocket(4) options NETGRAPH_BLUETOOTH_UBT # ng_ubt(4) options NETGRAPH_BLUETOOTH_UBTBCMFW # ubtbcmfw(4) options NETGRAPH_BPF options NETGRAPH_BRIDGE options NETGRAPH_CAR options NETGRAPH_CHECKSUM options NETGRAPH_CISCO options NETGRAPH_DEFLATE options NETGRAPH_DEVICE options NETGRAPH_ECHO options NETGRAPH_EIFACE options NETGRAPH_ETHER options NETGRAPH_FRAME_RELAY options NETGRAPH_GIF options NETGRAPH_GIF_DEMUX options NETGRAPH_HOLE options NETGRAPH_IFACE options NETGRAPH_IP_INPUT options NETGRAPH_IPFW options NETGRAPH_KSOCKET options NETGRAPH_L2TP options NETGRAPH_LMI options NETGRAPH_MPPC_COMPRESSION options NETGRAPH_MPPC_ENCRYPTION options NETGRAPH_NETFLOW options NETGRAPH_NAT options NETGRAPH_ONE2MANY options NETGRAPH_PATCH options NETGRAPH_PIPE options NETGRAPH_PPP options NETGRAPH_PPPOE options NETGRAPH_PPTPGRE options NETGRAPH_PRED1 options NETGRAPH_RFC1490 options NETGRAPH_SOCKET options NETGRAPH_SPLIT options NETGRAPH_TAG options NETGRAPH_TCPMSS options NETGRAPH_TEE options NETGRAPH_UI options NETGRAPH_VJC options NETGRAPH_VLAN # Network stack virtualization. options VIMAGE options VNET_DEBUG # debug for VIMAGE # # Network interfaces: # The `loop' device is MANDATORY when networking is enabled. device loop # The `ether' device provides generic code to handle # Ethernets; it is MANDATORY when an Ethernet device driver is # configured. device ether # The `vlan' device implements the VLAN tagging of Ethernet frames # according to IEEE 802.1Q. device vlan # The `vxlan' device implements the VXLAN encapsulation of Ethernet # frames in UDP packets according to RFC7348. 
device vxlan # The `wlan' device provides generic code to support 802.11 # drivers, including host AP mode; it is MANDATORY for the wi, # and ath drivers and will eventually be required by all 802.11 drivers. device wlan options IEEE80211_DEBUG #enable debugging msgs options IEEE80211_DEBUG_REFCNT options IEEE80211_SUPPORT_MESH #enable 802.11s D3.0 support options IEEE80211_SUPPORT_TDMA #enable TDMA support # The `wlan_wep', `wlan_tkip', and `wlan_ccmp' devices provide # support for WEP, TKIP, and AES-CCMP crypto protocols optionally # used with 802.11 devices that depend on the `wlan' module. device wlan_wep device wlan_ccmp device wlan_tkip # The `wlan_xauth' device provides support for external (i.e. user-mode) # authenticators for use with 802.11 drivers that use the `wlan' # module and support 802.1x and/or WPA security protocols. device wlan_xauth # The `wlan_acl' device provides a MAC-based access control mechanism # for use with 802.11 drivers operating in ap mode and using the # `wlan' module. # The 'wlan_amrr' device provides AMRR transmit rate control algorithm device wlan_acl device wlan_amrr # The `bpf' device enables the Berkeley Packet Filter. Be # aware of the legal and administrative consequences of enabling this # option. DHCP requires bpf. device bpf # The `netmap' device implements memory-mapped access to network # devices from userspace, enabling wire-speed packet capture and # generation even at 10Gbit/s. Requires support in the device # driver. Supported drivers are ixgbe, e1000, re. device netmap # The `disc' device implements a minimal network interface, # which throws away all packets sent and never receives any. It is # included for testing and benchmarking purposes. device disc # The `epair' device implements a virtual back-to-back connected Ethernet # like interface pair. device epair # The `edsc' device implements a minimal Ethernet interface, # which discards all packets sent and receives none. 
device edsc # The `tuntap' device implements (user-)ppp, nos-tun(8) and a pty-like virtual # Ethernet interface device tuntap # The `gif' device implements IPv6 over IPv4 tunneling, # IPv4 over IPv6 tunneling, IPv4 over IPv4 tunneling and # IPv6 over IPv6 tunneling. # The `gre' device implements GRE (Generic Routing Encapsulation) tunneling, # as specified in RFC 2784 and RFC 2890. # The `me' device implements Minimal Encapsulation within IPv4 as # specified in RFC 2004. # The XBONEHACK option allows the same pair of addresses to be configured on # multiple gif interfaces. device gif device gre device me options XBONEHACK # The `stf' device implements 6to4 encapsulation. device stf # The pf packet filter consists of three devices: # The `pf' device provides /dev/pf and the firewall code itself. # The `pflog' device provides the pflog0 interface which logs packets. # The `pfsync' device provides the pfsync0 interface used for # synchronization of firewall state tables (over the net). device pf device pflog device pfsync # Bridge interface. device if_bridge # Common Address Redundancy Protocol. See carp(4) for more details. device carp # IPsec interface. device enc # Link aggregation interface. device lagg # WireGuard interface. device wg # # Internet family options: # # MROUTING enables the kernel multicast packet forwarder, which works # with mrouted and XORP. # # IPFIREWALL enables support for IP firewall construction, in # conjunction with the `ipfw' program. IPFIREWALL_VERBOSE sends # logged packets to the system logger. IPFIREWALL_VERBOSE_LIMIT # limits the number of times a matching entry can be logged. # # WARNING: IPFIREWALL defaults to a policy of "deny ip from any to any" # and if you do not add other rules during startup to allow access, # YOU WILL LOCK YOURSELF OUT.
It is suggested that you set firewall_type=open # in /etc/rc.conf when first enabling this feature, then refining the # firewall rules in /etc/rc.firewall after you've tested that the new kernel # feature works properly. # # IPFIREWALL_DEFAULT_TO_ACCEPT causes the default rule (at boot) to # allow everything. Use with care, if a cracker can crash your # firewall machine, they can get to your protected machines. However, # if you are using it as an as-needed filter for specific problems as # they arise, then this may be for you. Changing the default to 'allow' # means that you won't get stuck if the kernel and /sbin/ipfw binary get # out of sync. # # IPDIVERT enables the divert IP sockets, used by ``ipfw divert''. It # depends on IPFIREWALL if compiled into the kernel. # # IPFIREWALL_NAT adds support for in kernel nat in ipfw, and it requires # LIBALIAS. # # IPFIREWALL_NAT64 adds support for in kernel NAT64 in ipfw. # # IPFIREWALL_NPTV6 adds support for in kernel NPTv6 in ipfw. # # IPFIREWALL_PMOD adds support for protocols modification module. Currently # it supports only TCP MSS modification. # # IPSTEALTH enables code to support stealth forwarding (i.e., forwarding # packets without touching the TTL). This can be useful to hide firewalls # from traceroute and similar tools. # # PF_DEFAULT_TO_DROP causes the default pf(4) rule to deny everything. # # TCPPCAP enables code which keeps the last n packets sent and received # on a TCP socket. # # TCP_BLACKBOX enables enhanced TCP event logging. # # TCP_HHOOK enables the hhook(9) framework hooks for the TCP stack. # # ROUTE_MPATH provides support for multipath routing. 
# options MROUTING # Multicast routing options IPFIREWALL #firewall options IPFIREWALL_VERBOSE #enable logging to syslogd(8) options IPFIREWALL_VERBOSE_LIMIT=100 #limit verbosity options IPFIREWALL_DEFAULT_TO_ACCEPT #allow everything by default options IPFIREWALL_NAT #ipfw kernel nat support options IPFIREWALL_NAT64 #ipfw kernel NAT64 support options IPFIREWALL_NPTV6 #ipfw kernel IPv6 NPT support options IPDIVERT #divert sockets options IPFILTER #ipfilter support options IPFILTER_LOG #ipfilter logging options IPFILTER_LOOKUP #ipfilter pools options IPFILTER_DEFAULT_BLOCK #block all packets by default options IPSTEALTH #support for stealth forwarding options PF_DEFAULT_TO_DROP #drop everything by default options TCPPCAP options TCP_BLACKBOX options TCP_HHOOK options ROUTE_MPATH # The MBUF_STRESS_TEST option enables options which create # various random failures / extreme cases related to mbuf # functions. See mbuf(9) for a list of available test cases. # MBUF_PROFILING enables code to profile the mbuf chains # exiting the system (via participating interfaces) and # return a logarithmic histogram of monitored parameters # (e.g. packet size, wasted space, number of mbufs in chain). options MBUF_STRESS_TEST options MBUF_PROFILING # Statically link in accept filters options ACCEPT_FILTER_DATA options ACCEPT_FILTER_DNS options ACCEPT_FILTER_HTTP # TCP_SIGNATURE adds support for RFC 2385 (TCP-MD5) digests. These are # carried in TCP option 19. This option is commonly used to protect # TCP sessions (e.g. BGP) where IPSEC is not available nor desirable. # This is enabled on a per-socket basis using the TCP_MD5SIG socket option. # This requires the use of 'device crypto' and either 'options IPSEC' or # 'options IPSEC_SUPPORT'. options TCP_SIGNATURE #include support for RFC 2385 # DUMMYNET enables the "dummynet" bandwidth limiter. You need IPFIREWALL # as well. See dummynet(4) and ipfw(8) for more info. 
When you run # DUMMYNET, HZ/kern.hz should be at least 1000 for adequate response. options DUMMYNET # The DEBUGNET option enables a basic debug/panic-time networking API. It # is used by NETDUMP and NETGDB. options DEBUGNET # The NETDUMP option enables netdump(4) client support in the kernel. # This allows a panicking kernel to transmit a kernel dump to a remote host. options NETDUMP # The NETGDB option enables netgdb(4) support in the kernel. This allows a # panicking kernel to be debugged as a GDB remote over the network. options NETGDB ##################################################################### # FILESYSTEM OPTIONS # # Only the root filesystem needs to be statically compiled or preloaded # as a module; everything else will be automatically loaded at mount # time. Some people still prefer to statically compile other # filesystems as well. # # NB: The UNION filesystem was known to be buggy in the past. It is now # being actively maintained, although there are still some issues being # resolved.
# # One of these is mandatory: options FFS #Fast filesystem options NFSCL #Network File System client # The rest are optional: options AUTOFS #Automounter filesystem options CD9660 #ISO 9660 filesystem options FDESCFS #File descriptor filesystem options FUSEFS #FUSEFS support module options MSDOSFS #MS DOS File System (FAT, FAT32) options NFSLOCKD #Network Lock Manager options NFSD #Network Filesystem Server options KGSSAPI #Kernel GSSAPI implementation options NULLFS #NULL filesystem options PROCFS #Process filesystem (requires PSEUDOFS) options PSEUDOFS #Pseudo-filesystem framework options PSEUDOFS_TRACE #Debugging support for PSEUDOFS options SMBFS #SMB/CIFS filesystem options TMPFS #Efficient memory filesystem options UDF #Universal Disk Format options UNIONFS #Union filesystem # The xFS_ROOT options REQUIRE the associated ``options xFS'' options NFS_ROOT #NFS usable as root device # Soft updates is a technique for improving filesystem speed and # making abrupt shutdown less risky. # options SOFTUPDATES # Extended attributes allow additional data to be associated with files, # and are used for ACLs, Capabilities, and MAC labels. # See src/sys/ufs/ufs/README.extattr for more information. options UFS_EXTATTR options UFS_EXTATTR_AUTOSTART # Access Control List support for UFS filesystems. The current ACL # implementation requires extended attribute support, UFS_EXTATTR, # for the underlying filesystem. # See src/sys/ufs/ufs/README.acls for more information. options UFS_ACL # Directory hashing improves the speed of operations on very large # directories at the expense of some memory. options UFS_DIRHASH # Gjournal-based UFS journaling support. options UFS_GJOURNAL # Make space in the kernel for a root filesystem on an md device. # Define to the number of kilobytes to reserve for the filesystem. # This is now optional. # If not defined, the root filesystem passed in as the MFS_IMAGE makeoption # will be automatically embedded in the kernel during linking.
Its exact size # will be consumed within the kernel. # If defined, the old way of embedding the filesystem in the kernel will be # used. That is to say MD_ROOT_SIZE KB will be allocated in the kernel and # later, the filesystem image passed in as the MFS_IMAGE makeoption will be # dd'd into the reserved space if it fits. options MD_ROOT_SIZE=10 # Make the md device a potential root device, either with preloaded # images of type mfs_root or md_root. options MD_ROOT # Write-protect the md root device so that it may not be mounted writeable. options MD_ROOT_READONLY # Allow to read MD image from external memory regions options MD_ROOT_MEM # Disk quotas are supported when this option is enabled. options QUOTA #enable disk quotas # If you are running a machine just as a fileserver for PC and MAC # users, using SAMBA, you may consider setting this option # and keeping all those users' directories on a filesystem that is # mounted with the suiddir option. This gives new files the same # ownership as the directory (similar to group). It's a security hole # if you let these users run programs, so confine it to file-servers # (but it'll save you lots of headaches in those cases). Root owned # directories are exempt and X bits are cleared. The suid bit must be # set on the directory as well; see chmod(1). PC owners can't see/set # ownerships so they keep getting their toes trodden on. This saves # you all the support calls as the filesystem it's used on will act as # they expect: "It's my dir so it must be my file". # options SUIDDIR # NFS options: options NFS_MINATTRTIMO=3 # VREG attrib cache timeout in sec options NFS_MAXATTRTIMO=60 options NFS_MINDIRATTRTIMO=30 # VDIR attrib cache timeout in sec options NFS_MAXDIRATTRTIMO=60 options NFS_DEBUG # Enable NFS Debugging # # Add support for the EXT2FS filesystem of Linux fame. 
Be a bit # careful with this - the ext2fs code has a tendency to lag behind # changes and not be exercised very much, so mounting read/write could # be dangerous (and even mounting read only could result in panics.) # options EXT2FS # The system memory devices; /dev/mem, /dev/kmem device mem # The kernel symbol table device; /dev/ksyms device ksyms # Optional character code conversion support with LIBICONV. # Each option requires their base file system and LIBICONV. options CD9660_ICONV options MSDOSFS_ICONV options UDF_ICONV ##################################################################### # POSIX P1003.1B # Real time extensions added in the 1993 POSIX # _KPOSIX_PRIORITY_SCHEDULING: Build in _POSIX_PRIORITY_SCHEDULING options _KPOSIX_PRIORITY_SCHEDULING # p1003_1b_semaphores are very experimental, # user should be ready to assist in debugging if problems arise. options P1003_1B_SEMAPHORES # POSIX message queue options P1003_1B_MQUEUE ##################################################################### # SECURITY POLICY PARAMETERS # Support for BSM audit options AUDIT # Support for Mandatory Access Control (MAC): options MAC options MAC_BIBA options MAC_BSDEXTENDED options MAC_DDB options MAC_IFOFF options MAC_IPACL options MAC_LOMAC options MAC_MLS options MAC_NONE options MAC_NTPD options MAC_PARTITION options MAC_PORTACL options MAC_PRIORITY options MAC_SEEOTHERUIDS options MAC_STUB options MAC_TEST options MAC_VERIEXEC options MAC_VERIEXEC_SHA1 options MAC_VERIEXEC_SHA256 options MAC_VERIEXEC_SHA384 options MAC_VERIEXEC_SHA512 device mac_veriexec_parser # Support for Capsicum options CAPABILITIES # fine-grained rights on file descriptors options CAPABILITY_MODE # sandboxes with no global namespace access ##################################################################### # CLOCK OPTIONS # The granularity of operation is controlled by the kernel option HZ (default # frequency of 1000 Hz or a period 1ms between calls). 
Virtual machine guests # use a value of 100. Lower values may lower overhead at the expense of accuracy # of scheduling, though the adaptive tick code reduces that overhead. options HZ=100 # Enable support for the kernel PLL to use an external PPS signal, # under supervision of [x]ntpd(8) # More info in ntpd documentation: http://www.eecis.udel.edu/~ntp options PPS_SYNC # Enable support for generic feed-forward clocks in the kernel. # The feed-forward clock support is an alternative to the feedback oriented # ntpd/system clock approach, and is to be used with a feed-forward # synchronization algorithm such as the RADclock: # More info here: http://www.synclab.org/radclock options FFCLOCK ##################################################################### # SCSI DEVICES # SCSI DEVICE CONFIGURATION # The SCSI subsystem consists of the `base' SCSI code, a number of # high-level SCSI device `type' drivers, and the low-level host-adapter # device drivers. The host adapters are listed in the ISA and PCI # device configuration sections below. # # It is possible to wire down your SCSI devices so that a given bus, # target, and LUN always come on line as the same device unit. In # earlier versions the unit numbers were assigned in the order that # the devices were probed on the SCSI bus. This means that if you # removed a disk drive, you may have had to rewrite your /etc/fstab # file, and also that you had to be careful when adding a new disk # as it may have been probed earlier and moved your device configuration # around. # This old behavior is maintained as the default behavior. The unit # assignment begins with the first non-wired down unit for a device # type. For example, if you wire a disk as "da3" then the first # non-wired disk will be assigned da4. 
# The syntax for wiring down devices is: envvar hint.scbus.0.at="ahc0" envvar hint.scbus.1.at="ahc1" envvar hint.scbus.1.bus="0" envvar hint.scbus.3.at="ahc2" envvar hint.scbus.3.bus="0" envvar hint.scbus.2.at="ahc2" envvar hint.scbus.2.bus="1" envvar hint.da.0.at="scbus0" envvar hint.da.0.target="0" envvar hint.da.0.unit="0" envvar hint.da.1.at="scbus3" envvar hint.da.1.target="1" envvar hint.da.2.at="scbus2" envvar hint.da.2.target="3" envvar hint.sa.1.at="scbus1" envvar hint.sa.1.target="6" # "units" (SCSI logical unit number) that are not specified are # treated as if specified as LUN 0. # All SCSI devices allocate as many units as are required. # The ch driver drives SCSI Media Changer ("jukebox") devices. # # The da driver drives SCSI Direct Access ("disk") and Optical Media # ("WORM") devices. # # The sa driver drives SCSI Sequential Access ("tape") devices. # # The cd driver drives SCSI Read Only Direct Access ("cd") devices. # # The ses driver drives SCSI Environment Services ("ses") and # SAF-TE ("SCSI Accessible Fault-Tolerant Enclosure") devices. # # The pt driver drives SCSI Processor devices. # # The sg driver provides a passthrough API that is compatible with the # Linux SG driver. It will work in conjunction with the Linuxulator # to run linux SG apps. It can also stand on its own and provide # source level API compatibility for porting apps to FreeBSD. # # Target Mode support is provided here but also requires that a SIM # (SCSI Host Adapter Driver) provide support as well. # # The targ driver provides target mode support as a Processor type device. # It exists to give the minimal context necessary to respond to Inquiry # commands. There is a sample user application that shows how the rest # of the command support might be done in /usr/share/examples/scsi_target. # # The targbh driver provides target mode support and exists to respond # to incoming commands that do not otherwise have a logical unit assigned # to them. 
# # The pass driver provides a passthrough API to access the CAM subsystem. device scbus #base SCSI code device ch #SCSI media changers device da #SCSI direct access devices (aka disks) device sa #SCSI tapes device cd #SCSI CD-ROMs device ses #Enclosure Services (SES and SAF-TE) device pt #SCSI processor device targ #SCSI Target Mode Code device targbh #SCSI Target Mode Blackhole Device device pass #CAM passthrough driver device sg #Linux SCSI passthrough device ctl #CAM Target Layer # CAM OPTIONS: # debugging options: # CAMDEBUG Compile in all possible debugging. # CAM_DEBUG_COMPILE Debug levels to compile in. # CAM_DEBUG_FLAGS Debug levels to enable on boot. # CAM_DEBUG_BUS Limit debugging to the given bus. # CAM_DEBUG_TARGET Limit debugging to the given target. # CAM_DEBUG_LUN Limit debugging to the given lun. # CAM_DEBUG_DELAY Delay in us after printing each debug line. # CAM_IO_STATS Publish additional CAM device statistics by sysctl # # CAM_MAX_HIGHPOWER: Maximum number of concurrent high power (start unit) cmds # SCSI_NO_SENSE_STRINGS: When defined disables sense descriptions # SCSI_NO_OP_STRINGS: When defined disables opcode descriptions # SCSI_DELAY: The number of MILLISECONDS to freeze the SIM (scsi adapter) # queue after a bus reset, and the number of milliseconds to # freeze the device queue after a bus device reset. This # can be changed at boot and runtime with the # kern.cam.scsi_delay tunable/sysctl.
options CAMDEBUG options CAM_DEBUG_COMPILE=-1 options CAM_DEBUG_FLAGS=(CAM_DEBUG_INFO|CAM_DEBUG_PROBE|CAM_DEBUG_PERIPH) options CAM_DEBUG_BUS=-1 options CAM_DEBUG_TARGET=-1 options CAM_DEBUG_LUN=-1 options CAM_DEBUG_DELAY=1 options CAM_MAX_HIGHPOWER=4 options SCSI_NO_SENSE_STRINGS options SCSI_NO_OP_STRINGS options SCSI_DELAY=5000 # Be pessimistic about Joe SCSI device options CAM_IOSCHED_DYNAMIC options CAM_IO_STATS options CAM_TEST_FAILURE # Options for the CAM CDROM driver: # CHANGER_MIN_BUSY_SECONDS: Guaranteed minimum time quantum for a changer LUN # CHANGER_MAX_BUSY_SECONDS: Maximum time quantum per changer LUN, only # enforced if there is I/O waiting for another LUN # The compiled in defaults for these variables are 2 and 10 seconds, # respectively. # # These can also be changed on the fly with the following sysctl variables: # kern.cam.cd.changer.min_busy_seconds # kern.cam.cd.changer.max_busy_seconds # options CHANGER_MIN_BUSY_SECONDS=2 options CHANGER_MAX_BUSY_SECONDS=10 # Options for the CAM sequential access driver: # SA_IO_TIMEOUT: Timeout for read/write/wfm operations, in minutes # SA_SPACE_TIMEOUT: Timeout for space operations, in minutes # SA_REWIND_TIMEOUT: Timeout for rewind operations, in minutes # SA_ERASE_TIMEOUT: Timeout for erase operations, in minutes # SA_1FM_AT_EOD: Default to model which only has a default one filemark at EOT. options SA_IO_TIMEOUT=4 options SA_SPACE_TIMEOUT=60 options SA_REWIND_TIMEOUT=(2*60) options SA_ERASE_TIMEOUT=(4*60) options SA_1FM_AT_EOD # Optional timeout for the CAM processor target (pt) device # This is specified in seconds. The default is 60 seconds. options SCSI_PT_DEFAULT_TIMEOUT=60 # Optional enable of doing SES passthrough on other devices (e.g., disks) # # Normally disabled because a lot of newer SCSI disks report themselves # as having SES capabilities, but this can then clot up attempts to build # a topology with the SES device that's on the box these drives are in.... 
options SES_ENABLE_PASSTHROUGH # iSCSI # # iSCSI permits access to SCSI peripherals over a network connection # (e.g. via a TCP/IP socket) device cfiscsi # CAM Target Layer iSCSI target frontend device iscsi # iSCSI initiator device iser # iSCSI Extensions for RDMA (iSER) initiator ##################################################################### # MISCELLANEOUS DEVICES AND OPTIONS device pty #BSD-style compatibility pseudo ttys device nmdm #back-to-back tty devices device md #Memory/malloc disk device snp #Snoop device - to look at pty/vty/etc.. device ccd #Concatenated disk driver device firmware #firmware(9) support # Kernel side iconv library options LIBICONV # Size of the kernel message buffer. Should be N * pagesize. options MSGBUF_SIZE=40960 ##################################################################### # HARDWARE BUS CONFIGURATION # # PCI bus & PCI options: # device pci options PCI_HP # PCI-Express native HotPlug options PCI_IOV # PCI SR-IOV support ##################################################################### # HARDWARE DEVICE CONFIGURATION # For ISA the required hints are listed. # PCI, CardBus, and SD/MMC are self identifying buses, so # no hints are needed. # # Mandatory devices: # # These options are valid for other keyboard drivers as well. options KBD_DISABLE_KEYMAP_LOAD # refuse to load a keymap options KBD_INSTALL_CDEV # install a CDEV entry in /dev # Define keyboard latency (try 200/15 for a snappy interactive console) options KBD_DELAY1=200 # define initial key delay options KBD_DELAY2=15 # define key delay device kbdmux # keyboard multiplexer options KBDMUX_DFLT_KEYMAP # specify the built-in keymap makeoptions KBDMUX_DFLT_KEYMAP=it.iso options FB_DEBUG # Frame buffer debugging # Enable experimental features of the syscons terminal emulator (teken). options TEKEN_CONS25 # cons25-style terminal emulation options TEKEN_UTF8 # UTF-8 output handling # The vt video console driver. 
device vt options VT_ALT_TO_ESC_HACK=1 # Prepend ESC sequence to ALT keys options VT_MAXWINDOWS=16 # Number of virtual consoles options VT_TWOBUTTON_MOUSE # Use right mouse button to paste # The following options set the maximum framebuffer size. options VT_FB_MAX_HEIGHT=480 options VT_FB_MAX_WIDTH=640 # The following options will let you change the default vt terminal colors. options TERMINAL_NORM_ATTR=(FG_GREEN|BG_BLACK) options TERMINAL_KERN_ATTR=(FG_LIGHTRED|BG_BLACK) # # Optional devices: # # # SCSI host adapters: # # aacraid: Adaptec by PMC RAID controllers, Series 6/7/8 and upcoming # families. Container interface, CAM required. # ahc: Adaptec 274x/284x/2910/293x/294x/394x/3950x/3960x/398X/4944/ # 19160x/29160x, aic7770/aic78xx # ahd: Adaptec 29320/39320 Controllers. # isp: Qlogic ISP 1020, 1040 and 1040B PCI SCSI host adapters, # ISP 1240 Dual Ultra SCSI, ISP 1080 and 1280 (Dual) Ultra2, # ISP 12160 Ultra3 SCSI, # Qlogic ISP 2100 and ISP 2200 1Gb Fibre Channel host adapters. # Qlogic ISP 2300 and ISP 2312 2Gb Fibre Channel host adapters. # Qlogic ISP 2322 and ISP 6322 2Gb Fibre Channel host adapters. # ispfw: Firmware module for Qlogic host adapters # mpr: LSI-Logic MPT/Fusion Gen 3 # mps: LSI-Logic MPT/Fusion Gen 2 # mpt: LSI-Logic MPT/Fusion 53c1020 or 53c1030 Ultra4 # or FC9x9 Fibre Channel host adapters. # sym: Symbios/Logic 53C8XX family of PCI-SCSI I/O processors: # 53C810, 53C810A, 53C815, 53C825, 53C825A, 53C860, 53C875, # 53C876, 53C885, 53C895, 53C895A, 53C896, 53C897, 53C1510D, # 53C1010-33, 53C1010-66. 
device aacraid device ahc device ahd device isp envvar hint.isp.0.disable="1" envvar hint.isp.0.role="3" envvar hint.isp.0.prefer_iomap="1" envvar hint.isp.0.prefer_memmap="1" envvar hint.isp.0.fwload_disable="1" envvar hint.isp.0.ignore_nvram="1" envvar hint.isp.0.fullduplex="1" envvar hint.isp.0.topology="lport" envvar hint.isp.0.topology="nport" envvar hint.isp.0.topology="lport-only" envvar hint.isp.0.topology="nport-only" # we can't get u_int64_t types, nor can we get strings if it's got # a leading 0x, hence this silly dodge. envvar hint.isp.0.portwnn="w50000000aaaa0000" envvar hint.isp.0.nodewnn="w50000000aaaa0001" device ispfw # Only works on aarch64 and amd64 #device mpi3mr # LSI-Logic MPT-Fusion 4 device mpr # LSI-Logic MPT-Fusion 3 device mps # LSI-Logic MPT-Fusion 2 device mpt # LSI-Logic MPT-Fusion device sym # The aic7xxx driver will attempt to use memory mapped I/O for all PCI # controllers that have it configured only if this option is set. Unfortunately, # this doesn't work on some motherboards, which prevents it from being the # default. options AHC_ALLOW_MEMIO # Dump the contents of the ahc controller configuration PROM. options AHC_DUMP_EEPROM # Bitmap of units to enable targetmode operations. options AHC_TMODE_ENABLE # Compile in Aic7xxx Debugging code. options AHC_DEBUG # Aic7xxx driver debugging options. See sys/dev/aic7xxx/aic7xxx.h options AHC_DEBUG_OPTS # Print register bitfields in debug output. Adds ~128k to driver # See ahc(4). options AHC_REG_PRETTY_PRINT # Compile in aic79xx debugging code. options AHD_DEBUG # Aic79xx driver debugging options. Adds ~215k to driver. See ahd(4). options AHD_DEBUG_OPTS=0xFFFFFFFF # Print human-readable register definitions when debugging options AHD_REG_PRETTY_PRINT # Bitmap of units to enable targetmode operations. options AHD_TMODE_ENABLE # Options used in dev/isp/ (Qlogic SCSI/FC driver). 
# # ISP_TARGET_MODE - enable target mode operation # options ISP_TARGET_MODE=1 # # ISP_DEFAULT_ROLES - default role # none=0 # target=1 # initiator=2 # both=3 (not supported currently) # # ISP_INTERNAL_TARGET (trivial internal disk target, for testing) # options ISP_DEFAULT_ROLES=0 #options SYM_SETUP_SCSI_DIFF #-HVD support for 825a, 875, 885 # disabled:0 (default), enabled:1 #options SYM_SETUP_PCI_PARITY #-PCI parity checking # disabled:0, enabled:1 (default) #options SYM_SETUP_MAX_LUN #-Number of LUNs supported # default:8, range:[1..64] # # Compaq "CISS" RAID controllers (SmartRAID 5* series) # These controllers have a SCSI-like interface, and require the # CAM infrastructure. # device ciss # # Compaq Smart RAID, Mylex DAC960 and AMI MegaRAID controllers. Only # one entry is needed; the code will find and configure all supported # controllers. # device ida # Compaq Smart RAID device mlx # Mylex DAC960 device mfi # LSI MegaRAID SAS device mfip # LSI MegaRAID SAS passthrough, requires CAM options MFI_DEBUG device mrsas # LSI/Avago MegaRAID SAS/SATA, 6Gb/s and 12Gb/s # NVM Express # # nvme: PCI-express NVM Express host controllers # nda: CAM NVMe disk driver # nvd: non-CAM NVMe disk driver device nvme # base NVMe driver options NVME_USE_NVD=1 # Use nvd(4) instead of the CAM nda(4) driver device nda # NVMe direct access devices (aka disks) device nvd # expose NVMe namespaces as disks, depends on nvme # # Serial ATA host controllers: # # ahci: Advanced Host Controller Interface (AHCI) compatible # mvs: Marvell 88SX50XX/88SX60XX/88SX70XX/SoC controllers # siis: SiliconImage SiI3124/SiI3132/SiI3531 controllers # # These drivers are part of cam(4) subsystem. They supersede less featured # ata(4) subsystem drivers, supporting same hardware. 
device ahci # AHCI-compatible SATA controllers device mvs # Marvell 88SX50XX/88SX60XX/88SX70XX/SoC SATA device siis # SiliconImage SiI3124/SiI3132/SiI3531 SATA device ada # ATA/SATA direct access devices (aka disks) # # The 'ATA' driver supports all legacy ATA/ATAPI controllers, including # PC Card devices. You only need one "device ata" for it to find all # PCI and PC Card ATA/ATAPI devices on modern machines. # Alternatively, individual bus and chipset drivers may be chosen by using # the 'atacore' driver then selecting the drivers on a per vendor basis. # For example to build a system which only supports a VIA chipset, # omit 'ata' and include the 'atacore', 'atapci' and 'atavia' drivers. device ata # Legacy ATA/SATA controllers # Modular ATA #device atacore # Core ATA functionality #device ataisa # ISA bus support #device atapci # PCI bus support; only generic chipset support # PCI ATA chipsets #device ataacard # ACARD #device ataacerlabs # Acer Labs Inc. (ALI) #device ataamd # Advanced Micro Devices (AMD) #device ataati # ATI #device atacenatek # Cenatek #device atacypress # Cypress #device atacyrix # Cyrix #device atahighpoint # HighPoint #device ataintel # Intel #device ataite # Integrated Technology Inc. (ITE) #device atajmicron # JMicron #device atamarvell # Marvell #device atamicron # Micron #device atanational # National #device atanetcell # NetCell #device atanvidia # nVidia #device atapromise # Promise #device ataserverworks # ServerWorks #device atasiliconimage # Silicon Image Inc. (SiI) (formerly CMD) #device atasis # Silicon Integrated Systems Corp.(SiS) #device atavia # VIA Technologies Inc. # # For older non-PCI, non-PnPBIOS systems, these are the hints lines to add: envvar hint.ata.0.at="isa" envvar hint.ata.0.port="0x1f0" envvar hint.ata.0.irq="14" envvar hint.ata.1.at="isa" envvar hint.ata.1.port="0x170" envvar hint.ata.1.irq="15" # # uart: generic driver for serial interfaces.
# device uart # Options for uart(4) options UART_PPS_ON_CTS # Do time pulse capturing using CTS # instead of DCD. options UART_POLL_FREQ # Set polling rate, used when hw has # no interrupt support (50 Hz default). # The following hint should only be used for pure ISA devices. It is not # needed otherwise. Use of hints is strongly discouraged. envvar hint.uart.0.at="isa" # The following 3 hints are used when the UART is a system device (i.e., a # console or debug port), but only on platforms that don't have any other # means to pass the information to the kernel. The unit number of the hint # is only used to bundle the hints together. There is no relation to the # unit number of the probed UART. envvar hint.uart.0.port="0x3f8" envvar hint.uart.0.flags="0x10" envvar hint.uart.0.baud="115200" # `flags' for serial drivers that support consoles, like uart(4): # 0x10 enable console support for this unit. Other console flags # (if applicable) are ignored unless this is set. Enabling # console support does not make the unit the preferred console. # Boot with -h or set boot_serial=YES in the loader. # Currently, at most one unit can have console support; the # first one (in config file order) with this flag set is # preferred. # 0x80 use this port for serial line gdb support in ddb. Also known # as debug port. # # Options for serial drivers that support consoles: options BREAK_TO_DEBUGGER # A BREAK/DBG on the console goes to # ddb, if available. # Solaris implements a new BREAK which is initiated by a character # sequence CR ~ ^b which is similar to a familiar pattern used on # Sun servers by the Remote Console. There are FreeBSD extensions: # CR ~ ^p requests force panic and CR ~ ^r requests a clean reboot. options ALT_BREAK_TO_DEBUGGER # Serial Communications Controller # Supports the Freescale/NXP QUad Integrated and Zilog Z8530 multi-channel # communications controllers. device scc # PCI Universal Communications driver # Supports various multi port PCI I/O cards. 
device puc # # Network interfaces: # # MII bus support is required for many PCI Ethernet NICs, # namely those which use MII-compliant transceivers or implement # transceiver control interfaces that operate like an MII. Adding # "device miibus" to the kernel config pulls in support for the generic # miibus API, the common support for bit-bang'ing the MII and all # of the PHY drivers, including a generic one for PHYs that aren't # specifically handled by an individual driver. Support for specific # PHYs may be built by adding "device mii", "device mii_bitbang" if # needed by the NIC driver and then adding the appropriate PHY driver. device mii # Minimal MII support device mii_bitbang # Common module for bit-bang'ing the MII device miibus # MII support w/ bit-bang'ing and all PHYs device acphy # Altima Communications AC101 device amphy # AMD AM79c873 / Davicom DM910{1,2} device atphy # Attansic/Atheros F1 device axphy # Asix Semiconductor AX88x9x device bmtphy # Broadcom BCM5201/BCM5202 and 3Com 3c905C device bnxt # Broadcom NetXtreme-C/NetXtreme-E device brgphy # Broadcom BCM54xx/57xx 1000baseTX device cgem # Cadence GEM Gigabit Ethernet device ciphy # Cicada/Vitesse CS/VSC8xxx device e1000phy # Marvell 88E1000 1000/100/10-BT device gentbi # Generic 10-bit 1000BASE-{LX,SX} fiber ifaces device icsphy # ICS ICS1889-1893 device ip1000phy # IC Plus IP1000A/IP1001 device jmphy # JMicron JMP211/JMP202 device lxtphy # Level One LXT-970 device nsgphy # NatSemi DP8361/DP83865/DP83891 device nsphy # NatSemi DP83840A device nsphyter # NatSemi DP83843/DP83815 device pnaphy # HomePNA device qsphy # Quality Semiconductor QS6612 device rdcphy # RDC Semiconductor R6040 device rgephy # RealTek 8169S/8110S/8211B/8211C device rlphy # RealTek 8139 device rlswitch # RealTek 8305 device smcphy # SMSC LAN91C111 device tdkphy # TDK 89Q2120 device truephy # LSI TruePHY device xmphy # XaQti XMAC II # ae: Support for gigabit ethernet adapters based on the Attansic/Atheros # L2 PCI-Express 
FastEthernet controllers. # age: Support for gigabit ethernet adapters based on the Attansic/Atheros # L1 PCI express gigabit ethernet controllers. # alc: Support for Atheros AR8131/AR8132 PCIe ethernet controllers. # ale: Support for Atheros AR8121/AR8113/AR8114 PCIe ethernet controllers. # ath: Atheros a/b/g WiFi adapters (requires ath_hal and wlan) # bce: Broadcom NetXtreme II (BCM5706/BCM5708) PCI/PCIe Gigabit Ethernet # adapters. # bfe: Broadcom BCM4401 Ethernet adapter. # bge: Support for gigabit ethernet adapters based on the Broadcom # BCM570x family of controllers, including the 3Com 3c996-T, # the Netgear GA302T, the SysKonnect SK-9D21 and SK-9D41, and # the embedded gigE NICs on Dell PowerEdge 2550 servers. # bnxt: Broadcom NetXtreme-C and NetXtreme-E PCIe 10/25/50G Ethernet adapters. # bxe: Broadcom NetXtreme II (BCM5771X/BCM578XX) PCIe 10Gb Ethernet # adapters. # bwi: Broadcom BCM430* and BCM431* family of wireless adapters. # bwn: Broadcom BCM43xx family of wireless adapters. # cas: Sun Cassini/Cassini+ and National Semiconductor DP83065 Saturn # cxgb: Chelsio T3 based 1GbE/10GbE PCIe Ethernet adapters. # cxgbe:Chelsio T4, T5, and T6-based 1/10/25/40/100GbE PCIe Ethernet # adapters. # cxgbev: Chelsio T4, T5, and T6-based PCIe Virtual Functions. # dc: Support for PCI fast ethernet adapters based on the DEC/Intel 21143 # and various workalikes including: # the ADMtek AL981 Comet and AN985 Centaur, the ASIX Electronics # AX88140A and AX88141, the Davicom DM9100 and DM9102, the Lite-On # 82c168 and 82c169 PNIC, the Lite-On/Macronix LC82C115 PNIC II # and the Macronix 98713/98713A/98715/98715A/98725 PMAC. This driver # replaces the old al, ax, dm, pn and mx drivers. List of brands: # Digital DE500-BA, Kingston KNE100TX, D-Link DFE-570TX, SOHOware SFA110, # SVEC PN102-TX, CNet Pro110B, 120A, and 120B, Compex RL100-TX, # LinkSys LNE100TX, LNE100TX V2.0, Jaton XpressNet, Alfa Inc GFC2204, # KNE110TX. 
# em: Intel Pro/1000 Gigabit Ethernet 82542, 82543, 82544 based adapters. # fxp: Intel EtherExpress Pro/100B # (hint of prefer_iomap can be done to prefer I/O instead of Mem mapping) # gem: Apple GMAC/Sun ERI/Sun GEM # jme: JMicron JMC260 Fast Ethernet/JMC250 Gigabit Ethernet based adapters. # le: AMD Am7900 LANCE and Am79C9xx PCnet # lge: Support for PCI gigabit ethernet adapters based on the Level 1 # LXT1001 NetCellerator chipset. This includes the D-Link DGE-500SX, # SMC TigerCard 1000 (SMC9462SX), and some Addtron cards. # lio: Support for Cavium 23XX Ethernet adapters # malo: Marvell Libertas wireless NICs. # mwl: Marvell 88W8363 802.11n wireless NICs. # Requires the mwl firmware module # mwlfw: Marvell 88W8363 firmware # msk: Support for gigabit ethernet adapters based on the Marvell/SysKonnect # Yukon II Gigabit controllers, including 88E8021, 88E8022, 88E8061, # 88E8062, 88E8035, 88E8036, 88E8038, 88E8050, 88E8052, 88E8053, # 88E8055, 88E8056 and D-Link 560T/550SX. # mlxfw: Mellanox firmware update module. # mlx5: Mellanox ConnectX-4 and ConnectX-4 LX IB and Eth shared code module. # mlx5en:Mellanox ConnectX-4 and ConnectX-4 LX PCIe Ethernet adapters. # my: Myson Fast Ethernet (MTD80X, MTD89X) # nge: Support for PCI gigabit ethernet adapters based on the National # Semiconductor DP83820 and DP83821 chipset. This includes the # SMC EZ Card 1000 (SMC9462TX), D-Link DGE-500T, Asante FriendlyNet # GigaNIX 1000TA and 1000TPC, the Addtron AEG320T, the Surecom # EP-320G-TX and the Netgear GA622T. # oce: Emulex 10 Gbit adapters (OneConnect Ethernet) # ral: Ralink Technology IEEE 802.11 wireless adapter # re: RealTek 8139C+/8169/816xS/811xS/8101E PCI/PCIe Ethernet adapter # rl: Support for PCI fast ethernet adapters based on the RealTek 8129/8139 # chipset. Note that the RealTek driver defaults to using programmed # I/O to do register accesses because memory mapped mode seems to cause # severe lockups on SMP hardware. 
This driver also supports the # Accton EN1207D `Cheetah' adapter, which uses a chip called # the MPX 5030/5038, which is either a RealTek in disguise or a # RealTek workalike. Note that the D-Link DFE-530TX+ uses the RealTek # chipset and is supported by this driver, not the 'vr' driver. # rtwn: RealTek wireless adapters. # rtwnfw: RealTek wireless firmware. # sge: Silicon Integrated Systems SiS190/191 Fast/Gigabit Ethernet adapter # sis: Support for NICs based on the Silicon Integrated Systems SiS 900, # SiS 7016 and NS DP83815 PCI fast ethernet controller chips. # sk: Support for the SysKonnect SK-984x series PCI gigabit ethernet NICs. # This includes the SK-9841 and SK-9842 single port cards (single mode # and multimode fiber) and the SK-9843 and SK-9844 dual port cards # (also single mode and multimode). # The driver will autodetect the number of ports on the card and # attach each one as a separate network interface. # ste: Sundance Technologies ST201 PCI fast ethernet controller, includes # the D-Link DFE-550TX. # stge: Support for gigabit ethernet adapters based on the Sundance/Tamarack # TC9021 family of controllers, including the Sundance ST2021/ST2023, # the Sundance/Tamarack TC9021, the D-Link DL-4000 and ASUS NX1101. # ti: Support for PCI gigabit ethernet NICs based on the Alteon Networks # Tigon 1 and Tigon 2 chipsets. This includes the Alteon AceNIC, the # 3Com 3c985, the Netgear GA620 and various others. Note that you will # probably want to bump up kern.ipc.nmbclusters a lot to use this driver. # vr: Support for various fast ethernet adapters based on the VIA # Technologies VT3043 `Rhine I' and VT86C100A `Rhine II' chips, # including the D-Link DFE520TX and D-Link DFE530TX (see 'rl' for # DFE530TX+), the Hawking Technologies PN102TX, and the AOpen/Acer ALN-320. # vte: DM&P Vortex86 RDC R6040 Fast Ethernet # xl: Support for the 3Com 3c900, 3c905, 3c905B and 3c905C (Fast) # Etherlink XL cards and integrated controllers. 
This includes the # integrated 3c905B-TX chips in certain Dell Optiplex and Dell # Precision desktop machines and the integrated 3c905-TX chips # in Dell Latitude laptop docking stations. # Also supported: 3Com 3c980(C)-TX, 3Com 3cSOHO100-TX, 3Com 3c450-TX # PCI Ethernet NICs that use the common MII bus controller code. device ae # Attansic/Atheros L2 FastEthernet device age # Attansic/Atheros L1 Gigabit Ethernet device alc # Atheros AR8131/AR8132 Ethernet device ale # Atheros AR8121/AR8113/AR8114 Ethernet device bce # Broadcom BCM5706/BCM5708 Gigabit Ethernet device bfe # Broadcom BCM440x 10/100 Ethernet device bge # Broadcom BCM570xx Gigabit Ethernet device cas # Sun Cassini/Cassini+ and NS DP83065 Saturn device dc # DEC/Intel 21143 and various workalikes device et # Agere ET1310 10/100/Gigabit Ethernet device fxp # Intel EtherExpress PRO/100B (82557, 82558) envvar hint.fxp.0.prefer_iomap="0" device gem # Apple GMAC/Sun ERI/Sun GEM device jme # JMicron JMC250 Gigabit/JMC260 Fast Ethernet device lge # Level 1 LXT1001 gigabit Ethernet device lio # Support for Cavium 23XX Ethernet adapters device mlxfw # Mellanox firmware update module device mlx5 # Shared code module between IB and Ethernet device mlx5en # Mellanox ConnectX-4 and ConnectX-4 LX device msk # Marvell/SysKonnect Yukon II Gigabit Ethernet device my # Myson Fast Ethernet (MTD80X, MTD89X) device nge # NatSemi DP83820 gigabit Ethernet device re # RealTek 8139C+/8169/8169S/8110S device rl # RealTek 8129/8139 device sge # Silicon Integrated Systems SiS190/191 device sis # Silicon Integrated Systems SiS 900/SiS 7016 device sk # SysKonnect SK-984x & SK-982x gigabit Ethernet device ste # Sundance ST201 (D-Link DFE-550TX) device stge # Sundance/Tamarack TC9021 gigabit Ethernet device vr # VIA Rhine, Rhine II device vte # DM&P Vortex86 RDC R6040 Fast Ethernet device xl # 3Com 3c90x (``Boomerang'', ``Cyclone'') # PCI/PCI-X/PCIe Ethernet NICs that use iflib infrastructure device iflib device em # Intel Pro/1000 
Gigabit Ethernet device ix # Intel Pro/10Gbe PCIE Ethernet device ixv # Intel Pro/10Gbe PCIE Ethernet VF # PCI Ethernet NICs. device cxgb # Chelsio T3 10 Gigabit Ethernet device cxgb_t3fw # Chelsio T3 10 Gigabit Ethernet firmware device cxgbe # Chelsio T4-T6 1/10/25/40/100 Gigabit Ethernet device cxgbev # Chelsio T4-T6 Virtual Functions device le # AMD Am7900 LANCE and Am79C9xx PCnet device mxge # Myricom Myri-10G 10GbE NIC device oce # Emulex 10 GbE (OneConnect Ethernet) device ti # Alteon Networks Tigon I/II gigabit Ethernet # PCI IEEE 802.11 Wireless NICs device ath # Atheros pci/cardbus NIC's device ath_hal # pci/cardbus chip support #device ath_ar5210 # AR5210 chips #device ath_ar5211 # AR5211 chips #device ath_ar5212 # AR5212 chips #device ath_rf2413 #device ath_rf2417 #device ath_rf2425 #device ath_rf5111 #device ath_rf5112 #device ath_rf5413 #device ath_ar5416 # AR5416 chips # All of the AR5212 parts have a problem when paired with the AR71xx # CPUS. These parts have a bug that triggers a fatal bus error on the AR71xx # only. Details of the exact nature of the bug are sketchy, but some can be # found at https://forum.openwrt.org/viewtopic.php?pid=70060 on pages 4, 5 and # 6. This option enables this workaround. There is a performance penalty # for this work around, but without it things don't work at all. The DMA # from the card usually bursts 128 bytes, but on the affected CPUs, only # 4 are safe. options AH_RXCFG_SDMAMW_4BYTES #device ath_ar9160 # AR9160 chips #device ath_ar9280 # AR9280 chips #device ath_ar9285 # AR9285 chips device ath_rate_sample # SampleRate tx rate control for ath device bwi # Broadcom BCM430* BCM431* device bwn # Broadcom BCM43xx device malo # Marvell Libertas wireless NICs. device mwl # Marvell 88W8363 802.11n wireless NICs. device mwlfw device ral # Ralink Technology RT2500 wireless NICs. device rtwn # Realtek wireless NICs device rtwnfw # Use sf_buf(9) interface for jumbo buffers on ti(4) controllers. 
#options TI_SF_BUF_JUMBO # Turn on the header splitting option for the ti(4) driver firmware. This # only works for Tigon II chips, and has no effect for Tigon I chips. # This option requires the TI_SF_BUF_JUMBO option above. #options TI_JUMBO_HDRSPLIT # These two options allow manipulating the mbuf cluster size and mbuf size, # respectively. Be very careful with NIC driver modules when changing # these from their default values, because that can potentially cause a # mismatch between the mbuf size assumed by the kernel and the mbuf size # assumed by a module. The only driver that currently has the ability to # detect a mismatch is ti(4). -options MCLSHIFT=11 # mbuf cluster shift in bits, 11 == 2KB +options MCLSHIFT=12 # mbuf cluster shift in bits, 12 == 4 kB + # default is 11 == 2 kB options MSIZE=256 # mbuf size in bytes # # Sound drivers # # sound: The generic sound driver. # device sound # # snd_*: Device-specific drivers. # # The flags of the device tell the device a bit more info about the # device that normally is obtained through the PnP interface. # bit 2..0 secondary DMA channel; # bit 4 set if the board uses two dma channels; # bit 15..8 board type, overrides autodetection; leave it # zero if don't know what to put in (and you don't, # since this is unsupported at the moment...). # # snd_als4000: Avance Logic ALS4000 PCI. # snd_atiixp: ATI IXP 200/300/400 PCI. # snd_cmi: CMedia CMI8338/CMI8738 PCI. # snd_cs4281: Crystal Semiconductor CS4281 PCI. # snd_csa: Crystal Semiconductor CS461x/428x PCI. (except # 4281) # snd_emu10k1: Creative EMU10K1 PCI and EMU10K2 (Audigy) PCI. # snd_emu10kx: Creative SoundBlaster Live! and Audigy # snd_envy24: VIA Envy24 and compatible, needs snd_spicds. # snd_envy24ht: VIA Envy24HT and compatible, needs snd_spicds. # snd_es137x: Ensoniq AudioPCI ES137x PCI. # snd_fm801: Forte Media FM801 PCI. # snd_hda: Intel High Definition Audio (Controller) and # compatible. # snd_hdspe: RME HDSPe AIO and RayDAT. 
# snd_ich: Intel ICH AC'97 and some more audio controllers # embedded in a chipset, for example nVidia # nForce controllers. # snd_maestro3: ESS Technology Maestro-3/Allegro PCI. # snd_neomagic: Neomagic 256 AV/ZX PCI. # snd_solo: ESS Solo-1x PCI. # snd_spicds: SPI codec driver, needed by Envy24/Envy24HT drivers. # snd_t4dwave: Trident 4DWave DX/NX PCI, SiS 7018 PCI and Acer Labs # M5451 PCI. # snd_uaudio: USB audio. # snd_via8233: VIA VT8233x PCI. # snd_via82c686: VIA VT82C686A PCI. # snd_vibes: S3 Sonicvibes PCI. device snd_als4000 device snd_atiixp device snd_cmi device snd_cs4281 device snd_csa device snd_emu10k1 device snd_emu10kx device snd_envy24 device snd_envy24ht device snd_es137x device snd_fm801 device snd_hda device snd_hdspe device snd_ich device snd_maestro3 device snd_neomagic device snd_solo device snd_spicds device snd_t4dwave device snd_uaudio device snd_via8233 device snd_via82c686 device snd_vibes # For non-PnP sound cards: envvar hint.pcm.0.at="isa" envvar hint.pcm.0.irq="10" envvar hint.pcm.0.drq="1" envvar hint.pcm.0.flags="0x0" envvar hint.sbc.0.at="isa" envvar hint.sbc.0.port="0x220" envvar hint.sbc.0.irq="5" envvar hint.sbc.0.drq="1" envvar hint.sbc.0.flags="0x15" envvar hint.gusc.0.at="isa" envvar hint.gusc.0.port="0x220" envvar hint.gusc.0.irq="5" envvar hint.gusc.0.drq="1" envvar hint.gusc.0.flags="0x13" # # Following options are intended for debugging/testing purposes: # # SND_DEBUG Enable extra debugging code that includes # sanity checking and possible increase of # verbosity. # # SND_DIAGNOSTIC Similar in spirit to INVARIANTS/DIAGNOSTIC, # zero tolerance against inconsistencies. # # SND_FEEDER_MULTIFORMAT By default, only 16/32 bit feeders are compiled # in. This option enables most feeder converters # except for 8bit. WARNING: May bloat the kernel. # # SND_FEEDER_FULL_MULTIFORMAT Ditto, but includes 8bit feeders as well.
# # SND_FEEDER_RATE_HP (feeder_rate) High precision 64bit arithmetic # as much as possible (the default trying to # avoid it). Possible slowdown. # # SND_PCM_64 (Only applicable for i386/32bit arch) # Process 32bit samples through 64bit # integer/arithmetic. Slight increase of dynamic # range at a cost of possible slowdown. # # SND_OLDSTEREO Only 2 channels are allowed, effectively # disabling multichannel processing. # options SND_DEBUG options SND_DIAGNOSTIC options SND_FEEDER_MULTIFORMAT options SND_FEEDER_FULL_MULTIFORMAT options SND_FEEDER_RATE_HP options SND_PCM_64 options SND_OLDSTEREO # # Cardbus # # cbb: pci/CardBus bridge implementing YENTA interface # cardbus: CardBus slots device cbb device cardbus # # MMC/SD # # mmc MMC/SD bus # mmcsd MMC/SD memory card # sdhci Generic PCI SD Host Controller # rtsx Realtek SD card reader (RTS5209, RTS5227, ...) device mmc device mmcsd device sdhci device rtsx # # SMB bus # # System Management Bus support is provided by the 'smbus' device. # Access to the SMBus device is via the 'smb' device (/dev/smb*), # which is a child of the 'smbus' device. # # Supported devices: # smb standard I/O through /dev/smb* # # Supported SMB interfaces: # iicsmb I2C to SMB bridge with any iicbus interface # intpm Intel PIIX4 (82371AB, 82443MX) Power Management Unit # alpm Acer Aladdin-IV/V/Pro2 Power Management Unit # ichsmb Intel ICH SMBus controller chips (82801AA, 82801AB, 82801BA) # viapm VIA VT82C586B/596B/686A and VT8233 Power Management Unit # amdpm AMD 756 Power Management Unit # amdsmb AMD 8111 SMBus 2.0 Controller # nfpm NVIDIA nForce Power Management Unit # nfsmb NVIDIA nForce2/3/4 MCP SMBus 2.0 Controller # ismt Intel SMBus 2.0 controller chips (on Atom S1200, C2000) # device smbus # Bus support, required for smb below. 
device intpm options ENABLE_ALART # Control alarm on Intel intpm driver device alpm device ichsmb device viapm device amdpm device amdsmb device nfpm device nfsmb device ismt device smb # SMBus peripheral devices # # jedec_dimm Asset and temperature reporting for DDR3 and DDR4 DIMMs # device jedec_dimm # I2C Bus # # Philips i2c bus support is provided by the `iicbus' device. # # Supported devices: # ic i2c network interface # iic i2c standard io # iicsmb i2c to smb bridge. Allows i2c i/o with smb commands. # iicoc simple polling driver for OpenCores I2C controller # # Other: # iicbb generic I2C bit-banging code (needed by lpbb) # device iicbus # Bus support, required for ic/iic/iicsmb below. device iicbb # bitbang driver; implements i2c on a pair of gpio pins device ic device iic # userland access to i2c slave devices via ioctl(2) device iicsmb # smb over i2c bridge device iicoc # OpenCores I2C controller support # I2C bus multiplexer (mux) devices device iicmux # i2c mux core driver device iic_gpiomux # i2c mux hardware controlled via gpio pins device ltc430x # LTC4305 and LTC4306 i2c mux chips # I2C peripheral devices # device ad7418 # Analog Devices temp and voltage sensor device ads111x # Texas Instruments ADS101x and ADS111x ADCs device ds1307 # Dallas DS1307 RTC and compatible device ds13rtc # All Dallas/Maxim ds13xx chips device ds1672 # Dallas DS1672 RTC device ds3231 # Dallas DS3231 RTC + temperature device fan53555 # Fairchild Semi FAN53555/SYR82x Regulator device icee # AT24Cxxx and compatible EEPROMs device isl12xx # Intersil ISL12xx RTC device lm75 # LM75 compatible temperature sensor device nxprtc # NXP RTCs: PCA/PCF212x PCA/PCF85xx device rtc8583 # Epson RTC-8583 device s35390a # Seiko Instruments S-35390A RTC device sy8106a # Silergy Corp. SY8106A buck regulator # Parallel-Port Bus # # Parallel port bus support is provided by the `ppbus' device.
# Multiple devices may be attached to the parallel port, devices # are automatically probed and attached when found. # # Supported devices: # lpt Parallel Printer # plip Parallel network interface # ppi General-purpose I/O ("Geek Port") + IEEE1284 I/O # pps Pulse per second Timing Interface # lpbb Philips official parallel port I2C bit-banging interface # pcfclock Parallel port clock driver. # # Supported interfaces: # ppc ISA-bus parallel port interfaces. # options PPC_PROBE_CHIPSET # Enable chipset specific detection # (see flags in ppc(4)) options DEBUG_1284 # IEEE1284 signaling protocol debug options PERIPH_1284 # Makes your computer act as an IEEE1284 # compliant peripheral options DONTPROBE_1284 # Avoid boot detection of PnP parallel devices options LPT_DEBUG # Printer driver debug options PPC_DEBUG # Parallel chipset level debug options PLIP_DEBUG # Parallel network IP interface debug options PCFCLOCK_VERBOSE # Verbose pcfclock driver options PCFCLOCK_MAX_RETRIES=5 # Maximum read tries (default 10) device ppc envvar hint.ppc.0.at="isa" envvar hint.ppc.0.irq="7" device ppbus device lpt device plip device ppi device pps device lpbb device pcfclock # General Purpose I/O pins device dwgpio # Synopsys DesignWare APB GPIO Controller device gpio # gpio interfaces and bus support device gpiobacklight # sysctl control of gpio-based backlight device gpioiic # i2c via gpio bitbang device gpiokeys # kbd(4) glue for gpio-based key input device gpioled # led(4) gpio glue device gpiopower # event handler for gpio-based powerdown device gpiopps # Pulse per second input from gpio pin device gpioregulator # extres/regulator glue for gpio pin device gpiospi # SPI via gpio bitbang device gpioths # 1-wire temp/humidity sensor on gpio pin # Pulse width modulation device pwmbus # pwm interface and bus support device pwmc # userland control access to pwm outputs # # Etherswitch framework and drivers # # etherswitch The etherswitch(4) framework # miiproxy Proxy device for miibus(4) 
functionality # # Switch hardware support: # arswitch Atheros switches # ip17x IC+ 17x family switches # rtl8366rb Realtek RTL8366 switches # ukswitch Multi-PHY switches # device etherswitch device miiproxy device arswitch device ip17x device rtl8366rb device ukswitch # Kernel BOOTP support options BOOTP # Use BOOTP to obtain IP address/hostname # Requires NFSCL and NFS_ROOT options BOOTP_NFSROOT # NFS mount root filesystem using BOOTP info options BOOTP_NFSV3 # Use NFS v3 to NFS mount root options BOOTP_COMPAT # Workaround for broken bootp daemons. options BOOTP_WIRED_TO=fxp0 # Use interface fxp0 for BOOTP options BOOTP_BLOCKSIZE=8192 # Override NFS block size # # Enable software watchdog routines, even if hardware watchdog is present. # By default, the software watchdog timer is enabled only if no hardware watchdog # is present. # options SW_WATCHDOG # # Add the software deadlock resolver thread. # options DEADLKRES # # Disable swapping of stack pages. This option removes all # code which actually performs swapping, so it's not possible to turn # it back on at run-time. # # This is sometimes useful for systems which don't have any swap space # (see also sysctl "vm.disable_swapspace_pageouts") # #options NO_SWAPPING # Set the number of sf_bufs to allocate. sf_bufs are virtual buffers # for sendfile(2) that are used to map file VM pages, and normally # default to a quantity that is roughly 16*MAXUSERS+512. You would # typically want about 4 of these for each simultaneous file send. # options NSFBUFS=1024 # # Enable extra debugging code for locks. This stores the filename and # line of whatever acquired the lock in the lock itself, and changes a # number of function calls to pass around the relevant data. This is # not at all useful unless you are debugging lock code. Note that # modules should be recompiled as this option modifies KBI. # options DEBUG_LOCKS # # VirtIO support # # The virtio entry provides a generic bus for use by the device drivers.
# It must be combined with an interface that communicates with the host. # Multiple such interfaces are defined by the VirtIO specification # including PCI and MMIO. # device virtio # Generic VirtIO bus (required) device virtio_mmio # VirtIO MMIO Interface device virtio_pci # VirtIO PCI Interface device vtnet # VirtIO Ethernet device device virtio_balloon # VirtIO Memory Balloon device device virtio_blk # VirtIO Block device device virtio_console # VirtIO Console device device virtio_gpu # VirtIO GPU device device virtio_random # VirtIO Entropy device device virtio_scsi # VirtIO SCSI device ##################################################################### # HID support device hid # Generic HID support options HID_DEBUG # enable debug msgs device hidbus # HID bus device hidmap # HID to evdev mapping device hidraw # Raw access driver options HIDRAW_MAKE_UHID_ALIAS # install /dev/uhid alias device hconf # Multitouch configuration TLC device hcons # Consumer controls device hgame # Generic game controllers device hkbd # HID keyboard device hms # HID mouse device hmt # HID multitouch (MS-compatible) device hpen # Generic pen driver device hsctrl # System controls device ps4dshock # Sony PS4 DualShock 4 gamepad driver device xb360gp # XBox 360 gamepad driver ##################################################################### # USB support # UHCI controller device uhci # OHCI controller device ohci # EHCI controller device ehci # XHCI controller device xhci # SL811 Controller #device slhci # General USB code (mandatory for USB) device usb # # USB Double Bulk Pipe devices device udbp # USB temperature meter device ugold # USB LED device uled # Human Interface Device (anything with buttons and dials) device uhid # USB keyboard device ukbd # USB printer device ulpt # USB mass storage driver (Requires scbus and da) device umass # USB mass storage driver for device-side mode device usfs # USB support for Belkin F5U109 and Magic Control Technology serial adapters device 
umct # USB modem support device umodem # USB mouse device ums # USB touchpad(s) device atp device wsp # eGalax USB touch screen device uep # Diamond Rio 500 MP3 player device urio # HID-over-USB driver device usbhid # # USB serial support device ucom # USB support for 3G modem cards by Option, Novatel, Huawei and Sierra device u3g # USB support for Technologies ARK3116 based serial adapters device uark # USB support for Belkin F5U103 and compatible serial adapters device ubsa # USB support for serial adapters based on the FT8U100AX and FT8U232AM device uftdi # USB support for some Windows CE based serial communication. device uipaq # USB support for Prolific PL-2303 serial adapters device uplcom # USB support for Silicon Laboratories CP2101/CP2102 based USB serial adapters device uslcom # USB Visor and Palm devices device uvisor # USB serial support for DDI pocket's PHS device uvscom # # USB ethernet support device uether # ADMtek USB ethernet. Supports the LinkSys USB100TX, # the Billionton USB100, the Melco LU-ATX, the D-Link DSB-650TX # and the SMC 2202USB. Also works with the ADMtek AN986 Pegasus # eval board. device aue # ASIX Electronics AX88172 USB 2.0 ethernet driver. Used in the # LinkSys USB200M and various other adapters. device axe # ASIX Electronics AX88178A/AX88179 USB 2.0/3.0 gigabit ethernet driver. device axge # # Devices which communicate using Ethernet over USB, particularly # Communication Device Class (CDC) Ethernet specification. Supports # Sharp Zaurus PDAs, some DOCSIS cable modems and so on. device cdce # # CATC USB-EL1201A USB ethernet. Supports the CATC Netmate # and Netmate II, and the Belkin F5U111. device cue # # Kawasaki LSI ethernet. Supports the LinkSys USB10T, # Entrega USB-NET-E45, Peracom Ethernet Adapter, the # 3Com 3c19250, the ADS Technologies USB-10BT, the ATen UC10T, # the Netgear EA101, the D-Link DSB-650, the SMC 2102USB # and 2104USB, and the Corega USB-T. device kue # # RealTek RTL8150 USB to fast ethernet. 
Supports the Melco LUA-KTX # and the GREEN HOUSE GH-USB100B. device rue # # Davicom DM9601E USB to fast ethernet. Supports the Corega FEther USB-TXC. device udav # # RealTek RTL8152/RTL8153 USB Ethernet driver device ure # # Moschip MCS7730/MCS7840 USB to fast ethernet. Supports the Sitecom LN030. device mos # # HSxPA devices from Option N.V device uhso # Realtek RTL8188SU/RTL8191SU/RTL8192SU wireless driver device rsu # # Ralink Technology RT2501USB/RT2601USB wireless driver device rum # Ralink Technology RT2700U/RT2800U/RT3000U wireless driver device run # # Atheros AR5523 wireless driver device uath # # Conexant/Intersil PrismGT wireless driver device upgt # # Ralink Technology RT2500USB wireless driver device ural # # RNDIS USB ethernet driver device urndis # Realtek RTL8187B/L wireless driver device urtw # # ZyDas ZD1211/ZD1211B wireless driver device zyd # # Sierra USB wireless driver device usie # # debugging options for the USB subsystem # options USB_DEBUG options U3G_DEBUG # options for ukbd: options UKBD_DFLT_KEYMAP # specify the built-in keymap makeoptions UKBD_DFLT_KEYMAP=jp.106 # options for uplcom: options UPLCOM_INTR_INTERVAL=100 # interrupt pipe interval # in milliseconds # options for uvscom: options UVSCOM_DEFAULT_OPKTSIZE=8 # default output packet size options UVSCOM_INTR_INTERVAL=100 # interrupt pipe interval # in milliseconds ##################################################################### # FireWire support device firewire # FireWire bus code device sbp # SCSI over Firewire (Requires scbus and da) device sbp_targ # SBP-2 Target mode (Requires scbus and targ) device fwe # Ethernet over FireWire (non-standard!) 
device fwip # IP over FireWire (RFC2734 and RFC3146) ##################################################################### # dcons support (Dumb Console Device) device dcons # dumb console driver device dcons_crom # FireWire attachment options DCONS_BUF_SIZE=16384 # buffer size options DCONS_POLL_HZ=100 # polling rate options DCONS_FORCE_CONSOLE=0 # force to be the primary console options DCONS_FORCE_GDB=1 # force to be the gdb device ##################################################################### # crypto subsystem # # This is a port of the OpenBSD crypto framework. Include this when # configuring IPSEC and when you have a h/w crypto device to accelerate # user applications that link to OpenSSL. # # Drivers are ports from OpenBSD with some simple enhancements that have # been fed back to OpenBSD. device crypto # core crypto support # Only install the cryptodev device if you are running tests, or know # specifically why you need it. In most cases, it is not needed and # will make things slower. device cryptodev # /dev/crypto for access to h/w device rndtest # FIPS 140-2 entropy tester device ccr # Chelsio T6 device hifn # Hifn 7951, 7781, etc. options HIFN_DEBUG # enable debugging support: hw.hifn.debug options HIFN_RNDTEST # enable rndtest support device safe # SafeNet 1141 options SAFE_DEBUG # enable debugging support: hw.safe.debug options SAFE_RNDTEST # enable rndtest support ##################################################################### # # Embedded system options: # # An embedded system might want to run something other than init. options INIT_PATH=/sbin/init:/rescue/init # Debug options options BUS_DEBUG # enable newbus debugging options DEBUG_VFS_LOCKS # enable VFS lock debugging options SOCKBUF_DEBUG # enable sockbuf last record/mb tail checking options IFMEDIA_DEBUG # enable debugging in net/if_media.c # # Verbose SYSINIT # # Make the SYSINIT process performed by mi_startup() verbose. This is very # useful when porting to a new architecture. 
If DDB is also enabled, this # will print function names instead of addresses. If defined with a value # of zero, the verbose code is compiled-in but disabled by default, and can # be enabled with the debug.verbose_sysinit=1 tunable. options VERBOSE_SYSINIT ##################################################################### # SYSV IPC KERNEL PARAMETERS # # Maximum number of System V semaphores that can be used on the system at # one time. options SEMMNI=11 # Total number of semaphores system wide options SEMMNS=61 # Total number of undo structures in system options SEMMNU=31 # Maximum number of System V semaphores that can be used by a single process # at one time. options SEMMSL=61 # Maximum number of operations that can be outstanding on a single System V # semaphore at one time. options SEMOPM=101 # Maximum number of undo operations that can be outstanding on a single # System V semaphore at one time. options SEMUME=11 # Maximum number of shared memory pages system wide. options SHMALL=1025 # Maximum size, in bytes, of a single System V shared memory region. options SHMMAX=(SHMMAXPGS*PAGE_SIZE+1) options SHMMAXPGS=1025 # Minimum size, in bytes, of a single System V shared memory region. options SHMMIN=2 # Maximum number of shared memory regions that can be used on the system # at one time. options SHMMNI=33 # Maximum number of System V shared memory regions that can be attached to # a single process at one time. options SHMSEG=9 # Set the amount of time (in seconds) the system will wait before # rebooting automatically when a kernel panic occurs. If set to (-1), # the system will wait indefinitely until a key is pressed on the # console. options PANIC_REBOOT_WAIT_TIME=16 # Attempt to bypass the buffer cache and put data directly into the # userland buffer for read operation when O_DIRECT flag is set on the # file. Both offset and length of the read operation must be # multiples of the physical media sector size. 
# options DIRECTIO # Specify a lower limit for the number of swap I/O buffers. They are # (among other things) used when bypassing the buffer cache due to # DIRECTIO kernel option enabled and O_DIRECT flag set on file. # options NSWBUF_MIN=120 ##################################################################### # More undocumented options for linting. # Note that documenting these is not considered an affront. options CAM_DEBUG_DELAY options DEBUG # Kernel filelock debugging. options LOCKF_DEBUG # System V compatible message queues # Please note that the values provided here are used to test kernel # building. The defaults in the sources provide almost the same numbers. # MSGSSZ must be a power of 2 between 8 and 1024. options MSGMNB=2049 # Max number of chars in queue options MSGMNI=41 # Max number of message queue identifiers options MSGSEG=2049 # Max number of message segments options MSGSSZ=16 # Size of a message segment options MSGTQL=41 # Max number of messages in system options NBUF=512 # Number of buffer headers options SC_DEBUG_LEVEL=5 # Syscons debug level options SC_RENDER_DEBUG # syscons rendering debugging options VFS_BIO_DEBUG # VFS buffer I/O debugging options KSTACK_MAX_PAGES=32 # Maximum pages to give the kernel stack options KSTACK_USAGE_PROF # Adaptec Array Controller driver options options AAC_DEBUG # Debugging levels: # 0 - quiet, only emit warnings # 1 - noisy, emit major function # points and things done # 2 - extremely noisy, emit trace # items in loops, etc. # Resource Accounting options RACCT # Resource Limits options RCTL # Yet more undocumented options for linting. options MAXFILES=999 # Random number generator # Alternative algorithm. #options RANDOM_FENESTRASX # Allow the CSPRNG algorithm to be loaded as a module. #options RANDOM_LOADABLE # Select this to allow high-rate but potentially expensive # harvesting of Slab-Allocator entropy. In very high-rate # situations the value of doing this is dubious at best. 
options RANDOM_ENABLE_UMA # slab allocator # Select this to allow high-rate but potentially expensive # harvesting of the m_next pointer in the mbuf. Note that # the m_next pointer is NULL except when receiving > 4K # jumbo frames or sustained bursts by way of LRO. Thus in # the common case it is stirring zero into the entropy # pool. In cases where it is not NULL it is pointing to one # of a small (in the thousands to 10s of thousands) number # of 256 byte aligned mbufs. Hence it is, even in the best # case, a poor source of entropy. And in the absence of actual # runtime analysis of entropy collection it may mislead the user # into believing that substantially more entropy is being collected # than in fact is - leading to a different class of security # risk. In high packet rate situations ethernet entropy # collection is also very expensive, possibly leading to as # much as a 50% drop in packets received. # This option is present to maintain backwards compatibility # if desired, however it cannot be recommended for use in any # environment. options RANDOM_ENABLE_ETHER # ether_input # Module to enable execution of applications via emulators like QEMU options IMGACT_BINMISC # zlib I/O stream support # This enables support for compressed core dumps. options GZIO # zstd support # This enables support for Zstd compressed core dumps, GEOM_UZIP images, # and is required by zfs if statically linked. options ZSTDIO # BHND(4) drivers options BHND_LOGLEVEL # Logging threshold level # evdev interface device evdev # input event device support options EVDEV_SUPPORT # evdev support in legacy drivers options EVDEV_DEBUG # enable event debug msgs device uinput # install /dev/uinput cdev options UINPUT_DEBUG # enable uinput debug msgs # Encrypted kernel crash dumps. options EKCD # Serial Peripheral Interface (SPI) support. device spibus # Bus support.
device at45d # DataFlash driver device cqspi # device mx25l # SPIFlash driver device n25q # device spigen # Generic access to SPI devices from userland. # Enable legacy /dev/spigenN name aliases for /dev/spigenX.Y devices. options SPIGEN_LEGACY_CDEVNAME # legacy device names for spigen # Compression supports. device zlib # gzip/zlib compression/decompression library device xz # xz_embedded LZMA de-compression library # Kernel support for stats(3). options STATS # File system monitoring device filemon # file monitoring for make(1) meta-mode diff --git a/sys/dev/cxgbe/adapter.h b/sys/dev/cxgbe/adapter.h index 3bf4f666ce7d..d3820245837a 100644 --- a/sys/dev/cxgbe/adapter.h +++ b/sys/dev/cxgbe/adapter.h @@ -1,1597 +1,1601 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2011 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #ifndef __T4_ADAPTER_H__ #define __T4_ADAPTER_H__ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "offload.h" #include "t4_ioctl.h" #include "common/t4_msg.h" #include "firmware/t4fw_interface.h" #define KTR_CXGBE KTR_SPARE3 MALLOC_DECLARE(M_CXGBE); #define CXGBE_UNIMPLEMENTED(s) \ panic("%s (%s, line %d) not implemented yet.", s, __FILE__, __LINE__) /* * Same as LIST_HEAD from queue.h. This is to avoid conflict with LinuxKPI's * LIST_HEAD when building iw_cxgbe. */ #define CXGBE_LIST_HEAD(name, type) \ struct name { \ struct type *lh_first; /* first element */ \ } #ifndef SYSCTL_ADD_UQUAD #define SYSCTL_ADD_UQUAD SYSCTL_ADD_QUAD #define sysctl_handle_64 sysctl_handle_quad #define CTLTYPE_U64 CTLTYPE_QUAD #endif SYSCTL_DECL(_hw_cxgbe); struct adapter; typedef struct adapter adapter_t; enum { /* * All ingress queues use this entry size. Note that the firmware event * queue and any iq expecting CPL_RX_PKT in the descriptor needs this to * be at least 64. 
*/ IQ_ESIZE = 64, /* Default queue sizes for all kinds of ingress queues */ FW_IQ_QSIZE = 256, RX_IQ_QSIZE = 1024, /* All egress queues use this entry size */ EQ_ESIZE = 64, /* Default queue sizes for all kinds of egress queues */ CTRL_EQ_QSIZE = 1024, TX_EQ_QSIZE = 1024, +#if MJUMPAGESIZE != MCLBYTES SW_ZONE_SIZES = 4, /* cluster, jumbop, jumbo9k, jumbo16k */ +#else + SW_ZONE_SIZES = 3, /* cluster, jumbo9k, jumbo16k */ +#endif CL_METADATA_SIZE = CACHE_LINE_SIZE, SGE_MAX_WR_NDESC = SGE_MAX_WR_LEN / EQ_ESIZE, /* max WR size in desc */ TX_SGL_SEGS = 39, TX_SGL_SEGS_TSO = 38, TX_SGL_SEGS_VM = 38, TX_SGL_SEGS_VM_TSO = 37, TX_SGL_SEGS_EO_TSO = 30, /* XXX: lower for IPv6. */ TX_SGL_SEGS_VXLAN_TSO = 37, TX_WR_FLITS = SGE_MAX_WR_LEN / 8 }; enum { /* adapter intr_type */ INTR_INTX = (1 << 0), INTR_MSI = (1 << 1), INTR_MSIX = (1 << 2) }; enum { XGMAC_MTU = (1 << 0), XGMAC_PROMISC = (1 << 1), XGMAC_ALLMULTI = (1 << 2), XGMAC_VLANEX = (1 << 3), XGMAC_UCADDR = (1 << 4), XGMAC_MCADDRS = (1 << 5), XGMAC_ALL = 0xffff }; enum { /* flags understood by begin_synchronized_op */ HOLD_LOCK = (1 << 0), SLEEP_OK = (1 << 1), INTR_OK = (1 << 2), /* flags understood by end_synchronized_op */ LOCK_HELD = HOLD_LOCK, }; enum { /* adapter flags. synch_op or adapter_lock. */ FULL_INIT_DONE = (1 << 0), FW_OK = (1 << 1), CHK_MBOX_ACCESS = (1 << 2), MASTER_PF = (1 << 3), BUF_PACKING_OK = (1 << 6), IS_VF = (1 << 7), KERN_TLS_ON = (1 << 8), /* HW is configured for KERN_TLS */ CXGBE_BUSY = (1 << 9), /* adapter error_flags. reg_lock for HW_OFF_LIMITS, atomics for the rest. */ ADAP_STOPPED = (1 << 0), /* Adapter has been stopped. */ ADAP_FATAL_ERR = (1 << 1), /* Encountered a fatal error. */ HW_OFF_LIMITS = (1 << 2), /* off limits to all except reset_thread */ ADAP_CIM_ERR = (1 << 3), /* Error was related to FW/CIM. */ /* port flags */ HAS_TRACEQ = (1 << 3), FIXED_IFMEDIA = (1 << 4), /* ifmedia list doesn't change. 
*/ /* VI flags */ VI_DETACHING = (1 << 0), VI_INIT_DONE = (1 << 1), /* 1 << 2 is unused, was VI_SYSCTL_CTX */ TX_USES_VM_WR = (1 << 3), VI_SKIP_STATS = (1 << 4), /* adapter debug_flags */ DF_DUMP_MBOX = (1 << 0), /* Log all mbox cmd/rpl. */ DF_LOAD_FW_ANYTIME = (1 << 1), /* Allow LOAD_FW after init */ DF_DISABLE_TCB_CACHE = (1 << 2), /* Disable TCB cache (T6+) */ DF_DISABLE_CFG_RETRY = (1 << 3), /* Disable fallback config */ DF_VERBOSE_SLOWINTR = (1 << 4), /* Chatty slow intr handler */ }; #define IS_DETACHING(vi) ((vi)->flags & VI_DETACHING) #define SET_DETACHING(vi) do {(vi)->flags |= VI_DETACHING;} while (0) #define CLR_DETACHING(vi) do {(vi)->flags &= ~VI_DETACHING;} while (0) #define IS_BUSY(sc) ((sc)->flags & CXGBE_BUSY) #define SET_BUSY(sc) do {(sc)->flags |= CXGBE_BUSY;} while (0) #define CLR_BUSY(sc) do {(sc)->flags &= ~CXGBE_BUSY;} while (0) struct vi_info { device_t dev; struct port_info *pi; struct adapter *adapter; if_t ifp; struct pfil_head *pfil; unsigned long flags; int if_flags; uint16_t *rss, *nm_rss; uint16_t viid; /* opaque VI identifier */ uint16_t smt_idx; uint16_t vin; uint8_t vfvld; int16_t xact_addr_filt;/* index of exact MAC address filter */ uint16_t rss_size; /* size of VI's RSS table slice */ uint16_t rss_base; /* start of VI's RSS table slice */ int hashen; int nintr; int first_intr; /* These need to be int as they are used in sysctl */ int ntxq; /* # of tx queues */ int first_txq; /* index of first tx queue */ int rsrv_noflowq; /* Reserve queue 0 for non-flowid packets */ int nrxq; /* # of rx queues */ int first_rxq; /* index of first rx queue */ int nofldtxq; /* # of offload tx queues */ int first_ofld_txq; /* index of first offload tx queue */ int nofldrxq; /* # of offload rx queues */ int first_ofld_rxq; /* index of first offload rx queue */ int nnmtxq; int first_nm_txq; int nnmrxq; int first_nm_rxq; int tmr_idx; int ofld_tmr_idx; int pktc_idx; int ofld_pktc_idx; int qsize_rxq; int qsize_txq; struct timeval last_refreshed; struct 
fw_vi_stats_vf stats; struct mtx tick_mtx; struct callout tick; struct sysctl_ctx_list ctx; struct sysctl_oid *rxq_oid; struct sysctl_oid *txq_oid; struct sysctl_oid *nm_rxq_oid; struct sysctl_oid *nm_txq_oid; struct sysctl_oid *ofld_rxq_oid; struct sysctl_oid *ofld_txq_oid; uint8_t hw_addr[ETHER_ADDR_LEN]; /* factory MAC address, won't change */ u_int txq_rr; u_int rxq_rr; }; struct tx_ch_rl_params { enum fw_sched_params_rate ratemode; /* %port (REL) or kbps (ABS) */ uint32_t maxrate; }; /* CLRL state */ enum clrl_state { CS_UNINITIALIZED = 0, CS_PARAMS_SET, /* sw parameters have been set. */ CS_HW_UPDATE_REQUESTED, /* async HW update requested. */ CS_HW_UPDATE_IN_PROGRESS, /* sync hw update in progress. */ CS_HW_CONFIGURED /* configured in the hardware. */ }; /* CLRL flags */ enum { CF_USER = (1 << 0), /* was configured by driver ioctl. */ }; struct tx_cl_rl_params { enum clrl_state state; int refcount; uint8_t flags; enum fw_sched_params_rate ratemode; /* %port REL or ABS value */ enum fw_sched_params_unit rateunit; /* kbps or pps (when ABS) */ enum fw_sched_params_mode mode; /* aggr or per-flow */ uint32_t maxrate; uint16_t pktsize; uint16_t burstsize; }; /* Tx scheduler parameters for a channel/port */ struct tx_sched_params { /* Channel Rate Limiter */ struct tx_ch_rl_params ch_rl; /* Class WRR */ /* XXX */ /* Class Rate Limiter (including the default pktsize and burstsize). 
*/ int pktsize; int burstsize; struct tx_cl_rl_params cl_rl[]; }; struct port_info { device_t dev; struct adapter *adapter; struct vi_info *vi; int nvi; int up_vis; int uld_vis; bool vxlan_tcam_entry; struct tx_sched_params *sched_params; struct mtx pi_lock; char lockname[16]; unsigned long flags; uint8_t lport; /* associated offload logical port */ int8_t mdio_addr; uint8_t port_type; uint8_t mod_type; uint8_t port_id; uint8_t tx_chan; /* tx TP c-channel */ uint8_t rx_chan; /* rx TP c-channel */ uint8_t mps_bg_map; /* rx MPS buffer group bitmap */ uint8_t rx_e_chan_map; /* rx TP e-channel bitmap */ struct link_config link_cfg; struct ifmedia media; struct port_stats stats; u_int tnl_cong_drops; u_int tx_parse_error; int fcs_reg; uint64_t fcs_base; struct sysctl_ctx_list ctx; }; #define IS_MAIN_VI(vi) ((vi) == &((vi)->pi->vi[0])) struct cluster_metadata { uma_zone_t zone; caddr_t cl; u_int refcount; }; struct fl_sdesc { caddr_t cl; uint16_t nmbuf; /* # of driver originated mbufs with ref on cluster */ int16_t moff; /* offset of metadata from cl */ uint8_t zidx; }; struct tx_desc { __be64 flit[8]; }; struct tx_sdesc { struct mbuf *m; /* m_nextpkt linked chain of frames */ uint8_t desc_used; /* # of hardware descriptors used by the WR */ }; #define IQ_PAD (IQ_ESIZE - sizeof(struct rsp_ctrl) - sizeof(struct rss_header)) struct iq_desc { struct rss_header rss; uint8_t cpl[IQ_PAD]; struct rsp_ctrl rsp; }; #undef IQ_PAD CTASSERT(sizeof(struct iq_desc) == IQ_ESIZE); enum { /* iq type */ IQ_OTHER = FW_IQ_IQTYPE_OTHER, IQ_ETH = FW_IQ_IQTYPE_NIC, IQ_OFLD = FW_IQ_IQTYPE_OFLD, /* iq flags */ IQ_SW_ALLOCATED = (1 << 0), /* sw resources allocated */ IQ_HAS_FL = (1 << 1), /* iq associated with a freelist */ IQ_RX_TIMESTAMP = (1 << 2), /* provide the SGE rx timestamp */ IQ_LRO_ENABLED = (1 << 3), /* iq is an eth rxq with LRO enabled */ IQ_ADJ_CREDIT = (1 << 4), /* hw is off by 1 credit for this iq */ IQ_HW_ALLOCATED = (1 << 5), /* fw/hw resources allocated */ /* iq state */ 
IQS_DISABLED = 0, IQS_BUSY = 1, IQS_IDLE = 2, /* netmap related flags */ NM_OFF = 0, NM_ON = 1, NM_BUSY = 2, }; enum { CPL_COOKIE_RESERVED = 0, CPL_COOKIE_FILTER, CPL_COOKIE_DDP0, CPL_COOKIE_DDP1, CPL_COOKIE_TOM, CPL_COOKIE_HASHFILTER, CPL_COOKIE_ETHOFLD, CPL_COOKIE_KERN_TLS, NUM_CPL_COOKIES = 8 /* Limited by M_COOKIE. Do not increase. */ }; struct sge_iq; struct rss_header; typedef int (*cpl_handler_t)(struct sge_iq *, const struct rss_header *, struct mbuf *); typedef int (*an_handler_t)(struct sge_iq *, const struct rsp_ctrl *); typedef int (*fw_msg_handler_t)(struct adapter *, const __be64 *); /* * Ingress Queue: T4 is producer, driver is consumer. */ struct sge_iq { uint16_t flags; uint8_t qtype; volatile int state; struct adapter *adapter; struct iq_desc *desc; /* KVA of descriptor ring */ int8_t intr_pktc_idx; /* packet count threshold index */ uint8_t gen; /* generation bit */ uint8_t intr_params; /* interrupt holdoff parameters */ int8_t cong_drop; /* congestion drop settings for the queue */ uint16_t qsize; /* size (# of entries) of the queue */ uint16_t sidx; /* index of the entry with the status page */ uint16_t cidx; /* consumer index */ uint16_t cntxt_id; /* SGE context id for the iq */ uint16_t abs_id; /* absolute SGE id for the iq */ int16_t intr_idx; /* interrupt used by the queue */ STAILQ_ENTRY(sge_iq) link; bus_dma_tag_t desc_tag; bus_dmamap_t desc_map; bus_addr_t ba; /* bus address of descriptor ring */ }; enum { /* eq type */ EQ_CTRL = 1, EQ_ETH = 2, EQ_OFLD = 3, /* eq flags */ EQ_SW_ALLOCATED = (1 << 0), /* sw resources allocated */ EQ_HW_ALLOCATED = (1 << 1), /* hw/fw resources allocated */ EQ_ENABLED = (1 << 3), /* open for business */ EQ_QFLUSH = (1 << 4), /* if_qflush in progress */ }; /* Listed in order of preference. Update t4_sysctls too if you change these */ enum {DOORBELL_UDB, DOORBELL_WCWR, DOORBELL_UDBWC, DOORBELL_KDB}; /* * Egress Queue: driver is producer, T4 is consumer. 
* * Note: A free list is an egress queue (driver produces the buffers and T4 * consumes them) but it's special enough to have its own struct (see sge_fl). */ struct sge_eq { unsigned int flags; /* MUST be first */ unsigned int cntxt_id; /* SGE context id for the eq */ unsigned int abs_id; /* absolute SGE id for the eq */ uint8_t type; /* EQ_CTRL/EQ_ETH/EQ_OFLD */ uint8_t doorbells; uint8_t port_id; /* port_id of the port associated with the eq */ uint8_t tx_chan; /* tx channel used by the eq */ struct mtx eq_lock; struct tx_desc *desc; /* KVA of descriptor ring */ volatile uint32_t *udb; /* KVA of doorbell (lies within BAR2) */ u_int udb_qid; /* relative qid within the doorbell page */ uint16_t sidx; /* index of the entry with the status page */ uint16_t cidx; /* consumer idx (desc idx) */ uint16_t pidx; /* producer idx (desc idx) */ uint16_t equeqidx; /* EQUEQ last requested at this pidx */ uint16_t dbidx; /* pidx of the most recent doorbell */ uint16_t iqid; /* cached iq->cntxt_id (see iq below) */ volatile u_int equiq; /* EQUIQ outstanding */ struct sge_iq *iq; /* iq that receives egr_update for the eq */ bus_dma_tag_t desc_tag; bus_dmamap_t desc_map; bus_addr_t ba; /* bus address of descriptor ring */ char lockname[16]; }; struct rx_buf_info { uma_zone_t zone; /* zone that this cluster comes from */ uint16_t size1; /* same as size of cluster: 2K/4K/9K/16K. * hwsize[hwidx1] = size1. No spare. */ uint16_t size2; /* hwsize[hwidx2] = size2. * spare in cluster = size1 - size2. 
*/ int8_t hwidx1; /* SGE bufsize idx for size1 */ int8_t hwidx2; /* SGE bufsize idx for size2 */ uint8_t type; /* EXT_xxx type of the cluster */ }; enum { NUM_MEMWIN = 3, MEMWIN0_APERTURE = 2048, MEMWIN0_BASE = 0x1b800, MEMWIN1_APERTURE = 32768, MEMWIN1_BASE = 0x28000, MEMWIN2_APERTURE_T4 = 65536, MEMWIN2_BASE_T4 = 0x30000, MEMWIN2_APERTURE_T5 = 128 * 1024, MEMWIN2_BASE_T5 = 0x60000, }; struct memwin { struct rwlock mw_lock __aligned(CACHE_LINE_SIZE); uint32_t mw_base; /* constant after setup_memwin */ uint32_t mw_aperture; /* ditto */ uint32_t mw_curpos; /* protected by mw_lock */ }; enum { FL_STARVING = (1 << 0), /* on the adapter's list of starving fl's */ FL_DOOMED = (1 << 1), /* about to be destroyed */ FL_BUF_PACKING = (1 << 2), /* buffer packing enabled */ FL_BUF_RESUME = (1 << 3), /* resume from the middle of the frame */ }; #define FL_RUNNING_LOW(fl) \ (IDXDIFF(fl->dbidx * 8, fl->cidx, fl->sidx * 8) <= fl->lowat) #define FL_NOT_RUNNING_LOW(fl) \ (IDXDIFF(fl->dbidx * 8, fl->cidx, fl->sidx * 8) >= 2 * fl->lowat) struct sge_fl { struct mtx fl_lock; __be64 *desc; /* KVA of descriptor ring, ptr to addresses */ struct fl_sdesc *sdesc; /* KVA of software descriptor ring */ uint16_t zidx; /* refill zone idx */ uint16_t safe_zidx; uint16_t lowat; /* # of buffers <= this means fl needs help */ int flags; uint16_t buf_boundary; /* The 16b idx all deal with hw descriptors */ uint16_t dbidx; /* hw pidx after last doorbell */ uint16_t sidx; /* index of status page */ volatile uint16_t hw_cidx; /* The 32b idx are all buffer idx, not hardware descriptor idx */ uint32_t cidx; /* consumer index */ uint32_t pidx; /* producer index */ uint32_t dbval; u_int rx_offset; /* offset in fl buf (when buffer packing) */ volatile uint32_t *udb; uint64_t cl_allocated; /* # of clusters allocated */ uint64_t cl_recycled; /* # of clusters recycled */ uint64_t cl_fast_recycled; /* # of clusters recycled (fast) */ /* These 3 are valid when FL_BUF_RESUME is set, stale otherwise. 
*/ struct mbuf *m0; struct mbuf **pnext; u_int remaining; uint16_t qsize; /* # of hw descriptors (status page included) */ uint16_t cntxt_id; /* SGE context id for the freelist */ TAILQ_ENTRY(sge_fl) link; /* All starving freelists */ bus_dma_tag_t desc_tag; bus_dmamap_t desc_map; char lockname[16]; bus_addr_t ba; /* bus address of descriptor ring */ }; struct mp_ring; struct txpkts { uint8_t wr_type; /* type 0 or type 1 */ uint8_t npkt; /* # of packets in this work request */ uint8_t len16; /* # of 16B pieces used by this work request */ uint8_t score; uint8_t max_npkt; /* maximum number of packets allowed */ uint16_t plen; /* total payload (sum of all packets) */ /* straight from fw_eth_tx_pkts_vm_wr. */ __u8 ethmacdst[6]; __u8 ethmacsrc[6]; __be16 ethtype; __be16 vlantci; struct mbuf *mb[15]; }; /* txq: SGE egress queue + what's needed for Ethernet NIC */ struct sge_txq { struct sge_eq eq; /* MUST be first */ if_t ifp; /* the interface this txq belongs to */ struct mp_ring *r; /* tx software ring */ struct tx_sdesc *sdesc; /* KVA of software descriptor ring */ struct sglist *gl; __be32 cpl_ctrl0; /* for convenience */ int tc_idx; /* traffic class */ uint64_t last_tx; /* cycle count when eth_tx was last called */ struct txpkts txp; struct task tx_reclaim_task; /* stats for common events first */ uint64_t txcsum; /* # of times hardware assisted with checksum */ uint64_t tso_wrs; /* # of TSO work requests */ uint64_t vlan_insertion;/* # of times VLAN tag was inserted */ uint64_t imm_wrs; /* # of work requests with immediate data */ uint64_t sgl_wrs; /* # of work requests with direct SGL */ uint64_t txpkt_wrs; /* # of txpkt work requests (not coalesced) */ uint64_t txpkts0_wrs; /* # of type0 coalesced tx work requests */ uint64_t txpkts1_wrs; /* # of type1 coalesced tx work requests */ uint64_t txpkts0_pkts; /* # of frames in type0 coalesced tx WRs */ uint64_t txpkts1_pkts; /* # of frames in type1 coalesced tx WRs */ uint64_t txpkts_flush; /* # of times txp had to 
be sent by tx_update */ uint64_t raw_wrs; /* # of raw work requests (alloc_wr_mbuf) */ uint64_t vxlan_tso_wrs; /* # of VXLAN TSO work requests */ uint64_t vxlan_txcsum; uint64_t kern_tls_records; uint64_t kern_tls_short; uint64_t kern_tls_partial; uint64_t kern_tls_full; uint64_t kern_tls_octets; uint64_t kern_tls_waste; uint64_t kern_tls_options; uint64_t kern_tls_header; uint64_t kern_tls_fin; uint64_t kern_tls_fin_short; uint64_t kern_tls_cbc; uint64_t kern_tls_gcm; /* stats for not-that-common events */ /* Optional scratch space for constructing work requests. */ uint8_t ss[SGE_MAX_WR_LEN] __aligned(16); } __aligned(CACHE_LINE_SIZE); /* rxq: SGE ingress queue + SGE free list + miscellaneous items */ struct sge_rxq { struct sge_iq iq; /* MUST be first */ struct sge_fl fl; /* MUST follow iq */ if_t ifp; /* the interface this rxq belongs to */ struct lro_ctrl lro; /* LRO state */ /* stats for common events first */ uint64_t rxcsum; /* # of times hardware assisted with checksum */ uint64_t vlan_extraction;/* # of times VLAN tag was extracted */ uint64_t vxlan_rxcsum; /* stats for not-that-common events */ } __aligned(CACHE_LINE_SIZE); static inline struct sge_rxq * iq_to_rxq(struct sge_iq *iq) { return (__containerof(iq, struct sge_rxq, iq)); } /* ofld_rxq: SGE ingress queue + SGE free list + miscellaneous items */ struct sge_ofld_rxq { struct sge_iq iq; /* MUST be first */ struct sge_fl fl; /* MUST follow iq */ counter_u64_t rx_iscsi_ddp_setup_ok; counter_u64_t rx_iscsi_ddp_setup_error; uint64_t rx_iscsi_ddp_pdus; uint64_t rx_iscsi_ddp_octets; uint64_t rx_iscsi_fl_pdus; uint64_t rx_iscsi_fl_octets; uint64_t rx_iscsi_padding_errors; uint64_t rx_iscsi_header_digest_errors; uint64_t rx_iscsi_data_digest_errors; uint64_t rx_aio_ddp_jobs; uint64_t rx_aio_ddp_octets; u_long rx_toe_tls_records; u_long rx_toe_tls_octets; u_long rx_toe_ddp_octets; counter_u64_t ddp_buffer_alloc; counter_u64_t ddp_buffer_reuse; counter_u64_t ddp_buffer_free; } __aligned(CACHE_LINE_SIZE); 
static inline struct sge_ofld_rxq * iq_to_ofld_rxq(struct sge_iq *iq) { return (__containerof(iq, struct sge_ofld_rxq, iq)); } struct wrqe { STAILQ_ENTRY(wrqe) link; struct sge_wrq *wrq; int wr_len; char wr[] __aligned(16); }; struct wrq_cookie { TAILQ_ENTRY(wrq_cookie) link; int ndesc; int pidx; }; /* * wrq: SGE egress queue that is given prebuilt work requests. Control queues * are of this type. */ struct sge_wrq { struct sge_eq eq; /* MUST be first */ struct adapter *adapter; struct task wrq_tx_task; /* Tx desc reserved but WR not "committed" yet. */ TAILQ_HEAD(wrq_incomplete_wrs , wrq_cookie) incomplete_wrs; /* List of WRs ready to go out as soon as descriptors are available. */ STAILQ_HEAD(, wrqe) wr_list; u_int nwr_pending; u_int ndesc_needed; /* stats for common events first */ uint64_t tx_wrs_direct; /* # of WRs written directly to desc ring. */ uint64_t tx_wrs_ss; /* # of WRs copied from scratch space. */ uint64_t tx_wrs_copied; /* # of WRs queued and copied to desc ring. */ /* stats for not-that-common events */ /* * Scratch space for work requests that wrap around after reaching the * status page, and some information about the last WR that used it. */ uint16_t ss_pidx; uint16_t ss_len; uint8_t ss[SGE_MAX_WR_LEN]; } __aligned(CACHE_LINE_SIZE); /* ofld_txq: SGE egress queue + miscellaneous items */ struct sge_ofld_txq { struct sge_wrq wrq; counter_u64_t tx_iscsi_pdus; counter_u64_t tx_iscsi_octets; counter_u64_t tx_iscsi_iso_wrs; counter_u64_t tx_aio_jobs; counter_u64_t tx_aio_octets; counter_u64_t tx_toe_tls_records; counter_u64_t tx_toe_tls_octets; } __aligned(CACHE_LINE_SIZE); #define INVALID_NM_RXQ_CNTXT_ID ((uint16_t)(-1)) struct sge_nm_rxq { /* Items used by the driver rx ithread are in this cacheline. 
*/ volatile int nm_state __aligned(CACHE_LINE_SIZE); /* NM_OFF, NM_ON, or NM_BUSY */ u_int nid; /* netmap ring # for this queue */ struct vi_info *vi; struct iq_desc *iq_desc; uint16_t iq_abs_id; uint16_t iq_cntxt_id; uint16_t iq_cidx; uint16_t iq_sidx; uint8_t iq_gen; uint32_t fl_sidx; /* Items used by netmap rxsync are in this cacheline. */ __be64 *fl_desc __aligned(CACHE_LINE_SIZE); uint16_t fl_cntxt_id; uint32_t fl_pidx; uint32_t fl_sidx2; /* copy of fl_sidx */ uint32_t fl_db_val; u_int fl_db_saved; u_int fl_db_threshold; /* in descriptors */ u_int fl_hwidx:4; /* * fl_cidx is used by both the ithread and rxsync, the rest are not used * in the rx fast path. */ uint32_t fl_cidx __aligned(CACHE_LINE_SIZE); bus_dma_tag_t iq_desc_tag; bus_dmamap_t iq_desc_map; bus_addr_t iq_ba; int intr_idx; bus_dma_tag_t fl_desc_tag; bus_dmamap_t fl_desc_map; bus_addr_t fl_ba; }; #define INVALID_NM_TXQ_CNTXT_ID ((u_int)(-1)) struct sge_nm_txq { struct tx_desc *desc; uint16_t cidx; uint16_t pidx; uint16_t sidx; uint16_t equiqidx; /* EQUIQ last requested at this pidx */ uint16_t equeqidx; /* EQUEQ last requested at this pidx */ uint16_t dbidx; /* pidx of the most recent doorbell */ uint8_t doorbells; volatile uint32_t *udb; u_int udb_qid; u_int cntxt_id; __be32 cpl_ctrl0; /* for convenience */ __be32 op_pkd; /* ditto */ u_int nid; /* netmap ring # for this queue */ /* infrequently used items after this */ bus_dma_tag_t desc_tag; bus_dmamap_t desc_map; bus_addr_t ba; int iqidx; } __aligned(CACHE_LINE_SIZE); struct sge { int nrxq; /* total # of Ethernet rx queues */ int ntxq; /* total # of Ethernet tx queues */ int nofldrxq; /* total # of TOE rx queues */ int nofldtxq; /* total # of TOE tx queues */ int nnmrxq; /* total # of netmap rx queues */ int nnmtxq; /* total # of netmap tx queues */ int niq; /* total # of ingress queues */ int neq; /* total # of egress queues */ struct sge_iq fwq; /* Firmware event queue */ struct sge_wrq *ctrlq; /* Control queues */ struct sge_txq *txq; /* NIC 
tx queues */ struct sge_rxq *rxq; /* NIC rx queues */ struct sge_ofld_txq *ofld_txq; /* TOE tx queues */ struct sge_ofld_rxq *ofld_rxq; /* TOE rx queues */ struct sge_nm_txq *nm_txq; /* netmap tx queues */ struct sge_nm_rxq *nm_rxq; /* netmap rx queues */ uint16_t iq_start; /* first cntxt_id */ uint16_t iq_base; /* first abs_id */ int eq_start; /* first cntxt_id */ int eq_base; /* first abs_id */ int iqmap_sz; int eqmap_sz; struct sge_iq **iqmap; /* iq->cntxt_id to iq mapping */ struct sge_eq **eqmap; /* eq->cntxt_id to eq mapping */ int8_t safe_zidx; struct rx_buf_info rx_buf_info[SW_ZONE_SIZES]; }; struct devnames { const char *nexus_name; const char *ifnet_name; const char *vi_ifnet_name; const char *pf03_drv_name; const char *vf_nexus_name; const char *vf_ifnet_name; }; struct clip_entry; #define CNT_CAL_INFO 3 struct clock_sync { uint64_t hw_cur; uint64_t hw_prev; sbintime_t sbt_cur; sbintime_t sbt_prev; seqc_t gen; }; struct adapter { SLIST_ENTRY(adapter) link; device_t dev; struct cdev *cdev; const struct devnames *names; /* PCIe register resources */ int regs_rid; struct resource *regs_res; int msix_rid; struct resource *msix_res; bus_space_handle_t bh; bus_space_tag_t bt; bus_size_t mmio_len; int udbs_rid; struct resource *udbs_res; volatile uint8_t *udbs_base; unsigned int pf; unsigned int mbox; unsigned int vpd_busy; unsigned int vpd_flag; /* Interrupt information */ int intr_type; int intr_count; struct irq { struct resource *res; int rid; void *tag; struct sge_rxq *rxq; struct sge_nm_rxq *nm_rxq; } __aligned(CACHE_LINE_SIZE) *irq; int sge_gts_reg; int sge_kdoorbell_reg; bus_dma_tag_t dmat; /* Parent DMA tag */ struct sge sge; int lro_timeout; int sc_do_rxcopy; int vxlan_port; u_int vxlan_refcount; int rawf_base; int nrawf; u_int vlan_id; struct taskqueue *tq[MAX_NPORTS]; /* General purpose taskqueues */ struct port_info *port[MAX_NPORTS]; uint8_t chan_map[MAX_NCHAN]; /* channel -> port */ CXGBE_LIST_HEAD(, clip_entry) *clip_table; TAILQ_HEAD(, 
clip_entry) clip_pending; /* these need hw update. */ u_long clip_mask; int clip_gen; struct timeout_task clip_task; void *tom_softc; /* (struct tom_data *) */ struct tom_tunables tt; struct t4_offload_policy *policy; struct rwlock policy_lock; void *iwarp_softc; /* (struct c4iw_dev *) */ struct iw_tunables iwt; void *iscsi_ulp_softc; /* (struct cxgbei_data *) */ struct l2t_data *l2t; /* L2 table */ struct smt_data *smt; /* Source MAC Table */ struct tid_info tids; vmem_t *key_map; struct tls_tunables tlst; uint8_t doorbells; int offload_map; /* port_id's with IFCAP_TOE enabled */ int bt_map; /* tx_chan's with BASE-T */ int active_ulds; /* ULDs activated on this adapter */ int flags; int debug_flags; int error_flags; /* Used by error handler and live reset. */ char ifp_lockname[16]; struct mtx ifp_lock; if_t ifp; /* tracer ifp */ struct ifmedia media; int traceq; /* iq used by all tracers, -1 if none */ int tracer_valid; /* bitmap of valid tracers */ int tracer_enabled; /* bitmap of enabled tracers */ char fw_version[16]; char tp_version[16]; char er_version[16]; char bs_version[16]; char cfg_file[32]; u_int cfcsum; struct adapter_params params; const struct chip_params *chip_params; struct t4_virt_res vres; uint16_t nbmcaps; uint16_t linkcaps; uint16_t switchcaps; uint16_t niccaps; uint16_t toecaps; uint16_t rdmacaps; uint16_t cryptocaps; uint16_t iscsicaps; uint16_t fcoecaps; struct sysctl_ctx_list ctx; struct sysctl_oid *ctrlq_oid; struct sysctl_oid *fwq_oid; struct mtx sc_lock; char lockname[16]; /* Starving free lists */ struct mtx sfl_lock; /* same cache-line as sc_lock? but that's ok */ TAILQ_HEAD(, sge_fl) sfl; struct callout sfl_callout; struct callout cal_callout; struct clock_sync cal_info[CNT_CAL_INFO]; int cal_current; int cal_count; uint32_t cal_gen; /* * Driver code that can run when the adapter is suspended must use this * lock or a synchronized_op and check for HW_OFF_LIMITS before * accessing hardware. * * XXX: could be changed to rwlock. 
wlock in suspend/resume and for * indirect register access, rlock everywhere else. */ struct mtx reg_lock; struct memwin memwin[NUM_MEMWIN]; /* memory windows */ struct mtx tc_lock; struct task tc_task; struct task fatal_error_task; struct task reset_task; const void *reset_thread; int num_resets; int incarnation; const char *last_op; const void *last_op_thr; int last_op_flags; int swintr; int sensor_resets; struct callout ktls_tick; }; #define ADAPTER_LOCK(sc) mtx_lock(&(sc)->sc_lock) #define ADAPTER_UNLOCK(sc) mtx_unlock(&(sc)->sc_lock) #define ADAPTER_LOCK_ASSERT_OWNED(sc) mtx_assert(&(sc)->sc_lock, MA_OWNED) #define ADAPTER_LOCK_ASSERT_NOTOWNED(sc) mtx_assert(&(sc)->sc_lock, MA_NOTOWNED) #define ASSERT_SYNCHRONIZED_OP(sc) \ KASSERT(IS_BUSY(sc) && \ (mtx_owned(&(sc)->sc_lock) || sc->last_op_thr == curthread), \ ("%s: operation not synchronized.", __func__)) #define PORT_LOCK(pi) mtx_lock(&(pi)->pi_lock) #define PORT_UNLOCK(pi) mtx_unlock(&(pi)->pi_lock) #define PORT_LOCK_ASSERT_OWNED(pi) mtx_assert(&(pi)->pi_lock, MA_OWNED) #define PORT_LOCK_ASSERT_NOTOWNED(pi) mtx_assert(&(pi)->pi_lock, MA_NOTOWNED) #define FL_LOCK(fl) mtx_lock(&(fl)->fl_lock) #define FL_TRYLOCK(fl) mtx_trylock(&(fl)->fl_lock) #define FL_UNLOCK(fl) mtx_unlock(&(fl)->fl_lock) #define FL_LOCK_ASSERT_OWNED(fl) mtx_assert(&(fl)->fl_lock, MA_OWNED) #define FL_LOCK_ASSERT_NOTOWNED(fl) mtx_assert(&(fl)->fl_lock, MA_NOTOWNED) #define RXQ_FL_LOCK(rxq) FL_LOCK(&(rxq)->fl) #define RXQ_FL_UNLOCK(rxq) FL_UNLOCK(&(rxq)->fl) #define RXQ_FL_LOCK_ASSERT_OWNED(rxq) FL_LOCK_ASSERT_OWNED(&(rxq)->fl) #define RXQ_FL_LOCK_ASSERT_NOTOWNED(rxq) FL_LOCK_ASSERT_NOTOWNED(&(rxq)->fl) #define EQ_LOCK(eq) mtx_lock(&(eq)->eq_lock) #define EQ_TRYLOCK(eq) mtx_trylock(&(eq)->eq_lock) #define EQ_UNLOCK(eq) mtx_unlock(&(eq)->eq_lock) #define EQ_LOCK_ASSERT_OWNED(eq) mtx_assert(&(eq)->eq_lock, MA_OWNED) #define EQ_LOCK_ASSERT_NOTOWNED(eq) mtx_assert(&(eq)->eq_lock, MA_NOTOWNED) #define TXQ_LOCK(txq) EQ_LOCK(&(txq)->eq) #define 
TXQ_TRYLOCK(txq) EQ_TRYLOCK(&(txq)->eq) #define TXQ_UNLOCK(txq) EQ_UNLOCK(&(txq)->eq) #define TXQ_LOCK_ASSERT_OWNED(txq) EQ_LOCK_ASSERT_OWNED(&(txq)->eq) #define TXQ_LOCK_ASSERT_NOTOWNED(txq) EQ_LOCK_ASSERT_NOTOWNED(&(txq)->eq) #define for_each_txq(vi, iter, q) \ for (q = &vi->adapter->sge.txq[vi->first_txq], iter = 0; \ iter < vi->ntxq; ++iter, ++q) #define for_each_rxq(vi, iter, q) \ for (q = &vi->adapter->sge.rxq[vi->first_rxq], iter = 0; \ iter < vi->nrxq; ++iter, ++q) #define for_each_ofld_txq(vi, iter, q) \ for (q = &vi->adapter->sge.ofld_txq[vi->first_ofld_txq], iter = 0; \ iter < vi->nofldtxq; ++iter, ++q) #define for_each_ofld_rxq(vi, iter, q) \ for (q = &vi->adapter->sge.ofld_rxq[vi->first_ofld_rxq], iter = 0; \ iter < vi->nofldrxq; ++iter, ++q) #define for_each_nm_txq(vi, iter, q) \ for (q = &vi->adapter->sge.nm_txq[vi->first_nm_txq], iter = 0; \ iter < vi->nnmtxq; ++iter, ++q) #define for_each_nm_rxq(vi, iter, q) \ for (q = &vi->adapter->sge.nm_rxq[vi->first_nm_rxq], iter = 0; \ iter < vi->nnmrxq; ++iter, ++q) #define for_each_vi(_pi, _iter, _vi) \ for ((_vi) = (_pi)->vi, (_iter) = 0; (_iter) < (_pi)->nvi; \ ++(_iter), ++(_vi)) #define IDXINCR(idx, incr, wrap) do { \ idx = wrap - idx > incr ? idx + incr : incr - (wrap - idx); \ } while (0) #define IDXDIFF(head, tail, wrap) \ ((head) >= (tail) ? (head) - (tail) : (wrap) - (tail) + (head)) /* One for errors, one for firmware events */ #define T4_EXTRA_INTR 2 /* One for firmware events */ #define T4VF_EXTRA_INTR 1 static inline int forwarding_intr_to_fwq(struct adapter *sc) { return (sc->intr_count == 1); } /* Works reliably inside a synch_op or with reg_lock held. */ static inline bool hw_off_limits(struct adapter *sc) { const int off_limits = atomic_load_int(&sc->error_flags) & HW_OFF_LIMITS; return (__predict_false(off_limits != 0)); } /* Works reliably inside a synch_op or with reg_lock held. 
*/ static inline bool hw_all_ok(struct adapter *sc) { const int not_ok = atomic_load_int(&sc->error_flags) & (ADAP_STOPPED | HW_OFF_LIMITS); return (__predict_true(not_ok == 0)); } static inline int mbuf_nsegs(struct mbuf *m) { M_ASSERTPKTHDR(m); KASSERT(m->m_pkthdr.inner_l5hlen > 0, ("%s: mbuf %p missing information on # of segments.", __func__, m)); return (m->m_pkthdr.inner_l5hlen); } static inline void set_mbuf_nsegs(struct mbuf *m, uint8_t nsegs) { M_ASSERTPKTHDR(m); m->m_pkthdr.inner_l5hlen = nsegs; } /* Internal mbuf flags stored in PH_loc.eight[4] (read by mbuf_cflags, written by set_mbuf_cflags). */ #define MC_NOMAP 0x01 #define MC_RAW_WR 0x02 #define MC_TLS 0x04 static inline int mbuf_cflags(struct mbuf *m) { M_ASSERTPKTHDR(m); return (m->m_pkthdr.PH_loc.eight[4]); } static inline void set_mbuf_cflags(struct mbuf *m, uint8_t flags) { M_ASSERTPKTHDR(m); m->m_pkthdr.PH_loc.eight[4] = flags; } static inline int mbuf_len16(struct mbuf *m) { int n; M_ASSERTPKTHDR(m); n = m->m_pkthdr.PH_loc.eight[0]; if (!(mbuf_cflags(m) & MC_TLS)) MPASS(n > 0 && n <= SGE_MAX_WR_LEN / 16); return (n); } static inline void set_mbuf_len16(struct mbuf *m, uint8_t len16) { M_ASSERTPKTHDR(m); if (!(mbuf_cflags(m) & MC_TLS)) MPASS(len16 > 0 && len16 <= SGE_MAX_WR_LEN / 16); m->m_pkthdr.PH_loc.eight[0] = len16; } static inline uint32_t t4_read_reg(struct adapter *sc, uint32_t reg) { if (hw_off_limits(sc)) MPASS(curthread == sc->reset_thread); return bus_space_read_4(sc->bt, sc->bh, reg); } static inline void t4_write_reg(struct adapter *sc, uint32_t reg, uint32_t val) { if (hw_off_limits(sc)) MPASS(curthread == sc->reset_thread); bus_space_write_4(sc->bt, sc->bh, reg, val); } static inline uint64_t t4_read_reg64(struct adapter *sc, uint32_t reg) { if (hw_off_limits(sc)) MPASS(curthread == sc->reset_thread); #ifdef __LP64__ return bus_space_read_8(sc->bt, sc->bh, reg); #else return (uint64_t)bus_space_read_4(sc->bt, sc->bh, reg) + ((uint64_t)bus_space_read_4(sc->bt, sc->bh, reg + 4) << 32); #endif } static inline void 
t4_write_reg64(struct adapter *sc, uint32_t reg, uint64_t val) { if (hw_off_limits(sc)) MPASS(curthread == sc->reset_thread); #ifdef __LP64__ bus_space_write_8(sc->bt, sc->bh, reg, val); #else bus_space_write_4(sc->bt, sc->bh, reg, val); bus_space_write_4(sc->bt, sc->bh, reg + 4, val>> 32); #endif } static inline void t4_os_pci_read_cfg1(struct adapter *sc, int reg, uint8_t *val) { if (hw_off_limits(sc)) MPASS(curthread == sc->reset_thread); *val = pci_read_config(sc->dev, reg, 1); } static inline void t4_os_pci_write_cfg1(struct adapter *sc, int reg, uint8_t val) { if (hw_off_limits(sc)) MPASS(curthread == sc->reset_thread); pci_write_config(sc->dev, reg, val, 1); } static inline void t4_os_pci_read_cfg2(struct adapter *sc, int reg, uint16_t *val) { if (hw_off_limits(sc)) MPASS(curthread == sc->reset_thread); *val = pci_read_config(sc->dev, reg, 2); } static inline void t4_os_pci_write_cfg2(struct adapter *sc, int reg, uint16_t val) { if (hw_off_limits(sc)) MPASS(curthread == sc->reset_thread); pci_write_config(sc->dev, reg, val, 2); } static inline void t4_os_pci_read_cfg4(struct adapter *sc, int reg, uint32_t *val) { if (hw_off_limits(sc)) MPASS(curthread == sc->reset_thread); *val = pci_read_config(sc->dev, reg, 4); } static inline void t4_os_pci_write_cfg4(struct adapter *sc, int reg, uint32_t val) { if (hw_off_limits(sc)) MPASS(curthread == sc->reset_thread); pci_write_config(sc->dev, reg, val, 4); } static inline struct port_info * adap2pinfo(struct adapter *sc, int idx) { return (sc->port[idx]); } static inline void t4_os_set_hw_addr(struct port_info *pi, uint8_t hw_addr[]) { bcopy(hw_addr, pi->vi[0].hw_addr, ETHER_ADDR_LEN); } static inline int tx_resume_threshold(struct sge_eq *eq) { /* not quite the same as qsize / 4, but this will do. 
*/ return (eq->sidx / 4); } static inline int t4_use_ldst(struct adapter *sc) { #ifdef notyet return (sc->flags & FW_OK || !sc->use_bd); #else return (0); #endif } static inline void CH_DUMP_MBOX(struct adapter *sc, int mbox, const int reg, const char *msg, const __be64 *const p, const bool err) { if (!(sc->debug_flags & DF_DUMP_MBOX) && !err) return; if (p != NULL) { log(err ? LOG_ERR : LOG_DEBUG, "%s: mbox %u %s %016llx %016llx %016llx %016llx " "%016llx %016llx %016llx %016llx\n", device_get_nameunit(sc->dev), mbox, msg, (long long)be64_to_cpu(p[0]), (long long)be64_to_cpu(p[1]), (long long)be64_to_cpu(p[2]), (long long)be64_to_cpu(p[3]), (long long)be64_to_cpu(p[4]), (long long)be64_to_cpu(p[5]), (long long)be64_to_cpu(p[6]), (long long)be64_to_cpu(p[7])); } else { log(err ? LOG_ERR : LOG_DEBUG, "%s: mbox %u %s %016llx %016llx %016llx %016llx " "%016llx %016llx %016llx %016llx\n", device_get_nameunit(sc->dev), mbox, msg, (long long)t4_read_reg64(sc, reg), (long long)t4_read_reg64(sc, reg + 8), (long long)t4_read_reg64(sc, reg + 16), (long long)t4_read_reg64(sc, reg + 24), (long long)t4_read_reg64(sc, reg + 32), (long long)t4_read_reg64(sc, reg + 40), (long long)t4_read_reg64(sc, reg + 48), (long long)t4_read_reg64(sc, reg + 56)); } } /* t4_main.c */ extern int t4_ntxq; extern int t4_nrxq; extern int t4_intr_types; extern int t4_tmr_idx; extern int t4_pktc_idx; extern unsigned int t4_qsize_rxq; extern unsigned int t4_qsize_txq; extern int t4_ddp_rcvbuf_len; extern unsigned int t4_ddp_rcvbuf_cache; extern device_method_t cxgbe_methods[]; int t4_os_find_pci_capability(struct adapter *, int); void t4_os_portmod_changed(struct port_info *); void t4_os_link_changed(struct port_info *); void t4_iterate(void (*)(struct adapter *, void *), void *); void t4_init_devnames(struct adapter *); void t4_add_adapter(struct adapter *); int t4_detach_common(device_t); int t4_map_bars_0_and_4(struct adapter *); int t4_map_bar_2(struct adapter *); int t4_adj_doorbells(struct 
adapter *); int t4_setup_intr_handlers(struct adapter *); void t4_sysctls(struct adapter *); int begin_synchronized_op(struct adapter *, struct vi_info *, int, char *); void end_synchronized_op(struct adapter *, int); void begin_vi_detach(struct adapter *, struct vi_info *); void end_vi_detach(struct adapter *, struct vi_info *); int update_mac_settings(if_t, int); int adapter_init(struct adapter *); int vi_init(struct vi_info *); void vi_sysctls(struct vi_info *); int rw_via_memwin(struct adapter *, int, uint32_t, uint32_t *, int, int); int alloc_atid(struct adapter *, void *); void *lookup_atid(struct adapter *, int); void free_atid(struct adapter *, int); void release_tid(struct adapter *, int, struct sge_wrq *); int cxgbe_media_change(if_t); void cxgbe_media_status(if_t, struct ifmediareq *); void t4_os_cim_err(struct adapter *); int suspend_adapter(struct adapter *); int resume_adapter(struct adapter *); int toe_capability(struct vi_info *, bool); #ifdef KERN_TLS /* t6_kern_tls.c */ int t6_tls_tag_alloc(if_t, union if_snd_tag_alloc_params *, struct m_snd_tag **); void t6_ktls_modload(void); void t6_ktls_modunload(void); int t6_ktls_try(if_t, struct socket *, struct ktls_session *); int t6_ktls_parse_pkt(struct mbuf *); int t6_ktls_write_wr(struct sge_txq *, void *, struct mbuf *, u_int); #endif /* t4_keyctx.c */ struct auth_hash; union authctx; #ifdef KERN_TLS struct ktls_session; struct tls_key_req; struct tls_keyctx; #endif void t4_aes_getdeckey(void *, const void *, unsigned int); void t4_copy_partial_hash(int, union authctx *, void *); void t4_init_gmac_hash(const char *, int, char *); void t4_init_hmac_digest(const struct auth_hash *, u_int, const char *, int, char *); #ifdef KERN_TLS u_int t4_tls_key_info_size(const struct ktls_session *); int t4_tls_proto_ver(const struct ktls_session *); int t4_tls_cipher_mode(const struct ktls_session *); int t4_tls_auth_mode(const struct ktls_session *); int t4_tls_hmac_ctrl(const struct ktls_session *); void 
t4_tls_key_ctx(const struct ktls_session *, int, struct tls_keyctx *); int t4_alloc_tls_keyid(struct adapter *); void t4_free_tls_keyid(struct adapter *, int); void t4_write_tlskey_wr(const struct ktls_session *, int, int, int, int, struct tls_key_req *); #endif #ifdef DEV_NETMAP /* t4_netmap.c */ struct sge_nm_rxq; void cxgbe_nm_attach(struct vi_info *); void cxgbe_nm_detach(struct vi_info *); void service_nm_rxq(struct sge_nm_rxq *); int alloc_nm_rxq(struct vi_info *, struct sge_nm_rxq *, int, int); int free_nm_rxq(struct vi_info *, struct sge_nm_rxq *); int alloc_nm_txq(struct vi_info *, struct sge_nm_txq *, int, int); int free_nm_txq(struct vi_info *, struct sge_nm_txq *); #endif /* t4_sge.c */ void t4_sge_modload(void); void t4_sge_modunload(void); uint64_t t4_sge_extfree_refs(void); void t4_tweak_chip_settings(struct adapter *); int t4_verify_chip_settings(struct adapter *); void t4_init_rx_buf_info(struct adapter *); int t4_create_dma_tag(struct adapter *); void t4_sge_sysctls(struct adapter *, struct sysctl_ctx_list *, struct sysctl_oid_list *); int t4_destroy_dma_tag(struct adapter *); int alloc_ring(struct adapter *, size_t, bus_dma_tag_t *, bus_dmamap_t *, bus_addr_t *, void **); int free_ring(struct adapter *, bus_dma_tag_t, bus_dmamap_t, bus_addr_t, void *); void free_fl_buffers(struct adapter *, struct sge_fl *); int t4_setup_adapter_queues(struct adapter *); int t4_teardown_adapter_queues(struct adapter *); int t4_setup_vi_queues(struct vi_info *); int t4_teardown_vi_queues(struct vi_info *); void t4_intr_all(void *); void t4_intr(void *); #ifdef DEV_NETMAP void t4_nm_intr(void *); void t4_vi_intr(void *); #endif void t4_intr_err(void *); void t4_intr_evt(void *); void t4_wrq_tx_locked(struct adapter *, struct sge_wrq *, struct wrqe *); void t4_update_fl_bufsize(if_t); struct mbuf *alloc_wr_mbuf(int, int); int parse_pkt(struct mbuf **, bool); void *start_wrq_wr(struct sge_wrq *, int, struct wrq_cookie *); void commit_wrq_wr(struct sge_wrq *, void *, 
struct wrq_cookie *); int t4_sge_set_conm_context(struct adapter *, int, int, int); void t4_register_an_handler(an_handler_t); void t4_register_fw_msg_handler(int, fw_msg_handler_t); void t4_register_cpl_handler(int, cpl_handler_t); void t4_register_shared_cpl_handler(int, cpl_handler_t, int); #ifdef RATELIMIT void send_etid_flush_wr(struct cxgbe_rate_tag *); #endif /* t4_tracer.c */ struct t4_tracer; void t4_tracer_modload(void); void t4_tracer_modunload(void); void t4_tracer_port_detach(struct adapter *); int t4_get_tracer(struct adapter *, struct t4_tracer *); int t4_set_tracer(struct adapter *, struct t4_tracer *); int t4_trace_pkt(struct sge_iq *, const struct rss_header *, struct mbuf *); int t5_trace_pkt(struct sge_iq *, const struct rss_header *, struct mbuf *); /* t4_sched.c */ int t4_set_sched_class(struct adapter *, struct t4_sched_params *); int t4_set_sched_queue(struct adapter *, struct t4_sched_queue *); int t4_init_tx_sched(struct adapter *); int t4_free_tx_sched(struct adapter *); void t4_update_tx_sched(struct adapter *); int t4_reserve_cl_rl_kbps(struct adapter *, int, u_int, int *); void t4_release_cl_rl(struct adapter *, int, int); int sysctl_tc(SYSCTL_HANDLER_ARGS); int sysctl_tc_params(SYSCTL_HANDLER_ARGS); #ifdef RATELIMIT void t4_init_etid_table(struct adapter *); void t4_free_etid_table(struct adapter *); struct cxgbe_rate_tag *lookup_etid(struct adapter *, int); int cxgbe_rate_tag_alloc(if_t, union if_snd_tag_alloc_params *, struct m_snd_tag **); void cxgbe_rate_tag_free_locked(struct cxgbe_rate_tag *); void cxgbe_ratelimit_query(if_t, struct if_ratelimit_query_results *); #endif /* t4_filter.c */ int get_filter_mode(struct adapter *, uint32_t *); int set_filter_mode(struct adapter *, uint32_t); int set_filter_mask(struct adapter *, uint32_t); int get_filter(struct adapter *, struct t4_filter *); int set_filter(struct adapter *, struct t4_filter *); int del_filter(struct adapter *, struct t4_filter *); int t4_filter_rpl(struct sge_iq *, 
const struct rss_header *, struct mbuf *); int t4_hashfilter_ao_rpl(struct sge_iq *, const struct rss_header *, struct mbuf *); int t4_hashfilter_tcb_rpl(struct sge_iq *, const struct rss_header *, struct mbuf *); int t4_del_hashfilter_rpl(struct sge_iq *, const struct rss_header *, struct mbuf *); void free_hftid_hash(struct tid_info *); static inline struct wrqe * alloc_wrqe(int wr_len, struct sge_wrq *wrq) { int len = offsetof(struct wrqe, wr) + wr_len; struct wrqe *wr; wr = malloc(len, M_CXGBE, M_NOWAIT); if (__predict_false(wr == NULL)) return (NULL); wr->wr_len = wr_len; wr->wrq = wrq; return (wr); } static inline void * wrtod(struct wrqe *wr) { return (&wr->wr[0]); } static inline void free_wrqe(struct wrqe *wr) { free(wr, M_CXGBE); } static inline void t4_wrq_tx(struct adapter *sc, struct wrqe *wr) { struct sge_wrq *wrq = wr->wrq; TXQ_LOCK(wrq); if (__predict_true(wrq->eq.flags & EQ_HW_ALLOCATED)) t4_wrq_tx_locked(sc, wrq, wr); else free(wr, M_CXGBE); TXQ_UNLOCK(wrq); } static inline int read_via_memwin(struct adapter *sc, int idx, uint32_t addr, uint32_t *val, int len) { return (rw_via_memwin(sc, idx, addr, val, len, 0)); } static inline int write_via_memwin(struct adapter *sc, int idx, uint32_t addr, const uint32_t *val, int len) { return (rw_via_memwin(sc, idx, addr, (void *)(uintptr_t)val, len, 1)); } /* Number of len16 -> number of descriptors */ static inline int tx_len16_to_desc(int len16) { return (howmany(len16, EQ_ESIZE / 16)); } #endif diff --git a/sys/dev/cxgbe/t4_sge.c b/sys/dev/cxgbe/t4_sge.c index cc927f27d318..7591db6cd833 100644 --- a/sys/dev/cxgbe/t4_sge.c +++ b/sys/dev/cxgbe/t4_sge.c @@ -1,6981 +1,6989 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2011 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_inet.h" #include "opt_inet6.h" #include "opt_kern_tls.h" #include "opt_ratelimit.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEV_NETMAP #include #include #include #include #include #endif #include "common/common.h" #include "common/t4_regs.h" #include "common/t4_regs_values.h" #include "common/t4_msg.h" #include "t4_l2t.h" #include "t4_mp_ring.h" #define RX_COPY_THRESHOLD MINCLSIZE /* * Ethernet frames are DMA'd at this byte offset into the freelist buffer. * 0-7 are valid values. 
*/ static int fl_pktshift = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, fl_pktshift, CTLFLAG_RDTUN, &fl_pktshift, 0, "payload DMA offset in rx buffer (bytes)"); /* * Pad ethernet payload up to this boundary. * -1: driver should figure out a good value. * 0: disable padding. * Any power of 2 from 32 to 4096 (both inclusive) is also a valid value. */ int fl_pad = -1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, fl_pad, CTLFLAG_RDTUN, &fl_pad, 0, "payload pad boundary (bytes)"); /* * Status page length. * -1: driver should figure out a good value. * 64 or 128 are the only other valid values. */ static int spg_len = -1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, spg_len, CTLFLAG_RDTUN, &spg_len, 0, "status page size (bytes)"); /* * Congestion drops. * -1: no congestion feedback (not recommended). * 0: backpressure the channel instead of dropping packets right away. * 1: no backpressure, drop packets for the congested queue immediately. * 2: both backpressure and drop. */ static int cong_drop = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, cong_drop, CTLFLAG_RDTUN, &cong_drop, 0, "Congestion control for NIC RX queues (0 = backpressure, 1 = drop, 2 = both"); #ifdef TCP_OFFLOAD static int ofld_cong_drop = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, ofld_cong_drop, CTLFLAG_RDTUN, &ofld_cong_drop, 0, "Congestion control for TOE RX queues (0 = backpressure, 1 = drop, 2 = both"); #endif /* * Deliver multiple frames in the same free list buffer if they fit. * -1: let the driver decide whether to enable buffer packing or not. * 0: disable buffer packing. * 1: enable buffer packing. */ static int buffer_packing = -1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, buffer_packing, CTLFLAG_RDTUN, &buffer_packing, 0, "Enable buffer packing"); /* * Start next frame in a packed buffer at this boundary. * -1: driver should figure out a good value. * T4: driver will ignore this and use the same value as fl_pad above. * T5: 16, or a power of 2 from 64 to 4096 (both inclusive) is a valid value. 
*/ static int fl_pack = -1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, fl_pack, CTLFLAG_RDTUN, &fl_pack, 0, "payload pack boundary (bytes)"); /* * Largest rx cluster size that the driver is allowed to allocate. */ static int largest_rx_cluster = MJUM16BYTES; SYSCTL_INT(_hw_cxgbe, OID_AUTO, largest_rx_cluster, CTLFLAG_RDTUN, &largest_rx_cluster, 0, "Largest rx cluster (bytes)"); /* * Size of cluster allocation that's most likely to succeed. The driver will * fall back to this size if it fails to allocate clusters larger than this. */ static int safest_rx_cluster = PAGE_SIZE; SYSCTL_INT(_hw_cxgbe, OID_AUTO, safest_rx_cluster, CTLFLAG_RDTUN, &safest_rx_cluster, 0, "Safe rx cluster (bytes)"); #ifdef RATELIMIT /* * Knob to control TCP timestamp rewriting, and the granularity of the tick used * for rewriting. -1 and 0-3 are all valid values. * -1: hardware should leave the TCP timestamps alone. * 0: 1ms * 1: 100us * 2: 10us * 3: 1us */ static int tsclk = -1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, tsclk, CTLFLAG_RDTUN, &tsclk, 0, "Control TCP timestamp rewriting when using pacing"); static int eo_max_backlog = 1024 * 1024; SYSCTL_INT(_hw_cxgbe, OID_AUTO, eo_max_backlog, CTLFLAG_RDTUN, &eo_max_backlog, 0, "Maximum backlog of ratelimited data per flow"); #endif /* * The interrupt holdoff timers are multiplied by this value on T6+. * 1 and 3-17 (both inclusive) are legal values. */ static int tscale = 1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, tscale, CTLFLAG_RDTUN, &tscale, 0, "Interrupt holdoff timer scale on T6+"); /* * Number of LRO entries in the lro_ctrl structure per rx queue. */ static int lro_entries = TCP_LRO_ENTRIES; SYSCTL_INT(_hw_cxgbe, OID_AUTO, lro_entries, CTLFLAG_RDTUN, &lro_entries, 0, "Number of LRO entries per RX queue"); /* * This enables presorting of frames before they're fed into tcp_lro_rx. 
*/ static int lro_mbufs = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, lro_mbufs, CTLFLAG_RDTUN, &lro_mbufs, 0, "Enable presorting of LRO frames"); static counter_u64_t pullups; SYSCTL_COUNTER_U64(_hw_cxgbe, OID_AUTO, pullups, CTLFLAG_RD, &pullups, "Number of mbuf pullups performed"); static counter_u64_t defrags; SYSCTL_COUNTER_U64(_hw_cxgbe, OID_AUTO, defrags, CTLFLAG_RD, &defrags, "Number of mbuf defrags performed"); static int t4_tx_coalesce = 1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, tx_coalesce, CTLFLAG_RWTUN, &t4_tx_coalesce, 0, "tx coalescing allowed"); /* * The driver will make aggressive attempts at tx coalescing if it sees these * many packets eligible for coalescing in quick succession, with no more than * the specified gap in between the eth_tx calls that delivered the packets. */ static int t4_tx_coalesce_pkts = 32; SYSCTL_INT(_hw_cxgbe, OID_AUTO, tx_coalesce_pkts, CTLFLAG_RWTUN, &t4_tx_coalesce_pkts, 0, "# of consecutive packets (1 - 255) that will trigger tx coalescing"); static int t4_tx_coalesce_gap = 5; SYSCTL_INT(_hw_cxgbe, OID_AUTO, tx_coalesce_gap, CTLFLAG_RWTUN, &t4_tx_coalesce_gap, 0, "tx gap (in microseconds)"); static int service_iq(struct sge_iq *, int); static int service_iq_fl(struct sge_iq *, int); static struct mbuf *get_fl_payload(struct adapter *, struct sge_fl *, uint32_t); static int eth_rx(struct adapter *, struct sge_rxq *, const struct iq_desc *, u_int); static inline void init_iq(struct sge_iq *, struct adapter *, int, int, int, int, int, int); static inline void init_fl(struct adapter *, struct sge_fl *, int, int, char *); static inline void init_eq(struct adapter *, struct sge_eq *, int, int, uint8_t, struct sge_iq *, char *); static int alloc_iq_fl(struct vi_info *, struct sge_iq *, struct sge_fl *, struct sysctl_ctx_list *, struct sysctl_oid *); static void free_iq_fl(struct adapter *, struct sge_iq *, struct sge_fl *); static void add_iq_sysctls(struct sysctl_ctx_list *, struct sysctl_oid *, struct sge_iq *); static void 
add_fl_sysctls(struct adapter *, struct sysctl_ctx_list *, struct sysctl_oid *, struct sge_fl *); static int alloc_iq_fl_hwq(struct vi_info *, struct sge_iq *, struct sge_fl *); static int free_iq_fl_hwq(struct adapter *, struct sge_iq *, struct sge_fl *); static int alloc_fwq(struct adapter *); static void free_fwq(struct adapter *); static int alloc_ctrlq(struct adapter *, int); static void free_ctrlq(struct adapter *, int); static int alloc_rxq(struct vi_info *, struct sge_rxq *, int, int, int); static void free_rxq(struct vi_info *, struct sge_rxq *); static void add_rxq_sysctls(struct sysctl_ctx_list *, struct sysctl_oid *, struct sge_rxq *); #ifdef TCP_OFFLOAD static int alloc_ofld_rxq(struct vi_info *, struct sge_ofld_rxq *, int, int, int); static void free_ofld_rxq(struct vi_info *, struct sge_ofld_rxq *); static void add_ofld_rxq_sysctls(struct sysctl_ctx_list *, struct sysctl_oid *, struct sge_ofld_rxq *); #endif static int ctrl_eq_alloc(struct adapter *, struct sge_eq *); static int eth_eq_alloc(struct adapter *, struct vi_info *, struct sge_eq *); #if defined(TCP_OFFLOAD) || defined(RATELIMIT) static int ofld_eq_alloc(struct adapter *, struct vi_info *, struct sge_eq *); #endif static int alloc_eq(struct adapter *, struct sge_eq *, struct sysctl_ctx_list *, struct sysctl_oid *); static void free_eq(struct adapter *, struct sge_eq *); static void add_eq_sysctls(struct adapter *, struct sysctl_ctx_list *, struct sysctl_oid *, struct sge_eq *); static int alloc_eq_hwq(struct adapter *, struct vi_info *, struct sge_eq *); static int free_eq_hwq(struct adapter *, struct vi_info *, struct sge_eq *); static int alloc_wrq(struct adapter *, struct vi_info *, struct sge_wrq *, struct sysctl_ctx_list *, struct sysctl_oid *); static void free_wrq(struct adapter *, struct sge_wrq *); static void add_wrq_sysctls(struct sysctl_ctx_list *, struct sysctl_oid *, struct sge_wrq *); static int alloc_txq(struct vi_info *, struct sge_txq *, int); static void free_txq(struct 
vi_info *, struct sge_txq *); static void add_txq_sysctls(struct vi_info *, struct sysctl_ctx_list *, struct sysctl_oid *, struct sge_txq *); #if defined(TCP_OFFLOAD) || defined(RATELIMIT) static int alloc_ofld_txq(struct vi_info *, struct sge_ofld_txq *, int); static void free_ofld_txq(struct vi_info *, struct sge_ofld_txq *); static void add_ofld_txq_sysctls(struct sysctl_ctx_list *, struct sysctl_oid *, struct sge_ofld_txq *); #endif static void oneseg_dma_callback(void *, bus_dma_segment_t *, int, int); static inline void ring_fl_db(struct adapter *, struct sge_fl *); static int refill_fl(struct adapter *, struct sge_fl *, int); static void refill_sfl(void *); static int find_refill_source(struct adapter *, int, bool); static void add_fl_to_sfl(struct adapter *, struct sge_fl *); static inline void get_pkt_gl(struct mbuf *, struct sglist *); static inline u_int txpkt_len16(u_int, const u_int); static inline u_int txpkt_vm_len16(u_int, const u_int); static inline void calculate_mbuf_len16(struct mbuf *, bool); static inline u_int txpkts0_len16(u_int); static inline u_int txpkts1_len16(void); static u_int write_raw_wr(struct sge_txq *, void *, struct mbuf *, u_int); static u_int write_txpkt_wr(struct adapter *, struct sge_txq *, struct mbuf *, u_int); static u_int write_txpkt_vm_wr(struct adapter *, struct sge_txq *, struct mbuf *); static int add_to_txpkts_vf(struct adapter *, struct sge_txq *, struct mbuf *, int, bool *); static int add_to_txpkts_pf(struct adapter *, struct sge_txq *, struct mbuf *, int, bool *); static u_int write_txpkts_wr(struct adapter *, struct sge_txq *); static u_int write_txpkts_vm_wr(struct adapter *, struct sge_txq *); static void write_gl_to_txd(struct sge_txq *, struct mbuf *, caddr_t *, int); static inline void copy_to_txd(struct sge_eq *, caddr_t, caddr_t *, int); static inline void ring_eq_db(struct adapter *, struct sge_eq *, u_int); static inline uint16_t read_hw_cidx(struct sge_eq *); static inline u_int 
reclaimable_tx_desc(struct sge_eq *); static inline u_int total_available_tx_desc(struct sge_eq *); static u_int reclaim_tx_descs(struct sge_txq *, u_int); static void tx_reclaim(void *, int); static __be64 get_flit(struct sglist_seg *, int, int); static int handle_sge_egr_update(struct sge_iq *, const struct rss_header *, struct mbuf *); static int handle_fw_msg(struct sge_iq *, const struct rss_header *, struct mbuf *); static int t4_handle_wrerr_rpl(struct adapter *, const __be64 *); static void wrq_tx_drain(void *, int); static void drain_wrq_wr_list(struct adapter *, struct sge_wrq *); static int sysctl_bufsizes(SYSCTL_HANDLER_ARGS); #ifdef RATELIMIT static int ethofld_fw4_ack(struct sge_iq *, const struct rss_header *, struct mbuf *); #if defined(INET) || defined(INET6) static inline u_int txpkt_eo_len16(u_int, u_int, u_int); static int ethofld_transmit(if_t, struct mbuf *); #endif #endif static counter_u64_t extfree_refs; static counter_u64_t extfree_rels; an_handler_t t4_an_handler; fw_msg_handler_t t4_fw_msg_handler[NUM_FW6_TYPES]; cpl_handler_t t4_cpl_handler[NUM_CPL_CMDS]; cpl_handler_t set_tcb_rpl_handlers[NUM_CPL_COOKIES]; cpl_handler_t l2t_write_rpl_handlers[NUM_CPL_COOKIES]; cpl_handler_t act_open_rpl_handlers[NUM_CPL_COOKIES]; cpl_handler_t abort_rpl_rss_handlers[NUM_CPL_COOKIES]; cpl_handler_t fw4_ack_handlers[NUM_CPL_COOKIES]; void t4_register_an_handler(an_handler_t h) { uintptr_t *loc; MPASS(h == NULL || t4_an_handler == NULL); loc = (uintptr_t *)&t4_an_handler; atomic_store_rel_ptr(loc, (uintptr_t)h); } void t4_register_fw_msg_handler(int type, fw_msg_handler_t h) { uintptr_t *loc; MPASS(type < nitems(t4_fw_msg_handler)); MPASS(h == NULL || t4_fw_msg_handler[type] == NULL); /* * These are dispatched by the handler for FW{4|6}_CPL_MSG using the CPL * handler dispatch table. Reject any attempt to install a handler for * this subtype. 
*/ MPASS(type != FW_TYPE_RSSCPL); MPASS(type != FW6_TYPE_RSSCPL); loc = (uintptr_t *)&t4_fw_msg_handler[type]; atomic_store_rel_ptr(loc, (uintptr_t)h); } void t4_register_cpl_handler(int opcode, cpl_handler_t h) { uintptr_t *loc; MPASS(opcode < nitems(t4_cpl_handler)); MPASS(h == NULL || t4_cpl_handler[opcode] == NULL); loc = (uintptr_t *)&t4_cpl_handler[opcode]; atomic_store_rel_ptr(loc, (uintptr_t)h); } static int set_tcb_rpl_handler(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { const struct cpl_set_tcb_rpl *cpl = (const void *)(rss + 1); u_int tid; int cookie; MPASS(m == NULL); tid = GET_TID(cpl); if (is_hpftid(iq->adapter, tid) || is_ftid(iq->adapter, tid)) { /* * The return code for filter-write is put in the CPL cookie so * we have to rely on the hardware tid (is_ftid) to determine * that this is a response to a filter. */ cookie = CPL_COOKIE_FILTER; } else { cookie = G_COOKIE(cpl->cookie); } MPASS(cookie > CPL_COOKIE_RESERVED); MPASS(cookie < nitems(set_tcb_rpl_handlers)); return (set_tcb_rpl_handlers[cookie](iq, rss, m)); } static int l2t_write_rpl_handler(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { const struct cpl_l2t_write_rpl *rpl = (const void *)(rss + 1); unsigned int cookie; MPASS(m == NULL); cookie = GET_TID(rpl) & F_SYNC_WR ? 
CPL_COOKIE_TOM : CPL_COOKIE_FILTER; return (l2t_write_rpl_handlers[cookie](iq, rss, m)); } static int act_open_rpl_handler(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { const struct cpl_act_open_rpl *cpl = (const void *)(rss + 1); u_int cookie = G_TID_COOKIE(G_AOPEN_ATID(be32toh(cpl->atid_status))); MPASS(m == NULL); MPASS(cookie != CPL_COOKIE_RESERVED); return (act_open_rpl_handlers[cookie](iq, rss, m)); } static int abort_rpl_rss_handler(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; u_int cookie; MPASS(m == NULL); if (is_hashfilter(sc)) cookie = CPL_COOKIE_HASHFILTER; else cookie = CPL_COOKIE_TOM; return (abort_rpl_rss_handlers[cookie](iq, rss, m)); } static int fw4_ack_handler(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_fw4_ack *cpl = (const void *)(rss + 1); unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl))); u_int cookie; MPASS(m == NULL); if (is_etid(sc, tid)) cookie = CPL_COOKIE_ETHOFLD; else cookie = CPL_COOKIE_TOM; return (fw4_ack_handlers[cookie](iq, rss, m)); } static void t4_init_shared_cpl_handlers(void) { t4_register_cpl_handler(CPL_SET_TCB_RPL, set_tcb_rpl_handler); t4_register_cpl_handler(CPL_L2T_WRITE_RPL, l2t_write_rpl_handler); t4_register_cpl_handler(CPL_ACT_OPEN_RPL, act_open_rpl_handler); t4_register_cpl_handler(CPL_ABORT_RPL_RSS, abort_rpl_rss_handler); t4_register_cpl_handler(CPL_FW4_ACK, fw4_ack_handler); } void t4_register_shared_cpl_handler(int opcode, cpl_handler_t h, int cookie) { uintptr_t *loc; MPASS(opcode < nitems(t4_cpl_handler)); MPASS(cookie > CPL_COOKIE_RESERVED); MPASS(cookie < NUM_CPL_COOKIES); MPASS(t4_cpl_handler[opcode] != NULL); switch (opcode) { case CPL_SET_TCB_RPL: loc = (uintptr_t *)&set_tcb_rpl_handlers[cookie]; break; case CPL_L2T_WRITE_RPL: loc = (uintptr_t *)&l2t_write_rpl_handlers[cookie]; break; case CPL_ACT_OPEN_RPL: loc = (uintptr_t 
*)&act_open_rpl_handlers[cookie];
		break;
	case CPL_ABORT_RPL_RSS:
		loc = (uintptr_t *)&abort_rpl_rss_handlers[cookie];
		break;
	case CPL_FW4_ACK:
		loc = (uintptr_t *)&fw4_ack_handlers[cookie];
		break;
	default:
		MPASS(0);
		return;
	}
	MPASS(h == NULL || *loc == (uintptr_t)NULL);
	atomic_store_rel_ptr(loc, (uintptr_t)h);
}

/*
 * Called on MOD_LOAD.  Validates and calculates the SGE tunables.
 *
 * Every tunable that is out of range is reset to a default and, except for
 * the "-1" auto setting of spg_len, a message is printed.  After that the
 * global counters are allocated and zeroed and the CPL and firmware message
 * handlers used by this file are registered.
 */
void
t4_sge_modload(void)
{

	if (fl_pktshift < 0 || fl_pktshift > 7) {
		printf("Invalid hw.cxgbe.fl_pktshift value (%d),"
		    " using 0 instead.\n", fl_pktshift);
		fl_pktshift = 0;
	}

	/* Only 64 and 128 are accepted; -1 silently selects the default. */
	if (spg_len != 64 && spg_len != 128) {
		int len;

#if defined(__i386__) || defined(__amd64__)
		len = cpu_clflush_line_size > 64 ? 128 : 64;
#else
		len = 64;
#endif
		if (spg_len != -1) {
			printf("Invalid hw.cxgbe.spg_len value (%d),"
			    " using %d instead.\n", spg_len, len);
		}
		spg_len = len;
	}

	if (cong_drop < -1 || cong_drop > 2) {
		printf("Invalid hw.cxgbe.cong_drop value (%d),"
		    " using 0 instead.\n", cong_drop);
		cong_drop = 0;
	}
#ifdef TCP_OFFLOAD
	if (ofld_cong_drop < -1 || ofld_cong_drop > 2) {
		printf("Invalid hw.cxgbe.ofld_cong_drop value (%d),"
		    " using 0 instead.\n", ofld_cong_drop);
		ofld_cong_drop = 0;
	}
#endif

	/* Valid values are 1 and 3 through 17. */
	if (tscale != 1 && (tscale < 3 || tscale > 17)) {
		printf("Invalid hw.cxgbe.tscale value (%d),"
		    " using 1 instead.\n", tscale);
		tscale = 1;
	}

	/* Both cluster tunables must name one of the mbuf cluster sizes. */
	if (largest_rx_cluster != MCLBYTES &&
+#if MJUMPAGESIZE != MCLBYTES
	    largest_rx_cluster != MJUMPAGESIZE &&
+#endif
	    largest_rx_cluster != MJUM9BYTES &&
	    largest_rx_cluster != MJUM16BYTES) {
		printf("Invalid hw.cxgbe.largest_rx_cluster value (%d),"
		    " using %d instead.\n", largest_rx_cluster, MJUM16BYTES);
		largest_rx_cluster = MJUM16BYTES;
	}

	if (safest_rx_cluster != MCLBYTES &&
+#if MJUMPAGESIZE != MCLBYTES
	    safest_rx_cluster != MJUMPAGESIZE &&
+#endif
	    safest_rx_cluster != MJUM9BYTES &&
	    safest_rx_cluster != MJUM16BYTES) {
		printf("Invalid hw.cxgbe.safest_rx_cluster value (%d),"
		    " using %d instead.\n", safest_rx_cluster, MJUMPAGESIZE);
		safest_rx_cluster = MJUMPAGESIZE;
	}

	extfree_refs = counter_u64_alloc(M_WAITOK);
	extfree_rels = counter_u64_alloc(M_WAITOK);
	pullups = counter_u64_alloc(M_WAITOK);
	defrags = counter_u64_alloc(M_WAITOK);
	counter_u64_zero(extfree_refs);
	counter_u64_zero(extfree_rels);
	counter_u64_zero(pullups);
	counter_u64_zero(defrags);

	t4_init_shared_cpl_handlers();
	t4_register_cpl_handler(CPL_FW4_MSG, handle_fw_msg);
	t4_register_cpl_handler(CPL_FW6_MSG, handle_fw_msg);
	t4_register_cpl_handler(CPL_SGE_EGR_UPDATE, handle_sge_egr_update);
#ifdef RATELIMIT
	t4_register_shared_cpl_handler(CPL_FW4_ACK, ethofld_fw4_ack,
	    CPL_COOKIE_ETHOFLD);
#endif
	t4_register_fw_msg_handler(FW6_TYPE_CMD_RPL, t4_handle_fw_rpl);
	t4_register_fw_msg_handler(FW6_TYPE_WRERR_RPL, t4_handle_wrerr_rpl);
}

/*
 * Frees the four counters allocated by t4_sge_modload().
 */
void
t4_sge_modunload(void)
{

	counter_u64_free(extfree_refs);
	counter_u64_free(extfree_rels);
	counter_u64_free(pullups);
	counter_u64_free(defrags);
}

/*
 * Returns the difference between the extfree_refs and extfree_rels counters.
 * The release counter is fetched before the reference counter.
 */
uint64_t
t4_sge_extfree_refs(void)
{
	uint64_t refs, rels;

	rels = counter_u64_fetch(extfree_rels);
	refs = counter_u64_fetch(extfree_refs);

	return (refs - rels);
}

/* max 4096 */
#define MAX_PACK_BOUNDARY 512

/*
 * Programs the ingress pad boundary (SGE_CONTROL) and, on chips other than
 * T4, the ingress pack boundary (SGE_CONTROL2) from the hw.cxgbe.fl_pad and
 * hw.cxgbe.fl_pack tunables.  An invalid tunable is replaced by a value
 * computed here; the tunables themselves are not modified.
 */
static inline void
setup_pad_and_pack_boundaries(struct adapter *sc)
{
	uint32_t v, m;
	int pad, pack, pad_shift;

	pad_shift = chip_id(sc) > CHELSIO_T5 ? X_T6_INGPADBOUNDARY_SHIFT :
	    X_INGPADBOUNDARY_SHIFT;
	pad = fl_pad;
	if (fl_pad < (1 << pad_shift) ||
	    fl_pad > (1 << (pad_shift + M_INGPADBOUNDARY)) ||
	    !powerof2(fl_pad)) {
		/*
		 * If there is any chance that we might use buffer packing and
		 * the chip is a T4, then pick 64 as the pad/pack boundary.  Set
		 * it to the minimum allowed in all other cases.
		 */
		pad = is_t4(sc) && buffer_packing ? 64 : 1 << pad_shift;

		/*
		 * For fl_pad = 0 we'll still write a reasonable value to the
		 * register but all the freelists will opt out of padding.
		 * We'll complain here only if the user tried to set it to a
		 * value greater than 0 that was invalid.
		 */
		if (fl_pad > 0) {
			device_printf(sc->dev, "Invalid hw.cxgbe.fl_pad value"
			    " (%d), using %d instead.\n", fl_pad, pad);
		}
	}
	m = V_INGPADBOUNDARY(M_INGPADBOUNDARY);
	v = V_INGPADBOUNDARY(ilog2(pad) - pad_shift);
	t4_set_reg_field(sc, A_SGE_CONTROL, m, v);

	/* On T4 nothing more is programmed; fl_pack is only checked. */
	if (is_t4(sc)) {
		if (fl_pack != -1 && fl_pack != pad) {
			/* Complain but carry on. */
			device_printf(sc->dev,
			    "hw.cxgbe.fl_pack (%d) ignored,"
			    " using %d instead.\n", fl_pack, pad);
		}
		return;
	}

	pack = fl_pack;
	if (fl_pack < 16 || fl_pack == 32 || fl_pack > 4096 ||
	    !powerof2(fl_pack)) {
		/* Derive a default from the PCIe MPS and clamp it. */
		if (sc->params.pci.mps > MAX_PACK_BOUNDARY)
			pack = MAX_PACK_BOUNDARY;
		else
			pack = max(sc->params.pci.mps, CACHE_LINE_SIZE);
		MPASS(powerof2(pack));
		if (pack < 16)
			pack = 16;
		if (pack == 32)
			pack = 64;
		if (pack > 4096)
			pack = 4096;

		/* fl_pack == -1 selects the default without a message. */
		if (fl_pack != -1) {
			device_printf(sc->dev, "Invalid hw.cxgbe.fl_pack value"
			    " (%d), using %d instead.\n", fl_pack, pack);
		}
	}
	m = V_INGPACKBOUNDARY(M_INGPACKBOUNDARY);
	/* 16 is encoded as 0; every other value as log2(pack) - 5. */
	if (pack == 16)
		v = V_INGPACKBOUNDARY(0);
	else
		v = V_INGPACKBOUNDARY(ilog2(pack) - 5);

	MPASS(!is_t4(sc));	/* T4 doesn't have SGE_CONTROL2 */
	t4_set_reg_field(sc, A_SGE_CONTROL2, m, v);
}

/*
 * adap->params.vpd.cclk must be set up before this is called.
*/
void
t4_tweak_chip_settings(struct adapter *sc)
{
	int i, reg;
	uint32_t v, m;
	int intr_timer[SGE_NTIMERS] = {1, 5, 10, 50, 100, 200};
	int timer_max = M_TIMERVALUE0 * 1000 / sc->params.vpd.cclk;
	int intr_pktcount[SGE_NCOUNTERS] = {1, 8, 16, 32}; /* 63 max */
	uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE);
	static int sw_buf_sizes[] = {
		MCLBYTES,
+#if MJUMPAGESIZE != MCLBYTES
		MJUMPAGESIZE,
+#endif
		MJUM9BYTES,
		MJUM16BYTES
	};

	KASSERT(sc->flags & MASTER_PF,
	    ("%s: trying to change chip settings when not master.", __func__));

	m = V_PKTSHIFT(M_PKTSHIFT) | F_RXPKTCPLMODE | F_EGRSTATUSPAGESIZE;
	v = V_PKTSHIFT(fl_pktshift) | F_RXPKTCPLMODE |
	    V_EGRSTATUSPAGESIZE(spg_len == 128);
	t4_set_reg_field(sc, A_SGE_CONTROL, m, v);

	setup_pad_and_pack_boundaries(sc);

	/* The same host page size value is written for all eight PFs. */
	v = V_HOSTPAGESIZEPF0(PAGE_SHIFT - 10) |
	    V_HOSTPAGESIZEPF1(PAGE_SHIFT - 10) |
	    V_HOSTPAGESIZEPF2(PAGE_SHIFT - 10) |
	    V_HOSTPAGESIZEPF3(PAGE_SHIFT - 10) |
	    V_HOSTPAGESIZEPF4(PAGE_SHIFT - 10) |
	    V_HOSTPAGESIZEPF5(PAGE_SHIFT - 10) |
	    V_HOSTPAGESIZEPF6(PAGE_SHIFT - 10) |
	    V_HOSTPAGESIZEPF7(PAGE_SHIFT - 10);
	t4_write_reg(sc, A_SGE_HOST_PAGE_SIZE, v);

	t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE0, 4096);
	t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE1, 65536);

	/*
	 * Starting at FL_BUFFER_SIZE2, each software cluster size takes two
	 * consecutive registers: the full size, then the size less the
	 * cluster metadata.
	 */
	reg = A_SGE_FL_BUFFER_SIZE2;
	for (i = 0; i < nitems(sw_buf_sizes); i++) {
		MPASS(reg <= A_SGE_FL_BUFFER_SIZE15);
		t4_write_reg(sc, reg, sw_buf_sizes[i]);
		reg += 4;
		MPASS(reg <= A_SGE_FL_BUFFER_SIZE15);
		t4_write_reg(sc, reg, sw_buf_sizes[i] - CL_METADATA_SIZE);
		reg += 4;
	}

	v = V_THRESHOLD_0(intr_pktcount[0]) | V_THRESHOLD_1(intr_pktcount[1]) |
	    V_THRESHOLD_2(intr_pktcount[2]) | V_THRESHOLD_3(intr_pktcount[3]);
	t4_write_reg(sc, A_SGE_INGRESS_RX_THRESHOLD, v);

	KASSERT(intr_timer[0] <= timer_max,
	    ("%s: not a single usable timer (%d, %d)", __func__, intr_timer[0],
	    timer_max));
	/*
	 * Pull every timer that exceeds timer_max back into range.  The last
	 * timer is set to timer_max; any other is repeatedly averaged with
	 * its predecessor until it fits.
	 */
	for (i = 1; i < nitems(intr_timer); i++) {
		KASSERT(intr_timer[i] >= intr_timer[i - 1],
		    ("%s: timers not listed in increasing order (%d)",
		    __func__, i));

		while (intr_timer[i] > timer_max) {
			if (i == nitems(intr_timer) - 1) {
				intr_timer[i] = timer_max;
				break;
			}
			intr_timer[i] += intr_timer[i - 1];
			intr_timer[i] /= 2;
		}
	}

	v = V_TIMERVALUE0(us_to_core_ticks(sc, intr_timer[0])) |
	    V_TIMERVALUE1(us_to_core_ticks(sc, intr_timer[1]));
	t4_write_reg(sc, A_SGE_TIMER_VALUE_0_AND_1, v);
	v = V_TIMERVALUE2(us_to_core_ticks(sc, intr_timer[2])) |
	    V_TIMERVALUE3(us_to_core_ticks(sc, intr_timer[3]));
	t4_write_reg(sc, A_SGE_TIMER_VALUE_2_AND_3, v);
	v = V_TIMERVALUE4(us_to_core_ticks(sc, intr_timer[4])) |
	    V_TIMERVALUE5(us_to_core_ticks(sc, intr_timer[5]));
	t4_write_reg(sc, A_SGE_TIMER_VALUE_4_AND_5, v);

	if (chip_id(sc) >= CHELSIO_T6) {
		m = V_TSCALE(M_TSCALE);
		/* tscale 1 is written as 0, anything else as tscale - 2. */
		if (tscale == 1)
			v = 0;
		else
			v = V_TSCALE(tscale - 2);
		t4_set_reg_field(sc, A_SGE_ITP_CONTROL, m, v);

		if (sc->debug_flags & DF_DISABLE_TCB_CACHE) {
			/* Read-modify-write of TP_CMM_CONFIG via TP PIO. */
			m = V_RDTHRESHOLD(M_RDTHRESHOLD) | F_WRTHRTHRESHEN |
			    V_WRTHRTHRESH(M_WRTHRTHRESH);
			t4_tp_pio_read(sc, &v, 1, A_TP_CMM_CONFIG, 1);
			v &= ~m;
			v |= V_RDTHRESHOLD(1) | F_WRTHRTHRESHEN |
			    V_WRTHRTHRESH(16);
			t4_tp_pio_write(sc, &v, 1, A_TP_CMM_CONFIG, 1);
		}
	}

	/* 4K, 16K, 64K, 256K DDP "page sizes" for TDDP */
	v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6);
	t4_write_reg(sc, A_ULP_RX_TDDP_PSZ, v);

	/*
	 * 4K, 8K, 16K, 64K DDP "page sizes" for iSCSI DDP.  These have been
	 * chosen with MAXPHYS = 128K in mind.  The largest DDP buffer that we
	 * may have to deal with is MAXPHYS + 1 page.
	 */
	v = V_HPZ0(0) | V_HPZ1(1) | V_HPZ2(2) | V_HPZ3(4);
	t4_write_reg(sc, A_ULP_RX_ISCSI_PSZ, v);

	/* We use multiple DDP page sizes both in plain-TOE and ISCSI modes. */
	m = v = F_TDDPTAGTCB | F_ISCSITAGTCB;
	t4_set_reg_field(sc, A_ULP_RX_CTL, m, v);

	m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET |
	    F_RESETDDPOFFSET;
	v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET;
	t4_set_reg_field(sc, A_TP_PARA_REG5, m, v);
}

/*
 * SGE wants the buffer to be at least 64B and then a multiple of 16.  Its
 * address must be 16B aligned.  If padding is in use the buffer's start and end
 * need to be aligned to the pad boundary as well.  We'll just make sure that
 * the size is a multiple of the pad boundary here, it is up to the buffer
 * allocation code to make sure the start of the buffer is aligned.
 *
 * Returns non-zero if hwsz is acceptable.
 */
static inline int
hwsz_ok(struct adapter *sc, int hwsz)
{
	/* Alignment mask: the pad boundary if fl_pad is non-zero, else 16. */
	int mask = fl_pad ? sc->params.sge.pad_boundary - 1 : 16 - 1;

	return (hwsz >= 64 && (hwsz & mask) == 0);
}

/*
 * Initialize the rx buffer sizes and figure out which zones the buffers will
 * be allocated from.
 *
 * For every software cluster size this records the hardware buffer size
 * index that matches it exactly (hwidx1) and the index and size chosen for
 * use with cluster metadata (hwidx2/size2).  BUF_PACKING_OK is set in
 * sc->flags if any size gets a hwidx2, and safe_zidx is set to the first
 * entry whose size equals safest_rx_cluster.
 */
void
t4_init_rx_buf_info(struct adapter *sc)
{
	struct sge *s = &sc->sge;
	struct sge_params *sp = &sc->params.sge;
	int i, j, n;
	static int sw_buf_sizes[] = {	/* Sorted by size */
		MCLBYTES,
+#if MJUMPAGESIZE != MCLBYTES
		MJUMPAGESIZE,
+#endif
		MJUM9BYTES,
		MJUM16BYTES
	};
	struct rx_buf_info *rxb;

	s->safe_zidx = -1;
	rxb = &s->rx_buf_info[0];
	/* NOTE(review): assumes SW_ZONE_SIZES == nitems(sw_buf_sizes). */
	for (i = 0; i < SW_ZONE_SIZES; i++, rxb++) {
		rxb->size1 = sw_buf_sizes[i];
		rxb->zone = m_getzone(rxb->size1);
		rxb->type = m_gettype(rxb->size1);
		rxb->size2 = 0;
		rxb->hwidx1 = -1;
		rxb->hwidx2 = -1;
		for (j = 0; j < SGE_FLBUF_SIZES; j++) {
			int hwsize = sp->sge_fl_buffer_size[j];

			if (!hwsz_ok(sc, hwsize))
				continue;

			/* hwidx for size1 */
			if (rxb->hwidx1 == -1 && rxb->size1 == hwsize)
				rxb->hwidx1 = j;

			/* hwidx for size2 (buffer packing) */
			if (rxb->size1 - CL_METADATA_SIZE < hwsize)
				continue;
			/* n is the space left over next to the metadata. */
			n = rxb->size1 - hwsize - CL_METADATA_SIZE;
			if (n == 0) {
				rxb->hwidx2 = j;
				rxb->size2 = hwsize;
				break;	/* stop looking */
			}
			if (rxb->hwidx2 != -1) {
				if (n < sp->sge_fl_buffer_size[rxb->hwidx2] -
				    hwsize - CL_METADATA_SIZE) {
					rxb->hwidx2 = j;
					rxb->size2 = hwsize;
				}
			} else if (n <= 2 * CL_METADATA_SIZE) {
				rxb->hwidx2 = j;
				rxb->size2 = hwsize;
			}
		}
		if (rxb->hwidx2 != -1)
			sc->flags |= BUF_PACKING_OK;
		if (s->safe_zidx == -1 && rxb->size1 == safest_rx_cluster)
			s->safe_zidx = i;
	}
}

/*
 * Verify some basic SGE settings for the PF and VF driver, and other
 * miscellaneous settings for the PF driver.
*/ int t4_verify_chip_settings(struct adapter *sc) { struct sge_params *sp = &sc->params.sge; uint32_t m, v, r; int rc = 0; const uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE); m = F_RXPKTCPLMODE; v = F_RXPKTCPLMODE; r = sp->sge_control; if ((r & m) != v) { device_printf(sc->dev, "invalid SGE_CONTROL(0x%x)\n", r); rc = EINVAL; } /* * If this changes then every single use of PAGE_SHIFT in the driver * needs to be carefully reviewed for PAGE_SHIFT vs sp->page_shift. */ if (sp->page_shift != PAGE_SHIFT) { device_printf(sc->dev, "invalid SGE_HOST_PAGE_SIZE(0x%x)\n", r); rc = EINVAL; } if (sc->flags & IS_VF) return (0); v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6); r = t4_read_reg(sc, A_ULP_RX_TDDP_PSZ); if (r != v) { device_printf(sc->dev, "invalid ULP_RX_TDDP_PSZ(0x%x)\n", r); if (sc->vres.ddp.size != 0) rc = EINVAL; } m = v = F_TDDPTAGTCB; r = t4_read_reg(sc, A_ULP_RX_CTL); if ((r & m) != v) { device_printf(sc->dev, "invalid ULP_RX_CTL(0x%x)\n", r); if (sc->vres.ddp.size != 0) rc = EINVAL; } m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET | F_RESETDDPOFFSET; v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET; r = t4_read_reg(sc, A_TP_PARA_REG5); if ((r & m) != v) { device_printf(sc->dev, "invalid TP_PARA_REG5(0x%x)\n", r); if (sc->vres.ddp.size != 0) rc = EINVAL; } return (rc); } int t4_create_dma_tag(struct adapter *sc) { int rc; rc = bus_dma_tag_create(bus_get_dma_tag(sc->dev), 1, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, BUS_SPACE_MAXSIZE, BUS_SPACE_UNRESTRICTED, BUS_SPACE_MAXSIZE, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->dmat); if (rc != 0) { device_printf(sc->dev, "failed to create main DMA tag: %d\n", rc); } return (rc); } void t4_sge_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx, struct sysctl_oid_list *children) { struct sge_params *sp = &sc->params.sge; SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "buffer_sizes", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_bufsizes, "A", "freelist buffer 
sizes"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pktshift", CTLFLAG_RD, NULL, sp->fl_pktshift, "payload DMA offset in rx buffer (bytes)"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pad", CTLFLAG_RD, NULL, sp->pad_boundary, "payload pad boundary (bytes)"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "spg_len", CTLFLAG_RD, NULL, sp->spg_len, "status page size (bytes)"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "cong_drop", CTLFLAG_RD, NULL, cong_drop, "congestion drop setting"); #ifdef TCP_OFFLOAD SYSCTL_ADD_INT(ctx, children, OID_AUTO, "ofld_cong_drop", CTLFLAG_RD, NULL, ofld_cong_drop, "congestion drop setting"); #endif SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pack", CTLFLAG_RD, NULL, sp->pack_boundary, "payload pack boundary (bytes)"); } int t4_destroy_dma_tag(struct adapter *sc) { if (sc->dmat) bus_dma_tag_destroy(sc->dmat); return (0); } /* * Allocate and initialize the firmware event queue, control queues, and special * purpose rx queues owned by the adapter. * * Returns errno on failure. Resources allocated up to that point may still be * allocated. Caller is responsible for cleanup in case this function fails. */ int t4_setup_adapter_queues(struct adapter *sc) { int rc, i; ADAPTER_LOCK_ASSERT_NOTOWNED(sc); /* * Firmware event queue */ rc = alloc_fwq(sc); if (rc != 0) return (rc); /* * That's all for the VF driver. */ if (sc->flags & IS_VF) return (rc); /* * XXX: General purpose rx queues, one per port. */ /* * Control queues, one per port. */ for_each_port(sc, i) { rc = alloc_ctrlq(sc, i); if (rc != 0) return (rc); } return (rc); } /* * Idempotent */ int t4_teardown_adapter_queues(struct adapter *sc) { int i; ADAPTER_LOCK_ASSERT_NOTOWNED(sc); if (sc->sge.ctrlq != NULL) { MPASS(!(sc->flags & IS_VF)); /* VFs don't allocate ctrlq. */ for_each_port(sc, i) free_ctrlq(sc, i); } free_fwq(sc); return (0); } /* Maximum payload that could arrive with a single iq descriptor. 
*/
static inline int
max_rx_payload(struct adapter *sc, if_t ifp, const bool ofld)
{
	int maxp;

	/* large enough even when hw VLAN extraction is disabled */
	maxp = sc->params.sge.fl_pktshift + ETHER_HDR_LEN +
	    ETHER_VLAN_ENCAP_LEN + if_getmtu(ifp);
	/*
	 * For an offload queue with TLS enabled and TLS key capability, use
	 * the TP's max rx PDU size if it is larger.
	 */
	if (ofld && sc->tt.tls && sc->cryptocaps & FW_CAPS_CONFIG_TLSKEYS &&
	    maxp < sc->params.tp.max_rx_pdu)
		maxp = sc->params.tp.max_rx_pdu;
	return (maxp);
}

/*
 * Allocates all the queues of a VI: netmap rx/tx queues (if the interface is
 * netmap capable), NIC rx queues, offload rx queues, NIC tx queues, and
 * offload tx queues, in that order.  On any failure everything is torn down
 * again via t4_teardown_vi_queues() and the error is returned.
 */
int
t4_setup_vi_queues(struct vi_info *vi)
{
	int rc = 0, i, intr_idx;
	struct sge_rxq *rxq;
	struct sge_txq *txq;
#ifdef TCP_OFFLOAD
	struct sge_ofld_rxq *ofld_rxq;
#endif
#if defined(TCP_OFFLOAD) || defined(RATELIMIT)
	struct sge_ofld_txq *ofld_txq;
#endif
#ifdef DEV_NETMAP
	int saved_idx, iqidx;
	struct sge_nm_rxq *nm_rxq;
	struct sge_nm_txq *nm_txq;
#endif
	struct adapter *sc = vi->adapter;
	if_t ifp = vi->ifp;
	int maxp;

	/* Interrupt vector to start from (when using multiple vectors) */
	intr_idx = vi->first_intr;

#ifdef DEV_NETMAP
	saved_idx = intr_idx;
	if (if_getcapabilities(ifp) & IFCAP_NETMAP) {

		/* netmap is supported with direct interrupts only. */
		MPASS(!forwarding_intr_to_fwq(sc));
		MPASS(vi->first_intr >= 0);

		/*
		 * We don't have buffers to back the netmap rx queues
		 * right now so we create the queues in a way that
		 * doesn't set off any congestion signal in the chip.
		 */
		for_each_nm_rxq(vi, i, nm_rxq) {
			rc = alloc_nm_rxq(vi, nm_rxq, intr_idx, i);
			if (rc != 0)
				goto done;
			intr_idx++;
		}

		/* Each netmap txq is paired with a netmap rxq, round-robin. */
		for_each_nm_txq(vi, i, nm_txq) {
			iqidx = vi->first_nm_rxq + (i % vi->nnmrxq);
			rc = alloc_nm_txq(vi, nm_txq, iqidx, i);
			if (rc != 0)
				goto done;
		}
	}

	/* Normal rx queues and netmap rx queues share the same interrupts. */
	intr_idx = saved_idx;
#endif

	/*
	 * Allocate rx queues first because a default iqid is required when
	 * creating a tx queue.
	 */
	maxp = max_rx_payload(sc, ifp, false);
	for_each_rxq(vi, i, rxq) {
		rc = alloc_rxq(vi, rxq, i, intr_idx, maxp);
		if (rc != 0)
			goto done;
		/* One vector per rxq unless interrupts go to the fwq. */
		if (!forwarding_intr_to_fwq(sc))
			intr_idx++;
	}
#ifdef DEV_NETMAP
	/* Skip past the vectors used by either kind of rx queue. */
	if (if_getcapabilities(ifp) & IFCAP_NETMAP)
		intr_idx = saved_idx + max(vi->nrxq, vi->nnmrxq);
#endif
#ifdef TCP_OFFLOAD
	maxp = max_rx_payload(sc, ifp, true);
	for_each_ofld_rxq(vi, i, ofld_rxq) {
		rc = alloc_ofld_rxq(vi, ofld_rxq, i, intr_idx, maxp);
		if (rc != 0)
			goto done;
		if (!forwarding_intr_to_fwq(sc))
			intr_idx++;
	}
#endif

	/*
	 * Now the tx queues.
	 */
	for_each_txq(vi, i, txq) {
		rc = alloc_txq(vi, txq, i);
		if (rc != 0)
			goto done;
	}
#if defined(TCP_OFFLOAD) || defined(RATELIMIT)
	for_each_ofld_txq(vi, i, ofld_txq) {
		rc = alloc_ofld_txq(vi, ofld_txq, i);
		if (rc != 0)
			goto done;
	}
#endif
done:
	if (rc)
		t4_teardown_vi_queues(vi);

	return (rc);
}

/*
 * Frees all the queues of a VI: netmap queues, then tx queues, then rx
 * queues.  Always returns 0.
 *
 * Idempotent
 */
int
t4_teardown_vi_queues(struct vi_info *vi)
{
	int i;
	struct sge_rxq *rxq;
	struct sge_txq *txq;
#if defined(TCP_OFFLOAD) || defined(RATELIMIT)
	struct sge_ofld_txq *ofld_txq;
#endif
#ifdef TCP_OFFLOAD
	struct sge_ofld_rxq *ofld_rxq;
#endif
#ifdef DEV_NETMAP
	struct sge_nm_rxq *nm_rxq;
	struct sge_nm_txq *nm_txq;
#endif

#ifdef DEV_NETMAP
	if (if_getcapabilities(vi->ifp) & IFCAP_NETMAP) {
		for_each_nm_txq(vi, i, nm_txq) {
			free_nm_txq(vi, nm_txq);
		}

		for_each_nm_rxq(vi, i, nm_rxq) {
			free_nm_rxq(vi, nm_rxq);
		}
	}
#endif

	/*
	 * Take down all the tx queues first, as they reference the rx queues
	 * (for egress updates, etc.).
	 */

	for_each_txq(vi, i, txq) {
		free_txq(vi, txq);
	}
#if defined(TCP_OFFLOAD) || defined(RATELIMIT)
	for_each_ofld_txq(vi, i, ofld_txq) {
		free_ofld_txq(vi, ofld_txq);
	}
#endif

	/*
	 * Then take down the rx queues.
	 */

	for_each_rxq(vi, i, rxq) {
		free_rxq(vi, rxq);
	}
#ifdef TCP_OFFLOAD
	for_each_ofld_rxq(vi, i, ofld_rxq) {
		free_ofld_rxq(vi, ofld_rxq);
	}
#endif

	return (0);
}

/*
 * Interrupt handler when the driver is using only 1 interrupt.  This is a very
 * unusual scenario.
 *
 * a) Deals with errors, if any.
* b) Services firmware event queue, which is taking interrupts for all other * queues. */ void t4_intr_all(void *arg) { struct adapter *sc = arg; struct sge_iq *fwq = &sc->sge.fwq; MPASS(sc->intr_count == 1); if (sc->intr_type == INTR_INTX) t4_write_reg(sc, MYPF_REG(A_PCIE_PF_CLI), 0); t4_intr_err(arg); t4_intr_evt(fwq); } /* * Interrupt handler for errors (installed directly when multiple interrupts are * being used, or called by t4_intr_all). */ void t4_intr_err(void *arg) { struct adapter *sc = arg; uint32_t v; const bool verbose = (sc->debug_flags & DF_VERBOSE_SLOWINTR) != 0; if (atomic_load_int(&sc->error_flags) & ADAP_FATAL_ERR) return; v = t4_read_reg(sc, MYPF_REG(A_PL_PF_INT_CAUSE)); if (v & F_PFSW) { sc->swintr++; t4_write_reg(sc, MYPF_REG(A_PL_PF_INT_CAUSE), v); } if (t4_slow_intr_handler(sc, verbose)) t4_fatal_err(sc, false); } /* * Interrupt handler for iq-only queues. The firmware event queue is the only * such queue right now. */ void t4_intr_evt(void *arg) { struct sge_iq *iq = arg; if (atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY)) { service_iq(iq, 0); (void) atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE); } } /* * Interrupt handler for iq+fl queues. */ void t4_intr(void *arg) { struct sge_iq *iq = arg; if (atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY)) { service_iq_fl(iq, 0); (void) atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE); } } #ifdef DEV_NETMAP /* * Interrupt handler for netmap rx queues. */ void t4_nm_intr(void *arg) { struct sge_nm_rxq *nm_rxq = arg; if (atomic_cmpset_int(&nm_rxq->nm_state, NM_ON, NM_BUSY)) { service_nm_rxq(nm_rxq); (void) atomic_cmpset_int(&nm_rxq->nm_state, NM_BUSY, NM_ON); } } /* * Interrupt handler for vectors shared between NIC and netmap rx queues. */ void t4_vi_intr(void *arg) { struct irq *irq = arg; MPASS(irq->nm_rxq != NULL); t4_nm_intr(irq->nm_rxq); MPASS(irq->rxq != NULL); t4_intr(irq->rxq); } #endif /* * Deals with interrupts on an iq-only (no freelist) queue. 
*/
static int
service_iq(struct sge_iq *iq, int budget)
{
	struct sge_iq *q;
	struct adapter *sc = iq->adapter;
	struct iq_desc *d = &iq->desc[iq->cidx];
	int ndescs = 0, limit;
	int rsp_type;
	uint32_t lq;
	STAILQ_HEAD(, sge_iq) iql = STAILQ_HEAD_INITIALIZER(iql);

	KASSERT(iq->state == IQS_BUSY, ("%s: iq %p not BUSY", __func__, iq));
	KASSERT((iq->flags & IQ_HAS_FL) == 0,
	    ("%s: called for iq %p with fl (iq->flags 0x%x)", __func__, iq,
	    iq->flags));
	MPASS((iq->flags & IQ_ADJ_CREDIT) == 0);
	MPASS((iq->flags & IQ_LRO_ENABLED) == 0);

	/* Descriptors processed between cidx updates written to the chip. */
	limit = budget ? budget : iq->qsize / 16;

	/*
	 * We always come back and check the descriptor ring for new indirect
	 * interrupts and other responses after running a single handler.
	 */
	for (;;) {
		while ((d->rsp.u.type_gen & F_RSPD_GEN) == iq->gen) {

			rmb();

			rsp_type = G_RSPD_TYPE(d->rsp.u.type_gen);
			lq = be32toh(d->rsp.pldbuflen_qid);

			switch (rsp_type) {
			case X_RSPD_TYPE_FLBUF:
				panic("%s: data for an iq (%p) with no freelist",
				    __func__, iq);

				/* NOTREACHED */

			case X_RSPD_TYPE_CPL:
				KASSERT(d->rss.opcode < NUM_CPL_CMDS,
				    ("%s: bad opcode %02x.", __func__,
				    d->rss.opcode));
				t4_cpl_handler[d->rss.opcode](iq, &d->rss, NULL);
				break;

			case X_RSPD_TYPE_INTR:
				/*
				 * There are 1K interrupt-capable queues (qids 0
				 * through 1023).  A response type indicating a
				 * forwarded interrupt with a qid >= 1K is an
				 * iWARP async notification.
				 */
				if (__predict_true(lq >= 1024)) {
					t4_an_handler(iq, &d->rsp);
					break;
				}

				/*
				 * Forwarded interrupt: service the iq+fl queue
				 * it names, and remember the queue if it did
				 * not finish within its budget.
				 */
				q = sc->sge.iqmap[lq - sc->sge.iq_start -
				    sc->sge.iq_base];
				if (atomic_cmpset_int(&q->state, IQS_IDLE,
				    IQS_BUSY)) {
					if (service_iq_fl(q, q->qsize / 16) == 0) {
						(void) atomic_cmpset_int(&q->state,
						    IQS_BUSY, IQS_IDLE);
					} else {
						STAILQ_INSERT_TAIL(&iql, q,
						    link);
					}
				}
				break;

			default:
				KASSERT(0,
				    ("%s: illegal response type %d on iq %p",
				    __func__, rsp_type, iq));
				log(LOG_ERR,
				    "%s: illegal response type %d on iq %p",
				    device_get_nameunit(sc->dev), rsp_type, iq);
				break;
			}

			/* Advance; the generation bit flips on wrap-around. */
			d++;
			if (__predict_false(++iq->cidx == iq->sidx)) {
				iq->cidx = 0;
				iq->gen ^= F_RSPD_GEN;
				d = &iq->desc[0];
			}
			if (__predict_false(++ndescs == limit)) {
				t4_write_reg(sc, sc->sge_gts_reg,
				    V_CIDXINC(ndescs) |
				    V_INGRESSQID(iq->cntxt_id) |
				    V_SEINTARM(V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX)));
				ndescs = 0;

				if (budget) {
					return (EINPROGRESS);
				}
			}
		}

		if (STAILQ_EMPTY(&iql))
			break;

		/*
		 * Process the head only, and send it to the back of the list if
		 * it's still not done.
		 */
		q = STAILQ_FIRST(&iql);
		STAILQ_REMOVE_HEAD(&iql, link);
		if (service_iq_fl(q, q->qsize / 8) == 0)
			(void) atomic_cmpset_int(&q->state, IQS_BUSY, IQS_IDLE);
		else
			STAILQ_INSERT_TAIL(&iql, q, link);
	}

	/* Final cidx update, written with the queue's own intr_params. */
	t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) |
	    V_INGRESSQID((u32)iq->cntxt_id) | V_SEINTARM(iq->intr_params));

	return (0);
}

#if defined(INET) || defined(INET6)
/*
 * Returns non-zero if the LRO control block has a non-zero lro_mbuf_max.
 */
static inline int
sort_before_lro(struct lro_ctrl *lro)
{

	return (lro->lro_mbuf_max != 0);
}
#endif

#define CGBE_SHIFT_SCALE 10

/*
 * Converts a hardware timestamp (low 60 bits of 'lf') to nanoseconds using
 * the adapter's current clock calibration record.  Returns 0 if that record's
 * generation is 0.
 */
static inline uint64_t
t4_tstmp_to_ns(struct adapter *sc, uint64_t lf)
{
	struct clock_sync *cur, dcur;
	uint64_t hw_clocks;
	uint64_t hw_clk_div;
	sbintime_t sbt_cur_to_prev, sbt;
	uint64_t hw_tstmp = lf & 0xfffffffffffffffULL;	/* 60b, not 64b. */
	seqc_t gen;

	/* Retry until a consistent snapshot of the record is obtained. */
	for (;;) {
		cur = &sc->cal_info[sc->cal_current];
		gen = seqc_read(&cur->gen);
		if (gen == 0)
			return (0);
		dcur = *cur;
		if (seqc_consistent(&cur->gen, gen))
			break;
	}

	/*
	 * Our goal here is to have a result that is:
	 *
	 * (                             (cur_time - prev_time)   )
	 * ((hw_tstmp - hw_prev) *  ----------------------------- ) + prev_time
	 * (                             (hw_cur - hw_prev)       )
	 *
	 * With the constraints that we cannot use float and we
	 * don't want to overflow the uint64_t numbers we are using.
	 */
	hw_clocks = hw_tstmp - dcur.hw_prev;
	sbt_cur_to_prev = (dcur.sbt_cur - dcur.sbt_prev);
	hw_clk_div = dcur.hw_cur - dcur.hw_prev;
	sbt = hw_clocks * sbt_cur_to_prev / hw_clk_div + dcur.sbt_prev;
	return (sbttons(sbt));
}

/*
 * Advances the freelist's cidx to the next buffer and resets rx_offset.
 * hw_cidx is updated once for every 8 buffers consumed.
 */
static inline void
move_to_next_rxbuf(struct sge_fl *fl)
{

	fl->rx_offset = 0;
	if (__predict_false((++fl->cidx & 7) == 0)) {
		uint16_t cidx = fl->cidx >> 3;

		if (__predict_false(cidx == fl->sidx))
			fl->cidx = cidx = 0;
		fl->hw_cidx = cidx;
	}
}

/*
 * Deals with interrupts on an iq+fl queue.
 *
 * Returns EINPROGRESS if a non-zero budget was exhausted, 0 otherwise.
 */
static int
service_iq_fl(struct sge_iq *iq, int budget)
{
	struct sge_rxq *rxq = iq_to_rxq(iq);
	struct sge_fl *fl;
	struct adapter *sc = iq->adapter;
	struct iq_desc *d = &iq->desc[iq->cidx];
	int ndescs, limit;
	int rsp_type, starved;
	uint32_t lq;
	uint16_t fl_hw_cidx;
	struct mbuf *m0;
#if defined(INET) || defined(INET6)
	const struct timeval lro_timeout = {0, sc->lro_timeout};
	struct lro_ctrl *lro = &rxq->lro;
#endif

	KASSERT(iq->state == IQS_BUSY, ("%s: iq %p not BUSY", __func__, iq));
	MPASS(iq->flags & IQ_HAS_FL);

	ndescs = 0;
#if defined(INET) || defined(INET6)
	/*
	 * A credit was held back by the previous run.  Return it now (and
	 * flush LRO) if there is nothing new, else start the count at 1.
	 */
	if (iq->flags & IQ_ADJ_CREDIT) {
		MPASS(sort_before_lro(lro));
		iq->flags &= ~IQ_ADJ_CREDIT;
		if ((d->rsp.u.type_gen & F_RSPD_GEN) != iq->gen) {
			tcp_lro_flush_all(lro);
			t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(1) |
			    V_INGRESSQID((u32)iq->cntxt_id) |
			    V_SEINTARM(iq->intr_params));
			return (0);
		}
		ndescs = 1;
	}
#else
	MPASS((iq->flags & IQ_ADJ_CREDIT) == 0);
#endif

	limit = budget ? budget : iq->qsize / 16;
	fl = &rxq->fl;
	fl_hw_cidx = fl->hw_cidx;	/* stable snapshot */
	while ((d->rsp.u.type_gen & F_RSPD_GEN) == iq->gen) {

		rmb();

		m0 = NULL;
		rsp_type = G_RSPD_TYPE(d->rsp.u.type_gen);
		lq = be32toh(d->rsp.pldbuflen_qid);

		switch (rsp_type) {
		case X_RSPD_TYPE_FLBUF:
			if (lq & F_RSPD_NEWBUF) {
				if (fl->rx_offset > 0)
					move_to_next_rxbuf(fl);
				lq = G_RSPD_LEN(lq);
			}
			/* Refill once hw_cidx has moved by more than 4. */
			if (IDXDIFF(fl->hw_cidx, fl_hw_cidx, fl->sidx) > 4) {
				FL_LOCK(fl);
				refill_fl(sc, fl, 64);
				FL_UNLOCK(fl);
				fl_hw_cidx = fl->hw_cidx;
			}

			if (d->rss.opcode == CPL_RX_PKT) {
				if (__predict_true(eth_rx(sc, rxq, d, lq) == 0))
					break;
				goto out;
			}
			m0 = get_fl_payload(sc, fl, lq);
			if (__predict_false(m0 == NULL))
				goto out;

			/* fall through */

		case X_RSPD_TYPE_CPL:
			KASSERT(d->rss.opcode < NUM_CPL_CMDS,
			    ("%s: bad opcode %02x.", __func__, d->rss.opcode));
			t4_cpl_handler[d->rss.opcode](iq, &d->rss, m0);
			break;

		case X_RSPD_TYPE_INTR:

			/*
			 * There are 1K interrupt-capable queues (qids 0
			 * through 1023).  A response type indicating a
			 * forwarded interrupt with a qid >= 1K is an
			 * iWARP async notification.  That is the only
			 * acceptable indirect interrupt on this queue.
			 */
			if (__predict_false(lq < 1024)) {
				panic("%s: indirect interrupt on iq_fl %p "
				    "with qid %u", __func__, iq, lq);
			}

			t4_an_handler(iq, &d->rsp);
			break;

		default:
			KASSERT(0, ("%s: illegal response type %d on iq %p",
			    __func__, rsp_type, iq));
			log(LOG_ERR, "%s: illegal response type %d on iq %p",
			    device_get_nameunit(sc->dev), rsp_type, iq);
			break;
		}

		/* Advance; the generation bit flips on wrap-around. */
		d++;
		if (__predict_false(++iq->cidx == iq->sidx)) {
			iq->cidx = 0;
			iq->gen ^= F_RSPD_GEN;
			d = &iq->desc[0];
		}
		if (__predict_false(++ndescs == limit)) {
			t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) |
			    V_INGRESSQID(iq->cntxt_id) |
			    V_SEINTARM(V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX)));

#if defined(INET) || defined(INET6)
			if (iq->flags & IQ_LRO_ENABLED &&
			    !sort_before_lro(lro) &&
			    sc->lro_timeout != 0) {
				tcp_lro_flush_inactive(lro, &lro_timeout);
			}
#endif
			if (budget)
				return (EINPROGRESS);
			ndescs = 0;
		}
	}
out:
#if defined(INET) || defined(INET6)
	if (iq->flags & IQ_LRO_ENABLED) {
		if (ndescs > 0 && lro->lro_mbuf_count > 8) {
			MPASS(sort_before_lro(lro));
			/* hold back one credit and don't flush LRO state */
			iq->flags |= IQ_ADJ_CREDIT;
			ndescs--;
		} else {
			tcp_lro_flush_all(lro);
		}
	}
#endif

	t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) |
	    V_INGRESSQID((u32)iq->cntxt_id) | V_SEINTARM(iq->intr_params));

	/* Top up the freelist; hand it to the starvation list if needed. */
	FL_LOCK(fl);
	starved = refill_fl(sc, fl, 64);
	FL_UNLOCK(fl);
	if (__predict_false(starved != 0))
		add_fl_to_sfl(sc, fl);

	return (0);
}

/*
 * Returns the address of the metadata area of the cluster described by sd,
 * which is moff bytes into the cluster.
 */
static inline struct cluster_metadata *
cl_metadata(struct fl_sdesc *sd)
{

	return ((void *)(sd->cl + sd->moff));
}

/*
 * External storage free routine installed by get_scatter_segment().  Returns
 * the cluster to the zone recorded in its metadata and counts the release.
 */
static void
rxb_free(struct mbuf *m)
{
	struct cluster_metadata *clm = m->m_ext.ext_arg1;

	uma_zfree(clm->zone, clm->cl);
	counter_u64_add(extfree_rels, 1);
}

/*
 * The mbuf returned comes from zone_mbuf and carries the payload in one of
 * these ways
 * a) complete frame inside the mbuf
 * b) m_cljset (for clusters without metadata)
 * c) m_extaddref (cluster with metadata)
 *
 * Returns NULL, without consuming anything from the freelist, if the mbuf
 * cannot be allocated.
 */
static struct mbuf *
get_scatter_segment(struct adapter *sc, struct sge_fl *fl, int fr_offset,
    int remaining)
{
	struct mbuf *m;
	struct fl_sdesc *sd = &fl->sdesc[fl->cidx];
	struct rx_buf_info *rxb = &sc->sge.rx_buf_info[sd->zidx];
	struct cluster_metadata *clm;
	int len, blen;
	caddr_t payload;

	/*
	 * len is the payload in this buffer, blen is how much of the buffer
	 * is consumed (payload plus any padding when packing).
	 */
	if (fl->flags & FL_BUF_PACKING) {
		u_int l, pad;

		blen = rxb->size2 - fl->rx_offset;	/* max possible in this buf */
		len = min(remaining, blen);
		payload = sd->cl + fl->rx_offset;

		l = fr_offset + len;
		pad = roundup2(l, fl->buf_boundary) - l;
		if (fl->rx_offset + len + pad < rxb->size2)
			blen = len + pad;
		MPASS(fl->rx_offset + blen <= rxb->size2);
	} else {
		MPASS(fl->rx_offset == 0);	/* not packing */
		blen = rxb->size1;
		len = min(remaining, blen);
		payload = sd->cl;
	}

	/* Only the first segment of a frame gets a packet header. */
	if (fr_offset == 0) {
		m = m_gethdr(M_NOWAIT, MT_DATA);
		if (__predict_false(m == NULL))
			return (NULL);
		m->m_pkthdr.len = remaining;
	} else {
		m = m_get(M_NOWAIT, MT_DATA);
		if (__predict_false(m == NULL))
			return (NULL);
	}
	m->m_len = len;
	kmsan_mark(payload, len, KMSAN_STATE_INITED);

	if (sc->sc_do_rxcopy && len < RX_COPY_THRESHOLD) {
		/* copy data to mbuf */
		bcopy(payload, mtod(m, caddr_t), len);
		if (fl->flags & FL_BUF_PACKING) {
			fl->rx_offset += blen;
			MPASS(fl->rx_offset <= rxb->size2);
			if (fl->rx_offset < rxb->size2)
				return (m);	/* without advancing the cidx */
		}
	} else if (fl->flags & FL_BUF_PACKING) {
		clm = cl_metadata(sd);
		/* The first mbuf on this cluster sets up the metadata. */
		if (sd->nmbuf++ == 0) {
			clm->refcount = 1;
			clm->zone = rxb->zone;
			clm->cl = sd->cl;
			counter_u64_add(extfree_refs, 1);
		}
		m_extaddref(m, payload, blen, &clm->refcount, rxb_free, clm,
		    NULL);

		fl->rx_offset += blen;
		MPASS(fl->rx_offset <= rxb->size2);
		if (fl->rx_offset < rxb->size2)
			return (m);	/* without advancing the cidx */
	} else {
		m_cljset(m, sd->cl, rxb->type);
		sd->cl = NULL;	/* consumed, not a recycle candidate */
	}

	move_to_next_rxbuf(fl);

	return (m);
}

/*
 * Builds an mbuf chain for a payload of plen bytes from the freelist.  If an
 * mbuf allocation fails part way through, the partial chain is saved in the
 * freelist (FL_BUF_RESUME) and NULL is returned; the next call with the same
 * plen resumes from there.
 */
static struct mbuf *
get_fl_payload(struct adapter *sc, struct sge_fl *fl, const u_int plen)
{
	struct mbuf *m0, *m, **pnext;
	u_int remaining;

	if (__predict_false(fl->flags & FL_BUF_RESUME)) {
		M_ASSERTPKTHDR(fl->m0);
		MPASS(fl->m0->m_pkthdr.len == plen);
		MPASS(fl->remaining < plen);

		m0 = fl->m0;
		pnext = fl->pnext;
		remaining = fl->remaining;
		fl->flags &= ~FL_BUF_RESUME;
		goto get_segment;
	}

	/*
	 * Payload starts at rx_offset in the current hw buffer.  Its length is
	 * 'plen' and it may span multiple hw buffers.
	 */

	m0 = get_scatter_segment(sc, fl, 0, plen);
	if (m0 == NULL)
		return (NULL);
	remaining = plen - m0->m_len;
	pnext = &m0->m_next;
	while (remaining > 0) {
get_segment:
		MPASS(fl->rx_offset == 0);
		m = get_scatter_segment(sc, fl, plen - remaining, remaining);
		if (__predict_false(m == NULL)) {
			/* Save the state needed to resume later. */
			fl->m0 = m0;
			fl->pnext = pnext;
			fl->remaining = remaining;
			fl->flags |= FL_BUF_RESUME;
			return (NULL);
		}
		*pnext = m;
		pnext = &m->m_next;
		remaining -= m->m_len;
	}
	*pnext = NULL;

	M_ASSERTPKTHDR(m0);
	return (m0);
}

/*
 * Same freelist accounting as get_scatter_segment() but no mbuf is built.
 * Returns the number of payload bytes skipped in the current buffer.
 */
static int
skip_scatter_segment(struct adapter *sc, struct sge_fl *fl, int fr_offset,
    int remaining)
{
	struct fl_sdesc *sd = &fl->sdesc[fl->cidx];
	struct rx_buf_info *rxb = &sc->sge.rx_buf_info[sd->zidx];
	int len, blen;

	if (fl->flags & FL_BUF_PACKING) {
		u_int l, pad;

		blen = rxb->size2 - fl->rx_offset;	/* max possible in this buf */
		len = min(remaining, blen);

		l = fr_offset + len;
		pad = roundup2(l, fl->buf_boundary) - l;
		if (fl->rx_offset + len + pad < rxb->size2)
			blen = len + pad;
		fl->rx_offset += blen;
		MPASS(fl->rx_offset <= rxb->size2);
		if (fl->rx_offset < rxb->size2)
			return (len);	/* without advancing the cidx */
	} else {
		MPASS(fl->rx_offset == 0);	/* not packing */
		blen = rxb->size1;
		len = min(remaining, blen);
	}
	move_to_next_rxbuf(fl);
	return (len);
}

/*
 * Skips over a payload of plen bytes in the freelist, one buffer at a time.
 */
static inline void
skip_fl_payload(struct adapter *sc, struct sge_fl *fl, int plen)
{
	int remaining, fr_offset, len;

	fr_offset = 0;
	remaining = plen;
	while (remaining > 0) {
		len = skip_scatter_segment(sc, fl, fr_offset, remaining);
		fr_offset += len;
		remaining -= len;
	}
}

/*
 * Returns how many of the plen payload bytes are in the current freelist
 * buffer.
 */
static inline int
get_segment_len(struct adapter *sc, struct sge_fl *fl, int plen)
{
	int len;
	struct fl_sdesc *sd = &fl->sdesc[fl->cidx];
	struct rx_buf_info *rxb = &sc->sge.rx_buf_info[sd->zidx];

	if (fl->flags & FL_BUF_PACKING)
		len = rxb->size2 - fl->rx_offset;
	else
		len = rxb->size1;

	return (min(plen, len));
}

/*
 * Handles one received Ethernet frame (CPL_RX_PKT) of plen bytes: runs the
 * pfil hooks on the first segment if any are installed, builds the mbuf
 * chain, fills in the hash, checksum, VLAN and timestamp fields of the packet
 * header, and passes the frame to LRO or to if_input().
 *
 * Returns 0 if the frame was consumed and ENOMEM if the mbuf chain could not
 * be allocated.
 */
static int
eth_rx(struct adapter *sc, struct sge_rxq *rxq, const struct iq_desc *d,
    u_int plen)
{
	struct mbuf *m0;
	if_t ifp = rxq->ifp;
	struct sge_fl *fl = &rxq->fl;
	struct vi_info *vi = if_getsoftc(ifp);
	const struct cpl_rx_pkt *cpl;
#if defined(INET) || defined(INET6)
	struct lro_ctrl *lro = &rxq->lro;
#endif
	uint16_t err_vec, tnl_type, tnlhdr_len;
	/* Indexed by [d->rss.hash_type][d->rss.ipv6]. */
	static const int sw_hashtype[4][2] = {
		{M_HASHTYPE_NONE, M_HASHTYPE_NONE},
		{M_HASHTYPE_RSS_IPV4, M_HASHTYPE_RSS_IPV6},
		{M_HASHTYPE_RSS_TCP_IPV4, M_HASHTYPE_RSS_TCP_IPV6},
		{M_HASHTYPE_RSS_UDP_IPV4, M_HASHTYPE_RSS_UDP_IPV6},
	};
	/* Indexed by [outer is IPv6][inner is IPv6]. */
	static const int sw_csum_flags[2][2] = {
		{
			/* IP, inner IP */
			CSUM_ENCAP_VXLAN |
			    CSUM_L3_CALC | CSUM_L3_VALID |
			    CSUM_L4_CALC | CSUM_L4_VALID |
			    CSUM_INNER_L3_CALC | CSUM_INNER_L3_VALID |
			    CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID,

			/* IP, inner IP6 */
			CSUM_ENCAP_VXLAN |
			    CSUM_L3_CALC | CSUM_L3_VALID |
			    CSUM_L4_CALC | CSUM_L4_VALID |
			    CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID,
		},
		{
			/* IP6, inner IP */
			CSUM_ENCAP_VXLAN |
			    CSUM_L4_CALC | CSUM_L4_VALID |
			    CSUM_INNER_L3_CALC | CSUM_INNER_L3_VALID |
			    CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID,

			/* IP6, inner IP6 */
			CSUM_ENCAP_VXLAN |
			    CSUM_L4_CALC | CSUM_L4_VALID |
			    CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID,
		},
	};

	MPASS(plen > sc->params.sge.fl_pktshift);
	/*
	 * Let the pfil hooks look at the frame in place, before any mbuf is
	 * allocated.  This is skipped while a partial chain is pending.
	 */
	if (vi->pfil != NULL && PFIL_HOOKED_IN(vi->pfil) &&
	    __predict_true((fl->flags & FL_BUF_RESUME) == 0)) {
		struct fl_sdesc *sd = &fl->sdesc[fl->cidx];
		caddr_t frame;
		int rc, slen;

		slen = get_segment_len(sc, fl, plen) -
		    sc->params.sge.fl_pktshift;
		frame = sd->cl + fl->rx_offset + sc->params.sge.fl_pktshift;
		CURVNET_SET_QUIET(if_getvnet(ifp));
		rc = pfil_mem_in(vi->pfil, frame, slen, ifp, &m0);
		CURVNET_RESTORE();
		if (rc == PFIL_DROPPED || rc == PFIL_CONSUMED) {
			skip_fl_payload(sc, fl, plen);
			return (0);
		}
		if (rc == PFIL_REALLOCED) {
			/* The hook supplied m0; drop the freelist copy. */
			skip_fl_payload(sc, fl, plen);
			goto have_mbuf;
		}
	}

	m0 = get_fl_payload(sc, fl, plen);
	if (__predict_false(m0 == NULL))
		return (ENOMEM);

	/* Strip the fl_pktshift bytes that precede the frame. */
	m0->m_pkthdr.len -= sc->params.sge.fl_pktshift;
	m0->m_len -= sc->params.sge.fl_pktshift;
	m0->m_data += sc->params.sge.fl_pktshift;

have_mbuf:
	m0->m_pkthdr.rcvif = ifp;
	M_HASHTYPE_SET(m0, sw_hashtype[d->rss.hash_type][d->rss.ipv6]);
	m0->m_pkthdr.flowid = be32toh(d->rss.hash_val);

	/* The CPL immediately follows the RSS header in the descriptor. */
	cpl = (const void *)(&d->rss + 1);
	if (sc->params.tp.rx_pkt_encap) {
		const uint16_t ev = be16toh(cpl->err_vec);

		err_vec = G_T6_COMPR_RXERR_VEC(ev);
		tnl_type = G_T6_RX_TNL_TYPE(ev);
		tnlhdr_len = G_T6_RX_TNLHDR_LEN(ev);
	} else {
		err_vec = be16toh(cpl->err_vec);
		tnl_type = 0;
		tnlhdr_len = 0;
	}
	if (cpl->csum_calc && err_vec == 0) {
		int ipv6 = !!(cpl->l2info & htobe32(F_RXF_IP6));

		/* checksum(s) calculated and found to be correct. */

		MPASS((cpl->l2info & htobe32(F_RXF_IP)) ^
		    (cpl->l2info & htobe32(F_RXF_IP6)));
		m0->m_pkthdr.csum_data = be16toh(cpl->csum);
		if (tnl_type == 0) {
			if (!ipv6 && if_getcapenable(ifp) & IFCAP_RXCSUM) {
				m0->m_pkthdr.csum_flags = CSUM_L3_CALC |
				    CSUM_L3_VALID | CSUM_L4_CALC |
				    CSUM_L4_VALID;
			} else if (ipv6 &&
			    if_getcapenable(ifp) & IFCAP_RXCSUM_IPV6) {
				m0->m_pkthdr.csum_flags = CSUM_L4_CALC |
				    CSUM_L4_VALID;
			}
			rxq->rxcsum++;
		} else {
			MPASS(tnl_type == RX_PKT_TNL_TYPE_VXLAN);

			M_HASHTYPE_SETINNER(m0);
			if (__predict_false(cpl->ip_frag)) {
				/*
				 * csum_data is for the inner frame (which is an
				 * IP fragment) and is not 0xffff.  There is no
				 * way to pass the inner csum_data to the stack.
				 * We don't want the stack to use the inner
				 * csum_data to validate the outer frame or it
				 * will get rejected.  So we fix csum_data here
				 * and let sw do the checksum of inner IP
				 * fragments.
				 *
				 * XXX: Need 32b for csum_data2 in an rx mbuf.
				 * Maybe stuff it into rcv_tstmp?
				 */
				m0->m_pkthdr.csum_data = 0xffff;
				if (ipv6) {
					m0->m_pkthdr.csum_flags = CSUM_L4_CALC |
					    CSUM_L4_VALID;
				} else {
					m0->m_pkthdr.csum_flags = CSUM_L3_CALC |
					    CSUM_L3_VALID | CSUM_L4_CALC |
					    CSUM_L4_VALID;
				}
			} else {
				int outer_ipv6;

				MPASS(m0->m_pkthdr.csum_data == 0xffff);

				/*
				 * The outer frame is taken to be IPv6 if the
				 * tunnel header is long enough to hold an
				 * Ethernet and an IPv6 header.
				 */
				outer_ipv6 = tnlhdr_len >=
				    sizeof(struct ether_header) +
				    sizeof(struct ip6_hdr);
				m0->m_pkthdr.csum_flags =
				    sw_csum_flags[outer_ipv6][ipv6];
			}
			rxq->vxlan_rxcsum++;
		}
	}

	if (cpl->vlan_ex) {
		if (sc->flags & IS_VF && sc->vlan_id) {
			/*
			 * HW is not setup correctly if extracted vlan_id does
			 * not match the VF's setting.
			 */
			MPASS(be16toh(cpl->vlan) == sc->vlan_id);
		} else {
			m0->m_pkthdr.ether_vtag = be16toh(cpl->vlan);
			m0->m_flags |= M_VLANTAG;
			rxq->vlan_extraction++;
		}
	}

	if (rxq->iq.flags & IQ_RX_TIMESTAMP) {
		/*
		 * Fill up rcv_tstmp, but set M_TSTMP only if we get a
		 * non-zero value back from t4_tstmp_to_ns().
		 */
		m0->m_pkthdr.rcv_tstmp = t4_tstmp_to_ns(sc,
		    be64toh(d->rsp.u.last_flit));
		if (m0->m_pkthdr.rcv_tstmp != 0)
			m0->m_flags |= M_TSTMP;
	}

#ifdef NUMA
	m0->m_pkthdr.numa_domain = if_getnumadomain(ifp);
#endif
#if defined(INET) || defined(INET6)
	/* LRO is tried only for non-tunneled frames with a TCP hash type. */
	if (rxq->iq.flags & IQ_LRO_ENABLED && tnl_type == 0 &&
	    (M_HASHTYPE_GET(m0) == M_HASHTYPE_RSS_TCP_IPV4 ||
	    M_HASHTYPE_GET(m0) == M_HASHTYPE_RSS_TCP_IPV6)) {
		if (sort_before_lro(lro)) {
			tcp_lro_queue_mbuf(lro, m0);
			return (0); /* queued for sort, then LRO */
		}
		if (tcp_lro_rx(lro, m0, 0) == 0)
			return (0); /* queued for LRO */
	}
#endif
	if_input(ifp, m0);

	return (0);
}

/*
 * Must drain the wrq or make sure that someone else will.
*/
static void
wrq_tx_drain(void *arg, int n)
{
	struct sge_wrq *wrq = arg;
	struct sge_eq *eq = &wrq->eq;

	EQ_LOCK(eq);
	/*
	 * Nothing to do here if wr_list is empty. If incomplete_wrs is not
	 * empty then the drain is left to commit_wrq_wr.
	 */
	if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list))
		drain_wrq_wr_list(wrq->adapter, wrq);
	EQ_UNLOCK(eq);
}

/*
 * Copies work requests from the head of wrq->wr_list into the descriptor ring
 * for as long as there are enough descriptors available, and frees each one
 * after it has been copied. The doorbell is rung whenever 16 or more
 * descriptors have accumulated and once more at the end for any remainder.
 *
 * Must be called with the EQ lock held, with no incomplete WRs, and with at
 * least one WR in wr_list (all three are asserted).
 */
static void
drain_wrq_wr_list(struct adapter *sc, struct sge_wrq *wrq)
{
	struct sge_eq *eq = &wrq->eq;
	u_int available, dbdiff;	/* # of hardware descriptors */
	u_int n;
	struct wrqe *wr;
	struct fw_eth_tx_pkt_wr *dst;	/* any fw WR struct will do */

	EQ_LOCK_ASSERT_OWNED(eq);
	MPASS(TAILQ_EMPTY(&wrq->incomplete_wrs));
	wr = STAILQ_FIRST(&wrq->wr_list);
	MPASS(wr != NULL);	/* Must be called with something useful to do */
	MPASS(eq->pidx == eq->dbidx);
	dbdiff = 0;

	do {
		eq->cidx = read_hw_cidx(eq);
		if (eq->pidx == eq->cidx)
			available = eq->sidx - 1;
		else
			available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;

		MPASS(wr->wrq == wrq);
		n = howmany(wr->wr_len, EQ_ESIZE);
		if (available < n)
			break;

		dst = (void *)&eq->desc[eq->pidx];
		if (__predict_true(eq->sidx - eq->pidx > n)) {
			/* Won't wrap, won't end exactly at the status page. */
			bcopy(&wr->wr[0], dst, wr->wr_len);
			eq->pidx += n;
		} else {
			/*
			 * Copy what fits before sidx, then the rest (if any)
			 * to the start of the ring.
			 */
			int first_portion = (eq->sidx - eq->pidx) * EQ_ESIZE;

			bcopy(&wr->wr[0], dst, first_portion);
			if (wr->wr_len > first_portion) {
				bcopy(&wr->wr[first_portion], &eq->desc[0],
				    wr->wr_len - first_portion);
			}
			eq->pidx = n - (eq->sidx - eq->pidx);
		}
		wrq->tx_wrs_copied++;

		if (available < eq->sidx / 4 &&
		    atomic_cmpset_int(&eq->equiq, 0, 1)) {
			/*
			 * XXX: This is not 100% reliable with some
			 * types of WRs. But this is a very unusual
			 * situation for an ofld/ctrl queue anyway.
			 */
			dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ |
			    F_FW_WR_EQUEQ);
		}

		dbdiff += n;
		if (dbdiff >= 16) {
			ring_eq_db(sc, eq, dbdiff);
			dbdiff = 0;
		}

		STAILQ_REMOVE_HEAD(&wrq->wr_list, link);
		free_wrqe(wr);
		MPASS(wrq->nwr_pending > 0);
		wrq->nwr_pending--;
		MPASS(wrq->ndesc_needed >= n);
		wrq->ndesc_needed -= n;
	} while ((wr = STAILQ_FIRST(&wrq->wr_list)) != NULL);

	if (dbdiff)
		ring_eq_db(sc, eq, dbdiff);
}

/*
 * Doesn't fail. Holds on to work requests it can't send right away.
 *
 * The WR is appended to wrq->wr_list. The list is drained immediately unless
 * there are incomplete WRs, in which case commit_wrq_wr will drain it. The EQ
 * lock must be held by the caller.
 */
void
t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct wrqe *wr)
{
#ifdef INVARIANTS
	struct sge_eq *eq = &wrq->eq;
#endif

	EQ_LOCK_ASSERT_OWNED(eq);
	MPASS(wr != NULL);
	MPASS(wr->wr_len > 0 && wr->wr_len <= SGE_MAX_WR_LEN);
	MPASS((wr->wr_len & 0x7) == 0);

	STAILQ_INSERT_TAIL(&wrq->wr_list, wr, link);
	wrq->nwr_pending++;
	wrq->ndesc_needed += howmany(wr->wr_len, EQ_ESIZE);

	if (!TAILQ_EMPTY(&wrq->incomplete_wrs))
		return;	/* commit_wrq_wr will drain wr_list as well. */

	drain_wrq_wr_list(sc, wrq);

	/* Doorbell must have caught up to the pidx. */
	MPASS(eq->pidx == eq->dbidx);
}

/*
 * Recomputes fl->zidx, under the freelist's lock, for the freelist of every rx
 * queue of the VI that ifp belongs to. With TCP_OFFLOAD the same is done for
 * the offload rx queues using the payload size that max_rx_payload returns for
 * them.
 */
void
t4_update_fl_bufsize(if_t ifp)
{
	struct vi_info *vi = if_getsoftc(ifp);
	struct adapter *sc = vi->adapter;
	struct sge_rxq *rxq;
#ifdef TCP_OFFLOAD
	struct sge_ofld_rxq *ofld_rxq;
#endif
	struct sge_fl *fl;
	int i, maxp;

	maxp = max_rx_payload(sc, ifp, false);
	for_each_rxq(vi, i, rxq) {
		fl = &rxq->fl;

		FL_LOCK(fl);
		fl->zidx = find_refill_source(sc, maxp,
		    fl->flags & FL_BUF_PACKING);
		FL_UNLOCK(fl);
	}
#ifdef TCP_OFFLOAD
	maxp = max_rx_payload(sc, ifp, true);
	for_each_ofld_rxq(vi, i, ofld_rxq) {
		fl = &ofld_rxq->fl;

		FL_LOCK(fl);
		fl->zidx = find_refill_source(sc, maxp,
		    fl->flags & FL_BUF_PACKING);
		FL_UNLOCK(fl);
	}
#endif
}

#ifdef RATELIMIT
/*
 * Accessors for the per-packet values that are kept in the mbuf's
 * PH_loc.eight[] scratch area: [1] is the segment count, [2] is the len16, and
 * [3] is the tsclk/tsoff value.
 */
static inline int
mbuf_eo_nsegs(struct mbuf *m)
{

	M_ASSERTPKTHDR(m);
	return (m->m_pkthdr.PH_loc.eight[1]);
}

#if defined(INET) || defined(INET6)
static inline void
set_mbuf_eo_nsegs(struct mbuf *m, uint8_t nsegs)
{

	M_ASSERTPKTHDR(m);
	m->m_pkthdr.PH_loc.eight[1] = nsegs;
}
#endif

static inline int
mbuf_eo_len16(struct mbuf *m)
{
	int n;

	M_ASSERTPKTHDR(m);
	n = m->m_pkthdr.PH_loc.eight[2];
	MPASS(n > 0 && n <= SGE_MAX_WR_LEN / 16);

	return (n);
}

#if defined(INET) || defined(INET6)
static inline void
set_mbuf_eo_len16(struct mbuf *m, uint8_t len16)
{

	M_ASSERTPKTHDR(m);
	m->m_pkthdr.PH_loc.eight[2] = len16;
}
#endif

static inline int
mbuf_eo_tsclk_tsoff(struct mbuf *m)
{

	M_ASSERTPKTHDR(m);
	return (m->m_pkthdr.PH_loc.eight[3]);
}

#if defined(INET) || defined(INET6)
static inline void
set_mbuf_eo_tsclk_tsoff(struct mbuf *m, uint8_t tsclk_tsoff)
{

	M_ASSERTPKTHDR(m);
	m->m_pkthdr.PH_loc.eight[3] = tsclk_tsoff;
}
#endif

/* True if mst is a rate limit send tag. A NULL mst is acceptable. */
static inline int
needs_eo(struct m_snd_tag *mst)
{

	return (mst != NULL && mst->sw->type == IF_SND_TAG_TYPE_RATE_LIMIT);
}
#endif

/*
 * Try to allocate an mbuf to contain a raw work request. To make it
 * easy to construct the work request, don't allocate a chain but a
 * single mbuf.
*/
struct mbuf *
alloc_wr_mbuf(int len, int how)
{
	struct mbuf *m;

	/*
	 * Use a plain pkthdr mbuf if the WR fits in one and a cluster if it
	 * doesn't. Anything larger than a cluster is refused.
	 */
	if (len <= MHLEN)
		m = m_gethdr(how, MT_DATA);
	else if (len <= MCLBYTES)
		m = m_getcl(how, MT_DATA, M_PKTHDR);
	else
		m = NULL;
	if (m == NULL)
		return (NULL);
	m->m_pkthdr.len = len;
	m->m_len = len;
	set_mbuf_cflags(m, MC_RAW_WR);
	set_mbuf_len16(m, howmany(len, 16));
	return (m);
}

/* True if any checksum or TSO offload (outer or inner) is requested. */
static inline bool
needs_hwcsum(struct mbuf *m)
{
	const uint32_t csum_flags = CSUM_IP | CSUM_IP_UDP | CSUM_IP_TCP |
	    CSUM_IP_TSO | CSUM_INNER_IP | CSUM_INNER_IP_UDP |
	    CSUM_INNER_IP_TCP | CSUM_INNER_IP_TSO | CSUM_IP6_UDP |
	    CSUM_IP6_TCP | CSUM_IP6_TSO | CSUM_INNER_IP6_UDP |
	    CSUM_INNER_IP6_TCP | CSUM_INNER_IP6_TSO;

	M_ASSERTPKTHDR(m);

	return (m->m_pkthdr.csum_flags & csum_flags);
}

/* True if any TSO flag (outer or inner, IPv4 or IPv6) is set. */
static inline bool
needs_tso(struct mbuf *m)
{
	const uint32_t csum_flags = CSUM_IP_TSO | CSUM_IP6_TSO |
	    CSUM_INNER_IP_TSO | CSUM_INNER_IP6_TSO;

	M_ASSERTPKTHDR(m);

	return (m->m_pkthdr.csum_flags & csum_flags);
}

/* True if CSUM_ENCAP_VXLAN is set. */
static inline bool
needs_vxlan_csum(struct mbuf *m)
{

	M_ASSERTPKTHDR(m);

	return (m->m_pkthdr.csum_flags & CSUM_ENCAP_VXLAN);
}

/*
 * True if the flags selected by the mask are neither all clear nor just
 * CSUM_ENCAP_VXLAN by itself, i.e. at least one of the inner TSO flags is set.
 */
static inline bool
needs_vxlan_tso(struct mbuf *m)
{
	const uint32_t csum_flags = CSUM_ENCAP_VXLAN | CSUM_INNER_IP_TSO |
	    CSUM_INNER_IP6_TSO;

	M_ASSERTPKTHDR(m);

	return ((m->m_pkthdr.csum_flags & csum_flags) != 0 &&
	    (m->m_pkthdr.csum_flags & csum_flags) != CSUM_ENCAP_VXLAN);
}

#if defined(INET) || defined(INET6)
/* True if one of the inner TSO flags is set. */
static inline bool
needs_inner_tcp_csum(struct mbuf *m)
{
	const uint32_t csum_flags = CSUM_INNER_IP_TSO | CSUM_INNER_IP6_TSO;

	M_ASSERTPKTHDR(m);

	return (m->m_pkthdr.csum_flags & csum_flags);
}
#endif

/* True if an IPv4 header checksum or IPv4 TSO (outer or inner) is requested. */
static inline bool
needs_l3_csum(struct mbuf *m)
{
	const uint32_t csum_flags = CSUM_IP | CSUM_IP_TSO | CSUM_INNER_IP |
	    CSUM_INNER_IP_TSO;

	M_ASSERTPKTHDR(m);

	return (m->m_pkthdr.csum_flags & csum_flags);
}

/* True if an outer TCP checksum or outer TSO is requested. */
static inline bool
needs_outer_tcp_csum(struct mbuf *m)
{
	const uint32_t csum_flags = CSUM_IP_TCP | CSUM_IP_TSO | CSUM_IP6_TCP |
	    CSUM_IP6_TSO;

	M_ASSERTPKTHDR(m);

	return (m->m_pkthdr.csum_flags & csum_flags);
}

#ifdef RATELIMIT
/* True if an outer UDP or TCP checksum, or outer TSO, is requested. */
static inline bool
needs_outer_l4_csum(struct mbuf *m)
{
	const uint32_t csum_flags = CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP_TSO |
	    CSUM_IP6_UDP | CSUM_IP6_TCP | CSUM_IP6_TSO;

	M_ASSERTPKTHDR(m);

	return (m->m_pkthdr.csum_flags & csum_flags);
}

/* True if an outer UDP checksum is requested. */
static inline bool
needs_outer_udp_csum(struct mbuf *m)
{
	const uint32_t csum_flags = CSUM_IP_UDP | CSUM_IP6_UDP;

	M_ASSERTPKTHDR(m);

	return (m->m_pkthdr.csum_flags & csum_flags);
}
#endif

/* True if the mbuf has M_VLANTAG set. */
static inline bool
needs_vlan_insertion(struct mbuf *m)
{

	M_ASSERTPKTHDR(m);

	return (m->m_flags & M_VLANTAG);
}

#if defined(INET) || defined(INET6)
/*
 * Moves the position described by (*pm, *poffset) forward by len bytes along
 * the mbuf chain, updates both, and returns a pointer to the data at the new
 * position. A position that falls exactly at the end of an mbuf is reported
 * as offset 0 of the next mbuf, so the chain must have data beyond the new
 * position (the walk asserts that it does not run off the end).
 */
static void *
m_advance(struct mbuf **pm, int *poffset, int len)
{
	struct mbuf *m = *pm;
	int offset = *poffset;
	uintptr_t p = 0;

	MPASS(len > 0);

	for (;;) {
		if (offset + len < m->m_len) {
			offset += len;
			p = mtod(m, uintptr_t) + offset;
			break;
		}
		len -= m->m_len - offset;
		m = m->m_next;
		offset = 0;
		MPASS(m != NULL);
	}
	*poffset = offset;
	*pm = m;
	return ((void *)p);
}
#endif

/*
 * Counts the gather list segments needed for an M_EXTPG mbuf after the first
 * 'skip' bytes of its data. The header area, the pages, and the trailer area
 * are visited in that order. *nextaddr is the physical address that follows
 * the end of the previous segment; a piece that starts exactly there is not
 * counted as a new segment. *nextaddr is updated as the walk progresses.
 */
static inline int
count_mbuf_ext_pgs(struct mbuf *m, int skip, vm_paddr_t *nextaddr)
{
	vm_paddr_t paddr;
	int i, len, off, pglen, pgoff, seglen, segoff;
	int nsegs = 0;

	M_ASSERTEXTPG(m);
	off = mtod(m, vm_offset_t);
	len = m->m_len;
	off += skip;
	len -= skip;

	/* Header area, unless the starting offset is already past it. */
	if (m->m_epg_hdrlen != 0) {
		if (off >= m->m_epg_hdrlen) {
			off -= m->m_epg_hdrlen;
		} else {
			seglen = m->m_epg_hdrlen - off;
			segoff = off;
			seglen = min(seglen, len);
			off = 0;
			len -= seglen;
			paddr = pmap_kextract(
			    (vm_offset_t)&m->m_epg_hdr[segoff]);
			if (*nextaddr != paddr)
				nsegs++;
			*nextaddr = paddr + seglen;
		}
	}
	/* Pages. Only the first page has a non-zero starting offset. */
	pgoff = m->m_epg_1st_off;
	for (i = 0; i < m->m_epg_npgs && len > 0; i++) {
		pglen = m_epg_pagelen(m, i, pgoff);
		if (off >= pglen) {
			off -= pglen;
			pgoff = 0;
			continue;
		}
		seglen = pglen - off;
		segoff = pgoff + off;
		off = 0;
		seglen = min(seglen, len);
		len -= seglen;
		paddr = m->m_epg_pa[i] + segoff;
		if (*nextaddr != paddr)
			nsegs++;
		*nextaddr = paddr + seglen;
		pgoff = 0;
	};
	/* Whatever is left is in the trailer area. */
	if (len != 0) {
		seglen = min(len, m->m_epg_trllen - off);
		len -= seglen;
		paddr = pmap_kextract((vm_offset_t)&m->m_epg_trail[off]);
		if (*nextaddr != paddr)
			nsegs++;
		*nextaddr = paddr + seglen;
	}

	return (nsegs);
}


/*
 * Can deal with empty mbufs in the chain that have m_len = 0, but the chain
 * must have at least one mbuf that's not empty. It is possible for this
 * routine to return 0 if skip accounts for all the contents of the mbuf chain.
 *
 * The first 'skip' bytes of the chain are not counted. MC_NOMAP is set in
 * *cflags if an M_EXTPG mbuf is encountered after the skipped bytes.
 */
static inline int
count_mbuf_nsegs(struct mbuf *m, int skip, uint8_t *cflags)
{
	vm_paddr_t nextaddr, paddr;
	vm_offset_t va;
	int len, nsegs;

	M_ASSERTPKTHDR(m);
	MPASS(m->m_pkthdr.len > 0);
	MPASS(m->m_pkthdr.len >= skip);

	nsegs = 0;
	nextaddr = 0;
	for (; m; m = m->m_next) {
		len = m->m_len;
		if (__predict_false(len == 0))
			continue;
		if (skip >= len) {
			skip -= len;
			continue;
		}
		if ((m->m_flags & M_EXTPG) != 0) {
			*cflags |= MC_NOMAP;
			nsegs += count_mbuf_ext_pgs(m, skip, &nextaddr);
			skip = 0;
			continue;
		}
		va = mtod(m, vm_offset_t) + skip;
		len -= skip;
		skip = 0;
		paddr = pmap_kextract(va);
		nsegs += sglist_count((void *)(uintptr_t)va, len);
		/*
		 * One less segment if this mbuf's data starts at the physical
		 * address where the previous one ended.
		 */
		if (paddr == nextaddr)
			nsegs--;
		nextaddr = pmap_kextract(va + len - 1) + 1;
	}

	return (nsegs);
}

/*
 * The maximum number of segments that can fit in a WR.
 *
 * The limit depends on whether VM work requests are in use (vm_wr) and on
 * whether the mbuf needs TSO (plain or VXLAN).
 */
static int
max_nsegs_allowed(struct mbuf *m, bool vm_wr)
{

	if (vm_wr) {
		if (needs_tso(m))
			return (TX_SGL_SEGS_VM_TSO);
		return (TX_SGL_SEGS_VM);
	}

	if (needs_tso(m)) {
		if (needs_vxlan_tso(m))
			return (TX_SGL_SEGS_VXLAN_TSO);
		else
			return (TX_SGL_SEGS_TSO);
	}

	return (TX_SGL_SEGS);
}

/* Used with ratecheck() to limit how often tx errors are logged. */
static struct timeval txerr_ratecheck = {0};
static const struct timeval txerr_interval = {3, 0};

/*
 * Analyze the mbuf to determine its tx needs. The mbuf passed in may change:
 * a) caller can assume it's been freed if this function returns with an error.
 * b) it may get defragged up if the gather list is too long for the hardware.
*/ int parse_pkt(struct mbuf **mp, bool vm_wr) { struct mbuf *m0 = *mp, *m; int rc, nsegs, defragged = 0; struct ether_header *eh; #ifdef INET void *l3hdr; #endif #if defined(INET) || defined(INET6) int offset; struct tcphdr *tcp; #endif #if defined(KERN_TLS) || defined(RATELIMIT) struct m_snd_tag *mst; #endif uint16_t eh_type; uint8_t cflags; cflags = 0; M_ASSERTPKTHDR(m0); if (__predict_false(m0->m_pkthdr.len < ETHER_HDR_LEN)) { rc = EINVAL; fail: m_freem(m0); *mp = NULL; return (rc); } restart: /* * First count the number of gather list segments in the payload. * Defrag the mbuf if nsegs exceeds the hardware limit. */ M_ASSERTPKTHDR(m0); MPASS(m0->m_pkthdr.len > 0); nsegs = count_mbuf_nsegs(m0, 0, &cflags); #if defined(KERN_TLS) || defined(RATELIMIT) if (m0->m_pkthdr.csum_flags & CSUM_SND_TAG) mst = m0->m_pkthdr.snd_tag; else mst = NULL; #endif #ifdef KERN_TLS if (mst != NULL && mst->sw->type == IF_SND_TAG_TYPE_TLS) { cflags |= MC_TLS; set_mbuf_cflags(m0, cflags); rc = t6_ktls_parse_pkt(m0); if (rc != 0) goto fail; return (EINPROGRESS); } #endif if (nsegs > max_nsegs_allowed(m0, vm_wr)) { if (defragged++ > 0) { rc = EFBIG; goto fail; } counter_u64_add(defrags, 1); if ((m = m_defrag(m0, M_NOWAIT)) == NULL) { rc = ENOMEM; goto fail; } *mp = m0 = m; /* update caller's copy after defrag */ goto restart; } if (__predict_false(nsegs > 2 && m0->m_pkthdr.len <= MHLEN && !(cflags & MC_NOMAP))) { counter_u64_add(pullups, 1); m0 = m_pullup(m0, m0->m_pkthdr.len); if (m0 == NULL) { /* Should have left well enough alone. */ rc = EFBIG; goto fail; } *mp = m0; /* update caller's copy after pullup */ goto restart; } set_mbuf_nsegs(m0, nsegs); set_mbuf_cflags(m0, cflags); calculate_mbuf_len16(m0, vm_wr); #ifdef RATELIMIT /* * Ethofld is limited to TCP and UDP for now, and only when L4 hw * checksumming is enabled. needs_outer_l4_csum happens to check for * all the right things. 
*/ if (__predict_false(needs_eo(mst) && !needs_outer_l4_csum(m0))) { m_snd_tag_rele(m0->m_pkthdr.snd_tag); m0->m_pkthdr.snd_tag = NULL; m0->m_pkthdr.csum_flags &= ~CSUM_SND_TAG; mst = NULL; } #endif if (!needs_hwcsum(m0) #ifdef RATELIMIT && !needs_eo(mst) #endif ) return (0); m = m0; eh = mtod(m, struct ether_header *); eh_type = ntohs(eh->ether_type); if (eh_type == ETHERTYPE_VLAN) { struct ether_vlan_header *evh = (void *)eh; eh_type = ntohs(evh->evl_proto); m0->m_pkthdr.l2hlen = sizeof(*evh); } else m0->m_pkthdr.l2hlen = sizeof(*eh); #if defined(INET) || defined(INET6) offset = 0; #ifdef INET l3hdr = m_advance(&m, &offset, m0->m_pkthdr.l2hlen); #else m_advance(&m, &offset, m0->m_pkthdr.l2hlen); #endif #endif switch (eh_type) { #ifdef INET6 case ETHERTYPE_IPV6: m0->m_pkthdr.l3hlen = sizeof(struct ip6_hdr); break; #endif #ifdef INET case ETHERTYPE_IP: { struct ip *ip = l3hdr; if (needs_vxlan_csum(m0)) { /* Driver will do the outer IP hdr checksum. */ ip->ip_sum = 0; if (needs_vxlan_tso(m0)) { const uint16_t ipl = ip->ip_len; ip->ip_len = 0; ip->ip_sum = ~in_cksum_hdr(ip); ip->ip_len = ipl; } else ip->ip_sum = in_cksum_hdr(ip); } m0->m_pkthdr.l3hlen = ip->ip_hl << 2; break; } #endif default: if (ratecheck(&txerr_ratecheck, &txerr_interval)) { log(LOG_ERR, "%s: ethertype 0x%04x unknown. " "if_cxgbe must be compiled with the same " "INET/INET6 options as the kernel.\n", __func__, eh_type); } rc = EINVAL; goto fail; } #if defined(INET) || defined(INET6) if (needs_vxlan_csum(m0)) { m0->m_pkthdr.l4hlen = sizeof(struct udphdr); m0->m_pkthdr.l5hlen = sizeof(struct vxlan_header); /* Inner headers. 
*/ eh = m_advance(&m, &offset, m0->m_pkthdr.l3hlen + sizeof(struct udphdr) + sizeof(struct vxlan_header)); eh_type = ntohs(eh->ether_type); if (eh_type == ETHERTYPE_VLAN) { struct ether_vlan_header *evh = (void *)eh; eh_type = ntohs(evh->evl_proto); m0->m_pkthdr.inner_l2hlen = sizeof(*evh); } else m0->m_pkthdr.inner_l2hlen = sizeof(*eh); #ifdef INET l3hdr = m_advance(&m, &offset, m0->m_pkthdr.inner_l2hlen); #else m_advance(&m, &offset, m0->m_pkthdr.inner_l2hlen); #endif switch (eh_type) { #ifdef INET6 case ETHERTYPE_IPV6: m0->m_pkthdr.inner_l3hlen = sizeof(struct ip6_hdr); break; #endif #ifdef INET case ETHERTYPE_IP: { struct ip *ip = l3hdr; m0->m_pkthdr.inner_l3hlen = ip->ip_hl << 2; break; } #endif default: if (ratecheck(&txerr_ratecheck, &txerr_interval)) { log(LOG_ERR, "%s: VXLAN hw offload requested" "with unknown ethertype 0x%04x. if_cxgbe " "must be compiled with the same INET/INET6 " "options as the kernel.\n", __func__, eh_type); } rc = EINVAL; goto fail; } if (needs_inner_tcp_csum(m0)) { tcp = m_advance(&m, &offset, m0->m_pkthdr.inner_l3hlen); m0->m_pkthdr.inner_l4hlen = tcp->th_off * 4; } MPASS((m0->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); m0->m_pkthdr.csum_flags &= CSUM_INNER_IP6_UDP | CSUM_INNER_IP6_TCP | CSUM_INNER_IP6_TSO | CSUM_INNER_IP | CSUM_INNER_IP_UDP | CSUM_INNER_IP_TCP | CSUM_INNER_IP_TSO | CSUM_ENCAP_VXLAN; } if (needs_outer_tcp_csum(m0)) { tcp = m_advance(&m, &offset, m0->m_pkthdr.l3hlen); m0->m_pkthdr.l4hlen = tcp->th_off * 4; #ifdef RATELIMIT if (tsclk >= 0 && *(uint32_t *)(tcp + 1) == ntohl(0x0101080a)) { set_mbuf_eo_tsclk_tsoff(m0, V_FW_ETH_TX_EO_WR_TSCLK(tsclk) | V_FW_ETH_TX_EO_WR_TSOFF(sizeof(*tcp) / 2 + 1)); } else set_mbuf_eo_tsclk_tsoff(m0, 0); } else if (needs_outer_udp_csum(m0)) { m0->m_pkthdr.l4hlen = sizeof(struct udphdr); #endif } #ifdef RATELIMIT if (needs_eo(mst)) { u_int immhdrs; /* EO WRs have the headers in the WR and not the GL. 
*/ immhdrs = m0->m_pkthdr.l2hlen + m0->m_pkthdr.l3hlen + m0->m_pkthdr.l4hlen; cflags = 0; nsegs = count_mbuf_nsegs(m0, immhdrs, &cflags); MPASS(cflags == mbuf_cflags(m0)); set_mbuf_eo_nsegs(m0, nsegs); set_mbuf_eo_len16(m0, txpkt_eo_len16(nsegs, immhdrs, needs_tso(m0))); rc = ethofld_transmit(mst->ifp, m0); if (rc != 0) goto fail; return (EINPROGRESS); } #endif #endif MPASS(m0 == *mp); return (0); } void * start_wrq_wr(struct sge_wrq *wrq, int len16, struct wrq_cookie *cookie) { struct sge_eq *eq = &wrq->eq; struct adapter *sc = wrq->adapter; int ndesc, available; struct wrqe *wr; void *w; MPASS(len16 > 0); ndesc = tx_len16_to_desc(len16); MPASS(ndesc > 0 && ndesc <= SGE_MAX_WR_NDESC); EQ_LOCK(eq); if (__predict_false((eq->flags & EQ_HW_ALLOCATED) == 0)) { EQ_UNLOCK(eq); return (NULL); } if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list)) drain_wrq_wr_list(sc, wrq); if (!STAILQ_EMPTY(&wrq->wr_list)) { slowpath: EQ_UNLOCK(eq); wr = alloc_wrqe(len16 * 16, wrq); if (__predict_false(wr == NULL)) return (NULL); cookie->pidx = -1; cookie->ndesc = ndesc; return (&wr->wr); } eq->cidx = read_hw_cidx(eq); if (eq->pidx == eq->cidx) available = eq->sidx - 1; else available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1; if (available < ndesc) goto slowpath; cookie->pidx = eq->pidx; cookie->ndesc = ndesc; TAILQ_INSERT_TAIL(&wrq->incomplete_wrs, cookie, link); w = &eq->desc[eq->pidx]; IDXINCR(eq->pidx, ndesc, eq->sidx); if (__predict_false(cookie->pidx + ndesc > eq->sidx)) { w = &wrq->ss[0]; wrq->ss_pidx = cookie->pidx; wrq->ss_len = len16 * 16; } EQ_UNLOCK(eq); return (w); } void commit_wrq_wr(struct sge_wrq *wrq, void *w, struct wrq_cookie *cookie) { struct sge_eq *eq = &wrq->eq; struct adapter *sc = wrq->adapter; int ndesc, pidx; struct wrq_cookie *prev, *next; if (cookie->pidx == -1) { struct wrqe *wr = __containerof(w, struct wrqe, wr); t4_wrq_tx(sc, wr); return; } if (__predict_false(w == &wrq->ss[0])) { int n = (eq->sidx - wrq->ss_pidx) * EQ_ESIZE; 
MPASS(wrq->ss_len > n); /* WR had better wrap around. */ bcopy(&wrq->ss[0], &eq->desc[wrq->ss_pidx], n); bcopy(&wrq->ss[n], &eq->desc[0], wrq->ss_len - n); wrq->tx_wrs_ss++; } else wrq->tx_wrs_direct++; EQ_LOCK(eq); ndesc = cookie->ndesc; /* Can be more than SGE_MAX_WR_NDESC here. */ pidx = cookie->pidx; MPASS(pidx >= 0 && pidx < eq->sidx); prev = TAILQ_PREV(cookie, wrq_incomplete_wrs, link); next = TAILQ_NEXT(cookie, link); if (prev == NULL) { MPASS(pidx == eq->dbidx); if (next == NULL || ndesc >= 16) { int available; struct fw_eth_tx_pkt_wr *dst; /* any fw WR struct will do */ /* * Note that the WR via which we'll request tx updates * is at pidx and not eq->pidx, which has moved on * already. */ dst = (void *)&eq->desc[pidx]; available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1; if (available < eq->sidx / 4 && atomic_cmpset_int(&eq->equiq, 0, 1)) { /* * XXX: This is not 100% reliable with some * types of WRs. But this is a very unusual * situation for an ofld/ctrl queue anyway. */ dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ | F_FW_WR_EQUEQ); } if (__predict_true(eq->flags & EQ_HW_ALLOCATED)) ring_eq_db(wrq->adapter, eq, ndesc); else IDXINCR(eq->dbidx, ndesc, eq->sidx); } else { MPASS(IDXDIFF(next->pidx, pidx, eq->sidx) == ndesc); next->pidx = pidx; next->ndesc += ndesc; } } else { MPASS(IDXDIFF(pidx, prev->pidx, eq->sidx) == prev->ndesc); prev->ndesc += ndesc; } TAILQ_REMOVE(&wrq->incomplete_wrs, cookie, link); if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list)) drain_wrq_wr_list(sc, wrq); #ifdef INVARIANTS if (TAILQ_EMPTY(&wrq->incomplete_wrs)) { /* Doorbell must have caught up to the pidx. */ MPASS(wrq->eq.pidx == wrq->eq.dbidx); } #endif EQ_UNLOCK(eq); } static u_int can_resume_eth_tx(struct mp_ring *r) { struct sge_eq *eq = r->cookie; return (total_available_tx_desc(eq) > eq->sidx / 8); } static inline bool cannot_use_txpkts(struct mbuf *m) { /* maybe put a GL limit too, to avoid silliness? 
*/ return (needs_tso(m) || (mbuf_cflags(m) & (MC_RAW_WR | MC_TLS)) != 0); } static inline int discard_tx(struct sge_eq *eq) { return ((eq->flags & (EQ_ENABLED | EQ_QFLUSH)) != EQ_ENABLED); } static inline int wr_can_update_eq(void *p) { struct fw_eth_tx_pkts_wr *wr = p; switch (G_FW_WR_OP(be32toh(wr->op_pkd))) { case FW_ULPTX_WR: case FW_ETH_TX_PKT_WR: case FW_ETH_TX_PKTS_WR: case FW_ETH_TX_PKTS2_WR: case FW_ETH_TX_PKT_VM_WR: case FW_ETH_TX_PKTS_VM_WR: return (1); default: return (0); } } static inline void set_txupdate_flags(struct sge_txq *txq, u_int avail, struct fw_eth_tx_pkt_wr *wr) { struct sge_eq *eq = &txq->eq; struct txpkts *txp = &txq->txp; if ((txp->npkt > 0 || avail < eq->sidx / 2) && atomic_cmpset_int(&eq->equiq, 0, 1)) { wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ | F_FW_WR_EQUIQ); eq->equeqidx = eq->pidx; } else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >= 32) { wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ); eq->equeqidx = eq->pidx; } } #if defined(__i386__) || defined(__amd64__) extern uint64_t tsc_freq; #endif static inline bool record_eth_tx_time(struct sge_txq *txq) { const uint64_t cycles = get_cyclecount(); const uint64_t last_tx = txq->last_tx; #if defined(__i386__) || defined(__amd64__) const uint64_t itg = tsc_freq * t4_tx_coalesce_gap / 1000000; #else const uint64_t itg = 0; #endif MPASS(cycles >= last_tx); txq->last_tx = cycles; return (cycles - last_tx < itg); } /* * r->items[cidx] to r->items[pidx], with a wraparound at r->size, are ready to * be consumed. Return the actual number consumed. 0 indicates a stall. 
*/
static u_int
eth_tx(struct mp_ring *r, u_int cidx, u_int pidx, bool *coalescing)
{
	struct sge_txq *txq = r->cookie;
	if_t ifp = txq->ifp;
	struct sge_eq *eq = &txq->eq;
	struct txpkts *txp = &txq->txp;
	struct vi_info *vi = if_getsoftc(ifp);
	struct adapter *sc = vi->adapter;
	u_int total, remaining;		/* # of packets */
	u_int n, avail, dbdiff;		/* # of hardware descriptors */
	int i, rc;
	struct mbuf *m0;
	bool snd, recent_tx;
	void *wr;	/* start of the last WR written to the ring */

	TXQ_LOCK_ASSERT_OWNED(txq);
	recent_tx = record_eth_tx_time(txq);

	remaining = IDXDIFF(pidx, cidx, r->size);
	if (__predict_false(discard_tx(eq))) {
		/*
		 * Free everything: the packets that were held for coalescing
		 * as well as all the items in the ring.
		 */
		for (i = 0; i < txp->npkt; i++)
			m_freem(txp->mb[i]);
		txp->npkt = 0;
		while (cidx != pidx) {
			m0 = r->items[cidx];
			m_freem(m0);
			if (++cidx == r->size)
				cidx = 0;
		}
		reclaim_tx_descs(txq, eq->sidx);
		*coalescing = false;
		return (remaining);	/* emptied */
	}

	/* How many hardware descriptors do we have readily available. */
	if (eq->pidx == eq->cidx)
		avail = eq->sidx - 1;
	else
		avail = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;

	total = 0;
	if (remaining == 0) {
		/* Nothing in the ring. Just send the pending txpkts. */
		txp->score = 0;
		txq->txpkts_flush++;
		goto send_txpkts;
	}

	dbdiff = 0;
	MPASS(remaining > 0);
	while (remaining > 0) {
		m0 = r->items[cidx];
		M_ASSERTPKTHDR(m0);
		MPASS(m0->m_nextpkt == NULL);

		if (avail < 2 * SGE_MAX_WR_NDESC)
			avail += reclaim_tx_descs(txq, 64);

		if (t4_tx_coalesce == 0 && txp->npkt == 0)
			goto skip_coalescing;
		/*
		 * The score is reset by a packet that cannot be coalesced,
		 * incremented (saturating at UINT8_MAX) if the previous tx was
		 * recent, and set to 1 otherwise.
		 */
		if (cannot_use_txpkts(m0))
			txp->score = 0;
		else if (recent_tx) {
			if (++txp->score == 0)
				txp->score = UINT8_MAX;
		} else
			txp->score = 1;
		if (txp->npkt > 0 || remaining > 1 ||
		    txp->score >= t4_tx_coalesce_pkts ||
		    atomic_load_int(&txq->eq.equiq) != 0) {
			if (vi->flags & TX_USES_VM_WR)
				rc = add_to_txpkts_vf(sc, txq, m0, avail, &snd);
			else
				rc = add_to_txpkts_pf(sc, txq, m0, avail, &snd);
		} else {
			snd = false;
			rc = EINVAL;
		}
		if (snd) {
			/* Write out the packets that are in txpkts. */
			MPASS(txp->npkt > 0);
			for (i = 0; i < txp->npkt; i++)
				ETHER_BPF_MTAP(ifp, txp->mb[i]);
			if (txp->npkt > 1) {
				MPASS(avail >= tx_len16_to_desc(txp->len16));
				if (vi->flags & TX_USES_VM_WR)
					n = write_txpkts_vm_wr(sc, txq);
				else
					n = write_txpkts_wr(sc, txq);
			} else {
				MPASS(avail >=
				    tx_len16_to_desc(mbuf_len16(txp->mb[0])));
				if (vi->flags & TX_USES_VM_WR)
					n = write_txpkt_vm_wr(sc, txq,
					    txp->mb[0]);
				else
					n = write_txpkt_wr(sc, txq, txp->mb[0],
					    avail);
			}
			MPASS(n <= SGE_MAX_WR_NDESC);
			avail -= n;
			dbdiff += n;
			wr = &eq->desc[eq->pidx];
			IDXINCR(eq->pidx, n, eq->sidx);
			txp->npkt = 0;	/* emptied */
		}
		if (rc == 0) {
			/* m0 was coalesced into txq->txpkts. */
			goto next_mbuf;
		}
		if (rc == EAGAIN) {
			/*
			 * m0 is suitable for tx coalescing but could not be
			 * combined with the existing txq->txpkts, which has now
			 * been transmitted. Start a new txpkts with m0.
			 */
			MPASS(snd);
			MPASS(txp->npkt == 0);
			continue;
		}

		MPASS(rc != 0 && rc != EAGAIN);
		MPASS(txp->npkt == 0);
skip_coalescing:
		/* m0 goes out in a work request of its own. */
		n = tx_len16_to_desc(mbuf_len16(m0));
		if (__predict_false(avail < n)) {
			avail += reclaim_tx_descs(txq, min(n, 32));
			if (avail < n)
				break;	/* out of descriptors */
		}

		wr = &eq->desc[eq->pidx];
		if (mbuf_cflags(m0) & MC_RAW_WR) {
			n = write_raw_wr(txq, wr, m0, avail);
#ifdef KERN_TLS
		} else if (mbuf_cflags(m0) & MC_TLS) {
			ETHER_BPF_MTAP(ifp, m0);
			n = t6_ktls_write_wr(txq, wr, m0, avail);
#endif
		} else {
			ETHER_BPF_MTAP(ifp, m0);
			if (vi->flags & TX_USES_VM_WR)
				n = write_txpkt_vm_wr(sc, txq, m0);
			else
				n = write_txpkt_wr(sc, txq, m0, avail);
		}
		MPASS(n >= 1 && n <= avail);
		if (!(mbuf_cflags(m0) & MC_TLS))
			MPASS(n <= SGE_MAX_WR_NDESC);

		avail -= n;
		dbdiff += n;
		IDXINCR(eq->pidx, n, eq->sidx);

		if (dbdiff >= 512 / EQ_ESIZE) {	/* X_FETCHBURSTMAX_512B */
			if (wr_can_update_eq(wr))
				set_txupdate_flags(txq, avail, wr);
			ring_eq_db(sc, eq, dbdiff);
			avail += reclaim_tx_descs(txq, 32);
			dbdiff = 0;
		}
next_mbuf:
		total++;
		remaining--;
		if (__predict_false(++cidx == r->size))
			cidx = 0;
	}
	if (dbdiff != 0) {
		/* Ring the doorbell for what was written but not submitted. */
		if (wr_can_update_eq(wr))
			set_txupdate_flags(txq, avail, wr);
		ring_eq_db(sc, eq, dbdiff);
		reclaim_tx_descs(txq, 32);
	} else if (eq->pidx == eq->cidx && txp->npkt > 0 &&
	    atomic_load_int(&txq->eq.equiq) == 0) {
		/*
		 * If nothing was submitted to the chip for tx (it was coalesced
		 * into txpkts instead) and there is no tx update outstanding
		 * then we need to send txpkts now.
		 */
send_txpkts:
		MPASS(txp->npkt > 0);
		for (i = 0; i < txp->npkt; i++)
			ETHER_BPF_MTAP(ifp, txp->mb[i]);
		if (txp->npkt > 1) {
			MPASS(avail >= tx_len16_to_desc(txp->len16));
			if (vi->flags & TX_USES_VM_WR)
				n = write_txpkts_vm_wr(sc, txq);
			else
				n = write_txpkts_wr(sc, txq);
		} else {
			MPASS(avail >=
			    tx_len16_to_desc(mbuf_len16(txp->mb[0])));
			if (vi->flags & TX_USES_VM_WR)
				n = write_txpkt_vm_wr(sc, txq, txp->mb[0]);
			else
				n = write_txpkt_wr(sc, txq, txp->mb[0], avail);
		}
		MPASS(n <= SGE_MAX_WR_NDESC);
		wr = &eq->desc[eq->pidx];
		IDXINCR(eq->pidx, n, eq->sidx);
		txp->npkt = 0;	/* emptied */

		MPASS(wr_can_update_eq(wr));
		set_txupdate_flags(txq, avail - n, wr);
		ring_eq_db(sc, eq, n);
		reclaim_tx_descs(txq, 32);
	}
	*coalescing = txp->npkt > 0;

	return (total);
}

/*
 * Initializes the software state of an ingress queue. The parameters are
 * range checked with KASSERT. pktc_idx may be negative, in which case the
 * packet count threshold is not enabled. qsize is rounded up to a multiple of
 * 16.
 */
static inline void
init_iq(struct sge_iq *iq, struct adapter *sc, int tmr_idx, int pktc_idx,
    int qsize, int intr_idx, int cong, int qtype)
{

	KASSERT(tmr_idx >= 0 && tmr_idx < SGE_NTIMERS,
	    ("%s: bad tmr_idx %d", __func__, tmr_idx));
	KASSERT(pktc_idx < SGE_NCOUNTERS,	/* -ve is ok, means don't use */
	    ("%s: bad pktc_idx %d", __func__, pktc_idx));
	KASSERT(intr_idx >= -1 && intr_idx < sc->intr_count,
	    ("%s: bad intr_idx %d", __func__, intr_idx));
	KASSERT(qtype == FW_IQ_IQTYPE_OTHER || qtype == FW_IQ_IQTYPE_NIC ||
	    qtype == FW_IQ_IQTYPE_OFLD, ("%s: bad qtype %d", __func__, qtype));

	iq->flags = 0;
	iq->state = IQS_DISABLED;
	iq->adapter = sc;
	iq->qtype = qtype;
	iq->intr_params = V_QINTR_TIMER_IDX(tmr_idx);
	iq->intr_pktc_idx = SGE_NCOUNTERS - 1;
	if (pktc_idx >= 0) {
		iq->intr_params |= F_QINTR_CNT_EN;
		iq->intr_pktc_idx = pktc_idx;
	}
	iq->qsize = roundup2(qsize, 16);	/* See FW_IQ_CMD/iqsize */
	/* sidx is qsize less the entries taken up by the status page. */
	iq->sidx = iq->qsize - sc->params.sge.spg_len / IQ_ESIZE;
	iq->intr_idx = intr_idx;
	iq->cong_drop = cong;
}

/*
 * Initializes the software state of a freelist, including its lock. Buffer
 * packing is turned on if the adapter allows it (BUF_PACKING_OK) and the
 * buffer_packing setting asks for it, and that in turn decides which
 * starvation threshold and buffer boundary are used.
 */
static inline void
init_fl(struct adapter *sc, struct sge_fl *fl, int qsize, int maxp, char *name)
{
	struct sge_params *sp = &sc->params.sge;

	fl->qsize = qsize;
	fl->sidx = qsize - sc->params.sge.spg_len / EQ_ESIZE;
	strlcpy(fl->lockname, name, sizeof(fl->lockname));
	mtx_init(&fl->fl_lock, fl->lockname, NULL, MTX_DEF);
	if (sc->flags & BUF_PACKING_OK &&
	    ((!is_t4(sc) && buffer_packing) ||	/* T5+: enabled unless 0 */
	    (is_t4(sc) && buffer_packing == 1)))/* T4: disabled unless 1 */
		fl->flags |= FL_BUF_PACKING;
	fl->zidx = find_refill_source(sc, maxp, fl->flags & FL_BUF_PACKING);
	fl->safe_zidx = sc->sge.safe_zidx;
	if (fl->flags & FL_BUF_PACKING) {
		fl->lowat = roundup2(sp->fl_starve_threshold2, 8);
		fl->buf_boundary = sp->pack_boundary;
	} else {
		fl->lowat = roundup2(sp->fl_starve_threshold, 8);
		fl->buf_boundary = 16;
	}
	/* With padding enabled the boundary is at least the pad boundary. */
	if (fl_pad && fl->buf_boundary < sp->pad_boundary)
		fl->buf_boundary = sp->pad_boundary;
}

/*
 * Initializes the software state of an egress queue, including its lock. The
 * tx channel is taken from the port identified by port_id.
 */
static inline void
init_eq(struct adapter *sc, struct sge_eq *eq, int eqtype, int qsize,
    uint8_t port_id, struct sge_iq *iq, char *name)
{
	KASSERT(eqtype >= EQ_CTRL && eqtype <= EQ_OFLD,
	    ("%s: bad qtype %d", __func__, eqtype));

	eq->type = eqtype;
	eq->port_id = port_id;
	eq->tx_chan = sc->port[port_id]->tx_chan;
	eq->iq = iq;
	eq->sidx = qsize - sc->params.sge.spg_len / EQ_ESIZE;
	strlcpy(eq->lockname, name, sizeof(eq->lockname));
	mtx_init(&eq->eq_lock, eq->lockname, NULL, MTX_DEF);
}

/*
 * Allocates len bytes of DMA memory for a descriptor ring: creates the tag
 * (alignment 512, a single segment), allocates zeroed memory, and loads the
 * map. The tag, map, bus address, and kernel virtual address are returned via
 * tag, map, pa, and va.
 *
 * Returns 0 on success. On failure the error is returned after free_ring has
 * been called with the current values of *tag, *map, *pa, and *va.
 * NOTE(review): on an early failure some of those have not been written by
 * this function, so this looks like it depends on the caller having zeroed
 * them beforehand -- confirm against the callers.
 */
int
alloc_ring(struct adapter *sc, size_t len, bus_dma_tag_t *tag,
    bus_dmamap_t *map, bus_addr_t *pa, void **va)
{
	int rc;

	rc = bus_dma_tag_create(sc->dmat, 512, 0, BUS_SPACE_MAXADDR,
	    BUS_SPACE_MAXADDR, NULL, NULL, len, 1, len, 0, NULL, NULL, tag);
	if (rc != 0) {
		CH_ERR(sc, "cannot allocate DMA tag: %d\n", rc);
		goto done;
	}

	rc = bus_dmamem_alloc(*tag, va,
	    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, map);
	if (rc != 0) {
		CH_ERR(sc, "cannot allocate DMA memory: %d\n", rc);
		goto done;
	}

	rc = bus_dmamap_load(*tag, *map, *va, len, oneseg_dma_callback, pa, 0);
	if (rc != 0) {
		CH_ERR(sc, "cannot load DMA map: %d\n", rc);
		goto done;
	}
done:
	if (rc)
		free_ring(sc, *tag, *map, *pa, *va);

	return (rc);
}

/*
 * Releases the resources of a ring in the reverse order of their allocation.
 * Each step is skipped if the item it depends on (pa, va, tag) is zero/NULL.
 * Always returns 0.
 */
int
free_ring(struct adapter *sc, bus_dma_tag_t tag, bus_dmamap_t map,
    bus_addr_t pa, void *va)
{
	if (pa)
		bus_dmamap_unload(tag, map);
	if (va)
		bus_dmamem_free(tag, va, map);
	if (tag)
		bus_dma_tag_destroy(tag);

	return (0);
}

/*
 * Allocates the software resources (mainly memory and sysctl nodes) for an
 * ingress queue and an optional freelist.
 *
 * Sets IQ_SW_ALLOCATED and returns 0 on success.
 *
 * If the freelist's ring cannot be allocated then the ingress queue's ring is
 * freed before the error is returned.
 */
static int
alloc_iq_fl(struct vi_info *vi, struct sge_iq *iq, struct sge_fl *fl,
    struct sysctl_ctx_list *ctx, struct sysctl_oid *oid)
{
	int rc;
	size_t len;
	struct adapter *sc = vi->adapter;

	MPASS(!(iq->flags & IQ_SW_ALLOCATED));

	len = iq->qsize * IQ_ESIZE;
	rc = alloc_ring(sc, len, &iq->desc_tag, &iq->desc_map, &iq->ba,
	    (void **)&iq->desc);
	if (rc != 0)
		return (rc);

	if (fl) {
		len = fl->qsize * EQ_ESIZE;
		rc = alloc_ring(sc, len, &fl->desc_tag, &fl->desc_map,
		    &fl->ba, (void **)&fl->desc);
		if (rc) {
			free_ring(sc, iq->desc_tag, iq->desc_map, iq->ba,
			    iq->desc);
			return (rc);
		}

		/*
		 * Allocate space for one software descriptor per buffer.
		 * NOTE(review): the "* 8" presumably means 8 buffers per hw
		 * descriptor -- confirm.
		 */
		fl->sdesc = malloc(fl->sidx * 8 * sizeof(struct fl_sdesc),
		    M_CXGBE, M_ZERO | M_WAITOK);

		add_fl_sysctls(sc, ctx, oid, fl);
		iq->flags |= IQ_HAS_FL;
	}
	add_iq_sysctls(ctx, oid, iq);
	iq->flags |= IQ_SW_ALLOCATED;

	return (0);
}

/*
 * Frees all software resources (memory and locks) associated with an ingress
 * queue and an optional freelist.
 *
 * Both structures are zeroed before this function returns.
 */
static void
free_iq_fl(struct adapter *sc, struct sge_iq *iq, struct sge_fl *fl)
{
	MPASS(iq->flags & IQ_SW_ALLOCATED);

	if (fl) {
		MPASS(iq->flags & IQ_HAS_FL);
		free_ring(sc, fl->desc_tag, fl->desc_map, fl->ba, fl->desc);
		free_fl_buffers(sc, fl);
		free(fl->sdesc, M_CXGBE);
		mtx_destroy(&fl->fl_lock);
		bzero(fl, sizeof(*fl));
	}
	free_ring(sc, iq->desc_tag, iq->desc_map, iq->ba, iq->desc);
	bzero(iq, sizeof(*iq));
}

/*
 * Allocates a hardware ingress queue and an optional freelist that will be
 * associated with it.
 *
 * Returns errno on failure. Resources allocated up to that point may still be
 * allocated.
Caller is responsible for cleanup in case this function fails.
 */
static int
alloc_iq_fl_hwq(struct vi_info *vi, struct sge_iq *iq, struct sge_fl *fl)
{
	int rc, cntxt_id, cong_map;
	struct fw_iq_cmd c;
	struct adapter *sc = vi->adapter;
	struct port_info *pi = vi->pi;
	__be32 v = 0;

	MPASS (!(iq->flags & IQ_HW_ALLOCATED));

	/* Build the FW_IQ_CMD request from scratch. */
	bzero(&c, sizeof(c));
	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_IQ_CMD) | F_FW_CMD_REQUEST |
	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_IQ_CMD_PFN(sc->pf) |
	    V_FW_IQ_CMD_VFN(0));

	c.alloc_to_len16 = htobe32(F_FW_IQ_CMD_ALLOC | F_FW_IQ_CMD_IQSTART |
	    FW_LEN16(c));

	/* Special handling for firmware event queue */
	if (iq == &sc->sge.fwq)
		v |= F_FW_IQ_CMD_IQASYNCH;

	if (iq->intr_idx < 0) {
		/* Forwarded interrupts, all headed to fwq */
		v |= F_FW_IQ_CMD_IQANDST;
		v |= V_FW_IQ_CMD_IQANDSTINDEX(sc->sge.fwq.cntxt_id);
	} else {
		KASSERT(iq->intr_idx < sc->intr_count,
		    ("%s: invalid direct intr_idx %d", __func__,
		    iq->intr_idx));
		v |= V_FW_IQ_CMD_IQANDSTINDEX(iq->intr_idx);
	}

	/* The descriptor ring is zeroed before it is handed to the chip. */
	bzero(iq->desc, iq->qsize * IQ_ESIZE);
	c.type_to_iqandstindex = htobe32(v |
	    V_FW_IQ_CMD_TYPE(FW_IQ_TYPE_FL_INT_CAP) |
	    V_FW_IQ_CMD_VIID(vi->viid) |
	    V_FW_IQ_CMD_IQANUD(X_UPDATEDELIVERY_INTERRUPT));
	c.iqdroprss_to_iqesize = htobe16(V_FW_IQ_CMD_IQPCIECH(pi->tx_chan) |
	    F_FW_IQ_CMD_IQGTSMODE |
	    V_FW_IQ_CMD_IQINTCNTTHRESH(iq->intr_pktc_idx) |
	    V_FW_IQ_CMD_IQESIZE(ilog2(IQ_ESIZE) - 4));
	c.iqsize = htobe16(iq->qsize);
	c.iqaddr = htobe64(iq->ba);
	c.iqns_to_fl0congen = htobe32(V_FW_IQ_CMD_IQTYPE(iq->qtype));
	/*
	 * cong_map is assigned only when cong_drop != -1; every later read of
	 * it in this function is guarded by the same condition.
	 */
	if (iq->cong_drop != -1) {
		cong_map = iq->qtype == IQ_ETH ? pi->rx_e_chan_map : 0;
		c.iqns_to_fl0congen |= htobe32(F_FW_IQ_CMD_IQFLINTCONGEN);
	}

	if (fl) {
		/* Freelist ring (descriptors plus status page) starts zeroed. */
		bzero(fl->desc, fl->sidx * EQ_ESIZE + sc->params.sge.spg_len);
		c.iqns_to_fl0congen |=
		    htobe32(V_FW_IQ_CMD_FL0HOSTFCMODE(X_HOSTFCMODE_NONE) |
			F_FW_IQ_CMD_FL0FETCHRO | F_FW_IQ_CMD_FL0DATARO |
			(fl_pad ? F_FW_IQ_CMD_FL0PADEN : 0) |
			(fl->flags & FL_BUF_PACKING ? F_FW_IQ_CMD_FL0PACKEN :
			    0));
		if (iq->cong_drop != -1) {
			c.iqns_to_fl0congen |=
				htobe32(V_FW_IQ_CMD_FL0CNGCHMAP(cong_map) |
				    F_FW_IQ_CMD_FL0CONGCIF |
				    F_FW_IQ_CMD_FL0CONGEN);
		}
		/* Fetch burst limits differ between T4/T5 and later chips. */
		c.fl0dcaen_to_fl0cidxfthresh =
		    htobe16(V_FW_IQ_CMD_FL0FBMIN(chip_id(sc) <= CHELSIO_T5 ?
			X_FETCHBURSTMIN_128B : X_FETCHBURSTMIN_64B_T6) |
			V_FW_IQ_CMD_FL0FBMAX(chip_id(sc) <= CHELSIO_T5 ?
			X_FETCHBURSTMAX_512B : X_FETCHBURSTMAX_256B));
		c.fl0size = htobe16(fl->qsize);
		c.fl0addr = htobe64(fl->ba);
	}

	/* The reply is written back into the same command buffer. */
	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
	if (rc != 0) {
		CH_ERR(sc, "failed to create hw ingress queue: %d\n", rc);
		return (rc);
	}

	iq->cidx = 0;
	iq->gen = F_RSPD_GEN;
	iq->cntxt_id = be16toh(c.iqid);
	iq->abs_id = be16toh(c.physiqid);

	/* Record the queue in the adapter's context-id -> iq lookup table. */
	cntxt_id = iq->cntxt_id - sc->sge.iq_start;
	if (cntxt_id >= sc->sge.iqmap_sz) {
		panic ("%s: iq->cntxt_id (%d) more than the max (%d)", __func__,
		    cntxt_id, sc->sge.iqmap_sz - 1);
	}
	sc->sge.iqmap[cntxt_id] = iq;

	if (fl) {
		u_int qid;
#ifdef INVARIANTS
		int i;

		/* A freshly created freelist must not own any clusters yet. */
		MPASS(!(fl->flags & FL_BUF_RESUME));
		for (i = 0; i < fl->sidx * 8; i++)
			MPASS(fl->sdesc[i].cl == NULL);
#endif
		fl->cntxt_id = be16toh(c.fl0id);
		fl->pidx = fl->cidx = fl->hw_cidx = fl->dbidx = 0;
		fl->rx_offset = 0;
		fl->flags &= ~(FL_STARVING | FL_DOOMED);

		cntxt_id = fl->cntxt_id - sc->sge.eq_start;
		if (cntxt_id >= sc->sge.eqmap_sz) {
			panic("%s: fl->cntxt_id (%d) more than the max (%d)",
			    __func__, cntxt_id, sc->sge.eqmap_sz - 1);
		}
		sc->sge.eqmap[cntxt_id] = (void *)fl;

		/*
		 * Work out the doorbell address and value.  With user
		 * doorbells, a qid that has its own segment within the page is
		 * addressed by offset and the qid written to it becomes 0.
		 */
		qid = fl->cntxt_id;
		if (isset(&sc->doorbells, DOORBELL_UDB)) {
			uint32_t s_qpp = sc->params.sge.eq_s_qpp;
			uint32_t mask = (1 << s_qpp) - 1;
			volatile uint8_t *udb;

			udb = sc->udbs_base + UDBS_DB_OFFSET;
			udb += (qid >> s_qpp) << PAGE_SHIFT;
			qid &= mask;
			if (qid < PAGE_SIZE / UDBS_SEG_SIZE) {
				udb += qid << UDBS_SEG_SHIFT;
				qid = 0;
			}
			fl->udb = (volatile void *)udb;
		}
		fl->dbval = V_QID(qid) | sc->chip_params->sge_fl_db;

		FL_LOCK(fl);
		/* Enough to make sure the SGE doesn't think it's starved */
		refill_fl(sc, fl, fl->lowat);
		FL_UNLOCK(fl);
	}

	/* The return value of the congestion-context update is not checked. */
	if (chip_id(sc) >= CHELSIO_T5 && !(sc->flags & IS_VF) &&
	    iq->cong_drop != -1) {
		t4_sge_set_conm_context(sc, iq->cntxt_id, iq->cong_drop,
		    cong_map);
	}

	/* Enable IQ interrupts */
	atomic_store_rel_int(&iq->state, IQS_IDLE);
	t4_write_reg(sc, sc->sge_gts_reg, V_SEINTARM(iq->intr_params) |
	    V_INGRESSQID(iq->cntxt_id));

	iq->flags |= IQ_HW_ALLOCATED;

	return (0);
}

/*
 * Asks the firmware (t4_iq_free) to release the hardware ingress queue and,
 * when fl is not NULL, its freelist.  IQ_HW_ALLOCATED is cleared only when
 * the firmware call succeeds; otherwise the error is logged and returned and
 * the flag is left set.
 */
static int
free_iq_fl_hwq(struct adapter *sc, struct sge_iq *iq, struct sge_fl *fl)
{
	int rc;

	MPASS(iq->flags & IQ_HW_ALLOCATED);
	/* 0xffff is passed in place of a freelist id that does not exist. */
	rc = -t4_iq_free(sc, sc->mbox, sc->pf, 0, FW_IQ_TYPE_FL_INT_CAP,
	    iq->cntxt_id, fl ? fl->cntxt_id : 0xffff, 0xffff);
	if (rc != 0) {
		CH_ERR(sc, "failed to free iq %p: %d\n", iq, rc);
		return (rc);
	}
	iq->flags &= ~IQ_HW_ALLOCATED;

	return (0);
}

/*
 * Registers read-only sysctls describing an ingress queue under oid.  Does
 * nothing if either ctx or oid is NULL.
 */
static void
add_iq_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *oid,
    struct sge_iq *iq)
{
	struct sysctl_oid_list *children;

	if (ctx == NULL || oid == NULL)
		return;

	children = SYSCTL_CHILDREN(oid);
	SYSCTL_ADD_UAUTO(ctx, children, OID_AUTO, "ba", CTLFLAG_RD, &iq->ba,
	    "bus address of descriptor ring");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
	    iq->qsize * IQ_ESIZE, "descriptor ring size in bytes");
	SYSCTL_ADD_U16(ctx, children, OID_AUTO, "abs_id", CTLFLAG_RD,
	    &iq->abs_id, 0, "absolute id of the queue");
	SYSCTL_ADD_U16(ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
	    &iq->cntxt_id, 0, "SGE context id of the queue");
	SYSCTL_ADD_U16(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, &iq->cidx,
	    0, "consumer index");
}

/*
 * Creates an "fl" child node under oid and registers read-only sysctls
 * describing the freelist beneath it.  Does nothing if either ctx or oid is
 * NULL.  "rx_offset" is only exported for freelists with buffer packing.
 */
static void
add_fl_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx,
    struct sysctl_oid *oid, struct sge_fl *fl)
{
	struct sysctl_oid_list *children;

	if (ctx == NULL || oid == NULL)
		return;

	children = SYSCTL_CHILDREN(oid);
	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "fl",
	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "freelist");
	children = SYSCTL_CHILDREN(oid);

	SYSCTL_ADD_UAUTO(ctx, children, OID_AUTO, "ba", CTLFLAG_RD,
	    &fl->ba, "bus address of descriptor ring");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
	    fl->sidx * EQ_ESIZE + sc->params.sge.spg_len,
	    "desc ring size in bytes");
	SYSCTL_ADD_U16(ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
	    &fl->cntxt_id, 0, "SGE context id of the freelist");
	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "padding", CTLFLAG_RD, NULL,
	    fl_pad ? 1 : 0, "padding enabled");
	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "packing", CTLFLAG_RD, NULL,
	    fl->flags & FL_BUF_PACKING ? 1 : 0, "packing enabled");
	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, &fl->cidx,
	    0, "consumer index");
	if (fl->flags & FL_BUF_PACKING) {
		SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "rx_offset",
		    CTLFLAG_RD, &fl->rx_offset, 0, "packing rx offset");
	}
	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD, &fl->pidx,
	    0, "producer index");
	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_allocated",
	    CTLFLAG_RD, &fl->cl_allocated, "# of clusters allocated");
	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_recycled",
	    CTLFLAG_RD, &fl->cl_recycled, "# of clusters recycled");
	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_fast_recycled",
	    CTLFLAG_RD, &fl->cl_fast_recycled, "# of clusters recycled (fast)");
}

/*
 * Idempotent.
 *
 * Sets up the firmware event queue (sc->sge.fwq): software state first, then
 * the hardware queue.  Each stage is skipped if its flag is already set.
 * Returns 0 on success or the error from the stage that failed.
 */
static int
alloc_fwq(struct adapter *sc)
{
	int rc, intr_idx;
	struct sge_iq *fwq = &sc->sge.fwq;
	struct vi_info *vi = &sc->port[0]->vi[0];

	if (!(fwq->flags & IQ_SW_ALLOCATED)) {
		MPASS(!(fwq->flags & IQ_HW_ALLOCATED));

		/* A VF uses vector 0; a PF uses vector 1 if it has more than one. */
		if (sc->flags & IS_VF)
			intr_idx = 0;
		else
			intr_idx = sc->intr_count > 1 ? 1 : 0;
		init_iq(fwq, sc, 0, 0, FW_IQ_QSIZE, intr_idx, -1, IQ_OTHER);
		rc = alloc_iq_fl(vi, fwq, NULL, &sc->ctx, sc->fwq_oid);
		if (rc != 0) {
			CH_ERR(sc, "failed to allocate fwq: %d\n", rc);
			return (rc);
		}
		MPASS(fwq->flags & IQ_SW_ALLOCATED);
	}

	if (!(fwq->flags & IQ_HW_ALLOCATED)) {
		MPASS(fwq->flags & IQ_SW_ALLOCATED);

		rc = alloc_iq_fl_hwq(vi, fwq, NULL);
		if (rc != 0) {
			CH_ERR(sc, "failed to create hw fwq: %d\n", rc);
			return (rc);
		}
		MPASS(fwq->flags & IQ_HW_ALLOCATED);
	}

	return (0);
}

/*
 * Idempotent.
*/ static void free_fwq(struct adapter *sc) { struct sge_iq *fwq = &sc->sge.fwq; if (fwq->flags & IQ_HW_ALLOCATED) { MPASS(fwq->flags & IQ_SW_ALLOCATED); free_iq_fl_hwq(sc, fwq, NULL); MPASS(!(fwq->flags & IQ_HW_ALLOCATED)); } if (fwq->flags & IQ_SW_ALLOCATED) { MPASS(!(fwq->flags & IQ_HW_ALLOCATED)); free_iq_fl(sc, fwq, NULL); MPASS(!(fwq->flags & IQ_SW_ALLOCATED)); } } /* * Idempotent. */ static int alloc_ctrlq(struct adapter *sc, int idx) { int rc; char name[16]; struct sysctl_oid *oid; struct sge_wrq *ctrlq = &sc->sge.ctrlq[idx]; MPASS(idx < sc->params.nports); if (!(ctrlq->eq.flags & EQ_SW_ALLOCATED)) { MPASS(!(ctrlq->eq.flags & EQ_HW_ALLOCATED)); snprintf(name, sizeof(name), "%d", idx); oid = SYSCTL_ADD_NODE(&sc->ctx, SYSCTL_CHILDREN(sc->ctrlq_oid), OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "ctrl queue"); snprintf(name, sizeof(name), "%s ctrlq%d", device_get_nameunit(sc->dev), idx); init_eq(sc, &ctrlq->eq, EQ_CTRL, CTRL_EQ_QSIZE, idx, &sc->sge.fwq, name); rc = alloc_wrq(sc, NULL, ctrlq, &sc->ctx, oid); if (rc != 0) { CH_ERR(sc, "failed to allocate ctrlq%d: %d\n", idx, rc); sysctl_remove_oid(oid, 1, 1); return (rc); } MPASS(ctrlq->eq.flags & EQ_SW_ALLOCATED); } if (!(ctrlq->eq.flags & EQ_HW_ALLOCATED)) { MPASS(ctrlq->eq.flags & EQ_SW_ALLOCATED); MPASS(ctrlq->nwr_pending == 0); MPASS(ctrlq->ndesc_needed == 0); rc = alloc_eq_hwq(sc, NULL, &ctrlq->eq); if (rc != 0) { CH_ERR(sc, "failed to create hw ctrlq%d: %d\n", idx, rc); return (rc); } MPASS(ctrlq->eq.flags & EQ_HW_ALLOCATED); } return (0); } /* * Idempotent. 
*/ static void free_ctrlq(struct adapter *sc, int idx) { struct sge_wrq *ctrlq = &sc->sge.ctrlq[idx]; if (ctrlq->eq.flags & EQ_HW_ALLOCATED) { MPASS(ctrlq->eq.flags & EQ_SW_ALLOCATED); free_eq_hwq(sc, NULL, &ctrlq->eq); MPASS(!(ctrlq->eq.flags & EQ_HW_ALLOCATED)); } if (ctrlq->eq.flags & EQ_SW_ALLOCATED) { MPASS(!(ctrlq->eq.flags & EQ_HW_ALLOCATED)); free_wrq(sc, ctrlq); MPASS(!(ctrlq->eq.flags & EQ_SW_ALLOCATED)); } } int t4_sge_set_conm_context(struct adapter *sc, int cntxt_id, int cong_drop, int cong_map) { const int cng_ch_bits_log = sc->chip_params->cng_ch_bits_log; uint32_t param, val; uint16_t ch_map; int cong_mode, rc, i; if (chip_id(sc) < CHELSIO_T5) return (ENOTSUP); /* Convert the driver knob to the mode understood by the firmware. */ switch (cong_drop) { case -1: cong_mode = X_CONMCTXT_CNGTPMODE_DISABLE; break; case 0: cong_mode = X_CONMCTXT_CNGTPMODE_CHANNEL; break; case 1: cong_mode = X_CONMCTXT_CNGTPMODE_QUEUE; break; case 2: cong_mode = X_CONMCTXT_CNGTPMODE_BOTH; break; default: MPASS(0); CH_ERR(sc, "cong_drop = %d is invalid (ingress queue %d).\n", cong_drop, cntxt_id); return (EINVAL); } param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) | V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_CONM_CTXT) | V_FW_PARAMS_PARAM_YZ(cntxt_id); val = V_CONMCTXT_CNGTPMODE(cong_mode); if (cong_mode == X_CONMCTXT_CNGTPMODE_CHANNEL || cong_mode == X_CONMCTXT_CNGTPMODE_BOTH) { for (i = 0, ch_map = 0; i < 4; i++) { if (cong_map & (1 << i)) ch_map |= 1 << (i << cng_ch_bits_log); } val |= V_CONMCTXT_CNGCHMAP(ch_map); } rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, ¶m, &val); if (rc != 0) { CH_ERR(sc, "failed to set congestion manager context " "for ingress queue %d: %d\n", cntxt_id, rc); } return (rc); } /* * Idempotent. 
*/
/*
 * Sets up NIC rx queue 'idx' of a virtual interface in two stages, each
 * skipped if its flag is already set:
 *  - software: LRO state (INET/INET6 kernels only), the sysctl node, the
 *    ingress queue and its freelist;
 *  - hardware: the firmware queue, followed by a freelist top-up.
 * intr_idx is handed to init_iq() and maxp to init_fl().
 * Returns 0 on success or the error of whichever step failed.
 */
static int
alloc_rxq(struct vi_info *vi, struct sge_rxq *rxq, int idx, int intr_idx,
    int maxp)
{
	int rc;
	struct adapter *sc = vi->adapter;
	if_t ifp = vi->ifp;
	struct sysctl_oid *oid;
	char name[16];

	if (!(rxq->iq.flags & IQ_SW_ALLOCATED)) {
		MPASS(!(rxq->iq.flags & IQ_HW_ALLOCATED));
#if defined(INET) || defined(INET6)
		rc = tcp_lro_init_args(&rxq->lro, ifp, lro_entries, lro_mbufs);
		if (rc != 0)
			return (rc);
		MPASS(rxq->lro.ifp == ifp);	/* also indicates LRO init'ed */
#endif
		rxq->ifp = ifp;

		/* The sysctl node is named after the queue index. */
		snprintf(name, sizeof(name), "%d", idx);
		oid = SYSCTL_ADD_NODE(&vi->ctx, SYSCTL_CHILDREN(vi->rxq_oid),
		    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
		    "rx queue");

		init_iq(&rxq->iq, sc, vi->tmr_idx, vi->pktc_idx, vi->qsize_rxq,
		    intr_idx, cong_drop, IQ_ETH);
		/* Mirror the interface's enabled capabilities in the iq flags. */
#if defined(INET) || defined(INET6)
		if (if_getcapenable(ifp) & IFCAP_LRO)
			rxq->iq.flags |= IQ_LRO_ENABLED;
#endif
		if (if_getcapenable(ifp) & IFCAP_HWRXTSTMP)
			rxq->iq.flags |= IQ_RX_TIMESTAMP;
		/* 'name' is reused, this time for the freelist's lock name. */
		snprintf(name, sizeof(name), "%s rxq%d-fl",
		    device_get_nameunit(vi->dev), idx);
		init_fl(sc, &rxq->fl, vi->qsize_rxq / 8, maxp, name);
		rc = alloc_iq_fl(vi, &rxq->iq, &rxq->fl, &vi->ctx, oid);
		if (rc != 0) {
			CH_ERR(vi, "failed to allocate rxq%d: %d\n", idx, rc);
			/* Undo the sysctl node and the LRO state set up above. */
			sysctl_remove_oid(oid, 1, 1);
#if defined(INET) || defined(INET6)
			tcp_lro_free(&rxq->lro);
			rxq->lro.ifp = NULL;
#endif
			return (rc);
		}
		MPASS(rxq->iq.flags & IQ_SW_ALLOCATED);
		add_rxq_sysctls(&vi->ctx, oid, rxq);
	}

	if (!(rxq->iq.flags & IQ_HW_ALLOCATED)) {
		MPASS(rxq->iq.flags & IQ_SW_ALLOCATED);
		rc = alloc_iq_fl_hwq(vi, &rxq->iq, &rxq->fl);
		if (rc != 0) {
			CH_ERR(vi, "failed to create hw rxq%d: %d\n", idx, rc);
			return (rc);
		}
		MPASS(rxq->iq.flags & IQ_HW_ALLOCATED);

		/*
		 * Queue 0 establishes the offset between absolute and context
		 * ids; every other queue is asserted to agree with it.
		 */
		if (idx == 0)
			sc->sge.iq_base = rxq->iq.abs_id - rxq->iq.cntxt_id;
		else
			KASSERT(rxq->iq.cntxt_id + sc->sge.iq_base == rxq->iq.abs_id,
			    ("iq_base mismatch"));
		KASSERT(sc->sge.iq_base == 0 || sc->flags & IS_VF,
		    ("PF with non-zero iq_base"));

		/*
		 * The freelist is just barely above the starvation threshold
		 * right now, fill it up a bit more.
		 */
		FL_LOCK(&rxq->fl);
		refill_fl(sc, &rxq->fl, 128);
		FL_UNLOCK(&rxq->fl);
	}

	return (0);
}

/*
 * Idempotent.
 *
 * Releases an rx queue: the hardware queue first, then the LRO state
 * (INET/INET6 kernels only) and the software queue.  The rxq structure is
 * zeroed once its software resources are gone.
 */
static void
free_rxq(struct vi_info *vi, struct sge_rxq *rxq)
{
	if (rxq->iq.flags & IQ_HW_ALLOCATED) {
		MPASS(rxq->iq.flags & IQ_SW_ALLOCATED);
		free_iq_fl_hwq(vi->adapter, &rxq->iq, &rxq->fl);
		MPASS(!(rxq->iq.flags & IQ_HW_ALLOCATED));
	}

	if (rxq->iq.flags & IQ_SW_ALLOCATED) {
		MPASS(!(rxq->iq.flags & IQ_HW_ALLOCATED));
#if defined(INET) || defined(INET6)
		tcp_lro_free(&rxq->lro);
#endif
		free_iq_fl(vi->adapter, &rxq->iq, &rxq->fl);
		MPASS(!(rxq->iq.flags & IQ_SW_ALLOCATED));
		bzero(rxq, sizeof(*rxq));
	}
}

/*
 * Registers the read-only per-rxq statistics under oid.  Does nothing if
 * either ctx or oid is NULL.  The LRO counters are only exported on
 * INET/INET6 kernels.
 */
static void
add_rxq_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *oid,
    struct sge_rxq *rxq)
{
	struct sysctl_oid_list *children;

	if (ctx == NULL || oid == NULL)
		return;

	children = SYSCTL_CHILDREN(oid);
#if defined(INET) || defined(INET6)
	SYSCTL_ADD_U64(ctx, children, OID_AUTO, "lro_queued", CTLFLAG_RD,
	    &rxq->lro.lro_queued, 0, NULL);
	SYSCTL_ADD_U64(ctx, children, OID_AUTO, "lro_flushed", CTLFLAG_RD,
	    &rxq->lro.lro_flushed, 0, NULL);
#endif
	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "rxcsum", CTLFLAG_RD,
	    &rxq->rxcsum, "# of times hardware assisted with checksum");
	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "vlan_extraction", CTLFLAG_RD,
	    &rxq->vlan_extraction, "# of times hardware extracted 802.1Q tag");
	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "vxlan_rxcsum", CTLFLAG_RD,
	    &rxq->vxlan_rxcsum,
	    "# of times hardware assisted with inner checksum (VXLAN)");
}

#ifdef TCP_OFFLOAD
/*
 * Idempotent.
*/
/*
 * Sets up offload rx queue 'idx' of a virtual interface in two stages, each
 * skipped if its flag is already set:
 *  - software: the sysctl node, the ingress queue and freelist, and the
 *    per-queue counters;
 *  - hardware: the firmware queue.
 * intr_idx is handed to init_iq() and maxp to init_fl().
 *
 * Returns 0 on success, including the case where the queue was already fully
 * allocated and there was nothing to do, or the error of whichever step
 * failed.
 */
static int
alloc_ofld_rxq(struct vi_info *vi, struct sge_ofld_rxq *ofld_rxq, int idx,
    int intr_idx, int maxp)
{
	int rc;
	struct adapter *sc = vi->adapter;
	struct sysctl_oid *oid;
	char name[16];

	if (!(ofld_rxq->iq.flags & IQ_SW_ALLOCATED)) {
		MPASS(!(ofld_rxq->iq.flags & IQ_HW_ALLOCATED));

		/* The sysctl node is named after the queue index. */
		snprintf(name, sizeof(name), "%d", idx);
		oid = SYSCTL_ADD_NODE(&vi->ctx,
		    SYSCTL_CHILDREN(vi->ofld_rxq_oid), OID_AUTO, name,
		    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "offload rx queue");

		init_iq(&ofld_rxq->iq, sc, vi->ofld_tmr_idx, vi->ofld_pktc_idx,
		    vi->qsize_rxq, intr_idx, ofld_cong_drop, IQ_OFLD);
		/* 'name' is reused, this time for the freelist's lock name. */
		snprintf(name, sizeof(name), "%s ofld_rxq%d-fl",
		    device_get_nameunit(vi->dev), idx);
		init_fl(sc, &ofld_rxq->fl, vi->qsize_rxq / 8, maxp, name);
		rc = alloc_iq_fl(vi, &ofld_rxq->iq, &ofld_rxq->fl, &vi->ctx,
		    oid);
		if (rc != 0) {
			CH_ERR(vi, "failed to allocate ofld_rxq%d: %d\n", idx,
			    rc);
			sysctl_remove_oid(oid, 1, 1);
			return (rc);
		}
		MPASS(ofld_rxq->iq.flags & IQ_SW_ALLOCATED);
		ofld_rxq->rx_iscsi_ddp_setup_ok = counter_u64_alloc(M_WAITOK);
		ofld_rxq->rx_iscsi_ddp_setup_error =
		    counter_u64_alloc(M_WAITOK);
		ofld_rxq->ddp_buffer_alloc = counter_u64_alloc(M_WAITOK);
		ofld_rxq->ddp_buffer_reuse = counter_u64_alloc(M_WAITOK);
		ofld_rxq->ddp_buffer_free = counter_u64_alloc(M_WAITOK);
		add_ofld_rxq_sysctls(&vi->ctx, oid, ofld_rxq);
	}

	if (!(ofld_rxq->iq.flags & IQ_HW_ALLOCATED)) {
		MPASS(ofld_rxq->iq.flags & IQ_SW_ALLOCATED);
		rc = alloc_iq_fl_hwq(vi, &ofld_rxq->iq, &ofld_rxq->fl);
		if (rc != 0) {
			CH_ERR(vi, "failed to create hw ofld_rxq%d: %d\n", idx,
			    rc);
			return (rc);
		}
		MPASS(ofld_rxq->iq.flags & IQ_HW_ALLOCATED);
	}

	/*
	 * Every failure has already returned.  Return an explicit 0 rather
	 * than rc: rc is never assigned when both stages were skipped.
	 */
	return (0);
}

/*
 * Idempotent.
*/
/*
 * Releases an offload rx queue: the hardware queue first, then the software
 * queue and the per-queue counters.  The ofld_rxq structure is zeroed once
 * its software resources are gone.
 */
static void
free_ofld_rxq(struct vi_info *vi, struct sge_ofld_rxq *ofld_rxq)
{
	if (ofld_rxq->iq.flags & IQ_HW_ALLOCATED) {
		MPASS(ofld_rxq->iq.flags & IQ_SW_ALLOCATED);
		free_iq_fl_hwq(vi->adapter, &ofld_rxq->iq, &ofld_rxq->fl);
		MPASS(!(ofld_rxq->iq.flags & IQ_HW_ALLOCATED));
	}

	if (ofld_rxq->iq.flags & IQ_SW_ALLOCATED) {
		MPASS(!(ofld_rxq->iq.flags & IQ_HW_ALLOCATED));
		free_iq_fl(vi->adapter, &ofld_rxq->iq, &ofld_rxq->fl);
		MPASS(!(ofld_rxq->iq.flags & IQ_SW_ALLOCATED));
		counter_u64_free(ofld_rxq->rx_iscsi_ddp_setup_ok);
		counter_u64_free(ofld_rxq->rx_iscsi_ddp_setup_error);
		counter_u64_free(ofld_rxq->ddp_buffer_alloc);
		counter_u64_free(ofld_rxq->ddp_buffer_reuse);
		counter_u64_free(ofld_rxq->ddp_buffer_free);
		bzero(ofld_rxq, sizeof(*ofld_rxq));
	}
}

/*
 * Registers the read-only per-queue offload statistics under oid, with the
 * iSCSI ones grouped beneath an "iscsi" child node.  Does nothing if either
 * ctx or oid is NULL.
 */
static void
add_ofld_rxq_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *oid,
    struct sge_ofld_rxq *ofld_rxq)
{
	struct sysctl_oid_list *children;

	if (ctx == NULL || oid == NULL)
		return;

	children = SYSCTL_CHILDREN(oid);
	SYSCTL_ADD_U64(ctx, children, OID_AUTO, "rx_aio_ddp_jobs",
	    CTLFLAG_RD, &ofld_rxq->rx_aio_ddp_jobs, 0,
	    "# of aio_read(2) jobs completed via DDP");
	SYSCTL_ADD_U64(ctx, children, OID_AUTO, "rx_aio_ddp_octets",
	    CTLFLAG_RD, &ofld_rxq->rx_aio_ddp_octets, 0,
	    "# of octets placed directly for aio_read(2) jobs");
	SYSCTL_ADD_ULONG(ctx, children, OID_AUTO,
	    "rx_toe_tls_records", CTLFLAG_RD, &ofld_rxq->rx_toe_tls_records,
	    "# of TOE TLS records received");
	SYSCTL_ADD_ULONG(ctx, children, OID_AUTO,
	    "rx_toe_tls_octets", CTLFLAG_RD, &ofld_rxq->rx_toe_tls_octets,
	    "# of payload octets in received TOE TLS records");
	SYSCTL_ADD_ULONG(ctx, children, OID_AUTO,
	    "rx_toe_ddp_octets", CTLFLAG_RD, &ofld_rxq->rx_toe_ddp_octets,
	    "# of payload octets received via TCP DDP");
	SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO,
	    "ddp_buffer_alloc", CTLFLAG_RD, &ofld_rxq->ddp_buffer_alloc,
	    "# of DDP RCV buffers allocated");
	SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO,
	    "ddp_buffer_reuse", CTLFLAG_RD, &ofld_rxq->ddp_buffer_reuse,
	    "# of DDP RCV buffers reused");
	SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO,
	    "ddp_buffer_free", CTLFLAG_RD, &ofld_rxq->ddp_buffer_free,
	    "# of DDP RCV buffers freed");

	/* Everything below hangs off the "iscsi" child node. */
	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "iscsi",
	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "TOE iSCSI statistics");
	children = SYSCTL_CHILDREN(oid);

	SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "ddp_setup_ok",
	    CTLFLAG_RD, &ofld_rxq->rx_iscsi_ddp_setup_ok,
	    "# of times DDP buffer was setup successfully.");
	SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "ddp_setup_error",
	    CTLFLAG_RD, &ofld_rxq->rx_iscsi_ddp_setup_error,
	    "# of times DDP buffer setup failed.");
	SYSCTL_ADD_U64(ctx, children, OID_AUTO, "ddp_octets",
	    CTLFLAG_RD, &ofld_rxq->rx_iscsi_ddp_octets, 0,
	    "# of octets placed directly");
	SYSCTL_ADD_U64(ctx, children, OID_AUTO, "ddp_pdus",
	    CTLFLAG_RD, &ofld_rxq->rx_iscsi_ddp_pdus, 0,
	    "# of PDUs with data placed directly.");
	SYSCTL_ADD_U64(ctx, children, OID_AUTO, "fl_octets",
	    CTLFLAG_RD, &ofld_rxq->rx_iscsi_fl_octets, 0,
	    "# of data octets delivered in freelist");
	SYSCTL_ADD_U64(ctx, children, OID_AUTO, "fl_pdus",
	    CTLFLAG_RD, &ofld_rxq->rx_iscsi_fl_pdus, 0,
	    "# of PDUs with data delivered in freelist");
	SYSCTL_ADD_U64(ctx, children, OID_AUTO, "padding_errors",
	    CTLFLAG_RD, &ofld_rxq->rx_iscsi_padding_errors, 0,
	    "# of PDUs with invalid padding");
	SYSCTL_ADD_U64(ctx, children, OID_AUTO, "header_digest_errors",
	    CTLFLAG_RD, &ofld_rxq->rx_iscsi_header_digest_errors, 0,
	    "# of PDUs with invalid header digests");
	SYSCTL_ADD_U64(ctx, children, OID_AUTO, "data_digest_errors",
	    CTLFLAG_RD, &ofld_rxq->rx_iscsi_data_digest_errors, 0,
	    "# of PDUs with invalid data digests");
}
#endif

/*
 * Returns a reasonable automatic cidx flush threshold for a given queue size.
*/
static u_int
qsize_to_fthresh(int qsize)
{
	u_int fthresh;

	/* Round qsize up to a power of 2, then take its log2, capped. */
	while (!powerof2(qsize))
		qsize++;
	fthresh = ilog2(qsize);
	if (fthresh > X_CIDXFLUSHTHRESH_128)
		fthresh = X_CIDXFLUSHTHRESH_128;

	return (fthresh);
}

/*
 * Creates the hardware control egress queue for eq with a FW_EQ_CTRL_CMD
 * mailbox command, stores the ids the firmware returns in eq, and records eq
 * in the adapter's eqmap.  Returns 0 or the (negated) mailbox error.
 */
static int
ctrl_eq_alloc(struct adapter *sc, struct sge_eq *eq)
{
	int rc, cntxt_id;
	struct fw_eq_ctrl_cmd c;
	/* Total ring size in descriptors: usable entries plus status page. */
	int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;

	bzero(&c, sizeof(c));

	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_CTRL_CMD) | F_FW_CMD_REQUEST |
	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_CTRL_CMD_PFN(sc->pf) |
	    V_FW_EQ_CTRL_CMD_VFN(0));
	c.alloc_to_len16 = htobe32(F_FW_EQ_CTRL_CMD_ALLOC |
	    F_FW_EQ_CTRL_CMD_EQSTART | FW_LEN16(c));
	/* NOTE(review): htonl here, htobe32 elsewhere - presumably equivalent. */
	c.cmpliqid_eqid = htonl(V_FW_EQ_CTRL_CMD_CMPLIQID(eq->iqid));
	c.physeqid_pkd = htobe32(0);
	c.fetchszm_to_iqid =
	    htobe32(V_FW_EQ_CTRL_CMD_HOSTFCMODE(X_HOSTFCMODE_STATUS_PAGE) |
		V_FW_EQ_CTRL_CMD_PCIECHN(eq->tx_chan) |
		F_FW_EQ_CTRL_CMD_FETCHRO | V_FW_EQ_CTRL_CMD_IQID(eq->iqid));
	c.dcaen_to_eqsize =
	    htobe32(V_FW_EQ_CTRL_CMD_FBMIN(chip_id(sc) <= CHELSIO_T5 ?
		X_FETCHBURSTMIN_64B : X_FETCHBURSTMIN_64B_T6) |
		V_FW_EQ_CTRL_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
		V_FW_EQ_CTRL_CMD_CIDXFTHRESH(qsize_to_fthresh(qsize)) |
		V_FW_EQ_CTRL_CMD_EQSIZE(qsize));
	c.eqaddr = htobe64(eq->ba);

	/* The reply is written back into the same command buffer. */
	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
	if (rc != 0) {
		CH_ERR(sc, "failed to create hw ctrlq for tx_chan %d: %d\n",
		    eq->tx_chan, rc);
		return (rc);
	}

	eq->cntxt_id = G_FW_EQ_CTRL_CMD_EQID(be32toh(c.cmpliqid_eqid));
	eq->abs_id = G_FW_EQ_CTRL_CMD_PHYSEQID(be32toh(c.physeqid_pkd));
	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
	if (cntxt_id >= sc->sge.eqmap_sz)
		panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
		    cntxt_id, sc->sge.eqmap_sz - 1);
	sc->sge.eqmap[cntxt_id] = eq;

	return (rc);
}

/*
 * Creates the hardware Ethernet egress queue for eq with a FW_EQ_ETH_CMD
 * mailbox command on behalf of vi, stores the ids the firmware returns in
 * eq, and records eq in the adapter's eqmap.  Returns 0 or the (negated)
 * mailbox error.
 */
static int
eth_eq_alloc(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq)
{
	int rc, cntxt_id;
	struct fw_eq_eth_cmd c;
	/* Total ring size in descriptors: usable entries plus status page. */
	int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;

	bzero(&c, sizeof(c));

	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_ETH_CMD) | F_FW_CMD_REQUEST |
	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_ETH_CMD_PFN(sc->pf) |
	    V_FW_EQ_ETH_CMD_VFN(0));
	c.alloc_to_len16 = htobe32(F_FW_EQ_ETH_CMD_ALLOC |
	    F_FW_EQ_ETH_CMD_EQSTART | FW_LEN16(c));
	c.autoequiqe_to_viid = htobe32(F_FW_EQ_ETH_CMD_AUTOEQUIQE |
	    F_FW_EQ_ETH_CMD_AUTOEQUEQE | V_FW_EQ_ETH_CMD_VIID(vi->viid));
	c.fetchszm_to_iqid =
	    htobe32(V_FW_EQ_ETH_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) |
		V_FW_EQ_ETH_CMD_PCIECHN(eq->tx_chan) | F_FW_EQ_ETH_CMD_FETCHRO |
		V_FW_EQ_ETH_CMD_IQID(eq->iqid));
	c.dcaen_to_eqsize =
	    htobe32(V_FW_EQ_ETH_CMD_FBMIN(chip_id(sc) <= CHELSIO_T5 ?
		X_FETCHBURSTMIN_64B : X_FETCHBURSTMIN_64B_T6) |
		V_FW_EQ_ETH_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
		V_FW_EQ_ETH_CMD_EQSIZE(qsize));
	c.eqaddr = htobe64(eq->ba);

	/* The reply is written back into the same command buffer. */
	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
	if (rc != 0) {
		device_printf(vi->dev,
		    "failed to create Ethernet egress queue: %d\n", rc);
		return (rc);
	}

	eq->cntxt_id = G_FW_EQ_ETH_CMD_EQID(be32toh(c.eqid_pkd));
	eq->abs_id = G_FW_EQ_ETH_CMD_PHYSEQID(be32toh(c.physeqid_pkd));
	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
	if (cntxt_id >= sc->sge.eqmap_sz)
		panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
		    cntxt_id, sc->sge.eqmap_sz - 1);
	sc->sge.eqmap[cntxt_id] = eq;

	return (rc);
}

#if defined(TCP_OFFLOAD) || defined(RATELIMIT)
/*
 * Creates the hardware offload egress queue for eq with a FW_EQ_OFLD_CMD
 * mailbox command, stores the ids the firmware returns in eq, and records eq
 * in the adapter's eqmap.  Returns 0 or the (negated) mailbox error.
 */
static int
ofld_eq_alloc(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq)
{
	int rc, cntxt_id;
	struct fw_eq_ofld_cmd c;
	/* Total ring size in descriptors: usable entries plus status page. */
	int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;

	bzero(&c, sizeof(c));

	/* NOTE(review): htonl here, htobe32 elsewhere - presumably equivalent. */
	c.op_to_vfn = htonl(V_FW_CMD_OP(FW_EQ_OFLD_CMD) | F_FW_CMD_REQUEST |
	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_OFLD_CMD_PFN(sc->pf) |
	    V_FW_EQ_OFLD_CMD_VFN(0));
	c.alloc_to_len16 = htonl(F_FW_EQ_OFLD_CMD_ALLOC |
	    F_FW_EQ_OFLD_CMD_EQSTART | FW_LEN16(c));
	c.fetchszm_to_iqid =
		htonl(V_FW_EQ_OFLD_CMD_HOSTFCMODE(X_HOSTFCMODE_STATUS_PAGE) |
		    V_FW_EQ_OFLD_CMD_PCIECHN(eq->tx_chan) |
		    F_FW_EQ_OFLD_CMD_FETCHRO | V_FW_EQ_OFLD_CMD_IQID(eq->iqid));
	c.dcaen_to_eqsize =
	    htobe32(V_FW_EQ_OFLD_CMD_FBMIN(chip_id(sc) <= CHELSIO_T5 ?
		X_FETCHBURSTMIN_64B : X_FETCHBURSTMIN_64B_T6) |
		V_FW_EQ_OFLD_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
		V_FW_EQ_OFLD_CMD_CIDXFTHRESH(qsize_to_fthresh(qsize)) |
		V_FW_EQ_OFLD_CMD_EQSIZE(qsize));
	c.eqaddr = htobe64(eq->ba);

	/* The reply is written back into the same command buffer. */
	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
	if (rc != 0) {
		device_printf(vi->dev,
		    "failed to create egress queue for TCP offload: %d\n", rc);
		return (rc);
	}

	eq->cntxt_id = G_FW_EQ_OFLD_CMD_EQID(be32toh(c.eqid_pkd));
	eq->abs_id = G_FW_EQ_OFLD_CMD_PHYSEQID(be32toh(c.physeqid_pkd));
	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
	if (cntxt_id >= sc->sge.eqmap_sz)
		panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
		    cntxt_id, sc->sge.eqmap_sz - 1);
	sc->sge.eqmap[cntxt_id] = eq;

	return (rc);
}
#endif

/*
 * SW only.  Allocates the descriptor ring for an egress queue, adds its
 * sysctls when both ctx and oid are provided, and sets EQ_SW_ALLOCATED.
 * Returns 0 or the error from alloc_ring().
 */
static int
alloc_eq(struct adapter *sc, struct sge_eq *eq, struct sysctl_ctx_list *ctx,
    struct sysctl_oid *oid)
{
	int rc, qsize;
	size_t len;

	MPASS(!(eq->flags & EQ_SW_ALLOCATED));

	qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
	len = qsize * EQ_ESIZE;
	rc = alloc_ring(sc, len, &eq->desc_tag, &eq->desc_map, &eq->ba,
	    (void **)&eq->desc);
	if (rc)
		return (rc);
	if (ctx != NULL && oid != NULL)
		add_eq_sysctls(sc, ctx, oid, eq);
	eq->flags |= EQ_SW_ALLOCATED;

	return (0);
}

/*
 * SW only.  Frees the descriptor ring, destroys the queue lock, and zeroes
 * the eq, which also clears EQ_SW_ALLOCATED.
 */
static void
free_eq(struct adapter *sc, struct sge_eq *eq)
{
	MPASS(eq->flags & EQ_SW_ALLOCATED);
	/* An Ethernet queue is asserted to have pidx == cidx here. */
	if (eq->type == EQ_ETH)
		MPASS(eq->pidx == eq->cidx);

	free_ring(sc, eq->desc_tag, eq->desc_map, eq->ba, eq->desc);
	mtx_destroy(&eq->eq_lock);
	bzero(eq, sizeof(*eq));
}

/*
 * Registers read-only sysctls describing an egress queue under oid.  Unlike
 * the other add_*_sysctls helpers this one does not check ctx/oid for NULL;
 * alloc_eq() performs that check before calling it.
 */
static void
add_eq_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx,
    struct sysctl_oid *oid, struct sge_eq *eq)
{
	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);

	SYSCTL_ADD_UAUTO(ctx, children, OID_AUTO, "ba", CTLFLAG_RD, &eq->ba,
	    "bus address of descriptor ring");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
	    eq->sidx * EQ_ESIZE + sc->params.sge.spg_len,
	    "desc ring size in bytes");
	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "abs_id", CTLFLAG_RD,
	    &eq->abs_id, 0, "absolute id of the queue");
	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
	    &eq->cntxt_id, 0, "SGE context id of the queue");
	SYSCTL_ADD_U16(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, &eq->cidx,
	    0, "consumer index");
	SYSCTL_ADD_U16(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD, &eq->pidx,
	    0, "producer index");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "sidx", CTLFLAG_RD, NULL,
	    eq->sidx, "status page index");
}

/*
 * Creates the hardware egress queue that backs eq: resets the software
 * indices, zeroes the ring, dispatches to the type-specific firmware command,
 * and then works out the user doorbell address.  Sets EQ_HW_ALLOCATED and
 * returns 0 on success; returns the firmware error otherwise.  Panics on an
 * eq type it does not handle.
 */
static int
alloc_eq_hwq(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq)
{
	int rc;

	MPASS(!(eq->flags & EQ_HW_ALLOCATED));

	/* Completions go to the ingress queue this eq was initialized with. */
	eq->iqid = eq->iq->cntxt_id;
	eq->pidx = eq->cidx = eq->dbidx = 0;
	/* Note that equeqidx is not used with sge_wrq (OFLD/CTRL) queues. */
	eq->equeqidx = 0;
	eq->doorbells = sc->doorbells;
	bzero(eq->desc, eq->sidx * EQ_ESIZE + sc->params.sge.spg_len);

	switch (eq->type) {
	case EQ_CTRL:
		rc = ctrl_eq_alloc(sc, eq);
		break;

	case EQ_ETH:
		rc = eth_eq_alloc(sc, vi, eq);
		break;

#if defined(TCP_OFFLOAD) || defined(RATELIMIT)
	case EQ_OFLD:
		rc = ofld_eq_alloc(sc, vi, eq);
		break;
#endif

	default:
		panic("%s: invalid eq type %d.", __func__, eq->type);
	}
	if (rc != 0) {
		CH_ERR(sc, "failed to allocate egress queue(%d): %d\n",
		    eq->type, rc);
		return (rc);
	}

	if (isset(&eq->doorbells, DOORBELL_UDB) ||
	    isset(&eq->doorbells, DOORBELL_UDBWC) ||
	    isset(&eq->doorbells, DOORBELL_WCWR)) {
		uint32_t s_qpp = sc->params.sge.eq_s_qpp;
		uint32_t mask = (1 << s_qpp) - 1;
		volatile uint8_t *udb;

		udb = sc->udbs_base + UDBS_DB_OFFSET;
		udb += (eq->cntxt_id >> s_qpp) << PAGE_SHIFT;	/* pg offset */
		eq->udb_qid = eq->cntxt_id & mask;		/* id in page */
		/*
		 * A queue without a segment of its own in the page loses the
		 * WCWR doorbell; otherwise the segment is addressed directly
		 * and the qid written to the doorbell is 0.
		 */
		if (eq->udb_qid >= PAGE_SIZE / UDBS_SEG_SIZE)
			clrbit(&eq->doorbells, DOORBELL_WCWR);
		else {
			udb += eq->udb_qid << UDBS_SEG_SHIFT;	/* seg offset */
			eq->udb_qid = 0;
		}
		eq->udb = (volatile void *)udb;
	}

	eq->flags |= EQ_HW_ALLOCATED;
	return (0);
}

/*
 * Asks the firmware to free the hardware egress queue behind eq, using the
 * call that matches the eq's type.  EQ_HW_ALLOCATED is cleared only when the
 * firmware call succeeds.  vi is unused.  Panics on an eq type it does not
 * handle.
 */
static int
free_eq_hwq(struct adapter *sc, struct vi_info *vi __unused, struct sge_eq *eq)
{
	int rc;

	MPASS(eq->flags & EQ_HW_ALLOCATED);

	switch (eq->type) {
	case EQ_CTRL:
		rc = -t4_ctrl_eq_free(sc, sc->mbox, sc->pf, 0, eq->cntxt_id);
		break;
	case EQ_ETH:
		rc = -t4_eth_eq_free(sc, sc->mbox, sc->pf, 0, eq->cntxt_id);
		break;
#if defined(TCP_OFFLOAD) || defined(RATELIMIT)
	case EQ_OFLD:
		rc = -t4_ofld_eq_free(sc, sc->mbox, sc->pf, 0, eq->cntxt_id);
		break;
#endif
	default:
		panic("%s: invalid eq type %d.", __func__, eq->type);
	}
	if (rc != 0) {
		CH_ERR(sc, "failed to free eq (type %d): %d\n", eq->type, rc);
		return (rc);
	}
	eq->flags &= ~EQ_HW_ALLOCATED;

	return (0);
}

/*
 * Allocates the software side of a work request queue: the underlying eq
 * (via alloc_eq) and then the wrq's task, lists and counters.  vi is not
 * referenced in this function.  Returns 0 or the error from alloc_eq().
 */
static int
alloc_wrq(struct adapter *sc, struct vi_info *vi, struct sge_wrq *wrq,
    struct sysctl_ctx_list *ctx, struct sysctl_oid *oid)
{
	struct sge_eq *eq = &wrq->eq;
	int rc;

	MPASS(!(eq->flags & EQ_SW_ALLOCATED));

	rc = alloc_eq(sc, eq, ctx, oid);
	if (rc)
		return (rc);
	MPASS(eq->flags & EQ_SW_ALLOCATED);
	/* Can't fail after this. */

	wrq->adapter = sc;
	TASK_INIT(&wrq->wrq_tx_task, 0, wrq_tx_drain, wrq);
	TAILQ_INIT(&wrq->incomplete_wrs);
	STAILQ_INIT(&wrq->wr_list);
	wrq->nwr_pending = 0;
	wrq->ndesc_needed = 0;
	add_wrq_sysctls(ctx, oid, wrq);

	return (0);
}

/*
 * Frees the software side of a work request queue and zeroes it.  The queue
 * is asserted to have no pending or incomplete work requests.
 */
static void
free_wrq(struct adapter *sc, struct sge_wrq *wrq)
{
	free_eq(sc, &wrq->eq);
	MPASS(wrq->nwr_pending == 0);
	MPASS(wrq->ndesc_needed == 0);
	MPASS(TAILQ_EMPTY(&wrq->incomplete_wrs));
	MPASS(STAILQ_EMPTY(&wrq->wr_list));
	bzero(wrq, sizeof(*wrq));
}

/*
 * Registers the read-only work request counters of a wrq under oid.  Does
 * nothing if either ctx or oid is NULL.
 */
static void
add_wrq_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *oid,
    struct sge_wrq *wrq)
{
	struct sysctl_oid_list *children;

	if (ctx == NULL || oid == NULL)
		return;

	children = SYSCTL_CHILDREN(oid);
	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_direct", CTLFLAG_RD,
	    &wrq->tx_wrs_direct, "# of work requests (direct)");
	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_copied", CTLFLAG_RD,
	    &wrq->tx_wrs_copied, "# of work requests (copied)");
	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_sspace", CTLFLAG_RD,
	    &wrq->tx_wrs_ss, "# of work requests (copied from scratch space)");
}

/*
 * Idempotent.
*/
/*
 * Set up Ethernet tx queue 'idx' of the VI in two stages.  Each stage is
 * skipped when the corresponding EQ_*_ALLOCATED flag is already set in the
 * eq:
 *
 *  1. Software state: sysctl node, eq parameters, mp_ring, eq, reclaim task,
 *     sglist, and the software descriptor array.
 *  2. Hardware queue via alloc_eq_hwq(), followed by the per-queue transmit
 *     parameters (eq_base check, max_npkt, cpl_ctrl0, tc_idx).
 *
 * Returns 0 on success or the error code of the step that failed.
 */
static int
alloc_txq(struct vi_info *vi, struct sge_txq *txq, int idx)
{
	int rc, iqidx;
	struct port_info *pi = vi->pi;
	struct adapter *sc = vi->adapter;
	struct sge_eq *eq = &txq->eq;
	struct txpkts *txp;
	char name[16];
	struct sysctl_oid *oid;

	if (!(eq->flags & EQ_SW_ALLOCATED)) {
		MPASS(!(eq->flags & EQ_HW_ALLOCATED));

		/* The sysctl node is named after the queue index. */
		snprintf(name, sizeof(name), "%d", idx);
		oid = SYSCTL_ADD_NODE(&vi->ctx, SYSCTL_CHILDREN(vi->txq_oid),
		    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
		    "tx queue");

		/*
		 * The iq handed to init_eq is picked round-robin from the VI's
		 * rx queues.  'name' is reused for the eq's name.
		 */
		iqidx = vi->first_rxq + (idx % vi->nrxq);
		snprintf(name, sizeof(name), "%s txq%d",
		    device_get_nameunit(vi->dev), idx);
		init_eq(sc, &txq->eq, EQ_ETH, vi->qsize_txq, pi->port_id,
		    &sc->sge.rxq[iqidx].iq, name);

		rc = mp_ring_alloc(&txq->r, eq->sidx, txq, eth_tx,
		    can_resume_eth_tx, M_CXGBE, &eq->eq_lock, M_WAITOK);
		if (rc != 0) {
			CH_ERR(vi, "failed to allocate mp_ring for txq%d: %d\n",
			    idx, rc);
failed:
			/*
			 * Shared error exit; also reached by goto from the
			 * alloc_eq failure below.
			 */
			sysctl_remove_oid(oid, 1, 1);
			return (rc);
		}

		rc = alloc_eq(sc, eq, &vi->ctx, oid);
		if (rc) {
			CH_ERR(vi, "failed to allocate txq%d: %d\n", idx, rc);
			mp_ring_free(txq->r);
			goto failed;
		}
		MPASS(eq->flags & EQ_SW_ALLOCATED);
		/* Can't fail after this point. */

		TASK_INIT(&txq->tx_reclaim_task, 0, tx_reclaim, eq);
		txq->ifp = vi->ifp;
		txq->gl = sglist_alloc(TX_SGL_SEGS, M_WAITOK);
		/* One software descriptor per hardware descriptor (eq->sidx). */
		txq->sdesc = malloc(eq->sidx * sizeof(struct tx_sdesc), M_CXGBE,
		    M_ZERO | M_WAITOK);

		add_txq_sysctls(vi, &vi->ctx, oid, txq);
	}

	if (!(eq->flags & EQ_HW_ALLOCATED)) {
		MPASS(eq->flags & EQ_SW_ALLOCATED);
		rc = alloc_eq_hwq(sc, vi, eq);
		if (rc != 0) {
			CH_ERR(vi, "failed to create hw txq%d: %d\n", idx, rc);
			return (rc);
		}
		MPASS(eq->flags & EQ_HW_ALLOCATED);
		/* Can't fail after this point. */

		/*
		 * Queue 0 establishes eq_base (abs_id - cntxt_id); every other
		 * queue is expected to agree with it.
		 */
		if (idx == 0)
			sc->sge.eq_base = eq->abs_id - eq->cntxt_id;
		else
			KASSERT(eq->cntxt_id + sc->sge.eq_base == eq->abs_id,
			    ("eq_base mismatch"));
		KASSERT(sc->sge.eq_base == 0 || sc->flags & IS_VF,
		    ("PF with non-zero eq_base"));

		/*
		 * Coalescing limit: the smaller of the mb[] array size and the
		 * adapter's max_pkts_per_eth_tx_pkts_wr, less one when a
		 * non-VF uses VM work requests.
		 */
		txp = &txq->txp;
		MPASS(nitems(txp->mb) >= sc->params.max_pkts_per_eth_tx_pkts_wr);
		txq->txp.max_npkt = min(nitems(txp->mb),
		    sc->params.max_pkts_per_eth_tx_pkts_wr);
		if (vi->flags & TX_USES_VM_WR && !(sc->flags & IS_VF))
			txq->txp.max_npkt--;

		/*
		 * Precompute the ctrl0 word (already big-endian) that the WR
		 * writers copy into every CPL.  The PF/VF fields are left out
		 * when VM work requests are in use.
		 */
		if (vi->flags & TX_USES_VM_WR)
			txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT_XT) |
			    V_TXPKT_INTF(pi->tx_chan));
		else
			txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT_XT) |
			    V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(sc->pf) |
			    V_TXPKT_VF(vi->vin) | V_TXPKT_VF_VLD(vi->vfvld));

		/* -1 is "none" per the "tc" sysctl description. */
		txq->tc_idx = -1;
	}

	return (0);
}

/*
 * Idempotent.
 *
 * Tears down a tx queue in the reverse order of alloc_txq: the hardware
 * queue first, then the software state.  Each stage is skipped if its flag
 * is clear.  txq is zeroed at the end of the software stage.
 */
static void
free_txq(struct vi_info *vi, struct sge_txq *txq)
{
	struct adapter *sc = vi->adapter;
	struct sge_eq *eq = &txq->eq;

	if (eq->flags & EQ_HW_ALLOCATED) {
		MPASS(eq->flags & EQ_SW_ALLOCATED);
		free_eq_hwq(sc, NULL, eq);
		MPASS(!(eq->flags & EQ_HW_ALLOCATED));
	}

	if (eq->flags & EQ_SW_ALLOCATED) {
		MPASS(!(eq->flags & EQ_HW_ALLOCATED));
		sglist_free(txq->gl);
		free(txq->sdesc, M_CXGBE);
		mp_ring_free(txq->r);
		free_eq(sc, eq);
		MPASS(!(eq->flags & EQ_SW_ALLOCATED));
		bzero(txq, sizeof(*txq));
	}
}

/*
 * Attach the mp_ring sysctls, the read/write "tc" (traffic class) handler,
 * and the read-only statistics of this txq under oid.  Does nothing if
 * either ctx or oid is NULL.  The kern_tls_* counters are compiled in only
 * with KERN_TLS and added only when is_ktls(sc) is true.
 */
static void
add_txq_sysctls(struct vi_info *vi, struct sysctl_ctx_list *ctx,
    struct sysctl_oid *oid, struct sge_txq *txq)
{
	struct adapter *sc;
	struct sysctl_oid_list *children;

	if (ctx == NULL || oid == NULL)
		return;

	sc = vi->adapter;
	children = SYSCTL_CHILDREN(oid);

	mp_ring_sysctls(txq->r, ctx, children);

	/* arg2 is this txq's index in the adapter-wide sc->sge.txq array. */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tc",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, vi, txq - sc->sge.txq,
	    sysctl_tc, "I", "traffic class (-1 means none)");

	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "txcsum", CTLFLAG_RD,
	    &txq->txcsum, "# of times hardware assisted with checksum");
	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "vlan_insertion", CTLFLAG_RD,
	    &txq->vlan_insertion, "# of times hardware inserted 802.1Q tag");
	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tso_wrs", CTLFLAG_RD,
	    &txq->tso_wrs, "# of TSO work requests");
	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "imm_wrs", CTLFLAG_RD,
	    &txq->imm_wrs, "# of work requests with immediate data");
	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "sgl_wrs", CTLFLAG_RD,
	    &txq->sgl_wrs, "# of work requests with direct SGL");
	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "txpkt_wrs", CTLFLAG_RD,
	    &txq->txpkt_wrs, "# of txpkt work requests (one pkt/WR)");
	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "txpkts0_wrs", CTLFLAG_RD,
	    &txq->txpkts0_wrs, "# of txpkts (type 0) work requests");
	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "txpkts1_wrs", CTLFLAG_RD,
	    &txq->txpkts1_wrs, "# of txpkts (type 1) work requests");
	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "txpkts0_pkts", CTLFLAG_RD,
	    &txq->txpkts0_pkts,
	    "# of frames tx'd using type0 txpkts work requests");
	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "txpkts1_pkts", CTLFLAG_RD,
	    &txq->txpkts1_pkts,
	    "# of frames tx'd using type1 txpkts work requests");
	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "txpkts_flush", CTLFLAG_RD,
	    &txq->txpkts_flush,
	    "# of times txpkts had to be flushed out by an egress-update");
	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "raw_wrs", CTLFLAG_RD,
	    &txq->raw_wrs, "# of raw work requests (non-packets)");
	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "vxlan_tso_wrs", CTLFLAG_RD,
	    &txq->vxlan_tso_wrs, "# of VXLAN TSO work requests");
	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "vxlan_txcsum", CTLFLAG_RD,
	    &txq->vxlan_txcsum,
	    "# of times hardware assisted with inner checksums (VXLAN)");

#ifdef KERN_TLS
	if (is_ktls(sc)) {
		SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_records",
		    CTLFLAG_RD, &txq->kern_tls_records,
		    "# of NIC TLS records transmitted");
		SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_short",
		    CTLFLAG_RD, &txq->kern_tls_short,
		    "# of short NIC TLS records transmitted");
		SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_partial",
		    CTLFLAG_RD, &txq->kern_tls_partial,
		    "# of partial NIC TLS records transmitted");
		SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_full",
		    CTLFLAG_RD, &txq->kern_tls_full,
		    "# of full NIC TLS records transmitted");
		SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_octets",
		    CTLFLAG_RD, &txq->kern_tls_octets,
		    "# of payload octets in transmitted NIC TLS records");
		SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_waste",
		    CTLFLAG_RD, &txq->kern_tls_waste,
		    "# of octets DMAd but not transmitted in NIC TLS records");
		SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_options",
		    CTLFLAG_RD, &txq->kern_tls_options,
		    "# of NIC TLS options-only packets transmitted");
		SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_header",
		    CTLFLAG_RD, &txq->kern_tls_header,
		    "# of NIC TLS header-only packets transmitted");
		SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_fin",
		    CTLFLAG_RD, &txq->kern_tls_fin,
		    "# of NIC TLS FIN-only packets transmitted");
		SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_fin_short",
		    CTLFLAG_RD, &txq->kern_tls_fin_short,
		    "# of NIC TLS padded FIN packets on short TLS records");
		SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_cbc",
		    CTLFLAG_RD, &txq->kern_tls_cbc,
		    "# of NIC TLS sessions using AES-CBC");
		SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_gcm",
		    CTLFLAG_RD, &txq->kern_tls_gcm,
		    "# of NIC TLS sessions using AES-GCM");
	}
#endif
}

#if defined(TCP_OFFLOAD) || defined(RATELIMIT)
/*
 * Idempotent.
*/
/*
 * Bring up offload tx queue 'idx' of the VI.  The software half (sysctl
 * node, eq parameters, wrq, statistics counters) and the hardware half
 * (alloc_eq_hwq) are each performed only if the matching EQ_*_ALLOCATED flag
 * is not yet set.  Returns 0, or the error of the step that failed.
 */
static int
alloc_ofld_txq(struct vi_info *vi, struct sge_ofld_txq *ofld_txq, int idx)
{
	struct adapter *sc = vi->adapter;
	struct port_info *pi = vi->pi;
	struct sge_wrq *wrq = &ofld_txq->wrq;
	struct sge_eq *eq = &wrq->eq;
	struct sysctl_oid *node;
	char label[16];
	int error, qi;

	MPASS(idx >= 0);
	MPASS(idx < vi->nofldtxq);

	if ((eq->flags & EQ_SW_ALLOCATED) == 0) {
		/* sysctl node named after the queue index. */
		snprintf(label, sizeof(label), "%d", idx);
		node = SYSCTL_ADD_NODE(&vi->ctx,
		    SYSCTL_CHILDREN(vi->ofld_txq_oid), OID_AUTO, label,
		    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "offload tx queue");

		/*
		 * The buffer is reused for the eq's name.  The associated iq
		 * is an offload rxq when the VI has any, otherwise one of its
		 * regular rxqs; either way it is chosen round-robin.
		 */
		snprintf(label, sizeof(label), "%s ofld_txq%d",
		    device_get_nameunit(vi->dev), idx);
		if (vi->nofldrxq <= 0) {
			qi = vi->first_rxq + (idx % vi->nrxq);
			init_eq(sc, eq, EQ_OFLD, vi->qsize_txq, pi->port_id,
			    &sc->sge.rxq[qi].iq, label);
		} else {
			qi = vi->first_ofld_rxq + (idx % vi->nofldrxq);
			init_eq(sc, eq, EQ_OFLD, vi->qsize_txq, pi->port_id,
			    &sc->sge.ofld_rxq[qi].iq, label);
		}

		error = alloc_wrq(sc, vi, wrq, &vi->ctx, node);
		if (error != 0) {
			CH_ERR(vi, "failed to allocate ofld_txq%d: %d\n", idx,
			    error);
			sysctl_remove_oid(node, 1, 1);
			return (error);
		}
		MPASS(eq->flags & EQ_SW_ALLOCATED);

		/* Nothing below this line in the software half can fail. */
		ofld_txq->tx_iscsi_pdus = counter_u64_alloc(M_WAITOK);
		ofld_txq->tx_iscsi_octets = counter_u64_alloc(M_WAITOK);
		ofld_txq->tx_iscsi_iso_wrs = counter_u64_alloc(M_WAITOK);
		ofld_txq->tx_aio_jobs = counter_u64_alloc(M_WAITOK);
		ofld_txq->tx_aio_octets = counter_u64_alloc(M_WAITOK);
		ofld_txq->tx_toe_tls_records = counter_u64_alloc(M_WAITOK);
		ofld_txq->tx_toe_tls_octets = counter_u64_alloc(M_WAITOK);
		add_ofld_txq_sysctls(&vi->ctx, node, ofld_txq);
	}

	/* Hardware half: nothing to do if the hw queue already exists. */
	if ((eq->flags & EQ_HW_ALLOCATED) != 0)
		return (0);

	MPASS(eq->flags & EQ_SW_ALLOCATED);
	MPASS(wrq->nwr_pending == 0);
	MPASS(wrq->ndesc_needed == 0);
	error = alloc_eq_hwq(sc, vi, eq);
	if (error != 0) {
		CH_ERR(vi, "failed to create hw ofld_txq%d: %d\n", idx, error);
		return (error);
	}
	MPASS(eq->flags & EQ_HW_ALLOCATED);

	return (0);
}

/*
 * Idempotent.
*/
/*
 * Tear down an offload tx queue: the hardware queue first, then the
 * statistics counters and the wrq.  Each stage is skipped if its flag is
 * clear.  ofld_txq is zeroed at the end of the software stage.
 */
static void
free_ofld_txq(struct vi_info *vi, struct sge_ofld_txq *ofld_txq)
{
	struct adapter *sc = vi->adapter;
	struct sge_eq *eq = &ofld_txq->wrq.eq;

	if (eq->flags & EQ_HW_ALLOCATED) {
		MPASS(eq->flags & EQ_SW_ALLOCATED);
		free_eq_hwq(sc, NULL, eq);
		MPASS(!(eq->flags & EQ_HW_ALLOCATED));
	}

	if (eq->flags & EQ_SW_ALLOCATED) {
		MPASS(!(eq->flags & EQ_HW_ALLOCATED));
		counter_u64_free(ofld_txq->tx_iscsi_pdus);
		counter_u64_free(ofld_txq->tx_iscsi_octets);
		counter_u64_free(ofld_txq->tx_iscsi_iso_wrs);
		counter_u64_free(ofld_txq->tx_aio_jobs);
		counter_u64_free(ofld_txq->tx_aio_octets);
		counter_u64_free(ofld_txq->tx_toe_tls_records);
		counter_u64_free(ofld_txq->tx_toe_tls_octets);
		free_wrq(sc, &ofld_txq->wrq);
		MPASS(!(eq->flags & EQ_SW_ALLOCATED));
		bzero(ofld_txq, sizeof(*ofld_txq));
	}
}

/*
 * Export the offload txq's statistics counters as read-only sysctls under
 * oid.  Does nothing if either ctx or oid is NULL.
 */
static void
add_ofld_txq_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *oid,
    struct sge_ofld_txq *ofld_txq)
{
	struct sysctl_oid_list *children;

	if (ctx == NULL || oid == NULL)
		return;

	children = SYSCTL_CHILDREN(oid);
	SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "tx_iscsi_pdus",
	    CTLFLAG_RD, &ofld_txq->tx_iscsi_pdus,
	    "# of iSCSI PDUs transmitted");
	SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "tx_iscsi_octets",
	    CTLFLAG_RD, &ofld_txq->tx_iscsi_octets,
	    "# of payload octets in transmitted iSCSI PDUs");
	SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "tx_iscsi_iso_wrs",
	    CTLFLAG_RD, &ofld_txq->tx_iscsi_iso_wrs,
	    "# of iSCSI segmentation offload work requests");
	SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "tx_aio_jobs",
	    CTLFLAG_RD, &ofld_txq->tx_aio_jobs,
	    "# of zero-copy aio_write(2) jobs transmitted");
	SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "tx_aio_octets",
	    CTLFLAG_RD, &ofld_txq->tx_aio_octets,
	    "# of payload octets in transmitted zero-copy aio_write(2) jobs");
	SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "tx_toe_tls_records",
	    CTLFLAG_RD, &ofld_txq->tx_toe_tls_records,
	    "# of TOE TLS records transmitted");
	SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "tx_toe_tls_octets",
	    CTLFLAG_RD, &ofld_txq->tx_toe_tls_octets,
	    "# of payload octets in transmitted TOE TLS records");
}
#endif

/*
 * DMA mapping callback for single-segment mappings.  'arg' points to a
 * bus_addr_t that receives the segment's address, or 0 if error is non-zero.
 */
static void
oneseg_dma_callback(void *arg, bus_dma_segment_t *segs, int nseg, int error)
{
	bus_addr_t *ba = arg;

	KASSERT(nseg == 1,
	    ("%s meant for single segment mappings only.", __func__));

	*ba = error ? 0 : segs->ds_addr;
}

/*
 * Ring the freelist doorbell for everything between fl->dbidx and the
 * current pidx.  pidx is shifted right by 3 before the comparison, so n is
 * in units of 8 buffers.  The caller must have something new to report
 * (n > 0 is asserted).
 */
static inline void
ring_fl_db(struct adapter *sc, struct sge_fl *fl)
{
	uint32_t n, v;

	n = IDXDIFF(fl->pidx >> 3, fl->dbidx, fl->sidx);
	MPASS(n > 0);

	/* Write barrier before the doorbell write. */
	wmb();
	v = fl->dbval | V_PIDX(n);
	/* Use the queue's own doorbell if it has one, else the register. */
	if (fl->udb)
		*fl->udb = htole32(v);
	else
		t4_write_reg(sc, sc->sge_kdoorbell_reg, v);
	IDXINCR(fl->dbidx, n, fl->sidx);
}

/*
 * Fills up the freelist by allocating up to 'n' buffers.  Buffers that are
 * recycled do not count towards this allocation budget.
 *
 * Returns non-zero to indicate that this freelist should be added to the list
 * of starving freelists.
 *
 * The freelist lock must be held (asserted).
 */
static int
refill_fl(struct adapter *sc, struct sge_fl *fl, int n)
{
	__be64 *d;
	struct fl_sdesc *sd;
	uintptr_t pa;
	caddr_t cl;
	struct rx_buf_info *rxb;
	struct cluster_metadata *clm;
	uint16_t max_pidx, zidx = fl->zidx;
	uint16_t hw_cidx = fl->hw_cidx;		/* stable snapshot */

	FL_LOCK_ASSERT_OWNED(fl);

	/*
	 * We always stop at the beginning of the hardware descriptor that's just
	 * before the one with the hw cidx.  This is to avoid hw pidx = hw cidx,
	 * which would mean an empty freelist to the chip.
	 */
	max_pidx = __predict_false(hw_cidx == 0) ? fl->sidx - 1 : hw_cidx - 1;
	if (fl->pidx == max_pidx * 8)
		return (0);

	d = &fl->desc[fl->pidx];
	sd = &fl->sdesc[fl->pidx];
	rxb = &sc->sge.rx_buf_info[zidx];

	while (n > 0) {

		if (sd->cl != NULL) {

			if (sd->nmbuf == 0) {
				/*
				 * Fast recycle without involving any atomics on
				 * the cluster's metadata (if the cluster has
				 * metadata).  This happens when all frames
				 * received in the cluster were small enough to
				 * fit within a single mbuf each.
				 */
				fl->cl_fast_recycled++;
				goto recycled;
			}

			/*
			 * Cluster is guaranteed to have metadata.  Clusters
			 * without metadata always take the fast recycle path
			 * when they're recycled.
			 */
			clm = cl_metadata(sd);
			MPASS(clm != NULL);

			/*
			 * Drop our reference.  If it was the last one the
			 * cluster can be reused in place.
			 */
			if (atomic_fetchadd_int(&clm->refcount, -1) == 1) {
				fl->cl_recycled++;
				counter_u64_add(extfree_rels, 1);
				goto recycled;
			}
			sd->cl = NULL;	/* gave up my reference */
		}
		MPASS(sd->cl == NULL);
		cl = uma_zalloc(rxb->zone, M_NOWAIT);
		if (__predict_false(cl == NULL)) {
			/*
			 * Retry once from the safe zone.  zidx and rxb stay
			 * switched for the remainder of this call.
			 */
			if (zidx != fl->safe_zidx) {
				zidx = fl->safe_zidx;
				rxb = &sc->sge.rx_buf_info[zidx];
				cl = uma_zalloc(rxb->zone, M_NOWAIT);
			}
			if (cl == NULL)
				break;
		}
		fl->cl_allocated++;
		n--;

		pa = pmap_kextract((vm_offset_t)cl);
		sd->cl = cl;
		sd->zidx = zidx;

		/* The descriptor is the buffer address OR'd with a hw index. */
		if (fl->flags & FL_BUF_PACKING) {
			*d = htobe64(pa | rxb->hwidx2);
			sd->moff = rxb->size2;
		} else {
			*d = htobe64(pa | rxb->hwidx1);
			sd->moff = 0;
		}
recycled:
		/* Recycled buffers keep their existing descriptor contents. */
		sd->nmbuf = 0;
		d++;
		sd++;
		/* Every 8th buffer: wrap/stop checks and possibly a doorbell. */
		if (__predict_false((++fl->pidx & 7) == 0)) {
			uint16_t pidx = fl->pidx >> 3;

			if (__predict_false(pidx == fl->sidx)) {
				fl->pidx = 0;
				pidx = 0;
				sd = fl->sdesc;
				d = fl->desc;
			}
			if (n < 8 || pidx == max_pidx)
				break;

			if (IDXDIFF(pidx, fl->dbidx, fl->sidx) >= 4)
				ring_fl_db(sc, fl);
		}
	}

	/* Report whatever has not been reported yet. */
	if ((fl->pidx >> 3) != fl->dbidx)
		ring_fl_db(sc, fl);

	return (FL_RUNNING_LOW(fl) && !(fl->flags & FL_STARVING));
}

/*
 * Attempt to refill all starving freelists.
 *
 * 'arg' is the adapter; sc->sfl_lock must be held (asserted).  A freelist is
 * taken off the list, and its FL_STARVING flag cleared, once it is no longer
 * running low or is marked FL_DOOMED.  If any freelists remain on the list
 * the callout is rescheduled to fire in hz / 5 ticks.
 */
static void
refill_sfl(void *arg)
{
	struct adapter *sc = arg;
	struct sge_fl *fl, *fl_temp;

	mtx_assert(&sc->sfl_lock, MA_OWNED);
	TAILQ_FOREACH_SAFE(fl, &sc->sfl, link, fl_temp) {
		FL_LOCK(fl);
		refill_fl(sc, fl, 64);
		if (FL_NOT_RUNNING_LOW(fl) || fl->flags & FL_DOOMED) {
			TAILQ_REMOVE(&sc->sfl, fl, link);
			fl->flags &= ~FL_STARVING;
		}
		FL_UNLOCK(fl);
	}

	if (!TAILQ_EMPTY(&sc->sfl))
		callout_schedule(&sc->sfl_callout, hz / 5);
}

/*
 * Release the driver's reference on all buffers in the given freelist.  Buffers
 * with kernel references cannot be freed and will prevent the driver from being
 * unloaded safely.
*/ void free_fl_buffers(struct adapter *sc, struct sge_fl *fl) { struct fl_sdesc *sd; struct cluster_metadata *clm; int i; sd = fl->sdesc; for (i = 0; i < fl->sidx * 8; i++, sd++) { if (sd->cl == NULL) continue; if (sd->nmbuf == 0) uma_zfree(sc->sge.rx_buf_info[sd->zidx].zone, sd->cl); else if (fl->flags & FL_BUF_PACKING) { clm = cl_metadata(sd); if (atomic_fetchadd_int(&clm->refcount, -1) == 1) { uma_zfree(sc->sge.rx_buf_info[sd->zidx].zone, sd->cl); counter_u64_add(extfree_rels, 1); } } sd->cl = NULL; } if (fl->flags & FL_BUF_RESUME) { m_freem(fl->m0); fl->flags &= ~FL_BUF_RESUME; } } static inline void get_pkt_gl(struct mbuf *m, struct sglist *gl) { int rc; M_ASSERTPKTHDR(m); sglist_reset(gl); rc = sglist_append_mbuf(gl, m); if (__predict_false(rc != 0)) { panic("%s: mbuf %p (%d segs) was vetted earlier but now fails " "with %d.", __func__, m, mbuf_nsegs(m), rc); } KASSERT(gl->sg_nseg == mbuf_nsegs(m), ("%s: nsegs changed for mbuf %p from %d to %d", __func__, m, mbuf_nsegs(m), gl->sg_nseg)); #if 0 /* vm_wr not readily available here. */ KASSERT(gl->sg_nseg > 0 && gl->sg_nseg <= max_nsegs_allowed(m, vm_wr), ("%s: %d segments, should have been 1 <= nsegs <= %d", __func__, gl->sg_nseg, max_nsegs_allowed(m, vm_wr))); #endif } /* * len16 for a txpkt WR with a GL. Includes the firmware work request header. */ static inline u_int txpkt_len16(u_int nsegs, const u_int extra) { u_int n; MPASS(nsegs > 0); nsegs--; /* first segment is part of ulptx_sgl */ n = extra + sizeof(struct fw_eth_tx_pkt_wr) + sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1)); return (howmany(n, 16)); } /* * len16 for a txpkt_vm WR with a GL. Includes the firmware work * request header. 
*/ static inline u_int txpkt_vm_len16(u_int nsegs, const u_int extra) { u_int n; MPASS(nsegs > 0); nsegs--; /* first segment is part of ulptx_sgl */ n = extra + sizeof(struct fw_eth_tx_pkt_vm_wr) + sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1)); return (howmany(n, 16)); } static inline void calculate_mbuf_len16(struct mbuf *m, bool vm_wr) { const int lso = sizeof(struct cpl_tx_pkt_lso_core); const int tnl_lso = sizeof(struct cpl_tx_tnl_lso); if (vm_wr) { if (needs_tso(m)) set_mbuf_len16(m, txpkt_vm_len16(mbuf_nsegs(m), lso)); else set_mbuf_len16(m, txpkt_vm_len16(mbuf_nsegs(m), 0)); return; } if (needs_tso(m)) { if (needs_vxlan_tso(m)) set_mbuf_len16(m, txpkt_len16(mbuf_nsegs(m), tnl_lso)); else set_mbuf_len16(m, txpkt_len16(mbuf_nsegs(m), lso)); } else set_mbuf_len16(m, txpkt_len16(mbuf_nsegs(m), 0)); } /* * len16 for a txpkts type 0 WR with a GL. Does not include the firmware work * request header. */ static inline u_int txpkts0_len16(u_int nsegs) { u_int n; MPASS(nsegs > 0); nsegs--; /* first segment is part of ulptx_sgl */ n = sizeof(struct ulp_txpkt) + sizeof(struct ulptx_idata) + sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1)); return (howmany(n, 16)); } /* * len16 for a txpkts type 1 WR with a GL. Does not include the firmware work * request header. 
*/
static inline u_int
txpkts1_len16(void)
{
	u_int n;

	/* Fixed size: one CPL plus one ulptx_sgl, no additional segments. */
	n = sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl);

	return (howmany(n, 16));
}

/*
 * Number of bytes left in 'ndesc' descriptors (EQ_ESIZE bytes each) after the
 * fw_eth_tx_pkt_wr and cpl_tx_pkt_core headers.
 */
static inline u_int
imm_payload(u_int ndesc)
{
	u_int n;

	n = ndesc * EQ_ESIZE - sizeof(struct fw_eth_tx_pkt_wr) -
	    sizeof(struct cpl_tx_pkt_core);

	return (n);
}

/*
 * Build the checksum related bits of a tx CPL's ctrl1 word for packet 'm'.
 *
 * Returns both checksum-disable flags when the packet does not need any
 * hardware checksum.  Otherwise the result carries the checksum type, the
 * IP header length, the Ethernet header length (encoded differently for
 * chips up to T5 and for later ones), and whichever disable flags apply.
 * The value is in host byte order.
 */
static inline uint64_t
csum_to_ctrl(struct adapter *sc, struct mbuf *m)
{
	uint64_t ctrl;
	int csum_type, l2hlen, l3hlen;
	int x, y;
	/* Indexed by [x = TCP/UDP/neither][y = IPv4/IPv6]. */
	static const int csum_types[3][2] = {
		{TX_CSUM_TCPIP, TX_CSUM_TCPIP6},
		{TX_CSUM_UDPIP, TX_CSUM_UDPIP6},
		{TX_CSUM_IP, 0}
	};

	M_ASSERTPKTHDR(m);

	if (!needs_hwcsum(m))
		return (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS);

	MPASS(m->m_pkthdr.l2hlen >= ETHER_HDR_LEN);
	MPASS(m->m_pkthdr.l3hlen >= sizeof(struct ip));

	if (needs_vxlan_csum(m)) {
		MPASS(m->m_pkthdr.l4hlen > 0);
		MPASS(m->m_pkthdr.l5hlen > 0);
		MPASS(m->m_pkthdr.inner_l2hlen >= ETHER_HDR_LEN);
		MPASS(m->m_pkthdr.inner_l3hlen >= sizeof(struct ip));

		/*
		 * Everything in front of the inner IP header is counted as
		 * "L2" (less one Ethernet header), and the inner IP header is
		 * the L3 header.
		 */
		l2hlen = m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen +
		    m->m_pkthdr.l4hlen + m->m_pkthdr.l5hlen +
		    m->m_pkthdr.inner_l2hlen - ETHER_HDR_LEN;
		l3hlen = m->m_pkthdr.inner_l3hlen;
	} else {
		l2hlen = m->m_pkthdr.l2hlen - ETHER_HDR_LEN;
		l3hlen = m->m_pkthdr.l3hlen;
	}

	ctrl = 0;
	if (!needs_l3_csum(m))
		ctrl |= F_TXPKT_IPCSUM_DIS;

	if (m->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_INNER_IP_TCP |
	    CSUM_IP6_TCP | CSUM_INNER_IP6_TCP))
		x = 0;	/* TCP */
	else if (m->m_pkthdr.csum_flags & (CSUM_IP_UDP | CSUM_INNER_IP_UDP |
	    CSUM_IP6_UDP | CSUM_INNER_IP6_UDP))
		x = 1;	/* UDP */
	else
		x = 2;	/* neither */

	if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP |
	    CSUM_INNER_IP | CSUM_INNER_IP_TCP | CSUM_INNER_IP_UDP))
		y = 0;	/* IPv4 */
	else {
		MPASS(m->m_pkthdr.csum_flags & (CSUM_IP6_TCP | CSUM_IP6_UDP |
		    CSUM_INNER_IP6_TCP | CSUM_INNER_IP6_UDP));
		y = 1;	/* IPv6 */
	}
	/*
	 * needs_hwcsum returned true earlier so there must be some kind of
	 * checksum to calculate.
	 */
	csum_type = csum_types[x][y];
	MPASS(csum_type != 0);
	/* An IP-header-only checksum means no L4 checksum. */
	if (csum_type == TX_CSUM_IP)
		ctrl |= F_TXPKT_L4CSUM_DIS;
	ctrl |= V_TXPKT_CSUM_TYPE(csum_type) | V_TXPKT_IPHDR_LEN(l3hlen);
	if (chip_id(sc) <= CHELSIO_T5)
		ctrl |= V_TXPKT_ETHHDR_LEN(l2hlen);
	else
		ctrl |= V_T6_TXPKT_ETHHDR_LEN(l2hlen);

	return (ctrl);
}

/*
 * Write an LSO CPL for the TSO packet 'm0' at 'cpl'.  The header lengths
 * must already be recorded in the mbuf (asserted).  Returns the address just
 * past the CPL that was written.
 */
static inline void *
write_lso_cpl(void *cpl, struct mbuf *m0)
{
	struct cpl_tx_pkt_lso_core *lso;
	uint32_t ctrl;

	KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
	    m0->m_pkthdr.l4hlen > 0,
	    ("%s: mbuf %p needs TSO but missing header lengths",
		__func__, m0));

	/* Header lengths go in as 4-byte words, hence the >> 2. */
	ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) |
	    F_LSO_FIRST_SLICE | F_LSO_LAST_SLICE |
	    V_LSO_ETHHDR_LEN((m0->m_pkthdr.l2hlen - ETHER_HDR_LEN) >> 2) |
	    V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2) |
	    V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2);
	/* The address family is inferred from the L3 header length. */
	if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
		ctrl |= F_LSO_IPV6;

	lso = cpl;
	lso->lso_ctrl = htobe32(ctrl);
	lso->ipid_ofst = htobe16(0);
	lso->mss = htobe16(m0->m_pkthdr.tso_segsz);
	lso->seqno_offset = htobe32(0);
	lso->len = htobe32(m0->m_pkthdr.len);

	return (lso + 1);
}

/*
 * Write a tunnel LSO CPL (VXLAN) for the packet 'm0' at 'cpl'.  Both the
 * outer and the inner header lengths must already be recorded in the mbuf
 * (asserted).  Returns the address just past the CPL that was written.
 */
static void *
write_tnl_lso_cpl(void *cpl, struct mbuf *m0)
{
	struct cpl_tx_tnl_lso *tnl_lso = cpl;
	uint32_t ctrl;

	KASSERT(m0->m_pkthdr.inner_l2hlen > 0 &&
	    m0->m_pkthdr.inner_l3hlen > 0 && m0->m_pkthdr.inner_l4hlen > 0 &&
	    m0->m_pkthdr.inner_l5hlen > 0,
	    ("%s: mbuf %p needs VXLAN_TSO but missing inner header lengths",
		__func__, m0));
	KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
	    m0->m_pkthdr.l4hlen > 0 && m0->m_pkthdr.l5hlen > 0,
	    ("%s: mbuf %p needs VXLAN_TSO but missing outer header lengths",
		__func__, m0));

	/* Outer headers. */
	ctrl = V_CPL_TX_TNL_LSO_OPCODE(CPL_TX_TNL_LSO) |
	    F_CPL_TX_TNL_LSO_FIRST | F_CPL_TX_TNL_LSO_LAST |
	    V_CPL_TX_TNL_LSO_ETHHDRLENOUT(
		(m0->m_pkthdr.l2hlen - ETHER_HDR_LEN) >> 2) |
	    V_CPL_TX_TNL_LSO_IPHDRLENOUT(m0->m_pkthdr.l3hlen >> 2) |
	    F_CPL_TX_TNL_LSO_IPLENSETOUT;
	/* The outer address family is inferred from the L3 header length. */
	if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
		ctrl |= F_CPL_TX_TNL_LSO_IPV6OUT;
	else {
		ctrl |= F_CPL_TX_TNL_LSO_IPHDRCHKOUT |
		    F_CPL_TX_TNL_LSO_IPIDINCOUT;
	}
	tnl_lso->op_to_IpIdSplitOut = htobe32(ctrl);
	tnl_lso->IpIdOffsetOut = 0;
	/* The tunnel header length is the sum of all outer header lengths. */
	tnl_lso->UdpLenSetOut_to_TnlHdrLen =
		htobe16(F_CPL_TX_TNL_LSO_UDPCHKCLROUT |
		    F_CPL_TX_TNL_LSO_UDPLENSETOUT |
		    V_CPL_TX_TNL_LSO_TNLHDRLEN(m0->m_pkthdr.l2hlen +
			m0->m_pkthdr.l3hlen + m0->m_pkthdr.l4hlen +
			m0->m_pkthdr.l5hlen) |
		    V_CPL_TX_TNL_LSO_TNLTYPE(TX_TNL_TYPE_VXLAN));
	tnl_lso->r1 = 0;

	/* Inner headers. */
	ctrl = V_CPL_TX_TNL_LSO_ETHHDRLEN(
	    (m0->m_pkthdr.inner_l2hlen - ETHER_HDR_LEN) >> 2) |
	    V_CPL_TX_TNL_LSO_IPHDRLEN(m0->m_pkthdr.inner_l3hlen >> 2) |
	    V_CPL_TX_TNL_LSO_TCPHDRLEN(m0->m_pkthdr.inner_l4hlen >> 2);
	if (m0->m_pkthdr.inner_l3hlen == sizeof(struct ip6_hdr))
		ctrl |= F_CPL_TX_TNL_LSO_IPV6;
	tnl_lso->Flow_to_TcpHdrLen = htobe32(ctrl);
	tnl_lso->IpIdOffset = 0;
	tnl_lso->IpIdSplit_to_Mss =
	    htobe16(V_CPL_TX_TNL_LSO_MSS(m0->m_pkthdr.tso_segsz));
	tnl_lso->TCPSeqOffset = 0;
	tnl_lso->EthLenOffset_Size =
	    htobe32(V_CPL_TX_TNL_LSO_SIZE(m0->m_pkthdr.len));

	return (tnl_lso + 1);
}

#define VM_TX_L2HDR_LEN	16	/* ethmacdst to vlantci */

/*
 * Write a VM txpkt WR for this packet to the hardware descriptors, update the
 * software descriptor, and advance the pidx.  It is guaranteed that enough
 * descriptors are available.
 *
 * The return value is the # of hardware descriptors used.
*/
/*
 * NOTE(review): eq->pidx is only read in this function; presumably the caller
 * advances it by the returned descriptor count -- confirm against the caller.
 */
static u_int
write_txpkt_vm_wr(struct adapter *sc, struct sge_txq *txq, struct mbuf *m0)
{
	struct sge_eq *eq;
	struct fw_eth_tx_pkt_vm_wr *wr;
	struct tx_sdesc *txsd;
	struct cpl_tx_pkt_core *cpl;
	uint32_t ctrl;	/* used in many unrelated places */
	uint64_t ctrl1;
	int len16, ndesc, pktlen;
	caddr_t dst;

	TXQ_LOCK_ASSERT_OWNED(txq);
	M_ASSERTPKTHDR(m0);

	len16 = mbuf_len16(m0);
	pktlen = m0->m_pkthdr.len;
	/* Immediate data length: the CPL, plus the LSO CPL for TSO. */
	ctrl = sizeof(struct cpl_tx_pkt_core);
	if (needs_tso(m0))
		ctrl += sizeof(struct cpl_tx_pkt_lso_core);
	ndesc = tx_len16_to_desc(len16);

	/* Firmware work request header */
	eq = &txq->eq;
	wr = (void *)&eq->desc[eq->pidx];
	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_VM_WR) |
	    V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));

	ctrl = V_FW_WR_LEN16(len16);
	wr->equiq_to_len16 = htobe32(ctrl);
	wr->r3[0] = 0;
	wr->r3[1] = 0;

	/*
	 * Copy over ethmacdst, ethmacsrc, ethtype, and vlantci.
	 * vlantci is ignored unless the ethtype is 0x8100, so it's
	 * simpler to always copy it rather than making it
	 * conditional.  Also, it seems that we do not have to set
	 * vlantci or fake the ethtype when doing VLAN tag insertion.
	 */
	m_copydata(m0, 0, VM_TX_L2HDR_LEN, wr->ethmacdst);

	/* The CPL follows the WR header, after the LSO CPL if there is one. */
	if (needs_tso(m0)) {
		cpl = write_lso_cpl(wr + 1, m0);
		txq->tso_wrs++;
	} else
		cpl = (void *)(wr + 1);

	/* Checksum offload */
	ctrl1 = csum_to_ctrl(sc, m0);
	if (ctrl1 != (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS))
		txq->txcsum++;	/* some hardware assistance provided */

	/* VLAN tag insertion */
	if (needs_vlan_insertion(m0)) {
		ctrl1 |= F_TXPKT_VLAN_VLD |
		    V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
		txq->vlan_insertion++;
	} else if (sc->vlan_id)
		/* Fall back to the adapter-wide VLAN id when one is set. */
		ctrl1 |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(sc->vlan_id);

	/* CPL header */
	cpl->ctrl0 = txq->cpl_ctrl0;
	cpl->pack = 0;
	cpl->len = htobe16(pktlen);
	cpl->ctrl1 = htobe64(ctrl1);

	/* SGL */
	dst = (void *)(cpl + 1);

	/*
	 * A packet using TSO will use up an entire descriptor for the
	 * firmware work request header, LSO CPL, and TX_PKT_XT CPL.
	 * If this descriptor is the last descriptor in the ring, wrap
	 * around to the front of the ring explicitly for the start of
	 * the sgl.
	 */
	if (dst == (void *)&eq->desc[eq->sidx]) {
		dst = (void *)&eq->desc[0];
		write_gl_to_txd(txq, m0, &dst, 0);
	} else
		/* checkwrap is set only if the WR extends past the ring end. */
		write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx);
	txq->sgl_wrs++;
	txq->txpkt_wrs++;

	/* Record the mbuf and descriptor count in the software descriptor. */
	txsd = &txq->sdesc[eq->pidx];
	txsd->m = m0;
	txsd->desc_used = ndesc;

	return (ndesc);
}

/*
 * Write a raw WR to the hardware descriptors, update the software
 * descriptor, and advance the pidx.  It is guaranteed that enough
 * descriptors are available.
 *
 * The return value is the # of hardware descriptors used.
 *
 * The contents of every mbuf in the m0 chain are copied, in order, to the
 * location 'wr'.  'available' is used only to assert that the WR fits.
 *
 * NOTE(review): eq->pidx is only read in this function; presumably the caller
 * advances it by the returned descriptor count -- confirm against the caller.
 */
static u_int
write_raw_wr(struct sge_txq *txq, void *wr, struct mbuf *m0, u_int available)
{
	struct sge_eq *eq = &txq->eq;
	struct tx_sdesc *txsd;
	struct mbuf *m;
	caddr_t dst;
	int len16, ndesc;

	len16 = mbuf_len16(m0);
	ndesc = tx_len16_to_desc(len16);
	MPASS(ndesc <= available);

	dst = wr;
	for (m = m0; m != NULL; m = m->m_next)
		copy_to_txd(eq, mtod(m, caddr_t), &dst, m->m_len);

	txq->raw_wrs++;

	txsd = &txq->sdesc[eq->pidx];
	txsd->m = m0;
	txsd->desc_used = ndesc;

	return (ndesc);
}

/*
 * Write a txpkt WR for this packet to the hardware descriptors, update the
 * software descriptor, and advance the pidx.  It is guaranteed that enough
 * descriptors are available.
 *
 * The return value is the # of hardware descriptors used.
*/
/*
 * NOTE(review): eq->pidx is only read in this function; presumably the caller
 * advances it by the returned descriptor count -- confirm against the caller.
 */
static u_int
write_txpkt_wr(struct adapter *sc, struct sge_txq *txq, struct mbuf *m0,
    u_int available)
{
	struct sge_eq *eq;
	struct fw_eth_tx_pkt_wr *wr;
	struct tx_sdesc *txsd;
	struct cpl_tx_pkt_core *cpl;
	uint32_t ctrl;	/* used in many unrelated places */
	uint64_t ctrl1;
	int len16, ndesc, pktlen, nsegs;
	caddr_t dst;

	TXQ_LOCK_ASSERT_OWNED(txq);
	M_ASSERTPKTHDR(m0);

	len16 = mbuf_len16(m0);
	nsegs = mbuf_nsegs(m0);
	pktlen = m0->m_pkthdr.len;
	/*
	 * Immediate data length: always the CPL, plus an LSO CPL for TSO or
	 * the packet itself when it is sent as immediate data.
	 */
	ctrl = sizeof(struct cpl_tx_pkt_core);
	if (needs_tso(m0)) {
		if (needs_vxlan_tso(m0))
			ctrl += sizeof(struct cpl_tx_tnl_lso);
		else
			ctrl += sizeof(struct cpl_tx_pkt_lso_core);
	} else if (!(mbuf_cflags(m0) & MC_NOMAP) && pktlen <= imm_payload(2) &&
	    available >= 2) {
		/* Immediate data.  Recalculate len16 and set nsegs to 0. */
		ctrl += pktlen;
		len16 = howmany(sizeof(struct fw_eth_tx_pkt_wr) +
		    sizeof(struct cpl_tx_pkt_core) + pktlen, 16);
		nsegs = 0;
	}
	ndesc = tx_len16_to_desc(len16);
	MPASS(ndesc <= available);

	/* Firmware work request header */
	eq = &txq->eq;
	wr = (void *)&eq->desc[eq->pidx];
	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) |
	    V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));

	ctrl = V_FW_WR_LEN16(len16);
	wr->equiq_to_len16 = htobe32(ctrl);
	wr->r3 = 0;

	/* The CPL follows the WR header, after the LSO CPL if there is one. */
	if (needs_tso(m0)) {
		if (needs_vxlan_tso(m0)) {
			cpl = write_tnl_lso_cpl(wr + 1, m0);
			txq->vxlan_tso_wrs++;
		} else {
			cpl = write_lso_cpl(wr + 1, m0);
			txq->tso_wrs++;
		}
	} else
		cpl = (void *)(wr + 1);

	/* Checksum offload */
	ctrl1 = csum_to_ctrl(sc, m0);
	if (ctrl1 != (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS)) {
		/* some hardware assistance provided */
		if (needs_vxlan_csum(m0))
			txq->vxlan_txcsum++;
		else
			txq->txcsum++;
	}

	/* VLAN tag insertion */
	if (needs_vlan_insertion(m0)) {
		ctrl1 |= F_TXPKT_VLAN_VLD |
		    V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
		txq->vlan_insertion++;
	}

	/* CPL header */
	cpl->ctrl0 = txq->cpl_ctrl0;
	cpl->pack = 0;
	cpl->len = htobe16(pktlen);
	cpl->ctrl1 = htobe64(ctrl1);

	/* SGL */
	dst = (void *)(cpl + 1);
	/* Wrap to the start of the ring if the headers ended exactly at its end. */
	if (__predict_false((uintptr_t)dst == (uintptr_t)&eq->desc[eq->sidx]))
		dst = (caddr_t)&eq->desc[0];
	if (nsegs > 0) {

		write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx);
		txq->sgl_wrs++;
	} else {
		struct mbuf *m;

		/* Immediate data: copy the packet itself into the ring. */
		for (m = m0; m != NULL; m = m->m_next) {
			copy_to_txd(eq, mtod(m, caddr_t), &dst, m->m_len);
#ifdef INVARIANTS
			pktlen -= m->m_len;
#endif
		}
#ifdef INVARIANTS
		KASSERT(pktlen == 0, ("%s: %d bytes left.", __func__, pktlen));
#endif
		txq->imm_wrs++;
	}

	txq->txpkt_wrs++;

	/* Record the mbuf and descriptor count in the software descriptor. */
	txsd = &txq->sdesc[eq->pidx];
	txsd->m = m0;
	txsd->desc_used = ndesc;

	return (ndesc);
}

/*
 * Returns true if the start of m's data differs from the L2 header saved in
 * txp.  VM_TX_L2HDR_LEN bytes are compared when the saved ethtype matches the
 * VLAN ethertype, sizeof(struct ether_header) bytes otherwise.
 *
 * NOTE(review): be16toh() is applied to the host-order constant
 * ETHERTYPE_VLAN; if txp->ethtype holds the raw (network order) bytes then
 * htobe16() would state the intent more directly.  Both perform the same
 * byte swap.
 */
static inline bool
cmp_l2hdr(struct txpkts *txp, struct mbuf *m)
{
	int len;

	MPASS(txp->npkt > 0);
	MPASS(m->m_len >= VM_TX_L2HDR_LEN);

	if (txp->ethtype == be16toh(ETHERTYPE_VLAN))
		len = VM_TX_L2HDR_LEN;
	else
		len = sizeof(struct ether_header);

	return (memcmp(m->m_data, &txp->ethmacdst[0], len) != 0);
}

/*
 * Save the first VM_TX_L2HDR_LEN bytes of m's data in txp, starting at
 * txp->ethmacdst, for later comparison by cmp_l2hdr.
 */
static inline void
save_l2hdr(struct txpkts *txp, struct mbuf *m)
{
	MPASS(m->m_len >= VM_TX_L2HDR_LEN);

	memcpy(&txp->ethmacdst[0], mtod(m, const void *), VM_TX_L2HDR_LEN);
}

/*
 * Try to add packet 'm' to the txpkts WR being assembled in txq->txp, using
 * the VF rules (type 1 only, all packets with the same L2 header).  'avail'
 * is the number of descriptors available.
 *
 * Return value and *send:
 *   0       m was added; *send is true if the WR is now full.
 *   EAGAIN  m does not fit in or match the current WR; *send is true.
 *   EINVAL  m cannot be coalesced at all; *send is true if txp holds
 *           packets.
 */
static int
add_to_txpkts_vf(struct adapter *sc, struct sge_txq *txq, struct mbuf *m,
    int avail, bool *send)
{
	struct txpkts *txp = &txq->txp;

	/* Cannot have TSO and coalesce at the same time. */
	if (cannot_use_txpkts(m)) {
cannot_coalesce:
		*send = txp->npkt > 0;
		return (EINVAL);
	}

	/* VF allows coalescing of type 1 (1 GL) only */
	if (mbuf_nsegs(m) > 1)
		goto cannot_coalesce;

	*send = false;
	if (txp->npkt > 0) {
		MPASS(tx_len16_to_desc(txp->len16) <= avail);
		MPASS(txp->npkt < txp->max_npkt);
		MPASS(txp->wr_type == 1);	/* VF supports type 1 only */

		if (tx_len16_to_desc(txp->len16 + txpkts1_len16()) > avail) {
retry_after_send:
			*send = true;
			return (EAGAIN);
		}
		/* Total payload is limited to 65535 bytes. */
		if (m->m_pkthdr.len + txp->plen > 65535)
			goto retry_after_send;
		if (cmp_l2hdr(txp, m))
			goto retry_after_send;

		txp->len16 += txpkts1_len16();
		txp->plen += m->m_pkthdr.len;
		txp->mb[txp->npkt++] = m;
		if (txp->npkt == txp->max_npkt)
			*send = true;
	} else {
		/* First packet: start a new WR and remember its L2 header. */
		txp->len16 = howmany(sizeof(struct fw_eth_tx_pkts_vm_wr), 16) +
		    txpkts1_len16();
		if (tx_len16_to_desc(txp->len16) > avail)
			goto cannot_coalesce;
		txp->npkt = 1;
		txp->wr_type = 1;
		txp->plen = m->m_pkthdr.len;
		txp->mb[0] = m;
		save_l2hdr(txp, m);
	}
	return (0);
}

/*
 * Try to add packet 'm' to the txpkts WR being assembled in txq->txp, using
 * the PF rules.  The first packet selects the WR type: type 0 if it has more
 * than one segment, type 1 otherwise.  A type 1 WR accepts only
 * single-segment packets.  'avail' is the number of descriptors available.
 *
 * Return value and *send:
 *   0       m was added; *send is true if the WR is now full.
 *   EAGAIN  m does not fit in the current WR; *send is true.
 *   EINVAL  m cannot be coalesced at all; *send is true if txp holds
 *           packets.
 */
static int
add_to_txpkts_pf(struct adapter *sc, struct sge_txq *txq, struct mbuf *m,
    int avail, bool *send)
{
	struct txpkts *txp = &txq->txp;
	int nsegs;

	MPASS(!(sc->flags & IS_VF));

	/* Cannot have TSO and coalesce at the same time. */
	if (cannot_use_txpkts(m)) {
cannot_coalesce:
		*send = txp->npkt > 0;
		return (EINVAL);
	}

	*send = false;
	nsegs = mbuf_nsegs(m);
	if (txp->npkt == 0) {
		/* Total payload is limited to 65535 bytes. */
		if (m->m_pkthdr.len > 65535)
			goto cannot_coalesce;
		if (nsegs > 1) {
			txp->wr_type = 0;
			txp->len16 =
			    howmany(sizeof(struct fw_eth_tx_pkts_wr), 16) +
			    txpkts0_len16(nsegs);
		} else {
			txp->wr_type = 1;
			txp->len16 =
			    howmany(sizeof(struct fw_eth_tx_pkts_wr), 16) +
			    txpkts1_len16();
		}
		if (tx_len16_to_desc(txp->len16) > avail)
			goto cannot_coalesce;
		txp->npkt = 1;
		txp->plen = m->m_pkthdr.len;
		txp->mb[0] = m;
	} else {
		MPASS(tx_len16_to_desc(txp->len16) <= avail);
		MPASS(txp->npkt < txp->max_npkt);

		if (m->m_pkthdr.len + txp->plen > 65535) {
retry_after_send:
			*send = true;
			return (EAGAIN);
		}

		MPASS(txp->wr_type == 0 || txp->wr_type == 1);
		if (txp->wr_type == 0) {
			/* Type 0 is also capped at SGE_MAX_WR_NDESC. */
			if (tx_len16_to_desc(txp->len16 +
			    txpkts0_len16(nsegs)) > min(avail, SGE_MAX_WR_NDESC))
				goto retry_after_send;
			txp->len16 += txpkts0_len16(nsegs);
		} else {
			if (nsegs != 1)
				goto retry_after_send;
			if (tx_len16_to_desc(txp->len16 + txpkts1_len16()) >
			    avail)
				goto retry_after_send;
			txp->len16 += txpkts1_len16();
		}

		txp->plen += m->m_pkthdr.len;
		txp->mb[txp->npkt++] = m;
		if (txp->npkt == txp->max_npkt)
			*send = true;
	}
	return (0);
}

/*
 * Write a txpkts WR for the packets in txp to the hardware descriptors, update
 * the software descriptor, and advance the pidx.  It is guaranteed that enough
 * descriptors are available.
 *
 * The return value is the # of hardware descriptors used.
*/ static u_int write_txpkts_wr(struct adapter *sc, struct sge_txq *txq) { const struct txpkts *txp = &txq->txp; struct sge_eq *eq = &txq->eq; struct fw_eth_tx_pkts_wr *wr; struct tx_sdesc *txsd; struct cpl_tx_pkt_core *cpl; uint64_t ctrl1; int ndesc, i, checkwrap; struct mbuf *m, *last; void *flitp; TXQ_LOCK_ASSERT_OWNED(txq); MPASS(txp->npkt > 0); MPASS(txp->len16 <= howmany(SGE_MAX_WR_LEN, 16)); wr = (void *)&eq->desc[eq->pidx]; wr->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_WR)); wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(txp->len16)); wr->plen = htobe16(txp->plen); wr->npkt = txp->npkt; wr->r3 = 0; wr->type = txp->wr_type; flitp = wr + 1; /* * At this point we are 16B into a hardware descriptor. If checkwrap is * set then we know the WR is going to wrap around somewhere. We'll * check for that at appropriate points. */ ndesc = tx_len16_to_desc(txp->len16); last = NULL; checkwrap = eq->sidx - ndesc < eq->pidx; for (i = 0; i < txp->npkt; i++) { m = txp->mb[i]; if (txp->wr_type == 0) { struct ulp_txpkt *ulpmc; struct ulptx_idata *ulpsc; /* ULP master command */ ulpmc = flitp; ulpmc->cmd_dest = htobe32(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0) | V_ULP_TXPKT_FID(eq->iqid)); ulpmc->len = htobe32(txpkts0_len16(mbuf_nsegs(m))); /* ULP subcommand */ ulpsc = (void *)(ulpmc + 1); ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM) | F_ULP_TX_SC_MORE); ulpsc->len = htobe32(sizeof(struct cpl_tx_pkt_core)); cpl = (void *)(ulpsc + 1); if (checkwrap && (uintptr_t)cpl == (uintptr_t)&eq->desc[eq->sidx]) cpl = (void *)&eq->desc[0]; } else { cpl = flitp; } /* Checksum offload */ ctrl1 = csum_to_ctrl(sc, m); if (ctrl1 != (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS)) { /* some hardware assistance provided */ if (needs_vxlan_csum(m)) txq->vxlan_txcsum++; else txq->txcsum++; } /* VLAN tag insertion */ if (needs_vlan_insertion(m)) { ctrl1 |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m->m_pkthdr.ether_vtag); txq->vlan_insertion++; } /* CPL header */ cpl->ctrl0 = txq->cpl_ctrl0; cpl->pack = 0; 
cpl->len = htobe16(m->m_pkthdr.len);
		cpl->ctrl1 = htobe64(ctrl1);

		/* The SGL follows the CPL; restart at desc[0] at the ring end. */
		flitp = cpl + 1;
		if (checkwrap &&
		    (uintptr_t)flitp == (uintptr_t)&eq->desc[eq->sidx])
			flitp = (void *)&eq->desc[0];

		write_gl_to_txd(txq, m, (caddr_t *)(&flitp), checkwrap);

		/* Link the packets of this WR together through m_nextpkt. */
		if (last != NULL)
			last->m_nextpkt = m;
		last = m;
	}

	/* Statistics are kept separately for type 0 and type 1 WRs. */
	txq->sgl_wrs++;
	if (txp->wr_type == 0) {
		txq->txpkts0_pkts += txp->npkt;
		txq->txpkts0_wrs++;
	} else {
		txq->txpkts1_pkts += txp->npkt;
		txq->txpkts1_wrs++;
	}

	/*
	 * The software descriptor at pidx records the first mbuf of the chain
	 * and the number of hardware descriptors this WR occupies.
	 */
	txsd = &txq->sdesc[eq->pidx];
	txsd->m = txp->mb[0];
	txsd->desc_used = ndesc;

	return (ndesc);
}

/*
 * Writes one FW_ETH_TX_PKTS_VM_WR at eq->desc[eq->pidx] for the packets held
 * in txq->txp.  Only wr_type 1 is accepted and every mbuf must have exactly
 * one segment (both are asserted).  The txq lock must be held.
 */
static u_int
write_txpkts_vm_wr(struct adapter *sc, struct sge_txq *txq)
{
	const struct txpkts *txp = &txq->txp;
	struct sge_eq *eq = &txq->eq;
	struct fw_eth_tx_pkts_vm_wr *wr;
	struct tx_sdesc *txsd;
	struct cpl_tx_pkt_core *cpl;
	uint64_t ctrl1;
	int ndesc, i;
	struct mbuf *m, *last;
	void *flitp;

	TXQ_LOCK_ASSERT_OWNED(txq);
	MPASS(txp->npkt > 0);
	MPASS(txp->wr_type == 1);	/* VF supports type 1 only */
	MPASS(txp->mb[0] != NULL);
	MPASS(txp->len16 <= howmany(SGE_MAX_WR_LEN, 16));

	wr = (void *)&eq->desc[eq->pidx];
	wr->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_VM_WR));
	wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(txp->len16));
	wr->r3 = 0;
	wr->plen = htobe16(txp->plen);
	wr->npkt = txp->npkt;
	wr->r4 = 0;
	/* Copies 16 bytes starting at ethmacdst from the txpkts state. */
	memcpy(&wr->ethmacdst[0], &txp->ethmacdst[0], 16);
	flitp = wr + 1;

	/*
	 * At this point we are 32B into a hardware descriptor. Each mbuf in
	 * the WR will take 32B so we check for the end of the descriptor ring
	 * before writing odd mbufs (mb[1], 3, 5, ..)
*/
	ndesc = tx_len16_to_desc(txp->len16);
	last = NULL;
	for (i = 0; i < txp->npkt; i++) {
		m = txp->mb[i];
		/* Only odd-numbered packets can start at the end of the ring. */
		if (i & 1 && (uintptr_t)flitp == (uintptr_t)&eq->desc[eq->sidx])
			flitp = &eq->desc[0];
		cpl = flitp;

		/* Checksum offload */
		ctrl1 = csum_to_ctrl(sc, m);
		if (ctrl1 != (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS))
			txq->txcsum++;	/* some hardware assistance provided */

		/* VLAN tag insertion */
		if (needs_vlan_insertion(m)) {
			ctrl1 |= F_TXPKT_VLAN_VLD |
			    V_TXPKT_VLAN(m->m_pkthdr.ether_vtag);
			txq->vlan_insertion++;
		} else if (sc->vlan_id)
			/* No tag on the mbuf: fall back to the adapter's vlan_id. */
			ctrl1 |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(sc->vlan_id);

		/* CPL header */
		cpl->ctrl0 = txq->cpl_ctrl0;
		cpl->pack = 0;
		cpl->len = htobe16(m->m_pkthdr.len);
		cpl->ctrl1 = htobe64(ctrl1);

		flitp = cpl + 1;
		MPASS(mbuf_nsegs(m) == 1);
		/* checkwrap is 0: the SGL is written without wrap checks. */
		write_gl_to_txd(txq, m, (caddr_t *)(&flitp), 0);

		/* Link the packets of this WR together through m_nextpkt. */
		if (last != NULL)
			last->m_nextpkt = m;
		last = m;
	}

	txq->sgl_wrs++;
	txq->txpkts1_pkts += txp->npkt;
	txq->txpkts1_wrs++;

	/* Record the chain head and descriptor count for this WR. */
	txsd = &txq->sdesc[eq->pidx];
	txsd->m = txp->mb[0];
	txsd->desc_used = ndesc;

	return (ndesc);
}

/*
 * If the SGL ends on an address that is not 16 byte aligned, this function will
 * add a 0 filled flit at the end.
 *
 * The scatter/gather list for mbuf m is obtained with get_pkt_gl() into
 * txq->gl and written as a ulptx_sgl starting at *to, which must be 16 byte
 * aligned and inside the descriptor ring.  On return *to points just past the
 * SGL (or at eq->desc if that position is the end of the ring).  When
 * checkwrap is 0 the SGL is written without any wrap-around handling.
 */
static void
write_gl_to_txd(struct sge_txq *txq, struct mbuf *m, caddr_t *to, int checkwrap)
{
	struct sge_eq *eq = &txq->eq;
	struct sglist *gl = txq->gl;
	struct sglist_seg *seg;
	__be64 *flitp, *wrap;
	struct ulptx_sgl *usgl;
	int i, nflits, nsegs;

	KASSERT(((uintptr_t)(*to) & 0xf) == 0,
	    ("%s: SGL must start at a 16 byte boundary: %p", __func__, *to));
	MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]);
	MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]);

	get_pkt_gl(m, gl);
	nsegs = gl->sg_nseg;
	MPASS(nsegs > 0);

	/*
	 * Two flits for the header and first segment, then three flits per
	 * pair of additional segments (two for an unpaired last one).
	 */
	nflits = (3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1) + 2;
	flitp = (__be64 *)(*to);
	wrap = (__be64 *)(&eq->desc[eq->sidx]);
	seg = &gl->sg_segs[0];
	usgl = (void *)flitp;

	/*
	 * We start at a 16 byte boundary somewhere inside the tx descriptor
	 * ring, so we're at least 16 bytes away from the status page.
There is
	 * no chance of a wrap around in the middle of usgl (which is 16 bytes).
	 */

	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
	    V_ULPTX_NSGE(nsegs));
	usgl->len0 = htobe32(seg->ss_len);
	usgl->addr0 = htobe64(seg->ss_paddr);
	seg++;

	if (checkwrap == 0 || (uintptr_t)(flitp + nflits) <= (uintptr_t)wrap) {

		/* Won't wrap around at all */

		for (i = 0; i < nsegs - 1; i++, seg++) {
			usgl->sge[i / 2].len[i & 1] = htobe32(seg->ss_len);
			usgl->sge[i / 2].addr[i & 1] = htobe64(seg->ss_paddr);
		}
		/* Odd number of extra segments: zero the unused length slot. */
		if (i & 1)
			usgl->sge[i / 2].len[1] = htobe32(0);
		flitp += nflits;
	} else {

		/* Will wrap somewhere in the rest of the SGL */

		/* 2 flits already written, write the rest flit by flit */
		flitp = (void *)(usgl + 1);
		for (i = 0; i < nflits - 2; i++) {
			if (flitp == wrap)
				flitp = (void *)eq->desc;
			*flitp++ = get_flit(seg, nsegs - 1, i);
		}
	}

	/* An odd flit count leaves us 8 bytes short of a 16 byte boundary. */
	if (nflits & 1) {
		MPASS(((uintptr_t)flitp) & 0xf);
		*flitp++ = 0;
	}

	MPASS((((uintptr_t)flitp) & 0xf) == 0);
	if (__predict_false(flitp == wrap))
		*to = (void *)eq->desc;
	else
		*to = (void *)flitp;
}

/*
 * Copies len bytes from 'from' into the descriptor ring at *to and advances
 * *to past them.  If the data does not fit before eq->desc[eq->sidx] the
 * remainder is copied to the start of the ring.
 */
static inline void
copy_to_txd(struct sge_eq *eq, caddr_t from, caddr_t *to, int len)
{

	MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]);
	MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]);

	if (__predict_true((uintptr_t)(*to) + len <=
	    (uintptr_t)&eq->desc[eq->sidx])) {
		bcopy(from, *to, len);
		(*to) += len;
	} else {
		/* First the part that fits before the end of the ring ... */
		int portion = (uintptr_t)&eq->desc[eq->sidx] - (uintptr_t)(*to);

		bcopy(from, *to, portion);
		from += portion;
		portion = len - portion;	/* remaining */
		/* ... then the rest at the beginning of the ring. */
		bcopy(from, (void *)eq->desc, portion);
		(*to) = (caddr_t)eq->desc + portion;
	}
}

/*
 * Rings the doorbell of eq for n new descriptors, using the mechanism that
 * corresponds to the lowest bit set in eq->doorbells, and then advances
 * eq->dbidx by n.  DOORBELL_WCWR is not considered when n > 1.
 */
static inline void
ring_eq_db(struct adapter *sc, struct sge_eq *eq, u_int n)
{
	u_int db;

	MPASS(n > 0);

	db = eq->doorbells;
	if (n > 1)
		clrbit(&db, DOORBELL_WCWR);
	/* Barrier before the doorbell write below. */
	wmb();

	switch (ffs(db) - 1) {
	case DOORBELL_UDB:
		*eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n));
		break;

	case DOORBELL_WCWR: {
		volatile uint64_t *dst, *src;
		int i;

		/*
		 * Queues whose 128B doorbell segment fits in the page do not
		 * use relative qid
(udb_qid is always 0). Only queues with
		 * doorbell segments can do WCWR.
		 */
		KASSERT(eq->udb_qid == 0 && n == 1,
		    ("%s: inappropriate doorbell (0x%x, %d, %d) for eq %p",
		    __func__, eq->doorbells, n, eq->dbidx, eq));

		dst = (volatile void *)((uintptr_t)eq->udb + UDBS_WR_OFFSET -
		    UDBS_DB_OFFSET);
		/* Copy the whole descriptor at dbidx, 64 bits at a time. */
		i = eq->dbidx;
		src = (void *)&eq->desc[i];
		while (src != (void *)&eq->desc[i + 1])
			*dst++ = *src++;
		wmb();
		break;
	}

	case DOORBELL_UDBWC:
		*eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n));
		wmb();
		break;

	case DOORBELL_KDB:
		t4_write_reg(sc, sc->sge_kdoorbell_reg,
		    V_QID(eq->cntxt_id) | V_PIDX(n));
		break;
	}

	IDXINCR(eq->dbidx, n, eq->sidx);
}

/*
 * Returns IDXDIFF(hw_cidx, eq->cidx, eq->sidx), i.e. how far the hardware
 * consumer index is ahead of the driver's cidx.
 */
static inline u_int
reclaimable_tx_desc(struct sge_eq *eq)
{
	uint16_t hw_cidx;

	hw_cidx = read_hw_cidx(eq);
	return (IDXDIFF(hw_cidx, eq->cidx, eq->sidx));
}

/*
 * Returns the number of descriptors between pidx and the hardware cidx,
 * less one.  When the two indices are equal the result is sidx - 1.
 */
static inline u_int
total_available_tx_desc(struct sge_eq *eq)
{
	uint16_t hw_cidx, pidx;

	hw_cidx = read_hw_cidx(eq);
	pidx = eq->pidx;

	if (pidx == hw_cidx)
		return (eq->sidx - 1);
	else
		return (IDXDIFF(hw_cidx, pidx, eq->sidx) - 1);
}

/*
 * Reads the consumer index from the sge_qstat structure located at
 * eq->desc[eq->sidx] and returns it in host byte order.
 */
static inline uint16_t
read_hw_cidx(struct sge_eq *eq)
{
	struct sge_qstat *spg = (void *)&eq->desc[eq->sidx];
	uint16_t cidx = spg->cidx;	/* stable snapshot */

	return (be16toh(cidx));
}

/*
 * Reclaim 'n' descriptors approximately.
 *
 * Whole work requests are reclaimed one at a time, so the return value (the
 * number of descriptors actually reclaimed) can exceed n.  The txq lock must
 * be held.
 */
static u_int
reclaim_tx_descs(struct sge_txq *txq, u_int n)
{
	struct tx_sdesc *txsd;
	struct sge_eq *eq = &txq->eq;
	u_int can_reclaim, reclaimed;

	TXQ_LOCK_ASSERT_OWNED(txq);
	MPASS(n > 0);

	reclaimed = 0;
	can_reclaim = reclaimable_tx_desc(eq);
	while (can_reclaim && reclaimed < n) {
		int ndesc;
		struct mbuf *m, *nextpkt;

		txsd = &txq->sdesc[eq->cidx];
		ndesc = txsd->desc_used;

		/* Firmware doesn't return "partial" credits.
*/
		KASSERT(can_reclaim >= ndesc,
		    ("%s: unexpected number of credits: %d, %d",
		    __func__, can_reclaim, ndesc));
		KASSERT(ndesc != 0,
		    ("%s: descriptor with no credits: cidx %d",
		    __func__, eq->cidx));

		/* Free every packet chained through m_nextpkt for this WR. */
		for (m = txsd->m; m != NULL; m = nextpkt) {
			nextpkt = m->m_nextpkt;
			m->m_nextpkt = NULL;
			m_freem(m);
		}
		reclaimed += ndesc;
		can_reclaim -= ndesc;
		IDXINCR(eq->cidx, ndesc, eq->sidx);
	}

	return (reclaimed);
}

/*
 * Reclaims completed tx descriptors of the txq passed in arg, 32 at a time,
 * until nothing more is reclaimed or the txq lock cannot be acquired.  The
 * incoming value of n is not used; n is reused for the per-pass result.
 * Presumably the handler for txq->tx_reclaim_task -- confirm at the TASK_INIT
 * site.
 */
static void
tx_reclaim(void *arg, int n)
{
	struct sge_txq *txq = arg;
	struct sge_eq *eq = &txq->eq;

	do {
		/* Never block here; give up if the lock is contended. */
		if (TXQ_TRYLOCK(txq) == 0)
			break;
		n = reclaim_tx_descs(txq, 32);
		if (eq->cidx == eq->pidx)
			eq->equeqidx = eq->pidx;
		TXQ_UNLOCK(txq);
	} while (n > 0);
}

/*
 * Returns, in big-endian form, flit number idx of an SGL built from segs.
 * Flits come in groups of three per pair of segments: the two lengths packed
 * into one flit (first segment in the upper 32 bits, second one only if it
 * exists), then the address of the first segment, then the address of the
 * second.
 */
static __be64
get_flit(struct sglist_seg *segs, int nsegs, int idx)
{
	/* Index of the first segment of the pair that flit idx belongs to. */
	int i = (idx / 3) * 2;

	switch (idx % 3) {
	case 0: {
		uint64_t rc;

		rc = (uint64_t)segs[i].ss_len << 32;
		if (i + 1 < nsegs)
			rc |= (uint64_t)(segs[i + 1].ss_len);

		return (htobe64(rc));
	}
	case 1:
		return (htobe64(segs[i].ss_paddr));
	case 2:
		return (htobe64(segs[i + 1].ss_paddr));
	}

	return (0);
}

/*
 * Picks an entry of sc->sge.rx_buf_info for refilling a freelist and returns
 * its index.  Entries are scanned in order; the first usable one that can
 * hold maxp bytes wins.  If none is big enough the last usable entry seen is
 * returned, or -1 if there was none.  With packing enabled hwidx2/size2 are
 * used for the checks instead of hwidx1/size1.
 */
static int
find_refill_source(struct adapter *sc, int maxp, bool packing)
{
	int i, zidx = -1;
	struct rx_buf_info *rxb = &sc->sge.rx_buf_info[0];

	if (packing) {
		for (i = 0; i < SW_ZONE_SIZES; i++, rxb++) {
			if (rxb->hwidx2 == -1)
				continue;
			if (rxb->size1 < PAGE_SIZE &&
			    rxb->size1 < largest_rx_cluster)
				continue;
			/* Stop at the first entry above the allowed cluster size. */
			if (rxb->size1 > largest_rx_cluster)
				break;
			MPASS(rxb->size1 - rxb->size2 >= CL_METADATA_SIZE);
			if (rxb->size2 >= maxp)
				return (i);
			zidx = i;
		}
	} else {
		for (i = 0; i < SW_ZONE_SIZES; i++, rxb++) {
			if (rxb->hwidx1 == -1)
				continue;
			/* Stop at the first entry above the allowed cluster size. */
			if (rxb->size1 > largest_rx_cluster)
				break;
			if (rxb->size1 >= maxp)
				return (i);
			zidx = i;
		}
	}

	return (zidx);
}

/*
 * Marks fl as starving, appends it to the adapter's sfl list and (re)arms
 * sfl_callout to run refill_sfl after hz / 5 ticks.  Nothing is done for a
 * freelist that has FL_DOOMED set.  sfl_lock is taken before the fl lock.
 */
static void
add_fl_to_sfl(struct adapter *sc, struct sge_fl *fl)
{
	mtx_lock(&sc->sfl_lock);
	FL_LOCK(fl);
	if ((fl->flags & FL_DOOMED) == 0) {
		fl->flags |= FL_STARVING;
		TAILQ_INSERT_TAIL(&sc->sfl, fl, link);
		callout_reset(&sc->sfl_callout, hz / 5, refill_sfl, sc);
	}
	FL_UNLOCK(fl);
	mtx_unlock(&sc->sfl_lock);
}

static void
handle_wrq_egr_update(struct adapter *sc, struct sge_eq *eq)
{
	/* The eq is treated as the start of its enclosing sge_wrq. */
	struct sge_wrq *wrq = (void *)eq;

	/* Clear eq->equiq, then schedule the wrq's tx task on the port's taskqueue. */
	atomic_readandclear_int(&eq->equiq);
	taskqueue_enqueue(sc->tq[eq->port_id], &wrq->wrq_tx_task);
}

/*
 * Egress update handler for Ethernet tx queues: clears eq->equiq and then
 * either schedules the reclaim task (ring idle) or asks the mp_ring to check
 * its drainage.
 */
static void
handle_eth_egr_update(struct adapter *sc, struct sge_eq *eq)
{
	/* The eq is treated as the start of its enclosing sge_txq. */
	struct sge_txq *txq = (void *)eq;

	MPASS(eq->type == EQ_ETH);

	atomic_readandclear_int(&eq->equiq);
	if (mp_ring_is_idle(txq->r))
		taskqueue_enqueue(sc->tq[eq->port_id], &txq->tx_reclaim_task);
	else
		mp_ring_check_drainage(txq->r, 64);
}

/*
 * Handler for an SGE egress update message.  The egress queue is looked up
 * in the adapter's eqmap from the qid carried in the message and the update
 * is dispatched on eq->type.  No mbuf payload is expected.  Always returns 0.
 */
static int
handle_sge_egr_update(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	const struct cpl_sge_egr_update *cpl = (const void *)(rss + 1);
	unsigned int qid = G_EGR_QID(ntohl(cpl->opcode_qid));
	struct adapter *sc = iq->adapter;
	struct sge *s = &sc->sge;
	struct sge_eq *eq;
	/* Indexed by eq->type; there is no handler for type 0. */
	static void (*h[])(struct adapter *, struct sge_eq *) = {NULL,
		&handle_wrq_egr_update, &handle_eth_egr_update,
		&handle_wrq_egr_update};

	KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__,
	    rss->opcode));

	eq = s->eqmap[qid - s->eq_start - s->eq_base];
	(*h[eq->type])(sc, eq);

	return (0);
}

/* handle_fw_msg works for both fw4_msg and fw6_msg because this is valid */
CTASSERT(offsetof(struct cpl_fw4_msg, data) == \
    offsetof(struct cpl_fw6_msg, data));

/*
 * Handler for firmware messages.  A message of type FW_TYPE_RSSCPL or
 * FW6_TYPE_RSSCPL carries an embedded rss_header in its data and is passed
 * on to the CPL handler for the embedded opcode; any other message goes to
 * the handler registered for its type in t4_fw_msg_handler.  Returns whatever
 * the chosen handler returns.
 */
static int
handle_fw_msg(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_fw6_msg *cpl = (const void *)(rss + 1);

	KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__,
	    rss->opcode));

	if (cpl->type == FW_TYPE_RSSCPL || cpl->type == FW6_TYPE_RSSCPL) {
		const struct rss_header *rss2;

		rss2 = (const struct rss_header *)&cpl->data[0];
		return (t4_cpl_handler[rss2->opcode](iq, rss2, m));
	}

	return (t4_fw_msg_handler[cpl->type](sc, &cpl->data[0]));
}

/**
 *	t4_handle_wrerr_rpl - process a FW work request error message
 *	@adap: the adapter
 *	@rpl: start of the FW message
 *
 *	Logs the contents of the message.  Returns 0, or EINVAL if the opcode
 *	is not FW_ERROR_CMD or the error type is not recognized.
 */
static int
t4_handle_wrerr_rpl(struct adapter *adap, const __be64 *rpl)
{
	/* The opcode is the first byte of the message. */
	u8 opcode = *(const
u8 *)rpl;
	const struct fw_error_cmd *e = (const void *)rpl;
	unsigned int i;

	if (opcode != FW_ERROR_CMD) {
		log(LOG_ERR,
		    "%s: Received WRERR_RPL message with opcode %#x\n",
		    device_get_nameunit(adap->dev), opcode);
		return (EINVAL);
	}
	/* Common prefix; the per-type details are appended below. */
	log(LOG_ERR, "%s: FW_ERROR (%s) ", device_get_nameunit(adap->dev),
	    G_FW_ERROR_CMD_FATAL(be32toh(e->op_to_type)) ? "fatal" :
	    "non-fatal");
	switch (G_FW_ERROR_CMD_TYPE(be32toh(e->op_to_type))) {
	case FW_ERROR_TYPE_EXCEPTION:
		log(LOG_ERR, "exception info:\n");
		for (i = 0; i < nitems(e->u.exception.info); i++)
			log(LOG_ERR, "%s%08x", i == 0 ? "\t" : " ",
			    be32toh(e->u.exception.info[i]));
		log(LOG_ERR, "\n");
		break;
	case FW_ERROR_TYPE_HWMODULE:
		log(LOG_ERR, "HW module regaddr %08x regval %08x\n",
		    be32toh(e->u.hwmodule.regaddr),
		    be32toh(e->u.hwmodule.regval));
		break;
	case FW_ERROR_TYPE_WR:
		log(LOG_ERR, "WR cidx %d PF %d VF %d eqid %d hdr:\n",
		    be16toh(e->u.wr.cidx),
		    G_FW_ERROR_CMD_PFN(be16toh(e->u.wr.pfn_vfn)),
		    G_FW_ERROR_CMD_VFN(be16toh(e->u.wr.pfn_vfn)),
		    be32toh(e->u.wr.eqid));
		/* Dump the saved WR header byte by byte. */
		for (i = 0; i < nitems(e->u.wr.wrhdr); i++)
			log(LOG_ERR, "%s%02x", i == 0 ? "\t" : " ",
			    e->u.wr.wrhdr[i]);
		log(LOG_ERR, "\n");
		break;
	case FW_ERROR_TYPE_ACL:
		log(LOG_ERR, "ACL cidx %d PF %d VF %d eqid %d %s",
		    be16toh(e->u.acl.cidx),
		    G_FW_ERROR_CMD_PFN(be16toh(e->u.acl.pfn_vfn)),
		    G_FW_ERROR_CMD_VFN(be16toh(e->u.acl.pfn_vfn)),
		    be32toh(e->u.acl.eqid),
		    G_FW_ERROR_CMD_MV(be16toh(e->u.acl.mv_pkd)) ?
		    "vlanid" : "MAC");
		for (i = 0; i < nitems(e->u.acl.val); i++)
			log(LOG_ERR, " %02x", e->u.acl.val[i]);
		log(LOG_ERR, "\n");
		break;
	default:
		log(LOG_ERR, "type %#x\n",
		    G_FW_ERROR_CMD_TYPE(be32toh(e->op_to_type)));
		return (EINVAL);
	}
	return (0);
}

/*
 * Returns true if hardware buffer index idx is referenced (as hwidx1 or
 * hwidx2) by any rx_buf_info entry whose size1 does not exceed
 * largest_rx_cluster.
 */
static inline bool
bufidx_used(struct adapter *sc, int idx)
{
	struct rx_buf_info *rxb = &sc->sge.rx_buf_info[0];
	int i;

	for (i = 0; i < SW_ZONE_SIZES; i++, rxb++) {
		if (rxb->size1 > largest_rx_cluster)
			continue;
		if (rxb->hwidx1 == idx || rxb->hwidx2 == idx)
			return (true);
	}

	return (false);
}

/*
 * Sysctl handler that reports the SGE freelist buffer sizes as one string.
 * Sizes for which bufidx_used() is true are followed by a '*'.
 */
static int
sysctl_bufsizes(SYSCTL_HANDLER_ARGS)
{
	struct adapter *sc = arg1;
	struct sge_params *sp = &sc->params.sge;
	int i, rc;
	struct sbuf sb;
	char c;

	/* The return value of sbuf_new() is not checked. */
	sbuf_new(&sb, NULL, 128, SBUF_AUTOEXTEND);
	for (i = 0; i < SGE_FLBUF_SIZES; i++) {
		if (bufidx_used(sc, i))
			c = '*';
		else
			c = '\0';

		/*
		 * NOTE(review): for unused sizes c is '\0', so %c formats a
		 * NUL byte into the middle of the string -- confirm that
		 * sbuf_printf() and sysctl_handle_string() cope with this as
		 * intended.
		 */
		sbuf_printf(&sb, "%u%c ", sp->sge_fl_buffer_size[i], c);
	}
	sbuf_trim(&sb);
	sbuf_finish(&sb);
	rc = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
	sbuf_delete(&sb);
	return (rc);
}

#ifdef RATELIMIT
#if defined(INET) || defined(INET6)
/*
 * len16 for a txpkt WR with a GL. Includes the firmware work request header.
*/
static inline u_int
txpkt_eo_len16(u_int nsegs, u_int immhdrs, u_int tso)
{
	u_int n;

	MPASS(immhdrs > 0);

	/* WR header + CPL + immediate headers, padded to 16 bytes. */
	n = roundup2(sizeof(struct fw_eth_tx_eo_wr) +
	    sizeof(struct cpl_tx_pkt_core) + immhdrs, 16);
	if (__predict_false(nsegs == 0))
		goto done;

	nsegs--; /* first segment is part of ulptx_sgl */
	/* 8 bytes per flit; 3 flits per segment pair, 2 for a single one. */
	n += sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1));
	/* Note: the LSO CPL is only accounted for when there is an SGL. */
	if (tso)
		n += sizeof(struct cpl_tx_pkt_lso_core);

done:
	return (howmany(n, 16));
}
#endif

/*
 * Size of the flowc work request sent by send_etid_flowc_wr(): the fixed
 * header plus ETID_FLOWC_NPARAMS mnemonic/value pairs, in bytes (rounded up
 * to 16) and in 16 byte units.
 */
#define ETID_FLOWC_NPARAMS 6
#define ETID_FLOWC_LEN (roundup2((sizeof(struct fw_flowc_wr) + \
    ETID_FLOWC_NPARAMS * sizeof(struct fw_flowc_mnemval)), 16))
#define ETID_FLOWC_LEN16 (howmany(ETID_FLOWC_LEN, 16))

#if defined(INET) || defined(INET6)
/*
 * Sends the initial FW_FLOWC_WR for the rate tag's etid on cst->eo_txq.
 * Must be called with cst->lock held and with EO_FLOWC_PENDING set (and
 * EO_FLOWC_RPL_PENDING clear).  Returns 0 on success or ENOMEM if no work
 * request could be started, in which case the flags are left unchanged.
 */
static int
send_etid_flowc_wr(struct cxgbe_rate_tag *cst, struct port_info *pi,
    struct vi_info *vi)
{
	struct wrq_cookie cookie;
	u_int pfvf = pi->adapter->pf << S_FW_VIID_PFN;
	struct fw_flowc_wr *flowc;

	mtx_assert(&cst->lock, MA_OWNED);
	MPASS((cst->flags & (EO_FLOWC_PENDING | EO_FLOWC_RPL_PENDING)) ==
	    EO_FLOWC_PENDING);

	flowc = start_wrq_wr(&cst->eo_txq->wrq, ETID_FLOWC_LEN16, &cookie);
	if (__predict_false(flowc == NULL))
		return (ENOMEM);

	bzero(flowc, ETID_FLOWC_LEN);
	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
	    V_FW_FLOWC_WR_NPARAMS(ETID_FLOWC_NPARAMS) | V_FW_WR_COMPL(0));
	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(ETID_FLOWC_LEN16) |
	    V_FW_WR_FLOWID(cst->etid));
	/* The ETID_FLOWC_NPARAMS mnemonic/value pairs follow. */
	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
	flowc->mnemval[0].val = htobe32(pfvf);
	flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
	flowc->mnemval[1].val = htobe32(pi->tx_chan);
	flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
	flowc->mnemval[2].val = htobe32(pi->tx_chan);
	flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
	flowc->mnemval[3].val = htobe32(cst->iqid);
	flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_EOSTATE;
	flowc->mnemval[4].val = htobe32(FW_FLOWC_MNEM_EOSTATE_ESTABLISHED);
	flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS;
	flowc->mnemval[5].val = htobe32(cst->schedcl);
commit_wrq_wr(&cst->eo_txq->wrq, flowc, &cookie);

	/* The flowc has been sent; its reply is now outstanding. */
	cst->flags &= ~EO_FLOWC_PENDING;
	cst->flags |= EO_FLOWC_RPL_PENDING;
	MPASS(cst->tx_credits >= ETID_FLOWC_LEN16);	/* flowc is first WR. */
	cst->tx_credits -= ETID_FLOWC_LEN16;

	return (0);
}
#endif

/* Length, in 16 byte units, of a flowc work request without parameters. */
#define ETID_FLUSH_LEN16 (howmany(sizeof (struct fw_flowc_wr), 16))

/*
 * Sends a parameterless FW_FLOWC_WR that requests a completion for the rate
 * tag's etid, sets EO_FLUSH_RPL_PENDING and accounts for the credits and the
 * expected completion.  Must be called with cst->lock held.  Failure to
 * start the work request is not handled (CXGBE_UNIMPLEMENTED).
 */
void
send_etid_flush_wr(struct cxgbe_rate_tag *cst)
{
	struct fw_flowc_wr *flowc;
	struct wrq_cookie cookie;

	mtx_assert(&cst->lock, MA_OWNED);

	flowc = start_wrq_wr(&cst->eo_txq->wrq, ETID_FLUSH_LEN16, &cookie);
	if (__predict_false(flowc == NULL))
		CXGBE_UNIMPLEMENTED(__func__);

	bzero(flowc, ETID_FLUSH_LEN16 * 16);
	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
	    V_FW_FLOWC_WR_NPARAMS(0) | F_FW_WR_COMPL);
	flowc->flowid_len16 = htobe32(V_FW_WR_LEN16(ETID_FLUSH_LEN16) |
	    V_FW_WR_FLOWID(cst->etid));

	commit_wrq_wr(&cst->eo_txq->wrq, flowc, &cookie);

	cst->flags |= EO_FLUSH_RPL_PENDING;
	MPASS(cst->tx_credits >= ETID_FLUSH_LEN16);
	cst->tx_credits -= ETID_FLUSH_LEN16;
	cst->ncompl++;
}

/*
 * Fills in the FW_ETH_TX_EO_WR at wr for packet m0: the work request header
 * (UDP or TCP segmentation variant), an optional LSO CPL, the tx packet CPL,
 * the L2/L3/L4 headers as immediate data and, if the packet has payload
 * segments, an SGL.  compl requests a completion for this work request.
 * Must be called with cst->lock held.
 */
static void
write_ethofld_wr(struct cxgbe_rate_tag *cst, struct fw_eth_tx_eo_wr *wr,
    struct mbuf *m0, int compl)
{
	struct cpl_tx_pkt_core *cpl;
	uint64_t ctrl1;
	uint32_t ctrl;	/* used in many unrelated places */
	int len16, pktlen, nsegs, immhdrs;
	uintptr_t p;
	struct ulptx_sgl *usgl;
	struct sglist sg;
	struct sglist_seg segs[38];	/* XXX: find real limit.
XXX: get off the stack */

	mtx_assert(&cst->lock, MA_OWNED);
	M_ASSERTPKTHDR(m0);
	KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
	    m0->m_pkthdr.l4hlen > 0,
	    ("%s: ethofld mbuf %p is missing header lengths", __func__, m0));

	len16 = mbuf_eo_len16(m0);
	nsegs = mbuf_eo_nsegs(m0);
	pktlen = m0->m_pkthdr.len;
	/*
	 * Immediate data length: the CPL(s) plus the L2/L3/L4 headers that are
	 * copied into the work request.
	 */
	ctrl = sizeof(struct cpl_tx_pkt_core);
	if (needs_tso(m0))
		ctrl += sizeof(struct cpl_tx_pkt_lso_core);
	immhdrs = m0->m_pkthdr.l2hlen + m0->m_pkthdr.l3hlen + m0->m_pkthdr.l4hlen;
	ctrl += immhdrs;

	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_EO_WR) |
	    V_FW_ETH_TX_EO_WR_IMMDLEN(ctrl) | V_FW_WR_COMPL(!!compl));
	wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(len16) |
	    V_FW_WR_FLOWID(cst->etid));
	wr->r3 = 0;
	if (needs_outer_udp_csum(m0)) {
		/* UDP: the whole payload is described as a single segment. */
		wr->u.udpseg.type = FW_ETH_TX_EO_TYPE_UDPSEG;
		wr->u.udpseg.ethlen = m0->m_pkthdr.l2hlen;
		wr->u.udpseg.iplen = htobe16(m0->m_pkthdr.l3hlen);
		wr->u.udpseg.udplen = m0->m_pkthdr.l4hlen;
		wr->u.udpseg.rtplen = 0;
		wr->u.udpseg.r4 = 0;
		wr->u.udpseg.mss = htobe16(pktlen - immhdrs);
		wr->u.udpseg.schedpktsize = wr->u.udpseg.mss;
		wr->u.udpseg.plen = htobe32(pktlen - immhdrs);
		cpl = (void *)(wr + 1);
	} else {
		MPASS(needs_outer_tcp_csum(m0));
		wr->u.tcpseg.type = FW_ETH_TX_EO_TYPE_TCPSEG;
		wr->u.tcpseg.ethlen = m0->m_pkthdr.l2hlen;
		wr->u.tcpseg.iplen = htobe16(m0->m_pkthdr.l3hlen);
		wr->u.tcpseg.tcplen = m0->m_pkthdr.l4hlen;
		wr->u.tcpseg.tsclk_tsoff = mbuf_eo_tsclk_tsoff(m0);
		wr->u.tcpseg.r4 = 0;
		wr->u.tcpseg.r5 = 0;
		wr->u.tcpseg.plen = htobe32(pktlen - immhdrs);

		if (needs_tso(m0)) {
			/* With TSO an LSO CPL sits between the WR and the tx CPL. */
			struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);

			wr->u.tcpseg.mss = htobe16(m0->m_pkthdr.tso_segsz);

			/* Header lengths are passed in units of 4 bytes. */
			ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) |
			    F_LSO_FIRST_SLICE | F_LSO_LAST_SLICE |
			    V_LSO_ETHHDR_LEN((m0->m_pkthdr.l2hlen -
				ETHER_HDR_LEN) >> 2) |
			    V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2) |
			    V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2);
			/* IPv6 is recognized by the size of the L3 header. */
			if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
				ctrl |= F_LSO_IPV6;
			lso->lso_ctrl = htobe32(ctrl);
			lso->ipid_ofst = htobe16(0);
			lso->mss =
htobe16(m0->m_pkthdr.tso_segsz);
			lso->seqno_offset = htobe32(0);
			lso->len = htobe32(pktlen);

			cpl = (void *)(lso + 1);
		} else {
			/* No TSO: the tx CPL directly follows the WR header. */
			wr->u.tcpseg.mss = htobe16(0xffff);
			cpl = (void *)(wr + 1);
		}
	}

	/* Checksum offload must be requested for ethofld. */
	MPASS(needs_outer_l4_csum(m0));
	ctrl1 = csum_to_ctrl(cst->adapter, m0);

	/* VLAN tag insertion */
	if (needs_vlan_insertion(m0)) {
		ctrl1 |= F_TXPKT_VLAN_VLD |
		    V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
	}

	/* CPL header */
	cpl->ctrl0 = cst->ctrl0;
	cpl->pack = 0;
	cpl->len = htobe16(pktlen);
	cpl->ctrl1 = htobe64(ctrl1);

	/* Copy Ethernet, IP & TCP/UDP hdrs as immediate data */
	p = (uintptr_t)(cpl + 1);
	m_copydata(m0, 0, immhdrs, (void *)p);

	/* SGL */
	if (nsegs > 0) {
		int i, pad;

		/* zero-pad upto next 16Byte boundary, if not 16Byte aligned */
		p += immhdrs;
		/*
		 * NOTE(review): this evaluates to 16, not 0, when immhdrs is
		 * already a multiple of 16 -- confirm that this cannot happen
		 * or that it matches the length computed for the WR.
		 */
		pad = 16 - (immhdrs & 0xf);
		bzero((void *)p, pad);

		usgl = (void *)(p + pad);
		usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
		    V_ULPTX_NSGE(nsegs));

		/*
		 * Build the segment list from the mbuf chain, skipping empty
		 * mbufs and the first immhdrs bytes, which were already copied
		 * as immediate data above.
		 */
		sglist_init(&sg, nitems(segs), segs);
		for (; m0 != NULL; m0 = m0->m_next) {
			if (__predict_false(m0->m_len == 0))
				continue;
			if (immhdrs >= m0->m_len) {
				immhdrs -= m0->m_len;
				continue;
			}
			/* The M_EXTPG case appends the mbuf without an immhdrs offset. */
			if (m0->m_flags & M_EXTPG)
				sglist_append_mbuf_epg(&sg, m0,
				    mtod(m0, vm_offset_t), m0->m_len);
			else
				sglist_append(&sg, mtod(m0, char *) + immhdrs,
				    m0->m_len - immhdrs);
			immhdrs = 0;
		}
		MPASS(sg.sg_nseg == nsegs);

		/*
		 * Zero pad last 8B in case the WR doesn't end on a 16B
		 * boundary.
*/
		*(uint64_t *)((char *)wr + len16 * 16 - 8) = 0;

		/* The first segment lives in the ulptx_sgl header itself. */
		usgl->len0 = htobe32(segs[0].ss_len);
		usgl->addr0 = htobe64(segs[0].ss_paddr);
		for (i = 0; i < nsegs - 1; i++) {
			usgl->sge[i / 2].len[i & 1] = htobe32(segs[i + 1].ss_len);
			usgl->sge[i / 2].addr[i & 1] = htobe64(segs[i + 1].ss_paddr);
		}
		/* Odd number of extra segments: zero the unused length slot. */
		if (i & 1)
			usgl->sge[i / 2].len[1] = htobe32(0);
	}

}

/*
 * Sends as many packets from cst->pending_tx as the available tx credits and
 * work request space allow.  Each packet sent is moved to cst->pending_fwack
 * and its send tag reference is dropped.  Must be called with cst->lock held.
 */
static void
ethofld_tx(struct cxgbe_rate_tag *cst)
{
	struct mbuf *m;
	struct wrq_cookie cookie;
	int next_credits, compl;
	struct fw_eth_tx_eo_wr *wr;

	mtx_assert(&cst->lock, MA_OWNED);

	while ((m = mbufq_first(&cst->pending_tx)) != NULL) {
		M_ASSERTPKTHDR(m);

		/* How many len16 credits do we need to send this mbuf. */
		next_credits = mbuf_eo_len16(m);
		MPASS(next_credits > 0);
		if (next_credits > cst->tx_credits) {
			/*
			 * Tx will make progress eventually because there is at
			 * least one outstanding fw4_ack that will return
			 * credits and kick the tx.
			 */
			MPASS(cst->ncompl > 0);
			return;
		}
		wr = start_wrq_wr(&cst->eo_txq->wrq, next_credits, &cookie);
		if (__predict_false(wr == NULL)) {
			/* XXX: wishful thinking, not a real assertion. */
			MPASS(cst->ncompl > 0);
			return;
		}
		cst->tx_credits -= next_credits;
		cst->tx_nocompl += next_credits;
		/*
		 * Ask for a completion if none is outstanding or if at least
		 * half of the total credits were used without requesting one.
		 */
		compl = cst->ncompl == 0 || cst->tx_nocompl >= cst->tx_total / 2;
		ETHER_BPF_MTAP(cst->com.ifp, m);
		write_ethofld_wr(cst, wr, m, compl);
		commit_wrq_wr(&cst->eo_txq->wrq, wr, &cookie);
		if (compl) {
			cst->ncompl++;
			cst->tx_nocompl	= 0;
		}
		(void) mbufq_dequeue(&cst->pending_tx);

		/*
		 * Drop the mbuf's reference on the tag now rather
		 * than waiting until m_freem().  This ensures that
		 * cxgbe_rate_tag_free gets called when the inp drops
		 * its reference on the tag and there are no more
		 * mbufs in the pending_tx queue and can flush any
		 * pending requests.  Otherwise if the last mbuf
		 * doesn't request a completion the etid will never be
		 * released.
*/
		m->m_pkthdr.snd_tag = NULL;
		m->m_pkthdr.csum_flags &= ~CSUM_SND_TAG;
		m_snd_tag_rele(&cst->com);

		/* The mbuf is kept until its credits are returned. */
		mbufq_enqueue(&cst->pending_fwack, m);
	}
}

#if defined(INET) || defined(INET6)
/*
 * Queues m0 for transmission on the rate-limited flow identified by its send
 * tag and tries to send.  On the first packet of a flow (EO_FLOWC_PENDING)
 * the offload tx queue and ingress queue are chosen and the flowc work
 * request is sent first.  Returns 0 on success, the error from
 * send_etid_flowc_wr(), or ENOBUFS if queueing m0 would push the backlog
 * above eo_max_backlog.
 */
static int
ethofld_transmit(if_t ifp, struct mbuf *m0)
{
	struct cxgbe_rate_tag *cst;
	int rc;

	MPASS(m0->m_nextpkt == NULL);
	MPASS(m0->m_pkthdr.csum_flags & CSUM_SND_TAG);
	MPASS(m0->m_pkthdr.snd_tag != NULL);
	cst = mst_to_crt(m0->m_pkthdr.snd_tag);

	mtx_lock(&cst->lock);
	MPASS(cst->flags & EO_SND_TAG_REF);

	if (__predict_false(cst->flags & EO_FLOWC_PENDING)) {
		struct vi_info *vi = if_getsoftc(ifp);
		struct port_info *pi = vi->pi;
		struct adapter *sc = pi->adapter;
		/* NOTE(review): assumes vi->rss_size is a power of 2 -- confirm. */
		const uint32_t rss_mask = vi->rss_size - 1;
		uint32_t rss_hash;

		cst->eo_txq = &sc->sge.ofld_txq[vi->first_ofld_txq];
		/* Use the packet's flow id if it has one, else a random value. */
		if (M_HASHTYPE_ISHASH(m0))
			rss_hash = m0->m_pkthdr.flowid;
		else
			rss_hash = arc4random();
		/* We assume RSS hashing */
		cst->iqid = vi->rss[rss_hash & rss_mask];
		cst->eo_txq += rss_hash % vi->nofldtxq;
		rc = send_etid_flowc_wr(cst, pi, vi);
		if (rc != 0)
			goto done;
	}

	if (__predict_false(cst->plen + m0->m_pkthdr.len > eo_max_backlog)) {
		rc = ENOBUFS;
		goto done;
	}

	mbufq_enqueue(&cst->pending_tx, m0);
	cst->plen += m0->m_pkthdr.len;

	/*
	 * Hold an extra reference on the tag while generating work
	 * requests to ensure that we don't try to free the tag during
	 * ethofld_tx() in case we are sending the final mbuf after
	 * the inp was freed.
*/
	m_snd_tag_ref(&cst->com);
	ethofld_tx(cst);
	mtx_unlock(&cst->lock);
	/* Drop the extra reference only after the lock has been released. */
	m_snd_tag_rele(&cst->com);
	return (0);

done:
	/* Error exit: m0 was not queued. */
	mtx_unlock(&cst->lock);
	return (rc);
}
#endif

/*
 * Handler for a CPL_FW4_ACK that returns tx credits for an ethofld flow.
 * The rate tag is looked up by the etid in the message; mbufs on
 * cst->pending_fwack that are covered by the returned credits are freed, the
 * credits are given back to the tag and transmission is restarted if
 * possible.  Always returns 0.
 */
static int
ethofld_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m0)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_fw4_ack *cpl = (const void *)(rss + 1);
	struct mbuf *m;
	u_int etid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl)));
	struct cxgbe_rate_tag *cst;
	/* Working copy; cpl->credits is still needed for the totals below. */
	uint8_t credits = cpl->credits;

	cst = lookup_etid(sc, etid);
	mtx_lock(&cst->lock);
	if (__predict_false(cst->flags & EO_FLOWC_RPL_PENDING)) {
		/* The first ack also returns the credits of the flowc WR. */
		MPASS(credits >= ETID_FLOWC_LEN16);
		credits -= ETID_FLOWC_LEN16;
		cst->flags &= ~EO_FLOWC_RPL_PENDING;
	}

	KASSERT(cst->ncompl > 0,
	    ("%s: etid %u (%p) wasn't expecting completion.",
	    __func__, etid, cst));
	cst->ncompl--;

	while (credits > 0) {
		m = mbufq_dequeue(&cst->pending_fwack);
		if (__predict_false(m == NULL)) {
			/*
			 * The remaining credits are for the final flush that
			 * was issued when the tag was freed by the kernel.
			 */
			MPASS((cst->flags &
			    (EO_FLUSH_RPL_PENDING | EO_SND_TAG_REF)) ==
			    EO_FLUSH_RPL_PENDING);
			MPASS(credits == ETID_FLUSH_LEN16);
			MPASS(cst->tx_credits + cpl->credits == cst->tx_total);
			MPASS(cst->ncompl == 0);

			cst->flags &= ~EO_FLUSH_RPL_PENDING;
			cst->tx_credits += cpl->credits;
			/*
			 * NOTE(review): cst->lock is not released on this
			 * path; presumably cxgbe_rate_tag_free_locked() takes
			 * care of it -- confirm.
			 */
			cxgbe_rate_tag_free_locked(cst);
			return (0);	/* cst is gone. */
		}
		KASSERT(m != NULL,
		    ("%s: too many credits (%u, %u)", __func__, cpl->credits,
		    credits));
		KASSERT(credits >= mbuf_eo_len16(m),
		    ("%s: too few credits (%u, %u, %u)", __func__,
		    cpl->credits, credits, mbuf_eo_len16(m)));
		credits -= mbuf_eo_len16(m);
		cst->plen -= m->m_pkthdr.len;
		m_freem(m);
	}

	cst->tx_credits += cpl->credits;
	MPASS(cst->tx_credits <= cst->tx_total);

	if (cst->flags & EO_SND_TAG_REF) {
		/*
		 * As with ethofld_transmit(), hold an extra reference
		 * so that the tag is stable across ethold_tx().
*/ m_snd_tag_ref(&cst->com); m = mbufq_first(&cst->pending_tx); if (m != NULL && cst->tx_credits >= mbuf_eo_len16(m)) ethofld_tx(cst); mtx_unlock(&cst->lock); m_snd_tag_rele(&cst->com); } else { /* * There shouldn't be any pending packets if the tag * was freed by the kernel since any pending packet * should hold a reference to the tag. */ MPASS(mbufq_first(&cst->pending_tx) == NULL); mtx_unlock(&cst->lock); } return (0); } #endif diff --git a/sys/kern/kern_mbuf.c b/sys/kern/kern_mbuf.c index a740c31789a1..69d82086c7a5 100644 --- a/sys/kern/kern_mbuf.c +++ b/sys/kern/kern_mbuf.c @@ -1,1792 +1,1792 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2004, 2005, * Bosko Milekic . All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include #include "opt_param.h" #include "opt_kern_tls.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -_Static_assert(MJUMPAGESIZE > MCLBYTES, - "Cluster must be smaller than a jumbo page"); +_Static_assert(MCLBYTES <= MJUMPAGESIZE, + "Cluster must not be larger than a jumbo page"); /* * In FreeBSD, Mbufs and Mbuf Clusters are allocated from UMA * Zones. * * Mbuf Clusters (2K, contiguous) are allocated from the Cluster * Zone. The Zone can be capped at kern.ipc.nmbclusters, if the * administrator so desires. * * Mbufs are allocated from a UMA Primary Zone called the Mbuf * Zone. * * Additionally, FreeBSD provides a Packet Zone, which it * configures as a Secondary Zone to the Mbuf Primary Zone, * thus sharing backend Slab kegs with the Mbuf Primary Zone. * * Thus common-case allocations and locking are simplified: * * m_clget() m_getcl() * | | * | .------------>[(Packet Cache)] m_get(), m_gethdr() * | | [ Packet ] | * [(Cluster Cache)] [ Secondary ] [ (Mbuf Cache) ] * [ Cluster Zone ] [ Zone ] [ Mbuf Primary Zone ] * | \________ | * [ Cluster Keg ] \ / * | [ Mbuf Keg ] * [ Cluster Slabs ] | * | [ Mbuf Slabs ] * \____________(VM)_________________/ * * * Whenever an object is allocated with uma_zalloc() out of * one of the Zones its _ctor_ function is executed. The same * for any deallocation through uma_zfree() the _dtor_ function * is executed. * * Caches are per-CPU and are filled from the Primary Zone. * * Whenever an object is allocated from the underlying global * memory pool it gets pre-initialized with the _zinit_ functions. * When the Keg's are overfull objects get decommissioned with * _zfini_ functions and free'd back to the global memory pool. 
*
 */

int nmbufs;			/* limits number of mbufs */
int nmbclusters;		/* limits number of mbuf clusters */
int nmbjumbop;			/* limits number of page size jumbo clusters */
int nmbjumbo9;			/* limits number of 9k jumbo clusters */
int nmbjumbo16;			/* limits number of 16k jumbo clusters */

bool mb_use_ext_pgs = false;	/* use M_EXTPG mbufs for sendfile & TLS */

/*
 * Sysctl handler for kern.ipc.mb_use_ext_pgs.  Reports the current setting
 * as an int.  A write of a non-zero value fails with EOPNOTSUPP when
 * PMAP_HAS_DMAP is false; any other write stores the new boolean value.
 */
static int
sysctl_mb_use_ext_pgs(SYSCTL_HANDLER_ARGS)
{
	int error, extpg;

	extpg = mb_use_ext_pgs;
	error = sysctl_handle_int(oidp, &extpg, 0, req);
	/* newptr is only set when a new value was supplied. */
	if (error == 0 && req->newptr != NULL) {
		if (extpg != 0 && !PMAP_HAS_DMAP)
			error = EOPNOTSUPP;
		else
			mb_use_ext_pgs = extpg != 0;
	}
	return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_use_ext_pgs,
    CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH,
    &mb_use_ext_pgs, 0, sysctl_mb_use_ext_pgs, "IU",
    "Use unmapped mbufs for sendfile(2) and TLS offload");

static quad_t maxmbufmem;	/* overall real memory limit for all mbufs */

SYSCTL_QUAD(_kern_ipc, OID_AUTO, maxmbufmem, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &maxmbufmem, 0,
    "Maximum real memory allocatable to various mbuf types");

static counter_u64_t snd_tag_count;
SYSCTL_COUNTER_U64(_kern_ipc, OID_AUTO, num_snd_tags, CTLFLAG_RW,
    &snd_tag_count, "# of active mbuf send tags");

/*
 * tunable_mbinit() has to be run before any mbuf allocations are done.
 *
 * It computes maxmbufmem and the default limits for mbufs and for each
 * cluster type, honoring the corresponding kern.ipc.* tunables, and decides
 * whether unmapped (M_EXTPG) mbufs are used.
 */
static void
tunable_mbinit(void *dummy)
{
	quad_t realmem;
	int extpg;

	/*
	 * The default limit for all mbuf related memory is 1/2 of all
	 * available kernel memory (physical or kmem).
	 * At most it can be 3/4 of available kernel memory.
*/
	realmem = qmin((quad_t)physmem * PAGE_SIZE, vm_kmem_size);
	maxmbufmem = realmem / 2;
	TUNABLE_QUAD_FETCH("kern.ipc.maxmbufmem", &maxmbufmem);
	/* Clamp a tuned value to 3/4 of realmem. */
	if (maxmbufmem > realmem / 4 * 3)
		maxmbufmem = realmem / 4 * 3;

	/*
	 * Each cluster limit that is still 0 after its tunable was fetched
	 * defaults to a fixed fraction of maxmbufmem: 1/4 for regular and page
	 * size clusters, 1/6 for 9k and 16k clusters.
	 */
	TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters);
	if (nmbclusters == 0)
		nmbclusters = maxmbufmem / MCLBYTES / 4;

	TUNABLE_INT_FETCH("kern.ipc.nmbjumbop", &nmbjumbop);
	if (nmbjumbop == 0)
		nmbjumbop = maxmbufmem / MJUMPAGESIZE / 4;

	TUNABLE_INT_FETCH("kern.ipc.nmbjumbo9", &nmbjumbo9);
	if (nmbjumbo9 == 0)
		nmbjumbo9 = maxmbufmem / MJUM9BYTES / 6;

	TUNABLE_INT_FETCH("kern.ipc.nmbjumbo16", &nmbjumbo16);
	if (nmbjumbo16 == 0)
		nmbjumbo16 = maxmbufmem / MJUM16BYTES / 6;

	/*
	 * We need at least as many mbufs as we have clusters of
	 * the various types added together.
	 */
	TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs);
	if (nmbufs < nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16)
		nmbufs = lmax(maxmbufmem / MSIZE / 5,
		    nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16);

	/*
	 * Unmapped mbufs can only safely be used on platforms with a direct
	 * map.
*/
	if (PMAP_HAS_DMAP) {
		/* Enabled by default here; the tunable can turn it off. */
		extpg = 1;
		TUNABLE_INT_FETCH("kern.ipc.mb_use_ext_pgs", &extpg);
		mb_use_ext_pgs = extpg != 0;
	}
}
SYSINIT(tunable_mbinit, SI_SUB_KMEM, SI_ORDER_MIDDLE, tunable_mbinit, NULL);

/*
 * Sysctl handler for kern.ipc.nmbclusters.  The limit can only be raised;
 * an attempt to lower it fails with EINVAL.  A raise is also refused unless
 * nmbufs is at least the sum of the current cluster limits (the check uses
 * the current nmbclusters, not the requested value).  The value stored is
 * the one returned by uma_zone_set_max().
 */
static int
sysctl_nmbclusters(SYSCTL_HANDLER_ARGS)
{
	int error, newnmbclusters;

	newnmbclusters = nmbclusters;
	error = sysctl_handle_int(oidp, &newnmbclusters, 0, req);
	if (error == 0 && req->newptr && newnmbclusters != nmbclusters) {
		if (newnmbclusters > nmbclusters &&
		    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
			nmbclusters = newnmbclusters;
			nmbclusters = uma_zone_set_max(zone_clust, nmbclusters);
			EVENTHANDLER_INVOKE(nmbclusters_change);
		} else
			error = EINVAL;
	}
	return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbclusters,
    CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE,
    &nmbclusters, 0, sysctl_nmbclusters, "IU",
    "Maximum number of mbuf clusters allowed");

/*
 * Sysctl handler for kern.ipc.nmbjumbop.  Same rules as sysctl_nmbclusters():
 * the limit can only be raised, and only while nmbufs covers the sum of the
 * current cluster limits; otherwise EINVAL.
 */
static int
sysctl_nmbjumbop(SYSCTL_HANDLER_ARGS)
{
	int error, newnmbjumbop;

	newnmbjumbop = nmbjumbop;
	error = sysctl_handle_int(oidp, &newnmbjumbop, 0, req);
	if (error == 0 && req->newptr && newnmbjumbop != nmbjumbop) {
		if (newnmbjumbop > nmbjumbop &&
		    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
			nmbjumbop = newnmbjumbop;
			nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop);
		} else
			error = EINVAL;
	}
	return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbop,
    CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE,
    &nmbjumbop, 0, sysctl_nmbjumbop, "IU",
    "Maximum number of mbuf page size jumbo clusters allowed");

/*
 * Sysctl handler for kern.ipc.nmbjumbo9.  Same rules as sysctl_nmbclusters():
 * the limit can only be raised, and only while nmbufs covers the sum of the
 * current cluster limits; otherwise EINVAL.
 */
static int
sysctl_nmbjumbo9(SYSCTL_HANDLER_ARGS)
{
	int error, newnmbjumbo9;

	newnmbjumbo9 = nmbjumbo9;
	error = sysctl_handle_int(oidp, &newnmbjumbo9, 0, req);
	if (error == 0 && req->newptr && newnmbjumbo9 != nmbjumbo9) {
		if (newnmbjumbo9 > nmbjumbo9 &&
		    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
			nmbjumbo9 = newnmbjumbo9;
			nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9);
		} else
			error = EINVAL;
	}
	return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo9, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, &nmbjumbo9, 0, sysctl_nmbjumbo9, "IU", "Maximum number of mbuf 9k jumbo clusters allowed"); static int sysctl_nmbjumbo16(SYSCTL_HANDLER_ARGS) { int error, newnmbjumbo16; newnmbjumbo16 = nmbjumbo16; error = sysctl_handle_int(oidp, &newnmbjumbo16, 0, req); if (error == 0 && req->newptr && newnmbjumbo16 != nmbjumbo16) { if (newnmbjumbo16 > nmbjumbo16 && nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { nmbjumbo16 = newnmbjumbo16; nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo16, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, &nmbjumbo16, 0, sysctl_nmbjumbo16, "IU", "Maximum number of mbuf 16k jumbo clusters allowed"); static int sysctl_nmbufs(SYSCTL_HANDLER_ARGS) { int error, newnmbufs; newnmbufs = nmbufs; error = sysctl_handle_int(oidp, &newnmbufs, 0, req); if (error == 0 && req->newptr && newnmbufs != nmbufs) { if (newnmbufs > nmbufs) { nmbufs = newnmbufs; nmbufs = uma_zone_set_max(zone_mbuf, nmbufs); EVENTHANDLER_INVOKE(nmbufs_change); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbufs, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, &nmbufs, 0, sysctl_nmbufs, "IU", "Maximum number of mbufs allowed"); /* * Zones from which we allocate. */ uma_zone_t zone_mbuf; uma_zone_t zone_clust; uma_zone_t zone_pack; uma_zone_t zone_jumbop; uma_zone_t zone_jumbo9; uma_zone_t zone_jumbo16; /* * Local prototypes. 
*/ static int mb_ctor_mbuf(void *, int, void *, int); static int mb_ctor_clust(void *, int, void *, int); static int mb_ctor_pack(void *, int, void *, int); static void mb_dtor_mbuf(void *, int, void *); static void mb_dtor_pack(void *, int, void *); static int mb_zinit_pack(void *, int, int); static void mb_zfini_pack(void *, int); static void mb_reclaim(uma_zone_t, int); /* Ensure that MSIZE is a power of 2. */ CTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE); _Static_assert(sizeof(struct mbuf) <= MSIZE, "size of mbuf exceeds MSIZE"); /* * Initialize FreeBSD Network buffer allocation. */ static void mbuf_init(void *dummy) { /* * Configure UMA zones for Mbufs, Clusters, and Packets. */ zone_mbuf = uma_zcreate(MBUF_MEM_NAME, MSIZE, mb_ctor_mbuf, mb_dtor_mbuf, NULL, NULL, MSIZE - 1, UMA_ZONE_CONTIG | UMA_ZONE_MAXBUCKET); if (nmbufs > 0) nmbufs = uma_zone_set_max(zone_mbuf, nmbufs); uma_zone_set_warning(zone_mbuf, "kern.ipc.nmbufs limit reached"); uma_zone_set_maxaction(zone_mbuf, mb_reclaim); zone_clust = uma_zcreate(MBUF_CLUSTER_MEM_NAME, MCLBYTES, mb_ctor_clust, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_CONTIG); if (nmbclusters > 0) nmbclusters = uma_zone_set_max(zone_clust, nmbclusters); uma_zone_set_warning(zone_clust, "kern.ipc.nmbclusters limit reached"); uma_zone_set_maxaction(zone_clust, mb_reclaim); zone_pack = uma_zsecond_create(MBUF_PACKET_MEM_NAME, mb_ctor_pack, mb_dtor_pack, mb_zinit_pack, mb_zfini_pack, zone_mbuf); /* Make jumbo frame zone too. Page size, 9k and 16k. 
*/ zone_jumbop = uma_zcreate(MBUF_JUMBOP_MEM_NAME, MJUMPAGESIZE, mb_ctor_clust, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_CONTIG); if (nmbjumbop > 0) nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop); uma_zone_set_warning(zone_jumbop, "kern.ipc.nmbjumbop limit reached"); uma_zone_set_maxaction(zone_jumbop, mb_reclaim); zone_jumbo9 = uma_zcreate(MBUF_JUMBO9_MEM_NAME, MJUM9BYTES, mb_ctor_clust, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_CONTIG); if (nmbjumbo9 > 0) nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9); uma_zone_set_warning(zone_jumbo9, "kern.ipc.nmbjumbo9 limit reached"); uma_zone_set_maxaction(zone_jumbo9, mb_reclaim); zone_jumbo16 = uma_zcreate(MBUF_JUMBO16_MEM_NAME, MJUM16BYTES, mb_ctor_clust, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_CONTIG); if (nmbjumbo16 > 0) nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16); uma_zone_set_warning(zone_jumbo16, "kern.ipc.nmbjumbo16 limit reached"); uma_zone_set_maxaction(zone_jumbo16, mb_reclaim); snd_tag_count = counter_u64_alloc(M_WAITOK); } SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL); #ifdef DEBUGNET /* * debugnet makes use of a pre-allocated pool of mbufs and clusters. When * debugnet is configured, we initialize a set of UMA cache zones which return * items from this pool. At panic-time, the regular UMA zone pointers are * overwritten with those of the cache zones so that drivers may allocate and * free mbufs and clusters without attempting to allocate physical memory. * * We keep mbufs and clusters in a pair of mbuf queues. In particular, for * the purpose of caching clusters, we treat them as mbufs. 
*/ static struct mbufq dn_mbufq = { STAILQ_HEAD_INITIALIZER(dn_mbufq.mq_head), 0, INT_MAX }; static struct mbufq dn_clustq = { STAILQ_HEAD_INITIALIZER(dn_clustq.mq_head), 0, INT_MAX }; static int dn_clsize; static uma_zone_t dn_zone_mbuf; static uma_zone_t dn_zone_clust; static uma_zone_t dn_zone_pack; static struct debugnet_saved_zones { uma_zone_t dsz_mbuf; uma_zone_t dsz_clust; uma_zone_t dsz_pack; uma_zone_t dsz_jumbop; uma_zone_t dsz_jumbo9; uma_zone_t dsz_jumbo16; bool dsz_debugnet_zones_enabled; } dn_saved_zones; static int dn_buf_import(void *arg, void **store, int count, int domain __unused, int flags) { struct mbufq *q; struct mbuf *m; int i; q = arg; for (i = 0; i < count; i++) { m = mbufq_dequeue(q); if (m == NULL) break; trash_init(m, q == &dn_mbufq ? MSIZE : dn_clsize, flags); store[i] = m; } KASSERT((flags & M_WAITOK) == 0 || i == count, ("%s: ran out of pre-allocated mbufs", __func__)); return (i); } static void dn_buf_release(void *arg, void **store, int count) { struct mbufq *q; struct mbuf *m; int i; q = arg; for (i = 0; i < count; i++) { m = store[i]; (void)mbufq_enqueue(q, m); } } static int dn_pack_import(void *arg __unused, void **store, int count, int domain __unused, int flags __unused) { struct mbuf *m; void *clust; int i; for (i = 0; i < count; i++) { m = m_get(M_NOWAIT, MT_DATA); if (m == NULL) break; clust = uma_zalloc(dn_zone_clust, M_NOWAIT); if (clust == NULL) { m_free(m); break; } mb_ctor_clust(clust, dn_clsize, m, 0); store[i] = m; } KASSERT((flags & M_WAITOK) == 0 || i == count, ("%s: ran out of pre-allocated mbufs", __func__)); return (i); } static void dn_pack_release(void *arg __unused, void **store, int count) { struct mbuf *m; void *clust; int i; for (i = 0; i < count; i++) { m = store[i]; clust = m->m_ext.ext_buf; uma_zfree(dn_zone_clust, clust); uma_zfree(dn_zone_mbuf, m); } } /* * Free the pre-allocated mbufs and clusters reserved for debugnet, and destroy * the corresponding UMA cache zones. 
*/ void debugnet_mbuf_drain(void) { struct mbuf *m; void *item; if (dn_zone_mbuf != NULL) { uma_zdestroy(dn_zone_mbuf); dn_zone_mbuf = NULL; } if (dn_zone_clust != NULL) { uma_zdestroy(dn_zone_clust); dn_zone_clust = NULL; } if (dn_zone_pack != NULL) { uma_zdestroy(dn_zone_pack); dn_zone_pack = NULL; } while ((m = mbufq_dequeue(&dn_mbufq)) != NULL) m_free(m); while ((item = mbufq_dequeue(&dn_clustq)) != NULL) uma_zfree(m_getzone(dn_clsize), item); } /* * Callback invoked immediately prior to starting a debugnet connection. */ void debugnet_mbuf_start(void) { MPASS(!dn_saved_zones.dsz_debugnet_zones_enabled); /* Save the old zone pointers to restore when debugnet is closed. */ dn_saved_zones = (struct debugnet_saved_zones) { .dsz_debugnet_zones_enabled = true, .dsz_mbuf = zone_mbuf, .dsz_clust = zone_clust, .dsz_pack = zone_pack, .dsz_jumbop = zone_jumbop, .dsz_jumbo9 = zone_jumbo9, .dsz_jumbo16 = zone_jumbo16, }; /* * All cluster zones return buffers of the size requested by the * drivers. It's up to the driver to reinitialize the zones if the * MTU of a debugnet-enabled interface changes. */ printf("debugnet: overwriting mbuf zone pointers\n"); zone_mbuf = dn_zone_mbuf; zone_clust = dn_zone_clust; zone_pack = dn_zone_pack; zone_jumbop = dn_zone_clust; zone_jumbo9 = dn_zone_clust; zone_jumbo16 = dn_zone_clust; } /* * Callback invoked when a debugnet connection is closed/finished. */ void debugnet_mbuf_finish(void) { MPASS(dn_saved_zones.dsz_debugnet_zones_enabled); printf("debugnet: restoring mbuf zone pointers\n"); zone_mbuf = dn_saved_zones.dsz_mbuf; zone_clust = dn_saved_zones.dsz_clust; zone_pack = dn_saved_zones.dsz_pack; zone_jumbop = dn_saved_zones.dsz_jumbop; zone_jumbo9 = dn_saved_zones.dsz_jumbo9; zone_jumbo16 = dn_saved_zones.dsz_jumbo16; memset(&dn_saved_zones, 0, sizeof(dn_saved_zones)); } /* * Reinitialize the debugnet mbuf+cluster pool and cache zones. 
*/ void debugnet_mbuf_reinit(int nmbuf, int nclust, int clsize) { struct mbuf *m; void *item; debugnet_mbuf_drain(); dn_clsize = clsize; dn_zone_mbuf = uma_zcache_create("debugnet_" MBUF_MEM_NAME, MSIZE, mb_ctor_mbuf, mb_dtor_mbuf, NULL, NULL, dn_buf_import, dn_buf_release, &dn_mbufq, UMA_ZONE_NOBUCKET); dn_zone_clust = uma_zcache_create("debugnet_" MBUF_CLUSTER_MEM_NAME, clsize, mb_ctor_clust, NULL, NULL, NULL, dn_buf_import, dn_buf_release, &dn_clustq, UMA_ZONE_NOBUCKET); dn_zone_pack = uma_zcache_create("debugnet_" MBUF_PACKET_MEM_NAME, MCLBYTES, mb_ctor_pack, mb_dtor_pack, NULL, NULL, dn_pack_import, dn_pack_release, NULL, UMA_ZONE_NOBUCKET); while (nmbuf-- > 0) { m = m_get(M_WAITOK, MT_DATA); uma_zfree(dn_zone_mbuf, m); } while (nclust-- > 0) { item = uma_zalloc(m_getzone(dn_clsize), M_WAITOK); uma_zfree(dn_zone_clust, item); } } #endif /* DEBUGNET */ /* * Constructor for Mbuf primary zone. * * The 'arg' pointer points to a mb_args structure which * contains call-specific information required to support the * mbuf allocation API. See mbuf.h. */ static int mb_ctor_mbuf(void *mem, int size, void *arg, int how) { struct mbuf *m; struct mb_args *args; int error; int flags; short type; args = (struct mb_args *)arg; type = args->type; /* * The mbuf is initialized later. The caller has the * responsibility to set up any MAC labels too. */ if (type == MT_NOINIT) return (0); m = (struct mbuf *)mem; flags = args->flags; MPASS((flags & M_NOFREE) == 0); error = m_init(m, how, type, flags); return (error); } /* * The Mbuf primary zone destructor. 
*/ static void mb_dtor_mbuf(void *mem, int size, void *arg) { struct mbuf *m; unsigned long flags __diagused; m = (struct mbuf *)mem; flags = (unsigned long)arg; KASSERT((m->m_flags & M_NOFREE) == 0, ("%s: M_NOFREE set", __func__)); KASSERT((flags & 0x1) == 0, ("%s: obsolete MB_DTOR_SKIP passed", __func__)); if ((m->m_flags & M_PKTHDR) && !SLIST_EMPTY(&m->m_pkthdr.tags)) m_tag_delete_chain(m, NULL); } /* * The Mbuf Packet zone destructor. */ static void mb_dtor_pack(void *mem, int size, void *arg) { struct mbuf *m; m = (struct mbuf *)mem; if ((m->m_flags & M_PKTHDR) != 0) m_tag_delete_chain(m, NULL); /* Make sure we've got a clean cluster back. */ KASSERT((m->m_flags & M_EXT) == M_EXT, ("%s: M_EXT not set", __func__)); KASSERT(m->m_ext.ext_buf != NULL, ("%s: ext_buf == NULL", __func__)); KASSERT(m->m_ext.ext_free == NULL, ("%s: ext_free != NULL", __func__)); KASSERT(m->m_ext.ext_arg1 == NULL, ("%s: ext_arg1 != NULL", __func__)); KASSERT(m->m_ext.ext_arg2 == NULL, ("%s: ext_arg2 != NULL", __func__)); KASSERT(m->m_ext.ext_size == MCLBYTES, ("%s: ext_size != MCLBYTES", __func__)); KASSERT(m->m_ext.ext_type == EXT_PACKET, ("%s: ext_type != EXT_PACKET", __func__)); #if defined(INVARIANTS) && !defined(KMSAN) trash_dtor(m->m_ext.ext_buf, MCLBYTES, arg); #endif /* * If there are processes blocked on zone_clust, waiting for pages * to be freed up, cause them to be woken up by draining the * packet zone. We are exposed to a race here (in the check for * the UMA_ZFLAG_FULL) where we might miss the flag set, but that * is deliberate. We don't want to acquire the zone lock for every * mbuf free. */ if (uma_zone_exhausted(zone_clust)) uma_zone_reclaim(zone_pack, UMA_RECLAIM_DRAIN); } /* * The Cluster and Jumbo[PAGESIZE|9|16] zone constructor. * * Here the 'arg' pointer points to the Mbuf which we * are configuring cluster storage for. If 'arg' is * empty we allocate just the cluster without setting * the mbuf to it. See mbuf.h. 
*/ static int mb_ctor_clust(void *mem, int size, void *arg, int how) { struct mbuf *m; m = (struct mbuf *)arg; if (m != NULL) { m->m_ext.ext_buf = (char *)mem; m->m_data = m->m_ext.ext_buf; m->m_flags |= M_EXT; m->m_ext.ext_free = NULL; m->m_ext.ext_arg1 = NULL; m->m_ext.ext_arg2 = NULL; m->m_ext.ext_size = size; m->m_ext.ext_type = m_gettype(size); m->m_ext.ext_flags = EXT_FLAG_EMBREF; m->m_ext.ext_count = 1; } return (0); } /* * The Packet secondary zone's init routine, executed on the * object's transition from mbuf keg slab to zone cache. */ static int mb_zinit_pack(void *mem, int size, int how) { struct mbuf *m; m = (struct mbuf *)mem; /* m is virgin. */ if (uma_zalloc_arg(zone_clust, m, how) == NULL || m->m_ext.ext_buf == NULL) return (ENOMEM); m->m_ext.ext_type = EXT_PACKET; /* Override. */ #if defined(INVARIANTS) && !defined(KMSAN) trash_init(m->m_ext.ext_buf, MCLBYTES, how); #endif return (0); } /* * The Packet secondary zone's fini routine, executed on the * object's transition from zone cache to keg slab. */ static void mb_zfini_pack(void *mem, int size) { struct mbuf *m; m = (struct mbuf *)mem; #if defined(INVARIANTS) && !defined(KMSAN) trash_fini(m->m_ext.ext_buf, MCLBYTES); #endif uma_zfree_arg(zone_clust, m->m_ext.ext_buf, NULL); #if defined(INVARIANTS) && !defined(KMSAN) trash_dtor(mem, size, NULL); #endif } /* * The "packet" keg constructor. */ static int mb_ctor_pack(void *mem, int size, void *arg, int how) { struct mbuf *m; struct mb_args *args; int error, flags; short type; m = (struct mbuf *)mem; args = (struct mb_args *)arg; flags = args->flags; type = args->type; MPASS((flags & M_NOFREE) == 0); #if defined(INVARIANTS) && !defined(KMSAN) trash_ctor(m->m_ext.ext_buf, MCLBYTES, arg, how); #endif error = m_init(m, how, type, flags); /* m_ext is already initialized. */ m->m_data = m->m_ext.ext_buf; m->m_flags = (flags | M_EXT); return (error); } /* * This is the protocol drain routine. 
Called by UMA whenever any of the
 * mbuf zones is close to its limit.  It only broadcasts the mbuf_lowmem
 * event; both arguments are unused.
 */
static void
mb_reclaim(uma_zone_t zone __unused, int pending __unused)
{

	EVENTHANDLER_INVOKE(mbuf_lowmem, VM_LOW_MBUFS);
}

/*
 * Free "count" units of I/O from an mbuf chain.  They could be held
 * in M_EXTPG or just as a normal mbuf.  This code is intended to be
 * called in an error path (I/O error, closed connection, etc).
 */
void
mb_free_notready(struct mbuf *m, int count)
{
	int i;

	for (i = 0; i < count && m != NULL; i++) {
		if ((m->m_flags & M_EXTPG) != 0) {
			/*
			 * Each unit drops the not-ready count by one; the
			 * mbuf itself is freed only once the count is zero.
			 */
			m->m_epg_nrdy--;
			if (m->m_epg_nrdy != 0)
				continue;
		}
		m = m_free(m);
	}
	KASSERT(i == count, ("Removed only %d items from %p", i, m));
}

/*
 * Compress an unmapped mbuf into a simple mbuf when it holds a small
 * amount of data.  This is used as a DOS defense to avoid having
 * small packets tie up wired pages, an ext_pgs structure, and an
 * mbuf.  Since this converts the existing mbuf in place, it can only
 * be used if there are no other references to 'm'.
 *
 * Returns 0 on success, or EBUSY (leaving 'm' untouched) when the
 * reference count is not exactly one.
 */
int
mb_unmapped_compress(struct mbuf *m)
{
	volatile u_int *refcnt;
	char buf[MLEN];

	/*
	 * Assert that 'm' does not have a packet header.  If 'm' had
	 * a packet header, it would only be able to hold MHLEN bytes
	 * and m_data would have to be initialized differently.
	 */
	KASSERT((m->m_flags & M_PKTHDR) == 0 && (m->m_flags & M_EXTPG),
	    ("%s: m %p !M_EXTPG or M_PKTHDR", __func__, m));
	KASSERT(m->m_len <= MLEN, ("m_len too large %p", m));

	/* Locate the reference count: embedded in 'm' or external. */
	if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
		refcnt = &m->m_ext.ext_count;
	} else {
		KASSERT(m->m_ext.ext_cnt != NULL,
		    ("%s: no refcounting pointer on %p", __func__, m));
		refcnt = m->m_ext.ext_cnt;
	}

	if (*refcnt != 1)
		return (EBUSY);

	/* Stash the payload on the stack before the pages go away. */
	m_copydata(m, 0, m->m_len, buf);

	/* Free the backing pages. */
	m->m_ext.ext_free(m);

	/* Turn 'm' into a "normal" mbuf. */
	m->m_flags &= ~(M_EXT | M_RDONLY | M_EXTPG);
	m->m_data = m->m_dat;

	/* Copy data back into m. */
	bcopy(buf, mtod(m, char *), m->m_len);

	return (0);
}

/*
 * These next few routines are used to permit downgrading an unmapped
 * mbuf to a chain of mapped mbufs.  This is used when an interface
 * doesn't support unmapped mbufs or if checksums need to be
 * computed in software.
 *
 * Each unmapped mbuf is converted to a chain of mbufs.  First, any
 * TLS header data is stored in a regular mbuf.  Second, each page of
 * unmapped data is stored in an mbuf with an EXT_SFBUF external
 * cluster.  These mbufs use an sf_buf to provide a valid KVA for the
 * associated physical page.  They also hold a reference on the
 * original M_EXTPG mbuf to ensure the physical page doesn't go away.
 * Finally, any TLS trailer data is stored in a regular mbuf.
 *
 * mb_unmapped_free_mext() is the ext_free handler for the EXT_SFBUF
 * mbufs.  It frees the associated sf_buf and releases its reference
 * on the original M_EXTPG mbuf.
 *
 * _mb_unmapped_to_ext() is a helper function that converts a single
 * unmapped mbuf into a chain of mbufs.
 *
 * mb_unmapped_to_ext() is the public function that walks an mbuf
 * chain converting any unmapped mbufs to mapped mbufs.  It returns
 * the new chain of mapped mbufs on success.  On failure it frees
 * the original mbuf chain and returns NULL.
 */
static void
mb_unmapped_free_mext(struct mbuf *m)
{
	struct sf_buf *sf;
	struct mbuf *old_m;

	/* ext_arg1 carries the sf_buf that m_extadd() was given. */
	sf = m->m_ext.ext_arg1;
	sf_buf_free(sf);

	/* Drop the reference on the backing M_EXTPG mbuf. */
	old_m = m->m_ext.ext_arg2;
	mb_free_extpg(old_m);
}

/*
 * Convert the single M_EXTPG mbuf 'm' into a chain of mapped mbufs.
 *
 * 'm' is consumed on every return path.  On success the new chain is
 * stored in *mres and 0 is returned.  On failure *mres is set to NULL
 * and EINVAL (mbuf has TLS state attached) or ENOMEM (an mbuf or sf_buf
 * could not be allocated) is returned.
 */
static int
_mb_unmapped_to_ext(struct mbuf *m, struct mbuf **mres)
{
	struct mbuf *m_new, *top, *prev, *mref;
	struct sf_buf *sf;
	vm_page_t pg;
	int i, len, off, pglen, pgoff, seglen, segoff;
	volatile u_int *refcnt;
	u_int ref_inc = 0;

	M_ASSERTEXTPG(m);

	if (m->m_epg_tls != NULL) {
		/* can't convert TLS mbuf */
		m_free(m);
		*mres = NULL;
		return (EINVAL);
	}

	len = m->m_len;

	/* See if this is the mbuf that holds the embedded refcount. */
	if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
		refcnt = &m->m_ext.ext_count;
		mref = m;
	} else {
		KASSERT(m->m_ext.ext_cnt != NULL,
		    ("%s: no refcounting pointer on %p", __func__, m));
		refcnt = m->m_ext.ext_cnt;
		mref = __containerof(refcnt, struct mbuf, m_ext.ext_count);
	}

	/* Skip over any data removed from the front. */
	off = mtod(m, vm_offset_t);

	top = NULL;
	/* Header bytes, if any remain, are copied into a regular mbuf. */
	if (m->m_epg_hdrlen != 0) {
		if (off >= m->m_epg_hdrlen) {
			off -= m->m_epg_hdrlen;
		} else {
			seglen = m->m_epg_hdrlen - off;
			segoff = off;
			seglen = min(seglen, len);
			off = 0;
			len -= seglen;
			m_new = m_get(M_NOWAIT, MT_DATA);
			if (m_new == NULL)
				goto fail;
			m_new->m_len = seglen;
			prev = top = m_new;
			memcpy(mtod(m_new, void *), &m->m_epg_hdr[segoff],
			    seglen);
		}
	}
	/* One EXT_SFBUF mbuf per page that still holds wanted data. */
	pgoff = m->m_epg_1st_off;
	for (i = 0; i < m->m_epg_npgs && len > 0; i++) {
		pglen = m_epg_pagelen(m, i, pgoff);
		if (off >= pglen) {
			off -= pglen;
			pgoff = 0;
			continue;
		}
		seglen = pglen - off;
		segoff = pgoff + off;
		off = 0;
		seglen = min(seglen, len);
		len -= seglen;

		pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]);
		m_new = m_get(M_NOWAIT, MT_DATA);
		if (m_new == NULL)
			goto fail;
		/*
		 * Link the mbuf into the chain before the sf_buf is
		 * allocated so the failure path frees it with 'top'.
		 */
		if (top == NULL) {
			top = prev = m_new;
		} else {
			prev->m_next = m_new;
			prev = m_new;
		}
		sf = sf_buf_alloc(pg, SFB_NOWAIT);
		if (sf == NULL)
			goto fail;

		ref_inc++;
		m_extadd(m_new, (char *)sf_buf_kva(sf), PAGE_SIZE,
		    mb_unmapped_free_mext, sf, mref, M_RDONLY, EXT_SFBUF);
		m_new->m_data += segoff;
		m_new->m_len = seglen;

		pgoff = 0;
	};
	/* Whatever is left must live in the trailer; copy it out. */
	if (len != 0) {
		KASSERT((off + len) <= m->m_epg_trllen,
		    ("off + len > trail (%d + %d > %d)", off, len,
		    m->m_epg_trllen));
		m_new = m_get(M_NOWAIT, MT_DATA);
		if (m_new == NULL)
			goto fail;
		if (top == NULL)
			top = m_new;
		else
			prev->m_next = m_new;
		m_new->m_len = len;
		memcpy(mtod(m_new, void *), &m->m_epg_trail[off], len);
	}

	if (ref_inc != 0) {
		/*
		 * Obtain an additional reference on the old mbuf for
		 * each created EXT_SFBUF mbuf.  They will be dropped
		 * in mb_unmapped_free_mext().
		 */
		if (*refcnt == 1)
			*refcnt += ref_inc;
		else
			atomic_add_int(refcnt, ref_inc);
	}
	m_free(m);
	*mres = top;
	return (0);

fail:
	if (ref_inc != 0) {
		/*
		 * Obtain an additional reference on the old mbuf for
		 * each created EXT_SFBUF mbuf.  They will be
		 * immediately dropped when these mbufs are freed
		 * below.
		 */
		if (*refcnt == 1)
			*refcnt += ref_inc;
		else
			atomic_add_int(refcnt, ref_inc);
	}
	m_free(m);
	m_freem(top);
	*mres = NULL;
	return (ENOMEM);
}

/*
 * Walk the chain 'top' and replace every M_EXTPG mbuf with the mapped
 * chain produced by _mb_unmapped_to_ext().
 *
 * On success the resulting chain is stored in *mres and 0 is returned.
 * On failure everything is freed, *mres is set to NULL and the error
 * from _mb_unmapped_to_ext() is returned.
 */
int
mb_unmapped_to_ext(struct mbuf *top, struct mbuf **mres)
{
	struct mbuf *m, *m1, *next, *prev = NULL;
	int error;

	prev = NULL;
	for (m = top; m != NULL; m = next) {
		/* m might be freed, so cache the next pointer. */
		next = m->m_next;
		if (m->m_flags & M_EXTPG) {
			if (prev != NULL) {
				/*
				 * Remove 'm' from the new chain so
				 * that the 'top' chain terminates
				 * before 'm' in case 'top' is freed
				 * due to an error.
				 */
				prev->m_next = NULL;
			}
			error = _mb_unmapped_to_ext(m, &m1);
			if (error != 0) {
				/*
				 * 'm' was already freed by the helper, so
				 * free the converted part only when it is
				 * distinct from 'm', then the unvisited
				 * remainder.
				 */
				if (top != m)
					m_freem(top);
				m_freem(next);
				*mres = NULL;
				return (error);
			}
			m = m1;
			if (prev == NULL) {
				top = m;
			} else {
				prev->m_next = m;
			}

			/*
			 * Replaced one mbuf with a chain, so we must
			 * find the end of chain.
			 */
			prev = m_last(m);
		} else {
			if (prev != NULL) {
				prev->m_next = m;
			}
			prev = m;
		}
	}
	*mres = top;
	return (0);
}

/*
 * Allocate an empty M_EXTPG mbuf.  The ext_free routine is
 * responsible for freeing any pages backing this mbuf when it is
 * freed.
*/
struct mbuf *
mb_alloc_ext_pgs(int how, m_ext_free_t ext_free)
{
	struct mbuf *m;

	m = m_get(how, MT_DATA);
	if (m == NULL)
		return (NULL);

	/* Start with no pages, no header/trailer and no TLS state. */
	m->m_epg_npgs = 0;
	m->m_epg_nrdy = 0;
	m->m_epg_1st_off = 0;
	m->m_epg_last_len = 0;
	m->m_epg_flags = 0;
	m->m_epg_hdrlen = 0;
	m->m_epg_trllen = 0;
	m->m_epg_tls = NULL;
	m->m_epg_so = NULL;
	m->m_data = NULL;
	m->m_flags |= (M_EXT | M_RDONLY | M_EXTPG);
	/* The reference count is embedded in this mbuf and starts at one. */
	m->m_ext.ext_flags = EXT_FLAG_EMBREF;
	m->m_ext.ext_count = 1;
	m->m_ext.ext_size = 0;
	m->m_ext.ext_free = ext_free;
	return (m);
}

/*
 * Clean up after mbufs with M_EXT storage attached to them if the
 * reference count hits 1.
 *
 * 'm' is the mbuf being released; 'mref' below is the mbuf that holds
 * the reference count, which may or may not be 'm' itself.
 */
void
mb_free_ext(struct mbuf *m)
{
	volatile u_int *refcnt;
	struct mbuf *mref;
	int freembuf;

	KASSERT(m->m_flags & M_EXT, ("%s: M_EXT not set on %p", __func__, m));

	/* See if this is the mbuf that holds the embedded refcount. */
	if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
		refcnt = &m->m_ext.ext_count;
		mref = m;
	} else {
		KASSERT(m->m_ext.ext_cnt != NULL,
		    ("%s: no refcounting pointer on %p", __func__, m));
		refcnt = m->m_ext.ext_cnt;
		mref = __containerof(refcnt, struct mbuf, m_ext.ext_count);
	}

	/*
	 * Check if the header is embedded in the cluster.  It is
	 * important that we can't touch any of the mbuf fields
	 * after we have freed the external storage, since mbuf
	 * could have been embedded in it.  For now, the mbufs
	 * embedded into the cluster are always of type EXT_EXTREF,
	 * and for this type we won't free the mref.
	 */
	if (m->m_flags & M_NOFREE) {
		freembuf = 0;
		KASSERT(m->m_ext.ext_type == EXT_EXTREF ||
		    m->m_ext.ext_type == EXT_RXRING,
		    ("%s: no-free mbuf %p has wrong type", __func__, m));
	} else
		freembuf = 1;

	/* Free attached storage if this mbuf is the only reference to it. */
	if (*refcnt == 1 || atomic_fetchadd_int(refcnt, -1) == 1) {
		switch (m->m_ext.ext_type) {
		case EXT_PACKET:
			/*
			 * The packet zone is special: the mbuf goes back
			 * with its cluster still attached, so the count is
			 * put back to one if the decrement above took it
			 * to zero.
			 */
			if (*refcnt == 0)
				*refcnt = 1;
			uma_zfree(zone_pack, mref);
			break;
		case EXT_CLUSTER:
			uma_zfree(zone_clust, m->m_ext.ext_buf);
			m_free_raw(mref);
			break;
		case EXT_JUMBOP:
			uma_zfree(zone_jumbop, m->m_ext.ext_buf);
			m_free_raw(mref);
			break;
		case EXT_JUMBO9:
			uma_zfree(zone_jumbo9, m->m_ext.ext_buf);
			m_free_raw(mref);
			break;
		case EXT_JUMBO16:
			uma_zfree(zone_jumbo16, m->m_ext.ext_buf);
			m_free_raw(mref);
			break;
		case EXT_SFBUF:
		case EXT_NET_DRV:
		case EXT_MOD_TYPE:
		case EXT_DISPOSABLE:
			/* Storage with a free routine owned by 'mref'. */
			KASSERT(mref->m_ext.ext_free != NULL,
			    ("%s: ext_free not set", __func__));
			mref->m_ext.ext_free(mref);
			m_free_raw(mref);
			break;
		case EXT_EXTREF:
			/* Only the storage is released; no mbuf is freed here. */
			KASSERT(m->m_ext.ext_free != NULL,
			    ("%s: ext_free not set", __func__));
			m->m_ext.ext_free(m);
			break;
		case EXT_RXRING:
			KASSERT(m->m_ext.ext_free == NULL,
			    ("%s: ext_free is set", __func__));
			break;
		default:
			KASSERT(m->m_ext.ext_type == 0,
			    ("%s: unknown ext_type", __func__));
		}
	}

	/* 'mref' was handled above; free 'm' only if it is a different mbuf. */
	if (freembuf && m != mref)
		m_free_raw(m);
}

/*
 * Clean up after mbufs with M_EXTPG storage attached to them if the
 * reference count hits 1.
 */
void
mb_free_extpg(struct mbuf *m)
{
	volatile u_int *refcnt;
	struct mbuf *mref;

	M_ASSERTEXTPG(m);

	/* See if this is the mbuf that holds the embedded refcount. */
	if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
		refcnt = &m->m_ext.ext_count;
		mref = m;
	} else {
		KASSERT(m->m_ext.ext_cnt != NULL,
		    ("%s: no refcounting pointer on %p", __func__, m));
		refcnt = m->m_ext.ext_cnt;
		mref = __containerof(refcnt, struct mbuf, m_ext.ext_count);
	}

	/* Free attached storage if this mbuf is the only reference to it. */
	if (*refcnt == 1 || atomic_fetchadd_int(refcnt, -1) == 1) {
		KASSERT(mref->m_ext.ext_free != NULL,
		    ("%s: ext_free not set", __func__));

		mref->m_ext.ext_free(mref);
#ifdef KERN_TLS
		/*
		 * With a TLS session attached, the mbuf is queued via
		 * ktls_enqueue_to_free() when refcount_release_if_not_last()
		 * returns false; otherwise it is freed right here.
		 */
		if (mref->m_epg_tls != NULL &&
		    !refcount_release_if_not_last(&mref->m_epg_tls->refcount))
			ktls_enqueue_to_free(mref);
		else
#endif
			m_free_raw(mref);
	}

	if (m != mref)
		m_free_raw(m);
}

/*
 * Official mbuf(9) allocation KPI for stack and drivers:
 *
 * m_get()	- a single mbuf without any attachments, sys/mbuf.h.
 * m_gethdr()	- a single mbuf initialized as M_PKTHDR, sys/mbuf.h.
 * m_getcl()	- an mbuf + 2k cluster, sys/mbuf.h.
 * m_clget()	- attach cluster to already allocated mbuf.
 * m_cljget()	- attach jumbo cluster to already allocated mbuf.
 * m_get2()	- allocate minimum mbuf that would fit size argument.
 * m_getm2()	- allocate a chain of mbufs/clusters.
 * m_extadd()	- attach external cluster to mbuf.
 *
 * m_free()	- free single mbuf with its tags and ext, sys/mbuf.h.
 * m_freem()	- free chain of mbufs.
 */

/*
 * Attach a cluster from zone_clust to 'm', which must not already have
 * M_EXT set.  Returns non-zero (the M_EXT bit of m_flags) on success and
 * 0 if no cluster could be attached.
 */
int
m_clget(struct mbuf *m, int how)
{

	KASSERT((m->m_flags & M_EXT) == 0, ("%s: mbuf %p has M_EXT",
	    __func__, m));
	/* Cleared first so that a failed allocation can be detected below. */
	m->m_ext.ext_buf = (char *)NULL;
	uma_zalloc_arg(zone_clust, m, how);
	/*
	 * On a cluster allocation failure, drain the packet zone and retry,
	 * we might be able to loosen a few clusters up on the drain.
	 */
	if ((how & M_NOWAIT) && (m->m_ext.ext_buf == NULL)) {
		uma_zone_reclaim(zone_pack, UMA_RECLAIM_DRAIN);
		uma_zalloc_arg(zone_clust, m, how);
	}
	MBUF_PROBE2(m__clget, m, how);
	return (m->m_flags & M_EXT);
}

/*
 * m_cljget() is different from m_clget() as it can allocate clusters without
 * attaching them to an mbuf.  In that case the return value is the pointer
 * to the cluster of the requested size.  If an mbuf was specified, it gets
 * the cluster attached to it and the return value can be safely ignored.
 * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES.
*/ void * m_cljget(struct mbuf *m, int how, int size) { uma_zone_t zone; void *retval; if (m != NULL) { KASSERT((m->m_flags & M_EXT) == 0, ("%s: mbuf %p has M_EXT", __func__, m)); m->m_ext.ext_buf = NULL; } zone = m_getzone(size); retval = uma_zalloc_arg(zone, m, how); MBUF_PROBE4(m__cljget, m, how, size, retval); return (retval); } /* * m_get2() allocates minimum mbuf that would fit "size" argument. */ struct mbuf * m_get2(int size, int how, short type, int flags) { struct mb_args args; struct mbuf *m, *n; args.flags = flags; args.type = type; if (size <= MHLEN || (size <= MLEN && (flags & M_PKTHDR) == 0)) return (uma_zalloc_arg(zone_mbuf, &args, how)); if (size <= MCLBYTES) return (uma_zalloc_arg(zone_pack, &args, how)); if (size > MJUMPAGESIZE) return (NULL); m = uma_zalloc_arg(zone_mbuf, &args, how); if (m == NULL) return (NULL); n = uma_zalloc_arg(zone_jumbop, m, how); if (n == NULL) { m_free_raw(m); return (NULL); } return (m); } /* * m_get3() allocates minimum mbuf that would fit "size" argument. * Unlike m_get2() it can allocate clusters up to MJUM16BYTES. */ struct mbuf * m_get3(int size, int how, short type, int flags) { struct mb_args args; struct mbuf *m, *n; uma_zone_t zone; if (size <= MJUMPAGESIZE) return (m_get2(size, how, type, flags)); if (size > MJUM16BYTES) return (NULL); args.flags = flags; args.type = type; m = uma_zalloc_arg(zone_mbuf, &args, how); if (m == NULL) return (NULL); if (size <= MJUM9BYTES) zone = zone_jumbo9; else zone = zone_jumbo16; n = uma_zalloc_arg(zone, m, how); if (n == NULL) { m_free_raw(m); return (NULL); } return (m); } /* * m_getjcl() returns an mbuf with a cluster of the specified size attached. * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES. 
*/ struct mbuf * m_getjcl(int how, short type, int flags, int size) { struct mb_args args; struct mbuf *m, *n; uma_zone_t zone; if (size == MCLBYTES) return m_getcl(how, type, flags); args.flags = flags; args.type = type; m = uma_zalloc_arg(zone_mbuf, &args, how); if (m == NULL) return (NULL); zone = m_getzone(size); n = uma_zalloc_arg(zone, m, how); if (n == NULL) { m_free_raw(m); return (NULL); } MBUF_PROBE5(m__getjcl, how, type, flags, size, m); return (m); } /* * Allocate a given length worth of mbufs and/or clusters (whatever fits * best) and return a pointer to the top of the allocated chain. If an * existing mbuf chain is provided, then we will append the new chain * to the existing one and return a pointer to the provided mbuf. */ struct mbuf * m_getm2(struct mbuf *m, int len, int how, short type, int flags) { struct mbuf *mb, *nm = NULL, *mtail = NULL; KASSERT(len >= 0, ("%s: len is < 0", __func__)); /* Validate flags. */ flags &= (M_PKTHDR | M_EOR); /* Packet header mbuf must be first in chain. */ if ((flags & M_PKTHDR) && m != NULL) flags &= ~M_PKTHDR; /* Loop and append maximum sized mbufs to the chain tail. */ while (len > 0) { mb = NULL; if (len > MCLBYTES) { mb = m_getjcl(M_NOWAIT, type, (flags & M_PKTHDR), MJUMPAGESIZE); } if (mb == NULL) { if (len >= MINCLSIZE) mb = m_getcl(how, type, (flags & M_PKTHDR)); else if (flags & M_PKTHDR) mb = m_gethdr(how, type); else mb = m_get(how, type); /* * Fail the whole operation if one mbuf can't be * allocated. */ if (mb == NULL) { m_freem(nm); return (NULL); } } /* Book keeping. */ len -= M_SIZE(mb); if (mtail != NULL) mtail->m_next = mb; else nm = mb; mtail = mb; flags &= ~M_PKTHDR; /* Only valid on the first mbuf. */ } if (flags & M_EOR) mtail->m_flags |= M_EOR; /* Only valid on the last mbuf. */ /* If mbuf was supplied, append new chain to the end of it. 
*/ if (m != NULL) { for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next) ; mtail->m_next = nm; mtail->m_flags &= ~M_EOR; } else m = nm; return (m); } /*- * Configure a provided mbuf to refer to the provided external storage * buffer and setup a reference count for said buffer. * * Arguments: * mb The existing mbuf to which to attach the provided buffer. * buf The address of the provided external storage buffer. * size The size of the provided buffer. * freef A pointer to a routine that is responsible for freeing the * provided external storage buffer. * args A pointer to an argument structure (of any type) to be passed * to the provided freef routine (may be NULL). * flags Any other flags to be passed to the provided mbuf. * type The type that the external storage buffer should be * labeled with. * * Returns: * Nothing. */ void m_extadd(struct mbuf *mb, char *buf, u_int size, m_ext_free_t freef, void *arg1, void *arg2, int flags, int type) { KASSERT(type != EXT_CLUSTER, ("%s: EXT_CLUSTER not allowed", __func__)); mb->m_flags |= (M_EXT | flags); mb->m_ext.ext_buf = buf; mb->m_data = mb->m_ext.ext_buf; mb->m_ext.ext_size = size; mb->m_ext.ext_free = freef; mb->m_ext.ext_arg1 = arg1; mb->m_ext.ext_arg2 = arg2; mb->m_ext.ext_type = type; if (type != EXT_EXTREF) { mb->m_ext.ext_count = 1; mb->m_ext.ext_flags = EXT_FLAG_EMBREF; } else mb->m_ext.ext_flags = 0; } /* * Free an entire chain of mbufs and associated external buffers, if * applicable. */ void m_freem(struct mbuf *mb) { MBUF_PROBE1(m__freem, mb); while (mb != NULL) mb = m_free(mb); } /* * Free an entire chain of mbufs and associated external buffers, following * both m_next and m_nextpkt linkage. * Note: doesn't support NULL argument. */ void m_freemp(struct mbuf *m) { struct mbuf *n; MBUF_PROBE1(m__freemp, m); do { n = m->m_nextpkt; while (m != NULL) m = m_free(m); m = n; } while (m != NULL); } /* * Temporary primitive to allow freeing without going through m_free. 
*/ void m_free_raw(struct mbuf *mb) { uma_zfree(zone_mbuf, mb); } int m_snd_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params, struct m_snd_tag **mstp) { return (if_snd_tag_alloc(ifp, params, mstp)); } void m_snd_tag_init(struct m_snd_tag *mst, struct ifnet *ifp, const struct if_snd_tag_sw *sw) { if_ref(ifp); mst->ifp = ifp; refcount_init(&mst->refcount, 1); mst->sw = sw; counter_u64_add(snd_tag_count, 1); } void m_snd_tag_destroy(struct m_snd_tag *mst) { struct ifnet *ifp; ifp = mst->ifp; mst->sw->snd_tag_free(mst); if_rele(ifp); counter_u64_add(snd_tag_count, -1); } void m_rcvif_serialize(struct mbuf *m) { u_short idx, gen; M_ASSERTPKTHDR(m); idx = if_getindex(m->m_pkthdr.rcvif); gen = if_getidxgen(m->m_pkthdr.rcvif); m->m_pkthdr.rcvidx = idx; m->m_pkthdr.rcvgen = gen; if (__predict_false(m->m_pkthdr.leaf_rcvif != NULL)) { idx = if_getindex(m->m_pkthdr.leaf_rcvif); gen = if_getidxgen(m->m_pkthdr.leaf_rcvif); } else { idx = -1; gen = 0; } m->m_pkthdr.leaf_rcvidx = idx; m->m_pkthdr.leaf_rcvgen = gen; } struct ifnet * m_rcvif_restore(struct mbuf *m) { struct ifnet *ifp, *leaf_ifp; M_ASSERTPKTHDR(m); NET_EPOCH_ASSERT(); ifp = ifnet_byindexgen(m->m_pkthdr.rcvidx, m->m_pkthdr.rcvgen); if (ifp == NULL || (if_getflags(ifp) & IFF_DYING)) return (NULL); if (__predict_true(m->m_pkthdr.leaf_rcvidx == (u_short)-1)) { leaf_ifp = NULL; } else { leaf_ifp = ifnet_byindexgen(m->m_pkthdr.leaf_rcvidx, m->m_pkthdr.leaf_rcvgen); if (__predict_false(leaf_ifp != NULL && (if_getflags(leaf_ifp) & IFF_DYING))) leaf_ifp = NULL; } m->m_pkthdr.leaf_rcvif = leaf_ifp; m->m_pkthdr.rcvif = ifp; return (ifp); } /* * Allocate an mbuf with anonymous external pages. 
*/ struct mbuf * mb_alloc_ext_plus_pages(int len, int how) { struct mbuf *m; vm_page_t pg; int i, npgs; m = mb_alloc_ext_pgs(how, mb_free_mext_pgs); if (m == NULL) return (NULL); m->m_epg_flags |= EPG_FLAG_ANON; npgs = howmany(len, PAGE_SIZE); for (i = 0; i < npgs; i++) { do { pg = vm_page_alloc_noobj(VM_ALLOC_NODUMP | VM_ALLOC_WIRED); if (pg == NULL) { if (how == M_NOWAIT) { m->m_epg_npgs = i; m_free(m); return (NULL); } vm_wait(NULL); } } while (pg == NULL); m->m_epg_pa[i] = VM_PAGE_TO_PHYS(pg); } m->m_epg_npgs = npgs; return (m); } /* * Copy the data in the mbuf chain to a chain of mbufs with anonymous external * unmapped pages. * len is the length of data in the input mbuf chain. * mlen is the maximum number of bytes put into each ext_page mbuf. */ struct mbuf * mb_mapped_to_unmapped(struct mbuf *mp, int len, int mlen, int how, struct mbuf **mlast) { struct mbuf *m, *mout; char *pgpos, *mbpos; int i, mblen, mbufsiz, pglen, xfer; if (len == 0) return (NULL); mbufsiz = min(mlen, len); m = mout = mb_alloc_ext_plus_pages(mbufsiz, how); if (m == NULL) return (m); pgpos = (char *)(void *)PHYS_TO_DMAP(m->m_epg_pa[0]); pglen = PAGE_SIZE; mblen = 0; i = 0; do { if (pglen == 0) { if (++i == m->m_epg_npgs) { m->m_epg_last_len = PAGE_SIZE; mbufsiz = min(mlen, len); m->m_next = mb_alloc_ext_plus_pages(mbufsiz, how); m = m->m_next; if (m == NULL) { m_freem(mout); return (m); } i = 0; } pgpos = (char *)(void *)PHYS_TO_DMAP(m->m_epg_pa[i]); pglen = PAGE_SIZE; } while (mblen == 0) { if (mp == NULL) { m_freem(mout); return (NULL); } KASSERT((mp->m_flags & M_EXTPG) == 0, ("mb_copym_ext_pgs: ext_pgs input mbuf")); mbpos = mtod(mp, char *); mblen = mp->m_len; mp = mp->m_next; } xfer = min(mblen, pglen); memcpy(pgpos, mbpos, xfer); pgpos += xfer; mbpos += xfer; pglen -= xfer; mblen -= xfer; len -= xfer; m->m_len += xfer; } while (len > 0); m->m_epg_last_len = PAGE_SIZE - pglen; if (mlast != NULL) *mlast = m; return (mout); } diff --git a/sys/sys/mbuf.h b/sys/sys/mbuf.h index 
c17fc9dec9a4..65a328cb52a1 100644 --- a/sys/sys/mbuf.h +++ b/sys/sys/mbuf.h @@ -1,1723 +1,1729 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)mbuf.h 8.5 (Berkeley) 2/19/95 */ #ifndef _SYS_MBUF_H_ #define _SYS_MBUF_H_ /* XXX: These includes suck. Sorry! 
*/ #include #ifdef _KERNEL #include #include #include #include #define MBUF_PROBE1(probe, arg0) \ SDT_PROBE1(sdt, , , probe, arg0) #define MBUF_PROBE2(probe, arg0, arg1) \ SDT_PROBE2(sdt, , , probe, arg0, arg1) #define MBUF_PROBE3(probe, arg0, arg1, arg2) \ SDT_PROBE3(sdt, , , probe, arg0, arg1, arg2) #define MBUF_PROBE4(probe, arg0, arg1, arg2, arg3) \ SDT_PROBE4(sdt, , , probe, arg0, arg1, arg2, arg3) #define MBUF_PROBE5(probe, arg0, arg1, arg2, arg3, arg4) \ SDT_PROBE5(sdt, , , probe, arg0, arg1, arg2, arg3, arg4) SDT_PROBE_DECLARE(sdt, , , m__init); SDT_PROBE_DECLARE(sdt, , , m__gethdr_raw); SDT_PROBE_DECLARE(sdt, , , m__gethdr); SDT_PROBE_DECLARE(sdt, , , m__get_raw); SDT_PROBE_DECLARE(sdt, , , m__get); SDT_PROBE_DECLARE(sdt, , , m__getcl); SDT_PROBE_DECLARE(sdt, , , m__getjcl); SDT_PROBE_DECLARE(sdt, , , m__clget); SDT_PROBE_DECLARE(sdt, , , m__cljget); SDT_PROBE_DECLARE(sdt, , , m__cljset); SDT_PROBE_DECLARE(sdt, , , m__free); SDT_PROBE_DECLARE(sdt, , , m__freem); SDT_PROBE_DECLARE(sdt, , , m__freemp); #endif /* _KERNEL */ /* * Mbufs are of a single size, MSIZE (sys/param.h), which includes overhead. * An mbuf may add a single "mbuf cluster" of size MCLBYTES (also in * sys/param.h), which has no additional overhead and is used instead of the * internal data area; this is done when at least MINCLSIZE of data must be * stored. Additionally, it is possible to allocate a separate buffer * externally and attach it to the mbuf in a way similar to that of mbuf * clusters. * * NB: These calculation do not take actual compiler-induced alignment and * padding inside the complete struct mbuf into account. Appropriate * attention is required when changing members of struct mbuf. * * MLEN is data length in a normal mbuf. * MHLEN is data length in an mbuf with pktheader. * MINCLSIZE is a smallest amount of data that should be put into cluster. * * Compile-time assertions in uipc_mbuf.c test these values to ensure that * they are sensible. 
*/ struct mbuf; #define MHSIZE offsetof(struct mbuf, m_dat) #define MPKTHSIZE offsetof(struct mbuf, m_pktdat) #define MLEN ((int)(MSIZE - MHSIZE)) #define MHLEN ((int)(MSIZE - MPKTHSIZE)) #define MINCLSIZE (MHLEN + 1) #define M_NODOM 255 #ifdef _KERNEL /*- * Macro for type conversion: convert mbuf pointer to data pointer of correct * type: * * mtod(m, t) -- Convert mbuf pointer to data pointer of correct type. * mtodo(m, o) -- Same as above but with offset 'o' into data. */ #define mtod(m, t) ((t)((m)->m_data)) #define mtodo(m, o) ((void *)(((m)->m_data) + (o))) /* * Argument structure passed to UMA routines during mbuf and packet * allocations. */ struct mb_args { int flags; /* Flags for mbuf being allocated */ short type; /* Type of mbuf being allocated */ }; #endif /* _KERNEL */ /* * Packet tag structure (see below for details). */ struct m_tag { SLIST_ENTRY(m_tag) m_tag_link; /* List of packet tags */ u_int16_t m_tag_id; /* Tag ID */ u_int16_t m_tag_len; /* Length of data */ u_int32_t m_tag_cookie; /* ABI/Module ID */ void (*m_tag_free)(struct m_tag *); }; /* * Static network interface owned tag. * Allocated through ifp->if_snd_tag_alloc(). */ struct if_snd_tag_sw; struct m_snd_tag { struct ifnet *ifp; /* network interface tag belongs to */ const struct if_snd_tag_sw *sw; volatile u_int refcount; }; /* * Record/packet header in first mbuf of chain; valid only if M_PKTHDR is set. * Size ILP32: 56 * LP64: 64 * Compile-time assertions in uipc_mbuf.c test these values to ensure that * they are correct. */ struct pkthdr { union { struct m_snd_tag *snd_tag; /* send tag, if any */ struct ifnet *rcvif; /* rcv interface */ struct { uint16_t rcvidx; /* rcv interface index ... */ uint16_t rcvgen; /* ... and generation count */ }; }; union { struct ifnet *leaf_rcvif; /* leaf rcv interface */ struct { uint16_t leaf_rcvidx; /* leaf rcv interface index ... */ uint16_t leaf_rcvgen; /* ... 
and generation count */ }; }; SLIST_HEAD(packet_tags, m_tag) tags; /* list of packet tags */ int32_t len; /* total packet length */ /* Layer crossing persistent information. */ uint32_t flowid; /* packet's 4-tuple system */ uint32_t csum_flags; /* checksum and offload features */ uint16_t fibnum; /* this packet should use this fib */ uint8_t numa_domain; /* NUMA domain of recvd pkt */ uint8_t rsstype; /* hash type */ #if !defined(__LP64__) uint32_t pad; /* pad for 64bit alignment */ #endif union { uint64_t rcv_tstmp; /* timestamp in ns */ struct { uint8_t l2hlen; /* layer 2 hdr len */ uint8_t l3hlen; /* layer 3 hdr len */ uint8_t l4hlen; /* layer 4 hdr len */ uint8_t l5hlen; /* layer 5 hdr len */ uint8_t inner_l2hlen; uint8_t inner_l3hlen; uint8_t inner_l4hlen; uint8_t inner_l5hlen; }; }; union { uint8_t eight[8]; uint16_t sixteen[4]; uint32_t thirtytwo[2]; uint64_t sixtyfour[1]; uintptr_t unintptr[1]; void *ptr; } PH_per; /* Layer specific non-persistent local storage for reassembly, etc. */ union { union { uint8_t eight[8]; uint16_t sixteen[4]; uint32_t thirtytwo[2]; uint64_t sixtyfour[1]; uintptr_t unintptr[1]; void *ptr; } PH_loc; /* Upon allocation: total packet memory consumption. 
*/ u_int memlen; }; }; #define ether_vtag PH_per.sixteen[0] #define tcp_tun_port PH_per.sixteen[0] /* outbound */ #define vt_nrecs PH_per.sixteen[0] /* mld and v6-ND */ #define tso_segsz PH_per.sixteen[1] /* inbound after LRO */ #define lro_nsegs tso_segsz /* inbound after LRO */ #define csum_data PH_per.thirtytwo[1] /* inbound from hardware up */ #define lro_tcp_d_len PH_loc.sixteen[0] /* inbound during LRO (no reassembly) */ #define lro_tcp_d_csum PH_loc.sixteen[1] /* inbound during LRO (no reassembly) */ #define lro_tcp_h_off PH_loc.sixteen[2] /* inbound during LRO (no reassembly) */ #define lro_etype PH_loc.sixteen[3] /* inbound during LRO (no reassembly) */ /* Note PH_loc is used during IP reassembly (all 8 bytes as a ptr) */ /* * TLS records for TLS 1.0-1.2 can have the following header lengths: * - 5 (AES-CBC with implicit IV) * - 21 (AES-CBC with explicit IV) * - 13 (AES-GCM with 8 byte explicit IV) */ #define MBUF_PEXT_HDR_LEN 23 /* * TLS records for TLS 1.0-1.2 can have the following maximum trailer * lengths: * - 16 (AES-GCM) * - 36 (AES-CBC with SHA1 and up to 16 bytes of padding) * - 48 (AES-CBC with SHA2-256 and up to 16 bytes of padding) * - 64 (AES-CBC with SHA2-384 and up to 16 bytes of padding) */ #define MBUF_PEXT_TRAIL_LEN 64 #if defined(__LP64__) #define MBUF_PEXT_MAX_PGS (40 / sizeof(vm_paddr_t)) #else #define MBUF_PEXT_MAX_PGS (64 / sizeof(vm_paddr_t)) #endif #define MBUF_PEXT_MAX_BYTES \ (MBUF_PEXT_MAX_PGS * PAGE_SIZE + MBUF_PEXT_HDR_LEN + MBUF_PEXT_TRAIL_LEN) struct ktls_session; struct socket; /* * Description of external storage mapped into mbuf; valid only if M_EXT is * set. * Size ILP32: 28 * LP64: 48 * Compile-time assertions in uipc_mbuf.c test these values to ensure that * they are correct. */ typedef void m_ext_free_t(struct mbuf *); struct m_ext { union { /* * If EXT_FLAG_EMBREF is set, then we use refcount in the * mbuf, the 'ext_count' member. Otherwise, we have a * shadow copy and we use pointer 'ext_cnt'. 
The original * mbuf is responsible to carry the pointer to free routine * and its arguments. They aren't copied into shadows in * mb_dupcl() to avoid dereferencing next cachelines. */ volatile u_int ext_count; volatile u_int *ext_cnt; }; uint32_t ext_size; /* size of buffer, for ext_free */ uint32_t ext_type:8, /* type of external storage */ ext_flags:24; /* external storage mbuf flags */ union { struct { /* * Regular M_EXT mbuf: * o ext_buf always points to the external buffer. * o ext_free (below) and two optional arguments * ext_arg1 and ext_arg2 store the free context for * the external storage. They are set only in the * refcount carrying mbuf, the one with * EXT_FLAG_EMBREF flag, with exclusion for * EXT_EXTREF type, where the free context is copied * into all mbufs that use same external storage. */ char *ext_buf; /* start of buffer */ #define m_ext_copylen offsetof(struct m_ext, ext_arg2) void *ext_arg2; }; struct { /* * Multi-page M_EXTPG mbuf: * o extpg_pa - page vector. * o extpg_trail and extpg_hdr - TLS trailer and * header. * Uses ext_free and may also use ext_arg1. */ vm_paddr_t extpg_pa[MBUF_PEXT_MAX_PGS]; char extpg_trail[MBUF_PEXT_TRAIL_LEN]; char extpg_hdr[MBUF_PEXT_HDR_LEN]; /* Pretend these 3 fields are part of mbuf itself. */ #define m_epg_pa m_ext.extpg_pa #define m_epg_trail m_ext.extpg_trail #define m_epg_hdr m_ext.extpg_hdr #define m_epg_ext_copylen offsetof(struct m_ext, ext_free) }; }; /* * Free method and optional argument pointer, both * used by M_EXT and M_EXTPG. */ m_ext_free_t *ext_free; void *ext_arg1; }; /* * The core of the mbuf object along with some shortcut defines for practical * purposes. */ struct mbuf { /* * Header present at the beginning of every mbuf. * Size ILP32: 24 * LP64: 32 * Compile-time assertions in uipc_mbuf.c test these values to ensure * that they are correct. 
*/ union { /* next buffer in chain */ struct mbuf *m_next; SLIST_ENTRY(mbuf) m_slist; STAILQ_ENTRY(mbuf) m_stailq; }; union { /* next chain in queue/record */ struct mbuf *m_nextpkt; SLIST_ENTRY(mbuf) m_slistpkt; STAILQ_ENTRY(mbuf) m_stailqpkt; }; caddr_t m_data; /* location of data */ int32_t m_len; /* amount of data in this mbuf */ uint32_t m_type:8, /* type of data in this mbuf */ m_flags:24; /* flags; see below */ #if !defined(__LP64__) uint32_t m_pad; /* pad for 64bit alignment */ #endif /* * A set of optional headers (packet header, external storage header) * and internal data storage. Historically, these arrays were sized * to MHLEN (space left after a packet header) and MLEN (space left * after only a regular mbuf header); they are now variable size in * order to support future work on variable-size mbufs. */ union { struct { union { /* M_PKTHDR set. */ struct pkthdr m_pkthdr; /* M_EXTPG set. * Multi-page M_EXTPG mbuf has its meta data * split between the below anonymous structure * and m_ext. It carries vector of pages, * optional header and trailer char vectors * and pointers to socket/TLS data. */ #define m_epg_startcopy m_epg_npgs #define m_epg_endcopy m_epg_stailq struct { /* Overall count of pages and count of * pages with I/O pending. */ uint8_t m_epg_npgs; uint8_t m_epg_nrdy; /* TLS header and trailer lengths. * The data itself resides in m_ext. */ uint8_t m_epg_hdrlen; uint8_t m_epg_trllen; /* Offset into 1st page and length of * data in the last page. */ uint16_t m_epg_1st_off; uint16_t m_epg_last_len; uint8_t m_epg_flags; #define EPG_FLAG_ANON 0x1 /* Data can be encrypted in place. */ #define EPG_FLAG_2FREE 0x2 /* Scheduled for free. */ uint8_t m_epg_record_type; uint8_t __spare[2]; int m_epg_enc_cnt; struct ktls_session *m_epg_tls; struct socket *m_epg_so; uint64_t m_epg_seqno; STAILQ_ENTRY(mbuf) m_epg_stailq; }; }; union { /* M_EXT or M_EXTPG set. */ struct m_ext m_ext; /* M_PKTHDR set, neither M_EXT nor M_EXTPG. 
*/ char m_pktdat[0]; }; }; char m_dat[0]; /* !M_PKTHDR, !M_EXT */ }; }; #ifdef _KERNEL static inline int m_epg_pagelen(const struct mbuf *m, int pidx, int pgoff) { KASSERT(pgoff == 0 || pidx == 0, ("page %d with non-zero offset %d in %p", pidx, pgoff, m)); if (pidx == m->m_epg_npgs - 1) { return (m->m_epg_last_len); } else { return (PAGE_SIZE - pgoff); } } #ifdef INVARIANTS #define MCHECK(ex, msg) KASSERT((ex), \ ("Multi page mbuf %p with " #msg " at %s:%d", \ m, __FILE__, __LINE__)) /* * NB: This expects a non-empty buffer (npgs > 0 and * last_pg_len > 0). */ #define MBUF_EXT_PGS_ASSERT_SANITY(m) do { \ MCHECK(m->m_epg_npgs > 0, "no valid pages"); \ MCHECK(m->m_epg_npgs <= nitems(m->m_epg_pa), \ "too many pages"); \ MCHECK(m->m_epg_nrdy <= m->m_epg_npgs, \ "too many ready pages"); \ MCHECK(m->m_epg_1st_off < PAGE_SIZE, \ "too large page offset"); \ MCHECK(m->m_epg_last_len > 0, "zero last page length"); \ MCHECK(m->m_epg_last_len <= PAGE_SIZE, \ "too large last page length"); \ if (m->m_epg_npgs == 1) \ MCHECK(m->m_epg_1st_off + \ m->m_epg_last_len <= PAGE_SIZE, \ "single page too large"); \ MCHECK(m->m_epg_hdrlen <= sizeof(m->m_epg_hdr), \ "too large header length"); \ MCHECK(m->m_epg_trllen <= sizeof(m->m_epg_trail), \ "too large header length"); \ } while (0) #else #define MBUF_EXT_PGS_ASSERT_SANITY(m) do {} while (0) #endif #endif /* * mbuf flags of global significance and layer crossing. * Those of only protocol/layer specific significance are to be mapped * to M_PROTO[1-11] and cleared at layer handoff boundaries. * NB: Limited to the lower 24 bits. 
*/ #define M_EXT 0x00000001 /* has associated external storage */ #define M_PKTHDR 0x00000002 /* start of record */ #define M_EOR 0x00000004 /* end of record */ #define M_RDONLY 0x00000008 /* associated data is marked read-only */ #define M_BCAST 0x00000010 /* send/received as link-level broadcast */ #define M_MCAST 0x00000020 /* send/received as link-level multicast */ #define M_PROMISC 0x00000040 /* packet was not for us */ #define M_VLANTAG 0x00000080 /* ether_vtag is valid */ #define M_EXTPG 0x00000100 /* has array of unmapped pages and TLS */ #define M_NOFREE 0x00000200 /* do not free mbuf, embedded in cluster */ #define M_TSTMP 0x00000400 /* rcv_tstmp field is valid */ #define M_TSTMP_HPREC 0x00000800 /* rcv_tstmp is high-prec, typically hw-stamped on port (useful for IEEE 1588 and 802.1AS) */ #define M_TSTMP_LRO 0x00001000 /* Time LRO pushed in pkt is valid in (PH_loc) */ #define M_PROTO1 0x00002000 /* protocol-specific */ #define M_PROTO2 0x00004000 /* protocol-specific */ #define M_PROTO3 0x00008000 /* protocol-specific */ #define M_PROTO4 0x00010000 /* protocol-specific */ #define M_PROTO5 0x00020000 /* protocol-specific */ #define M_PROTO6 0x00040000 /* protocol-specific */ #define M_PROTO7 0x00080000 /* protocol-specific */ #define M_PROTO8 0x00100000 /* protocol-specific */ #define M_PROTO9 0x00200000 /* protocol-specific */ #define M_PROTO10 0x00400000 /* protocol-specific */ #define M_PROTO11 0x00800000 /* protocol-specific */ /* * Flags to purge when crossing layers. */ #define M_PROTOFLAGS \ (M_PROTO1|M_PROTO2|M_PROTO3|M_PROTO4|M_PROTO5|M_PROTO6|M_PROTO7|M_PROTO8|\ M_PROTO9|M_PROTO10|M_PROTO11) /* * Flags preserved when copying m_pkthdr. */ #define M_COPYFLAGS \ (M_PKTHDR|M_EOR|M_RDONLY|M_BCAST|M_MCAST|M_PROMISC|M_VLANTAG|M_TSTMP| \ M_TSTMP_HPREC|M_TSTMP_LRO|M_PROTOFLAGS) /* * Flags preserved during demote. */ #define M_DEMOTEFLAGS \ (M_EXT | M_RDONLY | M_NOFREE | M_EXTPG) /* * Mbuf flag description for use with printf(9) %b identifier. 
*/ #define M_FLAG_BITS \ "\20\1M_EXT\2M_PKTHDR\3M_EOR\4M_RDONLY\5M_BCAST\6M_MCAST" \ "\7M_PROMISC\10M_VLANTAG\11M_EXTPG\12M_NOFREE\13M_TSTMP\14M_TSTMP_HPREC\15M_TSTMP_LRO" #define M_FLAG_PROTOBITS \ "\16M_PROTO1\17M_PROTO2\20M_PROTO3\21M_PROTO4" \ "\22M_PROTO5\23M_PROTO6\24M_PROTO7\25M_PROTO8\26M_PROTO9" \ "\27M_PROTO10\28M_PROTO11" #define M_FLAG_PRINTF (M_FLAG_BITS M_FLAG_PROTOBITS) /* * Network interface cards are able to hash protocol fields (such as IPv4 * addresses and TCP port numbers) classify packets into flows. These flows * can then be used to maintain ordering while delivering packets to the OS * via parallel input queues, as well as to provide a stateless affinity * model. NIC drivers can pass up the hash via m->m_pkthdr.flowid, and set * m_flag fields to indicate how the hash should be interpreted by the * network stack. * * Most NICs support RSS, which provides ordering and explicit affinity, and * use the hash m_flag bits to indicate what header fields were covered by * the hash. M_HASHTYPE_OPAQUE and M_HASHTYPE_OPAQUE_HASH can be set by non- * RSS cards or configurations that provide an opaque flow identifier, allowing * for ordering and distribution without explicit affinity. Additionally, * M_HASHTYPE_OPAQUE_HASH indicates that the flow identifier has hash * properties. * * The meaning of the IPV6_EX suffix: * "o Home address from the home address option in the IPv6 destination * options header. If the extension header is not present, use the Source * IPv6 Address. * o IPv6 address that is contained in the Routing-Header-Type-2 from the * associated extension header. If the extension header is not present, * use the Destination IPv6 Address." 
* Quoted from: * https://docs.microsoft.com/en-us/windows-hardware/drivers/network/rss-hashing-types#ndishashipv6ex */ #define M_HASHTYPE_HASHPROP 0x80 /* has hash properties */ #define M_HASHTYPE_INNER 0x40 /* calculated from inner headers */ #define M_HASHTYPE_HASH(t) (M_HASHTYPE_HASHPROP | (t)) /* Microsoft RSS standard hash types */ #define M_HASHTYPE_NONE 0 #define M_HASHTYPE_RSS_IPV4 M_HASHTYPE_HASH(1) /* IPv4 2-tuple */ #define M_HASHTYPE_RSS_TCP_IPV4 M_HASHTYPE_HASH(2) /* TCPv4 4-tuple */ #define M_HASHTYPE_RSS_IPV6 M_HASHTYPE_HASH(3) /* IPv6 2-tuple */ #define M_HASHTYPE_RSS_TCP_IPV6 M_HASHTYPE_HASH(4) /* TCPv6 4-tuple */ #define M_HASHTYPE_RSS_IPV6_EX M_HASHTYPE_HASH(5) /* IPv6 2-tuple + * ext hdrs */ #define M_HASHTYPE_RSS_TCP_IPV6_EX M_HASHTYPE_HASH(6) /* TCPv6 4-tuple + * ext hdrs */ #define M_HASHTYPE_RSS_UDP_IPV4 M_HASHTYPE_HASH(7) /* IPv4 UDP 4-tuple*/ #define M_HASHTYPE_RSS_UDP_IPV6 M_HASHTYPE_HASH(9) /* IPv6 UDP 4-tuple*/ #define M_HASHTYPE_RSS_UDP_IPV6_EX M_HASHTYPE_HASH(10)/* IPv6 UDP 4-tuple + * ext hdrs */ #define M_HASHTYPE_OPAQUE 0x3f /* ordering, not affinity */ #define M_HASHTYPE_OPAQUE_HASH M_HASHTYPE_HASH(M_HASHTYPE_OPAQUE) /* ordering+hash, not affinity*/ #define M_HASHTYPE_CLEAR(m) ((m)->m_pkthdr.rsstype = 0) #define M_HASHTYPE_GET(m) ((m)->m_pkthdr.rsstype & ~M_HASHTYPE_INNER) #define M_HASHTYPE_SET(m, v) ((m)->m_pkthdr.rsstype = (v)) #define M_HASHTYPE_TEST(m, v) (M_HASHTYPE_GET(m) == (v)) #define M_HASHTYPE_ISHASH(m) \ (((m)->m_pkthdr.rsstype & M_HASHTYPE_HASHPROP) != 0) #define M_HASHTYPE_SETINNER(m) do { \ (m)->m_pkthdr.rsstype |= M_HASHTYPE_INNER; \ } while (0) /* * External mbuf storage buffer types. 
*/
/* These values are stored in the 8-bit m_ext.ext_type field. */
#define	EXT_CLUSTER	1	/* mbuf cluster */
#define	EXT_SFBUF	2	/* sendfile(2)'s sf_buf */
#define	EXT_JUMBOP	3	/* jumbo cluster page sized */
#define	EXT_JUMBO9	4	/* jumbo cluster 9216 bytes */
#define	EXT_JUMBO16	5	/* jumbo cluster 16384 bytes */
#define	EXT_PACKET	6	/* mbuf+cluster from packet zone */
#define	EXT_MBUF	7	/* external mbuf reference */
#define	EXT_RXRING	8	/* data in NIC receive ring */

#define	EXT_VENDOR1	224	/* for vendor-internal use */
#define	EXT_VENDOR2	225	/* for vendor-internal use */
#define	EXT_VENDOR3	226	/* for vendor-internal use */
#define	EXT_VENDOR4	227	/* for vendor-internal use */

#define	EXT_EXP1	244	/* for experimental use */
#define	EXT_EXP2	245	/* for experimental use */
#define	EXT_EXP3	246	/* for experimental use */
#define	EXT_EXP4	247	/* for experimental use */

#define	EXT_NET_DRV	252	/* custom ext_buf provided by net driver(s) */
#define	EXT_MOD_TYPE	253	/* custom module's ext_buf type */
#define	EXT_DISPOSABLE	254	/* can throw this buffer away w/page flipping */
#define	EXT_EXTREF	255	/* has externally maintained ext_cnt ptr */

/*
 * Flags for external mbuf buffer types.
 * NB: limited to the lower 24 bits.
 */
#define	EXT_FLAG_EMBREF		0x000001	/* embedded ext_count */
#define	EXT_FLAG_EXTREF		0x000002	/* external ext_cnt, notyet */

#define	EXT_FLAG_NOFREE		0x000010	/* don't free mbuf to pool, notyet */

#define	EXT_FLAG_VENDOR1	0x010000	/* These flags are vendor */
#define	EXT_FLAG_VENDOR2	0x020000	/* or submodule specific, */
#define	EXT_FLAG_VENDOR3	0x040000	/* not used by mbuf code. */
#define	EXT_FLAG_VENDOR4	0x080000	/* Set/read by submodule. */

#define	EXT_FLAG_EXP1		0x100000	/* for experimental use */
#define	EXT_FLAG_EXP2		0x200000	/* for experimental use */
#define	EXT_FLAG_EXP3		0x400000	/* for experimental use */
#define	EXT_FLAG_EXP4		0x800000	/* for experimental use */

/*
 * EXT flag description for use with printf(9) %b identifier.
*/ #define EXT_FLAG_BITS \ "\20\1EXT_FLAG_EMBREF\2EXT_FLAG_EXTREF\5EXT_FLAG_NOFREE" \ "\21EXT_FLAG_VENDOR1\22EXT_FLAG_VENDOR2\23EXT_FLAG_VENDOR3" \ "\24EXT_FLAG_VENDOR4\25EXT_FLAG_EXP1\26EXT_FLAG_EXP2\27EXT_FLAG_EXP3" \ "\30EXT_FLAG_EXP4" /* * Flags indicating checksum, segmentation and other offload work to be * done, or already done, by hardware or lower layers. It is split into * separate inbound and outbound flags. * * Outbound flags that are set by upper protocol layers requesting lower * layers, or ideally the hardware, to perform these offloading tasks. * For outbound packets this field and its flags can be directly tested * against ifnet if_hwassist. Note that the outbound and the inbound flags do * not collide right now but they could be allowed to (as long as the flags are * scrubbed appropriately when the direction of an mbuf changes). CSUM_BITS * would also have to split into CSUM_BITS_TX and CSUM_BITS_RX. * * CSUM_INNER_ is the same as CSUM_ but it applies to the inner frame. * The CSUM_ENCAP_ bits identify the outer encapsulation. 
*/ #define CSUM_IP 0x00000001 /* IP header checksum offload */ #define CSUM_IP_UDP 0x00000002 /* UDP checksum offload */ #define CSUM_IP_TCP 0x00000004 /* TCP checksum offload */ #define CSUM_IP_SCTP 0x00000008 /* SCTP checksum offload */ #define CSUM_IP_TSO 0x00000010 /* TCP segmentation offload */ #define CSUM_IP_ISCSI 0x00000020 /* iSCSI checksum offload */ #define CSUM_INNER_IP6_UDP 0x00000040 #define CSUM_INNER_IP6_TCP 0x00000080 #define CSUM_INNER_IP6_TSO 0x00000100 #define CSUM_IP6_UDP 0x00000200 /* UDP checksum offload */ #define CSUM_IP6_TCP 0x00000400 /* TCP checksum offload */ #define CSUM_IP6_SCTP 0x00000800 /* SCTP checksum offload */ #define CSUM_IP6_TSO 0x00001000 /* TCP segmentation offload */ #define CSUM_IP6_ISCSI 0x00002000 /* iSCSI checksum offload */ #define CSUM_INNER_IP 0x00004000 #define CSUM_INNER_IP_UDP 0x00008000 #define CSUM_INNER_IP_TCP 0x00010000 #define CSUM_INNER_IP_TSO 0x00020000 #define CSUM_ENCAP_VXLAN 0x00040000 /* VXLAN outer encapsulation */ #define CSUM_ENCAP_RSVD1 0x00080000 /* Inbound checksum support where the checksum was verified by hardware. 
*/ #define CSUM_INNER_L3_CALC 0x00100000 #define CSUM_INNER_L3_VALID 0x00200000 #define CSUM_INNER_L4_CALC 0x00400000 #define CSUM_INNER_L4_VALID 0x00800000 #define CSUM_L3_CALC 0x01000000 /* calculated layer 3 csum */ #define CSUM_L3_VALID 0x02000000 /* checksum is correct */ #define CSUM_L4_CALC 0x04000000 /* calculated layer 4 csum */ #define CSUM_L4_VALID 0x08000000 /* checksum is correct */ #define CSUM_L5_CALC 0x10000000 /* calculated layer 5 csum */ #define CSUM_L5_VALID 0x20000000 /* checksum is correct */ #define CSUM_COALESCED 0x40000000 /* contains merged segments */ #define CSUM_SND_TAG 0x80000000 /* Packet header has send tag */ #define CSUM_FLAGS_TX (CSUM_IP | CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP_SCTP | \ CSUM_IP_TSO | CSUM_IP_ISCSI | CSUM_INNER_IP6_UDP | CSUM_INNER_IP6_TCP | \ CSUM_INNER_IP6_TSO | CSUM_IP6_UDP | CSUM_IP6_TCP | CSUM_IP6_SCTP | \ CSUM_IP6_TSO | CSUM_IP6_ISCSI | CSUM_INNER_IP | CSUM_INNER_IP_UDP | \ CSUM_INNER_IP_TCP | CSUM_INNER_IP_TSO | CSUM_ENCAP_VXLAN | \ CSUM_ENCAP_RSVD1 | CSUM_SND_TAG) #define CSUM_FLAGS_RX (CSUM_INNER_L3_CALC | CSUM_INNER_L3_VALID | \ CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID | CSUM_L3_CALC | CSUM_L3_VALID | \ CSUM_L4_CALC | CSUM_L4_VALID | CSUM_L5_CALC | CSUM_L5_VALID | \ CSUM_COALESCED) /* * CSUM flag description for use with printf(9) %b identifier. */ #define CSUM_BITS \ "\20\1CSUM_IP\2CSUM_IP_UDP\3CSUM_IP_TCP\4CSUM_IP_SCTP\5CSUM_IP_TSO" \ "\6CSUM_IP_ISCSI\7CSUM_INNER_IP6_UDP\10CSUM_INNER_IP6_TCP" \ "\11CSUM_INNER_IP6_TSO\12CSUM_IP6_UDP\13CSUM_IP6_TCP\14CSUM_IP6_SCTP" \ "\15CSUM_IP6_TSO\16CSUM_IP6_ISCSI\17CSUM_INNER_IP\20CSUM_INNER_IP_UDP" \ "\21CSUM_INNER_IP_TCP\22CSUM_INNER_IP_TSO\23CSUM_ENCAP_VXLAN" \ "\24CSUM_ENCAP_RSVD1\25CSUM_INNER_L3_CALC\26CSUM_INNER_L3_VALID" \ "\27CSUM_INNER_L4_CALC\30CSUM_INNER_L4_VALID\31CSUM_L3_CALC" \ "\32CSUM_L3_VALID\33CSUM_L4_CALC\34CSUM_L4_VALID\35CSUM_L5_CALC" \ "\36CSUM_L5_VALID\37CSUM_COALESCED\40CSUM_SND_TAG" /* CSUM flags compatibility mappings. 
*/ #define CSUM_IP_CHECKED CSUM_L3_CALC #define CSUM_IP_VALID CSUM_L3_VALID #define CSUM_DATA_VALID CSUM_L4_VALID #define CSUM_PSEUDO_HDR CSUM_L4_CALC #define CSUM_SCTP_VALID CSUM_L4_VALID #define CSUM_DELAY_DATA (CSUM_TCP|CSUM_UDP) #define CSUM_DELAY_IP CSUM_IP /* Only v4, no v6 IP hdr csum */ #define CSUM_DELAY_DATA_IPV6 (CSUM_TCP_IPV6|CSUM_UDP_IPV6) #define CSUM_DATA_VALID_IPV6 CSUM_DATA_VALID #define CSUM_TCP CSUM_IP_TCP #define CSUM_UDP CSUM_IP_UDP #define CSUM_SCTP CSUM_IP_SCTP #define CSUM_TSO (CSUM_IP_TSO|CSUM_IP6_TSO) #define CSUM_INNER_TSO (CSUM_INNER_IP_TSO|CSUM_INNER_IP6_TSO) #define CSUM_UDP_IPV6 CSUM_IP6_UDP #define CSUM_TCP_IPV6 CSUM_IP6_TCP #define CSUM_SCTP_IPV6 CSUM_IP6_SCTP #define CSUM_TLS_MASK (CSUM_L5_CALC|CSUM_L5_VALID) #define CSUM_TLS_DECRYPTED CSUM_L5_CALC /* * mbuf types describing the content of the mbuf (including external storage). */ #define MT_NOTMBUF 0 /* USED INTERNALLY ONLY! Object is not mbuf */ #define MT_DATA 1 /* dynamic (data) allocation */ #define MT_HEADER MT_DATA /* packet header, use M_PKTHDR instead */ #define MT_VENDOR1 4 /* for vendor-internal use */ #define MT_VENDOR2 5 /* for vendor-internal use */ #define MT_VENDOR3 6 /* for vendor-internal use */ #define MT_VENDOR4 7 /* for vendor-internal use */ #define MT_SONAME 8 /* socket name */ #define MT_EXP1 9 /* for experimental use */ #define MT_EXP2 10 /* for experimental use */ #define MT_EXP3 11 /* for experimental use */ #define MT_EXP4 12 /* for experimental use */ #define MT_CONTROL 14 /* extra-data protocol message */ #define MT_EXTCONTROL 15 /* control message with externalized contents */ #define MT_OOBDATA 16 /* expedited data */ #define MT_NOINIT 255 /* Not a type but a flag to allocate a non-initialized mbuf */ /* * String names of mbuf-related UMA(9) and malloc(9) types. Exposed to * !_KERNEL so that monitoring tools can look up the zones with * libmemstat(3). 
*/ #define MBUF_MEM_NAME "mbuf" #define MBUF_CLUSTER_MEM_NAME "mbuf_cluster" #define MBUF_PACKET_MEM_NAME "mbuf_packet" #define MBUF_JUMBOP_MEM_NAME "mbuf_jumbo_page" #define MBUF_JUMBO9_MEM_NAME "mbuf_jumbo_9k" #define MBUF_JUMBO16_MEM_NAME "mbuf_jumbo_16k" #define MBUF_TAG_MEM_NAME "mbuf_tag" #define MBUF_EXTREFCNT_MEM_NAME "mbuf_ext_refcnt" #define MBUF_EXTPGS_MEM_NAME "mbuf_extpgs" #ifdef _KERNEL union if_snd_tag_alloc_params; #define MBUF_CHECKSLEEP(how) do { \ if (how == M_WAITOK) \ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, \ "Sleeping in \"%s\"", __func__); \ } while (0) /* * Network buffer allocation API * * The rest of it is defined in kern/kern_mbuf.c */ extern uma_zone_t zone_mbuf; extern uma_zone_t zone_clust; extern uma_zone_t zone_pack; extern uma_zone_t zone_jumbop; extern uma_zone_t zone_jumbo9; extern uma_zone_t zone_jumbo16; extern uma_zone_t zone_extpgs; void mb_dupcl(struct mbuf *, struct mbuf *); void mb_free_ext(struct mbuf *); void mb_free_extpg(struct mbuf *); void mb_free_mext_pgs(struct mbuf *); struct mbuf *mb_alloc_ext_pgs(int, m_ext_free_t); struct mbuf *mb_alloc_ext_plus_pages(int, int); struct mbuf *mb_mapped_to_unmapped(struct mbuf *, int, int, int, struct mbuf **); int mb_unmapped_compress(struct mbuf *m); int mb_unmapped_to_ext(struct mbuf *m, struct mbuf **mres); void mb_free_notready(struct mbuf *m, int count); void m_adj(struct mbuf *, int); void m_adj_decap(struct mbuf *, int); int m_apply(struct mbuf *, int, int, int (*)(void *, void *, u_int), void *); int m_append(struct mbuf *, int, c_caddr_t); void m_cat(struct mbuf *, struct mbuf *); void m_catpkt(struct mbuf *, struct mbuf *); int m_clget(struct mbuf *m, int how); void *m_cljget(struct mbuf *m, int how, int size); struct mbuf *m_collapse(struct mbuf *, int, int); void m_copyback(struct mbuf *, int, int, c_caddr_t); void m_copydata(const struct mbuf *, int, int, caddr_t); struct mbuf *m_copym(struct mbuf *, int, int, int); struct mbuf *m_copypacket(struct mbuf *, 
int); void m_copy_pkthdr(struct mbuf *, struct mbuf *); struct mbuf *m_copyup(struct mbuf *, int, int); struct mbuf *m_defrag(struct mbuf *, int); void m_demote_pkthdr(struct mbuf *); void m_demote(struct mbuf *, int, int); struct mbuf *m_devget(char *, int, int, struct ifnet *, void (*)(char *, caddr_t, u_int)); void m_dispose_extcontrolm(struct mbuf *m); struct mbuf *m_dup(const struct mbuf *, int); int m_dup_pkthdr(struct mbuf *, const struct mbuf *, int); void m_extadd(struct mbuf *, char *, u_int, m_ext_free_t, void *, void *, int, int); u_int m_fixhdr(struct mbuf *); struct mbuf *m_fragment(struct mbuf *, int, int); void m_freem(struct mbuf *); void m_freemp(struct mbuf *); void m_free_raw(struct mbuf *); struct mbuf *m_get2(int, int, short, int); struct mbuf *m_get3(int, int, short, int); struct mbuf *m_getjcl(int, short, int, int); struct mbuf *m_getm2(struct mbuf *, int, int, short, int); struct mbuf *m_getptr(struct mbuf *, int, int *); u_int m_length(struct mbuf *, struct mbuf **); int m_mbuftouio(struct uio *, const struct mbuf *, int); void m_move_pkthdr(struct mbuf *, struct mbuf *); int m_pkthdr_init(struct mbuf *, int); struct mbuf *m_prepend(struct mbuf *, int, int); void m_print(const struct mbuf *, int); struct mbuf *m_pulldown(struct mbuf *, int, int, int *); struct mbuf *m_pullup(struct mbuf *, int); int m_sanity(struct mbuf *, int); struct mbuf *m_split(struct mbuf *, int, int); struct mbuf *m_uiotombuf(struct uio *, int, int, int, int); int m_unmapped_uiomove(const struct mbuf *, int, struct uio *, int); struct mbuf *m_unshare(struct mbuf *, int); int m_snd_tag_alloc(struct ifnet *, union if_snd_tag_alloc_params *, struct m_snd_tag **); void m_snd_tag_init(struct m_snd_tag *, struct ifnet *, const struct if_snd_tag_sw *); void m_snd_tag_destroy(struct m_snd_tag *); void m_rcvif_serialize(struct mbuf *); struct ifnet *m_rcvif_restore(struct mbuf *); static __inline int m_gettype(int size) { int type; switch (size) { case MSIZE: type = 
EXT_MBUF; break; case MCLBYTES: type = EXT_CLUSTER; break; +#if MJUMPAGESIZE != MCLBYTES case MJUMPAGESIZE: type = EXT_JUMBOP; break; +#endif case MJUM9BYTES: type = EXT_JUMBO9; break; case MJUM16BYTES: type = EXT_JUMBO16; break; default: panic("%s: invalid cluster size %d", __func__, size); } return (type); } /* * Associated an external reference counted buffer with an mbuf. */ static __inline void m_extaddref(struct mbuf *m, char *buf, u_int size, u_int *ref_cnt, m_ext_free_t freef, void *arg1, void *arg2) { KASSERT(ref_cnt != NULL, ("%s: ref_cnt not provided", __func__)); atomic_add_int(ref_cnt, 1); m->m_flags |= M_EXT; m->m_ext.ext_buf = buf; m->m_ext.ext_cnt = ref_cnt; m->m_data = m->m_ext.ext_buf; m->m_ext.ext_size = size; m->m_ext.ext_free = freef; m->m_ext.ext_arg1 = arg1; m->m_ext.ext_arg2 = arg2; m->m_ext.ext_type = EXT_EXTREF; m->m_ext.ext_flags = 0; } static __inline uma_zone_t m_getzone(int size) { uma_zone_t zone; switch (size) { case MCLBYTES: zone = zone_clust; break; +#if MJUMPAGESIZE != MCLBYTES case MJUMPAGESIZE: zone = zone_jumbop; break; +#endif case MJUM9BYTES: zone = zone_jumbo9; break; case MJUM16BYTES: zone = zone_jumbo16; break; default: panic("%s: invalid cluster size %d", __func__, size); } return (zone); } /* * Initialize an mbuf with linear storage. * * Inline because the consumer text overhead will be roughly the same to * initialize or call a function with this many parameters and M_PKTHDR * should go away with constant propagation for !MGETHDR. 
*/ static __inline int m_init(struct mbuf *m, int how, short type, int flags) { int error; m->m_next = NULL; m->m_nextpkt = NULL; m->m_data = m->m_dat; m->m_len = 0; m->m_flags = flags; m->m_type = type; if (flags & M_PKTHDR) error = m_pkthdr_init(m, how); else error = 0; MBUF_PROBE5(m__init, m, how, type, flags, error); return (error); } static __inline struct mbuf * m_get_raw(int how, short type) { struct mbuf *m; struct mb_args args; args.flags = 0; args.type = type | MT_NOINIT; m = uma_zalloc_arg(zone_mbuf, &args, how); MBUF_PROBE3(m__get_raw, how, type, m); return (m); } static __inline struct mbuf * m_get(int how, short type) { struct mbuf *m; struct mb_args args; args.flags = 0; args.type = type; m = uma_zalloc_arg(zone_mbuf, &args, how); MBUF_PROBE3(m__get, how, type, m); return (m); } static __inline struct mbuf * m_gethdr_raw(int how, short type) { struct mbuf *m; struct mb_args args; args.flags = M_PKTHDR; args.type = type | MT_NOINIT; m = uma_zalloc_arg(zone_mbuf, &args, how); MBUF_PROBE3(m__gethdr_raw, how, type, m); return (m); } static __inline struct mbuf * m_gethdr(int how, short type) { struct mbuf *m; struct mb_args args; args.flags = M_PKTHDR; args.type = type; m = uma_zalloc_arg(zone_mbuf, &args, how); MBUF_PROBE3(m__gethdr, how, type, m); return (m); } static __inline struct mbuf * m_getcl(int how, short type, int flags) { struct mbuf *m; struct mb_args args; args.flags = flags; args.type = type; m = uma_zalloc_arg(zone_pack, &args, how); MBUF_PROBE4(m__getcl, how, type, flags, m); return (m); } /* * XXX: m_cljset() is a dangerous API. One must attach only a new, * unreferenced cluster to an mbuf(9). It is not possible to assert * that, so care can be taken only by users of the API. 
*/ static __inline void m_cljset(struct mbuf *m, void *cl, int type) { int size; switch (type) { case EXT_CLUSTER: size = MCLBYTES; break; +#if MJUMPAGESIZE != MCLBYTES case EXT_JUMBOP: size = MJUMPAGESIZE; break; +#endif case EXT_JUMBO9: size = MJUM9BYTES; break; case EXT_JUMBO16: size = MJUM16BYTES; break; default: panic("%s: unknown cluster type %d", __func__, type); break; } m->m_data = m->m_ext.ext_buf = cl; m->m_ext.ext_free = m->m_ext.ext_arg1 = m->m_ext.ext_arg2 = NULL; m->m_ext.ext_size = size; m->m_ext.ext_type = type; m->m_ext.ext_flags = EXT_FLAG_EMBREF; m->m_ext.ext_count = 1; m->m_flags |= M_EXT; MBUF_PROBE3(m__cljset, m, cl, type); } static __inline void m_chtype(struct mbuf *m, short new_type) { m->m_type = new_type; } static __inline void m_clrprotoflags(struct mbuf *m) { while (m) { m->m_flags &= ~M_PROTOFLAGS; m = m->m_next; } } static __inline struct mbuf * m_last(struct mbuf *m) { while (m->m_next) m = m->m_next; return (m); } static inline u_int m_extrefcnt(struct mbuf *m) { KASSERT(m->m_flags & M_EXT, ("%s: M_EXT missing for %p", __func__, m)); return ((m->m_ext.ext_flags & EXT_FLAG_EMBREF) ? m->m_ext.ext_count : *m->m_ext.ext_cnt); } /* * mbuf, cluster, and external object allocation macros (for compatibility * purposes). */ #define M_MOVE_PKTHDR(to, from) m_move_pkthdr((to), (from)) #define MGET(m, how, type) ((m) = m_get((how), (type))) #define MGETHDR(m, how, type) ((m) = m_gethdr((how), (type))) #define MCLGET(m, how) m_clget((m), (how)) #define MEXTADD(m, buf, size, free, arg1, arg2, flags, type) \ m_extadd((m), (char *)(buf), (size), (free), (arg1), (arg2), \ (flags), (type)) #define m_getm(m, len, how, type) \ m_getm2((m), (len), (how), (type), M_PKTHDR) /* * Evaluate TRUE if it's safe to write to the mbuf m's data region (this can * be both the local data payload, or an external buffer area, depending on * whether M_EXT is set). 
*/ #define M_WRITABLE(m) (((m)->m_flags & (M_RDONLY | M_EXTPG)) == 0 && \ (!(((m)->m_flags & M_EXT)) || \ (m_extrefcnt(m) == 1))) /* Check if the supplied mbuf has a packet header, or else panic. */ #define M_ASSERTPKTHDR(m) \ KASSERT((m) != NULL && (m)->m_flags & M_PKTHDR, \ ("%s: no mbuf %p packet header!", __func__, (m))) /* Check if the supplied mbuf has no send tag, or else panic. */ #define M_ASSERT_NO_SND_TAG(m) \ KASSERT((m) != NULL && (m)->m_flags & M_PKTHDR && \ ((m)->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0, \ ("%s: receive mbuf %p has send tag!", __func__, (m))) /* Check if mbuf is multipage. */ #define M_ASSERTEXTPG(m) \ KASSERT(((m)->m_flags & (M_EXTPG|M_PKTHDR)) == M_EXTPG, \ ("%s: m %p is not multipage!", __func__, m)) /* * Ensure that the supplied mbuf is a valid, non-free mbuf. * * XXX: Broken at the moment. Need some UMA magic to make it work again. */ #define M_ASSERTVALID(m) \ KASSERT((((struct mbuf *)m)->m_flags & 0) == 0, \ ("%s: attempted use of a free mbuf %p!", __func__, (m))) /* Check whether any mbuf in the chain is unmapped. */ #ifdef INVARIANTS #define M_ASSERTMAPPED(m) do { \ for (struct mbuf *__m = (m); __m != NULL; __m = __m->m_next) \ KASSERT((__m->m_flags & M_EXTPG) == 0, \ ("%s: chain %p contains an unmapped mbuf", __func__, (m)));\ } while (0) #else #define M_ASSERTMAPPED(m) do {} while (0) #endif /* * Return the address of the start of the buffer associated with an mbuf, * handling external storage, packet-header mbufs, and regular data mbufs. */ #define M_START(m) \ (((m)->m_flags & M_EXTPG) ? NULL : \ ((m)->m_flags & M_EXT) ? (m)->m_ext.ext_buf : \ ((m)->m_flags & M_PKTHDR) ? &(m)->m_pktdat[0] : \ &(m)->m_dat[0]) /* * Return the size of the buffer associated with an mbuf, handling external * storage, packet-header mbufs, and regular data mbufs. */ #define M_SIZE(m) \ (((m)->m_flags & M_EXT) ? (m)->m_ext.ext_size : \ ((m)->m_flags & M_PKTHDR) ? 
MHLEN : \ MLEN) /* * Set the m_data pointer of a newly allocated mbuf to place an object of the * specified size at the end of the mbuf, longword aligned. * * NB: Historically, we had M_ALIGN(), MH_ALIGN(), and MEXT_ALIGN() as * separate macros, each asserting that it was called at the proper moment. * This required callers to themselves test the storage type and call the * right one. Rather than require callers to be aware of those layout * decisions, we centralize here. */ static __inline void m_align(struct mbuf *m, int len) { int adjust; KASSERT(m->m_data == M_START(m), ("%s: not a virgin mbuf %p", __func__, m)); adjust = M_SIZE(m) - len; m->m_data += adjust &~ (sizeof(long)-1); } #define M_ALIGN(m, len) m_align(m, len) #define MH_ALIGN(m, len) m_align(m, len) #define MEXT_ALIGN(m, len) m_align(m, len) /* * Compute the amount of space available before the current start of data in * an mbuf. * * The M_WRITABLE() is a temporary, conservative safety measure: the burden * of checking writability of the mbuf data area rests solely with the caller. * * NB: In previous versions, M_LEADINGSPACE() would only check M_WRITABLE() * for mbufs with external storage. We now allow mbuf-embedded data to be * read-only as well. */ #define M_LEADINGSPACE(m) \ (M_WRITABLE(m) ? ((m)->m_data - M_START(m)) : 0) /* * So M_TRAILINGROOM() is for when you want to know how much space * would be there if it was writable. This can be used to * detect changes in mbufs by knowing the value at one point * and then being able to compare it later to the current M_TRAILINGROOM(). * The TRAILINGSPACE() macro is not suitable for this since an mbuf * at one point might not be writable and then later it becomes writable * even though the space at the back of it has not changed. */ #define M_TRAILINGROOM(m) ((M_START(m) + M_SIZE(m)) - ((m)->m_data + (m)->m_len)) /* * Compute the amount of space available after the end of data in an mbuf. 
* * The M_WRITABLE() is a temporary, conservative safety measure: the burden * of checking writability of the mbuf data area rests solely with the caller. * * NB: In previous versions, M_TRAILINGSPACE() would only check M_WRITABLE() * for mbufs with external storage. We now allow mbuf-embedded data to be * read-only as well. */ #define M_TRAILINGSPACE(m) (M_WRITABLE(m) ? M_TRAILINGROOM(m) : 0) /* * Arrange to prepend space of size plen to mbuf m. If a new mbuf must be * allocated, how specifies whether to wait. If the allocation fails, the * original mbuf chain is freed and m is set to NULL. */ #define M_PREPEND(m, plen, how) do { \ struct mbuf **_mmp = &(m); \ struct mbuf *_mm = *_mmp; \ int _mplen = (plen); \ int __mhow = (how); \ \ MBUF_CHECKSLEEP(how); \ if (M_LEADINGSPACE(_mm) >= _mplen) { \ _mm->m_data -= _mplen; \ _mm->m_len += _mplen; \ } else \ _mm = m_prepend(_mm, _mplen, __mhow); \ if (_mm != NULL && _mm->m_flags & M_PKTHDR) \ _mm->m_pkthdr.len += _mplen; \ *_mmp = _mm; \ } while (0) /* * Change mbuf to new type. This is a relatively expensive operation and * should be avoided. */ #define MCHTYPE(m, t) m_chtype((m), (t)) /* Return the rcvif of a packet header. */ static __inline struct ifnet * m_rcvif(struct mbuf *m) { M_ASSERTPKTHDR(m); if (m->m_pkthdr.csum_flags & CSUM_SND_TAG) return (NULL); return (m->m_pkthdr.rcvif); } /* Length to m_copy to copy all. */ #define M_COPYALL 1000000000 extern u_int max_linkhdr; /* Largest link-level header */ extern u_int max_hdr; /* Largest link + protocol header */ extern u_int max_protohdr; /* Largest protocol header */ void max_linkhdr_grow(u_int); void max_protohdr_grow(u_int); extern int nmbclusters; /* Maximum number of clusters */ extern bool mb_use_ext_pgs; /* Use ext_pgs for sendfile */ /*- * Network packets may have annotations attached by affixing a list of * "packet tags" to the pkthdr structure. 
* Packet tags are dynamically
 * allocated semi-opaque data structures that have a fixed header
 * (struct m_tag) that specifies the size of the memory block and a
 * <cookie,type> pair that identifies it.  The cookie is a 32-bit unique
 * unsigned value used to identify a module or ABI.  By convention this value
 * is chosen as the date+time that the module is created, expressed as the
 * number of seconds since the epoch (e.g., using date -u +'%s').  The type
 * value is an ABI/module-specific value that identifies a particular
 * annotation and is private to the module.  For compatibility with systems
 * like OpenBSD that define packet tags w/o an ABI/module cookie, the value
 * PACKET_ABI_COMPAT is used to implement m_tag_get and m_tag_find
 * compatibility shim functions and several tag types are defined below.
 * Users that do not require compatibility should use a private cookie value
 * so that packet tag-related definitions can be maintained privately.
 *
 * Note that the packet tag returned by m_tag_alloc has the default memory
 * alignment implemented by malloc.  To reference private data one can use a
 * construct like:
 *
 *	struct m_tag *mtag = m_tag_alloc(...);
 *	struct foo *p = (struct foo *)(mtag+1);
 *
 * if the alignment of struct m_tag is sufficient for referencing members of
 * struct foo.  Otherwise it is necessary to embed struct m_tag within the
 * private data structure to ensure proper alignment; e.g.,
 *
 *	struct foo {
 *		struct m_tag	tag;
 *		...
 *	};
 *	struct foo *p = (struct foo *) m_tag_alloc(...);
 *	struct m_tag *mtag = &p->tag;
 */

/*
 * Persistent tags stay with an mbuf until the mbuf is reclaimed.  Otherwise
 * tags are expected to ``vanish'' when they pass through a network
 * interface.  For most interfaces this happens normally as the tags are
 * reclaimed when the mbuf is free'd.  However in some special cases
 * reclaiming must be done manually.  An example is packets that pass through
 * the loopback interface.  Also, one must be careful to do this when
 * ``turning around'' packets (e.g., icmp_reflect).
 *
 * To mark a tag persistent bit-or this flag in when defining the tag id.
 * The tag will then be treated as described above.
 */
#define	MTAG_PERSISTENT				0x800

#define	PACKET_TAG_NONE				0  /* Nadda */

/* Packet tags for use with PACKET_ABI_COMPAT. */
#define	PACKET_TAG_IPSEC_IN_DONE		1  /* IPsec applied, in */
#define	PACKET_TAG_IPSEC_OUT_DONE		2  /* IPsec applied, out */
#define	PACKET_TAG_IPSEC_IN_CRYPTO_DONE		3  /* NIC IPsec crypto done */
#define	PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED	4  /* NIC IPsec crypto req'ed */
#define	PACKET_TAG_IPSEC_IN_COULD_DO_CRYPTO	5  /* NIC notifies IPsec */
#define	PACKET_TAG_IPSEC_PENDING_TDB		6  /* Reminder to do IPsec */
#define	PACKET_TAG_BRIDGE			7  /* Bridge processing done */
#define	PACKET_TAG_GIF				8  /* GIF processing done */
#define	PACKET_TAG_GRE				9  /* GRE processing done */
#define	PACKET_TAG_IN_PACKET_CHECKSUM		10 /* NIC checksumming done */
#define	PACKET_TAG_ENCAP			11 /* Encap.  processing */
#define	PACKET_TAG_IPSEC_SOCKET			12 /* IPSEC socket ref */
#define	PACKET_TAG_IPSEC_HISTORY		13 /* IPSEC history */
#define	PACKET_TAG_IPV6_INPUT			14 /* IPV6 input processing */
#define	PACKET_TAG_DUMMYNET			15 /* dummynet info */
#define	PACKET_TAG_DIVERT			17 /* divert info */
#define	PACKET_TAG_IPFORWARD			18 /* ipforward info */
#define	PACKET_TAG_MACLABEL	(19 | MTAG_PERSISTENT) /* MAC label */
#define	PACKET_TAG_PF				21 /* PF/ALTQ information */
/* was	PACKET_TAG_RTSOCKFAM			25    rtsock sa family */
#define	PACKET_TAG_IPOPTIONS			27 /* Saved IP options */
#define	PACKET_TAG_CARP				28 /* CARP info */
#define	PACKET_TAG_IPSEC_NAT_T_PORTS		29 /* two uint16_t */
#define	PACKET_TAG_ND_OUTGOING			30 /* ND outgoing */
#define	PACKET_TAG_PF_REASSEMBLED		31

/* Specific cookies and tags. */

/* Packet tag routines.
*/ struct m_tag *m_tag_alloc(uint32_t, uint16_t, int, int); void m_tag_delete(struct mbuf *, struct m_tag *); void m_tag_delete_chain(struct mbuf *, struct m_tag *); void m_tag_free_default(struct m_tag *); struct m_tag *m_tag_locate(struct mbuf *, uint32_t, uint16_t, struct m_tag *); struct m_tag *m_tag_copy(struct m_tag *, int); int m_tag_copy_chain(struct mbuf *, const struct mbuf *, int); void m_tag_delete_nonpersistent(struct mbuf *); /* * Initialize the list of tags associated with an mbuf. */ static __inline void m_tag_init(struct mbuf *m) { SLIST_INIT(&m->m_pkthdr.tags); } /* * Set up the contents of a tag. Note that this does not fill in the free * method; the caller is expected to do that. * * XXX probably should be called m_tag_init, but that was already taken. */ static __inline void m_tag_setup(struct m_tag *t, uint32_t cookie, uint16_t type, int len) { t->m_tag_id = type; t->m_tag_len = len; t->m_tag_cookie = cookie; } /* * Reclaim resources associated with a tag. */ static __inline void m_tag_free(struct m_tag *t) { (*t->m_tag_free)(t); } /* * Return the first tag associated with an mbuf. */ static __inline struct m_tag * m_tag_first(struct mbuf *m) { return (SLIST_FIRST(&m->m_pkthdr.tags)); } /* * Return the next tag in the list of tags associated with an mbuf. */ static __inline struct m_tag * m_tag_next(struct mbuf *m __unused, struct m_tag *t) { return (SLIST_NEXT(t, m_tag_link)); } /* * Prepend a tag to the list of tags associated with an mbuf. */ static __inline void m_tag_prepend(struct mbuf *m, struct m_tag *t) { SLIST_INSERT_HEAD(&m->m_pkthdr.tags, t, m_tag_link); } /* * Unlink a tag from the list of tags associated with an mbuf. */ static __inline void m_tag_unlink(struct mbuf *m, struct m_tag *t) { SLIST_REMOVE(&m->m_pkthdr.tags, t, m_tag, m_tag_link); } /* These are for OpenBSD compatibility. 
*/ #define MTAG_ABI_COMPAT 0 /* compatibility ABI */ static __inline struct m_tag * m_tag_get(uint16_t type, int length, int wait) { return (m_tag_alloc(MTAG_ABI_COMPAT, type, length, wait)); } static __inline struct m_tag * m_tag_find(struct mbuf *m, uint16_t type, struct m_tag *start) { return (SLIST_EMPTY(&m->m_pkthdr.tags) ? (struct m_tag *)NULL : m_tag_locate(m, MTAG_ABI_COMPAT, type, start)); } static inline struct m_snd_tag * m_snd_tag_ref(struct m_snd_tag *mst) { refcount_acquire(&mst->refcount); return (mst); } static inline void m_snd_tag_rele(struct m_snd_tag *mst) { if (refcount_release(&mst->refcount)) m_snd_tag_destroy(mst); } static __inline struct mbuf * m_free(struct mbuf *m) { struct mbuf *n = m->m_next; MBUF_PROBE1(m__free, m); if ((m->m_flags & (M_PKTHDR|M_NOFREE)) == (M_PKTHDR|M_NOFREE)) m_tag_delete_chain(m, NULL); if (m->m_flags & M_PKTHDR && m->m_pkthdr.csum_flags & CSUM_SND_TAG) m_snd_tag_rele(m->m_pkthdr.snd_tag); if (m->m_flags & M_EXTPG) mb_free_extpg(m); else if (m->m_flags & M_EXT) mb_free_ext(m); else if ((m->m_flags & M_NOFREE) == 0) uma_zfree(zone_mbuf, m); return (n); } static __inline int rt_m_getfib(struct mbuf *m) { KASSERT(m->m_flags & M_PKTHDR, ("%s: Attempt to get FIB from non header mbuf %p", __func__, m)); return (m->m_pkthdr.fibnum); } #define M_GETFIB(_m) rt_m_getfib(_m) #define M_SETFIB(_m, _fib) do { \ KASSERT((_m)->m_flags & M_PKTHDR, \ ("%s: Attempt to set FIB on non header mbuf %p", __func__, (_m))); \ ((_m)->m_pkthdr.fibnum) = (_fib); \ } while (0) /* flags passed as first argument for "m_xxx_tcpip_hash()" */ #define MBUF_HASHFLAG_L2 (1 << 2) #define MBUF_HASHFLAG_L3 (1 << 3) #define MBUF_HASHFLAG_L4 (1 << 4) /* mbuf hashing helper routines */ uint32_t m_ether_tcpip_hash_init(void); uint32_t m_ether_tcpip_hash(const uint32_t, const struct mbuf *, uint32_t); uint32_t m_infiniband_tcpip_hash_init(void); uint32_t m_infiniband_tcpip_hash(const uint32_t, const struct mbuf *, uint32_t); #ifdef MBUF_PROFILING void 
m_profile(struct mbuf *m); #define M_PROFILE(m) m_profile(m) #else #define M_PROFILE(m) #endif struct mbufq { STAILQ_HEAD(, mbuf) mq_head; int mq_len; int mq_maxlen; }; static inline void mbufq_init(struct mbufq *mq, int maxlen) { STAILQ_INIT(&mq->mq_head); mq->mq_maxlen = maxlen; mq->mq_len = 0; } static inline struct mbuf * mbufq_flush(struct mbufq *mq) { struct mbuf *m; m = STAILQ_FIRST(&mq->mq_head); STAILQ_INIT(&mq->mq_head); mq->mq_len = 0; return (m); } static inline void mbufq_drain(struct mbufq *mq) { struct mbuf *m, *n; n = mbufq_flush(mq); while ((m = n) != NULL) { n = STAILQ_NEXT(m, m_stailqpkt); m_freem(m); } } static inline struct mbuf * mbufq_first(const struct mbufq *mq) { return (STAILQ_FIRST(&mq->mq_head)); } static inline struct mbuf * mbufq_last(const struct mbufq *mq) { return (STAILQ_LAST(&mq->mq_head, mbuf, m_stailqpkt)); } static inline bool mbufq_empty(const struct mbufq *mq) { return (mq->mq_len == 0); } static inline int mbufq_full(const struct mbufq *mq) { return (mq->mq_maxlen > 0 && mq->mq_len >= mq->mq_maxlen); } static inline int mbufq_len(const struct mbufq *mq) { return (mq->mq_len); } static inline int mbufq_enqueue(struct mbufq *mq, struct mbuf *m) { if (mbufq_full(mq)) return (ENOBUFS); STAILQ_INSERT_TAIL(&mq->mq_head, m, m_stailqpkt); mq->mq_len++; return (0); } static inline struct mbuf * mbufq_dequeue(struct mbufq *mq) { struct mbuf *m; m = STAILQ_FIRST(&mq->mq_head); if (m) { STAILQ_REMOVE_HEAD(&mq->mq_head, m_stailqpkt); m->m_nextpkt = NULL; mq->mq_len--; } return (m); } static inline void mbufq_prepend(struct mbufq *mq, struct mbuf *m) { STAILQ_INSERT_HEAD(&mq->mq_head, m, m_stailqpkt); mq->mq_len++; } /* * Note: this doesn't enforce the maximum list size for dst. 
*/ static inline void mbufq_concat(struct mbufq *mq_dst, struct mbufq *mq_src) { mq_dst->mq_len += mq_src->mq_len; STAILQ_CONCAT(&mq_dst->mq_head, &mq_src->mq_head); mq_src->mq_len = 0; } #ifdef _SYS_TIMESPEC_H_ static inline void mbuf_tstmp2timespec(struct mbuf *m, struct timespec *ts) { M_ASSERTPKTHDR(m); KASSERT((m->m_flags & (M_TSTMP|M_TSTMP_LRO)) != 0, ("%s: mbuf %p no M_TSTMP or M_TSTMP_LRO", __func__, m)); ts->tv_sec = m->m_pkthdr.rcv_tstmp / 1000000000; ts->tv_nsec = m->m_pkthdr.rcv_tstmp % 1000000000; } #endif static inline void mbuf_tstmp2timeval(struct mbuf *m, struct timeval *tv) { M_ASSERTPKTHDR(m); KASSERT((m->m_flags & (M_TSTMP|M_TSTMP_LRO)) != 0, ("%s: mbuf %p no M_TSTMP or M_TSTMP_LRO", __func__, m)); tv->tv_sec = m->m_pkthdr.rcv_tstmp / 1000000000; tv->tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000) / 1000; } #ifdef DEBUGNET /* Invoked from the debugnet client code. */ void debugnet_mbuf_drain(void); void debugnet_mbuf_start(void); void debugnet_mbuf_finish(void); void debugnet_mbuf_reinit(int nmbuf, int nclust, int clsize); #endif static inline bool mbuf_has_tls_session(struct mbuf *m) { if (m->m_flags & M_EXTPG) { if (m->m_epg_tls != NULL) { return (true); } } return (false); } #endif /* _KERNEL */ #endif /* !_SYS_MBUF_H_ */