diff --git a/sbin/ifconfig/ifconfig.8 b/sbin/ifconfig/ifconfig.8
index 8e83fbcc7730..54343ba489ee 100644
--- a/sbin/ifconfig/ifconfig.8
+++ b/sbin/ifconfig/ifconfig.8
@@ -1,3127 +1,3131 @@
 .\" Copyright (c) 1983, 1991, 1993
 .\"	The Regents of the University of California.  All rights reserved.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice, this list of conditions and the following disclaimer.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\" 3. Neither the name of the University nor the names of its contributors
 .\"    may be used to endorse or promote products derived from this software
 .\"    without specific prior written permission.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 .\" ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
 .\"     From: @(#)ifconfig.8	8.3 (Berkeley) 1/5/94
 .\" $FreeBSD$
 .\"
-.Dd October 25, 2020
+.Dd October 28, 2020
 .Dt IFCONFIG 8
 .Os
 .Sh NAME
 .Nm ifconfig
 .Nd configure network interface parameters
 .Sh SYNOPSIS
 .Nm
 .Op Fl f Ar type:format Ns Op Ar ,type:format
 .Op Fl L
 .Op Fl k
 .Op Fl m
 .Op Fl n
 .Ar interface
 .Op Cm create
 .Ar address_family
 .Oo
 .Ar address
 .Op Ar dest_address
 .Oc
 .Op Ar parameters
 .Nm
 .Ar interface
 .Cm destroy
 .Nm
 .Fl a
 .Op Fl L
 .Op Fl d
 .Op Fl [gG] Ar groupname
 .Op Fl m
 .Op Fl u
 .Op Fl v
 .Op Ar address_family
 .Nm
 .Fl l
 .Op Fl d
 .Op Fl u
 .Op Ar address_family
 .Nm
 .Op Fl L
 .Op Fl d
 .Op Fl k
 .Op Fl m
 .Op Fl u
 .Op Fl v
 .Op Fl C
 .Nm
 .Op Fl g Ar groupname
 .Sh DESCRIPTION
 The
 .Nm
 utility is used to assign an address
 to a network interface and/or configure
 network interface parameters.
 The
 .Nm
 utility must be used at boot time to define the network address
 of each interface present on a machine; it may also be used at
 a later time to redefine an interface's address
 or other operating parameters.
 .Pp
 The following options are available:
 .Bl -tag -width indent
 .It Ar address
 For the
 .Tn DARPA Ns -Internet
 family,
 the address is either a host name present in the host name data
 base,
 .Xr hosts 5 ,
 or a
 .Tn DARPA
 Internet address expressed in the Internet standard
 .Dq dot notation .
 .Pp
 It is also possible to use the CIDR notation (also known as the
 slash notation) to include the netmask.
 That is, one can specify an address like
 .Li 192.168.0.1/16 .
 .Pp
 For the
 .Dq inet6
 family, it is also possible to specify the prefix length using the slash
 notation, like
 .Li ::1/128 .
 See the
 .Cm prefixlen
 parameter below for more information.
 .\" For the Xerox Network Systems(tm) family,
 .\" addresses are
 .\" .Ar net:a.b.c.d.e.f ,
 .\" where
 .\" .Ar net
 .\" is the assigned network number (in decimal),
 .\" and each of the six bytes of the host number,
 .\" .Ar a
 .\" through
 .\" .Ar f ,
 .\" are specified in hexadecimal.
 .\" The host number may be omitted on IEEE 802 protocol
 .\" (Ethernet, FDDI, and Token Ring) interfaces,
 .\" which use the hardware physical address,
 .\" and on interfaces other than the first.
 .\" For the
 .\" .Tn ISO
 .\" family, addresses are specified as a long hexadecimal string,
 .\" as in the Xerox family.
 .\" However, two consecutive dots imply a zero
 .\" byte, and the dots are optional, if the user wishes to (carefully)
 .\" count out long strings of digits in network byte order.
 .Pp
 The link-level
 .Pq Dq link
 address
 is specified as a series of colon-separated hex digits.
 This can be used to, for example,
 set a new MAC address on an Ethernet interface, though the
 mechanism used is not Ethernet specific.
 Use the
 .Pq Dq random
 keyword to set a randomly generated MAC address.
 A randomly-generated MAC address might be the same as one already in use
 in the network.
 Such duplications are extremely unlikely.
 If the interface is already
 up when this option is used, it will be briefly brought down and
 then brought back up again in order to ensure that the receive
 filter in the underlying Ethernet hardware is properly reprogrammed.
 .It Ar address_family
 Specify the
 address family
 which affects interpretation of the remaining parameters.
 Since an interface can receive transmissions in differing protocols
 with different naming schemes, specifying the address family is recommended.
 The address or protocol families currently
 supported are
 .Dq inet ,
 .Dq inet6 ,
 and
 .Dq link .
 The default if available is
 .Dq inet
 or otherwise
 .Dq link .
 .Dq ether
 and
 .Dq lladdr
 are synonyms for
 .Dq link .
 When using the
 .Fl l
 flag, the
 .Dq ether
 address family has special meaning and is no longer synonymous with
 .Dq link
 or
 .Dq lladdr .
 Specifying
 .Fl l Dq ether
 will list only Ethernet interfaces, excluding all other interface types,
 including the loopback interface.
 .It Ar dest_address
 Specify the address of the correspondent on the other end
 of a point-to-point link.
 .It Ar interface
 This
 parameter is a string of the form
 .Dq name unit ,
 for example,
 .Dq Li em0 .
 .It Ar groupname
 List the interfaces in the given group.
 .El
 .Pp
 The output format of
 .Nm
 can be controlled using the
 .Fl f
 flag or the
 .Ev IFCONFIG_FORMAT
 environment variable.
 The format is specified as a comma separated list of
 .Sy type:format
 pairs.
 See the
 .Sx EXAMPLES
 section for more information.
 The
 .Sy types
 and their associated
 .Sy format
 strings are:
 .Bl -tag -width ether
 .It Sy addr
 Adjust the display of inet and inet6 addresses
 .Bl -tag -width default
 .It Sy default
 Display inet and inet6 addresses in the default format,
 .Sy numeric
 .It Sy fqdn
 Display inet and inet6 addresses as fully qualified domain names
 .Pq FQDN
 .It Sy host
 Display inet and inet6 addresses as unqualified hostnames
 .It Sy numeric
 Display inet and inet6 addresses in numeric format
 .El
 .It Sy ether
 Adjust the display of link-level ethernet (MAC) addresses
 .Bl -tag -width default
 .It Sy colon
 Separate address segments with a colon
 .It Sy dash
 Separate address segments with a dash
 .It Sy default
 Display ethernet addresses in the default format,
 .Sy colon
 .El
 .It Sy inet
 Adjust the display of inet address subnet masks:
 .Bl -tag -width default
 .It Sy cidr
 Display subnet masks in CIDR notation, for example:
 .br
 10.0.0.0/8 or 203.0.113.224/26
 .It Sy default
 Display subnet masks in the default format,
 .Sy hex
 .It Sy dotted
 Display subnet masks in dotted quad notation, for example:
 .br
 255.255.0.0 or 255.255.255.192
 .It Sy hex
 Display subnet masks in hexadecimal, for example:
 .br
 0xffff0000 or 0xffffffc0
 .El
 .It Sy inet6
 Adjust the display of inet6 address prefixes (subnet masks):
 .Bl -tag -width default
 .It Sy cidr
 Display subnet prefix in CIDR notation, for example:
 .br
 ::1/128 or fe80::1%lo0/64
 .It Sy default
 Display subnet prefix in the default format,
 .Sy numeric
 .It Sy numeric
 Display subnet prefix in integer format, for example:
 .br
 prefixlen 64
 .El
 .El
 .Pp
 The following parameters may be set with
 .Nm :
 .Bl -tag -width indent
 .It Cm add
 Another name for the
 .Cm alias
 parameter.
 Introduced for compatibility
 with
 .Bsx .
 .It Cm alias
 Establish an additional network address for this interface.
 This is sometimes useful when changing network numbers, and
 one wishes to accept packets addressed to the old interface.
 If the address is on the same subnet as the first network address
 for this interface, a non-conflicting netmask must be given.
 Usually
 .Li 0xffffffff
 is most appropriate.
 .It Fl alias
 Remove the network address specified.
 This would be used if you incorrectly specified an alias, or it
 was no longer needed.
 If you have incorrectly set an NS address having the side effect
 of specifying the host portion, removing all NS addresses will
 allow you to respecify the host portion.
 .It Cm anycast
 (Inet6 only.)
 Specify that the address configured is an anycast address.
 Based on the current specification,
 only routers may configure anycast addresses.
 An anycast address will not be used as the source address of any outgoing
 IPv6 packets.
 .It Cm arp
 Enable the use of the Address Resolution Protocol
 .Pq Xr arp 4
 in mapping
 between network level addresses and link level addresses (default).
 This is currently implemented for mapping between
 .Tn DARPA
 Internet
 addresses and
 .Tn IEEE
 802 48-bit MAC addresses (Ethernet, FDDI, and Token Ring addresses).
 .It Fl arp
 Disable the use of the Address Resolution Protocol
 .Pq Xr arp 4 .
 .It Cm staticarp
 If the Address Resolution Protocol is enabled,
 the host will only reply to requests for its addresses,
 and will never send any requests.
 .It Fl staticarp
 If the Address Resolution Protocol is enabled,
 the host will perform normally,
 sending out requests and listening for replies.
 .It Cm broadcast
 (Inet only.)
 Specify the address to use to represent broadcasts to the
 network.
 The default broadcast address is the address with a host part of all 1's.
 .It Cm debug
 Enable driver dependent debugging code; usually, this turns on
 extra console error logging.
 .It Fl debug
 Disable driver dependent debugging code.
 .It Cm promisc
 Put interface into permanently promiscuous mode.
 .It Fl promisc
 Disable permanently promiscuous mode.
 .It Cm delete
 Another name for the
 .Fl alias
 parameter.
 .It Cm description Ar value , Cm descr Ar value
 Specify a description of the interface.
 This can be used to label interfaces in situations where they may
 otherwise be difficult to distinguish.
 .It Cm -description , Cm -descr
 Clear the interface description.
 .It Cm down
 Mark an interface
 .Dq down .
 When an interface is marked
 .Dq down ,
 the system will not attempt to
 transmit messages through that interface.
 If possible, the interface will be reset to disable reception as well.
 This action does not automatically disable routes using the interface.
 .It Cm group Ar groupname
 Assign the interface to a
 .Dq group .
 Any interface can be in multiple groups.
 .Pp
 Cloned interfaces are members of their interface family group by default.
 For example, a PPP interface such as
 .Em ppp0
 is a member of the PPP interface family group,
 .Em ppp .
 .\" The interface(s) the default route(s) point to are members of the
 .\" .Em egress
 .\" interface group.
 .It Cm -group Ar groupname
 Remove the interface from the given
 .Dq group .
 .It Cm eui64
 (Inet6 only.)
 Fill interface index
 (lowermost 64 bits of an IPv6 address)
 automatically.
 .It Cm fib Ar fib_number
 Specify interface FIB.
 A FIB
 .Ar fib_number
 is assigned to all frames or packets received on that interface.
 The FIB is not inherited, e.g., vlans or other sub-interfaces will use
 the default FIB (0) irrespective of the parent interface's FIB.
 The kernel needs to be tuned to support more than the default FIB
 using the
 .Va ROUTETABLES
 kernel configuration option, or the
 .Va net.fibs
 tunable.
 .It Cm tunnelfib Ar fib_number
 Specify tunnel FIB.
 A FIB
 .Ar fib_number
 is assigned to all packets encapsulated by a tunnel interface, e.g.,
 .Xr gif 4
 and
 .Xr gre 4 .
 .It Cm maclabel Ar label
 If Mandatory Access Control support is enabled in the kernel,
 set the MAC label to
 .Ar label .
 .\" (see
 .\" .Xr maclabel 7 ) .
 .It Cm media Ar type
 If the driver supports the media selection system, set the media type
 of the interface to
 .Ar type .
 Some interfaces support the mutually exclusive use of one of several
 different physical media connectors.
 For example, a 10Mbit/s Ethernet
 interface might support the use of either
 .Tn AUI
 or twisted pair connectors.
 Setting the media type to
 .Cm 10base5/AUI
 would change the currently active connector to the AUI port.
 Setting it to
 .Cm 10baseT/UTP
 would activate twisted pair.
 Refer to the interfaces' driver
 specific documentation or man page for a complete list of the
 available types.
 .It Cm mediaopt Ar opts
 If the driver supports the media selection system, set the specified
 media options on the interface.
 The
 .Ar opts
 argument
 is a comma delimited list of options to apply to the interface.
 Refer to the interfaces' driver specific man page for a complete
 list of available options.
 .It Fl mediaopt Ar opts
 If the driver supports the media selection system, disable the
 specified media options on the interface.
 .It Cm mode Ar mode
 If the driver supports the media selection system, set the specified
 operating mode on the interface to
 .Ar mode .
 For IEEE 802.11 wireless interfaces that support multiple operating modes
 this directive is used to select between 802.11a
 .Pq Cm 11a ,
 802.11b
 .Pq Cm 11b ,
 and 802.11g
 .Pq Cm 11g
 operating modes.
 .It Cm txrtlmt
 Set if the driver supports TX rate limiting.
 .It Cm inst Ar minst , Cm instance Ar minst
 Set the media instance to
 .Ar minst .
 This is useful for devices which have multiple physical layer interfaces
 .Pq PHYs .
 .It Cm name Ar name
 Set the interface name to
 .Ar name .
 .It Cm rxcsum , txcsum , rxcsum6 , txcsum6
 If the driver supports user-configurable checksum offloading,
 enable receive (or transmit) checksum offloading on the interface.
 The feature can be turned on selectively per protocol family.
 Use
 .Cm rxcsum6 , txcsum6
 for
 .Xr ip6 4
 or
 .Cm rxcsum , txcsum
 otherwise.
 Some drivers may not be able to enable these flags independently
 of each other, so setting one may also set the other.
 The driver will offload as much checksum work as it can reliably
 support, the exact level of offloading varies between drivers.
 .It Fl rxcsum , txcsum , rxcsum6 , txcsum6
 If the driver supports user-configurable checksum offloading,
 disable receive (or transmit) checksum offloading on the interface.
 The feature can be turned off selectively per protocol family.
 Use
 .Fl rxcsum6 , txcsum6
 for
 .Xr ip6 4
 or
 .Fl rxcsum , txcsum
 otherwise.
 These settings may not always be independent of each other.
 .It Cm tso
 If the driver supports
 .Xr tcp 4
 segmentation offloading, enable TSO on the interface.
 Some drivers may not be able to support TSO for
 .Xr ip 4
 and
 .Xr ip6 4
 packets, so they may enable only one of them.
 .It Fl tso
 If the driver supports
 .Xr tcp 4
 segmentation offloading, disable TSO on the interface.
 It will always disable TSO for
 .Xr ip 4
 and
 .Xr ip6 4 .
 .It Cm tso6 , tso4
 If the driver supports
 .Xr tcp 4
 segmentation offloading for
 .Xr ip6 4
 or
 .Xr ip 4
 use one of these to selectively enable it only for one protocol family.
 .It Fl tso6 , tso4
 If the driver supports
 .Xr tcp 4
 segmentation offloading for
 .Xr ip6 4
 or
 .Xr ip 4
 use one of these to selectively disable it only for one protocol family.
 .It Cm lro
 If the driver supports
 .Xr tcp 4
 large receive offloading, enable LRO on the interface.
 .It Fl lro
 If the driver supports
 .Xr tcp 4
 large receive offloading, disable LRO on the interface.
 .It Cm txtls
 Transmit TLS offload encrypts Transport Layer Security (TLS) records and
 segments the encrypted record into one or more
 .Xr tcp 4
 segments over either
 .Xr ip 4
 or
 .Xr ip6 4 .
 If the driver supports transmit TLS offload,
 enable transmit TLS offload on the interface.
 Some drivers may not be able to support transmit TLS offload for
 .Xr ip 4
 and
 .Xr ip6 4
 packets, so they may enable only one of them.
 .It Fl txtls
 If the driver supports transmit TLS offload,
 disable transmit TLS offload on the interface.
 It will always disable TLS for
 .Xr ip 4
 and
 .Xr ip6 4 .
+.It Cm txtlsrtlmt
+Enable use of rate limiting (packet pacing) for TLS offload.
+.It Fl txtlsrtlmt
+Disable use of rate limiting for TLS offload.
 .It Cm nomap
 If the driver supports unmapped network buffers,
 enable them on the interface.
 .It Fl nomap
 If the driver supports unmapped network buffers,
 disable them on the interface.
 .It Cm wol , wol_ucast , wol_mcast , wol_magic
 Enable Wake On Lan (WOL) support, if available.
 WOL is a facility whereby a machine in a low power state may be woken
 in response to a received packet.
 There are three types of packets that may wake a system:
 ucast (directed solely to the machine's mac address),
 mcast (directed to a broadcast or multicast address),
 or
 magic (unicast or multicast frames with ``magic contents'').
 Not all devices support WOL; those that do indicate the mechanisms
 they support in their capabilities.
 .Cm wol
 is a synonym for enabling all available WOL mechanisms.
 To disable WOL use
 .Fl wol .
 .It Cm vlanmtu , vlanhwtag , vlanhwfilter , vlanhwcsum , vlanhwtso
 If the driver offers user-configurable VLAN support, enable
 reception of extended frames, tag processing in hardware,
 frame filtering in hardware, checksum offloading, or TSO on VLAN,
 respectively.
 Note that this must be configured on a physical interface associated with
 .Xr vlan 4 ,
 not on a
 .Xr vlan 4
 interface itself.
 .It Fl vlanmtu , vlanhwtag , vlanhwfilter , vlanhwtso
 If the driver offers user-configurable VLAN support, disable
 reception of extended frames, tag processing in hardware,
 frame filtering in hardware, or TSO on VLAN,
 respectively.
 .It Cm vxlanhwcsum , vxlanhwtso
 If the driver offers user-configurable VXLAN support, enable inner checksum
 offloading (receive and transmit) or TSO on VXLAN, respectively.
 Note that this must be configured on a physical interface associated with
 .Xr vxlan 4 ,
 not on a
 .Xr vxlan 4
 interface itself.
 The physical interface is either the interface specified as the vxlandev
 or the interface hosting the vxlanlocal address.
 The driver will offload as much checksum work and TSO as it can reliably
 support, the exact level of offloading may vary between drivers.
 .It Fl vxlanhwcsum , vxlanhwtso
 If the driver offers user-configurable VXLAN support, disable checksum
 offloading (receive and transmit) or TSO on VXLAN, respectively.
 .It Cm vnet Ar jail
 Move the interface to the
 .Xr jail 8 ,
 specified by name or JID.
 If the jail has a virtual network stack, the interface will disappear
 from the current environment and become visible to the jail.
 .It Fl vnet Ar jail
 Reclaim the interface from the
 .Xr jail 8 ,
 specified by name or JID.
 If the jail has a virtual network stack, the interface will disappear
 from the jail, and become visible to the current network environment.
 .It Cm polling
 Turn on
 .Xr polling 4
 feature and disable interrupts on the interface, if the driver supports
 this mode.
 .It Fl polling
 Turn off
 .Xr polling 4
 feature and enable interrupt mode on the interface.
 .It Cm create
 Create the specified network pseudo-device.
 If the interface is given without a unit number, try to create a new
 device with an arbitrary unit number.
 If creation of an arbitrary device is successful, the new device name is
 printed to standard output unless the interface is renamed or destroyed
 in the same
 .Nm
 invocation.
 .It Cm destroy
 Destroy the specified network pseudo-device.
 .It Cm plumb
 Another name for the
 .Cm create
 parameter.
 Included for
 .Tn Solaris
 compatibility.
 .It Cm unplumb
 Another name for the
 .Cm destroy
 parameter.
 Included for
 .Tn Solaris
 compatibility.
 .It Cm metric Ar n
 Set the routing metric of the interface to
 .Ar n ,
 default 0.
 The routing metric is used by the routing protocol
 .Pq Xr routed 8 .
 Higher metrics have the effect of making a route
 less favorable; metrics are counted as additional hops
 to the destination network or host.
 .It Cm mtu Ar n
 Set the maximum transmission unit of the interface to
 .Ar n ,
 default is interface specific.
 The MTU is used to limit the size of packets that are transmitted on an
 interface.
 Not all interfaces support setting the MTU, and some interfaces have
 range restrictions.
 .It Cm netmask Ar mask
 .\" (Inet and ISO.)
 (Inet only.)
 Specify how much of the address to reserve for subdividing
 networks into sub-networks.
 The mask includes the network part of the local address
 and the subnet part, which is taken from the host field of the address.
 The mask can be specified as a single hexadecimal number
 with a leading
 .Ql 0x ,
 with a dot-notation Internet address,
 or with a pseudo-network name listed in the network table
 .Xr networks 5 .
 The mask contains 1's for the bit positions in the 32-bit address
 which are to be used for the network and subnet parts,
 and 0's for the host part.
 The mask should contain at least the standard network portion,
 and the subnet field should be contiguous with the network
 portion.
 .Pp
 The netmask can also be specified in CIDR notation after the address.
 See the
 .Ar address
 option above for more information.
 .It Cm prefixlen Ar len
 (Inet6 only.)
 Specify that
 .Ar len
 bits are reserved for subdividing networks into sub-networks.
 The
 .Ar len
 must be an integer, and for syntactical reasons it must be between 0 and 128.
 It is almost always 64 under the current IPv6 assignment rule.
 If the parameter is omitted, 64 is used.
 .Pp
 The prefix can also be specified using the slash notation after the address.
 See the
 .Ar address
 option above for more information.
 .It Cm remove
 Another name for the
 .Fl alias
 parameter.
 Introduced for compatibility
 with
 .Bsx .
 .Sm off
 .It Cm link Op Cm 0 No - Cm 2
 .Sm on
 Enable special processing of the link level of the interface.
 These three options are interface specific in actual effect, however,
 they are in general used to select special modes of operation.
 An example
 of this is to enable SLIP compression, or to select the connector type
 for some Ethernet cards.
 Refer to the man page for the specific driver
 for more information.
 .Sm off
 .It Fl link Op Cm 0 No - Cm 2
 .Sm on
 Disable special processing at the link level with the specified interface.
 .It Cm monitor
 Put the interface in monitor mode.
 No packets are transmitted, and received packets are discarded after
 .Xr bpf 4
 processing.
 .It Fl monitor
 Take the interface out of monitor mode.
 .It Cm pcp Ar priority_code_point
 Priority code point
 .Pq Dv PCP
 is a 3-bit field which refers to the IEEE 802.1p
 class of service and maps to the frame priority level.
 .It Fl pcp
 Stop tagging packets on the interface with the priority code point.
 .It Cm up
 Mark an interface
 .Dq up .
 This may be used to enable an interface after an
 .Dq Nm Cm down .
 It happens automatically when setting the first address on an interface.
 If the interface was reset when previously marked down,
 the hardware will be re-initialized.
 .El
 .Pp
 The following parameters are for ICMPv6 Neighbor Discovery Protocol.
 Note that the address family keyword
 .Dq Li inet6
 is needed for them:
 .Bl -tag -width indent
 .It Cm accept_rtadv
 Set a flag to enable accepting ICMPv6 Router Advertisement messages.
 The
 .Xr sysctl 8
 variable
 .Va net.inet6.ip6.accept_rtadv
 controls whether this flag is set by default or not.
 .It Cm -accept_rtadv
 Clear a flag
 .Cm accept_rtadv .
 .It Cm no_radr
 Set a flag to control whether routers from which the system accepts
 Router Advertisement messages will be added to the Default Router List
 or not.
 When the
 .Cm accept_rtadv
 flag is disabled, this flag has no effect.
 The
 .Xr sysctl 8
 variable
 .Va net.inet6.ip6.no_radr
 controls whether this flag is set by default or not.
 .It Cm -no_radr
 Clear a flag
 .Cm no_radr .
 .It Cm auto_linklocal
 Set a flag to perform automatic link-local address configuration when
 the interface becomes available.
 The
 .Xr sysctl 8
 variable
 .Va net.inet6.ip6.auto_linklocal
 controls whether this flag is set by default or not.
 .It Cm -auto_linklocal
 Clear a flag
 .Cm auto_linklocal .
 .It Cm defaultif
 Set the specified interface as the default route when there is no
 default router.
 .It Cm -defaultif
 Clear a flag
 .Cm defaultif .
 .It Cm ifdisabled
 Set a flag to disable all of IPv6 network communications on the
 specified interface.
 Note that if there are already configured IPv6
 addresses on that interface, all of them are marked as
 .Dq tentative
 and DAD will be performed when this flag is cleared.
 .It Cm -ifdisabled
 Clear a flag
 .Cm ifdisabled .
 When this flag is cleared and
 .Cm auto_linklocal
 flag is enabled, automatic configuration of a link-local address is
 performed.
 .It Cm nud
 Set a flag to enable Neighbor Unreachability Detection.
 .It Cm -nud
 Clear a flag
 .Cm nud .
 .It Cm no_prefer_iface
 Set a flag to not honor rule 5 of source address selection in RFC 3484.
 In practice this means the address on the outgoing interface will not be
 preferred, effectively yielding the decision to the address selection
 policy table, configurable with
 .Xr ip6addrctl 8 .
 .It Cm -no_prefer_iface
 Clear a flag
 .Cm no_prefer_iface .
 .It Cm no_dad
 Set a flag to disable Duplicate Address Detection.
 .It Cm -no_dad
 Clear a flag
 .Cm no_dad .
 .El
 .Pp
 The following parameters are specific for IPv6 addresses.
 Note that the address family keyword
 .Dq Li inet6
 is needed for them:
 .Bl -tag -width indent
 .It Cm autoconf
 Set the IPv6 autoconfigured address bit.
 .It Fl autoconf
 Clear the IPv6 autoconfigured address bit.
 .It Cm deprecated
 Set the IPv6 deprecated address bit.
 .It Fl deprecated
 Clear the IPv6 deprecated address bit.
 .It Cm pltime Ar n
 Set preferred lifetime for the address.
 .It Cm prefer_source
 Set a flag to prefer address as a candidate of the source address for
 outgoing packets.
 .It Cm -prefer_source
 Clear a flag
 .Cm prefer_source .
 .It Cm vltime Ar n
 Set valid lifetime for the address.
 .El
 .Pp
 The following parameters are specific to cloning
 IEEE 802.11 wireless interfaces with the
 .Cm create
 request:
 .Bl -tag -width indent
 .It Cm wlandev Ar device
 Use
 .Ar device
 as the parent for the cloned device.
 .It Cm wlanmode Ar mode
 Specify the operating mode for this cloned device.
 .Ar mode
 is one of
 .Cm sta ,
 .Cm ahdemo
 (or
 .Cm adhoc-demo ) ,
 .Cm ibss
 (or
 .Cm adhoc ) ,
 .Cm ap
 (or
 .Cm hostap ) ,
 .Cm wds ,
 .Cm tdma ,
 .Cm mesh ,
 and
 .Cm monitor .
 The operating mode of a cloned interface cannot be changed.
 The
 .Cm tdma
 mode is actually implemented as an
 .Cm adhoc-demo
 interface with special properties.
 .It Cm wlanbssid Ar bssid
 The 802.11 mac address to use for the bssid.
 This must be specified at create time for a legacy
 .Cm wds
 device.
 .It Cm wlanaddr Ar address
 The local mac address.
 If this is not specified then a mac address will automatically be assigned
 to the cloned device.
 Typically this address is the same as the address of the parent device
 but if the
 .Cm bssid
 parameter is specified then the driver will craft a unique address for
 the device (if supported).
 .It Cm wdslegacy
 Mark a
 .Cm wds
 device as operating in ``legacy mode''.
 Legacy
 .Cm wds
 devices have a fixed peer relationship and do not, for example, roam
 if their peer stops communicating.
 For completeness a Dynamic WDS (DWDS) interface may be marked as
 .Fl wdslegacy .
 .It Cm bssid
 Request a unique local mac address for the cloned device.
 This is only possible if the device supports multiple mac addresses.
 To force use of the parent's mac address use
 .Fl bssid .
 .It Cm beacons
 Mark the cloned interface as depending on hardware support to
 track received beacons.
 To have beacons tracked in software use
 .Fl beacons .
 For
 .Cm hostap
 mode
 .Fl beacons
 can also be used to indicate no beacons should
 be transmitted; this can be useful when creating a WDS configuration but
 .Cm wds
 interfaces can only be created as companions to an access point.
 .El
 .Pp
 The following parameters are specific to IEEE 802.11 wireless interfaces
 cloned with a
 .Cm create
 operation:
 .Bl -tag -width indent
 .It Cm ampdu
 Enable sending and receiving AMPDU frames when using 802.11n (default).
 The 802.11n specification states a compliant station must be capable
 of receiving AMPDU frames but transmission is optional.
 Use
 .Fl ampdu
 to disable all use of AMPDU with 802.11n.
 For testing and/or to work around interoperability problems one can use
 .Cm ampdutx
 and
 .Cm ampdurx
 to control use of AMPDU in one direction.
 .It Cm ampdudensity Ar density
 Set the AMPDU density parameter used when operating with 802.11n.
 This parameter controls the inter-packet gap for AMPDU frames.
 The sending device normally controls this setting but a receiving station
 may request wider gaps.
 Legal values for
 .Ar density
 are 0, .25, .5, 1, 2, 4, 8, and 16 (microseconds).
 A value of
 .Cm -
 is treated the same as 0.
 .It Cm ampdulimit Ar limit
 Set the limit on packet size for receiving AMPDU frames when operating
 with 802.11n.
 Legal values for
 .Ar limit
 are 8192, 16384, 32768, and 65536 but one can also specify
 just the unique prefix: 8, 16, 32, 64.
 Note the sender may limit the size of AMPDU frames to be less
 than the maximum specified by the receiving station.
 .It Cm amsdu
 Enable sending and receiving AMSDU frames when using 802.11n.
 By default AMSDU is received but not transmitted.
 Use
 .Fl amsdu
 to disable all use of AMSDU with 802.11n.
 For testing and/or to work around interoperability problems one can use
 .Cm amsdutx
 and
 .Cm amsdurx
 to control use of AMSDU in one direction.
 .It Cm amsdulimit Ar limit
 Set the limit on packet size for sending and receiving AMSDU frames
 when operating with 802.11n.
 Legal values for
 .Ar limit
 are 7935 and 3839 (bytes).
 Note the sender may limit the size of AMSDU frames to be less
 than the maximum specified by the receiving station.
 Note also that devices are not required to support the 7935 limit,
 only 3839 is required by the specification and the larger value
 may require more memory to be dedicated to support functionality
 that is rarely used.
 .It Cm apbridge
 When operating as an access point, pass packets between
 wireless clients directly (default).
 To instead let them pass up through the
 system and be forwarded using some other mechanism, use
 .Fl apbridge .
 Disabling the internal bridging
 is useful when traffic is to be processed with
 packet filtering.
 .It Cm authmode Ar mode
 Set the desired authentication mode in infrastructure mode.
 Not all adapters support all modes.
 The set of
 valid modes is
 .Cm none , open , shared
 (shared key),
 .Cm 8021x
 (IEEE 802.1x),
 and
 .Cm wpa
 (IEEE WPA/WPA2/802.11i).
 The
 .Cm 8021x
 and
 .Cm wpa
 modes are only useful when using an authentication service
 (a supplicant for client operation or an authenticator when
 operating as an access point).
 Modes are case insensitive.
 .It Cm bgscan
 Enable background scanning when operating as a station.
 Background scanning is a technique whereby a station associated to
 an access point will temporarily leave the channel to scan for
 neighboring stations.
 This allows a station to maintain a cache of nearby access points
 so that roaming between access points can be done without
 a lengthy scan operation.
 Background scanning is done only when a station is not busy and
 any outbound traffic will cancel a scan operation.
 Background scanning should never cause packets to be lost though
 there may be some small latency if outbound traffic interrupts a
 scan operation.
 By default background scanning is enabled if the device is capable.
 To disable background scanning, use
 .Fl bgscan .
 Background scanning is controlled by the
 .Cm bgscanidle
 and
 .Cm bgscanintvl
 parameters.
 Background scanning must be enabled for roaming; this is an artifact
 of the current implementation and may not be required in the future.
 .It Cm bgscanidle Ar idletime
 Set the minimum time a station must be idle (not transmitting or
 receiving frames) before a background scan is initiated.
 The
 .Ar idletime
 parameter is specified in milliseconds.
 By default a station must be idle at least 250 milliseconds before
 a background scan is initiated.
 The idle time may not be set to less than 100 milliseconds.
 .It Cm bgscanintvl Ar interval
 Set the interval at which background scanning is attempted.
 The
 .Ar interval
 parameter is specified in seconds.
 By default a background scan is considered every 300 seconds (5 minutes).
 The
 .Ar interval
 may not be set to less than 15 seconds.
 .It Cm bintval Ar interval
 Set the interval at which beacon frames are sent when operating in
 ad-hoc or ap mode.
 The
 .Ar interval
 parameter is specified in TU's (1024 usecs).
 By default beacon frames are transmitted every 100 TU's.
 .It Cm bmissthreshold Ar count
 Set the number of consecutive missed beacons at which the station
 will attempt to roam (i.e., search for a new access point).
 The
 .Ar count
 parameter must be in the range 1 to 255; though the
 upper bound may be reduced according to device capabilities.
 The default threshold is 7 consecutive missed beacons; but
 this may be overridden by the device driver.
 Another name for the
 .Cm bmissthreshold
 parameter is
 .Cm bmiss .
 .It Cm bssid Ar address
 Specify the MAC address of the access point to use when operating
 as a station in a BSS network.
 This overrides any automatic selection done by the system.
 To disable a previously selected access point, supply
 .Cm any , none ,
 or
 .Cm -
 for the address.
 This option is useful when more than one access point uses the same SSID.
 Another name for the
 .Cm bssid
 parameter is
 .Cm ap .
 .It Cm burst
 Enable packet bursting.
 Packet bursting is a transmission technique whereby the wireless
 medium is acquired once to send multiple frames and the interframe
 spacing is reduced.
 This technique can significantly increase throughput by reducing
 transmission overhead.
 Packet bursting is supported by the 802.11e QoS specification
 and some devices that do not support QoS may still be capable.
 By default packet bursting is enabled if a device is capable
 of doing it.
 To disable packet bursting, use
 .Fl burst .
 .It Cm chanlist Ar channels
 Set the desired channels to use when scanning for access
 points, neighbors in an IBSS network, or looking for unoccupied
 channels when operating as an access point.
 The set of channels is specified as a comma-separated list with
 each element in the list representing either a single channel number or a range
 of the form
 .Dq Li a-b .
 Channel numbers must be in the range 1 to 255 and be permissible
 according to the operating characteristics of the device.
 .It Cm channel Ar number
 Set a single desired channel.
 Channels range from 1 to 255, but the exact selection available
 depends on the region your adaptor was manufactured for.
 Setting
 the channel to
 .Li any ,
 or
 .Cm -
 will clear any desired channel and, if the device is marked up,
 force a scan for a channel to operate on.
 Alternatively the frequency, in megahertz, may be specified
 instead of the channel number.
 .Pp
 When there are several ways to use a channel the channel
 number/frequency may be appended with attributes to clarify.
 For example, if a device is capable of operating on channel 6
 with 802.11n and 802.11g then one can specify that g-only use
 should be used by specifying ``6:g''.
 Similarly the channel width can be specified by appending it
 with ``/''; e.g., ``6/40'' specifies a 40MHz wide channel.
 These attributes can be combined as in: ``6:ht/40''.
 The full set of flags specified following a ``:'' are:
 .Cm a
 (802.11a),
 .Cm b
 (802.11b),
 .Cm d
 (Atheros Dynamic Turbo mode),
 .Cm g
 (802.11g),
 .Cm h
 or
 .Cm n
 (802.11n aka HT),
 .Cm s
 (Atheros Static Turbo mode),
 and
 .Cm t
 (Atheros Dynamic Turbo mode, or appended to ``st'' and ``dt'').
 The full set of channel widths following a '/' are:
 .Cm 5
 (5MHz aka quarter-rate channel),
 .Cm 10
 (10MHz aka half-rate channel),
 .Cm 20
 (20MHz mostly for use in specifying ht20),
 and
 .Cm 40
 (40MHz mostly for use in specifying ht40).
 In addition,
 a 40MHz HT channel specification may include the location
 of the extension channel by appending ``+'' or ``-'' for above and below,
 respectively; e.g., ``2437:ht/40+'' specifies 40MHz wide HT operation
 with the center channel at frequency 2437 and the extension channel above.
 .It Cm country Ar name
 Set the country code to use in calculating the regulatory constraints
 for operation.
 In particular the set of available channels, how the wireless device
 will operate on the channels, and the maximum transmit power that
 can be used on a channel are defined by this setting.
 Country/Region codes are specified as a 2-character abbreviation
 defined by ISO 3166 or using a longer, but possibly ambiguous, spelling;
 e.g., "ES" and "Spain".
 The set of country codes is taken from
 .Pa /etc/regdomain.xml
 and can also
 be viewed with the ``list countries'' request.
 Note that not all devices support changing the country code from a default
 setting; typically stored in EEPROM.
 See also
 .Cm regdomain ,
 .Cm indoor ,
 .Cm outdoor ,
 and
 .Cm anywhere .
 .It Cm dfs
 Enable Dynamic Frequency Selection (DFS) as specified in 802.11h.
 DFS embodies several facilities including detection of overlapping
 radar signals, dynamic transmit power control, and channel selection
 according to a least-congested criterion.
 DFS support is mandatory for some 5GHz frequencies in certain
 locales (e.g., ETSI).
 By default DFS is enabled according to the regulatory definitions
 specified in
 .Pa /etc/regdomain.xml
 and the current country code, regdomain,
 and channel.
 Note the underlying device (and driver) must support radar detection
 for full DFS support to work.
 To be fully compliant with the local regulatory agency, frequencies that
 require DFS should not be used unless it is fully supported.
 Use
 .Fl dfs
 to disable this functionality for testing.
 .It Cm dotd
 Enable support for the 802.11d specification (default).
 When this support is enabled in station mode, beacon frames that advertise
 a country code different than the currently configured country code will
 cause an event to be dispatched to user applications.
 This event can be used by the station to adopt that country code and
 operate according to the associated regulatory constraints.
 When operating as an access point with 802.11d enabled the beacon and
 probe response frames transmitted will advertise the current regulatory
 domain settings.
 To disable 802.11d use
 .Fl dotd .
 .It Cm doth
 Enable 802.11h support including spectrum management.
 When 802.11h is enabled beacon and probe response frames will have
 the SpectrumMgt bit set in the capabilities field and
 country and power constraint information elements will be present.
 802.11h support also includes handling Channel Switch Announcements (CSA)
 which are a mechanism to coordinate channel changes by an access point.
 By default 802.11h is enabled if the device is capable.
 To disable 802.11h use
 .Fl doth .
 .It Cm deftxkey Ar index
 Set the default key to use for transmission.
 Typically this is only set when using WEP encryption.
 Note that you must set a default transmit key
 for the system to know which key to use in encrypting outbound traffic.
 The
 .Cm weptxkey
 is an alias for this request; it is provided for backwards compatibility.
 .It Cm dtimperiod Ar period
 Set the
 DTIM
 period for transmitting buffered multicast data frames when
 operating in ap mode.
 The
 .Ar period
 specifies the number of beacon intervals between DTIM
 and must be in the range 1 to 15.
 By default DTIM is 1 (i.e., DTIM occurs at each beacon).
 .It Cm quiet
 Enable the use of quiet IE.
 Hostap will use this to silence other
 stations to reduce interference for radar detection when
 operating on 5GHz frequency and doth support is enabled.
 Use
 .Fl quiet
 to disable this functionality.
 .It Cm quiet_period Ar period
 Set the QUIET
 .Ar period
 to the number of beacon intervals between the start of regularly
 scheduled quiet intervals defined by Quiet element.
 .It Cm quiet_count Ar count
 Set the QUIET
 .Ar count
 to the number of TBTTs until the beacon interval during which the
 next quiet interval shall start.
 A value of 1 indicates the quiet
 interval will start during the beacon interval starting at the next
 TBTT.
 A value of 0 is reserved.
 .It Cm quiet_offset Ar offset
 Set the QUIET
 .Ar offset
 to the offset of the start of the quiet interval from the TBTT
 specified by the Quiet count, expressed in TUs.
 The value of the
 .Ar offset
 shall be less than one beacon interval.
 .It Cm quiet_duration Ar dur
 Set the QUIET
 .Ar dur
 to the duration of the Quiet interval, expressed in TUs.
 The value should be less than beacon interval.
 .It Cm dturbo
 Enable the use of Atheros Dynamic Turbo mode when communicating with
 another Dynamic Turbo-capable station.
 Dynamic Turbo mode is an Atheros-specific mechanism by which
 stations switch between normal 802.11 operation and a ``boosted''
 mode in which a 40MHz wide channel is used for communication.
 Stations using Dynamic Turbo mode operate boosted only when the
 channel is free of non-dturbo stations; when a non-dturbo station
 is identified on the channel all stations will automatically drop
 back to normal operation.
 By default, Dynamic Turbo mode is not enabled, even if the device is capable.
 Note that turbo mode (dynamic or static) is only allowed on some
 channels depending on the regulatory constraints; use the
 .Cm list chan
 command to identify the channels where turbo mode may be used.
 To disable Dynamic Turbo mode use
 .Fl dturbo .
 .It Cm dwds
 Enable Dynamic WDS (DWDS) support.
 DWDS is a facility by which 4-address traffic can be carried between
 stations operating in infrastructure mode.
 A station first associates to an access point and authenticates using
 normal procedures (e.g., WPA).
 Then 4-address frames are passed to carry traffic for stations
 operating on either side of the wireless link.
 DWDS extends the normal WDS mechanism by leveraging existing security
 protocols and eliminating static binding.
 .Pp
 When DWDS is enabled on an access point 4-address frames received from
 an authorized station will generate a ``DWDS discovery'' event to user
 applications.
 This event should be used to create a WDS interface that is bound
 to the remote station (and usually plumbed into a bridge).
 Once the WDS interface is up and running 4-address traffic then logically
 flows through that interface.
 .Pp
 When DWDS is enabled on a station, traffic with a destination address
 different from the peer station is encapsulated in a 4-address frame
 and transmitted to the peer.
 All 4-address traffic uses the security information of the stations
 (e.g., cryptographic keys).
 A station that is associated using 802.11n facilities may transport
 4-address traffic using these same mechanisms; this depends on available
 resources and capabilities of the device.
 The DWDS implementation guards against layer 2 routing loops of
 multicast traffic.
 .It Cm ff
 Enable the use of Atheros Fast Frames when communicating with
 another Fast Frames-capable station.
 Fast Frames are an encapsulation technique by which two 802.3
 frames are transmitted in a single 802.11 frame.
 This can noticeably improve throughput but requires that the
 receiving station understand how to decapsulate the frame.
 Fast frame use is negotiated using the Atheros 802.11 vendor-specific
 protocol extension so enabling use is safe when communicating with
 non-Atheros devices.
 By default, use of fast frames is enabled if the device is capable.
 To explicitly disable fast frames, use
 .Fl ff .
 .It Cm fragthreshold Ar length
 Set the threshold for which transmitted frames are broken into fragments.
 The
 .Ar length
 argument is the frame size in bytes and must be in the range 256 to 2346.
 Setting
 .Ar length
 to
 .Li 2346 ,
 .Cm any ,
 or
 .Cm -
 disables transmit fragmentation.
 Not all adapters honor the fragmentation threshold.
 .It Cm hidessid
 When operating as an access point, do not broadcast the SSID
 in beacon frames or respond to probe request frames unless
 they are directed to the ap (i.e., they include the ap's SSID).
 By default, the SSID is included in beacon frames and
 undirected probe request frames are answered.
 To re-enable the broadcast of the SSID etc., use
 .Fl hidessid .
 .It Cm ht
 Enable use of High Throughput (HT) when using 802.11n (default).
 The 802.11n specification includes mechanisms for operation
 on 20MHz and 40MHz wide channels using different signalling mechanisms
 than specified in 802.11b, 802.11g, and 802.11a.
 Stations negotiate use of these facilities, termed HT20 and HT40,
 when they associate.
 To disable all use of 802.11n use
 .Fl ht .
 To disable use of HT20 (e.g., to force only HT40 use) use
 .Fl ht20 .
 To disable use of HT40 use
 .Fl ht40 .
 .Pp
 HT configuration is used to ``auto promote'' operation
 when several choices are available.
 For example, if a station associates to an 11n-capable access point
 it controls whether the station uses legacy operation, HT20, or HT40.
 When an 11n-capable device is setup as an access point and
 Auto Channel Selection is used to locate a channel to operate on,
 HT configuration controls whether legacy, HT20, or HT40 operation is setup
 on the selected channel.
 If a fixed channel is specified for a station then HT configuration can
 be given as part of the channel specification; e.g., 6:ht/20 to setup
 HT20 operation on channel 6.
 .It Cm htcompat
 Enable use of compatibility support for pre-802.11n devices (default).
 The 802.11n protocol specification went through several incompatible iterations.
 Some vendors implemented 11n support to older specifications that
 will not interoperate with a purely 11n-compliant station.
 In particular the information elements included in management frames
 for old devices are different.
 When compatibility support is enabled both standard and compatible data
 will be provided.
 Stations that associate using the compatibility mechanisms are flagged
 in ``list sta''.
 To disable compatibility support use
 .Fl htcompat .
 .It Cm htprotmode Ar technique
 For interfaces operating in 802.11n, use the specified
 .Ar technique
 for protecting HT frames in a mixed legacy/HT network.
 The set of valid techniques is
 .Cm off ,
 and
 .Cm rts
 (RTS/CTS, default).
 Technique names are case insensitive.
 .It Cm inact
 Enable inactivity processing for stations associated to an
 access point (default).
 When operating as an access point the 802.11 layer monitors
 the activity of each associated station.
 When a station is inactive for 5 minutes it will send several
 ``probe frames'' to see if the station is still present.
 If no response is received then the station is deauthenticated.
 Applications that prefer to handle this work can disable this
 facility by using
 .Fl inact .
 .It Cm indoor
 Set the location to use in calculating regulatory constraints.
 The location is also advertised in beacon and probe response frames
 when 802.11d is enabled with
 .Cm dotd .
 See also
 .Cm outdoor ,
 .Cm anywhere ,
 .Cm country ,
 and
 .Cm regdomain .
 .It Cm list active
 Display the list of channels available for use taking into account
 any restrictions set with the
 .Cm chanlist
 directive.
 See the description of
 .Cm list chan
 for more information.
 .It Cm list caps
 Display the adaptor's capabilities, including the operating
 modes supported.
 .It Cm list chan
 Display the list of channels available for use.
 Channels are shown with their IEEE channel number, equivalent
 frequency, and usage modes.
 Channels identified as
 .Ql 11g
 are also usable in
 .Ql 11b
 mode.
 Channels identified as
 .Ql 11a Turbo
 may be used only for Atheros' Static Turbo mode
 (specified with
 .Cm mediaopt turbo ) .
 Channels marked with a
 .Ql *
 have a regulatory constraint that they be passively scanned.
 This means a station is not permitted to transmit on the channel until
 it identifies the channel is being used for 802.11 communication;
 typically by hearing a beacon frame from an access point operating
 on the channel.
 .Cm list freq
 is another way of requesting this information.
 By default a compacted list of channels is displayed; if the
 .Fl v
 option is specified then all channels are shown.
 .It Cm list countries
 Display the set of country codes and regulatory domains that can be
 used in regulatory configuration.
 .It Cm list mac
 Display the current MAC Access Control List state.
 Each address is prefixed with a character that indicates the
 current policy applied to it:
 .Ql +
 indicates the address is allowed access,
 .Ql -
 indicates the address is denied access,
 .Ql *
 indicates the address is present but the current policy is open
 (so the ACL is not consulted).
 .It Cm list mesh
 Displays the mesh routing table, used for forwarding packets on a mesh
 network.
 .It Cm list regdomain
 Display the current regulatory settings including the available channels
 and transmit power caps.
 .It Cm list roam
 Display the parameters that govern roaming operation.
 .It Cm list txparam
 Display the parameters that govern transmit operation.
 .It Cm list txpower
 Display the transmit power caps for each channel.
 .It Cm list scan
 Display the access points and/or ad-hoc neighbors
 located in the vicinity.
 This information may be updated automatically by the adapter
 with a
 .Cm scan
 request or through background scanning.
 Depending on the capabilities of the stations the following
 flags can be included in the output:
 .Bl -tag -width 3n
 .It Li A
 Channel agility.
 .It Li B
 PBCC modulation.
 .It Li C
 Poll request capability.
 .It Li D
 DSSS/OFDM capability.
 .It Li E
 Extended Service Set (ESS).
 .It Li I
 Independent Basic Service Set (IBSS).
 .It Li P
 Privacy capability.
 The station requires authentication.
 .It Li R
 Robust Secure Network (RSN).
 .It Li S
 Short Preamble.
 Indicates that the station is doing short preamble to optionally
 improve throughput performance with 802.11g and 802.11b.
 .It Li c
 Pollable capability.
 .It Li s
 Short slot time capability.
 .El
 .Pp
 By default interesting information elements captured from the neighboring
 stations are displayed at the end of each row.
 Possible elements include:
 .Cm WME
 (station supports WME),
 .Cm WPA
 (station supports WPA),
 .Cm WPS
 (station supports WPS),
 .Cm RSN
 (station supports 802.11i/RSN),
 .Cm HTCAP
 (station supports 802.11n/HT communication),
 .Cm ATH
 (station supports Atheros protocol extensions),
 .Cm VEN
 (station supports unknown vendor-specific extensions).
 If the
 .Fl v
 flag is used all the information elements and their
 contents will be shown.
 Specifying the
 .Fl v
 flag also enables display of long SSIDs.
 The
 .Cm list ap
 command is another way of requesting this information.
 .It Cm list sta
 When operating as an access point display the stations that are
 currently associated.
 When operating in ad-hoc mode display stations identified as
 neighbors in the IBSS.
 When operating in mesh mode display stations identified as
 neighbors in the MBSS.
 When operating in station mode display the access point.
 Capabilities advertised by the stations are described under
 the
 .Cm scan
 request.
 The following flags can be included in the output:
 .Bl -tag -width 3n
 .It Li A
 Authorized.
 Indicates that the station is permitted to send/receive data frames.
 .It Li E
 Extended Rate Phy (ERP).
 Indicates that the station is operating in an 802.11g network
 using extended transmit rates.
 .It Li H
 High Throughput (HT).
 Indicates that the station is using HT transmit rates.
 If a
 .Sq Li +
 follows immediately after then the station associated
 using deprecated mechanisms supported only when
 .Cm htcompat
 is enabled.
 .It Li P
 Power Save.
 Indicates that the station is operating in power save mode.
 .It Li Q
 Quality of Service (QoS).
 Indicates that the station is using QoS encapsulation for
 data frames.
 QoS encapsulation is enabled only when WME mode is enabled.
 .It Li S
 Short GI in HT 40MHz mode enabled.
 If a
 .Sq Li +
 follows immediately after then short GI in HT 20MHz mode is enabled as well.
 .It Li T
 Transitional Security Network (TSN).
 Indicates that the station associated using TSN; see also
 .Cm tsn
 below.
 .It Li W
 Wi-Fi Protected Setup (WPS).
 Indicates that the station associated using WPS.
 .It Li s
 Short GI in HT 20MHz mode enabled.
 .El
 .Pp
 By default information elements received from associated stations
 are displayed in a short form; the
 .Fl v
 flag causes this information to be displayed symbolically.
 .It Cm list wme
 Display the current channel parameters to use when operating in WME mode.
 If the
 .Fl v
 option is specified then both channel and BSS parameters are displayed
 for each AC (first channel, then BSS).
 When WME mode is enabled for an adaptor this information will be
 displayed with the regular status; this command is mostly useful
 for examining parameters when WME mode is disabled.
 See the description of the
 .Cm wme
 directive for information on the various parameters.
 .It Cm maxretry Ar count
 Set the maximum number of tries to use in sending unicast frames.
 The default setting is 6 but drivers may override this with a value
 they choose.
 .It Cm mcastrate Ar rate
 Set the rate for transmitting multicast/broadcast frames.
 Rates are specified as megabits/second in decimal; e.g.,\& 5.5 for 5.5 Mb/s.
 This rate should be valid for the current operating conditions;
 if an invalid rate is specified drivers are free to choose an
 appropriate rate.
 .It Cm mgtrate Ar rate
 Set the rate for transmitting management and/or control frames.
 Rates are specified as megabits/second in decimal; e.g.,\& 5.5 for 5.5 Mb/s.
 .It Cm outdoor
 Set the location to use in calculating regulatory constraints.
 The location is also advertised in beacon and probe response frames
 when 802.11d is enabled with
 .Cm dotd .
 See also
 .Cm anywhere ,
 .Cm country ,
 .Cm indoor ,
 and
 .Cm regdomain .
 .It Cm powersave
 Enable powersave operation.
 When operating as a client, the station will conserve power by
 periodically turning off the radio and listening for
 messages from the access point telling it there are packets waiting.
 The station must then retrieve the packets.
 Not all devices support power save operation as a client.
 The 802.11 specification requires that all access points support
 power save but some drivers do not.
 Use
 .Fl powersave
 to disable powersave operation when operating as a client.
 .It Cm powersavesleep Ar sleep
 Set the desired max powersave sleep time in TU's (1024 usecs).
 By default the max powersave sleep time is 100 TU's.
 .It Cm protmode Ar technique
 For interfaces operating in 802.11g, use the specified
 .Ar technique
 for protecting OFDM frames in a mixed 11b/11g network.
 The set of valid techniques is
 .Cm off , cts
 (CTS to self),
 and
 .Cm rtscts
 (RTS/CTS).
 Technique names are case insensitive.
 Not all devices support
 .Cm cts
 as a protection technique.
 .It Cm pureg
 When operating as an access point in 802.11g mode allow only
 11g-capable stations to associate (11b-only stations are not
 permitted to associate).
 To allow both 11g and 11b-only stations to associate, use
 .Fl pureg .
 .It Cm puren
 When operating as an access point in 802.11n mode allow only
 HT-capable stations to associate (legacy stations are not
 permitted to associate).
 To allow both HT and legacy stations to associate, use
 .Fl puren .
 .It Cm regdomain Ar sku
 Set the regulatory domain to use in calculating the regulatory constraints
 for operation.
 In particular the set of available channels, how the wireless device
 will operate on the channels, and the maximum transmit power that
 can be used on a channel are defined by this setting.
 Regdomain codes (SKU's) are taken from
 .Pa /etc/regdomain.xml
 and can also
 be viewed with the ``list countries'' request.
 Note that not all devices support changing the regdomain from a default
 setting; typically stored in EEPROM.
 See also
 .Cm country ,
 .Cm indoor ,
 .Cm outdoor ,
 and
 .Cm anywhere .
 .It Cm rifs
 Enable use of Reduced InterFrame Spacing (RIFS) when operating in 802.11n
 on an HT channel.
 Note that RIFS must be supported by both the station and access point
 for it to be used.
 To disable RIFS use
 .Fl rifs .
 .It Cm roam:rate Ar rate
 Set the threshold for controlling roaming when operating in a BSS.
 The
 .Ar rate
 parameter specifies the transmit rate in megabits
 at which roaming should be considered.
 If the current transmit rate drops below this setting and background scanning
 is enabled, then the system will check if a more desirable access point is
 available and switch over to it.
 The current scan cache contents are used if they are considered
 valid according to the
 .Cm scanvalid
 parameter; otherwise a background scan operation is triggered before
 any selection occurs.
 Each channel type has a separate rate threshold; the default values are:
 12 Mb/s (11a), 2 Mb/s (11b), 2 Mb/s (11g), MCS 1 (11na, 11ng).
 .It Cm roam:rssi Ar rssi
 Set the threshold for controlling roaming when operating in a BSS.
 The
 .Ar rssi
 parameter specifies the receive signal strength in dBm units
 at which roaming should be considered.
 If the current rssi drops below this setting and background scanning
 is enabled, then the system will check if a more desirable access point is
 available and switch over to it.
 The current scan cache contents are used if they are considered
 valid according to the
 .Cm scanvalid
 parameter; otherwise a background scan operation is triggered before
 any selection occurs.
 Each channel type has a separate rssi threshold; the default values are
 all 7 dBm.
 .It Cm roaming Ar mode
 When operating as a station, control how the system will
 behave when communication with the current access point
 is broken.
 The
 .Ar mode
 argument may be one of
 .Cm device
 (leave it to the hardware device to decide),
 .Cm auto
 (handle either in the device or the operating system\[em]as appropriate),
 .Cm manual
 (do nothing until explicitly instructed).
 By default, the device is left to handle this if it is
 capable; otherwise, the operating system will automatically
 attempt to reestablish communication.
 Manual mode is used by applications such as
 .Xr wpa_supplicant 8
 that want to control the selection of an access point.
 .It Cm rtsthreshold Ar length
 Set the threshold for which
 transmitted frames are preceded by transmission of an
 RTS
 control frame.
 The
 .Ar length
 argument
 is the frame size in bytes and must be in the range 1 to 2346.
 Setting
 .Ar length
 to
 .Li 2346 ,
 .Cm any ,
 or
 .Cm -
 disables transmission of RTS frames.
 Not all adapters support setting the RTS threshold.
 .It Cm scan
 Initiate a scan of neighboring stations, wait for it to complete, and
 display all stations found.
 Only the super-user can initiate a scan.
 See
 .Cm list scan
 for information on the display.
 By default a background scan is done; otherwise a foreground
 scan is done and the station may roam to a different access point.
 The
 .Cm list scan
 request can be used to show recent scan results without
 initiating a new scan.
 .It Cm scanvalid Ar threshold
 Set the maximum time the scan cache contents are considered valid;
 i.e., will be used without first triggering a scan operation to
 refresh the data.
 The
 .Ar threshold
 parameter is specified in seconds and defaults to 60 seconds.
 The minimum setting for
 .Ar threshold
 is 10 seconds.
 One should take care setting this threshold; if it is set too low
 then attempts to roam to another access point may trigger unnecessary
 background scan operations.
 .It Cm shortgi
 Enable use of Short Guard Interval when operating in 802.11n
 on an HT channel.
 NB: this currently enables Short GI on both HT40 and HT20 channels.
 To disable Short GI use
 .Fl shortgi .
 .It Cm smps
 Enable use of Static Spatial Multiplexing Power Save (SMPS)
 when operating in 802.11n.
 A station operating with Static SMPS maintains only a single
 receive chain active (this can significantly reduce power consumption).
 To disable SMPS use
 .Fl smps .
 .It Cm smpsdyn
 Enable use of Dynamic Spatial Multiplexing Power Save (SMPS)
 when operating in 802.11n.
 A station operating with Dynamic SMPS maintains only a single
 receive chain active but switches to multiple receive chains when it
 receives an RTS frame (this can significantly reduce power consumption).
 Note that stations cannot distinguish between RTS/CTS intended to
 enable multiple receive chains and those used for other purposes.
 To disable SMPS use
 .Fl smps .
 .It Cm ssid Ar ssid
 Set the desired Service Set Identifier (aka network name).
 The SSID is a string up to 32 characters
 in length and may be specified as either a normal string or in
 hexadecimal when preceded by
 .Ql 0x .
 Additionally, the SSID may be cleared by setting it to
 .Ql - .
 .It Cm tdmaslot Ar slot
 When operating with TDMA, use the specified
 .Ar slot
 configuration.
 The
 .Ar slot
 is a number between 0 and the maximum number of slots in the BSS.
 Note that a station configured as slot 0 is a master and
 will broadcast beacon frames advertising the BSS;
 stations configured to use other slots will always
 scan to locate a master before they ever transmit.
 By default
 .Cm tdmaslot
 is set to 1.
 .It Cm tdmaslotcnt Ar cnt
 When operating with TDMA, setup a BSS with
 .Ar cnt
 slots.
 The slot count may be at most 8.
 The current implementation is only tested with two stations
 (i.e., point to point applications).
 This setting is only meaningful when a station is configured as slot 0;
 other stations adopt this setting from the BSS they join.
 By default
 .Cm tdmaslotcnt
 is set to 2.
 .It Cm tdmaslotlen Ar len
 When operating with TDMA, setup a BSS such that each station has a slot
 .Ar len
 microseconds long.
 The slot length must be at least 150 microseconds (1/8 TU)
 and no more than 65 milliseconds.
 Note that setting too small a slot length may result in poor channel
 bandwidth utilization due to factors such as timer granularity and
 guard time.
 This setting is only meaningful when a station is configured as slot 0;
 other stations adopt this setting from the BSS they join.
 By default
 .Cm tdmaslotlen
 is set to 10 milliseconds.
 .It Cm tdmabintval Ar intval
 When operating with TDMA, setup a BSS such that beacons are transmitted every
 .Ar intval
 superframes to synchronize the TDMA slot timing.
 A superframe is defined as the number of slots times the slot length; e.g.,
 a BSS with two slots of 10 milliseconds has a 20 millisecond superframe.
 The beacon interval may not be zero.
 A lower setting of
 .Cm tdmabintval
 causes the timers to be resynchronized more often; this can help if
 significant timer drift is observed.
 By default
 .Cm tdmabintval
 is set to 5.
 .It Cm tsn
 When operating as an access point with WPA/802.11i allow legacy
 stations to associate using static key WEP and open authentication.
 To disallow legacy station use of WEP, use
 .Fl tsn .
 .It Cm txpower Ar power
 Set the power used to transmit frames.
 The
 .Ar power
 argument is specified in .5 dBm units.
 Out of range values are truncated.
 Typically only a few discrete power settings are available and
 the driver will use the setting closest to the specified value.
 Not all adapters support changing the transmit power.
 .It Cm ucastrate Ar rate
 Set a fixed rate for transmitting unicast frames.
 Rates are specified as megabits/second in decimal; e.g.,\& 5.5 for 5.5 Mb/s.
 This rate should be valid for the current operating conditions;
 if an invalid rate is specified drivers are free to choose an
 appropriate rate.
 .It Cm wepmode Ar mode
 Set the desired WEP mode.
 Not all adapters support all modes.
 The set of valid modes is
 .Cm off , on ,
 and
 .Cm mixed .
 The
 .Cm mixed
 mode explicitly tells the adaptor to allow association with access
 points which allow both encrypted and unencrypted traffic.
 On these adapters,
 .Cm on
 means that the access point must only allow encrypted connections.
 On other adapters,
 .Cm on
 is generally another name for
 .Cm mixed .
 Modes are case insensitive.
 .It Cm weptxkey Ar index
 Set the WEP key to be used for transmission.
 This is the same as setting the default transmission key with
 .Cm deftxkey .
 .It Cm wepkey Ar key Ns | Ns Ar index : Ns Ar key
 Set the selected WEP key.
 If an
 .Ar index
 is not given, key 1 is set.
 A WEP key will be either 5 or 13
 characters (40 or 104 bits) depending on the local network and the
 capabilities of the adaptor.
 It may be specified either as a plain
 string or as a string of hexadecimal digits preceded by
 .Ql 0x .
 For maximum portability, hex keys are recommended;
 the mapping of text keys to WEP encryption is usually driver-specific.
 In particular, the
 .Tn Windows
 drivers do this mapping differently to
 .Fx .
 A key may be cleared by setting it to
 .Ql - .
 If WEP is supported then there are at least four keys.
 Some adapters support more than four keys.
 If that is the case, then the first four keys
 (1-4) will be the standard temporary keys and any others will be adaptor
 specific keys such as permanent keys stored in NVRAM.
 .Pp
 Note that you must set a default transmit key with
 .Cm deftxkey
 for the system to know which key to use in encrypting outbound traffic.
 .It Cm wme
 Enable Wireless Multimedia Extensions (WME) support, if available,
 for the specified interface.
 WME is a subset of the IEEE 802.11e standard to support the
 efficient communication of realtime and multimedia data.
 To disable WME support, use
 .Fl wme .
 Another name for this parameter is
 .Cm wmm .
 .Pp
 The following parameters are meaningful only when WME support is in use.
 Parameters are specified per-AC (Access Category) and
 split into those that are used by a station when acting
 as an access point and those for client stations in the BSS.
 The latter are received from the access point and may not be changed
 (at the station).
 The following Access Categories are recognized:
 .Pp
 .Bl -tag -width ".Cm AC_BK" -compact
 .It Cm AC_BE
 (or
 .Cm BE )
 best effort delivery,
 .It Cm AC_BK
 (or
 .Cm BK )
 background traffic,
 .It Cm AC_VI
 (or
 .Cm VI )
 video traffic,
 .It Cm AC_VO
 (or
 .Cm VO )
 voice traffic.
 .El
 .Pp
 AC parameters are case-insensitive.
 Traffic classification is done in the operating system using the
 vlan priority associated with data frames or the
 ToS (Type of Service) indication in IP-encapsulated frames.
 If neither information is present, traffic is assigned to the
 Best Effort (BE) category.
 .Bl -tag -width indent
 .It Cm ack Ar ac
 Set the ACK policy for QoS transmissions by the local station;
 this controls whether or not data frames transmitted by a station
 require an ACK response from the receiving station.
 To disable waiting for an ACK use
 .Fl ack .
 This parameter is applied only to the local station.
 .It Cm acm Ar ac
 Enable the Admission Control Mandatory (ACM) mechanism
 for transmissions by the local station.
 To disable the ACM use
 .Fl acm .
 On stations in a BSS this parameter is read-only and indicates
 the setting received from the access point.
 NB: ACM is not supported right now.
 .It Cm aifs Ar ac Ar count
 Set the Arbitration Inter Frame Spacing (AIFS)
 channel access parameter to use for transmissions
 by the local station.
 On stations in a BSS this parameter is read-only and indicates
 the setting received from the access point.
 .It Cm cwmin Ar ac Ar count
 Set the CWmin channel access parameter to use for transmissions
 by the local station.
 On stations in a BSS this parameter is read-only and indicates
 the setting received from the access point.
 .It Cm cwmax Ar ac Ar count
 Set the CWmax channel access parameter to use for transmissions
 by the local station.
 On stations in a BSS this parameter is read-only and indicates
 the setting received from the access point.
 .It Cm txoplimit Ar ac Ar limit
 Set the Transmission Opportunity Limit channel access parameter
 to use for transmissions by the local station.
 This parameter defines an interval of time when a WME station
 has the right to initiate transmissions onto the wireless medium.
 On stations in a BSS this parameter is read-only and indicates
 the setting received from the access point.
 .It Cm bss:aifs Ar ac Ar count
 Set the AIFS channel access parameter to send to stations in a BSS.
 This parameter is meaningful only when operating in ap mode.
 .It Cm bss:cwmin Ar ac Ar count
 Set the CWmin channel access parameter to send to stations in a BSS.
 This parameter is meaningful only when operating in ap mode.
 .It Cm bss:cwmax Ar ac Ar count
 Set the CWmax channel access parameter to send to stations in a BSS.
 This parameter is meaningful only when operating in ap mode.
 .It Cm bss:txoplimit Ar ac Ar limit
 Set the TxOpLimit channel access parameter to send to stations in a BSS.
 This parameter is meaningful only when operating in ap mode.
 .El
 .It Cm wps
 Enable Wireless Privacy Subscriber support.
 Note that WPS support requires a WPS-capable supplicant.
 To disable this function use
 .Fl wps .
 .El
 .Pp
 The following parameters support an optional access control list
 feature available with some adapters when operating in ap mode; see
 .Xr wlan_acl 4 .
 This facility allows an access point to accept/deny association
 requests based on the MAC address of the station.
 Note that this feature does not significantly enhance security
 as MAC address spoofing is easy to do.
 .Bl -tag -width indent
 .It Cm mac:add Ar address
 Add the specified MAC address to the database.
 Depending on the policy setting association requests from the
 specified station will be allowed or denied.
 .It Cm mac:allow
 Set the ACL policy to permit association only by
 stations registered in the database.
 .It Cm mac:del Ar address
 Delete the specified MAC address from the database.
 .It Cm mac:deny
 Set the ACL policy to deny association only by
 stations registered in the database.
 .It Cm mac:kick Ar address
 Force the specified station to be deauthenticated.
 This typically is done to block a station after updating the
 address database.
 .It Cm mac:open
 Set the ACL policy to allow all stations to associate.
 .It Cm mac:flush
 Delete all entries in the database.
 .It Cm mac:radius
 Set the ACL policy to permit association only by
 stations approved by a RADIUS server.
 Note that this feature requires the
 .Xr hostapd 8
 program be configured to do the right thing
 as it handles the RADIUS processing
 (and marks stations as authorized).
 .El
 .Pp
 The following parameters are related to a wireless interface operating in mesh
 mode:
 .Bl -tag -width indent
 .It Cm meshid Ar meshid
 Set the desired Mesh Identifier.
 The Mesh ID is a string up to 32 characters in length.
 A mesh interface must have a Mesh Identifier specified
 to reach an operational state.
 .It Cm meshttl Ar ttl
 Set the desired ``time to live'' for mesh forwarded packets;
 this is the number of hops a packet may be forwarded before
 it is discarded.
 The default setting for
 .Cm meshttl
 is 31.
 .It Cm meshpeering
 Enable or disable peering with neighbor mesh stations.
 Stations must peer before any data packets can be exchanged.
 By default
 .Cm meshpeering
 is enabled.
 .It Cm meshforward
 Enable or disable forwarding packets by a mesh interface.
 By default
 .Cm meshforward
 is enabled.
 .It Cm meshgate
 This attribute specifies whether or not the mesh STA activates mesh gate
 announcements.
 By default
 .Cm meshgate
 is disabled.
 .It Cm meshmetric Ar protocol
 Set the specified
 .Ar protocol
 as the link metric protocol used on a mesh network.
 The default protocol is called
 .Ar AIRTIME .
 The mesh interface will restart after changing this setting.
 .It Cm meshpath Ar protocol
 Set the specified
 .Ar protocol
 as the path selection protocol used on a mesh network.
 The only available protocol at the moment is called
 .Ar HWMP
 (Hybrid Wireless Mesh Protocol).
 The mesh interface will restart after changing this setting.
 .It Cm hwmprootmode Ar mode
 Stations on a mesh network can operate as ``root nodes.''
 Root nodes try to find paths to all mesh nodes and advertise themselves
 regularly.
 When there is a root mesh node on a network, other mesh nodes can set up
 paths between themselves faster because they can use the root node
 to find the destination.
 This path may not be the best, but on-demand
 routing will eventually find the best path.
 The following modes are recognized:
 .Pp
 .Bl -tag -width ".Cm PROACTIVE" -compact
 .It Cm DISABLED
 Disable root mode.
 .It Cm NORMAL
 Send broadcast path requests every two seconds.
 Nodes on the mesh without a path to this root mesh station will try to
 discover a path to us.
 .It Cm PROACTIVE
 Send broadcast path requests every two seconds and every node must reply
 with a path reply even if it already has a path to this root mesh station.
 .It Cm RANN
 Send broadcast root announcement (RANN) frames.
 Nodes on the mesh without a path to this root mesh station will try to
 discover a path to us.
 .El
 By default
 .Cm hwmprootmode
 is set to
 .Ar DISABLED .
 .It Cm hwmpmaxhops Ar cnt
 Set the maximum number of hops allowed in an HWMP path to
 .Ar cnt .
 The default setting for
 .Cm hwmpmaxhops
 is 31.
 .El
 .Pp
 The following parameters are for compatibility with other systems:
 .Bl -tag -width indent
 .It Cm nwid Ar ssid
 Another name for the
 .Cm ssid
 parameter.
 Included for
 .Nx
 compatibility.
 .It Cm stationname Ar name
 Set the name of this station.
 The station name is not part of the IEEE 802.11
 protocol though some interfaces support it.
 As such it only
 seems to be meaningful to identical or virtually identical equipment.
 Setting the station name is identical in syntax to setting the SSID.
 One can also use
 .Cm station
 for
 .Bsx
 compatibility.
 .It Cm wep
 Another way of saying
 .Cm wepmode on .
 Included for
 .Bsx
 compatibility.
 .It Fl wep
 Another way of saying
 .Cm wepmode off .
 Included for
 .Bsx
 compatibility.
 .It Cm nwkey Ar key
 Another way of saying:
 .Dq Li "wepmode on weptxkey 1 wepkey 1:key wepkey 2:- wepkey 3:- wepkey 4:-" .
 Included for
 .Nx
 compatibility.
 .It Cm nwkey Xo
 .Sm off
 .Ar n : k1 , k2 , k3 , k4
 .Sm on
 .Xc
 Another way of saying
 .Dq Li "wepmode on weptxkey n wepkey 1:k1 wepkey 2:k2 wepkey 3:k3 wepkey 4:k4" .
 Included for
 .Nx
 compatibility.
 .It Fl nwkey
 Another way of saying
 .Cm wepmode off .
 Included for
 .Nx
 compatibility.
 .El
 .Pp
 The following parameters are specific to bridge interfaces:
 .Bl -tag -width indent
 .It Cm addm Ar interface
 Add the interface named by
 .Ar interface
 as a member of the bridge.
 The interface is put into promiscuous mode
 so that it can receive every packet sent on the network.
 .It Cm deletem Ar interface
 Remove the interface named by
 .Ar interface
 from the bridge.
 Promiscuous mode is disabled on the interface when
 it is removed from the bridge.
 .It Cm maxaddr Ar size
 Set the size of the bridge address cache to
 .Ar size .
 The default is 2000 entries.
 .It Cm timeout Ar seconds
 Set the timeout of address cache entries to
 .Ar seconds
 seconds.
 If
 .Ar seconds
 is zero, then address cache entries will not be expired.
 The default is 1200 seconds.
 .It Cm addr
 Display the addresses that have been learned by the bridge.
 .It Cm static Ar interface-name Ar address
 Add a static entry into the address cache pointing to
 .Ar interface-name .
 Static entries are never aged out of the cache or replaced, even if the
 address is seen on a different interface.
 .It Cm deladdr Ar address
 Delete
 .Ar address
 from the address cache.
 .It Cm flush
 Delete all dynamically-learned addresses from the address cache.
 .It Cm flushall
 Delete all addresses, including static addresses, from the address cache.
 .It Cm discover Ar interface
 Mark an interface as a
 .Dq discovering
 interface.
 When the bridge has no address cache entry
 (either dynamic or static)
 for the destination address of a packet,
 the bridge will forward the packet to all
 member interfaces marked as
 .Dq discovering .
 This is the default for all interfaces added to a bridge.
 .It Cm -discover Ar interface
 Clear the
 .Dq discovering
 attribute on a member interface.
 For packets without the
 .Dq discovering
 attribute, the only packets forwarded on the interface are broadcast
 or multicast packets and packets for which the destination address
 is known to be on the interface's segment.
 .It Cm learn Ar interface
 Mark an interface as a
 .Dq learning
 interface.
 When a packet arrives on such an interface, the source
 address of the packet is entered into the address cache as being a
 destination address on the interface's segment.
 This is the default for all interfaces added to a bridge.
 .It Cm -learn Ar interface
 Clear the
 .Dq learning
 attribute on a member interface.
 .It Cm sticky Ar interface
 Mark an interface as a
 .Dq sticky
 interface.
 Dynamically learned address entries are treated as static once entered into
 the cache.
 Sticky entries are never aged out of the cache or replaced, even if the
 address is seen on a different interface.
 .It Cm -sticky Ar interface
 Clear the
 .Dq sticky
 attribute on a member interface.
 .It Cm private Ar interface
 Mark an interface as a
 .Dq private
 interface.
 A private interface does not forward any traffic to any other port that is also
 a private interface.
 .It Cm -private Ar interface
 Clear the
 .Dq private
 attribute on a member interface.
 .It Cm span Ar interface
 Add the interface named by
 .Ar interface
 as a span port on the bridge.
 Span ports transmit a copy of every frame received by the bridge.
 This is most useful for snooping a bridged network passively on
 another host connected to one of the span ports of the bridge.
 .It Cm -span Ar interface
 Delete the interface named by
 .Ar interface
 from the list of span ports of the bridge.
 .It Cm stp Ar interface
 Enable Spanning Tree protocol on
 .Ar interface .
 The
 .Xr if_bridge 4
 driver has support for the IEEE 802.1D Spanning Tree protocol (STP).
 Spanning Tree is used to detect and remove loops in a network topology.
 .It Cm -stp Ar interface
 Disable Spanning Tree protocol on
 .Ar interface .
 This is the default for all interfaces added to a bridge.
 .It Cm edge Ar interface
 Set
 .Ar interface
 as an edge port.
 An edge port connects directly to end stations and cannot create bridging
 loops in the network; this allows it to transition straight to forwarding.
 .It Cm -edge Ar interface
 Disable edge status on
 .Ar interface .
 .It Cm autoedge Ar interface
 Allow
 .Ar interface
 to automatically detect edge status.
 This is the default for all interfaces added to a bridge.
 .It Cm -autoedge Ar interface
 Disable automatic edge status on
 .Ar interface .
 .It Cm ptp Ar interface
 Set the
 .Ar interface
 as a point to point link.
 This is required for straight transitions to forwarding and
 should be enabled on a direct link to another RSTP capable switch.
 .It Cm -ptp Ar interface
 Disable point to point link status on
 .Ar interface .
 This should be disabled for a half duplex link and for an interface
 connected to a shared network segment,
 like a hub or a wireless network.
 .It Cm autoptp Ar interface
 Automatically detect the point to point status on
 .Ar interface
 by checking the full duplex link status.
 This is the default for interfaces added to the bridge.
 .It Cm -autoptp Ar interface
 Disable automatic point to point link detection on
 .Ar interface .
 .It Cm maxage Ar seconds
 Set the time that a Spanning Tree protocol configuration is valid.
 The default is 20 seconds.
 The minimum is 6 seconds and the maximum is 40 seconds.
 .It Cm fwddelay Ar seconds
 Set the time that must pass before an interface begins forwarding
 packets when Spanning Tree is enabled.
 The default is 15 seconds.
 The minimum is 4 seconds and the maximum is 30 seconds.
 .It Cm hellotime Ar seconds
 Set the time between broadcasting of Spanning Tree protocol
 configuration messages.
 The hello time may only be changed when operating in legacy stp mode.
 The default is 2 seconds.
 The minimum is 1 second and the maximum is 2 seconds.
 .It Cm priority Ar value
 Set the bridge priority for Spanning Tree.
 The default is 32768.
 The minimum is 0 and the maximum is 61440.
 .It Cm proto Ar value
 Set the Spanning Tree protocol.
 The default is rstp.
 The available options are stp and rstp.
 .It Cm holdcnt Ar value
 Set the transmit hold count for Spanning Tree.
 This is the number of packets transmitted before being rate limited.
 The default is 6.
 The minimum is 1 and the maximum is 10.
 .It Cm ifpriority Ar interface Ar value
 Set the Spanning Tree priority of
 .Ar interface
 to
 .Ar value .
 The default is 128.
 The minimum is 0 and the maximum is 240.
 .It Cm ifpathcost Ar interface Ar value
 Set the Spanning Tree path cost of
 .Ar interface
 to
 .Ar value .
 The default is calculated from the link speed.
 To change a previously selected path cost back to automatic, set the
 cost to 0.
 The minimum is 1 and the maximum is 200000000.
 .It Cm ifmaxaddr Ar interface Ar size
 Set the maximum number of hosts allowed from an interface; packets with unknown
 source addresses are dropped until an existing host cache entry expires or is
 removed.
 Set to 0 to disable.
 .El
 .Pp
 The following parameters are specific to lagg interfaces:
 .Bl -tag -width indent
 .It Cm laggtype Ar type
 When creating a lagg interface the type can be specified as either
 .Cm ethernet
 or
 .Cm infiniband .
 If not specified ethernet is the default lagg type.
 .It Cm laggport Ar interface
 Add the interface named by
 .Ar interface
 as a port of the aggregation interface.
 .It Cm -laggport Ar interface
 Remove the interface named by
 .Ar interface
 from the aggregation interface.
 .It Cm laggproto Ar proto
 Set the aggregation protocol.
 The default is
 .Li failover .
 The available options are
 .Li failover ,
 .Li lacp ,
 .Li loadbalance ,
 .Li roundrobin ,
 .Li broadcast
 and
 .Li none .
 .It Cm lagghash Ar option Ns Oo , Ns Ar option Oc
 Set the packet layers to hash for aggregation protocols which load balance.
 The default is
 .Dq l2,l3,l4 .
 The options can be combined using commas.
 .Pp
 .Bl -tag -width ".Cm l2" -compact
 .It Cm l2
 src/dst mac address and optional vlan number.
 .It Cm l3
 src/dst address for IPv4 or IPv6.
 .It Cm l4
 src/dst port for TCP/UDP/SCTP.
 .El
 .It Cm -use_flowid
 Enable local hash computation for RSS hash on the interface.
 The
 .Li loadbalance
 and
 .Li lacp
 modes will use the RSS hash from the network card if available
 to avoid computing one; this may give poor traffic distribution
 if the hash is invalid or uses less of the protocol header information.
 .Cm -use_flowid
 disables use of RSS hash from the network card.
 The default value can be set via the
 .Va net.link.lagg.default_use_flowid
 .Xr sysctl 8
 variable.
 .Li 0
 means
 .Dq disabled
 and
 .Li 1
 means
 .Dq enabled .
 .It Cm use_flowid
 Use the RSS hash from the network card if available.
 .It Cm flowid_shift Ar number
 Set a shift parameter for RSS local hash computation.
 Hash is calculated by using flowid bits in a packet header mbuf
 which are shifted by the number of this parameter.
 .It Cm use_numa
 Enable selection of egress ports based on the native
 .Xr NUMA 4
 domain for the packets being transmitted.
 This is currently only implemented for lacp mode.
 This works only on
 .Xr NUMA 4
 hardware, running a kernel compiled with the
 .Xr NUMA 4
 option, and when interfaces from multiple
 .Xr NUMA 4
 domains are ports of the aggregation interface.
 .It Cm -use_numa
 Disable selection of egress ports based on the native
 .Xr NUMA 4
 domain for the packets being transmitted.
 .It Cm lacp_fast_timeout
 Enable lacp fast-timeout on the interface.
 .It Cm -lacp_fast_timeout
 Disable lacp fast-timeout on the interface.
 .It Cm lacp_strict
 Enable lacp strict compliance on the interface.
 The default value can be set via the
 .Va net.link.lagg.lacp.default_strict_mode
 .Xr sysctl 8
 variable.
 .Li 0
 means
 .Dq disabled
 and
 .Li 1
 means
 .Dq enabled .
 .It Cm -lacp_strict
 Disable lacp strict compliance on the interface.
 .It Cm rr_limit Ar number
 Configure a stride for an interface in round-robin mode.
 The default stride is 1.
 .El
 .Pp
 The following parameters apply to IP tunnel interfaces,
 .Xr gif 4 :
 .Bl -tag -width indent
 .It Cm tunnel Ar src_addr dest_addr
 Configure the physical source and destination address for IP tunnel
 interfaces.
 The arguments
 .Ar src_addr
 and
 .Ar dest_addr
 are interpreted as the outer source/destination for the encapsulating
 IPv4/IPv6 header.
 .It Fl tunnel
 Unconfigure the physical source and destination address for IP tunnel
 interfaces previously configured with
 .Cm tunnel .
 .It Cm deletetunnel
 Another name for the
 .Fl tunnel
 parameter.
 .It Cm accept_rev_ethip_ver
 Set a flag to accept both correct EtherIP packets and ones
 with reversed version field.
 Enabled by default.
 This is for backward compatibility with
 .Fx 6.1 ,
 6.2, 6.3, 7.0, and 7.1.
 .It Cm -accept_rev_ethip_ver
 Clear a flag
 .Cm accept_rev_ethip_ver .
 .It Cm ignore_source
 Set a flag to accept encapsulated packets destined to this host
 independently from source address.
 This may be useful for hosts that receive encapsulated packets
 from the load balancers.
 .It Cm -ignore_source
 Clear a flag
 .Cm ignore_source .
 .It Cm send_rev_ethip_ver
 Set a flag to send EtherIP packets with reversed version
 field intentionally.
 Disabled by default.
 This is for backward compatibility with
 .Fx 6.1 ,
 6.2, 6.3, 7.0, and 7.1.
 .It Cm -send_rev_ethip_ver
 Clear a flag
 .Cm send_rev_ethip_ver .
 .El
 .Pp
 The following parameters apply to GRE tunnel interfaces,
 .Xr gre 4 :
 .Bl -tag -width indent
 .It Cm tunnel Ar src_addr dest_addr
 Configure the physical source and destination address for GRE tunnel
 interfaces.
 The arguments
 .Ar src_addr
 and
 .Ar dest_addr
 are interpreted as the outer source/destination for the encapsulating
 IPv4/IPv6 header.
 .It Fl tunnel
 Unconfigure the physical source and destination address for GRE tunnel
 interfaces previously configured with
 .Cm tunnel .
 .It Cm deletetunnel
 Another name for the
 .Fl tunnel
 parameter.
 .It Cm grekey Ar key
 Configure the GRE key to be used for outgoing packets.
 Note that
 .Xr gre 4
 will always accept GRE packets with invalid or absent keys.
 This command will result in a four byte MTU reduction on the interface.
 .El
 .Pp
 The following parameters are specific to
 .Xr pfsync 4
 interfaces:
 .Bl -tag -width indent
 .It Cm syncdev Ar iface
 Use the specified interface
 to send and receive pfsync state synchronisation messages.
 .It Fl syncdev
 Stop sending pfsync state synchronisation messages over the network.
 .It Cm syncpeer Ar peer_address
 Make the pfsync link point-to-point rather than using
 multicast to broadcast the state synchronisation messages.
 The peer_address is the IP address of the other host taking part in
 the pfsync cluster.
 .It Fl syncpeer
 Broadcast the packets using multicast.
 .It Cm maxupd Ar n
 Set the maximum number of updates for a single state which
 can be collapsed into one.
 This is an 8-bit number; the default value is 128.
 .It Cm defer
 Defer transmission of the first packet in a state until a peer has
 acknowledged that the associated state has been inserted.
 .It Fl defer
 Do not defer the first packet in a state.
 This is the default.
 .El
 .Pp
 The following parameters are specific to
 .Xr vlan 4
 interfaces:
 .Bl -tag -width indent
 .It Cm vlan Ar vlan_tag
 Set the VLAN tag value to
 .Ar vlan_tag .
 This value is a 12-bit VLAN Identifier (VID) which is used to create an 802.1Q
 or 802.1ad VLAN header for packets sent from the
 .Xr vlan 4
 interface.
 Note that
 .Cm vlan
 and
 .Cm vlandev
 must both be set at the same time.
 .It Cm vlanproto Ar vlan_proto
 Set the VLAN encapsulation protocol to
 .Ar vlan_proto .
 Supported encapsulation protocols are currently
 .Dq 802.1Q
 and
 .Dq 802.1ad .
 The default encapsulation protocol is
 .Dq 802.1Q .
 .It Cm vlanpcp Ar priority_code_point
 Priority code point
 .Pq Dv PCP
 is a 3-bit field which refers to the IEEE 802.1p
 class of service and maps to the frame priority level.
 .Pp
 Values in order of priority are:
 .Cm 1
 .Pq Dv Background (lowest) ,
 .Cm 0
 .Pq Dv Best effort (default) ,
 .Cm 2
 .Pq Dv Excellent effort ,
 .Cm 3
 .Pq Dv Critical applications ,
 .Cm 4
 .Pq Dv Video, < 100ms latency ,
 .Cm 5
 .Pq Dv Video, < 10ms latency ,
 .Cm 6
 .Pq Dv Internetwork control ,
 .Cm 7
 .Pq Dv Network control (highest) .
 .It Cm vlandev Ar iface
 Associate the physical interface
 .Ar iface
 with a
 .Xr vlan 4
 interface.
 Packets transmitted through the
 .Xr vlan 4
 interface will be
 diverted to the specified physical interface
 .Ar iface
 with 802.1Q VLAN encapsulation.
 Packets with 802.1Q encapsulation received
 by the parent interface with the correct VLAN Identifier will be diverted to
 the associated
 .Xr vlan 4
 pseudo-interface.
 The
 .Xr vlan 4
 interface is assigned a
 copy of the parent interface's flags and the parent's Ethernet address.
 The
 .Cm vlandev
 and
 .Cm vlan
 must both be set at the same time.
 If the
 .Xr vlan 4
 interface already has
 a physical interface associated with it, this command will fail.
 To
 change the association to another physical interface, the existing
 association must be cleared first.
 .Pp
 Note: if the hardware tagging capability
 is set on the parent interface, the
 .Xr vlan 4
 pseudo
 interface's behavior changes:
 the
 .Xr vlan 4
 interface recognizes that the
 parent interface supports insertion and extraction of VLAN tags on its
 own (usually in firmware) and that it should pass packets to and from
 the parent unaltered.
 .It Fl vlandev Op Ar iface
 If the driver is a
 .Xr vlan 4
 pseudo device, disassociate the parent interface from it.
 This breaks the link between the
 .Xr vlan 4
 interface and its parent,
 clears its VLAN Identifier, flags and its link address and shuts the interface
 down.
 The
 .Ar iface
 argument is useless and hence deprecated.
 .El
 .Pp
 The following parameters are used to configure
 .Xr vxlan 4
 interfaces.
 .Bl -tag -width indent
 .It Cm vxlanid Ar identifier
 This value is a 24-bit VXLAN Network Identifier (VNI) that identifies the
 virtual network segment membership of the interface.
 .It Cm vxlanlocal Ar address
 The source address used in the encapsulating IPv4/IPv6 header.
 The address should already be assigned to an existing interface.
 When the interface is configured in unicast mode, the listening socket
 is bound to this address.
 .It Cm vxlanremote Ar address
 The interface can be configured in a unicast, or point-to-point, mode
 to create a tunnel between two hosts.
 This is the IP address of the remote end of the tunnel.
 .It Cm vxlangroup Ar address
 The interface can be configured in a multicast mode
 to create a virtual network of hosts.
 This is the IP multicast group address the interface will join.
 .It Cm vxlanlocalport Ar port
 The port number the interface will listen on.
 The default port number is 4789.
 .It Cm vxlanremoteport Ar port
 The destination port number used in the encapsulating IPv4/IPv6 header.
 The remote host should be listening on this port.
 The default port number is 4789.
 Note some other implementations, such as Linux,
 do not default to the IANA assigned port,
 but instead listen on port 8472.
 .It Cm vxlanportrange Ar low high
 The range of source ports used in the encapsulating IPv4/IPv6 header.
 The port selected within the range is based on a hash of the inner frame.
 A range is useful to provide entropy within the outer IP header
 for more effective load balancing.
 The default range is between the
 .Xr sysctl 8
 variables
 .Va net.inet.ip.portrange.first
 and
 .Va net.inet.ip.portrange.last .
 .It Cm vxlantimeout Ar timeout
 The maximum time, in seconds, before an entry in the forwarding table
 is pruned.
 The default is 1200 seconds (20 minutes).
 .It Cm vxlanmaxaddr Ar max
 The maximum number of entries in the forwarding table.
 The default is 2000.
 .It Cm vxlandev Ar dev
 When the interface is configured in multicast mode, the
 .Ar dev
 interface is used to transmit IP multicast packets.
 .It Cm vxlanttl Ar ttl
 The TTL used in the encapsulating IPv4/IPv6 header.
 The default is 64.
 .It Cm vxlanlearn
 The source IP address and inner source Ethernet MAC address of
 received packets are used to dynamically populate the forwarding table.
 When in multicast mode, an entry in the forwarding table allows the
 interface to send the frame directly to the remote host instead of
 broadcasting the frame to the multicast group.
 This is the default.
 .It Fl vxlanlearn
 The forwarding table is not populated by received packets.
 .It Cm vxlanflush
 Delete all dynamically-learned addresses from the forwarding table.
 .It Cm vxlanflushall
 Delete all addresses, including static addresses, from the forwarding table.
 .El
 .Pp
 The following parameters are used to configure
 .Xr carp 4
 protocol on an interface:
 .Bl -tag -width indent
 .It Cm vhid Ar n
 Set the virtual host ID.
 This is a required setting to initiate
 .Xr carp 4 .
 If the virtual host ID does not exist yet, it is created and attached to the
 interface, otherwise configuration of an existing vhid is adjusted.
 If the
 .Cm vhid
 keyword is supplied along with an
 .Dq inet6
 or
 .Dq inet
 address, then this address is configured to be run under control of the
 specified vhid.
 Whenever the last address that refers to a particular vhid is removed from an
 interface, the vhid is automatically removed from the interface and destroyed.
 Any other configuration parameters for the
 .Xr carp 4
 protocol should be supplied along with the
 .Cm vhid
 keyword.
 Acceptable values for vhid are 1 to 255.
 .It Cm advbase Ar seconds
 Specifies the base of the advertisement interval in seconds.
 The acceptable values are 1 to 255.
 The default value is 1.
 .It Cm advskew Ar interval
 Specifies the skew to add to the base advertisement interval to
 make one host advertise slower than another host.
 It is specified in 1/256 of seconds.
 The acceptable values are 0 to 254.
 The default value is 0.
 .It Cm pass Ar phrase
 Set the authentication key to
 .Ar phrase .
 .It Cm state Ar MASTER|BACKUP
 Forcibly change state of a given vhid.
 .El
 .Pp
 The
 .Nm
 utility displays the current configuration for a network interface
 when no optional parameters are supplied.
 If a protocol family is specified,
 .Nm
 will report only the details specific to that protocol family.
 .Pp
 If the
 .Fl m
 flag is passed before an interface name,
 .Nm
 will display the capability list and all
 of the supported media for the specified interface.
 If the
 .Fl L
 flag is supplied, address lifetime is displayed for IPv6 addresses,
 as time offset string.
 .Pp
 Optionally, the
 .Fl a
 flag may be used instead of an interface name.
 This flag instructs
 .Nm
 to display information about all interfaces in the system.
 The
 .Fl d
 flag limits this to interfaces that are down,
 .Fl u
 limits this to interfaces that are up,
 .Fl g
 limits this to members of the specified group of interfaces, and
 .Fl G
 excludes members of the specified group from the list.
 Both
 .Fl g
 and
 .Fl G
 flags may be specified to apply both conditions.
 Only one
 .Fl g
 option should be specified, as later ones override previous ones
 (the same applies to
 .Fl G ) .
 .Sy groupname
 may contain shell patterns in which case it should be quoted.
 When no arguments are given,
 .Fl a
 is implied.
 .Pp
 The
 .Fl l
 flag may be used to list all available interfaces on the system, with
 no other additional information.
 If an
 .Ar address_family
 is specified, only interfaces of that type will be listed.
 .Fl l Dq ether
 will list only Ethernet adapters, excluding the loopback interface.
 Use of this flag is mutually exclusive
 with all other flags and commands, except for
 .Fl d
 (only list interfaces that are down)
 and
 .Fl u
 (only list interfaces that are up).
 .Pp
 The
 .Fl v
 flag may be used to get more verbose status for an interface.
 .Pp
 The
 .Fl C
 flag may be used to list all of the interface cloners available on
 the system, with no additional information.
 Use of this flag is mutually exclusive with all other flags and commands.
 .Pp
 The
 .Fl k
 flag causes keying information for the interface, if available, to be
 printed.
 For example, the values of 802.11 WEP keys and
 .Xr carp 4
 passphrases will be printed, if accessible to the current user.
 This information is not printed by default, as it may be considered
 sensitive.
 .Pp
 If the network interface driver is not present in the kernel then
 .Nm
 will attempt to load it.
 The
 .Fl n
 flag disables this behavior.
 .Pp
 Only the super-user may modify the configuration of a network interface.
 .Sh EXAMPLES
 Assign the IPv4 address
 .Li 192.0.2.10 ,
 with a network mask of
 .Li 255.255.255.0 ,
 to the interface
 .Li em0 :
 .Dl # ifconfig em0 inet 192.0.2.10 netmask 255.255.255.0
 .Pp
 Add the IPv4 address
 .Li 192.0.2.45 ,
 with the CIDR network prefix
 .Li /28 ,
 to the interface
 .Li em0 ,
 using
 .Cm add
 as a synonym for the canonical form of the option
 .Cm alias :
 .Dl # ifconfig em0 inet 192.0.2.45/28 add
 .Pp
 Remove the IPv4 address
 .Li 192.0.2.45
 from the interface
 .Li em0 :
 .Dl # ifconfig em0 inet 192.0.2.45 -alias
 .Pp
 Enable IPv6 functionality of the interface:
 .Dl # ifconfig em0 inet6 -ifdisabled
 .Pp
 Add the IPv6 address
 .Li 2001:DB8:BDBD::123/48
 to the interface
 .Li em0 :
 .Dl # ifconfig em0 inet6 2001:db8:bdbd::123 prefixlen 48 alias
 Note that lower case hexadecimal IPv6 addresses are acceptable.
 .Pp
 Remove the IPv6 address added in the above example,
 using the
 .Li /
 character as shorthand for the network prefix,
 and using
 .Cm delete
 as a synonym for the canonical form of the option
 .Fl alias :
 .Dl # ifconfig em0 inet6 2001:db8:bdbd::123/48 delete
 .Pp
 Configure a single CARP redundant address on igb0, and then switch it
 to be master:
 .Dl # ifconfig igb0 vhid 1 10.0.0.1/24 pass foobar up
 .Dl # ifconfig igb0 vhid 1 state master
 .Pp
 Configure the interface
 .Li xl0 ,
 to use 100baseTX, full duplex Ethernet media options:
 .Dl # ifconfig xl0 media 100baseTX mediaopt full-duplex
 .Pp
 Label the em0 interface as an uplink:
 .Dl # ifconfig em0 description \&"Uplink to Gigabit Switch 2\&"
 .Pp
 Create the software network interface
 .Li gif1 :
 .Dl # ifconfig gif1 create
 .Pp
 Destroy the software network interface
 .Li gif1 :
 .Dl # ifconfig gif1 destroy
 .Pp
 Display available wireless networks using
 .Li wlan0 :
 .Dl # ifconfig wlan0 list scan
 .Pp
 Display inet and inet6 address subnet masks in CIDR notation:
 .Dl # ifconfig -f inet:cidr,inet6:cidr
 .Pp
 Display interfaces that are up with the exception of loopback:
 .Dl # ifconfig -a -u -G lo
 .Sh DIAGNOSTICS
 Messages indicating the specified interface does not exist, the
 requested address is unknown, or the user is not privileged and
 tried to alter an interface's configuration.
 .Sh SEE ALSO
 .Xr netstat 1 ,
 .Xr carp 4 ,
 .Xr gif 4 ,
 .Xr netintro 4 ,
 .Xr pfsync 4 ,
 .Xr polling 4 ,
 .Xr vlan 4 ,
 .Xr vxlan 4 ,
 .Xr devd.conf 5 ,
 .\" .Xr eon 5 ,
 .Xr devd 8 ,
 .Xr jail 8 ,
 .Xr rc 8 ,
 .Xr routed 8 ,
 .Xr sysctl 8
 .Sh HISTORY
 The
 .Nm
 utility appeared in
 .Bx 4.2 .
 .Sh BUGS
 Basic IPv6 node operation requires a link-local address on each
 interface configured for IPv6.
 Normally, such an address is automatically configured by the
 kernel on each interface added to the system or enabled; this behavior may
 be disabled by setting the per-interface flag
 .Cm -auto_linklocal .
 The default value of this flag is 1; this default can be disabled by using
 the sysctl MIB variable
 .Va net.inet6.ip6.auto_linklocal .
 .Pp
 Do not configure IPv6 addresses with no link-local address by using
 .Nm .
 It can result in unexpected behaviors of the kernel.
diff --git a/sbin/ifconfig/ifconfig.c b/sbin/ifconfig/ifconfig.c
index df3514ef5bd8..e47d0000c7ab 100644
--- a/sbin/ifconfig/ifconfig.c
+++ b/sbin/ifconfig/ifconfig.c
@@ -1,1704 +1,1706 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1983, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #ifndef lint
 static const char copyright[] =
 "@(#) Copyright (c) 1983, 1993\n\
 	The Regents of the University of California.  All rights reserved.\n";
 #endif /* not lint */
 
 #ifndef lint
 #if 0
 static char sccsid[] = "@(#)ifconfig.c	8.2 (Berkeley) 2/16/94";
 #endif
 static const char rcsid[] =
   "$FreeBSD$";
 #endif /* not lint */
 
 #include <sys/param.h>
 #include <sys/ioctl.h>
 #include <sys/module.h>
 #include <sys/linker.h>
 #include <sys/queue.h>
 #include <sys/socket.h>
 #include <sys/time.h>
 
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_dl.h>
 #include <net/if_types.h>
 #include <net/route.h>
 
 /* IP */
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <arpa/inet.h>
 #include <netdb.h>
 
 #include <fnmatch.h>
 #include <ifaddrs.h>
 #include <ctype.h>
 #include <err.h>
 #include <errno.h>
 #include <fcntl.h>
 #ifdef JAIL
 #include <jail.h>
 #endif
 #include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 
 #include "ifconfig.h"
 
 /*
  * Since "struct ifreq" is composed of various union members, callers
  * should pay special attention when interpreting the value
  * (e.g., little/big endian differences in the structure).
  */
 struct	ifreq ifr;
 
 char	name[IFNAMSIZ];		/* name of the interface being processed */
 char	*descr = NULL;
 size_t	descrlen = 64;
 int	setaddr;		/* bumped by setifaddr() per address argument */
 int	setmask;		/* bumped by setifnetmask() */
 int	doalias;		/* last parameter value seen by notealias() */
 int	clearaddr;		/* ifconfig() issues af_difaddr when set */
 int	newaddr = 1;		/* ifconfig() may issue af_aifaddr when set */
 int	verbose;		/* -v count */
 int	noload;			/* -n count: suppress module loading */
 int	printifname = 0;	/* printifnamemaybe() prints name when set */
 
 int	supmedia = 0;		/* set by -m */
 int	printkeys = 0;		/* Print keying material for interfaces. */
 int	exit_code = 0;		/* value handed to exit() by main() */
 
 /* Formatter Strings */
 char	*f_inet, *f_inet6, *f_ether, *f_addr;
 
 static	bool group_member(const char *ifname, const char *match,
 		const char *nomatch);
 static	int ifconfig(int argc, char *const *argv, int iscreate,
 		const struct afswtch *afp);
 static	void status(const struct afswtch *afp, const struct sockaddr_dl *sdl,
 		struct ifaddrs *ifa);
 static	void tunnel_status(int s);
 static _Noreturn void usage(void);
 
 static int getifflags(const char *ifname, int us);
 
 static struct afswtch *af_getbyname(const char *name);
 static struct afswtch *af_getbyfamily(int af);
 static void af_other_status(int);
 
 void printifnamemaybe(void);
 
 static struct option *opts = NULL;
 
 /*
  * Per-interface ordering record built by calcorders() and consulted by
  * cmpifaddrs() while sorting the getifaddrs() list.
  */
 struct ifa_order_elt {
 	int if_order;		/* rank of the interface in the original list */
 	int af_orders[255];	/* first-seen rank per address family; 0 = unseen */
 	struct ifaddrs *ifa;	/* first list entry seen for this interface */
 	TAILQ_ENTRY(ifa_order_elt) link;
 };
 
 /* Queue of ordering records, one per run of same-named list entries. */
 TAILQ_HEAD(ifa_queue, ifa_order_elt);
 
 /*
  * Explicit interface-name to kernel-module-name mappings.
  * NOTE(review): the consumer is outside this chunk; presumably the
  * module-loading code consults this table before guessing a name.
  */
 static struct module_map_entry {
 	const char *ifname;	/* interface name to match */
 	const char *kldname;	/* module name to use for that interface */
 } module_map[] = {
 	{
 		.ifname = "tun",
 		.kldname = "if_tuntap",
 	},
 	{
 		.ifname = "tap",
 		.kldname = "if_tuntap",
 	},
 	{
 		.ifname = "vmnet",
 		.kldname = "if_tuntap",
 	},
 	{
 		.ifname = "ipsec",
 		.kldname = "ipsec",
 	},
 	{
 		/*
 		 * This mapping exists because there is a conflicting enc module
 		 * in CAM.  ifconfig's guessing behavior will attempt to match
 		 * the ifname to a module as well as if_${ifname} and clash with
 		 * CAM enc.  This is an assertion of the correct module to load.
 		 */
 		.ifname = "enc",
 		.kldname = "if_enc",
 	},
 };
 
 
 /*
  * Register an extra command-line option by pushing it onto the head
  * of the global option list.
  */
 void
 opt_register(struct option *p)
 {
 	struct option *oldhead = opts;
 
 	opts = p;
 	p->next = oldhead;
 }
 
 static void
 usage(void)
 {
 	char options[1024];
 	struct option *p;
 
 	/* XXX not right but close enough for now */
 	options[0] = '\0';
 	for (p = opts; p != NULL; p = p->next) {
 		strlcat(options, p->opt_usage, sizeof(options));
 		strlcat(options, " ", sizeof(options));
 	}
 
 	fprintf(stderr,
 	"usage: ifconfig [-f type:format] %sinterface address_family\n"
 	"                [address [dest_address]] [parameters]\n"
 	"       ifconfig interface create\n"
 	"       ifconfig -a %s[-d] [-m] [-u] [-v] [address_family]\n"
 	"       ifconfig -l [-d] [-u] [address_family]\n"
 	"       ifconfig %s[-d] [-m] [-u] [-v]\n",
 		options, options, options);
 	exit(1);
 }
 
 /*
  * Element count of a true array (not a pointer).  The expansion and the
  * argument are fully parenthesized so the macro is safe in any expression.
  */
 #define ORDERS_SIZE(x) (sizeof(x) / sizeof((x)[0]))
 
 /*
  * Walk the address list and append one ordering record to q for every
  * run of consecutive entries sharing an interface name.  Each record
  * stores the rank of the interface (if_order) and, per address family,
  * the rank at which that family first appeared (af_orders[], 1-based;
  * 0 means the family was not seen).
  *
  * Returns 0 on success, or -1 if a record could not be allocated; records
  * already queued are left on q in that case.
  */
 static int
 calcorders(struct ifaddrs *ifa, struct ifa_queue *q)
 {
 	struct ifaddrs *prev;
 	struct ifa_order_elt *cur;
 	unsigned int ord, af, ifa_ord;
 
 	prev = NULL;
 	cur = NULL;
 	ord = 0;
 	ifa_ord = 0;
 
 	while (ifa != NULL) {
 		/* A name change (or the first entry) starts a new record. */
 		if (prev == NULL ||
 		    strcmp(ifa->ifa_name, prev->ifa_name) != 0) {
 			cur = calloc(1, sizeof(*cur));
 
 			if (cur == NULL)
 				return (-1);
 
 			TAILQ_INSERT_TAIL(q, cur, link);
 			cur->if_order = ifa_ord ++;
 			cur->ifa = ifa;
 			ord = 0;
 		}
 
 		if (ifa->ifa_addr) {
 			af = ifa->ifa_addr->sa_family;
 
 			/* Only the first occurrence of a family gets a rank. */
 			if (af < ORDERS_SIZE(cur->af_orders) &&
 			    cur->af_orders[af] == 0)
 				cur->af_orders[af] = ++ord;
 		}
 		prev = ifa;
 		ifa = ifa->ifa_next;
 	}
 
 	return (0);
 }
 
 /*
  * Comparison callback for sortifaddrs().  Entries of different interfaces
  * are ordered by the interface rank recorded in q; entries of the same
  * interface are ordered by the recorded rank of their address family.
  * Returns 0 whenever the needed ordering record or address is missing.
  */
 static int
 cmpifaddrs(struct ifaddrs *a, struct ifaddrs *b, struct ifa_queue *q)
 {
 	struct ifa_order_elt *elt, *ea, *eb;
 	unsigned int fam_a, fam_b;
 
 	ea = NULL;
 	eb = NULL;
 
 	if (strcmp(a->ifa_name, b->ifa_name) != 0) {
 		/* Different interfaces: locate both ordering records. */
 		TAILQ_FOREACH(elt, q, link) {
 			if (ea != NULL && eb != NULL)
 				break;
 
 			if (strcmp(elt->ifa->ifa_name, a->ifa_name) == 0)
 				ea = elt;
 			else if (strcmp(elt->ifa->ifa_name, b->ifa_name) == 0)
 				eb = elt;
 		}
 
 		if (ea == NULL || eb == NULL)
 			return (0);
 		return (ea->if_order - eb->if_order);
 	}
 
 	/* Same interface: without both addresses there is nothing to rank. */
 	if (a->ifa_addr == NULL || b->ifa_addr == NULL)
 		return (0);
 
 	TAILQ_FOREACH(elt, q, link) {
 		if (strcmp(elt->ifa->ifa_name, a->ifa_name) == 0) {
 			ea = elt;
 			break;
 		}
 	}
 	if (ea == NULL)
 		return (0);
 
 	fam_a = a->ifa_addr->sa_family;
 	fam_b = b->ifa_addr->sa_family;
 	if (fam_a >= ORDERS_SIZE(ea->af_orders) ||
 	    fam_b >= ORDERS_SIZE(ea->af_orders))
 		return (0);
 
 	return (ea->af_orders[fam_a] - ea->af_orders[fam_b]);
 }
 
 /*
  * Release the per-category output format strings.  free(NULL) is a
  * no-op, so unset formats need no special handling.
  */
 static void
 freeformat(void)
 {
 	free(f_inet);
 	free(f_inet6);
 	free(f_ether);
 	free(f_addr);
 }
 
 /*
  * Parse a comma-separated list of "category:modifier" format
  * specifications and store a copy of each modifier in the matching
  * global (f_addr, f_ether, f_inet, f_inet6).  Specifications lacking a
  * non-empty modifier are reported and skipped; unknown categories are
  * silently ignored.  A later specification for a category replaces the
  * earlier one.  Exits if memory cannot be allocated.
  */
 static void setformat(char *input)
 {
 	char	*formatstr, *cursor, *category, *modifier, *copy;
 	char	**slot;
 
 	formatstr = strdup(input);
 	if (formatstr == NULL)
 		err(1, "strdup");
 
 	/*
 	 * strsep() advances its argument and leaves it NULL at the end, so
 	 * iterate with a separate cursor and keep formatstr for free().
 	 */
 	cursor = formatstr;
 	while ((category = strsep(&cursor, ",")) != NULL) {
 		modifier = strchr(category, ':');
 		if (modifier == NULL || modifier[1] == '\0') {
 			/* warnx() supplies the trailing newline itself. */
 			warnx("Skipping invalid format specification: %s",
 			    category);
 			continue;
 		}
 
 		/* Split the string on the separator, then seek past it */
 		modifier[0] = '\0';
 		modifier++;
 
 		if (strcmp(category, "addr") == 0)
 			slot = &f_addr;
 		else if (strcmp(category, "ether") == 0)
 			slot = &f_ether;
 		else if (strcmp(category, "inet") == 0)
 			slot = &f_inet;
 		else if (strcmp(category, "inet6") == 0)
 			slot = &f_inet6;
 		else
 			continue;
 
 		copy = strdup(modifier);
 		if (copy == NULL)
 			err(1, "strdup");
 		/* Drop any value set earlier (environment, repeated -f). */
 		free(*slot);
 		*slot = copy;
 	}
 	free(formatstr);
 }
 
 #undef ORDERS_SIZE
 
 /*
  * Merge sort a singly linked ifaddrs list using the supplied comparison
  * callback; q is passed through to the callback untouched.  On ties the
  * entry from the first half is taken first.  Returns the new list head.
  */
 static struct ifaddrs *
 sortifaddrs(struct ifaddrs *list,
     int (*compare)(struct ifaddrs *, struct ifaddrs *, struct ifa_queue *),
     struct ifa_queue *q)
 {
 	struct ifaddrs *slow, *fast, *cut, *merged;
 	struct ifaddrs **link, **pick;
 
 	/* Zero or one element: nothing to sort. */
 	if (list == NULL || list->ifa_next == NULL)
 		return (list);
 
 	/* Find the midpoint: slow advances one step per two steps of fast. */
 	cut = list;
 	slow = list;
 	for (fast = list; fast != NULL && fast->ifa_next != NULL;
 	    fast = fast->ifa_next->ifa_next) {
 		cut = slow;
 		slow = slow->ifa_next;
 	}
 
 	/* Detach the second half and sort both halves. */
 	cut->ifa_next = NULL;
 	list = sortifaddrs(list, compare, q);
 	slow = sortifaddrs(slow, compare, q);
 
 	/* Merge, appending through a pointer to the current tail link. */
 	merged = NULL;
 	link = &merged;
 	while (list != NULL || slow != NULL) {
 		if (slow == NULL)
 			pick = &list;
 		else if (list == NULL)
 			pick = &slow;
 		else if (compare(list, slow, q) <= 0)
 			pick = &list;
 		else
 			pick = &slow;
 
 		*link = *pick;
 		link = &(*pick)->ifa_next;
 		*pick = (*pick)->ifa_next;
 	}
 
 	return (merged);
 }
 
 /*
  * Print the current interface name on its own line when printifname is
  * set.  main() registers this with atexit().
  */
 void
 printifnamemaybe(void)
 {
 	if (!printifname)
 		return;
 	printf("%s\n", name);
 }
 
 /*
  * Program entry point.  Parses the leading flags, then either
  *  - runs ifconfig() once for a single named interface when configuration
  *    arguments follow the name, or
  *  - fetches and sorts the system address list and, for every interface
  *    that passes the -d/-u/-g/-G filters, prints its name (-l), applies
  *    the remaining arguments via ifconfig(), or prints its status.
  * Exits with exit_code.
  */
 int
 main(int argc, char *argv[])
 {
 	int c, all, namesonly, downonly, uponly;
 	const struct afswtch *afp = NULL;
 	int ifindex;
 	struct ifaddrs *ifap, *sifap, *ifa;
 	struct ifreq paifr;
 	const struct sockaddr_dl *sdl;
 	char options[1024], *cp, *envformat, *namecp = NULL;
 	struct ifa_queue q = TAILQ_HEAD_INITIALIZER(q);
 	struct ifa_order_elt *cur, *tmp;
 	const char *ifname, *matchgroup, *nogroup;
 	struct option *p;
 	size_t iflen;
 	int flags;
 
 	all = downonly = uponly = namesonly = noload = verbose = 0;
 	f_inet = f_inet6 = f_ether = f_addr = NULL;
 	matchgroup = nogroup = NULL;
 
 	/* Formats from the environment are applied first; -f is parsed later. */
 	envformat = getenv("IFCONFIG_FORMAT");
 	if (envformat != NULL)
 		setformat(envformat);
 
 	/*
 	 * Ensure we print interface name when expected to,
 	 * even if we terminate early due to error.
 	 */
 	atexit(printifnamemaybe);
 
 	/* Parse leading line options */
 	strlcpy(options, "G:adf:klmnuv", sizeof(options));
 	for (p = opts; p != NULL; p = p->next)
 		strlcat(options, p->opt, sizeof(options));
 	while ((c = getopt(argc, argv, options)) != -1) {
 		switch (c) {
 		case 'a':	/* scan all interfaces */
 			all++;
 			break;
 		case 'd':	/* restrict scan to "down" interfaces */
 			downonly++;
 			break;
 		case 'f':
 			if (optarg == NULL)
 				usage();
 			setformat(optarg);
 			break;
 		case 'G':
 			/* Only accepted once -a has already been seen. */
 			if (optarg == NULL || all == 0)
 				usage();
 			nogroup = optarg;
 			break;
 		case 'k':
 			printkeys++;
 			break;
 		case 'l':	/* scan interface names only */
 			namesonly++;
 			break;
 		case 'm':	/* show media choices in status */
 			supmedia = 1;
 			break;
 		case 'n':	/* suppress module loading */
 			noload++;
 			break;
 		case 'u':	/* restrict scan to "up" interfaces */
 			uponly++;
 			break;
 		case 'v':
 			verbose++;
 			break;
 		case 'g':
 			/*
 			 * After -a, -g selects a group to match; otherwise it
 			 * is handed to the registered option handlers below.
 			 */
 			if (all) {
 				if (optarg == NULL)
 					usage();
 				matchgroup = optarg;
 				break;
 			}
 			/* FALLTHROUGH */
 		default:
 			for (p = opts; p != NULL; p = p->next)
 				if (p->opt[0] == c) {
 					p->cb(optarg);
 					break;
 				}
 			if (p == NULL)
 				usage();
 			break;
 		}
 	}
 	argc -= optind;
 	argv += optind;
 
 	/* -l cannot be used with -a or -m */
 	if (namesonly && (all || supmedia))
 		usage();
 
 	/* nonsense.. */
 	if (uponly && downonly)
 		usage();
 
 	/* no arguments is equivalent to '-a' */
 	if (!namesonly && argc < 1)
 		all = 1;
 
 	/* -a and -l allow an address family arg to limit the output */
 	if (all || namesonly) {
 		if (argc > 1)
 			usage();
 
 		ifname = NULL;
 		ifindex = 0;
 		if (argc == 1) {
 			afp = af_getbyname(*argv);
 			if (afp == NULL) {
 				warnx("Address family '%s' unknown.", *argv);
 				usage();
 			}
 			if (afp->af_name != NULL)
 				argc--, argv++;
 			/* leave with afp non-zero */
 		}
 	} else {
 		/* not listing, need an argument */
 		if (argc < 1)
 			usage();
 
 		ifname = *argv;
 		argc--, argv++;
 
 		/* check and maybe load support for this interface */
 		ifmaybeload(ifname);
 
 		ifindex = if_nametoindex(ifname);
 		if (ifindex == 0) {
 			/*
 			 * NOTE:  We must special-case the `create' command
 			 * right here as we would otherwise fail when trying
 			 * to find the interface.
 			 */
 			if (argc > 0 && (strcmp(argv[0], "create") == 0 ||
 			    strcmp(argv[0], "plumb") == 0)) {
 				iflen = strlcpy(name, ifname, sizeof(name));
 				if (iflen >= sizeof(name))
 					errx(1, "%s: cloning name too long",
 					    ifname);
 				ifconfig(argc, argv, 1, NULL);
 				exit(exit_code);
 			}
 #ifdef JAIL
 			/*
 			 * NOTE:  We have to special-case the `-vnet' command
 			 * right here as we would otherwise fail when trying
 			 * to find the interface as it lives in another vnet.
 			 */
 			if (argc > 0 && (strcmp(argv[0], "-vnet") == 0)) {
 				iflen = strlcpy(name, ifname, sizeof(name));
 				if (iflen >= sizeof(name))
 					errx(1, "%s: interface name too long",
 					    ifname);
 				ifconfig(argc, argv, 0, NULL);
 				exit(exit_code);
 			}
 #endif
 			errx(1, "interface %s does not exist", ifname);
 		} else {
 			/*
 			 * Do not allow use `create` command as hostname if
 			 * address family is not specified.
 			 */
 			if (argc > 0 && (strcmp(argv[0], "create") == 0 ||
 			    strcmp(argv[0], "plumb") == 0)) {
 				if (argc == 1)
 					errx(1, "interface %s already exists",
 					    ifname);
 				argc--, argv++;
 			}
 		}
 	}
 
 	/* Check for address family */
 	if (argc > 0) {
 		afp = af_getbyname(*argv);
 		if (afp != NULL)
 			argc--, argv++;
 	}
 
 	/*
 	 * Check for a requested configuration action on a single interface,
 	 * which doesn't require building, sorting, and searching the entire
 	 * system address list
 	 */
 	if ((argc > 0) && (ifname != NULL)) {
 		iflen = strlcpy(name, ifname, sizeof(name));
 		if (iflen >= sizeof(name)) {
 			warnx("%s: interface name too long, skipping", ifname);
 		} else {
 			flags = getifflags(name, -1);
 			if (!(((flags & IFF_CANTCONFIG) != 0) ||
 				(downonly && (flags & IFF_UP) != 0) ||
 				(uponly && (flags & IFF_UP) == 0)))
 				ifconfig(argc, argv, 0, afp);
 		}
 		goto done;
 	}
 
 	if (getifaddrs(&ifap) != 0)
 		err(EXIT_FAILURE, "getifaddrs");
 
 	cp = NULL;
 	
 	if (calcorders(ifap, &q) != 0)
 		err(EXIT_FAILURE, "calcorders");
 		
 	sifap = sortifaddrs(ifap, cmpifaddrs, &q);
 
 	/* The ordering records are only needed while sorting. */
 	TAILQ_FOREACH_SAFE(cur, &q, link, tmp)
 		free(cur);
 
 	ifindex = 0;
 	for (ifa = sifap; ifa; ifa = ifa->ifa_next) {
 		/*
 		 * NOTE(review): ifa->ifa_addr is dereferenced here and below
 		 * without a NULL check, although calcorders() and
 		 * cmpifaddrs() both guard against a NULL ifa_addr -- confirm
 		 * whether getifaddrs() can return such entries.
 		 */
 		memset(&paifr, 0, sizeof(paifr));
 		strlcpy(paifr.ifr_name, ifa->ifa_name, sizeof(paifr.ifr_name));
 		if (sizeof(paifr.ifr_addr) >= ifa->ifa_addr->sa_len) {
 			memcpy(&paifr.ifr_addr, ifa->ifa_addr,
 			    ifa->ifa_addr->sa_len);
 		}
 
 		if (ifname != NULL && strcmp(ifname, ifa->ifa_name) != 0)
 			continue;
 		if (ifa->ifa_addr->sa_family == AF_LINK)
 			sdl = (const struct sockaddr_dl *) ifa->ifa_addr;
 		else
 			sdl = NULL;
 		/* Outside -l mode each interface is handled once, on its first entry. */
 		if (cp != NULL && strcmp(cp, ifa->ifa_name) == 0 && !namesonly)
 			continue;
 		iflen = strlcpy(name, ifa->ifa_name, sizeof(name));
 		if (iflen >= sizeof(name)) {
 			warnx("%s: interface name too long, skipping",
 			    ifa->ifa_name);
 			continue;
 		}
 		cp = ifa->ifa_name;
 
 		if ((ifa->ifa_flags & IFF_CANTCONFIG) != 0)
 			continue;
 		if (downonly && (ifa->ifa_flags & IFF_UP) != 0)
 			continue;
 		if (uponly && (ifa->ifa_flags & IFF_UP) == 0)
 			continue;
 		if (!group_member(ifa->ifa_name, matchgroup, nogroup))
 			continue;
 		/*
 		 * Are we just listing the interfaces?
 		 */
 		if (namesonly) {
 			/* namecp remembers the last printed name to avoid repeats. */
 			if (namecp == cp)
 				continue;
 			if (afp != NULL) {
 				/* special case for "ether" address family */
 				if (!strcmp(afp->af_name, "ether")) {
 					if (sdl == NULL ||
 					    (sdl->sdl_type != IFT_ETHER &&
 					    sdl->sdl_type != IFT_L2VLAN &&
 					    sdl->sdl_type != IFT_BRIDGE) ||
 					    sdl->sdl_alen != ETHER_ADDR_LEN)
 						continue;
 				} else {
 					if (ifa->ifa_addr->sa_family 
 					    != afp->af_af)
 						continue;
 				}
 			}
 			namecp = cp;
 			ifindex++;
 			if (ifindex > 1)
 				printf(" ");
 			fputs(name, stdout);
 			continue;
 		}
 		ifindex++;
 
 		if (argc > 0)
 			ifconfig(argc, argv, 0, afp);
 		else
 			status(afp, sdl, ifa);
 	}
 	if (namesonly)
 		printf("\n");
 	/* Freed through the pointer returned by getifaddrs(), not the sorted head. */
 	freeifaddrs(ifap);
 
 done:
 	freeformat();
 	exit(exit_code);
 }
 
 /*
  * Returns true if an interface should be listed because any of its groups
  * matches shell pattern "match" and none of its groups matches pattern
  * "nomatch".  If a pattern is NULL, the corresponding condition is skipped.
  *
  * Returns false when ifname is NULL or when the group query fails with
  * EINVAL or ENOTTY.  Any other socket, ioctl or allocation failure is
  * fatal.
  */
 static bool
 group_member(const char *ifname, const char *match, const char *nomatch)
 {
 	static int		 sock = -1;
 
 	struct ifgroupreq	 ifgr;
 	struct ifg_req		*ifg;
 	size_t			 len;
 	bool			 matched, nomatched;
 
 	/* Sanity checks. */
 	if (match == NULL && nomatch == NULL)
 		return (true);
 	if (ifname == NULL)
 		return (false);
 
 	memset(&ifgr, 0, sizeof(ifgr));
 	strlcpy(ifgr.ifgr_name, ifname, IFNAMSIZ);
 
 	/* The socket is opened once. Let _exit() close it. */
 	if (sock == -1) {
 		sock = socket(AF_LOCAL, SOCK_DGRAM, 0);
 		if (sock == -1)
 			err(1, "%s: socket(AF_LOCAL,SOCK_DGRAM)", __func__);
 	}
 
 	/* Determine amount of memory for the list of groups. */
 	if (ioctl(sock, SIOCGIFGROUP, (caddr_t)&ifgr) == -1) {
 		if (errno == EINVAL || errno == ENOTTY)
 			return (false);
 		err(1, "%s: SIOCGIFGROUP", __func__);
 	}
 
 	/* Obtain the list of groups. */
 	len = ifgr.ifgr_len;
 	ifgr.ifgr_groups = calloc(len / sizeof(*ifg), sizeof(*ifg));
 	if (ifgr.ifgr_groups == NULL)
 		errx(1, "%s: no memory", __func__);
 	if (ioctl(sock, SIOCGIFGROUP, (caddr_t)&ifgr) == -1)
 		err(1, "%s: SIOCGIFGROUP", __func__);
 
 	/*
 	 * Perform matching.  fnmatch() returns 0 on a match; compare against
 	 * 0 explicitly rather than relying on the bit pattern of its
 	 * non-zero return values.
 	 */
 	matched = false;
 	nomatched = true;
 	for (ifg = ifgr.ifgr_groups; len >= sizeof(*ifg); ifg++) {
 		len -= sizeof(*ifg);
 		if (match != NULL &&
 		    fnmatch(match, ifg->ifgrq_group, 0) == 0)
 			matched = true;
 		if (nomatch != NULL &&
 		    fnmatch(nomatch, ifg->ifgrq_group, 0) == 0)
 			nomatched = false;
 	}
 	free(ifgr.ifgr_groups);
 
 	if (match && !nomatch)
 		return (matched);
 	if (!match && nomatch)
 		return (nomatched);
 	return (matched && nomatched);
 }
 
 /* Head of the list of registered address families. */
 static struct afswtch *afs = NULL;
 
 /*
  * Register an address family by pushing it onto the head of the list.
  */
 void
 af_register(struct afswtch *p)
 {
 	struct afswtch *oldhead = afs;
 
 	afs = p;
 	p->af_next = oldhead;
 }
 
 static struct afswtch *
 af_getbyname(const char *name)
 {
 	struct afswtch *afp;
 
 	for (afp = afs; afp !=  NULL; afp = afp->af_next)
 		if (strcmp(afp->af_name, name) == 0)
 			return afp;
 	return NULL;
 }
 
 static struct afswtch *
 af_getbyfamily(int af)
 {
 	struct afswtch *afp;
 
 	for (afp = afs; afp != NULL; afp = afp->af_next)
 		if (afp->af_af == af)
 			return afp;
 	return NULL;
 }
 
 /*
  * Invoke the af_other_status handler of every registered address family
  * that has one.  A bitmap of handled family numbers prevents calling a
  * handler twice for the same family; AF_UNSPEC entries are never skipped.
  */
 static void
 af_other_status(int s)
 {
 	struct afswtch *afp;
 	uint8_t seen[howmany(AF_MAX, NBBY)];
 
 	memset(seen, 0, sizeof(seen));
 	for (afp = afs; afp != NULL; afp = afp->af_next) {
 		if (afp->af_other_status != NULL &&
 		    (afp->af_af == AF_UNSPEC || !isset(seen, afp->af_af))) {
 			afp->af_other_status(s);
 			setbit(seen, afp->af_af);
 		}
 	}
 }
 
 /*
  * Invoke the af_status_tunnel handler of every registered address family
  * that has one.  A bitmap of handled family numbers prevents calling a
  * handler twice for the same family; AF_UNSPEC entries are never skipped.
  */
 static void
 af_all_tunnel_status(int s)
 {
 	struct afswtch *afp;
 	uint8_t seen[howmany(AF_MAX, NBBY)];
 
 	memset(seen, 0, sizeof(seen));
 	for (afp = afs; afp != NULL; afp = afp->af_next) {
 		if (afp->af_status_tunnel != NULL &&
 		    (afp->af_af == AF_UNSPEC || !isset(seen, afp->af_af))) {
 			afp->af_status_tunnel(s);
 			setbit(seen, afp->af_af);
 		}
 	}
 }
 
 /* Head of the list of registered commands. */
 static struct cmd *cmds = NULL;
 
 /*
  * Register a command by pushing it onto the head of the list.
  */
 void
 cmd_register(struct cmd *p)
 {
 	struct cmd *oldhead = cmds;
 
 	cmds = p;
 	p->c_next = oldhead;
 }
 
 /*
  * Find a registered command by name.  When iscreate is non-zero only
  * clone operations (c_iscloneop set) qualify; otherwise only commands
  * that are not clone operations qualify.  Returns NULL if nothing fits.
  */
 static const struct cmd *
 cmd_lookup(const char *name, int iscreate)
 {
 	const struct cmd *p;
 
 	for (p = cmds; p != NULL; p = p->c_next) {
 		if (strcmp(name, p->c_name) != 0)
 			continue;
 		if ((iscreate != 0) == (p->c_iscloneop != 0))
 			return (p);
 	}
 	return (NULL);
 }
 
 /* One deferred callback and its argument, chained in a list. */
 struct callback {
 	callback_func *cb_func;
 	void	*cb_arg;
 	struct callback *cb_next;
 };
 static struct callback *callbacks = NULL;
 
 /*
  * Queue func(arg) on the head of the deferred callback list.  Exits if
  * the list node cannot be allocated.
  */
 void
 callback_register(callback_func *func, void *arg)
 {
 	struct callback *node;
 
 	node = malloc(sizeof(*node));
 	if (node == NULL)
 		errx(1, "unable to allocate memory for callback");
 
 	node->cb_next = callbacks;
 	node->cb_arg = arg;
 	node->cb_func = func;
 	callbacks = node;
 }
 
 /*
  * specially-handled commands
  *
  * These are not looked up by name: ifconfig() falls back to them when an
  * argument matches no registered command, using the address command for
  * the first such argument and the destination-address command afterwards.
  */
 static void setifaddr(const char *, int, int, const struct afswtch *);
 static const struct cmd setifaddr_cmd = DEF_CMD("ifaddr", 0, setifaddr);
 
 static void setifdstaddr(const char *, int, int, const struct afswtch *);
 static const struct cmd setifdstaddr_cmd =
 	DEF_CMD("ifdstaddr", 0, setifdstaddr);
 
 /*
  * Apply the command-line arguments in argv to the interface named by the
  * global "name".
  *
  * iscreate is non-zero when the interface is to be cloned first: clone
  * operations are then looked up, and the pending clone callback is run
  * as soon as an argument that is not a clone operation is met.  uafp is
  * the address family given by the user, or NULL to use the default
  * ("inet" if available, else "link").
  *
  * After all arguments are consumed, the address family post-processing
  * hook and the registered callbacks run, followed by the deferred
  * address removal and address addition ioctls.  Always returns 0.
  */
 static int
 ifconfig(int argc, char *const *argv, int iscreate, const struct afswtch *uafp)
 {
 	const struct afswtch *afp, *nafp;
 	const struct cmd *p;
 	struct callback *cb;
 	int s;
 
 	strlcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
 	afp = NULL;
 	if (uafp != NULL)
 		afp = uafp;
 	/*
 	 * This is the historical "accident" allowing users to configure IPv4
 	 * addresses without the "inet" keyword which while a nice feature has
 	 * proven to complicate other things.  We cannot remove this but only
 	 * make sure we will never have a similar implicit default for IPv6 or
 	 * any other address family.  We need a fallback though for
 	 * ifconfig IF up/down etc. to work without INET support as people
 	 * never used ifconfig IF link up/down, etc. either.
 	 */
 #ifndef RESCUE
 #ifdef INET
 	if (afp == NULL && feature_present("inet"))
 		afp = af_getbyname("inet");
 #endif
 #endif
 	if (afp == NULL)
 		afp = af_getbyname("link");
 	if (afp == NULL) {
 		warnx("Please specify an address_family.");
 		usage();
 	}
 top:
 	/* AF_LINK and AF_UNSPEC have no socket type of their own. */
 	ifr.ifr_addr.sa_family =
 		afp->af_af == AF_LINK || afp->af_af == AF_UNSPEC ?
 		AF_LOCAL : afp->af_af;
 
 	/*
 	 * Fall back to an AF_LOCAL socket only when the family was defaulted
 	 * (uafp == NULL) and the kernel reports EAFNOSUPPORT.
 	 */
 	if ((s = socket(ifr.ifr_addr.sa_family, SOCK_DGRAM, 0)) < 0 &&
 	    (uafp != NULL || errno != EAFNOSUPPORT ||
 	     (s = socket(AF_LOCAL, SOCK_DGRAM, 0)) < 0))
 		err(1, "socket(family %u,SOCK_DGRAM)", ifr.ifr_addr.sa_family);
 
 	while (argc > 0) {
 		p = cmd_lookup(*argv, iscreate);
 		if (iscreate && p == NULL) {
 			/*
 			 * Push the clone create callback so the new
 			 * device is created and can be used for any
 			 * remaining arguments.
 			 */
 			cb = callbacks;
 			if (cb == NULL)
 				errx(1, "internal error, no callback");
 			callbacks = cb->cb_next;
 			cb->cb_func(s, cb->cb_arg);
 			iscreate = 0;
 			/*
 			 * Handle any address family spec that
 			 * immediately follows and potentially
 			 * recreate the socket.
 			 */
 			nafp = af_getbyname(*argv);
 			if (nafp != NULL) {
 				argc--, argv++;
 				if (nafp != afp) {
 					close(s);
 					afp = nafp;
 					goto top;
 				}
 			}
 			/*
 			 * Look for a normal parameter.
 			 */
 			continue;
 		}
 		if (p == NULL) {
 			/*
 			 * Not a recognized command, choose between setting
 			 * the interface address and the dst address.
 			 */
 			p = (setaddr ? &setifdstaddr_cmd : &setifaddr_cmd);
 		}
 		/* Dispatch on how many following arguments the command takes. */
 		if (p->c_parameter == NEXTARG && p->c_u.c_func) {
 			if (argv[1] == NULL)
 				errx(1, "'%s' requires argument",
 				    p->c_name);
 			p->c_u.c_func(argv[1], 0, s, afp);
 			argc--, argv++;
 		} else if (p->c_parameter == OPTARG && p->c_u.c_func) {
 			p->c_u.c_func(argv[1], 0, s, afp);
 			if (argv[1] != NULL)
 				argc--, argv++;
 		} else if (p->c_parameter == NEXTARG2 && p->c_u.c_func2) {
 			if (argc < 3)
 				errx(1, "'%s' requires 2 arguments",
 				    p->c_name);
 			p->c_u.c_func2(argv[1], argv[2], s, afp);
 			argc -= 2, argv += 2;
 		} else if (p->c_u.c_func)
 			p->c_u.c_func(*argv, p->c_parameter, s, afp);
 		argc--, argv++;
 	}
 
 	/*
 	 * Do any post argument processing required by the address family.
 	 */
 	if (afp->af_postproc != NULL)
 		afp->af_postproc(s, afp);
 	/*
 	 * Do deferred callbacks registered while processing
 	 * command-line arguments.
 	 */
 	for (cb = callbacks; cb != NULL; cb = cb->cb_next)
 		cb->cb_func(s, cb->cb_arg);
 	/*
 	 * Do deferred operations.
 	 */
 	if (clearaddr) {
 		if (afp->af_ridreq == NULL || afp->af_difaddr == 0) {
 			warnx("interface %s cannot change %s addresses!",
 			      name, afp->af_name);
 			clearaddr = 0;
 		}
 	}
 	if (clearaddr) {
 		int ret;
 		strlcpy(((struct ifreq *)afp->af_ridreq)->ifr_name, name,
 			sizeof ifr.ifr_name);
 		ret = ioctl(s, afp->af_difaddr, afp->af_ridreq);
 		if (ret < 0) {
 			if (errno == EADDRNOTAVAIL && (doalias >= 0)) {
 				/* means no previous address for interface */
 			} else
 				Perror("ioctl (SIOCDIFADDR)");
 		}
 	}
 	if (newaddr) {
 		if (afp->af_addreq == NULL || afp->af_aifaddr == 0) {
 			warnx("interface %s cannot change %s addresses!",
 			      name, afp->af_name);
 			newaddr = 0;
 		}
 	}
 	/* Only add an address when one (or a netmask) was actually given. */
 	if (newaddr && (setaddr || setmask)) {
 		strlcpy(((struct ifreq *)afp->af_addreq)->ifr_name, name,
 			sizeof ifr.ifr_name);
 		if (ioctl(s, afp->af_aifaddr, afp->af_addreq) < 0)
 			Perror("ioctl (SIOCAIFADDR)");
 	}
 
 	close(s);
 	return(0);
 }
 
 /*
  * Record an interface address argument.  Does nothing when the address
  * family has no af_getaddr parser.
  */
 /*ARGSUSED*/
 static void
 setifaddr(const char *addr, int param, int s, const struct afswtch *afp)
 {
 	int which;
 
 	if (afp->af_getaddr != NULL) {
 		/*
 		 * Delay the ioctl to set the interface addr until flags are
 		 * all set.  The address interpretation may depend on the
 		 * flags, and the flags may change when the address is set.
 		 */
 		setaddr++;
 		if (doalias == 0 && afp->af_af != AF_LINK)
 			clearaddr = 1;
 		which = (doalias >= 0) ? ADDR : RIDADDR;
 		afp->af_getaddr(addr, which);
 	}
 }
 
 /*
  * Configure tunnel endpoints.  Resolves src and dst with getaddrinfo()
  * and hands both results to the address family's af_settunnel hook.
  *
  * Warns and returns if the address family has no af_settunnel hook.
  * Exits if either address cannot be resolved or if the two addresses
  * belong to different address families.
  */
 static void
 settunnel(const char *src, const char *dst, int s, const struct afswtch *afp)
 {
 	struct addrinfo *srcres, *dstres;
 	int ecode;
 
 	if (afp->af_settunnel == NULL) {
 		/* Not an errno condition, so warnx() rather than warn(). */
 		warnx("address family %s does not support tunnel setup",
 			afp->af_name);
 		return;
 	}
 
 	if ((ecode = getaddrinfo(src, NULL, NULL, &srcres)) != 0)
 		errx(1, "error in parsing address string: %s",
 		    gai_strerror(ecode));
 
 	if ((ecode = getaddrinfo(dst, NULL, NULL, &dstres)) != 0)
 		errx(1, "error in parsing address string: %s",
 		    gai_strerror(ecode));
 
 	if (srcres->ai_addr->sa_family != dstres->ai_addr->sa_family)
 		errx(1,
 		    "source and destination address families do not match");
 
 	afp->af_settunnel(s, srcres, dstres);
 
 	freeaddrinfo(srcres);
 	freeaddrinfo(dstres);
 }
 
 /*
  * Issue SIOCDIFPHYADDR on the interface named in the global ifr.
  * Exits on failure.
  */
 /* ARGSUSED */
 static void
 deletetunnel(const char *vname, int param, int s, const struct afswtch *afp)
 {
 	int rc;
 
 	rc = ioctl(s, SIOCDIFPHYADDR, &ifr);
 	if (rc < 0)
 		err(1, "SIOCDIFPHYADDR");
 }
 
 #ifdef JAIL
 /*
  * Issue SIOCSIFVNET for the current interface, with the jail id that
  * jail_getid() returns for jname.  Exits if the jail cannot be
  * resolved or the ioctl fails.
  */
 static void
 setifvnet(const char *jname, int dummy __unused, int s,
     const struct afswtch *afp)
 {
 	struct ifreq my_ifr;
 	int jid;
 
 	jid = jail_getid(jname);
 	if (jid < 0)
 		errx(1, "%s", jail_errmsg);
 
 	/* Work on a private copy so the global request stays untouched. */
 	memcpy(&my_ifr, &ifr, sizeof(my_ifr));
 	my_ifr.ifr_jid = jid;
 	if (ioctl(s, SIOCSIFVNET, &my_ifr) < 0)
 		err(1, "SIOCSIFVNET");
 }
 
 /*
  * Issue SIOCSIFRVNET for the interface with the jail named (or
  * numbered) by jname as target.  Works on a private copy of the global
  * ifr.  Exits on failure, reporting the jail id and interface name.
  */
 static void
 setifrvnet(const char *jname, int dummy __unused, int s,
     const struct afswtch *afp)
 {
 	struct ifreq req;
 	int jid;
 
 	jid = jail_getid(jname);
 	memcpy(&req, &ifr, sizeof(req));
 	req.ifr_jid = jid;
 	if (jid < 0)
 		errx(1, "%s", jail_errmsg);
 	if (ioctl(s, SIOCSIFRVNET, &req) < 0)
 		err(1, "SIOCSIFRVNET(%d, %s)", req.ifr_jid, req.ifr_name);
 }
 #endif
 
 /*
  * Record a netmask given on the command line: bump setmask and let the
  * family's af_getaddr hook parse addr into the MASK slot.  Families
  * without an address parser are ignored.
  */
 static void
 setifnetmask(const char *addr, int dummy __unused, int s,
     const struct afswtch *afp)
 {
 	if (afp->af_getaddr == NULL)
 		return;
 
 	setmask++;
 	afp->af_getaddr(addr, MASK);
 }
 
 /*
  * Record a broadcast address: parsed by the family's af_getaddr hook
  * into the DSTADDR slot.  Families without an address parser are
  * ignored.
  */
 static void
 setifbroadaddr(const char *addr, int dummy __unused, int s,
     const struct afswtch *afp)
 {
 	if (afp->af_getaddr == NULL)
 		return;
 
 	afp->af_getaddr(addr, DSTADDR);
 }
 
 static void
 notealias(const char *addr, int param, int s, const struct afswtch *afp)
 {
 #define rqtosa(x) (&(((struct ifreq *)(afp->x))->ifr_addr))
 	if (setaddr && doalias == 0 && param < 0)
 		if (afp->af_addreq != NULL && afp->af_ridreq != NULL)
 			bcopy((caddr_t)rqtosa(af_addreq),
 			      (caddr_t)rqtosa(af_ridreq),
 			      rqtosa(af_addreq)->sa_len);
 	doalias = param;
 	if (param < 0) {
 		clearaddr = 1;
 		newaddr = 0;
 	} else
 		clearaddr = 0;
 #undef rqtosa
 }
 
 /*ARGSUSED*/
 /*
  * Record a destination address: parsed by the family's af_getaddr hook
  * into the DSTADDR slot.  Families without an address parser are
  * ignored.
  */
 static void
 setifdstaddr(const char *addr, int param __unused, int s,
     const struct afswtch *afp)
 {
 	if (afp->af_getaddr == NULL)
 		return;
 
 	afp->af_getaddr(addr, DSTADDR);
 }
 
 /*
  * Return the flags of interface ifname as a single int: ifr_flags in
  * the low 16 bits and ifr_flagshigh shifted into the bits above.
  *
  * If us is negative a temporary AF_LOCAL datagram socket is opened for
  * the query and closed again; otherwise us is used as the socket.  A
  * private ifreq is used so the global one is not disturbed.  The
  * program exits if the socket cannot be created or the ioctl fails.
  */
 static int
 getifflags(const char *ifname, int us)
 {
 	struct ifreq my_ifr;
 	int s;
 
 	memset(&my_ifr, 0, sizeof(my_ifr));
 	(void) strlcpy(my_ifr.ifr_name, ifname, sizeof(my_ifr.ifr_name));
 	if (us < 0) {
 		if ((s = socket(AF_LOCAL, SOCK_DGRAM, 0)) < 0)
 			err(1, "socket(family AF_LOCAL,SOCK_DGRAM)");
 	} else
 		s = us;
 	if (ioctl(s, SIOCGIFFLAGS, (caddr_t)&my_ifr) < 0) {
 		Perror("ioctl (SIOCGIFFLAGS)");
 		exit(1);
 	}
 	/* Only close the socket if it was opened here. */
 	if (us < 0)
 		close(s);
 	return ((my_ifr.ifr_flags & 0xffff) | (my_ifr.ifr_flagshigh << 16));
 }
 
 /*
  * Note: doing an SIOCIGIFFLAGS scribbles on the union portion
  * of the ifreq structure, which may confuse other parts of ifconfig.
  * Make a private copy so we can avoid that.
  */
 static void
 setifflags(const char *vname, int value, int s, const struct afswtch *afp)
 {
 	struct ifreq		my_ifr;
 	int flags;
 
 	flags = getifflags(name, s);
 	if (value < 0) {
 		value = -value;
 		flags &= ~value;
 	} else
 		flags |= value;
 	memset(&my_ifr, 0, sizeof(my_ifr));
 	(void) strlcpy(my_ifr.ifr_name, name, sizeof(my_ifr.ifr_name));
 	my_ifr.ifr_flags = flags & 0xffff;
 	my_ifr.ifr_flagshigh = flags >> 16;
 	if (ioctl(s, SIOCSIFFLAGS, (caddr_t)&my_ifr) < 0)
 		Perror(vname);
 }
 
 /*
  * Enable (value > 0) or disable (value < 0, using -value as the mask)
  * interface capability bits.
  *
  * The current state is fetched with SIOCGIFCAP into the global ifr;
  * the new set is derived from ifr_curcap, restricted to the bits
  * present in the returned ifr_reqcap, and written back with
  * SIOCSIFCAP.  vname is only used to label an error.
  */
 void
 setifcap(const char *vname, int value, int s, const struct afswtch *afp)
 {
 	int wanted;
 
 	if (ioctl(s, SIOCGIFCAP, (caddr_t)&ifr) < 0) {
 		Perror("ioctl (SIOCGIFCAP)");
 		exit(1);
 	}
 
 	wanted = ifr.ifr_curcap;
 	if (value >= 0)
 		wanted |= value;
 	else
 		wanted &= ~(-value);
 
 	/* Never request a bit the GET call did not report in ifr_reqcap. */
 	ifr.ifr_reqcap = wanted & ifr.ifr_reqcap;
 	if (ioctl(s, SIOCSIFCAP, (caddr_t)&ifr) < 0)
 		Perror(vname);
 }
 
 /*
  * Set the interface metric to the integer parsed from val with atoi()
  * via SIOCSIFMETRIC; exits on failure.
  */
 static void
 setifmetric(const char *val, int dummy __unused, int s,
     const struct afswtch *afp)
 {
 	int metric;
 
 	metric = atoi(val);
 	strlcpy(ifr.ifr_name, name, sizeof (ifr.ifr_name));
 	ifr.ifr_metric = metric;
 	if (ioctl(s, SIOCSIFMETRIC, (caddr_t)&ifr) < 0)
 		err(1, "ioctl SIOCSIFMETRIC (set metric)");
 }
 
 /*
  * Set the interface MTU to the integer parsed from val with atoi() via
  * SIOCSIFMTU; exits on failure.
  */
 static void
 setifmtu(const char *val, int dummy __unused, int s,
     const struct afswtch *afp)
 {
 	int mtu;
 
 	mtu = atoi(val);
 	strlcpy(ifr.ifr_name, name, sizeof (ifr.ifr_name));
 	ifr.ifr_mtu = mtu;
 	if (ioctl(s, SIOCSIFMTU, (caddr_t)&ifr) < 0)
 		err(1, "ioctl SIOCSIFMTU (set mtu)");
 }
 
 /*
  * Set the interface's priority code point with SIOCSLANPCP.
  *
  * val must be a complete number (any base accepted by strtoul() with
  * base 0) in the range 0..7; anything else terminates the program.
  * The request is issued through the global ifr.
  */
 static void
 setifpcp(const char *val, int arg __unused, int s, const struct afswtch *afp)
 {
 	u_long ul;
 	char *endp;
 
 	ul = strtoul(val, &endp, 0);
 	/*
 	 * endp == val means no digits were consumed (e.g. an empty
 	 * argument); without this check such input would be taken as 0.
 	 */
 	if (endp == val || *endp != '\0')
 		errx(1, "invalid value for pcp");
 	if (ul > 7)
 		errx(1, "value for pcp out of range");
 	ifr.ifr_lan_pcp = ul;
 	if (ioctl(s, SIOCSLANPCP, (caddr_t)&ifr) == -1)
 		err(1, "SIOCSLANPCP");
 }
 
 /*
  * Store IFNET_PCP_NONE as the interface's priority code point with
  * SIOCSLANPCP (issued through the global ifr); exits on failure.
  */
 static void
 disableifpcp(const char *val, int arg __unused, int s,
     const struct afswtch *afp)
 {
 	int rc;
 
 	ifr.ifr_lan_pcp = IFNET_PCP_NONE;
 	rc = ioctl(s, SIOCSLANPCP, (caddr_t)&ifr);
 	if (rc == -1)
 		err(1, "SIOCSLANPCP");
 }
 
 /*
  * Rename the interface to val with SIOCSIFNAME.
  *
  * The new name is passed to the kernel through a heap copy (ifr_data
  * takes a writable pointer).  On success the global name is updated
  * and printifname is set; on failure the program exits.
  */
 static void
 setifname(const char *val, int dummy __unused, int s,
     const struct afswtch *afp)
 {
 	char *copy;
 
 	strlcpy(ifr.ifr_name, name, sizeof(ifr.ifr_name));
 
 	if ((copy = strdup(val)) == NULL)
 		err(1, "no memory to set ifname");
 
 	ifr.ifr_data = copy;
 	if (ioctl(s, SIOCSIFNAME, (caddr_t)&ifr) >= 0) {
 		printifname = 1;
 		strlcpy(name, copy, sizeof(name));
 		free(copy);
 		return;
 	}
 
 	free(copy);
 	err(1, "ioctl SIOCSIFNAME (set name)");
 }
 
 /* ARGSUSED */
 /*
  * Set the interface description to val with SIOCSIFDESCR.
  *
  * An empty val is sent as a NULL buffer of length 0; otherwise a heap
  * copy of val is sent together with its length including the
  * terminating NUL.  An allocation failure only warns and returns; a
  * failing ioctl terminates the program.
  */
 static void
 setifdescr(const char *val, int dummy __unused, int s,
     const struct afswtch *afp)
 {
 	char *copy = NULL;
 	size_t len;
 
 	strlcpy(ifr.ifr_name, name, sizeof(ifr.ifr_name));
 
 	len = strlen(val);
 	if (len == 0) {
 		ifr.ifr_buffer.buffer = NULL;
 		ifr.ifr_buffer.length = 0;
 	} else {
 		ifr.ifr_buffer.length = len + 1;
 		copy = strdup(val);
 		ifr.ifr_buffer.buffer = copy;
 		if (copy == NULL) {
 			warn("no memory to set ifdescr");
 			return;
 		}
 	}
 
 	if (ioctl(s, SIOCSIFDESCR, (caddr_t)&ifr) < 0)
 		err(1, "ioctl SIOCSIFDESCR (set descr)");
 
 	free(copy);
 }
 
 /* ARGSUSED */
 /*
  * Clear the interface description by setting it to the empty string.
  */
 static void
 unsetifdescr(const char *val, int value, int s, const struct afswtch *afp)
 {
 	static const char empty[] = "";
 
 	setifdescr(empty, 0, s, NULL);
 }
 
 /*
  * Bit-name table for interface flags, in the format consumed by
  * printb(): the first byte selects the numeric base (8 prints octal,
  * anything else hexadecimal), followed by entries made of one byte
  * holding a 1-based bit number and the name of that bit.
  */
 #define	IFFBITS \
 "\020\1UP\2BROADCAST\3DEBUG\4LOOPBACK\5POINTOPOINT\7RUNNING" \
 "\10NOARP\11PROMISC\12ALLMULTI\13OACTIVE\14SIMPLEX\15LINK0\16LINK1\17LINK2" \
 "\20MULTICAST\22PPROMISC\23MONITOR\24STATICARP"
 
 /*
  * Bit-name table for interface capabilities, in the format consumed by
  * printb(): a leading base byte followed by entries made of one byte
  * holding a 1-based bit number and the name of that bit.  Bit numbers
  * are written in octal, so \40 is bit 32.
  */
 #define	IFCAPBITS \
 "\020\1RXCSUM\2TXCSUM\3NETCONS\4VLAN_MTU\5VLAN_HWTAGGING\6JUMBO_MTU\7POLLING" \
 "\10VLAN_HWCSUM\11TSO4\12TSO6\13LRO\14WOL_UCAST\15WOL_MCAST\16WOL_MAGIC" \
 "\17TOE4\20TOE6\21VLAN_HWFILTER\23VLAN_HWTSO\24LINKSTATE\25NETMAP" \
 "\26RXCSUM_IPV6\27TXCSUM_IPV6\31TXRTLMT\32HWRXTSTMP\33NOMAP\34TXTLS4\35TXTLS6" \
-"\36VXLAN_HWCSUM\37VXLAN_HWTSO"
+"\36VXLAN_HWCSUM\37VXLAN_HWTSO\40TXTLS_RTLMT"
 
 /*
  * Print the status of the interface.  If an address family was
  * specified, show only it; otherwise, show them all.
  *
  * afp is the selected address family, or NULL for all families.  ifa
  * is the interface's entry in the ifaddrs list; entries that follow it
  * and carry the same interface name are printed as its addresses.  sdl
  * is only referenced by disabled code.
  *
  * A datagram socket of the selected family (AF_LOCAL for "all" and for
  * AF_LINK) is opened for the ioctl queries and closed before returning;
  * failing to create it terminates the program.
  */
 static void
 status(const struct afswtch *afp, const struct sockaddr_dl *sdl,
 	struct ifaddrs *ifa)
 {
 	struct ifaddrs *ift;
 	int allfamilies, s;
 	struct ifstat ifs;
 
 	if (afp == NULL) {
 		allfamilies = 1;
 		ifr.ifr_addr.sa_family = AF_LOCAL;
 	} else {
 		allfamilies = 0;
 		ifr.ifr_addr.sa_family =
 		    afp->af_af == AF_LINK ? AF_LOCAL : afp->af_af;
 	}
 	strlcpy(ifr.ifr_name, name, sizeof(ifr.ifr_name));
 
 	s = socket(ifr.ifr_addr.sa_family, SOCK_DGRAM, 0);
 	if (s < 0)
 		err(1, "socket(family %u,SOCK_DGRAM)", ifr.ifr_addr.sa_family);
 
 	/* First line: name, flags and, where available, metric and MTU. */
 	printf("%s: ", name);
 	printb("flags", ifa->ifa_flags, IFFBITS);
 	if (ioctl(s, SIOCGIFMETRIC, &ifr) != -1)
 		printf(" metric %d", ifr.ifr_metric);
 	if (ioctl(s, SIOCGIFMTU, &ifr) != -1)
 		printf(" mtu %d", ifr.ifr_mtu);
 	putchar('\n');
 
 	/*
 	 * Fetch the description into the shared descr buffer.  When the
 	 * ioctl reports a larger required length, grow the buffer and
 	 * retry; in every other case the loop runs once.
 	 */
 	for (;;) {
 		if ((descr = reallocf(descr, descrlen)) != NULL) {
 			ifr.ifr_buffer.buffer = descr;
 			ifr.ifr_buffer.length = descrlen;
 			if (ioctl(s, SIOCGIFDESCR, &ifr) == 0) {
 				if (ifr.ifr_buffer.buffer == descr) {
 					if (strlen(descr) > 0)
 						printf("\tdescription: %s\n",
 						    descr);
 				} else if (ifr.ifr_buffer.length > descrlen) {
 					descrlen = ifr.ifr_buffer.length;
 					continue;
 				}
 			}
 		} else
 			warn("unable to allocate memory for interface "
 			    "description");
 		break;
 	}
 
 	/* Enabled capabilities; the full set only when supmedia is set. */
 	if (ioctl(s, SIOCGIFCAP, (caddr_t)&ifr) == 0) {
 		if (ifr.ifr_curcap != 0) {
 			printb("\toptions", ifr.ifr_curcap, IFCAPBITS);
 			putchar('\n');
 		}
 		if (supmedia && ifr.ifr_reqcap != 0) {
 			printb("\tcapabilities", ifr.ifr_reqcap, IFCAPBITS);
 			putchar('\n');
 		}
 	}
 
 	tunnel_status(s);
 
 	/* Per-address status for every entry of this interface. */
 	for (ift = ifa; ift != NULL; ift = ift->ifa_next) {
 		if (ift->ifa_addr == NULL)
 			continue;
 		if (strcmp(ifa->ifa_name, ift->ifa_name) != 0)
 			continue;
 		if (allfamilies) {
 			const struct afswtch *p;
 			p = af_getbyfamily(ift->ifa_addr->sa_family);
 			if (p != NULL && p->af_status != NULL)
 				p->af_status(s, ift);
 		} else if (afp->af_af == ift->ifa_addr->sa_family)
 			afp->af_status(s, ift);
 	}
 #if 0
 	if (allfamilies || afp->af_af == AF_LINK) {
 		const struct afswtch *lafp;
 
 		/*
 		 * Hack; the link level address is received separately
 		 * from the routing information so any address is not
 		 * handled above.  Cobble together an entry and invoke
 		 * the status method specially.
 		 */
 		lafp = af_getbyname("lladdr");
 		if (lafp != NULL) {
 			info.rti_info[RTAX_IFA] = (struct sockaddr *)sdl;
 			lafp->af_status(s, &info);
 		}
 	}
 #endif
 	if (allfamilies)
 		af_other_status(s);
 	else if (afp->af_other_status != NULL)
 		afp->af_other_status(s);
 
 	strlcpy(ifs.ifs_name, name, sizeof ifs.ifs_name);
 	if (ioctl(s, SIOCGIFSTATUS, &ifs) == 0) 
 		printf("%s", ifs.ascii);
 
 	if (verbose > 0)
 		sfp_status(s, &ifr, verbose);
 
 	close(s);
 	return;
 }
 
 /*
  * Print tunnel information for the interface by delegating to
  * af_all_tunnel_status() with the query socket s.
  */
 static void
 tunnel_status(int s)
 {
 	af_all_tunnel_status(s);
 }
 
 /*
  * Report the failure described by errno, labelled with cmd, and exit
  * with status 1.  ENXIO and EPERM get fixed wording; every other value
  * is reported through err().  This function does not return.
  */
 void
 Perror(const char *cmd)
 {
 	if (errno == ENXIO)
 		errx(1, "%s: no such interface", cmd);
 	else if (errno == EPERM)
 		errx(1, "%s: permission denied", cmd);
 	else
 		err(1, "%s", cmd);
 }
 
 /*
  * Print a value a la the %b format of the kernel's printf
  *
  * Output is "<s>=<v>" followed, when bits is non-NULL, by the names of
  * the set bits as a comma-separated list in angle brackets.  The first
  * byte of bits selects the base used for v (8 prints octal, anything
  * else hexadecimal).  Each following entry is one byte holding a
  * 1-based bit number and then the bit's name; a name ends at the first
  * byte whose value is 32 or less.
  */
 void
 printb(const char *s, unsigned v, const char *bits)
 {
 	int i, any = 0;
 	char c;
 
 	if (bits && *bits == 8)
 		printf("%s=%o", s, v);
 	else
 		printf("%s=%x", s, v);
 	if (bits) {
 		bits++;
 		putchar('<');
 		while ((i = *bits++) != '\0') {
 			/*
 			 * Shift an unsigned 1: bit number 32 would move a
 			 * signed 1 into the sign bit, which is undefined.
 			 */
 			if (v & (1U << (i - 1))) {
 				if (any)
 					putchar(',');
 				any = 1;
 				for (; (c = *bits) > 32; bits++)
 					putchar(c);
 			} else
 				for (; *bits > 32; bits++)
 					;
 		}
 		putchar('>');
 	}
 }
 
 void
 print_vhid(const struct ifaddrs *ifa, const char *s)
 {
 	struct if_data *ifd;
 
 	if (ifa->ifa_data == NULL)
 		return;
 
 	ifd = ifa->ifa_data;
 	if (ifd->ifi_vhid == 0)
 		return;
 	
 	printf(" vhid %d", ifd->ifi_vhid);
 }
 
 /*
  * Try to load the kernel module that provides interface "name", unless
  * loading was suppressed (noload) or the module is already present.
  *
  * The driver name is "name" cut at its first digit.  The module to
  * load is taken from module_map when the driver name is listed there,
  * otherwise it is guessed as "if_<driver>".  All modules of all loaded
  * kernel files are scanned first; the function returns without loading
  * when a matching module is found.  A failing kldload() is ignored.
  */
 void
 ifmaybeload(const char *name)
 {
 #define MOD_PREFIX_LEN		3	/* "if_" */
 	struct module_stat mstat;
 	int fileid, modid;
 	size_t i;
 	char ifkind[IFNAMSIZ + MOD_PREFIX_LEN], ifname[IFNAMSIZ], *dp;
 	const char *cp;
 	struct module_map_entry *mme;
 	bool found;
 
 	/* loading suppressed by the user */
 	if (noload)
 		return;
 
 	/* trim the interface number off the end */
 	strlcpy(ifname, name, sizeof(ifname));
 	for (dp = ifname; *dp != 0; dp++)
 		/* <ctype.h> functions need an unsigned char value. */
 		if (isdigit((unsigned char)*dp)) {
 			*dp = 0;
 			break;
 		}
 
 	/* Either derive it from the map or guess otherwise */
 	*ifkind = '\0';
 	found = false;
 	for (i = 0; i < nitems(module_map); ++i) {
 		mme = &module_map[i];
 		if (strcmp(mme->ifname, ifname) == 0) {
 			strlcpy(ifkind, mme->kldname, sizeof(ifkind));
 			found = true;
 			break;
 		}
 	}
 
 	/* We didn't have an alias for it... we'll guess. */
 	if (!found) {
 	    /* turn interface and unit into module name */
 	    strlcpy(ifkind, "if_", sizeof(ifkind));
 	    strlcat(ifkind, ifname, sizeof(ifkind));
 	}
 
 	/* scan files in kernel */
 	mstat.version = sizeof(struct module_stat);
 	for (fileid = kldnext(0); fileid > 0; fileid = kldnext(fileid)) {
 		/* scan modules in file */
 		for (modid = kldfirstmod(fileid); modid > 0;
 		     modid = modfnext(modid)) {
 			if (modstat(modid, &mstat) < 0)
 				continue;
 			/* strip bus name if present */
 			if ((cp = strchr(mstat.name, '/')) != NULL) {
 				cp++;
 			} else {
 				cp = mstat.name;
 			}
 			/*
 			 * Is it already loaded?  Don't compare with ifname if
 			 * we were specifically told which kld to use.  Doing
 			 * so could lead to conflicts not trivially solved.
 			 */
 			if ((!found && strcmp(ifname, cp) == 0) ||
 			    strcmp(ifkind, cp) == 0)
 				return;
 		}
 	}
 
 	/*
 	 * Try to load the module.  But ignore failures, because ifconfig can't
 	 * infer the names of all drivers (eg mlx4en(4)).
 	 */
 	(void) kldload(ifkind);
 }
 
 /*
  * Table of the basic ifconfig keywords; every entry is registered by
  * ifconfig_ctor().  DEF_CMD entries pair a keyword with an integer and
  * a handler; for setifflags/setifcap a negative integer means "clear
  * these bits".  DEF_CMD_ARG/DEF_CMD_ARG2 entries name handlers that
  * take the following command-line word(s) as arguments (presumably;
  * the macros are defined elsewhere).
  */
 static struct cmd basic_cmds[] = {
 	DEF_CMD("up",		IFF_UP,		setifflags),
 	DEF_CMD("down",		-IFF_UP,	setifflags),
 	DEF_CMD("arp",		-IFF_NOARP,	setifflags),
 	DEF_CMD("-arp",		IFF_NOARP,	setifflags),
 	DEF_CMD("debug",	IFF_DEBUG,	setifflags),
 	DEF_CMD("-debug",	-IFF_DEBUG,	setifflags),
 	DEF_CMD_ARG("description",		setifdescr),
 	DEF_CMD_ARG("descr",			setifdescr),
 	DEF_CMD("-description",	0,		unsetifdescr),
 	DEF_CMD("-descr",	0,		unsetifdescr),
 	DEF_CMD("promisc",	IFF_PPROMISC,	setifflags),
 	DEF_CMD("-promisc",	-IFF_PPROMISC,	setifflags),
 	DEF_CMD("add",		IFF_UP,		notealias),
 	DEF_CMD("alias",	IFF_UP,		notealias),
 	DEF_CMD("-alias",	-IFF_UP,	notealias),
 	DEF_CMD("delete",	-IFF_UP,	notealias),
 	DEF_CMD("remove",	-IFF_UP,	notealias),
 #ifdef notdef
 #define	EN_SWABIPS	0x1000
 	DEF_CMD("swabips",	EN_SWABIPS,	setifflags),
 	DEF_CMD("-swabips",	-EN_SWABIPS,	setifflags),
 #endif
 	DEF_CMD_ARG("netmask",			setifnetmask),
 	DEF_CMD_ARG("metric",			setifmetric),
 	DEF_CMD_ARG("broadcast",		setifbroadaddr),
 	DEF_CMD_ARG2("tunnel",			settunnel),
 	DEF_CMD("-tunnel", 0,			deletetunnel),
 	DEF_CMD("deletetunnel", 0,		deletetunnel),
 #ifdef JAIL
 	DEF_CMD_ARG("vnet",			setifvnet),
 	DEF_CMD_ARG("-vnet",			setifrvnet),
 #endif
 	DEF_CMD("link0",	IFF_LINK0,	setifflags),
 	DEF_CMD("-link0",	-IFF_LINK0,	setifflags),
 	DEF_CMD("link1",	IFF_LINK1,	setifflags),
 	DEF_CMD("-link1",	-IFF_LINK1,	setifflags),
 	DEF_CMD("link2",	IFF_LINK2,	setifflags),
 	DEF_CMD("-link2",	-IFF_LINK2,	setifflags),
 	DEF_CMD("monitor",	IFF_MONITOR,	setifflags),
 	DEF_CMD("-monitor",	-IFF_MONITOR,	setifflags),
 	DEF_CMD("nomap",	IFCAP_NOMAP,	setifcap),
 	DEF_CMD("-nomap",	-IFCAP_NOMAP,	setifcap),
 	DEF_CMD("staticarp",	IFF_STATICARP,	setifflags),
 	DEF_CMD("-staticarp",	-IFF_STATICARP,	setifflags),
 	DEF_CMD("rxcsum6",	IFCAP_RXCSUM_IPV6,	setifcap),
 	DEF_CMD("-rxcsum6",	-IFCAP_RXCSUM_IPV6,	setifcap),
 	DEF_CMD("txcsum6",	IFCAP_TXCSUM_IPV6,	setifcap),
 	DEF_CMD("-txcsum6",	-IFCAP_TXCSUM_IPV6,	setifcap),
 	DEF_CMD("rxcsum",	IFCAP_RXCSUM,	setifcap),
 	DEF_CMD("-rxcsum",	-IFCAP_RXCSUM,	setifcap),
 	DEF_CMD("txcsum",	IFCAP_TXCSUM,	setifcap),
 	DEF_CMD("-txcsum",	-IFCAP_TXCSUM,	setifcap),
 	DEF_CMD("netcons",	IFCAP_NETCONS,	setifcap),
 	DEF_CMD("-netcons",	-IFCAP_NETCONS,	setifcap),
 	DEF_CMD_ARG("pcp",			setifpcp),
 	DEF_CMD("-pcp", 0,			disableifpcp),
 	DEF_CMD("polling",	IFCAP_POLLING,	setifcap),
 	DEF_CMD("-polling",	-IFCAP_POLLING,	setifcap),
 	DEF_CMD("tso6",		IFCAP_TSO6,	setifcap),
 	DEF_CMD("-tso6",	-IFCAP_TSO6,	setifcap),
 	DEF_CMD("tso4",		IFCAP_TSO4,	setifcap),
 	DEF_CMD("-tso4",	-IFCAP_TSO4,	setifcap),
 	DEF_CMD("tso",		IFCAP_TSO,	setifcap),
 	DEF_CMD("-tso",		-IFCAP_TSO,	setifcap),
 	DEF_CMD("toe",		IFCAP_TOE,	setifcap),
 	DEF_CMD("-toe",		-IFCAP_TOE,	setifcap),
 	DEF_CMD("lro",		IFCAP_LRO,	setifcap),
 	DEF_CMD("-lro",		-IFCAP_LRO,	setifcap),
 	DEF_CMD("txtls",	IFCAP_TXTLS,	setifcap),
 	DEF_CMD("-txtls",	-IFCAP_TXTLS,	setifcap),
 	DEF_CMD("wol",		IFCAP_WOL,	setifcap),
 	DEF_CMD("-wol",		-IFCAP_WOL,	setifcap),
 	DEF_CMD("wol_ucast",	IFCAP_WOL_UCAST,	setifcap),
 	DEF_CMD("-wol_ucast",	-IFCAP_WOL_UCAST,	setifcap),
 	DEF_CMD("wol_mcast",	IFCAP_WOL_MCAST,	setifcap),
 	DEF_CMD("-wol_mcast",	-IFCAP_WOL_MCAST,	setifcap),
 	DEF_CMD("wol_magic",	IFCAP_WOL_MAGIC,	setifcap),
 	DEF_CMD("-wol_magic",	-IFCAP_WOL_MAGIC,	setifcap),
 	DEF_CMD("txrtlmt",	IFCAP_TXRTLMT,	setifcap),
 	DEF_CMD("-txrtlmt",	-IFCAP_TXRTLMT,	setifcap),
+	DEF_CMD("txtlsrtlmt",	IFCAP_TXTLS_RTLMT,	setifcap),
+	DEF_CMD("-txtlsrtlmt",	-IFCAP_TXTLS_RTLMT,	setifcap),
 	DEF_CMD("hwrxtstmp",	IFCAP_HWRXTSTMP,	setifcap),
 	DEF_CMD("-hwrxtstmp",	-IFCAP_HWRXTSTMP,	setifcap),
 	DEF_CMD("normal",	-IFF_LINK0,	setifflags),
 	DEF_CMD("compress",	IFF_LINK0,	setifflags),
 	DEF_CMD("noicmp",	IFF_LINK1,	setifflags),
 	DEF_CMD_ARG("mtu",			setifmtu),
 	DEF_CMD_ARG("name",			setifname),
 };
 
 static __constructor void
 ifconfig_ctor(void)
 {
 	size_t i;
 
 	for (i = 0; i < nitems(basic_cmds);  i++)
 		cmd_register(&basic_cmds[i]);
 }
diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_main.c b/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
index bb899f053468..39f03b999110 100644
--- a/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
+++ b/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
@@ -1,4737 +1,4743 @@
 /*-
  * Copyright (c) 2015-2018 Mellanox Technologies. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include "opt_kern_tls.h"
 
 #include "en.h"
 
 #include <sys/eventhandler.h>
 #include <sys/sockio.h>
 #include <machine/atomic.h>
 
 #include <net/debugnet.h>
 
 #ifndef ETH_DRIVER_VERSION
 #define	ETH_DRIVER_VERSION	"3.5.2"
 #endif
 #define DRIVER_RELDATE	"September 2019"
 
 static const char mlx5e_version[] = "mlx5en: Mellanox Ethernet driver "
 	ETH_DRIVER_VERSION " (" DRIVER_RELDATE ")\n";
 
 static int mlx5e_get_wqe_sz(struct mlx5e_priv *priv, u32 *wqe_sz, u32 *nsegs);
 
 /*
  * Bundle of the parameter blocks for one channel: one receive queue,
  * one send queue and one completion queue each for RX and TX.
  */
 struct mlx5e_channel_param {
 	struct mlx5e_rq_param rq;
 	struct mlx5e_sq_param sq;
 	struct mlx5e_cq_param rx_cq;
 	struct mlx5e_cq_param tx_cq;
 };
 
 /*
  * One entry of the link mode tables below: the ifmedia subtype (an
  * IFM_* value) and the matching baud rate.  A zero baudrate marks an
  * unused table slot.
  */
 struct media {
 	u32	subtype;
 	u64	baudrate;
 };
 
 /*
  * Media subtype and baud rate per (link speed, link mode) pair, used
  * when the port does not report the extended Ethernet protocol set
  * (see the "ext" selection in the lookup code).  Slots not listed here
  * stay zero-initialized and are skipped by the lookups, which test
  * baudrate against zero.
  */
 static const struct media mlx5e_mode_table[MLX5E_LINK_SPEEDS_NUMBER][MLX5E_LINK_MODES_NUMBER] = {
 
 	[MLX5E_1000BASE_CX_SGMII][MLX5E_SGMII] = {
 		.subtype = IFM_1000_CX_SGMII,
 		.baudrate = IF_Mbps(1000ULL),
 	},
 	[MLX5E_1000BASE_KX][MLX5E_KX] = {
 		.subtype = IFM_1000_KX,
 		.baudrate = IF_Mbps(1000ULL),
 	},
 	[MLX5E_10GBASE_CX4][MLX5E_CX4] = {
 		.subtype = IFM_10G_CX4,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_10GBASE_KX4][MLX5E_KX4] = {
 		.subtype = IFM_10G_KX4,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_10GBASE_KR][MLX5E_KR] = {
 		.subtype = IFM_10G_KR,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_20GBASE_KR2][MLX5E_KR2] = {
 		.subtype = IFM_20G_KR2,
 		.baudrate = IF_Gbps(20ULL),
 	},
 	[MLX5E_40GBASE_CR4][MLX5E_CR4] = {
 		.subtype = IFM_40G_CR4,
 		.baudrate = IF_Gbps(40ULL),
 	},
 	[MLX5E_40GBASE_KR4][MLX5E_KR4] = {
 		.subtype = IFM_40G_KR4,
 		.baudrate = IF_Gbps(40ULL),
 	},
 	[MLX5E_56GBASE_R4][MLX5E_R] = {
 		.subtype = IFM_56G_R4,
 		.baudrate = IF_Gbps(56ULL),
 	},
 	[MLX5E_10GBASE_CR][MLX5E_CR1] = {
 		.subtype = IFM_10G_CR1,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_10GBASE_SR][MLX5E_SR] = {
 		.subtype = IFM_10G_SR,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_10GBASE_ER_LR][MLX5E_ER] = {
 		.subtype = IFM_10G_ER,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_10GBASE_ER_LR][MLX5E_LR] = {
 		.subtype = IFM_10G_LR,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_40GBASE_SR4][MLX5E_SR4] = {
 		.subtype = IFM_40G_SR4,
 		.baudrate = IF_Gbps(40ULL),
 	},
 	[MLX5E_40GBASE_LR4_ER4][MLX5E_LR4] = {
 		.subtype = IFM_40G_LR4,
 		.baudrate = IF_Gbps(40ULL),
 	},
 	[MLX5E_40GBASE_LR4_ER4][MLX5E_ER4] = {
 		.subtype = IFM_40G_ER4,
 		.baudrate = IF_Gbps(40ULL),
 	},
 	[MLX5E_100GBASE_CR4][MLX5E_CR4] = {
 		.subtype = IFM_100G_CR4,
 		.baudrate = IF_Gbps(100ULL),
 	},
 	[MLX5E_100GBASE_SR4][MLX5E_SR4] = {
 		.subtype = IFM_100G_SR4,
 		.baudrate = IF_Gbps(100ULL),
 	},
 	[MLX5E_100GBASE_KR4][MLX5E_KR4] = {
 		.subtype = IFM_100G_KR4,
 		.baudrate = IF_Gbps(100ULL),
 	},
 	[MLX5E_100GBASE_LR4][MLX5E_LR4] = {
 		.subtype = IFM_100G_LR4,
 		.baudrate = IF_Gbps(100ULL),
 	},
 	[MLX5E_100BASE_TX][MLX5E_TX] = {
 		.subtype = IFM_100_TX,
 		.baudrate = IF_Mbps(100ULL),
 	},
 	[MLX5E_1000BASE_T][MLX5E_T] = {
 		.subtype = IFM_1000_T,
 		.baudrate = IF_Mbps(1000ULL),
 	},
 	[MLX5E_10GBASE_T][MLX5E_T] = {
 		.subtype = IFM_10G_T,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_25GBASE_CR][MLX5E_CR] = {
 		.subtype = IFM_25G_CR,
 		.baudrate = IF_Gbps(25ULL),
 	},
 	[MLX5E_25GBASE_KR][MLX5E_KR] = {
 		.subtype = IFM_25G_KR,
 		.baudrate = IF_Gbps(25ULL),
 	},
 	[MLX5E_25GBASE_SR][MLX5E_SR] = {
 		.subtype = IFM_25G_SR,
 		.baudrate = IF_Gbps(25ULL),
 	},
 	[MLX5E_50GBASE_CR2][MLX5E_CR2] = {
 		.subtype = IFM_50G_CR2,
 		.baudrate = IF_Gbps(50ULL),
 	},
 	[MLX5E_50GBASE_KR2][MLX5E_KR2] = {
 		.subtype = IFM_50G_KR2,
 		.baudrate = IF_Gbps(50ULL),
 	},
 	[MLX5E_50GBASE_KR4][MLX5E_KR4] = {
 		.subtype = IFM_50G_KR4,
 		.baudrate = IF_Gbps(50ULL),
 	},
 };
 
 /*
  * Media subtype and baud rate per (extended link speed, link mode)
  * pair, used when the port reports the extended Ethernet protocol set
  * (see the "ext" selection in the lookup code).  Slots not listed here
  * stay zero-initialized and are skipped by the lookups, which test
  * baudrate against zero.
  */
 static const struct media mlx5e_ext_mode_table[MLX5E_EXT_LINK_SPEEDS_NUMBER][MLX5E_LINK_MODES_NUMBER] = {
 	[MLX5E_SGMII_100M][MLX5E_SGMII] = {
 		.subtype = IFM_100_SGMII,
 		.baudrate = IF_Mbps(100),
 	},
 	[MLX5E_1000BASE_X_SGMII][MLX5E_KX] = {
 		.subtype = IFM_1000_KX,
 		.baudrate = IF_Mbps(1000),
 	},
 	[MLX5E_1000BASE_X_SGMII][MLX5E_CX_SGMII] = {
 		.subtype = IFM_1000_CX_SGMII,
 		.baudrate = IF_Mbps(1000),
 	},
 	[MLX5E_1000BASE_X_SGMII][MLX5E_CX] = {
 		.subtype = IFM_1000_CX,
 		.baudrate = IF_Mbps(1000),
 	},
 	[MLX5E_1000BASE_X_SGMII][MLX5E_LX] = {
 		.subtype = IFM_1000_LX,
 		.baudrate = IF_Mbps(1000),
 	},
 	[MLX5E_1000BASE_X_SGMII][MLX5E_SX] = {
 		.subtype = IFM_1000_SX,
 		.baudrate = IF_Mbps(1000),
 	},
 	[MLX5E_1000BASE_X_SGMII][MLX5E_T] = {
 		.subtype = IFM_1000_T,
 		.baudrate = IF_Mbps(1000),
 	},
 	[MLX5E_5GBASE_R][MLX5E_T] = {
 		.subtype = IFM_5000_T,
 		.baudrate = IF_Mbps(5000),
 	},
 	[MLX5E_5GBASE_R][MLX5E_KR] = {
 		.subtype = IFM_5000_KR,
 		.baudrate = IF_Mbps(5000),
 	},
 	[MLX5E_5GBASE_R][MLX5E_KR1] = {
 		.subtype = IFM_5000_KR1,
 		.baudrate = IF_Mbps(5000),
 	},
 	[MLX5E_5GBASE_R][MLX5E_KR_S] = {
 		.subtype = IFM_5000_KR_S,
 		.baudrate = IF_Mbps(5000),
 	},
 	[MLX5E_10GBASE_XFI_XAUI_1][MLX5E_ER] = {
 		.subtype = IFM_10G_ER,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_10GBASE_XFI_XAUI_1][MLX5E_KR] = {
 		.subtype = IFM_10G_KR,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_10GBASE_XFI_XAUI_1][MLX5E_LR] = {
 		.subtype = IFM_10G_LR,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_10GBASE_XFI_XAUI_1][MLX5E_SR] = {
 		.subtype = IFM_10G_SR,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_10GBASE_XFI_XAUI_1][MLX5E_T] = {
 		.subtype = IFM_10G_T,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_10GBASE_XFI_XAUI_1][MLX5E_AOC] = {
 		.subtype = IFM_10G_AOC,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_10GBASE_XFI_XAUI_1][MLX5E_CR1] = {
 		.subtype = IFM_10G_CR1,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_40GBASE_XLAUI_4_XLPPI_4][MLX5E_CR4] = {
 		.subtype = IFM_40G_CR4,
 		.baudrate = IF_Gbps(40ULL),
 	},
 	[MLX5E_40GBASE_XLAUI_4_XLPPI_4][MLX5E_KR4] = {
 		.subtype = IFM_40G_KR4,
 		.baudrate = IF_Gbps(40ULL),
 	},
 	[MLX5E_40GBASE_XLAUI_4_XLPPI_4][MLX5E_LR4] = {
 		.subtype = IFM_40G_LR4,
 		.baudrate = IF_Gbps(40ULL),
 	},
 	[MLX5E_40GBASE_XLAUI_4_XLPPI_4][MLX5E_SR4] = {
 		.subtype = IFM_40G_SR4,
 		.baudrate = IF_Gbps(40ULL),
 	},
 	[MLX5E_40GBASE_XLAUI_4_XLPPI_4][MLX5E_ER4] = {
 		.subtype = IFM_40G_ER4,
 		.baudrate = IF_Gbps(40ULL),
 	},
 
 	[MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_CR] = {
 		.subtype = IFM_25G_CR,
 		.baudrate = IF_Gbps(25ULL),
 	},
 	[MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_KR] = {
 		.subtype = IFM_25G_KR,
 		.baudrate = IF_Gbps(25ULL),
 	},
 	[MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_SR] = {
 		.subtype = IFM_25G_SR,
 		.baudrate = IF_Gbps(25ULL),
 	},
 	[MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_ACC] = {
 		.subtype = IFM_25G_ACC,
 		.baudrate = IF_Gbps(25ULL),
 	},
 	[MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_AOC] = {
 		.subtype = IFM_25G_AOC,
 		.baudrate = IF_Gbps(25ULL),
 	},
 	[MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_CR1] = {
 		.subtype = IFM_25G_CR1,
 		.baudrate = IF_Gbps(25ULL),
 	},
 	[MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_CR_S] = {
 		.subtype = IFM_25G_CR_S,
 		.baudrate = IF_Gbps(25ULL),
 	},
 	/*
 	 * NOTE(review): this 25G row reports the 5G subtype IFM_5000_KR1
 	 * while carrying a 25G baud rate -- possibly copied from the
 	 * MLX5E_5GBASE_R entry; confirm the intended subtype.
 	 */
 	[MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_KR1] = {
 		.subtype = IFM_5000_KR1,
 		.baudrate = IF_Gbps(25ULL),
 	},
 	[MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_KR_S] = {
 		.subtype = IFM_25G_KR_S,
 		.baudrate = IF_Gbps(25ULL),
 	},
 	[MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_LR] = {
 		.subtype = IFM_25G_LR,
 		.baudrate = IF_Gbps(25ULL),
 	},
 	[MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_T] = {
 		.subtype = IFM_25G_T,
 		.baudrate = IF_Gbps(25ULL),
 	},
 	[MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2][MLX5E_CR2] = {
 		.subtype = IFM_50G_CR2,
 		.baudrate = IF_Gbps(50ULL),
 	},
 	[MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2][MLX5E_KR2] = {
 		.subtype = IFM_50G_KR2,
 		.baudrate = IF_Gbps(50ULL),
 	},
 	[MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2][MLX5E_KR4] = {
 		.subtype = IFM_50G_KR4,
 		.baudrate = IF_Gbps(50ULL),
 	},
 	[MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2][MLX5E_SR2] = {
 		.subtype = IFM_50G_SR2,
 		.baudrate = IF_Gbps(50ULL),
 	},
 	[MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2][MLX5E_LR2] = {
 		.subtype = IFM_50G_LR2,
 		.baudrate = IF_Gbps(50ULL),
 	},
 	[MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR][MLX5E_LR] = {
 		.subtype = IFM_50G_LR,
 		.baudrate = IF_Gbps(50ULL),
 	},
 	[MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR][MLX5E_SR] = {
 		.subtype = IFM_50G_SR,
 		.baudrate = IF_Gbps(50ULL),
 	},
 	[MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR][MLX5E_CP] = {
 		.subtype = IFM_50G_CP,
 		.baudrate = IF_Gbps(50ULL),
 	},
 	[MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR][MLX5E_FR] = {
 		.subtype = IFM_50G_FR,
 		.baudrate = IF_Gbps(50ULL),
 	},
 	[MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR][MLX5E_KR_PAM4] = {
 		.subtype = IFM_50G_KR_PAM4,
 		.baudrate = IF_Gbps(50ULL),
 	},
 	[MLX5E_CAUI_4_100GBASE_CR4_KR4][MLX5E_CR4] = {
 		.subtype = IFM_100G_CR4,
 		.baudrate = IF_Gbps(100ULL),
 	},
 	[MLX5E_CAUI_4_100GBASE_CR4_KR4][MLX5E_KR4] = {
 		.subtype = IFM_100G_KR4,
 		.baudrate = IF_Gbps(100ULL),
 	},
 	[MLX5E_CAUI_4_100GBASE_CR4_KR4][MLX5E_LR4] = {
 		.subtype = IFM_100G_LR4,
 		.baudrate = IF_Gbps(100ULL),
 	},
 	[MLX5E_CAUI_4_100GBASE_CR4_KR4][MLX5E_SR4] = {
 		.subtype = IFM_100G_SR4,
 		.baudrate = IF_Gbps(100ULL),
 	},
 	[MLX5E_100GAUI_2_100GBASE_CR2_KR2][MLX5E_SR2] = {
 		.subtype = IFM_100G_SR2,
 		.baudrate = IF_Gbps(100ULL),
 	},
 	[MLX5E_100GAUI_2_100GBASE_CR2_KR2][MLX5E_CP2] = {
 		.subtype = IFM_100G_CP2,
 		.baudrate = IF_Gbps(100ULL),
 	},
 	[MLX5E_100GAUI_2_100GBASE_CR2_KR2][MLX5E_KR2_PAM4] = {
 		.subtype = IFM_100G_KR2_PAM4,
 		.baudrate = IF_Gbps(100ULL),
 	},
 	[MLX5E_200GAUI_4_200GBASE_CR4_KR4][MLX5E_DR4] = {
 		.subtype = IFM_200G_DR4,
 		.baudrate = IF_Gbps(200ULL),
 	},
 	[MLX5E_200GAUI_4_200GBASE_CR4_KR4][MLX5E_LR4] = {
 		.subtype = IFM_200G_LR4,
 		.baudrate = IF_Gbps(200ULL),
 	},
 	[MLX5E_200GAUI_4_200GBASE_CR4_KR4][MLX5E_SR4] = {
 		.subtype = IFM_200G_SR4,
 		.baudrate = IF_Gbps(200ULL),
 	},
 	[MLX5E_200GAUI_4_200GBASE_CR4_KR4][MLX5E_FR4] = {
 		.subtype = IFM_200G_FR4,
 		.baudrate = IF_Gbps(200ULL),
 	},
 	[MLX5E_200GAUI_4_200GBASE_CR4_KR4][MLX5E_CR4_PAM4] = {
 		.subtype = IFM_200G_CR4_PAM4,
 		.baudrate = IF_Gbps(200ULL),
 	},
 	[MLX5E_200GAUI_4_200GBASE_CR4_KR4][MLX5E_KR4_PAM4] = {
 		.subtype = IFM_200G_KR4_PAM4,
 		.baudrate = IF_Gbps(200ULL),
 	},
 };
 
 DEBUGNET_DEFINE(mlx5_en);
 
 MALLOC_DEFINE(M_MLX5EN, "MLX5EN", "MLX5 Ethernet");
 
 static void
 mlx5e_update_carrier(struct mlx5e_priv *priv)
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
 	u32 out[MLX5_ST_SZ_DW(ptys_reg)];
 	u32 eth_proto_oper;
 	int error;
 	u8 port_state;
 	u8 is_er_type;
 	u8 i, j;
 	bool ext;
 	struct media media_entry = {};
 
 	port_state = mlx5_query_vport_state(mdev,
 	    MLX5_QUERY_VPORT_STATE_IN_OP_MOD_VNIC_VPORT, 0);
 
 	if (port_state == VPORT_STATE_UP) {
 		priv->media_status_last |= IFM_ACTIVE;
 	} else {
 		priv->media_status_last &= ~IFM_ACTIVE;
 		priv->media_active_last = IFM_ETHER;
 		if_link_state_change(priv->ifp, LINK_STATE_DOWN);
 		return;
 	}
 
 	error = mlx5_query_port_ptys(mdev, out, sizeof(out),
 	    MLX5_PTYS_EN, 1);
 	if (error) {
 		priv->media_active_last = IFM_ETHER;
 		priv->ifp->if_baudrate = 1;
 		mlx5_en_err(priv->ifp, "query port ptys failed: 0x%x\n",
 		    error);
 		return;
 	}
 
 	ext = MLX5_CAP_PCAM_FEATURE(mdev, ptys_extended_ethernet);
 	eth_proto_oper = MLX5_GET_ETH_PROTO(ptys_reg, out, ext,
 	    eth_proto_oper);
 
 	i = ilog2(eth_proto_oper);
 
 	for (j = 0; j != MLX5E_LINK_MODES_NUMBER; j++) {
 		media_entry = ext ? mlx5e_ext_mode_table[i][j] :
 		    mlx5e_mode_table[i][j];
 		if (media_entry.baudrate != 0)
 			break;
 	}
 
 	if (media_entry.subtype == 0) {
 		mlx5_en_err(priv->ifp,
 		    "Could not find operational media subtype\n");
 		return;
 	}
 
 	switch (media_entry.subtype) {
 	case IFM_10G_ER:
 		error = mlx5_query_pddr_range_info(mdev, 1, &is_er_type);
 		if (error != 0) {
 			mlx5_en_err(priv->ifp,
 			    "query port pddr failed: %d\n", error);
 		}
 		if (error != 0 || is_er_type == 0)
 			media_entry.subtype = IFM_10G_LR;
 		break;
 	case IFM_40G_LR4:
 		error = mlx5_query_pddr_range_info(mdev, 1, &is_er_type);
 		if (error != 0) {
 			mlx5_en_err(priv->ifp,
 			    "query port pddr failed: %d\n", error);
 		}
 		if (error == 0 && is_er_type != 0)
 			media_entry.subtype = IFM_40G_ER4;
 		break;
 	}
 	priv->media_active_last = media_entry.subtype | IFM_ETHER | IFM_FDX;
 	priv->ifp->if_baudrate = media_entry.baudrate;
 
 	if_link_state_change(priv->ifp, LINK_STATE_UP);
 }
 
 /*
  * ifmedia status callback: report the cached media status and active
  * media word, adding the RX/TX pause flags that match the current
  * pauseframe settings.
  */
 static void
 mlx5e_media_status(struct ifnet *dev, struct ifmediareq *ifmr)
 {
 	struct mlx5e_priv *priv = dev->if_softc;
 
 	ifmr->ifm_status = priv->media_status_last;
 
 	ifmr->ifm_active = priv->media_active_last;
 	if (priv->params.rx_pauseframe_control)
 		ifmr->ifm_active |= IFM_ETH_RXPAUSE;
 	if (priv->params.tx_pauseframe_control)
 		ifmr->ifm_active |= IFM_ETH_TXPAUSE;
 }
 
 /*
  * Return the protocol mask of every link speed whose row in the
  * selected mode table (extended when ext is true) contains the given
  * media subtype; zero when none does.
  *
  * IFM_10G_LR is looked up as IFM_10G_ER and IFM_40G_ER4 as
  * IFM_40G_LR4 before the search.
  */
 static u32
 mlx5e_find_link_mode(u32 subtype, bool ext)
 {
 	const struct media *entry;
 	u32 nspeeds;
 	u32 speed;
 	u32 mode;
 	u32 mask = 0;
 
 	if (subtype == IFM_10G_LR)
 		subtype = IFM_10G_ER;
 	else if (subtype == IFM_40G_ER4)
 		subtype = IFM_40G_LR4;
 
 	if (ext)
 		nspeeds = MLX5E_EXT_LINK_SPEEDS_NUMBER;
 	else
 		nspeeds = MLX5E_LINK_SPEEDS_NUMBER;
 
 	for (speed = 0; speed != nspeeds; speed++) {
 		for (mode = 0; mode < MLX5E_LINK_MODES_NUMBER; mode++) {
 			entry = ext ? &mlx5e_ext_mode_table[speed][mode] :
 			    &mlx5e_mode_table[speed][mode];
 			/* Unused slots have a zero baud rate. */
 			if (entry->baudrate != 0 && entry->subtype == subtype)
 				mask |= MLX5E_PROT_MASK(speed);
 		}
 	}
 
 	return (mask);
 }
 
 static int
 mlx5e_set_port_pause_and_pfc(struct mlx5e_priv *priv)
 {
 	return (mlx5_set_port_pause_and_pfc(priv->mdev, 1,
 	    priv->params.rx_pauseframe_control,
 	    priv->params.tx_pauseframe_control,
 	    priv->params.rx_priority_flow_control,
 	    priv->params.tx_priority_flow_control));
 }
 
 /*
  * Apply the priority flow control settings to the port.
  *
  * Returns -ENXIO when the device is gone, -EINVAL (with a log message)
  * while global pauseframes are still enabled, and otherwise the result
  * of mlx5e_set_port_pause_and_pfc().
  */
 static int
 mlx5e_set_port_pfc(struct mlx5e_priv *priv)
 {
 	if (priv->gone != 0)
 		return (-ENXIO);
 
 	if (priv->params.rx_pauseframe_control ||
 	    priv->params.tx_pauseframe_control) {
 		mlx5_en_err(priv->ifp,
 		    "Global pauseframes must be disabled before enabling PFC.\n");
 		return (-EINVAL);
 	}
 
 	return (mlx5e_set_port_pause_and_pfc(priv));
 }
 
 /*
  * Apply the media word stored in "priv->media.ifm_media" to the port.
  *
  * The requested subtype is translated into a protocol mask and
  * intersected with the capabilities reported by the PTYS register;
  * IFM_AUTO selects all reported capabilities.  The global pauseframe
  * settings in "priv->params" are updated from the IFM_ETH_RXPAUSE and
  * IFM_ETH_TXPAUSE bits.  The port is then taken down, reprogrammed and
  * brought up again only if the interface was in the opened state.
  *
  * The priv lock is acquired here unless PRIV_LOCKED() reports that it
  * is already held, and is released again only in that case.
  *
  * Returns zero on success, EINVAL for a non-Ethernet media type, an
  * empty or unsupported link mode, or a conflict with enabled PFC, the
  * value returned by mlx5_query_port_ptys() when that query fails, or
  * the negated result of mlx5e_set_port_pause_and_pfc().
  */
 static int
 mlx5e_media_change(struct ifnet *dev)
 {
 	struct mlx5e_priv *priv = dev->if_softc;
 	struct mlx5_core_dev *mdev = priv->mdev;
 	u32 eth_proto_cap;
 	u32 link_mode;
 	u32 out[MLX5_ST_SZ_DW(ptys_reg)];
 	int was_opened;
 	int locked;
 	int error;
 	bool ext;
 
 	/* remember whether the lock was already held on entry */
 	locked = PRIV_LOCKED(priv);
 	if (!locked)
 		PRIV_LOCK(priv);
 
 	if (IFM_TYPE(priv->media.ifm_media) != IFM_ETHER) {
 		error = EINVAL;
 		goto done;
 	}
 
 	error = mlx5_query_port_ptys(mdev, out, sizeof(out),
 	    MLX5_PTYS_EN, 1);
 	if (error != 0) {
 		mlx5_en_err(dev, "Query port media capability failed\n");
 		goto done;
 	}
 
 	/* "ext" selects the extended mode table and PTYS fields */
 	ext = MLX5_CAP_PCAM_FEATURE(mdev, ptys_extended_ethernet);
 	link_mode = mlx5e_find_link_mode(IFM_SUBTYPE(priv->media.ifm_media), ext);
 
 	/* query supported capabilities */
 	eth_proto_cap = MLX5_GET_ETH_PROTO(ptys_reg, out, ext,
 	    eth_proto_capability);
 
 	/* check for autoselect */
 	if (IFM_SUBTYPE(priv->media.ifm_media) == IFM_AUTO) {
 		link_mode = eth_proto_cap;
 		if (link_mode == 0) {
 			mlx5_en_err(dev, "Port media capability is zero\n");
 			error = EINVAL;
 			goto done;
 		}
 	} else {
 		/* keep only the requested modes which the port reports */
 		link_mode = link_mode & eth_proto_cap;
 		if (link_mode == 0) {
 			mlx5_en_err(dev, "Not supported link mode requested\n");
 			error = EINVAL;
 			goto done;
 		}
 	}
 	if (priv->media.ifm_media & (IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE)) {
 		/* check if PFC is enabled */
 		if (priv->params.rx_priority_flow_control ||
 		    priv->params.tx_priority_flow_control) {
 			mlx5_en_err(dev, "PFC must be disabled before enabling global pauseframes.\n");
 			error = EINVAL;
 			goto done;
 		}
 	}
 	/* update pauseframe control bits */
 	priv->params.rx_pauseframe_control =
 	    (priv->media.ifm_media & IFM_ETH_RXPAUSE) ? 1 : 0;
 	priv->params.tx_pauseframe_control =
 	    (priv->media.ifm_media & IFM_ETH_TXPAUSE) ? 1 : 0;
 
 	/* check if device is opened */
 	was_opened = test_bit(MLX5E_STATE_OPENED, &priv->state);
 
 	/*
 	 * reconfigure the hardware; the results of the port status and
 	 * protocol updates are not checked, only the pause/PFC update
 	 * contributes to the return value
 	 */
 	mlx5_set_port_status(mdev, MLX5_PORT_DOWN);
 	mlx5_set_port_proto(mdev, link_mode, MLX5_PTYS_EN, ext);
 	error = -mlx5e_set_port_pause_and_pfc(priv);
 	if (was_opened)
 		mlx5_set_port_status(mdev, MLX5_PORT_UP);
 
 done:
 	/* only drop the lock if it was taken by this function */
 	if (!locked)
 		PRIV_UNLOCK(priv);
 	return (error);
 }
 
 /*
  * Work handler for "update_carrier_work": with the priv lock held,
  * refresh the carrier state if the interface is in the opened state.
  */
 static void
 mlx5e_update_carrier_work(struct work_struct *work)
 {
 	struct mlx5e_priv *priv;
 
 	priv = container_of(work, struct mlx5e_priv, update_carrier_work);
 
 	PRIV_LOCK(priv);
 	if (test_bit(MLX5E_STATE_OPENED, &priv->state) != 0)
 		mlx5e_update_carrier(priv);
 	PRIV_UNLOCK(priv);
 }
 
 /*
  * Per-counter extractors for the PCIe counter list macros.  Of the six
  * arguments only "c" (the counter field name) and "f" (the counter_set
  * member containing it) are used.  The expansion reads that field from
  * the MPCNT register image named "out" and stores it in the same-named
  * field of "s_debug", so both identifiers must be in scope where the
  * macros are expanded.  Each expansion ends with its own semicolon.
  */
 #define	MLX5E_PCIE_PERF_GET_64(a,b,c,d,e,f)    \
 	s_debug->c = MLX5_GET64(mpcnt_reg, out, counter_set.f.c);
 
 /* 32-bit variant of the extractor above, using MLX5_GET() */
 #define	MLX5E_PCIE_PERF_GET_32(a,b,c,d,e,f)    \
 	s_debug->c = MLX5_GET(mpcnt_reg, out, counter_set.f.c);
 
 static void
 mlx5e_update_pcie_counters(struct mlx5e_priv *priv)
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
 	struct mlx5e_port_stats_debug *s_debug = &priv->stats.port_stats_debug;
 	const unsigned sz = MLX5_ST_SZ_BYTES(mpcnt_reg);
 	void *out;
 	void *in;
 	int err;
 
 	/* allocate firmware request structures */
 	in = mlx5_vzalloc(sz);
 	out = mlx5_vzalloc(sz);
 	if (in == NULL || out == NULL)
 		goto free_out;
 
 	MLX5_SET(mpcnt_reg, in, grp, MLX5_PCIE_PERFORMANCE_COUNTERS_GROUP);
 	err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_MPCNT, 0, 0);
 	if (err != 0)
 		goto free_out;
 
 	MLX5E_PCIE_PERFORMANCE_COUNTERS_64(MLX5E_PCIE_PERF_GET_64)
 	MLX5E_PCIE_PERFORMANCE_COUNTERS_32(MLX5E_PCIE_PERF_GET_32)
 
 	MLX5_SET(mpcnt_reg, in, grp, MLX5_PCIE_TIMERS_AND_STATES_COUNTERS_GROUP);
 	err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_MPCNT, 0, 0);
 	if (err != 0)
 		goto free_out;
 
 	MLX5E_PCIE_TIMERS_AND_STATES_COUNTERS_32(MLX5E_PCIE_PERF_GET_32)
 
 	MLX5_SET(mpcnt_reg, in, grp, MLX5_PCIE_LANE_COUNTERS_GROUP);
 	err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_MPCNT, 0, 0);
 	if (err != 0)
 		goto free_out;
 
 	MLX5E_PCIE_LANE_COUNTERS_32(MLX5E_PCIE_PERF_GET_32)
 
 free_out:
 	/* free firmware request structures */
 	kvfree(in);
 	kvfree(out);
 }
 
 /*
  * This function reads the physical port counters from the firmware
  * using a pre-defined layout defined by various MLX5E_PPORT_XXX()
  * macros. The output is converted from big-endian 64-bit values into
  * host endian ones and stored in the "priv->stats.pport" structure
  * and in the "priv->stats.port_stats_debug" structure.
  *
  * Both structures are filled through their flat "arg" arrays. The
  * index "y" deliberately carries over from one counter group to the
  * next, so the order of the groups below determines the layout. The
  * return values of the PPCNT register accesses are not checked. The
  * PCIe counters are refreshed from here, too.
  */
 static void
 mlx5e_update_pport_counters(struct mlx5e_priv *priv)
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
 	struct mlx5e_pport_stats *s = &priv->stats.pport;
 	struct mlx5e_port_stats_debug *s_debug = &priv->stats.port_stats_debug;
 	u32 *in;
 	u32 *out;
 	const u64 *ptr;
 	unsigned sz = MLX5_ST_SZ_BYTES(ppcnt_reg);
 	unsigned x;
 	unsigned y;
 	unsigned z;
 
 	/* allocate firmware request structures */
 	in = mlx5_vzalloc(sz);
 	out = mlx5_vzalloc(sz);
 	if (in == NULL || out == NULL)
 		goto free_out;
 
 	/*
 	 * Get pointer to the 64-bit counter set which is located at a
 	 * fixed offset in the output firmware request structure:
 	 */
 	ptr = (const uint64_t *)MLX5_ADDR_OF(ppcnt_reg, out, counter_set);
 
 	MLX5_SET(ppcnt_reg, in, local_port, 1);
 
 	/*
 	 * read IEEE802_3 counter group using predefined counter layout;
 	 * the first MLX5E_PPORT_PER_PRIO_STATS_NUM slots of "s->arg" are
 	 * skipped here and filled by the per-priority loop at the end
 	 */
 	MLX5_SET(ppcnt_reg, in, grp, MLX5_IEEE_802_3_COUNTERS_GROUP);
 	mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
 	for (x = 0, y = MLX5E_PPORT_PER_PRIO_STATS_NUM;
 	     x != MLX5E_PPORT_IEEE802_3_STATS_NUM; x++, y++)
 		s->arg[y] = be64toh(ptr[x]);
 
 	/* read RFC2819 counter group using predefined counter layout */
 	MLX5_SET(ppcnt_reg, in, grp, MLX5_RFC_2819_COUNTERS_GROUP);
 	mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
 	for (x = 0; x != MLX5E_PPORT_RFC2819_STATS_NUM; x++, y++)
 		s->arg[y] = be64toh(ptr[x]);
 
 	/*
 	 * the remaining RFC2819 counters go to the start of the debug
 	 * array: "x" continues in the same reply while "y" restarts
 	 */
 	for (y = 0; x != MLX5E_PPORT_RFC2819_STATS_NUM +
 	    MLX5E_PPORT_RFC2819_STATS_DEBUG_NUM; x++, y++)
 		s_debug->arg[y] = be64toh(ptr[x]);
 
 	/* read RFC2863 counter group using predefined counter layout */
 	MLX5_SET(ppcnt_reg, in, grp, MLX5_RFC_2863_COUNTERS_GROUP);
 	mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
 	for (x = 0; x != MLX5E_PPORT_RFC2863_STATS_DEBUG_NUM; x++, y++)
 		s_debug->arg[y] = be64toh(ptr[x]);
 
 	/* read physical layer stats counter group using predefined counter layout */
 	MLX5_SET(ppcnt_reg, in, grp, MLX5_PHYSICAL_LAYER_COUNTERS_GROUP);
 	mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
 	for (x = 0; x != MLX5E_PPORT_PHYSICAL_LAYER_STATS_DEBUG_NUM; x++, y++)
 		s_debug->arg[y] = be64toh(ptr[x]);
 
 	/* read Extended Ethernet counter group using predefined counter layout */
 	MLX5_SET(ppcnt_reg, in, grp, MLX5_ETHERNET_EXTENDED_COUNTERS_GROUP);
 	mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
 	for (x = 0; x != MLX5E_PPORT_ETHERNET_EXTENDED_STATS_DEBUG_NUM; x++, y++)
 		s_debug->arg[y] = be64toh(ptr[x]);
 
 	/* read Extended Statistical Group */
 	if (MLX5_CAP_GEN(mdev, pcam_reg) &&
 	    MLX5_CAP_PCAM_FEATURE(mdev, ppcnt_statistical_group) &&
 	    MLX5_CAP_PCAM_FEATURE(mdev, per_lane_error_counters)) {
 		/* read Extended Statistical counter group using predefined counter layout */
 		MLX5_SET(ppcnt_reg, in, grp, MLX5_PHYSICAL_LAYER_STATISTICAL_GROUP);
 		mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
 
 		for (x = 0; x != MLX5E_PPORT_STATISTICAL_DEBUG_NUM; x++, y++)
 			s_debug->arg[y] = be64toh(ptr[x]);
 	}
 
 	/* read PCIE counters */
 	mlx5e_update_pcie_counters(priv);
 
 	/* read per-priority counters */
 	MLX5_SET(ppcnt_reg, in, grp, MLX5_PER_PRIORITY_COUNTERS_GROUP);
 
 	/* iterate all the priorities; "y" restarts at the head of "s->arg" */
 	for (y = z = 0; z != MLX5E_PPORT_PER_PRIO_STATS_NUM_PRIO; z++) {
 		MLX5_SET(ppcnt_reg, in, prio_tc, z);
 		mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
 
 		/* read per priority stats counter group using predefined counter layout */
 		for (x = 0; x != (MLX5E_PPORT_PER_PRIO_STATS_NUM /
 		    MLX5E_PPORT_PER_PRIO_STATS_NUM_PRIO); x++, y++)
 			s->arg[y] = be64toh(ptr[x]);
 	}
 
 free_out:
 	/* free firmware request structures */
 	kvfree(in);
 	kvfree(out);
 }
 
 /*
  * Update "priv->stats.vport.rx_steer_missed_packets" from the
  * QUERY_VNIC_ENV firmware command.  Nothing is done when the device
  * lacks the "nic_receive_steering_discard" capability, and the old
  * value is kept when the command fails.
  */
 static void
 mlx5e_grp_vnic_env_update_stats(struct mlx5e_priv *priv)
 {
 	u32 reply[MLX5_ST_SZ_DW(query_vnic_env_out)] = {};
 	u32 request[MLX5_ST_SZ_DW(query_vnic_env_in)] = {};
 	int err;
 
 	if (MLX5_CAP_GEN(priv->mdev, nic_receive_steering_discard)) {
 		MLX5_SET(query_vnic_env_in, request, opcode,
 		    MLX5_CMD_OP_QUERY_VNIC_ENV);
 		MLX5_SET(query_vnic_env_in, request, op_mod, 0);
 		MLX5_SET(query_vnic_env_in, request, other_vport, 0);
 
 		err = mlx5_cmd_exec(priv->mdev, request, sizeof(request),
 		    reply, sizeof(reply));
 		if (err == 0) {
 			priv->stats.vport.rx_steer_missed_packets =
 			    MLX5_GET64(query_vnic_env_out, reply,
 			    vport_env.nic_receive_steering_discard);
 		}
 	}
 }
 
 /*
  * This function is called regularly to collect all statistics
  * counters from the firmware. The values can be viewed through the
  * sysctl interface. Execution is serialized using the priv's global
  * configuration lock.
  *
  * The software per-channel counters are summed up first, then the
  * vNIC environment, vport and physical port counters are read. The
  * diagnostics, FEC and temperature updates at the end are executed
  * even when the output buffer for the vport query could not be
  * allocated.
  */
 static void
 mlx5e_update_stats_locked(struct mlx5e_priv *priv)
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
 	struct mlx5e_vport_stats *s = &priv->stats.vport;
 	struct mlx5e_sq_stats *sq_stats;
 #if (__FreeBSD_version < 1100000)
 	struct ifnet *ifp = priv->ifp;
 #endif
 
 	u32 in[MLX5_ST_SZ_DW(query_vport_counter_in)];
 	u32 *out;
 	int outlen = MLX5_ST_SZ_BYTES(query_vport_counter_out);
 	u64 tso_packets = 0;
 	u64 tso_bytes = 0;
 	u64 tx_queue_dropped = 0;
 	u64 tx_defragged = 0;
 	u64 tx_offload_none = 0;
 	u64 lro_packets = 0;
 	u64 lro_bytes = 0;
 	u64 sw_lro_queued = 0;
 	u64 sw_lro_flushed = 0;
 	u64 rx_csum_none = 0;
 	u64 rx_wqe_err = 0;
 	u64 rx_packets = 0;
 	u64 rx_bytes = 0;
 	u32 rx_out_of_buffer = 0;
 	int error;
 	int i;
 	int j;
 
 	/* without the output buffer only the trailing updates are done */
 	out = mlx5_vzalloc(outlen);
 	if (out == NULL)
 		goto free_out;
 
 	/* Collect first the SW counters and then HW for consistency */
 	for (i = 0; i < priv->params.num_channels; i++) {
 		struct mlx5e_channel *pch = priv->channel + i;
 		struct mlx5e_rq *rq = &pch->rq;
 		struct mlx5e_rq_stats *rq_stats = &pch->rq.stats;
 
 		/* collect stats from LRO */
 		rq_stats->sw_lro_queued = rq->lro.lro_queued;
 		rq_stats->sw_lro_flushed = rq->lro.lro_flushed;
 		sw_lro_queued += rq_stats->sw_lro_queued;
 		sw_lro_flushed += rq_stats->sw_lro_flushed;
 		lro_packets += rq_stats->lro_packets;
 		lro_bytes += rq_stats->lro_bytes;
 		rx_csum_none += rq_stats->csum_none;
 		rx_wqe_err += rq_stats->wqe_err;
 		rx_packets += rq_stats->packets;
 		rx_bytes += rq_stats->bytes;
 
 		/* sum up the send queue counters of all traffic classes */
 		for (j = 0; j < priv->num_tc; j++) {
 			sq_stats = &pch->sq[j].stats;
 
 			tso_packets += sq_stats->tso_packets;
 			tso_bytes += sq_stats->tso_bytes;
 			/* both "dropped" and "enobuf" count as queue drops */
 			tx_queue_dropped += sq_stats->dropped;
 			tx_queue_dropped += sq_stats->enobuf;
 			tx_defragged += sq_stats->defragged;
 			tx_offload_none += sq_stats->csum_offload_none;
 		}
 	}
 
 	/* update counters */
 	s->tso_packets = tso_packets;
 	s->tso_bytes = tso_bytes;
 	s->tx_queue_dropped = tx_queue_dropped;
 	s->tx_defragged = tx_defragged;
 	s->lro_packets = lro_packets;
 	s->lro_bytes = lro_bytes;
 	s->sw_lro_queued = sw_lro_queued;
 	s->sw_lro_flushed = sw_lro_flushed;
 	s->rx_csum_none = rx_csum_none;
 	s->rx_wqe_err = rx_wqe_err;
 	s->rx_packets = rx_packets;
 	s->rx_bytes = rx_bytes;
 
 	mlx5e_grp_vnic_env_update_stats(priv);
 
 	/* HW counters */
 	memset(in, 0, sizeof(in));
 
 	MLX5_SET(query_vport_counter_in, in, opcode,
 	    MLX5_CMD_OP_QUERY_VPORT_COUNTER);
 	MLX5_SET(query_vport_counter_in, in, op_mod, 0);
 	MLX5_SET(query_vport_counter_in, in, other_vport, 0);
 
 	memset(out, 0, outlen);
 
 	/* get number of out-of-buffer drops first */
 	if (test_bit(MLX5E_STATE_OPENED, &priv->state) != 0 &&
 	    mlx5_vport_query_out_of_rx_buffer(mdev, priv->counter_set_id,
 	    &rx_out_of_buffer) == 0) {
 		s->rx_out_of_buffer = rx_out_of_buffer;
 	}
 
 	/* get port statistics; on failure the previous values are kept */
 	if (mlx5_cmd_exec(mdev, in, sizeof(in), out, outlen) == 0) {
 #define	MLX5_GET_CTR(out, x) \
 	MLX5_GET64(query_vport_counter_out, out, x)
 
 		s->rx_error_packets =
 		    MLX5_GET_CTR(out, received_errors.packets);
 		s->rx_error_bytes =
 		    MLX5_GET_CTR(out, received_errors.octets);
 		s->tx_error_packets =
 		    MLX5_GET_CTR(out, transmit_errors.packets);
 		s->tx_error_bytes =
 		    MLX5_GET_CTR(out, transmit_errors.octets);
 
 		s->rx_unicast_packets =
 		    MLX5_GET_CTR(out, received_eth_unicast.packets);
 		s->rx_unicast_bytes =
 		    MLX5_GET_CTR(out, received_eth_unicast.octets);
 		s->tx_unicast_packets =
 		    MLX5_GET_CTR(out, transmitted_eth_unicast.packets);
 		s->tx_unicast_bytes =
 		    MLX5_GET_CTR(out, transmitted_eth_unicast.octets);
 
 		s->rx_multicast_packets =
 		    MLX5_GET_CTR(out, received_eth_multicast.packets);
 		s->rx_multicast_bytes =
 		    MLX5_GET_CTR(out, received_eth_multicast.octets);
 		s->tx_multicast_packets =
 		    MLX5_GET_CTR(out, transmitted_eth_multicast.packets);
 		s->tx_multicast_bytes =
 		    MLX5_GET_CTR(out, transmitted_eth_multicast.octets);
 
 		s->rx_broadcast_packets =
 		    MLX5_GET_CTR(out, received_eth_broadcast.packets);
 		s->rx_broadcast_bytes =
 		    MLX5_GET_CTR(out, received_eth_broadcast.octets);
 		s->tx_broadcast_packets =
 		    MLX5_GET_CTR(out, transmitted_eth_broadcast.packets);
 		s->tx_broadcast_bytes =
 		    MLX5_GET_CTR(out, transmitted_eth_broadcast.octets);
 
 		/* the TX totals are derived from the firmware counters */
 		s->tx_packets = s->tx_unicast_packets +
 		    s->tx_multicast_packets + s->tx_broadcast_packets;
 		s->tx_bytes = s->tx_unicast_bytes + s->tx_multicast_bytes +
 		    s->tx_broadcast_bytes;
 
 		/* Update calculated offload counters */
 		s->tx_csum_offload = s->tx_packets - tx_offload_none;
 		s->rx_csum_good = s->rx_packets - s->rx_csum_none;
 	}
 
 	/* Get physical port counters */
 	mlx5e_update_pport_counters(priv);
 
 	/* jumbo packets are the sum of the size buckets from 1519 octets up */
 	s->tx_jumbo_packets =
 	    priv->stats.port_stats_debug.tx_stat_p1519to2047octets +
 	    priv->stats.port_stats_debug.tx_stat_p2048to4095octets +
 	    priv->stats.port_stats_debug.tx_stat_p4096to8191octets +
 	    priv->stats.port_stats_debug.tx_stat_p8192to10239octets;
 
 #if (__FreeBSD_version < 1100000)
 	/* no get_counters interface in fbsd 10 */
 	ifp->if_ipackets = s->rx_packets;
 	ifp->if_ierrors = priv->stats.pport.in_range_len_errors +
 	    priv->stats.pport.out_of_range_len +
 	    priv->stats.pport.too_long_errors +
 	    priv->stats.pport.check_seq_err +
 	    priv->stats.pport.alignment_err;
 	ifp->if_iqdrops = s->rx_out_of_buffer;
 	ifp->if_opackets = s->tx_packets;
 	ifp->if_oerrors = priv->stats.port_stats_debug.out_discards;
 	ifp->if_snd.ifq_drops = s->tx_queue_dropped;
 	ifp->if_ibytes = s->rx_bytes;
 	ifp->if_obytes = s->tx_bytes;
 	ifp->if_collisions =
 	    priv->stats.pport.collisions;
 #endif
 
 free_out:
 	kvfree(out);
 
 	/* Update diagnostics, if any */
 	if (priv->params_ethtool.diag_pci_enable ||
 	    priv->params_ethtool.diag_general_enable) {
 		/* a NULL pointer is passed for a diagnostics set that is disabled */
 		error = mlx5_core_get_diagnostics_full(mdev,
 		    priv->params_ethtool.diag_pci_enable ? &priv->params_pci : NULL,
 		    priv->params_ethtool.diag_general_enable ? &priv->params_general : NULL);
 		if (error != 0)
 			mlx5_en_err(priv->ifp,
 			    "Failed reading diagnostics: %d\n", error);
 	}
 
 	/* Update FEC, if any; EOPNOTSUPP is not reported */
 	error = mlx5e_fec_update(priv);
 	if (error != 0 && error != EOPNOTSUPP) {
 		mlx5_en_err(priv->ifp,
 		    "Updating FEC failed: %d\n", error);
 	}
 
 	/* Update temperature, if any */
 	if (priv->params_ethtool.hw_num_temp != 0) {
 		error = mlx5e_hw_temperature_update(priv);
 		if (error != 0 && error != EOPNOTSUPP) {
 			mlx5_en_err(priv->ifp,
 			    "Updating temperature failed: %d\n", error);
 		}
 	}
 }
 
 /*
  * Work handler for "update_stats_work": collect the statistics under
  * the priv lock, but only while the interface is opened and the core
  * device is not in the teardown state.
  */
 static void
 mlx5e_update_stats_work(struct work_struct *work)
 {
 	struct mlx5e_priv *priv = container_of(work, struct mlx5e_priv,
 	    update_stats_work);
 
 	PRIV_LOCK(priv);
 	if (test_bit(MLX5E_STATE_OPENED, &priv->state) != 0) {
 		if (!test_bit(MLX5_INTERFACE_STATE_TEARDOWN,
 		    &priv->mdev->intf_state))
 			mlx5e_update_stats_locked(priv);
 	}
 	PRIV_UNLOCK(priv);
 }
 
 /*
  * Callout handler of the "watchdog" callout: queue the statistics
  * work item and re-arm the callout to run again after hz / 4 ticks.
  */
 static void
 mlx5e_update_stats(void *arg)
 {
 	struct mlx5e_priv *priv;
 
 	priv = arg;
 
 	queue_work(priv->wq, &priv->update_stats_work);
 	callout_reset(&priv->watchdog, hz / 4, mlx5e_update_stats, priv);
 }
 
 /*
  * Handle one asynchronous device event.  Port up and port down events
  * queue the carrier update work item; all other events are ignored.
  */
 static void
 mlx5e_async_event_sub(struct mlx5e_priv *priv,
     enum mlx5_dev_event event)
 {
 	if (event == MLX5_DEV_EVENT_PORT_UP ||
 	    event == MLX5_DEV_EVENT_PORT_DOWN)
 		queue_work(priv->wq, &priv->update_carrier_work);
 }
 
 /*
  * Entry point for asynchronous device events.  The event is handed
  * to mlx5e_async_event_sub() only while the
  * MLX5E_STATE_ASYNC_EVENTS_ENABLE bit is set; the check and the
  * dispatch both happen with "async_events_mtx" held.  The "mdev" and
  * "param" arguments are not used.
  */
 static void
 mlx5e_async_event(struct mlx5_core_dev *mdev, void *vpriv,
     enum mlx5_dev_event event, unsigned long param)
 {
 	struct mlx5e_priv *priv;
 
 	priv = vpriv;
 
 	mtx_lock(&priv->async_events_mtx);
 	if (test_bit(MLX5E_STATE_ASYNC_EVENTS_ENABLE, &priv->state) != 0)
 		mlx5e_async_event_sub(priv, event);
 	mtx_unlock(&priv->async_events_mtx);
 }
 
 /*
  * Allow mlx5e_async_event() to dispatch events by setting the
  * MLX5E_STATE_ASYNC_EVENTS_ENABLE state bit.  Unlike the disable
  * path, the bit is set without taking "async_events_mtx".
  */
 static void
 mlx5e_enable_async_events(struct mlx5e_priv *priv)
 {
 	set_bit(MLX5E_STATE_ASYNC_EVENTS_ENABLE, &priv->state);
 }
 
 /*
  * Stop mlx5e_async_event() from dispatching events by clearing the
  * MLX5E_STATE_ASYNC_EVENTS_ENABLE state bit.  The bit is cleared with
  * "async_events_mtx" held, which is the same mutex the event handler
  * holds while it tests the bit and dispatches.
  */
 static void
 mlx5e_disable_async_events(struct mlx5e_priv *priv)
 {
 	mtx_lock(&priv->async_events_mtx);
 	clear_bit(MLX5E_STATE_ASYNC_EVENTS_ENABLE, &priv->state);
 	mtx_unlock(&priv->async_events_mtx);
 }
 
 /* forward declaration, needed by mlx5e_reset_calibration_callout() */
 static void mlx5e_calibration_callout(void *arg);
 
 /*
  * Timestamp calibration tunables, exported below under the "calibr"
  * sysctl node.  "mlx5e_calibration_duration" is the value of
  * "clbr_done" up to which the fast interval is used; the two interval
  * values are multiplied by "hz" when the callout is armed.
  */
 static int mlx5e_calibration_duration = 20;
 static int mlx5e_fast_calibration = 1;
 static int mlx5e_normal_calibration = 30;
 
 static SYSCTL_NODE(_hw_mlx5, OID_AUTO, calibr, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "MLX5 timestamp calibration parameteres");
 
 SYSCTL_INT(_hw_mlx5_calibr, OID_AUTO, duration, CTLFLAG_RWTUN,
     &mlx5e_calibration_duration, 0,
     "Duration of initial calibration");
 SYSCTL_INT(_hw_mlx5_calibr, OID_AUTO, fast, CTLFLAG_RWTUN,
     &mlx5e_fast_calibration, 0,
     "Recalibration interval during initial calibration");
 SYSCTL_INT(_hw_mlx5_calibr, OID_AUTO, normal, CTLFLAG_RWTUN,
     &mlx5e_normal_calibration, 0,
     "Recalibration interval during normal operations");
 
 /*
  * Start or continue the calibration process.
  *
  * While no calibration has completed yet ("clbr_done" is zero) the
  * calibration routine is invoked directly.  Otherwise the "tstmp_clbr"
  * callout is re-armed, using the fast interval until "clbr_done"
  * reaches the configured duration and the normal interval afterwards.
  */
 static void
 mlx5e_reset_calibration_callout(struct mlx5e_priv *priv)
 {
 	int interval;
 
 	if (priv->clbr_done == 0) {
 		mlx5e_calibration_callout(priv);
 		return;
 	}
 
 	if (priv->clbr_done < mlx5e_calibration_duration)
 		interval = mlx5e_fast_calibration;
 	else
 		interval = mlx5e_normal_calibration;
 
 	callout_reset_curcpu(&priv->tstmp_clbr, interval * hz,
 	    mlx5e_calibration_callout, priv);
 }
 
 /*
  * Flatten a timespec into a single 64-bit count.  Despite the "usec"
  * in the name, the seconds are scaled by 10^9 and "tv_nsec" is added
  * unscaled, so the returned value is in nanoseconds.
  */
 static uint64_t
 mlx5e_timespec2usec(const struct timespec *ts)
 {
 	uint64_t total;
 
 	total = (uint64_t)ts->tv_sec * 1000000000;
 	total += ts->tv_nsec;
 
 	return (total);
 }
 
 static uint64_t
 mlx5e_hw_clock(struct mlx5e_priv *priv)
 {
 	struct mlx5_init_seg *iseg;
 	uint32_t hw_h, hw_h1, hw_l;
 
 	iseg = priv->mdev->iseg;
 	do {
 		hw_h = ioread32be(&iseg->internal_timer_h);
 		hw_l = ioread32be(&iseg->internal_timer_l);
 		hw_h1 = ioread32be(&iseg->internal_timer_h);
 	} while (hw_h1 != hw_h);
 	return (((uint64_t)hw_h << 32) | hw_l);
 }
 
 /*
  * The calibration callout, it runs either in the context of the
  * thread which enables calibration, or in callout.  It takes the
  * snapshot of system and adapter clocks, then advances the pointers to
  * the calibration point to allow rx path to read the consistent data
  * lockless.
  *
  * The calibration points form a ring in "priv->clbr_points".  Each run
  * fills the slot following the current one and then publishes it by
  * storing a new non-zero generation number; the generation of the
  * previously current slot is zeroed first.  When the adapter clock did
  * not advance, the current slot's generation is zeroed and the callout
  * is not re-armed.
  */
 static void
 mlx5e_calibration_callout(void *arg)
 {
 	struct mlx5e_priv *priv;
 	struct mlx5e_clbr_point *next, *curr;
 	struct timespec ts;
 	int clbr_curr_next;
 
 	priv = arg;
 	curr = &priv->clbr_points[priv->clbr_curr];
 	/* pick the next slot, wrapping around at the end of the array */
 	clbr_curr_next = priv->clbr_curr + 1;
 	if (clbr_curr_next >= nitems(priv->clbr_points))
 		clbr_curr_next = 0;
 	next = &priv->clbr_points[clbr_curr_next];
 
 	/* the previous sample of the new point is the current sample */
 	next->base_prev = curr->base_curr;
 	next->clbr_hw_prev = curr->clbr_hw_curr;
 
 	next->clbr_hw_curr = mlx5e_hw_clock(priv);
 	/* check that the adapter clock advanced beyond the precision shift */
 	if (((next->clbr_hw_curr - curr->clbr_hw_curr) >> MLX5E_TSTMP_PREC) ==
 	    0) {
 		if (priv->clbr_done != 0) {
 			/*
 			 * NOTE(review): the comparison above uses
 			 * curr->clbr_hw_curr, while the message prints
 			 * curr->clbr_hw_prev - confirm which is intended.
 			 */
 			mlx5_en_err(priv->ifp,
 			    "HW failed tstmp frozen %#jx %#jx, disabling\n",
 			     next->clbr_hw_curr, curr->clbr_hw_prev);
 			priv->clbr_done = 0;
 		}
 		atomic_store_rel_int(&curr->clbr_gen, 0);
 		return;
 	}
 
 	nanouptime(&ts);
 	next->base_curr = mlx5e_timespec2usec(&ts);
 
 	/*
 	 * zero the old point's generation, then switch the current index
 	 * and publish the new point with a release store
 	 */
 	curr->clbr_gen = 0;
 	atomic_thread_fence_rel();
 	priv->clbr_curr = clbr_curr_next;
 	atomic_store_rel_int(&next->clbr_gen, ++(priv->clbr_gen));
 
 	/* "clbr_done" saturates at the configured calibration duration */
 	if (priv->clbr_done < mlx5e_calibration_duration)
 		priv->clbr_done++;
 	mlx5e_reset_calibration_callout(priv);
 }
 
 /*
  * String table generated by expanding the MLX5E_RQ_STATS() list with
  * MLX5E_STATS_DESC; passed to mlx5e_create_stats() when the per-RQ
  * statistics sysctl nodes are created.
  */
 static const char *mlx5e_rq_stats_desc[] = {
 	MLX5E_RQ_STATS(MLX5E_STATS_DESC)
 };
 
 /*
  * Allocate the software resources of a receive queue: the DMA tag,
  * the linked-list work queue, the LRO state, the per-WQE mbuf array
  * with one DMA map per entry, the dynamic interrupt moderation work
  * item and the "rxstat<N>" statistics sysctl nodes.
  *
  * Returns zero on success.  On failure everything allocated so far is
  * released in reverse order and the error is returned; the results of
  * the bus_dma and LRO calls are negated before being returned.
  */
 static int
 mlx5e_create_rq(struct mlx5e_channel *c,
     struct mlx5e_rq_param *param,
     struct mlx5e_rq *rq)
 {
 	struct mlx5e_priv *priv = c->priv;
 	struct mlx5_core_dev *mdev = priv->mdev;
 	char buffer[16];
 	void *rqc = param->rqc;
 	void *rqc_wq = MLX5_ADDR_OF(rqc, rqc, wq);
 	int wq_sz;
 	int err;
 	int i;
 	u32 nsegs, wqe_sz;
 
 	/* only "nsegs" is used from this first query, to size the DMA tag */
 	err = mlx5e_get_wqe_sz(priv, &wqe_sz, &nsegs);
 	if (err != 0)
 		goto done;
 
 	/* Create DMA descriptor TAG */
 	if ((err = -bus_dma_tag_create(
 	    bus_get_dma_tag(mdev->pdev->dev.bsddev),
 	    1,				/* any alignment */
 	    0,				/* no boundary */
 	    BUS_SPACE_MAXADDR,		/* lowaddr */
 	    BUS_SPACE_MAXADDR,		/* highaddr */
 	    NULL, NULL,			/* filter, filterarg */
 	    nsegs * MLX5E_MAX_RX_BYTES,	/* maxsize */
 	    nsegs,			/* nsegments */
 	    nsegs * MLX5E_MAX_RX_BYTES,	/* maxsegsize */
 	    0,				/* flags */
 	    NULL, NULL,			/* lockfunc, lockfuncarg */
 	    &rq->dma_tag)))
 		goto done;
 
 	err = mlx5_wq_ll_create(mdev, &param->wq, rqc_wq, &rq->wq,
 	    &rq->wq_ctrl);
 	if (err)
 		goto err_free_dma_tag;
 
 	/* point the doorbell at the receive doorbell record */
 	rq->wq.db = &rq->wq.db[MLX5_RCV_DBR];
 
 	/* query again, this time storing the values in the RQ itself */
 	err = mlx5e_get_wqe_sz(priv, &rq->wqe_sz, &rq->nsegs);
 	if (err != 0)
 		goto err_rq_wq_destroy;
 
 	wq_sz = mlx5_wq_ll_get_size(&rq->wq);
 
 	err = -tcp_lro_init_args(&rq->lro, priv->ifp, TCP_LRO_ENTRIES, wq_sz);
 	if (err)
 		goto err_rq_wq_destroy;
 
 	rq->mbuf = malloc(wq_sz * sizeof(rq->mbuf[0]), M_MLX5EN, M_WAITOK | M_ZERO);
 	for (i = 0; i != wq_sz; i++) {
 		struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, i);
 		int j;
 
 		err = -bus_dmamap_create(rq->dma_tag, 0, &rq->mbuf[i].dma_map);
 		if (err != 0) {
 			/* destroy the maps created by the earlier iterations */
 			while (i--)
 				bus_dmamap_destroy(rq->dma_tag, rq->mbuf[i].dma_map);
 			goto err_rq_mbuf_free;
 		}
 
 		/* set value for constant fields */
 		for (j = 0; j < rq->nsegs; j++)
 			wqe->data[j].lkey = cpu_to_be32(priv->mr.key);
 	}
 
 	INIT_WORK(&rq->dim.work, mlx5e_dim_work);
 	/* moderation modes below two leave dynamic moderation disabled */
 	if (priv->params.rx_cq_moderation_mode < 2) {
 		rq->dim.mode = NET_DIM_CQ_PERIOD_MODE_DISABLED;
 	} else {
 		/* "param" is the "rq" member of a channel parameter structure */
 		void *cqc = container_of(param,
 		    struct mlx5e_channel_param, rq)->rx_cq.cqc;
 
 		switch (MLX5_GET(cqc, cqc, cq_period_mode)) {
 		case MLX5_CQ_PERIOD_MODE_START_FROM_EQE:
 			rq->dim.mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE;
 			break;
 		case MLX5_CQ_PERIOD_MODE_START_FROM_CQE:
 			rq->dim.mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE;
 			break;
 		default:
 			rq->dim.mode = NET_DIM_CQ_PERIOD_MODE_DISABLED;
 			break;
 		}
 	}
 
 	rq->ifp = priv->ifp;
 	rq->channel = c;
 	rq->ix = c->ix;
 
 	/* create the per-RQ statistics sysctl nodes, named by channel index */
 	snprintf(buffer, sizeof(buffer), "rxstat%d", c->ix);
 	mlx5e_create_stats(&rq->stats.ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet),
 	    buffer, mlx5e_rq_stats_desc, MLX5E_RQ_STATS_NUM,
 	    rq->stats.arg);
 	return (0);
 
 	/* error unwinding, in reverse order of allocation */
 err_rq_mbuf_free:
 	free(rq->mbuf, M_MLX5EN);
 	tcp_lro_free(&rq->lro);
 err_rq_wq_destroy:
 	mlx5_wq_destroy(&rq->wq_ctrl);
 err_free_dma_tag:
 	bus_dma_tag_destroy(rq->dma_tag);
 done:
 	return (err);
 }
 
 /*
  * Release the software resources of a receive queue: the statistics
  * sysctl nodes, the LRO state, every mbuf still attached to a work
  * queue entry together with its DMA map, the mbuf array, the work
  * queue and finally the DMA tag.
  */
 static void
 mlx5e_destroy_rq(struct mlx5e_rq *rq)
 {
 	int nwqe;
 	int idx;
 
 	/* destroy all sysctl nodes */
 	sysctl_ctx_free(&rq->stats.ctx);
 
 	/* free leftover LRO packets, if any */
 	tcp_lro_free(&rq->lro);
 
 	nwqe = mlx5_wq_ll_get_size(&rq->wq);
 	idx = 0;
 	while (idx != nwqe) {
 		/* unload and free the mbuf, if one is still attached */
 		if (rq->mbuf[idx].mbuf != NULL) {
 			bus_dmamap_unload(rq->dma_tag, rq->mbuf[idx].dma_map);
 			m_freem(rq->mbuf[idx].mbuf);
 		}
 		bus_dmamap_destroy(rq->dma_tag, rq->mbuf[idx].dma_map);
 		idx++;
 	}
 
 	free(rq->mbuf, M_MLX5EN);
 	mlx5_wq_destroy(&rq->wq_ctrl);
 	bus_dma_tag_destroy(rq->dma_tag);
 }
 
 static int
 mlx5e_enable_rq(struct mlx5e_rq *rq, struct mlx5e_rq_param *param)
 {
 	struct mlx5e_channel *c = rq->channel;
 	struct mlx5e_priv *priv = c->priv;
 	struct mlx5_core_dev *mdev = priv->mdev;
 
 	void *in;
 	void *rqc;
 	void *wq;
 	int inlen;
 	int err;
 
 	inlen = MLX5_ST_SZ_BYTES(create_rq_in) +
 	    sizeof(u64) * rq->wq_ctrl.buf.npages;
 	in = mlx5_vzalloc(inlen);
 	if (in == NULL)
 		return (-ENOMEM);
 
 	rqc = MLX5_ADDR_OF(create_rq_in, in, ctx);
 	wq = MLX5_ADDR_OF(rqc, rqc, wq);
 
 	memcpy(rqc, param->rqc, sizeof(param->rqc));
 
 	MLX5_SET(rqc, rqc, cqn, c->rq.cq.mcq.cqn);
 	MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RST);
 	MLX5_SET(rqc, rqc, flush_in_error_en, 1);
 	if (priv->counter_set_id >= 0)
 		MLX5_SET(rqc, rqc, counter_set_id, priv->counter_set_id);
 	MLX5_SET(wq, wq, log_wq_pg_sz, rq->wq_ctrl.buf.page_shift -
 	    PAGE_SHIFT);
 	MLX5_SET64(wq, wq, dbr_addr, rq->wq_ctrl.db.dma);
 
 	mlx5_fill_page_array(&rq->wq_ctrl.buf,
 	    (__be64 *) MLX5_ADDR_OF(wq, wq, pas));
 
 	err = mlx5_core_create_rq(mdev, in, inlen, &rq->rqn);
 
 	kvfree(in);
 
 	return (err);
 }
 
 static int
 mlx5e_modify_rq(struct mlx5e_rq *rq, int curr_state, int next_state)
 {
 	struct mlx5e_channel *c = rq->channel;
 	struct mlx5e_priv *priv = c->priv;
 	struct mlx5_core_dev *mdev = priv->mdev;
 
 	void *in;
 	void *rqc;
 	int inlen;
 	int err;
 
 	inlen = MLX5_ST_SZ_BYTES(modify_rq_in);
 	in = mlx5_vzalloc(inlen);
 	if (in == NULL)
 		return (-ENOMEM);
 
 	rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx);
 
 	MLX5_SET(modify_rq_in, in, rqn, rq->rqn);
 	MLX5_SET(modify_rq_in, in, rq_state, curr_state);
 	MLX5_SET(rqc, rqc, state, next_state);
 
 	err = mlx5_core_modify_rq(mdev, in, inlen);
 
 	kvfree(in);
 
 	return (err);
 }
 
 /*
  * Destroy the firmware receive queue object identified by "rq->rqn".
  */
 static void
 mlx5e_disable_rq(struct mlx5e_rq *rq)
 {
 	mlx5_core_destroy_rq(rq->channel->priv->mdev, rq->rqn);
 }
 
 /*
  * Poll until the receive work queue holds at least
  * "params.min_rx_wqes" entries.  The fill level is checked up to 1000
  * times with msleep(4) between checks.  Returns zero once the level
  * is reached and -ETIMEDOUT otherwise.
  */
 static int
 mlx5e_wait_for_min_rx_wqes(struct mlx5e_rq *rq)
 {
 	struct mlx5e_priv *priv = rq->channel->priv;
 	struct mlx5_wq_ll *wq = &rq->wq;
 	int attempts = 1000;
 
 	while (attempts-- > 0) {
 		if (wq->cur_sz >= priv->params.min_rx_wqes)
 			return (0);
 
 		msleep(4);
 	}
 
 	return (-ETIMEDOUT);
 }
 
 /*
  * Bring up a receive queue: allocate its software state, create the
  * firmware object and move it from the reset to the ready state.  On
  * success the channel's RQ is marked enabled and zero is returned.  A
  * failing step undoes the steps before it and its error is returned.
  */
 static int
 mlx5e_open_rq(struct mlx5e_channel *c,
     struct mlx5e_rq_param *param,
     struct mlx5e_rq *rq)
 {
 	int err;
 
 	err = mlx5e_create_rq(c, param, rq);
 	if (err != 0)
 		return (err);
 
 	err = mlx5e_enable_rq(rq, param);
 	if (err == 0) {
 		err = mlx5e_modify_rq(rq, MLX5_RQC_STATE_RST,
 		    MLX5_RQC_STATE_RDY);
 		if (err == 0) {
 			c->rq.enabled = 1;
 			return (0);
 		}
 
 		/* the firmware object exists, but could not be made ready */
 		mlx5e_disable_rq(rq);
 	}
 
 	mlx5e_destroy_rq(rq);
 
 	return (err);
 }
 
 /*
  * First stage of closing a receive queue: with the RQ mutex held,
  * clear the "enabled" flag and stop the RQ watchdog callout, then
  * request the transition from the ready to the error state.  The
  * result of that transition is not checked.
  */
 static void
 mlx5e_close_rq(struct mlx5e_rq *rq)
 {
 	mtx_lock(&rq->mtx);
 	rq->enabled = 0;
 	callout_stop(&rq->watchdog);
 	mtx_unlock(&rq->mtx);
 
 	mlx5e_modify_rq(rq, MLX5_RQC_STATE_RDY, MLX5_RQC_STATE_ERR);
 }
 
 /*
  * Second stage of closing a receive queue: destroy the firmware RQ
  * object, close its completion queue, wait for the dynamic moderation
  * work item to be cancelled and release the software state.
  */
 static void
 mlx5e_close_rq_wait(struct mlx5e_rq *rq)
 {
 
 	mlx5e_disable_rq(rq);
 	mlx5e_close_cq(&rq->cq);
 	cancel_work_sync(&rq->dim.work);
 	mlx5e_destroy_rq(rq);
 }
 
 /*
  * Free the per-WQE bookkeeping array of a send queue.  For every
  * entry a still-recorded reference count is decremented and cleared,
  * a still-attached mbuf is unloaded and freed, and the DMA map is
  * destroyed.  The array itself is freed last.
  */
 void
 mlx5e_free_sq_db(struct mlx5e_sq *sq)
 {
 	const int nwqe = mlx5_wq_cyc_get_size(&sq->wq);
 	int idx = 0;
 
 	while (idx != nwqe) {
 		/* drop the reference recorded for this entry, if any */
 		if (unlikely(sq->mbuf[idx].p_refcount != NULL)) {
 			atomic_add_int(sq->mbuf[idx].p_refcount, -1);
 			sq->mbuf[idx].p_refcount = NULL;
 		}
 
 		/* release the mbuf which is still attached, if any */
 		if (sq->mbuf[idx].mbuf != NULL) {
 			bus_dmamap_unload(sq->dma_tag, sq->mbuf[idx].dma_map);
 			m_freem(sq->mbuf[idx].mbuf);
 		}
 
 		bus_dmamap_destroy(sq->dma_tag, sq->mbuf[idx].dma_map);
 		idx++;
 	}
 
 	free(sq->mbuf, M_MLX5EN);
 }
 
 /*
  * Allocate the zeroed per-WQE bookkeeping array of a send queue and
  * create one DMA map per entry.  Returns zero on success.  When a map
  * cannot be created, the maps created so far and the array are freed
  * and the negated bus_dmamap_create() result is returned.
  */
 int
 mlx5e_alloc_sq_db(struct mlx5e_sq *sq)
 {
 	const int nwqe = mlx5_wq_cyc_get_size(&sq->wq);
 	int err;
 	int idx;
 
 	sq->mbuf = malloc(nwqe * sizeof(sq->mbuf[0]), M_MLX5EN,
 	    M_WAITOK | M_ZERO);
 
 	/* Create DMA descriptor MAPs */
 	for (idx = 0; idx != nwqe; idx++) {
 		err = -bus_dmamap_create(sq->dma_tag, 0,
 		    &sq->mbuf[idx].dma_map);
 		if (err == 0)
 			continue;
 
 		/* undo the maps of all earlier entries */
 		while (idx--)
 			bus_dmamap_destroy(sq->dma_tag, sq->mbuf[idx].dma_map);
 		free(sq->mbuf, M_MLX5EN);
 		return (err);
 	}
 
 	return (0);
 }
 
 /*
  * String table generated by expanding the MLX5E_SQ_STATS() list with
  * MLX5E_STATS_DESC; passed to mlx5e_create_stats() when the per-SQ
  * statistics sysctl nodes are created.
  */
 static const char *mlx5e_sq_stats_desc[] = {
 	MLX5E_SQ_STATS(MLX5E_STATS_DESC)
 };
 
 void
 mlx5e_update_sq_inline(struct mlx5e_sq *sq)
 {
 	sq->max_inline = sq->priv->params.tx_max_inline;
 	sq->min_inline_mode = sq->priv->params.tx_min_inline_mode;
 
 	/*
 	 * Check if trust state is DSCP or if inline mode is NONE which
 	 * indicates CX-5 or newer hardware.
 	 */
 	if (sq->priv->params_ethtool.trust_state != MLX5_QPTS_TRUST_PCP ||
 	    sq->min_inline_mode == MLX5_INLINE_MODE_NONE) {
 		if (MLX5_CAP_ETH(sq->priv->mdev, wqe_vlan_insert))
 			sq->min_insert_caps = MLX5E_INSERT_VLAN | MLX5E_INSERT_NON_VLAN;
 		else
 			sq->min_insert_caps = MLX5E_INSERT_NON_VLAN;
 	} else {
 		sq->min_insert_caps = 0;
 	}
 }
 
 static void
 mlx5e_refresh_sq_inline_sub(struct mlx5e_priv *priv, struct mlx5e_channel *c)
 {
 	int i;
 
 	for (i = 0; i != priv->num_tc; i++) {
 		mtx_lock(&c->sq[i].lock);
 		mlx5e_update_sq_inline(&c->sq[i]);
 		mtx_unlock(&c->sq[i].lock);
 	}
 }
 
 /*
  * Refresh the inline parameters of the send queues of all channels.
  * Nothing is done unless the interface is in the opened state.
  */
 void
 mlx5e_refresh_sq_inline(struct mlx5e_priv *priv)
 {
 	int ch;
 
 	/* the channels only exist while the interface is opened */
 	if (test_bit(MLX5E_STATE_OPENED, &priv->state) != 0) {
 		for (ch = 0; ch < priv->params.num_channels; ch++)
 			mlx5e_refresh_sq_inline_sub(priv, priv->channel + ch);
 	}
 }
 
 static int
 mlx5e_create_sq(struct mlx5e_channel *c,
     int tc,
     struct mlx5e_sq_param *param,
     struct mlx5e_sq *sq)
 {
 	struct mlx5e_priv *priv = c->priv;
 	struct mlx5_core_dev *mdev = priv->mdev;
 	char buffer[16];
 	void *sqc = param->sqc;
 	void *sqc_wq = MLX5_ADDR_OF(sqc, sqc, wq);
 	int err;
 
 	/* Create DMA descriptor TAG */
 	if ((err = -bus_dma_tag_create(
 	    bus_get_dma_tag(mdev->pdev->dev.bsddev),
 	    1,				/* any alignment */
 	    0,				/* no boundary */
 	    BUS_SPACE_MAXADDR,		/* lowaddr */
 	    BUS_SPACE_MAXADDR,		/* highaddr */
 	    NULL, NULL,			/* filter, filterarg */
 	    MLX5E_MAX_TX_PAYLOAD_SIZE,	/* maxsize */
 	    MLX5E_MAX_TX_MBUF_FRAGS,	/* nsegments */
 	    MLX5E_MAX_TX_MBUF_SIZE,	/* maxsegsize */
 	    0,				/* flags */
 	    NULL, NULL,			/* lockfunc, lockfuncarg */
 	    &sq->dma_tag)))
 		goto done;
 
 	err = mlx5_alloc_map_uar(mdev, &sq->uar);
 	if (err)
 		goto err_free_dma_tag;
 
 	err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq,
 	    &sq->wq_ctrl);
 	if (err)
 		goto err_unmap_free_uar;
 
 	sq->wq.db = &sq->wq.db[MLX5_SND_DBR];
 	sq->bf_buf_size = (1 << MLX5_CAP_GEN(mdev, log_bf_reg_size)) / 2;
 
 	err = mlx5e_alloc_sq_db(sq);
 	if (err)
 		goto err_sq_wq_destroy;
 
 	sq->mkey_be = cpu_to_be32(priv->mr.key);
 	sq->ifp = priv->ifp;
 	sq->priv = priv;
 	sq->tc = tc;
 
 	mlx5e_update_sq_inline(sq);
 
 	snprintf(buffer, sizeof(buffer), "txstat%dtc%d", c->ix, tc);
 	mlx5e_create_stats(&sq->stats.ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet),
 	    buffer, mlx5e_sq_stats_desc, MLX5E_SQ_STATS_NUM,
 	    sq->stats.arg);
 
 	return (0);
 
 err_sq_wq_destroy:
 	mlx5_wq_destroy(&sq->wq_ctrl);
 
 err_unmap_free_uar:
 	mlx5_unmap_free_uar(mdev, &sq->uar);
 
 err_free_dma_tag:
 	bus_dma_tag_destroy(sq->dma_tag);
 done:
 	return (err);
 }
 
 /*
  * Release the software resources of a send queue: the statistics
  * sysctl nodes, the per-WQE bookkeeping array, the work queue, the
  * UAR and the DMA tag.
  */
 static void
 mlx5e_destroy_sq(struct mlx5e_sq *sq)
 {
 	/* destroy all sysctl nodes */
 	sysctl_ctx_free(&sq->stats.ctx);
 
 	mlx5e_free_sq_db(sq);
 	mlx5_wq_destroy(&sq->wq_ctrl);
 	mlx5_unmap_free_uar(sq->priv->mdev, &sq->uar);
 	bus_dma_tag_destroy(sq->dma_tag);
 }
 
 int
 mlx5e_enable_sq(struct mlx5e_sq *sq, struct mlx5e_sq_param *param,
     int tis_num)
 {
 	void *in;
 	void *sqc;
 	void *wq;
 	int inlen;
 	int err;
 
 	inlen = MLX5_ST_SZ_BYTES(create_sq_in) +
 	    sizeof(u64) * sq->wq_ctrl.buf.npages;
 	in = mlx5_vzalloc(inlen);
 	if (in == NULL)
 		return (-ENOMEM);
 
 	sqc = MLX5_ADDR_OF(create_sq_in, in, ctx);
 	wq = MLX5_ADDR_OF(sqc, sqc, wq);
 
 	memcpy(sqc, param->sqc, sizeof(param->sqc));
 
 	MLX5_SET(sqc, sqc, tis_num_0, tis_num);
 	MLX5_SET(sqc, sqc, cqn, sq->cq.mcq.cqn);
 	MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RST);
 	MLX5_SET(sqc, sqc, tis_lst_sz, 1);
 	MLX5_SET(sqc, sqc, flush_in_error_en, 1);
 
 	MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC);
 	MLX5_SET(wq, wq, uar_page, sq->uar.index);
 	MLX5_SET(wq, wq, log_wq_pg_sz, sq->wq_ctrl.buf.page_shift -
 	    PAGE_SHIFT);
 	MLX5_SET64(wq, wq, dbr_addr, sq->wq_ctrl.db.dma);
 
 	mlx5_fill_page_array(&sq->wq_ctrl.buf,
 	    (__be64 *) MLX5_ADDR_OF(wq, wq, pas));
 
 	err = mlx5_core_create_sq(sq->priv->mdev, in, inlen, &sq->sqn);
 
 	kvfree(in);
 
 	return (err);
 }
 
 int
 mlx5e_modify_sq(struct mlx5e_sq *sq, int curr_state, int next_state)
 {
 	void *in;
 	void *sqc;
 	int inlen;
 	int err;
 
 	inlen = MLX5_ST_SZ_BYTES(modify_sq_in);
 	in = mlx5_vzalloc(inlen);
 	if (in == NULL)
 		return (-ENOMEM);
 
 	sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx);
 
 	MLX5_SET(modify_sq_in, in, sqn, sq->sqn);
 	MLX5_SET(modify_sq_in, in, sq_state, curr_state);
 	MLX5_SET(sqc, sqc, state, next_state);
 
 	err = mlx5_core_modify_sq(sq->priv->mdev, in, inlen);
 
 	kvfree(in);
 
 	return (err);
 }
 
 /*
  * Destroy the hardware send queue object identified by sq->sqn.
  */
 void
 mlx5e_disable_sq(struct mlx5e_sq *sq)
 {
 	struct mlx5_core_dev *mdev = sq->priv->mdev;
 
 	mlx5_core_destroy_sq(mdev, sq->sqn);
 }
 
 /*
  * Bring up one send queue of channel "c" for traffic class "tc":
  * create the software state, create the hardware object, move it
  * from RST to RDY and finally mark the queue as running.
  *
  * On failure everything set up so far is undone and the error of the
  * failing step is returned; zero is returned on success.
  */
 static int
 mlx5e_open_sq(struct mlx5e_channel *c,
     int tc,
     struct mlx5e_sq_param *param,
     struct mlx5e_sq *sq)
 {
 	int rc;
 
 	/* take the completion event factor from the settings, minimum one */
 	sq->cev_factor = c->priv->params_ethtool.tx_completion_fact;
 	if (sq->cev_factor == 0)
 		sq->cev_factor = 1;
 
 	rc = mlx5e_create_sq(c, tc, param, sq);
 	if (rc != 0)
 		return (rc);
 
 	rc = mlx5e_enable_sq(sq, param, c->priv->tisn[tc]);
 	if (rc != 0) {
 		mlx5e_destroy_sq(sq);
 		return (rc);
 	}
 
 	rc = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RST, MLX5_SQC_STATE_RDY);
 	if (rc != 0) {
 		mlx5e_disable_sq(sq);
 		mlx5e_destroy_sq(sq);
 		return (rc);
 	}
 
 	WRITE_ONCE(sq->running, 1);
 
 	return (0);
 }
 
 /*
  * Post NOP work requests until sq->cev_counter reads zero, then
  * write the doorbell if one is pending.
  *
  * The "_locked" suffix and the unlock/lock pair below indicate that
  * the caller holds sq->lock.  When the ring has no room and
  * "can_sleep" is non-zero the lock is dropped around a 4 unit
  * msleep(); when "can_sleep" is zero the function stops posting NOPs
  * and goes straight to the doorbell check.
  */
 static void
 mlx5e_sq_send_nops_locked(struct mlx5e_sq *sq, int can_sleep)
 {
 	for (;;) {
 		/* nothing left to pad */
 		if (sq->cev_counter == 0)
 			break;
 
 		/* wait for one free slot in the ring */
 		while (!mlx5e_sq_has_room_for(sq, 1)) {
 			if (can_sleep == 0)
 				goto ring_doorbell;
 
 			mtx_unlock(&sq->lock);
 			msleep(4);
 			mtx_lock(&sq->lock);
 		}
 
 		/* post exactly one NOP */
 		mlx5e_send_nop(sq, 1);
 		atomic_thread_fence_rel();
 	}
 
 ring_doorbell:
 	/* a non-zero doorbell value means there is something to announce */
 	if (likely(sq->doorbell.d64 != 0)) {
 		mlx5e_tx_notify_hw(sq, sq->doorbell.d32, 0);
 		sq->doorbell.d64 = 0;
 	}
 }
 
 /*
  * Callout handler for the completion event factor timer of a send
  * queue; "arg" is the send queue.  Must be called with sq->lock held.
  *
  * In the SEND_NOPS state the ring is padded with NOPs without
  * sleeping; once sq->cev_counter reaches zero the state returns to
  * INITIAL and the timer is left stopped.  In any other state the
  * next state becomes SEND_NOPS.  Unless the handler returned early
  * the callout is rearmed for "hz" ticks.
  */
 void
 mlx5e_sq_cev_timeout(void *arg)
 {
 	struct mlx5e_sq *sq = arg;
 
 	mtx_assert(&sq->lock, MA_OWNED);
 
 	if (sq->cev_next_state == MLX5E_CEV_STATE_SEND_NOPS) {
 		/* pad the TX ring with NOPs, if needed */
 		mlx5e_sq_send_nops_locked(sq, 0);
 
 		/* done when no more NOPs are outstanding */
 		if (sq->cev_counter == 0) {
 			sq->cev_next_state = MLX5E_CEV_STATE_INITIAL;
 			return;
 		}
 	} else {
 		/* pad with NOPs when the timer fires again */
 		sq->cev_next_state = MLX5E_CEV_STATE_SEND_NOPS;
 	}
 
 	/* rearm the timer */
 	callout_reset_curcpu(&sq->cev_callout, hz, mlx5e_sq_cev_timeout, sq);
 }
 
 /*
  * Stop a running send queue and wait for it to drain.
  *
  * The queue is marked as not running, its completion event timer is
  * stopped and the ring is padded with NOPs.  The completion handler
  * is then polled until the consumer counter catches up with the
  * producer counter, the link is no longer active or the device is in
  * the internal error state.  After that the queue is moved from RDY
  * to ERR and polled again until it is empty or the device is in the
  * internal error state.
  *
  * Returns immediately when the queue is not running.
  */
 void
 mlx5e_drain_sq(struct mlx5e_sq *sq)
 {
 	int error;
 	struct mlx5_core_dev *mdev= sq->priv->mdev;
 
 	/*
 	 * Check if already stopped.
 	 *
 	 * NOTE: Serialization of this function is managed by the
 	 * caller ensuring the priv's state lock is locked or in case
 	 * of rate limit support, a single thread manages drain and
 	 * resume of SQs. The "running" variable can therefore safely
 	 * be read without any locks.
 	 */
 	if (READ_ONCE(sq->running) == 0)
 		return;
 
 	/* don't put more packets into the SQ */
 	WRITE_ONCE(sq->running, 0);
 
 	/* serialize access to DMA rings */
 	mtx_lock(&sq->lock);
 
 	/*
 	 * Teardown event factor timer, if any.  The HOLD_NOPS state is
 	 * set before the callout is stopped.
 	 */
 	sq->cev_next_state = MLX5E_CEV_STATE_HOLD_NOPS;
 	callout_stop(&sq->cev_callout);
 
 	/* send dummy NOPs in order to flush the transmit ring; may sleep */
 	mlx5e_sq_send_nops_locked(sq, 1);
 	mtx_unlock(&sq->lock);
 
 	/*
 	 * Wait till SQ is empty or link is down.
 	 *
 	 * NOTE(review): the lock was released just above and is taken
 	 * again right here; presumably the two sections could be
 	 * merged - confirm before changing.
 	 */
 	mtx_lock(&sq->lock);
 	while (sq->cc != sq->pc &&
 	    (sq->priv->media_status_last & IFM_ACTIVE) != 0 &&
 	    mdev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR) {
 		/* poll the completion handler with the lock dropped */
 		mtx_unlock(&sq->lock);
 		msleep(1);
 		sq->cq.mcq.comp(&sq->cq.mcq);
 		mtx_lock(&sq->lock);
 	}
 	mtx_unlock(&sq->lock);
 
 	/* error out remaining requests; a failure is logged, not returned */
 	error = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RDY, MLX5_SQC_STATE_ERR);
 	if (error != 0) {
 		mlx5_en_err(sq->ifp,
 		    "mlx5e_modify_sq() from RDY to ERR failed: %d\n", error);
 	}
 
 	/* wait till SQ is empty; the link state is not checked this time */
 	mtx_lock(&sq->lock);
 	while (sq->cc != sq->pc &&
 	       mdev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR) {
 		mtx_unlock(&sq->lock);
 		msleep(1);
 		sq->cq.mcq.comp(&sq->cq.mcq);
 		mtx_lock(&sq->lock);
 	}
 	mtx_unlock(&sq->lock);
 }
 
 /*
  * Fully shut down a send queue: drain it, destroy the hardware
  * object and then release the software resources.
  */
 static void
 mlx5e_close_sq_wait(struct mlx5e_sq *sq)
 {
 
 	mlx5e_drain_sq(sq);	/* stop and flush the ring */
 	mlx5e_disable_sq(sq);	/* destroy the hardware SQ */
 	mlx5e_destroy_sq(sq);	/* free the software state */
 }
 
 /*
  * Set up the software side of a completion queue.
  *
  * The interrupt number for "eq_ix" is looked up, the CQ work queue is
  * created from "param", the core CQ fields (doorbells, vector,
  * completion and event callbacks, IRQ, UAR) are filled in and the
  * "op_own" byte of every CQE is set to 0xf1.
  *
  * Returns zero on success, else the error from mlx5_vector2eqn() or
  * mlx5_cqwq_create().
  */
 static int
 mlx5e_create_cq(struct mlx5e_priv *priv,
     struct mlx5e_cq_param *param,
     struct mlx5e_cq *cq,
     mlx5e_cq_comp_t *comp,
     int eq_ix)
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
 	struct mlx5_core_cq *mcq = &cq->mcq;
 	struct mlx5_cqe64 *cqe;
 	int unused_eqn;
 	int irq;
 	int rc;
 	u32 idx;
 
 	param->wq.buf_numa_node = 0;
 	param->wq.db_numa_node = 0;
 
 	/* only the IRQ number is needed here */
 	rc = mlx5_vector2eqn(mdev, eq_ix, &unused_eqn, &irq);
 	if (rc != 0)
 		return (rc);
 
 	rc = mlx5_cqwq_create(mdev, &param->wq, param->cqc, &cq->wq,
 	    &cq->wq_ctrl);
 	if (rc != 0)
 		return (rc);
 
 	mcq->cqe_sz = 64;
 
 	/* two consecutive doorbell words: consumer index, then arm */
 	mcq->set_ci_db = cq->wq_ctrl.db.db;
 	mcq->arm_db = cq->wq_ctrl.db.db + 1;
 	*mcq->set_ci_db = 0;
 	*mcq->arm_db = 0;
 
 	mcq->vector = eq_ix;
 	mcq->comp = comp;
 	mcq->event = mlx5e_cq_error_event;
 	mcq->irqn = irq;
 	mcq->uar = &priv->cq_uar;
 
 	/* give every completion entry its initial op_own value */
 	for (idx = 0; idx < mlx5_cqwq_get_size(&cq->wq); idx++) {
 		cqe = mlx5_cqwq_get_wqe(&cq->wq, idx);
 		cqe->op_own = 0xf1;
 	}
 
 	cq->priv = priv;
 
 	return (0);
 }
 
 /*
  * Release the work queue that backs a completion queue.
  */
 static void
 mlx5e_destroy_cq(struct mlx5e_cq *cq)
 {
 
 	mlx5_wq_destroy(&cq->wq_ctrl);
 }
 
 /*
  * Create the hardware completion queue object backing "cq" and arm
  * it.
  *
  * The CREATE_CQ command is seeded from param->cqc and completed with
  * the "pas" array filled in by mlx5_fill_page_array(), the event
  * queue number for "eq_ix", the UAR index, the page size and the
  * doorbell DMA address.
  *
  * Returns zero on success, the error from mlx5_vector2eqn() when the
  * event queue lookup fails, -ENOMEM when the command buffer cannot be
  * allocated, else the error from mlx5_core_create_cq().
  */
 static int
 mlx5e_enable_cq(struct mlx5e_cq *cq, struct mlx5e_cq_param *param, int eq_ix)
 {
 	struct mlx5_core_cq *mcq = &cq->mcq;
 	void *in;
 	void *cqc;
 	int inlen;
 	int irqn_not_used;
 	int eqn;
 	int err;
 
 	/*
 	 * Resolve the event queue number before anything is allocated.
 	 * When the lookup fails "eqn" may be left unset, so bail out
 	 * instead of writing an indeterminate value into the context.
 	 */
 	err = mlx5_vector2eqn(cq->priv->mdev, eq_ix, &eqn, &irqn_not_used);
 	if (err)
 		return (err);
 
 	/* the command is followed by one 64-bit entry per buffer page */
 	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
 	    sizeof(u64) * cq->wq_ctrl.buf.npages;
 	in = mlx5_vzalloc(inlen);
 	if (in == NULL)
 		return (-ENOMEM);
 
 	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
 
 	/* start from the caller supplied template */
 	memcpy(cqc, param->cqc, sizeof(param->cqc));
 
 	mlx5_fill_page_array(&cq->wq_ctrl.buf,
 	    (__be64 *) MLX5_ADDR_OF(create_cq_in, in, pas));
 
 	MLX5_SET(cqc, cqc, c_eqn, eqn);
 	MLX5_SET(cqc, cqc, uar_page, mcq->uar->index);
 	MLX5_SET(cqc, cqc, log_page_size, cq->wq_ctrl.buf.page_shift -
 	    PAGE_SHIFT);
 	MLX5_SET64(cqc, cqc, dbr_addr, cq->wq_ctrl.db.dma);
 
 	err = mlx5_core_create_cq(cq->priv->mdev, mcq, in, inlen);
 
 	/* the command buffer is only needed for the call itself */
 	kvfree(in);
 
 	if (err)
 		return (err);
 
 	mlx5e_cq_arm(cq, MLX5_GET_DOORBELL_LOCK(&cq->priv->doorbell_lock));
 
 	return (0);
 }
 
 /*
  * Destroy the hardware completion queue object of "cq".
  */
 static void
 mlx5e_disable_cq(struct mlx5e_cq *cq)
 {
 	struct mlx5_core_dev *mdev = cq->priv->mdev;
 
 	mlx5_core_destroy_cq(mdev, &cq->mcq);
 }
 
 /*
  * Open a completion queue: set up the software state and then create
  * the hardware object.  When the second step fails the software
  * state is released again.
  *
  * Returns zero on success, else the error of the failing step.
  */
 int
 mlx5e_open_cq(struct mlx5e_priv *priv,
     struct mlx5e_cq_param *param,
     struct mlx5e_cq *cq,
     mlx5e_cq_comp_t *comp,
     int eq_ix)
 {
 	int rc;
 
 	rc = mlx5e_create_cq(priv, param, cq, comp, eq_ix);
 	if (rc != 0)
 		return (rc);
 
 	rc = mlx5e_enable_cq(cq, param, eq_ix);
 	if (rc != 0) {
 		/* undo the software setup */
 		mlx5e_destroy_cq(cq);
 		return (rc);
 	}
 
 	return (0);
 }
 
 /*
  * Close a completion queue: destroy the hardware object first, then
  * release the software state.
  */
 void
 mlx5e_close_cq(struct mlx5e_cq *cq)
 {
 
 	mlx5e_disable_cq(cq);
 	mlx5e_destroy_cq(cq);
 }
 
 /*
  * Open one transmit completion queue per traffic class of channel
  * "c".  When one of them fails, the queues opened before it are
  * closed again in reverse order and the error is returned.
  */
 static int
 mlx5e_open_tx_cqs(struct mlx5e_channel *c,
     struct mlx5e_channel_param *cparam)
 {
 	int rc;
 	int tc;
 
 	for (tc = 0; tc < c->priv->num_tc; tc++) {
 		rc = mlx5e_open_cq(c->priv, &cparam->tx_cq, &c->sq[tc].cq,
 		    &mlx5e_tx_cq_comp, c->ix);
 		if (rc != 0)
 			goto unwind;
 	}
 	return (0);
 
 unwind:
 	/* "tc" is the index that failed; close everything below it */
 	while (tc-- > 0)
 		mlx5e_close_cq(&c->sq[tc].cq);
 
 	return (rc);
 }
 
 static void
 mlx5e_close_tx_cqs(struct mlx5e_channel *c)
 {
 	int tc;
 
 	for (tc = 0; tc < c->priv->num_tc; tc++)
 		mlx5e_close_cq(&c->sq[tc].cq);
 }
 
 /*
  * Open one send queue per traffic class of channel "c".  When one of
  * them fails, the queues opened before it are closed again in
  * reverse order and the error is returned.
  */
 static int
 mlx5e_open_sqs(struct mlx5e_channel *c,
     struct mlx5e_channel_param *cparam)
 {
 	int rc;
 	int tc;
 
 	for (tc = 0; tc < c->priv->num_tc; tc++) {
 		rc = mlx5e_open_sq(c, tc, &cparam->sq, &c->sq[tc]);
 		if (rc != 0)
 			goto unwind;
 	}
 
 	return (0);
 
 unwind:
 	/* "tc" is the index that failed; close everything below it */
 	while (tc-- > 0)
 		mlx5e_close_sq_wait(&c->sq[tc]);
 
 	return (rc);
 }
 
 /*
  * Drain and close the send queue of every traffic class of channel
  * "c".
  */
 static void
 mlx5e_close_sqs_wait(struct mlx5e_channel *c)
 {
 	int tc;
 
 	for (tc = 0; tc < c->priv->num_tc; tc++) {
 		struct mlx5e_sq *sq = &c->sq[tc];
 
 		mlx5e_close_sq_wait(sq);
 	}
 }
 
 /*
  * One-time initialisation of channel "c" with index "ix": the back
  * pointer and index, the send tag, the completion object, the
  * receive queue mutex and watchdog callout, and for each of the
  * MLX5E_MAX_TX_NUM_TC send queues the two mutexes and the
  * completion event callout.
  */
 static void
 mlx5e_chan_static_init(struct mlx5e_priv *priv, struct mlx5e_channel *c, int ix)
 {
 	struct mlx5e_sq *sq;
 	int tc;
 
 	/* remember owner and channel number */
 	c->priv = priv;
 	c->ix = ix;
 
 	/* the channel carries an "unlimited" send tag */
 	m_snd_tag_init(&c->tag, c->priv->ifp, IF_SND_TAG_TYPE_UNLIMITED);
 
 	init_completion(&c->completion);
 
 	/* receive side: the watchdog callout is tied to the RQ mutex */
 	mtx_init(&c->rq.mtx, "mlx5rx", MTX_NETWORK_LOCK, MTX_DEF);
 	callout_init_mtx(&c->rq.watchdog, &c->rq.mtx, 0);
 
 	/* transmit side: the CEV callout is tied to the SQ lock */
 	for (tc = 0; tc != MLX5E_MAX_TX_NUM_TC; tc++) {
 		sq = &c->sq[tc];
 
 		mtx_init(&sq->lock, "mlx5tx",
 		    MTX_NETWORK_LOCK " TX", MTX_DEF);
 		mtx_init(&sq->comp_lock, "mlx5comp",
 		    MTX_NETWORK_LOCK " TX", MTX_DEF);
 
 		callout_init_mtx(&sq->cev_callout, &sq->lock, 0);
 	}
 }
 
 /*
  * Drop a reference on the channel's send tag and then block until
  * the channel's completion object is signalled.
  */
 static void
 mlx5e_chan_wait_for_completion(struct mlx5e_channel *c)
 {
 
 	m_snd_tag_rele(&c->tag);
 
 	wait_for_completion(&c->completion);
 }
 
 /*
  * Run mlx5e_chan_wait_for_completion() on the first "channels"
  * channels of "priv", in index order.
  */
 static void
 mlx5e_priv_wait_for_completion(struct mlx5e_priv *priv, const uint32_t channels)
 {
 	uint32_t ch;
 
 	for (ch = 0; ch != channels; ch++) {
 		struct mlx5e_channel *c = &priv->channel[ch];
 
 		mlx5e_chan_wait_for_completion(c);
 	}
 }
 
 static void
 mlx5e_chan_static_destroy(struct mlx5e_channel *c)
 {
 	int tc;
 
 	callout_drain(&c->rq.watchdog);
 
 	mtx_destroy(&c->rq.mtx);
 
 	for (tc = 0; tc != MLX5E_MAX_TX_NUM_TC; tc++) {
 		callout_drain(&c->sq[tc].cev_callout);
 		mtx_destroy(&c->sq[tc].lock);
 		mtx_destroy(&c->sq[tc].comp_lock);
 	}
 }
 
 /*
  * Open channel "c": transmit completion queues, the receive
  * completion queue, the send queues and the receive queue, in that
  * order.  The receive completion handler is invoked once inside a
  * network epoch section before returning.
  *
  * On failure the parts opened so far are closed in reverse order and
  * the error of the failing step is returned; zero on success.
  */
 static int
 mlx5e_open_channel(struct mlx5e_priv *priv,
     struct mlx5e_channel_param *cparam,
     struct mlx5e_channel *c)
 {
 	struct epoch_tracker et;
 	int tc;
 	int rc;
 
 	/* clear the part of the queues that does not survive a reopen */
 	MLX5E_ZERO(&c->rq, mlx5e_rq_zero_start);
 	for (tc = 0; tc != priv->num_tc; tc++)
 		MLX5E_ZERO(&c->sq[tc], mlx5e_sq_zero_start);
 
 	/* transmit completion queues */
 	rc = mlx5e_open_tx_cqs(c, cparam);
 	if (rc != 0)
 		return (rc);
 
 	/* receive completion queue */
 	rc = mlx5e_open_cq(c->priv, &cparam->rx_cq, &c->rq.cq,
 	    &mlx5e_rx_cq_comp, c->ix);
 	if (rc != 0)
 		goto fail_tx_cqs;
 
 	/* send queues */
 	rc = mlx5e_open_sqs(c, cparam);
 	if (rc != 0)
 		goto fail_rx_cq;
 
 	/* receive queue */
 	rc = mlx5e_open_rq(c, &cparam->rq, &c->rq);
 	if (rc != 0)
 		goto fail_sqs;
 
 	/* run the receive completion handler once up front */
 	NET_EPOCH_ENTER(et);
 	c->rq.cq.mcq.comp(&c->rq.cq.mcq);
 	NET_EPOCH_EXIT(et);
 
 	return (0);
 
 fail_sqs:
 	mlx5e_close_sqs_wait(c);
 
 fail_rx_cq:
 	mlx5e_close_cq(&c->rq.cq);
 
 fail_tx_cqs:
 	mlx5e_close_tx_cqs(c);
 
 	return (rc);
 }
 
 /*
  * First stage of closing a channel: close its receive queue.
  */
 static void
 mlx5e_close_channel(struct mlx5e_channel *c)
 {
 
 	mlx5e_close_rq(&c->rq);
 }
 
 /*
  * Second stage of closing a channel: the waiting close of the
  * receive queue, then the send queues, then the transmit completion
  * queues.
  */
 static void
 mlx5e_close_channel_wait(struct mlx5e_channel *c)
 {
 
 	mlx5e_close_rq_wait(&c->rq);
 
 	mlx5e_close_sqs_wait(c);
 	mlx5e_close_tx_cqs(c);
 }
 
 /*
  * Compute the receive buffer size and the number of data segments
  * per receive WQE.
  *
  * The required size is the LRO WQE size when hardware LRO is enabled,
  * else the size derived from the interface MTU.  It is rounded up to
  * one of MCLBYTES, MJUMPAGESIZE, MJUM9BYTES or MJUM16BYTES.
  *
  * Returns zero and stores the results in "wqe_sz" and "nsegs", or
  * -ENOMEM when the size exceeds MJUM16BYTES or the segment count
  * exceeds MLX5E_MAX_BUSDMA_RX_SEGS.  The outputs are left untouched
  * on error.
  */
 static int
 mlx5e_get_wqe_sz(struct mlx5e_priv *priv, u32 *wqe_sz, u32 *nsegs)
 {
 	u32 bytes;
 	u32 segs;
 
 	if (priv->params.hw_lro_en)
 		bytes = priv->params.lro_wqe_sz;
 	else
 		bytes = MLX5E_SW2MB_MTU(priv->ifp->if_mtu);
 
 	/* nothing larger than the biggest buffer class fits */
 	if (bytes > MJUM16BYTES)
 		return (-ENOMEM);
 
 	/* round up to the smallest buffer class that holds "bytes" */
 	if (bytes > MJUM9BYTES) {
 		bytes = MJUM16BYTES;
 	} else if (bytes > MJUMPAGESIZE) {
 		bytes = MJUM9BYTES;
 	} else if (bytes > MCLBYTES) {
 		bytes = MJUMPAGESIZE;
 	} else {
 		bytes = MCLBYTES;
 	}
 
 	/*
 	 * The stride is 16 * (segs + 1) bytes because the first segment
 	 * is the control segment, and the stride has to be a power of
 	 * two.  Grow "segs" until "segs + 1" is one.
 	 */
 	segs = howmany(bytes, MLX5E_MAX_RX_BYTES);
 	while (!powerof2(segs + 1))
 		segs++;
 
 	if (segs > MLX5E_MAX_BUSDMA_RX_SEGS)
 		return (-ENOMEM);
 
 	*wqe_sz = bytes;
 	*nsegs = segs;
 
 	return (0);
 }
 
 static void
 mlx5e_build_rq_param(struct mlx5e_priv *priv,
     struct mlx5e_rq_param *param)
 {
 	void *rqc = param->rqc;
 	void *wq = MLX5_ADDR_OF(rqc, rqc, wq);
 	u32 wqe_sz, nsegs;
 
 	mlx5e_get_wqe_sz(priv, &wqe_sz, &nsegs);
 	MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_LINKED_LIST);
 	MLX5_SET(wq, wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN);
 	MLX5_SET(wq, wq, log_wq_stride, ilog2(sizeof(struct mlx5e_rx_wqe) +
 	    nsegs * sizeof(struct mlx5_wqe_data_seg)));
 	MLX5_SET(wq, wq, log_wq_sz, priv->params.log_rq_size);
 	MLX5_SET(wq, wq, pd, priv->pdn);
 
 	param->wq.buf_numa_node = 0;
 	param->wq.db_numa_node = 0;
 	param->wq.linear = 1;
 }
 
 static void
 mlx5e_build_sq_param(struct mlx5e_priv *priv,
     struct mlx5e_sq_param *param)
 {
 	void *sqc = param->sqc;
 	void *wq = MLX5_ADDR_OF(sqc, sqc, wq);
 
 	MLX5_SET(wq, wq, log_wq_sz, priv->params.log_sq_size);
 	MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB));
 	MLX5_SET(wq, wq, pd, priv->pdn);
 
 	param->wq.buf_numa_node = 0;
 	param->wq.db_numa_node = 0;
 	param->wq.linear = 1;
 }
 
 static void
 mlx5e_build_common_cq_param(struct mlx5e_priv *priv,
     struct mlx5e_cq_param *param)
 {
 	void *cqc = param->cqc;
 
 	MLX5_SET(cqc, cqc, uar_page, priv->cq_uar.index);
 }
 
 /*
  * Store the default dynamic interrupt moderation profile for "mode"
  * in "*ptr".  With hardware LRO enabled the packet count is capped
  * at MLX5E_DIM_MAX_RX_CQ_MODERATION_PKTS_WITH_LRO.
  */
 static void
 mlx5e_get_default_profile(struct mlx5e_priv *priv, int mode, struct net_dim_cq_moder *ptr)
 {
 
 	*ptr = net_dim_get_profile(mode, MLX5E_DIM_DEFAULT_PROFILE);
 
 	/* the cap below only applies with hardware LRO */
 	if (!priv->params.hw_lro_en)
 		return;
 
 	if (ptr->pkts > MLX5E_DIM_MAX_RX_CQ_MODERATION_PKTS_WITH_LRO)
 		ptr->pkts = MLX5E_DIM_MAX_RX_CQ_MODERATION_PKTS_WITH_LRO;
 }
 
 static void
 mlx5e_build_rx_cq_param(struct mlx5e_priv *priv,
     struct mlx5e_cq_param *param)
 {
 	struct net_dim_cq_moder curr;
 	void *cqc = param->cqc;
 
 	/*
 	 * We use MLX5_CQE_FORMAT_HASH because the RX hash mini CQE
 	 * format is more beneficial for FreeBSD use case.
 	 *
 	 * Adding support for MLX5_CQE_FORMAT_CSUM will require changes
 	 * in mlx5e_decompress_cqe.
 	 */
 	if (priv->params.cqe_zipping_en) {
 		MLX5_SET(cqc, cqc, mini_cqe_res_format, MLX5_CQE_FORMAT_HASH);
 		MLX5_SET(cqc, cqc, cqe_compression_en, 1);
 	}
 
 	MLX5_SET(cqc, cqc, log_cq_size, priv->params.log_rq_size);
 
 	switch (priv->params.rx_cq_moderation_mode) {
 	case 0:
 		MLX5_SET(cqc, cqc, cq_period, priv->params.rx_cq_moderation_usec);
 		MLX5_SET(cqc, cqc, cq_max_count, priv->params.rx_cq_moderation_pkts);
 		MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
 		break;
 	case 1:
 		MLX5_SET(cqc, cqc, cq_period, priv->params.rx_cq_moderation_usec);
 		MLX5_SET(cqc, cqc, cq_max_count, priv->params.rx_cq_moderation_pkts);
 		if (MLX5_CAP_GEN(priv->mdev, cq_period_start_from_cqe))
 			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_CQE);
 		else
 			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
 		break;
 	case 2:
 		mlx5e_get_default_profile(priv, NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE, &curr);
 		MLX5_SET(cqc, cqc, cq_period, curr.usec);
 		MLX5_SET(cqc, cqc, cq_max_count, curr.pkts);
 		MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
 		break;
 	case 3:
 		mlx5e_get_default_profile(priv, NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE, &curr);
 		MLX5_SET(cqc, cqc, cq_period, curr.usec);
 		MLX5_SET(cqc, cqc, cq_max_count, curr.pkts);
 		if (MLX5_CAP_GEN(priv->mdev, cq_period_start_from_cqe))
 			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_CQE);
 		else
 			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
 		break;
 	default:
 		break;
 	}
 
 	mlx5e_dim_build_cq_param(priv, param);
 
 	mlx5e_build_common_cq_param(priv, param);
 }
 
 static void
 mlx5e_build_tx_cq_param(struct mlx5e_priv *priv,
     struct mlx5e_cq_param *param)
 {
 	void *cqc = param->cqc;
 
 	MLX5_SET(cqc, cqc, log_cq_size, priv->params.log_sq_size);
 	MLX5_SET(cqc, cqc, cq_period, priv->params.tx_cq_moderation_usec);
 	MLX5_SET(cqc, cqc, cq_max_count, priv->params.tx_cq_moderation_pkts);
 
 	switch (priv->params.tx_cq_moderation_mode) {
 	case 0:
 		MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
 		break;
 	default:
 		if (MLX5_CAP_GEN(priv->mdev, cq_period_start_from_cqe))
 			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_CQE);
 		else
 			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
 		break;
 	}
 
 	mlx5e_build_common_cq_param(priv, param);
 }
 
 /*
  * Zero "cparam" and build the four templates it holds: receive
  * queue, send queue, receive CQ and transmit CQ.
  */
 static void
 mlx5e_build_channel_param(struct mlx5e_priv *priv,
     struct mlx5e_channel_param *cparam)
 {
 
 	memset(cparam, 0, sizeof(*cparam));
 
 	/* work queues */
 	mlx5e_build_rq_param(priv, &cparam->rq);
 	mlx5e_build_sq_param(priv, &cparam->sq);
 
 	/* completion queues */
 	mlx5e_build_rx_cq_param(priv, &cparam->rx_cq);
 	mlx5e_build_tx_cq_param(priv, &cparam->tx_cq);
 }
 
 /*
  * Open all priv->params.num_channels channels and then wait for the
  * minimum number of receive WQEs on each of them.
  *
  * When a step fails, every channel opened so far is closed again,
  * newest first, and the error is returned; zero on success.
  */
 static int
 mlx5e_open_channels(struct mlx5e_priv *priv)
 {
 	struct mlx5e_channel_param *cparam;
 	int opened;
 	int ch;
 	int rc;
 
 	/* one template, shared by all channels */
 	cparam = malloc(sizeof(*cparam), M_MLX5EN, M_WAITOK);
 	mlx5e_build_channel_param(priv, cparam);
 
 	for (opened = 0; opened < priv->params.num_channels; opened++) {
 		rc = mlx5e_open_channel(priv, cparam, &priv->channel[opened]);
 		if (rc != 0)
 			goto unwind;
 	}
 
 	for (ch = 0; ch < priv->params.num_channels; ch++) {
 		rc = mlx5e_wait_for_min_rx_wqes(&priv->channel[ch].rq);
 		if (rc != 0)
 			goto unwind;
 	}
 
 	free(cparam, M_MLX5EN);
 	return (0);
 
 unwind:
 	/* "opened" counts the channels that were opened successfully */
 	while (opened != 0) {
 		opened--;
 		mlx5e_close_channel(&priv->channel[opened]);
 		mlx5e_close_channel_wait(&priv->channel[opened]);
 	}
 
 	free(cparam, M_MLX5EN);
 	return (rc);
 }
 
 /*
  * Close all channels in two passes: the first pass runs the
  * non-waiting close on every channel, the second pass the waiting
  * one.
  */
 static void
 mlx5e_close_channels(struct mlx5e_priv *priv)
 {
 	int ch;
 
 	/* pass one */
 	for (ch = 0; ch < priv->params.num_channels; ch++)
 		mlx5e_close_channel(&priv->channel[ch]);
 
 	/* pass two */
 	for (ch = 0; ch < priv->params.num_channels; ch++)
 		mlx5e_close_channel_wait(&priv->channel[ch]);
 }
 
 /*
  * Apply the configured transmit CQ moderation to the completion
  * queue of "sq".
  *
  * When the device can modify the period mode, modes 0 and 2 select
  * EQE based periods and every other mode CQE based periods.
  * Otherwise only the period and count are updated.
  *
  * Returns the value of the mlx5_core_modify_cq_moderation*() call.
  */
 static int
 mlx5e_refresh_sq_params(struct mlx5e_priv *priv, struct mlx5e_sq *sq)
 {
 	uint8_t cq_mode;
 
 	/* the period mode cannot be changed: update period/count only */
 	if (!MLX5_CAP_GEN(priv->mdev, cq_period_mode_modify)) {
 		return (mlx5_core_modify_cq_moderation(priv->mdev, &sq->cq.mcq,
 		    priv->params.tx_cq_moderation_usec,
 		    priv->params.tx_cq_moderation_pkts));
 	}
 
 	/* CQE based unless the mode is 0 or 2 */
 	cq_mode = MLX5_CQ_PERIOD_MODE_START_FROM_CQE;
 	switch (priv->params.tx_cq_moderation_mode) {
 	case 0:
 	case 2:
 		cq_mode = MLX5_CQ_PERIOD_MODE_START_FROM_EQE;
 		break;
 	default:
 		break;
 	}
 
 	return (mlx5_core_modify_cq_moderation_mode(priv->mdev, &sq->cq.mcq,
 	    priv->params.tx_cq_moderation_usec,
 	    priv->params.tx_cq_moderation_pkts,
 	    cq_mode));
 }
 
 static int
 mlx5e_refresh_rq_params(struct mlx5e_priv *priv, struct mlx5e_rq *rq)
 {
 
 	if (MLX5_CAP_GEN(priv->mdev, cq_period_mode_modify)) {
 		uint8_t cq_mode;
 		uint8_t dim_mode;
 		int retval;
 
 		switch (priv->params.rx_cq_moderation_mode) {
 		case 0:
 		case 2:
 			cq_mode = MLX5_CQ_PERIOD_MODE_START_FROM_EQE;
 			dim_mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE;
 			break;
 		default:
 			cq_mode = MLX5_CQ_PERIOD_MODE_START_FROM_CQE;
 			dim_mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE;
 			break;
 		}
 
 		/* tear down dynamic interrupt moderation */
 		mtx_lock(&rq->mtx);
 		rq->dim.mode = NET_DIM_CQ_PERIOD_MODE_DISABLED;
 		mtx_unlock(&rq->mtx);
 
 		/* wait for dynamic interrupt moderation work task, if any */
 		cancel_work_sync(&rq->dim.work);
 
 		if (priv->params.rx_cq_moderation_mode >= 2) {
 			struct net_dim_cq_moder curr;
 
 			mlx5e_get_default_profile(priv, dim_mode, &curr);
 
 			retval = mlx5_core_modify_cq_moderation_mode(priv->mdev, &rq->cq.mcq,
 			    curr.usec, curr.pkts, cq_mode);
 
 			/* set dynamic interrupt moderation mode and zero defaults */
 			mtx_lock(&rq->mtx);
 			rq->dim.mode = dim_mode;
 			rq->dim.state = 0;
 			rq->dim.profile_ix = MLX5E_DIM_DEFAULT_PROFILE;
 			mtx_unlock(&rq->mtx);
 		} else {
 			retval = mlx5_core_modify_cq_moderation_mode(priv->mdev, &rq->cq.mcq,
 			    priv->params.rx_cq_moderation_usec,
 			    priv->params.rx_cq_moderation_pkts,
 			    cq_mode);
 		}
 		return (retval);
 	}
 
 	return (mlx5_core_modify_cq_moderation(priv->mdev, &rq->cq.mcq,
 	    priv->params.rx_cq_moderation_usec,
 	    priv->params.rx_cq_moderation_pkts));
 }
 
 /*
  * Refresh the moderation parameters of one channel: first the
  * receive queue, then the send queue of every traffic class.  Stops
  * at the first error and returns it; zero when all succeeded.
  */
 static int
 mlx5e_refresh_channel_params_sub(struct mlx5e_priv *priv, struct mlx5e_channel *c)
 {
 	int rc;
 	int tc;
 
 	rc = mlx5e_refresh_rq_params(priv, &c->rq);
 	if (rc != 0)
 		return (rc);
 
 	for (tc = 0; tc != priv->num_tc; tc++) {
 		rc = mlx5e_refresh_sq_params(priv, &c->sq[tc]);
 		if (rc != 0)
 			return (rc);
 	}
 
 	return (0);
 }
 
 /*
  * Refresh the moderation parameters of every channel.
  *
  * Returns EINVAL when the MLX5E_STATE_OPENED bit is not set, the
  * first error reported for a channel, or zero when all succeeded.
  */
 int
 mlx5e_refresh_channel_params(struct mlx5e_priv *priv)
 {
 	int ch;
 	int rc;
 
 	/* nothing to refresh while the channels are closed */
 	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
 		return (EINVAL);
 
 	for (ch = 0; ch < priv->params.num_channels; ch++) {
 		rc = mlx5e_refresh_channel_params_sub(priv,
 		    &priv->channel[ch]);
 		if (rc != 0)
 			return (rc);
 	}
 
 	return (0);
 }
 
 static int
 mlx5e_open_tis(struct mlx5e_priv *priv, int tc)
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
 	u32 in[MLX5_ST_SZ_DW(create_tis_in)];
 	void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
 
 	memset(in, 0, sizeof(in));
 
 	MLX5_SET(tisc, tisc, prio, tc);
 	MLX5_SET(tisc, tisc, transport_domain, priv->tdn);
 
 	return (mlx5_core_create_tis(mdev, in, sizeof(in), &priv->tisn[tc]));
 }
 
 /*
  * Destroy the TIS of traffic class "tc".
  */
 static void
 mlx5e_close_tis(struct mlx5e_priv *priv, int tc)
 {
 
 	mlx5_core_destroy_tis(priv->mdev, priv->tisn[tc]);
 }
 
 /*
  * Create one TIS per traffic class.  When one of them fails, the
  * ones created before it are destroyed in reverse order and the
  * error is returned; zero on success.
  */
 static int
 mlx5e_open_tises(struct mlx5e_priv *priv)
 {
 	const int count = priv->num_tc;
 	int rc;
 	int tc;
 
 	for (tc = 0; tc < count; tc++) {
 		rc = mlx5e_open_tis(priv, tc);
 		if (rc != 0)
 			goto unwind;
 	}
 
 	return (0);
 
 unwind:
 	/* "tc" is the index that failed; destroy everything below it */
 	while (tc-- > 0)
 		mlx5e_close_tis(priv, tc);
 
 	return (rc);
 }
 
 /*
  * Destroy the TIS of every traffic class.
  */
 static void
 mlx5e_close_tises(struct mlx5e_priv *priv)
 {
 	const int count = priv->num_tc;
 	int tc;
 
 	for (tc = 0; tc < count; tc++)
 		mlx5e_close_tis(priv, tc);
 }
 
 static int
 mlx5e_open_rqt(struct mlx5e_priv *priv)
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
 	u32 *in;
 	u32 out[MLX5_ST_SZ_DW(create_rqt_out)] = {0};
 	void *rqtc;
 	int inlen;
 	int err;
 	int sz;
 	int i;
 
 	sz = 1 << priv->params.rx_hash_log_tbl_sz;
 
 	inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + sizeof(u32) * sz;
 	in = mlx5_vzalloc(inlen);
 	if (in == NULL)
 		return (-ENOMEM);
 	rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context);
 
 	MLX5_SET(rqtc, rqtc, rqt_actual_size, sz);
 	MLX5_SET(rqtc, rqtc, rqt_max_size, sz);
 
 	for (i = 0; i < sz; i++) {
 		int ix = i;
 #ifdef RSS
 		ix = rss_get_indirection_to_bucket(ix);
 #endif
 		/* ensure we don't overflow */
 		ix %= priv->params.num_channels;
 
 		/* apply receive side scaling stride, if any */
 		ix -= ix % (int)priv->params.channels_rsss;
 
 		MLX5_SET(rqtc, rqtc, rq_num[i], priv->channel[ix].rq.rqn);
 	}
 
 	MLX5_SET(create_rqt_in, in, opcode, MLX5_CMD_OP_CREATE_RQT);
 
 	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
 	if (!err)
 		priv->rqtn = MLX5_GET(create_rqt_out, out, rqtn);
 
 	kvfree(in);
 
 	return (err);
 }
 
 /*
  * Destroy the receive queue table identified by priv->rqtn.  The
  * result of the command is not checked.
  */
 static void
 mlx5e_close_rqt(struct mlx5e_priv *priv)
 {
 	u32 cmd[MLX5_ST_SZ_DW(destroy_rqt_in)] = {0};
 	u32 reply[MLX5_ST_SZ_DW(destroy_rqt_out)] = {0};
 
 	MLX5_SET(destroy_rqt_in, cmd, opcode, MLX5_CMD_OP_DESTROY_RQT);
 	MLX5_SET(destroy_rqt_in, cmd, rqtn, priv->rqtn);
 
 	mlx5_cmd_exec(priv->mdev, cmd, sizeof(cmd), reply, sizeof(reply));
 }
 
 /*
  * Fill in the TIR context "tirc" for traffic type "tt".
  *
  * The context always gets the transport domain of "priv" and, when
  * hardware LRO is enabled, the LRO settings.  MLX5E_TT_ANY is
  * dispatched directly to the receive queue of channel 0; every other
  * type is dispatched through the receive queue table using a Toeplitz
  * hash.  The second switch selects the protocol types and the packet
  * fields that are hashed for each traffic type.
  */
 static void
 mlx5e_build_tir_ctx(struct mlx5e_priv *priv, u32 * tirc, int tt)
 {
 	/* outer header hash field selector inside the TIR context */
 	void *hfso = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer);
 	__be32 *hkey;
 
 	MLX5_SET(tirc, tirc, transport_domain, priv->tdn);
 
 	/* subtracted from the LRO WQE size below */
 #define	ROUGH_MAX_L2_L3_HDR_SZ 256
 
 	/* hash on source and destination IP address */
 #define	MLX5_HASH_IP     (MLX5_HASH_FIELD_SEL_SRC_IP   |\
 			  MLX5_HASH_FIELD_SEL_DST_IP)
 
 	/* hash on IP addresses and L4 ports */
 #define	MLX5_HASH_ALL    (MLX5_HASH_FIELD_SEL_SRC_IP   |\
 			  MLX5_HASH_FIELD_SEL_DST_IP   |\
 			  MLX5_HASH_FIELD_SEL_L4_SPORT |\
 			  MLX5_HASH_FIELD_SEL_L4_DPORT)
 
 	/* hash on IP addresses and the IPsec SPI */
 #define	MLX5_HASH_IP_IPSEC_SPI	(MLX5_HASH_FIELD_SEL_SRC_IP   |\
 				 MLX5_HASH_FIELD_SEL_DST_IP   |\
 				 MLX5_HASH_FIELD_SEL_IPSEC_SPI)
 
 	if (priv->params.hw_lro_en) {
 		/* LRO for both IPv4 and IPv6 */
 		MLX5_SET(tirc, tirc, lro_enable_mask,
 		    MLX5_TIRC_LRO_ENABLE_MASK_IPV4_LRO |
 		    MLX5_TIRC_LRO_ENABLE_MASK_IPV6_LRO);
 		/* presumably in units of 256 bytes, hence ">> 8" - confirm */
 		MLX5_SET(tirc, tirc, lro_max_msg_sz,
 		    (priv->params.lro_wqe_sz -
 		    ROUGH_MAX_L2_L3_HDR_SZ) >> 8);
 		/* TODO: add the option to choose timer value dynamically */
 		MLX5_SET(tirc, tirc, lro_timeout_period_usecs,
 		    MLX5_CAP_ETH(priv->mdev,
 		    lro_timer_supported_periods[2]));
 	}
 
 	/* setup parameters for hashing TIR type, if any */
 	switch (tt) {
 	case MLX5E_TT_ANY:
 		/* no hashing: deliver straight to the RQ of channel 0 */
 		MLX5_SET(tirc, tirc, disp_type,
 		    MLX5_TIRC_DISP_TYPE_DIRECT);
 		MLX5_SET(tirc, tirc, inline_rqn,
 		    priv->channel[0].rq.rqn);
 		break;
 	default:
 		/* spread over the receive queue table by Toeplitz hash */
 		MLX5_SET(tirc, tirc, disp_type,
 		    MLX5_TIRC_DISP_TYPE_INDIRECT);
 		MLX5_SET(tirc, tirc, indirect_table,
 		    priv->rqtn);
 		MLX5_SET(tirc, tirc, rx_hash_fn,
 		    MLX5_TIRC_RX_HASH_FN_HASH_TOEPLITZ);
 		hkey = (__be32 *) MLX5_ADDR_OF(tirc, tirc, rx_hash_toeplitz_key);
 #ifdef RSS
 		/*
 		 * The FreeBSD RSS implementation does currently not
 		 * support symmetric Toeplitz hashes:
 		 */
 		MLX5_SET(tirc, tirc, rx_hash_symmetric, 0);
 		/* take the hash key from the RSS subsystem */
 		rss_getkey((uint8_t *)hkey);
 #else
 		/* without RSS: symmetric hashing with a fixed 40 byte key */
 		MLX5_SET(tirc, tirc, rx_hash_symmetric, 1);
 		hkey[0] = cpu_to_be32(0xD181C62C);
 		hkey[1] = cpu_to_be32(0xF7F4DB5B);
 		hkey[2] = cpu_to_be32(0x1983A2FC);
 		hkey[3] = cpu_to_be32(0x943E1ADB);
 		hkey[4] = cpu_to_be32(0xD9389E6B);
 		hkey[5] = cpu_to_be32(0xD1039C2C);
 		hkey[6] = cpu_to_be32(0xA74499AD);
 		hkey[7] = cpu_to_be32(0x593D56D9);
 		hkey[8] = cpu_to_be32(0xF3253C06);
 		hkey[9] = cpu_to_be32(0x2ADC1FFC);
 #endif
 		break;
 	}
 
 	/*
 	 * Select the hashed protocol types and fields per traffic type.
 	 *
 	 * In the TCP and UDP cases the "else" inside the RSS section
 	 * binds to the MLX5_SET() that follows the #endif: with RSS the
 	 * L4 ports are only hashed when the RSS configuration enables
 	 * that hash type, without RSS they are always hashed.
 	 */
 	switch (tt) {
 	case MLX5E_TT_IPV4_TCP:
 		MLX5_SET(rx_hash_field_select, hfso, l3_prot_type,
 		    MLX5_L3_PROT_TYPE_IPV4);
 		MLX5_SET(rx_hash_field_select, hfso, l4_prot_type,
 		    MLX5_L4_PROT_TYPE_TCP);
 #ifdef RSS
 		if (!(rss_gethashconfig() & RSS_HASHTYPE_RSS_TCP_IPV4)) {
 			MLX5_SET(rx_hash_field_select, hfso, selected_fields,
 			    MLX5_HASH_IP);
 		} else
 #endif
 		MLX5_SET(rx_hash_field_select, hfso, selected_fields,
 		    MLX5_HASH_ALL);
 		break;
 
 	case MLX5E_TT_IPV6_TCP:
 		MLX5_SET(rx_hash_field_select, hfso, l3_prot_type,
 		    MLX5_L3_PROT_TYPE_IPV6);
 		MLX5_SET(rx_hash_field_select, hfso, l4_prot_type,
 		    MLX5_L4_PROT_TYPE_TCP);
 #ifdef RSS
 		if (!(rss_gethashconfig() & RSS_HASHTYPE_RSS_TCP_IPV6)) {
 			MLX5_SET(rx_hash_field_select, hfso, selected_fields,
 			    MLX5_HASH_IP);
 		} else
 #endif
 		MLX5_SET(rx_hash_field_select, hfso, selected_fields,
 		    MLX5_HASH_ALL);
 		break;
 
 	case MLX5E_TT_IPV4_UDP:
 		MLX5_SET(rx_hash_field_select, hfso, l3_prot_type,
 		    MLX5_L3_PROT_TYPE_IPV4);
 		MLX5_SET(rx_hash_field_select, hfso, l4_prot_type,
 		    MLX5_L4_PROT_TYPE_UDP);
 #ifdef RSS
 		if (!(rss_gethashconfig() & RSS_HASHTYPE_RSS_UDP_IPV4)) {
 			MLX5_SET(rx_hash_field_select, hfso, selected_fields,
 			    MLX5_HASH_IP);
 		} else
 #endif
 		MLX5_SET(rx_hash_field_select, hfso, selected_fields,
 		    MLX5_HASH_ALL);
 		break;
 
 	case MLX5E_TT_IPV6_UDP:
 		MLX5_SET(rx_hash_field_select, hfso, l3_prot_type,
 		    MLX5_L3_PROT_TYPE_IPV6);
 		MLX5_SET(rx_hash_field_select, hfso, l4_prot_type,
 		    MLX5_L4_PROT_TYPE_UDP);
 #ifdef RSS
 		if (!(rss_gethashconfig() & RSS_HASHTYPE_RSS_UDP_IPV6)) {
 			MLX5_SET(rx_hash_field_select, hfso, selected_fields,
 			    MLX5_HASH_IP);
 		} else
 #endif
 		MLX5_SET(rx_hash_field_select, hfso, selected_fields,
 		    MLX5_HASH_ALL);
 		break;
 
 	/* IPsec AH and ESP: hash on IP addresses and SPI */
 	case MLX5E_TT_IPV4_IPSEC_AH:
 		MLX5_SET(rx_hash_field_select, hfso, l3_prot_type,
 		    MLX5_L3_PROT_TYPE_IPV4);
 		MLX5_SET(rx_hash_field_select, hfso, selected_fields,
 		    MLX5_HASH_IP_IPSEC_SPI);
 		break;
 
 	case MLX5E_TT_IPV6_IPSEC_AH:
 		MLX5_SET(rx_hash_field_select, hfso, l3_prot_type,
 		    MLX5_L3_PROT_TYPE_IPV6);
 		MLX5_SET(rx_hash_field_select, hfso, selected_fields,
 		    MLX5_HASH_IP_IPSEC_SPI);
 		break;
 
 	case MLX5E_TT_IPV4_IPSEC_ESP:
 		MLX5_SET(rx_hash_field_select, hfso, l3_prot_type,
 		    MLX5_L3_PROT_TYPE_IPV4);
 		MLX5_SET(rx_hash_field_select, hfso, selected_fields,
 		    MLX5_HASH_IP_IPSEC_SPI);
 		break;
 
 	case MLX5E_TT_IPV6_IPSEC_ESP:
 		MLX5_SET(rx_hash_field_select, hfso, l3_prot_type,
 		    MLX5_L3_PROT_TYPE_IPV6);
 		MLX5_SET(rx_hash_field_select, hfso, selected_fields,
 		    MLX5_HASH_IP_IPSEC_SPI);
 		break;
 
 	/* plain IPv4 / IPv6: hash on IP addresses only */
 	case MLX5E_TT_IPV4:
 		MLX5_SET(rx_hash_field_select, hfso, l3_prot_type,
 		    MLX5_L3_PROT_TYPE_IPV4);
 		MLX5_SET(rx_hash_field_select, hfso, selected_fields,
 		    MLX5_HASH_IP);
 		break;
 
 	case MLX5E_TT_IPV6:
 		MLX5_SET(rx_hash_field_select, hfso, l3_prot_type,
 		    MLX5_L3_PROT_TYPE_IPV6);
 		MLX5_SET(rx_hash_field_select, hfso, selected_fields,
 		    MLX5_HASH_IP);
 		break;
 
 	/* any other type: no hash field selection is written */
 	default:
 		break;
 	}
 }
 
 static int
 mlx5e_open_tir(struct mlx5e_priv *priv, int tt)
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
 	u32 *in;
 	void *tirc;
 	int inlen;
 	int err;
 
 	inlen = MLX5_ST_SZ_BYTES(create_tir_in);
 	in = mlx5_vzalloc(inlen);
 	if (in == NULL)
 		return (-ENOMEM);
 	tirc = MLX5_ADDR_OF(create_tir_in, in, tir_context);
 
 	mlx5e_build_tir_ctx(priv, tirc, tt);
 
 	err = mlx5_core_create_tir(mdev, in, inlen, &priv->tirn[tt]);
 
 	kvfree(in);
 
 	return (err);
 }
 
 /*
  * Destroy the TIR of traffic type "tt".
  */
 static void
 mlx5e_close_tir(struct mlx5e_priv *priv, int tt)
 {
 
 	mlx5_core_destroy_tir(priv->mdev, priv->tirn[tt]);
 }
 
 /*
  * Create one TIR per traffic type.  When one of them fails, the ones
  * created before it are destroyed in reverse order and the error is
  * returned; zero on success.
  */
 static int
 mlx5e_open_tirs(struct mlx5e_priv *priv)
 {
 	int rc;
 	int tt;
 
 	for (tt = 0; tt < MLX5E_NUM_TT; tt++) {
 		rc = mlx5e_open_tir(priv, tt);
 		if (rc != 0)
 			goto unwind;
 	}
 
 	return (0);
 
 unwind:
 	/* "tt" is the type that failed; destroy everything below it */
 	while (tt-- > 0)
 		mlx5e_close_tir(priv, tt);
 
 	return (rc);
 }
 
 /*
  * Destroy the TIR of every traffic type.
  */
 static void
 mlx5e_close_tirs(struct mlx5e_priv *priv)
 {
 	int tt;
 
 	for (tt = 0; tt < MLX5E_NUM_TT; tt++)
 		mlx5e_close_tir(priv, tt);
 }
 
 /*
  * Program the port and vport MTU for the software MTU "sw_mtu" and
  * record the MTU the hardware ends up with.
  *
  * SW MTU does not include headers,
  * HW MTU includes all headers and checksums.
  *
  * The interface MTU is updated once the port MTU has been set.  The
  * effective hardware MTU is then read back from the vport, falling
  * back to the port operational MTU, and stored together with its
  * most significant bit in priv->params_ethtool.
  *
  * Returns zero on success, the error from mlx5_set_port_mtu() or
  * from the MTU query, -E2BIG when the hardware MTU is smaller than
  * requested and -EINVAL when it is bigger.  A failure to update the
  * vport context is only logged.
  */
 static int
 mlx5e_set_dev_port_mtu(struct ifnet *ifp, int sw_mtu)
 {
 	struct mlx5e_priv *priv = ifp->if_softc;
 	struct mlx5_core_dev *mdev = priv->mdev;
 	int hw_mtu;
 	int err;
 
 	hw_mtu = MLX5E_SW2HW_MTU(sw_mtu);
 
 	err = mlx5_set_port_mtu(mdev, hw_mtu);
 	if (err) {
 		mlx5_en_err(ifp, "mlx5_set_port_mtu failed setting %d, err=%d\n",
 		    sw_mtu, err);
 		return (err);
 	}
 
 	/* Update vport context MTU; a failure here is not fatal */
 	err = mlx5_set_vport_mtu(mdev, hw_mtu);
 	if (err) {
 		mlx5_en_err(ifp,
 		    "Failed updating vport context with MTU size, err=%d\n",
 		    err);
 	}
 
 	ifp->if_mtu = sw_mtu;
 
 	/* read back what the hardware actually uses */
 	err = mlx5_query_vport_mtu(mdev, &hw_mtu);
 	if (err || !hw_mtu) {
 		/* fallback to port oper mtu */
 		err = mlx5_query_port_oper_mtu(mdev, &hw_mtu);
 	}
 	if (err) {
 		mlx5_en_err(ifp,
 		    "Query port MTU, after setting new MTU value, failed\n");
 		return (err);
 	} else if (MLX5E_HW2SW_MTU(hw_mtu) < sw_mtu) {
 		err = -E2BIG;
 		mlx5_en_err(ifp,
 		    "Port MTU %d is smaller than ifp mtu %d\n",
 		    hw_mtu, sw_mtu);
 	} else if (MLX5E_HW2SW_MTU(hw_mtu) > sw_mtu) {
 		err = -EINVAL;
 		mlx5_en_err(ifp,
 		    "Port MTU %d is bigger than ifp mtu %d\n",
 		    hw_mtu, sw_mtu);
 	}
 	/* recorded even when a mismatch was reported above */
 	priv->params_ethtool.hw_mtu = hw_mtu;
 
 	/* compute MSB: clear the lowest set bit until one bit is left */
 	while (hw_mtu & (hw_mtu - 1))
 		hw_mtu &= (hw_mtu - 1);
 	priv->params_ethtool.hw_mtu_msb = hw_mtu;
 
 	return (err);
 }
 
 /*
  * Bring up the data path of the interface: TISes, queue counter,
  * channels, RQT, TIRs, flow table and VLAN rules, in that order, and
  * then mark the state as opened.
  *
  * Returns zero on success or when the interface is already opened.
  * When a step fails, the steps completed so far are undone in reverse
  * order and the error from the failing step is returned.
  *
  * NOTE(review): the callers visible in this file hold PRIV_LOCK()
  * around this call - presumably that is required; confirm.
  */
 int
 mlx5e_open_locked(struct ifnet *ifp)
 {
 	struct mlx5e_priv *priv = ifp->if_softc;
 	int err;
 	u16 set_id;
 
 	/* check if already opened */
 	if (test_bit(MLX5E_STATE_OPENED, &priv->state) != 0)
 		return (0);
 
 #ifdef RSS
 	if (rss_getnumbuckets() > priv->params.num_channels) {
 		mlx5_en_info(ifp,
 		    "NOTE: There are more RSS buckets(%u) than channels(%u) available\n",
 		    rss_getnumbuckets(), priv->params.num_channels);
 	}
 #endif
 	err = mlx5e_open_tises(priv);
 	if (err) {
 		mlx5_en_err(ifp, "mlx5e_open_tises failed, %d\n", err);
 		return (err);
 	}
 	err = mlx5_vport_alloc_q_counter(priv->mdev,
 	    MLX5_INTERFACE_PROTOCOL_ETH, &set_id);
 	if (err) {
 		mlx5_en_err(priv->ifp,
 		    "mlx5_vport_alloc_q_counter failed: %d\n", err);
 		goto err_close_tises;
 	}
 	/* store counter set ID */
 	priv->counter_set_id = set_id;
 
 	err = mlx5e_open_channels(priv);
 	if (err) {
 		mlx5_en_err(ifp,
 		    "mlx5e_open_channels failed, %d\n", err);
 		goto err_dalloc_q_counter;
 	}
 	err = mlx5e_open_rqt(priv);
 	if (err) {
 		mlx5_en_err(ifp, "mlx5e_open_rqt failed, %d\n", err);
 		goto err_close_channels;
 	}
 	err = mlx5e_open_tirs(priv);
 	if (err) {
 		mlx5_en_err(ifp, "mlx5e_open_tir failed, %d\n", err);
 		goto err_close_rqls;
 	}
 	err = mlx5e_open_flow_table(priv);
 	if (err) {
 		mlx5_en_err(ifp,
 		    "mlx5e_open_flow_table failed, %d\n", err);
 		goto err_close_tirs;
 	}
 	err = mlx5e_add_all_vlan_rules(priv);
 	if (err) {
 		mlx5_en_err(ifp,
 		    "mlx5e_add_all_vlan_rules failed, %d\n", err);
 		goto err_close_flow_table;
 	}
 	set_bit(MLX5E_STATE_OPENED, &priv->state);
 
 	mlx5e_update_carrier(priv);
 	mlx5e_set_rx_mode_core(priv);
 
 	return (0);
 
 	/* error unwind: each label undoes one step and falls through */
 err_close_flow_table:
 	mlx5e_close_flow_table(priv);
 
 err_close_tirs:
 	mlx5e_close_tirs(priv);
 
 	/* this label closes the RQT */
 err_close_rqls:
 	mlx5e_close_rqt(priv);
 
 err_close_channels:
 	mlx5e_close_channels(priv);
 
 err_dalloc_q_counter:
 	mlx5_vport_dealloc_q_counter(priv->mdev,
 	    MLX5_INTERFACE_PROTOCOL_ETH, priv->counter_set_id);
 
 err_close_tises:
 	mlx5e_close_tises(priv);
 
 	return (err);
 }
 
 static void
 mlx5e_open(void *arg)
 {
 	struct mlx5e_priv *priv = arg;
 
 	PRIV_LOCK(priv);
 	if (mlx5_set_port_status(priv->mdev, MLX5_PORT_UP))
 		mlx5_en_err(priv->ifp,
 		    "Setting port status to up failed\n");
 
 	mlx5e_open_locked(priv->ifp);
 	priv->ifp->if_drv_flags |= IFF_DRV_RUNNING;
 	PRIV_UNLOCK(priv);
 }
 
 /*
  * Tear down the data path set up by mlx5e_open_locked(). The opened
  * state bit is cleared first, then the RX mode and VLAN rules are
  * updated, the link state is reported as down and the objects are
  * released in the reverse order of their creation.
  *
  * Always returns zero; does nothing when the interface is already
  * closed.
  */
 int
 mlx5e_close_locked(struct ifnet *ifp)
 {
 	struct mlx5e_priv *priv = ifp->if_softc;
 
 	/* check if already closed */
 	if (test_bit(MLX5E_STATE_OPENED, &priv->state) == 0)
 		return (0);
 
 	clear_bit(MLX5E_STATE_OPENED, &priv->state);
 
 	mlx5e_set_rx_mode_core(priv);
 	mlx5e_del_all_vlan_rules(priv);
 	if_link_state_change(priv->ifp, LINK_STATE_DOWN);
 	mlx5e_close_flow_table(priv);
 	mlx5e_close_tirs(priv);
 	mlx5e_close_rqt(priv);
 	mlx5e_close_channels(priv);
 	mlx5_vport_dealloc_q_counter(priv->mdev,
 	    MLX5_INTERFACE_PROTOCOL_ETH, priv->counter_set_id);
 	mlx5e_close_tises(priv);
 
 	return (0);
 }
 
 #if (__FreeBSD_version >= 1100000)
 /*
  * Return the interface counter selected by "cnt" from the statistics
  * kept in "priv->stats". Counters without a case below are handed to
  * if_get_counter_default(). The values are read without taking the
  * priv lock, see the XXX notes.
  */
 static uint64_t
 mlx5e_get_counter(struct ifnet *ifp, ift_counter cnt)
 {
 	struct mlx5e_priv *priv = ifp->if_softc;
 	u64 retval;
 
 	/* PRIV_LOCK(priv); XXX not allowed */
 	switch (cnt) {
 	case IFCOUNTER_IPACKETS:
 		retval = priv->stats.vport.rx_packets;
 		break;
 	case IFCOUNTER_IERRORS:
 		/* sum of the physical port receive error counters */
 		retval = priv->stats.pport.in_range_len_errors +
 		    priv->stats.pport.out_of_range_len +
 		    priv->stats.pport.too_long_errors +
 		    priv->stats.pport.check_seq_err +
 		    priv->stats.pport.alignment_err;
 		break;
 	case IFCOUNTER_IQDROPS:
 		retval = priv->stats.vport.rx_out_of_buffer;
 		break;
 	case IFCOUNTER_OPACKETS:
 		retval = priv->stats.vport.tx_packets;
 		break;
 	case IFCOUNTER_OERRORS:
 		retval = priv->stats.port_stats_debug.out_discards;
 		break;
 	case IFCOUNTER_IBYTES:
 		retval = priv->stats.vport.rx_bytes;
 		break;
 	case IFCOUNTER_OBYTES:
 		retval = priv->stats.vport.tx_bytes;
 		break;
 	case IFCOUNTER_IMCASTS:
 		retval = priv->stats.vport.rx_multicast_packets;
 		break;
 	case IFCOUNTER_OMCASTS:
 		retval = priv->stats.vport.tx_multicast_packets;
 		break;
 	case IFCOUNTER_OQDROPS:
 		retval = priv->stats.vport.tx_queue_dropped;
 		break;
 	case IFCOUNTER_COLLISIONS:
 		retval = priv->stats.pport.collisions;
 		break;
 	default:
 		retval = if_get_counter_default(ifp, cnt);
 		break;
 	}
 	/* PRIV_UNLOCK(priv); XXX not allowed */
 	return (retval);
 }
 #endif
 
 /*
  * Queue the "set_rx_mode_work" work item on the private work queue;
  * the RX mode update itself is done by that work item.
  */
 static void
 mlx5e_set_rx_mode(struct ifnet *ifp)
 {
 	struct mlx5e_priv *priv;
 
 	priv = ifp->if_softc;
 	queue_work(priv->wq, &priv->set_rx_mode_work);
 }
 
 /*
  * Interface ioctl handler.
  *
  * Returns ENXIO when the softc is missing or marked as gone, else the
  * result of the handled command. Commands without a case below are
  * passed on to ether_ioctl().
  */
 static int
 mlx5e_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
 {
 	struct mlx5e_priv *priv;
 	struct ifreq *ifr;
 	struct ifdownreason *ifdr;
 	struct ifi2creq i2c;
 	int error = 0;
 	int mask = 0;
 	int size_read = 0;
 	int module_status;
 	int module_num;
 	int max_mtu;
 	uint8_t read_addr;
 
 	priv = ifp->if_softc;
 
 	/* check if detaching */
 	if (priv == NULL || priv->gone != 0)
 		return (ENXIO);
 
 	switch (command) {
 	case SIOCSIFMTU:
 		ifr = (struct ifreq *)data;
 
 		PRIV_LOCK(priv);
 		/*
 		 * NOTE(review): the return value of this query is not
 		 * checked - presumably "max_mtu" is always written;
 		 * confirm.
 		 */
 		mlx5_query_port_max_mtu(priv->mdev, &max_mtu);
 
 		if (ifr->ifr_mtu >= MLX5E_MTU_MIN &&
 		    ifr->ifr_mtu <= MIN(MLX5E_MTU_MAX, max_mtu)) {
 			int was_opened;
 
 			/* the interface is closed across the MTU change */
 			was_opened = test_bit(MLX5E_STATE_OPENED, &priv->state);
 			if (was_opened)
 				mlx5e_close_locked(ifp);
 
 			/* set new MTU */
 			mlx5e_set_dev_port_mtu(ifp, ifr->ifr_mtu);
 
 			if (was_opened)
 				mlx5e_open_locked(ifp);
 		} else {
 			error = EINVAL;
 			mlx5_en_err(ifp,
 			    "Invalid MTU value. Min val: %d, Max val: %d\n",
 			    MLX5E_MTU_MIN, MIN(MLX5E_MTU_MAX, max_mtu));
 		}
 		PRIV_UNLOCK(priv);
 		break;
 	case SIOCSIFFLAGS:
 		/* already up and running: only refresh the RX mode */
 		if ((ifp->if_flags & IFF_UP) &&
 		    (ifp->if_drv_flags & IFF_DRV_RUNNING)) {
 			mlx5e_set_rx_mode(ifp);
 			break;
 		}
 		PRIV_LOCK(priv);
 		if (ifp->if_flags & IFF_UP) {
 			/* up requested while not running: open the interface */
 			if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
 				if (test_bit(MLX5E_STATE_OPENED, &priv->state) == 0)
 					mlx5e_open_locked(ifp);
 				ifp->if_drv_flags |= IFF_DRV_RUNNING;
 				mlx5_set_port_status(priv->mdev, MLX5_PORT_UP);
 			}
 		} else {
 			/* down requested while running: close the interface */
 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
 				mlx5_set_port_status(priv->mdev,
 				    MLX5_PORT_DOWN);
 				if (test_bit(MLX5E_STATE_OPENED, &priv->state) != 0)
 					mlx5e_close_locked(ifp);
 				mlx5e_update_carrier(priv);
 				ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 			}
 		}
 		PRIV_UNLOCK(priv);
 		break;
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		mlx5e_set_rx_mode(ifp);
 		break;
 	case SIOCSIFMEDIA:
 	case SIOCGIFMEDIA:
 	case SIOCGIFXMEDIA:
 		ifr = (struct ifreq *)data;
 		error = ifmedia_ioctl(ifp, ifr, &priv->media, command);
 		break;
 	case SIOCSIFCAP:
 		ifr = (struct ifreq *)data;
 		PRIV_LOCK(priv);
 		/* bits that differ between the request and the current state */
 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
 
 		if (mask & IFCAP_TXCSUM) {
 			ifp->if_capenable ^= IFCAP_TXCSUM;
 			ifp->if_hwassist ^= (CSUM_TCP | CSUM_UDP | CSUM_IP);
 
 			/* TSO4 is switched off together with TXCSUM */
 			if (IFCAP_TSO4 & ifp->if_capenable &&
 			    !(IFCAP_TXCSUM & ifp->if_capenable)) {
 				mask &= ~IFCAP_TSO4;
 				ifp->if_capenable &= ~IFCAP_TSO4;
 				ifp->if_hwassist &= ~CSUM_IP_TSO;
 				mlx5_en_err(ifp,
 				    "tso4 disabled due to -txcsum.\n");
 			}
 		}
 		if (mask & IFCAP_TXCSUM_IPV6) {
 			ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
 			ifp->if_hwassist ^= (CSUM_UDP_IPV6 | CSUM_TCP_IPV6);
 
 			/* TSO6 is switched off together with TXCSUM_IPV6 */
 			if (IFCAP_TSO6 & ifp->if_capenable &&
 			    !(IFCAP_TXCSUM_IPV6 & ifp->if_capenable)) {
 				mask &= ~IFCAP_TSO6;
 				ifp->if_capenable &= ~IFCAP_TSO6;
 				ifp->if_hwassist &= ~CSUM_IP6_TSO;
 				mlx5_en_err(ifp,
 				    "tso6 disabled due to -txcsum6.\n");
 			}
 		}
 		if (mask & IFCAP_NOMAP)
 			ifp->if_capenable ^= IFCAP_NOMAP;
 		if (mask & IFCAP_TXTLS4)
 			ifp->if_capenable ^= IFCAP_TXTLS4;
 		if (mask & IFCAP_TXTLS6)
 			ifp->if_capenable ^= IFCAP_TXTLS6;
+#ifdef RATELIMIT
+		if (mask & IFCAP_TXTLS_RTLMT)
+			ifp->if_capenable ^= IFCAP_TXTLS_RTLMT;
+#endif
 		if (mask & IFCAP_RXCSUM)
 			ifp->if_capenable ^= IFCAP_RXCSUM;
 		if (mask & IFCAP_RXCSUM_IPV6)
 			ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
 		if (mask & IFCAP_TSO4) {
 			/* enabling TSO4 requires TXCSUM to be enabled */
 			if (!(IFCAP_TSO4 & ifp->if_capenable) &&
 			    !(IFCAP_TXCSUM & ifp->if_capenable)) {
 				mlx5_en_err(ifp, "enable txcsum first.\n");
 				error = EAGAIN;
 				goto out;
 			}
 			ifp->if_capenable ^= IFCAP_TSO4;
 			ifp->if_hwassist ^= CSUM_IP_TSO;
 		}
 		if (mask & IFCAP_TSO6) {
 			/* enabling TSO6 requires TXCSUM_IPV6 to be enabled */
 			if (!(IFCAP_TSO6 & ifp->if_capenable) &&
 			    !(IFCAP_TXCSUM_IPV6 & ifp->if_capenable)) {
 				mlx5_en_err(ifp, "enable txcsum6 first.\n");
 				error = EAGAIN;
 				goto out;
 			}
 			ifp->if_capenable ^= IFCAP_TSO6;
 			ifp->if_hwassist ^= CSUM_IP6_TSO;
 		}
 		if (mask & IFCAP_VLAN_HWFILTER) {
 			if (ifp->if_capenable & IFCAP_VLAN_HWFILTER)
 				mlx5e_disable_vlan_filter(priv);
 			else
 				mlx5e_enable_vlan_filter(priv);
 
 			ifp->if_capenable ^= IFCAP_VLAN_HWFILTER;
 		}
 		if (mask & IFCAP_VLAN_HWTAGGING)
 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
 		if (mask & IFCAP_WOL_MAGIC)
 			ifp->if_capenable ^= IFCAP_WOL_MAGIC;
 
 		VLAN_CAPABILITIES(ifp);
 		/* turn off LRO means also turn of HW LRO - if it's on */
 		if (mask & IFCAP_LRO) {
 			int was_opened = test_bit(MLX5E_STATE_OPENED, &priv->state);
 			bool need_restart = false;
 
 			ifp->if_capenable ^= IFCAP_LRO;
 
 			/* figure out if updating HW LRO is needed */
 			if (!(ifp->if_capenable & IFCAP_LRO)) {
 				if (priv->params.hw_lro_en) {
 					priv->params.hw_lro_en = false;
 					need_restart = true;
 				}
 			} else {
 				if (priv->params.hw_lro_en == false &&
 				    priv->params_ethtool.hw_lro != 0) {
 					priv->params.hw_lro_en = true;
 					need_restart = true;
 				}
 			}
 			/* a changed HW LRO setting is applied by a reopen */
 			if (was_opened && need_restart) {
 				mlx5e_close_locked(ifp);
 				mlx5e_open_locked(ifp);
 			}
 		}
 		if (mask & IFCAP_HWRXTSTMP) {
 			ifp->if_capenable ^= IFCAP_HWRXTSTMP;
 			if (ifp->if_capenable & IFCAP_HWRXTSTMP) {
 				if (priv->clbr_done == 0)
 					mlx5e_reset_calibration_callout(priv);
 			} else {
 				callout_drain(&priv->tstmp_clbr);
 				priv->clbr_done = 0;
 			}
 		}
 out:
 		PRIV_UNLOCK(priv);
 		break;
 
 	case SIOCGI2C:
 		ifr = (struct ifreq *)data;
 
 		/*
 		 * Copy from the user-space address ifr_data to the
 		 * kernel-space address i2c
 		 */
 		error = copyin(ifr_data_get_ptr(ifr), &i2c, sizeof(i2c));
 		if (error)
 			break;
 
 		/* reject requests larger than the i2c.data buffer */
 		if (i2c.len > sizeof(i2c.data)) {
 			error = EINVAL;
 			break;
 		}
 
 		PRIV_LOCK(priv);
 		/* Get module_num which is required for the query_eeprom */
 		error = mlx5_query_module_num(priv->mdev, &module_num);
 		if (error) {
 			mlx5_en_err(ifp,
 			    "Query module num failed, eeprom reading is not supported\n");
 			error = EINVAL;
 			goto err_i2c;
 		}
 		/* Check if module is present before doing an access */
 		module_status = mlx5_query_module_status(priv->mdev, module_num);
 		if (module_status != MLX5_MODULE_STATUS_PLUGGED_ENABLED) {
 			error = EINVAL;
 			goto err_i2c;
 		}
 		/*
 		 * Currently 0XA0 and 0xA2 are the only addresses permitted.
 		 * The internal conversion is as follows:
 		 */
 		if (i2c.dev_addr == 0xA0)
 			read_addr = MLX5_I2C_ADDR_LOW;
 		else if (i2c.dev_addr == 0xA2)
 			read_addr = MLX5_I2C_ADDR_HIGH;
 		else {
 			mlx5_en_err(ifp,
 			    "Query eeprom failed, Invalid Address: %X\n",
 			    i2c.dev_addr);
 			error = EINVAL;
 			goto err_i2c;
 		}
 		error = mlx5_query_eeprom(priv->mdev,
 		    read_addr, MLX5_EEPROM_LOW_PAGE,
 		    (uint32_t)i2c.offset, (uint32_t)i2c.len, module_num,
 		    (uint32_t *)i2c.data, &size_read);
 		if (error) {
 			mlx5_en_err(ifp,
 			    "Query eeprom failed, eeprom reading is not supported\n");
 			error = EINVAL;
 			goto err_i2c;
 		}
 
 		/*
 		 * Requests above MLX5_EEPROM_MAX_BYTES get a second query
 		 * which continues after the "size_read" bytes returned by
 		 * the first one.
 		 */
 		if (i2c.len > MLX5_EEPROM_MAX_BYTES) {
 			error = mlx5_query_eeprom(priv->mdev,
 			    read_addr, MLX5_EEPROM_LOW_PAGE,
 			    (uint32_t)(i2c.offset + size_read),
 			    (uint32_t)(i2c.len - size_read), module_num,
 			    (uint32_t *)(i2c.data + size_read), &size_read);
 		}
 		if (error) {
 			mlx5_en_err(ifp,
 			    "Query eeprom failed, eeprom reading is not supported\n");
 			error = EINVAL;
 			goto err_i2c;
 		}
 
 		error = copyout(&i2c, ifr_data_get_ptr(ifr), sizeof(i2c));
 err_i2c:
 		PRIV_UNLOCK(priv);
 		break;
 	case SIOCGIFDOWNREASON:
 		ifdr = (struct ifdownreason *)data;
 		bzero(ifdr->ifdr_msg, sizeof(ifdr->ifdr_msg));
 		PRIV_LOCK(priv);
 		/* the sign of the query result is flipped before returning it */
 		error = -mlx5_query_pddr_troubleshooting_info(priv->mdev, NULL,
 		    ifdr->ifdr_msg, sizeof(ifdr->ifdr_msg));
 		PRIV_UNLOCK(priv);
 		if (error == 0)
 			ifdr->ifdr_reason = IFDR_REASON_MSG;
 		break;
 
 	default:
 		error = ether_ioctl(ifp, command, data);
 		break;
 	}
 	return (error);
 }
 
 /*
  * Check that the device offers what this driver requires.
  *
  * Returns zero when the port type capability is Ethernet, else
  * -ENODEV.
  */
 static int
 mlx5e_check_required_hca_cap(struct mlx5_core_dev *mdev)
 {
 	/*
 	 * TODO: uncomment once FW really sets all these bits if
 	 * (!mdev->caps.eth.rss_ind_tbl_cap || !mdev->caps.eth.csum_cap ||
 	 * !mdev->caps.eth.max_lso_cap || !mdev->caps.eth.vlan_cap ||
 	 * !(mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_SCQE_BRK_MOD)) return
 	 * -ENOTSUPP;
 	 */
 
 	/* TODO: add more must-to-have features */
 
 	if (MLX5_CAP_GEN(mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH)
 		return (-ENODEV);
 
 	return (0);
 }
 
 /*
  * Compute the maximum TX inline size from the "log_bf_reg_size"
  * device capability and clamp the result to the range given by
  * "ETHER_VLAN_ENCAP_LEN + ETHER_HDR_LEN" and "MLX5E_MAX_TX_INLINE".
  */
 static u16
 mlx5e_get_max_inline_cap(struct mlx5_core_dev *mdev)
 {
 	const int lower_limit = ETHER_VLAN_ENCAP_LEN + ETHER_HDR_LEN;
 	const int upper_limit = MLX5E_MAX_TX_INLINE;
 	const int avail =
 	    ((1U << MLX5_CAP_GEN(mdev, log_bf_reg_size)) / 2U) -
 	    (sizeof(struct mlx5e_tx_wqe) - 2);
 
 	/* verify against driver limits, the upper limit is checked first */
 	if (avail > upper_limit)
 		return (upper_limit);
 	if (avail < lower_limit)
 		return (lower_limit);
 	return (avail);
 }
 
 /*
  * Fill in the default parameters of "priv" for device "mdev" using
  * "num_comp_vectors" as the number of channels, and initialize the
  * work items for statistics, carrier and RX mode updates.
  *
  * Returns zero on success or the error from mlx5_query_min_inline().
  * In the error case the fields set after that query are left
  * untouched.
  */
 static int
 mlx5e_build_ifp_priv(struct mlx5_core_dev *mdev,
     struct mlx5e_priv *priv,
     int num_comp_vectors)
 {
 	int err;
 
 	/*
 	 * TODO: Consider link speed for setting "log_sq_size",
 	 * "log_rq_size" and "cq_moderation_xxx":
 	 */
 	priv->params.log_sq_size =
 	    MLX5E_PARAMS_DEFAULT_LOG_SQ_SIZE;
 	priv->params.log_rq_size =
 	    MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE;
 	/* the RX moderation defaults depend on cq_period_start_from_cqe */
 	priv->params.rx_cq_moderation_usec =
 	    MLX5_CAP_GEN(mdev, cq_period_start_from_cqe) ?
 	    MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC_FROM_CQE :
 	    MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC;
 	priv->params.rx_cq_moderation_mode =
 	    MLX5_CAP_GEN(mdev, cq_period_start_from_cqe) ? 1 : 0;
 	priv->params.rx_cq_moderation_pkts =
 	    MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_PKTS;
 	priv->params.tx_cq_moderation_usec =
 	    MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_USEC;
 	priv->params.tx_cq_moderation_pkts =
 	    MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_PKTS;
 	priv->params.min_rx_wqes =
 	    MLX5E_PARAMS_DEFAULT_MIN_RX_WQES;
 	/* the larger of order_base_2(num_comp_vectors) and the default */
 	priv->params.rx_hash_log_tbl_sz =
 	    (order_base_2(num_comp_vectors) >
 	    MLX5E_PARAMS_DEFAULT_RX_HASH_LOG_TBL_SZ) ?
 	    order_base_2(num_comp_vectors) :
 	    MLX5E_PARAMS_DEFAULT_RX_HASH_LOG_TBL_SZ;
 	priv->params.num_tc = 1;
 	priv->params.default_vlan_prio = 0;
 	priv->counter_set_id = -1;
 	priv->params.tx_max_inline = mlx5e_get_max_inline_cap(mdev);
 
 	err = mlx5_query_min_inline(mdev, &priv->params.tx_min_inline_mode);
 	if (err)
 		return (err);
 
 	/*
 	 * hw lro is currently defaulted to off. when it won't anymore we
 	 * will consider the HW capability: "!!MLX5_CAP_ETH(mdev, lro_cap)"
 	 */
 	priv->params.hw_lro_en = false;
 	priv->params.lro_wqe_sz = MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ;
 
 	/*
 	 * CQE zipping is currently defaulted to off. when it won't
 	 * anymore we will consider the HW capability:
 	 * "!!MLX5_CAP_GEN(mdev, cqe_compression)"
 	 */
 	priv->params.cqe_zipping_en = false;
 
 	priv->mdev = mdev;
 	priv->params.num_channels = num_comp_vectors;
 	priv->params.channels_rsss = 1;
 	priv->order_base_2_num_channels = order_base_2(num_comp_vectors);
 	priv->queue_mapping_channel_mask =
 	    roundup_pow_of_two(num_comp_vectors) - 1;
 	/* copies of the values set in "priv->params" above */
 	priv->num_tc = priv->params.num_tc;
 	priv->default_vlan_prio = priv->params.default_vlan_prio;
 
 	INIT_WORK(&priv->update_stats_work, mlx5e_update_stats_work);
 	INIT_WORK(&priv->update_carrier_work, mlx5e_update_carrier_work);
 	INIT_WORK(&priv->set_rx_mode_work, mlx5e_set_rx_mode_work);
 
 	return (0);
 }
 
 /*
  * Create a memory key for protection domain "pdn" and store it in
  * "mkey".
  *
  * Returns zero on success, -ENOMEM when the command buffer cannot be
  * allocated, else the error from mlx5_core_create_mkey(). The command
  * buffer is freed before returning in all cases.
  */
 static int
 mlx5e_create_mkey(struct mlx5e_priv *priv, u32 pdn,
 		  struct mlx5_core_mr *mkey)
 {
 	struct ifnet *ifp = priv->ifp;
 	struct mlx5_core_dev *mdev = priv->mdev;
 	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
 	void *mkc;
 	u32 *in;
 	int err;
 
 	in = mlx5_vzalloc(inlen);
 	if (in == NULL) {
 		mlx5_en_err(ifp, "failed to allocate inbox\n");
 		return (-ENOMEM);
 	}
 
 	/* fill in the mkey context inside the command buffer */
 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
 	MLX5_SET(mkc, mkc, access_mode, MLX5_ACCESS_MODE_PA);
 	MLX5_SET(mkc, mkc, umr_en, 1);	/* used by HW TLS */
 	MLX5_SET(mkc, mkc, lw, 1);
 	MLX5_SET(mkc, mkc, lr, 1);
 
 	MLX5_SET(mkc, mkc, pd, pdn);
 	MLX5_SET(mkc, mkc, length64, 1);
 	MLX5_SET(mkc, mkc, qpn, 0xffffff);
 
 	err = mlx5_core_create_mkey(mdev, mkey, in, inlen);
 	if (err)
 		mlx5_en_err(ifp, "mlx5_core_create_mkey failed, %d\n",
 		    err);
 
 	kvfree(in);
 	return (err);
 }
 
 /* string table generated by MLX5E_VPORT_STATS() with MLX5E_STATS_DESC */
 static const char *mlx5e_vport_stats_desc[] = {
 	MLX5E_VPORT_STATS(MLX5E_STATS_DESC)
 };
 
 /* string table generated by MLX5E_PPORT_STATS() with MLX5E_STATS_DESC */
 static const char *mlx5e_pport_stats_desc[] = {
 	MLX5E_PPORT_STATS(MLX5E_STATS_DESC)
 };
 
 /*
  * Initialize the locks and the watchdog callout of "priv" and then
  * the static part of the first "channels" channel structures.
  */
 static void
 mlx5e_priv_static_init(struct mlx5e_priv *priv, const uint32_t channels)
 {
 	uint32_t ch = 0;
 
 	mtx_init(&priv->async_events_mtx, "mlx5async", MTX_NETWORK_LOCK, MTX_DEF);
 	sx_init(&priv->state_lock, "mlx5state");
 	/* the watchdog callout is tied to the async events mutex */
 	callout_init_mtx(&priv->watchdog, &priv->async_events_mtx, 0);
 	MLX5_INIT_DOORBELL_LOCK(&priv->doorbell_lock);
 
 	while (ch != channels) {
 		mlx5e_chan_static_init(priv, &priv->channel[ch], ch);
 		ch++;
 	}
 }
 
 /*
  * Destroy the static part of the first "channels" channel structures
  * and then the watchdog callout and the locks of "priv".
  */
 static void
 mlx5e_priv_static_destroy(struct mlx5e_priv *priv, const uint32_t channels)
 {
 	uint32_t ch = 0;
 
 	while (ch != channels) {
 		mlx5e_chan_static_destroy(&priv->channel[ch]);
 		ch++;
 	}
 
 	/* the callout is drained before its mutex is destroyed */
 	callout_drain(&priv->watchdog);
 	mtx_destroy(&priv->async_events_mtx);
 	sx_destroy(&priv->state_lock);
 }
 
 /*
  * Sysctl handler which reports the firmware revision of the device as
  * a "major.minor.sub" string. "arg1" is the driver's private
  * structure. Returns the result of sysctl_handle_string().
  */
 static int
 sysctl_firmware(SYSCTL_HANDLER_ARGS)
 {
 	/*
 	 * "%d.%d.%d" is the string format.
 	 * fw_rev_{maj,min,sub} return u16, 2^16 = 65536.
 	 * We need at most 5 chars to store each of them.
 	 * It also has: two "." and a NUL at the end, which means we need 18
 	 * (5*3 + 3) chars at most.
 	 */
 	char fw[18];
 	struct mlx5e_priv *priv = arg1;
 	int error;
 
 	snprintf(fw, sizeof(fw), "%d.%d.%d", fw_rev_maj(priv->mdev), fw_rev_min(priv->mdev),
 	    fw_rev_sub(priv->mdev));
 	error = sysctl_handle_string(oidp, fw, sizeof(fw), req);
 	return (error);
 }
 
 /*
  * Drain the first "num_tc" send queues of channel "ch".
  */
 static void
 mlx5e_disable_tx_dma(struct mlx5e_channel *ch)
 {
 	int tc = 0;
 
 	while (tc < ch->priv->num_tc) {
 		mlx5e_drain_sq(&ch->sq[tc]);
 		tc++;
 	}
 }
 
 /*
  * Write a doorbell made of the NOP opcode and the send queue number,
  * notify the hardware with it and then clear the doorbell value kept
  * in "sq".
  */
 static void
 mlx5e_reset_sq_doorbell_record(struct mlx5e_sq *sq)
 {
 
 	/* both words are stored in big endian byte order */
 	sq->doorbell.d32[0] = cpu_to_be32(MLX5_OPCODE_NOP);
 	sq->doorbell.d32[1] = cpu_to_be32(sq->sqn << 8);
 	mlx5e_tx_notify_hw(sq, sq->doorbell.d32, 0);
 	sq->doorbell.d64 = 0;
 }
 
 /*
  * Restart a send queue which is not running: move it from the ERR
  * state through RST to RDY, reset the "cc" and "pc" counters and the
  * doorbell record, and mark the queue as running.
  *
  * Failures to modify the queue state are logged only; the remaining
  * steps are carried out regardless.
  */
 void
 mlx5e_resume_sq(struct mlx5e_sq *sq)
 {
 	int err;
 
 	/* check if already enabled */
 	if (READ_ONCE(sq->running) != 0)
 		return;
 
 	err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_ERR,
 	    MLX5_SQC_STATE_RST);
 	if (err != 0) {
 		mlx5_en_err(sq->ifp,
 		    "mlx5e_modify_sq() from ERR to RST failed: %d\n", err);
 	}
 
 	sq->cc = 0;
 	sq->pc = 0;
 
 	/* reset doorbell prior to moving from RST to RDY */
 	mlx5e_reset_sq_doorbell_record(sq);
 
 	err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RST,
 	    MLX5_SQC_STATE_RDY);
 	if (err != 0) {
 		mlx5_en_err(sq->ifp,
 		    "mlx5e_modify_sq() from RST to RDY failed: %d\n", err);
 	}
 
 	sq->cev_next_state = MLX5E_CEV_STATE_INITIAL;
 	WRITE_ONCE(sq->running, 1);
 }
 
 /*
  * Resume the first "num_tc" send queues of channel "ch".
  */
 static void
 mlx5e_enable_tx_dma(struct mlx5e_channel *ch)
 {
 	int tc = 0;
 
 	while (tc < ch->priv->num_tc) {
 		mlx5e_resume_sq(&ch->sq[tc]);
 		tc++;
 	}
 }
 
 /*
  * Stop reception on the receive queue of channel "ch".
  *
  * The queue is marked as disabled and its watchdog callout is stopped
  * under the queue mutex. The queue is then moved from the RDY to the
  * ERR state, the completion handler is run once per millisecond until
  * the work queue is empty, and finally the queue is moved to the RST
  * state. Failures to modify the queue state are logged only.
  */
 static void
 mlx5e_disable_rx_dma(struct mlx5e_channel *ch)
 {
 	struct mlx5e_rq *rq = &ch->rq;
 	struct epoch_tracker et;
 	int err;
 
 	mtx_lock(&rq->mtx);
 	rq->enabled = 0;
 	callout_stop(&rq->watchdog);
 	mtx_unlock(&rq->mtx);
 
 	err = mlx5e_modify_rq(rq, MLX5_RQC_STATE_RDY, MLX5_RQC_STATE_ERR);
 	if (err != 0) {
 		/* the message names the transition that was attempted */
 		mlx5_en_err(rq->ifp,
 		    "mlx5e_modify_rq() from RDY to ERR failed: %d\n", err);
 	}
 
 	/* poll the completion queue until all receive WQEs are gone */
 	while (!mlx5_wq_ll_is_empty(&rq->wq)) {
 		msleep(1);
 		NET_EPOCH_ENTER(et);
 		rq->cq.mcq.comp(&rq->cq.mcq);
 		NET_EPOCH_EXIT(et);
 	}
 
 	/*
 	 * Transitioning into RST state will allow the FW to track less ERR state queues,
 	 * thus reducing the recv queue flushing time
 	 */
 	err = mlx5e_modify_rq(rq, MLX5_RQC_STATE_ERR, MLX5_RQC_STATE_RST);
 	if (err != 0) {
 		mlx5_en_err(rq->ifp,
 		    "mlx5e_modify_rq() from ERR to RST failed: %d\n", err);
 	}
 }
 
 /*
  * Restart reception on the receive queue of channel "ch": reset the
  * WQE counter and the doorbell record, move the queue from the RST to
  * the RDY state, mark it as enabled and run the completion handler
  * once. A failure to modify the queue state is logged only.
  */
 static void
 mlx5e_enable_rx_dma(struct mlx5e_channel *ch)
 {
 	struct mlx5e_rq *rq = &ch->rq;
 	struct epoch_tracker et;
 	int err;
 
 	rq->wq.wqe_ctr = 0;
 	mlx5_wq_ll_update_db_record(&rq->wq);
 	err = mlx5e_modify_rq(rq, MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY);
 	if (err != 0) {
 		mlx5_en_err(rq->ifp,
 		    "mlx5e_modify_rq() from RST to RDY failed: %d\n", err);
         }
 
 	rq->enabled = 1;
 
 	/* the completion handler is called inside a network epoch section */
 	NET_EPOCH_ENTER(et);
 	rq->cq.mcq.comp(&rq->cq.mcq);
 	NET_EPOCH_EXIT(et);
 }
 
 /*
  * Disable (non-zero "value") or enable (zero "value") TX DMA on all
  * channels. Does nothing unless the interface is opened.
  */
 void
 mlx5e_modify_tx_dma(struct mlx5e_priv *priv, uint8_t value)
 {
 	struct mlx5e_channel *pch;
 	int ch;
 
 	if (test_bit(MLX5E_STATE_OPENED, &priv->state) == 0)
 		return;
 
 	for (ch = 0; ch < priv->params.num_channels; ch++) {
 		pch = &priv->channel[ch];
 
 		if (value != 0)
 			mlx5e_disable_tx_dma(pch);
 		else
 			mlx5e_enable_tx_dma(pch);
 	}
 }
 
 /*
  * Disable (non-zero "value") or enable (zero "value") RX DMA on all
  * channels. Does nothing unless the interface is opened.
  */
 void
 mlx5e_modify_rx_dma(struct mlx5e_priv *priv, uint8_t value)
 {
 	struct mlx5e_channel *pch;
 	int ch;
 
 	if (test_bit(MLX5E_STATE_OPENED, &priv->state) == 0)
 		return;
 
 	for (ch = 0; ch < priv->params.num_channels; ch++) {
 		pch = &priv->channel[ch];
 
 		if (value != 0)
 			mlx5e_disable_rx_dma(pch);
 		else
 			mlx5e_enable_rx_dma(pch);
 	}
 }
 
 /*
  * Register the read-only "fw_version" and "board_id" sysctl nodes
  * below "priv->sysctl_hw".
  */
 static void
 mlx5e_add_hw_stats(struct mlx5e_priv *priv)
 {
 	/* the firmware version string is built on demand by the handler */
 	SYSCTL_ADD_PROC(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_hw),
 	    OID_AUTO, "fw_version", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
 	    priv, 0, sysctl_firmware, "A", "HCA firmware version");
 
 	SYSCTL_ADD_STRING(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_hw),
 	    OID_AUTO, "board_id", CTLFLAG_RD, priv->mdev->board_id, 0,
 	    "Board ID");
 }
 
 /*
  * Sysctl handler for the TX priority flow control setting.
  *
  * The setting is a bitmask in "priv->params.tx_priority_flow_control"
  * and is exchanged with the caller as MLX5E_MAX_PRIORITY bytes, one
  * per priority, each being zero or one. On a write every byte is
  * range checked and the port is updated when the bitmask changed.
  *
  * Returns zero on success, ERANGE when a byte is bigger than one, else
  * the error from the sysctl copy routines or from the port update.
  * The previous bitmask is restored whenever an error is returned.
  */
 static int
 mlx5e_sysctl_tx_priority_flow_control(SYSCTL_HANDLER_ARGS)
 {
 	struct mlx5e_priv *priv = arg1;
 	uint8_t temp[MLX5E_MAX_PRIORITY];
 	uint32_t tx_pfc;
 	int err;
 	int i;
 
 	PRIV_LOCK(priv);
 
 	/* remember the current value for comparison and rollback */
 	tx_pfc = priv->params.tx_priority_flow_control;
 
 	for (i = 0; i != MLX5E_MAX_PRIORITY; i++)
 		temp[i] = (tx_pfc >> i) & 1;
 
 	err = SYSCTL_OUT(req, temp, MLX5E_MAX_PRIORITY);
 	if (err || !req->newptr)
 		goto done;
 	err = SYSCTL_IN(req, temp, MLX5E_MAX_PRIORITY);
 	if (err)
 		goto done;
 
 	priv->params.tx_priority_flow_control = 0;
 
 	/* range check input value */
 	for (i = 0; i != MLX5E_MAX_PRIORITY; i++) {
 		if (temp[i] > 1) {
 			err = ERANGE;
 			goto done;
 		}
 		priv->params.tx_priority_flow_control |= (temp[i] << i);
 	}
 
 	/* check if update is required */
 	if (tx_pfc != priv->params.tx_priority_flow_control)
 		err = -mlx5e_set_port_pfc(priv);
 done:
 	if (err != 0)
 		priv->params.tx_priority_flow_control= tx_pfc;
 	PRIV_UNLOCK(priv);
 
 	return (err);
 }
 
 /*
  * Sysctl handler for the RX priority flow control setting.
  *
  * The setting is a bitmask in "priv->params.rx_priority_flow_control"
  * and is exchanged with the caller as MLX5E_MAX_PRIORITY bytes, one
  * per priority, each being zero or one. On a write every byte is
  * range checked and the port is updated when the bitmask changed.
  * When "priv->sw_is_port_buf_owner" is set, a successful port update
  * is followed by mlx5e_update_buf_lossy().
  *
  * Returns zero on success, ERANGE when a byte is bigger than one, else
  * the error from the sysctl copy routines or from the update calls.
  * The previous bitmask is restored whenever an error is returned.
  */
 static int
 mlx5e_sysctl_rx_priority_flow_control(SYSCTL_HANDLER_ARGS)
 {
 	struct mlx5e_priv *priv = arg1;
 	uint8_t temp[MLX5E_MAX_PRIORITY];
 	uint32_t rx_pfc;
 	int err;
 	int i;
 
 	PRIV_LOCK(priv);
 
 	/* remember the current value for comparison and rollback */
 	rx_pfc = priv->params.rx_priority_flow_control;
 
 	for (i = 0; i != MLX5E_MAX_PRIORITY; i++)
 		temp[i] = (rx_pfc >> i) & 1;
 
 	err = SYSCTL_OUT(req, temp, MLX5E_MAX_PRIORITY);
 	if (err || !req->newptr)
 		goto done;
 	err = SYSCTL_IN(req, temp, MLX5E_MAX_PRIORITY);
 	if (err)
 		goto done;
 
 	priv->params.rx_priority_flow_control = 0;
 
 	/* range check input value */
 	for (i = 0; i != MLX5E_MAX_PRIORITY; i++) {
 		if (temp[i] > 1) {
 			err = ERANGE;
 			goto done;
 		}
 		priv->params.rx_priority_flow_control |= (temp[i] << i);
 	}
 
 	/* check if update is required */
 	if (rx_pfc != priv->params.rx_priority_flow_control) {
 		err = -mlx5e_set_port_pfc(priv);
 		/*
 		 * NOTE(review): unlike the call above, this result is not
 		 * negated - confirm the sign convention of
 		 * mlx5e_update_buf_lossy().
 		 */
 		if (err == 0 && priv->sw_is_port_buf_owner)
 			err = mlx5e_update_buf_lossy(priv);
 	}
 done:
 	if (err != 0)
 		priv->params.rx_priority_flow_control= rx_pfc;
 	PRIV_UNLOCK(priv);
 
 	return (err);
 }
 
 /*
  * Set the default pause frame and priority flow control parameters,
  * register the sysctl nodes for them below "priv->sysctl_ifnet" and
  * push the resulting configuration to the firmware.
  *
  * When the firmware update fails with -EINVAL, priority flow control
  * is switched off in both directions and the update is retried once;
  * the result of the retry is ignored.
  */
 static void
 mlx5e_setup_pauseframes(struct mlx5e_priv *priv)
 {
 #if (__FreeBSD_version < 1100000)
 	char path[96];
 #endif
 	int error;
 
 	/* enable pauseframes by default */
 	priv->params.tx_pauseframe_control = 1;
 	priv->params.rx_pauseframe_control = 1;
 
 	/* disable ports flow control, PFC, by default */
 	priv->params.tx_priority_flow_control = 0;
 	priv->params.rx_priority_flow_control = 0;
 
 #if (__FreeBSD_version < 1100000)
 	/* compute path for sysctl */
 	snprintf(path, sizeof(path), "dev.mce.%d.tx_pauseframe_control",
 	    device_get_unit(priv->mdev->pdev->dev.bsddev));
 
 	/* try to fetch tunable, if any */
 	TUNABLE_INT_FETCH(path, &priv->params.tx_pauseframe_control);
 
 	/* compute path for sysctl */
 	snprintf(path, sizeof(path), "dev.mce.%d.rx_pauseframe_control",
 	    device_get_unit(priv->mdev->pdev->dev.bsddev));
 
 	/* try to fetch tunable, if any */
 	TUNABLE_INT_FETCH(path, &priv->params.rx_pauseframe_control);
 #endif
 
 	/* register pauseframe SYSCTLs */
 	SYSCTL_ADD_INT(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet),
 	    OID_AUTO, "tx_pauseframe_control", CTLFLAG_RDTUN,
 	    &priv->params.tx_pauseframe_control, 0,
 	    "Set to enable TX pause frames. Clear to disable.");
 
 	SYSCTL_ADD_INT(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet),
 	    OID_AUTO, "rx_pauseframe_control", CTLFLAG_RDTUN,
 	    &priv->params.rx_pauseframe_control, 0,
 	    "Set to enable RX pause frames. Clear to disable.");
 
 	/* register priority flow control, PFC, SYSCTLs */
 	SYSCTL_ADD_PROC(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet),
 	    OID_AUTO, "tx_priority_flow_control", CTLTYPE_U8 | CTLFLAG_RWTUN |
 	    CTLFLAG_MPSAFE, priv, 0, &mlx5e_sysctl_tx_priority_flow_control, "CU",
 	    "Set to enable TX ports flow control frames for priorities 0..7. Clear to disable.");
 
 	SYSCTL_ADD_PROC(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet),
 	    OID_AUTO, "rx_priority_flow_control", CTLTYPE_U8 | CTLFLAG_RWTUN |
 	    CTLFLAG_MPSAFE, priv, 0, &mlx5e_sysctl_rx_priority_flow_control, "CU",
 	    "Set to enable RX ports flow control frames for priorities 0..7. Clear to disable.");
 
 	PRIV_LOCK(priv);
 
 	/* range check */
 	priv->params.tx_pauseframe_control =
 	    priv->params.tx_pauseframe_control ? 1 : 0;
 	priv->params.rx_pauseframe_control =
 	    priv->params.rx_pauseframe_control ? 1 : 0;
 
 	/* update firmware */
 	error = mlx5e_set_port_pause_and_pfc(priv);
 	if (error == -EINVAL) {
 		mlx5_en_err(priv->ifp,
 		    "Global pauseframes must be disabled before enabling PFC.\n");
 		priv->params.rx_priority_flow_control = 0;
 		priv->params.tx_priority_flow_control = 0;
 
 		/* update firmware */
 		(void) mlx5e_set_port_pause_and_pfc(priv);
 	}
 	PRIV_UNLOCK(priv);
 }
 
 /*
  * Hand out a reference to the send tag embedded in the channel
  * selected by the flow ID in "params".
  *
  * Returns zero and stores the tag in "*ppmt" on success, EOPNOTSUPP
  * when the interface is gone or the flow type is M_HASHTYPE_NONE, and
  * ENXIO when the first send queue of the selected channel is not
  * running.
  */
 int
 mlx5e_ul_snd_tag_alloc(struct ifnet *ifp,
     union if_snd_tag_alloc_params *params,
     struct m_snd_tag **ppmt)
 {
 	struct mlx5e_priv *priv;
 	struct mlx5e_channel *pch;
 
 	priv = ifp->if_softc;
 
 	if (unlikely(priv->gone || params->hdr.flowtype == M_HASHTYPE_NONE)) {
 		return (EOPNOTSUPP);
 	} else {
 		/* keep this code synced with mlx5e_select_queue() */
 		u32 ch = priv->params.num_channels;
 #ifdef RSS
 		u32 temp;
 
 		/* use the RSS bucket when one is found for the flow */
 		if (rss_hash2bucket(params->hdr.flowid,
 		    params->hdr.flowtype, &temp) == 0)
 			ch = temp % ch;
 		else
 #endif
 			ch = (params->hdr.flowid % 128) % ch;
 
 		/*
 		 * NOTE: The channels array is only freed at detach
 		 * and it safe to return a pointer to the send tag
 		 * inside the channels structure as long as we
 		 * reference the priv.
 		 */
 		pch = priv->channel + ch;
 
 		/* check if send queue is not running */
 		if (unlikely(pch->sq[0].running == 0))
 			return (ENXIO);
 		m_snd_tag_ref(&pch->tag);
 		*ppmt = &pch->tag;
 		return (0);
 	}
 }
 
 /*
  * Answer a query on an unlimited send tag: the maximum rate is
  * reported as -1ULL and the queue level is taken from the first send
  * queue of the channel which embeds the tag. Always returns zero.
  */
 int
 mlx5e_ul_snd_tag_query(struct m_snd_tag *pmt, union if_snd_tag_query_params *params)
 {
 	struct mlx5e_channel *pch;
 
 	pch = container_of(pmt, struct mlx5e_channel, tag);
 
 	params->unlimited.max_rate = -1ULL;
 	params->unlimited.queue_level = mlx5e_sq_queue_level(&pch->sq[0]);
 
 	return (0);
 }
 
 /*
  * Free callback of an unlimited send tag: signal the completion of
  * the channel which embeds the tag.
  */
 void
 mlx5e_ul_snd_tag_free(struct m_snd_tag *pmt)
 {
 	struct mlx5e_channel *pch;
 
 	pch = container_of(pmt, struct mlx5e_channel, tag);
 	complete(&pch->completion);
 }
 
 /*
  * Send tag allocation entry point: dispatch to the allocator matching
  * the requested tag type. Which types are handled depends on the
  * RATELIMIT and KERN_TLS options. Returns EOPNOTSUPP for any other
  * type.
  */
 static int
 mlx5e_snd_tag_alloc(struct ifnet *ifp,
     union if_snd_tag_alloc_params *params,
     struct m_snd_tag **ppmt)
 {
 
 	switch (params->hdr.type) {
 #ifdef RATELIMIT
 	case IF_SND_TAG_TYPE_RATE_LIMIT:
 		return (mlx5e_rl_snd_tag_alloc(ifp, params, ppmt));
 #if defined(KERN_TLS) && defined(IF_SND_TAG_TYPE_TLS_RATE_LIMIT)
 	/* rate limited TLS tags go through the TLS allocator */
 	case IF_SND_TAG_TYPE_TLS_RATE_LIMIT:
 		return (mlx5e_tls_snd_tag_alloc(ifp, params, ppmt));
 #endif
 #endif
 	case IF_SND_TAG_TYPE_UNLIMITED:
 		return (mlx5e_ul_snd_tag_alloc(ifp, params, ppmt));
 #ifdef KERN_TLS
 	case IF_SND_TAG_TYPE_TLS:
 		return (mlx5e_tls_snd_tag_alloc(ifp, params, ppmt));
 #endif
 	default:
 		return (EOPNOTSUPP);
 	}
 }
 
 /*
  * Send tag modify entry point: dispatch on the tag type. Only the
  * rate limit types, when compiled in, have a modify routine; all
  * other types return EOPNOTSUPP.
  */
 static int
 mlx5e_snd_tag_modify(struct m_snd_tag *pmt, union if_snd_tag_modify_params *params)
 {
 
 	switch (pmt->type) {
 #ifdef RATELIMIT
 	case IF_SND_TAG_TYPE_RATE_LIMIT:
 		return (mlx5e_rl_snd_tag_modify(pmt, params));
 #if defined(KERN_TLS) && defined(IF_SND_TAG_TYPE_TLS_RATE_LIMIT)
 	case IF_SND_TAG_TYPE_TLS_RATE_LIMIT:
 		return (mlx5e_tls_snd_tag_modify(pmt, params));
 #endif
 #endif
 	/* the types below share the default result */
 	case IF_SND_TAG_TYPE_UNLIMITED:
 #ifdef KERN_TLS
 	case IF_SND_TAG_TYPE_TLS:
 #endif
 	default:
 		return (EOPNOTSUPP);
 	}
 }
 
 /*
  * Send tag query entry point: dispatch to the query routine matching
  * the tag type. Which types are handled depends on the RATELIMIT and
  * KERN_TLS options. Returns EOPNOTSUPP for any other type.
  */
 static int
 mlx5e_snd_tag_query(struct m_snd_tag *pmt, union if_snd_tag_query_params *params)
 {
 
 	switch (pmt->type) {
 #ifdef RATELIMIT
 	case IF_SND_TAG_TYPE_RATE_LIMIT:
 		return (mlx5e_rl_snd_tag_query(pmt, params));
 #if defined(KERN_TLS) && defined(IF_SND_TAG_TYPE_TLS_RATE_LIMIT)
 	case IF_SND_TAG_TYPE_TLS_RATE_LIMIT:
 		return (mlx5e_tls_snd_tag_query(pmt, params));
 #endif
 #endif
 	case IF_SND_TAG_TYPE_UNLIMITED:
 		return (mlx5e_ul_snd_tag_query(pmt, params));
 #ifdef KERN_TLS
 	case IF_SND_TAG_TYPE_TLS:
 		return (mlx5e_tls_snd_tag_query(pmt, params));
 #endif
 	default:
 		return (EOPNOTSUPP);
 	}
 }
 
 #ifdef RATELIMIT
 #define NUM_HDWR_RATES_MLX 13
 /*
  * Fixed table of rates reported by mlx5e_ratelimit_query(). The
  * comment after each entry is the entry multiplied by eight -
  * presumably bytes versus bits per second; TODO confirm the units.
  */
 static const uint64_t adapter_rates_mlx[NUM_HDWR_RATES_MLX] = {
 	135375,			/* 1,083,000 */
 	180500,			/* 1,444,000 */
 	270750,			/* 2,166,000 */
 	361000,			/* 2,888,000 */
 	541500,			/* 4,332,000 */
 	721875,			/* 5,775,000 */
 	1082875,		/* 8,663,000 */
 	1443875,		/* 11,551,000 */
 	2165750,		/* 17,326,000 */
 	2887750,		/* 23,102,000 */
 	4331625,		/* 34,653,000 */
 	5775500,		/* 46,204,000 */
 	8663125			/* 69,305,000 */
 };
 
 /*
  * Fill in "q" with the fixed rate table above. The "ifp" argument is
  * not used.
  */
 static void
 mlx5e_ratelimit_query(struct ifnet *ifp __unused, struct if_ratelimit_query_results *q)
 {
 	/*
 	 * This function needs updating by the driver maintainer!
 	 * For the MLX card there are currently (ConnectX-4?) 13
 	 * pre-set rates and others i.e. ConnectX-5, 6, 7??
 	 *
 	 * This will change based on later adapters
 	 * and this code should be updated to look at ifp
 	 * and figure out the specific adapter type
 	 * settings i.e. how many rates as well
 	 * as if they are fixed (as is shown here) or
 	 * if they are dynamic (example chelsio t4). Also if there
 	 * is a maximum number of flows that the adapter
 	 * can handle that too needs to be updated in
 	 * the max_flows field.
 	 */
 	q->rate_table = adapter_rates_mlx;
 	q->flags = RT_IS_FIXED_TABLE;
 	q->max_flows = 0;	/* mlx has no limit */
 	q->number_of_rates = NUM_HDWR_RATES_MLX;
 	q->min_segment_burst = 1;
 }
 #endif
 
 /*
  * Send tag release entry point.
  *
  * Hands the tag to the free routine of the backend matching its
  * type.  Tags of a type without a backend in this build are left
  * untouched.
  */
 static void
 mlx5e_snd_tag_free(struct m_snd_tag *pmt)
 {
 
 	switch (pmt->type) {
 #ifdef RATELIMIT
 	case IF_SND_TAG_TYPE_RATE_LIMIT:
 		mlx5e_rl_snd_tag_free(pmt);
 		return;
 #if defined(KERN_TLS) && defined(IF_SND_TAG_TYPE_TLS_RATE_LIMIT)
 	case IF_SND_TAG_TYPE_TLS_RATE_LIMIT:
 		mlx5e_tls_snd_tag_free(pmt);
 		return;
 #endif
 #endif
 	case IF_SND_TAG_TYPE_UNLIMITED:
 		mlx5e_ul_snd_tag_free(pmt);
 		return;
 #ifdef KERN_TLS
 	case IF_SND_TAG_TYPE_TLS:
 		mlx5e_tls_snd_tag_free(pmt);
 		return;
 #endif
 	default:
 		/* nothing to release for other tag types */
 		return;
 	}
 }
 
 static void *
 mlx5e_create_ifp(struct mlx5_core_dev *mdev)
 {
 	struct ifnet *ifp;
 	struct mlx5e_priv *priv;
 	u8 dev_addr[ETHER_ADDR_LEN] __aligned(4);
 	u8 connector_type;
 	struct sysctl_oid_list *child;
 	int ncv = mdev->priv.eq_table.num_comp_vectors;
 	char unit[16];
 	struct pfil_head_args pa;
 	int err;
 	int i,j;
 	u32 eth_proto_cap;
 	u32 out[MLX5_ST_SZ_DW(ptys_reg)];
 	bool ext = 0;
 	u32 speeds_num;
 	struct media media_entry = {};
 
 	if (mlx5e_check_required_hca_cap(mdev)) {
 		mlx5_core_dbg(mdev, "mlx5e_check_required_hca_cap() failed\n");
 		return (NULL);
 	}
 	/*
 	 * Try to allocate the priv and make room for worst-case
 	 * number of channel structures:
 	 */
 	priv = malloc(sizeof(*priv) +
 	    (sizeof(priv->channel[0]) * mdev->priv.eq_table.num_comp_vectors),
 	    M_MLX5EN, M_WAITOK | M_ZERO);
 
 	ifp = priv->ifp = if_alloc_dev(IFT_ETHER, mdev->pdev->dev.bsddev);
 	if (ifp == NULL) {
 		mlx5_core_err(mdev, "if_alloc() failed\n");
 		goto err_free_priv;
 	}
 	/* setup all static fields */
 	mlx5e_priv_static_init(priv, mdev->priv.eq_table.num_comp_vectors);
 
 	ifp->if_softc = priv;
 	if_initname(ifp, "mce", device_get_unit(mdev->pdev->dev.bsddev));
 	ifp->if_mtu = ETHERMTU;
 	ifp->if_init = mlx5e_open;
 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST |
 	    IFF_KNOWSEPOCH;
 	ifp->if_ioctl = mlx5e_ioctl;
 	ifp->if_transmit = mlx5e_xmit;
 	ifp->if_qflush = if_qflush;
 #if (__FreeBSD_version >= 1100000)
 	ifp->if_get_counter = mlx5e_get_counter;
 #endif
 	ifp->if_snd.ifq_maxlen = ifqmaxlen;
 	/*
          * Set driver features
          */
 	ifp->if_capabilities |= IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6;
 	ifp->if_capabilities |= IFCAP_VLAN_MTU | IFCAP_VLAN_HWTAGGING;
 	ifp->if_capabilities |= IFCAP_VLAN_HWCSUM | IFCAP_VLAN_HWFILTER;
 	ifp->if_capabilities |= IFCAP_LINKSTATE | IFCAP_JUMBO_MTU;
 	ifp->if_capabilities |= IFCAP_LRO;
 	ifp->if_capabilities |= IFCAP_TSO | IFCAP_VLAN_HWTSO;
 	ifp->if_capabilities |= IFCAP_HWSTATS | IFCAP_HWRXTSTMP;
 	ifp->if_capabilities |= IFCAP_NOMAP;
 	ifp->if_capabilities |= IFCAP_TXTLS4 | IFCAP_TXTLS6;
-	ifp->if_capabilities |= IFCAP_TXRTLMT;
+#ifdef RATELIMIT
+	ifp->if_capabilities |= IFCAP_TXRTLMT | IFCAP_TXTLS_RTLMT;
+#endif
 	ifp->if_snd_tag_alloc = mlx5e_snd_tag_alloc;
 	ifp->if_snd_tag_free = mlx5e_snd_tag_free;
 	ifp->if_snd_tag_modify = mlx5e_snd_tag_modify;
 	ifp->if_snd_tag_query = mlx5e_snd_tag_query;
 #ifdef RATELIMIT
 	ifp->if_ratelimit_query = mlx5e_ratelimit_query;
 #endif
 	/* set TSO limits so that we don't have to drop TX packets */
 	ifp->if_hw_tsomax = MLX5E_MAX_TX_PAYLOAD_SIZE - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
 	ifp->if_hw_tsomaxsegcount = MLX5E_MAX_TX_MBUF_FRAGS - 1 /* hdr */;
 	ifp->if_hw_tsomaxsegsize = MLX5E_MAX_TX_MBUF_SIZE;
 
 	ifp->if_capenable = ifp->if_capabilities;
 	ifp->if_hwassist = 0;
 	if (ifp->if_capenable & IFCAP_TSO)
 		ifp->if_hwassist |= CSUM_TSO;
 	if (ifp->if_capenable & IFCAP_TXCSUM)
 		ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP | CSUM_IP);
 	if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
 		ifp->if_hwassist |= (CSUM_UDP_IPV6 | CSUM_TCP_IPV6);
 
 	/* ifnet sysctl tree */
 	sysctl_ctx_init(&priv->sysctl_ctx);
 	priv->sysctl_ifnet = SYSCTL_ADD_NODE(&priv->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_dev),
 	    OID_AUTO, ifp->if_dname, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
 	    "MLX5 ethernet - interface name");
 	if (priv->sysctl_ifnet == NULL) {
 		mlx5_core_err(mdev, "SYSCTL_ADD_NODE() failed\n");
 		goto err_free_sysctl;
 	}
 	snprintf(unit, sizeof(unit), "%d", ifp->if_dunit);
 	priv->sysctl_ifnet = SYSCTL_ADD_NODE(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet),
 	    OID_AUTO, unit, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
 	    "MLX5 ethernet - interface unit");
 	if (priv->sysctl_ifnet == NULL) {
 		mlx5_core_err(mdev, "SYSCTL_ADD_NODE() failed\n");
 		goto err_free_sysctl;
 	}
 
 	/* HW sysctl tree */
 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(mdev->pdev->dev.bsddev));
 	priv->sysctl_hw = SYSCTL_ADD_NODE(&priv->sysctl_ctx, child,
 	    OID_AUTO, "hw", CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
 	    "MLX5 ethernet dev hw");
 	if (priv->sysctl_hw == NULL) {
 		mlx5_core_err(mdev, "SYSCTL_ADD_NODE() failed\n");
 		goto err_free_sysctl;
 	}
 
 	err = mlx5e_build_ifp_priv(mdev, priv, ncv);
 	if (err) {
 		mlx5_core_err(mdev, "mlx5e_build_ifp_priv() failed (%d)\n", err);
 		goto err_free_sysctl;
 	}
 
 	/* reuse mlx5core's watchdog workqueue */
 	priv->wq = mdev->priv.health.wq_watchdog;
 
 	err = mlx5_alloc_map_uar(mdev, &priv->cq_uar);
 	if (err) {
 		mlx5_en_err(ifp, "mlx5_alloc_map_uar failed, %d\n", err);
 		goto err_free_wq;
 	}
 	err = mlx5_core_alloc_pd(mdev, &priv->pdn);
 	if (err) {
 		mlx5_en_err(ifp, "mlx5_core_alloc_pd failed, %d\n", err);
 		goto err_unmap_free_uar;
 	}
 	err = mlx5_alloc_transport_domain(mdev, &priv->tdn);
 	if (err) {
 		mlx5_en_err(ifp,
 		    "mlx5_alloc_transport_domain failed, %d\n", err);
 		goto err_dealloc_pd;
 	}
 	err = mlx5e_create_mkey(priv, priv->pdn, &priv->mr);
 	if (err) {
 		mlx5_en_err(ifp, "mlx5e_create_mkey failed, %d\n", err);
 		goto err_dealloc_transport_domain;
 	}
 	mlx5_query_nic_vport_mac_address(priv->mdev, 0, dev_addr);
 
 	/* check if we should generate a random MAC address */
 	if (MLX5_CAP_GEN(priv->mdev, vport_group_manager) == 0 &&
 	    is_zero_ether_addr(dev_addr)) {
 		random_ether_addr(dev_addr);
 		mlx5_en_err(ifp, "Assigned random MAC address\n");
 	}
 
 	err = mlx5e_rl_init(priv);
 	if (err) {
 		mlx5_en_err(ifp, "mlx5e_rl_init failed, %d\n", err);
 		goto err_create_mkey;
 	}
 
 	err = mlx5e_tls_init(priv);
 	if (err) {
 		if_printf(ifp, "%s: mlx5e_tls_init failed\n", __func__);
 		goto err_rl_init;
 	}
 
 	/* set default MTU */
 	mlx5e_set_dev_port_mtu(ifp, ifp->if_mtu);
 
 	/* Set default media status */
 	priv->media_status_last = IFM_AVALID;
 	priv->media_active_last = IFM_ETHER | IFM_AUTO |
 	    IFM_ETH_RXPAUSE | IFM_FDX;
 
 	/* setup default pauseframes configuration */
 	mlx5e_setup_pauseframes(priv);
 
 	/* Setup supported medias */
 	//TODO: If we failed to query ptys is it ok to proceed??
 	if (!mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN, 1)) {
 		ext = MLX5_CAP_PCAM_FEATURE(mdev,
 		    ptys_extended_ethernet);
 		eth_proto_cap = MLX5_GET_ETH_PROTO(ptys_reg, out, ext,
 		    eth_proto_capability);
 		if (MLX5_CAP_PCAM_FEATURE(mdev, ptys_connector_type))
 			connector_type = MLX5_GET(ptys_reg, out,
 			    connector_type);
 	} else {
 		eth_proto_cap = 0;
 		mlx5_en_err(ifp, "Query port media capability failed, %d\n", err);
 	}
 
 	ifmedia_init(&priv->media, IFM_IMASK | IFM_ETH_FMASK,
 	    mlx5e_media_change, mlx5e_media_status);
 
 	speeds_num = ext ? MLX5E_EXT_LINK_SPEEDS_NUMBER : MLX5E_LINK_SPEEDS_NUMBER;
 	for (i = 0; i != speeds_num; i++) {
 		for (j = 0; j < MLX5E_LINK_MODES_NUMBER ; ++j) {
 			media_entry = ext ? mlx5e_ext_mode_table[i][j] :
 			    mlx5e_mode_table[i][j];
 			if (media_entry.baudrate == 0)
 				continue;
 			if (MLX5E_PROT_MASK(i) & eth_proto_cap) {
 				ifmedia_add(&priv->media,
 				    media_entry.subtype |
 				    IFM_ETHER, 0, NULL);
 				ifmedia_add(&priv->media,
 				    media_entry.subtype |
 				    IFM_ETHER | IFM_FDX |
 				    IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE, 0, NULL);
 			}
 		}
 	}
 
 	ifmedia_add(&priv->media, IFM_ETHER | IFM_AUTO, 0, NULL);
 	ifmedia_add(&priv->media, IFM_ETHER | IFM_AUTO | IFM_FDX |
 	    IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE, 0, NULL);
 
 	/* Set autoselect by default */
 	ifmedia_set(&priv->media, IFM_ETHER | IFM_AUTO | IFM_FDX |
 	    IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE);
 
 	DEBUGNET_SET(ifp, mlx5_en);
 
 	ether_ifattach(ifp, dev_addr);
 
 	/* Register for VLAN events */
 	priv->vlan_attach = EVENTHANDLER_REGISTER(vlan_config,
 	    mlx5e_vlan_rx_add_vid, priv, EVENTHANDLER_PRI_FIRST);
 	priv->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig,
 	    mlx5e_vlan_rx_kill_vid, priv, EVENTHANDLER_PRI_FIRST);
 
 	/* Link is down by default */
 	if_link_state_change(ifp, LINK_STATE_DOWN);
 
 	mlx5e_enable_async_events(priv);
 
 	mlx5e_add_hw_stats(priv);
 
 	mlx5e_create_stats(&priv->stats.vport.ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet),
 	    "vstats", mlx5e_vport_stats_desc, MLX5E_VPORT_STATS_NUM,
 	    priv->stats.vport.arg);
 
 	mlx5e_create_stats(&priv->stats.pport.ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet),
 	    "pstats", mlx5e_pport_stats_desc, MLX5E_PPORT_STATS_NUM,
 	    priv->stats.pport.arg);
 
 	mlx5e_create_ethtool(priv);
 
 	mtx_lock(&priv->async_events_mtx);
 	mlx5e_update_stats(priv);
 	mtx_unlock(&priv->async_events_mtx);
 
 	SYSCTL_ADD_INT(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet),
 	    OID_AUTO, "rx_clbr_done", CTLFLAG_RD,
 	    &priv->clbr_done, 0,
 	    "RX timestamps calibration state");
 	callout_init(&priv->tstmp_clbr, CALLOUT_DIRECT);
 	mlx5e_reset_calibration_callout(priv);
 
 	pa.pa_version = PFIL_VERSION;
 	pa.pa_flags = PFIL_IN;
 	pa.pa_type = PFIL_TYPE_ETHERNET;
 	pa.pa_headname = ifp->if_xname;
 	priv->pfil = pfil_head_register(&pa);
 
 	return (priv);
 
 err_rl_init:
 	mlx5e_rl_cleanup(priv);
 
 err_create_mkey:
 	mlx5_core_destroy_mkey(priv->mdev, &priv->mr);
 
 err_dealloc_transport_domain:
 	mlx5_dealloc_transport_domain(mdev, priv->tdn);
 
 err_dealloc_pd:
 	mlx5_core_dealloc_pd(mdev, priv->pdn);
 
 err_unmap_free_uar:
 	mlx5_unmap_free_uar(mdev, &priv->cq_uar);
 
 err_free_wq:
 	flush_workqueue(priv->wq);
 
 err_free_sysctl:
 	sysctl_ctx_free(&priv->sysctl_ctx);
 	if (priv->sysctl_debug)
 		sysctl_ctx_free(&priv->stats.port_stats_debug.ctx);
 	mlx5e_priv_static_destroy(priv, mdev->priv.eq_table.num_comp_vectors);
 	if_free(ifp);
 
 err_free_priv:
 	free(priv, M_MLX5EN);
 	return (NULL);
 }
 
 /*
  * Detach and free an interface created by mlx5e_create_ifp().
  *
  * This is the "remove" callback of mlx5e_interface; "vpriv" is the
  * private softc which mlx5e_create_ifp() returned.  The function
  * blocks new IOCTLs, waits for outstanding send tags, closes the
  * device, detaches the ifnet and then releases the TLS and rate
  * limit state, the sysctl contexts, the hardware resources and
  * finally the ifnet and the softc itself.
  */
 static void
 mlx5e_destroy_ifp(struct mlx5_core_dev *mdev, void *vpriv)
 {
 	struct mlx5e_priv *priv = vpriv;
 	struct ifnet *ifp = priv->ifp;
 
 	/* don't allow more IOCTLs */
 	priv->gone = 1;
 
 	/* XXX wait a bit to allow IOCTL handlers to complete */
 	pause("W", hz);
 
 #ifdef RATELIMIT
 	/*
 	 * The kernel can have reference(s) via the m_snd_tag's into
 	 * the ratelimit channels, and these must go away before
 	 * detaching:
 	 */
 	while (READ_ONCE(priv->rl.stats.tx_active_connections) != 0) {
 		mlx5_en_err(priv->ifp,
 		    "Waiting for all ratelimit connections to terminate\n");
 		pause("W", hz);
 	}
 #endif
 	/* wait for all unlimited send tags to complete */
 	mlx5e_priv_wait_for_completion(priv, mdev->priv.eq_table.num_comp_vectors);
 
 	/* stop watchdog timer */
 	callout_drain(&priv->watchdog);
 
 	/* stop the RX timestamp calibration callout */
 	callout_drain(&priv->tstmp_clbr);
 
 	/* undo the VLAN event handler registrations, if any */
 	if (priv->vlan_attach != NULL)
 		EVENTHANDLER_DEREGISTER(vlan_config, priv->vlan_attach);
 	if (priv->vlan_detach != NULL)
 		EVENTHANDLER_DEREGISTER(vlan_unconfig, priv->vlan_detach);
 
 	/* make sure device gets closed */
 	PRIV_LOCK(priv);
 	mlx5e_close_locked(ifp);
 	PRIV_UNLOCK(priv);
 
 	/* deregister pfil */
 	if (priv->pfil != NULL) {
 		pfil_head_unregister(priv->pfil);
 		priv->pfil = NULL;
 	}
 
 	/* unregister device */
 	ifmedia_removeall(&priv->media);
 	ether_ifdetach(ifp);
 
 	/* tear down TLS before rate limiting, the reverse of the init order */
 	mlx5e_tls_cleanup(priv);
 	mlx5e_rl_cleanup(priv);
 
 	/* destroy all remaining sysctl nodes */
 	sysctl_ctx_free(&priv->stats.vport.ctx);
 	sysctl_ctx_free(&priv->stats.pport.ctx);
 	if (priv->sysctl_debug)
 		sysctl_ctx_free(&priv->stats.port_stats_debug.ctx);
 	sysctl_ctx_free(&priv->sysctl_ctx);
 
 	/* release the hardware resources in reverse order of allocation */
 	mlx5_core_destroy_mkey(priv->mdev, &priv->mr);
 	mlx5_dealloc_transport_domain(priv->mdev, priv->tdn);
 	mlx5_core_dealloc_pd(priv->mdev, priv->pdn);
 	mlx5_unmap_free_uar(priv->mdev, &priv->cq_uar);
 	mlx5e_disable_async_events(priv);
 	flush_workqueue(priv->wq);
 	mlx5e_priv_static_destroy(priv, mdev->priv.eq_table.num_comp_vectors);
 	if_free(ifp);
 	free(priv, M_MLX5EN);
 }
 
 #ifdef DEBUGNET
 /*
  * Debugnet init hook: reports, under the private lock, the number of
  * configured channels as the receive ring count, together with
  * DEBUGNET_MAX_IN_FLIGHT as the cluster count and MLX5E_MAX_RX_BYTES
  * as the cluster size.
  */
 static void
 mlx5_en_debugnet_init(struct ifnet *dev, int *nrxr, int *ncl, int *clsize)
 {
 	struct mlx5e_priv *sc;
 
 	sc = if_getsoftc(dev);
 
 	PRIV_LOCK(sc);
 	*clsize = MLX5E_MAX_RX_BYTES;
 	*ncl = DEBUGNET_MAX_IN_FLIGHT;
 	*nrxr = sc->params.num_channels;
 	PRIV_UNLOCK(sc);
 }
 
 /*
  * Debugnet event hook.  Intentionally empty: this driver takes no
  * action on any debugnet event.
  */
 static void
 mlx5_en_debugnet_event(struct ifnet *dev, enum debugnet_ev event)
 {
 }
 
 static int
 mlx5_en_debugnet_transmit(struct ifnet *dev, struct mbuf *m)
 {
 	struct mlx5e_priv *priv = if_getsoftc(dev);
 	struct mlx5e_sq *sq;
 	int err;
 
 	if ((if_getdrvflags(dev) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
 	    IFF_DRV_RUNNING || (priv->media_status_last & IFM_ACTIVE) == 0)
 		return (ENOENT);
 
 	sq = &priv->channel[0].sq[0];
 
 	if (sq->running == 0) {
 		m_freem(m);
 		return (ENOENT);
 	}
 
 	if (mlx5e_sq_xmit(sq, &m) != 0) {
 		m_freem(m);
 		err = ENOBUFS;
 	} else {
 		err = 0;
 	}
 
 	if (likely(sq->doorbell.d64 != 0)) {
 		mlx5e_tx_notify_hw(sq, sq->doorbell.d32, 0);
 		sq->doorbell.d64 = 0;
 	}
 	return (err);
 }
 
 /*
  * Debugnet poll hook: polls the interrupts of the underlying mlx5
  * core device.  Returns ENOENT when the interface is not running or
  * has no active media and zero otherwise.  "count" is not used.
  */
 static int
 mlx5_en_debugnet_poll(struct ifnet *dev, int count)
 {
 	struct mlx5e_priv *sc = if_getsoftc(dev);
 
 	if ((if_getdrvflags(dev) & IFF_DRV_RUNNING) == 0)
 		return (ENOENT);
 	if ((sc->media_status_last & IFM_ACTIVE) == 0)
 		return (ENOENT);
 
 	mlx5_poll_interrupts(sc->mdev);
 	return (0);
 }
 #endif /* DEBUGNET */
 
 /*
  * "get_dev" callback of mlx5e_interface: returns the ifnet stored in
  * the given private softc.
  */
 static void *
 mlx5e_get_ifp(void *vpriv)
 {
 
 	return (((struct mlx5e_priv *)vpriv)->ifp);
 }
 
 /*
  * Ethernet protocol interface description, registered with the mlx5
  * core by mlx5e_init() and unregistered by mlx5e_cleanup().  The add
  * and remove callbacks create and destroy the network interface of a
  * core device, get_dev returns its ifnet.
  */
 static struct mlx5_interface mlx5e_interface = {
 	.add = mlx5e_create_ifp,
 	.remove = mlx5e_destroy_ifp,
 	.event = mlx5e_async_event,
 	.protocol = MLX5_INTERFACE_PROTOCOL_ETH,
 	.get_dev = mlx5e_get_ifp,
 };
 
 /*
  * Module initialization: registers the ethernet interface
  * description with the mlx5 core.
  */
 void
 mlx5e_init(void)
 {
 	mlx5_register_interface(&mlx5e_interface);
 }
 
 /*
  * Module cleanup: unregisters the ethernet interface description
  * from the mlx5 core.
  */
 void
 mlx5e_cleanup(void)
 {
 	mlx5_unregister_interface(&mlx5e_interface);
 }
 
 /*
  * Prints the driver version string.  Hooked up through the SYSINIT
  * below at SI_SUB_DRIVERS; the argument is not used.
  */
 static void
 mlx5e_show_version(void __unused *arg)
 {
 
 	printf("%s", mlx5e_version);
 }
 SYSINIT(mlx5e_show_version, SI_SUB_DRIVERS, SI_ORDER_ANY, mlx5e_show_version, NULL);
 
 /* module init and exit hooks, both ordered at SI_ORDER_SIXTH */
 module_init_order(mlx5e_init, SI_ORDER_SIXTH);
 module_exit_order(mlx5e_cleanup, SI_ORDER_SIXTH);
 
 /* module dependencies and version of the "mlx5en" module */
 #if (__FreeBSD_version >= 1100000)
 MODULE_DEPEND(mlx5en, linuxkpi, 1, 1, 1);
 #endif
 MODULE_DEPEND(mlx5en, mlx5, 1, 1, 1);
 MODULE_VERSION(mlx5en, 1);
diff --git a/sys/kern/uipc_ktls.c b/sys/kern/uipc_ktls.c
index ac90e1d45f51..c048f708ecfe 100644
--- a/sys/kern/uipc_ktls.c
+++ b/sys/kern/uipc_ktls.c
@@ -1,2076 +1,2133 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause
  *
  * Copyright (c) 2014-2019 Netflix Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/ktls.h>
 #include <sys/lock.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/rmlock.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/refcount.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/taskqueue.h>
 #include <sys/kthread.h>
 #include <sys/uio.h>
 #include <sys/vmmeter.h>
 #if defined(__aarch64__) || defined(__amd64__) || defined(__i386__)
 #include <machine/pcb.h>
 #endif
 #include <machine/vmparam.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #ifdef RSS
 #include <net/netisr.h>
 #include <net/rss_config.h>
 #endif
 #include <net/route.h>
 #include <net/route/nhop.h>
 #if defined(INET) || defined(INET6)
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #endif
 #include <netinet/tcp_var.h>
 #ifdef TCP_OFFLOAD
 #include <netinet/tcp_offload.h>
 #endif
 #include <opencrypto/xform.h>
 #include <vm/uma_dbg.h>
 #include <vm/vm.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_page.h>
 
 /*
  * Work queue serviced by one KTLS worker thread.  ktls_init()
  * allocates an array of these and starts one thread per CPU, each
  * with its own queue.  The structure is aligned to the cache line
  * size.
  */
 struct ktls_wq {
 	/* presumably protects the two lists below -- TODO confirm */
 	struct mtx	mtx;
 	/* queued mbufs (presumably TLS records awaiting encryption) */
 	STAILQ_HEAD(, mbuf) m_head;
 	/* queued sockets (presumably awaiting decryption) */
 	STAILQ_HEAD(, socket) so_head;
 	/* NOTE(review): not referenced in this part of the file */
 	bool		running;
 } __aligned(CACHE_LINE_SIZE);
 
 /* array of work queues, indexed by CPU id; allocated in ktls_init() */
 static struct ktls_wq *ktls_wq;
 /* process which the worker threads are added to in ktls_init() */
 static struct proc *ktls_proc;
 /*
  * Registered crypto backends, kept in descending priority order by
  * ktls_crypto_backend_register() and modified under the write lock
  * of ktls_backends_lock.
  */
 LIST_HEAD(, ktls_crypto_backend) ktls_backends;
 static struct rmlock ktls_backends_lock;
 /* UMA zone for struct ktls_session */
 static uma_zone_t ktls_session_zone;
 /* maps a worker thread index to its CPU id; filled in by ktls_init() */
 static uint16_t ktls_cpuid_lookup[MAXCPU];
 
 SYSCTL_NODE(_kern_ipc, OID_AUTO, tls, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Kernel TLS offload");
 SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Kernel TLS offload stats");
 
 static int ktls_allow_unload;
 SYSCTL_INT(_kern_ipc_tls, OID_AUTO, allow_unload, CTLFLAG_RDTUN,
     &ktls_allow_unload, 0, "Allow software crypto modules to unload");
 
 #ifdef RSS
 static int ktls_bind_threads = 1;
 #else
 static int ktls_bind_threads;
 #endif
 SYSCTL_INT(_kern_ipc_tls, OID_AUTO, bind_threads, CTLFLAG_RDTUN,
     &ktls_bind_threads, 0,
     "Bind crypto threads to cores or domains at boot");
 
 static u_int ktls_maxlen = 16384;
 SYSCTL_UINT(_kern_ipc_tls, OID_AUTO, maxlen, CTLFLAG_RWTUN,
     &ktls_maxlen, 0, "Maximum TLS record size");
 
 static int ktls_number_threads;
 SYSCTL_INT(_kern_ipc_tls_stats, OID_AUTO, threads, CTLFLAG_RD,
     &ktls_number_threads, 0,
     "Number of TLS threads in thread-pool");
 
 static bool ktls_offload_enable;
 SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, enable, CTLFLAG_RW,
     &ktls_offload_enable, 0,
     "Enable support for kernel TLS offload");
 
 static bool ktls_cbc_enable = true;
 SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, cbc_enable, CTLFLAG_RW,
     &ktls_cbc_enable, 1,
     "Enable Support of AES-CBC crypto for kernel TLS");
 
 static counter_u64_t ktls_tasks_active;
 SYSCTL_COUNTER_U64(_kern_ipc_tls, OID_AUTO, tasks_active, CTLFLAG_RD,
     &ktls_tasks_active, "Number of active tasks");
 
 static counter_u64_t ktls_cnt_tx_queued;
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, sw_tx_inqueue, CTLFLAG_RD,
     &ktls_cnt_tx_queued,
     "Number of TLS records in queue to tasks for SW encryption");
 
 static counter_u64_t ktls_cnt_rx_queued;
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, sw_rx_inqueue, CTLFLAG_RD,
     &ktls_cnt_rx_queued,
     "Number of TLS sockets in queue to tasks for SW decryption");
 
 static counter_u64_t ktls_offload_total;
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, offload_total,
     CTLFLAG_RD, &ktls_offload_total,
     "Total successful TLS setups (parameters set)");
 
 static counter_u64_t ktls_offload_enable_calls;
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, enable_calls,
     CTLFLAG_RD, &ktls_offload_enable_calls,
     "Total number of TLS enable calls made");
 
 static counter_u64_t ktls_offload_active;
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, active, CTLFLAG_RD,
     &ktls_offload_active, "Total Active TLS sessions");
 
 static counter_u64_t ktls_offload_corrupted_records;
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, corrupted_records, CTLFLAG_RD,
     &ktls_offload_corrupted_records, "Total corrupted TLS records received");
 
 static counter_u64_t ktls_offload_failed_crypto;
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, failed_crypto, CTLFLAG_RD,
     &ktls_offload_failed_crypto, "Total TLS crypto failures");
 
 static counter_u64_t ktls_switch_to_ifnet;
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_to_ifnet, CTLFLAG_RD,
     &ktls_switch_to_ifnet, "TLS sessions switched from SW to ifnet");
 
 static counter_u64_t ktls_switch_to_sw;
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_to_sw, CTLFLAG_RD,
     &ktls_switch_to_sw, "TLS sessions switched from ifnet to SW");
 
 static counter_u64_t ktls_switch_failed;
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_failed, CTLFLAG_RD,
     &ktls_switch_failed, "TLS sessions unable to switch between SW and ifnet");
 
 SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, sw, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "Software TLS session stats");
 SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, ifnet, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "Hardware (ifnet) TLS session stats");
 #ifdef TCP_OFFLOAD
 SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, toe, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "TOE TLS session stats");
 #endif
 
 static counter_u64_t ktls_sw_cbc;
 SYSCTL_COUNTER_U64(_kern_ipc_tls_sw, OID_AUTO, cbc, CTLFLAG_RD, &ktls_sw_cbc,
     "Active number of software TLS sessions using AES-CBC");
 
 static counter_u64_t ktls_sw_gcm;
 SYSCTL_COUNTER_U64(_kern_ipc_tls_sw, OID_AUTO, gcm, CTLFLAG_RD, &ktls_sw_gcm,
     "Active number of software TLS sessions using AES-GCM");
 
 static counter_u64_t ktls_ifnet_cbc;
 SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, cbc, CTLFLAG_RD,
     &ktls_ifnet_cbc,
     "Active number of ifnet TLS sessions using AES-CBC");
 
 static counter_u64_t ktls_ifnet_gcm;
 SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, gcm, CTLFLAG_RD,
     &ktls_ifnet_gcm,
     "Active number of ifnet TLS sessions using AES-GCM");
 
 static counter_u64_t ktls_ifnet_reset;
 SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset, CTLFLAG_RD,
     &ktls_ifnet_reset, "TLS sessions updated to a new ifnet send tag");
 
 static counter_u64_t ktls_ifnet_reset_dropped;
 SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset_dropped, CTLFLAG_RD,
     &ktls_ifnet_reset_dropped,
     "TLS sessions dropped after failing to update ifnet send tag");
 
 static counter_u64_t ktls_ifnet_reset_failed;
 SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset_failed, CTLFLAG_RD,
     &ktls_ifnet_reset_failed,
     "TLS sessions that failed to allocate a new ifnet send tag");
 
 /*
  * Administrative switch for hardware (ifnet) TLS sessions.
  *
  * NOTE(review): the variable is declared "int" but exported with
  * SYSCTL_UINT.  It also has no initializer, so it starts out as zero
  * unless set by a tunable -- confirm that the 1 passed to the macro
  * is not expected to act as the default value.
  */
 static int ktls_ifnet_permitted;
 SYSCTL_UINT(_kern_ipc_tls_ifnet, OID_AUTO, permitted, CTLFLAG_RWTUN,
     &ktls_ifnet_permitted, 1,
     "Whether to permit hardware (ifnet) TLS sessions");
 
 #ifdef TCP_OFFLOAD
 static counter_u64_t ktls_toe_cbc;
 SYSCTL_COUNTER_U64(_kern_ipc_tls_toe, OID_AUTO, cbc, CTLFLAG_RD,
     &ktls_toe_cbc,
     "Active number of TOE TLS sessions using AES-CBC");
 
 static counter_u64_t ktls_toe_gcm;
 SYSCTL_COUNTER_U64(_kern_ipc_tls_toe, OID_AUTO, gcm, CTLFLAG_RD,
     &ktls_toe_gcm,
     "Active number of TOE TLS sessions using AES-GCM");
 #endif
 
 static MALLOC_DEFINE(M_KTLS, "ktls", "Kernel TLS");
 
 static void ktls_cleanup(struct ktls_session *tls);
 #if defined(INET) || defined(INET6)
 static void ktls_reset_send_tag(void *context, int pending);
 #endif
 static void ktls_work_thread(void *ctx);
 
 int
 ktls_crypto_backend_register(struct ktls_crypto_backend *be)
 {
 	struct ktls_crypto_backend *curr_be, *tmp;
 
 	if (be->api_version != KTLS_API_VERSION) {
 		printf("KTLS: API version mismatch (%d vs %d) for %s\n",
 		    be->api_version, KTLS_API_VERSION,
 		    be->name);
 		return (EINVAL);
 	}
 
 	rm_wlock(&ktls_backends_lock);
 	printf("KTLS: Registering crypto method %s with prio %d\n",
 	       be->name, be->prio);
 	if (LIST_EMPTY(&ktls_backends)) {
 		LIST_INSERT_HEAD(&ktls_backends, be, next);
 	} else {
 		LIST_FOREACH_SAFE(curr_be, &ktls_backends, next, tmp) {
 			if (curr_be->prio < be->prio) {
 				LIST_INSERT_BEFORE(curr_be, be, next);
 				break;
 			}
 			if (LIST_NEXT(curr_be, next) == NULL) {
 				LIST_INSERT_AFTER(curr_be, be, next);
 				break;
 			}
 		}
 	}
 	rm_wunlock(&ktls_backends_lock);
 	return (0);
 }
 
 int
 ktls_crypto_backend_deregister(struct ktls_crypto_backend *be)
 {
 	struct ktls_crypto_backend *tmp;
 
 	/*
 	 * Don't error if the backend isn't registered.  This permits
 	 * MOD_UNLOAD handlers to use this function unconditionally.
 	 */
 	rm_wlock(&ktls_backends_lock);
 	LIST_FOREACH(tmp, &ktls_backends, next) {
 		if (tmp == be)
 			break;
 	}
 	if (tmp == NULL) {
 		rm_wunlock(&ktls_backends_lock);
 		return (0);
 	}
 
 	if (!ktls_allow_unload) {
 		rm_wunlock(&ktls_backends_lock);
 		printf(
 		    "KTLS: Deregistering crypto method %s is not supported\n",
 		    be->name);
 		return (EBUSY);
 	}
 
 	if (be->use_count) {
 		rm_wunlock(&ktls_backends_lock);
 		return (EBUSY);
 	}
 
 	LIST_REMOVE(be, next);
 	rm_wunlock(&ktls_backends_lock);
 	return (0);
 }
 
 #if defined(INET) || defined(INET6)
 /*
  * Pick the CPU whose work queue serves the given socket.
  *
  * With RSS the CPU is derived from the flow hash of the connection
  * when RSS knows one.  Otherwise the flow ID, taken modulo the number
  * of worker threads, selects an entry of the thread-to-CPU lookup
  * table.
  */
 static u_int
 ktls_get_cpu(struct socket *so)
 {
 	struct inpcb *inp = sotoinpcb(so);
 #ifdef RSS
 	u_int rss_cpu;
 
 	rss_cpu = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
 	if (rss_cpu != NETISR_CPUID_NONE)
 		return (rss_cpu);
 #endif
 	/*
 	 * Shard connections by flow ID in a repeatable fashion.  Some
 	 * crypto backends rely on the serialization which results
 	 * from one connection always using the same queue.
 	 */
 	return (ktls_cpuid_lookup[inp->inp_flowid % ktls_number_threads]);
 }
 #endif
 
 /*
  * One-time KTLS setup, run through the SYSINIT below at
  * SI_SUB_SMP + 1.
  *
  * Allocates the statistics counters, initializes the backend list
  * and its lock, creates the session zone and then sets up one work
  * queue and one worker thread per CPU.  Failing to create or to bind
  * a worker thread is fatal.
  */
 static void
 ktls_init(void *dummy __unused)
 {
 	struct ktls_wq *wq;
 	struct thread *td;
 	struct pcpu *pc;
 	cpuset_t mask;
 	int cpu, error;
 
 	/* general statistics */
 	ktls_tasks_active = counter_u64_alloc(M_WAITOK);
 	ktls_cnt_tx_queued = counter_u64_alloc(M_WAITOK);
 	ktls_cnt_rx_queued = counter_u64_alloc(M_WAITOK);
 	ktls_offload_total = counter_u64_alloc(M_WAITOK);
 	ktls_offload_enable_calls = counter_u64_alloc(M_WAITOK);
 	ktls_offload_active = counter_u64_alloc(M_WAITOK);
 	ktls_offload_corrupted_records = counter_u64_alloc(M_WAITOK);
 	ktls_offload_failed_crypto = counter_u64_alloc(M_WAITOK);
 	ktls_switch_to_ifnet = counter_u64_alloc(M_WAITOK);
 	ktls_switch_to_sw = counter_u64_alloc(M_WAITOK);
 	ktls_switch_failed = counter_u64_alloc(M_WAITOK);
 
 	/* per-mode session statistics */
 	ktls_sw_cbc = counter_u64_alloc(M_WAITOK);
 	ktls_sw_gcm = counter_u64_alloc(M_WAITOK);
 	ktls_ifnet_cbc = counter_u64_alloc(M_WAITOK);
 	ktls_ifnet_gcm = counter_u64_alloc(M_WAITOK);
 	ktls_ifnet_reset = counter_u64_alloc(M_WAITOK);
 	ktls_ifnet_reset_dropped = counter_u64_alloc(M_WAITOK);
 	ktls_ifnet_reset_failed = counter_u64_alloc(M_WAITOK);
 #ifdef TCP_OFFLOAD
 	ktls_toe_cbc = counter_u64_alloc(M_WAITOK);
 	ktls_toe_gcm = counter_u64_alloc(M_WAITOK);
 #endif
 
 	rm_init(&ktls_backends_lock, "ktls backends");
 	LIST_INIT(&ktls_backends);
 
 	/* one work queue slot for every possible CPU id */
 	ktls_wq = malloc((mp_maxid + 1) * sizeof(*ktls_wq), M_KTLS,
 	    M_WAITOK | M_ZERO);
 
 	ktls_session_zone = uma_zcreate("ktls_session",
 	    sizeof(struct ktls_session),
 	    NULL, NULL, NULL, NULL,
 	    UMA_ALIGN_CACHE, 0);
 
 	/*
 	 * Set up the work queue and start the worker thread of every
 	 * CPU.
 	 */
 	CPU_FOREACH(cpu) {
 		wq = &ktls_wq[cpu];
 		STAILQ_INIT(&wq->m_head);
 		STAILQ_INIT(&wq->so_head);
 		mtx_init(&wq->mtx, "ktls work queue", NULL, MTX_DEF);
 		error = kproc_kthread_add(ktls_work_thread, wq,
 		    &ktls_proc, &td, 0, 0, "KTLS", "thr_%d", cpu);
 		if (error != 0)
 			panic("Can't add KTLS thread %d error %d", cpu, error);
 
 		/*
 		 * Optional binding: a setting above one binds the
 		 * thread to the NUMA domain of its CPU, any other
 		 * non-zero setting binds it to the CPU itself.
 		 */
 		if (ktls_bind_threads != 0) {
 			if (ktls_bind_threads <= 1) {
 				CPU_SETOF(cpu, &mask);
 			} else {
 				pc = pcpu_find(cpu);
 				CPU_COPY(&cpuset_domain[pc->pc_domain], &mask);
 			}
 			error = cpuset_setthread(td->td_tid, &mask);
 			if (error != 0)
 				panic(
 			    "Unable to bind KTLS thread for CPU %d error %d",
 				     cpu, error);
 		}
 
 		/* remember which CPU this thread index maps to */
 		ktls_cpuid_lookup[ktls_number_threads++] = cpu;
 	}
 	printf("KTLS: Initialized %d threads\n", ktls_number_threads);
 }
 SYSINIT(ktls, SI_SUB_SMP + 1, SI_ORDER_ANY, ktls_init, NULL);
 
 #if defined(INET) || defined(INET6)
 static int
 ktls_create_session(struct socket *so, struct tls_enable *en,
     struct ktls_session **tlsp)
 {
 	struct ktls_session *tls;
 	int error;
 
 	/* Only TLS 1.0 - 1.3 are supported. */
 	if (en->tls_vmajor != TLS_MAJOR_VER_ONE)
 		return (EINVAL);
 	if (en->tls_vminor < TLS_MINOR_VER_ZERO ||
 	    en->tls_vminor > TLS_MINOR_VER_THREE)
 		return (EINVAL);
 
 	if (en->auth_key_len < 0 || en->auth_key_len > TLS_MAX_PARAM_SIZE)
 		return (EINVAL);
 	if (en->cipher_key_len < 0 || en->cipher_key_len > TLS_MAX_PARAM_SIZE)
 		return (EINVAL);
 	if (en->iv_len < 0 || en->iv_len > sizeof(tls->params.iv))
 		return (EINVAL);
 
 	/* All supported algorithms require a cipher key. */
 	if (en->cipher_key_len == 0)
 		return (EINVAL);
 
 	/* No flags are currently supported. */
 	if (en->flags != 0)
 		return (EINVAL);
 
 	/* Common checks for supported algorithms. */
 	switch (en->cipher_algorithm) {
 	case CRYPTO_AES_NIST_GCM_16:
 		/*
 		 * auth_algorithm isn't used, but permit GMAC values
 		 * for compatibility.
 		 */
 		switch (en->auth_algorithm) {
 		case 0:
 #ifdef COMPAT_FREEBSD12
 		/* XXX: Really 13.0-current COMPAT. */
 		case CRYPTO_AES_128_NIST_GMAC:
 		case CRYPTO_AES_192_NIST_GMAC:
 		case CRYPTO_AES_256_NIST_GMAC:
 #endif
 			break;
 		default:
 			return (EINVAL);
 		}
 		if (en->auth_key_len != 0)
 			return (EINVAL);
 		if ((en->tls_vminor == TLS_MINOR_VER_TWO &&
 			en->iv_len != TLS_AEAD_GCM_LEN) ||
 		    (en->tls_vminor == TLS_MINOR_VER_THREE &&
 			en->iv_len != TLS_1_3_GCM_IV_LEN))
 			return (EINVAL);
 		break;
 	case CRYPTO_AES_CBC:
 		switch (en->auth_algorithm) {
 		case CRYPTO_SHA1_HMAC:
 			/*
 			 * TLS 1.0 requires an implicit IV.  TLS 1.1+
 			 * all use explicit IVs.
 			 */
 			if (en->tls_vminor == TLS_MINOR_VER_ZERO) {
 				if (en->iv_len != TLS_CBC_IMPLICIT_IV_LEN)
 					return (EINVAL);
 				break;
 			}
 
 			/* FALLTHROUGH */
 		case CRYPTO_SHA2_256_HMAC:
 		case CRYPTO_SHA2_384_HMAC:
 			/* Ignore any supplied IV. */
 			en->iv_len = 0;
 			break;
 		default:
 			return (EINVAL);
 		}
 		if (en->auth_key_len == 0)
 			return (EINVAL);
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	tls = uma_zalloc(ktls_session_zone, M_WAITOK | M_ZERO);
 
 	counter_u64_add(ktls_offload_active, 1);
 
 	refcount_init(&tls->refcount, 1);
 	TASK_INIT(&tls->reset_tag_task, 0, ktls_reset_send_tag, tls);
 
 	tls->wq_index = ktls_get_cpu(so);
 
 	tls->params.cipher_algorithm = en->cipher_algorithm;
 	tls->params.auth_algorithm = en->auth_algorithm;
 	tls->params.tls_vmajor = en->tls_vmajor;
 	tls->params.tls_vminor = en->tls_vminor;
 	tls->params.flags = en->flags;
 	tls->params.max_frame_len = min(TLS_MAX_MSG_SIZE_V10_2, ktls_maxlen);
 
 	/* Set the header and trailer lengths. */
 	tls->params.tls_hlen = sizeof(struct tls_record_layer);
 	switch (en->cipher_algorithm) {
 	case CRYPTO_AES_NIST_GCM_16:
 		/*
 		 * TLS 1.2 uses a 4 byte implicit IV with an explicit 8 byte
 		 * nonce.  TLS 1.3 uses a 12 byte implicit IV.
 		 */
 		if (en->tls_vminor < TLS_MINOR_VER_THREE)
 			tls->params.tls_hlen += sizeof(uint64_t);
 		tls->params.tls_tlen = AES_GMAC_HASH_LEN;
 
 		/*
 		 * TLS 1.3 includes optional padding which we
 		 * do not support, and also puts the "real" record
 		 * type at the end of the encrypted data.
 		 */
 		if (en->tls_vminor == TLS_MINOR_VER_THREE)
 			tls->params.tls_tlen += sizeof(uint8_t);
 
 		tls->params.tls_bs = 1;
 		break;
 	case CRYPTO_AES_CBC:
 		switch (en->auth_algorithm) {
 		case CRYPTO_SHA1_HMAC:
 			if (en->tls_vminor == TLS_MINOR_VER_ZERO) {
 				/* Implicit IV, no nonce. */
 			} else {
 				tls->params.tls_hlen += AES_BLOCK_LEN;
 			}
 			tls->params.tls_tlen = AES_BLOCK_LEN +
 			    SHA1_HASH_LEN;
 			break;
 		case CRYPTO_SHA2_256_HMAC:
 			tls->params.tls_hlen += AES_BLOCK_LEN;
 			tls->params.tls_tlen = AES_BLOCK_LEN +
 			    SHA2_256_HASH_LEN;
 			break;
 		case CRYPTO_SHA2_384_HMAC:
 			tls->params.tls_hlen += AES_BLOCK_LEN;
 			tls->params.tls_tlen = AES_BLOCK_LEN +
 			    SHA2_384_HASH_LEN;
 			break;
 		default:
 			panic("invalid hmac");
 		}
 		tls->params.tls_bs = AES_BLOCK_LEN;
 		break;
 	default:
 		panic("invalid cipher");
 	}
 
 	KASSERT(tls->params.tls_hlen <= MBUF_PEXT_HDR_LEN,
 	    ("TLS header length too long: %d", tls->params.tls_hlen));
 	KASSERT(tls->params.tls_tlen <= MBUF_PEXT_TRAIL_LEN,
 	    ("TLS trailer length too long: %d", tls->params.tls_tlen));
 
 	if (en->auth_key_len != 0) {
 		tls->params.auth_key_len = en->auth_key_len;
 		tls->params.auth_key = malloc(en->auth_key_len, M_KTLS,
 		    M_WAITOK);
 		error = copyin(en->auth_key, tls->params.auth_key,
 		    en->auth_key_len);
 		if (error)
 			goto out;
 	}
 
 	tls->params.cipher_key_len = en->cipher_key_len;
 	tls->params.cipher_key = malloc(en->cipher_key_len, M_KTLS, M_WAITOK);
 	error = copyin(en->cipher_key, tls->params.cipher_key,
 	    en->cipher_key_len);
 	if (error)
 		goto out;
 
 	/*
 	 * This holds the implicit portion of the nonce for GCM and
 	 * the initial implicit IV for TLS 1.0.  The explicit portions
 	 * of the IV are generated in ktls_frame().
 	 */
 	if (en->iv_len != 0) {
 		tls->params.iv_len = en->iv_len;
 		error = copyin(en->iv, tls->params.iv, en->iv_len);
 		if (error)
 			goto out;
 
 		/*
 		 * For TLS 1.2, generate an 8-byte nonce as a counter
 		 * to generate unique explicit IVs.
 		 *
 		 * Store this counter in the last 8 bytes of the IV
 		 * array so that it is 8-byte aligned.
 		 */
 		if (en->cipher_algorithm == CRYPTO_AES_NIST_GCM_16 &&
 		    en->tls_vminor == TLS_MINOR_VER_TWO)
 			arc4rand(tls->params.iv + 8, sizeof(uint64_t), 0);
 	}
 
 	*tlsp = tls;
 	return (0);
 
 out:
 	ktls_cleanup(tls);
 	return (error);
 }
 
 static struct ktls_session *
 ktls_clone_session(struct ktls_session *tls)
 {
 	struct ktls_session *tls_new;
 
 	tls_new = uma_zalloc(ktls_session_zone, M_WAITOK | M_ZERO);
 
 	counter_u64_add(ktls_offload_active, 1);
 
 	refcount_init(&tls_new->refcount, 1);
 
 	/* Copy fields from existing session. */
 	tls_new->params = tls->params;
 	tls_new->wq_index = tls->wq_index;
 
 	/* Deep copy keys. */
 	if (tls_new->params.auth_key != NULL) {
 		tls_new->params.auth_key = malloc(tls->params.auth_key_len,
 		    M_KTLS, M_WAITOK);
 		memcpy(tls_new->params.auth_key, tls->params.auth_key,
 		    tls->params.auth_key_len);
 	}
 
 	tls_new->params.cipher_key = malloc(tls->params.cipher_key_len, M_KTLS,
 	    M_WAITOK);
 	memcpy(tls_new->params.cipher_key, tls->params.cipher_key,
 	    tls->params.cipher_key_len);
 
 	return (tls_new);
 }
 #endif
 
 /*
  * Release what a session holds, leaving the session structure itself
  * allocated.  The active-session counter and the counter matching the
  * session's mode and cipher are decremented, the software backend's
  * free hook or the ifnet send tag is released, the key buffers are
  * handed to zfree() and the IV is wiped.
  */
 static void
 ktls_cleanup(struct ktls_session *tls)
 {
 
 	counter_u64_add(ktls_offload_active, -1);
 
 	/* Per-mode accounting and resource release. */
 	switch (tls->mode) {
 	case TCP_TLS_MODE_SW:
 		MPASS(tls->be != NULL);
 		if (tls->params.cipher_algorithm == CRYPTO_AES_CBC)
 			counter_u64_add(ktls_sw_cbc, -1);
 		else if (tls->params.cipher_algorithm ==
 		    CRYPTO_AES_NIST_GCM_16)
 			counter_u64_add(ktls_sw_gcm, -1);
 		tls->free(tls);
 		break;
 	case TCP_TLS_MODE_IFNET:
 		if (tls->params.cipher_algorithm == CRYPTO_AES_CBC)
 			counter_u64_add(ktls_ifnet_cbc, -1);
 		else if (tls->params.cipher_algorithm ==
 		    CRYPTO_AES_NIST_GCM_16)
 			counter_u64_add(ktls_ifnet_gcm, -1);
 		/* The tag is NULL while a reset is in flight. */
 		if (tls->snd_tag != NULL)
 			m_snd_tag_rele(tls->snd_tag);
 		break;
 #ifdef TCP_OFFLOAD
 	case TCP_TLS_MODE_TOE:
 		if (tls->params.cipher_algorithm == CRYPTO_AES_CBC)
 			counter_u64_add(ktls_toe_cbc, -1);
 		else if (tls->params.cipher_algorithm ==
 		    CRYPTO_AES_NIST_GCM_16)
 			counter_u64_add(ktls_toe_gcm, -1);
 		break;
 #endif
 	}
 
 	/* Dispose of the authentication key, if one was supplied. */
 	if (tls->params.auth_key != NULL) {
 		zfree(tls->params.auth_key, M_KTLS);
 		tls->params.auth_key_len = 0;
 		tls->params.auth_key = NULL;
 	}
 
 	/* Dispose of the cipher key. */
 	if (tls->params.cipher_key != NULL) {
 		zfree(tls->params.cipher_key, M_KTLS);
 		tls->params.cipher_key_len = 0;
 		tls->params.cipher_key = NULL;
 	}
 
 	/* Do not leave IV material behind in the session. */
 	explicit_bzero(tls->params.iv, sizeof(tls->params.iv));
 }
 
 #if defined(INET) || defined(INET6)
 
 #ifdef TCP_OFFLOAD
 /*
  * Try to hand a TLS session to the TCP offload engine.
  *
  * Returns ECONNRESET if the inpcb is freed, dropped, in TIMEWAIT or
  * no longer has a socket, EOPNOTSUPP if the connection is not marked
  * TF_TOE, and otherwise the result of
  * tcp_offload_alloc_tls_session().  On success the session's mode
  * becomes TCP_TLS_MODE_TOE and the per-cipher TOE counter is bumped.
  */
 static int
 ktls_try_toe(struct socket *so, struct ktls_session *tls, int direction)
 {
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	int error;
 
 	inp = so->so_pcb;
 	INP_WLOCK(inp);
 	/* Reject connections that are already being torn down. */
 	if (inp->inp_flags2 & INP_FREED) {
 		INP_WUNLOCK(inp);
 		return (ECONNRESET);
 	}
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		INP_WUNLOCK(inp);
 		return (ECONNRESET);
 	}
 	if (inp->inp_socket == NULL) {
 		INP_WUNLOCK(inp);
 		return (ECONNRESET);
 	}
 	tp = intotcpcb(inp);
 	/* Only connections already handled by a TOE qualify. */
 	if (!(tp->t_flags & TF_TOE)) {
 		INP_WUNLOCK(inp);
 		return (EOPNOTSUPP);
 	}
 
 	/* The offload request is made with the INP write lock held. */
 	error = tcp_offload_alloc_tls_session(tp, tls, direction);
 	INP_WUNLOCK(inp);
 	if (error == 0) {
 		tls->mode = TCP_TLS_MODE_TOE;
 		switch (tls->params.cipher_algorithm) {
 		case CRYPTO_AES_CBC:
 			counter_u64_add(ktls_toe_cbc, 1);
 			break;
 		case CRYPTO_AES_NIST_GCM_16:
 			counter_u64_add(ktls_toe_gcm, 1);
 			break;
 		}
 	}
 	return (error);
 }
 #endif
 
 /*
  * Common code used when first enabling ifnet TLS on a connection or
  * when allocating a new ifnet TLS session due to a routing change.
  * This function allocates a new TLS send tag on whatever interface
  * the connection is currently routed over.
  *
  * 'force' bypasses the ktls_ifnet_permitted administrative check.
  * On success the new tag is stored in '*mstp'.
  *
  * Returns ECONNRESET if the inpcb is freed, dropped, in TIMEWAIT or
  * has no socket; ENXIO if ifnet TLS is administratively denied or no
  * cached route is available; EOPNOTSUPP if the interface has no
  * if_snd_tag_alloc method or lacks IFCAP_NOMAP or the TXTLS
  * capability for the connection's address family; otherwise the
  * result of the interface's if_snd_tag_alloc method.
  */
 static int
 ktls_alloc_snd_tag(struct inpcb *inp, struct ktls_session *tls, bool force,
     struct m_snd_tag **mstp)
 {
 	union if_snd_tag_alloc_params params;
 	struct ifnet *ifp;
 	struct nhop_object *nh;
 	struct tcpcb *tp;
 	int error;
 
 	INP_RLOCK(inp);
 	if (inp->inp_flags2 & INP_FREED) {
 		INP_RUNLOCK(inp);
 		return (ECONNRESET);
 	}
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		INP_RUNLOCK(inp);
 		return (ECONNRESET);
 	}
 	if (inp->inp_socket == NULL) {
 		INP_RUNLOCK(inp);
 		return (ECONNRESET);
 	}
 	tp = intotcpcb(inp);
 
 	/*
 	 * Check administrative controls on ifnet TLS to determine if
 	 * ifnet TLS should be denied.
 	 *
 	 * - Always permit 'force' requests.
 	 * - ktls_ifnet_permitted == 0: always deny.
 	 */
 	if (!force && ktls_ifnet_permitted == 0) {
 		INP_RUNLOCK(inp);
 		return (ENXIO);
 	}
 
 	/*
 	 * XXX: Use the cached route in the inpcb to find the
 	 * interface.  This should perhaps instead use
 	 * rtalloc1_fib(dst, 0, 0, fibnum).  Since KTLS is only
 	 * enabled after a connection has completed key negotiation in
 	 * userland, the cached route will be present in practice.
 	 */
 	nh = inp->inp_route.ro_nh;
 	if (nh == NULL) {
 		INP_RUNLOCK(inp);
 		return (ENXIO);
 	}
 	ifp = nh->nh_ifp;
 	/* Hold the interface across the unlocked section below. */
 	if_ref(ifp);
 
-	params.hdr.type = IF_SND_TAG_TYPE_TLS;
+	/*
+	 * Allocate a TLS + ratelimit tag if the connection has an
+	 * existing pacing rate.
+	 */
+	if (tp->t_pacing_rate != -1 &&
+	    (ifp->if_capenable & IFCAP_TXTLS_RTLMT) != 0) {
+		params.hdr.type = IF_SND_TAG_TYPE_TLS_RATE_LIMIT;
+		params.tls_rate_limit.inp = inp;
+		params.tls_rate_limit.tls = tls;
+		params.tls_rate_limit.max_rate = tp->t_pacing_rate;
+	} else {
+		params.hdr.type = IF_SND_TAG_TYPE_TLS;
+		params.tls.inp = inp;
+		params.tls.tls = tls;
+	}
 	params.hdr.flowid = inp->inp_flowid;
 	params.hdr.flowtype = inp->inp_flowtype;
 	params.hdr.numa_domain = inp->inp_numa_domain;
-	params.tls.inp = inp;
-	params.tls.tls = tls;
 	INP_RUNLOCK(inp);
 
 	/* Capability checks; each failure releases the interface at 'out'. */
 	if (ifp->if_snd_tag_alloc == NULL) {
 		error = EOPNOTSUPP;
 		goto out;
 	}
 	if ((ifp->if_capenable & IFCAP_NOMAP) == 0) {	
 		error = EOPNOTSUPP;
 		goto out;
 	}
 	/* NOTE(review): inp_vflag is read after the INP lock was dropped. */
 	if (inp->inp_vflag & INP_IPV6) {
 		if ((ifp->if_capenable & IFCAP_TXTLS6) == 0) {
 			error = EOPNOTSUPP;
 			goto out;
 		}
 	} else {
 		if ((ifp->if_capenable & IFCAP_TXTLS4) == 0) {
 			error = EOPNOTSUPP;
 			goto out;
 		}
 	}
 	error = ifp->if_snd_tag_alloc(ifp, &params, mstp);
 out:
 	if_rele(ifp);
 	return (error);
 }
 
 /*
  * Try to set up NIC (ifnet) TLS offload for a session.
  *
  * Returns the error from ktls_alloc_snd_tag() unchanged.  On success
  * the session is switched to TCP_TLS_MODE_IFNET, takes ownership of
  * the new send tag, and the per-cipher ifnet counter is incremented.
  */
 static int
 ktls_try_ifnet(struct socket *so, struct ktls_session *tls, bool force)
 {
 	struct m_snd_tag *tag;
 	int rc;
 
 	rc = ktls_alloc_snd_tag(so->so_pcb, tls, force, &tag);
 	if (rc != 0)
 		return (rc);
 
 	tls->mode = TCP_TLS_MODE_IFNET;
 	tls->snd_tag = tag;
 
 	/* Account for the new session under its cipher. */
 	if (tls->params.cipher_algorithm == CRYPTO_AES_CBC)
 		counter_u64_add(ktls_ifnet_cbc, 1);
 	else if (tls->params.cipher_algorithm == CRYPTO_AES_NIST_GCM_16)
 		counter_u64_add(ktls_ifnet_gcm, 1);
 
 	return (rc);
 }
 
 /*
  * Try to attach a software crypto backend to a session.
  *
  * Returns EOPNOTSUPP if no registered backend accepts the session.
  * On success the backend is recorded in tls->be, the session mode is
  * set to TCP_TLS_MODE_SW, the per-cipher software counter is
  * incremented, and 0 is returned.
  */
 static int
 ktls_try_sw(struct socket *so, struct ktls_session *tls, int direction)
 {
 	struct rm_priotracker prio;
 	struct ktls_crypto_backend *be;
 
 	/*
 	 * Choose the best software crypto backend.  Backends are
 	 * stored in sorted priority order (largest value == most
 	 * important at the head of the list), so this just stops on
 	 * the first backend that claims the session by returning
 	 * success.
 	 */
 	/* The backend list lock is only taken when unloading is allowed. */
 	if (ktls_allow_unload)
 		rm_rlock(&ktls_backends_lock, &prio);
 	LIST_FOREACH(be, &ktls_backends, next) {
 		if (be->try(so, tls, direction) == 0)
 			break;
 		KASSERT(tls->cipher == NULL,
 		    ("ktls backend leaked a cipher pointer"));
 	}
 	/* 'be' is NULL here if the loop ran off the end of the list. */
 	if (be != NULL) {
 		if (ktls_allow_unload)
 			be->use_count++;
 		tls->be = be;
 	}
 	if (ktls_allow_unload)
 		rm_runlock(&ktls_backends_lock, &prio);
 	if (be == NULL)
 		return (EOPNOTSUPP);
 	tls->mode = TCP_TLS_MODE_SW;
 	switch (tls->params.cipher_algorithm) {
 	case CRYPTO_AES_CBC:
 		counter_u64_add(ktls_sw_cbc, 1);
 		break;
 	case CRYPTO_AES_NIST_GCM_16:
 		counter_u64_add(ktls_sw_gcm, 1);
 		break;
 	}
 	return (0);
 }
 
 /*
  * KTLS RX stores data in the socket buffer as a list of TLS records,
  * where each record is stored as a control message containing the TLS
  * header followed by data mbufs containing the decrypted data.  This
  * is different from KTLS TX which always uses an mb_ext_pgs mbuf for
  * both encrypted and decrypted data.  TLS records decrypted by a NIC
  * should be queued to the socket buffer as records, but encrypted
  * data which needs to be decrypted by software arrives as a stream of
  * regular mbufs which need to be converted.  In addition, there may
  * already be pending encrypted data in the socket buffer when KTLS RX
  * is enabled.
  *
  * To manage not-yet-decrypted data for KTLS RX, the following scheme
  * is used:
  *
  * - A single chain of NOTREADY mbufs is hung off of sb_mtls.
  *
  * - ktls_check_rx checks this chain of mbufs reading the TLS header
  *   from the first mbuf.  Once all of the data for that TLS record is
  *   queued, the socket is queued to a worker thread.
  *
  * - The worker thread calls ktls_decrypt to decrypt TLS records in
  *   the TLS chain.  Each TLS record is detached from the TLS chain,
  *   decrypted, and inserted into the regular socket buffer chain as a
  *   record starting with a control message holding the TLS header and
  *   a chain of mbufs holding the decrypted data.
  */
 
 /*
  * Move every mbuf currently queued in the socket buffer onto the
  * not-yet-decrypted TLS chain.  Each mbuf is flagged M_NOTREADY and
  * its length is transferred from sb_acc to sb_tlscc.
  */
 static void
 sb_mark_notready(struct sockbuf *sb)
 {
 	struct mbuf *cur;
 
 	/* Detach the regular chain and make it the head of the TLS chain. */
 	cur = sb->sb_mb;
 	sb->sb_mtls = cur;
 	sb->sb_mb = NULL;
 	sb->sb_mbtail = NULL;
 	sb->sb_lastrecord = NULL;
 
 	while (cur != NULL) {
 		KASSERT(cur->m_nextpkt == NULL, ("%s: m_nextpkt != NULL",
 		    __func__));
 		KASSERT((cur->m_flags & M_NOTAVAIL) == 0,
 		    ("%s: mbuf not avail", __func__));
 		KASSERT(sb->sb_acc >= cur->m_len, ("%s: sb_acc < m->m_len",
 		    __func__));
 
 		cur->m_flags |= M_NOTREADY;
 		sb->sb_acc -= cur->m_len;
 		sb->sb_tlscc += cur->m_len;
 
 		/* The last mbuf visited ends up as the TLS chain tail. */
 		sb->sb_mtlstail = cur;
 		cur = cur->m_next;
 	}
 
 	KASSERT(sb->sb_acc == 0 && sb->sb_tlscc == sb->sb_ccc,
 	    ("%s: acc %u tlscc %u ccc %u", __func__, sb->sb_acc, sb->sb_tlscc,
 	    sb->sb_ccc));
 }
 
 /*
  * Enable kernel TLS on the receive side of a TCP socket.
  *
  * Returns ENOTSUP if KTLS is disabled, CBC is requested while
  * disabled, or TLS 1.3 is requested; EINVAL for a non-TCP socket;
  * EALREADY if a receive session already exists; otherwise any error
  * from session creation or from the TOE / software backends.  On
  * success the session is attached to so_rcv, data already queued is
  * marked not ready pending decryption, and 0 is returned.
  */
 int
 ktls_enable_rx(struct socket *so, struct tls_enable *en)
 {
 	struct ktls_session *tls;
 	int error;
 
 	if (!ktls_offload_enable)
 		return (ENOTSUP);
 
 	counter_u64_add(ktls_offload_enable_calls, 1);
 
 	/*
 	 * This should always be true since only the TCP socket option
 	 * invokes this function.
 	 */
 	if (so->so_proto->pr_protocol != IPPROTO_TCP)
 		return (EINVAL);
 
 	/*
 	 * XXX: Don't overwrite existing sessions.  We should permit
 	 * this to support rekeying in the future.
 	 */
 	if (so->so_rcv.sb_tls_info != NULL)
 		return (EALREADY);
 
 	if (en->cipher_algorithm == CRYPTO_AES_CBC && !ktls_cbc_enable)
 		return (ENOTSUP);
 
 	/* TLS 1.3 is not yet supported. */
 	if (en->tls_vmajor == TLS_MAJOR_VER_ONE &&
 	    en->tls_vminor == TLS_MINOR_VER_THREE)
 		return (ENOTSUP);
 
 	error = ktls_create_session(so, en, &tls);
 	if (error)
 		return (error);
 
 	/* Prefer TOE, falling back to software decryption. */
 #ifdef TCP_OFFLOAD
 	error = ktls_try_toe(so, tls, KTLS_RX);
 	if (error)
 #endif
 		error = ktls_try_sw(so, tls, KTLS_RX);
 
 	if (error) {
 		/*
 		 * Drop the session's only reference so that it is
 		 * destroyed.  ktls_cleanup() by itself does not return
 		 * the session to ktls_session_zone.
 		 */
 		ktls_free(tls);
 		return (error);
 	}
 
 	/* Mark the socket as using TLS offload. */
 	SOCKBUF_LOCK(&so->so_rcv);
 	so->so_rcv.sb_tls_seqno = be64dec(en->rec_seq);
 	so->so_rcv.sb_tls_info = tls;
 	so->so_rcv.sb_flags |= SB_TLS_RX;
 
 	/* Mark existing data as not ready until it can be decrypted. */
 	sb_mark_notready(&so->so_rcv);
 	ktls_check_rx(&so->so_rcv);
 	SOCKBUF_UNLOCK(&so->so_rcv);
 
 	counter_u64_add(ktls_offload_total, 1);
 
 	return (0);
 }
 
 /*
  * Enable kernel TLS on the transmit side of a TCP socket.
  *
  * Returns ENOTSUP if KTLS is disabled or CBC is requested while
  * disabled; EINVAL for a non-TCP socket; EALREADY if a transmit
  * session already exists; ENXIO if unmapped mbufs are disabled;
  * otherwise any error from session creation, the offload backends or
  * sblock().  On success the session is attached to so_snd and 0 is
  * returned.
  */
 int
 ktls_enable_tx(struct socket *so, struct tls_enable *en)
 {
 	struct ktls_session *tls;
+	struct inpcb *inp;
 	int error;
 
 	if (!ktls_offload_enable)
 		return (ENOTSUP);
 
 	counter_u64_add(ktls_offload_enable_calls, 1);
 
 	/*
 	 * This should always be true since only the TCP socket option
 	 * invokes this function.
 	 */
 	if (so->so_proto->pr_protocol != IPPROTO_TCP)
 		return (EINVAL);
 
 	/*
 	 * XXX: Don't overwrite existing sessions.  We should permit
 	 * this to support rekeying in the future.
 	 */
 	if (so->so_snd.sb_tls_info != NULL)
 		return (EALREADY);
 
 	if (en->cipher_algorithm == CRYPTO_AES_CBC && !ktls_cbc_enable)
 		return (ENOTSUP);
 
 	/* TLS requires ext pgs */
 	if (mb_use_ext_pgs == 0)
 		return (ENXIO);
 
 	error = ktls_create_session(so, en, &tls);
 	if (error)
 		return (error);
 
 	/* Prefer TOE -> ifnet TLS -> software TLS. */
 #ifdef TCP_OFFLOAD
 	error = ktls_try_toe(so, tls, KTLS_TX);
 	if (error)
 #endif
 		error = ktls_try_ifnet(so, tls, false);
 	if (error)
 		error = ktls_try_sw(so, tls, KTLS_TX);
 
 	if (error) {
 		/*
 		 * Drop the session's only reference so that it is
 		 * destroyed.  ktls_cleanup() by itself does not return
 		 * the session to ktls_session_zone.
 		 */
 		ktls_free(tls);
 		return (error);
 	}
 
 	error = sblock(&so->so_snd, SBL_WAIT);
 	if (error) {
 		/*
 		 * A backend or send tag is attached by now.  Destroying
 		 * the session also drops the software backend's use
 		 * count, which ktls_cleanup() does not do.
 		 */
 		ktls_free(tls);
 		return (error);
 	}
 
+	/*
+	 * Write lock the INP when setting sb_tls_info so that
+	 * routines in tcp_ratelimit.c can read sb_tls_info while
+	 * holding the INP lock.
+	 */
+	inp = so->so_pcb;
+	INP_WLOCK(inp);
 	SOCKBUF_LOCK(&so->so_snd);
 	so->so_snd.sb_tls_seqno = be64dec(en->rec_seq);
 	so->so_snd.sb_tls_info = tls;
 	if (tls->mode != TCP_TLS_MODE_SW)
 		so->so_snd.sb_flags |= SB_TLS_IFNET;
 	SOCKBUF_UNLOCK(&so->so_snd);
+	INP_WUNLOCK(inp);
 	sbunlock(&so->so_snd);
 
 	counter_u64_add(ktls_offload_total, 1);
 
 	return (0);
 }
 
 /*
  * Report the TLS mode of the socket's receive session, or
  * TCP_TLS_MODE_NONE if no receive session is attached.  The caller
  * must hold the INP write lock.
  */
 int
 ktls_get_rx_mode(struct socket *so)
 {
 	struct ktls_session *session;
 	struct inpcb *inp;
 	int result;
 
 	inp = so->so_pcb;
 	INP_WLOCK_ASSERT(inp);
 
 	/* Sample the session pointer under the sockbuf lock. */
 	SOCKBUF_LOCK(&so->so_rcv);
 	session = so->so_rcv.sb_tls_info;
 	result = (session != NULL) ? session->mode : TCP_TLS_MODE_NONE;
 	SOCKBUF_UNLOCK(&so->so_rcv);
 
 	return (result);
 }
 
 /*
  * Report the TLS mode of the socket's transmit session, or
  * TCP_TLS_MODE_NONE if no transmit session is attached.  The caller
  * must hold the INP write lock.
  */
 int
 ktls_get_tx_mode(struct socket *so)
 {
 	struct ktls_session *session;
 	struct inpcb *inp;
 	int result;
 
 	inp = so->so_pcb;
 	INP_WLOCK_ASSERT(inp);
 
 	/* Sample the session pointer under the sockbuf lock. */
 	SOCKBUF_LOCK(&so->so_snd);
 	session = so->so_snd.sb_tls_info;
 	result = (session != NULL) ? session->mode : TCP_TLS_MODE_NONE;
 	SOCKBUF_UNLOCK(&so->so_snd);
 
 	return (result);
 }
 
 /*
  * Switch between SW and ifnet TLS sessions as requested.
  */
 int
 ktls_set_tx_mode(struct socket *so, int mode)
 {
 	struct ktls_session *tls, *tls_new;
 	struct inpcb *inp;
 	int error;
 
 	switch (mode) {
 	case TCP_TLS_MODE_SW:
 	case TCP_TLS_MODE_IFNET:
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	inp = so->so_pcb;
 	INP_WLOCK_ASSERT(inp);
 	SOCKBUF_LOCK(&so->so_snd);
 	tls = so->so_snd.sb_tls_info;
 	if (tls == NULL) {
 		SOCKBUF_UNLOCK(&so->so_snd);
 		return (0);
 	}
 
 	if (tls->mode == mode) {
 		SOCKBUF_UNLOCK(&so->so_snd);
 		return (0);
 	}
 
 	tls = ktls_hold(tls);
 	SOCKBUF_UNLOCK(&so->so_snd);
 	INP_WUNLOCK(inp);
 
 	tls_new = ktls_clone_session(tls);
 
 	if (mode == TCP_TLS_MODE_IFNET)
 		error = ktls_try_ifnet(so, tls_new, true);
 	else
 		error = ktls_try_sw(so, tls_new, KTLS_TX);
 	if (error) {
 		counter_u64_add(ktls_switch_failed, 1);
 		ktls_free(tls_new);
 		ktls_free(tls);
 		INP_WLOCK(inp);
 		return (error);
 	}
 
 	error = sblock(&so->so_snd, SBL_WAIT);
 	if (error) {
 		counter_u64_add(ktls_switch_failed, 1);
 		ktls_free(tls_new);
 		ktls_free(tls);
 		INP_WLOCK(inp);
 		return (error);
 	}
 
 	/*
 	 * If we raced with another session change, keep the existing
 	 * session.
 	 */
 	if (tls != so->so_snd.sb_tls_info) {
 		counter_u64_add(ktls_switch_failed, 1);
 		sbunlock(&so->so_snd);
 		ktls_free(tls_new);
 		ktls_free(tls);
 		INP_WLOCK(inp);
 		return (EBUSY);
 	}
 
 	SOCKBUF_LOCK(&so->so_snd);
 	so->so_snd.sb_tls_info = tls_new;
 	if (tls_new->mode != TCP_TLS_MODE_SW)
 		so->so_snd.sb_flags |= SB_TLS_IFNET;
 	SOCKBUF_UNLOCK(&so->so_snd);
 	sbunlock(&so->so_snd);
 
 	/*
 	 * Drop two references on 'tls'.  The first is for the
 	 * ktls_hold() above.  The second drops the reference from the
 	 * socket buffer.
 	 */
 	KASSERT(tls->refcount >= 2, ("too few references on old session"));
 	ktls_free(tls);
 	ktls_free(tls);
 
 	if (mode == TCP_TLS_MODE_IFNET)
 		counter_u64_add(ktls_switch_to_ifnet, 1);
 	else
 		counter_u64_add(ktls_switch_to_sw, 1);
 
 	INP_WLOCK(inp);
 	return (0);
 }
 
 /*
  * Try to allocate a new TLS send tag.  This task is scheduled when
  * ip_output detects a route change while trying to transmit a packet
  * holding a TLS record.  If a new tag is allocated, replace the tag
  * in the TLS session.  Subsequent packets on the connection will use
  * the new tag.  If a new tag cannot be allocated, drop the
  * connection.
  *
  * 'context' is the TLS session.  The session reference and the inpcb
  * reference taken when the task was enqueued are both released
  * here.
  */
 static void
 ktls_reset_send_tag(void *context, int pending)
 {
 	struct epoch_tracker et;
 	struct ktls_session *tls;
 	struct m_snd_tag *old, *new;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	int error;
 
 	MPASS(pending == 1);
 
 	tls = context;
 	inp = tls->inp;
 
 	/*
 	 * Free the old tag first before allocating a new one.
 	 * ip[6]_output_send() will treat a NULL send tag the same as
 	 * an ifp mismatch and drop packets until a new tag is
 	 * allocated.
 	 *
 	 * Write-lock the INP when changing tls->snd_tag since
 	 * ip[6]_output_send() holds a read-lock when reading the
 	 * pointer.
 	 */
 	INP_WLOCK(inp);
 	old = tls->snd_tag;
 	tls->snd_tag = NULL;
 	INP_WUNLOCK(inp);
 	if (old != NULL)
 		m_snd_tag_rele(old);
 
 	/* 'force' is set: the administrative ifnet TLS check is bypassed. */
 	error = ktls_alloc_snd_tag(inp, tls, true, &new);
 
 	if (error == 0) {
 		INP_WLOCK(inp);
 		tls->snd_tag = new;
 		/* reset_pending is protected by the pool mutex keyed on 'tls'. */
 		mtx_pool_lock(mtxpool_sleep, tls);
 		tls->reset_pending = false;
 		mtx_pool_unlock(mtxpool_sleep, tls);
 		/*
 		 * Drop the inpcb reference.  Presumably a true return
 		 * means the inpcb was freed and the lock went with it;
 		 * verify against in_pcbrele_wlocked().
 		 */
 		if (!in_pcbrele_wlocked(inp))
 			INP_WUNLOCK(inp);
 
 		counter_u64_add(ktls_ifnet_reset, 1);
 
 		/*
 		 * XXX: Should we kick tcp_output explicitly now that
 		 * the send tag is fixed or just rely on timers?
 		 */
 	} else {
 		NET_EPOCH_ENTER(et);
 		INP_WLOCK(inp);
 		if (!in_pcbrele_wlocked(inp)) {
 			if (!(inp->inp_flags & INP_TIMEWAIT) &&
 			    !(inp->inp_flags & INP_DROPPED)) {
 				tp = intotcpcb(inp);
 				CURVNET_SET(tp->t_vnet);
 				tp = tcp_drop(tp, ECONNABORTED);
 				CURVNET_RESTORE();
 				/*
 				 * Only unlock when tcp_drop() hands the
 				 * tcpcb back.
 				 */
 				if (tp != NULL)
 					INP_WUNLOCK(inp);
 				counter_u64_add(ktls_ifnet_reset_dropped, 1);
 			} else
 				INP_WUNLOCK(inp);
 		}
 		NET_EPOCH_EXIT(et);
 
 		counter_u64_add(ktls_ifnet_reset_failed, 1);
 
 		/*
 		 * Leave reset_pending true to avoid future tasks while
 		 * the socket goes away.
 		 */
 	}
 
 	/* Release the session reference held on behalf of this task. */
 	ktls_free(tls);
 }
 
 /*
  * Schedule a send tag reset for 'tls' unless one is already pending.
  *
  * Always returns ENOBUFS.  When a reset is scheduled, a reference is
  * taken on both the session and the inpcb, and the inpcb is recorded
  * in the session for the task to use.  A NULL 'inp' results in an
  * immediate ENOBUFS with nothing scheduled.
  */
 int
 ktls_output_eagain(struct inpcb *inp, struct ktls_session *tls)
 {
 
 	if (inp == NULL)
 		return (ENOBUFS);
 
 	INP_LOCK_ASSERT(inp);
 
 	mtx_pool_lock(mtxpool_sleep, tls);
 	if (tls->reset_pending) {
 		/* A reset is already on its way; nothing more to do. */
 		mtx_pool_unlock(mtxpool_sleep, tls);
 		return (ENOBUFS);
 	}
 
 	/* Hand references on the session and inpcb to the task. */
 	(void) ktls_hold(tls);
 	in_pcbref(inp);
 	tls->inp = inp;
 	tls->reset_pending = true;
 	taskqueue_enqueue(taskqueue_thread, &tls->reset_tag_task);
 	mtx_pool_unlock(mtxpool_sleep, tls);
 
 	return (ENOBUFS);
 }
+
+#ifdef RATELIMIT
+/*
+ * Ask the interface to change the pacing rate of the session's
+ * TLS + ratelimit send tag to 'max_pacing_rate'.
+ *
+ * Returns 0 without doing anything if the session currently has no
+ * send tag, otherwise the result of the interface's
+ * if_snd_tag_modify method.  The session must be in
+ * TCP_TLS_MODE_IFNET and its tag must be of type
+ * IF_SND_TAG_TYPE_TLS_RATE_LIMIT (both asserted).
+ */
+int
+ktls_modify_txrtlmt(struct ktls_session *tls, uint64_t max_pacing_rate)
+{
+	union if_snd_tag_modify_params params = {
+		.rate_limit.max_rate = max_pacing_rate,
+		.rate_limit.flags = M_NOWAIT,
+	};
+	struct m_snd_tag *mst;
+	struct ifnet *ifp;
+	int error;
+
+	/* Can't get to the inp, but it should be locked. */
+	/* INP_LOCK_ASSERT(inp); */
+
+	MPASS(tls->mode == TCP_TLS_MODE_IFNET);
+
+	if (tls->snd_tag == NULL) {
+		/*
+		 * Resetting send tag, ignore this change.  The
+		 * pending reset may or may not see this updated rate
+		 * in the tcpcb.  If it doesn't, we will just lose
+		 * this rate change.
+		 */
+		return (0);
+	}
+
+	MPASS(tls->snd_tag != NULL);
+	MPASS(tls->snd_tag->type == IF_SND_TAG_TYPE_TLS_RATE_LIMIT);
+
+	/* The modify request goes to the interface that owns the tag. */
+	mst = tls->snd_tag;
+	ifp = mst->ifp;
+	return (ifp->if_snd_tag_modify(mst, &params));
+}
+#endif
 #endif
 
 void
 ktls_destroy(struct ktls_session *tls)
 {
 	struct rm_priotracker prio;
 
 	ktls_cleanup(tls);
 	if (tls->be != NULL && ktls_allow_unload) {
 		rm_rlock(&ktls_backends_lock, &prio);
 		tls->be->use_count--;
 		rm_runlock(&ktls_backends_lock, &prio);
 	}
 	uma_zfree(ktls_session_zone, tls);
 }
 
 /*
  * Stamp each mbuf in the chain with the socket buffer's next TLS
  * sequence number, advancing the counter by one per mbuf.  Every
  * mbuf in the chain must be an M_EXTPG mbuf.
  */
 void
 ktls_seq(struct sockbuf *sb, struct mbuf *m)
 {
 
 	while (m != NULL) {
 		KASSERT((m->m_flags & M_EXTPG) != 0,
 		    ("ktls_seq: mapped mbuf %p", m));
 
 		m->m_epg_seqno = sb->sb_tls_seqno;
 		sb->sb_tls_seqno++;
 		m = m->m_next;
 	}
 }
 
 /*
  * Add TLS framing (headers and trailers) to a chain of mbufs.  Each
  * mbuf in the chain must be an unmapped mbuf.  The payload of the
  * mbuf must be populated with the payload of each TLS record.
  *
  * The record_type argument specifies the TLS record type used when
  * populating the TLS header.
  *
  * The enq_cnt argument on return is set to the number of pages of
  * payload data for this entire chain that need to be encrypted via SW
  * encryption.  The returned value should be passed to ktls_enqueue
  * when scheduling encryption of this chain of mbufs.  To handle the
  * special case of empty fragments for TLS 1.0 sessions, an empty
  * fragment counts as one page.
  */
 void
 ktls_frame(struct mbuf *top, struct ktls_session *tls, int *enq_cnt,
     uint8_t record_type)
 {
 	struct tls_record_layer *tlshdr;
 	struct mbuf *m;
 	uint64_t *noncep;
 	uint16_t tls_len;
 	int maxlen;
 
 	maxlen = tls->params.max_frame_len;
 	*enq_cnt = 0;
 	for (m = top; m != NULL; m = m->m_next) {
 		/*
 		 * All mbufs in the chain should be TLS records whose
 		 * payload does not exceed the maximum frame length.
 		 *
 		 * Empty TLS records are permitted when using CBC.
 		 */
 		KASSERT(m->m_len <= maxlen &&
 		    (tls->params.cipher_algorithm == CRYPTO_AES_CBC ?
 		    m->m_len >= 0 : m->m_len > 0),
 		    ("ktls_frame: m %p len %d\n", m, m->m_len));
 
 		/*
 		 * TLS frames require unmapped mbufs to store session
 		 * info.
 		 */
 		KASSERT((m->m_flags & M_EXTPG) != 0,
 		    ("ktls_frame: mapped mbuf %p (top = %p)\n", m, top));
 
 		/* Payload length, captured before framing is added. */
 		tls_len = m->m_len;
 
 		/* Save a reference to the session. */
 		m->m_epg_tls = ktls_hold(tls);
 
 		m->m_epg_hdrlen = tls->params.tls_hlen;
 		m->m_epg_trllen = tls->params.tls_tlen;
 		if (tls->params.cipher_algorithm == CRYPTO_AES_CBC) {
 			int bs, delta;
 
 			/*
 			 * AES-CBC pads messages to a multiple of the
 			 * block size.  Note that the padding is
 			 * applied after the digest and the encryption
 			 * is done on the "plaintext || mac || padding".
 			 * At least one byte of padding is always
 			 * present.
 			 *
 			 * Compute the final trailer length assuming
 			 * at most one block of padding.
 			 * tls->params.tls_tlen is the maximum
 			 * possible trailer length (padding + digest).
 			 * delta holds the number of excess padding
 			 * bytes if the maximum were used.  Those
 			 * extra bytes are removed.
 			 */
 			bs = tls->params.tls_bs;
 			delta = (tls_len + tls->params.tls_tlen) & (bs - 1);
 			m->m_epg_trllen -= delta;
 		}
 		m->m_len += m->m_epg_hdrlen + m->m_epg_trllen;
 
 		/* Populate the TLS header. */
 		tlshdr = (void *)m->m_epg_hdr;
 		tlshdr->tls_vmajor = tls->params.tls_vmajor;
 
 		/*
 		 * TLS 1.3 masquerades as TLS 1.2 with a record type
 		 * of TLS_RLTYPE_APP.
 		 */
 		if (tls->params.tls_vminor == TLS_MINOR_VER_THREE &&
 		    tls->params.tls_vmajor == TLS_MAJOR_VER_ONE) {
 			tlshdr->tls_vminor = TLS_MINOR_VER_TWO;
 			tlshdr->tls_type = TLS_RLTYPE_APP;
 			/* save the real record type for later */
 			m->m_epg_record_type = record_type;
 			m->m_epg_trail[0] = record_type;
 		} else {
 			tlshdr->tls_vminor = tls->params.tls_vminor;
 			tlshdr->tls_type = record_type;
 		}
 		/* The length field excludes the record header itself. */
 		tlshdr->tls_length = htons(m->m_len - sizeof(*tlshdr));
 
 		/*
 		 * Store nonces / explicit IVs after the end of the
 		 * TLS header.
 		 *
 		 * For GCM with TLS 1.2, an 8 byte nonce is copied
 		 * from the end of the IV.  The nonce is then
 		 * incremented for use by the next record.
 		 *
 		 * For CBC, a random nonce is inserted for TLS 1.1+.
 		 */
 		if (tls->params.cipher_algorithm == CRYPTO_AES_NIST_GCM_16 &&
 		    tls->params.tls_vminor == TLS_MINOR_VER_TWO) {
 			noncep = (uint64_t *)(tls->params.iv + 8);
 			be64enc(tlshdr + 1, *noncep);
 			(*noncep)++;
 		} else if (tls->params.cipher_algorithm == CRYPTO_AES_CBC &&
 		    tls->params.tls_vminor >= TLS_MINOR_VER_ONE)
 			arc4rand(tlshdr + 1, AES_BLOCK_LEN, 0);
 
 		/*
 		 * When using SW encryption, mark the mbuf not ready.
 		 * It will be marked ready via sbready() after the
 		 * record has been encrypted.
 		 *
 		 * When using ifnet TLS, unencrypted TLS records are
 		 * sent down the stack to the NIC.
 		 */
 		if (tls->mode == TCP_TLS_MODE_SW) {
 			m->m_flags |= M_NOTREADY;
 			m->m_epg_nrdy = m->m_epg_npgs;
 			if (__predict_false(tls_len == 0)) {
 				/* TLS 1.0 empty fragment. */
 				*enq_cnt += 1;
 			} else
 				*enq_cnt += m->m_epg_npgs;
 		}
 	}
 }
 
 /*
  * Check whether a complete TLS record is queued on the receive
  * buffer's TLS chain and, if so, queue the socket to a worker thread
  * for decryption.
  *
  * The socket buffer must be locked and have SB_TLS_RX set.  Nothing
  * is done if the socket is already marked SB_TLS_RX_RUNNING.  If the
  * receive side is shut down while only part of a header or record is
  * queued, so_error is set to EMSGSIZE.
  */
 void
 ktls_check_rx(struct sockbuf *sb)
 {
 	struct tls_record_layer hdr;
 	struct ktls_wq *wq;
 	struct socket *so;
 	bool running;
 
 	SOCKBUF_LOCK_ASSERT(sb);
 	KASSERT(sb->sb_flags & SB_TLS_RX, ("%s: sockbuf %p isn't TLS RX",
 	    __func__, sb));
 	so = __containerof(sb, struct socket, so_rcv);
 
 	/* Already queued to, or being handled by, a worker. */
 	if (sb->sb_flags & SB_TLS_RX_RUNNING)
 		return;
 
 	/* Is there enough queued for a TLS header? */
 	if (sb->sb_tlscc < sizeof(hdr)) {
 		if ((sb->sb_state & SBS_CANTRCVMORE) != 0 && sb->sb_tlscc != 0)
 			so->so_error = EMSGSIZE;
 		return;
 	}
 
 	m_copydata(sb->sb_mtls, 0, sizeof(hdr), (void *)&hdr);
 
 	/* Is the entire record queued? */
 	if (sb->sb_tlscc < sizeof(hdr) + ntohs(hdr.tls_length)) {
 		if ((sb->sb_state & SBS_CANTRCVMORE) != 0)
 			so->so_error = EMSGSIZE;
 		return;
 	}
 
 	sb->sb_flags |= SB_TLS_RX_RUNNING;
 
 	/* Take a socket reference for the work queue entry. */
 	soref(so);
 	wq = &ktls_wq[so->so_rcv.sb_tls_info->wq_index];
 	mtx_lock(&wq->mtx);
 	STAILQ_INSERT_TAIL(&wq->so_head, so, so_ktls_rx_list);
 	running = wq->running;
 	mtx_unlock(&wq->mtx);
 	/* Only wake the worker if it was not marked running. */
 	if (!running)
 		wakeup(wq);
 	counter_u64_add(ktls_cnt_rx_queued, 1);
 }
 
 /*
  * Detach the first 'len' bytes of the socket buffer's TLS chain and
  * return them as a separate mbuf chain.
  *
  * The socket buffer must be locked and 'len' must not exceed
  * sb_tlscc.  If the record boundary falls inside an mbuf, that mbuf
  * is split and a new mbuf holding the remainder becomes the head of
  * the TLS chain.  Allocating that mbuf may temporarily drop the
  * socket buffer lock; if the TLS chain changed meanwhile, NULL is
  * returned and the caller is expected to retry.  On success
  * sb_tlsdcc is set to 'len'.
  */
 static struct mbuf *
 ktls_detach_record(struct sockbuf *sb, int len)
 {
 	struct mbuf *m, *n, *top;
 	int remain;
 
 	SOCKBUF_LOCK_ASSERT(sb);
 	MPASS(len <= sb->sb_tlscc);
 
 	/*
 	 * If TLS chain is the exact size of the record,
 	 * just grab the whole record.
 	 */
 	top = sb->sb_mtls;
 	if (sb->sb_tlscc == len) {
 		sb->sb_mtls = NULL;
 		sb->sb_mtlstail = NULL;
 		goto out;
 	}
 
 	/*
 	 * While it would be nice to use m_split() here, we need
 	 * to know exactly what m_split() allocates to update the
 	 * accounting, so do it inline instead.
 	 */
 	/* Find the mbuf containing the last byte of the record. */
 	remain = len;
 	for (m = top; remain > m->m_len; m = m->m_next)
 		remain -= m->m_len;
 
 	/* Easy case: don't have to split 'm'. */
 	if (remain == m->m_len) {
 		sb->sb_mtls = m->m_next;
 		if (sb->sb_mtls == NULL)
 			sb->sb_mtlstail = NULL;
 		m->m_next = NULL;
 		goto out;
 	}
 
 	/*
 	 * Need to allocate an mbuf to hold the remainder of 'm'.  Try
 	 * with M_NOWAIT first.
 	 */
 	n = m_get(M_NOWAIT, MT_DATA);
 	if (n == NULL) {
 		/*
 		 * Use M_WAITOK with socket buffer unlocked.  If
 		 * 'sb_mtls' changes while the lock is dropped, return
 		 * NULL to force the caller to retry.
 		 */
 		SOCKBUF_UNLOCK(sb);
 
 		n = m_get(M_WAITOK, MT_DATA);
 
 		SOCKBUF_LOCK(sb);
 		if (sb->sb_mtls != top) {
 			m_free(n);
 			return (NULL);
 		}
 	}
 	n->m_flags |= M_NOTREADY;
 
 	/* Store remainder in 'n'. */
 	n->m_len = m->m_len - remain;
 	if (m->m_flags & M_EXT) {
 		/* Share the external storage rather than copying it. */
 		n->m_data = m->m_data + remain;
 		mb_dupcl(n, m);
 	} else {
 		bcopy(mtod(m, caddr_t) + remain, mtod(n, caddr_t), n->m_len);
 	}
 
 	/* Trim 'm' and update accounting. */
 	m->m_len -= n->m_len;
 	sb->sb_tlscc -= n->m_len;
 	sb->sb_ccc -= n->m_len;
 
 	/* Account for 'n'. */
 	sballoc_ktls_rx(sb, n);
 
 	/* Insert 'n' into the TLS chain. */
 	sb->sb_mtls = n;
 	n->m_next = m->m_next;
 	if (sb->sb_mtlstail == m)
 		sb->sb_mtlstail = n;
 
 	/* Detach the record from the TLS chain. */
 	m->m_next = NULL;
 
 out:
 	MPASS(m_length(top, NULL) == len);
 	/* Remove the detached mbufs from the TLS chain accounting. */
 	for (m = top; m != NULL; m = m->m_next)
 		sbfree_ktls_rx(sb, m);
 	/* Track the detached bytes as a record being decrypted. */
 	sb->sb_tlsdcc = len;
 	sb->sb_ccc += len;
 	SBCHECK(sb);
 	return (top);
 }
 
 /*
  * Decrypt the TLS records queued on a socket's receive buffer.
  *
  * While the buffer holds at least one complete record (judged by
  * sb_tlscc), each record is validated, detached from the buffer,
  * handed to the session's sw_decrypt hook with the socket buffer lock
  * dropped, stripped of its header and trailer, and appended back to
  * the buffer together with a TLS_GET_RECORD control message.
  *
  * A header with an unexpected version or an out-of-range length aborts
  * the connection.  A record that fails to decrypt is discarded and
  * reported to the socket as EBADMSG, and processing continues with the
  * next record.
  *
  * Expects SB_TLS_RX_RUNNING to be set on entry; it is cleared on the
  * normal exit path (the abort path jumps past that).  In all cases one
  * socket reference is released with sorele() before returning.
  */
 static void
 ktls_decrypt(struct socket *so)
 {
 	char tls_header[MBUF_PEXT_HDR_LEN];
 	struct ktls_session *tls;
 	struct sockbuf *sb;
 	struct tls_record_layer *hdr;
 	struct tls_get_record tgr;
 	struct mbuf *control, *data, *m;
 	uint64_t seqno;
 	int error, remain, tls_len, trail_len;
 
 	/* 'hdr' is a typed view of the local header copy filled in below. */
 	hdr = (struct tls_record_layer *)tls_header;
 	sb = &so->so_rcv;
 	SOCKBUF_LOCK(sb);
 	KASSERT(sb->sb_flags & SB_TLS_RX_RUNNING,
 	    ("%s: socket %p not running", __func__, so));
 
 	tls = sb->sb_tls_info;
 	MPASS(tls != NULL);
 
 	for (;;) {
 		/* Is there enough queued for a TLS header? */
 		if (sb->sb_tlscc < tls->params.tls_hlen)
 			break;
 
 		/*
 		 * Copy the header out of the queued chain and compute the
 		 * full record length: the fixed record-layer header plus
 		 * the length it advertises.
 		 */
 		m_copydata(sb->sb_mtls, 0, tls->params.tls_hlen, tls_header);
 		tls_len = sizeof(*hdr) + ntohs(hdr->tls_length);
 
 		if (hdr->tls_vmajor != tls->params.tls_vmajor ||
 		    hdr->tls_vminor != tls->params.tls_vminor)
 			error = EINVAL;
 		else if (tls_len < tls->params.tls_hlen || tls_len >
 		    tls->params.tls_hlen + TLS_MAX_MSG_SIZE_V10_2 +
 		    tls->params.tls_tlen)
 			error = EMSGSIZE;
 		else
 			error = 0;
 		if (__predict_false(error != 0)) {
 			/*
 			 * We have a corrupted record and are likely
 			 * out of sync.  The connection isn't
 			 * recoverable at this point, so abort it.
 			 */
 			SOCKBUF_UNLOCK(sb);
 			counter_u64_add(ktls_offload_corrupted_records, 1);
 
 			CURVNET_SET(so->so_vnet);
 			so->so_proto->pr_usrreqs->pru_abort(so);
 			so->so_error = error;
 			CURVNET_RESTORE();
 			/* Skips the SB_TLS_RX_RUNNING clear and wakeup below. */
 			goto deref;
 		}
 
 		/* Is the entire record queued? */
 		if (sb->sb_tlscc < tls_len)
 			break;
 
 		/*
 		 * Split out the portion of the mbuf chain containing
 		 * this TLS record.
 		 */
 		data = ktls_detach_record(sb, tls_len);
 		/* Nothing was detached; re-evaluate the buffer from the top. */
 		if (data == NULL)
 			continue;
 		MPASS(sb->sb_tlsdcc == tls_len);
 
 		/*
 		 * Claim this record's sequence number while the lock is
 		 * still held, then drop the lock for the decryption call.
 		 */
 		seqno = sb->sb_tls_seqno;
 		sb->sb_tls_seqno++;
 		SBCHECK(sb);
 		SOCKBUF_UNLOCK(sb);
 
 		error = tls->sw_decrypt(tls, hdr, data, seqno, &trail_len);
 		if (error) {
 			counter_u64_add(ktls_offload_failed_crypto, 1);
 
 			SOCKBUF_LOCK(sb);
 			if (sb->sb_tlsdcc == 0) {
 				/*
 				 * sbcut/drop/flush discarded these
 				 * mbufs.
 				 */
 				m_freem(data);
 				break;
 			}
 
 			/*
 			 * Drop this TLS record's data, but keep
 			 * decrypting subsequent records.
 			 */
 			sb->sb_ccc -= tls_len;
 			sb->sb_tlsdcc = 0;
 
 			/*
 			 * NOTE(review): the SOCKBUF_LOCK() further down
 			 * implies sorwakeup_locked() returns with the
 			 * socket buffer unlocked -- confirm.
 			 */
 			CURVNET_SET(so->so_vnet);
 			so->so_error = EBADMSG;
 			sorwakeup_locked(so);
 			CURVNET_RESTORE();
 
 			m_freem(data);
 
 			SOCKBUF_LOCK(sb);
 			continue;
 		}
 
 		/* Allocate the control mbuf. */
 		tgr.tls_type = hdr->tls_type;
 		tgr.tls_vmajor = hdr->tls_vmajor;
 		tgr.tls_vminor = hdr->tls_vminor;
 		/* Payload length (header and trailer excluded), big-endian. */
 		tgr.tls_length = htobe16(tls_len - tls->params.tls_hlen -
 		    trail_len);
 		control = sbcreatecontrol_how(&tgr, sizeof(tgr),
 		    TLS_GET_RECORD, IPPROTO_TCP, M_WAITOK);
 
 		SOCKBUF_LOCK(sb);
 		if (sb->sb_tlsdcc == 0) {
 			/* sbcut/drop/flush discarded these mbufs. */
 			MPASS(sb->sb_tlscc == 0);
 			m_freem(data);
 			m_freem(control);
 			break;
 		}
 
 		/*
 		 * Clear the 'dcc' accounting in preparation for
 		 * adding the decrypted record.
 		 */
 		sb->sb_ccc -= tls_len;
 		sb->sb_tlsdcc = 0;
 		SBCHECK(sb);
 
 		/* If there is no payload, drop all of the data. */
 		if (tgr.tls_length == htobe16(0)) {
 			m_freem(data);
 			data = NULL;
 		} else {
 			/* Trim header. */
 			remain = tls->params.tls_hlen;
 			while (remain > 0) {
 				if (data->m_len > remain) {
 					data->m_data += remain;
 					data->m_len -= remain;
 					break;
 				}
 				/* Whole mbuf is header bytes; free it and move on. */
 				remain -= data->m_len;
 				data = m_free(data);
 			}
 
 			/* Trim trailer and clear M_NOTREADY. */
 			remain = be16toh(tgr.tls_length);
 			/* Redundant with the for-loop initializer just below. */
 			m = data;
 			for (m = data; remain > m->m_len; m = m->m_next) {
 				m->m_flags &= ~M_NOTREADY;
 				remain -= m->m_len;
 			}
 			/*
 			 * 'm' is the mbuf holding the last payload byte:
 			 * shorten it and free everything chained after it.
 			 */
 			m->m_len = remain;
 			m_freem(m->m_next);
 			m->m_next = NULL;
 			m->m_flags &= ~M_NOTREADY;
 
 			/* Set EOR on the final mbuf. */
 			m->m_flags |= M_EOR;
 		}
 
 		/* 'data' is NULL here for a record with no payload. */
 		sbappendcontrol_locked(sb, data, control, 0);
 	}
 
 	sb->sb_flags &= ~SB_TLS_RX_RUNNING;
 
 	/*
 	 * Undecrypted bytes remain but the buffer can receive no more:
 	 * the leftover can never become a complete record.
 	 */
 	if ((sb->sb_state & SBS_CANTRCVMORE) != 0 && sb->sb_tlscc > 0)
 		so->so_error = EMSGSIZE;
 
 	sorwakeup_locked(so);
 
 deref:
 	SOCKBUF_UNLOCK_ASSERT(sb);
 
 	/* Release one socket reference; the caller's 'so' may be gone after this. */
 	CURVNET_SET(so->so_vnet);
 	SOCK_LOCK(so);
 	sorele(so);
 	CURVNET_RESTORE();
 }
 
 /*
  * Queue an mbuf on its TLS session's work queue so that the work
  * thread releases it.  The mbuf is tagged with EPG_FLAG_2FREE, which
  * is what tells the work thread to free it rather than encrypt it.
  */
 void
 ktls_enqueue_to_free(struct mbuf *m)
 {
 	struct ktls_wq *queue;
 	bool worker_active;
 
 	/* Tag first: the worker inspects the flag once it dequeues 'm'. */
 	m->m_epg_flags |= EPG_FLAG_2FREE;
 
 	queue = &ktls_wq[m->m_epg_tls->wq_index];
 
 	mtx_lock(&queue->mtx);
 	worker_active = queue->running;
 	STAILQ_INSERT_TAIL(&queue->m_head, m, m_epg_stailq);
 	mtx_unlock(&queue->mtx);
 
 	/* An idle worker sleeps on the queue's address; nudge it. */
 	if (worker_active)
 		return;
 	wakeup(queue);
 }
 
 /*
  * Queue a not-yet-ready unmapped (M_EXTPG) mbuf for software
  * encryption by its TLS session's work thread.
  *
  * 'page_count' is stored in the mbuf as the number of pages the
  * encryption pass must cover, and 'so' is stored so the worker can
  * find the socket again.  The caller must already have taken an extra
  * socket reference with soref() on behalf of the queued work.
  */
 void
 ktls_enqueue(struct mbuf *m, struct socket *so, int page_count)
 {
 	struct ktls_wq *queue;
 	bool worker_active;
 
 	KASSERT(((m->m_flags & (M_EXTPG | M_NOTREADY)) ==
 	    (M_EXTPG | M_NOTREADY)),
 	    ("ktls_enqueue: %p not unready & nomap mbuf\n", m));
 	KASSERT(page_count != 0, ("enqueueing TLS mbuf with zero page count"));
 	KASSERT(m->m_epg_tls->mode == TCP_TLS_MODE_SW, ("!SW TLS mbuf"));
 
 	/*
 	 * Record what the worker needs: the socket (the reference itself
 	 * is the caller's job, see above) and the page budget.
 	 */
 	m->m_epg_so = so;
 	m->m_epg_enc_cnt = page_count;
 
 	queue = &ktls_wq[m->m_epg_tls->wq_index];
 
 	mtx_lock(&queue->mtx);
 	worker_active = queue->running;
 	STAILQ_INSERT_TAIL(&queue->m_head, m, m_epg_stailq);
 	mtx_unlock(&queue->mtx);
 
 	/* An idle worker sleeps on the queue's address; nudge it. */
 	if (!worker_active)
 		wakeup(queue);
 
 	counter_u64_add(ktls_cnt_tx_queued, 1);
 }
 
 /*
  * Software-encrypt the run of TLS records that was queued with 'top'.
  *
  * The chain is walked through m_next until top->m_epg_enc_cnt pages
  * have been accounted for; each mbuf visited holds one TLS record.
  * Anonymous (writable) mbufs are encrypted in place.  File-backed
  * mbufs are encrypted into freshly allocated wired pages, which then
  * replace the file-backed pages in the mbuf.
  *
  * On success the protocol's pru_ready hook is told how many pages are
  * now ready.  If any record fails to encrypt, the connection is
  * aborted, so_error is set to EIO and the not-ready mbufs are freed.
  * Either way the socket reference associated with the queued work is
  * released with sorele().
  */
 static __noinline void
 ktls_encrypt(struct mbuf *top)
 {
 	struct ktls_session *tls;
 	struct socket *so;
 	struct mbuf *m;
 	vm_paddr_t parray[1 + btoc(TLS_MAX_MSG_SIZE_V10_2)];
 	struct iovec src_iov[1 + btoc(TLS_MAX_MSG_SIZE_V10_2)];
 	struct iovec dst_iov[1 + btoc(TLS_MAX_MSG_SIZE_V10_2)];
 	vm_page_t pg;
 	int error, i, len, npages, off, total_pages;
 	bool is_anon;
 
 	so = top->m_epg_so;
 	tls = top->m_epg_tls;
 	KASSERT(tls != NULL, ("tls = NULL, top = %p\n", top));
 	KASSERT(so != NULL, ("so = NULL, top = %p\n", top));
 #ifdef INVARIANTS
 	top->m_epg_so = NULL;
 #endif
 	total_pages = top->m_epg_enc_cnt;
 	npages = 0;
 
 	/*
 	 * Encrypt the TLS records in the chain of mbufs starting with
 	 * 'top'.  'total_pages' gives us a total count of pages and is
 	 * used to know when we have finished encrypting the TLS
 	 * records originally queued with 'top'.
 	 *
 	 * NB: These mbufs are queued in the socket buffer and
 	 * 'm_next' is traversing the mbufs in the socket buffer.  The
 	 * socket buffer lock is not held while traversing this chain.
 	 * Since the mbufs are all marked M_NOTREADY their 'm_next'
 	 * pointers should be stable.  However, the 'm_next' of the
 	 * last mbuf encrypted is not necessarily NULL.  It can point
 	 * to other mbufs appended while 'top' was on the TLS work
 	 * queue.
 	 *
 	 * Each mbuf holds an entire TLS record.
 	 */
 	error = 0;
 	for (m = top; npages != total_pages; m = m->m_next) {
 		KASSERT(m->m_epg_tls == tls,
 		    ("different TLS sessions in a single mbuf chain: %p vs %p",
 		    tls, m->m_epg_tls));
 		KASSERT((m->m_flags & (M_EXTPG | M_NOTREADY)) ==
 		    (M_EXTPG | M_NOTREADY),
 		    ("%p not unready & nomap mbuf (top = %p)\n", m, top));
 		KASSERT(npages + m->m_epg_npgs <= total_pages,
 		    ("page count mismatch: top %p, total_pages %d, m %p", top,
 		    total_pages, m));
 
 		/*
 		 * Generate source and destination iovecs to pass to
 		 * the SW encryption backend.  For writable mbufs, the
 		 * destination iovec is a copy of the source and
 		 * encryption is done in place.  For file-backed mbufs
 		 * (from sendfile), anonymous wired pages are
 		 * allocated and assigned to the destination iovec.
 		 */
 		is_anon = (m->m_epg_flags & EPG_FLAG_ANON) != 0;
 
 		/* Only the first page starts at a non-zero offset. */
 		off = m->m_epg_1st_off;
 		for (i = 0; i < m->m_epg_npgs; i++, off = 0) {
 			len = m_epg_pagelen(m, i, off);
 			src_iov[i].iov_len = len;
 			src_iov[i].iov_base =
 			    (char *)(void *)PHYS_TO_DMAP(m->m_epg_pa[i]) +
 				off;
 
 			if (is_anon) {
 				dst_iov[i].iov_base = src_iov[i].iov_base;
 				dst_iov[i].iov_len = src_iov[i].iov_len;
 				continue;
 			}
 retry_page:
 			/* Keep retrying until a replacement page is available. */
 			pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
 			    VM_ALLOC_NOOBJ | VM_ALLOC_NODUMP | VM_ALLOC_WIRED);
 			if (pg == NULL) {
 				vm_wait(NULL);
 				goto retry_page;
 			}
 			parray[i] = VM_PAGE_TO_PHYS(pg);
 			dst_iov[i].iov_base =
 			    (char *)(void *)PHYS_TO_DMAP(parray[i]) + off;
 			dst_iov[i].iov_len = len;
 		}
 
 		if (__predict_false(m->m_epg_npgs == 0)) {
 			/* TLS 1.0 empty fragment. */
 			npages++;
 		} else
 			npages += i;
 
 		error = (*tls->sw_encrypt)(tls,
 		    (const struct tls_record_layer *)m->m_epg_hdr,
 		    m->m_epg_trail, src_iov, dst_iov, i, m->m_epg_seqno,
 		    m->m_epg_record_type);
 		if (error) {
 			counter_u64_add(ktls_offload_failed_crypto, 1);
 
 			/*
 			 * For a file-backed mbuf the replacement pages
 			 * allocated above have not been installed in
 			 * the mbuf yet, so freeing the mbuf chain
 			 * below will not release them.  Unwire and
 			 * free them here to avoid leaking wired pages.
 			 */
 			if (!is_anon) {
 				for (i = 0; i < m->m_epg_npgs; i++) {
 					pg = PHYS_TO_VM_PAGE(parray[i]);
 					(void)vm_page_unwire_noq(pg);
 					vm_page_free(pg);
 				}
 			}
 			break;
 		}
 
 		/*
 		 * For file-backed mbufs, release the file-backed
 		 * pages and replace them in the ext_pgs array with
 		 * the anonymous wired pages allocated above.
 		 */
 		if (!is_anon) {
 			/* Free the old pages. */
 			m->m_ext.ext_free(m);
 
 			/* Replace them with the new pages. */
 			for (i = 0; i < m->m_epg_npgs; i++)
 				m->m_epg_pa[i] = parray[i];
 
 			/* Use the basic free routine. */
 			m->m_ext.ext_free = mb_free_mext_pgs;
 
 			/* Pages are now writable. */
 			m->m_epg_flags |= EPG_FLAG_ANON;
 		}
 
 		/*
 		 * Drop a reference to the session now that it is no
 		 * longer needed.  Existing code depends on encrypted
 		 * records having no associated session vs
 		 * yet-to-be-encrypted records having an associated
 		 * session.
 		 */
 		m->m_epg_tls = NULL;
 		ktls_free(tls);
 	}
 
 	CURVNET_SET(so->so_vnet);
 	if (error == 0) {
 		(void)(*so->so_proto->pr_usrreqs->pru_ready)(so, top, npages);
 	} else {
 		so->so_proto->pr_usrreqs->pru_abort(so);
 		so->so_error = EIO;
 		mb_free_notready(top, total_pages);
 	}
 
 	/* Release the socket reference that accompanied the queued work. */
 	SOCK_LOCK(so);
 	sorele(so);
 	CURVNET_RESTORE();
 }
 
 static void
 ktls_work_thread(void *ctx)
 {
 	struct ktls_wq *wq = ctx;
 	struct mbuf *m, *n;
 	struct socket *so, *son;
 	STAILQ_HEAD(, mbuf) local_m_head;
 	STAILQ_HEAD(, socket) local_so_head;
 
 #if defined(__aarch64__) || defined(__amd64__) || defined(__i386__)
 	fpu_kern_thread(0);
 #endif
 	for (;;) {
 		mtx_lock(&wq->mtx);
 		while (STAILQ_EMPTY(&wq->m_head) &&
 		    STAILQ_EMPTY(&wq->so_head)) {
 			wq->running = false;
 			mtx_sleep(wq, &wq->mtx, 0, "-", 0);
 			wq->running = true;
 		}
 
 		STAILQ_INIT(&local_m_head);
 		STAILQ_CONCAT(&local_m_head, &wq->m_head);
 		STAILQ_INIT(&local_so_head);
 		STAILQ_CONCAT(&local_so_head, &wq->so_head);
 		mtx_unlock(&wq->mtx);
 
 		STAILQ_FOREACH_SAFE(m, &local_m_head, m_epg_stailq, n) {
 			if (m->m_epg_flags & EPG_FLAG_2FREE) {
 				ktls_free(m->m_epg_tls);
 				uma_zfree(zone_mbuf, m);
 			} else {
 				ktls_encrypt(m);
 				counter_u64_add(ktls_cnt_tx_queued, -1);
 			}
 		}
 
 		STAILQ_FOREACH_SAFE(so, &local_so_head, so_ktls_rx_list, son) {
 			ktls_decrypt(so);
 			counter_u64_add(ktls_cnt_rx_queued, -1);
 		}
 	}
 }
diff --git a/sys/net/if.h b/sys/net/if.h
index be306dfbb378..6d9cc906d383 100644
--- a/sys/net/if.h
+++ b/sys/net/if.h
@@ -1,621 +1,622 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)if.h	8.1 (Berkeley) 6/10/93
  * $FreeBSD$
  */
 
 #ifndef _NET_IF_H_
 #define	_NET_IF_H_
 
 #include <sys/cdefs.h>
 
 #if __BSD_VISIBLE
 /*
  * <net/if.h> does not depend on <sys/time.h> on most other systems.  This
  * helps userland compatibility.  (struct timeval ifi_lastchange)
  * The same holds for <sys/socket.h>.  (struct sockaddr ifru_addr)
  */
 #ifndef _KERNEL
 #include <sys/time.h>
 #include <sys/socket.h>
 #endif
 #endif
 
 /*
  * Length of interface external name, including terminating '\0'.
  * Note: this is the same size as a generic device's external name.
  */
 #define		IF_NAMESIZE	16
 #if __BSD_VISIBLE
 #define		IFNAMSIZ	IF_NAMESIZE
 #define		IF_MAXUNIT	0x7fff	/* historical value */
 #endif
 #if __BSD_VISIBLE
 
 /*
  * Structure used to query names of interface cloners.
  */
 
 struct if_clonereq {
 	int	ifcr_total;		/* total cloners (out) */
 	int	ifcr_count;		/* room for this many in user buffer */
 	char	*ifcr_buffer;		/* buffer for cloner names */
 };
 
 /*
  * Structure describing information about an interface
  * which may be of interest to management entities.
  */
 struct if_data {
 	/* generic interface information */
 	uint8_t	ifi_type;		/* ethernet, tokenring, etc */
 	uint8_t	ifi_physical;		/* e.g., AUI, Thinnet, 10base-T, etc */
 	uint8_t	ifi_addrlen;		/* media address length */
 	uint8_t	ifi_hdrlen;		/* media header length */
 	uint8_t	ifi_link_state;		/* current link state */
 	uint8_t	ifi_vhid;		/* carp vhid */
 	uint16_t	ifi_datalen;	/* length of this data struct */
 	uint32_t	ifi_mtu;	/* maximum transmission unit */
 	uint32_t	ifi_metric;	/* routing metric (external only) */
 	uint64_t	ifi_baudrate;	/* linespeed */
 	/* volatile statistics */
 	uint64_t	ifi_ipackets;	/* packets received on interface */
 	uint64_t	ifi_ierrors;	/* input errors on interface */
 	uint64_t	ifi_opackets;	/* packets sent on interface */
 	uint64_t	ifi_oerrors;	/* output errors on interface */
 	uint64_t	ifi_collisions;	/* collisions on csma interfaces */
 	uint64_t	ifi_ibytes;	/* total number of octets received */
 	uint64_t	ifi_obytes;	/* total number of octets sent */
 	uint64_t	ifi_imcasts;	/* packets received via multicast */
 	uint64_t	ifi_omcasts;	/* packets sent via multicast */
 	uint64_t	ifi_iqdrops;	/* dropped on input */
 	uint64_t	ifi_oqdrops;	/* dropped on output */
 	uint64_t	ifi_noproto;	/* destined for unsupported protocol */
 	uint64_t	ifi_hwassist;	/* HW offload capabilities, see IFCAP */
 
 	/* Unions are here to make sizes MI. */
 	union {				/* uptime at attach or stat reset */
 		time_t		tt;
 		uint64_t	ph;
 	} __ifi_epoch;
 #define	ifi_epoch	__ifi_epoch.tt
 	union {				/* time of last administrative change */
 		struct timeval	tv;
 		struct {
 			uint64_t ph1;
 			uint64_t ph2;
 		} ph;
 	} __ifi_lastchange;
 #define	ifi_lastchange	__ifi_lastchange.tv
 };
 
 /*-
  * Interface flags are of two types: network stack owned flags, and driver
  * owned flags.  Historically, these values were stored in the same ifnet
  * flags field, but with the advent of fine-grained locking, they have been
  * broken out such that the network stack is responsible for synchronizing
  * the stack-owned fields, and the device driver the device-owned fields.
  * Both halves can perform lockless reads of the other half's field, subject
  * to accepting the involved races.
  *
  * Both sets of flags come from the same number space, and should not be
  * permitted to conflict, as they are exposed to user space via a single
  * field.
  *
  * The following symbols identify read and write requirements for fields:
  *
  * (i) if_flags field set by device driver before attach, read-only there
  *     after.
  * (n) if_flags field written only by the network stack, read by either the
  *     stack or driver.
  * (d) if_drv_flags field written only by the device driver, read by either
  *     the stack or driver.
  */
 #define	IFF_UP		0x1		/* (n) interface is up */
 #define	IFF_BROADCAST	0x2		/* (i) broadcast address valid */
 #define	IFF_DEBUG	0x4		/* (n) turn on debugging */
 #define	IFF_LOOPBACK	0x8		/* (i) is a loopback net */
 #define	IFF_POINTOPOINT	0x10		/* (i) is a point-to-point link */
 #define	IFF_KNOWSEPOCH	0x20		/* (i) calls if_input in net epoch */
 #define	IFF_DRV_RUNNING	0x40		/* (d) resources allocated */
 #define	IFF_NOARP	0x80		/* (n) no address resolution protocol */
 #define	IFF_PROMISC	0x100		/* (n) receive all packets */
 #define	IFF_ALLMULTI	0x200		/* (n) receive all multicast packets */
 #define	IFF_DRV_OACTIVE	0x400		/* (d) tx hardware queue is full */
 #define	IFF_SIMPLEX	0x800		/* (i) can't hear own transmissions */
 #define	IFF_LINK0	0x1000		/* per link layer defined bit */
 #define	IFF_LINK1	0x2000		/* per link layer defined bit */
 #define	IFF_LINK2	0x4000		/* per link layer defined bit */
 #define	IFF_ALTPHYS	IFF_LINK2	/* use alternate physical connection */
 #define	IFF_MULTICAST	0x8000		/* (i) supports multicast */
 #define	IFF_CANTCONFIG	0x10000		/* (i) unconfigurable using ioctl(2) */
 #define	IFF_PPROMISC	0x20000		/* (n) user-requested promisc mode */
 #define	IFF_MONITOR	0x40000		/* (n) user-requested monitor mode */
 #define	IFF_STATICARP	0x80000		/* (n) static ARP */
 #define	IFF_DYING	0x200000	/* (n) interface is winding down */
 #define	IFF_RENAMING	0x400000	/* (n) interface is being renamed */
 #define	IFF_NOGROUP	0x800000	/* (n) interface is not part of any groups */
 
 /*
  * Old names for driver flags so that user space tools can continue to use
  * the old (portable) names.
  */
 #ifndef _KERNEL
 #define	IFF_RUNNING	IFF_DRV_RUNNING
 #define	IFF_OACTIVE	IFF_DRV_OACTIVE
 #endif
 
 /* flags set internally only: */
 #define	IFF_CANTCHANGE \
 	(IFF_BROADCAST|IFF_POINTOPOINT|IFF_DRV_RUNNING|IFF_DRV_OACTIVE|\
 	    IFF_SIMPLEX|IFF_MULTICAST|IFF_ALLMULTI|IFF_PROMISC|\
 	    IFF_DYING|IFF_CANTCONFIG|IFF_KNOWSEPOCH)
 
 /*
  * Values for if_link_state.
  */
 #define	LINK_STATE_UNKNOWN	0	/* link invalid/unknown */
 #define	LINK_STATE_DOWN		1	/* link is down */
 #define	LINK_STATE_UP		2	/* link is up */
 
 /*
  * Some convenience macros used for setting ifi_baudrate.
  * XXX 1000 vs. 1024? --thorpej@netbsd.org
  */
 #define	IF_Kbps(x)	((uintmax_t)(x) * 1000)	/* kilobits/sec. */
 #define	IF_Mbps(x)	(IF_Kbps((x) * 1000))	/* megabits/sec. */
 #define	IF_Gbps(x)	(IF_Mbps((x) * 1000))	/* gigabits/sec. */
 
 /*
  * Capabilities that interfaces can advertise.
  *
  * struct ifnet.if_capabilities
  *   contains the optional features & capabilities a particular interface
  *   supports (not only the driver but also the detected hw revision).
  *   Capabilities are defined by IFCAP_* below.
  * struct ifnet.if_capenable
  *   contains the enabled (either by default or through ifconfig) optional
  *   features & capabilities on this interface.
  *   Capabilities are defined by IFCAP_* below.
  * struct if_data.ifi_hwassist in mbuf CSUM_ flag form, controlled by above
  *   contains the enabled optional features & capabilities that can be used
  *   individually per packet and are specified in the mbuf pkthdr.csum_flags
  *   field.  IFCAP_* and CSUM_* do not match one to one and CSUM_* may be
  *   more detailed or differentiated than IFCAP_*.
  *   Hwassist features are defined CSUM_* in sys/mbuf.h
  *
  * Capabilities that cannot be arbitrarily changed with ifconfig/ioctl
  * are listed in IFCAP_CANTCHANGE, similar to IFF_CANTCHANGE.
  * This is not strictly necessary because the common code never
  * changes capabilities, and it is left to the individual driver
  * to do the right thing. However, having the filter here
  * avoids replication of the same code in all individual drivers.
  */
 #define	IFCAP_RXCSUM		0x00001  /* can offload checksum on RX */
 #define	IFCAP_TXCSUM		0x00002  /* can offload checksum on TX */
 #define	IFCAP_NETCONS		0x00004  /* can be a network console */
 #define	IFCAP_VLAN_MTU		0x00008	/* VLAN-compatible MTU */
 #define	IFCAP_VLAN_HWTAGGING	0x00010	/* hardware VLAN tag support */
 #define	IFCAP_JUMBO_MTU		0x00020	/* 9000 byte MTU supported */
 #define	IFCAP_POLLING		0x00040	/* driver supports polling */
 #define	IFCAP_VLAN_HWCSUM	0x00080	/* can do IFCAP_HWCSUM on VLANs */
 #define	IFCAP_TSO4		0x00100	/* can do TCP Segmentation Offload */
 #define	IFCAP_TSO6		0x00200	/* can do TCP6 Segmentation Offload */
 #define	IFCAP_LRO		0x00400	/* can do Large Receive Offload */
 #define	IFCAP_WOL_UCAST		0x00800	/* wake on any unicast frame */
 #define	IFCAP_WOL_MCAST		0x01000	/* wake on any multicast frame */
 #define	IFCAP_WOL_MAGIC		0x02000	/* wake on any Magic Packet */
 #define	IFCAP_TOE4		0x04000	/* interface can offload TCP */
 #define	IFCAP_TOE6		0x08000	/* interface can offload TCP6 */
 #define	IFCAP_VLAN_HWFILTER	0x10000 /* interface hw can filter vlan tag */
 /* 	available		0x20000 */
 #define	IFCAP_VLAN_HWTSO	0x40000 /* can do IFCAP_TSO on VLANs */
 #define	IFCAP_LINKSTATE		0x80000 /* the runtime link state is dynamic */
 #define	IFCAP_NETMAP		0x100000 /* netmap mode supported/enabled */
 #define	IFCAP_RXCSUM_IPV6	0x200000  /* can offload checksum on IPv6 RX */
 #define	IFCAP_TXCSUM_IPV6	0x400000  /* can offload checksum on IPv6 TX */
 #define	IFCAP_HWSTATS		0x800000 /* manages counters internally */
 #define	IFCAP_TXRTLMT		0x1000000 /* hardware supports TX rate limiting */
 #define	IFCAP_HWRXTSTMP		0x2000000 /* hardware rx timestamping */
 #define	IFCAP_NOMAP		0x4000000 /* can TX unmapped mbufs */
 #define	IFCAP_TXTLS4		0x8000000 /* can do TLS encryption and segmentation for TCP */
 #define	IFCAP_TXTLS6		0x10000000 /* can do TLS encryption and segmentation for TCP6 */
 #define	IFCAP_VXLAN_HWCSUM	0x20000000 /* can do IFCAP_HWCSUM on VXLANs */
 #define	IFCAP_VXLAN_HWTSO	0x40000000 /* can do IFCAP_TSO on VXLANs */
+#define	IFCAP_TXTLS_RTLMT	0x80000000 /* can do TLS with rate limiting */
 
 #define IFCAP_HWCSUM_IPV6	(IFCAP_RXCSUM_IPV6 | IFCAP_TXCSUM_IPV6)
 
 #define IFCAP_HWCSUM	(IFCAP_RXCSUM | IFCAP_TXCSUM)
 #define	IFCAP_TSO	(IFCAP_TSO4 | IFCAP_TSO6)
 #define	IFCAP_WOL	(IFCAP_WOL_UCAST | IFCAP_WOL_MCAST | IFCAP_WOL_MAGIC)
 #define	IFCAP_TOE	(IFCAP_TOE4 | IFCAP_TOE6)
 #define	IFCAP_TXTLS	(IFCAP_TXTLS4 | IFCAP_TXTLS6)
 
 #define	IFCAP_CANTCHANGE	(IFCAP_NETMAP)
 
 #define	IFQ_MAXLEN	50
 #define	IFNET_SLOWHZ	1		/* granularity is 1 second */
 
 /*
  * Message format for use in obtaining information about interfaces
  * from getkerninfo and the routing socket
  * For the new, extensible interface see struct if_msghdrl below.
  */
 struct if_msghdr {
 	u_short	ifm_msglen;	/* to skip over non-understood messages */
 	u_char	ifm_version;	/* future binary compatibility */
 	u_char	ifm_type;	/* message type */
 	int	ifm_addrs;	/* like rtm_addrs */
 	int	ifm_flags;	/* value of if_flags */
 	u_short	ifm_index;	/* index for associated ifp */
 	u_short	_ifm_spare1;
 	struct	if_data ifm_data;/* statistics and other data about if */
 };
 
 /*
  * The 'l' version shall be used by new interfaces, like NET_RT_IFLISTL.  It is
  * extensible after ifm_data_off or within ifm_data.  Both the if_msghdr and
  * if_data now have a member field detailing the struct length in addition to
  * the routing message length.  Macros are provided to find the start of
  * ifm_data and the start of the socket address structures immediately following
  * struct if_msghdrl given a pointer to struct if_msghdrl.
  *
  * Each expansion is wrapped in parentheses so that the result can be used
  * directly as an operand, e.g. IF_MSGHDRL_IFM_DATA(p)->ifi_mtu.
  */
 #define	IF_MSGHDRL_IFM_DATA(_l) \
     ((struct if_data *)((char *)(_l) + (_l)->ifm_data_off))
 #define	IF_MSGHDRL_RTA(_l) \
     ((void *)((uintptr_t)(_l) + (_l)->ifm_len))
 struct if_msghdrl {
 	u_short	ifm_msglen;	/* to skip over non-understood messages */
 	u_char	ifm_version;	/* future binary compatibility */
 	u_char	ifm_type;	/* message type */
 	int	ifm_addrs;	/* like rtm_addrs */
 	int	ifm_flags;	/* value of if_flags */
 	u_short	ifm_index;	/* index for associated ifp */
 	u_short _ifm_spare1;	/* spare space to grow if_index, see if_var.h */
 	u_short	ifm_len;	/* length of if_msghdrl incl. if_data */
 	u_short	ifm_data_off;	/* offset of if_data from beginning */
 	int	_ifm_spare2;
 	struct	if_data ifm_data;/* statistics and other data about if */
 };
 
 /*
  * Message format for use in obtaining information about interface addresses
  * from getkerninfo and the routing socket
  * For the new, extensible interface see struct ifa_msghdrl below.
  */
 struct ifa_msghdr {
 	u_short	ifam_msglen;	/* to skip over non-understood messages */
 	u_char	ifam_version;	/* future binary compatibility */
 	u_char	ifam_type;	/* message type */
 	int	ifam_addrs;	/* like rtm_addrs */
 	int	ifam_flags;	/* value of ifa_flags */
 	u_short	ifam_index;	/* index for associated ifp */
 	u_short	_ifam_spare1;
 	int	ifam_metric;	/* value of ifa_ifp->if_metric */
 };
 
 /*
  * The 'l' version shall be used by new interfaces, like NET_RT_IFLISTL.  It is
  * extensible after ifam_metric or within ifam_data.  Both the ifa_msghdrl and
  * if_data now have a member field detailing the struct length in addition to
  * the routing message length.  Macros are provided to find the start of
  * ifam_data and the start of the socket address structures immediately following
  * struct ifa_msghdrl given a pointer to struct ifa_msghdrl.
  *
  * Each expansion is wrapped in parentheses so that the result can be used
  * directly as an operand, e.g. IFA_MSGHDRL_IFAM_DATA(p)->ifi_mtu.
  */
 #define	IFA_MSGHDRL_IFAM_DATA(_l) \
     ((struct if_data *)((char *)(_l) + (_l)->ifam_data_off))
 #define	IFA_MSGHDRL_RTA(_l) \
     ((void *)((uintptr_t)(_l) + (_l)->ifam_len))
 struct ifa_msghdrl {
 	u_short	ifam_msglen;	/* to skip over non-understood messages */
 	u_char	ifam_version;	/* future binary compatibility */
 	u_char	ifam_type;	/* message type */
 	int	ifam_addrs;	/* like rtm_addrs */
 	int	ifam_flags;	/* value of ifa_flags */
 	u_short	ifam_index;	/* index for associated ifp */
 	u_short _ifam_spare1;	/* spare space to grow if_index, see if_var.h */
 	u_short	ifam_len;	/* length of ifa_msghdrl incl. if_data */
 	u_short	ifam_data_off;	/* offset of if_data from beginning */
 	int	ifam_metric;	/* value of ifa_ifp->if_metric */
 	struct	if_data ifam_data;/* statistics and other data about if or
 				 * address */
 };
 
 /*
  * Message format for use in obtaining information about multicast addresses
  * from the routing socket
  */
 struct ifma_msghdr {
 	u_short	ifmam_msglen;	/* to skip over non-understood messages */
 	u_char	ifmam_version;	/* future binary compatibility */
 	u_char	ifmam_type;	/* message type */
 	int	ifmam_addrs;	/* like rtm_addrs */
 	int	ifmam_flags;	/* value of ifa_flags */
 	u_short	ifmam_index;	/* index for associated ifp */
 	u_short	_ifmam_spare1;
 };
 
 /*
  * Message format announcing the arrival or departure of a network interface.
  */
 struct if_announcemsghdr {
 	u_short	ifan_msglen;	/* to skip over non-understood messages */
 	u_char	ifan_version;	/* future binary compatibility */
 	u_char	ifan_type;	/* message type */
 	u_short	ifan_index;	/* index for associated ifp */
 	char	ifan_name[IFNAMSIZ]; /* if name, e.g. "en0" */
 	u_short	ifan_what;	/* what type of announcement */
 };
 
 #define	IFAN_ARRIVAL	0	/* interface arrival */
 #define	IFAN_DEPARTURE	1	/* interface departure */
 
 /*
  * Buffer with length to be used in SIOCGIFDESCR/SIOCSIFDESCR requests
  */
 struct ifreq_buffer {
 	size_t	length;
 	void	*buffer;
 };
 
 /*
  * Interface request structure used for socket
  * ioctl's.  All interface ioctl's must have parameter
  * definitions which begin with ifr_name.  The
  * remainder may be interface specific.
  */
 struct	ifreq {
 	char	ifr_name[IFNAMSIZ];		/* if name, e.g. "en0" */
 	union {
 		struct	sockaddr ifru_addr;
 		struct	sockaddr ifru_dstaddr;
 		struct	sockaddr ifru_broadaddr;
 		struct	ifreq_buffer ifru_buffer;
 		short	ifru_flags[2];
 		short	ifru_index;
 		int	ifru_jid;
 		int	ifru_metric;
 		int	ifru_mtu;
 		int	ifru_phys;
 		int	ifru_media;
 		caddr_t	ifru_data;
 		int	ifru_cap[2];
 		u_int	ifru_fib;
 		u_char	ifru_vlan_pcp;
 	} ifr_ifru;
 #define	ifr_addr	ifr_ifru.ifru_addr	/* address */
 #define	ifr_dstaddr	ifr_ifru.ifru_dstaddr	/* other end of p-to-p link */
 #define	ifr_broadaddr	ifr_ifru.ifru_broadaddr	/* broadcast address */
 #ifndef _KERNEL
 #define	ifr_buffer	ifr_ifru.ifru_buffer	/* user supplied buffer with its length */
 #endif
 #define	ifr_flags	ifr_ifru.ifru_flags[0]	/* flags (low 16 bits) */
 #define	ifr_flagshigh	ifr_ifru.ifru_flags[1]	/* flags (high 16 bits) */
 #define	ifr_jid		ifr_ifru.ifru_jid	/* jail/vnet */
 #define	ifr_metric	ifr_ifru.ifru_metric	/* metric */
 #define	ifr_mtu		ifr_ifru.ifru_mtu	/* mtu */
 #define ifr_phys	ifr_ifru.ifru_phys	/* physical wire */
 #define ifr_media	ifr_ifru.ifru_media	/* physical media */
 #ifndef _KERNEL
 #define	ifr_data	ifr_ifru.ifru_data	/* for use by interface */
 #endif
 #define	ifr_reqcap	ifr_ifru.ifru_cap[0]	/* requested capabilities */
 #define	ifr_curcap	ifr_ifru.ifru_cap[1]	/* current capabilities */
 #define	ifr_index	ifr_ifru.ifru_index	/* interface index */
 #define	ifr_fib		ifr_ifru.ifru_fib	/* interface fib */
 #define	ifr_vlan_pcp	ifr_ifru.ifru_vlan_pcp	/* VLAN priority */
 #define	ifr_lan_pcp	ifr_ifru.ifru_vlan_pcp	/* VLAN priority */
 };
 
 #define	_SIZEOF_ADDR_IFREQ(ifr) \
 	((ifr).ifr_addr.sa_len > sizeof(struct sockaddr) ? \
 	 (sizeof(struct ifreq) - sizeof(struct sockaddr) + \
 	  (ifr).ifr_addr.sa_len) : sizeof(struct ifreq))
 
 struct ifaliasreq {
 	char	ifra_name[IFNAMSIZ];		/* if name, e.g. "en0" */
 	struct	sockaddr ifra_addr;
 	struct	sockaddr ifra_broadaddr;
 	struct	sockaddr ifra_mask;
 	int	ifra_vhid;
 };
 
 /*
  * 9.x compat: the older form of struct ifaliasreq.  Same leading fields
  * as struct ifaliasreq, but without the trailing ifra_vhid member.
  */
 struct oifaliasreq {
 	char	ifra_name[IFNAMSIZ];		/* if name, e.g. "en0" */
 	struct	sockaddr ifra_addr;
 	struct	sockaddr ifra_broadaddr;
 	struct	sockaddr ifra_mask;
 };
 
 struct ifmediareq {
 	char	ifm_name[IFNAMSIZ];	/* if name, e.g. "en0" */
 	int	ifm_current;		/* current media options */
 	int	ifm_mask;		/* don't care mask */
 	int	ifm_status;		/* media status */
 	int	ifm_active;		/* active options */
 	int	ifm_count;		/* # entries in ifm_ulist array */
 	int	*ifm_ulist;		/* media words */
 };
 
 /*
  * Request structure that names an interface and carries a command
  * word together with a pointer/length pair.
  * NOTE(review): presumably the meaning of ifd_cmd and the layout of
  * the data behind ifd_data are defined by the individual driver --
  * confirm against the ioctls that take this structure.
  */
 struct  ifdrv {
 	char            ifd_name[IFNAMSIZ];     /* if name, e.g. "en0" */
 	unsigned long   ifd_cmd;		/* command word */
 	size_t          ifd_len;		/* presumably size of *ifd_data -- confirm */
 	void            *ifd_data;		/* buffer accompanying the command */
 };
 
 /* 
  * Structure used to retrieve aux status data from interfaces.
  * Kernel suppliers to this interface should respect the formatting
  * needed by ifconfig(8): each line starts with a TAB and ends with
  * a newline.  The canonical example to copy and paste is in if_tun.c.
  */
 
 #define	IFSTATMAX	800		/* 10 lines of text */
 struct ifstat {
 	char	ifs_name[IFNAMSIZ];	/* if name, e.g. "en0" */
 	char	ascii[IFSTATMAX + 1];
 };
 
 /*
  * Structure used in SIOCGIFCONF request.
  * Used to retrieve interface configuration
  * for machine (useful for programs which
  * must know all networks accessible).
  */
 struct	ifconf {
 	int	ifc_len;		/* size of associated buffer */
 	union {
 		caddr_t	ifcu_buf;
 		struct	ifreq *ifcu_req;
 	} ifc_ifcu;
 #define	ifc_buf	ifc_ifcu.ifcu_buf	/* buffer address */
 #define	ifc_req	ifc_ifcu.ifcu_req	/* array of structures returned */
 };
 
 /*
  * interface groups
  */
 
 #define	IFG_ALL		"all"		/* group contains all interfaces */
 /* XXX: will we implement this? */
 #define	IFG_EGRESS	"egress"	/* if(s) default route(s) point to */
 
 struct ifg_req {
 	union {
 		char			 ifgrqu_group[IFNAMSIZ];
 		char			 ifgrqu_member[IFNAMSIZ];
 	} ifgrq_ifgrqu;
 #define	ifgrq_group	ifgrq_ifgrqu.ifgrqu_group
 #define	ifgrq_member	ifgrq_ifgrqu.ifgrqu_member
 };
 
 /*
  * Used to lookup groups for an interface
  */
 struct ifgroupreq {
 	char	ifgr_name[IFNAMSIZ];
 	u_int	ifgr_len;
 	union {
 		char	ifgru_group[IFNAMSIZ];
 		struct	ifg_req *ifgru_groups;
 	} ifgr_ifgru;
 #ifndef _KERNEL
 #define ifgr_group	ifgr_ifgru.ifgru_group
 #define ifgr_groups	ifgr_ifgru.ifgru_groups
 #endif
 };
 
 /*
  * Structure used to request i2c data
  * from interface transceivers.
  */
 struct ifi2creq {
 	uint8_t dev_addr;	/* i2c address (0xA0, 0xA2) */
 	uint8_t offset;		/* read offset */
 	uint8_t len;		/* read length */
 	uint8_t spare0;
 	uint32_t spare1;
 	uint8_t data[8];	/* read buffer */
 }; 
 
 /*
  * RSS hash.
  */
 
 #define	RSS_FUNC_NONE		0		/* RSS disabled */
 #define	RSS_FUNC_PRIVATE	1		/* non-standard */
 #define	RSS_FUNC_TOEPLITZ	2
 
 #define	RSS_TYPE_IPV4		0x00000001
 #define	RSS_TYPE_TCP_IPV4	0x00000002
 #define	RSS_TYPE_IPV6		0x00000004
 #define	RSS_TYPE_IPV6_EX	0x00000008
 #define	RSS_TYPE_TCP_IPV6	0x00000010
 #define	RSS_TYPE_TCP_IPV6_EX	0x00000020
 #define	RSS_TYPE_UDP_IPV4	0x00000040
 #define	RSS_TYPE_UDP_IPV6	0x00000080
 #define	RSS_TYPE_UDP_IPV6_EX	0x00000100
 
 #define	RSS_KEYLEN		128
 
 struct ifrsskey {
 	char		ifrk_name[IFNAMSIZ];	/* if name, e.g. "en0" */
 	uint8_t		ifrk_func;		/* RSS_FUNC_ */
 	uint8_t		ifrk_spare0;
 	uint16_t	ifrk_keylen;
 	uint8_t		ifrk_key[RSS_KEYLEN];
 };
 
 struct ifrsshash {
 	char		ifrh_name[IFNAMSIZ];	/* if name, e.g. "en0" */
 	uint8_t		ifrh_func;		/* RSS_FUNC_ */
 	uint8_t		ifrh_spare0;
 	uint16_t	ifrh_spare1;
 	uint32_t	ifrh_types;		/* RSS_TYPE_ */
 };
 
 #define	IFNET_PCP_NONE	0xff	/* PCP disabled */
 
 /*
  * Request structure naming an interface and carrying a reason code, a
  * vendor value and a fixed-size message buffer of IFDR_MSG_SIZE bytes.
  * NOTE(review): presumably ifdr_reason holds one of the IFDR_REASON_*
  * values and selects whether ifdr_msg or ifdr_vendor is meaningful --
  * confirm against the code that fills this structure.
  */
 #define	IFDR_MSG_SIZE		64
 #define	IFDR_REASON_MSG		1
 #define	IFDR_REASON_VENDOR	2
 struct ifdownreason {
 	char		ifdr_name[IFNAMSIZ];	/* if name, e.g. "en0" */
 	uint32_t	ifdr_reason;
 	uint32_t	ifdr_vendor;
 	char		ifdr_msg[IFDR_MSG_SIZE];
 };
 
 #endif /* __BSD_VISIBLE */
 
 #ifdef _KERNEL
 #ifdef MALLOC_DECLARE
 MALLOC_DECLARE(M_IFADDR);
 MALLOC_DECLARE(M_IFMADDR);
 #endif
 #endif
 
 #ifndef _KERNEL
 struct if_nameindex {
 	unsigned int	if_index;	/* 1, 2, ... */
 	char		*if_name;	/* null terminated name: "le0", ... */
 };
 
 __BEGIN_DECLS
 void			 if_freenameindex(struct if_nameindex *);
 char			*if_indextoname(unsigned int, char *);
 struct if_nameindex	*if_nameindex(void);
 unsigned int		 if_nametoindex(const char *);
 __END_DECLS
 #endif
 #endif /* !_NET_IF_H_ */
diff --git a/sys/net/if_var.h b/sys/net/if_var.h
index 67d95747a778..65d6fb472d90 100644
--- a/sys/net/if_var.h
+++ b/sys/net/if_var.h
@@ -1,802 +1,813 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	From: @(#)if.h	8.1 (Berkeley) 6/10/93
  * $FreeBSD$
  */
 
 #ifndef	_NET_IF_VAR_H_
 #define	_NET_IF_VAR_H_
 
 /*
  * Structures defining a network interface, providing a packet
  * transport mechanism (ala level 0 of the PUP protocols).
  *
  * Each interface accepts output datagrams of a specified maximum
  * length, and provides higher level routines with input datagrams
  * received from its medium.
  *
  * Output occurs when the routine if_output is called, with three parameters:
  *	(*ifp->if_output)(ifp, m, dst, ro)
  * Here m is the mbuf chain to be sent and dst is the destination address.
  * The output routine encapsulates the supplied datagram if necessary,
  * and then transmits it on its medium.
  *
  * On input, each interface unwraps the data received by it, and either
  * places it on the input queue of an internetwork datagram routine
  * and posts the associated software interrupt, or passes the datagram to a raw
  * packet input routine.
  *
  * Routines exist for locating interfaces by their addresses
  * or for locating an interface on a certain network, as well as more general
  * routing and gateway routines maintaining information used to locate
  * interfaces.  These routines live in the files if.c and route.c
  */
 
 struct	rtentry;		/* ifa_rtrequest */
 struct	socket;
 struct	carp_if;
 struct	carp_softc;
 struct  ifvlantrunk;
 struct	route;			/* if_output */
 struct	vnet;
 struct	ifmedia;
 struct	netmap_adapter;
 struct	debugnet_methods;
 
 #ifdef _KERNEL
 #include <sys/_eventhandler.h>
 #include <sys/mbuf.h>		/* ifqueue only? */
 #include <sys/buf_ring.h>
 #include <net/vnet.h>
 #endif /* _KERNEL */
 #include <sys/ck.h>
 #include <sys/counter.h>
 #include <sys/epoch.h>
 #include <sys/lock.h>		/* XXX */
 #include <sys/mutex.h>		/* struct ifqueue */
 #include <sys/rwlock.h>		/* XXX */
 #include <sys/sx.h>		/* XXX */
 #include <sys/_task.h>		/* if_link_task */
 #define	IF_DUNIT_NONE	-1
 
 #include <net/altq/if_altq.h>
 
 CK_STAILQ_HEAD(ifnethead, ifnet);	/* we use TAILQs so that the order of */
 CK_STAILQ_HEAD(ifaddrhead, ifaddr);	/* instantiation is preserved in the list */
 CK_STAILQ_HEAD(ifmultihead, ifmultiaddr);
 CK_STAILQ_HEAD(ifgrouphead, ifg_group);
 
 #ifdef _KERNEL
 VNET_DECLARE(struct pfil_head *, link_pfil_head);
 #define	V_link_pfil_head	VNET(link_pfil_head)
 #define	PFIL_ETHER_NAME		"ethernet"
 
 #define	HHOOK_IPSEC_INET	0
 #define	HHOOK_IPSEC_INET6	1
 #define	HHOOK_IPSEC_COUNT	2
 VNET_DECLARE(struct hhook_head *, ipsec_hhh_in[HHOOK_IPSEC_COUNT]);
 VNET_DECLARE(struct hhook_head *, ipsec_hhh_out[HHOOK_IPSEC_COUNT]);
 #define	V_ipsec_hhh_in	VNET(ipsec_hhh_in)
 #define	V_ipsec_hhh_out	VNET(ipsec_hhh_out)
 #endif /* _KERNEL */
 
 typedef enum {
 	IFCOUNTER_IPACKETS = 0,
 	IFCOUNTER_IERRORS,
 	IFCOUNTER_OPACKETS,
 	IFCOUNTER_OERRORS,
 	IFCOUNTER_COLLISIONS,
 	IFCOUNTER_IBYTES,
 	IFCOUNTER_OBYTES,
 	IFCOUNTER_IMCASTS,
 	IFCOUNTER_OMCASTS,
 	IFCOUNTER_IQDROPS,
 	IFCOUNTER_OQDROPS,
 	IFCOUNTER_NOPROTO,
 	IFCOUNTERS /* Array size. */
 } ift_counter;
 
 typedef struct ifnet * if_t;
 
 typedef	void (*if_start_fn_t)(if_t);
 typedef	int (*if_ioctl_fn_t)(if_t, u_long, caddr_t);
 typedef	void (*if_init_fn_t)(void *);
 typedef void (*if_qflush_fn_t)(if_t);
 typedef int (*if_transmit_fn_t)(if_t, struct mbuf *);
 typedef	uint64_t (*if_get_counter_t)(if_t, ift_counter);
 
 struct ifnet_hw_tsomax {
 	u_int	tsomaxbytes;	/* TSO total burst length limit in bytes */
 	u_int	tsomaxsegcount;	/* TSO maximum segment count */
 	u_int	tsomaxsegsize;	/* TSO maximum segment size in bytes */
 };
 
 /* Interface encap request types */
 typedef enum {
 	IFENCAP_LL = 1			/* pre-calculate link-layer header */
 } ife_type;
 
 /*
  * The structure below allows to request various pre-calculated L2/L3 headers
  * for different media. Requests varies by type (rtype field).
  *
  * IFENCAP_LL type: pre-calculates link header based on address family
  *   and destination lladdr.
  *
  *   Input data fields:
  *     buf: pointer to destination buffer
  *     bufsize: buffer size
  *     flags: IFENCAP_FLAG_BROADCAST if destination is broadcast
  *     family: address family defined by AF_ constant.
  *     lladdr: pointer to link-layer address
  *     lladdr_len: length of link-layer address
  *     hdata: pointer to L3 header (optional, used for ARP requests).
  *   Output data fields:
  *     buf: encap data is stored here
  *     bufsize: resulting encap length is stored here
  *     lladdr_off: offset of link-layer address from encap hdr start
  *     hdata: L3 header may be altered if necessary
  */
 
 struct if_encap_req {
 	u_char		*buf;		/* Destination buffer (w) */
 	size_t		bufsize;	/* size of provided buffer (r) */
 	ife_type	rtype;		/* request type (r) */
 	uint32_t	flags;		/* Request flags (r) */
 	int		family;		/* Address family AF_* (r) */
 	int		lladdr_off;	/* offset from header start (w) */
 	int		lladdr_len;	/* lladdr length (r) */
 	char		*lladdr;	/* link-level address pointer (r) */
 	char		*hdata;		/* Upper layer header data (rw) */
 };
 
 #define	IFENCAP_FLAG_BROADCAST	0x02	/* Destination is broadcast */
 
 /*
  * Network interface send tag support. The storage of "struct
  * m_snd_tag" comes from the network driver and it is free to allocate
  * as much additional space as it wants for its own use.
  */
 struct ktls_session;
 struct m_snd_tag;
 
 #define	IF_SND_TAG_TYPE_RATE_LIMIT 0
 #define	IF_SND_TAG_TYPE_UNLIMITED 1
 #define	IF_SND_TAG_TYPE_TLS 2
-#define	IF_SND_TAG_TYPE_MAX 3
+#define	IF_SND_TAG_TYPE_TLS_RATE_LIMIT 3 /* presumably tls_rate_limit params; confirm */
+#define	IF_SND_TAG_TYPE_MAX 4	/* one past the highest type value above */
 
 struct if_snd_tag_alloc_header {
 	uint32_t type;		/* send tag type, see IF_SND_TAG_XXX */
 	uint32_t flowid;	/* mbuf hash value */
 	uint32_t flowtype;	/* mbuf hash type */
 	uint8_t numa_domain;	/* numa domain of associated inp */
 };
 
 struct if_snd_tag_alloc_rate_limit {
 	struct if_snd_tag_alloc_header hdr;
 	uint64_t max_rate;	/* in bytes/s */
 	uint32_t flags;		/* M_NOWAIT or M_WAITOK */
 	uint32_t reserved;	/* alignment */
 };
 
 struct if_snd_tag_alloc_tls {
 	struct if_snd_tag_alloc_header hdr;
 	struct inpcb *inp;
 	const struct ktls_session *tls;
 };
 
+struct if_snd_tag_alloc_tls_rate_limit { /* alloc params: tls members + max_rate */
+	struct if_snd_tag_alloc_header hdr;	/* header shared by all alloc params */
+	struct inpcb *inp;		/* same member as in if_snd_tag_alloc_tls */
+	const struct ktls_session *tls;	/* same member as in if_snd_tag_alloc_tls */
+	uint64_t max_rate;	/* in bytes/s */
+};
+
 struct if_snd_tag_rate_limit_params {
 	uint64_t max_rate;	/* in bytes/s */
 	uint32_t queue_level;	/* 0 (empty) .. 65535 (full) */
 #define	IF_SND_QUEUE_LEVEL_MIN 0
 #define	IF_SND_QUEUE_LEVEL_MAX 65535
 	uint32_t flags;		/* M_NOWAIT or M_WAITOK */
 };
 
 union if_snd_tag_alloc_params {
 	struct if_snd_tag_alloc_header hdr;
 	struct if_snd_tag_alloc_rate_limit rate_limit;
 	struct if_snd_tag_alloc_rate_limit unlimited;
 	struct if_snd_tag_alloc_tls tls;
+	struct if_snd_tag_alloc_tls_rate_limit tls_rate_limit;
 };
 
 union if_snd_tag_modify_params {
 	struct if_snd_tag_rate_limit_params rate_limit;
 	struct if_snd_tag_rate_limit_params unlimited;
+	struct if_snd_tag_rate_limit_params tls_rate_limit;
 };
 
 union if_snd_tag_query_params {
 	struct if_snd_tag_rate_limit_params rate_limit;
 	struct if_snd_tag_rate_limit_params unlimited;
+	struct if_snd_tag_rate_limit_params tls_rate_limit;
 };
 
 /* Query return flags */
 #define RT_NOSUPPORT	  0x00000000	/* Not supported */
 #define RT_IS_INDIRECT    0x00000001	/*
 					 * Interface like a lagg, select
 					 * the actual interface for
 					 * capabilities.
 					 */
 #define RT_IS_SELECTABLE  0x00000002	/*
 					 * No rate table, you select
 					 * rates and the first
 					 * number_of_rates are created.
 					 */
 #define RT_IS_FIXED_TABLE 0x00000004	/* A fixed table is attached */
 #define RT_IS_UNUSABLE	  0x00000008	/* It is not usable for this */
 #define RT_IS_SETUP_REQ	  0x00000010	/* The interface setup must be called before use */
 
 struct if_ratelimit_query_results {
 	const uint64_t *rate_table;	/* Pointer to table if present */
 	uint32_t flags;			/* Flags indicating results */
 	uint32_t max_flows;		/* Max flows using, 0=unlimited */
 	uint32_t number_of_rates;	/* How many unique rates can be created */
 	uint32_t min_segment_burst;	/* The amount the adapter bursts at each send */
 };
 
 typedef int (if_snd_tag_alloc_t)(struct ifnet *, union if_snd_tag_alloc_params *,
     struct m_snd_tag **);
 typedef int (if_snd_tag_modify_t)(struct m_snd_tag *, union if_snd_tag_modify_params *);
 typedef int (if_snd_tag_query_t)(struct m_snd_tag *, union if_snd_tag_query_params *);
 typedef void (if_snd_tag_free_t)(struct m_snd_tag *);
 typedef void (if_ratelimit_query_t)(struct ifnet *,
     struct if_ratelimit_query_results *);
 typedef int (if_ratelimit_setup_t)(struct ifnet *, uint64_t, uint32_t);
 
 /*
  * Structure defining a network interface.
  */
 struct ifnet {
 	/* General book keeping of interface lists. */
 	CK_STAILQ_ENTRY(ifnet) if_link; 	/* all struct ifnets are chained (CK_) */
 	LIST_ENTRY(ifnet) if_clones;	/* interfaces of a cloner */
 	CK_STAILQ_HEAD(, ifg_list) if_groups; /* linked list of groups per if (CK_) */
 					/* protected by if_addr_lock */
 	u_char	if_alloctype;		/* if_type at time of allocation */
 	uint8_t	if_numa_domain;		/* NUMA domain of device */
 	/* Driver and protocol specific information that remains stable. */
 	void	*if_softc;		/* pointer to driver state */
 	void	*if_llsoftc;		/* link layer softc */
 	void	*if_l2com;		/* pointer to protocol bits */
 	const char *if_dname;		/* driver name */
 	int	if_dunit;		/* unit or IF_DUNIT_NONE */
 	u_short	if_index;		/* numeric abbreviation for this if  */
 	short	if_index_reserved;	/* spare space to grow if_index */
 	char	if_xname[IFNAMSIZ];	/* external name (name + unit) */
 	char	*if_description;	/* interface description */
 
 	/* Variable fields that are touched by the stack and drivers. */
 	int	if_flags;		/* up/down, broadcast, etc. */
 	int	if_drv_flags;		/* driver-managed status flags */
 	int	if_capabilities;	/* interface features & capabilities */
 	int	if_capenable;		/* enabled features & capabilities */
 	void	*if_linkmib;		/* link-type-specific MIB data */
 	size_t	if_linkmiblen;		/* length of above data */
 	u_int	if_refcount;		/* reference count */
 
 	/* These fields are shared with struct if_data. */
 	uint8_t		if_type;	/* ethernet, tokenring, etc */
 	uint8_t		if_addrlen;	/* media address length */
 	uint8_t		if_hdrlen;	/* media header length */
 	uint8_t		if_link_state;	/* current link state */
 	uint32_t	if_mtu;		/* maximum transmission unit */
 	uint32_t	if_metric;	/* routing metric (external only) */
 	uint64_t	if_baudrate;	/* linespeed */
 	uint64_t	if_hwassist;	/* HW offload capabilities, see IFCAP */
 	time_t		if_epoch;	/* uptime at attach or stat reset */
 	struct timeval	if_lastchange;	/* time of last administrative change */
 
 	struct  ifaltq if_snd;		/* output queue (includes altq) */
 	struct	task if_linktask;	/* task for link change events */
 	struct	task if_addmultitask;	/* task for SIOCADDMULTI */
 
 	/* Addresses of different protocol families assigned to this if. */
 	struct mtx if_addr_lock;	/* lock to protect address lists */
 		/*
 		 * if_addrhead is the list of all addresses associated to
 		 * an interface.
 		 * Some code in the kernel assumes that first element
 		 * of the list has type AF_LINK, and contains sockaddr_dl
 		 * addresses which store the link-level address and the name
 		 * of the interface.
 		 * However, access to the AF_LINK address through this
 		 * field is deprecated. Use if_addr or ifaddr_byindex() instead.
 		 */
 	struct	ifaddrhead if_addrhead;	/* linked list of addresses per if */
 	struct	ifmultihead if_multiaddrs; /* multicast addresses configured */
 	int	if_amcount;		/* number of all-multicast requests */
 	struct	ifaddr	*if_addr;	/* pointer to link-level address */
 	void	*if_hw_addr;		/* hardware link-level address */
 	const u_int8_t *if_broadcastaddr; /* linklevel broadcast bytestring */
 	struct	mtx if_afdata_lock;
 	void	*if_afdata[AF_MAX];
 	int	if_afdata_initialized;
 
 	/* Additional features hung off the interface. */
 	u_int	if_fib;			/* interface FIB */
 	struct	vnet *if_vnet;		/* pointer to network stack instance */
 	struct	vnet *if_home_vnet;	/* where this ifnet originates from */
 	struct  ifvlantrunk *if_vlantrunk; /* pointer to 802.1q data */
 	struct	bpf_if *if_bpf;		/* packet filter structure */
 	int	if_pcount;		/* number of promiscuous listeners */
 	void	*if_bridge;		/* bridge glue */
 	void	*if_lagg;		/* lagg glue */
 	void	*if_pf_kif;		/* pf glue */
 	struct	carp_if *if_carp;	/* carp interface structure */
 	struct	label *if_label;	/* interface MAC label */
 	struct	netmap_adapter *if_netmap; /* netmap(4) softc */
 
 	/* Various procedures of the layer2 encapsulation and drivers. */
 	int	(*if_output)		/* output routine (enqueue) */
 		(struct ifnet *, struct mbuf *, const struct sockaddr *,
 		     struct route *);
 	void	(*if_input)		/* input routine (from h/w driver) */
 		(struct ifnet *, struct mbuf *);
 	struct mbuf *(*if_bridge_input)(struct ifnet *, struct mbuf *);
 	int	(*if_bridge_output)(struct ifnet *, struct mbuf *, struct sockaddr *,
 		    struct rtentry *);
 	void (*if_bridge_linkstate)(struct ifnet *ifp);
 	if_start_fn_t	if_start;	/* initiate output routine */
 	if_ioctl_fn_t	if_ioctl;	/* ioctl routine */
 	if_init_fn_t	if_init;	/* Init routine */
 	int	(*if_resolvemulti)	/* validate/resolve multicast */
 		(struct ifnet *, struct sockaddr **, struct sockaddr *);
 	if_qflush_fn_t	if_qflush;	/* flush any queue */
 	if_transmit_fn_t if_transmit;   /* initiate output routine */
 
 	void	(*if_reassign)		/* reassign to vnet routine */
 		(struct ifnet *, struct vnet *, char *);
 	if_get_counter_t if_get_counter; /* get counter values */
 	int	(*if_requestencap)	/* make link header from request */
 		(struct ifnet *, struct if_encap_req *);
 
 	/* Statistics. */
 	counter_u64_t	if_counters[IFCOUNTERS];
 
 	/* Stuff that's only temporary and doesn't belong here. */
 
 	/*
 	 * Network adapter TSO limits:
 	 * ===========================
 	 *
 	 * If the "if_hw_tsomax" field is zero the maximum segment
 	 * length limit does not apply. If the "if_hw_tsomaxsegcount"
 	 * or the "if_hw_tsomaxsegsize" field is zero the TSO segment
 	 * count limit does not apply. If all three fields are zero,
 	 * there is no TSO limit.
 	 *
 	 * NOTE: The TSO limits should reflect the values used in the
 	 * BUSDMA tag a network adapter is using to load a mbuf chain
 	 * for transmission. The TCP/IP network stack will subtract
 	 * space for all linklevel and protocol level headers and
 	 * ensure that the full mbuf chain passed to the network
 	 * adapter fits within the given limits.
 	 */
 	u_int	if_hw_tsomax;		/* TSO maximum size in bytes */
 	u_int	if_hw_tsomaxsegcount;	/* TSO maximum segment count */
 	u_int	if_hw_tsomaxsegsize;	/* TSO maximum segment size in bytes */
 
 	/*
 	 * Network adapter send tag support:
 	 */
 	if_snd_tag_alloc_t *if_snd_tag_alloc;
 	if_snd_tag_modify_t *if_snd_tag_modify;
 	if_snd_tag_query_t *if_snd_tag_query;
 	if_snd_tag_free_t *if_snd_tag_free;
 	if_ratelimit_query_t *if_ratelimit_query;
 	if_ratelimit_setup_t *if_ratelimit_setup;
 
 	/* Ethernet PCP */
 	uint8_t if_pcp;
 
 	/*
 	 * Debugnet (Netdump) hooks to be called while in db/panic.
 	 */
 	struct debugnet_methods *if_debugnet_methods;
 	struct epoch_context	if_epoch_ctx;
 
 	/*
 	 * Spare fields to be added before branching a stable branch, so
 	 * that structure can be enhanced without changing the kernel
 	 * binary interface.
 	 */
 	int	if_ispare[4];		/* general use */
 };
 
 /* for compatibility with other BSDs */
 #define	if_name(ifp)	((ifp)->if_xname)
 
 #define	IF_NODOM	255
 /*
  * Locks for address lists on the network interface.
  */
 #define	IF_ADDR_LOCK_INIT(if)	mtx_init(&(if)->if_addr_lock, "if_addr_lock", NULL, MTX_DEF)
 #define	IF_ADDR_LOCK_DESTROY(if)	mtx_destroy(&(if)->if_addr_lock)
 
 #define	IF_ADDR_WLOCK(if)	mtx_lock(&(if)->if_addr_lock)
 #define	IF_ADDR_WUNLOCK(if)	mtx_unlock(&(if)->if_addr_lock)
 #define	IF_ADDR_LOCK_ASSERT(if)	MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(if)->if_addr_lock))
 #define	IF_ADDR_WLOCK_ASSERT(if) mtx_assert(&(if)->if_addr_lock, MA_OWNED)
 
 #ifdef _KERNEL
 /* interface link layer address change event */
 typedef void (*iflladdr_event_handler_t)(void *, struct ifnet *);
 EVENTHANDLER_DECLARE(iflladdr_event, iflladdr_event_handler_t);
 /* interface address change event */
 typedef void (*ifaddr_event_handler_t)(void *, struct ifnet *);
 EVENTHANDLER_DECLARE(ifaddr_event, ifaddr_event_handler_t);
 typedef void (*ifaddr_event_ext_handler_t)(void *, struct ifnet *,
     struct ifaddr *, int);
 EVENTHANDLER_DECLARE(ifaddr_event_ext, ifaddr_event_ext_handler_t);
 #define	IFADDR_EVENT_ADD	0
 #define	IFADDR_EVENT_DEL	1
 /* new interface arrival event */
 typedef void (*ifnet_arrival_event_handler_t)(void *, struct ifnet *);
 EVENTHANDLER_DECLARE(ifnet_arrival_event, ifnet_arrival_event_handler_t);
 /* interface departure event */
 typedef void (*ifnet_departure_event_handler_t)(void *, struct ifnet *);
 EVENTHANDLER_DECLARE(ifnet_departure_event, ifnet_departure_event_handler_t);
 /* Interface link state change event */
 typedef void (*ifnet_link_event_handler_t)(void *, struct ifnet *, int);
 EVENTHANDLER_DECLARE(ifnet_link_event, ifnet_link_event_handler_t);
 /* Interface up/down event */
 #define IFNET_EVENT_UP		0
 #define IFNET_EVENT_DOWN	1
 #define IFNET_EVENT_PCP		2	/* priority code point, PCP */
 
 typedef void (*ifnet_event_fn)(void *, struct ifnet *ifp, int event);
 EVENTHANDLER_DECLARE(ifnet_event, ifnet_event_fn);
 
 /*
  * interface groups
  */
 struct ifg_group {
 	char				 ifg_group[IFNAMSIZ];
 	u_int				 ifg_refcnt;
 	void				*ifg_pf_kif;
 	CK_STAILQ_HEAD(, ifg_member)	 ifg_members; /* (CK_) */
 	CK_STAILQ_ENTRY(ifg_group)		 ifg_next; /* (CK_) */
 };
 
 struct ifg_member {
 	CK_STAILQ_ENTRY(ifg_member)	 ifgm_next; /* (CK_) */
 	struct ifnet		*ifgm_ifp;
 };
 
 struct ifg_list {
 	struct ifg_group	*ifgl_group;
 	CK_STAILQ_ENTRY(ifg_list)	 ifgl_next; /* (CK_) */
 };
 
 #ifdef _SYS_EVENTHANDLER_H_
 /* group attach event */
 typedef void (*group_attach_event_handler_t)(void *, struct ifg_group *);
 EVENTHANDLER_DECLARE(group_attach_event, group_attach_event_handler_t);
 /* group detach event */
 typedef void (*group_detach_event_handler_t)(void *, struct ifg_group *);
 EVENTHANDLER_DECLARE(group_detach_event, group_detach_event_handler_t);
 /* group change event */
 typedef void (*group_change_event_handler_t)(void *, const char *);
 EVENTHANDLER_DECLARE(group_change_event, group_change_event_handler_t);
 #endif /* _SYS_EVENTHANDLER_H_ */
 
 #define	IF_AFDATA_LOCK_INIT(ifp)	\
 	mtx_init(&(ifp)->if_afdata_lock, "if_afdata", NULL, MTX_DEF)
 
 #define	IF_AFDATA_WLOCK(ifp)	mtx_lock(&(ifp)->if_afdata_lock)
 #define	IF_AFDATA_WUNLOCK(ifp)	mtx_unlock(&(ifp)->if_afdata_lock)
 #define	IF_AFDATA_LOCK(ifp)	IF_AFDATA_WLOCK(ifp)
 #define	IF_AFDATA_UNLOCK(ifp)	IF_AFDATA_WUNLOCK(ifp)
 #define	IF_AFDATA_TRYLOCK(ifp)	mtx_trylock(&(ifp)->if_afdata_lock)
 #define	IF_AFDATA_DESTROY(ifp)	mtx_destroy(&(ifp)->if_afdata_lock)
 
 #define	IF_AFDATA_LOCK_ASSERT(ifp)	MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(ifp)->if_afdata_lock))
 #define	IF_AFDATA_WLOCK_ASSERT(ifp)	mtx_assert(&(ifp)->if_afdata_lock, MA_OWNED)
 #define	IF_AFDATA_UNLOCK_ASSERT(ifp)	mtx_assert(&(ifp)->if_afdata_lock, MA_NOTOWNED)
 
 /*
  * 72 was chosen below because it is the size of a TCP/IP
  * header (40) + the minimum mss (32).
  */
 #define	IF_MINMTU	72
 #define	IF_MAXMTU	65535
 
 #define	TOEDEV(ifp)	((ifp)->if_llsoftc)
 
 /*
  * The ifaddr structure contains information about one address
  * of an interface.  They are maintained by the different address families,
  * are allocated and attached when an address is set, and are linked
  * together so all addresses for an interface can be located.
  *
  * NOTE: a 'struct ifaddr' is always at the beginning of a larger
  * chunk of malloc'ed memory, where we store the three addresses
  * (ifa_addr, ifa_dstaddr and ifa_netmask) referenced here.
  */
 struct ifaddr {
 	struct	sockaddr *ifa_addr;	/* address of interface */
 	struct	sockaddr *ifa_dstaddr;	/* other end of p-to-p link */
 #define	ifa_broadaddr	ifa_dstaddr	/* broadcast address interface */
 	struct	sockaddr *ifa_netmask;	/* used to determine subnet */
 	struct	ifnet *ifa_ifp;		/* back-pointer to interface */
 	struct	carp_softc *ifa_carp;	/* pointer to CARP data */
 	CK_STAILQ_ENTRY(ifaddr) ifa_link;	/* queue macro glue */
 	u_short	ifa_flags;		/* mostly rt_flags for cloning */
 #define	IFA_ROUTE	RTF_UP		/* route installed */
 #define	IFA_RTSELF	RTF_HOST	/* loopback route to self installed */
 	u_int	ifa_refcnt;		/* references to this structure */
 
 	counter_u64_t	ifa_ipackets;
 	counter_u64_t	ifa_opackets;
 	counter_u64_t	ifa_ibytes;
 	counter_u64_t	ifa_obytes;
 	struct	epoch_context	ifa_epoch_ctx;
 };
 
 struct ifaddr *	ifa_alloc(size_t size, int flags);
 void	ifa_free(struct ifaddr *ifa);
 void	ifa_ref(struct ifaddr *ifa);
 
 /*
  * Multicast address structure.  This is analogous to the ifaddr
  * structure except that it keeps track of multicast addresses.
  */
 #define IFMA_F_ENQUEUED		0x1
 struct ifmultiaddr {
 	CK_STAILQ_ENTRY(ifmultiaddr) ifma_link; /* queue macro glue */
 	struct	sockaddr *ifma_addr; 	/* address this membership is for */
 	struct	sockaddr *ifma_lladdr;	/* link-layer translation, if any */
 	struct	ifnet *ifma_ifp;	/* back-pointer to interface */
 	u_int	ifma_refcount;		/* reference count */
 	int	ifma_flags;
 	void	*ifma_protospec;	/* protocol-specific state, if any */
 	struct	ifmultiaddr *ifma_llifma; /* pointer to ifma for ifma_lladdr */
 	struct	epoch_context	ifma_epoch_ctx;
 };
 
 extern	struct rwlock ifnet_rwlock;
 extern	struct sx ifnet_sxlock;
 
 #define	IFNET_WLOCK() do {						\
 	sx_xlock(&ifnet_sxlock);					\
 	rw_wlock(&ifnet_rwlock);					\
 } while (0)
 
 #define	IFNET_WUNLOCK() do {						\
 	rw_wunlock(&ifnet_rwlock);					\
 	sx_xunlock(&ifnet_sxlock);					\
 } while (0)
 
 /*
  * To assert the ifnet lock, you must know not only whether it's for read or
  * write, but also whether it was acquired with sleep support or not.
  */
 #define	IFNET_RLOCK_ASSERT()		sx_assert(&ifnet_sxlock, SA_SLOCKED)
 #define	IFNET_WLOCK_ASSERT() do {					\
 	sx_assert(&ifnet_sxlock, SA_XLOCKED);				\
 	rw_assert(&ifnet_rwlock, RA_WLOCKED);				\
 } while (0)
 
 #define	IFNET_RLOCK()		sx_slock(&ifnet_sxlock)
 #define	IFNET_RUNLOCK()		sx_sunlock(&ifnet_sxlock)
 
 /*
  * Look up an ifnet given its index; the _ref variant also acquires a
  * reference that must be freed using if_rele().  It is almost always a bug
  * to call ifnet_byindex() instead of ifnet_byindex_ref().
  */
 struct ifnet	*ifnet_byindex(u_short idx);
 struct ifnet	*ifnet_byindex_ref(u_short idx);
 
 /*
  * Given the index, ifaddr_byindex() returns the one and only
  * link-level ifaddr for the interface. You are not supposed to use
  * it to traverse the list of addresses associated to the interface.
  */
 struct ifaddr	*ifaddr_byindex(u_short idx);
 
 VNET_DECLARE(struct ifnethead, ifnet);
 VNET_DECLARE(struct ifgrouphead, ifg_head);
 VNET_DECLARE(int, if_index);
 VNET_DECLARE(struct ifnet *, loif);	/* first loopback interface */
 
 #define	V_ifnet		VNET(ifnet)
 #define	V_ifg_head	VNET(ifg_head)
 #define	V_if_index	VNET(if_index)
 #define	V_loif		VNET(loif)
 
 #ifdef MCAST_VERBOSE
 #define MCDPRINTF printf
 #else
 #define MCDPRINTF(...)
 #endif
 
 int	if_addgroup(struct ifnet *, const char *);
 int	if_delgroup(struct ifnet *, const char *);
 int	if_addmulti(struct ifnet *, struct sockaddr *, struct ifmultiaddr **);
 int	if_allmulti(struct ifnet *, int);
 struct	ifnet* if_alloc(u_char);
 struct	ifnet* if_alloc_dev(u_char, device_t dev);
 struct	ifnet* if_alloc_domain(u_char, int numa_domain);
 void	if_attach(struct ifnet *);
 void	if_dead(struct ifnet *);
 int	if_delmulti(struct ifnet *, struct sockaddr *);
 void	if_delmulti_ifma(struct ifmultiaddr *);
 void	if_delmulti_ifma_flags(struct ifmultiaddr *, int flags);
 void	if_detach(struct ifnet *);
 void	if_purgeaddrs(struct ifnet *);
 void	if_delallmulti(struct ifnet *);
 void	if_down(struct ifnet *);
 struct ifmultiaddr *
 	if_findmulti(struct ifnet *, const struct sockaddr *);
 void	if_freemulti(struct ifmultiaddr *ifma);
 void	if_free(struct ifnet *);
 void	if_initname(struct ifnet *, const char *, int);
 void	if_link_state_change(struct ifnet *, int);
 int	if_printf(struct ifnet *, const char *, ...) __printflike(2, 3);
 void	if_ref(struct ifnet *);
 void	if_rele(struct ifnet *);
 int	if_setlladdr(struct ifnet *, const u_char *, int);
 int	if_tunnel_check_nesting(struct ifnet *, struct mbuf *, uint32_t, int);
 void	if_up(struct ifnet *);
 int	ifioctl(struct socket *, u_long, caddr_t, struct thread *);
 int	ifpromisc(struct ifnet *, int);
 struct	ifnet *ifunit(const char *);
 struct	ifnet *ifunit_ref(const char *);
 
 int	ifa_add_loopback_route(struct ifaddr *, struct sockaddr *);
 int	ifa_del_loopback_route(struct ifaddr *, struct sockaddr *);
 int	ifa_switch_loopback_route(struct ifaddr *, struct sockaddr *);
 
 struct	ifaddr *ifa_ifwithaddr(const struct sockaddr *);
 int		ifa_ifwithaddr_check(const struct sockaddr *);
 struct	ifaddr *ifa_ifwithbroadaddr(const struct sockaddr *, int);
 struct	ifaddr *ifa_ifwithdstaddr(const struct sockaddr *, int);
 struct	ifaddr *ifa_ifwithnet(const struct sockaddr *, int, int);
 struct	ifaddr *ifa_ifwithroute(int, const struct sockaddr *,
     const struct sockaddr *, u_int);
 struct	ifaddr *ifaof_ifpforaddr(const struct sockaddr *, struct ifnet *);
 int	ifa_preferred(struct ifaddr *, struct ifaddr *);
 
 int	if_simloop(struct ifnet *ifp, struct mbuf *m, int af, int hlen);
 
 typedef	void *if_com_alloc_t(u_char type, struct ifnet *ifp);
 typedef	void if_com_free_t(void *com, u_char type);
 void	if_register_com_alloc(u_char type, if_com_alloc_t *a, if_com_free_t *f);
 void	if_deregister_com_alloc(u_char type);
 void	if_data_copy(struct ifnet *, struct if_data *);
 uint64_t if_get_counter_default(struct ifnet *, ift_counter);
 void	if_inc_counter(struct ifnet *, ift_counter, int64_t);
 
 #define IF_LLADDR(ifp)							\
     LLADDR((struct sockaddr_dl *)((ifp)->if_addr->ifa_addr))
 
 uint64_t if_setbaudrate(if_t ifp, uint64_t baudrate);
 uint64_t if_getbaudrate(if_t ifp);
 int if_setcapabilities(if_t ifp, int capabilities);
 int if_setcapabilitiesbit(if_t ifp, int setbit, int clearbit);
 int if_getcapabilities(if_t ifp);
 int if_togglecapenable(if_t ifp, int togglecap);
 int if_setcapenable(if_t ifp, int capenable);
 int if_setcapenablebit(if_t ifp, int setcap, int clearcap);
 int if_getcapenable(if_t ifp);
 const char *if_getdname(if_t ifp);
 int if_setdev(if_t ifp, void *dev);
 int if_setdrvflagbits(if_t ifp, int if_setflags, int clear_flags);
 int if_getdrvflags(if_t ifp);
 int if_setdrvflags(if_t ifp, int flags);
 int if_clearhwassist(if_t ifp);
 int if_sethwassistbits(if_t ifp, int toset, int toclear);
 int if_sethwassist(if_t ifp, int hwassist_bit);
 int if_gethwassist(if_t ifp);
 int if_setsoftc(if_t ifp, void *softc);
 void *if_getsoftc(if_t ifp);
 int if_setflags(if_t ifp, int flags);
 int if_gethwaddr(if_t ifp, struct ifreq *);
 int if_setmtu(if_t ifp, int mtu);
 int if_getmtu(if_t ifp);
 int if_getmtu_family(if_t ifp, int family);
 int if_setflagbits(if_t ifp, int set, int clear);
 int if_getflags(if_t ifp);
 int if_sendq_empty(if_t ifp);
 int if_setsendqready(if_t ifp);
 int if_setsendqlen(if_t ifp, int tx_desc_count);
 int if_sethwtsomax(if_t ifp, u_int if_hw_tsomax);
 int if_sethwtsomaxsegcount(if_t ifp, u_int if_hw_tsomaxsegcount);
 int if_sethwtsomaxsegsize(if_t ifp, u_int if_hw_tsomaxsegsize);
 u_int if_gethwtsomax(if_t ifp);
 u_int if_gethwtsomaxsegcount(if_t ifp);
 u_int if_gethwtsomaxsegsize(if_t ifp);
 int if_input(if_t ifp, struct mbuf* sendmp);
 int if_sendq_prepend(if_t ifp, struct mbuf *m);
 struct mbuf *if_dequeue(if_t ifp);
 int if_setifheaderlen(if_t ifp, int len);
 void if_setrcvif(struct mbuf *m, if_t ifp);
 void if_setvtag(struct mbuf *m, u_int16_t tag);
 u_int16_t if_getvtag(struct mbuf *m);
 int if_vlantrunkinuse(if_t ifp);
 caddr_t if_getlladdr(if_t ifp);
 void *if_gethandle(u_char);
 void if_bpfmtap(if_t ifp, struct mbuf *m);
 void if_etherbpfmtap(if_t ifp, struct mbuf *m);
 void if_vlancap(if_t ifp);
 
 /*
  * Traversing through interface address lists.
  */
 struct sockaddr_dl;
 typedef u_int iflladdr_cb_t(void *, struct sockaddr_dl *, u_int);
 u_int if_foreach_lladdr(if_t, iflladdr_cb_t, void *);
 u_int if_foreach_llmaddr(if_t, iflladdr_cb_t, void *);
 u_int if_lladdr_count(if_t);
 u_int if_llmaddr_count(if_t);
 
 int if_getamcount(if_t ifp);
 struct ifaddr * if_getifaddr(if_t ifp);
 
 /* Functions */
 void if_setinitfn(if_t ifp, void (*)(void *));
 void if_setioctlfn(if_t ifp, int (*)(if_t, u_long, caddr_t));
 void if_setstartfn(if_t ifp, void (*)(if_t));
 void if_settransmitfn(if_t ifp, if_transmit_fn_t);
 void if_setqflushfn(if_t ifp, if_qflush_fn_t);
 void if_setgetcounterfn(if_t ifp, if_get_counter_t);
 
 /* Revisit the below. These are inline functions originally */
 int drbr_inuse_drv(if_t ifp, struct buf_ring *br);
 struct mbuf* drbr_dequeue_drv(if_t ifp, struct buf_ring *br);
 int drbr_needs_enqueue_drv(if_t ifp, struct buf_ring *br);
 int drbr_enqueue_drv(if_t ifp, struct buf_ring *br, struct mbuf *m);
 
 /* TSO */
 void if_hw_tsomax_common(if_t ifp, struct ifnet_hw_tsomax *);
 int if_hw_tsomax_update(if_t ifp, struct ifnet_hw_tsomax *);
 
 /* accessors for struct ifreq */
 void *ifr_data_get_ptr(void *ifrp);
 void *ifr_buffer_get_buffer(void *data);
 size_t ifr_buffer_get_length(void *data);
 
 int ifhwioctl(u_long, struct ifnet *, caddr_t, struct thread *);
 
 #ifdef DEVICE_POLLING
 enum poll_cmd { POLL_ONLY, POLL_AND_CHECK_STATUS };
 
 typedef	int poll_handler_t(if_t ifp, enum poll_cmd cmd, int count);
 int    ether_poll_register(poll_handler_t *h, if_t ifp);
 int    ether_poll_deregister(if_t ifp);
 #endif /* DEVICE_POLLING */
 
 #endif /* _KERNEL */
 
 #include <net/ifq.h>	/* XXXAO: temporary unconditional include */
 
 #endif /* !_NET_IF_VAR_H_ */
diff --git a/sys/net/if_vlan.c b/sys/net/if_vlan.c
index 192dba7eed82..920c65323ae7 100644
--- a/sys/net/if_vlan.c
+++ b/sys/net/if_vlan.c
@@ -1,2105 +1,2105 @@
 /*-
  * Copyright 1998 Massachusetts Institute of Technology
  * Copyright 2012 ADARA Networks, Inc.
  * Copyright 2017 Dell EMC Isilon
  *
  * Portions of this software were developed by Robert N. M. Watson under
  * contract to ADARA Networks, Inc.
  *
  * Permission to use, copy, modify, and distribute this software and
  * its documentation for any purpose and without fee is hereby
  * granted, provided that both the above copyright notice and this
  * permission notice appear in all copies, that both the above
  * copyright notice and this permission notice appear in all
  * supporting documentation, and that the name of M.I.T. not be used
  * in advertising or publicity pertaining to distribution of the
  * software without specific, written prior permission.  M.I.T. makes
  * no representations about the suitability of this software for any
  * purpose.  It is provided "as is" without express or implied
  * warranty.
  *
  * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''.  M.I.T. DISCLAIMS
  * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE,
  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT
  * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * if_vlan.c - pseudo-device driver for IEEE 802.1Q virtual LANs.
  * This is sort of sneaky in the implementation, since
  * we need to pretend to be enough of an Ethernet implementation
  * to make arp work.  The way we do this is by telling everyone
  * that we are an Ethernet, and then catch the packets that
  * ether_output() sends to us via if_transmit(), rewrite them for
  * use by the real outgoing interface, and ask it to send them.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_kern_tls.h"
 #include "opt_vlan.h"
 #include "opt_ratelimit.h"
 
 #include <sys/param.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/rmlock.h>
 #include <sys/priv.h>
 #include <sys/queue.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/sx.h>
 #include <sys/taskqueue.h>
 
 #include <net/bpf.h>
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_clone.h>
 #include <net/if_dl.h>
 #include <net/if_types.h>
 #include <net/if_vlan_var.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #ifdef INET
 #include <netinet/in.h>
 #include <netinet/if_ether.h>
 #endif
 
 #ifdef INET6
 /*
  * XXX: declare here to avoid to include many inet6 related files..
  * should be more generalized?
  */
 extern void	nd6_setmtu(struct ifnet *);
 #endif
 
 /* Default hash width, i.e. log2 of the initial number of hash buckets. */
 #define	VLAN_DEF_HWIDTH	4
 /* Interface flags a freshly created vlan(4) interface starts with. */
 #define	VLAN_IFFLAGS	(IFF_BROADCAST | IFF_MULTICAST)
 
 /* True when the interface has IFF_UP set and its driver is running. */
 #define	UP_AND_RUNNING(ifp)					\
     ((((ifp)->if_flags & IFF_UP) != 0) &&			\
     (((ifp)->if_drv_flags & IFF_DRV_RUNNING) != 0))
 
 CK_SLIST_HEAD(ifvlanhead, ifvlan);
 
 /*
  * State shared by all vlans configured on one parent interface.  It holds
  * the VID lookup table (a static array when VLAN_ARRAY is defined, a chained
  * hash otherwise) and the number of vlans currently registered in it.
  */
 struct ifvlantrunk {
 	struct	ifnet   *parent;	/* parent interface of this trunk */
 	struct	mtx	lock;		/* per-trunk mutex, see TRUNK_WLOCK() */
 #ifdef VLAN_ARRAY
 #define	VLAN_ARRAY_SIZE	(EVL_VLID_MASK + 1)
 	struct	ifvlan	*vlans[VLAN_ARRAY_SIZE]; /* static table */
 #else
 	struct	ifvlanhead *hash;	/* dynamic hash-list table */
 	uint16_t	hmask;		/* bucket index mask, (1 << hwidth) - 1 */
 	uint16_t	hwidth;		/* log2 of the number of buckets */
 #endif
 	int		refcnt;		/* vlans in the table, see vlan_inshash() */
 };
 
 #if defined(KERN_TLS) || defined(RATELIMIT)
 /*
  * Send tag of a vlan interface.  It wraps a second send tag: vlan_transmit()
  * checks that the wrapped tag belongs to the parent interface and swaps it
  * into the mbuf before the packet is handed to the parent.
  */
 struct vlan_snd_tag {
 	struct m_snd_tag com;	/* tag embedded for the vlan interface */
 	struct m_snd_tag *tag;	/* wrapped tag, referenced on transmit */
 };
 
 /* Convert a pointer to the embedded 'com' member back to its container. */
 static inline struct vlan_snd_tag *
 mst_to_vst(struct m_snd_tag *mst)
 {
 
 	return (__containerof(mst, struct vlan_snd_tag, com));
 }
 #endif
 
 /*
  * This macro provides a facility to iterate over every vlan on a trunk with
  * the assumption that none will be added/removed during iteration.
  *
  * The expansion starts with local variable declarations (_i, plus _next in
  * the hash case) followed by the loop header.  It can therefore appear at
  * most once per scope and must not be used as the unbraced body of another
  * statement.  The statement that follows the macro is the loop body; it
  * runs once per vlan with _ifv pointing at that vlan.
  */
 #ifdef VLAN_ARRAY
 #define VLAN_FOREACH(_ifv, _trunk) \
 	size_t _i; \
 	for (_i = 0; _i < VLAN_ARRAY_SIZE; _i++) \
 		if (((_ifv) = (_trunk)->vlans[_i]) != NULL)
 #else /* VLAN_ARRAY */
 #define VLAN_FOREACH(_ifv, _trunk) \
 	struct ifvlan *_next; \
 	size_t _i; \
 	for (_i = 0; _i < (1 << (_trunk)->hwidth); _i++) \
 		CK_SLIST_FOREACH_SAFE((_ifv), &(_trunk)->hash[_i], ifv_list, _next)
 #endif /* VLAN_ARRAY */
 
 /*
  * This macro provides a facility to iterate over every vlan on a trunk while
  * also modifying the number of vlans on the trunk. The iteration continues
  * until some condition is met or there are no more vlans on the trunk.
  *
  * _cond is evaluated before every step and before _trunk is dereferenced,
  * so it can be used to stop once the trunk itself has gone away.  Like
  * VLAN_FOREACH, the expansion begins with local declarations (_i, plus
  * _touch in the hash case) and can be used only once per scope.
  */
 #ifdef VLAN_ARRAY
 /* The VLAN_ARRAY case is simple -- just a for loop using the condition. */
 #define VLAN_FOREACH_UNTIL_SAFE(_ifv, _trunk, _cond) \
 	size_t _i; \
 	for (_i = 0; !(_cond) && _i < VLAN_ARRAY_SIZE; _i++) \
 		if (((_ifv) = (_trunk)->vlans[_i]))
 #else /* VLAN_ARRAY */
 /*
  * The hash table case is more complicated. We allow for the hash table to be
  * modified (i.e. vlans removed) while we are iterating over it. To allow for
  * this we must restart the iteration every time we "touch" something during
  * the iteration, since removal will resize the hash table and invalidate our
  * current position. If acting on the touched element causes the trunk to be
  * emptied, then iteration also stops.
  *
  * Only the first element of each bucket is ever visited: the body is
  * expected to remove it, after which the scan restarts from bucket 0.
  */
 #define VLAN_FOREACH_UNTIL_SAFE(_ifv, _trunk, _cond) \
 	size_t _i; \
 	bool _touch = false; \
 	for (_i = 0; \
 	    !(_cond) && _i < (1 << (_trunk)->hwidth); \
 	    _i = (_touch && ((_trunk) != NULL) ? 0 : _i + 1), _touch = false) \
 		if (((_ifv) = CK_SLIST_FIRST(&(_trunk)->hash[_i])) != NULL && \
 		    (_touch = true))
 #endif /* VLAN_ARRAY */
 
 /*
  * One link-level multicast address of a vlan that is mirrored onto the
  * parent interface; see vlan_setmulti().
  */
 struct vlan_mc_entry {
 	struct sockaddr_dl		mc_addr;	/* address added on the parent */
 	CK_SLIST_ENTRY(vlan_mc_entry)	mc_entries;	/* vlan_mc_listhead linkage */
 	struct epoch_context		mc_epoch_ctx;	/* deferred free, vlan_mc_free() */
 };
 
 /* Software state (if_softc) of one vlan(4) interface. */
 struct ifvlan {
 	struct	ifvlantrunk *ifv_trunk;	/* trunk we are attached to, or NULL */
 	struct	ifnet *ifv_ifp;		/* our own ifnet */
 #define	TRUNK(ifv)	((ifv)->ifv_trunk)
 #define	PARENT(ifv)	(TRUNK(ifv)->parent)
 	void	*ifv_cookie;	/* driver cookie, see vlan_setcookie() */
 	int	ifv_pflags;	/* special flags we have set on parent */
 	int	ifv_capenable;
 	int	ifv_encaplen;	/* encapsulation length */
 	int	ifv_mtufudge;	/* MTU fudged by this much */
 	int	ifv_mintu;	/* min transmission unit */
 	struct  ether_8021q_tag ifv_qtag;	/* tag: protocol, VID and PCP */
 #define ifv_proto	ifv_qtag.proto
 #define ifv_vid		ifv_qtag.vid
 #define ifv_pcp		ifv_qtag.pcp
 	struct task lladdr_task;	/* enqueued by vlan_iflladdr() */
 	/* Multicast addresses mirrored to the parent, see vlan_setmulti(). */
 	CK_SLIST_HEAD(, vlan_mc_entry) vlan_mc_listhead;
 #ifndef VLAN_ARRAY
 	CK_SLIST_ENTRY(ifvlan) ifv_list;	/* hash bucket linkage */
 #endif
 };
 
 /*
  * Special flags we should propagate to parent.  Each entry pairs an
  * interface flag with a function of type int (*)(struct ifnet *, int);
  * the table is terminated by a {0, NULL} entry.
  */
 static struct {
 	int flag;
 	int (*func)(struct ifnet *, int);
 } vlan_pflags[] = {
 	{IFF_PROMISC, ifpromisc},
 	{IFF_ALLMULTI, if_allmulti},
 	{0, NULL}
 };
 
 extern int vlan_mtag_pcp;
 
 static const char vlanname[] = "vlan";
 static MALLOC_DEFINE(M_VLAN, vlanname, "802.1Q Virtual LAN Interface");
 
 static eventhandler_tag ifdetach_tag;
 static eventhandler_tag iflladdr_tag;
 
 /*
  * if_vlan uses two module-level synchronization primitives to allow concurrent
  * modification of vlan interfaces and (mostly) allow for vlans to be destroyed
  * while they are being used for tx/rx. To accomplish this in a way that has
  * acceptable performance and cooperation with other parts of the network stack
  * there is a non-sleepable epoch(9) and an sx(9).
  *
  * The performance-sensitive paths that warrant using the epoch(9) are
  * vlan_transmit and vlan_input. Both have to check for the vlan interface's
  * existence using if_vlantrunk, and being in the network tx/rx paths the use
  * of an epoch(9) gives a measurable improvement in performance.
  *
  * The reason for having an sx(9) is mostly because there are still areas that
  * must be sleepable and also have safe concurrent access to a vlan interface.
  * Since the sx(9) exists, it is used by default in most paths unless sleeping
  * is not permitted, or if it is not clear whether sleeping is permitted.
  */
 #define _VLAN_SX_ID ifv_sx
 
 static struct sx _VLAN_SX_ID;
 
 /* Create / destroy the global sx lock; it is initialized with SX_RECURSE. */
 #define VLAN_LOCKING_INIT() \
 	sx_init_flags(&_VLAN_SX_ID, "vlan_sx", SX_RECURSE)
 
 #define VLAN_LOCKING_DESTROY() \
 	sx_destroy(&_VLAN_SX_ID)
 
 /* Shared (S) and exclusive (X) acquisition of the global sx lock. */
 #define	VLAN_SLOCK()			sx_slock(&_VLAN_SX_ID)
 #define	VLAN_SUNLOCK()			sx_sunlock(&_VLAN_SX_ID)
 #define	VLAN_XLOCK()			sx_xlock(&_VLAN_SX_ID)
 #define	VLAN_XUNLOCK()			sx_xunlock(&_VLAN_SX_ID)
 /* Assert the lock is held shared, exclusively, or in either mode. */
 #define	VLAN_SLOCK_ASSERT()		sx_assert(&_VLAN_SX_ID, SA_SLOCKED)
 #define	VLAN_XLOCK_ASSERT()		sx_assert(&_VLAN_SX_ID, SA_XLOCKED)
 #define	VLAN_SXLOCK_ASSERT()		sx_assert(&_VLAN_SX_ID, SA_LOCKED)
 
 /*
  * We also have a per-trunk mutex that should be acquired when changing
  * its state.
  *
  * NOTE(review): TRUNK_WLOCK_ASSERT() expands with its own trailing
  * semicolon, unlike the other macros here; "TRUNK_WLOCK_ASSERT(t);" thus
  * yields an extra empty statement, which matters inside an unbraced
  * if/else.
  */
 #define	TRUNK_LOCK_INIT(trunk)		mtx_init(&(trunk)->lock, vlanname, NULL, MTX_DEF)
 #define	TRUNK_LOCK_DESTROY(trunk)	mtx_destroy(&(trunk)->lock)
 #define	TRUNK_WLOCK(trunk)		mtx_lock(&(trunk)->lock)
 #define	TRUNK_WUNLOCK(trunk)		mtx_unlock(&(trunk)->lock)
 #define	TRUNK_WLOCK_ASSERT(trunk)	mtx_assert(&(trunk)->lock, MA_OWNED);
 
 /*
  * The VLAN_ARRAY substitutes the dynamic hash with a static array
  * with 4096 entries. In theory this can give a boost in processing,
  * however in practice it does not. Probably this is because the array
  * is too big to fit into CPU cache.
  */
 #ifndef VLAN_ARRAY
 static	void vlan_inithash(struct ifvlantrunk *trunk);
 static	void vlan_freehash(struct ifvlantrunk *trunk);
 static	int vlan_inshash(struct ifvlantrunk *trunk, struct ifvlan *ifv);
 static	int vlan_remhash(struct ifvlantrunk *trunk, struct ifvlan *ifv);
 static	void vlan_growhash(struct ifvlantrunk *trunk, int howmuch);
 static __inline struct ifvlan * vlan_gethash(struct ifvlantrunk *trunk,
 	uint16_t vid);
 #endif
 static	void trunk_destroy(struct ifvlantrunk *trunk);
 
 static	void vlan_init(void *foo);
 static	void vlan_input(struct ifnet *ifp, struct mbuf *m);
 static	int vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr);
 #if defined(KERN_TLS) || defined(RATELIMIT)
 static	int vlan_snd_tag_alloc(struct ifnet *,
     union if_snd_tag_alloc_params *, struct m_snd_tag **);
 static	int vlan_snd_tag_modify(struct m_snd_tag *,
     union if_snd_tag_modify_params *);
 static	int vlan_snd_tag_query(struct m_snd_tag *,
     union if_snd_tag_query_params *);
 static	void vlan_snd_tag_free(struct m_snd_tag *);
 #endif
 static	void vlan_qflush(struct ifnet *ifp);
 static	int vlan_setflag(struct ifnet *ifp, int flag, int status,
     int (*func)(struct ifnet *, int));
 static	int vlan_setflags(struct ifnet *ifp, int status);
 static	int vlan_setmulti(struct ifnet *ifp);
 static	int vlan_transmit(struct ifnet *ifp, struct mbuf *m);
 static	int vlan_output(struct ifnet *ifp, struct mbuf *m,
     const struct sockaddr *dst, struct route *ro);
 static	void vlan_unconfig(struct ifnet *ifp);
 static	void vlan_unconfig_locked(struct ifnet *ifp, int departing);
 static	int vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t tag,
 	uint16_t proto);
 static	void vlan_link_state(struct ifnet *ifp);
 static	void vlan_capabilities(struct ifvlan *ifv);
 static	void vlan_trunk_capabilities(struct ifnet *ifp);
 
 static	struct ifnet *vlan_clone_match_ethervid(const char *, int *);
 static	int vlan_clone_match(struct if_clone *, const char *);
 static	int vlan_clone_create(struct if_clone *, char *, size_t, caddr_t);
 static	int vlan_clone_destroy(struct if_clone *, struct ifnet *);
 
 static	void vlan_ifdetach(void *arg, struct ifnet *ifp);
 static  void vlan_iflladdr(void *arg, struct ifnet *ifp);
 
 static  void vlan_lladdr_fn(void *arg, int pending);
 
 static struct if_clone *vlan_cloner;
 
 #ifdef VIMAGE
 VNET_DEFINE_STATIC(struct if_clone *, vlan_cloner);
 #define	V_vlan_cloner	VNET(vlan_cloner)
 #endif
 
 /*
  * Epoch callback: free the multicast entry that embeds the given epoch
  * context.
  */
 static void
 vlan_mc_free(struct epoch_context *ctx)
 {
 	struct vlan_mc_entry *entry;
 
 	entry = __containerof(ctx, struct vlan_mc_entry, mc_epoch_ctx);
 	free(entry, M_VLAN);
 }
 
 #ifndef VLAN_ARRAY
 /* Fold a VID into a bucket index: XOR it with itself shifted right by 4 and 8, then mask. */
 #define HASH(vid, mask)	(((vid) ^ ((vid) >> 4) ^ ((vid) >> 8)) & (mask))
 
 /*
  * Allocate the default-sized bucket array for a trunk and make every
  * bucket an empty list.
  */
 static void
 vlan_inithash(struct ifvlantrunk *trunk)
 {
 	int bucket, nbuckets;
 
 	/*
 	 * malloc(M_WAITOK) is called below, so the trunk must not be locked
 	 * here.  That is fine as long as this function runs before the trunk
 	 * struct gets hooked up and becomes visible from other threads.
 	 */
 
 	KASSERT(trunk->hwidth == 0 && trunk->hash == NULL,
 	    ("%s: hash already initialized", __func__));
 
 	nbuckets = 1 << VLAN_DEF_HWIDTH;
 	trunk->hwidth = VLAN_DEF_HWIDTH;
 	trunk->hmask = nbuckets - 1;
 	trunk->hash = malloc(sizeof(struct ifvlanhead) * nbuckets, M_VLAN,
 	    M_WAITOK);
 	for (bucket = 0; bucket < nbuckets; bucket++)
 		CK_SLIST_INIT(&trunk->hash[bucket]);
 }
 
 /*
  * Release a trunk's bucket array and reset the hash geometry.  With
  * INVARIANTS the table is first checked to be empty.
  */
 static void
 vlan_freehash(struct ifvlantrunk *trunk)
 {
 #ifdef INVARIANTS
 	int bucket, nbuckets;
 
 	KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__));
 	nbuckets = 1 << trunk->hwidth;
 	for (bucket = 0; bucket < nbuckets; bucket++) {
 		KASSERT(CK_SLIST_EMPTY(&trunk->hash[bucket]),
 		    ("%s: hash table not empty", __func__));
 	}
 #endif
 	free(trunk->hash, M_VLAN);
 	trunk->hash = NULL;
 	trunk->hmask = 0;
 	trunk->hwidth = 0;
 }
 
 static int
 vlan_inshash(struct ifvlantrunk *trunk, struct ifvlan *ifv)
 {
 	int i, b;
 	struct ifvlan *ifv2;
 
 	VLAN_XLOCK_ASSERT();
 	KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__));
 
 	b = 1 << trunk->hwidth;
 	i = HASH(ifv->ifv_vid, trunk->hmask);
 	CK_SLIST_FOREACH(ifv2, &trunk->hash[i], ifv_list)
 		if (ifv->ifv_vid == ifv2->ifv_vid)
 			return (EEXIST);
 
 	/*
 	 * Grow the hash when the number of vlans exceeds half of the number of
 	 * hash buckets squared. This will make the average linked-list length
 	 * buckets/2.
 	 */
 	if (trunk->refcnt > (b * b) / 2) {
 		vlan_growhash(trunk, 1);
 		i = HASH(ifv->ifv_vid, trunk->hmask);
 	}
 	CK_SLIST_INSERT_HEAD(&trunk->hash[i], ifv, ifv_list);
 	trunk->refcnt++;
 
 	return (0);
 }
 
 static int
 vlan_remhash(struct ifvlantrunk *trunk, struct ifvlan *ifv)
 {
 	int i, b;
 	struct ifvlan *ifv2;
 
 	VLAN_XLOCK_ASSERT();
 	KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__));
 
 	b = 1 << trunk->hwidth;
 	i = HASH(ifv->ifv_vid, trunk->hmask);
 	CK_SLIST_FOREACH(ifv2, &trunk->hash[i], ifv_list)
 		if (ifv2 == ifv) {
 			trunk->refcnt--;
 			CK_SLIST_REMOVE(&trunk->hash[i], ifv2, ifvlan, ifv_list);
 			if (trunk->refcnt < (b * b) / 2)
 				vlan_growhash(trunk, -1);
 			return (0);
 		}
 
 	panic("%s: vlan not found\n", __func__);
 	return (ENOENT); /*NOTREACHED*/
 }
 
 /*
  * Grow the hash larger or smaller if memory permits.
  *
  * 'howmuch' is added to the current hash width (log2 of the bucket count);
  * the callers in this file pass 1 to double and -1 to halve the table.  A
  * request that would take the width below VLAN_DEF_HWIDTH is ignored.  All
  * vlans are moved into a freshly allocated bucket array and the old array
  * is freed only after NET_EPOCH_WAIT() has returned.  The global vlan lock
  * must be held exclusively.
  */
 static void
 vlan_growhash(struct ifvlantrunk *trunk, int howmuch)
 {
 	struct ifvlan *ifv;
 	struct ifvlanhead *hash2;
 	int hwidth2, i, j, n, n2;
 
 	VLAN_XLOCK_ASSERT();
 	KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__));
 
 	if (howmuch == 0) {
 		/* Harmless yet obvious coding error */
 		printf("%s: howmuch is 0\n", __func__);
 		return;
 	}
 
 	/* n is the current bucket count, n2 the requested one. */
 	hwidth2 = trunk->hwidth + howmuch;
 	n = 1 << trunk->hwidth;
 	n2 = 1 << hwidth2;
 	/* Do not shrink the table below the default */
 	if (hwidth2 < VLAN_DEF_HWIDTH)
 		return;
 
 	hash2 = malloc(sizeof(struct ifvlanhead) * n2, M_VLAN, M_WAITOK);
 	/*
 	 * NOTE(review): the allocation uses M_WAITOK, so this NULL check is
 	 * presumably never taken -- confirm against malloc(9).
 	 */
 	if (hash2 == NULL) {
 		printf("%s: out of memory -- hash size not changed\n",
 		    __func__);
 		return;		/* We can live with the old hash table */
 	}
 	for (j = 0; j < n2; j++)
 		CK_SLIST_INIT(&hash2[j]);
 	/* Move every vlan from the old buckets into the new table. */
 	for (i = 0; i < n; i++)
 		while ((ifv = CK_SLIST_FIRST(&trunk->hash[i])) != NULL) {
 			CK_SLIST_REMOVE(&trunk->hash[i], ifv, ifvlan, ifv_list);
 			j = HASH(ifv->ifv_vid, n2 - 1);
 			CK_SLIST_INSERT_HEAD(&hash2[j], ifv, ifv_list);
 		}
 	/*
 	 * NOTE(review): presumably waits for lockless readers such as
 	 * vlan_gethash() to leave the old array before it is freed -- confirm.
 	 */
 	NET_EPOCH_WAIT();
 	free(trunk->hash, M_VLAN);
 	trunk->hash = hash2;
 	trunk->hwidth = hwidth2;
 	trunk->hmask = n2 - 1;
 
 	if (bootverbose)
 		if_printf(trunk->parent,
 		    "VLAN hash table resized from %d to %d buckets\n", n, n2);
 }
 
 /*
  * Look up the vlan with the given VID on a trunk.  Returns NULL when there
  * is none.  Must be called inside the network epoch.
  */
 static __inline struct ifvlan *
 vlan_gethash(struct ifvlantrunk *trunk, uint16_t vid)
 {
 	struct ifvlanhead *bucket;
 	struct ifvlan *cur;
 
 	NET_EPOCH_ASSERT();
 
 	bucket = &trunk->hash[HASH(vid, trunk->hmask)];
 	/* cur is NULL after the loop when no entry matched. */
 	CK_SLIST_FOREACH(cur, bucket, ifv_list) {
 		if (cur->ifv_vid == vid)
 			break;
 	}
 	return (cur);
 }
 
 #if 0
 /* Debugging code to view the hashtables. */
 static void
 vlan_dumphash(struct ifvlantrunk *trunk)
 {
 	int i;
 	struct ifvlan *ifv;
 
 	for (i = 0; i < (1 << trunk->hwidth); i++) {
 		printf("%d: ", i);
 		CK_SLIST_FOREACH(ifv, &trunk->hash[i], ifv_list)
 			printf("%s ", ifv->ifv_ifp->if_xname);
 		printf("\n");
 	}
 }
 #endif /* 0 */
 #else
 
 /* VLAN_ARRAY variant: the VID indexes the static table directly. */
 static __inline struct ifvlan *
 vlan_gethash(struct ifvlantrunk *trunk, uint16_t vid)
 {
 	struct ifvlan *found;
 
 	found = trunk->vlans[vid];
 	return (found);
 }
 
 /*
  * VLAN_ARRAY variant: store the vlan in the slot of its VID.  Returns
  * EEXIST when the slot is already taken, 0 otherwise.
  */
 static __inline int
 vlan_inshash(struct ifvlantrunk *trunk, struct ifvlan *ifv)
 {
 	struct ifvlan **slot;
 
 	slot = &trunk->vlans[ifv->ifv_vid];
 	if (*slot != NULL)
 		return (EEXIST);
 	*slot = ifv;
 	trunk->refcnt++;
 	return (0);
 }
 
 /*
  * VLAN_ARRAY variant: clear the slot of the vlan's VID and drop the trunk's
  * vlan count.  Always returns 0.
  */
 static __inline int
 vlan_remhash(struct ifvlantrunk *trunk, struct ifvlan *ifv)
 {
 	struct ifvlan **slot;
 
 	slot = &trunk->vlans[ifv->ifv_vid];
 	*slot = NULL;
 	trunk->refcnt--;
 	return (0);
 }
 
 /* VLAN_ARRAY variant: the table is part of the trunk, nothing to free. */
 static __inline void
 vlan_freehash(struct ifvlantrunk *trunk)
 {
 }
 
 /* VLAN_ARRAY variant: the table is part of the trunk, nothing to set up. */
 static __inline void
 vlan_inithash(struct ifvlantrunk *trunk)
 {
 }
 
 #endif /* !VLAN_ARRAY */
 
 /*
  * Tear down a trunk: release its lookup table, clear the parent's
  * if_vlantrunk pointer, destroy the trunk mutex, drop the trunk's reference
  * on the parent interface and free the trunk itself.  The global vlan lock
  * must be held exclusively.
  */
 static void
 trunk_destroy(struct ifvlantrunk *trunk)
 {
 	VLAN_XLOCK_ASSERT();
 
 	vlan_freehash(trunk);
 	trunk->parent->if_vlantrunk = NULL;
 	TRUNK_LOCK_DESTROY(trunk);
 	/* trunk->parent is read here, so the trunk is freed only afterwards. */
 	if_rele(trunk->parent);
 	free(trunk, M_VLAN);
 }
 
 /*
  * Program our multicast filter. What we're actually doing is
  * programming the multicast filter of the parent. This has the
  * side effect of causing the parent interface to receive multicast
  * traffic that it doesn't really want, which ends up being discarded
  * later by the upper protocol layers. Unfortunately, there's no way
  * to avoid this: there really is only one physical interface.
  *
  * The previously mirrored addresses are removed from the parent first, then
  * the vlan's current AF_LINK multicast addresses are copied and added to the
  * parent.  Returns 0 on success, ENOMEM when an entry cannot be allocated,
  * or the error reported by if_addmulti().  The global vlan lock must be held
  * exclusively.  The vnet context switched to on entry is restored on every
  * return path.
  */
 static int
 vlan_setmulti(struct ifnet *ifp)
 {
 	struct ifnet		*ifp_p;
 	struct ifmultiaddr	*ifma;
 	struct ifvlan		*sc;
 	struct vlan_mc_entry	*mc;
 	int			error;
 
 	VLAN_XLOCK_ASSERT();
 
 	/* Find the parent. */
 	sc = ifp->if_softc;
 	ifp_p = PARENT(sc);
 	error = 0;
 
 	CURVNET_SET_QUIET(ifp_p->if_vnet);
 
 	/* First, remove any existing filter entries. */
 	while ((mc = CK_SLIST_FIRST(&sc->vlan_mc_listhead)) != NULL) {
 		CK_SLIST_REMOVE_HEAD(&sc->vlan_mc_listhead, mc_entries);
 		(void)if_delmulti(ifp_p, (struct sockaddr *)&mc->mc_addr);
 		/* The entry is freed through the epoch, not immediately. */
 		NET_EPOCH_CALL(vlan_mc_free, &mc->mc_epoch_ctx);
 	}
 
 	/* Now program new ones. */
 	IF_ADDR_WLOCK(ifp);
 	CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		if (ifma->ifma_addr->sa_family != AF_LINK)
 			continue;
 		/* M_NOWAIT: the address lock is held across the allocation. */
 		mc = malloc(sizeof(struct vlan_mc_entry), M_VLAN, M_NOWAIT);
 		if (mc == NULL) {
 			IF_ADDR_WUNLOCK(ifp);
 			error = ENOMEM;
 			goto done;
 		}
 		bcopy(ifma->ifma_addr, &mc->mc_addr, ifma->ifma_addr->sa_len);
 		/* The copy is added on the parent, so use the parent's index. */
 		mc->mc_addr.sdl_index = ifp_p->if_index;
 		CK_SLIST_INSERT_HEAD(&sc->vlan_mc_listhead, mc, mc_entries);
 	}
 	IF_ADDR_WUNLOCK(ifp);
 	CK_SLIST_FOREACH (mc, &sc->vlan_mc_listhead, mc_entries) {
 		error = if_addmulti(ifp_p, (struct sockaddr *)&mc->mc_addr,
 		    NULL);
 		if (error != 0)
 			goto done;
 	}
 
 done:
 	/* Single exit: the vnet set above must be restored on errors too. */
 	CURVNET_RESTORE();
 	return (error);
 }
 
 /*
  * A handler for parent interface link layer address changes.
  * If the parent interface link layer address is changed we
  * should also change it on all children vlans.
  *
  * Registered for iflladdr_event in vlan_modevent(); 'ifp' is the interface
  * whose address changed.  Nothing is done when it is not a vlan trunk.
  */
 static void
 vlan_iflladdr(void *arg __unused, struct ifnet *ifp)
 {
 	struct epoch_tracker et;
 	struct ifvlan *ifv;
 	struct ifnet *ifv_ifp;
 	struct ifvlantrunk *trunk;
 	struct sockaddr_dl *sdl;
 
 	/* Need the epoch since this is run on taskqueue_swi. */
 	NET_EPOCH_ENTER(et);
 	trunk = ifp->if_vlantrunk;
 	if (trunk == NULL) {
 		NET_EPOCH_EXIT(et);
 		return;
 	}
 
 	/*
 	 * OK, it's a trunk.  Loop over and change all vlan's lladdrs on it.
 	 * We need an exclusive lock here to prevent concurrent SIOCSIFLLADDR
 	 * ioctl calls on the parent garbling the lladdr of the child vlan.
 	 */
 	TRUNK_WLOCK(trunk);
 	VLAN_FOREACH(ifv, trunk) {
 		/*
 		 * Copy the new lladdr into the ifv_ifp, enqueue a task
 		 * to actually call if_setlladdr. if_setlladdr needs to
 		 * be deferred to a taskqueue because it will call into
 		 * the if_vlan ioctl path and try to acquire the global
 		 * lock.
 		 */
 		ifv_ifp = ifv->ifv_ifp;
 		bcopy(IF_LLADDR(ifp), IF_LLADDR(ifv_ifp),
 		    ifp->if_addrlen);
 		sdl = (struct sockaddr_dl *)ifv_ifp->if_addr->ifa_addr;
 		sdl->sdl_alen = ifp->if_addrlen;
 		taskqueue_enqueue(taskqueue_thread, &ifv->lladdr_task);
 	}
 	TRUNK_WUNLOCK(trunk);
 	NET_EPOCH_EXIT(et);
 }
 
 /*
  * A handler for network interface departure events.
  * Track departure of trunks here so that we don't access invalid
  * pointers or whatever if a trunk is ripped from under us, e.g.,
  * by ejecting its hot-plug card.  However, if an ifnet is simply
  * being renamed, then there's no need to tear down the state.
  *
  * Registered for ifnet_departure_event in vlan_modevent(); 'ifp' is the
  * departing interface.  The global vlan lock is taken exclusively for the
  * whole operation.
  */
 static void
 vlan_ifdetach(void *arg __unused, struct ifnet *ifp)
 {
 	struct ifvlan *ifv;
 	struct ifvlantrunk *trunk;
 
 	/* If the ifnet is just being renamed, don't do anything. */
 	if (ifp->if_flags & IFF_RENAMING)
 		return;
 	VLAN_XLOCK();
 	trunk = ifp->if_vlantrunk;
 	if (trunk == NULL) {
 		VLAN_XUNLOCK();
 		return;
 	}
 
 	/*
 	 * OK, it's a trunk.  Loop over and detach all vlan's on it.
 	 * Check trunk pointer after each vlan_unconfig() as it will
 	 * free it and set to NULL after the last vlan was detached.
 	 * ifp->if_vlantrunk is therefore re-read on every step instead of
 	 * using the 'trunk' local fetched above.
 	 */
 	VLAN_FOREACH_UNTIL_SAFE(ifv, ifp->if_vlantrunk,
 	    ifp->if_vlantrunk == NULL)
 		vlan_unconfig_locked(ifv->ifv_ifp, 1);
 
 	/* Trunk should have been destroyed in vlan_unconfig(). */
 	KASSERT(ifp->if_vlantrunk == NULL, ("%s: purge failed", __func__));
 	VLAN_XUNLOCK();
 }
 
 /*
  * Return the trunk (parent) device of a vlan interface, or NULL when 'ifp'
  * is not a vlan or is not attached to a trunk.  Must be called inside the
  * network epoch.
  */
 static struct ifnet  *
 vlan_trunkdev(struct ifnet *ifp)
 {
 	struct ifvlan *softc;
 
 	NET_EPOCH_ASSERT();
 
 	if (ifp->if_type != IFT_L2VLAN)
 		return (NULL);
 
 	softc = ifp->if_softc;
 	if (!softc->ifv_trunk)
 		return (NULL);
 	return (PARENT(softc));
 }
 
 /*
  * Store the 12-bit VLAN VID of this interface through 'vidp', for use by
  * external components such as Infiniband.  Returns 0 on success and EINVAL
  * when 'ifp' is not a vlan interface.
  *
  * XXXRW: Note that the function name here is historical; it should be named
  * vlan_vid().
  */
 static int
 vlan_tag(struct ifnet *ifp, uint16_t *vidp)
 {
 	struct ifvlan *softc;
 
 	if (ifp->if_type == IFT_L2VLAN) {
 		softc = ifp->if_softc;
 		*vidp = softc->ifv_vid;
 		return (0);
 	}
 	return (EINVAL);
 }
 
 /*
  * Store the PCP value of this vlan interface through 'pcpp'.  Returns 0 on
  * success and EINVAL when 'ifp' is not a vlan interface.
  */
 static int
 vlan_pcp(struct ifnet *ifp, uint16_t *pcpp)
 {
 	struct ifvlan *softc;
 
 	if (ifp->if_type == IFT_L2VLAN) {
 		softc = ifp->if_softc;
 		*pcpp = softc->ifv_pcp;
 		return (0);
 	}
 	return (EINVAL);
 }
 
 /*
  * Fetch the driver specific cookie stored for this interface, or NULL when
  * 'ifp' is not a vlan interface.  Synchronization with setcookie must be
  * provided by the driver.
  */
 static void *
 vlan_cookie(struct ifnet *ifp)
 {
 	struct ifvlan *softc;
 
 	if (ifp->if_type == IFT_L2VLAN) {
 		softc = ifp->if_softc;
 		return (softc->ifv_cookie);
 	}
 	return (NULL);
 }
 
 /*
  * Record a cookie in our softc; drivers use it to keep driver private
  * per-instance data.  Returns 0 on success and EINVAL when 'ifp' is not a
  * vlan interface.
  */
 static int
 vlan_setcookie(struct ifnet *ifp, void *cookie)
 {
 	struct ifvlan *softc;
 
 	if (ifp->if_type == IFT_L2VLAN) {
 		softc = ifp->if_softc;
 		softc->ifv_cookie = cookie;
 		return (0);
 	}
 	return (EINVAL);
 }
 
 /*
  * Return the vlan device configured with the given VID on 'ifp', or NULL
  * when 'ifp' is not a trunk or carries no such vlan.  Must be called inside
  * the network epoch.
  */
 static struct ifnet *
 vlan_devat(struct ifnet *ifp, uint16_t vid)
 {
 	struct ifvlantrunk *trunk;
 	struct ifvlan *match;
 
 	NET_EPOCH_ASSERT();
 
 	trunk = ifp->if_vlantrunk;
 	if (trunk == NULL)
 		return (NULL);
 	match = vlan_gethash(trunk, vid);
 	return (match != NULL ? match->ifv_ifp : NULL);
 }
 
 /*
  * VLAN support can be loaded as a module.  The only place in the
  * system that's intimately aware of this is ether_input.  We hook
  * into this code through vlan_input_p which is defined there and
  * set here.  No one else in the system should be aware of this so
  * we use an explicit reference here.
  */
 extern	void (*vlan_input_p)(struct ifnet *, struct mbuf *);
 
 /* For if_link_state_change() eyes only... */
 extern	void (*vlan_link_state_p)(struct ifnet *);
 
 /*
  * Module event handler.
  *
  * MOD_LOAD registers the interface departure and link-layer address change
  * event handlers, initializes the global lock, publishes the vlan hooks
  * through the vlan_*_p function pointers and, without VIMAGE, creates the
  * interface cloner.  MOD_UNLOAD undoes all of that; every hook set on load
  * is cleared again so that no pointer into this module is left behind.
  * Any other event type yields EOPNOTSUPP.
  */
 static int
 vlan_modevent(module_t mod, int type, void *data)
 {
 
 	switch (type) {
 	case MOD_LOAD:
 		ifdetach_tag = EVENTHANDLER_REGISTER(ifnet_departure_event,
 		    vlan_ifdetach, NULL, EVENTHANDLER_PRI_ANY);
 		if (ifdetach_tag == NULL)
 			return (ENOMEM);
 		iflladdr_tag = EVENTHANDLER_REGISTER(iflladdr_event,
 		    vlan_iflladdr, NULL, EVENTHANDLER_PRI_ANY);
 		if (iflladdr_tag == NULL)
 			return (ENOMEM);
 		VLAN_LOCKING_INIT();
 		vlan_input_p = vlan_input;
 		vlan_link_state_p = vlan_link_state;
 		vlan_trunk_cap_p = vlan_trunk_capabilities;
 		vlan_trunkdev_p = vlan_trunkdev;
 		vlan_cookie_p = vlan_cookie;
 		vlan_setcookie_p = vlan_setcookie;
 		vlan_tag_p = vlan_tag;
 		vlan_pcp_p = vlan_pcp;
 		vlan_devat_p = vlan_devat;
 #ifndef VIMAGE
 		vlan_cloner = if_clone_advanced(vlanname, 0, vlan_clone_match,
 		    vlan_clone_create, vlan_clone_destroy);
 #endif
 		if (bootverbose)
 			printf("vlan: initialized, using "
 #ifdef VLAN_ARRAY
 			       "full-size arrays"
 #else
 			       "hash tables with chaining"
 #endif
 
 			       "\n");
 		break;
 	case MOD_UNLOAD:
 #ifndef VIMAGE
 		if_clone_detach(vlan_cloner);
 #endif
 		EVENTHANDLER_DEREGISTER(ifnet_departure_event, ifdetach_tag);
 		EVENTHANDLER_DEREGISTER(iflladdr_event, iflladdr_tag);
 		vlan_input_p = NULL;
 		vlan_link_state_p = NULL;
 		vlan_trunk_cap_p = NULL;
 		vlan_trunkdev_p = NULL;
 		vlan_tag_p = NULL;
 		/* Set on load like the others; must not outlive the module. */
 		vlan_pcp_p = NULL;
 		vlan_cookie_p = NULL;
 		vlan_setcookie_p = NULL;
 		vlan_devat_p = NULL;
 		VLAN_LOCKING_DESTROY();
 		if (bootverbose)
 			printf("vlan: unloaded\n");
 		break;
 	default:
 		return (EOPNOTSUPP);
 	}
 	return (0);
 }
 
 static moduledata_t vlan_mod = {
 	"if_vlan",
 	vlan_modevent,
 	0
 };
 
 DECLARE_MODULE(if_vlan, vlan_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
 MODULE_VERSION(if_vlan, 3);
 
 #ifdef VIMAGE
 /*
  * VIMAGE only: create the vlan interface cloner and record it both in the
  * file-scope vlan_cloner and in the per-vnet V_vlan_cloner.  Run through
  * VNET_SYSINIT.
  */
 static void
 vnet_vlan_init(const void *unused __unused)
 {
 
 	vlan_cloner = if_clone_advanced(vlanname, 0, vlan_clone_match,
 		    vlan_clone_create, vlan_clone_destroy);
 	V_vlan_cloner = vlan_cloner;
 }
 VNET_SYSINIT(vnet_vlan_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
     vnet_vlan_init, NULL);
 
 /*
  * VIMAGE only: detach the cloner recorded in V_vlan_cloner.  Run through
  * VNET_SYSUNINIT.
  */
 static void
 vnet_vlan_uninit(const void *unused __unused)
 {
 
 	if_clone_detach(V_vlan_cloner);
 }
 VNET_SYSUNINIT(vnet_vlan_uninit, SI_SUB_INIT_IF, SI_ORDER_ANY,
     vnet_vlan_uninit, NULL);
 #endif
 
 /*
  * Check for <etherif>.<vlan>[.<vlan> ...] style interface names.
  */
 static struct ifnet *
 vlan_clone_match_ethervid(const char *name, int *vidp)
 {
 	char ifname[IFNAMSIZ];
 	char *cp;
 	struct ifnet *ifp;
 	int vid;
 
 	strlcpy(ifname, name, IFNAMSIZ);
 	if ((cp = strrchr(ifname, '.')) == NULL)
 		return (NULL);
 	*cp = '\0';
 	if ((ifp = ifunit_ref(ifname)) == NULL)
 		return (NULL);
 	/* Parse VID. */
 	if (*++cp == '\0') {
 		if_rele(ifp);
 		return (NULL);
 	}
 	vid = 0;
 	for(; *cp >= '0' && *cp <= '9'; cp++)
 		vid = (vid * 10) + (*cp - '0');
 	if (*cp != '\0') {
 		if_rele(ifp);
 		return (NULL);
 	}
 	if (vidp != NULL)
 		*vidp = vid;
 
 	return (ifp);
 }
 
 /*
  * Cloner match routine.  Returns 1 when 'name' is one this cloner handles:
  * either an "<etherif>.<vid>" name whose parent exists, or "vlan" followed
  * by zero or more decimal digits.  Returns 0 otherwise.
  */
 static int
 vlan_clone_match(struct if_clone *ifc, const char *name)
 {
 	struct ifnet *parent;
 	const char *digit;
 
 	/* Only the match matters here, so the reference is dropped at once. */
 	parent = vlan_clone_match_ethervid(name, NULL);
 	if (parent != NULL) {
 		if_rele(parent);
 		return (1);
 	}
 
 	if (strncmp(vlanname, name, strlen(vlanname)) != 0)
 		return (0);
 	/* Everything after the 4-character "vlan" prefix must be a digit. */
 	digit = name + 4;
 	while (*digit != '\0') {
 		if (*digit < '0' || *digit > '9')
 			return (0);
 		digit++;
 	}
 
 	return (1);
 }
 
 /*
  * Cloner create routine: allocate and attach a new vlan(4) interface named
  * 'name' and, when a parent is known, configure it on that parent.
  *
  * 'name' is the requested interface name in a buffer of 'len' bytes; for a
  * wildcard request the allocated unit number is appended to it.  'params',
  * when not NULL, is a user-space pointer to a struct vlanreq naming the
  * parent, the tag and the protocol.  Returns 0 on success or an errno
  * value; on failure nothing is left allocated.
  */
 static int
 vlan_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params)
 {
 	char *dp;
 	int wildcard;
 	int unit;
 	int error;
 	int vid;
 	uint16_t proto;
 	struct ifvlan *ifv;
 	struct ifnet *ifp;
 	struct ifnet *p;
 	struct ifaddr *ifa;
 	struct sockaddr_dl *sdl;
 	struct vlanreq vlr;
 	static const u_char eaddr[ETHER_ADDR_LEN];	/* 00:00:00:00:00:00 */
 
 	proto = ETHERTYPE_VLAN;
 
 	/*
 	 * There are three ways to specify the cloned device:
 	 * o pass a parameter block with the clone request.
 	 * o specify parameters in the text of the clone device name
 	 *   (<etherif>.<vid>), which vlan_clone_match() accepts.
 	 * o specify no parameters and get an unattached device that
 	 *   must be configured separately.
 	 * The first technique is preferred; the latter two are
 	 * supported for backwards compatibility.
 	 *
 	 * XXXRW: Note historic use of the word "tag" here.  New ioctls may be
 	 * called for.
 	 */
 	if (params) {
 		error = copyin(params, &vlr, sizeof(vlr));
 		if (error)
 			return error;
 		p = ifunit_ref(vlr.vlr_parent);
 		if (p == NULL)
 			return (ENXIO);
 		error = ifc_name2unit(name, &unit);
 		if (error != 0) {
 			if_rele(p);
 			return (error);
 		}
 		vid = vlr.vlr_tag;
 		proto = vlr.vlr_proto;
 		wildcard = (unit < 0);
 	} else if ((p = vlan_clone_match_ethervid(name, &vid)) != NULL) {
 		/*
 		 * The name encodes parent and VID.  It carries no unit
 		 * number of its own, so ask for any free unit and keep the
 		 * name as given.
 		 */
 		unit = -1;
 		wildcard = 0;
 
 		/*
 		 * Don't let the caller set up a VLAN VID with
 		 * anything except VLID bits.
 		 */
 		if (vid & ~EVL_VLID_MASK) {
 			if_rele(p);
 			return (EINVAL);
 		}
 	} else {
 		p = NULL;
 		error = ifc_name2unit(name, &unit);
 		if (error != 0)
 			return (error);
 
 		wildcard = (unit < 0);
 	}
 
 	error = ifc_alloc_unit(ifc, &unit);
 	if (error != 0) {
 		if (p != NULL)
 			if_rele(p);
 		return (error);
 	}
 
 	/* In the wildcard case, we need to update the name. */
 	if (wildcard) {
 		for (dp = name; *dp != '\0'; dp++);
 		if (snprintf(dp, len - (dp-name), "%d", unit) >
 		    len - (dp-name) - 1) {
 			panic("%s: interface name too long", __func__);
 		}
 	}
 
 	ifv = malloc(sizeof(struct ifvlan), M_VLAN, M_WAITOK | M_ZERO);
 	ifp = ifv->ifv_ifp = if_alloc(IFT_ETHER);
 	if (ifp == NULL) {
 		ifc_free_unit(ifc, unit);
 		free(ifv, M_VLAN);
 		if (p != NULL)
 			if_rele(p);
 		return (ENOSPC);
 	}
 	CK_SLIST_INIT(&ifv->vlan_mc_listhead);
 	ifp->if_softc = ifv;
 	/*
 	 * Set the name manually rather than using if_initname because
 	 * we don't conform to the default naming convention for interfaces.
 	 */
 	strlcpy(ifp->if_xname, name, IFNAMSIZ);
 	ifp->if_dname = vlanname;
 	ifp->if_dunit = unit;
 
 	ifp->if_init = vlan_init;
 	ifp->if_transmit = vlan_transmit;
 	ifp->if_qflush = vlan_qflush;
 	ifp->if_ioctl = vlan_ioctl;
 #if defined(KERN_TLS) || defined(RATELIMIT)
 	ifp->if_snd_tag_alloc = vlan_snd_tag_alloc;
 	ifp->if_snd_tag_modify = vlan_snd_tag_modify;
 	ifp->if_snd_tag_query = vlan_snd_tag_query;
 	ifp->if_snd_tag_free = vlan_snd_tag_free;
 #endif
 	ifp->if_flags = VLAN_IFFLAGS;
 	ether_ifattach(ifp, eaddr);
 	/* Now undo some of the damage... */
 	ifp->if_baudrate = 0;
 	ifp->if_type = IFT_L2VLAN;
 	ifp->if_hdrlen = ETHER_VLAN_ENCAP_LEN;
 	ifa = ifp->if_addr;
 	sdl = (struct sockaddr_dl *)ifa->ifa_addr;
 	sdl->sdl_type = IFT_L2VLAN;
 
 	if (p != NULL) {
 		error = vlan_config(ifv, p, vid, proto);
 		/* Our own reference on the parent is not needed past here. */
 		if_rele(p);
 		if (error != 0) {
 			/*
 			 * Since we've partially failed, we need to back
 			 * out all the way, otherwise userland could get
 			 * confused.  Thus, we destroy the interface.
 			 */
 			ether_ifdetach(ifp);
 			vlan_unconfig(ifp);
 			if_free(ifp);
 			ifc_free_unit(ifc, unit);
 			free(ifv, M_VLAN);
 
 			return (error);
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Cloner destroy routine: detach and free a vlan(4) interface and give its
  * unit number back to the cloner.  Returns EBUSY when the interface is
  * itself the parent of other vlans (it has a trunk), 0 otherwise.
  */
 static int
 vlan_clone_destroy(struct if_clone *ifc, struct ifnet *ifp)
 {
 	struct ifvlan *ifv = ifp->if_softc;
 	/* Fetch the unit now: ifp must not be touched after if_free(). */
 	int unit = ifp->if_dunit;
 
 	if (ifp->if_vlantrunk)
 		return (EBUSY);
 
 	ether_ifdetach(ifp);	/* first, remove it from system-wide lists */
 	vlan_unconfig(ifp);	/* now it can be unconfigured and freed */
 	/*
 	 * We should have the only reference to the ifv now, so we can now
 	 * drain any remaining lladdr task before freeing the ifnet and the
 	 * ifvlan.
 	 */
 	taskqueue_drain(taskqueue_thread, &ifv->lladdr_task);
 	NET_EPOCH_WAIT();
 	if_free(ifp);
 	free(ifv, M_VLAN);
 	ifc_free_unit(ifc, unit);
 
 	return (0);
 }
 
 /*
  * The ifp->if_init entry point for vlan(4) is a no-op.
  * It is installed by vlan_clone_create(); the argument is ignored.
  */
 static void
 vlan_init(void *foo __unused)
 {
 }
 
 /*
  * The if_transmit method for vlan(4) interface.
  *
  * Tags the packet for this vlan and hands it to the parent's if_transmit.
  * Returns ENETDOWN when the vlan has no trunk or the parent is not up and
  * running, EAGAIN when the mbuf's send tag does not belong to the current
  * parent, 0 when tagging fails, and otherwise whatever the parent's
  * if_transmit returns.  The vlan's output counters are updated accordingly.
  * Must be called inside the network epoch.
  */
 static int
 vlan_transmit(struct ifnet *ifp, struct mbuf *m)
 {
 	struct ifvlan *ifv;
 	struct ifnet *p;
 	int error, len, mcast;
 
 	NET_EPOCH_ASSERT();
 
 	ifv = ifp->if_softc;
 	if (TRUNK(ifv) == NULL) {
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		m_freem(m);
 		return (ENETDOWN);
 	}
 	p = PARENT(ifv);
 	/* Sample length and mcast flag now; m is not ours after transmit. */
 	len = m->m_pkthdr.len;
 	mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0;
 
 	BPF_MTAP(ifp, m);
 
 #if defined(KERN_TLS) || defined(RATELIMIT)
 	if (m->m_pkthdr.csum_flags & CSUM_SND_TAG) {
 		struct vlan_snd_tag *vst;
 		struct m_snd_tag *mst;
 
 		MPASS(m->m_pkthdr.snd_tag->ifp == ifp);
 		mst = m->m_pkthdr.snd_tag;
 		vst = mst_to_vst(mst);
 		/* The wrapped tag must belong to the current parent. */
 		if (vst->tag->ifp != p) {
 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 			m_freem(m);
 			return (EAGAIN);
 		}
 
 		/* Swap the vlan's tag for a reference on the wrapped one. */
 		m->m_pkthdr.snd_tag = m_snd_tag_ref(vst->tag);
 		m_snd_tag_rele(mst);
 	}
 #endif
 
 	/*
 	 * Do not run parent's if_transmit() if the parent is not up,
 	 * or parent's driver will cause a system crash.
 	 */
 	if (!UP_AND_RUNNING(p)) {
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		m_freem(m);
 		return (ENETDOWN);
 	}
 
 	/*
 	 * NOTE(review): the mbuf is not freed on this failure path;
 	 * presumably ether_8021q_frame() consumes it when it fails -- confirm.
 	 */
 	if (!ether_8021q_frame(&m, ifp, p, &ifv->ifv_qtag)) {
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		return (0);
 	}
 
 	/*
 	 * Send it, precisely as ether_output() would have.
 	 */
 	error = (p->if_transmit)(p, m);
 	if (error == 0) {
 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
 		if_inc_counter(ifp, IFCOUNTER_OBYTES, len);
 		if_inc_counter(ifp, IFCOUNTER_OMCASTS, mcast);
 	} else
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 	return (error);
 }
 
 /*
  * The if_output method for vlan(4): pass the packet to the if_output
  * routine of the nearest ancestor interface that is not itself a vlan.
  *
  * Frees the mbuf and returns ENETDOWN if any vlan on the way up has no
  * trunk attached; otherwise returns the ancestor's if_output result.
  */
 static int
 vlan_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst,
     struct route *ro)
 {
 	struct ifvlan *cur;
 	struct ifnet *parent;
 
 	NET_EPOCH_ASSERT();
 
 	/* Climb through stacked vlans until a non-VLAN parent shows up. */
 	for (cur = ifp->if_softc;; cur = parent->if_softc) {
 		if (TRUNK(cur) == NULL) {
 			m_freem(m);
 			return (ENETDOWN);
 		}
 		parent = PARENT(cur);
 		if (parent->if_type != IFT_L2VLAN)
 			break;
 	}
 
 	/* Note that the original (vlan) ifp is what the ancestor gets. */
 	return (parent->if_output(ifp, m, dst, ro));
 }
 
 /*
  * The ifp->if_qflush entry point for vlan(4) is a no-op.
  *
  * The interface argument is ignored and nothing is flushed here.
  */
 static void
 vlan_qflush(struct ifnet *ifp __unused)
 {
 }
 
 /*
  * Input path for a tagged frame received on parent interface ifp.
  *
  * Obtains the 802.1Q tag (out-of-band from the packet header when
  * M_VLANTAG is set, otherwise in-band from the frame itself), looks up the
  * vlan interface registered for that VID on ifp's trunk and re-injects the
  * frame through that vlan interface's if_input routine.
  *
  * The frame is dropped if ifp has no trunk, if an in-band tag arrives on an
  * unsupported interface type, if no up-and-running vlan matches the VID, or
  * if the PCP m_tag cannot be allocated.
  */
 static void
 vlan_input(struct ifnet *ifp, struct mbuf *m)
 {
 	struct ifvlantrunk *trunk;
 	struct ifvlan *ifv;
 	struct m_tag *mtag;
 	uint16_t vid, tag;
 
 	NET_EPOCH_ASSERT();
 
 	trunk = ifp->if_vlantrunk;
 	if (trunk == NULL) {
 		m_freem(m);
 		return;
 	}
 
 	if (m->m_flags & M_VLANTAG) {
 		/*
 		 * Packet is tagged, but m contains a normal
 		 * Ethernet frame; the tag is stored out-of-band.
 		 */
 		tag = m->m_pkthdr.ether_vtag;
 		m->m_flags &= ~M_VLANTAG;
 	} else {
 		struct ether_vlan_header *evl;
 
 		/*
 		 * Packet is tagged in-band as specified by 802.1q.
 		 */
 		switch (ifp->if_type) {
 		case IFT_ETHER:
 			/*
 			 * Make sure the whole VLAN header sits in the first
 			 * mbuf.  NOTE(review): no m_freem() on failure --
 			 * presumably m_pullup() releases the chain itself;
 			 * confirm.
 			 */
 			if (m->m_len < sizeof(*evl) &&
 			    (m = m_pullup(m, sizeof(*evl))) == NULL) {
 				if_printf(ifp, "cannot pullup VLAN header\n");
 				return;
 			}
 			evl = mtod(m, struct ether_vlan_header *);
 			tag = ntohs(evl->evl_tag);
 
 			/*
 			 * Remove the 802.1q header by copying the Ethernet
 			 * addresses over it and adjusting the beginning of
 			 * the data in the mbuf.  The encapsulated Ethernet
 			 * type field is already in place.
 			 */
 			bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
 			      ETHER_HDR_LEN - ETHER_TYPE_LEN);
 			m_adj(m, ETHER_VLAN_ENCAP_LEN);
 			break;
 
 		default:
 			/* Panic on INVARIANTS kernels, otherwise count and drop. */
 #ifdef INVARIANTS
 			panic("%s: %s has unsupported if_type %u",
 			      __func__, ifp->if_xname, ifp->if_type);
 #endif
 			if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1);
 			m_freem(m);
 			return;
 		}
 	}
 
 	vid = EVL_VLANOFTAG(tag);
 
 	/* Find the vlan interface configured for this VID on the trunk. */
 	ifv = vlan_gethash(trunk, vid);
 	if (ifv == NULL || !UP_AND_RUNNING(ifv->ifv_ifp)) {
 		if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1);
 		m_freem(m);
 		return;
 	}
 
 	if (vlan_mtag_pcp) {
 		/*
 		 * While uncommon, it is possible that we will find a 802.1q
 		 * packet encapsulated inside another packet that also had an
 		 * 802.1q header.  For example, ethernet tunneled over IPSEC
 		 * arriving over ethernet.  In that case, we replace the
 		 * existing 802.1q PCP m_tag value.
 		 */
 		mtag = m_tag_locate(m, MTAG_8021Q, MTAG_8021Q_PCP_IN, NULL);
 		if (mtag == NULL) {
 			mtag = m_tag_alloc(MTAG_8021Q, MTAG_8021Q_PCP_IN,
 			    sizeof(uint8_t), M_NOWAIT);
 			if (mtag == NULL) {
 				if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 				m_freem(m);
 				return;
 			}
 			m_tag_prepend(m, mtag);
 		}
 		/* The one-byte payload directly follows the m_tag header. */
 		*(uint8_t *)(mtag + 1) = EVL_PRIOFTAG(tag);
 	}
 
 	/* From here on the packet counts as received on the vlan interface. */
 	m->m_pkthdr.rcvif = ifv->ifv_ifp;
 	if_inc_counter(ifv->ifv_ifp, IFCOUNTER_IPACKETS, 1);
 
 	/* Pass it back through the parent's input routine. */
 	(*ifv->ifv_ifp->if_input)(ifv->ifv_ifp, m);
 }
 
 /*
  * Task handler for ifv->lladdr_task: re-applies the vlan interface's
  * current link-level address through if_setlladdr(), within the vnet
  * of that interface.  "arg" is the struct ifvlan the task was set up with.
  */
 static void
 vlan_lladdr_fn(void *arg, int pending __unused)
 {
 	struct ifvlan *sc = (struct ifvlan *)arg;
 	struct ifnet *vifp = sc->ifv_ifp;
 
 	CURVNET_SET(vifp->if_vnet);
 
 	/*
 	 * The address bytes are already stored in the interface; this
 	 * call re-submits the very same address.
 	 */
 	if_setlladdr(vifp, IF_LLADDR(vifp), vifp->if_addrlen);
 
 	CURVNET_RESTORE();
 }
 
 /*
  * Attach vlan interface ifv to parent interface p with VLAN ID vid and
  * protocol proto.
  *
  * Creates the trunk on the parent if it does not have one yet, inserts ifv
  * into the trunk's hash, copies a set of fields from the parent into the
  * vlan's ifnet, marks the vlan running and finally invokes the vlan_config
  * event handlers.
  *
  * Returns 0 on success, EPROTONOSUPPORT if the parent's type or flags are
  * unsuitable, EINVAL for a reserved or out-of-range vid, EBUSY if ifv is
  * already attached to a trunk, or the error reported by vlan_inshash().
  */
 static int
 vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t vid,
 	uint16_t proto)
 {
 	struct epoch_tracker et;
 	struct ifvlantrunk *trunk;
 	struct ifnet *ifp;
 	int error = 0;
 
 	/*
 	 * We can handle non-ethernet hardware types as long as
 	 * they handle the tagging and headers themselves.
 	 */
 	if (p->if_type != IFT_ETHER &&
 	    p->if_type != IFT_L2VLAN &&
 	    (p->if_capenable & IFCAP_VLAN_HWTAGGING) == 0)
 		return (EPROTONOSUPPORT);
 	/* The parent must have every flag in VLAN_IFFLAGS set. */
 	if ((p->if_flags & VLAN_IFFLAGS) != VLAN_IFFLAGS)
 		return (EPROTONOSUPPORT);
 	/*
 	 * Don't let the caller set up a VLAN VID with
 	 * anything except VLID bits.
 	 * VID numbers 0x0 and 0xFFF are reserved.
 	 */
 	if (vid == 0 || vid == 0xFFF || (vid & ~EVL_VLID_MASK))
 		return (EINVAL);
 	if (ifv->ifv_trunk)
 		return (EBUSY);
 
 	VLAN_XLOCK();
 	if (p->if_vlantrunk == NULL) {
 		/* First vlan on this parent: create and publish the trunk. */
 		trunk = malloc(sizeof(struct ifvlantrunk),
 		    M_VLAN, M_WAITOK | M_ZERO);
 		vlan_inithash(trunk);
 		TRUNK_LOCK_INIT(trunk);
 		TRUNK_WLOCK(trunk);
 		p->if_vlantrunk = trunk;
 		trunk->parent = p;
 		if_ref(trunk->parent);
 		TRUNK_WUNLOCK(trunk);
 	} else {
 		trunk = p->if_vlantrunk;
 	}
 
 	ifv->ifv_vid = vid;	/* must set this before vlan_inshash() */
 	ifv->ifv_pcp = 0;       /* Default: best effort delivery. */
 	error = vlan_inshash(trunk, ifv);
 	if (error)
 		goto done;
 	ifv->ifv_proto = proto;
 	ifv->ifv_encaplen = ETHER_VLAN_ENCAP_LEN;
 	ifv->ifv_mintu = ETHERMIN;
 	ifv->ifv_pflags = 0;
 	/* All bits set: no parent capability is masked off yet. */
 	ifv->ifv_capenable = -1;
 
 	/*
 	 * If the parent supports the VLAN_MTU capability,
 	 * i.e. can Tx/Rx larger than ETHER_MAX_LEN frames,
 	 * use it.
 	 */
 	if (p->if_capenable & IFCAP_VLAN_MTU) {
 		/*
 		 * No need to fudge the MTU since the parent can
 		 * handle extended frames.
 		 */
 		ifv->ifv_mtufudge = 0;
 	} else {
 		/*
 		 * Fudge the MTU by the encapsulation size.  This
 		 * makes us incompatible with strictly compliant
 		 * 802.1Q implementations, but allows us to use
 		 * the feature with other NetBSD implementations,
 		 * which might still be useful.
 		 */
 		ifv->ifv_mtufudge = ifv->ifv_encaplen;
 	}
 
 	ifv->ifv_trunk = trunk;
 	ifp = ifv->ifv_ifp;
 	/*
 	 * Initialize fields from our parent.  This duplicates some
 	 * work with ether_ifattach() but allows for non-ethernet
 	 * interfaces to also work.
 	 */
 	ifp->if_mtu = p->if_mtu - ifv->ifv_mtufudge;
 	ifp->if_baudrate = p->if_baudrate;
 	ifp->if_input = p->if_input;
 	ifp->if_resolvemulti = p->if_resolvemulti;
 	ifp->if_addrlen = p->if_addrlen;
 	ifp->if_broadcastaddr = p->if_broadcastaddr;
 	ifp->if_pcp = ifv->ifv_pcp;
 
 	/*
 	 * We wrap the parent's if_output using vlan_output to ensure that it
 	 * can't become stale.
 	 */
 	ifp->if_output = vlan_output;
 
 	/*
 	 * Copy only a selected subset of flags from the parent.
 	 * Other flags are none of our business.
 	 */
 #define VLAN_COPY_FLAGS (IFF_SIMPLEX)
 	ifp->if_flags &= ~VLAN_COPY_FLAGS;
 	ifp->if_flags |= p->if_flags & VLAN_COPY_FLAGS;
 #undef VLAN_COPY_FLAGS
 
 	ifp->if_link_state = p->if_link_state;
 
 	/* vlan_capabilities() asserts that it runs inside the net epoch. */
 	NET_EPOCH_ENTER(et);
 	vlan_capabilities(ifv);
 	NET_EPOCH_EXIT(et);
 
 	/*
 	 * Set up our interface address to reflect the underlying
 	 * physical interface's.
 	 */
 	TASK_INIT(&ifv->lladdr_task, 0, vlan_lladdr_fn, ifv);
 	((struct sockaddr_dl *)ifp->if_addr->ifa_addr)->sdl_alen =
 	    p->if_addrlen;
 
 	/*
 	 * Do not schedule link address update if it was the same
 	 * as previous parent's. This helps avoid updating for each
 	 * associated llentry.
 	 */
 	if (memcmp(IF_LLADDR(p), IF_LLADDR(ifp), p->if_addrlen) != 0) {
 		bcopy(IF_LLADDR(p), IF_LLADDR(ifp), p->if_addrlen);
 		taskqueue_enqueue(taskqueue_thread, &ifv->lladdr_task);
 	}
 
 	/* We are ready for operation now. */
 	ifp->if_drv_flags |= IFF_DRV_RUNNING;
 
 	/* Update flags on the parent, if necessary. */
 	vlan_setflags(ifp, 1);
 
 	/*
 	 * Configure multicast addresses that may already be
 	 * joined on the vlan device.
 	 */
 	(void)vlan_setmulti(ifp);
 
 done:
 	/* Event handlers only hear about successful configurations. */
 	if (error == 0)
 		EVENTHANDLER_INVOKE(vlan_config, p, ifv->ifv_vid);
 	VLAN_XUNLOCK();
 
 	return (error);
 }
 
 /*
  * Detach vlan interface ifp from its parent: takes the exclusive VLAN lock
  * and calls vlan_unconfig_locked() with departing == 0.
  */
 static void
 vlan_unconfig(struct ifnet *ifp)
 {
 
 	VLAN_XLOCK();
 	vlan_unconfig_locked(ifp, 0);
 	VLAN_XUNLOCK();
 }
 
 /*
  * Detach vlan interface ifp from its trunk, if it has one.  The caller
  * must hold the exclusive VLAN lock (asserted below).
  *
  * When "departing" is non-zero the multicast addresses are not removed
  * from the parent via if_delmulti(); the local list is emptied either way.
  * The trunk is destroyed once its reference count has dropped to zero.
  * The vlan_unconfig event handlers are invoked only if a parent was
  * actually attached.
  */
 static void
 vlan_unconfig_locked(struct ifnet *ifp, int departing)
 {
 	struct ifvlantrunk *trunk;
 	struct vlan_mc_entry *mc;
 	struct ifvlan *ifv;
 	struct ifnet  *parent;
 	int error;
 
 	VLAN_XLOCK_ASSERT();
 
 	ifv = ifp->if_softc;
 	trunk = ifv->ifv_trunk;
 	parent = NULL;
 
 	if (trunk != NULL) {
 		parent = trunk->parent;
 
 		/*
 		 * Since the interface is being unconfigured, we need to
 		 * empty the list of multicast groups that we may have joined
 		 * while we were alive from the parent's list.
 		 */
 		while ((mc = CK_SLIST_FIRST(&ifv->vlan_mc_listhead)) != NULL) {
 			/*
 			 * If the parent interface is being detached,
 			 * all its multicast addresses have already
 			 * been removed.  Warn about errors if
 			 * if_delmulti() does fail, but don't abort as
 			 * all callers expect vlan destruction to
 			 * succeed.
 			 */
 			if (!departing) {
 				error = if_delmulti(parent,
 				    (struct sockaddr *)&mc->mc_addr);
 				if (error)
 					if_printf(ifp,
 		    "Failed to delete multicast address from parent: %d\n",
 					    error);
 			}
 			/* Unlink now; the entry is freed via a deferred epoch call. */
 			CK_SLIST_REMOVE_HEAD(&ifv->vlan_mc_listhead, mc_entries);
 			NET_EPOCH_CALL(vlan_mc_free, &mc->mc_epoch_ctx);
 		}
 
 		vlan_setflags(ifp, 0); /* clear special flags on parent */
 
 		vlan_remhash(trunk, ifv);
 		ifv->ifv_trunk = NULL;
 
 		/*
 		 * Check if we were the last.
 		 */
 		if (trunk->refcnt == 0) {
 			/* Unpublish the trunk and wait before destroying it. */
 			parent->if_vlantrunk = NULL;
 			NET_EPOCH_WAIT();
 			trunk_destroy(trunk);
 		}
 	}
 
 	/* Disconnect from parent. */
 	if (ifv->ifv_pflags)
 		if_printf(ifp, "%s: ifv_pflags unclean\n", __func__);
 	ifp->if_mtu = ETHERMTU;
 	ifp->if_link_state = LINK_STATE_UNKNOWN;
 	ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 
 	/*
 	 * Only dispatch an event if vlan was
 	 * attached, otherwise there is nothing
 	 * to cleanup anyway.
 	 */
 	if (parent != NULL)
 		EVENTHANDLER_INVOKE(vlan_unconfig, parent, ifv->ifv_vid);
 }
 
 /*
  * Propagate one reference-counted interface flag to the parent.
  *
  * "flag" is the flag bit, "status" selects between mirroring the vlan's
  * own if_flags bit (non-zero) and forcing the bit off (zero), and "func"
  * is the routine that acquires or releases the flag on the parent.
  * Returns 0, or the error returned by "func".
  */
 static int
 vlan_setflag(struct ifnet *ifp, int flag, int status,
 	     int (*func)(struct ifnet *, int))
 {
 	struct ifvlan *sc;
 	int rc, want;
 
 	VLAN_SXLOCK_ASSERT();
 
 	sc = ifp->if_softc;
 
 	/* "want" ends up as either the flag value itself or 0. */
 	want = 0;
 	if (status)
 		want = ifp->if_flags & flag;
 
 	/*
 	 * ifv_pflags records what has been requested from the parent on
 	 * our behalf, so that a flag we never set is never cleared.  The
 	 * parent's flags are not touched directly here; references to
 	 * them are obtained or dropped through "func", which keeps the
 	 * record in step with the parent.  Nothing to do when the record
 	 * already matches.
 	 */
 	if ((sc->ifv_pflags & flag) == want)
 		return (0);
 
 	rc = func(PARENT(sc), want);
 	if (rc)
 		return (rc);
 
 	sc->ifv_pflags = (sc->ifv_pflags & ~flag) | want;
 	return (0);
 }
 
 /*
  * Synchronize every flag listed in vlan_pflags[] with the parent.
  * A true "status" makes the parent follow our if_flags; a false "status"
  * forcibly drops whatever we had set on the parent.  Stops at, and
  * returns, the first error; returns 0 when all flags were handled.
  */
 static int
 vlan_setflags(struct ifnet *ifp, int status)
 {
 	int idx, rc;
 
 	/* The table is terminated by an entry whose flag is zero. */
 	idx = 0;
 	while (vlan_pflags[idx].flag) {
 		rc = vlan_setflag(ifp, vlan_pflags[idx].flag, status,
 		    vlan_pflags[idx].func);
 		if (rc)
 			return (rc);
 		idx++;
 	}
 	return (0);
 }
 
 /*
  * Parent link-state notification: copy the parent's baudrate to every vlan
  * on ifp's trunk and report the parent's link state on each of them.
  * Does nothing when ifp has no trunk.
  */
 static void
 vlan_link_state(struct ifnet *ifp)
 {
 	struct epoch_tracker et;
 	struct ifvlantrunk *trunk;
 	struct ifvlan *child;
 
 	NET_EPOCH_ENTER(et);
 	trunk = ifp->if_vlantrunk;
 	if (trunk != NULL) {
 		TRUNK_WLOCK(trunk);
 		VLAN_FOREACH(child, trunk) {
 			struct ifnet *vifp = child->ifv_ifp;
 
 			vifp->if_baudrate = trunk->parent->if_baudrate;
 			if_link_state_change(vifp,
 			    trunk->parent->if_link_state);
 		}
 		TRUNK_WUNLOCK(trunk);
 	}
 	NET_EPOCH_EXIT(et);
 }
 
 /*
  * Recompute the capabilities, enabled capabilities and hardware-assist
  * flags of vlan interface ifv from those of its parent.
  *
  * "cap" collects what the vlan can support, "ena" what is currently
  * enabled, and "hwa" the matching if_hwassist bits; all three are stored
  * into the vlan's ifnet at the end.  "mena" is the parent's enabled set
  * masked by ifv->ifv_capenable.  Asserts the net epoch and the VLAN lock.
  */
 static void
 vlan_capabilities(struct ifvlan *ifv)
 {
 	struct ifnet *p;
 	struct ifnet *ifp;
 	struct ifnet_hw_tsomax hw_tsomax;
 	int cap = 0, ena = 0, mena;
 	u_long hwa = 0;
 
 	NET_EPOCH_ASSERT();
 	VLAN_SXLOCK_ASSERT();
 
 	p = PARENT(ifv);
 	ifp = ifv->ifv_ifp;
 
 	/* Mask parent interface enabled capabilities disabled by user. */
 	mena = p->if_capenable & ifv->ifv_capenable;
 
 	/*
 	 * If the parent interface can do checksum offloading
 	 * on VLANs, then propagate its hardware-assisted
 	 * checksumming flags. Also assert that checksum
 	 * offloading requires hardware VLAN tagging.
 	 */
 	if (p->if_capabilities & IFCAP_VLAN_HWCSUM)
 		cap |= p->if_capabilities & (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6);
 	if (p->if_capenable & IFCAP_VLAN_HWCSUM &&
 	    p->if_capenable & IFCAP_VLAN_HWTAGGING) {
 		ena |= mena & (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6);
 		if (ena & IFCAP_TXCSUM)
 			hwa |= p->if_hwassist & (CSUM_IP | CSUM_TCP |
 			    CSUM_UDP | CSUM_SCTP);
 		if (ena & IFCAP_TXCSUM_IPV6)
 			hwa |= p->if_hwassist & (CSUM_TCP_IPV6 |
 			    CSUM_UDP_IPV6 | CSUM_SCTP_IPV6);
 	}
 
 	/*
 	 * If the parent interface can do TSO on VLANs then
 	 * propagate the hardware-assisted flag. TSO on VLANs
 	 * does not necessarily require hardware VLAN tagging.
 	 */
 	memset(&hw_tsomax, 0, sizeof(hw_tsomax));
 	if_hw_tsomax_common(p, &hw_tsomax);
 	if_hw_tsomax_update(ifp, &hw_tsomax);
 	if (p->if_capabilities & IFCAP_VLAN_HWTSO)
 		cap |= p->if_capabilities & IFCAP_TSO;
 	if (p->if_capenable & IFCAP_VLAN_HWTSO) {
 		ena |= mena & IFCAP_TSO;
 		if (ena & IFCAP_TSO)
 			hwa |= p->if_hwassist & CSUM_TSO;
 	}
 
 	/*
 	 * If the parent interface can do LRO and checksum offloading on
 	 * VLANs, then guess it may do LRO on VLANs.  False positive here
 	 * cost nothing, while false negative may lead to some confusions.
 	 *
 	 * NOTE(review): unlike the other capabilities, the enabled bit is
 	 * taken from p->if_capenable rather than "mena", so the user mask
 	 * in ifv_capenable is not applied to LRO -- confirm this is
 	 * intended.
 	 */
 	if (p->if_capabilities & IFCAP_VLAN_HWCSUM)
 		cap |= p->if_capabilities & IFCAP_LRO;
 	if (p->if_capenable & IFCAP_VLAN_HWCSUM)
 		ena |= p->if_capenable & IFCAP_LRO;
 
 	/*
 	 * If the parent interface can offload TCP connections over VLANs then
 	 * propagate its TOE capability to the VLAN interface.
 	 *
 	 * All TOE drivers in the tree today can deal with VLANs.  If this
 	 * changes then IFCAP_VLAN_TOE should be promoted to a full capability
 	 * with its own bit.
 	 */
 #define	IFCAP_VLAN_TOE IFCAP_TOE
 	if (p->if_capabilities & IFCAP_VLAN_TOE)
 		cap |= p->if_capabilities & IFCAP_TOE;
 	if (p->if_capenable & IFCAP_VLAN_TOE) {
 		TOEDEV(ifp) = TOEDEV(p);
 		ena |= mena & IFCAP_TOE;
 	}
 
 	/*
 	 * If the parent interface supports dynamic link state, so does the
 	 * VLAN interface.
 	 */
 	cap |= (p->if_capabilities & IFCAP_LINKSTATE);
 	ena |= (mena & IFCAP_LINKSTATE);
 
 #ifdef RATELIMIT
 	/*
 	 * If the parent interface supports ratelimiting, so does the
 	 * VLAN interface.
 	 */
 	cap |= (p->if_capabilities & IFCAP_TXRTLMT);
 	ena |= (mena & IFCAP_TXRTLMT);
 #endif
 
 	/*
 	 * If the parent interface supports unmapped mbufs, so does
 	 * the VLAN interface.  Note that this should be fine even for
 	 * interfaces that don't support hardware tagging as headers
 	 * are prepended in normal mbufs to unmapped mbufs holding
 	 * payload data.
 	 */
 	cap |= (p->if_capabilities & IFCAP_NOMAP);
 	ena |= (mena & IFCAP_NOMAP);
 
 	/*
 	 * If the parent interface can offload encryption and segmentation
 	 * of TLS records over TCP, propagate its capability to the VLAN
 	 * interface.
 	 *
 	 * All TLS drivers in the tree today can deal with VLANs.  If
 	 * this ever changes, then a new IFCAP_VLAN_TXTLS can be
 	 * defined.
 	 */
-	if (p->if_capabilities & IFCAP_TXTLS)
-		cap |= p->if_capabilities & IFCAP_TXTLS;
-	if (p->if_capenable & IFCAP_TXTLS)
-		ena |= mena & IFCAP_TXTLS;
+	if (p->if_capabilities & (IFCAP_TXTLS | IFCAP_TXTLS_RTLMT))
+		cap |= p->if_capabilities & (IFCAP_TXTLS | IFCAP_TXTLS_RTLMT);
+	if (p->if_capenable & (IFCAP_TXTLS | IFCAP_TXTLS_RTLMT))
+		ena |= mena & (IFCAP_TXTLS | IFCAP_TXTLS_RTLMT);
 
 	/* Publish the computed sets on the vlan interface. */
 	ifp->if_capabilities = cap;
 	ifp->if_capenable = ena;
 	ifp->if_hwassist = hwa;
 }
 
 static void
 vlan_trunk_capabilities(struct ifnet *ifp)
 {
 	struct epoch_tracker et;
 	struct ifvlantrunk *trunk;
 	struct ifvlan *ifv;
 
 	VLAN_SLOCK();
 	trunk = ifp->if_vlantrunk;
 	if (trunk == NULL) {
 		VLAN_SUNLOCK();
 		return;
 	}
 	NET_EPOCH_ENTER(et);
 	VLAN_FOREACH(ifv, trunk)
 		vlan_capabilities(ifv);
 	NET_EPOCH_EXIT(et);
 	VLAN_SUNLOCK();
 }
 
 /*
  * The if_ioctl method for vlan(4).
  *
  * "cmd" selects the request and "data" points at the request structure,
  * which is viewed as a struct ifreq for most commands, as a struct ifaddr
  * for SIOCSIFADDR and as a struct ifmediareq for SIOCGIFMEDIA.  Returns 0
  * on success or an errno value; unrecognized commands yield EINVAL.
  */
 static int
 vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 	struct ifnet *p;
 	struct ifreq *ifr;
 	struct ifaddr *ifa;
 	struct ifvlan *ifv;
 	struct ifvlantrunk *trunk;
 	struct vlanreq vlr;
 	int error = 0, oldmtu;
 
 	/* Two views of the same argument; which one applies depends on cmd. */
 	ifr = (struct ifreq *)data;
 	ifa = (struct ifaddr *) data;
 	ifv = ifp->if_softc;
 
 	switch (cmd) {
 	case SIOCSIFADDR:
 		/* Assigning an address also marks the interface up. */
 		ifp->if_flags |= IFF_UP;
 #ifdef INET
 		if (ifa->ifa_addr->sa_family == AF_INET)
 			arp_ifinit(ifp, ifa);
 #endif
 		break;
 	case SIOCGIFADDR:
 		/* Report the link-level address of the vlan interface. */
 		bcopy(IF_LLADDR(ifp), &ifr->ifr_addr.sa_data[0],
 		    ifp->if_addrlen);
 		break;
 	case SIOCGIFMEDIA:
 		/* Media queries are answered by the parent, if there is one. */
 		VLAN_SLOCK();
 		if (TRUNK(ifv) != NULL) {
 			p = PARENT(ifv);
 			if_ref(p);
 			error = (*p->if_ioctl)(p, SIOCGIFMEDIA, data);
 			if_rele(p);
 			/* Limit the result to the parent's current config. */
 			if (error == 0) {
 				struct ifmediareq *ifmr;
 
 				ifmr = (struct ifmediareq *)data;
 				if (ifmr->ifm_count >= 1 && ifmr->ifm_ulist) {
 					ifmr->ifm_count = 1;
 					error = copyout(&ifmr->ifm_current,
 						ifmr->ifm_ulist,
 						sizeof(int));
 				}
 			}
 		} else {
 			error = EINVAL;
 		}
 		VLAN_SUNLOCK();
 		break;
 
 	case SIOCSIFMEDIA:
 		/* Media cannot be set on a vlan interface. */
 		error = EINVAL;
 		break;
 
 	case SIOCSIFMTU:
 		/*
 		 * Set the interface MTU.
 		 *
 		 * The value must lie between (ifv_mintu - ifv_mtufudge) and
 		 * (parent MTU - ifv_mtufudge), and a parent must be attached.
 		 */
 		VLAN_SLOCK();
 		trunk = TRUNK(ifv);
 		if (trunk != NULL) {
 			TRUNK_WLOCK(trunk);
 			if (ifr->ifr_mtu >
 			     (PARENT(ifv)->if_mtu - ifv->ifv_mtufudge) ||
 			    ifr->ifr_mtu <
 			     (ifv->ifv_mintu - ifv->ifv_mtufudge))
 				error = EINVAL;
 			else
 				ifp->if_mtu = ifr->ifr_mtu;
 			TRUNK_WUNLOCK(trunk);
 		} else
 			error = EINVAL;
 		VLAN_SUNLOCK();
 		break;
 
 	case SIOCSETVLAN:
 #ifdef VIMAGE
 		/*
 		 * XXXRW/XXXBZ: The goal in these checks is to allow a VLAN
 		 * interface to be delegated to a jail without allowing the
 		 * jail to change what underlying interface/VID it is
 		 * associated with.  We are not entirely convinced that this
 		 * is the right way to accomplish that policy goal.
 		 */
 		if (ifp->if_vnet != ifp->if_home_vnet) {
 			error = EPERM;
 			break;
 		}
 #endif
 		error = copyin(ifr_data_get_ptr(ifr), &vlr, sizeof(vlr));
 		if (error)
 			break;
 		/* An empty parent name requests detaching from the parent. */
 		if (vlr.vlr_parent[0] == '\0') {
 			vlan_unconfig(ifp);
 			break;
 		}
 		p = ifunit_ref(vlr.vlr_parent);
 		if (p == NULL) {
 			error = ENOENT;
 			break;
 		}
 		oldmtu = ifp->if_mtu;
 		error = vlan_config(ifv, p, vlr.vlr_tag, vlr.vlr_proto);
 		/* Drop the reference obtained from ifunit_ref(). */
 		if_rele(p);
 
 		/*
 		 * VLAN MTU may change during addition of the vlandev.
 		 * If it did, do network layer specific procedure.
 		 */
 		if (ifp->if_mtu != oldmtu) {
 #ifdef INET6
 			nd6_setmtu(ifp);
 #endif
 			rt_updatemtu(ifp);
 		}
 		break;
 
 	case SIOCGETVLAN:
 #ifdef VIMAGE
 		if (ifp->if_vnet != ifp->if_home_vnet) {
 			error = EPERM;
 			break;
 		}
 #endif
 		/* An unattached vlan reports an all-zero vlanreq. */
 		bzero(&vlr, sizeof(vlr));
 		VLAN_SLOCK();
 		if (TRUNK(ifv) != NULL) {
 			strlcpy(vlr.vlr_parent, PARENT(ifv)->if_xname,
 			    sizeof(vlr.vlr_parent));
 			vlr.vlr_tag = ifv->ifv_vid;
 			vlr.vlr_proto = ifv->ifv_proto;
 		}
 		VLAN_SUNLOCK();
 		error = copyout(&vlr, ifr_data_get_ptr(ifr), sizeof(vlr));
 		break;
 
 	case SIOCSIFFLAGS:
 		/*
 		 * We should propagate selected flags to the parent,
 		 * e.g., promiscuous mode.
 		 */
 		VLAN_XLOCK();
 		if (TRUNK(ifv) != NULL)
 			error = vlan_setflags(ifp, 1);
 		VLAN_XUNLOCK();
 		break;
 
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		/*
 		 * If we don't have a parent, just remember the membership for
 		 * when we do.
 		 *
 		 * XXX We need the rmlock here to avoid sleeping while
 		 * holding in6_multi_mtx.
 		 */
 		VLAN_XLOCK();
 		trunk = TRUNK(ifv);
 		if (trunk != NULL)
 			error = vlan_setmulti(ifp);
 		VLAN_XUNLOCK();
 
 		break;
 	case SIOCGVLANPCP:
 #ifdef VIMAGE
 		if (ifp->if_vnet != ifp->if_home_vnet) {
 			error = EPERM;
 			break;
 		}
 #endif
 		ifr->ifr_vlan_pcp = ifv->ifv_pcp;
 		break;
 
 	case SIOCSVLANPCP:
 #ifdef VIMAGE
 		if (ifp->if_vnet != ifp->if_home_vnet) {
 			error = EPERM;
 			break;
 		}
 #endif
 		error = priv_check(curthread, PRIV_NET_SETVLANPCP);
 		if (error)
 			break;
 		/* Values above 7 are rejected. */
 		if (ifr->ifr_vlan_pcp > 7) {
 			error = EINVAL;
 			break;
 		}
 		ifv->ifv_pcp = ifr->ifr_vlan_pcp;
 		ifp->if_pcp = ifv->ifv_pcp;
 		/* broadcast event about PCP change */
 		EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_PCP);
 		break;
 
 	case SIOCSIFCAP:
 		/*
 		 * Record the requested capability mask and, when attached,
 		 * recompute the effective capabilities from the parent.
 		 */
 		VLAN_SLOCK();
 		ifv->ifv_capenable = ifr->ifr_reqcap;
 		trunk = TRUNK(ifv);
 		if (trunk != NULL) {
 			struct epoch_tracker et;
 
 			NET_EPOCH_ENTER(et);
 			vlan_capabilities(ifv);
 			NET_EPOCH_EXIT(et);
 		}
 		VLAN_SUNLOCK();
 		break;
 
 	default:
 		error = EINVAL;
 		break;
 	}
 
 	return (error);
 }
 
 #if defined(KERN_TLS) || defined(RATELIMIT)
 /*
  * The if_snd_tag_alloc method for vlan(4).
  *
  * Allocates a vlan_snd_tag wrapper, asks the parent interface to allocate
  * the real send tag into it and hands the wrapper's embedded tag back
  * through *ppmt.
  *
  * Returns 0 on success, EOPNOTSUPP if no parent is attached or the parent
  * has no if_snd_tag_alloc method, ENOMEM if the wrapper cannot be
  * allocated, or the error returned by the parent.
  */
 static int
 vlan_snd_tag_alloc(struct ifnet *ifp,
     union if_snd_tag_alloc_params *params,
     struct m_snd_tag **ppmt)
 {
 	struct epoch_tracker et;
 	struct vlan_snd_tag *wrapper;
 	struct ifvlan *sc;
 	struct ifnet *parent;
 	int rc;
 
 	/*
 	 * Look the parent up inside the net epoch and take a reference
 	 * on it before leaving the epoch.
 	 */
 	NET_EPOCH_ENTER(et);
 	sc = ifp->if_softc;
 	parent = (sc->ifv_trunk != NULL) ? PARENT(sc) : NULL;
 	if (parent == NULL || parent->if_snd_tag_alloc == NULL) {
 		NET_EPOCH_EXIT(et);
 		return (EOPNOTSUPP);
 	}
 	if_ref(parent);
 	NET_EPOCH_EXIT(et);
 
 	wrapper = malloc(sizeof(*wrapper), M_VLAN, M_NOWAIT);
 	if (wrapper == NULL) {
 		if_rele(parent);
 		return (ENOMEM);
 	}
 
 	/* The parent reference is dropped right after its allocator ran. */
 	rc = parent->if_snd_tag_alloc(parent, params, &wrapper->tag);
 	if_rele(parent);
 	if (rc) {
 		free(wrapper, M_VLAN);
 		return (rc);
 	}
 
 	/* The outer tag belongs to the vlan and has the inner tag's type. */
 	m_snd_tag_init(&wrapper->com, ifp, wrapper->tag->type);
 
 	*ppmt = &wrapper->com;
 	return (0);
 }
 
 static int
 vlan_snd_tag_modify(struct m_snd_tag *mst,
     union if_snd_tag_modify_params *params)
 {
 	struct vlan_snd_tag *vst;
 
 	vst = mst_to_vst(mst);
 	return (vst->tag->ifp->if_snd_tag_modify(vst->tag, params));
 }
 
 static int
 vlan_snd_tag_query(struct m_snd_tag *mst,
     union if_snd_tag_query_params *params)
 {
 	struct vlan_snd_tag *vst;
 
 	vst = mst_to_vst(mst);
 	return (vst->tag->ifp->if_snd_tag_query(vst->tag, params));
 }
 
 /*
  * The if_snd_tag_free method for vlan(4): release the reference held on
  * the wrapped tag, then free the vlan_snd_tag wrapper itself.
  */
 static void
 vlan_snd_tag_free(struct m_snd_tag *mst)
 {
 	struct vlan_snd_tag *wrapper = mst_to_vst(mst);
 
 	m_snd_tag_rele(wrapper->tag);
 	free(wrapper, M_VLAN);
 }
 #endif
diff --git a/sys/netinet/tcp_ratelimit.c b/sys/netinet/tcp_ratelimit.c
index 161ed0b5f33a..2ec2752e46ef 100644
--- a/sys/netinet/tcp_ratelimit.c
+++ b/sys/netinet/tcp_ratelimit.c
@@ -1,1508 +1,1562 @@
 /*-
  *
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2018-2020
  *	Netflix Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 /**
  * Author: Randall Stewart <rrs@netflix.com>
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_tcpdebug.h"
 #include "opt_ratelimit.h"
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/eventhandler.h>
 #include <sys/mutex.h>
 #include <sys/ck.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #define TCPSTATES		/* for logging */
 #include <netinet/tcp_var.h>
 #ifdef INET6
 #include <netinet6/tcp6_var.h>
 #endif
 #include <netinet/tcp_ratelimit.h>
 #ifndef USECS_IN_SECOND
 #define USECS_IN_SECOND 1000000
 #endif
 /*
  * For the purposes of each send, what is the size
  * of an ethernet frame.
  */
 MALLOC_DEFINE(M_TCPPACE, "tcp_hwpace", "TCP Hardware pacing memory");
 #ifdef RATELIMIT
 
 /*
  * The following preferred table will seem weird to
  * the casual viewer. Why do we not have any rates below
  * 1Mbps? Why do we have a rate at 1.44Mbps called common?
  * Why do the rates cluster in the 1-100Mbps range more
  * than others? Why does the table jump around at the beginning
  * and then rise more consistently?
  *
  * Let me try to answer those questions. A lot of
  * this is dependent on the hardware. We have three basic
  * supporters of rate limiting:
  *
  * Chelsio - Supporting 16 configurable rates.
  * Mlx  - c4 supporting 13 fixed rates.
  * Mlx  - c5 & c6 supporting 127 configurable rates.
  *
  * The c4 is why we have a common rate that is available
  * in all rate tables. This is a selected rate from the
  * c4 table and we assure its available in all ratelimit
  * tables. This way the tcp_ratelimit code has an assured
  * rate it should always be able to get. This answers a
  * couple of the questions above.
  *
  * So what about the rest, well the table is built to
  * try to get the most out of a joint hardware/software
  * pacing system.  The software pacer will always pick
  * a rate higher than the b/w that it is estimating
  * on the path. This is done for two reasons.
  * a) So we can discover more b/w
  * and
  * b) So we can send a block of MSS's down and then
  *    have the software timer go off after the previous
  *    send is completely out of the hardware.
  *
  * But when we do <b> we don't want to have the delay
  * between the last packet sent by the hardware be
  * excessively long (to reach our desired rate).
  *
  * So let me give an example for clarity.
  *
  * Lets assume that the tcp stack sees that 29,110,000 bps is
  * what the bw of the path is. The stack would select the
  * rate 31Mbps. 31Mbps means that each send that is done
  * by the hardware will cause a 390 micro-second gap between
  * the packets sent at that rate. For 29,110,000 bps we
  * would need 416 micro-seconds gap between each send.
  *
  * Note that we are calculating a complete time for pacing
  * which includes the ethernet, IP and TCP overhead. So
  * a full 1514 bytes is used for the above calculations.
  * My testing has shown that both cards are also using this
  * as their basis i.e. full payload size of the ethernet frame.
  * The TCP stack caller needs to be aware of this and make the
  * appropriate overhead calculations be included in its choices.
  *
  * Now, continuing our example, we pick a MSS size based on the
  * delta between the two rates (416 - 390) divided into the rate
  * we really wish to send at rounded up.  That results in a MSS
  * send of 17 mss's at once. The hardware then will
  * run out of data in a single 17MSS send in 6,630 micro-seconds.
  *
  * On the other hand the software pacer will send more data
  * in 7,072 micro-seconds. This means that we will refill
  * the hardware 52 microseconds after it would have sent
  * next if it had not run out of data. This is a win since we are
  * only sending every 7ms or so and yet all the packets are spaced on
  * the wire with 94% of what they should be and only
  * the last packet is delayed extra to make up for the
  * difference.
  *
  * Note that the above formula has two important caveats.
  * If we are above 100Mbps (b/w wise) we double the result
  * of the MSS calculation. The second caveat is if we are 500Mbps
  * or more we just send the maximum MSS at once i.e. 45MSS. At
  * the higher b/w's even the cards have limits to what times (timer granularity)
  * they can insert between packets and start to send more than one
  * packet at a time on the wire.
  *
  */
 /*
  * COMMON_RATE is the rate, in bytes per second, that also appears as
  * the second entry of desired_rates[] below.
  *
  * desired_rates[] holds rates in bytes per second; the label in each
  * comment is the approximate equivalent in bits per second (value * 8)
  * followed by the entry's 1-based position in the table.  The first 16
  * entries are in ascending order and the remaining entries form a
  * second ascending run.
  */
 #define COMMON_RATE 180500
 const uint64_t desired_rates[] = {
 	122500,			/* 1Mbps  - rate 1 */
 	180500,			/* 1.44Mbps - rate 2  common rate */
 	375000,			/* 3Mbps    - rate 3 */
 	625000,			/* 5Mbps    - rate 4 */
 	875000,			/* 7Mbps    - rate 5 */
 	1125000,		/* 9Mbps    - rate 6 */
 	1375000,		/* 11Mbps   - rate 7 */
 	1625000,	       	/* 13Mbps   - rate 8 */
 	2625000,		/* 21Mbps   - rate 9 */
 	3875000,		/* 31Mbps   - rate 10 */
 	5125000,		/* 41Mbps   - rate 11 */
 	12500000,		/* 100Mbps  - rate 12 */
 	25000000,		/* 200Mbps  - rate 13 */
 	50000000,		/* 400Mbps  - rate 14 */
 	63750000,		/* 510Mbps  - rate 15 */
 	100000000,		/* 800Mbps  - rate 16 */
 	1875000,		/* 15Mbps   - rate 17 */
 	2125000,		/* 17Mbps   - rate 18 */
 	2375000,		/* 19Mbps   - rate 19 */
 	2875000,		/* 23Mbps   - rate 20 */
 	3125000,		/* 25Mbps   - rate 21 */
 	3375000,		/* 27Mbps   - rate 22 */
 	3625000,		/* 29Mbps   - rate 23 */
 	4125000,		/* 33Mbps   - rate 24 */
 	4375000,		/* 35Mbps   - rate 25 */
 	4625000,		/* 37Mbps   - rate 26 */
 	4875000,		/* 39Mbps   - rate 27 */
 	5375000,		/* 43Mbps   - rate 28 */
 	5625000,		/* 45Mbps   - rate 29 */
 	5875000,		/* 47Mbps   - rate 30 */
 	6125000,		/* 49Mbps   - rate 31 */
 	6625000,		/* 53Mbps   - rate 32 */
 	6875000,		/* 55Mbps   - rate 33 */
 	7125000,		/* 57Mbps   - rate 34 */
 	7375000,		/* 59Mbps   - rate 35 */
 	7625000,		/* 61Mbps   - rate 36 */
 	7875000,		/* 63Mbps   - rate 37 */
 	8125000,		/* 65Mbps   - rate 38 */
 	8375000,		/* 67Mbps   - rate 39 */
 	8625000,		/* 69Mbps   - rate 40 */
 	8875000,		/* 71Mbps   - rate 41 */
 	9125000,		/* 73Mbps   - rate 42 */
 	9375000,		/* 75Mbps   - rate 43 */
 	9625000,		/* 77Mbps   - rate 44 */
 	9875000,		/* 79Mbps   - rate 45 */
 	10125000,		/* 81Mbps   - rate 46 */
 	10375000,		/* 83Mbps   - rate 47 */
 	10625000,		/* 85Mbps   - rate 48 */
 	10875000,		/* 87Mbps   - rate 49 */
 	11125000,		/* 89Mbps   - rate 50 */
 	11375000,		/* 91Mbps   - rate 51 */
 	11625000,		/* 93Mbps   - rate 52 */
 	11875000,		/* 95Mbps   - rate 53 */
 	13125000,		/* 105Mbps  - rate 54 */
 	13750000,		/* 110Mbps  - rate 55 */
 	14375000,		/* 115Mbps  - rate 56 */
 	15000000,		/* 120Mbps  - rate 57 */
 	15625000,		/* 125Mbps  - rate 58 */
 	16250000,		/* 130Mbps  - rate 59 */
 	16875000,		/* 135Mbps  - rate 60 */
 	17500000,		/* 140Mbps  - rate 61 */
 	18125000,		/* 145Mbps  - rate 62 */
 	18750000,		/* 150Mbps  - rate 63 */
 	20000000,		/* 160Mbps  - rate 64 */
 	21250000,		/* 170Mbps  - rate 65 */
 	22500000,		/* 180Mbps  - rate 66 */
 	23750000,		/* 190Mbps  - rate 67 */
 	26250000,		/* 210Mbps  - rate 68 */
 	27500000,		/* 220Mbps  - rate 69 */
 	28750000,		/* 230Mbps  - rate 70 */
 	30000000,	       	/* 240Mbps  - rate 71 */
 	31250000,		/* 250Mbps  - rate 72 */
 	34375000,		/* 275Mbps  - rate 73 */
 	37500000,		/* 300Mbps  - rate 74 */
 	40625000,		/* 325Mbps  - rate 75 */
 	43750000,		/* 350Mbps  - rate 76 */
 	46875000,		/* 375Mbps  - rate 77 */
 	53125000,		/* 425Mbps  - rate 78 */
 	56250000,		/* 450Mbps  - rate 79 */
 	59375000,		/* 475Mbps  - rate 80 */
 	62500000,		/* 500Mbps  - rate 81 */
 	68750000,		/* 550Mbps  - rate 82 */
 	75000000,		/* 600Mbps  - rate 83 */
 	81250000,		/* 650Mbps  - rate 84 */
 	87500000,		/* 700Mbps  - rate 85 */
 	93750000,		/* 750Mbps  - rate 86 */
 	106250000,		/* 850Mbps  - rate 87 */
 	112500000,		/* 900Mbps  - rate 88 */
 	125000000,		/* 1Gbps    - rate 89 */
 	156250000,		/* 1.25Gbps - rate 90 */
 	187500000,		/* 1.5Gbps  - rate 91 */
 	218750000,		/* 1.75Gbps - rate 92 */
 	250000000,		/* 2Gbps    - rate 93 */
 	281250000,		/* 2.25Gbps - rate 94 */
 	312500000,		/* 2.5Gbps  - rate 95 */
 	343750000,		/* 2.75Gbps - rate 96 */
 	375000000,		/* 3Gbps    - rate 97 */
 	500000000,		/* 4Gbps    - rate 98 */
 	625000000,		/* 5Gbps    - rate 99 */
 	750000000,		/* 6Gbps    - rate 100 */
 	875000000,		/* 7Gbps    - rate 101 */
 	1000000000,		/* 8Gbps    - rate 102 */
 	1125000000,		/* 9Gbps    - rate 103 */
 	1250000000,		/* 10Gbps   - rate 104 */
 	1875000000,		/* 15Gbps   - rate 105 */
 	2500000000		/* 20Gbps   - rate 106 */
 };
 
 /* Number of entries in desired_rates[]. */
 #define MAX_HDWR_RATES (sizeof(desired_rates)/sizeof(uint64_t))
 #define RS_ORDERED_COUNT 16	/*
 				 * Number of entries at the beginning
 				 * of desired_rates[] that are already
 				 * in ascending order; when more entries
 				 * than this are used the two ordered
 				 * groups have to be merged.
 				 */
 #define RS_NEXT_ORDER_GROUP 16	/*
 				 * Zero-based index in desired_rates[]
 				 * at which the second ordered group
 				 * starts (i.e. the 17th entry).
 				 */
 #define ALL_HARDWARE_RATES 1004 /*
 				 * 1Meg - 1Gig in 1 Meg steps
 				 * (1000 entries) plus 100k, 200k
 				 * and 500k, and 10Gig.
 				 */
 
 /* Thresholds in bits per second used by the rate lookup code. */
 #define RS_ONE_MEGABIT_PERSEC 1000000
 #define RS_ONE_GIGABIT_PERSEC 1000000000
 #define RS_TEN_GIGABIT_PERSEC 10000000000
 
 /* List of per-interface rate sets; entries are inserted under rs_mtx. */
 static struct head_tcp_rate_set int_rs;
 /* Mutex taken around list insertion and deferred-destroy bookkeeping. */
 static struct mtx rs_mtx;
 /* Incremented each time a rate set is created for an interface. */
 uint32_t rs_number_alive;
 /* Rate sets with a deferred destroy currently scheduled. */
 uint32_t rs_number_dead;
 
 /* net.inet.tcp.rl: parent node for the counters and per-interface nodes. */
 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, rl, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "TCP Ratelimit stats");
 SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, alive, CTLFLAG_RW,
     &rs_number_alive, 0,
     "Number of interfaces initialized for ratelimiting");
 SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, dead, CTLFLAG_RW,
     &rs_number_dead, 0,
     "Number of interfaces departing from ratelimiting");
 
 /*
  * Attach the per-interface sysctl leaves for a rate set.
  *
  * rl_sysctl_root - node under which the leaves are created.
  * rs             - rate set whose fields are exported; all OIDs are
  *                  registered in rs->sysctl_ctx.
  *
  * "disable" is read-only when the set is flagged RS_INTF_NO_SUP and
  * read-write otherwise.  When DETAILED_RATELIMIT_SYSCTL is defined a
  * "rate" subtree with one numbered node per table entry is added too.
  */
 static void
 rl_add_syctl_entries(struct sysctl_oid *rl_sysctl_root, struct tcp_rate_set *rs)
 {
 	/*
 	 * Add sysctl entries for this interface.
 	 */
 	if (rs->rs_flags & RS_INTF_NO_SUP) {
 		SYSCTL_ADD_S32(&rs->sysctl_ctx,
 		   SYSCTL_CHILDREN(rl_sysctl_root),
 		   OID_AUTO, "disable", CTLFLAG_RD,
 		   &rs->rs_disable, 0,
 		   "Disable this interface from new hdwr limiting?");
 	} else {
 		SYSCTL_ADD_S32(&rs->sysctl_ctx,
 		   SYSCTL_CHILDREN(rl_sysctl_root),
 		   OID_AUTO, "disable", CTLFLAG_RW,
 		   &rs->rs_disable, 0,
 		   "Disable this interface from new hdwr limiting?");
 	}
 	SYSCTL_ADD_S32(&rs->sysctl_ctx,
 	    SYSCTL_CHILDREN(rl_sysctl_root),
 	    OID_AUTO, "minseg", CTLFLAG_RW,
 	    &rs->rs_min_seg, 0,
 	    "What is the minimum we need to send on this interface?");
 	SYSCTL_ADD_U64(&rs->sysctl_ctx,
 	    SYSCTL_CHILDREN(rl_sysctl_root),
 	    OID_AUTO, "flow_limit", CTLFLAG_RW,
 	    &rs->rs_flow_limit, 0,
 	    "What is the limit for number of flows (0=unlimited)?");
 	SYSCTL_ADD_S32(&rs->sysctl_ctx,
 	    SYSCTL_CHILDREN(rl_sysctl_root),
 	    OID_AUTO, "highest", CTLFLAG_RD,
 	    &rs->rs_highest_valid, 0,
 	    "Highest valid rate");
 	SYSCTL_ADD_S32(&rs->sysctl_ctx,
 	    SYSCTL_CHILDREN(rl_sysctl_root),
 	    OID_AUTO, "lowest", CTLFLAG_RD,
 	    &rs->rs_lowest_valid, 0,
 	    "Lowest valid rate");
 	SYSCTL_ADD_S32(&rs->sysctl_ctx,
 	    SYSCTL_CHILDREN(rl_sysctl_root),
 	    OID_AUTO, "flags", CTLFLAG_RD,
 	    &rs->rs_flags, 0,
 	    "What flags are on the entry?");
 	SYSCTL_ADD_S32(&rs->sysctl_ctx,
 	    SYSCTL_CHILDREN(rl_sysctl_root),
 	    OID_AUTO, "numrates", CTLFLAG_RD,
 	    &rs->rs_rate_cnt, 0,
 	    "How many rates are there?");
 	SYSCTL_ADD_U64(&rs->sysctl_ctx,
 	    SYSCTL_CHILDREN(rl_sysctl_root),
 	    OID_AUTO, "flows_using", CTLFLAG_RD,
 	    &rs->rs_flows_using, 0,
 	    "How many flows are using this interface now?");
 #ifdef DETAILED_RATELIMIT_SYSCTL
 	if (rs->rs_rlt && rs->rs_rate_cnt > 0) {
 		/*  Lets display the rates */
 		int i;
 		struct sysctl_oid *rl_rates;
 		struct sysctl_oid *rl_rate_num;
 		char rate_num[16];
 		rl_rates = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
 					    SYSCTL_CHILDREN(rl_sysctl_root),
 					    OID_AUTO,
 					    "rate",
 					    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 					    "Ratelist");
 		for( i = 0; i < rs->rs_rate_cnt; i++) {
 			/* Node name is the table index; bound the write. */
 			snprintf(rate_num, sizeof(rate_num), "%d", i);
 			rl_rate_num = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
 					    SYSCTL_CHILDREN(rl_rates),
 					    OID_AUTO,
 					    rate_num,
 					    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 					    "Individual Rate");
 			SYSCTL_ADD_U32(&rs->sysctl_ctx,
 				       SYSCTL_CHILDREN(rl_rate_num),
 				       OID_AUTO, "flags", CTLFLAG_RD,
 				       &rs->rs_rlt[i].flags, 0,
 				       "Flags on this rate");
 			SYSCTL_ADD_U32(&rs->sysctl_ctx,
 				       SYSCTL_CHILDREN(rl_rate_num),
 				       OID_AUTO, "pacetime", CTLFLAG_RD,
 				       &rs->rs_rlt[i].time_between, 0,
 				       "Time hardware inserts between 1500 byte sends");
 			SYSCTL_ADD_U64(&rs->sysctl_ctx,
 				       SYSCTL_CHILDREN(rl_rate_num),
 				       OID_AUTO, "rate", CTLFLAG_RD,
 				       &rs->rs_rlt[i].rate, 0,
 				       "Rate in bytes per second");
 		}
 	}
 #endif
 }
 
 /*
  * Epoch callback scheduled by rs_defer_destroy(): clears the pending
  * flag, drops the dead count and frees the rate set unless it has
  * picked up users in the meantime.
  */
 static void
 rs_destroy(epoch_context_t ctx)
 {
 	struct tcp_rate_set *rs =
 	    __containerof(ctx, struct tcp_rate_set, rs_epoch_ctx);
 	bool still_used;
 
 	mtx_lock(&rs_mtx);
 	rs->rs_flags &= ~RS_FUNERAL_SCHD;
 	/*
 	 * A lookup may have found this set while it was being marked
 	 * dead and bumped rs_flows_using after the count was seen as
 	 * zero and this callback was scheduled.  Users only hold a
 	 * pointer and rs_flows_using is merely updated atomically, so
 	 * re-check the count here before tearing anything down.
 	 */
 	still_used = (rs->rs_flows_using != 0);
 	rs_number_dead--;
 	mtx_unlock(&rs_mtx);
 
 	if (still_used)
 		return;
 
 	sysctl_ctx_free(&rs->sysctl_ctx);
 	free(rs->rs_rlt, M_TCPPACE);
 	free(rs, M_TCPPACE);
 }
 
 /*
  * Schedule rs_destroy() for a rate set via NET_EPOCH_CALL, at most once
  * at a time.  The caller must hold rs_mtx.
  */
 static void
 rs_defer_destroy(struct tcp_rate_set *rs)
 {
 
 	mtx_assert(&rs_mtx, MA_OWNED);
 
 	/* RS_FUNERAL_SCHD marks a destroy that is already queued. */
 	if ((rs->rs_flags & RS_FUNERAL_SCHD) == 0) {
 		rs_number_dead++;
 		rs->rs_flags |= RS_FUNERAL_SCHD;
 		NET_EPOCH_CALL(rs_destroy, &rs->rs_epoch_ctx);
 	}
 }
 
 #ifdef INET
 extern counter_u64_t rate_limit_set_ok;
 extern counter_u64_t rate_limit_active;
 extern counter_u64_t rate_limit_alloc_fail;
 #endif
 
 static int
 rl_attach_txrtlmt(struct ifnet *ifp,
     uint32_t flowtype,
     int flowid,
     uint64_t cfg_rate,
     struct m_snd_tag **tag)
 {
 	int error;
 	union if_snd_tag_alloc_params params = {
 		.rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
 		.rate_limit.hdr.flowid = flowid,
 		.rate_limit.hdr.flowtype = flowtype,
 		.rate_limit.max_rate = cfg_rate,
 		.rate_limit.flags = M_NOWAIT,
 	};
 
 	if (ifp->if_snd_tag_alloc == NULL) {
 		error = EOPNOTSUPP;
 	} else {
 		error = ifp->if_snd_tag_alloc(ifp, &params, tag);
 #ifdef INET
 		if (error == 0) {
 			if_ref((*tag)->ifp);
 			counter_u64_add(rate_limit_set_ok, 1);
 			counter_u64_add(rate_limit_active, 1);
 		} else
 			counter_u64_add(rate_limit_alloc_fail, 1);
 #endif
 	}
 	return (error);
 }
 
 /*
  * Fill rs->rs_rlt[0 .. rs_rate_cnt-1] by merging the two ordered groups
  * of the canned table: entries [0, RS_NEXT_ORDER_GROUP) and entries
  * [RS_NEXT_ORDER_GROUP, MAX_HDWR_RATES).  Each step takes the smaller
  * of the two group heads; once a group is used up the other supplies
  * the remaining slots.  flags and time_between are zeroed per slot.
  */
 static void
 populate_canned_table(struct tcp_rate_set *rs, const uint64_t *rate_table_act)
 {
 	int slot;
 	int lo = 0;
 	int hi = RS_NEXT_ORDER_GROUP;
 	uint8_t lo_done = 0, hi_done = 0;
 
 	for (slot = 0; slot < rs->rs_rate_cnt; slot++) {
 		rs->rs_rlt[slot].flags = 0;
 		rs->rs_rlt[slot].time_between = 0;
 
 		/*
 		 * hi_done is tested before the comparison so the high
 		 * group is never indexed after it has been exhausted.
 		 */
 		if (!lo_done &&
 		    (hi_done || rate_table_act[lo] < rate_table_act[hi])) {
 			rs->rs_rlt[slot].rate = rate_table_act[lo];
 			lo++;
 			if (lo == RS_NEXT_ORDER_GROUP)
 				lo_done = 1;
 			continue;
 		}
 		if (hi_done)
 			continue;
 		rs->rs_rlt[slot].rate = rate_table_act[hi];
 		hi++;
 		if (hi == MAX_HDWR_RATES)
 			hi_done = 1;
 	}
 }
 
 /*
  * Count a freshly built rate set as alive, create its sysctl subtree
  * under net.inet.tcp.rl (named after the interface) and link it at the
  * head of int_rs.  rs_mtx is taken only around the list insertion.
  */
 static void
 rt_publish_new_rs(struct tcp_rate_set *rs)
 {
 	struct sysctl_oid *rl_sysctl_root;
 
 	rs_number_alive++;
 	sysctl_ctx_init(&rs->sysctl_ctx);
 	rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
 	    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
 	    OID_AUTO,
 	    rs->rs_ifp->if_xname,
 	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 	    "");
 	rl_add_syctl_entries(rl_sysctl_root, rs);
 	mtx_lock(&rs_mtx);
 	CK_LIST_INSERT_HEAD(&int_rs, rs, next);
 	mtx_unlock(&rs_mtx);
 }
 
 /*
  * Build and publish the rate set for an interface from what the driver
  * reports through if_ratelimit_query.
  *
  * ifp   - interface to set up.
  * error - optional; receives ENOMEM on allocation failure or the error
  *         from rate setup / tag allocation.  Not every NULL return
  *         stores a value here.
  *
  * Returns the new rate set, already linked onto int_rs, or NULL.
  * RT_IS_UNUSABLE and RT_IS_INDIRECT produce a set without a rate table.
  * RT_IS_FIXED_TABLE uses the driver's table and marks every entry
  * initialized without allocating tags.  RT_IS_SELECTABLE uses
  * desired_rates[] (or a generated 1Mbps-step table when the driver
  * offers at least ALL_HARDWARE_RATES rates) and allocates one send tag
  * per rate, highest first.  If a lower rate fails after the highest one
  * succeeded, the partially populated set is still returned and *error
  * carries the failure.
  *
  * rs_mtx must not be held by the caller: it is acquired here around the
  * list insertion.
  */
 static struct tcp_rate_set *
 rt_setup_new_rs(struct ifnet *ifp, int *error)
 {
 	struct tcp_rate_set *rs;
 	const uint64_t *rate_table_act;
 	uint64_t lentim, res;
 	size_t sz;
 	uint32_t hash_type;
 	int i;
 	struct if_ratelimit_query_results rl;
 
 	if (ifp->if_ratelimit_query == NULL) {
 		/*
 		 * We can do nothing if we cannot
 		 * get a query back from the driver.
 		 */
 		printf("Warning:No query functions for %s:%d-- failed\n",
 		       ifp->if_dname, ifp->if_dunit);
 		return (NULL);
 	}
 	rs = malloc(sizeof(struct tcp_rate_set), M_TCPPACE, M_NOWAIT | M_ZERO);
 	if (rs == NULL) {
 		if (error)
 			*error = ENOMEM;
 		printf("Warning:No memory for malloc of tcp_rate_set\n");
 		return (NULL);
 	}
 	memset(&rl, 0, sizeof(rl));
 	rl.flags = RT_NOSUPPORT;
 	ifp->if_ratelimit_query(ifp, &rl);
 	if (rl.flags & RT_IS_UNUSABLE) {
 		/*
 		 * The interface does not really support
 		 * the rate-limiting.  rs is still all zeroes from
 		 * M_ZERO, so only the non-zero fields are set.
 		 */
 		rs->rs_ifp = ifp;
 		rs->rs_if_dunit = ifp->if_dunit;
 		rs->rs_flags = RS_INTF_NO_SUP;
 		rs->rs_disable = 1;
 		rt_publish_new_rs(rs);
 		return (rs);
 	} else if ((rl.flags & RT_IS_INDIRECT) == RT_IS_INDIRECT) {
 		rs->rs_ifp = ifp;
 		rs->rs_if_dunit = ifp->if_dunit;
 		rs->rs_flags = RS_IS_DEFF;
 		rt_publish_new_rs(rs);
 		return (rs);
 	} else if ((rl.flags & RT_IS_FIXED_TABLE) == RT_IS_FIXED_TABLE) {
 		/* Mellanox C4 likely */
 		rs->rs_ifp = ifp;
 		rs->rs_if_dunit = ifp->if_dunit;
 		rs->rs_rate_cnt = rl.number_of_rates;
 		rs->rs_min_seg = rl.min_segment_burst;
 		rs->rs_highest_valid = 0;
 		rs->rs_flow_limit = rl.max_flows;
 		rs->rs_flags = RS_IS_INTF | RS_NO_PRE;
 		rs->rs_disable = 0;
 		rate_table_act = rl.rate_table;
 	} else if ((rl.flags & RT_IS_SELECTABLE) == RT_IS_SELECTABLE) {
 		/* Chelsio, C5 and C6 of Mellanox? */
 		rs->rs_ifp = ifp;
 		rs->rs_if_dunit = ifp->if_dunit;
 		rs->rs_rate_cnt = rl.number_of_rates;
 		rs->rs_min_seg = rl.min_segment_burst;
 		rs->rs_disable = 0;
 		rs->rs_flow_limit = rl.max_flows;
 		rate_table_act = desired_rates;
 		if ((rs->rs_rate_cnt > MAX_HDWR_RATES) &&
 		    (rs->rs_rate_cnt < ALL_HARDWARE_RATES)) {
 			/*
 			 * Our desired table is not big
 			 * enough, do what we can.
 			 */
 			rs->rs_rate_cnt = MAX_HDWR_RATES;
 		 }
 		if (rs->rs_rate_cnt <= RS_ORDERED_COUNT)
 			rs->rs_flags = RS_IS_INTF;
 		else
 			rs->rs_flags = RS_IS_INTF | RS_INT_TBL;
 		if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)
 			rs->rs_rate_cnt = ALL_HARDWARE_RATES;
 	} else {
 		free(rs, M_TCPPACE);
 		return (NULL);
 	}
 	if (rs->rs_rate_cnt <= 0) {
 		/*
 		 * Without any rates there is no table to build, and the
 		 * code below would index rs_rlt[rs_rate_cnt - 1].
 		 */
 		free(rs, M_TCPPACE);
 		return (NULL);
 	}
 	sz = sizeof(struct tcp_hwrate_limit_table) * rs->rs_rate_cnt;
 	/*
 	 * Zero the table: if tag allocation stops part way through the
 	 * loop below, the untouched entries must not appear to carry
 	 * HDWRPACE_INITED to the lookup code.
 	 */
 	rs->rs_rlt = malloc(sz, M_TCPPACE, M_NOWAIT | M_ZERO);
 	if (rs->rs_rlt == NULL) {
 		if (error)
 			*error = ENOMEM;
 bail:
 		free(rs, M_TCPPACE);
 		return (NULL);
 	}
 	if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES) {
 		/*
 		 * The interface supports all
 		 * the rates we could possibly want.
 		 */
 		uint64_t rat;
 
 		rs->rs_rlt[0].rate = 12500;	/* 100k */
 		rs->rs_rlt[1].rate = 25000;	/* 200k */
 		rs->rs_rlt[2].rate = 62500;	/* 500k */
 		/* Note 125000 == 1Megabit
 		 * populate 1Meg - 1000meg.
 		 */
 		for(i = 3, rat = 125000; i< (ALL_HARDWARE_RATES-1); i++) {
 			rs->rs_rlt[i].rate = rat;
 			rat += 125000;
 		}
 		rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate = 1250000000;
 	} else if (rs->rs_flags & RS_INT_TBL) {
 		/* We populate this in a special way */
 		populate_canned_table(rs, rate_table_act);
 	} else {
 		/*
 		 * Just copy in the rates from
 		 * the table, it is in order.
 		 */
 		for (i=0; i<rs->rs_rate_cnt; i++) {
 			rs->rs_rlt[i].rate = rate_table_act[i];
 			rs->rs_rlt[i].time_between = 0;
 			rs->rs_rlt[i].flags = 0;
 		}
 	}
 	for (i = (rs->rs_rate_cnt - 1); i >= 0; i--) {
 		/*
 		 * We go backwards through the list so that if we can't get
 		 * a rate and fail to init one, we have at least a chance of
 		 * getting the highest one.
 		 */
 		rs->rs_rlt[i].ptbl = rs;
 		rs->rs_rlt[i].tag = NULL;
 		/*
 		 * Calculate the time between: microseconds needed for one
 		 * ETHERNET_SEGMENT_SIZE frame at this rate, minimum 1.
 		 */
 		lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND;
 		res = lentim / rs->rs_rlt[i].rate;
 		if (res > 0)
 			rs->rs_rlt[i].time_between = res;
 		else
 			rs->rs_rlt[i].time_between = 1;
 		if (rs->rs_flags & RS_NO_PRE) {
 			rs->rs_rlt[i].flags = HDWRPACE_INITED;
 			rs->rs_lowest_valid = i;
 		} else {
 			int err;
 
 			/*
 			 * Only call the setup hook when the driver
 			 * actually provides one.
 			 */
 			if ((rl.flags & RT_IS_SETUP_REQ)  &&
 			    (ifp->if_ratelimit_setup != NULL)) {
 				err = ifp->if_ratelimit_setup(ifp,
   				         rs->rs_rlt[i].rate, i);
 				if (err)
 					goto handle_err;
 			}
 #ifdef RSS
 			hash_type = M_HASHTYPE_RSS_TCP_IPV4;
 #else
 			hash_type = M_HASHTYPE_OPAQUE_HASH;
 #endif
 			err = rl_attach_txrtlmt(ifp,
 			    hash_type,
 			    (i + 1),
 			    rs->rs_rlt[i].rate,
 			    &rs->rs_rlt[i].tag);
 			if (err) {
 handle_err:
 				if (i == (rs->rs_rate_cnt - 1)) {
 					/*
 					 * Huh - first rate and we can't get
 					 * it?
 					 */
 					free(rs->rs_rlt, M_TCPPACE);
 					if (error)
 						*error = err;
 					goto bail;
 				} else {
 					if (error)
 						*error = err;
 				}
 				break;
 			} else {
 				rs->rs_rlt[i].flags = HDWRPACE_INITED | HDWRPACE_TAGPRESENT;
 				rs->rs_lowest_valid = i;
 			}
 		}
 	}
 	/* Did we get at least 1 rate? */
 	if (rs->rs_rlt[(rs->rs_rate_cnt - 1)].flags & HDWRPACE_INITED)
 		rs->rs_highest_valid = rs->rs_rate_cnt - 1;
 	else {
 		free(rs->rs_rlt, M_TCPPACE);
 		goto bail;
 	}
 	rt_publish_new_rs(rs);
 	return (rs);
 }
 
 /*
  * Index-based lookup for the generated table that rt_setup_new_rs()
  * builds when a rate set has ALL_HARDWARE_RATES entries: slots 0-2 hold
  * the 100k/200k/500k rates, the following slots hold 1Mbps steps up to
  * 1000Mbps, and the last slot (ALL_HARDWARE_RATES-1) holds 10Gbps.
  *
  * rs            - rate set to search.
  * bytes_per_sec - requested rate in bytes per second.
  * flags         - RS_PACING_* selection flags.  RS_PACING_LT is tested
  *                 first, then RS_PACING_EXACT_MATCH; otherwise the
  *                 greater-than path (optionally RS_PACING_GEQ) is used.
  *
  * Returns the selected entry, or NULL.  When nothing was selected but an
  * alternate entry was noted and RS_PACING_SUB_OK is set, the alternate
  * is returned instead.
  */
 static const struct tcp_hwrate_limit_table *
 tcp_int_find_suitable_rate(const struct tcp_rate_set *rs,
     uint64_t bytes_per_sec, uint32_t flags)
 {
 	struct tcp_hwrate_limit_table *arte = NULL, *rte = NULL;
 	uint64_t mbits_per_sec, ind_calc;
 	int i;
 
 	/* Despite its name this holds bits per second, not megabits. */
 	mbits_per_sec = (bytes_per_sec * 8);
 	if (flags & RS_PACING_LT) {
 		if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
 		    (rs->rs_lowest_valid <= 2)){
 			/*
 			 * Smaller than 1Meg, only
 			 * 3 entries can match it.
 			 */
 			for(i = rs->rs_lowest_valid; i < 3; i++) {
 				if (bytes_per_sec <= rs->rs_rlt[i].rate) {
 					rte = &rs->rs_rlt[i];
 					break;
 				} else if (rs->rs_rlt[i].flags & HDWRPACE_INITED) {
 					/* Slower initialized entry: keep as alternate. */
 					arte = &rs->rs_rlt[i];
 				}
 			}
 			goto done;
 		} else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) &&
 			   (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){
 			/*
 			 * Larger than 1G (the majority of
 			 * our table.
 			 */
 			if (mbits_per_sec < RS_TEN_GIGABIT_PERSEC)
 				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
 			else
 				arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
 			goto done;
 		}
 		/*
 		 * If we reach here its in our table (between 1Meg - 1000Meg),
 		 * just take the rounded down mbits per second, and add
 		 * 1Megabit to it, from this we can calculate
 		 * the index in the table.
 		 */
 		ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
 		if ((ind_calc * RS_ONE_MEGABIT_PERSEC) != mbits_per_sec)
 			ind_calc++;
 		/* our table is offset by 3, we add 2 */
 		ind_calc += 2;
 		if (ind_calc > (ALL_HARDWARE_RATES-1)) {
 			/* This should not happen */
 			ind_calc = ALL_HARDWARE_RATES-1;
 		}
 		/*
 		 * The assignment below is the body of this if, even though
 		 * it is not indented; it only runs when the index lies in
 		 * the valid range.
 		 */
 		if ((ind_calc >= rs->rs_lowest_valid) &&
 		    (ind_calc <= rs->rs_highest_valid))
 		rte = &rs->rs_rlt[ind_calc];
 	} else if (flags & RS_PACING_EXACT_MATCH) {
 		if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
 		    (rs->rs_lowest_valid <= 2)){
 			for(i = rs->rs_lowest_valid; i < 3; i++) {
 				if (bytes_per_sec == rs->rs_rlt[i].rate) {
 					rte = &rs->rs_rlt[i];
 					break;
 				}
 			}
 		} else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) &&
 			   (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
 			/* > 1Gbps only one rate */
 			if (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) {
 				/* Its 10G wow */
 				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
 			}
 		} else {
 			/* Ok it must be a exact meg (its between 1G and 1Meg) */
 			ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
 			if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) {
 				/* its an exact Mbps */
 				ind_calc += 2;
 				if (ind_calc > (ALL_HARDWARE_RATES-1)) {
 					/* This should not happen */
 					ind_calc = ALL_HARDWARE_RATES-1;
 				}
 				if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED)
 					rte = &rs->rs_rlt[ind_calc];
 			}
 		}
 	} else {
 		/* we want greater than the requested rate */
 		if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
 		    (rs->rs_lowest_valid <= 2)){
 			arte = &rs->rs_rlt[3]; /* set alternate to 1Meg */
 			/* Walk the sub-1Mbps slots from fastest to slowest. */
 			for (i=2; i>=rs->rs_lowest_valid; i--) {
 				if (bytes_per_sec < rs->rs_rlt[i].rate) {
 					rte = &rs->rs_rlt[i];
 					break;
 				} else if ((flags & RS_PACING_GEQ) &&
 					   (bytes_per_sec == rs->rs_rlt[i].rate)) {
 					rte = &rs->rs_rlt[i];
 					break;
 				} else {
 					arte = &rs->rs_rlt[i]; /* new alternate */
 				}
 			}
 		} else if (mbits_per_sec > RS_ONE_GIGABIT_PERSEC) {
 			if ((bytes_per_sec < rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) &&
 			    (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){
 				/* Our top rate is larger than the request */
 				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
 			} else if ((flags & RS_PACING_GEQ) &&
 				   (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) &&
 				   (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
 				/* It matches our top rate */
 				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
 			} else if (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED) {
 				/* The top rate is an alternative */
 				arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
 			}
 		} else {
 			/* Its in our range 1Meg - 1Gig */
 			if (flags & RS_PACING_GEQ) {
 				/*
 				 * NOTE(review): unlike the other in-range
 				 * paths, this one neither adds the +2 table
 				 * offset nor checks HDWRPACE_INITED, and a
 				 * request that is not an exact Mbps selects
 				 * nothing here -- confirm this is intended.
 				 */
 				ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
 				if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) {
 					if (ind_calc > (ALL_HARDWARE_RATES-1)) {
 						/* This should not happen */
 						ind_calc = (ALL_HARDWARE_RATES-1);
 					}
 					rte = &rs->rs_rlt[ind_calc];
 				}
 				goto done;
 			}
 			/* Round the request up to the next whole Mbps. */
 			ind_calc = (mbits_per_sec + (RS_ONE_MEGABIT_PERSEC-1))/RS_ONE_MEGABIT_PERSEC;
 			ind_calc += 2;
 			if (ind_calc > (ALL_HARDWARE_RATES-1)) {
 				/* This should not happen */
 				ind_calc = ALL_HARDWARE_RATES-1;
 			}
 			if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED)
 				rte = &rs->rs_rlt[ind_calc];
 		}
 	}
 done:
 	if ((rte == NULL) &&
 	    (arte != NULL) &&
 	    (flags & RS_PACING_SUB_OK)) {
 		/* We can use the substitute */
 		rte = arte;
 	}
 	return (rte);
 }
 
 /*
  * Pick an entry from a rate set's table for the requested rate.
  *
  * rs            - rate set to search; only entries between
  *                 rs_lowest_valid and rs_highest_valid are scanned.
  * bytes_per_sec - requested rate in bytes per second.
  * flags         - RS_PACING_* selection flags (see below).
  *
  * Returns the chosen entry or NULL.  Rate sets flagged RS_INT_TBL with
  * at least ALL_HARDWARE_RATES entries are handed to
  * tcp_int_find_suitable_rate() instead of being scanned.
  */
 static const struct tcp_hwrate_limit_table *
 tcp_find_suitable_rate(const struct tcp_rate_set *rs, uint64_t bytes_per_sec, uint32_t flags)
 {
 	/**
 	 * Hunt the rate table with the restrictions in flags and find a
 	 * suitable rate if possible.
 	 * RS_PACING_EXACT_MATCH - look for an exact match to rate.
 	 * RS_PACING_GT     - must be greater than.
 	 * RS_PACING_GEQ    - must be greater than or equal.
 	 * RS_PACING_LT     - must be less than.
 	 * RS_PACING_SUB_OK - If we don't meet criteria a
 	 *                    substitute is ok.
 	 */
 	int i, matched;
 	struct tcp_hwrate_limit_table *rte = NULL;
 
 	if ((rs->rs_flags & RS_INT_TBL) &&
 	    (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)) {
 		/*
 		 * Here we don't want to paw thru
 		 * a big table, we have everything
 		 * from 1Meg - 1000Meg in 1Meg increments.
 		 * Use an alternate method to "lookup".
 		 */
 		return (tcp_int_find_suitable_rate(rs, bytes_per_sec, flags));
 	}
 	if ((flags & RS_PACING_LT) ||
 	    (flags & RS_PACING_EXACT_MATCH)) {
 		/*
 		 * For exact and less than we go forward through the table.
 		 * This way when we find one larger we stop (exact was a
 		 * toss up).
 		 */
 		for (i = rs->rs_lowest_valid, matched = 0; i <= rs->rs_highest_valid; i++) {
 			if ((flags & RS_PACING_EXACT_MATCH) &&
 			    (bytes_per_sec == rs->rs_rlt[i].rate)) {
 				rte = &rs->rs_rlt[i];
 				matched = 1;
 				break;
 			} else if ((flags & RS_PACING_LT) &&
 			    (bytes_per_sec <= rs->rs_rlt[i].rate)) {
 				/*
 				 * NOTE(review): RS_PACING_LT accepts the
 				 * first entry whose rate is >= the request,
 				 * which does not read as "less than" --
 				 * confirm the intended meaning.
 				 */
 				rte = &rs->rs_rlt[i];
 				matched = 1;
 				break;
 			}
 			/*
 			 * NOTE(review): this leaves the loop as soon as an
 			 * entry is slower than the request, so on an
 			 * ascending table the scan ends at the first entry
 			 * unless it already matched.  The comment above
 			 * describes stopping at a larger entry -- confirm
 			 * which is intended.
 			 */
 			if (bytes_per_sec > rs->rs_rlt[i].rate)
 				break;
 		}
 		if ((matched == 0) &&
 		    (flags & RS_PACING_LT) &&
 		    (flags & RS_PACING_SUB_OK)) {
 			/* Kick in a substitute (the lowest) */
 			rte = &rs->rs_rlt[rs->rs_lowest_valid];
 		}
 	} else {
 		/*
 		 * Here we go backward through the table so that we can find
 		 * the one greater in theory faster (but its probably a
 		 * wash).
 		 */
 		for (i = rs->rs_highest_valid, matched = 0; i >= rs->rs_lowest_valid; i--) {
 			if (rs->rs_rlt[i].rate > bytes_per_sec) {
 				/* A possible candidate */
 				rte = &rs->rs_rlt[i];
 			}
 			if ((flags & RS_PACING_GEQ) &&
 			    (bytes_per_sec == rs->rs_rlt[i].rate)) {
 				/* An exact match and we want equal */
 				matched = 1;
 				rte = &rs->rs_rlt[i];
 				break;
 			} else if (rte) {
 				/*
 				 * Found one that is larger than but don't
 				 * stop, there may be a more closer match.
 				 */
 				matched = 1;
 			}
 			if (rs->rs_rlt[i].rate < bytes_per_sec) {
 				/*
 				 * We found a table entry that is smaller,
 				 * stop there will be none greater or equal.
 				 */
 				break;
 			}
 		}
 		if ((matched == 0) &&
 		    (flags & RS_PACING_SUB_OK)) {
 			/* Kick in a substitute (the highest) */
 			rte = &rs->rs_rlt[rs->rs_highest_valid];
 		}
 	}
 	return (rte);
 }
 
 /*
  * Find the ifnet that send tags allocated on ifp really end up on.
  *
  * A scratch rate-limit send tag is allocated on ifp, the ifnet recorded
  * in that tag (tag->ifp) is remembered, and the tag is freed again
  * before returning.
  *
  * Returns the ifnet taken from the tag, or NULL on failure.  On failure,
  * when error is non-NULL, *error is set to ENODEV if ifp provides no
  * if_snd_tag_alloc method, or to the value returned by that method.
  * *error is left untouched on success.
  */
 static struct ifnet *
 rt_find_real_interface(struct ifnet *ifp, struct inpcb *inp, int *error)
 {
 	struct ifnet *tifp;
 	struct m_snd_tag *tag;
 	union if_snd_tag_alloc_params params = {
 		.rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
 		.rate_limit.hdr.flowid = 1,
 		.rate_limit.hdr.numa_domain = inp->inp_numa_domain,
 		.rate_limit.max_rate = COMMON_RATE,
 		.rate_limit.flags = M_NOWAIT,
 	};
 	int err;
 #ifdef RSS
 	params.rate_limit.hdr.flowtype = ((inp->inp_vflag & INP_IPV6) ?
 	    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4);
 #else
 	params.rate_limit.hdr.flowtype = M_HASHTYPE_OPAQUE_HASH;
 #endif
 	tag = NULL;
 	/*
 	 * Without an allocation method there is nothing to query, and
 	 * calling through the NULL pointer below would be fatal.
 	 */
 	if (ifp->if_snd_tag_alloc == NULL) {
 		if (error)
 			*error = ENODEV;
 		return (NULL);
 	}
 	err = ifp->if_snd_tag_alloc(ifp, &params, &tag);
 	if (err) {
 		/* Failed to setup a tag? */
 		if (error)
 			*error = err;
 		return (NULL);
 	}
 	/* The tag was only needed to learn which ifnet owns it. */
 	tifp = tag->ifp;
 	tifp->if_snd_tag_free(tag);
 	return (tifp);
 }
 
 /*
  * Pick a hardware pacing rate for inp on ifp and attach a rate-limit
  * send tag for it.
  *
  * The rate set registered for ifp is looked up in int_rs (inside a
  * network epoch section).  If that set is flagged RS_IS_DEFF the real
  * interface is resolved with rt_find_real_interface() and the lookup is
  * repeated for it.  The table entry chosen by tcp_find_suitable_rate()
  * is attached with in_pcbattach_txrtlmt() and the set's rs_flows_using
  * count is incremented.
  *
  * Returns the chosen table entry, or NULL on failure.  When error is
  * non-NULL it receives:
  *   ENODEV   no rate set for ifp, or the set is unsupported/dead;
  *   ENOSPC   the set is disabled or its flow limit is reached;
  *   ENOTSUP  the "real" interface resolved to ifp itself;
  *   other    whatever rt_find_real_interface() or
  *            in_pcbattach_txrtlmt() reported.
  * *error is not written when tcp_find_suitable_rate() finds no entry,
  * nor on success.
  */
 static const struct tcp_hwrate_limit_table *
 rt_setup_rate(struct inpcb *inp, struct ifnet *ifp, uint64_t bytes_per_sec,
     uint32_t flags, int *error)
 {
 	/* First lets find the interface if it exists */
 	const struct tcp_hwrate_limit_table *rte;
 	struct tcp_rate_set *rs;
 	struct epoch_tracker et;
 	int err;
 
 	NET_EPOCH_ENTER(et);
 use_real_interface:
 	CK_LIST_FOREACH(rs, &int_rs, next) {
 		/*
 		 * Note we don't look with the lock since we either see a
 		 * new entry or will get one when we try to add it.
 		 */
 		if (rs->rs_flags & RS_IS_DEAD) {
 			/* The dead are not looked at */
 			continue;
 		}
 		if ((rs->rs_ifp == ifp) &&
 		    (rs->rs_if_dunit == ifp->if_dunit)) {
 			/* Ok we found it */
 			break;
 		}
 	}
 	if ((rs == NULL) ||
 	    (rs->rs_flags & RS_INTF_NO_SUP) ||
 	    (rs->rs_flags & RS_IS_DEAD)) {
 		/*
 		 * This means we got a packet *before*
 		 * the IF-UP was processed below, <or>
 		 * while or after we already received an interface
 		 * departed event. In either case we really don't
 		 * want to do anything with pacing, in
 		 * the departing case the packet is not
 		 * going to go very far. The new case
 		 * might be arguable, but its impossible
 		 * to tell from the departing case.
 		 *
 		 * rs may be NULL here, so it must not be dereferenced
 		 * when reporting the error.
 		 */
 		if (error)
 			*error = ENODEV;
 		NET_EPOCH_EXIT(et);
 		return (NULL);
 	}
 
 	/* rs is known to be non-NULL from here on. */
 	if (rs->rs_disable != 0) {
 		if (error)
 			*error = ENOSPC;
 		NET_EPOCH_EXIT(et);
 		return (NULL);
 	}
 	if (rs->rs_flags & RS_IS_DEFF) {
 		/* We need to find the real interface */
 		struct ifnet *tifp;
 
 		tifp = rt_find_real_interface(ifp, inp, error);
 		if (tifp == NULL) {
 			/*
 			 * rt_find_real_interface() has already stored
 			 * its own error code in *error.
 			 */
 			if (rs->rs_disable && error)
 				*error = ENOTSUP;
 			NET_EPOCH_EXIT(et);
 			return (NULL);
 		}
 		if (tifp == ifp) {
 			/*
 			 * Looking up the same interface again would find
 			 * this same deferred set and loop forever.
 			 */
 			if (error)
 				*error = ENOTSUP;
 			NET_EPOCH_EXIT(et);
 			return (NULL);
 		}
 		/* Redo the lookup against the real interface. */
 		ifp = tifp;
 		goto use_real_interface;
 	}
 	if (rs->rs_flow_limit &&
 	    ((rs->rs_flows_using + 1) > rs->rs_flow_limit)) {
 		if (error)
 			*error = ENOSPC;
 		NET_EPOCH_EXIT(et);
 		return (NULL);
 	}
 	rte = tcp_find_suitable_rate(rs, bytes_per_sec, flags);
 	if (rte) {
 		err = in_pcbattach_txrtlmt(inp, rs->rs_ifp,
 		    inp->inp_flowtype,
 		    inp->inp_flowid,
 		    rte->rate,
 		    &inp->inp_snd_tag);
 		if (err) {
 			/* Failed to attach */
 			if (error)
 				*error = err;
 			rte = NULL;
 		}
 	}
 	if (rte) {
 		/*
 		 * We use an atomic here for accounting so we don't have to
 		 * use locks when freeing.
 		 */
 		atomic_add_64(&rs->rs_flows_using, 1);
 	}
 	NET_EPOCH_EXIT(et);
 	return (rte);
 }
 
 /*
  * ifnet link-state event handler.
  *
  * When a rate-limit capable interface (IFCAP_TXRTLMT enabled) reports
  * LINK_STATE_UP and no entry on int_rs matches it yet, a new rate set
  * is created for it with rt_setup_new_rs().  Every other event is
  * ignored.  The result of rt_setup_new_rs() is not examined.
  */
 static void
 tcp_rl_ifnet_link(void *arg __unused, struct ifnet *ifp, int link_state)
 {
 	struct tcp_rate_set *cur;
 	int already_known, setup_err;
 
 	/* Only a rate-limit capable interface coming up is of interest. */
 	if ((ifp->if_capenable & IFCAP_TXRTLMT) == 0)
 		return;
 	if (link_state != LINK_STATE_UP)
 		return;
 
 	/* Scan the list under the mutex for an existing entry. */
 	already_known = 0;
 	mtx_lock(&rs_mtx);
 	CK_LIST_FOREACH(cur, &int_rs, next) {
 		if ((cur->rs_ifp != ifp) ||
 		    (cur->rs_if_dunit != ifp->if_dunit))
 			continue;
 		/* This interface has been initialized before. */
 		already_known = 1;
 		break;
 	}
 	mtx_unlock(&rs_mtx);
 
 	if (already_known)
 		return;
 	rt_setup_new_rs(ifp, &setup_err);
 }
 
 /*
  * ifnet departure event handler.
  *
  * Under rs_mtx, the first rate set on int_rs that matches ifp (same
  * ifnet pointer and same if_dunit) is unlinked and flagged RS_IS_DEAD.
  * Every table entry with HDWRPACE_TAGPRESENT has its tag detached, and
  * every entry's flags are replaced by HDWRPACE_IFPDEPARTED.  If no flow
  * is using the set any more it is passed to rs_defer_destroy().
  */
 static void
 tcp_rl_ifnet_departure(void *arg __unused, struct ifnet *ifp)
 {
 	struct tcp_rate_set *cur, *following;
 	struct tcp_hwrate_limit_table *ent;
 	int idx;
 
 	mtx_lock(&rs_mtx);
 	CK_LIST_FOREACH_SAFE(cur, &int_rs, next, following) {
 		if ((cur->rs_ifp != ifp) ||
 		    (cur->rs_if_dunit != ifp->if_dunit))
 			continue;
 
 		/* Found the departing interface's set: retire it. */
 		CK_LIST_REMOVE(cur, next);
 		rs_number_alive--;
 		cur->rs_flags |= RS_IS_DEAD;
 		for (idx = 0; idx < cur->rs_rate_cnt; idx++) {
 			ent = &cur->rs_rlt[idx];
 			if (ent->flags & HDWRPACE_TAGPRESENT) {
 				in_pcbdetach_tag(ent->tag->ifp, ent->tag);
 				ent->tag = NULL;
 			}
 			ent->flags = HDWRPACE_IFPDEPARTED;
 		}
 		if (cur->rs_flows_using == 0)
 			rs_defer_destroy(cur);
 		/* Only the first match is handled. */
 		break;
 	}
 	mtx_unlock(&rs_mtx);
 }
 
 /*
  * Shutdown event handler.
  *
  * Under rs_mtx, every rate set on int_rs is unlinked and flagged
  * RS_IS_DEAD.  Each table entry with HDWRPACE_TAGPRESENT has its tag
  * detached, and every entry's flags are replaced by
  * HDWRPACE_IFPDEPARTED.  Sets that no flow is using are passed to
  * rs_defer_destroy().
  */
 static void
 tcp_rl_shutdown(void *arg __unused, int howto __unused)
 {
 	struct tcp_rate_set *cur, *following;
 	struct tcp_hwrate_limit_table *ent;
 	int idx;
 
 	mtx_lock(&rs_mtx);
 	CK_LIST_FOREACH_SAFE(cur, &int_rs, next, following) {
 		CK_LIST_REMOVE(cur, next);
 		rs_number_alive--;
 		cur->rs_flags |= RS_IS_DEAD;
 
 		idx = 0;
 		while (idx < cur->rs_rate_cnt) {
 			ent = &cur->rs_rlt[idx];
 			if (ent->flags & HDWRPACE_TAGPRESENT) {
 				in_pcbdetach_tag(ent->tag->ifp, ent->tag);
 				ent->tag = NULL;
 			}
 			ent->flags = HDWRPACE_IFPDEPARTED;
 			idx++;
 		}
 
 		/* Sets still in use are left alone here. */
 		if (cur->rs_flows_using != 0)
 			continue;
 		rs_defer_destroy(cur);
 	}
 	mtx_unlock(&rs_mtx);
 }
 
 /*
  * Set up hardware pacing for a connection that has no send tag yet.
  *
  * The inpcb write lock must be held (asserted).  A rate can only be set
  * when the inpcb has no send tag and ifp has IFCAP_TXRTLMT enabled; with
  * KERN_TLS and an ifnet TLS socket, ifp must also have IFCAP_TXTLS_RTLMT
  * enabled and the session must be in TCP_TLS_MODE_IFNET.
  *
  * Returns the table entry chosen by rt_setup_rate() and records its rate
  * in tp->t_pacing_rate, or returns NULL on failure.  error may be NULL;
  * otherwise it is set to 0 on success, to ENODEV when the interface
  * lacks the needed capability, to EINVAL when a send tag is already
  * present, or to whatever rt_setup_rate() reported.
  */
 const struct tcp_hwrate_limit_table *
 tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp,
     uint64_t bytes_per_sec, int flags, int *error)
 {
 	const struct tcp_hwrate_limit_table *rte;
+#ifdef KERN_TLS
+	struct ktls_session *tls;
+#endif
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	if (tp->t_inpcb->inp_snd_tag == NULL) {
 		/*
 		 * We are setting up a rate for the first time.
 		 */
 		if ((ifp->if_capenable & IFCAP_TXRTLMT) == 0) {
 			/* Not supported by the egress */
 			if (error)
 				*error = ENODEV;
 			return (NULL);
 		}
 #ifdef KERN_TLS
+		tls = NULL;
 		if (tp->t_inpcb->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) {
+			tls = tp->t_inpcb->inp_socket->so_snd.sb_tls_info;
+
+			if ((ifp->if_capenable & IFCAP_TXTLS_RTLMT) == 0 ||
+			    tls->mode != TCP_TLS_MODE_IFNET) {
+				if (error)
+					*error = ENODEV;
+				return (NULL);
+			}
+		}
+#endif
+		rte = rt_setup_rate(tp->t_inpcb, ifp, bytes_per_sec, flags, error);
+#ifdef KERN_TLS
+		if (rte != NULL && tls != NULL && tls->snd_tag != NULL) {
 			/*
-			 * We currently can't do both TLS and hardware
-			 * pacing
+			 * Fake a route change error to reset the TLS
+			 * send tag.  This will convert the existing
+			 * tag to a TLS ratelimit tag.
 			 */
-			if (error)
-				*error = EINVAL;
-			return (NULL);
+			MPASS(tls->snd_tag->type == IF_SND_TAG_TYPE_TLS);
+			ktls_output_eagain(tp->t_inpcb, tls);
 		}
 #endif
-		rte = rt_setup_rate(tp->t_inpcb, ifp, bytes_per_sec, flags, error);
 	} else {
 		/*
 		 * We are modifying a rate, wrong interface?
 		 */
 		if (error)
 			*error = EINVAL;
 		rte = NULL;
 	}
 	/*
 	 * rte is NULL on every failure path above; only touch it, and only
 	 * clear the (optional) error, when a rate was really set up so the
 	 * failure code reaches the caller.
 	 */
 	if (rte != NULL) {
 		tp->t_pacing_rate = rte->rate;
 		if (error)
 			*error = 0;
 	}
 	return (rte);
 }
 
 /*
  * Change the hardware pacing rate of a connection that already holds
  * the table entry crte.
  *
  * The inpcb write lock must be held (asserted).  If the rate set behind
  * crte is dead, the entry's interface has departed, or ifp no longer
  * matches the set, the old rate is released and a fresh one is set up
  * through tcp_set_pacing_rate().  Otherwise the entry chosen by
  * tcp_find_suitable_rate() is applied with in_pcbmodify_txrtlmt(), or
  * with ktls_modify_txrtlmt() for ifnet TLS sockets under KERN_TLS.
  *
  * Returns the entry now in effect (crte itself if unchanged) and
  * records its rate in tp->t_pacing_rate, or NULL on failure.  When
  * error is non-NULL it receives 0 on success, EINVAL when crte is NULL
  * or no send tag is attached, EOPNOTSUPP when the TLS send tag is not a
  * rate-limit tag, ENOENT when no suitable rate exists (the old rate is
  * released in that case), or the error from the modify call.
  */
 const struct tcp_hwrate_limit_table *
 tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte,
     struct tcpcb *tp, struct ifnet *ifp,
     uint64_t bytes_per_sec, int flags, int *error)
 {
 	const struct tcp_hwrate_limit_table *nrte;
 	const struct tcp_rate_set *rs;
+#ifdef KERN_TLS
+	struct ktls_session *tls = NULL;
+#endif
 	int is_indirect = 0;
 	int err;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
-	if ((tp->t_inpcb->inp_snd_tag == NULL) ||
-	    (crte == NULL)) {
+	if (crte == NULL) {
+		/* Wrong interface */
+		if (error)
+			*error = EINVAL;
+		return (NULL);
+	}
+
+#ifdef KERN_TLS
+	if (tp->t_inpcb->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) {
+		tls = tp->t_inpcb->inp_socket->so_snd.sb_tls_info;
+		MPASS(tls->mode == TCP_TLS_MODE_IFNET);
+		if (tls->snd_tag != NULL &&
+		    tls->snd_tag->type != IF_SND_TAG_TYPE_TLS_RATE_LIMIT) {
+			/*
+			 * NIC probably doesn't support ratelimit TLS
+			 * tags if it didn't allocate one when an
+			 * existing rate was present, so ignore.
+			 */
+			if (error)
+				*error = EOPNOTSUPP;
+			return (NULL);
+		}
+	}
+#endif
+	if (tp->t_inpcb->inp_snd_tag == NULL) {
 		/* Wrong interface */
 		if (error)
 			*error = EINVAL;
 		return (NULL);
 	}
 	rs = crte->ptbl;
 	if ((rs->rs_flags & RS_IS_DEAD) ||
 	    (crte->flags & HDWRPACE_IFPDEPARTED)) {
 		/* Release the rate, and try anew */
 re_rate:
 		tcp_rel_pacing_rate(crte, tp);
 		nrte = tcp_set_pacing_rate(tp, ifp,
 		    bytes_per_sec, flags, error);
 		return (nrte);
 	}
 	if ((rs->rs_flags & RT_IS_INDIRECT ) == RT_IS_INDIRECT)
 		is_indirect = 1;
 	else
 		is_indirect = 0;
 	if ((is_indirect == 0) &&
 	    ((ifp != rs->rs_ifp) ||
 	    (ifp->if_dunit != rs->rs_if_dunit))) {
 		/*
 		 * Something changed, the user is not pointing to the same
 		 * ifp? Maybe a route updated on this guy?
 		 */
 		goto re_rate;
 	} else if (is_indirect) {
 		/*
 		 * For indirect we have to dig in and find the real interface.
 		 */
 		struct ifnet *rifp;
 
 		rifp = rt_find_real_interface(ifp, tp->t_inpcb, error);
 		if (rifp == NULL) {
 			/* Can't find it? */
 			goto re_rate;
 		}
 		if ((rifp != rs->rs_ifp) ||
 		    (ifp->if_dunit != rs->rs_if_dunit)) {
 			goto re_rate;
 		}
 	}
 	nrte = tcp_find_suitable_rate(rs, bytes_per_sec, flags);
 	if (nrte == crte) {
 		/* No change */
 		if (error)
 			*error = 0;
 		return (crte);
 	}
 	if (nrte == NULL) {
 		/*
 		 * Release the old rate.  Report the failure explicitly so
 		 * the caller does not act on a stale *error value.
 		 */
 		if (error)
 			*error = ENOENT;
 		tcp_rel_pacing_rate(crte, tp);
 		return (NULL);
 	}
 	/* Change rates to our new entry */
-	err = in_pcbmodify_txrtlmt(tp->t_inpcb, nrte->rate);
+#ifdef KERN_TLS
+	if (tls != NULL)
+		err = ktls_modify_txrtlmt(tls, nrte->rate);
+	else
+#endif
+		err = in_pcbmodify_txrtlmt(tp->t_inpcb, nrte->rate);
 	if (err) {
 		if (error)
 			*error = err;
 		return (NULL);
 	}
 	if (error)
 		*error = 0;
 	tp->t_pacing_rate = nrte->rate;
 	return (nrte);
 }
 
 /*
  * Release the pacing rate crte held by the connection tp.
  *
  * Sets tp->t_pacing_rate to -1, drops this flow's count on the rate set
  * that owns crte (crte->ptbl) and, when this was the last flow on a set
  * flagged RS_IS_DEAD, hands the set to rs_defer_destroy() under rs_mtx.
  * Finally the inpcb's rate-limit send tag is detached with
  * in_pcbdetach_txrtlmt().
  *
  * The inpcb write lock must be held (asserted).  crte is dereferenced
  * unconditionally, so it must not be NULL.
  */
 void
 tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table *crte, struct tcpcb *tp)
 {
 	const struct tcp_rate_set *crs;
 	struct tcp_rate_set *rs;
 	uint64_t pre;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	tp->t_pacing_rate = -1;
 	crs = crte->ptbl;
 	/*
 	 * Now we must break the const
 	 * in order to release our refcount.
 	 */
 	rs = __DECONST(struct tcp_rate_set *, crs);
 	/* pre is the flow count before this decrement. */
 	pre = atomic_fetchadd_64(&rs->rs_flows_using, -1);
 	if (pre == 1) {
 		/* We were the last user of this rate set. */
 		mtx_lock(&rs_mtx);
 		/*
 		 * Is it dead?
 		 */
 		if (rs->rs_flags & RS_IS_DEAD)
 			rs_defer_destroy(rs);
 		mtx_unlock(&rs_mtx);
 	}
+
+	/*
+	 * XXX: If this connection is using ifnet TLS, should we
+	 * switch it to using an unlimited rate, or perhaps use
+	 * ktls_output_eagain() to reset the send tag to a plain
+	 * TLS tag?
+	 */
 	in_pcbdetach_txrtlmt(tp->t_inpcb);
 }
 
 /*
  * Thresholds for tcp_get_pacing_burst_size().  The rate values are in
  * bytes per second; MAX_MSS_SENT caps the burst in segments.
  */
 #define ONE_POINT_TWO_MEG 150000 /* 1.2 megabits in bytes */
 #define ONE_HUNDRED_MBPS 12500000	/* 100Mbps in bytes per second */
 #define FIVE_HUNDRED_MBPS 62500000	/* 500Mbps in bytes per second */
 #define MAX_MSS_SENT 43	/* 43 mss = 43 x 1500 = 64,500 bytes */
 
 /*
  * Compute the burst (TSO) size, in bytes, to use when pacing at bw.
  *
  * bw            software pacing rate in bytes per second
  * segsiz        segment size in bytes; must be non-zero (it is used as
  *               a divisor)
  * can_use_1mss  when non-zero, allow a one-segment burst for rates
  *               below ONE_POINT_TWO_MEG (otherwise the floor is two)
  * te            hardware rate table entry in use, or NULL when not
  *               doing hardware pacing
  * err           optional; set to 0 when the returned size is usable
  *               as-is and to -1 when the hardware entry's
  *               time_between is not smaller than the per-segment time
  *               at bw (a queue would build in hardware), in which case
  *               the non-hardware size is returned
  *
  * The result is always a multiple of segsiz and at most
  * MAX_MSS_SENT * segsiz.
  */
 uint32_t
 tcp_get_pacing_burst_size (uint64_t bw, uint32_t segsiz, int can_use_1mss,
    const struct tcp_hwrate_limit_table *te, int *err)
 {
 	/*
 	 * We use the google formula to calculate the
 	 * TSO size. I.E.
 	 * bw < 24Meg
 	 *   tso = 2mss
 	 * else
 	 *   tso = min(bw/1000, 64k)
 	 *
 	 * Note for these calculations we ignore the
 	 * packet overhead (enet hdr, ip hdr and tcp hdr).
 	 */
 	uint64_t lentim, res, bytes;
 	uint32_t new_tso, min_tso_segs;
 
 	bytes = bw / 1000;
 	if (bytes > (64 * 1000))
 		bytes = 64 * 1000;
 	/* Round up */
 	new_tso = (bytes + segsiz - 1) / segsiz;
 	if (can_use_1mss && (bw < ONE_POINT_TWO_MEG))
 		min_tso_segs = 1;
 	else
 		min_tso_segs = 2;
 	if (new_tso < min_tso_segs)
 		new_tso = min_tso_segs;
 	if (new_tso > MAX_MSS_SENT)
 		new_tso = MAX_MSS_SENT;
 	new_tso *= segsiz;
 	/*
 	 * If we are not doing hardware pacing
 	 * then we are done.
 	 */
 	if (te == NULL) {
 		if (err)
 			*err = 0;
 		return(new_tso);
 	}
 	/*
 	 * For hardware pacing we look at the
 	 * rate you are sending at and compare
 	 * that to the rate you have in hardware.
 	 *
 	 * If the hardware rate is slower than your
 	 * software rate then you are in error and
 	 * we will build a queue in our hardware which
 	 * is probably not desired, in such a case
 	 * just return the non-hardware TSO size.
 	 *
 	 * If the rate in hardware is faster (which
 	 * it should be) then look at how long it
 	 * takes to send one ethernet segment size at
 	 * your b/w and compare that to the time it
 	 * takes to send at the rate you had selected.
 	 *
 	 * If your time is greater (which we hope it is)
 	 * we get the delta between the two, and then
 	 * divide that into your pacing time. This tells
 	 * us how many MSS you can send down at once (rounded up).
 	 *
 	 * Note we also double this value if the b/w is over
 	 * 100Mbps. If its over 500meg we just set you to the
 	 * max (43 segments).
 	 */
 	if (te->rate > FIVE_HUNDRED_MBPS) {
 		/* Not an error: report success like the other paths. */
 		if (err)
 			*err = 0;
 		return (segsiz * MAX_MSS_SENT);
 	}
 	if (te->rate == bw) {
 		/* We are pacing at exactly the hdwr rate */
 		if (err)
 			*err = 0;
 		return (segsiz * MAX_MSS_SENT);
 	}
 	if (bw == 0) {
 		/*
 		 * A zero software rate cannot be divided by below.  The
 		 * hardware is not the slower side here, so hand back the
 		 * non-hardware size without flagging an error.
 		 */
 		if (err)
 			*err = 0;
 		return (new_tso);
 	}
 	lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND;
 	res = lentim / bw;
 	if (res > te->time_between) {
 		uint32_t delta, segs;
 
 		/* delta is non-zero because res > te->time_between. */
 		delta = res - te->time_between;
 		segs = (res + delta - 1)/delta;
 		if (te->rate > ONE_HUNDRED_MBPS)
 			segs *= 2;
 		if (segs < min_tso_segs)
 			segs = min_tso_segs;
 		if (segs > MAX_MSS_SENT)
 			segs = MAX_MSS_SENT;
 		segs *= segsiz;
 		if (err)
 			*err = 0;
 		if (segs < new_tso) {
 			/* unexpected ? */
 			return(new_tso);
 		} else {
 			return (segs);
 		}
 	} else {
 		/*
 		 * Your time is smaller which means
 		 * we will grow a queue on our
 		 * hardware. Send back the non-hardware
 		 * rate.
 		 */
 		if (err)
 			*err = -1;
 		return (new_tso);
 	}
 }
 
 /* Tags returned by the event handler registrations in tcp_rs_init(). */
 static eventhandler_tag rl_ifnet_departs;
 static eventhandler_tag rl_ifnet_arrives;
 static eventhandler_tag rl_shutdown_start;
 
 /*
  * One-time initialisation of the rate-limit module, run via the SYSINIT
  * below.  Initialises the rate set list, its counters and rs_mtx, then
  * registers the interface departure, link-state and shutdown handlers
  * defined in this file.
  */
 static void
 tcp_rs_init(void *st __unused)
 {
 	CK_LIST_INIT(&int_rs);
 	rs_number_alive = 0;
 	rs_number_dead = 0;
 	mtx_init(&rs_mtx, "tcp_rs_mtx", "rsmtx", MTX_DEF);
 	/* The list and mutex are set up before any handler is registered. */
 	rl_ifnet_departs = EVENTHANDLER_REGISTER(ifnet_departure_event,
 	    tcp_rl_ifnet_departure,
 	    NULL, EVENTHANDLER_PRI_ANY);
 	rl_ifnet_arrives = EVENTHANDLER_REGISTER(ifnet_link_event,
 	    tcp_rl_ifnet_link,
 	    NULL, EVENTHANDLER_PRI_ANY);
 	rl_shutdown_start = EVENTHANDLER_REGISTER(shutdown_pre_sync,
 	    tcp_rl_shutdown, NULL,
 	    SHUTDOWN_PRI_FIRST);
 	printf("TCP_ratelimit: Is now initialized\n");
 }
 
 /* Run tcp_rs_init() at subsystem order SI_SUB_SMP + 1. */
 SYSINIT(tcp_rl_init, SI_SUB_SMP + 1, SI_ORDER_ANY, tcp_rs_init, NULL);
 #endif
diff --git a/sys/sys/ktls.h b/sys/sys/ktls.h
index edbfe53f51ba..8d591888466c 100644
--- a/sys/sys/ktls.h
+++ b/sys/sys/ktls.h
@@ -1,244 +1,247 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2014-2019 Netflix Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 #ifndef _SYS_KTLS_H_
 #define	_SYS_KTLS_H_
 
 #include <sys/refcount.h>
 #include <sys/_task.h>
 
 /*
  * TLS record header with the record data following it.  The struct is
  * packed, and tls_data is a zero-length array that marks where the data
  * begins.
  */
 struct tls_record_layer {
 	uint8_t  tls_type;	/* Record type, see TLS_RLTYPE_* */
 	uint8_t  tls_vmajor;	/* Protocol version, major byte */
 	uint8_t  tls_vminor;	/* Protocol version, minor byte */
 	uint16_t tls_length;	/* Record length */
 	uint8_t  tls_data[0];	/* Start of the record data */
 } __attribute__ ((packed));
 
 /*
  * Size limits and field lengths, in bytes.
  * NOTE(review): TLS_MAX_MSG_SIZE_V10_2 is presumably the largest record
  * payload for TLS 1.0 through 1.2 -- confirm against its users.
  */
 #define	TLS_MAX_MSG_SIZE_V10_2	16384
 #define	TLS_MAX_PARAM_SIZE	1024	/* Max key/mac/iv in sockopt */
 #define	TLS_AEAD_GCM_LEN	4	/* Size of tls_nonce_data.fixed */
 #define	TLS_1_3_GCM_IV_LEN	12
 #define	TLS_CBC_IMPLICIT_IV_LEN	16	/* Size of tls_session_params.iv */
 
 /* Type values for the record layer */
 #define	TLS_RLTYPE_APP		23
 
 /*
  * Nonce for GCM for TLS 1.2 per RFC 5288.
  * Packed: TLS_AEAD_GCM_LEN fixed bytes followed by a 64-bit sequence
  * value.
  */
 struct tls_nonce_data {
 	uint8_t fixed[TLS_AEAD_GCM_LEN];	/* Fixed portion of the nonce */
 	uint64_t seq;				/* Sequence portion */
 } __packed; 
 
 /*
  * AEAD additional data format for TLS 1.2 per RFC 5246.
  * Packed: sequence number followed by the record type, version and
  * length fields.
  */
 struct tls_aead_data {
 	uint64_t seq;	/* In network order */
 	uint8_t	type;		/* Record type */
 	uint8_t tls_vmajor;	/* Protocol version, major byte */
 	uint8_t tls_vminor;	/* Protocol version, minor byte */
 	uint16_t tls_length;	
 } __packed;
 
 /*
  * AEAD additional data format for TLS 1.3 per RFC 8446.
  * Same fields as struct tls_aead_data minus the leading sequence
  * number.
  */
 struct tls_aead_data_13 {
 	uint8_t	type;		/* Record type */
 	uint8_t tls_vmajor;	/* Protocol version, major byte */
 	uint8_t tls_vminor;	/* Protocol version, minor byte */
 	uint16_t tls_length;
 } __packed;
 
 /*
  * Stream Cipher MAC additional data input.  This does not match the
  * exact data on the wire (the sequence number is not placed on the
  * wire, and any explicit IV after the record header is not covered by
  * the MAC).
  */
 struct tls_mac_data {
 	uint64_t seq;		/* Sequence number (not sent on the wire) */
 	uint8_t type;		/* Record type */
 	uint8_t tls_vmajor;	/* Protocol version, major byte */
 	uint8_t tls_vminor;	/* Protocol version, minor byte */
 	uint16_t tls_length;	
 } __packed;
 
 /*
  * Values for the tls_vmajor / tls_vminor version bytes; the trailing
  * comments give the resulting (major, minor) pair.
  * NOTE(review): the minor values appear to name TLS 1.0 through 1.3 --
  * confirm.
  */
 #define	TLS_MAJOR_VER_ONE	3
 #define	TLS_MINOR_VER_ZERO	1	/* 3, 1 */
 #define	TLS_MINOR_VER_ONE	2	/* 3, 2 */
 #define	TLS_MINOR_VER_TWO	3	/* 3, 3 */
 #define	TLS_MINOR_VER_THREE	4	/* 3, 4 */
 
 /* For TCP_TXTLS_ENABLE and TCP_RXTLS_ENABLE. */
 #ifdef _KERNEL
 /*
  * Kernel-only layout that matches struct tls_enable except that it has
  * no trailing rec_seq field.
  * NOTE(review): presumably the older version of the socket option
  * argument, kept for compatibility -- confirm.
  */
 struct tls_enable_v0 {
 	const uint8_t *cipher_key;
 	const uint8_t *iv;		/* Implicit IV. */
 	const uint8_t *auth_key;
 	int	cipher_algorithm;	/* e.g. CRYPTO_AES_CBC */
 	int	cipher_key_len;
 	int	iv_len;
 	int	auth_algorithm;		/* e.g. CRYPTO_SHA2_256_HMAC */
 	int	auth_key_len;
 	int	flags;
 	uint8_t tls_vmajor;
 	uint8_t tls_vminor;
 };
 #endif
 
 /*
  * Parameters for enabling a TLS session: key material pointers with
  * their lengths, algorithm identifiers, protocol version and an 8-byte
  * record sequence number.  This is the argument type of
  * ktls_enable_rx() and ktls_enable_tx().
  */
 struct tls_enable {
 	const uint8_t *cipher_key;	/* cipher_key_len bytes */
 	const uint8_t *iv;		/* Implicit IV. */
 	const uint8_t *auth_key;	/* auth_key_len bytes */
 	int	cipher_algorithm;	/* e.g. CRYPTO_AES_CBC */
 	int	cipher_key_len;
 	int	iv_len;
 	int	auth_algorithm;		/* e.g. CRYPTO_SHA2_256_HMAC */
 	int	auth_key_len;
 	int	flags;
 	uint8_t tls_vmajor;
 	uint8_t tls_vminor;
 	uint8_t rec_seq[8];	/* NOTE(review): presumably the starting record sequence number -- confirm */
 };
 
 /*
  * Structure for TLS_GET_RECORD.
  * Holds the same four header fields as struct tls_record_layer, but is
  * not packed and has no trailing data member.
  */
 struct tls_get_record {
 	/* TLS record header. */
 	uint8_t  tls_type;
 	uint8_t  tls_vmajor;
 	uint8_t  tls_vminor;
 	uint16_t tls_length;
 };
 
 #ifdef _KERNEL
 
 /*
  * Per-session TLS parameters, embedded in struct ktls_session as
  * "params".  Unlike struct tls_enable the key pointers are not const,
  * the IV is stored inline and the lengths are 16-bit.
  */
 struct tls_session_params {
 	uint8_t *cipher_key;		/* cipher_key_len bytes */
 	uint8_t *auth_key;		/* auth_key_len bytes */
 	uint8_t iv[TLS_CBC_IMPLICIT_IV_LEN];	/* iv_len bytes in use */
 	int	cipher_algorithm;
 	int	auth_algorithm;
 	uint16_t cipher_key_len;
 	uint16_t iv_len;
 	uint16_t auth_key_len;
 	uint16_t max_frame_len;
 	uint8_t tls_vmajor;
 	uint8_t tls_vminor;
 	uint8_t tls_hlen;	/* NOTE(review): presumably record header length -- confirm */
 	uint8_t tls_tlen;	/* NOTE(review): presumably record trailer length -- confirm */
 	uint8_t tls_bs;		/* NOTE(review): presumably cipher block size -- confirm */
 	uint8_t flags;
 };
 
 /* Used in APIs to request RX vs TX sessions. */
 #define	KTLS_TX		1
 #define	KTLS_RX		2
 
 /*
  * NOTE(review): presumably compared against
  * ktls_crypto_backend.api_version when a backend registers -- confirm.
  */
 #define	KTLS_API_VERSION 7
 
 /* Forward declarations for types only used through pointers below. */
 struct iovec;
 struct ktls_session;
 struct m_snd_tag;
 struct mbuf;
 struct sockbuf;
 struct socket;
 
 /*
  * Descriptor of a crypto backend, as passed to
  * ktls_crypto_backend_register() and ktls_crypto_backend_deregister().
  */
 struct ktls_crypto_backend {
 	LIST_ENTRY(ktls_crypto_backend) next;	/* List linkage */
 	/*
 	 * Callback taking the socket, the session and a direction.
 	 * NOTE(review): direction is presumably KTLS_TX or KTLS_RX, and the
 	 * callback presumably reports whether the backend accepts the
 	 * session -- confirm.
 	 */
 	int (*try)(struct socket *so, struct ktls_session *tls, int direction);
 	int prio;
 	int api_version;
 	int use_count;
 	const char *name;
 };
 
 /*
  * State of one kernel TLS session.  Reference counted through
  * ktls_hold() / ktls_free() and aligned to CACHE_LINE_SIZE.
  */
 struct ktls_session {
 	/* Software crypto callback; the two members share storage. */
 	union {
 		int	(*sw_encrypt)(struct ktls_session *tls,
 		    const struct tls_record_layer *hdr, uint8_t *trailer,
 		    struct iovec *src, struct iovec *dst, int iovcnt,
 		    uint64_t seqno, uint8_t record_type);
 		int	(*sw_decrypt)(struct ktls_session *tls,
 		    const struct tls_record_layer *hdr, struct mbuf *m,
 		    uint64_t seqno, int *trailer_len);
 	};
 	/*
 	 * cipher and snd_tag share storage.
 	 * NOTE(review): presumably snd_tag is the valid member when mode is
 	 * TCP_TLS_MODE_IFNET and cipher otherwise -- confirm.
 	 */
 	union {
 		void *cipher;
 		struct m_snd_tag *snd_tag;
 	};
 	struct ktls_crypto_backend *be;
 	void (*free)(struct ktls_session *tls);
 	struct tls_session_params params;
 	u_int	wq_index;
 	volatile u_int refcount;	/* See ktls_hold() and ktls_free() */
 	int mode;			/* e.g. TCP_TLS_MODE_IFNET */
 
 	/*
 	 * NOTE(review): the three fields below are presumably used when the
 	 * send tag is reset (see ktls_output_eagain()) -- confirm.
 	 */
 	struct task reset_tag_task;
 	struct inpcb *inp;
 	bool reset_pending;
 } __aligned(CACHE_LINE_SIZE);
 
 /*
  * Kernel TLS entry points.  ktls_modify_txrtlmt() is only declared when
  * the kernel is built with RATELIMIT.
  */
 void ktls_check_rx(struct sockbuf *sb);
 int ktls_crypto_backend_register(struct ktls_crypto_backend *be);
 int ktls_crypto_backend_deregister(struct ktls_crypto_backend *be);
 int ktls_enable_rx(struct socket *so, struct tls_enable *en);
 int ktls_enable_tx(struct socket *so, struct tls_enable *en);
 void ktls_destroy(struct ktls_session *tls);
 void ktls_frame(struct mbuf *m, struct ktls_session *tls, int *enqueue_cnt,
     uint8_t record_type);
 void ktls_seq(struct sockbuf *sb, struct mbuf *m);
 void ktls_enqueue(struct mbuf *m, struct socket *so, int page_count);
 void ktls_enqueue_to_free(struct mbuf *m);
 int ktls_get_rx_mode(struct socket *so);
 int ktls_set_tx_mode(struct socket *so, int mode);
 int ktls_get_tx_mode(struct socket *so);
 int ktls_output_eagain(struct inpcb *inp, struct ktls_session *tls);
+#ifdef RATELIMIT
+int ktls_modify_txrtlmt(struct ktls_session *tls, uint64_t max_pacing_rate);
+#endif
 
 /*
  * Take a reference on a TLS session.  A NULL session is accepted and
  * passed straight back; otherwise the session's refcount is bumped and
  * the same pointer is returned.
  */
 static inline struct ktls_session *
 ktls_hold(struct ktls_session *tls)
 {
 
 	if (tls == NULL)
 		return (NULL);
 	refcount_acquire(&tls->refcount);
 	return (tls);
 }
 
 /*
  * Drop a reference on a TLS session and destroy it with ktls_destroy()
  * when refcount_release() reports that this was the last one.  tls is
  * dereferenced unconditionally and so must not be NULL.
  */
 static inline void
 ktls_free(struct ktls_session *tls)
 {
 
 	if (!refcount_release(&tls->refcount))
 		return;
 	ktls_destroy(tls);
 }
 
 #endif /* !_KERNEL */
 #endif /* !_SYS_KTLS_H_ */