diff --git a/sys/compat/linux/linux_netlink.c b/sys/compat/linux/linux_netlink.c
index 0e8188d4cdf6..775a36994d2d 100644
--- a/sys/compat/linux/linux_netlink.c
+++ b/sys/compat/linux/linux_netlink.c
@@ -1,624 +1,624 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2022 Alexander V. Chernikov
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_netlink.h"
 
 #include <sys/types.h>
 #include <sys/ck.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/rmlock.h>
 #include <sys/socket.h>
 #include <sys/vnode.h>
 
 #include <net/if.h>
 #include <net/if_dl.h>
 #include <net/route.h>
 #include <net/route/nhop.h>
 #include <net/route/route_ctl.h>
 #include <netlink/netlink.h>
 #include <netlink/netlink_ctl.h>
 #include <netlink/netlink_linux.h>
 #include <netlink/netlink_route.h>
 
 #include <compat/linux/linux.h>
 #include <compat/linux/linux_common.h>
 #include <compat/linux/linux_util.h>
 
 #define	DEBUG_MOD_NAME	nl_linux
 #define	DEBUG_MAX_LEVEL	LOG_DEBUG3
 #include <netlink/netlink_debug.h>
 _DECLARE_DEBUG(LOG_DEBUG);
 
 static bool
 valid_rta_size(const struct rtattr *rta, int sz)
 {
 	return (NL_RTA_DATA_LEN(rta) == sz);
 }
 
 /*
  * Check that @rta carries a payload of exactly sizeof(uint32_t) bytes.
  */
 static bool
 valid_rta_u32(const struct rtattr *rta)
 {
 	const int expected_len = sizeof(uint32_t);
 
 	return (valid_rta_size(rta, expected_len));
 }
 
 /*
  * Read the payload of @rta as a uint32_t.
  * No length validation is done here; callers check with valid_rta_u32().
  */
 static uint32_t
 _rta_get_uint32(const struct rtattr *rta)
 {
 	const uint32_t *valp = (const uint32_t *)NL_RTA_DATA_CONST(rta);
 
 	return (*valp);
 }
 
 /*
  * Convert the address family of a neighbour message from its Linux value
  * to the FreeBSD one, in place.  Messages too short to hold a struct ndmsg
  * are returned untouched.
  */
 static struct nlmsghdr *
 rtnl_neigh_from_linux(struct nlmsghdr *hdr, struct nl_pstate *npt)
 {
 	struct ndmsg *msg;
 
 	if (hdr->nlmsg_len < sizeof(struct nlmsghdr) + sizeof(struct ndmsg))
 		return (hdr);
 
 	msg = (struct ndmsg *)(hdr + 1);
 	msg->ndm_family = linux_to_bsd_domain(msg->ndm_family);
 	return (hdr);
 }
 
 /*
  * Convert the address family of an interface address message from its
  * Linux value to the FreeBSD one, in place.  Messages too short to hold a
  * struct ifaddrmsg are returned untouched.
  */
 static struct nlmsghdr *
 rtnl_ifaddr_from_linux(struct nlmsghdr *hdr, struct nl_pstate *npt)
 {
 	struct ifaddrmsg *msg;
 
 	if (hdr->nlmsg_len < sizeof(struct nlmsghdr) + sizeof(struct ifaddrmsg))
 		return (hdr);
 
 	msg = (struct ifaddrmsg *)(hdr + 1);
 	msg->ifa_family = linux_to_bsd_domain(msg->ifa_family);
 	return (hdr);
 }
 
 /*
  * Rewrite a route message received from a Linux process, in place:
  * the address family is converted with linux_to_bsd_domain() and table
  * number 254 is mapped to 0, both in the rtmsg header and in an
  * NL_RTA_TABLE attribute.  Everything else is left as is.
  *
  * Returns @hdr.  A message too short to contain a struct rtmsg is
  * returned unmodified.
  */
 static struct nlmsghdr *
 rtnl_route_from_linux(struct nlmsghdr *hdr, struct nl_pstate *npt)
 {
 	/* Tweak address families and default fib only */
 	struct rtmsg *rtm = (struct rtmsg *)(hdr + 1);
 	struct nlattr *nla, *nla_head;
 	int attrs_len;
 
 	/*
 	 * Same guard as the neigh/ifaddr translators: never write into
 	 * bytes that lie beyond the end of a truncated message.
 	 */
 	if (hdr->nlmsg_len < sizeof(struct nlmsghdr) + sizeof(struct rtmsg))
 		return (hdr);
 
 	rtm->rtm_family = linux_to_bsd_domain(rtm->rtm_family);
 
 	if (rtm->rtm_table == 254)
 		rtm->rtm_table = 0;
 
 	attrs_len = hdr->nlmsg_len - sizeof(struct nlmsghdr);
 	attrs_len -= NETLINK_ALIGN(sizeof(struct rtmsg));
 	nla_head = (struct nlattr *)((char *)rtm + NETLINK_ALIGN(sizeof(struct rtmsg)));
 
 	NLA_FOREACH(nla, nla_head, attrs_len) {
 		RT_LOG(LOG_DEBUG3, "GOT type %d len %d total %d",
 		    nla->nla_type, nla->nla_len, attrs_len);
 		struct rtattr *rta = (struct rtattr *)nla;
 		/* An attribute shorter than its own header ends the walk */
 		if (rta->rta_len < sizeof(struct rtattr)) {
 			break;
 		}
 		switch (rta->rta_type) {
 		case NL_RTA_TABLE:
 			/* Stop translating on a malformed table attribute */
 			if (!valid_rta_u32(rta))
 				goto done;
 			rtm->rtm_table = 0;
 			uint32_t fibnum = _rta_get_uint32(rta);
 			RT_LOG(LOG_DEBUG3, "GET RTABLE: %u", fibnum);
 			if (fibnum == 254) {
 				*((uint32_t *)NL_RTA_DATA(rta)) = 0;
 			}
 			break;
 		}
 	}
 
 done:
 	return (hdr);
 }
 
 /*
  * Dispatch a NETLINK_ROUTE message coming from a Linux process to the
  * translator for its message type.  Types without a translator are passed
  * through unchanged.
  */
 static struct nlmsghdr *
 rtnl_from_linux(struct nlmsghdr *hdr, struct nl_pstate *npt)
 {
 	struct nlmsghdr *translated = hdr;
 
 	switch (hdr->nlmsg_type) {
 	case NL_RTM_GETROUTE:
 	case NL_RTM_NEWROUTE:
 	case NL_RTM_DELROUTE:
 		translated = rtnl_route_from_linux(hdr, npt);
 		break;
 	case NL_RTM_GETNEIGH:
 		translated = rtnl_neigh_from_linux(hdr, npt);
 		break;
 	case NL_RTM_GETADDR:
 		translated = rtnl_ifaddr_from_linux(hdr, npt);
 		break;
 	case NL_RTM_NEWLINK:
 	case NL_RTM_DELLINK:
 	case NL_RTM_GETLINK:
 		/* No translation required; listed only to skip the log below */
 		break;
 	default:
 		RT_LOG(LOG_DEBUG, "Passing message type %d untranslated",
 		    hdr->nlmsg_type);
 		break;
 	}
 
 	return (translated);
 }
 
 /*
  * Entry point for Linux -> kernel translation.  Only NETLINK_ROUTE has
  * a translator; messages of any other family are returned as is.
  */
 static struct nlmsghdr *
 nlmsg_from_linux(int netlink_family, struct nlmsghdr *hdr,
     struct nl_pstate *npt)
 {
 	if (netlink_family == NETLINK_ROUTE)
 		return (rtnl_from_linux(hdr, npt));
 
 	return (hdr);
 }
 
 
 /************************************************************
  * Kernel -> Linux
  ************************************************************/
 
 /*
  * Copy the whole message @hdr into the writer without translating it.
  * Returns false when the writer cannot provide the space.
  */
 static bool
 handle_default_out(struct nlmsghdr *hdr, struct nl_writer *nw)
 {
 	char *dst = nlmsg_reserve_data(nw, NLMSG_ALIGN(hdr->nlmsg_len), char);
 
 	if (dst == NULL)
 		return (false);
 
 	memcpy(dst, hdr, hdr->nlmsg_len);
 	return (true);
 }
 
 /*
  * Start a new message in @nw that reuses the pid, sequence number, type
  * and flags of @hdr, with no payload reserved yet.
  */
 static bool
 nlmsg_copy_header(struct nlmsghdr *hdr, struct nl_writer *nw)
 {
 	bool added;
 
 	added = nlmsg_add(nw, hdr->nlmsg_pid, hdr->nlmsg_seq,
 	    hdr->nlmsg_type, hdr->nlmsg_flags, 0);
 	return (added);
 }
 
 static void *
 _nlmsg_copy_next_header(struct nlmsghdr *hdr, struct nl_writer *nw, int sz)
 {
 	void *next_hdr = nlmsg_reserve_data(nw, sz, void);
 	memcpy(next_hdr, hdr + 1, NLMSG_ALIGN(sz));
 
 	return (next_hdr);
 }
 #define	nlmsg_copy_next_header(_hdr, _ns, _t)	\
 	((_t *)(_nlmsg_copy_next_header(_hdr, _ns, sizeof(_t))))
 
 /*
  * Append a byte-for-byte copy of attribute @nla_orig to the writer.
  * Returns false when the writer cannot provide the space.
  */
 static bool
 nlmsg_copy_nla(const struct nlattr *nla_orig, struct nl_writer *nw)
 {
 	struct nlattr *dst;
 
 	dst = nlmsg_reserve_data(nw, nla_orig->nla_len, struct nlattr);
 	if (dst == NULL)
 		return (false);
 
 	memcpy(dst, nla_orig, nla_orig->nla_len);
 	return (true);
 }
 
 /*
  * Translate a FreeBSD interface name to a Linux interface name.
  * The payload of @nla is treated as the FreeBSD name; on success the
  * Linux name is appended to @nw as an IFLA_IFNAME attribute.
  */
 static bool
 nlmsg_translate_ifname_nla(struct nlattr *nla, struct nl_writer *nw)
 {
 	char linux_name[LINUX_IFNAMSIZ];
 	char *bsd_name = (char *)(nla + 1);
 
 	if (ifname_bsd_to_linux_name(bsd_name, linux_name,
 	    sizeof(linux_name)) > 0)
 		return (nlattr_add_string(nw, IFLA_IFNAME, linux_name));
 	return (false);
 }
 
 #define	LINUX_NLA_UNHANDLED	-1
 /*
  * Translate a FreeBSD attribute to a Linux attribute.
  * Returns LINUX_NLA_UNHANDLED when the attribute is not processed
  * and the caller must take care of it, otherwise the result is returned.
  */
 static int
 nlmsg_translate_all_nla(struct nlmsghdr *hdr, struct nlattr *nla,
     struct nl_writer *nw)
 {
 
 	switch (hdr->nlmsg_type) {
 	case NL_RTM_NEWLINK:
 	case NL_RTM_DELLINK:
 	case NL_RTM_GETLINK:
 		switch (nla->nla_type) {
 		case IFLA_IFNAME:
 			return (nlmsg_translate_ifname_nla(nla, nw));
 		default:
 			break;
 		}
 	default:
 		break;
 	}
 	return (LINUX_NLA_UNHANDLED);
 }
 
 /*
  * Walk the attributes that follow the @raw_hdrlen-byte family header of
  * @hdr and emit each one into @nw, translated when a translator exists
  * and copied verbatim otherwise.
  * Returns false on a malformed attribute or when emitting fails.
  */
 static bool
 nlmsg_copy_all_nla(struct nlmsghdr *hdr, int raw_hdrlen, struct nl_writer *nw)
 {
 	int aligned_hdrlen = NETLINK_ALIGN(raw_hdrlen);
 	int remaining = hdr->nlmsg_len - sizeof(struct nlmsghdr) - aligned_hdrlen;
 	struct nlattr *first = (struct nlattr *)((char *)(hdr + 1) + aligned_hdrlen);
 	struct nlattr *cur;
 
 	NLA_FOREACH(cur, first, remaining) {
 		int status;
 
 		RT_LOG(LOG_DEBUG3, "reading attr %d len %d", cur->nla_type, cur->nla_len);
 		if (cur->nla_len < sizeof(struct nlattr))
 			return (false);
 
 		status = nlmsg_translate_all_nla(hdr, cur, nw);
 		if (status == LINUX_NLA_UNHANDLED)
 			status = nlmsg_copy_nla(cur, nw);
 		if (status == 0)
 			return (false);
 	}
 	return (true);
 }
 #undef LINUX_NLA_UNHANDLED
 
 /*
  * Convert a FreeBSD interface flag word into the one reported to Linux
  * consumers.
  *
  * Bits 0..30 of @if_flags are examined one at a time.  Flags in the first
  * case group are passed through with the same bit value, IFF_MULTICAST is
  * reported as bit 12, and flags in the second group are dropped.
  */
 static unsigned int
 rtnl_if_flags_to_linux(unsigned int if_flags)
 {
 	unsigned int result = 0;
 
 	for (int i = 0; i < 31; i++) {
 		unsigned int flag = 1 << i;
 		/* Skip bits that are not set in the input */
 		if (!(flag & if_flags))
 			continue;
 		switch (flag) {
 		/* Passed through unchanged */
 		case IFF_UP:
 		case IFF_BROADCAST:
 		case IFF_DEBUG:
 		case IFF_LOOPBACK:
 		case IFF_POINTOPOINT:
 		case IFF_DRV_RUNNING:
 		case IFF_NOARP:
 		case IFF_PROMISC:
 		case IFF_ALLMULTI:
 			result |= flag;
 			break;
-		case IFF_KNOWSEPOCH:
+		case IFF_NEEDSEPOCH:
 		case IFF_DRV_OACTIVE:
 		case IFF_SIMPLEX:
 		case IFF_LINK0:
 		case IFF_LINK1:
 		case IFF_LINK2:
 		case IFF_CANTCONFIG:
 		case IFF_PPROMISC:
 		case IFF_MONITOR:
 		case IFF_STATICARP:
 		case IFF_STICKYARP:
 		case IFF_DYING:
 		case IFF_RENAMING:
 		case IFF_NOGROUP:
 			/* No Linux analogue */
 			break;
 		case IFF_MULTICAST:
 			/* Reported at a different bit position (bit 12) */
 			result |= 1 << 12;
 		}
 	}
 	return (result);
 }
 
 static bool
 rtnl_newlink_to_linux(struct nlmsghdr *hdr, struct nlpcb *nlp,
     struct nl_writer *nw)
 {
 	if (!nlmsg_copy_header(hdr, nw))
 		return (false);
 
 	struct ifinfomsg *ifinfo;
 	ifinfo = nlmsg_copy_next_header(hdr, nw, struct ifinfomsg);
 
 	ifinfo->ifi_family = bsd_to_linux_domain(ifinfo->ifi_family);
 	/* Convert interface type */
 	switch (ifinfo->ifi_type) {
 	case IFT_ETHER:
 		ifinfo->ifi_type = LINUX_ARPHRD_ETHER;
 		break;
 	}
 	ifinfo->ifi_flags = rtnl_if_flags_to_linux(ifinfo->ifi_flags);
 
 	/* Copy attributes unchanged */
 	if (!nlmsg_copy_all_nla(hdr, sizeof(struct ifinfomsg), nw))
 		return (false);
 
 	/* make ip(8) happy */
 	if (!nlattr_add_string(nw, IFLA_QDISC, "noqueue"))
 		return (false);
 
 	if (!nlattr_add_u32(nw, IFLA_TXQLEN, 1000))
 		return (false);
 
 	nlmsg_end(nw);
 	RT_LOG(LOG_DEBUG2, "done processing nw %p", nw);
 	return (true);
 }
 
 static bool
 rtnl_newaddr_to_linux(struct nlmsghdr *hdr, struct nlpcb *nlp,
     struct nl_writer *nw)
 {
 	if (!nlmsg_copy_header(hdr, nw))
 		return (false);
 
 	struct ifaddrmsg *ifamsg;
 	ifamsg = nlmsg_copy_next_header(hdr, nw, struct ifaddrmsg);
 
 	ifamsg->ifa_family = bsd_to_linux_domain(ifamsg->ifa_family);
 	/* XXX: fake ifa_flags? */
 
 	/* Copy attributes unchanged */
 	if (!nlmsg_copy_all_nla(hdr, sizeof(struct ifaddrmsg), nw))
 		return (false);
 
 	nlmsg_end(nw);
 	RT_LOG(LOG_DEBUG2, "done processing nw %p", nw);
 	return (true);
 }
 
 static bool
 rtnl_newneigh_to_linux(struct nlmsghdr *hdr, struct nlpcb *nlp,
     struct nl_writer *nw)
 {
 	if (!nlmsg_copy_header(hdr, nw))
 		return (false);
 
 	struct ndmsg *ndm;
 	ndm = nlmsg_copy_next_header(hdr, nw, struct ndmsg);
 
 	ndm->ndm_family = bsd_to_linux_domain(ndm->ndm_family);
 
 	/* Copy attributes unchanged */
 	if (!nlmsg_copy_all_nla(hdr, sizeof(struct ndmsg), nw))
 		return (false);
 
 	nlmsg_end(nw);
 	RT_LOG(LOG_DEBUG2, "done processing nw %p", nw);
 	return (true);
 }
 
 static bool
 rtnl_newroute_to_linux(struct nlmsghdr *hdr, struct nlpcb *nlp,
     struct nl_writer *nw)
 {
 	if (!nlmsg_copy_header(hdr, nw))
 		return (false);
 
 	struct rtmsg *rtm;
 	rtm = nlmsg_copy_next_header(hdr, nw, struct rtmsg);
 	rtm->rtm_family = bsd_to_linux_domain(rtm->rtm_family);
 
 	struct nlattr *nla;
 
 	int hdrlen = NETLINK_ALIGN(sizeof(struct rtmsg));
 	int attrs_len = hdr->nlmsg_len - sizeof(struct nlmsghdr) - hdrlen;
 	struct nlattr *nla_head = (struct nlattr *)((char *)(hdr + 1) + hdrlen);
 
 	NLA_FOREACH(nla, nla_head, attrs_len) {
 		struct rtattr *rta = (struct rtattr *)nla;
 		//RT_LOG(LOG_DEBUG, "READING attr %d len %d", nla->nla_type, nla->nla_len);
 		if (rta->rta_len < sizeof(struct rtattr)) {
 			break;
 		}
 
 		switch (rta->rta_type) {
 		case NL_RTA_TABLE:
 			{
 				uint32_t fibnum;
 				fibnum = _rta_get_uint32(rta);
 				if (fibnum == 0)
 					fibnum = 254;
 				RT_LOG(LOG_DEBUG3, "XFIBNUM %u", fibnum);
 				if (!nlattr_add_u32(nw, NL_RTA_TABLE, fibnum))
 					return (false);
 			}
 			break;
 		default:
 			if (!nlmsg_copy_nla(nla, nw))
 				return (false);
 			break;
 		}
 	}
 
 	nlmsg_end(nw);
 	RT_LOG(LOG_DEBUG2, "done processing nw %p", nw);
 	return (true);
 }
 
 /*
  * Dispatch a NETLINK_ROUTE message headed to a Linux process to the
  * translator for its message type; unknown types are copied untranslated.
  */
 static bool
 rtnl_to_linux(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_writer *nw)
 {
 	bool ok;
 
 	RT_LOG(LOG_DEBUG2, "Got message type %d", hdr->nlmsg_type);
 
 	switch (hdr->nlmsg_type) {
 	case NL_RTM_NEWLINK:
 	case NL_RTM_DELLINK:
 	case NL_RTM_GETLINK:
 		ok = rtnl_newlink_to_linux(hdr, nlp, nw);
 		break;
 	case NL_RTM_NEWADDR:
 	case NL_RTM_DELADDR:
 		ok = rtnl_newaddr_to_linux(hdr, nlp, nw);
 		break;
 	case NL_RTM_NEWROUTE:
 	case NL_RTM_DELROUTE:
 		ok = rtnl_newroute_to_linux(hdr, nlp, nw);
 		break;
 	case NL_RTM_NEWNEIGH:
 	case NL_RTM_DELNEIGH:
 	case NL_RTM_GETNEIGH:
 		ok = rtnl_newneigh_to_linux(hdr, nlp, nw);
 		break;
 	default:
 		RT_LOG(LOG_DEBUG, "[WARN] Passing message type %d untranslated",
 		    hdr->nlmsg_type);
 		ok = handle_default_out(hdr, nw);
 		break;
 	}
 	return (ok);
 }
 
 static bool
 nlmsg_error_to_linux(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_writer *nw)
 {
 	if (!nlmsg_copy_header(hdr, nw))
 		return (false);
 
 	struct nlmsgerr *nlerr;
 	nlerr = nlmsg_copy_next_header(hdr, nw, struct nlmsgerr);
 	nlerr->error = bsd_to_linux_errno(nlerr->error);
 
 	int copied_len = sizeof(struct nlmsghdr) + sizeof(struct nlmsgerr);
 	if (hdr->nlmsg_len == copied_len) {
 		nlmsg_end(nw);
 		return (true);
 	}
 
 	/*
 	 * CAP_ACK was not set. Original request needs to be translated.
 	 * XXX: implement translation of the original message
 	 */
 	RT_LOG(LOG_DEBUG, "[WARN] Passing ack message type %d untranslated",
 	    nlerr->msg.nlmsg_type);
 	char *dst_payload, *src_payload;
 	int copy_len = hdr->nlmsg_len - copied_len;
 	dst_payload = nlmsg_reserve_data(nw, NLMSG_ALIGN(copy_len), char);
 
 	src_payload = (char *)hdr + copied_len;
 
 	memcpy(dst_payload, src_payload, copy_len);
 	nlmsg_end(nw);
 
 	return (true);
 }
 
 /*
  * Translate one kernel-generated message for a Linux consumer.
  * Types below NLMSG_MIN_TYPE are handled here (only NLMSG_ERROR is
  * translated); everything else is dispatched by netlink family.
  */
 static bool
 nlmsg_to_linux(int netlink_family, struct nlmsghdr *hdr, struct nlpcb *nlp,
     struct nl_writer *nw)
 {
 	if (hdr->nlmsg_type >= NLMSG_MIN_TYPE) {
 		if (netlink_family == NETLINK_ROUTE)
 			return (rtnl_to_linux(hdr, nlp, nw));
 		return (handle_default_out(hdr, nw));
 	}
 
 	switch (hdr->nlmsg_type) {
 	case NLMSG_ERROR:
 		return (nlmsg_error_to_linux(hdr, nlp, nw));
 	case NLMSG_NOOP:
 	case NLMSG_DONE:
 	case NLMSG_OVERRUN:
 		/* Known types that are copied as is, without a warning */
 		break;
 	default:
 		RT_LOG(LOG_DEBUG, "[WARN] Passing message type %d untranslated",
 		    hdr->nlmsg_type);
 		break;
 	}
 	return (handle_default_out(hdr, nw));
 }
 
 /*
  * Translate a buffer of back-to-back netlink messages for a Linux
  * consumer.
  *
  * @buf holds @data_length bytes of messages.  Each message is translated
  * into a freshly created mbuf chain.  Returns the chain, or NULL when the
  * chain writer cannot be set up or any message fails to translate (the
  * partially built chain is freed in that case).
  */
 static struct mbuf *
 nlmsgs_to_linux(int netlink_family, char *buf, int data_length, struct nlpcb *nlp)
 {
 	RT_LOG(LOG_DEBUG3, "LINUX: get %p size %d", buf, data_length);
 	struct nl_writer nw = {};
 
 	struct mbuf *m = NULL;
 	if (!nlmsg_get_chain_writer(&nw, data_length, &m)) {
 		RT_LOG(LOG_DEBUG, "unable to setup chain writer for size %d",
 		    data_length);
 		return (NULL);
 	}
 
 	/* Assume correct headers. Buffer IS mutable */
 	/*
 	 * NOTE(review): nlmsg_len is trusted here; a message with
 	 * nlmsg_len == 0 would not advance offset.  Confirm that all
 	 * producers of buf guarantee well-formed lengths.
 	 */
 	int count = 0;
 	for (int offset = 0; offset + sizeof(struct nlmsghdr) <= data_length;) {
 		struct nlmsghdr *hdr = (struct nlmsghdr *)&buf[offset];
 		/* Messages are laid out at NLMSG_ALIGN()ed offsets */
 		int msglen = NLMSG_ALIGN(hdr->nlmsg_len);
 		count++;
 
 		if (!nlmsg_to_linux(netlink_family, hdr, nlp, &nw)) {
 			RT_LOG(LOG_DEBUG, "failed to process msg type %d",
 			    hdr->nlmsg_type);
 			m_freem(m);
 			return (NULL);
 		}
 		offset += msglen;
 	}
 	nlmsg_flush(&nw);
 	RT_LOG(LOG_DEBUG3, "Processed %d messages, chain size %d", count,
 	    m ? m_length(m, NULL) : 0);
 
 	return (m);
 }
 
 /*
  * Translate the netlink messages stored in mbuf chain @m for a Linux
  * consumer.  The chain is flattened into a temporary buffer, freed, and
  * replaced by the translated chain.  @m is consumed on every path.
  * Returns the new chain or NULL on failure.
  */
 static struct mbuf *
 mbufs_to_linux(int netlink_family, struct mbuf *m, struct nlpcb *nlp)
 {
 	/* XXX: easiest solution, not optimized for performance */
 	struct mbuf *translated;
 	int total_len = m_length(m, NULL);
 	char *flat = malloc(total_len, M_LINUX, M_NOWAIT);
 
 	if (flat == NULL) {
 		RT_LOG(LOG_DEBUG, "unable to allocate %d bytes, dropping message",
 		    total_len);
 		m_freem(m);
 		return (NULL);
 	}
 
 	/* Flatten the chain, then release the original */
 	m_copydata(m, 0, total_len, flat);
 	m_freem(m);
 
 	translated = nlmsgs_to_linux(netlink_family, flat, total_len, nlp);
 	free(flat, M_LINUX);
 
 	return (translated);
 }
 
 /*
  * Table of the translation entry points implemented in this file:
  * kernel -> Linux for mbuf chains and flat buffers, and Linux -> kernel
  * for a single message.
  */
 static struct linux_netlink_provider linux_netlink_v1 = {
 	.mbufs_to_linux = mbufs_to_linux,
 	.msgs_to_linux = nlmsgs_to_linux,
 	.msg_from_linux = nlmsg_from_linux,
 };
 
 /*
  * Publish this file's translation table by pointing linux_netlink_p at it.
  */
 void
 linux_netlink_register(void)
 {
 	linux_netlink_p = &linux_netlink_v1;
 }
 
 /*
  * Withdraw the translation table by resetting linux_netlink_p to NULL.
  */
 void
 linux_netlink_deregister(void)
 {
 	linux_netlink_p = NULL;
 }
diff --git a/sys/dev/ena/ena.c b/sys/dev/ena/ena.c
index 72846a8bed51..a4762ce9ebb1 100644
--- a/sys/dev/ena/ena.c
+++ b/sys/dev/ena/ena.c
@@ -1,3935 +1,3934 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause
  *
  * Copyright (c) 2015-2021 Amazon.com, Inc. or its affiliates.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  *
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/endian.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/rman.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/sysctl.h>
 #include <sys/taskqueue.h>
 #include <sys/time.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 #include <machine/atomic.h>
 #include <machine/bus.h>
 #include <machine/in_cksum.h>
 #include <machine/resource.h>
 
 #include <dev/pci/pcireg.h>
 #include <dev/pci/pcivar.h>
 
 #include <net/bpf.h>
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_arp.h>
 #include <net/if_dl.h>
 #include <net/if_media.h>
 #include <net/if_types.h>
 #include <net/if_var.h>
 #include <net/if_vlan_var.h>
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/if_ether.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #include <netinet/tcp.h>
 #include <netinet/udp.h>
 
 #include "ena.h"
 #include "ena_datapath.h"
 #include "ena_rss.h"
 #include "ena_sysctl.h"
 
 #ifdef DEV_NETMAP
 #include "ena_netmap.h"
 #endif /* DEV_NETMAP */
 
 /*********************************************************
  *  Function prototypes
  *********************************************************/
 static int ena_probe(device_t);
 static void ena_intr_msix_mgmnt(void *);
 static void ena_free_pci_resources(struct ena_adapter *);
 static int ena_change_mtu(if_t, int);
 static inline void ena_alloc_counters(counter_u64_t *, int);
 static inline void ena_free_counters(counter_u64_t *, int);
 static inline void ena_reset_counters(counter_u64_t *, int);
 static void ena_init_io_rings_common(struct ena_adapter *, struct ena_ring *,
     uint16_t);
 static void ena_init_io_rings_basic(struct ena_adapter *);
 static void ena_init_io_rings_advanced(struct ena_adapter *);
 static void ena_init_io_rings(struct ena_adapter *);
 static void ena_free_io_ring_resources(struct ena_adapter *, unsigned int);
 static void ena_free_all_io_rings_resources(struct ena_adapter *);
 static int ena_setup_tx_dma_tag(struct ena_adapter *);
 static int ena_free_tx_dma_tag(struct ena_adapter *);
 static int ena_setup_rx_dma_tag(struct ena_adapter *);
 static int ena_free_rx_dma_tag(struct ena_adapter *);
 static void ena_release_all_tx_dmamap(struct ena_ring *);
 static int ena_setup_tx_resources(struct ena_adapter *, int);
 static void ena_free_tx_resources(struct ena_adapter *, int);
 static int ena_setup_all_tx_resources(struct ena_adapter *);
 static void ena_free_all_tx_resources(struct ena_adapter *);
 static int ena_setup_rx_resources(struct ena_adapter *, unsigned int);
 static void ena_free_rx_resources(struct ena_adapter *, unsigned int);
 static int ena_setup_all_rx_resources(struct ena_adapter *);
 static void ena_free_all_rx_resources(struct ena_adapter *);
 static inline int ena_alloc_rx_mbuf(struct ena_adapter *, struct ena_ring *,
     struct ena_rx_buffer *);
 static void ena_free_rx_mbuf(struct ena_adapter *, struct ena_ring *,
     struct ena_rx_buffer *);
 static void ena_free_rx_bufs(struct ena_adapter *, unsigned int);
 static void ena_refill_all_rx_bufs(struct ena_adapter *);
 static void ena_free_all_rx_bufs(struct ena_adapter *);
 static void ena_free_tx_bufs(struct ena_adapter *, unsigned int);
 static void ena_free_all_tx_bufs(struct ena_adapter *);
 static void ena_destroy_all_tx_queues(struct ena_adapter *);
 static void ena_destroy_all_rx_queues(struct ena_adapter *);
 static void ena_destroy_all_io_queues(struct ena_adapter *);
 static int ena_create_io_queues(struct ena_adapter *);
 static int ena_handle_msix(void *);
 static int ena_enable_msix(struct ena_adapter *);
 static void ena_setup_mgmnt_intr(struct ena_adapter *);
 static int ena_setup_io_intr(struct ena_adapter *);
 static int ena_request_mgmnt_irq(struct ena_adapter *);
 static int ena_request_io_irq(struct ena_adapter *);
 static void ena_free_mgmnt_irq(struct ena_adapter *);
 static void ena_free_io_irq(struct ena_adapter *);
 static void ena_free_irqs(struct ena_adapter *);
 static void ena_disable_msix(struct ena_adapter *);
 static void ena_unmask_all_io_irqs(struct ena_adapter *);
 static int ena_up_complete(struct ena_adapter *);
 static uint64_t ena_get_counter(if_t, ift_counter);
 static int ena_media_change(if_t);
 static void ena_media_status(if_t, struct ifmediareq *);
 static void ena_init(void *);
 static int ena_ioctl(if_t, u_long, caddr_t);
 static int ena_get_dev_offloads(struct ena_com_dev_get_features_ctx *);
 static void ena_update_host_info(struct ena_admin_host_info *, if_t);
 static void ena_update_hwassist(struct ena_adapter *);
 static int ena_setup_ifnet(device_t, struct ena_adapter *,
     struct ena_com_dev_get_features_ctx *);
 static int ena_enable_wc(device_t, struct resource *);
 static int ena_set_queues_placement_policy(device_t, struct ena_com_dev *,
     struct ena_admin_feature_llq_desc *, struct ena_llq_configurations *);
 static int ena_map_llq_mem_bar(device_t, struct ena_com_dev *);
 static uint32_t ena_calc_max_io_queue_num(device_t, struct ena_com_dev *,
     struct ena_com_dev_get_features_ctx *);
 static int ena_calc_io_queue_size(struct ena_calc_queue_size_ctx *);
 static void ena_config_host_info(struct ena_com_dev *, device_t);
 static int ena_attach(device_t);
 static int ena_detach(device_t);
 static int ena_device_init(struct ena_adapter *, device_t,
     struct ena_com_dev_get_features_ctx *, int *);
 static int ena_enable_msix_and_set_admin_interrupts(struct ena_adapter *);
 static void ena_update_on_link_change(void *, struct ena_admin_aenq_entry *);
 static void unimplemented_aenq_handler(void *, struct ena_admin_aenq_entry *);
 static int ena_copy_eni_metrics(struct ena_adapter *);
 static void ena_timer_service(void *);
 
 static char ena_version[] = ENA_DEVICE_NAME ENA_DRV_MODULE_NAME
     " v" ENA_DRV_MODULE_VERSION;
 
 static ena_vendor_info_t ena_vendor_info_array[] = {
 	{ PCI_VENDOR_ID_AMAZON, PCI_DEV_ID_ENA_PF, 0 },
 	{ PCI_VENDOR_ID_AMAZON, PCI_DEV_ID_ENA_PF_RSERV0, 0 },
 	{ PCI_VENDOR_ID_AMAZON, PCI_DEV_ID_ENA_VF, 0 },
 	{ PCI_VENDOR_ID_AMAZON, PCI_DEV_ID_ENA_VF_RSERV0, 0 },
 	/* Last entry */
 	{ 0, 0, 0 }
 };
 
 struct sx ena_global_lock;
 
 /*
  * Contains pointers to event handlers, e.g. link state change.
  */
 static struct ena_aenq_handlers aenq_handlers;
 
 /*
  * DMA map load callback: on success, store the bus address of the first
  * segment into the bus_addr_t that @arg points to.  On error the
  * destination is left untouched.
  */
 void
 ena_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nseg, int error)
 {
 	bus_addr_t *paddr_out = arg;
 
 	if (error == 0)
 		*paddr_out = segs[0].ds_addr;
 }
 
 /*
  * Allocate a single-segment DMA buffer of at least @size bytes and load
  * it.
  *
  * On success returns 0 with dma->tag, dma->map, dma->vaddr and dma->paddr
  * filled in.  On failure returns the error code and resets dma->tag,
  * dma->vaddr and dma->paddr after releasing whatever had been set up.
  */
 int
 ena_dma_alloc(device_t dmadev, bus_size_t size, ena_mem_handle_t *dma,
     int mapflags, bus_size_t alignment, int domain)
 {
 	struct ena_adapter *adapter = device_get_softc(dmadev);
 	device_t pdev = adapter->pdev;
 	uint32_t maxsize;
 	uint64_t dma_space_addr;
 	int error;
 
 	/* Round the request up to a whole number of pages */
 	maxsize = ((size - 1) / PAGE_SIZE + 1) * PAGE_SIZE;
 
 	/* A zero mask falls back to the whole bus address space */
 	dma_space_addr = ENA_DMA_BIT_MASK(adapter->dma_width);
 	if (unlikely(dma_space_addr == 0))
 		dma_space_addr = BUS_SPACE_MAXADDR;
 
 	error = bus_dma_tag_create(bus_get_dma_tag(dmadev), /* parent */
 	    alignment, 0,      /* alignment, bounds 		*/
 	    dma_space_addr,    /* lowaddr of exclusion window	*/
 	    BUS_SPACE_MAXADDR, /* highaddr of exclusion window	*/
 	    NULL, NULL,	       /* filter, filterarg 		*/
 	    maxsize,	       /* maxsize 			*/
 	    1,		       /* nsegments 			*/
 	    maxsize,	       /* maxsegsize 			*/
 	    BUS_DMA_ALLOCNOW,  /* flags 			*/
 	    NULL,	       /* lockfunc 			*/
 	    NULL,	       /* lockarg 			*/
 	    &dma->tag);
 	if (unlikely(error != 0)) {
 		ena_log(pdev, ERR, "bus_dma_tag_create failed: %d\n", error);
 		goto fail_tag;
 	}
 
 	error = bus_dma_tag_set_domain(dma->tag, domain);
 	if (unlikely(error != 0)) {
 		ena_log(pdev, ERR, "bus_dma_tag_set_domain failed: %d\n",
 		    error);
 		goto fail_map_create;
 	}
 
 	error = bus_dmamem_alloc(dma->tag, (void **)&dma->vaddr,
 	    BUS_DMA_COHERENT | BUS_DMA_ZERO, &dma->map);
 	if (unlikely(error != 0)) {
 		ena_log(pdev, ERR, "bus_dmamem_alloc(%ju) failed: %d\n",
 		    (uintmax_t)size, error);
 		goto fail_map_create;
 	}
 
 	/* paddr is filled in by ena_dmamap_callback(); 0 means "not set" */
 	dma->paddr = 0;
 	error = bus_dmamap_load(dma->tag, dma->map, dma->vaddr, size,
 	    ena_dmamap_callback, &dma->paddr, mapflags);
 	if (unlikely((error != 0) || (dma->paddr == 0))) {
 		ena_log(pdev, ERR, "bus_dmamap_load failed: %d\n", error);
 		goto fail_map_load;
 	}
 
 	bus_dmamap_sync(dma->tag, dma->map,
 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 
 	return (0);
 
 	/* Unwind in reverse order of acquisition */
 fail_map_load:
 	bus_dmamem_free(dma->tag, dma->vaddr, dma->map);
 fail_map_create:
 	bus_dma_tag_destroy(dma->tag);
 fail_tag:
 	dma->tag = NULL;
 	dma->vaddr = NULL;
 	dma->paddr = 0;
 
 	return (error);
 }
 
 /*
  * Release the memory, register and MSI-X resources held by the adapter.
  * Resources that are NULL are skipped.
  */
 static void
 ena_free_pci_resources(struct ena_adapter *adapter)
 {
 	device_t dev = adapter->pdev;
 
 	if (adapter->memory != NULL)
 		bus_release_resource(dev, SYS_RES_MEMORY,
 		    PCIR_BAR(ENA_MEM_BAR), adapter->memory);
 
 	if (adapter->registers != NULL)
 		bus_release_resource(dev, SYS_RES_MEMORY,
 		    PCIR_BAR(ENA_REG_BAR), adapter->registers);
 
 	if (adapter->msix != NULL)
 		bus_release_resource(dev, SYS_RES_MEMORY, adapter->msix_rid,
 		    adapter->msix);
 }
 
 /*
  * Probe routine: match the PCI vendor/device pair of @dev against
  * ena_vendor_info_array.  Returns BUS_PROBE_DEFAULT on a match (after
  * setting the device description) and ENXIO otherwise.
  */
 static int
 ena_probe(device_t dev)
 {
 	uint16_t vendor = pci_get_vendor(dev);
 	uint16_t device = pci_get_device(dev);
 	ena_vendor_info_t *entry;
 
 	/* The table is terminated by an entry with vendor_id == 0 */
 	for (entry = ena_vendor_info_array; entry->vendor_id != 0; entry++) {
 		if (vendor != entry->vendor_id || device != entry->device_id)
 			continue;
 
 		ena_log_raw(DBG, "vendor=%x device=%x\n", vendor, device);
 
 		device_set_desc(dev, ENA_DEVICE_DESC);
 		return (BUS_PROBE_DEFAULT);
 	}
 
 	return (ENXIO);
 }
 
 /*
  * Change the interface MTU.  Values outside [ENA_MIN_MTU, max_mtu] are
  * rejected with EINVAL; otherwise the device is asked to switch and the
  * ifnet MTU is updated only if the device accepted the new value.
  * Returns 0 on success or the error from ena_com_set_dev_mtu().
  */
 static int
 ena_change_mtu(if_t ifp, int new_mtu)
 {
 	struct ena_adapter *adapter = if_getsoftc(ifp);
 	device_t pdev = adapter->pdev;
 	int rc;
 
 	if ((new_mtu < ENA_MIN_MTU) || (new_mtu > adapter->max_mtu)) {
 		ena_log(pdev, ERR, "Invalid MTU setting. new_mtu: %d max mtu: %d min mtu: %d\n",
 		    new_mtu, adapter->max_mtu, ENA_MIN_MTU);
 		return (EINVAL);
 	}
 
 	rc = ena_com_set_dev_mtu(adapter->ena_dev, new_mtu);
 	if (unlikely(rc != 0)) {
 		ena_log(pdev, ERR, "Failed to set MTU to %d\n", new_mtu);
 		return (rc);
 	}
 
 	ena_log(pdev, DBG, "set MTU to %d\n", new_mtu);
 	if_setmtu(ifp, new_mtu);
 
 	return (rc);
 }
 
 /*
  * Allocate a counter for every counter_u64_t slot in the @size-byte
  * region starting at @begin.
  */
 static inline void
 ena_alloc_counters(counter_u64_t *begin, int size)
 {
 	counter_u64_t *cur = begin;
 	counter_u64_t *limit = (counter_u64_t *)((char *)begin + size);
 
 	while (cur < limit) {
 		*cur = counter_u64_alloc(M_WAITOK);
 		++cur;
 	}
 }
 
 /*
  * Free every counter stored in the @size-byte region starting at @begin.
  */
 static inline void
 ena_free_counters(counter_u64_t *begin, int size)
 {
 	counter_u64_t *cur = begin;
 	counter_u64_t *limit = (counter_u64_t *)((char *)begin + size);
 
 	while (cur < limit) {
 		counter_u64_free(*cur);
 		++cur;
 	}
 }
 
 /*
  * Zero every counter stored in the @size-byte region starting at @begin.
  */
 static inline void
 ena_reset_counters(counter_u64_t *begin, int size)
 {
 	counter_u64_t *cur = begin;
 	counter_u64_t *limit = (counter_u64_t *)((char *)begin + size);
 
 	while (cur < limit) {
 		counter_u64_zero(*cur);
 		++cur;
 	}
 }
 
 /*
  * Initialize the ring fields that Tx and Rx rings have in common.
  */
 static void
 ena_init_io_rings_common(struct ena_adapter *adapter, struct ena_ring *ring,
     uint16_t qid)
 {
 	/* Back-pointers and queue id */
 	ring->adapter = adapter;
 	ring->ena_dev = adapter->ena_dev;
 	ring->qid = qid;
 
 	/* Interrupt tracking state is cleared */
 	atomic_store_8(&ring->first_interrupt, 0);
 	ring->no_interrupt_event_cnt = 0;
 }
 
 /*
  * First initialization step for the IO rings: fill in plain fields and
  * link each Tx/Rx ring pair with its queue structure.  Nothing is
  * allocated here.
  */
 static void
 ena_init_io_rings_basic(struct ena_adapter *adapter)
 {
 	struct ena_com_dev *ena_dev = adapter->ena_dev;
 	int i;
 
 	for (i = 0; i < adapter->num_io_queues; i++) {
 		struct ena_ring *tx = &adapter->tx_ring[i];
 		struct ena_ring *rx = &adapter->rx_ring[i];
 		struct ena_que *q = &adapter->que[i];
 
 		/* TX/RX common ring state */
 		ena_init_io_rings_common(adapter, tx, i);
 		ena_init_io_rings_common(adapter, rx, i);
 
 		/* TX specific ring state */
 		tx->tx_max_header_size = ena_dev->tx_max_header_size;
 		tx->tx_mem_queue_type = ena_dev->tx_mem_queue_type;
 
 		/* Queue structure ties the ring pair together */
 		q->adapter = adapter;
 		q->id = i;
 		q->tx_ring = tx;
 		q->rx_ring = rx;
 		tx->que = q;
 		rx->que = q;
 
 		/* RX specific ring state */
 		rx->empty_rx_queue = 0;
 		rx->rx_mbuf_sz = ena_mbuf_sz;
 	}
 }
 
 /*
  * Second initialization step for the IO rings: allocate the Tx buf_ring
  * and the Tx/Rx statistics counters, build the mutex names and initialize
  * the Tx ring mutex.
  */
 static void
 ena_init_io_rings_advanced(struct ena_adapter *adapter)
 {
 	struct ena_ring *txr, *rxr;
 	int i;
 
 	for (i = 0; i < adapter->num_io_queues; i++) {
 		txr = &adapter->tx_ring[i];
 		rxr = &adapter->rx_ring[i];
 
 		/* Allocate a buf ring */
 		txr->buf_ring_size = adapter->buf_ring_size;
 		txr->br = buf_ring_alloc(txr->buf_ring_size, M_DEVBUF, M_WAITOK,
 		    &txr->ring_mtx);
 
 		/* Allocate Tx statistics. */
 		ena_alloc_counters((counter_u64_t *)&txr->tx_stats,
 		    sizeof(txr->tx_stats));
 		/* Cleanup timestamp starts at the current tick count */
 		txr->tx_last_cleanup_ticks = ticks;
 
 		/* Allocate Rx statistics. */
 		ena_alloc_counters((counter_u64_t *)&rxr->rx_stats,
 		    sizeof(rxr->rx_stats));
 
 		/* Initialize locks */
 		snprintf(txr->mtx_name, nitems(txr->mtx_name), "%s:tx(%d)",
 		    device_get_nameunit(adapter->pdev), i);
 		snprintf(rxr->mtx_name, nitems(rxr->mtx_name), "%s:rx(%d)",
 		    device_get_nameunit(adapter->pdev), i);
 
 		/* Only the Tx ring mutex is initialized here */
 		mtx_init(&txr->ring_mtx, txr->mtx_name, NULL, MTX_DEF);
 	}
 }
 
 /*
  * Initialize all IO rings of the adapter by running the basic step
  * followed by the advanced step.
  */
 static void
 ena_init_io_rings(struct ena_adapter *adapter)
 {
 	/*
 	 * IO rings initialization can be divided into the 2 steps:
 	 *   1. Initialize variables and fields with initial values and copy
 	 *      them from adapter/ena_dev (basic)
 	 *   2. Allocate mutex, counters and buf_ring (advanced)
 	 */
 	ena_init_io_rings_basic(adapter);
 	ena_init_io_rings_advanced(adapter);
 }
 
 /*
  * Release what ena_init_io_rings_advanced() set up for one queue: the Tx/Rx
  * statistics counters, the Tx buf_ring and the Tx ring mutex.
  */
 static void
 ena_free_io_ring_resources(struct ena_adapter *adapter, unsigned int qid)
 {
 	struct ena_ring *tx, *rx;
 
 	tx = &adapter->tx_ring[qid];
 	rx = &adapter->rx_ring[qid];
 
 	/* Statistics counters of both directions. */
 	ena_free_counters((counter_u64_t *)&tx->tx_stats,
 	    sizeof(tx->tx_stats));
 	ena_free_counters((counter_u64_t *)&rx->rx_stats,
 	    sizeof(rx->rx_stats));
 
 	/* The buf_ring is freed with the ring lock held. */
 	ENA_RING_MTX_LOCK(tx);
 	drbr_free(tx->br, M_DEVBUF);
 	ENA_RING_MTX_UNLOCK(tx);
 
 	/* Nothing uses the lock past this point. */
 	mtx_destroy(&tx->ring_mtx);
 }
 
 /* Release the per-ring resources of every IO queue of the adapter. */
 static void
 ena_free_all_io_rings_resources(struct ena_adapter *adapter)
 {
 	for (int qid = 0; qid < adapter->num_io_queues; qid++) {
 		ena_free_io_ring_resources(adapter, qid);
 	}
 }
 
 /*
  * Create the DMA tag used for Tx buffers and store it in
  * adapter->tx_buf_tag. Returns the result of bus_dma_tag_create().
  */
 static int
 ena_setup_tx_dma_tag(struct ena_adapter *adapter)
 {
 	return (bus_dma_tag_create(
 	    bus_get_dma_tag(adapter->pdev),	  /* parent */
 	    1,					  /* alignment */
 	    0,					  /* bounds */
 	    ENA_DMA_BIT_MASK(adapter->dma_width), /* lowaddr of excl window */
 	    BUS_SPACE_MAXADDR,			  /* highaddr of excl window */
 	    NULL,				  /* filter */
 	    NULL,				  /* filterarg */
 	    ENA_TSO_MAXSIZE,			  /* maxsize */
 	    adapter->max_tx_sgl_size - 1,	  /* nsegments */
 	    ENA_TSO_MAXSIZE,			  /* maxsegsize */
 	    0,					  /* flags */
 	    NULL,				  /* lockfunc */
 	    NULL,				  /* lockfuncarg */
 	    &adapter->tx_buf_tag));
 }
 
 /*
  * Destroy the Tx buffer DMA tag. The stored handle is cleared only when
  * bus_dma_tag_destroy() succeeds; its return value is passed to the caller.
  */
 static int
 ena_free_tx_dma_tag(struct ena_adapter *adapter)
 {
 	int rc;
 
 	rc = bus_dma_tag_destroy(adapter->tx_buf_tag);
 	if (likely(rc == 0)) {
 		adapter->tx_buf_tag = NULL;
 		return (0);
 	}
 
 	/* Destruction failed: keep the handle so it is not lost. */
 	return (rc);
 }
 
 /*
  * Create the DMA tag used for Rx buffers and store it in
  * adapter->rx_buf_tag. Returns the result of bus_dma_tag_create().
  */
 static int
 ena_setup_rx_dma_tag(struct ena_adapter *adapter)
 {
 	return (bus_dma_tag_create(
 	    bus_get_dma_tag(adapter->pdev),	  /* parent */
 	    1,					  /* alignment */
 	    0,					  /* bounds */
 	    ENA_DMA_BIT_MASK(adapter->dma_width), /* lowaddr of excl window */
 	    BUS_SPACE_MAXADDR,			  /* highaddr of excl window */
 	    NULL,				  /* filter */
 	    NULL,				  /* filterarg */
 	    ena_mbuf_sz,			  /* maxsize */
 	    adapter->max_rx_sgl_size,		  /* nsegments */
 	    ena_mbuf_sz,			  /* maxsegsize */
 	    0,					  /* flags */
 	    NULL,				  /* lockfunc */
 	    NULL,				  /* lockarg */
 	    &adapter->rx_buf_tag));
 }
 
 /*
  * Destroy the Rx buffer DMA tag. The stored handle is cleared only when
  * bus_dma_tag_destroy() succeeds; its return value is passed to the caller.
  */
 static int
 ena_free_rx_dma_tag(struct ena_adapter *adapter)
 {
 	int rc;
 
 	rc = bus_dma_tag_destroy(adapter->rx_buf_tag);
 	if (likely(rc == 0)) {
 		adapter->rx_buf_tag = NULL;
 		return (0);
 	}
 
 	/* Destruction failed: keep the handle so it is not lost. */
 	return (rc);
 }
 
 /*
  * Destroy every DMA map that has been created for the buffers of a Tx ring
  * and clear the stored handles. Entries that are still NULL are skipped, so
  * this can be used on a ring whose maps were only partially created.
  */
 static void
 ena_release_all_tx_dmamap(struct ena_ring *tx_ring)
 {
 	struct ena_adapter *adapter = tx_ring->adapter;
 	bus_dma_tag_t tag = adapter->tx_buf_tag;
 	struct ena_tx_buffer *buf;
 	int idx;
 #ifdef DEV_NETMAP
 	struct ena_netmap_tx_info *nm;
 	int seg;
 #endif /* DEV_NETMAP */
 
 	for (idx = 0; idx < tx_ring->ring_size; idx++) {
 		buf = &tx_ring->tx_buffer_info[idx];
 #ifdef DEV_NETMAP
 		/* Per-segment netmap maps exist only with netmap enabled. */
 		if (if_getcapenable(adapter->ifp) & IFCAP_NETMAP) {
 			nm = &buf->nm_info;
 			for (seg = 0; seg < ENA_PKT_MAX_BUFS; seg++) {
 				if (nm->map_seg[seg] == NULL)
 					continue;
 				bus_dmamap_destroy(tag, nm->map_seg[seg]);
 				nm->map_seg[seg] = NULL;
 			}
 		}
 #endif /* DEV_NETMAP */
 		if (buf->dmamap == NULL)
 			continue;
 		bus_dmamap_destroy(tag, buf->dmamap);
 		buf->dmamap = NULL;
 	}
 }
 
 /**
  * ena_setup_tx_resources - allocate Tx resources (Descriptors)
  * @adapter: network interface device structure
  * @qid: queue index
  *
  * Allocates the per-ring bookkeeping arrays (Tx buffer info, free request
  * ids, push intermediate buffer), creates one DMA map per Tx buffer, resets
  * the ring state and statistics and starts the enqueue taskqueue.
  *
  * Returns 0 on success, ENOMEM on any failure. On failure everything that
  * was allocated by this function is released again.
  **/
 static int
 ena_setup_tx_resources(struct ena_adapter *adapter, int qid)
 {
 	device_t pdev = adapter->pdev;
 	char thread_name[MAXCOMLEN + 1];
 	struct ena_que *que = &adapter->que[qid];
 	struct ena_ring *tx_ring = que->tx_ring;
 	cpuset_t *cpu_mask = NULL;
 	int size, i, err;
 #ifdef DEV_NETMAP
 	bus_dmamap_t *map;
 	int j;
 
 	ena_netmap_reset_tx_ring(adapter, qid);
 #endif /* DEV_NETMAP */
 
 	size = sizeof(struct ena_tx_buffer) * tx_ring->ring_size;
 
 	/* M_ZERO keeps all dmamap handles NULL until they are created. */
 	tx_ring->tx_buffer_info = malloc(size, M_DEVBUF, M_NOWAIT | M_ZERO);
 	if (unlikely(tx_ring->tx_buffer_info == NULL))
 		return (ENOMEM);
 
 	size = sizeof(uint16_t) * tx_ring->ring_size;
 	tx_ring->free_tx_ids = malloc(size, M_DEVBUF, M_NOWAIT | M_ZERO);
 	if (unlikely(tx_ring->free_tx_ids == NULL))
 		goto err_buf_info_free;
 
 	size = tx_ring->tx_max_header_size;
 	tx_ring->push_buf_intermediate_buf = malloc(size, M_DEVBUF,
 	    M_NOWAIT | M_ZERO);
 	if (unlikely(tx_ring->push_buf_intermediate_buf == NULL))
 		goto err_tx_ids_free;
 
 	/* Req id stack for TX OOO completions */
 	for (i = 0; i < tx_ring->ring_size; i++)
 		tx_ring->free_tx_ids[i] = i;
 
 	/* Reset TX statistics. */
 	ena_reset_counters((counter_u64_t *)&tx_ring->tx_stats,
 	    sizeof(tx_ring->tx_stats));
 
 	tx_ring->next_to_use = 0;
 	tx_ring->next_to_clean = 0;
 	tx_ring->acum_pkts = 0;
 
 	/* Make sure that drbr is empty */
 	ENA_RING_MTX_LOCK(tx_ring);
 	drbr_flush(adapter->ifp, tx_ring->br);
 	ENA_RING_MTX_UNLOCK(tx_ring);
 
 	/* ... and create the buffer DMA maps */
 	for (i = 0; i < tx_ring->ring_size; i++) {
 		err = bus_dmamap_create(adapter->tx_buf_tag, 0,
 		    &tx_ring->tx_buffer_info[i].dmamap);
 		if (unlikely(err != 0)) {
 			ena_log(pdev, ERR,
 			    "Unable to create Tx DMA map for buffer %d\n", i);
 			goto err_map_release;
 		}
 
 #ifdef DEV_NETMAP
 		if (if_getcapenable(adapter->ifp) & IFCAP_NETMAP) {
 			map = tx_ring->tx_buffer_info[i].nm_info.map_seg;
 			for (j = 0; j < ENA_PKT_MAX_BUFS; j++) {
 				err = bus_dmamap_create(adapter->tx_buf_tag, 0,
 				    &map[j]);
 				if (unlikely(err != 0)) {
 					ena_log(pdev, ERR,
 					    "Unable to create Tx DMA for buffer %d %d\n",
 					    i, j);
 					goto err_map_release;
 				}
 			}
 		}
 #endif /* DEV_NETMAP */
 	}
 
 	/* Allocate taskqueues */
 	TASK_INIT(&tx_ring->enqueue_task, 0, ena_deferred_mq_start, tx_ring);
 	tx_ring->enqueue_tq = taskqueue_create_fast("ena_tx_enque", M_NOWAIT,
 	    taskqueue_thread_enqueue, &tx_ring->enqueue_tq);
 	if (unlikely(tx_ring->enqueue_tq == NULL)) {
 		ena_log(pdev, ERR,
 		    "Unable to create taskqueue for enqueue task\n");
 		goto err_map_release;
 	}
 
 	tx_ring->running = true;
 
 	/*
 	 * With RSS the enqueue thread is pinned to the queue's CPU mask and
 	 * named after that CPU; otherwise cpu_mask stays NULL and the thread
 	 * is named after the queue id.
 	 */
 #ifdef RSS
 	cpu_mask = &que->cpu_mask;
 	snprintf(thread_name, sizeof(thread_name), "%s txeq %d",
 	    device_get_nameunit(adapter->pdev), que->cpu);
 #else
 	snprintf(thread_name, sizeof(thread_name), "%s txeq %d",
 	    device_get_nameunit(adapter->pdev), que->id);
 #endif
 	taskqueue_start_threads_cpuset(&tx_ring->enqueue_tq, 1, PI_NET,
 	    cpu_mask, "%s", thread_name);
 
 	return (0);
 
 err_map_release:
 	/* Walks the whole ring and destroys only the maps that exist. */
 	ena_release_all_tx_dmamap(tx_ring);
 	/*
 	 * The push intermediate buffer is allocated before the first DMA map,
 	 * so it must be released on this path as well or it is leaked.
 	 */
 	free(tx_ring->push_buf_intermediate_buf, M_DEVBUF);
 	tx_ring->push_buf_intermediate_buf = NULL;
 err_tx_ids_free:
 	free(tx_ring->free_tx_ids, M_DEVBUF);
 	tx_ring->free_tx_ids = NULL;
 err_buf_info_free:
 	free(tx_ring->tx_buffer_info, M_DEVBUF);
 	tx_ring->tx_buffer_info = NULL;
 
 	return (ENOMEM);
 }
 
 /**
  * ena_free_tx_resources - Free Tx Resources per Queue
  * @adapter: network interface device structure
  * @qid: queue index
  *
  * Free all transmit software resources of one Tx ring: stop and free the
  * enqueue taskqueue, flush the buf_ring, release every per-buffer DMA map
  * together with any mbuf still attached, then free the ring's bookkeeping
  * arrays.
  **/
 static void
 ena_free_tx_resources(struct ena_adapter *adapter, int qid)
 {
 	struct ena_ring *tx_ring = &adapter->tx_ring[qid];
 #ifdef DEV_NETMAP
 	struct ena_netmap_tx_info *nm_info;
 	int j;
 #endif /* DEV_NETMAP */
 
 	/*
 	 * Retry until taskqueue_cancel() returns 0; every non-zero return is
 	 * followed by a drain of the enqueue task before the next attempt.
 	 */
 	while (taskqueue_cancel(tx_ring->enqueue_tq, &tx_ring->enqueue_task, NULL))
 		taskqueue_drain(tx_ring->enqueue_tq, &tx_ring->enqueue_task);
 
 	taskqueue_free(tx_ring->enqueue_tq);
 
 	/* The flush and the per-buffer teardown run under the ring lock. */
 	ENA_RING_MTX_LOCK(tx_ring);
 	/* Flush buffer ring, */
 	drbr_flush(adapter->ifp, tx_ring->br);
 
 	/* Free buffer DMA maps, */
 	for (int i = 0; i < tx_ring->ring_size; i++) {
 		/* Sync and unload before the map itself is destroyed. */
 		bus_dmamap_sync(adapter->tx_buf_tag,
 		    tx_ring->tx_buffer_info[i].dmamap, BUS_DMASYNC_POSTWRITE);
 		bus_dmamap_unload(adapter->tx_buf_tag,
 		    tx_ring->tx_buffer_info[i].dmamap);
 		bus_dmamap_destroy(adapter->tx_buf_tag,
 		    tx_ring->tx_buffer_info[i].dmamap);
 
 #ifdef DEV_NETMAP
 		if (if_getcapenable(adapter->ifp) & IFCAP_NETMAP) {
 			nm_info = &tx_ring->tx_buffer_info[i].nm_info;
 			for (j = 0; j < ENA_PKT_MAX_BUFS; j++) {
 				/*
 				 * Only segments with a non-zero socket buffer
 				 * index are synced and unloaded; the map is
 				 * destroyed in either case.
 				 */
 				if (nm_info->socket_buf_idx[j] != 0) {
 					bus_dmamap_sync(adapter->tx_buf_tag,
 					    nm_info->map_seg[j],
 					    BUS_DMASYNC_POSTWRITE);
 					ena_netmap_unload(adapter,
 					    nm_info->map_seg[j]);
 				}
 				bus_dmamap_destroy(adapter->tx_buf_tag,
 				    nm_info->map_seg[j]);
 				nm_info->socket_buf_idx[j] = 0;
 			}
 		}
 #endif /* DEV_NETMAP */
 
 		/* Drop whatever mbuf chain is still attached to the slot. */
 		m_freem(tx_ring->tx_buffer_info[i].mbuf);
 		tx_ring->tx_buffer_info[i].mbuf = NULL;
 	}
 	ENA_RING_MTX_UNLOCK(tx_ring);
 
 	/* And free allocated memory. */
 	free(tx_ring->tx_buffer_info, M_DEVBUF);
 	tx_ring->tx_buffer_info = NULL;
 
 	free(tx_ring->free_tx_ids, M_DEVBUF);
 	tx_ring->free_tx_ids = NULL;
 
 	free(tx_ring->push_buf_intermediate_buf, M_DEVBUF);
 	tx_ring->push_buf_intermediate_buf = NULL;
 }
 
 /**
  * ena_setup_all_tx_resources - allocate all queues Tx resources
  * @adapter: network interface device structure
  *
  * Sets up the Tx resources of every IO queue in order. If one queue fails,
  * the queues that were already set up are freed again in reverse order.
  *
  * Returns 0 on success, otherwise the error of the failing queue.
  **/
 static int
 ena_setup_all_tx_resources(struct ena_adapter *adapter)
 {
 	int qid, rc;
 
 	for (qid = 0; qid < adapter->num_io_queues; qid++) {
 		rc = ena_setup_tx_resources(adapter, qid);
 		if (rc == 0)
 			continue;
 
 		ena_log(adapter->pdev, ERR,
 		    "Allocation for Tx Queue %u failed\n", qid);
 
 		/* Unwind the queues below the one that failed. */
 		while (qid--)
 			ena_free_tx_resources(adapter, qid);
 		return (rc);
 	}
 
 	return (0);
 }
 
 /**
  * ena_free_all_tx_resources - Free Tx Resources for All Queues
  * @adapter: network interface device structure
  *
  * Calls ena_free_tx_resources() for each IO queue of the adapter.
  **/
 static void
 ena_free_all_tx_resources(struct ena_adapter *adapter)
 {
 	for (int qid = 0; qid < adapter->num_io_queues; qid++) {
 		ena_free_tx_resources(adapter, qid);
 	}
 }
 
 /**
  * ena_setup_rx_resources - allocate Rx resources (Descriptors)
  * @adapter: network interface device structure
  * @qid: queue index
  *
  * Returns 0 on success, otherwise on failure.
  **/
 static int
 ena_setup_rx_resources(struct ena_adapter *adapter, unsigned int qid)
 {
 	device_t pdev = adapter->pdev;
 	struct ena_que *que = &adapter->que[qid];
 	struct ena_ring *rx_ring = que->rx_ring;
 	int size, err, i;
 
 	size = sizeof(struct ena_rx_buffer) * rx_ring->ring_size;
 
 #ifdef DEV_NETMAP
 	ena_netmap_reset_rx_ring(adapter, qid);
 	rx_ring->initialized = false;
 #endif /* DEV_NETMAP */
 
 	/*
 	 * Alloc extra element so in rx path
 	 * we can always prefetch rx_info + 1
 	 */
 	size += sizeof(struct ena_rx_buffer);
 
 	rx_ring->rx_buffer_info = malloc(size, M_DEVBUF, M_WAITOK | M_ZERO);
 
 	size = sizeof(uint16_t) * rx_ring->ring_size;
 	rx_ring->free_rx_ids = malloc(size, M_DEVBUF, M_WAITOK);
 
 	for (i = 0; i < rx_ring->ring_size; i++)
 		rx_ring->free_rx_ids[i] = i;
 
 	/* Reset RX statistics. */
 	ena_reset_counters((counter_u64_t *)&rx_ring->rx_stats,
 	    sizeof(rx_ring->rx_stats));
 
 	rx_ring->next_to_clean = 0;
 	rx_ring->next_to_use = 0;
 
 	/* ... and create the buffer DMA maps */
 	for (i = 0; i < rx_ring->ring_size; i++) {
 		err = bus_dmamap_create(adapter->rx_buf_tag, 0,
 		    &(rx_ring->rx_buffer_info[i].map));
 		if (err != 0) {
 			ena_log(pdev, ERR,
 			    "Unable to create Rx DMA map for buffer %d\n", i);
 			goto err_buf_info_unmap;
 		}
 	}
 
 	/* Create LRO for the ring */
 	if ((if_getcapenable(adapter->ifp) & IFCAP_LRO) != 0) {
 		int err = tcp_lro_init(&rx_ring->lro);
 		if (err != 0) {
 			ena_log(pdev, ERR, "LRO[%d] Initialization failed!\n",
 			    qid);
 		} else {
 			ena_log(pdev, DBG, "RX Soft LRO[%d] Initialized\n",
 			    qid);
 			rx_ring->lro.ifp = adapter->ifp;
 		}
 	}
 
 	return (0);
 
 err_buf_info_unmap:
 	while (i--) {
 		bus_dmamap_destroy(adapter->rx_buf_tag,
 		    rx_ring->rx_buffer_info[i].map);
 	}
 
 	free(rx_ring->free_rx_ids, M_DEVBUF);
 	rx_ring->free_rx_ids = NULL;
 	free(rx_ring->rx_buffer_info, M_DEVBUF);
 	rx_ring->rx_buffer_info = NULL;
 	return (ENOMEM);
 }
 
 /**
  * ena_free_rx_resources - Free Rx Resources
  * @adapter: network interface device structure
  * @qid: queue index
  *
  * Free all receive software resources of one Rx ring: the per-buffer mbufs
  * and DMA maps, the LRO state and the ring's bookkeeping arrays.
  **/
 static void
 ena_free_rx_resources(struct ena_adapter *adapter, unsigned int qid)
 {
 	struct ena_ring *rx_ring = &adapter->rx_ring[qid];
 	int idx;
 
 	/* Per buffer: sync, drop the mbuf, then unload and destroy the map. */
 	for (idx = 0; idx < rx_ring->ring_size; idx++) {
 		bus_dmamap_sync(adapter->rx_buf_tag,
 		    rx_ring->rx_buffer_info[idx].map, BUS_DMASYNC_POSTREAD);
 
 		m_freem(rx_ring->rx_buffer_info[idx].mbuf);
 		rx_ring->rx_buffer_info[idx].mbuf = NULL;
 
 		bus_dmamap_unload(adapter->rx_buf_tag,
 		    rx_ring->rx_buffer_info[idx].map);
 		bus_dmamap_destroy(adapter->rx_buf_tag,
 		    rx_ring->rx_buffer_info[idx].map);
 	}
 
 	/* LRO state of the ring. */
 	tcp_lro_free(&rx_ring->lro);
 
 	/* Bookkeeping arrays; pointers are cleared after being freed. */
 	free(rx_ring->rx_buffer_info, M_DEVBUF);
 	rx_ring->rx_buffer_info = NULL;
 
 	free(rx_ring->free_rx_ids, M_DEVBUF);
 	rx_ring->free_rx_ids = NULL;
 }
 
 /**
  * ena_setup_all_rx_resources - allocate all queues Rx resources
  * @adapter: network interface device structure
  *
  * Sets up the Rx resources of every IO queue in order. If one queue fails,
  * the queues that were already set up are freed again in reverse order.
  *
  * Returns 0 on success, otherwise the error of the failing queue.
  **/
 static int
 ena_setup_all_rx_resources(struct ena_adapter *adapter)
 {
 	int qid, rc = 0;
 
 	for (qid = 0; qid < adapter->num_io_queues; qid++) {
 		rc = ena_setup_rx_resources(adapter, qid);
 		if (rc == 0)
 			continue;
 
 		ena_log(adapter->pdev, ERR,
 		    "Allocation for Rx Queue %u failed\n", qid);
 
 		/* Unwind the queues below the one that failed. */
 		while (qid--)
 			ena_free_rx_resources(adapter, qid);
 		return (rc);
 	}
 
 	return (0);
 }
 
 /**
  * ena_free_all_rx_resources - Free Rx resources for all queues
  * @adapter: network interface device structure
  *
  * Calls ena_free_rx_resources() for each IO queue of the adapter.
  **/
 static void
 ena_free_all_rx_resources(struct ena_adapter *adapter)
 {
 	for (int qid = 0; qid < adapter->num_io_queues; qid++) {
 		ena_free_rx_resources(adapter, qid);
 	}
 }
 
 /*
  * Attach a DMA-mapped mbuf to one Rx buffer slot and fill in the matching
  * device buffer descriptor (rx_info->ena_buf).
  *
  * A slot that still holds an mbuf is left untouched. A cluster of the
  * ring's configured size is tried first, with a regular cluster (MCLBYTES)
  * as fallback.
  *
  * Returns 0 on success, ENOMEM when no mbuf could be allocated and EFAULT
  * when the mbuf could not be mapped into exactly one DMA segment.
  */
 static inline int
 ena_alloc_rx_mbuf(struct ena_adapter *adapter, struct ena_ring *rx_ring,
     struct ena_rx_buffer *rx_info)
 {
 	device_t pdev = adapter->pdev;
 	struct ena_com_buf *ena_buf;
 	bus_dma_segment_t seg[1];
 	int nseg, rc;
 	int buf_len;
 
 	/* if previous allocated frag is not used */
 	if (unlikely(rx_info->mbuf != NULL))
 		return (0);
 
 	/* First choice: a cluster of the ring's configured size. */
 	buf_len = rx_ring->rx_mbuf_sz;
 	rx_info->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
 	    rx_ring->rx_mbuf_sz);
 
 	if (unlikely(rx_info->mbuf == NULL)) {
 		/* Fallback: a regular cluster, which is MCLBYTES long. */
 		counter_u64_add(rx_ring->rx_stats.mjum_alloc_fail, 1);
 		rx_info->mbuf = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 		if (unlikely(rx_info->mbuf == NULL)) {
 			counter_u64_add(rx_ring->rx_stats.mbuf_alloc_fail, 1);
 			return (ENOMEM);
 		}
 		buf_len = MCLBYTES;
 	}
 
 	/* The whole buffer is advertised as both data and packet length. */
 	rx_info->mbuf->m_len = buf_len;
 	rx_info->mbuf->m_pkthdr.len = rx_info->mbuf->m_len;
 
 	/* Map packets for DMA */
 	ena_log(pdev, DBG,
 	    "Using tag %p for buffers' DMA mapping, mbuf %p len: %d\n",
 	    adapter->rx_buf_tag, rx_info->mbuf, rx_info->mbuf->m_len);
 	rc = bus_dmamap_load_mbuf_sg(adapter->rx_buf_tag, rx_info->map,
 	    rx_info->mbuf, seg, &nseg, BUS_DMA_NOWAIT);
 	if (unlikely((rc != 0) || (nseg != 1))) {
 		ena_log(pdev, WARN,
 		    "failed to map mbuf, error: %d, nsegs: %d\n", rc, nseg);
 		counter_u64_add(rx_ring->rx_stats.dma_mapping_err, 1);
 
 		/* Give the mbuf back and leave the slot empty. */
 		m_freem(rx_info->mbuf);
 		rx_info->mbuf = NULL;
 		return (EFAULT);
 	}
 
 	bus_dmamap_sync(adapter->rx_buf_tag, rx_info->map, BUS_DMASYNC_PREREAD);
 
 	/* Describe the single mapped segment to the device. */
 	ena_buf = &rx_info->ena_buf;
 	ena_buf->paddr = seg[0].ds_addr;
 	ena_buf->len = buf_len;
 
 	ena_log(pdev, DBG,
 	    "ALLOC RX BUF: mbuf %p, rx_info %p, len %d, paddr %#jx\n",
 	    rx_info->mbuf, rx_info, ena_buf->len, (uintmax_t)ena_buf->paddr);
 
 	return (0);
 }
 
 /*
  * Detach and free the mbuf of one Rx buffer slot: sync and unload its DMA
  * map, free the mbuf and clear the slot. An empty slot only produces a
  * warning.
  */
 static void
 ena_free_rx_mbuf(struct ena_adapter *adapter, struct ena_ring *rx_ring,
     struct ena_rx_buffer *rx_info)
 {
 	if (rx_info->mbuf != NULL) {
 		bus_dmamap_sync(adapter->rx_buf_tag, rx_info->map,
 		    BUS_DMASYNC_POSTREAD);
 		bus_dmamap_unload(adapter->rx_buf_tag, rx_info->map);
 		m_freem(rx_info->mbuf);
 		rx_info->mbuf = NULL;
 		return;
 	}
 
 	ena_log(adapter->pdev, WARN,
 	    "Trying to free unallocated buffer\n");
 }
 
 /**
  * ena_refill_rx_bufs - Refills ring with descriptors
  * @rx_ring: the ring which we want to feed with free descriptors
  * @num: number of descriptors to refill
  *
  * Refills the ring with newly allocated DMA-mapped mbufs for receiving
  * (or with netmap slots when the ring is in netmap mode). Stops at the first
  * buffer that cannot be allocated or submitted.
  *
  * Returns the number of descriptors that were actually added, which may be
  * less than @num.
  **/
 int
 ena_refill_rx_bufs(struct ena_ring *rx_ring, uint32_t num)
 {
 	struct ena_adapter *adapter = rx_ring->adapter;
 	device_t pdev = adapter->pdev;
 	uint16_t next_to_use, req_id;
 	uint32_t i;
 	int rc;
 
 	ena_log_io(adapter->pdev, DBG, "refill qid: %d\n", rx_ring->qid);
 
 	/* Work on a local copy; the ring field is updated once at the end. */
 	next_to_use = rx_ring->next_to_use;
 
 	for (i = 0; i < num; i++) {
 		struct ena_rx_buffer *rx_info;
 
 		ena_log_io(pdev, DBG, "RX buffer - next to use: %d\n",
 		    next_to_use);
 
 		/* The free id table maps the ring position to a buffer slot. */
 		req_id = rx_ring->free_rx_ids[next_to_use];
 		rx_info = &rx_ring->rx_buffer_info[req_id];
 		/*
 		 * With DEV_NETMAP the if/else below picks the allocator;
 		 * without it only the mbuf allocation statement remains.
 		 */
 #ifdef DEV_NETMAP
 		if (ena_rx_ring_in_netmap(adapter, rx_ring->qid))
 			rc = ena_netmap_alloc_rx_slot(adapter, rx_ring,
 			    rx_info);
 		else
 #endif /* DEV_NETMAP */
 			rc = ena_alloc_rx_mbuf(adapter, rx_ring, rx_info);
 		if (unlikely(rc != 0)) {
 			ena_log_io(pdev, WARN,
 			    "failed to alloc buffer for rx queue %d\n",
 			    rx_ring->qid);
 			break;
 		}
 		/* Hand the buffer to the device submission queue. */
 		rc = ena_com_add_single_rx_desc(rx_ring->ena_com_io_sq,
 		    &rx_info->ena_buf, req_id);
 		if (unlikely(rc != 0)) {
 			ena_log_io(pdev, WARN,
 			    "failed to add buffer for rx queue %d\n",
 			    rx_ring->qid);
 			break;
 		}
 		/* Advance only after the descriptor was accepted. */
 		next_to_use = ENA_RX_RING_IDX_NEXT(next_to_use,
 		    rx_ring->ring_size);
 	}
 
 	/* An early break above leaves i < num: count the partial refill. */
 	if (unlikely(i < num)) {
 		counter_u64_add(rx_ring->rx_stats.refil_partial, 1);
 		ena_log_io(pdev, WARN,
 		    "refilled rx qid %d with only %d mbufs (from %d)\n",
 		    rx_ring->qid, i, num);
 	}
 
 	/* Ring the doorbell only if at least one descriptor was added. */
 	if (likely(i != 0))
 		ena_com_write_sq_doorbell(rx_ring->ena_com_io_sq);
 
 	rx_ring->next_to_use = next_to_use;
 	return (i);
 }
 
 /*
  * Change the Tx buf_ring (drbr) size. The device is brought down, the
  * per-ring resources are re-created with the new size and, if the device
  * was up before, it is brought up again.
  *
  * Returns 0 on success or when the device was not up; otherwise the error
  * returned by ena_up(), in which case the old size has been restored and a
  * device reset has been triggered.
  */
 int
 ena_update_buf_ring_size(struct ena_adapter *adapter,
     uint32_t new_buf_ring_size)
 {
 	uint32_t prev_size;
 	bool was_up;
 	int rc;
 
 	prev_size = adapter->buf_ring_size;
 	adapter->buf_ring_size = new_buf_ring_size;
 
 	was_up = ENA_FLAG_ISSET(ENA_FLAG_DEV_UP, adapter);
 	ena_down(adapter);
 
 	/* Reconfigure buf ring for all Tx rings. */
 	ena_free_all_io_rings_resources(adapter);
 	ena_init_io_rings_advanced(adapter);
 
 	if (!was_up)
 		return (0);
 
 	/*
 	 * If ena_up() fails, it's not because of recent buf_ring size
 	 * changes. Because of that, we just want to revert old drbr
 	 * value and trigger the reset because something else had to
 	 * go wrong.
 	 */
 	rc = ena_up(adapter);
 	if (unlikely(rc != 0)) {
 		ena_log(adapter->pdev, ERR,
 		    "Failed to configure device after setting new drbr size: %u. Reverting old value: %u and triggering the reset\n",
 		    new_buf_ring_size, prev_size);
 
 		/* Revert old size and trigger the reset */
 		adapter->buf_ring_size = prev_size;
 		ena_free_all_io_rings_resources(adapter);
 		ena_init_io_rings_advanced(adapter);
 
 		ENA_FLAG_SET_ATOMIC(ENA_FLAG_DEV_UP_BEFORE_RESET,
 		    adapter);
 		ena_trigger_reset(adapter, ENA_REGS_RESET_OS_TRIGGER);
 	}
 
 	return (rc);
 }
 
 /*
  * Change the requested Tx/Rx ring sizes. The device is brought down, the
  * rings are re-initialized with the new sizes and, if the device was up
  * before, it is brought up again. When that fails the old sizes are
  * restored and a second attempt is made; a second failure triggers a device
  * reset.
  *
  * Returns 0 on success or when the device was not up; otherwise the error
  * of the last ena_up() attempt.
  */
 int
 ena_update_queue_size(struct ena_adapter *adapter, uint32_t new_tx_size,
     uint32_t new_rx_size)
 {
 	uint32_t prev_tx_size, prev_rx_size;
 	bool was_up;
 	int rc;
 
 	prev_tx_size = adapter->requested_tx_ring_size;
 	prev_rx_size = adapter->requested_rx_ring_size;
 	adapter->requested_tx_ring_size = new_tx_size;
 	adapter->requested_rx_ring_size = new_rx_size;
 
 	was_up = ENA_FLAG_ISSET(ENA_FLAG_DEV_UP, adapter);
 	ena_down(adapter);
 
 	/* Configure queues with new size. */
 	ena_init_io_rings_basic(adapter);
 
 	if (!was_up)
 		return (0);
 
 	rc = ena_up(adapter);
 	if (unlikely(rc != 0)) {
 		ena_log(adapter->pdev, ERR,
 		    "Failed to configure device with the new sizes - Tx: %u Rx: %u. Reverting old values - Tx: %u Rx: %u\n",
 		    new_tx_size, new_rx_size, prev_tx_size, prev_rx_size);
 
 		/* Revert old size. */
 		adapter->requested_tx_ring_size = prev_tx_size;
 		adapter->requested_rx_ring_size = prev_rx_size;
 		ena_init_io_rings_basic(adapter);
 
 		/* And try again. */
 		rc = ena_up(adapter);
 		if (unlikely(rc != 0)) {
 			ena_log(adapter->pdev, ERR,
 			    "Failed to revert old queue sizes. Triggering device reset.\n");
 			/*
 			 * If we've failed again, something had to go
 			 * wrong. After reset, the device should try to
 			 * go up
 			 */
 			ENA_FLAG_SET_ATOMIC(
 			    ENA_FLAG_DEV_UP_BEFORE_RESET, adapter);
 			ena_trigger_reset(adapter,
 			    ENA_REGS_RESET_OS_TRIGGER);
 		}
 	}
 
 	return (rc);
 }
 
 /*
  * Rebuild the IO rings for a new queue count: free the per-ring resources
  * of the current queues, destroy the RSS state, store the new count and
  * initialize the rings again.
  */
 static void
 ena_update_io_rings(struct ena_adapter *adapter, uint32_t num)
 {
 	/* Must run while num_io_queues still holds the old count. */
 	ena_free_all_io_rings_resources(adapter);
 
 	/* Force indirection table to be reinitialized */
 	ena_com_rss_destroy(adapter->ena_dev);
 
 	adapter->num_io_queues = num;
 	ena_init_io_rings(adapter);
 }
 
 /*
  * Change the number of IO queues. Caller should sanitize new_num.
  *
  * The device is brought down, the IO rings are rebuilt for the new count
  * and, if the device was up before, it is brought up again. When that fails
  * the previous count is restored and a second attempt is made; a second
  * failure triggers a device reset.
  *
  * Returns 0 on success or when the device was not up; otherwise the error
  * of the last ena_up() attempt.
  */
 int
 ena_update_io_queue_nb(struct ena_adapter *adapter, uint32_t new_num)
 {
 	uint32_t prev_num;
 	bool was_up;
 	int rc;
 
 	was_up = ENA_FLAG_ISSET(ENA_FLAG_DEV_UP, adapter);
 	prev_num = adapter->num_io_queues;
 	ena_down(adapter);
 
 	ena_update_io_rings(adapter, new_num);
 
 	if (!was_up)
 		return (0);
 
 	rc = ena_up(adapter);
 	if (unlikely(rc != 0)) {
 		ena_log(adapter->pdev, ERR,
 		    "Failed to configure device with %u IO queues. "
 		    "Reverting to previous value: %u\n",
 		    new_num, prev_num);
 
 		/* Fall back to the queue count that was in use before. */
 		ena_update_io_rings(adapter, prev_num);
 
 		rc = ena_up(adapter);
 		if (unlikely(rc != 0)) {
 			ena_log(adapter->pdev, ERR,
 			    "Failed to revert to previous setup IO "
 			    "queues. Triggering device reset.\n");
 			ENA_FLAG_SET_ATOMIC(
 			    ENA_FLAG_DEV_UP_BEFORE_RESET, adapter);
 			ena_trigger_reset(adapter,
 			    ENA_REGS_RESET_OS_TRIGGER);
 		}
 	}
 
 	return (rc);
 }
 
 /*
  * Free every Rx buffer that is still attached to the ring of queue @qid:
  * mbufs always, and netmap slots when netmap is enabled and the interface
  * is not dying.
  */
 static void
 ena_free_rx_bufs(struct ena_adapter *adapter, unsigned int qid)
 {
 	struct ena_ring *rx_ring = &adapter->rx_ring[qid];
 	struct ena_rx_buffer *rx_info;
 	unsigned int idx;
 
 	for (idx = 0; idx < rx_ring->ring_size; idx++) {
 		rx_info = &rx_ring->rx_buffer_info[idx];
 
 		if (rx_info->mbuf != NULL) {
 			ena_free_rx_mbuf(adapter, rx_ring, rx_info);
 		}
 #ifdef DEV_NETMAP
 		/* A zero netmap buffer index means no slot is attached. */
 		if (((if_getflags(adapter->ifp) & IFF_DYING) == 0) &&
 		    (if_getcapenable(adapter->ifp) & IFCAP_NETMAP) &&
 		    (rx_info->netmap_buf_idx != 0)) {
 			ena_netmap_free_rx_slot(adapter, rx_ring, rx_info);
 		}
 #endif /* DEV_NETMAP */
 	}
 }
 
 /**
  * ena_refill_all_rx_bufs - allocate all queues Rx buffers
  * @adapter: network interface device structure
  *
  * Asks every Rx ring for ring_size - 1 buffers. A ring that could not be
  * filled completely is only reported with a warning.
  */
 static void
 ena_refill_all_rx_bufs(struct ena_adapter *adapter)
 {
 	struct ena_ring *rx_ring;
 	int qid, wanted, filled;
 
 	for (qid = 0; qid < adapter->num_io_queues; qid++) {
 		rx_ring = &adapter->rx_ring[qid];
 
 		wanted = rx_ring->ring_size - 1;
 		filled = ena_refill_rx_bufs(rx_ring, wanted);
 		if (unlikely(filled != wanted)) {
 			ena_log_io(adapter->pdev, WARN,
 			    "refilling Queue %d failed. "
 			    "Allocated %d buffers from: %d\n",
 			    qid, filled, wanted);
 		}
 #ifdef DEV_NETMAP
 		/* Set regardless of how many buffers were obtained. */
 		rx_ring->initialized = true;
 #endif /* DEV_NETMAP */
 	}
 }
 
 /* Free the Rx buffers of every IO queue of the adapter. */
 static void
 ena_free_all_rx_bufs(struct ena_adapter *adapter)
 {
 	for (int qid = 0; qid < adapter->num_io_queues; qid++) {
 		ena_free_rx_bufs(adapter, qid);
 	}
 }
 
 /**
  * ena_free_tx_bufs - Free Tx Buffers per Queue
  * @adapter: network interface device structure
  * @qid: queue index
  *
  * Walks the Tx ring under the ring lock and releases every mbuf that is
  * still attached to a Tx buffer (i.e. was never completed): its DMA map is
  * synced and unloaded and the whole mbuf chain is freed. The first such
  * buffer is logged as a warning, the remaining ones at debug level.
  **/
 static void
 ena_free_tx_bufs(struct ena_adapter *adapter, unsigned int qid)
 {
 	bool print_once = true;
 	struct ena_ring *tx_ring = &adapter->tx_ring[qid];
 
 	ENA_RING_MTX_LOCK(tx_ring);
 	for (int i = 0; i < tx_ring->ring_size; i++) {
 		struct ena_tx_buffer *tx_info = &tx_ring->tx_buffer_info[i];
 
 		/* Nothing attached to this slot. */
 		if (tx_info->mbuf == NULL)
 			continue;
 
 		if (print_once) {
 			ena_log(adapter->pdev, WARN,
 			    "free uncompleted tx mbuf qid %d idx 0x%x\n", qid,
 			    i);
 			print_once = false;
 		} else {
 			ena_log(adapter->pdev, DBG,
 			    "free uncompleted tx mbuf qid %d idx 0x%x\n", qid,
 			    i);
 		}
 
 		bus_dmamap_sync(adapter->tx_buf_tag, tx_info->dmamap,
 		    BUS_DMASYNC_POSTWRITE);
 		bus_dmamap_unload(adapter->tx_buf_tag, tx_info->dmamap);
 
 		/*
 		 * A Tx packet may consist of several chained mbufs; m_freem()
 		 * releases the whole chain, whereas m_free() would release
 		 * only the first mbuf and leak the rest.
 		 */
 		m_freem(tx_info->mbuf);
 		tx_info->mbuf = NULL;
 	}
 	ENA_RING_MTX_UNLOCK(tx_ring);
 }
 
 /* Free the uncompleted Tx buffers of every IO queue of the adapter. */
 static void
 ena_free_all_tx_bufs(struct ena_adapter *adapter)
 {
 	int qid;
 
 	for (qid = 0; qid < adapter->num_io_queues; qid++) {
 		ena_free_tx_bufs(adapter, qid);
 	}
 }
 
 /* Destroy the device-side Tx IO queue of every IO queue of the adapter. */
 static void
 ena_destroy_all_tx_queues(struct ena_adapter *adapter)
 {
 	uint16_t txq_id;
 	int idx;
 
 	idx = 0;
 	while (idx < adapter->num_io_queues) {
 		/* Translate the queue index into the device Tx queue id. */
 		txq_id = ENA_IO_TXQ_IDX(idx);
 		ena_com_destroy_io_queue(adapter->ena_dev, txq_id);
 		idx++;
 	}
 }
 
 /* Destroy the device-side Rx IO queue of every IO queue of the adapter. */
 static void
 ena_destroy_all_rx_queues(struct ena_adapter *adapter)
 {
 	uint16_t rxq_id;
 	int idx;
 
 	idx = 0;
 	while (idx < adapter->num_io_queues) {
 		/* Translate the queue index into the device Rx queue id. */
 		rxq_id = ENA_IO_RXQ_IDX(idx);
 		ena_com_destroy_io_queue(adapter->ena_dev, rxq_id);
 		idx++;
 	}
 }
 
 /*
  * Tear down all IO queues: first stop and free the cleanup taskqueue of
  * every queue, then destroy the device-side Tx and Rx queues.
  */
 static void
 ena_destroy_all_io_queues(struct ena_adapter *adapter)
 {
 	struct ena_que *que;
 	int qid;
 
 	for (qid = 0; qid < adapter->num_io_queues; qid++) {
 		que = &adapter->que[qid];
 
 		/*
 		 * Retry until taskqueue_cancel() returns 0, draining the
 		 * cleanup task after every non-zero return.
 		 */
 		while (taskqueue_cancel(que->cleanup_tq, &que->cleanup_task,
 		    NULL)) {
 			taskqueue_drain(que->cleanup_tq, &que->cleanup_task);
 		}
 		taskqueue_free(que->cleanup_tq);
 	}
 
 	ena_destroy_all_tx_queues(adapter);
 	ena_destroy_all_rx_queues(adapter);
 }
 
 /*
  * Create the device-side IO queues: one Tx and one Rx queue per IO queue,
  * each bound to the MSI-X vector of its queue index, then create and start
  * the per-queue cleanup taskqueue.
  *
  * Returns 0 on success. On any failure all device queues created so far are
  * destroyed and ENXIO is returned (the underlying rc is only logged).
  */
 static int
 ena_create_io_queues(struct ena_adapter *adapter)
 {
 	struct ena_com_dev *ena_dev = adapter->ena_dev;
 	struct ena_com_create_io_ctx ctx;
 	struct ena_ring *ring;
 	struct ena_que *queue;
 	uint16_t ena_qid;
 	uint32_t msix_vector;
 	cpuset_t *cpu_mask = NULL;
 	int rc, i;
 
 	/* Create TX queues */
 	for (i = 0; i < adapter->num_io_queues; i++) {
 		msix_vector = ENA_IO_IRQ_IDX(i);
 		ena_qid = ENA_IO_TXQ_IDX(i);
 		ctx.mem_queue_type = ena_dev->tx_mem_queue_type;
 		ctx.direction = ENA_COM_IO_QUEUE_DIRECTION_TX;
 		ctx.queue_size = adapter->requested_tx_ring_size;
 		ctx.msix_vector = msix_vector;
 		ctx.qid = ena_qid;
 		ctx.numa_node = adapter->que[i].domain;
 
 		rc = ena_com_create_io_queue(ena_dev, &ctx);
 		if (rc != 0) {
 			ena_log(adapter->pdev, ERR,
 			    "Failed to create io TX queue #%d rc: %d\n", i, rc);
 			goto err_tx;
 		}
 		ring = &adapter->tx_ring[i];
 		rc = ena_com_get_io_handlers(ena_dev, ena_qid,
 		    &ring->ena_com_io_sq, &ring->ena_com_io_cq);
 		if (rc != 0) {
 			ena_log(adapter->pdev, ERR,
 			    "Failed to get TX queue handlers. TX queue num"
 			    " %d rc: %d\n",
 			    i, rc);
 			/*
 			 * Queue i was created above; destroy it here because
 			 * the unwind at err_tx only covers queues below i.
 			 */
 			ena_com_destroy_io_queue(ena_dev, ena_qid);
 			goto err_tx;
 		}
 
 		/* A negative domain means no NUMA node was assigned. */
 		if (ctx.numa_node >= 0) {
 			ena_com_update_numa_node(ring->ena_com_io_cq,
 			    ctx.numa_node);
 		}
 	}
 
 	/* Create RX queues */
 	for (i = 0; i < adapter->num_io_queues; i++) {
 		msix_vector = ENA_IO_IRQ_IDX(i);
 		ena_qid = ENA_IO_RXQ_IDX(i);
 		/* Rx queues always use host memory placement. */
 		ctx.mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST;
 		ctx.direction = ENA_COM_IO_QUEUE_DIRECTION_RX;
 		ctx.queue_size = adapter->requested_rx_ring_size;
 		ctx.msix_vector = msix_vector;
 		ctx.qid = ena_qid;
 		ctx.numa_node = adapter->que[i].domain;
 
 		rc = ena_com_create_io_queue(ena_dev, &ctx);
 		if (unlikely(rc != 0)) {
 			ena_log(adapter->pdev, ERR,
 			    "Failed to create io RX queue[%d] rc: %d\n", i, rc);
 			goto err_rx;
 		}
 
 		ring = &adapter->rx_ring[i];
 		rc = ena_com_get_io_handlers(ena_dev, ena_qid,
 		    &ring->ena_com_io_sq, &ring->ena_com_io_cq);
 		if (unlikely(rc != 0)) {
 			ena_log(adapter->pdev, ERR,
 			    "Failed to get RX queue handlers. RX queue num"
 			    " %d rc: %d\n",
 			    i, rc);
 			/*
 			 * Queue i was created above; destroy it here because
 			 * the unwind at err_rx only covers queues below i.
 			 */
 			ena_com_destroy_io_queue(ena_dev, ena_qid);
 			goto err_rx;
 		}
 
 		if (ctx.numa_node >= 0) {
 			ena_com_update_numa_node(ring->ena_com_io_cq,
 			    ctx.numa_node);
 		}
 	}
 
 	/* One cleanup taskqueue with a single thread per IO queue. */
 	for (i = 0; i < adapter->num_io_queues; i++) {
 		queue = &adapter->que[i];
 
 		NET_TASK_INIT(&queue->cleanup_task, 0, ena_cleanup, queue);
 		queue->cleanup_tq = taskqueue_create_fast("ena cleanup",
 		    M_WAITOK, taskqueue_thread_enqueue, &queue->cleanup_tq);
 
 		/* Without RSS cpu_mask stays NULL. */
 #ifdef RSS
 		cpu_mask = &queue->cpu_mask;
 #endif
 		taskqueue_start_threads_cpuset(&queue->cleanup_tq, 1, PI_NET,
 		    cpu_mask, "%s queue %d cleanup",
 		    device_get_nameunit(adapter->pdev), i);
 	}
 
 	return (0);
 
 err_rx:
 	/* Destroy the Rx queues below the failing index... */
 	while (i--)
 		ena_com_destroy_io_queue(ena_dev, ENA_IO_RXQ_IDX(i));
 	/* ...then fall through to destroy every Tx queue. */
 	i = adapter->num_io_queues;
 err_tx:
 	while (i--)
 		ena_com_destroy_io_queue(ena_dev, ENA_IO_TXQ_IDX(i));
 
 	return (ENXIO);
 }
 
 /*********************************************************************
  *
  *  MSIX & Interrupt Service routine
  *
  **********************************************************************/
 
 /**
  * ena_intr_msix_mgmnt - MSIX Interrupt Handler for admin/async queue
  * @arg: the adapter (struct ena_adapter) the interrupt belongs to
  *
  * Admin queue completions are always processed; asynchronous event queue
  * (AENQ) notifications are handled only while ENA_FLAG_DEVICE_RUNNING is
  * set on the adapter.
  **/
 static void
 ena_intr_msix_mgmnt(void *arg)
 {
 	struct ena_adapter *adapter = arg;
 
 	ena_com_admin_q_comp_intr_handler(adapter->ena_dev);
 
 	if (likely(ENA_FLAG_ISSET(ENA_FLAG_DEVICE_RUNNING, adapter))) {
 		ena_com_aenq_intr_handler(adapter->ena_dev, arg);
 	}
 }
 
 /**
  * ena_handle_msix - MSIX Interrupt Handler for Tx/Rx
  * @arg: queue
  *
  * Defers the work to the queue's cleanup task. Returns FILTER_STRAY when
  * the interface is not running, FILTER_HANDLED otherwise.
  **/
 static int
 ena_handle_msix(void *arg)
 {
 	struct ena_que *queue = arg;
 	if_t ifp = queue->adapter->ifp;
 
 	/* Interrupts that arrive while the interface is down are ignored. */
 	if (unlikely((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0)) {
 		return (FILTER_STRAY);
 	}
 
 	taskqueue_enqueue(queue->cleanup_tq, &queue->cleanup_task);
 	return (FILTER_HANDLED);
 }
 
 /*
  * Allocate the MSI-X entry table and the MSI-X vectors for the adapter.
  *
  * The maximum number of vectors that might be needed is requested. Getting
  * fewer is accepted (and logged) unless only ENA_ADMIN_MSIX_VEC vectors
  * were granted.
  *
  * Returns 0 on success, EINVAL if MSI-X is already enabled and ENOSPC if
  * the vectors could not be allocated; on failure the entry table is freed.
  */
 static int
 ena_enable_msix(struct ena_adapter *adapter)
 {
 	device_t dev = adapter->pdev;
 	int granted, wanted;
 	int idx, rc;
 
 	if (ENA_FLAG_ISSET(ENA_FLAG_MSIX_ENABLED, adapter)) {
 		ena_log(dev, ERR, "Error, MSI-X is already enabled\n");
 		return (EINVAL);
 	}
 
 	/* Reserved the max msix vectors we might need */
 	wanted = ENA_MAX_MSIX_VEC(adapter->max_num_io_queues);
 
 	adapter->msix_entries = malloc(wanted * sizeof(struct msix_entry),
 	    M_DEVBUF, M_WAITOK | M_ZERO);
 
 	ena_log(dev, DBG, "trying to enable MSI-X, vectors: %d\n", wanted);
 
 	/* Entries are numbered from 0, vectors must start from 1. */
 	for (idx = 0; idx < wanted; idx++) {
 		adapter->msix_entries[idx].entry = idx;
 		adapter->msix_entries[idx].vector = idx + 1;
 	}
 
 	/* pci_alloc_msix() updates the count to what was really granted. */
 	granted = wanted;
 	rc = pci_alloc_msix(dev, &granted);
 	if (unlikely(rc != 0)) {
 		ena_log(dev, ERR, "Failed to enable MSIX, vectors %d rc %d\n",
 		    granted, rc);
 
 		rc = ENOSPC;
 		goto fail_free_entries;
 	}
 
 	if (granted != wanted) {
 		if (granted == ENA_ADMIN_MSIX_VEC) {
 			ena_log(dev, ERR,
 			    "Not enough number of MSI-x allocated: %d\n",
 			    granted);
 			pci_release_msi(dev);
 			rc = ENOSPC;
 			goto fail_free_entries;
 		}
 		ena_log(dev, ERR,
 		    "Enable only %d MSI-x (out of %d), reduce "
 		    "the number of queues\n",
 		    granted, wanted);
 	}
 
 	adapter->msix_vecs = granted;
 	ENA_FLAG_SET_ATOMIC(ENA_FLAG_MSIX_ENABLED, adapter);
 
 	return (0);
 
 fail_free_entries:
 	free(adapter->msix_entries, M_DEVBUF);
 	adapter->msix_entries = NULL;
 
 	return (rc);
 }
 
 /*
  * Fill in the management interrupt entry of the adapter's IRQ table: name,
  * handler argument and MSI-X vector.
  */
 static void
 ena_setup_mgmnt_intr(struct ena_adapter *adapter)
 {
 	adapter->irq_tbl[ENA_MGMNT_IRQ_IDX].vector =
 	    adapter->msix_entries[ENA_MGMNT_IRQ_IDX].vector;
 	adapter->irq_tbl[ENA_MGMNT_IRQ_IDX].data = adapter;
 
 	/*
 	 * Handler is NULL on purpose, it will be set
 	 * when mgmnt interrupt is acquired
 	 */
 	adapter->irq_tbl[ENA_MGMNT_IRQ_IDX].handler = NULL;
 
 	snprintf(adapter->irq_tbl[ENA_MGMNT_IRQ_IDX].name, ENA_IRQNAME_SIZE,
 	    "ena-mgmnt@pci:%s", device_get_nameunit(adapter->pdev));
 }
 
 /*
  * Fill in the irq_tbl entry (name, handler, handler argument, vector) for
  * every I/O queue.  With RSS compiled in, each queue is also assigned a CPU
  * from consecutive RSS buckets and the index of the memory domain whose CPU
  * set contains that CPU; without RSS the queue's domain is set to -1.
  *
  * Returns EINVAL if adapter->msix_entries has not been allocated,
  * 0 otherwise.
  */
 static int
 ena_setup_io_intr(struct ena_adapter *adapter)
 {
 #ifdef RSS
 	int num_buckets = rss_getnumbuckets();
 	/* Function-static: keeps its value between calls. */
 	static int last_bind = 0;
 	int cur_bind;
 	int idx;
 #endif
 	int irq_idx;
 
 	if (adapter->msix_entries == NULL)
 		return (EINVAL);
 
 #ifdef RSS
 	/*
 	 * A negative first_bind means no starting bucket has been chosen for
 	 * this adapter yet: take the current one and advance the shared
 	 * counter by this adapter's queue count (modulo the bucket count).
 	 */
 	if (adapter->first_bind < 0) {
 		adapter->first_bind = last_bind;
 		last_bind = (last_bind + adapter->num_io_queues) % num_buckets;
 	}
 	cur_bind = adapter->first_bind;
 #endif
 
 	for (int i = 0; i < adapter->num_io_queues; i++) {
 		irq_idx = ENA_IO_IRQ_IDX(i);
 
 		snprintf(adapter->irq_tbl[irq_idx].name, ENA_IRQNAME_SIZE,
 		    "%s-TxRx-%d", device_get_nameunit(adapter->pdev), i);
 		adapter->irq_tbl[irq_idx].handler = ena_handle_msix;
 		adapter->irq_tbl[irq_idx].data = &adapter->que[i];
 		adapter->irq_tbl[irq_idx].vector =
 		    adapter->msix_entries[irq_idx].vector;
 		ena_log(adapter->pdev, DBG, "ena_setup_io_intr vector: %d\n",
 		    adapter->msix_entries[irq_idx].vector);
 
 #ifdef RSS
 		/* Same CPU is recorded for both the queue and its IRQ. */
 		adapter->que[i].cpu = adapter->irq_tbl[irq_idx].cpu =
 		    rss_getcpu(cur_bind);
 		cur_bind = (cur_bind + 1) % num_buckets;
 		CPU_SETOF(adapter->que[i].cpu, &adapter->que[i].cpu_mask);
 
 		/*
 		 * Find the first domain whose CPU set contains the queue's
 		 * CPU; idx ends up as MAXMEMDOM if none matches.
 		 */
 		for (idx = 0; idx < MAXMEMDOM; ++idx) {
 			if (CPU_ISSET(adapter->que[i].cpu, &cpuset_domain[idx]))
 				break;
 		}
 		adapter->que[i].domain = idx;
 #else
 		adapter->que[i].domain = -1;
 #endif
 	}
 
 	return (0);
 }
 
 static int
 ena_request_mgmnt_irq(struct ena_adapter *adapter)
 {
 	device_t pdev = adapter->pdev;
 	struct ena_irq *irq;
 	unsigned long flags;
 	int rc, rcc;
 
 	flags = RF_ACTIVE | RF_SHAREABLE;
 
 	irq = &adapter->irq_tbl[ENA_MGMNT_IRQ_IDX];
 	irq->res = bus_alloc_resource_any(adapter->pdev, SYS_RES_IRQ,
 	    &irq->vector, flags);
 
 	if (unlikely(irq->res == NULL)) {
 		ena_log(pdev, ERR, "could not allocate irq vector: %d\n",
 		    irq->vector);
 		return (ENXIO);
 	}
 
 	rc = bus_setup_intr(adapter->pdev, irq->res,
 	    INTR_TYPE_NET | INTR_MPSAFE, NULL, ena_intr_msix_mgmnt, irq->data,
 	    &irq->cookie);
 	if (unlikely(rc != 0)) {
 		ena_log(pdev, ERR,
 		    "failed to register interrupt handler for irq %ju: %d\n",
 		    rman_get_start(irq->res), rc);
 		goto err_res_free;
 	}
 	irq->requested = true;
 
 	return (rc);
 
 err_res_free:
 	ena_log(pdev, INFO, "releasing resource for irq %d\n", irq->vector);
 	rcc = bus_release_resource(adapter->pdev, SYS_RES_IRQ, irq->vector,
 	    irq->res);
 	if (unlikely(rcc != 0))
 		ena_log(pdev, ERR,
 		    "dev has no parent while releasing res for irq: %d\n",
 		    irq->vector);
 	irq->res = NULL;
 
 	return (rc);
 }
 
 /*
  * Allocate an IRQ resource and install the interrupt handler for each I/O
  * vector in [ENA_IO_IRQ_FIRST_IDX, adapter->msix_vecs).  Entries already
  * marked as requested are skipped.  With RSS compiled in, each interrupt is
  * additionally bound to irq->cpu.
  *
  * Returns 0 on success, EINVAL if MSI-X is not enabled, ENOMEM if an IRQ
  * resource cannot be allocated, or the error from bus_setup_intr() /
  * bus_bind_intr().  On failure the entries from the failing index down to
  * ENA_IO_IRQ_FIRST_IDX are torn down and released.
  */
 static int
 ena_request_io_irq(struct ena_adapter *adapter)
 {
 	device_t pdev = adapter->pdev;
 	struct ena_irq *irq;
 	unsigned long flags = 0;
 	int rc = 0, i, rcc;
 
 	if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_MSIX_ENABLED, adapter))) {
 		ena_log(pdev, ERR,
 		    "failed to request I/O IRQ: MSI-X is not enabled\n");
 		return (EINVAL);
 	} else {
 		flags = RF_ACTIVE | RF_SHAREABLE;
 	}
 
 	for (i = ENA_IO_IRQ_FIRST_IDX; i < adapter->msix_vecs; i++) {
 		irq = &adapter->irq_tbl[i];
 
 		if (unlikely(irq->requested))
 			continue;
 
 		irq->res = bus_alloc_resource_any(adapter->pdev, SYS_RES_IRQ,
 		    &irq->vector, flags);
 		if (unlikely(irq->res == NULL)) {
 			rc = ENOMEM;
 			ena_log(pdev, ERR,
 			    "could not allocate irq vector: %d\n", irq->vector);
 			goto err;
 		}
 
 		/*
 		 * irq->handler goes in the argument slot before the NULL one,
 		 * the reverse of the management IRQ setup, which passes NULL
 		 * first and its handler second.
 		 */
 		rc = bus_setup_intr(adapter->pdev, irq->res,
 		    INTR_TYPE_NET | INTR_MPSAFE, irq->handler, NULL, irq->data,
 		    &irq->cookie);
 		if (unlikely(rc != 0)) {
 			ena_log(pdev, ERR,
 			    "failed to register interrupt handler for irq %ju: %d\n",
 			    rman_get_start(irq->res), rc);
 			goto err;
 		}
 		irq->requested = true;
 
 #ifdef RSS
 		rc = bus_bind_intr(adapter->pdev, irq->res, irq->cpu);
 		if (unlikely(rc != 0)) {
 			ena_log(pdev, ERR,
 			    "failed to bind interrupt handler for irq %ju to cpu %d: %d\n",
 			    rman_get_start(irq->res), irq->cpu, rc);
 			goto err;
 		}
 
 		ena_log(pdev, INFO, "queue %d - cpu %d\n",
 		    i - ENA_IO_IRQ_FIRST_IDX, irq->cpu);
 #endif
 	}
 
 	return (rc);
 
 err:
 
 	/*
 	 * Unwind starting at the index that failed (inclusive) and walk
 	 * back to the first I/O IRQ; rc keeps the original failure code.
 	 */
 	for (; i >= ENA_IO_IRQ_FIRST_IDX; i--) {
 		irq = &adapter->irq_tbl[i];
 		rcc = 0;
 
 		/* Once we entered err: section and irq->requested is true we
 		   free both intr and resources */
 		if (irq->requested)
 			rcc = bus_teardown_intr(adapter->pdev, irq->res,
 			    irq->cookie);
 		if (unlikely(rcc != 0))
 			ena_log(pdev, ERR,
 			    "could not release irq: %d, error: %d\n",
 			    irq->vector, rcc);
 
 		/* If we entered err: section without irq->requested set we know
 		   it was bus_alloc_resource_any() that needs cleanup, provided
 		   res is not NULL. In case res is NULL no work in needed in
 		   this iteration */
 		rcc = 0;
 		if (irq->res != NULL) {
 			rcc = bus_release_resource(adapter->pdev, SYS_RES_IRQ,
 			    irq->vector, irq->res);
 		}
 		if (unlikely(rcc != 0))
 			ena_log(pdev, ERR,
 			    "dev has no parent while releasing res for irq: %d\n",
 			    irq->vector);
 		irq->requested = false;
 		irq->res = NULL;
 	}
 
 	return (rc);
 }
 
 static void
 ena_free_mgmnt_irq(struct ena_adapter *adapter)
 {
 	device_t pdev = adapter->pdev;
 	struct ena_irq *irq;
 	int rc;
 
 	irq = &adapter->irq_tbl[ENA_MGMNT_IRQ_IDX];
 	if (irq->requested) {
 		ena_log(pdev, DBG, "tear down irq: %d\n", irq->vector);
 		rc = bus_teardown_intr(adapter->pdev, irq->res, irq->cookie);
 		if (unlikely(rc != 0))
 			ena_log(pdev, ERR, "failed to tear down irq: %d\n",
 			    irq->vector);
 		irq->requested = 0;
 	}
 
 	if (irq->res != NULL) {
 		ena_log(pdev, DBG, "release resource irq: %d\n", irq->vector);
 		rc = bus_release_resource(adapter->pdev, SYS_RES_IRQ,
 		    irq->vector, irq->res);
 		irq->res = NULL;
 		if (unlikely(rc != 0))
 			ena_log(pdev, ERR,
 			    "dev has no parent while releasing res for irq: %d\n",
 			    irq->vector);
 	}
 }
 
 static void
 ena_free_io_irq(struct ena_adapter *adapter)
 {
 	device_t pdev = adapter->pdev;
 	struct ena_irq *irq;
 	int rc;
 
 	for (int i = ENA_IO_IRQ_FIRST_IDX; i < adapter->msix_vecs; i++) {
 		irq = &adapter->irq_tbl[i];
 		if (irq->requested) {
 			ena_log(pdev, DBG, "tear down irq: %d\n", irq->vector);
 			rc = bus_teardown_intr(adapter->pdev, irq->res,
 			    irq->cookie);
 			if (unlikely(rc != 0)) {
 				ena_log(pdev, ERR,
 				    "failed to tear down irq: %d\n",
 				    irq->vector);
 			}
 			irq->requested = 0;
 		}
 
 		if (irq->res != NULL) {
 			ena_log(pdev, DBG, "release resource irq: %d\n",
 			    irq->vector);
 			rc = bus_release_resource(adapter->pdev, SYS_RES_IRQ,
 			    irq->vector, irq->res);
 			irq->res = NULL;
 			if (unlikely(rc != 0)) {
 				ena_log(pdev, ERR,
 				    "dev has no parent while releasing res for irq: %d\n",
 				    irq->vector);
 			}
 		}
 	}
 }
 
 /*
  * Release all interrupts of the adapter: the I/O IRQs first, then the
  * management IRQ, and finally the MSI-X allocation itself.
  */
 static void
 ena_free_irqs(struct ena_adapter *adapter)
 {
 	ena_free_io_irq(adapter);
 	ena_free_mgmnt_irq(adapter);
 	ena_disable_msix(adapter);
 }
 
 /*
  * Release the MSI-X vectors if ENA_FLAG_MSIX_ENABLED is set (clearing the
  * flag), then reset the vector count and free the msix_entries array.
  */
 static void
 ena_disable_msix(struct ena_adapter *adapter)
 {
 	if (ENA_FLAG_ISSET(ENA_FLAG_MSIX_ENABLED, adapter)) {
 		ENA_FLAG_CLEAR_ATOMIC(ENA_FLAG_MSIX_ENABLED, adapter);
 		pci_release_msi(adapter->pdev);
 	}
 
 	/* Done unconditionally, whether or not the flag was set. */
 	adapter->msix_vecs = 0;
 	free(adapter->msix_entries, M_DEVBUF);
 	adapter->msix_entries = NULL;
 }
 
 static void
 ena_unmask_all_io_irqs(struct ena_adapter *adapter)
 {
 	struct ena_com_io_cq *io_cq;
 	struct ena_eth_io_intr_reg intr_reg;
 	struct ena_ring *tx_ring;
 	uint16_t ena_qid;
 	int i;
 
 	/* Unmask interrupts for all queues */
 	for (i = 0; i < adapter->num_io_queues; i++) {
 		ena_qid = ENA_IO_TXQ_IDX(i);
 		io_cq = &adapter->ena_dev->io_cq_queues[ena_qid];
 		ena_com_update_intr_reg(&intr_reg, 0, 0, true);
 		tx_ring = &adapter->tx_ring[i];
 		counter_u64_add(tx_ring->tx_stats.unmask_interrupt_num, 1);
 		ena_com_unmask_intr(io_cq, &intr_reg);
 	}
 }
 
 static int
 ena_up_complete(struct ena_adapter *adapter)
 {
 	int rc;
 
 	if (likely(ENA_FLAG_ISSET(ENA_FLAG_RSS_ACTIVE, adapter))) {
 		rc = ena_rss_configure(adapter);
 		if (rc != 0) {
 			ena_log(adapter->pdev, ERR,
 			    "Failed to configure RSS\n");
 			return (rc);
 		}
 	}
 
 	rc = ena_change_mtu(adapter->ifp, if_getmtu(adapter->ifp));
 	if (unlikely(rc != 0))
 		return (rc);
 
 	ena_refill_all_rx_bufs(adapter);
 	ena_reset_counters((counter_u64_t *)&adapter->hw_stats,
 	    sizeof(adapter->hw_stats));
 
 	return (0);
 }
 
 /*
  * Store the given Tx and Rx ring sizes in every I/O queue's Tx and Rx ring.
  */
 static void
 set_io_rings_size(struct ena_adapter *adapter, int new_tx_size, int new_rx_size)
 {
 	for (int qid = 0; qid < adapter->num_io_queues; qid++) {
 		adapter->rx_ring[qid].ring_size = new_rx_size;
 		adapter->tx_ring[qid].ring_size = new_tx_size;
 	}
 }
 
 /*
  * Allocate the Tx/Rx resources and create the I/O queues, starting from the
  * requested ring sizes.  When a step fails with ENOMEM, the larger ring (or
  * both, if they are equal) is halved and the whole sequence is retried,
  * until a new size would drop below ENA_MIN_RING_SIZE.
  *
  * Returns 0 on success, or the error of the failing step.
  */
 static int
 create_queues_with_size_backoff(struct ena_adapter *adapter)
 {
 	device_t pdev = adapter->pdev;
 	int rc;
 	uint32_t cur_rx_ring_size, cur_tx_ring_size;
 	uint32_t new_rx_ring_size, new_tx_ring_size;
 
 	/*
 	 * Current queue sizes might be set to smaller than the requested
 	 * ones due to past queue allocation failures.
 	 */
 	set_io_rings_size(adapter, adapter->requested_tx_ring_size,
 	    adapter->requested_rx_ring_size);
 
 	while (1) {
 		/* Allocate transmit descriptors */
 		rc = ena_setup_all_tx_resources(adapter);
 		if (unlikely(rc != 0)) {
 			ena_log(pdev, ERR, "err_setup_tx\n");
 			goto err_setup_tx;
 		}
 
 		/* Allocate receive descriptors */
 		rc = ena_setup_all_rx_resources(adapter);
 		if (unlikely(rc != 0)) {
 			ena_log(pdev, ERR, "err_setup_rx\n");
 			goto err_setup_rx;
 		}
 
 		/* Create IO queues for Rx & Tx */
 		rc = ena_create_io_queues(adapter);
 		if (unlikely(rc != 0)) {
 			ena_log(pdev, ERR, "create IO queues failed\n");
 			goto err_io_que;
 		}
 
 		return (0);
 
 		/*
 		 * The labels below undo the steps of this iteration in
 		 * reverse order and then fall through to the retry logic.
 		 */
 err_io_que:
 		ena_free_all_rx_resources(adapter);
 err_setup_rx:
 		ena_free_all_tx_resources(adapter);
 err_setup_tx:
 		/*
 		 * Lower the ring size if ENOMEM. Otherwise, return the
 		 * error straightaway.
 		 */
 		if (unlikely(rc != ENOMEM)) {
 			ena_log(pdev, ERR,
 			    "Queue creation failed with error code: %d\n", rc);
 			return (rc);
 		}
 
 		/* All rings share one size, so ring 0 is representative. */
 		cur_tx_ring_size = adapter->tx_ring[0].ring_size;
 		cur_rx_ring_size = adapter->rx_ring[0].ring_size;
 
 		ena_log(pdev, ERR,
 		    "Not enough memory to create queues with sizes TX=%d, RX=%d\n",
 		    cur_tx_ring_size, cur_rx_ring_size);
 
 		new_tx_ring_size = cur_tx_ring_size;
 		new_rx_ring_size = cur_rx_ring_size;
 
 		/*
 		 * Decrease the size of a larger queue, or decrease both if they
 		 * are the same size.
 		 */
 		if (cur_rx_ring_size <= cur_tx_ring_size)
 			new_tx_ring_size = cur_tx_ring_size / 2;
 		if (cur_rx_ring_size >= cur_tx_ring_size)
 			new_rx_ring_size = cur_rx_ring_size / 2;
 
 		if (new_tx_ring_size < ENA_MIN_RING_SIZE ||
 		    new_rx_ring_size < ENA_MIN_RING_SIZE) {
 			/*
 			 * The literals are concatenated by the compiler, so
 			 * the first one must end with a space.
 			 */
 			ena_log(pdev, ERR,
 			    "Queue creation failed with the smallest possible queue size "
 			    "of %d for both queues. Not retrying with smaller queues\n",
 			    ENA_MIN_RING_SIZE);
 			return (rc);
 		}
 
 		ena_log(pdev, INFO,
 		    "Retrying queue creation with sizes TX=%d, RX=%d\n",
 		    new_tx_ring_size, new_rx_ring_size);
 
 		set_io_rings_size(adapter, new_tx_ring_size, new_rx_ring_size);
 	}
 }
 
 /*
  * Bring the interface up: set up and request the I/O interrupts, create the
  * I/O queues (with ring-size backoff), finish configuration through
  * ena_up_complete(), mark the device as running and unmask the I/O
  * interrupts.
  *
  * ENA_LOCK_ASSERT() is evaluated on entry; the callers in this file take
  * ENA_LOCK_LOCK() around the call.
  *
  * Returns 0 on success or if the device is already up, ENXIO if the device
  * is not attached, or the error of the step that failed.  On failure the
  * steps completed so far are undone.
  */
 int
 ena_up(struct ena_adapter *adapter)
 {
 	int rc = 0;
 
 	ENA_LOCK_ASSERT();
 
 	if (unlikely(device_is_attached(adapter->pdev) == 0)) {
 		ena_log(adapter->pdev, ERR, "device is not attached!\n");
 		return (ENXIO);
 	}
 
 	/* Already up: nothing to do. */
 	if (ENA_FLAG_ISSET(ENA_FLAG_DEV_UP, adapter))
 		return (0);
 
 	ena_log(adapter->pdev, INFO, "device is going UP\n");
 
 	/* setup interrupts for IO queues */
 	rc = ena_setup_io_intr(adapter);
 	if (unlikely(rc != 0)) {
 		ena_log(adapter->pdev, ERR, "error setting up IO interrupt\n");
 		goto error;
 	}
 	rc = ena_request_io_irq(adapter);
 	if (unlikely(rc != 0)) {
 		ena_log(adapter->pdev, ERR, "err_req_irq\n");
 		goto error;
 	}
 
 	ena_log(adapter->pdev, INFO,
 	    "Creating %u IO queues. Rx queue size: %d, Tx queue size: %d, LLQ is %s\n",
 	    adapter->num_io_queues,
 	    adapter->requested_rx_ring_size,
 	    adapter->requested_tx_ring_size,
 	    (adapter->ena_dev->tx_mem_queue_type ==
 		ENA_ADMIN_PLACEMENT_POLICY_DEV) ? "ENABLED" : "DISABLED");
 
 	rc = create_queues_with_size_backoff(adapter);
 	if (unlikely(rc != 0)) {
 		ena_log(adapter->pdev, ERR,
 		    "error creating queues with size backoff\n");
 		goto err_create_queues_with_backoff;
 	}
 
 	if (ENA_FLAG_ISSET(ENA_FLAG_LINK_UP, adapter))
 		if_link_state_change(adapter->ifp, LINK_STATE_UP);
 
 	rc = ena_up_complete(adapter);
 	if (unlikely(rc != 0))
 		goto err_up_complete;
 
 	counter_u64_add(adapter->dev_stats.interface_up, 1);
 
 	ena_update_hwassist(adapter);
 
 	/* Set IFF_DRV_RUNNING and clear IFF_DRV_OACTIVE. */
 	if_setdrvflagbits(adapter->ifp, IFF_DRV_RUNNING, IFF_DRV_OACTIVE);
 
 	ENA_FLAG_SET_ATOMIC(ENA_FLAG_DEV_UP, adapter);
 
 	/* Interrupts are unmasked only after the device is marked up. */
 	ena_unmask_all_io_irqs(adapter);
 
 	return (0);
 
 	/* Unwind in the reverse order of the setup steps above. */
 err_up_complete:
 	ena_destroy_all_io_queues(adapter);
 	ena_free_all_rx_resources(adapter);
 	ena_free_all_tx_resources(adapter);
 err_create_queues_with_backoff:
 	ena_free_io_irq(adapter);
 error:
 	return (rc);
 }
 
 /*
  * Interface counter callback: packet, byte and queue-drop counters are read
  * from the adapter's hw_stats; every other counter is delegated to
  * if_get_counter_default().
  */
 static uint64_t
 ena_get_counter(if_t ifp, ift_counter cnt)
 {
 	struct ena_adapter *adapter = if_getsoftc(ifp);
 	struct ena_hw_stats *hw = &adapter->hw_stats;
 	counter_u64_t selected;
 
 	switch (cnt) {
 	case IFCOUNTER_IPACKETS:
 		selected = hw->rx_packets;
 		break;
 	case IFCOUNTER_OPACKETS:
 		selected = hw->tx_packets;
 		break;
 	case IFCOUNTER_IBYTES:
 		selected = hw->rx_bytes;
 		break;
 	case IFCOUNTER_OBYTES:
 		selected = hw->tx_bytes;
 		break;
 	case IFCOUNTER_IQDROPS:
 		selected = hw->rx_drops;
 		break;
 	case IFCOUNTER_OQDROPS:
 		selected = hw->tx_drops;
 		break;
 	default:
 		return (if_get_counter_default(ifp, cnt));
 	}
 
 	return (counter_u64_fetch(selected));
 }
 
 /*
  * ifmedia change callback.  Reports success for any request without
  * touching the device or the interface.
  */
 static int
 ena_media_change(if_t ifp)
 {
 	/* Media Change is not supported by firmware */
 	return (0);
 }
 
 /*
  * ifmedia status callback.  Under the ENA lock, reports a valid Ethernet
  * medium and, when the link is up, marks it active as full duplex with an
  * unknown media subtype.
  */
 static void
 ena_media_status(if_t ifp, struct ifmediareq *ifmr)
 {
 	struct ena_adapter *adapter = if_getsoftc(ifp);
 	bool link_up;
 
 	ena_log(adapter->pdev, DBG, "Media status update\n");
 
 	ENA_LOCK_LOCK();
 
 	ifmr->ifm_status = IFM_AVALID;
 	ifmr->ifm_active = IFM_ETHER;
 
 	link_up = ENA_FLAG_ISSET(ENA_FLAG_LINK_UP, adapter);
 	if (link_up) {
 		ifmr->ifm_status |= IFM_ACTIVE;
 		ifmr->ifm_active |= IFM_UNKNOWN | IFM_FDX;
 	}
 
 	ENA_LOCK_UNLOCK();
 
 	/* The link-down message is emitted after the lock is dropped. */
 	if (!link_up)
 		ena_log(adapter->pdev, INFO, "Link is down\n");
 }
 
 /*
  * Interface init callback: bring the device up under the ENA lock unless it
  * is already flagged as up.  The result of ena_up() is not reported.
  */
 static void
 ena_init(void *arg)
 {
 	struct ena_adapter *adapter = arg;
 
 	if (ENA_FLAG_ISSET(ENA_FLAG_DEV_UP, adapter))
 		return;
 
 	ENA_LOCK_LOCK();
 	ena_up(adapter);
 	ENA_LOCK_UNLOCK();
 }
 
 /*
  * Interface ioctl handler.
  *
  * Handles MTU, interface-flag, multicast, media and capability requests;
  * everything else is passed to ether_ioctl().  Requests that restart the
  * device take the ENA lock around the down/up sequence.
  *
  * Returns 0 on success or an errno value.  For SIOCSIFMTU a failure of
  * ena_up() takes precedence; otherwise a failure of ena_change_mtu() is
  * reported to the caller instead of being silently dropped.
  */
 static int
 ena_ioctl(if_t ifp, u_long command, caddr_t data)
 {
 	struct ena_adapter *adapter;
 	struct ifreq *ifr;
 	int rc, mtu_rc;
 
 	adapter = if_getsoftc(ifp);
 	ifr = (struct ifreq *)data;
 
 	/*
 	 * Acquiring lock to prevent from running up and down routines parallel.
 	 */
 	rc = 0;
 	switch (command) {
 	case SIOCSIFMTU:
 		/* Nothing to do when the MTU is unchanged. */
 		if (if_getmtu(ifp) == ifr->ifr_mtu)
 			break;
 		ENA_LOCK_LOCK();
 		ena_down(adapter);
 
 		mtu_rc = ena_change_mtu(ifp, ifr->ifr_mtu);
 
 		/*
 		 * The device is brought back up even if the MTU change was
 		 * rejected; the MTU error is reported only when the restart
 		 * itself succeeded.
 		 */
 		rc = ena_up(adapter);
 		if (rc == 0)
 			rc = mtu_rc;
 		ENA_LOCK_UNLOCK();
 		break;
 
 	case SIOCSIFFLAGS:
 		if ((if_getflags(ifp) & IFF_UP) != 0) {
 			if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) != 0) {
 				/* Already running: only log the request. */
 				if ((if_getflags(ifp) & (IFF_PROMISC |
 				    IFF_ALLMULTI)) != 0) {
 					ena_log(adapter->pdev, INFO,
 					    "ioctl promisc/allmulti\n");
 				}
 			} else {
 				ENA_LOCK_LOCK();
 				rc = ena_up(adapter);
 				ENA_LOCK_UNLOCK();
 			}
 		} else {
 			if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) != 0) {
 				ENA_LOCK_LOCK();
 				ena_down(adapter);
 				ENA_LOCK_UNLOCK();
 			}
 		}
 		break;
 
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		/* Accepted without any action. */
 		break;
 
 	case SIOCSIFMEDIA:
 	case SIOCGIFMEDIA:
 		rc = ifmedia_ioctl(ifp, ifr, &adapter->media, command);
 		break;
 
 	case SIOCSIFCAP:
 		{
 			int reinit = 0;
 
 			if (ifr->ifr_reqcap != if_getcapenable(ifp)) {
 				if_setcapenable(ifp, ifr->ifr_reqcap);
 				reinit = 1;
 			}
 
 			/* Restart a running device to apply the change. */
 			if ((reinit != 0) &&
 			    ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) != 0)) {
 				ENA_LOCK_LOCK();
 				ena_down(adapter);
 				rc = ena_up(adapter);
 				ENA_LOCK_UNLOCK();
 			}
 		}
 
 		break;
 	default:
 		rc = ether_ioctl(ifp, command, data);
 		break;
 	}
 
 	return (rc);
 }
 
 /*
  * Translate the device's offload feature bits into IFCAP_* capability
  * flags.  IFCAP_LRO and IFCAP_JUMBO_MTU are always included.
  */
 static int
 ena_get_dev_offloads(struct ena_com_dev_get_features_ctx *feat)
 {
 	int caps = IFCAP_LRO | IFCAP_JUMBO_MTU;
 
 	/* Transmit checksum offloads. */
 	if (feat->offload.tx &
 	    (ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV4_CSUM_FULL_MASK |
 	    ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV4_CSUM_PART_MASK |
 	    ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L3_CSUM_IPV4_MASK))
 		caps |= IFCAP_TXCSUM;
 
 	if (feat->offload.tx &
 	    (ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV6_CSUM_FULL_MASK |
 	    ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV6_CSUM_PART_MASK))
 		caps |= IFCAP_TXCSUM_IPV6;
 
 	/* TCP segmentation offloads. */
 	if (feat->offload.tx & ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_IPV4_MASK)
 		caps |= IFCAP_TSO4;
 
 	if (feat->offload.tx & ENA_ADMIN_FEATURE_OFFLOAD_DESC_TSO_IPV6_MASK)
 		caps |= IFCAP_TSO6;
 
 	/* Receive checksum offloads. */
 	if (feat->offload.rx_supported &
 	    (ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_L4_IPV4_CSUM_MASK |
 	    ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_L3_CSUM_IPV4_MASK))
 		caps |= IFCAP_RXCSUM;
 
 	if (feat->offload.rx_supported &
 	    ENA_ADMIN_FEATURE_OFFLOAD_DESC_RX_L4_IPV6_CSUM_MASK)
 		caps |= IFCAP_RXCSUM_IPV6;
 
 	return (caps);
 }
 
 /*
  * Publish the interface's capability bits in the first word of the host
  * info's supported_network_features array.
  */
 static void
 ena_update_host_info(struct ena_admin_host_info *host_info, if_t ifp)
 {
 	uint32_t ifcaps = (uint32_t)if_getcapabilities(ifp);
 
 	host_info->supported_network_features[0] = ifcaps;
 }
 
 /*
  * Recompute the interface's hwassist (CSUM_*) bits from the currently
  * enabled capabilities and the adapter's Tx offload capabilities.
  */
 static void
 ena_update_hwassist(struct ena_adapter *adapter)
 {
 	if_t ifp = adapter->ifp;
 	uint32_t tx_caps = adapter->tx_offload_cap;
 	int enabled = if_getcapenable(ifp);
 	int hwassist = 0;
 
 	/* Start from a clean slate; the new bits are set at the end. */
 	if_clearhwassist(ifp);
 
 	if (enabled & IFCAP_TXCSUM) {
 		if (tx_caps &
 		    ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L3_CSUM_IPV4_MASK)
 			hwassist |= CSUM_IP;
 		if (tx_caps &
 		    (ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV4_CSUM_FULL_MASK |
 		    ENA_ADMIN_FEATURE_OFFLOAD_DESC_TX_L4_IPV4_CSUM_PART_MASK))
 			hwassist |= CSUM_IP_UDP | CSUM_IP_TCP;
 	}
 
 	if (enabled & IFCAP_TXCSUM_IPV6)
 		hwassist |= CSUM_IP6_UDP | CSUM_IP6_TCP;
 
 	if (enabled & IFCAP_TSO4)
 		hwassist |= CSUM_IP_TSO;
 
 	if (enabled & IFCAP_TSO6)
 		hwassist |= CSUM_IP6_TSO;
 
 	if_sethwassistbits(ifp, hwassist, 0);
 }
 
 /*
  * Create and configure the network interface for the adapter: obtain the
  * ifnet handle, install the driver callbacks, advertise the offload
  * capabilities derived from 'feat', set the TSO limits and the media, and
  * attach it as an Ethernet interface using adapter->mac_addr.
  *
  * Returns 0 on success or ENXIO if the ifnet handle cannot be obtained.
  */
 static int
 ena_setup_ifnet(device_t pdev, struct ena_adapter *adapter,
     struct ena_com_dev_get_features_ctx *feat)
 {
 	if_t ifp;
 	int caps = 0;
 
 	ifp = adapter->ifp = if_gethandle(IFT_ETHER);
 	if (unlikely(ifp == NULL)) {
 		ena_log(pdev, ERR, "can not allocate ifnet structure\n");
 		return (ENXIO);
 	}
 	if_initname(ifp, device_get_name(pdev), device_get_unit(pdev));
 	if_setdev(ifp, pdev);
 	if_setsoftc(ifp, adapter);
 
-	if_setflags(ifp,
-	    IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST | IFF_KNOWSEPOCH);
+	if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST);
 	/* Driver entry points used by the network stack. */
 	if_setinitfn(ifp, ena_init);
 	if_settransmitfn(ifp, ena_mq_start);
 	if_setqflushfn(ifp, ena_qflush);
 	if_setioctlfn(ifp, ena_ioctl);
 	if_setgetcounterfn(ifp, ena_get_counter);
 
 	if_setsendqlen(ifp, adapter->requested_tx_ring_size);
 	if_setsendqready(ifp);
 	if_setmtu(ifp, ETHERMTU);
 	if_setbaudrate(ifp, 0);
 	/* Zeroize capabilities... */
 	if_setcapabilities(ifp, 0);
 	if_setcapenable(ifp, 0);
 	/* check hardware support */
 	caps = ena_get_dev_offloads(feat);
 	/* ... and set them */
 	if_setcapabilitiesbit(ifp, caps, 0);
 
 	/* TSO parameters */
 	if_sethwtsomax(ifp, ENA_TSO_MAXSIZE -
 	    (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN));
 	if_sethwtsomaxsegcount(ifp, adapter->max_tx_sgl_size - 1);
 	if_sethwtsomaxsegsize(ifp, ENA_TSO_MAXSIZE);
 
 	if_setifheaderlen(ifp, sizeof(struct ether_vlan_header));
 	/* Enable every capability that was advertised above. */
 	if_setcapenable(ifp, if_getcapabilities(ifp));
 
 	/*
 	 * Specify the media types supported by this adapter and register
 	 * callbacks to update media and link information
 	 */
 	ifmedia_init(&adapter->media, IFM_IMASK, ena_media_change,
 	    ena_media_status);
 	ifmedia_add(&adapter->media, IFM_ETHER | IFM_AUTO, 0, NULL);
 	ifmedia_set(&adapter->media, IFM_ETHER | IFM_AUTO);
 
 	ether_ifattach(ifp, adapter->mac_addr);
 
 	return (0);
 }
 
 /*
  * Take the interface down: clear the DEV_UP flag and the running state,
  * free the I/O IRQs, reset the device if a reset was triggered, then
  * destroy the I/O queues and free the Tx/Rx buffers and resources.
  * Does nothing if the device is not flagged as up.
  *
  * ENA_LOCK_ASSERT() is evaluated on entry; the callers in this file take
  * ENA_LOCK_LOCK() around the call.
  */
 void
 ena_down(struct ena_adapter *adapter)
 {
 	int rc;
 
 	ENA_LOCK_ASSERT();
 
 	if (!ENA_FLAG_ISSET(ENA_FLAG_DEV_UP, adapter))
 		return;
 
 	ena_log(adapter->pdev, INFO, "device is going DOWN\n");
 
 	ENA_FLAG_CLEAR_ATOMIC(ENA_FLAG_DEV_UP, adapter);
 	/* Set IFF_DRV_OACTIVE and clear IFF_DRV_RUNNING. */
 	if_setdrvflagbits(adapter->ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
 
 	ena_free_io_irq(adapter);
 
 	/* A failed reset is only logged; teardown continues regardless. */
 	if (ENA_FLAG_ISSET(ENA_FLAG_TRIGGER_RESET, adapter)) {
 		rc = ena_com_dev_reset(adapter->ena_dev, adapter->reset_reason);
 		if (unlikely(rc != 0))
 			ena_log(adapter->pdev, ERR, "Device reset failed\n");
 	}
 
 	ena_destroy_all_io_queues(adapter);
 
 	ena_free_all_tx_bufs(adapter);
 	ena_free_all_rx_bufs(adapter);
 	ena_free_all_tx_resources(adapter);
 	ena_free_all_rx_resources(adapter);
 
 	counter_u64_add(adapter->dev_stats.interface_down, 1);
 }
 
 /*
  * Compute the maximum number of I/O queue pairs: the minimum of the CPU
  * count, ENA_MAX_NUM_IO_QUEUES, the device's Rx and Tx SQ/CQ limits, the
  * MSI-X message count minus one and, with RSS compiled in, the number of
  * RSS buckets.
  */
 static uint32_t
 ena_calc_max_io_queue_num(device_t pdev, struct ena_com_dev *ena_dev,
     struct ena_com_dev_get_features_ctx *get_feat_ctx)
 {
 	uint32_t io_tx_sq_num, io_tx_cq_num, io_rx_num, max_num_io_queues;
 
 	/* Regular queues capabilities */
 	if (ena_dev->supported_features & BIT(ENA_ADMIN_MAX_QUEUES_EXT)) {
 		/* Extended descriptor: separate Rx and Tx limits. */
 		struct ena_admin_queue_ext_feature_fields *max_queue_ext =
 		    &get_feat_ctx->max_queue_ext.max_queue_ext;
 		io_rx_num = min_t(int, max_queue_ext->max_rx_sq_num,
 		    max_queue_ext->max_rx_cq_num);
 
 		io_tx_sq_num = max_queue_ext->max_tx_sq_num;
 		io_tx_cq_num = max_queue_ext->max_tx_cq_num;
 	} else {
 		/* Legacy descriptor: one SQ/CQ limit used for Rx and Tx. */
 		struct ena_admin_queue_feature_desc *max_queues =
 		    &get_feat_ctx->max_queues;
 		io_tx_sq_num = max_queues->max_sq_num;
 		io_tx_cq_num = max_queues->max_cq_num;
 		io_rx_num = min_t(int, io_tx_sq_num, io_tx_cq_num);
 	}
 
 	/* In case of LLQ use the llq fields for the tx SQ/CQ */
 	if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV)
 		io_tx_sq_num = get_feat_ctx->llq.max_llq_num;
 
 	max_num_io_queues = min_t(uint32_t, mp_ncpus, ENA_MAX_NUM_IO_QUEUES);
 	max_num_io_queues = min_t(uint32_t, max_num_io_queues, io_rx_num);
 	max_num_io_queues = min_t(uint32_t, max_num_io_queues, io_tx_sq_num);
 	max_num_io_queues = min_t(uint32_t, max_num_io_queues, io_tx_cq_num);
 	/* 1 IRQ for mgmnt and 1 IRQ for each TX/RX pair */
 	/*
 	 * NOTE(review): min_t presumably converts "count - 1" to uint32_t;
 	 * if pci_msix_count() ever returned 0 this would wrap and no longer
 	 * limit the result -- confirm the count is always >= 1 here.
 	 */
 	max_num_io_queues = min_t(uint32_t, max_num_io_queues,
 	    pci_msix_count(pdev) - 1);
 #ifdef RSS
 	max_num_io_queues = min_t(uint32_t, max_num_io_queues,
 	    rss_getnumbuckets());
 #endif
 
 	return (max_num_io_queues);
 }
 
 /*
  * Switch the mapping of the given memory resource to write-combining.
  *
  * Returns 0 on success or the error from pmap_change_attr().  On
  * architectures other than i386, amd64 and aarch64 nothing is attempted and
  * EOPNOTSUPP is returned.
  */
 static int
 ena_enable_wc(device_t pdev, struct resource *res)
 {
 #if defined(__i386) || defined(__amd64) || defined(__aarch64__)
 	vm_offset_t base = (vm_offset_t)rman_get_virtual(res);
 	vm_size_t size = rman_get_size(res);
 	int error;
 
 	/* Enable write combining */
 	error = pmap_change_attr(base, size, VM_MEMATTR_WRITE_COMBINING);
 	if (unlikely(error != 0))
 		ena_log(pdev, ERR, "pmap_change_attr failed, %d\n", error);
 
 	return (error);
 #else
 	return (EOPNOTSUPP);
 #endif
 }
 
 /*
  * Choose the Tx queue placement policy.  The device (LLQ) mode is configured
  * through ena_com_config_dev_mode() only when the LLQ feature is supported
  * and a memory BAR is mapped; in every other case, including a failed
  * configuration, the host placement policy is selected.
  *
  * Always returns 0.
  */
 static int
 ena_set_queues_placement_policy(device_t pdev, struct ena_com_dev *ena_dev,
     struct ena_admin_feature_llq_desc *llq,
     struct ena_llq_configurations *llq_default_configurations)
 {
 	const uint32_t llq_bit = 1 << ENA_ADMIN_LLQ;
 
 	if ((ena_dev->supported_features & llq_bit) == 0) {
 		ena_log(pdev, WARN,
 		    "LLQ is not supported. Fallback to host mode policy.\n");
 		ena_dev->tx_mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST;
 		return (0);
 	}
 
 	if (ena_dev->mem_bar == NULL) {
 		ena_log(pdev, WARN,
 		    "LLQ is advertised as supported but device doesn't expose mem bar.\n");
 		ena_dev->tx_mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST;
 		return (0);
 	}
 
 	/* A configuration failure is not fatal: fall back to host mode. */
 	if (unlikely(ena_com_config_dev_mode(ena_dev, llq,
 	    llq_default_configurations) != 0)) {
 		ena_log(pdev, WARN,
 		    "Failed to configure the device mode. "
 		    "Fallback to host mode policy.\n");
 		ena_dev->tx_mem_queue_type = ENA_ADMIN_PLACEMENT_POLICY_HOST;
 	}
 
 	return (0);
 }
 
 /*
  * Allocate the device memory BAR used for LLQ, enable write combining on it
  * and hand its virtual address to the ena_com layer via ena_dev->mem_bar.
  *
  * Returns 0 when the BAR was set up and also when it could not be allocated
  * (ena_dev->mem_bar is then left untouched), or the error from
  * ena_enable_wc().
  */
 static int
 ena_map_llq_mem_bar(device_t pdev, struct ena_com_dev *ena_dev)
 {
 	struct ena_adapter *adapter = device_get_softc(pdev);
 	int rc, rid;
 
 	/* Try to allocate resources for LLQ bar */
 	rid = PCIR_BAR(ENA_MEM_BAR);
 	adapter->memory = bus_alloc_resource_any(pdev, SYS_RES_MEMORY, &rid,
 	    RF_ACTIVE);
 	if (unlikely(adapter->memory == NULL)) {
 		/* Not an error: the driver simply runs without LLQ. */
 		ena_log(pdev, WARN,
 		    "Unable to allocate LLQ bar resource. LLQ mode won't be used.\n");
 		return (0);
 	}
 
 	/* Enable write combining for better LLQ performance */
 	rc = ena_enable_wc(adapter->pdev, adapter->memory);
 	if (unlikely(rc != 0)) {
 		/*
 		 * NOTE(review): adapter->memory stays allocated on this path;
 		 * presumably the caller's cleanup releases it -- confirm.
 		 */
 		ena_log(pdev, ERR, "failed to enable write combining.\n");
 		return (rc);
 	}
 
 	/*
 	 * Save virtual address of the device's memory region
 	 * for the ena_com layer.
 	 */
 	ena_dev->mem_bar = rman_get_virtual(adapter->memory);
 
 	return (0);
 }
 
 /*
  * Fill 'llq_config' with the driver's default LLQ settings.  A 256-byte
  * ring entry is selected only when the device advertises support for it and
  * ena_force_large_llq_header is set; otherwise a 128-byte entry is used.
  */
 static inline void
 set_default_llq_configurations(struct ena_llq_configurations *llq_config,
     struct ena_admin_feature_llq_desc *llq)
 {
 	bool use_256b;
 
 	use_256b = (llq->entry_size_ctrl_supported &
 	    ENA_ADMIN_LIST_ENTRY_SIZE_256B) != 0 && ena_force_large_llq_header;
 
 	llq_config->llq_header_location = ENA_ADMIN_INLINE_HEADER;
 	llq_config->llq_stride_ctrl = ENA_ADMIN_MULTIPLE_DESCS_PER_ENTRY;
 	llq_config->llq_num_decs_before_header =
 	    ENA_ADMIN_LLQ_NUM_DESCS_BEFORE_HEADER_2;
 
 	/* The enum selector and the byte count are kept in step. */
 	llq_config->llq_ring_entry_size = use_256b ?
 	    ENA_ADMIN_LIST_ENTRY_SIZE_256B : ENA_ADMIN_LIST_ENTRY_SIZE_128B;
 	llq_config->llq_ring_entry_size_value = use_256b ? 256 : 128;
 }
 
 /*
  * Derive the default and maximum Tx/Rx ring sizes and the maximum
  * scatter-gather list sizes from the device feature descriptors and store
  * them in 'ctx'.  Sizes are rounded down to powers of two and the defaults
  * (starting from ENA_DEFAULT_RING_SIZE) are clamped to
  * [ENA_MIN_RING_SIZE, max].  Always returns 0.
  */
 static int
 ena_calc_io_queue_size(struct ena_calc_queue_size_ctx *ctx)
 {
 	struct ena_admin_feature_llq_desc *llq = &ctx->get_feat_ctx->llq;
 	struct ena_com_dev *ena_dev = ctx->ena_dev;
 	uint32_t tx_queue_size = ENA_DEFAULT_RING_SIZE;
 	uint32_t rx_queue_size = ENA_DEFAULT_RING_SIZE;
 	uint32_t max_tx_queue_size;
 	uint32_t max_rx_queue_size;
 
 	if (ena_dev->supported_features & BIT(ENA_ADMIN_MAX_QUEUES_EXT)) {
 		/* Extended descriptor: separate Rx and Tx depth limits. */
 		struct ena_admin_queue_ext_feature_fields *max_queue_ext =
 		    &ctx->get_feat_ctx->max_queue_ext.max_queue_ext;
 		max_rx_queue_size = min_t(uint32_t,
 		    max_queue_ext->max_rx_cq_depth,
 		    max_queue_ext->max_rx_sq_depth);
 		max_tx_queue_size = max_queue_ext->max_tx_cq_depth;
 
 		/* In LLQ mode the Tx SQ depth is bounded by the LLQ depth. */
 		if (ena_dev->tx_mem_queue_type ==
 		    ENA_ADMIN_PLACEMENT_POLICY_DEV)
 			max_tx_queue_size = min_t(uint32_t, max_tx_queue_size,
 			    llq->max_llq_depth);
 		else
 			max_tx_queue_size = min_t(uint32_t, max_tx_queue_size,
 			    max_queue_ext->max_tx_sq_depth);
 
 		ctx->max_tx_sgl_size = min_t(uint16_t, ENA_PKT_MAX_BUFS,
 		    max_queue_ext->max_per_packet_tx_descs);
 		ctx->max_rx_sgl_size = min_t(uint16_t, ENA_PKT_MAX_BUFS,
 		    max_queue_ext->max_per_packet_rx_descs);
 	} else {
 		/* Legacy descriptor: one SQ/CQ depth used for Rx and Tx. */
 		struct ena_admin_queue_feature_desc *max_queues =
 		    &ctx->get_feat_ctx->max_queues;
 		max_rx_queue_size = min_t(uint32_t, max_queues->max_cq_depth,
 		    max_queues->max_sq_depth);
 		max_tx_queue_size = max_queues->max_cq_depth;
 
 		if (ena_dev->tx_mem_queue_type ==
 		    ENA_ADMIN_PLACEMENT_POLICY_DEV)
 			max_tx_queue_size = min_t(uint32_t, max_tx_queue_size,
 			    llq->max_llq_depth);
 		else
 			max_tx_queue_size = min_t(uint32_t, max_tx_queue_size,
 			    max_queues->max_sq_depth);
 
 		ctx->max_tx_sgl_size = min_t(uint16_t, ENA_PKT_MAX_BUFS,
 		    max_queues->max_packet_tx_descs);
 		ctx->max_rx_sgl_size = min_t(uint16_t, ENA_PKT_MAX_BUFS,
 		    max_queues->max_packet_rx_descs);
 	}
 
 	/* round down to the nearest power of 2 */
 	/*
 	 * NOTE(review): a reported depth of 0 would presumably make the
 	 * shift count negative -- confirm the device never reports 0.
 	 */
 	max_tx_queue_size = 1 << (flsl(max_tx_queue_size) - 1);
 	max_rx_queue_size = 1 << (flsl(max_rx_queue_size) - 1);
 
 	/*
 	 * When forcing large headers, we multiply the entry size by 2,
 	 * and therefore divide the queue size by 2, leaving the amount
 	 * of memory used by the queues unchanged.
 	 */
 	if (ena_force_large_llq_header) {
 		if ((llq->entry_size_ctrl_supported &
 		    ENA_ADMIN_LIST_ENTRY_SIZE_256B) != 0 &&
 		    ena_dev->tx_mem_queue_type ==
 		    ENA_ADMIN_PLACEMENT_POLICY_DEV) {
 			max_tx_queue_size /= 2;
 			ena_log(ctx->pdev, INFO,
 			    "Forcing large headers and decreasing maximum Tx queue size to %d\n",
 			    max_tx_queue_size);
 		} else {
 			ena_log(ctx->pdev, WARN,
 			    "Forcing large headers failed: LLQ is disabled or device does not support large headers\n");
 		}
 	}
 
 	tx_queue_size = clamp_val(tx_queue_size, ENA_MIN_RING_SIZE,
 	    max_tx_queue_size);
 	rx_queue_size = clamp_val(rx_queue_size, ENA_MIN_RING_SIZE,
 	    max_rx_queue_size);
 
 	/* The clamped defaults are rounded down to a power of 2 as well. */
 	tx_queue_size = 1 << (flsl(tx_queue_size) - 1);
 	rx_queue_size = 1 << (flsl(rx_queue_size) - 1);
 
 	ctx->max_tx_queue_size = max_tx_queue_size;
 	ctx->max_rx_queue_size = max_rx_queue_size;
 	ctx->tx_queue_size = tx_queue_size;
 	ctx->rx_queue_size = rx_queue_size;
 
 	return (0);
 }
 
 /*
  * Allocate the host info structure, fill it with the OS and driver
  * identification and hand it to the device.  If the device rejects the host
  * attributes the host info is deleted again.  Failures are logged; nothing
  * is reported to the caller.
  */
 static void
 ena_config_host_info(struct ena_com_dev *ena_dev, device_t dev)
 {
 	struct ena_admin_host_info *host_info;
 	uintptr_t rid;
 	int rc;
 
 	/* Allocate only the host info */
 	rc = ena_com_allocate_host_info(ena_dev);
 	if (unlikely(rc != 0)) {
 		ena_log(dev, ERR, "Cannot allocate host info\n");
 		return;
 	}
 
 	host_info = ena_dev->host_attr.host_info;
 
 	/* bdf is only filled in when the PCI RID lookup succeeds. */
 	if (pci_get_id(dev, PCI_ID_RID, &rid) == 0)
 		host_info->bdf = rid;
 	host_info->os_type = ENA_ADMIN_OS_FREEBSD;
 	host_info->kernel_ver = osreldate;
 
 	/* Bounded formatting: never write past the destination field. */
 	snprintf(host_info->kernel_ver_str, sizeof(host_info->kernel_ver_str),
 	    "%d", osreldate);
 	host_info->os_dist = 0;
 	strncpy(host_info->os_dist_str, osrelease,
 	    sizeof(host_info->os_dist_str) - 1);
 	/*
 	 * strncpy() does not terminate a truncated copy; terminate
 	 * explicitly instead of relying on the buffer being pre-zeroed.
 	 */
 	host_info->os_dist_str[sizeof(host_info->os_dist_str) - 1] = '\0';
 
 	host_info->driver_version = (ENA_DRV_MODULE_VER_MAJOR) |
 	    (ENA_DRV_MODULE_VER_MINOR << ENA_ADMIN_HOST_INFO_MINOR_SHIFT) |
 	    (ENA_DRV_MODULE_VER_SUBMINOR << ENA_ADMIN_HOST_INFO_SUB_MINOR_SHIFT);
 	host_info->num_cpus = mp_ncpus;
 	host_info->driver_supported_features =
 	    ENA_ADMIN_HOST_INFO_RX_OFFSET_MASK |
 	    ENA_ADMIN_HOST_INFO_RSS_CONFIGURABLE_FUNCTION_KEY_MASK;
 
 	rc = ena_com_set_host_attributes(ena_dev);
 	if (unlikely(rc != 0)) {
 		/* Lack of support is only a warning; anything else an error. */
 		if (rc == EOPNOTSUPP)
 			ena_log(dev, WARN, "Cannot set host attributes\n");
 		else
 			ena_log(dev, ERR, "Cannot set host attributes\n");
 
 		goto err;
 	}
 
 	return;
 
 err:
 	ena_com_delete_host_info(ena_dev);
 }
 
 /*
  * Device-level initialization: set up MMIO register reads, reset the
  * device, validate its version, read the DMA width, initialize the admin
  * queue (switched to polling mode), publish the host info, fetch the device
  * features, configure the AENQ groups and choose the Tx placement policy.
  *
  * On success returns 0, with *get_feat_ctx filled in and *wd_active set to
  * whether the keep-alive AENQ group is enabled.  On failure returns the
  * error after tearing down what had been initialized.
  */
 static int
 ena_device_init(struct ena_adapter *adapter, device_t pdev,
     struct ena_com_dev_get_features_ctx *get_feat_ctx, int *wd_active)
 {
 	struct ena_llq_configurations llq_config;
 	struct ena_com_dev *ena_dev = adapter->ena_dev;
 	bool readless_supported;
 	uint32_t aenq_groups;
 	int dma_width;
 	int rc;
 
 	rc = ena_com_mmio_reg_read_request_init(ena_dev);
 	if (unlikely(rc != 0)) {
 		ena_log(pdev, ERR, "failed to init mmio read less\n");
 		return (rc);
 	}
 
 	/*
 	 * The PCIe configuration space revision id indicate if mmio reg
 	 * read is disabled
 	 */
 	readless_supported = !(pci_get_revid(pdev) & ENA_MMIO_DISABLE_REG_READ);
 	ena_com_set_mmio_read_mode(ena_dev, readless_supported);
 
 	rc = ena_com_dev_reset(ena_dev, ENA_REGS_RESET_NORMAL);
 	if (unlikely(rc != 0)) {
 		ena_log(pdev, ERR, "Can not reset device\n");
 		goto err_mmio_read_less;
 	}
 
 	rc = ena_com_validate_version(ena_dev);
 	if (unlikely(rc != 0)) {
 		ena_log(pdev, ERR, "device version is too low\n");
 		goto err_mmio_read_less;
 	}
 
 	dma_width = ena_com_get_dma_width(ena_dev);
 	if (unlikely(dma_width < 0)) {
 		/* A negative value is an error code, returned as is. */
 		ena_log(pdev, ERR, "Invalid dma width value %d\n", dma_width);
 		rc = dma_width;
 		goto err_mmio_read_less;
 	}
 	adapter->dma_width = dma_width;
 
 	/* ENA admin level init */
 	rc = ena_com_admin_init(ena_dev, &aenq_handlers);
 	if (unlikely(rc != 0)) {
 		ena_log(pdev, ERR,
 		    "Can not initialize ena admin queue with device\n");
 		goto err_mmio_read_less;
 	}
 
 	/*
 	 * To enable the msix interrupts the driver needs to know the number
 	 * of queues. So the driver uses polling mode to retrieve this
 	 * information
 	 */
 	ena_com_set_admin_polling_mode(ena_dev, true);
 
 	ena_config_host_info(ena_dev, pdev);
 
 	/* Get Device Attributes */
 	rc = ena_com_get_dev_attr_feat(ena_dev, get_feat_ctx);
 	if (unlikely(rc != 0)) {
 		ena_log(pdev, ERR,
 		    "Cannot get attribute for ena device rc: %d\n", rc);
 		goto err_admin_init;
 	}
 
 	/* Groups the driver wants, limited below to what is supported. */
 	aenq_groups = BIT(ENA_ADMIN_LINK_CHANGE) |
 	    BIT(ENA_ADMIN_FATAL_ERROR) |
 	    BIT(ENA_ADMIN_WARNING) |
 	    BIT(ENA_ADMIN_NOTIFICATION) |
 	    BIT(ENA_ADMIN_KEEP_ALIVE);
 
 	aenq_groups &= get_feat_ctx->aenq.supported_groups;
 	rc = ena_com_set_aenq_config(ena_dev, aenq_groups);
 	if (unlikely(rc != 0)) {
 		ena_log(pdev, ERR, "Cannot configure aenq groups rc: %d\n", rc);
 		goto err_admin_init;
 	}
 
 	*wd_active = !!(aenq_groups & BIT(ENA_ADMIN_KEEP_ALIVE));
 
 	set_default_llq_configurations(&llq_config, &get_feat_ctx->llq);
 
 	rc = ena_set_queues_placement_policy(pdev, ena_dev, &get_feat_ctx->llq,
 	    &llq_config);
 	if (unlikely(rc != 0)) {
 		ena_log(pdev, ERR, "Failed to set placement policy\n");
 		goto err_admin_init;
 	}
 
 	return (0);
 
 	/* Unwind in the reverse order of the initialization above. */
 err_admin_init:
 	ena_com_delete_host_info(ena_dev);
 	ena_com_admin_destroy(ena_dev);
 err_mmio_read_less:
 	ena_com_mmio_reg_read_request_destroy(ena_dev);
 
 	return (rc);
 }
 
 static int
 ena_enable_msix_and_set_admin_interrupts(struct ena_adapter *adapter)
 {
 	struct ena_com_dev *ena_dev = adapter->ena_dev;
 	int rc;
 
 	rc = ena_enable_msix(adapter);
 	if (unlikely(rc != 0)) {
 		ena_log(adapter->pdev, ERR, "Error with MSI-X enablement\n");
 		return (rc);
 	}
 
 	ena_setup_mgmnt_intr(adapter);
 
 	rc = ena_request_mgmnt_irq(adapter);
 	if (unlikely(rc != 0)) {
 		ena_log(adapter->pdev, ERR, "Cannot setup mgmnt queue intr\n");
 		goto err_disable_msix;
 	}
 
 	ena_com_set_admin_polling_mode(ena_dev, false);
 
 	ena_com_admin_aenq_enable(ena_dev);
 
 	return (0);
 
 err_disable_msix:
 	ena_disable_msix(adapter);
 
 	return (rc);
 }
 
 /* Function called on ENA_ADMIN_KEEP_ALIVE event */
 static void
 ena_keep_alive_wd(void *adapter_data, struct ena_admin_aenq_entry *aenq_e)
 {
 	struct ena_adapter *adapter = (struct ena_adapter *)adapter_data;
 	struct ena_admin_aenq_keep_alive_desc *desc;
 	sbintime_t stime;
 	uint64_t rx_drops;
 	uint64_t tx_drops;
 
 	desc = (struct ena_admin_aenq_keep_alive_desc *)aenq_e;
 
 	rx_drops = ((uint64_t)desc->rx_drops_high << 32) | desc->rx_drops_low;
 	tx_drops = ((uint64_t)desc->tx_drops_high << 32) | desc->tx_drops_low;
 	counter_u64_zero(adapter->hw_stats.rx_drops);
 	counter_u64_add(adapter->hw_stats.rx_drops, rx_drops);
 	counter_u64_zero(adapter->hw_stats.tx_drops);
 	counter_u64_add(adapter->hw_stats.tx_drops, tx_drops);
 
 	stime = getsbinuptime();
 	atomic_store_rel_64(&adapter->keep_alive_timestamp, stime);
 }
 
 /*
  * Trigger a reset when no keep alive event has been recorded for longer
  * than adapter->keep_alive_timeout. Does nothing when the watchdog is
  * inactive or the timeout is ENA_HW_HINTS_NO_TIMEOUT.
  */
 static void
 check_for_missing_keep_alive(struct ena_adapter *adapter)
 {
 	sbintime_t last_seen, elapsed;
 
 	if (adapter->wd_active == 0 ||
 	    adapter->keep_alive_timeout == ENA_HW_HINTS_NO_TIMEOUT)
 		return;
 
 	last_seen = atomic_load_acq_64(&adapter->keep_alive_timestamp);
 	elapsed = getsbinuptime() - last_seen;
 	if (unlikely(elapsed > adapter->keep_alive_timeout)) {
 		ena_log(adapter->pdev, ERR, "Keep alive watchdog timeout.\n");
 		counter_u64_add(adapter->dev_stats.wd_expired, 1);
 		ena_trigger_reset(adapter, ENA_REGS_RESET_KEEP_ALIVE_TO);
 	}
 }
 
 /*
  * Trigger a reset (and count the event) when the admin queue reports that
  * it is not in running state.
  */
 static void
 check_for_admin_com_state(struct ena_adapter *adapter)
 {
 	struct ena_com_dev *com_dev = adapter->ena_dev;
 
 	if (unlikely(!ena_com_get_admin_running_state(com_dev))) {
 		ena_log(adapter->pdev, ERR,
 		    "ENA admin queue is not in running state!\n");
 		counter_u64_add(adapter->dev_stats.admin_q_pause, 1);
 		ena_trigger_reset(adapter, ENA_REGS_RESET_ADMIN_TO);
 	}
 }
 
 /*
  * Detect an Rx queue that has completions pending but never received its
  * first interrupt. Each such observation is counted; when the count
  * reaches ENA_MAX_NO_INTERRUPT_ITERATIONS a reset is triggered.
  *
  * Returns EIO when a reset was triggered, 0 otherwise.
  */
 static int
 check_for_rx_interrupt_queue(struct ena_adapter *adapter,
     struct ena_ring *rx_ring)
 {
 	/* Nothing to worry about: interrupts work or nothing is pending. */
 	if (likely(atomic_load_8(&rx_ring->first_interrupt)) ||
 	    ena_com_cq_empty(rx_ring->ena_com_io_cq))
 		return (0);
 
 	if (++rx_ring->no_interrupt_event_cnt !=
 	    ENA_MAX_NO_INTERRUPT_ITERATIONS)
 		return (0);
 
 	ena_log(adapter->pdev, ERR,
 	    "Potential MSIX issue on Rx side Queue = %d. Reset the device\n",
 	    rx_ring->qid);
 	ena_trigger_reset(adapter, ENA_REGS_RESET_MISS_INTERRUPT);
 
 	return (EIO);
 }
 
 /*
  * Scan every Tx buffer slot of tx_ring for packets that carry a timestamp
  * and have been outstanding for too long.
  *
  * Returns 0, or EIO after a device reset has been triggered. A reset is
  * triggered when either
  *  - the ring has not seen its first interrupt and some buffer is older
  *    than twice adapter->missing_tx_timeout, or
  *  - the number of buffers older than adapter->missing_tx_timeout exceeds
  *    adapter->missing_tx_threshold.
  *
  * The ring's missing_tx_comp counter is advanced by the number of late
  * buffers found in this pass; the first reset case returns before that
  * update.
  */
 static int
 check_missing_comp_in_tx_queue(struct ena_adapter *adapter,
     struct ena_ring *tx_ring)
 {
 	device_t pdev = adapter->pdev;
 	struct bintime curtime, time;
 	struct ena_tx_buffer *tx_buf;
 	int time_since_last_cleanup;
 	int missing_tx_comp_to;
 	sbintime_t time_offset;
 	uint32_t missed_tx = 0;
 	int i, rc = 0;
 
 	/* One time sample is used for all slots of this pass. */
 	getbinuptime(&curtime);
 
 	for (i = 0; i < tx_ring->ring_size; i++) {
 		tx_buf = &tx_ring->tx_buffer_info[i];
 
 		/* Slots without a timestamp are skipped. */
 		if (bintime_isset(&tx_buf->timestamp) == 0)
 			continue;
 
 		/* time_offset = how long this buffer has been outstanding. */
 		time = curtime;
 		bintime_sub(&time, &tx_buf->timestamp);
 		time_offset = bttosbt(time);
 
 		if (unlikely(!atomic_load_8(&tx_ring->first_interrupt) &&
 		    time_offset > 2 * adapter->missing_tx_timeout)) {
 			/*
 			 * If after graceful period interrupt is still not
 			 * received, we schedule a reset.
 			 */
 			ena_log(pdev, ERR,
 			    "Potential MSIX issue on Tx side Queue = %d. "
 			    "Reset the device\n",
 			    tx_ring->qid);
 			ena_trigger_reset(adapter,
 			    ENA_REGS_RESET_MISS_INTERRUPT);
 			return (EIO);
 		}
 
 		/* Check again if packet is still waiting */
 		if (unlikely(time_offset > adapter->missing_tx_timeout)) {
 
 			/* Warn only once per buffer, but count it every pass. */
 			if (tx_buf->print_once) {
 				time_since_last_cleanup = TICKS_2_USEC(ticks -
 				    tx_ring->tx_last_cleanup_ticks);
 				missing_tx_comp_to = sbttoms(
 				    adapter->missing_tx_timeout);
 				ena_log(pdev, WARN,
 				    "Found a Tx that wasn't completed on time, qid %d, index %d. "
 				    "%d usecs have passed since last cleanup. Missing Tx timeout value %d msecs.\n",
 				    tx_ring->qid, i, time_since_last_cleanup,
 				    missing_tx_comp_to);
 			}
 
 			tx_buf->print_once = false;
 			missed_tx++;
 		}
 	}
 
 	/* Too many late completions in one pass: ask for a device reset. */
 	if (unlikely(missed_tx > adapter->missing_tx_threshold)) {
 		ena_log(pdev, ERR,
 		    "The number of lost tx completion is above the threshold "
 		    "(%d > %d). Reset the device\n",
 		    missed_tx, adapter->missing_tx_threshold);
 		ena_trigger_reset(adapter, ENA_REGS_RESET_MISS_TX_CMPL);
 		rc = EIO;
 	}
 
 	counter_u64_add(tx_ring->tx_stats.missing_tx_comp, missed_tx);
 
 	return (rc);
 }
 
 /*
  * Look for Tx packets that were not completed within "missing_tx_timeout"
  * and for Rx queues that never got an interrupt. The per-queue helpers
  * trigger a reset when their limits are exceeded, e.g. when the number of
  * late Tx packets is above "missing_tx_threshold".
  *
  * At most "missing_tx_max_queues" queues are inspected per call; the scan
  * resumes at "next_monitored_tx_qid" on the following call.
  */
 static void
 check_for_missing_completions(struct ena_adapter *adapter)
 {
 	int qid, queues_left;
 
 	/* Make sure the driver doesn't turn the device in other process */
 	rmb();
 
 	if (!ENA_FLAG_ISSET(ENA_FLAG_DEV_UP, adapter))
 		return;
 
 	if (ENA_FLAG_ISSET(ENA_FLAG_TRIGGER_RESET, adapter))
 		return;
 
 	if (adapter->missing_tx_timeout == ENA_HW_HINTS_NO_TIMEOUT)
 		return;
 
 	queues_left = adapter->missing_tx_max_queues;
 	qid = adapter->next_monitored_tx_qid;
 
 	while (qid < adapter->num_io_queues) {
 		/* A non-zero result means a reset is already on its way. */
 		if (unlikely(check_missing_comp_in_tx_queue(adapter,
 		    &adapter->tx_ring[qid]) != 0))
 			return;
 
 		if (unlikely(check_for_rx_interrupt_queue(adapter,
 		    &adapter->rx_ring[qid]) != 0))
 			return;
 
 		qid++;
 
 		queues_left--;
 		if (queues_left == 0)
 			break;
 	}
 
 	/* Wrap around once the last queue has been inspected. */
 	adapter->next_monitored_tx_qid = qid % adapter->num_io_queues;
 }
 
 /* Number of consecutive detections after which Rx cleanup is triggered */
 #define EMPTY_RX_REFILL 2
 /*
  * Covers the rare case where the device ran out of Rx descriptors and the
  * msix handler failed to refill them (for example due to a lack of memory).
  * That situation is a deadlock:
  * the device sends no interrupts because every new Rx packet is dropped,
  * and without interrupts the msix handler never allocates new descriptors.
  *
  * When such a ring is seen EMPTY_RX_REFILL times in a row, its cleanup task
  * is enqueued so that the refill runs in another thread.
  */
 static void
 check_for_empty_rx_ring(struct ena_adapter *adapter)
 {
 	struct ena_ring *rx_ring;
 	int qid, free_entries;
 
 	if (!ENA_FLAG_ISSET(ENA_FLAG_DEV_UP, adapter))
 		return;
 
 	if (ENA_FLAG_ISSET(ENA_FLAG_TRIGGER_RESET, adapter))
 		return;
 
 	for (qid = 0; qid < adapter->num_io_queues; qid++) {
 		rx_ring = &adapter->rx_ring[qid];
 		free_entries = ena_com_free_q_entries(rx_ring->ena_com_io_sq);
 
 		/* Ring is not completely empty: restart the detection. */
 		if (free_entries != (rx_ring->ring_size - 1)) {
 			rx_ring->empty_rx_queue = 0;
 			continue;
 		}
 
 		rx_ring->empty_rx_queue++;
 		if (rx_ring->empty_rx_queue < EMPTY_RX_REFILL)
 			continue;
 
 		counter_u64_add(rx_ring->rx_stats.empty_rx_ring, 1);
 
 		ena_log(adapter->pdev, WARN,
 		    "Rx ring %d is stalled. Triggering the refill function\n",
 		    qid);
 
 		taskqueue_enqueue(rx_ring->que->cleanup_tq,
 		    &rx_ring->que->cleanup_task);
 		rx_ring->empty_rx_queue = 0;
 	}
 }
 
 /*
  * Apply the hints sent by the device. A hint whose value is zero leaves the
  * current setting untouched. For the two sbintime based timeouts,
  * ENA_HW_HINTS_NO_TIMEOUT is stored unchanged and any other value is
  * scaled by SBT_1MS.
  */
 static void
 ena_update_hints(struct ena_adapter *adapter,
     struct ena_admin_ena_hw_hints *hints)
 {
 	struct ena_com_dev *com_dev = adapter->ena_dev;
 
 	if (hints->admin_completion_tx_timeout != 0) {
 		com_dev->admin_queue.completion_timeout =
 		    hints->admin_completion_tx_timeout * 1000;
 	}
 
 	if (hints->mmio_read_timeout != 0) {
 		/* convert to usec */
 		com_dev->mmio_read.reg_read_to =
 		    hints->mmio_read_timeout * 1000;
 	}
 
 	if (hints->missed_tx_completion_count_threshold_to_reset != 0) {
 		adapter->missing_tx_threshold =
 		    hints->missed_tx_completion_count_threshold_to_reset;
 	}
 
 	if (hints->missing_tx_completion_timeout != 0) {
 		adapter->missing_tx_timeout =
 		    (hints->missing_tx_completion_timeout ==
 		    ENA_HW_HINTS_NO_TIMEOUT) ?
 		    ENA_HW_HINTS_NO_TIMEOUT :
 		    SBT_1MS * hints->missing_tx_completion_timeout;
 	}
 
 	if (hints->driver_watchdog_timeout != 0) {
 		adapter->keep_alive_timeout =
 		    (hints->driver_watchdog_timeout ==
 		    ENA_HW_HINTS_NO_TIMEOUT) ?
 		    ENA_HW_HINTS_NO_TIMEOUT :
 		    SBT_1MS * hints->driver_watchdog_timeout;
 	}
 }
 
 /**
  * ena_copy_eni_metrics - Get and copy ENI metrics from the HW.
  * @adapter: ENA device adapter
  *
  * Returns 0 on success, EOPNOTSUPP if current HW doesn't support those metrics
  * and other error codes on failure.
  *
  * This function can possibly cause a race with other calls to the admin queue.
  * Because of that, the caller should either lock this function or make sure
  * that there is no race in the current context.
  */
 static int
 ena_copy_eni_metrics(struct ena_adapter *adapter)
 {
 	static bool print_once = true;
 	int rc;
 
 	rc = ena_com_get_eni_stats(adapter->ena_dev, &adapter->eni_metrics);
 
 	if (rc != 0) {
 		if (rc == ENA_COM_UNSUPPORTED) {
 			if (print_once) {
 				ena_log(adapter->pdev, WARN,
 				    "Retrieving ENI metrics is not supported.\n");
 				print_once = false;
 			} else {
 				ena_log(adapter->pdev, DBG,
 				    "Retrieving ENI metrics is not supported.\n");
 			}
 		} else {
 			ena_log(adapter->pdev, ERR,
 			    "Failed to get ENI metrics: %d\n", rc);
 		}
 	}
 
 	return (rc);
 }
 
 /*
  * Timer callback: runs the health checks, schedules the ENI metrics task
  * when its sampling interval has elapsed, refreshes the host info and then
  * either hands over to the reset task or re-arms the timer.
  */
 static void
 ena_timer_service(void *data)
 {
 	struct ena_adapter *adapter = (struct ena_adapter *)data;
 	struct ena_admin_host_info *hinfo =
 	    adapter->ena_dev->host_attr.host_info;
 
 	check_for_missing_keep_alive(adapter);
 	check_for_admin_com_state(adapter);
 	check_for_missing_completions(adapter);
 	check_for_empty_rx_ring(adapter);
 
 	/*
 	 * User controlled update of the ENI metrics.
 	 * An interval of 0 means the stats are never updated. Otherwise the
 	 * stats are updated every 'eni_metrics_sample_interval' seconds.
 	 * As timer service is executed every second, counting the invocations
 	 * is enough to measure that interval.
 	 */
 	if (adapter->eni_metrics_sample_interval != 0) {
 		adapter->eni_metrics_sample_interval_cnt++;
 		if (adapter->eni_metrics_sample_interval_cnt >=
 		    adapter->eni_metrics_sample_interval) {
 			taskqueue_enqueue(adapter->metrics_tq,
 			    &adapter->metrics_task);
 			adapter->eni_metrics_sample_interval_cnt = 0;
 		}
 	}
 
 	if (hinfo != NULL)
 		ena_update_host_info(hinfo, adapter->ifp);
 
 	if (unlikely(ENA_FLAG_ISSET(ENA_FLAG_TRIGGER_RESET, adapter))) {
 		if (ena_com_validate_version(adapter->ena_dev) ==
 		    ENA_COM_TIMER_EXPIRED) {
 			/*
 			 * A timeout while validating the version means the
 			 * device became unresponsive. Skip the reset and
 			 * re-arm the timer so the reset is retried later.
 			 */
 			ena_log(adapter->pdev, WARN,
 			    "FW unresponsive, skipping reset\n");
 			ENA_TIMER_RESET(adapter);
 		} else {
 			/* The timer is not re-armed on this path. */
 			ena_log(adapter->pdev, WARN, "Trigger reset is on\n");
 			taskqueue_enqueue(adapter->reset_tq,
 			    &adapter->reset_task);
 		}
 		return;
 	}
 
 	/*
 	 * Schedule another timeout one second from now.
 	 */
 	ENA_TIMER_RESET(adapter);
 }
 
 /**
  * ena_destroy_device - tear the device down, e.g. ahead of a reset
  * @adapter: ENA device adapter
  * @graceful: when false, the admin queue is marked as not running before
  *            the interface is brought down
  *
  * Does nothing unless ENA_FLAG_DEVICE_RUNNING is set. Otherwise the link is
  * reported down, the timer is drained, the interface is brought down if it
  * was up, interrupts and IO ring resources are released and the admin queue
  * and mmio read request are destroyed. ENA_FLAG_DEV_UP_BEFORE_RESET records
  * whether the interface was up, so that ena_restore_device() can bring it
  * back up.
  */
 void
 ena_destroy_device(struct ena_adapter *adapter, bool graceful)
 {
 	if_t ifp = adapter->ifp;
 	struct ena_com_dev *ena_dev = adapter->ena_dev;
 	bool dev_up;
 
 	if (!ENA_FLAG_ISSET(ENA_FLAG_DEVICE_RUNNING, adapter))
 		return;
 
 	if_link_state_change(ifp, LINK_STATE_DOWN);
 
 	ENA_TIMER_DRAIN(adapter);
 
 	/* Sample the state before ena_down() below gets a chance to run. */
 	dev_up = ENA_FLAG_ISSET(ENA_FLAG_DEV_UP, adapter);
 	if (dev_up)
 		ENA_FLAG_SET_ATOMIC(ENA_FLAG_DEV_UP_BEFORE_RESET, adapter);
 
 	if (!graceful)
 		ena_com_set_admin_running_state(ena_dev, false);
 
 	if (ENA_FLAG_ISSET(ENA_FLAG_DEV_UP, adapter))
 		ena_down(adapter);
 
 	/*
 	 * Stop the device from sending AENQ events (if the device was up, and
 	 * the trigger reset was on, ena_down already performs device reset)
 	 */
 	if (!(ENA_FLAG_ISSET(ENA_FLAG_TRIGGER_RESET, adapter) && dev_up))
 		ena_com_dev_reset(adapter->ena_dev, adapter->reset_reason);
 
 	ena_free_mgmnt_irq(adapter);
 
 	ena_disable_msix(adapter);
 
 	/*
 	 * IO rings resources should be freed because `ena_restore_device()`
 	 * calls (not directly) `ena_enable_msix()`, which re-allocates MSIX
 	 * vectors. The amount of MSIX vectors after destroy-restore may be
 	 * different than before. Therefore, IO rings resources should be
 	 * established from scratch each time.
 	 */
 	ena_free_all_io_rings_resources(adapter);
 
 	ena_com_abort_admin_commands(ena_dev);
 
 	ena_com_wait_for_abort_completion(ena_dev);
 
 	ena_com_admin_destroy(ena_dev);
 
 	ena_com_mmio_reg_read_request_destroy(ena_dev);
 
 	/* Return to the default reason for whatever reset comes next. */
 	adapter->reset_reason = ENA_REGS_RESET_NORMAL;
 
 	ENA_FLAG_CLEAR_ATOMIC(ENA_FLAG_TRIGGER_RESET, adapter);
 	ENA_FLAG_CLEAR_ATOMIC(ENA_FLAG_DEVICE_RUNNING, adapter);
 }
 
 /*
  * Verify that the attributes reported by the device still fit the state
  * kept by the driver: the MAC address has to be unchanged and the device
  * max MTU must not be smaller than the MTU configured on the interface.
  *
  * Returns 0 when both checks pass, EINVAL otherwise.
  */
 static int
 ena_device_validate_params(struct ena_adapter *adapter,
     struct ena_com_dev_get_features_ctx *get_feat_ctx)
 {
 	if (memcmp(adapter->mac_addr, get_feat_ctx->dev_attr.mac_addr,
 	    ETHER_ADDR_LEN) != 0) {
 		ena_log(adapter->pdev, ERR, "Error, mac addresses differ\n");
 		return (EINVAL);
 	}
 
 	if (if_getmtu(adapter->ifp) > get_feat_ctx->dev_attr.max_mtu) {
 		ena_log(adapter->pdev, ERR,
 		    "Error, device max mtu is smaller than ifp MTU\n");
 		return (EINVAL);
 	}
 
 	return (0);
 }
 
 /**
  * ena_restore_device - bring the device back after ena_destroy_device()
  * @adapter: ENA device adapter
  *
  * Re-initializes the device, validates that its parameters still match the
  * driver state, re-enables MSI-X and the admin interrupts, re-initializes
  * the IO rings and brings the interface up again if
  * ENA_FLAG_DEV_UP_BEFORE_RESET is set. On success the device is marked as
  * running and the timer is re-armed.
  *
  * Returns 0 on success. On failure the error code of the failing step is
  * returned, ENA_FLAG_DEVICE_RUNNING and ENA_FLAG_ONGOING_RESET are cleared
  * and an error is logged.
  */
 int
 ena_restore_device(struct ena_adapter *adapter)
 {
 	struct ena_com_dev_get_features_ctx get_feat_ctx;
 	struct ena_com_dev *ena_dev = adapter->ena_dev;
 	if_t ifp = adapter->ifp;
 	device_t dev = adapter->pdev;
 	int wd_active;
 	int rc;
 
 	/*
 	 * While this flag is set the link change handler does not report
 	 * link up to the stack.
 	 */
 	ENA_FLAG_SET_ATOMIC(ENA_FLAG_ONGOING_RESET, adapter);
 
 	rc = ena_device_init(adapter, dev, &get_feat_ctx, &wd_active);
 	if (rc != 0) {
 		ena_log(dev, ERR, "Cannot initialize device\n");
 		goto err;
 	}
 	/*
 	 * Only enable WD if it was enabled before reset, so it won't override
 	 * value set by the user by the sysctl.
 	 */
 	if (adapter->wd_active != 0)
 		adapter->wd_active = wd_active;
 
 	rc = ena_device_validate_params(adapter, &get_feat_ctx);
 	if (rc != 0) {
 		ena_log(dev, ERR, "Validation of device parameters failed\n");
 		goto err_device_destroy;
 	}
 
 	ENA_FLAG_CLEAR_ATOMIC(ENA_FLAG_ONGOING_RESET, adapter);
 	/* Make sure we don't have a race with AENQ Links state handler */
 	if (ENA_FLAG_ISSET(ENA_FLAG_LINK_UP, adapter))
 		if_link_state_change(ifp, LINK_STATE_UP);
 
 	rc = ena_enable_msix_and_set_admin_interrupts(adapter);
 	if (rc != 0) {
 		ena_log(dev, ERR, "Enable MSI-X failed\n");
 		goto err_device_destroy;
 	}
 
 	/*
 	 * Effective value of used MSIX vectors should be the same as before
 	 * `ena_destroy_device()`, if possible, or closest to it if less vectors
 	 * are available.
 	 */
 	if ((adapter->msix_vecs - ENA_ADMIN_MSIX_VEC) < adapter->num_io_queues)
 		adapter->num_io_queues = adapter->msix_vecs - ENA_ADMIN_MSIX_VEC;
 
 	/* Re-initialize rings basic information */
 	ena_init_io_rings(adapter);
 
 	/* If the interface was up before the reset bring it up */
 	if (ENA_FLAG_ISSET(ENA_FLAG_DEV_UP_BEFORE_RESET, adapter)) {
 		rc = ena_up(adapter);
 		if (rc != 0) {
 			ena_log(dev, ERR, "Failed to create I/O queues\n");
 			goto err_disable_msix;
 		}
 	}
 
 	/* Indicate that device is running again and ready to work */
 	ENA_FLAG_SET_ATOMIC(ENA_FLAG_DEVICE_RUNNING, adapter);
 
 	/*
 	 * As the AENQ handlers weren't executed during reset because
 	 * the flag ENA_FLAG_DEVICE_RUNNING was turned off, the
 	 * timestamp must be updated again That will prevent next reset
 	 * caused by missing keep alive.
 	 */
 	adapter->keep_alive_timestamp = getsbinuptime();
 	ENA_TIMER_RESET(adapter);
 
 	ENA_FLAG_CLEAR_ATOMIC(ENA_FLAG_DEV_UP_BEFORE_RESET, adapter);
 
 	return (rc);
 
 	/* Error unwinding: each label releases what was set up before it. */
 err_disable_msix:
 	ena_free_mgmnt_irq(adapter);
 	ena_disable_msix(adapter);
 err_device_destroy:
 	ena_com_abort_admin_commands(ena_dev);
 	ena_com_wait_for_abort_completion(ena_dev);
 	ena_com_admin_destroy(ena_dev);
 	ena_com_dev_reset(ena_dev, ENA_REGS_RESET_DRIVER_INVALID_STATE);
 	ena_com_mmio_reg_read_request_destroy(ena_dev);
 err:
 	ENA_FLAG_CLEAR_ATOMIC(ENA_FLAG_DEVICE_RUNNING, adapter);
 	ENA_FLAG_CLEAR_ATOMIC(ENA_FLAG_ONGOING_RESET, adapter);
 	ena_log(dev, ERR, "Reset attempt failed. Can not reset the device\n");
 
 	return (rc);
 }
 
 /*
  * Taskqueue handler that refreshes the ENI metrics. The copy runs under the
  * ENA lock; its result is deliberately ignored because failures are logged
  * by ena_copy_eni_metrics() itself.
  */
 static void
 ena_metrics_task(void *arg, int pending)
 {
 	struct ena_adapter *sc = arg;
 
 	ENA_LOCK_LOCK();
 	(void)ena_copy_eni_metrics(sc);
 	ENA_LOCK_UNLOCK();
 }
 
 /*
  * Taskqueue handler that performs the device reset requested through
  * ENA_FLAG_TRIGGER_RESET: the device is destroyed and restored again while
  * holding the ENA lock.
  *
  * Success is only reported when ena_restore_device() actually succeeded;
  * on failure that function already logs the error.
  */
 static void
 ena_reset_task(void *arg, int pending)
 {
 	struct ena_adapter *adapter = (struct ena_adapter *)arg;
 	int rc;
 
 	ENA_LOCK_LOCK();
 	if (likely(ENA_FLAG_ISSET(ENA_FLAG_TRIGGER_RESET, adapter))) {
 		ena_destroy_device(adapter, false);
 		rc = ena_restore_device(adapter);
 
 		if (likely(rc == 0)) {
 			ena_log(adapter->pdev, INFO,
 			    "Device reset completed successfully, Driver info: %s\n",
 			    ena_version);
 		}
 	}
 	ENA_LOCK_UNLOCK();
 }
 
 /**
  * ena_attach - Device Initialization Routine
  * @pdev: device information struct
  *
  * Returns 0 on success, otherwise an errno value.
  *
  * ena_attach initializes an adapter identified by a device structure.
  * The OS initialization, configuring of the adapter private structure,
  * and a hardware reset occur. In order: the register (and, if separate,
  * MSI-X table) BARs are allocated, the device is initialized, queue sizes
  * are calculated, DMA tags are created, MSI-X and the admin interrupt are
  * enabled, the network interface is set up, the reset and metrics
  * taskqueues are started, statistics are allocated and the timer service
  * is started.
  *
  * NOTE(review): the err_detach/err_msix_free paths do not release the
  * reset/metrics taskqueues or the dev/hw counters created further up;
  * confirm whether that is intentional.
  **/
 static int
 ena_attach(device_t pdev)
 {
 	struct ena_com_dev_get_features_ctx get_feat_ctx;
 	struct ena_calc_queue_size_ctx calc_queue_ctx = { 0 };
 	static int version_printed;
 	struct ena_adapter *adapter;
 	struct ena_com_dev *ena_dev = NULL;
 	uint32_t max_num_io_queues;
 	int msix_rid;
 	int rid, rc;
 
 	adapter = device_get_softc(pdev);
 	adapter->pdev = pdev;
 	adapter->first_bind = -1;
 
 	/*
 	 * Set up the timer service - driver is responsible for avoiding
 	 * concurrency, as the callout won't be using any locking inside.
 	 */
 	ENA_TIMER_INIT(adapter);
 	adapter->keep_alive_timeout = ENA_DEFAULT_KEEP_ALIVE_TO;
 	adapter->missing_tx_timeout = ENA_DEFAULT_TX_CMP_TO;
 	adapter->missing_tx_max_queues = ENA_DEFAULT_TX_MONITORED_QUEUES;
 	adapter->missing_tx_threshold = ENA_DEFAULT_TX_CMP_THRESHOLD;
 
 	/* The driver version is printed by the first attach only. */
 	if (version_printed++ == 0)
 		ena_log(pdev, INFO, "%s\n", ena_version);
 
 	/* Allocate memory for ena_dev structure */
 	ena_dev = malloc(sizeof(struct ena_com_dev), M_DEVBUF,
 	    M_WAITOK | M_ZERO);
 
 	adapter->ena_dev = ena_dev;
 	ena_dev->dmadev = pdev;
 
 	rid = PCIR_BAR(ENA_REG_BAR);
 	adapter->memory = NULL;
 	adapter->registers = bus_alloc_resource_any(pdev, SYS_RES_MEMORY, &rid,
 	    RF_ACTIVE);
 	if (unlikely(adapter->registers == NULL)) {
 		ena_log(pdev, ERR,
 		    "unable to allocate bus resource: registers!\n");
 		rc = ENOMEM;
 		goto err_dev_free;
 	}
 
 	/* MSIx vector table may reside on BAR0 with registers or on BAR1. */
 	msix_rid = pci_msix_table_bar(pdev);
 	if (msix_rid != rid) {
 		adapter->msix = bus_alloc_resource_any(pdev, SYS_RES_MEMORY,
 		    &msix_rid, RF_ACTIVE);
 		if (unlikely(adapter->msix == NULL)) {
 			ena_log(pdev, ERR,
 			    "unable to allocate bus resource: msix!\n");
 			rc = ENOMEM;
 			goto err_pci_free;
 		}
 		adapter->msix_rid = msix_rid;
 	}
 
 	ena_dev->bus = malloc(sizeof(struct ena_bus), M_DEVBUF,
 	    M_WAITOK | M_ZERO);
 
 	/* Store register resources */
 	((struct ena_bus *)(ena_dev->bus))->reg_bar_t = rman_get_bustag(
 	    adapter->registers);
 	((struct ena_bus *)(ena_dev->bus))->reg_bar_h = rman_get_bushandle(
 	    adapter->registers);
 
 	if (unlikely(((struct ena_bus *)(ena_dev->bus))->reg_bar_h == 0)) {
 		ena_log(pdev, ERR, "failed to pmap registers bar\n");
 		rc = ENXIO;
 		goto err_bus_free;
 	}
 
 	rc = ena_map_llq_mem_bar(pdev, ena_dev);
 	if (unlikely(rc != 0)) {
 		ena_log(pdev, ERR, "Failed to map ENA mem bar\n");
 		goto err_bus_free;
 	}
 
 	/* Initially clear all the flags */
 	ENA_FLAG_ZERO(adapter);
 
 	/* Device initialization */
 	rc = ena_device_init(adapter, pdev, &get_feat_ctx, &adapter->wd_active);
 	if (unlikely(rc != 0)) {
 		ena_log(pdev, ERR, "ENA device init failed! (err: %d)\n", rc);
 		rc = ENXIO;
 		goto err_bus_free;
 	}
 
 	if (ena_dev->tx_mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_DEV)
 		adapter->disable_meta_caching = !!(
 		    get_feat_ctx.llq.accel_mode.u.get.supported_flags &
 		    BIT(ENA_ADMIN_DISABLE_META_CACHING));
 
 	adapter->keep_alive_timestamp = getsbinuptime();
 
 	adapter->tx_offload_cap = get_feat_ctx.offload.tx;
 
 	memcpy(adapter->mac_addr, get_feat_ctx.dev_attr.mac_addr,
 	    ETHER_ADDR_LEN);
 
 	calc_queue_ctx.pdev = pdev;
 	calc_queue_ctx.ena_dev = ena_dev;
 	calc_queue_ctx.get_feat_ctx = &get_feat_ctx;
 
 	/* Calculate initial and maximum IO queue number and size */
 	max_num_io_queues = ena_calc_max_io_queue_num(pdev, ena_dev,
 	    &get_feat_ctx);
 	rc = ena_calc_io_queue_size(&calc_queue_ctx);
 	if (unlikely((rc != 0) || (max_num_io_queues <= 0))) {
 		rc = EFAULT;
 		goto err_com_free;
 	}
 
 	adapter->requested_tx_ring_size = calc_queue_ctx.tx_queue_size;
 	adapter->requested_rx_ring_size = calc_queue_ctx.rx_queue_size;
 	adapter->max_tx_ring_size = calc_queue_ctx.max_tx_queue_size;
 	adapter->max_rx_ring_size = calc_queue_ctx.max_rx_queue_size;
 	adapter->max_tx_sgl_size = calc_queue_ctx.max_tx_sgl_size;
 	adapter->max_rx_sgl_size = calc_queue_ctx.max_rx_sgl_size;
 
 	adapter->max_num_io_queues = max_num_io_queues;
 
 	adapter->buf_ring_size = ENA_DEFAULT_BUF_RING_SIZE;
 
 	adapter->max_mtu = get_feat_ctx.dev_attr.max_mtu;
 
 	adapter->reset_reason = ENA_REGS_RESET_NORMAL;
 
 	/* set up dma tags for rx and tx buffers */
 	rc = ena_setup_tx_dma_tag(adapter);
 	if (unlikely(rc != 0)) {
 		ena_log(pdev, ERR, "Failed to create TX DMA tag\n");
 		goto err_com_free;
 	}
 
 	rc = ena_setup_rx_dma_tag(adapter);
 	if (unlikely(rc != 0)) {
 		ena_log(pdev, ERR, "Failed to create RX DMA tag\n");
 		goto err_tx_tag_free;
 	}
 
 	/*
 	 * The amount of requested MSIX vectors is equal to
 	 * adapter::max_num_io_queues (see `ena_enable_msix()`), plus a constant
 	 * number of admin queue interrupts. The former is initially determined
 	 * by HW capabilities (see `ena_calc_max_io_queue_num())` but may not be
 	 * achieved if there are not enough system resources. By default, the
 	 * number of effectively used IO queues is the same but later on it can
 	 * be limited by the user using sysctl interface.
 	 */
 	rc = ena_enable_msix_and_set_admin_interrupts(adapter);
 	if (unlikely(rc != 0)) {
 		ena_log(pdev, ERR,
 		    "Failed to enable and set the admin interrupts\n");
 		goto err_io_free;
 	}
 	/* By default all of allocated MSIX vectors are actively used */
 	adapter->num_io_queues = adapter->msix_vecs - ENA_ADMIN_MSIX_VEC;
 
 	/* initialize rings basic information */
 	ena_init_io_rings(adapter);
 
 	/* setup network interface */
 	rc = ena_setup_ifnet(pdev, adapter, &get_feat_ctx);
 	if (unlikely(rc != 0)) {
 		ena_log(pdev, ERR, "Error with network interface setup\n");
 		goto err_msix_free;
 	}
 
 	/* Initialize reset task queue */
 	TASK_INIT(&adapter->reset_task, 0, ena_reset_task, adapter);
 	adapter->reset_tq = taskqueue_create("ena_reset_enqueue",
 	    M_WAITOK | M_ZERO, taskqueue_thread_enqueue, &adapter->reset_tq);
 	taskqueue_start_threads(&adapter->reset_tq, 1, PI_NET, "%s rstq",
 	    device_get_nameunit(adapter->pdev));
 
 	/* Initialize metrics task queue */
 	TASK_INIT(&adapter->metrics_task, 0, ena_metrics_task, adapter);
 	adapter->metrics_tq = taskqueue_create("ena_metrics_enqueue",
 	    M_WAITOK | M_ZERO, taskqueue_thread_enqueue, &adapter->metrics_tq);
 	taskqueue_start_threads(&adapter->metrics_tq, 1, PI_NET, "%s metricsq",
 	    device_get_nameunit(adapter->pdev));
 
 	/* Initialize statistics */
 	ena_alloc_counters((counter_u64_t *)&adapter->dev_stats,
 	    sizeof(struct ena_stats_dev));
 	ena_alloc_counters((counter_u64_t *)&adapter->hw_stats,
 	    sizeof(struct ena_hw_stats));
 	ena_sysctl_add_nodes(adapter);
 
 #ifdef DEV_NETMAP
 	rc = ena_netmap_attach(adapter);
 	if (rc != 0) {
 		ena_log(pdev, ERR, "netmap attach failed: %d\n", rc);
 		goto err_detach;
 	}
 #endif /* DEV_NETMAP */
 
 	/* Tell the stack that the interface is not active */
 	if_setdrvflagbits(adapter->ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
 	ENA_FLAG_SET_ATOMIC(ENA_FLAG_DEVICE_RUNNING, adapter);
 
 	/* Run the timer service */
 	ENA_TIMER_RESET(adapter);
 
 	return (0);
 
 	/* Error unwinding: each label releases what was set up before it. */
 #ifdef DEV_NETMAP
 err_detach:
 	ether_ifdetach(adapter->ifp);
 #endif /* DEV_NETMAP */
 err_msix_free:
 	ena_com_dev_reset(adapter->ena_dev, ENA_REGS_RESET_INIT_ERR);
 	ena_free_mgmnt_irq(adapter);
 	ena_disable_msix(adapter);
 err_io_free:
 	ena_free_all_io_rings_resources(adapter);
 	ena_free_rx_dma_tag(adapter);
 err_tx_tag_free:
 	ena_free_tx_dma_tag(adapter);
 err_com_free:
 	ena_com_admin_destroy(ena_dev);
 	ena_com_delete_host_info(ena_dev);
 	ena_com_mmio_reg_read_request_destroy(ena_dev);
 err_bus_free:
 	free(ena_dev->bus, M_DEVBUF);
 err_pci_free:
 	ena_free_pci_resources(adapter);
 err_dev_free:
 	free(ena_dev, M_DEVBUF);
 
 	return (rc);
 }
 
 /**
  * ena_detach - Device Removal Routine
  * @pdev: device information struct
  *
  * ena_detach is called by the device subsystem to alert the driver
  * that it should release a PCI device.
  *
  * Returns EBUSY without touching the device when a VLAN trunk is still in
  * use on the interface; otherwise returns the result of
  * bus_generic_detach() after everything owned by the adapter was released.
  **/
 static int
 ena_detach(device_t pdev)
 {
 	struct ena_adapter *adapter = device_get_softc(pdev);
 	struct ena_com_dev *ena_dev = adapter->ena_dev;
 	int rc;
 
 	/* Make sure VLANS are not using driver */
 	if (if_vlantrunkinuse(adapter->ifp)) {
 		ena_log(adapter->pdev, ERR, "VLAN is in use, detach first\n");
 		return (EBUSY);
 	}
 
 	ether_ifdetach(adapter->ifp);
 
 	/* Stop timer service */
 	ENA_LOCK_LOCK();
 	ENA_TIMER_DRAIN(adapter);
 	ENA_LOCK_UNLOCK();
 
 	/*
 	 * Release metrics task: as long as cancel reports a non-zero result,
 	 * drain the task and try again.
 	 */
 	while (taskqueue_cancel(adapter->metrics_tq, &adapter->metrics_task, NULL))
 		taskqueue_drain(adapter->metrics_tq, &adapter->metrics_task);
 	taskqueue_free(adapter->metrics_tq);
 
 	/* Release reset task, same cancel/drain scheme as above */
 	while (taskqueue_cancel(adapter->reset_tq, &adapter->reset_task, NULL))
 		taskqueue_drain(adapter->reset_tq, &adapter->reset_task);
 	taskqueue_free(adapter->reset_tq);
 
 	/* Bring the interface down and tear the device down gracefully. */
 	ENA_LOCK_LOCK();
 	ena_down(adapter);
 	ena_destroy_device(adapter, true);
 	ENA_LOCK_UNLOCK();
 
 	/* Restore unregistered sysctl queue nodes. */
 	ena_sysctl_update_queue_node_nb(adapter, adapter->num_io_queues,
 	    adapter->max_num_io_queues);
 
 #ifdef DEV_NETMAP
 	netmap_detach(adapter->ifp);
 #endif /* DEV_NETMAP */
 
 	ena_free_counters((counter_u64_t *)&adapter->hw_stats,
 	    sizeof(struct ena_hw_stats));
 	ena_free_counters((counter_u64_t *)&adapter->dev_stats,
 	    sizeof(struct ena_stats_dev));
 
 	/* A failure to free a DMA tag is only logged, detach continues. */
 	rc = ena_free_rx_dma_tag(adapter);
 	if (unlikely(rc != 0))
 		ena_log(adapter->pdev, WARN,
 		    "Unmapped RX DMA tag associations\n");
 
 	rc = ena_free_tx_dma_tag(adapter);
 	if (unlikely(rc != 0))
 		ena_log(adapter->pdev, WARN,
 		    "Unmapped TX DMA tag associations\n");
 
 	ena_free_irqs(adapter);
 
 	ena_free_pci_resources(adapter);
 
 	if (adapter->rss_indir != NULL)
 		free(adapter->rss_indir, M_DEVBUF);
 
 	if (likely(ENA_FLAG_ISSET(ENA_FLAG_RSS_ACTIVE, adapter)))
 		ena_com_rss_destroy(ena_dev);
 
 	ena_com_delete_host_info(ena_dev);
 
 	if_free(adapter->ifp);
 
 	/* Allocated in ena_attach(): the bus wrapper and ena_dev itself. */
 	free(ena_dev->bus, M_DEVBUF);
 
 	free(ena_dev, M_DEVBUF);
 
 	return (bus_generic_detach(pdev));
 }
 
 /******************************************************************************
  ******************************** AENQ Handlers *******************************
  *****************************************************************************/
 /**
  * ena_update_on_link_change:
  * AENQ handler for link change events. Tracks the link status in
  * ENA_FLAG_LINK_UP and notifies the network interface about it. A link up
  * notification is held back while ENA_FLAG_ONGOING_RESET is set.
  **/
 static void
 ena_update_on_link_change(void *adapter_data,
     struct ena_admin_aenq_entry *aenq_e)
 {
 	struct ena_adapter *adapter = (struct ena_adapter *)adapter_data;
 	struct ena_admin_aenq_link_change_desc *link_desc =
 	    (struct ena_admin_aenq_link_change_desc *)aenq_e;
 	if_t ifp = adapter->ifp;
 	int link_up;
 
 	link_up = link_desc->flags &
 	    ENA_ADMIN_AENQ_LINK_CHANGE_DESC_LINK_STATUS_MASK;
 
 	if (link_up == 0) {
 		ena_log(adapter->pdev, INFO, "link is DOWN\n");
 		if_link_state_change(ifp, LINK_STATE_DOWN);
 		ENA_FLAG_CLEAR_ATOMIC(ENA_FLAG_LINK_UP, adapter);
 		return;
 	}
 
 	ena_log(adapter->pdev, INFO, "link is UP\n");
 	ENA_FLAG_SET_ATOMIC(ENA_FLAG_LINK_UP, adapter);
 	if (!ENA_FLAG_ISSET(ENA_FLAG_ONGOING_RESET, adapter))
 		if_link_state_change(ifp, LINK_STATE_UP);
 }
 
 /*
  * AENQ handler for the ENA_ADMIN_NOTIFICATION group. The only syndrome
  * handled is ENA_ADMIN_UPDATE_HINTS, whose payload is passed on to
  * ena_update_hints(); any other syndrome is logged as an error.
  */
 static void
 ena_notification(void *adapter_data, struct ena_admin_aenq_entry *aenq_e)
 {
 	struct ena_adapter *adapter = (struct ena_adapter *)adapter_data;
 	struct ena_admin_ena_hw_hints *hw_hints;
 
 	ENA_WARN(aenq_e->aenq_common_desc.group != ENA_ADMIN_NOTIFICATION,
 	    adapter->ena_dev, "Invalid group(%x) expected %x\n",
 	    aenq_e->aenq_common_desc.group, ENA_ADMIN_NOTIFICATION);
 
 	if (aenq_e->aenq_common_desc.syndrome != ENA_ADMIN_UPDATE_HINTS) {
 		ena_log(adapter->pdev, ERR,
 		    "Invalid aenq notification link state %d\n",
 		    aenq_e->aenq_common_desc.syndrome);
 		return;
 	}
 
 	/* The hints are stored inline in the AENQ entry. */
 	hw_hints = (struct ena_admin_ena_hw_hints *)(&aenq_e->inline_data_w4);
 	ena_update_hints(adapter, hw_hints);
 }
 
 /*
  * Initialize the ENA lock. Registered through SYSINIT below, so it is not
  * tied to any particular adapter; the argument is unused.
  */
 static void
 ena_lock_init(void *arg)
 {
 	ENA_LOCK_INIT();
 }
 SYSINIT(ena_lock_init, SI_SUB_LOCK, SI_ORDER_FIRST, ena_lock_init, NULL);
 
 /*
  * Destroy the ENA lock. Counterpart of ena_lock_init(), registered through
  * SYSUNINIT below; the argument is unused.
  */
 static void
 ena_lock_uninit(void *arg)
 {
 	ENA_LOCK_DESTROY();
 }
 SYSUNINIT(ena_lock_uninit, SI_SUB_LOCK, SI_ORDER_FIRST, ena_lock_uninit, NULL);
 
 /**
  * Fallback AENQ handler. It is installed as the unimplemented_handler of
  * aenq_handlers and only logs that an unknown event, or an event without a
  * dedicated handler, was received.
  **/
 static void
 unimplemented_aenq_handler(void *adapter_data,
     struct ena_admin_aenq_entry *aenq_e)
 {
 	struct ena_adapter *sc = adapter_data;
 
 	ena_log(sc->pdev, ERR,
 	    "Unknown event was received or event with unimplemented handler\n");
 }
 
 /*
  * AENQ dispatch table handed to ena_com_admin_init(). Groups without an
  * entry of their own are served by unimplemented_aenq_handler.
  */
 static struct ena_aenq_handlers aenq_handlers = {
     .handlers = {
 	    [ENA_ADMIN_LINK_CHANGE] = ena_update_on_link_change,
 	    [ENA_ADMIN_NOTIFICATION] = ena_notification,
 	    [ENA_ADMIN_KEEP_ALIVE] = ena_keep_alive_wd,
     },
     .unimplemented_handler = unimplemented_aenq_handler
 };
 
 /*********************************************************************
  *  FreeBSD Device Interface Entry Points
  *********************************************************************/
 
 /* Device interface methods implemented by the ENA driver. */
 static device_method_t ena_methods[] = {
 	/* Device interface */
 	DEVMETHOD(device_probe, ena_probe),
 	DEVMETHOD(device_attach, ena_attach),
 	DEVMETHOD(device_detach, ena_detach),
 
 	DEVMETHOD_END
 };
 
 /*
  * Driver description: driver name, method table and the size of the
  * per-device softc (struct ena_adapter, obtained with device_get_softc()).
  */
 static driver_t ena_driver = {
 	"ena",
 	ena_methods,
 	sizeof(struct ena_adapter),
 };
 
 /* Register the driver on the pci bus and declare its module dependencies. */
 DRIVER_MODULE(ena, pci, ena_driver, 0, 0);
 /*
  * NOTE(review): the "- 1" presumably skips a terminating sentinel entry of
  * ena_vendor_info_array; confirm against the array definition.
  */
 MODULE_PNP_INFO("U16:vendor;U16:device", pci, ena, ena_vendor_info_array,
     nitems(ena_vendor_info_array) - 1);
 MODULE_DEPEND(ena, pci, 1, 1, 1);
 MODULE_DEPEND(ena, ether, 1, 1, 1);
 /* netmap is only a dependency when netmap support is compiled in. */
 #ifdef DEV_NETMAP
 MODULE_DEPEND(ena, netmap, 1, 1, 1);
 #endif /* DEV_NETMAP */
 
 /*********************************************************************/
diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_main.c b/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
index 84adef8398bb..ab0cf49c2e8a 100644
--- a/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
+++ b/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
@@ -1,5061 +1,5060 @@
 /*-
  * Copyright (c) 2015-2021 Mellanox Technologies. All rights reserved.
  * Copyright (c) 2022 NVIDIA corporation & affiliates.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include "opt_kern_tls.h"
 #include "opt_rss.h"
 #include "opt_ratelimit.h"
 
 #include <dev/mlx5/mlx5_en/en.h>
 
 #include <sys/eventhandler.h>
 #include <sys/sockio.h>
 #include <machine/atomic.h>
 
 #include <net/debugnet.h>
 
 /*
  * Forward declarations.  The two send tag callbacks are referenced by
  * the mlx5e_ul_snd_tag_sw table before they are defined.
  */
 static int mlx5e_get_wqe_sz(struct mlx5e_priv *priv, u32 *wqe_sz, u32 *nsegs);
 static if_snd_tag_query_t mlx5e_ul_snd_tag_query;
 static if_snd_tag_free_t mlx5e_ul_snd_tag_free;
 
 /*
  * Bundle of the per-queue parameter structures of one channel.
  * NOTE(review): going by the member types these are the receive queue,
  * send queue and the RX/TX completion queue parameters - confirm at the
  * point of use.
  */
 struct mlx5e_channel_param {
 	struct mlx5e_rq_param rq;
 	struct mlx5e_sq_param sq;
 	struct mlx5e_cq_param rx_cq;
 	struct mlx5e_cq_param tx_cq;
 };
 
 /*
  * One entry of the link mode tables below.  A subtype of zero marks an
  * unused (not populated) table slot.
  */
 struct media {
 	u32	subtype;	/* IFM_* media subtype */
 	u64	baudrate;	/* link speed, built with IF_Mbps()/IF_Gbps() */
 };
 
 /*
  * Media description for the non-extended link modes, indexed by the
  * MLX5E_* link mode number.  Slots without an initializer stay zeroed and
  * are treated as "no media" by the lookups in this file.
  */
 static const struct media mlx5e_mode_table[MLX5E_LINK_SPEEDS_NUMBER] =
 {
 	[MLX5E_1000BASE_CX_SGMII] = {
 		.subtype = IFM_1000_CX_SGMII,
 		.baudrate = IF_Mbps(1000ULL),
 	},
 	[MLX5E_1000BASE_KX] = {
 		.subtype = IFM_1000_KX,
 		.baudrate = IF_Mbps(1000ULL),
 	},
 	[MLX5E_10GBASE_CX4] = {
 		.subtype = IFM_10G_CX4,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_10GBASE_KX4] = {
 		.subtype = IFM_10G_KX4,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_10GBASE_KR] = {
 		.subtype = IFM_10G_KR,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_20GBASE_KR2] = {
 		.subtype = IFM_20G_KR2,
 		.baudrate = IF_Gbps(20ULL),
 	},
 	[MLX5E_40GBASE_CR4] = {
 		.subtype = IFM_40G_CR4,
 		.baudrate = IF_Gbps(40ULL),
 	},
 	[MLX5E_40GBASE_KR4] = {
 		.subtype = IFM_40G_KR4,
 		.baudrate = IF_Gbps(40ULL),
 	},
 	[MLX5E_56GBASE_R4] = {
 		.subtype = IFM_56G_R4,
 		.baudrate = IF_Gbps(56ULL),
 	},
 	[MLX5E_10GBASE_CR] = {
 		.subtype = IFM_10G_CR1,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_10GBASE_SR] = {
 		.subtype = IFM_10G_SR,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	/* shared ER/LR slot; mlx5e_update_carrier() refines it to LR */
 	[MLX5E_10GBASE_ER_LR] = {
 		.subtype = IFM_10G_ER,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_40GBASE_SR4] = {
 		.subtype = IFM_40G_SR4,
 		.baudrate = IF_Gbps(40ULL),
 	},
 	/* shared LR4/ER4 slot; mlx5e_update_carrier() refines it to ER4 */
 	[MLX5E_40GBASE_LR4_ER4] = {
 		.subtype = IFM_40G_LR4,
 		.baudrate = IF_Gbps(40ULL),
 	},
 	[MLX5E_100GBASE_CR4] = {
 		.subtype = IFM_100G_CR4,
 		.baudrate = IF_Gbps(100ULL),
 	},
 	[MLX5E_100GBASE_SR4] = {
 		.subtype = IFM_100G_SR4,
 		.baudrate = IF_Gbps(100ULL),
 	},
 	[MLX5E_100GBASE_KR4] = {
 		.subtype = IFM_100G_KR4,
 		.baudrate = IF_Gbps(100ULL),
 	},
 	[MLX5E_100GBASE_LR4] = {
 		.subtype = IFM_100G_LR4,
 		.baudrate = IF_Gbps(100ULL),
 	},
 	[MLX5E_100BASE_TX] = {
 		.subtype = IFM_100_TX,
 		.baudrate = IF_Mbps(100ULL),
 	},
 	[MLX5E_1000BASE_T] = {
 		.subtype = IFM_1000_T,
 		.baudrate = IF_Mbps(1000ULL),
 	},
 	[MLX5E_10GBASE_T] = {
 		.subtype = IFM_10G_T,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_25GBASE_CR] = {
 		.subtype = IFM_25G_CR,
 		.baudrate = IF_Gbps(25ULL),
 	},
 	[MLX5E_25GBASE_KR] = {
 		.subtype = IFM_25G_KR,
 		.baudrate = IF_Gbps(25ULL),
 	},
 	[MLX5E_25GBASE_SR] = {
 		.subtype = IFM_25G_SR,
 		.baudrate = IF_Gbps(25ULL),
 	},
 	[MLX5E_50GBASE_CR2] = {
 		.subtype = IFM_50G_CR2,
 		.baudrate = IF_Gbps(50ULL),
 	},
 	[MLX5E_50GBASE_KR2] = {
 		.subtype = IFM_50G_KR2,
 		.baudrate = IF_Gbps(50ULL),
 	},
 	[MLX5E_50GBASE_KR4] = {
 		.subtype = IFM_50G_KR4,
 		.baudrate = IF_Gbps(50ULL),
 	},
 };
 
 /*
  * Media description for the extended link modes, indexed first by the
  * extended link mode number and then by cable type.  The
  * MLX5E_CABLE_TYPE_UNKNOWN column doubles as the fallback entry that is
  * used when the cable specific slot is not populated (subtype == 0).
  */
 static const struct media mlx5e_ext_mode_table[MLX5E_EXT_LINK_SPEEDS_NUMBER][MLX5E_CABLE_TYPE_NUMBER] =
 {
 	/* 100 Mb/s SGMII */
 	[MLX5E_SGMII_100M][MLX5E_CABLE_TYPE_UNKNOWN] = {
 		.subtype = IFM_100_SGMII,
 		.baudrate = IF_Mbps(100),
 	},
 
 	/* 1 Gb/s */
 	[MLX5E_1000BASE_X_SGMII][MLX5E_CABLE_TYPE_UNKNOWN] = {
 		.subtype = IFM_1000_CX,
 		.baudrate = IF_Mbps(1000),
 	},
 	[MLX5E_1000BASE_X_SGMII][MLX5E_CABLE_TYPE_OPTICAL_MODULE] = {
 		.subtype = IFM_1000_SX,
 		.baudrate = IF_Mbps(1000),
 	},
 
 	/* 5 Gb/s */
 	[MLX5E_5GBASE_R][MLX5E_CABLE_TYPE_UNKNOWN] = {
 		.subtype = IFM_5000_KR,
 		.baudrate = IF_Mbps(5000),
 	},
 	[MLX5E_5GBASE_R][MLX5E_CABLE_TYPE_TWISTED_PAIR] = {
 		.subtype = IFM_5000_T,
 		.baudrate = IF_Mbps(5000),
 	},
 
 	/* 10 Gb/s */
 	[MLX5E_10GBASE_XFI_XAUI_1][MLX5E_CABLE_TYPE_UNKNOWN] = {
 		.subtype = IFM_10G_KR,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_10GBASE_XFI_XAUI_1][MLX5E_CABLE_TYPE_PASSIVE_COPPER] = {
 		.subtype = IFM_10G_CR1,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_10GBASE_XFI_XAUI_1][MLX5E_CABLE_TYPE_OPTICAL_MODULE] = {
 		.subtype = IFM_10G_SR,
 		.baudrate = IF_Gbps(10ULL),
 	},
 
 	/* 40 Gb/s */
 	[MLX5E_40GBASE_XLAUI_4_XLPPI_4][MLX5E_CABLE_TYPE_UNKNOWN] = {
 		.subtype = IFM_40G_KR4,
 		.baudrate = IF_Gbps(40ULL),
 	},
 	[MLX5E_40GBASE_XLAUI_4_XLPPI_4][MLX5E_CABLE_TYPE_PASSIVE_COPPER] = {
 		.subtype = IFM_40G_CR4,
 		.baudrate = IF_Gbps(40ULL),
 	},
 	[MLX5E_40GBASE_XLAUI_4_XLPPI_4][MLX5E_CABLE_TYPE_OPTICAL_MODULE] = {
 		.subtype = IFM_40G_SR4,
 		.baudrate = IF_Gbps(40ULL),
 	},
 
 	/* 25 Gb/s */
 	[MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_CABLE_TYPE_UNKNOWN] = {
 		.subtype = IFM_25G_KR,
 		.baudrate = IF_Gbps(25ULL),
 	},
 	[MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_CABLE_TYPE_PASSIVE_COPPER] = {
 		.subtype = IFM_25G_CR,
 		.baudrate = IF_Gbps(25ULL),
 	},
 	[MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_CABLE_TYPE_OPTICAL_MODULE] = {
 		.subtype = IFM_25G_SR,
 		.baudrate = IF_Gbps(25ULL),
 	},
 	[MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_CABLE_TYPE_TWISTED_PAIR] = {
 		.subtype = IFM_25G_T,
 		.baudrate = IF_Gbps(25ULL),
 	},
 
 	/* 50 Gb/s (50GAUI-2 / LAUI-2) */
 	[MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2][MLX5E_CABLE_TYPE_UNKNOWN] = {
 		.subtype = IFM_50G_KR2,
 		.baudrate = IF_Gbps(50ULL),
 	},
 	[MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2][MLX5E_CABLE_TYPE_PASSIVE_COPPER] = {
 		.subtype = IFM_50G_CR2,
 		.baudrate = IF_Gbps(50ULL),
 	},
 	[MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2][MLX5E_CABLE_TYPE_OPTICAL_MODULE] = {
 		.subtype = IFM_50G_SR2,
 		.baudrate = IF_Gbps(50ULL),
 	},
 
 	/* 50 Gb/s (50GAUI-1 / LAUI-1) */
 	[MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR][MLX5E_CABLE_TYPE_UNKNOWN] = {
 		.subtype = IFM_50G_KR_PAM4,
 		.baudrate = IF_Gbps(50ULL),
 	},
 	[MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR][MLX5E_CABLE_TYPE_PASSIVE_COPPER] = {
 		.subtype = IFM_50G_CP,
 		.baudrate = IF_Gbps(50ULL),
 	},
 	[MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR][MLX5E_CABLE_TYPE_OPTICAL_MODULE] = {
 		.subtype = IFM_50G_SR,
 		.baudrate = IF_Gbps(50ULL),
 	},
 
 	/* 100 Gb/s (CAUI-4) */
 	[MLX5E_CAUI_4_100GBASE_CR4_KR4][MLX5E_CABLE_TYPE_UNKNOWN] = {
 		.subtype = IFM_100G_KR4,
 		.baudrate = IF_Gbps(100ULL),
 	},
 	[MLX5E_CAUI_4_100GBASE_CR4_KR4][MLX5E_CABLE_TYPE_PASSIVE_COPPER] = {
 		.subtype = IFM_100G_CR4,
 		.baudrate = IF_Gbps(100ULL),
 	},
 	[MLX5E_CAUI_4_100GBASE_CR4_KR4][MLX5E_CABLE_TYPE_OPTICAL_MODULE] = {
 		.subtype = IFM_100G_SR4,
 		.baudrate = IF_Gbps(100ULL),
 	},
 
 	/* 100 Gb/s (100GAUI-1) */
 	[MLX5E_100GAUI_1_100GBASE_CR_KR][MLX5E_CABLE_TYPE_UNKNOWN] = {
 		.subtype = IFM_100G_KR_PAM4,
 		.baudrate = IF_Gbps(100ULL),
 	},
 	[MLX5E_100GAUI_1_100GBASE_CR_KR][MLX5E_CABLE_TYPE_PASSIVE_COPPER] = {
 		.subtype = IFM_100G_CR_PAM4,
 		.baudrate = IF_Gbps(100ULL),
 	},
 	[MLX5E_100GAUI_1_100GBASE_CR_KR][MLX5E_CABLE_TYPE_OPTICAL_MODULE] = {
 		.subtype = IFM_100G_SR2,	/* XXX */
 		.baudrate = IF_Gbps(100ULL),
 	},
 
 	/* 100 Gb/s (100GAUI-2) */
 	[MLX5E_100GAUI_2_100GBASE_CR2_KR2][MLX5E_CABLE_TYPE_UNKNOWN] = {
 		.subtype = IFM_100G_KR4,
 		.baudrate = IF_Gbps(100ULL),
 	},
 	[MLX5E_100GAUI_2_100GBASE_CR2_KR2][MLX5E_CABLE_TYPE_PASSIVE_COPPER] = {
 		.subtype = IFM_100G_CP2,
 		.baudrate = IF_Gbps(100ULL),
 	},
 	[MLX5E_100GAUI_2_100GBASE_CR2_KR2][MLX5E_CABLE_TYPE_OPTICAL_MODULE] = {
 		.subtype = IFM_100G_SR2,
 		.baudrate = IF_Gbps(100ULL),
 	},
 
 	/* 200 Gb/s (200GAUI-2) */
 	[MLX5E_200GAUI_2_200GBASE_CR2_KR2][MLX5E_CABLE_TYPE_UNKNOWN] = {
 		.subtype = IFM_200G_KR4_PAM4,	/* XXX */
 		.baudrate = IF_Gbps(200ULL),
 	},
 	[MLX5E_200GAUI_2_200GBASE_CR2_KR2][MLX5E_CABLE_TYPE_PASSIVE_COPPER] = {
 		.subtype = IFM_200G_CR4_PAM4,	/* XXX */
 		.baudrate = IF_Gbps(200ULL),
 	},
 	[MLX5E_200GAUI_2_200GBASE_CR2_KR2][MLX5E_CABLE_TYPE_OPTICAL_MODULE] = {
 		.subtype = IFM_200G_SR4,	/* XXX */
 		.baudrate = IF_Gbps(200ULL),
 	},
 
 	/* 200 Gb/s (200GAUI-4) */
 	[MLX5E_200GAUI_4_200GBASE_CR4_KR4][MLX5E_CABLE_TYPE_UNKNOWN] = {
 		.subtype = IFM_200G_KR4_PAM4,
 		.baudrate = IF_Gbps(200ULL),
 	},
 	[MLX5E_200GAUI_4_200GBASE_CR4_KR4][MLX5E_CABLE_TYPE_PASSIVE_COPPER] = {
 		.subtype = IFM_200G_CR4_PAM4,
 		.baudrate = IF_Gbps(200ULL),
 	},
 	[MLX5E_200GAUI_4_200GBASE_CR4_KR4][MLX5E_CABLE_TYPE_OPTICAL_MODULE] = {
 		.subtype = IFM_200G_SR4,
 		.baudrate = IF_Gbps(200ULL),
 	},
 
 	/* 400 Gb/s (400GAUI-8) */
 	[MLX5E_400GAUI_8][MLX5E_CABLE_TYPE_UNKNOWN] = {
 		.subtype = IFM_400G_LR8,	/* XXX */
 		.baudrate = IF_Gbps(400ULL),
 	},
 
 	/* 400 Gb/s (400GAUI-4) */
 	[MLX5E_400GAUI_4_400GBASE_CR4_KR4][MLX5E_CABLE_TYPE_UNKNOWN] = {
 		.subtype = IFM_400G_LR8,	/* XXX */
 		.baudrate = IF_Gbps(400ULL),
 	},
 };
 
 /*
  * Send tag method table for tags of type IF_SND_TAG_TYPE_UNLIMITED:
  * only the query and free callbacks are provided.
  */
 static const struct if_snd_tag_sw mlx5e_ul_snd_tag_sw = {
 	.snd_tag_query = mlx5e_ul_snd_tag_query,
 	.snd_tag_free = mlx5e_ul_snd_tag_free,
 	.type = IF_SND_TAG_TYPE_UNLIMITED
 };
 
 /* debugnet definitions for the "mlx5_en" driver prefix */
 DEBUGNET_DEFINE(mlx5_en);
 
 /* malloc(9) type M_MLX5EN, shown as "MLX5EN" / "MLX5 Ethernet" */
 MALLOC_DEFINE(M_MLX5EN, "MLX5EN", "MLX5 Ethernet");
 
 static void
 mlx5e_update_carrier(struct mlx5e_priv *priv)
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
 	u32 out[MLX5_ST_SZ_DW(ptys_reg)];
 	u32 eth_proto_oper;
 	int error;
 	u8 i;
 	u8 cable_type;
 	u8 port_state;
 	u8 is_er_type;
 	bool ext;
 	struct media media_entry = {};
 
 	port_state = mlx5_query_vport_state(mdev,
 	    MLX5_QUERY_VPORT_STATE_IN_OP_MOD_VNIC_VPORT, 0);
 
 	if (port_state == VPORT_STATE_UP) {
 		priv->media_status_last |= IFM_ACTIVE;
 	} else {
 		priv->media_status_last &= ~IFM_ACTIVE;
 		priv->media_active_last = IFM_ETHER;
 		if_link_state_change(priv->ifp, LINK_STATE_DOWN);
 		return;
 	}
 
 	error = mlx5_query_port_ptys(mdev, out, sizeof(out),
 	    MLX5_PTYS_EN, 1);
 	if (error) {
 		priv->media_active_last = IFM_ETHER;
 		if_setbaudrate(priv->ifp, 1);
 		mlx5_en_err(priv->ifp, "query port ptys failed: 0x%x\n",
 		    error);
 		return;
 	}
 
 	ext = MLX5_CAP_PCAM_FEATURE(mdev, ptys_extended_ethernet);
 	eth_proto_oper = MLX5_GET_ETH_PROTO(ptys_reg, out, ext,
 	    eth_proto_oper);
 
 	i = ilog2(eth_proto_oper);
 
 	if (ext) {
 		error = mlx5_query_pddr_cable_type(mdev, 1, &cable_type);
 		if (error != 0) {
 			/* use fallback entry */
 			media_entry = mlx5e_ext_mode_table[i][MLX5E_CABLE_TYPE_UNKNOWN];
 
 			mlx5_en_err(priv->ifp,
 			    "query port pddr failed: %d\n", error);
 		} else {
 			media_entry = mlx5e_ext_mode_table[i][cable_type];
 
 			/* check if we should use fallback entry */
 			if (media_entry.subtype == 0)
 				media_entry = mlx5e_ext_mode_table[i][MLX5E_CABLE_TYPE_UNKNOWN];
 		}
 	} else {
 		media_entry = mlx5e_mode_table[i];
 	}
 
 	if (media_entry.subtype == 0) {
 		mlx5_en_err(priv->ifp,
 		    "Could not find operational media subtype\n");
 		return;
 	}
 
 	switch (media_entry.subtype) {
 	case IFM_10G_ER:
 		error = mlx5_query_pddr_range_info(mdev, 1, &is_er_type);
 		if (error != 0) {
 			mlx5_en_err(priv->ifp,
 			    "query port pddr failed: %d\n", error);
 		}
 		if (error != 0 || is_er_type == 0)
 			media_entry.subtype = IFM_10G_LR;
 		break;
 	case IFM_40G_LR4:
 		error = mlx5_query_pddr_range_info(mdev, 1, &is_er_type);
 		if (error != 0) {
 			mlx5_en_err(priv->ifp,
 			    "query port pddr failed: %d\n", error);
 		}
 		if (error == 0 && is_er_type != 0)
 			media_entry.subtype = IFM_40G_ER4;
 		break;
 	}
 	priv->media_active_last = media_entry.subtype | IFM_ETHER | IFM_FDX;
 	if_setbaudrate(priv->ifp, media_entry.baudrate);
 
 	if_link_state_change(priv->ifp, LINK_STATE_UP);
 }
 
 /*
  * Fill in a media status request from the values cached in the softc of
  * 'dev': the status word is copied as is, while the active/current media
  * word is the cached active media with the RX/TX pause flags added for
  * whichever global pauseframe controls are enabled.
  */
 static void
 mlx5e_media_status(if_t dev, struct ifmediareq *ifmr)
 {
 	struct mlx5e_priv *priv;
 
 	priv = if_getsoftc(dev);
 
 	ifmr->ifm_status = priv->media_status_last;
 
 	ifmr->ifm_active = priv->media_active_last |
 	    (priv->params.rx_pauseframe_control ? IFM_ETH_RXPAUSE : 0) |
 	    (priv->params.tx_pauseframe_control ? IFM_ETH_TXPAUSE : 0);
 	/* the current media always mirrors the active media */
 	ifmr->ifm_current = ifmr->ifm_active;
 }
 
 /*
  * Translate an IFM_* media subtype into a bit mask of link modes.
  *
  * Every row of the selected table (extended or not, depending on 'ext')
  * whose subtype matches contributes MLX5E_PROT_MASK(row) to the result.
  * A subtype of zero, or one without any match, yields zero.
  */
 static u32
 mlx5e_find_link_mode(u32 subtype, bool ext)
 {
 	u32 mask = 0;
 	unsigned speed;
 	unsigned cable;
 
 	if (subtype == 0)
 		return (mask);
 
 	/*
 	 * The tables only list IFM_10G_ER and IFM_40G_LR4 for the shared
 	 * ER/LR and LR4/ER4 link modes, so search for those instead.
 	 */
 	if (subtype == IFM_10G_LR)
 		subtype = IFM_10G_ER;
 	else if (subtype == IFM_40G_ER4)
 		subtype = IFM_40G_LR4;
 
 	if (!ext) {
 		for (speed = 0; speed < MLX5E_LINK_SPEEDS_NUMBER; speed++) {
 			if (mlx5e_mode_table[speed].subtype == subtype)
 				mask |= MLX5E_PROT_MASK(speed);
 		}
 		return (mask);
 	}
 
 	for (speed = 0; speed < MLX5E_EXT_LINK_SPEEDS_NUMBER; speed++) {
 		for (cable = 0; cable < MLX5E_CABLE_TYPE_NUMBER; cable++) {
 			if (mlx5e_ext_mode_table[speed][cable].subtype == subtype)
 				mask |= MLX5E_PROT_MASK(speed);
 		}
 	}
 	return (mask);
 }
 
 /*
  * Push the four flow control settings cached in priv->params (global
  * RX/TX pauseframes and RX/TX priority flow control) to port 1 and return
  * the result of mlx5_set_port_pause_and_pfc() unchanged.
  * NOTE(review): callers in this file negate the result or combine it with
  * -E* values, so it is presumably a negative errno - confirm.
  */
 static int
 mlx5e_set_port_pause_and_pfc(struct mlx5e_priv *priv)
 {
 	return (mlx5_set_port_pause_and_pfc(priv->mdev, 1,
 	    priv->params.rx_pauseframe_control,
 	    priv->params.tx_pauseframe_control,
 	    priv->params.rx_priority_flow_control,
 	    priv->params.tx_priority_flow_control));
 }
 
 /*
  * Apply the priority flow control settings to the port.
  *
  * Returns -ENXIO when the device is gone and -EINVAL (after logging) when
  * global pauseframes are still enabled in either direction; otherwise the
  * result of mlx5e_set_port_pause_and_pfc().
  */
 static int
 mlx5e_set_port_pfc(struct mlx5e_priv *priv)
 {
 
 	if (priv->gone != 0)
 		return (-ENXIO);
 
 	/* PFC and global pauseframes are mutually exclusive */
 	if (priv->params.rx_pauseframe_control ||
 	    priv->params.tx_pauseframe_control) {
 		mlx5_en_err(priv->ifp,
 		    "Global pauseframes must be disabled before enabling PFC.\n");
 		return (-EINVAL);
 	}
 
 	return (mlx5e_set_port_pause_and_pfc(priv));
 }
 
 /*
  * Apply the media selection stored in priv->media.ifm_media to the port.
  *
  * The requested subtype is translated into a link mode mask and limited
  * to what the port reports as capable (IFM_AUTO selects every capable
  * mode).  The pause flags of the media word update the cached global
  * pauseframe settings, then the port is taken down, reprogrammed and -
  * only if the interface was opened - brought up again.
  *
  * Takes the priv lock unless the caller already holds it.
  * Returns 0 on success or a positive errno value.
  */
 static int
 mlx5e_media_change(if_t dev)
 {
 	struct mlx5e_priv *priv = if_getsoftc(dev);
 	struct mlx5_core_dev *mdev = priv->mdev;
 	u32 eth_proto_cap;
 	u32 link_mode;
 	u32 out[MLX5_ST_SZ_DW(ptys_reg)];
 	int was_opened;
 	int locked;
 	int error;
 	bool ext;
 
 	/* remember whether the lock was already held, to restore that state */
 	locked = PRIV_LOCKED(priv);
 	if (!locked)
 		PRIV_LOCK(priv);
 
 	if (IFM_TYPE(priv->media.ifm_media) != IFM_ETHER) {
 		error = EINVAL;
 		goto done;
 	}
 
 	error = mlx5_query_port_ptys(mdev, out, sizeof(out),
 	    MLX5_PTYS_EN, 1);
 	if (error != 0) {
 		mlx5_en_err(dev, "Query port media capability failed\n");
 		goto done;
 	}
 
 	ext = MLX5_CAP_PCAM_FEATURE(mdev, ptys_extended_ethernet);
 	link_mode = mlx5e_find_link_mode(IFM_SUBTYPE(priv->media.ifm_media), ext);
 
 	/* query supported capabilities */
 	eth_proto_cap = MLX5_GET_ETH_PROTO(ptys_reg, out, ext,
 	    eth_proto_capability);
 
 	/* check for autoselect */
 	if (IFM_SUBTYPE(priv->media.ifm_media) == IFM_AUTO) {
 		link_mode = eth_proto_cap;
 		if (link_mode == 0) {
 			mlx5_en_err(dev, "Port media capability is zero\n");
 			error = EINVAL;
 			goto done;
 		}
 	} else {
 		/* keep only the requested modes the port is capable of */
 		link_mode = link_mode & eth_proto_cap;
 		if (link_mode == 0) {
 			mlx5_en_err(dev, "Not supported link mode requested\n");
 			error = EINVAL;
 			goto done;
 		}
 	}
 	if (priv->media.ifm_media & (IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE)) {
 		/* check if PFC is enabled */
 		if (priv->params.rx_priority_flow_control ||
 		    priv->params.tx_priority_flow_control) {
 			mlx5_en_err(dev, "PFC must be disabled before enabling global pauseframes.\n");
 			error = EINVAL;
 			goto done;
 		}
 	}
 	/* update pauseframe control bits */
 	priv->params.rx_pauseframe_control =
 	    (priv->media.ifm_media & IFM_ETH_RXPAUSE) ? 1 : 0;
 	priv->params.tx_pauseframe_control =
 	    (priv->media.ifm_media & IFM_ETH_TXPAUSE) ? 1 : 0;
 
 	/* check if device is opened */
 	was_opened = test_bit(MLX5E_STATE_OPENED, &priv->state);
 
 	/*
 	 * reconfigure the hardware
 	 *
 	 * NOTE(review): the results of mlx5_set_port_status() and
 	 * mlx5_set_port_proto() are not checked; only the pause/PFC result
 	 * is returned, negated into a positive errno value.
 	 */
 	mlx5_set_port_status(mdev, MLX5_PORT_DOWN);
 	mlx5_set_port_proto(mdev, link_mode, MLX5_PTYS_EN, ext);
 	error = -mlx5e_set_port_pause_and_pfc(priv);
 	if (was_opened)
 		mlx5_set_port_status(mdev, MLX5_PORT_UP);
 
 done:
 	if (!locked)
 		PRIV_UNLOCK(priv);
 	return (error);
 }
 
 /*
  * Work handler for priv->update_carrier_work: refreshes the carrier
  * state under the priv lock, but only while the interface is opened.
  */
 static void
 mlx5e_update_carrier_work(struct work_struct *work)
 {
 	struct mlx5e_priv *priv;
 
 	/* recover the softc from the embedded work item */
 	priv = container_of(work, struct mlx5e_priv, update_carrier_work);
 
 	PRIV_LOCK(priv);
 	if (test_bit(MLX5E_STATE_OPENED, &priv->state)) {
 		mlx5e_update_carrier(priv);
 	}
 	PRIV_UNLOCK(priv);
 }
 
 /*
  * Per-counter extractors handed to the MLX5E_PCIE_*_COUNTERS_*() lists in
  * mlx5e_update_pcie_counters().  Only the third (field name) and sixth
  * (counter set name) arguments are used; both macros rely on the local
  * variables "s_debug" and "out" of the expanding function and already end
  * in a semicolon.
  */
 #define	MLX5E_PCIE_PERF_GET_64(a,b,c,d,e,f)    \
 	s_debug->c = MLX5_GET64(mpcnt_reg, out, counter_set.f.c);
 
 #define	MLX5E_PCIE_PERF_GET_32(a,b,c,d,e,f)    \
 	s_debug->c = MLX5_GET(mpcnt_reg, out, counter_set.f.c);
 
 /*
  * Read the PCIe counter groups (performance, timers and states, lane)
  * through the MPCNT register and store them in
  * priv->stats.port_stats_debug.  The first failing register access, or an
  * allocation failure, ends the update; counters read so far are kept.
  */
 static void
 mlx5e_update_pcie_counters(struct mlx5e_priv *priv)
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
 	struct mlx5e_port_stats_debug *s_debug = &priv->stats.port_stats_debug;
 	const unsigned sz = MLX5_ST_SZ_BYTES(mpcnt_reg);
 	void *out;
 	void *in;
 	int err;
 
 	/* allocate firmware request structures */
 	in = mlx5_vzalloc(sz);
 	out = mlx5_vzalloc(sz);
 	if (in == NULL || out == NULL)
 		goto free_out;
 
 	MLX5_SET(mpcnt_reg, in, grp, MLX5_PCIE_PERFORMANCE_COUNTERS_GROUP);
 	err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_MPCNT, 0, 0);
 	if (err != 0)
 		goto free_out;
 
 	/* each list applies the given extractor macro to its counters */
 	MLX5E_PCIE_PERFORMANCE_COUNTERS_64(MLX5E_PCIE_PERF_GET_64)
 	MLX5E_PCIE_PERFORMANCE_COUNTERS_32(MLX5E_PCIE_PERF_GET_32)
 
 	MLX5_SET(mpcnt_reg, in, grp, MLX5_PCIE_TIMERS_AND_STATES_COUNTERS_GROUP);
 	err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_MPCNT, 0, 0);
 	if (err != 0)
 		goto free_out;
 
 	MLX5E_PCIE_TIMERS_AND_STATES_COUNTERS_32(MLX5E_PCIE_PERF_GET_32)
 
 	MLX5_SET(mpcnt_reg, in, grp, MLX5_PCIE_LANE_COUNTERS_GROUP);
 	err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_MPCNT, 0, 0);
 	if (err != 0)
 		goto free_out;
 
 	MLX5E_PCIE_LANE_COUNTERS_32(MLX5E_PCIE_PERF_GET_32)
 
 free_out:
 	/* free firmware request structures */
 	kvfree(in);
 	kvfree(out);
 }
 
 /*
  * This function reads the physical port counters from the firmware
  * using a pre-defined layout defined by various MLX5E_PPORT_XXX()
  * macros. The output is converted from big-endian 64-bit values into
  * host endian ones and stored in the "priv->stats.pport" structure.
  * Counters classified as debug counters are stored in the
  * "priv->stats.port_stats_debug" structure instead.
  *
  * The destination arrays are filled through the running index "y", which
  * is deliberately carried over from one counter group to the next, so the
  * order of the groups below defines the layout of the arrays.
  *
  * NOTE(review): the results of the mlx5_core_access_reg() calls are not
  * checked; after a failed access the loop that follows converts whatever
  * "out" currently holds.
  */
 static void
 mlx5e_update_pport_counters(struct mlx5e_priv *priv)
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
 	struct mlx5e_pport_stats *s = &priv->stats.pport;
 	struct mlx5e_port_stats_debug *s_debug = &priv->stats.port_stats_debug;
 	u32 *in;
 	u32 *out;
 	const u64 *ptr;
 	unsigned sz = MLX5_ST_SZ_BYTES(ppcnt_reg);
 	unsigned x;
 	unsigned y;
 	unsigned z;
 
 	/* allocate firmware request structures */
 	in = mlx5_vzalloc(sz);
 	out = mlx5_vzalloc(sz);
 	if (in == NULL || out == NULL)
 		goto free_out;
 
 	/*
 	 * Get pointer to the 64-bit counter set which is located at a
 	 * fixed offset in the output firmware request structure:
 	 */
 	ptr = (const uint64_t *)MLX5_ADDR_OF(ppcnt_reg, out, counter_set);
 
 	MLX5_SET(ppcnt_reg, in, local_port, 1);
 
 	/* read IEEE802_3 counter group using predefined counter layout */
 	MLX5_SET(ppcnt_reg, in, grp, MLX5_IEEE_802_3_COUNTERS_GROUP);
 	mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
 	/* stored after the per-priority slots, which are filled last */
 	for (x = 0, y = MLX5E_PPORT_PER_PRIO_STATS_NUM;
 	     x != MLX5E_PPORT_IEEE802_3_STATS_NUM; x++, y++)
 		s->arg[y] = be64toh(ptr[x]);
 
 	/* read RFC2819 counter group using predefined counter layout */
 	MLX5_SET(ppcnt_reg, in, grp, MLX5_RFC_2819_COUNTERS_GROUP);
 	mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
 	for (x = 0; x != MLX5E_PPORT_RFC2819_STATS_NUM; x++, y++)
 		s->arg[y] = be64toh(ptr[x]);
 
 	/*
 	 * "x" is not reset here: the remaining RFC2819 counters of the
 	 * same firmware output become the first debug counters.
 	 */
 	for (y = 0; x != MLX5E_PPORT_RFC2819_STATS_NUM +
 	    MLX5E_PPORT_RFC2819_STATS_DEBUG_NUM; x++, y++)
 		s_debug->arg[y] = be64toh(ptr[x]);
 
 	/* read RFC2863 counter group using predefined counter layout */
 	MLX5_SET(ppcnt_reg, in, grp, MLX5_RFC_2863_COUNTERS_GROUP);
 	mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
 	for (x = 0; x != MLX5E_PPORT_RFC2863_STATS_DEBUG_NUM; x++, y++)
 		s_debug->arg[y] = be64toh(ptr[x]);
 
 	/* read physical layer stats counter group using predefined counter layout */
 	MLX5_SET(ppcnt_reg, in, grp, MLX5_PHYSICAL_LAYER_COUNTERS_GROUP);
 	mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
 	for (x = 0; x != MLX5E_PPORT_PHYSICAL_LAYER_STATS_DEBUG_NUM; x++, y++)
 		s_debug->arg[y] = be64toh(ptr[x]);
 
 	/* read Extended Ethernet counter group using predefined counter layout */
 	MLX5_SET(ppcnt_reg, in, grp, MLX5_ETHERNET_EXTENDED_COUNTERS_GROUP);
 	mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
 	for (x = 0; x != MLX5E_PPORT_ETHERNET_EXTENDED_STATS_DEBUG_NUM; x++, y++)
 		s_debug->arg[y] = be64toh(ptr[x]);
 
 	/* read Extended Statistical Group */
 	if (MLX5_CAP_GEN(mdev, pcam_reg) &&
 	    MLX5_CAP_PCAM_FEATURE(mdev, ppcnt_statistical_group) &&
 	    MLX5_CAP_PCAM_FEATURE(mdev, per_lane_error_counters)) {
 		/* read Extended Statistical counter group using predefined counter layout */
 		MLX5_SET(ppcnt_reg, in, grp, MLX5_PHYSICAL_LAYER_STATISTICAL_GROUP);
 		mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
 
 		for (x = 0; x != MLX5E_PPORT_STATISTICAL_DEBUG_NUM; x++, y++)
 			s_debug->arg[y] = be64toh(ptr[x]);
 	}
 
 	/* read PCIE counters */
 	mlx5e_update_pcie_counters(priv);
 
 	/* read per-priority counters */
 	MLX5_SET(ppcnt_reg, in, grp, MLX5_PER_PRIORITY_COUNTERS_GROUP);
 
 	/* iterate all the priorities */
 	for (y = z = 0; z != MLX5E_PPORT_PER_PRIO_STATS_NUM_PRIO; z++) {
 		MLX5_SET(ppcnt_reg, in, prio_tc, z);
 		mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
 
 		/* read per priority stats counter group using predefined counter layout */
 		for (x = 0; x != (MLX5E_PPORT_PER_PRIO_STATS_NUM /
 		    MLX5E_PPORT_PER_PRIO_STATS_NUM_PRIO); x++, y++)
 			s->arg[y] = be64toh(ptr[x]);
 	}
 
 free_out:
 	/* free firmware request structures */
 	kvfree(in);
 	kvfree(out);
 }
 
 /*
  * Refresh priv->stats.vport.rx_steer_missed_packets from the
  * QUERY_VNIC_ENV command.  Nothing is done when the device lacks the
  * nic_receive_steering_discard capability, and the counter keeps its
  * previous value when the command fails.
  */
 static void
 mlx5e_grp_vnic_env_update_stats(struct mlx5e_priv *priv)
 {
 	u32 in[MLX5_ST_SZ_DW(query_vnic_env_in)] = {};
 	u32 out[MLX5_ST_SZ_DW(query_vnic_env_out)] = {};
 	int err;
 
 	if (!MLX5_CAP_GEN(priv->mdev, nic_receive_steering_discard))
 		return;
 
 	/* build the request for the local vport */
 	MLX5_SET(query_vnic_env_in, in, opcode,
 	    MLX5_CMD_OP_QUERY_VNIC_ENV);
 	MLX5_SET(query_vnic_env_in, in, op_mod, 0);
 	MLX5_SET(query_vnic_env_in, in, other_vport, 0);
 
 	err = mlx5_cmd_exec(priv->mdev, in, sizeof(in), out, sizeof(out));
 	if (err == 0) {
 		priv->stats.vport.rx_steer_missed_packets =
 		    MLX5_GET64(query_vnic_env_out, out,
 		    vport_env.nic_receive_steering_discard);
 	}
 }
 
 /*
  * This function is called regularly to collect all statistics
  * counters from the firmware. The values can be viewed through the
  * sysctl interface. Execution is serialized using the priv's global
  * configuration lock.
  *
  * The software counters of all channels (and of the rate-limit queues
  * when built with RATELIMIT) are summed up first, then the vport and
  * physical port counters are queried.  When the output buffer cannot be
  * allocated all counter updates are skipped; the diagnostics, FEC and
  * temperature updates at the end are done in either case.
  */
 static void
 mlx5e_update_stats_locked(struct mlx5e_priv *priv)
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
 	struct mlx5e_vport_stats *s = &priv->stats.vport;
 	struct mlx5e_sq_stats *sq_stats;
 	u32 in[MLX5_ST_SZ_DW(query_vport_counter_in)];
 	u32 *out;
 	int outlen = MLX5_ST_SZ_BYTES(query_vport_counter_out);
 	u64 tso_packets = 0;
 	u64 tso_bytes = 0;
 	u64 tx_queue_dropped = 0;
 	u64 tx_defragged = 0;
 	u64 tx_offload_none = 0;
 	u64 lro_packets = 0;
 	u64 lro_bytes = 0;
 	u64 sw_lro_queued = 0;
 	u64 sw_lro_flushed = 0;
 	u64 rx_csum_none = 0;
 	u64 rx_wqe_err = 0;
 	u64 rx_packets = 0;
 	u64 rx_bytes = 0;
 	u64 rx_decrypted_error = 0;
 	u64 rx_decrypted_ok = 0;
 	u32 rx_out_of_buffer = 0;
 	int error;
 	int i;
 	int j;
 
 	out = mlx5_vzalloc(outlen);
 	if (out == NULL)
 		goto free_out;
 
 	/* Collect first the SW counters and then HW for consistency */
 	for (i = 0; i < priv->params.num_channels; i++) {
 		struct mlx5e_channel *pch = priv->channel + i;
 		struct mlx5e_rq *rq = &pch->rq;
 		struct mlx5e_rq_stats *rq_stats = &pch->rq.stats;
 
 		/* collect stats from LRO */
 		rq_stats->sw_lro_queued = rq->lro.lro_queued;
 		rq_stats->sw_lro_flushed = rq->lro.lro_flushed;
 		sw_lro_queued += rq_stats->sw_lro_queued;
 		sw_lro_flushed += rq_stats->sw_lro_flushed;
 		lro_packets += rq_stats->lro_packets;
 		lro_bytes += rq_stats->lro_bytes;
 		rx_csum_none += rq_stats->csum_none;
 		rx_wqe_err += rq_stats->wqe_err;
 		rx_packets += rq_stats->packets;
 		rx_bytes += rq_stats->bytes;
 		rx_decrypted_error += rq_stats->decrypted_error_packets;
 		rx_decrypted_ok += rq_stats->decrypted_ok_packets;
 
 		/* one send queue per traffic class */
 		for (j = 0; j < priv->num_tc; j++) {
 			sq_stats = &pch->sq[j].stats;
 
 			tso_packets += sq_stats->tso_packets;
 			tso_bytes += sq_stats->tso_bytes;
 			/* both "dropped" and "enobuf" count as queue drops */
 			tx_queue_dropped += sq_stats->dropped;
 			tx_queue_dropped += sq_stats->enobuf;
 			tx_defragged += sq_stats->defragged;
 			tx_offload_none += sq_stats->csum_offload_none;
 		}
 	}
 
 #ifdef RATELIMIT
 	/* Collect statistics from all rate-limit queues */
 	for (j = 0; j < priv->rl.param.tx_worker_threads_def; j++) {
 		struct mlx5e_rl_worker *rlw = priv->rl.workers + j;
 
 		for (i = 0; i < priv->rl.param.tx_channels_per_worker_def; i++) {
 			struct mlx5e_rl_channel *channel = rlw->channels + i;
 			struct mlx5e_sq *sq = channel->sq;
 
 			/* skip channels without a send queue */
 			if (sq == NULL)
 				continue;
 
 			sq_stats = &sq->stats;
 
 			tso_packets += sq_stats->tso_packets;
 			tso_bytes += sq_stats->tso_bytes;
 			tx_queue_dropped += sq_stats->dropped;
 			tx_queue_dropped += sq_stats->enobuf;
 			tx_defragged += sq_stats->defragged;
 			tx_offload_none += sq_stats->csum_offload_none;
 		}
 	}
 #endif
 
 	/* update counters */
 	s->tso_packets = tso_packets;
 	s->tso_bytes = tso_bytes;
 	s->tx_queue_dropped = tx_queue_dropped;
 	s->tx_defragged = tx_defragged;
 	s->lro_packets = lro_packets;
 	s->lro_bytes = lro_bytes;
 	s->sw_lro_queued = sw_lro_queued;
 	s->sw_lro_flushed = sw_lro_flushed;
 	s->rx_csum_none = rx_csum_none;
 	s->rx_wqe_err = rx_wqe_err;
 	s->rx_packets = rx_packets;
 	s->rx_bytes = rx_bytes;
 	s->rx_decrypted_error_packets = rx_decrypted_error;
 	s->rx_decrypted_ok_packets = rx_decrypted_ok;
 
 	mlx5e_grp_vnic_env_update_stats(priv);
 
 	/* HW counters */
 	memset(in, 0, sizeof(in));
 
 	MLX5_SET(query_vport_counter_in, in, opcode,
 	    MLX5_CMD_OP_QUERY_VPORT_COUNTER);
 	MLX5_SET(query_vport_counter_in, in, op_mod, 0);
 	MLX5_SET(query_vport_counter_in, in, other_vport, 0);
 
 	memset(out, 0, outlen);
 
 	/* get number of out-of-buffer drops first */
 	if (test_bit(MLX5E_STATE_OPENED, &priv->state) != 0 &&
 	    mlx5_vport_query_out_of_rx_buffer(mdev, priv->counter_set_id,
 	    &rx_out_of_buffer) == 0) {
 		s->rx_out_of_buffer = rx_out_of_buffer;
 	}
 
 	/* get port statistics */
 	if (mlx5_cmd_exec(mdev, in, sizeof(in), out, outlen) == 0) {
 #define	MLX5_GET_CTR(out, x) \
 	MLX5_GET64(query_vport_counter_out, out, x)
 
 		s->rx_error_packets =
 		    MLX5_GET_CTR(out, received_errors.packets);
 		s->rx_error_bytes =
 		    MLX5_GET_CTR(out, received_errors.octets);
 		s->tx_error_packets =
 		    MLX5_GET_CTR(out, transmit_errors.packets);
 		s->tx_error_bytes =
 		    MLX5_GET_CTR(out, transmit_errors.octets);
 
 		s->rx_unicast_packets =
 		    MLX5_GET_CTR(out, received_eth_unicast.packets);
 		s->rx_unicast_bytes =
 		    MLX5_GET_CTR(out, received_eth_unicast.octets);
 		s->tx_unicast_packets =
 		    MLX5_GET_CTR(out, transmitted_eth_unicast.packets);
 		s->tx_unicast_bytes =
 		    MLX5_GET_CTR(out, transmitted_eth_unicast.octets);
 
 		s->rx_multicast_packets =
 		    MLX5_GET_CTR(out, received_eth_multicast.packets);
 		s->rx_multicast_bytes =
 		    MLX5_GET_CTR(out, received_eth_multicast.octets);
 		s->tx_multicast_packets =
 		    MLX5_GET_CTR(out, transmitted_eth_multicast.packets);
 		s->tx_multicast_bytes =
 		    MLX5_GET_CTR(out, transmitted_eth_multicast.octets);
 
 		s->rx_broadcast_packets =
 		    MLX5_GET_CTR(out, received_eth_broadcast.packets);
 		s->rx_broadcast_bytes =
 		    MLX5_GET_CTR(out, received_eth_broadcast.octets);
 		s->tx_broadcast_packets =
 		    MLX5_GET_CTR(out, transmitted_eth_broadcast.packets);
 		s->tx_broadcast_bytes =
 		    MLX5_GET_CTR(out, transmitted_eth_broadcast.octets);
 
 		/* TX totals are the sum of the three HW cast counters */
 		s->tx_packets = s->tx_unicast_packets +
 		    s->tx_multicast_packets + s->tx_broadcast_packets;
 		s->tx_bytes = s->tx_unicast_bytes + s->tx_multicast_bytes +
 		    s->tx_broadcast_bytes;
 
 		/* Update calculated offload counters */
 		s->tx_csum_offload = s->tx_packets - tx_offload_none;
 		s->rx_csum_good = s->rx_packets - s->rx_csum_none;
 	}
 
 	/* Get physical port counters */
 	mlx5e_update_pport_counters(priv);
 
 	/* jumbo packets: sum of the TX size buckets starting at 1519 octets */
 	s->tx_jumbo_packets =
 	    priv->stats.port_stats_debug.tx_stat_p1519to2047octets +
 	    priv->stats.port_stats_debug.tx_stat_p2048to4095octets +
 	    priv->stats.port_stats_debug.tx_stat_p4096to8191octets +
 	    priv->stats.port_stats_debug.tx_stat_p8192to10239octets;
 
 free_out:
 	kvfree(out);
 
 	/* Update diagnostics, if any */
 	if (priv->params_ethtool.diag_pci_enable ||
 	    priv->params_ethtool.diag_general_enable) {
 		error = mlx5_core_get_diagnostics_full(mdev,
 		    priv->params_ethtool.diag_pci_enable ? &priv->params_pci : NULL,
 		    priv->params_ethtool.diag_general_enable ? &priv->params_general : NULL);
 		if (error != 0)
 			mlx5_en_err(priv->ifp,
 			    "Failed reading diagnostics: %d\n", error);
 	}
 
 	/* Update FEC, if any */
 	error = mlx5e_fec_update(priv);
 	if (error != 0 && error != EOPNOTSUPP) {
 		mlx5_en_err(priv->ifp,
 		    "Updating FEC failed: %d\n", error);
 	}
 
 	/* Update temperature, if any */
 	if (priv->params_ethtool.hw_num_temp != 0) {
 		error = mlx5e_hw_temperature_update(priv);
 		if (error != 0 && error != EOPNOTSUPP) {
 			mlx5_en_err(priv->ifp,
 			    "Updating temperature failed: %d\n", error);
 		}
 	}
 }
 
 /*
  * Work handler for priv->update_stats_work: collects the statistics under
  * the priv lock, provided that the interface is opened and the core
  * device is not in the teardown state.
  */
 static void
 mlx5e_update_stats_work(struct work_struct *work)
 {
 	struct mlx5e_priv *priv = container_of(work, struct mlx5e_priv,
 	    update_stats_work);
 
 	PRIV_LOCK(priv);
 	if (test_bit(MLX5E_STATE_OPENED, &priv->state) != 0) {
 		if (!test_bit(MLX5_INTERFACE_STATE_TEARDOWN,
 		    &priv->mdev->intf_state))
 			mlx5e_update_stats_locked(priv);
 	}
 	PRIV_UNLOCK(priv);
 }
 
 /*
  * Callout handler of priv->watchdog ('arg' is the priv): defers the
  * statistics collection to the priv work queue and re-arms itself to run
  * again after hz / 4 ticks.
  */
 static void
 mlx5e_update_stats(void *arg)
 {
 	struct mlx5e_priv *priv = arg;
 
 	queue_work(priv->wq, &priv->update_stats_work);
 
 	callout_reset(&priv->watchdog, hz / 4, &mlx5e_update_stats, priv);
 }
 
 /*
  * React to an asynchronous device event: port up and port down events
  * schedule a carrier update on the priv work queue, every other event is
  * ignored.
  */
 static void
 mlx5e_async_event_sub(struct mlx5e_priv *priv,
     enum mlx5_dev_event event)
 {
 
 	if (event == MLX5_DEV_EVENT_PORT_UP ||
 	    event == MLX5_DEV_EVENT_PORT_DOWN)
 		queue_work(priv->wq, &priv->update_carrier_work);
 }
 
 /*
  * Asynchronous event entry point ('vpriv' is the priv).  Events are only
  * processed while MLX5E_STATE_ASYNC_EVENTS_ENABLE is set; the check and
  * the processing are done under async_events_mtx.  The 'mdev' and 'param'
  * arguments are not used.
  */
 static void
 mlx5e_async_event(struct mlx5_core_dev *mdev, void *vpriv,
     enum mlx5_dev_event event, unsigned long param)
 {
 	struct mlx5e_priv *priv = vpriv;
 
 	mtx_lock(&priv->async_events_mtx);
 	if (test_bit(MLX5E_STATE_ASYNC_EVENTS_ENABLE, &priv->state))
 		mlx5e_async_event_sub(priv, event);
 	mtx_unlock(&priv->async_events_mtx);
 }
 
 /*
  * Allow mlx5e_async_event() to process events by setting
  * MLX5E_STATE_ASYNC_EVENTS_ENABLE.  Unlike the disable path, the bit is
  * set without taking async_events_mtx.
  */
 static void
 mlx5e_enable_async_events(struct mlx5e_priv *priv)
 {
 	set_bit(MLX5E_STATE_ASYNC_EVENTS_ENABLE, &priv->state);
 }
 
 /*
  * Stop mlx5e_async_event() from processing further events.  The bit is
  * cleared under async_events_mtx, the same mutex the event handler holds
  * while it checks the bit and processes an event.
  */
 static void
 mlx5e_disable_async_events(struct mlx5e_priv *priv)
 {
 	mtx_lock(&priv->async_events_mtx);
 	clear_bit(MLX5E_STATE_ASYNC_EVENTS_ENABLE, &priv->state);
 	mtx_unlock(&priv->async_events_mtx);
 }
 
 static void mlx5e_calibration_callout(void *arg);
 /*
  * Timestamp calibration tunables, exported below as hw.mlx5.calibr.*.
  * mlx5e_reset_calibration_callout() multiplies the two intervals by
  * SBT_1S, i.e. they are given in seconds, and compares the duration
  * against priv->clbr_done.
  */
 static int mlx5e_calibration_duration = 20;
 static int mlx5e_fast_calibration = 1;
 static int mlx5e_normal_calibration = 30;
 
 static SYSCTL_NODE(_hw_mlx5, OID_AUTO, calibr, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "MLX5 timestamp calibration parameters");
 
 SYSCTL_INT(_hw_mlx5_calibr, OID_AUTO, duration, CTLFLAG_RWTUN,
     &mlx5e_calibration_duration, 0,
     "Duration of initial calibration");
 SYSCTL_INT(_hw_mlx5_calibr, OID_AUTO, fast, CTLFLAG_RWTUN,
     &mlx5e_fast_calibration, 0,
     "Recalibration interval during initial calibration");
 SYSCTL_INT(_hw_mlx5_calibr, OID_AUTO, normal, CTLFLAG_RWTUN,
     &mlx5e_normal_calibration, 0,
     "Recalibration interval during normal operations");
 
 /*
  * Ignites the calibration process.
  *
  * When no calibration round has completed yet, the calibration
  * handler is run directly.  Otherwise the calibration callout is
  * armed, using the fast interval while the initial calibration is
  * still in progress and the normal interval afterwards.
  */
 static void
 mlx5e_reset_calibration_callout(struct mlx5e_priv *priv)
 {
 	int interval;
 
 	if (priv->clbr_done == 0) {
 		mlx5e_calibration_callout(priv);
 		return;
 	}
 
 	if (priv->clbr_done < mlx5e_calibration_duration)
 		interval = mlx5e_fast_calibration;
 	else
 		interval = mlx5e_normal_calibration;
 
 	callout_reset_sbt_curcpu(&priv->tstmp_clbr, interval * SBT_1S, 0,
 	    mlx5e_calibration_callout, priv, C_DIRECT_EXEC);
 }
 
 /*
  * Flatten a timespec into a single 64-bit count.
  *
  * NOTE: despite the "usec" in the name, the seconds are scaled by
  * 10^9 and tv_nsec is added unscaled, so the value returned is in
  * nanoseconds.
  */
 static uint64_t
 mlx5e_timespec2usec(const struct timespec *ts)
 {
 	uint64_t total;
 
 	total = (uint64_t)ts->tv_sec * 1000000000;
 	total += ts->tv_nsec;
 	return (total);
 }
 
 static uint64_t
 mlx5e_hw_clock(struct mlx5e_priv *priv)
 {
 	struct mlx5_init_seg *iseg;
 	uint32_t hw_h, hw_h1, hw_l;
 
 	iseg = priv->mdev->iseg;
 	do {
 		hw_h = ioread32be(&iseg->internal_timer_h);
 		hw_l = ioread32be(&iseg->internal_timer_l);
 		hw_h1 = ioread32be(&iseg->internal_timer_h);
 	} while (hw_h1 != hw_h);
 	return (((uint64_t)hw_h << 32) | hw_l);
 }
 
 /*
  * The calibration callout, it runs either in the context of the
  * thread which enables calibration, or in callout.  It takes the
  * snapshot of system and adapter clocks, then advances the pointers to
  * the calibration point to allow rx path to read the consistent data
  * lockless.
  *
  * The calibration points form a ring, priv->clbr_points[].  The slot
  * following the current one is filled in and only then published by
  * storing a new non-zero generation into it.  If the adapter clock did
  * not advance, the function returns without re-arming the callout.
  */
 static void
 mlx5e_calibration_callout(void *arg)
 {
 	struct mlx5e_priv *priv;
 	struct mlx5e_clbr_point *next, *curr;
 	struct timespec ts;
 	int clbr_curr_next;
 
 	priv = arg;
 	/* locate the current slot and the next one, wrapping around */
 	curr = &priv->clbr_points[priv->clbr_curr];
 	clbr_curr_next = priv->clbr_curr + 1;
 	if (clbr_curr_next >= nitems(priv->clbr_points))
 		clbr_curr_next = 0;
 	next = &priv->clbr_points[clbr_curr_next];
 
 	/* the current sample becomes the "previous" sample of the next slot */
 	next->base_prev = curr->base_curr;
 	next->clbr_hw_prev = curr->clbr_hw_curr;
 
 	next->clbr_hw_curr = mlx5e_hw_clock(priv);
 	/*
 	 * Treat the adapter clock as frozen when it did not advance by
 	 * at least 2^MLX5E_TSTMP_PREC since the current sample.
 	 */
 	if (((next->clbr_hw_curr - curr->clbr_hw_curr) >> MLX5E_TSTMP_PREC) ==
 	    0) {
 		if (priv->clbr_done != 0) {
 			/*
 			 * NOTE(review): the comparison above uses
 			 * curr->clbr_hw_curr, while the message prints
 			 * curr->clbr_hw_prev - confirm this is intended.
 			 */
 			mlx5_en_err(priv->ifp,
 			    "HW failed tstmp frozen %#jx %#jx, disabling\n",
 			     next->clbr_hw_curr, curr->clbr_hw_prev);
 			priv->clbr_done = 0;
 		}
 		/* zero the generation of the current slot and stop here */
 		atomic_store_rel_int(&curr->clbr_gen, 0);
 		return;
 	}
 
 	nanouptime(&ts);
 	next->base_curr = mlx5e_timespec2usec(&ts);
 
 	/*
 	 * Publish the new point: zero the old generation, switch the
 	 * current index and finally store the new generation with
 	 * release semantics.
 	 */
 	curr->clbr_gen = 0;
 	atomic_thread_fence_rel();
 	priv->clbr_curr = clbr_curr_next;
 	atomic_store_rel_int(&next->clbr_gen, ++(priv->clbr_gen));
 
 	/* count completed rounds, saturating at the configured duration */
 	if (priv->clbr_done < mlx5e_calibration_duration)
 		priv->clbr_done++;
 	mlx5e_reset_calibration_callout(priv);
 }
 
 /*
  * String table for the per-RQ statistics, produced by expanding
  * MLX5E_RQ_STATS() with MLX5E_STATS_DESC.  It is passed to
  * mlx5e_create_stats() together with MLX5E_RQ_STATS_NUM.
  */
 static const char *mlx5e_rq_stats_desc[] = {
 	MLX5E_RQ_STATS(MLX5E_STATS_DESC)
 };
 
 /*
  * Allocate the software resources of a receive queue: the busdma tag,
  * the linked-list work queue, the LRO state, the per-WQE mbuf array
  * with one DMA map per entry, the dynamic interrupt moderation mode
  * and the "rxstat<ix>" sysctl statistics.
  *
  * Returns zero on success.  On failure everything allocated so far is
  * released and a non-zero error is returned; the results of the
  * busdma and LRO calls are negated before being returned.
  */
 static int
 mlx5e_create_rq(struct mlx5e_channel *c,
     struct mlx5e_rq_param *param,
     struct mlx5e_rq *rq)
 {
 	struct mlx5e_priv *priv = c->priv;
 	struct mlx5_core_dev *mdev = priv->mdev;
 	char buffer[16];
 	void *rqc = param->rqc;
 	void *rqc_wq = MLX5_ADDR_OF(rqc, rqc, wq);
 	int wq_sz;
 	int err;
 	int i;
 	u32 nsegs, wqe_sz;
 
 	/* the segment count is needed to size the DMA tag */
 	err = mlx5e_get_wqe_sz(priv, &wqe_sz, &nsegs);
 	if (err != 0)
 		goto done;
 
 	/* Create DMA descriptor TAG */
 	if ((err = -bus_dma_tag_create(
 	    bus_get_dma_tag(mdev->pdev->dev.bsddev),
 	    1,				/* any alignment */
 	    0,				/* no boundary */
 	    BUS_SPACE_MAXADDR,		/* lowaddr */
 	    BUS_SPACE_MAXADDR,		/* highaddr */
 	    NULL, NULL,			/* filter, filterarg */
 	    nsegs * MLX5E_MAX_RX_BYTES,	/* maxsize */
 	    nsegs,			/* nsegments */
 	    nsegs * MLX5E_MAX_RX_BYTES,	/* maxsegsize */
 	    0,				/* flags */
 	    NULL, NULL,			/* lockfunc, lockfuncarg */
 	    &rq->dma_tag)))
 		goto done;
 
 	err = mlx5_wq_ll_create(mdev, &param->wq, rqc_wq, &rq->wq,
 	    &rq->wq_ctrl);
 	if (err)
 		goto err_free_dma_tag;
 
 	/* point the doorbell at the receive doorbell record */
 	rq->wq.db = &rq->wq.db[MLX5_RCV_DBR];
 
 	/* store the WQE size and segment count in the RQ itself */
 	err = mlx5e_get_wqe_sz(priv, &rq->wqe_sz, &rq->nsegs);
 	if (err != 0)
 		goto err_rq_wq_destroy;
 
 	wq_sz = mlx5_wq_ll_get_size(&rq->wq);
 
 	err = -tcp_lro_init_args(&rq->lro, priv->ifp, TCP_LRO_ENTRIES, wq_sz);
 	if (err)
 		goto err_rq_wq_destroy;
 
 	/* M_WAITOK allocation, hence no NULL check */
 	rq->mbuf = malloc_domainset(wq_sz * sizeof(rq->mbuf[0]), M_MLX5EN,
 	    mlx5_dev_domainset(mdev), M_WAITOK | M_ZERO);
 	for (i = 0; i != wq_sz; i++) {
 		struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, i);
 		int j;
 
 		err = -bus_dmamap_create(rq->dma_tag, 0, &rq->mbuf[i].dma_map);
 		if (err != 0) {
 			/* destroy the maps created by earlier iterations */
 			while (i--)
 				bus_dmamap_destroy(rq->dma_tag, rq->mbuf[i].dma_map);
 			goto err_rq_mbuf_free;
 		}
 
 		/* set value for constant fields */
 		for (j = 0; j < rq->nsegs; j++)
 			wqe->data[j].lkey = cpu_to_be32(priv->mr.key);
 	}
 
 	INIT_WORK(&rq->dim.work, mlx5e_dim_work);
 	/* moderation modes below 2 do not use dynamic moderation */
 	if (priv->params.rx_cq_moderation_mode < 2) {
 		rq->dim.mode = NET_DIM_CQ_PERIOD_MODE_DISABLED;
 	} else {
 		/* "param" is embedded in a channel param; fetch its RX CQ context */
 		void *cqc = container_of(param,
 		    struct mlx5e_channel_param, rq)->rx_cq.cqc;
 
 		switch (MLX5_GET(cqc, cqc, cq_period_mode)) {
 		case MLX5_CQ_PERIOD_MODE_START_FROM_EQE:
 			rq->dim.mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE;
 			break;
 		case MLX5_CQ_PERIOD_MODE_START_FROM_CQE:
 			rq->dim.mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE;
 			break;
 		default:
 			rq->dim.mode = NET_DIM_CQ_PERIOD_MODE_DISABLED;
 			break;
 		}
 	}
 
 	rq->ifp = priv->ifp;
 	rq->channel = c;
 	rq->ix = c->ix;
 
 	/* register the per-RQ statistics sysctls */
 	snprintf(buffer, sizeof(buffer), "rxstat%d", c->ix);
 	mlx5e_create_stats(&rq->stats.ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet),
 	    buffer, mlx5e_rq_stats_desc, MLX5E_RQ_STATS_NUM,
 	    rq->stats.arg);
 	return (0);
 
 	/* error unwinding, in reverse order of allocation */
 err_rq_mbuf_free:
 	free(rq->mbuf, M_MLX5EN);
 	tcp_lro_free(&rq->lro);
 err_rq_wq_destroy:
 	mlx5_wq_destroy(&rq->wq_ctrl);
 err_free_dma_tag:
 	bus_dma_tag_destroy(rq->dma_tag);
 done:
 	return (err);
 }
 
 /*
  * Release the software resources of a receive queue: the statistics
  * sysctls, the LRO state, any mbufs still attached to the ring along
  * with their DMA maps, the mbuf array, the work queue and the DMA
  * tag.
  */
 static void
 mlx5e_destroy_rq(struct mlx5e_rq *rq)
 {
 	int nent;
 	int slot;
 
 	/* tear down the sysctl nodes first */
 	sysctl_ctx_free(&rq->stats.ctx);
 
 	/* drop whatever LRO still holds */
 	tcp_lro_free(&rq->lro);
 
 	nent = mlx5_wq_ll_get_size(&rq->wq);
 	slot = 0;
 	while (slot != nent) {
 		/* unload and free the mbuf of a still populated slot */
 		if (rq->mbuf[slot].mbuf != NULL) {
 			bus_dmamap_unload(rq->dma_tag, rq->mbuf[slot].dma_map);
 			m_freem(rq->mbuf[slot].mbuf);
 		}
 		bus_dmamap_destroy(rq->dma_tag, rq->mbuf[slot].dma_map);
 		slot++;
 	}
 
 	free(rq->mbuf, M_MLX5EN);
 	mlx5_wq_destroy(&rq->wq_ctrl);
 	bus_dma_tag_destroy(rq->dma_tag);
 }
 
 static int
 mlx5e_enable_rq(struct mlx5e_rq *rq, struct mlx5e_rq_param *param)
 {
 	struct mlx5e_channel *c = rq->channel;
 	struct mlx5e_priv *priv = c->priv;
 	struct mlx5_core_dev *mdev = priv->mdev;
 	void *in;
 	void *rqc;
 	void *wq;
 	int inlen;
 	int err;
 	u8 ts_format;
 
 	inlen = MLX5_ST_SZ_BYTES(create_rq_in) +
 	    sizeof(u64) * rq->wq_ctrl.buf.npages;
 	in = mlx5_vzalloc(inlen);
 	if (in == NULL)
 		return (-ENOMEM);
 
 	ts_format = mlx5_get_rq_default_ts(mdev);
 	rqc = MLX5_ADDR_OF(create_rq_in, in, ctx);
 	wq = MLX5_ADDR_OF(rqc, rqc, wq);
 
 	memcpy(rqc, param->rqc, sizeof(param->rqc));
 
 	MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RST);
 	MLX5_SET(rqc, rqc, ts_format, ts_format);
 	MLX5_SET(rqc, rqc, flush_in_error_en, 1);
 	if (priv->counter_set_id >= 0)
 		MLX5_SET(rqc, rqc, counter_set_id, priv->counter_set_id);
 	MLX5_SET(wq, wq, log_wq_pg_sz, rq->wq_ctrl.buf.page_shift -
 	    MLX5_ADAPTER_PAGE_SHIFT);
 	MLX5_SET64(wq, wq, dbr_addr, rq->wq_ctrl.db.dma);
 
 	mlx5_fill_page_array(&rq->wq_ctrl.buf,
 	    (__be64 *) MLX5_ADDR_OF(wq, wq, pas));
 
 	err = mlx5_core_create_rq(mdev, in, inlen, &rq->rqn);
 
 	kvfree(in);
 
 	return (err);
 }
 
 /*
  * Issue a modify_rq command moving the RQ from "curr_state" to
  * "next_state".
  *
  * Returns -ENOMEM if the command buffer cannot be allocated, else the
  * result of mlx5_core_modify_rq().
  */
 static int
 mlx5e_modify_rq(struct mlx5e_rq *rq, int curr_state, int next_state)
 {
 	struct mlx5_core_dev *mdev = rq->channel->priv->mdev;
 	const int cmd_sz = MLX5_ST_SZ_BYTES(modify_rq_in);
 	void *cmd;
 	void *rqc;
 	int rc;
 
 	cmd = mlx5_vzalloc(cmd_sz);
 	if (cmd == NULL)
 		return (-ENOMEM);
 
 	rqc = MLX5_ADDR_OF(modify_rq_in, cmd, ctx);
 
 	/* identify the RQ and its current state, then set the new state */
 	MLX5_SET(modify_rq_in, cmd, rqn, rq->rqn);
 	MLX5_SET(modify_rq_in, cmd, rq_state, curr_state);
 	MLX5_SET(rqc, rqc, state, next_state);
 
 	rc = mlx5_core_modify_rq(mdev, cmd, cmd_sz);
 
 	kvfree(cmd);
 
 	return (rc);
 }
 
 /*
  * Destroy the RQ object identified by rq->rqn on the core device the
  * RQ's channel belongs to.
  */
 static void
 mlx5e_disable_rq(struct mlx5e_rq *rq)
 {
 
 	mlx5_core_destroy_rq(rq->channel->priv->mdev, rq->rqn);
 }
 
 /*
  * Poll until the RQ's work queue holds at least
  * priv->params.min_rx_wqes entries.  The check is made up to 1000
  * times with an msleep(4) after every unsuccessful check.
  *
  * Returns zero once the threshold is reached, else -ETIMEDOUT.
  */
 static int
 mlx5e_wait_for_min_rx_wqes(struct mlx5e_rq *rq)
 {
 	struct mlx5e_priv *priv = rq->channel->priv;
 	int attempts = 1000;
 
 	while (attempts-- > 0) {
 		if (rq->wq.cur_sz >= priv->params.min_rx_wqes)
 			return (0);
 
 		msleep(4);
 	}
 	return (-ETIMEDOUT);
 }
 
 static int
 mlx5e_open_rq(struct mlx5e_channel *c,
     struct mlx5e_rq_param *param,
     struct mlx5e_rq *rq)
 {
 	int err;
 
 	err = mlx5e_create_rq(c, param, rq);
 	if (err)
 		return (err);
 
 	/* set CQN in RQ parameters */
 	MLX5_SET(rqc, param->rqc, cqn, c->rq.cq.mcq.cqn);
 
 	err = mlx5e_enable_rq(rq, param);
 	if (err)
 		goto err_destroy_rq;
 
 	err = mlx5e_modify_rq(rq, MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY);
 	if (err)
 		goto err_disable_rq;
 
 	c->rq.enabled = 1;
 
 	return (0);
 
 err_disable_rq:
 	mlx5e_disable_rq(rq);
 err_destroy_rq:
 	mlx5e_destroy_rq(rq);
 
 	return (err);
 }
 
 /*
  * First stage of closing a receive queue: mark it disabled and stop
  * its watchdog callout under the RQ mutex, then request the RDY to
  * ERR state transition.  The result of mlx5e_modify_rq() is not
  * checked.
  */
 static void
 mlx5e_close_rq(struct mlx5e_rq *rq)
 {
 	mtx_lock(&rq->mtx);
 	rq->enabled = 0;
 	callout_stop(&rq->watchdog);
 	mtx_unlock(&rq->mtx);
 
 	mlx5e_modify_rq(rq, MLX5_RQC_STATE_RDY, MLX5_RQC_STATE_ERR);
 }
 
 /*
  * Second stage of closing a receive queue: destroy the RQ object,
  * close its completion queue, wait for any pending dynamic moderation
  * work to finish and finally free the software resources.
  */
 static void
 mlx5e_close_rq_wait(struct mlx5e_rq *rq)
 {
 
 	mlx5e_disable_rq(rq);
 	mlx5e_close_cq(&rq->cq);
 	cancel_work_sync(&rq->dim.work);
 	mlx5e_destroy_rq(rq);
 }
 
 /*
  * What is a drop RQ and why is it needed?
  *
  * The RSS indirection table, also called the RQT, selects the
  * destination RQ based on the receive queue number, RQN. The RQT is
  * frequently referred to by flow steering rules to distribute traffic
  * among multiple RQs. The problem is that the RQs cannot be destroyed
  * before the RQT referring to them is destroyed too. Further, TLS RX
  * rules may still be referring to the RQT even if the link went
  * down. Because there is no magic RQN for dropping packets, we create
  * a dummy RQ, also called drop RQ, whose sole purpose is to drop all
  * received packets. When the link goes down, this RQN is filled into
  * all entries of the main RQT, so that the real RQs which are about to
  * be destroyed can be released and the TLS RX rules can be sustained.
  */
 static void
 mlx5e_open_drop_rq_comp(struct mlx5_core_cq *mcq __unused, struct mlx5_eqe *eqe __unused)
 {
 	/* intentionally empty; used as completion handler of the drop RQ's CQ */
 }
 
 /*
  * Create the drop RQ: a completion queue with log_cq_size zero and a
  * no-op completion handler, a linked-list work queue with log_wq_sz
  * zero, and an RQ object which is moved from RST to RDY.  No mbuf
  * array or DMA tag is set up for it.
  *
  * Returns zero on success.  On failure the steps already done are
  * undone in reverse order and the error is returned.
  */
 static int
 mlx5e_open_drop_rq(struct mlx5e_priv *priv,
     struct mlx5e_rq *drop_rq)
 {
 	struct mlx5e_cq_param param_cq = {};
 	struct mlx5e_rq_param param_rq = {};
 	void *rqc_wq = MLX5_ADDR_OF(rqc, param_rq.rqc, wq);
 	int err;
 
 	/* set channel pointer */
 	drop_rq->channel = priv->channel;
 
 	/* set basic CQ parameters needed */
 	MLX5_SET(cqc, param_cq.cqc, log_cq_size, 0);
 	MLX5_SET(cqc, param_cq.cqc, uar_page, priv->mdev->priv.uar->index);
 
 	/* open receive completion queue */
 	err = mlx5e_open_cq(priv, &param_cq, &drop_rq->cq,
 	    &mlx5e_open_drop_rq_comp, 0);
 	if (err)
 		goto err_done;
 
 	/* set basic WQ parameters needed */
 	MLX5_SET(wq, rqc_wq, wq_type, MLX5_WQ_TYPE_LINKED_LIST);
 	MLX5_SET(wq, rqc_wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN);
 	/* stride covers the RX WQE header plus one data segment */
 	MLX5_SET(wq, rqc_wq, log_wq_stride, ilog2(sizeof(struct mlx5e_rx_wqe) + sizeof(struct mlx5_wqe_data_seg)));
 	MLX5_SET(wq, rqc_wq, log_wq_sz, 0);
 	MLX5_SET(wq, rqc_wq, pd, priv->pdn);
 
 	param_rq.wq.linear = 1;
 
 	err = mlx5_wq_ll_create(priv->mdev, &param_rq.wq, rqc_wq, &drop_rq->wq,
 	    &drop_rq->wq_ctrl);
 	if (err)
 		goto err_close_cq;
 
 	/* set CQN in RQ parameters */
 	MLX5_SET(rqc, param_rq.rqc, cqn, drop_rq->cq.mcq.cqn);
 
 	err = mlx5e_enable_rq(drop_rq, &param_rq);
 	if (err)
 		goto err_wq_destroy;
 
 	err = mlx5e_modify_rq(drop_rq, MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY);
 	if (err)
 		goto err_disable_rq;
 
 	/* err is zero at this point */
 	return (err);
 
 err_disable_rq:
 	mlx5e_disable_rq(drop_rq);
 err_wq_destroy:
 	mlx5_wq_destroy(&drop_rq->wq_ctrl);
 err_close_cq:
 	mlx5e_close_cq(&drop_rq->cq);
 err_done:
 	return (err);
 }
 
 /*
  * Tear down the drop RQ: request the RDY to ERR transition (result
  * not checked), destroy the RQ object, the work queue and finally the
  * completion queue, i.e. the reverse of mlx5e_open_drop_rq().
  */
 static void
 mlx5e_close_drop_rq(struct mlx5e_rq *drop_rq)
 {
 	mlx5e_modify_rq(drop_rq, MLX5_RQC_STATE_RDY, MLX5_RQC_STATE_ERR);
 	mlx5e_disable_rq(drop_rq);
 	mlx5_wq_destroy(&drop_rq->wq_ctrl);
 	mlx5e_close_cq(&drop_rq->cq);
 }
 
 /*
  * Free the per-WQE bookkeeping array of a send queue.  For every
  * slot, a still attached mbuf is unloaded and freed, a still held
  * send tag reference is released, and the DMA map is destroyed.
  */
 void
 mlx5e_free_sq_db(struct mlx5e_sq *sq)
 {
 	int nent = mlx5_wq_cyc_get_size(&sq->wq);
 	int slot = 0;
 
 	while (slot != nent) {
 		if (sq->mbuf[slot].mbuf != NULL) {
 			bus_dmamap_unload(sq->dma_tag, sq->mbuf[slot].dma_map);
 			m_freem(sq->mbuf[slot].mbuf);
 		}
 		if (sq->mbuf[slot].mst != NULL) {
 			m_snd_tag_rele(sq->mbuf[slot].mst);
 			sq->mbuf[slot].mst = NULL;
 		}
 		bus_dmamap_destroy(sq->dma_tag, sq->mbuf[slot].dma_map);
 		slot++;
 	}
 	free(sq->mbuf, M_MLX5EN);
 }
 
 /*
  * Allocate the zeroed per-WQE bookkeeping array of a send queue and
  * create one DMA map per slot.
  *
  * Returns zero on success.  If a map cannot be created, the maps
  * created so far and the array are released and the negated result of
  * bus_dmamap_create() is returned.
  */
 int
 mlx5e_alloc_sq_db(struct mlx5e_sq *sq)
 {
 	int nent = mlx5_wq_cyc_get_size(&sq->wq);
 	int rc;
 	int slot;
 
 	/* M_WAITOK allocation, hence no NULL check */
 	sq->mbuf = malloc_domainset(nent * sizeof(sq->mbuf[0]), M_MLX5EN,
 	    mlx5_dev_domainset(sq->priv->mdev), M_WAITOK | M_ZERO);
 
 	/* Create DMA descriptor MAPs */
 	for (slot = 0; slot != nent; slot++) {
 		rc = -bus_dmamap_create(sq->dma_tag, 0, &sq->mbuf[slot].dma_map);
 		if (rc == 0)
 			continue;
 
 		/* unwind the maps of all earlier slots */
 		while (slot-- > 0)
 			bus_dmamap_destroy(sq->dma_tag, sq->mbuf[slot].dma_map);
 		free(sq->mbuf, M_MLX5EN);
 		return (rc);
 	}
 	return (0);
 }
 
 /*
  * String table for the per-SQ statistics, produced by expanding
  * MLX5E_SQ_STATS() with MLX5E_STATS_DESC.  It is passed to
  * mlx5e_create_stats() together with MLX5E_SQ_STATS_NUM.
  */
 static const char *mlx5e_sq_stats_desc[] = {
 	MLX5E_SQ_STATS(MLX5E_STATS_DESC)
 };
 
 /*
  * Refresh the inline settings of a send queue from the current
  * parameters: the maximum inline size, the minimum inline mode and
  * the set of header insertion capabilities.
  */
 void
 mlx5e_update_sq_inline(struct mlx5e_sq *sq)
 {
 	sq->max_inline = sq->priv->params.tx_max_inline;
 	sq->min_inline_mode = sq->priv->params.tx_min_inline_mode;
 
 	/*
 	 * No insertion capabilities when the trust state is PCP and an
 	 * inline mode other than NONE is required.
 	 */
 	if (sq->priv->params_ethtool.trust_state == MLX5_QPTS_TRUST_PCP &&
 	    sq->min_inline_mode != MLX5_INLINE_MODE_NONE) {
 		sq->min_insert_caps = 0;
 		return;
 	}
 
 	/*
 	 * Trust state is DSCP, or inline mode is NONE which indicates
 	 * CX-5 or newer hardware.  VLAN insertion additionally depends
 	 * on the wqe_vlan_insert capability.
 	 */
 	if (MLX5_CAP_ETH(sq->priv->mdev, wqe_vlan_insert))
 		sq->min_insert_caps = MLX5E_INSERT_VLAN | MLX5E_INSERT_NON_VLAN;
 	else
 		sq->min_insert_caps = MLX5E_INSERT_NON_VLAN;
 }
 
 static void
 mlx5e_refresh_sq_inline_sub(struct mlx5e_priv *priv, struct mlx5e_channel *c)
 {
 	int i;
 
 	for (i = 0; i != priv->num_tc; i++) {
 		mtx_lock(&c->sq[i].lock);
 		mlx5e_update_sq_inline(&c->sq[i]);
 		mtx_unlock(&c->sq[i].lock);
 	}
 }
 
 /*
  * Refresh the inline settings of the send queues of all channels.
  * Nothing is done unless the MLX5E_STATE_OPENED bit is set.
  */
 void
 mlx5e_refresh_sq_inline(struct mlx5e_priv *priv)
 {
 	int ch;
 
 	/* only touch the channels while they are open */
 	if (test_bit(MLX5E_STATE_OPENED, &priv->state) != 0) {
 		for (ch = 0; ch < priv->params.num_channels; ch++)
 			mlx5e_refresh_sq_inline_sub(priv, &priv->channel[ch]);
 	}
 }
 
 /*
  * Allocate the software resources of a send queue: the busdma tag,
  * the cyclic work queue, the per-WQE bookkeeping array, the inline
  * settings and the "txstat<ix>tc<tc>" sysctl statistics.
  *
  * Returns zero on success.  On failure everything allocated so far is
  * released and a non-zero error is returned; the result of
  * bus_dma_tag_create() is negated before being returned.
  */
 static int
 mlx5e_create_sq(struct mlx5e_channel *c,
     int tc,
     struct mlx5e_sq_param *param,
     struct mlx5e_sq *sq)
 {
 	struct mlx5e_priv *priv = c->priv;
 	struct mlx5_core_dev *mdev = priv->mdev;
 	char buffer[16];
 	void *sqc = param->sqc;
 	void *sqc_wq = MLX5_ADDR_OF(sqc, sqc, wq);
 	int err;
 
 	/* Create DMA descriptor TAG */
 	if ((err = -bus_dma_tag_create(
 	    bus_get_dma_tag(mdev->pdev->dev.bsddev),
 	    1,				/* any alignment */
 	    0,				/* no boundary */
 	    BUS_SPACE_MAXADDR,		/* lowaddr */
 	    BUS_SPACE_MAXADDR,		/* highaddr */
 	    NULL, NULL,			/* filter, filterarg */
 	    MLX5E_MAX_TX_PAYLOAD_SIZE,	/* maxsize */
 	    MLX5E_MAX_TX_MBUF_FRAGS,	/* nsegments */
 	    MLX5E_MAX_TX_MBUF_SIZE,	/* maxsegsize */
 	    0,				/* flags */
 	    NULL, NULL,			/* lockfunc, lockfuncarg */
 	    &sq->dma_tag)))
 		goto done;
 
 	/* sq->priv must be set before mlx5e_alloc_sq_db() is called below */
 	sq->mkey_be = cpu_to_be32(priv->mr.key);
 	sq->ifp = priv->ifp;
 	sq->priv = priv;
 	sq->tc = tc;
 
 	err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq,
 	    &sq->wq_ctrl);
 	if (err)
 		goto err_free_dma_tag;
 
 	/* point the doorbell at the send doorbell record */
 	sq->wq.db = &sq->wq.db[MLX5_SND_DBR];
 
 	err = mlx5e_alloc_sq_db(sq);
 	if (err)
 		goto err_sq_wq_destroy;
 
 	mlx5e_update_sq_inline(sq);
 
 	/* register the per-SQ statistics sysctls */
 	snprintf(buffer, sizeof(buffer), "txstat%dtc%d", c->ix, tc);
 	mlx5e_create_stats(&sq->stats.ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet),
 	    buffer, mlx5e_sq_stats_desc, MLX5E_SQ_STATS_NUM,
 	    sq->stats.arg);
 
 	return (0);
 
 	/* error unwinding, in reverse order of allocation */
 err_sq_wq_destroy:
 	mlx5_wq_destroy(&sq->wq_ctrl);
 
 err_free_dma_tag:
 	bus_dma_tag_destroy(sq->dma_tag);
 done:
 	return (err);
 }
 
 /*
  * Release the software resources of a send queue: the statistics
  * sysctls, the per-WQE bookkeeping array, the work queue and the DMA
  * tag.
  */
 static void
 mlx5e_destroy_sq(struct mlx5e_sq *sq)
 {
 	/* destroy all sysctl nodes */
 	sysctl_ctx_free(&sq->stats.ctx);
 
 	mlx5e_free_sq_db(sq);
 	mlx5_wq_destroy(&sq->wq_ctrl);
 	bus_dma_tag_destroy(sq->dma_tag);
 }
 
 int
 mlx5e_enable_sq(struct mlx5e_sq *sq, struct mlx5e_sq_param *param,
     const struct mlx5_sq_bfreg *bfreg, int tis_num)
 {
 	void *in;
 	void *sqc;
 	void *wq;
 	int inlen;
 	int err;
 	u8 ts_format;
 
 	inlen = MLX5_ST_SZ_BYTES(create_sq_in) +
 	    sizeof(u64) * sq->wq_ctrl.buf.npages;
 	in = mlx5_vzalloc(inlen);
 	if (in == NULL)
 		return (-ENOMEM);
 
 	sq->uar_map = bfreg->map;
 
 	ts_format = mlx5_get_sq_default_ts(sq->priv->mdev);
 	sqc = MLX5_ADDR_OF(create_sq_in, in, ctx);
 	wq = MLX5_ADDR_OF(sqc, sqc, wq);
 
 	memcpy(sqc, param->sqc, sizeof(param->sqc));
 
 	MLX5_SET(sqc, sqc, tis_num_0, tis_num);
 	MLX5_SET(sqc, sqc, cqn, sq->cq.mcq.cqn);
 	MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RST);
 	MLX5_SET(sqc, sqc, ts_format, ts_format);
 	MLX5_SET(sqc, sqc, tis_lst_sz, 1);
 	MLX5_SET(sqc, sqc, flush_in_error_en, 1);
 	MLX5_SET(sqc, sqc, allow_swp, 1);
 
 	MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC);
 	MLX5_SET(wq, wq, uar_page, bfreg->index);
 	MLX5_SET(wq, wq, log_wq_pg_sz, sq->wq_ctrl.buf.page_shift -
 	    MLX5_ADAPTER_PAGE_SHIFT);
 	MLX5_SET64(wq, wq, dbr_addr, sq->wq_ctrl.db.dma);
 
 	mlx5_fill_page_array(&sq->wq_ctrl.buf,
 	    (__be64 *) MLX5_ADDR_OF(wq, wq, pas));
 
 	err = mlx5_core_create_sq(sq->priv->mdev, in, inlen, &sq->sqn);
 
 	kvfree(in);
 
 	return (err);
 }
 
 /*
  * Issue a modify_sq command moving the SQ from "curr_state" to
  * "next_state".
  *
  * Returns -ENOMEM if the command buffer cannot be allocated, else the
  * result of mlx5_core_modify_sq().
  */
 int
 mlx5e_modify_sq(struct mlx5e_sq *sq, int curr_state, int next_state)
 {
 	const int cmd_sz = MLX5_ST_SZ_BYTES(modify_sq_in);
 	void *cmd;
 	void *sqc;
 	int rc;
 
 	cmd = mlx5_vzalloc(cmd_sz);
 	if (cmd == NULL)
 		return (-ENOMEM);
 
 	sqc = MLX5_ADDR_OF(modify_sq_in, cmd, ctx);
 
 	/* identify the SQ and its current state, then set the new state */
 	MLX5_SET(modify_sq_in, cmd, sqn, sq->sqn);
 	MLX5_SET(modify_sq_in, cmd, sq_state, curr_state);
 	MLX5_SET(sqc, sqc, state, next_state);
 
 	rc = mlx5_core_modify_sq(sq->priv->mdev, cmd, cmd_sz);
 
 	kvfree(cmd);
 
 	return (rc);
 }
 
 /*
  * Destroy the SQ object identified by sq->sqn on the core device the
  * SQ belongs to.
  */
 void
 mlx5e_disable_sq(struct mlx5e_sq *sq)
 {
 
 	mlx5_core_destroy_sq(sq->priv->mdev, sq->sqn);
 }
 
 /*
  * Bring up a send queue: set its completion event factor, allocate
  * the software state, create the SQ object and move it from RST to
  * RDY.  On success the queue is marked running.  On failure the SQ
  * object and software state created so far are released and the error
  * is returned.
  */
 static int
 mlx5e_open_sq(struct mlx5e_channel *c,
     int tc,
     struct mlx5e_sq_param *param,
     struct mlx5e_sq *sq)
 {
 	int rc;
 
 	/* take over the TX completion event factor, which must not be zero */
 	sq->cev_factor = c->priv->params_ethtool.tx_completion_fact;
 	if (sq->cev_factor == 0)
 		sq->cev_factor = 1;
 
 	rc = mlx5e_create_sq(c, tc, param, sq);
 	if (rc)
 		return (rc);
 
 	rc = mlx5e_enable_sq(sq, param, &c->bfreg, c->priv->tisn[tc]);
 	if (rc) {
 		mlx5e_destroy_sq(sq);
 		return (rc);
 	}
 
 	rc = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RST, MLX5_SQC_STATE_RDY);
 	if (rc) {
 		mlx5e_disable_sq(sq);
 		mlx5e_destroy_sq(sq);
 		return (rc);
 	}
 
 	WRITE_ONCE(sq->running, 1);
 
 	return (0);
 }
 
 /*
  * Post single NOPs until sq->cev_counter reaches zero, then notify
  * the hardware.  sq->lock is expected to be held by the caller; it is
  * dropped and re-taken around msleep() when "can_sleep" is non-zero
  * and the ring has no room.  When "can_sleep" is zero and the ring is
  * full, the loop is abandoned and the hardware is notified right
  * away.
  *
  * NOTE(review): presumably mlx5e_send_nop() is what advances
  * cev_counter towards zero - confirm against its definition.
  */
 static void
 mlx5e_sq_send_nops_locked(struct mlx5e_sq *sq, int can_sleep)
 {
 	/* fill up remainder with NOPs */
 	while (sq->cev_counter != 0) {
 		while (!mlx5e_sq_has_room_for(sq, 1)) {
 			if (can_sleep != 0) {
 				/* wait for room without holding the lock */
 				mtx_unlock(&sq->lock);
 				msleep(4);
 				mtx_lock(&sq->lock);
 			} else {
 				goto done;
 			}
 		}
 		/* send a single NOP */
 		mlx5e_send_nop(sq, 1);
 		atomic_thread_fence_rel();
 	}
 done:
 	mlx5e_tx_notify_hw(sq, false);
 }
 
 /*
  * Completion event factor timeout, called with sq->lock held.
  *
  * In the SEND_NOPS state the TX ring is topped up with NOPs; if that
  * brought cev_counter to zero, the state goes back to INITIAL and the
  * timer is not restarted.  In any other state the next timeout is set
  * up to send NOPs.  Unless finished, the callout is re-armed to fire
  * after hz ticks.
  */
 void
 mlx5e_sq_cev_timeout(void *arg)
 {
 	struct mlx5e_sq *sq = arg;
 
 	mtx_assert(&sq->lock, MA_OWNED);
 
 	if (sq->cev_next_state == MLX5E_CEV_STATE_SEND_NOPS) {
 		/* fill TX ring with NOPs, if any */
 		mlx5e_sq_send_nops_locked(sq, 0);
 
 		/* done once all NOPs went out */
 		if (sq->cev_counter == 0) {
 			sq->cev_next_state = MLX5E_CEV_STATE_INITIAL;
 			return;
 		}
 	} else {
 		/* send NOPs on next timeout */
 		sq->cev_next_state = MLX5E_CEV_STATE_SEND_NOPS;
 	}
 
 	/* restart timer */
 	callout_reset_curcpu(&sq->cev_callout, hz, mlx5e_sq_cev_timeout, sq);
 }
 
 /*
  * Stop a send queue and wait for it to drain.
  *
  * The queue is marked as not running, the completion event timer is
  * stopped and the ring is flushed with NOPs.  The function then polls
  * the completion handler until the consumer counter caught up with
  * the producer counter, giving up early when the link is not active,
  * the device is in internal error state or the PCI channel is
  * offline.  Afterwards the SQ is moved from RDY to ERR and the polling
  * is repeated, this time regardless of the link state.
  */
 void
 mlx5e_drain_sq(struct mlx5e_sq *sq)
 {
 	int error;
 	struct mlx5_core_dev *mdev= sq->priv->mdev;
 
 	/*
 	 * Check if already stopped.
 	 *
 	 * NOTE: Serialization of this function is managed by the
 	 * caller ensuring the priv's state lock is locked or in case
 	 * of rate limit support, a single thread manages drain and
 	 * resume of SQs. The "running" variable can therefore safely
 	 * be read without any locks.
 	 */
 	if (READ_ONCE(sq->running) == 0)
 		return;
 
 	/* don't put more packets into the SQ */
 	WRITE_ONCE(sq->running, 0);
 
 	/* serialize access to DMA rings */
 	mtx_lock(&sq->lock);
 
 	/* teardown event factor timer, if any */
 	sq->cev_next_state = MLX5E_CEV_STATE_HOLD_NOPS;
 	callout_stop(&sq->cev_callout);
 
 	/* send dummy NOPs in order to flush the transmit ring */
 	mlx5e_sq_send_nops_locked(sq, 1);
 	mtx_unlock(&sq->lock);
 
 	/* wait till SQ is empty or link is down */
 	mtx_lock(&sq->lock);
 	while (sq->cc != sq->pc &&
 	    (sq->priv->media_status_last & IFM_ACTIVE) != 0 &&
 	    mdev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR &&
 	    pci_channel_offline(mdev->pdev) == 0) {
 		/* sleep and poll for completions without holding the lock */
 		mtx_unlock(&sq->lock);
 		msleep(1);
 		sq->cq.mcq.comp(&sq->cq.mcq, NULL);
 		mtx_lock(&sq->lock);
 	}
 	mtx_unlock(&sq->lock);
 
 	/* error out remaining requests */
 	error = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RDY, MLX5_SQC_STATE_ERR);
 	if (error != 0) {
 		mlx5_en_err(sq->ifp,
 		    "mlx5e_modify_sq() from RDY to ERR failed: %d\n", error);
 	}
 
 	/* wait till SQ is empty */
 	mtx_lock(&sq->lock);
 	while (sq->cc != sq->pc &&
 	       mdev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR &&
 	       pci_channel_offline(mdev->pdev) == 0) {
 		mtx_unlock(&sq->lock);
 		msleep(1);
 		sq->cq.mcq.comp(&sq->cq.mcq, NULL);
 		mtx_lock(&sq->lock);
 	}
 	mtx_unlock(&sq->lock);
 }
 
 /*
  * Close a send queue: drain it, destroy the SQ object and free the
  * software resources.
  */
 static void
 mlx5e_close_sq_wait(struct mlx5e_sq *sq)
 {
 
 	mlx5e_drain_sq(sq);
 	mlx5e_disable_sq(sq);
 	mlx5e_destroy_sq(sq);
 }
 
 /*
  * Allocate the software resources of a completion queue: the CQ work
  * queue, the doorbell pointers and the handler setup.  Every CQE has
  * its op_own byte preset to 0xf1.
  *
  * Returns zero on success, else the error of mlx5_vector2eqn() or
  * mlx5_cqwq_create().
  */
 static int
 mlx5e_create_cq(struct mlx5e_priv *priv,
     struct mlx5e_cq_param *param,
     struct mlx5e_cq *cq,
     mlx5e_cq_comp_t *comp,
     int eq_ix)
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
 	struct mlx5_core_cq *mcq = &cq->mcq;
 	int eqn_unused;
 	int irq;
 	int rc;
 	u32 n;
 
 	/* only the IRQ number is needed from the lookup */
 	rc = mlx5_vector2eqn(mdev, eq_ix, &eqn_unused, &irq);
 	if (rc != 0)
 		return (rc);
 
 	rc = mlx5_cqwq_create(mdev, &param->wq, param->cqc, &cq->wq,
 	    &cq->wq_ctrl);
 	if (rc != 0)
 		return (rc);
 
 	mcq->cqe_sz = 64;
 	/* the doorbell record holds the consumer index, then the arm word */
 	mcq->set_ci_db = cq->wq_ctrl.db.db;
 	mcq->arm_db = cq->wq_ctrl.db.db + 1;
 	*mcq->set_ci_db = 0;
 	*mcq->arm_db = 0;
 	mcq->vector = eq_ix;
 	mcq->comp = comp;
 	mcq->event = mlx5e_cq_error_event;
 	mcq->irqn = irq;
 
 	n = 0;
 	while (n < mlx5_cqwq_get_size(&cq->wq)) {
 		struct mlx5_cqe64 *cqe = mlx5_cqwq_get_wqe(&cq->wq, n);
 
 		cqe->op_own = 0xf1;
 		n++;
 	}
 
 	cq->priv = priv;
 
 	return (0);
 }
 
 /* Release the work queue backing the completion queue. */
 static void
 mlx5e_destroy_cq(struct mlx5e_cq *cq)
 {
 	mlx5_wq_destroy(&cq->wq_ctrl);
 }
 
 /*
  * Build the create_cq command from the prepared CQ parameters and the
  * work queue buffer of "cq", issue it, and arm the CQ on success.
  *
  * Returns -ENOMEM if the command buffer cannot be allocated, the
  * error of mlx5_vector2eqn() if "eq_ix" cannot be resolved to an EQ
  * number, else the result of mlx5_core_create_cq().
  */
 static int
 mlx5e_enable_cq(struct mlx5e_cq *cq, struct mlx5e_cq_param *param, int eq_ix)
 {
 	struct mlx5_core_cq *mcq = &cq->mcq;
 	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
 	void *in;
 	void *cqc;
 	int inlen;
 	int irqn_not_used;
 	int eqn;
 	int err;
 
 	/* the command is followed by one 64-bit address per buffer page */
 	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
 	    sizeof(u64) * cq->wq_ctrl.buf.npages;
 	in = mlx5_vzalloc(inlen);
 	if (in == NULL)
 		return (-ENOMEM);
 
 	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
 
 	/* start from the prepared context, then override selected fields */
 	memcpy(cqc, param->cqc, sizeof(param->cqc));
 
 	mlx5_fill_page_array(&cq->wq_ctrl.buf,
 	    (__be64 *) MLX5_ADDR_OF(create_cq_in, in, pas));
 
 	/*
 	 * Do not continue with an undefined EQ number in case the
 	 * lookup fails.
 	 */
 	err = mlx5_vector2eqn(cq->priv->mdev, eq_ix, &eqn, &irqn_not_used);
 	if (err) {
 		kvfree(in);
 		return (err);
 	}
 
 	MLX5_SET(cqc, cqc, c_eqn, eqn);
 	MLX5_SET(cqc, cqc, log_page_size, cq->wq_ctrl.buf.page_shift -
 	    MLX5_ADAPTER_PAGE_SHIFT);
 	MLX5_SET64(cqc, cqc, dbr_addr, cq->wq_ctrl.db.dma);
 
 	err = mlx5_core_create_cq(cq->priv->mdev, mcq, in, inlen, out, sizeof(out));
 
 	kvfree(in);
 
 	if (err)
 		return (err);
 
 	mlx5e_cq_arm(cq, MLX5_GET_DOORBELL_LOCK(&cq->priv->doorbell_lock));
 
 	return (0);
 }
 
 /* Destroy the CQ object on the core device the CQ belongs to. */
 static void
 mlx5e_disable_cq(struct mlx5e_cq *cq)
 {
 
 	mlx5_core_destroy_cq(cq->priv->mdev, &cq->mcq);
 }
 
 /*
  * Open a completion queue: allocate the software state and create
  * the CQ object.  If the latter fails, the software state is released
  * again.  Returns zero on success, else the error of the failing
  * step.
  */
 int
 mlx5e_open_cq(struct mlx5e_priv *priv,
     struct mlx5e_cq_param *param,
     struct mlx5e_cq *cq,
     mlx5e_cq_comp_t *comp,
     int eq_ix)
 {
 	int rc;
 
 	rc = mlx5e_create_cq(priv, param, cq, comp, eq_ix);
 	if (rc)
 		return (rc);
 
 	rc = mlx5e_enable_cq(cq, param, eq_ix);
 	if (rc) {
 		mlx5e_destroy_cq(cq);
 		return (rc);
 	}
 
 	return (0);
 }
 
 /*
  * Close a completion queue: destroy the CQ object, then release its
  * software state.
  */
 void
 mlx5e_close_cq(struct mlx5e_cq *cq)
 {
 	mlx5e_disable_cq(cq);
 	mlx5e_destroy_cq(cq);
 }
 
 /*
  * Open one transmit completion queue per traffic class of the
  * channel.  If one of them fails to open, the ones opened before it
  * are closed again and the error is returned.
  */
 static int
 mlx5e_open_tx_cqs(struct mlx5e_channel *c,
     struct mlx5e_channel_param *cparam)
 {
 	int rc;
 	int tc;
 
 	for (tc = 0; tc < c->priv->num_tc; tc++) {
 		/* open completion queue */
 		rc = mlx5e_open_cq(c->priv, &cparam->tx_cq, &c->sq[tc].cq,
 		    &mlx5e_tx_cq_comp, c->ix);
 		if (rc == 0)
 			continue;
 
 		/* close the queues of all lower traffic classes */
 		while (--tc >= 0)
 			mlx5e_close_cq(&c->sq[tc].cq);
 
 		return (rc);
 	}
 	return (0);
 }
 
 /* Close the transmit completion queue of every traffic class. */
 static void
 mlx5e_close_tx_cqs(struct mlx5e_channel *c)
 {
 	int tc = 0;
 
 	while (tc < c->priv->num_tc) {
 		mlx5e_close_cq(&c->sq[tc].cq);
 		tc++;
 	}
 }
 
 /*
  * Open one send queue per traffic class of the channel.  If one of
  * them fails to open, the ones opened before it are closed again and
  * the error is returned.
  */
 static int
 mlx5e_open_sqs(struct mlx5e_channel *c,
     struct mlx5e_channel_param *cparam)
 {
 	int rc;
 	int tc;
 
 	for (tc = 0; tc < c->priv->num_tc; tc++) {
 		rc = mlx5e_open_sq(c, tc, &cparam->sq, &c->sq[tc]);
 		if (rc == 0)
 			continue;
 
 		/* close the queues of all lower traffic classes */
 		while (--tc >= 0)
 			mlx5e_close_sq_wait(&c->sq[tc]);
 
 		return (rc);
 	}
 
 	return (0);
 }
 
 /* Drain and close the send queue of every traffic class. */
 static void
 mlx5e_close_sqs_wait(struct mlx5e_channel *c)
 {
 	int tc = 0;
 
 	while (tc < c->priv->num_tc) {
 		mlx5e_close_sq_wait(&c->sq[tc]);
 		tc++;
 	}
 }
 
 static void
 mlx5e_chan_static_init(struct mlx5e_priv *priv, struct mlx5e_channel *c, int ix)
 {
 	int tc;
 
 	/* setup priv and channel number */
 	c->priv = priv;
 	c->ix = ix;
 
 	/* setup send tag */
 	m_snd_tag_init(&c->tag, c->priv->ifp, &mlx5e_ul_snd_tag_sw);
 
 	init_completion(&c->completion);
 
 	mtx_init(&c->rq.mtx, "mlx5rx", MTX_NETWORK_LOCK, MTX_DEF);
 
 	callout_init_mtx(&c->rq.watchdog, &c->rq.mtx, 0);
 
 	for (tc = 0; tc != MLX5E_MAX_TX_NUM_TC; tc++) {
 		struct mlx5e_sq *sq = c->sq + tc;
 
 		mtx_init(&sq->lock, "mlx5tx",
 		    MTX_NETWORK_LOCK " TX", MTX_DEF);
 		mtx_init(&sq->comp_lock, "mlx5comp",
 		    MTX_NETWORK_LOCK " TX", MTX_DEF);
 
 		callout_init_mtx(&sq->cev_callout, &sq->lock, 0);
 	}
 
 	mlx5e_iq_static_init(&c->iq);
 }
 
 /*
  * Drop a reference on the channel's send tag, then block until the
  * channel's completion is signalled.
  *
  * NOTE(review): presumably the completion is signalled once the last
  * send tag reference is gone - confirm against the send tag free
  * routine.
  */
 static void
 mlx5e_chan_wait_for_completion(struct mlx5e_channel *c)
 {
 
 	m_snd_tag_rele(&c->tag);
 	wait_for_completion(&c->completion);
 }
 
 /*
  * Wait for the completion of the first "channels" channels of the
  * given priv, one after the other.
  */
 static void
 mlx5e_priv_wait_for_completion(struct mlx5e_priv *priv, const uint32_t channels)
 {
 	uint32_t ch = 0;
 
 	while (ch != channels) {
 		mlx5e_chan_wait_for_completion(&priv->channel[ch]);
 		ch++;
 	}
 }
 
 static void
 mlx5e_chan_static_destroy(struct mlx5e_channel *c)
 {
 	int tc;
 
 	callout_drain(&c->rq.watchdog);
 
 	mtx_destroy(&c->rq.mtx);
 
 	for (tc = 0; tc != MLX5E_MAX_TX_NUM_TC; tc++) {
 		callout_drain(&c->sq[tc].cev_callout);
 		mtx_destroy(&c->sq[tc].lock);
 		mtx_destroy(&c->sq[tc].comp_lock);
 	}
 
 	mlx5e_iq_static_destroy(&c->iq);
 }
 
 /*
  * Open all queues of one channel.  The non-persistent parts of the
  * RQ, of the SQs and of the IQ are zeroed first; then the TX CQs, the
  * RX CQ, the SQs, the IQ and the RQ are opened in that order.
  * Finally the RX completion handler is invoked once inside a network
  * epoch section.
  *
  * Returns zero on success.  On failure the queues opened so far are
  * closed in reverse order and the error is returned.
  */
 static int
 mlx5e_open_channel(struct mlx5e_priv *priv,
     struct mlx5e_channel_param *cparam,
     struct mlx5e_channel *c)
 {
 	struct epoch_tracker et;
 	int i, err;
 
 	/* zero non-persistent data */
 	MLX5E_ZERO(&c->rq, mlx5e_rq_zero_start);
 	for (i = 0; i != priv->num_tc; i++)
 		MLX5E_ZERO(&c->sq[i], mlx5e_sq_zero_start);
 	MLX5E_ZERO(&c->iq, mlx5e_iq_zero_start);
 
 	/* open transmit completion queue */
 	err = mlx5e_open_tx_cqs(c, cparam);
 	if (err)
 		goto err_free;
 
 	/* open receive completion queue */
 	err = mlx5e_open_cq(c->priv, &cparam->rx_cq, &c->rq.cq,
 	    &mlx5e_rx_cq_comp, c->ix);
 	if (err)
 		goto err_close_tx_cqs;
 
 	err = mlx5e_open_sqs(c, cparam);
 	if (err)
 		goto err_close_rx_cq;
 
 	/* the IQ is opened with the SQ and TX CQ parameters */
 	err = mlx5e_iq_open(c, &cparam->sq, &cparam->tx_cq, &c->iq);
 	if (err)
 		goto err_close_sqs;
 
 	err = mlx5e_open_rq(c, &cparam->rq, &c->rq);
 	if (err)
 		goto err_close_iq;
 
 	/* poll receive queue initially */
 	NET_EPOCH_ENTER(et);
 	c->rq.cq.mcq.comp(&c->rq.cq.mcq, NULL);
 	NET_EPOCH_EXIT(et);
 
 	return (0);
 
 	/* error unwinding, in reverse order of opening */
 err_close_iq:
 	mlx5e_iq_close(&c->iq);
 
 err_close_sqs:
 	mlx5e_close_sqs_wait(c);
 
 err_close_rx_cq:
 	mlx5e_close_cq(&c->rq.cq);
 
 err_close_tx_cqs:
 	mlx5e_close_tx_cqs(c);
 
 err_free:
 	return (err);
 }
 
 /*
  * First stage of closing a channel: only the receive queue is stopped
  * here, see mlx5e_close_rq().
  */
 static void
 mlx5e_close_channel(struct mlx5e_channel *c)
 {
 	mlx5e_close_rq(&c->rq);
 }
 
 /*
  * Second stage of closing a channel: the RQ (including its CQ), the
  * IQ, the SQs and the TX CQs are closed, in that order.
  */
 static void
 mlx5e_close_channel_wait(struct mlx5e_channel *c)
 {
 	mlx5e_close_rq_wait(&c->rq);
 	mlx5e_iq_close(&c->iq);
 	mlx5e_close_sqs_wait(c);
 	mlx5e_close_tx_cqs(c);
 }
 
 /*
  * Compute the receive WQE buffer size and the number of data segments
  * per WQE.
  *
  * The requested size is the LRO WQE size when hardware LRO is
  * enabled, else it is derived from the interface MTU.  It is rounded
  * up to one of MCLBYTES, MJUMPAGESIZE, MJUM9BYTES or MJUM16BYTES.
  *
  * Returns zero and stores the results through "wqe_sz" and "nsegs" on
  * success.  Returns -ENOMEM, leaving both untouched, when the
  * requested size exceeds MJUM16BYTES or more than
  * MLX5E_MAX_BUSDMA_RX_SEGS segments would be needed.
  */
 static int
 mlx5e_get_wqe_sz(struct mlx5e_priv *priv, u32 *wqe_sz, u32 *nsegs)
 {
 	u32 bytes, segs;
 
 	if (priv->params.hw_lro_en)
 		bytes = priv->params.lro_wqe_sz;
 	else
 		bytes = MLX5E_SW2MB_MTU(if_getmtu(priv->ifp));
 
 	if (bytes > MJUM16BYTES)
 		return (-ENOMEM);
 
 	/* round up to the next supported mbuf cluster size */
 	if (bytes <= MCLBYTES)
 		bytes = MCLBYTES;
 	else if (bytes <= MJUMPAGESIZE)
 		bytes = MJUMPAGESIZE;
 	else if (bytes <= MJUM9BYTES)
 		bytes = MJUM9BYTES;
 	else
 		bytes = MJUM16BYTES;
 
 	/*
 	 * segs + 1 must be a power of two, because stride size must be.
 	 * Stride size is 16 * (segs + 1), as the first segment is
 	 * control.
 	 */
 	segs = howmany(bytes, MLX5E_MAX_RX_BYTES);
 	while (!powerof2(segs + 1))
 		segs++;
 
 	if (segs > MLX5E_MAX_BUSDMA_RX_SEGS)
 		return (-ENOMEM);
 
 	*wqe_sz = bytes;
 	*nsegs = segs;
 	return (0);
 }
 
 /*
  * Fill in the work queue part of the RQ parameters: linked-list type,
  * end padding mode, stride (derived from the number of data segments
  * per WQE), queue size and protection domain.
  *
  * NOTE(review): the return value of mlx5e_get_wqe_sz() is not
  * checked; if it fails, "nsegs" is used uninitialized in the stride
  * computation.  Confirm that callers rule this case out.
  */
 static void
 mlx5e_build_rq_param(struct mlx5e_priv *priv,
     struct mlx5e_rq_param *param)
 {
 	void *rqc = param->rqc;
 	void *wq = MLX5_ADDR_OF(rqc, rqc, wq);
 	u32 wqe_sz, nsegs;
 
 	mlx5e_get_wqe_sz(priv, &wqe_sz, &nsegs);
 	MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_LINKED_LIST);
 	MLX5_SET(wq, wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN);
 	/* stride covers the RX WQE header plus all data segments */
 	MLX5_SET(wq, wq, log_wq_stride, ilog2(sizeof(struct mlx5e_rx_wqe) +
 	    nsegs * sizeof(struct mlx5_wqe_data_seg)));
 	MLX5_SET(wq, wq, log_wq_sz, priv->params.log_rq_size);
 	MLX5_SET(wq, wq, pd, priv->pdn);
 
 	param->wq.linear = 1;
 }
 
 /*
  * Fill in the work queue part of the SQ parameters: queue size,
  * stride of one send WQE basic block and protection domain.
  */
 static void
 mlx5e_build_sq_param(struct mlx5e_priv *priv,
     struct mlx5e_sq_param *param)
 {
 	void *sqc = param->sqc;
 	void *wq = MLX5_ADDR_OF(sqc, sqc, wq);
 
 	MLX5_SET(wq, wq, log_wq_sz, priv->params.log_sq_size);
 	MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB));
 	MLX5_SET(wq, wq, pd, priv->pdn);
 
 	param->wq.linear = 1;
 }
 
 /*
  * Fill in the CQ parameter set here for every completion queue
  * built through this helper: the index of the device's UAR page.
  */
 static void
 mlx5e_build_common_cq_param(struct mlx5e_priv *priv,
     struct mlx5e_cq_param *param)
 {
 	void *cqc = param->cqc;
 
 	MLX5_SET(cqc, cqc, uar_page, priv->mdev->priv.uar->index);
 }
 
 /*
  * Fetch the default dynamic interrupt moderation profile for the
  * given period mode into "ptr".  With hardware LRO enabled, the
  * packet count is capped at
  * MLX5E_DIM_MAX_RX_CQ_MODERATION_PKTS_WITH_LRO.
  */
 static void
 mlx5e_get_default_profile(struct mlx5e_priv *priv, int mode, struct net_dim_cq_moder *ptr)
 {
 
 	*ptr = net_dim_get_profile(mode, MLX5E_DIM_DEFAULT_PROFILE);
 
 	/* the profile is used as-is unless hardware LRO is enabled */
 	if (!priv->params.hw_lro_en)
 		return;
 
 	/* apply LRO restrictions */
 	if (ptr->pkts > MLX5E_DIM_MAX_RX_CQ_MODERATION_PKTS_WITH_LRO)
 		ptr->pkts = MLX5E_DIM_MAX_RX_CQ_MODERATION_PKTS_WITH_LRO;
 }
 
 /*
  * Build the completion queue context for a receive queue: queue
  * size, optional CQE compression and the moderation settings
  * selected by the RX CQ moderation mode:
  *
  *   0: static values, period starts from EQE
  *   1: static values, period starts from CQE when supported
  *   2: default DIM profile, period starts from EQE
  *   3: default DIM profile, period starts from CQE when supported
  *
  * Any other mode leaves the moderation fields untouched.
  */
 static void
 mlx5e_build_rx_cq_param(struct mlx5e_priv *priv,
     struct mlx5e_cq_param *param)
 {
 	struct net_dim_cq_moder curr;
 	void *cqc = param->cqc;
 
 	MLX5_SET(cqc, cqc, log_cq_size, priv->params.log_rq_size);
 
 	/*
 	 * The RX hash mini CQE format, MLX5_CQE_FORMAT_HASH, is used
 	 * because it is more beneficial for the FreeBSD use case.
 	 * Supporting MLX5_CQE_FORMAT_CSUM would require changes in
 	 * mlx5e_decompress_cqe.
 	 */
 	if (priv->params.cqe_zipping_en) {
 		MLX5_SET(cqc, cqc, cqe_compression_en, 1);
 		MLX5_SET(cqc, cqc, mini_cqe_res_format, MLX5_CQE_FORMAT_HASH);
 	}
 
 	switch (priv->params.rx_cq_moderation_mode) {
 	case 0:
 		MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
 		MLX5_SET(cqc, cqc, cq_max_count, priv->params.rx_cq_moderation_pkts);
 		MLX5_SET(cqc, cqc, cq_period, priv->params.rx_cq_moderation_usec);
 		break;
 	case 1:
 		MLX5_SET(cqc, cqc, cq_max_count, priv->params.rx_cq_moderation_pkts);
 		MLX5_SET(cqc, cqc, cq_period, priv->params.rx_cq_moderation_usec);
 		/* fall back to EQE based periods without CQE support */
 		MLX5_SET(cqc, cqc, cq_period_mode,
 		    MLX5_CAP_GEN(priv->mdev, cq_period_start_from_cqe) ?
 		    MLX5_CQ_PERIOD_MODE_START_FROM_CQE :
 		    MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
 		break;
 	case 2:
 		mlx5e_get_default_profile(priv, NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE, &curr);
 		MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
 		MLX5_SET(cqc, cqc, cq_max_count, curr.pkts);
 		MLX5_SET(cqc, cqc, cq_period, curr.usec);
 		break;
 	case 3:
 		mlx5e_get_default_profile(priv, NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE, &curr);
 		MLX5_SET(cqc, cqc, cq_max_count, curr.pkts);
 		MLX5_SET(cqc, cqc, cq_period, curr.usec);
 		/* fall back to EQE based periods without CQE support */
 		MLX5_SET(cqc, cqc, cq_period_mode,
 		    MLX5_CAP_GEN(priv->mdev, cq_period_start_from_cqe) ?
 		    MLX5_CQ_PERIOD_MODE_START_FROM_CQE :
 		    MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
 		break;
 	default:
 		break;
 	}
 
 	mlx5e_dim_build_cq_param(priv, param);
 	mlx5e_build_common_cq_param(priv, param);
 }
 
 /*
  * Build the completion queue context for a send queue: queue size,
  * static moderation values and the period mode. Mode zero always
  * starts the period from EQE; any other mode starts it from CQE
  * when the device supports that, else from EQE.
  */
 static void
 mlx5e_build_tx_cq_param(struct mlx5e_priv *priv,
     struct mlx5e_cq_param *param)
 {
 	void *cqc = param->cqc;
 
 	MLX5_SET(cqc, cqc, cq_max_count, priv->params.tx_cq_moderation_pkts);
 	MLX5_SET(cqc, cqc, cq_period, priv->params.tx_cq_moderation_usec);
 	MLX5_SET(cqc, cqc, log_cq_size, priv->params.log_sq_size);
 
 	if (priv->params.tx_cq_moderation_mode != 0 &&
 	    MLX5_CAP_GEN(priv->mdev, cq_period_start_from_cqe)) {
 		MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_CQE);
 	} else {
 		MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
 	}
 
 	mlx5e_build_common_cq_param(priv, param);
 }
 
 /*
  * Zero the channel parameter structure and then build its four
  * parts: the receive queue, the send queue, the RX completion queue
  * and the TX completion queue parameters.
  */
 static void
 mlx5e_build_channel_param(struct mlx5e_priv *priv,
     struct mlx5e_channel_param *cparam)
 {
 	/* start from all zeros; the builders only set selected fields */
 	memset(cparam, 0, sizeof(*cparam));
 
 	mlx5e_build_rq_param(priv, &cparam->rq);
 	mlx5e_build_sq_param(priv, &cparam->sq);
 	mlx5e_build_rx_cq_param(priv, &cparam->rx_cq);
 	mlx5e_build_tx_cq_param(priv, &cparam->tx_cq);
 }
 
 /*
  * Open all configured channels using one common set of channel
  * parameters, optionally bind each channel's interrupt to a CPU,
  * and then wait until every receive queue has its minimum number
  * of WQEs. On failure the channels opened so far are closed again
  * and the error code is returned. Returns zero on success.
  */
 static int
 mlx5e_open_channels(struct mlx5e_priv *priv)
 {
 	struct mlx5e_channel_param *cparam;
 	int err;
 	int i;
 	int ch;
 
 	cparam = malloc(sizeof(*cparam), M_MLX5EN, M_WAITOK);
 	mlx5e_build_channel_param(priv, cparam);
 
 	for (i = 0; i < priv->params.num_channels; i++) {
 		err = mlx5e_open_channel(priv, cparam, &priv->channel[i]);
 		if (err)
 			goto err_close_channels;
 
 		/* Bind interrupt vectors, if any. */
 		if (priv->params_ethtool.irq_cpu_base > -1) {
 			cpuset_t cpuset;
 			int cpu;
 			int irq;
 			int eqn;
 			int nirq;
 
 			err = mlx5_vector2eqn(priv->mdev, i,
 			    &eqn, &nirq);
 
 			/* a failed lookup is non-fatal, skip the binding */
 			if (err == 0) {
 				irq = priv->mdev->priv.msix_arr[nirq].vector;
 				cpu = (unsigned)(priv->params_ethtool.irq_cpu_base +
 				    i * priv->params_ethtool.irq_cpu_stride) % (unsigned)mp_ncpus;
 
 				CPU_ZERO(&cpuset);
 				CPU_SET(cpu, &cpuset);
 				intr_setaffinity(irq, CPU_WHICH_INTRHANDLER, &cpuset);
 			}
 		}
 	}
 
 	/*
 	 * "i" now equals the number of opened channels, so a failure
 	 * below makes the error path close all of them.
 	 */
 	for (ch = 0; ch < priv->params.num_channels; ch++) {
 		err = mlx5e_wait_for_min_rx_wqes(&priv->channel[ch].rq);
 		if (err)
 			goto err_close_channels;
 	}
 	free(cparam, M_MLX5EN);
 	return (0);
 
 err_close_channels:
 	/* close the channels in reverse order of opening */
 	while (i--) {
 		mlx5e_close_channel(&priv->channel[i]);
 		mlx5e_close_channel_wait(&priv->channel[i]);
 	}
 	free(cparam, M_MLX5EN);
 	return (err);
 }
 
 /*
  * Close all channels in two passes: first request the close of
  * every channel, then wait for each of them to finish closing.
  */
 static void
 mlx5e_close_channels(struct mlx5e_priv *priv)
 {
 	int i;
 
 	/* pass one: initiate closing of all channels */
 	i = 0;
 	while (i < priv->params.num_channels) {
 		mlx5e_close_channel(&priv->channel[i]);
 		i++;
 	}
 
 	/* pass two: wait for all channels */
 	i = 0;
 	while (i < priv->params.num_channels) {
 		mlx5e_close_channel_wait(&priv->channel[i]);
 		i++;
 	}
 }
 
 /*
  * Apply the current TX completion queue moderation parameters to
  * the completion queue of the given send queue. When the device
  * can modify the period mode, it is updated too: modes 0 and 2
  * select EQE based periods, all other modes CQE based periods.
  * Returns the result of the firmware command.
  */
 static int
 mlx5e_refresh_sq_params(struct mlx5e_priv *priv, struct mlx5e_sq *sq)
 {
 	uint8_t cq_mode;
 
 	/* without period mode modification only update the values */
 	if (!MLX5_CAP_GEN(priv->mdev, cq_period_mode_modify)) {
 		return (mlx5_core_modify_cq_moderation(priv->mdev, &sq->cq.mcq,
 		    priv->params.tx_cq_moderation_usec,
 		    priv->params.tx_cq_moderation_pkts));
 	}
 
 	if (priv->params.tx_cq_moderation_mode == 0 ||
 	    priv->params.tx_cq_moderation_mode == 2)
 		cq_mode = MLX5_CQ_PERIOD_MODE_START_FROM_EQE;
 	else
 		cq_mode = MLX5_CQ_PERIOD_MODE_START_FROM_CQE;
 
 	return (mlx5_core_modify_cq_moderation_mode(priv->mdev, &sq->cq.mcq,
 	    priv->params.tx_cq_moderation_usec,
 	    priv->params.tx_cq_moderation_pkts,
 	    cq_mode));
 }
 
 static int
 mlx5e_refresh_rq_params(struct mlx5e_priv *priv, struct mlx5e_rq *rq)
 {
 
 	if (MLX5_CAP_GEN(priv->mdev, cq_period_mode_modify)) {
 		uint8_t cq_mode;
 		uint8_t dim_mode;
 		int retval;
 
 		switch (priv->params.rx_cq_moderation_mode) {
 		case 0:
 		case 2:
 			cq_mode = MLX5_CQ_PERIOD_MODE_START_FROM_EQE;
 			dim_mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE;
 			break;
 		default:
 			cq_mode = MLX5_CQ_PERIOD_MODE_START_FROM_CQE;
 			dim_mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE;
 			break;
 		}
 
 		/* tear down dynamic interrupt moderation */
 		mtx_lock(&rq->mtx);
 		rq->dim.mode = NET_DIM_CQ_PERIOD_MODE_DISABLED;
 		mtx_unlock(&rq->mtx);
 
 		/* wait for dynamic interrupt moderation work task, if any */
 		cancel_work_sync(&rq->dim.work);
 
 		if (priv->params.rx_cq_moderation_mode >= 2) {
 			struct net_dim_cq_moder curr;
 
 			mlx5e_get_default_profile(priv, dim_mode, &curr);
 
 			retval = mlx5_core_modify_cq_moderation_mode(priv->mdev, &rq->cq.mcq,
 			    curr.usec, curr.pkts, cq_mode);
 
 			/* set dynamic interrupt moderation mode and zero defaults */
 			mtx_lock(&rq->mtx);
 			rq->dim.mode = dim_mode;
 			rq->dim.state = 0;
 			rq->dim.profile_ix = MLX5E_DIM_DEFAULT_PROFILE;
 			mtx_unlock(&rq->mtx);
 		} else {
 			retval = mlx5_core_modify_cq_moderation_mode(priv->mdev, &rq->cq.mcq,
 			    priv->params.rx_cq_moderation_usec,
 			    priv->params.rx_cq_moderation_pkts,
 			    cq_mode);
 		}
 		return (retval);
 	}
 
 	return (mlx5_core_modify_cq_moderation(priv->mdev, &rq->cq.mcq,
 	    priv->params.rx_cq_moderation_usec,
 	    priv->params.rx_cq_moderation_pkts));
 }
 
 /*
  * Refresh the moderation parameters of one channel: first its
  * receive queue, then one send queue per traffic class. Stops at
  * and returns the first error; returns zero when all succeeded.
  */
 static int
 mlx5e_refresh_channel_params_sub(struct mlx5e_priv *priv, struct mlx5e_channel *c)
 {
 	int err;
 	int i;
 
 	err = mlx5e_refresh_rq_params(priv, &c->rq);
 
 	/* the loop is skipped entirely when the RQ refresh failed */
 	for (i = 0; err == 0 && i != priv->num_tc; i++)
 		err = mlx5e_refresh_sq_params(priv, &c->sq[i]);
 
 	return (err);
 }
 
 /*
  * Refresh the moderation parameters of all channels. Returns
  * EINVAL when the channels are not opened, else the first error
  * reported for a channel, or zero when all channels succeeded.
  */
 int
 mlx5e_refresh_channel_params(struct mlx5e_priv *priv)
 {
 	int err = 0;
 	int i;
 
 	/* nothing can be refreshed while the channels are closed */
 	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
 		return (EINVAL);
 
 	for (i = 0; err == 0 && i < priv->params.num_channels; i++)
 		err = mlx5e_refresh_channel_params_sub(priv, &priv->channel[i]);
 
 	return (err);
 }
 
 /*
  * Create the TIS for traffic class "tc" in the transport domain of
  * this interface and store its number in priv->tisn[tc]. Returns
  * the result of the create command.
  */
 static int
 mlx5e_open_tis(struct mlx5e_priv *priv, int tc)
 {
 	u32 in[MLX5_ST_SZ_DW(create_tis_in)];
 	void *tisc;
 
 	memset(in, 0, sizeof(in));
 	tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
 
 	MLX5_SET(tisc, tisc, transport_domain, priv->tdn);
 	MLX5_SET(tisc, tisc, prio, tc);
 
 	return (mlx5_core_create_tis(priv->mdev, in, sizeof(in),
 	    &priv->tisn[tc]));
 }
 
 /*
  * Destroy the TIS whose number is stored in priv->tisn[tc].
  */
 static void
 mlx5e_close_tis(struct mlx5e_priv *priv, int tc)
 {
 	mlx5_core_destroy_tis(priv->mdev, priv->tisn[tc], 0);
 }
 
 /*
  * Create one TIS per traffic class. When a creation fails, the
  * TISes created so far are destroyed again, highest traffic class
  * first, and the error is returned. Returns zero on success.
  */
 static int
 mlx5e_open_tises(struct mlx5e_priv *priv)
 {
 	int num_tc = priv->num_tc;
 	int err;
 	int tc;
 
 	for (tc = 0; tc < num_tc; tc++) {
 		err = mlx5e_open_tis(priv, tc);
 		if (err == 0)
 			continue;
 
 		/* unwind the traffic classes below the failing one */
 		while (tc-- > 0)
 			mlx5e_close_tis(priv, tc);
 		return (err);
 	}
 	return (0);
 }
 
 /*
  * Destroy the TIS of every traffic class, lowest class first.
  */
 static void
 mlx5e_close_tises(struct mlx5e_priv *priv)
 {
 	int num_tc = priv->num_tc;
 	int tc = 0;
 
 	while (tc < num_tc) {
 		mlx5e_close_tis(priv, tc);
 		tc++;
 	}
 }
 
 /*
  * Create a receive queue table having "sz" entries which all refer
  * to the drop receive queue, and store the table number in
  * "*prqtn". Returns -ENOMEM when the command buffer cannot be
  * allocated, else the result of the create command.
  */
 static int
 mlx5e_open_default_rqt(struct mlx5e_priv *priv, u32 *prqtn, int sz)
 {
 	u32 *in;
 	void *rqtc;
 	int inlen;
 	int err;
 	int i;
 
 	/* the command carries one 32-bit RQ number per table entry */
 	inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + sizeof(u32) * sz;
 	in = mlx5_vzalloc(inlen);
 	if (in == NULL)
 		return (-ENOMEM);
 
 	rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context);
 	MLX5_SET(rqtc, rqtc, rqt_max_size, sz);
 	MLX5_SET(rqtc, rqtc, rqt_actual_size, sz);
 
 	i = 0;
 	while (i != sz) {
 		MLX5_SET(rqtc, rqtc, rq_num[i], priv->drop_rq.rqn);
 		i++;
 	}
 
 	err = mlx5_core_create_rqt(priv->mdev, in, inlen, prqtn);
 
 	kvfree(in);
 	return (err);
 }
 
 /*
  * Create the main receive queue table, sized by the RX hash table
  * size, plus one single-entry table per completion vector. On
  * failure all tables created so far are destroyed and the error is
  * returned. Returns zero on success.
  */
 static int
 mlx5e_open_rqts(struct mlx5e_priv *priv)
 {
 	int err;
 	int i;
 
 	err = mlx5e_open_default_rqt(priv, &priv->rqtn,
 	    1 << priv->params.rx_hash_log_tbl_sz);
 	if (err)
 		return (err);
 
 	for (i = 0; i != priv->mdev->priv.eq_table.num_comp_vectors; i++) {
 		err = mlx5e_open_default_rqt(priv, &priv->channel[i].rqtn, 1);
 		if (err == 0)
 			continue;
 
 		/* unwind the per-channel tables, then the main table */
 		while (i--)
 			mlx5_core_destroy_rqt(priv->mdev, priv->channel[i].rqtn, 0);
 
 		mlx5_core_destroy_rqt(priv->mdev, priv->rqtn, 0);
 		return (err);
 	}
 	return (0);
 }
 
 /*
  * Destroy the single-entry receive queue table of every completion
  * vector and finally the main receive queue table.
  */
 static void
 mlx5e_close_rqts(struct mlx5e_priv *priv)
 {
 	int i = 0;
 
 	while (i != priv->mdev->priv.eq_table.num_comp_vectors) {
 		mlx5_core_destroy_rqt(priv->mdev, priv->channel[i].rqtn, 0);
 		i++;
 	}
 
 	mlx5_core_destroy_rqt(priv->mdev, priv->rqtn, 0);
 }
 
 /*
  * Point the receive queue tables at the receive queues of the
  * channels. The main table gets one entry per RX hash table slot;
  * afterwards the single-entry table of every completion vector is
  * updated. Returns -ENOMEM when the command buffer cannot be
  * allocated, else the result of the last modify command issued,
  * which is the first failing one, if any.
  */
 static int
 mlx5e_activate_rqt(struct mlx5e_priv *priv)
 {
 	u32 *in;
 	void *rqtc;
 	int inlen;
 	int err;
 	int sz;
 	int i;
 
 	sz = 1 << priv->params.rx_hash_log_tbl_sz;
 
 	/* the command carries one 32-bit RQ number per table entry */
 	inlen = MLX5_ST_SZ_BYTES(modify_rqt_in) + sizeof(u32) * sz;
 	in = mlx5_vzalloc(inlen);
 	if (in == NULL)
 		return (-ENOMEM);
 
 	rqtc = MLX5_ADDR_OF(modify_rqt_in, in, ctx);
 
 	MLX5_SET(rqtc, rqtc, rqt_actual_size, sz);
 	MLX5_SET(modify_rqt_in, in, bitmask.rqn_list, 1);
 
 	for (i = 0; i != sz; i++) {
 		int ix;
 #ifdef RSS
 		ix = rss_get_indirection_to_bucket(i);
 #else
 		ix = i;
 #endif
 		/* ensure we don't overflow */
 		ix %= priv->params.num_channels;
 
 		/* apply receive side scaling stride, if any */
 		ix -= ix % (int)priv->params.channels_rsss;
 
 		MLX5_SET(rqtc, rqtc, rq_num[i], priv->channel[ix].rq.rqn);
 	}
 
 	err = mlx5_core_modify_rqt(priv->mdev, priv->rqtn, in, inlen);
 	if (err)
 		goto err_modify;
 
 	/* reuse the same buffer for the single-entry tables */
 	inlen = MLX5_ST_SZ_BYTES(modify_rqt_in) + sizeof(u32);
 
 	MLX5_SET(rqtc, rqtc, rqt_actual_size, 1);
 
 	for (i = 0; i != priv->mdev->priv.eq_table.num_comp_vectors; i++) {
 		int ix;
 #ifdef RSS
 		ix = rss_get_indirection_to_bucket(i);
 #else
 		ix = i;
 #endif
 		/* ensure we don't overflow */
 		ix %= priv->params.num_channels;
 
 		/* apply receive side scaling stride, if any */
 		ix -= ix % (int)priv->params.channels_rsss;
 
 		MLX5_SET(rqtc, rqtc, rq_num[0], priv->channel[ix].rq.rqn);
 
 		err = mlx5_core_modify_rqt(priv->mdev, priv->channel[i].rqtn, in, inlen);
 		if (err)
 			goto err_modify;
 	}
 
 	/* the success path also ends up here to free the buffer */
 err_modify:
 	kvfree(in);
 	return (err);
 }
 
 /*
  * Point every entry of the main receive queue table, and of the
  * single-entry table of every completion vector, at the drop
  * receive queue. Returns -ENOMEM when the command buffer cannot be
  * allocated, else the result of the last modify command issued,
  * which is the first failing one, if any.
  */
 static int
 mlx5e_deactivate_rqt(struct mlx5e_priv *priv)
 {
 	u32 *in;
 	void *rqtc;
 	int inlen;
 	int err;
 	int sz;
 	int i;
 
 	sz = 1 << priv->params.rx_hash_log_tbl_sz;
 
 	/* the command carries one 32-bit RQ number per table entry */
 	inlen = MLX5_ST_SZ_BYTES(modify_rqt_in) + sizeof(u32) * sz;
 	in = mlx5_vzalloc(inlen);
 	if (in == NULL)
 		return (-ENOMEM);
 
 	rqtc = MLX5_ADDR_OF(modify_rqt_in, in, ctx);
 
 	MLX5_SET(modify_rqt_in, in, bitmask.rqn_list, 1);
 	MLX5_SET(rqtc, rqtc, rqt_actual_size, sz);
 
 	i = 0;
 	while (i != sz) {
 		MLX5_SET(rqtc, rqtc, rq_num[i], priv->drop_rq.rqn);
 		i++;
 	}
 
 	err = mlx5_core_modify_rqt(priv->mdev, priv->rqtn, in, inlen);
 	if (err)
 		goto done;
 
 	/* reuse the same buffer for the single-entry tables */
 	inlen = MLX5_ST_SZ_BYTES(modify_rqt_in) + sizeof(u32);
 
 	MLX5_SET(rqtc, rqtc, rqt_actual_size, 1);
 
 	for (i = 0; i != priv->mdev->priv.eq_table.num_comp_vectors; i++) {
 		MLX5_SET(rqtc, rqtc, rq_num[0], priv->drop_rq.rqn);
 
 		err = mlx5_core_modify_rqt(priv->mdev, priv->channel[i].rqtn, in, inlen);
 		if (err)
 			break;
 	}
 
 done:
 	kvfree(in);
 	return (err);
 }
 
 /* Number of RSS key bytes copied by the built-in key path below. */
 #define	MLX5E_RSS_KEY_SIZE (10 * 4)	/* bytes */
 
 /*
  * Store the receive side scaling hash key at "key_ptr". With the
  * RSS kernel option the key is obtained from rss_getkey();
  * otherwise a built-in key of MLX5E_RSS_KEY_SIZE bytes, stored as
  * big-endian 32-bit words, is copied.
  */
 static void
 mlx5e_get_rss_key(void *key_ptr)
 {
 #ifdef RSS
 	rss_getkey(key_ptr);
 #else
 	static const u32 rsskey[] = {
 	    cpu_to_be32(0xD181C62C),
 	    cpu_to_be32(0xF7F4DB5B),
 	    cpu_to_be32(0x1983A2FC),
 	    cpu_to_be32(0x943E1ADB),
 	    cpu_to_be32(0xD9389E6B),
 	    cpu_to_be32(0xD1039C2C),
 	    cpu_to_be32(0xA74499AD),
 	    cpu_to_be32(0x593D56D9),
 	    cpu_to_be32(0xF3253C06),
 	    cpu_to_be32(0x2ADC1FFC),
 	};
 	/* the table and the advertised key size must stay in sync */
 	CTASSERT(sizeof(rsskey) == MLX5E_RSS_KEY_SIZE);
 	memcpy(key_ptr, rsskey, MLX5E_RSS_KEY_SIZE);
 #endif
 }
 
 /*
  * Build the TIR context "tirc" for traffic type "tt".
  *
  * Sets the transport domain, the optional hardware LRO settings,
  * indirect dispatching through the main receive queue table, the
  * Toeplitz hash function and key, and finally the hash field
  * selection matching the traffic type. When "inner_vxlan" is true,
  * tunneled offload is enabled and the inner hash field selector is
  * filled in instead of the outer one.
  */
 static void
 mlx5e_build_tir_ctx(struct mlx5e_priv *priv, u32 * tirc, int tt, bool inner_vxlan)
 {
 	void *hfso = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer);
 	void *hfsi = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_inner);
 	/* hash field selector to fill in: inner for VXLAN, else outer */
 	void *hfs = inner_vxlan ? hfsi : hfso;
 	__be32 *hkey;
 
 	MLX5_SET(tirc, tirc, transport_domain, priv->tdn);
 
 #define	ROUGH_MAX_L2_L3_HDR_SZ 256
 
 	/* hash on the IP addresses only */
 #define	MLX5_HASH_IP     (MLX5_HASH_FIELD_SEL_SRC_IP   |\
 			  MLX5_HASH_FIELD_SEL_DST_IP)
 
 	/* hash on the IP addresses and the L4 ports */
 #define	MLX5_HASH_ALL    (MLX5_HASH_FIELD_SEL_SRC_IP   |\
 			  MLX5_HASH_FIELD_SEL_DST_IP   |\
 			  MLX5_HASH_FIELD_SEL_L4_SPORT |\
 			  MLX5_HASH_FIELD_SEL_L4_DPORT)
 
 	/* hash on the IP addresses and the IPsec SPI */
 #define	MLX5_HASH_IP_IPSEC_SPI	(MLX5_HASH_FIELD_SEL_SRC_IP   |\
 				 MLX5_HASH_FIELD_SEL_DST_IP   |\
 				 MLX5_HASH_FIELD_SEL_IPSEC_SPI)
 
 	if (priv->params.hw_lro_en) {
 		MLX5_SET(tirc, tirc, lro_enable_mask,
 		    MLX5_TIRC_LRO_ENABLE_MASK_IPV4_LRO |
 		    MLX5_TIRC_LRO_ENABLE_MASK_IPV6_LRO);
 		/*
 		 * The LRO WQE size, less a rough allowance for the
 		 * L2 and L3 headers, shifted right by eight bits.
 		 */
 		MLX5_SET(tirc, tirc, lro_max_msg_sz,
 		    (priv->params.lro_wqe_sz -
 		    ROUGH_MAX_L2_L3_HDR_SZ) >> 8);
 		/* TODO: add the option to choose timer value dynamically */
 		MLX5_SET(tirc, tirc, lro_timeout_period_usecs,
 		    MLX5_CAP_ETH(priv->mdev,
 		    lro_timer_supported_periods[2]));
 	}
 
 	if (inner_vxlan)
 		MLX5_SET(tirc, tirc, tunneled_offload_en, 1);
 
 	/*
 	 * All packets must go through the indirection table, RQT,
 	 * because it is not possible to modify the RQN of the TIR
 	 * for direct dispatchment after it is created, typically
 	 * when the link goes up and down.
 	 */
 	MLX5_SET(tirc, tirc, disp_type,
 	    MLX5_TIRC_DISP_TYPE_INDIRECT);
 	MLX5_SET(tirc, tirc, indirect_table,
 	    priv->rqtn);
 	MLX5_SET(tirc, tirc, rx_hash_fn,
 		 MLX5_TIRC_RX_HASH_FN_HASH_TOEPLITZ);
 	hkey = (__be32 *) MLX5_ADDR_OF(tirc, tirc, rx_hash_toeplitz_key);
 
 	/* the key field of the TIR context must be able to hold the key */
 	CTASSERT(MLX5_FLD_SZ_BYTES(tirc, rx_hash_toeplitz_key) >=
 		 MLX5E_RSS_KEY_SIZE);
 #ifdef RSS
 	/*
 	 * The FreeBSD RSS implementation does currently not
 	 * support symmetric Toeplitz hashes:
 	 */
 	MLX5_SET(tirc, tirc, rx_hash_symmetric, 0);
 #else
 	MLX5_SET(tirc, tirc, rx_hash_symmetric, 1);
 #endif
 	mlx5e_get_rss_key(hkey);
 
 	/*
 	 * Select the protocol types and hash fields per traffic type.
 	 * For the TCP and UDP types, with the RSS kernel option, only
 	 * the IP addresses are hashed when the RSS hash configuration
 	 * lacks the corresponding hash type; otherwise the L4 ports
 	 * are hashed too. Note that the "else" inside the #ifdef
 	 * binds to the MLX5_SET() statement following the #endif.
 	 */
 	switch (tt) {
 	case MLX5E_TT_IPV4_TCP:
 		MLX5_SET(rx_hash_field_select, hfs, l3_prot_type,
 		    MLX5_L3_PROT_TYPE_IPV4);
 		MLX5_SET(rx_hash_field_select, hfs, l4_prot_type,
 		    MLX5_L4_PROT_TYPE_TCP);
 #ifdef RSS
 		if (!(rss_gethashconfig() & RSS_HASHTYPE_RSS_TCP_IPV4)) {
 			MLX5_SET(rx_hash_field_select, hfs, selected_fields,
 			    MLX5_HASH_IP);
 		} else
 #endif
 		MLX5_SET(rx_hash_field_select, hfs, selected_fields,
 		    MLX5_HASH_ALL);
 		break;
 
 	case MLX5E_TT_IPV6_TCP:
 		MLX5_SET(rx_hash_field_select, hfs, l3_prot_type,
 		    MLX5_L3_PROT_TYPE_IPV6);
 		MLX5_SET(rx_hash_field_select, hfs, l4_prot_type,
 		    MLX5_L4_PROT_TYPE_TCP);
 #ifdef RSS
 		if (!(rss_gethashconfig() & RSS_HASHTYPE_RSS_TCP_IPV6)) {
 			MLX5_SET(rx_hash_field_select, hfs, selected_fields,
 			    MLX5_HASH_IP);
 		} else
 #endif
 		MLX5_SET(rx_hash_field_select, hfs, selected_fields,
 		    MLX5_HASH_ALL);
 		break;
 
 	case MLX5E_TT_IPV4_UDP:
 		MLX5_SET(rx_hash_field_select, hfs, l3_prot_type,
 		    MLX5_L3_PROT_TYPE_IPV4);
 		MLX5_SET(rx_hash_field_select, hfs, l4_prot_type,
 		    MLX5_L4_PROT_TYPE_UDP);
 #ifdef RSS
 		if (!(rss_gethashconfig() & RSS_HASHTYPE_RSS_UDP_IPV4)) {
 			MLX5_SET(rx_hash_field_select, hfs, selected_fields,
 			    MLX5_HASH_IP);
 		} else
 #endif
 		MLX5_SET(rx_hash_field_select, hfs, selected_fields,
 		    MLX5_HASH_ALL);
 		break;
 
 	case MLX5E_TT_IPV6_UDP:
 		MLX5_SET(rx_hash_field_select, hfs, l3_prot_type,
 		    MLX5_L3_PROT_TYPE_IPV6);
 		MLX5_SET(rx_hash_field_select, hfs, l4_prot_type,
 		    MLX5_L4_PROT_TYPE_UDP);
 #ifdef RSS
 		if (!(rss_gethashconfig() & RSS_HASHTYPE_RSS_UDP_IPV6)) {
 			MLX5_SET(rx_hash_field_select, hfs, selected_fields,
 			    MLX5_HASH_IP);
 		} else
 #endif
 		MLX5_SET(rx_hash_field_select, hfs, selected_fields,
 		    MLX5_HASH_ALL);
 		break;
 
 	/* the IPsec types hash on the IP addresses and the SPI */
 	case MLX5E_TT_IPV4_IPSEC_AH:
 		MLX5_SET(rx_hash_field_select, hfs, l3_prot_type,
 		    MLX5_L3_PROT_TYPE_IPV4);
 		MLX5_SET(rx_hash_field_select, hfs, selected_fields,
 		    MLX5_HASH_IP_IPSEC_SPI);
 		break;
 
 	case MLX5E_TT_IPV6_IPSEC_AH:
 		MLX5_SET(rx_hash_field_select, hfs, l3_prot_type,
 		    MLX5_L3_PROT_TYPE_IPV6);
 		MLX5_SET(rx_hash_field_select, hfs, selected_fields,
 		    MLX5_HASH_IP_IPSEC_SPI);
 		break;
 
 	case MLX5E_TT_IPV4_IPSEC_ESP:
 		MLX5_SET(rx_hash_field_select, hfs, l3_prot_type,
 		    MLX5_L3_PROT_TYPE_IPV4);
 		MLX5_SET(rx_hash_field_select, hfs, selected_fields,
 		    MLX5_HASH_IP_IPSEC_SPI);
 		break;
 
 	case MLX5E_TT_IPV6_IPSEC_ESP:
 		MLX5_SET(rx_hash_field_select, hfs, l3_prot_type,
 		    MLX5_L3_PROT_TYPE_IPV6);
 		MLX5_SET(rx_hash_field_select, hfs, selected_fields,
 		    MLX5_HASH_IP_IPSEC_SPI);
 		break;
 
 	/* the plain IP types hash on the IP addresses only */
 	case MLX5E_TT_IPV4:
 		MLX5_SET(rx_hash_field_select, hfs, l3_prot_type,
 		    MLX5_L3_PROT_TYPE_IPV4);
 		MLX5_SET(rx_hash_field_select, hfs, selected_fields,
 		    MLX5_HASH_IP);
 		break;
 
 	case MLX5E_TT_IPV6:
 		MLX5_SET(rx_hash_field_select, hfs, l3_prot_type,
 		    MLX5_L3_PROT_TYPE_IPV6);
 		MLX5_SET(rx_hash_field_select, hfs, selected_fields,
 		    MLX5_HASH_IP);
 		break;
 
 	/* any other traffic type gets no hash field selection */
 	default:
 		break;
 	}
 }
 
 /*
  * Create the TIR for traffic type "tt" and store its number in
  * priv->tirn_inner_vxlan[tt] when "inner_vxlan" is true, else in
  * priv->tirn[tt]. Returns -ENOMEM when the command buffer cannot
  * be allocated, else the result of the create command.
  */
 static int
 mlx5e_open_tir(struct mlx5e_priv *priv, int tt, bool inner_vxlan)
 {
 	u32 *in;
 	void *tirc;
 	int inlen;
 	int err;
 
 	inlen = MLX5_ST_SZ_BYTES(create_tir_in);
 	in = mlx5_vzalloc(inlen);
 	if (in == NULL)
 		return (-ENOMEM);
 
 	tirc = MLX5_ADDR_OF(create_tir_in, in, tir_context);
 	mlx5e_build_tir_ctx(priv, tirc, tt, inner_vxlan);
 
 	if (inner_vxlan) {
 		err = mlx5_core_create_tir(priv->mdev, in, inlen,
 		    &priv->tirn_inner_vxlan[tt]);
 	} else {
 		err = mlx5_core_create_tir(priv->mdev, in, inlen,
 		    &priv->tirn[tt]);
 	}
 
 	kvfree(in);
 	return (err);
 }
 
 /*
  * Destroy the TIR of traffic type "tt", taking its number from
  * priv->tirn_inner_vxlan[] when "inner_vxlan" is true, else from
  * priv->tirn[].
  */
 static void
 mlx5e_close_tir(struct mlx5e_priv *priv, int tt, bool inner_vxlan)
 {
 	mlx5_core_destroy_tir(priv->mdev, inner_vxlan ?
 	    priv->tirn_inner_vxlan[tt] : priv->tirn[tt], 0);
 }
 
 /*
  * Create two TIRs per traffic type: the regular one first and the
  * inner VXLAN one second. When a creation fails, the TIRs created
  * so far are destroyed in reverse order and the error is returned.
  * Returns zero on success.
  */
 static int
 mlx5e_open_tirs(struct mlx5e_priv *priv)
 {
 	int err;
 	int i;
 
 	/* even "i" is the regular TIR, odd "i" the inner VXLAN TIR */
 	for (i = 0; i != 2 * MLX5E_NUM_TT; i++) {
 		err = mlx5e_open_tir(priv, i / 2, (i % 2) != 0);
 		if (err == 0)
 			continue;
 
 		/* unwind the TIRs below the failing one */
 		while (i-- > 0)
 			mlx5e_close_tir(priv, i / 2, (i % 2) != 0);
 		return (err);
 	}
 	return (0);
 }
 
 /*
  * Destroy the regular and the inner VXLAN TIR of every traffic
  * type, in the order they were created.
  */
 static void
 mlx5e_close_tirs(struct mlx5e_priv *priv)
 {
 	int i = 0;
 
 	/* even "i" is the regular TIR, odd "i" the inner VXLAN TIR */
 	while (i != 2 * MLX5E_NUM_TT) {
 		mlx5e_close_tir(priv, i / 2, (i % 2) != 0);
 		i++;
 	}
 }
 
 /*
  * Set the MTU of the port, the vport context and the network
  * interface to match the software MTU "sw_mtu".
  *
  * SW MTU does not include headers,
  * HW MTU includes all headers and checksums.
  *
  * After programming, the effective hardware MTU is read back and
  * recorded, together with its most significant bit, in the ethtool
  * parameters.
  *
  * Returns zero on success, the error of a failed set or query
  * command, -E2BIG when the resulting port MTU is smaller than
  * requested, or -EINVAL when it is bigger than requested.
  */
 static int
 mlx5e_set_dev_port_mtu(if_t ifp, int sw_mtu)
 {
 	struct mlx5e_priv *priv = if_getsoftc(ifp);
 	struct mlx5_core_dev *mdev = priv->mdev;
 	int hw_mtu;
 	int err;
 
 	hw_mtu = MLX5E_SW2HW_MTU(sw_mtu);
 
 	err = mlx5_set_port_mtu(mdev, hw_mtu);
 	if (err) {
 		mlx5_en_err(ifp, "mlx5_set_port_mtu failed setting %d, err=%d\n",
 		    sw_mtu, err);
 		return (err);
 	}
 
 	/* Update vport context MTU; a failure is only logged */
 	err = mlx5_set_vport_mtu(mdev, hw_mtu);
 	if (err) {
 		mlx5_en_err(ifp,
 		    "Failed updating vport context with MTU size, err=%d\n",
 		    err);
 	}
 
 	if_setmtu(ifp, sw_mtu);
 
 	/* read back the MTU which is actually in effect */
 	err = mlx5_query_vport_mtu(mdev, &hw_mtu);
 	if (err || !hw_mtu) {
 		/* fallback to port oper mtu */
 		err = mlx5_query_port_oper_mtu(mdev, &hw_mtu);
 	}
 	if (err) {
 		mlx5_en_err(ifp,
 		    "Query port MTU, after setting new MTU value, failed\n");
 		return (err);
 	} else if (MLX5E_HW2SW_MTU(hw_mtu) < sw_mtu) {
 		err = -E2BIG;
 		mlx5_en_err(ifp,
 		    "Port MTU %d is smaller than ifp mtu %d\n",
 		    hw_mtu, sw_mtu);
 	} else if (MLX5E_HW2SW_MTU(hw_mtu) > sw_mtu) {
 		err = -EINVAL;
 		mlx5_en_err(ifp,
 		    "Port MTU %d is bigger than ifp mtu %d\n",
 		    hw_mtu, sw_mtu);
 	}
 	priv->params_ethtool.hw_mtu = hw_mtu;
 
 	/* compute MSB by clearing the lowest set bit until one is left */
 	while (hw_mtu & (hw_mtu - 1))
 		hw_mtu &= (hw_mtu - 1);
 	priv->params_ethtool.hw_mtu_msb = hw_mtu;
 
 	return (err);
 }
 
 /*
  * Open the interface: create the TISes, allocate a queue counter
  * set, open the channels and activate the receive queue tables.
  * On success the opened state bit is set and the carrier state is
  * updated. On failure everything set up so far is torn down again
  * and the error is returned. Returns zero when already opened.
  */
 int
 mlx5e_open_locked(if_t ifp)
 {
 	struct mlx5e_priv *priv = if_getsoftc(ifp);
 	u16 set_id;
 	int err;
 
 	/* nothing to do when already opened */
 	if (test_bit(MLX5E_STATE_OPENED, &priv->state))
 		return (0);
 
 #ifdef RSS
 	if (rss_getnumbuckets() > priv->params.num_channels) {
 		mlx5_en_info(ifp,
 		    "NOTE: There are more RSS buckets(%u) than channels(%u) available\n",
 		    rss_getnumbuckets(), priv->params.num_channels);
 	}
 #endif
 
 	err = mlx5e_open_tises(priv);
 	if (err) {
 		mlx5_en_err(ifp, "mlx5e_open_tises failed, %d\n", err);
 		return (err);
 	}
 
 	err = mlx5_vport_alloc_q_counter(priv->mdev,
 	    MLX5_INTERFACE_PROTOCOL_ETH, &set_id);
 	if (err) {
 		mlx5_en_err(priv->ifp,
 		    "mlx5_vport_alloc_q_counter failed: %d\n", err);
 		goto err_close_tises;
 	}
 
 	/* remember the counter set ID for later deallocation */
 	priv->counter_set_id = set_id;
 
 	err = mlx5e_open_channels(priv);
 	if (err) {
 		mlx5_en_err(ifp,
 		    "mlx5e_open_channels failed, %d\n", err);
 		goto err_dalloc_q_counter;
 	}
 
 	err = mlx5e_activate_rqt(priv);
 	if (err) {
 		mlx5_en_err(ifp, "mlx5e_activate_rqt failed, %d\n", err);
 		goto err_close_channels;
 	}
 
 	set_bit(MLX5E_STATE_OPENED, &priv->state);
 	mlx5e_update_carrier(priv);
 	return (0);
 
 	/* error unwinding, in reverse order of the setup above */
 err_close_channels:
 	mlx5e_close_channels(priv);
 
 err_dalloc_q_counter:
 	mlx5_vport_dealloc_q_counter(priv->mdev,
 	    MLX5_INTERFACE_PROTOCOL_ETH, priv->counter_set_id);
 
 err_close_tises:
 	mlx5e_close_tises(priv);
 
 	return (err);
 }
 
 /*
  * Bring the interface up while holding the private lock: request
  * port status up, open the interface and set IFF_DRV_RUNNING.
  * "arg" is the private softc of the interface.
  *
  * NOTE(review): the return value of mlx5e_open_locked() is ignored
  * and IFF_DRV_RUNNING is set regardless - confirm this is intended.
  */
 static void
 mlx5e_open(void *arg)
 {
 	struct mlx5e_priv *priv = arg;
 
 	PRIV_LOCK(priv);
 	/* a failure to set the port status is only logged */
 	if (mlx5_set_port_status(priv->mdev, MLX5_PORT_UP))
 		mlx5_en_err(priv->ifp,
 		    "Setting port status to up failed\n");
 
 	mlx5e_open_locked(priv->ifp);
 	if_setdrvflagbits(priv->ifp, IFF_DRV_RUNNING, 0);
 	PRIV_UNLOCK(priv);
 }
 
 /*
  * Close the interface: clear the opened state bit, report link
  * down, point the receive queue tables at the drop queue, close
  * the channels, release the queue counter set and destroy the
  * TISes. Always returns zero; does nothing when already closed.
  */
 int
 mlx5e_close_locked(if_t ifp)
 {
 	struct mlx5e_priv *priv = if_getsoftc(ifp);
 
 	/* nothing to do when already closed */
 	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
 		return (0);
 
 	clear_bit(MLX5E_STATE_OPENED, &priv->state);
 	if_link_state_change(priv->ifp, LINK_STATE_DOWN);
 
 	/* tear down in reverse order of mlx5e_open_locked() */
 	mlx5e_deactivate_rqt(priv);
 	mlx5e_close_channels(priv);
 	mlx5_vport_dealloc_q_counter(priv->mdev,
 	    MLX5_INTERFACE_PROTOCOL_ETH, priv->counter_set_id);
 	mlx5e_close_tises(priv);
 
 	return (0);
 }
 
 /*
  * Return the value of interface counter "cnt" from the cached
  * vport and physical port statistics. Counters not handled here
  * are answered by if_get_counter_default().
  *
  * The private lock is not taken here; the original code notes
  * that doing so is not allowed in this context.
  */
 static uint64_t
 mlx5e_get_counter(if_t ifp, ift_counter cnt)
 {
 	struct mlx5e_priv *priv = if_getsoftc(ifp);
 
 	switch (cnt) {
 	case IFCOUNTER_IPACKETS:
 		return (priv->stats.vport.rx_packets);
 	case IFCOUNTER_IERRORS:
 		/* sum of all input error classes of the physical port */
 		return (priv->stats.pport.in_range_len_errors +
 		    priv->stats.pport.out_of_range_len +
 		    priv->stats.pport.too_long_errors +
 		    priv->stats.pport.check_seq_err +
 		    priv->stats.pport.alignment_err);
 	case IFCOUNTER_IQDROPS:
 		return (priv->stats.vport.rx_out_of_buffer);
 	case IFCOUNTER_OPACKETS:
 		return (priv->stats.vport.tx_packets);
 	case IFCOUNTER_OERRORS:
 		return (priv->stats.port_stats_debug.out_discards);
 	case IFCOUNTER_IBYTES:
 		return (priv->stats.vport.rx_bytes);
 	case IFCOUNTER_OBYTES:
 		return (priv->stats.vport.tx_bytes);
 	case IFCOUNTER_IMCASTS:
 		return (priv->stats.vport.rx_multicast_packets);
 	case IFCOUNTER_OMCASTS:
 		return (priv->stats.vport.tx_multicast_packets);
 	case IFCOUNTER_OQDROPS:
 		return (priv->stats.vport.tx_queue_dropped);
 	case IFCOUNTER_COLLISIONS:
 		return (priv->stats.pport.collisions);
 	default:
 		return (if_get_counter_default(ifp, cnt));
 	}
 }
 
 /*
  * Queue the RX mode update work item of the interface on the
  * private work queue; the update itself is done by that work item.
  */
 static void
 mlx5e_set_rx_mode(if_t ifp)
 {
 	struct mlx5e_priv *priv = if_getsoftc(ifp);
 
 	queue_work(priv->wq, &priv->set_rx_mode_work);
 }
 
 static int
 mlx5e_ioctl(if_t ifp, u_long command, caddr_t data)
 {
 	struct mlx5e_priv *priv;
 	struct ifreq *ifr;
 	struct ifdownreason *ifdr;
 	struct ifi2creq i2c;
 	struct ifrsskey *ifrk;
 	struct ifrsshash *ifrh;
 	struct siocsifcapnv_driver_data *drv_ioctl_data, drv_ioctl_data_d;
 	int error = 0;
 	int mask;
 	int size_read = 0;
 	int module_status;
 	int module_num;
 	int max_mtu;
 	uint8_t read_addr;
 
 	priv = if_getsoftc(ifp);
 
 	/* check if detaching */
 	if (priv == NULL || priv->gone != 0)
 		return (ENXIO);
 
 	switch (command) {
 	case SIOCSIFMTU:
 		ifr = (struct ifreq *)data;
 
 		PRIV_LOCK(priv);
 		mlx5_query_port_max_mtu(priv->mdev, &max_mtu);
 
 		if (ifr->ifr_mtu >= MLX5E_MTU_MIN &&
 		    ifr->ifr_mtu <= MIN(MLX5E_MTU_MAX, max_mtu)) {
 			int was_opened;
 
 			was_opened = test_bit(MLX5E_STATE_OPENED, &priv->state);
 			if (was_opened)
 				mlx5e_close_locked(ifp);
 
 			/* set new MTU */
 			mlx5e_set_dev_port_mtu(ifp, ifr->ifr_mtu);
 
 			if (was_opened)
 				mlx5e_open_locked(ifp);
 		} else {
 			error = EINVAL;
 			mlx5_en_err(ifp,
 			    "Invalid MTU value. Min val: %d, Max val: %d\n",
 			    MLX5E_MTU_MIN, MIN(MLX5E_MTU_MAX, max_mtu));
 		}
 		PRIV_UNLOCK(priv);
 		break;
 	case SIOCSIFFLAGS:
 		if ((if_getflags(ifp) & IFF_UP) &&
 		    (if_getdrvflags(ifp) & IFF_DRV_RUNNING)) {
 			mlx5e_set_rx_mode(ifp);
 			break;
 		}
 		PRIV_LOCK(priv);
 		if (if_getflags(ifp) & IFF_UP) {
 			if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) {
 				if (test_bit(MLX5E_STATE_OPENED, &priv->state) == 0)
 					mlx5e_open_locked(ifp);
 				if_setdrvflagbits(ifp, IFF_DRV_RUNNING, 0);
 				mlx5_set_port_status(priv->mdev, MLX5_PORT_UP);
 			}
 		} else {
 			if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
 				mlx5_set_port_status(priv->mdev,
 				    MLX5_PORT_DOWN);
 				if (test_bit(MLX5E_STATE_OPENED, &priv->state) != 0)
 					mlx5e_close_locked(ifp);
 				mlx5e_update_carrier(priv);
 				if_setdrvflagbits(ifp, 0, IFF_DRV_RUNNING);
 			}
 		}
 		PRIV_UNLOCK(priv);
 		break;
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		mlx5e_set_rx_mode(ifp);
 		break;
 	case SIOCSIFMEDIA:
 	case SIOCGIFMEDIA:
 	case SIOCGIFXMEDIA:
 		ifr = (struct ifreq *)data;
 		error = ifmedia_ioctl(ifp, ifr, &priv->media, command);
 		break;
 	case SIOCGIFCAPNV:
 		error = 0;
 		break;
 	case SIOCSIFCAP:
 		ifr = (struct ifreq *)data;
 		drv_ioctl_data = &drv_ioctl_data_d;
 		drv_ioctl_data->reqcap = ifr->ifr_reqcap;
 		PRIV_LOCK(priv);
 		drv_ioctl_data->reqcap2 = if_getcapenable2(ifp);
 		drv_ioctl_data->nvcap = NULL;
 		goto siocsifcap_driver;
 	case SIOCSIFCAPNV:
 		drv_ioctl_data = (struct siocsifcapnv_driver_data *)data;
 		PRIV_LOCK(priv);
 siocsifcap_driver:
 		mask = drv_ioctl_data->reqcap ^ if_getcapenable(ifp);
 
 		if (mask & IFCAP_TXCSUM) {
 			if_togglecapenable(ifp, IFCAP_TXCSUM);
 			if_togglehwassist(ifp, (CSUM_TCP | CSUM_UDP | CSUM_IP));
 
 			if (IFCAP_TSO4 & if_getcapenable(ifp) &&
 			    !(IFCAP_TXCSUM & if_getcapenable(ifp))) {
 				mask &= ~IFCAP_TSO4;
 				if_setcapenablebit(ifp, 0, IFCAP_TSO4);
 				if_sethwassistbits(ifp, 0, CSUM_IP_TSO);
 				mlx5_en_err(ifp,
 				    "tso4 disabled due to -txcsum.\n");
 			}
 		}
 		if (mask & IFCAP_TXCSUM_IPV6) {
 			if_togglecapenable(ifp, IFCAP_TXCSUM_IPV6);
 			if_togglehwassist(ifp, (CSUM_UDP_IPV6 | CSUM_TCP_IPV6));
 
 			if (IFCAP_TSO6 & if_getcapenable(ifp) &&
 			    !(IFCAP_TXCSUM_IPV6 & if_getcapenable(ifp))) {
 				mask &= ~IFCAP_TSO6;
 				if_setcapenablebit(ifp, 0, IFCAP_TSO6);
 				if_sethwassistbits(ifp, 0, CSUM_IP6_TSO);
 				mlx5_en_err(ifp,
 				    "tso6 disabled due to -txcsum6.\n");
 			}
 		}
 		if (mask & IFCAP_MEXTPG)
 			if_togglecapenable(ifp, IFCAP_MEXTPG);
 		if (mask & IFCAP_TXTLS4)
 			if_togglecapenable(ifp, IFCAP_TXTLS4);
 		if (mask & IFCAP_TXTLS6)
 			if_togglecapenable(ifp, IFCAP_TXTLS6);
 #ifdef RATELIMIT
 		if (mask & IFCAP_TXTLS_RTLMT)
 			if_togglecapenable(ifp, IFCAP_TXTLS_RTLMT);
 #endif
 		if (mask & IFCAP_RXCSUM)
 			if_togglecapenable(ifp, IFCAP_RXCSUM);
 		if (mask & IFCAP_RXCSUM_IPV6)
 			if_togglecapenable(ifp, IFCAP_RXCSUM_IPV6);
 		if (mask & IFCAP_TSO4) {
 			if (!(IFCAP_TSO4 & if_getcapenable(ifp)) &&
 			    !(IFCAP_TXCSUM & if_getcapenable(ifp))) {
 				mlx5_en_err(ifp, "enable txcsum first.\n");
 				error = EAGAIN;
 				goto out;
 			}
 			if_togglecapenable(ifp, IFCAP_TSO4);
 			if_togglehwassist(ifp, CSUM_IP_TSO);
 		}
 		if (mask & IFCAP_TSO6) {
 			if (!(IFCAP_TSO6 & if_getcapenable(ifp)) &&
 			    !(IFCAP_TXCSUM_IPV6 & if_getcapenable(ifp))) {
 				mlx5_en_err(ifp, "enable txcsum6 first.\n");
 				error = EAGAIN;
 				goto out;
 			}
 			if_togglecapenable(ifp, IFCAP_TSO6);
 			if_togglehwassist(ifp, CSUM_IP6_TSO);
 		}
 		if (mask & IFCAP_VLAN_HWTSO)
 			if_togglecapenable(ifp, IFCAP_VLAN_HWTSO);
 		if (mask & IFCAP_VLAN_HWFILTER) {
 			if (if_getcapenable(ifp) & IFCAP_VLAN_HWFILTER)
 				mlx5e_disable_vlan_filter(priv);
 			else
 				mlx5e_enable_vlan_filter(priv);
 
 			if_togglecapenable(ifp, IFCAP_VLAN_HWFILTER);
 		}
 		if (mask & IFCAP_VLAN_HWTAGGING)
 			if_togglecapenable(ifp, IFCAP_VLAN_HWTAGGING);
 		if (mask & IFCAP_WOL_MAGIC)
 			if_togglecapenable(ifp, IFCAP_WOL_MAGIC);
 		if (mask & IFCAP_VXLAN_HWCSUM) {
 			const bool was_enabled =
 			    (if_getcapenable(ifp) & IFCAP_VXLAN_HWCSUM) != 0;
 			if (was_enabled)
 				mlx5e_del_all_vxlan_rules(priv);
 			if_togglecapenable(ifp, IFCAP_VXLAN_HWCSUM);
 			if_togglehwassist(ifp, CSUM_INNER_IP | CSUM_INNER_IP_UDP |
 			    CSUM_INNER_IP_TCP | CSUM_INNER_IP6_UDP |
 			    CSUM_INNER_IP6_TCP);
 			if (!was_enabled) {
 				int err = mlx5e_add_all_vxlan_rules(priv);
 				if (err != 0) {
 					mlx5_en_err(ifp,
 					    "mlx5e_add_all_vxlan_rules() failed, %d (ignored)\n", err);
 				}
 			}
 		}
 		if (mask & IFCAP_VXLAN_HWTSO) {
 			if_togglecapenable(ifp, IFCAP_VXLAN_HWTSO);
 			if_togglehwassist(ifp, CSUM_INNER_IP_TSO |
 			    CSUM_INNER_IP6_TSO);
 		}
 
 		VLAN_CAPABILITIES(ifp);
 		/* turn off LRO means also turn of HW LRO - if it's on */
 		if (mask & IFCAP_LRO) {
 			int was_opened = test_bit(MLX5E_STATE_OPENED, &priv->state);
 			bool need_restart = false;
 
 			if_togglecapenable(ifp, IFCAP_LRO);
 
 			/* figure out if updating HW LRO is needed */
 			if (!(if_getcapenable(ifp) & IFCAP_LRO)) {
 				if (priv->params.hw_lro_en) {
 					priv->params.hw_lro_en = false;
 					need_restart = true;
 				}
 			} else {
 				if (priv->params.hw_lro_en == false &&
 				    priv->params_ethtool.hw_lro != 0) {
 					priv->params.hw_lro_en = true;
 					need_restart = true;
 				}
 			}
 			if (was_opened && need_restart) {
 				mlx5e_close_locked(ifp);
 				mlx5e_open_locked(ifp);
 			}
 		}
 		if (mask & IFCAP_HWRXTSTMP) {
 			if_togglecapenable(ifp, IFCAP_HWRXTSTMP);
 			if (if_getcapenable(ifp) & IFCAP_HWRXTSTMP) {
 				if (priv->clbr_done == 0)
 					mlx5e_reset_calibration_callout(priv);
 			} else {
 				callout_drain(&priv->tstmp_clbr);
 				priv->clbr_done = 0;
 			}
 		}
 		mask = drv_ioctl_data->reqcap2 ^ if_getcapenable2(ifp);
 		if ((mask & IFCAP2_BIT(IFCAP2_RXTLS4)) != 0)
 			if_togglecapenable2(ifp, IFCAP2_BIT(IFCAP2_RXTLS4));
 		if ((mask & IFCAP2_BIT(IFCAP2_RXTLS6)) != 0)
 			if_togglecapenable2(ifp, IFCAP2_BIT(IFCAP2_RXTLS6));
 out:
 		PRIV_UNLOCK(priv);
 		break;
 
 	case SIOCGI2C:
 		ifr = (struct ifreq *)data;
 
 		/*
 		 * Copy from the user-space address ifr_data to the
 		 * kernel-space address i2c
 		 */
 		error = copyin(ifr_data_get_ptr(ifr), &i2c, sizeof(i2c));
 		if (error)
 			break;
 
 		if (i2c.len > sizeof(i2c.data)) {
 			error = EINVAL;
 			break;
 		}
 
 		PRIV_LOCK(priv);
 		/* Get module_num which is required for the query_eeprom */
 		error = mlx5_query_module_num(priv->mdev, &module_num);
 		if (error) {
 			mlx5_en_err(ifp,
 			    "Query module num failed, eeprom reading is not supported\n");
 			error = EINVAL;
 			goto err_i2c;
 		}
 		/* Check if module is present before doing an access */
 		module_status = mlx5_query_module_status(priv->mdev, module_num);
 		if (module_status != MLX5_MODULE_STATUS_PLUGGED_ENABLED) {
 			error = EINVAL;
 			goto err_i2c;
 		}
 		/*
 		 * Currently 0XA0 and 0xA2 are the only addresses permitted.
 		 * The internal conversion is as follows:
 		 */
 		if (i2c.dev_addr == 0xA0)
 			read_addr = MLX5_I2C_ADDR_LOW;
 		else if (i2c.dev_addr == 0xA2)
 			read_addr = MLX5_I2C_ADDR_HIGH;
 		else {
 			mlx5_en_err(ifp,
 			    "Query eeprom failed, Invalid Address: %X\n",
 			    i2c.dev_addr);
 			error = EINVAL;
 			goto err_i2c;
 		}
 		error = mlx5_query_eeprom(priv->mdev,
 		    read_addr, MLX5_EEPROM_LOW_PAGE,
 		    (uint32_t)i2c.offset, (uint32_t)i2c.len, module_num,
 		    (uint32_t *)i2c.data, &size_read);
 		if (error) {
 			mlx5_en_err(ifp,
 			    "Query eeprom failed, eeprom reading is not supported\n");
 			error = EINVAL;
 			goto err_i2c;
 		}
 
 		if (i2c.len > MLX5_EEPROM_MAX_BYTES) {
 			error = mlx5_query_eeprom(priv->mdev,
 			    read_addr, MLX5_EEPROM_LOW_PAGE,
 			    (uint32_t)(i2c.offset + size_read),
 			    (uint32_t)(i2c.len - size_read), module_num,
 			    (uint32_t *)(i2c.data + size_read), &size_read);
 		}
 		if (error) {
 			mlx5_en_err(ifp,
 			    "Query eeprom failed, eeprom reading is not supported\n");
 			error = EINVAL;
 			goto err_i2c;
 		}
 
 		error = copyout(&i2c, ifr_data_get_ptr(ifr), sizeof(i2c));
 err_i2c:
 		PRIV_UNLOCK(priv);
 		break;
 	case SIOCGIFDOWNREASON:
 		ifdr = (struct ifdownreason *)data;
 		bzero(ifdr->ifdr_msg, sizeof(ifdr->ifdr_msg));
 		PRIV_LOCK(priv);
 		error = -mlx5_query_pddr_troubleshooting_info(priv->mdev, NULL,
 		    ifdr->ifdr_msg, sizeof(ifdr->ifdr_msg));
 		PRIV_UNLOCK(priv);
 		if (error == 0)
 			ifdr->ifdr_reason = IFDR_REASON_MSG;
 		break;
 
 	case SIOCGIFRSSKEY:
 		ifrk = (struct ifrsskey *)data;
 		ifrk->ifrk_func = RSS_FUNC_TOEPLITZ;
 		ifrk->ifrk_keylen = MLX5E_RSS_KEY_SIZE;
 		CTASSERT(sizeof(ifrk->ifrk_key) >= MLX5E_RSS_KEY_SIZE);
 		mlx5e_get_rss_key(ifrk->ifrk_key);
 		break;
 
 	case SIOCGIFRSSHASH:
 		ifrh = (struct ifrsshash *)data;
 		ifrh->ifrh_func = RSS_FUNC_TOEPLITZ;
 		ifrh->ifrh_types =
 		    RSS_TYPE_IPV4 |
 		    RSS_TYPE_TCP_IPV4 |
 		    RSS_TYPE_UDP_IPV4 |
 		    RSS_TYPE_IPV6 |
 		    RSS_TYPE_TCP_IPV6 |
 		    RSS_TYPE_UDP_IPV6;
 		break;
 
 	default:
 		error = ether_ioctl(ifp, command, data);
 		break;
 	}
 	return (error);
 }
 
 static int
 mlx5e_check_required_hca_cap(struct mlx5_core_dev *mdev)
 {
 	/*
 	 * TODO: uncoment once FW really sets all these bits if
 	 * (!mdev->caps.eth.rss_ind_tbl_cap || !mdev->caps.eth.csum_cap ||
 	 * !mdev->caps.eth.max_lso_cap || !mdev->caps.eth.vlan_cap ||
 	 * !(mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_SCQE_BRK_MOD)) return
 	 * -ENOTSUPP;
 	 */
 
 	/* TODO: add more must-to-have features */
 
 	if (MLX5_CAP_GEN(mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH)
 		return (-ENODEV);
 
 	return (0);
 }
 
 /*
  * Derive the maximum TX inline size from the "log_bf_reg_size" general
  * capability and clamp it to the driver limits.
  */
 static u16
 mlx5e_get_max_inline_cap(struct mlx5_core_dev *mdev)
 {
 	const int lower = ETHER_VLAN_ENCAP_LEN + ETHER_HDR_LEN;
 	const int upper = MLX5E_MAX_TX_INLINE;
 	int inline_size;
 
 	/* half of 2^log_bf_reg_size, minus all but 2 bytes of the TX WQE */
 	inline_size =
 	    ((1U << MLX5_CAP_GEN(mdev, log_bf_reg_size)) / 2U) -
 	    (sizeof(struct mlx5e_tx_wqe) - 2);
 
 	/* the upper limit is tested first and wins over the lower one */
 	if (inline_size > upper)
 		inline_size = upper;
 	else if (inline_size < lower)
 		inline_size = lower;
 
 	return (inline_size);
 }
 
 static int
 mlx5e_build_ifp_priv(struct mlx5_core_dev *mdev,
     struct mlx5e_priv *priv,
     int num_comp_vectors)
 {
 	int err;
 
 	/*
 	 * TODO: Consider link speed for setting "log_sq_size",
 	 * "log_rq_size" and "cq_moderation_xxx":
 	 */
 	priv->params.log_sq_size =
 	    MLX5E_PARAMS_DEFAULT_LOG_SQ_SIZE;
 	priv->params.log_rq_size =
 	    MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE;
 	priv->params.rx_cq_moderation_usec =
 	    MLX5_CAP_GEN(mdev, cq_period_start_from_cqe) ?
 	    MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC_FROM_CQE :
 	    MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC;
 	priv->params.rx_cq_moderation_mode =
 	    MLX5_CAP_GEN(mdev, cq_period_start_from_cqe) ? 1 : 0;
 	priv->params.rx_cq_moderation_pkts =
 	    MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_PKTS;
 	priv->params.tx_cq_moderation_usec =
 	    MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_USEC;
 	priv->params.tx_cq_moderation_pkts =
 	    MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_PKTS;
 	priv->params.min_rx_wqes =
 	    MLX5E_PARAMS_DEFAULT_MIN_RX_WQES;
 	priv->params.rx_hash_log_tbl_sz =
 	    (order_base_2(num_comp_vectors) >
 	    MLX5E_PARAMS_DEFAULT_RX_HASH_LOG_TBL_SZ) ?
 	    order_base_2(num_comp_vectors) :
 	    MLX5E_PARAMS_DEFAULT_RX_HASH_LOG_TBL_SZ;
 	priv->params.num_tc = 1;
 	priv->params.default_vlan_prio = 0;
 	priv->counter_set_id = -1;
 	priv->params.tx_max_inline = mlx5e_get_max_inline_cap(mdev);
 
 	err = mlx5_query_min_inline(mdev, &priv->params.tx_min_inline_mode);
 	if (err)
 		return (err);
 
 	/*
 	 * hw lro is currently defaulted to off. when it won't anymore we
 	 * will consider the HW capability: "!!MLX5_CAP_ETH(mdev, lro_cap)"
 	 */
 	priv->params.hw_lro_en = false;
 	priv->params.lro_wqe_sz = MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ;
 
 	/*
 	 * CQE zipping is currently defaulted to off. when it won't
 	 * anymore we will consider the HW capability:
 	 * "!!MLX5_CAP_GEN(mdev, cqe_compression)"
 	 */
 	priv->params.cqe_zipping_en = false;
 
 	priv->mdev = mdev;
 	priv->params.num_channels = num_comp_vectors;
 	priv->params.channels_rsss = 1;
 	priv->order_base_2_num_channels = order_base_2(num_comp_vectors);
 	priv->queue_mapping_channel_mask =
 	    roundup_pow_of_two(num_comp_vectors) - 1;
 	priv->num_tc = priv->params.num_tc;
 	priv->default_vlan_prio = priv->params.default_vlan_prio;
 
 	INIT_WORK(&priv->update_stats_work, mlx5e_update_stats_work);
 	INIT_WORK(&priv->update_carrier_work, mlx5e_update_carrier_work);
 	INIT_WORK(&priv->set_rx_mode_work, mlx5e_set_rx_mode_work);
 
 	return (0);
 }
 
 static void
 mlx5e_mkey_set_relaxed_ordering(struct mlx5_core_dev *mdev, void *mkc)
 {
 	bool ro_pci_enable =
 	    pci_get_relaxed_ordering_enabled(mdev->pdev->dev.bsddev);
 	bool ro_write = MLX5_CAP_GEN(mdev, relaxed_ordering_write);
 	bool ro_read = MLX5_CAP_GEN(mdev, relaxed_ordering_read);
 
 	MLX5_SET(mkc, mkc, relaxed_ordering_read, ro_pci_enable && ro_read);
 	MLX5_SET(mkc, mkc, relaxed_ordering_write, ro_pci_enable && ro_write);
 }
 
 /*
  * Create a memory key in protection domain "pdn" and store it in
  * "mkey". The key context is set up with physical address access mode,
  * local read and write enabled and the length64 bit set.
  *
  * Returns 0 on success, -ENOMEM when the command inbox cannot be
  * allocated, or the error from mlx5_core_create_mkey().
  */
 static int
 mlx5e_create_mkey(struct mlx5e_priv *priv, u32 pdn,
 		  struct mlx5_core_mkey *mkey)
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
 	const int inbox_len = MLX5_ST_SZ_BYTES(create_mkey_in);
 	u32 *inbox;
 	void *mkc;
 	int err;
 
 	inbox = mlx5_vzalloc(inbox_len);
 	if (inbox == NULL) {
 		mlx5_en_err(priv->ifp, "failed to allocate inbox\n");
 		return (-ENOMEM);
 	}
 
 	/* fill in the memory key context embedded in the inbox */
 	mkc = MLX5_ADDR_OF(create_mkey_in, inbox, memory_key_mkey_entry);
 	MLX5_SET(mkc, mkc, access_mode, MLX5_ACCESS_MODE_PA);
 	MLX5_SET(mkc, mkc, umr_en, 1);	/* used by HW TLS */
 	MLX5_SET(mkc, mkc, lw, 1);
 	MLX5_SET(mkc, mkc, lr, 1);
 	mlx5e_mkey_set_relaxed_ordering(mdev, mkc);
 	MLX5_SET(mkc, mkc, pd, pdn);
 	MLX5_SET(mkc, mkc, length64, 1);
 	MLX5_SET(mkc, mkc, qpn, 0xffffff);
 
 	err = mlx5_core_create_mkey(mdev, mkey, inbox, inbox_len);
 	if (err != 0) {
 		mlx5_en_err(priv->ifp, "mlx5_core_create_mkey failed, %d\n",
 		    err);
 	}
 
 	/* the inbox is only needed for the duration of the command */
 	kvfree(inbox);
 	return (err);
 }
 
 /*
  * String table for the vport statistics, produced by expanding
  * MLX5E_VPORT_STATS() with MLX5E_STATS_DESC. It is handed to
  * mlx5e_create_stats() together with MLX5E_VPORT_STATS_NUM when the
  * "vstats" sysctl node is created.
  */
 static const char *mlx5e_vport_stats_desc[] = {
 	MLX5E_VPORT_STATS(MLX5E_STATS_DESC)
 };
 
 /*
  * String table for the physical port statistics, produced by expanding
  * MLX5E_PPORT_STATS() with MLX5E_STATS_DESC. It is handed to
  * mlx5e_create_stats() together with MLX5E_PPORT_STATS_NUM when the
  * "pstats" sysctl node is created.
  */
 static const char *mlx5e_pport_stats_desc[] = {
 	MLX5E_PPORT_STATS(MLX5E_STATS_DESC)
 };
 
 /*
  * Initialize the parts of the priv that live for its whole lifetime:
  * the async events mutex, the state lock, the watchdog callout, the
  * doorbell lock, and per channel the static channel state and a bfreg.
  *
  * Returns 0 on success. If allocating a bfreg fails, everything set up
  * here is torn down again and the allocation error is returned.
  */
 static int
 mlx5e_priv_static_init(struct mlx5e_priv *priv, struct mlx5_core_dev *mdev,
     const uint32_t channels)
 {
 	uint32_t ch;
 	uint32_t nbfreg;
 	int err;
 
 	mtx_init(&priv->async_events_mtx, "mlx5async", MTX_NETWORK_LOCK, MTX_DEF);
 	sx_init(&priv->state_lock, "mlx5state");
 	callout_init_mtx(&priv->watchdog, &priv->async_events_mtx, 0);
 	MLX5_INIT_DOORBELL_LOCK(&priv->doorbell_lock);
 
 	for (ch = 0; ch != channels; ch++)
 		mlx5e_chan_static_init(priv, &priv->channel[ch], ch);
 
 	/* "nbfreg" counts the bfregs successfully allocated so far */
 	for (nbfreg = 0; nbfreg != channels; nbfreg++) {
 		err = mlx5_alloc_bfreg(mdev, &priv->channel[nbfreg].bfreg,
 		    false, false);
 		if (err != 0)
 			break;
 	}
 	if (nbfreg == channels)
 		return (0);
 
 	/* unwind: free the allocated bfregs in reverse order */
 	while (nbfreg != 0) {
 		nbfreg--;
 		mlx5_free_bfreg(mdev, &priv->channel[nbfreg].bfreg);
 	}
 
 	for (ch = 0; ch != channels; ch++)
 		mlx5e_chan_static_destroy(&priv->channel[ch]);
 	callout_drain(&priv->watchdog);
 	mtx_destroy(&priv->async_events_mtx);
 	sx_destroy(&priv->state_lock);
 	return (err);
 }
 
 /*
  * Release everything mlx5e_priv_static_init() set up for the first
  * "channels" channels: the bfregs, the static channel state, the
  * watchdog callout and the two locks.
  */
 static void
 mlx5e_priv_static_destroy(struct mlx5e_priv *priv, struct mlx5_core_dev *mdev,
     const uint32_t channels)
 {
 	uint32_t ch;
 
 	/* all bfregs are freed before any channel state is destroyed */
 	ch = 0;
 	while (ch != channels) {
 		mlx5_free_bfreg(mdev, &priv->channel[ch].bfreg);
 		ch++;
 	}
 
 	ch = 0;
 	while (ch != channels) {
 		mlx5e_chan_static_destroy(&priv->channel[ch]);
 		ch++;
 	}
 
 	/* drain the watchdog before destroying the mutex it was set up with */
 	callout_drain(&priv->watchdog);
 	mtx_destroy(&priv->async_events_mtx);
 	sx_destroy(&priv->state_lock);
 }
 
 /*
  * Sysctl handler reporting the firmware revision of the device behind
  * "arg1" (a struct mlx5e_priv pointer) as a "major.minor.sub" string.
  */
 static int
 sysctl_firmware(SYSCTL_HANDLER_ARGS)
 {
 	struct mlx5e_priv *priv = arg1;
 	/*
 	 * The string format is "%d.%d.%d".
 	 * fw_rev_{maj,min,sub} return u16, so each number needs at most
 	 * 5 characters (2^16 = 65536). Add the two "." separators and
 	 * the terminating NUL: 5*3 + 3 = 18 characters at most.
 	 */
 	char version[18];
 
 	snprintf(version, sizeof(version), "%d.%d.%d",
 	    fw_rev_maj(priv->mdev), fw_rev_min(priv->mdev),
 	    fw_rev_sub(priv->mdev));
 
 	return (sysctl_handle_string(oidp, version, sizeof(version), req));
 }
 
 /*
  * Drain the send queue of every traffic class of the given channel.
  */
 static void
 mlx5e_disable_tx_dma(struct mlx5e_channel *ch)
 {
 	int tc = 0;
 
 	while (tc < ch->priv->num_tc) {
 		mlx5e_drain_sq(ch->sq + tc);
 		tc++;
 	}
 }
 
 /*
  * Rewrite the cached doorbell of a send queue and notify the hardware.
  *
  * The first 32-bit word is set to the NOP opcode and the second to the
  * send queue number shifted left by 8, both converted to big-endian.
  */
 static void
 mlx5e_reset_sq_doorbell_record(struct mlx5e_sq *sq)
 {
 
 	sq->doorbell.d32[0] = cpu_to_be32(MLX5_OPCODE_NOP);
 	sq->doorbell.d32[1] = cpu_to_be32(sq->sqn << 8);
 	/* NOTE(review): second argument presumably forces the notify - confirm */
 	mlx5e_tx_notify_hw(sq, true);
 }
 
 /*
  * Bring a stopped send queue back into service.
  *
  * Does nothing when the queue is already marked running. Otherwise the
  * queue is moved from ERR to RST, its counters and doorbell are reset,
  * and it is moved from RST to RDY. Failures of either state change are
  * only logged; the queue is marked running regardless.
  */
 void
 mlx5e_resume_sq(struct mlx5e_sq *sq)
 {
 	int err;
 
 	/* check if already enabled */
 	if (READ_ONCE(sq->running) != 0)
 		return;
 
 	err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_ERR,
 	    MLX5_SQC_STATE_RST);
 	if (err != 0) {
 		mlx5_en_err(sq->ifp,
 		    "mlx5e_modify_sq() from ERR to RST failed: %d\n", err);
 	}
 
 	/* restart both queue counters from zero */
 	sq->cc = 0;
 	sq->pc = 0;
 
 	/* reset doorbell prior to moving from RST to RDY */
 	mlx5e_reset_sq_doorbell_record(sq);
 
 	err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RST,
 	    MLX5_SQC_STATE_RDY);
 	if (err != 0) {
 		mlx5_en_err(sq->ifp,
 		    "mlx5e_modify_sq() from RST to RDY failed: %d\n", err);
 	}
 
 	/* restart completion event handling and publish the running flag last */
 	sq->cev_next_state = MLX5E_CEV_STATE_INITIAL;
 	WRITE_ONCE(sq->running, 1);
 }
 
 /*
  * Resume the send queue of every traffic class of the given channel.
  */
 static void
 mlx5e_enable_tx_dma(struct mlx5e_channel *ch)
 {
 	int tc = 0;
 
 	while (tc < ch->priv->num_tc) {
 		mlx5e_resume_sq(ch->sq + tc);
 		tc++;
 	}
 }
 
 /*
  * Stop reception on the receive queue of the given channel.
  *
  * The queue is marked disabled and its watchdog callout stopped under
  * the queue mutex. The queue is then moved from RDY to ERR, polled
  * until its work queue is empty, and finally moved from ERR to RST.
  * Failures of either state change are only logged.
  */
 static void
 mlx5e_disable_rx_dma(struct mlx5e_channel *ch)
 {
 	struct mlx5e_rq *rq = &ch->rq;
 	struct epoch_tracker et;
 	int err;
 
 	mtx_lock(&rq->mtx);
 	rq->enabled = 0;
 	callout_stop(&rq->watchdog);
 	mtx_unlock(&rq->mtx);
 
 	err = mlx5e_modify_rq(rq, MLX5_RQC_STATE_RDY, MLX5_RQC_STATE_ERR);
 	if (err != 0) {
 		/* report the transition that was actually attempted */
 		mlx5_en_err(rq->ifp,
 		    "mlx5e_modify_rq() from RDY to ERR failed: %d\n", err);
 	}
 
 	/* run the completion handler by hand until the queue has drained */
 	while (!mlx5_wq_ll_is_empty(&rq->wq)) {
 		msleep(1);
 		NET_EPOCH_ENTER(et);
 		rq->cq.mcq.comp(&rq->cq.mcq, NULL);
 		NET_EPOCH_EXIT(et);
 	}
 
 	/*
 	 * Transitioning into RST state will allow the FW to track less ERR state queues,
 	 * thus reducing the recv queue flushing time
 	 */
 	err = mlx5e_modify_rq(rq, MLX5_RQC_STATE_ERR, MLX5_RQC_STATE_RST);
 	if (err != 0) {
 		mlx5_en_err(rq->ifp,
 		    "mlx5e_modify_rq() from ERR to RST failed: %d\n", err);
 	}
 }
 
 /*
  * Restart reception on the receive queue of the given channel.
  *
  * The work queue counter and doorbell record are reset, the queue is
  * moved from RST to RDY (a failure is only logged), the queue is marked
  * enabled and its completion handler is invoked once.
  */
 static void
 mlx5e_enable_rx_dma(struct mlx5e_channel *ch)
 {
 	struct mlx5e_rq *rq = &ch->rq;
 	struct epoch_tracker et;
 	int err;
 
 	/* start over with a zeroed WQE counter and matching doorbell record */
 	rq->wq.wqe_ctr = 0;
 	mlx5_wq_ll_update_db_record(&rq->wq);
 	err = mlx5e_modify_rq(rq, MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY);
 	if (err != 0) {
 		mlx5_en_err(rq->ifp,
 		    "mlx5e_modify_rq() from RST to RDY failed: %d\n", err);
         }
 
 	rq->enabled = 1;
 
 	/* NOTE(review): presumably this call refills the queue - confirm */
 	NET_EPOCH_ENTER(et);
 	rq->cq.mcq.comp(&rq->cq.mcq, NULL);
 	NET_EPOCH_EXIT(et);
 }
 
 /*
  * Disable (value != 0) or enable (value == 0) transmit DMA on all
  * channels. Nothing is done unless the interface is in opened state.
  */
 void
 mlx5e_modify_tx_dma(struct mlx5e_priv *priv, uint8_t value)
 {
 	int ch;
 
 	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
 		return;
 
 	if (value != 0) {
 		for (ch = 0; ch < priv->params.num_channels; ch++)
 			mlx5e_disable_tx_dma(priv->channel + ch);
 	} else {
 		for (ch = 0; ch < priv->params.num_channels; ch++)
 			mlx5e_enable_tx_dma(priv->channel + ch);
 	}
 }
 
 /*
  * Disable (value != 0) or enable (value == 0) receive DMA on all
  * channels. Nothing is done unless the interface is in opened state.
  */
 void
 mlx5e_modify_rx_dma(struct mlx5e_priv *priv, uint8_t value)
 {
 	int ch;
 
 	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
 		return;
 
 	if (value != 0) {
 		for (ch = 0; ch < priv->params.num_channels; ch++)
 			mlx5e_disable_rx_dma(priv->channel + ch);
 	} else {
 		for (ch = 0; ch < priv->params.num_channels; ch++)
 			mlx5e_enable_rx_dma(priv->channel + ch);
 	}
 }
 
 /*
  * Add the read-only "fw_version" and "board_id" entries below the
  * "hw" sysctl node of the priv.
  */
 static void
 mlx5e_add_hw_stats(struct mlx5e_priv *priv)
 {
 	struct sysctl_oid_list *hw_list = SYSCTL_CHILDREN(priv->sysctl_hw);
 
 	/* formatted on demand by sysctl_firmware() */
 	SYSCTL_ADD_PROC(&priv->sysctl_ctx, hw_list,
 	    OID_AUTO, "fw_version", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
 	    priv, 0, sysctl_firmware, "A", "HCA firmware version");
 
 	/* exported straight from the core device structure */
 	SYSCTL_ADD_STRING(&priv->sysctl_ctx, hw_list,
 	    OID_AUTO, "board_id", CTLFLAG_RD, priv->mdev->board_id, 0,
 	    "Board ID");
 }
 
 static int
 mlx5e_sysctl_tx_priority_flow_control(SYSCTL_HANDLER_ARGS)
 {
 	struct mlx5e_priv *priv = arg1;
 	uint8_t temp[MLX5E_MAX_PRIORITY];
 	uint32_t tx_pfc;
 	int err;
 	int i;
 
 	PRIV_LOCK(priv);
 
 	tx_pfc = priv->params.tx_priority_flow_control;
 
 	for (i = 0; i != MLX5E_MAX_PRIORITY; i++)
 		temp[i] = (tx_pfc >> i) & 1;
 
 	err = SYSCTL_OUT(req, temp, MLX5E_MAX_PRIORITY);
 	if (err || !req->newptr)
 		goto done;
 	err = SYSCTL_IN(req, temp, MLX5E_MAX_PRIORITY);
 	if (err)
 		goto done;
 
 	priv->params.tx_priority_flow_control = 0;
 
 	/* range check input value */
 	for (i = 0; i != MLX5E_MAX_PRIORITY; i++) {
 		if (temp[i] > 1) {
 			err = ERANGE;
 			goto done;
 		}
 		priv->params.tx_priority_flow_control |= (temp[i] << i);
 	}
 
 	/* check if update is required */
 	if (tx_pfc != priv->params.tx_priority_flow_control)
 		err = -mlx5e_set_port_pfc(priv);
 done:
 	if (err != 0)
 		priv->params.tx_priority_flow_control= tx_pfc;
 	PRIV_UNLOCK(priv);
 
 	return (err);
 }
 
 static int
 mlx5e_sysctl_rx_priority_flow_control(SYSCTL_HANDLER_ARGS)
 {
 	struct mlx5e_priv *priv = arg1;
 	uint8_t temp[MLX5E_MAX_PRIORITY];
 	uint32_t rx_pfc;
 	int err;
 	int i;
 
 	PRIV_LOCK(priv);
 
 	rx_pfc = priv->params.rx_priority_flow_control;
 
 	for (i = 0; i != MLX5E_MAX_PRIORITY; i++)
 		temp[i] = (rx_pfc >> i) & 1;
 
 	err = SYSCTL_OUT(req, temp, MLX5E_MAX_PRIORITY);
 	if (err || !req->newptr)
 		goto done;
 	err = SYSCTL_IN(req, temp, MLX5E_MAX_PRIORITY);
 	if (err)
 		goto done;
 
 	priv->params.rx_priority_flow_control = 0;
 
 	/* range check input value */
 	for (i = 0; i != MLX5E_MAX_PRIORITY; i++) {
 		if (temp[i] > 1) {
 			err = ERANGE;
 			goto done;
 		}
 		priv->params.rx_priority_flow_control |= (temp[i] << i);
 	}
 
 	/* check if update is required */
 	if (rx_pfc != priv->params.rx_priority_flow_control) {
 		err = -mlx5e_set_port_pfc(priv);
 		if (err == 0 && priv->sw_is_port_buf_owner)
 			err = mlx5e_update_buf_lossy(priv);
 	}
 done:
 	if (err != 0)
 		priv->params.rx_priority_flow_control= rx_pfc;
 	PRIV_UNLOCK(priv);
 
 	return (err);
 }
 
 /*
  * Set the default pause frame and priority flow control (PFC)
  * configuration, register the matching sysctls and push the resulting
  * settings to the firmware.
  *
  * If the firmware update returns -EINVAL, PFC is switched off in both
  * directions and the update is retried once, ignoring the result.
  */
 static void
 mlx5e_setup_pauseframes(struct mlx5e_priv *priv)
 {
 	struct sysctl_oid_list *ifnet_list =
 	    SYSCTL_CHILDREN(priv->sysctl_ifnet);
 	int error;
 
 	/* disable ports flow control, PFC, by default */
 	priv->params.rx_priority_flow_control = 0;
 	priv->params.tx_priority_flow_control = 0;
 
 	/* enable pauseframes by default */
 	priv->params.rx_pauseframe_control = 1;
 	priv->params.tx_pauseframe_control = 1;
 
 	/* register pauseframe SYSCTLs */
 	SYSCTL_ADD_INT(&priv->sysctl_ctx, ifnet_list,
 	    OID_AUTO, "tx_pauseframe_control", CTLFLAG_RDTUN,
 	    &priv->params.tx_pauseframe_control, 0,
 	    "Set to enable TX pause frames. Clear to disable.");
 
 	SYSCTL_ADD_INT(&priv->sysctl_ctx, ifnet_list,
 	    OID_AUTO, "rx_pauseframe_control", CTLFLAG_RDTUN,
 	    &priv->params.rx_pauseframe_control, 0,
 	    "Set to enable RX pause frames. Clear to disable.");
 
 	/* register priority flow control, PFC, SYSCTLs */
 	SYSCTL_ADD_PROC(&priv->sysctl_ctx, ifnet_list,
 	    OID_AUTO, "tx_priority_flow_control", CTLTYPE_U8 | CTLFLAG_RWTUN |
 	    CTLFLAG_MPSAFE, priv, 0, &mlx5e_sysctl_tx_priority_flow_control, "CU",
 	    "Set to enable TX ports flow control frames for priorities 0..7. Clear to disable.");
 
 	SYSCTL_ADD_PROC(&priv->sysctl_ctx, ifnet_list,
 	    OID_AUTO, "rx_priority_flow_control", CTLTYPE_U8 | CTLFLAG_RWTUN |
 	    CTLFLAG_MPSAFE, priv, 0, &mlx5e_sysctl_rx_priority_flow_control, "CU",
 	    "Set to enable RX ports flow control frames for priorities 0..7. Clear to disable.");
 
 	PRIV_LOCK(priv);
 
 	/* range check: collapse whatever is stored now to 0 or 1 */
 	priv->params.tx_pauseframe_control =
 	    (priv->params.tx_pauseframe_control != 0);
 	priv->params.rx_pauseframe_control =
 	    (priv->params.rx_pauseframe_control != 0);
 
 	/* update firmware */
 	error = mlx5e_set_port_pause_and_pfc(priv);
 	if (error == -EINVAL) {
 		mlx5_en_err(priv->ifp,
 		    "Global pauseframes must be disabled before enabling PFC.\n");
 
 		/* fall back to pause frames only */
 		priv->params.rx_priority_flow_control = 0;
 		priv->params.tx_priority_flow_control = 0;
 
 		/* update firmware */
 		(void) mlx5e_set_port_pause_and_pfc(priv);
 	}
 	PRIV_UNLOCK(priv);
 }
 
 /*
  * Allocate an unlimited send tag by handing out a reference to the tag
  * embedded in the channel selected from the flow ID.
  *
  * Returns EOPNOTSUPP when the priv is gone or the flow has no hash
  * type, ENXIO when the first send queue of the selected channel is not
  * running, and 0 with "*ppmt" set otherwise.
  */
 static int
 mlx5e_ul_snd_tag_alloc(if_t ifp,
     union if_snd_tag_alloc_params *params,
     struct m_snd_tag **ppmt)
 {
 	struct mlx5e_priv *priv = if_getsoftc(ifp);
 	struct mlx5e_channel *chan;
 	u32 nchan;
 	u32 index;
 #ifdef RSS
 	u32 bucket;
 #endif
 
 	if (unlikely(priv->gone || params->hdr.flowtype == M_HASHTYPE_NONE))
 		return (EOPNOTSUPP);
 
 	/* keep this code synced with mlx5e_select_queue() */
 	nchan = priv->params.num_channels;
 #ifdef RSS
 	if (rss_hash2bucket(params->hdr.flowid,
 	    params->hdr.flowtype, &bucket) == 0)
 		index = bucket % nchan;
 	else
 #endif
 		index = (params->hdr.flowid % 128) % nchan;
 
 	/*
 	 * NOTE: The channels array is only freed at detach, so it is
 	 * safe to return a pointer to the send tag inside the channel
 	 * structure as long as we reference the priv.
 	 */
 	chan = &priv->channel[index];
 
 	/* check if send queue is not running */
 	if (unlikely(chan->sq[0].running == 0))
 		return (ENXIO);
 
 	m_snd_tag_ref(&chan->tag);
 	*ppmt = &chan->tag;
 	return (0);
 }
 
 /*
  * Answer a query on an unlimited send tag: the maximum rate is reported
  * as all ones and the queue level is taken from the first send queue of
  * the channel that embeds the tag. Always returns 0.
  */
 static int
 mlx5e_ul_snd_tag_query(struct m_snd_tag *pmt, union if_snd_tag_query_params *params)
 {
 	struct mlx5e_channel *chan;
 
 	/* the tag is embedded in its channel */
 	chan = container_of(pmt, struct mlx5e_channel, tag);
 
 	params->unlimited.max_rate = -1ULL;
 	params->unlimited.queue_level = mlx5e_sq_queue_level(&chan->sq[0]);
 
 	return (0);
 }
 
 /*
  * Free callback of an unlimited send tag: signal the completion of the
  * channel that embeds the tag.
  */
 static void
 mlx5e_ul_snd_tag_free(struct m_snd_tag *pmt)
 {
 	struct mlx5e_channel *chan;
 
 	chan = container_of(pmt, struct mlx5e_channel, tag);
 	complete(&chan->completion);
 }
 
 /*
  * Send tag allocation entry point: dispatch on the requested tag type
  * to the matching allocator. Types that are not compiled in, or not
  * known, yield EOPNOTSUPP.
  */
 static int
 mlx5e_snd_tag_alloc(if_t ifp,
     union if_snd_tag_alloc_params *params,
     struct m_snd_tag **ppmt)
 {
 	int error;
 
 	switch (params->hdr.type) {
 #ifdef RATELIMIT
 	case IF_SND_TAG_TYPE_RATE_LIMIT:
 		error = mlx5e_rl_snd_tag_alloc(ifp, params, ppmt);
 		break;
 #ifdef KERN_TLS
 	case IF_SND_TAG_TYPE_TLS_RATE_LIMIT:
 		error = mlx5e_tls_snd_tag_alloc(ifp, params, ppmt);
 		break;
 #endif
 #endif
 	case IF_SND_TAG_TYPE_UNLIMITED:
 		error = mlx5e_ul_snd_tag_alloc(ifp, params, ppmt);
 		break;
 #ifdef KERN_TLS
 	case IF_SND_TAG_TYPE_TLS:
 		error = mlx5e_tls_snd_tag_alloc(ifp, params, ppmt);
 		break;
 	case IF_SND_TAG_TYPE_TLS_RX:
 		error = mlx5e_tls_rx_snd_tag_alloc(ifp, params, ppmt);
 		break;
 #endif
 	default:
 		error = EOPNOTSUPP;
 		break;
 	}
 
 	return (error);
 }
 
 #ifdef RATELIMIT
 /* Number of entries in the fixed rate table below. */
 #define NUM_HDWR_RATES_MLX 13
 /*
  * Fixed rate table reported by mlx5e_ratelimit_query().
  *
  * The number in each trailing comment is eight times the table entry.
  * NOTE(review): presumably the entries are bytes per second and the
  * comments bits per second - confirm.
  */
 static const uint64_t adapter_rates_mlx[NUM_HDWR_RATES_MLX] = {
 	135375,			/* 1,083,000 */
 	180500,			/* 1,444,000 */
 	270750,			/* 2,166,000 */
 	361000,			/* 2,888,000 */
 	541500,			/* 4,332,000 */
 	721875,			/* 5,775,000 */
 	1082875,		/* 8,663,000 */
 	1443875,		/* 11,551,000 */
 	2165750,		/* 17,326,000 */
 	2887750,		/* 23,102,000 */
 	4331625,		/* 34,653,000 */
 	5775500,		/* 46,204,000 */
 	8663125			/* 69,305,000 */
 };
 
 /*
  * Report the rate limiting properties of the adapter: a fixed table of
  * NUM_HDWR_RATES_MLX rates, no flow limit and a minimum segment burst
  * of one. The interface argument is not consulted.
  */
 static void
 mlx5e_ratelimit_query(if_t ifp __unused, struct if_ratelimit_query_results *q)
 {
 	/*
 	 * This function needs updating by the driver maintainer!
 	 * For the MLX card there are currently (ConnectX-4?) 13
 	 * pre-set rates and others i.e. ConnectX-5, 6, 7??
 	 *
 	 * This will change with later adapters. The code should then
 	 * look at ifp and figure out the adapter specific settings,
 	 * i.e. how many rates there are and whether they are fixed
 	 * (as shown here) or dynamic (example chelsio t4). If there
 	 * is a maximum number of flows the adapter can handle, the
 	 * max_flows field needs to be updated too.
 	 */
 	q->flags = RT_IS_FIXED_TABLE;
 	q->rate_table = adapter_rates_mlx;
 	q->number_of_rates = NUM_HDWR_RATES_MLX;
 	q->min_segment_burst = 1;
 	q->max_flows = 0;	/* mlx has no limit */
 }
 #endif
 
 /*
  * Register an Ethernet media "type" with the ifmedia instance of the
  * priv, once for every combination of full-duplex and RX/TX pause
  * options listed below.
  */
 static void
 mlx5e_ifm_add(struct mlx5e_priv *priv, int type)
 {
 	/* option combinations, in the order they are registered */
 	static const int options[] = {
 		0,
 		IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE,
 		IFM_ETH_RXPAUSE,
 		IFM_ETH_TXPAUSE,
 		IFM_FDX,
 		IFM_FDX | IFM_ETH_RXPAUSE,
 		IFM_FDX | IFM_ETH_TXPAUSE,
 		IFM_FDX | IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE,
 	};
 
 	for (unsigned n = 0; n != sizeof(options) / sizeof(options[0]); n++) {
 		ifmedia_add(&priv->media, type | IFM_ETHER | options[n],
 		    0, NULL);
 	}
 }
 
 static void *
 mlx5e_create_ifp(struct mlx5_core_dev *mdev)
 {
 	if_t ifp;
 	struct mlx5e_priv *priv;
 	u8 dev_addr[ETHER_ADDR_LEN] __aligned(4);
 	struct sysctl_oid_list *child;
 	int ncv = mdev->priv.eq_table.num_comp_vectors;
 	char unit[16];
 	struct pfil_head_args pa;
 	int err;
 	u32 eth_proto_cap;
 	u32 out[MLX5_ST_SZ_DW(ptys_reg)];
 	bool ext;
 	struct media media_entry = {};
 
 	if (mlx5e_check_required_hca_cap(mdev)) {
 		mlx5_core_dbg(mdev, "mlx5e_check_required_hca_cap() failed\n");
 		return (NULL);
 	}
 
 	/*
 	 * Try to allocate the priv and make room for worst-case
 	 * number of channel structures:
 	 */
 	priv = malloc_domainset(sizeof(*priv) +
 	    (sizeof(priv->channel[0]) * mdev->priv.eq_table.num_comp_vectors),
 	    M_MLX5EN, mlx5_dev_domainset(mdev), M_WAITOK | M_ZERO);
 
 	ifp = priv->ifp = if_alloc_dev(IFT_ETHER, mdev->pdev->dev.bsddev);
 	if (ifp == NULL) {
 		mlx5_core_err(mdev, "if_alloc() failed\n");
 		goto err_free_priv;
 	}
 	/* setup all static fields */
 	if (mlx5e_priv_static_init(priv, mdev, mdev->priv.eq_table.num_comp_vectors)) {
 		mlx5_core_err(mdev, "mlx5e_priv_static_init() failed\n");
 		goto err_free_ifp;
 	}
 
 	if_setsoftc(ifp, priv);
 	if_initname(ifp, "mce", device_get_unit(mdev->pdev->dev.bsddev));
 	if_setmtu(ifp, ETHERMTU);
 	if_setinitfn(ifp, mlx5e_open);
-	if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST |
-	    IFF_KNOWSEPOCH);
+	if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST);
 	if_setioctlfn(ifp, mlx5e_ioctl);
 	if_settransmitfn(ifp, mlx5e_xmit);
 	if_setqflushfn(ifp, if_qflush);
 	if_setgetcounterfn(ifp, mlx5e_get_counter);
 	if_setsendqlen(ifp, ifqmaxlen);
 	/*
          * Set driver features
          */
 	if_setcapabilities(ifp, IFCAP_NV);
 	if_setcapabilitiesbit(ifp, IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6, 0);
 	if_setcapabilitiesbit(ifp, IFCAP_VLAN_MTU | IFCAP_VLAN_HWTAGGING, 0);
 	if_setcapabilitiesbit(ifp, IFCAP_VLAN_HWCSUM | IFCAP_VLAN_HWFILTER, 0);
 	if_setcapabilitiesbit(ifp, IFCAP_LINKSTATE | IFCAP_JUMBO_MTU, 0);
 	if_setcapabilitiesbit(ifp, IFCAP_LRO, 0);
 	if_setcapabilitiesbit(ifp, IFCAP_TSO | IFCAP_VLAN_HWTSO, 0);
 	if_setcapabilitiesbit(ifp, IFCAP_HWSTATS | IFCAP_HWRXTSTMP, 0);
 	if_setcapabilitiesbit(ifp, IFCAP_MEXTPG, 0);
 	if_setcapabilitiesbit(ifp, IFCAP_TXTLS4 | IFCAP_TXTLS6, 0);
 #ifdef RATELIMIT
 	if_setcapabilitiesbit(ifp, IFCAP_TXRTLMT | IFCAP_TXTLS_RTLMT, 0);
 #endif
 	if_setcapabilitiesbit(ifp, IFCAP_VXLAN_HWCSUM | IFCAP_VXLAN_HWTSO, 0);
 	if_setcapabilities2bit(ifp, IFCAP2_BIT(IFCAP2_RXTLS4) |
 	    IFCAP2_BIT(IFCAP2_RXTLS6), 0);
 	if_setsndtagallocfn(ifp, mlx5e_snd_tag_alloc);
 #ifdef RATELIMIT
 	if_setratelimitqueryfn(ifp, mlx5e_ratelimit_query);
 #endif
 	/* set TSO limits so that we don't have to drop TX packets */
 	if_sethwtsomax(ifp, MLX5E_MAX_TX_PAYLOAD_SIZE - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN));
 	if_sethwtsomaxsegcount(ifp, MLX5E_MAX_TX_MBUF_FRAGS - 1 /* hdr */);
 	if_sethwtsomaxsegsize(ifp, MLX5E_MAX_TX_MBUF_SIZE);
 
 	if_setcapenable(ifp, if_getcapabilities(ifp));
 	if_setcapenable2(ifp, if_getcapabilities2(ifp));
 	if_sethwassist(ifp, 0);
 	if (if_getcapenable(ifp) & IFCAP_TSO)
 		if_sethwassistbits(ifp, CSUM_TSO, 0);
 	if (if_getcapenable(ifp) & IFCAP_TXCSUM)
 		if_sethwassistbits(ifp, (CSUM_TCP | CSUM_UDP | CSUM_IP), 0);
 	if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6)
 		if_sethwassistbits(ifp, (CSUM_UDP_IPV6 | CSUM_TCP_IPV6), 0);
 	if (if_getcapabilities(ifp) & IFCAP_VXLAN_HWCSUM)
 		if_sethwassistbits(ifp, CSUM_INNER_IP6_UDP | CSUM_INNER_IP6_TCP |
 		    CSUM_INNER_IP | CSUM_INNER_IP_UDP | CSUM_INNER_IP_TCP |
 		    CSUM_ENCAP_VXLAN, 0);
 	if (if_getcapabilities(ifp) & IFCAP_VXLAN_HWTSO)
 		if_sethwassistbits(ifp, CSUM_INNER_IP6_TSO | CSUM_INNER_IP_TSO, 0);
 
 	/* ifnet sysctl tree */
 	sysctl_ctx_init(&priv->sysctl_ctx);
 	priv->sysctl_ifnet = SYSCTL_ADD_NODE(&priv->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_dev),
 	    OID_AUTO, if_getdname(ifp), CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
 	    "MLX5 ethernet - interface name");
 	if (priv->sysctl_ifnet == NULL) {
 		mlx5_core_err(mdev, "SYSCTL_ADD_NODE() failed\n");
 		goto err_free_sysctl;
 	}
 	snprintf(unit, sizeof(unit), "%d", if_getdunit(ifp));
 	priv->sysctl_ifnet = SYSCTL_ADD_NODE(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet),
 	    OID_AUTO, unit, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
 	    "MLX5 ethernet - interface unit");
 	if (priv->sysctl_ifnet == NULL) {
 		mlx5_core_err(mdev, "SYSCTL_ADD_NODE() failed\n");
 		goto err_free_sysctl;
 	}
 
 	/* HW sysctl tree */
 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(mdev->pdev->dev.bsddev));
 	priv->sysctl_hw = SYSCTL_ADD_NODE(&priv->sysctl_ctx, child,
 	    OID_AUTO, "hw", CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
 	    "MLX5 ethernet dev hw");
 	if (priv->sysctl_hw == NULL) {
 		mlx5_core_err(mdev, "SYSCTL_ADD_NODE() failed\n");
 		goto err_free_sysctl;
 	}
 
 	err = mlx5e_build_ifp_priv(mdev, priv, ncv);
 	if (err) {
 		mlx5_core_err(mdev, "mlx5e_build_ifp_priv() failed (%d)\n", err);
 		goto err_free_sysctl;
 	}
 
 	/* reuse mlx5core's watchdog workqueue */
 	priv->wq = mdev->priv.health.wq_watchdog;
 
 	err = mlx5_core_alloc_pd(mdev, &priv->pdn, 0);
 	if (err) {
 		mlx5_en_err(ifp, "mlx5_core_alloc_pd failed, %d\n", err);
 		goto err_free_wq;
 	}
 	err = mlx5_alloc_transport_domain(mdev, &priv->tdn, 0);
 	if (err) {
 		mlx5_en_err(ifp,
 		    "mlx5_alloc_transport_domain failed, %d\n", err);
 		goto err_dealloc_pd;
 	}
 	err = mlx5e_create_mkey(priv, priv->pdn, &priv->mr);
 	if (err) {
 		mlx5_en_err(ifp, "mlx5e_create_mkey failed, %d\n", err);
 		goto err_dealloc_transport_domain;
 	}
 	mlx5_query_nic_vport_mac_address(priv->mdev, 0, dev_addr);
 
 	/* check if we should generate a random MAC address */
 	if (MLX5_CAP_GEN(priv->mdev, vport_group_manager) == 0 &&
 	    is_zero_ether_addr(dev_addr)) {
 		random_ether_addr(dev_addr);
 		mlx5_en_err(ifp, "Assigned random MAC address\n");
 	}
 
 	err = mlx5e_rl_init(priv);
 	if (err) {
 		mlx5_en_err(ifp, "mlx5e_rl_init failed, %d\n", err);
 		goto err_create_mkey;
 	}
 
 	err = mlx5e_tls_init(priv);
 	if (err) {
 		if_printf(ifp, "%s: mlx5e_tls_init failed\n", __func__);
 		goto err_rl_init;
 	}
 
 	err = mlx5e_open_drop_rq(priv, &priv->drop_rq);
 	if (err) {
 		if_printf(ifp, "%s: mlx5e_open_drop_rq failed (%d)\n", __func__, err);
 		goto err_tls_init;
 	}
 
 	err = mlx5e_open_rqts(priv);
 	if (err) {
 		if_printf(ifp, "%s: mlx5e_open_rqts failed (%d)\n", __func__, err);
 		goto err_open_drop_rq;
 	}
 
 	err = mlx5e_open_tirs(priv);
 	if (err) {
 		mlx5_en_err(ifp, "mlx5e_open_tirs() failed, %d\n", err);
 		goto err_open_rqts;
 	}
 
 	err = mlx5e_open_flow_tables(priv);
 	if (err) {
 		if_printf(ifp, "%s: mlx5e_open_flow_tables failed (%d)\n", __func__, err);
 		goto err_open_tirs;
 	}
 
 	err = mlx5e_tls_rx_init(priv);
 	if (err) {
 		if_printf(ifp, "%s: mlx5e_tls_rx_init() failed, %d\n", __func__, err);
 		goto err_open_flow_tables;
 	}
 
 	/* set default MTU */
 	mlx5e_set_dev_port_mtu(ifp, if_getmtu(ifp));
 
 	/* Set default media status */
 	priv->media_status_last = IFM_AVALID;
 	priv->media_active_last = IFM_ETHER | IFM_AUTO | IFM_FDX;
 
 	/* setup default pauseframes configuration */
 	mlx5e_setup_pauseframes(priv);
 
 	/* Setup supported medias */
 	if (!mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN, 1)) {
 		ext = MLX5_CAP_PCAM_FEATURE(mdev,
 		    ptys_extended_ethernet);
 		eth_proto_cap = MLX5_GET_ETH_PROTO(ptys_reg, out, ext,
 		    eth_proto_capability);
 	} else {
 		ext = false;
 		eth_proto_cap = 0;
 		mlx5_en_err(ifp, "Query port media capability failed, %d\n", err);
 	}
 
 	ifmedia_init(&priv->media, IFM_IMASK,
 	    mlx5e_media_change, mlx5e_media_status);
 
 	if (ext) {
 		for (unsigned i = 0; i != MLX5E_EXT_LINK_SPEEDS_NUMBER; i++) {
 			/* check if hardware has the right capability */
 			if (MLX5E_PROT_MASK(i) & ~eth_proto_cap)
 				continue;
 			for (unsigned j = 0; j != MLX5E_CABLE_TYPE_NUMBER; j++) {
 				media_entry = mlx5e_ext_mode_table[i][j];
 				if (media_entry.subtype == 0)
 					continue;
 				/* check if this subtype was already added */
 				for (unsigned k = 0; k != i; k++) {
 					/* check if hardware has the right capability */
 					if (MLX5E_PROT_MASK(k) & ~eth_proto_cap)
 						continue;
 					for (unsigned m = 0; m != MLX5E_CABLE_TYPE_NUMBER; m++) {
 						if (media_entry.subtype == mlx5e_ext_mode_table[k][m].subtype)
 							goto skip_ext_media;
 					}
 				}
 				mlx5e_ifm_add(priv, media_entry.subtype);
 			skip_ext_media:;
 			}
 		}
 	} else {
 		for (unsigned i = 0; i != MLX5E_LINK_SPEEDS_NUMBER; i++) {
 			media_entry = mlx5e_mode_table[i];
 			if (media_entry.subtype == 0)
 				continue;
 			if (MLX5E_PROT_MASK(i) & ~eth_proto_cap)
 				continue;
 			/* check if this subtype was already added */
 			for (unsigned k = 0; k != i; k++) {
 				if (media_entry.subtype == mlx5e_mode_table[k].subtype)
 					goto skip_media;
 			}
 			mlx5e_ifm_add(priv, media_entry.subtype);
 
 			/* NOTE: 10G ER and LR shares the same entry */
 			if (media_entry.subtype == IFM_10G_ER)
 				mlx5e_ifm_add(priv, IFM_10G_LR);
 		skip_media:;
 		}
 	}
 
 	mlx5e_ifm_add(priv, IFM_AUTO);
 
 	/* Set autoselect by default */
 	ifmedia_set(&priv->media, IFM_ETHER | IFM_AUTO | IFM_FDX |
 	    IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE);
 
 	DEBUGNET_SET(ifp, mlx5_en);
 
 	ether_ifattach(ifp, dev_addr);
 
 	/* Register for VLAN events */
 	priv->vlan_attach = EVENTHANDLER_REGISTER(vlan_config,
 	    mlx5e_vlan_rx_add_vid, priv, EVENTHANDLER_PRI_FIRST);
 	priv->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig,
 	    mlx5e_vlan_rx_kill_vid, priv, EVENTHANDLER_PRI_FIRST);
 
 	/* Register for VxLAN events */
 	priv->vxlan_start = EVENTHANDLER_REGISTER(vxlan_start,
 	    mlx5e_vxlan_start, priv, EVENTHANDLER_PRI_ANY);
 	priv->vxlan_stop = EVENTHANDLER_REGISTER(vxlan_stop,
 	    mlx5e_vxlan_stop, priv, EVENTHANDLER_PRI_ANY);
 
 	/* Link is down by default */
 	if_link_state_change(ifp, LINK_STATE_DOWN);
 
 	mlx5e_enable_async_events(priv);
 
 	mlx5e_add_hw_stats(priv);
 
 	mlx5e_create_stats(&priv->stats.vport.ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet),
 	    "vstats", mlx5e_vport_stats_desc, MLX5E_VPORT_STATS_NUM,
 	    priv->stats.vport.arg);
 
 	mlx5e_create_stats(&priv->stats.pport.ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet),
 	    "pstats", mlx5e_pport_stats_desc, MLX5E_PPORT_STATS_NUM,
 	    priv->stats.pport.arg);
 
 	mlx5e_create_ethtool(priv);
 
 	mtx_lock(&priv->async_events_mtx);
 	mlx5e_update_stats(priv);
 	mtx_unlock(&priv->async_events_mtx);
 
 	SYSCTL_ADD_INT(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet),
 	    OID_AUTO, "rx_clbr_done", CTLFLAG_RD,
 	    &priv->clbr_done, 0,
 	    "RX timestamps calibration state");
 	callout_init(&priv->tstmp_clbr, 1);
 	/* Pull out the frequency of the clock in hz */
 	priv->cclk = (uint64_t)MLX5_CAP_GEN(mdev, device_frequency_khz) * 1000ULL;
 	mlx5e_reset_calibration_callout(priv);
 
 	pa.pa_version = PFIL_VERSION;
 	pa.pa_flags = PFIL_IN;
 	pa.pa_type = PFIL_TYPE_ETHERNET;
 	pa.pa_headname = if_name(ifp);
 	priv->pfil = pfil_head_register(&pa);
 
 	PRIV_LOCK(priv);
 	err = mlx5e_open_flow_rules(priv);
 	if (err) {
 		mlx5_en_err(ifp,
 		    "mlx5e_open_flow_rules() failed, %d (ignored)\n", err);
 	}
 	PRIV_UNLOCK(priv);
 
 	return (priv);
 
 err_open_flow_tables:
 	mlx5e_close_flow_tables(priv);
 
 err_open_tirs:
 	mlx5e_close_tirs(priv);
 
 err_open_rqts:
 	mlx5e_close_rqts(priv);
 
 err_open_drop_rq:
 	mlx5e_close_drop_rq(&priv->drop_rq);
 
 err_tls_init:
 	mlx5e_tls_cleanup(priv);
 
 err_rl_init:
 	mlx5e_rl_cleanup(priv);
 
 err_create_mkey:
 	mlx5_core_destroy_mkey(priv->mdev, &priv->mr);
 
 err_dealloc_transport_domain:
 	mlx5_dealloc_transport_domain(mdev, priv->tdn, 0);
 
 err_dealloc_pd:
 	mlx5_core_dealloc_pd(mdev, priv->pdn, 0);
 
 err_free_wq:
 	flush_workqueue(priv->wq);
 
 err_free_sysctl:
 	sysctl_ctx_free(&priv->sysctl_ctx);
 	if (priv->sysctl_debug)
 		sysctl_ctx_free(&priv->stats.port_stats_debug.ctx);
 	mlx5e_priv_static_destroy(priv, mdev, mdev->priv.eq_table.num_comp_vectors);
 
 err_free_ifp:
 	if_free(ifp);
 
 err_free_priv:
 	free(priv, M_MLX5EN);
 	return (NULL);
 }
 
 /*
  * Tear down the network interface described by "vpriv" (a struct
  * mlx5e_priv) that belongs to the core device "mdev".
  *
  * The sequence is: mark the interface as gone, wait for outstanding
  * rate-limit / TLS / unlimited send tags, drain the callouts, deregister
  * the VLAN and VxLAN event handlers, close the device, unregister pfil,
  * detach the ifnet, release the flow tables, TIRs, RQTs and drop RQ,
  * free the sysctl contexts, release the mkey / transport domain / PD and
  * finally free the ifnet and the private structure itself.
  */
 static void
 mlx5e_destroy_ifp(struct mlx5_core_dev *mdev, void *vpriv)
 {
 	struct mlx5e_priv *priv = vpriv;
 	if_t ifp = priv->ifp;
 
 	/* don't allow more IOCTLs */
 	priv->gone = 1;
 
 	/* XXX wait a bit to allow IOCTL handlers to complete */
 	pause("W", hz);
 
 #ifdef RATELIMIT
 	/*
 	 * The kernel can have reference(s) via the m_snd_tag's into
 	 * the ratelimit channels, and these must go away before
 	 * detaching:
 	 */
 	while (READ_ONCE(priv->rl.stats.tx_active_connections) != 0) {
 		mlx5_en_err(priv->ifp,
 		    "Waiting for all ratelimit connections to terminate\n");
 		pause("W", hz);
 	}
 #endif
 
 #ifdef KERN_TLS
 	/* wait for all TLS tags to get freed */
 	while (priv->tls.init != 0 &&
 	    uma_zone_get_cur(priv->tls.zone) != 0)  {
 		mlx5_en_err(priv->ifp,
 		    "Waiting for all TLS connections to terminate\n");
 		pause("W", hz);
 	}
 
 	/* wait for all TLS RX tags to get freed */
 	while (priv->tls_rx.init != 0 &&
 	    uma_zone_get_cur(priv->tls_rx.zone) != 0)  {
 		mlx5_en_err(priv->ifp,
 		    "Waiting for all TLS RX connections to terminate\n");
 		pause("W", hz);
 	}
 #endif
 	/* wait for all unlimited send tags to complete */
 	mlx5e_priv_wait_for_completion(priv, mdev->priv.eq_table.num_comp_vectors);
 
 	/* stop watchdog timer */
 	callout_drain(&priv->watchdog);
 
 	/* stop the timestamp calibration callout as well */
 	callout_drain(&priv->tstmp_clbr);
 
 	/* deregister only the event handlers that were actually registered */
 	if (priv->vlan_attach != NULL)
 		EVENTHANDLER_DEREGISTER(vlan_config, priv->vlan_attach);
 	if (priv->vlan_detach != NULL)
 		EVENTHANDLER_DEREGISTER(vlan_unconfig, priv->vlan_detach);
 	if (priv->vxlan_start != NULL)
 		EVENTHANDLER_DEREGISTER(vxlan_start, priv->vxlan_start);
 	if (priv->vxlan_stop != NULL)
 		EVENTHANDLER_DEREGISTER(vxlan_stop, priv->vxlan_stop);
 
 	/* make sure device gets closed */
 	PRIV_LOCK(priv);
 	mlx5e_close_locked(ifp);
 	mlx5e_close_flow_rules(priv);
 	PRIV_UNLOCK(priv);
 
 	/* deregister pfil */
 	if (priv->pfil != NULL) {
 		pfil_head_unregister(priv->pfil);
 		priv->pfil = NULL;
 	}
 
 	/* unregister device */
 	ifmedia_removeall(&priv->media);
 	ether_ifdetach(ifp);
 
 	/* release the receive-side objects after the ifnet is detached */
 	mlx5e_tls_rx_cleanup(priv);
 	mlx5e_close_flow_tables(priv);
 	mlx5e_close_tirs(priv);
 	mlx5e_close_rqts(priv);
 	mlx5e_close_drop_rq(&priv->drop_rq);
 	mlx5e_tls_cleanup(priv);
 	mlx5e_rl_cleanup(priv);
 
 	/* destroy all remaining sysctl nodes */
 	sysctl_ctx_free(&priv->stats.vport.ctx);
 	sysctl_ctx_free(&priv->stats.pport.ctx);
 	if (priv->sysctl_debug)
 		sysctl_ctx_free(&priv->stats.port_stats_debug.ctx);
 	sysctl_ctx_free(&priv->sysctl_ctx);
 
 	/* release the remaining device objects, then the memory itself */
 	mlx5_core_destroy_mkey(priv->mdev, &priv->mr);
 	mlx5_dealloc_transport_domain(priv->mdev, priv->tdn, 0);
 	mlx5_core_dealloc_pd(priv->mdev, priv->pdn, 0);
 	mlx5e_disable_async_events(priv);
 	flush_workqueue(priv->wq);
 	mlx5e_priv_static_destroy(priv, mdev, mdev->priv.eq_table.num_comp_vectors);
 	if_free(ifp);
 	free(priv, M_MLX5EN);
 }
 
 #ifdef DEBUGNET
 /*
  * debugnet "init" method: report the receive ring count, the number of
  * clusters and the cluster size through the three output pointers.
  * The values are stored while the private lock is held.
  */
 static void
 mlx5_en_debugnet_init(if_t dev, int *nrxr, int *ncl, int *clsize)
 {
 	struct mlx5e_priv *softc;
 
 	softc = if_getsoftc(dev);
 
 	PRIV_LOCK(softc);
 	*clsize = MLX5E_MAX_RX_BYTES;
 	*ncl = DEBUGNET_MAX_IN_FLIGHT;
 	*nrxr = softc->params.num_channels;
 	PRIV_UNLOCK(softc);
 }
 
 /*
  * debugnet "event" method.  This driver takes no action for any event, so
  * the body is intentionally empty.
  */
 static void
 mlx5_en_debugnet_event(if_t dev, enum debugnet_ev event)
 {
 }
 
 /*
  * debugnet "transmit" method: send mbuf "m" on send queue 0 of channel 0.
  *
  * Returns ENOENT when the interface is not running, is marked OACTIVE,
  * has no active link, or the send queue is not running; ENOBUFS when the
  * transmit routine fails; 0 on success.  The mbuf is freed on the
  * "queue not running" and "transmit failed" paths only.
  */
 static int
 mlx5_en_debugnet_transmit(if_t dev, struct mbuf *m)
 {
 	struct mlx5e_priv *softc;
 	struct mlx5e_sq *txq;
 	int rc = 0;
 
 	softc = if_getsoftc(dev);
 
 	/* The interface must be running and must not be flagged busy. */
 	if ((if_getdrvflags(dev) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
 	    IFF_DRV_RUNNING)
 		return (ENOENT);
 
 	/* The last recorded media status must show an active link. */
 	if (!(softc->media_status_last & IFM_ACTIVE))
 		return (ENOENT);
 
 	txq = &softc->channel[0].sq[0];
 	if (!txq->running) {
 		m_freem(m);
 		return (ENOENT);
 	}
 
 	if (mlx5e_sq_xmit(txq, &m)) {
 		m_freem(m);
 		rc = ENOBUFS;
 	}
 
 	/* The hardware is notified whether or not the transmit succeeded. */
 	mlx5e_tx_notify_hw(txq, true);
 
 	return (rc);
 }
 
 /*
  * debugnet "poll" method: poll the device interrupts once.
  *
  * Returns 0 after polling, or ENOENT when the interface is not running or
  * the last recorded media status shows no active link.  "count" is unused.
  */
 static int
 mlx5_en_debugnet_poll(if_t dev, int count)
 {
 	struct mlx5e_priv *softc = if_getsoftc(dev);
 
 	if (if_getdrvflags(dev) & IFF_DRV_RUNNING) {
 		if (softc->media_status_last & IFM_ACTIVE) {
 			mlx5_poll_interrupts(softc->mdev);
 			return (0);
 		}
 	}
 
 	return (ENOENT);
 }
 #endif /* DEBUGNET */
 
 /*
  * Return the ifnet handle stored in the private structure "vpriv".
  */
 static void *
 mlx5e_get_ifp(void *vpriv)
 {
 	return (((struct mlx5e_priv *)vpriv)->ifp);
 }
 
 /*
  * Interface descriptor passed to mlx5_register_interface() and
  * mlx5_unregister_interface(); it lists this driver's callbacks for the
  * Ethernet protocol.
  */
 static struct mlx5_interface mlx5e_interface = {
 	.add = mlx5e_create_ifp,
 	.remove = mlx5e_destroy_ifp,
 	.event = mlx5e_async_event,
 	.protocol = MLX5_INTERFACE_PROTOCOL_ETH,
 	.get_dev = mlx5e_get_ifp,
 };
 
 /* Module load hook: register the Ethernet interface descriptor. */
 void
 mlx5e_init(void)
 {
 	mlx5_register_interface(&mlx5e_interface);
 }
 
 /* Module unload hook: unregister the Ethernet interface descriptor. */
 void
 mlx5e_cleanup(void)
 {
 	mlx5_unregister_interface(&mlx5e_interface);
 }
 
 module_init_order(mlx5e_init, SI_ORDER_SIXTH);
 module_exit_order(mlx5e_cleanup, SI_ORDER_SIXTH);
 
 MODULE_DEPEND(mlx5en, linuxkpi, 1, 1, 1);
 MODULE_DEPEND(mlx5en, mlx5, 1, 1, 1);
 MODULE_VERSION(mlx5en, 1);
diff --git a/sys/dev/oce/oce_if.c b/sys/dev/oce/oce_if.c
index cc8cfc3eaa8c..5d250fcac0bd 100644
--- a/sys/dev/oce/oce_if.c
+++ b/sys/dev/oce/oce_if.c
@@ -1,2962 +1,2962 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (C) 2013 Emulex
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
  * 1. Redistributions of source code must retain the above copyright notice,
  *    this list of conditions and the following disclaimer.
  *
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * 3. Neither the name of the Emulex Corporation nor the names of its
  *    contributors may be used to endorse or promote products derived from
  *    this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  *
  * Contact Information:
  * freebsd-drivers@emulex.com
  *
  * Emulex
  * 3333 Susan Street
  * Costa Mesa, CA 92626
  */
 
 /* $FreeBSD$ */
 
 #include "opt_inet6.h"
 #include "opt_inet.h"
 
 #include "oce_if.h"
 #include "oce_user.h"
 
 #define is_tso_pkt(m) (m->m_pkthdr.csum_flags & CSUM_TSO)
 
 /*
  * UE Status Low CSR: 32 descriptive names.
  * NOTE(review): presumably entry N names bit N of the register; confirm
  * against the code that consumes this table.
  */
 static char *ue_status_low_desc[] = {
         "CEV",
         "CTX",
         "DBUF",
         "ERX",
         "Host",
         "MPU",
         "NDMA",
         "PTC ",
         "RDMA ",
         "RXF ",
         "RXIPS ",
         "RXULP0 ",
         "RXULP1 ",
         "RXULP2 ",
         "TIM ",
         "TPOST ",
         "TPRE ",
         "TXIPS ",
         "TXULP0 ",
         "TXULP1 ",
         "UC ",
         "WDMA ",
         "TXULP2 ",
         "HOST1 ",
         "P0_OB_LINK ",
         "P1_OB_LINK ",
         "HOST_GPIO ",
         "MBOX ",
         "AXGMAC0",
         "AXGMAC1",
         "JTAG",
         "MPU_INTPEND"
 };
 
 /*
  * UE Status High CSR: 32 descriptive names; the last eight entries are
  * "Unknown".
  * NOTE(review): presumably entry N names bit N of the register; confirm
  * against the code that consumes this table.
  */
 static char *ue_status_hi_desc[] = {
         "LPCMEMHOST",
         "MGMT_MAC",
         "PCS0ONLINE",
         "MPU_IRAM",
         "PCS1ONLINE",
         "PCTL0",
         "PCTL1",
         "PMEM",
         "RR",
         "TXPB",
         "RXPP",
         "XAUI",
         "TXP",
         "ARM",
         "IPC",
         "HOST2",
         "HOST3",
         "HOST4",
         "HOST5",
         "HOST6",
         "HOST7",
         "HOST8",
         "HOST9",
         "NETC",
         "Unknown",
         "Unknown",
         "Unknown",
         "Unknown",
         "Unknown",
         "Unknown",
         "Unknown",
         "Unknown"
 };
 
 /*
  * Per-packet information passed to oce_rx_mbuf_chain().
  * NOTE(review): the field notes below are inferred from the field names
  * only; confirm against the code that fills this structure in.
  */
 struct oce_common_cqe_info{
         uint8_t vtp:1;			/* presumably "VLAN tag present" */
         uint8_t l4_cksum_pass:1;	/* presumably L4 checksum verified */
         uint8_t ip_cksum_pass:1;	/* presumably IP checksum verified */
         uint8_t ipv6_frame:1;		/* presumably set for IPv6 frames */
         uint8_t qnq:1;
         uint8_t rsvd:3;			/* pads the bit-fields to 8 bits */
         uint8_t num_frags;
         uint16_t pkt_size;
         uint16_t vtag;
 };
 
 /* Driver entry points prototypes */
 static int  oce_probe(device_t dev);
 static int  oce_attach(device_t dev);
 static int  oce_detach(device_t dev);
 static int  oce_shutdown(device_t dev);
 static int  oce_ioctl(if_t ifp, u_long command, caddr_t data);
 static void oce_init(void *xsc);
 static int  oce_multiq_start(if_t ifp, struct mbuf *m);
 static void oce_multiq_flush(if_t ifp);
 
 /* Driver interrupt routines prototypes */
 static void oce_intr(void *arg, int pending);
 static int  oce_setup_intr(POCE_SOFTC sc);
 static int  oce_fast_isr(void *arg);
 static int  oce_alloc_intr(POCE_SOFTC sc, int vector,
 			  void (*isr) (void *arg, int pending));
 
 /* Media callbacks prototypes */
 static void oce_media_status(if_t ifp, struct ifmediareq *req);
 static int  oce_media_change(if_t ifp);
 
 /* Transmit routines prototypes */
 static int  oce_tx(POCE_SOFTC sc, struct mbuf **mpp, int wq_index);
 static void oce_tx_restart(POCE_SOFTC sc, struct oce_wq *wq);
 static void oce_process_tx_completion(struct oce_wq *wq);
 static int  oce_multiq_transmit(if_t ifp, struct mbuf *m,
 				 struct oce_wq *wq);
 
 /* Receive routines prototypes */
 static int  oce_cqe_vtp_valid(POCE_SOFTC sc, struct oce_nic_rx_cqe *cqe);
 static int  oce_cqe_portid_valid(POCE_SOFTC sc, struct oce_nic_rx_cqe *cqe);
 static void oce_rx(struct oce_rq *rq, struct oce_nic_rx_cqe *cqe);
 static void oce_check_rx_bufs(POCE_SOFTC sc, uint32_t num_cqes, struct oce_rq *rq);
 static uint16_t oce_rq_handler_lro(void *arg);
 static void oce_correct_header(struct mbuf *m, struct nic_hwlro_cqe_part1 *cqe1, struct nic_hwlro_cqe_part2 *cqe2);
 static void oce_rx_lro(struct oce_rq *rq, struct nic_hwlro_singleton_cqe *cqe, struct nic_hwlro_cqe_part2 *cqe2);
 static void oce_rx_mbuf_chain(struct oce_rq *rq, struct oce_common_cqe_info *cqe_info, struct mbuf **m);
 
 /* Helper function prototypes in this file */
 static int  oce_attach_ifp(POCE_SOFTC sc);
 static void oce_add_vlan(void *arg, if_t ifp, uint16_t vtag);
 static void oce_del_vlan(void *arg, if_t ifp, uint16_t vtag);
 static int  oce_vid_config(POCE_SOFTC sc);
 static void oce_mac_addr_set(POCE_SOFTC sc);
 static int  oce_handle_passthrough(if_t ifp, caddr_t data);
 static void oce_local_timer(void *arg);
 static void oce_if_deactivate(POCE_SOFTC sc);
 static void oce_if_activate(POCE_SOFTC sc);
 static void setup_max_queues_want(POCE_SOFTC sc);
 static void update_queues_got(POCE_SOFTC sc);
 static void process_link_state(POCE_SOFTC sc,
 		 struct oce_async_cqe_link_state *acqe);
 static int oce_tx_asic_stall_verify(POCE_SOFTC sc, struct mbuf *m);
 static void oce_get_config(POCE_SOFTC sc);
 static struct mbuf *oce_insert_vlan_tag(POCE_SOFTC sc, struct mbuf *m, boolean_t *complete);
 static void oce_read_env_variables(POCE_SOFTC sc);
 
 /* IP specific */
 #if defined(INET6) || defined(INET)
 static int  oce_init_lro(POCE_SOFTC sc);
 static struct mbuf * oce_tso_setup(POCE_SOFTC sc, struct mbuf **mpp);
 #endif
 
 /* Device method table: maps the bus entry points to this driver. */
 static device_method_t oce_dispatch[] = {
 	DEVMETHOD(device_probe, oce_probe),
 	DEVMETHOD(device_attach, oce_attach),
 	DEVMETHOD(device_detach, oce_detach),
 	DEVMETHOD(device_shutdown, oce_shutdown),
 
 	DEVMETHOD_END
 };
 
 /* Driver description: name, method table and per-device softc size. */
 static driver_t oce_driver = {
 	"oce",
 	oce_dispatch,
 	sizeof(OCE_SOFTC)
 };
 
 /* global vars */
 /* Revision string; oce_probe() appends it to the device description. */
 const char component_revision[32] = {"///" COMPONENT_REVISION "///"};
 
 /* Module capabilities and parameters */
 uint32_t oce_max_rsp_handled = OCE_MAX_RSP_HANDLED;
 uint32_t oce_enable_rss = OCE_MODCAP_RSS;
 /* oce_attach() rounds this down to a multiple of 2048 for rq_frag_size. */
 uint32_t oce_rq_buf_size = 2048;
 
 /* Loader tunables for the two parameters above. */
 TUNABLE_INT("hw.oce.max_rsp_handled", &oce_max_rsp_handled);
 TUNABLE_INT("hw.oce.enable_rss", &oce_enable_rss);
 
 /* Supported devices table */
 /* Each entry is (vendor << 16) | device, as decoded by oce_probe(). */
 static uint32_t supportedDevices[] =  {
 	(PCI_VENDOR_SERVERENGINES << 16) | PCI_PRODUCT_BE2,
 	(PCI_VENDOR_SERVERENGINES << 16) | PCI_PRODUCT_BE3,
 	(PCI_VENDOR_EMULEX << 16) | PCI_PRODUCT_BE3,
 	(PCI_VENDOR_EMULEX << 16) | PCI_PRODUCT_XE201,
 	(PCI_VENDOR_EMULEX << 16) | PCI_PRODUCT_XE201_VF,
 	(PCI_VENDOR_EMULEX << 16) | PCI_PRODUCT_SH
 };
 
 DRIVER_MODULE(oce, pci, oce_driver, 0, 0);
 MODULE_PNP_INFO("W32:vendor/device", pci, oce, supportedDevices,
     nitems(supportedDevices));
 MODULE_DEPEND(oce, pci, 1, 1, 1);
 MODULE_DEPEND(oce, ether, 1, 1, 1);
 MODULE_VERSION(oce, 1);
 
 /*
  * Head and tail of the singly linked list of attached softcs;
  * oce_attach() appends to it and oce_detach() unlinks from it.
  */
 POCE_SOFTC softc_head = NULL;
 POCE_SOFTC softc_tail = NULL;
 
 struct oce_rdma_if *oce_rdma_if = NULL;
 
 /*****************************************************************************
  *			Driver entry points functions                        *
  *****************************************************************************/
 
 /*
  * Probe routine: claim the device when its PCI vendor/device pair is
  * listed in supportedDevices.
  *
  * The softc is zeroed on every call.  On a match the device description
  * is set and the chip family flag (BE2, BE3, XE201 or SH) is recorded in
  * sc->flags.
  *
  * Returns BUS_PROBE_DEFAULT for a supported device and ENXIO otherwise.
  */
 static int
 oce_probe(device_t dev)
 {
 	char str[256] = {0};
 	uint16_t vendor, device;
 	POCE_SOFTC sc;
 	u_int i;
 
 	sc = device_get_softc(dev);
 	bzero(sc, sizeof(OCE_SOFTC));
 	sc->dev = dev;
 
 	vendor = pci_get_vendor(dev);
 	device = pci_get_device(dev);
 
 	for (i = 0; i < nitems(supportedDevices); i++) {
 		/* Table entries are (vendor << 16) | device. */
 		if (vendor != ((supportedDevices[i] >> 16) & 0xffff) ||
 		    device != (supportedDevices[i] & 0xffff))
 			continue;
 
 		/*
 		 * Bound both the write into str[] and the read from the
 		 * fixed-size component_revision[] array, which need not
 		 * be NUL-terminated when its initializer fills it.
 		 */
 		snprintf(str, sizeof(str), "%s:%.*s",
 		    "Emulex CNA NIC function",
 		    (int)sizeof(component_revision), component_revision);
 		device_set_desc_copy(dev, str);
 
 		switch (device) {
 		case PCI_PRODUCT_BE2:
 			sc->flags |= OCE_FLAGS_BE2;
 			break;
 		case PCI_PRODUCT_BE3:
 			sc->flags |= OCE_FLAGS_BE3;
 			break;
 		case PCI_PRODUCT_XE201:
 		case PCI_PRODUCT_XE201_VF:
 			sc->flags |= OCE_FLAGS_XE201;
 			break;
 		case PCI_PRODUCT_SH:
 			sc->flags |= OCE_FLAGS_SH;
 			break;
 		default:
 			return ENXIO;
 		}
 		return BUS_PROBE_DEFAULT;
 	}
 
 	return ENXIO;
 }
 
 /*
  * Attach routine: bring one adapter fully up.
  *
  * Allocates PCI resources, initialises the hardware, sets up interrupts
  * and queues, attaches the ifnet, optionally initialises LRO, starts the
  * hardware, registers the VLAN event handlers, initialises statistics and
  * sysctls, arms the periodic timer and finally appends the softc to the
  * global softc list.
  *
  * Returns 0 on success.  On failure the labels at the end undo the steps
  * completed so far, in reverse order, and the error code is returned.
  */
 static int
 oce_attach(device_t dev)
 {
 	POCE_SOFTC sc;
 	int rc = 0;
 
 	sc = device_get_softc(dev);
 
 	rc = oce_hw_pci_alloc(sc);
 	if (rc)
 		return rc;
 
 	sc->tx_ring_size = OCE_TX_RING_SIZE;
 	sc->rx_ring_size = OCE_RX_RING_SIZE;
 	/* receive fragment size should be multiple of 2K */
 	sc->rq_frag_size = ((oce_rq_buf_size / 2048) * 2048);
 	sc->flow_control = OCE_DEFAULT_FLOW_CONTROL;
 	sc->promisc	 = OCE_DEFAULT_PROMISCUOUS;
 
 	LOCK_CREATE(&sc->bmbx_lock, "Mailbox_lock");
 	LOCK_CREATE(&sc->dev_lock,  "Device_lock");
 
 	/* initialise the hardware */
 	rc = oce_hw_init(sc);
 	if (rc)
 		goto pci_res_free;
 
 	oce_read_env_variables(sc);
 
 	oce_get_config(sc);
 
 	setup_max_queues_want(sc);	
 
 	/* oce_setup_intr() releases its own interrupts on failure */
 	rc = oce_setup_intr(sc);
 	if (rc)
 		goto mbox_free;
 
 	rc = oce_queue_init_all(sc);
 	if (rc)
 		goto intr_free;
 
 	rc = oce_attach_ifp(sc);
 	if (rc)
 		goto queues_free;
 
 #if defined(INET6) || defined(INET)
 	rc = oce_init_lro(sc);
 	if (rc)
 		goto ifp_free;
 #endif
 
 	rc = oce_hw_start(sc);
 	if (rc)
 		goto lro_free;
 
 	sc->vlan_attach = EVENTHANDLER_REGISTER(vlan_config,
 				oce_add_vlan, sc, EVENTHANDLER_PRI_FIRST);
 	sc->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig,
 				oce_del_vlan, sc, EVENTHANDLER_PRI_FIRST);
 
 	rc = oce_stats_init(sc);
 	if (rc)
 		goto vlan_free;
 
 	oce_add_sysctls(sc);
 
 	callout_init(&sc->timer, CALLOUT_MPSAFE);
 	/*
 	 * NOTE(review): a non-zero callout_reset() return is treated as an
 	 * error here; confirm against callout(9) that it signals failure.
 	 */
 	rc = callout_reset(&sc->timer, 2 * hz, oce_local_timer, sc);
 	if (rc)
 		goto stats_free;
 
 	/* append this softc to the global list */
 	sc->next =NULL;
 	if (softc_tail != NULL) {
 	  softc_tail->next = sc;
 	} else {
 	  softc_head = sc;
 	}
 	softc_tail = sc;
 
 	return 0;
 
 	/* error unwind: each label undoes one more completed step */
 stats_free:
 	callout_drain(&sc->timer);
 	oce_stats_free(sc);
 vlan_free:
 	if (sc->vlan_attach)
 		EVENTHANDLER_DEREGISTER(vlan_config, sc->vlan_attach);
 	if (sc->vlan_detach)
 		EVENTHANDLER_DEREGISTER(vlan_unconfig, sc->vlan_detach);
 	oce_hw_intr_disable(sc);
 lro_free:
 #if defined(INET6) || defined(INET)
 	oce_free_lro(sc);
 ifp_free:
 #endif
 	ether_ifdetach(sc->ifp);
 	if_free(sc->ifp);
 queues_free:
 	oce_queue_release_all(sc);
 intr_free:
 	oce_intr_free(sc);
 mbox_free:
 	oce_dma_free(sc, &sc->bsmbx);
 pci_res_free:
 	oce_hw_pci_free(sc);
 	LOCK_DESTROY(&sc->dev_lock);
 	LOCK_DESTROY(&sc->bmbx_lock);
 	return rc;
 
 }
 
 /*
  * Detach routine: unlink the softc from the global list, deactivate the
  * interface, stop the timer, drop the VLAN event handlers, detach and
  * free the ifnet and shut the hardware down.  Always returns 0.
  */
 static int
 oce_detach(device_t dev)
 {
 	POCE_SOFTC sc = device_get_softc(dev);
 	POCE_SOFTC prev = NULL;
 	POCE_SOFTC *link;
 
 	/*
 	 * Walk the list through the address of each "next" link so that
 	 * removing the head needs no special case.  "prev" trails one node
 	 * behind and becomes the new tail when the last node is removed.
 	 */
 	for (link = &softc_head; *link != NULL; link = &(*link)->next) {
 		if (*link == sc) {
 			*link = sc->next;
 			if (sc->next == NULL)
 				softc_tail = prev;
 			break;
 		}
 		prev = *link;
 	}
 
 	LOCK(&sc->dev_lock);
 	oce_if_deactivate(sc);
 	UNLOCK(&sc->dev_lock);
 
 	callout_drain(&sc->timer);
 
 	if (sc->vlan_attach != NULL)
 		EVENTHANDLER_DEREGISTER(vlan_config, sc->vlan_attach);
 	if (sc->vlan_detach != NULL)
 		EVENTHANDLER_DEREGISTER(vlan_unconfig, sc->vlan_detach);
 
 	ether_ifdetach(sc->ifp);
 	if_free(sc->ifp);
 
 	oce_hw_shutdown(sc);
 	bus_generic_detach(dev);
 
 	return 0;
 }
 
 /* Shutdown routine: identical to a detach; returns its result. */
 static int
 oce_shutdown(device_t dev)
 {
 	return oce_detach(dev);
 }
 
 /*
  * Interface ioctl handler.
  *
  * Handles media queries, MTU changes, interface flag changes (up/down and
  * promiscuous mode), multicast list updates, capability toggles, SFP
  * transceiver reads (SIOCGI2C) and the privileged pass-through request
  * (SIOCGPRIVATE_0).  Every other command is forwarded to ether_ioctl().
  *
  * Returns 0 on success or an error code.
  */
 static int
 oce_ioctl(if_t ifp, u_long command, caddr_t data)
 {
 	struct ifreq *ifr = (struct ifreq *)data;
 	POCE_SOFTC sc = if_getsoftc(ifp);
 	struct ifi2creq i2c;
 	/*
 	 * Wider than 8 bits on purpose: TRANSCEIVER_A0_SIZE + i2c.offset
 	 * must not wrap before it is range-checked below.
 	 */
 	uint32_t offset = 0;
 	int rc = 0;
 	uint32_t u;
 
 	switch (command) {
 	case SIOCGIFMEDIA:
 		rc = ifmedia_ioctl(ifp, ifr, &sc->media, command);
 		break;
 
 	case SIOCSIFMTU:
 		if (ifr->ifr_mtu > OCE_MAX_MTU)
 			rc = EINVAL;
 		else
 			if_setmtu(ifp, ifr->ifr_mtu);
 		break;
 
 	case SIOCSIFFLAGS:
 		if (if_getflags(ifp) & IFF_UP) {
 			if (!(if_getdrvflags(ifp) & IFF_DRV_RUNNING)) {
 				if_setdrvflagbits(ifp, IFF_DRV_RUNNING, 0);
 				oce_init(sc);
 			}
 			device_printf(sc->dev, "Interface Up\n");	
 		} else {
 			LOCK(&sc->dev_lock);
 
 			if_setdrvflagbits(sc->ifp, 0, 
 			    IFF_DRV_RUNNING | IFF_DRV_OACTIVE);
 			oce_if_deactivate(sc);
 
 			UNLOCK(&sc->dev_lock);
 
 			device_printf(sc->dev, "Interface Down\n");
 		}
 
 		/* Record the new promiscuous state only if the update worked. */
 		if ((if_getflags(ifp) & IFF_PROMISC) && !sc->promisc) {
 			if (!oce_rxf_set_promiscuous(sc, (1 | (1 << 1))))
 				sc->promisc = TRUE;
 		} else if (!(if_getflags(ifp) & IFF_PROMISC) && sc->promisc) {
 			if (!oce_rxf_set_promiscuous(sc, 0))
 				sc->promisc = FALSE;
 		}
 
 		break;
 
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		rc = oce_hw_update_multicast(sc);
 		if (rc)
 			device_printf(sc->dev,
 				"Update multicast address failed\n");
 		break;
 
 	case SIOCSIFCAP:
 		/* "u" holds the capability bits the caller wants flipped. */
 		u = ifr->ifr_reqcap ^ if_getcapenable(ifp);
 
 		if (u & IFCAP_TXCSUM) {
 			if_togglecapenable(ifp, IFCAP_TXCSUM);
 			if_togglehwassist(ifp, (CSUM_TCP | CSUM_UDP | CSUM_IP));
 			
 			/* TSO is switched off together with TX checksumming. */
 			if (IFCAP_TSO & if_getcapenable(ifp) &&
 			    !(IFCAP_TXCSUM & if_getcapenable(ifp))) {
 				u &= ~IFCAP_TSO;
 				if_setcapenablebit(ifp, 0, IFCAP_TSO);
 				if_sethwassistbits(ifp, 0, CSUM_TSO);
 				if_printf(ifp,
 					 "TSO disabled due to -txcsum.\n");
 			}
 		}
 
 		if (u & IFCAP_RXCSUM)
 			if_togglecapenable(ifp, IFCAP_RXCSUM);
 
 		if (u & IFCAP_TSO4) {
 			if_togglecapenable(ifp, IFCAP_TSO4);
 
 			if (IFCAP_TSO & if_getcapenable(ifp)) {
 				if (IFCAP_TXCSUM & if_getcapenable(ifp))
 					if_sethwassistbits(ifp, CSUM_TSO, 0);
 				else {
 					/* TSO is refused while txcsum is off. */
 					if_setcapenablebit(ifp, 0, IFCAP_TSO);
 					if_sethwassistbits(ifp, 0, CSUM_TSO);
 					if_printf(ifp,
 					    "Enable txcsum first.\n");
 					rc = EAGAIN;
 				}
 			} else
 				if_sethwassistbits(ifp, 0, CSUM_TSO);
 		}
 
 		if (u & IFCAP_VLAN_HWTAGGING)
 			if_togglecapenable(ifp, IFCAP_VLAN_HWTAGGING);
 
 		if (u & IFCAP_VLAN_HWFILTER) {
 			if_togglecapenable(ifp, IFCAP_VLAN_HWFILTER);
 			oce_vid_config(sc);
 		}
 #if defined(INET6) || defined(INET)
 		if (u & IFCAP_LRO) {
 			if_togglecapenable(ifp, IFCAP_LRO);
 			if(sc->enable_hwlro) {
 				if(if_getcapenable(ifp) & IFCAP_LRO) {
 					rc = oce_mbox_nic_set_iface_lro_config(sc, 1);
 				}else {
 					rc = oce_mbox_nic_set_iface_lro_config(sc, 0);
 				}
 			}
 		}
 #endif
 
 		break;
 
 	case SIOCGI2C:
 		rc = copyin(ifr_data_get_ptr(ifr), &i2c, sizeof(i2c));
 		if (rc)
 			break;
 
 		/* A2 page data is stored after the A0 page in the dump buffer. */
 		if (i2c.dev_addr == PAGE_NUM_A0) {
 			offset = i2c.offset;
 		} else if (i2c.dev_addr == PAGE_NUM_A2) {
 			offset = TRANSCEIVER_A0_SIZE + i2c.offset;
 		} else {
 			rc = EINVAL;
 			break;
 		}
 
 		/* Reject requests that would run past either buffer. */
 		if (i2c.len > sizeof(i2c.data) ||
 		    i2c.len + offset > sizeof(sfp_vpd_dump_buffer)) {
 			rc = EINVAL;
 			break;
 		}
 
 		rc = oce_mbox_read_transrecv_data(sc, i2c.dev_addr);
 		if (rc) {
 			rc = -rc;
 			break;
 		}
 
 		memcpy(&i2c.data[0], &sfp_vpd_dump_buffer[offset], i2c.len);
 
 		rc = copyout(&i2c, ifr_data_get_ptr(ifr), sizeof(i2c));
 		break;
 
 	case SIOCGPRIVATE_0:
 		/* The pass-through interface is restricted to privileged callers. */
 		rc = priv_check(curthread, PRIV_DRIVER);
 		if (rc != 0)
 			break;
 		rc = oce_handle_passthrough(ifp, data);
 		break;
 	default:
 		rc = ether_ioctl(ifp, command, data);
 		break;
 	}
 
 	return rc;
 }
 
 /*
  * Restart the interface: while holding the device lock, deactivate and
  * then re-activate it, but only when the interface is flagged IFF_UP.
  */
 static void
 oce_init(void *arg)
 {
 	POCE_SOFTC softc = arg;
 
 	LOCK(&softc->dev_lock);
 	if ((if_getflags(softc->ifp) & IFF_UP) != 0) {
 		oce_if_deactivate(softc);
 		oce_if_activate(softc);
 	}
 	UNLOCK(&softc->dev_lock);
 }
 
 /*
  * Transmit entry point: choose a work queue for mbuf "m" and pass it to
  * oce_multiq_transmit() under that queue's TX lock.
  *
  * Packets that carry a flow hash are spread over the work queues by
  * flowid; all others use queue 0.
  *
  * Returns ENXIO when the link is down, otherwise the status reported by
  * oce_multiq_transmit().
  */
 static int
 oce_multiq_start(if_t ifp, struct mbuf *m)
 {
 	POCE_SOFTC sc = if_getsoftc(ifp);
 	struct oce_wq *wq = NULL;
 	int queue_index = 0;
 	int status = 0;
 
 	if (!sc->link_status) {
 		/*
 		 * The packet is dropped here, so free it rather than leak
 		 * it.  NOTE(review): this assumes the function is installed
 		 * as the interface transmit method, whose callers do not
 		 * free the mbuf on error; confirm in oce_attach_ifp().
 		 */
 		m_freem(m);
 		return ENXIO;
 	}
 
 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
 		queue_index = m->m_pkthdr.flowid % sc->nwqs;
 
 	wq = sc->wq[queue_index];
 
 	LOCK(&wq->tx_lock);
 	status = oce_multiq_transmit(ifp, m, wq);
 	UNLOCK(&wq->tx_lock);
 
 	return status;
 
 }
 
 /*
  * Drop every packet still waiting in the buf_ring of each work queue,
  * then flush the interface queue with if_qflush().
  */
 static void
 oce_multiq_flush(if_t ifp)
 {
 	POCE_SOFTC sc = if_getsoftc(ifp);
 	struct mbuf *pkt;
 	int q;
 
 	for (q = 0; q < sc->nwqs; q++) {
 		for (;;) {
 			pkt = buf_ring_dequeue_sc(sc->wq[q]->br);
 			if (pkt == NULL)
 				break;
 			m_freem(pkt);
 		}
 	}
 
 	if_qflush(ifp);
 }
 
 /*****************************************************************************
  *                   Driver interrupt routines functions                     *
  *****************************************************************************/
 
 /*
  * Deferred interrupt handler (run as a task) for one event queue.
  *
  * Consumes all pending event-queue entries, runs the handler of every
  * completion queue attached to the event queue, re-arms those completion
  * queues and finally re-arms the event queue.  When no entry was pending
  * only the final re-arm is performed.
  */
 static void
 oce_intr(void *arg, int pending)
 {
 	POCE_INTR_INFO info = (POCE_INTR_INFO) arg;
 	POCE_SOFTC sc = info->sc;
 	struct oce_eq *eq = info->eq;
 	struct oce_eqe *entry;
 	struct oce_cq *cq = NULL;
 	int consumed = 0;
 	int n;
 
 	bus_dmamap_sync(eq->ring->dma.tag, eq->ring->dma.map,
 	    BUS_DMASYNC_POSTWRITE);
 
 	/* Clear and count entries until one with no event is found. */
 	for (;;) {
 		entry = RING_GET_CONSUMER_ITEM_VA(eq->ring, struct oce_eqe);
 		if (entry->evnt == 0)
 			break;
 		entry->evnt = 0;
 		bus_dmamap_sync(eq->ring->dma.tag, eq->ring->dma.map,
 		    BUS_DMASYNC_POSTWRITE);
 		RING_GET(eq->ring, 1);
 		consumed++;
 	}
 
 	/* A count of zero means the interrupt was spurious. */
 	if (consumed != 0) {
 		/* Clear EQ entries, but don't arm */
 		oce_arm_eq(sc, eq->eq_id, consumed, FALSE, FALSE);
 
 		/* Process TX, RX and MCC, but don't arm the CQs yet */
 		for (n = 0; n < eq->cq_valid; n++) {
 			cq = eq->cq[n];
 			(*cq->cq_handler)(cq->cb_arg);
 		}
 
 		/* Arm all CQs connected to this EQ */
 		for (n = 0; n < eq->cq_valid; n++) {
 			cq = eq->cq[n];
 			oce_arm_cq(sc, cq->cq_id, 0, TRUE);
 		}
 	}
 
 	oce_arm_eq(sc, eq->eq_id, 0, TRUE, FALSE);
 }
 
 /*
  * Decide how many interrupt vectors to use and allocate them.
  *
  * With RSS enabled the driver asks for MAX(nrqs - 1, nwqs) vectors,
  * otherwise for one.  When RDMA is supported and more than one vector is
  * wanted, OCE_RDMA_VECTORS extra vectors are requested.  MSI-X is tried
  * first on capable devices; if that fails, or the device is not capable,
  * a single legacy interrupt is used.
  *
  * Returns 0 on success.  On failure all interrupts are released through
  * oce_intr_free() and the error code is returned.
  */
 static int
 oce_setup_intr(POCE_SOFTC sc)
 {
 	int error = 0;
 	int legacy = 0;
 	int nic_vectors;
 	int wanted, granted;
 	int idx;
 
 	if (is_rss_enabled(sc))
 		nic_vectors = MAX((sc->nrqs - 1), sc->nwqs);
 	else
 		nic_vectors = 1;
 
 	wanted = nic_vectors;
 	if ((sc->rdma_flags & OCE_RDMA_FLAG_SUPPORTED) && nic_vectors > 1) {
 		wanted += OCE_RDMA_VECTORS;
 		sc->roce_intr_count = OCE_RDMA_VECTORS;
 	}
 
 	if (!(sc->flags & OCE_FLAGS_MSIX_CAPABLE)) {
 		legacy = 1;
 	} else {
 		sc->intr_count = nic_vectors;
 		granted = wanted;
 		error = pci_alloc_msix(sc->dev, &granted);
 		if (error != 0) {
 			legacy = 1;
 			pci_release_msi(sc->dev);
 		} else {
 			if (!(sc->rdma_flags & OCE_RDMA_FLAG_SUPPORTED)) {
 				sc->intr_count = granted;
 			} else if (granted < wanted) {
 				/* Fewer vectors than requested: split them. */
 				if (sc->intr_count < (2 * OCE_RDMA_VECTORS))
 					sc->roce_intr_count = (granted / 2);
 				sc->intr_count = granted - sc->roce_intr_count;
 			}
 			sc->flags |= OCE_FLAGS_USING_MSIX;
 		}
 	}
 
 	if (legacy)
 		sc->intr_count = 1;
 
 	/* Scale number of queues based on the interrupts we got */
 	update_queues_got(sc);
 
 	if (legacy) {
 		device_printf(sc->dev, "Using legacy interrupt\n");
 		error = oce_alloc_intr(sc, 0, oce_intr);
 	} else {
 		for (idx = 0; idx < sc->intr_count; idx++) {
 			error = oce_alloc_intr(sc, idx, oce_intr);
 			if (error)
 				break;
 		}
 	}
 
 	if (error) {
 		oce_intr_free(sc);
 		return error;
 	}
 
 	return 0;
 }
 
 /*
  * Interrupt filter.  Returns FILTER_STRAY when no event queue is bound to
  * this interrupt; otherwise calls oce_arm_eq() for the event queue, queues
  * the deferred task, bumps the interrupt counter and returns
  * FILTER_HANDLED.
  */
 static int
 oce_fast_isr(void *arg)
 {
 	POCE_INTR_INFO info = (POCE_INTR_INFO) arg;
 	POCE_SOFTC softc = info->sc;
 
 	if (info->eq != NULL) {
 		oce_arm_eq(softc, info->eq->eq_id, 0, FALSE, TRUE);
 		taskqueue_enqueue(info->tq, &info->task);
 		info->eq->intr++;
 		return FILTER_HANDLED;
 	}
 
 	return FILTER_STRAY;
 }
 
 /*
  * Allocate and wire up interrupt number "vector".
  *
  * Allocates the IRQ resource, creates a fast taskqueue with one thread
  * that runs "isr" as a task, and installs oce_fast_isr() as the interrupt
  * filter.
  *
  * Returns 0 on success, EINVAL when "vector" is out of range, ENXIO when
  * the IRQ resource cannot be allocated, ENOMEM when the taskqueue cannot
  * be created, or the error from bus_setup_intr().  Resources obtained
  * before a failure stay recorded in sc->intrs[vector]; oce_intr_free()
  * releases them.
  */
 static int
 oce_alloc_intr(POCE_SOFTC sc, int vector, void (*isr) (void *arg, int pending))
 {
 	POCE_INTR_INFO ii;
 	int rc = 0, rr;
 
 	if (vector >= OCE_MAX_EQ)
 		return (EINVAL);
 
 	ii = &sc->intrs[vector];
 
 	/* Set the resource id for the interrupt.
 	 * MSIx is vector + 1 for the resource id,
 	 * INTx is 0 for the resource id.
 	 */
 	if (sc->flags & OCE_FLAGS_USING_MSIX)
 		rr = vector + 1;
 	else
 		rr = 0;
 	ii->intr_res = bus_alloc_resource_any(sc->dev,
 					      SYS_RES_IRQ,
 					      &rr, RF_ACTIVE|RF_SHAREABLE);
 	ii->irq_rr = rr;
 	if (ii->intr_res == NULL) {
 		device_printf(sc->dev,
 			  "Could not allocate interrupt\n");
 		rc = ENXIO;
 		return rc;
 	}
 
 	TASK_INIT(&ii->task, 0, isr, ii);
 	ii->vector = vector;
 	sprintf(ii->task_name, "oce_task[%d]", ii->vector);
 	ii->tq = taskqueue_create_fast(ii->task_name,
 			M_NOWAIT,
 			taskqueue_thread_enqueue,
 			&ii->tq);
 	/* An M_NOWAIT allocation may fail; never start threads on NULL. */
 	if (ii->tq == NULL) {
 		device_printf(sc->dev,
 			  "Could not create taskqueue\n");
 		return (ENOMEM);
 	}
 	taskqueue_start_threads(&ii->tq, 1, PI_NET, "%s taskq",
 			device_get_nameunit(sc->dev));
 
 	ii->sc = sc;
 	rc = bus_setup_intr(sc->dev,
 			ii->intr_res,
 			INTR_TYPE_NET,
 			oce_fast_isr, NULL, ii, &ii->tag);
 	return rc;
 
 }
 
 /*
  * Release every interrupt in sc->intrs[0 .. intr_count - 1]: tear down
  * the handler, free the taskqueue and release the IRQ resource, skipping
  * whatever was never set up.  All three handles are cleared afterwards so
  * that a repeated call is harmless.  MSI-X vectors are returned to the
  * bus when they were in use.
  */
 void
 oce_intr_free(POCE_SOFTC sc)
 {
 	int i = 0;
 
 	for (i = 0; i < sc->intr_count; i++) {
 		
 		if (sc->intrs[i].tag != NULL)
 			bus_teardown_intr(sc->dev, sc->intrs[i].intr_res,
 						sc->intrs[i].tag);
 		if (sc->intrs[i].tq != NULL)
 			taskqueue_free(sc->intrs[i].tq);
 		
 		if (sc->intrs[i].intr_res != NULL)
 			bus_release_resource(sc->dev, SYS_RES_IRQ,
 						sc->intrs[i].irq_rr,
 						sc->intrs[i].intr_res);
 		sc->intrs[i].tag = NULL;
 		/* Do not leave a dangling pointer to the freed taskqueue. */
 		sc->intrs[i].tq = NULL;
 		sc->intrs[i].intr_res = NULL;
 	}
 
 	if (sc->flags & OCE_FLAGS_USING_MSIX)
 		pci_release_msi(sc->dev);
 
 }
 
 /******************************************************************************
 *			  Media callbacks functions 			      *
 ******************************************************************************/
 
 /*
  * Media status callback: fill in "req" from the softc's link state.
  *
  * The status is always marked valid and the media type Ethernet.  When
  * the link is up, IFM_ACTIVE is set and sc->link_speed (1..7) selects the
  * reported media subtype and the value stored in sc->speed (in Mbps).
  * Unknown speed codes set sc->speed to 0 and report no subtype.
  */
 static void
 oce_media_status(if_t ifp, struct ifmediareq *req)
 {
 	POCE_SOFTC sc = (POCE_SOFTC) if_getsoftc(ifp);
 	int subtype;
 
 	req->ifm_status = IFM_AVALID;
 	req->ifm_active = IFM_ETHER;
 
 	if (sc->link_status != 1)
 		return;
 
 	req->ifm_status |= IFM_ACTIVE;
 
 	switch (sc->link_speed) {
 	case 1: /* 10 Mbps */
 		subtype = IFM_10_T;
 		sc->speed = 10;
 		break;
 	case 2: /* 100 Mbps */
 		subtype = IFM_100_TX;
 		sc->speed = 100;
 		break;
 	case 3: /* 1 Gbps */
 		subtype = IFM_1000_T;
 		sc->speed = 1000;
 		break;
 	case 4: /* 10 Gbps */
 		subtype = IFM_10G_SR;
 		sc->speed = 10000;
 		break;
 	case 5: /* 20 Gbps, reported with the 10G subtype */
 		subtype = IFM_10G_SR;
 		sc->speed = 20000;
 		break;
 	case 6: /* 25 Gbps, reported with the 10G subtype */
 		subtype = IFM_10G_SR;
 		sc->speed = 25000;
 		break;
 	case 7: /* 40 Gbps */
 		subtype = IFM_40G_SR4;
 		sc->speed = 40000;
 		break;
 	default:
 		/* Unknown code: leave ifm_active without a subtype. */
 		sc->speed = 0;
 		return;
 	}
 
 	/* Every known speed is reported as full duplex. */
 	req->ifm_active |= subtype | IFM_FDX;
 }
 
 /*
  * Media change callback.  No action is taken for a media change request;
  * the call always succeeds.  Declared static to match its prototype.
  */
 static int
 oce_media_change(if_t ifp)
 {
 	return 0;
 }
 
 /*
  * Decide whether packet "m" must also be sent to the BMC and, if so,
  * produce a duplicate for it.
  *
  * On entry *os2bmc is an in/out flag: when OS-to-BMC forwarding is
  * disabled, or the flag is already set, it is cleared and nothing else
  * is done.  Otherwise only multicast-addressed frames are examined and
  * the flag is set according to the enabled BMC filters.
  *
  * When the flag ends up TRUE, *m_new receives a copy of the packet made
  * with m_dup() and passed through oce_insert_vlan_tag().  If the copy
  * cannot be allocated the flag is cleared again.
  */
 static void oce_is_pkt_dest_bmc(POCE_SOFTC sc,
 				struct mbuf *m, boolean_t *os2bmc,
 				struct mbuf **m_new)
 {
 	struct ether_header *eh = NULL;
 
 	eh = mtod(m, struct ether_header *);
 
 	if (!is_os2bmc_enabled(sc) || *os2bmc) {
 		*os2bmc = FALSE;
 		goto done;
 	}
 	if (!ETHER_IS_MULTICAST(eh->ether_dhost))
 		goto done;
 
 	if (is_mc_allowed_on_bmc(sc, eh) ||
 	    is_bc_allowed_on_bmc(sc, eh) ||
 	    is_arp_allowed_on_bmc(sc, ntohs(eh->ether_type))) {
 		*os2bmc = TRUE;
 		goto done;
 	}
 
 	/*
 	 * NOTE(review): mtod() yields the start of the mbuf data, which was
 	 * interpreted as the Ethernet header above.  Confirm that reading the
 	 * same bytes as struct ip / struct ip6_hdr below is intended.
 	 */
 	if (mtod(m, struct ip *)->ip_p == IPPROTO_IPV6) {
 		struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
 		uint8_t nexthdr = ip6->ip6_nxt;
 		if (nexthdr == IPPROTO_ICMPV6) {
 			struct icmp6_hdr *icmp6 = (struct icmp6_hdr *)(ip6 + 1);
 			switch (icmp6->icmp6_type) {
 			case ND_ROUTER_ADVERT:
 				*os2bmc = is_ipv6_ra_filt_enabled(sc);
 				goto done;
 			case ND_NEIGHBOR_ADVERT:
 				*os2bmc = is_ipv6_na_filt_enabled(sc);
 				goto done;
 			default:
 				break;
 			}
 		}
 	}
 
 	if (mtod(m, struct ip *)->ip_p == IPPROTO_UDP) {
 		struct ip *ip = mtod(m, struct ip *);
 		int iphlen = ip->ip_hl << 2;
 		struct udphdr *uh = (struct udphdr *)((caddr_t)ip + iphlen);
 		/*
 		 * NOTE(review): uh_dport is compared as stored, without
 		 * ntohs(); confirm the byte order of the *_PORT constants.
 		 */
 		switch (uh->uh_dport) {
 		case DHCP_CLIENT_PORT:
 			*os2bmc = is_dhcp_client_filt_enabled(sc);
 			goto done;
 		case DHCP_SERVER_PORT:
 			*os2bmc = is_dhcp_srvr_filt_enabled(sc);
 			goto done;
 		case NET_BIOS_PORT1:
 		case NET_BIOS_PORT2:
 			*os2bmc = is_nbios_filt_enabled(sc);
 			goto done;
 		case DHCPV6_RAS_PORT:
 			*os2bmc = is_ipv6_ras_filt_enabled(sc);
 			goto done;
 		default:
 			break;
 		}
 	}
 done:
 	if (*os2bmc) {
 		/* The BMC gets its own copy; the original mbuf is untouched. */
 		*m_new = m_dup(m, M_NOWAIT);
 		if (!*m_new) {
 			*os2bmc = FALSE;
 			return;
 		}
 		/*
 		 * NOTE(review): the result is not checked here; confirm whether
 		 * oce_insert_vlan_tag() can return NULL and how callers cope.
 		 */
 		*m_new = oce_insert_vlan_tag(sc, *m_new, NULL);
 	}
 }
 
 /*****************************************************************************
  *			      Transmit routines				     *
  *****************************************************************************/
 
 /*
  * Post one packet on work queue wq_index.
  *
  * sc       - adapter softc.
  * mpp      - in/out pointer to the packet.  It is kept pointing at the
  *            current head of the chain whenever this function replaces it
  *            (VLAN insertion, TSO setup, defragmentation).
  * wq_index - index into sc->wq[].
  *
  * Returns 0 when the packet was handed to the hardware (the chain is then
  * referenced by the packet descriptor and freed on TX completion), and
  * also when the packet was silently dropped (longer than the MTU, or VLAN
  * insertion failed).  EINVAL is returned for a NULL packet.  EBUSY (not
  * enough free ring entries) and ENOMEM (DMA load) are returned with *mpp
  * still valid so the caller can requeue it.  On every other failure the
  * chain is freed and *mpp is set to NULL.
  *
  * NOTE(review): when the packet is also destined for the BMC the function
  * loops back to tx_start with the duplicate while *mpp still refers to the
  * first, already posted, packet; the error paths of that second pass
  * should be confirmed against the callers.
  */
 static int
 oce_tx(POCE_SOFTC sc, struct mbuf **mpp, int wq_index)
 {
 	int rc = 0, i, retry_cnt = 0;
 	bus_dma_segment_t segs[OCE_MAX_TX_ELEMENTS];
 	struct mbuf *m, *m_temp, *m_new = NULL;
 	struct oce_wq *wq = sc->wq[wq_index];
 	struct oce_packet_desc *pd;
 	struct oce_nic_hdr_wqe *nichdr;
 	struct oce_nic_frag_wqe *nicfrag;
 	struct ether_header *eh = NULL;
 	int num_wqes;
 	uint32_t reg_value;
 	boolean_t complete = TRUE;
 	boolean_t os2bmc = FALSE;
 
 	m = *mpp;
 	if (!m)
 		return EINVAL;
 
 	if (!(m->m_flags & M_PKTHDR)) {
 		rc = ENXIO;
 		goto free_ret;
 	}
 
 	/* Don't allow non-TSO packets longer than MTU */
 	if (!is_tso_pkt(m)) {
 		eh = mtod(m, struct ether_header *);
 		if(m->m_pkthdr.len > ETHER_MAX_FRAME(sc->ifp, eh->ether_type, FALSE))
 			 goto free_ret;
 	}
 
 	if(oce_tx_asic_stall_verify(sc, m)) {
 		m = oce_insert_vlan_tag(sc, m, &complete);
 		if(!m) {
 			device_printf(sc->dev, "Insertion unsuccessful\n");
 			return 0;
 		}
 		/*
 		 * The insertion may hand back a different chain head; keep
 		 * the caller's pointer in step, as the m_defrag() path does.
 		 */
 		*mpp = m;
 	}
 
 	/* Lancer, SH ASIC has a bug wherein Packets that are 32 bytes or less
 	 * may cause a transmit stall on that port. So the work-around is to
 	 * pad short packets (<= 32 bytes) to a 36-byte length.
 	*/
 	if(IS_SH(sc) || IS_XE201(sc) ) {
 		if(m->m_pkthdr.len <= 32) {
 			char buf[36];
 			bzero((void *)buf, 36);
 			m_append(m, (36 - m->m_pkthdr.len), buf);
 		}
 	}
 
 tx_start:
 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
 		/* consolidate packet buffers for TSO/LSO segment offload */
 #if defined(INET6) || defined(INET)
 		m = oce_tso_setup(sc, mpp);
 #else
 		m = NULL;
 #endif
 		if (m == NULL) {
 			rc = ENXIO;
 			goto free_ret;
 		}
 	}
 
 	pd = &wq->pckts[wq->pkt_desc_head];
 
 retry:
 	rc = bus_dmamap_load_mbuf_sg(wq->tag,
 				     pd->map,
 				     m, segs, &pd->nsegs, BUS_DMA_NOWAIT);
 	if (rc == 0) {
 		/* One header WQE plus one fragment WQE per DMA segment. */
 		num_wqes = pd->nsegs + 1;
 		if (IS_BE(sc) || IS_SH(sc)) {
 			/*Dummy required only for BE3.*/
 			if (num_wqes & 1)
 				num_wqes++;
 		}
 		if (num_wqes >= RING_NUM_FREE(wq->ring)) {
 			bus_dmamap_unload(wq->tag, pd->map);
 			return EBUSY;
 		}
 		atomic_store_rel_int(&wq->pkt_desc_head,
 				     (wq->pkt_desc_head + 1) % \
 				      OCE_WQ_PACKET_ARRAY_SIZE);
 		bus_dmamap_sync(wq->tag, pd->map, BUS_DMASYNC_PREWRITE);
 		pd->mbuf = m;
 
 		nichdr =
 		    RING_GET_PRODUCER_ITEM_VA(wq->ring, struct oce_nic_hdr_wqe);
 		nichdr->u0.dw[0] = 0;
 		nichdr->u0.dw[1] = 0;
 		nichdr->u0.dw[2] = 0;
 		nichdr->u0.dw[3] = 0;
 
 		nichdr->u0.s.complete = complete;
 		nichdr->u0.s.mgmt = os2bmc;
 		nichdr->u0.s.event = 1;
 		nichdr->u0.s.crc = 1;
 		nichdr->u0.s.forward = 0;
 		nichdr->u0.s.ipcs = (m->m_pkthdr.csum_flags & CSUM_IP) ? 1 : 0;
 		nichdr->u0.s.udpcs =
 			(m->m_pkthdr.csum_flags & CSUM_UDP) ? 1 : 0;
 		nichdr->u0.s.tcpcs =
 			(m->m_pkthdr.csum_flags & CSUM_TCP) ? 1 : 0;
 		nichdr->u0.s.num_wqe = num_wqes;
 		nichdr->u0.s.total_length = m->m_pkthdr.len;
 
 		if (m->m_flags & M_VLANTAG) {
 			nichdr->u0.s.vlan = 1; /*Vlan present*/
 			nichdr->u0.s.vlan_tag = m->m_pkthdr.ether_vtag;
 		}
 
 		if (m->m_pkthdr.csum_flags & CSUM_TSO) {
 			if (m->m_pkthdr.tso_segsz) {
 				nichdr->u0.s.lso = 1;
 				nichdr->u0.s.lso_mss  = m->m_pkthdr.tso_segsz;
 			}
 			/*
 			 * NOTE(review): this condition only fails when both
 			 * IS_BE() and IS_SH() hold; confirm whether "&&" was
 			 * intended.
 			 */
 			if (!IS_BE(sc) || !IS_SH(sc))
 				nichdr->u0.s.ipcs = 1;
 		}
 
 		RING_PUT(wq->ring, 1);
 		atomic_add_int(&wq->ring->num_used, 1);
 
 		for (i = 0; i < pd->nsegs; i++) {
 			nicfrag =
 			    RING_GET_PRODUCER_ITEM_VA(wq->ring,
 						      struct oce_nic_frag_wqe);
 			nicfrag->u0.s.rsvd0 = 0;
 			nicfrag->u0.s.frag_pa_hi = ADDR_HI(segs[i].ds_addr);
 			nicfrag->u0.s.frag_pa_lo = ADDR_LO(segs[i].ds_addr);
 			nicfrag->u0.s.frag_len = segs[i].ds_len;
 			pd->wqe_idx = wq->ring->pidx;
 			RING_PUT(wq->ring, 1);
 			atomic_add_int(&wq->ring->num_used, 1);
 		}
 		/* Emit the zeroed padding WQE added for the even-count rule. */
 		if (num_wqes > (pd->nsegs + 1)) {
 			nicfrag =
 			    RING_GET_PRODUCER_ITEM_VA(wq->ring,
 						      struct oce_nic_frag_wqe);
 			nicfrag->u0.dw[0] = 0;
 			nicfrag->u0.dw[1] = 0;
 			nicfrag->u0.dw[2] = 0;
 			nicfrag->u0.dw[3] = 0;
 			pd->wqe_idx = wq->ring->pidx;
 			RING_PUT(wq->ring, 1);
 			atomic_add_int(&wq->ring->num_used, 1);
 			pd->nsegs++;
 		}
 
 		if_inc_counter(sc->ifp, IFCOUNTER_OPACKETS, 1);
 		wq->tx_stats.tx_reqs++;
 		wq->tx_stats.tx_wrbs += num_wqes;
 		wq->tx_stats.tx_bytes += m->m_pkthdr.len;
 		wq->tx_stats.tx_pkts++;
 
 		bus_dmamap_sync(wq->ring->dma.tag, wq->ring->dma.map,
 				BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 		reg_value = (num_wqes << 16) | wq->wq_id;
 
 		/* if os2bmc is not enabled or if the pkt is already tagged as
 		   bmc, do nothing
 		 */
 		oce_is_pkt_dest_bmc(sc, m, &os2bmc, &m_new);
 
 		if_inc_counter(sc->ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len);
 		if (m->m_flags & M_MCAST)
 			if_inc_counter(sc->ifp, IFCOUNTER_OMCASTS, 1);
 		ETHER_BPF_MTAP(sc->ifp, m);
 
 		/* Ring the doorbell: WQE count in the upper half, queue id below. */
 		OCE_WRITE_REG32(sc, db, wq->db_offset, reg_value);
 
 	} else if (rc == EFBIG)	{
 		/* Too many segments: defragment once and retry the load. */
 		if (retry_cnt == 0) {
 			m_temp = m_defrag(m, M_NOWAIT);
 			if (m_temp == NULL)
 				goto free_ret;
 			m = m_temp;
 			*mpp = m_temp;
 			retry_cnt = retry_cnt + 1;
 			goto retry;
 		} else
 			goto free_ret;
 	} else if (rc == ENOMEM)
 		return rc;
 	else
 		goto free_ret;
 
 	/* Send the duplicate produced for the BMC as a second packet. */
 	if (os2bmc) {
 		m = m_new;
 		goto tx_start;
 	}
 
 	return 0;
 
 free_ret:
 	m_freem(*mpp);
 	*mpp = NULL;
 	return rc;
 }
 
 /*
  * Retire the packet descriptor at the tail of the work queue: advance the
  * tail index, give back its ring entries, unload its DMA map and free the
  * mbuf chain.  If the interface is marked OACTIVE and the ring has drained
  * below half of its capacity, clear the flag and kick the transmit task.
  */
 static void
 oce_process_tx_completion(struct oce_wq *wq)
 {
 	POCE_SOFTC sc = (POCE_SOFTC) wq->parent;
 	struct oce_packet_desc *desc = &wq->pckts[wq->pkt_desc_tail];
 
 	atomic_store_rel_int(&wq->pkt_desc_tail,
 	    (wq->pkt_desc_tail + 1) % OCE_WQ_PACKET_ARRAY_SIZE);
 	atomic_subtract_int(&wq->ring->num_used, desc->nsegs + 1);
 	bus_dmamap_sync(wq->tag, desc->map, BUS_DMASYNC_POSTWRITE);
 	bus_dmamap_unload(wq->tag, desc->map);
 
 	m_freem(desc->mbuf);
 	desc->mbuf = NULL;
 
 	/* Nothing to restart unless transmission had been throttled. */
 	if ((if_getdrvflags(sc->ifp) & IFF_DRV_OACTIVE) == 0)
 		return;
 	if (wq->ring->num_used >= (wq->ring->num_items / 2))
 		return;
 
 	if_setdrvflagbits(sc->ifp, 0, (IFF_DRV_OACTIVE));
 	oce_tx_restart(sc, wq);
 }
 
 /*
  * Schedule the work queue's transmit task on taskqueue_swi when the
  * interface is running and the queue's buf_ring still holds packets.
  */
 static void
 oce_tx_restart(POCE_SOFTC sc, struct oce_wq *wq)
 {
 	boolean_t running;
 
 	running = ((if_getdrvflags(sc->ifp) & IFF_DRV_RUNNING) ==
 	    IFF_DRV_RUNNING);
 
 	if (running && !drbr_empty(sc->ifp, wq->br))
 		taskqueue_enqueue(taskqueue_swi, &wq->txtask);
 }
 
 #if defined(INET6) || defined(INET)
 /*
  * Prepare a TSO packet: make it writable and pull the Ethernet, IP and TCP
  * headers into the first mbuf.
  *
  * A non-writable chain is replaced by a duplicate (the original is freed
  * and *mpp updated).  Returns the resulting chain head, which is also
  * stored in *mpp, or NULL when the duplicate cannot be allocated, the
  * packet is not TCP over a compiled-in IP version, or m_pullup() fails.
  * On the first two kinds of failure *mpp is left as it was.
  */
 static struct mbuf *
 oce_tso_setup(POCE_SOFTC sc, struct mbuf **mpp)
 {
 	struct mbuf *pkt = *mpp;
 	struct ether_vlan_header *evh;
 	struct tcphdr *tcp;
 	uint16_t proto;
 	int l2len = 0, hdrs_len = 0;
 
 	if (M_WRITABLE(pkt) == 0) {
 		pkt = m_dup(*mpp, M_NOWAIT);
 		if (pkt == NULL)
 			return NULL;
 		m_freem(*mpp);
 		*mpp = pkt;
 	}
 
 	/* Work out the L2 header length, allowing for one VLAN tag. */
 	evh = mtod(pkt, struct ether_vlan_header *);
 	if (evh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
 		l2len = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
 		proto = ntohs(evh->evl_proto);
 	} else {
 		l2len = ETHER_HDR_LEN;
 		proto = ntohs(evh->evl_encap_proto);
 	}
 
 	switch (proto) {
 #ifdef INET
 	case ETHERTYPE_IP: {
 		struct ip *ip4 = (struct ip *)(pkt->m_data + l2len);
 
 		if (ip4->ip_p != IPPROTO_TCP)
 			return NULL;
 		tcp = (struct tcphdr *)((caddr_t)ip4 + (ip4->ip_hl << 2));
 		hdrs_len = l2len + (ip4->ip_hl << 2) + (tcp->th_off << 2);
 		break;
 	}
 #endif
 #ifdef INET6
 	case ETHERTYPE_IPV6: {
 		struct ip6_hdr *ip6h = (struct ip6_hdr *)(pkt->m_data + l2len);
 
 		if (ip6h->ip6_nxt != IPPROTO_TCP)
 			return NULL;
 		tcp = (struct tcphdr *)((caddr_t)ip6h + sizeof(struct ip6_hdr));
 		hdrs_len = l2len + sizeof(struct ip6_hdr) + (tcp->th_off << 2);
 		break;
 	}
 #endif
 	default:
 		return NULL;
 	}
 
 	/* Make all headers contiguous; the result replaces the chain head. */
 	pkt = m_pullup(pkt, hdrs_len);
 	*mpp = pkt;
 	return pkt;
 }
 #endif /* INET6 || INET */
 
 /*
  * Taskqueue handler that resumes transmission on one work queue.
  * arg is the struct oce_wq; npending is not used.  The queue's buf_ring
  * is drained under the queue's TX lock and a failure is logged.
  */
 void
 oce_tx_task(void *arg, int npending)
 {
 	struct oce_wq *wq = arg;
 	POCE_SOFTC sc = wq->parent;
 
 	LOCK(&wq->tx_lock);
 	if (oce_multiq_transmit(sc->ifp, NULL, wq) != 0)
 		device_printf(sc->dev,
 				"TX[%d] restart failed\n", wq->queue_index);
 	UNLOCK(&wq->tx_lock);
 }
 
 /*
  * if_start handler: drain the interface send queue through work queue 0.
  *
  * Does nothing unless the interface is running, not OACTIVE, and the link
  * is up.  Stops at the first packet oce_tx() rejects; if that packet is
  * still valid it is put back at the head of the send queue and the
  * interface is marked OACTIVE.
  */
 void
 oce_start(if_t ifp)
 {
 	POCE_SOFTC sc = if_getsoftc(ifp);
 	const int txq = 0;	/* the default TX queue is queue 0 */
 	struct mbuf *pkt;
 	int err;
 
 	if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
 	    IFF_DRV_RUNNING || !sc->link_status)
 		return;
 
 	while ((pkt = if_dequeue(sc->ifp)) != NULL) {
 		LOCK(&sc->wq[txq]->tx_lock);
 		err = oce_tx(sc, &pkt, txq);
 		UNLOCK(&sc->wq[txq]->tx_lock);
 		if (err == 0)
 			continue;
 
 		/* oce_tx() left the packet intact: requeue and throttle. */
 		if (pkt != NULL) {
 			sc->wq[txq]->tx_stats.tx_stops ++;
 			if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, 0);
 			if_sendq_prepend(ifp, pkt);
 			pkt = NULL;
 		}
 		break;
 	}
 }
 
 /*
  * Service the completion queue of a transmit work queue.
  *
  * arg is the struct oce_wq.  Every completion entry whose fourth dword is
  * non-zero is byte swapped, used to advance the work queue's consumer
  * index, retired through oce_process_tx_completion() and then cleared.
  * The completion queue is re-armed with the number of entries consumed,
  * which is also the return value.
  */
 uint16_t
 oce_wq_handler(void *arg)
 {
 	struct oce_wq *wq = (struct oce_wq *)arg;
 	POCE_SOFTC sc = wq->parent;
 	struct oce_cq *cq = wq->cq;
 	struct oce_nic_tx_cqe *entry;
 	int handled;
 
 	LOCK(&wq->tx_compl_lock);
 	bus_dmamap_sync(cq->ring->dma.tag,
 			cq->ring->dma.map, BUS_DMASYNC_POSTWRITE);
 	entry = RING_GET_CONSUMER_ITEM_VA(cq->ring, struct oce_nic_tx_cqe);
 
 	for (handled = 0; entry->u0.dw[3] != 0; handled++) {
 		DW_SWAP((uint32_t *) entry, sizeof(oce_wq_cqe));
 
 		/* The WQ consumer index follows the completed WQE, wrapping. */
 		wq->ring->cidx = entry->u0.s.wqe_index + 1;
 		if (wq->ring->cidx >= wq->ring->num_items)
 			wq->ring->cidx -= wq->ring->num_items;
 
 		oce_process_tx_completion(wq);
 		wq->tx_stats.tx_compl++;
 		entry->u0.dw[3] = 0;
 
 		RING_GET(cq->ring, 1);
 		bus_dmamap_sync(cq->ring->dma.tag,
 				cq->ring->dma.map, BUS_DMASYNC_POSTWRITE);
 		entry = RING_GET_CONSUMER_ITEM_VA(cq->ring,
 		    struct oce_nic_tx_cqe);
 	}
 
 	if (handled != 0)
 		oce_arm_cq(sc, cq->cq_id, handled, FALSE);
 
 	UNLOCK(&wq->tx_compl_lock);
 	return handled;
 }
 
 /*
  * Enqueue m (if any) on the work queue's buf_ring and transmit as much of
  * the ring as the hardware accepts.
  *
  * When the interface is not running or is OACTIVE the packet is only
  * enqueued.  Returns the drbr_enqueue() status when enqueueing fails or is
  * the only action taken, otherwise 0.  A packet that oce_tx() rejects but
  * leaves intact is put back on the ring and the interface is marked
  * OACTIVE; a packet oce_tx() consumed on error is simply dropped from the
  * ring.  Either way the drain loop stops at the first failure.
  */
 static int
 oce_multiq_transmit(if_t ifp, struct mbuf *m, struct oce_wq *wq)
 {
 	POCE_SOFTC sc = if_getsoftc(ifp);
 	struct buf_ring *ring = wq->br;
 	int qidx = wq->queue_index;
 	struct mbuf *head;
 	int err;
 
 	if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
 	    IFF_DRV_RUNNING)
 		return (m != NULL) ? drbr_enqueue(ifp, ring, m) : 0;
 
 	if (m != NULL) {
 		err = drbr_enqueue(ifp, ring, m);
 		if (err != 0)
 			return err;
 	}
 
 	for (;;) {
 		head = drbr_peek(ifp, ring);
 		if (head == NULL)
 			break;
 
 		if (oce_tx(sc, &head, qidx) == 0) {
 			drbr_advance(ifp, ring);
 			continue;
 		}
 
 		if (head == NULL) {
 			/* oce_tx() freed the packet; just drop the slot. */
 			drbr_advance(ifp, ring);
 		} else {
 			drbr_putback(ifp, ring, head);
 			wq->tx_stats.tx_stops ++;
 			if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, 0);
 		}
 		break;
 	}
 
 	return 0;
 }
 
 /*****************************************************************************
  *			      Receive routines				     *
  *****************************************************************************/
 
 static void
 oce_correct_header(struct mbuf *m, struct nic_hwlro_cqe_part1 *cqe1, struct nic_hwlro_cqe_part2 *cqe2)
 {
 	uint32_t *p;
         struct ether_header *eh = NULL;
         struct tcphdr *tcp_hdr = NULL;
         struct ip *ip4_hdr = NULL;
         struct ip6_hdr *ip6 = NULL;
         uint32_t payload_len = 0;
 
         eh = mtod(m, struct ether_header *);
         /* correct IP header */
         if(!cqe2->ipv6_frame) {
 		ip4_hdr = (struct ip *)((char*)eh + sizeof(struct ether_header));
                 ip4_hdr->ip_ttl = cqe2->frame_lifespan;
                 ip4_hdr->ip_len = htons(cqe2->coalesced_size - sizeof(struct ether_header));
                 tcp_hdr = (struct tcphdr *)((char*)ip4_hdr + sizeof(struct ip));
         }else {
         	ip6 = (struct ip6_hdr *)((char*)eh + sizeof(struct ether_header));
                 ip6->ip6_ctlun.ip6_un1.ip6_un1_hlim = cqe2->frame_lifespan;
                 payload_len = cqe2->coalesced_size - sizeof(struct ether_header)
                                                 - sizeof(struct ip6_hdr);
                 ip6->ip6_ctlun.ip6_un1.ip6_un1_plen = htons(payload_len);
                 tcp_hdr = (struct tcphdr *)((char*)ip6 + sizeof(struct ip6_hdr));
         }
 
         /* correct tcp header */
         tcp_hdr->th_ack = htonl(cqe2->tcp_ack_num);
         if(cqe2->push) {
         	tcp_hdr->th_flags |= TH_PUSH;
         }
         tcp_hdr->th_win = htons(cqe2->tcp_window);
         tcp_hdr->th_sum = 0xffff;
         if(cqe2->ts_opt) {
                 p = (uint32_t *)((char*)tcp_hdr + sizeof(struct tcphdr) + 2);
                 *p = cqe1->tcp_timestamp_val;
                 *(p+1) = cqe1->tcp_timestamp_ecr;
         }
 
 	return;
 }
 
 static void
 oce_rx_mbuf_chain(struct oce_rq *rq, struct oce_common_cqe_info *cqe_info, struct mbuf **m)
 {
 	POCE_SOFTC sc = (POCE_SOFTC) rq->parent;
         uint32_t i = 0, frag_len = 0;
 	uint32_t len = cqe_info->pkt_size;
         struct oce_packet_desc *pd;
         struct mbuf *tail = NULL;
 
         for (i = 0; i < cqe_info->num_frags; i++) {
                 if (rq->ring->cidx == rq->ring->pidx) {
                         device_printf(sc->dev,
                                   "oce_rx_mbuf_chain: Invalid RX completion - Queue is empty\n");
                         return;
                 }
                 pd = &rq->pckts[rq->ring->cidx];
 
                 bus_dmamap_sync(rq->tag, pd->map, BUS_DMASYNC_POSTWRITE);
                 bus_dmamap_unload(rq->tag, pd->map);
 		RING_GET(rq->ring, 1);
                 rq->pending--;
 
                 frag_len = (len > rq->cfg.frag_size) ? rq->cfg.frag_size : len;
                 pd->mbuf->m_len = frag_len;
 
                 if (tail != NULL) {
                         /* additional fragments */
                         pd->mbuf->m_flags &= ~M_PKTHDR;
                         tail->m_next = pd->mbuf;
 			if(rq->islro)
                         	tail->m_nextpkt = NULL;
                         tail = pd->mbuf;
                 } else {
                         /* first fragment, fill out much of the packet header */
                         pd->mbuf->m_pkthdr.len = len;
 			if(rq->islro)
                         	pd->mbuf->m_nextpkt = NULL;
                         pd->mbuf->m_pkthdr.csum_flags = 0;
                         if (IF_CSUM_ENABLED(sc)) {
                                 if (cqe_info->l4_cksum_pass) {
                                         if(!cqe_info->ipv6_frame) { /* IPV4 */
                                                 pd->mbuf->m_pkthdr.csum_flags |=
                                                         (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
                                         }else { /* IPV6 frame */
 						if(rq->islro) {
                                                 	pd->mbuf->m_pkthdr.csum_flags |=
                                                         (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
 						}
                                         }
                                         pd->mbuf->m_pkthdr.csum_data = 0xffff;
                                 }
                                 if (cqe_info->ip_cksum_pass) {
                                         pd->mbuf->m_pkthdr.csum_flags |=
                                                (CSUM_IP_CHECKED|CSUM_IP_VALID);
                                 }
                         }
                         *m = tail = pd->mbuf;
                }
                 pd->mbuf = NULL;
                 len -= frag_len;
         }
 
         return;
 }
 
 static void
 oce_rx_lro(struct oce_rq *rq, struct nic_hwlro_singleton_cqe *cqe, struct nic_hwlro_cqe_part2 *cqe2)
 {
         POCE_SOFTC sc = (POCE_SOFTC) rq->parent;
         struct nic_hwlro_cqe_part1 *cqe1 = NULL;
         struct mbuf *m = NULL;
 	struct oce_common_cqe_info cq_info;
 
 	/* parse cqe */
         if(cqe2 == NULL) {
                 cq_info.pkt_size =  cqe->pkt_size;
                 cq_info.vtag = cqe->vlan_tag;
                 cq_info.l4_cksum_pass = cqe->l4_cksum_pass;
                 cq_info.ip_cksum_pass = cqe->ip_cksum_pass;
                 cq_info.ipv6_frame = cqe->ipv6_frame;
                 cq_info.vtp = cqe->vtp;
                 cq_info.qnq = cqe->qnq;
         }else {
                 cqe1 = (struct nic_hwlro_cqe_part1 *)cqe;
                 cq_info.pkt_size =  cqe2->coalesced_size;
                 cq_info.vtag = cqe2->vlan_tag;
                 cq_info.l4_cksum_pass = cqe2->l4_cksum_pass;
                 cq_info.ip_cksum_pass = cqe2->ip_cksum_pass;
                 cq_info.ipv6_frame = cqe2->ipv6_frame;
                 cq_info.vtp = cqe2->vtp;
                 cq_info.qnq = cqe1->qnq;
         }
         
 	cq_info.vtag = BSWAP_16(cq_info.vtag);
 
         cq_info.num_frags = cq_info.pkt_size / rq->cfg.frag_size;
         if(cq_info.pkt_size % rq->cfg.frag_size)
                 cq_info.num_frags++;
 
 	oce_rx_mbuf_chain(rq, &cq_info, &m);
 
 	if (m) {
 		if(cqe2) {
 			//assert(cqe2->valid != 0);
 			
 			//assert(cqe2->cqe_type != 2);
 			oce_correct_header(m, cqe1, cqe2);
 		}
 
 		m->m_pkthdr.rcvif = sc->ifp;
 		if (rq->queue_index)
 			m->m_pkthdr.flowid = (rq->queue_index - 1);
 		else
 			m->m_pkthdr.flowid = rq->queue_index;
 		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
 
 		/* This deternies if vlan tag is Valid */
 		if (cq_info.vtp) {
 			if (sc->function_mode & FNM_FLEX10_MODE) {
 				/* FLEX10. If QnQ is not set, neglect VLAN */
 				if (cq_info.qnq) {
 					m->m_pkthdr.ether_vtag = cq_info.vtag;
 					m->m_flags |= M_VLANTAG;
 				}
 			} else if (sc->pvid != (cq_info.vtag & VLAN_VID_MASK))  {
 				/* In UMC mode generally pvid will be striped by
 				   hw. But in some cases we have seen it comes
 				   with pvid. So if pvid == vlan, neglect vlan.
 				 */
 				m->m_pkthdr.ether_vtag = cq_info.vtag;
 				m->m_flags |= M_VLANTAG;
 			}
 		}
 		if_inc_counter(sc->ifp, IFCOUNTER_IPACKETS, 1);
 		
 		if_input(sc->ifp, m);
 
 		/* Update rx stats per queue */
 		rq->rx_stats.rx_pkts++;
 		rq->rx_stats.rx_bytes += cq_info.pkt_size;
 		rq->rx_stats.rx_frags += cq_info.num_frags;
 		rq->rx_stats.rx_ucast_pkts++;
 	}
         return;
 }
 
 static void
 oce_rx(struct oce_rq *rq, struct oce_nic_rx_cqe *cqe)
 {
 	POCE_SOFTC sc = (POCE_SOFTC) rq->parent;
 	int len;
 	struct mbuf *m = NULL;
 	struct oce_common_cqe_info cq_info;
 	uint16_t vtag = 0;
 
 	/* Is it a flush compl that has no data */
 	if(!cqe->u0.s.num_fragments)
 		goto exit;
 
 	len = cqe->u0.s.pkt_size;
 	if (!len) {
 		/*partial DMA workaround for Lancer*/
 		oce_discard_rx_comp(rq, cqe->u0.s.num_fragments);
 		goto exit;
 	}
 
 	if (!oce_cqe_portid_valid(sc, cqe)) {
 		oce_discard_rx_comp(rq, cqe->u0.s.num_fragments);
 		goto exit;
 	}
 
 	 /* Get vlan_tag value */
 	if(IS_BE(sc) || IS_SH(sc))
 		vtag = BSWAP_16(cqe->u0.s.vlan_tag);
 	else
 		vtag = cqe->u0.s.vlan_tag;
 
 	cq_info.l4_cksum_pass = cqe->u0.s.l4_cksum_pass;
 	cq_info.ip_cksum_pass = cqe->u0.s.ip_cksum_pass;
 	cq_info.ipv6_frame = cqe->u0.s.ip_ver;
 	cq_info.num_frags = cqe->u0.s.num_fragments;
 	cq_info.pkt_size = cqe->u0.s.pkt_size;
 
 	oce_rx_mbuf_chain(rq, &cq_info, &m);
 
 	if (m) {
 		m->m_pkthdr.rcvif = sc->ifp;
 		if (rq->queue_index)
 			m->m_pkthdr.flowid = (rq->queue_index - 1);
 		else
 			m->m_pkthdr.flowid = rq->queue_index;
 		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
 
 		/* This deternies if vlan tag is Valid */
 		if (oce_cqe_vtp_valid(sc, cqe)) { 
 			if (sc->function_mode & FNM_FLEX10_MODE) {
 				/* FLEX10. If QnQ is not set, neglect VLAN */
 				if (cqe->u0.s.qnq) {
 					m->m_pkthdr.ether_vtag = vtag;
 					m->m_flags |= M_VLANTAG;
 				}
 			} else if (sc->pvid != (vtag & VLAN_VID_MASK))  {
 				/* In UMC mode generally pvid will be striped by
 				   hw. But in some cases we have seen it comes
 				   with pvid. So if pvid == vlan, neglect vlan.
 				*/
 				m->m_pkthdr.ether_vtag = vtag;
 				m->m_flags |= M_VLANTAG;
 			}
 		}
 
 		if_inc_counter(sc->ifp, IFCOUNTER_IPACKETS, 1);
 #if defined(INET6) || defined(INET)
 		/* Try to queue to LRO */
 		if (IF_LRO_ENABLED(sc) &&
 		    (cqe->u0.s.ip_cksum_pass) &&
 		    (cqe->u0.s.l4_cksum_pass) &&
 		    (!cqe->u0.s.ip_ver)       &&
 		    (rq->lro.lro_cnt != 0)) {
 			if (tcp_lro_rx(&rq->lro, m, 0) == 0) {
 				rq->lro_pkts_queued ++;		
 				goto post_done;
 			}
 			/* If LRO posting fails then try to post to STACK */
 		}
 #endif
 
 		if_input(sc->ifp, m);
 #if defined(INET6) || defined(INET)
 post_done:
 #endif
 		/* Update rx stats per queue */
 		rq->rx_stats.rx_pkts++;
 		rq->rx_stats.rx_bytes += cqe->u0.s.pkt_size;
 		rq->rx_stats.rx_frags += cqe->u0.s.num_fragments;
 		if (cqe->u0.s.pkt_type == OCE_MULTICAST_PACKET)
 			rq->rx_stats.rx_mcast_pkts++;
 		if (cqe->u0.s.pkt_type == OCE_UNICAST_PACKET)
 			rq->rx_stats.rx_ucast_pkts++;
 	}
 exit:
 	return;
 }
 
 /*
  * Drop the receive buffers belonging to a completion that will not be
  * delivered: for each of num_frags ring entries the DMA map is unloaded,
  * the mbuf (if any) is freed and the ring consumer index is advanced.
  * Stops early, with a log message, if the ring is found empty.
  */
 void
 oce_discard_rx_comp(struct oce_rq *rq, int num_frags)
 {
 	POCE_SOFTC sc = (POCE_SOFTC) rq->parent;
 	struct oce_packet_desc *desc;
 	uint32_t done;
 
 	for (done = 0; done < num_frags; done++) {
 		if (rq->ring->cidx == rq->ring->pidx) {
 			device_printf(sc->dev,
                                 "oce_discard_rx_comp: Invalid RX completion - Queue is empty\n");
 			return;
 		}
 
 		desc = &rq->pckts[rq->ring->cidx];
 		bus_dmamap_sync(rq->tag, desc->map, BUS_DMASYNC_POSTWRITE);
 		bus_dmamap_unload(rq->tag, desc->map);
 		if (desc->mbuf != NULL) {
 			m_freem(desc->mbuf);
 			desc->mbuf = NULL;
 		}
 
 		RING_GET(rq->ring, 1);
 		rq->pending--;
 	}
 }
 
 /*
  * Return the "VLAN tag present" bit of a receive completion, reading it
  * through the v1 completion layout when the adapter is in BE3 native mode
  * and through the legacy layout otherwise.
  */
 static int
 oce_cqe_vtp_valid(POCE_SOFTC sc, struct oce_nic_rx_cqe *cqe)
 {
 	int present;
 
 	if (sc->be3_native)
 		present = ((struct oce_nic_rx_cqe_v1 *)cqe)->u0.s.vlan_tag_present;
 	else
 		present = cqe->u0.s.vlan_tag_present;
 
 	return present;
 }
 
 /*
  * Return 1 when a receive completion belongs to this port, 0 otherwise.
  * The port field is only checked in BE3 native mode on BE/SH adapters;
  * for BE3 legacy and Lancer every completion is accepted.
  */
 static int
 oce_cqe_portid_valid(POCE_SOFTC sc, struct oce_nic_rx_cqe *cqe)
 {
 	if (sc->be3_native && (IS_BE(sc) || IS_SH(sc))) {
 		int cqe_port = ((struct oce_nic_rx_cqe_v1 *)cqe)->u0.s.port;
 
 		if (sc->port_id != cqe_port)
 			return 0;
 	}
 
 	return 1;
 }
 
 #if defined(INET6) || defined(INET)
 /*
  * Flush every packet held by the receive queue's software LRO context and
  * reset the queued-packet counter.  No-op when LRO is disabled.
  */
 void
 oce_rx_flush_lro(struct oce_rq *rq)
 {
 	POCE_SOFTC sc = (POCE_SOFTC) rq->parent;
 
 	if (IF_LRO_ENABLED(sc)) {
 		tcp_lro_flush_all(&rq->lro);
 		rq->lro_pkts_queued = 0;
 	}
 }
 
 /*
  * Initialise the software LRO context of every receive queue and bind it
  * to the interface.  Returns 0 on success, or the tcp_lro_init() error of
  * the first queue that fails; contexts initialised before the failure are
  * left as they are.
  */
 static int
 oce_init_lro(POCE_SOFTC sc)
 {
 	int q, err;
 
 	for (q = 0; q < sc->nrqs; q++) {
 		struct lro_ctrl *ctx = &sc->rq[q]->lro;
 
 		err = tcp_lro_init(ctx);
 		if (err != 0) {
 			device_printf(sc->dev, "LRO init failed\n");
 			return err;
 		}
 		ctx->ifp = sc->ifp;
 	}
 
 	return 0;
 }
 
 /*
  * Release the software LRO context of every receive queue.
  */
 void
 oce_free_lro(POCE_SOFTC sc)
 {
 	int q;
 
 	for (q = 0; q < sc->nrqs; q++) {
 		struct lro_ctrl *ctx = &sc->rq[q]->lro;
 
 		if (ctx != NULL)
 			tcp_lro_free(ctx);
 	}
 }
 #endif
 
 /*
  * Post up to 'count' fresh receive buffers on rq and tell the hardware.
  *
  * Each buffer is a jumbo-cluster mbuf of oce_rq_buf_size bytes, DMA-loaded
  * into the descriptor at the ring's producer index.  Allocation or load
  * failure stops the refill early.  The buffers actually added are then
  * announced through the doorbell in batches of at most
  * OCE_MAX_RQ_POSTS (OCE_HWLRO_MAX_RQ_POSTS with hardware LRO), followed by
  * one write for the remainder.  Always returns 0.
  *
  * Fixes relative to the previous version:
  *  - the LRO doorbell word is rebuilt for every write instead of being
  *    OR-accumulated, so the remainder write no longer carries the batch
  *    count bits;
  *  - pd->mbuf is cleared after a failed load so no freed pointer is left
  *    in the descriptor;
  *  - a load that yields more than one segment now unloads the map and
  *    frees the mbuf before retrying the slot.
  */
 int
 oce_alloc_rx_bufs(struct oce_rq *rq, int count)
 {
 	POCE_SOFTC sc = (POCE_SOFTC) rq->parent;
 	int i, rc;
 	struct oce_packet_desc *pd;
 	bus_dma_segment_t segs[6];
 	int nsegs, added = 0;
 	struct oce_nic_rqe *rqe;
 	pd_rxulp_db_t rxdb_reg;
 	uint32_t val = 0;
 	uint32_t oce_max_rq_posts;
 
 	bzero(&rxdb_reg, sizeof(pd_rxulp_db_t));
 	for (i = 0; i < count; i++) {
 		pd = &rq->pckts[rq->ring->pidx];
 		pd->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, oce_rq_buf_size);
 		if (pd->mbuf == NULL) {
 			device_printf(sc->dev, "mbuf allocation failed, size = %d\n",oce_rq_buf_size);
 			break;
 		}
 		pd->mbuf->m_nextpkt = NULL;
 
 		pd->mbuf->m_len = pd->mbuf->m_pkthdr.len = rq->cfg.frag_size;
 
 		rc = bus_dmamap_load_mbuf_sg(rq->tag,
 					     pd->map,
 					     pd->mbuf,
 					     segs, &nsegs, BUS_DMA_NOWAIT);
 		if (rc) {
 			m_free(pd->mbuf);
 			pd->mbuf = NULL;
 			device_printf(sc->dev, "bus_dmamap_load_mbuf_sg failed rc = %d\n", rc);
 			break;
 		}
 
 		if (nsegs != 1) {
 			/*
 			 * A receive buffer must be a single segment.  Release
 			 * the map and the mbuf, then retry the same slot.
 			 * NOTE(review): a persistent multi-segment result
 			 * would retry forever; confirm the DMA tag limits
 			 * loads to one segment.
 			 */
 			bus_dmamap_unload(rq->tag, pd->map);
 			m_free(pd->mbuf);
 			pd->mbuf = NULL;
 			i--;
 			continue;
 		}
 
 		bus_dmamap_sync(rq->tag, pd->map, BUS_DMASYNC_PREREAD);
 
 		rqe = RING_GET_PRODUCER_ITEM_VA(rq->ring, struct oce_nic_rqe);
 		rqe->u0.s.frag_pa_hi = ADDR_HI(segs[0].ds_addr);
 		rqe->u0.s.frag_pa_lo = ADDR_LO(segs[0].ds_addr);
 		DW_SWAP(u32ptr(rqe), sizeof(struct oce_nic_rqe));
 		RING_PUT(rq->ring, 1);
 		added++;
 		rq->pending++;
 	}
 	oce_max_rq_posts = sc->enable_hwlro ? OCE_HWLRO_MAX_RQ_POSTS : OCE_MAX_RQ_POSTS;
 	if (added != 0) {
 		/* Full batches first. */
 		for (i = added / oce_max_rq_posts; i > 0; i--) {
 			rxdb_reg.bits.num_posted = oce_max_rq_posts;
 			rxdb_reg.bits.qid = rq->rq_id;
 			if(rq->islro) {
 				val = (rq->rq_id & DB_LRO_RQ_ID_MASK) |
 				    (oce_max_rq_posts << 16);
 				OCE_WRITE_REG32(sc, db, DB_OFFSET, val);
 			}else {
 				OCE_WRITE_REG32(sc, db, PD_RXULP_DB, rxdb_reg.dw0);
 			}
 			added -= oce_max_rq_posts;
 		}
 		/* Then whatever is left over. */
 		if (added > 0) {
 			rxdb_reg.bits.qid = rq->rq_id;
 			rxdb_reg.bits.num_posted = added;
 			if(rq->islro) {
 				val = (rq->rq_id & DB_LRO_RQ_ID_MASK) |
 				    (added << 16);
 				OCE_WRITE_REG32(sc, db, DB_OFFSET, val);
 			}else {
 				OCE_WRITE_REG32(sc, db, PD_RXULP_DB, rxdb_reg.dw0);
 			}
 		}
 	}
 
 	return 0;
 }
 
 /*
  * After num_cqes receive completions have been consumed, re-arm the
  * completion queue and top the receive ring up again.  Without hardware
  * LRO the ring is refilled to one short of its capacity; with hardware LRO
  * 64 buffers are posted once more than 64 slots are free.  Does nothing
  * when num_cqes is zero.
  */
 static void
 oce_check_rx_bufs(POCE_SOFTC sc, uint32_t num_cqes, struct oce_rq *rq)
 {
 	if (num_cqes == 0)
 		return;
 
 	oce_arm_cq(sc, rq->cq->cq_id, num_cqes, FALSE);
 
 	if (!sc->enable_hwlro) {
 		if ((OCE_RQ_PACKET_ARRAY_SIZE - rq->pending) > 1)
 			oce_alloc_rx_bufs(rq, ((OCE_RQ_PACKET_ARRAY_SIZE - rq->pending) - 1));
 	} else if ((OCE_RQ_PACKET_ARRAY_SIZE -1 - rq->pending) > 64) {
 		oce_alloc_rx_bufs(rq, 64);
 	}
 }
 
 /*
  * Service the completion queue of a hardware-LRO receive queue.
  *
  * arg is the struct oce_rq.  Valid completion entries are consumed under
  * the queue's RX lock.  Three entry types are handled: singleton (type 0),
  * and the first (type 1) and second (type 2) halves of a two-part
  * completion.  The first half is only remembered in rq->cqe_firstpart; the
  * packet is delivered when the matching second half arrives.
  *
  * Processing stops when no valid entry is left or the per-call budget
  * (8 on XE201, otherwise oce_max_rsp_handled) is reached; the completion
  * queue is then re-armed and receive buffers replenished through
  * oce_check_rx_bufs().  An out-of-sequence entry is logged and makes the
  * handler leave immediately, skipping that re-arm/refill step.
  *
  * Always returns 0.
  */
 uint16_t
 oce_rq_handler_lro(void *arg)
 {
         struct oce_rq *rq = (struct oce_rq *)arg;
         struct oce_cq *cq = rq->cq;
         POCE_SOFTC sc = rq->parent;
         struct nic_hwlro_singleton_cqe *cqe;
         struct nic_hwlro_cqe_part2 *cqe2;
         int num_cqes = 0;
 
 	LOCK(&rq->rx_lock);
         bus_dmamap_sync(cq->ring->dma.tag,cq->ring->dma.map, BUS_DMASYNC_POSTWRITE);
         cqe = RING_GET_CONSUMER_ITEM_VA(cq->ring, struct nic_hwlro_singleton_cqe);
         while (cqe->valid) {
                 if(cqe->cqe_type == 0) { /* singleton cqe */
 			/* we should not get singleton cqe after cqe1 on same rq */
 			if(rq->cqe_firstpart != NULL) {
 				device_printf(sc->dev, "Got singleton cqe after cqe1 \n");
 				goto exit_rq_handler_lro;
 			}							
                         /* An errored completion is counted but still delivered. */
                         if(cqe->error != 0) {
                                 rq->rx_stats.rxcp_err++;
 				if_inc_counter(sc->ifp, IFCOUNTER_IERRORS, 1);
                         }
                         oce_rx_lro(rq, cqe, NULL);
                         rq->rx_stats.rx_compl++;
                         cqe->valid = 0;
                         RING_GET(cq->ring, 1);
                         num_cqes++;
                         if (num_cqes >= (IS_XE201(sc) ? 8 : oce_max_rsp_handled))
                                 break;
                 }else if(cqe->cqe_type == 0x1) { /* first part */
 			/* we should not get cqe1 after cqe1 on same rq */
 			if(rq->cqe_firstpart != NULL) {
 				device_printf(sc->dev, "Got cqe1 after cqe1 \n");
 				goto exit_rq_handler_lro;
 			}
 			/*
 			 * Remember the first half; it stays valid and is not
 			 * counted until its second half has been processed.
 			 */
 			rq->cqe_firstpart = (struct nic_hwlro_cqe_part1 *)cqe;
                         RING_GET(cq->ring, 1);
                 }else if(cqe->cqe_type == 0x2) { /* second part */
 			cqe2 = (struct nic_hwlro_cqe_part2 *)cqe;
                         if(cqe2->error != 0) {
                                 rq->rx_stats.rxcp_err++;
 				if_inc_counter(sc->ifp, IFCOUNTER_IERRORS, 1);
                         }
 			/* We should not get cqe2 without cqe1 */
 			if(rq->cqe_firstpart == NULL) {
 				device_printf(sc->dev, "Got cqe2 without cqe1 \n");
 				goto exit_rq_handler_lro;
 			}
                         oce_rx_lro(rq, (struct nic_hwlro_singleton_cqe *)rq->cqe_firstpart, cqe2);
 
                         rq->rx_stats.rx_compl++;
                         /* Both halves are released together. */
                         rq->cqe_firstpart->valid = 0;
                         cqe2->valid = 0;
 			rq->cqe_firstpart = NULL;
 
                         RING_GET(cq->ring, 1);
                         num_cqes += 2;
                         if (num_cqes >= (IS_XE201(sc) ? 8 : oce_max_rsp_handled))
                                 break;
 		}
 
                 bus_dmamap_sync(cq->ring->dma.tag,cq->ring->dma.map, BUS_DMASYNC_POSTWRITE);
                 cqe = RING_GET_CONSUMER_ITEM_VA(cq->ring, struct nic_hwlro_singleton_cqe);
         }
 	oce_check_rx_bufs(sc, num_cqes, rq);
 exit_rq_handler_lro:
 	UNLOCK(&rq->rx_lock);
 	return 0;
 }
 
 /* Handle the Completion Queue for receive */
 uint16_t
 oce_rq_handler(void *arg)
 {
 	struct epoch_tracker et;
 	struct oce_rq *rq = (struct oce_rq *)arg;
 	struct oce_cq *cq = rq->cq;
 	POCE_SOFTC sc = rq->parent;
 	struct oce_nic_rx_cqe *cqe;
 	int num_cqes = 0;
 
 	NET_EPOCH_ENTER(et);
 	if(rq->islro) {
 		oce_rq_handler_lro(arg);
 		NET_EPOCH_EXIT(et);
 		return 0;
 	}
 	LOCK(&rq->rx_lock);
 	bus_dmamap_sync(cq->ring->dma.tag,
 			cq->ring->dma.map, BUS_DMASYNC_POSTWRITE);
 	cqe = RING_GET_CONSUMER_ITEM_VA(cq->ring, struct oce_nic_rx_cqe);
 	while (cqe->u0.dw[2]) {
 		DW_SWAP((uint32_t *) cqe, sizeof(oce_rq_cqe));
 
 		if (cqe->u0.s.error == 0) {
 			oce_rx(rq, cqe);
 		} else {
 			rq->rx_stats.rxcp_err++;
 			if_inc_counter(sc->ifp, IFCOUNTER_IERRORS, 1);
 			/* Post L3/L4 errors to stack.*/
 			oce_rx(rq, cqe);
 		}
 		rq->rx_stats.rx_compl++;
 		cqe->u0.dw[2] = 0;
 
 #if defined(INET6) || defined(INET)
 		if (IF_LRO_ENABLED(sc) && rq->lro_pkts_queued >= 16) {
 			oce_rx_flush_lro(rq);
 		}
 #endif
 
 		RING_GET(cq->ring, 1);
 		bus_dmamap_sync(cq->ring->dma.tag,
 				cq->ring->dma.map, BUS_DMASYNC_POSTWRITE);
 		cqe =
 		    RING_GET_CONSUMER_ITEM_VA(cq->ring, struct oce_nic_rx_cqe);
 		num_cqes++;
 		if (num_cqes >= (IS_XE201(sc) ? 8 : oce_max_rsp_handled))
 			break;
 	}
 
 #if defined(INET6) || defined(INET)
         if (IF_LRO_ENABLED(sc))
                 oce_rx_flush_lro(rq);
 #endif
 
 	oce_check_rx_bufs(sc, num_cqes, rq);
 	UNLOCK(&rq->rx_lock);
 	NET_EPOCH_EXIT(et);
 	return 0;
 
 }
 
 /*****************************************************************************
  *		   Helper function prototypes in this file 		     *
  *****************************************************************************/
 
 /*
  * Allocate the Ethernet ifnet for this adapter, install the driver entry
  * points, advertise its offload capabilities and attach it to the
  * Ethernet layer using sc->macaddr.mac_addr.
  *
  * Returns 0 on success, or ENOMEM when if_alloc() yields no ifnet.
  */
 static int 
 oce_attach_ifp(POCE_SOFTC sc)
 {
 
 	sc->ifp = if_alloc(IFT_ETHER);
 	if (!sc->ifp)
 		return ENOMEM;
 
 	/* A single autoselect media entry is registered and made current. */
 	ifmedia_init(&sc->media, IFM_IMASK, oce_media_change, oce_media_status);
 	ifmedia_add(&sc->media, IFM_ETHER | IFM_AUTO, 0, NULL);
 	ifmedia_set(&sc->media, IFM_ETHER | IFM_AUTO);
 
-	if_setflags(sc->ifp, IFF_BROADCAST | IFF_MULTICAST | IFF_KNOWSEPOCH);
+	if_setflags(sc->ifp, IFF_BROADCAST | IFF_MULTICAST);
 	/* Driver entry points and softc back-pointer. */
 	if_setioctlfn(sc->ifp, oce_ioctl);
 	if_setstartfn(sc->ifp, oce_start);
 	if_setinitfn(sc->ifp, oce_init);
 	if_setmtu(sc->ifp, ETHERMTU);
 	if_setsoftc(sc->ifp, sc);
 	if_settransmitfn(sc->ifp, oce_multiq_start);
 	if_setqflushfn(sc->ifp, oce_multiq_flush);
 
 	if_initname(sc->ifp,
 		    device_get_name(sc->dev), device_get_unit(sc->dev));
 
 	if_setsendqlen(sc->ifp, OCE_MAX_TX_DESC - 1);
 	if_setsendqready(sc->ifp);
 
 	/* Checksum/TSO assistance bits. */
 	if_sethwassist(sc->ifp, OCE_IF_HWASSIST);
 	if_sethwassistbits(sc->ifp, CSUM_TSO, 0);
 	if_sethwassistbits(sc->ifp, (CSUM_IP | CSUM_TCP | CSUM_UDP), 0);
 
 	if_setcapabilities(sc->ifp, OCE_IF_CAPABILITIES);
 	if_setcapabilitiesbit(sc->ifp, IFCAP_HWCSUM, 0);
 	if_setcapabilitiesbit(sc->ifp, IFCAP_VLAN_HWFILTER, 0);
 
 	/* TSO/LRO capabilities are only advertised on INET/INET6 kernels. */
 #if defined(INET6) || defined(INET)
 	if_setcapabilitiesbit(sc->ifp, IFCAP_TSO, 0);
 	if_setcapabilitiesbit(sc->ifp, IFCAP_LRO, 0);
 	if_setcapabilitiesbit(sc->ifp, IFCAP_VLAN_HWTSO, 0);
 #endif
 
 	/* Every advertised capability starts out enabled. */
 	if_setcapenable(sc->ifp, if_getcapabilities(sc->ifp));
 	if_setbaudrate(sc->ifp, IF_Gbps(10));
 
 	/* TSO limits: the size ceiling excludes Ethernet + VLAN header bytes. */
 	if_sethwtsomax(sc->ifp, 65536 - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN));
 	if_sethwtsomaxsegcount(sc->ifp, OCE_MAX_TX_ELEMENTS);
 	if_sethwtsomaxsegsize(sc->ifp, 4096);
 
 	ether_ifattach(sc->ifp, sc->macaddr.mac_addr);
 
 	return 0;
 }
 
 /*
  * Mark VLAN id vtag as in use on this adapter and, while the number of
  * added VLANs does not exceed max_vlans + 1, reprogram the VLAN filter.
  * Calls whose ifp does not belong to arg, or whose tag is outside
  * 1..4095, are ignored.
  */
 static void
 oce_add_vlan(void *arg, if_t ifp, uint16_t vtag)
 {
 	POCE_SOFTC sc = if_getsoftc(ifp);
 
 	if (if_getsoftc(ifp) != arg || vtag == 0 || vtag > 4095)
 		return;
 
 	sc->vlan_tag[vtag] = 1;
 	sc->vlans_added++;
 
 	if (sc->vlans_added > (sc->max_vlans + 1))
 		return;
 	oce_vid_config(sc);
 }
 
 /*
  * Clear VLAN id vtag from this adapter's table, decrement the count of
  * added VLANs and reprogram the VLAN filter.  Calls whose ifp does not
  * belong to arg, or whose tag is outside 1..4095, are ignored.
  */
 static void
 oce_del_vlan(void *arg, if_t ifp, uint16_t vtag)
 {
 	POCE_SOFTC sc = if_getsoftc(ifp);
 
 	if (if_getsoftc(ifp) != arg || vtag == 0 || vtag > 4095)
 		return;
 
 	sc->vlan_tag[vtag] = 0;
 	sc->vlans_added--;
 
 	oce_vid_config(sc);
 }
 
 /*
  * Program the adapter's VLAN filter from sc->vlan_tag[].
  *
  * At most MAX_VLANFILTER_SIZE VLANs can be filtered individually.  If the
  * user has configured more, or hardware VLAN filtering is disabled on the
  * interface, the card is placed in VLAN promiscuous mode instead.
  *
  * Returns the status of oce_config_vlan(), or 0 when there is nothing to
  * program.
  */
 static int
 oce_vid_config(POCE_SOFTC sc)
 {
 	struct normal_vlan vtags[MAX_VLANFILTER_SIZE];
 	uint16_t count = 0, vid;
 
 	/* Too many VLANs or filtering off: fall back to promiscuous mode. */
 	if ((sc->vlans_added > MAX_VLANFILTER_SIZE) ||
 	    !(if_getcapenable(sc->ifp) & IFCAP_VLAN_HWFILTER))
 		return oce_config_vlan(sc, (uint8_t) sc->if_id,
 				       NULL, 0, 1, 1);
 
 	/* Collect every VLAN id currently marked in the table. */
 	for (vid = 0; vid < MAX_VLANS; vid++) {
 		if (!sc->vlan_tag[vid])
 			continue;
 		vtags[count].vtag = vid;
 		count++;
 	}
 
 	if (count == 0)
 		return 0;
 
 	return oce_config_vlan(sc, (uint8_t) sc->if_id, vtags, count, 1, 0);
 }
 
 /*
  * Push the interface's current link-level address to the adapter if it
  * differs from the cached copy in sc->macaddr.
  *
  * The new address is added first; only when that succeeds is the old
  * pmac entry deleted and the cached copy refreshed.  A failure of either
  * mailbox call is logged.
  */
 static void
 oce_mac_addr_set(POCE_SOFTC sc)
 {
 	uint32_t prev_pmac_id = sc->pmac_id;
 	int err;
 
 	/* Nothing to do when the address has not changed. */
 	if (bcmp((if_getlladdr(sc->ifp)), sc->macaddr.mac_addr,
 		 sc->macaddr.size_of_struct) == 0)
 		return;
 
 	err = oce_mbox_macaddr_add(sc, (uint8_t *)(if_getlladdr(sc->ifp)),
 				   sc->if_id, &sc->pmac_id);
 	if (err == 0) {
 		err = oce_mbox_macaddr_del(sc, sc->if_id, prev_pmac_id);
 		bcopy((if_getlladdr(sc->ifp)), sc->macaddr.mac_addr,
 		      sc->macaddr.size_of_struct);
 	}
 
 	if (err != 0)
 		device_printf(sc->dev, "Failed update macaddress\n");
 }
 
 /*
  * Pass a user-supplied mailbox command through to the firmware.
  *
  * The user buffer referenced by the ifreq starts with IOCTL_COOKIE,
  * followed by a struct mbx_hdr and the request payload.  The request is
  * copied into a DMA buffer, handed to oce_pass_through_mbox() and the
  * response is copied back over the request in user memory.
  *
  * Returns 0 on success, EFAULT when user memory cannot be accessed,
  * EINVAL for a bad cookie or oversized request, ENOMEM when the DMA
  * buffer cannot be allocated and EIO when the mailbox command fails.
  */
 static int
 oce_handle_passthrough(if_t ifp, caddr_t data)
 {
 	POCE_SOFTC sc = if_getsoftc(ifp);
 	struct ifreq *ifr = (struct ifreq *)data;
 	int rc = ENXIO;
 	char cookie[32] = {0};
 	void *priv_data = ifr_data_get_ptr(ifr);
 	void *ioctl_ptr;
 	uint32_t req_size;
 	struct mbx_hdr req;
 	OCE_DMA_MEM dma_mem;
 	struct mbx_common_get_cntl_attr *fw_cmd;
 
 	if (copyin(priv_data, cookie, strlen(IOCTL_COOKIE)))
 		return EFAULT;
 
 	if (memcmp(cookie, IOCTL_COOKIE, strlen(IOCTL_COOKIE)))
 		return EINVAL;
 
 	/* The mailbox header immediately follows the cookie. */
 	ioctl_ptr = (char *)priv_data + strlen(IOCTL_COOKIE);
 	if (copyin(ioctl_ptr, &req, sizeof(struct mbx_hdr)))
 		return EFAULT;
 
 	/* Bound the user-controlled length before sizing the DMA buffer. */
 	req_size = le32toh(req.u0.req.request_length);
 	if (req_size > 65536)
 		return EINVAL;
 
 	req_size += sizeof(struct mbx_hdr);
 	rc = oce_dma_alloc(sc, req_size, &dma_mem, 0);
 	if (rc)
 		return ENOMEM;
 
 	if (copyin(ioctl_ptr, OCE_DMAPTR(&dma_mem,char), req_size)) {
 		rc = EFAULT;
 		goto dma_free;
 	}
 
 	rc = oce_pass_through_mbox(sc, &dma_mem, req_size);
 	if (rc) {
 		rc = EIO;
 		goto dma_free;
 	}
 
 	if (copyout(OCE_DMAPTR(&dma_mem,char), ioctl_ptr, req_size)) {
 		rc = EFAULT;
 		goto dma_free;
 	}
 
 	/*
 	 * Firmware fills every attribute for this ioctl except the driver
 	 * version, so fill it in here.  ioctl_ptr is a user-space address:
 	 * fw_cmd is only used to compute the address of the field and the
 	 * bytes are written with copyout(), never by a direct store.
 	 */
 	if (req.u0.rsp.opcode == OPCODE_COMMON_GET_CNTL_ATTRIBUTES) {
 		fw_cmd = (struct mbx_common_get_cntl_attr *) ioctl_ptr;
 		if (copyout(COMPONENT_REVISION,
 		    fw_cmd->params.rsp.cntl_attr_info.hba_attr.drv_ver_str,
 		    strlen(COMPONENT_REVISION)))
 			rc = EFAULT;
 	}
 
 dma_free:
 	oce_dma_free(sc, &dma_mem);
 	return rc;
 
 }
 
 /*
  * Recompute the delay (eqd) of every event queue from the packet rate
  * observed since the previous call and push the changed values to the
  * adapter in batches of at most 8.
  *
  * For EQs whose adaptive object is disabled, the statically configured
  * et_eqd is used instead.  The rate for EQ i is taken from rq[0] (for
  * i == 0), rq[i + 1] and wq[i] where those queues exist.
  */
 static void
 oce_eqd_set_periodic(POCE_SOFTC sc)
 {
 	struct oce_set_eqd set_eqd[OCE_MAX_EQ];
 	struct oce_aic_obj *aic;
 	struct oce_eq *eqo;
 	uint64_t now = 0, delta;
 	int eqd, i, num = 0;
 	uint32_t tx_reqs = 0, rxpkts = 0, pps;
 	struct oce_wq *wq;
 	struct oce_rq *rq;
 
 	#define ticks_to_msecs(t)       (1000 * (t) / hz)
 
 	for (i = 0 ; i < sc->neqs; i++) {
 		eqo = sc->eq[i];
 		aic = &sc->aic_obj[i];
 		/* When setting the static eq delay from the user space */
 		if (!aic->enable) {
 			if (aic->ticks)
 				aic->ticks = 0;
 			eqd = aic->et_eqd;
 			goto modify_eqd;
 		}
 
 		if (i == 0) {
 			rq = sc->rq[0];
 			rxpkts = rq->rx_stats.rx_pkts;
 		} else
 			rxpkts = 0;
 		if (i + 1 < sc->nrqs) {
 			rq = sc->rq[i + 1];
 			rxpkts += rq->rx_stats.rx_pkts;
 		}
 		if (i < sc->nwqs) {
 			wq = sc->wq[i];
 			tx_reqs = wq->tx_stats.tx_reqs;
 		} else
 			tx_reqs = 0;
 		now = ticks;
 
 		/*
 		 * First sample, or a counter/tick wrap: just record a new
 		 * baseline and leave this EQ alone for now.
 		 */
 		if (!aic->ticks || now < aic->ticks ||
 		    rxpkts < aic->prev_rxpkts || tx_reqs < aic->prev_txreqs) {
 			aic->prev_rxpkts = rxpkts;
 			aic->prev_txreqs = tx_reqs;
 			aic->ticks = now;
 			continue;
 		}
 
 		delta = ticks_to_msecs(now - aic->ticks);
 		/*
 		 * Less than one millisecond has elapsed since the last
 		 * sample, so the rate below would divide by zero.  Keep the
 		 * existing baseline and retry on the next call.
 		 */
 		if (delta == 0)
 			continue;
 
 		pps = (((uint32_t)(rxpkts - aic->prev_rxpkts) * 1000) / delta) +
 		      (((uint32_t)(tx_reqs - aic->prev_txreqs) * 1000) / delta);
 		eqd = (pps / 15000) << 2;
 		if (eqd < 8)
 			eqd = 0;
 
 		/* Make sure that the eq delay is in the known range */
 		eqd = min(eqd, aic->max_eqd);
 		eqd = max(eqd, aic->min_eqd);
 
 		aic->prev_rxpkts = rxpkts;
 		aic->prev_txreqs = tx_reqs;
 		aic->ticks = now;
 
 modify_eqd:
 		/* Only queue a mailbox update when the delay changed. */
 		if (eqd != aic->cur_eqd) {
 			set_eqd[num].delay_multiplier = (eqd * 65)/100;
 			set_eqd[num].eq_id = eqo->eq_id;
 			aic->cur_eqd = eqd;
 			num++;
 		}
 	}
 
 	/* Send the pending updates, at most 8 EQs per mailbox command. */
 	for (i = 0; i < num; i += 8) {
 		if ((num - i) >= 8)
 			oce_mbox_eqd_modify_periodic(sc, &set_eqd[i], 8);
 		else
 			oce_mbox_eqd_modify_periodic(sc, &set_eqd[i], (num - i));
 	}
 
 }
 
 /*
  * Poll the adapter's error registers and report what they contain.
  *
  * XE201 parts are checked through the SLIPORT status/error registers;
  * other parts through the unmasked PCI-config UE status bits.  Only a
  * SLIPORT error sets sc->hw_error; UE bits are merely logged.  Does
  * nothing once hw_error is already set.
  */
 static void oce_detect_hw_error(POCE_SOFTC sc)
 {
 
 	uint32_t ue_low = 0, ue_high = 0, ue_low_mask = 0, ue_high_mask = 0;
 	uint32_t sliport_status = 0, sliport_err1 = 0, sliport_err2 = 0;
 	uint32_t bit;
 
 	if (sc->hw_error)
 		return;
 
 	if (!IS_XE201(sc)) {
 		ue_low = OCE_READ_REG32(sc, devcfg, PCICFG_UE_STATUS_LOW);
 		ue_high = OCE_READ_REG32(sc, devcfg, PCICFG_UE_STATUS_HIGH);
 		ue_low_mask = OCE_READ_REG32(sc, devcfg, PCICFG_UE_STATUS_LOW_MASK);
 		ue_high_mask = OCE_READ_REG32(sc, devcfg, PCICFG_UE_STATUS_HI_MASK);
 
 		/* Drop the bits the mask registers tell us to ignore. */
 		ue_low &= ~ue_low_mask;
 		ue_high &= ~ue_high_mask;
 	} else {
 		sliport_status = OCE_READ_REG32(sc, db, SLIPORT_STATUS_OFFSET);
 		if (sliport_status & SLIPORT_STATUS_ERR_MASK) {
 			sliport_err1 = OCE_READ_REG32(sc, db, SLIPORT_ERROR1_OFFSET);
 			sliport_err2 = OCE_READ_REG32(sc, db, SLIPORT_ERROR2_OFFSET);
 		}
 	}
 
 	/* On certain platforms BE hardware can indicate spurious UEs.
 	 * Allow the h/w to stop working completely in case of a real UE.
 	 * Hence not setting the hw_error for UE detection.
 	 */
 	if (sliport_status & SLIPORT_STATUS_ERR_MASK) {
 		sc->hw_error = TRUE;
 		device_printf(sc->dev, "Error detected in the card\n");
 		device_printf(sc->dev,
 				"ERR: sliport status 0x%x\n", sliport_status);
 		device_printf(sc->dev,
 				"ERR: sliport error1 0x%x\n", sliport_err1);
 		device_printf(sc->dev,
 				"ERR: sliport error2 0x%x\n", sliport_err2);
 	}
 
 	/* Name every UE bit that is set, lowest bit first. */
 	for (bit = 0; ue_low != 0; ue_low >>= 1, bit++) {
 		if ((ue_low & 1) == 0)
 			continue;
 		device_printf(sc->dev, "UE: %s bit set\n",
 					ue_status_low_desc[bit]);
 	}
 
 	for (bit = 0; ue_high != 0; ue_high >>= 1, bit++) {
 		if ((ue_high & 1) == 0)
 			continue;
 		device_printf(sc->dev, "UE: %s bit set\n",
 					ue_status_hi_desc[bit]);
 	}
 
 }
 
 /*
  * Housekeeping callout: checks for hardware errors, refreshes NIC and
  * queue statistics, syncs the MAC address, kicks every TX queue and, on
  * BE/SH parts, retunes the event-queue delays.  Re-arms itself to run
  * again after hz ticks.
  */
 static void
 oce_local_timer(void *arg)
 {
 	POCE_SOFTC sc = arg;
 	int qidx;
 
 	oce_detect_hw_error(sc);
 	oce_refresh_nic_stats(sc);
 	oce_refresh_queue_stats(sc);
 	oce_mac_addr_set(sc);
 
 	/* TX watchdog: give every work queue a chance to restart. */
 	qidx = 0;
 	while (qidx < sc->nwqs) {
 		oce_tx_restart(sc, sc->wq[qidx]);
 		qidx++;
 	}
 
 	/* Calculate and set the eq delay for an optimal interrupt rate. */
 	if (IS_BE(sc) || IS_SH(sc)) {
 		oce_eqd_set_periodic(sc);
 	}
 
 	callout_reset(&sc->timer, hz, oce_local_timer, sc);
 }
 
 /*
  * Reap outstanding TX completions.
  *
  * Polls every work queue until none has entries in use, no completion
  * has been seen for more than 10 consecutive 1000us polls, or a hardware
  * error is flagged.  Whatever is still marked in use afterwards is
  * completed by calling oce_process_tx_completion() directly.
  */
 static void 
 oce_tx_compl_clean(POCE_SOFTC sc) 
 {
 	struct oce_wq *wq;
 	int i = 0, idle_polls = 0, reaped = 0;
 	int busy_queues;
 
 	for (;;) {
 		busy_queues = sc->nwqs;
 
 		for_all_wq_queues(sc, wq, i) {
 			reaped = oce_wq_handler(wq);
 
 			/* Any progress restarts the silence timer. */
 			if (reaped != 0)
 				idle_polls = 0;
 
 			if (wq->ring->num_used == 0)
 				busy_queues--;
 		}
 
 		if (busy_queues == 0)
 			break;
 		if (++idle_polls > 10)
 			break;
 		if (sc->hw_error)
 			break;
 
 		DELAY(1000);
 	}
 
 	/* Complete the remaining entries under the completion lock. */
 	for_all_wq_queues(sc, wq, i) {
 		while (wq->ring->num_used != 0) {
 			LOCK(&wq->tx_compl_lock);
 			oce_process_tx_completion(wq);
 			UNLOCK(&wq->tx_compl_lock);
 		}
 	}
 
 }
 
 /*
  * Quiesce the interface: clear the running/oactive driver flags, reap TX
  * completions, stop interrupts and RX, and drain every EQ/CQ.
  *
  * NOTE: This should only be called holding sc->dev_lock.  The lock is
  *       dropped and re-taken around taskqueue_drain() below.
  */
 static void
 oce_if_deactivate(POCE_SOFTC sc)
 {
 	int i;
 	struct oce_rq *rq;
 	struct oce_wq *wq;
 	struct oce_eq *eq;
 
 	if_setdrvflagbits(sc->ifp, 0, (IFF_DRV_RUNNING | IFF_DRV_OACTIVE));
 
 	oce_tx_compl_clean(sc);
 
 	/* Stop interrupts and finish any pending bottom halves. */
 	oce_hw_intr_disable(sc);
 
 	/* Since taskqueue_drain takes the Giant lock, we should not hold
 	   any other lock. So release the device lock and reacquire it
 	   after taskqueue_drain completes.
 	*/
 	UNLOCK(&sc->dev_lock);
 	for (i = 0; i < sc->intr_count; i++) {
 		if (sc->intrs[i].tq != NULL) {
 			taskqueue_drain(sc->intrs[i].tq, &sc->intrs[i].task);
 		}
 	}
 	LOCK(&sc->dev_lock);
 
 	/* Delete RX queue in card with flush param */
 	oce_stop_rx(sc);
 
 	/* Invalidate any pending cq and eq entries. */	
 	for_all_evnt_queues(sc, eq, i)	
 		oce_drain_eq(eq);
 	for_all_rq_queues(sc, rq, i)
 		oce_drain_rq_cq(rq);
 	for_all_wq_queues(sc, wq, i)
 		oce_drain_wq_cq(wq);
 
 	/* We still need to receive MCC async events,
 	   so re-enable interrupts and arm the first EQ.
 	*/
 	oce_hw_intr_enable(sc);
 	oce_arm_eq(sc, sc->eq[0]->eq_id, 0, TRUE, FALSE);
 
 	DELAY(10);
 }
 
 /*
  * Bring the interface up: mark it running, then, with interrupts
  * disabled, start RX, start every receive and work queue, arm every
  * event queue and finally re-enable interrupts.
  *
  * Queue start failures are logged but do not abort the sequence.
  */
 static void
 oce_if_activate(POCE_SOFTC sc)
 {
 	struct oce_eq *eq;
 	struct oce_rq *rq;
 	struct oce_wq *wq;
 	int i, rc = 0;
 
 	if_setdrvflagbits(sc->ifp, IFF_DRV_RUNNING , 0);
 
 	/* Keep interrupts off while the queues are being started. */
 	oce_hw_intr_disable(sc);
 
 	oce_start_rx(sc);
 
 	for_all_rq_queues(sc, rq, i) {
 		rc = oce_start_rq(rq);
 		if (rc)
 			device_printf(sc->dev, "Unable to start RX\n");
 	}
 
 	for_all_wq_queues(sc, wq, i) {
 		rc = oce_start_wq(wq);
 		if (rc)
 			device_printf(sc->dev, "Unable to start TX\n");
 	}
 
 	/* Arm every event queue before interrupts are turned back on. */
 	for_all_evnt_queues(sc, eq, i)
 		oce_arm_eq(sc, eq->eq_id, 0, TRUE, FALSE);
 
 	oce_hw_intr_enable(sc);
 
 }
 
 /*
  * Record the link status carried by a link-state async event and notify
  * the network stack.  The ASYNC_EVENT_LOGICAL bit is ignored when
  * deciding whether the link is up.
  */
 static void
 process_link_state(POCE_SOFTC sc, struct oce_async_cqe_link_state *acqe)
 {
 	const int link_up =
 	    ((acqe->u0.s.link_status & ~ASYNC_EVENT_LOGICAL) ==
 	     ASYNC_EVENT_LINK_UP);
 
 	if (!link_up) {
 		sc->link_status = ASYNC_EVENT_LINK_DOWN;
 		if_link_state_change(sc->ifp, LINK_STATE_DOWN);
 		return;
 	}
 
 	sc->link_status = ASYNC_EVENT_LINK_UP;
 	if_link_state_change(sc->ifp, LINK_STATE_UP);
 }
 
 /*
  * Handle a GRP5 OS2BMC async event.  When the event reports management
  * as enabled, set OCE_FLAGS_OS2BMC and rebuild sc->bmc_filt_mask from
  * the per-protocol filter bits (bit 0 = ARP ... bit 8 = multicast).
  * Otherwise nothing is changed.
  */
 static void oce_async_grp5_osbmc_process(POCE_SOFTC sc,
 					 struct oce_async_evt_grp5_os2bmc *evt)
 {
 	DW_SWAP(evt, sizeof(struct oce_async_evt_grp5_os2bmc));
 
 	if (!evt->u.s.mgmt_enable)
 		return;
 
 	sc->flags |= OCE_FLAGS_OS2BMC;
 
 	sc->bmc_filt_mask = evt->u.s.arp_filter;
 	sc->bmc_filt_mask |= (evt->u.s.dhcp_client_filt << 1) |
 			     (evt->u.s.dhcp_server_filt << 2) |
 			     (evt->u.s.net_bios_filt << 3) |
 			     (evt->u.s.bcast_filt << 4) |
 			     (evt->u.s.ipv6_nbr_filt << 5) |
 			     (evt->u.s.ipv6_ra_filt << 6) |
 			     (evt->u.s.ipv6_ras_filt << 7) |
 			     (evt->u.s.mcast_filt << 8);
 }
 
 /*
  * Dispatch a GRP5 async completion by its async_type: PVID state events
  * update sc->pvid, OS2BMC events are forwarded to
  * oce_async_grp5_osbmc_process().  Other types are ignored.
  */
 static void oce_process_grp5_events(POCE_SOFTC sc, struct oce_mq_cqe *cqe)
 {
 	struct oce_async_event_grp5_pvid_state *pvid_evt;
 
 	if (cqe->u0.s.async_type == ASYNC_EVENT_PVID_STATE) {
 		/* GRP5 PVID: remember the tag only while it is enabled. */
 		pvid_evt = (struct oce_async_event_grp5_pvid_state *)cqe;
 		sc->pvid = pvid_evt->enabled ?
 		    (pvid_evt->tag & VLAN_VID_MASK) : 0;
 	} else if (cqe->u0.s.async_type == ASYNC_EVENT_OS2BMC) {
 		oce_async_grp5_osbmc_process(sc,
 		    (struct oce_async_evt_grp5_os2bmc *)cqe);
 	}
 }
 
 /*
  * Completion-queue handler for the mailbox queue.
  *
  * Consumes entries until one with dw[3] == 0 is found.  Async entries
  * are dispatched by event type (link state, GRP5, QnQ debug); all
  * entries are then cleared.  The CQ is re-armed with the number of
  * entries consumed, if any.  Always returns 0.
  */
 uint16_t
 oce_mq_handler(void *arg)
 {
 	struct oce_mq *mq = (struct oce_mq *)arg;
 	POCE_SOFTC sc = mq->parent;
 	struct oce_cq *cq = mq->cq;
 	struct oce_mq_cqe *cqe;
 	struct oce_async_event_qnq *qnq_evt;
 	int consumed, code, subtype;
 
 	bus_dmamap_sync(cq->ring->dma.tag,
 			cq->ring->dma.map, BUS_DMASYNC_POSTWRITE);
 	cqe = RING_GET_CONSUMER_ITEM_VA(cq->ring, struct oce_mq_cqe);
 
 	for (consumed = 0; cqe->u0.dw[3] != 0; consumed++) {
 		DW_SWAP((uint32_t *) cqe, sizeof(oce_mq_cqe));
 
 		if (cqe->u0.s.async_event) {
 			code = cqe->u0.s.event_type;
 			subtype = cqe->u0.s.async_type;
 
 			if (code == ASYNC_EVENT_CODE_LINK_STATE) {
 				/* Link status event */
 				process_link_state(sc,
 				    (struct oce_async_cqe_link_state *)cqe);
 			} else if (code == ASYNC_EVENT_GRP5) {
 				oce_process_grp5_events(sc, cqe);
 			} else if (code == ASYNC_EVENT_CODE_DEBUG &&
 				   subtype == ASYNC_EVENT_DEBUG_QNQ) {
 				qnq_evt = (struct oce_async_event_qnq *)cqe;
 				if (qnq_evt->valid)
 					sc->qnqid = qnq_evt->vlan_tag;
 				sc->qnq_debug_event = TRUE;
 			}
 		}
 
 		/* Mark the entry consumed and advance to the next one. */
 		cqe->u0.dw[3] = 0;
 		RING_GET(cq->ring, 1);
 		bus_dmamap_sync(cq->ring->dma.tag,
 				cq->ring->dma.map, BUS_DMASYNC_POSTWRITE);
 		cqe = RING_GET_CONSUMER_ITEM_VA(cq->ring, struct oce_mq_cqe);
 	}
 
 	if (consumed != 0)
 		oce_arm_cq(sc, cq->cq_id, consumed, FALSE);
 
 	return 0;
 }
 
 /*
  * Decide how many receive and work queues to ask for.
  *
  * FLEX10/UMC/VNIC function modes, RSS being disabled, or a BE2 part all
  * select a single RQ and WQ; otherwise the counts are bounded by
  * OCE_NCPUS and sc->nrssqs (with one extra RQ).  BE2 with RSS enabled
  * still gets the larger RQ count.
  */
 static void
 setup_max_queues_want(POCE_SOFTC sc)
 {
 	/* Check if it is a FLEX machine. If so, don't use RSS. */
 	if ((sc->function_mode &
 	     (FNM_FLEX10_MODE | FNM_UMC_MODE | FNM_VNIC_MODE)) == 0 &&
 	    is_rss_enabled(sc) &&
 	    !IS_BE2(sc)) {
 		sc->nrqs = MIN(OCE_NCPUS, sc->nrssqs) + 1;
 		sc->nwqs = MIN(OCE_NCPUS, sc->nrssqs);
 	} else {
 		sc->nrqs = 1;
 		sc->nwqs = 1;
 	}
 
 	if (IS_BE2(sc) && is_rss_enabled(sc)) {
 		sc->nrqs = MIN(OCE_NCPUS, sc->nrssqs) + 1;
 	}
 }
 
 /*
  * Set the RQ/WQ counts from the number of interrupts in sc->intr_count:
  * with RSS, intr_count + 1 RQs and intr_count WQs; without RSS, one of
  * each.  BE2 parts always end up with a single WQ.
  */
 static void
 update_queues_got(POCE_SOFTC sc)
 {
 	if (!is_rss_enabled(sc)) {
 		sc->nrqs = 1;
 		sc->nwqs = 1;
 	} else {
 		sc->nrqs = sc->intr_count + 1;
 		sc->nwqs = sc->intr_count;
 	}
 
 	if (IS_BE2(sc)) {
 		sc->nwqs = 1;
 	}
 }
 
 /*
  * Return TRUE when the frame in m is IPv6, its next header is neither
  * TCP nor UDP, and the length byte of the header that follows the IPv6
  * header is 0xff; FALSE otherwise.
  *
  * The headers are located by offset from the start of the mbuf data;
  * NOTE(review): no length checks are made here -- presumably the caller
  * guarantees the headers are contiguous in the first mbuf; confirm.
  */
 static int 
 oce_check_ipv6_ext_hdr(struct mbuf *m)
 {
 	struct ether_header *eh = mtod(m, struct ether_header *);
 	caddr_t frame = m->m_data;
 	struct ip6_hdr *ip6;
 	struct ip6_ext *ip6e;
 
 	if (eh->ether_type != htons(ETHERTYPE_IPV6))
 		return FALSE;
 
 	ip6 = (struct ip6_hdr *)(frame + sizeof(struct ether_header));
 	if (ip6->ip6_nxt == IPPROTO_TCP || ip6->ip6_nxt == IPPROTO_UDP)
 		return FALSE;
 
 	ip6e = (struct ip6_ext *)(frame + sizeof(struct ether_header) +
 	    sizeof(struct ip6_hdr));
 	if (ip6e->ip6e_len == 0xff)
 		return TRUE;
 
 	return FALSE;
 }
 
 /*
  * TRUE when the adapter has OCE_FLAGS_BE3 set and the low byte of its
  * ASIC revision is below 2; FALSE otherwise.
  */
 static int 
 is_be3_a1(POCE_SOFTC sc)
 {
 	if (!(sc->flags & OCE_FLAGS_BE3))
 		return FALSE;
 
 	return ((sc->asic_revision & 0xFF) < 2) ? TRUE : FALSE;
 }
 
 /*
  * Embed the VLAN header(s) in the packet itself instead of leaving the
  * tag in the mbuf packet header.
  *
  * The tag comes from the mbuf (M_VLANTAG) or, when none is present and a
  * pvid is configured, from sc->pvid.  If sc->qnqid is set, it is added
  * as a further (outer) tag.  *complete, when supplied, is cleared
  * whenever a pvid or QnQ tag is involved.
  *
  * Returns the (possibly reallocated) mbuf, or NULL when the mbuf is not
  * writable or an encapsulation step fails.
  */
 static struct mbuf *
 oce_insert_vlan_tag(POCE_SOFTC sc, struct mbuf *m, boolean_t *complete)
 {
 	uint16_t vlan_tag = 0;
 
 	if(!M_WRITABLE(m))
 		return NULL;
 
 	/* Embed vlan tag in the packet if it is not part of it */
 	if(m->m_flags & M_VLANTAG) {
 		vlan_tag = EVL_VLANOFTAG(m->m_pkthdr.ether_vtag);
 		m->m_flags &= ~M_VLANTAG;
 	}
 
 	/* if UMC, ignore vlan tag insertion and instead insert pvid */
 	if(sc->pvid) {
 		if(!vlan_tag)
 			vlan_tag = sc->pvid;
 		if (complete)
 			*complete = FALSE;
 	}
 
 	if(vlan_tag) {
 		m = ether_vlanencap(m, vlan_tag);
 		/*
 		 * The encapsulation can fail and hand back NULL; stop here
 		 * rather than passing a NULL mbuf to the next step.
 		 */
 		if (m == NULL)
 			return NULL;
 	}
 
 	if(sc->qnqid) {
 		m = ether_vlanencap(m, sc->qnqid);
 
 		if (complete)
 			*complete = FALSE;
 	}
 	return m;
 }
 
 /*
  * TRUE when all three hold: the adapter matches is_be3_a1(), it is in
  * QnQ or UMC mode, and oce_check_ipv6_ext_hdr() flags the packet.
  * The checks are made in that order and stop at the first failure.
  */
 static int 
 oce_tx_asic_stall_verify(POCE_SOFTC sc, struct mbuf *m)
 {
 	if (!is_be3_a1(sc))
 		return FALSE;
 	if (!IS_QNQ_OR_UMC(sc))
 		return FALSE;
 	if (!oce_check_ipv6_ext_hdr(m))
 		return FALSE;
 
 	return TRUE;
 }
 
 /*
  * Establish the queue resources (nrssqs, nrqs, nwqs) for the adapter.
  *
  * BE parts use fixed defaults and also get max_vlans set.  Other parts
  * ask the firmware through oce_get_profile_config() and fall back to
  * the same queue defaults only when that call fails.
  */
 static void
 oce_get_config(POCE_SOFTC sc)
 {
 	int rc = 0;
 	uint32_t max_rss = 0;
 
 	max_rss = ((IS_BE(sc) || IS_SH(sc)) && (!sc->be3_native)) ?
 	    OCE_LEGACY_MODE_RSS : OCE_MAX_RSS;
 
 	if (IS_BE(sc)) {
 		/* For BE3 don't rely on fw for determining the resources */
 		sc->nrssqs = max_rss;
 		sc->nrqs = sc->nrssqs + 1;
 		sc->nwqs = OCE_MAX_WQ;
 		sc->max_vlans = MAX_VLANFILTER_SIZE;
 		return;
 	}
 
 	rc = oce_get_profile_config(sc, max_rss);
 	if (rc == 0)
 		return;
 
 	/* Firmware query failed: use the driver defaults. */
 	sc->nwqs = OCE_MAX_WQ;
 	sc->nrssqs = max_rss;
 	sc->nrqs = sc->nrssqs + 1;
 }
 
 /* Forget the registered RDMA interface, if there is one. */
 static void
 oce_rdma_close(void)
 {
   if (oce_rdma_if == NULL)
     return;
 
   oce_rdma_if = NULL;
 }
 
 /* Copy the adapter's cached 6-byte MAC address into macaddr. */
 static void
 oce_get_mac_addr(POCE_SOFTC sc, uint8_t *macaddr)
 {
   const uint8_t *src = (const uint8_t *)sc->macaddr.mac_addr;
 
   memcpy(macaddr, src, 6);
 }
 
 /*
  * Register an RDMA consumer with the driver.
  *
  * Fills rdma_info with the driver callbacks, records rdma_if as the
  * active RDMA interface and, if it provides an announce callback,
  * announces every adapter on the softc list to it.
  *
  * Returns 0 on success, -EINVAL when either argument is NULL and -ENXIO
  * when either structure reports an unexpected size.
  */
 int
 oce_register_rdma(POCE_RDMA_INFO rdma_info, POCE_RDMA_IF rdma_if)
 {
   POCE_SOFTC sc;
   struct oce_dev_info di;
   int i;
 
   if ((rdma_info == NULL) || (rdma_if == NULL)) {
     return -EINVAL;
   }
 
   if ((rdma_info->size != OCE_RDMA_INFO_SIZE) ||
       (rdma_if->size != OCE_RDMA_IF_SIZE)) {
     return -ENXIO;
   }
 
   rdma_info->close = oce_rdma_close;
   rdma_info->mbox_post = oce_mbox_post;
   rdma_info->common_req_hdr_init = mbx_common_req_hdr_init;
   rdma_info->get_mac_addr = oce_get_mac_addr;
 
   oce_rdma_if = rdma_if;
 
   /*
    * The list pointer is advanced by the loop itself so that a missing
    * announce callback cannot leave the walk stuck on the first adapter.
    */
   for (sc = softc_head; sc != NULL; sc = sc->next) {
     if (oce_rdma_if->announce == NULL) {
       continue;
     }
 
     memset(&di, 0, sizeof(di));
     di.dev = sc->dev;
     di.softc = sc;
     di.ifp = sc->ifp;
     di.db_bhandle = sc->db_bhandle;
     di.db_btag = sc->db_btag;
     di.db_page_size = 4096;
     if (sc->flags & OCE_FLAGS_USING_MSIX) {
       di.intr_mode = OCE_INTERRUPT_MODE_MSIX;
     } else if (sc->flags & OCE_FLAGS_USING_MSI) {
       di.intr_mode = OCE_INTERRUPT_MODE_MSI;
     } else {
       di.intr_mode = OCE_INTERRUPT_MODE_INTX;
     }
     di.dev_family = OCE_GEN2_FAMILY; // fixme: must detect skyhawk
     if (di.intr_mode != OCE_INTERRUPT_MODE_INTX) {
       /* Report the NIC vectors plus those set aside for RoCE. */
       di.msix.num_vectors = sc->intr_count + sc->roce_intr_count;
       di.msix.start_vector = sc->intr_count;
       for (i=0; i<di.msix.num_vectors; i++) {
         di.msix.vector_list[i] = sc->intrs[i].vector;
       }
     }
     memcpy(di.mac_addr, sc->macaddr.mac_addr, 6);
     di.vendor_id = pci_get_vendor(sc->dev);
     di.dev_id = pci_get_device(sc->dev);
 
     if (sc->rdma_flags & OCE_RDMA_FLAG_SUPPORTED) {
         di.flags  |= OCE_RDMA_INFO_RDMA_SUPPORTED;
     }
 
     rdma_if->announce(&di);
   }
 
   return 0;
 }
 
 /*
  * Apply the hardware-LRO and RQ buffer size tunables.
  *
  * value is never assigned in this function (both getenv() lookups are
  * commented out), so as written the only effect is to clear
  * sc->enable_hwlro; the parsing branches are kept for when the lookups
  * are restored.
  */
 static void
 oce_read_env_variables( POCE_SOFTC sc )
 {
 	char *value = NULL;
 	int rc = 0;
 
 	/* read if user wants to enable hwlro or swlro */
 	//value = getenv("oce_enable_hwlro");
 	if (value == NULL || !IS_SH(sc)) {
 		sc->enable_hwlro = 0;
 	} else {
 		sc->enable_hwlro = strtol(value, NULL, 10);
 		if (!sc->enable_hwlro) {
 			device_printf(sc->dev, "software lro enabled\n");
 		} else {
 			rc = oce_mbox_nic_query_lro_capabilities(sc, NULL, NULL);
 			if (rc == 0) {
 				device_printf(sc->dev, "hardware lro enabled\n");
 				oce_max_rsp_handled = 32;
 			} else {
 				/* Firmware query failed: fall back to software LRO. */
 				device_printf(sc->dev, "no hardware lro support\n");
 				device_printf(sc->dev, "software lro enabled\n");
 				sc->enable_hwlro = 0;
 			}
 		}
 	}
 
 	/* read mbuf size */
 	//value = getenv("oce_rq_buf_size");
 	if (value != NULL && IS_SH(sc)) {
 		oce_rq_buf_size = strtol(value, NULL, 10);
 		/* Anything other than a supported size reverts to 2048. */
 		if (oce_rq_buf_size != 2048 && oce_rq_buf_size != 4096 &&
 		    oce_rq_buf_size != 9216 && oce_rq_buf_size != 16384) {
 			device_printf(sc->dev, " Supported oce_rq_buf_size values are 2K, 4K, 9K, 16K \n");
 			oce_rq_buf_size = 2048;
 		}
 	}
 
 	return;
 }
diff --git a/sys/dev/virtio/network/if_vtnet.c b/sys/dev/virtio/network/if_vtnet.c
index 41eaa6a56086..9ef667e97a54 100644
--- a/sys/dev/virtio/network/if_vtnet.c
+++ b/sys/dev/virtio/network/if_vtnet.c
@@ -1,4456 +1,4455 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2011, Bryan Venteicher <bryanv@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 /* Driver for VirtIO network devices. */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/eventhandler.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/sockio.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/msan.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <sys/random.h>
 #include <sys/sglist.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/taskqueue.h>
 #include <sys/smp.h>
 #include <machine/smp.h>
 
 #include <vm/uma.h>
 
 #include <net/debugnet.h>
 #include <net/ethernet.h>
 #include <net/pfil.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_arp.h>
 #include <net/if_dl.h>
 #include <net/if_types.h>
 #include <net/if_media.h>
 #include <net/if_vlan_var.h>
 
 #include <net/bpf.h>
 
 #include <netinet/in_systm.h>
 #include <netinet/in.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet/udp.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_lro.h>
 
 #include <machine/bus.h>
 #include <machine/resource.h>
 #include <sys/bus.h>
 #include <sys/rman.h>
 
 #include <dev/virtio/virtio.h>
 #include <dev/virtio/virtqueue.h>
 #include <dev/virtio/network/virtio_net.h>
 #include <dev/virtio/network/if_vtnetvar.h>
 #include "virtio_if.h"
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #if defined(INET) || defined(INET6)
 #include <machine/in_cksum.h>
 #endif
 
 static int	vtnet_modevent(module_t, int, void *);
 
 static int	vtnet_probe(device_t);
 static int	vtnet_attach(device_t);
 static int	vtnet_detach(device_t);
 static int	vtnet_suspend(device_t);
 static int	vtnet_resume(device_t);
 static int	vtnet_shutdown(device_t);
 static int	vtnet_attach_completed(device_t);
 static int	vtnet_config_change(device_t);
 
 static int	vtnet_negotiate_features(struct vtnet_softc *);
 static int	vtnet_setup_features(struct vtnet_softc *);
 static int	vtnet_init_rxq(struct vtnet_softc *, int);
 static int	vtnet_init_txq(struct vtnet_softc *, int);
 static int	vtnet_alloc_rxtx_queues(struct vtnet_softc *);
 static void	vtnet_free_rxtx_queues(struct vtnet_softc *);
 static int	vtnet_alloc_rx_filters(struct vtnet_softc *);
 static void	vtnet_free_rx_filters(struct vtnet_softc *);
 static int	vtnet_alloc_virtqueues(struct vtnet_softc *);
 static int	vtnet_alloc_interface(struct vtnet_softc *);
 static int	vtnet_setup_interface(struct vtnet_softc *);
 static int	vtnet_ioctl_mtu(struct vtnet_softc *, u_int);
 static int	vtnet_ioctl_ifflags(struct vtnet_softc *);
 static int	vtnet_ioctl_multi(struct vtnet_softc *);
 static int	vtnet_ioctl_ifcap(struct vtnet_softc *, struct ifreq *);
 static int	vtnet_ioctl(if_t, u_long, caddr_t);
 static uint64_t	vtnet_get_counter(if_t, ift_counter);
 
 static int	vtnet_rxq_populate(struct vtnet_rxq *);
 static void	vtnet_rxq_free_mbufs(struct vtnet_rxq *);
 static struct mbuf *
 		vtnet_rx_alloc_buf(struct vtnet_softc *, int , struct mbuf **);
 static int	vtnet_rxq_replace_lro_nomrg_buf(struct vtnet_rxq *,
 		    struct mbuf *, int);
 static int	vtnet_rxq_replace_buf(struct vtnet_rxq *, struct mbuf *, int);
 static int	vtnet_rxq_enqueue_buf(struct vtnet_rxq *, struct mbuf *);
 static int	vtnet_rxq_new_buf(struct vtnet_rxq *);
 static int	vtnet_rxq_csum_needs_csum(struct vtnet_rxq *, struct mbuf *,
 		     uint16_t, int, struct virtio_net_hdr *);
 static int	vtnet_rxq_csum_data_valid(struct vtnet_rxq *, struct mbuf *,
 		     uint16_t, int, struct virtio_net_hdr *);
 static int	vtnet_rxq_csum(struct vtnet_rxq *, struct mbuf *,
 		     struct virtio_net_hdr *);
 static void	vtnet_rxq_discard_merged_bufs(struct vtnet_rxq *, int);
 static void	vtnet_rxq_discard_buf(struct vtnet_rxq *, struct mbuf *);
 static int	vtnet_rxq_merged_eof(struct vtnet_rxq *, struct mbuf *, int);
 static void	vtnet_rxq_input(struct vtnet_rxq *, struct mbuf *,
 		    struct virtio_net_hdr *);
 static int	vtnet_rxq_eof(struct vtnet_rxq *);
 static void	vtnet_rx_vq_process(struct vtnet_rxq *rxq, int tries);
 static void	vtnet_rx_vq_intr(void *);
 static void	vtnet_rxq_tq_intr(void *, int);
 
 static int	vtnet_txq_intr_threshold(struct vtnet_txq *);
 static int	vtnet_txq_below_threshold(struct vtnet_txq *);
 static int	vtnet_txq_notify(struct vtnet_txq *);
 static void	vtnet_txq_free_mbufs(struct vtnet_txq *);
 static int	vtnet_txq_offload_ctx(struct vtnet_txq *, struct mbuf *,
 		    int *, int *, int *);
 static int	vtnet_txq_offload_tso(struct vtnet_txq *, struct mbuf *, int,
 		    int, struct virtio_net_hdr *);
 static struct mbuf *
 		vtnet_txq_offload(struct vtnet_txq *, struct mbuf *,
 		    struct virtio_net_hdr *);
 static int	vtnet_txq_enqueue_buf(struct vtnet_txq *, struct mbuf **,
 		    struct vtnet_tx_header *);
 static int	vtnet_txq_encap(struct vtnet_txq *, struct mbuf **, int);
 #ifdef VTNET_LEGACY_TX
 static void	vtnet_start_locked(struct vtnet_txq *, if_t);
 static void	vtnet_start(if_t);
 #else
 static int	vtnet_txq_mq_start_locked(struct vtnet_txq *, struct mbuf *);
 static int	vtnet_txq_mq_start(if_t, struct mbuf *);
 static void	vtnet_txq_tq_deferred(void *, int);
 #endif
 static void	vtnet_txq_start(struct vtnet_txq *);
 static void	vtnet_txq_tq_intr(void *, int);
 static int	vtnet_txq_eof(struct vtnet_txq *);
 static void	vtnet_tx_vq_intr(void *);
 static void	vtnet_tx_start_all(struct vtnet_softc *);
 
 #ifndef VTNET_LEGACY_TX
 static void	vtnet_qflush(if_t);
 #endif
 
 static int	vtnet_watchdog(struct vtnet_txq *);
 static void	vtnet_accum_stats(struct vtnet_softc *,
 		    struct vtnet_rxq_stats *, struct vtnet_txq_stats *);
 static void	vtnet_tick(void *);
 
 static void	vtnet_start_taskqueues(struct vtnet_softc *);
 static void	vtnet_free_taskqueues(struct vtnet_softc *);
 static void	vtnet_drain_taskqueues(struct vtnet_softc *);
 
 static void	vtnet_drain_rxtx_queues(struct vtnet_softc *);
 static void	vtnet_stop_rendezvous(struct vtnet_softc *);
 static void	vtnet_stop(struct vtnet_softc *);
 static int	vtnet_virtio_reinit(struct vtnet_softc *);
 static void	vtnet_init_rx_filters(struct vtnet_softc *);
 static int	vtnet_init_rx_queues(struct vtnet_softc *);
 static int	vtnet_init_tx_queues(struct vtnet_softc *);
 static int	vtnet_init_rxtx_queues(struct vtnet_softc *);
 static void	vtnet_set_active_vq_pairs(struct vtnet_softc *);
 static void	vtnet_update_rx_offloads(struct vtnet_softc *);
 static int	vtnet_reinit(struct vtnet_softc *);
 static void	vtnet_init_locked(struct vtnet_softc *, int);
 static void	vtnet_init(void *);
 
 static void	vtnet_free_ctrl_vq(struct vtnet_softc *);
 static void	vtnet_exec_ctrl_cmd(struct vtnet_softc *, void *,
 		    struct sglist *, int, int);
 static int	vtnet_ctrl_mac_cmd(struct vtnet_softc *, uint8_t *);
 static int	vtnet_ctrl_guest_offloads(struct vtnet_softc *, uint64_t);
 static int	vtnet_ctrl_mq_cmd(struct vtnet_softc *, uint16_t);
 static int	vtnet_ctrl_rx_cmd(struct vtnet_softc *, uint8_t, bool);
 static int	vtnet_set_promisc(struct vtnet_softc *, bool);
 static int	vtnet_set_allmulti(struct vtnet_softc *, bool);
 static void	vtnet_rx_filter(struct vtnet_softc *);
 static void	vtnet_rx_filter_mac(struct vtnet_softc *);
 static int	vtnet_exec_vlan_filter(struct vtnet_softc *, int, uint16_t);
 static void	vtnet_rx_filter_vlan(struct vtnet_softc *);
 static void	vtnet_update_vlan_filter(struct vtnet_softc *, int, uint16_t);
 static void	vtnet_register_vlan(void *, if_t, uint16_t);
 static void	vtnet_unregister_vlan(void *, if_t, uint16_t);
 
 static void	vtnet_update_speed_duplex(struct vtnet_softc *);
 static int	vtnet_is_link_up(struct vtnet_softc *);
 static void	vtnet_update_link_status(struct vtnet_softc *);
 static int	vtnet_ifmedia_upd(if_t);
 static void	vtnet_ifmedia_sts(if_t, struct ifmediareq *);
 static void	vtnet_get_macaddr(struct vtnet_softc *);
 static void	vtnet_set_macaddr(struct vtnet_softc *);
 static void	vtnet_attached_set_macaddr(struct vtnet_softc *);
 static void	vtnet_vlan_tag_remove(struct mbuf *);
 static void	vtnet_set_rx_process_limit(struct vtnet_softc *);
 
 static void	vtnet_setup_rxq_sysctl(struct sysctl_ctx_list *,
 		    struct sysctl_oid_list *, struct vtnet_rxq *);
 static void	vtnet_setup_txq_sysctl(struct sysctl_ctx_list *,
 		    struct sysctl_oid_list *, struct vtnet_txq *);
 static void	vtnet_setup_queue_sysctl(struct vtnet_softc *);
 static void	vtnet_load_tunables(struct vtnet_softc *);
 static void	vtnet_setup_sysctl(struct vtnet_softc *);
 
 static int	vtnet_rxq_enable_intr(struct vtnet_rxq *);
 static void	vtnet_rxq_disable_intr(struct vtnet_rxq *);
 static int	vtnet_txq_enable_intr(struct vtnet_txq *);
 static void	vtnet_txq_disable_intr(struct vtnet_txq *);
 static void	vtnet_enable_rx_interrupts(struct vtnet_softc *);
 static void	vtnet_enable_tx_interrupts(struct vtnet_softc *);
 static void	vtnet_enable_interrupts(struct vtnet_softc *);
 static void	vtnet_disable_rx_interrupts(struct vtnet_softc *);
 static void	vtnet_disable_tx_interrupts(struct vtnet_softc *);
 static void	vtnet_disable_interrupts(struct vtnet_softc *);
 
 static int	vtnet_tunable_int(struct vtnet_softc *, const char *, int);
 
 DEBUGNET_DEFINE(vtnet);
 
 /*
  * Per-device wrappers around the virtio_htog*()/virtio_gtoh*() helpers.
  * Each one passes vtnet_modern(_sc) as the first argument, so callers only
  * supply the softc and the value to convert.
  */
 #define vtnet_htog16(_sc, _val)	virtio_htog16(vtnet_modern(_sc), _val)
 #define vtnet_htog32(_sc, _val)	virtio_htog32(vtnet_modern(_sc), _val)
 #define vtnet_htog64(_sc, _val)	virtio_htog64(vtnet_modern(_sc), _val)
 #define vtnet_gtoh16(_sc, _val)	virtio_gtoh16(vtnet_modern(_sc), _val)
 #define vtnet_gtoh32(_sc, _val)	virtio_gtoh32(vtnet_modern(_sc), _val)
 #define vtnet_gtoh64(_sc, _val)	virtio_gtoh64(vtnet_modern(_sc), _val)
 
 /*
  * Tunables.
  *
  * Driver-wide settings published under the hw.vtnet sysctl node, all
  * registered with CTLFLAG_RDTUN.  Several of them (e.g. csum_disable,
  * tso_disable, lro_disable, mq_disable, mq_max_pairs) are handed to
  * vtnet_tunable_int() together with the tunable's name when a device
  * negotiates and sets up its features.
  */
 static SYSCTL_NODE(_hw, OID_AUTO, vtnet, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "VirtIO Net driver parameters");
 
 static int vtnet_csum_disable = 0;
 SYSCTL_INT(_hw_vtnet, OID_AUTO, csum_disable, CTLFLAG_RDTUN,
     &vtnet_csum_disable, 0, "Disables receive and send checksum offload");
 
 static int vtnet_fixup_needs_csum = 0;
 SYSCTL_INT(_hw_vtnet, OID_AUTO, fixup_needs_csum, CTLFLAG_RDTUN,
     &vtnet_fixup_needs_csum, 0,
     "Calculate valid checksum for NEEDS_CSUM packets");
 
 static int vtnet_tso_disable = 0;
 SYSCTL_INT(_hw_vtnet, OID_AUTO, tso_disable, CTLFLAG_RDTUN,
     &vtnet_tso_disable, 0, "Disables TSO");
 
 static int vtnet_lro_disable = 0;
 SYSCTL_INT(_hw_vtnet, OID_AUTO, lro_disable, CTLFLAG_RDTUN,
     &vtnet_lro_disable, 0, "Disables hardware LRO");
 
 static int vtnet_mq_disable = 0;
 SYSCTL_INT(_hw_vtnet, OID_AUTO, mq_disable, CTLFLAG_RDTUN,
     &vtnet_mq_disable, 0, "Disables multiqueue support");
 
 /* A value of 0 makes vtnet_setup_features() use mp_ncpus instead. */
 static int vtnet_mq_max_pairs = VTNET_MAX_QUEUE_PAIRS;
 SYSCTL_INT(_hw_vtnet, OID_AUTO, mq_max_pairs, CTLFLAG_RDTUN,
     &vtnet_mq_max_pairs, 0, "Maximum number of multiqueue pairs");
 
 static int vtnet_tso_maxlen = IP_MAXPACKET;
 SYSCTL_INT(_hw_vtnet, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
     &vtnet_tso_maxlen, 0, "TSO burst limit");
 
 static int vtnet_rx_process_limit = 1024;
 SYSCTL_INT(_hw_vtnet, OID_AUTO, rx_process_limit, CTLFLAG_RDTUN,
     &vtnet_rx_process_limit, 0,
     "Number of RX segments processed in one pass");
 
 static int vtnet_lro_entry_count = 128;
 SYSCTL_INT(_hw_vtnet, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
     &vtnet_lro_entry_count, 0, "Software LRO entry count");
 
 /*
  * Enable sorted LRO, and the depth of the mbuf queue.
  *
  * NOTE(review): the variable is a signed int but is exported through
  * SYSCTL_UINT, unlike the other tunables which use SYSCTL_INT -- confirm
  * whether the unsigned sysctl type is intentional.
  */
 static int vtnet_lro_mbufq_depth = 0;
 SYSCTL_UINT(_hw_vtnet, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
     &vtnet_lro_mbufq_depth, 0, "Depth of software LRO mbuf queue");
 
 static uma_zone_t vtnet_tx_header_zone;
 
 /*
  * Feature bit to short-name table.  vtnet_attach() registers it with
  * virtio_set_feature_desc().  The table ends with a { 0, NULL } entry.
  */
 static struct virtio_feature_desc vtnet_feature_desc[] = {
 	{ VIRTIO_NET_F_CSUM,			"TxChecksum"		},
 	{ VIRTIO_NET_F_GUEST_CSUM,		"RxChecksum"		},
 	{ VIRTIO_NET_F_CTRL_GUEST_OFFLOADS,	"CtrlRxOffloads"	},
 	{ VIRTIO_NET_F_MAC,			"MAC"			},
 	{ VIRTIO_NET_F_GSO,			"TxGSO"			},
 	{ VIRTIO_NET_F_GUEST_TSO4,		"RxLROv4"		},
 	{ VIRTIO_NET_F_GUEST_TSO6,		"RxLROv6"		},
 	{ VIRTIO_NET_F_GUEST_ECN,		"RxLROECN"		},
 	{ VIRTIO_NET_F_GUEST_UFO,		"RxUFO"			},
 	{ VIRTIO_NET_F_HOST_TSO4,		"TxTSOv4"		},
 	{ VIRTIO_NET_F_HOST_TSO6,		"TxTSOv6"		},
 	{ VIRTIO_NET_F_HOST_ECN,		"TxTSOECN"		},
 	{ VIRTIO_NET_F_HOST_UFO,		"TxUFO"			},
 	{ VIRTIO_NET_F_MRG_RXBUF,		"MrgRxBuf"		},
 	{ VIRTIO_NET_F_STATUS,			"Status"		},
 	{ VIRTIO_NET_F_CTRL_VQ,			"CtrlVq"		},
 	{ VIRTIO_NET_F_CTRL_RX,			"CtrlRxMode"		},
 	{ VIRTIO_NET_F_CTRL_VLAN,		"CtrlVLANFilter"	},
 	{ VIRTIO_NET_F_CTRL_RX_EXTRA,		"CtrlRxModeExtra"	},
 	{ VIRTIO_NET_F_GUEST_ANNOUNCE,		"GuestAnnounce"		},
 	{ VIRTIO_NET_F_MQ,			"Multiqueue"		},
 	{ VIRTIO_NET_F_CTRL_MAC_ADDR,		"CtrlMacAddr"		},
 	{ VIRTIO_NET_F_SPEED_DUPLEX,		"SpeedDuplex"		},
 
 	{ 0, NULL }
 };
 
 /*
  * Method table referenced by vtnet_driver.methods: maps the generic device
  * entry points and the two VirtIO callbacks onto this driver's handlers.
  */
 static device_method_t vtnet_methods[] = {
 	/* Device methods. */
 	DEVMETHOD(device_probe,			vtnet_probe),
 	DEVMETHOD(device_attach,		vtnet_attach),
 	DEVMETHOD(device_detach,		vtnet_detach),
 	DEVMETHOD(device_suspend,		vtnet_suspend),
 	DEVMETHOD(device_resume,		vtnet_resume),
 	DEVMETHOD(device_shutdown,		vtnet_shutdown),
 
 	/* VirtIO methods. */
 	DEVMETHOD(virtio_attach_completed,	vtnet_attach_completed),
 	DEVMETHOD(virtio_config_change,		vtnet_config_change),
 
 	DEVMETHOD_END
 };
 
 #ifdef DEV_NETMAP
 #include <dev/netmap/if_vtnet_netmap.h>
 #endif
 
 /*
  * Driver description (name, method table, softc size) and its module
  * registration.  vtnet_modevent is passed to VIRTIO_DRIVER_MODULE as the
  * module event handler.  A dependency on netmap is declared only when
  * DEV_NETMAP is defined.
  */
 static driver_t vtnet_driver = {
     .name = "vtnet",
     .methods = vtnet_methods,
     .size = sizeof(struct vtnet_softc)
 };
 VIRTIO_DRIVER_MODULE(vtnet, vtnet_driver, vtnet_modevent, NULL);
 MODULE_VERSION(vtnet, 1);
 MODULE_DEPEND(vtnet, virtio, 1, 1, 1);
 #ifdef DEV_NETMAP
 MODULE_DEPEND(vtnet, netmap, 1, 1, 1);
 #endif
 
 VIRTIO_SIMPLE_PNPINFO(vtnet, VIRTIO_ID_NETWORK, "VirtIO Networking Adapter");
 
 /*
  * Module event handler.
  *
  * Creates the Tx header UMA zone on the first MOD_LOAD and destroys it when
  * the matching last MOD_UNLOAD arrives; a static counter tracks the balance.
  * MOD_QUIESCE reports EBUSY while the zone still has items outstanding.
  * Unknown event types yield EOPNOTSUPP.
  */
 static int
 vtnet_modevent(module_t mod __unused, int type, void *unused __unused)
 {
 	static int load_count = 0;
 
 	switch (type) {
 	case MOD_LOAD:
 		if (load_count++ != 0)
 			return (0);
 		vtnet_tx_header_zone = uma_zcreate("vtnet_tx_hdr",
 		    sizeof(struct vtnet_tx_header),
 		    NULL, NULL, NULL, NULL, 0, 0);
 #ifdef DEBUGNET
 		/*
 		 * We need to allocate from this zone in the transmit path, so
 		 * ensure that we have at least one item per header available.
 		 * XXX add a separate zone like we do for mbufs? otherwise we
 		 * may alloc buckets
 		 */
 		uma_zone_reserve(vtnet_tx_header_zone, DEBUGNET_MAX_IN_FLIGHT * 2);
 		uma_prealloc(vtnet_tx_header_zone, DEBUGNET_MAX_IN_FLIGHT * 2);
 #endif
 		return (0);
 
 	case MOD_QUIESCE:
 		/* Refuse to quiesce while Tx headers are still allocated. */
 		return (uma_zone_get_cur(vtnet_tx_header_zone) > 0 ? EBUSY : 0);
 
 	case MOD_UNLOAD:
 		if (--load_count == 0) {
 			uma_zdestroy(vtnet_tx_header_zone);
 			vtnet_tx_header_zone = NULL;
 		}
 		return (0);
 
 	case MOD_SHUTDOWN:
 		return (0);
 
 	default:
 		return (EOPNOTSUPP);
 	}
 }
 
 /* Device probe method: defers entirely to VIRTIO_SIMPLE_PROBE. */
 static int
 vtnet_probe(device_t dev)
 {
 	int result;
 
 	result = VIRTIO_SIMPLE_PROBE(dev, vtnet);
 	return (result);
 }
 
 /*
  * Device attach method.
  *
  * Initialises the core lock and tick callout, then runs the setup steps in
  * order: interface allocation, sysctl, feature setup, Rx filters, queue
  * state, virtqueues, interface setup and interrupts.  The first step that
  * fails logs a message and the partially built state is torn down through
  * vtnet_detach().  Returns 0 on success or the failing step's error.
  */
 static int
 vtnet_attach(device_t dev)
 {
 	struct vtnet_softc *sc = device_get_softc(dev);
 	int error;
 
 	sc->vtnet_dev = dev;
 	virtio_set_feature_desc(dev, vtnet_feature_desc);
 
 	VTNET_CORE_LOCK_INIT(sc);
 	callout_init_mtx(&sc->vtnet_tick_ch, VTNET_CORE_MTX(sc), 0);
 	vtnet_load_tunables(sc);
 
 	if ((error = vtnet_alloc_interface(sc)) != 0) {
 		device_printf(dev, "cannot allocate interface\n");
 		goto fail;
 	}
 
 	vtnet_setup_sysctl(sc);
 
 	if ((error = vtnet_setup_features(sc)) != 0) {
 		device_printf(dev, "cannot setup features\n");
 		goto fail;
 	}
 
 	if ((error = vtnet_alloc_rx_filters(sc)) != 0) {
 		device_printf(dev, "cannot allocate Rx filters\n");
 		goto fail;
 	}
 
 	if ((error = vtnet_alloc_rxtx_queues(sc)) != 0) {
 		device_printf(dev, "cannot allocate queues\n");
 		goto fail;
 	}
 
 	if ((error = vtnet_alloc_virtqueues(sc)) != 0) {
 		device_printf(dev, "cannot allocate virtqueues\n");
 		goto fail;
 	}
 
 	if ((error = vtnet_setup_interface(sc)) != 0) {
 		device_printf(dev, "cannot setup interface\n");
 		goto fail;
 	}
 
 	if ((error = virtio_setup_intr(dev, INTR_TYPE_NET)) != 0) {
 		device_printf(dev, "cannot setup interrupts\n");
 		/* Undo the Ethernet attach here, before the common teardown. */
 		ether_ifdetach(sc->vtnet_ifp);
 		goto fail;
 	}
 
 #ifdef DEV_NETMAP
 	vtnet_netmap_attach(sc);
 #endif
 	vtnet_start_taskqueues(sc);
 
 	return (0);
 
 fail:
 	/* Only reached with a non-zero error. */
 	vtnet_detach(dev);
 	return (error);
 }
 
 /*
  * Device detach method; also used by vtnet_attach() to unwind a failed
  * attach, so most resources are checked for NULL before being released.
  * Always returns 0.
  */
 static int
 vtnet_detach(device_t dev)
 {
 	struct vtnet_softc *sc;
 	if_t ifp;
 
 	sc = device_get_softc(dev);
 	ifp = sc->vtnet_ifp;
 
 	/*
 	 * Stop the device and detach from the Ethernet layer only when the
 	 * device reports itself as attached.
 	 */
 	if (device_is_attached(dev)) {
 		VTNET_CORE_LOCK(sc);
 		vtnet_stop(sc);
 		VTNET_CORE_UNLOCK(sc);
 
 		callout_drain(&sc->vtnet_tick_ch);
 		vtnet_drain_taskqueues(sc);
 
 		ether_ifdetach(ifp);
 	}
 
 #ifdef DEV_NETMAP
 	netmap_detach(ifp);
 #endif
 
 	if (sc->vtnet_pfil != NULL) {
 		pfil_head_unregister(sc->vtnet_pfil);
 		sc->vtnet_pfil = NULL;
 	}
 
 	vtnet_free_taskqueues(sc);
 
 	/* Drop the VLAN config/unconfig event handlers if registered. */
 	if (sc->vtnet_vlan_attach != NULL) {
 		EVENTHANDLER_DEREGISTER(vlan_config, sc->vtnet_vlan_attach);
 		sc->vtnet_vlan_attach = NULL;
 	}
 	if (sc->vtnet_vlan_detach != NULL) {
 		EVENTHANDLER_DEREGISTER(vlan_unconfig, sc->vtnet_vlan_detach);
 		sc->vtnet_vlan_detach = NULL;
 	}
 
 	ifmedia_removeall(&sc->vtnet_media);
 
 	if (ifp != NULL) {
 		if_free(ifp);
 		sc->vtnet_ifp = NULL;
 	}
 
 	/* Release per-queue state and filter tables. */
 	vtnet_free_rxtx_queues(sc);
 	vtnet_free_rx_filters(sc);
 
 	if (sc->vtnet_ctrl_vq != NULL)
 		vtnet_free_ctrl_vq(sc);
 
 	/* The core lock goes last. */
 	VTNET_CORE_LOCK_DESTROY(sc);
 
 	return (0);
 }
 
 /*
  * Device suspend method: stops the interface under the core lock and marks
  * the softc as suspended.  Always returns 0.
  */
 static int
 vtnet_suspend(device_t dev)
 {
 	struct vtnet_softc *sc = device_get_softc(dev);
 
 	VTNET_CORE_LOCK(sc);
 	vtnet_stop(sc);
 	sc->vtnet_flags |= VTNET_FLAG_SUSPENDED;
 	VTNET_CORE_UNLOCK(sc);
 
 	return (0);
 }
 
 /*
  * Device resume method: re-initialises the interface if it is flagged
  * IFF_UP, then clears the suspended flag.  Always returns 0.
  */
 static int
 vtnet_resume(device_t dev)
 {
 	struct vtnet_softc *sc = device_get_softc(dev);
 	if_t ifp = sc->vtnet_ifp;
 
 	VTNET_CORE_LOCK(sc);
 	if ((if_getflags(ifp) & IFF_UP) != 0)
 		vtnet_init_locked(sc, 0);
 	sc->vtnet_flags &= ~VTNET_FLAG_SUSPENDED;
 	VTNET_CORE_UNLOCK(sc);
 
 	return (0);
 }
 
 /*
  * Device shutdown method.  Shutdown needs exactly what suspend does; the
  * only difference is that no resume is expected afterwards.
  */
 static int
 vtnet_shutdown(device_t dev)
 {
 	int error;
 
 	error = vtnet_suspend(dev);
 	return (error);
 }
 
 /*
  * VirtIO attach-completed callback: runs vtnet_attached_set_macaddr() under
  * the core lock.  Always returns 0.
  */
 static int
 vtnet_attach_completed(device_t dev)
 {
 	struct vtnet_softc *sc = device_get_softc(dev);
 
 	VTNET_CORE_LOCK(sc);
 	vtnet_attached_set_macaddr(sc);
 	VTNET_CORE_UNLOCK(sc);
 
 	return (0);
 }
 
 /*
  * VirtIO config-change callback: refreshes the link status and, if the link
  * is active afterwards, restarts transmission on all queues.  Runs under the
  * core lock and always returns 0.
  */
 static int
 vtnet_config_change(device_t dev)
 {
 	struct vtnet_softc *sc = device_get_softc(dev);
 
 	VTNET_CORE_LOCK(sc);
 	vtnet_update_link_status(sc);
 	if (sc->vtnet_link_active)
 		vtnet_tx_start_all(sc);
 	VTNET_CORE_UNLOCK(sc);
 
 	return (0);
 }
 
 /*
  * Negotiate the feature set with the host.
  *
  * Starts from the modern or legacy feature mask, removes whatever the
  * tunables disable, negotiates, and then re-negotiates with a reduced mask
  * whenever a negotiated feature turns out to be unusable (bad MTU, bad queue
  * pair count, LRO without mergeable buffers or indirect descriptors).  The
  * final result is stored in the softc.  Returns the value of
  * virtio_finalize_features().
  */
 static int
 vtnet_negotiate_features(struct vtnet_softc *sc)
 {
 	device_t dev;
 	uint64_t features, negotiated_features;
 	int no_csum;
 
 	dev = sc->vtnet_dev;
 	features = virtio_bus_is_modern(dev) ? VTNET_MODERN_FEATURES :
 	    VTNET_LEGACY_FEATURES;
 
 	/*
 	 * TSO and LRO are only available when their corresponding checksum
 	 * offload feature is also negotiated.
 	 */
 	no_csum = vtnet_tunable_int(sc, "csum_disable", vtnet_csum_disable);
 	if (no_csum)
 		features &= ~(VIRTIO_NET_F_CSUM | VIRTIO_NET_F_GUEST_CSUM);
 	if (no_csum || vtnet_tunable_int(sc, "tso_disable", vtnet_tso_disable))
 		features &= ~VTNET_TSO_FEATURES;
 	if (no_csum || vtnet_tunable_int(sc, "lro_disable", vtnet_lro_disable))
 		features &= ~VTNET_LRO_FEATURES;
 
 	/* Multiqueue is never offered when built with VTNET_LEGACY_TX. */
 #ifndef VTNET_LEGACY_TX
 	if (vtnet_tunable_int(sc, "mq_disable", vtnet_mq_disable))
 		features &= ~VIRTIO_NET_F_MQ;
 #else
 	features &= ~VIRTIO_NET_F_MQ;
 #endif
 
 	negotiated_features = virtio_negotiate_features(dev, features);
 
 	/* Drop the MTU feature if the device advertises a too-small MTU. */
 	if (virtio_with_feature(dev, VIRTIO_NET_F_MTU)) {
 		uint16_t mtu;
 
 		mtu = virtio_read_dev_config_2(dev,
 		    offsetof(struct virtio_net_config, mtu));
 		if (mtu < VTNET_MIN_MTU /* || mtu > VTNET_MAX_MTU */) {
 			device_printf(dev, "Invalid MTU value: %d. "
 			    "MTU feature disabled.\n", mtu);
 			features &= ~VIRTIO_NET_F_MTU;
 			negotiated_features =
 			    virtio_negotiate_features(dev, features);
 		}
 	}
 
 	/* Drop multiqueue if the advertised pair count is out of range. */
 	if (virtio_with_feature(dev, VIRTIO_NET_F_MQ)) {
 		uint16_t npairs;
 
 		npairs = virtio_read_dev_config_2(dev,
 		    offsetof(struct virtio_net_config, max_virtqueue_pairs));
 		if (npairs < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
 		    npairs > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX) {
 			device_printf(dev, "Invalid max_virtqueue_pairs value: "
 			    "%d. Multiqueue feature disabled.\n", npairs);
 			features &= ~VIRTIO_NET_F_MQ;
 			negotiated_features =
 			    virtio_negotiate_features(dev, features);
 		}
 	}
 
 	if (virtio_with_feature(dev, VTNET_LRO_FEATURES) &&
 	    virtio_with_feature(dev, VIRTIO_NET_F_MRG_RXBUF) == 0) {
 		/*
 		 * LRO without mergeable buffers requires special care. This
 		 * is not ideal because every receive buffer must be large
 		 * enough to hold the maximum TCP packet, the Ethernet header,
 		 * and the header. This requires up to 34 descriptors with
 		 * MCLBYTES clusters. If we do not have indirect descriptors,
 		 * LRO is disabled since the virtqueue will not contain very
 		 * many receive buffers.
 		 */
 		if (!virtio_with_feature(dev, VIRTIO_RING_F_INDIRECT_DESC)) {
 			device_printf(dev,
 			    "Host LRO disabled since both mergeable buffers "
 			    "and indirect descriptors were not negotiated\n");
 			features &= ~VTNET_LRO_FEATURES;
 			negotiated_features =
 			    virtio_negotiate_features(dev, features);
 		} else
 			sc->vtnet_flags |= VTNET_FLAG_LRO_NOMRG;
 	}
 
 	/* Both softc fields start out holding the same negotiated set. */
 	sc->vtnet_features = negotiated_features;
 	sc->vtnet_negotiated_features = negotiated_features;
 
 	return (virtio_finalize_features(dev));
 }
 
 /*
  * Negotiate features and translate the result into softc state: driver
  * flags, maximum MTU, header size, Rx/Tx segment counts and the requested
  * and maximum number of virtqueue pairs.  Returns the error from
  * vtnet_negotiate_features(), or 0.
  */
 static int
 vtnet_setup_features(struct vtnet_softc *sc)
 {
 	device_t dev;
 	int error;
 
 	dev = sc->vtnet_dev;
 
 	error = vtnet_negotiate_features(sc);
 	if (error)
 		return (error);
 
 	if (virtio_with_feature(dev, VIRTIO_F_VERSION_1))
 		sc->vtnet_flags |= VTNET_FLAG_MODERN;
 	if (virtio_with_feature(dev, VIRTIO_RING_F_INDIRECT_DESC))
 		sc->vtnet_flags |= VTNET_FLAG_INDIRECT;
 	if (virtio_with_feature(dev, VIRTIO_RING_F_EVENT_IDX))
 		sc->vtnet_flags |= VTNET_FLAG_EVENT_IDX;
 
 	if (virtio_with_feature(dev, VIRTIO_NET_F_MAC)) {
 		/* This feature should always be negotiated. */
 		sc->vtnet_flags |= VTNET_FLAG_MAC;
 	}
 
 	/* Use the device-provided MTU limit when the feature is present. */
 	if (virtio_with_feature(dev, VIRTIO_NET_F_MTU)) {
 		sc->vtnet_max_mtu = virtio_read_dev_config_2(dev,
 		    offsetof(struct virtio_net_config, mtu));
 	} else
 		sc->vtnet_max_mtu = VTNET_MAX_MTU;
 
 	/* Pick the header layout that matches the negotiated features. */
 	if (virtio_with_feature(dev, VIRTIO_NET_F_MRG_RXBUF)) {
 		sc->vtnet_flags |= VTNET_FLAG_MRG_RXBUFS;
 		sc->vtnet_hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf);
 	} else if (vtnet_modern(sc)) {
 		/* This is identical to the mergeable header. */
 		sc->vtnet_hdr_size = sizeof(struct virtio_net_hdr_v1);
 	} else
 		sc->vtnet_hdr_size = sizeof(struct virtio_net_hdr);
 
 	/* Number of Rx scatter/gather segments per buffer. */
 	if (vtnet_modern(sc) || sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS)
 		sc->vtnet_rx_nsegs = VTNET_RX_SEGS_HDR_INLINE;
 	else if (sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG)
 		sc->vtnet_rx_nsegs = VTNET_RX_SEGS_LRO_NOMRG;
 	else
 		sc->vtnet_rx_nsegs = VTNET_RX_SEGS_HDR_SEPARATE;
 
 	/*
 	 * Favor "hardware" LRO if negotiated, but support software LRO as
 	 * a fallback; there is usually little benefit (or worse) with both.
 	 */
 	if (virtio_with_feature(dev, VIRTIO_NET_F_GUEST_TSO4) == 0 &&
 	    virtio_with_feature(dev, VIRTIO_NET_F_GUEST_TSO6) == 0)
 		sc->vtnet_flags |= VTNET_FLAG_SW_LRO;
 
 	/* Any GSO/TSO transmit feature selects the larger Tx segment count. */
 	if (virtio_with_feature(dev, VIRTIO_NET_F_GSO) ||
 	    virtio_with_feature(dev, VIRTIO_NET_F_HOST_TSO4) ||
 	    virtio_with_feature(dev, VIRTIO_NET_F_HOST_TSO6))
 		sc->vtnet_tx_nsegs = VTNET_TX_SEGS_MAX;
 	else
 		sc->vtnet_tx_nsegs = VTNET_TX_SEGS_MIN;
 
 	/* Default to a single queue pair until multiqueue is confirmed. */
 	sc->vtnet_req_vq_pairs = 1;
 	sc->vtnet_max_vq_pairs = 1;
 
 	/* The remaining control features are only checked with a control vq. */
 	if (virtio_with_feature(dev, VIRTIO_NET_F_CTRL_VQ)) {
 		sc->vtnet_flags |= VTNET_FLAG_CTRL_VQ;
 
 		if (virtio_with_feature(dev, VIRTIO_NET_F_CTRL_RX))
 			sc->vtnet_flags |= VTNET_FLAG_CTRL_RX;
 		if (virtio_with_feature(dev, VIRTIO_NET_F_CTRL_VLAN))
 			sc->vtnet_flags |= VTNET_FLAG_VLAN_FILTER;
 		if (virtio_with_feature(dev, VIRTIO_NET_F_CTRL_MAC_ADDR))
 			sc->vtnet_flags |= VTNET_FLAG_CTRL_MAC;
 
 		if (virtio_with_feature(dev, VIRTIO_NET_F_MQ)) {
 			sc->vtnet_max_vq_pairs = virtio_read_dev_config_2(dev,
 			    offsetof(struct virtio_net_config,
 			    max_virtqueue_pairs));
 		}
 	}
 
 	if (sc->vtnet_max_vq_pairs > 1) {
 		int req;
 
 		/*
 		 * Limit the maximum number of requested queue pairs to the
 		 * number of CPUs and the configured maximum.
 		 */
 		req = vtnet_tunable_int(sc, "mq_max_pairs", vtnet_mq_max_pairs);
 		/* Negative means one pair; zero means one pair per CPU. */
 		if (req < 0)
 			req = 1;
 		if (req == 0)
 			req = mp_ncpus;
 		if (req > sc->vtnet_max_vq_pairs)
 			req = sc->vtnet_max_vq_pairs;
 		if (req > mp_ncpus)
 			req = mp_ncpus;
 		/* Multiqueue is only flagged when more than one pair remains. */
 		if (req > 1) {
 			sc->vtnet_req_vq_pairs = req;
 			sc->vtnet_flags |= VTNET_FLAG_MQ;
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Initialise receive queue `id`: name, mutex, back-pointers, scatter/gather
  * list, software LRO state (when in use) and the interrupt taskqueue.
  * Returns 0, or ENOMEM if any allocation fails; partially initialised state
  * is left in place for the caller's teardown path.
  */
 static int
 vtnet_init_rxq(struct vtnet_softc *sc, int id)
 {
 	struct vtnet_rxq *rxq = &sc->vtnet_rxqs[id];
 
 	snprintf(rxq->vtnrx_name, sizeof(rxq->vtnrx_name), "%s-rx%d",
 	    device_get_nameunit(sc->vtnet_dev), id);
 	mtx_init(&rxq->vtnrx_mtx, rxq->vtnrx_name, NULL, MTX_DEF);
 
 	rxq->vtnrx_sc = sc;
 	rxq->vtnrx_id = id;
 
 	if ((rxq->vtnrx_sg = sglist_alloc(sc->vtnet_rx_nsegs, M_NOWAIT)) == NULL)
 		return (ENOMEM);
 
 #if defined(INET) || defined(INET6)
 	if (vtnet_software_lro(sc) &&
 	    tcp_lro_init_args(&rxq->vtnrx_lro, sc->vtnet_ifp,
 	    sc->vtnet_lro_entry_count, sc->vtnet_lro_mbufq_depth) != 0)
 		return (ENOMEM);
 #endif
 
 	NET_TASK_INIT(&rxq->vtnrx_intrtask, 0, vtnet_rxq_tq_intr, rxq);
 	rxq->vtnrx_tq = taskqueue_create(rxq->vtnrx_name, M_NOWAIT,
 	    taskqueue_thread_enqueue, &rxq->vtnrx_tq);
 	if (rxq->vtnrx_tq == NULL)
 		return (ENOMEM);
 
 	return (0);
 }
 
 /*
  * Initialise transmit queue `id`: name, mutex, back-pointers, scatter/gather
  * list, the buf_ring and deferred task (unless VTNET_LEGACY_TX) and the
  * interrupt taskqueue.  Returns 0, or ENOMEM if any allocation fails.
  */
 static int
 vtnet_init_txq(struct vtnet_softc *sc, int id)
 {
 	struct vtnet_txq *txq = &sc->vtnet_txqs[id];
 
 	snprintf(txq->vtntx_name, sizeof(txq->vtntx_name), "%s-tx%d",
 	    device_get_nameunit(sc->vtnet_dev), id);
 	mtx_init(&txq->vtntx_mtx, txq->vtntx_name, NULL, MTX_DEF);
 
 	txq->vtntx_sc = sc;
 	txq->vtntx_id = id;
 
 	if ((txq->vtntx_sg = sglist_alloc(sc->vtnet_tx_nsegs, M_NOWAIT)) == NULL)
 		return (ENOMEM);
 
 #ifndef VTNET_LEGACY_TX
 	if ((txq->vtntx_br = buf_ring_alloc(VTNET_DEFAULT_BUFRING_SIZE,
 	    M_DEVBUF, M_NOWAIT, &txq->vtntx_mtx)) == NULL)
 		return (ENOMEM);
 
 	TASK_INIT(&txq->vtntx_defrtask, 0, vtnet_txq_tq_deferred, txq);
 #endif
 	TASK_INIT(&txq->vtntx_intrtask, 0, vtnet_txq_tq_intr, txq);
 	txq->vtntx_tq = taskqueue_create(txq->vtntx_name, M_NOWAIT,
 	    taskqueue_thread_enqueue, &txq->vtntx_tq);
 
 	return (txq->vtntx_tq == NULL ? ENOMEM : 0);
 }
 
 static int
 vtnet_alloc_rxtx_queues(struct vtnet_softc *sc)
 {
 	int i, npairs, error;
 
 	npairs = sc->vtnet_max_vq_pairs;
 
 	sc->vtnet_rxqs = malloc(sizeof(struct vtnet_rxq) * npairs, M_DEVBUF,
 	    M_NOWAIT | M_ZERO);
 	sc->vtnet_txqs = malloc(sizeof(struct vtnet_txq) * npairs, M_DEVBUF,
 	    M_NOWAIT | M_ZERO);
 	if (sc->vtnet_rxqs == NULL || sc->vtnet_txqs == NULL)
 		return (ENOMEM);
 
 	for (i = 0; i < npairs; i++) {
 		error = vtnet_init_rxq(sc, i);
 		if (error)
 			return (error);
 		error = vtnet_init_txq(sc, i);
 		if (error)
 			return (error);
 	}
 
 	vtnet_set_rx_process_limit(sc);
 	vtnet_setup_queue_sysctl(sc);
 
 	return (0);
 }
 
 /*
  * Release what vtnet_init_rxq() set up for one receive queue: LRO state,
  * the scatter/gather list and the mutex.  The sglist and mutex are only
  * touched if they were actually created.
  */
 static void
 vtnet_destroy_rxq(struct vtnet_rxq *rxq)
 {
 
 	rxq->vtnrx_sc = NULL;
 	rxq->vtnrx_id = -1;
 
 #if defined(INET) || defined(INET6)
 	tcp_lro_free(&rxq->vtnrx_lro);
 #endif
 
 	if (rxq->vtnrx_sg != NULL) {
 		sglist_free(rxq->vtnrx_sg);
 		rxq->vtnrx_sg = NULL;
 	}
 
 	if (mtx_initialized(&rxq->vtnrx_mtx))
 		mtx_destroy(&rxq->vtnrx_mtx);
 }
 
 /*
  * Release what vtnet_init_txq() set up for one transmit queue: the
  * scatter/gather list, the buf_ring (unless VTNET_LEGACY_TX) and the mutex.
  * Each resource is only released if it was actually created.
  */
 static void
 vtnet_destroy_txq(struct vtnet_txq *txq)
 {
 
 	txq->vtntx_sc = NULL;
 	txq->vtntx_id = -1;
 
 	if (txq->vtntx_sg != NULL) {
 		sglist_free(txq->vtntx_sg);
 		txq->vtntx_sg = NULL;
 	}
 
 #ifndef VTNET_LEGACY_TX
 	if (txq->vtntx_br != NULL) {
 		buf_ring_free(txq->vtntx_br, M_DEVBUF);
 		txq->vtntx_br = NULL;
 	}
 #endif
 
 	if (mtx_initialized(&txq->vtntx_mtx))
 		mtx_destroy(&txq->vtntx_mtx);
 }
 
 /*
  * Destroy every Rx and Tx queue and free the two queue arrays.  Either
  * array may be NULL, in which case it is skipped.
  */
 static void
 vtnet_free_rxtx_queues(struct vtnet_softc *sc)
 {
 	const int npairs = sc->vtnet_max_vq_pairs;
 	int qid;
 
 	if (sc->vtnet_rxqs != NULL) {
 		for (qid = 0; qid < npairs; qid++)
 			vtnet_destroy_rxq(sc->vtnet_rxqs + qid);
 		free(sc->vtnet_rxqs, M_DEVBUF);
 		sc->vtnet_rxqs = NULL;
 	}
 
 	if (sc->vtnet_txqs != NULL) {
 		for (qid = 0; qid < npairs; qid++)
 			vtnet_destroy_txq(sc->vtnet_txqs + qid);
 		free(sc->vtnet_txqs, M_DEVBUF);
 		sc->vtnet_txqs = NULL;
 	}
 }
 
 /*
  * Allocate the zeroed MAC and VLAN filter tables, each only when its
  * corresponding driver flag is set.  Returns 0 or ENOMEM.
  */
 static int
 vtnet_alloc_rx_filters(struct vtnet_softc *sc)
 {
 
 	if ((sc->vtnet_flags & VTNET_FLAG_CTRL_RX) != 0) {
 		sc->vtnet_mac_filter = malloc(sizeof(struct vtnet_mac_filter),
 		    M_DEVBUF, M_NOWAIT | M_ZERO);
 		if (sc->vtnet_mac_filter == NULL)
 			return (ENOMEM);
 	}
 
 	if ((sc->vtnet_flags & VTNET_FLAG_VLAN_FILTER) != 0) {
 		sc->vtnet_vlan_filter = malloc(sizeof(uint32_t) *
 		    VTNET_VLAN_FILTER_NWORDS, M_DEVBUF, M_NOWAIT | M_ZERO);
 		if (sc->vtnet_vlan_filter == NULL)
 			return (ENOMEM);
 	}
 
 	return (0);
 }
 
 /* Free the MAC and VLAN filter tables, if allocated, and clear the pointers. */
 static void
 vtnet_free_rx_filters(struct vtnet_softc *sc)
 {
 
 	if (sc->vtnet_mac_filter != NULL) {
 		free(sc->vtnet_mac_filter, M_DEVBUF);
 		sc->vtnet_mac_filter = NULL;
 	}
 
 	if (sc->vtnet_vlan_filter != NULL) {
 		free(sc->vtnet_vlan_filter, M_DEVBUF);
 		sc->vtnet_vlan_filter = NULL;
 	}
 }
 
 /*
  * Describe and allocate the device's virtqueues.
  *
  * Builds a temporary vq_alloc_info array laid out as rx0, tx0, rx1, tx1, ...
  * followed by the control queue when VTNET_FLAG_CTRL_VQ is set, passes it to
  * virtio_alloc_virtqueues(), and frees the array.  Returns ENOMEM if the
  * array cannot be allocated, otherwise the result of the allocation call.
  */
 static int
 vtnet_alloc_virtqueues(struct vtnet_softc *sc)
 {
 	device_t dev;
 	struct vq_alloc_info *info;
 	struct vtnet_rxq *rxq;
 	struct vtnet_txq *txq;
 	int i, idx, flags, nvqs, error;
 
 	dev = sc->vtnet_dev;
 	flags = 0;
 
 	/* One Rx and one Tx queue per pair, plus the optional control queue. */
 	nvqs = sc->vtnet_max_vq_pairs * 2;
 	if (sc->vtnet_flags & VTNET_FLAG_CTRL_VQ)
 		nvqs++;
 
 	info = malloc(sizeof(struct vq_alloc_info) * nvqs, M_TEMP, M_NOWAIT);
 	if (info == NULL)
 		return (ENOMEM);
 
 	/* Requested pairs get interrupt handlers and full segment counts. */
 	for (i = 0, idx = 0; i < sc->vtnet_req_vq_pairs; i++, idx += 2) {
 		rxq = &sc->vtnet_rxqs[i];
 		VQ_ALLOC_INFO_INIT(&info[idx], sc->vtnet_rx_nsegs,
 		    vtnet_rx_vq_intr, rxq, &rxq->vtnrx_vq,
 		    "%s-rx%d", device_get_nameunit(dev), rxq->vtnrx_id);
 
 		txq = &sc->vtnet_txqs[i];
 		VQ_ALLOC_INFO_INIT(&info[idx+1], sc->vtnet_tx_nsegs,
 		    vtnet_tx_vq_intr, txq, &txq->vtntx_vq,
 		    "%s-tx%d", device_get_nameunit(dev), txq->vtntx_id);
 	}
 
 	/* These queues will not be used so allocate the minimum resources. */
 	for (/**/; i < sc->vtnet_max_vq_pairs; i++, idx += 2) {
 		rxq = &sc->vtnet_rxqs[i];
 		VQ_ALLOC_INFO_INIT(&info[idx], 0, NULL, rxq, &rxq->vtnrx_vq,
 		    "%s-rx%d", device_get_nameunit(dev), rxq->vtnrx_id);
 
 		txq = &sc->vtnet_txqs[i];
 		VQ_ALLOC_INFO_INIT(&info[idx+1], 0, NULL, txq, &txq->vtntx_vq,
 		    "%s-tx%d", device_get_nameunit(dev), txq->vtntx_id);
 	}
 
 	/* idx now points just past the last Rx/Tx pair. */
 	if (sc->vtnet_flags & VTNET_FLAG_CTRL_VQ) {
 		VQ_ALLOC_INFO_INIT(&info[idx], 0, NULL, NULL,
 		    &sc->vtnet_ctrl_vq, "%s ctrl", device_get_nameunit(dev));
 	}
 
 	/*
 	 * TODO: Enable interrupt binding if this is multiqueue. This will
 	 * only matter when per-virtqueue MSIX is available.
 	 */
 	/* Placeholder: OR-ing in 0 leaves flags unchanged for now. */
 	if (sc->vtnet_flags & VTNET_FLAG_MQ)
 		flags |= 0;
 
 	error = virtio_alloc_virtqueues(dev, flags, nvqs, info);
 	free(info, M_TEMP);
 
 	return (error);
 }
 
 /*
  * Allocate the Ethernet ifnet for this device, record it in the softc,
  * point it back at the softc and name it after the device name and unit.
  *
  * Returns 0 on success or ENOMEM when if_alloc() returns NULL.
  */
 static int
 vtnet_alloc_interface(struct vtnet_softc *sc)
 {
 	device_t dev = sc->vtnet_dev;
 	if_t ifp;
 
 	if ((ifp = if_alloc(IFT_ETHER)) == NULL)
 		return (ENOMEM);
 
 	sc->vtnet_ifp = ifp;
 	if_setsoftc(ifp, sc);
 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
 
 	return (0);
 }
 
 /*
  * Configure the previously allocated ifnet and attach it to the network
  * stack: install the driver callbacks, fetch the MAC address, set up
  * ifmedia, derive the interface capabilities from the VirtIO features
  * reported by virtio_with_feature(), call ether_ifattach() and register
  * an inbound Ethernet pfil head.
  *
  * Always returns 0.
  */
 static int
 vtnet_setup_interface(struct vtnet_softc *sc)
 {
 	device_t dev;
 	struct pfil_head_args pa;
 	if_t ifp;
 
 	dev = sc->vtnet_dev;
 	ifp = sc->vtnet_ifp;
 
-	if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST |
-	    IFF_KNOWSEPOCH);
+	if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST);
 	if_setbaudrate(ifp, IF_Gbps(10));
 	if_setinitfn(ifp, vtnet_init);
 	if_setioctlfn(ifp, vtnet_ioctl);
 	if_setgetcounterfn(ifp, vtnet_get_counter);
 	/* Pick the transmit entry points for the configured Tx model. */
 #ifndef VTNET_LEGACY_TX
 	if_settransmitfn(ifp, vtnet_txq_mq_start);
 	if_setqflushfn(ifp, vtnet_qflush);
 #else
 	struct virtqueue *vq = sc->vtnet_txqs[0].vtntx_vq;
 	if_setstartfn(ifp, vtnet_start);
 	if_setsendqlen(ifp, virtqueue_size(vq) - 1);
 	if_setsendqready(ifp);
 #endif
 
 	vtnet_get_macaddr(sc);
 
 	if (virtio_with_feature(dev, VIRTIO_NET_F_STATUS))
 		if_setcapabilitiesbit(ifp, IFCAP_LINKSTATE, 0);
 
 	/* A single autoselect media entry is the only one offered. */
 	ifmedia_init(&sc->vtnet_media, 0, vtnet_ifmedia_upd, vtnet_ifmedia_sts);
 	ifmedia_add(&sc->vtnet_media, IFM_ETHER | IFM_AUTO, 0, NULL);
 	ifmedia_set(&sc->vtnet_media, IFM_ETHER | IFM_AUTO);
 
 	/* Transmit checksum offload, and TSO which is only set up with it. */
 	if (virtio_with_feature(dev, VIRTIO_NET_F_CSUM)) {
 		int gso;
 
 		if_setcapabilitiesbit(ifp, IFCAP_TXCSUM | IFCAP_TXCSUM_IPV6, 0);
 
 		/* VIRTIO_NET_F_GSO alone enables TSO4, TSO6 and TSO ECN. */
 		gso = virtio_with_feature(dev, VIRTIO_NET_F_GSO);
 		if (gso || virtio_with_feature(dev, VIRTIO_NET_F_HOST_TSO4))
 			if_setcapabilitiesbit(ifp, IFCAP_TSO4, 0);
 		if (gso || virtio_with_feature(dev, VIRTIO_NET_F_HOST_TSO6))
 			if_setcapabilitiesbit(ifp, IFCAP_TSO6, 0);
 		if (gso || virtio_with_feature(dev, VIRTIO_NET_F_HOST_ECN))
 			sc->vtnet_flags |= VTNET_FLAG_TSO_ECN;
 
 		if (if_getcapabilities(ifp) & (IFCAP_TSO4 | IFCAP_TSO6)) {
 			int tso_maxlen;
 
 			if_setcapabilitiesbit(ifp, IFCAP_VLAN_HWTSO, 0);
 
 			/*
 			 * The TSO limit is the "tso_maxlen" tunable less the
 			 * Ethernet header and VLAN encapsulation lengths.
 			 */
 			tso_maxlen = vtnet_tunable_int(sc, "tso_maxlen",
 			    vtnet_tso_maxlen);
 			if_sethwtsomax(ifp, tso_maxlen -
 			    (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN));
 			if_sethwtsomaxsegcount(ifp, sc->vtnet_tx_nsegs - 1);
 			if_sethwtsomaxsegsize(ifp, PAGE_SIZE);
 		}
 	}
 
 	/* Receive checksum offload and LRO. */
 	if (virtio_with_feature(dev, VIRTIO_NET_F_GUEST_CSUM)) {
 		if_setcapabilitiesbit(ifp, IFCAP_RXCSUM, 0);
 #ifdef notyet
 		/* BMV: Rx checksums not distinguished between IPv4 and IPv6. */
 		if_setcapabilitiesbit(ifp, IFCAP_RXCSUM_IPV6, 0);
 #endif
 
 		if (vtnet_tunable_int(sc, "fixup_needs_csum",
 		    vtnet_fixup_needs_csum) != 0)
 			sc->vtnet_flags |= VTNET_FLAG_FIXUP_NEEDS_CSUM;
 
 		/* Support either "hardware" or software LRO. */
 		if_setcapabilitiesbit(ifp, IFCAP_LRO, 0);
 	}
 
 	if (if_getcapabilities(ifp) & (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6)) {
 		/*
 		 * VirtIO does not support VLAN tagging, but we can fake
 		 * it by inserting and removing the 802.1Q header during
 		 * transmit and receive. We are then able to do checksum
 		 * offloading of VLAN frames.
 		 */
 		if_setcapabilitiesbit(ifp, IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM, 0);
 	}
 
 	if (sc->vtnet_max_mtu >= ETHERMTU_JUMBO)
 		if_setcapabilitiesbit(ifp, IFCAP_JUMBO_MTU, 0);
 	if_setcapabilitiesbit(ifp, IFCAP_VLAN_MTU, 0);
 
 	/*
 	 * Capabilities after here are not enabled by default.
 	 */
 	if_setcapenable(ifp, if_getcapabilities(ifp));
 
 	if (sc->vtnet_flags & VTNET_FLAG_VLAN_FILTER) {
 		if_setcapabilitiesbit(ifp, IFCAP_VLAN_HWFILTER, 0);
 
 		/* Register for VLAN configuration events. */
 		sc->vtnet_vlan_attach = EVENTHANDLER_REGISTER(vlan_config,
 		    vtnet_register_vlan, sc, EVENTHANDLER_PRI_FIRST);
 		sc->vtnet_vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig,
 		    vtnet_unregister_vlan, sc, EVENTHANDLER_PRI_FIRST);
 	}
 
 	ether_ifattach(ifp, sc->vtnet_hwaddr);
 
 	/* Tell the upper layer(s) we support long frames. */
 	if_setifheaderlen(ifp, sizeof(struct ether_vlan_header));
 
 	DEBUGNET_SET(ifp, vtnet);
 
 	/* Register an inbound Ethernet pfil head named after the interface. */
 	pa.pa_version = PFIL_VERSION;
 	pa.pa_flags = PFIL_IN;
 	pa.pa_type = PFIL_TYPE_ETHERNET;
 	pa.pa_headname = if_name(ifp);
 	sc->vtnet_pfil = pfil_head_register(&pa);
 
 	return (0);
 }
 
 /*
  * Select the receive mbuf cluster size for the given MTU.
  *
  * With mergeable Rx buffers the size is always MJUMPAGESIZE; with LRO but
  * no mergeable buffers it is always MCLBYTES. Otherwise the smallest of
  * MCLBYTES, MJUMPAGESIZE and MJUM9BYTES (tried in that order) that holds
  * the receive header, an Ethernet VLAN header and the MTU is returned,
  * falling back to MCLBYTES when none is large enough.
  */
 static int
 vtnet_rx_cluster_size(struct vtnet_softc *sc, int mtu)
 {
 	int hdrsz, framesz;
 
 	if ((sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS) != 0)
 		return (MJUMPAGESIZE);
 	if ((sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG) != 0)
 		return (MCLBYTES);
 
 	/*
 	 * Try to scale the receive mbuf cluster size from the MTU. We
 	 * could also use the VQ size to influence the selected size,
 	 * but that would only matter for very small queues.
 	 */
 	if (vtnet_modern(sc)) {
 		MPASS(sc->vtnet_hdr_size == sizeof(struct virtio_net_hdr_v1));
 		hdrsz = sizeof(struct virtio_net_hdr_v1);
 	} else
 		hdrsz = sizeof(struct vtnet_rx_header);
 	framesz = hdrsz + sizeof(struct ether_vlan_header) + mtu;
 
 	if (framesz > MCLBYTES) {
 		if (framesz <= MJUMPAGESIZE)
 			return (MJUMPAGESIZE);
 		if (framesz <= MJUM9BYTES)
 			return (MJUM9BYTES);
 	}
 
 	/* Fits a standard cluster, or sane default; avoid 16KB clusters. */
 	return (MCLBYTES);
 }
 
 /*
  * Change the interface MTU. The core lock must be held.
  *
  * Returns 0 when the MTU is unchanged or was applied, and EINVAL when the
  * requested value is below ETHERMIN or above sc->vtnet_max_mtu. A running
  * interface is reinitialized only when the new MTU selects a different
  * receive cluster size.
  */
 static int
 vtnet_ioctl_mtu(struct vtnet_softc *sc, u_int mtu)
 {
 	if_t ifp = sc->vtnet_ifp;
 	int clustersz;
 
 	VTNET_CORE_LOCK_ASSERT(sc);
 
 	if (if_getmtu(ifp) == mtu)
 		return (0);
 	if (mtu < ETHERMIN || mtu > sc->vtnet_max_mtu)
 		return (EINVAL);
 
 	if_setmtu(ifp, mtu);
 
 	clustersz = vtnet_rx_cluster_size(sc, mtu);
 	if (clustersz == sc->vtnet_rx_clustersz)
 		return (0);
 	if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0)
 		return (0);
 
 	/* Clear the running flag and reinitialize for the new size. */
 	if_setdrvflagbits(ifp, 0, IFF_DRV_RUNNING);
 	vtnet_init_locked(sc, 0);
 
 	return (0);
 }
 
 /*
  * React to a change of the interface flags. The core lock must be held.
  *
  * An interface that is not IFF_UP is stopped if it was running, and one
  * that is up but not running is initialized. For a running interface, a
  * change of IFF_PROMISC or IFF_ALLMULTI is pushed to the device through
  * vtnet_rx_filter() when VTNET_FLAG_CTRL_RX is set. Without that flag an
  * IFF_ALLMULTI change fails with ENOTSUP and IFF_PROMISC is forced on.
  *
  * Returns 0, or ENOTSUP as described; on ENOTSUP the cached flags in
  * sc->vtnet_if_flags are not refreshed.
  */
 static int
 vtnet_ioctl_ifflags(struct vtnet_softc *sc)
 {
 	if_t ifp = sc->vtnet_ifp;
 	int changed, drv_running;
 
 	drv_running = (if_getdrvflags(ifp) & IFF_DRV_RUNNING) != 0;
 
 	VTNET_CORE_LOCK_ASSERT(sc);
 
 	if ((if_getflags(ifp) & IFF_UP) == 0) {
 		if (drv_running)
 			vtnet_stop(sc);
 	} else if (!drv_running) {
 		vtnet_init_locked(sc, 0);
 	} else {
 		changed = if_getflags(ifp) ^ sc->vtnet_if_flags;
 		if ((changed & (IFF_PROMISC | IFF_ALLMULTI)) != 0) {
 			if ((sc->vtnet_flags & VTNET_FLAG_CTRL_RX) != 0) {
 				vtnet_rx_filter(sc);
 			} else {
 				if ((changed & IFF_ALLMULTI) != 0)
 					return (ENOTSUP);
 				if_setflagbits(ifp, IFF_PROMISC, 0);
 			}
 		}
 	}
 
 	/* Remember the flags so the next call can detect what changed. */
 	sc->vtnet_if_flags = if_getflags(ifp);
 	return (0);
 }
 
 /*
  * Handle a multicast membership change by reprogramming the MAC filter.
  * This is done only when VTNET_FLAG_CTRL_RX is set and the interface is
  * running. The core lock must be held. Always returns 0.
  */
 static int
 vtnet_ioctl_multi(struct vtnet_softc *sc)
 {
 	if_t ifp = sc->vtnet_ifp;
 
 	VTNET_CORE_LOCK_ASSERT(sc);
 
 	if ((sc->vtnet_flags & VTNET_FLAG_CTRL_RX) == 0)
 		return (0);
 	if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) != 0)
 		vtnet_rx_filter_mac(sc);
 
 	return (0);
 }
 
 /*
  * Apply a capability change request (called for SIOCSIFCAP with the core
  * lock held). Each capability whose requested state differs from the
  * enabled state is toggled. Depending on which capabilities changed, a
  * running interface is then either fully reinitialized or only has its
  * Rx offloads updated.
  *
  * Always returns 0.
  */
 static int
 vtnet_ioctl_ifcap(struct vtnet_softc *sc, struct ifreq *ifr)
 {
 	if_t ifp;
 	int mask, reinit, update;
 
 	ifp = sc->vtnet_ifp;
 	/*
 	 * Bits set in mask are supported capabilities whose requested
 	 * state differs from the currently enabled state.
 	 */
 	mask = (ifr->ifr_reqcap & if_getcapabilities(ifp)) ^ if_getcapenable(ifp);
 	reinit = update = 0;
 
 	VTNET_CORE_LOCK_ASSERT(sc);
 
 	/* Tx offloads are toggled without any reinit or update. */
 	if (mask & IFCAP_TXCSUM)
 		if_togglecapenable(ifp, IFCAP_TXCSUM);
 	if (mask & IFCAP_TXCSUM_IPV6)
 		if_togglecapenable(ifp, IFCAP_TXCSUM_IPV6);
 	if (mask & IFCAP_TSO4)
 		if_togglecapenable(ifp, IFCAP_TSO4);
 	if (mask & IFCAP_TSO6)
 		if_togglecapenable(ifp, IFCAP_TSO6);
 
 	if (mask & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6 | IFCAP_LRO)) {
 		/*
 		 * These Rx features require the negotiated features to
 		 * be updated. Avoid a full reinit if possible.
 		 */
 		if (sc->vtnet_features & VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)
 			update = 1;
 		else
 			reinit = 1;
 
 		/* BMV: Avoid needless renegotiation for just software LRO. */
 		if ((mask & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6 | IFCAP_LRO)) ==
 		    IFCAP_LRO && vtnet_software_lro(sc))
 			reinit = update = 0;
 
 		if (mask & IFCAP_RXCSUM)
 			if_togglecapenable(ifp, IFCAP_RXCSUM);
 		if (mask & IFCAP_RXCSUM_IPV6)
 			if_togglecapenable(ifp, IFCAP_RXCSUM_IPV6);
 		if (mask & IFCAP_LRO)
 			if_togglecapenable(ifp, IFCAP_LRO);
 
 		/*
 		 * VirtIO does not distinguish between IPv4 and IPv6 checksums
 		 * so treat them as a pair. Guest TSO (LRO) requires receive
 		 * checksums.
 		 */
 		if (if_getcapenable(ifp) & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)) {
 			if_setcapenablebit(ifp, IFCAP_RXCSUM, 0);
 #ifdef notyet
 			if_setcapenablebit(ifp, IFCAP_RXCSUM_IPV6, 0);
 #endif
 		} else
 			if_setcapenablebit(ifp, 0,
 			    (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6 | IFCAP_LRO));
 	}
 
 	if (mask & IFCAP_VLAN_HWFILTER) {
 		/* These Rx features require renegotiation. */
 		reinit = 1;
 
 		/* NOTE(review): this inner test repeats the enclosing one. */
 		if (mask & IFCAP_VLAN_HWFILTER)
 			if_togglecapenable(ifp, IFCAP_VLAN_HWFILTER);
 	}
 
 	if (mask & IFCAP_VLAN_HWTSO)
 		if_togglecapenable(ifp, IFCAP_VLAN_HWTSO);
 	if (mask & IFCAP_VLAN_HWTAGGING)
 		if_togglecapenable(ifp, IFCAP_VLAN_HWTAGGING);
 
 	/* Only a running interface needs anything applied; reinit wins. */
 	if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
 		if (reinit) {
 			if_setdrvflagbits(ifp, 0, IFF_DRV_RUNNING);
 			vtnet_init_locked(sc, 0);
 		} else if (update)
 			vtnet_update_rx_offloads(sc);
 	}
 
 	return (0);
 }
 
 /*
  * Interface ioctl entry point.
  *
  * MTU, flag, multicast and capability requests are dispatched to their
  * handlers with the core lock held. Media requests go to ifmedia_ioctl()
  * and everything else to ether_ioctl(). Returns whatever the selected
  * handler returned.
  */
 static int
 vtnet_ioctl(if_t ifp, u_long cmd, caddr_t data)
 {
 	struct vtnet_softc *sc = if_getsoftc(ifp);
 	struct ifreq *ifr = (struct ifreq *) data;
 	int error = 0;
 
 	switch (cmd) {
 	case SIOCSIFMTU:
 	case SIOCSIFFLAGS:
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 	case SIOCSIFCAP:
 		/* All of these handlers assert that the core lock is held. */
 		VTNET_CORE_LOCK(sc);
 		if (cmd == SIOCSIFMTU)
 			error = vtnet_ioctl_mtu(sc, ifr->ifr_mtu);
 		else if (cmd == SIOCSIFFLAGS)
 			error = vtnet_ioctl_ifflags(sc);
 		else if (cmd == SIOCSIFCAP)
 			error = vtnet_ioctl_ifcap(sc, ifr);
 		else
 			error = vtnet_ioctl_multi(sc);
 		VTNET_CORE_UNLOCK(sc);
 
 		/* Run the VLAN capability hook after dropping the lock. */
 		if (cmd == SIOCSIFCAP) {
 			VLAN_CAPABILITIES(ifp);
 		}
 		break;
 
 	case SIOCSIFMEDIA:
 	case SIOCGIFMEDIA:
 		error = ifmedia_ioctl(ifp, ifr, &sc->vtnet_media, cmd);
 		break;
 
 	default:
 		error = ether_ioctl(ifp, cmd, data);
 		break;
 	}
 
 	VTNET_CORE_LOCK_ASSERT_NOTOWNED(sc);
 
 	return (error);
 }
 
 static int
 vtnet_rxq_populate(struct vtnet_rxq *rxq)
 {
 	struct virtqueue *vq;
 	int nbufs, error;
 
 #ifdef DEV_NETMAP
 	error = vtnet_netmap_rxq_populate(rxq);
 	if (error >= 0)
 		return (error);
 #endif  /* DEV_NETMAP */
 
 	vq = rxq->vtnrx_vq;
 	error = ENOSPC;
 
 	for (nbufs = 0; !virtqueue_full(vq); nbufs++) {
 		error = vtnet_rxq_new_buf(rxq);
 		if (error)
 			break;
 	}
 
 	if (nbufs > 0) {
 		virtqueue_notify(vq);
 		/*
 		 * EMSGSIZE signifies the virtqueue did not have enough
 		 * entries available to hold the last mbuf. This is not
 		 * an error.
 		 */
 		if (error == EMSGSIZE)
 			error = 0;
 	}
 
 	return (error);
 }
 
 /*
  * Drain every buffer still held by the Rx virtqueue. The drained mbufs
  * are freed unless a netmap kring is active for this queue (only possible
  * with DEV_NETMAP), in which case they are only removed from the queue.
  */
 static void
 vtnet_rxq_free_mbufs(struct vtnet_rxq *rxq)
 {
 	struct virtqueue *vq = rxq->vtnrx_vq;
 	struct mbuf *m;
 	int pos = 0;
 #ifdef DEV_NETMAP
 	struct netmap_kring *kring = netmap_kring_on(NA(rxq->vtnrx_sc->vtnet_ifp),
 							rxq->vtnrx_id, NR_RX);
 #else  /* !DEV_NETMAP */
 	void *kring = NULL;
 #endif /* !DEV_NETMAP */
 
 	for (m = virtqueue_drain(vq, &pos); m != NULL;
 	    m = virtqueue_drain(vq, &pos)) {
 		if (kring != NULL)
 			continue;
 		m_freem(m);
 	}
 
 	KASSERT(virtqueue_empty(vq),
 	    ("%s: mbufs remaining in rx queue %p", __func__, rxq));
 }
 
 /*
  * Allocate a chain of nbufs receive mbufs, each backed by a cluster of
  * sc->vtnet_rx_clustersz bytes and with m_len set to the full cluster
  * size. Only the first mbuf of the chain is given a packet header.
  *
  * On success the head of the chain is returned and, when m_tailp is not
  * NULL, the last mbuf of the chain is stored through it. If an allocation
  * fails, the mbuf_alloc_failed statistic is incremented, the partial
  * chain is freed, NULL is returned and *m_tailp is left untouched. When
  * nbufs is not positive, NULL is returned and *m_tailp is set to NULL.
  */
 static struct mbuf *
 vtnet_rx_alloc_buf(struct vtnet_softc *sc, int nbufs, struct mbuf **m_tailp)
 {
 	struct mbuf *m_head, *m_tail, *m;
 	int i, size;
 
 	/*
 	 * m_tail is stored through m_tailp even when the loop below never
 	 * runs, so it must start out initialized rather than indeterminate.
 	 */
 	m_head = m_tail = NULL;
 	size = sc->vtnet_rx_clustersz;
 
 	KASSERT(nbufs == 1 || sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG,
 	    ("%s: mbuf %d chain requested without LRO_NOMRG", __func__, nbufs));
 
 	for (i = 0; i < nbufs; i++) {
 		m = m_getjcl(M_NOWAIT, MT_DATA, i == 0 ? M_PKTHDR : 0, size);
 		if (m == NULL) {
 			sc->vtnet_stats.mbuf_alloc_failed++;
 			m_freem(m_head);
 			return (NULL);
 		}
 
 		/* Expose the whole cluster as the buffer length. */
 		m->m_len = size;
 		if (m_head != NULL) {
 			m_tail->m_next = m;
 			m_tail = m;
 		} else
 			m_head = m_tail = m;
 	}
 
 	if (m_tailp != NULL)
 		*m_tailp = m_tail;
 
 	return (m_head);
 }
 
 /*
  * Slow path for when LRO without mergeable buffers is negotiated.
  *
  * m0 is the dequeued mbuf chain and len0 the number of bytes received
  * into it. The mbufs that hold received data are trimmed to the received
  * length and replaced in the virtqueue by newly allocated ones; unused
  * mbufs at the end of m0 are moved onto the replacement chain.
  *
  * Returns 0 on success, EMSGSIZE if len0 exceeds the capacity of the
  * chain, ENOBUFS if the replacement cannot be allocated, or the error
  * from vtnet_rxq_enqueue_buf().
  */
 static int
 vtnet_rxq_replace_lro_nomrg_buf(struct vtnet_rxq *rxq, struct mbuf *m0,
     int len0)
 {
 	struct vtnet_softc *sc;
 	struct mbuf *m, *m_prev, *m_new, *m_tail;
 	int len, clustersz, nreplace, error;
 
 	sc = rxq->vtnrx_sc;
 	clustersz = sc->vtnet_rx_clustersz;
 
 	m_prev = NULL;
 	m_tail = NULL;
 	nreplace = 0;
 
 	m = m0;
 	len = len0;
 
 	/*
 	 * Since these mbuf chains are so large, avoid allocating a complete
 	 * replacement when the received frame did not consume the entire
 	 * chain. Unused mbufs are moved to the tail of the replacement mbuf.
 	 */
 	while (len > 0) {
 		if (m == NULL) {
 			sc->vtnet_stats.rx_frame_too_large++;
 			return (EMSGSIZE);
 		}
 
 		/*
 		 * Every mbuf should have the expected cluster size since that
 		 * is also used to allocate the replacements.
 		 */
 		KASSERT(m->m_len == clustersz,
 		    ("%s: mbuf size %d not expected cluster size %d", __func__,
 		    m->m_len, clustersz));
 
 		/* Trim this mbuf to the part of the frame it holds. */
 		m->m_len = MIN(m->m_len, len);
 		len -= m->m_len;
 
 		m_prev = m;
 		m = m->m_next;
 		nreplace++;
 	}
 
 	/* m_prev is now the last mbuf that contains received data. */
 	KASSERT(nreplace > 0 && nreplace <= sc->vtnet_rx_nmbufs,
 	    ("%s: invalid replacement mbuf count %d max %d", __func__,
 	    nreplace, sc->vtnet_rx_nmbufs));
 
 	m_new = vtnet_rx_alloc_buf(sc, nreplace, &m_tail);
 	if (m_new == NULL) {
 		/* Only the last trimmed mbuf has its length restored here. */
 		m_prev->m_len = clustersz;
 		return (ENOBUFS);
 	}
 
 	/*
 	 * Move any unused mbufs from the received mbuf chain onto the
 	 * end of the replacement chain.
 	 */
 	if (m_prev->m_next != NULL) {
 		m_tail->m_next = m_prev->m_next;
 		m_prev->m_next = NULL;
 	}
 
 	error = vtnet_rxq_enqueue_buf(rxq, m_new);
 	if (error) {
 		/*
 		 * The replacement is supposed to be a copy of the one
 		 * dequeued so this is a very unexpected error.
 		 *
 		 * Restore the m0 chain to the original state if it was
 		 * modified so we can then discard it.
 		 */
 		if (m_tail->m_next != NULL) {
 			m_prev->m_next = m_tail->m_next;
 			m_tail->m_next = NULL;
 		}
 		m_prev->m_len = clustersz;
 		sc->vtnet_stats.rx_enq_replacement_failed++;
 		m_freem(m_new);
 	}
 
 	return (error);
 }
 
 static int
 vtnet_rxq_replace_buf(struct vtnet_rxq *rxq, struct mbuf *m, int len)
 {
 	struct vtnet_softc *sc;
 	struct mbuf *m_new;
 	int error;
 
 	sc = rxq->vtnrx_sc;
 
 	if (sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG)
 		return (vtnet_rxq_replace_lro_nomrg_buf(rxq, m, len));
 
 	MPASS(m->m_next == NULL);
 	if (m->m_len < len)
 		return (EMSGSIZE);
 
 	m_new = vtnet_rx_alloc_buf(sc, 1, NULL);
 	if (m_new == NULL)
 		return (ENOBUFS);
 
 	error = vtnet_rxq_enqueue_buf(rxq, m_new);
 	if (error) {
 		sc->vtnet_stats.rx_enq_replacement_failed++;
 		m_freem(m_new);
 	} else
 		m->m_len = len;
 
 	return (error);
 }
 
 static int
 vtnet_rxq_enqueue_buf(struct vtnet_rxq *rxq, struct mbuf *m)
 {
 	struct vtnet_softc *sc;
 	struct sglist *sg;
 	int header_inlined, error;
 
 	sc = rxq->vtnrx_sc;
 	sg = rxq->vtnrx_sg;
 
 	KASSERT(m->m_next == NULL || sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG,
 	    ("%s: mbuf chain without LRO_NOMRG", __func__));
 	VTNET_RXQ_LOCK_ASSERT(rxq);
 
 	sglist_reset(sg);
 	header_inlined = vtnet_modern(sc) ||
 	    (sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS) != 0; /* TODO: ANY_LAYOUT */
 
 	if (header_inlined)
 		error = sglist_append_mbuf(sg, m);
 	else {
 		struct vtnet_rx_header *rxhdr =
 		    mtod(m, struct vtnet_rx_header *);
 		MPASS(sc->vtnet_hdr_size == sizeof(struct virtio_net_hdr));
 
 		/* Append the header and remaining mbuf data. */
 		error = sglist_append(sg, &rxhdr->vrh_hdr, sc->vtnet_hdr_size);
 		if (error)
 			return (error);
 		error = sglist_append(sg, &rxhdr[1],
 		    m->m_len - sizeof(struct vtnet_rx_header));
 		if (error)
 			return (error);
 
 		if (m->m_next != NULL)
 			error = sglist_append_mbuf(sg, m->m_next);
 	}
 
 	if (error)
 		return (error);
 
 	return (virtqueue_enqueue(rxq->vtnrx_vq, m, sg, 0, sg->sg_nseg));
 }
 
 static int
 vtnet_rxq_new_buf(struct vtnet_rxq *rxq)
 {
 	struct vtnet_softc *sc;
 	struct mbuf *m;
 	int error;
 
 	sc = rxq->vtnrx_sc;
 
 	m = vtnet_rx_alloc_buf(sc, sc->vtnet_rx_nmbufs, NULL);
 	if (m == NULL)
 		return (ENOBUFS);
 
 	error = vtnet_rxq_enqueue_buf(rxq, m);
 	if (error)
 		m_freem(m);
 
 	return (error);
 }
 
 /*
  * Handle a received frame whose VirtIO header has NEEDS_CSUM set.
  *
  * etype is the frame's ethertype in host order and hoff the offset of the
  * network header within m; hdr is the received VirtIO header. Without
  * VTNET_FLAG_FIXUP_NEEDS_CSUM the frame is treated like a DATA_VALID one.
  * Otherwise the checksum is computed here and written into the packet.
  *
  * Returns 0 on success and 1 when the checksum could not be handled (the
  * checksum field is not within the first mbuf, or the ethertype is not
  * supported).
  */
 static int
 vtnet_rxq_csum_needs_csum(struct vtnet_rxq *rxq, struct mbuf *m, uint16_t etype,
     int hoff, struct virtio_net_hdr *hdr)
 {
 	struct vtnet_softc *sc;
 	int error;
 
 	sc = rxq->vtnrx_sc;
 
 	/*
 	 * NEEDS_CSUM corresponds to Linux's CHECKSUM_PARTIAL, but FreeBSD does
 	 * not have an analogous CSUM flag. The checksum has been validated,
 	 * but is incomplete (TCP/UDP pseudo header).
 	 *
 	 * The packet is likely from another VM on the same host that itself
 	 * performed checksum offloading so Tx/Rx is basically a memcpy and
 	 * the checksum has little value.
 	 *
 	 * Default to receiving the packet as-is for performance reasons, but
 	 * this can cause issues if the packet is to be forwarded because it
 	 * does not contain a valid checksum. This patch may be helpful:
 	 * https://reviews.freebsd.org/D6611. In the meantime, have the driver
 	 * compute the checksum if requested.
 	 *
 	 * BMV: Need to add a CSUM_PARTIAL flag?
 	 */
 	if ((sc->vtnet_flags & VTNET_FLAG_FIXUP_NEEDS_CSUM) == 0) {
 		error = vtnet_rxq_csum_data_valid(rxq, m, etype, hoff, hdr);
 		return (error);
 	}
 
 	/*
 	 * Compute the checksum in the driver so the packet will contain a
 	 * valid checksum. The checksum is at csum_offset from csum_start.
 	 */
 	switch (etype) {
 #if defined(INET) || defined(INET6)
 	case ETHERTYPE_IP:
 	case ETHERTYPE_IPV6: {
 		int csum_off, csum_end;
 		uint16_t csum;
 
 		/* Byte range of the 16-bit checksum field within the frame. */
 		csum_off = hdr->csum_start + hdr->csum_offset;
 		csum_end = csum_off + sizeof(uint16_t);
 
 		/* Assume checksum will be in the first mbuf. */
 		if (m->m_len < csum_end || m->m_pkthdr.len < csum_end)
 			return (1);
 
 		/*
 		 * Like in_delayed_cksum()/in6_delayed_cksum(), compute the
 		 * checksum and write it at the specified offset. We could
 		 * try to verify the packet: csum_start should probably
 		 * correspond to the start of the TCP/UDP header.
 		 *
 		 * BMV: Need to properly handle UDP with zero checksum. Is
 		 * the IPv4 header checksum implicitly validated?
 		 */
 		csum = in_cksum_skip(m, m->m_pkthdr.len, hdr->csum_start);
 		*(uint16_t *)(mtodo(m, csum_off)) = csum;
 		/* Mark the checksum as verified for the stack. */
 		m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
 		m->m_pkthdr.csum_data = 0xFFFF;
 		break;
 	}
 #endif
 	default:
 		sc->vtnet_stats.rx_csum_bad_ethtype++;
 		return (1);
 	}
 
 	return (0);
 }
 
 /*
  * Mark a received frame as having a verified checksum when its transport
  * protocol is TCP or UDP.
  *
  * etype is the frame's ethertype in host order and hoff the offset of the
  * IP header within m. The protocol is read from the IPv4 header, or found
  * with ip6_lasthdr() for IPv6. Any other ethertype, or a first mbuf too
  * short to hold the IP header, leaves the mbuf unmarked. rxq and hdr are
  * only referenced by the disabled (#if 0) diagnostic code.
  *
  * Always returns 0.
  */
 static int
 vtnet_rxq_csum_data_valid(struct vtnet_rxq *rxq, struct mbuf *m,
     uint16_t etype, int hoff, struct virtio_net_hdr *hdr __unused)
 {
 #if 0
 	struct vtnet_softc *sc;
 #endif
 	int protocol;
 
 #if 0
 	sc = rxq->vtnrx_sc;
 #endif
 
 	/* Determine the transport protocol; IPPROTO_DONE means "unknown". */
 	switch (etype) {
 #if defined(INET)
 	case ETHERTYPE_IP:
 		if (__predict_false(m->m_len < hoff + sizeof(struct ip)))
 			protocol = IPPROTO_DONE;
 		else {
 			struct ip *ip = (struct ip *)(m->m_data + hoff);
 			protocol = ip->ip_p;
 		}
 		break;
 #endif
 #if defined(INET6)
 	case ETHERTYPE_IPV6:
 		if (__predict_false(m->m_len < hoff + sizeof(struct ip6_hdr))
 		    || ip6_lasthdr(m, hoff, IPPROTO_IPV6, &protocol) < 0)
 			protocol = IPPROTO_DONE;
 		break;
 #endif
 	default:
 		protocol = IPPROTO_DONE;
 		break;
 	}
 
 	switch (protocol) {
 	case IPPROTO_TCP:
 	case IPPROTO_UDP:
 		m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
 		m->m_pkthdr.csum_data = 0xFFFF;
 		break;
 	default:
 		/*
 		 * FreeBSD does not support checksum offloading of this
 		 * protocol. Let the stack re-verify the checksum later
 		 * if the protocol is supported.
 		 */
 #if 0
 		if_printf(sc->vtnet_ifp,
 		    "%s: checksum offload of unsupported protocol "
 		    "etype=%#x protocol=%d csum_start=%d csum_offset=%d\n",
 		    __func__, etype, protocol, hdr->csum_start,
 		    hdr->csum_offset);
 #endif
 		break;
 	}
 
 	return (0);
 }
 
 /*
  * Apply the checksum information of the VirtIO header to a received
  * frame. The ethertype and the offset of the network header are taken
  * from the Ethernet header, looking through a single 802.1Q tag, and the
  * frame is handed to the NEEDS_CSUM or DATA_VALID handler.
  *
  * Returns the result of the selected handler.
  */
 static int
 vtnet_rxq_csum(struct vtnet_rxq *rxq, struct mbuf *m,
     struct virtio_net_hdr *hdr)
 {
 	const struct ether_header *eh;
 	const struct ether_vlan_header *evh;
 	uint16_t etype;
 	int hoff;
 
 	eh = mtod(m, const struct ether_header *);
 	etype = ntohs(eh->ether_type);
 	hoff = sizeof(struct ether_header);
 
 	if (etype == ETHERTYPE_VLAN) {
 		/* TODO BMV: Handle QinQ. */
 		evh = mtod(m, const struct ether_vlan_header *);
 		etype = ntohs(evh->evl_proto);
 		hoff = sizeof(struct ether_vlan_header);
 	}
 
 	if ((hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) != 0)
 		return (vtnet_rxq_csum_needs_csum(rxq, m, etype, hoff, hdr));
 
 	/* VIRTIO_NET_HDR_F_DATA_VALID */
 	return (vtnet_rxq_csum_data_valid(rxq, m, etype, hoff, hdr));
 }
 
 /*
  * Dequeue and requeue up to nbufs - 1 further buffers of a merged frame
  * that is being dropped, stopping early if the virtqueue runs empty.
  */
 static void
 vtnet_rxq_discard_merged_bufs(struct vtnet_rxq *rxq, int nbufs)
 {
 	struct mbuf *m;
 
 	for (nbufs--; nbufs > 0; nbufs--) {
 		if ((m = virtqueue_dequeue(rxq->vtnrx_vq, NULL)) == NULL)
 			break;
 		vtnet_rxq_discard_buf(rxq, m);
 	}
 }
 
 static void
 vtnet_rxq_discard_buf(struct vtnet_rxq *rxq, struct mbuf *m)
 {
 	int error __diagused;
 
 	/*
 	 * Requeue the discarded mbuf. This should always be successful
 	 * since it was just dequeued.
 	 */
 	error = vtnet_rxq_enqueue_buf(rxq, m);
 	KASSERT(error == 0,
 	    ("%s: cannot requeue discarded mbuf %d", __func__, error));
 }
 
 /*
  * Collect the remaining nbufs - 1 buffers of a frame received with
  * mergeable Rx buffers and link them onto m_head, replacing each dequeued
  * buffer in the virtqueue as it is consumed.
  *
  * Returns 0 when the whole frame was assembled. Returns 1 when a buffer
  * is missing or cannot be replaced; in that case m_head is freed and the
  * rx_mergeable_failed statistic is incremented.
  */
 static int
 vtnet_rxq_merged_eof(struct vtnet_rxq *rxq, struct mbuf *m_head, int nbufs)
 {
 	struct vtnet_softc *sc = rxq->vtnrx_sc;
 	struct virtqueue *vq = rxq->vtnrx_vq;
 	struct mbuf *m, *m_tail;
 	uint32_t len;
 
 	m_tail = m_head;
 
 	for (nbufs--; nbufs > 0; nbufs--) {
 		m = virtqueue_dequeue(vq, &len);
 		if (m == NULL) {
 			rxq->vtnrx_stats.vrxs_ierrors++;
 			goto fail;
 		}
 
 		if (vtnet_rxq_new_buf(rxq) != 0) {
 			/* No replacement: requeue this and the later buffers. */
 			rxq->vtnrx_stats.vrxs_iqdrops++;
 			vtnet_rxq_discard_buf(rxq, m);
 			if (nbufs > 1)
 				vtnet_rxq_discard_merged_bufs(rxq, nbufs);
 			goto fail;
 		}
 
 		/* Never trust a length larger than the buffer itself. */
 		if (len > m->m_len)
 			len = m->m_len;
 
 		m->m_len = len;
 		m->m_flags &= ~M_PKTHDR;
 
 		m_head->m_pkthdr.len += len;
 		m_tail->m_next = m;
 		m_tail = m;
 	}
 
 	return (0);
 
 fail:
 	sc->vtnet_stats.rx_mergeable_failed++;
 	m_freem(m_head);
 
 	return (1);
 }
 
 #if defined(INET) || defined(INET6)
 /*
  * Hand a received mbuf to software LRO. When the LRO context has an mbuf
  * queue (lro_mbuf_max != 0) the mbuf is queued and 0 is returned;
  * otherwise the result of tcp_lro_rx() is returned.
  */
 static int
 vtnet_lro_rx(struct vtnet_rxq *rxq, struct mbuf *m)
 {
 	struct lro_ctrl *lro = &rxq->vtnrx_lro;
 
 	if (lro->lro_mbuf_max == 0)
 		return (tcp_lro_rx(lro, m, 0));
 
 	tcp_lro_queue_mbuf(lro, m);
 	return (0);
 }
 #endif
 
 /*
  * Finish processing a received frame and pass it up the stack.
  *
  * hdr is the caller's host-endian copy of the VirtIO header. The 802.1Q
  * tag is stripped when VLAN hardware tagging is enabled, the flow id and
  * checksum state are filled in, the Rx statistics are updated and the
  * frame is given to software LRO or if_input().
  */
 static void
 vtnet_rxq_input(struct vtnet_rxq *rxq, struct mbuf *m,
     struct virtio_net_hdr *hdr)
 {
 	struct vtnet_softc *sc = rxq->vtnrx_sc;
 	if_t ifp = sc->vtnet_ifp;
 	struct ether_header *eh;
 	int gso;
 
 	if ((if_getcapenable(ifp) & IFCAP_VLAN_HWTAGGING) != 0) {
 		eh = mtod(m, struct ether_header *);
 		if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
 			vtnet_vlan_tag_remove(m);
 			/*
 			 * With the 802.1Q header removed, update the
 			 * checksum starting location accordingly.
 			 */
 			if ((hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) != 0)
 				hdr->csum_start -= ETHER_VLAN_ENCAP_LEN;
 		}
 	}
 
 	m->m_pkthdr.flowid = rxq->vtnrx_id;
 	M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
 
 	if ((hdr->flags & (VIRTIO_NET_HDR_F_NEEDS_CSUM |
 	    VIRTIO_NET_HDR_F_DATA_VALID)) != 0) {
 		if (vtnet_rxq_csum(rxq, m, hdr) != 0)
 			rxq->vtnrx_stats.vrxs_csum_failed++;
 		else
 			rxq->vtnrx_stats.vrxs_csum++;
 	}
 
 	/* A non-zero gso_size on a TCP frame means the host merged segments. */
 	if (hdr->gso_size != 0) {
 		gso = hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN;
 		if (gso == VIRTIO_NET_HDR_GSO_TCPV4 ||
 		    gso == VIRTIO_NET_HDR_GSO_TCPV6) {
 			m->m_pkthdr.lro_nsegs =
 			    howmany(m->m_pkthdr.len, hdr->gso_size);
 			rxq->vtnrx_stats.vrxs_host_lro++;
 		}
 	}
 
 	rxq->vtnrx_stats.vrxs_ipackets++;
 	rxq->vtnrx_stats.vrxs_ibytes += m->m_pkthdr.len;
 
 #if defined(INET) || defined(INET6)
 	/* Software LRO consumes the mbuf when it returns 0. */
 	if (vtnet_software_lro(sc) && (if_getcapenable(ifp) & IFCAP_LRO) &&
 	    vtnet_lro_rx(rxq, m) == 0)
 		return;
 #endif
 
 	if_input(ifp, m);
 }
 
 /*
  * Receive loop: dequeue up to sc->vtnet_rx_process_limit frames from the
  * Rx virtqueue and pass them to vtnet_rxq_input(). Every dequeued buffer
  * is replaced in the virtqueue before its frame is passed on; a frame
  * that is too short, or whose buffer cannot be replaced, is requeued and
  * dropped. The Rx queue lock must be held.
  *
  * Returns 0 if the loop ended with budget remaining (count > 0) and
  * EAGAIN otherwise.
  */
 static int
 vtnet_rxq_eof(struct vtnet_rxq *rxq)
 {
 	struct virtio_net_hdr lhdr, *hdr;
 	struct vtnet_softc *sc;
 	if_t ifp;
 	struct virtqueue *vq;
 	int deq, count;
 
 	sc = rxq->vtnrx_sc;
 	vq = rxq->vtnrx_vq;
 	ifp = sc->vtnet_ifp;
 	deq = 0;
 	count = sc->vtnet_rx_process_limit;
 
 	VTNET_RXQ_LOCK_ASSERT(rxq);
 
 	while (count-- > 0) {
 		struct mbuf *m;
 		uint32_t len, nbufs, adjsz;
 
 		m = virtqueue_dequeue(vq, &len);
 		if (m == NULL)
 			break;
 		deq++;
 
 		/* Too short for a VirtIO header plus an Ethernet header. */
 		if (len < sc->vtnet_hdr_size + ETHER_HDR_LEN) {
 			rxq->vtnrx_stats.vrxs_ierrors++;
 			vtnet_rxq_discard_buf(rxq, m);
 			continue;
 		}
 
 		/*
 		 * Work out how many buffers make up this frame (nbufs) and
 		 * how many leading bytes to strip before the Ethernet
 		 * frame (adjsz).
 		 */
 		if (sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS) {
 			struct virtio_net_hdr_mrg_rxbuf *mhdr =
 			    mtod(m, struct virtio_net_hdr_mrg_rxbuf *);
 			kmsan_mark(mhdr, sizeof(*mhdr), KMSAN_STATE_INITED);
 			nbufs = vtnet_htog16(sc, mhdr->num_buffers);
 			adjsz = sizeof(struct virtio_net_hdr_mrg_rxbuf);
 		} else if (vtnet_modern(sc)) {
 			nbufs = 1; /* num_buffers is always 1 */
 			adjsz = sizeof(struct virtio_net_hdr_v1);
 		} else {
 			nbufs = 1;
 			adjsz = sizeof(struct vtnet_rx_header);
 			/*
 			 * Account for our gap between the header and start of
 			 * data to keep the segments separated.
 			 */
 			len += VTNET_RX_HEADER_PAD;
 		}
 
 		/* Replace the buffer; on failure requeue and drop the frame. */
 		if (vtnet_rxq_replace_buf(rxq, m, len) != 0) {
 			rxq->vtnrx_stats.vrxs_iqdrops++;
 			vtnet_rxq_discard_buf(rxq, m);
 			if (nbufs > 1)
 				vtnet_rxq_discard_merged_bufs(rxq, nbufs);
 			continue;
 		}
 
 		m->m_pkthdr.len = len;
 		m->m_pkthdr.rcvif = ifp;
 		m->m_pkthdr.csum_flags = 0;
 
 		if (nbufs > 1) {
 			/* Dequeue the rest of chain. */
 			if (vtnet_rxq_merged_eof(rxq, m, nbufs) != 0)
 				continue;
 		}
 
 		kmsan_mark_mbuf(m, KMSAN_STATE_INITED);
 
 		/*
 		 * Save an endian swapped version of the header prior to it
 		 * being stripped. The header is always at the start of the
 		 * mbuf data. num_buffers was already saved (and not needed)
 		 * so use the standard header.
 		 */
 		hdr = mtod(m, struct virtio_net_hdr *);
 		lhdr.flags = hdr->flags;
 		lhdr.gso_type = hdr->gso_type;
 		lhdr.hdr_len = vtnet_htog16(sc, hdr->hdr_len);
 		lhdr.gso_size = vtnet_htog16(sc, hdr->gso_size);
 		lhdr.csum_start = vtnet_htog16(sc, hdr->csum_start);
 		lhdr.csum_offset = vtnet_htog16(sc, hdr->csum_offset);
 		m_adj(m, adjsz);
 
 		/* Run inbound packet filters; skip frames they drop or keep. */
 		if (PFIL_HOOKED_IN(sc->vtnet_pfil)) {
 			pfil_return_t pfil;
 
 			pfil = pfil_mbuf_in(sc->vtnet_pfil, &m, ifp, NULL);
 			switch (pfil) {
 			case PFIL_DROPPED:
 			case PFIL_CONSUMED:
 				continue;
 			default:
 				KASSERT(pfil == PFIL_PASS,
 				    ("Filter returned %d!", pfil));
 			}
 		}
 
 		vtnet_rxq_input(rxq, m, &lhdr);
 	}
 
 	/* Flush software LRO and notify the host once per batch. */
 	if (deq > 0) {
 #if defined(INET) || defined(INET6)
 		if (vtnet_software_lro(sc))
 			tcp_lro_flush_all(&rxq->vtnrx_lro);
 #endif
 		virtqueue_notify(vq);
 	}
 
 	return (count > 0 ? 0 : EAGAIN);
 }
 
 /*
  * Common Rx processing for the interrupt handler and the taskqueue
  * handler.
  *
  * Takes the Rx queue lock and runs vtnet_rxq_eof(). While there is more
  * work, or re-enabling the interrupt reports a non-zero result, the
  * processing is retried up to "tries" more times; after that the
  * remaining work is deferred to the queue's taskqueue. The lock is
  * released on every return path.
  */
 static void
 vtnet_rx_vq_process(struct vtnet_rxq *rxq, int tries)
 {
 	struct vtnet_softc *sc;
 	if_t ifp;
 	u_int more;
 #ifdef DEV_NETMAP
 	int nmirq;
 #endif /* DEV_NETMAP */
 
 	sc = rxq->vtnrx_sc;
 	ifp = sc->vtnet_ifp;
 
 	if (__predict_false(rxq->vtnrx_id >= sc->vtnet_act_vq_pairs)) {
 		/*
 		 * Ignore this interrupt. Either this is a spurious interrupt
 		 * or multiqueue without per-VQ MSIX so every queue needs to
 		 * be polled (a brain dead configuration we could try harder
 		 * to avoid).
 		 */
 		vtnet_rxq_disable_intr(rxq);
 		return;
 	}
 
 	VTNET_RXQ_LOCK(rxq);
 
 #ifdef DEV_NETMAP
 	/*
 	 * We call netmap_rx_irq() under lock to prevent concurrent calls.
 	 * This is not necessary to serialize the access to the RX vq, but
 	 * rather to avoid races that may happen if this interface is
 	 * attached to a VALE switch, which would cause received packets
 	 * to stall in the RX queue (nm_kr_tryget() could find the kring
 	 * busy when called from netmap_bwrap_intr_notify()).
 	 */
 	nmirq = netmap_rx_irq(ifp, rxq->vtnrx_id, &more);
 	if (nmirq != NM_IRQ_PASS) {
 		VTNET_RXQ_UNLOCK(rxq);
 		if (nmirq == NM_IRQ_RESCHED) {
 			taskqueue_enqueue(rxq->vtnrx_tq, &rxq->vtnrx_intrtask);
 		}
 		return;
 	}
 #endif /* DEV_NETMAP */
 
 again:
 	/* Nothing to do once the interface is no longer running. */
 	if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) {
 		VTNET_RXQ_UNLOCK(rxq);
 		return;
 	}
 
 	more = vtnet_rxq_eof(rxq);
 	if (more || vtnet_rxq_enable_intr(rxq) != 0) {
 		/* The interrupt was only re-enabled when !more; undo that. */
 		if (!more)
 			vtnet_rxq_disable_intr(rxq);
 		/*
 		 * This is an occasional condition or race (when !more),
 		 * so retry a few times before scheduling the taskqueue.
 		 */
 		if (tries-- > 0)
 			goto again;
 
 		rxq->vtnrx_stats.vrxs_rescheduled++;
 		VTNET_RXQ_UNLOCK(rxq);
 		taskqueue_enqueue(rxq->vtnrx_tq, &rxq->vtnrx_intrtask);
 	} else
 		VTNET_RXQ_UNLOCK(rxq);
 }
 
 /*
  * Rx virtqueue interrupt handler; xrxq is the struct vtnet_rxq the
  * handler was registered with. Processes the queue, allowing
  * VTNET_INTR_DISABLE_RETRIES retries before deferring to the taskqueue.
  */
 static void
 vtnet_rx_vq_intr(void *xrxq)
 {
 	struct vtnet_rxq *rxq = xrxq;
 
 	vtnet_rx_vq_process(rxq, VTNET_INTR_DISABLE_RETRIES);
 }
 
 /*
  * Taskqueue handler for deferred Rx work; xrxq is the struct vtnet_rxq.
  * Processes the queue with no retries. The pending count is unused.
  */
 static void
 vtnet_rxq_tq_intr(void *xrxq, int pending __unused)
 {
 	struct vtnet_rxq *rxq = xrxq;
 
 	vtnet_rx_vq_process(rxq, 0);
 }
 
 /*
  * Compute the free-descriptor threshold used for Tx interrupts: a
  * quarter of the Tx virtqueue size, raised to sc->vtnet_tx_nsegs when
  * indirect descriptors are not in use.
  */
 static int
 vtnet_txq_intr_threshold(struct vtnet_txq *txq)
 {
 	struct vtnet_softc *sc = txq->vtntx_sc;
 	int threshold;
 
 	/*
 	 * The Tx interrupt is disabled until the queue free count falls
 	 * below our threshold. Completed frames are drained from the Tx
 	 * virtqueue before transmitting new frames and in the watchdog
 	 * callout, so the frequency of Tx interrupts is greatly reduced,
 	 * at the cost of not freeing mbufs as quickly as they otherwise
 	 * would be.
 	 */
 	threshold = virtqueue_size(txq->vtntx_vq) / 4;
 
 	/*
 	 * Without indirect descriptors, leave enough room for the most
 	 * segments we handle.
 	 */
 	if ((sc->vtnet_flags & VTNET_FLAG_INDIRECT) == 0) {
 		if (threshold < sc->vtnet_tx_nsegs)
 			threshold = sc->vtnet_tx_nsegs;
 	}
 
 	return (threshold);
 }
 
 /*
  * Return non-zero when the number of free Tx virtqueue descriptors is at
  * or below the queue's interrupt threshold.
  */
 static int
 vtnet_txq_below_threshold(struct vtnet_txq *txq)
 {
 
 	return (virtqueue_nfree(txq->vtntx_vq) <= txq->vtntx_intr_threshold);
 }
 
 /*
  * Notify the host of newly queued Tx frames, arm the Tx watchdog and
  * re-enable the Tx interrupt.
  *
  * Returns 1 when completed frames were drained and the queue is no longer
  * below the threshold, so the caller should continue transmitting; the
  * Tx interrupt is disabled again in that case. Returns 0 otherwise.
  */
 static int
 vtnet_txq_notify(struct vtnet_txq *txq)
 {
 	struct virtqueue *vq = txq->vtntx_vq;
 
 	txq->vtntx_watchdog = VTNET_TX_TIMEOUT;
 	virtqueue_notify(vq);
 
 	/*
 	 * If enabling the interrupt reports a non-zero result, drain frames
 	 * that were completed since last checked. If this causes the queue
 	 * to go above the threshold, the caller should continue
 	 * transmitting.
 	 */
 	if (vtnet_txq_enable_intr(txq) != 0 &&
 	    vtnet_txq_eof(txq) != 0 &&
 	    vtnet_txq_below_threshold(txq) == 0) {
 		virtqueue_disable_intr(vq);
 		return (1);
 	}
 
 	return (0);
 }
 
 /*
  * Drain every Tx header still held by the Tx virtqueue. Each header and
  * the mbuf it refers to are freed unless a netmap kring is active for
  * this queue (only possible with DEV_NETMAP).
  */
 static void
 vtnet_txq_free_mbufs(struct vtnet_txq *txq)
 {
 	struct virtqueue *vq = txq->vtntx_vq;
 	struct vtnet_tx_header *txhdr;
 	int pos = 0;
 #ifdef DEV_NETMAP
 	struct netmap_kring *kring = netmap_kring_on(NA(txq->vtntx_sc->vtnet_ifp),
 							txq->vtntx_id, NR_TX);
 #else  /* !DEV_NETMAP */
 	void *kring = NULL;
 #endif /* !DEV_NETMAP */
 
 	for (txhdr = virtqueue_drain(vq, &pos); txhdr != NULL;
 	    txhdr = virtqueue_drain(vq, &pos)) {
 		if (kring != NULL)
 			continue;
 		m_freem(txhdr->vth_mbuf);
 		uma_zfree(vtnet_tx_header_zone, txhdr);
 	}
 
 	KASSERT(virtqueue_empty(vq),
 	    ("%s: mbufs remaining in tx queue %p", __func__, txq));
 }
 
 /*
  * Parse the Ethernet and IP headers of an outgoing frame to find what
  * the Tx offload code needs:
  *   *etype - ethertype in host byte order (inner type for 802.1Q frames)
  *   *proto - IP protocol number
  *   *start - offset of the header following the IP header(s), measured
  *            from the start of the frame
  *
  * Returns 0 on success. For an ethertype other than IPv4/IPv6 the
  * tx_csum_unknown_ethtype statistic is incremented and EINVAL returned.
  *
  * BMV: This can go away once we finally have offsets in the mbuf header.
  */
 static int
 vtnet_txq_offload_ctx(struct vtnet_txq *txq, struct mbuf *m, int *etype,
     int *proto, int *start)
 {
 	struct vtnet_softc *sc;
 	struct ether_vlan_header *evh;
 #if defined(INET) || defined(INET6)
 	int offset;
 #endif
 
 	sc = txq->vtntx_sc;
 
 	/* offset is the length of the Ethernet header, with or without tag. */
 	evh = mtod(m, struct ether_vlan_header *);
 	if (evh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
 		/* BMV: We should handle nested VLAN tags too. */
 		*etype = ntohs(evh->evl_proto);
 #if defined(INET) || defined(INET6)
 		offset = sizeof(struct ether_vlan_header);
 #endif
 	} else {
 		*etype = ntohs(evh->evl_encap_proto);
 #if defined(INET) || defined(INET6)
 		offset = sizeof(struct ether_header);
 #endif
 	}
 
 	switch (*etype) {
 #if defined(INET)
 	case ETHERTYPE_IP: {
 		struct ip *ip, iphdr;
 		/* Copy the header out if the first mbuf does not hold it all. */
 		if (__predict_false(m->m_len < offset + sizeof(struct ip))) {
 			m_copydata(m, offset, sizeof(struct ip),
 			    (caddr_t) &iphdr);
 			ip = &iphdr;
 		} else
 			ip = (struct ip *)(m->m_data + offset);
 		*proto = ip->ip_p;
 		/* ip_hl counts 32-bit words. */
 		*start = offset + (ip->ip_hl << 2);
 		break;
 	}
 #endif
 #if defined(INET6)
 	case ETHERTYPE_IPV6:
 		*proto = -1;
 		/* NOTE(review): presumably skips extension headers - confirm. */
 		*start = ip6_lasthdr(m, offset, IPPROTO_IPV6, proto);
 		/* Assert the network stack sent us a valid packet. */
 		KASSERT(*start > offset,
 		    ("%s: mbuf %p start %d offset %d proto %d", __func__, m,
 		    *start, offset, *proto));
 		break;
 #endif
 	default:
 		sc->vtnet_stats.tx_csum_unknown_ethtype++;
 		return (EINVAL);
 	}
 
 	return (0);
 }
 
 static int
 vtnet_txq_offload_tso(struct vtnet_txq *txq, struct mbuf *m, int eth_type,
     int offset, struct virtio_net_hdr *hdr)
 {
 	static struct timeval lastecn;
 	static int curecn;
 	struct vtnet_softc *sc;
 	struct tcphdr *tcp, tcphdr;
 
 	sc = txq->vtntx_sc;
 
 	if (__predict_false(m->m_len < offset + sizeof(struct tcphdr))) {
 		m_copydata(m, offset, sizeof(struct tcphdr), (caddr_t) &tcphdr);
 		tcp = &tcphdr;
 	} else
 		tcp = (struct tcphdr *)(m->m_data + offset);
 
 	hdr->hdr_len = vtnet_gtoh16(sc, offset + (tcp->th_off << 2));
 	hdr->gso_size = vtnet_gtoh16(sc, m->m_pkthdr.tso_segsz);
 	hdr->gso_type = eth_type == ETHERTYPE_IP ? VIRTIO_NET_HDR_GSO_TCPV4 :
 	    VIRTIO_NET_HDR_GSO_TCPV6;
 
 	if (__predict_false(tcp->th_flags & TH_CWR)) {
 		/*
 		 * Drop if VIRTIO_NET_F_HOST_ECN was not negotiated. In
 		 * FreeBSD, ECN support is not on a per-interface basis,
 		 * but globally via the net.inet.tcp.ecn.enable sysctl
 		 * knob. The default is off.
 		 */
 		if ((sc->vtnet_flags & VTNET_FLAG_TSO_ECN) == 0) {
 			if (ppsratecheck(&lastecn, &curecn, 1))
 				if_printf(sc->vtnet_ifp,
 				    "TSO with ECN not negotiated with host\n");
 			return (ENOTSUP);
 		}
 		hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN;
 	}
 
 	txq->vtntx_stats.vtxs_tso++;
 
 	return (0);
 }
 
 /*
  * Translate the mbuf's checksum/TSO offload requests (csum_flags) into
  * the virtio-net header 'hdr'.
  *
  * Returns the mbuf on success. On any failure - the frame cannot be
  * parsed, the offload flags do not match the parsed EtherType, TSO is
  * requested for a non-TCP packet or without checksum offload, or TSO
  * setup fails - the mbuf is freed and NULL is returned.
  */
 static struct mbuf *
 vtnet_txq_offload(struct vtnet_txq *txq, struct mbuf *m,
     struct virtio_net_hdr *hdr)
 {
 	struct vtnet_softc *sc;
 	int flags, etype, csum_start, proto, error;
 
 	sc = txq->vtntx_sc;
 	flags = m->m_pkthdr.csum_flags;
 
 	error = vtnet_txq_offload_ctx(txq, m, &etype, &proto, &csum_start);
 	if (error)
 		goto drop;
 
 	if (flags & (VTNET_CSUM_OFFLOAD | VTNET_CSUM_OFFLOAD_IPV6)) {
 		/* Sanity check the parsed mbuf matches the offload flags. */
 		if (__predict_false((flags & VTNET_CSUM_OFFLOAD &&
 		    etype != ETHERTYPE_IP) || (flags & VTNET_CSUM_OFFLOAD_IPV6
 		    && etype != ETHERTYPE_IPV6))) {
 			sc->vtnet_stats.tx_csum_proto_mismatch++;
 			goto drop;
 		}
 
 		/* csum_data is passed through as the virtio csum_offset. */
 		hdr->flags |= VIRTIO_NET_HDR_F_NEEDS_CSUM;
 		hdr->csum_start = vtnet_gtoh16(sc, csum_start);
 		hdr->csum_offset = vtnet_gtoh16(sc, m->m_pkthdr.csum_data);
 		txq->vtntx_stats.vtxs_csum++;
 	}
 
 	if (flags & (CSUM_IP_TSO | CSUM_IP6_TSO)) {
 		/*
 		 * Sanity check the parsed mbuf IP protocol is TCP, and
 		 * VirtIO TSO requires the checksum offloading above.
 		 */
 		if (__predict_false(proto != IPPROTO_TCP)) {
 			sc->vtnet_stats.tx_tso_not_tcp++;
 			goto drop;
 		} else if (__predict_false((hdr->flags &
 		    VIRTIO_NET_HDR_F_NEEDS_CSUM) == 0)) {
 			sc->vtnet_stats.tx_tso_without_csum++;
 			goto drop;
 		}
 
 		/* csum_start is also the offset of the TCP header. */
 		error = vtnet_txq_offload_tso(txq, m, etype, csum_start, hdr);
 		if (error)
 			goto drop;
 	}
 
 	return (m);
 
 drop:
 	/* The frame is consumed on every failure path. */
 	m_freem(m);
 	return (NULL);
 }
 
 /*
  * Build the scatter/gather list for one frame - the virtio-net header
  * first, then the mbuf chain - and enqueue it on the Tx virtqueue with
  * 'txhdr' as the cookie.
  *
  * If the chain cannot be appended to the sglist it is defragmented once
  * with m_defrag() and *m_head is updated to the new chain. On the normal
  * path the virtqueue_enqueue() result is returned and the mbuf is left
  * alone, even when the enqueue itself failed. On the 'fail' path the mbuf
  * is freed, *m_head is set to NULL and ENOBUFS is returned.
  */
 static int
 vtnet_txq_enqueue_buf(struct vtnet_txq *txq, struct mbuf **m_head,
     struct vtnet_tx_header *txhdr)
 {
 	struct vtnet_softc *sc;
 	struct virtqueue *vq;
 	struct sglist *sg;
 	struct mbuf *m;
 	int error;
 
 	sc = txq->vtntx_sc;
 	vq = txq->vtntx_vq;
 	sg = txq->vtntx_sg;
 	m = *m_head;
 
 	/* The header must land in exactly one segment. */
 	sglist_reset(sg);
 	error = sglist_append(sg, &txhdr->vth_uhdr, sc->vtnet_hdr_size);
 	if (error != 0 || sg->sg_nseg != 1) {
 		KASSERT(0, ("%s: cannot add header to sglist error %d nseg %d",
 		    __func__, error, sg->sg_nseg));
 		goto fail;
 	}
 
 	error = sglist_append_mbuf(sg, m);
 	if (error) {
 		/* Retry once with a defragmented chain. */
 		m = m_defrag(m, M_NOWAIT);
 		if (m == NULL)
 			goto fail;
 
 		*m_head = m;
 		sc->vtnet_stats.tx_defragged++;
 
 		error = sglist_append_mbuf(sg, m);
 		if (error)
 			goto fail;
 	}
 
 	/* Remember the chain so it can be freed on completion. */
 	txhdr->vth_mbuf = m;
 	error = virtqueue_enqueue(vq, txhdr, sg, sg->sg_nseg, 0);
 
 	return (error);
 
 fail:
 	/*
 	 * Also reached when the header append fails, so tx_defrag_failed
 	 * counts that case as well.
 	 */
 	sc->vtnet_stats.tx_defrag_failed++;
 	m_freem(*m_head);
 	*m_head = NULL;
 
 	return (ENOBUFS);
 }
 
 /*
  * Prepare one outbound frame for the Tx virtqueue: allocate a tx header,
  * insert the VLAN tag into the frame when the mbuf carries M_VLANTAG,
  * fill in the offload fields when requested, and enqueue the result.
  *
  * 'flags' is passed to uma_zalloc() with M_ZERO added. Returns 0 on
  * success. On error the tx header is released (except for ENOMEM, where
  * none was allocated); *m_head is NULL if the frame was consumed along
  * the way, otherwise it still points at the frame so the caller may
  * requeue it.
  */
 static int
 vtnet_txq_encap(struct vtnet_txq *txq, struct mbuf **m_head, int flags)
 {
 	struct vtnet_tx_header *txhdr;
 	struct virtio_net_hdr *hdr;
 	struct mbuf *m;
 	int error;
 
 	m = *m_head;
 	M_ASSERTPKTHDR(m);
 
 	txhdr = uma_zalloc(vtnet_tx_header_zone, flags | M_ZERO);
 	if (txhdr == NULL) {
 		m_freem(m);
 		*m_head = NULL;
 		return (ENOMEM);
 	}
 
 	/*
 	 * Always use the non-mergeable header, regardless if mergeable headers
 	 * were negotiated, because for transmit num_buffers is always zero.
 	 * The vtnet_hdr_size is used to enqueue the right header size segment.
 	 */
 	hdr = &txhdr->vth_uhdr.hdr;
 
 	if (m->m_flags & M_VLANTAG) {
 		/* A NULL result is propagated to the caller via *m_head. */
 		m = ether_vlanencap(m, m->m_pkthdr.ether_vtag);
 		if ((*m_head = m) == NULL) {
 			error = ENOBUFS;
 			goto fail;
 		}
 		m->m_flags &= ~M_VLANTAG;
 	}
 
 	if (m->m_pkthdr.csum_flags & VTNET_CSUM_ALL_OFFLOAD) {
 		/* vtnet_txq_offload() frees the mbuf when it returns NULL. */
 		m = vtnet_txq_offload(txq, m, hdr);
 		if ((*m_head = m) == NULL) {
 			error = ENOBUFS;
 			goto fail;
 		}
 	}
 
 	error = vtnet_txq_enqueue_buf(txq, m_head, txhdr);
 fail:
 	/* Reached on success too; the header is only released on error. */
 	if (error)
 		uma_zfree(vtnet_tx_header_zone, txhdr);
 
 	return (error);
 }
 
 #ifdef VTNET_LEGACY_TX
 
 /*
  * Legacy transmit path (VTNET_LEGACY_TX): move frames from the interface
  * send queue onto the Tx virtqueue. The Tx queue lock must be held.
  *
  * Does nothing unless the interface is running and the link is active.
  * After a batch is enqueued the host is notified; if vtnet_txq_notify()
  * reports that more room opened up the loop is retried up to
  * VTNET_NOTIFY_RETRIES times, after which the work is handed to the
  * queue's interrupt task.
  */
 static void
 vtnet_start_locked(struct vtnet_txq *txq, if_t ifp)
 {
 	struct vtnet_softc *sc;
 	struct virtqueue *vq;
 	struct mbuf *m0;
 	int tries, enq;
 
 	sc = txq->vtntx_sc;
 	vq = txq->vtntx_vq;
 	tries = 0;
 
 	VTNET_TXQ_LOCK_ASSERT(txq);
 
 	if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0 ||
 	    sc->vtnet_link_active == 0)
 		return;
 
 	/* Reclaim completed frames first to free up descriptors. */
 	vtnet_txq_eof(txq);
 
 again:
 	enq = 0;
 
 	while (!if_sendq_empty(ifp)) {
 		if (virtqueue_full(vq))
 			break;
 
 		m0 = if_dequeue(ifp);
 		if (m0 == NULL)
 			break;
 
 		if (vtnet_txq_encap(txq, &m0, M_NOWAIT) != 0) {
 			/* Requeue the frame unless encap consumed it. */
 			if (m0 != NULL)
 				if_sendq_prepend(ifp, m0);
 			break;
 		}
 
 		enq++;
 		ETHER_BPF_MTAP(ifp, m0);
 	}
 
 	/* Only kick the host when this pass actually enqueued something. */
 	if (enq > 0 && vtnet_txq_notify(txq) != 0) {
 		if (tries++ < VTNET_NOTIFY_RETRIES)
 			goto again;
 
 		txq->vtntx_stats.vtxs_rescheduled++;
 		taskqueue_enqueue(txq->vtntx_tq, &txq->vtntx_intrtask);
 	}
 }
 
 /*
  * Legacy transmit entry point: run vtnet_start_locked() on Tx queue 0
  * with that queue's lock held.
  */
 static void
 vtnet_start(if_t ifp)
 {
 	struct vtnet_softc *sc = if_getsoftc(ifp);
 	struct vtnet_txq *txq = &sc->vtnet_txqs[0];
 
 	VTNET_TXQ_LOCK(txq);
 	vtnet_start_locked(txq, ifp);
 	VTNET_TXQ_UNLOCK(txq);
 }
 
 #else /* !VTNET_LEGACY_TX */
 
 /*
  * Multiqueue transmit path. If 'm' is non-NULL it is first appended to
  * the queue's buf_ring; then, provided the interface is running and the
  * link is active, frames are moved from the ring onto the Tx virtqueue.
  * The Tx queue lock must be held.
  *
  * Returns the drbr_enqueue() error when 'm' could not be queued, and 0
  * otherwise (including when frames remain in the ring).
  */
 static int
 vtnet_txq_mq_start_locked(struct vtnet_txq *txq, struct mbuf *m)
 {
 	struct vtnet_softc *sc;
 	struct virtqueue *vq;
 	struct buf_ring *br;
 	if_t ifp;
 	int enq, tries, error;
 
 	sc = txq->vtntx_sc;
 	vq = txq->vtntx_vq;
 	br = txq->vtntx_br;
 	ifp = sc->vtnet_ifp;
 	tries = 0;
 	error = 0;
 
 	VTNET_TXQ_LOCK_ASSERT(txq);
 
 	/* Not able to transmit now: just queue the frame, if any. */
 	if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0 ||
 	    sc->vtnet_link_active == 0) {
 		if (m != NULL)
 			error = drbr_enqueue(ifp, br, m);
 		return (error);
 	}
 
 	if (m != NULL) {
 		error = drbr_enqueue(ifp, br, m);
 		if (error)
 			return (error);
 	}
 
 	/* Reclaim completed frames first to free up descriptors. */
 	vtnet_txq_eof(txq);
 
 again:
 	enq = 0;
 
 	while ((m = drbr_peek(ifp, br)) != NULL) {
 		if (virtqueue_full(vq)) {
 			drbr_putback(ifp, br, m);
 			break;
 		}
 
 		if (vtnet_txq_encap(txq, &m, M_NOWAIT) != 0) {
 			/*
 			 * Encap may have replaced or consumed the frame:
 			 * put back whatever is left, or drop the ring
 			 * slot if nothing is.
 			 */
 			if (m != NULL)
 				drbr_putback(ifp, br, m);
 			else
 				drbr_advance(ifp, br);
 			break;
 		}
 		drbr_advance(ifp, br);
 
 		enq++;
 		ETHER_BPF_MTAP(ifp, m);
 	}
 
 	/* Only kick the host when this pass actually enqueued something. */
 	if (enq > 0 && vtnet_txq_notify(txq) != 0) {
 		if (tries++ < VTNET_NOTIFY_RETRIES)
 			goto again;
 
 		txq->vtntx_stats.vtxs_rescheduled++;
 		taskqueue_enqueue(txq->vtntx_tq, &txq->vtntx_intrtask);
 	}
 
 	return (0);
 }
 
 /*
  * Multiqueue transmit entry point. Picks a Tx queue for the frame - by
  * flow id when the mbuf carries a hash type, otherwise by the current
  * CPU - and transmits on it. If the queue lock cannot be taken without
  * blocking, the frame is placed on the queue's buf_ring and the deferred
  * start task is scheduled instead.
  *
  * Returns the result of the transmit attempt or of drbr_enqueue().
  */
 static int
 vtnet_txq_mq_start(if_t ifp, struct mbuf *m)
 {
 	struct vtnet_softc *sc = if_getsoftc(ifp);
 	struct vtnet_txq *txq;
 	int error, i, npairs;
 
 	npairs = sc->vtnet_act_vq_pairs;
 
 	if (M_HASHTYPE_GET(m) == M_HASHTYPE_NONE)
 		i = curcpu % npairs;
 	else
 		i = m->m_pkthdr.flowid % npairs;
 
 	txq = &sc->vtnet_txqs[i];
 
 	if (VTNET_TXQ_TRYLOCK(txq) == 0) {
 		/* Lock not acquired: queue the frame and defer the work. */
 		error = drbr_enqueue(ifp, txq->vtntx_br, m);
 		taskqueue_enqueue(txq->vtntx_tq, &txq->vtntx_defrtask);
 		return (error);
 	}
 
 	error = vtnet_txq_mq_start_locked(txq, m);
 	VTNET_TXQ_UNLOCK(txq);
 
 	return (error);
 }
 
 /*
  * Taskqueue handler for deferred transmit: with the Tx queue lock held,
  * restart transmission if the queue's buf_ring holds any frames.
  */
 static void
 vtnet_txq_tq_deferred(void *xtxq, int pending __unused)
 {
 	struct vtnet_txq *txq = xtxq;
 	struct vtnet_softc *sc = txq->vtntx_sc;
 
 	VTNET_TXQ_LOCK(txq);
 	if (!drbr_empty(sc->vtnet_ifp, txq->vtntx_br))
 		vtnet_txq_mq_start_locked(txq, NULL);
 	VTNET_TXQ_UNLOCK(txq);
 }
 
 #endif /* VTNET_LEGACY_TX */
 
 /*
  * Restart transmission on a Tx queue if frames are waiting, using the
  * legacy or the multiqueue path depending on VTNET_LEGACY_TX.
  */
 static void
 vtnet_txq_start(struct vtnet_txq *txq)
 {
 	if_t ifp = txq->vtntx_sc->vtnet_ifp;
 
 #ifdef VTNET_LEGACY_TX
 	if (if_sendq_empty(ifp))
 		return;
 	vtnet_start_locked(txq, ifp);
 #else
 	if (drbr_empty(ifp, txq->vtntx_br))
 		return;
 	vtnet_txq_mq_start_locked(txq, NULL);
 #endif
 }
 
 /*
  * Taskqueue handler for Tx completion work: with the Tx queue lock held
  * and the interface running, reclaim completed frames and restart
  * transmission.
  */
 static void
 vtnet_txq_tq_intr(void *xtxq, int pending __unused)
 {
 	struct vtnet_txq *txq = xtxq;
 	if_t ifp = txq->vtntx_sc->vtnet_ifp;
 
 	VTNET_TXQ_LOCK(txq);
 	if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) != 0) {
 		vtnet_txq_eof(txq);
 		vtnet_txq_start(txq);
 	}
 	VTNET_TXQ_UNLOCK(txq);
 }
 
 /*
  * Reclaim frames the host has finished with: update the per-queue output
  * counters, free each mbuf chain and release its tx header. The watchdog
  * is cleared once the virtqueue is empty. The Tx queue lock must be held.
  *
  * Returns the number of frames reclaimed.
  */
 static int
 vtnet_txq_eof(struct vtnet_txq *txq)
 {
 	struct virtqueue *vq = txq->vtntx_vq;
 	struct vtnet_tx_header *txhdr;
 	struct mbuf *m;
 	int deq;
 
 	VTNET_TXQ_LOCK_ASSERT(txq);
 
 	for (deq = 0; (txhdr = virtqueue_dequeue(vq, NULL)) != NULL; deq++) {
 		m = txhdr->vth_mbuf;
 
 		txq->vtntx_stats.vtxs_opackets++;
 		txq->vtntx_stats.vtxs_obytes += m->m_pkthdr.len;
 		if ((m->m_flags & M_MCAST) != 0)
 			txq->vtntx_stats.vtxs_omcasts++;
 
 		m_freem(m);
 		uma_zfree(vtnet_tx_header_zone, txhdr);
 	}
 
 	/* Nothing outstanding any more, so nothing can time out. */
 	if (virtqueue_empty(vq))
 		txq->vtntx_watchdog = 0;
 
 	return (deq);
 }
 
 /*
  * Tx virtqueue interrupt handler: reclaim completed frames and restart
  * transmission on the queue, provided the queue is active and the
  * interface is running.
  */
 static void
 vtnet_tx_vq_intr(void *xtxq)
 {
 	struct vtnet_txq *txq = xtxq;
 	struct vtnet_softc *sc = txq->vtntx_sc;
 	if_t ifp = sc->vtnet_ifp;
 
 	if (__predict_false(txq->vtntx_id >= sc->vtnet_act_vq_pairs)) {
 		/*
 		 * Ignore this interrupt. Either this is a spurious interrupt
 		 * or multiqueue without per-VQ MSIX so every queue needs to
 		 * be polled (a brain dead configuration we could try harder
 		 * to avoid).
 		 */
 		vtnet_txq_disable_intr(txq);
 		return;
 	}
 
 #ifdef DEV_NETMAP
 	/* Anything other than NM_IRQ_PASS ends the handler here. */
 	if (netmap_tx_irq(ifp, txq->vtntx_id) != NM_IRQ_PASS)
 		return;
 #endif /* DEV_NETMAP */
 
 	VTNET_TXQ_LOCK(txq);
 	if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) != 0) {
 		vtnet_txq_eof(txq);
 		vtnet_txq_start(txq);
 	}
 	VTNET_TXQ_UNLOCK(txq);
 }
 
 /*
  * Restart transmission on every active Tx queue, taking each queue's lock
  * in turn. The core lock must be held.
  */
 static void
 vtnet_tx_start_all(struct vtnet_softc *sc)
 {
 	struct vtnet_txq *txq;
 	int i;
 
 	VTNET_CORE_LOCK_ASSERT(sc);
 
 	i = 0;
 	while (i < sc->vtnet_act_vq_pairs) {
 		txq = &sc->vtnet_txqs[i];
 
 		VTNET_TXQ_LOCK(txq);
 		vtnet_txq_start(txq);
 		VTNET_TXQ_UNLOCK(txq);
 
 		i++;
 	}
 }
 
 #ifndef VTNET_LEGACY_TX
 /*
  * Discard every frame waiting in the buf_ring of each active Tx queue,
  * then flush the interface's own send queue via if_qflush().
  */
 static void
 vtnet_qflush(if_t ifp)
 {
 	struct vtnet_softc *sc = if_getsoftc(ifp);
 	struct vtnet_txq *txq;
 	struct mbuf *m;
 	int i;
 
 	for (i = 0; i < sc->vtnet_act_vq_pairs; i++) {
 		txq = &sc->vtnet_txqs[i];
 
 		VTNET_TXQ_LOCK(txq);
 		for (;;) {
 			m = buf_ring_dequeue_sc(txq->vtntx_br);
 			if (m == NULL)
 				break;
 			m_freem(m);
 		}
 		VTNET_TXQ_UNLOCK(txq);
 	}
 
 	if_qflush(ifp);
 }
 #endif
 
 /*
  * Per-queue Tx watchdog, run from vtnet_tick(). vtntx_watchdog is a
  * countdown: 0 means the watchdog is not armed, otherwise it is
  * decremented on each call.
  *
  * Returns 1 (after logging a message) when the countdown reaches zero,
  * meaning the queue timed out; returns 0 otherwise.
  */
 static int
 vtnet_watchdog(struct vtnet_txq *txq)
 {
 	if_t ifp;
 
 	ifp = txq->vtntx_sc->vtnet_ifp;
 
 	VTNET_TXQ_LOCK(txq);
 	if (txq->vtntx_watchdog == 1) {
 		/*
 		 * Only drain completed frames if the watchdog is about to
 		 * expire. If any frames were drained, there may be enough
 		 * free descriptors now available to transmit queued frames.
 		 * In that case, the timer will immediately be decremented
 		 * below, but the timeout is generous enough that should not
 		 * be a problem.
 		 */
 		if (vtnet_txq_eof(txq) != 0)
 			vtnet_txq_start(txq);
 	}
 
 	/*
 	 * Not armed, or still non-zero after this call's decrement: no
 	 * timeout. Note the decrement only happens when the counter was
 	 * non-zero.
 	 */
 	if (txq->vtntx_watchdog == 0 || --txq->vtntx_watchdog) {
 		VTNET_TXQ_UNLOCK(txq);
 		return (0);
 	}
 	VTNET_TXQ_UNLOCK(txq);
 
 	if_printf(ifp, "watchdog timeout on queue %d\n", txq->vtntx_id);
 	return (1);
 }
 
 /*
  * Sum the per-queue Rx and Tx statistics of every allocated queue pair
  * (vtnet_max_vq_pairs of them) into *rxacc and *txacc. Both accumulators
  * are zeroed first, so any field not summed below reads back as zero.
  */
 static void
 vtnet_accum_stats(struct vtnet_softc *sc, struct vtnet_rxq_stats *rxacc,
     struct vtnet_txq_stats *txacc)
 {
 
 	bzero(rxacc, sizeof(struct vtnet_rxq_stats));
 	bzero(txacc, sizeof(struct vtnet_txq_stats));
 
 	for (int i = 0; i < sc->vtnet_max_vq_pairs; i++) {
 		struct vtnet_rxq_stats *rxst;
 		struct vtnet_txq_stats *txst;
 
 		rxst = &sc->vtnet_rxqs[i].vtnrx_stats;
 		rxacc->vrxs_ipackets += rxst->vrxs_ipackets;
 		rxacc->vrxs_ibytes += rxst->vrxs_ibytes;
 		rxacc->vrxs_iqdrops += rxst->vrxs_iqdrops;
 		/* Reported by vtnet_get_counter() as IFCOUNTER_IERRORS. */
 		rxacc->vrxs_ierrors += rxst->vrxs_ierrors;
 		rxacc->vrxs_csum += rxst->vrxs_csum;
 		rxacc->vrxs_csum_failed += rxst->vrxs_csum_failed;
 		rxacc->vrxs_rescheduled += rxst->vrxs_rescheduled;
 
 		txst = &sc->vtnet_txqs[i].vtntx_stats;
 		txacc->vtxs_opackets += txst->vtxs_opackets;
 		txacc->vtxs_obytes += txst->vtxs_obytes;
 		/* Reported by vtnet_get_counter() as IFCOUNTER_OMCASTS. */
 		txacc->vtxs_omcasts += txst->vtxs_omcasts;
 		txacc->vtxs_csum += txst->vtxs_csum;
 		txacc->vtxs_tso += txst->vtxs_tso;
 		txacc->vtxs_rescheduled += txst->vtxs_rescheduled;
 	}
 }
 
 /*
  * Interface counter callback. Counters the driver tracks per queue are
  * answered from the accumulated queue statistics; everything else is
  * delegated to if_get_counter_default().
  */
 static uint64_t
 vtnet_get_counter(if_t ifp, ift_counter cnt)
 {
 	struct vtnet_rxq_stats rxaccum;
 	struct vtnet_txq_stats txaccum;
 	uint64_t value;
 
 	vtnet_accum_stats(if_getsoftc(ifp), &rxaccum, &txaccum);
 
 	switch (cnt) {
 	case IFCOUNTER_IPACKETS:
 		value = rxaccum.vrxs_ipackets;
 		break;
 	case IFCOUNTER_IQDROPS:
 		value = rxaccum.vrxs_iqdrops;
 		break;
 	case IFCOUNTER_IERRORS:
 		value = rxaccum.vrxs_ierrors;
 		break;
 	case IFCOUNTER_OPACKETS:
 		value = txaccum.vtxs_opackets;
 		break;
 #ifndef VTNET_LEGACY_TX
 	case IFCOUNTER_OBYTES:
 		value = txaccum.vtxs_obytes;
 		break;
 	case IFCOUNTER_OMCASTS:
 		value = txaccum.vtxs_omcasts;
 		break;
 #endif
 	default:
 		value = if_get_counter_default(ifp, cnt);
 		break;
 	}
 
 	return (value);
 }
 
 /*
  * Periodic callout: run the Tx watchdog on every active queue. If any
  * queue timed out the interface is reinitialized; otherwise the callout
  * is rescheduled one second (hz ticks) out. The core lock must be held.
  */
 static void
 vtnet_tick(void *xsc)
 {
 	struct vtnet_softc *sc = xsc;
 	if_t ifp = sc->vtnet_ifp;
 	int i, timedout;
 
 	VTNET_CORE_LOCK_ASSERT(sc);
 
 	timedout = 0;
 	for (i = 0; i < sc->vtnet_act_vq_pairs; i++)
 		timedout |= vtnet_watchdog(&sc->vtnet_txqs[i]);
 
 	if (timedout == 0) {
 		callout_schedule(&sc->vtnet_tick_ch, hz);
 		return;
 	}
 
 	/* Clear RUNNING so that vtnet_init_locked() does a full reinit. */
 	if_setdrvflagbits(ifp, 0, IFF_DRV_RUNNING);
 	vtnet_init_locked(sc, 0);
 }
 
 /*
  * Start one taskqueue thread for the Rx and the Tx side of every
  * requested queue pair. Failures are logged and otherwise ignored.
  */
 static void
 vtnet_start_taskqueues(struct vtnet_softc *sc)
 {
 	device_t dev = sc->vtnet_dev;
 	struct vtnet_rxq *rxq;
 	struct vtnet_txq *txq;
 	int error, i;
 
 	/*
 	 * Errors here are very difficult to recover from - we cannot
 	 * easily fail because, if this is during boot, we will hang
 	 * when freeing any successfully started taskqueues because
 	 * the scheduler isn't up yet.
 	 *
 	 * Most drivers just ignore the return value - it only fails
 	 * with ENOMEM so an error is not likely.
 	 */
 	for (i = 0; i < sc->vtnet_req_vq_pairs; i++) {
 		rxq = &sc->vtnet_rxqs[i];
 		txq = &sc->vtnet_txqs[i];
 
 		error = taskqueue_start_threads(&rxq->vtnrx_tq, 1, PI_NET,
 		    "%s rxq %d", device_get_nameunit(dev), rxq->vtnrx_id);
 		if (error != 0)
 			device_printf(dev, "failed to start rx taskq %d\n",
 			    rxq->vtnrx_id);
 
 		error = taskqueue_start_threads(&txq->vtntx_tq, 1, PI_NET,
 		    "%s txq %d", device_get_nameunit(dev), txq->vtntx_id);
 		if (error != 0)
 			device_printf(dev, "failed to start tx taskq %d\n",
 			    txq->vtntx_id);
 	}
 }
 
 /*
  * Free the Rx and Tx taskqueue of every allocated queue pair and clear
  * the pointers so a repeated call is harmless.
  */
 static void
 vtnet_free_taskqueues(struct vtnet_softc *sc)
 {
 	struct vtnet_rxq *rxq;
 	struct vtnet_txq *txq;
 	int i;
 
 	i = 0;
 	while (i < sc->vtnet_max_vq_pairs) {
 		rxq = &sc->vtnet_rxqs[i];
 		if (rxq->vtnrx_tq != NULL) {
 			taskqueue_free(rxq->vtnrx_tq);
 			rxq->vtnrx_tq = NULL;
 		}
 
 		txq = &sc->vtnet_txqs[i];
 		if (txq->vtntx_tq != NULL) {
 			taskqueue_free(txq->vtntx_tq);
 			txq->vtntx_tq = NULL;
 		}
 
 		i++;
 	}
 }
 
 /*
  * Wait for the pending tasks of every allocated queue pair to finish:
  * the Rx interrupt task, the Tx interrupt task and, on the multiqueue
  * transmit path, the Tx deferred-start task.
  */
 static void
 vtnet_drain_taskqueues(struct vtnet_softc *sc)
 {
 	struct vtnet_rxq *rxq;
 	struct vtnet_txq *txq;
 	int i;
 
 	for (i = 0; i < sc->vtnet_max_vq_pairs; i++) {
 		rxq = &sc->vtnet_rxqs[i];
 		if (rxq->vtnrx_tq != NULL)
 			taskqueue_drain(rxq->vtnrx_tq, &rxq->vtnrx_intrtask);
 
 		txq = &sc->vtnet_txqs[i];
 		if (txq->vtntx_tq == NULL)
 			continue;
 
 		taskqueue_drain(txq->vtntx_tq, &txq->vtntx_intrtask);
 #ifndef VTNET_LEGACY_TX
 		taskqueue_drain(txq->vtntx_tq, &txq->vtntx_defrtask);
 #endif
 	}
 }
 
 /*
  * Release the buffers still held by the Rx and Tx virtqueue of every
  * allocated queue pair.
  */
 static void
 vtnet_drain_rxtx_queues(struct vtnet_softc *sc)
 {
 	int i;
 
 	for (i = 0; i < sc->vtnet_max_vq_pairs; i++) {
 		vtnet_rxq_free_mbufs(&sc->vtnet_rxqs[i]);
 		vtnet_txq_free_mbufs(&sc->vtnet_txqs[i]);
 	}
 }
 
 /*
  * Synchronize with anything currently running under a per-queue lock by
  * acquiring and immediately releasing every Rx and Tx queue mutex. The
  * core lock must be held.
  */
 static void
 vtnet_stop_rendezvous(struct vtnet_softc *sc)
 {
 	struct vtnet_rxq *rxq;
 	struct vtnet_txq *txq;
 	int i;
 
 	VTNET_CORE_LOCK_ASSERT(sc);
 
 	/*
 	 * Lock and unlock the per-queue mutex so we know the stop
 	 * state is visible. Doing only the active queues should be
 	 * sufficient, but it does not cost much extra to do all the
 	 * queues.
 	 */
 	for (i = 0; i < sc->vtnet_max_vq_pairs; i++) {
 		rxq = &sc->vtnet_rxqs[i];
 		txq = &sc->vtnet_txqs[i];
 
 		VTNET_RXQ_LOCK(rxq);
 		VTNET_RXQ_UNLOCK(rxq);
 
 		VTNET_TXQ_LOCK(txq);
 		VTNET_TXQ_UNLOCK(txq);
 	}
 }
 
 /*
  * Bring the interface down: clear IFF_DRV_RUNNING, mark the link
  * inactive, stop the tick callout, stop the device with virtio_stop(),
  * and release every buffer still sitting in the Rx/Tx virtqueues.
  * vtnet_act_vq_pairs is left at 1. The core lock must be held.
  */
 static void
 vtnet_stop(struct vtnet_softc *sc)
 {
 	device_t dev;
 	if_t ifp;
 
 	dev = sc->vtnet_dev;
 	ifp = sc->vtnet_ifp;
 
 	VTNET_CORE_LOCK_ASSERT(sc);
 
 	/* Publish the stopped state before touching the hardware. */
 	if_setdrvflagbits(ifp, 0, IFF_DRV_RUNNING);
 	sc->vtnet_link_active = 0;
 	callout_stop(&sc->vtnet_tick_ch);
 
 	/* Only advisory. */
 	vtnet_disable_interrupts(sc);
 
 #ifdef DEV_NETMAP
 	/* Stop any pending txsync/rxsync and disable them. */
 	netmap_disable_all_rings(ifp);
 #endif /* DEV_NETMAP */
 
 	/*
 	 * Stop the host adapter. This resets it to the pre-initialized
 	 * state. It will not generate any interrupts until after it is
 	 * reinitialized.
 	 */
 	virtio_stop(dev);
 	/* Cycle the per-queue locks before the queues are drained. */
 	vtnet_stop_rendezvous(sc);
 
 	vtnet_drain_rxtx_queues(sc);
 	sc->vtnet_act_vq_pairs = 1;
 }
 
 /*
  * Renegotiate features with the host, starting from the originally
  * negotiated set and dropping the receive features that are currently
  * disabled in if_capenable.
  *
  * Returns 0 on success, in which case sc->vtnet_features holds the new
  * set; otherwise returns the virtio_reinit() error.
  */
 static int
 vtnet_virtio_reinit(struct vtnet_softc *sc)
 {
 	device_t dev = sc->vtnet_dev;
 	if_t ifp = sc->vtnet_ifp;
 	uint64_t features = sc->vtnet_negotiated_features;
 	int error;
 
 	/*
 	 * Re-negotiate with the host, removing any disabled receive
 	 * features. Transmit features are disabled only on our side
 	 * via if_capenable and if_hwassist.
 	 */
 
 	/* No Rx checksum offload at all also rules out the LRO features. */
 	if ((if_getcapenable(ifp) & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)) == 0)
 		features &= ~(VIRTIO_NET_F_GUEST_CSUM | VTNET_LRO_FEATURES);
 
 	if ((if_getcapenable(ifp) & IFCAP_LRO) == 0)
 		features &= ~VTNET_LRO_FEATURES;
 
 	if ((if_getcapenable(ifp) & IFCAP_VLAN_HWFILTER) == 0)
 		features &= ~VIRTIO_NET_F_CTRL_VLAN;
 
 	error = virtio_reinit(dev, features);
 	if (error == 0) {
 		sc->vtnet_features = features;
 		virtio_reinit_complete(dev);
 	} else
 		device_printf(dev, "virtio reinit error %d\n", error);
 
 	return (error);
 }
 
 /*
  * Program the host receive filters: the promiscuous/all-multicast modes
  * and MAC table when VTNET_FLAG_CTRL_RX is set, and the VLAN filter when
  * IFCAP_VLAN_HWFILTER is enabled.
  */
 static void
 vtnet_init_rx_filters(struct vtnet_softc *sc)
 {
 	if_t ifp = sc->vtnet_ifp;
 
 	if ((sc->vtnet_flags & VTNET_FLAG_CTRL_RX) != 0) {
 		vtnet_rx_filter(sc);
 		vtnet_rx_filter_mac(sc);
 	}
 
 	if ((if_getcapenable(ifp) & IFCAP_VLAN_HWFILTER) != 0)
 		vtnet_rx_filter_vlan(sc);
 }
 
 /*
  * Choose the receive cluster size for the current MTU, work out how many
  * mbufs make up one receive buffer, and populate every active Rx queue.
  *
  * Returns 0 on success or the vtnet_rxq_populate() error of the first
  * queue that could not be filled.
  */
 static int
 vtnet_init_rx_queues(struct vtnet_softc *sc)
 {
 	device_t dev = sc->vtnet_dev;
 	if_t ifp = sc->vtnet_ifp;
 	struct vtnet_rxq *rxq;
 	int clustersz, error, i;
 
 	clustersz = vtnet_rx_cluster_size(sc, if_getmtu(ifp));
 	sc->vtnet_rx_clustersz = clustersz;
 
 	if ((sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG) == 0)
 		sc->vtnet_rx_nmbufs = 1;
 	else {
 		/* Enough clusters to hold the header plus a maximal frame. */
 		sc->vtnet_rx_nmbufs = howmany(sizeof(struct vtnet_rx_header) +
 		    VTNET_MAX_RX_SIZE, clustersz);
 		KASSERT(sc->vtnet_rx_nmbufs < sc->vtnet_rx_nsegs,
 		    ("%s: too many rx mbufs %d for %d segments", __func__,
 		    sc->vtnet_rx_nmbufs, sc->vtnet_rx_nsegs));
 	}
 
 	for (i = 0; i < sc->vtnet_act_vq_pairs; i++) {
 		rxq = &sc->vtnet_rxqs[i];
 
 		/* Hold the lock to satisfy asserts. */
 		VTNET_RXQ_LOCK(rxq);
 		error = vtnet_rxq_populate(rxq);
 		VTNET_RXQ_UNLOCK(rxq);
 
 		if (error != 0) {
 			device_printf(dev, "cannot populate Rx queue %d\n", i);
 			return (error);
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Reset the software state of every active Tx queue: disarm the watchdog
  * and recompute the interrupt threshold (and reset the netmap ring when
  * built with DEV_NETMAP). Always returns 0.
  */
 static int
 vtnet_init_tx_queues(struct vtnet_softc *sc)
 {
 	struct vtnet_txq *txq;
 	int i;
 
 	i = 0;
 	while (i < sc->vtnet_act_vq_pairs) {
 		txq = &sc->vtnet_txqs[i];
 		txq->vtntx_watchdog = 0;
 		txq->vtntx_intr_threshold = vtnet_txq_intr_threshold(txq);
 #ifdef DEV_NETMAP
 		netmap_reset(NA(sc->vtnet_ifp), NR_TX, i, 0);
 #endif /* DEV_NETMAP */
 		i++;
 	}
 
 	return (0);
 }
 
 /*
  * Initialize the Rx queues and then, if that succeeded, the Tx queues.
  * Returns 0 on success or the first error encountered.
  */
 static int
 vtnet_init_rxtx_queues(struct vtnet_softc *sc)
 {
 	int error;
 
 	error = vtnet_init_rx_queues(sc);
 	if (error == 0)
 		error = vtnet_init_tx_queues(sc);
 
 	return (error);
 }
 
 /*
  * Decide how many queue pairs are active. Without VTNET_FLAG_MQ this is
  * always 1; otherwise the host is asked to enable the requested number,
  * falling back to 1 (with a message) if that command fails.
  */
 static void
 vtnet_set_active_vq_pairs(struct vtnet_softc *sc)
 {
 	int npairs;
 
 	npairs = 1;
 
 	if ((sc->vtnet_flags & VTNET_FLAG_MQ) != 0) {
 		npairs = sc->vtnet_req_vq_pairs;
 
 		if (vtnet_ctrl_mq_cmd(sc, npairs) != 0) {
 			device_printf(sc->vtnet_dev,
 			    "cannot set active queue pairs to %d, "
 			    "falling back to 1 queue pair\n", npairs);
 			npairs = 1;
 		}
 	}
 
 	sc->vtnet_act_vq_pairs = npairs;
 }
 
 /*
  * Push the currently enabled receive offloads (checksum and, unless
  * software LRO is in use, LRO) to the host via the guest-offloads control
  * command. On success sc->vtnet_features is updated; on failure a message
  * is logged and a running interface is reinitialized. The core lock must
  * be held.
  */
 static void
 vtnet_update_rx_offloads(struct vtnet_softc *sc)
 {
 	if_t ifp = sc->vtnet_ifp;
 	uint64_t features = sc->vtnet_features;
 	int error;
 
 	VTNET_CORE_LOCK_ASSERT(sc);
 
 	if (if_getcapabilities(ifp) & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)) {
 		if (if_getcapenable(ifp) & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6))
 			features |= VIRTIO_NET_F_GUEST_CSUM;
 		else
 			features &= ~VIRTIO_NET_F_GUEST_CSUM;
 	}
 
 	if ((if_getcapabilities(ifp) & IFCAP_LRO) &&
 	    !vtnet_software_lro(sc)) {
 		if (if_getcapenable(ifp) & IFCAP_LRO)
 			features |= VTNET_LRO_FEATURES;
 		else
 			features &= ~VTNET_LRO_FEATURES;
 	}
 
 	/* Only the guest offload bits are sent to the host. */
 	error = vtnet_ctrl_guest_offloads(sc,
 	    features & (VIRTIO_NET_F_GUEST_CSUM | VIRTIO_NET_F_GUEST_TSO4 |
 	    VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_ECN |
 	    VIRTIO_NET_F_GUEST_UFO));
 	if (error == 0) {
 		sc->vtnet_features = features;
 		return;
 	}
 
 	device_printf(sc->vtnet_dev,
 	    "%s: cannot update Rx features\n", __func__);
 	if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
 		/* Clear RUNNING so vtnet_init_locked() does a full reinit. */
 		if_setdrvflagbits(ifp, 0, IFF_DRV_RUNNING);
 		vtnet_init_locked(sc, 0);
 	}
 }
 
 /*
  * Reprogram the device after a stop: renegotiate features, restore the
  * MAC address, queue-pair count and receive filters, rebuild if_hwassist
  * from the enabled transmit capabilities, and initialize the queues.
  *
  * Returns 0 on success or the first error encountered.
  */
 static int
 vtnet_reinit(struct vtnet_softc *sc)
 {
 	if_t ifp = sc->vtnet_ifp;
 	int error;
 
 	/* Pick up the interface's current link-level address. */
 	bcopy(if_getlladdr(ifp), sc->vtnet_hwaddr, ETHER_ADDR_LEN);
 
 	error = vtnet_virtio_reinit(sc);
 	if (error != 0)
 		return (error);
 
 	vtnet_set_macaddr(sc);
 	vtnet_set_active_vq_pairs(sc);
 
 	if ((sc->vtnet_flags & VTNET_FLAG_CTRL_VQ) != 0)
 		vtnet_init_rx_filters(sc);
 
 	/* Start from nothing and add back each enabled Tx offload. */
 	if_sethwassist(ifp, 0);
 	if ((if_getcapenable(ifp) & IFCAP_TXCSUM) != 0)
 		if_sethwassistbits(ifp, VTNET_CSUM_OFFLOAD, 0);
 	if ((if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6) != 0)
 		if_sethwassistbits(ifp, VTNET_CSUM_OFFLOAD_IPV6, 0);
 	if ((if_getcapenable(ifp) & IFCAP_TSO4) != 0)
 		if_sethwassistbits(ifp, CSUM_IP_TSO, 0);
 	if ((if_getcapenable(ifp) & IFCAP_TSO6) != 0)
 		if_sethwassistbits(ifp, CSUM_IP6_TSO, 0);
 
 	return (vtnet_init_rxtx_queues(sc));
 }
 
 /*
  * (Re)initialize the interface. Does nothing if IFF_DRV_RUNNING is
  * already set, so callers wanting a full reinit clear that flag first.
  * Otherwise the device is stopped, reprogrammed, marked running, and the
  * tick callout is started. If reprogramming fails the device is left
  * stopped. The core lock must be held.
  *
  * With DEV_NETMAP, 'init_mode' selects whether the netmap native flags
  * are set (VTNET_INIT_NETMAP_ENTER) or cleared (VTNET_INIT_NETMAP_EXIT)
  * while the device is stopped; any other value leaves them alone.
  */
 static void
 vtnet_init_locked(struct vtnet_softc *sc, int init_mode)
 {
 	if_t ifp;
 
 	ifp = sc->vtnet_ifp;
 
 	VTNET_CORE_LOCK_ASSERT(sc);
 
 	if (if_getdrvflags(ifp) & IFF_DRV_RUNNING)
 		return;
 
 	vtnet_stop(sc);
 
 #ifdef DEV_NETMAP
 	/* Once stopped we can update the netmap flags, if necessary. */
 	switch (init_mode) {
 	case VTNET_INIT_NETMAP_ENTER:
 		nm_set_native_flags(NA(ifp));
 		break;
 	case VTNET_INIT_NETMAP_EXIT:
 		nm_clear_native_flags(NA(ifp));
 		break;
 	}
 #endif /* DEV_NETMAP */
 
 	if (vtnet_reinit(sc) != 0) {
 		/* Undo any partial setup and stay down. */
 		vtnet_stop(sc);
 		return;
 	}
 
 	if_setdrvflagbits(ifp, IFF_DRV_RUNNING, 0);
 	vtnet_update_link_status(sc);
 	vtnet_enable_interrupts(sc);
 	callout_reset(&sc->vtnet_tick_ch, hz, vtnet_tick, sc);
 
 #ifdef DEV_NETMAP
 	/* Re-enable txsync/rxsync. */
 	netmap_enable_all_rings(ifp);
 #endif /* DEV_NETMAP */
 }
 
 /*
  * Initialize the interface with the core lock held; 'xsc' is the softc.
  */
 static void
 vtnet_init(void *xsc)
 {
 	struct vtnet_softc *sc = xsc;
 
 	VTNET_CORE_LOCK(sc);
 	vtnet_init_locked(sc, 0);
 	VTNET_CORE_UNLOCK(sc);
 }
 
 /*
  * Teardown hook for the control virtqueue. Nothing is actually released
  * here; the function only asserts that the queue is empty.
  */
 static void
 vtnet_free_ctrl_vq(struct vtnet_softc *sc)
 {
 
 	/*
 	 * The control virtqueue is only polled and therefore it should
 	 * already be empty.
 	 */
 	KASSERT(virtqueue_empty(sc->vtnet_ctrl_vq),
 	    ("%s: ctrl vq %p not empty", __func__, sc->vtnet_ctrl_vq));
 }
 
 /*
  * Submit one command on the control virtqueue and poll until the host
  * has processed it. 'sg' holds 'readable' device-readable segments
  * followed by 'writable' device-writable ones; 'cookie' identifies the
  * request. The core lock must be held.
  *
  * Nothing is submitted when the queue is not empty or the enqueue fails.
  * No status is returned, so callers inspect their own ack byte.
  */
 static void
 vtnet_exec_ctrl_cmd(struct vtnet_softc *sc, void *cookie,
     struct sglist *sg, int readable, int writable)
 {
 	struct virtqueue *vq = sc->vtnet_ctrl_vq;
 
 	MPASS(sc->vtnet_flags & VTNET_FLAG_CTRL_VQ);
 	VTNET_CORE_LOCK_ASSERT(sc);
 
 	if (!virtqueue_empty(vq))
 		return;
 
 	if (virtqueue_enqueue(vq, cookie, sg, readable, writable) != 0)
 		return;
 
 	/*
 	 * Poll for the response, but the command is likely completed before
 	 * returning from the notify.
 	 */
 	virtqueue_notify(vq);
 	virtqueue_poll(vq, NULL);
 }
 
 /*
  * Ask the host to set the device MAC address to 'hwaddr' via the control
  * virtqueue (VIRTIO_NET_CTRL_MAC_ADDR_SET).
  *
  * Returns 0 if the host acknowledged with VIRTIO_NET_OK, EIO otherwise,
  * including when the command could not be submitted at all.
  */
 static int
 vtnet_ctrl_mac_cmd(struct vtnet_softc *sc, uint8_t *hwaddr)
 {
 	struct {
 		struct virtio_net_ctrl_hdr hdr __aligned(2);
 		uint8_t pad1;
 		uint8_t addr[ETHER_ADDR_LEN] __aligned(8);
 		uint8_t pad2;
 		uint8_t ack;
 	} s;
 	struct sglist_seg segs[3];
 	struct sglist sg;
 	int error;
 
 	MPASS(sc->vtnet_flags & VTNET_FLAG_CTRL_MAC);
 
 	/* Stays at "error" unless the host writes a different ack. */
 	s.ack = VIRTIO_NET_ERR;
 	s.hdr.class = VIRTIO_NET_CTRL_MAC;
 	s.hdr.cmd = VIRTIO_NET_CTRL_MAC_ADDR_SET;
 	bcopy(hwaddr, s.addr, ETHER_ADDR_LEN);
 
 	/* Header and address are readable; the ack is the writable tail. */
 	sglist_init(&sg, nitems(segs), segs);
 	error = sglist_append(&sg, &s.hdr, sizeof(s.hdr));
 	error |= sglist_append(&sg, s.addr, sizeof(s.addr));
 	error |= sglist_append(&sg, &s.ack, sizeof(s.ack));
 	MPASS(error == 0 && sg.sg_nseg == nitems(segs));
 
 	if (error == 0)
 		vtnet_exec_ctrl_cmd(sc, &s.ack, &sg, sg.sg_nseg - 1, 1);
 
 	if (s.ack != VIRTIO_NET_OK)
 		return (EIO);
 	return (0);
 }
 
 /*
  * Ask the host to apply the guest offload bitmap 'offloads' via the
  * control virtqueue (VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET).
  *
  * Returns 0 if the host acknowledged with VIRTIO_NET_OK, EIO otherwise,
  * including when the command could not be submitted at all.
  */
 static int
 vtnet_ctrl_guest_offloads(struct vtnet_softc *sc, uint64_t offloads)
 {
 	struct {
 		struct virtio_net_ctrl_hdr hdr __aligned(2);
 		uint8_t pad1;
 		uint64_t offloads __aligned(8);
 		uint8_t pad2;
 		uint8_t ack;
 	} s;
 	struct sglist_seg segs[3];
 	struct sglist sg;
 	int error;
 
 	MPASS(sc->vtnet_features & VIRTIO_NET_F_CTRL_GUEST_OFFLOADS);
 
 	/* Stays at "error" unless the host writes a different ack. */
 	s.ack = VIRTIO_NET_ERR;
 	s.hdr.class = VIRTIO_NET_CTRL_GUEST_OFFLOADS;
 	s.hdr.cmd = VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET;
 	s.offloads = vtnet_gtoh64(sc, offloads);
 
 	/* Header and bitmap are readable; the ack is the writable tail. */
 	sglist_init(&sg, nitems(segs), segs);
 	error = sglist_append(&sg, &s.hdr, sizeof(s.hdr));
 	error |= sglist_append(&sg, &s.offloads, sizeof(s.offloads));
 	error |= sglist_append(&sg, &s.ack, sizeof(s.ack));
 	MPASS(error == 0 && sg.sg_nseg == nitems(segs));
 
 	if (error == 0)
 		vtnet_exec_ctrl_cmd(sc, &s.ack, &sg, sg.sg_nseg - 1, 1);
 
 	if (s.ack != VIRTIO_NET_OK)
 		return (EIO);
 	return (0);
 }
 
 /*
  * Ask the host to enable 'npairs' queue pairs via the control virtqueue
  * (VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET).
  *
  * Returns 0 if the host acknowledged with VIRTIO_NET_OK, EIO otherwise,
  * including when the command could not be submitted at all.
  */
 static int
 vtnet_ctrl_mq_cmd(struct vtnet_softc *sc, uint16_t npairs)
 {
 	struct {
 		struct virtio_net_ctrl_hdr hdr __aligned(2);
 		uint8_t pad1;
 		struct virtio_net_ctrl_mq mq __aligned(2);
 		uint8_t pad2;
 		uint8_t ack;
 	} s;
 	struct sglist_seg segs[3];
 	struct sglist sg;
 	int error;
 
 	MPASS(sc->vtnet_flags & VTNET_FLAG_MQ);
 
 	/* Stays at "error" unless the host writes a different ack. */
 	s.ack = VIRTIO_NET_ERR;
 	s.hdr.class = VIRTIO_NET_CTRL_MQ;
 	s.hdr.cmd = VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET;
 	s.mq.virtqueue_pairs = vtnet_gtoh16(sc, npairs);
 
 	/* Header and payload are readable; the ack is the writable tail. */
 	sglist_init(&sg, nitems(segs), segs);
 	error = sglist_append(&sg, &s.hdr, sizeof(s.hdr));
 	error |= sglist_append(&sg, &s.mq, sizeof(s.mq));
 	error |= sglist_append(&sg, &s.ack, sizeof(s.ack));
 	MPASS(error == 0 && sg.sg_nseg == nitems(segs));
 
 	if (error == 0)
 		vtnet_exec_ctrl_cmd(sc, &s.ack, &sg, sg.sg_nseg - 1, 1);
 
 	if (s.ack != VIRTIO_NET_OK)
 		return (EIO);
 	return (0);
 }
 
 /*
  * Send a VIRTIO_NET_CTRL_RX class command 'cmd' with a single on/off
  * byte to the host via the control virtqueue.
  *
  * Returns 0 if the host acknowledged with VIRTIO_NET_OK, EIO otherwise,
  * including when the command could not be submitted at all.
  */
 static int
 vtnet_ctrl_rx_cmd(struct vtnet_softc *sc, uint8_t cmd, bool on)
 {
 	struct {
 		struct virtio_net_ctrl_hdr hdr __aligned(2);
 		uint8_t pad1;
 		uint8_t onoff;
 		uint8_t pad2;
 		uint8_t ack;
 	} s;
 	struct sglist_seg segs[3];
 	struct sglist sg;
 	int error;
 
 	MPASS(sc->vtnet_flags & VTNET_FLAG_CTRL_RX);
 
 	/* Stays at "error" unless the host writes a different ack. */
 	s.ack = VIRTIO_NET_ERR;
 	s.hdr.class = VIRTIO_NET_CTRL_RX;
 	s.hdr.cmd = cmd;
 	s.onoff = on;
 
 	/* Header and flag are readable; the ack is the writable tail. */
 	sglist_init(&sg, nitems(segs), segs);
 	error = sglist_append(&sg, &s.hdr, sizeof(s.hdr));
 	error |= sglist_append(&sg, &s.onoff, sizeof(s.onoff));
 	error |= sglist_append(&sg, &s.ack, sizeof(s.ack));
 	MPASS(error == 0 && sg.sg_nseg == nitems(segs));
 
 	if (error == 0)
 		vtnet_exec_ctrl_cmd(sc, &s.ack, &sg, sg.sg_nseg - 1, 1);
 
 	if (s.ack != VIRTIO_NET_OK)
 		return (EIO);
 	return (0);
 }
 
 /*
  * Turn host-side promiscuous mode on or off. Returns 0 on success or the
  * vtnet_ctrl_rx_cmd() error.
  */
 static int
 vtnet_set_promisc(struct vtnet_softc *sc, bool on)
 {
 	int error;
 
 	error = vtnet_ctrl_rx_cmd(sc, VIRTIO_NET_CTRL_RX_PROMISC, on);
 
 	return (error);
 }
 
 /*
  * Turn host-side all-multicast mode on or off. Returns 0 on success or
  * the vtnet_ctrl_rx_cmd() error.
  */
 static int
 vtnet_set_allmulti(struct vtnet_softc *sc, bool on)
 {
 	int error;
 
 	error = vtnet_ctrl_rx_cmd(sc, VIRTIO_NET_CTRL_RX_ALLMULTI, on);
 
 	return (error);
 }
 
 /*
  * Bring the host's promiscuous and all-multicast modes in line with the
  * interface's IFF_PROMISC and IFF_ALLMULTI flags, logging a message for
  * each mode that could not be changed. The core lock must be held.
  */
 static void
 vtnet_rx_filter(struct vtnet_softc *sc)
 {
 	device_t dev = sc->vtnet_dev;
 	if_t ifp = sc->vtnet_ifp;
 	const char *verb;
 
 	VTNET_CORE_LOCK_ASSERT(sc);
 
 	if (vtnet_set_promisc(sc, if_getflags(ifp) & IFF_PROMISC) != 0) {
 		verb = (if_getflags(ifp) & IFF_PROMISC) ? "enable" : "disable";
 		device_printf(dev, "cannot %s promiscuous mode\n", verb);
 	}
 
 	if (vtnet_set_allmulti(sc, if_getflags(ifp) & IFF_ALLMULTI) != 0) {
 		verb = (if_getflags(ifp) & IFF_ALLMULTI) ? "enable" : "disable";
 		device_printf(dev, "cannot %s all-multicast mode\n", verb);
 	}
 }
 
 /*
  * if_foreach_lladdr() callback: record one unicast link-level address in
  * the softc's MAC filter table at index 'ucnt'.
  *
  * The interface's own hardware address is skipped (returns 0). Any other
  * address returns 1 so it is counted, but it is only stored while the
  * table still has room; that lets the caller detect overflow from the
  * total.
  */
 static u_int
 vtnet_copy_ifaddr(void *arg, struct sockaddr_dl *sdl, u_int ucnt)
 {
 	struct vtnet_softc *sc = arg;
 
 	if (memcmp(LLADDR(sdl), sc->vtnet_hwaddr, ETHER_ADDR_LEN) == 0)
 		return (0);
 
 	if (ucnt >= VTNET_MAX_MAC_ENTRIES)
 		return (1);
 
 	bcopy(LLADDR(sdl), &sc->vtnet_mac_filter->vmf_unicast.macs[ucnt],
 	    ETHER_ADDR_LEN);
 
 	return (1);
 }
 
 /*
  * if_foreach_llmaddr() callback: record one multicast link-level address
  * in the filter table passed as 'arg' at index 'mcnt'.
  *
  * Always returns 1 so every address is counted, but an address is only
  * stored while the table still has room; that lets the caller detect
  * overflow from the total.
  */
 static u_int
 vtnet_copy_maddr(void *arg, struct sockaddr_dl *sdl, u_int mcnt)
 {
 	struct vtnet_mac_filter *filter = arg;
 
 	if (mcnt >= VTNET_MAX_MAC_ENTRIES)
 		return (1);
 
 	bcopy(LLADDR(sdl), &filter->vmf_multicast.macs[mcnt], ETHER_ADDR_LEN);
 
 	return (1);
 }
 
 /*
  * Program the host's MAC filter table with the interface's unicast and
  * multicast link-level addresses (VIRTIO_NET_CTRL_MAC_TABLE_SET).
  *
  * If either list has more than VTNET_MAX_MAC_ENTRIES addresses, that list
  * is sent empty and the corresponding catch-all mode (promiscuous or
  * all-multicast) is enabled instead. If both overflow, the table command
  * is skipped altogether. Failures are only logged. The core lock must be
  * held.
  */
 static void
 vtnet_rx_filter_mac(struct vtnet_softc *sc)
 {
 	struct virtio_net_ctrl_hdr hdr __aligned(2);
 	struct vtnet_mac_filter *filter;
 	struct sglist_seg segs[4];
 	struct sglist sg;
 	if_t ifp;
 	bool promisc, allmulti;
 	u_int ucnt, mcnt;
 	int error;
 	uint8_t ack;
 
 	ifp = sc->vtnet_ifp;
 	filter = sc->vtnet_mac_filter;
 	error = 0;
 
 	MPASS(sc->vtnet_flags & VTNET_FLAG_CTRL_RX);
 	VTNET_CORE_LOCK_ASSERT(sc);
 
 	/* Unicast MAC addresses: */
 	/* The callback counts every address but stores only what fits. */
 	ucnt = if_foreach_lladdr(ifp, vtnet_copy_ifaddr, sc);
 	promisc = (ucnt > VTNET_MAX_MAC_ENTRIES);
 
 	if (promisc) {
 		ucnt = 0;
 		if_printf(ifp, "more than %d MAC addresses assigned, "
 		    "falling back to promiscuous mode\n",
 		    VTNET_MAX_MAC_ENTRIES);
 	}
 
 	/* Multicast MAC addresses: */
 	mcnt = if_foreach_llmaddr(ifp, vtnet_copy_maddr, filter);
 	allmulti = (mcnt > VTNET_MAX_MAC_ENTRIES);
 
 	if (allmulti) {
 		mcnt = 0;
 		if_printf(ifp, "more than %d multicast MAC addresses "
 		    "assigned, falling back to all-multicast mode\n",
 		    VTNET_MAX_MAC_ENTRIES);
 	}
 
 	/* Both lists overflowed: there is no table worth sending. */
 	if (promisc && allmulti)
 		goto out;
 
 	filter->vmf_unicast.nentries = vtnet_gtoh32(sc, ucnt);
 	filter->vmf_multicast.nentries = vtnet_gtoh32(sc, mcnt);
 
 	hdr.class = VIRTIO_NET_CTRL_MAC;
 	hdr.cmd = VIRTIO_NET_CTRL_MAC_TABLE_SET;
 	/* Stays at "error" unless the host writes a different ack. */
 	ack = VIRTIO_NET_ERR;
 
 	/*
 	 * Four segments: header, unicast table, multicast table, ack.
 	 * Each table is sent as its 32-bit entry count followed by only
 	 * the entries in use.
 	 */
 	sglist_init(&sg, nitems(segs), segs);
 	error |= sglist_append(&sg, &hdr, sizeof(struct virtio_net_ctrl_hdr));
 	error |= sglist_append(&sg, &filter->vmf_unicast,
 	    sizeof(uint32_t) + ucnt * ETHER_ADDR_LEN);
 	error |= sglist_append(&sg, &filter->vmf_multicast,
 	    sizeof(uint32_t) + mcnt * ETHER_ADDR_LEN);
 	error |= sglist_append(&sg, &ack, sizeof(uint8_t));
 	MPASS(error == 0 && sg.sg_nseg == nitems(segs));
 
 	if (error == 0)
 		vtnet_exec_ctrl_cmd(sc, &ack, &sg, sg.sg_nseg - 1, 1);
 	if (ack != VIRTIO_NET_OK)
 		if_printf(ifp, "error setting host MAC filter table\n");
 
 out:
 	/* Enable the catch-all mode for whichever list overflowed. */
 	if (promisc != 0 && vtnet_set_promisc(sc, true) != 0)
 		if_printf(ifp, "cannot enable promiscuous mode\n");
 	if (allmulti != 0 && vtnet_set_allmulti(sc, true) != 0)
 		if_printf(ifp, "cannot enable all-multicast mode\n");
 }
 
 /*
  * Issue a VIRTIO_NET_CTRL_VLAN control command that adds (add != 0) or
  * deletes (add == 0) the VLAN `tag` in the device's filter.
  *
  * Returns 0 when the ack byte reads VIRTIO_NET_OK, otherwise EIO.  The ack
  * starts as VIRTIO_NET_ERR, so a command that could not be queued is also
  * reported as EIO.  Requires VTNET_FLAG_VLAN_FILTER.
  */
 static int
 vtnet_exec_vlan_filter(struct vtnet_softc *sc, int add, uint16_t tag)
 {
 	struct sglist_seg segs[3];
 	struct sglist sg;
 	/*
 	 * NOTE(review): pad1 and pad2 are never read or written here;
 	 * presumably they keep hdr, tag and ack apart in memory -- confirm.
 	 */
 	struct {
 		struct virtio_net_ctrl_hdr hdr __aligned(2);
 		uint8_t pad1;
 		uint16_t tag __aligned(2);
 		uint8_t pad2;
 		uint8_t ack;
 	} s;
 	int error;
 
 	error = 0;
 	MPASS(sc->vtnet_flags & VTNET_FLAG_VLAN_FILTER);
 
 	s.hdr.class = VIRTIO_NET_CTRL_VLAN;
 	s.hdr.cmd = add ? VIRTIO_NET_CTRL_VLAN_ADD : VIRTIO_NET_CTRL_VLAN_DEL;
 	s.tag = vtnet_gtoh16(sc, tag);
 	s.ack = VIRTIO_NET_ERR;
 
 	/* Three segments: command header, VLAN tag and the ack byte. */
 	sglist_init(&sg, nitems(segs), segs);
 	error |= sglist_append(&sg, &s.hdr, sizeof(struct virtio_net_ctrl_hdr));
 	error |= sglist_append(&sg, &s.tag, sizeof(uint16_t));
 	error |= sglist_append(&sg, &s.ack, sizeof(uint8_t));
 	MPASS(error == 0 && sg.sg_nseg == nitems(segs));
 
 	if (error == 0)
 		vtnet_exec_ctrl_cmd(sc, &s.ack, &sg, sg.sg_nseg - 1, 1);
 
 	return (s.ack == VIRTIO_NET_OK ? 0 : EIO);
 }
 
 /*
  * Walk the softc's VLAN bitmap and ask the device to enable its filter for
  * every VLAN whose bit is set, logging each tag the device refuses.
  *
  * Requires VTNET_FLAG_VLAN_FILTER and asserts that the core lock is held.
  */
 static void
 vtnet_rx_filter_vlan(struct vtnet_softc *sc)
 {
 	int i, bit;
 	uint32_t w;
 	uint16_t tag;
 
 	MPASS(sc->vtnet_flags & VTNET_FLAG_VLAN_FILTER);
 	VTNET_CORE_LOCK_ASSERT(sc);
 
 	/* Enable the filter for each configured VLAN. */
 	for (i = 0; i < VTNET_VLAN_FILTER_NWORDS; i++) {
 		w = sc->vtnet_vlan_filter[i];
 
 		/* Peel off set bits from least to most significant. */
 		while ((bit = ffs(w) - 1) != -1) {
 			/*
 			 * Use an unsigned one: bit may be 31, and shifting a
 			 * signed 1 into the sign bit is undefined behavior.
 			 */
 			w &= ~(1U << bit);
 			/* Word index and bit position give the VLAN ID. */
 			tag = sizeof(w) * CHAR_BIT * i + bit;
 
 			if (vtnet_exec_vlan_filter(sc, 1, tag) != 0) {
 				device_printf(sc->vtnet_dev,
 				    "cannot enable VLAN %d filter\n", tag);
 			}
 		}
 	}
 }
 
 /*
  * Record the addition (add != 0) or removal (add == 0) of VLAN `tag` in the
  * softc's bitmap and, when hardware VLAN filtering is enabled and the
  * interface is running, forward the change to the device.
  *
  * Tags of 0 or above 4095 are ignored.  Takes and releases the core lock.
  */
 static void
 vtnet_update_vlan_filter(struct vtnet_softc *sc, int add, uint16_t tag)
 {
 	if_t ifp;
 	int idx, bit;
 
 	if (tag == 0 || tag > 4095)
 		return;
 
 	ifp = sc->vtnet_ifp;
 	/* 32 tags per bitmap word: upper bits pick the word, low 5 the bit. */
 	idx = (tag >> 5) & 0x7F;
 	bit = tag & 0x1F;
 
 	VTNET_CORE_LOCK(sc);
 
 	/*
 	 * Use an unsigned one: bit may be 31, and shifting a signed 1 into
 	 * the sign bit is undefined behavior.
 	 */
 	if (add)
 		sc->vtnet_vlan_filter[idx] |= (1U << bit);
 	else
 		sc->vtnet_vlan_filter[idx] &= ~(1U << bit);
 
 	if (if_getcapenable(ifp) & IFCAP_VLAN_HWFILTER &&
 	    if_getdrvflags(ifp) & IFF_DRV_RUNNING &&
 	    vtnet_exec_vlan_filter(sc, add, tag) != 0) {
 		device_printf(sc->vtnet_dev,
 		    "cannot %s VLAN %d %s the host filter table\n",
 		    add ? "add" : "remove", tag, add ? "to" : "from");
 	}
 
 	VTNET_CORE_UNLOCK(sc);
 }
 
 /*
  * VLAN registration hook: add `tag` to the filter, but only when ifp's
  * softc is the one passed as arg.
  */
 static void
 vtnet_register_vlan(void *arg, if_t ifp, uint16_t tag)
 {
 	if (if_getsoftc(ifp) == arg)
 		vtnet_update_vlan_filter(arg, 1, tag);
 }
 
 /*
  * VLAN unregistration hook: remove `tag` from the filter, but only when
  * ifp's softc is the one passed as arg.
  */
 static void
 vtnet_unregister_vlan(void *arg, if_t ifp, uint16_t tag)
 {
 	if (if_getsoftc(ifp) == arg)
 		vtnet_update_vlan_filter(arg, 0, tag);
 }
 
 /*
  * Refresh the interface baudrate from the speed field of the device
  * configuration.  Does nothing unless VIRTIO_NET_F_SPEED_DUPLEX was
  * negotiated, and leaves the baudrate alone when the device reports
  * UINT32_MAX.
  */
 static void
 vtnet_update_speed_duplex(struct vtnet_softc *sc)
 {
 	uint32_t mbps;
 
 	if ((sc->vtnet_features & VIRTIO_NET_F_SPEED_DUPLEX) == 0)
 		return;
 
 	/* BMV: Ignore duplex. */
 	mbps = virtio_read_dev_config_4(sc->vtnet_dev,
 	    offsetof(struct virtio_net_config, speed));
 	if (mbps == UINT32_MAX)
 		return;
 
 	if_setbaudrate(sc->vtnet_ifp, IF_Mbps(mbps));
 }
 
 /*
  * Report whether the link is up: 1 if so, 0 otherwise.  Without
  * VIRTIO_NET_F_STATUS the device status field is not consulted and the
  * link is reported as up.
  */
 static int
 vtnet_is_link_up(struct vtnet_softc *sc)
 {
 	uint16_t status;
 
 	if (sc->vtnet_features & VIRTIO_NET_F_STATUS) {
 		status = virtio_read_dev_config_2(sc->vtnet_dev,
 		    offsetof(struct virtio_net_config, status));
 		return ((status & VIRTIO_NET_S_LINK_UP) != 0);
 	}
 
 	return (1);
 }
 
 /*
  * Compare the device's current link state with the cached one and, when
  * they differ, update the cache and notify the network stack.  The speed
  * is refreshed on a down-to-up transition.  Asserts that the core lock is
  * held.
  */
 static void
 vtnet_update_link_status(struct vtnet_softc *sc)
 {
 	if_t ifp = sc->vtnet_ifp;
 	int up;
 
 	VTNET_CORE_LOCK_ASSERT(sc);
 	up = vtnet_is_link_up(sc);
 
 	/* Notify if the link status has changed. */
 	if (up != 0) {
 		if (sc->vtnet_link_active == 0) {
 			vtnet_update_speed_duplex(sc);
 			sc->vtnet_link_active = 1;
 			if_link_state_change(ifp, LINK_STATE_UP);
 		}
 	} else if (sc->vtnet_link_active != 0) {
 		sc->vtnet_link_active = 0;
 		if_link_state_change(ifp, LINK_STATE_DOWN);
 	}
 }
 
 /*
  * Media change handler.  Changing the media is not supported, so every
  * request is rejected with EOPNOTSUPP.
  */
 static int
 vtnet_ifmedia_upd(if_t ifp __unused)
 {
 	return (EOPNOTSUPP);
 }
 
 /*
  * Media status handler: fill in ifmr from the current link state.  An
  * active link is always reported as 10G-T full duplex; an inactive one as
  * IFM_NONE.  The link state is sampled under the core lock.
  */
 static void
 vtnet_ifmedia_sts(if_t ifp, struct ifmediareq *ifmr)
 {
 	struct vtnet_softc *sc = if_getsoftc(ifp);
 
 	ifmr->ifm_status = IFM_AVALID;
 	ifmr->ifm_active = IFM_ETHER;
 
 	VTNET_CORE_LOCK(sc);
 	if (vtnet_is_link_up(sc) == 0) {
 		ifmr->ifm_active |= IFM_NONE;
 	} else {
 		ifmr->ifm_status |= IFM_ACTIVE;
 		ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
 	}
 	VTNET_CORE_UNLOCK(sc);
 }
 
 /*
  * Fill sc->vtnet_hwaddr.  With VTNET_FLAG_MAC set the address is read from
  * the device configuration; otherwise one is generated locally.
  */
 static void
 vtnet_get_macaddr(struct vtnet_softc *sc)
 {
 	if ((sc->vtnet_flags & VTNET_FLAG_MAC) == 0) {
 		/* Generate a random locally administered unicast address. */
 		sc->vtnet_hwaddr[0] = 0xB2;
 		arc4rand(&sc->vtnet_hwaddr[1], ETHER_ADDR_LEN - 1, 0);
 		return;
 	}
 
 	virtio_read_device_config_array(sc->vtnet_dev,
 	    offsetof(struct virtio_net_config, mac),
 	    &sc->vtnet_hwaddr[0], sizeof(uint8_t), ETHER_ADDR_LEN);
 }
 
 /*
  * Push sc->vtnet_hwaddr to the device.  The control-queue command is used
  * when VTNET_FLAG_CTRL_MAC is set; otherwise, on a non-modern device that
  * has VTNET_FLAG_MAC, the address is written byte by byte into the device
  * configuration.  In every other case nothing is done.
  */
 static void
 vtnet_set_macaddr(struct vtnet_softc *sc)
 {
 	device_t dev = sc->vtnet_dev;
 	int i;
 
 	if (sc->vtnet_flags & VTNET_FLAG_CTRL_MAC) {
 		if (vtnet_ctrl_mac_cmd(sc, sc->vtnet_hwaddr) != 0)
 			device_printf(dev, "unable to set MAC address\n");
 		return;
 	}
 
 	/* MAC in config is read-only in modern VirtIO. */
 	if (vtnet_modern(sc) || (sc->vtnet_flags & VTNET_FLAG_MAC) == 0)
 		return;
 
 	for (i = 0; i < ETHER_ADDR_LEN; i++) {
 		virtio_write_dev_config_1(dev,
 		    offsetof(struct virtio_net_config, mac) + i,
 		    sc->vtnet_hwaddr[i]);
 	}
 }
 
 /*
  * Push the MAC address to the device only when VTNET_FLAG_MAC is clear,
  * which is the case where vtnet_get_macaddr() generated the address rather
  * than reading it from the device.
  */
 static void
 vtnet_attached_set_macaddr(struct vtnet_softc *sc)
 {
 	if ((sc->vtnet_flags & VTNET_FLAG_MAC) != 0)
 		return;
 
 	/* Assign MAC address if it was generated. */
 	vtnet_set_macaddr(sc);
 }
 
 /*
  * Move the 802.1Q tag out of the frame at the front of mbuf m and into the
  * packet header: record the tag in m_pkthdr.ether_vtag, set M_VLANTAG, and
  * drop the encapsulation bytes from the data.
  */
 static void
 vtnet_vlan_tag_remove(struct mbuf *m)
 {
 	struct ether_vlan_header *evh = mtod(m, struct ether_vlan_header *);
 	char *frame = (char *)evh;
 
 	/* Capture the tag before the header bytes are moved below. */
 	m->m_pkthdr.ether_vtag = ntohs(evh->evl_tag);
 	m->m_flags |= M_VLANTAG;
 
 	/*
 	 * Strip the 802.1Q header: slide the leading header bytes forward
 	 * by the encapsulation length, then trim that much off the front.
 	 * bcopy() is kept because source and destination may overlap.
 	 */
 	bcopy(frame, frame + ETHER_VLAN_ENCAP_LEN,
 	    ETHER_HDR_LEN - ETHER_TYPE_LEN);
 	m_adj(m, ETHER_VLAN_ENCAP_LEN);
 }
 
 /*
  * Load the "rx_process_limit" tunable into the softc, defaulting to the
  * global vtnet_rx_process_limit.  A negative value is stored as INT_MAX.
  */
 static void
 vtnet_set_rx_process_limit(struct vtnet_softc *sc)
 {
 	int limit = vtnet_tunable_int(sc, "rx_process_limit",
 	    vtnet_rx_process_limit);
 
 	sc->vtnet_rx_process_limit = (limit < 0) ? INT_MAX : limit;
 }
 
 /*
  * Create the "rxq<id>" sysctl node under `child` and attach read-only
  * 64-bit counters for each field of the receive queue's statistics.
  */
 static void
 vtnet_setup_rxq_sysctl(struct sysctl_ctx_list *ctx,
     struct sysctl_oid_list *child, struct vtnet_rxq *rxq)
 {
 	struct sysctl_oid *node;
 	struct sysctl_oid_list *list;
 	struct vtnet_rxq_stats *stats;
 	char namebuf[16];
 
 	/* The node is named after the queue ID, e.g. "rxq0". */
 	snprintf(namebuf, sizeof(namebuf), "rxq%d", rxq->vtnrx_id);
 	node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf,
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Receive Queue");
 	list = SYSCTL_CHILDREN(node);
 
 	stats = &rxq->vtnrx_stats;
 
 	SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "ipackets", CTLFLAG_RD,
 	    &stats->vrxs_ipackets, "Receive packets");
 	SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "ibytes", CTLFLAG_RD,
 	    &stats->vrxs_ibytes, "Receive bytes");
 	SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "iqdrops", CTLFLAG_RD,
 	    &stats->vrxs_iqdrops, "Receive drops");
 	SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "ierrors", CTLFLAG_RD,
 	    &stats->vrxs_ierrors, "Receive errors");
 	SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "csum", CTLFLAG_RD,
 	    &stats->vrxs_csum, "Receive checksum offloaded");
 	SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "csum_failed", CTLFLAG_RD,
 	    &stats->vrxs_csum_failed, "Receive checksum offload failed");
 	SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "host_lro", CTLFLAG_RD,
 	    &stats->vrxs_host_lro, "Receive host segmentation offloaded");
 	SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "rescheduled", CTLFLAG_RD,
 	    &stats->vrxs_rescheduled,
 	    "Receive interrupt handler rescheduled");
 }
 
 /*
  * Create the "txq<id>" sysctl node under `child` and attach read-only
  * 64-bit counters for each field of the transmit queue's statistics.
  */
 static void
 vtnet_setup_txq_sysctl(struct sysctl_ctx_list *ctx,
     struct sysctl_oid_list *child, struct vtnet_txq *txq)
 {
 	struct sysctl_oid *node;
 	struct sysctl_oid_list *list;
 	struct vtnet_txq_stats *stats;
 	char namebuf[16];
 
 	/* The node is named after the queue ID, e.g. "txq0". */
 	snprintf(namebuf, sizeof(namebuf), "txq%d", txq->vtntx_id);
 	node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf,
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Transmit Queue");
 	list = SYSCTL_CHILDREN(node);
 
 	stats = &txq->vtntx_stats;
 
 	SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "opackets", CTLFLAG_RD,
 	    &stats->vtxs_opackets, "Transmit packets");
 	SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "obytes", CTLFLAG_RD,
 	    &stats->vtxs_obytes, "Transmit bytes");
 	SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "omcasts", CTLFLAG_RD,
 	    &stats->vtxs_omcasts, "Transmit multicasts");
 	SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "csum", CTLFLAG_RD,
 	    &stats->vtxs_csum, "Transmit checksum offloaded");
 	SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "tso", CTLFLAG_RD,
 	    &stats->vtxs_tso, "Transmit TCP segmentation offloaded");
 	SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "rescheduled", CTLFLAG_RD,
 	    &stats->vtxs_rescheduled,
 	    "Transmit interrupt handler rescheduled");
 }
 
 /*
  * Register the per-queue sysctl nodes beneath the device's sysctl tree,
  * one receive and one transmit node for each requested virtqueue pair.
  */
 static void
 vtnet_setup_queue_sysctl(struct vtnet_softc *sc)
 {
 	device_t dev = sc->vtnet_dev;
 	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(dev);
 	struct sysctl_oid *tree = device_get_sysctl_tree(dev);
 	struct sysctl_oid_list *parent = SYSCTL_CHILDREN(tree);
 	int pair;
 
 	for (pair = 0; pair < sc->vtnet_req_vq_pairs; pair++) {
 		vtnet_setup_rxq_sysctl(ctx, parent, &sc->vtnet_rxqs[pair]);
 		vtnet_setup_txq_sysctl(ctx, parent, &sc->vtnet_txqs[pair]);
 	}
 }
 
 /*
  * Register the device-wide statistics in sc->vtnet_stats as read-only
  * 64-bit sysctl counters under `child`.
  *
  * Six of the fields (checksum, TSO and reschedule counts) are first seeded
  * from the totals that vtnet_accum_stats() produces over the queues.
  * NOTE(review): this function copies those totals once, at registration
  * time; whether they are refreshed elsewhere is not visible here -- confirm.
  */
 static void
 vtnet_setup_stat_sysctl(struct sysctl_ctx_list *ctx,
     struct sysctl_oid_list *child, struct vtnet_softc *sc)
 {
 	struct vtnet_statistics *stats;
 	struct vtnet_rxq_stats rxaccum;
 	struct vtnet_txq_stats txaccum;
 
 	vtnet_accum_stats(sc, &rxaccum, &txaccum);
 
 	/* Seed the device-wide fields that mirror per-queue totals. */
 	stats = &sc->vtnet_stats;
 	stats->rx_csum_offloaded = rxaccum.vrxs_csum;
 	stats->rx_csum_failed = rxaccum.vrxs_csum_failed;
 	stats->rx_task_rescheduled = rxaccum.vrxs_rescheduled;
 	stats->tx_csum_offloaded = txaccum.vtxs_csum;
 	stats->tx_tso_offloaded = txaccum.vtxs_tso;
 	stats->tx_task_rescheduled = txaccum.vtxs_rescheduled;
 
 	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "mbuf_alloc_failed",
 	    CTLFLAG_RD, &stats->mbuf_alloc_failed,
 	    "Mbuf cluster allocation failures");
 
 	/* Receive-side counters. */
 	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_frame_too_large",
 	    CTLFLAG_RD, &stats->rx_frame_too_large,
 	    "Received frame larger than the mbuf chain");
 	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_enq_replacement_failed",
 	    CTLFLAG_RD, &stats->rx_enq_replacement_failed,
 	    "Enqueuing the replacement receive mbuf failed");
 	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_mergeable_failed",
 	    CTLFLAG_RD, &stats->rx_mergeable_failed,
 	    "Mergeable buffers receive failures");
 	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_bad_ethtype",
 	    CTLFLAG_RD, &stats->rx_csum_bad_ethtype,
 	    "Received checksum offloaded buffer with unsupported "
 	    "Ethernet type");
 	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_bad_ipproto",
 	    CTLFLAG_RD, &stats->rx_csum_bad_ipproto,
 	    "Received checksum offloaded buffer with incorrect IP protocol");
 	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_bad_offset",
 	    CTLFLAG_RD, &stats->rx_csum_bad_offset,
 	    "Received checksum offloaded buffer with incorrect offset");
 	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_bad_proto",
 	    CTLFLAG_RD, &stats->rx_csum_bad_proto,
 	    "Received checksum offloaded buffer with incorrect protocol");
 	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_failed",
 	    CTLFLAG_RD, &stats->rx_csum_failed,
 	    "Received buffer checksum offload failed");
 	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_offloaded",
 	    CTLFLAG_RD, &stats->rx_csum_offloaded,
 	    "Received buffer checksum offload succeeded");
 	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_task_rescheduled",
 	    CTLFLAG_RD, &stats->rx_task_rescheduled,
 	    "Times the receive interrupt task rescheduled itself");
 
 	/* Transmit-side counters. */
 	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_csum_unknown_ethtype",
 	    CTLFLAG_RD, &stats->tx_csum_unknown_ethtype,
 	    "Aborted transmit of checksum offloaded buffer with unknown "
 	    "Ethernet type");
 	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_csum_proto_mismatch",
 	    CTLFLAG_RD, &stats->tx_csum_proto_mismatch,
 	    "Aborted transmit of checksum offloaded buffer because mismatched "
 	    "protocols");
 	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_tso_not_tcp",
 	    CTLFLAG_RD, &stats->tx_tso_not_tcp,
 	    "Aborted transmit of TSO buffer with non TCP protocol");
 	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_tso_without_csum",
 	    CTLFLAG_RD, &stats->tx_tso_without_csum,
 	    "Aborted transmit of TSO buffer without TCP checksum offload");
 	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_defragged",
 	    CTLFLAG_RD, &stats->tx_defragged,
 	    "Transmit mbufs defragged");
 	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_defrag_failed",
 	    CTLFLAG_RD, &stats->tx_defrag_failed,
 	    "Aborted transmit of buffer because defrag failed");
 	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_csum_offloaded",
 	    CTLFLAG_RD, &stats->tx_csum_offloaded,
 	    "Offloaded checksum of transmitted buffer");
 	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_tso_offloaded",
 	    CTLFLAG_RD, &stats->tx_tso_offloaded,
 	    "Segmentation offload of transmitted buffer");
 	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_task_rescheduled",
 	    CTLFLAG_RD, &stats->tx_task_rescheduled,
 	    "Times the transmit interrupt task rescheduled itself");
 }
 
 /*
  * Register the device-level sysctls: the maximum, requested and active
  * virtqueue pair counts, followed by the device-wide statistics.
  */
 static void
 vtnet_setup_sysctl(struct vtnet_softc *sc)
 {
 	device_t dev = sc->vtnet_dev;
 	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(dev);
 	struct sysctl_oid *tree = device_get_sysctl_tree(dev);
 	struct sysctl_oid_list *parent = SYSCTL_CHILDREN(tree);
 
 	SYSCTL_ADD_INT(ctx, parent, OID_AUTO, "max_vq_pairs",
 	    CTLFLAG_RD, &sc->vtnet_max_vq_pairs, 0,
 	    "Number of maximum supported virtqueue pairs");
 	SYSCTL_ADD_INT(ctx, parent, OID_AUTO, "req_vq_pairs",
 	    CTLFLAG_RD, &sc->vtnet_req_vq_pairs, 0,
 	    "Number of requested virtqueue pairs");
 	SYSCTL_ADD_INT(ctx, parent, OID_AUTO, "act_vq_pairs",
 	    CTLFLAG_RD, &sc->vtnet_act_vq_pairs, 0,
 	    "Number of active virtqueue pairs");
 
 	vtnet_setup_stat_sysctl(ctx, parent, sc);
 }
 
 /*
  * Load the LRO tunables into the softc, defaulting to the driver-wide
  * values.  The entry count is raised to TCP_LRO_ENTRIES when the tunable
  * is smaller; the mbuf queue depth is stored as given.
  */
 static void
 vtnet_load_tunables(struct vtnet_softc *sc)
 {
 	sc->vtnet_lro_entry_count = vtnet_tunable_int(sc,
 	    "lro_entry_count", vtnet_lro_entry_count);
 	if (sc->vtnet_lro_entry_count < TCP_LRO_ENTRIES) {
 		/* Never run with fewer entries than the stack's minimum. */
 		sc->vtnet_lro_entry_count = TCP_LRO_ENTRIES;
 	}
 
 	sc->vtnet_lro_mbufq_depth = vtnet_tunable_int(sc,
 	    "lro_mbufq_depth", vtnet_lro_mbufq_depth);
 }
 
 /*
  * Enable interrupts on the receive queue's virtqueue and return the result
  * of virtqueue_enable_intr().  vtnet_enable_rx_interrupts() treats a
  * non-zero return as a request to schedule the queue's interrupt task.
  */
 static int
 vtnet_rxq_enable_intr(struct vtnet_rxq *rxq)
 {
 
 	return (virtqueue_enable_intr(rxq->vtnrx_vq));
 }
 
 /* Disable interrupts on the receive queue's virtqueue. */
 static void
 vtnet_rxq_disable_intr(struct vtnet_rxq *rxq)
 {
 
 	virtqueue_disable_intr(rxq->vtnrx_vq);
 }
 
 /*
  * Arm the transmit interrupt only when vtnet_txq_below_threshold() reports
  * non-zero, in which case the interrupt is postponed with
  * VQ_POSTPONE_LONG and that call's result is returned.  Otherwise returns
  * 0 without touching the virtqueue.
  */
 static int
 vtnet_txq_enable_intr(struct vtnet_txq *txq)
 {
 	/*
 	 * The free count is above our threshold. Keep the Tx interrupt
 	 * disabled until the queue is fuller.
 	 */
 	if (vtnet_txq_below_threshold(txq) == 0)
 		return (0);
 
 	return (virtqueue_postpone_intr(txq->vtntx_vq, VQ_POSTPONE_LONG));
 }
 
 /* Disable interrupts on the transmit queue's virtqueue. */
 static void
 vtnet_txq_disable_intr(struct vtnet_txq *txq)
 {
 
 	virtqueue_disable_intr(txq->vtntx_vq);
 }
 
 /*
  * Enable receive interrupts on every active virtqueue pair.  When enabling
  * a queue returns non-zero, its interrupt task is enqueued on the queue's
  * taskqueue.
  */
 static void
 vtnet_enable_rx_interrupts(struct vtnet_softc *sc)
 {
 	int pair;
 
 	for (pair = 0; pair < sc->vtnet_act_vq_pairs; pair++) {
 		struct vtnet_rxq *rxq = &sc->vtnet_rxqs[pair];
 
 		if (vtnet_rxq_enable_intr(rxq) == 0)
 			continue;
 		taskqueue_enqueue(rxq->vtnrx_tq, &rxq->vtnrx_intrtask);
 	}
 }
 
 /*
  * Run vtnet_txq_enable_intr() on every active virtqueue pair; the return
  * values are ignored.
  */
 static void
 vtnet_enable_tx_interrupts(struct vtnet_softc *sc)
 {
 	int pair;
 
 	for (pair = 0; pair < sc->vtnet_act_vq_pairs; pair++) {
 		vtnet_txq_enable_intr(&sc->vtnet_txqs[pair]);
 	}
 }
 
 /* Enable receive interrupts, then transmit interrupts, on active queues. */
 static void
 vtnet_enable_interrupts(struct vtnet_softc *sc)
 {
 
 	vtnet_enable_rx_interrupts(sc);
 	vtnet_enable_tx_interrupts(sc);
 }
 
 /*
  * Disable receive interrupts on every queue up to vtnet_max_vq_pairs, not
  * only the active ones.
  */
 static void
 vtnet_disable_rx_interrupts(struct vtnet_softc *sc)
 {
 	int pair;
 
 	for (pair = 0; pair < sc->vtnet_max_vq_pairs; pair++) {
 		vtnet_rxq_disable_intr(&sc->vtnet_rxqs[pair]);
 	}
 }
 
 /*
  * Disable transmit interrupts on every queue up to vtnet_max_vq_pairs, not
  * only the active ones.
  */
 static void
 vtnet_disable_tx_interrupts(struct vtnet_softc *sc)
 {
 	int pair;
 
 	for (pair = 0; pair < sc->vtnet_max_vq_pairs; pair++) {
 		vtnet_txq_disable_intr(&sc->vtnet_txqs[pair]);
 	}
 }
 
 /* Disable receive interrupts, then transmit interrupts, on all queues. */
 static void
 vtnet_disable_interrupts(struct vtnet_softc *sc)
 {
 
 	vtnet_disable_rx_interrupts(sc);
 	vtnet_disable_tx_interrupts(sc);
 }
 
 /*
  * Look up the per-device integer tunable "hw.vtnet.<unit>.<knob>".
  * `def` is passed to TUNABLE_INT_FETCH() as the value to update and is
  * returned afterwards, so it is the result when the fetch leaves it
  * untouched.
  */
 static int
 vtnet_tunable_int(struct vtnet_softc *sc, const char *knob, int def)
 {
 	char name[64];
 	int unit = device_get_unit(sc->vtnet_dev);
 
 	snprintf(name, sizeof(name), "hw.vtnet.%d.%s", unit, knob);
 	TUNABLE_INT_FETCH(name, &def);
 
 	return (def);
 }
 
 #ifdef DEBUGNET
 /*
  * debugnet init hook: report, under the core lock, the number of requested
  * virtqueue pairs (*nrxr), DEBUGNET_MAX_IN_FLIGHT (*ncl) and the receive
  * cluster size (*clsize).
  */
 static void
 vtnet_debugnet_init(if_t ifp, int *nrxr, int *ncl, int *clsize)
 {
 	struct vtnet_softc *sc = if_getsoftc(ifp);
 
 	VTNET_CORE_LOCK(sc);
 	*nrxr = sc->vtnet_req_vq_pairs;
 	*ncl = DEBUGNET_MAX_IN_FLIGHT;
 	*clsize = sc->vtnet_rx_clustersz;
 	VTNET_CORE_UNLOCK(sc);
 }
 
 /*
  * debugnet event hook.  On DEBUGNET_START, remember whether software LRO
  * was enabled and turn it off; on DEBUGNET_END, turn it back on if it had
  * been enabled.
  *
  * The remembered state lives in a function-level static, so it is shared
  * by every vtnet instance rather than kept per device.
  */
 static void
 vtnet_debugnet_event(if_t ifp, enum debugnet_ev event)
 {
 	struct vtnet_softc *sc;
 	static bool sw_lro_enabled = false;
 
 	/*
 	 * Disable software LRO, since it would require entering the network
 	 * epoch when calling vtnet_txq_eof() in vtnet_debugnet_poll().
 	 */
 	sc = if_getsoftc(ifp);
 	switch (event) {
 	case DEBUGNET_START:
 		sw_lro_enabled = (sc->vtnet_flags & VTNET_FLAG_SW_LRO) != 0;
 		if (sw_lro_enabled)
 			sc->vtnet_flags &= ~VTNET_FLAG_SW_LRO;
 		break;
 	case DEBUGNET_END:
 		if (sw_lro_enabled)
 			sc->vtnet_flags |= VTNET_FLAG_SW_LRO;
 		break;
 	}
 }
 
 /*
  * debugnet transmit hook: queue mbuf m on transmit queue 0 and notify the
  * device.  Returns EBUSY unless the interface is running with
  * IFF_DRV_OACTIVE clear; otherwise returns the result of the encapsulation.
  */
 static int
 vtnet_debugnet_transmit(if_t ifp, struct mbuf *m)
 {
 	struct vtnet_softc *sc = if_getsoftc(ifp);
 	struct vtnet_txq *txq;
 	int error;
 
 	if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
 	    IFF_DRV_RUNNING)
 		return (EBUSY);
 
 	txq = &sc->vtnet_txqs[0];
 	error = vtnet_txq_encap(txq, &m, M_NOWAIT | M_USE_RESERVE);
 	if (error != 0)
 		return (error);
 
 	(void)vtnet_txq_notify(txq);
 	return (0);
 }
 
 /*
  * debugnet poll hook: process completions on transmit queue 0 and then on
  * every active receive queue.  Returns EBUSY unless the interface is
  * running with IFF_DRV_OACTIVE clear, otherwise 0.  `count` is not used.
  */
 static int
 vtnet_debugnet_poll(if_t ifp, int count)
 {
 	struct vtnet_softc *sc = if_getsoftc(ifp);
 	int pair;
 
 	if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
 	    IFF_DRV_RUNNING)
 		return (EBUSY);
 
 	(void)vtnet_txq_eof(&sc->vtnet_txqs[0]);
 	for (pair = 0; pair < sc->vtnet_act_vq_pairs; pair++) {
 		(void)vtnet_rxq_eof(&sc->vtnet_rxqs[pair]);
 	}
 
 	return (0);
 }
 #endif /* DEBUGNET */
diff --git a/sys/net/if.h b/sys/net/if.h
index 888e7d5d7320..da3d25f2b226 100644
--- a/sys/net/if.h
+++ b/sys/net/if.h
@@ -1,694 +1,694 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)if.h	8.1 (Berkeley) 6/10/93
  * $FreeBSD$
  */
 
 #ifndef _NET_IF_H_
 #define	_NET_IF_H_
 
 #include <sys/cdefs.h>
 
 #if __BSD_VISIBLE
 /*
  * <net/if.h> does not depend on <sys/time.h> on most other systems.  This
  * helps userland compatibility.  (struct timeval ifi_lastchange)
  * The same holds for <sys/socket.h>.  (struct sockaddr ifru_addr)
  */
 #ifndef _KERNEL
 #include <sys/time.h>
 #include <sys/socket.h>
 #endif
 #endif
 
 /*
  * Length of interface external name, including terminating '\0'.
  * Note: this is the same size as a generic device's external name.
  */
 #define		IF_NAMESIZE	16
 #if __BSD_VISIBLE
 #define		IFNAMSIZ	IF_NAMESIZE
 #define		IF_MAXUNIT	0x7fff	/* historical value */
 #endif
 #if __BSD_VISIBLE
 
 /*
  * Structure used to query names of interface cloners.
  *
  * The caller supplies ifcr_buffer and says in ifcr_count how many names
  * it has room for; ifcr_total is filled in on return with the total number
  * of cloners.
  */
 
 struct if_clonereq {
 	int	ifcr_total;		/* total cloners (out) */
 	int	ifcr_count;		/* room for this many in user buffer */
 	char	*ifcr_buffer;		/* buffer for cloner names */
 };
 
 /*
  * Structure describing information about an interface
  * which may be of interest to management entities.
  *
  * The first group of fields describes the interface itself; the second
  * group holds the volatile traffic counters.  The two trailing unions pair
  * a time value with fixed-width placeholder members.
  */
 struct if_data {
 	/* generic interface information */
 	uint8_t	ifi_type;		/* ethernet, tokenring, etc */
 	uint8_t	ifi_physical;		/* e.g., AUI, Thinnet, 10base-T, etc */
 	uint8_t	ifi_addrlen;		/* media address length */
 	uint8_t	ifi_hdrlen;		/* media header length */
 	uint8_t	ifi_link_state;		/* current link state */
 	uint8_t	ifi_vhid;		/* carp vhid */
 	uint16_t	ifi_datalen;	/* length of this data struct */
 	uint32_t	ifi_mtu;	/* maximum transmission unit */
 	uint32_t	ifi_metric;	/* routing metric (external only) */
 	uint64_t	ifi_baudrate;	/* linespeed */
 	/* volatile statistics */
 	uint64_t	ifi_ipackets;	/* packets received on interface */
 	uint64_t	ifi_ierrors;	/* input errors on interface */
 	uint64_t	ifi_opackets;	/* packets sent on interface */
 	uint64_t	ifi_oerrors;	/* output errors on interface */
 	uint64_t	ifi_collisions;	/* collisions on csma interfaces */
 	uint64_t	ifi_ibytes;	/* total number of octets received */
 	uint64_t	ifi_obytes;	/* total number of octets sent */
 	uint64_t	ifi_imcasts;	/* packets received via multicast */
 	uint64_t	ifi_omcasts;	/* packets sent via multicast */
 	uint64_t	ifi_iqdrops;	/* dropped on input */
 	uint64_t	ifi_oqdrops;	/* dropped on output */
 	uint64_t	ifi_noproto;	/* destined for unsupported protocol */
 	uint64_t	ifi_hwassist;	/* HW offload capabilities, see IFCAP */
 
 	/* Unions are here to make sizes MI. */
 	/* Code uses the ifi_epoch / ifi_lastchange names defined below. */
 	union {				/* uptime at attach or stat reset */
 		time_t		tt;
 		uint64_t	ph;	/* placeholder, sets the union size */
 	} __ifi_epoch;
 #define	ifi_epoch	__ifi_epoch.tt
 	union {				/* time of last administrative change */
 		struct timeval	tv;
 		struct {		/* placeholder, sets the union size */
 			uint64_t ph1;
 			uint64_t ph2;
 		} ph;
 	} __ifi_lastchange;
 #define	ifi_lastchange	__ifi_lastchange.tv
 };
 
 /*-
  * Interface flags are of two types: network stack owned flags, and driver
  * owned flags.  Historically, these values were stored in the same ifnet
  * flags field, but with the advent of fine-grained locking, they have been
  * broken out such that the network stack is responsible for synchronizing
  * the stack-owned fields, and the device driver the device-owned fields.
  * Both halves can perform lockless reads of the other half's field, subject
  * to accepting the involved races.
  *
  * Both sets of flags come from the same number space, and should not be
  * permitted to conflict, as they are exposed to user space via a single
  * field.
  *
  * The following symbols identify read and write requirements for fields:
  *
  * (i) if_flags field set by device driver before attach, read-only there
  *     after.
  * (n) if_flags field written only by the network stack, read by either the
  *     stack or driver.
  * (d) if_drv_flags field written only by the device driver, read by either
  *     the stack or driver.
  */
 #define	IFF_UP		0x1		/* (n) interface is up */
 #define	IFF_BROADCAST	0x2		/* (i) broadcast address valid */
 #define	IFF_DEBUG	0x4		/* (n) turn on debugging */
 #define	IFF_LOOPBACK	0x8		/* (i) is a loopback net */
 #define	IFF_POINTOPOINT	0x10		/* (i) is a point-to-point link */
-#define	IFF_KNOWSEPOCH	0x20		/* (i) calls if_input in net epoch */
+#define	IFF_NEEDSEPOCH	0x20		/* (i) calls if_input w/o net epoch */
 #define	IFF_DRV_RUNNING	0x40		/* (d) resources allocated */
 #define	IFF_NOARP	0x80		/* (n) no address resolution protocol */
 #define	IFF_PROMISC	0x100		/* (n) receive all packets */
 #define	IFF_ALLMULTI	0x200		/* (n) receive all multicast packets */
 #define	IFF_DRV_OACTIVE	0x400		/* (d) tx hardware queue is full */
 #define	IFF_SIMPLEX	0x800		/* (i) can't hear own transmissions */
 #define	IFF_LINK0	0x1000		/* per link layer defined bit */
 #define	IFF_LINK1	0x2000		/* per link layer defined bit */
 #define	IFF_LINK2	0x4000		/* per link layer defined bit */
 #define	IFF_ALTPHYS	IFF_LINK2	/* use alternate physical connection */
 #define	IFF_MULTICAST	0x8000		/* (i) supports multicast */
 #define	IFF_CANTCONFIG	0x10000		/* (i) unconfigurable using ioctl(2) */
 #define	IFF_PPROMISC	0x20000		/* (n) user-requested promisc mode */
 #define	IFF_MONITOR	0x40000		/* (n) user-requested monitor mode */
 #define	IFF_STATICARP	0x80000		/* (n) static ARP */
 #define	IFF_STICKYARP	0x100000	/* (n) sticky ARP */
 #define	IFF_DYING	0x200000	/* (n) interface is winding down */
 #define	IFF_RENAMING	0x400000	/* (n) interface is being renamed */
 #define	IFF_NOGROUP	0x800000	/* (n) interface is not part of any groups */
 #define	IFF_NETLINK_1	0x1000000	/* (n) used by netlink */
 
 /*
  * Old names for driver flags so that user space tools can continue to use
  * the old (portable) names.
  */
 #ifndef _KERNEL
 #define	IFF_RUNNING	IFF_DRV_RUNNING
 #define	IFF_OACTIVE	IFF_DRV_OACTIVE
 #endif
 
 /* flags set internally only: */
 #define	IFF_CANTCHANGE \
 	(IFF_BROADCAST|IFF_POINTOPOINT|IFF_DRV_RUNNING|IFF_DRV_OACTIVE|\
 	    IFF_SIMPLEX|IFF_MULTICAST|IFF_ALLMULTI|IFF_PROMISC|\
-	    IFF_DYING|IFF_CANTCONFIG|IFF_KNOWSEPOCH)
+	    IFF_DYING|IFF_CANTCONFIG|IFF_NEEDSEPOCH)
 
 /*
  * Values for if_link_state.
  */
 #define	LINK_STATE_UNKNOWN	0	/* link invalid/unknown */
 #define	LINK_STATE_DOWN		1	/* link is down */
 #define	LINK_STATE_UP		2	/* link is up */
 
 /*
  * Some convenience macros used for setting ifi_baudrate.
  * XXX 1000 vs. 1024? --thorpej@netbsd.org
  *
  * Each macro widens its argument to uintmax_t before scaling, so the
  * multiplication cannot wrap or overflow in a narrower argument type
  * (for example a uint32_t speed in Mbps, or a large int in Gbps).
  */
 #define	IF_Kbps(x)	((uintmax_t)(x) * 1000)	/* kilobits/sec. */
 #define	IF_Mbps(x)	(IF_Kbps((uintmax_t)(x) * 1000))	/* megabits/sec. */
 #define	IF_Gbps(x)	(IF_Mbps((uintmax_t)(x) * 1000))	/* gigabits/sec. */
 
 /*
  * Capabilities that interfaces can advertise.
  *
  * struct ifnet.if_capabilities
  *   contains the optional features & capabilities a particular interface
  *   supports (not only the driver but also the detected hw revision).
  *   Capabilities are defined by IFCAP_* below.
  * struct ifnet.if_capenable
  *   contains the enabled (either by default or through ifconfig) optional
  *   features & capabilities on this interface.
  *   Capabilities are defined by IFCAP_* below.
  * struct if_data.ifi_hwassist in mbuf CSUM_ flag form, controlled by above
  *   contains the enabled optional features & capabilities that can be used
  *   individually per packet and are specified in the mbuf pkthdr.csum_flags
  *   field.  IFCAP_* and CSUM_* do not match one to one and CSUM_* may be
  *   more detailed or differentiated than IFCAP_*.
  *   Hwassist features are defined CSUM_* in sys/mbuf.h
  *
  * Capabilities that cannot be arbitrarily changed with ifconfig/ioctl
  * are listed in IFCAP_CANTCHANGE, similar to IFF_CANTCHANGE.
  * This is not strictly necessary because the common code never
  * changes capabilities, and it is left to the individual driver
  * to do the right thing. However, having the filter here
  * avoids replication of the same code in all individual drivers.
  */
 #define	IFCAP_RXCSUM		0x00001  /* can offload checksum on RX */
 #define	IFCAP_TXCSUM		0x00002  /* can offload checksum on TX */
 #define	IFCAP_NETCONS		0x00004  /* can be a network console */
 #define	IFCAP_VLAN_MTU		0x00008	/* VLAN-compatible MTU */
 #define	IFCAP_VLAN_HWTAGGING	0x00010	/* hardware VLAN tag support */
 #define	IFCAP_JUMBO_MTU		0x00020	/* 9000 byte MTU supported */
 #define	IFCAP_POLLING		0x00040	/* driver supports polling */
 #define	IFCAP_VLAN_HWCSUM	0x00080	/* can do IFCAP_HWCSUM on VLANs */
 #define	IFCAP_TSO4		0x00100	/* can do TCP Segmentation Offload */
 #define	IFCAP_TSO6		0x00200	/* can do TCP6 Segmentation Offload */
 #define	IFCAP_LRO		0x00400	/* can do Large Receive Offload */
 #define	IFCAP_WOL_UCAST		0x00800	/* wake on any unicast frame */
 #define	IFCAP_WOL_MCAST		0x01000	/* wake on any multicast frame */
 #define	IFCAP_WOL_MAGIC		0x02000	/* wake on any Magic Packet */
 #define	IFCAP_TOE4		0x04000	/* interface can offload TCP */
 #define	IFCAP_TOE6		0x08000	/* interface can offload TCP6 */
 #define	IFCAP_VLAN_HWFILTER	0x10000 /* interface hw can filter vlan tag */
 #define	IFCAP_NV		0x20000 /* can do SIOCGIFCAPNV/SIOCSIFCAPNV */
 #define	IFCAP_VLAN_HWTSO	0x40000 /* can do IFCAP_TSO on VLANs */
 #define	IFCAP_LINKSTATE		0x80000 /* the runtime link state is dynamic */
 #define	IFCAP_NETMAP		0x100000 /* netmap mode supported/enabled */
 #define	IFCAP_RXCSUM_IPV6	0x200000  /* can offload checksum on IPv6 RX */
 #define	IFCAP_TXCSUM_IPV6	0x400000  /* can offload checksum on IPv6 TX */
 #define	IFCAP_HWSTATS		0x800000 /* manages counters internally */
 #define	IFCAP_TXRTLMT		0x1000000 /* hardware supports TX rate limiting */
 #define	IFCAP_HWRXTSTMP		0x2000000 /* hardware rx timestamping */
 #define	IFCAP_MEXTPG		0x4000000 /* understands M_EXTPG mbufs */
 #define	IFCAP_TXTLS4		0x8000000 /* can do TLS encryption and segmentation for TCP */
 #define	IFCAP_TXTLS6		0x10000000 /* can do TLS encryption and segmentation for TCP6 */
 #define	IFCAP_VXLAN_HWCSUM	0x20000000 /* can do IFCAP_HWCSUM on VXLANs */
 #define	IFCAP_VXLAN_HWTSO	0x40000000 /* can do IFCAP_TSO on VXLANs */
 #define	IFCAP_TXTLS_RTLMT	0x80000000 /* can do TLS with rate limiting */
 
 /* IFCAP2_* are integers, not bits. */
 #define	IFCAP2_RXTLS4		0
 #define	IFCAP2_RXTLS6		1
 
 #define	IFCAP2_BIT(x)		(1UL << (x))
 
 #define IFCAP_HWCSUM_IPV6	(IFCAP_RXCSUM_IPV6 | IFCAP_TXCSUM_IPV6)
 
 #define IFCAP_HWCSUM	(IFCAP_RXCSUM | IFCAP_TXCSUM)
 #define	IFCAP_TSO	(IFCAP_TSO4 | IFCAP_TSO6)
 #define	IFCAP_WOL	(IFCAP_WOL_UCAST | IFCAP_WOL_MCAST | IFCAP_WOL_MAGIC)
 #define	IFCAP_TOE	(IFCAP_TOE4 | IFCAP_TOE6)
 #define	IFCAP_TXTLS	(IFCAP_TXTLS4 | IFCAP_TXTLS6)
 
 #define	IFCAP_CANTCHANGE	(IFCAP_NETMAP | IFCAP_NV)
 #define	IFCAP_ALLCAPS		0xffffffff
 
 #define	IFCAP_RXCSUM_NAME	"RXCSUM"
 #define	IFCAP_TXCSUM_NAME	"TXCSUM"
 #define	IFCAP_NETCONS_NAME	"NETCONS"
 #define	IFCAP_VLAN_MTU_NAME	"VLAN_MTU"
 #define	IFCAP_VLAN_HWTAGGING_NAME "VLAN_HWTAGGING"
 #define	IFCAP_JUMBO_MTU_NAME	"JUMBO_MTU"
 #define	IFCAP_POLLING_NAME	"POLLING"
 #define	IFCAP_VLAN_HWCSUM_NAME	"VLAN_HWCSUM"
 #define	IFCAP_TSO4_NAME		"TSO4"
 #define	IFCAP_TSO6_NAME		"TSO6"
 #define	IFCAP_LRO_NAME		"LRO"
 #define	IFCAP_WOL_UCAST_NAME	"WOL_UCAST"
 #define	IFCAP_WOL_MCAST_NAME	"WOL_MCAST"
 #define	IFCAP_WOL_MAGIC_NAME	"WOL_MAGIC"
 #define	IFCAP_TOE4_NAME		"TOE4"
 #define	IFCAP_TOE6_NAME		"TOE6"
 #define	IFCAP_VLAN_HWFILTER_NAME "VLAN_HWFILTER"
 #define	IFCAP_VLAN_HWTSO_NAME	"VLAN_HWTSO"
 #define	IFCAP_LINKSTATE_NAME	"LINKSTATE"
 #define	IFCAP_NETMAP_NAME	"NETMAP"
 #define	IFCAP_RXCSUM_IPV6_NAME	"RXCSUM_IPV6"
 #define	IFCAP_TXCSUM_IPV6_NAME	"TXCSUM_IPV6"
 #define	IFCAP_HWSTATS_NAME	"HWSTATS"
 #define	IFCAP_TXRTLMT_NAME	"TXRTLMT"
 #define	IFCAP_HWRXTSTMP_NAME	"HWRXTSTMP"
 #define	IFCAP_MEXTPG_NAME	"MEXTPG"
 #define	IFCAP_TXTLS4_NAME	"TXTLS4"
 #define	IFCAP_TXTLS6_NAME	"TXTLS6"
 #define	IFCAP_VXLAN_HWCSUM_NAME	"VXLAN_HWCSUM"
 #define	IFCAP_VXLAN_HWTSO_NAME	"VXLAN_HWTSO"
 #define	IFCAP_TXTLS_RTLMT_NAME	"TXTLS_RTLMT"
 #define	IFCAP2_RXTLS4_NAME	"RXTLS4"
 #define	IFCAP2_RXTLS6_NAME	"RXTLS6"
 
 #define	IFQ_MAXLEN	50
 #define	IFNET_SLOWHZ	1		/* granularity is 1 second */
 
 /*
  * Message format for use in obtaining information about interfaces
  * from getkerninfo and the routing socket
  * For the new, extensible interface see struct if_msghdrl below.
  */
 struct if_msghdr {
 	u_short	ifm_msglen;	/* to skip over non-understood messages */
 	u_char	ifm_version;	/* future binary compatibility */
 	u_char	ifm_type;	/* message type */
 	int	ifm_addrs;	/* like rtm_addrs */
 	int	ifm_flags;	/* value of if_flags */
 	u_short	ifm_index;	/* index for associated ifp */
 	u_short	_ifm_spare1;	/* spare; if_msghdrl describes its twin as room to grow if_index */
 	struct	if_data ifm_data;/* statistics and other data about if */
 };
 
 /*
  * The 'l' version shall be used by new interfaces, like NET_RT_IFLISTL.  It is
  * extensible after ifm_data_off or within ifm_data.  Both the if_msghdr and
  * if_data now have a member field detailing the struct length in addition to
  * the routing message length.  Macros are provided to find the start of
  * ifm_data and the start of the socket address structures immediately following
  * struct if_msghdrl given a pointer to struct if_msghdrl.
  */
 /*
  * IF_MSGHDRL_IFM_DATA(_l): pointer to the if_data found ifm_data_off bytes
  *	past the start of *_l.
  * IF_MSGHDRL_RTA(_l): pointer to the first byte ifm_len bytes past the start
  *	of *_l.
  *
  * Each expansion is fully parenthesized so that it can be used as an operand
  * of a postfix operator, e.g. IF_MSGHDRL_IFM_DATA(p)->member.  The argument
  * is evaluated twice; do not pass expressions with side effects.
  */
 #define	IF_MSGHDRL_IFM_DATA(_l) \
     ((struct if_data *)((char *)(_l) + (_l)->ifm_data_off))
 #define	IF_MSGHDRL_RTA(_l) \
     ((void *)((uintptr_t)(_l) + (_l)->ifm_len))
 struct if_msghdrl {
 	u_short	ifm_msglen;	/* to skip over non-understood messages */
 	u_char	ifm_version;	/* future binary compatibility */
 	u_char	ifm_type;	/* message type */
 	int	ifm_addrs;	/* like rtm_addrs */
 	int	ifm_flags;	/* value of if_flags */
 	u_short	ifm_index;	/* index for associated ifp */
 	u_short _ifm_spare1;	/* spare space to grow if_index, see if_var.h */
 	/* Offset added by IF_MSGHDRL_RTA() to reach what follows the header. */
 	u_short	ifm_len;	/* length of if_msghdrl incl. if_data */
 	/* Offset added by IF_MSGHDRL_IFM_DATA() to reach ifm_data. */
 	u_short	ifm_data_off;	/* offset of if_data from beginning */
 	int	_ifm_spare2;	/* spare */
 	struct	if_data ifm_data;/* statistics and other data about if */
 };
 
 /*
  * Message format for use in obtaining information about interface addresses
  * from getkerninfo and the routing socket
  * For the new, extensible interface see struct ifa_msghdrl below.
  */
 struct ifa_msghdr {
 	u_short	ifam_msglen;	/* to skip over non-understood messages */
 	u_char	ifam_version;	/* future binary compatibility */
 	u_char	ifam_type;	/* message type */
 	int	ifam_addrs;	/* like rtm_addrs */
 	int	ifam_flags;	/* value of ifa_flags */
 	u_short	ifam_index;	/* index for associated ifp */
 	u_short	_ifam_spare1;	/* spare; ifa_msghdrl describes its twin as room to grow if_index */
 	int	ifam_metric;	/* value of ifa_ifp->if_metric */
 };
 
 /*
  * The 'l' version shall be used by new interfaces, like NET_RT_IFLISTL.  It is
  * extensible after ifam_metric or within ifam_data.  Both the ifa_msghdrl and
  * if_data now have a member field detailing the struct length in addition to
  * the routing message length.  Macros are provided to find the start of
  * ifam_data and the start of the socket address structures immediately following
  * struct ifa_msghdrl given a pointer to struct ifa_msghdrl.
  */
 /*
  * IFA_MSGHDRL_IFAM_DATA(_l): pointer to the if_data found ifam_data_off bytes
  *	past the start of *_l.
  * IFA_MSGHDRL_RTA(_l): pointer to the first byte ifam_len bytes past the
  *	start of *_l.
  *
  * Each expansion is fully parenthesized so that it can be used as an operand
  * of a postfix operator, e.g. IFA_MSGHDRL_IFAM_DATA(p)->member.  The argument
  * is evaluated twice; do not pass expressions with side effects.
  */
 #define	IFA_MSGHDRL_IFAM_DATA(_l) \
     ((struct if_data *)((char *)(_l) + (_l)->ifam_data_off))
 #define	IFA_MSGHDRL_RTA(_l) \
     ((void *)((uintptr_t)(_l) + (_l)->ifam_len))
 struct ifa_msghdrl {
 	u_short	ifam_msglen;	/* to skip over non-understood messages */
 	u_char	ifam_version;	/* future binary compatibility */
 	u_char	ifam_type;	/* message type */
 	int	ifam_addrs;	/* like rtm_addrs */
 	int	ifam_flags;	/* value of ifa_flags */
 	u_short	ifam_index;	/* index for associated ifp */
 	u_short _ifam_spare1;	/* spare space to grow if_index, see if_var.h */
 	/* Offset added by IFA_MSGHDRL_RTA() to reach what follows the header. */
 	u_short	ifam_len;	/* length of ifa_msghdrl incl. if_data */
 	/* Offset added by IFA_MSGHDRL_IFAM_DATA() to reach ifam_data. */
 	u_short	ifam_data_off;	/* offset of if_data from beginning */
 	int	ifam_metric;	/* value of ifa_ifp->if_metric */
 	struct	if_data ifam_data;/* statistics and other data about if or
 				 * address */
 };
 
 /*
  * Message format for use in obtaining information about multicast addresses
  * from the routing socket
  */
 struct ifma_msghdr {
 	u_short	ifmam_msglen;	/* to skip over non-understood messages */
 	u_char	ifmam_version;	/* future binary compatibility */
 	u_char	ifmam_type;	/* message type */
 	int	ifmam_addrs;	/* like rtm_addrs */
 	int	ifmam_flags;	/* value of ifa_flags */
 	u_short	ifmam_index;	/* index for associated ifp */
 	u_short	_ifmam_spare1;	/* spare; no use is described in this header */
 };
 
 /*
  * Message format announcing the arrival or departure of a network interface.
  */
 struct if_announcemsghdr {
 	u_short	ifan_msglen;	/* to skip over non-understood messages */
 	u_char	ifan_version;	/* future binary compatibility */
 	u_char	ifan_type;	/* message type */
 	u_short	ifan_index;	/* index for associated ifp */
 	char	ifan_name[IFNAMSIZ]; /* if name, e.g. "en0" */
 	/* Presumably one of the IFAN_* values defined right after this struct. */
 	u_short	ifan_what;	/* what type of announcement */
 };
 
 #define	IFAN_ARRIVAL	0	/* interface arrival */
 #define	IFAN_DEPARTURE	1	/* interface departure */
 
 /*
  * Buffer with length to be used in SIOCGIFDESCR/SIOCSIFDESCR requests
  */
 struct ifreq_buffer {
 	size_t	length;		/* length going with buffer (presumably bytes -- confirm) */
 	void	*buffer;	/* the buffer itself */
 };
 
 /*
  * Descriptor for an nv-based capability request.  It is embedded in
  * struct ifreq as ifru_nv and reached through the ifr_cap_nv accessor.
  */
 struct ifreq_nv_req {
 	u_int	buf_length;	/* Total size of buffer,
 				   u_int for ABI struct ifreq */
 	u_int	length;		/* Length of the filled part */
 	void	*buffer;	/* Buffer itself, containing packed nv */
 };
 
 #define	IFR_CAP_NV_MAXBUFSIZE	(2 * 1024 * 1024)
 
 /*
  * Interface request structure used for socket
  * ioctl's.  All interface ioctl's must have parameter
  * definitions which begin with ifr_name.  The
  * remainder may be interface specific.
  */
 struct ifreq {
 	char	ifr_name[IFNAMSIZ];		/* if name, e.g. "en0" */
 	/*
 	 * Request payload.  All members overlay the same storage; the ifr_*
 	 * accessor macros below select one of them.
 	 */
 	union {
 		struct	sockaddr ifru_addr;
 		struct	sockaddr ifru_dstaddr;
 		struct	sockaddr ifru_broadaddr;
 		struct	ifreq_buffer ifru_buffer;
 		short	ifru_flags[2];	/* [0] low 16 bits, [1] high 16 bits */
 		short	ifru_index;
 		int	ifru_jid;
 		int	ifru_metric;
 		int	ifru_mtu;
 		int	ifru_phys;
 		int	ifru_media;
 		caddr_t	ifru_data;
 		int	ifru_cap[2];	/* [0] requested, [1] current */
 		u_int	ifru_fib;
 		u_char	ifru_vlan_pcp;
 		struct	ifreq_nv_req ifru_nv;
 	} ifr_ifru;
 #define	ifr_addr	ifr_ifru.ifru_addr	/* address */
 #define	ifr_dstaddr	ifr_ifru.ifru_dstaddr	/* other end of p-to-p link */
 #define	ifr_broadaddr	ifr_ifru.ifru_broadaddr	/* broadcast address */
 /* ifr_buffer is only defined outside the kernel. */
 #ifndef _KERNEL
 #define	ifr_buffer	ifr_ifru.ifru_buffer	/* user supplied buffer with its length */
 #endif
 #define	ifr_flags	ifr_ifru.ifru_flags[0]	/* flags (low 16 bits) */
 #define	ifr_flagshigh	ifr_ifru.ifru_flags[1]	/* flags (high 16 bits) */
 #define	ifr_jid		ifr_ifru.ifru_jid	/* jail/vnet */
 #define	ifr_metric	ifr_ifru.ifru_metric	/* metric */
 #define	ifr_mtu		ifr_ifru.ifru_mtu	/* mtu */
 #define ifr_phys	ifr_ifru.ifru_phys	/* physical wire */
 #define ifr_media	ifr_ifru.ifru_media	/* physical media */
 /* ifr_data is only defined outside the kernel. */
 #ifndef _KERNEL
 #define	ifr_data	ifr_ifru.ifru_data	/* for use by interface */
 #endif
 #define	ifr_reqcap	ifr_ifru.ifru_cap[0]	/* requested capabilities */
 #define	ifr_curcap	ifr_ifru.ifru_cap[1]	/* current capabilities */
 #define	ifr_index	ifr_ifru.ifru_index	/* interface index */
 #define	ifr_fib		ifr_ifru.ifru_fib	/* interface fib */
 #define	ifr_vlan_pcp	ifr_ifru.ifru_vlan_pcp	/* VLAN priority */
 /* ifr_lan_pcp names the same member as ifr_vlan_pcp. */
 #define	ifr_lan_pcp	ifr_ifru.ifru_vlan_pcp	/* VLAN priority */
 #define	ifr_cap_nv	ifr_ifru.ifru_nv	/* nv-based cap interface */
 };
 
 /*
  * Size of an ifreq whose ifr_addr may be longer than struct sockaddr:
  * sizeof(struct ifreq) when sa_len fits in a struct sockaddr, otherwise
  * sizeof(struct ifreq) with the sockaddr portion replaced by sa_len bytes.
  * The argument is a struct ifreq lvalue (not a pointer) and is evaluated
  * more than once.
  */
 #define	_SIZEOF_ADDR_IFREQ(ifr) \
 	((ifr).ifr_addr.sa_len > sizeof(struct sockaddr) ? \
 	 (sizeof(struct ifreq) - sizeof(struct sockaddr) + \
 	  (ifr).ifr_addr.sa_len) : sizeof(struct ifreq))
 
 /*
  * Alias request: an interface name followed by three socket addresses.
  * struct oifaliasreq below has the same members minus ifra_vhid.
  */
 struct ifaliasreq {
 	char	ifra_name[IFNAMSIZ];		/* if name, e.g. "en0" */
 	struct	sockaddr ifra_addr;
 	struct	sockaddr ifra_broadaddr;
 	struct	sockaddr ifra_mask;
 	int	ifra_vhid;	/* NOTE(review): presumably a virtual host id; confirm */
 };
 
 /* 9.x compat */
 /* Same members as struct ifaliasreq, without the trailing ifra_vhid. */
 struct oifaliasreq {
 	char	ifra_name[IFNAMSIZ];
 	struct	sockaddr ifra_addr;
 	struct	sockaddr ifra_broadaddr;
 	struct	sockaddr ifra_mask;
 };
 
 /*
  * Media request for the interface named by ifm_name.  ifm_ulist points to
  * an array of media words whose entry count is carried in ifm_count.
  */
 struct ifmediareq {
 	char	ifm_name[IFNAMSIZ];	/* if name, e.g. "en0" */
 	int	ifm_current;		/* current media options */
 	int	ifm_mask;		/* don't care mask */
 	int	ifm_status;		/* media status */
 	int	ifm_active;		/* active options */
 	int	ifm_count;		/* # entries in ifm_ulist array */
 	int	*ifm_ulist;		/* media words */
 };
 
 /*
  * NOTE(review): looks like a driver-specific request (a command code plus
  * an opaque data pointer and its length); nothing in this part of the
  * header states the exact semantics -- confirm against the users.
  */
 struct ifdrv {
 	char		ifd_name[IFNAMSIZ];	/* if name, e.g. "en0" */
 	unsigned long	ifd_cmd;
 	size_t		ifd_len;
 	void		*ifd_data;
 };
 
 /* 
  * Structure used to retrieve aux status data from interfaces.
  * Kernel suppliers to this interface should respect the formatting
  * needed by ifconfig(8): each line starts with a TAB and ends with
  * a newline.  The canonical example to copy and paste is in if_tun.c.
  */
 
 #define	IFSTATMAX	800		/* 10 lines of text */
 struct ifstat {
 	char	ifs_name[IFNAMSIZ];	/* if name, e.g. "en0" */
 	/* IFSTATMAX characters plus one extra byte (presumably the NUL). */
 	char	ascii[IFSTATMAX + 1];
 };
 
 /*
  * Structure used in SIOCGIFCONF request.
  * Used to retrieve interface configuration
  * for machine (useful for programs which
  * must know all networks accessible).
  */
 struct ifconf {
 	int	ifc_len;		/* size of associated buffer */
 	/* Two typed views of the same buffer pointer. */
 	union {
 		caddr_t	ifcu_buf;
 		struct	ifreq *ifcu_req;
 	} ifc_ifcu;
 #define	ifc_buf	ifc_ifcu.ifcu_buf	/* buffer address */
 #define	ifc_req	ifc_ifcu.ifcu_req	/* array of structures returned */
 };
 
 /*
  * interface groups
  */
 
 #define	IFG_ALL		"all"		/* group contains all interfaces */
 /* XXX: will we implement this? */
 #define	IFG_EGRESS	"egress"	/* if(s) default route(s) point to */
 
 /*
  * One name-sized record.  The group and member names share the same
  * IFNAMSIZ bytes; which interpretation applies presumably depends on the
  * request being made -- confirm against the ioctl handlers.
  */
 struct ifg_req {
 	union {
 		char			 ifgrqu_group[IFNAMSIZ];
 		char			 ifgrqu_member[IFNAMSIZ];
 	} ifgrq_ifgrqu;
 #define	ifgrq_group	ifgrq_ifgrqu.ifgrqu_group
 #define	ifgrq_member	ifgrq_ifgrqu.ifgrqu_member
 };
 
 /*
  * Used to lookup groups for an interface
  */
 struct ifgroupreq {
 	char	ifgr_name[IFNAMSIZ];
 	u_int	ifgr_len;	/* presumably the size of the ifgr_groups area; confirm */
 	/* Either an inline group name or a pointer to ifg_req records. */
 	union {
 		char	ifgru_group[IFNAMSIZ];
 		struct	ifg_req *ifgru_groups;
 	} ifgr_ifgru;
 #define ifgr_group	ifgr_ifgru.ifgru_group
 #define ifgr_groups	ifgr_ifgru.ifgru_groups
 };
 
 /*
  * Structure used to request i2c data
  * from interface transceivers.
  */
 struct ifi2creq {
 	uint8_t dev_addr;	/* i2c address (0xA0, 0xA2) */
 	uint8_t offset;		/* read offset */
 	/* data[] holds 8 bytes, so len is presumably limited to 8 -- confirm. */
 	uint8_t len;		/* read length */
 	uint8_t spare0;
 	uint32_t spare1;
 	uint8_t data[8];	/* read buffer */
 }; 
 
 /*
  * RSS hash.
  */
 
 #define	RSS_FUNC_NONE		0		/* RSS disabled */
 #define	RSS_FUNC_PRIVATE	1		/* non-standard */
 #define	RSS_FUNC_TOEPLITZ	2
 
 #define	RSS_TYPE_IPV4		0x00000001
 #define	RSS_TYPE_TCP_IPV4	0x00000002
 #define	RSS_TYPE_IPV6		0x00000004
 #define	RSS_TYPE_IPV6_EX	0x00000008
 #define	RSS_TYPE_TCP_IPV6	0x00000010
 #define	RSS_TYPE_TCP_IPV6_EX	0x00000020
 #define	RSS_TYPE_UDP_IPV4	0x00000040
 #define	RSS_TYPE_UDP_IPV6	0x00000080
 #define	RSS_TYPE_UDP_IPV6_EX	0x00000100
 
 #define	RSS_KEYLEN		128
 
 struct ifrsskey {
 	char		ifrk_name[IFNAMSIZ];	/* if name, e.g. "en0" */
 	uint8_t		ifrk_func;		/* RSS_FUNC_ */
 	uint8_t		ifrk_spare0;
 	/* Presumably the number of valid bytes in ifrk_key -- confirm. */
 	uint16_t	ifrk_keylen;
 	uint8_t		ifrk_key[RSS_KEYLEN];	/* room for RSS_KEYLEN key bytes */
 };
 
 struct ifrsshash {
 	char		ifrh_name[IFNAMSIZ];	/* if name, e.g. "en0" */
 	uint8_t		ifrh_func;		/* RSS_FUNC_ */
 	uint8_t		ifrh_spare0;		/* spare */
 	uint16_t	ifrh_spare1;		/* spare */
 	uint32_t	ifrh_types;		/* RSS_TYPE_ */
 };
 
 #define	IFNET_PCP_NONE	0xff	/* PCP disabled */
 
 #define	IFDR_MSG_SIZE		64
 #define	IFDR_REASON_MSG		1
 #define	IFDR_REASON_VENDOR	2
 /*
  * NOTE(review): ifdr_reason presumably takes one of the IFDR_REASON_*
  * values defined just above, selecting between the message text and the
  * vendor code -- confirm against the users of this structure.
  */
 struct ifdownreason {
 	char		ifdr_name[IFNAMSIZ];
 	uint32_t	ifdr_reason;
 	uint32_t	ifdr_vendor;
 	char		ifdr_msg[IFDR_MSG_SIZE];	/* IFDR_MSG_SIZE bytes of text */
 };
 
 #endif /* __BSD_VISIBLE */
 
 /*
  * Opaque interface structure.
  */
 
 typedef struct ifnet * if_t;
 
 #ifdef _KERNEL
 #ifdef MALLOC_DECLARE
 MALLOC_DECLARE(M_IFADDR);
 MALLOC_DECLARE(M_IFMADDR);
 #endif
 
 extern struct sx ifnet_detach_sxlock;
 
 struct nvlist;
 struct ifcap_nv_bit_name;
 int if_capnv_to_capint(const struct nvlist *nv, int *old_cap,
     const struct ifcap_nv_bit_name *nn, bool all);
 void if_capint_to_capnv(struct nvlist *nv,
     const struct ifcap_nv_bit_name *nn, int ifr_cap, int ifr_req);
 /*
  * NOTE(review): judging by the name this is what a driver receives for
  * SIOCSIFCAPNV; reqcap/reqcap2 presumably carry the requested IFCAP_* and
  * IFCAP2_* sets and nvcap the nvlist form -- confirm against the callers.
  */
 struct siocsifcapnv_driver_data {
 	int reqcap;
 	int reqcap2;
 	struct nvlist *nvcap;
 };
 #endif
 
 #ifndef _KERNEL
 /*
  * Index/name pair.  The userland prototypes below show if_nameindex()
  * returning a pointer to this type and if_freenameindex() accepting one.
  */
 struct if_nameindex {
 	unsigned int	if_index;	/* 1, 2, ... */
 	char		*if_name;	/* null terminated name: "le0", ... */
 };
 
 __BEGIN_DECLS
 void			 if_freenameindex(struct if_nameindex *);
 char			*if_indextoname(unsigned int, char *);
 struct if_nameindex	*if_nameindex(void);
 unsigned int		 if_nametoindex(const char *);
 __END_DECLS
 #endif
 #endif /* !_NET_IF_H_ */
diff --git a/sys/net/if_epair.c b/sys/net/if_epair.c
index e9e1a48b3d58..2afbf786c9c8 100644
--- a/sys/net/if_epair.c
+++ b/sys/net/if_epair.c
@@ -1,934 +1,933 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2008 The FreeBSD Foundation
  * Copyright (c) 2009-2021 Bjoern A. Zeeb <bz@FreeBSD.org>
  *
  * This software was developed by CK Software GmbH under sponsorship
  * from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  * notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  * notice, this list of conditions and the following disclaimer in the
  * documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * A pair of virtual back-to-back connected ethernet like interfaces
  * (``two interfaces with a virtual cross-over cable'').
  *
  * This is mostly intended to be used to provide connectivity between
  * different virtual network stack instances.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_rss.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/hash.h>
 #include <sys/interrupt.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/libkern.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/proc.h>
 #include <sys/queue.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/taskqueue.h>
 
 #include <net/bpf.h>
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_clone.h>
 #include <net/if_media.h>
 #include <net/if_var.h>
 #include <net/if_private.h>
 #include <net/if_types.h>
 #include <net/netisr.h>
 #ifdef RSS
 #include <net/rss_config.h>
 #ifdef INET
 #include <netinet/in_rss.h>
 #endif
 #ifdef INET6
 #include <netinet6/in6_rss.h>
 #endif
 #endif
 #include <net/vnet.h>
 
 static const char epairname[] = "epair";
 #define	RXRSIZE	4096	/* Probably overkill by 4-8x. */
 
 static MALLOC_DEFINE(M_EPAIR, epairname,
     "Pair of virtual cross-over connected Ethernet-like interfaces");
 
 VNET_DEFINE_STATIC(struct if_clone *, epair_cloner);
 #define	V_epair_cloner	VNET(epair_cloner)
 
 /*
  * Counter mixed into generated MAC addresses by epair_generate_mac(), which
  * brackets its accesses with EPAIR_LOCK()/EPAIR_UNLOCK().
  */
 static unsigned int next_index = 0;
 /* Thin wrappers around the epair_n_index_mtx mutex. */
 #define	EPAIR_LOCK_INIT()		mtx_init(&epair_n_index_mtx, "epairidx", \
 					    NULL, MTX_DEF)
 #define	EPAIR_LOCK_DESTROY()		mtx_destroy(&epair_n_index_mtx)
 #define	EPAIR_LOCK()			mtx_lock(&epair_n_index_mtx)
 #define	EPAIR_UNLOCK()			mtx_unlock(&epair_n_index_mtx)
 
 struct epair_softc;
 /*
  * One receive queue of an epair interface.  Packets are appended by
  * epair_menq() and handed to the stack by epair_tx_start_deferred().
  */
 struct epair_queue {
 	struct mtx		 mtx;		/* held around accesses to q and state */
 	struct mbufq		 q;		/* packets waiting for delivery */
 	int			 id;		/* index into epair_tasks.tq[] */
 	enum {
 		EPAIR_QUEUE_IDLE,		/* no task scheduled */
 		EPAIR_QUEUE_WAKING,		/* task enqueued, not yet flushing */
 		EPAIR_QUEUE_RUNNING,		/* task has flushed q and is delivering */
 	}			 state;
 	struct task		 tx_task;	/* runs epair_tx_start_deferred() */
 	struct epair_softc	*sc;		/* owning softc */
 };
 
 static struct mtx epair_n_index_mtx;
 /* Per-interface state; each half of a pair has its own softc. */
 struct epair_softc {
 	struct ifnet		*ifp;		/* This ifp. */
 	struct ifnet		*oifp;		/* other ifp of pair. */
 	int			 num_queues;	/* entries in queues[] */
 	struct epair_queue	*queues;	/* array allocated in epair_alloc_sc() */
 	struct ifmedia		 media;		/* Media config (fake). */
 	STAILQ_ENTRY(epair_softc) entry;
 };
 
 /* Taskqueues shared by all epair interfaces. */
 struct epair_tasks_t {
 	int			 tasks;		/* copied into each softc's num_queues */
 	struct taskqueue	 *tq[MAXCPU];	/* indexed by epair_queue id */
 };
 
 static struct epair_tasks_t epair_tasks;
 
 /*
  * Strip per-packet state that must not travel to the receiving side:
  * a send tag reference, the VLAN tag and all non-persistent mbuf tags.
  */
 static void
 epair_clear_mbuf(struct mbuf *m)
 {
 	M_ASSERTPKTHDR(m);
 
 	/* Drop the send tag, if any; ether_input cannot cope with it. */
 	if ((m->m_pkthdr.csum_flags & CSUM_SND_TAG) != 0) {
 		m_snd_tag_rele(m->m_pkthdr.snd_tag);
 		m->m_pkthdr.csum_flags &= ~CSUM_SND_TAG;
 		m->m_pkthdr.snd_tag = NULL;
 	}
 
 	/* Forget the VLAN tag. */
 	m->m_pkthdr.ether_vtag = 0;
 	m->m_flags &= ~M_VLANTAG;
 
 	m_tag_delete_nonpersistent(m);
 }
 
 /*
  * Task handler for one epair_queue (passed as 'arg').  Takes everything
  * currently queued, feeds it to if_input() on the queue's interface and
  * re-arms itself if more packets arrived in the meantime.
  */
 static void
 epair_tx_start_deferred(void *arg, int pending)
 {
 	struct epair_queue *queue = arg;
 	struct mbuf *pkt, *next;
 	if_t ifp = queue->sc->ifp;
 	bool more;
 
 	if_ref(ifp);
 	CURVNET_SET(ifp->if_vnet);
 
 	/* Grab the whole backlog in one go. */
 	mtx_lock(&queue->mtx);
 	pkt = mbufq_flush(&queue->q);
 	queue->state = EPAIR_QUEUE_RUNNING;
 	mtx_unlock(&queue->mtx);
 
 	for (; pkt != NULL; pkt = next) {
 		next = STAILQ_NEXT(pkt, m_stailqpkt);
 		pkt->m_nextpkt = NULL;
 		if_input(ifp, pkt);
 	}
 
 	/*
 	 * Only one flush per task run: looping here could starve us in a
 	 * multi-epair routing setup.  Reschedule instead.
 	 */
 	mtx_lock(&queue->mtx);
 	more = (mbufq_len(&queue->q) > 0);
 	queue->state = more ? EPAIR_QUEUE_WAKING : EPAIR_QUEUE_IDLE;
 	mtx_unlock(&queue->mtx);
 
 	if (more)
 		taskqueue_enqueue(epair_tasks.tq[queue->id], &queue->tx_task);
 
 	CURVNET_RESTORE();
 	if_rele(ifp);
 }
 
 /*
  * Choose the queue of 'sc' that packet 'm' is placed on.
  *
  * Without RSS every packet goes to queue 0.  With RSS the bucket reported
  * by rss_m2bucket() is used; when that call returns non-zero a bucket is
  * computed in software based on the Ethernet type (IPv4/IPv6 when compiled
  * in), with bucket 0 for every other type.  The bucket is then reduced
  * modulo sc->num_queues.
  */
 static struct epair_queue *
 epair_select_queue(struct epair_softc *sc, struct mbuf *m)
 {
 	uint32_t bucket;
 #ifdef RSS
 	struct ether_header *eh;
 	int ret;
 
 	ret = rss_m2bucket(m, &bucket);
 	if (ret) {
 		/* Actually hash the packet. */
 		eh = mtod(m, struct ether_header *);
 
 		switch (ntohs(eh->ether_type)) {
 #ifdef INET
 		case ETHERTYPE_IP:
 			rss_soft_m2cpuid_v4(m, 0, &bucket);
 			break;
 #endif
 #ifdef INET6
 		case ETHERTYPE_IPV6:
 			rss_soft_m2cpuid_v6(m, 0, &bucket);
 			break;
 #endif
 		default:
 			bucket = 0;
 			break;
 		}
 	}
 	/* Keep the index inside queues[]. */
 	bucket %= sc->num_queues;
 #else
 	bucket = 0;
 #endif
 	return (&sc->queues[bucket]);
 }
 
 /*
  * Make 'm' look like a packet received on 'src_ifp': scrub sender-side
  * state via epair_clear_mbuf(), then set the receive interface and the FIB
  * from src_ifp.
  */
 static void
 epair_prepare_mbuf(struct mbuf *m, struct ifnet *src_ifp)
 {
 	M_ASSERTPKTHDR(m);
 	epair_clear_mbuf(m);
 	if_setrcvif(m, src_ifp);
 	M_SETFIB(m, src_ifp->if_fib);
 
 	/* The packet must be a single one and carry no send tag any more. */
 	MPASS(m->m_nextpkt == NULL);
 	MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
 }
 
 /*
  * Queue 'm' on the receiving side.  'osc' is the softc of the receiving
  * interface; the sending interface is reached through osc->oifp.  Output
  * counters are charged to the sender, the input packet count to the
  * receiver; a full queue frees the packet and counts an output queue drop.
  */
 static void
 epair_menq(struct mbuf *m, struct epair_softc *osc)
 {
 	struct ifnet *rcv_ifp = osc->ifp;	/* receiving side */
 	struct ifnet *snd_ifp = osc->oifp;	/* sending side */
 	struct epair_queue *rxq;
 	bool is_mcast;
 	int pktlen, rc;
 
 	epair_prepare_mbuf(m, rcv_ifp);
 
 	/* Sample these now; after a successful enqueue the mbuf is gone. */
 	pktlen = m->m_pkthdr.len;
 	is_mcast = (m->m_flags & (M_BCAST | M_MCAST)) != 0;
 
 	rxq = epair_select_queue(osc, m);
 
 	mtx_lock(&rxq->mtx);
 	if (rxq->state == EPAIR_QUEUE_IDLE) {
 		/* Nobody is scheduled to service this queue yet. */
 		rxq->state = EPAIR_QUEUE_WAKING;
 		taskqueue_enqueue(epair_tasks.tq[rxq->id], &rxq->tx_task);
 	}
 	rc = mbufq_enqueue(&rxq->q, m);
 	mtx_unlock(&rxq->mtx);
 
 	if (rc == 0) {
 		if_inc_counter(snd_ifp, IFCOUNTER_OPACKETS, 1);
 		if_inc_counter(snd_ifp, IFCOUNTER_OBYTES, pktlen);
 		if (is_mcast)
 			if_inc_counter(snd_ifp, IFCOUNTER_OMCASTS, 1);
 		if_inc_counter(rcv_ifp, IFCOUNTER_IPACKETS, 1);
 		return;
 	}
 
 	/* Enqueue failed: the packet is still ours to free. */
 	m_freem(m);
 	if_inc_counter(snd_ifp, IFCOUNTER_OQDROPS, 1);
 }
 
 /*
  * if_start handler: drain ifp's send queue.  Each packet is tapped for BPF
  * and then either queued on the peer interface via epair_menq() or, if one
  * of the two interfaces is not up and running, silently freed.
  */
 static void
 epair_start(struct ifnet *ifp)
 {
 	struct epair_softc *self, *peer;
 	struct ifnet *oifp;
 	struct mbuf *m;
 
 	self = ifp->if_softc;
 	oifp = self->oifp;
 	peer = oifp->if_softc;
 
 	IFQ_DEQUEUE(&ifp->if_snd, m);
 	while (m != NULL) {
 		M_ASSERTPKTHDR(m);
 		BPF_MTAP(ifp, m);
 
 		/* Both ends have to be usable, otherwise drop the packet. */
 		if ((ifp->if_drv_flags & IFF_DRV_RUNNING) != 0 &&
 		    (ifp->if_flags & IFF_UP) != 0 &&
 		    (oifp->if_drv_flags & IFF_DRV_RUNNING) != 0 &&
 		    (oifp->if_flags & IFF_UP) != 0)
 			epair_menq(m, peer);
 		else
 			m_freem(m);
 
 		IFQ_DEQUEUE(&ifp->if_snd, m);
 	}
 }
 
 /*
  * if_transmit handler.
  *
  * Returns 0 for a NULL mbuf.  If this interface is not running or not up
  * the packet is freed, an output error is counted and ENXIO or ENETDOWN is
  * returned.  If the peer is not usable the packet is freed and an output
  * error is counted, but 0 is returned.  Otherwise the packet is handed to
  * the peer's receive queue, either directly through epair_menq() or, when
  * ALTQ is enabled on the send queue, through ALTQ and epair_start().
  *
  * Output accounting (OPACKETS/OBYTES/OMCASTS) is done in epair_menq() only.
  * The ALTQ path ends up there as well via epair_start(), so it must not
  * count bytes and multicasts itself or they would be counted twice.
  */
 static int
 epair_transmit(struct ifnet *ifp, struct mbuf *m)
 {
 	struct epair_softc *sc;
 	struct ifnet *oifp;
 #ifdef ALTQ
 	int error;
 #endif
 
 	if (m == NULL)
 		return (0);
 	M_ASSERTPKTHDR(m);
 
 	/*
 	 * We are not going to use the interface en/dequeue mechanism
 	 * on the TX side. We are called from ether_output_frame()
 	 * and will put the packet into the receive-queue (rxq) of the
 	 * other interface (oifp) of our pair.
 	 */
 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
 		m_freem(m);
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		return (ENXIO);
 	}
 	if ((ifp->if_flags & IFF_UP) == 0) {
 		m_freem(m);
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		return (ENETDOWN);
 	}
 
 	BPF_MTAP(ifp, m);
 
 	/*
 	 * In case the outgoing interface is not usable,
 	 * drop the packet.
 	 */
 	sc = ifp->if_softc;
 	oifp = sc->oifp;
 	if ((oifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
 	    (oifp->if_flags & IFF_UP) == 0) {
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		m_freem(m);
 		return (0);
 	}
 
 #ifdef ALTQ
 	/* Support ALTQ via the classic if_start() path. */
 	IF_LOCK(&ifp->if_snd);
 	if (ALTQ_IS_ENABLED(&ifp->if_snd)) {
 		error = 0;
 		ALTQ_ENQUEUE(&ifp->if_snd, m, NULL, error);
 		if (error)
 			if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
 		IF_UNLOCK(&ifp->if_snd);
 		/* epair_start() -> epair_menq() updates the output counters. */
 		if (!error)
 			epair_start(ifp);
 		return (error);
 	}
 	IF_UNLOCK(&ifp->if_snd);
 #endif
 
 	epair_menq(m, oifp->if_softc);
 	return (0);
 }
 
 /* if_qflush handler; intentionally does nothing. */
 static void
 epair_qflush(struct ifnet *ifp __unused)
 {
 }
 
 /*
  * Media change callback registered with ifmedia_init().  The media is
  * fake, so every change is accepted and 0 is returned.
  */
 static int
 epair_media_change(struct ifnet *ifp __unused)
 {
 
 	/* Do nothing. */
 	return (0);
 }
 
 /*
  * Media status callback registered with ifmedia_init(): always reports a
  * valid, active, full-duplex 10G-T Ethernet link.
  */
 static void
 epair_media_status(struct ifnet *ifp __unused, struct ifmediareq *imr)
 {
 
 	imr->ifm_active = IFM_ETHER | IFM_10G_T | IFM_FDX;
 	imr->ifm_status = IFM_AVALID | IFM_ACTIVE;
 }
 
 /*
  * if_ioctl handler.  Flag and multicast changes are accepted as no-ops,
  * media requests go to ifmedia_ioctl(), any MTU is stored as given and
  * everything else is left to ether_ioctl().  Returns the handler's error
  * code, 0 on success.
  */
 static int
 epair_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 	struct epair_softc *sc;
 
 	switch (cmd) {
 	case SIOCSIFFLAGS:
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		/* Nothing to do for these. */
 		return (0);
 
 	case SIOCSIFMEDIA:
 	case SIOCGIFMEDIA:
 		sc = ifp->if_softc;
 		return (ifmedia_ioctl(ifp, (struct ifreq *)data, &sc->media,
 		    cmd));
 
 	case SIOCSIFMTU:
 		/* No limits are enforced on the MTU. */
 		ifp->if_mtu = ((struct ifreq *)data)->ifr_mtu;
 		return (0);
 
 	default:
 		/* Generic Ethernet handling covers the rest. */
 		return (ether_ioctl(ifp, cmd, data));
 	}
 }
 
 /* if_init handler (installed by epair_setup_ifp()); intentionally empty. */
 static void
 epair_init(void *dummy __unused)
 {
 }
 
 /*
  * Interface cloning functions.
  * We use our private ones so that we can create/destroy our secondary
  * device along with the primary one.
  */
 /*
  * Cloner match function.  Returns 1 when 'name' is "epair" optionally
  * followed by decimal digits only ("epair", "epair<n>") and 0 otherwise,
  * so the per-side names "epair<n>a"/"epair<n>b" do not match.
  */
 static int
 epair_clone_match(struct if_clone *ifc, const char *name)
 {
 	const size_t prefixlen = sizeof(epairname) - 1;
 	const char *cp;
 
 	/* The name has to start with our base name. */
 	if (strncmp(epairname, name, prefixlen) != 0)
 		return (0);
 
 	/* Whatever follows must consist of digits only. */
 	cp = name + prefixlen;
 	while (*cp != '\0') {
 		if (!(*cp >= '0' && *cp <= '9'))
 			return (0);
 		cp++;
 	}
 
 	return (1);
 }
 
 /*
  * Attach the second ("b") interface described by 'scb' and register it
  * with the cloner.  Its MAC address is the peer's address with the last
  * byte replaced by 0x0b.
  */
 static void
 epair_clone_add(struct if_clone *ifc, struct epair_softc *scb)
 {
 	uint8_t mac[ETHER_ADDR_LEN];
 
 	/* Start from the peer's address and patch the final byte. */
 	memcpy(mac, scb->oifp->if_hw_addr, ETHER_ADDR_LEN);
 	mac[5] = 0x0b;
 
 	ether_ifattach(scb->ifp, mac);
 	if_clone_addif(ifc, scb->ifp);
 }
 
 /*
  * Allocate an ifnet plus a softc with one receive queue per entry in
  * epair_tasks, and set up the fake media.  Returns NULL only when
  * if_alloc() fails.
  */
 static struct epair_softc *
 epair_alloc_sc(struct if_clone *ifc)
 {
 	struct epair_softc *sc;
 	struct epair_queue *q;
 	struct ifnet *ifp;
 	int i;
 
 	ifp = if_alloc(IFT_ETHER);
 	if (ifp == NULL)
 		return (NULL);
 
 	sc = malloc(sizeof(struct epair_softc), M_EPAIR, M_WAITOK | M_ZERO);
 	sc->ifp = ifp;
 	sc->num_queues = epair_tasks.tasks;
 	sc->queues = mallocarray(sc->num_queues, sizeof(struct epair_queue),
 	    M_EPAIR, M_WAITOK);
 
 	/* The queue array is not zeroed, so initialise every member. */
 	for (i = 0; i < sc->num_queues; i++) {
 		q = &sc->queues[i];
 		q->id = i;
 		q->state = EPAIR_QUEUE_IDLE;
 		mtx_init(&q->mtx, "epairq", NULL, MTX_DEF | MTX_NEW);
 		mbufq_init(&q->q, RXRSIZE);
 		q->sc = sc;
 		NET_TASK_INIT(&q->tx_task, 0, epair_tx_start_deferred, q);
 	}
 
 	/* Fake media: a single 10G-T Ethernet entry, selected as current. */
 	ifmedia_init(&sc->media, 0, epair_media_change, epair_media_status);
 	ifmedia_add(&sc->media, IFM_ETHER | IFM_10G_T, 0, NULL);
 	ifmedia_set(&sc->media, IFM_ETHER | IFM_10G_T);
 
 	return (sc);
 }
 
 /*
  * Fill in the ifnet belonging to 'sc': name and unit, interface flags,
  * capabilities, the driver entry points defined in this file, the send
  * queue length and the nominal baudrate.
  */
 static void
 epair_setup_ifp(struct epair_softc *sc, char *name, int unit)
 {
 	struct ifnet *ifp = sc->ifp;
 
 	ifp->if_softc = sc;
 	strlcpy(ifp->if_xname, name, IFNAMSIZ);
 	ifp->if_dname = epairname;
 	ifp->if_dunit = unit;
 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
-	ifp->if_flags |= IFF_KNOWSEPOCH;
 	ifp->if_capabilities = IFCAP_VLAN_MTU;
 	ifp->if_capenable = IFCAP_VLAN_MTU;
 	ifp->if_transmit = epair_transmit;
 	ifp->if_qflush = epair_qflush;
 	ifp->if_start = epair_start;
 	ifp->if_ioctl = epair_ioctl;
 	ifp->if_init  = epair_init;
 	if_setsendqlen(ifp, ifqmaxlen);
 	if_setsendqready(ifp);
 
 	ifp->if_baudrate = IF_Gbps(10);	/* arbitrary maximum */
 }
 
 /*
  * Generate the Ethernet address for the "a" side of a pair and store its
  * ETHER_ADDR_LEN bytes in 'eaddr'.
  *
  * The address is 02:<hash>:0a where <hash> is a 32-bit Jenkins hash over a
  * running index and the host id.  The index is bumped on every call and
  * never falls below the interface index, so that an if_index which was
  * freed (e.g. after the interface moved to another vnet) and then reused
  * does not lead to the same address again.
  */
 static void
 epair_generate_mac(struct epair_softc *sc, uint8_t *eaddr)
 {
 	struct ifnet *ifp = sc->ifp;
 	unsigned long id;
 	uint64_t hostid;
 	uint32_t key[3];
 	uint32_t hash;
 
 	/*
 	 * getcredhostid() stores an unsigned long.  Go through a variable of
 	 * exactly that type rather than casting the address of a uint64_t,
 	 * which would leave half of it uninitialised where long is 32 bits.
 	 */
 	getcredhostid(curthread->td_ucred, &id);
 	hostid = id;
 	if (hostid == 0)
 		arc4rand(&hostid, sizeof(hostid), 0);
 
 	EPAIR_LOCK();
 	if (ifp->if_index > next_index)
 		next_index = ifp->if_index;
 	else
 		next_index++;
 
 	key[0] = (uint32_t)next_index;
 	EPAIR_UNLOCK();
 	key[1] = (uint32_t)(hostid & 0xffffffff);
 	key[2] = (uint32_t)((hostid >> 32) & 0xffffffff);
 	hash = jenkins_hash32(key, 3, 0);
 
 	eaddr[0] = 0x02;
 	memcpy(&eaddr[1], &hash, 4);
 	eaddr[5] = 0x0a;
 }
 
 /*
  * Release everything epair_alloc_sc() set up for 'sc': the ifnet, the
  * media list, the per-queue mutexes, the queue array and the softc itself.
  * A NULL 'sc' is ignored.
  */
 static void
 epair_free_sc(struct epair_softc *sc)
 {
 	int i;
 
 	if (sc == NULL)
 		return;
 
 	if_free(sc->ifp);
 	ifmedia_removeall(&sc->media);
 
 	for (i = 0; i < sc->num_queues; i++)
 		mtx_destroy(&sc->queues[i].mtx);
 
 	free(sc->queues, M_EPAIR);
 	free(sc, M_EPAIR);
 }
 
 /*
  * Switch 'ifp' between running and stopped.  Going up sets
  * IFF_DRV_RUNNING before announcing the link; going down announces the
  * link change before clearing the flag.
  */
 static void
 epair_set_state(struct ifnet *ifp, bool running)
 {
 	if (!running) {
 		if_link_state_change(ifp, LINK_STATE_DOWN);
 		ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 		return;
 	}
 
 	ifp->if_drv_flags |= IFF_DRV_RUNNING;
 	if_link_state_change(ifp, LINK_STATE_UP);
 }
 
 /*
  * Reserve a cloner unit for a new pair and turn 'name' (buffer of 'len'
  * bytes) into the name of the "a" interface.
  *
  * If 'name' carries no unit number, the allocated one is appended first.
  * Fails with ENOSPC when the buffer cannot hold the unit or the [ab]
  * suffix, and with EEXIST when either "<name>a" or "<name>b" already
  * exists.  On success the unit is stored in *punit and 0 is returned; on
  * any failure after the unit was allocated it is released again.
  */
 static int
 epair_handle_unit(struct if_clone *ifc, char *name, size_t len, int *punit)
 {
 	char *suffix;
 	size_t room;
 	int error, unit, written;
 	bool wildcard;
 
 	/* Find out whether a specific unit was asked for. */
 	error = ifc_name2unit(name, &unit);
 	if (error != 0)
 		return (error);
 	wildcard = (unit < 0);
 
 	error = ifc_alloc_unit(ifc, &unit);
 	if (error != 0)
 		return (error);
 
 	/* 'suffix' tracks the end of the name, where the [ab] letter goes. */
 	suffix = name + strlen(name);
 	if (wildcard) {
 		/* No unit was given: append the one we just got. */
 		room = len - (suffix - name);
 		written = snprintf(suffix, room, "%d", unit);
 		if (written > room - 1) {
 			/* ifName too long. */
 			error = ENOSPC;
 			goto fail;
 		}
 		suffix += written;
 	}
 	if (len - (suffix - name) - 1 < 1) {
 		/* No space left for our [ab] suffix. */
 		error = ENOSPC;
 		goto fail;
 	}
 
 	/* Probe "b" first so that the name is left ending in "a". */
 	suffix[0] = 'b';
 	suffix[1] = '\0';
 	if (ifunit(name) != NULL) {
 		error = EEXIST;
 		goto fail;
 	}
 
 	suffix[0] = 'a';
 	if (ifunit(name) != NULL) {
 		error = EEXIST;
 		goto fail;
 	}
 
 	*punit = unit;
 	return (0);
 
 fail:
 	ifc_free_unit(ifc, unit);
 	return (error);
 }
 
 /*
  * Cloner create function: build both interfaces of a pair.
  *
  * 'name' (buffer of 'len' bytes) is rewritten to the name of the "a"
  * interface, which is also returned through *ifpp; the "b" interface is
  * registered with the cloner separately via epair_clone_add().  Returns 0
  * on success, the error from epair_handle_unit() if no usable name/unit
  * could be set up, or ENOSPC if allocating either softc failed.
  */
 static int
 epair_clone_create(struct if_clone *ifc, char *name, size_t len,
     struct ifc_data *ifd, struct ifnet **ifpp)
 {
 	struct epair_softc *sca, *scb;
 	char *dp;
 	int error, unit;
 	uint8_t eaddr[ETHER_ADDR_LEN];	/* filled by epair_generate_mac() */
 
 	error = epair_handle_unit(ifc, name, len, &unit);
 	if (error != 0)
 		return (error);
 
 	/* Allocate memory for both [ab] interfaces */
 	sca = epair_alloc_sc(ifc);
 	scb = epair_alloc_sc(ifc);
 	if (sca == NULL || scb == NULL) {
 		epair_free_sc(sca);
 		epair_free_sc(scb);
 		ifc_free_unit(ifc, unit);
 		return (ENOSPC);
 	}
 
 	/*
 	 * Cross-reference the interfaces so we will be able to free both.
 	 */
 	sca->oifp = scb->ifp;
 	scb->oifp = sca->ifp;
 
 	/* Finish initialization of interface <n>a. */
 	epair_setup_ifp(sca, name, unit);
 	epair_generate_mac(sca, eaddr);
 
 	ether_ifattach(sca->ifp, eaddr);
 
 	/* Swap the name and finish initialization of interface <n>b. */
 	dp = name + strlen(name) - 1;
 	*dp = 'b';
 
 	epair_setup_ifp(scb, name, unit);
 
 	/* Correctly set the name for the cloner list. */
 	strlcpy(name, scb->ifp->if_xname, len);
 
 	epair_clone_add(ifc, scb);
 
 	/*
 	 * Restore name to <n>a as the ifp for this will go into the
 	 * cloner list for the initial call.
 	 */
 	strlcpy(name, sca->ifp->if_xname, len);
 
 	/* Tell the world, that we are ready to rock. */
 	epair_set_state(sca->ifp, true);
 	epair_set_state(scb->ifp, true);
 
 	*ifpp = sca->ifp;
 
 	return (0);
 }
 
 /*
  * Throw away every packet still sitting on any of sc's queues.
  *
  * Each queue is emptied while holding its mutex; the detached packet
  * chain is then freed after the mutex has been dropped.
  */
 static void
 epair_drain_rings(struct epair_softc *sc)
 {
 	struct epair_queue *queue;
 	struct mbuf *pkt, *next;
 	int qi;
 
 	for (qi = 0; qi < sc->num_queues; qi++) {
 		queue = &sc->queues[qi];
 
 		mtx_lock(&queue->mtx);
 		pkt = mbufq_flush(&queue->q);
 		mtx_unlock(&queue->mtx);
 
 		/* Walk the m_nextpkt list, freeing one packet at a time. */
 		while (pkt != NULL) {
 			next = pkt->m_nextpkt;
 			m_freem(pkt);
 			pkt = next;
 		}
 	}
 }
 
 /*
  * if_clone destroy callback: tear down both halves of an epair.
  *
  * ifp may be either half; its peer is reached through the softc's oifp.
  * The peer is destroyed by calling if_clone_destroyif() on it after its
  * if_softc has been cleared, so that the nested invocation of this
  * function returns early.  flags is unused.
  *
  * Always returns 0; a failure to destroy the peer panics.
  */
 static int
 epair_clone_destroy(struct if_clone *ifc, struct ifnet *ifp, uint32_t flags)
 {
 	struct ifnet *oifp;
 	struct epair_softc *sca, *scb;
 	int unit, error;
 
 	/*
 	 * In case we called into if_clone_destroyif() ourselves
 	 * again to remove the second interface, the softc will be
 	 * NULL. In that case do not do anything but return success.
 	 */
 	if (ifp->if_softc == NULL)
 		return (0);
 
 	/* Capture everything we need before the ifnets go away. */
 	unit = ifp->if_dunit;
 	sca = ifp->if_softc;
 	oifp = sca->oifp;
 	scb = oifp->if_softc;
 
 	/* First get the interfaces down and detached. */
 	epair_set_state(ifp, false);
 	epair_set_state(oifp, false);
 
 	ether_ifdetach(ifp);
 	ether_ifdetach(oifp);
 
 	/*
 	 * Then free any queued packets and all the resources.  The peer is
 	 * handled with its own vnet set as the current one.
 	 */
 	CURVNET_SET_QUIET(oifp->if_vnet);
 	epair_drain_rings(scb);
 	/* Marks the peer so the re-entrant destroy call is a no-op. */
 	oifp->if_softc = NULL;
 	error = if_clone_destroyif(ifc, oifp);
 	if (error)
 		panic("%s: if_clone_destroyif() for our 2nd iface failed: %d",
 		    __func__, error);
 	epair_free_sc(scb);
 	CURVNET_RESTORE();
 
 	epair_drain_rings(sca);
 	epair_free_sc(sca);
 
 	/* Last free the cloner unit. */
 	ifc_free_unit(ifc, unit);
 
 	return (0);
 }
 
 /*
  * Per-vnet initialisation: attach the epair cloner, wiring up the
  * match/create/destroy callbacks, and remember it in V_epair_cloner.
  */
 static void
 vnet_epair_init(const void *unused __unused)
 {
 	struct if_clone_addreq req = {
 		.match_f = epair_clone_match,
 		.create_f = epair_clone_create,
 		.destroy_f = epair_clone_destroy,
 	};
 	V_epair_cloner = ifc_attach_cloner(epairname, &req);
 }
 VNET_SYSINIT(vnet_epair_init, SI_SUB_PSEUDO, SI_ORDER_ANY,
     vnet_epair_init, NULL);
 
 /*
  * Per-vnet teardown: detach the cloner attached by vnet_epair_init().
  *
  * NOTE(review): registered at SI_SUB_INIT_IF while the matching init is
  * registered at SI_SUB_PSEUDO; presumably intentional for teardown
  * ordering -- confirm before changing either.
  */
 static void
 vnet_epair_uninit(const void *unused __unused)
 {
 
 	ifc_detach_cloner(V_epair_cloner);
 }
 VNET_SYSUNINIT(vnet_epair_uninit, SI_SUB_INIT_IF, SI_ORDER_ANY,
     vnet_epair_uninit, NULL);
 
 /*
  * Module-load helper: create the taskqueue(s) recorded in epair_tasks.
  *
  * With RSS, one taskqueue is created per CPU visited by CPU_FOREACH(),
  * each with a single thread started with a cpuset containing only that
  * CPU, and epair_tasks.tasks counts them.  Without RSS a single
  * taskqueue with one thread is created and epair_tasks.tasks is 1.
  *
  * Always returns 0.
  */
 static int
 epair_mod_init(void)
 {
 	char name[32];
 	epair_tasks.tasks = 0;
 
 #ifdef RSS
 	int cpu;
 
 	CPU_FOREACH(cpu) {
 		cpuset_t cpu_mask;
 
 		/* Pin to this CPU so we get appropriate NUMA allocations. */
 		thread_lock(curthread);
 		sched_bind(curthread, cpu);
 		thread_unlock(curthread);
 
 		snprintf(name, sizeof(name), "epair_task_%d", cpu);
 
 		epair_tasks.tq[cpu] = taskqueue_create(name, M_WAITOK,
 		    taskqueue_thread_enqueue,
 		    &epair_tasks.tq[cpu]);
 		CPU_SETOF(cpu, &cpu_mask);
 		taskqueue_start_threads_cpuset(&epair_tasks.tq[cpu], 1, PI_NET,
 		    &cpu_mask, "%s", name);
 
 		epair_tasks.tasks++;
 	}
 	/* Undo the binding done inside the loop above. */
 	thread_lock(curthread);
 	sched_unbind(curthread);
 	thread_unlock(curthread);
 #else
 	snprintf(name, sizeof(name), "epair_task");
 
 	epair_tasks.tq[0] = taskqueue_create(name, M_WAITOK,
 	    taskqueue_thread_enqueue,
 	    &epair_tasks.tq[0]);
 	taskqueue_start_threads(&epair_tasks.tq[0], 1, PI_NET, "%s", name);
 
 	epair_tasks.tasks = 1;
 #endif
 
 	return (0);
 }
 
 /*
  * Module-unload helper: drain and free each of the epair_tasks.tasks
  * taskqueues, in index order.
  */
 static void
 epair_mod_cleanup(void)
 {
 	int idx;
 
 	idx = 0;
 	while (idx < epair_tasks.tasks) {
 		taskqueue_drain_all(epair_tasks.tq[idx]);
 		taskqueue_free(epair_tasks.tq[idx]);
 		idx++;
 	}
 }
 
 /*
  * Module event handler.
  *
  * MOD_LOAD initialises the epair lock and the taskqueues and returns the
  * error from epair_mod_init() if that fails.  MOD_UNLOAD releases them in
  * the reverse order.  Any other event type yields EOPNOTSUPP.  mod and
  * data are unused.
  */
 static int
 epair_modevent(module_t mod, int type, void *data)
 {
 	int error;
 
 	if (type == MOD_LOAD) {
 		EPAIR_LOCK_INIT();
 		error = epair_mod_init();
 		if (error != 0)
 			return (error);
 		if (bootverbose)
 			printf("%s: %s initialized.\n", __func__, epairname);
 		return (0);
 	}
 
 	if (type == MOD_UNLOAD) {
 		epair_mod_cleanup();
 		EPAIR_LOCK_DESTROY();
 		if (bootverbose)
 			printf("%s: %s unloaded.\n", __func__, epairname);
 		return (0);
 	}
 
 	return (EOPNOTSUPP);
 }
 
 /*
  * Module glue: name, event handler, and no extra argument for the
  * handler.
  */
 static moduledata_t epair_mod = {
 	"if_epair",
 	epair_modevent,
 	0
 };
 
 /* Register the module and declare its version (3). */
 DECLARE_MODULE(if_epair, epair_mod, SI_SUB_PSEUDO, SI_ORDER_MIDDLE);
 MODULE_VERSION(if_epair, 3);
diff --git a/sys/net/if_ethersubr.c b/sys/net/if_ethersubr.c
index 839bae8e9d43..dd5c07acf634 100644
--- a/sys/net/if_ethersubr.c
+++ b/sys/net/if_ethersubr.c
@@ -1,1504 +1,1527 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)if_ethersubr.c	8.1 (Berkeley) 6/10/93
  * $FreeBSD$
  */
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_netgraph.h"
 #include "opt_mbuf_profiling.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/devctl.h>
 #include <sys/eventhandler.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/msan.h>
 #include <sys/proc.h>
 #include <sys/priv.h>
 #include <sys/random.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/sysctl.h>
 #include <sys/uuid.h>
+#ifdef KDB
+#include <sys/kdb.h>
+#endif
 
 #include <net/ieee_oui.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_private.h>
 #include <net/if_arp.h>
 #include <net/netisr.h>
 #include <net/route.h>
 #include <net/if_llc.h>
 #include <net/if_dl.h>
 #include <net/if_types.h>
 #include <net/bpf.h>
 #include <net/ethernet.h>
 #include <net/if_bridgevar.h>
 #include <net/if_vlan_var.h>
 #include <net/if_llatbl.h>
 #include <net/pfil.h>
 #include <net/rss_config.h>
 #include <net/vnet.h>
 
 #include <netpfil/pf/pf_mtag.h>
 
 #if defined(INET) || defined(INET6)
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/if_ether.h>
 #include <netinet/ip_carp.h>
 #include <netinet/ip_var.h>
 #endif
 #ifdef INET6
 #include <netinet6/nd6.h>
 #endif
 #include <security/mac/mac_framework.h>
 
 #include <crypto/sha1.h>
 
 #ifdef CTASSERT
 /* Compile-time checks that the header structures have their wire sizes. */
 CTASSERT(sizeof (struct ether_header) == ETHER_ADDR_LEN * 2 + 2);
 CTASSERT(sizeof (struct ether_addr) == ETHER_ADDR_LEN);
 #endif
 
 VNET_DEFINE(pfil_head_t, link_pfil_head);	/* Packet filter hooks */
 
 /*
  * netgraph node hooks for ng_ether(4).  The code in this file only calls
  * them when ifp->if_l2com is set (or, for attach, when the pointer itself
  * is non-NULL).
  */
 void	(*ng_ether_input_p)(struct ifnet *ifp, struct mbuf **mp);
 void	(*ng_ether_input_orphan_p)(struct ifnet *ifp, struct mbuf *m);
 int	(*ng_ether_output_p)(struct ifnet *ifp, struct mbuf **mp);
 void	(*ng_ether_attach_p)(struct ifnet *ifp);
 void	(*ng_ether_detach_p)(struct ifnet *ifp);
 
 /* if_vlan(4) input hook; called from ether_demux() for tagged frames. */
 void	(*vlan_input_p)(struct ifnet *, struct mbuf *);
 
 /* if_bridge(4) support */
 void	(*bridge_dn_p)(struct mbuf *, struct ifnet *);
 
 /* if_lagg(4) support */
 struct mbuf *(*lagg_input_ethernet_p)(struct ifnet *, struct mbuf *); 
 
 /* The all-ones Ethernet broadcast address, installed as if_broadcastaddr. */
 static const u_char etherbroadcastaddr[ETHER_ADDR_LEN] =
 			{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
 
 static	int ether_resolvemulti(struct ifnet *, struct sockaddr **,
 		struct sockaddr *);
 static	int ether_requestencap(struct ifnet *, struct if_encap_req *);
 
 /*
  * Record an error and jump to the cleanup label.  Only usable inside a
  * function that has a local "error" variable and a "bad" label.
  */
 #define senderr(e) do { error = (e); goto bad;} while (0)
 
 /*
  * For each checksum request flag set on src (CSUM_IP, CSUM_DELAY_DATA,
  * CSUM_SCTP), OR the corresponding "checked/valid" flags into dst, and
  * set dst's csum_data to 0xffff when CSUM_DATA_VALID was added.
  *
  * Used by ether_output() on packets handed to if_simloop().  src and dst
  * may be the same mbuf; all of src's flags are read before dst is
  * modified.
  */
 static void
 update_mbuf_csumflags(struct mbuf *src, struct mbuf *dst)
 {
 	int verified;
 
 	verified = 0;
 	if ((src->m_pkthdr.csum_flags & CSUM_IP) != 0)
 		verified |= CSUM_IP_CHECKED | CSUM_IP_VALID;
 	if ((src->m_pkthdr.csum_flags & CSUM_DELAY_DATA) != 0)
 		verified |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
 	if ((src->m_pkthdr.csum_flags & CSUM_SCTP) != 0)
 		verified |= CSUM_SCTP_VALID;
 
 	dst->m_pkthdr.csum_flags |= verified;
 	if ((verified & CSUM_DATA_VALID) != 0)
 		dst->m_pkthdr.csum_data = 0xffff;
 }
 
 /*
  * Link-layer encapsulation request handler (installed as
  * if_requestencap by ether_ifattach()).
  *
  * Builds an Ethernet header in req->buf for req->family: destination
  * from req->lladdr (or the interface broadcast address for an ARP
  * request carrying IFENCAP_FLAG_BROADCAST), source from the interface's
  * own address.  For AF_ARP the hardware type in the ARP header at
  * req->hdata is also set to ARPHRD_ETHER.
  *
  * Returns 0 with req->bufsize updated to the header size and
  * req->lladdr_off set to 0, EOPNOTSUPP for a request type other than
  * IFENCAP_LL, ENOMEM if the buffer is too small, or EAFNOSUPPORT for an
  * unhandled family.
  */
 static int
 ether_requestencap(struct ifnet *ifp, struct if_encap_req *req)
 {
 	struct ether_header *hdr;
 	struct arphdr *arp;
 	const u_char *dest;
 	uint16_t etype, op;
 
 	if (req->rtype != IFENCAP_LL)
 		return (EOPNOTSUPP);
 	if (req->bufsize < ETHER_HDR_LEN)
 		return (ENOMEM);
 
 	hdr = (struct ether_header *)req->buf;
 	dest = req->lladdr;
 	req->lladdr_off = 0;
 
 	switch (req->family) {
 	case AF_INET:
 		etype = htons(ETHERTYPE_IP);
 		break;
 	case AF_INET6:
 		etype = htons(ETHERTYPE_IPV6);
 		break;
 	case AF_ARP:
 		arp = (struct arphdr *)req->hdata;
 		arp->ar_hrd = htons(ARPHRD_ETHER);
 
 		/* Reverse-ARP opcodes get their own ethertype. */
 		op = ntohs(arp->ar_op);
 		if (op == ARPOP_REVREQUEST || op == ARPOP_REVREPLY)
 			etype = htons(ETHERTYPE_REVARP);
 		else
 			etype = htons(ETHERTYPE_ARP);
 
 		if ((req->flags & IFENCAP_FLAG_BROADCAST) != 0)
 			dest = ifp->if_broadcastaddr;
 		break;
 	default:
 		return (EAFNOSUPPORT);
 	}
 
 	memcpy(&hdr->ether_type, &etype, sizeof(hdr->ether_type));
 	memcpy(hdr->ether_dhost, dest, ETHER_ADDR_LEN);
 	memcpy(hdr->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN);
 	req->bufsize = sizeof(*hdr);
 
 	return (0);
 }
 
 /*
  * Fill phdr with the Ethernet header for sending m to dst on ifp.
  *
  * Unicast destinations are resolved through arpresolve() (AF_INET) or
  * nd6_resolve() (AF_INET6); broadcast and multicast headers are built
  * here directly.  If plle is non-NULL it is cleared first and then passed
  * on to the resolver.
  *
  * Returns 0 on success with *pflags set to RT_MAY_LOOP, plus RT_L2_ME
  * when the resolver reported LLE_IFADDR.  On a resolver error that error
  * is returned (EHOSTDOWN becomes EHOSTUNREACH when ro has RT_HAS_GW) and
  * *pflags is left untouched.  For an unsupported address family m is
  * freed and EAFNOSUPPORT is returned.
  */
 static int
 ether_resolve_addr(struct ifnet *ifp, struct mbuf *m,
 	const struct sockaddr *dst, struct route *ro, u_char *phdr,
 	uint32_t *pflags, struct llentry **plle)
 {
 	uint32_t lleflags = 0;
 	int error = 0;
 #if defined(INET) || defined(INET6)
 	struct ether_header *eh = (struct ether_header *)phdr;
 	uint16_t etype;
 #endif
 
 	if (plle)
 		*plle = NULL;
 
 	switch (dst->sa_family) {
 #ifdef INET
 	case AF_INET:
 		if ((m->m_flags & (M_BCAST | M_MCAST)) == 0)
 			error = arpresolve(ifp, 0, m, dst, phdr, &lleflags,
 			    plle);
 		else {
 			/* No lookup needed: the destination is computed. */
 			if (m->m_flags & M_BCAST)
 				memcpy(eh->ether_dhost, ifp->if_broadcastaddr,
 				    ETHER_ADDR_LEN);
 			else {
 				const struct in_addr *a;
 				a = &(((const struct sockaddr_in *)dst)->sin_addr);
 				ETHER_MAP_IP_MULTICAST(a, eh->ether_dhost);
 			}
 			etype = htons(ETHERTYPE_IP);
 			memcpy(&eh->ether_type, &etype, sizeof(etype));
 			memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN);
 		}
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		if ((m->m_flags & M_MCAST) == 0) {
 			int af = RO_GET_FAMILY(ro, dst);
 			error = nd6_resolve(ifp, LLE_SF(af, 0), m, dst, phdr,
 			    &lleflags, plle);
 		} else {
 			/* No lookup needed: the destination is computed. */
 			const struct in6_addr *a6;
 			a6 = &(((const struct sockaddr_in6 *)dst)->sin6_addr);
 			ETHER_MAP_IPV6_MULTICAST(a6, eh->ether_dhost);
 			etype = htons(ETHERTYPE_IPV6);
 			memcpy(&eh->ether_type, &etype, sizeof(etype));
 			memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN);
 		}
 		break;
 #endif
 	default:
 		if_printf(ifp, "can't handle af%d\n", dst->sa_family);
 		if (m != NULL)
 			m_freem(m);
 		return (EAFNOSUPPORT);
 	}
 
 	/* A down next hop behind a gateway is reported as unreachable. */
 	if (error == EHOSTDOWN) {
 		if (ro != NULL && (ro->ro_flags & RT_HAS_GW) != 0)
 			error = EHOSTUNREACH;
 	}
 
 	if (error != 0)
 		return (error);
 
 	*pflags = RT_MAY_LOOP;
 	if (lleflags & LLE_IFADDR)
 		*pflags |= RT_L2_ME;
 
 	return (0);
 }
 
 /*
  * Ethernet output routine.
  * Encapsulate a packet of type family for the local net.
  * Use trailer local net encapsulation if enough data in first
  * packet leaves a multiple of 512 bytes of data in remainder.
  * NOTE(review): no trailer handling is visible in this function; the
  * sentence above looks historical -- confirm before relying on it.
  *
  * The link-layer header is taken from, in order of preference,
  * ro->ro_prepend, a valid cached ro->ro_lle (non-broadcast/multicast
  * packets only, when RT_LLE_CACHE is set), or ether_resolve_addr().
  * After prepending it the frame may be looped back, handed to the
  * bridge, carp(4) or ng_ether(4), and is otherwise passed to
  * ether_output_frame().
  *
  * Returns 0 or an errno value.  On the paths that go through the "bad"
  * label the mbuf is freed here.
  */
 int
 ether_output(struct ifnet *ifp, struct mbuf *m,
 	const struct sockaddr *dst, struct route *ro)
 {
 	int error = 0;
 	char linkhdr[ETHER_HDR_LEN], *phdr;
 	struct ether_header *eh;
 	struct pf_mtag *t;
 	bool loop_copy;
 	int hlen;	/* link layer header length */
 	uint32_t pflags;
 	struct llentry *lle = NULL;
 	int addref = 0;
 
 	phdr = NULL;
 	pflags = 0;
 	if (ro != NULL) {
 		/* XXX BPF uses ro_prepend */
 		if (ro->ro_prepend != NULL) {
 			phdr = ro->ro_prepend;
 			hlen = ro->ro_plen;
 		} else if (!(m->m_flags & (M_BCAST | M_MCAST))) {
 			if ((ro->ro_flags & RT_LLE_CACHE) != 0) {
 				lle = ro->ro_lle;
 				/* Drop a cached entry that is no longer valid. */
 				if (lle != NULL &&
 				    (lle->la_flags & LLE_VALID) == 0) {
 					LLE_FREE(lle);
 					lle = NULL;	/* redundant */
 					ro->ro_lle = NULL;
 				}
 				if (lle == NULL) {
 					/* if we lookup, keep cache */
 					addref = 1;
 				} else
 					/*
 					 * Notify LLE code that
 					 * the entry was used
 					 * by datapath.
 					 */
 					llentry_provide_feedback(lle);
 			}
 			if (lle != NULL) {
 				phdr = lle->r_linkdata;
 				hlen = lle->r_hdrlen;
 				pflags = lle->r_flags;
 			}
 		}
 	}
 
 #ifdef MAC
 	error = mac_ifnet_check_transmit(ifp, m);
 	if (error)
 		senderr(error);
 #endif
 
 	M_PROFILE(m);
 	if (ifp->if_flags & IFF_MONITOR)
 		senderr(ENETDOWN);
 	if (!((ifp->if_flags & IFF_UP) &&
 	    (ifp->if_drv_flags & IFF_DRV_RUNNING)))
 		senderr(ENETDOWN);
 
 	if (phdr == NULL) {
 		/* No prepend data supplied. Try to calculate ourselves. */
 		phdr = linkhdr;
 		hlen = ETHER_HDR_LEN;
 		error = ether_resolve_addr(ifp, m, dst, ro, phdr, &pflags,
 		    addref ? &lle : NULL);
 		/* Remember the entry the resolver handed back, if any. */
 		if (addref && lle != NULL)
 			ro->ro_lle = lle;
 		/*
 		 * EWOULDBLOCK is reported to the caller as success and the
 		 * mbuf is not freed here.
 		 * NOTE(review): presumably the resolver has kept the packet
 		 * in that case -- confirm against arpresolve()/nd6_resolve().
 		 */
 		if (error != 0)
 			return (error == EWOULDBLOCK ? 0 : error);
 	}
 
 	/* Destination is one of our own addresses: loop the packet back. */
 	if ((pflags & RT_L2_ME) != 0) {
 		update_mbuf_csumflags(m, m);
 		return (if_simloop(ifp, m, RO_GET_FAMILY(ro, dst), 0));
 	}
 	loop_copy = (pflags & RT_MAY_LOOP) != 0;
 
 	/*
 	 * Add local net header.  If no space in first mbuf,
 	 * allocate another.
 	 *
 	 * Note that we do prepend regardless of RT_HAS_HEADER flag.
 	 * This is done because BPF code shifts m_data pointer
 	 * to the end of ethernet header prior to calling if_output().
 	 */
 	M_PREPEND(m, hlen, M_NOWAIT);
 	if (m == NULL)
 		senderr(ENOBUFS);
 	if ((pflags & RT_HAS_HEADER) == 0) {
 		eh = mtod(m, struct ether_header *);
 		memcpy(eh, phdr, hlen);
 	}
 
 	/*
 	 * If a simplex interface, and the packet is being sent to our
 	 * Ethernet address or a broadcast address, loopback a copy.
 	 * XXX To make a simplex device behave exactly like a duplex
 	 * device, we should copy in the case of sending to our own
 	 * ethernet address (thus letting the original actually appear
 	 * on the wire). However, we don't do that here for security
 	 * reasons and compatibility with the original behavior.
 	 */
 	if ((m->m_flags & M_BCAST) && loop_copy && (ifp->if_flags & IFF_SIMPLEX) &&
 	    ((t = pf_find_mtag(m)) == NULL || !t->routed)) {
 		struct mbuf *n;
 
 		/*
 		 * Because if_simloop() modifies the packet, we need a
 		 * writable copy through m_dup() instead of a readonly
 		 * one as m_copy[m] would give us. The alternative would
 		 * be to modify if_simloop() to handle the readonly mbuf,
 		 * but performancewise it is mostly equivalent (trading
 		 * extra data copying vs. extra locking).
 		 *
 		 * XXX This is a local workaround.  A number of less
 		 * often used kernel parts suffer from the same bug.
 		 * See PR kern/105943 for a proposed general solution.
 		 */
 		if ((n = m_dup(m, M_NOWAIT)) != NULL) {
 			update_mbuf_csumflags(m, n);
 			(void)if_simloop(ifp, n, RO_GET_FAMILY(ro, dst), hlen);
 		} else
 			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
 	}
 
 	/*
 	 * Bridges require special output handling.
 	 */
 	if (ifp->if_bridge) {
 		BRIDGE_OUTPUT(ifp, m, error);
 		return (error);
 	}
 
 #if defined(INET) || defined(INET6)
 	if (ifp->if_carp &&
 	    (error = (*carp_output_p)(ifp, m, dst)))
 		goto bad;
 #endif
 
 	/* Handle ng_ether(4) processing, if any */
 	if (ifp->if_l2com != NULL) {
 		KASSERT(ng_ether_output_p != NULL,
 		    ("ng_ether_output_p is NULL"));
 		if ((error = (*ng_ether_output_p)(ifp, &m)) != 0) {
 			/*
 			 * Shared error exit: senderr() and the carp(4) check
 			 * above jump here, into this block.
 			 */
 bad:			if (m != NULL)
 				m_freem(m);
 			return (error);
 		}
 		/* The hook returned success but left no packet to send. */
 		if (m == NULL)
 			return (0);
 	}
 
 	/* Continue with link-layer output */
 	return ether_output_frame(ifp, m);
 }
 
 /*
  * Apply the priority code point pcp to the frame in *mp.
  *
  * A frame whose ethertype is already VLAN or QinQ is left alone.
  * Otherwise ether_8021q_frame() is asked to add a tag with VLAN id 0 and
  * the given pcp.  Returns true if the frame can go on, false (after
  * counting an output error on ifp) if tagging failed.
  */
 static bool
 ether_set_pcp(struct mbuf **mp, struct ifnet *ifp, uint8_t pcp)
 {
 	struct ether_8021q_tag tag;
 	struct ether_header *hdr;
 	uint16_t type;
 
 	hdr = mtod(*mp, struct ether_header *);
 	type = ntohs(hdr->ether_type);
 	if (type == ETHERTYPE_VLAN || type == ETHERTYPE_QINQ)
 		return (true);
 
 	tag.vid = 0;
 	tag.pcp = pcp;
 	tag.proto = ETHERTYPE_VLAN;
 	if (!ether_8021q_frame(mp, ifp, ifp, &tag)) {
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		return (false);
 	}
 	return (true);
 }
 
 /*
  * Ethernet link layer output routine to send a raw frame to the device.
  *
  * This assumes that the 14 byte Ethernet header is present and contiguous
  * in the first mbuf (if BRIDGE'ing).
  *
  * Returns the result of the driver's if_transmit routine, or earlier:
  * 0 if PCP tagging failed or a link-layer pfil hook consumed the frame,
  * EACCES if a hook dropped it.
  */
 int
 ether_output_frame(struct ifnet *ifp, struct mbuf *m)
 {
 	uint8_t pcp;
 
 	/*
 	 * Apply the interface's default priority, except on vlan interfaces.
 	 * NOTE(review): on failure 0 is returned and m is not freed here;
 	 * presumably ether_8021q_frame() has disposed of it -- confirm.
 	 */
 	pcp = ifp->if_pcp;
 	if (pcp != IFNET_PCP_NONE && ifp->if_type != IFT_L2VLAN &&
 	    !ether_set_pcp(&m, ifp, pcp))
 		return (0);
 
 	/* Run the outbound link-layer packet filter hooks, if any. */
 	if (PFIL_HOOKED_OUT(V_link_pfil_head))
 		switch (pfil_mbuf_out(V_link_pfil_head, &m, ifp, NULL)) {
 		case PFIL_DROPPED:
 			return (EACCES);
 		case PFIL_CONSUMED:
 			return (0);
 		}
 
 #ifdef EXPERIMENTAL
 #if defined(INET6) && defined(INET)
 	/* draft-ietf-6man-ipv6only-flag */
 	/* Catch ETHERTYPE_IP, and ETHERTYPE_[REV]ARP if we are v6-only. */
 	if ((ND_IFINFO(ifp)->flags & ND6_IFF_IPV6_ONLY_MASK) != 0) {
 		struct ether_header *eh;
 
 		eh = mtod(m, struct ether_header *);
 		switch (ntohs(eh->ether_type)) {
 		case ETHERTYPE_IP:
 		case ETHERTYPE_ARP:
 		case ETHERTYPE_REVARP:
 			m_freem(m);
 			return (EAFNOSUPPORT);
 			/* NOTREACHED */
 			break;
 		};
 	}
 #endif
 #endif
 
 	/*
 	 * Queue message on interface, update output statistics if successful,
 	 * and start output if interface not yet active.
 	 *
 	 * If KMSAN is enabled, use it to verify that the data does not contain
 	 * any uninitialized bytes.
 	 */
 	kmsan_check_mbuf(m, "ether_output");
 	return ((ifp->if_transmit)(ifp, m));
 }
 
 /*
  * Process a received Ethernet packet; the packet is in the
  * mbuf chain m with the ethernet header at the front.
  *
  * In order: drop the frame if the interface is down or the first mbuf is
  * shorter than an Ethernet header, classify broadcast/multicast, tap
  * BPF, strip a trailing FCS, account input bytes, and then give monitor
  * mode, lagg(4), software VLAN decapsulation, ng_ether(4) and
  * if_bridge(4) a chance to consume or redirect the frame.  Whatever is
  * left is marked M_PROMISC if it is not addressed to this interface and
  * handed to ether_demux().
  *
  * The mbuf is always consumed, either here or by the code it is passed
  * to.
  */
 static void
 ether_input_internal(struct ifnet *ifp, struct mbuf *m)
 {
 	struct ether_header *eh;
 	u_short etype;
 
 	if ((ifp->if_flags & IFF_UP) == 0) {
 		m_freem(m);
 		return;
 	}
 #ifdef DIAGNOSTIC
 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
 		if_printf(ifp, "discard frame at !IFF_DRV_RUNNING\n");
 		m_freem(m);
 		return;
 	}
 #endif
 	if (m->m_len < ETHER_HDR_LEN) {
 		/* XXX maybe should pullup? */
 		if_printf(ifp, "discard frame w/o leading ethernet "
 				"header (len %u pkt len %u)\n",
 				m->m_len, m->m_pkthdr.len);
 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 		m_freem(m);
 		return;
 	}
 	eh = mtod(m, struct ether_header *);
 	etype = ntohs(eh->ether_type);
 	random_harvest_queue_ether(m, sizeof(*m));
 
 #ifdef EXPERIMENTAL
 #if defined(INET6) && defined(INET)
 	/* draft-ietf-6man-ipv6only-flag */
 	/* Catch ETHERTYPE_IP, and ETHERTYPE_[REV]ARP if we are v6-only. */
 	if ((ND_IFINFO(ifp)->flags & ND6_IFF_IPV6_ONLY_MASK) != 0) {
 		switch (etype) {
 		case ETHERTYPE_IP:
 		case ETHERTYPE_ARP:
 		case ETHERTYPE_REVARP:
 			m_freem(m);
 			return;
 			/* NOTREACHED */
 			break;
 		};
 	}
 #endif
 #endif
 
 	/* Every return below this point must be paired with CURVNET_RESTORE(). */
 	CURVNET_SET_QUIET(ifp->if_vnet);
 
 	if (ETHER_IS_MULTICAST(eh->ether_dhost)) {
 		if (ETHER_IS_BROADCAST(eh->ether_dhost))
 			m->m_flags |= M_BCAST;
 		else
 			m->m_flags |= M_MCAST;
 		if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1);
 	}
 
 #ifdef MAC
 	/*
 	 * Tag the mbuf with an appropriate MAC label before any other
 	 * consumers can get to it.
 	 */
 	mac_ifnet_create_mbuf(ifp, m);
 #endif
 
 	/*
 	 * Give bpf a chance at the packet.
 	 */
 	ETHER_BPF_MTAP(ifp, m);
 
 	/*
 	 * If the CRC is still on the packet, trim it off. We do this once
 	 * and once only in case we are re-entered. Nothing else on the
 	 * Ethernet receive path expects to see the FCS.
 	 */
 	if (m->m_flags & M_HASFCS) {
 		m_adj(m, -ETHER_CRC_LEN);
 		m->m_flags &= ~M_HASFCS;
 	}
 
 	/* Count input bytes here unless IFCAP_HWSTATS is enabled. */
 	if (!(ifp->if_capenable & IFCAP_HWSTATS))
 		if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
 
 	/* Allow monitor mode to claim this frame, after stats are updated. */
 	if (ifp->if_flags & IFF_MONITOR) {
 		m_freem(m);
 		CURVNET_RESTORE();
 		return;
 	}
 
 	/* Handle input from a lagg(4) port */
 	if (ifp->if_type == IFT_IEEE8023ADLAG) {
 		KASSERT(lagg_input_ethernet_p != NULL,
 		    ("%s: if_lagg not loaded!", __func__));
 		m = (*lagg_input_ethernet_p)(ifp, m);
 		/* Continue on whatever interface the hook set as rcvif. */
 		if (m != NULL)
 			ifp = m->m_pkthdr.rcvif;
 		else {
 			CURVNET_RESTORE();
 			return;
 		}
 	}
 
 	/*
 	 * If the hardware did not process an 802.1Q tag, do this now,
 	 * to allow 802.1P priority frames to be passed to the main input
 	 * path correctly.
 	 */
 	if ((m->m_flags & M_VLANTAG) == 0 &&
 	    ((etype == ETHERTYPE_VLAN) || (etype == ETHERTYPE_QINQ))) {
 		struct ether_vlan_header *evl;
 
 		if (m->m_len < sizeof(*evl) &&
 		    (m = m_pullup(m, sizeof(*evl))) == NULL) {
 #ifdef DIAGNOSTIC
 			if_printf(ifp, "cannot pullup VLAN header\n");
 #endif
 			if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 			CURVNET_RESTORE();
 			return;
 		}
 
 		evl = mtod(m, struct ether_vlan_header *);
 		m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
 		m->m_flags |= M_VLANTAG;
 
 		/*
 		 * Slide the two MAC addresses forward over the tag, then
 		 * trim the now-unused bytes from the front.
 		 */
 		bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
 		    ETHER_HDR_LEN - ETHER_TYPE_LEN);
 		m_adj(m, ETHER_VLAN_ENCAP_LEN);
 		eh = mtod(m, struct ether_header *);
 	}
 
 	M_SETFIB(m, ifp->if_fib);
 
 	/* Allow ng_ether(4) to claim this frame. */
 	if (ifp->if_l2com != NULL) {
 		KASSERT(ng_ether_input_p != NULL,
 		    ("%s: ng_ether_input_p is NULL", __func__));
 		m->m_flags &= ~M_PROMISC;
 		(*ng_ether_input_p)(ifp, &m);
 		if (m == NULL) {
 			CURVNET_RESTORE();
 			return;
 		}
 		/* The hook may have replaced the mbuf; refresh the header. */
 		eh = mtod(m, struct ether_header *);
 	}
 
 	/*
 	 * Allow if_bridge(4) to claim this frame.
 	 *
 	 * The BRIDGE_INPUT() macro will update ifp if the bridge changed it
 	 * and the frame should be delivered locally.
 	 *
 	 * If M_BRIDGE_INJECT is set, the packet was received directly by the
 	 * bridge via netmap, so "ifp" is the bridge itself and the packet
 	 * should be re-examined.
 	 */
 	if (ifp->if_bridge != NULL || (m->m_flags & M_BRIDGE_INJECT) != 0) {
 		m->m_flags &= ~M_PROMISC;
 		BRIDGE_INPUT(ifp, m);
 		if (m == NULL) {
 			CURVNET_RESTORE();
 			return;
 		}
 		eh = mtod(m, struct ether_header *);
 	}
 
 #if defined(INET) || defined(INET6)
 	/*
 	 * Clear M_PROMISC on frame so that carp(4) will see it when the
 	 * mbuf flows up to Layer 3.
 	 * FreeBSD's implementation of carp(4) uses the inprotosw
 	 * to dispatch IPPROTO_CARP. carp(4) also allocates its own
 	 * Ethernet addresses of the form 00:00:5e:00:01:xx, which
 	 * is outside the scope of the M_PROMISC test below.
 	 * TODO: Maintain a hash table of ethernet addresses other than
 	 * ether_dhost which may be active on this ifp.
 	 */
 	if (ifp->if_carp && (*carp_forus_p)(ifp, eh->ether_dhost)) {
 		m->m_flags &= ~M_PROMISC;
 	} else
 #endif
 	{
 		/*
 		 * If the frame received was not for our MAC address, set the
 		 * M_PROMISC flag on the mbuf chain. The frame may need to
 		 * be seen by the rest of the Ethernet input path in case of
 		 * re-entry (e.g. bridge, vlan, netgraph) but should not be
 		 * seen by upper protocol layers.
 		 */
 		if (!ETHER_IS_MULTICAST(eh->ether_dhost) &&
 		    bcmp(IF_LLADDR(ifp), eh->ether_dhost, ETHER_ADDR_LEN) != 0)
 			m->m_flags |= M_PROMISC;
 	}
 
 	ether_demux(ifp, m);
 	CURVNET_RESTORE();
 }
 
 /*
  * Ethernet input dispatch; by default, direct dispatch here regardless of
  * global configuration.  However, if RSS is enabled, hook up RSS affinity
  * so that when deferred or hybrid dispatch is enabled, we can redistribute
  * load based on RSS.
  *
  * XXXRW: Would be nice if the ifnet passed up a flag indicating whether or
  * not it had already done work distribution via multi-queue.  Then we could
  * direct dispatch in the event load balancing was already complete and
  * handle the case of interfaces with different capabilities better.
  *
  * XXXRW: Sort of want an M_DISTRIBUTED flag to avoid multiple distributions
  * at multiple layers?
  *
  * XXXRW: For now, enable all this only if RSS is compiled in, although it
  * works fine without RSS.  Need to characterise the performance overhead
  * of the detour through the netisr code in the event the result is always
  * direct dispatch.
  */
 /*
  * netisr handler for NETISR_ETHER: run the Ethernet input path on m for
  * the interface recorded in its packet header.
  */
 static void
 ether_nh_input(struct mbuf *m)
 {
 	struct ifnet *rcvif;
 
 	M_ASSERTPKTHDR(m);
 	rcvif = m->m_pkthdr.rcvif;
 	KASSERT(rcvif != NULL, ("%s: NULL interface pointer", __func__));
 	ether_input_internal(rcvif, m);
 }
 
 /*
  * netisr registration for Ethernet input.  Dispatch is direct in both
  * configurations; with RSS the CPU policy is used together with
  * rss_m2cpuid(), otherwise the source policy.
  */
 static struct netisr_handler	ether_nh = {
 	.nh_name = "ether",
 	.nh_handler = ether_nh_input,
 	.nh_proto = NETISR_ETHER,
 #ifdef RSS
 	.nh_policy = NETISR_POLICY_CPU,
 	.nh_dispatch = NETISR_DISPATCH_DIRECT,
 	.nh_m2cpuid = rss_m2cpuid,
 #else
 	.nh_policy = NETISR_POLICY_SOURCE,
 	.nh_dispatch = NETISR_DISPATCH_DIRECT,
 #endif
 };
 
 /*
  * Boot-time initialisation: register the Ethernet netisr handler.
  */
 static void
 ether_init(__unused void *arg)
 {
 
 	netisr_register(&ether_nh);
 }
 SYSINIT(ether, SI_SUB_INIT_IF, SI_ORDER_ANY, ether_init, NULL);
 
 /*
  * Per-vnet initialisation: register the link-layer pfil head (both
  * directions, Ethernet type) and, with VIMAGE, register the netisr
  * handler for this vnet.
  */
 static void
 vnet_ether_init(__unused void *arg)
 {
 	struct pfil_head_args args;
 
 	args.pa_version = PFIL_VERSION;
 	args.pa_flags = PFIL_IN | PFIL_OUT;
 	args.pa_type = PFIL_TYPE_ETHERNET;
 	args.pa_headname = PFIL_ETHER_NAME;
 	V_link_pfil_head = pfil_head_register(&args);
 
 #ifdef VIMAGE
 	netisr_register_vnet(&ether_nh);
 #endif
 }
 VNET_SYSINIT(vnet_ether_init, SI_SUB_PROTO_IF, SI_ORDER_ANY,
     vnet_ether_init, NULL);
 
 #ifdef VIMAGE
 /*
  * Per-vnet teardown, part 1: unregister the link-layer pfil head that
  * vnet_ether_init() registered.
  */
 static void
 vnet_ether_pfil_destroy(__unused void *arg)
 {
 
 	pfil_head_unregister(V_link_pfil_head);
 }
 VNET_SYSUNINIT(vnet_ether_pfil_uninit, SI_SUB_PROTO_PFIL, SI_ORDER_ANY,
     vnet_ether_pfil_destroy, NULL);
 
 /*
  * Per-vnet teardown, part 2: unregister the netisr handler for this
  * vnet.
  */
 static void
 vnet_ether_destroy(__unused void *arg)
 {
 
 	netisr_unregister_vnet(&ether_nh);
 }
 VNET_SYSUNINIT(vnet_ether_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY,
     vnet_ether_destroy, NULL);
 #endif
 
 /*
  * Driver-facing input routine; ether_ifattach() installs it as if_input.
  *
  * m may be a chain of packets linked through m_nextpkt.  Each packet is
  * unlinked and dispatched to the NETISR_ETHER handler on its own.  The
  * network epoch is entered here around the dispatch loop only for
  * interfaces flagged IFF_NEEDSEPOCH (and, under INVARIANTS, for callers
  * found to be outside the epoch, which is reported once).
  */
 static void
 ether_input(struct ifnet *ifp, struct mbuf *m)
 {
 	struct epoch_tracker et;
 	struct mbuf *mn;
 	bool needs_epoch;
 
-	needs_epoch = !(ifp->if_flags & IFF_KNOWSEPOCH);
+	needs_epoch = (ifp->if_flags & IFF_NEEDSEPOCH);
+#ifdef INVARIANTS
+	/*
+	 * This temporary code is here to prevent epoch unaware and unmarked
+	 * drivers to panic the system.  Once all drivers are taken care of,
+	 * the whole INVARIANTS block should go away.
+	 */
+	if (!needs_epoch && !in_epoch(net_epoch_preempt)) {
+		static bool printedonce;
+
+		needs_epoch = true;
+		if (!printedonce) {
+			printedonce = true;
+			if_printf(ifp, "called %s w/o net epoch! "
+			    "PLEASE file a bug report.\n", __func__);
+#ifdef KDB
+			kdb_backtrace();
+#endif
+		}
+	}
+#endif
 
 	/*
 	 * The drivers are allowed to pass in a chain of packets linked with
 	 * m_nextpkt. We split them up into separate packets here and pass
 	 * them up. This allows the drivers to amortize the receive lock.
 	 */
 	CURVNET_SET_QUIET(ifp->if_vnet);
 	if (__predict_false(needs_epoch))
 		NET_EPOCH_ENTER(et);
 	while (m) {
 		mn = m->m_nextpkt;
 		m->m_nextpkt = NULL;
 
 		/*
 		 * We will rely on rcvif being set properly in the deferred
 		 * context, so assert it is correct here.
 		 */
 		MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
 		KASSERT(m->m_pkthdr.rcvif == ifp, ("%s: ifnet mismatch m %p "
 		    "rcvif %p ifp %p", __func__, m, m->m_pkthdr.rcvif, ifp));
 		netisr_dispatch(NETISR_ETHER, m);
 		m = mn;
 	}
 	if (__predict_false(needs_epoch))
 		NET_EPOCH_EXIT(et);
 	CURVNET_RESTORE();
 }
 
 /*
  * Upper layer processing for a received Ethernet packet.
  *
  * Must be called inside the network epoch.  Runs the inbound link-layer
  * pfil hooks (not for M_PROMISC frames), hands frames carrying a
  * non-zero VLAN id to the vlan input hook, drops promiscuously received
  * frames unless IFF_PPROMISC is set, and finally strips the Ethernet
  * header and dispatches IPv4, ARP and IPv6 to their netisr protocols.
  * Frames of any other ethertype are offered to ng_ether(4) if it is
  * attached and freed otherwise.
  */
 void
 ether_demux(struct ifnet *ifp, struct mbuf *m)
 {
 	struct ether_header *eh;
 	int i, isr;
 	u_short ether_type;
 
 	NET_EPOCH_ASSERT();
 	KASSERT(ifp != NULL, ("%s: NULL interface pointer", __func__));
 
 	/* Do not grab PROMISC frames in case we are re-entered. */
 	if (PFIL_HOOKED_IN(V_link_pfil_head) && !(m->m_flags & M_PROMISC)) {
 		i = pfil_mbuf_in(V_link_pfil_head, &m, ifp, NULL);
 		/* Not passed on by the hooks: nothing more to do here. */
 		if (i != 0 || m == NULL)
 			return;
 	}
 
 	eh = mtod(m, struct ether_header *);
 	ether_type = ntohs(eh->ether_type);
 
 	/*
 	 * If this frame has a VLAN tag other than 0, call vlan_input()
 	 * if its module is loaded. Otherwise, drop.
 	 */
 	if ((m->m_flags & M_VLANTAG) &&
 	    EVL_VLANOFTAG(m->m_pkthdr.ether_vtag) != 0) {
 		if (ifp->if_vlantrunk == NULL) {
 			if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1);
 			m_freem(m);
 			return;
 		}
 		KASSERT(vlan_input_p != NULL,("%s: VLAN not loaded!",
 		    __func__));
 		/* Clear before possibly re-entering ether_input(). */
 		m->m_flags &= ~M_PROMISC;
 		(*vlan_input_p)(ifp, m);
 		return;
 	}
 
 	/*
 	 * Pass promiscuously received frames to the upper layer if the user
 	 * requested this by setting IFF_PPROMISC. Otherwise, drop them.
 	 */
 	if ((ifp->if_flags & IFF_PPROMISC) == 0 && (m->m_flags & M_PROMISC)) {
 		m_freem(m);
 		return;
 	}
 
 	/*
 	 * Reset layer specific mbuf flags to avoid confusing upper layers.
 	 */
 	m->m_flags &= ~M_VLANTAG;
 	m_clrprotoflags(m);
 
 	/*
 	 * Dispatch frame to upper layer.
 	 */
 	switch (ether_type) {
 #ifdef INET
 	case ETHERTYPE_IP:
 		isr = NETISR_IP;
 		break;
 
 	case ETHERTYPE_ARP:
 		if (ifp->if_flags & IFF_NOARP) {
 			/* Discard packet if ARP is disabled on interface */
 			m_freem(m);
 			return;
 		}
 		isr = NETISR_ARP;
 		break;
 #endif
 #ifdef INET6
 	case ETHERTYPE_IPV6:
 		isr = NETISR_IPV6;
 		break;
 #endif
 	default:
 		goto discard;
 	}
 
 	/* Strip off Ethernet header. */
 	m_adj(m, ETHER_HDR_LEN);
 
 	netisr_dispatch(isr, m);
 	return;
 
 discard:
 	/*
 	 * Packet is to be discarded.  If netgraph is present,
 	 * hand the packet to it for last chance processing;
 	 * otherwise dispose of it.
 	 */
 	if (ifp->if_l2com != NULL) {
 		KASSERT(ng_ether_input_orphan_p != NULL,
 		    ("ng_ether_input_orphan_p is NULL"));
 		(*ng_ether_input_orphan_p)(ifp, m);
 		return;
 	}
 	m_freem(m);
 }
 
 /*
  * Convert Ethernet address to printable (loggable) representation.
  * This routine is for compatibility; it's better to just use
  *
  *	printf("%6D", <pointer to address>, ":");
  *
  * since there's no static buffer involved.
  */
 char *
 ether_sprintf(const u_char *ap)
 {
 	/*
 	 * One buffer shared by all callers: the returned string is
 	 * overwritten by the next call.
 	 */
 	static char printable[18];
 
 	snprintf(printable, sizeof(printable), "%6D", ap, ":");
 	return (printable);
 }
 
 /*
  * Perform common duties while attaching to interface list
  *
  * Sets the Ethernet defaults on ifp (address and header length, MTU,
  * output/input/resolvemulti/requestencap methods, broadcast address and
  * a default baudrate if none was set), attaches the interface, records
  * lla as its link-level address, attaches BPF and ng_ether(4) if present,
  * and announces the new interface.
  *
  * lla must point to ETHER_ADDR_LEN bytes.
  */
 void
 ether_ifattach(struct ifnet *ifp, const u_int8_t *lla)
 {
 	int i;
 	struct ifaddr *ifa;
 	struct sockaddr_dl *sdl;
 
 	ifp->if_addrlen = ETHER_ADDR_LEN;
 	ifp->if_hdrlen = ETHER_HDR_LEN;
 	ifp->if_mtu = ETHERMTU;
 	if_attach(ifp);
 	ifp->if_output = ether_output;
 	ifp->if_input = ether_input;
 	ifp->if_resolvemulti = ether_resolvemulti;
 	ifp->if_requestencap = ether_requestencap;
 #ifdef VIMAGE
 	ifp->if_reassign = ether_reassign;
 #endif
 	if (ifp->if_baudrate == 0)
 		ifp->if_baudrate = IF_Mbps(10);		/* just a default */
 	ifp->if_broadcastaddr = etherbroadcastaddr;
 
 	/* Store lla in the link-level sockaddr attached to the interface. */
 	ifa = ifp->if_addr;
 	KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__));
 	sdl = (struct sockaddr_dl *)ifa->ifa_addr;
 	sdl->sdl_type = IFT_ETHER;
 	sdl->sdl_alen = ifp->if_addrlen;
 	bcopy(lla, LLADDR(sdl), ifp->if_addrlen);
 
 	/* Keep a second copy of the address if the driver asked for one. */
 	if (ifp->if_hw_addr != NULL)
 		bcopy(lla, ifp->if_hw_addr, ifp->if_addrlen);
 
 	bpfattach(ifp, DLT_EN10MB, ETHER_HDR_LEN);
 	if (ng_ether_attach_p != NULL)
 		(*ng_ether_attach_p)(ifp);
 
 	/* Announce Ethernet MAC address if non-zero. */
 	for (i = 0; i < ifp->if_addrlen; i++)
 		if (lla[i] != 0)
 			break; 
 	if (i != ifp->if_addrlen)
 		if_printf(ifp, "Ethernet address: %6D\n", lla, ":");
 
 	uuid_ether_add(LLADDR(sdl));
 
 	/* All necessary bits are set up; announce it now. */
 	EVENTHANDLER_INVOKE(ether_ifattach_event, ifp);
 	if (IS_DEFAULT_VNET(curvnet))
 		devctl_notify("ETHERNET", ifp->if_xname, "IFATTACH", NULL);
 }
 
 /*
  * Common teardown for an Ethernet interface: hand the link-level address
  * to uuid_ether_del(), let netgraph detach when it is attached, then
  * detach BPF and the interface itself.
  */
 void
 ether_ifdetach(struct ifnet *ifp)
 {
 	struct sockaddr_dl *lladdr_sdl;
 
 	lladdr_sdl = (struct sockaddr_dl *)(ifp->if_addr->ifa_addr);
 	uuid_ether_del(LLADDR(lladdr_sdl));
 
 	/* A non-NULL if_l2com requires the netgraph detach hook. */
 	if (ifp->if_l2com != NULL) {
 		KASSERT(ng_ether_detach_p != NULL,
 		    ("ng_ether_detach_p is NULL"));
 		ng_ether_detach_p(ifp);
 	}
 
 	bpfdetach(ifp);
 	if_detach(ifp);
 }
 
 #ifdef VIMAGE
 void
 ether_reassign(struct ifnet *ifp, struct vnet *new_vnet, char *unused __unused)
 {
 
 	/* Detach netgraph first, in the vnet we are currently running in. */
 	if (ifp->if_l2com != NULL) {
 		KASSERT(ng_ether_detach_p != NULL,
 		    ("ng_ether_detach_p is NULL"));
 		ng_ether_detach_p(ifp);
 	}
 
 	/* Without an attach hook there is nothing to re-attach. */
 	if (ng_ether_attach_p == NULL)
 		return;
 
 	/* Re-attach with new_vnet as the current vnet. */
 	CURVNET_SET_QUIET(new_vnet);
 	ng_ether_attach_p(ifp);
 	CURVNET_RESTORE();
 }
 #endif
 
 /* Declare the net.link sysctl parent and create its "ether" child node. */
 SYSCTL_DECL(_net_link);
 SYSCTL_NODE(_net_link, IFT_ETHER, ether, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Ethernet");
 
 #if 0
 /*
  * This is for reference.  We have a table-driven version
  * of the little-endian crc32 generator, which is faster
  * than the double-loop.
  */
 uint32_t
 ether_crc32_le(const uint8_t *buf, size_t len)
 {
 	size_t i;
 	uint32_t crc, carry;
 	int bit;
 	uint8_t data;
 
 	crc = 0xffffffff;	/* initial value */
 
 	for (i = 0; i < len; i++) {
 		for (data = *buf++, bit = 0; bit < 8; bit++, data >>= 1) {
 			carry = (crc ^ data) & 1;
 			crc >>= 1;
 			if (carry)
 				crc = (crc ^ ETHER_CRC_POLY_LE);
 		}
 	}
 
 	return (crc);
 }
 #else
 /*
  * Little-endian (bit-reflected) CRC-32 over `len' bytes at `buf'.
  *
  * The register starts at 0xffffffff and is returned as is, i.e. without
  * a final inversion.  An empty buffer therefore yields 0xffffffff.
  * Each input byte is folded in with two 4-bit table lookups.
  */
 uint32_t
 ether_crc32_le(const uint8_t *buf, size_t len)
 {
 	/* Remainders for every 4-bit value, low nibble first. */
 	static const uint32_t crctab[] = {
 		0x00000000, 0x1db71064, 0x3b6e20c8, 0x26d930ac,
 		0x76dc4190, 0x6b6b51f4, 0x4db26158, 0x5005713c,
 		0xedb88320, 0xf00f9344, 0xd6d6a3e8, 0xcb61b38c,
 		0x9b64c2b0, 0x86d3d2d4, 0xa00ae278, 0xbdbdf21c
 	};
 	size_t i;
 	uint32_t crc;
 
 	crc = 0xffffffff;	/* initial value */
 
 	for (i = 0; i < len; i++) {
 		crc ^= buf[i];
 		/* One lookup per nibble of the byte just mixed in. */
 		crc = (crc >> 4) ^ crctab[crc & 0xf];
 		crc = (crc >> 4) ^ crctab[crc & 0xf];
 	}
 
 	return (crc);
 }
 #endif
 
 /*
  * Big-endian CRC-32 over `len' bytes at `buf', computed one bit at a
  * time.  Bits of each byte are consumed least-significant first and the
  * register, seeded with 0xffffffff, is returned without a final inversion.
  */
 uint32_t
 ether_crc32_be(const uint8_t *buf, size_t len)
 {
 	uint32_t reg, msb, feedback;
 	size_t pos;
 	uint8_t octet;
 	int shift;
 
 	reg = 0xffffffff;	/* initial value */
 
 	for (pos = 0; pos < len; pos++) {
 		octet = buf[pos];
 		for (shift = 0; shift < 8; shift++) {
 			/* Feedback is the register's top bit XOR the data bit. */
 			msb = (reg >> 31) & 1;
 			feedback = msb ^ ((octet >> shift) & 0x01);
 			reg <<= 1;
 			if (feedback != 0)
 				reg = (reg ^ ETHER_CRC_POLY_BE) | feedback;
 		}
 	}
 
 	return (reg);
 }
 
 /*
  * Generic Ethernet ioctl handler.
  *
  * Handles SIOCSIFADDR, SIOCGIFADDR, SIOCSIFMTU, SIOCSLANPCP and
  * SIOCGLANPCP.  Returns 0 on success, EINVAL for an out-of-range MTU or
  * PCP value and for any other command, or the priv_check() error for
  * SIOCSLANPCP.
  */
 int
 ether_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
 {
 	struct ifreq *req;
 	struct ifaddr *addr;
 	int rv;
 
 	/* `data' is viewed as either structure, depending on the command. */
 	req = (struct ifreq *)data;
 	addr = (struct ifaddr *)data;
 
 	switch (command) {
 	case SIOCSIFADDR:
 		ifp->if_flags |= IFF_UP;
 
 		switch (addr->ifa_addr->sa_family) {
 #ifdef INET
 		case AF_INET:
 			/* The interface is initialized before ARP is set up. */
 			ifp->if_init(ifp->if_softc);
 			arp_ifinit(ifp, addr);
 			break;
 #endif
 		default:
 			ifp->if_init(ifp->if_softc);
 			break;
 		}
 		break;
 
 	case SIOCGIFADDR:
 		bcopy(IF_LLADDR(ifp), &req->ifr_addr.sa_data[0],
 		    ETHER_ADDR_LEN);
 		return (0);
 
 	case SIOCSIFMTU:
 		/* Only an upper bound is enforced on the new MTU. */
 		if (req->ifr_mtu > ETHERMTU)
 			return (EINVAL);
 		ifp->if_mtu = req->ifr_mtu;
 		return (0);
 
 	case SIOCSLANPCP:
 		rv = priv_check(curthread, PRIV_NET_SETLANPCP);
 		if (rv != 0)
 			return (rv);
 		/* Accept 0..7 or the IFNET_PCP_NONE value. */
 		if (req->ifr_lan_pcp > 7 &&
 		    req->ifr_lan_pcp != IFNET_PCP_NONE)
 			return (EINVAL);
 		ifp->if_pcp = req->ifr_lan_pcp;
 		/* Tell listeners that the PCP changed. */
 		EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_PCP);
 		return (0);
 
 	case SIOCGLANPCP:
 		req->ifr_lan_pcp = ifp->if_pcp;
 		return (0);
 
 	default:
 		return (EINVAL);
 	}
 	return (0);
 }
 
 /*
  * Translate the multicast address `sa' into a link-level address.
  *
  * For AF_LINK the address is only validated and *llsa is set to NULL.
  * For AF_INET/AF_INET6 the sockaddr_dl set up by link_init_sdl() is
  * filled with the mapped Ethernet address and returned through *llsa.
  * Returns 0, EADDRNOTAVAIL for a non-multicast address, or EAFNOSUPPORT
  * for any other family.
  */
 static int
 ether_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa,
 	struct sockaddr *sa)
 {
 	struct sockaddr_dl *dl;
 #ifdef INET
 	struct sockaddr_in *sa4;
 #endif
 #ifdef INET6
 	struct sockaddr_in6 *sa6;
 #endif
 	u_char *mac;
 
 	switch (sa->sa_family) {
 	case AF_LINK:
 		/* Already link-level: just make sure it is multicast. */
 		dl = (struct sockaddr_dl *)sa;
 		mac = LLADDR(dl);
 		if (!ETHER_IS_MULTICAST(mac))
 			return (EADDRNOTAVAIL);
 		*llsa = NULL;
 		return (0);
 
 #ifdef INET
 	case AF_INET:
 		sa4 = (struct sockaddr_in *)sa;
 		if (!IN_MULTICAST(ntohl(sa4->sin_addr.s_addr)))
 			return (EADDRNOTAVAIL);
 		dl = link_init_sdl(ifp, *llsa, IFT_ETHER);
 		dl->sdl_alen = ETHER_ADDR_LEN;
 		mac = LLADDR(dl);
 		ETHER_MAP_IP_MULTICAST(&sa4->sin_addr, mac);
 		*llsa = (struct sockaddr *)dl;
 		return (0);
 #endif
 #ifdef INET6
 	case AF_INET6:
 		sa6 = (struct sockaddr_in6 *)sa;
 		if (IN6_IS_ADDR_UNSPECIFIED(&sa6->sin6_addr)) {
 			/*
 			 * The unspecified address asks for every IPv6
 			 * Ethernet multicast address (multicast routers
 			 * use this), so switch the interface to
 			 * all-multicast instead of mapping one address.
 			 */
 			ifp->if_flags |= IFF_ALLMULTI;
 			*llsa = NULL;
 			return (0);
 		}
 		if (!IN6_IS_ADDR_MULTICAST(&sa6->sin6_addr))
 			return (EADDRNOTAVAIL);
 		dl = link_init_sdl(ifp, *llsa, IFT_ETHER);
 		dl->sdl_alen = ETHER_ADDR_LEN;
 		mac = LLADDR(dl);
 		ETHER_MAP_IPV6_MULTICAST(&sa6->sin6_addr, mac);
 		*llsa = (struct sockaddr *)dl;
 		return (0);
 #endif
 
 	default:
 		/* The errno text is not a perfect fit, but the name is. */
 		return (EAFNOSUPPORT);
 	}
 }
 
 /* Module data for "ether"; only the name is set, no event handler. */
 static moduledata_t ether_mod = {
 	.name = "ether",
 };
 
 /*
  * Tap a frame whose VLAN tag is carried out of band (M_VLANTAG) to BPF,
  * presenting it with an 802.1Q header rebuilt on the stack from the
  * Ethernet header and m_pkthdr.ether_vtag.  When `data' is non-NULL,
  * `dlen' bytes of it are placed in front of the frame.  The mbuf's m_data
  * and m_len are adjusted for the duration of the tap and restored before
  * returning.
  */
 void
 ether_vlan_mtap(struct bpf_if *bp, struct mbuf *m, void *data, u_int dlen)
 {
 	struct ether_vlan_header vlan;
 	struct mbuf mv, mb;
 
 	KASSERT((m->m_flags & M_VLANTAG) != 0,
 	    ("%s: vlan information not present", __func__));
 	KASSERT(m->m_len >= sizeof(struct ether_header),
 	    ("%s: mbuf not large enough for header", __func__));
 	/*
 	 * Copy the plain Ethernet header into the larger VLAN header, then
 	 * move the original ethertype behind the tag.
 	 */
 	bcopy(mtod(m, char *), &vlan, sizeof(struct ether_header));
 	vlan.evl_proto = vlan.evl_encap_proto;
 	vlan.evl_encap_proto = htons(ETHERTYPE_VLAN);
 	vlan.evl_tag = htons(m->m_pkthdr.ether_vtag);
 	/* Skip the original header in the mbuf; it is undone at the end. */
 	m->m_len -= sizeof(struct ether_header);
 	m->m_data += sizeof(struct ether_header);
 	/*
 	 * If a data link has been supplied by the caller, then we will need to
 	 * re-create a stack allocated mbuf chain with the following structure:
 	 *
 	 * (1) mbuf #1 will contain the supplied data link
 	 * (2) mbuf #2 will contain the vlan header
 	 * (3) mbuf #3 will contain the original mbuf's packet data
 	 *
 	 * Otherwise, submit the packet and vlan header via bpf_mtap2().
 	 */
 	if (data != NULL) {
 		/* Only m_next, m_data and m_len of the stack mbufs are set. */
 		mv.m_next = m;
 		mv.m_data = (caddr_t)&vlan;
 		mv.m_len = sizeof(vlan);
 		mb.m_next = &mv;
 		mb.m_data = data;
 		mb.m_len = dlen;
 		bpf_mtap(bp, &mb);
 	} else
 		bpf_mtap2(bp, &vlan, sizeof(vlan), m);
 	/* Restore the mbuf to cover the Ethernet header again. */
 	m->m_len += sizeof(struct ether_header);
 	m->m_data -= sizeof(struct ether_header);
 }
 
 /*
  * Insert an in-band VLAN header carrying `tag', with `proto' as the
  * encapsulation ethertype, into the Ethernet frame `m'.  Returns the
  * (possibly different) mbuf, or NULL when M_PREPEND or m_pullup() fails.
  */
 struct mbuf *
 ether_vlanencap_proto(struct mbuf *m, uint16_t tag, uint16_t proto)
 {
 	struct ether_vlan_header *hdr;
 	char *base;
 
 	/* M_PREPEND also updates m_len and m_pkthdr.len. */
 	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT);
 	if (m == NULL)
 		return (NULL);
 
 	/* The whole VLAN header must be contiguous in the first mbuf. */
 	if (m->m_len < sizeof(*hdr) &&
 	    (m = m_pullup(m, sizeof(*hdr))) == NULL)
 		return (NULL);
 
 	/*
 	 * Slide the two MAC addresses to the new front of the frame, then
 	 * write the encapsulation type and tag into the gap that opens up.
 	 */
 	hdr = mtod(m, struct ether_vlan_header *);
 	base = (char *)hdr;
 	bcopy(base + ETHER_VLAN_ENCAP_LEN, base,
 	    ETHER_HDR_LEN - ETHER_TYPE_LEN);
 	hdr->evl_tag = htons(tag);
 	hdr->evl_encap_proto = htons(proto);
 	return (m);
 }
 
 /*
  * Hand `m' to the BPF listeners on `ifp', if there are any.  Frames that
  * carry an out-of-band VLAN tag go through ether_vlan_mtap().
  */
 void
 ether_bpf_mtap_if(struct ifnet *ifp, struct mbuf *m)
 {
 	if (!bpf_peers_present(ifp->if_bpf))
 		return;
 
 	M_ASSERTVALID(m);
 	if ((m->m_flags & M_VLANTAG) == 0) {
 		bpf_mtap(ifp->if_bpf, m);
 		return;
 	}
 	ether_vlan_mtap(ifp->if_bpf, m, NULL, 0);
 }
 
 /* Sysctl nodes: "vlan" under net.link, and "link" under that. */
 static SYSCTL_NODE(_net_link, IFT_L2VLAN, vlan, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "IEEE 802.1Q VLAN");
 static SYSCTL_NODE(_net_link_vlan, PF_LINK, link,
     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "for consistency");
 
 /*
  * Per-vnet soft_pad knob: when non-zero, ether_8021q_frame() pads short
  * frames up to the minimum Ethernet size before tagging them.
  */
 VNET_DEFINE_STATIC(int, soft_pad);
 #define	V_soft_pad	VNET(soft_pad)
 SYSCTL_INT(_net_link_vlan, OID_AUTO, soft_pad, CTLFLAG_RW | CTLFLAG_VNET,
     &VNET_NAME(soft_pad), 0,
     "pad short frames before tagging");
 
 /*
  * For now, make preserving PCP via an mbuf tag optional, as it increases
  * per-packet memory allocations and frees.  In the future, it would be
  * preferable to reuse ether_vtag for this, or similar.
  */
 VNET_DEFINE(int, vlan_mtag_pcp) = 0;
 #define	V_vlan_mtag_pcp	VNET(vlan_mtag_pcp)
 SYSCTL_INT(_net_link_vlan, OID_AUTO, mtag_pcp, CTLFLAG_RW | CTLFLAG_VNET,
     &VNET_NAME(vlan_mtag_pcp), 0,
     "Retain VLAN PCP information as packets are passed up the stack");
 
 /*
  * Tag the frame in *mp with the VLAN described by `qtag' for transmission
  * on parent interface `p'.
  *
  * `ife' is only used as the interface named in error messages.  If the
  * mbuf already carries M_VLANTAG, the priority from its ether_vtag
  * overrides qtag->pcp.  The tag is either stored out of band (parent has
  * IFCAP_VLAN_HWTAGGING enabled and qtag->proto is ETHERTYPE_VLAN) or
  * inserted into the frame by ether_vlanencap_proto().
  *
  * Returns true on success.  On failure returns false with *mp set to
  * NULL; the mbuf has been disposed of.
  */
 bool
 ether_8021q_frame(struct mbuf **mp, struct ifnet *ife, struct ifnet *p,
     struct ether_8021q_tag *qtag)
 {
 	struct m_tag *mtag;
 	int n;
 	uint16_t tag;
 	static const char pad[8];	/* just zeros */
 
 	/*
 	 * Pad the frame to the minimum size allowed if told to.
 	 * This option is in accord with IEEE Std 802.1Q, 2003 Ed.,
 	 * paragraph C.4.4.3.b.  It can help to work around buggy
 	 * bridges that violate paragraph C.4.4.3.a from the same
 	 * document, i.e., fail to pad short frames after untagging.
 	 * E.g., a tagged frame 66 bytes long (incl. FCS) is OK, but
 	 * untagging it will produce a 62-byte frame, which is a runt
 	 * and requires padding.  There are VLAN-enabled network
 	 * devices that just discard such runts instead or mishandle
 	 * them somehow.
 	 */
 	if (V_soft_pad && p->if_type == IFT_ETHER) {
 		/* Append zeros in chunks of at most sizeof(pad) bytes. */
 		for (n = ETHERMIN + ETHER_HDR_LEN - (*mp)->m_pkthdr.len;
 		     n > 0; n -= sizeof(pad)) {
 			if (!m_append(*mp, min(n, sizeof(pad)), pad))
 				break;
 		}
 		/* n is still positive only if m_append() failed part way. */
 		if (n > 0) {
 			m_freem(*mp);
 			*mp = NULL;
 			if_printf(ife, "cannot pad short frame\n");
 			return (false);
 		}
 	}
 
 	/*
 	 * If PCP is set in mbuf, use it
 	 */
 	if ((*mp)->m_flags & M_VLANTAG) {
 		qtag->pcp = EVL_PRIOFTAG((*mp)->m_pkthdr.ether_vtag);
 	}
 
 	/*
 	 * If underlying interface can do VLAN tag insertion itself,
 	 * just pass the packet along. However, we need some way to
 	 * tell the interface where the packet came from so that it
 	 * knows how to find the VLAN tag to use, so we attach a
 	 * packet tag that holds it.
 	 */
 	/* A PCP_OUT mbuf tag, when honoured, overrides qtag->pcp. */
 	if (V_vlan_mtag_pcp && (mtag = m_tag_locate(*mp, MTAG_8021Q,
 	    MTAG_8021Q_PCP_OUT, NULL)) != NULL)
 		tag = EVL_MAKETAG(qtag->vid, *(uint8_t *)(mtag + 1), 0);
 	else
 		tag = EVL_MAKETAG(qtag->vid, qtag->pcp, 0);
 	if ((p->if_capenable & IFCAP_VLAN_HWTAGGING) &&
 	    (qtag->proto == ETHERTYPE_VLAN)) {
 		(*mp)->m_pkthdr.ether_vtag = tag;
 		(*mp)->m_flags |= M_VLANTAG;
 	} else {
 		/* No usable hardware tagging: build the header in-band. */
 		*mp = ether_vlanencap_proto(*mp, tag, qtag->proto);
 		if (*mp == NULL) {
 			if_printf(ife, "unable to prepend 802.1Q header\n");
 			return (false);
 		}
 	}
 	return (true);
 }
 
 /*
  * Allocate an address from the FreeBSD Foundation OUI.  This uses a
  * cryptographic hash function on the containing jail's name, UUID and the
  * interface name to attempt to provide a unique but stable address.
  * Pseudo-interfaces which require a MAC address should use this function to
  * allocate non-locally-administered addresses.
  */
 void
 ether_gen_addr(struct ifnet *ifp, struct ether_addr *hwaddr)
 {
 	SHA1_CTX ctx;
 	char *buf;
 	char uuid[HOSTUUIDLEN + 1];
 	uint64_t addr;
 	int i, sz;
 	char digest[SHA1_RESULTLEN];
 	char jailname[MAXHOSTNAMELEN];
 
 	getcredhostuuid(curthread->td_ucred, uuid, sizeof(uuid));
 	if (strncmp(uuid, DEFAULT_HOSTUUID, sizeof(uuid)) == 0) {
 		/* Fall back to a random mac address. */
 		goto rando;
 	}
 
 	/* If each (vnet) jail would also have a unique hostuuid this would not
 	 * be necessary. */
 	getjailname(curthread->td_ucred, jailname, sizeof(jailname));
 	sz = asprintf(&buf, M_TEMP, "%s-%s-%s", uuid, if_name(ifp),
 	    jailname);
 	if (sz < 0) {
 		/* Fall back to a random mac address. */
 		goto rando;
 	}
 
 	/* Hash "<uuid>-<ifname>-<jailname>" to get stable address bits. */
 	SHA1Init(&ctx);
 	SHA1Update(&ctx, buf, sz);
 	SHA1Final(digest, &ctx);
 	free(buf, M_TEMP);
 
 	/*
 	 * Take the first three digest bytes as unsigned octets.  digest[]
 	 * is plain char, so without the casts a byte >= 0x80 would be
 	 * sign-extended (and a negative value left-shifted), setting every
 	 * higher bit and discarding the bytes OR-ed in above it.
 	 */
 	addr = (((uint64_t)(uint8_t)digest[0] << 16) |
 	    ((uint64_t)(uint8_t)digest[1] << 8) |
 	    (uint64_t)(uint8_t)digest[2]) & OUI_FREEBSD_GENERATED_MASK;
 	addr = OUI_FREEBSD(addr);
 	/* Store the 48-bit value most significant octet first. */
 	for (i = 0; i < ETHER_ADDR_LEN; ++i) {
 		hwaddr->octet[i] = (addr >> ((ETHER_ADDR_LEN - i - 1) * 8)) &
 		    0xFF;
 	}
 
 	return;
 rando:
 	arc4rand(hwaddr, sizeof(*hwaddr), 0);
 	/* Unicast */
 	hwaddr->octet[0] &= 0xFE;
 	/* Locally administered. */
 	hwaddr->octet[0] |= 0x02;
 }
 
 /* Register the "ether" module described by ether_mod, at version 1. */
 DECLARE_MODULE(ether, ether_mod, SI_SUB_INIT_IF, SI_ORDER_ANY);
 MODULE_VERSION(ether, 1);
diff --git a/sys/net/if_infiniband.c b/sys/net/if_infiniband.c
index 30f014ee669d..a11b8a8f5c56 100644
--- a/sys/net/if_infiniband.c
+++ b/sys/net/if_infiniband.c
@@ -1,663 +1,687 @@
 /*-
  * Copyright (c) 2020 Mellanox Technologies. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
+#include "opt_kbd.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/devctl.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
+#ifdef KDB
+#include <sys/kdb.h>
+#endif
 
 #include <net/bpf.h>
 #include <net/ethernet.h>
 #include <net/infiniband.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_private.h>
 #include <net/if_dl.h>
 #include <net/if_media.h>
 #include <net/if_lagg.h>
 #include <net/if_llatbl.h>
 #include <net/if_types.h>
 #include <net/netisr.h>
 #include <net/route.h>
 #include <netinet/if_ether.h>
 #include <netinet/in.h>
 #include <netinet/ip6.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/nd6.h>
 
 #include <security/mac/mac_framework.h>
 
 /*
  * if_lagg(4) support: input hook called by infiniband_input() for
  * IFT_INFINIBANDLAG interfaces, which asserts that it is non-NULL first.
  */
 struct mbuf *(*lagg_input_infiniband_p)(struct ifnet *, struct mbuf *);
 
 #ifdef INET
 /*
  * Build the 20-byte Infiniband multicast link address for the IPv4
  * group `addr' (network byte order) into `buf'.  The scope nibble and
  * bytes 8-9 are taken from the interface broadcast address `broadcast'.
  */
 static inline void
 infiniband_ipv4_multicast_map(uint32_t addr,
     const uint8_t *broadcast, uint8_t *buf)
 {
 	const uint32_t host = ntohl(addr);
 	const uint8_t scope = broadcast[5] & 0xF;
 	int i;
 
 	buf[0] = 0;
 	for (i = 1; i <= 4; i++)
 		buf[i] = 0xff;
 	buf[5] = 0x10 | scope;
 	/* Fixed bytes for the IPv4 mapping (the IPv6 variant uses 0x60). */
 	buf[6] = 0x40;
 	buf[7] = 0x1b;
 	buf[8] = broadcast[8];
 	buf[9] = broadcast[9];
 	for (i = 10; i <= 15; i++)
 		buf[i] = 0;
 	/* The group address fills the last four bytes, high byte first. */
 	for (i = 0; i < 4; i++)
 		buf[16 + i] = (host >> (24 - 8 * i)) & 0xff;
 }
 #endif
 
 #ifdef INET6
 /*
  * Build the 20-byte Infiniband multicast link address for the IPv6
  * group `addr' into `buf'.  The scope nibble and bytes 8-9 come from the
  * interface broadcast address; the last 10 bytes of the IPv6 address are
  * copied into the tail.
  */
 static inline void
 infiniband_ipv6_multicast_map(const struct in6_addr *addr,
     const uint8_t *broadcast, uint8_t *buf)
 {
 	const uint8_t scope = broadcast[5] & 0xF;
 	int i;
 
 	buf[0] = 0;
 	for (i = 1; i <= 4; i++)
 		buf[i] = 0xff;
 	buf[5] = 0x10 | scope;
 	/* Fixed bytes for the IPv6 mapping (the IPv4 variant uses 0x40). */
 	buf[6] = 0x60;
 	buf[7] = 0x1b;
 	buf[8] = broadcast[8];
 	buf[9] = broadcast[9];
 	memcpy(&buf[10], &addr->s6_addr[6], 10);
 }
 #endif
 
 /*
  * BPF tap for mbufs that start with an infiniband_header.  Listeners are
  * shown a synthetic Ethernet header in its place: zero source address,
  * destination taken from the hardware address at offset 4, and the
  * Infiniband protocol field as the ethertype.  Mbufs whose first segment
  * is shorter than the Infiniband header are not tapped.
  */
 void
 infiniband_bpf_mtap(struct ifnet *ifp, struct mbuf *mb)
 {
 	const size_t hdrlen = sizeof(struct infiniband_header);
 	struct infiniband_header *ibh;
 	struct ether_header eh;
 
 	if (!bpf_peers_present(ifp->if_bpf))
 		return;
 
 	M_ASSERTVALID(mb);
 	if (mb->m_len < hdrlen)
 		return;
 
 	ibh = mtod(mb, struct infiniband_header *);
 	memset(eh.ether_shost, 0, ETHER_ADDR_LEN);
 	memcpy(eh.ether_dhost, ibh->ib_hwaddr + 4, ETHER_ADDR_LEN);
 	eh.ether_type = ibh->ib_protocol;
 
 	/* Hide the Infiniband header while BPF looks at the packet. */
 	mb->m_pkthdr.len -= hdrlen;
 	mb->m_len -= hdrlen;
 	mb->m_data += hdrlen;
 	bpf_mtap2(ifp->if_bpf, &eh, sizeof(eh), mb);
 	/* Put the mbuf back exactly as it was. */
 	mb->m_data -= hdrlen;
 	mb->m_len += hdrlen;
 	mb->m_pkthdr.len += hdrlen;
 }
 
 /*
  * Turn the checksum requests present on `src' into the matching
  * "already verified" flags on `dst'.  The two may be the same mbuf.
  */
 static void
 update_mbuf_csumflags(struct mbuf *src, struct mbuf *dst)
 {
 	int verified;
 
 	verified = 0;
 	if ((src->m_pkthdr.csum_flags & CSUM_IP) != 0)
 		verified |= CSUM_IP_CHECKED | CSUM_IP_VALID;
 	if ((src->m_pkthdr.csum_flags & CSUM_DELAY_DATA) != 0)
 		verified |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
 	if ((src->m_pkthdr.csum_flags & CSUM_SCTP) != 0)
 		verified |= CSUM_SCTP_VALID;
 
 	dst->m_pkthdr.csum_flags |= verified;
 	/* A valid data checksum is reported together with csum_data 0xffff. */
 	if ((verified & CSUM_DATA_VALID) != 0)
 		dst->m_pkthdr.csum_data = 0xffff;
 }
 
 /*
  * Handle link-layer encapsulation requests.
  *
  * Only IFENCAP_LL requests are supported (EOPNOTSUPP otherwise) and the
  * caller's buffer must hold INFINIBAND_HDR_LEN bytes (ENOMEM otherwise).
  * On success the Infiniband header is written to req->buf and
  * req->bufsize is set to its size.  Families other than AF_INET,
  * AF_INET6 and AF_ARP yield EAFNOSUPPORT.
  */
 static int
 infiniband_requestencap(struct ifnet *ifp, struct if_encap_req *req)
 {
 	struct infiniband_header *hdr;
 	struct arphdr *arp;
 	const uint8_t *dstaddr;
 	uint16_t proto, op;
 
 	if (req->rtype != IFENCAP_LL)
 		return (EOPNOTSUPP);
 
 	if (req->bufsize < INFINIBAND_HDR_LEN)
 		return (ENOMEM);
 
 	hdr = (struct infiniband_header *)req->buf;
 	dstaddr = req->lladdr;
 	req->lladdr_off = 0;
 
 	switch (req->family) {
 	case AF_INET:
 		proto = htons(ETHERTYPE_IP);
 		break;
 	case AF_INET6:
 		proto = htons(ETHERTYPE_IPV6);
 		break;
 	case AF_ARP:
 		/* The ARP payload's hardware type is set for Infiniband. */
 		arp = (struct arphdr *)req->hdata;
 		arp->ar_hrd = htons(ARPHRD_INFINIBAND);
 
 		/* Reverse-ARP operations get their own ethertype. */
 		op = ntohs(arp->ar_op);
 		if (op == ARPOP_REVREQUEST || op == ARPOP_REVREPLY)
 			proto = htons(ETHERTYPE_REVARP);
 		else
 			proto = htons(ETHERTYPE_ARP);
 
 		if (req->flags & IFENCAP_FLAG_BROADCAST)
 			dstaddr = ifp->if_broadcastaddr;
 		break;
 	default:
 		return (EAFNOSUPPORT);
 	}
 
 	hdr->ib_protocol = proto;
 	hdr->ib_reserved = 0;
 	memcpy(hdr->ib_hwaddr, dstaddr, INFINIBAND_ADDR_LEN);
 	req->bufsize = sizeof(struct infiniband_header);
 
 	return (0);
 }
 
 /*
  * Resolve the link-layer header for `dst' into `phdr'.
  *
  * Unicast destinations are handed to arpresolve() (IPv4) or nd6_resolve()
  * (IPv6); broadcast and multicast destinations are mapped locally and the
  * Infiniband header in `phdr' is filled in directly.  On success *pflags
  * is set to RT_MAY_LOOP, plus RT_L2_ME when the resolver reported
  * LLE_IFADDR.  For an unsupported address family the mbuf is freed and
  * EAFNOSUPPORT is returned.  `plle' may be NULL; when it is not, *plle is
  * cleared before resolution starts.
  */
 static int
 infiniband_resolve_addr(struct ifnet *ifp, struct mbuf *m,
     const struct sockaddr *dst, struct route *ro, uint8_t *phdr,
     uint32_t *pflags, struct llentry **plle)
 {
 #if defined(INET) || defined(INET6)
 	/* Only the locally-mapped broadcast/multicast paths write via ih. */
 	struct infiniband_header *ih = (struct infiniband_header *)phdr;
 #endif
 	uint32_t lleflags = 0;
 	int error = 0;
 
 	if (plle)
 		*plle = NULL;
 
 	switch (dst->sa_family) {
 #ifdef INET
 	case AF_INET:
 		if ((m->m_flags & (M_BCAST | M_MCAST)) == 0) {
 			error = arpresolve(ifp, 0, m, dst, phdr, &lleflags, plle);
 		} else {
 			if (m->m_flags & M_BCAST) {
 				memcpy(ih->ib_hwaddr, ifp->if_broadcastaddr,
 				    INFINIBAND_ADDR_LEN);
 			} else {
 				infiniband_ipv4_multicast_map(
 				    ((const struct sockaddr_in *)dst)->sin_addr.s_addr,
 				    ifp->if_broadcastaddr, ih->ib_hwaddr);
 			}
 			ih->ib_protocol = htons(ETHERTYPE_IP);
 			ih->ib_reserved = 0;
 		}
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		if ((m->m_flags & M_MCAST) == 0) {
 			int af = RO_GET_FAMILY(ro, dst);
 			error = nd6_resolve(ifp, LLE_SF(af, 0), m, dst, phdr,
 			    &lleflags, plle);
 		} else {
 			infiniband_ipv6_multicast_map(
 			    &((const struct sockaddr_in6 *)dst)->sin6_addr,
 			    ifp->if_broadcastaddr, ih->ib_hwaddr);
 			ih->ib_protocol = htons(ETHERTYPE_IPV6);
 			ih->ib_reserved = 0;
 		}
 		break;
 #endif
 	default:
 		if_printf(ifp, "can't handle af%d\n", dst->sa_family);
 		if (m != NULL)
 			m_freem(m);
 		return (EAFNOSUPPORT);
 	}
 
 	/* A down host behind a gateway is reported as unreachable instead. */
 	if (error == EHOSTDOWN) {
 		if (ro != NULL && (ro->ro_flags & RT_HAS_GW) != 0)
 			error = EHOSTUNREACH;
 	}
 
 	if (error != 0)
 		return (error);
 
 	*pflags = RT_MAY_LOOP;
 	if (lleflags & LLE_IFADDR)
 		*pflags |= RT_L2_ME;
 
 	return (0);
 }
 
 /*
  * Infiniband output routine.
  *
  * Obtains a link-layer header for `m' from, in order of preference, the
  * route's prepend data, a valid cached llentry on the route, or
  * infiniband_resolve_addr(); prepends it and passes the packet to the
  * driver's if_transmit.  Packets whose resolved flags include RT_L2_ME
  * are looped back through if_simloop() instead.  Must be called inside
  * the network epoch.  Returns 0 or an errno value; on the `bad' path the
  * mbuf is freed here.
  */
 static int
 infiniband_output(struct ifnet *ifp, struct mbuf *m,
     const struct sockaddr *dst, struct route *ro)
 {
 	uint8_t linkhdr[INFINIBAND_HDR_LEN];
 	uint8_t *phdr;
 	struct llentry *lle = NULL;
 	struct infiniband_header *ih;
 	int error = 0;
 	int hlen;	/* link layer header length */
 	uint32_t pflags;
 	bool addref;
 
 	NET_EPOCH_ASSERT();
 
 	addref = false;
 	phdr = NULL;
 	pflags = 0;
 	if (ro != NULL) {
 		/* XXX BPF uses ro_prepend */
 		if (ro->ro_prepend != NULL) {
 			phdr = ro->ro_prepend;
 			hlen = ro->ro_plen;
 		} else if (!(m->m_flags & (M_BCAST | M_MCAST))) {
 			if ((ro->ro_flags & RT_LLE_CACHE) != 0) {
 				lle = ro->ro_lle;
 				/* Drop a cached entry that is no longer valid. */
 				if (lle != NULL &&
 				    (lle->la_flags & LLE_VALID) == 0) {
 					LLE_FREE(lle);
 					lle = NULL;	/* redundant */
 					ro->ro_lle = NULL;
 				}
 				if (lle == NULL) {
 					/* if we lookup, keep cache */
 					addref = 1;
 				} else
 					/*
 					 * Notify LLE code that
 					 * the entry was used
 					 * by datapath.
 					 */
 					llentry_provide_feedback(lle);
 			}
 			if (lle != NULL) {
 				phdr = lle->r_linkdata;
 				hlen = lle->r_hdrlen;
 				pflags = lle->r_flags;
 			}
 		}
 	}
 
 #ifdef MAC
 	error = mac_ifnet_check_transmit(ifp, m);
 	if (error)
 		goto bad;
 #endif
 
 	M_PROFILE(m);
 	/* Nothing is transmitted on a monitor-mode interface. */
 	if (ifp->if_flags & IFF_MONITOR) {
 		error = ENETDOWN;
 		goto bad;
 	}
 	/* The interface must be both up and running. */
 	if (!((ifp->if_flags & IFF_UP) &&
 	    (ifp->if_drv_flags & IFF_DRV_RUNNING))) {
 		error = ENETDOWN;
 		goto bad;
 	}
 
 	if (phdr == NULL) {
 		/* No prepend data supplied. Try to calculate ourselves. */
 		phdr = linkhdr;
 		hlen = INFINIBAND_HDR_LEN;
 		error = infiniband_resolve_addr(ifp, m, dst, ro, phdr, &pflags,
 		    addref ? &lle : NULL);
 		/* Remember the freshly looked-up entry on the route. */
 		if (addref && lle != NULL)
 			ro->ro_lle = lle;
 		/*
 		 * The mbuf is not freed on this path; EWOULDBLOCK is
 		 * reported to the caller as success.
 		 */
 		if (error != 0)
 			return (error == EWOULDBLOCK ? 0 : error);
 	}
 
 	/* Destination is this interface's own address: loop it back. */
 	if ((pflags & RT_L2_ME) != 0) {
 		update_mbuf_csumflags(m, m);
 		return (if_simloop(ifp, m, RO_GET_FAMILY(ro, dst), 0));
 	}
 
 	/*
 	 * Add local infiniband header. If no space in first mbuf,
 	 * allocate another.
 	 */
 	M_PREPEND(m, INFINIBAND_HDR_LEN, M_NOWAIT);
 	if (m == NULL) {
 		error = ENOBUFS;
 		goto bad;
 	}
 	/* Skip the copy when the flags say a header is already present. */
 	if ((pflags & RT_HAS_HEADER) == 0) {
 		ih = mtod(m, struct infiniband_header *);
 		memcpy(ih, phdr, hlen);
 	}
 
 	/*
 	 * Queue message on interface, update output statistics if
 	 * successful, and start output if interface not yet active.
 	 */
 	return (ifp->if_transmit(ifp, m));
 bad:
 	if (m != NULL)
 		m_freem(m);
 	return (error);
 }
 
 /*
  * Process a received Infiniband packet.
  *
  * Enters the network epoch when the driver needs it, marks broadcast and
  * multicast frames, taps BPF, hands lagg-port traffic to if_lagg, and
  * dispatches IPv4, ARP and IPv6 payloads to netisr after stripping the
  * Infiniband header.  Frames with any other protocol are counted as
  * input errors and freed.
  *
  * NOTE(review): the KDB backtrace below depends on the KDB option being
  * visible in this file, which includes opt_kbd.h; confirm that is the
  * header that defines KDB (opt_kdb.h looks more likely).
  * NOTE(review): ibh is taken from `m' before the lagg hook may return a
  * different mbuf, and no length check precedes mtod(); confirm callers
  * guarantee a contiguous header that stays valid.
  */
 static void
 infiniband_input(struct ifnet *ifp, struct mbuf *m)
 {
 	struct infiniband_header *ibh;
 	struct epoch_tracker et;
 	int isr;
 	bool needs_epoch;
 
-	needs_epoch = (ifp->if_flags & IFF_KNOWSEPOCH) == 0;
+	needs_epoch = (ifp->if_flags & IFF_NEEDSEPOCH);
+#ifdef INVARIANTS
+	/*
+	 * This temporary code is here to prevent epoch unaware and unmarked
+	 * drivers to panic the system.  Once all drivers are taken care of,
+	 * the whole INVARIANTS block should go away.
+	 */
+	if (!needs_epoch && !in_epoch(net_epoch_preempt)) {
+		static bool printedonce;
+
+		needs_epoch = true;
+		if (!printedonce) {
+			printedonce = true;
+			if_printf(ifp, "called %s w/o net epoch! "
+			    "PLEASE file a bug report.", __func__);
+#ifdef KDB
+			kdb_backtrace();
+#endif
+		}
+	}
+#endif
 
 	CURVNET_SET_QUIET(ifp->if_vnet);
 	if (__predict_false(needs_epoch))
 		NET_EPOCH_ENTER(et);
 
 	/* Traffic on a down interface is counted as an error and dropped. */
 	if ((ifp->if_flags & IFF_UP) == 0) {
 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 		m_freem(m);
 		goto done;
 	}
 
 	ibh = mtod(m, struct infiniband_header *);
 
 	/*
 	 * Reset layer specific mbuf flags to avoid confusing upper
 	 * layers:
 	 */
 	m->m_flags &= ~M_VLANTAG;
 	m_clrprotoflags(m);
 
 	/* Broadcast is the multicast address equal to if_broadcastaddr. */
 	if (INFINIBAND_IS_MULTICAST(ibh->ib_hwaddr)) {
 		if (memcmp(ibh->ib_hwaddr, ifp->if_broadcastaddr,
 		    ifp->if_addrlen) == 0)
 			m->m_flags |= M_BCAST;
 		else
 			m->m_flags |= M_MCAST;
 		if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1);
 	}
 
 	/* Let BPF have it before we strip the header. */
 	infiniband_bpf_mtap(ifp, m);
 
 	/* Allow monitor mode to claim this frame, after stats are updated. */
 	if (ifp->if_flags & IFF_MONITOR) {
 		m_freem(m);
 		goto done;
 	}
 
 	/* Direct packet to correct FIB based on interface config. */
 	M_SETFIB(m, ifp->if_fib);
 
 	/* Handle input from a lagg<N> port */
 	if (ifp->if_type == IFT_INFINIBANDLAG) {
 		KASSERT(lagg_input_infiniband_p != NULL,
 		    ("%s: if_lagg not loaded!", __func__));
 		m = (*lagg_input_infiniband_p)(ifp, m);
 		/* A NULL return means there is nothing left to process. */
 		if (__predict_false(m == NULL))
 			goto done;
 		/* Continue on behalf of the interface recorded in the mbuf. */
 		ifp = m->m_pkthdr.rcvif;
 	}
 
 	/*
 	 * Dispatch frame to upper layer.
 	 */
 	switch (ibh->ib_protocol) {
 #ifdef INET
 	case htons(ETHERTYPE_IP):
 		isr = NETISR_IP;
 		break;
 
 	case htons(ETHERTYPE_ARP):
 		if (ifp->if_flags & IFF_NOARP) {
 			/* Discard packet if ARP is disabled on interface */
 			m_freem(m);
 			goto done;
 		}
 		isr = NETISR_ARP;
 		break;
 #endif
 #ifdef INET6
 	case htons(ETHERTYPE_IPV6):
 		isr = NETISR_IPV6;
 		break;
 #endif
 	default:
 		/* Unsupported protocol: count an input error and drop. */
 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 		m_freem(m);
 		goto done;
 	}
 
 	/* Strip off the Infiniband header. */
 	m_adj(m, INFINIBAND_HDR_LEN);
 
 #ifdef MAC
 	/*
 	 * Tag the mbuf with an appropriate MAC label before any other
 	 * consumers can get to it.
 	 */
 	mac_ifnet_create_mbuf(ifp, m);
 #endif
 	/* Hand the payload to the protocol's netisr handler. */
 	netisr_dispatch(isr, m);
 done:
 	/* Leave the epoch only if it was entered above. */
 	if (__predict_false(needs_epoch))
 		NET_EPOCH_EXIT(et);
 	CURVNET_RESTORE();
 }
 
 /*
  * Translate the multicast address `sa' into an Infiniband link-level
  * address.
  *
  * For AF_LINK the address is only validated and *llsa is set to NULL.
  * For AF_INET/AF_INET6 the sockaddr_dl set up by link_init_sdl() is
  * filled with the mapped address and returned through *llsa.  Returns 0,
  * EADDRNOTAVAIL for an address that cannot be mapped, or EAFNOSUPPORT
  * for any other family.
  */
 static int
 infiniband_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa,
     struct sockaddr *sa)
 {
 	struct sockaddr_dl *dl;
 #ifdef INET
 	struct sockaddr_in *sa4;
 #endif
 #ifdef INET6
 	struct sockaddr_in6 *sa6;
 #endif
 	uint8_t *hwaddr;
 
 	switch (sa->sa_family) {
 	case AF_LINK:
 		/* Already link-level: just make sure it is multicast. */
 		dl = (struct sockaddr_dl *)sa;
 		hwaddr = LLADDR(dl);
 		if (!INFINIBAND_IS_MULTICAST(hwaddr))
 			return (EADDRNOTAVAIL);
 		*llsa = NULL;
 		return (0);
 
 #ifdef INET
 	case AF_INET:
 		sa4 = (struct sockaddr_in *)sa;
 		if (!IN_MULTICAST(ntohl(sa4->sin_addr.s_addr)))
 			return (EADDRNOTAVAIL);
 		dl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND);
 		dl->sdl_alen = INFINIBAND_ADDR_LEN;
 		hwaddr = LLADDR(dl);
 		infiniband_ipv4_multicast_map(
 		    sa4->sin_addr.s_addr, ifp->if_broadcastaddr, hwaddr);
 		*llsa = (struct sockaddr *)dl;
 		return (0);
 #endif
 #ifdef INET6
 	case AF_INET6:
 		sa6 = (struct sockaddr_in6 *)sa;
 		/*
 		 * The unspecified address would mean "all IPv6 multicast",
 		 * which has no meaning on Infiniband, so it is rejected
 		 * together with every non-multicast address.
 		 */
 		if (IN6_IS_ADDR_UNSPECIFIED(&sa6->sin6_addr) ||
 		    !IN6_IS_ADDR_MULTICAST(&sa6->sin6_addr))
 			return (EADDRNOTAVAIL);
 		dl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND);
 		dl->sdl_alen = INFINIBAND_ADDR_LEN;
 		hwaddr = LLADDR(dl);
 		infiniband_ipv6_multicast_map(
 		    &sa6->sin6_addr, ifp->if_broadcastaddr, hwaddr);
 		*llsa = (struct sockaddr *)dl;
 		return (0);
 #endif
 	default:
 		return (EAFNOSUPPORT);
 	}
 }
 
 /*
  * Perform common duties while attaching an Infiniband interface.
  *
  * Sets the Infiniband address/header lengths and MTU, calls if_attach()
  * and installs the Infiniband output, input, resolvemulti and
  * requestencap handlers.  `lla' is the link-level address; when it is
  * NULL the address already stored in the interface's sockaddr_dl is used
  * for the announcement.  `llb' is the broadcast address; when it is NULL,
  * if_broadcastaddr is left untouched.
  */
 void
 infiniband_ifattach(struct ifnet *ifp, const uint8_t *lla, const uint8_t *llb)
 {
 	struct sockaddr_dl *sdl;
 	struct ifaddr *ifa;
 	int i;
 
 	ifp->if_addrlen = INFINIBAND_ADDR_LEN;
 	ifp->if_hdrlen = INFINIBAND_HDR_LEN;
 	ifp->if_mtu = INFINIBAND_MTU;
 	if_attach(ifp);
 	ifp->if_output = infiniband_output;
 	ifp->if_input = infiniband_input;
 	ifp->if_resolvemulti = infiniband_resolvemulti;
 	ifp->if_requestencap = infiniband_requestencap;
 
 	if (ifp->if_baudrate == 0)
 		ifp->if_baudrate = IF_Gbps(10); /* default value */
 	if (llb != NULL)
 		ifp->if_broadcastaddr = llb;
 
 	/* The link-level ifaddr must exist by now; fill in its sockaddr_dl. */
 	ifa = ifp->if_addr;
 	KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__));
 	sdl = (struct sockaddr_dl *)ifa->ifa_addr;
 	sdl->sdl_type = IFT_INFINIBAND;
 	sdl->sdl_alen = ifp->if_addrlen;
 
 	if (lla != NULL) {
 		memcpy(LLADDR(sdl), lla, ifp->if_addrlen);
 
 		/* Keep a second copy when if_hw_addr is allocated. */
 		if (ifp->if_hw_addr != NULL)
 			memcpy(ifp->if_hw_addr, lla, ifp->if_addrlen);
 	} else {
 		/* No address supplied: announce whatever is stored already. */
 		lla = LLADDR(sdl);
 	}
 
 	/* Attach ethernet compatible network device */
 	bpfattach(ifp, DLT_EN10MB, ETHER_HDR_LEN);
 
 	/* Announce Infiniband MAC address if non-zero. */
 	for (i = 0; i < ifp->if_addrlen; i++)
 		if (lla[i] != 0)
 			break;
 	if (i != ifp->if_addrlen)
 		if_printf(ifp, "Infiniband address: %20D\n", lla, ":");
 
 	/* All necessary bits are set up; announce it now. */
 	EVENTHANDLER_INVOKE(infiniband_ifattach_event, ifp);
 
 	/* The devctl notification is only sent from the default vnet. */
 	if (IS_DEFAULT_VNET(curvnet))
 		devctl_notify("INFINIBAND", ifp->if_xname, "IFATTACH", NULL);
 }
 
 /*
  * Perform common duties while detaching an Infiniband interface:
  * detach BPF first, then the interface itself.
  */
 void
 infiniband_ifdetach(struct ifnet *ifp)
 {
 	bpfdetach(ifp);
 	if_detach(ifp);
 }
 
 /*
  * Module event handler: load and unload succeed without doing anything,
  * every other event is rejected with EOPNOTSUPP.
  */
 static int
 infiniband_modevent(module_t mod, int type, void *data)
 {
 	if (type == MOD_LOAD || type == MOD_UNLOAD)
 		return (0);
 	return (EOPNOTSUPP);
 }
 
 /* Module data: name plus the event handler defined above. */
 static moduledata_t infiniband_mod = {
 	.name = "if_infiniband",
 	.evhand = &infiniband_modevent,
 };
 
 /* Register the "if_infiniband" module at version 1. */
 DECLARE_MODULE(if_infiniband, infiniband_mod, SI_SUB_INIT_IF, SI_ORDER_ANY);
 MODULE_VERSION(if_infiniband, 1);
diff --git a/sys/net/iflib.c b/sys/net/iflib.c
index aa16e5d5492b..d056570d9a99 100644
--- a/sys/net/iflib.c
+++ b/sys/net/iflib.c
@@ -1,7352 +1,7352 @@
 /*-
  * Copyright (c) 2014-2018, Matthew Macy <mmacy@mattmacy.io>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
  *  1. Redistributions of source code must retain the above copyright notice,
  *     this list of conditions and the following disclaimer.
  *
  *  2. Neither the name of Matthew Macy nor the names of its
  *     contributors may be used to endorse or promote products derived from
  *     this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_acpi.h"
 #include "opt_sched.h"
 
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/bus.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/module.h>
 #include <sys/kobj.h>
 #include <sys/rman.h>
 #include <sys/sbuf.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/taskqueue.h>
 #include <sys/limits.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_private.h>
 #include <net/if_types.h>
 #include <net/if_media.h>
 #include <net/bpf.h>
 #include <net/ethernet.h>
 #include <net/mp_ring.h>
 #include <net/debugnet.h>
 #include <net/pfil.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/tcp_lro.h>
 #include <netinet/in_systm.h>
 #include <netinet/if_ether.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #include <netinet/tcp.h>
 #include <netinet/ip_var.h>
 #include <netinet6/ip6_var.h>
 
 #include <machine/bus.h>
 #include <machine/in_cksum.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 #include <dev/led/led.h>
 #include <dev/pci/pcireg.h>
 #include <dev/pci/pcivar.h>
 #include <dev/pci/pci_private.h>
 
 #include <net/iflib.h>
 #include <net/iflib_private.h>
 
 #include "ifdi_if.h"
 
 #ifdef PCI_IOV
 #include <dev/pci/pci_iov.h>
 #endif
 
 #include <sys/bitstring.h>
 /*
  * enable accounting of every mbuf as it comes in to and goes out of
  * iflib's software descriptor references
  */
 #define MEMORY_LOGGING 0
 /*
  * Enable mbuf vectors for compressing long mbuf chains
  */
 
 /*
  * NB:
  * - Prefetching in tx cleaning should perhaps be a tunable. The distance ahead
  *   we prefetch needs to be determined by the time spent in m_free vis a vis
  *   the cost of a prefetch. This will of course vary based on the workload:
  *      - NFLX's m_free path is dominated by vm-based M_EXT manipulation which
  *        is quite expensive, thus suggesting very little prefetch.
  *      - small packet forwarding which is just returning a single mbuf to
  *        UMA will typically be very fast vis a vis the cost of a memory
  *        access.
  */
 
 /*
  * File organization:
  *  - private structures
  *  - iflib private utility functions
  *  - ifnet functions
  *  - vlan registry and other exported functions
  *  - iflib public core functions
  *
  *
  */
 MALLOC_DEFINE(M_IFLIB, "iflib", "ifnet library");
 
 #define	IFLIB_RXEOF_MORE (1U << 0)
 #define	IFLIB_RXEOF_EMPTY (2U << 0)
 
 struct iflib_txq;
 typedef struct iflib_txq *iflib_txq_t;
 struct iflib_rxq;
 typedef struct iflib_rxq *iflib_rxq_t;
 struct iflib_fl;
 typedef struct iflib_fl *iflib_fl_t;
 
 struct iflib_ctx;
 
 static void iru_init(if_rxd_update_t iru, iflib_rxq_t rxq, uint8_t flid);
 static void iflib_timer(void *arg);
 static void iflib_tqg_detach(if_ctx_t ctx);
 
 /*
  * Groups a driver filter routine with its argument, a group task and an
  * opaque context pointer.
  * NOTE(review): the code consuming these fields is not in this part of the
  * file -- confirm the exact roles against the interrupt filter paths.
  */
 typedef struct iflib_filter_info {
 	driver_filter_t *ifi_filter;
 	void *ifi_filter_arg;
 	struct grouptask *ifi_task;
 	void *ifi_ctx;
 } *iflib_filter_info_t;
 
 /*
  * Per-device iflib state.  Several members are handed out to drivers by
  * the iflib_get_*() accessors defined after this structure.
  */
 struct iflib_ctx {
 	KOBJ_FIELDS;
 	/*
 	 * Pointer to hardware driver's softc
 	 */
 	void *ifc_softc;
 	device_t ifc_dev;
 	if_t ifc_ifp;
 
 	cpuset_t ifc_cpus;
 	if_shared_ctx_t ifc_sctx;
 	struct if_softc_ctx ifc_softc_ctx;
 
 	/* Acquired via CTX_LOCK() / released via CTX_UNLOCK(). */
 	struct sx ifc_ctx_sx;
 	/* Acquired via STATE_LOCK() / released via STATE_UNLOCK(). */
 	struct mtx ifc_state_mtx;
 
 	iflib_txq_t ifc_txqs;
 	iflib_rxq_t ifc_rxqs;
 	uint32_t ifc_if_flags;
 	uint32_t ifc_flags;
 	uint32_t ifc_max_fl_buf_size;
 	uint32_t ifc_rx_mbuf_sz;
 
 	/* Compared against LINK_STATE_UP by LINK_ACTIVE(). */
 	int ifc_link_state;
 	int ifc_watchdog_events;
 	struct cdev *ifc_led_dev;
 	struct resource *ifc_msix_mem;
 
 	struct if_irq ifc_legacy_irq;
 	struct grouptask ifc_admin_task;
 	struct grouptask ifc_vflr_task;
 	struct iflib_filter_info ifc_filter_info;
 	struct ifmedia	ifc_media;
 	/* Returned by iflib_get_media(). */
 	struct ifmedia	*ifc_mediap;
 
 	struct sysctl_oid *ifc_sysctl_node;
 	uint16_t ifc_sysctl_ntxqs;
 	uint16_t ifc_sysctl_nrxqs;
 	uint16_t ifc_sysctl_qs_eq_override;
 	uint16_t ifc_sysctl_rx_budget;
 	uint16_t ifc_sysctl_tx_abdicate;
 	uint16_t ifc_sysctl_core_offset;
 #define	CORE_OFFSET_UNSPECIFIED	0xffff
 	uint8_t  ifc_sysctl_separate_txrx;
 	uint8_t  ifc_sysctl_use_logical_cores;
 	bool	 ifc_cpus_are_physical_cores;
 
 	qidx_t ifc_sysctl_ntxds[8];
 	qidx_t ifc_sysctl_nrxds[8];
 	struct if_txrx ifc_txrx;
 /* Shorthands so that ctx->isc_xxx reaches the matching ifc_txrx member. */
 #define isc_txd_encap  ifc_txrx.ift_txd_encap
 #define isc_txd_flush  ifc_txrx.ift_txd_flush
 #define isc_txd_credits_update  ifc_txrx.ift_txd_credits_update
 #define isc_rxd_available ifc_txrx.ift_rxd_available
 #define isc_rxd_pkt_get ifc_txrx.ift_rxd_pkt_get
 #define isc_rxd_refill ifc_txrx.ift_rxd_refill
 #define isc_rxd_flush ifc_txrx.ift_rxd_flush
 #define isc_legacy_intr ifc_txrx.ift_legacy_intr
 #define isc_txq_select ifc_txrx.ift_txq_select
 #define isc_txq_select_v2 ifc_txrx.ift_txq_select_v2
 	eventhandler_tag ifc_vlan_attach_event;
 	eventhandler_tag ifc_vlan_detach_event;
 	/* Written by iflib_set_mac(). */
 	struct ether_addr ifc_mac;
 };
 
 /*
  * Return the hardware driver's softc pointer recorded in the context.
  */
 void *
 iflib_get_softc(if_ctx_t ctx)
 {
 	void *softc = ctx->ifc_softc;
 
 	return (softc);
 }
 
 /*
  * Return the device_t recorded in the context.
  */
 device_t
 iflib_get_dev(if_ctx_t ctx)
 {
 	device_t dev = ctx->ifc_dev;
 
 	return (dev);
 }
 
 /*
  * Return the network interface handle recorded in the context.
  */
 if_t
 iflib_get_ifp(if_ctx_t ctx)
 {
 	if_t ifp = ctx->ifc_ifp;
 
 	return (ifp);
 }
 
 /*
  * Return the context's ifmedia pointer (ifc_mediap).
  */
 struct ifmedia *
 iflib_get_media(if_ctx_t ctx)
 {
 	struct ifmedia *media = ctx->ifc_mediap;
 
 	return (media);
 }
 
 /*
  * Return the current value of the context's ifc_flags word.
  */
 uint32_t
 iflib_get_flags(if_ctx_t ctx)
 {
 	uint32_t flags = ctx->ifc_flags;
 
 	return (flags);
 }
 
 /*
  * Copy ETHER_ADDR_LEN bytes from mac into the context's ifc_mac field.
  */
 void
 iflib_set_mac(if_ctx_t ctx, uint8_t mac[ETHER_ADDR_LEN])
 {
 	void *dst = ctx->ifc_mac.octet;
 
 	bcopy(mac, dst, ETHER_ADDR_LEN);
 }
 
 /*
  * Return a pointer to the if_softc_ctx embedded in the context.
  */
 if_softc_ctx_t
 iflib_get_softc_ctx(if_ctx_t ctx)
 {
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 
 	return (scctx);
 }
 
 /*
  * Return the shared context pointer recorded in the context.
  */
 if_shared_ctx_t
 iflib_get_sctx(if_ctx_t ctx)
 {
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 
 	return (sctx);
 }
 
 #define IP_ALIGNED(m) ((((uintptr_t)(m)->m_data) & 0x3) == 0x2)
 #define CACHE_PTR_INCREMENT (CACHE_LINE_SIZE/sizeof(void*))
 /*
  * Round ptr up to the next CACHE_LINE_SIZE boundary; a pointer that is
  * already aligned is returned unchanged.  The mask must clear the low bits:
  * ANDing with (CACHE_LINE_SIZE-1) would keep only the offset within the
  * line rather than the aligned address.  Assumes CACHE_LINE_SIZE is a power
  * of two.
  */
 #define CACHE_PTR_NEXT(ptr) ((void *)(((uintptr_t)(ptr)+CACHE_LINE_SIZE-1) & ~((uintptr_t)CACHE_LINE_SIZE-1)))
 
 #define LINK_ACTIVE(ctx) ((ctx)->ifc_link_state == LINK_STATE_UP)
 #define CTX_IS_VF(ctx) ((ctx)->ifc_sctx->isc_flags & IFLIB_IS_VF)
 
 /*
  * Software state for an rx free list, kept as separate arrays.
  * netmap_fl_refill() indexes ifsd_map by NIC ring slot; presumably the
  * other arrays are indexed the same way -- TODO confirm.
  */
 typedef struct iflib_sw_rx_desc_array {
 	bus_dmamap_t	*ifsd_map;         /* bus_dma maps for packet */
 	struct mbuf	**ifsd_m;           /* pkthdr mbufs */
 	caddr_t		*ifsd_cl;          /* direct cluster pointer for rx */
 	bus_addr_t	*ifsd_ba;          /* bus addr of cluster for rx */
 } iflib_rxsd_array_t;
 
 /*
  * Software state for a tx ring, kept as separate arrays.
  * iflib_netmap_txsync() indexes ifsd_m and ifsd_map by NIC ring slot.
  */
 typedef struct iflib_sw_tx_desc_array {
 	bus_dmamap_t    *ifsd_map;         /* bus_dma maps for packet */
 	bus_dmamap_t	*ifsd_tso_map;     /* bus_dma maps for TSO packet */
 	struct mbuf    **ifsd_m;           /* pkthdr mbufs */
 } if_txsd_vec_t;
 
 /* magic number that should be high enough for any hardware */
 #define IFLIB_MAX_TX_SEGS		128
 #define IFLIB_RX_COPY_THRESH		128
 #define IFLIB_MAX_RX_REFRESH		32
 /* The minimum descriptors per second before we start coalescing */
 #define IFLIB_MIN_DESC_SEC		16384
 #define IFLIB_DEFAULT_TX_UPDATE_FREQ	16
 #define IFLIB_QUEUE_IDLE		0
 #define IFLIB_QUEUE_HUNG		1
 #define IFLIB_QUEUE_WORKING		2
 /* maximum number of txqs that can share an rx interrupt */
 #define IFLIB_MAX_TX_SHARED_INTR	4
 
 /* this should really scale with ring size - this is a fairly arbitrary value */
 #define TX_BATCH_SIZE			32
 
 #define IFLIB_RESTART_BUDGET		8
 
 #define CSUM_OFFLOAD		(CSUM_IP_TSO|CSUM_IP6_TSO|CSUM_IP| \
 				 CSUM_IP_UDP|CSUM_IP_TCP|CSUM_IP_SCTP| \
 				 CSUM_IP6_UDP|CSUM_IP6_TCP|CSUM_IP6_SCTP)
 
 /*
  * Per-transmit-queue state.  TXQ_AVAIL() derives the number of free
  * descriptors from ift_size, ift_cidx, ift_pidx and ift_gen.
  */
 struct iflib_txq {
 	qidx_t		ift_in_use;
 	qidx_t		ift_cidx;
 	qidx_t		ift_cidx_processed;
 	qidx_t		ift_pidx;
 	uint8_t		ift_gen;
 	uint8_t		ift_br_offset;
 	uint16_t	ift_npending;
 	uint16_t	ift_db_pending;
 	uint16_t	ift_rs_pending;
 	/* implicit pad */
 	uint8_t		ift_txd_size[8];
 	uint64_t	ift_processed;
 	uint64_t	ift_cleaned;
 	uint64_t	ift_cleaned_prev;
 #if MEMORY_LOGGING
 	uint64_t	ift_enqueued;
 	uint64_t	ift_dequeued;
 #endif
 	uint64_t	ift_no_tx_dma_setup;
 	uint64_t	ift_no_desc_avail;
 	uint64_t	ift_mbuf_defrag_failed;
 	uint64_t	ift_mbuf_defrag;
 	uint64_t	ift_map_failed;
 	uint64_t	ift_txd_encap_efbig;
 	uint64_t	ift_pullups;
 	uint64_t	ift_last_timer_tick;
 
 	/* ift_mtx is the mutex taken by CALLOUT_LOCK()/CALLOUT_UNLOCK(). */
 	struct mtx	ift_mtx;
 	struct mtx	ift_db_mtx;
 
 	/* constant values */
 	if_ctx_t	ift_ctx;
 	struct ifmp_ring        *ift_br;
 	struct grouptask	ift_task;
 	qidx_t		ift_size;
 	uint16_t	ift_id;
 	struct callout	ift_timer;
 #ifdef DEV_NETMAP
 	struct callout	ift_netmap_timer;
 #endif /* DEV_NETMAP */
 
 	if_txsd_vec_t	ift_sds;
 	uint8_t		ift_qstatus;
 	uint8_t		ift_closed;
 	uint8_t		ift_update_freq;
 	struct iflib_filter_info ift_filter_info;
 	bus_dma_tag_t	ift_buf_tag;
 	bus_dma_tag_t	ift_tso_buf_tag;
 	iflib_dma_info_t	ift_ifdi;
 #define	MTX_NAME_LEN	32
 	char                    ift_mtx_name[MTX_NAME_LEN];
 	/* Segment scratch array; the netmap tx path points pi.ipi_segs at it. */
 	bus_dma_segment_t	ift_segs[IFLIB_MAX_TX_SEGS]  __aligned(CACHE_LINE_SIZE);
 #ifdef IFLIB_DIAGNOSTICS
 	uint64_t ift_cpu_exec_count[256];
 #endif
 } __aligned(CACHE_LINE_SIZE);
 
 /*
  * Per-free-list receive state.  A receive queue refers to its free lists
  * through iflib_rxq.ifr_fl, and each free list points back via ifl_rxq.
  */
 struct iflib_fl {
 	qidx_t		ifl_cidx;
 	qidx_t		ifl_pidx;
 	qidx_t		ifl_credits;
 	uint8_t		ifl_gen;
 	uint8_t		ifl_rxd_size;
 #if MEMORY_LOGGING
 	uint64_t	ifl_m_enqueued;
 	uint64_t	ifl_m_dequeued;
 	uint64_t	ifl_cl_enqueued;
 	uint64_t	ifl_cl_dequeued;
 #endif
 	/* implicit pad */
 	bitstr_t 	*ifl_rx_bitmap;
 	qidx_t		ifl_fragidx;
 	/* constant */
 	qidx_t		ifl_size;
 	uint16_t	ifl_buf_size;
 	uint16_t	ifl_cltype;
 	uma_zone_t	ifl_zone;
 	iflib_rxsd_array_t	ifl_sds;
 	iflib_rxq_t	ifl_rxq;
 	uint8_t		ifl_id;
 	bus_dma_tag_t	ifl_buf_tag;
 	iflib_dma_info_t	ifl_ifdi;
 	/*
 	 * Per-batch staging arrays (at most IFLIB_MAX_RX_REFRESH entries);
 	 * netmap_fl_refill() fills them before calling isc_rxd_refill().
 	 */
 	uint64_t	ifl_bus_addrs[IFLIB_MAX_RX_REFRESH] __aligned(CACHE_LINE_SIZE);
 	qidx_t		ifl_rxd_idxs[IFLIB_MAX_RX_REFRESH];
 }  __aligned(CACHE_LINE_SIZE);
 
 /*
  * Number of ring entries in use for a ring of `size` entries with consumer
  * index cidx and producer index pidx.  When the indices are equal, gen
  * disambiguates: 0 means empty, 1 means full; any other value panics.
  */
 static inline qidx_t
 get_inuse(int size, qidx_t cidx, qidx_t pidx, uint8_t gen)
 {
 	qidx_t used;
 
 	if (pidx != cidx) {
 		/* Producer ahead: plain distance; behind: distance across the wrap. */
 		used = (pidx > cidx) ? pidx - cidx : size - cidx + pidx;
 	} else {
 		switch (gen) {
 		case 0:
 			used = 0;
 			break;
 		case 1:
 			used = size;
 			break;
 		default:
 			panic("bad state");
 		}
 	}
 
 	return (used);
 }
 
 /*
  * Free descriptors in a tx queue: ring size minus the in-use count from
  * get_inuse().  The argument is parenthesized so that expressions such as
  * TXQ_AVAIL(base + i) expand correctly; note it is evaluated several times.
  */
 #define TXQ_AVAIL(txq) ((txq)->ift_size - get_inuse((txq)->ift_size, (txq)->ift_cidx, (txq)->ift_pidx, (txq)->ift_gen))
 
 #define IDXDIFF(head, tail, wrap) \
 	((head) >= (tail) ? (head) - (tail) : (wrap) - (tail) + (head))
 
 /*
  * Per-receive-queue state.  ifr_fl points at the queue's free lists
  * (netmap_fl_refill() uses ifr_fl[0]) and ifr_ctx points back to the
  * owning iflib context.
  */
 struct iflib_rxq {
 	if_ctx_t	ifr_ctx;
 	iflib_fl_t	ifr_fl;
 	uint64_t	ifr_rx_irq;
 	struct pfil_head	*pfil;
 	/*
 	 * If there is a separate completion queue (IFLIB_HAS_RXCQ), this is
 	 * the completion queue consumer index.  Otherwise it's unused.
 	 */
 	qidx_t		ifr_cq_cidx;
 	uint16_t	ifr_id;
 	uint8_t		ifr_nfl;
 	uint8_t		ifr_ntxqirq;
 	uint8_t		ifr_txqid[IFLIB_MAX_TX_SHARED_INTR];
 	uint8_t		ifr_fl_offset;
 	struct lro_ctrl			ifr_lc;
 	struct grouptask        ifr_task;
 	struct callout		ifr_watchdog;
 	struct iflib_filter_info ifr_filter_info;
 	iflib_dma_info_t		ifr_ifdi;
 
 	/* dynamically allocate if any drivers need a value substantially larger than this */
 	struct if_rxd_frag	ifr_frags[IFLIB_MAX_RX_SEGS] __aligned(CACHE_LINE_SIZE);
 #ifdef IFLIB_DIAGNOSTICS
 	uint64_t ifr_cpu_exec_count[256];
 #endif
 }  __aligned(CACHE_LINE_SIZE);
 
 /*
  * Pairs a pointer to a cluster-pointer slot with a free list.
  * NOTE(review): presumably ifsd_cl points into ifsd_fl's ifl_sds.ifsd_cl
  * array -- confirm against the rx path, which is not visible here.
  */
 typedef struct if_rxsd {
 	caddr_t *ifsd_cl;
 	iflib_fl_t ifsd_fl;
 } *if_rxsd_t;
 
 /* multiple of word size */
 #ifdef __LP64__
 #define PKT_INFO_SIZE	6
 #define RXD_INFO_SIZE	5
 #define PKT_TYPE uint64_t
 #else
 #define PKT_INFO_SIZE	11
 #define RXD_INFO_SIZE	8
 #define PKT_TYPE uint32_t
 #endif
 #define PKT_LOOP_BOUND  ((PKT_INFO_SIZE/3)*3)
 #define RXD_LOOP_BOUND  ((RXD_INFO_SIZE/4)*4)
 
 /*
  * Word-array overlays used by pkt_info_zero() and rxd_info_zero() to clear
  * struct if_pkt_info / struct if_rxd_info one PKT_TYPE word at a time.
  */
 typedef struct if_pkt_info_pad {
 	PKT_TYPE pkt_val[PKT_INFO_SIZE];
 } *if_pkt_info_pad_t;
 typedef struct if_rxd_info_pad {
 	PKT_TYPE rxd_val[RXD_INFO_SIZE];
 } *if_rxd_info_pad_t;
 
 /* The overlays are only valid while they match the real structures' sizes. */
 CTASSERT(sizeof(struct if_pkt_info_pad) == sizeof(struct if_pkt_info));
 CTASSERT(sizeof(struct if_rxd_info_pad) == sizeof(struct if_rxd_info));
 
 /*
  * Clear an if_pkt_info by storing zero into each of the PKT_INFO_SIZE
  * PKT_TYPE words of its if_pkt_info_pad overlay.
  */
 static inline void
 pkt_info_zero(if_pkt_info_t pi)
 {
 	if_pkt_info_pad_t words = (if_pkt_info_pad_t)pi;
 	int idx;
 
 	for (idx = 0; idx < PKT_INFO_SIZE; idx++)
 		words->pkt_val[idx] = 0;
 }
 
 /* Device methods of the pseudo driver: attach and detach only. */
 static device_method_t iflib_pseudo_methods[] = {
 	DEVMETHOD(device_attach, noop_attach),
 	DEVMETHOD(device_detach, iflib_pseudo_detach),
 	DEVMETHOD_END
 };
 
 /* "iflib_pseudo" driver; its softc is sized as a struct iflib_ctx. */
 driver_t iflib_pseudodriver = {
 	"iflib_pseudo", iflib_pseudo_methods, sizeof(struct iflib_ctx),
 };
 
 /*
  * Clear an if_rxd_info by storing zero into each of the RXD_INFO_SIZE
  * PKT_TYPE words of its if_rxd_info_pad overlay.
  */
 static inline void
 rxd_info_zero(if_rxd_info_t ri)
 {
 	if_rxd_info_pad_t words = (if_rxd_info_pad_t)ri;
 	int idx;
 
 	for (idx = 0; idx < RXD_INFO_SIZE; idx++)
 		words->rxd_val[idx] = 0;
 }
 
 /*
  * Only allow a single packet to take up at most 1/nth of the tx ring
  */
 #define MAX_SINGLE_PACKET_FRACTION 12
 #define IF_BAD_DMA (bus_addr_t)-1
 
 #define CTX_ACTIVE(ctx) ((if_getdrvflags((ctx)->ifc_ifp) & IFF_DRV_RUNNING))
 
 /* Context lock: exclusive use of the sx lock ifc_ctx_sx. */
 #define CTX_LOCK_INIT(_sc)  sx_init(&(_sc)->ifc_ctx_sx, "iflib ctx lock")
 #define CTX_LOCK(ctx) sx_xlock(&(ctx)->ifc_ctx_sx)
 #define CTX_UNLOCK(ctx) sx_xunlock(&(ctx)->ifc_ctx_sx)
 #define CTX_LOCK_DESTROY(ctx) sx_destroy(&(ctx)->ifc_ctx_sx)
 
 /* State lock: the MTX_DEF mutex ifc_state_mtx (e.g. guards ifc_flags updates in iflib_set_detach()). */
 #define STATE_LOCK_INIT(_sc, _name)  mtx_init(&(_sc)->ifc_state_mtx, _name, "iflib state lock", MTX_DEF)
 #define STATE_LOCK(ctx) mtx_lock(&(ctx)->ifc_state_mtx)
 #define STATE_UNLOCK(ctx) mtx_unlock(&(ctx)->ifc_state_mtx)
 #define STATE_LOCK_DESTROY(ctx) mtx_destroy(&(ctx)->ifc_state_mtx)
 
 /*
  * Lock/unlock a tx queue's ift_mtx.  The argument is parenthesized, like
  * the CTX_ and STATE_ macros, so non-trivial expressions expand correctly.
  */
 #define CALLOUT_LOCK(txq)	mtx_lock(&(txq)->ift_mtx)
 #define CALLOUT_UNLOCK(txq) 	mtx_unlock(&(txq)->ift_mtx)
 
 /*
  * Mark the context as being detached by setting IFC_IN_DETACH in
  * ifc_flags while holding the state lock.
  */
 void
 iflib_set_detach(if_ctx_t ctx)
 {
 	STATE_LOCK(ctx);
 	ctx->ifc_flags = ctx->ifc_flags | IFC_IN_DETACH;
 	STATE_UNLOCK(ctx);
 }
 
 /* Our boot-time initialization hook */
 static int	iflib_module_event_handler(module_t, int, void *);
 
 /* Module descriptor: name, event handler, no extra argument. */
 static moduledata_t iflib_moduledata = {
 	"iflib",
 	iflib_module_event_handler,
 	NULL
 };
 
 DECLARE_MODULE(iflib, iflib_moduledata, SI_SUB_INIT_IF, SI_ORDER_ANY);
 MODULE_VERSION(iflib, 1);
 
 /* iflib depends on version 1 of the pci and ether modules. */
 MODULE_DEPEND(iflib, pci, 1, 1, 1);
 MODULE_DEPEND(iflib, ether, 1, 1, 1);
 
 /* Task queue groups: if_io_tqg is defined with mp_ncpus, if_config_tqg with 1. */
 TASKQGROUP_DEFINE(if_io_tqg, mp_ncpus, 1);
 TASKQGROUP_DEFINE(if_config_tqg, 1, 1);
 
 #ifndef IFLIB_DEBUG_COUNTERS
 #ifdef INVARIANTS
 #define IFLIB_DEBUG_COUNTERS 1
 #else
 #define IFLIB_DEBUG_COUNTERS 0
 #endif /* !INVARIANTS */
 #endif
 
 static SYSCTL_NODE(_net, OID_AUTO, iflib, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "iflib driver parameters");
 
 /*
  * XXX need to ensure that this can't accidentally cause the head to be moved backwards 
  */
 static int iflib_min_tx_latency = 0;
 SYSCTL_INT(_net_iflib, OID_AUTO, min_tx_latency, CTLFLAG_RW,
 		   &iflib_min_tx_latency, 0, "minimize transmit latency at the possible expense of throughput");
 /*
  * NOTE(review): the description string below is identical to the one used
  * for min_tx_latency, which looks like a copy-paste; the knob's consumer is
  * not visible here, so confirm its effect before rewording the string.
  */
 static int iflib_no_tx_batch = 0;
 SYSCTL_INT(_net_iflib, OID_AUTO, no_tx_batch, CTLFLAG_RW,
 		   &iflib_no_tx_batch, 0, "minimize transmit latency at the possible expense of throughput");
 static int iflib_timer_default = 1000;
 SYSCTL_INT(_net_iflib, OID_AUTO, timer_default, CTLFLAG_RW,
 		   &iflib_timer_default, 0, "number of ticks between iflib_timer calls");
 
 
 #if IFLIB_DEBUG_COUNTERS
 
 static int iflib_tx_seen;
 static int iflib_tx_sent;
 static int iflib_tx_encap;
 static int iflib_rx_allocs;
 static int iflib_fl_refills;
 static int iflib_fl_refills_large;
 static int iflib_tx_frees;
 
 SYSCTL_INT(_net_iflib, OID_AUTO, tx_seen, CTLFLAG_RD,
 		   &iflib_tx_seen, 0, "# TX mbufs seen");
 SYSCTL_INT(_net_iflib, OID_AUTO, tx_sent, CTLFLAG_RD,
 		   &iflib_tx_sent, 0, "# TX mbufs sent");
 SYSCTL_INT(_net_iflib, OID_AUTO, tx_encap, CTLFLAG_RD,
 		   &iflib_tx_encap, 0, "# TX mbufs encapped");
 SYSCTL_INT(_net_iflib, OID_AUTO, tx_frees, CTLFLAG_RD,
 		   &iflib_tx_frees, 0, "# TX frees");
 SYSCTL_INT(_net_iflib, OID_AUTO, rx_allocs, CTLFLAG_RD,
 		   &iflib_rx_allocs, 0, "# RX allocations");
 SYSCTL_INT(_net_iflib, OID_AUTO, fl_refills, CTLFLAG_RD,
 		   &iflib_fl_refills, 0, "# refills");
 SYSCTL_INT(_net_iflib, OID_AUTO, fl_refills_large, CTLFLAG_RD,
 		   &iflib_fl_refills_large, 0, "# large refills");
 
 static int iflib_txq_drain_flushing;
 static int iflib_txq_drain_oactive;
 static int iflib_txq_drain_notready;
 
 SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_flushing, CTLFLAG_RD,
 		   &iflib_txq_drain_flushing, 0, "# drain flushes");
 SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_oactive, CTLFLAG_RD,
 		   &iflib_txq_drain_oactive, 0, "# drain oactives");
 SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_notready, CTLFLAG_RD,
 		   &iflib_txq_drain_notready, 0, "# drain notready");
 
 static int iflib_encap_load_mbuf_fail;
 static int iflib_encap_pad_mbuf_fail;
 static int iflib_encap_txq_avail_fail;
 static int iflib_encap_txd_encap_fail;
 
 SYSCTL_INT(_net_iflib, OID_AUTO, encap_load_mbuf_fail, CTLFLAG_RD,
 		   &iflib_encap_load_mbuf_fail, 0, "# busdma load failures");
 SYSCTL_INT(_net_iflib, OID_AUTO, encap_pad_mbuf_fail, CTLFLAG_RD,
 		   &iflib_encap_pad_mbuf_fail, 0, "# runt frame pad failures");
 SYSCTL_INT(_net_iflib, OID_AUTO, encap_txq_avail_fail, CTLFLAG_RD,
 		   &iflib_encap_txq_avail_fail, 0, "# txq avail failures");
 SYSCTL_INT(_net_iflib, OID_AUTO, encap_txd_encap_fail, CTLFLAG_RD,
 		   &iflib_encap_txd_encap_fail, 0, "# driver encap failures");
 
 static int iflib_task_fn_rxs;
 static int iflib_rx_intr_enables;
 static int iflib_fast_intrs;
 static int iflib_rx_unavail;
 static int iflib_rx_ctx_inactive;
 static int iflib_rx_if_input;
 static int iflib_rxd_flush;
 
 static int iflib_verbose_debug;
 
 SYSCTL_INT(_net_iflib, OID_AUTO, task_fn_rx, CTLFLAG_RD,
 		   &iflib_task_fn_rxs, 0, "# task_fn_rx calls");
 SYSCTL_INT(_net_iflib, OID_AUTO, rx_intr_enables, CTLFLAG_RD,
 		   &iflib_rx_intr_enables, 0, "# RX intr enables");
 SYSCTL_INT(_net_iflib, OID_AUTO, fast_intrs, CTLFLAG_RD,
 		   &iflib_fast_intrs, 0, "# fast_intr calls");
 SYSCTL_INT(_net_iflib, OID_AUTO, rx_unavail, CTLFLAG_RD,
 		   &iflib_rx_unavail, 0, "# times rxeof called with no available data");
 SYSCTL_INT(_net_iflib, OID_AUTO, rx_ctx_inactive, CTLFLAG_RD,
 		   &iflib_rx_ctx_inactive, 0, "# times rxeof called with inactive context");
 SYSCTL_INT(_net_iflib, OID_AUTO, rx_if_input, CTLFLAG_RD,
 		   &iflib_rx_if_input, 0, "# times rxeof called if_input");
 SYSCTL_INT(_net_iflib, OID_AUTO, rxd_flush, CTLFLAG_RD,
 	         &iflib_rxd_flush, 0, "# times rxd_flush called");
 SYSCTL_INT(_net_iflib, OID_AUTO, verbose_debug, CTLFLAG_RW,
 		   &iflib_verbose_debug, 0, "enable verbose debugging");
 
 #define DBG_COUNTER_INC(name) atomic_add_int(&(iflib_ ## name), 1)
 /*
  * Zero every iflib debug counter.  The iflib_verbose_debug knob is a
  * setting rather than a counter and is left untouched.
  */
 static void
 iflib_debug_reset(void)
 {
 	/* tx / rx / free-list counters */
 	iflib_tx_seen = 0;
 	iflib_tx_sent = 0;
 	iflib_tx_encap = 0;
 	iflib_rx_allocs = 0;
 	iflib_fl_refills = 0;
 	iflib_fl_refills_large = 0;
 	iflib_tx_frees = 0;
 
 	/* txq drain counters */
 	iflib_txq_drain_flushing = 0;
 	iflib_txq_drain_oactive = 0;
 	iflib_txq_drain_notready = 0;
 
 	/* encap failure counters */
 	iflib_encap_load_mbuf_fail = 0;
 	iflib_encap_pad_mbuf_fail = 0;
 	iflib_encap_txq_avail_fail = 0;
 	iflib_encap_txd_encap_fail = 0;
 
 	/* task / interrupt / rx path counters */
 	iflib_task_fn_rxs = 0;
 	iflib_rx_intr_enables = 0;
 	iflib_fast_intrs = 0;
 	iflib_rx_unavail = 0;
 	iflib_rx_ctx_inactive = 0;
 	iflib_rx_if_input = 0;
 	iflib_rxd_flush = 0;
 }
 
 #else
 #define DBG_COUNTER_INC(name)
 static void iflib_debug_reset(void) {}
 #endif
 
 #define IFLIB_DEBUG 0
 
 static void iflib_tx_structures_free(if_ctx_t ctx);
 static void iflib_rx_structures_free(if_ctx_t ctx);
 static int iflib_queues_alloc(if_ctx_t ctx);
 static int iflib_tx_credits_update(if_ctx_t ctx, iflib_txq_t txq);
 static int iflib_rxd_avail(if_ctx_t ctx, iflib_rxq_t rxq, qidx_t cidx, qidx_t budget);
 static int iflib_qset_structures_setup(if_ctx_t ctx);
 static int iflib_msix_init(if_ctx_t ctx);
 static int iflib_legacy_setup(if_ctx_t ctx, driver_filter_t filter, void *filterarg, int *rid, const char *str);
 static void iflib_txq_check_drain(iflib_txq_t txq, int budget);
 static uint32_t iflib_txq_can_drain(struct ifmp_ring *);
 #ifdef ALTQ
 static void iflib_altq_if_start(if_t ifp);
 static int iflib_altq_if_transmit(if_t ifp, struct mbuf *m);
 #endif
 static int iflib_register(if_ctx_t);
 static void iflib_deregister(if_ctx_t);
 static void iflib_unregister_vlan_handlers(if_ctx_t ctx);
 static uint16_t iflib_get_mbuf_size_for(unsigned int size);
 static void iflib_init_locked(if_ctx_t ctx);
 static void iflib_add_device_sysctl_pre(if_ctx_t ctx);
 static void iflib_add_device_sysctl_post(if_ctx_t ctx);
 static void iflib_ifmp_purge(iflib_txq_t txq);
 static void _iflib_pre_assert(if_softc_ctx_t scctx);
 static void iflib_if_init_locked(if_ctx_t ctx);
 static void iflib_free_intr_mem(if_ctx_t ctx);
 #ifndef __NO_STRICT_ALIGNMENT
 static struct mbuf * iflib_fixup_rx(struct mbuf *m);
 #endif
 
 /*
  * Singly-linked list of cpu_offset records, each carrying a cpuset, a
  * reference count and a next-CPU id.
  * NOTE(review): presumably protected by cpu_offset_mtx below -- the users
  * of the list are not visible here, confirm.
  */
 static SLIST_HEAD(cpu_offset_list, cpu_offset) cpu_offsets =
     SLIST_HEAD_INITIALIZER(cpu_offsets);
 struct cpu_offset {
 	SLIST_ENTRY(cpu_offset) entries;
 	cpuset_t	set;
 	unsigned int	refcount;
 	uint16_t	next_cpuid;
 };
 static struct mtx cpu_offset_mtx;
 MTX_SYSINIT(iflib_cpu_offset, &cpu_offset_mtx, "iflib_cpu_offset lock",
     MTX_DEF);
 
 DEBUGNET_DEFINE(iflib);
 
 /*
  * Return the rx descriptor count from isc_nrxd[]: entry 1 when the shared
  * context has IFLIB_HAS_RXCQ set, entry 0 otherwise.
  */
 static int
 iflib_num_rx_descs(if_ctx_t ctx)
 {
 	uint16_t qidx = 0;
 
 	if ((ctx->ifc_sctx->isc_flags & IFLIB_HAS_RXCQ) != 0)
 		qidx = 1;
 
 	return (ctx->ifc_softc_ctx.isc_nrxd[qidx]);
 }
 
 /*
  * Return the tx descriptor count from isc_ntxd[]: entry 1 when the shared
  * context has IFLIB_HAS_TXCQ set, entry 0 otherwise.
  */
 static int
 iflib_num_tx_descs(if_ctx_t ctx)
 {
 	uint16_t qidx = 0;
 
 	if ((ctx->ifc_sctx->isc_flags & IFLIB_HAS_TXCQ) != 0)
 		qidx = 1;
 
 	return (ctx->ifc_softc_ctx.isc_ntxd[qidx]);
 }
 
 #ifdef DEV_NETMAP
 #include <sys/selinfo.h>
 #include <net/netmap.h>
 #include <dev/netmap/netmap_kern.h>
 
 MODULE_DEPEND(iflib, netmap, 1, 1, 1);
 
 static int netmap_fl_refill(iflib_rxq_t rxq, struct netmap_kring *kring, bool init);
 static void iflib_netmap_timer(void *arg);
 
 /*
  * device-specific sysctl variables:
  *
  * iflib_crcstrip: 0: keep CRC in rx frames, 1: strip it (default).
  *	During regular operations the CRC is stripped, but on some
  *	hardware reception of frames not multiple of 64 is slower,
  *	so using crcstrip=0 helps in benchmarks.
  *
  * iflib_rx_miss, iflib_rx_miss_bufs:
  *	count packets that might be missed due to lost interrupts.
  */
 SYSCTL_DECL(_dev_netmap);
 /*
  * The xl driver by default strips CRCs and we do not override it.
  */
 
 int iflib_crcstrip = 1;
 SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_crcstrip,
     CTLFLAG_RW, &iflib_crcstrip, 1, "strip CRC on RX frames");
 
 int iflib_rx_miss, iflib_rx_miss_bufs;
 SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_rx_miss,
     CTLFLAG_RW, &iflib_rx_miss, 0, "potentially missed RX intr");
 SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_rx_miss_bufs,
     CTLFLAG_RW, &iflib_rx_miss_bufs, 0, "potentially missed RX intr bufs");
 
 /*
  * Register/unregister. We are already under netmap lock.
  * Only called on the first register or the last unregister.
  */
 /*
  * Switch the interface into (onoff != 0) or out of (onoff == 0) native
  * netmap mode.  Under the context lock the device is stopped, the netmap
  * native flags are set or cleared, and the device is re-initialized.
  *
  * Returns 0 if the interface has IFF_DRV_RUNNING set after the
  * re-initialization, 1 otherwise; in the failure case the native flags are
  * cleared again before returning.
  */
 static int
 iflib_netmap_register(struct netmap_adapter *na, int onoff)
 {
 	if_t ifp = na->ifp;
 	if_ctx_t ctx = if_getsoftc(ifp);
 	int status;
 
 	CTX_LOCK(ctx);
 	if (!CTX_IS_VF(ctx))
 		IFDI_CRCSTRIP_SET(ctx, onoff, iflib_crcstrip);
 
 	iflib_stop(ctx);
 
 	/*
 	 * Enable (or disable) netmap flags, and intercept (or restore)
 	 * ifp->if_transmit. This is done once the device has been stopped
 	 * to prevent race conditions. Also, this must be done after
 	 * calling netmap_disable_all_rings() and before calling
 	 * netmap_enable_all_rings(), so that these two functions see the
 	 * updated state of the NAF_NETMAP_ON bit.
 	 */
 	if (onoff) {
 		nm_set_native_flags(na);
 	} else {
 		nm_clear_native_flags(na);
 	}
 
 	iflib_init_locked(ctx);
 	/*
 	 * NOTE(review): unlike the call above, this one is not guarded by
 	 * !CTX_IS_VF(ctx) -- confirm whether that is intentional.
 	 */
 	IFDI_CRCSTRIP_SET(ctx, onoff, iflib_crcstrip); // XXX why twice ?
 	/* status is 0 when the interface came back up, 1 when it did not. */
 	status = if_getdrvflags(ifp) & IFF_DRV_RUNNING ? 0 : 1;
 	if (status)
 		nm_clear_native_flags(na);
 	CTX_UNLOCK(ctx);
 	return (status);
 }
 
 static int
 iflib_netmap_config(struct netmap_adapter *na, struct nm_config_info *info)
 {
 	if_t ifp = na->ifp;
 	if_ctx_t ctx = if_getsoftc(ifp);
 	iflib_rxq_t rxq = &ctx->ifc_rxqs[0];
 	iflib_fl_t fl = &rxq->ifr_fl[0];
 
 	info->num_tx_rings = ctx->ifc_softc_ctx.isc_ntxqsets;
 	info->num_rx_rings = ctx->ifc_softc_ctx.isc_nrxqsets;
 	info->num_tx_descs = iflib_num_tx_descs(ctx);
 	info->num_rx_descs = iflib_num_rx_descs(ctx);
 	info->rx_buf_maxsize = fl->ifl_buf_size;
 	nm_prinf("txr %u rxr %u txd %u rxd %u rbufsz %u",
 		info->num_tx_rings, info->num_rx_rings, info->num_tx_descs,
 		info->num_rx_descs, info->rx_buf_maxsize);
 
 	return 0;
 }
 
 /*
  * Hand netmap buffers to the NIC through free list 0 of rxq.
  *
  * With init set, (nkr_num_slots - nm_kr_rxspace(kring)) slots are prepared
  * and every buffer's DMA map is loaded; otherwise the slots between
  * kring->nr_hwcur and kring->rhead are refilled and a map is reloaded only
  * when the slot carries NS_BUF_CHANGED.  Buffers are passed to the driver's
  * isc_rxd_refill() in batches of at most IFLIB_MAX_RX_REFRESH, after which
  * isc_rxd_flush() publishes up to the slot before the new fl->ifl_pidx.
  *
  * Returns 0 on success (including when there is nothing to refill), or the
  * result of netmap_ring_reinit() when a slot resolves to the netmap base
  * buffer.
  */
 static int
 netmap_fl_refill(iflib_rxq_t rxq, struct netmap_kring *kring, bool init)
 {
 	struct netmap_adapter *na = kring->na;
 	u_int const lim = kring->nkr_num_slots - 1;
 	struct netmap_ring *ring = kring->ring;
 	bus_dmamap_t *map;
 	struct if_rxd_update iru;
 	if_ctx_t ctx = rxq->ifr_ctx;
 	iflib_fl_t fl = &rxq->ifr_fl[0];
 	u_int nic_i_first, nic_i;
 	u_int nm_i;
 	int i, n;
 #if IFLIB_DEBUG_COUNTERS
 	int rf_count = 0;
 #endif
 
 	/*
 	 * This function is used both at initialization and in rxsync.
 	 * At initialization we need to prepare (with isc_rxd_refill())
 	 * all the netmap buffers currently owned by the kernel, in
 	 * such a way to keep fl->ifl_pidx and kring->nr_hwcur in sync
 	 * (except for kring->nkr_hwofs). These may be less than
 	 * kring->nkr_num_slots if netmap_reset() was called while
 	 * an application using the kring that still owned some
 	 * buffers.
 	 * At rxsync time, both indexes point to the next buffer to be
 	 * refilled.
 	 * In any case we publish (with isc_rxd_flush()) up to
 	 * (fl->ifl_pidx - 1) % N (included), to avoid the NIC tail/prod
 	 * pointer to overrun the head/cons pointer, although this is
 	 * not necessary for some NICs (e.g. vmx).
 	 */
 	if (__predict_false(init)) {
 		n = kring->nkr_num_slots - nm_kr_rxspace(kring);
 	} else {
 		n = kring->rhead - kring->nr_hwcur;
 		if (n == 0)
 			return (0); /* Nothing to do. */
 		/* rhead wrapped around behind nr_hwcur: add the ring size. */
 		if (n < 0)
 			n += kring->nkr_num_slots;
 	}
 
 	iru_init(&iru, rxq, 0 /* flid */);
 	map = fl->ifl_sds.ifsd_map;
 	nic_i = fl->ifl_pidx;
 	nm_i = netmap_idx_n2k(kring, nic_i);
 	if (__predict_false(init)) {
 		/*
 		 * On init/reset, nic_i must be 0, and we must
 		 * start to refill from hwtail (see netmap_reset()).
 		 */
 		MPASS(nic_i == 0);
 		MPASS(nm_i == kring->nr_hwtail);
 	} else
 		MPASS(nm_i == kring->nr_hwcur);
 	DBG_COUNTER_INC(fl_refills);
 	/* Each pass of the outer loop hands one batch to isc_rxd_refill(). */
 	while (n > 0) {
 #if IFLIB_DEBUG_COUNTERS
 		/* Counted once per call, when the ninth batch is reached. */
 		if (++rf_count == 9)
 			DBG_COUNTER_INC(fl_refills_large);
 #endif
 		nic_i_first = nic_i;
 		for (i = 0; n > 0 && i < IFLIB_MAX_RX_REFRESH; n--, i++) {
 			struct netmap_slot *slot = &ring->slot[nm_i];
 			uint64_t paddr;
 			void *addr = PNMB(na, slot, &paddr);
 
 			MPASS(i < IFLIB_MAX_RX_REFRESH);
 
 			/*
 			 * NOTE(review): this returns without updating
 			 * fl->ifl_pidx and without refilling the entries
 			 * staged so far in this batch -- confirm that
 			 * netmap_ring_reinit() makes that safe.
 			 */
 			if (addr == NETMAP_BUF_BASE(na)) /* bad buf */
 			        return netmap_ring_reinit(kring);
 
 			/* Stage the bus address and NIC index for this batch. */
 			fl->ifl_bus_addrs[i] = paddr +
 			    nm_get_offset(kring, slot);
 			fl->ifl_rxd_idxs[i] = nic_i;
 
 			if (__predict_false(init)) {
 				netmap_load_map(na, fl->ifl_buf_tag,
 				    map[nic_i], addr);
 			} else if (slot->flags & NS_BUF_CHANGED) {
 				/* buffer has changed, reload map */
 				netmap_reload_map(na, fl->ifl_buf_tag,
 				    map[nic_i], addr);
 			}
 			bus_dmamap_sync(fl->ifl_buf_tag, map[nic_i],
 			    BUS_DMASYNC_PREREAD);
 			slot->flags &= ~NS_BUF_CHANGED;
 
 			nm_i = nm_next(nm_i, lim);
 			nic_i = nm_next(nic_i, lim);
 		}
 
 		/* The batch covers i entries starting at nic_i_first. */
 		iru.iru_pidx = nic_i_first;
 		iru.iru_count = i;
 		ctx->isc_rxd_refill(ctx->ifc_softc, &iru);
 	}
 	fl->ifl_pidx = nic_i;
 	/*
 	 * At the end of the loop we must have refilled everything
 	 * we could possibly refill.
 	 */
 	MPASS(nm_i == kring->rhead);
 	kring->nr_hwcur = nm_i;
 
 	bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 	/* Publish up to the slot before nic_i (see the comment at the top). */
 	ctx->isc_rxd_flush(ctx->ifc_softc, rxq->ifr_id, fl->ifl_id,
 	    nm_prev(nic_i, lim));
 	DBG_COUNTER_INC(rxd_flush);
 
 	return (0);
 }
 
 #define NETMAP_TX_TIMER_US	90
 
 /*
  * Reconcile kernel and user view of the transmit ring.
  *
  * All information is in the kring.
  * Userspace wants to send packets up to the one before kring->rhead,
  * kernel knows kring->nr_hwcur is the first unsent packet.
  *
  * Here we push packets out (as many as possible), and possibly
  * reclaim buffers from previously completed transmission.
  *
  * The caller (netmap) guarantees that there is only one instance
  * running at any time. Any interference with other driver
  * methods should be handled by the individual drivers.
  */
 static int
 iflib_netmap_txsync(struct netmap_kring *kring, int flags)
 {
 	/*
 	 * Overview: (1) hand every netmap slot in [nr_hwcur, rhead) to the
 	 * driver's isc_txd_encap method, one call per packet (a packet may
 	 * span several slots chained with NS_MOREFRAG); (2) advance
 	 * nr_hwtail according to the completions reported by
 	 * iflib_tx_credits_update(); (3) when buffers are still outstanding
 	 * and IFC_NETMAP_TX_IRQ is not set, arm the per-queue netmap timer.
 	 * Always returns 0.
 	 */
 	struct netmap_adapter *na = kring->na;
 	if_t ifp = na->ifp;
 	struct netmap_ring *ring = kring->ring;
 	u_int nm_i;	/* index into the netmap kring */
 	u_int nic_i;	/* index into the NIC ring */
 	u_int const lim = kring->nkr_num_slots - 1;
 	u_int const head = kring->rhead;
 	struct if_pkt_info pi;
 	int tx_pkts = 0, tx_bytes = 0;
 
 	/*
 	 * Interrupts on every tx packet are expensive, so request
 	 * them every half ring, or where NS_REPORT is set.
 	 */
 	u_int report_frequency = kring->nkr_num_slots >> 1;
 	/* device-specific */
 	if_ctx_t ctx = if_getsoftc(ifp);
 	iflib_txq_t txq = &ctx->ifc_txqs[kring->ring_id];
 
 	bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
 	    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
 
 	/*
 	 * First part: process new packets to send.
 	 * nm_i is the current index in the netmap kring,
 	 * nic_i is the corresponding index in the NIC ring.
 	 *
 	 * If we have packets to send (nm_i != head)
 	 * iterate over the netmap ring, fetch length and update
 	 * the corresponding slot in the NIC ring. Some drivers also
 	 * need to update the buffer's physical address in the NIC slot
 	 * even if NS_BUF_CHANGED is not set (PNMB computes the addresses).
 	 *
 	 * The netmap_reload_map() call is especially expensive,
 	 * even when (as in this case) the tag is 0, so do it only
 	 * when the buffer has actually changed.
 	 *
 	 * If possible do not set the report/intr bit on all slots,
 	 * but only a few times per ring or when NS_REPORT is set.
 	 *
 	 * Finally, on 10G and faster drivers, it might be useful
 	 * to prefetch the next slot and txr entry.
 	 */
 
 	nm_i = kring->nr_hwcur;
 	if (nm_i != head) {	/* we have new packets to send */
 		uint32_t pkt_len = 0, seg_idx = 0;
 		/*
 		 * NOTE: this local 'flags' shadows the function parameter of
 		 * the same name; inside this block it accumulates the
 		 * per-packet IPI_* flags handed to the driver.
 		 */
 		int nic_i_start = -1, flags = 0;
 		pkt_info_zero(&pi);
 		pi.ipi_segs = txq->ift_segs;
 		pi.ipi_qsidx = kring->ring_id;
 		nic_i = netmap_idx_k2n(kring, nm_i);
 
 		__builtin_prefetch(&ring->slot[nm_i]);
 		__builtin_prefetch(&txq->ift_sds.ifsd_m[nic_i]);
 		__builtin_prefetch(&txq->ift_sds.ifsd_map[nic_i]);
 
 		while (nm_i != head) {
 			struct netmap_slot *slot = &ring->slot[nm_i];
 			uint64_t offset = nm_get_offset(kring, slot);
 			u_int len = slot->len;
 			uint64_t paddr;
 			void *addr = PNMB(na, slot, &paddr);
 
 			/*
 			 * Request a TX interrupt for this packet if any of
 			 * its slots carries NS_REPORT or sits at NIC index 0
 			 * or at the half-ring mark.
 			 */
 			flags |= (slot->flags & NS_REPORT ||
 				nic_i == 0 || nic_i == report_frequency) ?
 				IPI_TX_INTR : 0;
 
 			/*
 			 * If this is the first packet fragment, save the
 			 * index of the first NIC slot for later.
 			 */
 			if (nic_i_start < 0)
 				nic_i_start = nic_i;
 
 			/*
 			 * The segment entry is always written, but it is only
 			 * counted (seg_idx advanced) when the slot is not
 			 * empty; an empty slot's entry is overwritten by the
 			 * next slot of the same packet.
 			 */
 			pi.ipi_segs[seg_idx].ds_addr = paddr + offset;
 			pi.ipi_segs[seg_idx].ds_len = len;
 			if (len) {
 				pkt_len += len;
 				seg_idx++;
 			}
 
 			/* Last slot of the packet: hand it to the driver. */
 			if (!(slot->flags & NS_MOREFRAG)) {
 				pi.ipi_len = pkt_len;
 				pi.ipi_nsegs = seg_idx;
 				pi.ipi_pidx = nic_i_start;
 				pi.ipi_ndescs = 0;
 				pi.ipi_flags = flags;
 
 				/*
 				 * Prepare the NIC TX ring.  The return value
 				 * of isc_txd_encap is not checked here.
 				 */
 				ctx->isc_txd_encap(ctx->ifc_softc, &pi);
 				DBG_COUNTER_INC(tx_encap);
 
 				/* Update transmit counters */
 				tx_bytes += pi.ipi_len;
 				tx_pkts++;
 
 				/* Reinit per-packet info for the next one. */
 				flags = seg_idx = pkt_len = 0;
 				nic_i_start = -1;
 			}
 
 			/*
 			 * Prefetch for the next round.  The "+ 1" indices are
 			 * not wrapped at the end of the ring; they are only
 			 * used as prefetch addresses, never dereferenced here.
 			 */
 			__builtin_prefetch(&ring->slot[nm_i + 1]);
 			__builtin_prefetch(&txq->ift_sds.ifsd_m[nic_i + 1]);
 			__builtin_prefetch(&txq->ift_sds.ifsd_map[nic_i + 1]);
 
 			NM_CHECK_ADDR_LEN_OFF(na, len, offset);
 
 			if (slot->flags & NS_BUF_CHANGED) {
 				/* buffer has changed, reload map */
 				netmap_reload_map(na, txq->ift_buf_tag,
 				    txq->ift_sds.ifsd_map[nic_i], addr);
 			}
 			/* make sure changes to the buffer are synced */
 			bus_dmamap_sync(txq->ift_buf_tag,
 			    txq->ift_sds.ifsd_map[nic_i],
 			    BUS_DMASYNC_PREWRITE);
 
 			/* The slot has been consumed: clear its request flags. */
 			slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED | NS_MOREFRAG);
 			nm_i = nm_next(nm_i, lim);
 			nic_i = nm_next(nic_i, lim);
 		}
 		kring->nr_hwcur = nm_i;
 
 		/* synchronize the NIC ring */
 		bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
 		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 
 		/* (re)start the tx unit up to slot nic_i (excluded) */
 		ctx->isc_txd_flush(ctx->ifc_softc, txq->ift_id, nic_i);
 	}
 
 	/*
 	 * Second part: reclaim buffers for completed transmissions.
 	 *
 	 * If there are unclaimed buffers, attempt to reclaim them.
 	 * If we don't manage to reclaim them all, and TX IRQs are not in use,
 	 * trigger a per-tx-queue timer to try again later.
 	 */
 	if (kring->nr_hwtail != nm_prev(kring->nr_hwcur, lim)) {
 		if (iflib_tx_credits_update(ctx, txq)) {
 			/* some tx completed, increment avail */
 			nic_i = txq->ift_cidx_processed;
 			kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim);
 		}
 	}
 
 	/* Still not fully reclaimed and no TX interrupt: poll via callout. */
 	if (!(ctx->ifc_flags & IFC_NETMAP_TX_IRQ))
 		if (kring->nr_hwtail != nm_prev(kring->nr_hwcur, lim)) {
 			callout_reset_sbt_on(&txq->ift_netmap_timer,
 			    NETMAP_TX_TIMER_US * SBT_1US, SBT_1US,
 			    iflib_netmap_timer, txq,
 			    txq->ift_netmap_timer.c_cpu, 0);
 		}
 
 	/* Account for everything handed to the driver during this call. */
 	if_inc_counter(ifp, IFCOUNTER_OBYTES, tx_bytes);
 	if_inc_counter(ifp, IFCOUNTER_OPACKETS, tx_pkts);
 
 	return (0);
 }
 
 /*
  * Reconcile kernel and user view of the receive ring.
  * Same as for the txsync, this routine must be efficient.
  * The caller guarantees a single invocation at a time, but races against
  * the rest of the driver should be handled here.
  *
  * On call, kring->rhead is the first packet that userspace wants
  * to keep, and kring->rcur is the wakeup point.
  * The kernel has previously reported packets up to kring->rtail.
  *
  * If (flags & NAF_FORCE_READ) also check for incoming packets irrespective
  * of whether or not we received an interrupt.
  *
  * Always returns 0.
  */
 static int
 iflib_netmap_rxsync(struct netmap_kring *kring, int flags)
 {
 	struct netmap_adapter *na = kring->na;
 	struct netmap_ring *ring = kring->ring;
 	if_t ifp = na->ifp;
 	uint32_t nm_i;	/* index into the netmap ring */
 	uint32_t nic_i;	/* index into the NIC ring */
 	u_int n;
 	u_int const lim = kring->nkr_num_slots - 1;
 	/* Import packets if explicitly asked to, or if an interrupt is pending. */
 	int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR;
 	int i = 0, rx_bytes = 0, rx_pkts = 0;
 
 	if_ctx_t ctx = if_getsoftc(ifp);
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	iflib_rxq_t rxq = &ctx->ifc_rxqs[kring->ring_id];
 	iflib_fl_t fl = &rxq->ifr_fl[0];
 	struct if_rxd_info ri;
 	qidx_t *cidxp;
 
 	/*
 	 * netmap only uses free list 0, to avoid out of order consumption
 	 * of receive buffers
 	 */
 
 	bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
 	    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
 
 	/*
 	 * First part: import newly received packets.
 	 *
 	 * nm_i is the index of the next free slot in the netmap ring,
 	 * nic_i is the index of the next received packet in the NIC ring
 	 * (or in the free list 0 if IFLIB_HAS_RXCQ is set), and they may
 	 * differ in case if_init() has been called while
 	 * in netmap mode. For the receive ring we have
 	 *
 	 *	nic_i = fl->ifl_cidx;
 	 *	nm_i = kring->nr_hwtail (previous)
 	 * and
 	 *	nm_i == (nic_i + kring->nkr_hwofs) % ring_size
 	 *
 	 * fl->ifl_cidx is set to 0 on a ring reinit
 	 */
 	if (netmap_no_pendintr || force_update) {
 		/* Stop one slot before nr_hwcur so the ring never fills up. */
 		uint32_t hwtail_lim = nm_prev(kring->nr_hwcur, lim);
 		bool have_rxcq = sctx->isc_flags & IFLIB_HAS_RXCQ;
 		/*
 		 * Bytes trimmed from the last fragment of each packet when
 		 * iflib_crcstrip is zero (presumably the 4-byte Ethernet
 		 * CRC -- confirm against the tunable's documentation).
 		 */
 		int crclen = iflib_crcstrip ? 0 : 4;
 		int error, avail;
 
 		/*
 		 * For the free list consumer index, we use the same
 		 * logic as in iflib_rxeof().
 		 */
 		if (have_rxcq)
 			cidxp = &rxq->ifr_cq_cidx;
 		else
 			cidxp = &fl->ifl_cidx;
 		avail = ctx->isc_rxd_available(ctx->ifc_softc,
 		    rxq->ifr_id, *cidxp, USHRT_MAX);
 
 		nic_i = fl->ifl_cidx;
 		nm_i = netmap_idx_n2k(kring, nic_i);
 		MPASS(nm_i == kring->nr_hwtail);
 		/* One iteration per received packet (possibly multi-fragment). */
 		for (n = 0; avail > 0 && nm_i != hwtail_lim; n++, avail--) {
 			rxd_info_zero(&ri);
 			ri.iri_frags = rxq->ifr_frags;
 			ri.iri_qsidx = kring->ring_id;
 			ri.iri_ifp = ctx->ifc_ifp;
 			ri.iri_cidx = *cidxp;
 
 			error = ctx->isc_rxd_pkt_get(ctx->ifc_softc, &ri);
 			/* Each fragment consumes one netmap slot and one NIC slot. */
 			for (i = 0; i < ri.iri_nfrags; i++) {
 				if (error) {
 					/* Report the slot as empty on error. */
 					ring->slot[nm_i].len = 0;
 					ring->slot[nm_i].flags = 0;
 				} else {
 					ring->slot[nm_i].len = ri.iri_frags[i].irf_len;
 					if (i == (ri.iri_nfrags - 1)) {
 						/* Last fragment: trim and end the chain. */
 						ring->slot[nm_i].len -= crclen;
 						ring->slot[nm_i].flags = 0;
 
 						/* Update receive counters */
 						rx_bytes += ri.iri_len;
 						rx_pkts++;
 					} else
 						ring->slot[nm_i].flags = NS_MOREFRAG;
 				}
 
 				bus_dmamap_sync(fl->ifl_buf_tag,
 				    fl->ifl_sds.ifsd_map[nic_i], BUS_DMASYNC_POSTREAD);
 				nm_i = nm_next(nm_i, lim);
 				fl->ifl_cidx = nic_i = nm_next(nic_i, lim);
 			}
 
 			if (have_rxcq) {
 				/*
 				 * Take the completion-queue index returned by
 				 * the driver and wrap it into [0, isc_nrxd[0]).
 				 */
 				*cidxp = ri.iri_cidx;
 				while (*cidxp >= scctx->isc_nrxd[0])
 					*cidxp -= scctx->isc_nrxd[0];
 			}
 
 		}
 		if (n) { /* update the state variables */
 			if (netmap_no_pendintr && !force_update) {
 				/* diagnostics */
 				iflib_rx_miss ++;
 				iflib_rx_miss_bufs += n;
 			}
 			kring->nr_hwtail = nm_i;
 		}
 		kring->nr_kflags &= ~NKR_PENDINTR;
 	}
 	/*
 	 * Second part: skip past packets that userspace has released.
 	 * (kring->nr_hwcur to head excluded),
 	 * and make the buffers available for reception.
 	 * As usual nm_i is the index in the netmap ring,
 	 * nic_i is the index in the NIC ring, and
 	 * nm_i == (nic_i + kring->nkr_hwofs) % ring_size
 	 */
 	netmap_fl_refill(rxq, kring, false);
 
 	/* Account for the packets imported during this call. */
 	if_inc_counter(ifp, IFCOUNTER_IBYTES, rx_bytes);
 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, rx_pkts);
 
 	return (0);
 }
 
 /*
  * netmap nm_intr callback: switch the device interrupts on (onoff != 0)
  * or off (onoff == 0), holding the context lock around the driver call.
  */
 static void
 iflib_netmap_intr(struct netmap_adapter *na, int onoff)
 {
 	if_ctx_t ctx;
 
 	ctx = if_getsoftc(na->ifp);
 
 	CTX_LOCK(ctx);
 	if (onoff == 0) {
 		IFDI_INTR_DISABLE(ctx);
 	} else {
 		IFDI_INTR_ENABLE(ctx);
 	}
 	CTX_UNLOCK(ctx);
 }
 
 /*
  * Describe this iflib context to netmap: fill a zeroed, stack-allocated
  * netmap_adapter with the interface, ring geometry and callbacks, then
  * hand it to netmap_attach() and return its result.
  */
 static int
 iflib_netmap_attach(if_ctx_t ctx)
 {
 	struct netmap_adapter na;
 
 	bzero(&na, sizeof(na));
 
 	/* Identity and capabilities. */
 	na.ifp = ctx->ifc_ifp;
 	na.na_flags = NAF_BDG_MAYSLEEP | NAF_MOREFRAG | NAF_OFFSETS;
 
 	/* Ring geometry; at least one TX and one RX queue set must exist. */
 	MPASS(ctx->ifc_softc_ctx.isc_ntxqsets);
 	MPASS(ctx->ifc_softc_ctx.isc_nrxqsets);
 	na.num_tx_desc = iflib_num_tx_descs(ctx);
 	na.num_rx_desc = iflib_num_rx_descs(ctx);
 	na.num_tx_rings = ctx->ifc_softc_ctx.isc_ntxqsets;
 	na.num_rx_rings = ctx->ifc_softc_ctx.isc_nrxqsets;
 
 	/* Callbacks. */
 	na.nm_register = iflib_netmap_register;
 	na.nm_config = iflib_netmap_config;
 	na.nm_intr = iflib_netmap_intr;
 	na.nm_txsync = iflib_netmap_txsync;
 	na.nm_rxsync = iflib_netmap_rxsync;
 
 	return (netmap_attach(&na));
 }
 
 /*
  * Prepare a TX queue for netmap mode.
  *
  * Returns 0 when netmap_reset() yields no slot array for this ring,
  * otherwise loads the DMA map of every TX descriptor with the matching
  * netmap buffer and returns 1.
  */
 static int
 iflib_netmap_txq_init(if_ctx_t ctx, iflib_txq_t txq)
 {
 	struct netmap_adapter *na = NA(ctx->ifc_ifp);
 	struct netmap_slot *slot = netmap_reset(na, NR_TX, txq->ift_id, 0);
 	int nic_idx;
 
 	if (slot == NULL)
 		return (0);
 
 	nic_idx = 0;
 	while (nic_idx < ctx->ifc_softc_ctx.isc_ntxd[0]) {
 		/*
 		 * In netmap mode, set the map for the packet buffer.
 		 * NOTE: Some drivers (not this one) also need to set
 		 * the physical buffer address in the NIC ring.
 		 * netmap_idx_n2k() translates the NIC index into the
 		 * corresponding netmap slot index.
 		 */
 		int nm_idx = netmap_idx_n2k(na->tx_rings[txq->ift_id], nic_idx);
 
 		netmap_load_map(na, txq->ift_buf_tag,
 		    txq->ift_sds.ifsd_map[nic_idx], NMB(na, &slot[nm_idx]));
 		nic_idx++;
 	}
 	return (1);
 }
 
 /*
  * Prepare an RX queue for netmap mode.
  *
  * Returns 0 when netmap_reset() yields no slot array for this ring,
  * otherwise performs the initial free-list refill and returns 1.
  */
 static int
 iflib_netmap_rxq_init(if_ctx_t ctx, iflib_rxq_t rxq)
 {
 	struct netmap_adapter *na = NA(ctx->ifc_ifp);
 
 	if (netmap_reset(na, NR_RX, rxq->ifr_id, 0) == NULL)
 		return (0);
 
 	netmap_fl_refill(rxq, na->rx_rings[rxq->ifr_id], true);
 	return (1);
 }
 
 /*
  * Callout handler armed by iflib_netmap_txsync(); arg is the TX queue.
  */
 static void
 iflib_netmap_timer(void *arg)
 {
 	iflib_txq_t txq = arg;
 
 	/*
 	 * Notify netmap as if a TX interrupt had fired, so that the
 	 * application gets a chance to call txsync again and reclaim
 	 * more completed TX buffers.
 	 */
 	netmap_tx_irq(txq->ift_ctx->ifc_ifp, txq->ift_id);
 }
 
 /* In this branch, detaching simply forwards to netmap_detach(). */
 #define iflib_netmap_detach(ifp) netmap_detach(ifp)
 
 #else
 /*
  * Alternative branch of the enclosing conditional (its opening #if is
  * outside this excerpt; presumably it tests for netmap support --
  * confirm).  The netmap entry points used by the rest of the file are
  * replaced by stubs: the "init"/"attach"/"rx_irq" forms evaluate to 0
  * and the remaining ones expand to nothing.
  */
 #define iflib_netmap_txq_init(ctx, txq) (0)
 #define iflib_netmap_rxq_init(ctx, rxq) (0)
 #define iflib_netmap_detach(ifp)
 #define netmap_enable_all_rings(ifp)
 #define netmap_disable_all_rings(ifp)
 
 #define iflib_netmap_attach(ctx) (0)
 #define netmap_rx_irq(ifp, qid, budget) (0)
 #endif
 
 #if defined(__i386__) || defined(__amd64__)
 /* Issue a prefetcht0 hint for the memory x points at. */
 static __inline void
 prefetch(void *x)
 {
 	unsigned long *line = x;
 
 	__asm volatile("prefetcht0 %0" :: "m" (*line));
 }
 
 /*
  * Issue a prefetcht0 hint for the memory x points at and, when
  * CACHE_LINE_SIZE is below 128, a second one CACHE_LINE_SIZE bytes later.
  */
 static __inline void
 prefetch2cachelines(void *x)
 {
 	unsigned long *line = x;
 
 	__asm volatile("prefetcht0 %0" :: "m" (line[0]));
 #if (CACHE_LINE_SIZE < 128)
 	__asm volatile("prefetcht0 %0" :: "m" (line[CACHE_LINE_SIZE / sizeof(unsigned long)]));
 #endif
 }
 #else
 /* No prefetch hint is issued on other architectures. */
 static __inline void
 prefetch(void *x)
 {
 }
 
 /* No prefetch hint is issued on other architectures. */
 static __inline void
 prefetch2cachelines(void *x)
 {
 }
 #endif
 
 /*
  * Initialize the fixed part of an RX descriptor update request from
  * free list flid of rxq.  Only the fields assigned below are touched.
  */
 static void
 iru_init(if_rxd_update_t iru, iflib_rxq_t rxq, uint8_t flid)
 {
 	iflib_fl_t fl = &rxq->ifr_fl[flid];
 
 	/* Queue / free-list identification. */
 	iru->iru_qsidx = rxq->ifr_id;
 	iru->iru_flidx = fl->ifl_id;
 
 	/* Buffer geometry and the arrays describing the buffers. */
 	iru->iru_buf_size = fl->ifl_buf_size;
 	iru->iru_paddrs = fl->ifl_bus_addrs;
 	iru->iru_idxs = fl->ifl_rxd_idxs;
 }
 
 /*
  * bus_dmamap_load() callback: on success store the bus address of the
  * first segment into the bus_addr_t that arg points to.  On error the
  * destination is left untouched.
  */
 static void
 _iflib_dmamap_cb(void *arg, bus_dma_segment_t *segs, int nseg, int err)
 {
 	if (err == 0)
 		*(bus_addr_t *)arg = segs[0].ds_addr;
 }
 
 /*
  * Convert a DMA address width in bits into a lowaddr value for
  * bus_dma_tag_create().  A width of 0, or a width equal to
  * flsll(BUS_SPACE_MAXADDR), yields BUS_SPACE_MAXADDR; any other width
  * yields a mask with the low "width" bits set.
  *
  * NOTE: "width" is evaluated more than once, so it must not have side
  * effects, and a width of 64 or more that is not caught by the special
  * cases would make the shift undefined.
  */
 #define	DMA_WIDTH_TO_BUS_LOWADDR(width)				\
 	(((width) == 0) || (width) == flsll(BUS_SPACE_MAXADDR) ?	\
 	    BUS_SPACE_MAXADDR : (1ULL << (width)) - 1ULL)
 
 /*
  * Allocate, zero and map a single-segment DMA area.
  *
  * ctx      - iflib context; supplies the device and the DMA address width
  * size     - size of the area in bytes
  * align    - required alignment
  * dma      - descriptor filled in on success (tag, map, vaddr, paddr, size)
  * mapflags - extra flags for bus_dmamap_load(); BUS_DMA_NOWAIT is always
  *            added
  *
  * Returns 0 on success or an errno value on failure.  On failure every
  * resource acquired here is released and dma->idi_tag is set to NULL.
  */
 int
 iflib_dma_alloc_align(if_ctx_t ctx, int size, int align, iflib_dma_info_t dma, int mapflags)
 {
 	int err;
 	device_t dev = ctx->ifc_dev;
 	bus_addr_t lowaddr;
 
 	lowaddr = DMA_WIDTH_TO_BUS_LOWADDR(ctx->ifc_softc_ctx.isc_dma_width);
 
 	err = bus_dma_tag_create(bus_get_dma_tag(dev),	/* parent */
 				align, 0,		/* alignment, bounds */
 				lowaddr,		/* lowaddr */
 				BUS_SPACE_MAXADDR,	/* highaddr */
 				NULL, NULL,		/* filter, filterarg */
 				size,			/* maxsize */
 				1,			/* nsegments */
 				size,			/* maxsegsize */
 				BUS_DMA_ALLOCNOW,	/* flags */
 				NULL,			/* lockfunc */
 				NULL,			/* lockarg */
 				&dma->idi_tag);
 	if (err) {
 		device_printf(dev,
 		    "%s: bus_dma_tag_create failed: %d\n",
 		    __func__, err);
 		goto fail_0;
 	}
 
 	err = bus_dmamem_alloc(dma->idi_tag, (void**) &dma->idi_vaddr,
 	    BUS_DMA_NOWAIT | BUS_DMA_COHERENT | BUS_DMA_ZERO, &dma->idi_map);
 	if (err) {
 		device_printf(dev,
 		    "%s: bus_dmamem_alloc(%ju) failed: %d\n",
 		    __func__, (uintmax_t)size, err);
 		goto fail_1;
 	}
 
 	/* Sentinel: the load callback overwrites it only on success. */
 	dma->idi_paddr = IF_BAD_DMA;
 	err = bus_dmamap_load(dma->idi_tag, dma->idi_map, dma->idi_vaddr,
 	    size, _iflib_dmamap_cb, &dma->idi_paddr, mapflags | BUS_DMA_NOWAIT);
 	if (err || dma->idi_paddr == IF_BAD_DMA) {
 		/*
 		 * The load can report its failure solely through the
 		 * callback, in which case err is still 0 here.  The callback
 		 * does not record the error code, so substitute one: without
 		 * it the memory and tag would be released below while the
 		 * caller is told the allocation succeeded.
 		 */
 		if (err == 0)
 			err = ENOMEM;
 		device_printf(dev,
 		    "%s: bus_dmamap_load failed: %d\n",
 		    __func__, err);
 		goto fail_2;
 	}
 
 	dma->idi_size = size;
 	return (0);
 
 	/* Unwind in reverse order of acquisition. */
 fail_2:
 	bus_dmamem_free(dma->idi_tag, dma->idi_vaddr, dma->idi_map);
 fail_1:
 	bus_dma_tag_destroy(dma->idi_tag);
 fail_0:
 	dma->idi_tag = NULL;
 
 	return (err);
 }
 
 /*
  * Same as iflib_dma_alloc_align(), using the queue alignment
  * (isc_q_align) from the shared context.
  */
 int
 iflib_dma_alloc(if_ctx_t ctx, int size, iflib_dma_info_t dma, int mapflags)
 {
 	KASSERT(ctx->ifc_sctx->isc_q_align != 0,
 	    ("alignment value not initialized"));
 
 	return (iflib_dma_alloc_align(ctx, size, ctx->ifc_sctx->isc_q_align,
 	    dma, mapflags));
 }
 
 /*
  * Allocate "count" DMA areas; area i has size sizes[i] and is described
  * by dmalist[i].
  *
  * Returns 0 when every allocation succeeded (including the trivial case
  * count <= 0).  On the first failure the areas allocated so far are
  * released with iflib_dma_free_multi() and that error is returned.
  */
 int
 iflib_dma_alloc_multi(if_ctx_t ctx, int *sizes, iflib_dma_info_t *dmalist, int mapflags, int count)
 {
 	int i, err;
 
 	/* Must be initialized: the loop body never runs when count <= 0. */
 	err = 0;
 	for (i = 0; i < count; i++) {
 		err = iflib_dma_alloc(ctx, sizes[i], dmalist[i], mapflags);
 		if (err != 0)
 			break;
 	}
 	/* Entries [0, i) were allocated successfully; undo them. */
 	if (err != 0)
 		iflib_dma_free_multi(dmalist, i);
 	return (err);
 }
 
 /*
  * Release a DMA area set up by iflib_dma_alloc_align().  Each step is
  * skipped when the corresponding field shows it is not needed, and the
  * fields are reset so that a second call is a no-op.
  */
 void
 iflib_dma_free(iflib_dma_info_t dma)
 {
 	/* No tag: nothing to release. */
 	if (dma->idi_tag == NULL)
 		return;
 
 	/* A recorded bus address means the map is still loaded. */
 	if (!(dma->idi_paddr == IF_BAD_DMA)) {
 		bus_dmamap_sync(dma->idi_tag, dma->idi_map,
 		    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
 		bus_dmamap_unload(dma->idi_tag, dma->idi_map);
 		dma->idi_paddr = IF_BAD_DMA;
 	}
 
 	/* Give the backing memory back. */
 	if (!(dma->idi_vaddr == NULL)) {
 		bus_dmamem_free(dma->idi_tag, dma->idi_vaddr, dma->idi_map);
 		dma->idi_vaddr = NULL;
 	}
 
 	/* Finally drop the tag itself. */
 	bus_dma_tag_destroy(dma->idi_tag);
 	dma->idi_tag = NULL;
 }
 
 /* Release the first "count" DMA areas referenced by dmalist. */
 void
 iflib_dma_free_multi(iflib_dma_info_t *dmalist, int count)
 {
 	int idx = 0;
 
 	while (idx < count) {
 		iflib_dma_free(dmalist[idx]);
 		idx++;
 	}
 }
 
 /*
  * Interrupt filter.  Runs the optional driver filter first; unless that
  * filter asks for the thread to be scheduled, its result is returned
  * as is.  Otherwise the associated group task is enqueued and
  * FILTER_HANDLED is returned.
  */
 static int
 iflib_fast_intr(void *arg)
 {
 	iflib_filter_info_t info = arg;
 	struct grouptask *gtask = info->ifi_task;
 
 	DBG_COUNTER_INC(fast_intrs);
 	if (info->ifi_filter != NULL) {
 		int rc = info->ifi_filter(info->ifi_filter_arg);
 
 		if (!(rc & FILTER_SCHEDULE_THREAD))
 			return (rc);
 	}
 
 	GROUPTASK_ENQUEUE(gtask);
 	return (FILTER_HANDLED);
 }
 
 static int
 iflib_fast_intr_rxtx(void *arg)
 {
 	iflib_filter_info_t info = arg;
 	struct grouptask *gtask = info->ifi_task;
 	if_ctx_t ctx;
 	iflib_rxq_t rxq = (iflib_rxq_t)info->ifi_ctx;
 	iflib_txq_t txq;
 	void *sc;
 	int i, cidx, result;
 	qidx_t txqid;
 	bool intr_enable, intr_legacy;
 
 	DBG_COUNTER_INC(fast_intrs);
 	if (info->ifi_filter != NULL) {
 		result = info->ifi_filter(info->ifi_filter_arg);
 		if ((result & FILTER_SCHEDULE_THREAD) == 0)
 			return (result);
 	}
 
 	ctx = rxq->ifr_ctx;
 	sc = ctx->ifc_softc;
 	intr_enable = false;
 	intr_legacy = !!(ctx->ifc_flags & IFC_LEGACY);
 	MPASS(rxq->ifr_ntxqirq);
 	for (i = 0; i < rxq->ifr_ntxqirq; i++) {
 		txqid = rxq->ifr_txqid[i];
 		txq = &ctx->ifc_txqs[txqid];
 		bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
 		    BUS_DMASYNC_POSTREAD);
 		if (!ctx->isc_txd_credits_update(sc, txqid, false)) {
 			if (intr_legacy)
 				intr_enable = true;
 			else
 				IFDI_TX_QUEUE_INTR_ENABLE(ctx, txqid);
 			continue;
 		}
 		GROUPTASK_ENQUEUE(&txq->ift_task);
 	}
 	if (ctx->ifc_sctx->isc_flags & IFLIB_HAS_RXCQ)
 		cidx = rxq->ifr_cq_cidx;
 	else
 		cidx = rxq->ifr_fl[0].ifl_cidx;
 	if (iflib_rxd_avail(ctx, rxq, cidx, 1))
 		GROUPTASK_ENQUEUE(gtask);
 	else {
 		if (intr_legacy)
 			intr_enable = true;
 		else
 			IFDI_RX_QUEUE_INTR_ENABLE(ctx, rxq->ifr_id);
 		DBG_COUNTER_INC(rx_intr_enables);
 	}
 	if (intr_enable)
 		IFDI_INTR_ENABLE(ctx);
 	return (FILTER_HANDLED);
 }
 
 /*
  * Interrupt filter.  When a driver filter is installed and it does not
  * request thread scheduling, its result is returned unchanged; in every
  * other case the group task is enqueued and FILTER_HANDLED is returned.
  */
 static int
 iflib_fast_intr_ctx(void *arg)
 {
 	iflib_filter_info_t info = arg;
 	struct grouptask *gtask = info->ifi_task;
 	/* With no driver filter, behave as if scheduling was requested. */
 	int result = FILTER_SCHEDULE_THREAD;
 
 	DBG_COUNTER_INC(fast_intrs);
 	if (info->ifi_filter != NULL)
 		result = info->ifi_filter(info->ifi_filter_arg);
 	if ((result & FILTER_SCHEDULE_THREAD) == 0)
 		return (result);
 
 	GROUPTASK_ENQUEUE(gtask);
 	return (FILTER_HANDLED);
 }
 
 /*
  * Allocate the IRQ resource "rid" and install an interrupt handler on it.
  *
  * ctx     - iflib context (supplies the device and the IFC_LEGACY flag)
  * irq     - receives the resource (ii_res) and, on success, the cookie
  *           (ii_tag)
  * rid     - resource id of the interrupt
  * filter  - filter routine, or NULL
  * handler - threaded handler, or NULL; filter and handler must not both
  *           be set
  * arg     - argument passed to filter/handler
  * name    - optional description; may be NULL
  *
  * Returns 0 on success, ENOMEM when the resource cannot be allocated, or
  * the bus_setup_intr() error.  In the latter case the resource is not
  * released here and stays recorded in irq->ii_res.
  */
 static int
 _iflib_irq_alloc(if_ctx_t ctx, if_irq_t irq, int rid,
 		 driver_filter_t filter, driver_intr_t handler, void *arg,
 		 const char *name)
 {
 	struct resource *res;
 	void *tag = NULL;
 	device_t dev = ctx->ifc_dev;
 	int flags, i, rc;
 
 	flags = RF_ACTIVE;
 	/* A legacy interrupt line may be shared with other devices. */
 	if (ctx->ifc_flags & IFC_LEGACY)
 		flags |= RF_SHAREABLE;
 	MPASS(rid < 512);
 	/* bus_alloc_resource_any() takes the rid by reference; use a copy. */
 	i = rid;
 	res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &i, flags);
 	if (res == NULL) {
 		/* name is optional: use the same fallback as the message below. */
 		device_printf(dev,
 		    "failed to allocate IRQ for rid %d, name %s.\n", rid,
 		    name ? name : "unknown");
 		return (ENOMEM);
 	}
 	irq->ii_res = res;
 	KASSERT(filter == NULL || handler == NULL, ("filter and handler can't both be non-NULL"));
 	rc = bus_setup_intr(dev, res, INTR_MPSAFE | INTR_TYPE_NET,
 						filter, handler, arg, &tag);
 	if (rc != 0) {
 		device_printf(dev,
 		    "failed to setup interrupt for rid %d, name %s: %d\n",
 					  rid, name ? name : "unknown", rc);
 		return (rc);
 	} else if (name)
 		bus_describe_intr(dev, res, tag, "%s", name);
 
 	irq->ii_tag = tag;
 	return (0);
 }
 
 /*********************************************************************
  *
  *  Allocate DMA resources for TX buffers as well as memory for the TX
  *  mbuf map.  TX DMA maps (non-TSO/TSO) and TX mbuf map are kept in a
  *  iflib_sw_tx_desc_array structure, storing all the information that
  *  is needed to transmit a packet on the wire.  This is called only
  *  once at attach, setup is done every reset.
  *
  **********************************************************************/
 /*
  * Create the software TX state of one queue: the buffer DMA tag, the
  * optional TSO DMA tag, the mbuf pointer array and one DMA map (plus one
  * TSO map when TSO is enabled) per descriptor.
  *
  * Returns 0 on success.  On any failure iflib_tx_structures_free() is
  * called for the whole context and the error is returned.
  */
 static int
 iflib_txsd_alloc(iflib_txq_t txq)
 {
 	if_ctx_t ctx = txq->ift_ctx;
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	device_t dev = ctx->ifc_dev;
 	bus_size_t tsomaxsize;
 	bus_addr_t lowaddr;
 	int err, nsegments, ntsosegments;
 	bool tso;
 
 	nsegments = scctx->isc_tx_nsegments;
 	ntsosegments = scctx->isc_tx_tso_segments_max;
 	tsomaxsize = scctx->isc_tx_tso_size_max;
 	/* Leave room for a VLAN header when VLAN MTU is supported. */
 	if (if_getcapabilities(ctx->ifc_ifp) & IFCAP_VLAN_MTU)
 		tsomaxsize += sizeof(struct ether_vlan_header);
 	MPASS(scctx->isc_ntxd[0] > 0);
 	MPASS(scctx->isc_ntxd[txq->ift_br_offset] > 0);
 	MPASS(nsegments > 0);
 	if (if_getcapabilities(ctx->ifc_ifp) & IFCAP_TSO) {
 		MPASS(ntsosegments > 0);
 		MPASS(sctx->isc_tso_maxsize >= tsomaxsize);
 	}
 
 	lowaddr = DMA_WIDTH_TO_BUS_LOWADDR(scctx->isc_dma_width);
 
 	/*
 	 * DMA tag for regular TX buffers.
 	 */
 	err = bus_dma_tag_create(bus_get_dma_tag(dev),
 	    1, 0,			/* alignment, bounds */
 	    lowaddr,			/* lowaddr */
 	    BUS_SPACE_MAXADDR,		/* highaddr */
 	    NULL, NULL,			/* filter, filterarg */
 	    sctx->isc_tx_maxsize,	/* maxsize */
 	    nsegments,			/* nsegments */
 	    sctx->isc_tx_maxsegsize,	/* maxsegsize */
 	    0,				/* flags */
 	    NULL,			/* lockfunc */
 	    NULL,			/* lockfuncarg */
 	    &txq->ift_buf_tag);
 	if (err != 0) {
 		device_printf(dev,"Unable to allocate TX DMA tag: %d\n", err);
 		device_printf(dev,"maxsize: %ju nsegments: %d maxsegsize: %ju\n",
 		    (uintmax_t)sctx->isc_tx_maxsize, nsegments, (uintmax_t)sctx->isc_tx_maxsegsize);
 		goto fail;
 	}
 
 	/*
 	 * DMA tag for TSO buffers, only when the interface does TSO.
 	 */
 	tso = (if_getcapabilities(ctx->ifc_ifp) & IFCAP_TSO) != 0;
 	if (tso) {
 		err = bus_dma_tag_create(bus_get_dma_tag(dev),
 		    1, 0,			/* alignment, bounds */
 		    lowaddr,			/* lowaddr */
 		    BUS_SPACE_MAXADDR,		/* highaddr */
 		    NULL, NULL,			/* filter, filterarg */
 		    tsomaxsize,			/* maxsize */
 		    ntsosegments,		/* nsegments */
 		    sctx->isc_tso_maxsegsize,	/* maxsegsize */
 		    0,				/* flags */
 		    NULL,			/* lockfunc */
 		    NULL,			/* lockfuncarg */
 		    &txq->ift_tso_buf_tag);
 		if (err != 0) {
 			device_printf(dev,
 			    "Unable to allocate TSO TX DMA tag: %d\n", err);
 			goto fail;
 		}
 	}
 
 	/* Array of mbuf pointers, one per descriptor. */
 	txq->ift_sds.ifsd_m = malloc(sizeof(struct mbuf *) *
 	    scctx->isc_ntxd[txq->ift_br_offset], M_IFLIB, M_NOWAIT | M_ZERO);
 	if (txq->ift_sds.ifsd_m == NULL) {
 		device_printf(dev, "Unable to allocate TX mbuf map memory\n");
 		err = ENOMEM;
 		goto fail;
 	}
 
 	/* Array of DMA maps for regular buffers. */
 	txq->ift_sds.ifsd_map = malloc(sizeof(bus_dmamap_t) *
 	    scctx->isc_ntxd[txq->ift_br_offset], M_IFLIB, M_NOWAIT | M_ZERO);
 	if (txq->ift_sds.ifsd_map == NULL) {
 		device_printf(dev,
 		    "Unable to allocate TX buffer DMA map memory\n");
 		err = ENOMEM;
 		goto fail;
 	}
 
 	/* Array of DMA maps for TSO buffers. */
 	if (tso) {
 		txq->ift_sds.ifsd_tso_map = malloc(sizeof(bus_dmamap_t) *
 		    scctx->isc_ntxd[txq->ift_br_offset], M_IFLIB,
 		    M_NOWAIT | M_ZERO);
 		if (txq->ift_sds.ifsd_tso_map == NULL) {
 			device_printf(dev,
 			    "Unable to allocate TSO TX buffer map memory\n");
 			err = ENOMEM;
 			goto fail;
 		}
 	}
 
 	/* Create the per-descriptor maps. */
 	for (int i = 0; i < scctx->isc_ntxd[txq->ift_br_offset]; i++) {
 		err = bus_dmamap_create(txq->ift_buf_tag, 0,
 		    &txq->ift_sds.ifsd_map[i]);
 		if (err != 0) {
 			device_printf(dev, "Unable to create TX DMA map\n");
 			goto fail;
 		}
 		if (tso) {
 			err = bus_dmamap_create(txq->ift_tso_buf_tag, 0,
 			    &txq->ift_sds.ifsd_tso_map[i]);
 			if (err != 0) {
 				device_printf(dev, "Unable to create TSO TX DMA map\n");
 				goto fail;
 			}
 		}
 	}
 	return (0);
 fail:
 	/* Free everything; the routine copes with partially built state. */
 	iflib_tx_structures_free(ctx);
 	return (err);
 }
 
 /*
  * Tear down the DMA maps of TX descriptor i: for the regular and the TSO
  * map array alike (whichever exist), sync, unload and destroy the map and
  * clear its array slot.
  */
 static void
 iflib_txsd_destroy(if_ctx_t ctx, iflib_txq_t txq, int i)
 {
 	bus_dmamap_t *maps;
 
 	maps = txq->ift_sds.ifsd_map;
 	if (maps != NULL) {
 		bus_dmamap_sync(txq->ift_buf_tag, maps[i],
 		    BUS_DMASYNC_POSTWRITE);
 		bus_dmamap_unload(txq->ift_buf_tag, maps[i]);
 		bus_dmamap_destroy(txq->ift_buf_tag, maps[i]);
 		maps[i] = NULL;
 	}
 
 	maps = txq->ift_sds.ifsd_tso_map;
 	if (maps != NULL) {
 		bus_dmamap_sync(txq->ift_tso_buf_tag, maps[i],
 		    BUS_DMASYNC_POSTWRITE);
 		bus_dmamap_unload(txq->ift_tso_buf_tag, maps[i]);
 		bus_dmamap_destroy(txq->ift_tso_buf_tag, maps[i]);
 		maps[i] = NULL;
 	}
 }
 
 /*
  * Release the software resources of a TX queue: per-descriptor DMA maps,
  * the mp ring, the queue mutex, the descriptor arrays, both DMA tags and
  * the ift_ifdi array.  Every pointer released here is reset to NULL so
  * stale values cannot be reused or freed twice.
  */
 static void
 iflib_txq_destroy(iflib_txq_t txq)
 {
 	if_ctx_t ctx = txq->ift_ctx;
 
 	/* Maps must go before the arrays and tags they depend on. */
 	for (int i = 0; i < txq->ift_size; i++)
 		iflib_txsd_destroy(ctx, txq, i);
 
 	if (txq->ift_br != NULL) {
 		ifmp_ring_free(txq->ift_br);
 		txq->ift_br = NULL;
 	}
 
 	mtx_destroy(&txq->ift_mtx);
 
 	if (txq->ift_sds.ifsd_map != NULL) {
 		free(txq->ift_sds.ifsd_map, M_IFLIB);
 		txq->ift_sds.ifsd_map = NULL;
 	}
 	if (txq->ift_sds.ifsd_tso_map != NULL) {
 		free(txq->ift_sds.ifsd_tso_map, M_IFLIB);
 		txq->ift_sds.ifsd_tso_map = NULL;
 	}
 	if (txq->ift_sds.ifsd_m != NULL) {
 		free(txq->ift_sds.ifsd_m, M_IFLIB);
 		txq->ift_sds.ifsd_m = NULL;
 	}
 	if (txq->ift_buf_tag != NULL) {
 		bus_dma_tag_destroy(txq->ift_buf_tag);
 		txq->ift_buf_tag = NULL;
 	}
 	if (txq->ift_tso_buf_tag != NULL) {
 		bus_dma_tag_destroy(txq->ift_tso_buf_tag);
 		txq->ift_tso_buf_tag = NULL;
 	}
 	if (txq->ift_ifdi != NULL) {
 		free(txq->ift_ifdi, M_IFLIB);
 		/* Do not leave a dangling pointer behind. */
 		txq->ift_ifdi = NULL;
 	}
 }
 
 /*
  * Release the mbuf attached to TX descriptor i, if there is one: sync and
  * unload whichever DMA maps exist for the slot, free the mbuf chain and
  * clear the slot.
  */
 static void
 iflib_txsd_free(if_ctx_t ctx, iflib_txq_t txq, int i)
 {
 	struct mbuf *m;
 
 	m = txq->ift_sds.ifsd_m[i];
 	if (m == NULL)
 		return;
 
 	if (txq->ift_sds.ifsd_map != NULL) {
 		bus_dmamap_sync(txq->ift_buf_tag,
 		    txq->ift_sds.ifsd_map[i], BUS_DMASYNC_POSTWRITE);
 		bus_dmamap_unload(txq->ift_buf_tag, txq->ift_sds.ifsd_map[i]);
 	}
 	if (txq->ift_sds.ifsd_tso_map != NULL) {
 		bus_dmamap_sync(txq->ift_tso_buf_tag,
 		    txq->ift_sds.ifsd_tso_map[i], BUS_DMASYNC_POSTWRITE);
 		bus_dmamap_unload(txq->ift_tso_buf_tag,
 		    txq->ift_sds.ifsd_tso_map[i]);
 	}
 
 	m_freem(m);
 	DBG_COUNTER_INC(tx_frees);
 	txq->ift_sds.ifsd_m[i] = NULL;
 }
 
 /*
  * (Re)initialize a TX queue: reset its status and indices, zero the
  * descriptor memory of each of its isc_ntxqs DMA areas, let the driver
  * set the queue up and sync the areas for the device.  Always returns 0.
  */
 static int
 iflib_txq_setup(iflib_txq_t txq)
 {
 	if_ctx_t ctx = txq->ift_ctx;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	iflib_dma_info_t di;
 	int i;
 
 	/* Queue state. */
 	txq->ift_qstatus = IFLIB_QUEUE_IDLE;
 	/* XXX make configurable */
 	txq->ift_update_freq = IFLIB_DEFAULT_TX_UPDATE_FREQ;
 
 	/* Ring indices and size. */
 	txq->ift_cidx_processed = 0;
 	txq->ift_npending = 0;
 	txq->ift_cidx = 0;
 	txq->ift_pidx = 0;
 	txq->ift_size = scctx->isc_ntxd[txq->ift_br_offset];
 
 	/* Wipe the descriptor rings. */
 	for (i = 0; i < sctx->isc_ntxqs; i++) {
 		di = &txq->ift_ifdi[i];
 		bzero((void *)di->idi_vaddr, di->idi_size);
 	}
 
 	IFDI_TXQ_SETUP(ctx, txq->ift_id);
 
 	/* Make the freshly initialized rings visible to the device. */
 	for (i = 0; i < sctx->isc_ntxqs; i++) {
 		di = &txq->ift_ifdi[i];
 		bus_dmamap_sync(di->idi_tag, di->idi_map,
 		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 	}
 	return (0);
 }
 
 /*********************************************************************
  *
  *  Allocate DMA resources for RX buffers as well as memory for the RX
  *  mbuf map, direct RX cluster pointer map and RX cluster bus address
  *  map.  RX DMA map, RX mbuf map, direct RX cluster pointer map and
  *  RX cluster map are kept in a iflib_sw_rx_desc_array structure.
  *  Since we use one entry in iflib_sw_rx_desc_array per received
  *  packet, the maximum number of entries we'll need is equal to the
  *  number of hardware receive descriptors that we've allocated.
  *
  **********************************************************************/
 /*
  * Create the software RX state of every free list of one RX queue: a
  * buffer DMA tag, the mbuf, cluster and bus-address arrays, and one DMA
  * map per descriptor.
  *
  * Returns 0 on success.  On any failure iflib_rx_structures_free() is
  * called for the whole context and the error is returned.
  */
 static int
 iflib_rxsd_alloc(iflib_rxq_t rxq)
 {
 	if_ctx_t ctx = rxq->ifr_ctx;
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	device_t dev = ctx->ifc_dev;
 	iflib_fl_t fl;
 	bus_addr_t lowaddr;
 	int err;
 
 	MPASS(scctx->isc_nrxd[0] > 0);
 	MPASS(scctx->isc_nrxd[rxq->ifr_fl_offset] > 0);
 
 	lowaddr = DMA_WIDTH_TO_BUS_LOWADDR(scctx->isc_dma_width);
 
 	for (int i = 0; i < rxq->ifr_nfl; i++) {
 		fl = &rxq->ifr_fl[i];
 		fl->ifl_size = scctx->isc_nrxd[rxq->ifr_fl_offset]; /* this isn't necessarily the same */
 
 		/* DMA tag for the RX buffers of this free list. */
 		err = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */
 		    1, 0,			/* alignment, bounds */
 		    lowaddr,			/* lowaddr */
 		    BUS_SPACE_MAXADDR,		/* highaddr */
 		    NULL, NULL,			/* filter, filterarg */
 		    sctx->isc_rx_maxsize,	/* maxsize */
 		    sctx->isc_rx_nsegments,	/* nsegments */
 		    sctx->isc_rx_maxsegsize,	/* maxsegsize */
 		    0,				/* flags */
 		    NULL,			/* lockfunc */
 		    NULL,			/* lockarg */
 		    &fl->ifl_buf_tag);
 		if (err != 0) {
 			device_printf(dev,
 			    "Unable to allocate RX DMA tag: %d\n", err);
 			goto fail;
 		}
 
 		/* mbuf pointer array. */
 		fl->ifl_sds.ifsd_m = malloc(sizeof(struct mbuf *) *
 		    scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB,
 		    M_NOWAIT | M_ZERO);
 		if (fl->ifl_sds.ifsd_m == NULL) {
 			device_printf(dev,
 			    "Unable to allocate RX mbuf map memory\n");
 			err = ENOMEM;
 			goto fail;
 		}
 
 		/* Direct cluster pointer array. */
 		fl->ifl_sds.ifsd_cl = malloc(sizeof(caddr_t) *
 		    scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB,
 		    M_NOWAIT | M_ZERO);
 		if (fl->ifl_sds.ifsd_cl == NULL) {
 			device_printf(dev,
 			    "Unable to allocate RX cluster map memory\n");
 			err = ENOMEM;
 			goto fail;
 		}
 
 		/* Cluster bus address array. */
 		fl->ifl_sds.ifsd_ba = malloc(sizeof(bus_addr_t) *
 		    scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB,
 		    M_NOWAIT | M_ZERO);
 		if (fl->ifl_sds.ifsd_ba == NULL) {
 			device_printf(dev,
 			    "Unable to allocate RX bus address map memory\n");
 			err = ENOMEM;
 			goto fail;
 		}
 
 		/* DMA map array, then one map per descriptor. */
 		fl->ifl_sds.ifsd_map = malloc(sizeof(bus_dmamap_t) *
 		    scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB,
 		    M_NOWAIT | M_ZERO);
 		if (fl->ifl_sds.ifsd_map == NULL) {
 			device_printf(dev,
 			    "Unable to allocate RX buffer DMA map memory\n");
 			err = ENOMEM;
 			goto fail;
 		}
 		for (int j = 0; j < scctx->isc_nrxd[rxq->ifr_fl_offset]; j++) {
 			err = bus_dmamap_create(fl->ifl_buf_tag, 0,
 			    &fl->ifl_sds.ifsd_map[j]);
 			if (err != 0) {
 				device_printf(dev, "Unable to create RX buffer DMA map\n");
 				goto fail;
 			}
 		}
 	}
 	return (0);
 
 fail:
 	iflib_rx_structures_free(ctx);
 	return (err);
 }
 
 /*
  * Internal service routines
  */
 
 /*
  * Result block filled in by _rxq_refill_cb(), the bus_dmamap_load()
  * callback used when loading an RX cluster.
  */
 struct rxq_refill_cb_arg {
 	int               error;	/* error code passed to the callback */
 	bus_dma_segment_t seg;	/* copy of the first DMA segment */
 	int               nseg;	/* segment count passed to the callback */
 };
 
 /*
  * bus_dmamap_load() callback for RX clusters: copy the error code, the
  * first segment and the segment count into the rxq_refill_cb_arg that
  * arg points to.
  */
 static void
 _rxq_refill_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
 {
 	struct rxq_refill_cb_arg *result = arg;
 
 	result->nseg = nseg;
 	result->seg = segs[0];
 	result->error = error;
 }
 
 /**
  * iflib_fl_refill - refill an rxq free-buffer list
  * @ctx: the iflib context
  * @fl: the free list to refill
  * @count: the number of new buffers to allocate
  *
  * (Re)populate an rxq free-buffer list with up to @count new packet buffers.
  * The caller must assure that @count does not exceed the queue's capacity
  * minus one (since we always leave a descriptor unavailable).
  *
  * Returns 0 when all @count buffers were added, or IFLIB_RXEOF_EMPTY when
  * the loop stopped early because a cluster or mbuf allocation, or the DMA
  * load of a cluster, failed.
  */
 static uint8_t
 iflib_fl_refill(if_ctx_t ctx, iflib_fl_t fl, int count)
 {
 	struct if_rxd_update iru;
 	struct rxq_refill_cb_arg cb_arg;
 	struct mbuf *m;
 	caddr_t cl, *sd_cl;
 	struct mbuf **sd_m;
 	bus_dmamap_t *sd_map;
 	bus_addr_t bus_addr, *sd_ba;
 	int err, frag_idx, i, idx, n, pidx;
 	qidx_t credits;
 
 	MPASS(count <= fl->ifl_size - fl->ifl_credits - 1);
 
 	sd_m = fl->ifl_sds.ifsd_m;
 	sd_map = fl->ifl_sds.ifsd_map;
 	sd_cl = fl->ifl_sds.ifsd_cl;
 	sd_ba = fl->ifl_sds.ifsd_ba;
 	/*
 	 * pidx is the producer index of the batch being built, idx the
 	 * running producer index, frag_idx the buffer slot being examined.
 	 */
 	pidx = fl->ifl_pidx;
 	idx = pidx;
 	frag_idx = fl->ifl_fragidx;
 	credits = fl->ifl_credits;
 
 	/* i counts the buffers collected for the current batch. */
 	i = 0;
 	n = count;
 	MPASS(n > 0);
 	MPASS(credits + n <= fl->ifl_size);
 
 	/* Sanity checks: the refill must not run past the consumer index. */
 	if (pidx < fl->ifl_cidx)
 		MPASS(pidx + n <= fl->ifl_cidx);
 	if (pidx == fl->ifl_cidx && (credits < fl->ifl_size))
 		MPASS(fl->ifl_gen == 0);
 	if (pidx > fl->ifl_cidx)
 		MPASS(n <= fl->ifl_size - pidx + fl->ifl_cidx);
 
 	DBG_COUNTER_INC(fl_refills);
 	if (n > 8)
 		DBG_COUNTER_INC(fl_refills_large);
 	iru_init(&iru, fl->ifl_rxq, fl->ifl_id);
 	/*
 	 * n is decremented when an iteration is entered, so it ends at -1
 	 * only when the loop ran to completion without a break.
 	 */
 	while (n-- > 0) {
 		/*
 		 * We allocate an uninitialized mbuf + cluster, mbuf is
 		 * initialized after rx.
 		 *
 		 * If the cluster is still set then we know a minimum sized
 		 * packet was received
 		 */
 		/* Find the next clear bit in the bitmap, wrapping if needed. */
 		bit_ffc_at(fl->ifl_rx_bitmap, frag_idx, fl->ifl_size,
 		    &frag_idx);
 		if (frag_idx < 0)
 			bit_ffc(fl->ifl_rx_bitmap, fl->ifl_size, &frag_idx);
 		MPASS(frag_idx >= 0);
 		if ((cl = sd_cl[frag_idx]) == NULL) {
 			/* No cluster in this slot yet: allocate and map one. */
 			cl = uma_zalloc(fl->ifl_zone, M_NOWAIT);
 			if (__predict_false(cl == NULL))
 				break;
 
 			cb_arg.error = 0;
 			MPASS(sd_map != NULL);
 			err = bus_dmamap_load(fl->ifl_buf_tag, sd_map[frag_idx],
 			    cl, fl->ifl_buf_size, _rxq_refill_cb, &cb_arg,
 			    BUS_DMA_NOWAIT);
 			/* The failure may be reported via the callback only. */
 			if (__predict_false(err != 0 || cb_arg.error)) {
 				uma_zfree(fl->ifl_zone, cl);
 				break;
 			}
 
 			sd_ba[frag_idx] = bus_addr = cb_arg.seg.ds_addr;
 			sd_cl[frag_idx] = cl;
 #if MEMORY_LOGGING
 			fl->ifl_cl_enqueued++;
 #endif
 		} else {
 			/* Reuse the cluster already mapped for this slot. */
 			bus_addr = sd_ba[frag_idx];
 		}
 		bus_dmamap_sync(fl->ifl_buf_tag, sd_map[frag_idx],
 		    BUS_DMASYNC_PREREAD);
 
 		if (sd_m[frag_idx] == NULL) {
 			m = m_gethdr_raw(M_NOWAIT, 0);
 			/*
 			 * On failure the slot keeps its cluster but its bitmap
 			 * bit stays clear, so it is picked up again later.
 			 */
 			if (__predict_false(m == NULL))
 				break;
 			sd_m[frag_idx] = m;
 		}
 		/* The slot is now fully populated. */
 		bit_set(fl->ifl_rx_bitmap, frag_idx);
 #if MEMORY_LOGGING
 		fl->ifl_m_enqueued++;
 #endif
 
 		DBG_COUNTER_INC(rx_allocs);
 		fl->ifl_rxd_idxs[i] = frag_idx;
 		fl->ifl_bus_addrs[i] = bus_addr;
 		credits++;
 		i++;
 		MPASS(credits <= fl->ifl_size);
 		if (++idx == fl->ifl_size) {
 #ifdef INVARIANTS
 			fl->ifl_gen = 1;
 #endif
 			idx = 0;
 		}
 		/* Hand a batch to the driver when it is full or we are done. */
 		if (n == 0 || i == IFLIB_MAX_RX_REFRESH) {
 			iru.iru_pidx = pidx;
 			iru.iru_count = i;
 			ctx->isc_rxd_refill(ctx->ifc_softc, &iru);
 			fl->ifl_pidx = idx;
 			fl->ifl_credits = credits;
 			pidx = idx;
 			i = 0;
 		}
 	}
 
 	/* True once at least one iteration completed without a break. */
 	if (n < count - 1) {
 		/* Push a partial batch left over after an early break. */
 		if (i != 0) {
 			iru.iru_pidx = pidx;
 			iru.iru_count = i;
 			ctx->isc_rxd_refill(ctx->ifc_softc, &iru);
 			fl->ifl_pidx = idx;
 			fl->ifl_credits = credits;
 		}
 		DBG_COUNTER_INC(rxd_flush);
 		bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
 		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 		ctx->isc_rxd_flush(ctx->ifc_softc, fl->ifl_rxq->ifr_id,
 		    fl->ifl_id, fl->ifl_pidx);
 		/*
 		 * If the last examined slot was filled, resume the search
 		 * after it (wrapping); otherwise retry that slot next time.
 		 */
 		if (__predict_true(bit_test(fl->ifl_rx_bitmap, frag_idx))) {
 			fl->ifl_fragidx = frag_idx + 1;
 			if (fl->ifl_fragidx == fl->ifl_size)
 				fl->ifl_fragidx = 0;
 		} else {
 			fl->ifl_fragidx = frag_idx;
 		}
 	}
 
 	return (n == -1 ? 0 : IFLIB_RXEOF_EMPTY);
 }
 
 /*
  * Top a free list back up to its maximum usable fill level.
  *
  * Returns whatever iflib_fl_refill() returns when a refill is attempted,
  * or 0 when there is no room to refill.
  */
 static inline uint8_t
 iflib_fl_refill_all(if_ctx_t ctx, iflib_fl_t fl)
 {
 	int32_t room;
 #ifdef INVARIANTS
 	int32_t expected;
 #endif
 
 	/*
 	 * One descriptor is always kept back so that pidx never catches up
 	 * with cidx, which confuses most NICs.  Intel NICs, for example,
 	 * expose per-ring RDH and RDT registers: RDH is the next descriptor
 	 * the NIC will use and RDT the next one the driver will publish
 	 * (so RDT - 1 is the last valid one).  RDH == RDT means "nothing
 	 * available to the NIC"; it could not also mean "everything is
 	 * available" without being ambiguous.
 	 */
 	room = fl->ifl_size - fl->ifl_credits - 1;
 #ifdef INVARIANTS
 	/* Cross-check the credit count against the ring indices. */
 	expected = fl->ifl_size - get_inuse(fl->ifl_size, fl->ifl_cidx,
 	    fl->ifl_pidx, fl->ifl_gen) - 1;
 #endif
 
 	MPASS(fl->ifl_credits <= fl->ifl_size);
 	MPASS(room == expected);
 
 	if (room <= 0)
 		return (0);
 	return (iflib_fl_refill(ctx, fl, room));
 }
 
 /*
  * Report whether IFC_IN_DETACH is set in the context flags.  The flag is
  * sampled while holding the context state lock.
  */
 uint8_t
 iflib_in_detach(if_ctx_t ctx)
 {
 	bool detaching;
 
 	STATE_LOCK(ctx);
 	detaching = (ctx->ifc_flags & IFC_IN_DETACH) != 0;
 	STATE_UNLOCK(ctx);
 
 	return (detaching);
 }
 
 /*
  * Release every receive buffer currently attached to a free list and
  * return the list to its empty state (indices, credits and descriptor
  * memory all zeroed).
  */
 static void
 iflib_fl_bufs_free(iflib_fl_t fl)
 {
 	iflib_dma_info_t ring = fl->ifl_ifdi;
 	bus_dmamap_t map;
 	uint32_t slot;
 
 	for (slot = 0; slot < fl->ifl_size; slot++) {
 		struct mbuf **mslot = &fl->ifl_sds.ifsd_m[slot];
 		caddr_t *clslot = &fl->ifl_sds.ifsd_cl[slot];
 
 		if (*clslot == NULL) {
 			/* A slot without a cluster must not hold an mbuf. */
 			MPASS(*mslot == NULL);
 		} else {
 			/* Tear down the DMA mapping before freeing memory. */
 			map = fl->ifl_sds.ifsd_map[slot];
 			bus_dmamap_sync(fl->ifl_buf_tag, map,
 			    BUS_DMASYNC_POSTREAD);
 			bus_dmamap_unload(fl->ifl_buf_tag, map);
 			uma_zfree(fl->ifl_zone, *clslot);
 			*clslot = NULL;
 
 			if (*mslot != NULL) {
 				/* Initialize the raw mbuf before freeing it. */
 				m_init(*mslot, M_NOWAIT, MT_DATA, 0);
 				m_free_raw(*mslot);
 				*mslot = NULL;
 			}
 		}
 #if MEMORY_LOGGING
 		fl->ifl_m_dequeued++;
 		fl->ifl_cl_dequeued++;
 #endif
 	}
 #ifdef INVARIANTS
 	for (slot = 0; slot < fl->ifl_size; slot++) {
 		MPASS(fl->ifl_sds.ifsd_cl[slot] == NULL);
 		MPASS(fl->ifl_sds.ifsd_m[slot] == NULL);
 	}
 #endif
 
 	/* Reset the free list bookkeeping. */
 	fl->ifl_fragidx = 0;
 	fl->ifl_gen = 0;
 	fl->ifl_pidx = 0;
 	fl->ifl_cidx = 0;
 	fl->ifl_credits = 0;
 
 	/* Wipe the descriptor ring memory. */
 	bzero(ring->idi_vaddr, ring->idi_size);
 }
 
 /*********************************************************************
  *
  *  Reset a free list, choose its buffer size/zone and pre-fill it.
  *
  *  Returns 0 on success, or ENOBUFS when the initial refill could not
  *  supply the full pre-fill count.
  *
  **********************************************************************/
 static int
 iflib_fl_setup(iflib_fl_t fl)
 {
 	iflib_rxq_t rxq = fl->ifl_rxq;
 	if_ctx_t ctx = rxq->ifr_ctx;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	u_int prefill;
 	int qidx;
 
 	/* Start from a clean slate: no slot in use, no buffers attached. */
 	bit_nclear(fl->ifl_rx_bitmap, 0, fl->ifl_size - 1);
 	iflib_fl_bufs_free(fl);
 	MPASS(fl->ifl_credits == 0);
 
 	/*
 	 * Prefer a driver-supplied per-queue buffer size; otherwise fall
 	 * back to the context-wide Rx mbuf size.  A driver-supplied value
 	 * may be arbitrary, so it is pulled up to a supported mbuf size.
 	 */
 	qidx = rxq->ifr_fl_offset + fl->ifl_id;
 	fl->ifl_buf_size = (scctx->isc_rxd_buf_size[qidx] != 0) ?
 	    scctx->isc_rxd_buf_size[qidx] : ctx->ifc_rx_mbuf_sz;
 	fl->ifl_buf_size = iflib_get_mbuf_size_for(fl->ifl_buf_size);
 
 	/* Track the largest free list buffer size seen by this context. */
 	if (ctx->ifc_max_fl_buf_size < fl->ifl_buf_size)
 		ctx->ifc_max_fl_buf_size = fl->ifl_buf_size;
 
 	fl->ifl_zone = m_getzone(fl->ifl_buf_size);
 	fl->ifl_cltype = m_gettype(fl->ifl_buf_size);
 
 	/*
 	 * Only pre-allocate a modest number of clusters so that an idle
 	 * card does not pin zillions of them (and attach stays fast).
 	 * One descriptor is always left unavailable; see the comment in
 	 * iflib_fl_refill_all().
 	 */
 	MPASS(fl->ifl_size > 0);
 	prefill = min(128, fl->ifl_size - 1);
 	(void)iflib_fl_refill(ctx, fl, prefill);
 	if (fl->ifl_credits != prefill)
 		return (ENOBUFS);
 
 	MPASS(rxq != NULL);
 	MPASS(fl->ifl_ifdi != NULL);
 	bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 
 	return (0);
 }
 
 /*********************************************************************
  *
  *  Tear down the software descriptor state of a receive queue: the
  *  per-slot DMA maps and buffer tag of every free list, the slot
  *  arrays and bitmap, and finally the free list and DMA info arrays.
  *
  **********************************************************************/
 static void
 iflib_rx_sds_free(iflib_rxq_t rxq)
 {
 	iflib_fl_t fl;
 	bus_dmamap_t *maps;
 	int flid, slot;
 
 	/* Nothing was ever allocated for this queue. */
 	if (rxq->ifr_fl == NULL)
 		return;
 
 	for (flid = 0; flid < rxq->ifr_nfl; flid++) {
 		fl = &rxq->ifr_fl[flid];
 		maps = fl->ifl_sds.ifsd_map;
 
 		if (fl->ifl_buf_tag != NULL) {
 			if (maps != NULL) {
 				for (slot = 0; slot < fl->ifl_size; slot++) {
 					bus_dmamap_sync(fl->ifl_buf_tag,
 					    maps[slot], BUS_DMASYNC_POSTREAD);
 					bus_dmamap_unload(fl->ifl_buf_tag,
 					    maps[slot]);
 					bus_dmamap_destroy(fl->ifl_buf_tag,
 					    maps[slot]);
 				}
 			}
 			bus_dma_tag_destroy(fl->ifl_buf_tag);
 			fl->ifl_buf_tag = NULL;
 		}
 
 		/* Release the per-slot arrays and clear the stale pointers. */
 		free(fl->ifl_sds.ifsd_m, M_IFLIB);
 		fl->ifl_sds.ifsd_m = NULL;
 		free(fl->ifl_sds.ifsd_cl, M_IFLIB);
 		fl->ifl_sds.ifsd_cl = NULL;
 		free(fl->ifl_sds.ifsd_ba, M_IFLIB);
 		fl->ifl_sds.ifsd_ba = NULL;
 		free(fl->ifl_sds.ifsd_map, M_IFLIB);
 		fl->ifl_sds.ifsd_map = NULL;
 		free(fl->ifl_rx_bitmap, M_IFLIB);
 		fl->ifl_rx_bitmap = NULL;
 	}
 
 	free(rxq->ifr_fl, M_IFLIB);
 	rxq->ifr_fl = NULL;
 	free(rxq->ifr_ifdi, M_IFLIB);
 	rxq->ifr_ifdi = NULL;
 	rxq->ifr_cq_cidx = 0;
 }
 
 /*
  * Per-TX-queue timer callout.
  *
  * 'arg' is the iflib_txq_t the callout was armed for.  The routine does
  * nothing unless the interface is marked IFF_DRV_RUNNING.  Once at least
  * iflib_timer_default ticks have elapsed since the previous pass it calls
  * the driver's IFDI_TIMER hook and runs the TX watchdog check.  It then
  * kicks the queue task if doorbell work is pending and re-arms itself.
  *
  * When a hang is detected the callout is NOT re-armed; instead a watchdog
  * reset is requested through the admin task.
  */
 static void
 iflib_timer(void *arg)
 {
 	iflib_txq_t txq = arg;
 	if_ctx_t ctx = txq->ift_ctx;
 	if_softc_ctx_t sctx = &ctx->ifc_softc_ctx;
 	uint64_t this_tick = ticks;
 
 	if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING))
 		return;
 
 	/*
 	** Check on the state of the TX queue(s), this
 	** can be done without the lock because its RO
 	** and the HUNG state will be static if set.
 	*/
 	if (this_tick - txq->ift_last_timer_tick >= iflib_timer_default) {
 		txq->ift_last_timer_tick = this_tick;
 		IFDI_TIMER(ctx, txq->ift_id);
 		/*
 		 * The queue was flagged HUNG on an earlier pass.  Declare a
 		 * real hang if nothing has been cleaned since then, or if no
 		 * pause frames were recorded in the interval.
 		 */
 		if ((txq->ift_qstatus == IFLIB_QUEUE_HUNG) &&
 		    ((txq->ift_cleaned_prev == txq->ift_cleaned) ||
 		     (sctx->isc_pause_frames == 0)))
 			goto hung;
 
 		/* Flag a non-idle queue whose ring is stalled as HUNG. */
 		if (txq->ift_qstatus != IFLIB_QUEUE_IDLE &&
 		    ifmp_ring_is_stalled(txq->ift_br)) {
 			KASSERT(ctx->ifc_link_state == LINK_STATE_UP,
 			    ("queue can't be marked as hung if interface is down"));
 			txq->ift_qstatus = IFLIB_QUEUE_HUNG;
 		}
 		/* Remember the progress marker for the next comparison. */
 		txq->ift_cleaned_prev = txq->ift_cleaned;
 	}
 	/* handle any laggards */
 	if (txq->ift_db_pending)
 		GROUPTASK_ENQUEUE(&txq->ift_task);
 
 	/* Start a fresh pause-frame accounting interval. */
 	sctx->isc_pause_frames = 0;
 	/* Re-arm on the same CPU, but only while the interface is running. */
 	if (if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING) 
 		callout_reset_on(&txq->ift_timer, iflib_timer_default, iflib_timer,
 		    txq, txq->ift_timer.c_cpu);
 	return;
 
  hung:
 	device_printf(ctx->ifc_dev,
 	    "Watchdog timeout (TX: %d desc avail: %d pidx: %d) -- resetting\n",
 	    txq->ift_id, TXQ_AVAIL(txq), txq->ift_pidx);
 	STATE_LOCK(ctx);
 	/* Set OACTIVE and clear RUNNING, then ask the admin task to reset. */
 	if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
 	ctx->ifc_flags |= (IFC_DO_WATCHDOG|IFC_DO_RESET);
 	iflib_admin_intr_deferred(ctx);
 	STATE_UNLOCK(ctx);
 }
 
 /*
  * Map a requested buffer size onto one of the two cluster sizes used
  * here: MCLBYTES when the request fits, MJUMPAGESIZE otherwise.
  */
 static uint16_t
 iflib_get_mbuf_size_for(unsigned int size)
 {
 
 	return (size > MCLBYTES ? MJUMPAGESIZE : MCLBYTES);
 }
 
 /*
  * Derive the context-wide Rx mbuf size from the configured maximum
  * frame size and cache it in ifc_rx_mbuf_sz.
  */
 static void
 iflib_calc_rx_mbuf_sz(if_ctx_t ctx)
 {
 
 	/*
 	 * XXX don't set the max_frame_size to larger
 	 * than the hardware can handle
 	 */
 	ctx->ifc_rx_mbuf_sz = iflib_get_mbuf_size_for(
 	    ctx->ifc_softc_ctx.isc_max_frame_size);
 }
 
 /*
  * Return the Rx mbuf size cached in the context (ifc_rx_mbuf_sz), as
  * last computed by iflib_calc_rx_mbuf_sz().
  */
 uint32_t
 iflib_get_rx_mbuf_sz(if_ctx_t ctx)
 {
 
 	return (ctx->ifc_rx_mbuf_sz);
 }
 
 /*
  * (Re)initialize the interface: quiesce it, program the hardware-assist
  * bits from the enabled capabilities, stop the per-queue timers, let the
  * driver initialize the hardware (IFDI_INIT), set up every free list that
  * is not in netmap mode, then mark the interface running and restart the
  * timers.
  *
  * NOTE(review): the name suggests the caller holds the context lock; that
  * is not visible in this function -- confirm against callers.
  *
  * NOTE(review): when iflib_fl_setup() fails the code still jumps to
  * 'done' and marks the interface IFF_DRV_RUNNING; only a message is
  * printed.
  */
 static void
 iflib_init_locked(if_ctx_t ctx)
 {
 	/* Both pointers below refer to the same softc context structure. */
 	if_softc_ctx_t sctx = &ctx->ifc_softc_ctx;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	if_t ifp = ctx->ifc_ifp;
 	iflib_fl_t fl;
 	iflib_txq_t txq;
 	iflib_rxq_t rxq;
 	int i, j, tx_ip_csum_flags, tx_ip6_csum_flags;
 
 	/* Set OACTIVE and clear RUNNING while reconfiguring. */
 	if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
 	IFDI_INTR_DISABLE(ctx);
 
 	/*
 	 * See iflib_stop(). Useful in case iflib_init_locked() is
 	 * called without first calling iflib_stop().
 	 */
 	netmap_disable_all_rings(ifp);
 
 	/* Split the driver's TX checksum capabilities into v4 and v6 sets. */
 	tx_ip_csum_flags = scctx->isc_tx_csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_SCTP);
 	tx_ip6_csum_flags = scctx->isc_tx_csum_flags & (CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_IP6_SCTP);
 	/* Set hardware offload abilities */
 	if_clearhwassist(ifp);
 	if (if_getcapenable(ifp) & IFCAP_TXCSUM)
 		if_sethwassistbits(ifp, tx_ip_csum_flags, 0);
 	if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6)
 		if_sethwassistbits(ifp,  tx_ip6_csum_flags, 0);
 	if (if_getcapenable(ifp) & IFCAP_TSO4)
 		if_sethwassistbits(ifp, CSUM_IP_TSO, 0);
 	if (if_getcapenable(ifp) & IFCAP_TSO6)
 		if_sethwassistbits(ifp, CSUM_IP6_TSO, 0);
 
 	/* Stop the per-queue timers and (re)initialize netmap TX state. */
 	for (i = 0, txq = ctx->ifc_txqs; i < sctx->isc_ntxqsets; i++, txq++) {
 		CALLOUT_LOCK(txq);
 		callout_stop(&txq->ift_timer);
 #ifdef DEV_NETMAP
 		callout_stop(&txq->ift_netmap_timer);
 #endif /* DEV_NETMAP */
 		CALLOUT_UNLOCK(txq);
 		(void)iflib_netmap_txq_init(ctx, txq);
 	}
 
 	/*
 	 * Calculate a suitable Rx mbuf size prior to calling IFDI_INIT, so
 	 * that drivers can use the value when setting up the hardware receive
 	 * buffers.
 	 */
 	iflib_calc_rx_mbuf_sz(ctx);
 
 	/* Under INVARIANTS, verify IFDI_INIT leaves the driver flags alone. */
 #ifdef INVARIANTS
 	i = if_getdrvflags(ifp);
 #endif
 	IFDI_INIT(ctx);
 	MPASS(if_getdrvflags(ifp) == i);
 	for (i = 0, rxq = ctx->ifc_rxqs; i < sctx->isc_nrxqsets; i++, rxq++) {
 		if (iflib_netmap_rxq_init(ctx, rxq) > 0) {
 			/* This rxq is in netmap mode. Skip normal init. */
 			continue;
 		}
 		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) {
 			if (iflib_fl_setup(fl)) {
 				device_printf(ctx->ifc_dev,
 				    "setting up free list %d failed - "
 				    "check cluster settings\n", j);
 				goto done;
 			}
 		}
 	}
 done:
 	/* Set RUNNING, clear OACTIVE, and restart interrupts and timers. */
 	if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_RUNNING, IFF_DRV_OACTIVE);
 	IFDI_INTR_ENABLE(ctx);
 	txq = ctx->ifc_txqs;
 	for (i = 0; i < sctx->isc_ntxqsets; i++, txq++)
 		callout_reset_on(&txq->ift_timer, iflib_timer_default, iflib_timer, txq,
 			txq->ift_timer.c_cpu);
 
 	/* Re-enable txsync/rxsync. */
 	netmap_enable_all_rings(ifp);
 }
 
 /*
  * Media-change handler: forward the request to the driver under the
  * context lock and, if the driver accepts it, re-run interface init.
  * Returns the driver's IFDI_MEDIA_CHANGE result.
  */
 static int
 iflib_media_change(if_t ifp)
 {
 	if_ctx_t ctx = if_getsoftc(ifp);
 	int err;
 
 	CTX_LOCK(ctx);
 	err = IFDI_MEDIA_CHANGE(ctx);
 	if (err == 0)
 		iflib_if_init_locked(ctx);
 	CTX_UNLOCK(ctx);
 
 	return (err);
 }
 
 /*
  * Media-status handler: under the context lock, have the driver refresh
  * its admin status and then fill in 'ifmr' via IFDI_MEDIA_STATUS.
  */
 static void
 iflib_media_status(if_t ifp, struct ifmediareq *ifmr)
 {
 	if_ctx_t ctx = if_getsoftc(ifp);
 
 	CTX_LOCK(ctx);
 	IFDI_UPDATE_ADMIN_STATUS(ctx);
 	IFDI_MEDIA_STATUS(ctx, ifmr);
 	CTX_UNLOCK(ctx);
 }
 
 /*
  * Stop the interface: mark it not running, disable interrupts, stop the
  * hardware (IFDI_STOP), quiesce netmap, then reset the software state of
  * every TX and RX queue set (timers stopped, buffers freed, indices and
  * statistics zeroed, descriptor rings wiped).
  */
 void
 iflib_stop(if_ctx_t ctx)
 {
 	iflib_txq_t txq = ctx->ifc_txqs;
 	iflib_rxq_t rxq = ctx->ifc_rxqs;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	iflib_dma_info_t di;
 	iflib_fl_t fl;
 	int i, j;
 
 	/* Tell the stack that the interface is no longer active */
 	if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
 
 	IFDI_INTR_DISABLE(ctx);
 	DELAY(1000);
 	IFDI_STOP(ctx);
 	DELAY(1000);
 
 	/*
 	 * Stop any pending txsync/rxsync and prevent new ones
 	 * from starting. Processes blocked in poll() will get
 	 * POLLERR.
 	 */
 	netmap_disable_all_rings(ctx->ifc_ifp);
 
 	iflib_debug_reset();
 	/* Wait for current tx queue users to exit to disarm watchdog timer. */
 	for (i = 0; i < scctx->isc_ntxqsets; i++, txq++) {
 		/* make sure all transmitters have completed before proceeding XXX */
 
 		CALLOUT_LOCK(txq);
 		callout_stop(&txq->ift_timer);
 #ifdef DEV_NETMAP
 		callout_stop(&txq->ift_netmap_timer);
 #endif /* DEV_NETMAP */
 		CALLOUT_UNLOCK(txq);
 
 		/* clean any enqueued buffers */
 		iflib_ifmp_purge(txq);
 		/* Free any existing tx buffers. */
 		for (j = 0; j < txq->ift_size; j++) {
 			iflib_txsd_free(ctx, txq, j);
 		}
 		txq->ift_processed = txq->ift_cleaned = txq->ift_cidx_processed = 0;
 		txq->ift_in_use = txq->ift_gen = txq->ift_no_desc_avail = 0;
 		/*
 		 * With IFLIB_PRESERVE_TX_INDICES the producer index is kept
 		 * and the consumer index is snapped to it; otherwise both
 		 * restart from zero.
 		 */
 		if (sctx->isc_flags & IFLIB_PRESERVE_TX_INDICES)
 			txq->ift_cidx = txq->ift_pidx;
 		else
 			txq->ift_cidx = txq->ift_pidx = 0;
 
 		/* Reset the per-queue statistics counters. */
 		txq->ift_closed = txq->ift_mbuf_defrag = txq->ift_mbuf_defrag_failed = 0;
 		txq->ift_no_tx_dma_setup = txq->ift_txd_encap_efbig = txq->ift_map_failed = 0;
 		txq->ift_pullups = 0;
 		ifmp_ring_reset_stats(txq->ift_br);
 		/* Wipe every TX descriptor ring of this queue set. */
 		for (j = 0, di = txq->ift_ifdi; j < sctx->isc_ntxqs; j++, di++)
 			bzero((void *)di->idi_vaddr, di->idi_size);
 	}
 	for (i = 0; i < scctx->isc_nrxqsets; i++, rxq++) {
 		/* Let an in-flight RX task finish, if a taskqueue is attached. */
 		if (rxq->ifr_task.gt_taskqueue != NULL)
 			gtaskqueue_drain(rxq->ifr_task.gt_taskqueue,
 				 &rxq->ifr_task.gt_task);
 
 		rxq->ifr_cq_cidx = 0;
 		/* Wipe every RX descriptor ring of this queue set. */
 		for (j = 0, di = rxq->ifr_ifdi; j < sctx->isc_nrxqs; j++, di++)
 			bzero((void *)di->idi_vaddr, di->idi_size);
 		/* also resets the free lists pidx/cidx */
 		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++)
 			iflib_fl_bufs_free(fl);
 	}
 }
 
 /*
  * Compute the address to prefetch after the descriptor at 'cidx':
  * CACHE_PTR_NEXT() of that descriptor's address, wrapping to the start
  * of the ring when the result falls at or past the ring's end.  When the
  * free list reports a descriptor size of zero the ring start is returned.
  */
 static inline caddr_t
 calc_next_rxd(iflib_fl_t fl, int cidx)
 {
 	caddr_t base, here, limit, candidate;
 	qidx_t rxd_sz;
 	int ring_len;
 
 	ring_len = fl->ifl_size;
 	rxd_sz = fl->ifl_rxd_size;
 	base = fl->ifl_ifdi->idi_vaddr;
 
 	if (__predict_false(rxd_sz == 0))
 		return (base);
 
 	here = base + rxd_sz*cidx;
 	limit = base + rxd_sz*ring_len;
 	candidate = CACHE_PTR_NEXT(here);
 	if (candidate < limit)
 		return (candidate);
 	return (base);
 }
 
 /*
  * Issue prefetches for state that is about to be consumed after 'cidx':
  * the mbuf/cluster slot-array entries CACHE_PTR_INCREMENT entries ahead,
  * the next descriptor address, and the mbufs and clusters of the next
  * four slots.  Indices are wrapped with a (ifl_size - 1) mask.
  */
 static inline void
 prefetch_pkts(iflib_fl_t fl, int cidx)
 {
 	int ring_len = fl->ifl_size;
 	int ahead, k;
 
 	ahead = (cidx + CACHE_PTR_INCREMENT) & (ring_len-1);
 	prefetch(&fl->ifl_sds.ifsd_m[ahead]);
 	prefetch(&fl->ifl_sds.ifsd_cl[ahead]);
 	prefetch(calc_next_rxd(fl, cidx));
 
 	/* The mbufs of the next four slots... */
 	for (k = 1; k <= 4; k++)
 		prefetch(fl->ifl_sds.ifsd_m[(cidx + k) & (ring_len-1)]);
 	/* ...followed by their clusters. */
 	for (k = 1; k <= 4; k++)
 		prefetch(fl->ifl_sds.ifsd_cl[(cidx + k) & (ring_len-1)]);
 }
 
 /*
  * Consume one received fragment from its free list.
  *
  * Fills in 'sd' with the owning free list and a pointer to the fragment's
  * cluster slot, drops one credit, syncs the buffer's DMA map, optionally
  * runs the inbound pfil hook on the payload, advances the free list's
  * consumer index and clears the slot's bit in the rx bitmap.
  *
  *  - unload: whether the DMA map should be unloaded; forced off when the
  *    filter dropped, consumed or reallocated the packet.  The map is only
  *    unloaded for non-zero-length fragments.
  *  - pf_rv: when non-NULL receives the pfil verdict (PFIL_PASS when no
  *    hook ran).  When NULL the pfil hook is skipped.
  *
  * Returns the slot's mbuf (detached from the slot) on PFIL_PASS or when
  * no hook ran, the mbuf produced by the filter on PFIL_REALLOCED, and
  * NULL on PFIL_DROPPED / PFIL_CONSUMED.
  *
  * NOTE(review): index wrapping uses "& (ifl_size - 1)", which presumably
  * relies on ifl_size being a power of two -- confirm.
  * NOTE(review): in the 'default' switch arm 'm' is not assigned; if
  * MPASS() compiles to nothing on non-INVARIANTS kernels, an unexpected
  * verdict would return an indeterminate pointer.
  */
 static struct mbuf *
 rxd_frag_to_sd(iflib_rxq_t rxq, if_rxd_frag_t irf, bool unload, if_rxsd_t sd,
     int *pf_rv, if_rxd_info_t ri)
 {
 	bus_dmamap_t map;
 	iflib_fl_t fl;
 	caddr_t payload;
 	struct mbuf *m;
 	int flid, cidx, len, next;
 
 	map = NULL;
 	flid = irf->irf_flid;
 	cidx = irf->irf_idx;
 	fl = &rxq->ifr_fl[flid];
 	sd->ifsd_fl = fl;
 	sd->ifsd_cl = &fl->ifl_sds.ifsd_cl[cidx];
 	/* This buffer is no longer available to the hardware. */
 	fl->ifl_credits--;
 #if MEMORY_LOGGING
 	fl->ifl_m_dequeued++;
 #endif
 	if (rxq->ifr_ctx->ifc_flags & IFC_PREFETCH)
 		prefetch_pkts(fl, cidx);
 	next = (cidx + CACHE_PTR_INCREMENT) & (fl->ifl_size-1);
 	prefetch(&fl->ifl_sds.ifsd_map[next]);
 	map = fl->ifl_sds.ifsd_map[cidx];
 
 	bus_dmamap_sync(fl->ifl_buf_tag, map, BUS_DMASYNC_POSTREAD);
 
 	/* Run the inbound filter directly on the DMA buffer contents. */
 	if (rxq->pfil != NULL && PFIL_HOOKED_IN(rxq->pfil) && pf_rv != NULL &&
 	    irf->irf_len != 0) {
 		payload  = *sd->ifsd_cl;
 		payload +=  ri->iri_pad;
 		len = ri->iri_len - ri->iri_pad;
 		*pf_rv = pfil_mem_in(rxq->pfil, payload, len, ri->iri_ifp, &m);
 		switch (*pf_rv) {
 		case PFIL_DROPPED:
 		case PFIL_CONSUMED:
 			/*
 			 * The filter ate it.  Everything is recycled.
 			 */
 			m = NULL;
 			unload = 0;
 			break;
 		case PFIL_REALLOCED:
 			/*
 			 * The filter copied it.  Everything is recycled.
 			 * 'm' points at new mbuf.
 			 */
 			unload = 0;
 			break;
 		case PFIL_PASS:
 			/*
 			 * Filter said it was OK, so receive like
 			 * normal
 			 */
 			m = fl->ifl_sds.ifsd_m[cidx];
 			fl->ifl_sds.ifsd_m[cidx] = NULL;
 			break;
 		default:
 			MPASS(0);
 		}
 	} else {
 		/* No filtering: hand out the slot's mbuf and detach it. */
 		m = fl->ifl_sds.ifsd_m[cidx];
 		fl->ifl_sds.ifsd_m[cidx] = NULL;
 		if (pf_rv != NULL)
 			*pf_rv = PFIL_PASS;
 	}
 
 	if (unload && irf->irf_len != 0)
 		bus_dmamap_unload(fl->ifl_buf_tag, map);
 	/* Advance the consumer index; reset ifl_gen when it wraps to 0. */
 	fl->ifl_cidx = (fl->ifl_cidx + 1) & (fl->ifl_size-1);
 	if (__predict_false(fl->ifl_cidx == 0))
 		fl->ifl_gen = 0;
 	bit_clear(fl->ifl_rx_bitmap, cidx);
 	return (m);
 }
 
 /*
  * Build an mbuf chain from all fragments described by 'ri'.
  *
  * Each fragment is consumed from its free list via rxd_frag_to_sd().
  * Usable fragments get their cluster attached to the slot's mbuf and are
  * linked through m_next; the first one becomes the packet header and
  * absorbs ri->iri_pad (which is also subtracted from ri->iri_len).
  * Zero-length fragments, and fragments of a packet the filter consumed
  * or dropped, are excluded from the chain.
  *
  * '*pf_rv' receives the pfil verdict.  Returns the head of the chain, or
  * NULL when no fragment was usable.
  */
 static struct mbuf *
 assemble_segments(iflib_rxq_t rxq, if_rxd_info_t ri, if_rxsd_t sd, int *pf_rv)
 {
 	struct mbuf *m, *mh, *mt;
 	caddr_t cl;
 	int  *pf_rv_ptr, flags, i, padlen;
 	bool consumed;
 
 	i = 0;
 	mh = NULL;
 	consumed = false;
 	*pf_rv = PFIL_PASS;
 	pf_rv_ptr = pf_rv;
 	do {
 		/* Once the packet is consumed, stop unloading DMA maps. */
 		m = rxd_frag_to_sd(rxq, &ri->iri_frags[i], !consumed, sd,
 		    pf_rv_ptr, ri);
 
 		MPASS(*sd->ifsd_cl != NULL);
 
 		/*
 		 * Exclude zero-length frags & frags from
 		 * packets the filter has consumed or dropped
 		 */
 		if (ri->iri_frags[i].irf_len == 0 || consumed ||
 		    *pf_rv == PFIL_CONSUMED || *pf_rv == PFIL_DROPPED) {
 			if (mh == NULL) {
 				/* everything saved here */
 				consumed = true;
 				/* A NULL pf_rv makes later frags skip pfil. */
 				pf_rv_ptr = NULL;
 				continue;
 			}
 			/* XXX we can save the cluster here, but not the mbuf */
 			m_init(m, M_NOWAIT, MT_DATA, 0);
 			m_free(m);
 			continue;
 		}
 		if (mh == NULL) {
 			flags = M_PKTHDR|M_EXT;
 			mh = mt = m;
 			padlen = ri->iri_pad;
 		} else {
 			flags = M_EXT;
 			mt->m_next = m;
 			mt = m;
 			/* assuming padding is only on the first fragment */
 			padlen = 0;
 		}
 		/* Transfer ownership of the cluster from the slot to the mbuf. */
 		cl = *sd->ifsd_cl;
 		*sd->ifsd_cl = NULL;
 
 		/* Can these two be made one ? */
 		m_init(m, M_NOWAIT, MT_DATA, flags);
 		m_cljset(m, cl, sd->ifsd_fl->ifl_cltype);
 		/*
 		 * These must follow m_init and m_cljset
 		 */
 		m->m_data += padlen;
 		ri->iri_len -= padlen;
 		m->m_len = ri->iri_frags[i].irf_len;
 	} while (++i < ri->iri_nfrags);
 
 	return (mh);
 }
 
 /*
  * Process one software descriptor
  *
  * Turns the fragments described by 'ri' into a packet mbuf and fills in
  * its packet header from 'ri'.
  *
  * A single non-empty fragment no longer than
  * MIN(IFLIB_RX_COPY_THRESH, MHLEN) is copied into the slot's mbuf so
  * that the cluster stays on the free list; anything else is built by
  * assemble_segments().
  *
  * Returns NULL when the packet was dropped or consumed by the filter (or
  * nothing usable was assembled); otherwise the packet mbuf.
  */
 static struct mbuf *
 iflib_rxd_pkt_get(iflib_rxq_t rxq, if_rxd_info_t ri)
 {
 	struct if_rxsd sd;
 	struct mbuf *m;
 	int pf_rv;
 
 	/* should I merge this back in now that the two paths are basically duplicated? */
 	if (ri->iri_nfrags == 1 &&
 	    ri->iri_frags[0].irf_len != 0 &&
 	    ri->iri_frags[0].irf_len <= MIN(IFLIB_RX_COPY_THRESH, MHLEN)) {
 		/* Small packet: keep the DMA map loaded (unload == false). */
 		m = rxd_frag_to_sd(rxq, &ri->iri_frags[0], false, &sd,
 		    &pf_rv, ri);
 		if (pf_rv != PFIL_PASS && pf_rv != PFIL_REALLOCED)
 			return (m);
 		if (pf_rv == PFIL_PASS) {
 			m_init(m, M_NOWAIT, MT_DATA, M_PKTHDR);
 #ifndef __NO_STRICT_ALIGNMENT
 			/* Shift the data by 2 bytes when it is not IP_ALIGNED. */
 			if (!IP_ALIGNED(m) && ri->iri_pad == 0)
 				m->m_data += 2;
 #endif
 			memcpy(m->m_data, *sd.ifsd_cl, ri->iri_len);
 			m->m_len = ri->iri_frags[0].irf_len;
 			m->m_data += ri->iri_pad;
 			ri->iri_len -= ri->iri_pad;
 		}
 	} else {
 		m = assemble_segments(rxq, ri, &sd, &pf_rv);
 		if (m == NULL)
 			return (NULL);
 		if (pf_rv != PFIL_PASS && pf_rv != PFIL_REALLOCED)
 			return (m);
 	}
 	/* Populate the packet header from the driver-supplied rx info. */
 	m->m_pkthdr.len = ri->iri_len;
 	m->m_pkthdr.rcvif = ri->iri_ifp;
 	m->m_flags |= ri->iri_flags;
 	m->m_pkthdr.ether_vtag = ri->iri_vtag;
 	m->m_pkthdr.flowid = ri->iri_flowid;
 	M_HASHTYPE_SET(m, ri->iri_rsstype);
 	m->m_pkthdr.csum_flags = ri->iri_csum_flags;
 	m->m_pkthdr.csum_data = ri->iri_csum_data;
 	return (m);
 }
 
 #if defined(INET6) || defined(INET)
 /*
  * Sample the IPv4/IPv6 forwarding settings of the vnet that owns the
  * LRO control block's interface.  Only the outputs for the compiled-in
  * address families are written.
  */
 static void
 iflib_get_ip_forwarding(struct lro_ctrl *lc, bool *v4, bool *v6)
 {
 	CURVNET_SET(if_getvnet(lc->ifp));
 #if defined(INET)
 	*v4 = V_ipforwarding;
 #endif
 #if defined(INET6)
 	*v6 = V_ip6_forwarding;
 #endif
 	CURVNET_RESTORE();
 }
 
 /*
  * Returns true if it's possible this packet could be LROed.
  * if it returns false, it is guaranteed that tcp_lro_rx()
  * would not return zero.
  *
  * Only the EtherType at the front of the mbuf is examined: an IPv4 or
  * IPv6 frame is a candidate unless forwarding is enabled for that family.
  */
 static bool
 iflib_check_lro_possible(struct mbuf *m, bool v4_forwarding, bool v6_forwarding)
 {
 	struct ether_header *hdr = mtod(m, struct ether_header *);
 
 #if defined (INET)
 	if (hdr->ether_type == htons(ETHERTYPE_IP))
 		return (!v4_forwarding);
 #endif
 #if defined(INET6)
 	if (hdr->ether_type == htons(ETHERTYPE_IPV6))
 		return (!v6_forwarding);
 #endif
 
 	return (false);
 }
 #else
 /* Neither INET nor INET6 is compiled in: nothing to report. */
 static void
 iflib_get_ip_forwarding(struct lro_ctrl *lc __unused, bool *v4 __unused, bool *v6 __unused)
 {
 }
 #endif
 
 /*
  * RX watchdog handler: 'context' is an iflib_rxq_t whose group task is
  * simply enqueued again.
  */
 static void
 _task_fn_rx_watchdog(void *context)
 {
 	iflib_rxq_t rxq = context;
 
 	GROUPTASK_ENQUEUE(&rxq->ifr_task);
 }
 
 /*
  * Harvest up to 'budget' received packets from an RX queue, refill its
  * free lists and hand the packets to the stack (through LRO when the
  * capability is enabled and the packet qualifies).
  *
  * Returns a bitmask: the OR of the values returned by
  * iflib_fl_refill_all() for each free list, plus IFLIB_RXEOF_MORE when
  * descriptors are still pending after the budget was spent.
  *
  * When the driver's isc_rxd_pkt_get() reports an error, a reset is
  * requested (IFC_DO_RESET) and 0 is returned.  On that path the vnet
  * context set for pfil is restored and any packets already harvested in
  * this call are freed, so neither the vnet nor the mbufs leak.
  *
  * Must be called inside a network epoch section (NET_EPOCH_ASSERT()).
  */
 static uint8_t
 iflib_rxeof(iflib_rxq_t rxq, qidx_t budget)
 {
 	if_t ifp;
 	if_ctx_t ctx = rxq->ifr_ctx;
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	int avail, i;
 	qidx_t *cidxp;
 	struct if_rxd_info ri;
 	int err, budget_left, rx_bytes, rx_pkts;
 	iflib_fl_t fl;
 	int lro_enabled;
 	bool v4_forwarding, v6_forwarding, lro_possible;
 	uint8_t retval = 0;
 
 	/*
 	 * XXX early demux data packets so that if_input processing only handles
 	 * acks in interrupt context
 	 */
 	struct mbuf *m, *mh, *mt, *mf;
 
 	NET_EPOCH_ASSERT();
 
 	lro_possible = v4_forwarding = v6_forwarding = false;
 	ifp = ctx->ifc_ifp;
 	mh = mt = NULL;
 	MPASS(budget > 0);
 	rx_pkts	= rx_bytes = 0;
 	/*
 	 * With a separate completion queue the consumer index lives in the
 	 * rxq; otherwise it is the consumer index of free list 0.
 	 */
 	if (sctx->isc_flags & IFLIB_HAS_RXCQ)
 		cidxp = &rxq->ifr_cq_cidx;
 	else
 		cidxp = &rxq->ifr_fl[0].ifl_cidx;
 	if ((avail = iflib_rxd_avail(ctx, rxq, *cidxp, budget)) == 0) {
 		/* Nothing received: just make sure the free lists are full. */
 		for (i = 0, fl = &rxq->ifr_fl[0]; i < sctx->isc_nfl; i++, fl++)
 			retval |= iflib_fl_refill_all(ctx, fl);
 		DBG_COUNTER_INC(rx_unavail);
 		return (retval);
 	}
 
 	/* pfil needs the vnet to be set */
 	CURVNET_SET_QUIET(if_getvnet(ifp));
 	for (budget_left = budget; budget_left > 0 && avail > 0;) {
 		if (__predict_false(!CTX_ACTIVE(ctx))) {
 			DBG_COUNTER_INC(rx_ctx_inactive);
 			break;
 		}
 		/*
 		 * Reset client set fields to their default values
 		 */
 		rxd_info_zero(&ri);
 		ri.iri_qsidx = rxq->ifr_id;
 		ri.iri_cidx = *cidxp;
 		ri.iri_ifp = ifp;
 		ri.iri_frags = rxq->ifr_frags;
 		err = ctx->isc_rxd_pkt_get(ctx->ifc_softc, &ri);
 
 		if (err)
 			goto err;
 		rx_pkts += 1;
 		rx_bytes += ri.iri_len;
 		if (sctx->isc_flags & IFLIB_HAS_RXCQ) {
 			*cidxp = ri.iri_cidx;
 			/* Update our consumer index */
 			/* XXX NB: shurd - check if this is still safe */
 			while (rxq->ifr_cq_cidx >= scctx->isc_nrxd[0])
 				rxq->ifr_cq_cidx -= scctx->isc_nrxd[0];
 			/* was this only a completion queue message? */
 			if (__predict_false(ri.iri_nfrags == 0))
 				continue;
 		}
 		MPASS(ri.iri_nfrags != 0);
 		MPASS(ri.iri_len != 0);
 
 		/* will advance the cidx on the corresponding free lists */
 		m = iflib_rxd_pkt_get(rxq, &ri);
 		avail--;
 		budget_left--;
 		/* Re-poll the hardware when the known batch is exhausted. */
 		if (avail == 0 && budget_left)
 			avail = iflib_rxd_avail(ctx, rxq, *cidxp, budget_left);
 
 		if (__predict_false(m == NULL))
 			continue;
 
 		/* imm_pkt: -- cxgb */
 		/* Queue the packet on the m_nextpkt-linked pending list. */
 		if (mh == NULL)
 			mh = mt = m;
 		else {
 			mt->m_nextpkt = m;
 			mt = m;
 		}
 	}
 	CURVNET_RESTORE();
 	/* make sure that we can refill faster than drain */
 	for (i = 0, fl = &rxq->ifr_fl[0]; i < sctx->isc_nfl; i++, fl++)
 		retval |= iflib_fl_refill_all(ctx, fl);
 
 	lro_enabled = (if_getcapenable(ifp) & IFCAP_LRO);
 	if (lro_enabled)
 		iflib_get_ip_forwarding(&rxq->ifr_lc, &v4_forwarding, &v6_forwarding);
 	/*
 	 * Deliver the pending list.  Until a packet is found for which LRO
 	 * is possible, packets are re-batched on the mf/mt list and passed
 	 * to if_input() as one chain; afterwards each packet is tried
 	 * against LRO and otherwise input individually.
 	 */
 	mt = mf = NULL;
 	while (mh != NULL) {
 		m = mh;
 		mh = mh->m_nextpkt;
 		m->m_nextpkt = NULL;
 #ifndef __NO_STRICT_ALIGNMENT
 		if (!IP_ALIGNED(m) && (m = iflib_fixup_rx(m)) == NULL)
 			continue;
 #endif
 #if defined(INET6) || defined(INET)
 		if (lro_enabled) {
 			if (!lro_possible) {
 				lro_possible = iflib_check_lro_possible(m, v4_forwarding, v6_forwarding);
 				/* Flush the batch first to preserve ordering. */
 				if (lro_possible && mf != NULL) {
 					if_input(ifp, mf);
 					DBG_COUNTER_INC(rx_if_input);
 					mt = mf = NULL;
 				}
 			}
 			/* Only packets with a verified L4 checksum go to LRO. */
 			if ((m->m_pkthdr.csum_flags & (CSUM_L4_CALC|CSUM_L4_VALID)) ==
 			    (CSUM_L4_CALC|CSUM_L4_VALID)) {
 				if (lro_possible && tcp_lro_rx(&rxq->ifr_lc, m, 0) == 0)
 					continue;
 			}
 		}
 #endif
 		if (lro_possible) {
 			if_input(ifp, m);
 			DBG_COUNTER_INC(rx_if_input);
 			continue;
 		}
 
 		if (mf == NULL)
 			mf = m;
 		if (mt != NULL)
 			mt->m_nextpkt = m;
 		mt = m;
 	}
 	if (mf != NULL) {
 		if_input(ifp, mf);
 		DBG_COUNTER_INC(rx_if_input);
 	}
 
 	if_inc_counter(ifp, IFCOUNTER_IBYTES, rx_bytes);
 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, rx_pkts);
 
 	/*
 	 * Flush any outstanding LRO work
 	 */
 #if defined(INET6) || defined(INET)
 	tcp_lro_flush_all(&rxq->ifr_lc);
 #endif
 	if (avail != 0 || iflib_rxd_avail(ctx, rxq, *cidxp, 1) != 0)
 		retval |= IFLIB_RXEOF_MORE;
 	return (retval);
 err:
 	/*
 	 * Only reached from inside the harvest loop, i.e. after
 	 * CURVNET_SET_QUIET(): restore the previous vnet before returning.
 	 */
 	CURVNET_RESTORE();
 	/*
 	 * Packets harvested before the failure will never be delivered;
 	 * free them instead of leaking them.
 	 */
 	while (mh != NULL) {
 		m = mh;
 		mh = mh->m_nextpkt;
 		m->m_nextpkt = NULL;
 		m_freem(m);
 	}
 	STATE_LOCK(ctx);
 	ctx->ifc_flags |= IFC_DO_RESET;
 	iflib_admin_intr_deferred(ctx);
 	STATE_UNLOCK(ctx);
 	return (0);
 }
 
 #define TXD_NOTIFY_COUNT(txq) (((txq)->ift_size / (txq)->ift_update_freq)-1)
 /*
  * Choose a doorbell deferral limit from ring occupancy: the fuller the
  * ring ('in_use' measured in eighths of the ring size), the larger the
  * share of TXD_NOTIFY_COUNT() that is returned.  At or below one eighth
  * the limit is 0.
  */
 static inline qidx_t
 txq_max_db_deferred(iflib_txq_t txq, qidx_t in_use)
 {
 	qidx_t full = TXD_NOTIFY_COUNT(txq);
 	qidx_t eighth = txq->ift_size / 8;
 	qidx_t limit;
 
 	if (in_use > 4*eighth)
 		limit = full;
 	else if (in_use > 2*eighth)
 		limit = full >> 1;
 	else if (in_use > eighth)
 		limit = full >> 3;
 	else
 		limit = 0;
 
 	return (limit);
 }
 
 /*
  * Choose a report-status deferral limit from the queue's own occupancy
  * (ift_in_use, measured in eighths of the ring size).  At or below one
  * eighth the limit is 2.
  */
 static inline qidx_t
 txq_max_rs_deferred(iflib_txq_t txq)
 {
 	qidx_t full = TXD_NOTIFY_COUNT(txq);
 	qidx_t eighth = txq->ift_size / 8;
 	qidx_t limit;
 
 	if (txq->ift_in_use > 4*eighth)
 		limit = full;
 	else if (txq->ift_in_use > 2*eighth)
 		limit = full >> 1;
 	else if (txq->ift_in_use > eighth)
 		limit = full >> 2;
 	else
 		limit = 2;
 
 	return (limit);
 }
 
 /*
  * mbuf and TX queue helper macros.  Every macro argument is parenthesized
  * so that expression arguments (e.g. "a | b" or "p + 1") expand with the
  * intended precedence.
  */
 #define M_CSUM_FLAGS(m) ((m)->m_pkthdr.csum_flags)
 #define M_HAS_VLANTAG(m) ((m)->m_flags & M_VLANTAG)
 
 #define TXQ_MAX_DB_DEFERRED(txq, in_use) txq_max_db_deferred((txq), (in_use))
 #define TXQ_MAX_RS_DEFERRED(txq) txq_max_rs_deferred(txq)
 /* One sixteenth of 'size'. */
 #define TXQ_MAX_DB_CONSUMED(size) ((size) >> 4)
 
 /* forward compatibility for cxgb */
 #define FIRST_QSET(ctx) 0
 #define NTXQSETS(ctx) ((ctx)->ifc_softc_ctx.isc_ntxqsets)
 #define NRXQSETS(ctx) ((ctx)->ifc_softc_ctx.isc_nrxqsets)
 /* TX queue set for an mbuf: masked flowid modulo the number of TX sets. */
 #define QIDX(ctx, m) ((((m)->m_pkthdr.flowid & (ctx)->ifc_softc_ctx.isc_rss_table_mask) % NTXQSETS(ctx)) + FIRST_QSET(ctx))
 /* Processed-but-not-cleaned descriptors, less isc_tx_nsegments. */
 #define DESC_RECLAIMABLE(q) ((int)((q)->ift_processed - (q)->ift_cleaned - (q)->ift_ctx->ifc_softc_ctx.isc_tx_nsegments))
 
 /* XXX we should be setting this to something other than zero */
 #define RECLAIM_THRESH(ctx) ((ctx)->ifc_sctx->isc_tx_reclaim_thresh)
 #define	MAX_TX_DESC(ctx) MAX((ctx)->ifc_softc_ctx.isc_tx_tso_segments_max, \
     (ctx)->ifc_softc_ctx.isc_tx_nsegments)
 
 /*
  * Decide whether the TX doorbell must be rung now and, if so, ring it.
  *
  * The doorbell is rung when 'ring' forces it, when the number of pending
  * doorbell updates has reached the deferral limit, or when the ring is
  * close to full.  Returns true when the doorbell was rung.
  */
 static inline bool
 iflib_txd_db_check(iflib_txq_t txq, int ring)
 {
 	if_ctx_t ctx = txq->ift_ctx;
 	qidx_t dbval, defer_limit;
 
 	defer_limit = TXQ_MAX_DB_DEFERRED(txq, txq->ift_in_use);
 
 	/* not forced && below threshold && not at the edge of the ring */
 	if (!ring && (txq->ift_db_pending < defer_limit) &&
 	    (TXQ_AVAIL(txq) > MAX_TX_DESC(ctx) + 2))
 		return (false);
 
 	/*
 	 * 'npending' is used if the card's doorbell is in terms of the
 	 * number of descriptors pending flush (BRCM). 'pidx' is used in
 	 * cases where the card's doorbell uses the producer index
 	 * explicitly (INTC).
 	 */
 	if (txq->ift_npending)
 		dbval = txq->ift_npending;
 	else
 		dbval = txq->ift_pidx;
 
 	bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 	ctx->isc_txd_flush(ctx->ifc_softc, txq->ift_id, dbval);
 
 	/*
 	 * Absent bugs there are zero packets pending so reset pending
 	 * counts to zero.
 	 */
 	txq->ift_npending = 0;
 	txq->ift_db_pending = 0;
 
 	return (true);
 }
 
 #ifdef PKT_DEBUG
 /* Debug helper: print the fields of a TX packet info structure. */
 static void
 print_pkt(if_pkt_info_t pi)
 {
 	printf("pi len:  %d qsidx: %d nsegs: %d ndescs: %d flags: %x pidx: %d\n",
 	    pi->ipi_len, pi->ipi_qsidx, pi->ipi_nsegs,
 	    pi->ipi_ndescs, pi->ipi_flags, pi->ipi_pidx);
 	printf("pi new_pidx: %d csum_flags: %lx tso_segsz: %d mflags: %x vtag: %d\n",
 	    pi->ipi_new_pidx, pi->ipi_csum_flags, pi->ipi_tso_segsz,
 	    pi->ipi_mflags, pi->ipi_vtag);
 	printf("pi etype: %d ehdrlen: %d ip_hlen: %d ipproto: %d\n",
 	    pi->ipi_etype, pi->ipi_ehdrlen,
 	    pi->ipi_ip_hlen, pi->ipi_ipproto);
 }
 #endif
 
 /*
  * Predicates on a packet info's checksum flags: whether TSO, or any
  * TCP checksum/TSO offload, was requested for IPv4 (…4) or IPv6 (…6).
  */
 #define IS_TSO4(pi) ((pi)->ipi_csum_flags & CSUM_IP_TSO)
 #define IS_TX_OFFLOAD4(pi) ((pi)->ipi_csum_flags & (CSUM_IP_TCP | CSUM_IP_TSO))
 #define IS_TSO6(pi) ((pi)->ipi_csum_flags & CSUM_IP6_TSO)
 #define IS_TX_OFFLOAD6(pi) ((pi)->ipi_csum_flags & (CSUM_IP6_TCP | CSUM_IP6_TSO))
 
 /**
  * Extract Ethernet header information from the front of an mbuf chain.
  *
  * On success pi->ipi_etype holds the EtherType (host byte order) and
  * pi->ipi_ehdrlen the Ethernet header length; a single VLAN tag is
  * accounted for.  If the first mbuf is too short the header is pulled
  * up, *pullups is incremented and *mp is updated to the new head.
  *
  * Returns 0 on success or ENOMEM when the pullup fails.
  *
  * XXX: This doesn't handle QinQ, which could prevent TX offloads for those
  * types of packets.
  */
 static int
 iflib_parse_ether_header(if_pkt_info_t pi, struct mbuf **mp, uint64_t *pullups)
 {
 	struct ether_vlan_header *evh;
 	struct mbuf *head = *mp;
 
 	if (__predict_false(head->m_len < sizeof(*evh))) {
 		(*pullups)++;
 		head = m_pullup(head, sizeof(*evh));
 		if (__predict_false(head == NULL))
 			return (ENOMEM);
 	}
 
 	evh = mtod(head, struct ether_vlan_header *);
 	if (evh->evl_encap_proto != htons(ETHERTYPE_VLAN)) {
 		/* Untagged frame: the outer type is the payload type. */
 		pi->ipi_ehdrlen = ETHER_HDR_LEN;
 		pi->ipi_etype = ntohs(evh->evl_encap_proto);
 	} else {
 		/* VLAN tagged: the payload type follows the tag. */
 		pi->ipi_ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
 		pi->ipi_etype = ntohs(evh->evl_proto);
 	}
 	*mp = head;
 
 	return (0);
 }
 
 /**
  * Parse up to the L3 header and extract IPv4/IPv6 header information into pi.
  * Currently this information includes: IP ToS value, IP header version/presence
  *
  * This is missing some checks and doesn't edit the packet content as it goes,
  * unlike iflib_parse_header(), in order to keep the amount of code here minimal.
  *
  * pi:      filled with ipi_etype/ipi_ehdrlen and, for IPv4/IPv6,
  *          ipi_ip_hlen, ipi_ipproto, ipi_ip_tos and the IPI_TX_IPV4 /
  *          IPI_TX_IPV6 flag.  For any other EtherType the CSUM_OFFLOAD
  *          bits are cleared from ipi_csum_flags and ipi_ip_hlen is 0.
  * mp:      in/out head of the packet; replaced when the chain has to be
  *          duplicated (not writable) or pulled up.
  * pullups: reset to 0 on entry, then incremented for every m_pullup().
  *
  * Returns 0 on success, ENOMEM when duplicating or pulling up fails, or
  * the error from iflib_parse_ether_header().
  */
 static int
 iflib_parse_header_partial(if_pkt_info_t pi, struct mbuf **mp, uint64_t *pullups)
 {
 	struct mbuf *m;
 	int err;
 
 	*pullups = 0;
 	m = *mp;
 	/* Work on a private copy when the chain is not writable. */
 	if (!M_WRITABLE(m)) {
 		if ((m = m_dup(m, M_NOWAIT)) == NULL) {
 			return (ENOMEM);
 		} else {
 			m_freem(*mp);
 			DBG_COUNTER_INC(tx_frees);
 			*mp = m;
 		}
 	}
 
 	/* Fills out pi->ipi_etype */
 	err = iflib_parse_ether_header(pi, mp, pullups);
 	if (err)
 		return (err);
 	/* The Ethernet parse may have replaced the head mbuf. */
 	m = *mp;
 
 	switch (pi->ipi_etype) {
 #ifdef INET
 	case ETHERTYPE_IP:
 	{
 		struct mbuf *n;
 		struct ip *ip = NULL;
 		int miniplen;
 
 		/* Bytes wanted in the first mbuf, capped at the packet length. */
 		miniplen = min(m->m_pkthdr.len, pi->ipi_ehdrlen + sizeof(*ip));
 		if (__predict_false(m->m_len < miniplen)) {
 			/*
 			 * Check for common case where the first mbuf only contains
 			 * the Ethernet header
 			 */
 			if (m->m_len == pi->ipi_ehdrlen) {
 				n = m->m_next;
 				MPASS(n);
 				/* If next mbuf contains at least the minimal IP header, then stop */
 				if (n->m_len >= sizeof(*ip)) {
 					ip = (struct ip *)n->m_data;
 				} else {
 					(*pullups)++;
 					if (__predict_false((m = m_pullup(m, miniplen)) == NULL))
 						return (ENOMEM);
 					ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
 				}
 			} else {
 				(*pullups)++;
 				if (__predict_false((m = m_pullup(m, miniplen)) == NULL))
 					return (ENOMEM);
 				ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
 			}
 		} else {
 			ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
 		}
 
 		/* Have the IPv4 header w/ no options here */
 		pi->ipi_ip_hlen = ip->ip_hl << 2;
 		pi->ipi_ipproto = ip->ip_p;
 		pi->ipi_ip_tos = ip->ip_tos;
 		pi->ipi_flags |= IPI_TX_IPV4;
 
 		break;
 	}
 #endif
 #ifdef INET6
 	case ETHERTYPE_IPV6:
 	{
 		struct ip6_hdr *ip6;
 
 		/* Make the fixed IPv6 header contiguous in the first mbuf. */
 		if (__predict_false(m->m_len < pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) {
 			(*pullups)++;
 			if (__predict_false((m = m_pullup(m, pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) == NULL))
 				return (ENOMEM);
 		}
 		ip6 = (struct ip6_hdr *)(m->m_data + pi->ipi_ehdrlen);
 
 		/* Have the IPv6 fixed header here */
 		pi->ipi_ip_hlen = sizeof(struct ip6_hdr);
 		pi->ipi_ipproto = ip6->ip6_nxt;
 		pi->ipi_ip_tos = IPV6_TRAFFIC_CLASS(ip6);
 		pi->ipi_flags |= IPI_TX_IPV6;
 
 		break;
 	}
 #endif
 	default:
 		/* Not IP: no checksum offload can apply to this packet. */
 		pi->ipi_csum_flags &= ~CSUM_OFFLOAD;
 		pi->ipi_ip_hlen = 0;
 		break;
 	}
 	*mp = m;
 
 	return (0);
 
 }
 
 /*
  * Full TX header parse used on the encap path.
  *
  * Fills pi with the Ethernet type/length (via iflib_parse_ether_header()),
  * the IP header length, protocol and ToS, and, when checksum offload or TSO
  * is requested in pi->ipi_csum_flags, the TCP header length, flags and
  * sequence number.  For TSO it also seeds th_sum with the pseudo-header
  * checksum and may rewrite ip_sum/ip_len, i.e. it edits the packet.
  *
  * mp is in/out: the chain head may be replaced by m_dup()/m_pullup().
  * Pullups are counted in txq->ift_pullups.
  *
  * Returns 0 on success, ENOMEM when m_dup()/m_pullup() fails, ENXIO when
  * TSO is requested on a non-TCP packet, or the error from
  * iflib_parse_ether_header().  After a failed m_pullup() the chain has been
  * freed and *mp is NULL; on ENXIO *mp is the current (still valid) chain.
  *
  * Header pointers (ip/ip6/th) are always re-derived after an m_pullup(),
  * because m_pullup() may move the data into a different mbuf.
  */
 static int
 iflib_parse_header(iflib_txq_t txq, if_pkt_info_t pi, struct mbuf **mp)
 {
 	if_shared_ctx_t sctx = txq->ift_ctx->ifc_sctx;
 	struct mbuf *m;
 	int err;
 
 	m = *mp;
 	if ((sctx->isc_flags & IFLIB_NEED_SCRATCH) &&
 	    M_WRITABLE(m) == 0) {
 		/*
 		 * NOTE(review): on m_dup() failure the original chain is left
 		 * untouched in *mp; confirm the caller disposes of it.
 		 */
 		if ((m = m_dup(m, M_NOWAIT)) == NULL) {
 			return (ENOMEM);
 		} else {
 			m_freem(*mp);
 			DBG_COUNTER_INC(tx_frees);
 			*mp = m;
 		}
 	}
 
 	/* Fills out pi->ipi_etype */
 	err = iflib_parse_ether_header(pi, mp, &txq->ift_pullups);
 	if (__predict_false(err))
 		return (err);
 	m = *mp;
 
 	switch (pi->ipi_etype) {
 #ifdef INET
 	case ETHERTYPE_IP:
 	{
 		struct mbuf *n;
 		struct ip *ip = NULL;
 		struct tcphdr *th = NULL;
 		int minthlen;
 
 		minthlen = min(m->m_pkthdr.len, pi->ipi_ehdrlen + sizeof(*ip) + sizeof(*th));
 		if (__predict_false(m->m_len < minthlen)) {
 			/*
 			 * if this code bloat is causing too much of a hit
 			 * move it to a separate function and mark it noinline
 			 */
 			if (m->m_len == pi->ipi_ehdrlen) {
 				/* First mbuf holds only the Ethernet header. */
 				n = m->m_next;
 				MPASS(n);
 				if (n->m_len >= sizeof(*ip))  {
 					ip = (struct ip *)n->m_data;
 					if (n->m_len >= (ip->ip_hl << 2) + sizeof(*th))
 						th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
 				} else {
 					txq->ift_pullups++;
 					if (__predict_false((m = m_pullup(m, minthlen)) == NULL)) {
 						*mp = NULL;
 						return (ENOMEM);
 					}
 					ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
 				}
 			} else {
 				txq->ift_pullups++;
 				if (__predict_false((m = m_pullup(m, minthlen)) == NULL)) {
 					*mp = NULL;
 					return (ENOMEM);
 				}
 				ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
 				/* th sits after the Ethernet AND IP headers in this mbuf. */
 				if (m->m_len >= pi->ipi_ehdrlen + (ip->ip_hl << 2) + sizeof(*th))
 					th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
 			}
 		} else {
 			ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
 			/* th sits after the Ethernet AND IP headers in this mbuf. */
 			if (m->m_len >= pi->ipi_ehdrlen + (ip->ip_hl << 2) + sizeof(*th))
 				th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
 		}
 		pi->ipi_ip_hlen = ip->ip_hl << 2;
 		pi->ipi_ipproto = ip->ip_p;
 		pi->ipi_ip_tos = ip->ip_tos;
 		pi->ipi_flags |= IPI_TX_IPV4;
 
 		/* TCP checksum offload may require TCP header length */
 		if (IS_TX_OFFLOAD4(pi)) {
 			if (__predict_true(pi->ipi_ipproto == IPPROTO_TCP)) {
 				if (__predict_false(th == NULL)) {
 					/*
 					 * Make Ethernet + IP + TCP headers contiguous
 					 * in the first mbuf, then re-derive both
 					 * pointers from the (possibly new) head.
 					 */
 					txq->ift_pullups++;
 					m = m_pullup(m, pi->ipi_ehdrlen +
 					    pi->ipi_ip_hlen + sizeof(*th));
 					if (__predict_false(m == NULL)) {
 						*mp = NULL;
 						return (ENOMEM);
 					}
 					ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
 					th = (struct tcphdr *)((caddr_t)ip + pi->ipi_ip_hlen);
 				}
 				pi->ipi_tcp_hflags = th->th_flags;
 				pi->ipi_tcp_hlen = th->th_off << 2;
 				pi->ipi_tcp_seq = th->th_seq;
 			}
 			if (IS_TSO4(pi)) {
 				if (__predict_false(ip->ip_p != IPPROTO_TCP)) {
 					/* Hand back the current head, not a stale one. */
 					*mp = m;
 					return (ENXIO);
 				}
 				/*
 				 * TSO always requires hardware checksum offload.
 				 */
 				pi->ipi_csum_flags |= (CSUM_IP_TCP | CSUM_IP);
 				th->th_sum = in_pseudo(ip->ip_src.s_addr,
 						       ip->ip_dst.s_addr, htons(IPPROTO_TCP));
 				pi->ipi_tso_segsz = m->m_pkthdr.tso_segsz;
 				if (sctx->isc_flags & IFLIB_TSO_INIT_IP) {
 					ip->ip_sum = 0;
 					ip->ip_len = htons(pi->ipi_ip_hlen + pi->ipi_tcp_hlen + pi->ipi_tso_segsz);
 				}
 			}
 		}
 		if ((sctx->isc_flags & IFLIB_NEED_ZERO_CSUM) && (pi->ipi_csum_flags & CSUM_IP))
 			ip->ip_sum = 0;
 
 		break;
 	}
 #endif
 #ifdef INET6
 	case ETHERTYPE_IPV6:
 	{
 		struct ip6_hdr *ip6;
 		struct tcphdr *th;
 
 		pi->ipi_ip_hlen = sizeof(struct ip6_hdr);
 
 		if (__predict_false(m->m_len < pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) {
 			txq->ift_pullups++;
 			if (__predict_false((m = m_pullup(m, pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) == NULL)) {
 				*mp = NULL;
 				return (ENOMEM);
 			}
 		}
 		/* Derive header pointers only after the pullup. */
 		ip6 = (struct ip6_hdr *)(m->m_data + pi->ipi_ehdrlen);
 		th = (struct tcphdr *)((caddr_t)ip6 + pi->ipi_ip_hlen);
 
 		/* XXX-BZ this will go badly in case of ext hdrs. */
 		pi->ipi_ipproto = ip6->ip6_nxt;
 		pi->ipi_ip_tos = IPV6_TRAFFIC_CLASS(ip6);
 		pi->ipi_flags |= IPI_TX_IPV6;
 
 		/* TCP checksum offload may require TCP header length */
 		if (IS_TX_OFFLOAD6(pi)) {
 			if (pi->ipi_ipproto == IPPROTO_TCP) {
 				if (__predict_false(m->m_len < pi->ipi_ehdrlen + sizeof(struct ip6_hdr) + sizeof(struct tcphdr))) {
 					txq->ift_pullups++;
 					if (__predict_false((m = m_pullup(m, pi->ipi_ehdrlen + sizeof(struct ip6_hdr) + sizeof(struct tcphdr))) == NULL)) {
 						*mp = NULL;
 						return (ENOMEM);
 					}
 					/* The pullup may have moved the headers. */
 					ip6 = (struct ip6_hdr *)(m->m_data + pi->ipi_ehdrlen);
 					th = (struct tcphdr *)((caddr_t)ip6 + pi->ipi_ip_hlen);
 				}
 				pi->ipi_tcp_hflags = th->th_flags;
 				pi->ipi_tcp_hlen = th->th_off << 2;
 				pi->ipi_tcp_seq = th->th_seq;
 			}
 			if (IS_TSO6(pi)) {
 				if (__predict_false(ip6->ip6_nxt != IPPROTO_TCP)) {
 					/* Hand back the current head, not a stale one. */
 					*mp = m;
 					return (ENXIO);
 				}
 				/*
 				 * TSO always requires hardware checksum offload.
 				 */
 				pi->ipi_csum_flags |= CSUM_IP6_TCP;
 				th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
 				pi->ipi_tso_segsz = m->m_pkthdr.tso_segsz;
 			}
 		}
 		break;
 	}
 #endif
 	default:
 		/* Not IP: nothing to offload and no IP header length. */
 		pi->ipi_csum_flags &= ~CSUM_OFFLOAD;
 		pi->ipi_ip_hlen = 0;
 		break;
 	}
 	*mp = m;
 
 	return (0);
 }
 
 /*
  * If dodgy hardware rejects the scatter gather chain we've handed it
  * we'll need to remove the mbuf chain from ifsd_m[] before we can add the
  * m_defrag'd mbufs
  */
 static __noinline struct mbuf *
 iflib_remove_mbuf(iflib_txq_t txq)
 {
 	struct mbuf **slots, *removed;
 	int ring_size, slot;
 
 	/* Locate the software descriptor at the current producer index. */
 	ring_size = txq->ift_size;
 	slot = txq->ift_pidx & (ring_size - 1);
 	slots = txq->ift_sds.ifsd_m;
 
 	/* Detach the mbuf from the slot and hand it back to the caller. */
 	removed = slots[slot];
 	slots[slot] = NULL;
 
 	/* Drop the DMA mappings for that slot (regular and, if present, TSO). */
 	bus_dmamap_unload(txq->ift_buf_tag, txq->ift_sds.ifsd_map[slot]);
 	if (txq->ift_sds.ifsd_tso_map != NULL)
 		bus_dmamap_unload(txq->ift_tso_buf_tag,
 		    txq->ift_sds.ifsd_tso_map[slot]);
 #if MEMORY_LOGGING
 	txq->ift_dequeued++;
 #endif
 	return (removed);
 }
 
 /*
  * Compute the address to prefetch for descriptor ring `qid`: the
  * CACHE_PTR_NEXT() of descriptor `cidx`, or the ring base when that would
  * reach or pass the end of the ring or when the descriptor size is 0.
  */
 static inline caddr_t
 calc_next_txd(iflib_txq_t txq, int cidx, uint8_t qid)
 {
 	caddr_t base, limit, cur, candidate;
 	qidx_t txd_size;
 	int ndesc;
 
 	ndesc = txq->ift_size;
 	txd_size = txq->ift_txd_size[qid];
 	base = txq->ift_ifdi[qid].idi_vaddr;
 	if (__predict_false(txd_size == 0))
 		return (base);
 
 	cur = base + txd_size*cidx;
 	limit = base + txd_size*ndesc;
 	candidate = CACHE_PTR_NEXT(cur);
 	if (candidate < limit)
 		return (candidate);
 	return (base);
 }
 
 /*
  * Pad an mbuf to ensure a minimum ethernet frame size.
  * min_frame_size is the frame size (less CRC) to pad the mbuf to
  */
 /*
  * dev is used only for diagnostics.  *m_head may be replaced by a writable
  * copy of the chain.
  *
  * Returns 0 when the frame is at least min_frame_size bytes long on exit.
  * On failure returns ENOMEM; the chain has then been freed and *m_head is
  * NULL.  ENOBUFS is deliberately not returned: the chain is gone, so the
  * error must never be mistaken for a "no room, retry this packet later"
  * condition.
  */
 static __noinline int
 iflib_ether_pad(device_t dev, struct mbuf **m_head, uint16_t min_frame_size)
 {
 	/*
 	 * 18 is enough bytes to pad an ARP packet to 46 bytes, and
 	 * an ARP message is the smallest common payload I can think of
 	 */
 	static char pad[18];	/* just zeros */
 	int n;
 	struct mbuf *new_head;
 
 	if (!M_WRITABLE(*m_head)) {
 		/* Padding appends data, so work on a private copy. */
 		new_head = m_dup(*m_head, M_NOWAIT);
 		m_freem(*m_head);
 		*m_head = new_head;
 		if (new_head == NULL) {
 			device_printf(dev, "cannot pad short frame, m_dup() failed\n");
 			DBG_COUNTER_INC(encap_pad_mbuf_fail);
 			DBG_COUNTER_INC(tx_frees);
 			return (ENOMEM);
 		}
 	}
 
 	/* Append zeros, at most sizeof(pad) bytes per call, until long enough. */
 	for (n = min_frame_size - (*m_head)->m_pkthdr.len;
 	     n > 0; n -= sizeof(pad))
 		if (!m_append(*m_head, min(n, sizeof(pad)), pad))
 			break;
 
 	/* n is still positive only if m_append() failed part-way. */
 	if (n > 0) {
 		m_freem(*m_head);
 		*m_head = NULL;
 		device_printf(dev, "cannot pad short frame\n");
 		DBG_COUNTER_INC(encap_pad_mbuf_fail);
 		DBG_COUNTER_INC(tx_frees);
 		return (ENOMEM);
 	}
 
 	return (0);
 }
 
 /*
  * Encapsulate one packet (*m_headp) onto TX queue txq.
  *
  * Steps, in order: optional prefetching, optional padding to the minimum
  * frame size, header parsing into a struct if_pkt_info (only when checksum
  * flags or a VLAN tag are present), DMA mapping of the chain (with one
  * m_collapse() and one m_defrag() attempt on EFBIG), a descriptor
  * availability check, and finally the driver's isc_txd_encap() callback.
  * On success the queue's producer index and pending counters are advanced.
  *
  * *m_headp may be replaced (padding, header parsing, collapse/defrag).
  *
  * Returns 0 on success.  Error returns visible here:
  *  - the error from iflib_ether_pad() or iflib_parse_header();
  *  - the bus_dmamap_load_mbuf_sg() error (for errors other than EFBIG and
  *    ENOMEM the chain is freed and *m_headp set to NULL);
  *  - ENOBUFS when fewer than nsegs + 2 descriptors are available (the TX
  *    task is enqueued and the chain is left with the caller);
  *  - ENOMEM via defrag_failed (chain freed, *m_headp set to NULL).
  */
 static int
 iflib_encap(iflib_txq_t txq, struct mbuf **m_headp)
 {
 	if_ctx_t		ctx;
 	if_shared_ctx_t		sctx;
 	if_softc_ctx_t		scctx;
 	bus_dma_tag_t		buf_tag;
 	bus_dma_segment_t	*segs;
 	struct mbuf		*m_head, **ifsd_m;
 	void			*next_txd;
 	bus_dmamap_t		map;
 	struct if_pkt_info	pi;
 	int remap = 0;	/* 0: nothing tried, 1: try m_defrag next, 2: exhausted */
 	int err, nsegs, ndesc, max_segs, pidx, cidx, next, ntxd;
 
 	ctx = txq->ift_ctx;
 	sctx = ctx->ifc_sctx;
 	scctx = &ctx->ifc_softc_ctx;
 	segs = txq->ift_segs;
 	ntxd = txq->ift_size;
 	m_head = *m_headp;
 	map = NULL;
 
 	/*
 	 * If we're doing TSO the next descriptor to clean may be quite far ahead
 	 */
 	cidx = txq->ift_cidx;
 	pidx = txq->ift_pidx;
 	if (ctx->ifc_flags & IFC_PREFETCH) {
 		next = (cidx + CACHE_PTR_INCREMENT) & (ntxd-1);
 		/*
 		 * NOTE(review): IFLIB_HAS_TXCQ is tested against ctx->ifc_flags
 		 * here; confirm that is the flag word it belongs to.
 		 */
 		if (!(ctx->ifc_flags & IFLIB_HAS_TXCQ)) {
 			next_txd = calc_next_txd(txq, cidx, 0);
 			prefetch(next_txd);
 		}
 
 		/* prefetch the next cache line of mbuf pointers and flags */
 		prefetch(&txq->ift_sds.ifsd_m[next]);
 		prefetch(&txq->ift_sds.ifsd_map[next]);
 		/* The value stored below is not read again in this function. */
 		next = (cidx + CACHE_LINE_SIZE) & (ntxd-1);
 	}
 	map = txq->ift_sds.ifsd_map[pidx];
 	ifsd_m = txq->ift_sds.ifsd_m;
 
 	/* TSO packets use their own DMA tag, map array and segment limit. */
 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
 		buf_tag = txq->ift_tso_buf_tag;
 		max_segs = scctx->isc_tx_tso_segments_max;
 		map = txq->ift_sds.ifsd_tso_map[pidx];
 		MPASS(buf_tag != NULL);
 		MPASS(max_segs > 0);
 	} else {
 		buf_tag = txq->ift_buf_tag;
 		max_segs = scctx->isc_tx_nsegments;
 		map = txq->ift_sds.ifsd_map[pidx];
 	}
 	/* Pad short frames when the driver asks for it. */
 	if ((sctx->isc_flags & IFLIB_NEED_ETHER_PAD) &&
 	    __predict_false(m_head->m_pkthdr.len < scctx->isc_min_frame_size)) {
 		err = iflib_ether_pad(ctx->ifc_dev, m_headp, scctx->isc_min_frame_size);
 		if (err) {
 			DBG_COUNTER_INC(encap_txd_encap_fail);
 			return err;
 		}
 	}
 	/* Padding may have replaced the chain head. */
 	m_head = *m_headp;
 
 	pkt_info_zero(&pi);
 	pi.ipi_mflags = (m_head->m_flags & (M_VLANTAG|M_BCAST|M_MCAST));
 	pi.ipi_pidx = pidx;
 	pi.ipi_qsidx = txq->ift_id;
 	pi.ipi_len = m_head->m_pkthdr.len;
 	pi.ipi_csum_flags = m_head->m_pkthdr.csum_flags;
 	pi.ipi_vtag = M_HAS_VLANTAG(m_head) ? m_head->m_pkthdr.ether_vtag : 0;
 
 	/* deliberate bitwise OR to make one condition */
 	if (__predict_true((pi.ipi_csum_flags | pi.ipi_vtag))) {
 		if (__predict_false((err = iflib_parse_header(txq, &pi, m_headp)) != 0)) {
 			DBG_COUNTER_INC(encap_txd_encap_fail);
 			return (err);
 		}
 		/* Header parsing may have replaced the chain head. */
 		m_head = *m_headp;
 	}
 
 retry:
 	err = bus_dmamap_load_mbuf_sg(buf_tag, map, m_head, segs, &nsegs,
 	    BUS_DMA_NOWAIT);
 	/* Also entered from below when the driver's encap returns EFBIG. */
 defrag:
 	if (__predict_false(err)) {
 		switch (err) {
 		case EFBIG:
 			/* try collapse once and defrag once */
 			if (remap == 0) {
 				m_head = m_collapse(*m_headp, M_NOWAIT, max_segs);
 				/* try defrag if collapsing fails */
 				if (m_head == NULL)
 					remap++;
 			}
 			if (remap == 1) {
 				txq->ift_mbuf_defrag++;
 				m_head = m_defrag(*m_headp, M_NOWAIT);
 			}
 			/*
 			 * remap should never be >1 unless bus_dmamap_load_mbuf_sg
 			 * failed to map an mbuf that was run through m_defrag
 			 */
 			MPASS(remap <= 1);
 			if (__predict_false(m_head == NULL || remap > 1))
 				goto defrag_failed;
 			remap++;
 			*m_headp = m_head;
 			goto retry;
 			break;
 		case ENOMEM:
 			/* The chain is left in *m_headp on this path. */
 			txq->ift_no_tx_dma_setup++;
 			break;
 		default:
 			txq->ift_no_tx_dma_setup++;
 			m_freem(*m_headp);
 			DBG_COUNTER_INC(tx_frees);
 			*m_headp = NULL;
 			break;
 		}
 		txq->ift_map_failed++;
 		DBG_COUNTER_INC(encap_load_mbuf_fail);
 		DBG_COUNTER_INC(encap_txd_encap_fail);
 		return (err);
 	}
 	/* Record the chain in the software descriptor for this slot. */
 	ifsd_m[pidx] = m_head;
 	/*
 	 * XXX assumes a 1 to 1 relationship between segments and
 	 *        descriptors - this does not hold true on all drivers, e.g.
 	 *        cxgb
 	 */
 	if (__predict_false(nsegs + 2 > TXQ_AVAIL(txq))) {
 		/*
 		 * Not enough descriptors: undo the mapping and schedule the TX
 		 * task.  ifsd_m[pidx] is not cleared on this path.
 		 */
 		txq->ift_no_desc_avail++;
 		bus_dmamap_unload(buf_tag, map);
 		DBG_COUNTER_INC(encap_txq_avail_fail);
 		DBG_COUNTER_INC(encap_txd_encap_fail);
 		if ((txq->ift_task.gt_task.ta_flags & TASK_ENQUEUED) == 0)
 			GROUPTASK_ENQUEUE(&txq->ift_task);
 		return (ENOBUFS);
 	}
 	/*
 	 * On Intel cards we can greatly reduce the number of TX interrupts
 	 * we see by only setting report status on every Nth descriptor.
 	 * However, this also means that the driver will need to keep track
 	 * of the descriptors that RS was set on to check them for the DD bit.
 	 */
 	txq->ift_rs_pending += nsegs + 1;
 	if (txq->ift_rs_pending > TXQ_MAX_RS_DEFERRED(txq) ||
 	     iflib_no_tx_batch || (TXQ_AVAIL(txq) - nsegs) <= MAX_TX_DESC(ctx) + 2) {
 		pi.ipi_flags |= IPI_TX_INTR;
 		txq->ift_rs_pending = 0;
 	}
 
 	pi.ipi_segs = segs;
 	pi.ipi_nsegs = nsegs;
 
 	MPASS(pidx >= 0 && pidx < txq->ift_size);
 #ifdef PKT_DEBUG
 	print_pkt(&pi);
 #endif
 	/* Hand the packet description to the driver. */
 	if ((err = ctx->isc_txd_encap(ctx->ifc_softc, &pi)) == 0) {
 		bus_dmamap_sync(buf_tag, map, BUS_DMASYNC_PREWRITE);
 		DBG_COUNTER_INC(tx_encap);
 		MPASS(pi.ipi_new_pidx < txq->ift_size);
 
 		/* Descriptors consumed, accounting for producer wrap-around. */
 		ndesc = pi.ipi_new_pidx - pi.ipi_pidx;
 		if (pi.ipi_new_pidx < pi.ipi_pidx) {
 			ndesc += txq->ift_size;
 			txq->ift_gen = 1;
 		}
 		/*
 		 * drivers can need as many as
 		 * two sentinels
 		 */
 		MPASS(ndesc <= pi.ipi_nsegs + 2);
 		MPASS(pi.ipi_new_pidx != pidx);
 		MPASS(ndesc > 0);
 		txq->ift_in_use += ndesc;
 		txq->ift_db_pending += ndesc;
 
 		/*
 		 * We update the last software descriptor again here because there may
 		 * be a sentinel and/or there may be more mbufs than segments
 		 */
 		txq->ift_pidx = pi.ipi_new_pidx;
 		txq->ift_npending += pi.ipi_ndescs;
 	} else {
 		/* Driver rejected the packet: take it back out of the slot. */
 		*m_headp = m_head = iflib_remove_mbuf(txq);
 		if (err == EFBIG) {
 			txq->ift_txd_encap_efbig++;
 			if (remap < 2) {
 				/* Force the m_defrag() branch above. */
 				remap = 1;
 				goto defrag;
 			}
 		}
 		goto defrag_failed;
 	}
 	/*
 	 * err can't possibly be non-zero here, so we don't need to test it
 	 * to see if we need to DBG_COUNTER_INC(encap_txd_encap_fail).
 	 */
 	return (err);
 
 defrag_failed:
 	txq->ift_mbuf_defrag_failed++;
 	txq->ift_map_failed++;
 	m_freem(*m_headp);
 	DBG_COUNTER_INC(tx_frees);
 	*m_headp = NULL;
 	DBG_COUNTER_INC(encap_txd_encap_fail);
 	return (ENOMEM);
 }
 
 /*
  * Release the next n software descriptors starting at the queue's consumer
  * index: for every slot that holds an mbuf, sync and unload its DMA map
  * (TSO or regular, chosen by the mbuf's CSUM_TSO flag), free the mbuf and
  * clear the slot.  The consumer index wraps at the ring size, at which
  * point the generation is reset to 0; both are stored back on exit.
  */
 static void
 iflib_tx_desc_free(iflib_txq_t txq, int n)
 {
 	struct mbuf **slots, *done;
 	bus_dma_tag_t tag;
 	bus_dmamap_t dmap;
 	uint32_t idx, ring_size, wrap_mask, gen;
 	bool prefetch_ahead;
 
 	slots = txq->ift_sds.ifsd_m;
 	ring_size = txq->ift_size;
 	wrap_mask = ring_size-1;
 	idx = txq->ift_cidx;
 	gen = txq->ift_gen;
 	prefetch_ahead = (txq->ift_ctx->ifc_flags & IFC_PREFETCH);
 
 	for (; n > 0; n--) {
 		if (prefetch_ahead) {
 			prefetch(slots[(idx + 3) & wrap_mask]);
 			prefetch(slots[(idx + 4) & wrap_mask]);
 		}
 		done = slots[idx];
 		if (done != NULL) {
 			prefetch(&slots[(idx + CACHE_PTR_INCREMENT) & wrap_mask]);
 			/* Pick the tag/map pair the packet was loaded with. */
 			if (done->m_pkthdr.csum_flags & CSUM_TSO) {
 				tag = txq->ift_tso_buf_tag;
 				dmap = txq->ift_sds.ifsd_tso_map[idx];
 			} else {
 				tag = txq->ift_buf_tag;
 				dmap = txq->ift_sds.ifsd_map[idx];
 			}
 			bus_dmamap_sync(tag, dmap, BUS_DMASYNC_POSTWRITE);
 			bus_dmamap_unload(tag, dmap);
 			/* XXX we don't support any drivers that batch packets yet */
 			MPASS(done->m_nextpkt == NULL);
 			m_freem(done);
 			slots[idx] = NULL;
 #if MEMORY_LOGGING
 			txq->ift_dequeued++;
 #endif
 			DBG_COUNTER_INC(tx_frees);
 		}
 		idx++;
 		if (__predict_false(idx == ring_size)) {
 			idx = 0;
 			gen = 0;
 		}
 	}
 	txq->ift_cidx = idx;
 	txq->ift_gen = gen;
 }
 
 /*
  * Refresh the queue's TX credits and, if more than thresh descriptors are
  * reclaimable, free them and update the cleaned/in-use counters.
  * Returns the number of descriptors reclaimed (0 when at or below thresh).
  */
 static __inline int
 iflib_completed_tx_reclaim(iflib_txq_t txq, int thresh)
 {
 	if_ctx_t ctx = txq->ift_ctx;
 	int ndone;
 
 	KASSERT(thresh >= 0, ("invalid threshold to reclaim"));
 	MPASS(thresh /*+ MAX_TX_DESC(txq->ift_ctx) */ < txq->ift_size);
 
 	/*
 	 * Need a rate-limiting check so that this isn't called every time
 	 */
 	iflib_tx_credits_update(ctx, txq);
 	ndone = DESC_RECLAIMABLE(txq);
 
 	if (ndone > thresh /* + MAX_TX_DESC(txq->ift_ctx) */) {
 		iflib_tx_desc_free(txq, ndone);
 		txq->ift_cleaned += ndone;
 		txq->ift_in_use -= ndone;
 		return (ndone);
 	}
 
 	/* Not enough completed work to be worth reclaiming yet. */
 #ifdef INVARIANTS
 	if (iflib_verbose_debug) {
 		printf("%s processed=%ju cleaned=%ju tx_nsegments=%d reclaim=%d thresh=%d\n", __FUNCTION__,
 		       txq->ift_processed, txq->ift_cleaned, txq->ift_ctx->ifc_softc_ctx.isc_tx_nsegments,
 		       ndone, thresh);
 	}
 #endif
 	return (0);
 }
 
 /*
  * Return the address of ring slot (cidx + offset), wrapped to the ring
  * size, after prefetching the mbuf it points to.  When more than one item
  * remains, the following slots and the next three mbufs are prefetched too.
  */
 static struct mbuf **
 _ring_peek_one(struct ifmp_ring *r, int cidx, int offset, int remaining)
 {
 	struct mbuf **slots;
 	int ahead, k, mask, want;
 
 	mask = r->size-1;
 	ahead = (cidx + CACHE_PTR_INCREMENT) & mask;
 	want = (cidx + offset) & mask;
 	slots = __DEVOLATILE(struct mbuf **, &r->items[0]);
 
 	prefetch(slots[want]);
 	if (remaining > 1) {
 		prefetch2cachelines(&slots[ahead]);
 		for (k = 1; k <= 3; k++)
 			prefetch2cachelines(slots[(cidx + offset + k) & mask]);
 	}
 	return (__DEVOLATILE(struct mbuf **, &r->items[want]));
 }
 
 /*
  * Thin wrapper: asks the mp_ring attached to this TX queue (txq->ift_br)
  * to check drainage with the given budget.
  */
 static void
 iflib_txq_check_drain(iflib_txq_t txq, int budget)
 {
 
 	ifmp_ring_check_drainage(txq->ift_br, budget);
 }
 
 /*
  * mp_ring "can drain" callback.  Returns 1 straight away when the queue has
  * more than MAX_TX_DESC + 2 descriptors available; otherwise syncs the
  * descriptor DMA memory and returns what the driver's credits-update
  * callback reports (called with its last argument false).
  */
 static uint32_t
 iflib_txq_can_drain(struct ifmp_ring *r)
 {
 	iflib_txq_t txq = r->cookie;
 	if_ctx_t ctx = txq->ift_ctx;
 
 	if (TXQ_AVAIL(txq) <= MAX_TX_DESC(ctx) + 2) {
 		bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
 		    BUS_DMASYNC_POSTREAD);
 		return (ctx->isc_txd_credits_update(ctx->ifc_softc,
 		    txq->ift_id, false));
 	}
 	return (1);
 }
 
 /*
  * mp_ring drain callback for a TX queue: encapsulate up to TX_BATCH_SIZE of
  * the ring entries between cidx and pidx onto the hardware queue.
  *
  * Returns the number of ring entries consumed (packets sent plus entries
  * skipped).  Returns 0 when the interface is not running, the link is not
  * active, or IFF_DRV_OACTIVE is set.  While IFC_QFLUSH is set every pending
  * entry is freed instead and their count is returned.
  */
 static uint32_t
 iflib_txq_drain(struct ifmp_ring *r, uint32_t cidx, uint32_t pidx)
 {
 	iflib_txq_t txq = r->cookie;
 	if_ctx_t ctx = txq->ift_ctx;
 	if_t ifp = ctx->ifc_ifp;
 	struct mbuf *m, **mp;
 	int avail, bytes_sent, skipped, count, err, i;
 	int mcast_sent, pkt_sent, reclaimed;
 	bool do_prefetch, rang, ring;
 
 	if (__predict_false(!(if_getdrvflags(ifp) & IFF_DRV_RUNNING) ||
 			    !LINK_ACTIVE(ctx))) {
 		DBG_COUNTER_INC(txq_drain_notready);
 		return (0);
 	}
 	/* Reclaim completed descriptors first, then ring the doorbell if due. */
 	reclaimed = iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx));
 	rang = iflib_txd_db_check(txq, reclaimed && txq->ift_db_pending);
 	avail = IDXDIFF(pidx, cidx, r->size);
 
 	if (__predict_false(ctx->ifc_flags & IFC_QFLUSH)) {
 		/*
 		 * The driver is unloading so we need to free all pending packets.
 		 */
 		DBG_COUNTER_INC(txq_drain_flushing);
 		for (i = 0; i < avail; i++) {
 			/* Entries equal to txq are sentinels, not mbufs. */
 			if (__predict_true(r->items[(cidx + i) & (r->size-1)] != (void *)txq))
 				m_freem(r->items[(cidx + i) & (r->size-1)]);
 			r->items[(cidx + i) & (r->size-1)] = NULL;
 		}
 		return (avail);
 	}
 
 	if (__predict_false(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_OACTIVE)) {
 		txq->ift_qstatus = IFLIB_QUEUE_IDLE;
 		CALLOUT_LOCK(txq);
 		callout_stop(&txq->ift_timer);
 		CALLOUT_UNLOCK(txq);
 		DBG_COUNTER_INC(txq_drain_oactive);
 		return (0);
 	}
 
 	/*
 	 * If we've reclaimed any packets this queue cannot be hung.
 	 */
 	if (reclaimed)
 		txq->ift_qstatus = IFLIB_QUEUE_IDLE;
 	skipped = mcast_sent = bytes_sent = pkt_sent = 0;
 	count = MIN(avail, TX_BATCH_SIZE);
 #ifdef INVARIANTS
 	if (iflib_verbose_debug)
 		printf("%s avail=%d ifc_flags=%x txq_avail=%d ", __FUNCTION__,
 		       avail, ctx->ifc_flags, TXQ_AVAIL(txq));
 #endif
 	do_prefetch = (ctx->ifc_flags & IFC_PREFETCH);
 	err = 0;
 	/* Stop early once fewer than MAX_TX_DESC + 2 descriptors remain. */
 	for (i = 0; i < count && TXQ_AVAIL(txq) >= MAX_TX_DESC(ctx) + 2; i++) {
 		int rem = do_prefetch ? count - i : 0;
 
 		mp = _ring_peek_one(r, cidx, i, rem);
 		MPASS(mp != NULL && *mp != NULL);
 
 		/*
 		 * Completion interrupts will use the address of the txq
 		 * as a sentinel to enqueue _something_ in order to acquire
 		 * the lock on the mp_ring (there's no direct lock call).
 		 * We obviously have to check for these sentinel cases
 		 * and skip them.
 		 */
 		if (__predict_false(*mp == (struct mbuf *)txq)) {
 			skipped++;
 			continue;
 		}
 		err = iflib_encap(txq, mp);
 		if (__predict_false(err)) {
 			/* no room - bail out */
 			if (err == ENOBUFS)
 				break;
 			skipped++;
 			/* we can't send this packet - skip it */
 			continue;
 		}
 		pkt_sent++;
 		m = *mp;
 		DBG_COUNTER_INC(tx_sent);
 		bytes_sent += m->m_pkthdr.len;
 		mcast_sent += !!(m->m_flags & M_MCAST);
 
 		if (__predict_false(!(if_getdrvflags(ifp) & IFF_DRV_RUNNING)))
 			break;
 		ETHER_BPF_MTAP(ifp, m);
 		rang = iflib_txd_db_check(txq, false);
 	}
 
 	/* deliberate use of bitwise or to avoid gratuitous short-circuit */
 	ring = rang ? false  : (iflib_min_tx_latency | err);
 	iflib_txd_db_check(txq, ring);
 	/* Account for what was handed to the hardware in this pass. */
 	if_inc_counter(ifp, IFCOUNTER_OBYTES, bytes_sent);
 	if_inc_counter(ifp, IFCOUNTER_OPACKETS, pkt_sent);
 	if (mcast_sent)
 		if_inc_counter(ifp, IFCOUNTER_OMCASTS, mcast_sent);
 #ifdef INVARIANTS
 	if (iflib_verbose_debug)
 		printf("consumed=%d\n", skipped + pkt_sent);
 #endif
 	return (skipped + pkt_sent);
 }
 
 /*
  * "Can drain" callback that unconditionally returns 1; installed by
  * iflib_ifmp_purge() while the ring is being purged.
  */
 static uint32_t
 iflib_txq_drain_always(struct ifmp_ring *r)
 {
 	return (1);
 }
 
 /*
  * Drain callback used while purging: marks the queue idle, stops its timer
  * and frees every pending ring entry between cidx and pidx, leaving alone
  * the entries that are the txq sentinel.  Returns the number of entries
  * walked.
  */
 static uint32_t
 iflib_txq_drain_free(struct ifmp_ring *r, uint32_t cidx, uint32_t pidx)
 {
 	iflib_txq_t txq = r->cookie;
 	struct mbuf **slot;
 	int k, pending;
 
 	txq->ift_qstatus = IFLIB_QUEUE_IDLE;
 	CALLOUT_LOCK(txq);
 	callout_stop(&txq->ift_timer);
 	CALLOUT_UNLOCK(txq);
 
 	pending = IDXDIFF(pidx, cidx, r->size);
 	for (k = 0; k < pending; k++) {
 		slot = _ring_peek_one(r, cidx, k, pending - k);
 		if (__predict_true(*slot != (struct mbuf *)txq)) {
 			m_freem(*slot);
 			DBG_COUNTER_INC(tx_frees);
 		}
 	}
 	MPASS(ifmp_ring_is_stalled(r) == 0);
 	return (pending);
 }
 
 /*
  * Purge a TX queue's mp_ring: temporarily install the "free everything"
  * drain callbacks, run a drainage check over the whole ring, then restore
  * the normal callbacks.
  */
 static void
 iflib_ifmp_purge(iflib_txq_t txq)
 {
 	struct ifmp_ring *ring = txq->ift_br;
 
 	/* Switch to the purge callbacks. */
 	ring->drain = iflib_txq_drain_free;
 	ring->can_drain = iflib_txq_drain_always;
 
 	ifmp_ring_check_drainage(ring, ring->size);
 
 	/* Back to normal operation. */
 	ring->drain = iflib_txq_drain;
 	ring->can_drain = iflib_txq_can_drain;
 }
 
 static void
 _task_fn_tx(void *context)
 {
 	iflib_txq_t txq = context;
 	if_ctx_t ctx = txq->ift_ctx;
 	if_t ifp = ctx->ifc_ifp;
 	int abdicate = ctx->ifc_sysctl_tx_abdicate;
 
 #ifdef IFLIB_DIAGNOSTICS
 	txq->ift_cpu_exec_count[curcpu]++;
 #endif
 	if (!(if_getdrvflags(ifp) & IFF_DRV_RUNNING))
 		return;
 #ifdef DEV_NETMAP
 	if ((if_getcapenable(ifp) & IFCAP_NETMAP) &&
 	    netmap_tx_irq(ifp, txq->ift_id))
 		goto skip_ifmp;
 #endif
 #ifdef ALTQ
 	if (if_altq_is_enabled(ifp))
 		iflib_altq_if_start(ifp);
 #endif
 	if (txq->ift_db_pending)
 		ifmp_ring_enqueue(txq->ift_br, (void **)&txq, 1, TX_BATCH_SIZE, abdicate);
 	else if (!abdicate)
 		ifmp_ring_check_drainage(txq->ift_br, TX_BATCH_SIZE);
 	/*
 	 * When abdicating, we always need to check drainage, not just when we don't enqueue
 	 */
 	if (abdicate)
 		ifmp_ring_check_drainage(txq->ift_br, TX_BATCH_SIZE);
 #ifdef DEV_NETMAP
 skip_ifmp:
 #endif
 	if (ctx->ifc_flags & IFC_LEGACY)
 		IFDI_INTR_ENABLE(ctx);
 	else
 		IFDI_TX_QUEUE_INTR_ENABLE(ctx, txq->ift_id);
 }
 
 /*
  * RX group task.  Does nothing unless the interface is running.  Processes
  * received packets via iflib_rxeof() (or lets netmap claim the interrupt),
  * re-enables the RX interrupt when no more work is reported, and then
  * either re-enqueues itself (more work) or arms the RX watchdog callout
  * (IFLIB_RXEOF_EMPTY reported).
  */
 static void
 _task_fn_rx(void *context)
 {
 	iflib_rxq_t rxq = context;
 	if_ctx_t ctx = rxq->ifr_ctx;
 	uint8_t more;
 	uint16_t budget;
 #ifdef DEV_NETMAP
 	u_int work = 0;
 	int nmirq;
 #endif
 
 #ifdef IFLIB_DIAGNOSTICS
 	rxq->ifr_cpu_exec_count[curcpu]++;
 #endif
 	DBG_COUNTER_INC(task_fn_rxs);
 	if (__predict_false(!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)))
 		return;
 #ifdef DEV_NETMAP
 	nmirq = netmap_rx_irq(ctx->ifc_ifp, rxq->ifr_id, &work);
 	if (nmirq != NM_IRQ_PASS) {
 		/* netmap handled it; only its reschedule request matters. */
 		more = (nmirq == NM_IRQ_RESCHED) ? IFLIB_RXEOF_MORE : 0;
 	} else
 #endif
 	{
 		budget = ctx->ifc_sysctl_rx_budget;
 		if (budget == 0)
 			budget = 16;	/* XXX */
 		more = iflib_rxeof(rxq, budget);
 	}
 
 	if (!(more & IFLIB_RXEOF_MORE)) {
 		if (ctx->ifc_flags & IFC_LEGACY)
 			IFDI_INTR_ENABLE(ctx);
 		else
 			IFDI_RX_QUEUE_INTR_ENABLE(ctx, rxq->ifr_id);
 		DBG_COUNTER_INC(rx_intr_enables);
 	}
 	/* The interface may have been stopped while we were processing. */
 	if (__predict_false(!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)))
 		return;
 
 	if (more & IFLIB_RXEOF_MORE) {
 		GROUPTASK_ENQUEUE(&rxq->ifr_task);
 	} else if (more & IFLIB_RXEOF_EMPTY) {
 		callout_reset_curcpu(&rxq->ifr_watchdog, 1, &_task_fn_rx_watchdog, rxq);
 	}
 }
 
 static void
 _task_fn_admin(void *context)
 {
 	if_ctx_t ctx = context;
 	if_softc_ctx_t sctx = &ctx->ifc_softc_ctx;
 	iflib_txq_t txq;
 	int i;
 	bool oactive, running, do_reset, do_watchdog, in_detach;
 
 	STATE_LOCK(ctx);
 	running = (if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING);
 	oactive = (if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_OACTIVE);
 	do_reset = (ctx->ifc_flags & IFC_DO_RESET);
 	do_watchdog = (ctx->ifc_flags & IFC_DO_WATCHDOG);
 	in_detach = (ctx->ifc_flags & IFC_IN_DETACH);
 	ctx->ifc_flags &= ~(IFC_DO_RESET|IFC_DO_WATCHDOG);
 	STATE_UNLOCK(ctx);
 
 	if ((!running && !oactive) && !(ctx->ifc_sctx->isc_flags & IFLIB_ADMIN_ALWAYS_RUN))
 		return;
 	if (in_detach)
 		return;
 
 	CTX_LOCK(ctx);
 	for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++) {
 		CALLOUT_LOCK(txq);
 		callout_stop(&txq->ift_timer);
 		CALLOUT_UNLOCK(txq);
 	}
 	if (ctx->ifc_sctx->isc_flags & IFLIB_HAS_ADMINCQ)
 		IFDI_ADMIN_COMPLETION_HANDLE(ctx);
 	if (do_watchdog) {
 		ctx->ifc_watchdog_events++;
 		IFDI_WATCHDOG_RESET(ctx);
 	}
 	IFDI_UPDATE_ADMIN_STATUS(ctx);
 	for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++) {
 		callout_reset_on(&txq->ift_timer, iflib_timer_default, iflib_timer, txq,
 		    txq->ift_timer.c_cpu);
 	}
 	IFDI_LINK_INTR_ENABLE(ctx);
 	if (do_reset)
 		iflib_if_init_locked(ctx);
 	CTX_UNLOCK(ctx);
 
 	if (LINK_ACTIVE(ctx) == 0)
 		return;
 	for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++)
 		iflib_txq_check_drain(txq, IFLIB_RESTART_BUDGET);
 }
 
 /*
  * IOV task: calls the driver's VFLR handler under the context lock.
  * Skipped when the interface is not running, unless the driver set
  * IFLIB_ADMIN_ALWAYS_RUN.
  */
 static void
 _task_fn_iov(void *context)
 {
 	if_ctx_t ctx = context;
 
 	if ((if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING) == 0 &&
 	    (ctx->ifc_sctx->isc_flags & IFLIB_ADMIN_ALWAYS_RUN) == 0)
 		return;
 
 	CTX_LOCK(ctx);
 	IFDI_VFLR_HANDLE(ctx);
 	CTX_UNLOCK(ctx);
 }
 
 static int
 iflib_sysctl_int_delay(SYSCTL_HANDLER_ARGS)
 {
 	int err;
 	if_int_delay_info_t info;
 	if_ctx_t ctx;
 
 	info = (if_int_delay_info_t)arg1;
 	ctx = info->iidi_ctx;
 	info->iidi_req = req;
 	info->iidi_oidp = oidp;
 	CTX_LOCK(ctx);
 	err = IFDI_SYSCTL_INT_DELAY(ctx, info);
 	CTX_UNLOCK(ctx);
 	return (err);
 }
 
 /*********************************************************************
  *
  *  IFNET FUNCTIONS
  *
  **********************************************************************/
 
 /*
  * Re-initialize the interface: stop it, then run the locked init path.
  * The visible callers invoke this with the context lock held.
  */
 static void
 iflib_if_init_locked(if_ctx_t ctx)
 {
 	iflib_stop(ctx);
 	iflib_init_locked(ctx);
 }
 
 /*
  * Init entry point taking the iflib context as an opaque argument:
  * acquires the context lock around iflib_if_init_locked().
  */
 static void
 iflib_if_init(void *arg)
 {
 	if_ctx_t ctx = arg;
 
 	CTX_LOCK(ctx);
 	iflib_if_init_locked(ctx);
 	CTX_UNLOCK(ctx);
 }
 
 /*
  * Transmit entry point: pick a TX queue for mbuf m and enqueue it on that
  * queue's mp_ring.
  *
  * Queue selection, in priority order: the driver's isc_txq_select_v2
  * callback (fed by a partial header parse), the older isc_txq_select
  * callback, the QIDX() mapping when there are multiple queues, the mbuf has
  * a hash type and ALTQ is not enabled, and otherwise queue 0.
  *
  * Returns ENETDOWN (after freeing m) when the interface is not running or
  * the link is not active, the error from iflib_parse_header_partial(), or
  * the result of ifmp_ring_enqueue() (m is freed here when that fails).
  */
 static int
 iflib_if_transmit(if_t ifp, struct mbuf *m)
 {
 	if_ctx_t ctx = if_getsoftc(ifp);
 	iflib_txq_t txq;
 	int err, qidx;
 	int abdicate;
 
 	if (__predict_false((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0 || !LINK_ACTIVE(ctx))) {
 		DBG_COUNTER_INC(tx_frees);
 		m_freem(m);
 		return (ENETDOWN);
 	}
 
 	MPASS(m->m_nextpkt == NULL);
 	/* ALTQ-enabled interfaces always use queue 0. */
 	qidx = 0;
 	/* Use driver-supplied queue selection method if it exists */
 	if (ctx->isc_txq_select_v2) {
 		struct if_pkt_info pi;
 		uint64_t early_pullups = 0;
 		pkt_info_zero(&pi);
 
 		/*
 		 * NOTE(review): m is not freed here on error; this relies on
 		 * iflib_parse_header_partial() having disposed of the chain on
 		 * every failure path - confirm.
 		 */
 		err = iflib_parse_header_partial(&pi, &m, &early_pullups);
 		if (__predict_false(err != 0)) {
 			/* Assign pullups for bad pkts to default queue */
 			ctx->ifc_txqs[0].ift_pullups += early_pullups;
 			DBG_COUNTER_INC(encap_txd_encap_fail);
 			return (err);
 		}
 		/* Let driver make queueing decision */
 		qidx = ctx->isc_txq_select_v2(ctx->ifc_softc, m, &pi);
 		ctx->ifc_txqs[qidx].ift_pullups += early_pullups;
 	}
 	/* Backwards compatibility w/ simpler queue select */
 	else if (ctx->isc_txq_select)
 		qidx = ctx->isc_txq_select(ctx->ifc_softc, m);
 	/* If not, use iflib's standard method */
 	else if ((NTXQSETS(ctx) > 1) && M_HASHTYPE_GET(m) && !if_altq_is_enabled(ifp))
 		qidx = QIDX(ctx, m);
 
 	/* Set TX queue */
 	txq = &ctx->ifc_txqs[qidx];
 
 	/*
 	 * NOTE(review): the two conditional sections below reference names
 	 * (next, count, mp, marr, i) that are not declared in this function,
 	 * so they would not compile if DRIVER_BACKPRESSURE or notyet were
 	 * defined.
 	 */
 #ifdef DRIVER_BACKPRESSURE
 	if (txq->ift_closed) {
 		while (m != NULL) {
 			next = m->m_nextpkt;
 			m->m_nextpkt = NULL;
 			m_freem(m);
 			DBG_COUNTER_INC(tx_frees);
 			m = next;
 		}
 		return (ENOBUFS);
 	}
 #endif
 #ifdef notyet
 	qidx = count = 0;
 	mp = marr;
 	next = m;
 	do {
 		count++;
 		next = next->m_nextpkt;
 	} while (next != NULL);
 
 	if (count > nitems(marr))
 		if ((mp = malloc(count*sizeof(struct mbuf *), M_IFLIB, M_NOWAIT)) == NULL) {
 			/* XXX check nextpkt */
 			m_freem(m);
 			/* XXX simplify for now */
 			DBG_COUNTER_INC(tx_frees);
 			return (ENOBUFS);
 		}
 	for (next = m, i = 0; next != NULL; i++) {
 		mp[i] = next;
 		next = next->m_nextpkt;
 		mp[i]->m_nextpkt = NULL;
 	}
 #endif
 	DBG_COUNTER_INC(tx_seen);
 	abdicate = ctx->ifc_sysctl_tx_abdicate;
 
 	err = ifmp_ring_enqueue(txq->ift_br, (void **)&m, 1, TX_BATCH_SIZE, abdicate);
 
 	/* When abdicating, the TX task is always scheduled after an enqueue. */
 	if (abdicate)
 		GROUPTASK_ENQUEUE(&txq->ift_task);
  	if (err) {
 		/* Enqueue failed: schedule the task (if not done above) and drop m. */
 		if (!abdicate)
 			GROUPTASK_ENQUEUE(&txq->ift_task);
 		/* support forthcoming later */
 #ifdef DRIVER_BACKPRESSURE
 		txq->ift_closed = TRUE;
 #endif
 		ifmp_ring_check_drainage(txq->ift_br, TX_BATCH_SIZE);
 		m_freem(m);
 		DBG_COUNTER_INC(tx_frees);
 	}
 
 	return (err);
 }
 
 #ifdef ALTQ
 /*
  * The overall approach to integrating iflib with ALTQ is to continue to use
  * the iflib mp_ring machinery between the ALTQ queue(s) and the hardware
  * ring.  Technically, when using ALTQ, queueing to an intermediate mp_ring
  * is redundant/unnecessary, but doing so minimizes the amount of
  * ALTQ-specific code required in iflib.  It is assumed that the overhead of
  * redundantly queueing to an intermediate mp_ring is swamped by the
  * performance limitations inherent in using ALTQ.
  *
  * When ALTQ support is compiled in, all iflib drivers will use a transmit
  * routine, iflib_altq_if_transmit(), that checks if ALTQ is enabled for the
  * given interface.  If ALTQ is enabled for an interface, then all
  * transmitted packets for that interface will be submitted to the ALTQ
  * subsystem via IFQ_ENQUEUE().  We don't use the legacy if_transmit()
  * implementation because it uses IFQ_HANDOFF(), which will duplicatively
  * update stats that the iflib machinery handles, and which is sensitive to
  * the disused IFF_DRV_OACTIVE flag.  Additionally, iflib_altq_if_start()
  * will be installed as the start routine for use by ALTQ facilities that
  * need to trigger queue drains on a scheduled basis.
  *
  */
 /*
  * ALTQ start routine: with the send queue locked, dequeue packets one at a
  * time and pass each to iflib_if_transmit() until the queue is empty.
  * The transmit result is not examined.
  */
 static void
 iflib_altq_if_start(if_t ifp)
 {
 	struct ifaltq *ifq = &ifp->if_snd; /* XXX - DRVAPI */
 	struct mbuf *pkt;
 
 	IFQ_LOCK(ifq);
 	for (;;) {
 		IFQ_DEQUEUE_NOLOCK(ifq, pkt);
 		if (pkt == NULL)
 			break;
 		iflib_if_transmit(ifp, pkt);
 	}
 	IFQ_UNLOCK(ifq);
 }
 
 /*
  * Transmit routine used when ALTQ support is compiled in.  Without ALTQ
  * enabled on the interface this is just iflib_if_transmit(); with it, the
  * packet is enqueued on the ALTQ send queue and, if that succeeded, the
  * queue is drained via iflib_altq_if_start().  Returns the enqueue or
  * transmit result.
  */
 static int
 iflib_altq_if_transmit(if_t ifp, struct mbuf *m)
 {
 	int err;
 
 	if (!if_altq_is_enabled(ifp))
 		return (iflib_if_transmit(ifp, m));
 
 	IFQ_ENQUEUE(&ifp->if_snd, m, err); /* XXX - DRVAPI */
 	if (err == 0)
 		iflib_altq_if_start(ifp);
 
 	return (err);
 }
 #endif /* ALTQ */
 
 /*
  * Flush all TX queues: set IFC_QFLUSH, run drainage checks on each queue's
  * mp_ring until it is idle or stalled, clear the flag again, and finish
  * with the generic if_qflush().
  */
 static void
 iflib_if_qflush(if_t ifp)
 {
 	if_ctx_t ctx = if_getsoftc(ifp);
 	iflib_txq_t q;
 	int qi;
 
 	STATE_LOCK(ctx);
 	ctx->ifc_flags |= IFC_QFLUSH;
 	STATE_UNLOCK(ctx);
 
 	q = ctx->ifc_txqs;
 	for (qi = 0; qi < NTXQSETS(ctx); qi++, q++) {
 		while (!ifmp_ring_is_idle(q->ift_br) &&
 		    !ifmp_ring_is_stalled(q->ift_br))
 			iflib_txq_check_drain(q, 0);
 	}
 
 	STATE_LOCK(ctx);
 	ctx->ifc_flags &= ~IFC_QFLUSH;
 	STATE_UNLOCK(ctx);
 
 	/*
 	 * When ALTQ is enabled, this will also take care of purging the
 	 * ALTQ queue(s).
 	 */
 	if_qflush(ifp);
 }
 
 /*
  * Mask of interface capability (IFCAP_*) bits handled as a group.
  * NOTE(review): presumably the set of capabilities the ioctl handler lets
  * userland toggle; confirm against its use in iflib_if_ioctl().
  */
 #define IFCAP_FLAGS (IFCAP_HWCSUM_IPV6 | IFCAP_HWCSUM | IFCAP_LRO | \
 		     IFCAP_TSO | IFCAP_VLAN_HWTAGGING | IFCAP_HWSTATS | \
 		     IFCAP_VLAN_MTU | IFCAP_VLAN_HWFILTER | \
 		     IFCAP_VLAN_HWTSO | IFCAP_VLAN_HWCSUM | IFCAP_MEXTPG)
 
 /*
  * Interface ioctl handler.
  *
  * Dispatches on 'command': address, MTU, interface-flag, multicast, media,
  * I2C, capability and driver-private requests are handled here; every
  * other command is passed to ether_ioctl().  'data' is interpreted as a
  * struct ifreq (or as a struct ifaddr for SIOCSIFADDR).
  *
  * Returns 0 or the error produced by the case that ran.  A case that sets
  * 'reinit' causes iflib_if_init() to be called after the switch, i.e.
  * after the case has released the locks it took.
  */
 static int
 iflib_if_ioctl(if_t ifp, u_long command, caddr_t data)
 {
 	if_ctx_t ctx = if_getsoftc(ifp);
 	struct ifreq	*ifr = (struct ifreq *)data;
 #if defined(INET) || defined(INET6)
 	struct ifaddr	*ifa = (struct ifaddr *)data;
 #endif
 	bool		avoid_reset = false;
 	int		err = 0, reinit = 0, bits;
 
 	switch (command) {
 	case SIOCSIFADDR:
 		/* Only IPv4/IPv6 address assignments can skip the reset. */
 #ifdef INET
 		if (ifa->ifa_addr->sa_family == AF_INET)
 			avoid_reset = true;
 #endif
 #ifdef INET6
 		if (ifa->ifa_addr->sa_family == AF_INET6)
 			avoid_reset = true;
 #endif
 		/*
 		** Calling init results in link renegotiation,
 		** so we avoid doing it when possible.
 		*/
 		if (avoid_reset) {
 			if_setflagbits(ifp, IFF_UP,0);
 			/* Not running yet: request an init after the switch. */
 			if (!(if_getdrvflags(ifp) & IFF_DRV_RUNNING))
 				reinit = 1;
 #ifdef INET
 			if (!(if_getflags(ifp) & IFF_NOARP))
 				arp_ifinit(ifp, ifa);
 #endif
 		} else
 			err = ether_ioctl(ifp, command, data);
 		break;
 	case SIOCSIFMTU:
 		CTX_LOCK(ctx);
 		/* Nothing to do when the requested MTU is already set. */
 		if (ifr->ifr_mtu == if_getmtu(ifp)) {
 			CTX_UNLOCK(ctx);
 			break;
 		}
 		/* Remember the driver flags so they can be restored below. */
 		bits = if_getdrvflags(ifp);
 		/* stop the driver and free any clusters before proceeding */
 		iflib_stop(ctx);
 
 		if ((err = IFDI_MTU_SET(ctx, ifr->ifr_mtu)) == 0) {
 			STATE_LOCK(ctx);
 			/*
 			 * IFC_MULTISEG tracks whether the new MTU exceeds
 			 * ifc_max_fl_buf_size.
 			 */
 			if (ifr->ifr_mtu > ctx->ifc_max_fl_buf_size)
 				ctx->ifc_flags |= IFC_MULTISEG;
 			else
 				ctx->ifc_flags &= ~IFC_MULTISEG;
 			STATE_UNLOCK(ctx);
 			err = if_setmtu(ifp, ifr->ifr_mtu);
 		}
 		/* Re-init runs whether or not the MTU change succeeded. */
 		iflib_init_locked(ctx);
 		STATE_LOCK(ctx);
 		if_setdrvflags(ifp, bits);
 		STATE_UNLOCK(ctx);
 		CTX_UNLOCK(ctx);
 		break;
 	case SIOCSIFFLAGS:
 		CTX_LOCK(ctx);
 		if (if_getflags(ifp) & IFF_UP) {
 			if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
 				/*
 				 * Up and running: only act when PROMISC or
 				 * ALLMULTI differs from the cached flags.
 				 * The context lock is dropped around the
 				 * driver call.
 				 */
 				if ((if_getflags(ifp) ^ ctx->ifc_if_flags) &
 				    (IFF_PROMISC | IFF_ALLMULTI)) {
 					CTX_UNLOCK(ctx);
 					err = IFDI_PROMISC_SET(ctx, if_getflags(ifp));
 					CTX_LOCK(ctx);
 				}
 			} else
 				reinit = 1;
 		} else if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
 			/* Marked down while running: stop the interface. */
 			iflib_stop(ctx);
 		}
 		/* Cache the flags for the next comparison. */
 		ctx->ifc_if_flags = if_getflags(ifp);
 		CTX_UNLOCK(ctx);
 		break;
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		/* Multicast updates are only pushed to a running interface. */
 		if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
 			CTX_LOCK(ctx);
 			IFDI_INTR_DISABLE(ctx);
 			IFDI_MULTI_SET(ctx);
 			IFDI_INTR_ENABLE(ctx);
 			CTX_UNLOCK(ctx);
 		}
 		break;
 	case SIOCSIFMEDIA:
 		CTX_LOCK(ctx);
 		IFDI_MEDIA_SET(ctx);
 		CTX_UNLOCK(ctx);
 		/* FALLTHROUGH */
 	case SIOCGIFMEDIA:
 	case SIOCGIFXMEDIA:
 		err = ifmedia_ioctl(ifp, ifr, ctx->ifc_mediap, command);
 		break;
 	case SIOCGI2C:
 	{
 		struct ifi2creq i2c;
 
 		/* Copy the request in, validate it, then ask the driver. */
 		err = copyin(ifr_data_get_ptr(ifr), &i2c, sizeof(i2c));
 		if (err != 0)
 			break;
 		/* Only device addresses 0xA0 and 0xA2 are accepted. */
 		if (i2c.dev_addr != 0xA0 && i2c.dev_addr != 0xA2) {
 			err = EINVAL;
 			break;
 		}
 		/* The requested length must fit in the request's data buffer. */
 		if (i2c.len > sizeof(i2c.data)) {
 			err = EINVAL;
 			break;
 		}
 
 		if ((err = IFDI_I2C_REQ(ctx, &i2c)) == 0)
 			err = copyout(&i2c, ifr_data_get_ptr(ifr),
 			    sizeof(i2c));
 		break;
 	}
 	case SIOCSIFCAP:
 	{
 		int mask, setmask, oldmask;
 
 		/*
 		 * mask: requested bits that differ from the enabled set,
 		 * limited to the driver's capabilities plus IFCAP_MEXTPG.
 		 * setmask: the subset of mask that will be toggled.
 		 */
 		oldmask = if_getcapenable(ifp);
 		mask = ifr->ifr_reqcap ^ oldmask;
 		mask &= ctx->ifc_softc_ctx.isc_capabilities | IFCAP_MEXTPG;
 		setmask = 0;
 #ifdef TCP_OFFLOAD
 		setmask |= mask & (IFCAP_TOE4|IFCAP_TOE6);
 #endif
 		setmask |= (mask & IFCAP_FLAGS);
 		setmask |= (mask & IFCAP_WOL);
 
 		/*
 		 * If any RX csum has changed, change all the ones that
 		 * are supported by the driver.
 		 */
 		if (setmask & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)) {
 			setmask |= ctx->ifc_softc_ctx.isc_capabilities &
 			    (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6);
 		}
 
 		/*
 		 * want to ensure that traffic has stopped before we change any of the flags
 		 */
 		if (setmask) {
 			CTX_LOCK(ctx);
 			bits = if_getdrvflags(ifp);
 			/* A WOL-only change does not stop/restart the interface. */
 			if (bits & IFF_DRV_RUNNING && setmask & ~IFCAP_WOL)
 				iflib_stop(ctx);
 			STATE_LOCK(ctx);
 			if_togglecapenable(ifp, setmask);
 			ctx->ifc_softc_ctx.isc_capenable ^= setmask;
 			STATE_UNLOCK(ctx);
 			if (bits & IFF_DRV_RUNNING && setmask & ~IFCAP_WOL)
 				iflib_init_locked(ctx);
 			/* Restore the driver flags saved before the change. */
 			STATE_LOCK(ctx);
 			if_setdrvflags(ifp, bits);
 			STATE_UNLOCK(ctx);
 			CTX_UNLOCK(ctx);
 		}
 		if_vlancap(ifp);
 		break;
 	}
 	case SIOCGPRIVATE_0:
 	case SIOCSDRVSPEC:
 	case SIOCGDRVSPEC:
 		/* Driver-private requests are forwarded under the context lock. */
 		CTX_LOCK(ctx);
 		err = IFDI_PRIV_IOCTL(ctx, command, data);
 		CTX_UNLOCK(ctx);
 		break;
 	default:
 		err = ether_ioctl(ifp, command, data);
 		break;
 	}
 	/* Deferred init requested by SIOCSIFADDR or SIOCSIFFLAGS above. */
 	if (reinit)
 		iflib_if_init(ctx);
 	return (err);
 }
 
 /*
  * Counter callback: look up the iflib context behind 'ifp' and return
  * whatever the driver's IFDI_GET_COUNTER method reports for 'cnt'.
  */
 static uint64_t
 iflib_if_get_counter(if_t ifp, ift_counter cnt)
 {
 	if_ctx_t ctx;
 
 	ctx = if_getsoftc(ifp);
 	return (IFDI_GET_COUNTER(ctx, cnt));
 }
 
 /*********************************************************************
  *
  *  OTHER FUNCTIONS EXPORTED TO THE STACK
  *
  **********************************************************************/
 
 /*
  * VLAN registration handler.  Ignores calls whose 'arg' is not this
  * interface's context, VLAN tags outside 1..4095, and contexts that are
  * being detached.  Otherwise forwards the tag to the driver under the
  * context lock, stopping and re-initializing the interface around the
  * call when the driver says a VLAN config change needs a restart.
  */
 static void
 iflib_vlan_register(void *arg, if_t ifp, uint16_t vtag)
 {
 	if_ctx_t ctx;
 
 	ctx = if_getsoftc(ifp);
 	if ((void *)ctx != arg || vtag == 0 || vtag > 4095)
 		return;
 	if (iflib_in_detach(ctx))
 		return;
 
 	CTX_LOCK(ctx);
 	/* Driver may need all untagged packets to be flushed */
 	if (IFDI_NEEDS_RESTART(ctx, IFLIB_RESTART_VLAN_CONFIG)) {
 		iflib_stop(ctx);
 	}
 	IFDI_VLAN_REGISTER(ctx, vtag);
 	/* Re-init to load the changes, if required */
 	if (IFDI_NEEDS_RESTART(ctx, IFLIB_RESTART_VLAN_CONFIG)) {
 		iflib_init_locked(ctx);
 	}
 	CTX_UNLOCK(ctx);
 }
 
 /*
  * VLAN unregistration handler.  Ignores calls whose 'arg' is not this
  * interface's context and VLAN tags outside 1..4095.  Otherwise forwards
  * the tag to the driver under the context lock, stopping and
  * re-initializing the interface around the call when the driver says a
  * VLAN config change needs a restart.
  */
 static void
 iflib_vlan_unregister(void *arg, if_t ifp, uint16_t vtag)
 {
 	if_ctx_t ctx;
 
 	ctx = if_getsoftc(ifp);
 	if ((void *)ctx != arg || vtag == 0 || vtag > 4095)
 		return;
 
 	CTX_LOCK(ctx);
 	/* Driver may need all tagged packets to be flushed */
 	if (IFDI_NEEDS_RESTART(ctx, IFLIB_RESTART_VLAN_CONFIG)) {
 		iflib_stop(ctx);
 	}
 	IFDI_VLAN_UNREGISTER(ctx, vtag);
 	/* Re-init to load the changes, if required */
 	if (IFDI_NEEDS_RESTART(ctx, IFLIB_RESTART_VLAN_CONFIG)) {
 		iflib_init_locked(ctx);
 	}
 	CTX_UNLOCK(ctx);
 }
 
 /*
  * LED callback: 'arg' is the iflib context; the on/off request is passed
  * to the driver's IFDI_LED_FUNC method with the context lock held.
  */
 static void
 iflib_led_func(void *arg, int onoff)
 {
 	if_ctx_t ctx;
 
 	ctx = arg;
 
 	CTX_LOCK(ctx);
 	IFDI_LED_FUNC(ctx, onoff);
 	CTX_UNLOCK(ctx);
 }
 
 /*********************************************************************
  *
  *  BUS FUNCTION DEFINITIONS
  *
  **********************************************************************/
 
 /*
  * Generic probe routine.  Obtains the driver's shared context through
  * DEVICE_REGISTER(), reads the PCI identification of 'dev' (optionally
  * adjusted by the driver's isc_parse_devinfo hook) and walks the driver's
  * vendor-info table, which is terminated by an entry with vendor ID 0.
  * In a table entry, a subvendor, subdevice or revision of 0 matches any
  * value.
  *
  * Returns BUS_PROBE_DEFAULT (after setting the device description) on a
  * match, ENOTSUP when no valid shared context is available, and ENXIO
  * when no table entry matches.
  */
 int
 iflib_device_probe(device_t dev)
 {
 	const pci_vendor_info_t *ent;
 	if_shared_ctx_t sctx;
 	uint16_t vendor, device, subvendor, subdevice, revid;
 
 	sctx = DEVICE_REGISTER(dev);
 	if (sctx == NULL || sctx->isc_magic != IFLIB_MAGIC)
 		return (ENOTSUP);
 
 	vendor = pci_get_vendor(dev);
 	device = pci_get_device(dev);
 	subvendor = pci_get_subvendor(dev);
 	subdevice = pci_get_subdevice(dev);
 	revid = pci_get_revid(dev);
 	if (sctx->isc_parse_devinfo != NULL)
 		sctx->isc_parse_devinfo(&device, &subvendor, &subdevice, &revid);
 
 	for (ent = sctx->isc_vendor_info; ent->pvi_vendor_id != 0; ent++) {
 		if (ent->pvi_vendor_id != vendor)
 			continue;
 		if (ent->pvi_device_id != device)
 			continue;
 		if (ent->pvi_subvendor_id != 0 &&
 		    ent->pvi_subvendor_id != subvendor)
 			continue;
 		if (ent->pvi_subdevice_id != 0 &&
 		    ent->pvi_subdevice_id != subdevice)
 			continue;
 		if (ent->pvi_rev_id != 0 && ent->pvi_rev_id != revid)
 			continue;
 
 		device_set_desc_copy(dev, ent->pvi_name);
 		/*
 		 * This needs to be changed to zero if the bus probing code
 		 * ever stops re-probing on best match, because the sctx
 		 * may have its values overwritten by register calls in
 		 * subsequent probes.
 		 */
 		return (BUS_PROBE_DEFAULT);
 	}
 	return (ENXIO);
 }
 
 /*
  * Same as iflib_device_probe(), except that a BUS_PROBE_DEFAULT result is
  * reported as BUS_PROBE_VENDOR.  Any other result is returned unchanged.
  */
 int
 iflib_device_probe_vendor(device_t dev)
 {
 	int rc;
 
 	rc = iflib_device_probe(dev);
 	return (rc == BUS_PROBE_DEFAULT ? BUS_PROBE_VENDOR : rc);
 }
 
 /*
  * (Re)compute the queue-set counts and per-queue descriptor counts in the
  * softc context.
  *
  * Non-zero sysctl overrides replace the queue-set counts.  Each descriptor
  * count is taken from its sysctl override when non-zero, otherwise from
  * the driver default.  The count is then raised to the driver minimum,
  * lowered to the driver maximum and, if still not a power of two, reset
  * to the driver default.  Each adjustment is logged.
  */
 static void
 iflib_reset_qvalues(if_ctx_t ctx)
 {
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	device_t dev = ctx->ifc_dev;
 	int q;
 
 	if (ctx->ifc_sysctl_ntxqs != 0)
 		scctx->isc_ntxqsets = ctx->ifc_sysctl_ntxqs;
 	if (ctx->ifc_sysctl_nrxqs != 0)
 		scctx->isc_nrxqsets = ctx->ifc_sysctl_nrxqs;
 
 	/* RX descriptor counts: pick the starting value, then validate it. */
 	for (q = 0; q < sctx->isc_nrxqs; q++) {
 		scctx->isc_nrxd[q] = (ctx->ifc_sysctl_nrxds[q] != 0) ?
 		    ctx->ifc_sysctl_nrxds[q] : sctx->isc_nrxd_default[q];
 
 		if (scctx->isc_nrxd[q] < sctx->isc_nrxd_min[q]) {
 			device_printf(dev, "nrxd%d: %d less than nrxd_min %d - resetting to min\n",
 			    q, scctx->isc_nrxd[q], sctx->isc_nrxd_min[q]);
 			scctx->isc_nrxd[q] = sctx->isc_nrxd_min[q];
 		}
 		if (scctx->isc_nrxd[q] > sctx->isc_nrxd_max[q]) {
 			device_printf(dev, "nrxd%d: %d greater than nrxd_max %d - resetting to max\n",
 			    q, scctx->isc_nrxd[q], sctx->isc_nrxd_max[q]);
 			scctx->isc_nrxd[q] = sctx->isc_nrxd_max[q];
 		}
 		if (!powerof2(scctx->isc_nrxd[q])) {
 			device_printf(dev, "nrxd%d: %d is not a power of 2 - using default value of %d\n",
 			    q, scctx->isc_nrxd[q], sctx->isc_nrxd_default[q]);
 			scctx->isc_nrxd[q] = sctx->isc_nrxd_default[q];
 		}
 	}
 
 	/* TX descriptor counts: same procedure as for RX. */
 	for (q = 0; q < sctx->isc_ntxqs; q++) {
 		scctx->isc_ntxd[q] = (ctx->ifc_sysctl_ntxds[q] != 0) ?
 		    ctx->ifc_sysctl_ntxds[q] : sctx->isc_ntxd_default[q];
 
 		if (scctx->isc_ntxd[q] < sctx->isc_ntxd_min[q]) {
 			device_printf(dev, "ntxd%d: %d less than ntxd_min %d - resetting to min\n",
 			    q, scctx->isc_ntxd[q], sctx->isc_ntxd_min[q]);
 			scctx->isc_ntxd[q] = sctx->isc_ntxd_min[q];
 		}
 		if (scctx->isc_ntxd[q] > sctx->isc_ntxd_max[q]) {
 			device_printf(dev, "ntxd%d: %d greater than ntxd_max %d - resetting to max\n",
 			    q, scctx->isc_ntxd[q], sctx->isc_ntxd_max[q]);
 			scctx->isc_ntxd[q] = sctx->isc_ntxd_max[q];
 		}
 		if (!powerof2(scctx->isc_ntxd[q])) {
 			device_printf(dev, "ntxd%d: %d is not a power of 2 - using default value of %d\n",
 			    q, scctx->isc_ntxd[q], sctx->isc_ntxd_default[q]);
 			scctx->isc_ntxd[q] = sctx->isc_ntxd_default[q];
 		}
 	}
 }
 
 /*
  * Register an inbound Ethernet pfil head named after the interface and
  * store the resulting head pointer in every RX queue of the context.
  */
 static void
 iflib_add_pfil(if_ctx_t ctx)
 {
 	struct pfil_head_args args;
 	struct pfil_head *head;
 	iflib_rxq_t rxq;
 	int qidx;
 
 	args.pa_version = PFIL_VERSION;
 	args.pa_flags = PFIL_IN;
 	args.pa_type = PFIL_TYPE_ETHERNET;
 	args.pa_headname = if_name(ctx->ifc_ifp);
 	head = pfil_head_register(&args);
 
 	rxq = ctx->ifc_rxqs;
 	for (qidx = 0; qidx < NRXQSETS(ctx); qidx++) {
 		rxq->pfil = head;
 		rxq++;
 	}
 }
 
 /*
  * Undo iflib_add_pfil(): take the pfil head pointer from the first RX
  * queue, clear the pointer in every RX queue, then unregister the head.
  */
 static void
 iflib_rem_pfil(if_ctx_t ctx)
 {
 	struct pfil_head *head;
 	iflib_rxq_t rxq;
 	int qidx;
 
 	rxq = ctx->ifc_rxqs;
 	head = rxq->pfil;
 
 	qidx = 0;
 	while (qidx < NRXQSETS(ctx)) {
 		rxq->pfil = NULL;
 		qidx++;
 		rxq++;
 	}
 
 	pfil_head_unregister(head);
 }
 
 
 /*
  * Step forward through the members of ctx->ifc_cpus, starting at cpuid,
  * and return the CPU ID reached after n steps.  Stepping wraps from the
  * highest member back to the lowest, and n is first reduced modulo the
  * number of members.
  */
 static unsigned int
 cpuid_advance(if_ctx_t ctx, unsigned int cpuid, unsigned int n)
 {
 	unsigned int lowest;
 	unsigned int highest;
 	unsigned int steps;
 
 	/* cpuid should always be in the valid set */
 	MPASS(CPU_ISSET(cpuid, &ctx->ifc_cpus));
 
 	/* valid set should never be empty */
 	MPASS(!CPU_EMPTY(&ctx->ifc_cpus));
 
 	lowest = CPU_FFS(&ctx->ifc_cpus) - 1;
 	highest = CPU_FLS(&ctx->ifc_cpus) - 1;
 
 	for (steps = n % CPU_COUNT(&ctx->ifc_cpus); steps > 0; steps--) {
 		/* Move to the next member, skipping IDs not in the set. */
 		do {
 			if (++cpuid > highest)
 				cpuid = lowest;
 		} while (!CPU_ISSET(cpuid, &ctx->ifc_cpus));
 	}
 
 	return (cpuid);
 }
 
 #if defined(SMP) && defined(SCHED_ULE)
 extern struct cpu_group *cpu_top;              /* CPU topology */
 
 /*
  * Return the index of the first child of 'grp' whose CPU mask contains
  * 'cpu', or -1 when the group has no children or none of them contains
  * it.
  */
 static int
 find_child_with_core(int cpu, struct cpu_group *grp)
 {
 	int idx;
 	int found = -1;
 
 	if (grp->cg_children != 0) {
 		MPASS(grp->cg_child);
 		for (idx = 0; idx < grp->cg_children && found == -1; idx++) {
 			if (CPU_ISSET(cpu, &grp->cg_child[idx].cg_mask))
 				found = idx;
 		}
 	}
 
 	return (found);
 }
 
 
 /*
  * Return the ID of a CPU that shares an L2 cache group with the given
  * CPU, or -1 when there is none.  If the CPU has several L2 neighbors,
  * no attempt is made to choose among them: the same one (the
  * lowest-numbered) is returned every time.
  */
 static int
 find_l2_neighbor(int cpu)
 {
 	struct cpu_group *grp, *child;
 	int idx;
 
 	grp = cpu_top;
 	if (grp == NULL)
 		return (-1);
 
 	/*
 	 * Descend the topology to the smallest CPU group that contains
 	 * the given core.
 	 */
 	for (;;) {
 		idx = find_child_with_core(cpu, grp);
 		if (idx == -1)
 			break;
 		child = &grp->cg_child[idx];
 		/*
 		 * A containing group with fewer than two members means
 		 * the given CPU has no L2 neighbor.
 		 */
 		if (child->cg_count <= 1)
 			return (-1);
 		grp = child;
 	}
 
 	/* Must share L2. */
 	if (grp->cg_level == CG_SHARE_NONE || grp->cg_level > CG_SHARE_L2)
 		return (-1);
 
 	/*
 	 * Pick the first member of the group other than the reference
 	 * CPU; at this point one is guaranteed to exist.
 	 */
 	for (idx = 0; idx < CPU_SETSIZE; idx++) {
 		if (idx == cpu)
 			continue;
 		if (CPU_ISSET(idx, &grp->cg_mask))
 			return (idx);
 	}
 
 	/* Should never be reached */
 	return (-1);
 }
 
 #else
 /*
  * Fallback for kernels built without both SMP and SCHED_ULE: always
  * reports that the given CPU has no L2 neighbor.
  */
 static int
 find_l2_neighbor(int cpu)
 {
 	(void)cpu;
 
 	return (-1);
 }
 #endif
 
 /*
  * CPU mapping behaviors
  * ---------------------
  * 'separate txrx' refers to the separate_txrx sysctl
  * 'use logical' refers to the use_logical_cores sysctl
  * 'INTR CPUS' indicates whether bus_get_cpus(INTR_CPUS) succeeded
  *
  *  separate     use     INTR
  *    txrx     logical   CPUS   result
  * ---------- --------- ------ ------------------------------------------------
  *     -          -       X     RX and TX queues mapped to consecutive physical
  *                              cores with RX/TX pairs on same core and excess
  *                              of either following
  *     -          X       X     RX and TX queues mapped to consecutive cores
  *                              of any type with RX/TX pairs on same core and
  *                              excess of either following
  *     X          -       X     RX and TX queues mapped to consecutive physical
  *                              cores; all RX then all TX
  *     X          X       X     RX queues mapped to consecutive physical cores
  *                              first, then TX queues mapped to L2 neighbor of
  *                              the corresponding RX queue if one exists,
  *                              otherwise to consecutive physical cores
  *     -         n/a      -     RX and TX queues mapped to consecutive cores of
  *                              any type with RX/TX pairs on same core and excess
  *                              of either following
  *     X         n/a      -     RX and TX queues mapped to consecutive cores of
  *                              any type; all RX then all TX
  */
 static unsigned int
 get_cpuid_for_queue(if_ctx_t ctx, unsigned int base_cpuid, unsigned int qid,
     bool is_tx)
 {
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	unsigned int rx_cpuid;
 	int neighbor;
 
 	/*
 	 * Shared TX/RX CPUs, or an RX queue in separate mode: the queue
 	 * simply gets the qid-th CPU after the base.
 	 */
 	if (!ctx->ifc_sysctl_separate_txrx || !is_tx)
 		return (cpuid_advance(ctx, base_cpuid, qid));
 
 	/*
 	 * TX queue with separate TX/RX CPUs.  Normally it gets a
 	 * consecutive CPU after all of the RX CPUs.  The one exception is
 	 * when the context CPUs are physical cores, the use of logical
 	 * cores has been enabled, this TX qid has a corresponding RX qid,
 	 * and the CPU assigned to that RX queue has an L2 neighbor: then
 	 * the neighbor is used.
 	 */
 	if (ctx->ifc_sysctl_use_logical_cores &&
 	    ctx->ifc_cpus_are_physical_cores &&
 	    qid < scctx->isc_nrxqsets) {
 		rx_cpuid = cpuid_advance(ctx, base_cpuid, qid);
 		neighbor = find_l2_neighbor(rx_cpuid);
 		if (neighbor != -1)
 			return (neighbor);
 		/*
 		 * No neighbor: use the normal consecutive-after-RX
 		 * assignment below.
 		 *
 		 * Note that we are assuming that all RX queue CPUs have
 		 * an L2 neighbor, or all do not.  If a mixed scenario is
 		 * possible, we will have to keep track separately of how
 		 * many queues prior to this one were not able to be
 		 * assigned to an L2 neighbor.
 		 */
 	}
 
 	return (cpuid_advance(ctx, base_cpuid, scctx->isc_nrxqsets + qid));
 }
 
 /*
  * Choose the base CPU ID from which this context's queue-to-CPU
  * assignments start.
  *
  * If the core_offset sysctl was set, the value is mapped onto a member of
  * ctx->ifc_cpus and returned.  Otherwise the base is taken from the
  * shared cpu_offsets list entry for this CPU set (created on first use),
  * and that entry's next_cpuid is advanced by the number of context CPUs
  * this device will consume, so that the next device using the same CPU
  * set starts after them.  Each such lookup takes a reference on the
  * entry.
  */
 static uint16_t
 get_ctx_core_offset(if_ctx_t ctx)
 {
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	struct cpu_offset *op;
 	cpuset_t assigned_cpus;
 	unsigned int cores_consumed;
 	unsigned int base_cpuid = ctx->ifc_sysctl_core_offset;
 	unsigned int first_valid;
 	unsigned int last_valid;
 	unsigned int i;
 
 	first_valid = CPU_FFS(&ctx->ifc_cpus) - 1;
 	last_valid = CPU_FLS(&ctx->ifc_cpus) - 1;
 
 	if (base_cpuid != CORE_OFFSET_UNSPECIFIED) {
 		/*
 		 * Align the user-chosen base CPU ID to the next valid CPU
 		 * for this device.  If the chosen base CPU ID is smaller
 		 * than the first valid CPU or larger than the last valid
 		 * CPU, we assume the user does not know what the valid
 		 * range is for this device and is thinking in terms of a
 		 * zero-based reference frame, and so we shift the given
 		 * value into the valid range (and wrap accordingly) so the
 		 * intent is translated to the proper frame of reference.
 		 * If the base CPU ID is within the valid first/last, but
 		 * does not correspond to a valid CPU, it is advanced to the
 		 * next valid CPU (wrapping if necessary).
 		 */
 		if (base_cpuid < first_valid || base_cpuid > last_valid) {
 			/*
 			 * Wrap the zero-based value to the width of the
 			 * valid range, then shift it so the result lies
 			 * in [first_valid, last_valid].  The shift must be
 			 * applied after the modulo; otherwise the result
 			 * stays zero-based and the requested offset is
 			 * lost whenever first_valid is not 0.
 			 */
 			base_cpuid = first_valid +
 			    (base_cpuid % (last_valid - first_valid + 1));
 		}
 		/*
 		 * base_cpuid is now in [first_valid, last_valid].  If it
 		 * is not a member of the valid set, there is always a
 		 * member with a greater CPU ID (last_valid at the
 		 * latest), so simply advance to it.
 		 */
 		while (!CPU_ISSET(base_cpuid, &ctx->ifc_cpus))
 			base_cpuid++;
 		return (base_cpuid);
 	}
 
 	/*
 	 * Determine how many cores will be consumed by performing the CPU
 	 * assignments and counting how many of the assigned CPUs correspond
 	 * to CPUs in the set of context CPUs.  This is done using the CPU
 	 * ID first_valid as the base CPU ID, as the base CPU must be within
 	 * the set of context CPUs.
 	 *
 	 * Note not all assigned CPUs will be in the set of context CPUs
 	 * when separate CPUs are being allocated to TX and RX queues,
 	 * assignment to logical cores has been enabled, the set of context
 	 * CPUs contains only physical CPUs, and TX queues are mapped to L2
 	 * neighbors of CPUs that RX queues have been mapped to - in this
 	 * case we do only want to count how many CPUs in the set of context
 	 * CPUs have been consumed, as that determines the next CPU in that
 	 * set to start allocating at for the next device for which
 	 * core_offset is not set.
 	 */
 	CPU_ZERO(&assigned_cpus);
 	for (i = 0; i < scctx->isc_ntxqsets; i++)
 		CPU_SET(get_cpuid_for_queue(ctx, first_valid, i, true),
 		    &assigned_cpus);
 	for (i = 0; i < scctx->isc_nrxqsets; i++)
 		CPU_SET(get_cpuid_for_queue(ctx, first_valid, i, false),
 		    &assigned_cpus);
 	CPU_AND(&assigned_cpus, &assigned_cpus, &ctx->ifc_cpus);
 	cores_consumed = CPU_COUNT(&assigned_cpus);
 
 	mtx_lock(&cpu_offset_mtx);
 	/* Reuse the bookkeeping entry for an identical CPU set, if any. */
 	SLIST_FOREACH(op, &cpu_offsets, entries) {
 		if (CPU_CMP(&ctx->ifc_cpus, &op->set) == 0) {
 			base_cpuid = op->next_cpuid;
 			op->next_cpuid = cpuid_advance(ctx, op->next_cpuid,
 			    cores_consumed);
 			MPASS(op->refcount < UINT_MAX);
 			op->refcount++;
 			break;
 		}
 	}
 	/* No entry matched: start at first_valid and record a new entry. */
 	if (base_cpuid == CORE_OFFSET_UNSPECIFIED) {
 		base_cpuid = first_valid;
 		op = malloc(sizeof(struct cpu_offset), M_IFLIB,
 		    M_NOWAIT | M_ZERO);
 		if (op == NULL) {
 			/*
 			 * Allocation failure is not fatal: the base is
 			 * still returned, but nothing is recorded for
 			 * later devices.
 			 */
 			device_printf(ctx->ifc_dev,
 			    "allocation for cpu offset failed.\n");
 		} else {
 			op->next_cpuid = cpuid_advance(ctx, base_cpuid,
 			    cores_consumed);
 			op->refcount = 1;
 			CPU_COPY(&ctx->ifc_cpus, &op->set);
 			SLIST_INSERT_HEAD(&cpu_offsets, op, entries);
 		}
 	}
 	mtx_unlock(&cpu_offset_mtx);
 
 	return (base_cpuid);
 }
 
 /*
  * Drop one reference on the cpu_offsets entry whose CPU set equals
  * ctx->ifc_cpus.  The entry is unlinked and freed when its reference
  * count reaches zero.  Does nothing if no entry matches.
  */
 static void
 unref_ctx_core_offset(if_ctx_t ctx)
 {
 	struct cpu_offset *op, *next;
 
 	mtx_lock(&cpu_offset_mtx);
 	SLIST_FOREACH_SAFE(op, &cpu_offsets, entries, next) {
 		if (CPU_CMP(&ctx->ifc_cpus, &op->set) != 0)
 			continue;
 
 		MPASS(op->refcount > 0);
 		if (--op->refcount == 0) {
 			SLIST_REMOVE(&cpu_offsets, op, cpu_offset, entries);
 			free(op, M_IFLIB);
 		}
 		/* At most one entry is handled per call. */
 		break;
 	}
 	mtx_unlock(&cpu_offset_mtx);
 }
 
 /*
  * Attach an iflib-based device.
  *
  * Allocates the iflib context for 'dev', runs the driver's attach-pre
  * hook, sizes queues and descriptors, sets up interrupts (MSI-X, MSI or
  * legacy), allocates the queues, attaches the Ethernet interface and runs
  * the driver's attach-post hook.
  *
  * dev   device being attached
  * sc    driver softc; when NULL, one of sctx->isc_driver->size bytes is
  *       allocated here, the device softc is set to the new context and
  *       the context is flagged IFC_SC_ALLOCATED
  * sctx  driver's shared context
  * ctxp  receives the new context on success
  *
  * Returns 0 on success.  On failure returns the error from the step that
  * failed, after releasing what was set up so far via the fail_* labels at
  * the end of the function.
  */
 int
 iflib_device_register(device_t dev, void *sc, if_shared_ctx_t sctx, if_ctx_t *ctxp)
 {
 	if_ctx_t ctx;
 	if_t ifp;
 	if_softc_ctx_t scctx;
 	kobjop_desc_t kobj_desc;
 	kobj_method_t *kobj_method;
 	int err, msix, rid;
 	int num_txd, num_rxd;
 
 	ctx = malloc(sizeof(* ctx), M_IFLIB, M_WAITOK|M_ZERO);
 
 	if (sc == NULL) {
 		sc = malloc(sctx->isc_driver->size, M_IFLIB, M_WAITOK|M_ZERO);
 		device_set_softc(dev, ctx);
 		ctx->ifc_flags |= IFC_SC_ALLOCATED;
 	}
 
 	ctx->ifc_sctx = sctx;
 	ctx->ifc_dev = dev;
 	ctx->ifc_softc = sc;
 
 	if ((err = iflib_register(ctx)) != 0) {
 		device_printf(dev, "iflib_register failed %d\n", err);
 		goto fail_ctx_free;
 	}
 	iflib_add_device_sysctl_pre(ctx);
 
 	scctx = &ctx->ifc_softc_ctx;
 	ifp = ctx->ifc_ifp;
 
 	iflib_reset_qvalues(ctx);
 	/* Both locks stay held until the success return or fail_unlock. */
 	IFNET_WLOCK();
 	CTX_LOCK(ctx);
 	if ((err = IFDI_ATTACH_PRE(ctx)) != 0) {
 		device_printf(dev, "IFDI_ATTACH_PRE failed %d\n", err);
 		goto fail_unlock;
 	}
 	_iflib_pre_assert(scctx);
 	ctx->ifc_txrx = *scctx->isc_txrx;
 
 	MPASS(scctx->isc_dma_width <= flsll(BUS_SPACE_MAXADDR));
 
 	if (sctx->isc_flags & IFLIB_DRIVER_MEDIA)
 		ctx->ifc_mediap = scctx->isc_media;
 
 #ifdef INVARIANTS
 	if (scctx->isc_capabilities & IFCAP_TXCSUM)
 		MPASS(scctx->isc_tx_csum_flags);
 #endif
 
 	/* HWSTATS and MEXTPG are always added to what the driver reports. */
 	if_setcapabilities(ifp,
 	    scctx->isc_capabilities | IFCAP_HWSTATS | IFCAP_MEXTPG);
 	if_setcapenable(ifp,
 	    scctx->isc_capenable | IFCAP_HWSTATS | IFCAP_MEXTPG);
 
 	/*
 	 * Use the maximum queue-set count when none was chosen, or when
 	 * the chosen count exceeds a non-zero maximum.
 	 */
 	if (scctx->isc_ntxqsets == 0 || (scctx->isc_ntxqsets_max && scctx->isc_ntxqsets_max < scctx->isc_ntxqsets))
 		scctx->isc_ntxqsets = scctx->isc_ntxqsets_max;
 	if (scctx->isc_nrxqsets == 0 || (scctx->isc_nrxqsets_max && scctx->isc_nrxqsets_max < scctx->isc_nrxqsets))
 		scctx->isc_nrxqsets = scctx->isc_nrxqsets_max;
 
 	num_txd = iflib_num_tx_descs(ctx);
 	num_rxd = iflib_num_rx_descs(ctx);
 
 	/* XXX change for per-queue sizes */
 	device_printf(dev, "Using %d TX descriptors and %d RX descriptors\n",
 	    num_txd, num_rxd);
 
 	/*
 	 * Cap the per-packet segment limits at
 	 * num_txd / MAX_SINGLE_PACKET_FRACTION, but never below 1.
 	 */
 	if (scctx->isc_tx_nsegments > num_txd / MAX_SINGLE_PACKET_FRACTION)
 		scctx->isc_tx_nsegments = max(1, num_txd /
 		    MAX_SINGLE_PACKET_FRACTION);
 	if (scctx->isc_tx_tso_segments_max > num_txd /
 	    MAX_SINGLE_PACKET_FRACTION)
 		scctx->isc_tx_tso_segments_max = max(1,
 		    num_txd / MAX_SINGLE_PACKET_FRACTION);
 
 	/* TSO parameters - dig these out of the data sheet - simply correspond to tag setup */
 	if (if_getcapabilities(ifp) & IFCAP_TSO) {
 		/*
 		 * The stack can't handle a TSO size larger than IP_MAXPACKET,
 		 * but some MACs do.
 		 */
 		if_sethwtsomax(ifp, min(scctx->isc_tx_tso_size_max,
 		    IP_MAXPACKET));
 		/*
 		 * Take maximum number of m_pullup(9)'s in iflib_parse_header()
 		 * into account.  In the worst case, each of these calls will
 		 * add another mbuf and, thus, the requirement for another DMA
 		 * segment.  So for best performance, it doesn't make sense to
 		 * advertize a maximum of TSO segments that typically will
 		 * require defragmentation in iflib_encap().
 		 */
 		if_sethwtsomaxsegcount(ifp, scctx->isc_tx_tso_segments_max - 3);
 		if_sethwtsomaxsegsize(ifp, scctx->isc_tx_tso_segsize_max);
 	}
 	/* Default the RSS table to 64 entries when the driver set none. */
 	if (scctx->isc_rss_table_size == 0)
 		scctx->isc_rss_table_size = 64;
 	scctx->isc_rss_table_mask = scctx->isc_rss_table_size-1;
 
 	GROUPTASK_INIT(&ctx->ifc_admin_task, 0, _task_fn_admin, ctx);
 	/* XXX format name */
 	taskqgroup_attach(qgroup_if_config_tqg, &ctx->ifc_admin_task, ctx,
 	    NULL, NULL, "admin");
 
 	/* Set up cpu set.  If it fails, use the set of all CPUs. */
 	if (bus_get_cpus(dev, INTR_CPUS, sizeof(ctx->ifc_cpus), &ctx->ifc_cpus) != 0) {
 		device_printf(dev, "Unable to fetch CPU list\n");
 		CPU_COPY(&all_cpus, &ctx->ifc_cpus);
 		ctx->ifc_cpus_are_physical_cores = false;
 	} else
 		ctx->ifc_cpus_are_physical_cores = true;
 	MPASS(CPU_COUNT(&ctx->ifc_cpus) > 0);
 
 	/*
 	** Now set up MSI or MSI-X, should return us the number of supported
 	** vectors (will be 1 for a legacy interrupt and MSI).
 	*/
 	if (sctx->isc_flags & IFLIB_SKIP_MSIX) {
 		msix = scctx->isc_vectors;
 	} else if (scctx->isc_msix_bar != 0)
 	       /*
 		* The simple fact that isc_msix_bar is not 0 does not mean
 		* we have a good value there that is known to work.
 		*/
 		msix = iflib_msix_init(ctx);
 	else {
 		/* No MSI-X BAR: one vector, one queue set each way, legacy. */
 		scctx->isc_vectors = 1;
 		scctx->isc_ntxqsets = 1;
 		scctx->isc_nrxqsets = 1;
 		scctx->isc_intr = IFLIB_INTR_LEGACY;
 		msix = 0;
 	}
 	/* Get memory for the station queues */
 	if ((err = iflib_queues_alloc(ctx))) {
 		device_printf(dev, "Unable to allocate queue memory\n");
 		goto fail_intr_free;
 	}
 
 	if ((err = iflib_qset_structures_setup(ctx)))
 		goto fail_queues;
 
 	/*
 	 * Now that we know how many queues there are, get the core offset.
 	 */
 	ctx->ifc_sysctl_core_offset = get_ctx_core_offset(ctx);
 
 	if (msix > 1) {
 		/*
 		 * When using MSI-X, ensure that ifdi_{r,t}x_queue_intr_enable
 		 * aren't the default NULL implementation.
 		 */
 		kobj_desc = &ifdi_rx_queue_intr_enable_desc;
 		kobj_method = kobj_lookup_method(((kobj_t)ctx)->ops->cls, NULL,
 		    kobj_desc);
 		if (kobj_method == &kobj_desc->deflt) {
 			/*
 			 * NOTE(review): unlike the other messages in this
 			 * function, this string has no trailing newline.
 			 */
 			device_printf(dev,
 			    "MSI-X requires ifdi_rx_queue_intr_enable method");
 			err = EOPNOTSUPP;
 			goto fail_queues;
 		}
 		kobj_desc = &ifdi_tx_queue_intr_enable_desc;
 		kobj_method = kobj_lookup_method(((kobj_t)ctx)->ops->cls, NULL,
 		    kobj_desc);
 		if (kobj_method == &kobj_desc->deflt) {
 			/* NOTE(review): no trailing newline here either. */
 			device_printf(dev,
 			    "MSI-X requires ifdi_tx_queue_intr_enable method");
 			err = EOPNOTSUPP;
 			goto fail_queues;
 		}
 
 		/*
 		 * Assign the MSI-X vectors.
 		 * Note that the default NULL ifdi_msix_intr_assign method will
 		 * fail here, too.
 		 */
 		err = IFDI_MSIX_INTR_ASSIGN(ctx, msix);
 		if (err != 0) {
 			device_printf(dev, "IFDI_MSIX_INTR_ASSIGN failed %d\n",
 			    err);
 			goto fail_queues;
 		}
 	} else if (scctx->isc_intr != IFLIB_INTR_MSIX) {
 		/* Legacy interrupt uses rid 0, MSI uses rid 1. */
 		rid = 0;
 		if (scctx->isc_intr == IFLIB_INTR_MSI) {
 			MPASS(msix == 1);
 			rid = 1;
 		}
 		if ((err = iflib_legacy_setup(ctx, ctx->isc_legacy_intr, ctx->ifc_softc, &rid, "irq0")) != 0) {
 			device_printf(dev, "iflib_legacy_setup failed %d\n", err);
 			goto fail_queues;
 		}
 	} else {
 		device_printf(dev,
 		    "Cannot use iflib with only 1 MSI-X interrupt!\n");
 		err = ENODEV;
 		goto fail_queues;
 	}
 
 	ether_ifattach(ctx->ifc_ifp, ctx->ifc_mac.octet);
 
 	if ((err = IFDI_ATTACH_POST(ctx)) != 0) {
 		device_printf(dev, "IFDI_ATTACH_POST failed %d\n", err);
 		goto fail_detach;
 	}
 
 	/*
 	 * Tell the upper layer(s) if IFCAP_VLAN_MTU is supported.
 	 * This must appear after the call to ether_ifattach() because
 	 * ether_ifattach() sets if_hdrlen to the default value.
 	 */
 	if (if_getcapabilities(ifp) & IFCAP_VLAN_MTU)
 		if_setifheaderlen(ifp, sizeof(struct ether_vlan_header));
 
 	if ((err = iflib_netmap_attach(ctx))) {
 		device_printf(ctx->ifc_dev, "netmap attach failed: %d\n", err);
 		goto fail_detach;
 	}
 	*ctxp = ctx;
 
 	DEBUGNET_SET(ctx->ifc_ifp, iflib);
 
 	if_setgetcounterfn(ctx->ifc_ifp, iflib_if_get_counter);
 	iflib_add_device_sysctl_post(ctx);
 	iflib_add_pfil(ctx);
 	ctx->ifc_flags |= IFC_INIT_DONE;
 	CTX_UNLOCK(ctx);
 	IFNET_WUNLOCK();
 
 	return (0);
 
 	/*
 	 * Error unwinding: each label releases what was set up before the
 	 * matching failure point and then falls through to the labels below.
 	 */
 fail_detach:
 	ether_ifdetach(ctx->ifc_ifp);
 fail_queues:
 	iflib_tqg_detach(ctx);
 	iflib_tx_structures_free(ctx);
 	iflib_rx_structures_free(ctx);
 	IFDI_DETACH(ctx);
 	IFDI_QUEUES_FREE(ctx);
 fail_intr_free:
 	iflib_free_intr_mem(ctx);
 fail_unlock:
 	CTX_UNLOCK(ctx);
 	IFNET_WUNLOCK();
 	iflib_deregister(ctx);
 fail_ctx_free:
 	device_set_softc(ctx->ifc_dev, NULL);
 	/* The softc is freed only when it was allocated in this function. */
         if (ctx->ifc_flags & IFC_SC_ALLOCATED)
                 free(ctx->ifc_softc, M_IFLIB);
         free(ctx, M_IFLIB);
 	return (err);
 }
 
 int
 iflib_pseudo_register(device_t dev, if_shared_ctx_t sctx, if_ctx_t *ctxp,
 					  struct iflib_cloneattach_ctx *clctx)
 {
 	int num_txd, num_rxd;
 	int err;
 	if_ctx_t ctx;
 	if_t ifp;
 	if_softc_ctx_t scctx;
 	int i;
 	void *sc;
 
 	ctx = malloc(sizeof(*ctx), M_IFLIB, M_WAITOK|M_ZERO);
 	sc = malloc(sctx->isc_driver->size, M_IFLIB, M_WAITOK|M_ZERO);
 	ctx->ifc_flags |= IFC_SC_ALLOCATED;
 	if (sctx->isc_flags & (IFLIB_PSEUDO|IFLIB_VIRTUAL))
 		ctx->ifc_flags |= IFC_PSEUDO;
 
 	ctx->ifc_sctx = sctx;
 	ctx->ifc_softc = sc;
 	ctx->ifc_dev = dev;
 
 	if ((err = iflib_register(ctx)) != 0) {
 		device_printf(dev, "%s: iflib_register failed %d\n", __func__, err);
 		goto fail_ctx_free;
 	}
 	iflib_add_device_sysctl_pre(ctx);
 
 	scctx = &ctx->ifc_softc_ctx;
 	ifp = ctx->ifc_ifp;
 
 	iflib_reset_qvalues(ctx);
 	CTX_LOCK(ctx);
 	if ((err = IFDI_ATTACH_PRE(ctx)) != 0) {
 		device_printf(dev, "IFDI_ATTACH_PRE failed %d\n", err);
 		goto fail_unlock;
 	}
 	if (sctx->isc_flags & IFLIB_GEN_MAC)
 		ether_gen_addr(ifp, &ctx->ifc_mac);
 	if ((err = IFDI_CLONEATTACH(ctx, clctx->cc_ifc, clctx->cc_name,
 								clctx->cc_params)) != 0) {
 		device_printf(dev, "IFDI_CLONEATTACH failed %d\n", err);
 		goto fail_unlock;
 	}
 #ifdef INVARIANTS
 	if (scctx->isc_capabilities & IFCAP_TXCSUM)
 		MPASS(scctx->isc_tx_csum_flags);
 #endif
 
 	if_setcapabilities(ifp, scctx->isc_capabilities | IFCAP_HWSTATS | IFCAP_LINKSTATE);
 	if_setcapenable(ifp, scctx->isc_capenable | IFCAP_HWSTATS | IFCAP_LINKSTATE);
 
 	if_setflagbits(ifp, IFF_NOGROUP, 0);
 	if (sctx->isc_flags & IFLIB_PSEUDO) {
 		ifmedia_add(ctx->ifc_mediap, IFM_ETHER | IFM_AUTO, 0, NULL);
 		ifmedia_set(ctx->ifc_mediap, IFM_ETHER | IFM_AUTO);
 		if (sctx->isc_flags & IFLIB_PSEUDO_ETHER) {
 			ether_ifattach(ctx->ifc_ifp, ctx->ifc_mac.octet);
 		} else {
 			if_attach(ctx->ifc_ifp);
 			bpfattach(ctx->ifc_ifp, DLT_NULL, sizeof(u_int32_t));
 		}
 
 		if ((err = IFDI_ATTACH_POST(ctx)) != 0) {
 			device_printf(dev, "IFDI_ATTACH_POST failed %d\n", err);
 			goto fail_detach;
 		}
 		*ctxp = ctx;
 
 		/*
 		 * Tell the upper layer(s) if IFCAP_VLAN_MTU is supported.
 		 * This must appear after the call to ether_ifattach() because
 		 * ether_ifattach() sets if_hdrlen to the default value.
 		 */
 		if (if_getcapabilities(ifp) & IFCAP_VLAN_MTU)
 			if_setifheaderlen(ifp,
 			    sizeof(struct ether_vlan_header));
 
 		if_setgetcounterfn(ctx->ifc_ifp, iflib_if_get_counter);
 		iflib_add_device_sysctl_post(ctx);
 		ctx->ifc_flags |= IFC_INIT_DONE;
 		CTX_UNLOCK(ctx);
 		return (0);
 	}
 	ifmedia_add(ctx->ifc_mediap, IFM_ETHER | IFM_1000_T | IFM_FDX, 0, NULL);
 	ifmedia_add(ctx->ifc_mediap, IFM_ETHER | IFM_AUTO, 0, NULL);
 	ifmedia_set(ctx->ifc_mediap, IFM_ETHER | IFM_AUTO);
 
 	_iflib_pre_assert(scctx);
 	ctx->ifc_txrx = *scctx->isc_txrx;
 
 	if (scctx->isc_ntxqsets == 0 || (scctx->isc_ntxqsets_max && scctx->isc_ntxqsets_max < scctx->isc_ntxqsets))
 		scctx->isc_ntxqsets = scctx->isc_ntxqsets_max;
 	if (scctx->isc_nrxqsets == 0 || (scctx->isc_nrxqsets_max && scctx->isc_nrxqsets_max < scctx->isc_nrxqsets))
 		scctx->isc_nrxqsets = scctx->isc_nrxqsets_max;
 
 	num_txd = iflib_num_tx_descs(ctx);
 	num_rxd = iflib_num_rx_descs(ctx);
 
 	/* XXX change for per-queue sizes */
 	device_printf(dev, "Using %d TX descriptors and %d RX descriptors\n",
 	    num_txd, num_rxd);
 
 	if (scctx->isc_tx_nsegments > num_txd / MAX_SINGLE_PACKET_FRACTION)
 		scctx->isc_tx_nsegments = max(1, num_txd /
 		    MAX_SINGLE_PACKET_FRACTION);
 	if (scctx->isc_tx_tso_segments_max > num_txd /
 	    MAX_SINGLE_PACKET_FRACTION)
 		scctx->isc_tx_tso_segments_max = max(1,
 		    num_txd / MAX_SINGLE_PACKET_FRACTION);
 
 	/* TSO parameters - dig these out of the data sheet - simply correspond to tag setup */
 	if (if_getcapabilities(ifp) & IFCAP_TSO) {
 		/*
 		 * The stack can't handle a TSO size larger than IP_MAXPACKET,
 		 * but some MACs do.
 		 */
 		if_sethwtsomax(ifp, min(scctx->isc_tx_tso_size_max,
 		    IP_MAXPACKET));
 		/*
 		 * Take maximum number of m_pullup(9)'s in iflib_parse_header()
 		 * into account.  In the worst case, each of these calls will
 		 * add another mbuf and, thus, the requirement for another DMA
 		 * segment.  So for best performance, it doesn't make sense to
 		 * advertize a maximum of TSO segments that typically will
 		 * require defragmentation in iflib_encap().
 		 */
 		if_sethwtsomaxsegcount(ifp, scctx->isc_tx_tso_segments_max - 3);
 		if_sethwtsomaxsegsize(ifp, scctx->isc_tx_tso_segsize_max);
 	}
 	if (scctx->isc_rss_table_size == 0)
 		scctx->isc_rss_table_size = 64;
 	scctx->isc_rss_table_mask = scctx->isc_rss_table_size-1;
 
 	GROUPTASK_INIT(&ctx->ifc_admin_task, 0, _task_fn_admin, ctx);
 	/* XXX format name */
 	taskqgroup_attach(qgroup_if_config_tqg, &ctx->ifc_admin_task, ctx,
 	    NULL, NULL, "admin");
 
 	/* XXX --- can support > 1 -- but keep it simple for now */
 	scctx->isc_intr = IFLIB_INTR_LEGACY;
 
 	/* Get memory for the station queues */
 	if ((err = iflib_queues_alloc(ctx))) {
 		device_printf(dev, "Unable to allocate queue memory\n");
 		goto fail_iflib_detach;
 	}
 
 	if ((err = iflib_qset_structures_setup(ctx))) {
 		device_printf(dev, "qset structure setup failed %d\n", err);
 		goto fail_queues;
 	}
 
 	/*
 	 * XXX What if anything do we want to do about interrupts?
 	 */
 	ether_ifattach(ctx->ifc_ifp, ctx->ifc_mac.octet);
 	if ((err = IFDI_ATTACH_POST(ctx)) != 0) {
 		device_printf(dev, "IFDI_ATTACH_POST failed %d\n", err);
 		goto fail_detach;
 	}
 
 	/*
 	 * Tell the upper layer(s) if IFCAP_VLAN_MTU is supported.
 	 * This must appear after the call to ether_ifattach() because
 	 * ether_ifattach() sets if_hdrlen to the default value.
 	 */
 	if (if_getcapabilities(ifp) & IFCAP_VLAN_MTU)
 		if_setifheaderlen(ifp, sizeof(struct ether_vlan_header));
 
 	/* XXX handle more than one queue */
 	for (i = 0; i < scctx->isc_nrxqsets; i++)
 		IFDI_RX_CLSET(ctx, 0, i, ctx->ifc_rxqs[i].ifr_fl[0].ifl_sds.ifsd_cl);
 
 	*ctxp = ctx;
 
 	if_setgetcounterfn(ctx->ifc_ifp, iflib_if_get_counter);
 	iflib_add_device_sysctl_post(ctx);
 	ctx->ifc_flags |= IFC_INIT_DONE;
 	CTX_UNLOCK(ctx);
 
 	return (0);
 fail_detach:
 	ether_ifdetach(ctx->ifc_ifp);
 fail_queues:
 	iflib_tqg_detach(ctx);
 	iflib_tx_structures_free(ctx);
 	iflib_rx_structures_free(ctx);
 fail_iflib_detach:
 	IFDI_DETACH(ctx);
 	IFDI_QUEUES_FREE(ctx);
 fail_unlock:
 	CTX_UNLOCK(ctx);
 	iflib_deregister(ctx);
 fail_ctx_free:
 	free(ctx->ifc_softc, M_IFLIB);
 	free(ctx, M_IFLIB);
 	return (err);
 }
 
 /*
  * Tear down a pseudo interface: detach it from the network stack, release
  * the task and queue state, hand control to the driver's detach hooks and
  * finally free the context.  Always returns 0.
  */
 int
 iflib_pseudo_deregister(if_ctx_t ctx)
 {
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	if_t ifp = ctx->ifc_ifp;
 	bool ether_like;
 
 	/* The VLAN event hooks go first. */
 	iflib_unregister_vlan_handlers(ctx);
 
 	/*
 	 * Only a pseudo device that is not flagged as Ethernet takes the
 	 * bpfdetach()/if_detach() path; everything else detaches via
 	 * ether_ifdetach().
 	 */
 	ether_like = (sctx->isc_flags & IFLIB_PSEUDO) == 0 ||
 	    (sctx->isc_flags & IFLIB_PSEUDO_ETHER) != 0;
 	if (ether_like) {
 		ether_ifdetach(ifp);
 	} else {
 		bpfdetach(ifp);
 		if_detach(ifp);
 	}
 
 	/* Tasks, then TX/RX structures, then the driver callbacks. */
 	iflib_tqg_detach(ctx);
 	iflib_tx_structures_free(ctx);
 	iflib_rx_structures_free(ctx);
 	IFDI_DETACH(ctx);
 	IFDI_QUEUES_FREE(ctx);
 
 	iflib_deregister(ctx);
 
 	/* The softc is only ours to free when the flag says so. */
 	if (ctx->ifc_flags & IFC_SC_ALLOCATED)
 		free(ctx->ifc_softc, M_IFLIB);
 	free(ctx, M_IFLIB);
 
 	return (0);
 }
 
 /*
  * Device attach entry point: fetch the driver's shared context via
  * DEVICE_REGISTER(), validate it, enable PCI bus mastering and register the
  * device with iflib.  Returns ENOTSUP when the shared context is missing or
  * carries the wrong magic, otherwise whatever iflib_device_register()
  * returns.
  */
 int
 iflib_device_attach(device_t dev)
 {
 	if_shared_ctx_t sctx;
 	if_ctx_t ctx;
 
 	sctx = DEVICE_REGISTER(dev);
 	if (sctx == NULL)
 		return (ENOTSUP);
 	if (sctx->isc_magic != IFLIB_MAGIC)
 		return (ENOTSUP);
 
 	pci_enable_busmaster(dev);
 
 	return (iflib_device_register(dev, NULL, sctx, &ctx));
 }
 
 /*
  * Detach an iflib-managed device and free everything reachable from its
  * context, including the context itself.
  *
  * Returns EBUSY, leaving the device untouched, when a VLAN trunk still uses
  * the interface or (PCI_IOV kernels, non-VF contexts only) when
  * pci_iov_detach() reports failure.  Returns 0 once teardown has completed.
  */
 int
 iflib_device_deregister(if_ctx_t ctx)
 {
 	if_t ifp = ctx->ifc_ifp;
 	device_t dev = ctx->ifc_dev;
 
 	/* Make sure VLANS are not using driver */
 	if (if_vlantrunkinuse(ifp)) {
 		device_printf(dev, "Vlan in use, detach first\n");
 		return (EBUSY);
 	}
 #ifdef PCI_IOV
 	if (!CTX_IS_VF(ctx) && pci_iov_detach(dev) != 0) {
 		device_printf(dev, "SR-IOV in use; detach first.\n");
 		return (EBUSY);
 	}
 #endif
 
 	/* Past this point the detach cannot be refused; mark it in the flags. */
 	STATE_LOCK(ctx);
 	ctx->ifc_flags |= IFC_IN_DETACH;
 	STATE_UNLOCK(ctx);
 
 	/* Unregister VLAN handlers before calling iflib_stop() */
 	iflib_unregister_vlan_handlers(ctx);
 
 	iflib_netmap_detach(ifp);
 	ether_ifdetach(ifp);
 
 	/* iflib_stop() is called with the context lock held. */
 	CTX_LOCK(ctx);
 	iflib_stop(ctx);
 	CTX_UNLOCK(ctx);
 
 	iflib_rem_pfil(ctx);
 	if (ctx->ifc_led_dev != NULL)
 		led_destroy(ctx->ifc_led_dev);
 
 	/* Detach the group tasks, then free the TX and RX structures. */
 	iflib_tqg_detach(ctx);
 	iflib_tx_structures_free(ctx);
 	iflib_rx_structures_free(ctx);
 
 	/* The driver's detach and queue-free hooks run under the context lock. */
 	CTX_LOCK(ctx);
 	IFDI_DETACH(ctx);
 	IFDI_QUEUES_FREE(ctx);
 	CTX_UNLOCK(ctx);
 
 	/* ether_ifdetach calls if_qflush - lock must be destroyed afterwards */
 	iflib_free_intr_mem(ctx);
 
 	bus_generic_detach(dev);
 
 	/* iflib_deregister() frees the ifnet and destroys both context locks. */
 	iflib_deregister(ctx);
 
 	device_set_softc(ctx->ifc_dev, NULL);
 	/* The softc is only freed here when IFC_SC_ALLOCATED is set. */
 	if (ctx->ifc_flags & IFC_SC_ALLOCATED)
 		free(ctx->ifc_softc, M_IFLIB);
 	unref_ctx_core_offset(ctx);
 	free(ctx, M_IFLIB);
 	return (0);
 }
 
 /*
  * Drain the per-TX-queue callouts and detach every group task that was
  * attached for this context: the TX and RX queue tasks from the I/O task
  * group, the admin and VFLR tasks from the config task group.  A task is
  * treated as attached when its gt_uniq is non-NULL.
  */
 static void
 iflib_tqg_detach(if_ctx_t ctx)
 {
 	iflib_txq_t txqs = ctx->ifc_txqs;
 	iflib_rxq_t rxqs = ctx->ifc_rxqs;
 	iflib_txq_t txq;
 	iflib_rxq_t rxq;
 	int i;
 
 	/* XXX drain any dependent tasks */
 	for (i = 0; i < NTXQSETS(ctx); i++) {
 		txq = &txqs[i];
 		callout_drain(&txq->ift_timer);
 #ifdef DEV_NETMAP
 		callout_drain(&txq->ift_netmap_timer);
 #endif /* DEV_NETMAP */
 		if (txq->ift_task.gt_uniq != NULL)
 			taskqgroup_detach(qgroup_if_io_tqg, &txq->ift_task);
 	}
 
 	for (i = 0; i < NRXQSETS(ctx); i++) {
 		rxq = &rxqs[i];
 		if (rxq->ifr_task.gt_uniq != NULL)
 			taskqgroup_detach(qgroup_if_io_tqg, &rxq->ifr_task);
 	}
 
 	/* Context-wide tasks live in the config task group. */
 	if (ctx->ifc_admin_task.gt_uniq != NULL)
 		taskqgroup_detach(qgroup_if_config_tqg, &ctx->ifc_admin_task);
 	if (ctx->ifc_vflr_task.gt_uniq != NULL)
 		taskqgroup_detach(qgroup_if_config_tqg, &ctx->ifc_vflr_task);
 }
 
 /*
  * Release interrupt resources held by the context: the legacy IRQ unless
  * the mode is MSI-X, the MSI/MSI-X allocation unless the mode is legacy,
  * and the MSI-X table mapping when one is recorded.
  */
 static void
 iflib_free_intr_mem(if_ctx_t ctx)
 {
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	device_t dev = ctx->ifc_dev;
 
 	if (scctx->isc_intr != IFLIB_INTR_MSIX)
 		iflib_irq_free(ctx, &ctx->ifc_legacy_irq);
 	if (scctx->isc_intr != IFLIB_INTR_LEGACY)
 		pci_release_msi(dev);
 
 	if (ctx->ifc_msix_mem == NULL)
 		return;
 
 	bus_release_resource(dev, SYS_RES_MEMORY,
 	    rman_get_rid(ctx->ifc_msix_mem), ctx->ifc_msix_mem);
 	/* Clear the pointer so the mapping is not released twice. */
 	ctx->ifc_msix_mem = NULL;
 }
 
 /*
  * Device detach entry point: look up the context stored as the device softc
  * and hand it to iflib_device_deregister(), returning its result.
  */
 int
 iflib_device_detach(device_t dev)
 {
 
 	return (iflib_device_deregister(device_get_softc(dev)));
 }
 
 /*
  * Device suspend entry point: run the driver's suspend hook under the
  * context lock, then return the result of bus_generic_suspend().
  */
 int
 iflib_device_suspend(device_t dev)
 {
 	if_ctx_t ctx;
 
 	ctx = device_get_softc(dev);
 
 	CTX_LOCK(ctx);
 	IFDI_SUSPEND(ctx);
 	CTX_UNLOCK(ctx);
 
 	return (bus_generic_suspend(dev));
 }
 /*
  * Device shutdown entry point: run the driver's shutdown hook under the
  * context lock.
  *
  * NOTE(review): the return value comes from bus_generic_suspend(), not a
  * shutdown-specific bus routine.  This looks like a copy of the suspend
  * path -- confirm whether that is intentional before changing it.
  */
 int
 iflib_device_shutdown(device_t dev)
 {
 	if_ctx_t ctx;
 
 	ctx = device_get_softc(dev);
 
 	CTX_LOCK(ctx);
 	IFDI_SHUTDOWN(ctx);
 	CTX_UNLOCK(ctx);
 
 	return (bus_generic_suspend(dev));
 }
 
 /*
  * Device resume entry point: run the driver's resume hook and re-initialise
  * the interface under the context lock, then call iflib_txq_check_drain()
  * on every TX queue set before returning bus_generic_resume()'s result.
  */
 int
 iflib_device_resume(device_t dev)
 {
 	if_ctx_t ctx = device_get_softc(dev);
 	iflib_txq_t txqs = ctx->ifc_txqs;
 	int i;
 
 	CTX_LOCK(ctx);
 	IFDI_RESUME(ctx);
 	iflib_if_init_locked(ctx);
 	CTX_UNLOCK(ctx);
 
 	for (i = 0; i < NTXQSETS(ctx); i++)
 		iflib_txq_check_drain(&txqs[i], IFLIB_RESTART_BUDGET);
 
 	return (bus_generic_resume(dev));
 }
 
 /*
  * SR-IOV init entry point: forward num_vfs and params to the driver's
  * IFDI_IOV_INIT hook under the context lock and return its result.
  */
 int
 iflib_device_iov_init(device_t dev, uint16_t num_vfs, const nvlist_t *params)
 {
 	if_ctx_t ctx = device_get_softc(dev);
 	int rc;
 
 	CTX_LOCK(ctx);
 	rc = IFDI_IOV_INIT(ctx, num_vfs, params);
 	CTX_UNLOCK(ctx);
 
 	return (rc);
 }
 
 /*
  * SR-IOV uninit entry point: invoke the driver's IFDI_IOV_UNINIT hook under
  * the context lock.
  */
 void
 iflib_device_iov_uninit(device_t dev)
 {
 	if_ctx_t ctx;
 
 	ctx = device_get_softc(dev);
 
 	CTX_LOCK(ctx);
 	IFDI_IOV_UNINIT(ctx);
 	CTX_UNLOCK(ctx);
 }
 
 /*
  * SR-IOV add-VF entry point: forward vfnum and params to the driver's
  * IFDI_IOV_VF_ADD hook under the context lock and return its result.
  */
 int
 iflib_device_iov_add_vf(device_t dev, uint16_t vfnum, const nvlist_t *params)
 {
 	if_ctx_t ctx = device_get_softc(dev);
 	int rc;
 
 	CTX_LOCK(ctx);
 	rc = IFDI_IOV_VF_ADD(ctx, vfnum, params);
 	CTX_UNLOCK(ctx);
 
 	return (rc);
 }
 
 /*********************************************************************
  *
  *  MODULE FUNCTION DEFINITIONS
  *
  **********************************************************************/
 
 /*
  * Module load-time initialisation.  The only work done here is seeding the
  * default iflib timer interval with half of hz.  Always returns 0.
  */
 static int
 iflib_module_init(void)
 {
 
 	iflib_timer_default = hz / 2;
 
 	return (0);
 }
 
 /*
  * Module event handler.  MOD_LOAD returns the result of
  * iflib_module_init(), MOD_UNLOAD is always refused with EBUSY, and every
  * other event yields EOPNOTSUPP.
  */
 static int
 iflib_module_event_handler(module_t mod, int what, void *arg)
 {
 
 	if (what == MOD_LOAD)
 		return (iflib_module_init());
 	if (what == MOD_UNLOAD)
 		return (EBUSY);
 
 	return (EOPNOTSUPP);
 }
 
 /*********************************************************************
  *
  *  PUBLIC FUNCTION DEFINITIONS
  *     ordered as in iflib.h
  *
  **********************************************************************/
 
 /*
  * Sanity-check the driver-supplied shared context using MPASS (presumably
  * the kernel's debug-only assertion -- confirm): the TX/RX size limits must
  * be non-zero, the per-qset queue counts must lie in 1..8, and every
  * min/max/default descriptor count must be a non-zero power of two.
  */
 static void
 _iflib_assert(if_shared_ctx_t sctx)
 {
 	int i;
 
 	/* TX size limits. */
 	MPASS(sctx->isc_tx_maxsize);
 	MPASS(sctx->isc_tx_maxsegsize);
 
 	/* RX size limits. */
 	MPASS(sctx->isc_rx_maxsize);
 	MPASS(sctx->isc_rx_nsegments);
 	MPASS(sctx->isc_rx_maxsegsize);
 
 	/* RX descriptor counts, one triple per RX queue in a set. */
 	MPASS(sctx->isc_nrxqs >= 1 && sctx->isc_nrxqs <= 8);
 	for (i = 0; i < sctx->isc_nrxqs; i++) {
 		MPASS(sctx->isc_nrxd_min[i]);
 		MPASS(powerof2(sctx->isc_nrxd_min[i]));
 		MPASS(sctx->isc_nrxd_max[i]);
 		MPASS(powerof2(sctx->isc_nrxd_max[i]));
 		MPASS(sctx->isc_nrxd_default[i]);
 		MPASS(powerof2(sctx->isc_nrxd_default[i]));
 	}
 
 	/* TX descriptor counts, one triple per TX queue in a set. */
 	MPASS(sctx->isc_ntxqs >= 1 && sctx->isc_ntxqs <= 8);
 	for (i = 0; i < sctx->isc_ntxqs; i++) {
 		MPASS(sctx->isc_ntxd_min[i]);
 		MPASS(powerof2(sctx->isc_ntxd_min[i]));
 		MPASS(sctx->isc_ntxd_max[i]);
 		MPASS(powerof2(sctx->isc_ntxd_max[i]));
 		MPASS(sctx->isc_ntxd_default[i]);
 		MPASS(powerof2(sctx->isc_ntxd_default[i]));
 	}
 }
 
 /*
  * Assert that the driver filled in every TX/RX method pointer iflib relies
  * on in the softc context's isc_txrx table: three TX descriptor methods
  * followed by four RX descriptor methods.
  */
 static void
 _iflib_pre_assert(if_softc_ctx_t scctx)
 {
 
 	/* TX descriptor methods. */
 	MPASS(scctx->isc_txrx->ift_txd_encap);
 	MPASS(scctx->isc_txrx->ift_txd_flush);
 	MPASS(scctx->isc_txrx->ift_txd_credits_update);
 	/* RX descriptor methods. */
 	MPASS(scctx->isc_txrx->ift_rxd_available);
 	MPASS(scctx->isc_txrx->ift_rxd_pkt_get);
 	MPASS(scctx->isc_txrx->ift_rxd_refill);
 	MPASS(scctx->isc_txrx->ift_rxd_flush);
 }
 
 /*
  * First-stage registration of a context: initialise its locks, allocate the
  * ifnet, bind the driver's kobj class to the context, install the ifnet
  * callbacks and flags, hook the VLAN config/unconfig events and, unless the
  * driver manages media itself (IFLIB_DRIVER_MEDIA), initialise ifmedia.
  *
  * Returns 0 on success or ENOMEM when if_alloc() returns NULL.
  */
 static int
 iflib_register(if_ctx_t ctx)
 {
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	driver_t *driver = sctx->isc_driver;
 	device_t dev = ctx->ifc_dev;
 	if_t ifp;
 	u_char type;
 	int iflags;
 
 	/* Pseudo devices skip the shared-context sanity checks. */
 	if ((sctx->isc_flags & IFLIB_PSEUDO) == 0)
 		_iflib_assert(sctx);
 
 	CTX_LOCK_INIT(ctx);
 	STATE_LOCK_INIT(ctx, device_get_nameunit(ctx->ifc_dev));
 	/* Only a pseudo device without IFLIB_PSEUDO_ETHER is typed IFT_PPP. */
 	if (sctx->isc_flags & IFLIB_PSEUDO) {
 		if (sctx->isc_flags & IFLIB_PSEUDO_ETHER)
 			type = IFT_ETHER;
 		else
 			type = IFT_PPP;
 	} else
 		type = IFT_ETHER;
 	ifp = ctx->ifc_ifp = if_alloc(type);
 	if (ifp == NULL) {
 		/*
 		 * NOTE(review): the two locks initialised above are still
 		 * initialised when this returns -- confirm that the caller's
 		 * error path destroys them.
 		 */
 		device_printf(dev, "can not allocate ifnet structure\n");
 		return (ENOMEM);
 	}
 
 	/*
 	 * Initialize our context's device specific methods
 	 */
 	kobj_init((kobj_t) ctx, (kobj_class_t) driver);
 	kobj_class_compile((kobj_class_t) driver);
 
 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
 	if_setsoftc(ifp, ctx);
 	if_setdev(ifp, dev);
 	if_setinitfn(ifp, iflib_if_init);
 	if_setioctlfn(ifp, iflib_if_ioctl);
 	/* ALTQ kernels install the ALTQ-aware start/transmit routines. */
 #ifdef ALTQ
 	if_setstartfn(ifp, iflib_altq_if_start);
 	if_settransmitfn(ifp, iflib_altq_if_transmit);
 	if_setsendqready(ifp);
 #else
 	if_settransmitfn(ifp, iflib_if_transmit);
 #endif
 	if_setqflushfn(ifp, iflib_if_qflush);
-	iflags = IFF_MULTICAST | IFF_KNOWSEPOCH;
+	iflags = IFF_MULTICAST;
 
 	/* Non-Ethernet pseudo devices are point-to-point; the rest broadcast. */
 	if ((sctx->isc_flags & IFLIB_PSEUDO) &&
 		(sctx->isc_flags & IFLIB_PSEUDO_ETHER) == 0)
 		iflags |= IFF_POINTOPOINT;
 	else
 		iflags |= IFF_BROADCAST | IFF_SIMPLEX;
 	if_setflags(ifp, iflags);
 	/* Keep the event tags so the handlers can be deregistered later. */
 	ctx->ifc_vlan_attach_event =
 		EVENTHANDLER_REGISTER(vlan_config, iflib_vlan_register, ctx,
 							  EVENTHANDLER_PRI_FIRST);
 	ctx->ifc_vlan_detach_event =
 		EVENTHANDLER_REGISTER(vlan_unconfig, iflib_vlan_unregister, ctx,
 							  EVENTHANDLER_PRI_FIRST);
 
 	/* Use the context's embedded ifmedia unless the driver brings its own. */
 	if ((sctx->isc_flags & IFLIB_DRIVER_MEDIA) == 0) {
 		ctx->ifc_mediap = &ctx->ifc_media;
 		ifmedia_init(ctx->ifc_mediap, IFM_IMASK,
 		    iflib_media_change, iflib_media_status);
 	}
 	return (0);
 }
 
 /*
  * Deregister the vlan_config and vlan_unconfig event handlers recorded in
  * the context.  Each tag is reset to NULL afterwards, so calling this more
  * than once is harmless.
  */
 static void
 iflib_unregister_vlan_handlers(if_ctx_t ctx)
 {
 
 	if (ctx->ifc_vlan_attach_event != NULL) {
 		EVENTHANDLER_DEREGISTER(vlan_config,
 		    ctx->ifc_vlan_attach_event);
 		ctx->ifc_vlan_attach_event = NULL;
 	}
 
 	if (ctx->ifc_vlan_detach_event != NULL) {
 		EVENTHANDLER_DEREGISTER(vlan_unconfig,
 		    ctx->ifc_vlan_detach_event);
 		ctx->ifc_vlan_detach_event = NULL;
 	}
 }
 
 /*
  * Undo iflib_register(): drop all media entries, make sure the VLAN event
  * handlers are gone, release the kobj binding, free the ifnet and destroy
  * the state and context locks.
  */
 static void
 iflib_deregister(if_ctx_t ctx)
 {
 	if_t netif = ctx->ifc_ifp;
 
 	ifmedia_removeall(&ctx->ifc_media);
 
 	/* No-op when the handlers were already unregistered earlier. */
 	iflib_unregister_vlan_handlers(ctx);
 
 	kobj_delete((kobj_t) ctx, NULL);
 
 	if_free(netif);
 
 	STATE_LOCK_DESTROY(ctx);
 
 	/* ether_ifdetach calls if_qflush - lock must be destroyed afterwards */
 	CTX_LOCK_DESTROY(ctx);
 }
 
 /*
  * Allocate the per-context TX and RX queue-set arrays, the descriptor DMA
  * memory and software state for every queue in every set, and then hand the
  * descriptor ring addresses to the driver through IFDI_TX_QUEUES_ALLOC and
  * IFDI_RX_QUEUES_ALLOC.
  *
  * Returns 0 on success.  On failure returns ENOMEM or the error reported by
  * ifmp_ring_alloc() or the driver hooks; ctx->ifc_txqs and ctx->ifc_rxqs
  * are NULL on return from any failure after the first allocation.
  *
  * NOTE(review): every error label below shares one cleanup that frees only
  * the two top-level arrays.  The per-queue DMA info arrays, DMA memory,
  * free lists, mutexes and rings set up earlier in the loops are not
  * released in this function -- confirm whether anything else reclaims them.
  */
 static int
 iflib_queues_alloc(if_ctx_t ctx)
 {
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	device_t dev = ctx->ifc_dev;
 	int nrxqsets = scctx->isc_nrxqsets;
 	int ntxqsets = scctx->isc_ntxqsets;
 	iflib_txq_t txq;
 	iflib_rxq_t rxq;
 	iflib_fl_t fl = NULL;
 	/* txconf and rxconf are incremented in the loops but never read here. */
 	int i, j, cpu, err, txconf, rxconf;
 	iflib_dma_info_t ifdip;
 	uint32_t *rxqsizes = scctx->isc_rxqsizes;
 	uint32_t *txqsizes = scctx->isc_txqsizes;
 	uint8_t nrxqs = sctx->isc_nrxqs;
 	uint8_t ntxqs = sctx->isc_ntxqs;
 	/* A driver that leaves isc_nfl at zero gets one free list per RX qset. */
 	int nfree_lists = sctx->isc_nfl ? sctx->isc_nfl : 1;
 	/* With IFLIB_HAS_RXCQ the free lists start at queue index 1. */
 	int fl_offset = (sctx->isc_flags & IFLIB_HAS_RXCQ ? 1 : 0);
 	caddr_t *vaddrs;
 	uint64_t *paddrs;
 
 	KASSERT(ntxqs > 0, ("number of queues per qset must be at least 1"));
 	KASSERT(nrxqs > 0, ("number of queues per qset must be at least 1"));
 	KASSERT(nrxqs >= fl_offset + nfree_lists,
            ("there must be at least a rxq for each free list"));
 
 	/* Allocate the TX ring struct memory */
 	if (!(ctx->ifc_txqs =
 	    (iflib_txq_t) malloc(sizeof(struct iflib_txq) *
 	    ntxqsets, M_IFLIB, M_NOWAIT | M_ZERO))) {
 		device_printf(dev, "Unable to allocate TX ring memory\n");
 		err = ENOMEM;
 		goto fail;
 	}
 
 	/* Now allocate the RX */
 	if (!(ctx->ifc_rxqs =
 	    (iflib_rxq_t) malloc(sizeof(struct iflib_rxq) *
 	    nrxqsets, M_IFLIB, M_NOWAIT | M_ZERO))) {
 		device_printf(dev, "Unable to allocate RX ring memory\n");
 		err = ENOMEM;
 		goto rx_fail;
 	}
 
 	txq = ctx->ifc_txqs;
 	rxq = ctx->ifc_rxqs;
 
 	/*
 	 * XXX handle allocation failure
 	 */
 	/* Per TX queue set; cpu advances with each set and pins its callouts. */
 	for (txconf = i = 0, cpu = CPU_FIRST(); i < ntxqsets; i++, txconf++, txq++, cpu = CPU_NEXT(cpu)) {
 		/* Set up some basics */
 
 		if ((ifdip = malloc(sizeof(struct iflib_dma_info) * ntxqs,
 		    M_IFLIB, M_NOWAIT | M_ZERO)) == NULL) {
 			device_printf(dev,
 			    "Unable to allocate TX DMA info memory\n");
 			err = ENOMEM;
 			goto err_tx_desc;
 		}
 		txq->ift_ifdi = ifdip;
 		/* One zeroed DMA region per TX queue in the set. */
 		for (j = 0; j < ntxqs; j++, ifdip++) {
 			if (iflib_dma_alloc(ctx, txqsizes[j], ifdip, 0)) {
 				device_printf(dev,
 				    "Unable to allocate TX descriptors\n");
 				err = ENOMEM;
 				goto err_tx_desc;
 			}
 			txq->ift_txd_size[j] = scctx->isc_txd_size[j];
 			bzero((void *)ifdip->idi_vaddr, txqsizes[j]);
 		}
 		txq->ift_ctx = ctx;
 		txq->ift_id = i;
 		if (sctx->isc_flags & IFLIB_HAS_TXCQ) {
 			txq->ift_br_offset = 1;
 		} else {
 			txq->ift_br_offset = 0;
 		}
 
 		if (iflib_txsd_alloc(txq)) {
 			device_printf(dev, "Critical Failure setting up TX buffers\n");
 			err = ENOMEM;
 			goto err_tx_desc;
 		}
 
 		/* Initialize the TX lock */
 		snprintf(txq->ift_mtx_name, MTX_NAME_LEN, "%s:TX(%d):callout",
 		    device_get_nameunit(dev), txq->ift_id);
 		mtx_init(&txq->ift_mtx, txq->ift_mtx_name, NULL, MTX_DEF);
 		callout_init_mtx(&txq->ift_timer, &txq->ift_mtx, 0);
 		txq->ift_timer.c_cpu = cpu;
 #ifdef DEV_NETMAP
 		callout_init_mtx(&txq->ift_netmap_timer, &txq->ift_mtx, 0);
 		txq->ift_netmap_timer.c_cpu = cpu;
 #endif /* DEV_NETMAP */
 
 		/* Software ring of 2048 entries feeding this TX queue set. */
 		err = ifmp_ring_alloc(&txq->ift_br, 2048, txq, iflib_txq_drain,
 				      iflib_txq_can_drain, M_IFLIB, M_WAITOK);
 		if (err) {
 			/* XXX free any allocated rings */
 			device_printf(dev, "Unable to allocate buf_ring\n");
 			goto err_tx_desc;
 		}
 	}
 
 	/* Per RX queue set: DMA regions, free lists and receive buffers. */
 	for (rxconf = i = 0; i < nrxqsets; i++, rxconf++, rxq++) {
 		/* Set up some basics */
 		callout_init(&rxq->ifr_watchdog, 1);
 
 		if ((ifdip = malloc(sizeof(struct iflib_dma_info) * nrxqs,
 		   M_IFLIB, M_NOWAIT | M_ZERO)) == NULL) {
 			device_printf(dev,
 			    "Unable to allocate RX DMA info memory\n");
 			err = ENOMEM;
 			goto err_tx_desc;
 		}
 
 		rxq->ifr_ifdi = ifdip;
 		/* XXX this needs to be changed if #rx queues != #tx queues */
 		rxq->ifr_ntxqirq = 1;
 		rxq->ifr_txqid[0] = i;
 		for (j = 0; j < nrxqs; j++, ifdip++) {
 			if (iflib_dma_alloc(ctx, rxqsizes[j], ifdip, 0)) {
 				device_printf(dev,
 				    "Unable to allocate RX descriptors\n");
 				err = ENOMEM;
 				goto err_tx_desc;
 			}
 			bzero((void *)ifdip->idi_vaddr, rxqsizes[j]);
 		}
 		rxq->ifr_ctx = ctx;
 		rxq->ifr_id = i;
 		rxq->ifr_fl_offset = fl_offset;
 		rxq->ifr_nfl = nfree_lists;
 		if (!(fl =
 			  (iflib_fl_t) malloc(sizeof(struct iflib_fl) * nfree_lists, M_IFLIB, M_NOWAIT | M_ZERO))) {
 			device_printf(dev, "Unable to allocate free list memory\n");
 			err = ENOMEM;
 			goto err_tx_desc;
 		}
 		rxq->ifr_fl = fl;
 		/* Free list j uses the DMA region at index j + fl_offset. */
 		for (j = 0; j < nfree_lists; j++) {
 			fl[j].ifl_rxq = rxq;
 			fl[j].ifl_id = j;
 			fl[j].ifl_ifdi = &rxq->ifr_ifdi[j + rxq->ifr_fl_offset];
 			fl[j].ifl_rxd_size = scctx->isc_rxd_size[j];
 		}
 		/* Allocate receive buffers for the ring */
 		if (iflib_rxsd_alloc(rxq)) {
 			device_printf(dev,
 			    "Critical Failure setting up receive buffers\n");
 			err = ENOMEM;
 			goto err_rx_desc;
 		}
 
 		/* One bitmap of ifl_size bits per free list. */
 		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) 
 			fl->ifl_rx_bitmap = bit_alloc(fl->ifl_size, M_IFLIB,
 			    M_WAITOK);
 	}
 
 	/* TXQs */
 	/*
 	 * Flatten the descriptor ring addresses into qset-major arrays
 	 * (index i * ntxqs + j) for the driver.  The temporary arrays are
 	 * freed on both the success and the failure path.
 	 */
 	vaddrs = malloc(sizeof(caddr_t)*ntxqsets*ntxqs, M_IFLIB, M_WAITOK);
 	paddrs = malloc(sizeof(uint64_t)*ntxqsets*ntxqs, M_IFLIB, M_WAITOK);
 	for (i = 0; i < ntxqsets; i++) {
 		iflib_dma_info_t di = ctx->ifc_txqs[i].ift_ifdi;
 
 		for (j = 0; j < ntxqs; j++, di++) {
 			vaddrs[i*ntxqs + j] = di->idi_vaddr;
 			paddrs[i*ntxqs + j] = di->idi_paddr;
 		}
 	}
 	if ((err = IFDI_TX_QUEUES_ALLOC(ctx, vaddrs, paddrs, ntxqs, ntxqsets)) != 0) {
 		device_printf(ctx->ifc_dev,
 		    "Unable to allocate device TX queue\n");
 		iflib_tx_structures_free(ctx);
 		free(vaddrs, M_IFLIB);
 		free(paddrs, M_IFLIB);
 		goto err_rx_desc;
 	}
 	free(vaddrs, M_IFLIB);
 	free(paddrs, M_IFLIB);
 
 	/* RXQs */
 	/* Same flattening for the RX rings (index i * nrxqs + j). */
 	vaddrs = malloc(sizeof(caddr_t)*nrxqsets*nrxqs, M_IFLIB, M_WAITOK);
 	paddrs = malloc(sizeof(uint64_t)*nrxqsets*nrxqs, M_IFLIB, M_WAITOK);
 	for (i = 0; i < nrxqsets; i++) {
 		iflib_dma_info_t di = ctx->ifc_rxqs[i].ifr_ifdi;
 
 		for (j = 0; j < nrxqs; j++, di++) {
 			vaddrs[i*nrxqs + j] = di->idi_vaddr;
 			paddrs[i*nrxqs + j] = di->idi_paddr;
 		}
 	}
 	if ((err = IFDI_RX_QUEUES_ALLOC(ctx, vaddrs, paddrs, nrxqs, nrxqsets)) != 0) {
 		device_printf(ctx->ifc_dev,
 		    "Unable to allocate device RX queue\n");
 		/*
 		 * NOTE(review): the TX structures are freed here, but no RX
 		 * counterpart is called on this path -- confirm intent.
 		 */
 		iflib_tx_structures_free(ctx);
 		free(vaddrs, M_IFLIB);
 		free(paddrs, M_IFLIB);
 		goto err_rx_desc;
 	}
 	free(vaddrs, M_IFLIB);
 	free(paddrs, M_IFLIB);
 
 	return (0);
 
 /* XXX handle allocation failure changes */
 /* All three labels run the same cleanup of the two top-level arrays. */
 err_rx_desc:
 err_tx_desc:
 rx_fail:
 	if (ctx->ifc_rxqs != NULL)
 		free(ctx->ifc_rxqs, M_IFLIB);
 	ctx->ifc_rxqs = NULL;
 	if (ctx->ifc_txqs != NULL)
 		free(ctx->ifc_txqs, M_IFLIB);
 	ctx->ifc_txqs = NULL;
 fail:
 	return (err);
 }
 
 /*
  * Run iflib_txq_setup() on every TX queue set of the context.
  * Always returns 0.
  */
 static int
 iflib_tx_structures_setup(if_ctx_t ctx)
 {
 	iflib_txq_t txqs = ctx->ifc_txqs;
 	int qidx;
 
 	for (qidx = 0; qidx < NTXQSETS(ctx); qidx++)
 		iflib_txq_setup(&txqs[qidx]);
 
 	return (0);
 }
 
 /*
  * Free the DMA memory of every TX queue in every set, destroy each queue
  * set, then release the queue-set array and clear the context's pointer.
  */
 static void
 iflib_tx_structures_free(if_ctx_t ctx)
 {
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	iflib_txq_t txqs = ctx->ifc_txqs;
 	iflib_txq_t txq;
 	int qset, ring;
 
 	for (qset = 0; qset < NTXQSETS(ctx); qset++) {
 		txq = &txqs[qset];
 		for (ring = 0; ring < sctx->isc_ntxqs; ring++)
 			iflib_dma_free(&txq->ift_ifdi[ring]);
 		iflib_txq_destroy(txq);
 	}
 
 	free(ctx->ifc_txqs, M_IFLIB);
 	ctx->ifc_txqs = NULL;
 }
 
 /*********************************************************************
  *
  *  Initialize all receive rings.
  *
  **********************************************************************/
 /*
  * Prepare every RX queue set: with INET or INET6 compiled in, initialise
  * its LRO state first, then call the driver's IFDI_RXQ_SETUP hook.
  *
  * Returns 0 on success.  When LRO initialisation fails for queue set 'q',
  * the LRO state of sets 0..q-1 is freed and the error is returned.
  */
 static int
 iflib_rx_structures_setup(if_ctx_t ctx)
 {
 	iflib_rxq_t rxqs = ctx->ifc_rxqs;
 	iflib_rxq_t rxq;
 	int q;
 #if defined(INET6) || defined(INET)
 	int err, i;
 #endif
 
 	for (q = 0; q < ctx->ifc_softc_ctx.isc_nrxqsets; q++) {
 		rxq = &rxqs[q];
 #if defined(INET6) || defined(INET)
 		err = tcp_lro_init_args(&rxq->ifr_lc, ctx->ifc_ifp,
 		    TCP_LRO_ENTRIES, min(1024,
 		    ctx->ifc_softc_ctx.isc_nrxd[rxq->ifr_fl_offset]));
 		if (err != 0) {
 			device_printf(ctx->ifc_dev,
 			    "LRO Initialization failed!\n");
 			/*
 			 * Only the sets that completed are unwound; set 'q'
 			 * is expected to have cleaned up after itself.
 			 */
 			for (i = 0; i < q; i++)
 				tcp_lro_free(&rxqs[i].ifr_lc);
 			return (err);
 		}
 #endif
 		IFDI_RXQ_SETUP(ctx, rxq->ifr_id);
 	}
 
 	return (0);
 }
 
 /*********************************************************************
  *
  *  Free all receive rings.
  *
  **********************************************************************/
 /*
  * For every RX queue set, free the DMA memory of each of its queues, its
  * software descriptor state and (with INET or INET6) its LRO state; then
  * release the queue-set array and clear the context's pointer.
  */
 static void
 iflib_rx_structures_free(if_ctx_t ctx)
 {
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	iflib_rxq_t rxqs = ctx->ifc_rxqs;
 	iflib_rxq_t rxq;
 	int qset, ring;
 
 	for (qset = 0; qset < ctx->ifc_softc_ctx.isc_nrxqsets; qset++) {
 		rxq = &rxqs[qset];
 		for (ring = 0; ring < sctx->isc_nrxqs; ring++)
 			iflib_dma_free(&rxq->ifr_ifdi[ring]);
 		iflib_rx_sds_free(rxq);
 #if defined(INET6) || defined(INET)
 		tcp_lro_free(&rxq->ifr_lc);
 #endif
 	}
 
 	free(ctx->ifc_rxqs, M_IFLIB);
 	ctx->ifc_rxqs = NULL;
 }
 
 /*
  * Set up the TX structures and then the RX structures.  The first failure
  * is logged and returned; RX setup is not attempted when TX setup fails.
  * Freeing the queues after a failure is left to the caller.
  */
 static int
 iflib_qset_structures_setup(if_ctx_t ctx)
 {
 	int err;
 
 	err = iflib_tx_structures_setup(ctx);
 	if (err != 0) {
 		device_printf(ctx->ifc_dev, "iflib_tx_structures_setup failed: %d\n", err);
 		return (err);
 	}
 
 	err = iflib_rx_structures_setup(ctx);
 	if (err != 0) {
 		device_printf(ctx->ifc_dev, "iflib_rx_structures_setup failed: %d\n", err);
 	}
 
 	return (err);
 }
 
 /*
  * Public wrapper around _iflib_irq_alloc(); returns its result.
  *
  * Note that filter_arg is accepted but not forwarded: only 'arg' is passed
  * down to _iflib_irq_alloc().
  */
 int
 iflib_irq_alloc(if_ctx_t ctx, if_irq_t irq, int rid,
 		driver_filter_t filter, void *filter_arg, driver_intr_t handler, void *arg, const char *name)
 {
 	int err;
 
 	err = _iflib_irq_alloc(ctx, irq, rid, filter, handler, arg, name);
 
 	return (err);
 }
 
 /*
  * Shared helper: pick a CPU for queue 'qid' starting from the context's
  * sysctl core offset and attach 'gtask' to 'tqg' on that CPU.  'irq' may be
  * NULL, in which case no interrupt resource is passed along.
  *
  * Returns 0 on success or the error from taskqgroup_attach_cpu(), which is
  * also logged.
  */
 static inline int
 iflib_irq_set_affinity(if_ctx_t ctx, if_irq_t irq, iflib_intr_type_t type,
     int qid, struct grouptask *gtask, struct taskqgroup *tqg, void *uniq,
     const char *name)
 {
 	device_t dev = ctx->ifc_dev;
 	struct resource *res;
 	unsigned int base_cpuid, cpuid;
 	int err;
 
 	base_cpuid = ctx->ifc_sysctl_core_offset;
 	cpuid = get_cpuid_for_queue(ctx, base_cpuid, qid, type == IFLIB_INTR_TX);
 
 	if (irq != NULL)
 		res = irq->ii_res;
 	else
 		res = NULL;
 
 	err = taskqgroup_attach_cpu(tqg, gtask, uniq, cpuid, dev, res, name);
 	if (err != 0) {
 		device_printf(dev, "taskqgroup_attach_cpu failed %d\n", err);
 		return (err);
 	}
 #ifdef notyet
 	if (cpuid > ctx->ifc_cpuid_highest)
 		ctx->ifc_cpuid_highest = cpuid;
 #endif
 	return (0);
 }
 
 /*
  * Allocate an interrupt of the given type for queue 'qid' and wire it to
  * the matching group task.
  *
  * The type selects which filter-info record, group task, task group and
  * fast interrupt filter are used; the TX, RX and RXTX types also initialise
  * the group task.  The caller's filter and filter_arg are stored in the
  * filter-info record.  After the IRQ is allocated, ADMIN interrupts are
  * done; the other types are attached to their task group, with CPU affinity
  * unless rid is -1.
  *
  * Returns 0 on success, EINVAL for an unknown type, or the error from
  * _iflib_irq_alloc() / iflib_irq_set_affinity().
  */
 int
 iflib_irq_alloc_generic(if_ctx_t ctx, if_irq_t irq, int rid,
 			iflib_intr_type_t type, driver_filter_t *filter,
 			void *filter_arg, int qid, const char *name)
 {
 	device_t dev = ctx->ifc_dev;
 	struct grouptask *gtask;
 	struct taskqgroup *tqg;
 	iflib_filter_info_t info;
 	driver_filter_t *intr_fast;
 	iflib_txq_t txq;
 	iflib_rxq_t rxq;
 	void *q;
 	int tqrid = rid;
 	int err;
 
 	switch (type) {
 	/* XXX merge tx/rx for netmap? */
 	case IFLIB_INTR_TX:
 		txq = &ctx->ifc_txqs[qid];
 		q = txq;
 		info = &txq->ift_filter_info;
 		gtask = &txq->ift_task;
 		tqg = qgroup_if_io_tqg;
 		intr_fast = iflib_fast_intr;
 		GROUPTASK_INIT(gtask, 0, _task_fn_tx, q);
 		ctx->ifc_flags |= IFC_NETMAP_TX_IRQ;
 		break;
 	case IFLIB_INTR_RX:
 	case IFLIB_INTR_RXTX:
 		/* RX and RXTX differ only in the fast filter they install. */
 		rxq = &ctx->ifc_rxqs[qid];
 		q = rxq;
 		info = &rxq->ifr_filter_info;
 		gtask = &rxq->ifr_task;
 		tqg = qgroup_if_io_tqg;
 		if (type == IFLIB_INTR_RX)
 			intr_fast = iflib_fast_intr;
 		else
 			intr_fast = iflib_fast_intr_rxtx;
 		NET_GROUPTASK_INIT(gtask, 0, _task_fn_rx, q);
 		break;
 	case IFLIB_INTR_ADMIN:
 		/* The admin task is not initialised here. */
 		q = ctx;
 		tqrid = -1;
 		info = &ctx->ifc_filter_info;
 		gtask = &ctx->ifc_admin_task;
 		tqg = qgroup_if_config_tqg;
 		intr_fast = iflib_fast_intr_ctx;
 		break;
 	default:
 		device_printf(dev, "%s: unknown net intr type\n",
 		    __func__);
 		return (EINVAL);
 	}
 
 	info->ifi_filter = filter;
 	info->ifi_filter_arg = filter_arg;
 	info->ifi_task = gtask;
 	info->ifi_ctx = q;
 
 	err = _iflib_irq_alloc(ctx, irq, rid, intr_fast, NULL, info,  name);
 	if (err != 0) {
 		device_printf(dev, "_iflib_irq_alloc failed %d\n", err);
 		return (err);
 	}
 
 	if (type == IFLIB_INTR_ADMIN)
 		return (0);
 
 	/* A rid of -1 means: attach without picking a CPU. */
 	if (tqrid == -1) {
 		taskqgroup_attach(tqg, gtask, q, dev, irq->ii_res, name);
 		return (0);
 	}
 
 	return (iflib_irq_set_affinity(ctx, irq, type, qid, gtask, tqg, q,
 	    name));
 }
 
 /*
  * Initialise the group task for a TX queue, an RX queue or the context's
  * VFLR (IOV) task and attach it to its task group.  Attachment with CPU
  * affinity is tried first; if that fails the task is attached without a
  * CPU preference.  Panics on any other interrupt type.  The 'arg'
  * parameter is not used.
  */
 void
 iflib_softirq_alloc_generic(if_ctx_t ctx, if_irq_t irq, iflib_intr_type_t type, void *arg, int qid, const char *name)
 {
 	struct grouptask *gtask;
 	struct taskqgroup *tqg;
 	iflib_txq_t txq;
 	iflib_rxq_t rxq;
 	void *q;
 
 	switch (type) {
 	case IFLIB_INTR_TX:
 		txq = &ctx->ifc_txqs[qid];
 		q = txq;
 		gtask = &txq->ift_task;
 		tqg = qgroup_if_io_tqg;
 		GROUPTASK_INIT(gtask, 0, _task_fn_tx, q);
 		break;
 	case IFLIB_INTR_RX:
 		rxq = &ctx->ifc_rxqs[qid];
 		q = rxq;
 		gtask = &rxq->ifr_task;
 		tqg = qgroup_if_io_tqg;
 		NET_GROUPTASK_INIT(gtask, 0, _task_fn_rx, q);
 		break;
 	case IFLIB_INTR_IOV:
 		q = ctx;
 		gtask = &ctx->ifc_vflr_task;
 		tqg = qgroup_if_config_tqg;
 		GROUPTASK_INIT(gtask, 0, _task_fn_iov, q);
 		break;
 	default:
 		panic("unknown net intr type");
 	}
 
 	if (iflib_irq_set_affinity(ctx, irq, type, qid, gtask, tqg, q,
 	    name) != 0) {
 		/* Fall back to an attachment without CPU affinity. */
 		taskqgroup_attach(tqg, gtask, q, ctx->ifc_dev,
 		    irq ? irq->ii_res : NULL, name);
 	}
 }
 
 /*
  * Tear down the interrupt handler (when a tag is recorded) and release the
  * IRQ resource (when one is recorded).  The fields of 'irq' are left as
  * they were.
  */
 void
 iflib_irq_free(if_ctx_t ctx, if_irq_t irq)
 {
 	device_t dev = ctx->ifc_dev;
 
 	if (irq->ii_tag != NULL)
 		bus_teardown_intr(dev, irq->ii_res, irq->ii_tag);
 
 	if (irq->ii_res != NULL) {
 		bus_release_resource(dev, SYS_RES_IRQ,
 		    rman_get_rid(irq->ii_res), irq->ii_res);
 	}
 }
 
 /*
  * Set up a single legacy interrupt shared by RX queue 0 and TX queue 0.
  *
  * The context is flagged IFC_LEGACY, the caller's filter is recorded in RX
  * queue 0's filter info, one IRQ is allocated, and the RX and TX group
  * tasks are initialised and attached to the I/O task group on that IRQ.
  * With IFLIB_SINGLE_IRQ_RX_ONLY the context-level fast filter is used and
  * the filter info refers to the context instead of the RX queue.
  *
  * Returns 0 on success or the error from _iflib_irq_alloc().
  */
 static int
 iflib_legacy_setup(if_ctx_t ctx, driver_filter_t filter, void *filter_arg, int *rid, const char *name)
 {
 	iflib_txq_t txq0 = ctx->ifc_txqs;
 	iflib_rxq_t rxq0 = ctx->ifc_rxqs;
 	if_irq_t irq = &ctx->ifc_legacy_irq;
 	iflib_filter_info_t info = &rxq0[0].ifr_filter_info;
 	struct grouptask *rx_task = &rxq0[0].ifr_task;
 	struct taskqgroup *tqg = qgroup_if_io_tqg;
 	device_t dev = ctx->ifc_dev;
 	driver_filter_t *intr_fast;
 	struct resource *res;
 	void *q = &ctx->ifc_rxqs[0];
 	int tqrid = *rid;
 	int err;
 	bool rx_only;
 
 	rx_only = (ctx->ifc_sctx->isc_flags & IFLIB_SINGLE_IRQ_RX_ONLY) != 0;
 
 	ctx->ifc_flags |= IFC_LEGACY;
 	info->ifi_filter = filter;
 	info->ifi_filter_arg = filter_arg;
 	info->ifi_task = rx_task;
 	if (rx_only) {
 		info->ifi_ctx = ctx;
 		intr_fast = iflib_fast_intr_ctx;
 	} else {
 		info->ifi_ctx = q;
 		intr_fast = iflib_fast_intr_rxtx;
 	}
 
 	/* We allocate a single interrupt resource */
 	err = _iflib_irq_alloc(ctx, irq, tqrid, intr_fast, NULL, info, name);
 	if (err != 0)
 		return (err);
 
 	/* Both tasks hang off the one IRQ resource. */
 	NET_GROUPTASK_INIT(rx_task, 0, _task_fn_rx, q);
 	res = irq->ii_res;
 	taskqgroup_attach(tqg, rx_task, q, dev, res, name);
 
 	GROUPTASK_INIT(&txq0->ift_task, 0, _task_fn_tx, txq0);
 	taskqgroup_attach(qgroup_if_io_tqg, &txq0->ift_task, txq0, dev, res,
 	    "tx");
 
 	return (0);
 }
 
 /*
  * Create an LED device named after the device's name/unit string, with
  * iflib_led_func as its callback, and remember it in the context.
  */
 void
 iflib_led_create(if_ctx_t ctx)
 {
 	const char *led_name = device_get_nameunit(ctx->ifc_dev);
 
 	ctx->ifc_led_dev = led_create(iflib_led_func, ctx, led_name);
 }
 
 /* Enqueue the group task of TX queue set 'txqid'. */
 void
 iflib_tx_intr_deferred(if_ctx_t ctx, int txqid)
 {
 	struct grouptask *gtask = &ctx->ifc_txqs[txqid].ift_task;
 
 	GROUPTASK_ENQUEUE(gtask);
 }
 
 /* Enqueue the group task of RX queue set 'rxqid'. */
 void
 iflib_rx_intr_deferred(if_ctx_t ctx, int rxqid)
 {
 	struct grouptask *gtask = &ctx->ifc_rxqs[rxqid].ifr_task;
 
 	GROUPTASK_ENQUEUE(gtask);
 }
 
 /*
  * Enqueue the context's admin group task.  The task must already have a
  * task queue assigned; this is asserted first.
  */
 void
 iflib_admin_intr_deferred(if_ctx_t ctx)
 {
 	struct grouptask *gtask = &ctx->ifc_admin_task;
 
 	MPASS(ctx->ifc_admin_task.gt_taskqueue != NULL);
 	GROUPTASK_ENQUEUE(gtask);
 }
 
 /* Enqueue the context's VFLR group task. */
 void
 iflib_iov_intr_deferred(if_ctx_t ctx)
 {
 	struct grouptask *gtask = &ctx->ifc_vflr_task;
 
 	GROUPTASK_ENQUEUE(gtask);
 }
 
 /*
  * Attach 'gt' to the I/O task group on the given CPU, without a device or
  * interrupt resource.  The result of the attach call is not reported.
  */
 void
 iflib_io_tqg_attach(struct grouptask *gt, void *uniq, int cpu, const char *name)
 {
 
 	(void)taskqgroup_attach_cpu(qgroup_if_io_tqg, gt, uniq, cpu, NULL,
 	    NULL, name);
 }
 
 /*
  * Initialise 'gtask' to run fn(ctx) and attach it to the config task group,
  * without a device or interrupt resource.
  */
 void
 iflib_config_gtask_init(void *ctx, struct grouptask *gtask, gtask_fn_t *fn,
 	const char *name)
 {
 	struct taskqgroup *tqg = qgroup_if_config_tqg;
 
 	GROUPTASK_INIT(gtask, 0, fn, ctx);
 	/* The task pointer itself is passed as the 'uniq' argument. */
 	taskqgroup_attach(tqg, gtask, gtask, NULL, NULL, name);
 }
 
 /* Detach 'gtask' from the config task group. */
 void
 iflib_config_gtask_deinit(struct grouptask *gtask)
 {
 	struct taskqgroup *tqg = qgroup_if_config_tqg;
 
 	taskqgroup_detach(tqg, gtask);
 }
 
 /*
  * Record a link state change reported by the driver: update the ifnet baud
  * rate, set IFC_PREFETCH for links of 10 Gbps and above, mark every TX
  * queue idle on an up-to-down transition, store the new state in the
  * context and notify the network stack.
  */
 void
 iflib_link_state_change(if_ctx_t ctx, int link_state, uint64_t baudrate)
 {
 	if_t ifp = ctx->ifc_ifp;
 	iflib_txq_t txqs = ctx->ifc_txqs;
 	bool went_down;
 
 	if_setbaudrate(ifp, baudrate);
 	if (baudrate >= IF_Gbps(10)) {
 		STATE_LOCK(ctx);
 		ctx->ifc_flags |= IFC_PREFETCH;
 		STATE_UNLOCK(ctx);
 	}
 
 	/* If link down, disable watchdog */
 	went_down = ctx->ifc_link_state == LINK_STATE_UP &&
 	    link_state == LINK_STATE_DOWN;
 	if (went_down) {
 		for (int i = 0; i < ctx->ifc_softc_ctx.isc_ntxqsets; i++)
 			txqs[i].ift_qstatus = IFLIB_QUEUE_IDLE;
 	}
 
 	ctx->ifc_link_state = link_state;
 	if_link_state_change(ifp, link_state);
 }
 
 /*
  * Ask the driver how many TX descriptors have completed on 'txq' and fold
  * that count into the queue's counters.  ift_cidx_processed is reduced by
  * ift_size once it reaches or passes it.
  *
  * Returns the number of credits reported by the driver (0 when none).
  */
 static int
 iflib_tx_credits_update(if_ctx_t ctx, iflib_txq_t txq)
 {
 	iflib_dma_info_t ifdi = txq->ift_ifdi;
 	int credits;
 #ifdef INVARIANTS
 	int credits_pre = txq->ift_cidx_processed;
 #endif
 
 	bus_dmamap_sync(ifdi->idi_tag, ifdi->idi_map, BUS_DMASYNC_POSTREAD);
 
 	credits = ctx->isc_txd_credits_update(ctx->ifc_softc, txq->ift_id, true);
 	if (credits == 0)
 		return (0);
 
 	txq->ift_processed += credits;
 	txq->ift_cidx_processed += credits;
 
 	MPASS(credits_pre + credits == txq->ift_cidx_processed);
 	if (txq->ift_cidx_processed >= txq->ift_size) {
 		txq->ift_cidx_processed -= txq->ift_size;
 	}
 
 	return (credits);
 }
 
 /*
  * Sync the descriptor DMA map of every free list of 'rxq', then return the
  * driver's count of available RX descriptors starting at 'cidx', limited by
  * 'budget'.
  */
 static int
 iflib_rxd_avail(if_ctx_t ctx, iflib_rxq_t rxq, qidx_t cidx, qidx_t budget)
 {
 	iflib_fl_t fl;
 	u_int flid;
 
 	for (flid = 0; flid < rxq->ifr_nfl; flid++) {
 		fl = &rxq->ifr_fl[flid];
 		bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
 		    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
 	}
 
 	return (ctx->isc_rxd_available(ctx->ifc_softc, rxq->ifr_id, cidx,
 	    budget));
 }
 
 /*
  * Fill in 'info' with the context, offset and initial value, then add a
  * read/write integer sysctl named 'name' under the device's sysctl tree,
  * handled by iflib_sysctl_int_delay with 'info' as its argument.
  */
 void
 iflib_add_int_delay_sysctl(if_ctx_t ctx, const char *name,
 	const char *description, if_int_delay_info_t info,
 	int offset, int value)
 {
 	device_t dev = ctx->ifc_dev;
 
 	info->iidi_ctx = ctx;
 	info->iidi_offset = offset;
 	info->iidi_value = value;
 
 	SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev),
 	    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
 	    OID_AUTO, name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
 	    info, 0, iflib_sysctl_int_delay, "I", description);
 }
 
 /* Return a pointer to the sx lock embedded in the context. */
 struct sx *
 iflib_ctx_lock_get(if_ctx_t ctx)
 {
 	struct sx *ctx_lock = &ctx->ifc_ctx_sx;
 
 	return (ctx_lock);
 }
 
 /*
  * Choose the interrupt mode for the device and size the queue sets to match.
  *
  * MSI-X is attempted first unless isc_disable_msix is set or
  * pci_msix_count() reports zero messages.  On success isc_vectors,
  * isc_nrxqsets, isc_ntxqsets and isc_intr are updated in the softc context
  * and the number of allocated vectors is returned.
  *
  * Every MSI-X failure ends up at the "msi" label, which forces a single
  * RX/TX queue set and then uses one MSI message if exactly one is reported
  * and it can be allocated, or a legacy interrupt otherwise.  In that
  * fallback path the return value is the local `vectors`, seeded from
  * pci_msi_count(), even when the legacy interrupt is selected.
  */
 static int
 iflib_msix_init(if_ctx_t ctx)
 {
 	device_t dev = ctx->ifc_dev;
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	int admincnt, bar, err, iflib_num_rx_queues, iflib_num_tx_queues;
 	int msgs, queuemsgs, queues, rx_queues, tx_queues, vectors;
 
 	/* Queue-count overrides taken from the per-context sysctl fields. */
 	iflib_num_tx_queues = ctx->ifc_sysctl_ntxqs;
 	iflib_num_rx_queues = ctx->ifc_sysctl_nrxqs;
 
 	if (bootverbose)
 		device_printf(dev, "msix_init qsets capped at %d\n",
 		    imax(scctx->isc_ntxqsets, scctx->isc_nrxqsets));
 
 	/* Override by tunable */
 	if (scctx->isc_disable_msix)
 		goto msi;
 
 	/* First try MSI-X */
 	if ((msgs = pci_msix_count(dev)) == 0) {
 		if (bootverbose)
 			device_printf(dev, "MSI-X not supported or disabled\n");
 		goto msi;
 	}
 
 	bar = ctx->ifc_softc_ctx.isc_msix_bar;
 	/*
 	 * bar == -1 => "trust me I know what I'm doing"
 	 * Some drivers are for hardware that is so shoddily
 	 * documented that no one knows which bars are which
 	 * so the developer has to map all bars. This hack
 	 * allows shoddy garbage to use MSI-X in this framework.
 	 */
 	if (bar != -1) {
 		ctx->ifc_msix_mem = bus_alloc_resource_any(dev,
 	            SYS_RES_MEMORY, &bar, RF_ACTIVE);
 		if (ctx->ifc_msix_mem == NULL) {
 			device_printf(dev, "Unable to map MSI-X table\n");
 			goto msi;
 		}
 	}
 
 	/* Messages left for queues once the admin interrupts are set aside. */
 	admincnt = sctx->isc_admin_intrcnt;
 #if IFLIB_DEBUG
 	/* use only 1 qset in debug mode */
 	queuemsgs = min(msgs - admincnt, 1);
 #else
 	queuemsgs = msgs - admincnt;
 #endif
 #ifdef RSS
 	queues = imin(queuemsgs, rss_getnumbuckets());
 #else
 	queues = queuemsgs;
 #endif
 	/* Never use more queues than there are CPUs in the context's set. */
 	queues = imin(CPU_COUNT(&ctx->ifc_cpus), queues);
 	if (bootverbose)
 		device_printf(dev,
 		    "intr CPUs: %d queue msgs: %d admincnt: %d\n",
 		    CPU_COUNT(&ctx->ifc_cpus), queuemsgs, admincnt);
 #ifdef  RSS
 	/* If we're doing RSS, clamp at the number of RSS buckets */
 	if (queues > rss_getnumbuckets())
 		queues = rss_getnumbuckets();
 #endif
 	/*
 	 * Honour the RX override only when it is positive and below the bound.
 	 * NOTE(review): queuemsgs already has admincnt subtracted above, so
 	 * admincnt is subtracted a second time here -- confirm this is intended.
 	 */
 	if (iflib_num_rx_queues > 0 && iflib_num_rx_queues < queuemsgs - admincnt)
 		rx_queues = iflib_num_rx_queues;
 	else
 		rx_queues = queues;
 
 	/* The driver-declared maximum always wins. */
 	if (rx_queues > scctx->isc_nrxqsets)
 		rx_queues = scctx->isc_nrxqsets;
 
 	/*
 	 * We want this to be all logical CPUs by default
 	 */
 	if (iflib_num_tx_queues > 0 && iflib_num_tx_queues < queues)
 		tx_queues = iflib_num_tx_queues;
 	else
 		tx_queues = mp_ncpus;
 
 	if (tx_queues > scctx->isc_ntxqsets)
 		tx_queues = scctx->isc_ntxqsets;
 
 	/* Without the override, both counts are clamped to the smaller one. */
 	if (ctx->ifc_sysctl_qs_eq_override == 0) {
 #ifdef INVARIANTS
 		if (tx_queues != rx_queues)
 			device_printf(dev,
 			    "queue equality override not set, capping rx_queues at %d and tx_queues at %d\n",
 			    min(rx_queues, tx_queues), min(rx_queues, tx_queues));
 #endif
 		tx_queues = min(rx_queues, tx_queues);
 		rx_queues = min(rx_queues, tx_queues);
 	}
 
 	/* Only RX queues and admin interrupts count toward the vector need. */
 	vectors = rx_queues + admincnt;
 	if (msgs < vectors) {
 		device_printf(dev,
 		    "insufficient number of MSI-X vectors "
 		    "(supported %d, need %d)\n", msgs, vectors);
 		goto msi;
 	}
 
 	device_printf(dev, "Using %d RX queues %d TX queues\n", rx_queues,
 	    tx_queues);
 	/* From here on msgs holds the requested count for comparison below. */
 	msgs = vectors;
 	if ((err = pci_alloc_msix(dev, &vectors)) == 0) {
 		if (vectors != msgs) {
 			/* Partial grant: give everything back and fall back. */
 			device_printf(dev,
 			    "Unable to allocate sufficient MSI-X vectors "
 			    "(got %d, need %d)\n", vectors, msgs);
 			pci_release_msi(dev);
 			if (bar != -1) {
 				bus_release_resource(dev, SYS_RES_MEMORY, bar,
 				    ctx->ifc_msix_mem);
 				ctx->ifc_msix_mem = NULL;
 			}
 			goto msi;
 		}
 		device_printf(dev, "Using MSI-X interrupts with %d vectors\n",
 		    vectors);
 		scctx->isc_vectors = vectors;
 		scctx->isc_nrxqsets = rx_queues;
 		scctx->isc_ntxqsets = tx_queues;
 		scctx->isc_intr = IFLIB_INTR_MSIX;
 
 		return (vectors);
 	} else {
 		device_printf(dev,
 		    "failed to allocate %d MSI-X vectors, err: %d\n", vectors,
 		    err);
 		/* Release the table mapping before falling through to "msi". */
 		if (bar != -1) {
 			bus_release_resource(dev, SYS_RES_MEMORY, bar,
 			    ctx->ifc_msix_mem);
 			ctx->ifc_msix_mem = NULL;
 		}
 	}
 
 msi:
 	/* Fallback: one queue set, single MSI message or legacy interrupt. */
 	vectors = pci_msi_count(dev);
 	scctx->isc_nrxqsets = 1;
 	scctx->isc_ntxqsets = 1;
 	scctx->isc_vectors = vectors;
 	if (vectors == 1 && pci_alloc_msi(dev, &vectors) == 0) {
 		device_printf(dev,"Using an MSI interrupt\n");
 		scctx->isc_intr = IFLIB_INTR_MSI;
 	} else {
 		scctx->isc_vectors = 1;
 		device_printf(dev,"Using a Legacy interrupt\n");
 		scctx->isc_intr = IFLIB_INTR_LEGACY;
 	}
 
 	return (vectors);
 }
 
 /* Printable names for ring state values 0..3, indexed by the state value. */
 static const char *ring_states[] = { "IDLE", "BUSY", "STALLED", "ABDICATED" };
 
 /*
  * Sysctl handler that formats the ring state word pointed to by oid_arg1.
  * The word is read as four 16-bit fields: [0] pidx_head, [1] pidx_tail,
  * [2] cidx and [3] a state index that is mapped through ring_states
  * ("UNKNOWN" when it is above 3).
  *
  * Returns 0 on success, the error from sysctl_wire_old_buffer() or
  * sbuf_finish(), or ENOMEM if no sbuf could be created.
  */
 static int
 mp_ring_state_handler(SYSCTL_HANDLER_ARGS)
 {
 	uint16_t *fields = (uint16_t *)oidp->oid_arg1;
 	const char *state_name;
 	struct sbuf *sb;
 	int rc;
 
 	/* XXX needed ? */
 	rc = sysctl_wire_old_buffer(req, 0);
 	MPASS(rc == 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 80, req);
 	MPASS(sb != NULL);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	state_name = (fields[3] <= 3) ? ring_states[fields[3]] : "UNKNOWN";
 
 	sbuf_printf(sb, "pidx_head: %04hd pidx_tail: %04hd cidx: %04hd state: %s",
 	    fields[0], fields[1], fields[2], state_name);
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 	return (rc);
 }
 
 /*
  * Selector passed as arg2 to mp_ndesc_handler to pick which descriptor
  * count array of the context it operates on.
  */
 enum iflib_ndesc_handler {
 	IFLIB_NTXD_HANDLER,	/* operate on ifc_sysctl_ntxds */
 	IFLIB_NRXD_HANDLER,	/* operate on ifc_sysctl_nrxds */
 };
 
 /*
  * Sysctl handler for the comma-separated descriptor count lists.
  *
  * arg1 is the iflib context and arg2 an enum iflib_ndesc_handler selecting
  * the TX or RX array.  On read, up to 8 entries (fewer if the shared
  * context reports a smaller, non-zero queue count) are rendered as
  * "a,b,c".  On write, the new string is split on spaces and commas and up
  * to 8 values are stored back with strtoul().
  *
  * Returns EINVAL for an unknown selector, otherwise the result of
  * sysctl_handle_string().
  */
 static int
 mp_ndesc_handler(SYSCTL_HANDLER_ARGS)
 {
 	if_ctx_t ctx = (void *)arg1;
 	enum iflib_ndesc_handler type = arg2;
 	char buf[256] = {0};
 	qidx_t *ndesc;
 	char *tok, *rest;
 	int nqs, rc, i;
 
 	nqs = 8;
 	if (type == IFLIB_NTXD_HANDLER) {
 		ndesc = ctx->ifc_sysctl_ntxds;
 		if (ctx->ifc_sctx)
 			nqs = ctx->ifc_sctx->isc_ntxqs;
 	} else if (type == IFLIB_NRXD_HANDLER) {
 		ndesc = ctx->ifc_sysctl_nrxds;
 		if (ctx->ifc_sctx)
 			nqs = ctx->ifc_sctx->isc_nrxqs;
 	} else {
 		printf("%s: unhandled type\n", __func__);
 		return (EINVAL);
 	}
 	/* A reported count of zero falls back to the full 8 slots. */
 	if (nqs == 0)
 		nqs = 8;
 
 	/* Render the current values, comma separated, appending in place. */
 	for (i = 0; i < 8 && i < nqs; i++) {
 		if (i != 0)
 			strcat(buf, ",");
 		sprintf(strchr(buf, 0), "%d", ndesc[i]);
 	}
 
 	rc = sysctl_handle_string(oidp, buf, sizeof(buf), req);
 	if (rc != 0 || req->newptr == NULL)
 		return rc;
 
 	/* A new value was supplied: parse it back into the array. */
 	rest = buf;
 	i = 0;
 	while (i < 8 && (tok = strsep(&rest, " ,")) != NULL) {
 		ndesc[i] = strtoul(tok, NULL, 10);
 		i++;
 	}
 
 	return (rc);
 }
 
 /* Size of the scratch buffer used to build per-queue sysctl node names. */
 #define NAME_BUFLEN 32
 /*
  * Create the "iflib" sysctl node under the device's sysctl tree, remember
  * it in ctx->ifc_sysctl_node, and populate it with the driver version
  * string and the per-context tunables (queue count overrides, MSI-X
  * disable, RX budget, TX abdicate, core placement knobs and the
  * descriptor count lists served by mp_ndesc_handler).
  */
 static void
 iflib_add_device_sysctl_pre(if_ctx_t ctx)
 {
         device_t dev = iflib_get_dev(ctx);
 	struct sysctl_oid_list *child, *oid_list;
 	struct sysctl_ctx_list *ctx_list;
 	struct sysctl_oid *node;
 
 	ctx_list = device_get_sysctl_ctx(dev);
 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
 	/* The node is stashed in the context so later code can attach to it. */
 	ctx->ifc_sysctl_node = node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, "iflib",
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "IFLIB fields");
 	oid_list = SYSCTL_CHILDREN(node);
 
 	SYSCTL_ADD_CONST_STRING(ctx_list, oid_list, OID_AUTO, "driver_version",
 		       CTLFLAG_RD, ctx->ifc_sctx->isc_driver_version,
 		       "driver version");
 
 	/* Read/write tunables. */
 	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_ntxqs",
 		       CTLFLAG_RWTUN, &ctx->ifc_sysctl_ntxqs, 0,
 			"# of txqs to use, 0 => use default #");
 	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_nrxqs",
 		       CTLFLAG_RWTUN, &ctx->ifc_sysctl_nrxqs, 0,
 			"# of rxqs to use, 0 => use default #");
 	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_qs_enable",
 		       CTLFLAG_RWTUN, &ctx->ifc_sysctl_qs_eq_override, 0,
                        "permit #txq != #rxq");
 	SYSCTL_ADD_INT(ctx_list, oid_list, OID_AUTO, "disable_msix",
                       CTLFLAG_RWTUN, &ctx->ifc_softc_ctx.isc_disable_msix, 0,
                       "disable MSI-X (default 0)");
 	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "rx_budget",
 		       CTLFLAG_RWTUN, &ctx->ifc_sysctl_rx_budget, 0,
 		       "set the RX budget");
 	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "tx_abdicate",
 		       CTLFLAG_RWTUN, &ctx->ifc_sysctl_tx_abdicate, 0,
 		       "cause TX to abdicate instead of running to completion");
 	/* Read-only tunables; core_offset is seeded before being exposed. */
 	ctx->ifc_sysctl_core_offset = CORE_OFFSET_UNSPECIFIED;
 	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "core_offset",
 		       CTLFLAG_RDTUN, &ctx->ifc_sysctl_core_offset, 0,
 		       "offset to start using cores at");
 	SYSCTL_ADD_U8(ctx_list, oid_list, OID_AUTO, "separate_txrx",
 		       CTLFLAG_RDTUN, &ctx->ifc_sysctl_separate_txrx, 0,
 		       "use separate cores for TX and RX");
 	SYSCTL_ADD_U8(ctx_list, oid_list, OID_AUTO, "use_logical_cores",
 		      CTLFLAG_RDTUN, &ctx->ifc_sysctl_use_logical_cores, 0,
 		      "try to make use of logical cores for TX and RX");
 
 	/* XXX change for per-queue sizes */
 	SYSCTL_ADD_PROC(ctx_list, oid_list, OID_AUTO, "override_ntxds",
 	    CTLTYPE_STRING | CTLFLAG_RWTUN | CTLFLAG_NEEDGIANT, ctx,
 	    IFLIB_NTXD_HANDLER, mp_ndesc_handler, "A",
 	    "list of # of TX descriptors to use, 0 = use default #");
 	SYSCTL_ADD_PROC(ctx_list, oid_list, OID_AUTO, "override_nrxds",
 	    CTLTYPE_STRING | CTLFLAG_RWTUN | CTLFLAG_NEEDGIANT, ctx,
 	    IFLIB_NRXD_HANDLER, mp_ndesc_handler, "A",
 	    "list of # of RX descriptors to use, 0 = use default #");
 }
 
 /*
  * Attach per-queue statistics below the node stored in
  * ctx->ifc_sysctl_node.
  *
  * One child node is created per TX queue set ("txqN") and per RX queue set
  * ("rxqN"); the index is zero-padded to two or three digits when the queue
  * count exceeds 10 or 100.  Each RX queue node additionally gets one
  * "rxq_flN" child per free list.  All leaves are read-only.
  */
 static void
 iflib_add_device_sysctl_post(if_ctx_t ctx)
 {
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
         device_t dev = iflib_get_dev(ctx);
 	struct sysctl_oid_list *child;
 	struct sysctl_ctx_list *ctx_list;
 	iflib_fl_t fl;
 	iflib_txq_t txq;
 	iflib_rxq_t rxq;
 	int i, j;
 	char namebuf[NAME_BUFLEN];
 	char *qfmt;
 	struct sysctl_oid *queue_node, *fl_node, *node;
 	struct sysctl_oid_list *queue_list, *fl_list;
 	ctx_list = device_get_sysctl_ctx(dev);
 
 	node = ctx->ifc_sysctl_node;
 	child = SYSCTL_CHILDREN(node);
 
 	/* Pick a name format wide enough for the largest TX queue index. */
 	if (scctx->isc_ntxqsets > 100)
 		qfmt = "txq%03d";
 	else if (scctx->isc_ntxqsets > 10)
 		qfmt = "txq%02d";
 	else
 		qfmt = "txq%d";
 	for (i = 0, txq = ctx->ifc_txqs; i < scctx->isc_ntxqsets; i++, txq++) {
 		snprintf(namebuf, NAME_BUFLEN, qfmt, i);
 		queue_node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, namebuf,
 		    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Queue Name");
 		queue_list = SYSCTL_CHILDREN(queue_node);
 		SYSCTL_ADD_INT(ctx_list, queue_list, OID_AUTO, "cpu",
 			       CTLFLAG_RD,
 			       &txq->ift_task.gt_cpu, 0, "cpu this queue is bound to");
 #if MEMORY_LOGGING
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_dequeued",
 				CTLFLAG_RD,
 				&txq->ift_dequeued, "total mbufs freed");
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_enqueued",
 				CTLFLAG_RD,
 				&txq->ift_enqueued, "total mbufs enqueued");
 #endif
 		/* TX path event counters kept in the txq itself. */
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "mbuf_defrag",
 				   CTLFLAG_RD,
 				   &txq->ift_mbuf_defrag, "# of times m_defrag was called");
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "m_pullups",
 				   CTLFLAG_RD,
 				   &txq->ift_pullups, "# of times m_pullup was called");
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "mbuf_defrag_failed",
 				   CTLFLAG_RD,
 				   &txq->ift_mbuf_defrag_failed, "# of times m_defrag failed");
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "no_desc_avail",
 				   CTLFLAG_RD,
 				   &txq->ift_no_desc_avail, "# of times no descriptors were available");
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "tx_map_failed",
 				   CTLFLAG_RD,
 				   &txq->ift_map_failed, "# of times DMA map failed");
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txd_encap_efbig",
 				   CTLFLAG_RD,
 				   &txq->ift_txd_encap_efbig, "# of times txd_encap returned EFBIG");
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "no_tx_dma_setup",
 				   CTLFLAG_RD,
 				   &txq->ift_no_tx_dma_setup, "# of times map failed for other than EFBIG");
 		/* Ring indices and occupancy. */
 		SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_pidx",
 				   CTLFLAG_RD,
 				   &txq->ift_pidx, 1, "Producer Index");
 		SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_cidx",
 				   CTLFLAG_RD,
 				   &txq->ift_cidx, 1, "Consumer Index");
 		SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_cidx_processed",
 				   CTLFLAG_RD,
 				   &txq->ift_cidx_processed, 1, "Consumer Index seen by credit update");
 		SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_in_use",
 				   CTLFLAG_RD,
 				   &txq->ift_in_use, 1, "descriptors in use");
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_processed",
 				   CTLFLAG_RD,
 				   &txq->ift_processed, "descriptors procesed for clean");
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_cleaned",
 				   CTLFLAG_RD,
 				   &txq->ift_cleaned, "total cleaned");
 		/*
 		 * The mp_ring state word is handed to mp_ring_state_handler
 		 * with its volatile qualifier cast away.
 		 */
 		SYSCTL_ADD_PROC(ctx_list, queue_list, OID_AUTO, "ring_state",
 		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT,
 		    __DEVOLATILE(uint64_t *, &txq->ift_br->state), 0,
 		    mp_ring_state_handler, "A", "soft ring state");
 		/* Counters owned by the mp_ring attached to this queue. */
 		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_enqueues",
 				       CTLFLAG_RD, &txq->ift_br->enqueues,
 				       "# of enqueues to the mp_ring for this queue");
 		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_drops",
 				       CTLFLAG_RD, &txq->ift_br->drops,
 				       "# of drops in the mp_ring for this queue");
 		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_starts",
 				       CTLFLAG_RD, &txq->ift_br->starts,
 				       "# of normal consumer starts in the mp_ring for this queue");
 		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_stalls",
 				       CTLFLAG_RD, &txq->ift_br->stalls,
 					       "# of consumer stalls in the mp_ring for this queue");
 		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_restarts",
 			       CTLFLAG_RD, &txq->ift_br->restarts,
 				       "# of consumer restarts in the mp_ring for this queue");
 		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_abdications",
 				       CTLFLAG_RD, &txq->ift_br->abdications,
 				       "# of consumer abdications in the mp_ring for this queue");
 	}
 
 	/* Same naming scheme for the RX queue sets. */
 	if (scctx->isc_nrxqsets > 100)
 		qfmt = "rxq%03d";
 	else if (scctx->isc_nrxqsets > 10)
 		qfmt = "rxq%02d";
 	else
 		qfmt = "rxq%d";
 	for (i = 0, rxq = ctx->ifc_rxqs; i < scctx->isc_nrxqsets; i++, rxq++) {
 		snprintf(namebuf, NAME_BUFLEN, qfmt, i);
 		queue_node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, namebuf,
 		    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Queue Name");
 		queue_list = SYSCTL_CHILDREN(queue_node);
 		SYSCTL_ADD_INT(ctx_list, queue_list, OID_AUTO, "cpu",
 			       CTLFLAG_RD,
 			       &rxq->ifr_task.gt_cpu, 0, "cpu this queue is bound to");
 		/* The completion queue index is exported only with IFLIB_HAS_RXCQ. */
 		if (sctx->isc_flags & IFLIB_HAS_RXCQ) {
 			SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "rxq_cq_cidx",
 				       CTLFLAG_RD,
 				       &rxq->ifr_cq_cidx, 1, "Consumer Index");
 		}
 
 		/* One child node per free list of this RX queue. */
 		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) {
 			snprintf(namebuf, NAME_BUFLEN, "rxq_fl%d", j);
 			fl_node = SYSCTL_ADD_NODE(ctx_list, queue_list, OID_AUTO, namebuf,
 			    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "freelist Name");
 			fl_list = SYSCTL_CHILDREN(fl_node);
 			SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "pidx",
 				       CTLFLAG_RD,
 				       &fl->ifl_pidx, 1, "Producer Index");
 			SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "cidx",
 				       CTLFLAG_RD,
 				       &fl->ifl_cidx, 1, "Consumer Index");
 			SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "credits",
 				       CTLFLAG_RD,
 				       &fl->ifl_credits, 1, "credits available");
 			SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "buf_size",
 				       CTLFLAG_RD,
 				       &fl->ifl_buf_size, 1, "buffer size");
 #if MEMORY_LOGGING
 			SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_m_enqueued",
 					CTLFLAG_RD,
 					&fl->ifl_m_enqueued, "mbufs allocated");
 			SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_m_dequeued",
 					CTLFLAG_RD,
 					&fl->ifl_m_dequeued, "mbufs freed");
 			SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_cl_enqueued",
 					CTLFLAG_RD,
 					&fl->ifl_cl_enqueued, "clusters allocated");
 			SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_cl_dequeued",
 					CTLFLAG_RD,
 					&fl->ifl_cl_dequeued, "clusters freed");
 #endif
 		}
 	}
 
 }
 
 /*
  * Flag the context for a reset by setting IFC_DO_RESET in ifc_flags while
  * the state lock is held.
  */
 void
 iflib_request_reset(if_ctx_t ctx)
 {
 	STATE_LOCK(ctx);
 	ctx->ifc_flags = ctx->ifc_flags | IFC_DO_RESET;
 	STATE_UNLOCK(ctx);
 }
 
 #ifndef __NO_STRICT_ALIGNMENT
 /*
  * Shift the data of a received mbuf by ETHER_HDR_LEN bytes.
  *
  * When m_len is at most MCLBYTES - ETHER_HDR_LEN the data is moved forward
  * inside the same mbuf and `m` is returned.  Otherwise a new header mbuf is
  * allocated, the first ETHER_HDR_LEN bytes are copied into it, the packet
  * header is moved over and `m` is chained behind it.
  *
  * Returns the head of the resulting chain, or NULL (with `m` freed) if the
  * header mbuf cannot be allocated.
  */
 static struct mbuf *
 iflib_fixup_rx(struct mbuf *m)
 {
 	struct mbuf *hdr;
 
 	/* Short enough: slide the payload forward in place. */
 	if (m->m_len <= (MCLBYTES - ETHER_HDR_LEN)) {
 		bcopy(m->m_data, m->m_data + ETHER_HDR_LEN, m->m_len);
 		m->m_data += ETHER_HDR_LEN;
 		return (m);
 	}
 
 	/* Otherwise split the Ethernet header off into its own mbuf. */
 	MGETHDR(hdr, M_NOWAIT, MT_DATA);
 	if (hdr == NULL) {
 		m_freem(m);
 		return (NULL);
 	}
 	bcopy(m->m_data, hdr->m_data, ETHER_HDR_LEN);
 	m->m_data += ETHER_HDR_LEN;
 	m->m_len -= ETHER_HDR_LEN;
 	hdr->m_len = ETHER_HDR_LEN;
 	M_MOVE_PKTHDR(hdr, m);
 	hdr->m_next = m;
 	return (hdr);
 }
 #endif
 
 #ifdef DEBUGNET
 /*
  * debugnet init hook: report the number of RX queue sets and, taken from
  * the first free list of RX queue 0, the free list size and buffer size.
  * The values are read with the context lock held.
  */
 static void
 iflib_debugnet_init(if_t ifp, int *nrxr, int *ncl, int *clsize)
 {
 	if_ctx_t ctx = if_getsoftc(ifp);
 	iflib_fl_t first_fl;
 
 	CTX_LOCK(ctx);
 	first_fl = ctx->ifc_rxqs[0].ifr_fl;
 	*nrxr = NRXQSETS(ctx);
 	*ncl = first_fl->ifl_size;
 	*clsize = first_fl->ifl_buf_size;
 	CTX_UNLOCK(ctx);
 }
 
 /*
  * debugnet event hook.
  *
  * On DEBUGNET_START every free list of every RX queue set has its
  * ifl_zone re-resolved from its buffer size via m_getzone(), and
  * iflib_no_tx_batch is set to 1.  All other events are ignored.
  */
 static void
 iflib_debugnet_event(if_t ifp, enum debugnet_ev event)
 {
 	if_ctx_t ctx;
 	if_softc_ctx_t scctx;
 	iflib_fl_t fl;
 	iflib_rxq_t rxq;
 	int i, j;
 
 	ctx = if_getsoftc(ifp);
 	scctx = &ctx->ifc_softc_ctx;
 
 	switch (event) {
 	case DEBUGNET_START:
 		for (i = 0; i < scctx->isc_nrxqsets; i++) {
 			rxq = &ctx->ifc_rxqs[i];
 			for (j = 0; j < rxq->ifr_nfl; j++) {
 				/*
 				 * Index by j so that each free list is
 				 * updated, not just the first one.
 				 */
 				fl = &rxq->ifr_fl[j];
 				fl->ifl_zone = m_getzone(fl->ifl_buf_size);
 			}
 		}
 		iflib_no_tx_batch = 1;
 		break;
 	default:
 		break;
 	}
 }
 
 /*
  * debugnet transmit hook: encapsulate `m` on TX queue 0 and, if that
  * succeeds, run the doorbell check for the queue.
  *
  * Returns EBUSY unless the interface is running and not marked OACTIVE,
  * otherwise the result of iflib_encap().
  */
 static int
 iflib_debugnet_transmit(if_t ifp, struct mbuf *m)
 {
 	if_ctx_t ctx = if_getsoftc(ifp);
 	iflib_txq_t txq;
 	int drvflags, error;
 
 	drvflags = if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE);
 	if (drvflags != IFF_DRV_RUNNING)
 		return (EBUSY);
 
 	txq = &ctx->ifc_txqs[0];
 	error = iflib_encap(txq, &m);
 	if (error != 0)
 		return (error);
 
 	(void)iflib_txd_db_check(txq, true);
 	return (error);
 }
 
 /*
  * debugnet poll hook: reclaim completed transmits on TX queue 0, then run
  * iflib_rxeof() on every RX queue set inside a network epoch section.
  * The `count` argument is not consulted.
  *
  * Returns EBUSY unless the interface is running and not marked OACTIVE,
  * otherwise 0.
  */
 static int
 iflib_debugnet_poll(if_t ifp, int count)
 {
 	struct epoch_tracker et;
 	if_ctx_t ctx = if_getsoftc(ifp);
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	int drvflags, qid;
 
 	drvflags = if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE);
 	if (drvflags != IFF_DRV_RUNNING)
 		return (EBUSY);
 
 	(void)iflib_completed_tx_reclaim(&ctx->ifc_txqs[0],
 	    RECLAIM_THRESH(ctx));
 
 	NET_EPOCH_ENTER(et);
 	for (qid = 0; qid < scctx->isc_nrxqsets; qid++)
 		(void)iflib_rxeof(&ctx->ifc_rxqs[qid], 16 /* XXX */);
 	NET_EPOCH_EXIT(et);
 
 	return (0);
 }
 #endif /* DEBUGNET */
diff --git a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 8b2f4724d2fd..fe4581e456d3 100644
--- a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -1,1467 +1,1467 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0
  *
  * Copyright (c) 2004 Topspin Communications.  All rights reserved.
  * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
  * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
  * General Public License (GPL) Version 2, available from the file
  * COPYING in the main directory of this source tree, or the
  * OpenIB.org BSD license below:
  *
  *     Redistribution and use in source and binary forms, with or
  *     without modification, are permitted provided that the following
  *     conditions are met:
  *
  *      - Redistributions of source code must retain the above
  *        copyright notice, this list of conditions and the following
  *        disclaimer.
  *
  *      - Redistributions in binary form must reproduce the above
  *        copyright notice, this list of conditions and the following
  *        disclaimer in the documentation and/or other materials
  *        provided with the distribution.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "ipoib.h"
 #include <sys/eventhandler.h>
 
 #include <linux/module.h>
 
 #include <linux/slab.h>
 #include <linux/kernel.h>
 #include <linux/vmalloc.h>
 
 #include <linux/if_vlan.h>
 
 #include <net/infiniband.h>
 
 #include <rdma/ib_addr.h>
 #include <rdma/ib_cache.h>
 
 MODULE_AUTHOR("Roland Dreier");
 MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
 MODULE_LICENSE("Dual BSD/GPL");
 
 /*
  * Send and receive ring sizes, initialised from IPOIB_TX_RING_SIZE and
  * IPOIB_RX_RING_SIZE and exported as the send_queue_size and
  * recv_queue_size module parameters (mode 0444).
  */
 int ipoib_sendq_size = IPOIB_TX_RING_SIZE;
 int ipoib_recvq_size = IPOIB_RX_RING_SIZE;
 
 module_param_named(send_queue_size, ipoib_sendq_size, int, 0444);
 MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue");
 module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444);
 MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue");
 
 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
 /* Debug tracing level (default 1), exported as debug_level (mode 0644). */
 int ipoib_debug_level = 1;
 
 module_param_named(debug_level, ipoib_debug_level, int, 0644);
 MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
 #endif
 
 /*
  * Iterator state for walking a device's path tree: the device being walked
  * and a by-value copy of the path most recently found.
  */
 struct ipoib_path_iter {
 	struct ipoib_dev_priv *priv;	/* device whose path_tree is walked */
 	struct ipoib_path  path;	/* copy filled in by ipoib_path_iter_next() */
 };
 
 /*
  * 20-byte constant laid out as 4 leading bytes followed by 16 bytes.
  * NOTE(review): judging by the name this is presumably the IPoIB link-layer
  * address of the IPv4 broadcast group -- confirm against its users.
  */
 static const u8 ipv4_bcast_addr[] = {
 	0x00, 0xff, 0xff, 0xff,
 	0xff, 0x12, 0x40, 0x1b,	0x00, 0x00, 0x00, 0x00,
 	0x00, 0x00, 0x00, 0x00,	0xff, 0xff, 0xff, 0xff
 };
 
 /* Workqueue on which this file queues flush_light and restart_task work. */
 struct workqueue_struct *ipoib_workqueue;
 
 /* SA client handle passed to ib_sa_path_rec_get() when starting a lookup. */
 struct ib_sa_client ipoib_sa_client;
 
 /*
  * Forward declarations.  The first three are the ib_client callbacks
  * installed in ipoib_client.
  */
 static void ipoib_add_one(struct ib_device *device);
 static void ipoib_remove_one(struct ib_device *device, void *client_data);
 static if_t ipoib_get_net_dev_by_params(
 		struct ib_device *dev, u8 port, u16 pkey,
 		const union ib_gid *gid, const struct sockaddr *addr,
 		void *client_data);
 static void ipoib_start(if_t dev);
 static int ipoib_ioctl(if_t ifp, u_long command, caddr_t data);
 
 /* Unit number allocator; created by ipoib_unrhdr_init() for 0..65535. */
 static struct unrhdr *ipoib_unrhdr;
 
 /*
  * SYSINIT hook: create the unit number allocator covering 0..65535 and
  * publish it in ipoib_unrhdr.  `arg` is unused.
  */
 static void
 ipoib_unrhdr_init(void *arg)
 {
 	struct unrhdr *hdr;
 
 	hdr = new_unrhdr(0, 65535, NULL);
 	ipoib_unrhdr = hdr;
 }
 SYSINIT(ipoib_unrhdr_init, SI_SUB_KLD - 1, SI_ORDER_ANY, ipoib_unrhdr_init, NULL);
 
 static void
 ipoib_unrhdr_uninit(void *arg)
 {
 
 	if (ipoib_unrhdr != NULL) {
 		struct unrhdr *hdr;
 
 		hdr = ipoib_unrhdr;
 		ipoib_unrhdr = NULL;
 
 		delete_unrhdr(hdr);
 	}
 }
 SYSUNINIT(ipoib_unrhdr_uninit, SI_SUB_KLD - 1, SI_ORDER_ANY, ipoib_unrhdr_uninit, NULL);
 
 /*
  * IB client descriptor named "ipoib"; wires up the add, remove and
  * net-device lookup callbacks declared earlier in this file.
  */
 static struct ib_client ipoib_client = {
 	.name   = "ipoib",
 	.add    = ipoib_add_one,
 	.remove = ipoib_remove_one,
 	.get_net_dev_by_params = ipoib_get_net_dev_by_params,
 };
 
 int
 ipoib_open(struct ipoib_dev_priv *priv)
 {
 	if_t dev = priv->dev;
 
 	ipoib_dbg(priv, "bringing up interface\n");
 
 	set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
 
 	if (ipoib_pkey_dev_delay_open(priv))
 		return 0;
 
 	if (ipoib_ib_dev_open(priv))
 		goto err_disable;
 
 	if (ipoib_ib_dev_up(priv))
 		goto err_stop;
 
 	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
 		struct ipoib_dev_priv *cpriv;
 
 		/* Bring up any child interfaces too */
 		mutex_lock(&priv->vlan_mutex);
 		list_for_each_entry(cpriv, &priv->child_intfs, list)
 			if ((if_getdrvflags(cpriv->dev) & IFF_DRV_RUNNING) == 0)
 				ipoib_open(cpriv);
 		mutex_unlock(&priv->vlan_mutex);
 	}
 	if_setdrvflagbits(dev, IFF_DRV_RUNNING, IFF_DRV_OACTIVE);
 
 	return 0;
 
 err_stop:
 	ipoib_ib_dev_stop(priv, 1);
 
 err_disable:
 	clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
 
 	return -EINVAL;
 }
 
 /*
  * Interface init hook: `arg` is the device private data.  Opens the
  * interface if it is not already running, then queues the flush_light work
  * on ipoib_workqueue.
  */
 static void
 ipoib_init(void *arg)
 {
 	struct ipoib_dev_priv *priv = arg;
 
 	if (!(if_getdrvflags(priv->dev) & IFF_DRV_RUNNING))
 		ipoib_open(priv);
 
 	queue_work(ipoib_workqueue, &priv->flush_light);
 }
 
 
 static int
 ipoib_stop(struct ipoib_dev_priv *priv)
 {
 	if_t dev = priv->dev;
 
 	ipoib_dbg(priv, "stopping interface\n");
 
 	clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
 
 	if_setdrvflagbits(dev, 0, IFF_DRV_RUNNING | IFF_DRV_OACTIVE);
 
 	ipoib_ib_dev_down(priv, 0);
 	ipoib_ib_dev_stop(priv, 0);
 
 	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
 		struct ipoib_dev_priv *cpriv;
 
 		/* Bring down any child interfaces too */
 		mutex_lock(&priv->vlan_mutex);
 		list_for_each_entry(cpriv, &priv->child_intfs, list)
 			if ((if_getdrvflags(cpriv->dev) & IFF_DRV_RUNNING) != 0)
 				ipoib_stop(cpriv);
 		mutex_unlock(&priv->vlan_mutex);
 	}
 
 	return 0;
 }
 
 /*
  * Apply `new_mtu` to the ifnet behind `priv`.
  *
  * Nothing is done when the ifnet already has that MTU.  With `propagate`
  * false the MTU is stored directly with if_setmtu(); with `propagate` true
  * an SIOCSIFMTU request is issued through ifhwioctl() in the interface's
  * vnet.
  *
  * Returns 0, or the error from ifhwioctl().
  */
 static int
 ipoib_propagate_ifnet_mtu(struct ipoib_dev_priv *priv, int new_mtu,
     bool propagate)
 {
 	if_t ifp = priv->dev;
 	struct ifreq ifr;
 	int error;
 
 	if (if_getmtu(ifp) == new_mtu)
 		return (0);
 
 	if (!propagate) {
 		if_setmtu(ifp, new_mtu);
 		return (0);
 	}
 
 	strlcpy(ifr.ifr_name, if_name(ifp), IFNAMSIZ);
 	ifr.ifr_mtu = new_mtu;
 	CURVNET_SET(if_getvnet(ifp));
 	error = ifhwioctl(SIOCSIFMTU, ifp, (caddr_t)&ifr, curthread);
 	CURVNET_RESTORE();
 	return (error);
 }
 
 /*
  * Change the administrative MTU of an interface.
  *
  * With connected mode administratively enabled, the request is bounded by
  * IPOIB_CM_MTU(ipoib_cm_max_mtu()), a warning is logged when it exceeds
  * the multicast MTU, and the value is applied as is.  Otherwise the
  * request is bounded by IPOIB_UD_MTU(max_ib_mtu), recorded in admin_mtu,
  * and the smaller of the multicast and admin MTU is applied.  If the
  * admin MTU actually changed, flush_light work is queued; if applying
  * fails the previous admin MTU is restored.
  *
  * Returns -EINVAL when the request is out of range, otherwise the result
  * of ipoib_propagate_ifnet_mtu().
  */
 int
 ipoib_change_mtu(struct ipoib_dev_priv *priv, int new_mtu, bool propagate)
 {
 	int error, prev_admin_mtu;
 
 	/* dev->if_mtu > 2K ==> connected mode */
 	if (ipoib_cm_admin_enabled(priv)) {
 		if (new_mtu > IPOIB_CM_MTU(ipoib_cm_max_mtu(priv)))
 			return -EINVAL;
 
 		if (new_mtu > priv->mcast_mtu) {
 			ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n",
 				   priv->mcast_mtu);
 		}
 
 		return (ipoib_propagate_ifnet_mtu(priv, new_mtu, propagate));
 	}
 
 	if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu))
 		return -EINVAL;
 
 	prev_admin_mtu = priv->admin_mtu;
 	priv->admin_mtu = new_mtu;
 	error = ipoib_propagate_ifnet_mtu(priv, min(priv->mcast_mtu,
 	    priv->admin_mtu), propagate);
 	if (error != 0) {
 		/* Roll back so admin_mtu keeps reflecting the applied value. */
 		priv->admin_mtu = prev_admin_mtu;
 		return (error);
 	}
 
 	/* check for MTU change to avoid infinite loop */
 	if (prev_admin_mtu != new_mtu)
 		queue_work(ipoib_workqueue, &priv->flush_light);
 	return (error);
 }
 
 /*
  * Interface ioctl handler.
  *
  * `data` is interpreted as a struct ifaddr for SIOCSIFADDR and as a struct
  * ifreq for the other commands.  Returns ENXIO when the softc is NULL or
  * priv->gone is non-zero, EINVAL for commands not listed below, and
  * otherwise the per-command result.
  */
 static int
 ipoib_ioctl(if_t ifp, u_long command, caddr_t data)
 {
 	struct ipoib_dev_priv *priv = if_getsoftc(ifp);
 	struct ifaddr *ifa = (struct ifaddr *) data;
 	struct ifreq *ifr = (struct ifreq *) data;
 	int error = 0;
 
 	/* check if detaching */
 	if (priv == NULL)
 		return (ENXIO);
 	/* wait for device to become ready, if any */
 	while (priv->gone == 2)
 		pause("W", 1);
 	/* check for device gone */
 	if (priv->gone != 0)
 		return (ENXIO);
 
 	switch (command) {
 	case SIOCSIFFLAGS:
 		/*
 		 * Follow IFF_UP: open when going up and not yet running,
 		 * stop when going down and still running.  ipoib_open()
 		 * returns a negative errno, hence the negation.
 		 */
 		if (if_getflags(ifp) & IFF_UP) {
 			if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0)
 				error = -ipoib_open(priv);
 		} else
 			if (if_getdrvflags(ifp) & IFF_DRV_RUNNING)
 				ipoib_stop(priv);
 		break;
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		/* Multicast changes only matter while the interface runs. */
 		if (if_getdrvflags(ifp) & IFF_DRV_RUNNING)
 			queue_work(ipoib_workqueue, &priv->restart_task);
 		break;
 	case SIOCSIFADDR:
 		/* Setting an address implicitly marks the interface up. */
 		if_setflagbits(ifp, IFF_UP, 0);
 
 		switch (ifa->ifa_addr->sa_family) {
 #ifdef INET
 		case AF_INET:
 			if_init(ifp, if_getsoftc(ifp));	/* before arpwhohas */
 			arp_ifinit(ifp, ifa);
 			break;
 #endif
 		default:
 			if_init(ifp, if_getsoftc(ifp));
 			break;
 		}
 		break;
 
 	case SIOCGIFADDR:
 			/* Copy INFINIBAND_ALEN bytes of link-level address out. */
 			bcopy(if_getlladdr(ifp), &ifr->ifr_addr.sa_data[0],
                             INFINIBAND_ALEN);
 		break;
 
 	case SIOCSIFMTU:
 		/*
 		 * Set the interface MTU.
 		 *
 		 * Called with propagate == false; the result is negated
 		 * because ipoib_change_mtu() reports range errors as
 		 * -EINVAL.
 		 */
 		error = -ipoib_change_mtu(priv, ifr->ifr_mtu, false);
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 	return (error);
 }
 
 
 /*
  * Look up a path in the device's red-black tree.  The key is the
  * destination GID: sizeof(union ib_gid) bytes at `gid` are compared with
  * memcmp() against each node's pathrec.dgid.raw.
  *
  * Returns the matching path, or NULL when none exists.
  */
 static struct ipoib_path *
 __path_find(struct ipoib_dev_priv *priv, void *gid)
 {
 	struct rb_node *cur;
 	struct ipoib_path *cand;
 	int cmp;
 
 	for (cur = priv->path_tree.rb_node; cur != NULL; ) {
 		cand = rb_entry(cur, struct ipoib_path, rb_node);
 
 		cmp = memcmp(gid, cand->pathrec.dgid.raw,
 			     sizeof (union ib_gid));
 		if (cmp == 0)
 			return cand;
 
 		/* Smaller keys live to the left, larger ones to the right. */
 		if (cmp < 0)
 			cur = cur->rb_left;
 		else
 			cur = cur->rb_right;
 	}
 
 	return NULL;
 }
 
 /*
  * Insert `path` into the device's red-black tree, keyed by destination
  * GID, and append it to the device's path_list.
  *
  * Returns 0 on success, or -EEXIST (with nothing inserted) when a path
  * with the same destination GID is already in the tree.
  */
 static int
 __path_add(struct ipoib_dev_priv *priv, struct ipoib_path *path)
 {
 	struct rb_node **link = &priv->path_tree.rb_node;
 	struct rb_node *parent = NULL;
 	struct ipoib_path *other;
 	int cmp;
 
 	/* Walk down to the empty slot where the new node belongs. */
 	while (*link != NULL) {
 		parent = *link;
 		other = rb_entry(parent, struct ipoib_path, rb_node);
 
 		cmp = memcmp(path->pathrec.dgid.raw, other->pathrec.dgid.raw,
 			     sizeof (union ib_gid));
 		if (cmp == 0)
 			return -EEXIST;
 
 		if (cmp < 0)
 			link = &parent->rb_left;
 		else
 			link = &parent->rb_right;
 	}
 
 	rb_link_node(&path->rb_node, parent, link);
 	rb_insert_color(&path->rb_node, &priv->path_tree);
 
 	list_add_tail(&path->list, &priv->path_list);
 
 	return 0;
 }
 
 void
 ipoib_path_free(struct ipoib_dev_priv *priv, struct ipoib_path *path)
 {
 
 	_IF_DRAIN(&path->queue);
 
 	if (path->ah)
 		ipoib_put_ah(path->ah);
 	if (ipoib_cm_get(path))
 		ipoib_cm_destroy_tx(ipoib_cm_get(path));
 
 	kfree(path);
 }
 
 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
 
 /*
  * Allocate a path iterator for `priv` and position it on the first path.
  * The stored destination GID starts out as 16 zero bytes before the first
  * ipoib_path_iter_next() call.
  *
  * Returns the iterator, or NULL if allocation fails or no path is found.
  */
 struct ipoib_path_iter *
 ipoib_path_iter_init(struct ipoib_dev_priv *priv)
 {
 	struct ipoib_path_iter *iter = kmalloc(sizeof *iter, GFP_KERNEL);
 
 	if (iter == NULL)
 		return NULL;
 
 	iter->priv = priv;
 	memset(iter->path.pathrec.dgid.raw, 0, 16);
 
 	if (ipoib_path_iter_next(iter) == 0)
 		return iter;
 
 	/* Nothing to iterate over: do not hand out an empty iterator. */
 	kfree(iter);
 	return NULL;
 }
 
 int
 ipoib_path_iter_next(struct ipoib_path_iter *iter)
 {
 	struct ipoib_dev_priv *priv = iter->priv;
 	struct rb_node *n;
 	struct ipoib_path *path;
 	int ret = 1;
 
 	spin_lock_irq(&priv->lock);
 
 	n = rb_first(&priv->path_tree);
 
 	while (n) {
 		path = rb_entry(n, struct ipoib_path, rb_node);
 
 		if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw,
 			   sizeof (union ib_gid)) < 0) {
 			iter->path = *path;
 			ret = 0;
 			break;
 		}
 
 		n = rb_next(n);
 	}
 
 	spin_unlock_irq(&priv->lock);
 
 	return ret;
 }
 
 /* Copy the path currently held by the iterator into `*path` by value. */
 void
 ipoib_path_iter_read(struct ipoib_path_iter *iter, struct ipoib_path *path)
 {
 	*path = iter->path;
 }
 
 #endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */
 
 /*
  * Clear the `valid` flag on every path in the device's path_list, logging
  * each one, while holding priv->lock.
  */
 void
 ipoib_mark_paths_invalid(struct ipoib_dev_priv *priv)
 {
 	struct ipoib_path *cur;
 
 	spin_lock_irq(&priv->lock);
 
 	list_for_each_entry(cur, &priv->path_list, list) {
 		ipoib_dbg(priv, "mark path LID 0x%04x GID %16D invalid\n",
 			be16_to_cpu(cur->pathrec.dlid),
 			cur->pathrec.dgid.raw, ":");
 		cur->valid = 0;
 	}
 
 	spin_unlock_irq(&priv->lock);
 }
 
 /*
  * Remove and free every path of the device.
  *
  * Under priv->lock the whole path_list is spliced onto a local list and
  * each entry is erased from the red-black tree.  Each path is then
  * processed in turn: an outstanding SA query is cancelled, the lock is
  * dropped to wait for the path's completion and free it, and the lock is
  * taken again before moving on to the next entry.
  */
 void
 ipoib_flush_paths(struct ipoib_dev_priv *priv)
 {
 	struct ipoib_path *path, *tp;
 	LIST_HEAD(remove_list);
 	unsigned long flags;
 
 	spin_lock_irqsave(&priv->lock, flags);
 
 	/* Detach all paths from the device in one step. */
 	list_splice_init(&priv->path_list, &remove_list);
 
 	list_for_each_entry(path, &remove_list, list)
 		rb_erase(&path->rb_node, &priv->path_tree);
 
 	/* The _safe variant is needed because each path is freed in the loop. */
 	list_for_each_entry_safe(path, tp, &remove_list, list) {
 		if (path->query)
 			ib_sa_cancel_query(path->query_id, path->query);
 		/* The wait and the free both run with the lock released. */
 		spin_unlock_irqrestore(&priv->lock, flags);
 		wait_for_completion(&path->done);
 		ipoib_path_free(priv, path);
 		spin_lock_irqsave(&priv->lock, flags);
 	}
 
 	spin_unlock_irqrestore(&priv->lock, flags);
 }
 
 /*
  * Completion callback for a path record query.
  *
  * status:   0 when the query succeeded; anything else is only logged.
  * pathrec:  the returned path record (read only when status is 0).
  * path_ptr: the struct ipoib_path the query was started for.
  *
  * On success an address handle is created from the record.  Under
  * priv->lock the handle and record are installed in the path, mbufs parked
  * on the path's queue are moved to a local queue, and the path is marked
  * valid.  In every case path->query is cleared and path->done is
  * completed.  After the lock is dropped, the previous address handle (if
  * any) is released and the collected mbufs are handed to if_transmit()
  * inside a network epoch section.
  */
 static void
 path_rec_completion(int status, struct ib_sa_path_rec *pathrec, void *path_ptr)
 {
 	struct ipoib_path *path = path_ptr;
 	struct ipoib_dev_priv *priv = path->priv;
 	if_t dev = priv->dev;
 	struct ipoib_ah *ah = NULL;
 	struct ipoib_ah *old_ah = NULL;
 	struct epoch_tracker et;
 	struct ifqueue mbqueue;
 	struct mbuf *mb;
 	unsigned long flags;
 
 	if (!status)
 		ipoib_dbg(priv, "PathRec LID 0x%04x for GID %16D\n",
 			  be16_to_cpu(pathrec->dlid), pathrec->dgid.raw, ":");
 	else
 		ipoib_dbg(priv, "PathRec status %d for GID %16D\n",
 			  status, path->pathrec.dgid.raw, ":");
 
 	/* Local queue that collects the packets to retransmit. */
 	bzero(&mbqueue, sizeof(mbqueue));
 
 	if (!status) {
 		struct ib_ah_attr av;
 
 		/* ah stays NULL if either step fails. */
 		if (!ib_init_ah_from_path(priv->ca, priv->port, pathrec, &av))
 			ah = ipoib_create_ah(priv, priv->pd, &av);
 	}
 
 	spin_lock_irqsave(&priv->lock, flags);
 
 	if (ah) {
 		path->pathrec = *pathrec;
 
 		/* Remember the old handle; it is released after unlocking. */
 		old_ah   = path->ah;
 		path->ah = ah;
 
 		ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n",
 			  ah, be16_to_cpu(pathrec->dlid), pathrec->sl);
 
 		/* Move every parked mbuf from the path onto the local queue. */
 		for (;;) {
 			_IF_DEQUEUE(&path->queue, mb);
 			if (mb == NULL)
 				break;
 			_IF_ENQUEUE(&mbqueue, mb);
 		}
 
 #ifdef CONFIG_INFINIBAND_IPOIB_CM
 		/* Create a connected-mode TX context if none is attached yet. */
 		if (ipoib_cm_enabled(priv, path->hwaddr) && !ipoib_cm_get(path))
 			ipoib_cm_set(path, ipoib_cm_create_tx(priv, path));
 #endif
 
 		path->valid = 1;
 	}
 
 	/* Done on success and failure alike, still under the lock. */
 	path->query = NULL;
 	complete(&path->done);
 
 	spin_unlock_irqrestore(&priv->lock, flags);
 
 	if (old_ah)
 		ipoib_put_ah(old_ah);
 
 	/* Resubmit the collected packets; the queue is empty on failure. */
 	NET_EPOCH_ENTER(et);
 	for (;;) {
 		_IF_DEQUEUE(&mbqueue, mb);
 		if (mb == NULL)
 			break;
 		mb->m_pkthdr.rcvif = dev;
 		if (if_transmit(dev, mb))
 			ipoib_warn(priv, "dev_queue_xmit failed "
 				   "to requeue packet\n");
 	}
 	NET_EPOCH_EXIT(et);
 }
 
 /*
  * Allocate and pre-fill a path entry for the destination in hwaddr.
  *
  * Returns NULL when the device has no broadcast group yet or when the
  * allocation fails.  The GID part of the path record is taken from
  * hwaddr starting at byte 4.
  */
 static struct ipoib_path *
 path_rec_create(struct ipoib_dev_priv *priv, uint8_t *hwaddr)
 {
 	struct ipoib_path *np;
 
 	/* The traffic class below is copied from the broadcast group. */
 	if (priv->broadcast == NULL)
 		return NULL;
 
 	np = kzalloc(sizeof(*np), GFP_ATOMIC);
 	if (np == NULL)
 		return NULL;
 
 	bzero(&np->queue, sizeof(np->queue));
 	np->priv = priv;
 
 #ifdef CONFIG_INFINIBAND_IPOIB_CM
 	memcpy(&np->hwaddr, hwaddr, INFINIBAND_ALEN);
 #endif
 
 	/* Seed the path record used as the query template. */
 	memcpy(np->pathrec.dgid.raw, hwaddr + 4, sizeof(union ib_gid));
 	np->pathrec.sgid = priv->local_gid;
 	np->pathrec.pkey = cpu_to_be16(priv->pkey);
 	np->pathrec.numb_path = 1;
 	np->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class;
 
 	return np;
 }
 
 static int
 path_rec_start(struct ipoib_dev_priv *priv, struct ipoib_path *path)
 {
 	if_t dev = priv->dev;
 
 	ib_sa_comp_mask comp_mask = IB_SA_PATH_REC_MTU_SELECTOR | IB_SA_PATH_REC_MTU;
 	struct ib_sa_path_rec p_rec;
 
 	p_rec = path->pathrec;
 	p_rec.mtu_selector = IB_SA_GT;
 
 	switch (roundup_pow_of_two(if_getmtu(dev) + IPOIB_ENCAP_LEN)) {
 	case 512:
 		p_rec.mtu = IB_MTU_256;
 		break;
 	case 1024:
 		p_rec.mtu = IB_MTU_512;
 		break;
 	case 2048:
 		p_rec.mtu = IB_MTU_1024;
 		break;
 	case 4096:
 		p_rec.mtu = IB_MTU_2048;
 		break;
 	default:
 		/* Wildcard everything */
 		comp_mask = 0;
 		p_rec.mtu = 0;
 		p_rec.mtu_selector = 0;
 	}
 
 	ipoib_dbg(priv, "Start path record lookup for %16D MTU > %d\n",
 		  p_rec.dgid.raw, ":",
 		  comp_mask ? ib_mtu_enum_to_int(p_rec.mtu) : 0);
 
 	init_completion(&path->done);
 
 	path->query_id =
 		ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port,
 				   &p_rec, comp_mask		|
 				   IB_SA_PATH_REC_DGID		|
 				   IB_SA_PATH_REC_SGID		|
 				   IB_SA_PATH_REC_NUMB_PATH	|
 				   IB_SA_PATH_REC_TRAFFIC_CLASS |
 				   IB_SA_PATH_REC_PKEY,
 				   1000, GFP_ATOMIC,
 				   path_rec_completion,
 				   path, &path->query);
 	if (path->query_id < 0) {
 		ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id);
 		path->query = NULL;
 		complete(&path->done);
 		return path->query_id;
 	}
 
 	return 0;
 }
 
 static void
 ipoib_unicast_send(struct mbuf *mb, struct ipoib_dev_priv *priv, struct ipoib_header *eh)
 {
 	struct ipoib_path *path;
 
 	path = __path_find(priv, eh->hwaddr + 4);
 	if (!path || !path->valid) {
 		int new_path = 0;
 
 		if (!path) {
 			path = path_rec_create(priv, eh->hwaddr);
 			new_path = 1;
 		}
 		if (path) {
 			if (_IF_QLEN(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE)
 				_IF_ENQUEUE(&path->queue, mb);
 			else {
 				if_inc_counter(priv->dev, IFCOUNTER_OERRORS, 1);
 				m_freem(mb);
 			}
 
 			if (!path->query && path_rec_start(priv, path)) {
 				if (new_path)
 					ipoib_path_free(priv, path);
 				return;
 			} else
 				__path_add(priv, path);
 		} else {
 			if_inc_counter(priv->dev, IFCOUNTER_OERRORS, 1);
 			m_freem(mb);
 		}
 
 		return;
 	}
 
 	if (ipoib_cm_get(path) && ipoib_cm_up(path)) {
 		ipoib_cm_send(priv, mb, ipoib_cm_get(path));
 	} else if (path->ah) {
 		ipoib_send(priv, mb, path->ah, IPOIB_QPN(eh->hwaddr));
 	} else if ((path->query || !path_rec_start(priv, path)) &&
 		    path->queue.ifq_len < IPOIB_MAX_PATH_REC_QUEUE) {
 		_IF_ENQUEUE(&path->queue, mb);
 	} else {
 		if_inc_counter(priv->dev, IFCOUNTER_OERRORS, 1);
 		m_freem(mb);
 	}
 }
 
 /*
  * Dispatch one mbuf to the multicast or unicast transmit path based on
  * the hardware address in its IPoIB header.  Always returns 0.
  */
 static int
 ipoib_send_one(struct ipoib_dev_priv *priv, struct mbuf *mb)
 {
 	struct ipoib_header *hdr = mtod(mb, struct ipoib_header *);
 
 	if (!IPOIB_IS_MULTICAST(hdr->hwaddr)) {
 		ipoib_unicast_send(mb, priv, hdr);
 		return 0;
 	}
 
 	/* Add in the P_Key for multicast */
 	hdr->hwaddr[8] = (priv->pkey >> 8) & 0xff;
 	hdr->hwaddr[9] = priv->pkey & 0xff;
 
 	ipoib_mcast_send(priv, hdr->hwaddr + 4, mb);
 
 	return 0;
 }
 
 /*
  * Drain the interface send queue while the driver is not marked
  * IFF_DRV_OACTIVE.  The caller must hold priv->lock.
  */
 void
 ipoib_start_locked(if_t dev, struct ipoib_dev_priv *priv)
 {
 	struct mbuf *m;
 
 	assert_spin_locked(&priv->lock);
 
 	for (;;) {
 		if (if_sendq_empty(dev))
 			break;
 		if ((if_getdrvflags(dev) & IFF_DRV_OACTIVE) != 0)
 			break;
 
 		m = if_dequeue(dev);
 		if (m == NULL)
 			break;
 
 		/* Tap the packet before handing it to the transmit path. */
 		infiniband_bpf_mtap(dev, m);
 		ipoib_send_one(priv, m);
 	}
 }
 
 /*
  * Run the transmit loop under priv->lock, but only when the interface
  * is running and not already marked output-active.
  */
 static void
 _ipoib_start(if_t dev, struct ipoib_dev_priv *priv)
 {
 	const int mask = IFF_DRV_RUNNING | IFF_DRV_OACTIVE;
 
 	if ((if_getdrvflags(dev) & mask) == IFF_DRV_RUNNING) {
 		spin_lock(&priv->lock);
 		ipoib_start_locked(dev, priv);
 		spin_unlock(&priv->lock);
 	}
 }
 
 /* Start routine for the main interface: the softc is the driver state. */
 static void
 ipoib_start(if_t dev)
 {
 	struct ipoib_dev_priv *priv = if_getsoftc(dev);
 
 	_ipoib_start(dev, priv);
 }
 
 /*
  * Start routine installed on vlan sub-interfaces by ipoib_config_vlan().
  *
  * The driver state is kept in the vlan cookie.  When a cookie is present
  * the normal transmit path is used; otherwise every queued mbuf is
  * dropped and counted as an output error.
  */
 static void
 ipoib_vlan_start(if_t dev)
 {
 	struct ipoib_dev_priv *priv;
 	struct mbuf *mb;
 
 	priv = VLAN_COOKIE(dev);
 	if (priv != NULL) {
 		/*
 		 * Call, then return: ISO C does not allow a return statement
 		 * with an expression in a function returning void.
 		 */
 		_ipoib_start(dev, priv);
 		return;
 	}
 
 	/* No driver state attached: discard everything that is queued. */
 	while (!if_sendq_empty(dev)) {
 		mb = if_dequeue(dev);
 		if (mb == NULL)
 			break;
 		m_freem(mb);
 		if_inc_counter(dev, IFCOUNTER_OERRORS, 1);
 	}
 }
 
 int
 ipoib_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca, int port)
 {
 
 	/* Allocate RX/TX "rings" to hold queued mbs */
 	priv->rx_ring =	kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring,
 				GFP_KERNEL);
 	if (!priv->rx_ring) {
 		printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n",
 		       ca->name, ipoib_recvq_size);
 		goto out;
 	}
 
 	priv->tx_ring = kzalloc(ipoib_sendq_size * sizeof *priv->tx_ring, GFP_KERNEL);
 	if (!priv->tx_ring) {
 		printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n",
 		       ca->name, ipoib_sendq_size);
 		goto out_rx_ring_cleanup;
 	}
 	memset(priv->tx_ring, 0, ipoib_sendq_size * sizeof *priv->tx_ring);
 
 	/* priv->tx_head, tx_tail & tx_outstanding are already 0 */
 
 	if (ipoib_ib_dev_init(priv, ca, port))
 		goto out_tx_ring_cleanup;
 
 	return 0;
 
 out_tx_ring_cleanup:
 	kfree(priv->tx_ring);
 
 out_rx_ring_cleanup:
 	kfree(priv->rx_ring);
 
 out:
 	return -ENOMEM;
 }
 
 /*
  * Mark a main interface as gone and detach it from the network stack.
  * Sub-interfaces are left untouched.
  */
 static void
 ipoib_ifdetach(struct ipoib_dev_priv *priv)
 {
 
 	if (test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags))
 		return;
 
 	priv->gone = 1;
 	infiniband_ifdetach(priv->dev);
 }
 
 /*
  * Release the driver state.  A main interface gives back its ifnet and
  * unit number; a sub-interface only clears its vlan cookie.  priv itself
  * is freed in both cases.
  */
 static void
 ipoib_detach(struct ipoib_dev_priv *priv)
 {
 
 	if (test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
 		VLAN_SETCOOKIE(priv->dev, NULL);
 	} else {
 		if_free(priv->dev);
 		free_unr(ipoib_unrhdr, priv->unit);
 	}
 
 	free(priv, M_TEMP);
 }
 
 /*
  * Tear down the IB resources and rings of priv.  Child interfaces are
  * detached, cleaned up recursively and released before the parent.
  */
 void
 ipoib_dev_cleanup(struct ipoib_dev_priv *priv)
 {
 	struct ipoib_dev_priv *child, *next;
 
 	/* Delete any child interfaces first */
 	list_for_each_entry_safe(child, next, &priv->child_intfs, list) {
 		ipoib_ifdetach(child);
 		ipoib_dev_cleanup(child);
 		ipoib_detach(child);
 	}
 
 	ipoib_ib_dev_cleanup(priv);
 
 	/* Release each ring and clear its pointer. */
 	kfree(priv->rx_ring);
 	priv->rx_ring = NULL;
 
 	kfree(priv->tx_ring);
 	priv->tx_ring = NULL;
 }
 
 /*
  * Allocate a zeroed driver state structure and initialize its locks,
  * lists and work items.  The broadcast address starts out as a copy of
  * ipv4_bcast_addr.
  */
 static struct ipoib_dev_priv *
 ipoib_priv_alloc(void)
 {
 	struct ipoib_dev_priv *np;
 
 	np = malloc(sizeof(*np), M_TEMP, M_ZERO|M_WAITOK);
 
 	/* Locks. */
 	spin_lock_init(&np->lock);
 	spin_lock_init(&np->drain_lock);
 	mutex_init(&np->vlan_mutex);
 
 	/* List heads. */
 	INIT_LIST_HEAD(&np->path_list);
 	INIT_LIST_HEAD(&np->child_intfs);
 	INIT_LIST_HEAD(&np->dead_ahs);
 	INIT_LIST_HEAD(&np->multicast_list);
 
 	/* Deferred work. */
 	INIT_DELAYED_WORK(&np->pkey_poll_task, ipoib_pkey_poll);
 	INIT_DELAYED_WORK(&np->mcast_task, ipoib_mcast_join_task);
 	INIT_WORK(&np->carrier_on_task, ipoib_mcast_carrier_on_task);
 	INIT_WORK(&np->flush_light, ipoib_ib_dev_flush_light);
 	INIT_WORK(&np->flush_normal, ipoib_ib_dev_flush_normal);
 	INIT_WORK(&np->flush_heavy, ipoib_ib_dev_flush_heavy);
 	INIT_WORK(&np->restart_task, ipoib_mcast_restart_task);
 	INIT_DELAYED_WORK(&np->ah_reap_task, ipoib_reap_ah);
 
 	memcpy(np->broadcastaddr, ipv4_bcast_addr, INFINIBAND_ALEN);
 
 	return (np);
 }
 
 /*
  * Allocate the driver state and the ifnet for a main IPoIB interface.
  *
  * name - interface name prefix handed to if_initname().
  * hca  - IB device whose capability flags are consulted.
  *
  * Returns the new driver state, or NULL when the ifnet or the unit
  * number cannot be allocated.  The interface is attached with
  * infiniband_ifattach() and reported with link state down; priv->gone
  * is left at 2 ("initializing").
  */
 struct ipoib_dev_priv *
 ipoib_intf_alloc(const char *name, struct ib_device *hca)
 {
 	struct ipoib_dev_priv *priv;
 	if_t dev;
 
 	/* ipoib_priv_alloc() uses M_WAITOK; its result is not checked here. */
 	priv = ipoib_priv_alloc();
 	dev = priv->dev = if_alloc(IFT_INFINIBAND);
 	if (!dev) {
 		free(priv, M_TEMP);
 		return NULL;
 	}
 	if_setsoftc(dev, priv);
 	priv->gone = 2; /* initializing */
 	priv->unit = alloc_unr(ipoib_unrhdr);
 	if (priv->unit == -1) {
 		if_free(dev);
 		free(priv, M_TEMP);
 		return NULL;
 	}
 	if_initname(dev, name, priv->unit);
 	if_setflags(dev, IFF_BROADCAST | IFF_MULTICAST);
-	if (hca->attrs.device_cap_flags & IB_DEVICE_KNOWSEPOCH)
-		if_setflagbits(dev, IFF_KNOWSEPOCH, 0);
+	if ((hca->attrs.device_cap_flags & IB_DEVICE_KNOWSEPOCH) == 0)
+		if_setflagbits(dev, IFF_NEEDSEPOCH, 0);
 
 	infiniband_ifattach(priv->dev, NULL, priv->broadcastaddr);
 
 	/* Install the driver entry points. */
 	if_setinitfn(dev, ipoib_init);
 	if_setioctlfn(dev, ipoib_ioctl);
 	if_setstartfn(dev, ipoib_start);
 
 	if_setsendqlen(dev, ipoib_sendq_size * 2);
 
 	priv->dev = dev;
 	if_link_state_change(priv->dev, LINK_STATE_DOWN);
 
 	/* The softc was set to priv above. */
 	return if_getsoftc(dev);
 }
 
 /*
  * Derive the interface capabilities of priv->dev from the capability
  * flags of the IB device hca.
  *
  * The hardware-assist and capability sets are first reset to 0.
  * Checksum offload is advertised only when the driver is built without
  * CONFIG_INFINIBAND_IPOIB_CM and the device reports
  * IB_DEVICE_UD_IP_CSUM.  The vlan and link-state capabilities are always
  * added, and every capability is enabled.
  *
  * Always returns 0.
  */
 int
 ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca)
 {
 	struct ib_device_attr *device_attr = &hca->attrs;
 
 	priv->hca_caps = device_attr->device_cap_flags;
 
 	if_sethwassist(priv->dev, 0);
 	if_setcapabilities(priv->dev, 0);
 
 #ifndef CONFIG_INFINIBAND_IPOIB_CM
 	if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) {
 		set_bit(IPOIB_FLAG_CSUM, &priv->flags);
 		if_sethwassist(priv->dev, CSUM_IP | CSUM_TCP | CSUM_UDP);
 		if_setcapabilities(priv->dev, IFCAP_HWCSUM | IFCAP_VLAN_HWCSUM);
 	}
 
 	/* TSO support below is compiled out. */
 #if 0
 	if (priv->dev->features & NETIF_F_SG && priv->hca_caps & IB_DEVICE_UD_TSO) {
 		priv->dev->if_capabilities |= IFCAP_TSO4;
 		priv->dev->if_hwassist |= CSUM_TSO;
 	}
 #endif
 #endif
 	if_setcapabilitiesbit(priv->dev,
 	    IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_LINKSTATE, 0);
 	/* Enable everything that was advertised above. */
 	if_setcapenable(priv->dev, if_getcapabilities(priv->dev));
 
 	return 0;
 }
 
 
 /*
  * Create and initialize the IPoIB interface for one port of hca.
  *
  * format - interface name prefix passed to ipoib_intf_alloc().
  * hca    - IB device that owns the port.
  * port   - port number on that device.
  *
  * Returns the new ifnet on success, or an ERR_PTR() carrying a negative
  * errno on failure.  Every failure path leaves a non-zero value in
  * result: ERR_PTR(0) would be NULL, which the IS_ERR() check in
  * ipoib_add_one() does not treat as an error.
  */
 static if_t
 ipoib_add_port(const char *format, struct ib_device *hca, u8 port)
 {
 	struct ipoib_dev_priv *priv;
 	struct ib_port_attr attr;
 	int result = -ENOMEM;
 
 	priv = ipoib_intf_alloc(format, hca);
 	if (!priv)
 		goto alloc_mem_failed;
 
 	if (!ib_query_port(hca, port, &attr))
 		priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu);
 	else {
 		printk(KERN_WARNING "%s: ib_query_port %d failed\n",
 		       hca->name, port);
 		goto device_init_failed;
 	}
 
 	/* MTU will be reset when mcast join happens */
 	if_setmtu(priv->dev, IPOIB_UD_MTU(priv->max_ib_mtu));
 	priv->mcast_mtu = priv->admin_mtu = if_getmtu(priv->dev);
 
 	result = ib_query_pkey(hca, port, 0, &priv->pkey);
 	if (result) {
 		printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n",
 		       hca->name, port, result);
 		goto device_init_failed;
 	}
 
 	if (ipoib_set_dev_features(priv, hca)) {
 		/*
 		 * result is 0 here from the successful ib_query_pkey() call;
 		 * replace it so the caller sees a real error pointer.
 		 */
 		result = -EINVAL;
 		goto device_init_failed;
 	}
 
 	/*
 	 * Set the full membership bit, so that we join the right
 	 * broadcast group, etc.
 	 */
 	priv->pkey |= 0x8000;
 
 	priv->broadcastaddr[8] = priv->pkey >> 8;
 	priv->broadcastaddr[9] = priv->pkey & 0xff;
 
 	result = ib_query_gid(hca, port, 0, &priv->local_gid, NULL);
 	if (result) {
 		printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n",
 		       hca->name, port, result);
 		goto device_init_failed;
 	}
 	/* The port GID occupies the link-level address from byte 4 on. */
 	memcpy(if_getlladdr(priv->dev) + 4, priv->local_gid.raw, sizeof(union ib_gid));
 
 	result = ipoib_dev_init(priv, hca, port);
 	if (result < 0) {
 		printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n",
 		       hca->name, port, result);
 		goto device_init_failed;
 	}
 	if (ipoib_cm_admin_enabled(priv))
 		if_setmtu(priv->dev, IPOIB_CM_MTU(ipoib_cm_max_mtu(priv)));
 
 	INIT_IB_EVENT_HANDLER(&priv->event_handler,
 			      priv->ca, ipoib_event);
 	result = ib_register_event_handler(&priv->event_handler);
 	if (result < 0) {
 		printk(KERN_WARNING "%s: ib_register_event_handler failed for "
 		       "port %d (ret = %d)\n",
 		       hca->name, port, result);
 		goto event_failed;
 	}
 	if_printf(priv->dev, "Attached to %s port %d\n", hca->name, port);
 
 	priv->gone = 0;	/* ready */
 
 	return priv->dev;
 
 event_failed:
 	ipoib_dev_cleanup(priv);
 
 device_init_failed:
 	ipoib_ifdetach(priv);
 	ipoib_detach(priv);
 
 alloc_mem_failed:
 	return ERR_PTR(result);
 }
 
 static void
 ipoib_add_one(struct ib_device *device)
 {
 	struct list_head *dev_list;
 	if_t dev;
 	struct ipoib_dev_priv *priv;
 	int s, e, p;
 
 	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
 		return;
 
 	dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL);
 	if (!dev_list)
 		return;
 
 	INIT_LIST_HEAD(dev_list);
 
 	if (device->node_type == RDMA_NODE_IB_SWITCH) {
 		s = 0;
 		e = 0;
 	} else {
 		s = 1;
 		e = device->phys_port_cnt;
 	}
 
 	for (p = s; p <= e; ++p) {
 		if (rdma_port_get_link_layer(device, p) != IB_LINK_LAYER_INFINIBAND)
 			continue;
 		dev = ipoib_add_port("ib", device, p);
 		if (!IS_ERR(dev)) {
 			priv = if_getsoftc(dev);
 			list_add_tail(&priv->list, dev_list);
 		}
 	}
 
 	ib_set_client_data(device, &ipoib_client, dev_list);
 }
 
 static void
 ipoib_remove_one(struct ib_device *device, void *client_data)
 {
 	struct ipoib_dev_priv *priv, *tmp;
 	struct list_head *dev_list = client_data;
 
 	if (!dev_list)
 		return;
 
 	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
 		return;
 
 	list_for_each_entry_safe(priv, tmp, dev_list, list) {
 		if (rdma_port_get_link_layer(device, priv->port) != IB_LINK_LAYER_INFINIBAND)
 			continue;
 
 		ipoib_ifdetach(priv);
 		ipoib_stop(priv);
 
 		ib_unregister_event_handler(&priv->event_handler);
 
 		flush_workqueue(ipoib_workqueue);
 
 		ipoib_dev_cleanup(priv);
 		ipoib_detach(priv);
 	}
 
 	kfree(dev_list);
 }
 
 /*
  * Address iterator callback: returns 1 for the first interface address
  * that is byte-for-byte equal to the sockaddr passed in arg, and 0 for
  * everything else (including any address seen after a match).
  */
 static u_int
 ipoib_match_dev_addr_cb(void *arg, struct ifaddr *ifa, u_int count)
 {
 	struct sockaddr *wanted = arg;
 
 	if (count == 0 &&
 	    ifa->ifa_addr->sa_len == wanted->sa_len &&
 	    memcmp(ifa->ifa_addr, wanted, wanted->sa_len) == 0)
 		return (1);
 
 	return (0);
 }
 
 /*
  * Return the value produced by walking the addresses of dev that belong
  * to addr's family with ipoib_match_dev_addr_cb(); non-zero means addr
  * is configured on dev.
  */
 static int
 ipoib_match_dev_addr(const struct sockaddr *addr, if_t dev)
 {
 	struct epoch_tracker et;
 	int nfound;
 
 	NET_EPOCH_ENTER(et);
 	nfound = if_foreach_addr_type(dev, addr->sa_family,
 	    ipoib_match_dev_addr_cb, __DECONST(void *, addr));
 	NET_EPOCH_EXIT(et);
 
 	return (nfound);
 }
 
 /*
  * ipoib_match_gid_pkey_addr - returns the number of IPoIB netdevs on
  * top of a given ipoib device that match a pkey_index and address, if
  * any exist.
  *
  * @found_net_dev: contains a matching net_device if the return value
  * is >= 1, with a reference held.
  */
 static int
 ipoib_match_gid_pkey_addr(struct ipoib_dev_priv *priv,
     const union ib_gid *gid, u16 pkey_index, const struct sockaddr *addr,
     if_t *found_net_dev)
 {
 	struct ipoib_dev_priv *child;
 	int nmatch = 0;
 
 	/* A NULL gid or a NULL addr acts as a wildcard for that criterion. */
 	if (priv->pkey_index == pkey_index &&
 	    (gid == NULL || memcmp(gid, &priv->local_gid, sizeof(*gid)) == 0) &&
 	    (addr == NULL || ipoib_match_dev_addr(addr, priv->dev) != 0)) {
 		/* Only the first match is recorded and referenced. */
 		if (*found_net_dev == NULL) {
 			if_t hit;
 
 			hit = (priv->parent != NULL) ? priv->parent : priv->dev;
 			*found_net_dev = hit;
 			dev_hold(hit);
 		}
 		nmatch++;
 	}
 
 	/* Check child interfaces */
 	mutex_lock(&priv->vlan_mutex);
 	list_for_each_entry(child, &priv->child_intfs, list) {
 		nmatch += ipoib_match_gid_pkey_addr(child, gid, pkey_index,
 		    addr, found_net_dev);
 		/* Two matches are enough to know the result is ambiguous. */
 		if (nmatch > 1)
 			break;
 	}
 	mutex_unlock(&priv->vlan_mutex);
 
 	return nmatch;
 }
 
 /*
  * __ipoib_get_net_dev_by_params - returns the number of matching
  * net_devs found (between 0 and 2). Also return the matching
  * net_device in the @net_dev parameter, holding a reference to the
  * net_device, if the number of matches >= 1
  */
 static int
 __ipoib_get_net_dev_by_params(struct list_head *dev_list, u8 port,
     u16 pkey_index, const union ib_gid *gid,
     const struct sockaddr *addr, if_t *net_dev)
 {
 	struct ipoib_dev_priv *cur;
 	int nmatch = 0;
 
 	*net_dev = NULL;
 
 	list_for_each_entry(cur, dev_list, list) {
 		/* Only interfaces on the requested port are considered. */
 		if (cur->port == port) {
 			nmatch += ipoib_match_gid_pkey_addr(cur, gid,
 			    pkey_index, addr, net_dev);
 			if (nmatch > 1)
 				break;
 		}
 	}
 
 	return nmatch;
 }
 
 /*
  * Find the network interface for the given port, pkey, GID and address.
  *
  * A first pass matches on the L2 parameters only.  When that is
  * ambiguous, the reference taken by the first pass is dropped and a
  * second pass also matches on the L3 address.  Returns NULL when nothing
  * matches; otherwise the returned interface carries the reference taken
  * by the lookup.
  */
 static if_t
 ipoib_get_net_dev_by_params(struct ib_device *dev, u8 port, u16 pkey,
     const union ib_gid *gid, const struct sockaddr *addr, void *client_data)
 {
 	struct list_head *dev_list = client_data;
 	if_t net_dev;
 	u16 pkey_index;
 	int nmatch;
 
 	if (!rdma_protocol_ib(dev, port))
 		return NULL;
 
 	if (ib_find_cached_pkey(dev, port, pkey, &pkey_index) != 0)
 		return NULL;
 
 	if (dev_list == NULL)
 		return NULL;
 
 	/* See if we can find a unique device matching the L2 parameters */
 	nmatch = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index,
 	    gid, NULL, &net_dev);
 	if (nmatch == 0)
 		return NULL;
 	if (nmatch == 1)
 		return net_dev;
 
 	dev_put(net_dev);
 
 	/* Couldn't find a unique device with L2 parameters only. Use L3
 	 * address to uniquely match the net device */
 	nmatch = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index,
 	    gid, addr, &net_dev);
 	if (nmatch == 0)
 		return NULL;
 
 	/* Still ambiguous: warn, then hand back the first match anyway. */
 	if (nmatch != 1)
 		dev_warn_ratelimited(&dev->dev,
 				     "duplicate IP address detected\n");
 
 	return net_dev;
 }
 
 /*
  * vlan_config event handler: set up an IPoIB sub-interface for the vlan
  * device created on top of ifp with tag vtag.
  *
  * arg  - unused handler argument.
  * ifp  - parent interface; ignored unless its type is IFT_INFINIBAND.
  * vtag - vlan tag; used as the low 15 bits of the child's P_Key.
  *
  * Nothing is done when no vlan device exists for the tag, when the tag
  * has bit 15 set, or when the resulting P_Key equals the parent's.  A
  * P_Key already used by another child, or a failure while initializing
  * the child, is reported with a warning.
  */
 static void
 ipoib_config_vlan(void *arg, if_t ifp, uint16_t vtag)
 {
 	struct ipoib_dev_priv *parent;
 	struct ipoib_dev_priv *priv;
 	struct epoch_tracker et;
 	if_t dev;
 	uint16_t pkey;
 	int error;
 
 	if (if_gettype(ifp) != IFT_INFINIBAND)
 		return;
 	NET_EPOCH_ENTER(et);
 	dev = VLAN_DEVAT(ifp, vtag);
 	NET_EPOCH_EXIT(et);
 	if (dev == NULL)
 		return;
 	priv = NULL;
 	error = 0;
 	parent = if_getsoftc(ifp);
 	/* We only support 15 bits of pkey. */
 	if (vtag & 0x8000)
 		return;
 	pkey = vtag | 0x8000;	/* Set full membership bit. */
 	if (pkey == parent->pkey)
 		return;
 	/* Check for dups */
 	mutex_lock(&parent->vlan_mutex);
 	list_for_each_entry(priv, &parent->child_intfs, list) {
 		if (priv->pkey == pkey) {
 			/* priv is an existing child here; it must not be freed. */
 			priv = NULL;
 			error = EBUSY;
 			goto out;
 		}
 	}
 	priv = ipoib_priv_alloc();
 	priv->dev = dev;
 	priv->max_ib_mtu = parent->max_ib_mtu;
 	priv->mcast_mtu = priv->admin_mtu = if_getmtu(parent->dev);
 	set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags);
 	error = ipoib_set_dev_features(priv, parent->ca);
 	if (error)
 		goto out;
 	priv->pkey = pkey;
 	priv->broadcastaddr[8] = pkey >> 8;
 	priv->broadcastaddr[9] = pkey & 0xff;
 	if_setbroadcastaddr(dev, priv->broadcastaddr);
 	error = ipoib_dev_init(priv, parent->ca, parent->port);
 	if (error)
 		goto out;
 	/* Link the child to its parent and take over the vlan device. */
 	priv->parent = parent->dev;
 	list_add_tail(&priv->list, &parent->child_intfs);
 	VLAN_SETCOOKIE(dev, priv);
 	if_setstartfn(dev, ipoib_vlan_start);
 	if_setdrvflagbits(dev, 0, IFF_DRV_RUNNING);
 	if_setifheaderlen(dev, IPOIB_HEADER_LEN);
 	if (if_getdrvflags(ifp) & IFF_DRV_RUNNING)
 		ipoib_open(priv);
 	mutex_unlock(&parent->vlan_mutex);
 	return;
 out:
 	mutex_unlock(&parent->vlan_mutex);
 	if (priv)
 		free(priv, M_TEMP);
 	/* Terminate the message with a newline like the other warnings. */
 	if (error)
 		ipoib_warn(parent,
 		    "failed to initialize subinterface: device %s, port %d vtag 0x%X\n",
 		    parent->ca->name, parent->port, vtag);
 	return;
 }
 
 /*
  * vlan_unconfig event handler: undo ipoib_config_vlan() for tag vtag on
  * the parent interface ifp.
  *
  * The vlan device's cookie is cleared first (when the device can still
  * be found), then the child whose P_Key is vtag with the full membership
  * bit set is cleaned up and unlinked from the parent's child list.
  *
  * NOTE(review): the child priv is unlinked but not freed in this
  * function; confirm where it is released.
  */
 static void
 ipoib_unconfig_vlan(void *arg, if_t ifp, uint16_t vtag)
 {
 	struct ipoib_dev_priv *parent;
 	struct ipoib_dev_priv *priv;
 	struct epoch_tracker et;
 	if_t dev;
 	uint16_t pkey;
 
 	if (if_gettype(ifp) != IFT_INFINIBAND)
 		return;
 
 	NET_EPOCH_ENTER(et);
 	dev = VLAN_DEVAT(ifp, vtag);
 	NET_EPOCH_EXIT(et);
 	if (dev)
 		VLAN_SETCOOKIE(dev, NULL);
 	/* Same tag-to-P_Key mapping as ipoib_config_vlan(). */
 	pkey = vtag | 0x8000;
 	parent = if_getsoftc(ifp);
 	mutex_lock(&parent->vlan_mutex);
 	list_for_each_entry(priv, &parent->child_intfs, list) {
 		if (priv->pkey == pkey) {
 			ipoib_dev_cleanup(priv);
 			list_del(&priv->list);
 			/* Stop iterating: the current entry was just unlinked. */
 			break;
 		}
 	}
 	mutex_unlock(&parent->vlan_mutex);
 }
 
 /*
  * Tags of the vlan_config / vlan_unconfig event handlers registered in
  * ipoib_init_module() and deregistered in ipoib_cleanup_module().
  */
 eventhandler_tag ipoib_vlan_attach;
 eventhandler_tag ipoib_vlan_detach;
 
 /*
  * Module initialization: clamp the queue-size tunables, register the
  * vlan event handlers, create the driver workqueue and register the SA
  * and IB clients.
  *
  * Returns 0 on success, -ENOMEM when the workqueue cannot be created,
  * or the error returned by ib_register_client().
  *
  * NOTE(review): the vlan_config/vlan_unconfig handlers registered below
  * are not deregistered on the error paths; confirm whether that is
  * intended.
  */
 static int __init
 ipoib_init_module(void)
 {
 	int ret;
 
 	/* Round to a power of two, then clamp to [MIN, MAX]. */
 	ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size);
 	ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE);
 	ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE);
 
 	/* The send queue additionally has to hold at least 2 * MAX_SEND_CQE. */
 	ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size);
 	ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE);
 	ipoib_sendq_size = max(ipoib_sendq_size, max(2 * MAX_SEND_CQE,
 						     IPOIB_MIN_QUEUE_SIZE));
 #ifdef CONFIG_INFINIBAND_IPOIB_CM
 	ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP);
 #endif
 
 	ipoib_vlan_attach = EVENTHANDLER_REGISTER(vlan_config,
 		ipoib_config_vlan, NULL, EVENTHANDLER_PRI_FIRST);
 	ipoib_vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig,
 		ipoib_unconfig_vlan, NULL, EVENTHANDLER_PRI_FIRST);
 
 	/*
 	 * We create our own workqueue mainly because we want to be
 	 * able to flush it when devices are being removed.  We can't
 	 * use schedule_work()/flush_scheduled_work() because both
 	 * unregister_netdev() and linkwatch_event take the rtnl lock,
 	 * so flush_scheduled_work() can deadlock during device
 	 * removal.
 	 */
 	ipoib_workqueue = create_singlethread_workqueue("ipoib");
 	if (!ipoib_workqueue) {
 		ret = -ENOMEM;
 		goto err_fs;
 	}
 
 	ib_sa_register_client(&ipoib_sa_client);
 
 	ret = ib_register_client(&ipoib_client);
 	if (ret)
 		goto err_sa;
 
 	return 0;
 
 	/* Undo the SA client registration and the workqueue creation. */
 err_sa:
 	ib_sa_unregister_client(&ipoib_sa_client);
 	destroy_workqueue(ipoib_workqueue);
 
 err_fs:
 	return ret;
 }
 
 /*
  * Module teardown: deregister the vlan event handlers, unregister the
  * IB and SA clients and destroy the driver workqueue.
  */
 static void __exit
 ipoib_cleanup_module(void)
 {
 
 	EVENTHANDLER_DEREGISTER(vlan_config, ipoib_vlan_attach);
 	EVENTHANDLER_DEREGISTER(vlan_unconfig, ipoib_vlan_detach);
 	ib_unregister_client(&ipoib_client);
 	ib_sa_unregister_client(&ipoib_sa_client);
 	destroy_workqueue(ipoib_workqueue);
 }
 /* Hook the init/exit routines into module load and unload. */
 module_init_order(ipoib_init_module, SI_ORDER_FIFTH);
 module_exit_order(ipoib_cleanup_module, SI_ORDER_FIFTH);
 
 /*
  * Module event handler referenced by ipoib_mod: accepts every event and
  * does nothing.
  */
 static int
 ipoib_evhand(module_t mod, int event, void *arg)
 {
 	return (0);
 }
 
 /* Module descriptor: name and (no-op) event handler. */
 static moduledata_t ipoib_mod = {
 	.name = "ipoib",
 	.evhand = ipoib_evhand,
 };
 
 /* Declare the module and the modules it depends on. */
 DECLARE_MODULE(ipoib, ipoib_mod, SI_SUB_LAST, SI_ORDER_ANY);
 MODULE_DEPEND(ipoib, ibcore, 1, 1, 1);
 MODULE_DEPEND(ipoib, if_infiniband, 1, 1, 1);
 MODULE_DEPEND(ipoib, linuxkpi, 1, 1, 1);