Index: head/sys/dev/cxgbe/iw_cxgbe/cm.c
===================================================================
--- head/sys/dev/cxgbe/iw_cxgbe/cm.c	(revision 362486)
+++ head/sys/dev/cxgbe/iw_cxgbe/cm.c	(revision 362487)
@@ -1,3050 +1,3051 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2009-2013, 2016 Chelsio, Inc. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
  * General Public License (GPL) Version 2, available from the file
  * COPYING in the main directory of this source tree, or the
  * OpenIB.org BSD license below:
  *
  *     Redistribution and use in source and binary forms, with or
  *     without modification, are permitted provided that the following
  *     conditions are met:
  *
  *      - Redistributions of source code must retain the above
  *	  copyright notice, this list of conditions and the following
  *	  disclaimer.
  *
  *      - Redistributions in binary form must reproduce the above
  *	  copyright notice, this list of conditions and the following
  *	  disclaimer in the documentation and/or other materials
  *	  provided with the distribution.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 
 #ifdef TCP_OFFLOAD
 #include <sys/types.h>
 #include <sys/malloc.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sockio.h>
 #include <sys/taskqueue.h>
 #include <netinet/in.h>
 #include <net/route.h>
+#include <net/route/nhop.h>
 
 #include <netinet/in_systm.h>
 #include <netinet/in_pcb.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet/ip.h>
 #include <netinet/in_fib.h>
 #include <netinet6/in6_fib.h>
 #include <netinet6/scope6_var.h>
 #include <netinet/ip_var.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcp.h>
 #include <netinet/tcpip.h>
 
 #include <netinet/toecore.h>
 
 struct sge_iq;
 struct rss_header;
 struct cpl_set_tcb_rpl;
 #include <linux/types.h>
 #include "offload.h"
 #include "tom/t4_tom.h"
 
 #define TOEPCB(so)  ((struct toepcb *)(so_sototcpcb((so))->t_toe))
 
 #include "iw_cxgbe.h"
 #include <linux/module.h>
 #include <linux/workqueue.h>
 #include <linux/notifier.h>
 #include <linux/inetdevice.h>
 #include <linux/if_vlan.h>
 #include <net/netevent.h>
 #include <rdma/rdma_cm.h>
 
 static spinlock_t req_lock;
 static TAILQ_HEAD(c4iw_ep_list, c4iw_ep_common) req_list;
 static struct work_struct c4iw_task;
 static struct workqueue_struct *c4iw_taskq;
 static LIST_HEAD(err_cqe_list);
 static spinlock_t err_cqe_lock;
 static LIST_HEAD(listen_port_list);
 static DEFINE_MUTEX(listen_port_mutex);
 
 static void process_req(struct work_struct *ctx);
 static void start_ep_timer(struct c4iw_ep *ep);
 static int stop_ep_timer(struct c4iw_ep *ep);
 static int set_tcpinfo(struct c4iw_ep *ep);
 static void process_timeout(struct c4iw_ep *ep);
 static void process_err_cqes(void);
 static void *alloc_ep(int size, gfp_t flags);
 static void close_socket(struct socket *so);
 static int send_mpa_req(struct c4iw_ep *ep);
 static int send_mpa_reject(struct c4iw_ep *ep, const void *pdata, u8 plen);
 static int send_mpa_reply(struct c4iw_ep *ep, const void *pdata, u8 plen);
 static void close_complete_upcall(struct c4iw_ep *ep, int status);
 static int send_abort(struct c4iw_ep *ep);
 static void peer_close_upcall(struct c4iw_ep *ep);
 static void peer_abort_upcall(struct c4iw_ep *ep);
 static void connect_reply_upcall(struct c4iw_ep *ep, int status);
 static int connect_request_upcall(struct c4iw_ep *ep);
 static void established_upcall(struct c4iw_ep *ep);
 static int process_mpa_reply(struct c4iw_ep *ep);
 static int process_mpa_request(struct c4iw_ep *ep);
 static void process_peer_close(struct c4iw_ep *ep);
 static void process_conn_error(struct c4iw_ep *ep);
 static void process_close_complete(struct c4iw_ep *ep);
 static void ep_timeout(unsigned long arg);
 static void setiwsockopt(struct socket *so);
 static void init_iwarp_socket(struct socket *so, void *arg);
 static void uninit_iwarp_socket(struct socket *so);
 static void process_data(struct c4iw_ep *ep);
 static void process_connected(struct c4iw_ep *ep);
 static int c4iw_so_upcall(struct socket *so, void *arg, int waitflag);
 static void process_socket_event(struct c4iw_ep *ep);
 static void release_ep_resources(struct c4iw_ep *ep);
 static int process_terminate(struct c4iw_ep *ep);
 static int terminate(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m);
 static int add_ep_to_req_list(struct c4iw_ep *ep, int ep_events);
 static struct listen_port_info *
 add_ep_to_listenlist(struct c4iw_listen_ep *lep);
 static int rem_ep_from_listenlist(struct c4iw_listen_ep *lep);
 static struct c4iw_listen_ep *
 find_real_listen_ep(struct c4iw_listen_ep *master_lep, struct socket *so);
 static int get_ifnet_from_raddr(struct sockaddr_storage *raddr,
 		struct ifnet **ifp);
 static void process_newconn(struct c4iw_listen_ep *master_lep,
 		struct socket *new_so);
 #define START_EP_TIMER(ep) \
     do { \
 	    CTR3(KTR_IW_CXGBE, "start_ep_timer (%s:%d) ep %p", \
 		__func__, __LINE__, (ep)); \
 	    start_ep_timer(ep); \
     } while (0)
 
 #define STOP_EP_TIMER(ep) \
     ({ \
 	    CTR3(KTR_IW_CXGBE, "stop_ep_timer (%s:%d) ep %p", \
 		__func__, __LINE__, (ep)); \
 	    stop_ep_timer(ep); \
     })
 
 #define GET_LOCAL_ADDR(pladdr, so) \
 	do { \
 		struct sockaddr_storage *__a = NULL; \
 		struct  inpcb *__inp = sotoinpcb(so); \
 		KASSERT(__inp != NULL, \
 		   ("GET_LOCAL_ADDR(%s):so:%p, inp = NULL", __func__, so)); \
 		if (__inp->inp_vflag & INP_IPV4) \
 			in_getsockaddr(so, (struct sockaddr **)&__a); \
 		else \
 			in6_getsockaddr(so, (struct sockaddr **)&__a); \
 		*(pladdr) = *__a; \
 		free(__a, M_SONAME); \
 	} while (0)
 
 #define GET_REMOTE_ADDR(praddr, so) \
 	do { \
 		struct sockaddr_storage *__a = NULL; \
 		struct  inpcb *__inp = sotoinpcb(so); \
 		KASSERT(__inp != NULL, \
 		   ("GET_REMOTE_ADDR(%s):so:%p, inp = NULL", __func__, so)); \
 		if (__inp->inp_vflag & INP_IPV4) \
 			in_getpeeraddr(so, (struct sockaddr **)&__a); \
 		else \
 			in6_getpeeraddr(so, (struct sockaddr **)&__a); \
 		*(praddr) = *__a; \
 		free(__a, M_SONAME); \
 	} while (0)
 
 static char *states[] = {
 	"idle",
 	"listen",
 	"connecting",
 	"mpa_wait_req",
 	"mpa_req_sent",
 	"mpa_req_rcvd",
 	"mpa_rep_sent",
 	"fpdu_mode",
 	"aborting",
 	"closing",
 	"moribund",
 	"dead",
 	NULL,
 };
 
 static void deref_cm_id(struct c4iw_ep_common *epc)
 {
       epc->cm_id->rem_ref(epc->cm_id);
       epc->cm_id = NULL;
       set_bit(CM_ID_DEREFED, &epc->history);
 }
 
 static void ref_cm_id(struct c4iw_ep_common *epc)
 {
       set_bit(CM_ID_REFED, &epc->history);
       epc->cm_id->add_ref(epc->cm_id);
 }
 
 static void deref_qp(struct c4iw_ep *ep)
 {
 	c4iw_qp_rem_ref(&ep->com.qp->ibqp);
 	clear_bit(QP_REFERENCED, &ep->com.flags);
 	set_bit(QP_DEREFED, &ep->com.history);
 }
 
 static void ref_qp(struct c4iw_ep *ep)
 {
 	set_bit(QP_REFERENCED, &ep->com.flags);
 	set_bit(QP_REFED, &ep->com.history);
 	c4iw_qp_add_ref(&ep->com.qp->ibqp);
 }
 /* allocated per TCP port while listening */
 struct listen_port_info {
 	uint16_t port_num; /* TCP port address */
 	struct list_head list; /* belongs to listen_port_list */
 	struct list_head lep_list; /* per port lep list */
 	uint32_t refcnt; /* number of lep's listening */
 };
 
 /*
  * Following two lists are used to manage INADDR_ANY listeners:
  * 1)listen_port_list
  * 2)lep_list
  *
  * Below is the INADDR_ANY listener lists overview on a system with a two port
  * adapter:
  *   |------------------|
  *   |listen_port_list  |
  *   |------------------|
  *            |
  *            |              |-----------|       |-----------|  
  *            |              | port_num:X|       | port_num:X|  
  *            |--------------|-list------|-------|-list------|-------....
  *                           | lep_list----|     | lep_list----|
  *                           | refcnt    | |     | refcnt    | |
  *                           |           | |     |           | |
  *                           |           | |     |           | |
  *                           |-----------| |     |-----------| |
  *                                         |                   |
  *                                         |                   |
  *                                         |                   |
  *                                         |                   |         lep1                  lep2         
  *                                         |                   |    |----------------|    |----------------|
  *                                         |                   |----| listen_ep_list |----| listen_ep_list |
  *                                         |                        |----------------|    |----------------|
  *                                         |
  *                                         |
  *                                         |        lep1                  lep2         
  *                                         |   |----------------|    |----------------|
  *                                         |---| listen_ep_list |----| listen_ep_list |
  *                                             |----------------|    |----------------|
  *
  * Because of two port adapter, the number of lep's are two(lep1 & lep2) for
  * each TCP port number.
  *
  * Here 'lep1' is always marked as Master lep, because solisten() is always
  * called through first lep. 
  *
  */
 static struct listen_port_info *
 add_ep_to_listenlist(struct c4iw_listen_ep *lep)
 {
 	uint16_t port;
 	struct listen_port_info *port_info = NULL;
 	struct sockaddr_storage *laddr = &lep->com.local_addr;
 
 	port = (laddr->ss_family == AF_INET) ?
 		((struct sockaddr_in *)laddr)->sin_port :
 		((struct sockaddr_in6 *)laddr)->sin6_port;
 
 	mutex_lock(&listen_port_mutex);
 
 	list_for_each_entry(port_info, &listen_port_list, list)
 		if (port_info->port_num == port)
 			goto found_port;
 
 	port_info = malloc(sizeof(*port_info), M_CXGBE, M_WAITOK);
 	port_info->port_num = port;
 	port_info->refcnt    = 0;
 
 	list_add_tail(&port_info->list, &listen_port_list);
 	INIT_LIST_HEAD(&port_info->lep_list);
 
 found_port:
 	port_info->refcnt++;
 	list_add_tail(&lep->listen_ep_list, &port_info->lep_list);
 	mutex_unlock(&listen_port_mutex);
 	return port_info;
 }
 
 static int
 rem_ep_from_listenlist(struct c4iw_listen_ep *lep)
 {
 	uint16_t port;
 	struct listen_port_info *port_info = NULL;
 	struct sockaddr_storage *laddr = &lep->com.local_addr;
 	int refcnt = 0;
 
 	port = (laddr->ss_family == AF_INET) ?
 		((struct sockaddr_in *)laddr)->sin_port :
 		((struct sockaddr_in6 *)laddr)->sin6_port;
 
 	mutex_lock(&listen_port_mutex);
 
 	/* get the port_info structure based on the lep's port address */
 	list_for_each_entry(port_info, &listen_port_list, list) {
 		if (port_info->port_num == port) {
 			port_info->refcnt--;
 			refcnt = port_info->refcnt;
 			/* remove the current lep from the listen list */
 			list_del(&lep->listen_ep_list);
 			if (port_info->refcnt == 0) {
 				/* Remove this entry from the list as there
 				 * are no more listeners for this port_num.
 				 */
 				list_del(&port_info->list);
 				kfree(port_info);
 			}
 			break;
 		}
 	}
 	mutex_unlock(&listen_port_mutex);
 	return refcnt;
 }
 
 /*
  * Find the lep that belongs to the ifnet on which the SYN frame was received.
  */
 struct c4iw_listen_ep *
 find_real_listen_ep(struct c4iw_listen_ep *master_lep, struct socket *so)
 {
 	struct adapter *adap = NULL;
 	struct c4iw_listen_ep *lep = NULL;
 	struct ifnet *ifp = NULL, *hw_ifp = NULL;
 	struct listen_port_info *port_info = NULL;
 	int i = 0, found_portinfo = 0, found_lep = 0;
 	uint16_t port;
 
 	/*
 	 * STEP 1: Figure out 'ifp' of the physical interface, not pseudo
 	 * interfaces like vlan, lagg, etc..
 	 * TBD: lagg support, lagg + vlan support.
 	 */
 	ifp = TOEPCB(so)->l2te->ifp;
 	if (ifp->if_type == IFT_L2VLAN) {
 		hw_ifp = VLAN_TRUNKDEV(ifp);
 		if (hw_ifp == NULL) {
 			CTR4(KTR_IW_CXGBE, "%s: Failed to get parent ifnet of "
 				"vlan ifnet %p, sock %p, master_lep %p",
 				__func__, ifp, so, master_lep);
 			return (NULL);
 		}
 	} else
 		hw_ifp = ifp;
 
 	/* STEP 2: Find 'port_info' with listener local port address. */
 	port = (master_lep->com.local_addr.ss_family == AF_INET) ?
 		((struct sockaddr_in *)&master_lep->com.local_addr)->sin_port :
 		((struct sockaddr_in6 *)&master_lep->com.local_addr)->sin6_port;
 
 
 	mutex_lock(&listen_port_mutex);
 	list_for_each_entry(port_info, &listen_port_list, list)
 		if (port_info->port_num == port) {
 			found_portinfo =1;
 			break;
 		}
 	if (!found_portinfo)
 		goto out;
 
 	/* STEP 3: Traverse through list of lep's that are bound to the current
 	 * TCP port address and find the lep that belongs to the ifnet on which
 	 * the SYN frame was received.
 	 */
 	list_for_each_entry(lep, &port_info->lep_list, listen_ep_list) {
 		adap = lep->com.dev->rdev.adap;
 		for_each_port(adap, i) {
 			if (hw_ifp == adap->port[i]->vi[0].ifp) {
 				found_lep =1;
 				goto out;
 			}
 		}
 	}
 out:
 	mutex_unlock(&listen_port_mutex);
 	return found_lep ? lep : (NULL);
 }
 
 static void process_timeout(struct c4iw_ep *ep)
 {
 	struct c4iw_qp_attributes attrs = {0};
 	int abort = 1;
 
 	CTR4(KTR_IW_CXGBE, "%s ep :%p, tid:%u, state %d", __func__,
 			ep, ep->hwtid, ep->com.state);
 	set_bit(TIMEDOUT, &ep->com.history);
 	switch (ep->com.state) {
 	case MPA_REQ_SENT:
 		connect_reply_upcall(ep, -ETIMEDOUT);
 		break;
 	case MPA_REQ_WAIT:
 	case MPA_REQ_RCVD:
 	case MPA_REP_SENT:
 	case FPDU_MODE:
 		break;
 	case CLOSING:
 	case MORIBUND:
 		if (ep->com.cm_id && ep->com.qp) {
 			attrs.next_state = C4IW_QP_STATE_ERROR;
 			c4iw_modify_qp(ep->com.dev, ep->com.qp,
 					C4IW_QP_ATTR_NEXT_STATE, &attrs, 1);
 		}
 		close_complete_upcall(ep, -ETIMEDOUT);
 		break;
 	case ABORTING:
 	case DEAD:
 		/*
 		 * These states are expected if the ep timed out at the same
 		 * time as another thread was calling stop_ep_timer().
 		 * So we silently do nothing for these states.
 		 */
 		abort = 0;
 		break;
 	default:
 		CTR4(KTR_IW_CXGBE, "%s unexpected state ep %p tid %u state %u"
 				, __func__, ep, ep->hwtid, ep->com.state);
 		abort = 0;
 	}
 	if (abort)
 		c4iw_ep_disconnect(ep, 1, GFP_KERNEL);
 	c4iw_put_ep(&ep->com);
 	return;
 }
 
 struct cqe_list_entry {
 	struct list_head entry;
 	struct c4iw_dev *rhp;
 	struct t4_cqe err_cqe;
 };
 
 static void
 process_err_cqes(void)
 {
 	unsigned long flag;
 	struct cqe_list_entry *cle;
 
 	spin_lock_irqsave(&err_cqe_lock, flag);
 	while (!list_empty(&err_cqe_list)) {
 		struct list_head *tmp;
 		tmp = err_cqe_list.next;
 		list_del(tmp);
 		tmp->next = tmp->prev = NULL;
 		spin_unlock_irqrestore(&err_cqe_lock, flag);
 		cle = list_entry(tmp, struct cqe_list_entry, entry);
 		c4iw_ev_dispatch(cle->rhp, &cle->err_cqe);
 		free(cle, M_CXGBE);
 		spin_lock_irqsave(&err_cqe_lock, flag);
 	}
 	spin_unlock_irqrestore(&err_cqe_lock, flag);
 
 	return;
 }
 
 static void
 process_req(struct work_struct *ctx)
 {
 	struct c4iw_ep_common *epc;
 	unsigned long flag;
 	int ep_events;
 
 	process_err_cqes();
 	spin_lock_irqsave(&req_lock, flag);
 	while (!TAILQ_EMPTY(&req_list)) {
 		epc = TAILQ_FIRST(&req_list);
 		TAILQ_REMOVE(&req_list, epc, entry);
 		epc->entry.tqe_prev = NULL;
 		ep_events = epc->ep_events;
 		epc->ep_events = 0;
 		spin_unlock_irqrestore(&req_lock, flag);
 		mutex_lock(&epc->mutex);
 		CTR5(KTR_IW_CXGBE, "%s: so %p, ep %p, ep_state %s events 0x%x",
 		    __func__, epc->so, epc, states[epc->state], ep_events);
 		if (ep_events & C4IW_EVENT_TERM)
 			process_terminate((struct c4iw_ep *)epc);
 		if (ep_events & C4IW_EVENT_TIMEOUT)
 			process_timeout((struct c4iw_ep *)epc);
 		if (ep_events & C4IW_EVENT_SOCKET)
 			process_socket_event((struct c4iw_ep *)epc);
 		mutex_unlock(&epc->mutex);
 		c4iw_put_ep(epc);
 		process_err_cqes();
 		spin_lock_irqsave(&req_lock, flag);
 	}
 	spin_unlock_irqrestore(&req_lock, flag);
 }
 
 /*
  * XXX: doesn't belong here in the iWARP driver.
  * XXX: assumes that the connection was offloaded by cxgbe/t4_tom if TF_TOE is
  *      set.  Is this a valid assumption for active open?
  */
 static int
 set_tcpinfo(struct c4iw_ep *ep)
 {
 	struct socket *so = ep->com.so;
 	struct inpcb *inp = sotoinpcb(so);
 	struct tcpcb *tp;
 	struct toepcb *toep;
 	int rc = 0;
 
 	INP_WLOCK(inp);
 	tp = intotcpcb(inp);
 	if ((tp->t_flags & TF_TOE) == 0) {
 		rc = EINVAL;
 		log(LOG_ERR, "%s: connection not offloaded (so %p, ep %p)\n",
 		    __func__, so, ep);
 		goto done;
 	}
 	toep = TOEPCB(so);
 
 	ep->hwtid = toep->tid;
 	ep->snd_seq = tp->snd_nxt;
 	ep->rcv_seq = tp->rcv_nxt;
 done:
 	INP_WUNLOCK(inp);
 	return (rc);
 
 }
 static int
 get_ifnet_from_raddr(struct sockaddr_storage *raddr, struct ifnet **ifp)
 {
 	int err = 0;
+	struct nhop_object *nh;
 
 	if (raddr->ss_family == AF_INET) {
 		struct sockaddr_in *raddr4 = (struct sockaddr_in *)raddr;
-		struct nhop4_extended nh4 = {0};
 
-		err = fib4_lookup_nh_ext(RT_DEFAULT_FIB, raddr4->sin_addr,
-				NHR_REF, 0, &nh4);
-		*ifp = nh4.nh_ifp;
-		if (err)
-			fib4_free_nh_ext(RT_DEFAULT_FIB, &nh4);
+		nh = fib4_lookup(RT_DEFAULT_FIB, raddr4->sin_addr, 0,
+				NHR_NONE, 0);
 	} else {
 		struct sockaddr_in6 *raddr6 = (struct sockaddr_in6 *)raddr;
-		struct nhop6_extended nh6 = {0};
 		struct in6_addr addr6;
 		uint32_t scopeid;
 
 		memset(&addr6, 0, sizeof(addr6));
 		in6_splitscope((struct in6_addr *)&raddr6->sin6_addr,
 					&addr6, &scopeid);
-		err = fib6_lookup_nh_ext(RT_DEFAULT_FIB, &addr6, scopeid,
-				NHR_REF, 0, &nh6);
-		*ifp = nh6.nh_ifp;
-		if (err)
-			fib6_free_nh_ext(RT_DEFAULT_FIB, &nh6);
+		nh = fib6_lookup(RT_DEFAULT_FIB, &addr6, scopeid,
+				NHR_NONE, 0);
 	}
 
+	if (nh == NULL)
+		err = EHOSTUNREACH;
+	else
+		*ifp = nh->nh_ifp;
 	CTR2(KTR_IW_CXGBE, "%s: return: %d", __func__, err);
 	return err;
 }
 
 static void
 close_socket(struct socket *so)
 {
 	uninit_iwarp_socket(so);
 	soclose(so);
 }
 
 static void
 process_peer_close(struct c4iw_ep *ep)
 {
 	struct c4iw_qp_attributes attrs = {0};
 	int disconnect = 1;
 	int release = 0;
 
 	CTR4(KTR_IW_CXGBE, "%s:ppcB ep %p so %p state %s", __func__, ep,
 	    ep->com.so, states[ep->com.state]);
 
 	switch (ep->com.state) {
 
 		case MPA_REQ_WAIT:
 			CTR2(KTR_IW_CXGBE, "%s:ppc1 %p MPA_REQ_WAIT DEAD",
 			    __func__, ep);
 			/* Fallthrough */
 		case MPA_REQ_SENT:
 			CTR2(KTR_IW_CXGBE, "%s:ppc2 %p MPA_REQ_SENT DEAD",
 			    __func__, ep);
 			ep->com.state = DEAD;
 			connect_reply_upcall(ep, -ECONNABORTED);
 
 			disconnect = 0;
 			STOP_EP_TIMER(ep);
 			close_socket(ep->com.so);
 			deref_cm_id(&ep->com);
 			release = 1;
 			break;
 
 		case MPA_REQ_RCVD:
 
 			/*
 			 * We're gonna mark this puppy DEAD, but keep
 			 * the reference on it until the ULP accepts or
 			 * rejects the CR.
 			 */
 			CTR2(KTR_IW_CXGBE, "%s:ppc3 %p MPA_REQ_RCVD CLOSING",
 			    __func__, ep);
 			ep->com.state = CLOSING;
 			break;
 
 		case MPA_REP_SENT:
 			CTR2(KTR_IW_CXGBE, "%s:ppc4 %p MPA_REP_SENT CLOSING",
 			    __func__, ep);
 			ep->com.state = CLOSING;
 			break;
 
 		case FPDU_MODE:
 			CTR2(KTR_IW_CXGBE, "%s:ppc5 %p FPDU_MODE CLOSING",
 			    __func__, ep);
 			START_EP_TIMER(ep);
 			ep->com.state = CLOSING;
 			attrs.next_state = C4IW_QP_STATE_CLOSING;
 			c4iw_modify_qp(ep->com.dev, ep->com.qp,
 					C4IW_QP_ATTR_NEXT_STATE, &attrs, 1);
 			peer_close_upcall(ep);
 			break;
 
 		case ABORTING:
 			CTR2(KTR_IW_CXGBE, "%s:ppc6 %p ABORTING (disconn)",
 			    __func__, ep);
 			disconnect = 0;
 			break;
 
 		case CLOSING:
 			CTR2(KTR_IW_CXGBE, "%s:ppc7 %p CLOSING MORIBUND",
 			    __func__, ep);
 			ep->com.state = MORIBUND;
 			disconnect = 0;
 			break;
 
 		case MORIBUND:
 			CTR2(KTR_IW_CXGBE, "%s:ppc8 %p MORIBUND DEAD", __func__,
 			    ep);
 			STOP_EP_TIMER(ep);
 			if (ep->com.cm_id && ep->com.qp) {
 				attrs.next_state = C4IW_QP_STATE_IDLE;
 				c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp,
 						C4IW_QP_ATTR_NEXT_STATE, &attrs, 1);
 			}
 			close_socket(ep->com.so);
 			close_complete_upcall(ep, 0);
 			ep->com.state = DEAD;
 			release = 1;
 			disconnect = 0;
 			break;
 
 		case DEAD:
 			CTR2(KTR_IW_CXGBE, "%s:ppc9 %p DEAD (disconn)",
 			    __func__, ep);
 			disconnect = 0;
 			break;
 
 		default:
 			panic("%s: ep %p state %d", __func__, ep,
 			    ep->com.state);
 			break;
 	}
 
 
 	if (disconnect) {
 
 		CTR2(KTR_IW_CXGBE, "%s:ppca %p", __func__, ep);
 		c4iw_ep_disconnect(ep, 0, M_NOWAIT);
 	}
 	if (release) {
 
 		CTR2(KTR_IW_CXGBE, "%s:ppcb %p", __func__, ep);
 		c4iw_put_ep(&ep->com);
 	}
 	CTR2(KTR_IW_CXGBE, "%s:ppcE %p", __func__, ep);
 	return;
 }
 
 static void
 process_conn_error(struct c4iw_ep *ep)
 {
 	struct c4iw_qp_attributes attrs = {0};
 	int ret;
 	int state;
 
 	state = ep->com.state;
 	CTR5(KTR_IW_CXGBE, "%s:pceB ep %p so %p so->so_error %u state %s",
 	    __func__, ep, ep->com.so, ep->com.so->so_error,
 	    states[ep->com.state]);
 
 	switch (state) {
 
 		case MPA_REQ_WAIT:
 			STOP_EP_TIMER(ep);
 			c4iw_put_ep(&ep->parent_ep->com);
 			break;
 
 		case MPA_REQ_SENT:
 			STOP_EP_TIMER(ep);
 			connect_reply_upcall(ep, -ECONNRESET);
 			break;
 
 		case MPA_REP_SENT:
 			ep->com.rpl_err = ECONNRESET;
 			CTR1(KTR_IW_CXGBE, "waking up ep %p", ep);
 			break;
 
 		case MPA_REQ_RCVD:
 			break;
 
 		case MORIBUND:
 		case CLOSING:
 			STOP_EP_TIMER(ep);
 			/*FALLTHROUGH*/
 		case FPDU_MODE:
 
 			if (ep->com.cm_id && ep->com.qp) {
 
 				attrs.next_state = C4IW_QP_STATE_ERROR;
 				ret = c4iw_modify_qp(ep->com.qp->rhp,
 					ep->com.qp, C4IW_QP_ATTR_NEXT_STATE,
 					&attrs, 1);
 				if (ret)
 					log(LOG_ERR,
 							"%s - qp <- error failed!\n",
 							__func__);
 			}
 			peer_abort_upcall(ep);
 			break;
 
 		case ABORTING:
 			break;
 
 		case DEAD:
 			CTR2(KTR_IW_CXGBE, "%s so_error %d IN DEAD STATE!!!!",
 			    __func__, ep->com.so->so_error);
 			return;
 
 		default:
 			panic("%s: ep %p state %d", __func__, ep, state);
 			break;
 	}
 
 	if (state != ABORTING) {
 		close_socket(ep->com.so);
 		ep->com.state = DEAD;
 		c4iw_put_ep(&ep->com);
 	}
 	CTR2(KTR_IW_CXGBE, "%s:pceE %p", __func__, ep);
 	return;
 }
 
 static void
 process_close_complete(struct c4iw_ep *ep)
 {
 	struct c4iw_qp_attributes attrs = {0};
 	int release = 0;
 
 	CTR4(KTR_IW_CXGBE, "%s:pccB ep %p so %p state %s", __func__, ep,
 	    ep->com.so, states[ep->com.state]);
 
 	/* The cm_id may be null if we failed to connect */
 	set_bit(CLOSE_CON_RPL, &ep->com.history);
 
 	switch (ep->com.state) {
 
 		case CLOSING:
 			CTR2(KTR_IW_CXGBE, "%s:pcc1 %p CLOSING MORIBUND",
 			    __func__, ep);
 			ep->com.state = MORIBUND;
 			break;
 
 		case MORIBUND:
 			CTR2(KTR_IW_CXGBE, "%s:pcc1 %p MORIBUND DEAD", __func__,
 			    ep);
 			STOP_EP_TIMER(ep);
 
 			if ((ep->com.cm_id) && (ep->com.qp)) {
 
 				CTR2(KTR_IW_CXGBE, "%s:pcc2 %p QP_STATE_IDLE",
 				    __func__, ep);
 				attrs.next_state = C4IW_QP_STATE_IDLE;
 				c4iw_modify_qp(ep->com.dev,
 						ep->com.qp,
 						C4IW_QP_ATTR_NEXT_STATE,
 						&attrs, 1);
 			}
 
 			close_socket(ep->com.so);
 			close_complete_upcall(ep, 0);
 			ep->com.state = DEAD;
 			release = 1;
 			break;
 
 		case ABORTING:
 			CTR2(KTR_IW_CXGBE, "%s:pcc5 %p ABORTING", __func__, ep);
 			break;
 
 		case DEAD:
 			CTR2(KTR_IW_CXGBE, "%s:pcc6 %p DEAD", __func__, ep);
 			break;
 		default:
 			CTR2(KTR_IW_CXGBE, "%s:pcc7 %p unknown ep state",
 					__func__, ep);
 			panic("%s:pcc6 %p unknown ep state", __func__, ep);
 			break;
 	}
 
 	if (release) {
 
 		CTR2(KTR_IW_CXGBE, "%s:pcc8 %p", __func__, ep);
 		release_ep_resources(ep);
 	}
 	CTR2(KTR_IW_CXGBE, "%s:pccE %p", __func__, ep);
 	return;
 }
 
 static void
 setiwsockopt(struct socket *so)
 {
 	int rc;
 	struct sockopt sopt;
 	int on = 1;
 
 	sopt.sopt_dir = SOPT_SET;
 	sopt.sopt_level = IPPROTO_TCP;
 	sopt.sopt_name = TCP_NODELAY;
 	sopt.sopt_val = (caddr_t)&on;
 	sopt.sopt_valsize = sizeof on;
 	sopt.sopt_td = NULL;
 	rc = -sosetopt(so, &sopt);
 	if (rc) {
 		log(LOG_ERR, "%s: can't set TCP_NODELAY on so %p (%d)\n",
 		    __func__, so, rc);
 	}
 }
 
 static void
 init_iwarp_socket(struct socket *so, void *arg)
 {
 	if (SOLISTENING(so)) {
 		SOLISTEN_LOCK(so);
 		solisten_upcall_set(so, c4iw_so_upcall, arg);
 		so->so_state |= SS_NBIO;
 		SOLISTEN_UNLOCK(so);
 	} else {
 		SOCKBUF_LOCK(&so->so_rcv);
 		soupcall_set(so, SO_RCV, c4iw_so_upcall, arg);
 		so->so_state |= SS_NBIO;
 		SOCKBUF_UNLOCK(&so->so_rcv);
 	}
 }
 
 static void
 uninit_iwarp_socket(struct socket *so)
 {
 	if (SOLISTENING(so)) {
 		SOLISTEN_LOCK(so);
 		solisten_upcall_set(so, NULL, NULL);
 		SOLISTEN_UNLOCK(so);
 	} else {
 		SOCKBUF_LOCK(&so->so_rcv);
 		soupcall_clear(so, SO_RCV);
 		SOCKBUF_UNLOCK(&so->so_rcv);
 	}
 }
 
 static void
 process_data(struct c4iw_ep *ep)
 {
 	int ret = 0;
 	int disconnect = 0;
 	struct c4iw_qp_attributes attrs = {0};
 
 	CTR5(KTR_IW_CXGBE, "%s: so %p, ep %p, state %s, sbused %d", __func__,
 	    ep->com.so, ep, states[ep->com.state], sbused(&ep->com.so->so_rcv));
 
 	switch (ep->com.state) {
 	case MPA_REQ_SENT:
 		disconnect = process_mpa_reply(ep);
 		break;
 	case MPA_REQ_WAIT:
 		disconnect = process_mpa_request(ep);
 		if (disconnect)
 			/* Refered in process_newconn() */
 			c4iw_put_ep(&ep->parent_ep->com);
 		break;
 	case FPDU_MODE:
 		MPASS(ep->com.qp != NULL);
 		attrs.next_state = C4IW_QP_STATE_TERMINATE;
 		ret = c4iw_modify_qp(ep->com.dev, ep->com.qp,
 					C4IW_QP_ATTR_NEXT_STATE, &attrs, 1);
 		if (ret != -EINPROGRESS)
 			disconnect = 1;
 		break;
 	default:
 		log(LOG_ERR, "%s: Unexpected streaming data. ep %p, "
 			    "state %d, so %p, so_state 0x%x, sbused %u\n",
 			    __func__, ep, ep->com.state, ep->com.so,
 			    ep->com.so->so_state, sbused(&ep->com.so->so_rcv));
 		break;
 	}
 	if (disconnect)
 		c4iw_ep_disconnect(ep, disconnect == 2, GFP_KERNEL);
 
 }
 
 static void
 process_connected(struct c4iw_ep *ep)
 {
 	struct socket *so = ep->com.so;
 
 	if ((so->so_state & SS_ISCONNECTED) && !so->so_error) {
 		if (send_mpa_req(ep))
 			goto err;
 	} else {
 		connect_reply_upcall(ep, -so->so_error);
 		goto err;
 	}
 	return;
 err:
 	close_socket(so);
 	ep->com.state = DEAD;
 	c4iw_put_ep(&ep->com);
 	return;
 }
 
 static inline int c4iw_zero_addr(struct sockaddr *addr)
 {
 	struct in6_addr *ip6;
 
 	if (addr->sa_family == AF_INET)
 		return IN_ZERONET(
 			ntohl(((struct sockaddr_in *)addr)->sin_addr.s_addr));
 	else {
 		ip6 = &((struct sockaddr_in6 *) addr)->sin6_addr;
 		return (ip6->s6_addr32[0] | ip6->s6_addr32[1] |
 				ip6->s6_addr32[2] | ip6->s6_addr32[3]) == 0;
 	}
 }
 
 static inline int c4iw_loopback_addr(struct sockaddr *addr)
 {
 	if (addr->sa_family == AF_INET)
 		return IN_LOOPBACK(
 			ntohl(((struct sockaddr_in *) addr)->sin_addr.s_addr));
 	else
 		return IN6_IS_ADDR_LOOPBACK(
 				&((struct sockaddr_in6 *) addr)->sin6_addr);
 }
 
 static inline int c4iw_any_addr(struct sockaddr *addr)
 {
 	return c4iw_zero_addr(addr) || c4iw_loopback_addr(addr);
 }
 
 static void
 process_newconn(struct c4iw_listen_ep *master_lep, struct socket *new_so)
 {
 	struct c4iw_listen_ep *real_lep = NULL;
 	struct c4iw_ep *new_ep = NULL;
 	struct sockaddr_in *remote = NULL;
 	int ret = 0;
 
 	MPASS(new_so != NULL);
 
 	if (c4iw_any_addr((struct sockaddr *)&master_lep->com.local_addr)) {
 		/* Here we need to find the 'real_lep' that belongs to the
 		 * incomming socket's network interface, such that the newly
 		 * created 'ep' can be attached to the real 'lep'.
 		 */
 		real_lep = find_real_listen_ep(master_lep, new_so);
 		if (real_lep == NULL) {
 			CTR2(KTR_IW_CXGBE, "%s: Could not find the real listen "
 					"ep for sock: %p", __func__, new_so);
 			log(LOG_ERR,"%s: Could not find the real listen ep for "
 					"sock: %p\n", __func__, new_so);
 			/* FIXME: properly free the 'new_so' in failure case.
 			 * Use of soabort() and  soclose() are not legal
 			 * here(before soaccept()).
 			 */
 			return;
 		}
 	} else /* for Non-Wildcard address, master_lep is always the real_lep */
 		real_lep = master_lep;
 
 	new_ep = alloc_ep(sizeof(*new_ep), GFP_KERNEL);
 
 	CTR6(KTR_IW_CXGBE, "%s: master_lep %p, real_lep: %p, new ep %p, "
 	    "listening so %p, new so %p", __func__, master_lep, real_lep,
 	    new_ep, master_lep->com.so, new_so);
 
 	new_ep->com.dev = real_lep->com.dev;
 	new_ep->com.so = new_so;
 	new_ep->com.cm_id = NULL;
 	new_ep->com.thread = real_lep->com.thread;
 	new_ep->parent_ep = real_lep;
 
 	GET_LOCAL_ADDR(&new_ep->com.local_addr, new_so);
 	GET_REMOTE_ADDR(&new_ep->com.remote_addr, new_so);
 	c4iw_get_ep(&real_lep->com);
 	init_timer(&new_ep->timer);
 	new_ep->com.state = MPA_REQ_WAIT;
 
 	setiwsockopt(new_so);
 	ret = soaccept(new_so, (struct sockaddr **)&remote);
 	if (ret != 0) {
 		CTR4(KTR_IW_CXGBE,
 				"%s:listen sock:%p, new sock:%p, ret:%d",
 				__func__, master_lep->com.so, new_so, ret);
 		if (remote != NULL)
 			free(remote, M_SONAME);
 		soclose(new_so);
 		c4iw_put_ep(&new_ep->com);
 		c4iw_put_ep(&real_lep->com);
 		return;
 	}
 	free(remote, M_SONAME);
 
 	START_EP_TIMER(new_ep);
 
 	/* MPA request might have been queued up on the socket already, so we
 	 * initialize the socket/upcall_handler under lock to prevent processing
 	 * MPA request on another thread(via process_req()) simultaniously.
 	 */
 	c4iw_get_ep(&new_ep->com); /* Dereferenced at the end below, this is to
 				      avoid freeing of ep before ep unlock. */
 	mutex_lock(&new_ep->com.mutex);
 	init_iwarp_socket(new_so, &new_ep->com);
 
 	ret = process_mpa_request(new_ep);
 	if (ret) {
 		/* ABORT */
 		c4iw_ep_disconnect(new_ep, 1, GFP_KERNEL);
 		c4iw_put_ep(&real_lep->com);
 	}
 	mutex_unlock(&new_ep->com.mutex);
 	c4iw_put_ep(&new_ep->com);
 	return;
 }
 
 static int
 add_ep_to_req_list(struct c4iw_ep *ep, int new_ep_event)
 {
 	unsigned long flag;
 
 	spin_lock_irqsave(&req_lock, flag);
 	if (ep && ep->com.so) {
 		ep->com.ep_events |= new_ep_event;
 		if (!ep->com.entry.tqe_prev) {
 			c4iw_get_ep(&ep->com);
 			TAILQ_INSERT_TAIL(&req_list, &ep->com, entry);
 			queue_work(c4iw_taskq, &c4iw_task);
 		}
 	}
 	spin_unlock_irqrestore(&req_lock, flag);
 
 	return (0);
 }
 
 static int
 c4iw_so_upcall(struct socket *so, void *arg, int waitflag)
 {
 	struct c4iw_ep *ep = arg;
 
 	CTR6(KTR_IW_CXGBE,
 	    "%s: so %p, so_state 0x%x, ep %p, ep_state %s, tqe_prev %p",
 	    __func__, so, so->so_state, ep, states[ep->com.state],
 	    ep->com.entry.tqe_prev);
 
 	MPASS(ep->com.so == so);
 	/*
 	 * Wake up any threads waiting in rdma_init()/rdma_fini(),
 	 * with locks held.
 	 */
 	if (so->so_error || (ep->com.dev->rdev.flags & T4_FATAL_ERROR))
 		c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET);
 	add_ep_to_req_list(ep, C4IW_EVENT_SOCKET);
 
 	return (SU_OK);
 }
 
 
 static int
 terminate(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_rdma_terminate *cpl = mtod(m, const void *);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct socket *so;
 	struct c4iw_ep *ep;
 
 	INP_WLOCK(toep->inp);
 	so = inp_inpcbtosocket(toep->inp);
 	ep = so->so_rcv.sb_upcallarg;
 	INP_WUNLOCK(toep->inp);
 
 	CTR3(KTR_IW_CXGBE, "%s: so %p, ep %p", __func__, so, ep);
 	add_ep_to_req_list(ep, C4IW_EVENT_TERM);
 
 	return 0;
 }
 
 static void
 process_socket_event(struct c4iw_ep *ep)
 {
 	int state = ep->com.state;
 	struct socket *so = ep->com.so;
 
 	if (ep->com.state == DEAD) {
 		CTR3(KTR_IW_CXGBE, "%s: Pending socket event discarded "
 			"ep %p ep_state %s", __func__, ep, states[state]); 
 		return;
 	}
 
 	CTR6(KTR_IW_CXGBE, "process_socket_event: so %p, so_state 0x%x, "
 	    "so_err %d, sb_state 0x%x, ep %p, ep_state %s", so, so->so_state,
 	    so->so_error, so->so_rcv.sb_state, ep, states[state]);
 
 	if (state == CONNECTING) {
 		process_connected(ep);
 		return;
 	}
 
 	if (state == LISTEN) {
 		struct c4iw_listen_ep *lep = (struct c4iw_listen_ep *)ep;
 		struct socket *listen_so = so, *new_so = NULL;
 		int error = 0;
 
 		SOLISTEN_LOCK(listen_so);
 		do {
 			error = solisten_dequeue(listen_so, &new_so,
 						SOCK_NONBLOCK);
 			if (error) {
 				CTR4(KTR_IW_CXGBE, "%s: lep %p listen_so %p "
 					"error %d", __func__, lep, listen_so,
 					error);
 				return;
 			}
 			process_newconn(lep, new_so);
 
 			/* solisten_dequeue() unlocks while return, so aquire
 			 * lock again for sol_qlen and also for next iteration.
 			 */
 			SOLISTEN_LOCK(listen_so);
 		} while (listen_so->sol_qlen);
 		SOLISTEN_UNLOCK(listen_so);
 
 		return;
 	}
 
 	/* connection error */
 	if (so->so_error) {
 		process_conn_error(ep);
 		return;
 	}
 
 	/* peer close */
 	if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) && state <= CLOSING) {
 		process_peer_close(ep);
 		/*
 		 * check whether socket disconnect event is pending before
 		 * returning. Fallthrough if yes.
 		 */
 		if (!(so->so_state & SS_ISDISCONNECTED))
 			return;
 	}
 
 	/* close complete */
 	if (so->so_state & SS_ISDISCONNECTED) {
 		process_close_complete(ep);
 		return;
 	}
 
 	/* rx data */
 	if (sbused(&ep->com.so->so_rcv)) {
 		process_data(ep);
 		return;
 	}
 
 	/* Socket events for 'MPA Request Received' and 'Close Complete'
 	 * were already processed earlier in their previous events handlers.
 	 * Hence, these socket events are skipped.
 	 * And any other socket events must have handled above.
 	 */
 	MPASS((ep->com.state == MPA_REQ_RCVD) || (ep->com.state == MORIBUND));
 
 	if ((ep->com.state != MPA_REQ_RCVD) && (ep->com.state != MORIBUND))
 		log(LOG_ERR, "%s: Unprocessed socket event so %p, "
 		"so_state 0x%x, so_err %d, sb_state 0x%x, ep %p, ep_state %s\n",
 		__func__, so, so->so_state, so->so_error, so->so_rcv.sb_state,
 			ep, states[state]);
 
 }
 
 SYSCTL_NODE(_hw, OID_AUTO, iw_cxgbe, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "iw_cxgbe driver parameters");
 
 static int dack_mode = 0;
 SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, dack_mode, CTLFLAG_RWTUN, &dack_mode, 0,
 		"Delayed ack mode (default = 0)");
 
 int c4iw_max_read_depth = 8;
 SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, c4iw_max_read_depth, CTLFLAG_RWTUN, &c4iw_max_read_depth, 0,
 		"Per-connection max ORD/IRD (default = 8)");
 
 static int enable_tcp_timestamps;
 SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, enable_tcp_timestamps, CTLFLAG_RWTUN, &enable_tcp_timestamps, 0,
 		"Enable tcp timestamps (default = 0)");
 
 static int enable_tcp_sack;
 SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, enable_tcp_sack, CTLFLAG_RWTUN, &enable_tcp_sack, 0,
 		"Enable tcp SACK (default = 0)");
 
 static int enable_tcp_window_scaling = 1;
 SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, enable_tcp_window_scaling, CTLFLAG_RWTUN, &enable_tcp_window_scaling, 0,
 		"Enable tcp window scaling (default = 1)");
 
 int c4iw_debug = 0;
 SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, c4iw_debug, CTLFLAG_RWTUN, &c4iw_debug, 0,
 		"Enable debug logging (default = 0)");
 
 static int peer2peer = 1;
 SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, peer2peer, CTLFLAG_RWTUN, &peer2peer, 0,
 		"Support peer2peer ULPs (default = 1)");
 
 static int p2p_type = FW_RI_INIT_P2PTYPE_READ_REQ;
 SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, p2p_type, CTLFLAG_RWTUN, &p2p_type, 0,
 		"RDMAP opcode to use for the RTR message: 1 = RDMA_READ 0 = RDMA_WRITE (default 1)");
 
 static int ep_timeout_secs = 60;
 SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, ep_timeout_secs, CTLFLAG_RWTUN, &ep_timeout_secs, 0,
 		"CM Endpoint operation timeout in seconds (default = 60)");
 
 static int mpa_rev = 1;
 SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, mpa_rev, CTLFLAG_RWTUN, &mpa_rev, 0,
 		"MPA Revision, 0 supports amso1100, 1 is RFC5044 spec compliant, 2 is IETF MPA Peer Connect Draft compliant (default = 1)");
 
 static int markers_enabled;
 SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, markers_enabled, CTLFLAG_RWTUN, &markers_enabled, 0,
 		"Enable MPA MARKERS (default(0) = disabled)");
 
 static int crc_enabled = 1;
 SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, crc_enabled, CTLFLAG_RWTUN, &crc_enabled, 0,
 		"Enable MPA CRC (default(1) = enabled)");
 
 static int rcv_win = 256 * 1024;
 SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, rcv_win, CTLFLAG_RWTUN, &rcv_win, 0,
 		"TCP receive window in bytes (default = 256KB)");
 
 static int snd_win = 128 * 1024;
 SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, snd_win, CTLFLAG_RWTUN, &snd_win, 0,
 		"TCP send window in bytes (default = 128KB)");
 
 int use_dsgl = 1;
 SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, use_dsgl, CTLFLAG_RWTUN, &use_dsgl, 0,
 		"Use DSGL for PBL/FastReg (default=1)");
 
 int inline_threshold = 128;
 SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, inline_threshold, CTLFLAG_RWTUN, &inline_threshold, 0,
 		"inline vs dsgl threshold (default=128)");
 
 static int reuseaddr = 0;
 SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, reuseaddr, CTLFLAG_RWTUN, &reuseaddr, 0,
 		"Enable SO_REUSEADDR & SO_REUSEPORT socket options on all iWARP client connections(default = 0)");
 
 static void
 start_ep_timer(struct c4iw_ep *ep)
 {
 
 	if (timer_pending(&ep->timer)) {
 		CTR2(KTR_IW_CXGBE, "%s: ep %p, already started", __func__, ep);
 		printk(KERN_ERR "%s timer already started! ep %p\n", __func__,
 		    ep);
 		return;
 	}
 	clear_bit(TIMEOUT, &ep->com.flags);
 	c4iw_get_ep(&ep->com);
 	ep->timer.expires = jiffies + ep_timeout_secs * HZ;
 	ep->timer.data = (unsigned long)ep;
 	ep->timer.function = ep_timeout;
 	add_timer(&ep->timer);
 }
 
 static int
 stop_ep_timer(struct c4iw_ep *ep)
 {
 
 	del_timer_sync(&ep->timer);
 	if (!test_and_set_bit(TIMEOUT, &ep->com.flags)) {
 		c4iw_put_ep(&ep->com);
 		return 0;
 	}
 	return 1;
 }
 
 static void *
 alloc_ep(int size, gfp_t gfp)
 {
 	struct c4iw_ep_common *epc;
 
 	epc = kzalloc(size, gfp);
 	if (epc == NULL)
 		return (NULL);
 
 	kref_init(&epc->kref);
 	mutex_init(&epc->mutex);
 	c4iw_init_wr_wait(&epc->wr_wait);
 
 	return (epc);
 }
 
 void _c4iw_free_ep(struct kref *kref)
 {
 	struct c4iw_ep *ep;
 	struct c4iw_ep_common *epc;
 
 	ep = container_of(kref, struct c4iw_ep, com.kref);
 	epc = &ep->com;
 	KASSERT(!epc->entry.tqe_prev, ("%s epc %p still on req list",
 	    __func__, epc));
 	if (test_bit(QP_REFERENCED, &ep->com.flags))
 		deref_qp(ep);
 	CTR4(KTR_IW_CXGBE, "%s: ep %p, history 0x%lx, flags 0x%lx",
 	    __func__, ep, epc->history, epc->flags);
 	kfree(ep);
 }
 
 static void release_ep_resources(struct c4iw_ep *ep)
 {
 	CTR2(KTR_IW_CXGBE, "%s:rerB %p", __func__, ep);
 	set_bit(RELEASE_RESOURCES, &ep->com.flags);
 	c4iw_put_ep(&ep->com);
 	CTR2(KTR_IW_CXGBE, "%s:rerE %p", __func__, ep);
 }
 
 static int
 send_mpa_req(struct c4iw_ep *ep)
 {
 	int mpalen;
 	struct mpa_message *mpa;
 	struct mpa_v2_conn_params mpa_v2_params;
 	struct mbuf *m;
 	char mpa_rev_to_use = mpa_rev;
 	int err = 0;
 
 	if (ep->retry_with_mpa_v1)
 		mpa_rev_to_use = 1;
 	mpalen = sizeof(*mpa) + ep->plen;
 	if (mpa_rev_to_use == 2)
 		mpalen += sizeof(struct mpa_v2_conn_params);
 
 	mpa = malloc(mpalen, M_CXGBE, M_NOWAIT);
 	if (mpa == NULL) {
 		err = -ENOMEM;
 		CTR3(KTR_IW_CXGBE, "%s:smr1 ep: %p , error: %d",
 				__func__, ep, err);
 		goto err;
 	}
 
 	memset(mpa, 0, mpalen);
 	memcpy(mpa->key, MPA_KEY_REQ, sizeof(mpa->key));
 	mpa->flags = (crc_enabled ? MPA_CRC : 0) |
 		(markers_enabled ? MPA_MARKERS : 0) |
 		(mpa_rev_to_use == 2 ? MPA_ENHANCED_RDMA_CONN : 0);
 	mpa->private_data_size = htons(ep->plen);
 	mpa->revision = mpa_rev_to_use;
 
 	if (mpa_rev_to_use == 1) {
 		ep->tried_with_mpa_v1 = 1;
 		ep->retry_with_mpa_v1 = 0;
 	}
 
 	if (mpa_rev_to_use == 2) {
 		mpa->private_data_size = htons(ntohs(mpa->private_data_size) +
 					    sizeof(struct mpa_v2_conn_params));
 		mpa_v2_params.ird = htons((u16)ep->ird);
 		mpa_v2_params.ord = htons((u16)ep->ord);
 
 		if (peer2peer) {
 			mpa_v2_params.ird |= htons(MPA_V2_PEER2PEER_MODEL);
 
 			if (p2p_type == FW_RI_INIT_P2PTYPE_RDMA_WRITE) {
 				mpa_v2_params.ord |=
 				    htons(MPA_V2_RDMA_WRITE_RTR);
 			} else if (p2p_type == FW_RI_INIT_P2PTYPE_READ_REQ) {
 				mpa_v2_params.ord |=
 					htons(MPA_V2_RDMA_READ_RTR);
 			}
 		}
 		memcpy(mpa->private_data, &mpa_v2_params,
 			sizeof(struct mpa_v2_conn_params));
 
 		if (ep->plen) {
 
 			memcpy(mpa->private_data +
 				sizeof(struct mpa_v2_conn_params),
 				ep->mpa_pkt + sizeof(*mpa), ep->plen);
 		}
 	} else {
 
 		if (ep->plen)
 			memcpy(mpa->private_data,
 					ep->mpa_pkt + sizeof(*mpa), ep->plen);
 		CTR2(KTR_IW_CXGBE, "%s:smr7 %p", __func__, ep);
 	}
 
 	m = m_getm(NULL, mpalen, M_NOWAIT, MT_DATA);
 	if (m == NULL) {
 		err = -ENOMEM;
 		CTR3(KTR_IW_CXGBE, "%s:smr2 ep: %p , error: %d",
 				__func__, ep, err);
 		free(mpa, M_CXGBE);
 		goto err;
 	}
 	m_copyback(m, 0, mpalen, (void *)mpa);
 	free(mpa, M_CXGBE);
 
 	err = -sosend(ep->com.so, NULL, NULL, m, NULL, MSG_DONTWAIT,
 			ep->com.thread);
 	if (err) {
 		CTR3(KTR_IW_CXGBE, "%s:smr3 ep: %p , error: %d",
 				__func__, ep, err);
 		goto err;
 	}
 
 	START_EP_TIMER(ep);
 	ep->com.state = MPA_REQ_SENT;
 	ep->mpa_attr.initiator = 1;
 	CTR3(KTR_IW_CXGBE, "%s:smrE %p, error: %d", __func__, ep, err);
 	return 0;
 err:
 	connect_reply_upcall(ep, err);
 	CTR3(KTR_IW_CXGBE, "%s:smrE %p, error: %d", __func__, ep, err);
 	return err;
 }
 
 static int send_mpa_reject(struct c4iw_ep *ep, const void *pdata, u8 plen)
 {
 	int mpalen ;
 	struct mpa_message *mpa;
 	struct mpa_v2_conn_params mpa_v2_params;
 	struct mbuf *m;
 	int err;
 
 	CTR4(KTR_IW_CXGBE, "%s:smrejB %p %u %d", __func__, ep, ep->hwtid,
 	    ep->plen);
 
 	mpalen = sizeof(*mpa) + plen;
 
 	if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) {
 
 		mpalen += sizeof(struct mpa_v2_conn_params);
 		CTR4(KTR_IW_CXGBE, "%s:smrej1 %p %u %d", __func__, ep,
 		    ep->mpa_attr.version, mpalen);
 	}
 
 	mpa = malloc(mpalen, M_CXGBE, M_NOWAIT);
 	if (mpa == NULL)
 		return (-ENOMEM);
 
 	memset(mpa, 0, mpalen);
 	memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key));
 	mpa->flags = MPA_REJECT;
 	mpa->revision = mpa_rev;
 	mpa->private_data_size = htons(plen);
 
 	if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) {
 
 		mpa->flags |= MPA_ENHANCED_RDMA_CONN;
 		mpa->private_data_size = htons(ntohs(mpa->private_data_size) +
 					    sizeof(struct mpa_v2_conn_params));
 		mpa_v2_params.ird = htons(((u16)ep->ird) |
 				(peer2peer ? MPA_V2_PEER2PEER_MODEL :
 				 0));
 		mpa_v2_params.ord = htons(((u16)ep->ord) | (peer2peer ?
 					(p2p_type ==
 					 FW_RI_INIT_P2PTYPE_RDMA_WRITE ?
 					 MPA_V2_RDMA_WRITE_RTR : p2p_type ==
 					 FW_RI_INIT_P2PTYPE_READ_REQ ?
 					 MPA_V2_RDMA_READ_RTR : 0) : 0));
 		memcpy(mpa->private_data, &mpa_v2_params,
 				sizeof(struct mpa_v2_conn_params));
 
 		if (ep->plen)
 			memcpy(mpa->private_data +
 				sizeof(struct mpa_v2_conn_params), pdata, plen);
 		CTR5(KTR_IW_CXGBE, "%s:smrej3 %p %d %d %d", __func__, ep,
 		    mpa_v2_params.ird, mpa_v2_params.ord, ep->plen);
 	} else
 		if (plen)
 			memcpy(mpa->private_data, pdata, plen);
 
 	m = m_getm(NULL, mpalen, M_NOWAIT, MT_DATA);
 	if (m == NULL) {
 		free(mpa, M_CXGBE);
 		return (-ENOMEM);
 	}
 	m_copyback(m, 0, mpalen, (void *)mpa);
 	free(mpa, M_CXGBE);
 
 	err = -sosend(ep->com.so, NULL, NULL, m, NULL, MSG_DONTWAIT, ep->com.thread);
 	if (!err)
 		ep->snd_seq += mpalen;
 	CTR4(KTR_IW_CXGBE, "%s:smrejE %p %u %d", __func__, ep, ep->hwtid, err);
 	return err;
 }
 
 static int send_mpa_reply(struct c4iw_ep *ep, const void *pdata, u8 plen)
 {
 	int mpalen;
 	struct mpa_message *mpa;
 	struct mbuf *m;
 	struct mpa_v2_conn_params mpa_v2_params;
 	int err;
 
 	CTR2(KTR_IW_CXGBE, "%s:smrepB %p", __func__, ep);
 
 	mpalen = sizeof(*mpa) + plen;
 
 	if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) {
 
 		CTR3(KTR_IW_CXGBE, "%s:smrep1 %p %d", __func__, ep,
 		    ep->mpa_attr.version);
 		mpalen += sizeof(struct mpa_v2_conn_params);
 	}
 
 	mpa = malloc(mpalen, M_CXGBE, M_NOWAIT);
 	if (mpa == NULL)
 		return (-ENOMEM);
 
 	memset(mpa, 0, sizeof(*mpa));
 	memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key));
 	mpa->flags = (ep->mpa_attr.crc_enabled ? MPA_CRC : 0) |
 		(markers_enabled ? MPA_MARKERS : 0);
 	mpa->revision = ep->mpa_attr.version;
 	mpa->private_data_size = htons(plen);
 
 	if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) {
 
 		mpa->flags |= MPA_ENHANCED_RDMA_CONN;
 		mpa->private_data_size +=
 			htons(sizeof(struct mpa_v2_conn_params));
 		mpa_v2_params.ird = htons((u16)ep->ird);
 		mpa_v2_params.ord = htons((u16)ep->ord);
 		CTR5(KTR_IW_CXGBE, "%s:smrep3 %p %d %d %d", __func__, ep,
 		    ep->mpa_attr.version, mpa_v2_params.ird, mpa_v2_params.ord);
 
 		if (peer2peer && (ep->mpa_attr.p2p_type !=
 			FW_RI_INIT_P2PTYPE_DISABLED)) {
 
 			mpa_v2_params.ird |= htons(MPA_V2_PEER2PEER_MODEL);
 
 			if (p2p_type == FW_RI_INIT_P2PTYPE_RDMA_WRITE) {
 
 				mpa_v2_params.ord |=
 					htons(MPA_V2_RDMA_WRITE_RTR);
 				CTR5(KTR_IW_CXGBE, "%s:smrep4 %p %d %d %d",
 				    __func__, ep, p2p_type, mpa_v2_params.ird,
 				    mpa_v2_params.ord);
 			}
 			else if (p2p_type == FW_RI_INIT_P2PTYPE_READ_REQ) {
 
 				mpa_v2_params.ord |=
 					htons(MPA_V2_RDMA_READ_RTR);
 				CTR5(KTR_IW_CXGBE, "%s:smrep5 %p %d %d %d",
 				    __func__, ep, p2p_type, mpa_v2_params.ird,
 				    mpa_v2_params.ord);
 			}
 		}
 
 		memcpy(mpa->private_data, &mpa_v2_params,
 			sizeof(struct mpa_v2_conn_params));
 
 		if (ep->plen)
 			memcpy(mpa->private_data +
 				sizeof(struct mpa_v2_conn_params), pdata, plen);
 	} else
 		if (plen)
 			memcpy(mpa->private_data, pdata, plen);
 
 	m = m_getm(NULL, mpalen, M_NOWAIT, MT_DATA);
 	if (m == NULL) {
 		free(mpa, M_CXGBE);
 		return (-ENOMEM);
 	}
 	m_copyback(m, 0, mpalen, (void *)mpa);
 	free(mpa, M_CXGBE);
 
 
 	ep->com.state = MPA_REP_SENT;
 	ep->snd_seq += mpalen;
 	err = -sosend(ep->com.so, NULL, NULL, m, NULL, MSG_DONTWAIT,
 			ep->com.thread);
 	CTR3(KTR_IW_CXGBE, "%s:smrepE %p %d", __func__, ep, err);
 	return err;
 }
 
 
 
 static void close_complete_upcall(struct c4iw_ep *ep, int status)
 {
 	struct iw_cm_event event;
 
 	CTR2(KTR_IW_CXGBE, "%s:ccuB %p", __func__, ep);
 	memset(&event, 0, sizeof(event));
 	event.event = IW_CM_EVENT_CLOSE;
 	event.status = status;
 
 	if (ep->com.cm_id) {
 
 		CTR2(KTR_IW_CXGBE, "%s:ccu1 %1", __func__, ep);
 		ep->com.cm_id->event_handler(ep->com.cm_id, &event);
 		deref_cm_id(&ep->com);
 		set_bit(CLOSE_UPCALL, &ep->com.history);
 	}
 	CTR2(KTR_IW_CXGBE, "%s:ccuE %p", __func__, ep);
 }
 
 static int
 send_abort(struct c4iw_ep *ep)
 {
 	struct socket *so = ep->com.so;
 	struct sockopt sopt;
 	int rc;
 	struct linger l;
 
 	CTR5(KTR_IW_CXGBE, "%s ep %p so %p state %s tid %d", __func__, ep, so,
 	    states[ep->com.state], ep->hwtid);
 
 	l.l_onoff = 1;
 	l.l_linger = 0;
 
 	/* linger_time of 0 forces RST to be sent */
 	sopt.sopt_dir = SOPT_SET;
 	sopt.sopt_level = SOL_SOCKET;
 	sopt.sopt_name = SO_LINGER;
 	sopt.sopt_val = (caddr_t)&l;
 	sopt.sopt_valsize = sizeof l;
 	sopt.sopt_td = NULL;
 	rc = -sosetopt(so, &sopt);
 	if (rc != 0) {
 		log(LOG_ERR, "%s: sosetopt(%p, linger = 0) failed with %d.\n",
 		    __func__, so, rc);
 	}
 
 	uninit_iwarp_socket(so);
 	soclose(so);
 	set_bit(ABORT_CONN, &ep->com.history);
 
 	/*
 	 * TBD: iw_cxgbe driver should receive ABORT reply for every ABORT
 	 * request it has sent. But the current TOE driver is not propagating
 	 * this ABORT reply event (via do_abort_rpl) to iw_cxgbe. So as a work-
 	 * around de-refererece 'ep' here instead of doing it in abort_rpl()
 	 * handler(not yet implemented) of iw_cxgbe driver.
 	 */
 	release_ep_resources(ep);
 	ep->com.state = DEAD;
 
 	return (0);
 }
 
 static void peer_close_upcall(struct c4iw_ep *ep)
 {
 	struct iw_cm_event event;
 
 	CTR2(KTR_IW_CXGBE, "%s:pcuB %p", __func__, ep);
 	memset(&event, 0, sizeof(event));
 	event.event = IW_CM_EVENT_DISCONNECT;
 
 	if (ep->com.cm_id) {
 
 		CTR2(KTR_IW_CXGBE, "%s:pcu1 %p", __func__, ep);
 		ep->com.cm_id->event_handler(ep->com.cm_id, &event);
 		set_bit(DISCONN_UPCALL, &ep->com.history);
 	}
 	CTR2(KTR_IW_CXGBE, "%s:pcuE %p", __func__, ep);
 }
 
 static void peer_abort_upcall(struct c4iw_ep *ep)
 {
 	struct iw_cm_event event;
 
 	CTR2(KTR_IW_CXGBE, "%s:pauB %p", __func__, ep);
 	memset(&event, 0, sizeof(event));
 	event.event = IW_CM_EVENT_CLOSE;
 	event.status = -ECONNRESET;
 
 	if (ep->com.cm_id) {
 
 		CTR2(KTR_IW_CXGBE, "%s:pau1 %p", __func__, ep);
 		ep->com.cm_id->event_handler(ep->com.cm_id, &event);
 		deref_cm_id(&ep->com);
 		set_bit(ABORT_UPCALL, &ep->com.history);
 	}
 	CTR2(KTR_IW_CXGBE, "%s:pauE %p", __func__, ep);
 }
 
 static void connect_reply_upcall(struct c4iw_ep *ep, int status)
 {
 	struct iw_cm_event event;
 
 	CTR3(KTR_IW_CXGBE, "%s:cruB %p, status: %d", __func__, ep, status);
 	memset(&event, 0, sizeof(event));
 	event.event = IW_CM_EVENT_CONNECT_REPLY;
 	event.status = ((status == -ECONNABORTED) || (status == -EPIPE)) ?
 					-ECONNRESET : status;
 	event.local_addr = ep->com.local_addr;
 	event.remote_addr = ep->com.remote_addr;
 
 	if ((status == 0) || (status == -ECONNREFUSED)) {
 
 		if (!ep->tried_with_mpa_v1) {
 
 			CTR2(KTR_IW_CXGBE, "%s:cru1 %p", __func__, ep);
 			/* this means MPA_v2 is used */
 			event.ord = ep->ird;
 			event.ird = ep->ord;
 			event.private_data_len = ep->plen -
 				sizeof(struct mpa_v2_conn_params);
 			event.private_data = ep->mpa_pkt +
 				sizeof(struct mpa_message) +
 				sizeof(struct mpa_v2_conn_params);
 		} else {
 
 			CTR2(KTR_IW_CXGBE, "%s:cru2 %p", __func__, ep);
 			/* this means MPA_v1 is used */
 			event.ord = c4iw_max_read_depth;
 			event.ird = c4iw_max_read_depth;
 			event.private_data_len = ep->plen;
 			event.private_data = ep->mpa_pkt +
 				sizeof(struct mpa_message);
 		}
 	}
 
 	if (ep->com.cm_id) {
 
 		CTR2(KTR_IW_CXGBE, "%s:cru3 %p", __func__, ep);
 		set_bit(CONN_RPL_UPCALL, &ep->com.history);
 		ep->com.cm_id->event_handler(ep->com.cm_id, &event);
 	}
 
 	if(status == -ECONNABORTED) {
 
 		CTR3(KTR_IW_CXGBE, "%s:cruE %p %d", __func__, ep, status);
 		return;
 	}
 
 	if (status < 0) {
 
 		CTR3(KTR_IW_CXGBE, "%s:cru4 %p %d", __func__, ep, status);
 		deref_cm_id(&ep->com);
 	}
 
 	CTR2(KTR_IW_CXGBE, "%s:cruE %p", __func__, ep);
 }
 
 static int connect_request_upcall(struct c4iw_ep *ep)
 {
 	struct iw_cm_event event;
 	int ret;
 
 	CTR3(KTR_IW_CXGBE, "%s: ep %p, mpa_v1 %d", __func__, ep,
 	    ep->tried_with_mpa_v1);
 
 	memset(&event, 0, sizeof(event));
 	event.event = IW_CM_EVENT_CONNECT_REQUEST;
 	event.local_addr = ep->com.local_addr;
 	event.remote_addr = ep->com.remote_addr;
 	event.provider_data = ep;
 
 	if (!ep->tried_with_mpa_v1) {
 		/* this means MPA_v2 is used */
 		event.ord = ep->ord;
 		event.ird = ep->ird;
 		event.private_data_len = ep->plen -
 			sizeof(struct mpa_v2_conn_params);
 		event.private_data = ep->mpa_pkt + sizeof(struct mpa_message) +
 			sizeof(struct mpa_v2_conn_params);
 	} else {
 
 		/* this means MPA_v1 is used. Send max supported */
 		event.ord = c4iw_max_read_depth;
 		event.ird = c4iw_max_read_depth;
 		event.private_data_len = ep->plen;
 		event.private_data = ep->mpa_pkt + sizeof(struct mpa_message);
 	}
 
 	c4iw_get_ep(&ep->com);
 	ret = ep->parent_ep->com.cm_id->event_handler(ep->parent_ep->com.cm_id,
 	    &event);
 	if(ret) {
 		CTR3(KTR_IW_CXGBE, "%s: ep %p, Failure while notifying event to"
 			" IWCM, err:%d", __func__, ep, ret);
 		c4iw_put_ep(&ep->com);
 	} else
 		/* Dereference parent_ep only in success case.
 		 * In case of failure, parent_ep is dereferenced by the caller
 		 * of process_mpa_request().
 		 */
 		c4iw_put_ep(&ep->parent_ep->com);
 
 	set_bit(CONNREQ_UPCALL, &ep->com.history);
 	return ret;
 }
 
 static void established_upcall(struct c4iw_ep *ep)
 {
 	struct iw_cm_event event;
 
 	CTR2(KTR_IW_CXGBE, "%s:euB %p", __func__, ep);
 	memset(&event, 0, sizeof(event));
 	event.event = IW_CM_EVENT_ESTABLISHED;
 	event.ird = ep->ord;
 	event.ord = ep->ird;
 
 	if (ep->com.cm_id) {
 
 		CTR2(KTR_IW_CXGBE, "%s:eu1 %p", __func__, ep);
 		ep->com.cm_id->event_handler(ep->com.cm_id, &event);
 		set_bit(ESTAB_UPCALL, &ep->com.history);
 	}
 	CTR2(KTR_IW_CXGBE, "%s:euE %p", __func__, ep);
 }
 
 
 #define RELAXED_IRD_NEGOTIATION 1
 
 /*
  * process_mpa_reply - process streaming mode MPA reply
  *
  * Returns:
  *
  * 0 upon success indicating a connect request was delivered to the ULP
  * or the mpa request is incomplete but valid so far.
  *
  * 1 if a failure requires the caller to close the connection.
  *
  * 2 if a failure requires the caller to abort the connection.
  */
 static int process_mpa_reply(struct c4iw_ep *ep)
 {
 	struct mpa_message *mpa;
 	struct mpa_v2_conn_params *mpa_v2_params;
 	u16 plen;
 	u16 resp_ird, resp_ord;
 	u8 rtr_mismatch = 0, insuff_ird = 0;
 	struct c4iw_qp_attributes attrs = {0};
 	enum c4iw_qp_attr_mask mask;
 	int err;
 	struct mbuf *top, *m;
 	int flags = MSG_DONTWAIT;
 	struct uio uio;
 	int disconnect = 0;
 
 	CTR2(KTR_IW_CXGBE, "%s:pmrB %p", __func__, ep);
 
 	/*
 	 * Stop mpa timer.  If it expired, then
 	 * we ignore the MPA reply.  process_timeout()
 	 * will abort the connection.
 	 */
 	if (STOP_EP_TIMER(ep))
 		return 0;
 
 	uio.uio_resid = 1000000;
 	uio.uio_td = ep->com.thread;
 	err = soreceive(ep->com.so, NULL, &uio, &top, NULL, &flags);
 
 	if (err) {
 
 		if (err == EWOULDBLOCK) {
 
 			CTR2(KTR_IW_CXGBE, "%s:pmr1 %p", __func__, ep);
 			START_EP_TIMER(ep);
 			return 0;
 		}
 		err = -err;
 		CTR2(KTR_IW_CXGBE, "%s:pmr2 %p", __func__, ep);
 		goto err;
 	}
 
 	if (ep->com.so->so_rcv.sb_mb) {
 
 		CTR2(KTR_IW_CXGBE, "%s:pmr3 %p", __func__, ep);
 		printf("%s data after soreceive called! so %p sb_mb %p top %p\n",
 		       __func__, ep->com.so, ep->com.so->so_rcv.sb_mb, top);
 	}
 
 	m = top;
 
 	do {
 
 		CTR2(KTR_IW_CXGBE, "%s:pmr4 %p", __func__, ep);
 		/*
 		 * If we get more than the supported amount of private data
 		 * then we must fail this connection.
 		 */
 		if (ep->mpa_pkt_len + m->m_len > sizeof(ep->mpa_pkt)) {
 
 			CTR3(KTR_IW_CXGBE, "%s:pmr5 %p %d", __func__, ep,
 			    ep->mpa_pkt_len + m->m_len);
 			err = (-EINVAL);
 			goto err_stop_timer;
 		}
 
 		/*
 		 * copy the new data into our accumulation buffer.
 		 */
 		m_copydata(m, 0, m->m_len, &(ep->mpa_pkt[ep->mpa_pkt_len]));
 		ep->mpa_pkt_len += m->m_len;
 		if (!m->m_next)
 			m = m->m_nextpkt;
 		else
 			m = m->m_next;
 	} while (m);
 
 	m_freem(top);
 	/*
 	 * if we don't even have the mpa message, then bail.
 	 */
 	if (ep->mpa_pkt_len < sizeof(*mpa)) {
 		return 0;
 	}
 	mpa = (struct mpa_message *) ep->mpa_pkt;
 
 	/* Validate MPA header. */
 	if (mpa->revision > mpa_rev) {
 
 		CTR4(KTR_IW_CXGBE, "%s:pmr6 %p %d %d", __func__, ep,
 		    mpa->revision, mpa_rev);
 		printk(KERN_ERR MOD "%s MPA version mismatch. Local = %d, "
 				" Received = %d\n", __func__, mpa_rev, mpa->revision);
 		err = -EPROTO;
 		goto err_stop_timer;
 	}
 
 	if (memcmp(mpa->key, MPA_KEY_REP, sizeof(mpa->key))) {
 
 		CTR2(KTR_IW_CXGBE, "%s:pmr7 %p", __func__, ep);
 		err = -EPROTO;
 		goto err_stop_timer;
 	}
 
 	plen = ntohs(mpa->private_data_size);
 
 	/*
 	 * Fail if there's too much private data.
 	 */
 	if (plen > MPA_MAX_PRIVATE_DATA) {
 
 		CTR2(KTR_IW_CXGBE, "%s:pmr8 %p", __func__, ep);
 		err = -EPROTO;
 		goto err_stop_timer;
 	}
 
 	/*
 	 * If plen does not account for pkt size
 	 */
 	if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) {
 
 		CTR2(KTR_IW_CXGBE, "%s:pmr9 %p", __func__, ep);
 		STOP_EP_TIMER(ep);
 		err = -EPROTO;
 		goto err_stop_timer;
 	}
 
 	ep->plen = (u8) plen;
 
 	/*
 	 * If we don't have all the pdata yet, then bail.
 	 * We'll continue process when more data arrives.
 	 */
 	if (ep->mpa_pkt_len < (sizeof(*mpa) + plen)) {
 
 		CTR2(KTR_IW_CXGBE, "%s:pmra %p", __func__, ep);
 		return 0;
 	}
 
 	if (mpa->flags & MPA_REJECT) {
 
 		CTR2(KTR_IW_CXGBE, "%s:pmrb %p", __func__, ep);
 		err = -ECONNREFUSED;
 		goto err_stop_timer;
 	}
 
 	/*
 	 * If we get here we have accumulated the entire mpa
 	 * start reply message including private data. And
 	 * the MPA header is valid.
 	 */
 	ep->com.state = FPDU_MODE;
 	ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0;
 	ep->mpa_attr.recv_marker_enabled = markers_enabled;
 	ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0;
 	ep->mpa_attr.version = mpa->revision;
 	ep->mpa_attr.p2p_type = FW_RI_INIT_P2PTYPE_DISABLED;
 
 	if (mpa->revision == 2) {
 
 		CTR2(KTR_IW_CXGBE, "%s:pmrc %p", __func__, ep);
 		ep->mpa_attr.enhanced_rdma_conn =
 			mpa->flags & MPA_ENHANCED_RDMA_CONN ? 1 : 0;
 
 		if (ep->mpa_attr.enhanced_rdma_conn) {
 
 			CTR2(KTR_IW_CXGBE, "%s:pmrd %p", __func__, ep);
 			mpa_v2_params = (struct mpa_v2_conn_params *)
 				(ep->mpa_pkt + sizeof(*mpa));
 			resp_ird = ntohs(mpa_v2_params->ird) &
 				MPA_V2_IRD_ORD_MASK;
 			resp_ord = ntohs(mpa_v2_params->ord) &
 				MPA_V2_IRD_ORD_MASK;
 
 			/*
 			 * This is a double-check. Ideally, below checks are
 			 * not required since ird/ord stuff has been taken
 			 * care of in c4iw_accept_cr
 			 */
 			if (ep->ird < resp_ord) {
 				if (RELAXED_IRD_NEGOTIATION && resp_ord <=
 				   ep->com.dev->rdev.adap->params.max_ordird_qp)
 					ep->ird = resp_ord;
 				else
 					insuff_ird = 1;
 			} else if (ep->ird > resp_ord) {
 				ep->ird = resp_ord;
 			}
 			if (ep->ord > resp_ird) {
 				if (RELAXED_IRD_NEGOTIATION)
 					ep->ord = resp_ird;
 				else
 					insuff_ird = 1;
 			}
 			if (insuff_ird) {
 				err = -ENOMEM;
 				ep->ird = resp_ord;
 				ep->ord = resp_ird;
 			}
 
 			if (ntohs(mpa_v2_params->ird) &
 				MPA_V2_PEER2PEER_MODEL) {
 
 				CTR2(KTR_IW_CXGBE, "%s:pmrf %p", __func__, ep);
 				if (ntohs(mpa_v2_params->ord) &
 					MPA_V2_RDMA_WRITE_RTR) {
 
 					CTR2(KTR_IW_CXGBE, "%s:pmrg %p", __func__, ep);
 					ep->mpa_attr.p2p_type =
 						FW_RI_INIT_P2PTYPE_RDMA_WRITE;
 				}
 				else if (ntohs(mpa_v2_params->ord) &
 					MPA_V2_RDMA_READ_RTR) {
 
 					CTR2(KTR_IW_CXGBE, "%s:pmrh %p", __func__, ep);
 					ep->mpa_attr.p2p_type =
 						FW_RI_INIT_P2PTYPE_READ_REQ;
 				}
 			}
 		}
 	} else {
 
 		CTR2(KTR_IW_CXGBE, "%s:pmri %p", __func__, ep);
 
 		if (mpa->revision == 1) {
 
 			CTR2(KTR_IW_CXGBE, "%s:pmrj %p", __func__, ep);
 
 			if (peer2peer) {
 
 				CTR2(KTR_IW_CXGBE, "%s:pmrk %p", __func__, ep);
 				ep->mpa_attr.p2p_type = p2p_type;
 			}
 		}
 	}
 
 	if (set_tcpinfo(ep)) {
 
 		CTR2(KTR_IW_CXGBE, "%s:pmrl %p", __func__, ep);
 		printf("%s set_tcpinfo error\n", __func__);
 		err = -ECONNRESET;
 		goto err;
 	}
 
 	CTR6(KTR_IW_CXGBE, "%s - crc_enabled = %d, recv_marker_enabled = %d, "
 	    "xmit_marker_enabled = %d, version = %d p2p_type = %d", __func__,
 	    ep->mpa_attr.crc_enabled, ep->mpa_attr.recv_marker_enabled,
 	    ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version,
 	    ep->mpa_attr.p2p_type);
 
 	/*
 	 * If responder's RTR does not match with that of initiator, assign
 	 * FW_RI_INIT_P2PTYPE_DISABLED in mpa attributes so that RTR is not
 	 * generated when moving QP to RTS state.
 	 * A TERM message will be sent after QP has moved to RTS state
 	 */
 	if ((ep->mpa_attr.version == 2) && peer2peer &&
 		(ep->mpa_attr.p2p_type != p2p_type)) {
 
 		CTR2(KTR_IW_CXGBE, "%s:pmrm %p", __func__, ep);
 		ep->mpa_attr.p2p_type = FW_RI_INIT_P2PTYPE_DISABLED;
 		rtr_mismatch = 1;
 	}
 
 
 	//ep->ofld_txq = TOEPCB(ep->com.so)->ofld_txq;
 	attrs.mpa_attr = ep->mpa_attr;
 	attrs.max_ird = ep->ird;
 	attrs.max_ord = ep->ord;
 	attrs.llp_stream_handle = ep;
 	attrs.next_state = C4IW_QP_STATE_RTS;
 
 	mask = C4IW_QP_ATTR_NEXT_STATE |
 		C4IW_QP_ATTR_LLP_STREAM_HANDLE | C4IW_QP_ATTR_MPA_ATTR |
 		C4IW_QP_ATTR_MAX_IRD | C4IW_QP_ATTR_MAX_ORD;
 
 	/* bind QP and TID with INIT_WR */
 	err = c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, mask, &attrs, 1);
 
 	if (err) {
 
 		CTR2(KTR_IW_CXGBE, "%s:pmrn %p", __func__, ep);
 		goto err;
 	}
 
 	/*
 	 * If responder's RTR requirement did not match with what initiator
 	 * supports, generate TERM message
 	 */
 	if (rtr_mismatch) {
 
 		CTR2(KTR_IW_CXGBE, "%s:pmro %p", __func__, ep);
 		printk(KERN_ERR "%s: RTR mismatch, sending TERM\n", __func__);
 		attrs.layer_etype = LAYER_MPA | DDP_LLP;
 		attrs.ecode = MPA_NOMATCH_RTR;
 		attrs.next_state = C4IW_QP_STATE_TERMINATE;
 		attrs.send_term = 1;
 		err = c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp,
 			C4IW_QP_ATTR_NEXT_STATE, &attrs, 1);
 		err = -ENOMEM;
 		disconnect = 1;
 		goto out;
 	}
 
 	/*
 	 * Generate TERM if initiator IRD is not sufficient for responder
 	 * provided ORD. Currently, we do the same behaviour even when
 	 * responder provided IRD is also not sufficient as regards to
 	 * initiator ORD.
 	 */
 	if (insuff_ird) {
 
 		CTR2(KTR_IW_CXGBE, "%s:pmrp %p", __func__, ep);
 		printk(KERN_ERR "%s: Insufficient IRD, sending TERM\n",
 				__func__);
 		attrs.layer_etype = LAYER_MPA | DDP_LLP;
 		attrs.ecode = MPA_INSUFF_IRD;
 		attrs.next_state = C4IW_QP_STATE_TERMINATE;
 		attrs.send_term = 1;
 		err = c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp,
 			C4IW_QP_ATTR_NEXT_STATE, &attrs, 1);
 		err = -ENOMEM;
 		disconnect = 1;
 		goto out;
 	}
 	goto out;
 err_stop_timer:
 	STOP_EP_TIMER(ep);
 err:
 	disconnect = 2;
 out:
 	connect_reply_upcall(ep, err);
 	CTR2(KTR_IW_CXGBE, "%s:pmrE %p", __func__, ep);
 	return disconnect;
 }
 
 /*
  * process_mpa_request - process streaming mode MPA request
  *
  * Returns:
  *
  * 0 upon success indicating a connect request was delivered to the ULP
  * or the mpa request is incomplete but valid so far.
  *
  * 1 if a failure requires the caller to close the connection.
  *
  * 2 if a failure requires the caller to abort the connection.
  */
 static int
 process_mpa_request(struct c4iw_ep *ep)
 {
 	struct mpa_message *mpa;
 	struct mpa_v2_conn_params *mpa_v2_params;
 	u16 plen;
 	int flags = MSG_DONTWAIT;
 	int rc;
 	struct iovec iov;
 	struct uio uio;
 	enum c4iw_ep_state state = ep->com.state;
 
 	CTR3(KTR_IW_CXGBE, "%s: ep %p, state %s", __func__, ep, states[state]);
 
 	if (state != MPA_REQ_WAIT)
 		return 0;
 
 	iov.iov_base = &ep->mpa_pkt[ep->mpa_pkt_len];
 	iov.iov_len = sizeof(ep->mpa_pkt) - ep->mpa_pkt_len;
 	uio.uio_iov = &iov;
 	uio.uio_iovcnt = 1;
 	uio.uio_offset = 0;
 	uio.uio_resid = sizeof(ep->mpa_pkt) - ep->mpa_pkt_len;
 	uio.uio_segflg = UIO_SYSSPACE;
 	uio.uio_rw = UIO_READ;
 	uio.uio_td = NULL; /* uio.uio_td = ep->com.thread; */
 
 	rc = soreceive(ep->com.so, NULL, &uio, NULL, NULL, &flags);
 	if (rc == EAGAIN)
 		return 0;
 	else if (rc)
 		goto err_stop_timer;
 
 	KASSERT(uio.uio_offset > 0, ("%s: sorecieve on so %p read no data",
 	    __func__, ep->com.so));
 	ep->mpa_pkt_len += uio.uio_offset;
 
 	/*
 	 * If we get more than the supported amount of private data then we must
 	 * fail this connection.  XXX: check so_rcv->sb_cc, or peek with another
 	 * soreceive, or increase the size of mpa_pkt by 1 and abort if the last
 	 * byte is filled by the soreceive above.
 	 */
 
 	/* Don't even have the MPA message.  Wait for more data to arrive. */
 	if (ep->mpa_pkt_len < sizeof(*mpa))
 		return 0;
 	mpa = (struct mpa_message *) ep->mpa_pkt;
 
 	/*
 	 * Validate MPA Header.
 	 */
 	if (mpa->revision > mpa_rev) {
 		log(LOG_ERR, "%s: MPA version mismatch. Local = %d,"
 		    " Received = %d\n", __func__, mpa_rev, mpa->revision);
 		goto err_stop_timer;
 	}
 
 	if (memcmp(mpa->key, MPA_KEY_REQ, sizeof(mpa->key)))
 		goto err_stop_timer;
 
 	/*
 	 * Fail if there's too much private data.
 	 */
 	plen = ntohs(mpa->private_data_size);
 	if (plen > MPA_MAX_PRIVATE_DATA)
 		goto err_stop_timer;
 
 	/*
 	 * If plen does not account for pkt size
 	 */
 	if (ep->mpa_pkt_len > (sizeof(*mpa) + plen))
 		goto err_stop_timer;
 
 	ep->plen = (u8) plen;
 
 	/*
 	 * If we don't have all the pdata yet, then bail.
 	 */
 	if (ep->mpa_pkt_len < (sizeof(*mpa) + plen))
 		return 0;
 
 	/*
 	 * If we get here we have accumulated the entire mpa
 	 * start reply message including private data.
 	 */
 	ep->mpa_attr.initiator = 0;
 	ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0;
 	ep->mpa_attr.recv_marker_enabled = markers_enabled;
 	ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0;
 	ep->mpa_attr.version = mpa->revision;
 	if (mpa->revision == 1)
 		ep->tried_with_mpa_v1 = 1;
 	ep->mpa_attr.p2p_type = FW_RI_INIT_P2PTYPE_DISABLED;
 
 	if (mpa->revision == 2) {
 		ep->mpa_attr.enhanced_rdma_conn =
 		    mpa->flags & MPA_ENHANCED_RDMA_CONN ? 1 : 0;
 		if (ep->mpa_attr.enhanced_rdma_conn) {
 			mpa_v2_params = (struct mpa_v2_conn_params *)
 				(ep->mpa_pkt + sizeof(*mpa));
 			ep->ird = ntohs(mpa_v2_params->ird) &
 				MPA_V2_IRD_ORD_MASK;
 			ep->ird = min_t(u32, ep->ird,
 					cur_max_read_depth(ep->com.dev));
 			ep->ord = ntohs(mpa_v2_params->ord) &
 				MPA_V2_IRD_ORD_MASK;
 			ep->ord = min_t(u32, ep->ord,
 					cur_max_read_depth(ep->com.dev));
 			CTR3(KTR_IW_CXGBE, "%s initiator ird %u ord %u",
 				 __func__, ep->ird, ep->ord);
 			if (ntohs(mpa_v2_params->ird) & MPA_V2_PEER2PEER_MODEL)
 				if (peer2peer) {
 					if (ntohs(mpa_v2_params->ord) &
 							MPA_V2_RDMA_WRITE_RTR)
 						ep->mpa_attr.p2p_type =
 						FW_RI_INIT_P2PTYPE_RDMA_WRITE;
 					else if (ntohs(mpa_v2_params->ord) &
 							MPA_V2_RDMA_READ_RTR)
 						ep->mpa_attr.p2p_type =
 						FW_RI_INIT_P2PTYPE_READ_REQ;
 				}
 		}
 	} else if (mpa->revision == 1 && peer2peer)
 		ep->mpa_attr.p2p_type = p2p_type;
 
 	if (set_tcpinfo(ep))
 		goto err_stop_timer;
 
 	CTR5(KTR_IW_CXGBE, "%s: crc_enabled = %d, recv_marker_enabled = %d, "
 	    "xmit_marker_enabled = %d, version = %d", __func__,
 	    ep->mpa_attr.crc_enabled, ep->mpa_attr.recv_marker_enabled,
 	    ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version);
 
 	ep->com.state = MPA_REQ_RCVD;
 	STOP_EP_TIMER(ep);
 
 	/* drive upcall */
 	if (ep->parent_ep->com.state != DEAD)
 		if (connect_request_upcall(ep))
 			goto err_out;
 	return 0;
 
 err_stop_timer:
 	STOP_EP_TIMER(ep);
 err_out:
 	return 2;
 }
 
 /*
  * Upcall from the adapter indicating data has been transmitted.
  * For us its just the single MPA request or reply.  We can now free
  * the skb holding the mpa message.
  */
 int c4iw_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len)
 {
 	int err;
 	struct c4iw_ep *ep = to_ep(cm_id);
 	int abort = 0;
 
 	mutex_lock(&ep->com.mutex);
 	CTR2(KTR_IW_CXGBE, "%s:crcB %p", __func__, ep);
 
 	if ((ep->com.state == DEAD) ||
 			(ep->com.state != MPA_REQ_RCVD)) {
 
 		CTR2(KTR_IW_CXGBE, "%s:crc1 %p", __func__, ep);
 		mutex_unlock(&ep->com.mutex);
 		c4iw_put_ep(&ep->com);
 		return -ECONNRESET;
 	}
 	set_bit(ULP_REJECT, &ep->com.history);
 
 	if (mpa_rev == 0) {
 
 		CTR2(KTR_IW_CXGBE, "%s:crc2 %p", __func__, ep);
 		abort = 1;
 	}
 	else {
 
 		CTR2(KTR_IW_CXGBE, "%s:crc3 %p", __func__, ep);
 		abort = send_mpa_reject(ep, pdata, pdata_len);
 	}
 	STOP_EP_TIMER(ep);
 	err = c4iw_ep_disconnect(ep, abort != 0, GFP_KERNEL);
 	mutex_unlock(&ep->com.mutex);
 	c4iw_put_ep(&ep->com);
 	CTR3(KTR_IW_CXGBE, "%s:crc4 %p, err: %d", __func__, ep, err);
 	return 0;
 }
 
 int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
 {
 	int err;
 	struct c4iw_qp_attributes attrs = {0};
 	enum c4iw_qp_attr_mask mask;
 	struct c4iw_ep *ep = to_ep(cm_id);
 	struct c4iw_dev *h = to_c4iw_dev(cm_id->device);
 	struct c4iw_qp *qp = get_qhp(h, conn_param->qpn);
 	int abort = 0;
 
 	mutex_lock(&ep->com.mutex);
 	CTR2(KTR_IW_CXGBE, "%s:cacB %p", __func__, ep);
 
 	if ((ep->com.state == DEAD) ||
 			(ep->com.state != MPA_REQ_RCVD)) {
 
 		CTR2(KTR_IW_CXGBE, "%s:cac1 %p", __func__, ep);
 		err = -ECONNRESET;
 		goto err_out;
 	}
 
 	BUG_ON(!qp);
 
 	set_bit(ULP_ACCEPT, &ep->com.history);
 
 	if ((conn_param->ord > c4iw_max_read_depth) ||
 		(conn_param->ird > c4iw_max_read_depth)) {
 
 		CTR2(KTR_IW_CXGBE, "%s:cac2 %p", __func__, ep);
 		err = -EINVAL;
 		goto err_abort;
 	}
 
 	if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) {
 
 		CTR2(KTR_IW_CXGBE, "%s:cac3 %p", __func__, ep);
 
 		if (conn_param->ord > ep->ird) {
 			if (RELAXED_IRD_NEGOTIATION) {
 				conn_param->ord = ep->ird;
 			} else {
 				ep->ird = conn_param->ird;
 				ep->ord = conn_param->ord;
 				send_mpa_reject(ep, conn_param->private_data,
 						conn_param->private_data_len);
 				err = -ENOMEM;
 				goto err_abort;
 			}
 		}
 		if (conn_param->ird < ep->ord) {
 			if (RELAXED_IRD_NEGOTIATION &&
 			    ep->ord <= h->rdev.adap->params.max_ordird_qp) {
 				conn_param->ird = ep->ord;
 			} else {
 				err = -ENOMEM;
 				goto err_abort;
 			}
 		}
 	}
 	ep->ird = conn_param->ird;
 	ep->ord = conn_param->ord;
 
 	if (ep->mpa_attr.version == 1) {
 		if (peer2peer && ep->ird == 0)
 			ep->ird = 1;
 	} else {
 		if (peer2peer &&
 		    (ep->mpa_attr.p2p_type != FW_RI_INIT_P2PTYPE_DISABLED) &&
 		    (p2p_type == FW_RI_INIT_P2PTYPE_READ_REQ) && ep->ird == 0)
 			ep->ird = 1;
 	}
 
 	CTR4(KTR_IW_CXGBE, "%s %d ird %d ord %d", __func__, __LINE__,
 			ep->ird, ep->ord);
 
 	ep->com.cm_id = cm_id;
 	ref_cm_id(&ep->com);
 	ep->com.qp = qp;
 	ref_qp(ep);
 	//ep->ofld_txq = TOEPCB(ep->com.so)->ofld_txq;
 
 	/* bind QP to EP and move to RTS */
 	attrs.mpa_attr = ep->mpa_attr;
 	attrs.max_ird = ep->ird;
 	attrs.max_ord = ep->ord;
 	attrs.llp_stream_handle = ep;
 	attrs.next_state = C4IW_QP_STATE_RTS;
 
 	/* bind QP and TID with INIT_WR */
 	mask = C4IW_QP_ATTR_NEXT_STATE |
 		C4IW_QP_ATTR_LLP_STREAM_HANDLE |
 		C4IW_QP_ATTR_MPA_ATTR |
 		C4IW_QP_ATTR_MAX_IRD |
 		C4IW_QP_ATTR_MAX_ORD;
 
 	err = c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, mask, &attrs, 1);
 	if (err) {
 		CTR3(KTR_IW_CXGBE, "%s:caca %p, err: %d", __func__, ep, err);
 		goto err_defef_cm_id;
 	}
 
 	err = send_mpa_reply(ep, conn_param->private_data,
 			conn_param->private_data_len);
 	if (err) {
 		CTR3(KTR_IW_CXGBE, "%s:cacb %p, err: %d", __func__, ep, err);
 		goto err_defef_cm_id;
 	}
 
 	ep->com.state = FPDU_MODE;
 	established_upcall(ep);
 	mutex_unlock(&ep->com.mutex);
 	c4iw_put_ep(&ep->com);
 	CTR2(KTR_IW_CXGBE, "%s:cacE %p", __func__, ep);
 	return 0;
 err_defef_cm_id:
 	deref_cm_id(&ep->com);
 err_abort:
 	abort = 1;
 err_out:
 	if (abort)
 		c4iw_ep_disconnect(ep, 1, GFP_KERNEL);
 	mutex_unlock(&ep->com.mutex);
 	c4iw_put_ep(&ep->com);
 	CTR2(KTR_IW_CXGBE, "%s:cacE err %p", __func__, ep);
 	return err;
 }
 
 static int
 c4iw_sock_create(struct sockaddr_storage *laddr, struct socket **so)
 {
 	int ret;
 	int size, on;
 	struct socket *sock = NULL;
 	struct sockopt sopt;
 
 	ret = sock_create_kern(laddr->ss_family,
 			SOCK_STREAM, IPPROTO_TCP, &sock);
 	if (ret) {
 		CTR2(KTR_IW_CXGBE, "%s:Failed to create TCP socket. err %d",
 				__func__, ret);
 		return ret;
 	}
 
 	if (reuseaddr) {
 		bzero(&sopt, sizeof(struct sockopt));
 		sopt.sopt_dir = SOPT_SET;
 		sopt.sopt_level = SOL_SOCKET;
 		sopt.sopt_name = SO_REUSEADDR;
 		on = 1;
 		sopt.sopt_val = &on;
 		sopt.sopt_valsize = sizeof(on);
 		ret = -sosetopt(sock, &sopt);
 		if (ret != 0) {
 			log(LOG_ERR, "%s: sosetopt(%p, SO_REUSEADDR) "
 				"failed with %d.\n", __func__, sock, ret);
 		}
 		bzero(&sopt, sizeof(struct sockopt));
 		sopt.sopt_dir = SOPT_SET;
 		sopt.sopt_level = SOL_SOCKET;
 		sopt.sopt_name = SO_REUSEPORT;
 		on = 1;
 		sopt.sopt_val = &on;
 		sopt.sopt_valsize = sizeof(on);
 		ret = -sosetopt(sock, &sopt);
 		if (ret != 0) {
 			log(LOG_ERR, "%s: sosetopt(%p, SO_REUSEPORT) "
 				"failed with %d.\n", __func__, sock, ret);
 		}
 	}
 
 	ret = -sobind(sock, (struct sockaddr *)laddr, curthread);
 	if (ret) {
 		CTR2(KTR_IW_CXGBE, "%s:Failed to bind socket. err %p",
 				__func__, ret);
 		sock_release(sock);
 		return ret;
 	}
 
 	size = laddr->ss_family == AF_INET6 ?
 		sizeof(struct sockaddr_in6) : sizeof(struct sockaddr_in);
 	ret = sock_getname(sock, (struct sockaddr *)laddr, &size, 0);
 	if (ret) {
 		CTR2(KTR_IW_CXGBE, "%s:sock_getname failed. err %p",
 				__func__, ret);
 		sock_release(sock);
 		return ret;
 	}
 
 	*so = sock;
 	return 0;
 }
 
 int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
 {
 	int err = 0;
 	struct c4iw_dev *dev = to_c4iw_dev(cm_id->device);
 	struct c4iw_ep *ep = NULL;
 	struct ifnet    *nh_ifp;        /* Logical egress interface */
+	struct epoch_tracker et;
 #ifdef VIMAGE
 	struct rdma_cm_id *rdma_id = (struct rdma_cm_id*)cm_id->context;
 	struct vnet *vnet = rdma_id->route.addr.dev_addr.net;
 #endif
 
 	CTR2(KTR_IW_CXGBE, "%s:ccB %p", __func__, cm_id);
 
 
 	if ((conn_param->ord > c4iw_max_read_depth) ||
 		(conn_param->ird > c4iw_max_read_depth)) {
 
 		CTR2(KTR_IW_CXGBE, "%s:cc1 %p", __func__, cm_id);
 		err = -EINVAL;
 		goto out;
 	}
 	ep = alloc_ep(sizeof(*ep), GFP_KERNEL);
 	cm_id->provider_data = ep;
 
 	init_timer(&ep->timer);
 	ep->plen = conn_param->private_data_len;
 
 	if (ep->plen) {
 
 		CTR2(KTR_IW_CXGBE, "%s:cc3 %p", __func__, ep);
 		memcpy(ep->mpa_pkt + sizeof(struct mpa_message),
 				conn_param->private_data, ep->plen);
 	}
 	ep->ird = conn_param->ird;
 	ep->ord = conn_param->ord;
 
 	if (peer2peer && ep->ord == 0) {
 
 		CTR2(KTR_IW_CXGBE, "%s:cc4 %p", __func__, ep);
 		ep->ord = 1;
 	}
 
 	ep->com.dev = dev;
 	ep->com.cm_id = cm_id;
 	ref_cm_id(&ep->com);
 	ep->com.qp = get_qhp(dev, conn_param->qpn);
 
 	if (!ep->com.qp) {
 
 		CTR2(KTR_IW_CXGBE, "%s:cc5 %p", __func__, ep);
 		err = -EINVAL;
 		goto fail;
 	}
 	ref_qp(ep);
 	ep->com.thread = curthread;
 
+	NET_EPOCH_ENTER(et);
 	CURVNET_SET(vnet);
 	err = get_ifnet_from_raddr(&cm_id->remote_addr, &nh_ifp);
 	CURVNET_RESTORE();
+	NET_EPOCH_EXIT(et);
 
 	if (err) {
 
 		CTR2(KTR_IW_CXGBE, "%s:cc7 %p", __func__, ep);
 		printk(KERN_ERR MOD "%s - cannot find route.\n", __func__);
 		err = EHOSTUNREACH;
 		return err;
 	}
 
 	if (!(nh_ifp->if_capenable & IFCAP_TOE) ||
 	    TOEDEV(nh_ifp) == NULL) {
 		err = -ENOPROTOOPT;
 		goto fail;
 	}
 	ep->com.state = CONNECTING;
 	ep->tos = 0;
 	ep->com.local_addr = cm_id->local_addr;
 	ep->com.remote_addr = cm_id->remote_addr;
 
 	err = c4iw_sock_create(&cm_id->local_addr, &ep->com.so);
 	if (err)
 		goto fail;
 
 	setiwsockopt(ep->com.so);
 	init_iwarp_socket(ep->com.so, &ep->com);
 	err = -soconnect(ep->com.so, (struct sockaddr *)&ep->com.remote_addr,
 		ep->com.thread);
 	if (err)
 		goto fail_free_so;
 	CTR2(KTR_IW_CXGBE, "%s:ccE, ep %p", __func__, ep);
 	return 0;
 
 fail_free_so:
 	uninit_iwarp_socket(ep->com.so);
 	ep->com.state = DEAD;
 	sock_release(ep->com.so);
 fail:
 	deref_cm_id(&ep->com);
 	c4iw_put_ep(&ep->com);
 	ep = NULL;
 out:
 	CTR2(KTR_IW_CXGBE, "%s:ccE Error %d", __func__, err);
 	return err;
 }
 
 /*
  * iwcm->create_listen.  Returns -errno on failure.
  */
 int
 c4iw_create_listen(struct iw_cm_id *cm_id, int backlog)
 {
 	struct c4iw_dev *dev = to_c4iw_dev(cm_id->device);
 	struct c4iw_listen_ep *lep = NULL;
 	struct listen_port_info *port_info = NULL;
 	int rc = 0;
 
 	CTR3(KTR_IW_CXGBE, "%s: cm_id %p, backlog %s", __func__, cm_id,
 			backlog);
 	if (c4iw_fatal_error(&dev->rdev)) {
 		CTR2(KTR_IW_CXGBE, "%s: cm_id %p, fatal error", __func__,
 			       cm_id);
 		return -EIO;
 	}
 	lep = alloc_ep(sizeof(*lep), GFP_KERNEL);
 	lep->com.cm_id = cm_id;
 	ref_cm_id(&lep->com);
 	lep->com.dev = dev;
 	lep->backlog = backlog;
 	lep->com.local_addr = cm_id->local_addr;
 	lep->com.thread = curthread;
 	cm_id->provider_data = lep;
 	lep->com.state = LISTEN;
 
 	/* In case of INDADDR_ANY, ibcore creates cmid for each device and
 	 * invokes iw_cxgbe listener callbacks assuming that iw_cxgbe creates
 	 * HW listeners for each device seperately. But toecore expects single
 	 * solisten() call with INADDR_ANY address to create HW listeners on
 	 * all devices for a given port number. So iw_cxgbe driver calls
 	 * solisten() only once for INADDR_ANY(usually done at first time
 	 * listener callback from ibcore). And all the subsequent INADDR_ANY
 	 * listener callbacks from ibcore(for the same port address) do not
 	 * invoke solisten() as first listener callback has already created
 	 * listeners for all other devices(via solisten).
 	 */
 	if (c4iw_any_addr((struct sockaddr *)&lep->com.local_addr)) {
 		port_info = add_ep_to_listenlist(lep);
 		/* skip solisten() if refcnt > 1, as the listeners were
 		 * alredy created by 'Master lep'
 		 */
 		if (port_info->refcnt > 1) {
 			/* As there will be only one listener socket for a TCP
 			 * port, copy Master lep's socket pointer to other lep's
 			 * that are belonging to same TCP port.
 			 */
 			struct c4iw_listen_ep *head_lep =
 					container_of(port_info->lep_list.next,
 					struct c4iw_listen_ep, listen_ep_list);
 			lep->com.so =  head_lep->com.so;
 			goto out;
 		}
 	}
 	rc = c4iw_sock_create(&cm_id->local_addr, &lep->com.so);
 	if (rc) {
 		CTR2(KTR_IW_CXGBE, "%s:Failed to create socket. err %d",
 				__func__, rc);
 		goto fail;
 	}
 
 	rc = -solisten(lep->com.so, backlog, curthread);
 	if (rc) {
 		CTR3(KTR_IW_CXGBE, "%s:Failed to listen on sock:%p. err %d",
 				__func__, lep->com.so, rc);
 		goto fail_free_so;
 	}
 	init_iwarp_socket(lep->com.so, &lep->com);
 out:
 	return 0;
 
 fail_free_so:
 	sock_release(lep->com.so);
 fail:
 	if (port_info)
 		rem_ep_from_listenlist(lep);
 	deref_cm_id(&lep->com);
 	c4iw_put_ep(&lep->com);
 	return rc;
 }
 
 int
 c4iw_destroy_listen(struct iw_cm_id *cm_id)
 {
 	struct c4iw_listen_ep *lep = to_listen_ep(cm_id);
 
 	mutex_lock(&lep->com.mutex);
 	CTR3(KTR_IW_CXGBE, "%s: cm_id %p, state %s", __func__, cm_id,
 	    states[lep->com.state]);
 
 	lep->com.state = DEAD;
 	if (c4iw_any_addr((struct sockaddr *)&lep->com.local_addr)) {
 		/* if no refcount then close listen socket */
 		if (!rem_ep_from_listenlist(lep))
 			close_socket(lep->com.so);
 	} else
 		close_socket(lep->com.so);
 	deref_cm_id(&lep->com);
 	mutex_unlock(&lep->com.mutex);
 	c4iw_put_ep(&lep->com);
 	return 0;
 }
 
 int __c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp)
 {
 	int ret;
 	mutex_lock(&ep->com.mutex);
 	ret = c4iw_ep_disconnect(ep, abrupt, gfp);
 	mutex_unlock(&ep->com.mutex);
 	return ret;
 }
 
 int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp)
 {
 	int ret = 0;
 	int close = 0;
 	struct c4iw_rdev *rdev;
 
 
 	CTR2(KTR_IW_CXGBE, "%s:cedB %p", __func__, ep);
 
 	rdev = &ep->com.dev->rdev;
 
 	if (c4iw_fatal_error(rdev)) {
 		CTR3(KTR_IW_CXGBE, "%s:ced1 fatal error %p %s", __func__, ep,
 					states[ep->com.state]);
 		if (ep->com.state != DEAD) {
 			send_abort(ep);
 			ep->com.state = DEAD;
 		}
 		close_complete_upcall(ep, -ECONNRESET);
 		return ECONNRESET;
 	}
 	CTR3(KTR_IW_CXGBE, "%s:ced2 %p %s", __func__, ep,
 	    states[ep->com.state]);
 
 	/*
 	 * Ref the ep here in case we have fatal errors causing the
 	 * ep to be released and freed.
 	 */
 	c4iw_get_ep(&ep->com);
 	switch (ep->com.state) {
 
 		case MPA_REQ_WAIT:
 		case MPA_REQ_SENT:
 		case MPA_REQ_RCVD:
 		case MPA_REP_SENT:
 		case FPDU_MODE:
 			close = 1;
 			if (abrupt)
 				ep->com.state = ABORTING;
 			else {
 				ep->com.state = CLOSING;
 				START_EP_TIMER(ep);
 			}
 			set_bit(CLOSE_SENT, &ep->com.flags);
 			break;
 
 		case CLOSING:
 
 			if (!test_and_set_bit(CLOSE_SENT, &ep->com.flags)) {
 
 				close = 1;
 				if (abrupt) {
 					STOP_EP_TIMER(ep);
 					ep->com.state = ABORTING;
 				} else
 					ep->com.state = MORIBUND;
 			}
 			break;
 
 		case MORIBUND:
 		case ABORTING:
 		case DEAD:
 			CTR3(KTR_IW_CXGBE,
 			    "%s ignoring disconnect ep %p state %u", __func__,
 			    ep, ep->com.state);
 			break;
 
 		default:
 			BUG();
 			break;
 	}
 
 
 	if (close) {
 
 		CTR2(KTR_IW_CXGBE, "%s:ced3 %p", __func__, ep);
 
 		if (abrupt) {
 
 			CTR2(KTR_IW_CXGBE, "%s:ced4 %p", __func__, ep);
 			set_bit(EP_DISC_ABORT, &ep->com.history);
 			close_complete_upcall(ep, -ECONNRESET);
 			send_abort(ep);
 		} else {
 
 			CTR2(KTR_IW_CXGBE, "%s:ced5 %p", __func__, ep);
 			set_bit(EP_DISC_CLOSE, &ep->com.history);
 
 			if (!ep->parent_ep)
 				ep->com.state = MORIBUND;
 
 			CURVNET_SET(ep->com.so->so_vnet);
 			ret = sodisconnect(ep->com.so);
 			CURVNET_RESTORE();
 			if (ret) {
 				CTR2(KTR_IW_CXGBE, "%s:ced6 %p", __func__, ep);
 				STOP_EP_TIMER(ep);
 				send_abort(ep);
 				ep->com.state = DEAD;
 				close_complete_upcall(ep, -ECONNRESET);
 				set_bit(EP_DISC_FAIL, &ep->com.history);
 				if (ep->com.qp) {
 					struct c4iw_qp_attributes attrs = {0};
 
 					attrs.next_state = C4IW_QP_STATE_ERROR;
 					ret = c4iw_modify_qp(
 							ep->com.dev, ep->com.qp,
 							C4IW_QP_ATTR_NEXT_STATE,
 							&attrs, 1);
 					CTR3(KTR_IW_CXGBE, "%s:ced7 %p ret %d",
 						__func__, ep, ret);
 				}
 			}
 		}
 	}
 	c4iw_put_ep(&ep->com);
 	CTR2(KTR_IW_CXGBE, "%s:cedE %p", __func__, ep);
 	return ret;
 }
 
 #ifdef C4IW_EP_REDIRECT
 int c4iw_ep_redirect(void *ctx, struct dst_entry *old, struct dst_entry *new,
 		struct l2t_entry *l2t)
 {
 	struct c4iw_ep *ep = ctx;
 
 	if (ep->dst != old)
 		return 0;
 
 	PDBG("%s ep %p redirect to dst %p l2t %p\n", __func__, ep, new,
 			l2t);
 	dst_hold(new);
 	cxgb4_l2t_release(ep->l2t);
 	ep->l2t = l2t;
 	dst_release(old);
 	ep->dst = new;
 	return 1;
 }
 #endif
 
 
 
 static void ep_timeout(unsigned long arg)
 {
 	struct c4iw_ep *ep = (struct c4iw_ep *)arg;
 
 	if (!test_and_set_bit(TIMEOUT, &ep->com.flags)) {
 
 		/*
 		 * Only insert if it is not already on the list.
 		 */
 		if (!(ep->com.ep_events & C4IW_EVENT_TIMEOUT)) {
 			CTR2(KTR_IW_CXGBE, "%s:et1 %p", __func__, ep);
 			add_ep_to_req_list(ep, C4IW_EVENT_TIMEOUT);
 		}
 	}
 }
 
 static int fw6_wr_rpl(struct adapter *sc, const __be64 *rpl)
 {
 	uint64_t val = be64toh(*rpl);
 	int ret;
 	struct c4iw_wr_wait *wr_waitp;
 
 	ret = (int)((val >> 8) & 0xff);
 	wr_waitp = (struct c4iw_wr_wait *)rpl[1];
 	CTR3(KTR_IW_CXGBE, "%s wr_waitp %p ret %u", __func__, wr_waitp, ret);
 	if (wr_waitp)
 		c4iw_wake_up(wr_waitp, ret ? -ret : 0);
 
 	return (0);
 }
 
 static int fw6_cqe_handler(struct adapter *sc, const __be64 *rpl)
 {
 	struct cqe_list_entry *cle;
 	unsigned long flag;
 
 	cle = malloc(sizeof(*cle), M_CXGBE, M_NOWAIT);
 	cle->rhp = sc->iwarp_softc;
 	cle->err_cqe = *(const struct t4_cqe *)(&rpl[0]);
 
 	spin_lock_irqsave(&err_cqe_lock, flag);
 	list_add_tail(&cle->entry, &err_cqe_list);
 	queue_work(c4iw_taskq, &c4iw_task);
 	spin_unlock_irqrestore(&err_cqe_lock, flag);
 
 	return (0);
 }
 
 static int
 process_terminate(struct c4iw_ep *ep)
 {
 	struct c4iw_qp_attributes attrs = {0};
 
 	CTR2(KTR_IW_CXGBE, "%s:tB %p %d", __func__, ep);
 
 	if (ep && ep->com.qp) {
 
 		printk(KERN_WARNING MOD "TERM received tid %u qpid %u\n",
 				ep->hwtid, ep->com.qp->wq.sq.qid);
 		attrs.next_state = C4IW_QP_STATE_TERMINATE;
 		c4iw_modify_qp(ep->com.dev, ep->com.qp, C4IW_QP_ATTR_NEXT_STATE, &attrs,
 				1);
 	} else
 		printk(KERN_WARNING MOD "TERM received tid %u no ep/qp\n",
 								ep->hwtid);
 	CTR2(KTR_IW_CXGBE, "%s:tE %p %d", __func__, ep);
 
 	return 0;
 }
 
 int __init c4iw_cm_init(void)
 {
 
 	t4_register_cpl_handler(CPL_RDMA_TERMINATE, terminate);
 	t4_register_fw_msg_handler(FW6_TYPE_WR_RPL, fw6_wr_rpl);
 	t4_register_fw_msg_handler(FW6_TYPE_CQE, fw6_cqe_handler);
 	t4_register_an_handler(c4iw_ev_handler);
 
 	TAILQ_INIT(&req_list);
 	spin_lock_init(&req_lock);
 	INIT_LIST_HEAD(&err_cqe_list);
 	spin_lock_init(&err_cqe_lock);
 
 	INIT_WORK(&c4iw_task, process_req);
 
 	c4iw_taskq = create_singlethread_workqueue("iw_cxgbe");
 	if (!c4iw_taskq)
 		return -ENOMEM;
 
 	return 0;
 }
 
 void __exit c4iw_cm_term(void)
 {
 	WARN_ON(!TAILQ_EMPTY(&req_list));
 	WARN_ON(!list_empty(&err_cqe_list));
 	flush_workqueue(c4iw_taskq);
 	destroy_workqueue(c4iw_taskq);
 
 	t4_register_cpl_handler(CPL_RDMA_TERMINATE, NULL);
 	t4_register_fw_msg_handler(FW6_TYPE_WR_RPL, NULL);
 	t4_register_fw_msg_handler(FW6_TYPE_CQE, NULL);
 	t4_register_an_handler(NULL);
 }
 #endif
Index: head/sys/dev/cxgbe/tom/t4_listen.c
===================================================================
--- head/sys/dev/cxgbe/tom/t4_listen.c	(revision 362486)
+++ head/sys/dev/cxgbe/tom/t4_listen.c	(revision 362487)
@@ -1,1534 +1,1538 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2012 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: Navdeep Parhar <np@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #ifdef TCP_OFFLOAD
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/module.h>
 #include <sys/protosw.h>
 #include <sys/refcount.h>
 #include <sys/domain.h>
 #include <sys/fnv_hash.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_types.h>
 #include <net/if_vlan_var.h>
 #include <net/route.h>
+#include <net/route/nhop.h>
 #include <netinet/in.h>
 #include <netinet/in_fib.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #include <netinet6/in6_fib.h>
 #include <netinet6/scope6_var.h>
 #include <netinet/tcp_timer.h>
 #define TCPSTATES
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_var.h>
 #include <netinet/toecore.h>
 #include <netinet/cc/cc.h>
 
 #include "common/common.h"
 #include "common/t4_msg.h"
 #include "common/t4_regs.h"
 #include "t4_clip.h"
 #include "tom/t4_tom_l2t.h"
 #include "tom/t4_tom.h"
 
 /* stid services */
 static int alloc_stid(struct adapter *, struct listen_ctx *, int);
 static struct listen_ctx *lookup_stid(struct adapter *, int);
 static void free_stid(struct adapter *, struct listen_ctx *);
 
 /* lctx services */
 static struct listen_ctx *alloc_lctx(struct adapter *, struct inpcb *,
     struct vi_info *);
 static int free_lctx(struct adapter *, struct listen_ctx *);
 static void hold_lctx(struct listen_ctx *);
 static void listen_hash_add(struct adapter *, struct listen_ctx *);
 static struct listen_ctx *listen_hash_find(struct adapter *, struct inpcb *);
 static struct listen_ctx *listen_hash_del(struct adapter *, struct inpcb *);
 static struct inpcb *release_lctx(struct adapter *, struct listen_ctx *);
 
 static void send_reset_synqe(struct toedev *, struct synq_entry *);
 
 static int
 alloc_stid(struct adapter *sc, struct listen_ctx *lctx, int isipv6)
 {
 	struct tid_info *t = &sc->tids;
 	u_int stid, n, f, mask;
 	struct stid_region *sr = &lctx->stid_region;
 
 	/*
 	 * An IPv6 server needs 2 naturally aligned stids (1 stid = 4 cells) in
 	 * the TCAM.  The start of the stid region is properly aligned (the chip
 	 * requires each region to be 128-cell aligned).
 	 */
 	n = isipv6 ? 2 : 1;
 	mask = n - 1;
 	KASSERT((t->stid_base & mask) == 0 && (t->nstids & mask) == 0,
 	    ("%s: stid region (%u, %u) not properly aligned.  n = %u",
 	    __func__, t->stid_base, t->nstids, n));
 
 	mtx_lock(&t->stid_lock);
 	if (n > t->nstids - t->stids_in_use) {
 		mtx_unlock(&t->stid_lock);
 		return (-1);
 	}
 
 	if (t->nstids_free_head >= n) {
 		/*
 		 * This allocation will definitely succeed because the region
 		 * starts at a good alignment and we just checked we have enough
 		 * stids free.
 		 */
 		f = t->nstids_free_head & mask;
 		t->nstids_free_head -= n + f;
 		stid = t->nstids_free_head;
 		TAILQ_INSERT_HEAD(&t->stids, sr, link);
 	} else {
 		struct stid_region *s;
 
 		stid = t->nstids_free_head;
 		TAILQ_FOREACH(s, &t->stids, link) {
 			stid += s->used + s->free;
 			f = stid & mask;
 			if (s->free >= n + f) {
 				stid -= n + f;
 				s->free -= n + f;
 				TAILQ_INSERT_AFTER(&t->stids, s, sr, link);
 				goto allocated;
 			}
 		}
 
 		if (__predict_false(stid != t->nstids)) {
 			panic("%s: stids TAILQ (%p) corrupt."
 			    "  At %d instead of %d at the end of the queue.",
 			    __func__, &t->stids, stid, t->nstids);
 		}
 
 		mtx_unlock(&t->stid_lock);
 		return (-1);
 	}
 
 allocated:
 	sr->used = n;
 	sr->free = f;
 	t->stids_in_use += n;
 	t->stid_tab[stid] = lctx;
 	mtx_unlock(&t->stid_lock);
 
 	KASSERT(((stid + t->stid_base) & mask) == 0,
 	    ("%s: EDOOFUS.", __func__));
 	return (stid + t->stid_base);
 }
 
 static struct listen_ctx *
 lookup_stid(struct adapter *sc, int stid)
 {
 	struct tid_info *t = &sc->tids;
 
 	return (t->stid_tab[stid - t->stid_base]);
 }
 
 static void
 free_stid(struct adapter *sc, struct listen_ctx *lctx)
 {
 	struct tid_info *t = &sc->tids;
 	struct stid_region *sr = &lctx->stid_region;
 	struct stid_region *s;
 
 	KASSERT(sr->used > 0, ("%s: nonsense free (%d)", __func__, sr->used));
 
 	mtx_lock(&t->stid_lock);
 	s = TAILQ_PREV(sr, stid_head, link);
 	if (s != NULL)
 		s->free += sr->used + sr->free;
 	else
 		t->nstids_free_head += sr->used + sr->free;
 	KASSERT(t->stids_in_use >= sr->used,
 	    ("%s: stids_in_use (%u) < stids being freed (%u)", __func__,
 	    t->stids_in_use, sr->used));
 	t->stids_in_use -= sr->used;
 	TAILQ_REMOVE(&t->stids, sr, link);
 	mtx_unlock(&t->stid_lock);
 }
 
 static struct listen_ctx *
 alloc_lctx(struct adapter *sc, struct inpcb *inp, struct vi_info *vi)
 {
 	struct listen_ctx *lctx;
 
 	INP_WLOCK_ASSERT(inp);
 
 	lctx = malloc(sizeof(struct listen_ctx), M_CXGBE, M_NOWAIT | M_ZERO);
 	if (lctx == NULL)
 		return (NULL);
 
 	lctx->stid = alloc_stid(sc, lctx, inp->inp_vflag & INP_IPV6);
 	if (lctx->stid < 0) {
 		free(lctx, M_CXGBE);
 		return (NULL);
 	}
 
 	if (inp->inp_vflag & INP_IPV6 &&
 	    !IN6_ARE_ADDR_EQUAL(&in6addr_any, &inp->in6p_laddr)) {
 		lctx->ce = t4_hold_lip(sc, &inp->in6p_laddr, NULL);
 		if (lctx->ce == NULL) {
 			free(lctx, M_CXGBE);
 			return (NULL);
 		}
 	}
 
 	lctx->ctrlq = &sc->sge.ctrlq[vi->pi->port_id];
 	lctx->ofld_rxq = &sc->sge.ofld_rxq[vi->first_ofld_rxq];
 	refcount_init(&lctx->refcount, 1);
 
 	lctx->inp = inp;
 	lctx->vnet = inp->inp_socket->so_vnet;
 	in_pcbref(inp);
 
 	return (lctx);
 }
 
 /* Don't call this directly, use release_lctx instead */
 static int
 free_lctx(struct adapter *sc, struct listen_ctx *lctx)
 {
 	struct inpcb *inp = lctx->inp;
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(lctx->refcount == 0,
 	    ("%s: refcount %d", __func__, lctx->refcount));
 	KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid));
 
 	CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, inp %p",
 	    __func__, lctx->stid, lctx, lctx->inp);
 
 	if (lctx->ce)
 		t4_release_lip(sc, lctx->ce);
 	free_stid(sc, lctx);
 	free(lctx, M_CXGBE);
 
 	return (in_pcbrele_wlocked(inp));
 }
 
 static void
 hold_lctx(struct listen_ctx *lctx)
 {
 
 	refcount_acquire(&lctx->refcount);
 }
 
 static inline uint32_t
 listen_hashfn(void *key, u_long mask)
 {
 
 	return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask);
 }
 
 /*
  * Add a listen_ctx entry to the listen hash table.
  */
 static void
 listen_hash_add(struct adapter *sc, struct listen_ctx *lctx)
 {
 	struct tom_data *td = sc->tom_softc;
 	int bucket = listen_hashfn(lctx->inp, td->listen_mask);
 
 	mtx_lock(&td->lctx_hash_lock);
 	LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link);
 	td->lctx_count++;
 	mtx_unlock(&td->lctx_hash_lock);
 }
 
 /*
  * Look for the listening socket's context entry in the hash and return it.
  */
 static struct listen_ctx *
 listen_hash_find(struct adapter *sc, struct inpcb *inp)
 {
 	struct tom_data *td = sc->tom_softc;
 	int bucket = listen_hashfn(inp, td->listen_mask);
 	struct listen_ctx *lctx;
 
 	mtx_lock(&td->lctx_hash_lock);
 	LIST_FOREACH(lctx, &td->listen_hash[bucket], link) {
 		if (lctx->inp == inp)
 			break;
 	}
 	mtx_unlock(&td->lctx_hash_lock);
 
 	return (lctx);
 }
 
 /*
  * Removes the listen_ctx structure for inp from the hash and returns it.
  */
 static struct listen_ctx *
 listen_hash_del(struct adapter *sc, struct inpcb *inp)
 {
 	struct tom_data *td = sc->tom_softc;
 	int bucket = listen_hashfn(inp, td->listen_mask);
 	struct listen_ctx *lctx, *l;
 
 	mtx_lock(&td->lctx_hash_lock);
 	LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) {
 		if (lctx->inp == inp) {
 			LIST_REMOVE(lctx, link);
 			td->lctx_count--;
 			break;
 		}
 	}
 	mtx_unlock(&td->lctx_hash_lock);
 
 	return (lctx);
 }
 
 /*
  * Releases a hold on the lctx.  Must be called with the listening socket's inp
  * locked.  The inp may be freed by this function and it returns NULL to
  * indicate this.
  */
 static struct inpcb *
 release_lctx(struct adapter *sc, struct listen_ctx *lctx)
 {
 	struct inpcb *inp = lctx->inp;
 	int inp_freed = 0;
 
 	INP_WLOCK_ASSERT(inp);
 	if (refcount_release(&lctx->refcount))
 		inp_freed = free_lctx(sc, lctx);
 
 	return (inp_freed ? NULL : inp);
 }
 
 static void
 send_reset_synqe(struct toedev *tod, struct synq_entry *synqe)
 {
 	struct adapter *sc = tod->tod_softc;
 	struct mbuf *m = synqe->syn;
 	struct ifnet *ifp = m->m_pkthdr.rcvif;
 	struct vi_info *vi = ifp->if_softc;
 	struct port_info *pi = vi->pi;
 	struct l2t_entry *e = &sc->l2t->l2tab[synqe->params.l2t_idx];
 	struct wrqe *wr;
 	struct fw_flowc_wr *flowc;
 	struct cpl_abort_req *req;
 	int flowclen;
 	struct sge_wrq *ofld_txq;
 	struct sge_ofld_rxq *ofld_rxq;
 	const int nparams = 6;
 	const u_int pfvf = sc->pf << S_FW_VIID_PFN;
 
 	INP_WLOCK_ASSERT(synqe->lctx->inp);
 
 	CTR5(KTR_CXGBE, "%s: synqe %p (0x%x), tid %d%s",
 	    __func__, synqe, synqe->flags, synqe->tid,
 	    synqe->flags & TPF_ABORT_SHUTDOWN ?
 	    " (abort already in progress)" : "");
 	if (synqe->flags & TPF_ABORT_SHUTDOWN)
 		return;	/* abort already in progress */
 	synqe->flags |= TPF_ABORT_SHUTDOWN;
 
 	ofld_txq = &sc->sge.ofld_txq[synqe->params.txq_idx];
 	ofld_rxq = &sc->sge.ofld_rxq[synqe->params.rxq_idx];
 
 	/* The wrqe will have two WRs - a flowc followed by an abort_req */
 	flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);
 
 	wr = alloc_wrqe(roundup2(flowclen, EQ_ESIZE) + sizeof(*req), ofld_txq);
 	if (wr == NULL) {
 		/* XXX */
 		panic("%s: allocation failure.", __func__);
 	}
 	flowc = wrtod(wr);
 	req = (void *)((caddr_t)flowc + roundup2(flowclen, EQ_ESIZE));
 
 	/* First the flowc ... */
 	memset(flowc, 0, wr->wr_len);
 	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
 	    V_FW_FLOWC_WR_NPARAMS(nparams));
 	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
 	    V_FW_WR_FLOWID(synqe->tid));
 	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
 	flowc->mnemval[0].val = htobe32(pfvf);
 	flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
 	flowc->mnemval[1].val = htobe32(pi->tx_chan);
 	flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
 	flowc->mnemval[2].val = htobe32(pi->tx_chan);
 	flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
 	flowc->mnemval[3].val = htobe32(ofld_rxq->iq.abs_id);
  	flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDBUF;
  	flowc->mnemval[4].val = htobe32(512);
  	flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_MSS;
  	flowc->mnemval[5].val = htobe32(512);
 	synqe->flags |= TPF_FLOWC_WR_SENT;
 
 	/* ... then ABORT request */
 	INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, synqe->tid);
 	req->rsvd0 = 0;	/* don't have a snd_nxt */
 	req->rsvd1 = 1;	/* no data sent yet */
 	req->cmd = CPL_ABORT_SEND_RST;
 
 	t4_l2t_send(sc, wr, e);
 }
 
 static int
 create_server(struct adapter *sc, struct listen_ctx *lctx)
 {
 	struct wrqe *wr;
 	struct cpl_pass_open_req *req;
 	struct inpcb *inp = lctx->inp;
 
 	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
 	if (wr == NULL) {
 		log(LOG_ERR, "%s: allocation failure", __func__);
 		return (ENOMEM);
 	}
 	req = wrtod(wr);
 
 	INIT_TP_WR(req, 0);
 	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid));
 	req->local_port = inp->inp_lport;
 	req->peer_port = 0;
 	req->local_ip = inp->inp_laddr.s_addr;
 	req->peer_ip = 0;
 	req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan));
 	req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) |
 	    F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id));
 
 	t4_wrq_tx(sc, wr);
 	return (0);
 }
 
 static int
 create_server6(struct adapter *sc, struct listen_ctx *lctx)
 {
 	struct wrqe *wr;
 	struct cpl_pass_open_req6 *req;
 	struct inpcb *inp = lctx->inp;
 
 	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
 	if (wr == NULL) {
 		log(LOG_ERR, "%s: allocation failure", __func__);
 		return (ENOMEM);
 	}
 	req = wrtod(wr);
 
 	INIT_TP_WR(req, 0);
 	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ6, lctx->stid));
 	req->local_port = inp->inp_lport;
 	req->peer_port = 0;
 	req->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0];
 	req->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8];
 	req->peer_ip_hi = 0;
 	req->peer_ip_lo = 0;
 	req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan));
 	req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) |
 	    F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id));
 
 	t4_wrq_tx(sc, wr);
 	return (0);
 }
 
 static int
 destroy_server(struct adapter *sc, struct listen_ctx *lctx)
 {
 	struct wrqe *wr;
 	struct cpl_close_listsvr_req *req;
 
 	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
 	if (wr == NULL) {
 		/* XXX */
 		panic("%s: allocation failure.", __func__);
 	}
 	req = wrtod(wr);
 
 	INIT_TP_WR(req, 0);
 	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ,
 	    lctx->stid));
 	req->reply_ctrl = htobe16(lctx->ofld_rxq->iq.abs_id);
 	req->rsvd = htobe16(0);
 
 	t4_wrq_tx(sc, wr);
 	return (0);
 }
 
 /*
  * Start a listening server by sending a passive open request to HW.
  *
  * Can't take adapter lock here and access to sc->flags,
  * sc->offload_map, if_capenable are all race prone.
  */
 int
 t4_listen_start(struct toedev *tod, struct tcpcb *tp)
 {
 	struct adapter *sc = tod->tod_softc;
 	struct vi_info *vi;
 	struct port_info *pi;
 	struct inpcb *inp = tp->t_inpcb;
 	struct listen_ctx *lctx;
 	int i, rc, v;
 	struct offload_settings settings;
 
 	INP_WLOCK_ASSERT(inp);
 
 	rw_rlock(&sc->policy_lock);
 	settings = *lookup_offload_policy(sc, OPEN_TYPE_LISTEN, NULL,
 	    EVL_MAKETAG(0xfff, 0, 0), inp);
 	rw_runlock(&sc->policy_lock);
 	if (!settings.offload)
 		return (0);
 
 	/* Don't start a hardware listener for any loopback address. */
 	if (inp->inp_vflag & INP_IPV6 && IN6_IS_ADDR_LOOPBACK(&inp->in6p_laddr))
 		return (0);
 	if (!(inp->inp_vflag & INP_IPV6) &&
 	    IN_LOOPBACK(ntohl(inp->inp_laddr.s_addr)))
 		return (0);
 	if (sc->flags & KERN_TLS_OK)
 		return (0);
 #if 0
 	ADAPTER_LOCK(sc);
 	if (IS_BUSY(sc)) {
 		log(LOG_ERR, "%s: listen request ignored, %s is busy",
 		    __func__, device_get_nameunit(sc->dev));
 		goto done;
 	}
 
 	KASSERT(uld_active(sc, ULD_TOM),
 	    ("%s: TOM not initialized", __func__));
 #endif
 
 	/*
 	 * Find an initialized VI with IFCAP_TOE (4 or 6).  We'll use the first
 	 * such VI's queues to send the passive open and receive the reply to
 	 * it.
 	 *
 	 * XXX: need a way to mark a port in use by offload.  if_cxgbe should
 	 * then reject any attempt to bring down such a port (and maybe reject
 	 * attempts to disable IFCAP_TOE on that port too?).
 	 */
 	for_each_port(sc, i) {
 		pi = sc->port[i];
 		for_each_vi(pi, v, vi) {
 			if (vi->flags & VI_INIT_DONE &&
 			    vi->ifp->if_capenable & IFCAP_TOE)
 				goto found;
 		}
 	}
 	goto done;	/* no port that's UP with IFCAP_TOE enabled */
 found:
 
 	if (listen_hash_find(sc, inp) != NULL)
 		goto done;	/* already setup */
 
 	lctx = alloc_lctx(sc, inp, vi);
 	if (lctx == NULL) {
 		log(LOG_ERR,
 		    "%s: listen request ignored, %s couldn't allocate lctx\n",
 		    __func__, device_get_nameunit(sc->dev));
 		goto done;
 	}
 	listen_hash_add(sc, lctx);
 
 	CTR6(KTR_CXGBE, "%s: stid %u (%s), lctx %p, inp %p vflag 0x%x",
 	    __func__, lctx->stid, tcpstates[tp->t_state], lctx, inp,
 	    inp->inp_vflag);
 
 	if (inp->inp_vflag & INP_IPV6)
 		rc = create_server6(sc, lctx);
 	else
 		rc = create_server(sc, lctx);
 	if (rc != 0) {
 		log(LOG_ERR, "%s: %s failed to create hw listener: %d.\n",
 		    __func__, device_get_nameunit(sc->dev), rc);
 		(void) listen_hash_del(sc, inp);
 		inp = release_lctx(sc, lctx);
 		/* can't be freed, host stack has a reference */
 		KASSERT(inp != NULL, ("%s: inp freed", __func__));
 		goto done;
 	}
 	lctx->flags |= LCTX_RPL_PENDING;
 done:
 #if 0
 	ADAPTER_UNLOCK(sc);
 #endif
 	return (0);
 }
 
 int
 t4_listen_stop(struct toedev *tod, struct tcpcb *tp)
 {
 	struct listen_ctx *lctx;
 	struct adapter *sc = tod->tod_softc;
 	struct inpcb *inp = tp->t_inpcb;
 
 	INP_WLOCK_ASSERT(inp);
 
 	lctx = listen_hash_del(sc, inp);
 	if (lctx == NULL)
 		return (ENOENT);	/* no hardware listener for this inp */
 
 	CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid,
 	    lctx, lctx->flags);
 
 	/*
 	 * If the reply to the PASS_OPEN is still pending we'll wait for it to
 	 * arrive and clean up when it does.
 	 */
 	if (lctx->flags & LCTX_RPL_PENDING) {
 		return (EINPROGRESS);
 	}
 
 	destroy_server(sc, lctx);
 	return (0);
 }
 
 static inline struct synq_entry *
 alloc_synqe(struct adapter *sc __unused, struct listen_ctx *lctx, int flags)
 {
 	struct synq_entry *synqe;
 
 	INP_WLOCK_ASSERT(lctx->inp);
 	MPASS(flags == M_WAITOK || flags == M_NOWAIT);
 
 	synqe = malloc(sizeof(*synqe), M_CXGBE, flags);
 	if (__predict_true(synqe != NULL)) {
 		synqe->flags = TPF_SYNQE;
 		refcount_init(&synqe->refcnt, 1);
 		synqe->lctx = lctx;
 		hold_lctx(lctx);	/* Every synqe has a ref on its lctx. */
 		synqe->syn = NULL;
 	}
 
 	return (synqe);
 }
 
 static inline void
 hold_synqe(struct synq_entry *synqe)
 {
 
 	refcount_acquire(&synqe->refcnt);
 }
 
 static inline struct inpcb *
 release_synqe(struct adapter *sc, struct synq_entry *synqe)
 {
 	struct inpcb *inp;
 
 	MPASS(synqe->flags & TPF_SYNQE);
 	MPASS(synqe->lctx != NULL);
 
 	inp = synqe->lctx->inp;
 	MPASS(inp != NULL);
 	INP_WLOCK_ASSERT(inp);
 
 	if (refcount_release(&synqe->refcnt)) {
 		inp = release_lctx(sc, synqe->lctx);
 		m_freem(synqe->syn);
 		free(synqe, M_CXGBE);
 	}
 
 	return (inp);
 }
 
 void
 t4_syncache_added(struct toedev *tod __unused, void *arg)
 {
 	struct synq_entry *synqe = arg;
 
 	hold_synqe(synqe);
 }
 
 void
 t4_syncache_removed(struct toedev *tod, void *arg)
 {
 	struct adapter *sc = tod->tod_softc;
 	struct synq_entry *synqe = arg;
 	struct inpcb *inp = synqe->lctx->inp;
 
 	/*
 	 * XXX: this is a LOR but harmless when running from the softclock.
 	 */
 	INP_WLOCK(inp);
 	inp = release_synqe(sc, synqe);
 	if (inp != NULL)
 		INP_WUNLOCK(inp);
 }
 
 int
 t4_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m)
 {
 	struct synq_entry *synqe = arg;
 
 	if (atomic_fetchadd_int(&synqe->ok_to_respond, 1) == 0) {
 		struct tcpopt to;
 		struct ip *ip = mtod(m, struct ip *);
 		struct tcphdr *th;
 
 		if (ip->ip_v == IPVERSION)
 			th = (void *)(ip + 1);
 		else
 			th = (void *)((struct ip6_hdr *)ip + 1);
 		bzero(&to, sizeof(to));
 		tcp_dooptions(&to, (void *)(th + 1),
 		    (th->th_off << 2) - sizeof(*th), TO_SYN);
 
 		/* save these for later */
 		synqe->iss = be32toh(th->th_seq);
 		synqe->irs = be32toh(th->th_ack) - 1;
 		synqe->ts = to.to_tsval;
 	}
 
 	m_freem(m);	/* don't need this any more */
 	return (0);
 }
 
 static int
 do_pass_open_rpl(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_pass_open_rpl *cpl = (const void *)(rss + 1);
 	int stid = GET_TID(cpl);
 	unsigned int status = cpl->status;
 	struct listen_ctx *lctx = lookup_stid(sc, stid);
 	struct inpcb *inp = lctx->inp;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 
 	KASSERT(opcode == CPL_PASS_OPEN_RPL,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
 
 	INP_WLOCK(inp);
 
 	CTR4(KTR_CXGBE, "%s: stid %d, status %u, flags 0x%x",
 	    __func__, stid, status, lctx->flags);
 
 	lctx->flags &= ~LCTX_RPL_PENDING;
 
 	if (status != CPL_ERR_NONE)
 		log(LOG_ERR, "listener (stid %u) failed: %d\n", stid, status);
 
 #ifdef INVARIANTS
 	/*
 	 * If the inp has been dropped (listening socket closed) then
 	 * listen_stop must have run and taken the inp out of the hash.
 	 */
 	if (inp->inp_flags & INP_DROPPED) {
 		KASSERT(listen_hash_del(sc, inp) == NULL,
 		    ("%s: inp %p still in listen hash", __func__, inp));
 	}
 #endif
 
 	if (inp->inp_flags & INP_DROPPED && status != CPL_ERR_NONE) {
 		if (release_lctx(sc, lctx) != NULL)
 			INP_WUNLOCK(inp);
 		return (status);
 	}
 
 	/*
 	 * Listening socket stopped listening earlier and now the chip tells us
 	 * it has started the hardware listener.  Stop it; the lctx will be
 	 * released in do_close_server_rpl.
 	 */
 	if (inp->inp_flags & INP_DROPPED) {
 		destroy_server(sc, lctx);
 		INP_WUNLOCK(inp);
 		return (status);
 	}
 
 	/*
 	 * Failed to start hardware listener.  Take inp out of the hash and
 	 * release our reference on it.  An error message has been logged
 	 * already.
 	 */
 	if (status != CPL_ERR_NONE) {
 		listen_hash_del(sc, inp);
 		if (release_lctx(sc, lctx) != NULL)
 			INP_WUNLOCK(inp);
 		return (status);
 	}
 
 	/* hardware listener open for business */
 
 	INP_WUNLOCK(inp);
 	return (status);
 }
 
 static int
 do_close_server_rpl(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_close_listsvr_rpl *cpl = (const void *)(rss + 1);
 	int stid = GET_TID(cpl);
 	unsigned int status = cpl->status;
 	struct listen_ctx *lctx = lookup_stid(sc, stid);
 	struct inpcb *inp = lctx->inp;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 
 	KASSERT(opcode == CPL_CLOSE_LISTSRV_RPL,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
 
 	CTR3(KTR_CXGBE, "%s: stid %u, status %u", __func__, stid, status);
 
 	if (status != CPL_ERR_NONE) {
 		log(LOG_ERR, "%s: failed (%u) to close listener for stid %u\n",
 		    __func__, status, stid);
 		return (status);
 	}
 
 	INP_WLOCK(inp);
 	inp = release_lctx(sc, lctx);
 	if (inp != NULL)
 		INP_WUNLOCK(inp);
 
 	return (status);
 }
 
 static void
 done_with_synqe(struct adapter *sc, struct synq_entry *synqe)
 {
 	struct listen_ctx *lctx = synqe->lctx;
 	struct inpcb *inp = lctx->inp;
 	struct l2t_entry *e = &sc->l2t->l2tab[synqe->params.l2t_idx];
 	int ntids;
 
 	INP_WLOCK_ASSERT(inp);
 	ntids = inp->inp_vflag & INP_IPV6 ? 2 : 1;
 
 	remove_tid(sc, synqe->tid, ntids);
 	release_tid(sc, synqe->tid, lctx->ctrlq);
 	t4_l2t_release(e);
 	inp = release_synqe(sc, synqe);
 	if (inp)
 		INP_WUNLOCK(inp);
 }
 
 void
 synack_failure_cleanup(struct adapter *sc, int tid)
 {
 	struct synq_entry *synqe = lookup_tid(sc, tid);
 
 	INP_WLOCK(synqe->lctx->inp);
 	done_with_synqe(sc, synqe);
 }
 
 int
 do_abort_req_synqe(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	struct synq_entry *synqe = lookup_tid(sc, tid);
 	struct listen_ctx *lctx = synqe->lctx;
 	struct inpcb *inp = lctx->inp;
 	struct sge_wrq *ofld_txq;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 
 	KASSERT(opcode == CPL_ABORT_REQ_RSS,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 	KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__));
 
 	CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d",
 	    __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);
 
 	if (negative_advice(cpl->status))
 		return (0);	/* Ignore negative advice */
 
 	INP_WLOCK(inp);
 
 	ofld_txq = &sc->sge.ofld_txq[synqe->params.txq_idx];
 
 	/*
 	 * If we'd initiated an abort earlier the reply to it is responsible for
 	 * cleaning up resources.  Otherwise we tear everything down right here
 	 * right now.  We owe the T4 a CPL_ABORT_RPL no matter what.
 	 */
 	if (synqe->flags & TPF_ABORT_SHUTDOWN) {
 		INP_WUNLOCK(inp);
 		goto done;
 	}
 
 	done_with_synqe(sc, synqe);
 	/* inp lock released by done_with_synqe */
 done:
 	send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
 	return (0);
 }
 
 int
 do_abort_rpl_synqe(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	struct synq_entry *synqe = lookup_tid(sc, tid);
 	struct listen_ctx *lctx = synqe->lctx;
 	struct inpcb *inp = lctx->inp;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 
 	KASSERT(opcode == CPL_ABORT_RPL_RSS,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 	KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__));
 
 	CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d",
 	    __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);
 
 	INP_WLOCK(inp);
 	KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN,
 	    ("%s: wasn't expecting abort reply for synqe %p (0x%x)",
 	    __func__, synqe, synqe->flags));
 
 	done_with_synqe(sc, synqe);
 	/* inp lock released by done_with_synqe */
 
 	return (0);
 }
 
 void
 t4_offload_socket(struct toedev *tod, void *arg, struct socket *so)
 {
 	struct adapter *sc = tod->tod_softc;
 	struct synq_entry *synqe = arg;
 	struct inpcb *inp = sotoinpcb(so);
 	struct toepcb *toep = synqe->toep;
 
 	NET_EPOCH_ASSERT();	/* prevents bad race with accept() */
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(synqe->flags & TPF_SYNQE,
 	    ("%s: %p not a synq_entry?", __func__, arg));
 	MPASS(toep->tid == synqe->tid);
 
 	offload_socket(so, toep);
 	make_established(toep, synqe->iss, synqe->irs, synqe->tcp_opt);
 	toep->flags |= TPF_CPL_PENDING;
 	update_tid(sc, synqe->tid, toep);
 	synqe->flags |= TPF_SYNQE_EXPANDED;
 	inp->inp_flowtype = (inp->inp_vflag & INP_IPV6) ?
 	    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4;
 	inp->inp_flowid = synqe->rss_hash;
 }
 
 static void
 t4opt_to_tcpopt(const struct tcp_options *t4opt, struct tcpopt *to)
 {
 	bzero(to, sizeof(*to));
 
 	if (t4opt->mss) {
 		to->to_flags |= TOF_MSS;
 		to->to_mss = be16toh(t4opt->mss);
 	}
 
 	if (t4opt->wsf > 0 && t4opt->wsf < 15) {
 		to->to_flags |= TOF_SCALE;
 		to->to_wscale = t4opt->wsf;
 	}
 
 	if (t4opt->tstamp)
 		to->to_flags |= TOF_TS;
 
 	if (t4opt->sack)
 		to->to_flags |= TOF_SACKPERM;
 }
 
 static void
 pass_accept_req_to_protohdrs(struct adapter *sc, const struct mbuf *m,
     struct in_conninfo *inc, struct tcphdr *th, uint8_t *iptos)
 {
 	const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
 	const struct ether_header *eh;
 	unsigned int hlen = be32toh(cpl->hdr_len);
 	uintptr_t l3hdr;
 	const struct tcphdr *tcp;
 
 	eh = (const void *)(cpl + 1);
 	if (chip_id(sc) >= CHELSIO_T6) {
 		l3hdr = ((uintptr_t)eh + G_T6_ETH_HDR_LEN(hlen));
 		tcp = (const void *)(l3hdr + G_T6_IP_HDR_LEN(hlen));
 	} else {
 		l3hdr = ((uintptr_t)eh + G_ETH_HDR_LEN(hlen));
 		tcp = (const void *)(l3hdr + G_IP_HDR_LEN(hlen));
 	}
 
 	/* extract TOS (DiffServ + ECN) byte for AccECN */
 	if (iptos) {
 		if (((struct ip *)l3hdr)->ip_v == IPVERSION) {
 			const struct ip *ip = (const void *)l3hdr;
 			*iptos = ip->ip_tos;
 		}
 #ifdef INET6
 		else
 		if (((struct ip *)l3hdr)->ip_v == (IPV6_VERSION >> 4)) {
 			const struct ip6_hdr *ip6 = (const void *)l3hdr;
 			*iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
 		}
 #endif /* INET */
 	}
 
 	if (inc) {
 		bzero(inc, sizeof(*inc));
 		inc->inc_fport = tcp->th_sport;
 		inc->inc_lport = tcp->th_dport;
 		if (((struct ip *)l3hdr)->ip_v == IPVERSION) {
 			const struct ip *ip = (const void *)l3hdr;
 
 			inc->inc_faddr = ip->ip_src;
 			inc->inc_laddr = ip->ip_dst;
 		} else {
 			const struct ip6_hdr *ip6 = (const void *)l3hdr;
 
 			inc->inc_flags |= INC_ISIPV6;
 			inc->inc6_faddr = ip6->ip6_src;
 			inc->inc6_laddr = ip6->ip6_dst;
 		}
 	}
 
 	if (th) {
 		bcopy(tcp, th, sizeof(*th));
 		tcp_fields_to_host(th);		/* just like tcp_input */
 	}
 }
 
 static struct l2t_entry *
 get_l2te_for_nexthop(struct port_info *pi, struct ifnet *ifp,
     struct in_conninfo *inc)
 {
 	struct l2t_entry *e;
 	struct sockaddr_in6 sin6;
 	struct sockaddr *dst = (void *)&sin6;
+	struct nhop_object *nh;
 
 	if (inc->inc_flags & INC_ISIPV6) {
-		struct nhop6_basic nh6;
-
 		bzero(dst, sizeof(struct sockaddr_in6));
 		dst->sa_len = sizeof(struct sockaddr_in6);
 		dst->sa_family = AF_INET6;
 
 		if (IN6_IS_ADDR_LINKLOCAL(&inc->inc6_laddr)) {
 			/* no need for route lookup */
 			e = t4_l2t_get(pi, ifp, dst);
 			return (e);
 		}
 
-		if (fib6_lookup_nh_basic(RT_DEFAULT_FIB, &inc->inc6_faddr,
-		    0, 0, 0, &nh6) != 0)
+		nh = fib6_lookup(RT_DEFAULT_FIB, &inc->inc6_faddr, 0, NHR_NONE, 0);
+		if (nh == NULL)
 			return (NULL);
-		if (nh6.nh_ifp != ifp)
+		if (nh->nh_ifp != ifp)
 			return (NULL);
-		((struct sockaddr_in6 *)dst)->sin6_addr = nh6.nh_addr;
+		if (nh->nh_flags & NHF_GATEWAY)
+			((struct sockaddr_in6 *)dst)->sin6_addr = nh->gw6_sa.sin6_addr;
+		else
+			((struct sockaddr_in6 *)dst)->sin6_addr = inc->inc6_faddr;
 	} else {
-		struct nhop4_basic nh4;
-
 		dst->sa_len = sizeof(struct sockaddr_in);
 		dst->sa_family = AF_INET;
 
-		if (fib4_lookup_nh_basic(RT_DEFAULT_FIB, inc->inc_faddr, 0, 0,
-		    &nh4) != 0)
+		nh = fib4_lookup(RT_DEFAULT_FIB, inc->inc_faddr, 0, NHR_NONE, 0);
+		if (nh == NULL)
 			return (NULL);
-		if (nh4.nh_ifp != ifp)
+		if (nh->nh_ifp != ifp)
 			return (NULL);
-		((struct sockaddr_in *)dst)->sin_addr = nh4.nh_addr;
+		if (nh->nh_flags & NHF_GATEWAY)
+			((struct sockaddr_in *)dst)->sin_addr = nh->gw4_sa.sin_addr;
+		else
+			((struct sockaddr_in *)dst)->sin_addr = inc->inc_faddr;
 	}
 
 	e = t4_l2t_get(pi, ifp, dst);
 	return (e);
 }
 
 static int
 send_synack(struct adapter *sc, struct synq_entry *synqe, uint64_t opt0,
     uint32_t opt2, int tid)
 {
 	struct wrqe *wr;
 	struct cpl_pass_accept_rpl *rpl;
 	struct l2t_entry *e = &sc->l2t->l2tab[synqe->params.l2t_idx];
 
 	wr = alloc_wrqe(is_t4(sc) ? sizeof(struct cpl_pass_accept_rpl) :
 	    sizeof(struct cpl_t5_pass_accept_rpl), &sc->sge.ctrlq[0]);
 	if (wr == NULL)
 		return (ENOMEM);
 	rpl = wrtod(wr);
 
 	if (is_t4(sc))
 		INIT_TP_WR_MIT_CPL(rpl, CPL_PASS_ACCEPT_RPL, tid);
 	else {
 		struct cpl_t5_pass_accept_rpl *rpl5 = (void *)rpl;
 
 		INIT_TP_WR_MIT_CPL(rpl5, CPL_PASS_ACCEPT_RPL, tid);
 		rpl5->iss = htobe32(synqe->iss);
 	}
 	rpl->opt0 = opt0;
 	rpl->opt2 = opt2;
 
 	return (t4_l2t_send(sc, wr, e));
 }
 
 #define REJECT_PASS_ACCEPT_REQ(tunnel)	do { \
 	if (!tunnel) { \
 		m_freem(m); \
 		m = NULL; \
 	} \
 	reject_reason = __LINE__; \
 	goto reject; \
 } while (0)
 
 /*
  * The context associated with a tid entry via insert_tid could be a synq_entry
  * or a toepcb.  The only way CPL handlers can tell is via a bit in these flags.
  */
 CTASSERT(offsetof(struct toepcb, flags) == offsetof(struct synq_entry, flags));
 
 /*
  * Incoming SYN on a listening socket.
  *
  * XXX: Every use of ifp in this routine has a bad race with up/down, toe/-toe,
  * etc.
  */
 static int
 do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	struct toedev *tod;
 	const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
 	unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
 	unsigned int tid = GET_TID(cpl);
 	struct listen_ctx *lctx = lookup_stid(sc, stid);
 	struct inpcb *inp;
 	struct socket *so;
 	struct in_conninfo inc;
 	struct tcphdr th;
 	struct tcpopt to;
 	struct port_info *pi;
 	struct vi_info *vi;
 	struct ifnet *hw_ifp, *ifp;
 	struct l2t_entry *e = NULL;
 	struct synq_entry *synqe = NULL;
 	int reject_reason, v, ntids;
 	uint16_t vid, l2info;
 	struct epoch_tracker et;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 	struct offload_settings settings;
 	uint8_t iptos;
 
 	KASSERT(opcode == CPL_PASS_ACCEPT_REQ,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
 
 	CTR4(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid,
 	    lctx);
 
 	CURVNET_SET(lctx->vnet);	/* before any potential REJECT */
 
 	/*
 	 * Use the MAC index to lookup the associated VI.  If this SYN didn't
 	 * match a perfect MAC filter, punt.
 	 */
 	l2info = be16toh(cpl->l2info);
 	pi = sc->port[G_SYN_INTF(l2info)];
 	if (!(l2info & F_SYN_XACT_MATCH)) {
 		REJECT_PASS_ACCEPT_REQ(false);
 	}
 	for_each_vi(pi, v, vi) {
 		if (vi->xact_addr_filt == G_SYN_MAC_IDX(l2info))
 			goto found;
 	}
 	REJECT_PASS_ACCEPT_REQ(false);
 found:
 	hw_ifp = vi->ifp;	/* the cxgbe ifnet */
 	m->m_pkthdr.rcvif = hw_ifp;
 	tod = TOEDEV(hw_ifp);
 
 	/*
 	 * Don't offload if the peer requested a TCP option that's not known to
 	 * the silicon.  Send the SYN to the kernel instead.
 	 */
 	if (__predict_false(cpl->tcpopt.unknown))
 		REJECT_PASS_ACCEPT_REQ(true);
 
 	/*
 	 * Figure out if there is a pseudo interface (vlan, lagg, etc.)
 	 * involved.  Don't offload if the SYN had a VLAN tag and the vid
 	 * doesn't match anything on this interface.
 	 *
 	 * XXX: lagg support, lagg + vlan support.
 	 */
 	vid = EVL_VLANOFTAG(be16toh(cpl->vlan));
 	if (vid != 0xfff && vid != 0) {
 		ifp = VLAN_DEVAT(hw_ifp, vid);
 		if (ifp == NULL)
 			REJECT_PASS_ACCEPT_REQ(true);
 	} else
 		ifp = hw_ifp;
 
 	/*
 	 * Don't offload if the ifnet that the SYN came in on is not in the same
 	 * vnet as the listening socket.
 	 */
 	if (lctx->vnet != ifp->if_vnet)
 		REJECT_PASS_ACCEPT_REQ(true);
 
 	pass_accept_req_to_protohdrs(sc, m, &inc, &th, &iptos);
 	if (inc.inc_flags & INC_ISIPV6) {
 
 		/* Don't offload if the ifcap isn't enabled */
 		if ((ifp->if_capenable & IFCAP_TOE6) == 0)
 			REJECT_PASS_ACCEPT_REQ(true);
 
 		/*
 		 * SYN must be directed to an IP6 address on this ifnet.  This
 		 * is more restrictive than in6_localip.
 		 */
 		NET_EPOCH_ENTER(et);
 		if (!in6_ifhasaddr(ifp, &inc.inc6_laddr)) {
 			NET_EPOCH_EXIT(et);
 			REJECT_PASS_ACCEPT_REQ(true);
 		}
 
 		ntids = 2;
 	} else {
 
 		/* Don't offload if the ifcap isn't enabled */
 		if ((ifp->if_capenable & IFCAP_TOE4) == 0)
 			REJECT_PASS_ACCEPT_REQ(true);
 
 		/*
 		 * SYN must be directed to an IP address on this ifnet.  This
 		 * is more restrictive than in_localip.
 		 */
 		NET_EPOCH_ENTER(et);
 		if (!in_ifhasaddr(ifp, inc.inc_laddr)) {
 			NET_EPOCH_EXIT(et);
 			REJECT_PASS_ACCEPT_REQ(true);
 		}
 
 		ntids = 1;
 	}
 
 	e = get_l2te_for_nexthop(pi, ifp, &inc);
 	if (e == NULL) {
 		NET_EPOCH_EXIT(et);
 		REJECT_PASS_ACCEPT_REQ(true);
 	}
 
 	/* Don't offload if the 4-tuple is already in use */
 	if (toe_4tuple_check(&inc, &th, ifp) != 0) {
 		NET_EPOCH_EXIT(et);
 		REJECT_PASS_ACCEPT_REQ(false);
 	}
 
 	inp = lctx->inp;		/* listening socket, not owned by TOE */
 	INP_WLOCK(inp);
 
 	/* Don't offload if the listening socket has closed */
 	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
 		INP_WUNLOCK(inp);
 		NET_EPOCH_EXIT(et);
 		REJECT_PASS_ACCEPT_REQ(false);
 	}
 	so = inp->inp_socket;
 	rw_rlock(&sc->policy_lock);
 	settings = *lookup_offload_policy(sc, OPEN_TYPE_PASSIVE, m,
 	    EVL_MAKETAG(0xfff, 0, 0), inp);
 	rw_runlock(&sc->policy_lock);
 	if (!settings.offload) {
 		INP_WUNLOCK(inp);
 		NET_EPOCH_EXIT(et);
 		REJECT_PASS_ACCEPT_REQ(true);	/* Rejected by COP. */
 	}
 
 	synqe = alloc_synqe(sc, lctx, M_NOWAIT);
 	if (synqe == NULL) {
 		INP_WUNLOCK(inp);
 		NET_EPOCH_EXIT(et);
 		REJECT_PASS_ACCEPT_REQ(true);
 	}
 	MPASS(rss->hash_type == RSS_HASH_TCP);
 	synqe->rss_hash = be32toh(rss->hash_val);
 	atomic_store_int(&synqe->ok_to_respond, 0);
 
 	init_conn_params(vi, &settings, &inc, so, &cpl->tcpopt, e->idx,
 	    &synqe->params);
 
 	/*
 	 * If all goes well t4_syncache_respond will get called during
 	 * syncache_add.  Note that syncache_add releases the pcb lock.
 	 */
 	t4opt_to_tcpopt(&cpl->tcpopt, &to);
 	toe_syncache_add(&inc, &to, &th, inp, tod, synqe, iptos);
 
 	if (atomic_load_int(&synqe->ok_to_respond) > 0) {
 		uint64_t opt0;
 		uint32_t opt2;
 
 		opt0 = calc_options0(vi, &synqe->params);
 		opt2 = calc_options2(vi, &synqe->params);
 
 		insert_tid(sc, tid, synqe, ntids);
 		synqe->tid = tid;
 		synqe->syn = m;
 		m = NULL;
 
 		if (send_synack(sc, synqe, opt0, opt2, tid) != 0) {
 			remove_tid(sc, tid, ntids);
 			m = synqe->syn;
 			synqe->syn = NULL;
 			NET_EPOCH_EXIT(et);
 			REJECT_PASS_ACCEPT_REQ(true);
 		}
 
 		CTR6(KTR_CXGBE,
 		    "%s: stid %u, tid %u, synqe %p, opt0 %#016lx, opt2 %#08x",
 		    __func__, stid, tid, synqe, be64toh(opt0), be32toh(opt2));
 	} else {
 		NET_EPOCH_EXIT(et);
 		REJECT_PASS_ACCEPT_REQ(false);
 	}
 
 	NET_EPOCH_EXIT(et);
 	CURVNET_RESTORE();
 	return (0);
 reject:
 	CURVNET_RESTORE();
 	CTR4(KTR_CXGBE, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid,
 	    reject_reason);
 
 	if (e)
 		t4_l2t_release(e);
 	release_tid(sc, tid, lctx->ctrlq);
 	if (synqe) {
 		inp = synqe->lctx->inp;
 		INP_WLOCK(inp);
 		inp = release_synqe(sc, synqe);
 		if (inp)
 			INP_WUNLOCK(inp);
 	}
 
 	if (m) {
 		/*
 		 * The connection request hit a TOE listener but is being passed
 		 * on to the kernel sw stack instead of getting offloaded.
 		 */
 		m_adj(m, sizeof(*cpl));
 		m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID |
 		    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
 		m->m_pkthdr.csum_data = 0xffff;
 		hw_ifp->if_input(hw_ifp, m);
 	}
 
 	return (reject_reason);
 }
 
 static void
 synqe_to_protohdrs(struct adapter *sc, struct synq_entry *synqe,
     const struct cpl_pass_establish *cpl, struct in_conninfo *inc,
     struct tcphdr *th, struct tcpopt *to)
 {
 	uint16_t tcp_opt = be16toh(cpl->tcp_opt);
 	uint8_t iptos;
 
 	/* start off with the original SYN */
 	pass_accept_req_to_protohdrs(sc, synqe->syn, inc, th, &iptos);
 
 	/* modify parts to make it look like the ACK to our SYN|ACK */
 	th->th_flags = TH_ACK;
 	th->th_ack = synqe->iss + 1;
 	th->th_seq = be32toh(cpl->rcv_isn);
 	bzero(to, sizeof(*to));
 	if (G_TCPOPT_TSTAMP(tcp_opt)) {
 		to->to_flags |= TOF_TS;
 		to->to_tsecr = synqe->ts;
 	}
 }
 
 static int
 do_pass_establish(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	struct vi_info *vi;
 	struct ifnet *ifp;
 	const struct cpl_pass_establish *cpl = (const void *)(rss + 1);
 #if defined(KTR) || defined(INVARIANTS)
 	unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
 #endif
 	unsigned int tid = GET_TID(cpl);
 	struct synq_entry *synqe = lookup_tid(sc, tid);
 	struct listen_ctx *lctx = synqe->lctx;
 	struct inpcb *inp = lctx->inp, *new_inp;
 	struct socket *so;
 	struct tcphdr th;
 	struct tcpopt to;
 	struct in_conninfo inc;
 	struct toepcb *toep;
 	struct epoch_tracker et;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 
 	KASSERT(opcode == CPL_PASS_ESTABLISH,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
 	KASSERT(synqe->flags & TPF_SYNQE,
 	    ("%s: tid %u (ctx %p) not a synqe", __func__, tid, synqe));
 
 	CURVNET_SET(lctx->vnet);
 	NET_EPOCH_ENTER(et);	/* for syncache_expand */
 	INP_WLOCK(inp);
 
 	CTR6(KTR_CXGBE,
 	    "%s: stid %u, tid %u, synqe %p (0x%x), inp_flags 0x%x",
 	    __func__, stid, tid, synqe, synqe->flags, inp->inp_flags);
 
 	ifp = synqe->syn->m_pkthdr.rcvif;
 	vi = ifp->if_softc;
 	KASSERT(vi->pi->adapter == sc,
 	    ("%s: vi %p, sc %p mismatch", __func__, vi, sc));
 
 	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
 reset:
 		send_reset_synqe(TOEDEV(ifp), synqe);
 		INP_WUNLOCK(inp);
 		NET_EPOCH_EXIT(et);
 		CURVNET_RESTORE();
 		return (0);
 	}
 
 	KASSERT(synqe->params.rxq_idx == iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0],
 	    ("%s: CPL arrived on unexpected rxq.  %d %d", __func__,
 	    synqe->params.rxq_idx,
 	    (int)(iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0])));
 
 	toep = alloc_toepcb(vi, M_NOWAIT);
 	if (toep == NULL)
 		goto reset;
 	toep->tid = tid;
 	toep->l2te = &sc->l2t->l2tab[synqe->params.l2t_idx];
 	toep->vnet = lctx->vnet;
 	bcopy(&synqe->params, &toep->params, sizeof(toep->params));
 	init_toepcb(vi, toep);
 
 	MPASS(be32toh(cpl->snd_isn) - 1 == synqe->iss);
 	MPASS(be32toh(cpl->rcv_isn) - 1 == synqe->irs);
 	synqe->tcp_opt = cpl->tcp_opt;
 	synqe->toep = toep;
 
 	/* Come up with something that syncache_expand should be ok with. */
 	synqe_to_protohdrs(sc, synqe, cpl, &inc, &th, &to);
 	if (inc.inc_flags & INC_ISIPV6)
 		toep->ce = t4_hold_lip(sc, &inc.inc6_laddr, lctx->ce);
 	so = inp->inp_socket;
 	KASSERT(so != NULL, ("%s: socket is NULL", __func__));
 
 	if (!toe_syncache_expand(&inc, &to, &th, &so) || so == NULL) {
 		free_toepcb(toep);
 		goto reset;
 	}
 
 	/* New connection inpcb is already locked by syncache_expand(). */
 	new_inp = sotoinpcb(so);
 	INP_WLOCK_ASSERT(new_inp);
 	MPASS(so->so_vnet == lctx->vnet);
 
 	/*
 	 * This is for expansion from syncookies.
 	 *
 	 * XXX: we've held the tcbinfo lock throughout so there's no risk of
 	 * anyone accept'ing a connection before we've installed our hooks, but
 	 * this somewhat defeats the purpose of having a tod_offload_socket :-(
 	 */
 	if (__predict_false(!(synqe->flags & TPF_SYNQE_EXPANDED))) {
 		tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0);
 		t4_offload_socket(TOEDEV(ifp), synqe, so);
 	}
 
 	INP_WUNLOCK(new_inp);
 
 	/* Done with the synqe */
 	inp = release_synqe(sc, synqe);
 	if (inp != NULL)
 		INP_WUNLOCK(inp);
 	NET_EPOCH_EXIT(et);
 	CURVNET_RESTORE();
 
 	return (0);
 }
 
 void
 t4_init_listen_cpl_handlers(void)
 {
 
 	t4_register_cpl_handler(CPL_PASS_OPEN_RPL, do_pass_open_rpl);
 	t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl);
 	t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
 	t4_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
 }
 
 void
 t4_uninit_listen_cpl_handlers(void)
 {
 
 	t4_register_cpl_handler(CPL_PASS_OPEN_RPL, NULL);
 	t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, NULL);
 	t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, NULL);
 	t4_register_cpl_handler(CPL_PASS_ESTABLISH, NULL);
 }
 #endif