Index: stable/10/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c
===================================================================
--- stable/10/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c	(revision 330302)
+++ stable/10/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c	(revision 330303)
@@ -1,1811 +1,1810 @@
 /*-
  * Copyright (c) 2012 Chelsio Communications, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 
 #ifdef TCP_OFFLOAD
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/fcntl.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/sockstate.h>
 #include <sys/sockopt.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sockbuf.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/protosw.h>
 #include <sys/priv.h>
 #include <sys/sglist.h>
 #include <sys/taskqueue.h>
 
 #include <net/if.h>
 #include <net/ethernet.h>
 #include <net/route.h>
 
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 
 #include <netinet/ip.h>
 #include <netinet/tcp_var.h>
 #define TCPSTATES
 #include <netinet/tcp_fsm.h>
 #include <netinet/toecore.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <net/route.h>
 
 #include "cxgb_include.h"
 #include "ulp/tom/cxgb_l2t.h"
 #include "ulp/tom/cxgb_tom.h"
 #include "ulp/tom/cxgb_toepcb.h"
 
 /*
  * TCP socket-buffer autosizing tunables.  They are only declared here (per
  * VNET); each gets a V_ accessor macro that wraps VNET().
  */
 VNET_DECLARE(int, tcp_do_autosndbuf);
 #define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf)
 VNET_DECLARE(int, tcp_autosndbuf_inc);
 #define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc)
 VNET_DECLARE(int, tcp_autosndbuf_max);
 #define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max)
 VNET_DECLARE(int, tcp_do_autorcvbuf);
 #define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf)
 VNET_DECLARE(int, tcp_autorcvbuf_inc);
 #define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc)
 VNET_DECLARE(int, tcp_autorcvbuf_max);
 #define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max)
-extern int always_keepalive;
 
 /*
  * For ULP connections HW may add headers, e.g., for digests, that aren't part
  * of the messages sent by the host but that are part of the TCP payload and
  * therefore consume TCP sequence space.  Tx connection parameters that
  * operate in TCP sequence space are affected by the HW additions and need to
  * compensate for them to accurately track TCP sequence numbers. This array
  * contains the compensating extra lengths for ULP packets.  It is indexed by
  * a packet's ULP submode.
  */
 const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};
 
 /*
  * Max receive window supported by HW in bytes.  Only a small part of it can
  * be set through option0, the rest needs to be set through RX_DATA_ACK.
  */
 #define MAX_RCV_WND ((1U << 27) - 1)
 
 /*
  * Min receive window.  We want it to be large enough to accommodate receive
  * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
  */
 #define MIN_RCV_WND (24 * 1024U)
 /* Value for the opt0 TOS field: the inp's IP TOS shifted right by 2, masked. */
 #define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS)
 
 /* Forward declarations of helpers that are defined further down. */
 static void t3_release_offload_resources(struct toepcb *);
 static void send_reset(struct toepcb *toep);
 
 /*
  * Final, driver-side release of a toepcb; run once the last CPL for it has
  * been received.
  *
  * The caller must hold the inp write lock.  By the time this returns the inp
  * has been unlocked, or freed outright by in_pcbrele_wlocked().  The return
  * value is whatever in_pcbrele_wlocked() returned.
  */
 static int
 toepcb_release(struct toepcb *toep)
 {
 	struct inpcb *inp;
 	struct toedev *tod;
 	struct tom_data *td;
 	int freed;
 
 	inp = toep->tp_inp;
 	tod = toep->tp_tod;
 	td = t3_tomdata(tod);
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(!(toep->tp_flags & TP_CPL_DONE),
 	    ("%s: double release?", __func__));
 
 	CTR2(KTR_CXGB, "%s: tid %d", __func__, toep->tp_tid);
 
 	/* No more CPLs will arrive; sever the toepcb -> inp link. */
 	toep->tp_flags |= TP_CPL_DONE;
 	toep->tp_inp = NULL;
 
 	/* Take the toepcb off the TOM's list of active toepcbs. */
 	mtx_lock(&td->toep_list_lock);
 	TAILQ_REMOVE(&td->toep_list, toep, link);
 	mtx_unlock(&td->toep_list_lock);
 
 	/* If the tcpcb side has already detached we are the last user. */
 	if ((toep->tp_flags & TP_ATTACHED) == 0)
 		t3_release_offload_resources(toep);
 
 	/* Drop the hold on the inp; unlock it only if it still exists. */
 	freed = in_pcbrele_wlocked(inp);
 	if (freed == 0)
 		INP_WUNLOCK(inp);
 
 	return (freed);
 }
 
 /*
  * Detach from the tcpcb side only.  The tcpcb is going away, so the toepcb
  * hanging off it is unhooked here.  When the TOE driver has finished with the
  * toepcb as well (TP_CPL_DONE), all offload resources are released.
  */
 static void
 toepcb_detach(struct inpcb *inp)
 {
 	struct tcpcb *tp;
 	struct toepcb *toep;
 
 	KASSERT(inp, ("%s: inp is NULL", __func__));
 	INP_WLOCK_ASSERT(inp);
 
 	tp = intotcpcb(inp);
 	toep = tp->t_toe;
 
 	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
 	KASSERT(toep->tp_flags & TP_ATTACHED, ("%s: not attached", __func__));
 
 	CTR6(KTR_CXGB, "%s: %s %u, toep %p, inp %p, tp %p", __func__,
 	    tp->t_state == TCPS_SYN_SENT ? "atid" : "tid", toep->tp_tid,
 	    toep, inp, tp);
 
 	/* Break the link in both directions. */
 	tp->t_toe = NULL;
 	tp->t_flags &= ~TF_TOE;
 	toep->tp_flags &= ~TP_ATTACHED;
 
 	if ((toep->tp_flags & TP_CPL_DONE) != 0)
 		t3_release_offload_resources(toep);
 }
 
 /*
  * TOE device pcb-detach entry point: detaches the toepcb attached to the
  * tcpcb's inp.  The toedev argument is not used.
  */
 void
 t3_pcb_detach(struct toedev *tod __unused, struct tcpcb *tp)
 {
 	struct inpcb *inp = tp->t_inpcb;
 
 	toepcb_detach(inp);
 }
 
 static int
 alloc_atid(struct tid_info *t, void *ctx)
 {
 	int atid = -1;
 
 	mtx_lock(&t->atid_lock);
 	if (t->afree) {
 		union active_open_entry *p = t->afree;
 
 		atid = (p - t->atid_tab) + t->atid_base;
 		t->afree = p->next;
 		p->ctx = ctx;
 		t->atids_in_use++;
 	}
 	mtx_unlock(&t->atid_lock);
 
 	return (atid);
 }
 
 /*
  * Give an atid back: push its entry onto the head of the atid free list.
  */
 static void
 free_atid(struct tid_info *t, int atid)
 {
 	union active_open_entry *entry;
 
 	entry = atid2entry(t, atid);
 
 	mtx_lock(&t->atid_lock);
 	entry->next = t->afree;
 	t->afree = entry;
 	t->atids_in_use--;
 	mtx_unlock(&t->atid_lock);
 }
 
 /*
  * Record 'ctx' in the tid table slot for 'tid' and bump the in-use count.
  */
 void
 insert_tid(struct tom_data *td, void *ctx, unsigned int tid)
 {
 	struct tid_info *tids;
 
 	tids = &td->tid_maps;
 	tids->tid_tab[tid] = ctx;
 	atomic_add_int(&tids->tids_in_use, 1);
 }
 
 /*
  * Replace the context stored for 'tid'; the in-use count is not touched.
  */
 void
 update_tid(struct tom_data *td, void *ctx, unsigned int tid)
 {
 	struct tid_info *tids;
 
 	tids = &td->tid_maps;
 	tids->tid_tab[tid] = ctx;
 }
 
 /*
  * Clear the tid table slot for 'tid' and drop the in-use count by one.
  */
 void
 remove_tid(struct tom_data *td, unsigned int tid)
 {
 	struct tid_info *tids;
 
 	tids = &td->tid_maps;
 	tids->tid_tab[tid] = NULL;
 	atomic_add_int(&tids->tids_in_use, -1);
 }
 
 /* use ctx as a next pointer in the tid release list */
 void
 queue_tid_release(struct toedev *tod, unsigned int tid)
 {
 	struct tom_data *td = t3_tomdata(tod);
 	void **p = &td->tid_maps.tid_tab[tid];
 	struct adapter *sc = tod->tod_softc;
 
 	mtx_lock(&td->tid_release_lock);
 	*p = td->tid_release_list;
 	td->tid_release_list = p;
 	if (!*p)
 		taskqueue_enqueue(sc->tq, &td->tid_release_task);
 	mtx_unlock(&td->tid_release_lock);
 }
 
 /*
  * Populate a TID_RELEASE WR.
  */
 static inline void
 mk_tid_release(struct cpl_tid_release *cpl, unsigned int tid)
 {
 
 	cpl->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
 	OPCODE_TID(cpl) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
 }
 
 /*
  * Release 'tid' back to the hardware.
  *
  * A TID_RELEASE WR is sent on 'qset' and the tid is removed from the tid
  * table.  If no mbuf is available for the WR, the release is deferred via
  * queue_tid_release().
  */
 void
 release_tid(struct toedev *tod, unsigned int tid, int qset)
 {
 	struct tom_data *td = t3_tomdata(tod);
 	struct adapter *sc = tod->tod_softc;
 	struct mbuf *m;
 	struct cpl_tid_release *cpl;
 #ifdef INVARIANTS
 	struct tid_info *t = &td->tid_maps;
 #endif
 
 	/* tid is unsigned, so only the upper bound needs checking. */
 	KASSERT(tid < t->ntids,
 	    ("%s: tid=%u, ntids=%d", __func__, tid, t->ntids));
 
 	m = M_GETHDR_OFLD(qset, CPL_PRIORITY_CONTROL, cpl);
 	if (m) {
 		mk_tid_release(cpl, tid);
 		t3_offload_tx(sc, m);
 		remove_tid(td, tid);
 	} else
 		queue_tid_release(tod, tid);
 
 }
 
 /*
  * Task handler that drains the deferred tid release list.
  *
  * Each list element is a tid_tab slot whose content is used as the link to
  * the next element; the tid is recovered from the slot's offset within
  * tid_tab.  For every element a TID_RELEASE WR is sent (on qset 0) and the
  * tid is removed from the tid table.  The release lock is dropped around the
  * transmit and retaken before the list is examined again.
  *
  * 'data' is the tom_data; 'pending' is not used.
  */
 void
 t3_process_tid_release_list(void *data, int pending)
 {
 	struct mbuf *m;
 	struct tom_data *td = data;
 	struct adapter *sc = td->tod.tod_softc;
 
 	mtx_lock(&td->tid_release_lock);
 	while (td->tid_release_list) {
 		void **p = td->tid_release_list;
 		unsigned int tid = p - td->tid_maps.tid_tab;
 		struct cpl_tid_release *cpl;
 
 		/* Unlink the head element. */
 		td->tid_release_list = (void **)*p;
 		m = M_GETHDR_OFLD(0, CPL_PRIORITY_CONTROL, cpl); /* qs 0 here */
 		/*
 		 * NOTE(review): the element has already been unlinked above,
 		 * so on allocation failure this tid is neither released nor
 		 * left on the list -- confirm whether it should be put back.
 		 */
 		if (m == NULL)
 			break;	/* XXX: who reschedules the release task? */
 		mtx_unlock(&td->tid_release_lock);
 		mk_tid_release(cpl, tid);
 		t3_offload_tx(sc, m);
 		remove_tid(td, tid);
 		mtx_lock(&td->tid_release_lock);
 	}
 	mtx_unlock(&td->tid_release_lock);
 }
 
 /*
  * Send a CLOSE_CON_REQ for the connection.  Does nothing if one has already
  * been sent (TP_FIN_SENT), so at most one request goes out per toepcb.
  */
 static void
 close_conn(struct adapter *sc, struct toepcb *toep)
 {
 	struct cpl_close_con_req *req;
 	struct mbuf *m;
 
 	if ((toep->tp_flags & TP_FIN_SENT) != 0)
 		return;
 
 	m = M_GETHDR_OFLD(toep->tp_qset, CPL_PRIORITY_DATA, req);
 	if (m == NULL)
 		CXGB_UNIMPLEMENTED();
 
 	/* Build the request. */
 	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
 	req->wr.wrh_lo = htonl(V_WR_TID(toep->tp_tid));
 	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, toep->tp_tid));
 	req->rsvd = 0;
 
 	/* Mark the FIN as sent before the request is handed off. */
 	toep->tp_flags |= TP_FIN_SENT;
 	t3_offload_tx(sc, m);
 }
 
 static inline void
 make_tx_data_wr(struct socket *so, struct tx_data_wr *req, int len,
     struct mbuf *tail)
 {
 	struct tcpcb *tp = so_sototcpcb(so);
 	struct toepcb *toep = tp->t_toe;
 	struct sockbuf *snd;
 
 	inp_lock_assert(tp->t_inpcb);
 	snd = so_sockbuf_snd(so);
 
 	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
 	req->wr.wrh_lo = htonl(V_WR_TID(toep->tp_tid));
 	/* len includes the length of any HW ULP additions */
 	req->len = htonl(len);
 	req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
 	/* V_TX_ULP_SUBMODE sets both the mode and submode */
 	req->flags = htonl(V_TX_ULP_SUBMODE(toep->tp_ulp_mode) | V_TX_URG(0) |
 	    V_TX_SHOVE(!(tp->t_flags & TF_MORETOCOME) && (tail ? 0 : 1)));
 	req->sndseq = htonl(tp->snd_nxt);
 	if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
 		struct adapter *sc = toep->tp_tod->tod_softc;
 		int cpu_idx = sc->rrss_map[toep->tp_qset];
 
 		req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
 		    V_TX_CPU_IDX(cpu_idx));
 
 		/* Sendbuffer is in units of 32KB. */
 		if (V_tcp_do_autosndbuf && snd->sb_flags & SB_AUTOSIZE) 
 			req->param |= htonl(V_TX_SNDBUF(VNET(tcp_autosndbuf_max) >> 15));
 		else
 			req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15));
 
 		toep->tp_flags |= TP_DATASENT;
 	}
 }
 
 /*
  * TOM_XXX_DUPLICATION sgl_len, calc_tx_descs, calc_tx_descs_ofld, mbuf_wrs, etc.
  * TOM_XXX_MOVE to some common header file.
  */
 /*
  * IMM_LEN: # of bytes that can be tx'd as immediate data.  There are 16 flits
  * in a tx desc; subtract 3 for tx_data_wr (including the WR header), and 1 more
  * for the second gen bit flit.  This leaves us with 12 flits.
  *
  * descs_to_sgllen: # of SGL entries that can fit into the given # of tx descs.
  * The first desc has a tx_data_wr (which includes the WR header), the rest have
  * the WR header only.  All descs have the second gen bit flit.
  *
  * sgllen_to_descs: # of tx descs used up by an sgl of given length.  The first
  * desc has a tx_data_wr (which includes the WR header), the rest have the WR
  * header only.  All descs have the second gen bit flit.
  *
  * flits_to_sgllen: # of SGL entries that can be fit in the given # of flits.
  *
  */
 #define IMM_LEN 96
 /* Indexed by number of tx descs (0 .. TX_MAX_DESC). */
 static int descs_to_sgllen[TX_MAX_DESC + 1] = {0, 8, 17, 26, 35};
 /* Indexed by number of SGL entries. */
 static int sgllen_to_descs[TX_MAX_SEGS] = {
 	0, 1, 1, 1, 1, 1, 1, 1, 1, 2,	/*  0 -  9 */
 	2, 2, 2, 2, 2, 2, 2, 2, 3, 3,	/* 10 - 19 */
 	3, 3, 3, 3, 3, 3, 3, 4, 4, 4,	/* 20 - 29 */
 	4, 4, 4, 4, 4, 4		/* 30 - 35 */
 };
 /* Compiled out: flits -> SGL entries table, not used in this file. */
 #if 0
 static int flits_to_sgllen[TX_DESC_FLITS + 1] = {
 	0, 0, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 8, 8, 9, 10, 10
 };
 #endif
 /* The tables above account for a second gen bit flit in every desc. */
 #if SGE_NUM_GENBITS != 2
 #error "SGE_NUM_GENBITS really must be 2"
 #endif
 
 /*
  * Push unsent data from the socket's send buffer to the hardware.
  *
  * Starting at the send buffer's current send pointer, data is packed into
  * TX_DATA work requests for as long as WR credits (tp_wr_avail) and data
  * remain.  Small mbufs are copied into an immediate-data WR; otherwise an
  * SGL covering as many mbufs as fit in the available descriptors is built.
  * Every WR is recorded with enqueue_wr() and handed to l2t_send().
  *
  * 'req_completion' requests a completion on the WR when it is the only
  * unacknowledged one; a completion is also requested once at least half of
  * tp_wr_max is unacknowledged.
  *
  * If everything has been sent and TP_SEND_FIN is set, the connection is
  * closed via close_conn().
  *
  * The inp must be locked by the caller.  Returns the number of payload bytes
  * handed to the hardware by this call.
  */
 int
 t3_push_frames(struct socket *so, int req_completion)
 {
 	struct tcpcb *tp = so_sototcpcb(so);
 	struct toepcb *toep = tp->t_toe;
 	struct mbuf *m0, *sndptr, *m;
 	struct toedev *tod = toep->tp_tod;
 	struct adapter *sc = tod->tod_softc;
 	int bytes, ndesc, total_bytes = 0, mlen;
 	struct sockbuf *snd;
 	struct sglist *sgl;
 	struct ofld_hdr *oh;
 	caddr_t dst;
 	struct tx_data_wr *wr;
 
 	inp_lock_assert(tp->t_inpcb);
 
 	snd = so_sockbuf_snd(so);
 	SOCKBUF_LOCK(snd);
 
 	/*
 	 * Autosize the send buffer.
 	 */
 	if (snd->sb_flags & SB_AUTOSIZE && VNET(tcp_do_autosndbuf)) {
 		if (snd->sb_cc >= (snd->sb_hiwat / 8 * 7) &&
 		    snd->sb_cc < VNET(tcp_autosndbuf_max)) {
 			if (!sbreserve_locked(snd, min(snd->sb_hiwat +
 			    VNET(tcp_autosndbuf_inc), VNET(tcp_autosndbuf_max)),
 			    so, curthread))
 				snd->sb_flags &= ~SB_AUTOSIZE;
 		}
 	}
 
 	/*
 	 * tp_m_last remembers the tail mbuf when everything had been sent; in
 	 * that case resume with whatever has been appended after it.
 	 */
 	if (toep->tp_m_last && toep->tp_m_last == snd->sb_sndptr)
 		sndptr = toep->tp_m_last->m_next;
 	else
 		sndptr = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb;
 
 	/* Nothing to send or no WRs available for sending data */
 	if (toep->tp_wr_avail == 0 || sndptr == NULL)
 		goto out;
 
 	/* Something to send and at least 1 WR available */
 	while (toep->tp_wr_avail && sndptr != NULL) {
 
 		m0 = m_gethdr(M_NOWAIT, MT_DATA);
 		if (m0 == NULL)
 			break;
 		oh = mtod(m0, struct ofld_hdr *);
 		wr = (void *)(oh + 1);
 		dst = (void *)(wr + 1);
 
 		m0->m_pkthdr.len = m0->m_len = sizeof(*oh) + sizeof(*wr);
 		oh->flags = V_HDR_CTRL(CPL_PRIORITY_DATA) | F_HDR_DF |
 		    V_HDR_QSET(toep->tp_qset);
 
 		/*
 		 * Try to construct an immediate data WR if possible.  Stuff as
 		 * much data into it as possible, one whole mbuf at a time.
 		 */
 		mlen = sndptr->m_len;
 		ndesc = bytes = 0;
 		while (mlen <= IMM_LEN - bytes) {
 			bcopy(sndptr->m_data, dst, mlen);
 			bytes += mlen;
 			dst += mlen;
 
 			if (!(sndptr = sndptr->m_next))
 				break;
 			mlen = sndptr->m_len;
 		}
 
 		if (bytes) {
 
 			/* Was able to fit 'bytes' bytes in an immediate WR */
 
 			ndesc = 1;
 			make_tx_data_wr(so, wr, bytes, sndptr);
 
 			m0->m_len += bytes;
 			m0->m_pkthdr.len = m0->m_len;
 
 		} else {
 			int wr_avail = min(toep->tp_wr_avail, TX_MAX_DESC);
 
 			/* Need to make an SGL */
 
 			sgl = sglist_alloc(descs_to_sgllen[wr_avail], M_NOWAIT);
 			if (sgl == NULL) {
 				/* m0 was never queued; don't leak it. */
 				m_freem(m0);
 				break;
 			}
 
 			for (m = sndptr; m != NULL; m = m->m_next) {
 				if ((mlen = m->m_len) > 0) {
 					if (sglist_append(sgl, m->m_data, mlen))
 					    break;
 				}
 				bytes += mlen;
 			}
 			sndptr = m;
 			if (bytes == 0) {
 				sglist_free(sgl);
 				/* m0 was never queued; don't leak it. */
 				m_freem(m0);
 				break;
 			}
 			ndesc = sgllen_to_descs[sgl->sg_nseg];
 			oh->flags |= F_HDR_SGL;
 			oh->sgl = sgl;
 			make_tx_data_wr(so, wr, bytes, sndptr);
 		}
 
 		oh->flags |= V_HDR_NDESC(ndesc);
 		oh->plen = bytes;
 
 		/* Advance the send pointer past what was just queued. */
 		snd->sb_sndptr = sndptr;
 		snd->sb_sndptroff += bytes;
 		if (sndptr == NULL) {
 			/* Everything sent: park the send pointer on the tail. */
 			snd->sb_sndptr = snd->sb_mbtail;
 			snd->sb_sndptroff -= snd->sb_mbtail->m_len;
 			toep->tp_m_last = snd->sb_mbtail;
 		} else
 			toep->tp_m_last = NULL;
 
 		total_bytes += bytes;
 
 		toep->tp_wr_avail -= ndesc;
 		toep->tp_wr_unacked += ndesc;
 
 		if ((req_completion && toep->tp_wr_unacked == ndesc) ||
 		    toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
 			wr->wr.wrh_hi |= htonl(F_WR_COMPL);
 			toep->tp_wr_unacked = 0;
 		}
 
 		enqueue_wr(toep, m0);
 		l2t_send(sc, m0, toep->tp_l2t);
 	}
 out:
 	SOCKBUF_UNLOCK(snd);
 
 	if (sndptr == NULL && (toep->tp_flags & TP_SEND_FIN))
 		close_conn(sc, toep);
 
 	return (total_bytes);
 }
 
 /*
  * Return 'credits' receive credits to the hardware with an RX_DATA_ACK.
  *
  * Returns the number of credits actually sent: 'credits' on success, 0 if no
  * mbuf could be allocated for the message.
  */
 static int
 send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits)
 {
 	const uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
 	struct cpl_rx_data_ack *req;
 	struct mbuf *m;
 
 	m = M_GETHDR_OFLD(toep->tp_qset, CPL_PRIORITY_CONTROL, req);
 	if (m != NULL) {
 		req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
 		req->wr.wrh_lo = 0;
 		OPCODE_TID(req) =
 		    htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
 		req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
 		t3_offload_tx(sc, m);
 		return (credits);
 	}
 
 	return (0);
 }
 
 /*
  * Called after data has been read from the socket.  Works out how many bytes
  * left the receive buffer since the last call, accumulates them as rx
  * credits, and returns the credits to the hardware once enough have built up
  * or the receive window is getting small.
  */
 void
 t3_rcvd(struct toedev *tod, struct tcpcb *tp)
 {
 	struct adapter *sc = tod->tod_softc;
 	struct inpcb *inp = tp->t_inpcb;
 	struct socket *so = inp->inp_socket;
 	struct sockbuf *so_rcv = &so->so_rcv;
 	struct toepcb *toep = tp->t_toe;
 	int must_send, returned;
 
 	INP_WLOCK_ASSERT(inp);
 
 	/* Whatever is no longer in the sockbuf has been consumed. */
 	SOCKBUF_LOCK(so_rcv);
 	KASSERT(toep->tp_enqueued >= so_rcv->sb_cc,
 	    ("%s: so_rcv->sb_cc > enqueued", __func__));
 	toep->tp_rx_credits += toep->tp_enqueued - so_rcv->sb_cc;
 	toep->tp_enqueued = so_rcv->sb_cc;
 	SOCKBUF_UNLOCK(so_rcv);
 
 	must_send = toep->tp_rx_credits + 16384 >= tp->rcv_wnd;
 	if (!must_send && toep->tp_rx_credits < 15 * 1024)
 		return;
 
 	returned = send_rx_credits(sc, toep, toep->tp_rx_credits);
 	toep->tp_rx_credits -= returned;
 	tp->rcv_wnd += returned;
 	tp->rcv_adv += returned;
 }
 
 /*
  * Handler for RX_URG_NOTIFY CPL messages.  The notification is only logged;
  * the mbuf is freed and 0 is returned.
  */
 static int
 do_rx_urg_notify(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
 {
 	struct adapter *sc = qs->adap;
 	struct tom_data *td = sc->tom_softc;
 	struct cpl_rx_urg_notify *hdr = mtod(m, void *);
 	unsigned int tid = GET_TID(hdr);
 	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
 
 	/* Terminate the line, like the other log() calls in this file. */
 	log(LOG_ERR, "%s: tid %u inp %p\n", __func__, tid, toep->tp_inp);
 
 	m_freem(m);
 	return (0);
 }
 
 /*
  * Ask for a FIN to be sent on the connection.  TP_SEND_FIN is set on the
  * toepcb and t3_push_frames() is invoked; it closes the connection once no
  * unsent data remains.  Always returns 0.
  */
 int
 t3_send_fin(struct toedev *tod, struct tcpcb *tp)
 {
 	struct toepcb *toep = tp->t_toe;
 	struct inpcb *inp = tp->t_inpcb;
 	struct socket *so = inp_inpcbtosocket(inp);
 #ifdef KTR
 	unsigned int tid = toep->tp_tid;
 #endif
 
 	INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(inp);
 
 	CTR4(KTR_CXGB, "%s: tid %d, toep %p, flags %x", __func__, tid, toep,
 	    toep->tp_flags);
 
 	toep->tp_flags |= TP_SEND_FIN;
 	(void)t3_push_frames(so, 1);
 
 	return (0);
 }
 
 /*
  * TOE output entry point: push pending send-buffer data for the connection,
  * requesting a completion.  Always returns 0.
  */
 int
 t3_tod_output(struct toedev *tod, struct tcpcb *tp)
 {
 	struct socket *so;
 
 	so = tp->t_inpcb->inp_socket;
 	(void)t3_push_frames(so, 1);
 
 	return (0);
 }
 
 /*
  * Pick the index into the adapter's MTU table to use for a connection.
  *
  * The MSS comes from the 4-tuple (tcp_mssopt) when 'inc' is given, otherwise
  * from 'pmss'; a positive 'pmss' also acts as an upper bound.  The result is
  * the last table index whose MTU does not exceed mss + 40.
  */
 int
 find_best_mtu_idx(struct adapter *sc, struct in_conninfo *inc, int pmss)
 {
 	unsigned short *mtus = &sc->params.mtus[0];
 	int idx, mss;
 
 	KASSERT(inc != NULL || pmss > 0,
 	    ("%s: at least one of inc/pmss must be specified", __func__));
 
 	if (inc != NULL)
 		mss = tcp_mssopt(inc);
 	else
 		mss = pmss;
 	if (pmss > 0 && mss > pmss)
 		mss = pmss;
 
 	for (idx = 0; idx < NMTUS - 1; idx++) {
 		if (mtus[idx + 1] > mss + 40)
 			break;
 	}
 
 	return (idx);
 }
 
 /*
  * Free every work request still sitting on the toepcb's wr_list, including
  * the sglist attached to WRs that carry one (F_HDR_SGL).
  */
 static inline void
 purge_wr_queue(struct toepcb *toep)
 {
 	struct ofld_hdr *oh;
 	struct mbuf *m;
 
 	for (;;) {
 		m = mbufq_dequeue(&toep->wr_list);
 		if (m == NULL)
 			break;
 
 		oh = mtod(m, struct ofld_hdr *);
 		if ((oh->flags & F_HDR_SGL) != 0)
 			sglist_free(oh->sgl);
 		m_freem(m);
 	}
 }
 
 /*
  * Give back everything cxgb(4)/T3 holds for an offloaded connection (queued
  * WRs, the L2T entry, the TID) and free the toepcb itself.
  */
 static void
 t3_release_offload_resources(struct toepcb *toep)
 {
 	struct toedev *tod = toep->tp_tod;
 	struct tom_data *td = t3_tomdata(tod);
 
 	/*
 	 * The toepcb must already have been detached from the system's inp by
 	 * the TOM before its offload resources are released.
 	 */
 	if (toep->tp_inp != NULL)
 		panic("%s: inp %p still attached to toepcb %p",
 		    __func__, toep->tp_inp, toep);
 
 	/* Not all WR credits are back, so WRs are still queued. */
 	if (toep->tp_wr_avail != toep->tp_wr_max)
 		purge_wr_queue(toep);
 
 	if (toep->tp_l2t != NULL) {
 		l2t_release(td->l2t, toep->tp_l2t);
 		toep->tp_l2t = NULL;
 	}
 
 	if (toep->tp_tid >= 0)
 		release_tid(tod, toep->tp_tid, toep->tp_qset);
 
 	toepcb_free(toep);
 }
 
 /*
  * Determine the receive window size for a socket.
  *
  * The window is the space available in the receive buffer, clamped to the
  * range [MIN_RCV_WND, MAX_RCV_WND].  The receive sockbuf must be locked.
  */
 unsigned long
 select_rcv_wnd(struct socket *so)
 {
 	unsigned long wnd;
 	long space;
 
 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 
 	/*
 	 * Keep the space signed for the lower-bound check: a negative value
 	 * (buffer over its limits) would otherwise turn into a huge unsigned
 	 * number and end up as the maximum window instead of the minimum.
 	 */
 	space = sbspace(&so->so_rcv);
 	if (space < (long)MIN_RCV_WND)
 		wnd = MIN_RCV_WND;
 	else
 		wnd = (unsigned long)space;
 
 	if (wnd > MAX_RCV_WND)
 		wnd = MAX_RCV_WND;
 
 	return (wnd);
 }
 
 /*
  * Choose the receive window scale: the smallest shift (at most
  * TCP_MAX_WINSHIFT) for which TCP_MAXWIN << shift covers
  * min(sb_max, MAX_RCV_WND).
  */
 int
 select_rcv_wscale(void)
 {
 	unsigned long space;
 	int wscale;
 
 	space = sb_max;
 	if (space > MAX_RCV_WND)
 		space = MAX_RCV_WND;
 
 	for (wscale = 0; wscale < TCP_MAX_WINSHIFT; wscale++) {
 		if ((TCP_MAXWIN << wscale) >= space)
 			break;
 	}
 
 	return (wscale);
 }
 
 
 /*
  * Hook a socket up for TCP offload: mark both sockbufs SB_NOCOALESCE, point
  * the tcpcb at the toepcb and TOE device, take a reference on the inp on
  * behalf of the toepcb, and put the toepcb on the TOM's list.
  */
 void
 offload_socket(struct socket *so, struct toepcb *toep)
 {
 	struct toedev *tod = toep->tp_tod;
 	struct tom_data *td = t3_tomdata(tod);
 	struct inpcb *inp = sotoinpcb(so);
 	struct tcpcb *tp = intotcpcb(inp);
 
 	INP_WLOCK_ASSERT(inp);
 
 	/* Socket: no coalescing in either sockbuf. */
 	SOCKBUF_LOCK(&so->so_snd);
 	so_sockbuf_snd(so)->sb_flags |= SB_NOCOALESCE;
 	SOCKBUF_UNLOCK(&so->so_snd);
 
 	SOCKBUF_LOCK(&so->so_rcv);
 	so_sockbuf_rcv(so)->sb_flags |= SB_NOCOALESCE;
 	SOCKBUF_UNLOCK(&so->so_rcv);
 
 	/* TCP PCB: record the TOE device and the toepcb. */
 	tp->tod = toep->tp_tod;
 	tp->t_toe = toep;
 	tp->t_flags |= TF_TOE;
 
 	/* toepcb: point back at the inp and hold a reference on it. */
 	toep->tp_inp = inp;
 	toep->tp_flags |= TP_ATTACHED;
 	in_pcbref(inp);
 
 	/* Make the toepcb visible on the active list. */
 	mtx_lock(&td->toep_list_lock);
 	TAILQ_INSERT_HEAD(&td->toep_list, toep, link);
 	mtx_unlock(&td->toep_list_lock);
 }
 
 /*
  * Reverse what offload_socket() did.  This is _not_ the normal way to
  * "unoffload" a socket.
  *
  * NOTE(review): unlike offload_socket(), the sockbuf flags are cleared here
  * without taking the sockbuf locks -- confirm that this is safe for callers.
  */
 void
 undo_offload_socket(struct socket *so)
 {
 	struct inpcb *inp = sotoinpcb(so);
 	struct tcpcb *tp = intotcpcb(inp);
 	struct toepcb *toep = tp->t_toe;
 	struct toedev *tod = toep->tp_tod;
 	struct tom_data *td = t3_tomdata(tod);
 
 	INP_WLOCK_ASSERT(inp);
 
 	/* Socket */
 	so_sockbuf_snd(so)->sb_flags &= ~SB_NOCOALESCE;
 	so_sockbuf_rcv(so)->sb_flags &= ~SB_NOCOALESCE;
 
 	/* TCP PCB */
 	tp->tod = NULL;
 	tp->t_toe = NULL;
 	tp->t_flags &= ~TF_TOE;
 
 	/* toepcb; the caller still holds the inp, so it cannot be freed here. */
 	toep->tp_inp = NULL;
 	toep->tp_flags &= ~TP_ATTACHED;
 	if (in_pcbrele_wlocked(inp) != 0)
 		panic("%s: inp freed.", __func__);
 
 	/* Active list */
 	mtx_lock(&td->toep_list_lock);
 	TAILQ_REMOVE(&td->toep_list, toep, link);
 	mtx_unlock(&td->toep_list_lock);
 }
 
 /*
  * Compute the high word of opt0, returned in big-endian byte order.
  *
  * Socket could be a listening socket, and we may not have a toepcb at all at
  * this time.  Both 'so' and 'e' are optional: without a socket the Nagle and
  * keepalive bits are left clear, and without an L2T entry the L2T index and
  * tx channel are left clear.
  */
 uint32_t
 calc_opt0h(struct socket *so, int mtu_idx, int rscale, struct l2t_entry *e)
 {
 	uint32_t opt0h = F_TCAM_BYPASS | V_WND_SCALE(rscale) |
 	    V_MSS_IDX(mtu_idx);
 
 	if (so != NULL) {
 		struct inpcb *inp = sotoinpcb(so);
 		struct tcpcb *tp = intotcpcb(inp);
 		/* Keepalive if globally forced or requested on this socket. */
-		int keepalive = always_keepalive ||
+		int keepalive = tcp_always_keepalive ||
 		    so_options_get(so) & SO_KEEPALIVE;
 
 		/* Nagle is enabled unless TF_NODELAY is set. */
 		opt0h |= V_NAGLE((tp->t_flags & TF_NODELAY) == 0);
 		opt0h |= V_KEEP_ALIVE(keepalive != 0);
 	}
 
 	if (e != NULL)
 		opt0h |= V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx);
 
 	return (htobe32(opt0h));
 }
 
 /*
  * Compute the low word of opt0, returned in big-endian byte order: ULP mode
  * "none", the given receive buffer size, and -- when a socket is supplied --
  * the TOS derived from its inp.
  */
 uint32_t
 calc_opt0l(struct socket *so, int rcv_bufsize)
 {
 	uint32_t opt0l;
 
 	opt0l = V_ULP_MODE(ULP_MODE_NONE) | V_RCV_BUFSIZ(rcv_bufsize);
 
 	KASSERT(rcv_bufsize <= M_RCV_BUFSIZ,
 	    ("%s: rcv_bufsize (%d) is too high", __func__, rcv_bufsize));
 
 	/* The socket is optional because no one cares about IP TOS. */
 	if (so != NULL)
 		opt0l |= V_TOS(INP_TOS(sotoinpcb(so)));
 
 	return (htobe32(opt0l));
 }
 
 /*
  * Map the status carried by an ACT_OPEN_RPL to an errno value.  Statuses
  * that are not listed map to EIO.
  */
 static int
 act_open_rpl_status_to_errno(int status)
 {
 	int error;
 
 	switch (status) {
 	case CPL_ERR_CONN_RESET:
 		error = ECONNREFUSED;
 		break;
 	case CPL_ERR_ARP_MISS:
 		error = EHOSTUNREACH;
 		break;
 	case CPL_ERR_CONN_TIMEDOUT:
 		error = ETIMEDOUT;
 		break;
 	case CPL_ERR_TCAM_FULL:
 		error = EAGAIN;
 		break;
 	case CPL_ERR_CONN_EXIST:
 		log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
 		error = EAGAIN;
 		break;
 	default:
 		error = EIO;
 		break;
 	}
 
 	return (error);
 }
 
 /*
  * Tell whether a failed active open has allocated a TID.  It has for every
  * status except TCAM_FULL, CONN_EXIST and ARP_MISS.
  */
 static inline int
 act_open_has_tid(int status)
 {
 	switch (status) {
 	case CPL_ERR_TCAM_FULL:
 	case CPL_ERR_CONN_EXIST:
 	case CPL_ERR_ARP_MISS:
 		return (0);
 	default:
 		return (1);
 	}
 }
 
 /*
  * Handler for ACT_OPEN_RPL CPL messages: the active open failed.
  *
  * The atid is freed, any TID the hardware allocated for the attempt is
  * queued for release, and the failure (converted to an errno) is reported
  * through toe_connect_failed().  The toepcb is then released.  Always
  * returns 0 and frees the mbuf.
  */
 static int
 do_act_open_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
 {
 	struct adapter *sc = qs->adap;
 	struct tom_data *td = sc->tom_softc;
 	struct toedev *tod = &td->tod;
 	struct cpl_act_open_rpl *rpl = mtod(m, void *);
 	unsigned int atid = G_TID(ntohl(rpl->atid));
 	struct toepcb *toep = lookup_atid(&td->tid_maps, atid);
 	struct inpcb *inp = toep->tp_inp;
 	int s = rpl->status, rc;
 
 	CTR3(KTR_CXGB, "%s: atid %u, status %u ", __func__, atid, s);
 
 	/* tp_tid held the atid until now; mark it as having no tid. */
 	free_atid(&td->tid_maps, atid);
 	toep->tp_tid = -1;
 
 	if (act_open_has_tid(s))
 		queue_tid_release(tod, GET_TID(rpl));
 
 	rc = act_open_rpl_status_to_errno(s);
 	/* The tcbinfo lock is taken for every error other than EAGAIN. */
 	if (rc != EAGAIN)
 		INP_INFO_RLOCK(&V_tcbinfo);
 	INP_WLOCK(inp);
 	toe_connect_failed(tod, inp, rc);
 	toepcb_release(toep);	/* unlocks inp */
 	if (rc != EAGAIN)
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 
 	m_freem(m);
 	return (0);
 }
 
 /*
  * Send an active open request.
  *
  * State of affairs on entry:
  * soisconnecting (so_state |= SS_ISCONNECTING)
  * tcbinfo not locked (this has changed - used to be WLOCKed)
  * inp WLOCKed
  * tp->t_state = TCPS_SYN_SENT
  * rtalloc1, RT_UNLOCK on rt.
  *
  * Returns 0 once the ACT_OPEN_REQ has been handed to l2t_send()
  * successfully.  Every failure path releases whatever was acquired so far
  * and returns ENOMEM.
  */
 int
 t3_connect(struct toedev *tod, struct socket *so,
     struct rtentry *rt, struct sockaddr *nam)
 {
 	struct mbuf *m = NULL;
 	struct l2t_entry *e = NULL;
 	struct tom_data *td = t3_tomdata(tod);
 	struct adapter *sc = tod->tod_softc;
 	struct cpl_act_open_req *cpl;
 	struct inpcb *inp = sotoinpcb(so);
 	struct tcpcb *tp = intotcpcb(inp);
 	struct toepcb *toep;
 	int atid = -1, mtu_idx, rscale, cpu_idx, qset;
 	struct sockaddr *gw;
 	struct ifnet *ifp = rt->rt_ifp;
 	struct port_info *pi = ifp->if_softc;	/* XXX wrong for VLAN etc. */
 
 	INP_WLOCK_ASSERT(inp);
 
 	toep = toepcb_alloc(tod);
 	if (toep == NULL)
 		goto failed;
 
 	atid = alloc_atid(&td->tid_maps, toep);
 	if (atid < 0)
 		goto failed;
 
 	/* Pick one of the port's qsets at random. */
 	qset = pi->first_qset + (arc4random() % pi->nqsets);
 
 	m = M_GETHDR_OFLD(qset, CPL_PRIORITY_CONTROL, cpl);
 	if (m == NULL)
 		goto failed;
 
 	/* Resolve via the gateway for gateway routes, else the peer itself. */
 	gw = rt->rt_flags & RTF_GATEWAY ? rt->rt_gateway : nam;
 	e = t3_l2t_get(pi, ifp, gw);
 	if (e == NULL)
 		goto failed;
 
 	toep->tp_l2t = e;
 	toep->tp_tid = atid;	/* used to double check response */
 	toep->tp_qset = qset;
 
 	SOCKBUF_LOCK(&so->so_rcv);
 	/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
 	toep->tp_rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
 	SOCKBUF_UNLOCK(&so->so_rcv);
 
 	offload_socket(so, toep);
 
 	/*
 	 * The kernel sets request_r_scale based on sb_max whereas we need to
 	 * take hardware's MAX_RCV_WND into account too.  This is normally a
 	 * no-op as MAX_RCV_WND is much larger than the default sb_max.
 	 */
 	if (tp->t_flags & TF_REQ_SCALE)
 		rscale = tp->request_r_scale = select_rcv_wscale();
 	else
 		rscale = 0;
 	mtu_idx = find_best_mtu_idx(sc, &inp->inp_inc, 0);
 	cpu_idx = sc->rrss_map[qset];
 
 	/* Build the ACT_OPEN_REQ. */
 	cpl->wr.wrh_hi = htobe32(V_WR_OP(FW_WROPCODE_FORWARD));
 	cpl->wr.wrh_lo = 0;
 	OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid)); 
 	inp_4tuple_get(inp, &cpl->local_ip, &cpl->local_port, &cpl->peer_ip,
 	    &cpl->peer_port);
 	cpl->opt0h = calc_opt0h(so, mtu_idx, rscale, e);
 	cpl->opt0l = calc_opt0l(so, toep->tp_rx_credits);
 	cpl->params = 0;
 	cpl->opt2 = calc_opt2(cpu_idx);
 
 	CTR5(KTR_CXGB, "%s: atid %u (%s), toep %p, inp %p", __func__,
 	    toep->tp_tid, tcpstates[tp->t_state], toep, inp);
 
 	if (l2t_send(sc, m, e) == 0)
 		return (0);
 
 	/* The send failed after the socket had been set up for offload. */
 	undo_offload_socket(so);
 
 failed:
 	CTR5(KTR_CXGB, "%s: FAILED, atid %d, toep %p, l2te %p, mbuf %p",
 	    __func__, atid, toep, e, m);
 
 	/* Release only what was actually acquired before the failure. */
 	if (atid >= 0)
 		free_atid(&td->tid_maps, atid);
 
 	if (e)
 		l2t_release(td->l2t, e);
 
 	if (toep)
 		toepcb_free(toep);
 
 	/*
 	 * NOTE(review): this also runs when l2t_send() failed -- confirm that
 	 * l2t_send() leaves ownership of the mbuf with the caller on error.
 	 */
 	m_freem(m);
 
 	return (ENOMEM);
 }
 
 /*
  * Send an ABORT_REQ message.  Cannot fail.  This routine makes sure we do not
  * send multiple ABORT_REQs for the same connection and also that we do not try
  * to send a message after the connection has closed.
  *
  * The tcbinfo read lock and the inp write lock must be held.
  */
 static void
 send_reset(struct toepcb *toep)
 {
 
 	struct cpl_abort_req *req;
 	unsigned int tid = toep->tp_tid;
 	struct inpcb *inp = toep->tp_inp;
 	struct socket *so = inp->inp_socket;
 	struct tcpcb *tp = intotcpcb(inp);
 	struct toedev *tod = toep->tp_tod;
 	struct adapter *sc = tod->tod_softc;
 	struct mbuf *m;
 
 	INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(inp);
 
 	CTR4(KTR_CXGB, "%s: tid %d, toep %p (%x)", __func__, tid, toep,
 	    toep->tp_flags);
 
 	/* An abort has already been started for this connection. */
 	if (toep->tp_flags & TP_ABORT_SHUTDOWN)
 		return;
 
 	toep->tp_flags |= (TP_ABORT_RPL_PENDING | TP_ABORT_SHUTDOWN);
 
 	/* Purge the send queue */
 	sbflush(so_sockbuf_snd(so));
 	purge_wr_queue(toep);
 
 	m = M_GETHDR_OFLD(toep->tp_qset, CPL_PRIORITY_DATA, req);
 	if (m == NULL)
 		CXGB_UNIMPLEMENTED();
 
 	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
 	req->wr.wrh_lo = htonl(V_WR_TID(tid));
 	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
 	req->rsvd0 = htonl(tp->snd_nxt);
 	/* Set when no TX_DATA WR has been sent on this connection yet. */
 	req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
 	req->cmd = CPL_ABORT_SEND_RST;
 
 	/* In SYN_SENT the request is parked on out_of_order_queue instead. */
 	if (tp->t_state == TCPS_SYN_SENT)
 		mbufq_tail(&toep->out_of_order_queue, m); /* defer */
 	else
 		l2t_send(sc, m, toep->tp_l2t);
 }
 
 /*
  * TOE device send-RST entry point: aborts the connection through the toepcb
  * attached to the tcpcb.  The toedev argument is not used.  Always returns 0.
  */
 int
 t3_send_rst(struct toedev *tod __unused, struct tcpcb *tp)
 {
 	struct toepcb *toep = tp->t_toe;
 
 	send_reset(toep);
 
 	return (0);
 }
 
 /*
  * Handler for RX_DATA CPL messages.
  *
  * Strips the CPL header, updates the tcpcb's receive state (rcv_nxt,
  * rcv_wnd, t_rcvtime), possibly grows the receive buffer, and appends the
  * payload to the socket's receive buffer.  Data is dropped when the toepcb
  * has no inp or the inp has been dropped or is in timewait.  If the socket
  * can no longer receive, the connection is dropped with ECONNRESET.
  * Always returns 0; the mbuf is either consumed or freed.
  */
 static int
 do_rx_data(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
 {
 	struct adapter *sc = qs->adap;
 	struct tom_data *td = sc->tom_softc;
 	struct cpl_rx_data *hdr = mtod(m, void *);
 	unsigned int tid = GET_TID(hdr);
 	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
 	struct inpcb *inp = toep->tp_inp;
 	struct tcpcb *tp;
 	struct socket *so;
 	struct sockbuf *so_rcv;	
 
 	/* Advance over CPL */
 	m_adj(m, sizeof(*hdr));
 
 	/* XXX: revisit.  This comes from the T4 TOM */
 	if (__predict_false(inp == NULL)) {
 		/*
 		 * do_pass_establish failed and must be attempting to abort the
 		 * connection.  Meanwhile, the T4 has sent us data for such a
 		 * connection.
 		 */
 #ifdef notyet
 		KASSERT(toepcb_flag(toep, TPF_ABORT_SHUTDOWN),
 		    ("%s: inp NULL and tid isn't being aborted", __func__));
 #endif
 		m_freem(m);
 		return (0);
 	}
 
 	INP_WLOCK(inp);
 	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) {
 		CTR4(KTR_CXGB, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
 		    __func__, tid, m->m_pkthdr.len, inp->inp_flags);
 		INP_WUNLOCK(inp);
 		m_freem(m);
 		return (0);
 	}
 
 	/* Track the delayed-ACK mode reported in the CPL header. */
 	if (__predict_false(hdr->dack_mode != toep->tp_delack_mode))
 		toep->tp_delack_mode = hdr->dack_mode;
 
 	tp = intotcpcb(inp);
 
 #ifdef INVARIANTS
 	if (__predict_false(tp->rcv_nxt != be32toh(hdr->seq))) {
 		log(LOG_ERR,
 		    "%s: unexpected seq# %x for TID %u, rcv_nxt %x\n",
 		    __func__, be32toh(hdr->seq), toep->tp_tid, tp->rcv_nxt);
 	}
 #endif
 	/* Account for the payload in the receive sequence space and window. */
 	tp->rcv_nxt += m->m_pkthdr.len;
 	KASSERT(tp->rcv_wnd >= m->m_pkthdr.len,
 	    ("%s: negative window size", __func__));
 	tp->rcv_wnd -= m->m_pkthdr.len;
 	tp->t_rcvtime = ticks;
 
 	so  = inp->inp_socket;
 	so_rcv = &so->so_rcv;
 	SOCKBUF_LOCK(so_rcv);
 
 	if (__predict_false(so_rcv->sb_state & SBS_CANTRCVMORE)) {
 		CTR3(KTR_CXGB, "%s: tid %u, excess rx (%d bytes)",
 		    __func__, tid, m->m_pkthdr.len);
 		SOCKBUF_UNLOCK(so_rcv);
 		INP_WUNLOCK(inp);
 
 		/*
 		 * The inp lock is dropped and retaken so that the tcbinfo
 		 * lock can be acquired first.
 		 */
 		INP_INFO_RLOCK(&V_tcbinfo);
 		INP_WLOCK(inp);
 		tp = tcp_drop(tp, ECONNRESET);
 		if (tp)
 			INP_WUNLOCK(inp);
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 
 		m_freem(m);
 		return (0);
 	}
 
 	/* receive buffer autosize */
 	if (so_rcv->sb_flags & SB_AUTOSIZE &&
 	    V_tcp_do_autorcvbuf &&
 	    so_rcv->sb_hiwat < V_tcp_autorcvbuf_max &&
 	    (m->m_pkthdr.len > (sbspace(so_rcv) / 8 * 7) || tp->rcv_wnd < 32768)) {
 		unsigned int hiwat = so_rcv->sb_hiwat;
 		unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc,
 		    V_tcp_autorcvbuf_max);
 
 		/* On success the added buffer space becomes rx credits. */
 		if (!sbreserve_locked(so_rcv, newsize, so, NULL))
 			so_rcv->sb_flags &= ~SB_AUTOSIZE;
 		else
 			toep->tp_rx_credits += newsize - hiwat;
 	}
 
 	toep->tp_enqueued += m->m_pkthdr.len;
 	sbappendstream_locked(so_rcv, m);
 	sorwakeup_locked(so);
 	SOCKBUF_UNLOCK_ASSERT(so_rcv);
 
 	INP_WUNLOCK(inp);
 	return (0);
 }
 
 /*
  * Handler for PEER_CLOSE CPL messages.
  *
  * Resolves the tid carried in the message to its toepcb, marks the socket as
  * unable to receive more data, bumps rcv_nxt by one and moves the connection
  * to its next TCP state.  If an ABORT_RPL is pending on the toepcb the message
  * is only traced.  The mbuf is freed and 0 is returned on every path.
  */
 static int
 do_peer_close(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
 {
 	struct adapter *sc = qs->adap;
 	struct tom_data *td = sc->tom_softc;
 	const struct cpl_peer_close *hdr = mtod(m, void *);
 	unsigned int tid = GET_TID(hdr);
 	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
 	struct inpcb *inp = toep->tp_inp;
 	struct tcpcb *tp;
 	struct socket *so;
 
 	/* tcbinfo read lock first, then the inp write lock. */
 	INP_INFO_RLOCK(&V_tcbinfo);
 	INP_WLOCK(inp);
 	tp = intotcpcb(inp);
 
 	CTR5(KTR_CXGB, "%s: tid %u (%s), toep_flags 0x%x, inp %p", __func__,
 	    tid, tp ? tcpstates[tp->t_state] : "no tp" , toep->tp_flags, inp);
 
 	/* An abort is in flight; leave the connection state alone. */
 	if (toep->tp_flags & TP_ABORT_RPL_PENDING)
 		goto done;
 
 	so = inp_inpcbtosocket(inp);
 
 	socantrcvmore(so);
 	tp->rcv_nxt++;
 
 	switch (tp->t_state) {
 	case TCPS_SYN_RECEIVED:
 		tp->t_starttime = ticks;
 		/* FALLTHROUGH */
 	case TCPS_ESTABLISHED:
 		tp->t_state = TCPS_CLOSE_WAIT;
 		break;
 	case TCPS_FIN_WAIT_1:
 		tp->t_state = TCPS_CLOSING;
 		break;
 	case TCPS_FIN_WAIT_2:
 		/* tcp_twstart() returns with the inp unlocked (asserted below). */
 		tcp_twstart(tp);
 		INP_UNLOCK_ASSERT(inp);	/* safe, we have a ref on the  inp */
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 
 		/*
 		 * Re-take the inp lock for toepcb_release().  This path returns
 		 * without unlocking, so toepcb_release() presumably drops the
 		 * lock itself -- TODO confirm against its definition.
 		 */
 		INP_WLOCK(inp);
 		toepcb_release(toep);	/* no more CPLs expected */
 
 		m_freem(m);
 		return (0);
 	default:
 		/* Unexpected state: log it and leave t_state unchanged. */
 		log(LOG_ERR, "%s: TID %u received PEER_CLOSE in bad state %d\n",
 		    __func__, toep->tp_tid, tp->t_state);
 	}
 
 done:
 	INP_WUNLOCK(inp);
 	INP_INFO_RUNLOCK(&V_tcbinfo);
 
 	m_freem(m);
 	return (0);
 }
 
 /*
  * Handler for CLOSE_CON_RPL CPL messages.  peer ACK to our FIN received.
  *
  * Updates snd_una from the reply (excluding the FIN) and advances the TCP
  * state machine.  If an ABORT_RPL is pending on the toepcb the message is only
  * traced.  The mbuf is freed and 0 is returned on every path.
  */
 static int
 do_close_con_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
 {
 	struct adapter *sc = qs->adap;
 	struct tom_data *td = sc->tom_softc;
 	const struct cpl_close_con_rpl *rpl = mtod(m, void *);
 	unsigned int tid = GET_TID(rpl);
 	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
 	struct inpcb *inp = toep->tp_inp;
 	struct tcpcb *tp;
 	struct socket *so;
 
 	/* tcbinfo read lock first, then the inp write lock. */
 	INP_INFO_RLOCK(&V_tcbinfo);
 	INP_WLOCK(inp);
 	tp = intotcpcb(inp);
 
 	CTR4(KTR_CXGB, "%s: tid %u (%s), toep_flags 0x%x", __func__, tid,
 	    tp ? tcpstates[tp->t_state] : "no tp", toep->tp_flags);
 
 	/* An abort is in flight; leave the connection state alone. */
 	if ((toep->tp_flags & TP_ABORT_RPL_PENDING))
 		goto done;
 
 	so = inp_inpcbtosocket(inp);
 	tp->snd_una = ntohl(rpl->snd_nxt) - 1;  /* exclude FIN */
 
 	switch (tp->t_state) {
 	case TCPS_CLOSING:
 		tcp_twstart(tp);
 		/*
 		 * Shared tail for TCPS_CLOSING and TCPS_LAST_ACK: both arrive
 		 * here with the inp unlocked, as the assertion below checks.
 		 */
 release:
 		INP_UNLOCK_ASSERT(inp);	/* safe, we have a ref on the  inp */
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 
 		/*
 		 * This path returns without unlocking, so toepcb_release()
 		 * presumably drops the inp lock -- TODO confirm.
 		 */
 		INP_WLOCK(inp);
 		toepcb_release(toep);	/* no more CPLs expected */
 	
 		m_freem(m);
 		return (0);
 	case TCPS_LAST_ACK:
 		/* A non-NULL return from tcp_close() leaves the inp locked. */
 		if (tcp_close(tp))
 			INP_WUNLOCK(inp);
 		goto release;
 
 	case TCPS_FIN_WAIT_1:
 		if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
 			soisdisconnected(so);
 		tp->t_state = TCPS_FIN_WAIT_2;
 		break;
 	default:
 		/* Unexpected state: log it and leave t_state unchanged. */
 		log(LOG_ERR,
 		    "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
 		    __func__, toep->tp_tid, tp->t_state);
 	}
 
 done:
 	INP_WUNLOCK(inp);
 	INP_INFO_RUNLOCK(&V_tcbinfo);
 
 	m_freem(m);
 	return (0);
 }
 
 /*
  * Handler for SMT_WRITE_RPL CPL messages.  Logs any status other than
  * CPL_ERR_NONE, then frees the mbuf.  Always returns 0.
  */
 static int
 do_smt_write_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
 {
 	struct cpl_smt_write_rpl *reply;
 
 	reply = mtod(m, void *);
 	if (reply->status != CPL_ERR_NONE)
 		log(LOG_ERR,
 		    "Unexpected SMT_WRITE_RPL status %u for entry %u\n",
 		    reply->status, GET_TID(reply));
 
 	m_freem(m);
 
 	return (0);
 }
 
 /*
  * Handler for SET_TCB_RPL CPL messages.  Logs any status other than
  * CPL_ERR_NONE, then frees the mbuf.  Always returns 0.
  */
 static int
 do_set_tcb_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
 {
 	struct cpl_set_tcb_rpl *reply;
 
 	reply = mtod(m, void *);
 	if (reply->status != CPL_ERR_NONE)
 		log(LOG_ERR, "Unexpected SET_TCB_RPL status %u for tid %u\n",
 		    reply->status, GET_TID(reply));
 
 	m_freem(m);
 
 	return (0);
 }
 
 /*
  * Handle an ABORT_RPL_RSS CPL message.
  *
  * Replies with status CPL_ERR_ABORT_FAILED are dropped without touching the
  * toepcb.  Replies for synq entries are handed to do_abort_rpl_synqe().  For
  * everything else the reply only matters while TP_ABORT_RPL_PENDING is set:
  * the first reply is recorded with TP_ABORT_RPL_RCVD, the second one clears
  * both flags and releases the toepcb.  The mbuf is freed and 0 is returned on
  * every path handled here.
  */
 static int
 do_abort_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
 {
 	struct adapter *sc = qs->adap;
 	struct tom_data *td = sc->tom_softc;
 	const struct cpl_abort_rpl_rss *rpl = mtod(m, void *);
 	unsigned int tid = GET_TID(rpl);
 	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
 	struct inpcb *inp;
 
 	/*
 	 * Ignore replies to post-close aborts indicating that the abort was
 	 * requested too late.  These connections are terminated when we get
 	 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
 	 * arrives the TID is either no longer used or it has been recycled.
 	 */
 	if (rpl->status == CPL_ERR_ABORT_FAILED) {
 		m_freem(m);
 		return (0);
 	}
 
 	if (toep->tp_flags & TP_IS_A_SYNQ_ENTRY)
 		return (do_abort_rpl_synqe(qs, r, m));
 
 	CTR4(KTR_CXGB, "%s: tid %d, toep %p, status %d", __func__, tid, toep,
 	    rpl->status);
 
 	inp = toep->tp_inp;
 	INP_WLOCK(inp);
 
 	if (!(toep->tp_flags & TP_ABORT_RPL_PENDING)) {
 		/* Not waiting for a reply: nothing to do, but drop the lock. */
 		INP_WUNLOCK(inp);
 	} else if (!(toep->tp_flags & TP_ABORT_RPL_RCVD)) {
 		/* First reply: remember it and wait for the second one. */
 		toep->tp_flags |= TP_ABORT_RPL_RCVD;
 		INP_WUNLOCK(inp);
 	} else {
 		/*
 		 * Second reply: clear both flags (and only those two).  The
 		 * inp lock is handed to toepcb_release(), as in the other
 		 * handlers in this file that call it with the lock held.
 		 */
 		toep->tp_flags &= ~(TP_ABORT_RPL_RCVD | TP_ABORT_RPL_PENDING);
 		toepcb_release(toep);	/* no more CPLs expected */
 	}
 
 	m_freem(m);
 	return (0);
 }
 
 /*
  * Convert the status code of an ABORT_REQ into a FreeBSD error code.
  */
 static int
 abort_status_to_errno(struct tcpcb *tp, int abort_reason)
 {
 	switch (abort_reason) {
 	case CPL_ERR_BAD_SYN:
 	case CPL_ERR_CONN_RESET:
 		return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
 	case CPL_ERR_XMIT_TIMEDOUT:
 	case CPL_ERR_PERSIST_TIMEDOUT:
 	case CPL_ERR_FINWAIT2_TIMEDOUT:
 	case CPL_ERR_KEEPALIVE_TIMEDOUT:
 		return (ETIMEDOUT);
 	default:
 		return (EIO);
 	}
 }
 
 /*
  * Returns 1 if the status of an ABORT_REQ_RSS message is one of the negative
  * advice codes, 0 otherwise.
  */
 static inline int
 is_neg_adv_abort(unsigned int status)
 {
 	if (status == CPL_ERR_RTX_NEG_ADVICE)
 		return (1);
 
 	return (status == CPL_ERR_PERSIST_NEG_ADVICE);
 }
 
 void
 send_abort_rpl(struct toedev *tod, int tid, int qset)
 {
 	struct mbuf *reply;
 	struct cpl_abort_rpl *rpl;
 	struct adapter *sc = tod->tod_softc;
 
 	reply = M_GETHDR_OFLD(qset, CPL_PRIORITY_DATA, rpl);
 	if (!reply)
 		CXGB_UNIMPLEMENTED();
 
 	rpl->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
 	rpl->wr.wrh_lo = htonl(V_WR_TID(tid));
 	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
 	rpl->cmd = CPL_ABORT_NO_RST;
 
 	t3_offload_tx(sc, reply);
 }
 
 /*
  * Handle an ABORT_REQ_RSS CPL message.  If we're waiting for an ABORT_RPL we
  * ignore this request except that we need to reply to it.
  *
  * Negative advice is traced and dropped.  Requests for synq entries go to
  * do_abort_req_synqe().  Otherwise the first request seen for a toepcb is only
  * recorded (TP_ABORT_REQ_RCVD | TP_ABORT_SHUTDOWN); the second one either
  * defers to the pending ABORT_RPL or closes the connection, and in both cases
  * an ABORT_RPL is sent back.  Returns 0 on every path handled here.
  */
 static int
 do_abort_req(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
 {
 	struct adapter *sc = qs->adap;
 	struct tom_data *td = sc->tom_softc;
 	struct toedev *tod = &td->tod;
 	const struct cpl_abort_req_rss *req = mtod(m, void *);
 	unsigned int tid = GET_TID(req);
 	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct socket *so;
 	/* Read now; toep may be released before the reply is sent below. */
 	int qset = toep->tp_qset;
 
 	if (is_neg_adv_abort(req->status)) {
 		CTR4(KTR_CXGB, "%s: negative advice %d for tid %u (%x)",
 		    __func__, req->status, tid, toep->tp_flags);
 		m_freem(m);
 		return (0);
 	}
 
 	if (toep->tp_flags & TP_IS_A_SYNQ_ENTRY)
 		return (do_abort_req_synqe(qs, r, m));
 
 	inp = toep->tp_inp;
 	INP_INFO_RLOCK(&V_tcbinfo);	/* for tcp_close */
 	INP_WLOCK(inp);
 
 	tp = intotcpcb(inp);
 	so = inp->inp_socket;
 
 	CTR6(KTR_CXGB, "%s: tid %u (%s), toep %p (%x), status %d",
 	    __func__, tid, tcpstates[tp->t_state], toep, toep->tp_flags,
 	    req->status);
 
 	/* First request for this toepcb: just note it and wait for the next. */
 	if (!(toep->tp_flags & TP_ABORT_REQ_RCVD)) {
 		toep->tp_flags |= TP_ABORT_REQ_RCVD;
 		toep->tp_flags |= TP_ABORT_SHUTDOWN;
 		INP_WUNLOCK(inp);
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		m_freem(m);
 		return (0);
 	}
 	toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
 
 	/*
 	 * If we'd sent a reset on this toep, we'll ignore this and clean up in
 	 * the T3's reply to our reset instead.
 	 */
 	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
 		toep->tp_flags |= TP_ABORT_RPL_SENT;
 		INP_WUNLOCK(inp);
 	} else {
 		so_error_set(so, abort_status_to_errno(tp, req->status));
 		/* A NULL return means tcp_close() dropped the inp lock. */
 		tp = tcp_close(tp);
 		if (tp == NULL)
 			INP_WLOCK(inp);	/* re-acquire */
 		/* No unlock follows; presumably done by toepcb_release(). */
 		toepcb_release(toep);	/* no more CPLs expected */
 	}
 	INP_INFO_RUNLOCK(&V_tcbinfo);
 
 	send_abort_rpl(tod, tid, qset);
 	m_freem(m);
 	return (0);
 }
 
 /*
  * Apply the TCP options reported by the hardware (tcpopt, host byte order) to
  * the tcpcb: maximum segment size, timestamps, SACK and window scaling.
  */
 static void
 assign_rxopt(struct tcpcb *tp, uint16_t tcpopt)
 {
 	struct toepcb *toep = tp->t_toe;
 	struct adapter *sc = toep->tp_tod->tod_softc;
 	const int both_scale = TF_RCVD_SCALE | TF_REQ_SCALE;
 
 	/* MSS is the MTU from the adapter's table less 40 bytes. */
 	tp->t_maxopd = sc->params.mtus[G_TCPOPT_MSS(tcpopt)] - 40;
 	tp->t_maxseg = tp->t_maxopd;
 
 	if (G_TCPOPT_TSTAMP(tcpopt)) {
 		/* TF_REQ_TSTMP is forcibly set along with TF_RCVD_TSTMP. */
 		tp->t_flags |= TF_RCVD_TSTMP | TF_REQ_TSTMP;
 		tp->ts_recent = 0;		/* XXX */
 		tp->ts_recent_age = tcp_ts_getticks();
 		tp->t_maxseg -= TCPOLEN_TSTAMP_APPA;
 	}
 
 	if (!G_TCPOPT_SACK(tcpopt))
 		tp->t_flags &= ~TF_SACK_PERMIT;
 	else
 		tp->t_flags |= TF_SACK_PERMIT;
 
 	if (G_TCPOPT_WSCALE_OK(tcpopt))
 		tp->t_flags |= TF_RCVD_SCALE;
 
 	/* Scaling takes effect only if it was both requested and received. */
 	if ((tp->t_flags & both_scale) == both_scale) {
 		tp->rcv_scale = tp->request_r_scale;
 		tp->snd_scale = G_TCPOPT_SND_WSCALE(tcpopt);
 	}
 }
 
 /*
  * The ISS and IRS are from after the exchange of SYNs and are off by 1.
  *
  * Moves the connection on socket so to TCPS_ESTABLISHED and initializes the
  * tcpcb's receive and send sequence state from the values supplied by the
  * hardware.  cpl_iss, cpl_irs and cpl_tcpopt are converted from big-endian
  * here.  The caller must hold the inp write lock (asserted).
  */
 void
 make_established(struct socket *so, uint32_t cpl_iss, uint32_t cpl_irs,
     uint16_t cpl_tcpopt)
 {
 	struct inpcb *inp = sotoinpcb(so);
 	struct tcpcb *tp = intotcpcb(inp);
 	struct toepcb *toep = tp->t_toe;
 	long bufsize;
 	uint32_t iss = be32toh(cpl_iss) - 1;	/* true ISS */
 	uint32_t irs = be32toh(cpl_irs) - 1;	/* true IRS */
 	uint16_t tcpopt = be16toh(cpl_tcpopt);
 
 	INP_WLOCK_ASSERT(inp);
 
 	tp->t_state = TCPS_ESTABLISHED;
 	tp->t_starttime = ticks;
 	TCPSTAT_INC(tcps_connects);
 
 	CTR4(KTR_CXGB, "%s tid %u, toep %p, inp %p", tcpstates[tp->t_state],
 	    toep->tp_tid, toep, inp);
 
 	/* Receive side: irs must be set before tcp_rcvseqinit() runs. */
 	tp->irs = irs;
 	tcp_rcvseqinit(tp);
 	/* tp_rx_credits is scaled by 1024 to get the window in bytes. */
 	tp->rcv_wnd = toep->tp_rx_credits << 10;
 	tp->rcv_adv += tp->rcv_wnd;
 	tp->last_ack_sent = tp->rcv_nxt;
 
 	/*
 	 * If we were unable to send all rx credits via opt0, save the remainder
 	 * in rx_credits so that they can be handed over with the next credit
 	 * update.
 	 */
 	SOCKBUF_LOCK(&so->so_rcv);
 	bufsize = select_rcv_wnd(so);
 	SOCKBUF_UNLOCK(&so->so_rcv);
 	toep->tp_rx_credits = bufsize - tp->rcv_wnd;
 
 	/* Send side: iss must be set before tcp_sendseqinit() runs. */
 	tp->iss = iss;
 	tcp_sendseqinit(tp);
 	tp->snd_una = iss + 1;
 	tp->snd_nxt = iss + 1;
 	tp->snd_max = iss + 1;
 
 	assign_rxopt(tp, tcpopt);
 	soisconnected(so);
 }
 
 /*
  * Drain the toepcb's out-of-order queue: patch the now-known TID into each
  * deferred CPL message and transmit it.  The inp lock must be held.
  */
 static void
 fixup_and_send_ofo(struct toepcb *toep)
 {
 	struct toedev *tod = toep->tp_tod;
 	struct adapter *sc = tod->tod_softc;
 	unsigned int tid = toep->tp_tid;
 	struct ofld_hdr *oh;
 	struct cpl_close_con_req *p;
 	struct mbuf *m;
 
 	inp_lock_assert(toep->tp_inp);
 
 	for (;;) {
 		m = mbufq_dequeue(&toep->out_of_order_queue);
 		if (m == NULL)
 			break;
 
 		/*
 		 * Messages of several types may be queued, but only fields
 		 * common to all of them are touched, so viewing each one as a
 		 * close_con_req is sufficient.
 		 */
 		oh = mtod(m, void *);
 		p = (void *)(oh + 1);
 
 		p->wr.wrh_lo = htonl(V_WR_TID(tid));
 		OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
 		t3_offload_tx(sc, m);
 	}
 }
 
 /*
  * Process a CPL_ACT_ESTABLISH message.
  *
  * Looks up the toepcb by the atid in the message, frees the atid, switches
  * the toepcb over to the tid assigned by the hardware and, unless the inp has
  * already been dropped, marks the connection established and flushes any CPLs
  * that were deferred for lack of a tid.  Frees the mbuf and returns 0.
  */
 static int
 do_act_establish(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
 {
 	struct adapter *sc = qs->adap;
 	struct tom_data *td = sc->tom_softc;
 	struct cpl_act_establish *req = mtod(m, void *);
 	unsigned int tid = GET_TID(req);
 	unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
 	struct toepcb *toep = lookup_atid(&td->tid_maps, atid);
 	struct inpcb *inp = toep->tp_inp;
 	struct tcpcb *tp;
 	struct socket *so; 
 
 	CTR3(KTR_CXGB, "%s: atid %u, tid %u", __func__, atid, tid);
 
 	free_atid(&td->tid_maps, atid);
 
 	INP_WLOCK(inp);
 	tp = intotcpcb(inp);
 
 	KASSERT(toep->tp_qset == qs->idx,
 	    ("%s qset mismatch %d %d", __func__, toep->tp_qset, qs->idx));
 	/* Until now tp_tid held the atid. */
 	KASSERT(toep->tp_tid == atid,
 	    ("%s atid mismatch %d %d", __func__, toep->tp_tid, atid));
 
 	toep->tp_tid = tid;
 	insert_tid(td, toep, tid);
 
 	if (inp->inp_flags & INP_DROPPED) {
 		/* socket closed by the kernel before hw told us it connected */
 		send_reset(toep);
 		goto done;
 	}
 
 	KASSERT(tp->t_state == TCPS_SYN_SENT,
 	    ("TID %u expected TCPS_SYN_SENT, found %d.", tid, tp->t_state));
 
 	so = inp->inp_socket;
 	/* Fields are passed as received; make_established() byte-swaps them. */
 	make_established(so, req->snd_isn, req->rcv_isn, req->tcp_opt);
 
 	/*
 	 * Now that we finally have a TID send any CPL messages that we had to
 	 * defer for lack of a TID.
 	 */
 	if (mbufq_len(&toep->out_of_order_queue))
 		fixup_and_send_ofo(toep);
 
 done:
 	INP_WUNLOCK(inp);
 	m_freem(m);
 	return (0);
 }
 
 /*
  * Process an acknowledgment of WR completion.  Advance snd_una and send the
  * next batch of work requests from the write queue.
  *
  * Returns the acknowledged credits to tp_wr_avail, retires the completed work
  * requests from the head of the toepcb's WR list, drops the bytes they
  * carried from the socket's send buffer and pushes more frames if unsent data
  * remains.  Takes and releases the inp write lock; always frees m.
  */
 static void
 wr_ack(struct toepcb *toep, struct mbuf *m)
 {
 	struct inpcb *inp = toep->tp_inp;
 	struct tcpcb *tp;
 	struct cpl_wr_ack *hdr = mtod(m, void *);
 	struct socket *so;
 	unsigned int credits = ntohs(hdr->credits);
 	u32 snd_una = ntohl(hdr->snd_una);
 	int bytes = 0;
 	struct sockbuf *snd;
 	struct mbuf *p;
 	struct ofld_hdr *oh;
 
 	inp_wlock(inp);
 	tp = intotcpcb(inp);
 	so = inp->inp_socket;
 	toep->tp_wr_avail += credits;
 	/* Clamp tp_wr_unacked to the number of credits still outstanding. */
 	if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail)
 		toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail;
 
 	/* Retire pending WRs, oldest first, until the credits are used up. */
 	while (credits) {
 		p = peek_wr(toep);
 
 		if (__predict_false(!p)) {
 			CTR5(KTR_CXGB, "%s: %u extra WR_ACK credits, "
 			    "tid %u, state %u, wr_avail %u", __func__, credits,
 			    toep->tp_tid, tp->t_state, toep->tp_wr_avail);
 
 			log(LOG_ERR, "%u WR_ACK credits for TID %u with "
 			    "nothing pending, state %u wr_avail=%u\n",
 			    credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail);
 			break;
 		}
 
 		oh = mtod(p, struct ofld_hdr *);
 
 		KASSERT(credits >= G_HDR_NDESC(oh->flags),
 		    ("%s: partial credits?  %d %d", __func__, credits,
 		    G_HDR_NDESC(oh->flags)));
 
 		dequeue_wr(toep);
 		credits -= G_HDR_NDESC(oh->flags);
 		bytes += oh->plen;
 
 		if (oh->flags & F_HDR_SGL)
 			sglist_free(oh->sgl);
 		m_freem(p);
 	}
 
 	/* A snd_una behind the one already recorded is ignored. */
 	if (__predict_false(SEQ_LT(snd_una, tp->snd_una)))
 		goto out_free;
 
 	if (tp->snd_una != snd_una) {
 		tp->snd_una = snd_una;
 		tp->ts_recent_age = tcp_ts_getticks();
 		if (tp->snd_una == tp->snd_nxt)
 			toep->tp_flags &= ~TP_TX_WAIT_IDLE;
 	}
 
 	snd = so_sockbuf_snd(so);
 	if (bytes) {
 		/*
 		 * No SOCKBUF_UNLOCK here; so_sowwakeup_locked() presumably
 		 * drops the sockbuf lock -- TODO confirm.
 		 */
 		SOCKBUF_LOCK(snd);
 		sbdrop_locked(snd, bytes);
 		so_sowwakeup_locked(so);
 	}
 
 	/* Data remains beyond the send pointer: push more frames. */
 	if (snd->sb_sndptroff < snd->sb_cc)
 		t3_push_frames(so, 0);
 
 out_free:
 	inp_wunlock(tp->t_inpcb);
 	m_freem(m);
 }
 
 /*
  * Handler for TX_DATA_ACK CPL messages.
  *
  * Resolves the tid in the message to a toepcb and passes the message to
  * wr_ack(), which frees the mbuf.  If no toepcb is registered for the tid the
  * mbuf is freed here so that it is not leaked.  Always returns 0.
  */
 static int
 do_wr_ack(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
 {
 	struct adapter *sc = qs->adap;
 	struct tom_data *td = sc->tom_softc;
 	struct cpl_wr_ack *hdr = mtod(m, void *);
 	unsigned int tid = GET_TID(hdr);
 	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
 
 	/* XXX bad race */
 	if (toep)
 		wr_ack(toep, m);	/* consumes m */
 	else
 		m_freem(m);		/* nobody else will free it */
 
 	return (0);
 }
 
 /*
  * Register this file's CPL message handlers with the adapter, in the order
  * listed in the table below.
  */
 void
 t3_init_cpl_io(struct adapter *sc)
 {
 	static const struct {
 		int	opcode;
 		int	(*handler)(struct sge_qset *, struct rsp_desc *,
 			    struct mbuf *);
 	} handlers[] = {
 		{ CPL_ACT_ESTABLISH,	do_act_establish },
 		{ CPL_ACT_OPEN_RPL,	do_act_open_rpl },
 		{ CPL_RX_URG_NOTIFY,	do_rx_urg_notify },
 		{ CPL_RX_DATA,		do_rx_data },
 		{ CPL_TX_DMA_ACK,	do_wr_ack },
 		{ CPL_PEER_CLOSE,	do_peer_close },
 		{ CPL_ABORT_REQ_RSS,	do_abort_req },
 		{ CPL_ABORT_RPL_RSS,	do_abort_rpl },
 		{ CPL_CLOSE_CON_RPL,	do_close_con_rpl },
 		{ CPL_SMT_WRITE_RPL,	do_smt_write_rpl },
 		{ CPL_SET_TCB_RPL,	do_set_tcb_rpl },
 	};
 	unsigned int i;
 
 	for (i = 0; i < sizeof(handlers) / sizeof(handlers[0]); i++)
 		t3_register_cpl_handler(sc, handlers[i].opcode,
 		    handlers[i].handler);
 }
 #endif
Index: stable/10/sys/dev/cxgbe/tom/t4_tom.c
===================================================================
--- stable/10/sys/dev/cxgbe/tom/t4_tom.c	(revision 330302)
+++ stable/10/sys/dev/cxgbe/tom/t4_tom.c	(revision 330303)
@@ -1,1270 +1,1269 @@
 /*-
  * Copyright (c) 2012 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: Navdeep Parhar <np@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/limits.h>
 #include <sys/module.h>
 #include <sys/protosw.h>
 #include <sys/domain.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/taskqueue.h>
 #include <net/if.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #include <netinet/tcp_var.h>
 #include <netinet6/scope6_var.h>
 #define TCPSTATES
 #include <netinet/tcp_fsm.h>
+#include <netinet/tcp_timer.h>
 #include <netinet/toecore.h>
 
 #ifdef TCP_OFFLOAD
 #include "common/common.h"
 #include "common/t4_msg.h"
 #include "common/t4_regs.h"
 #include "common/t4_regs_values.h"
 #include "common/t4_tcb.h"
 #include "tom/t4_tom_l2t.h"
 #include "tom/t4_tom.h"
 
 static struct protosw ddp_protosw;
 static struct pr_usrreqs ddp_usrreqs;
 
 static struct protosw ddp6_protosw;
 static struct pr_usrreqs ddp6_usrreqs;
 
 /* Module ops */
 static int t4_tom_mod_load(void);
 static int t4_tom_mod_unload(void);
 static int t4_tom_modevent(module_t, int, void *);
 
 /* ULD ops and helpers */
 static int t4_tom_activate(struct adapter *);
 static int t4_tom_deactivate(struct adapter *);
 
 static struct uld_info tom_uld_info = {
 	.uld_id = ULD_TOM,
 	.activate = t4_tom_activate,
 	.deactivate = t4_tom_deactivate,
 };
 
 static void queue_tid_release(struct adapter *, int);
 static void release_offload_resources(struct toepcb *);
 static int alloc_tid_tabs(struct tid_info *);
 static void free_tid_tabs(struct tid_info *);
 static int add_lip(struct adapter *, struct in6_addr *);
 static int delete_lip(struct adapter *, struct in6_addr *);
 static struct clip_entry *search_lip(struct tom_data *, struct in6_addr *);
 static void init_clip_table(struct adapter *, struct tom_data *);
 static void update_clip(struct adapter *, void *);
 static void t4_clip_task(void *, int);
 static void update_clip_table(struct adapter *, struct tom_data *);
 static void destroy_clip_table(struct adapter *, struct tom_data *);
 static void free_tom_data(struct adapter *, struct tom_data *);
 static void reclaim_wr_resources(void *, int);
 
 static int in6_ifaddr_gen;
 static eventhandler_tag ifaddr_evhandler;
 static struct timeout_task clip_task;
 
 /*
  * Initialize q as an empty mbuf queue.  The limit argument is accepted but
  * not used by this implementation.
  */
 static void
 mbufq_init(struct mbufq *q, int limit)
 {
 
 	q->tail = NULL;
 	q->head = NULL;
 }
 
 /*
  * Free every packet on q (chained through m_nextpkt) and leave the queue
  * empty.
  */
 static void
 mbufq_drain(struct mbufq *q)
 {
 	struct mbuf *pkt;
 
 	for (;;) {
 		pkt = q->head;
 		if (pkt == NULL)
 			break;
 		/* Unlink before freeing so head never points at freed memory. */
 		q->head = pkt->m_nextpkt;
 		m_freem(pkt);
 	}
 	q->tail = NULL;
 }
 
 #ifdef INVARIANTS
 static inline int
 mbufq_len(const struct mbufq *q)
 {
 	struct mbuf *m;
 	int len;
 
 	len = 0;
 	for (m = q->head; m != NULL; m = m->m_nextpkt)
 		len++;
 	return (len);
 }
 #endif
 	
 /*
  * Allocate and initialize a toepcb for a connection on vi.  A negative txqid
  * or rxqid selects a random offload queue of the vi.  flags is OR'ed into the
  * malloc flags.  Returns NULL if the allocation fails.
  */
 struct toepcb *
 alloc_toepcb(struct vi_info *vi, int txqid, int rxqid, int flags)
 {
 	struct port_info *port = vi->pi;
 	struct adapter *sc = port->adapter;
 	struct toepcb *toep;
 	int credits, nsd, size;
 
 	/*
 	 * Tx work request credits are counted by the firmware in 16 byte
 	 * units.  Hold back enough for an ABORT_REQ so that aborting a
 	 * connection never has to wait for credits.
 	 */
 	credits = sc->params.ofldq_wr_cred;
 	credits -= howmany(sizeof(struct cpl_abort_req), 16);
 
 	/*
 	 * The smallest tx work request is a fw_ofld_tx_data_wr with 1 byte of
 	 * immediate payload; that bounds the number of work requests (and so
 	 * tx descriptors) that can be outstanding.
 	 */
 	nsd = credits / howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16);
 
 	if (txqid < 0)
 		txqid = vi->first_ofld_txq + (arc4random() % vi->nofldtxq);
 	KASSERT(txqid >= vi->first_ofld_txq &&
 	    txqid < vi->first_ofld_txq + vi->nofldtxq,
 	    ("%s: txqid %d for vi %p (first %d, n %d)", __func__, txqid, vi,
 		vi->first_ofld_txq, vi->nofldtxq));
 
 	if (rxqid < 0)
 		rxqid = vi->first_ofld_rxq + (arc4random() % vi->nofldrxq);
 	KASSERT(rxqid >= vi->first_ofld_rxq &&
 	    rxqid < vi->first_ofld_rxq + vi->nofldrxq,
 	    ("%s: rxqid %d for vi %p (first %d, n %d)", __func__, rxqid, vi,
 		vi->first_ofld_rxq, vi->nofldrxq));
 
 	/* The txsd array is the variable-length tail of the toepcb. */
 	size = offsetof(struct toepcb, txsd) +
 	    nsd * sizeof(struct ofld_tx_sdesc);
 	toep = malloc(size, M_CXGBE, M_ZERO | flags);
 	if (toep == NULL)
 		return (NULL);
 
 	toep->td = sc->tom_softc;
 	toep->vi = vi;
 
 	toep->tx_total = credits;
 	toep->tx_credits = credits;
 
 	toep->ofld_txq = &sc->sge.ofld_txq[txqid];
 	toep->ofld_rxq = &sc->sge.ofld_rxq[rxqid];
 	toep->ctrlq = &sc->sge.ctrlq[port->port_id];
 
 	mbufq_init(&toep->ulp_pduq, INT_MAX);
 	mbufq_init(&toep->ulp_pdu_reclaimq, INT_MAX);
 
 	toep->txsd_total = nsd;
 	toep->txsd_avail = nsd;
 	toep->txsd_pidx = 0;
 	toep->txsd_cidx = 0;
 
 	return (toep);
 }
 
 /*
  * Free a toepcb.  It must be neither attached to an inpcb nor have a CPL
  * pending (both asserted).
  */
 void
 free_toepcb(struct toepcb *toep)
 {
 
 	KASSERT((toep->flags & TPF_ATTACHED) == 0,
 	    ("%s: attached to an inpcb", __func__));
 	KASSERT((toep->flags & TPF_CPL_PENDING) == 0,
 	    ("%s: CPL pending", __func__));
 
 	free(toep, M_CXGBE);
 }
 
 /*
  * Set up the socket for TCP offload.
  *
  * Marks both socket buffers SB_NOCOALESCE, switches the socket to the DDP
  * protosw when the toepcb is in TCPDDP mode, links the tcpcb and the toepcb
  * to each other, takes a reference on the inpcb and adds the toepcb to the
  * tom_data's list.  The caller must hold the inp write lock (asserted).
  */
 void
 offload_socket(struct socket *so, struct toepcb *toep)
 {
 	struct tom_data *td = toep->td;
 	struct inpcb *inp = sotoinpcb(so);
 	struct tcpcb *tp = intotcpcb(inp);
 	struct sockbuf *sb;
 
 	INP_WLOCK_ASSERT(inp);
 
 	/* Update socket */
 	sb = &so->so_snd;
 	SOCKBUF_LOCK(sb);
 	sb->sb_flags |= SB_NOCOALESCE;
 	SOCKBUF_UNLOCK(sb);
 	sb = &so->so_rcv;
 	SOCKBUF_LOCK(sb);
 	sb->sb_flags |= SB_NOCOALESCE;
 	/* so_proto is switched while the receive buffer lock is held. */
 	if (toep->ulp_mode == ULP_MODE_TCPDDP) {
 		if (inp->inp_vflag & INP_IPV6)
 			so->so_proto = &ddp6_protosw;
 		else
 			so->so_proto = &ddp_protosw;
 	}
 	SOCKBUF_UNLOCK(sb);
 
 	/* Update TCP PCB */
 	tp->tod = &td->tod;
 	tp->t_toe = toep;
 	tp->t_flags |= TF_TOE;
 
 	/* Install an extra hold on inp */
 	toep->inp = inp;
 	toep->flags |= TPF_ATTACHED;
 	in_pcbref(inp);
 
 	/* Add the TOE PCB to the active list */
 	mtx_lock(&td->toep_list_lock);
 	TAILQ_INSERT_HEAD(&td->toep_list, toep, link);
 	mtx_unlock(&td->toep_list_lock);
 }
 
 /*
  * This is _not_ the normal way to "unoffload" a socket.
  *
  * Reverses what offload_socket() set up: clears SB_NOCOALESCE on both socket
  * buffers, unlinks the tcpcb and the toepcb from each other, drops the inpcb
  * reference and removes the toepcb from the tom_data's list.  so_proto is not
  * touched here.  The caller must hold the inp write lock (asserted).
  */
 void
 undo_offload_socket(struct socket *so)
 {
 	struct inpcb *inp = sotoinpcb(so);
 	struct tcpcb *tp = intotcpcb(inp);
 	struct toepcb *toep = tp->t_toe;
 	struct tom_data *td = toep->td;
 	struct sockbuf *sb;
 
 	INP_WLOCK_ASSERT(inp);
 
 	sb = &so->so_snd;
 	SOCKBUF_LOCK(sb);
 	sb->sb_flags &= ~SB_NOCOALESCE;
 	SOCKBUF_UNLOCK(sb);
 	sb = &so->so_rcv;
 	SOCKBUF_LOCK(sb);
 	sb->sb_flags &= ~SB_NOCOALESCE;
 	SOCKBUF_UNLOCK(sb);
 
 	tp->tod = NULL;
 	tp->t_toe = NULL;
 	tp->t_flags &= ~TF_TOE;
 
 	toep->inp = NULL;
 	toep->flags &= ~TPF_ATTACHED;
 	/* Only the extra reference is dropped; the inp must survive it. */
 	if (in_pcbrele_wlocked(inp))
 		panic("%s: inp freed.", __func__);
 
 	mtx_lock(&td->toep_list_lock);
 	TAILQ_REMOVE(&td->toep_list, toep, link);
 	mtx_unlock(&td->toep_list_lock);
 }
 
 /*
  * Release everything tied to a toepcb and free it: DDP resources (TCPDDP mode
  * only), the L2T entry, the tid, the CLIP entry and the toepcb's slot on the
  * tom_data's list.  The toepcb must be neither attached nor have a CPL
  * pending (both asserted).
  */
 static void
 release_offload_resources(struct toepcb *toep)
 {
 	struct tom_data *td = toep->td;
 	struct adapter *sc = td_adapter(td);
 	int tid = toep->tid;
 
 	KASSERT(!(toep->flags & TPF_CPL_PENDING),
 	    ("%s: %p has CPL pending.", __func__, toep));
 	KASSERT(!(toep->flags & TPF_ATTACHED),
 	    ("%s: %p is still attached.", __func__, toep));
 
 	CTR5(KTR_CXGBE, "%s: toep %p (tid %d, l2te %p, ce %p)",
 	    __func__, toep, tid, toep->l2te, toep->ce);
 
 	/*
 	 * These queues should have been emptied at approximately the same time
 	 * that a normal connection's socket's so_snd would have been purged or
 	 * drained.  Do _not_ clean up here.
 	 */
 	MPASS(mbufq_len(&toep->ulp_pduq) == 0);
 	MPASS(mbufq_len(&toep->ulp_pdu_reclaimq) == 0);
 
 	if (toep->ulp_mode == ULP_MODE_TCPDDP)
 		release_ddp_resources(toep);
 
 	if (toep->l2te)
 		t4_l2t_release(toep->l2te);
 
 	/* A negative tid means there is no tid to give back. */
 	if (tid >= 0) {
 		/* Two tids are accounted for when a CLIP entry is attached. */
 		remove_tid(sc, tid, toep->ce ? 2 : 1);
 		release_tid(sc, tid, toep->ctrlq);
 	}
 
 	if (toep->ce)
 		release_lip(td, toep->ce);
 
 	mtx_lock(&td->toep_list_lock);
 	TAILQ_REMOVE(&td->toep_list, toep, link);
 	mtx_unlock(&td->toep_list_lock);
 
 	free_toepcb(toep);
 }
 
 /*
  * The kernel is done with the TCP PCB and this is our opportunity to unhook the
  * toepcb hanging off of it.  If the TOE driver is also done with the toepcb (no
  * pending CPL) then it is time to release all resources tied to the toepcb.
  *
  * Also gets called when an offloaded active open fails and the TOM wants the
  * kernel to take the TCP PCB back.
  *
  * The inp write lock must be held (asserted).
  */
 static void
 t4_pcb_detach(struct toedev *tod __unused, struct tcpcb *tp)
 {
 	/* inp is referenced only by tracing and assertion code. */
 #if defined(KTR) || defined(INVARIANTS)
 	struct inpcb *inp = tp->t_inpcb;
 #endif
 	struct toepcb *toep = tp->t_toe;
 
 	INP_WLOCK_ASSERT(inp);
 
 	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
 	KASSERT(toep->flags & TPF_ATTACHED,
 	    ("%s: not attached", __func__));
 
 #ifdef KTR
 	/* In SYN_SENT the toepcb's tid field is traced as an atid. */
 	if (tp->t_state == TCPS_SYN_SENT) {
 		CTR6(KTR_CXGBE, "%s: atid %d, toep %p (0x%x), inp %p (0x%x)",
 		    __func__, toep->tid, toep, toep->flags, inp,
 		    inp->inp_flags);
 	} else {
 		CTR6(KTR_CXGBE,
 		    "t4_pcb_detach: tid %d (%s), toep %p (0x%x), inp %p (0x%x)",
 		    toep->tid, tcpstates[tp->t_state], toep, toep->flags, inp,
 		    inp->inp_flags);
 	}
 #endif
 
 	tp->t_toe = NULL;
 	tp->t_flags &= ~TF_TOE;
 	toep->flags &= ~TPF_ATTACHED;
 
 	/* With no CPL pending nothing else will release the toepcb. */
 	if (!(toep->flags & TPF_CPL_PENDING))
 		release_offload_resources(toep);
 }
 
 /*
  * setsockopt handler.  Reads (SOPT_GET) are ignored.  TCP_NODELAY is the only
  * option acted on: the Nagle flag in the connection's TCB is updated to match
  * the tcpcb's TF_NODELAY setting.
  */
 static void
 t4_ctloutput(struct toedev *tod, struct tcpcb *tp, int dir, int name)
 {
 	struct adapter *sc = tod->tod_softc;
 	struct toepcb *toep = tp->t_toe;
 
 	if (dir == SOPT_GET)
 		return;
 
 	CTR4(KTR_CXGBE, "%s: tp %p, dir %u, name %u", __func__, tp, dir, name);
 
 	if (name != TCP_NODELAY)
 		return;
 
 	/* Nagle is on exactly when TF_NODELAY is off. */
 	t4_set_tcb_field(sc, toep->ctrlq, toep->tid, W_TCB_T_FLAGS,
 	    V_TF_NAGLE(1), V_TF_NAGLE(tp->t_flags & TF_NODELAY ? 0 : 1),
 	    0, 0, toep->ofld_rxq->iq.abs_id);
 }
 
 /*
  * The TOE driver will not receive any more CPLs for the tid associated with the
  * toepcb; release the hold on the inpcb.
  *
  * Called with the inp write lock held (asserted).  On return the lock has
  * been released, either by the explicit unlock at the end or as part of
  * in_pcbrele_wlocked() returning non-zero.
  */
 void
 final_cpl_received(struct toepcb *toep)
 {
 	struct inpcb *inp = toep->inp;
 
 	KASSERT(inp != NULL, ("%s: inp is NULL", __func__));
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(toep->flags & TPF_CPL_PENDING,
 	    ("%s: CPL not pending already?", __func__));
 
 	CTR6(KTR_CXGBE, "%s: tid %d, toep %p (0x%x), inp %p (0x%x)",
 	    __func__, toep->tid, toep, toep->flags, inp, inp->inp_flags);
 
 	toep->inp = NULL;
 	toep->flags &= ~TPF_CPL_PENDING;
 	mbufq_drain(&toep->ulp_pdu_reclaimq);
 
 	/* Already detached from the tcpcb: nothing else will free toep. */
 	if (!(toep->flags & TPF_ATTACHED))
 		release_offload_resources(toep);
 
 	/* toep must not be used past this point; it may have been freed. */
 	if (!in_pcbrele_wlocked(inp))
 		INP_WUNLOCK(inp);
 }
 
 /*
  * Record ctx as the context for tid and add ntids to the adapter's count of
  * tids in use.
  */
 void
 insert_tid(struct adapter *sc, int tid, void *ctx, int ntids)
 {
 
 	sc->tids.tid_tab[tid] = ctx;
 	atomic_add_int(&sc->tids.tids_in_use, ntids);
 }
 
 /*
  * Return the context stored for tid in the adapter's tid table.
  */
 void *
 lookup_tid(struct adapter *sc, int tid)
 {
 
 	return (sc->tids.tid_tab[tid]);
 }
 
 /*
  * Replace the context stored for tid.  The count of tids in use is left
  * unchanged.
  */
 void
 update_tid(struct adapter *sc, int tid, void *ctx)
 {
 
 	sc->tids.tid_tab[tid] = ctx;
 }
 
 /*
  * Clear the context stored for tid and subtract ntids from the adapter's
  * count of tids in use.
  */
 void
 remove_tid(struct adapter *sc, int tid, int ntids)
 {
 
 	sc->tids.tid_tab[tid] = NULL;
 	atomic_subtract_int(&sc->tids.tids_in_use, ntids);
 }
 
 /*
  * Send a CPL_TID_RELEASE for tid on ctrlq.  If no work request can be
  * allocated the release is handed to queue_tid_release() instead.
  */
 void
 release_tid(struct adapter *sc, int tid, struct sge_wrq *ctrlq)
 {
 	struct cpl_tid_release *cpl;
 	struct wrqe *wr;
 
 	wr = alloc_wrqe(sizeof(*cpl), ctrlq);
 	if (wr != NULL) {
 		cpl = wrtod(wr);
 		INIT_TP_WR_MIT_CPL(cpl, CPL_TID_RELEASE, tid);
 		t4_wrq_tx(sc, wr);
 	} else {
 		queue_tid_release(sc, tid);	/* defer */
 	}
 }
 
 /*
  * Fallback used by release_tid() when it cannot allocate a work request.
  * Deferred release is not implemented; this only invokes
  * CXGBE_UNIMPLEMENTED.
  */
 static void
 queue_tid_release(struct adapter *sc, int tid)
 {
 
 	CXGBE_UNIMPLEMENTED("deferred tid release");
 }
 
 /*
  * What mtu_idx to use, given a 4-tuple and/or an MSS cap
  *
  * At least one of inc and pmss must be given.  With inc, the MSS comes from
  * tcp_mssopt(inc), capped by pmss when pmss > 0; without inc, pmss is the MSS.
  * Returns the index of the largest entry in the adapter's MTU table that does
  * not exceed MSS plus the IP and TCP header sizes (index 0 if none fits).
  */
 int
 find_best_mtu_idx(struct adapter *sc, struct in_conninfo *inc, int pmss)
 {
 	unsigned short *mtus = &sc->params.mtus[0];
 	int i, mss, n;
 
 	KASSERT(inc != NULL || pmss > 0,
 	    ("%s: at least one of inc/pmss must be specified", __func__));
 
 	mss = inc ? tcp_mssopt(inc) : pmss;
 	if (pmss > 0 && mss > pmss)
 		mss = pmss;
 
 	/*
 	 * inc may legitimately be NULL (see the assertion above), so it must
 	 * not be dereferenced unconditionally.  Without a 4-tuple the address
 	 * family is unknown; the smaller IPv4 header size is used, which can
 	 * only select an equal or smaller MTU.
 	 */
 	if (inc != NULL && (inc->inc_flags & INC_ISIPV6))
 		n = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
 	else
 		n = sizeof(struct ip) + sizeof(struct tcphdr);
 
 	for (i = 0; i < NMTUS - 1 && mtus[i + 1] <= mss + n; i++)
 		continue;
 
 	return (i);
 }
 
 /*
  * Pick the receive window for a socket: the space available in its receive
  * buffer, raised to MIN_RCV_WND if smaller and capped at MAX_RCV_WND.  The
  * receive buffer lock must be held (asserted).
  */
 u_long
 select_rcv_wnd(struct socket *so)
 {
 	unsigned long space;
 
 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 
 	space = sbspace(&so->so_rcv);
 	space = space < MIN_RCV_WND ? MIN_RCV_WND : space;
 
 	return min(space, MAX_RCV_WND);
 }
 
 /*
  * Pick the receive window scale: the smallest shift, up to TCP_MAX_WINSHIFT,
  * for which TCP_MAXWIN << shift covers sb_max (capped at MAX_RCV_WND).
  */
 int
 select_rcv_wscale(void)
 {
 	unsigned long space;
 	int wscale;
 
 	space = sb_max > MAX_RCV_WND ? MAX_RCV_WND : sb_max;
 
 	for (wscale = 0; wscale < TCP_MAX_WINSHIFT; wscale++) {
 		if ((TCP_MAXWIN << wscale) >= space)
 			break;
 	}
 
 	return (wscale);
 }
 
-extern int always_keepalive;
-
 /*
  * socket so could be a listening socket too.
  *
  * Builds the 64-bit opt0 word from the window scale, MSS index, ULP mode and
  * receive credits, plus Nagle/keepalive settings taken from so, the L2T index
  * from e, and the source MAC and tx channel from vi.  so, e and vi may each
  * be NULL, in which case their fields are left out.  The result is returned
  * in big-endian byte order.
  */
 uint64_t
 calc_opt0(struct socket *so, struct vi_info *vi, struct l2t_entry *e,
     int mtu_idx, int rscale, int rx_credits, int ulp_mode)
 {
 	uint64_t opt0;
 
 	KASSERT(rx_credits <= M_RCV_BUFSIZ,
 	    ("%s: rcv_bufsiz too high", __func__));
 
 	opt0 = F_TCAM_BYPASS | V_WND_SCALE(rscale) | V_MSS_IDX(mtu_idx) |
 	    V_ULP_MODE(ulp_mode) | V_RCV_BUFSIZ(rx_credits);
 
 	if (so != NULL) {
 		struct inpcb *inp = sotoinpcb(so);
 		struct tcpcb *tp = intotcpcb(inp);
 		/* Keepalive is on if forced globally or set on the socket. */
-		int keepalive = always_keepalive ||
+		int keepalive = tcp_always_keepalive ||
 		    so_options_get(so) & SO_KEEPALIVE;
 
 		/* Nagle is on exactly when TF_NODELAY is off. */
 		opt0 |= V_NAGLE((tp->t_flags & TF_NODELAY) == 0);
 		opt0 |= V_KEEP_ALIVE(keepalive != 0);
 	}
 
 	if (e != NULL)
 		opt0 |= V_L2T_IDX(e->idx);
 
 	if (vi != NULL) {
 		opt0 |= V_SMAC_SEL(vi->smt_idx);
 		opt0 |= V_TX_CHAN(vi->pi->tx_chan);
 	}
 
 	return htobe64(opt0);
 }
 
 uint64_t
 select_ntuple(struct vi_info *vi, struct l2t_entry *e)
 {
 	struct adapter *sc = vi->pi->adapter;
 	struct tp_params *tp = &sc->params.tp;
 	uint16_t viid = vi->viid;
 	uint64_t ntuple = 0;
 
 	/*
 	 * Initialize each of the fields which we care about which are present
 	 * in the Compressed Filter Tuple.
 	 */
 	if (tp->vlan_shift >= 0 && e->vlan != CPL_L2T_VLAN_NONE)
 		ntuple |= (uint64_t)(F_FT_VLAN_VLD | e->vlan) << tp->vlan_shift;
 
 	if (tp->port_shift >= 0)
 		ntuple |= (uint64_t)e->lport << tp->port_shift;
 
 	if (tp->protocol_shift >= 0)
 		ntuple |= (uint64_t)IPPROTO_TCP << tp->protocol_shift;
 
 	if (tp->vnic_shift >= 0) {
 		uint32_t vf = G_FW_VIID_VIN(viid);
 		uint32_t pf = G_FW_VIID_PFN(viid);
 		uint32_t vld = G_FW_VIID_VIVLD(viid);
 
 		ntuple |= (uint64_t)(V_FT_VNID_ID_VF(vf) | V_FT_VNID_ID_PF(pf) |
 		    V_FT_VNID_ID_VLD(vld)) << tp->vnic_shift;
 	}
 
 	if (is_t4(sc))
 		return (htobe32((uint32_t)ntuple));
 	else
 		return (htobe64(V_FILTER_TUPLE(ntuple)));
 }
 
 /*
  * Put a toepcb into TCPDDP ULP mode and reset its DDP flags and score to
  * DDP_OK and DDP_LOW_SCORE.
  */
 void
 set_tcpddp_ulp_mode(struct toepcb *toep)
 {
 
 	toep->ddp_score = DDP_LOW_SCORE;
 	toep->ddp_flags = DDP_OK;
 	toep->ulp_mode = ULP_MODE_TCPDDP;
 }
 
 /*
  * Return non-zero if status is one of the three "negative advice" codes
  * (retransmit, persist or keepalive), zero otherwise.
  */
 int
 negative_advice(int status)
 {
 
 	switch (status) {
 	case CPL_ERR_RTX_NEG_ADVICE:
 	case CPL_ERR_PERSIST_NEG_ADVICE:
 	case CPL_ERR_KEEPALV_NEG_ADVICE:
 		return (1);
 	default:
 		return (0);
 	}
 }
 
 /*
  * Allocate and initialize the tid, atid and stid tables.  All three live in
  * a single zeroed allocation, laid out in that order.  Returns 0 on success
  * or ENOMEM if the allocation fails.
  */
 static int
 alloc_tid_tabs(struct tid_info *t)
 {
 	size_t nbytes;
 	unsigned int idx;
 
 	nbytes = t->ntids * sizeof(*t->tid_tab) +
 	    t->natids * sizeof(*t->atid_tab) +
 	    t->nstids * sizeof(*t->stid_tab);
 
 	t->tid_tab = malloc(nbytes, M_CXGBE, M_ZERO | M_NOWAIT);
 	if (t->tid_tab == NULL)
 		return (ENOMEM);
 
 	/* atid table: starts right after the tid table. */
 	mtx_init(&t->atid_lock, "atid lock", NULL, MTX_DEF);
 	t->atid_tab = (union aopen_entry *)&t->tid_tab[t->ntids];
 	t->afree = t->atid_tab;
 	t->atids_in_use = 0;
 	/* Chain every entry to its successor to form the free list. */
 	for (idx = 0; idx + 1 < t->natids; idx++)
 		t->atid_tab[idx].next = &t->atid_tab[idx + 1];
 	t->atid_tab[t->natids - 1].next = NULL;
 
 	/* stid table: starts right after the atid table. */
 	mtx_init(&t->stid_lock, "stid lock", NULL, MTX_DEF);
 	t->stid_tab = (struct listen_ctx **)&t->atid_tab[t->natids];
 	t->stids_in_use = 0;
 	TAILQ_INIT(&t->stids);
 	t->nstids_free_head = t->nstids;
 
 	atomic_store_rel_int(&t->tids_in_use, 0);
 
 	return (0);
 }
 
 /*
  * Release the memory backing the tid/atid/stid tables and destroy the atid
  * and stid locks if they were initialized.  All three in-use counters are
  * asserted to be zero.
  */
 static void
 free_tid_tabs(struct tid_info *t)
 {
 	KASSERT(t->tids_in_use == 0,
 	    ("%s: %d tids still in use.", __func__, t->tids_in_use));
 	KASSERT(t->atids_in_use == 0,
 	    ("%s: %d atids still in use.", __func__, t->atids_in_use));
 	/* This message reports stids, not tids. */
 	KASSERT(t->stids_in_use == 0,
 	    ("%s: %d stids still in use.", __func__, t->stids_in_use));
 
 	/* tid_tab is the base of the single allocation holding all tables. */
 	free(t->tid_tab, M_CXGBE);
 	t->tid_tab = NULL;
 
 	if (mtx_initialized(&t->atid_lock))
 		mtx_destroy(&t->atid_lock);
 	if (mtx_initialized(&t->stid_lock))
 		mtx_destroy(&t->stid_lock);
 }
 
 static int
 add_lip(struct adapter *sc, struct in6_addr *lip)
 {
         struct fw_clip_cmd c;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 	/* mtx_assert(&td->clip_table_lock, MA_OWNED); */
 
         memset(&c, 0, sizeof(c));
 	c.op_to_write = htonl(V_FW_CMD_OP(FW_CLIP_CMD) | F_FW_CMD_REQUEST |
 	    F_FW_CMD_WRITE);
         c.alloc_to_len16 = htonl(F_FW_CLIP_CMD_ALLOC | FW_LEN16(c));
         c.ip_hi = *(uint64_t *)&lip->s6_addr[0];
         c.ip_lo = *(uint64_t *)&lip->s6_addr[8];
 
 	return (-t4_wr_mbox_ns(sc, sc->mbox, &c, sizeof(c), &c));
 }
 
 static int
 delete_lip(struct adapter *sc, struct in6_addr *lip)
 {
 	struct fw_clip_cmd c;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 	/* mtx_assert(&td->clip_table_lock, MA_OWNED); */
 
 	memset(&c, 0, sizeof(c));
 	c.op_to_write = htonl(V_FW_CMD_OP(FW_CLIP_CMD) | F_FW_CMD_REQUEST |
 	    F_FW_CMD_READ);
         c.alloc_to_len16 = htonl(F_FW_CLIP_CMD_FREE | FW_LEN16(c));
         c.ip_hi = *(uint64_t *)&lip->s6_addr[0];
         c.ip_lo = *(uint64_t *)&lip->s6_addr[8];
 
 	return (-t4_wr_mbox_ns(sc, sc->mbox, &c, sizeof(c), &c));
 }
 
 /*
  * Look up an IPv6 address in the CLIP table.  The CLIP table lock must be
  * held.  Returns the matching entry, or NULL if there is none.
  */
 static struct clip_entry *
 search_lip(struct tom_data *td, struct in6_addr *lip)
 {
 	struct clip_entry *match;
 
 	mtx_assert(&td->clip_table_lock, MA_OWNED);
 
 	/* The iterator is NULL if the walk finishes without a hit. */
 	TAILQ_FOREACH(match, &td->clip_table, link) {
 		if (IN6_ARE_ADDR_EQUAL(&match->lip, lip))
 			break;
 	}
 
 	return (match);
 }
 
 /*
  * Take a reference on a CLIP entry.  If ce is NULL the entry is looked up
  * by address first.  Returns the entry that was referenced, or NULL if no
  * entry was supplied and none matched lip.
  */
 struct clip_entry *
 hold_lip(struct tom_data *td, struct in6_addr *lip, struct clip_entry *ce)
 {
 	struct clip_entry *held = ce;
 
 	mtx_lock(&td->clip_table_lock);
 	if (held == NULL)
 		held = search_lip(td, lip);
 	if (held != NULL)
 		held->refcount += 1;
 	mtx_unlock(&td->clip_table_lock);
 
 	return (held);
 }
 
 /*
  * Drop a reference on a CLIP entry.  The entry must be in the CLIP table and
  * must have a non-zero refcount (both asserted).  The entry itself is not
  * freed here.
  */
 void
 release_lip(struct tom_data *td, struct clip_entry *ce)
 {
 
 	mtx_lock(&td->clip_table_lock);
 	KASSERT(search_lip(td, &ce->lip) == ce,
 	    ("%s: CLIP entry %p not in CLIP table.", __func__, ce));
 	KASSERT(ce->refcount > 0,
 	    ("%s: CLIP entry %p has refcount 0", __func__, ce));
 	--ce->refcount;
 	mtx_unlock(&td->clip_table_lock);
 }
 
 /*
  * Set up the (empty) CLIP table and its lock, then populate it with an
  * initial update_clip_table() pass.  clip_gen starts at -1.  Must be called
  * within a synchronized op.
  */
 static void
 init_clip_table(struct adapter *sc, struct tom_data *td)
 {
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	td->clip_gen = -1;
 	TAILQ_INIT(&td->clip_table);
 	mtx_init(&td->clip_table_lock, "CLIP table lock", NULL, MTX_DEF);
 
 	update_clip_table(sc, td);
 }
 
 /*
  * Per-adapter callback: refresh the adapter's CLIP table if TOM is active
  * on it.  Does nothing if the synchronized op cannot be started.
  */
 static void
 update_clip(struct adapter *sc, void *arg __unused)
 {
 
 	if (begin_synchronized_op(sc, NULL, HOLD_LOCK, "t4tomuc") != 0)
 		return;
 
 	if (uld_active(sc, ULD_TOM)) {
 		update_clip_table(sc, sc->tom_softc);
 	}
 
 	end_synchronized_op(sc, LOCK_HELD);
 }
 
 /*
  * Task handler: run update_clip() via t4_iterate().  Neither argument is
  * used.
  */
 static void
 t4_clip_task(void *arg, int count)
 {
 
 	t4_iterate(update_clip, NULL);
 }
 
 /*
  * Bring the adapter's CLIP table in line with the IPv6 addresses currently
  * configured in the vnets of its VIs.
  *
  * Nothing is done if in6_ifaddr_gen still equals td->clip_gen.  Otherwise
  * every existing entry is first moved to a local "stale" list; each address
  * found while walking V_in6_ifaddrhead is either moved back from the stale
  * list or added as a new entry (add_lip).  Whatever remains stale and is
  * unreferenced is removed (delete_lip) and freed; entries that are still
  * referenced, or that could not be deleted, are put back in the table.
  *
  * Must be called within a synchronized op.  Takes the in6_ifaddr read lock
  * and the CLIP table lock for the duration.
  */
 static void
 update_clip_table(struct adapter *sc, struct tom_data *td)
 {
 	struct in6_ifaddr *ia;
 	struct in6_addr *lip, tlip;
 	struct clip_head stale;
 	struct clip_entry *ce, *ce_temp;
 	struct vi_info *vi;
 	int rc, gen, i, j;
 	uintptr_t last_vnet;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	IN6_IFADDR_RLOCK();
 	mtx_lock(&td->clip_table_lock);
 
 	gen = atomic_load_acq_int(&in6_ifaddr_gen);
 	if (gen == td->clip_gen)
 		goto done;
 
 	/* Everything is presumed stale until seen in an address list. */
 	TAILQ_INIT(&stale);
 	TAILQ_CONCAT(&stale, &td->clip_table, link);
 
 	/*
 	 * last_vnet optimizes the common cases where all if_vnet = NULL (no
 	 * VIMAGE) or all if_vnet = vnet0.
 	 */
 	last_vnet = (uintptr_t)(-1);
 	for_each_port(sc, i)
 	for_each_vi(sc->port[i], j, vi) {
 		if (last_vnet == (uintptr_t)vi->ifp->if_vnet)
 			continue;
 
 		/* XXX: races with if_vmove */
 		CURVNET_SET(vi->ifp->if_vnet);
 		TAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) {
 			lip = &ia->ia_addr.sin6_addr;
 
 			KASSERT(!IN6_IS_ADDR_MULTICAST(lip),
 			    ("%s: mcast address in in6_ifaddr list", __func__));
 
 			if (IN6_IS_ADDR_LOOPBACK(lip))
 				continue;
 			if (IN6_IS_SCOPE_EMBED(lip)) {
 				/* Remove the embedded scope */
 				tlip = *lip;
 				lip = &tlip;
 				in6_clearscope(lip);
 			}
 			/*
 			 * XXX: how to weed out the link local address for the
 			 * loopback interface?  It's fe80::1 usually (always?).
 			 */
 
 			/*
 			 * If it's in the main list then we already know it's
 			 * not stale.
 			 */
 			TAILQ_FOREACH(ce, &td->clip_table, link) {
 				if (IN6_ARE_ADDR_EQUAL(&ce->lip, lip))
 					goto next;
 			}
 
 			/*
 			 * If it's in the stale list we should move it to the
 			 * main list.
 			 */
 			TAILQ_FOREACH(ce, &stale, link) {
 				if (IN6_ARE_ADDR_EQUAL(&ce->lip, lip)) {
 					TAILQ_REMOVE(&stale, ce, link);
 					TAILQ_INSERT_TAIL(&td->clip_table, ce,
 					    link);
 					goto next;
 				}
 			}
 
 			/* A new IP6 address; add it to the CLIP table */
 			ce = malloc(sizeof(*ce), M_CXGBE, M_NOWAIT);
 			if (ce == NULL) {
 				char ip[INET6_ADDRSTRLEN];
 
 				/*
 				 * M_NOWAIT allocations can fail.  Skip this
 				 * address rather than dereference NULL.
 				 */
 				inet_ntop(AF_INET6, lip, &ip[0], sizeof(ip));
 				log(LOG_ERR, "%s: could not add %s (%d)\n",
 				    __func__, ip, ENOMEM);
 				goto next;
 			}
 			memcpy(&ce->lip, lip, sizeof(ce->lip));
 			ce->refcount = 0;
 			rc = add_lip(sc, lip);
 			if (rc == 0)
 				TAILQ_INSERT_TAIL(&td->clip_table, ce, link);
 			else {
 				char ip[INET6_ADDRSTRLEN];
 
 				inet_ntop(AF_INET6, &ce->lip, &ip[0],
 				    sizeof(ip));
 				log(LOG_ERR, "%s: could not add %s (%d)\n",
 				    __func__, ip, rc);
 				free(ce, M_CXGBE);
 			}
 next:
 			continue;
 		}
 		CURVNET_RESTORE();
 		last_vnet = (uintptr_t)vi->ifp->if_vnet;
 	}
 
 	/*
 	 * Remove stale addresses (those no longer in V_in6_ifaddrhead) that are
 	 * no longer referenced by the driver.
 	 */
 	TAILQ_FOREACH_SAFE(ce, &stale, link, ce_temp) {
 		if (ce->refcount == 0) {
 			rc = delete_lip(sc, &ce->lip);
 			if (rc == 0) {
 				TAILQ_REMOVE(&stale, ce, link);
 				free(ce, M_CXGBE);
 			} else {
 				char ip[INET6_ADDRSTRLEN];
 
 				inet_ntop(AF_INET6, &ce->lip, &ip[0],
 				    sizeof(ip));
 				log(LOG_ERR, "%s: could not delete %s (%d)\n",
 				    __func__, ip, rc);
 			}
 		}
 	}
 	/* The ones that are still referenced need to stay in the CLIP table */
 	TAILQ_CONCAT(&td->clip_table, &stale, link);
 
 	td->clip_gen = gen;
 done:
 	mtx_unlock(&td->clip_table_lock);
 	IN6_IFADDR_RUNLOCK();
 }
 
 static void
 destroy_clip_table(struct adapter *sc, struct tom_data *td)
 {
 	struct clip_entry *ce, *ce_temp;
 
 	if (mtx_initialized(&td->clip_table_lock)) {
 		mtx_lock(&td->clip_table_lock);
 		TAILQ_FOREACH_SAFE(ce, &td->clip_table, link, ce_temp) {
 			KASSERT(ce->refcount == 0,
 			    ("%s: CLIP entry %p still in use (%d)", __func__,
 			    ce, ce->refcount));
 			TAILQ_REMOVE(&td->clip_table, ce, link);
 			delete_lip(sc, &ce->lip);
 			free(ce, M_CXGBE);
 		}
 		mtx_unlock(&td->clip_table_lock);
 		mtx_destroy(&td->clip_table_lock);
 	}
 }
 
 /*
  * Release everything hanging off a tom_data and then the tom_data itself.
  * Must be called within a synchronized op, with the TOE PCB list empty and
  * no listen contexts left (both asserted).  Each resource is guarded so the
  * function can also be used on a partially initialized td: the listen hash
  * is destroyed only if listen_mask is non-zero and each mutex only if it was
  * initialized.  td must not be used after this returns.
  */
 static void
 free_tom_data(struct adapter *sc, struct tom_data *td)
 {
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	KASSERT(TAILQ_EMPTY(&td->toep_list),
 	    ("%s: TOE PCB list is not empty.", __func__));
 	KASSERT(td->lctx_count == 0,
 	    ("%s: lctx hash table is not empty.", __func__));
 
 	t4_free_ppod_region(&td->pr);
 	destroy_clip_table(sc, td);
 
 	if (td->listen_mask != 0)
 		hashdestroy(td->listen_hash, M_CXGBE, td->listen_mask);
 
 	if (mtx_initialized(&td->unsent_wr_lock))
 		mtx_destroy(&td->unsent_wr_lock);
 	if (mtx_initialized(&td->lctx_hash_lock))
 		mtx_destroy(&td->lctx_hash_lock);
 	if (mtx_initialized(&td->toep_list_lock))
 		mtx_destroy(&td->toep_list_lock);
 
 	/* The tid tables live in the adapter softc, not in td. */
 	free_tid_tabs(&sc->tids);
 	free(td, M_CXGBE);
 }
 
 static void
 reclaim_wr_resources(void *arg, int count)
 {
 	struct tom_data *td = arg;
 	STAILQ_HEAD(, wrqe) twr_list = STAILQ_HEAD_INITIALIZER(twr_list);
 	struct cpl_act_open_req *cpl;
 	u_int opcode, atid;
 	struct wrqe *wr;
 	struct adapter *sc;
 
 	mtx_lock(&td->unsent_wr_lock);
 	STAILQ_SWAP(&td->unsent_wr_list, &twr_list, wrqe);
 	mtx_unlock(&td->unsent_wr_lock);
 
 	while ((wr = STAILQ_FIRST(&twr_list)) != NULL) {
 		STAILQ_REMOVE_HEAD(&twr_list, link);
 
 		cpl = wrtod(wr);
 		opcode = GET_OPCODE(cpl);
 
 		switch (opcode) {
 		case CPL_ACT_OPEN_REQ:
 		case CPL_ACT_OPEN_REQ6:
 			atid = G_TID_TID(be32toh(OPCODE_TID(cpl)));
 			sc = td_adapter(td);
 
 			CTR2(KTR_CXGBE, "%s: atid %u ", __func__, atid);
 			act_open_failure_cleanup(sc, atid, EHOSTUNREACH);
 			free(wr, M_CXGBE);
 			break;
 		default:
 			log(LOG_ERR, "%s: leaked work request %p, wr_len %d, "
 			    "opcode %x\n", __func__, wr, wr->wr_len, opcode);
 			/* WR not freed here; go look at it with a debugger.  */
 		}
 	}
 }
 
 /*
  * Ground control to Major TOM
  * Commencing countdown, engines on
  */
 static int
 t4_tom_activate(struct adapter *sc)
 {
 	struct tom_data *td;
 	struct toedev *tod;
 	struct vi_info *vi;
 	struct sge_ofld_rxq *ofld_rxq;
 	int i, j, rc, v;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	/* per-adapter softc for TOM */
 	td = malloc(sizeof(*td), M_CXGBE, M_ZERO | M_NOWAIT);
 	if (td == NULL)
 		return (ENOMEM);
 
 	/* List of TOE PCBs and associated lock */
 	mtx_init(&td->toep_list_lock, "PCB list lock", NULL, MTX_DEF);
 	TAILQ_INIT(&td->toep_list);
 
 	/* Listen context */
 	mtx_init(&td->lctx_hash_lock, "lctx hash lock", NULL, MTX_DEF);
 	td->listen_hash = hashinit_flags(LISTEN_HASH_SIZE, M_CXGBE,
 	    &td->listen_mask, HASH_NOWAIT);
 
 	/* List of WRs for which L2 resolution failed */
 	mtx_init(&td->unsent_wr_lock, "Unsent WR list lock", NULL, MTX_DEF);
 	STAILQ_INIT(&td->unsent_wr_list);
 	TASK_INIT(&td->reclaim_wr_resources, 0, reclaim_wr_resources, td);
 
 	/* TID tables */
 	rc = alloc_tid_tabs(&sc->tids);
 	if (rc != 0)
 		goto done;
 
 	rc = t4_init_ppod_region(&td->pr, &sc->vres.ddp,
 	    t4_read_reg(sc, A_ULP_RX_TDDP_PSZ), "TDDP page pods");
 	if (rc != 0)
 		goto done;
 	t4_set_reg_field(sc, A_ULP_RX_TDDP_TAGMASK,
 	    V_TDDPTAGMASK(M_TDDPTAGMASK), td->pr.pr_tag_mask);
 
 	/* CLIP table for IPv6 offload */
 	init_clip_table(sc, td);
 
 	/* toedev ops */
 	tod = &td->tod;
 	init_toedev(tod);
 	tod->tod_softc = sc;
 	tod->tod_connect = t4_connect;
 	tod->tod_listen_start = t4_listen_start;
 	tod->tod_listen_stop = t4_listen_stop;
 	tod->tod_rcvd = t4_rcvd;
 	tod->tod_output = t4_tod_output;
 	tod->tod_send_rst = t4_send_rst;
 	tod->tod_send_fin = t4_send_fin;
 	tod->tod_pcb_detach = t4_pcb_detach;
 	tod->tod_l2_update = t4_l2_update;
 	tod->tod_syncache_added = t4_syncache_added;
 	tod->tod_syncache_removed = t4_syncache_removed;
 	tod->tod_syncache_respond = t4_syncache_respond;
 	tod->tod_offload_socket = t4_offload_socket;
 	tod->tod_ctloutput = t4_ctloutput;
 
 	for_each_port(sc, i) {
 		for_each_vi(sc->port[i], v, vi) {
 			TOEDEV(vi->ifp) = &td->tod;
 			for_each_ofld_rxq(vi, j, ofld_rxq) {
 				ofld_rxq->iq.set_tcb_rpl = do_set_tcb_rpl;
 				ofld_rxq->iq.l2t_write_rpl = do_l2t_write_rpl2;
 			}
 		}
 	}
 
 	sc->tom_softc = td;
 	register_toedev(sc->tom_softc);
 
 done:
 	if (rc != 0)
 		free_tom_data(sc, td);
 	return (rc);
 }
 
 /*
  * Tear down the per-adapter TOM state if nothing is using it.  Must be
  * called within a synchronized op.
  *
  * Returns 0 if there was nothing to do or the state was freed, EBUSY if a
  * port still has TOE enabled (offload_map != 0), iWARP or iSCSI is active,
  * or any TOE PCB, listen context or unsent work request remains.
  */
 static int
 t4_tom_deactivate(struct adapter *sc)
 {
 	int rc = 0;
 	struct tom_data *td = sc->tom_softc;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	if (td == NULL)
 		return (0);	/* XXX. KASSERT? */
 
 	if (sc->offload_map != 0)
 		return (EBUSY);	/* at least one port has IFCAP_TOE enabled */
 
 	if (uld_active(sc, ULD_IWARP) || uld_active(sc, ULD_ISCSI))
 		return (EBUSY);	/* both iWARP and iSCSI rely on the TOE. */
 
 	mtx_lock(&td->toep_list_lock);
 	if (!TAILQ_EMPTY(&td->toep_list))
 		rc = EBUSY;
 	mtx_unlock(&td->toep_list_lock);
 
 	mtx_lock(&td->lctx_hash_lock);
 	if (td->lctx_count > 0)
 		rc = EBUSY;
 	mtx_unlock(&td->lctx_hash_lock);
 
 	/* Wait for the reclaim task before looking at the unsent list. */
 	taskqueue_drain(taskqueue_thread, &td->reclaim_wr_resources);
 	mtx_lock(&td->unsent_wr_lock);
 	if (!STAILQ_EMPTY(&td->unsent_wr_list))
 		rc = EBUSY;
 	mtx_unlock(&td->unsent_wr_lock);
 
 	if (rc == 0) {
 		unregister_toedev(sc->tom_softc);
 		free_tom_data(sc, td);
 		sc->tom_softc = NULL;
 	}
 
 	return (rc);
 }
 
 /*
  * ifaddr_event handler: bump in6_ifaddr_gen so update_clip_table() sees a
  * new generation, and schedule clip_task on taskqueue_thread.  Neither
  * argument is used.
  */
 static void
 t4_tom_ifaddr_event(void *arg __unused, struct ifnet *ifp)
 {
 
 	atomic_add_rel_int(&in6_ifaddr_gen, 1);
 	/*
 	 * NOTE(review): the tick count is negative on purpose, presumably
 	 * relying on taskqueue(9) semantics where a negative value leaves an
 	 * already scheduled timeout alone -- confirm against the man page.
 	 */
 	taskqueue_enqueue_timeout(taskqueue_thread, &clip_task, -hz / 4);
 }
 
 /*
  * Module load: install the CPL handlers, load DDP support, build the DDP
  * variants of the TCP and TCP6 protosw/usrreqs (identical copies except
  * that pru_soreceive is t4_soreceive_ddp), set up the CLIP task and the
  * ifaddr event handler, and register the TOM ULD.
  *
  * Returns 0 on success, the error from t4_ddp_mod_load() or
  * t4_register_uld(), or ENOPROTOOPT if a TCP protosw cannot be found.
  * Only the t4_register_uld() failure path calls t4_tom_mod_unload() here;
  * the earlier error returns do not undo anything themselves.
  */
 static int
 t4_tom_mod_load(void)
 {
 	int rc;
 	struct protosw *tcp_protosw, *tcp6_protosw;
 
 	/* CPL handlers */
 	t4_init_connect_cpl_handlers();
 	t4_init_listen_cpl_handlers();
 	t4_init_cpl_io_handlers();
 
 	rc = t4_ddp_mod_load();
 	if (rc != 0)
 		return (rc);
 
 	/* IPv4: clone the TCP protosw and override soreceive. */
 	tcp_protosw = pffindproto(PF_INET, IPPROTO_TCP, SOCK_STREAM);
 	if (tcp_protosw == NULL)
 		return (ENOPROTOOPT);
 	bcopy(tcp_protosw, &ddp_protosw, sizeof(ddp_protosw));
 	bcopy(tcp_protosw->pr_usrreqs, &ddp_usrreqs, sizeof(ddp_usrreqs));
 	ddp_usrreqs.pru_soreceive = t4_soreceive_ddp;
 	ddp_protosw.pr_usrreqs = &ddp_usrreqs;
 
 	/* IPv6: same treatment for the TCP6 protosw. */
 	tcp6_protosw = pffindproto(PF_INET6, IPPROTO_TCP, SOCK_STREAM);
 	if (tcp6_protosw == NULL)
 		return (ENOPROTOOPT);
 	bcopy(tcp6_protosw, &ddp6_protosw, sizeof(ddp6_protosw));
 	bcopy(tcp6_protosw->pr_usrreqs, &ddp6_usrreqs, sizeof(ddp6_usrreqs));
 	ddp6_usrreqs.pru_soreceive = t4_soreceive_ddp;
 	ddp6_protosw.pr_usrreqs = &ddp6_usrreqs;
 
 	/* CLIP table maintenance is driven by ifaddr events. */
 	TIMEOUT_TASK_INIT(taskqueue_thread, &clip_task, 0, t4_clip_task, NULL);
 	ifaddr_evhandler = EVENTHANDLER_REGISTER(ifaddr_event,
 	    t4_tom_ifaddr_event, NULL, EVENTHANDLER_PRI_ANY);
 
 	rc = t4_register_uld(&tom_uld_info);
 	if (rc != 0)
 		t4_tom_mod_unload();
 
 	return (rc);
 }
 
 /*
  * Per-adapter callback used at module unload: deactivate the TOM ULD on
  * this adapter if it is active.  Does nothing if the synchronized op cannot
  * be started.
  */
 static void
 tom_uninit(struct adapter *sc, void *arg __unused)
 {
 	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4tomun") != 0)
 		return;
 
 	/* Try to free resources (works only if no port has IFCAP_TOE) */
 	if (uld_active(sc, ULD_TOM)) {
 		t4_deactivate_uld(sc, ULD_TOM);
 	}
 
 	end_synchronized_op(sc, 0);
 }
 
 /*
  * Module unload: try to deactivate TOM on every adapter, unregister the
  * ULD, remove the ifaddr event handler and cancel the CLIP task (only if
  * the handler was registered), unload DDP support and remove the CPL
  * handlers.
  *
  * Returns EBUSY, without undoing anything further, if t4_unregister_uld()
  * reports EBUSY; 0 otherwise.
  */
 static int
 t4_tom_mod_unload(void)
 {
 	t4_iterate(tom_uninit, NULL);
 
 	if (t4_unregister_uld(&tom_uld_info) == EBUSY)
 		return (EBUSY);
 
 	if (ifaddr_evhandler) {
 		EVENTHANDLER_DEREGISTER(ifaddr_event, ifaddr_evhandler);
 		taskqueue_cancel_timeout(taskqueue_thread, &clip_task, NULL);
 	}
 
 	t4_ddp_mod_unload();
 
 	t4_uninit_connect_cpl_handlers();
 	t4_uninit_listen_cpl_handlers();
 	t4_uninit_cpl_io_handlers();
 
 	return (0);
 }
 #endif	/* TCP_OFFLOAD */
 
 /*
  * Module event handler.  With TCP_OFFLOAD, MOD_LOAD and MOD_UNLOAD are
  * dispatched to t4_tom_mod_load() / t4_tom_mod_unload() and any other
  * command yields EINVAL.  Without TCP_OFFLOAD every event prints a message
  * and yields EOPNOTSUPP.
  */
 static int
 t4_tom_modevent(module_t mod, int cmd, void *arg)
 {
 #ifdef TCP_OFFLOAD
 	if (cmd == MOD_LOAD)
 		return (t4_tom_mod_load());
 	if (cmd == MOD_UNLOAD)
 		return (t4_tom_mod_unload());
 
 	return (EINVAL);
 #else
 	printf("t4_tom: compiled without TCP_OFFLOAD support.\n");
 	return (EOPNOTSUPP);
 #endif
 }
 
 /* Module descriptor: name, event handler, and no private data. */
 static moduledata_t t4_tom_moddata= {
 	"t4_tom",
 	t4_tom_modevent,
 	0
 };
 
 /* t4_tom is version 1 and depends on toecore and t4nex (both version 1). */
 MODULE_VERSION(t4_tom, 1);
 MODULE_DEPEND(t4_tom, toecore, 1, 1, 1);
 MODULE_DEPEND(t4_tom, t4nex, 1, 1, 1);
 DECLARE_MODULE(t4_tom, t4_tom_moddata, SI_SUB_EXEC, SI_ORDER_ANY);
Index: stable/10/sys/netinet/tcp_timer.c
===================================================================
--- stable/10/sys/netinet/tcp_timer.c	(revision 330302)
+++ stable/10/sys/netinet/tcp_timer.c	(revision 330303)
@@ -1,947 +1,949 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)tcp_timer.c	8.2 (Berkeley) 5/24/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_tcpdebug.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/protosw.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
 #include <net/if.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/cc.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_systm.h>
 #ifdef INET6
 #include <netinet6/in6_pcb.h>
 #endif
 #include <netinet/ip_var.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #ifdef INET6
 #include <netinet6/tcp6_var.h>
 #endif
 #include <netinet/tcpip.h>
 #ifdef TCPDEBUG
 #include <netinet/tcp_debug.h>
 #endif
 
 /*
  * TCP timer tunables exported under net.inet.tcp.  Each of these is exposed
  * through the sysctl_msec_to_ticks handler (presumably so userland deals in
  * milliseconds while the variable holds ticks -- confirm in the handler).
  * None has an initializer here.
  */
 
 /* net.inet.tcp.persmin */
 int    tcp_persmin;
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin, CTLTYPE_INT|CTLFLAG_RW,
     &tcp_persmin, 0, sysctl_msec_to_ticks, "I", "minimum persistence interval");
 
 /* net.inet.tcp.persmax */
 int    tcp_persmax;
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax, CTLTYPE_INT|CTLFLAG_RW,
     &tcp_persmax, 0, sysctl_msec_to_ticks, "I", "maximum persistence interval");
 
 /* net.inet.tcp.keepinit */
 int	tcp_keepinit;
 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW,
     &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection");
 
 /* net.inet.tcp.keepidle */
 int	tcp_keepidle;
 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW,
     &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "time before keepalive probes begin");
 
 /* net.inet.tcp.keepintvl */
 int	tcp_keepintvl;
 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW,
     &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "time between keepalive probes");
 
 /* net.inet.tcp.delacktime */
 int	tcp_delacktime;
 SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, CTLTYPE_INT|CTLFLAG_RW,
     &tcp_delacktime, 0, sysctl_msec_to_ticks, "I",
     "Time before a delayed ACK is sent");
 
 /* net.inet.tcp.msl */
 int	tcp_msl;
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW,
     &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime");
 
 /* net.inet.tcp.rexmit_min */
 int	tcp_rexmit_min;
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW,
     &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I",
     "Minimum Retransmission Timeout");
 
 /* net.inet.tcp.rexmit_slop */
 int	tcp_rexmit_slop;
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW,
     &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I",
     "Retransmission Timer Slop");
 
-static int	always_keepalive = 1;
+int	tcp_always_keepalive = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW,
-    &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections");
+    &tcp_always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections");
+__strong_reference(tcp_always_keepalive, always_keepalive);
 
 /* net.inet.tcp.fast_finwait2_recycle: off by default; read by tcp_timer_2msl(). */
 int    tcp_fast_finwait2_recycle = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW, 
     &tcp_fast_finwait2_recycle, 0,
     "Recycle closed FIN_WAIT_2 connections faster");
 
 /* net.inet.tcp.finwait2_timeout: exposed via sysctl_msec_to_ticks. */
 int    tcp_finwait2_timeout;
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW,
     &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout");
 
 /* net.inet.tcp.keepcnt: defaults to TCPTV_KEEPCNT. */
 int	tcp_keepcnt = TCPTV_KEEPCNT;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0,
     "Number of keepalive probes to send");
 
 	/* max idle probes */
 int	tcp_maxpersistidle;
 
 /* net.inet.tcp.rexmit_drop_options: file-local, off by default. */
 static int	tcp_rexmit_drop_options = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW,
     &tcp_rexmit_drop_options, 0,
     "Drop TCP options from 3rd and later retransmitted SYN");
 
 /*
  * Path MTU discovery black hole detection state.  All of these are
  * per-vnet (VNET_DEFINE) and file-local; the V_ macros are the accessors.
  */
 
 /* Read-write switch: net.inet.tcp.pmtud_blackhole_detection. */
 static VNET_DEFINE(int, tcp_pmtud_blackhole_detect);
 #define	V_tcp_pmtud_blackhole_detect	VNET(tcp_pmtud_blackhole_detect)
 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection,
     CTLFLAG_RW,
     &VNET_NAME(tcp_pmtud_blackhole_detect), 0,
     "Path MTU Discovery Black Hole Detection Enabled");
 
 /* Read-only counter: activations. */
 static VNET_DEFINE(int, tcp_pmtud_blackhole_activated);
 #define	V_tcp_pmtud_blackhole_activated \
     VNET(tcp_pmtud_blackhole_activated)
 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated,
     CTLFLAG_RD,
     &VNET_NAME(tcp_pmtud_blackhole_activated), 0,
     "Path MTU Discovery Black Hole Detection, Activation Count");
 
 /* Read-only counter: activations at the minimum MSS. */
 static VNET_DEFINE(int, tcp_pmtud_blackhole_activated_min_mss);
 #define	V_tcp_pmtud_blackhole_activated_min_mss \
     VNET(tcp_pmtud_blackhole_activated_min_mss)
 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated_min_mss,
     CTLFLAG_RD,
     &VNET_NAME(tcp_pmtud_blackhole_activated_min_mss), 0,
     "Path MTU Discovery Black Hole Detection, Activation Count at min MSS");
 
 /* Read-only counter: failures. */
 static VNET_DEFINE(int, tcp_pmtud_blackhole_failed);
 #define	V_tcp_pmtud_blackhole_failed	VNET(tcp_pmtud_blackhole_failed)
 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_failed,
     CTLFLAG_RD,
     &VNET_NAME(tcp_pmtud_blackhole_failed), 0,
     "Path MTU Discovery Black Hole Detection, Failure Count");
 
 /* Lowered MSS for IPv4; defaults to 1200.  Only built with INET. */
 #ifdef INET
 static VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200;
 #define	V_tcp_pmtud_blackhole_mss	VNET(tcp_pmtud_blackhole_mss)
 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss,
     CTLFLAG_RW,
     &VNET_NAME(tcp_pmtud_blackhole_mss), 0,
     "Path MTU Discovery Black Hole Detection lowered MSS");
 #endif
 
 /* Lowered MSS for IPv6; defaults to 1220.  Only built with INET6. */
 #ifdef INET6
 static VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220;
 #define	V_tcp_v6pmtud_blackhole_mss	VNET(tcp_v6pmtud_blackhole_mss)
 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss,
     CTLFLAG_RW,
     &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0,
     "Path MTU Discovery IPv6 Black Hole Detection lowered MSS");
 #endif
 
 /* net.inet.tcp.per_cpu_timers: off by default. */
 static int	per_cpu_timers = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW,
     &per_cpu_timers , 0, "run tcp timers on all cpus");
 
 /*
  * CPU to use for an inpcb: 0 when per_cpu_timers is off.  Otherwise
  * inp_flowid % (mp_maxid + 1) if that CPU is present, else curcpu.
  */
 #define	INP_CPU(inp)	(per_cpu_timers ? (!CPU_ABSENT(((inp)->inp_flowid % (mp_maxid+1))) ? \
 		((inp)->inp_flowid % (mp_maxid+1)) : curcpu) : 0)
 
 /*
  * TCP slow timeout routine (described historically as being called every
  * 500 ms).  In this version it only walks every vnet, under the vnet list
  * read lock, and runs tcp_tw_2msl_scan(0) in each; the scan's return value
  * is ignored.
  */
 void
 tcp_slowtimo(void)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		(void) tcp_tw_2msl_scan(0);
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 }
 
 /*
  * Backoff multiplier tables, one entry per retransmit shift value
  * (0 .. TCP_MAXRXTSHIFT).  tcp_syn_backoff stays at 1 for the first five
  * entries and tops out at 64.
  */
 int	tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
     { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };
 
 /* tcp_backoff doubles each step from 1 and tops out at 512. */
 int	tcp_backoff[TCP_MAXRXTSHIFT + 1] =
     { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 };
 
 /* 1 + 2 + ... + 512 = 1023, plus three more 512s = 2559. */
 static int tcp_totbackoff = 2559;	/* sum of tcp_backoff[] */
 
 /*
  * TCP timer processing.
  */
 
 /*
  * Delayed ACK timer callout.  With the inpcb write-locked: return without
  * doing anything if the callout is pending again or no longer active, or if
  * the inpcb has been dropped; otherwise set TF_ACKNOW and call tcp_output().
  */
 void
 tcp_timer_delack(void *xtp)
 {
 	struct tcpcb *tp = xtp;
 	struct inpcb *inp;
 	CURVNET_SET(tp->t_vnet);
 
 	inp = tp->t_inpcb;
 	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
 	INP_WLOCK(inp);
 
 	/* Presumably rescheduled or stopped since this invocation fired. */
 	if (callout_pending(&tp->t_timers->tt_delack) ||
 	    !callout_active(&tp->t_timers->tt_delack))
 		goto out;
 
 	callout_deactivate(&tp->t_timers->tt_delack);
 	if ((inp->inp_flags & INP_DROPPED) != 0)
 		goto out;
 
 	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
 		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
 	KASSERT((tp->t_timers->tt_flags & TT_DELACK) != 0,
 		("%s: tp %p delack callout should be running", __func__, tp));
 
 	/* Force an ACK out now. */
 	tp->t_flags |= TF_ACKNOW;
 	TCPSTAT_INC(tcps_delack);
 	(void) tcp_output(tp);
 out:
 	INP_WUNLOCK(inp);
 	CURVNET_RESTORE();
 }
 
 /*
  * 2MSL timer callout.  Runs with the tcbinfo read lock and the inpcb write
  * lock held.  SACK holes are freed first, before any of the callout state
  * checks.  The connection is then either closed or the timer is re-armed
  * for TP_KEEPINTVL(tp); see the comment in the body for the exact rules.
  */
 void
 tcp_timer_2msl(void *xtp)
 {
 	struct tcpcb *tp = xtp;
 	struct inpcb *inp;
 	CURVNET_SET(tp->t_vnet);
 #ifdef TCPDEBUG
 	int ostate;
 
 	ostate = tp->t_state;
 #endif
 	INP_INFO_RLOCK(&V_tcbinfo);
 	inp = tp->t_inpcb;
 	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
 	INP_WLOCK(inp);
 	tcp_free_sackholes(tp);
 	/* Nothing to do if the callout is pending again or was deactivated. */
 	if (callout_pending(&tp->t_timers->tt_2msl) ||
 	    !callout_active(&tp->t_timers->tt_2msl)) {
 		INP_WUNLOCK(tp->t_inpcb);
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 		return;
 	}
 	callout_deactivate(&tp->t_timers->tt_2msl);
 	/* Nothing to do for an inpcb that has already been dropped. */
 	if ((inp->inp_flags & INP_DROPPED) != 0) {
 		INP_WUNLOCK(inp);
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 		return;
 	}
 	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
 		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
 	KASSERT((tp->t_timers->tt_flags & TT_2MSL) != 0,
 		("%s: tp %p 2msl callout should be running", __func__, tp));
 	/*
 	 * 2 MSL timeout in shutdown went off.  If we're closed but
 	 * still waiting for peer to close and connection has been idle
 	 * too long delete connection control block.  Otherwise, check
 	 * again in a bit.
 	 *
 	 * If in TIME_WAIT state just ignore as this timeout is handled in
 	 * tcp_tw_2msl_scan().
 	 *
 	 * If fastrecycle of FIN_WAIT_2, in FIN_WAIT_2 and receiver has closed,
 	 * there's no point in hanging onto FIN_WAIT_2 socket. Just close it.
 	 * Ignore fact that there were recent incoming segments.
 	 */
 	if ((inp->inp_flags & INP_TIMEWAIT) != 0) {
 		INP_WUNLOCK(inp);
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 		return;
 	}
 	if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 &&
 	    tp->t_inpcb && tp->t_inpcb->inp_socket && 
 	    (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) {
 		TCPSTAT_INC(tcps_finwait2_drops);
 		tp = tcp_close(tp);             
 	} else {
 		/* Still within the idle limit: check again later. */
 		if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) {
 			if (!callout_reset(&tp->t_timers->tt_2msl,
 			   TP_KEEPINTVL(tp), tcp_timer_2msl, tp)) {
 				tp->t_timers->tt_flags &= ~TT_2MSL_RST;
 			}
 		} else
 		       tp = tcp_close(tp);
        }
 
 #ifdef TCPDEBUG
 	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
 		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
 			  PRU_SLOWTIMO);
 #endif
 	/* The inpcb is unlocked here only if tcp_close() left tp non-NULL. */
 	if (tp != NULL)
 		INP_WUNLOCK(inp);
 	INP_INFO_RUNLOCK(&V_tcbinfo);
 	CURVNET_RESTORE();
 }
 
 /*
  * Keepalive timer callout.  Runs with the tcbinfo read lock and the inpcb
  * write lock held.
  *
  *  - Connections that have not reached ESTABLISHED are dropped.
  *  - If keepalives apply (forced globally or SO_KEEPALIVE set) and the state
  *    is no later than CLOSING, the connection is dropped once it has been
  *    idle for TP_KEEPIDLE + TP_MAXIDLE; otherwise a probe is sent and the
  *    timer re-armed for TP_KEEPINTVL.
  *  - Otherwise the timer is simply re-armed for TP_KEEPIDLE.
  *
  * Drops go through tcp_drop() with ETIMEDOUT.
  */
 void
 tcp_timer_keep(void *xtp)
 {
 	struct tcpcb *tp = xtp;
 	struct tcptemp *t_template;
 	struct inpcb *inp;
 	CURVNET_SET(tp->t_vnet);
 #ifdef TCPDEBUG
 	int ostate;
 
 	ostate = tp->t_state;
 #endif
 	INP_INFO_RLOCK(&V_tcbinfo);
 	inp = tp->t_inpcb;
 	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
 	INP_WLOCK(inp);
 	/* Nothing to do if the callout is pending again or was deactivated. */
 	if (callout_pending(&tp->t_timers->tt_keep) ||
 	    !callout_active(&tp->t_timers->tt_keep)) {
 		INP_WUNLOCK(inp);
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 		return;
 	}
 	callout_deactivate(&tp->t_timers->tt_keep);
 	/* Nothing to do for an inpcb that has already been dropped. */
 	if ((inp->inp_flags & INP_DROPPED) != 0) {
 		INP_WUNLOCK(inp);
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 		return;
 	}
 	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
 		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
 	KASSERT((tp->t_timers->tt_flags & TT_KEEP) != 0,
 		("%s: tp %p keep callout should be running", __func__, tp));
 	/*
 	 * Keep-alive timer went off; send something
 	 * or drop connection if idle for too long.
 	 */
 	TCPSTAT_INC(tcps_keeptimeo);
 	if (tp->t_state < TCPS_ESTABLISHED)
 		goto dropit;
-	if ((always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
+	if ((tcp_always_keepalive ||
+	    inp->inp_socket->so_options & SO_KEEPALIVE) &&
 	    tp->t_state <= TCPS_CLOSING) {
 		if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
 			goto dropit;
 		/*
 		 * Send a packet designed to force a response
 		 * if the peer is up and reachable:
 		 * either an ACK if the connection is still alive,
 		 * or an RST if the peer has closed the connection
 		 * due to timeout or reboot.
 		 * Using sequence number tp->snd_una-1
 		 * causes the transmitted zero-length segment
 		 * to lie outside the receive window;
 		 * by the protocol spec, this requires the
 		 * correspondent TCP to respond.
 		 */
 		TCPSTAT_INC(tcps_keepprobe);
 		t_template = tcpip_maketemplate(inp);
 		/* The probe is skipped silently if no template was returned. */
 		if (t_template) {
 			tcp_respond(tp, t_template->tt_ipgen,
 				    &t_template->tt_t, (struct mbuf *)NULL,
 				    tp->rcv_nxt, tp->snd_una - 1, 0);
 			free(t_template, M_TEMP);
 		}
 		if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp),
 		    tcp_timer_keep, tp)) {
 			tp->t_timers->tt_flags &= ~TT_KEEP_RST;
 		}
 	} else if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp),
 		    tcp_timer_keep, tp)) {
 			tp->t_timers->tt_flags &= ~TT_KEEP_RST;
 		}
 
 #ifdef TCPDEBUG
 	if (inp->inp_socket->so_options & SO_DEBUG)
 		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
 			  PRU_SLOWTIMO);
 #endif
 	INP_WUNLOCK(inp);
 	INP_INFO_RUNLOCK(&V_tcbinfo);
 	CURVNET_RESTORE();
 	return;
 
 dropit:
 	TCPSTAT_INC(tcps_keepdrops);
 	tp = tcp_drop(tp, ETIMEDOUT);
 
 #ifdef TCPDEBUG
 	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
 		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
 			  PRU_SLOWTIMO);
 #endif
 	/* The inpcb is unlocked here only if tcp_drop() left tp non-NULL. */
 	if (tp != NULL)
 		INP_WUNLOCK(tp->t_inpcb);
 	INP_INFO_RUNLOCK(&V_tcbinfo);
 	CURVNET_RESTORE();
 }
 
 /*
  * Persist timer callout handler.
  *
  * xtp is the tcpcb the callout was armed with.  The handler takes the
  * tcbinfo read lock and the inpcb write lock, returns early if the callout
  * has been rescheduled or stopped in the meantime or if the inpcb has been
  * dropped, and otherwise either drops the connection (no response for too
  * long) or re-arms the persist timer and forces tcp_output() to run with
  * TF_FORCEDATA set.
  */
 void
 tcp_timer_persist(void *xtp)
 {
 	struct tcpcb *tp = xtp;
 	struct inpcb *inp;
 	CURVNET_SET(tp->t_vnet);
 #ifdef TCPDEBUG
 	int ostate;
 
 	ostate = tp->t_state;
 #endif
 	INP_INFO_RLOCK(&V_tcbinfo);
 	inp = tp->t_inpcb;
 	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
 	INP_WLOCK(inp);
 	/*
 	 * The callout was reset (pending again) or stopped (no longer
 	 * active) while we were waiting for the locks: nothing to do.
 	 */
 	if (callout_pending(&tp->t_timers->tt_persist) ||
 	    !callout_active(&tp->t_timers->tt_persist)) {
 		INP_WUNLOCK(inp);
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 		return;
 	}
 	callout_deactivate(&tp->t_timers->tt_persist);
 	if ((inp->inp_flags & INP_DROPPED) != 0) {
 		INP_WUNLOCK(inp);
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 		return;
 	}
 	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
 		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
 	KASSERT((tp->t_timers->tt_flags & TT_PERSIST) != 0,
 		("%s: tp %p persist callout should be running", __func__, tp));
 	/*
 	 * Persistence timer into zero window.
 	 * Force a byte to be output, if possible.
 	 */
 	TCPSTAT_INC(tcps_persisttimeo);
 	/*
 	 * Hack: if the peer is dead/unreachable, we do not
 	 * time out if the window is closed.  After a full
 	 * backoff, drop the connection if the idle time
 	 * (no responses to probes) reaches the maximum
 	 * backoff that we would use if retransmitting.
 	 */
 	if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
 	    (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
 	     ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
 		TCPSTAT_INC(tcps_persistdrop);
 		tp = tcp_drop(tp, ETIMEDOUT);
 		goto out;
 	}
 	/*
 	 * If the user has closed the socket then drop a persisting
 	 * connection after a much reduced timeout.
 	 */
 	if (tp->t_state > TCPS_CLOSE_WAIT &&
 	    (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
 		TCPSTAT_INC(tcps_persistdrop);
 		tp = tcp_drop(tp, ETIMEDOUT);
 		goto out;
 	}
 	tcp_setpersist(tp);
 	/* TF_FORCEDATA is set only for the duration of this output call. */
 	tp->t_flags |= TF_FORCEDATA;
 	(void) tcp_output(tp);
 	tp->t_flags &= ~TF_FORCEDATA;
 
 out:
 #ifdef TCPDEBUG
 	if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
 		tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO);
 #endif
 	/*
 	 * tp is NULL here only when tcp_drop() returned NULL; in that case
 	 * the inpcb lock is not released by this function.
 	 */
 	if (tp != NULL)
 		INP_WUNLOCK(inp);
 	INP_INFO_RUNLOCK(&V_tcbinfo);
 	CURVNET_RESTORE();
 }
 
 /*
  * Retransmit timer callout handler.
  *
  * xtp is the tcpcb the callout was armed with.  After the usual
  * "rescheduled / stopped / dropped" checks, the handler bumps the
  * retransmit shift, drops the connection once the shift exceeds
  * TCP_MAXRXTSHIFT, and otherwise recomputes the retransmit timeout,
  * optionally runs the PMTU black-hole heuristics, rewinds snd_nxt to
  * snd_una, signals CC_RTO to the congestion control module and calls
  * tcp_output().
  *
  * The tcbinfo read lock is held only until the drop decision has been made;
  * 'headlocked' records whether it still has to be released at 'out'.
  */
 void
 tcp_timer_rexmt(void * xtp)
 {
 	struct tcpcb *tp = xtp;
 	CURVNET_SET(tp->t_vnet);
 	int rexmt;
 	int headlocked;
 	struct inpcb *inp;
 #ifdef TCPDEBUG
 	int ostate;
 
 	ostate = tp->t_state;
 #endif
 
 	INP_INFO_RLOCK(&V_tcbinfo);
 	inp = tp->t_inpcb;
 	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
 	INP_WLOCK(inp);
 	/*
 	 * The callout was reset (pending again) or stopped (no longer
 	 * active) while we were waiting for the locks: nothing to do.
 	 */
 	if (callout_pending(&tp->t_timers->tt_rexmt) ||
 	    !callout_active(&tp->t_timers->tt_rexmt)) {
 		INP_WUNLOCK(inp);
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 		return;
 	}
 	callout_deactivate(&tp->t_timers->tt_rexmt);
 	if ((inp->inp_flags & INP_DROPPED) != 0) {
 		INP_WUNLOCK(inp);
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 		return;
 	}
 	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
 		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
 	KASSERT((tp->t_timers->tt_flags & TT_REXMT) != 0,
 		("%s: tp %p rexmt callout should be running", __func__, tp));
 	tcp_free_sackholes(tp);
 	/*
 	 * Retransmission timer went off.  Message has not
 	 * been acked within retransmit interval.  Back off
 	 * to a longer retransmit interval and retransmit one segment.
 	 */
 	if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
 		tp->t_rxtshift = TCP_MAXRXTSHIFT;
 		TCPSTAT_INC(tcps_timeoutdrop);
 
 		tp = tcp_drop(tp, tp->t_softerror ?
 			      tp->t_softerror : ETIMEDOUT);
 		/* The tcbinfo lock is still held on this path. */
 		headlocked = 1;
 		goto out;
 	}
 	/* Not dropping: the tcbinfo lock is no longer needed. */
 	INP_INFO_RUNLOCK(&V_tcbinfo);
 	headlocked = 0;
 	if (tp->t_state == TCPS_SYN_SENT) {
 		/*
 		 * If the SYN was retransmitted, indicate CWND to be
 		 * limited to 1 segment in cc_conn_init().
 		 */
 		tp->snd_cwnd = 1;
 	} else if (tp->t_rxtshift == 1) {
 		/*
 		 * first retransmit; record ssthresh and cwnd so they can
 		 * be recovered if this turns out to be a "bad" retransmit.
 		 * A retransmit is considered "bad" if an ACK for this
 		 * segment is received within RTT/2 interval; the assumption
 		 * here is that the ACK was already in flight.  See
 		 * "On Estimating End-to-End Network Path Properties" by
 		 * Allman and Paxson for more details.
 		 */
 		tp->snd_cwnd_prev = tp->snd_cwnd;
 		tp->snd_ssthresh_prev = tp->snd_ssthresh;
 		tp->snd_recover_prev = tp->snd_recover;
 		if (IN_FASTRECOVERY(tp->t_flags))
 			tp->t_flags |= TF_WASFRECOVERY;
 		else
 			tp->t_flags &= ~TF_WASFRECOVERY;
 		if (IN_CONGRECOVERY(tp->t_flags))
 			tp->t_flags |= TF_WASCRECOVERY;
 		else
 			tp->t_flags &= ~TF_WASCRECOVERY;
 		tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
 		tp->t_flags |= TF_PREVVALID;
 	} else
 		tp->t_flags &= ~TF_PREVVALID;
 	TCPSTAT_INC(tcps_rexmttimeo);
 	/*
 	 * Handshake states back off from the fixed TCPTV_RTOBASE using the
 	 * SYN backoff table; all other states scale the measured RTO.
 	 */
 	if ((tp->t_state == TCPS_SYN_SENT) ||
 	    (tp->t_state == TCPS_SYN_RECEIVED))
 		rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift];
 	else
 		rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
 	TCPT_RANGESET(tp->t_rxtcur, rexmt,
 		      tp->t_rttmin, TCPTV_REXMTMAX);
 
 	/*
 	 * We enter the path for PLMTUD if connection is established or, if
 	 * connection is FIN_WAIT_1 status, reason for the last is that if
 	 * amount of data we send is very small, we could send it in couple of
 	 * packets and process straight to FIN. In that case we won't catch
 	 * ESTABLISHED state.
 	 */
 	if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED))
 	    || (tp->t_state == TCPS_FIN_WAIT_1))) {
 		int optlen;
 #ifdef INET6
 		int isipv6;
 #endif
 
 		/*
 		 * Idea here is that at each stage of mtu probe (usually, 1448
 		 * -> 1188 -> 524) should be given 2 chances to recover before
 		 *  further clamping down. 'tp->t_rxtshift % 2 == 0' should
 		 *  take care of that.
 		 */
 		if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) ==
 		    (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) &&
 		    (tp->t_rxtshift >= 2 && tp->t_rxtshift % 2 == 0)) {
 			/*
 			 * Enter Path MTU Black-hole Detection mechanism:
 			 * - Disable Path MTU Discovery (IP "DF" bit).
 			 * - Reduce MTU to lower value than what we
 			 *   negotiated with peer.
 			 */
 			/* Record that we may have found a black hole. */
 			tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
 
 			/* Keep track of previous MSS. */
 			optlen = tp->t_maxopd - tp->t_maxseg;
 			tp->t_pmtud_saved_maxopd = tp->t_maxopd;
 
 			/*
 			 * Reduce the MSS to blackhole value or to the default
 			 * in an attempt to retransmit.
 			 */
 #ifdef INET6
 			isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0;
 			if (isipv6 &&
 			    tp->t_maxopd > V_tcp_v6pmtud_blackhole_mss) {
 				/* Use the sysctl tuneable blackhole MSS. */
 				tp->t_maxopd = V_tcp_v6pmtud_blackhole_mss;
 				V_tcp_pmtud_blackhole_activated++;
 			} else if (isipv6) {
 				/* Use the default MSS. */
 				tp->t_maxopd = V_tcp_v6mssdflt;
 				/*
 				 * Disable Path MTU Discovery when we switch to
 				 * minmss.
 				 */
 				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
 				V_tcp_pmtud_blackhole_activated_min_mss++;
 			}
 #endif
 			/*
 			 * With both INET6 and INET compiled in, the IPv4
 			 * branch below is the final 'else' of the IPv6 chain.
 			 */
 #if defined(INET6) && defined(INET)
 			else
 #endif
 #ifdef INET
 			if (tp->t_maxopd > V_tcp_pmtud_blackhole_mss) {
 				/* Use the sysctl tuneable blackhole MSS. */
 				tp->t_maxopd = V_tcp_pmtud_blackhole_mss;
 				V_tcp_pmtud_blackhole_activated++;
 			} else {
 				/* Use the default MSS. */
 				tp->t_maxopd = V_tcp_mssdflt;
 				/*
 				 * Disable Path MTU Discovery when we switch to
 				 * minmss.
 				 */
 				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
 				V_tcp_pmtud_blackhole_activated_min_mss++;
 			}
 #endif
 			tp->t_maxseg = tp->t_maxopd - optlen;
 			/*
 			 * Reset the slow-start flight size
 			 * as it may depend on the new MSS.
 			 */
 			if (CC_ALGO(tp)->conn_init != NULL)
 				CC_ALGO(tp)->conn_init(tp->ccv);
 		} else {
 			/*
 			 * If further retransmissions are still unsuccessful
 			 * with a lowered MTU, maybe this isn't a blackhole and
 			 * we restore the previous MSS and blackhole detection
 			 * flags.
 			 * The limit '6' is determined by giving each probe
 			 * stage (1448, 1188, 524) 2 chances to recover.
 			 */
 			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
 			    (tp->t_rxtshift > 6)) {
 				tp->t_flags2 |= TF2_PLPMTU_PMTUD;
 				tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
 				optlen = tp->t_maxopd - tp->t_maxseg;
 				tp->t_maxopd = tp->t_pmtud_saved_maxopd;
 				tp->t_maxseg = tp->t_maxopd - optlen;
 				V_tcp_pmtud_blackhole_failed++;
 				/*
 				 * Reset the slow-start flight size as it
 				 * may depend on the new MSS.
 				 */
 				if (CC_ALGO(tp)->conn_init != NULL)
 					CC_ALGO(tp)->conn_init(tp->ccv);
 			}
 		}
 	}
 
 	/*
 	 * Disable RFC1323 and SACK if we haven't got any response to
 	 * our third SYN to work-around some broken terminal servers
 	 * (most of which have hopefully been retired) that have bad VJ
 	 * header compression code which trashes TCP segments containing
 	 * unknown-to-them TCP options.
 	 */
 	if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
 	    (tp->t_rxtshift == 3))
 		tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT);
 	/*
 	 * If we backed off this far, our srtt estimate is probably bogus.
 	 * Clobber it so we'll take the next rtt measurement as our srtt;
 	 * move the current srtt into rttvar to keep the current
 	 * retransmit times until then.
 	 */
 	if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
 #ifdef INET6
 		if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
 			in6_losing(tp->t_inpcb);
 #endif
 		tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
 		tp->t_srtt = 0;
 	}
 	/* Restart transmission from the oldest unacknowledged byte. */
 	tp->snd_nxt = tp->snd_una;
 	tp->snd_recover = tp->snd_max;
 	/*
 	 * Force a segment to be sent.
 	 */
 	tp->t_flags |= TF_ACKNOW;
 	/*
 	 * If timing a segment in this window, stop the timer.
 	 */
 	tp->t_rtttime = 0;
 
 	cc_cong_signal(tp, NULL, CC_RTO);
 
 	(void) tcp_output(tp);
 
 out:
 #ifdef TCPDEBUG
 	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
 		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
 			  PRU_SLOWTIMO);
 #endif
 	/*
 	 * tp is NULL here only when tcp_drop() returned NULL; in that case
 	 * the inpcb lock is not released by this function.
 	 */
 	if (tp != NULL)
 		INP_WUNLOCK(inp);
 	if (headlocked)
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 	CURVNET_RESTORE();
 }
 
 /*
  * Arm or disarm one of the TCP timers of a connection.
  *
  * tp:		connection whose timer is manipulated.
  * timer_type:	exactly one of TT_DELACK, TT_REXMT, TT_PERSIST, TT_KEEP or
  *		TT_2MSL; any other value panics.
  * delta:	timeout handed to the callout; 0 means "stop the timer".
  *
  * The call is a no-op for offloaded connections (TF_TOE, when built with
  * TCP_OFFLOAD) and once the timers have been marked TT_STOPPED.
  */
 void
 tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta)
 {
 	struct callout *t_callout;
 	timeout_t *f_callout;
 	struct inpcb *inp = tp->t_inpcb;
 	int cpu = INP_CPU(inp);
 	uint32_t f_reset;
 
 #ifdef TCP_OFFLOAD
 	if (tp->t_flags & TF_TOE)
 		return;
 #endif
 
 	if (tp->t_timers->tt_flags & TT_STOPPED)
 		return;
 
 	/* Map the timer type to its callout, handler and *_RST flag. */
 	switch (timer_type) {
 		case TT_DELACK:
 			t_callout = &tp->t_timers->tt_delack;
 			f_callout = tcp_timer_delack;
 			f_reset = TT_DELACK_RST;
 			break;
 		case TT_REXMT:
 			t_callout = &tp->t_timers->tt_rexmt;
 			f_callout = tcp_timer_rexmt;
 			f_reset = TT_REXMT_RST;
 			break;
 		case TT_PERSIST:
 			t_callout = &tp->t_timers->tt_persist;
 			f_callout = tcp_timer_persist;
 			f_reset = TT_PERSIST_RST;
 			break;
 		case TT_KEEP:
 			t_callout = &tp->t_timers->tt_keep;
 			f_callout = tcp_timer_keep;
 			f_reset = TT_KEEP_RST;
 			break;
 		case TT_2MSL:
 			t_callout = &tp->t_timers->tt_2msl;
 			f_callout = tcp_timer_2msl;
 			f_reset = TT_2MSL_RST;
 			break;
 		default:
 			panic("tp %p bad timer_type %#x", tp, timer_type);
 		}
 	if (delta == 0) {
 		/*
 		 * Stop request: the type and *_RST flags are cleared only
 		 * when the timer was marked armed, callout_stop() reported
 		 * success and the *_RST flag was still set.
 		 */
 		if ((tp->t_timers->tt_flags & timer_type) &&
 		    callout_stop(t_callout) &&
 		    (tp->t_timers->tt_flags & f_reset)) {
 			tp->t_timers->tt_flags &= ~(timer_type | f_reset);
 		}
 	} else {
 		if ((tp->t_timers->tt_flags & timer_type) == 0) {
 			/* First arming: record it and pin to the inp's CPU. */
 			tp->t_timers->tt_flags |= (timer_type | f_reset);
 			callout_reset_on(t_callout, delta, f_callout, tp, cpu);
 		} else {
 			/* Reset already running callout on the same CPU. */
 			if (!callout_reset(t_callout, delta, f_callout, tp)) {
 				/*
 				 * Callout not cancelled, consider it as not
 				 * properly restarted.
 				 */
 				tp->t_timers->tt_flags &= ~f_reset;
 			}
 		}
 	}
 }
 
 /*
  * Report whether the callout backing the given TCP timer is active.
  *
  * timer_type must be one of TT_DELACK, TT_REXMT, TT_PERSIST, TT_KEEP or
  * TT_2MSL; anything else panics.  The return value is whatever
  * callout_active() yields for the selected callout.
  */
 int
 tcp_timer_active(struct tcpcb *tp, uint32_t timer_type)
 {
 
 	switch (timer_type) {
 	case TT_DELACK:
 		return (callout_active(&tp->t_timers->tt_delack));
 	case TT_REXMT:
 		return (callout_active(&tp->t_timers->tt_rexmt));
 	case TT_PERSIST:
 		return (callout_active(&tp->t_timers->tt_persist));
 	case TT_KEEP:
 		return (callout_active(&tp->t_timers->tt_keep));
 	case TT_2MSL:
 		return (callout_active(&tp->t_timers->tt_2msl));
 	default:
 		break;
 	}
 	/* Unknown timer type: same fatal diagnostic as the other helpers. */
 	panic("tp %p bad timer_type %#x", tp, timer_type);
 }
 
 /*
  * Permanently stop one of the TCP timers of a connection.
  *
  * Marks the whole timer set TT_STOPPED, then tries to cancel the callout
  * selected by timer_type (one of TT_DELACK, TT_REXMT, TT_PERSIST, TT_KEEP,
  * TT_2MSL; any other value panics).  If the timer was armed but could not
  * be cancelled cleanly, the callout is rescheduled one tick out with the
  * matching *_discard handler instead.
  */
 void
 tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type)
 {
 	struct callout *t_callout;
 	timeout_t *f_callout;
 	uint32_t f_reset;
 
 	tp->t_timers->tt_flags |= TT_STOPPED;
 
 	/* Map the timer type to its callout, discard handler and flag. */
 	switch (timer_type) {
 		case TT_DELACK:
 			t_callout = &tp->t_timers->tt_delack;
 			f_callout = tcp_timer_delack_discard;
 			f_reset = TT_DELACK_RST;
 			break;
 		case TT_REXMT:
 			t_callout = &tp->t_timers->tt_rexmt;
 			f_callout = tcp_timer_rexmt_discard;
 			f_reset = TT_REXMT_RST;
 			break;
 		case TT_PERSIST:
 			t_callout = &tp->t_timers->tt_persist;
 			f_callout = tcp_timer_persist_discard;
 			f_reset = TT_PERSIST_RST;
 			break;
 		case TT_KEEP:
 			t_callout = &tp->t_timers->tt_keep;
 			f_callout = tcp_timer_keep_discard;
 			f_reset = TT_KEEP_RST;
 			break;
 		case TT_2MSL:
 			t_callout = &tp->t_timers->tt_2msl;
 			f_callout = tcp_timer_2msl_discard;
 			f_reset = TT_2MSL_RST;
 			break;
 		default:
 			panic("tp %p bad timer_type %#x", tp, timer_type);
 		}
 
 	/* Only timers that were ever armed need any work. */
 	if (tp->t_timers->tt_flags & timer_type) {
 		if (callout_stop(t_callout) &&
 		    (tp->t_timers->tt_flags & f_reset)) {
 			tp->t_timers->tt_flags &= ~(timer_type | f_reset);
 		} else {
 			/*
 			 * Can't stop the callout, defer tcpcb actual deletion
 			 * to the last tcp timer discard callout.
 			 * The TT_STOPPED flag will ensure that no tcp timer
 			 * callouts can be restarted on our behalf, and
 			 * past this point currently running callouts waiting
 			 * on inp lock will return right away after the
 			 * classical check for callout reset/stop events:
 			 * callout_pending() || !callout_active()
 			 */
 			callout_reset(t_callout, 1, f_callout, tp);
 		}
 	}
 }
 
 /* Convert a tick count into milliseconds using the tick rate 'hz'. */
 #define	ticks_to_msecs(t)	(1000*(t) / hz)
 
 /*
  * Fill an xtcp_timer record from a connection's timer set.
  *
  * The record is always zeroed first.  When 'timer' is NULL nothing else is
  * written.  Otherwise, for every callout that is currently active, the
  * difference between its c_time and the current uptime is stored in
  * milliseconds, and t_rcvtime receives the time since tp->t_rcvtime, also
  * in milliseconds.
  */
 void
 tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer,
     struct xtcp_timer *xtimer)
 {
 	sbintime_t uptime;
 
 	bzero(xtimer, sizeof(*xtimer));
 	if (timer != NULL) {
 		uptime = getsbinuptime();
 		if (callout_active(&timer->tt_delack))
 			xtimer->tt_delack =
 			    (timer->tt_delack.c_time - uptime) / SBT_1MS;
 		if (callout_active(&timer->tt_rexmt))
 			xtimer->tt_rexmt =
 			    (timer->tt_rexmt.c_time - uptime) / SBT_1MS;
 		if (callout_active(&timer->tt_persist))
 			xtimer->tt_persist =
 			    (timer->tt_persist.c_time - uptime) / SBT_1MS;
 		if (callout_active(&timer->tt_keep))
 			xtimer->tt_keep =
 			    (timer->tt_keep.c_time - uptime) / SBT_1MS;
 		if (callout_active(&timer->tt_2msl))
 			xtimer->tt_2msl =
 			    (timer->tt_2msl.c_time - uptime) / SBT_1MS;
 		xtimer->t_rcvtime = ticks_to_msecs(ticks - tp->t_rcvtime);
 	}
 }
Index: stable/10/sys/netinet/tcp_timer.h
===================================================================
--- stable/10/sys/netinet/tcp_timer.h	(revision 330302)
+++ stable/10/sys/netinet/tcp_timer.h	(revision 330303)
@@ -1,213 +1,214 @@
 /*-
  * Copyright (c) 1982, 1986, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)tcp_timer.h	8.1 (Berkeley) 6/10/93
  * $FreeBSD$
  */
 
 #ifndef _NETINET_TCP_TIMER_H_
 #define _NETINET_TCP_TIMER_H_
 
 /*
  * The TCPT_REXMT timer is used to force retransmissions.
  * The TCP has the TCPT_REXMT timer set whenever segments
  * have been sent for which ACKs are expected but not yet
  * received.  If an ACK is received which advances tp->snd_una,
  * then the retransmit timer is cleared (if there are no more
  * outstanding segments) or reset to the base value (if there
  * are more ACKs expected).  Whenever the retransmit timer goes off,
  * we retransmit one unacknowledged segment, and do a backoff
  * on the retransmit timer.
  *
  * The TCPT_PERSIST timer is used to keep window size information
  * flowing even if the window goes shut.  If all previous transmissions
  * have been acknowledged (so that there are no retransmissions in progress),
  * and the window is too small to bother sending anything, then we start
  * the TCPT_PERSIST timer.  When it expires, if the window is nonzero,
  * we go to transmit state.  Otherwise, at intervals send a single byte
  * into the peer's window to force him to update our window information.
  * We do this at most as often as TCPT_PERSMIN time intervals,
  * but no more frequently than the current estimate of round-trip
  * packet time.  The TCPT_PERSIST timer is cleared whenever we receive
  * a window update from the peer.
  *
  * The TCPT_KEEP timer is used to keep connections alive.  If a
  * connection is idle (no segments received) for TCPTV_KEEP_INIT amount of time,
  * but not yet established, then we drop the connection.  Once the connection
  * is established, if the connection is idle for TCPTV_KEEP_IDLE time
  * (and keepalives have been enabled on the socket), we begin to probe
  * the connection.  We force the peer to send us a segment by sending:
  *	<SEQ=SND.UNA-1><ACK=RCV.NXT><CTL=ACK>
  * This segment is (deliberately) outside the window, and should elicit
  * an ack segment in response from the peer.  If, despite the TCPT_KEEP
  * initiated segments we cannot elicit a response from a peer in TCPT_MAXIDLE
  * amount of time probing, then we drop the connection.
  */
 
 /*
  * Time constants.
  */
 #define	TCPTV_MSL	( 30*hz)		/* max seg lifetime (hah!) */
 #define	TCPTV_SRTTBASE	0			/* base roundtrip time;
 						   if 0, no idea yet */
 #define	TCPTV_RTOBASE	(  3*hz)		/* assumed RTO if no info */
 #define	TCPTV_SRTTDFLT	(  3*hz)		/* assumed RTT if no info */
 
 #define	TCPTV_PERSMIN	(  5*hz)		/* minimum persist interval */
 #define	TCPTV_PERSMAX	( 60*hz)		/* maximum persist interval */
 
 #define	TCPTV_KEEP_INIT	( 75*hz)		/* initial connect keepalive */
 #define	TCPTV_KEEP_IDLE	(120*60*hz)		/* dflt time before probing */
 #define	TCPTV_KEEPINTVL	( 75*hz)		/* default probe interval */
 #define	TCPTV_KEEPCNT	8			/* max probes before drop */
 
 #define TCPTV_FINWAIT2_TIMEOUT (60*hz)         /* FIN_WAIT_2 timeout if no receiver */
 
 /*
  * Minimum retransmit timer is 3 ticks, for algorithmic stability.
  * TCPT_RANGESET() will add another TCPTV_CPU_VAR to deal with
  * the expected worst-case processing variances by the kernels
  * representing the end points.  Such variances do not always show
  * up in the srtt because the timestamp is often calculated at
  * the interface rather than at the TCP layer.  This value is
  * typically 50ms.  However, it is also possible that delayed
  * acks (typically 100ms) could create issues so we set the slop
  * to 200ms to try to cover it.  Note that, properly speaking,
  * delayed-acks should not create a major issue for interactive
  * environments which 'P'ush the last segment, at least as
  * long as implementations do the required 'at least one ack
  * for every two packets' for the non-interactive streaming case.
  * (maybe the RTO calculation should use 2*RTT instead of RTT
  * to handle the ack-every-other-packet case).
  *
  * The prior minimum of 1*hz (1 second) badly breaks throughput on any
  * networks faster than a modem that has minor (e.g. 1%) packet loss.
  */
 #define	TCPTV_MIN	( hz/33 )		/* minimum allowable value */
 #define TCPTV_CPU_VAR	( hz/5 )		/* cpu variance allowed (200ms) */
 #define	TCPTV_REXMTMAX	( 64*hz)		/* max allowable REXMT value */
 
 #define TCPTV_TWTRUNC	8			/* RTO factor to truncate TW */
 
 #define	TCP_LINGERTIME	120			/* linger at most 2 minutes */
 
 #define	TCP_MAXRXTSHIFT	12			/* maximum retransmits */
 
 #define	TCPTV_DELACK	( hz/10 )		/* 100ms timeout */
 
 #ifdef	TCPTIMERS
 static const char *tcptimers[] =
     { "REXMT", "PERSIST", "KEEP", "2MSL", "DELACK" };
 #endif
 
 /*
  * Force a time value to be in a certain range.
  *
  * Stores (value + tcp_rexmit_slop) in tv, then raises it to tvmin and caps
  * it at tvmax.  Both comparisons are done after casting to u_long.  Every
  * argument is expanded more than once, so callers must not pass expressions
  * with side effects.
  */
 #define	TCPT_RANGESET(tv, value, tvmin, tvmax) do { \
 	(tv) = (value) + tcp_rexmit_slop; \
 	if ((u_long)(tv) < (u_long)(tvmin)) \
 		(tv) = (tvmin); \
 	if ((u_long)(tv) > (u_long)(tvmax)) \
 		(tv) = (tvmax); \
 } while(0)
 
 #ifdef _KERNEL
 
 struct xtcp_timer;
 
 /*
  * Per-connection set of TCP timers: one callout per timer type plus a
  * flags word holding the TT_* bits defined below.
  */
 struct tcp_timer {
 	struct	callout tt_rexmt;	/* retransmit timer */
 	struct	callout tt_persist;	/* retransmit persistence */
 	struct	callout tt_keep;	/* keepalive */
 	struct	callout tt_2msl;	/* 2*msl TIME_WAIT timer */
 	struct	callout tt_delack;	/* delayed ACK timer */
 	uint32_t	tt_flags;	/* Timers flags */
 	uint32_t	tt_spare;	/* TBD */
 };
 
 /*
  * Flags for the tt_flags field.
  */
 #define TT_DELACK	0x0001
 #define TT_REXMT	0x0002
 #define TT_PERSIST	0x0004
 #define TT_KEEP		0x0008
 #define TT_2MSL		0x0010
 #define TT_MASK		(TT_DELACK|TT_REXMT|TT_PERSIST|TT_KEEP|TT_2MSL)
 
 #define TT_DELACK_RST	0x0100
 #define TT_REXMT_RST	0x0200
 #define TT_PERSIST_RST	0x0400
 #define TT_KEEP_RST	0x0800
 #define TT_2MSL_RST	0x1000
 
 #define TT_STOPPED	0x00010000
 
 /*
  * Effective keepalive parameters for a connection: the per-tcpcb value when
  * it is non-zero, otherwise the corresponding global default.  TP_MAXIDLE
  * is the probe count multiplied by the probe interval.
  */
 #define	TP_KEEPINIT(tp)	((tp)->t_keepinit ? (tp)->t_keepinit : tcp_keepinit)
 #define	TP_KEEPIDLE(tp)	((tp)->t_keepidle ? (tp)->t_keepidle : tcp_keepidle)
 #define	TP_KEEPINTVL(tp) ((tp)->t_keepintvl ? (tp)->t_keepintvl : tcp_keepintvl)
 #define	TP_KEEPCNT(tp)	((tp)->t_keepcnt ? (tp)->t_keepcnt : tcp_keepcnt)
 #define	TP_MAXIDLE(tp)	(TP_KEEPCNT(tp) * TP_KEEPINTVL(tp))
 
 extern int tcp_persmin;			/* minimum persist interval */
 extern int tcp_persmax;			/* maximum persist interval */
 extern int tcp_keepinit;		/* time to establish connection */
 extern int tcp_keepidle;		/* time before keepalive probes begin */
 extern int tcp_keepintvl;		/* time between keepalive probes */
 extern int tcp_keepcnt;			/* number of keepalives */
 extern int tcp_delacktime;		/* time before sending a delayed ACK */
 extern int tcp_maxpersistidle;
 extern int tcp_rexmit_min;
 extern int tcp_rexmit_slop;
 extern int tcp_msl;
 extern int tcp_ttl;			/* time to live for TCP segs */
 extern int tcp_backoff[];
 extern int tcp_syn_backoff[];
 
+extern int tcp_always_keepalive;
 extern int tcp_finwait2_timeout;
 extern int tcp_fast_finwait2_recycle;
 
 void	tcp_timer_init(void);
 void	tcp_timer_2msl(void *xtp);
 struct tcptw *
 	tcp_tw_2msl_scan(int reuse);	/* XXX temporary? */
 void	tcp_timer_keep(void *xtp);
 void	tcp_timer_persist(void *xtp);
 void	tcp_timer_rexmt(void *xtp);
 void	tcp_timer_delack(void *xtp);
 void	tcp_timer_2msl_discard(void *xtp);
 void	tcp_timer_keep_discard(void *xtp);
 void	tcp_timer_persist_discard(void *xtp);
 void	tcp_timer_rexmt_discard(void *xtp);
 void	tcp_timer_delack_discard(void *xtp);
 void	tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer,
 	struct xtcp_timer *xtimer);
 
 #endif /* _KERNEL */
 
 #endif /* !_NETINET_TCP_TIMER_H_ */
Index: stable/10
===================================================================
--- stable/10	(revision 330302)
+++ stable/10	(revision 330303)

Property changes on: stable/10
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head:r328608
Index: stable/11/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c
===================================================================
--- stable/11/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c	(revision 330302)
+++ stable/11/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c	(revision 330303)
@@ -1,1811 +1,1810 @@
 /*-
  * Copyright (c) 2012 Chelsio Communications, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 
 #ifdef TCP_OFFLOAD
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/fcntl.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/sockstate.h>
 #include <sys/sockopt.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sockbuf.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/protosw.h>
 #include <sys/priv.h>
 #include <sys/sglist.h>
 #include <sys/taskqueue.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/ethernet.h>
 #include <net/route.h>
 
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 
 #include <netinet/ip.h>
 #define TCPSTATES
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_var.h>
 #include <netinet/toecore.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <net/route.h>
 
 #include "cxgb_include.h"
 #include "ulp/tom/cxgb_l2t.h"
 #include "ulp/tom/cxgb_tom.h"
 #include "ulp/tom/cxgb_toepcb.h"
 
 VNET_DECLARE(int, tcp_do_autosndbuf);
 #define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf)
 VNET_DECLARE(int, tcp_autosndbuf_inc);
 #define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc)
 VNET_DECLARE(int, tcp_autosndbuf_max);
 #define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max)
 VNET_DECLARE(int, tcp_do_autorcvbuf);
 #define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf)
 VNET_DECLARE(int, tcp_autorcvbuf_inc);
 #define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc)
 VNET_DECLARE(int, tcp_autorcvbuf_max);
 #define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max)
-extern int always_keepalive;
 
 /*
  * For ULP connections HW may add headers, e.g., for digests, that aren't part
  * of the messages sent by the host but that are part of the TCP payload and
  * therefore consume TCP sequence space.  Tx connection parameters that
  * operate in TCP sequence space are affected by the HW additions and need to
  * compensate for them to accurately track TCP sequence numbers. This array
  * contains the compensating extra lengths for ULP packets.  It is indexed by
  * a packet's ULP submode.
  */
 const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};
 
 /*
  * Max receive window supported by HW in bytes.  Only a small part of it can
  * be set through option0, the rest needs to be set through RX_DATA_ACK.
  */
 #define MAX_RCV_WND ((1U << 27) - 1)
 
 /*
  * Min receive window.  We want it to be large enough to accommodate receive
  * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
  */
 #define MIN_RCV_WND (24 * 1024U)
 #define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS)
 
 static void t3_release_offload_resources(struct toepcb *);
 static void send_reset(struct toepcb *toep);
 
 /*
  * Called after the last CPL for the toepcb has been received.
  *
  * The inp must be wlocked on entry and is unlocked (or maybe destroyed) by the
  * time this function exits.
  *
  * Marks the toepcb TP_CPL_DONE, clears its inp back-pointer and removes it
  * from the tom_data toep_list.  Offload resources are released here only
  * when the toepcb is no longer TP_ATTACHED.
  *
  * Returns the result of in_pcbrele_wlocked(); the inp is explicitly
  * unlocked here only when that result is zero.
  */
 static int
 toepcb_release(struct toepcb *toep)
 {
 	struct inpcb *inp = toep->tp_inp;
 	struct toedev *tod = toep->tp_tod;
 	struct tom_data *td = t3_tomdata(tod);
 	int rc;
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(!(toep->tp_flags & TP_CPL_DONE),
 	    ("%s: double release?", __func__));
 
 	CTR2(KTR_CXGB, "%s: tid %d", __func__, toep->tp_tid);
 
 	toep->tp_flags |= TP_CPL_DONE;
 	toep->tp_inp = NULL;
 
 	mtx_lock(&td->toep_list_lock);
 	TAILQ_REMOVE(&td->toep_list, toep, link);
 	mtx_unlock(&td->toep_list_lock);
 
 	/* The tcpcb side has already detached: we are the last user. */
 	if (!(toep->tp_flags & TP_ATTACHED))
 		t3_release_offload_resources(toep);
 
 	/* 'inp' was saved above; toep->tp_inp is already NULL here. */
 	rc = in_pcbrele_wlocked(inp);
 	if (!rc)
 		INP_WUNLOCK(inp);
 	return (rc);
 }
 
 /*
  * One sided detach.  The tcpcb is going away and we need to unhook the toepcb
  * hanging off it.  If the TOE driver is also done with the toepcb we'll release
  * all offload resources.
  *
  * The inp must be non-NULL and write-locked by the caller; its tcpcb must
  * have a toepcb that is still marked TP_ATTACHED.  On return the tcpcb no
  * longer references the toepcb and has TF_TOE cleared.
  */
 static void
 toepcb_detach(struct inpcb *inp)
 {
 	struct toepcb *toep;
 	struct tcpcb *tp;
 
 	KASSERT(inp, ("%s: inp is NULL", __func__));
 	INP_WLOCK_ASSERT(inp);
 
 	tp = intotcpcb(inp);
 	toep = tp->t_toe;
 
 	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
 	KASSERT(toep->tp_flags & TP_ATTACHED, ("%s: not attached", __func__));
 
 	/* The id is logged as an "atid" while still in SYN_SENT. */
 	CTR6(KTR_CXGB, "%s: %s %u, toep %p, inp %p, tp %p", __func__,
 	    tp->t_state == TCPS_SYN_SENT ? "atid" : "tid", toep->tp_tid,
 	    toep, inp, tp);
 
 	/* Break the link in both directions. */
 	tp->t_toe = NULL;
 	tp->t_flags &= ~TF_TOE;
 	toep->tp_flags &= ~TP_ATTACHED;
 
 	/* The CPL side finished earlier, so nobody else will free it. */
 	if (toep->tp_flags & TP_CPL_DONE)
 		t3_release_offload_resources(toep);
 }
 
 /*
  * Detach entry point taking a tcpcb: forwards the tcpcb's inpcb to
  * toepcb_detach().  The toedev argument is unused.
  */
 void
 t3_pcb_detach(struct toedev *tod __unused, struct tcpcb *tp)
 {
 	struct inpcb *inp = tp->t_inpcb;
 
 	toepcb_detach(inp);
 }
 
 static int
 alloc_atid(struct tid_info *t, void *ctx)
 {
 	int atid = -1;
 
 	mtx_lock(&t->atid_lock);
 	if (t->afree) {
 		union active_open_entry *p = t->afree;
 
 		atid = (p - t->atid_tab) + t->atid_base;
 		t->afree = p->next;
 		p->ctx = ctx;
 		t->atids_in_use++;
 	}
 	mtx_unlock(&t->atid_lock);
 
 	return (atid);
 }
 
 /*
  * Return an atid to the free list.  The entry is looked up with
  * atid2entry() and pushed onto the head of the list under atid_lock, and
  * the in-use counter is decremented.
  */
 static void
 free_atid(struct tid_info *t, int atid)
 {
 	union active_open_entry *entry;
 
 	entry = atid2entry(t, atid);
 
 	mtx_lock(&t->atid_lock);
 	entry->next = t->afree;
 	t->afree = entry;
 	t->atids_in_use--;
 	mtx_unlock(&t->atid_lock);
 }
 
 /*
  * Store ctx in the tid table slot for 'tid' and atomically bump the count
  * of tids in use.
  */
 void
 insert_tid(struct tom_data *td, void *ctx, unsigned int tid)
 {
 
 	td->tid_maps.tid_tab[tid] = ctx;
 	atomic_add_int(&td->tid_maps.tids_in_use, 1);
 }
 
 /*
  * Overwrite the context stored in the tid table slot for 'tid'.  The
  * in-use count is left untouched.
  */
 void
 update_tid(struct tom_data *td, void *ctx, unsigned int tid)
 {
 
 	td->tid_maps.tid_tab[tid] = ctx;
 }
 
 /*
  * Clear the context stored for tid and drop the in-use TID counter.
  */
 void
 remove_tid(struct tom_data *td, unsigned int tid)
 {
 
 	td->tid_maps.tid_tab[tid] = NULL;
 	atomic_add_int(&td->tid_maps.tids_in_use, -1);
 }
 
 /* use ctx as a next pointer in the tid release list */
 void
 queue_tid_release(struct toedev *tod, unsigned int tid)
 {
 	struct tom_data *td = t3_tomdata(tod);
 	void **p = &td->tid_maps.tid_tab[tid];
 	struct adapter *sc = tod->tod_softc;
 
 	mtx_lock(&td->tid_release_lock);
 	*p = td->tid_release_list;
 	td->tid_release_list = p;
 	if (!*p)
 		taskqueue_enqueue(sc->tq, &td->tid_release_task);
 	mtx_unlock(&td->tid_release_lock);
 }
 
 /*
  * Fill in a TID_RELEASE work request for the given tid.  Only the WR opcode
  * word and the CPL opcode/TID word are written here.
  */
 static inline void
 mk_tid_release(struct cpl_tid_release *cpl, unsigned int tid)
 {
 
 	cpl->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
 	OPCODE_TID(cpl) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
 }
 
 void
 release_tid(struct toedev *tod, unsigned int tid, int qset)
 {
 	struct tom_data *td = t3_tomdata(tod);
 	struct adapter *sc = tod->tod_softc;
 	struct mbuf *m;
 	struct cpl_tid_release *cpl;
 #ifdef INVARIANTS
 	struct tid_info *t = &td->tid_maps;
 #endif
 
 	KASSERT(tid < t->ntids,
 	    ("%s: tid=%d, ntids=%d", __func__, tid, t->ntids));
 
 	m = M_GETHDR_OFLD(qset, CPL_PRIORITY_CONTROL, cpl);
 	if (m) {
 		mk_tid_release(cpl, tid);
 		t3_offload_tx(sc, m);
 		remove_tid(td, tid);
 	} else
 		queue_tid_release(tod, tid);
 
 }
 
 /*
  * Drain the deferred TID release list built by queue_tid_release().  The
  * (data, pending) signature matches a taskqueue handler; presumably this is
  * the function behind td->tid_release_task (the binding is not visible here).
  *
  * data is the tom_data whose list is processed; pending is unused.
  *
  * For each entry a TID_RELEASE request is sent on qset 0 and the tid_tab slot
  * is cleared.  tid_release_lock is dropped around the transmit and retaken
  * before the list is examined again.
  */
 void
 t3_process_tid_release_list(void *data, int pending)
 {
 	struct mbuf *m;
 	struct tom_data *td = data;
 	struct adapter *sc = td->tod.tod_softc;
 
 	mtx_lock(&td->tid_release_lock);
 	while (td->tid_release_list) {
 		void **p = td->tid_release_list;
 		/* The list links are tid_tab slots, so the slot index is the tid. */
 		unsigned int tid = p - td->tid_maps.tid_tab;
 		struct cpl_tid_release *cpl;
 
 		/* Unlink the head; *p holds the next slot (or NULL). */
 		td->tid_release_list = (void **)*p;
 		m = M_GETHDR_OFLD(0, CPL_PRIORITY_CONTROL, cpl); /* qs 0 here */
 		/*
 		 * NOTE(review): the entry was already unlinked above, so when
 		 * the allocation fails this tid is neither released nor left on
 		 * the list.  Confirm whether it should be put back.
 		 */
 		if (m == NULL)
 			break;	/* XXX: who reschedules the release task? */
 		mtx_unlock(&td->tid_release_lock);
 		mk_tid_release(cpl, tid);
 		t3_offload_tx(sc, m);
 		remove_tid(td, tid);
 		mtx_lock(&td->tid_release_lock);
 	}
 	mtx_unlock(&td->tid_release_lock);
 }
 
 /*
  * Send a CLOSE_CON_REQ for the connection unless one has been sent already
  * (TP_FIN_SENT).  The flag is set before the request is handed to the
  * hardware.
  */
 static void
 close_conn(struct adapter *sc, struct toepcb *toep)
 {
 	struct cpl_close_con_req *req;
 	struct mbuf *m;
 
 	if ((toep->tp_flags & TP_FIN_SENT) != 0)
 		return;
 
 	m = M_GETHDR_OFLD(toep->tp_qset, CPL_PRIORITY_DATA, req);
 	if (m == NULL)
 		CXGB_UNIMPLEMENTED();
 
 	/* Work request header, then the CPL itself. */
 	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
 	req->wr.wrh_lo = htonl(V_WR_TID(toep->tp_tid));
 	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, toep->tp_tid));
 	req->rsvd = 0;
 
 	toep->tp_flags |= TP_FIN_SENT;
 	t3_offload_tx(sc, m);
 }
 
 static inline void
 make_tx_data_wr(struct socket *so, struct tx_data_wr *req, int len,
     struct mbuf *tail)
 {
 	struct tcpcb *tp = so_sototcpcb(so);
 	struct toepcb *toep = tp->t_toe;
 	struct sockbuf *snd;
 
 	inp_lock_assert(tp->t_inpcb);
 	snd = so_sockbuf_snd(so);
 
 	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
 	req->wr.wrh_lo = htonl(V_WR_TID(toep->tp_tid));
 	/* len includes the length of any HW ULP additions */
 	req->len = htonl(len);
 	req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
 	/* V_TX_ULP_SUBMODE sets both the mode and submode */
 	req->flags = htonl(V_TX_ULP_SUBMODE(toep->tp_ulp_mode) | V_TX_URG(0) |
 	    V_TX_SHOVE(!(tp->t_flags & TF_MORETOCOME) && (tail ? 0 : 1)));
 	req->sndseq = htonl(tp->snd_nxt);
 	if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
 		struct adapter *sc = toep->tp_tod->tod_softc;
 		int cpu_idx = sc->rrss_map[toep->tp_qset];
 
 		req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
 		    V_TX_CPU_IDX(cpu_idx));
 
 		/* Sendbuffer is in units of 32KB. */
 		if (V_tcp_do_autosndbuf && snd->sb_flags & SB_AUTOSIZE) 
 			req->param |= htonl(V_TX_SNDBUF(VNET(tcp_autosndbuf_max) >> 15));
 		else
 			req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15));
 
 		toep->tp_flags |= TP_DATASENT;
 	}
 }
 
 /*
  * TOM_XXX_DUPLICATION sgl_len, calc_tx_descs, calc_tx_descs_ofld, mbuf_wrs, etc.
  * TOM_XXX_MOVE to some common header file.
  */
 /*
  * IMM_LEN: # of bytes that can be tx'd as immediate data.  There are 16 flits
  * in a tx desc; subtract 3 for tx_data_wr (including the WR header), and 1 more
  * for the second gen bit flit.  This leaves us with 12 flits.
  *
  * descs_to_sgllen: # of SGL entries that can fit into the given # of tx descs.
  * The first desc has a tx_data_wr (which includes the WR header), the rest have
  * the WR header only.  All descs have the second gen bit flit.
  *
  * sgllen_to_descs: # of tx descs used up by an sgl of given length.  The first
  * desc has a tx_data_wr (which includes the WR header), the rest have the WR
  * header only.  All descs have the second gen bit flit.
  *
  * flits_to_sgllen: # of SGL entries that can be fit in the given # of flits.
  *
  */
 /* The 12 flits left for immediate data (see above), i.e. 8 bytes per flit. */
 #define IMM_LEN 96
 /* Indexed by a tx descriptor count, 0 through TX_MAX_DESC. */
 static int descs_to_sgllen[TX_MAX_DESC + 1] = {0, 8, 17, 26, 35};
 /* Indexed by an SGL entry count; the reverse mapping of descs_to_sgllen. */
 static int sgllen_to_descs[TX_MAX_SEGS] = {
 	0, 1, 1, 1, 1, 1, 1, 1, 1, 2,	/*  0 -  9 */
 	2, 2, 2, 2, 2, 2, 2, 2, 3, 3,	/* 10 - 19 */
 	3, 3, 3, 3, 3, 3, 3, 4, 4, 4,	/* 20 - 29 */
 	4, 4, 4, 4, 4, 4		/* 30 - 35 */
 };
 /* Compiled out; kept for reference only. */
 #if 0
 static int flits_to_sgllen[TX_DESC_FLITS + 1] = {
 	0, 0, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 8, 8, 9, 10, 10
 };
 #endif
 /* The tables above assume a second generation-bit flit in every desc. */
 #if SGE_NUM_GENBITS != 2
 #error "SGE_NUM_GENBITS really must be 2"
 #endif
 
 /*
  * Hand as much unsent send-buffer data as possible to the hardware.
  *
  * so:             socket whose send buffer is to be pushed; its inp must be
  *                 locked (asserted below).
  * req_completion: when non-zero, request a completion on the WR that makes
  *                 the unacked count go from zero to non-zero.
  *
  * Each loop iteration builds one TX_DATA work request in a freshly allocated
  * header mbuf.  Small data is copied in as immediate data (whole mbufs, up to
  * IMM_LEN bytes); anything else is described by a scatter/gather list sized
  * to the WR credits still available.  The loop ends when credits or data run
  * out, or when an allocation fails.
  *
  * If everything was sent and TP_SEND_FIN is set, the connection is closed via
  * close_conn() after the send buffer lock has been dropped.
  *
  * Returns the number of payload bytes handed to the hardware by this call.
  */
 int
 t3_push_frames(struct socket *so, int req_completion)
 {
 	struct tcpcb *tp = so_sototcpcb(so);
 	struct toepcb *toep = tp->t_toe;
 	struct mbuf *m0, *sndptr, *m;
 	struct toedev *tod = toep->tp_tod;
 	struct adapter *sc = tod->tod_softc;
 	int bytes, ndesc, total_bytes = 0, mlen;
 	struct sockbuf *snd;
 	struct sglist *sgl;
 	struct ofld_hdr *oh;
 	caddr_t dst;
 	struct tx_data_wr *wr;
 
 	inp_lock_assert(tp->t_inpcb);
 
 	snd = so_sockbuf_snd(so);
 	SOCKBUF_LOCK(snd);
 
 	/*
 	 * Autosize the send buffer.
 	 */
 	if (snd->sb_flags & SB_AUTOSIZE && VNET(tcp_do_autosndbuf)) {
 		if (sbused(snd) >= (snd->sb_hiwat / 8 * 7) &&
 		    sbused(snd) < VNET(tcp_autosndbuf_max)) {
 			if (!sbreserve_locked(snd, min(snd->sb_hiwat +
 			    VNET(tcp_autosndbuf_inc), VNET(tcp_autosndbuf_max)),
 			    so, curthread))
 				snd->sb_flags &= ~SB_AUTOSIZE;
 		}
 	}
 
 	/*
 	 * Work out where to resume.  If the last push consumed the whole
 	 * buffer, sb_sndptr was parked on the (already sent) tail mbuf and
 	 * tp_m_last remembers that, so sending resumes at the mbuf after it.
 	 */
 	if (toep->tp_m_last && toep->tp_m_last == snd->sb_sndptr)
 		sndptr = toep->tp_m_last->m_next;
 	else
 		sndptr = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb;
 
 	/* Nothing to send or no WRs available for sending data */
 	if (toep->tp_wr_avail == 0 || sndptr == NULL)
 		goto out;
 
 	/* Something to send and at least 1 WR available */
 	while (toep->tp_wr_avail && sndptr != NULL) {
 
 		m0 = m_gethdr(M_NOWAIT, MT_DATA);
 		if (m0 == NULL)
 			break;
 		/* Layout in m0: ofld_hdr, tx_data_wr, then any immediate data. */
 		oh = mtod(m0, struct ofld_hdr *);
 		wr = (void *)(oh + 1);
 		dst = (void *)(wr + 1);
 
 		m0->m_pkthdr.len = m0->m_len = sizeof(*oh) + sizeof(*wr);
 		oh->flags = V_HDR_CTRL(CPL_PRIORITY_DATA) | F_HDR_DF |
 		    V_HDR_QSET(toep->tp_qset);
 
 		/*
 		 * Try to construct an immediate data WR if possible.  Stuff as
 		 * much data into it as possible, one whole mbuf at a time.
 		 */
 		mlen = sndptr->m_len;
 		ndesc = bytes = 0;
 		while (mlen <= IMM_LEN - bytes) {
 			bcopy(sndptr->m_data, dst, mlen);
 			bytes += mlen;
 			dst += mlen;
 
 			if (!(sndptr = sndptr->m_next))
 				break;
 			mlen = sndptr->m_len;
 		}
 
 		if (bytes) {
 
 			/* Was able to fit 'bytes' bytes in an immediate WR */
 
 			ndesc = 1;
 			make_tx_data_wr(so, wr, bytes, sndptr);
 
 			m0->m_len += bytes;
 			m0->m_pkthdr.len = m0->m_len;
 
 		} else {
 			int wr_avail = min(toep->tp_wr_avail, TX_MAX_DESC);
 
 			/* Need to make an SGL */
 
 			sgl = sglist_alloc(descs_to_sgllen[wr_avail], M_NOWAIT);
 			if (sgl == NULL) {
 				/* m0 has not been queued anywhere; free it. */
 				m_freem(m0);
 				break;
 			}
 
 			/* Append mbufs until the chain ends or the SGL is full. */
 			for (m = sndptr; m != NULL; m = m->m_next) {
 				if ((mlen = m->m_len) > 0) {
 					if (sglist_append(sgl, m->m_data, mlen))
 					    break;
 				}
 				bytes += mlen;
 			}
 			sndptr = m;
 			if (bytes == 0) {
 				/* Nothing fit; drop both the SGL and m0. */
 				sglist_free(sgl);
 				m_freem(m0);
 				break;
 			}
 			ndesc = sgllen_to_descs[sgl->sg_nseg];
 			oh->flags |= F_HDR_SGL;
 			oh->sgl = sgl;
 			make_tx_data_wr(so, wr, bytes, sndptr);
 		}
 
 		oh->flags |= V_HDR_NDESC(ndesc);
 		oh->plen = bytes;
 
 		/*
 		 * Advance the send pointer.  When the whole buffer has been
 		 * consumed, park sb_sndptr on the tail mbuf (backing the
 		 * offset out by its length) and remember it in tp_m_last.
 		 */
 		snd->sb_sndptr = sndptr;
 		snd->sb_sndptroff += bytes;
 		if (sndptr == NULL) {
 			snd->sb_sndptr = snd->sb_mbtail;
 			snd->sb_sndptroff -= snd->sb_mbtail->m_len;
 			toep->tp_m_last = snd->sb_mbtail;
 		} else
 			toep->tp_m_last = NULL;
 
 		total_bytes += bytes;
 
 		toep->tp_wr_avail -= ndesc;
 		toep->tp_wr_unacked += ndesc;
 
 		/*
 		 * Ask for a completion if the caller wants one for the first
 		 * unacked WR, or once half of the WR credits are unacked.
 		 */
 		if ((req_completion && toep->tp_wr_unacked == ndesc) ||
 		    toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
 			wr->wr.wrh_hi |= htonl(F_WR_COMPL);
 			toep->tp_wr_unacked = 0;
 		}
 
 		enqueue_wr(toep, m0);
 		l2t_send(sc, m0, toep->tp_l2t);
 	}
 out:
 	SOCKBUF_UNLOCK(snd);
 
 	if (sndptr == NULL && (toep->tp_flags & TP_SEND_FIN))
 		close_conn(sc, toep);
 
 	return (total_bytes);
 }
 
 /*
  * Return 'credits' receive credits to the hardware with an RX_DATA_ACK.
  *
  * Returns the number of credits sent, or 0 if no mbuf could be allocated for
  * the request.
  */
 static int
 send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits)
 {
 	struct cpl_rx_data_ack *req;
 	struct mbuf *m;
 	uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
 
 	m = M_GETHDR_OFLD(toep->tp_qset, CPL_PRIORITY_CONTROL, req);
 	if (m == NULL)
 		return (0);
 
 	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
 	req->wr.wrh_lo = 0;
 	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
 	req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
 
 	t3_offload_tx(sc, m);
 
 	return (credits);
 }
 
 /*
  * Account for data that has left the receive buffer since the last call and
  * return the accumulated credits to the hardware once enough have built up:
  * either at least 15KB of credits, or credits within 16KB of the current
  * receive window.
  *
  * The inp must be write-locked by the caller.
  */
 void
 t3_rcvd(struct toedev *tod, struct tcpcb *tp)
 {
 	struct adapter *sc = tod->tod_softc;
 	struct inpcb *inp = tp->t_inpcb;
 	struct socket *so = inp->inp_socket;
 	struct sockbuf *so_rcv = &so->so_rcv;
 	struct toepcb *toep = tp->t_toe;
 	int credits, must_send;
 
 	INP_WLOCK_ASSERT(inp);
 
 	/* Whatever is no longer in the sockbuf becomes new credits. */
 	SOCKBUF_LOCK(so_rcv);
 	KASSERT(toep->tp_enqueued >= sbused(so_rcv),
 	    ("%s: sbused(so_rcv) > enqueued", __func__));
 	toep->tp_rx_credits += toep->tp_enqueued - sbused(so_rcv);
 	toep->tp_enqueued = sbused(so_rcv);
 	SOCKBUF_UNLOCK(so_rcv);
 
 	must_send = toep->tp_rx_credits + 16384 >= tp->rcv_wnd;
 	if (!must_send && toep->tp_rx_credits < 15 * 1024)
 		return;
 
 	/* send_rx_credits() reports 0 if it could not send anything. */
 	credits = send_rx_credits(sc, toep, toep->tp_rx_credits);
 	toep->tp_rx_credits -= credits;
 	tp->rcv_wnd += credits;
 	tp->rcv_adv += credits;
 }
 
 /*
  * Handler for RX_URG_NOTIFY CPL messages.  The notification is only logged;
  * the mbuf is freed and 0 is returned.
  */
 static int
 do_rx_urg_notify(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
 {
 	struct adapter *sc = qs->adap;
 	struct tom_data *td = sc->tom_softc;
 	struct cpl_rx_urg_notify *hdr = mtod(m, void *);
 	unsigned int tid = GET_TID(hdr);
 	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
 
 	/* Terminate the message so the next log line starts on its own. */
 	log(LOG_ERR, "%s: tid %u inp %p\n", __func__, tid, toep->tp_inp);
 
 	m_freem(m);
 	return (0);
 }
 
 /*
  * Request that a FIN be sent on the connection.  TP_SEND_FIN is set and the
  * send path is run; t3_push_frames() issues the close once all data has
  * been pushed.  Requires the tcbinfo read lock and the inp write lock.
  * Always returns 0.
  */
 int
 t3_send_fin(struct toedev *tod, struct tcpcb *tp)
 {
 	struct inpcb *inp = tp->t_inpcb;
 	struct toepcb *toep = tp->t_toe;
 	struct socket *so = inp_inpcbtosocket(inp);
 #if defined(KTR)
 	unsigned int tid = toep->tp_tid;
 #endif
 
 	INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(inp);
 
 	CTR4(KTR_CXGB, "%s: tid %d, toep %p, flags %x", __func__, tid, toep,
 	    toep->tp_flags);
 
 	toep->tp_flags |= TP_SEND_FIN;
 	(void)t3_push_frames(so, 1);
 
 	return (0);
 }
 
 /*
  * Output entry point: push pending send-buffer data for tp's socket,
  * requesting a completion.  Always returns 0.
  */
 int
 t3_tod_output(struct toedev *tod, struct tcpcb *tp)
 {
 	struct socket *so = tp->t_inpcb->inp_socket;
 
 	(void)t3_push_frames(so, 1);
 
 	return (0);
 }
 
 /*
  * Pick the index into the adapter's MTU table for a connection.
  *
  * The MSS comes from tcp_mssopt(inc) when inc is given, otherwise from pmss;
  * a positive pmss additionally acts as an upper bound.  The result is the
  * last index whose MTU does not exceed MSS + 40.  At least one of inc/pmss
  * must be supplied.
  */
 int
 find_best_mtu_idx(struct adapter *sc, struct in_conninfo *inc, int pmss)
 {
 	unsigned short *mtus = &sc->params.mtus[0];
 	int idx, mss;
 
 	KASSERT(inc != NULL || pmss > 0,
 	    ("%s: at least one of inc/pmss must be specified", __func__));
 
 	if (inc != NULL)
 		mss = tcp_mssopt(inc);
 	else
 		mss = pmss;
 	if (pmss > 0 && pmss < mss)
 		mss = pmss;
 
 	/* Stop at the first table entry that is too large. */
 	for (idx = 0; idx < NMTUS - 1; idx++) {
 		if (mtus[idx + 1] > mss + 40)
 			break;
 	}
 
 	return (idx);
 }
 
 /*
  * Empty the toepcb's work request list, freeing each mbuf along with the
  * scatter/gather list attached to it, if any.
  */
 static inline void
 purge_wr_queue(struct toepcb *toep)
 {
 	struct ofld_hdr *oh;
 	struct mbuf *m;
 
 	for (;;) {
 		m = mbufq_dequeue(&toep->wr_list);
 		if (m == NULL)
 			break;
 
 		oh = mtod(m, struct ofld_hdr *);
 		if ((oh->flags & F_HDR_SGL) != 0)
 			sglist_free(oh->sgl);
 		m_freem(m);
 	}
 }
 
 /*
  * Give back everything cxgb(4) and the T3 hold for an offloaded connection
  * (outstanding work requests, L2T entry, TID) and free the toepcb itself.
  */
 static void
 t3_release_offload_resources(struct toepcb *toep)
 {
 	struct toedev *tod = toep->tp_tod;
 	struct tom_data *td = t3_tomdata(tod);
 
 	/*
 	 * The TOM explicitly detaches its toepcb from the system's inp before
 	 * it releases the offload resources.
 	 */
 	if (toep->tp_inp != NULL) {
 		panic("%s: inp %p still attached to toepcb %p",
 		    __func__, toep->tp_inp, toep);
 	}
 
 	/* Fewer credits available than the maximum: some WRs are queued. */
 	if (toep->tp_wr_avail != toep->tp_wr_max)
 		purge_wr_queue(toep);
 
 	if (toep->tp_l2t != NULL) {
 		l2t_release(td->l2t, toep->tp_l2t);
 		toep->tp_l2t = NULL;
 	}
 
 	/* A negative tid means there is no TID to release. */
 	if (toep->tp_tid >= 0)
 		release_tid(tod, toep->tp_tid, toep->tp_qset);
 
 	toepcb_free(toep);
 }
 
 /*
  * Compute the receive window for a socket: the free space in its receive
  * buffer, raised to at least MIN_RCV_WND and capped at MAX_RCV_WND.  The
  * receive buffer must be locked by the caller.
  */
 unsigned long
 select_rcv_wnd(struct socket *so)
 {
 	unsigned long wnd;
 
 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 
 	wnd = sbspace(&so->so_rcv);
 	if (wnd < MIN_RCV_WND) {
 		wnd = MIN_RCV_WND;
 	}
 
 	return min(wnd, MAX_RCV_WND);
 }
 
 /*
  * Choose the receive window scale: the smallest shift, up to
  * TCP_MAX_WINSHIFT, for which TCP_MAXWIN scaled by it covers sb_max (itself
  * limited to MAX_RCV_WND).
  */
 int
 select_rcv_wscale(void)
 {
 	unsigned long space = sb_max;
 	int wscale;
 
 	if (space > MAX_RCV_WND)
 		space = MAX_RCV_WND;
 
 	for (wscale = 0; wscale < TCP_MAX_WINSHIFT; wscale++) {
 		if ((TCP_MAXWIN << wscale) >= space)
 			break;
 	}
 
 	return (wscale);
 }
 
 
 /*
  * Set up the socket for TCP offload: mark both socket buffers
  * SB_NOCOALESCE, hook the toepcb onto the tcpcb, take an extra reference on
  * the inp for the toepcb, and put the toepcb on the TOM's active list.
  *
  * The inp must be write-locked by the caller.
  */
 void
 offload_socket(struct socket *so, struct toepcb *toep)
 {
 	struct inpcb *inp = sotoinpcb(so);
 	struct tcpcb *tp = intotcpcb(inp);
 	struct tom_data *td = t3_tomdata(toep->tp_tod);
 
 	INP_WLOCK_ASSERT(inp);
 
 	/* Socket buffers */
 	SOCKBUF_LOCK(&so->so_snd);
 	so_sockbuf_snd(so)->sb_flags |= SB_NOCOALESCE;
 	SOCKBUF_UNLOCK(&so->so_snd);
 	SOCKBUF_LOCK(&so->so_rcv);
 	so_sockbuf_rcv(so)->sb_flags |= SB_NOCOALESCE;
 	SOCKBUF_UNLOCK(&so->so_rcv);
 
 	/* tcpcb -> toepcb */
 	tp->tod = toep->tp_tod;
 	tp->t_toe = toep;
 	tp->t_flags |= TF_TOE;
 
 	/* toepcb -> inp, with a reference held on the inp */
 	toep->tp_inp = inp;
 	toep->tp_flags |= TP_ATTACHED;
 	in_pcbref(inp);
 
 	/* Make the toepcb visible on the active list */
 	mtx_lock(&td->toep_list_lock);
 	TAILQ_INSERT_HEAD(&td->toep_list, toep, link);
 	mtx_unlock(&td->toep_list_lock);
 }
 
 /*
  * Reverse what offload_socket() did.  This is _not_ the normal way to
  * "unoffload" a socket.  Dropping the toepcb's inp reference here must not
  * free the inp; the function panics if it does.
  *
  * The inp must be write-locked by the caller.
  */
 void
 undo_offload_socket(struct socket *so)
 {
 	struct inpcb *inp = sotoinpcb(so);
 	struct tcpcb *tp = intotcpcb(inp);
 	struct toepcb *toep = tp->t_toe;
 	struct tom_data *td = t3_tomdata(toep->tp_tod);
 
 	INP_WLOCK_ASSERT(inp);
 
 	/* Socket buffers */
 	so_sockbuf_snd(so)->sb_flags &= ~SB_NOCOALESCE;
 	so_sockbuf_rcv(so)->sb_flags &= ~SB_NOCOALESCE;
 
 	/* tcpcb -> toepcb */
 	tp->tod = NULL;
 	tp->t_toe = NULL;
 	tp->t_flags &= ~TF_TOE;
 
 	/* toepcb -> inp, and the reference that went with it */
 	toep->tp_inp = NULL;
 	toep->tp_flags &= ~TP_ATTACHED;
 	if (in_pcbrele_wlocked(inp) != 0)
 		panic("%s: inp freed.", __func__);
 
 	/* Take the toepcb off the active list */
 	mtx_lock(&td->toep_list_lock);
 	TAILQ_REMOVE(&td->toep_list, toep, link);
 	mtx_unlock(&td->toep_list_lock);
 }
 
 /*
  * Build the high word of opt0.
  *
  * Socket could be a listening socket, and we may not have a toepcb at all at
  * this time.
  *
  * so:      optional; when given, the Nagle and keepalive bits are derived
  *          from its tcpcb flags and socket options.
  * mtu_idx: MSS index to encode.
  * rscale:  receive window scale to encode.
  * e:       optional L2T entry; when given, its index and SMT index are
  *          encoded as the L2T index and tx channel.
  *
  * Returns the value in big-endian byte order.
  */
 uint32_t
 calc_opt0h(struct socket *so, int mtu_idx, int rscale, struct l2t_entry *e)
 {
 	uint32_t opt0h = F_TCAM_BYPASS | V_WND_SCALE(rscale) |
 	    V_MSS_IDX(mtu_idx);
 
 	if (so != NULL) {
 		struct inpcb *inp = sotoinpcb(so);
 		struct tcpcb *tp = intotcpcb(inp);
 		/* Keepalive is on if forced globally or requested on the socket. */
-		int keepalive = always_keepalive ||
+		int keepalive = tcp_always_keepalive ||
 		    so_options_get(so) & SO_KEEPALIVE;
 
 		/* Nagle is enabled unless TF_NODELAY is set. */
 		opt0h |= V_NAGLE((tp->t_flags & TF_NODELAY) == 0);
 		opt0h |= V_KEEP_ALIVE(keepalive != 0);
 	}
 
 	if (e != NULL)
 		opt0h |= V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx);
 
 	return (htobe32(opt0h));
 }
 
 /*
  * Build the low word of opt0: ULP mode none, the given receive buffer size
  * and, when a socket is supplied, the IP TOS of its inpcb.  Returns the
  * value in big-endian byte order.
  */
 uint32_t
 calc_opt0l(struct socket *so, int rcv_bufsize)
 {
 	uint32_t opt0l;
 
 	opt0l = V_ULP_MODE(ULP_MODE_NONE) | V_RCV_BUFSIZ(rcv_bufsize);
 
 	KASSERT(rcv_bufsize <= M_RCV_BUFSIZ,
 	    ("%s: rcv_bufsize (%d) is too high", __func__, rcv_bufsize));
 
 	/* The socket is optional because no one cares about IP TOS. */
 	if (so != NULL) {
 		opt0l |= V_TOS(INP_TOS(sotoinpcb(so)));
 	}
 
 	return (htobe32(opt0l));
 }
 
 /*
  * Map the status of an ACT_OPEN_RPL to an errno value.  Statuses without a
  * specific mapping become EIO.
  */
 static int
 act_open_rpl_status_to_errno(int status)
 {
 	int error;
 
 	switch (status) {
 	case CPL_ERR_CONN_RESET:
 		error = ECONNREFUSED;
 		break;
 	case CPL_ERR_ARP_MISS:
 		error = EHOSTUNREACH;
 		break;
 	case CPL_ERR_CONN_TIMEDOUT:
 		error = ETIMEDOUT;
 		break;
 	case CPL_ERR_CONN_EXIST:
 		log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
 		/* FALLTHROUGH */
 	case CPL_ERR_TCAM_FULL:
 		error = EAGAIN;
 		break;
 	default:
 		error = EIO;
 		break;
 	}
 
 	return (error);
 }
 
 /*
  * Tell whether a failed active open has allocated a TID.  That is the case
  * for every status except TCAM full, connection exists and ARP miss.
  */
 static inline int
 act_open_has_tid(int status)
 {
 	return !(status == CPL_ERR_TCAM_FULL ||
 	    status == CPL_ERR_CONN_EXIST ||
 	    status == CPL_ERR_ARP_MISS);
 }
 
 /*
  * Handler for ACT_OPEN_RPL CPL messages: the active open failed.
  *
  * The atid is freed, a TID release is queued if the failure status implies
  * that a TID was allocated, the failure is reported through
  * toe_connect_failed() and the toepcb is released.  The mbuf is always
  * consumed and 0 is returned.
  */
 static int
 do_act_open_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
 {
 	struct adapter *sc = qs->adap;
 	struct tom_data *td = sc->tom_softc;
 	struct toedev *tod = &td->tod;
 	struct cpl_act_open_rpl *rpl = mtod(m, void *);
 	unsigned int atid = G_TID(ntohl(rpl->atid));
 	struct toepcb *toep = lookup_atid(&td->tid_maps, atid);
 	struct inpcb *inp = toep->tp_inp;
 	int s = rpl->status, rc;
 
 	CTR3(KTR_CXGB, "%s: atid %u, status %u ", __func__, atid, s);
 
 	/* The atid is done with; mark the toepcb as having no tid. */
 	free_atid(&td->tid_maps, atid);
 	toep->tp_tid = -1;
 
 	if (act_open_has_tid(s))
 		queue_tid_release(tod, GET_TID(rpl));
 
 	rc = act_open_rpl_status_to_errno(s);
 	/*
 	 * The tcbinfo read lock is taken for every error except EAGAIN.
 	 * NOTE(review): presumably toe_connect_failed() needs it only on the
 	 * non-EAGAIN path -- confirm against its implementation.
 	 */
 	if (rc != EAGAIN)
 		INP_INFO_RLOCK(&V_tcbinfo);
 	INP_WLOCK(inp);
 	toe_connect_failed(tod, inp, rc);
 	toepcb_release(toep);	/* unlocks inp */
 	if (rc != EAGAIN)
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 
 	m_freem(m);
 	return (0);
 }
 
 /*
  * Send an active open request.
  *
  * State of affairs on entry:
  * soisconnecting (so_state |= SS_ISCONNECTING)
  * tcbinfo not locked (this has changed - used to be WLOCKed)
  * inp WLOCKed
  * tp->t_state = TCPS_SYN_SENT
  * rtalloc1, RT_UNLOCK on rt.
  *
  * Returns 0 once the request has been handed to l2t_send() successfully.
  * Every failure, whatever its cause, is reported as ENOMEM after the atid,
  * L2T entry, toepcb and mbuf acquired so far have been given back.
  */
 int
 t3_connect(struct toedev *tod, struct socket *so,
     struct rtentry *rt, struct sockaddr *nam)
 {
 	struct mbuf *m = NULL;
 	struct l2t_entry *e = NULL;
 	struct tom_data *td = t3_tomdata(tod);
 	struct adapter *sc = tod->tod_softc;
 	struct cpl_act_open_req *cpl;
 	struct inpcb *inp = sotoinpcb(so);
 	struct tcpcb *tp = intotcpcb(inp);
 	struct toepcb *toep;
 	int atid = -1, mtu_idx, rscale, cpu_idx, qset;
 	struct sockaddr *gw;
 	struct ifnet *ifp = rt->rt_ifp;
 	struct port_info *pi = ifp->if_softc;	/* XXX wrong for VLAN etc. */
 
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * Acquire resources one at a time.  The locals above start out as
 	 * NULL/-1 so that the 'failed' path knows what needs undoing.
 	 */
 	toep = toepcb_alloc(tod);
 	if (toep == NULL)
 		goto failed;
 
 	atid = alloc_atid(&td->tid_maps, toep);
 	if (atid < 0)
 		goto failed;
 
 	/* Pick one of the port's qsets at random. */
 	qset = pi->first_qset + (arc4random() % pi->nqsets);
 
 	m = M_GETHDR_OFLD(qset, CPL_PRIORITY_CONTROL, cpl);
 	if (m == NULL)
 		goto failed;
 
 	/* Resolve the gateway if the route has one, else the peer itself. */
 	gw = rt->rt_flags & RTF_GATEWAY ? rt->rt_gateway : nam;
 	e = t3_l2t_get(pi, ifp, gw);
 	if (e == NULL)
 		goto failed;
 
 	toep->tp_l2t = e;
 	toep->tp_tid = atid;	/* used to double check response */
 	toep->tp_qset = qset;
 
 	SOCKBUF_LOCK(&so->so_rcv);
 	/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
 	toep->tp_rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
 	SOCKBUF_UNLOCK(&so->so_rcv);
 
 	offload_socket(so, toep);
 
 	/*
 	 * The kernel sets request_r_scale based on sb_max whereas we need to
 	 * take hardware's MAX_RCV_WND into account too.  This is normally a
 	 * no-op as MAX_RCV_WND is much larger than the default sb_max.
 	 */
 	if (tp->t_flags & TF_REQ_SCALE)
 		rscale = tp->request_r_scale = select_rcv_wscale();
 	else
 		rscale = 0;
 	mtu_idx = find_best_mtu_idx(sc, &inp->inp_inc, 0);
 	cpu_idx = sc->rrss_map[qset];
 
 	/* Fill in the CPL_ACT_OPEN_REQ. */
 	cpl->wr.wrh_hi = htobe32(V_WR_OP(FW_WROPCODE_FORWARD));
 	cpl->wr.wrh_lo = 0;
 	OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid)); 
 	inp_4tuple_get(inp, &cpl->local_ip, &cpl->local_port, &cpl->peer_ip,
 	    &cpl->peer_port);
 	cpl->opt0h = calc_opt0h(so, mtu_idx, rscale, e);
 	cpl->opt0l = calc_opt0l(so, toep->tp_rx_credits);
 	cpl->params = 0;
 	cpl->opt2 = calc_opt2(cpu_idx);
 
 	CTR5(KTR_CXGB, "%s: atid %u (%s), toep %p, inp %p", __func__,
 	    toep->tp_tid, tcpstates[tp->t_state], toep, inp);
 
 	if (l2t_send(sc, m, e) == 0)
 		return (0);
 
 	/* The send failed after the socket was set up for offload. */
 	undo_offload_socket(so);
 
 failed:
 	CTR5(KTR_CXGB, "%s: FAILED, atid %d, toep %p, l2te %p, mbuf %p",
 	    __func__, atid, toep, e, m);
 
 	if (atid >= 0)
 		free_atid(&td->tid_maps, atid);
 
 	if (e)
 		l2t_release(td->l2t, e);
 
 	if (toep)
 		toepcb_free(toep);
 
 	/*
 	 * NOTE(review): m is freed here even when l2t_send() was given it and
 	 * failed; confirm l2t_send() leaves ownership with the caller on error.
 	 */
 	m_freem(m);
 
 	return (ENOMEM);
 }
 
 /*
  * Send an ABORT_REQ message.  Cannot fail.  This routine makes sure we do not
  * send multiple ABORT_REQs for the same connection and also that we do not try
  * to send a message after the connection has closed.
  *
  * Requires the tcbinfo read lock and the inp write lock (both asserted).
  * The socket's send buffer and the toepcb's WR queue are flushed before the
  * request is built.
  */
 static void
 send_reset(struct toepcb *toep)
 {
 
 	struct cpl_abort_req *req;
 	unsigned int tid = toep->tp_tid;
 	struct inpcb *inp = toep->tp_inp;
 	struct socket *so = inp->inp_socket;
 	struct tcpcb *tp = intotcpcb(inp);
 	struct toedev *tod = toep->tp_tod;
 	struct adapter *sc = tod->tod_softc;
 	struct mbuf *m;
 
 	INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(inp);
 
 	CTR4(KTR_CXGB, "%s: tid %d, toep %p (%x)", __func__, tid, toep,
 	    toep->tp_flags);
 
 	/* An abort is already in progress; don't send another. */
 	if (toep->tp_flags & TP_ABORT_SHUTDOWN)
 		return;
 
 	toep->tp_flags |= (TP_ABORT_RPL_PENDING | TP_ABORT_SHUTDOWN);
 
 	/* Purge the send queue */
 	sbflush(so_sockbuf_snd(so));
 	purge_wr_queue(toep);
 
 	m = M_GETHDR_OFLD(toep->tp_qset, CPL_PRIORITY_DATA, req);
 	if (m == NULL)
 		CXGB_UNIMPLEMENTED();
 
 	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
 	req->wr.wrh_lo = htonl(V_WR_TID(tid));
 	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
 	req->rsvd0 = htonl(tp->snd_nxt);
 	/* Set when no TX_DATA WR has been sent on this connection yet. */
 	req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
 	req->cmd = CPL_ABORT_SEND_RST;
 
 	/* In SYN_SENT the request is parked on out_of_order_queue instead. */
 	if (tp->t_state == TCPS_SYN_SENT)
 		(void )mbufq_enqueue(&toep->out_of_order_queue, m); /* defer */
 	else
 		l2t_send(sc, m, toep->tp_l2t);
 }
 
 /*
  * Entry point taking a tcpcb: send a reset on the connection's toepcb.
  * The toedev argument is not used.  Always returns 0.
  */
 int
 t3_send_rst(struct toedev *tod __unused, struct tcpcb *tp)
 {
 	struct toepcb *toep = tp->t_toe;
 
 	send_reset(toep);
 
 	return (0);
 }
 
 /*
  * Handler for RX_DATA CPL messages.
  *
  * Strips the CPL header and appends the payload to the socket's receive
  * buffer, updating rcv_nxt/rcv_wnd and the toepcb's enqueued byte count.
  * The payload is discarded if the toepcb has no inp or the inp has been
  * dropped or is in timewait.  Data that arrives after the socket can no
  * longer receive causes the connection to be dropped with ECONNRESET.
  *
  * The mbuf is always consumed; returns 0.
  */
 static int
 do_rx_data(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
 {
 	struct adapter *sc = qs->adap;
 	struct tom_data *td = sc->tom_softc;
 	struct cpl_rx_data *hdr = mtod(m, void *);
 	unsigned int tid = GET_TID(hdr);
 	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
 	struct inpcb *inp = toep->tp_inp;
 	struct tcpcb *tp;
 	struct socket *so;
 	struct sockbuf *so_rcv;	
 
 	/* Advance over CPL */
 	m_adj(m, sizeof(*hdr));
 
 	/* XXX: revisit.  This comes from the T4 TOM */
 	if (__predict_false(inp == NULL)) {
 		/*
 		 * do_pass_establish failed and must be attempting to abort the
 		 * connection.  Meanwhile, the T4 has sent us data for such a
 		 * connection.
 		 */
 #ifdef notyet
 		KASSERT(toepcb_flag(toep, TPF_ABORT_SHUTDOWN),
 		    ("%s: inp NULL and tid isn't being aborted", __func__));
 #endif
 		m_freem(m);
 		return (0);
 	}
 
 	INP_WLOCK(inp);
 	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) {
 		CTR4(KTR_CXGB, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
 		    __func__, tid, m->m_pkthdr.len, inp->inp_flags);
 		INP_WUNLOCK(inp);
 		m_freem(m);
 		return (0);
 	}
 
 	/* hdr is still read below, after m_adj() moved past it. */
 	if (__predict_false(hdr->dack_mode != toep->tp_delack_mode))
 		toep->tp_delack_mode = hdr->dack_mode;
 
 	tp = intotcpcb(inp);
 
 #ifdef INVARIANTS
 	if (__predict_false(tp->rcv_nxt != be32toh(hdr->seq))) {
 		log(LOG_ERR,
 		    "%s: unexpected seq# %x for TID %u, rcv_nxt %x\n",
 		    __func__, be32toh(hdr->seq), toep->tp_tid, tp->rcv_nxt);
 	}
 #endif
 	tp->rcv_nxt += m->m_pkthdr.len;
 	KASSERT(tp->rcv_wnd >= m->m_pkthdr.len,
 	    ("%s: negative window size", __func__));
 	tp->rcv_wnd -= m->m_pkthdr.len;
 	tp->t_rcvtime = ticks;
 
 	so  = inp->inp_socket;
 	so_rcv = &so->so_rcv;
 	SOCKBUF_LOCK(so_rcv);
 
 	if (__predict_false(so_rcv->sb_state & SBS_CANTRCVMORE)) {
 		CTR3(KTR_CXGB, "%s: tid %u, excess rx (%d bytes)",
 		    __func__, tid, m->m_pkthdr.len);
 		SOCKBUF_UNLOCK(so_rcv);
 		INP_WUNLOCK(inp);
 
 		/*
 		 * The inp lock is dropped and retaken so that the tcbinfo lock
 		 * can be acquired first.
 		 * NOTE(review): tp was fetched before the inp lock was dropped
 		 * and is not revalidated after it is retaken -- confirm this is
 		 * safe.
 		 */
 		INP_INFO_RLOCK(&V_tcbinfo);
 		INP_WLOCK(inp);
 		tp = tcp_drop(tp, ECONNRESET);
 		/* Unlock only if tcp_drop() gave a tcpcb back. */
 		if (tp)
 			INP_WUNLOCK(inp);
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 
 		m_freem(m);
 		return (0);
 	}
 
 	/* receive buffer autosize */
 	if (so_rcv->sb_flags & SB_AUTOSIZE &&
 	    V_tcp_do_autorcvbuf &&
 	    so_rcv->sb_hiwat < V_tcp_autorcvbuf_max &&
 	    (m->m_pkthdr.len > (sbspace(so_rcv) / 8 * 7) || tp->rcv_wnd < 32768)) {
 		unsigned int hiwat = so_rcv->sb_hiwat;
 		unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc,
 		    V_tcp_autorcvbuf_max);
 
 		/* Stop autosizing on failure; else credit the added space. */
 		if (!sbreserve_locked(so_rcv, newsize, so, NULL))
 			so_rcv->sb_flags &= ~SB_AUTOSIZE;
 		else
 			toep->tp_rx_credits += newsize - hiwat;
 	}
 
 	toep->tp_enqueued += m->m_pkthdr.len;
 	sbappendstream_locked(so_rcv, m, 0);
 	/* The assertion below shows sorwakeup_locked() drops the sockbuf lock. */
 	sorwakeup_locked(so);
 	SOCKBUF_UNLOCK_ASSERT(so_rcv);
 
 	INP_WUNLOCK(inp);
 	return (0);
 }
 
 /*
  * Handler for PEER_CLOSE CPL messages.
  *
  * Unless an abort reply is pending, the socket is marked as unable to
  * receive more, rcv_nxt is advanced by one and the TCP state is moved along
  * (SYN_RECEIVED/ESTABLISHED -> CLOSE_WAIT, FIN_WAIT_1 -> CLOSING,
  * FIN_WAIT_2 -> timewait, where the toepcb is also released).
  *
  * The mbuf is always consumed; returns 0.
  */
 static int
 do_peer_close(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
 {
 	struct adapter *sc = qs->adap;
 	struct tom_data *td = sc->tom_softc;
 	const struct cpl_peer_close *hdr = mtod(m, void *);
 	unsigned int tid = GET_TID(hdr);
 	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
 	struct inpcb *inp = toep->tp_inp;
 	struct tcpcb *tp;
 	struct socket *so;
 
 	INP_INFO_RLOCK(&V_tcbinfo);
 	INP_WLOCK(inp);
 	tp = intotcpcb(inp);
 
 	CTR5(KTR_CXGB, "%s: tid %u (%s), toep_flags 0x%x, inp %p", __func__,
 	    tid, tp ? tcpstates[tp->t_state] : "no tp" , toep->tp_flags, inp);
 
 	/* An abort is underway; this CPL is ignored. */
 	if (toep->tp_flags & TP_ABORT_RPL_PENDING)
 		goto done;
 
 	so = inp_inpcbtosocket(inp);
 
 	socantrcvmore(so);
 	tp->rcv_nxt++;
 
 	switch (tp->t_state) {
 	case TCPS_SYN_RECEIVED:
 		tp->t_starttime = ticks;
 		/* FALLTHROUGH */ 
 	case TCPS_ESTABLISHED:
 		tp->t_state = TCPS_CLOSE_WAIT;
 		break;
 	case TCPS_FIN_WAIT_1:
 		tp->t_state = TCPS_CLOSING;
 		break;
 	case TCPS_FIN_WAIT_2:
 		/* The assertion below shows tcp_twstart() returns with inp unlocked. */
 		tcp_twstart(tp);
 		INP_UNLOCK_ASSERT(inp);	/* safe, we have a ref on the  inp */
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 
 		/* toepcb_release() is entered with the inp write-locked. */
 		INP_WLOCK(inp);
 		toepcb_release(toep);	/* no more CPLs expected */
 
 		m_freem(m);
 		return (0);
 	default:
 		log(LOG_ERR, "%s: TID %u received PEER_CLOSE in bad state %d\n",
 		    __func__, toep->tp_tid, tp->t_state);
 	}
 
 done:
 	INP_WUNLOCK(inp);
 	INP_INFO_RUNLOCK(&V_tcbinfo);
 
 	m_freem(m);
 	return (0);
 }
 
 /*
  * Handler for CLOSE_CON_RPL CPL messages.  peer ACK to our FIN received.
  *
  * Unless an abort reply is pending, snd_una is set from the reply (excluding
  * the FIN) and the TCP state is moved along: CLOSING -> timewait,
  * LAST_ACK -> closed, FIN_WAIT_1 -> FIN_WAIT_2.  In the first two cases the
  * toepcb is released as well.
  *
  * The mbuf is always consumed; returns 0.
  */
 static int
 do_close_con_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
 {
 	struct adapter *sc = qs->adap;
 	struct tom_data *td = sc->tom_softc;
 	const struct cpl_close_con_rpl *rpl = mtod(m, void *);
 	unsigned int tid = GET_TID(rpl);
 	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
 	struct inpcb *inp = toep->tp_inp;
 	struct tcpcb *tp;
 	struct socket *so;
 
 	INP_INFO_RLOCK(&V_tcbinfo);
 	INP_WLOCK(inp);
 	tp = intotcpcb(inp);
 
 	CTR4(KTR_CXGB, "%s: tid %u (%s), toep_flags 0x%x", __func__, tid,
 	    tp ? tcpstates[tp->t_state] : "no tp", toep->tp_flags);
 
 	/* An abort is underway; this CPL is ignored. */
 	if ((toep->tp_flags & TP_ABORT_RPL_PENDING))
 		goto done;
 
 	so = inp_inpcbtosocket(inp);
 	tp->snd_una = ntohl(rpl->snd_nxt) - 1;  /* exclude FIN */
 
 	switch (tp->t_state) {
 	case TCPS_CLOSING:
 		tcp_twstart(tp);
 		/* Shared exit for CLOSING and LAST_ACK; entered with inp unlocked. */
 release:
 		INP_UNLOCK_ASSERT(inp);	/* safe, we have a ref on the  inp */
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 
 		/* toepcb_release() is entered with the inp write-locked. */
 		INP_WLOCK(inp);
 		toepcb_release(toep);	/* no more CPLs expected */
 	
 		m_freem(m);
 		return (0);
 	case TCPS_LAST_ACK:
 		/* Unlock here only if tcp_close() gave a tcpcb back. */
 		if (tcp_close(tp))
 			INP_WUNLOCK(inp);
 		goto release;
 
 	case TCPS_FIN_WAIT_1:
 		if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
 			soisdisconnected(so);
 		tp->t_state = TCPS_FIN_WAIT_2;
 		break;
 	default:
 		log(LOG_ERR,
 		    "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
 		    __func__, toep->tp_tid, tp->t_state);
 	}
 
 done:
 	INP_WUNLOCK(inp);
 	INP_INFO_RUNLOCK(&V_tcbinfo);
 
 	m_freem(m);
 	return (0);
 }
 
 /*
  * Handler for SMT_WRITE_RPL CPL messages.  Any status other than
  * CPL_ERR_NONE is logged; the mbuf is freed and 0 is returned.
  */
 static int
 do_smt_write_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
 {
 	struct cpl_smt_write_rpl *reply;
 
 	reply = mtod(m, void *);
 	if (reply->status != CPL_ERR_NONE) {
 		log(LOG_ERR,
 		    "Unexpected SMT_WRITE_RPL status %u for entry %u\n",
 		    reply->status, GET_TID(reply));
 	}
 
 	m_freem(m);
 
 	return (0);
 }
 
 /*
  * Handler for SET_TCB_RPL CPL messages.  Any status other than CPL_ERR_NONE
  * is logged; the mbuf is freed and 0 is returned.
  */
 static int
 do_set_tcb_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
 {
 	struct cpl_set_tcb_rpl *reply;
 
 	reply = mtod(m, void *);
 	if (reply->status != CPL_ERR_NONE) {
 		log(LOG_ERR, "Unexpected SET_TCB_RPL status %u for tid %u\n",
 		    reply->status, GET_TID(reply));
 	}
 
 	m_freem(m);
 
 	return (0);
 }
 
 /*
  * Handle an ABORT_RPL_RSS CPL message.
  *
  * Replies for synq entries are passed on to do_abort_rpl_synqe().  For a
  * regular connection with an abort reply pending, the first reply seen is
  * only recorded (TP_ABORT_RPL_RCVD); the second one clears both abort-reply
  * flags and releases the toepcb.
  *
  * The inp lock taken here is released on every path: directly, or by
  * toepcb_release().  The mbuf is consumed; returns 0 (or whatever the synqe
  * handler returns).
  */
 static int
 do_abort_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
 {
 	struct adapter *sc = qs->adap;
 	struct tom_data *td = sc->tom_softc;
 	const struct cpl_abort_rpl_rss *rpl = mtod(m, void *);
 	unsigned int tid = GET_TID(rpl);
 	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
 	struct inpcb *inp;
 
 	/*
 	 * Ignore replies to post-close aborts indicating that the abort was
 	 * requested too late.  These connections are terminated when we get
 	 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
 	 * arrives the TID is either no longer used or it has been recycled.
 	 */
 	if (rpl->status == CPL_ERR_ABORT_FAILED) {
 		m_freem(m);
 		return (0);
 	}
 
 	if (toep->tp_flags & TP_IS_A_SYNQ_ENTRY)
 		return (do_abort_rpl_synqe(qs, r, m));
 
 	CTR4(KTR_CXGB, "%s: tid %d, toep %p, status %d", __func__, tid, toep,
 	    rpl->status);
 
 	inp = toep->tp_inp;
 	INP_WLOCK(inp);
 
 	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
 		if (!(toep->tp_flags & TP_ABORT_RPL_RCVD)) {
 			/* First reply: just remember that it was seen. */
 			toep->tp_flags |= TP_ABORT_RPL_RCVD;
 			INP_WUNLOCK(inp);
 		} else {
 			/*
 			 * Second reply: the abort is complete.  Clear only the
 			 * two abort-reply flags; the remaining flags (e.g.
 			 * TP_ATTACHED) are still needed by toepcb_release().
 			 */
 			toep->tp_flags &= ~TP_ABORT_RPL_RCVD;
 			toep->tp_flags &= ~TP_ABORT_RPL_PENDING;
 			toepcb_release(toep);	/* no more CPLs expected */
 		}
 	} else {
 		/* Unsolicited reply: nothing to do, but don't leak the lock. */
 		INP_WUNLOCK(inp);
 	}
 
 	m_freem(m);
 	return (0);
 }
 
 /*
  * Convert the status code of an ABORT_REQ into a FreeBSD error code.
  */
 static int
 abort_status_to_errno(struct tcpcb *tp, int abort_reason)
 {
 	switch (abort_reason) {
 	case CPL_ERR_BAD_SYN:
 	case CPL_ERR_CONN_RESET:
 		return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
 	case CPL_ERR_XMIT_TIMEDOUT:
 	case CPL_ERR_PERSIST_TIMEDOUT:
 	case CPL_ERR_FINWAIT2_TIMEDOUT:
 	case CPL_ERR_KEEPALIVE_TIMEDOUT:
 		return (ETIMEDOUT);
 	default:
 		return (EIO);
 	}
 }
 
 /*
  * Tell whether the status of an ABORT_REQ_RSS message marks it as negative
  * advice (retransmit or persist) rather than a real abort.
  */
 static inline int
 is_neg_adv_abort(unsigned int status)
 {
 	return !(status != CPL_ERR_RTX_NEG_ADVICE &&
 	    status != CPL_ERR_PERSIST_NEG_ADVICE);
 }
 
 void
 send_abort_rpl(struct toedev *tod, int tid, int qset)
 {
 	struct mbuf *reply;
 	struct cpl_abort_rpl *rpl;
 	struct adapter *sc = tod->tod_softc;
 
 	reply = M_GETHDR_OFLD(qset, CPL_PRIORITY_DATA, rpl);
 	if (!reply)
 		CXGB_UNIMPLEMENTED();
 
 	rpl->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
 	rpl->wr.wrh_lo = htonl(V_WR_TID(tid));
 	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
 	rpl->cmd = CPL_ABORT_NO_RST;
 
 	t3_offload_tx(sc, reply);
 }
 
 /*
  * Handle an ABORT_REQ_RSS CPL message.  If we're waiting for an ABORT_RPL we
  * ignore this request except that we need to reply to it.
  *
  * Negative advice is dropped and synq entries are handed to
  * do_abort_req_synqe().  For a regular connection the first request seen is
  * only recorded (TP_ABORT_REQ_RCVD); the work is done when the second one
  * arrives.  The mbuf is consumed; returns 0 (or whatever the synqe handler
  * returns).
  */
 static int
 do_abort_req(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
 {
 	struct adapter *sc = qs->adap;
 	struct tom_data *td = sc->tom_softc;
 	struct toedev *tod = &td->tod;
 	const struct cpl_abort_req_rss *req = mtod(m, void *);
 	unsigned int tid = GET_TID(req);
 	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct socket *so;
 	/* Saved now: toep may be released before the reply is sent below. */
 	int qset = toep->tp_qset;
 
 	if (is_neg_adv_abort(req->status)) {
 		CTR4(KTR_CXGB, "%s: negative advice %d for tid %u (%x)",
 		    __func__, req->status, tid, toep->tp_flags);
 		m_freem(m);
 		return (0);
 	}
 
 	if (toep->tp_flags & TP_IS_A_SYNQ_ENTRY)
 		return (do_abort_req_synqe(qs, r, m));
 
 	inp = toep->tp_inp;
 	INP_INFO_RLOCK(&V_tcbinfo);	/* for tcp_close */
 	INP_WLOCK(inp);
 
 	tp = intotcpcb(inp);
 	so = inp->inp_socket;
 
 	CTR6(KTR_CXGB, "%s: tid %u (%s), toep %p (%x), status %d",
 	    __func__, tid, tcpstates[tp->t_state], toep, toep->tp_flags,
 	    req->status);
 
 	/* First request for this toep: note it and wait for the second. */
 	if (!(toep->tp_flags & TP_ABORT_REQ_RCVD)) {
 		toep->tp_flags |= TP_ABORT_REQ_RCVD;
 		toep->tp_flags |= TP_ABORT_SHUTDOWN;
 		INP_WUNLOCK(inp);
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		m_freem(m);
 		return (0);
 	}
 	toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
 
 	/*
 	 * If we'd sent a reset on this toep, we'll ignore this and clean up in
 	 * the T3's reply to our reset instead.
 	 */
 	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
 		toep->tp_flags |= TP_ABORT_RPL_SENT;
 		INP_WUNLOCK(inp);
 	} else {
 		so_error_set(so, abort_status_to_errno(tp, req->status));
 		tp = tcp_close(tp);
 		/* toepcb_release() is entered with the inp write-locked. */
 		if (tp == NULL)
 			INP_WLOCK(inp);	/* re-acquire */
 		toepcb_release(toep);	/* no more CPLs expected */
 	}
 	INP_INFO_RUNLOCK(&V_tcbinfo);
 
 	send_abort_rpl(tod, tid, qset);
 	m_freem(m);
 	return (0);
 }
 
 /*
  * Apply the TCP options reported by the hardware (tcpopt, host byte order) to
  * the tcpcb: maximum segment size, timestamps, SACK and window scaling.
  */
 static void
 assign_rxopt(struct tcpcb *tp, uint16_t tcpopt)
 {
 	struct toepcb *toep = tp->t_toe;
 	struct adapter *sc = toep->tp_tod->tod_softc;
 
 	/* MSS comes from the adapter's MTU table, less 40 bytes. */
 	tp->t_maxseg = sc->params.mtus[G_TCPOPT_MSS(tcpopt)] - 40;
 
 	if (G_TCPOPT_TSTAMP(tcpopt)) {
 		/* TF_REQ_TSTMP is forcibly set along with TF_RCVD_TSTMP. */
 		tp->t_flags |= TF_RCVD_TSTMP | TF_REQ_TSTMP;
 		tp->ts_recent = 0;		/* XXX */
 		tp->ts_recent_age = tcp_ts_getticks();
 	}
 
 	if (G_TCPOPT_SACK(tcpopt))
 		tp->t_flags |= TF_SACK_PERMIT;
 	else
 		tp->t_flags &= ~TF_SACK_PERMIT;
 
 	if (G_TCPOPT_WSCALE_OK(tcpopt))
 		tp->t_flags |= TF_RCVD_SCALE;
 
 	/* Window scaling is in effect only if both sides asked for it. */
 	if ((tp->t_flags & TF_RCVD_SCALE) != 0 &&
 	    (tp->t_flags & TF_REQ_SCALE) != 0) {
 		tp->rcv_scale = tp->request_r_scale;
 		tp->snd_scale = G_TCPOPT_SND_WSCALE(tcpopt);
 	}
 }
 
 /*
  * Move the connection behind so to ESTABLISHED using the values reported by
  * the hardware.  cpl_iss, cpl_irs and cpl_tcpopt are big-endian; the ISS and
  * IRS are from after the exchange of SYNs and are off by 1.  The inp must be
  * write-locked by the caller.
  */
 void
 make_established(struct socket *so, uint32_t cpl_iss, uint32_t cpl_irs,
     uint16_t cpl_tcpopt)
 {
 	struct inpcb *inp = sotoinpcb(so);
 	struct tcpcb *tp = intotcpcb(inp);
 	struct toepcb *toep = tp->t_toe;
 	uint32_t iss, irs;
 	uint16_t tcpopt;
 	long wnd;
 
 	INP_WLOCK_ASSERT(inp);
 
 	iss = be32toh(cpl_iss) - 1;	/* true ISS */
 	irs = be32toh(cpl_irs) - 1;	/* true IRS */
 	tcpopt = be16toh(cpl_tcpopt);
 
 	tp->t_state = TCPS_ESTABLISHED;
 	tp->t_starttime = ticks;
 	TCPSTAT_INC(tcps_connects);
 
 	CTR4(KTR_CXGB, "%s tid %u, toep %p, inp %p", tcpstates[tp->t_state],
 	    toep->tp_tid, toep, inp);
 
 	/* Receive side: sequence space and the window already advertised. */
 	tp->irs = irs;
 	tcp_rcvseqinit(tp);
 	tp->rcv_wnd = toep->tp_rx_credits << 10;
 	tp->rcv_adv += tp->rcv_wnd;
 	tp->last_ack_sent = tp->rcv_nxt;
 
 	/*
 	 * Whatever part of the receive window could not be conveyed via opt0
 	 * is kept in tp_rx_credits and goes out with the next credit update.
 	 */
 	SOCKBUF_LOCK(&so->so_rcv);
 	wnd = select_rcv_wnd(so);
 	SOCKBUF_UNLOCK(&so->so_rcv);
 	toep->tp_rx_credits = wnd - tp->rcv_wnd;
 
 	/* Send side: everything starts right after the SYN. */
 	tp->iss = iss;
 	tcp_sendseqinit(tp);
 	tp->snd_una = iss + 1;
 	tp->snd_nxt = iss + 1;
 	tp->snd_max = iss + 1;
 
 	assign_rxopt(tp, tcpopt);
 	soisconnected(so);
 }
 
 /*
  * Drain the toep's out-of-order queue: stamp each deferred CPL message with
  * the toep's current TID and send it to the TOE.
  */
 static void
 fixup_and_send_ofo(struct toepcb *toep)
 {
 	struct adapter *sc = toep->tp_tod->tod_softc;
 	const unsigned int tid = toep->tp_tid;
 	struct cpl_close_con_req *cpl;
 	struct ofld_hdr *oh;
 	struct mbuf *m;
 
 	inp_lock_assert(toep->tp_inp);
 
 	for (;;) {
 		m = mbufq_dequeue(&toep->out_of_order_queue);
 		if (m == NULL)
 			break;
 
 		/*
 		 * Different message types may be queued, but the fields
 		 * written here are common to all of them, so any one CPL
 		 * layout works as a view of the message after the header.
 		 */
 		oh = mtod(m, void *);
 		cpl = (void *)(oh + 1);
 
 		cpl->wr.wrh_lo = htonl(V_WR_TID(tid));
 		OPCODE_TID(cpl) = htonl(MK_OPCODE_TID(cpl->ot.opcode, tid));
 		t3_offload_tx(sc, m);
 	}
 }
 
 /*
  * Process a CPL_ACT_ESTABLISH message.
  *
  * The toep is looked up by the atid carried in the message; the atid is then
  * freed and the toep re-registered under the tid.  If the inp has
  * already been dropped a reset is sent instead of establishing the
  * connection.  Frees the mbuf and returns 0.
  */
 static int
 do_act_establish(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
 {
 	struct adapter *sc = qs->adap;
 	struct tom_data *td = sc->tom_softc;
 	struct cpl_act_establish *req = mtod(m, void *);
 	unsigned int tid = GET_TID(req);
 	unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
 	struct toepcb *toep = lookup_atid(&td->tid_maps, atid);
 	struct inpcb *inp = toep->tp_inp;
 	struct tcpcb *tp;
 	struct socket *so; 
 
 	CTR3(KTR_CXGB, "%s: atid %u, tid %u", __func__, atid, tid);
 
 	free_atid(&td->tid_maps, atid);
 
 	INP_WLOCK(inp);
 	tp = intotcpcb(inp);
 
 	KASSERT(toep->tp_qset == qs->idx,
 	    ("%s qset mismatch %d %d", __func__, toep->tp_qset, qs->idx));
 	/* Until now tp_tid has been holding the atid. */
 	KASSERT(toep->tp_tid == atid,
 	    ("%s atid mismatch %d %d", __func__, toep->tp_tid, atid));
 
 	toep->tp_tid = tid;
 	insert_tid(td, toep, tid);
 
 	if (inp->inp_flags & INP_DROPPED) {
 		/* socket closed by the kernel before hw told us it connected */
 		send_reset(toep);
 		goto done;
 	}
 
 	KASSERT(tp->t_state == TCPS_SYN_SENT,
 	    ("TID %u expected TCPS_SYN_SENT, found %d.", tid, tp->t_state));
 
 	so = inp->inp_socket;
 	/* The CPL fields are passed as-is; make_established() byte-swaps. */
 	make_established(so, req->snd_isn, req->rcv_isn, req->tcp_opt);
 
 	/*
 	 * Now that we finally have a TID send any CPL messages that we had to
 	 * defer for lack of a TID.
 	 */
 	if (mbufq_len(&toep->out_of_order_queue))
 		fixup_and_send_ofo(toep);
 
 done:
 	INP_WUNLOCK(inp);
 	m_freem(m);
 	return (0);
 }
 
 /*
  * Process an acknowledgment of WR completion.  Advance snd_una and send the
  * next batch of work requests from the write queue.
  *
  * The credits returned by the message are added back to tp_wr_avail and the
  * acknowledged work requests are dequeued and freed.  The payload bytes they
  * carried are then dropped from the socket's send buffer.  Takes and releases
  * the inp write lock and frees the mbuf.
  */
 static void
 wr_ack(struct toepcb *toep, struct mbuf *m)
 {
 	struct inpcb *inp = toep->tp_inp;
 	struct tcpcb *tp;
 	struct cpl_wr_ack *hdr = mtod(m, void *);
 	struct socket *so;
 	unsigned int credits = ntohs(hdr->credits);
 	u32 snd_una = ntohl(hdr->snd_una);
 	int bytes = 0;
 	struct sockbuf *snd;
 	struct mbuf *p;
 	struct ofld_hdr *oh;
 
 	inp_wlock(inp);
 	tp = intotcpcb(inp);
 	so = inp->inp_socket;
 	toep->tp_wr_avail += credits;
 	/* Clamp the unacked count to the credits still outstanding. */
 	if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail)
 		toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail;
 
 	/* Retire pending WRs until the returned credits are used up. */
 	while (credits) {
 		p = peek_wr(toep);
 
 		if (__predict_false(!p)) {
 			CTR5(KTR_CXGB, "%s: %u extra WR_ACK credits, "
 			    "tid %u, state %u, wr_avail %u", __func__, credits,
 			    toep->tp_tid, tp->t_state, toep->tp_wr_avail);
 
 			log(LOG_ERR, "%u WR_ACK credits for TID %u with "
 			    "nothing pending, state %u wr_avail=%u\n",
 			    credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail);
 			break;
 		}
 
 		oh = mtod(p, struct ofld_hdr *);
 
 		KASSERT(credits >= G_HDR_NDESC(oh->flags),
 		    ("%s: partial credits?  %d %d", __func__, credits,
 		    G_HDR_NDESC(oh->flags)));
 
 		dequeue_wr(toep);
 		credits -= G_HDR_NDESC(oh->flags);
 		bytes += oh->plen;
 
 		if (oh->flags & F_HDR_SGL)
 			sglist_free(oh->sgl);
 		m_freem(p);
 	}
 
 	/* An ack that is older than snd_una is not applied. */
 	if (__predict_false(SEQ_LT(snd_una, tp->snd_una)))
 		goto out_free;
 
 	if (tp->snd_una != snd_una) {
 		tp->snd_una = snd_una;
 		tp->ts_recent_age = tcp_ts_getticks();
 		/* Everything sent has been acked. */
 		if (tp->snd_una == tp->snd_nxt)
 			toep->tp_flags &= ~TP_TX_WAIT_IDLE;
 	}
 
 	snd = so_sockbuf_snd(so);
 	if (bytes) {
 		/*
 		 * NOTE(review): there is no matching SOCKBUF_UNLOCK here;
 		 * presumably so_sowwakeup_locked() drops the lock -- confirm.
 		 */
 		SOCKBUF_LOCK(snd);
 		sbdrop_locked(snd, bytes);
 		so_sowwakeup_locked(so);
 	}
 
 	/* Data remains in the send buffer beyond sb_sndptroff: push it. */
 	if (snd->sb_sndptroff < sbused(snd))
 		t3_push_frames(so, 0);
 
 out_free:
 	inp_wunlock(tp->t_inpcb);
 	m_freem(m);
 }
 
 /*
  * Handler for TX_DATA_ACK CPL messages.
  */
 static int
 do_wr_ack(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
 {
 	struct adapter *sc = qs->adap;
 	struct tom_data *td = sc->tom_softc;
 	struct cpl_wr_ack *hdr = mtod(m, void *);
 	unsigned int tid = GET_TID(hdr);
 	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
 
 	/* XXX bad race */
 	if (toep)
 		wr_ack(toep, m);
 
 	return (0);
 }
 
 /*
  * Register this file's CPL message handlers with the adapter.
  */
 void
 t3_init_cpl_io(struct adapter *sc)
 {
 	static const struct {
 		int opcode;
 		int (*handler)(struct sge_qset *, struct rsp_desc *,
 		    struct mbuf *);
 	} handlers[] = {
 		{ CPL_ACT_ESTABLISH,	do_act_establish },
 		{ CPL_ACT_OPEN_RPL,	do_act_open_rpl },
 		{ CPL_RX_URG_NOTIFY,	do_rx_urg_notify },
 		{ CPL_RX_DATA,		do_rx_data },
 		{ CPL_TX_DMA_ACK,	do_wr_ack },
 		{ CPL_PEER_CLOSE,	do_peer_close },
 		{ CPL_ABORT_REQ_RSS,	do_abort_req },
 		{ CPL_ABORT_RPL_RSS,	do_abort_rpl },
 		{ CPL_CLOSE_CON_RPL,	do_close_con_rpl },
 		{ CPL_SMT_WRITE_RPL,	do_smt_write_rpl },
 		{ CPL_SET_TCB_RPL,	do_set_tcb_rpl },
 	};
 	unsigned int i;
 
 	/* Registration order matches the order of the table. */
 	for (i = 0; i < sizeof(handlers) / sizeof(handlers[0]); i++) {
 		t3_register_cpl_handler(sc, handlers[i].opcode,
 		    handlers[i].handler);
 	}
 }
 #endif
Index: stable/11/sys/dev/cxgbe/tom/t4_tom.c
===================================================================
--- stable/11/sys/dev/cxgbe/tom/t4_tom.c	(revision 330302)
+++ stable/11/sys/dev/cxgbe/tom/t4_tom.c	(revision 330303)
@@ -1,1272 +1,1271 @@
 /*-
  * Copyright (c) 2012 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: Navdeep Parhar <np@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/limits.h>
 #include <sys/module.h>
 #include <sys/protosw.h>
 #include <sys/domain.h>
 #include <sys/refcount.h>
 #include <sys/rmlock.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/taskqueue.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #include <netinet6/scope6_var.h>
 #define TCPSTATES
 #include <netinet/tcp_fsm.h>
+#include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/toecore.h>
 
 #ifdef TCP_OFFLOAD
 #include "common/common.h"
 #include "common/t4_msg.h"
 #include "common/t4_regs.h"
 #include "common/t4_regs_values.h"
 #include "common/t4_tcb.h"
 #include "tom/t4_tom_l2t.h"
 #include "tom/t4_tom.h"
 
 /*
  * Protocol switch installed on offloaded IPv4 sockets (see offload_socket).
  * toe_usrreqs is presumably the usrreqs table it points at -- confirm where
  * the two are initialized.
  */
 static struct protosw toe_protosw;
 static struct pr_usrreqs toe_usrreqs;
 
 /* Same, for offloaded IPv6 sockets. */
 static struct protosw toe6_protosw;
 static struct pr_usrreqs toe6_usrreqs;
 
 /* Module ops */
 static int t4_tom_mod_load(void);
 static int t4_tom_mod_unload(void);
 static int t4_tom_modevent(module_t, int, void *);
 
 /* ULD ops and helpers */
 static int t4_tom_activate(struct adapter *);
 static int t4_tom_deactivate(struct adapter *);
 
 /* Describes this module (ULD_TOM) and its activate/deactivate entry points. */
 static struct uld_info tom_uld_info = {
 	.uld_id = ULD_TOM,
 	.activate = t4_tom_activate,
 	.deactivate = t4_tom_deactivate,
 };
 
 /* Forward declarations for the file-local helpers. */
 static void queue_tid_release(struct adapter *, int);
 static void release_offload_resources(struct toepcb *);
 static int alloc_tid_tabs(struct tid_info *);
 static void free_tid_tabs(struct tid_info *);
 static int add_lip(struct adapter *, struct in6_addr *);
 static int delete_lip(struct adapter *, struct in6_addr *);
 static struct clip_entry *search_lip(struct tom_data *, struct in6_addr *);
 static void init_clip_table(struct adapter *, struct tom_data *);
 static void update_clip(struct adapter *, void *);
 static void t4_clip_task(void *, int);
 static void update_clip_table(struct adapter *, struct tom_data *);
 static void destroy_clip_table(struct adapter *, struct tom_data *);
 static void free_tom_data(struct adapter *, struct tom_data *);
 static void reclaim_wr_resources(void *, int);
 
 /*
  * in6_ifaddr_gen is the generation number that update_clip_table() compares
  * against each tom_data's clip_gen to decide whether the CLIP table needs to
  * be refreshed.  NOTE(review): ifaddr_evhandler and clip_task are presumably
  * the event handler and deferred task that bump it and trigger the refresh --
  * confirm at their points of use.
  */
 static int in6_ifaddr_gen;
 static eventhandler_tag ifaddr_evhandler;
 static struct timeout_task clip_task;
 
 /*
  * Allocate and initialize a toepcb for the given VI.
  *
  * txqid and rxqid select the offload tx/rx queues; a negative value means
  * "pick one of the VI's queues at random".  flags is OR'ed with M_ZERO and
  * passed to malloc().  The toepcb is returned with a reference count of 1, or
  * NULL is returned if the allocation fails.
  */
 struct toepcb *
 alloc_toepcb(struct vi_info *vi, int txqid, int rxqid, int flags)
 {
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct toepcb *toep;
 	int tx_credits, txsd_total, len;
 
 	/*
 	 * The firmware counts tx work request credits in units of 16 bytes
 	 * each.  Reserve room for an ABORT_REQ so the driver never has to worry
 	 * about tx credits if it wants to abort a connection.
 	 */
 	tx_credits = sc->params.ofldq_wr_cred;
 	tx_credits -= howmany(sizeof(struct cpl_abort_req), 16);
 
 	/*
 	 * Shortest possible tx work request is a fw_ofld_tx_data_wr + 1 byte
 	 * immediate payload, and firmware counts tx work request credits in
 	 * units of 16 byte.  Calculate the maximum work requests possible.
 	 */
 	txsd_total = tx_credits /
 	    howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16);
 
 	/* Queue ids are absolute: first_ofld_* plus an offset within the VI. */
 	if (txqid < 0)
 		txqid = (arc4random() % vi->nofldtxq) + vi->first_ofld_txq;
 	KASSERT(txqid >= vi->first_ofld_txq &&
 	    txqid < vi->first_ofld_txq + vi->nofldtxq,
 	    ("%s: txqid %d for vi %p (first %d, n %d)", __func__, txqid, vi,
 		vi->first_ofld_txq, vi->nofldtxq));
 
 	if (rxqid < 0)
 		rxqid = (arc4random() % vi->nofldrxq) + vi->first_ofld_rxq;
 	KASSERT(rxqid >= vi->first_ofld_rxq &&
 	    rxqid < vi->first_ofld_rxq + vi->nofldrxq,
 	    ("%s: rxqid %d for vi %p (first %d, n %d)", __func__, rxqid, vi,
 		vi->first_ofld_rxq, vi->nofldrxq));
 
 	/* The txsd array sits at the end of the toepcb and is sized here. */
 	len = offsetof(struct toepcb, txsd) +
 	    txsd_total * sizeof(struct ofld_tx_sdesc);
 
 	toep = malloc(len, M_CXGBE, M_ZERO | flags);
 	if (toep == NULL)
 		return (NULL);
 
 	refcount_init(&toep->refcount, 1);
 	toep->td = sc->tom_softc;
 	toep->vi = vi;
 	toep->tx_total = tx_credits;
 	toep->tx_credits = tx_credits;
 	toep->ofld_txq = &sc->sge.ofld_txq[txqid];
 	toep->ofld_rxq = &sc->sge.ofld_rxq[rxqid];
 	toep->ctrlq = &sc->sge.ctrlq[pi->port_id];
 	mbufq_init(&toep->ulp_pduq, INT_MAX);
 	mbufq_init(&toep->ulp_pdu_reclaimq, INT_MAX);
 	toep->txsd_total = txsd_total;
 	toep->txsd_avail = txsd_total;
 	toep->txsd_pidx = 0;
 	toep->txsd_cidx = 0;
 	aiotx_init_toep(toep);
 	ddp_init_toep(toep);
 
 	return (toep);
 }
 
 /*
  * Take an additional reference on the toepcb.  Returns the toepcb itself.
  */
 struct toepcb *
 hold_toepcb(struct toepcb *toep)
 {
 	refcount_acquire(&toep->refcount);
 
 	return (toep);
 }
 
 /*
  * Drop a reference on the toepcb.  Once refcount_release() reports that this
  * was the last one, the DDP state is torn down and the toepcb is freed; by
  * then it must be neither attached to an inpcb nor waiting for a CPL.
  */
 void
 free_toepcb(struct toepcb *toep)
 {
 	if (refcount_release(&toep->refcount) != 0) {
 		KASSERT((toep->flags & TPF_ATTACHED) == 0,
 		    ("%s: attached to an inpcb", __func__));
 		KASSERT((toep->flags & TPF_CPL_PENDING) == 0,
 		    ("%s: CPL pending", __func__));
 
 		ddp_uninit_toep(toep);
 		free(toep, M_CXGBE);
 	}
 }
 
 /*
  * Set up the socket for TCP offload: mark both socket buffers SB_NOCOALESCE,
  * switch the socket to the TOE protocol switch, hook the toepcb into the
  * tcpcb, take a reference on the inp on behalf of the toepcb, and put the
  * toepcb on the tom_data's list.  The inp must be write-locked.
  */
 void
 offload_socket(struct socket *so, struct toepcb *toep)
 {
 	struct inpcb *inp = sotoinpcb(so);
 	struct tcpcb *tp = intotcpcb(inp);
 	struct tom_data *td = toep->td;
 
 	INP_WLOCK_ASSERT(inp);
 
 	/* Socket: send buffer first, then receive buffer and protosw. */
 	SOCKBUF_LOCK(&so->so_snd);
 	so->so_snd.sb_flags |= SB_NOCOALESCE;
 	SOCKBUF_UNLOCK(&so->so_snd);
 
 	SOCKBUF_LOCK(&so->so_rcv);
 	so->so_rcv.sb_flags |= SB_NOCOALESCE;
 	so->so_proto = (inp->inp_vflag & INP_IPV6) ?
 	    &toe6_protosw : &toe_protosw;
 	SOCKBUF_UNLOCK(&so->so_rcv);
 
 	/* TCP PCB */
 	tp->tod = &td->tod;
 	tp->t_toe = toep;
 	tp->t_flags |= TF_TOE;
 
 	/* The toepcb holds its own reference on the inp. */
 	toep->inp = inp;
 	toep->flags |= TPF_ATTACHED;
 	in_pcbref(inp);
 
 	/* Active list of TOE PCBs */
 	mtx_lock(&td->toep_list_lock);
 	TAILQ_INSERT_HEAD(&td->toep_list, toep, link);
 	mtx_unlock(&td->toep_list_lock);
 }
 
 /*
  * This is _not_ the normal way to "unoffload" a socket.
  *
  * Clears SB_NOCOALESCE on both socket buffers, unhooks the toepcb from the
  * tcpcb, drops the toepcb's reference on the inp (panics if that frees the
  * inp), and takes the toepcb off the tom_data's list.  The inp must be
  * write-locked.
  */
 void
 undo_offload_socket(struct socket *so)
 {
 	struct inpcb *inp = sotoinpcb(so);
 	struct tcpcb *tp = intotcpcb(inp);
 	struct toepcb *toep = tp->t_toe;
 	struct tom_data *td = toep->td;
 
 	INP_WLOCK_ASSERT(inp);
 
 	SOCKBUF_LOCK(&so->so_snd);
 	so->so_snd.sb_flags &= ~SB_NOCOALESCE;
 	SOCKBUF_UNLOCK(&so->so_snd);
 
 	SOCKBUF_LOCK(&so->so_rcv);
 	so->so_rcv.sb_flags &= ~SB_NOCOALESCE;
 	SOCKBUF_UNLOCK(&so->so_rcv);
 
 	/* TCP PCB */
 	tp->tod = NULL;
 	tp->t_toe = NULL;
 	tp->t_flags &= ~TF_TOE;
 
 	/* Give back the reference taken on the inp for the toepcb. */
 	toep->inp = NULL;
 	toep->flags &= ~TPF_ATTACHED;
 	if (in_pcbrele_wlocked(inp) != 0)
 		panic("%s: inp freed.", __func__);
 
 	mtx_lock(&td->toep_list_lock);
 	TAILQ_REMOVE(&td->toep_list, toep, link);
 	mtx_unlock(&td->toep_list_lock);
 }
 
 /*
  * Release everything tied to a toepcb that is neither attached to an inpcb
  * nor waiting for a CPL: the L2 table entry, the tid, and the CLIP entry (if
  * any).  The toepcb is then taken off the tom_data's list and passed to
  * free_toepcb().
  */
 static void
 release_offload_resources(struct toepcb *toep)
 {
 	struct tom_data *td = toep->td;
 	struct adapter *sc = td_adapter(td);
 	int tid = toep->tid;
 
 	KASSERT(!(toep->flags & TPF_CPL_PENDING),
 	    ("%s: %p has CPL pending.", __func__, toep));
 	KASSERT(!(toep->flags & TPF_ATTACHED),
 	    ("%s: %p is still attached.", __func__, toep));
 
 	CTR5(KTR_CXGBE, "%s: toep %p (tid %d, l2te %p, ce %p)",
 	    __func__, toep, tid, toep->l2te, toep->ce);
 
 	/*
 	 * These queues should have been emptied at approximately the same time
 	 * that a normal connection's socket's so_snd would have been purged or
 	 * drained.  Do _not_ clean up here.
 	 */
 	MPASS(mbufq_len(&toep->ulp_pduq) == 0);
 	MPASS(mbufq_len(&toep->ulp_pdu_reclaimq) == 0);
 #ifdef INVARIANTS
 	ddp_assert_empty(toep);
 #endif
 
 	if (toep->l2te)
 		t4_l2t_release(toep->l2te);
 
 	/* A toepcb with a CLIP entry accounts for 2 tids, otherwise 1. */
 	if (tid >= 0) {
 		remove_tid(sc, tid, toep->ce ? 2 : 1);
 		release_tid(sc, tid, toep->ctrlq);
 	}
 
 	if (toep->ce)
 		release_lip(td, toep->ce);
 
 	mtx_lock(&td->toep_list_lock);
 	TAILQ_REMOVE(&td->toep_list, toep, link);
 	mtx_unlock(&td->toep_list_lock);
 
 	free_toepcb(toep);
 }
 
 /*
  * The kernel is done with the TCP PCB and this is our opportunity to unhook the
  * toepcb hanging off of it.  If the TOE driver is also done with the toepcb (no
  * pending CPL) then it is time to release all resources tied to the toepcb.
  *
  * Also gets called when an offloaded active open fails and the TOM wants the
  * kernel to take the TCP PCB back.
  */
 static void
 t4_pcb_detach(struct toedev *tod __unused, struct tcpcb *tp)
 {
 	/*
 	 * inp is used only by the lock assertion and the trace points below.
 	 * NOTE(review): the declaration is conditional, so INP_WLOCK_ASSERT()
 	 * presumably expands to nothing without INVARIANTS -- confirm.
 	 */
 #if defined(KTR) || defined(INVARIANTS)
 	struct inpcb *inp = tp->t_inpcb;
 #endif
 	struct toepcb *toep = tp->t_toe;
 
 	INP_WLOCK_ASSERT(inp);
 
 	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
 	KASSERT(toep->flags & TPF_ATTACHED,
 	    ("%s: not attached", __func__));
 
 #ifdef KTR
 	/* In SYN_SENT the id in toep->tid is traced as an atid. */
 	if (tp->t_state == TCPS_SYN_SENT) {
 		CTR6(KTR_CXGBE, "%s: atid %d, toep %p (0x%x), inp %p (0x%x)",
 		    __func__, toep->tid, toep, toep->flags, inp,
 		    inp->inp_flags);
 	} else {
 		CTR6(KTR_CXGBE,
 		    "t4_pcb_detach: tid %d (%s), toep %p (0x%x), inp %p (0x%x)",
 		    toep->tid, tcpstates[tp->t_state], toep, toep->flags, inp,
 		    inp->inp_flags);
 	}
 #endif
 
 	/* Unhook the toepcb from the tcpcb. */
 	tp->t_toe = NULL;
 	tp->t_flags &= ~TF_TOE;
 	toep->flags &= ~TPF_ATTACHED;
 
 	/* With a CPL still pending, final_cpl_received() does the release. */
 	if (!(toep->flags & TPF_CPL_PENDING))
 		release_offload_resources(toep);
 }
 
 /*
  * setsockopt handler.  Only TCP_NODELAY is acted upon: the connection's
  * Nagle setting in the TCB is updated to match the tcpcb's TF_NODELAY flag.
  * SOPT_GET requests and all other options are ignored.
  */
 static void
 t4_ctloutput(struct toedev *tod, struct tcpcb *tp, int dir, int name)
 {
 	struct adapter *sc = tod->tod_softc;
 	struct toepcb *toep = tp->t_toe;
 
 	if (dir == SOPT_GET)
 		return;
 
 	CTR4(KTR_CXGBE, "%s: tp %p, dir %u, name %u", __func__, tp, dir, name);
 
 	if (name == TCP_NODELAY) {
 		/* Nagle is on exactly when TF_NODELAY is off. */
 		t4_set_tcb_field(sc, toep->ctrlq, toep->tid, W_TCB_T_FLAGS,
 		    V_TF_NAGLE(1), V_TF_NAGLE(tp->t_flags & TF_NODELAY ? 0 : 1),
 		    0, 0, toep->ofld_rxq->iq.abs_id);
 	}
 }
 
 /*
  * The TOE driver will not receive any more CPLs for the tid associated with the
  * toepcb; release the hold on the inpcb.
  *
  * Must be called with the inp write-locked and TPF_CPL_PENDING set.
  * NOTE(review): the lock is dropped here only when in_pcbrele_wlocked()
  * returns 0; presumably a non-zero return means the inp was freed and
  * needs no unlock -- confirm.
  */
 void
 final_cpl_received(struct toepcb *toep)
 {
 	struct inpcb *inp = toep->inp;
 
 	KASSERT(inp != NULL, ("%s: inp is NULL", __func__));
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(toep->flags & TPF_CPL_PENDING,
 	    ("%s: CPL not pending already?", __func__));
 
 	CTR6(KTR_CXGBE, "%s: tid %d, toep %p (0x%x), inp %p (0x%x)",
 	    __func__, toep->tid, toep, toep->flags, inp, inp->inp_flags);
 
 	if (toep->ulp_mode == ULP_MODE_TCPDDP)
 		release_ddp_resources(toep);
 	toep->inp = NULL;
 	toep->flags &= ~TPF_CPL_PENDING;
 	mbufq_drain(&toep->ulp_pdu_reclaimq);
 
 	/* If the kernel has already detached, nobody else will clean up. */
 	if (!(toep->flags & TPF_ATTACHED))
 		release_offload_resources(toep);
 
 	/* Uses the local inp: toep->inp is NULL and toep may be gone by now. */
 	if (!in_pcbrele_wlocked(inp))
 		INP_WUNLOCK(inp);
 }
 
 /*
  * Record ctx as the context for tid in the adapter's tid table and add ntids
  * to the count of tids in use.
  */
 void
 insert_tid(struct adapter *sc, int tid, void *ctx, int ntids)
 {
 	sc->tids.tid_tab[tid] = ctx;
 	atomic_add_int(&sc->tids.tids_in_use, ntids);
 }
 
 /*
  * Return the context stored for tid in the adapter's tid table.  The index is
  * not range-checked.
  */
 void *
 lookup_tid(struct adapter *sc, int tid)
 {
 	return (sc->tids.tid_tab[tid]);
 }
 
 /*
  * Replace the context stored for tid in the adapter's tid table.  The count
  * of tids in use is left alone.
  */
 void
 update_tid(struct adapter *sc, int tid, void *ctx)
 {
 	sc->tids.tid_tab[tid] = ctx;
 }
 
 /*
  * Clear the context stored for tid in the adapter's tid table and subtract
  * ntids from the count of tids in use.
  */
 void
 remove_tid(struct adapter *sc, int tid, int ntids)
 {
 	sc->tids.tid_tab[tid] = NULL;
 	atomic_subtract_int(&sc->tids.tids_in_use, ntids);
 }
 
 /*
  * Send a CPL_TID_RELEASE for tid on the given control queue.  If no work
  * request can be allocated the release is deferred via queue_tid_release().
  */
 void
 release_tid(struct adapter *sc, int tid, struct sge_wrq *ctrlq)
 {
 	struct cpl_tid_release *req;
 	struct wrqe *wr;
 
 	wr = alloc_wrqe(sizeof(*req), ctrlq);
 	if (wr != NULL) {
 		req = wrtod(wr);
 		INIT_TP_WR_MIT_CPL(req, CPL_TID_RELEASE, tid);
 		t4_wrq_tx(sc, wr);
 	} else
 		queue_tid_release(sc, tid);	/* defer */
 }
 
 /*
  * Deferred release of a tid, used by release_tid() when it cannot allocate a
  * work request.  Not implemented: it only invokes CXGBE_UNIMPLEMENTED().
  */
 static void
 queue_tid_release(struct adapter *sc, int tid)
 {
 
 	CXGBE_UNIMPLEMENTED("deferred tid release");
 }
 
 /*
  * What mtu_idx to use, given a 4-tuple and/or an MSS cap
  *
  * At least one of inc and pmss (> 0) must be given.  The MSS is taken from
  * tcp_mssopt(inc) when inc is available and is capped at pmss when pmss is
  * positive.  Returns the index of the largest entry in the adapter's MTU
  * table that does not exceed that MSS plus the IP and TCP header sizes (index
  * 0 if none qualifies).  Without inc the IPv4 header size is assumed.
  */
 int
 find_best_mtu_idx(struct adapter *sc, struct in_conninfo *inc, int pmss)
 {
 	unsigned short *mtus = &sc->params.mtus[0];
 	int i, mss, n;
 
 	KASSERT(inc != NULL || pmss > 0,
 	    ("%s: at least one of inc/pmss must be specified", __func__));
 
 	mss = inc ? tcp_mssopt(inc) : pmss;
 	if (pmss > 0 && mss > pmss)
 		mss = pmss;
 
 	/* inc is optional (see the KASSERT above); don't dereference NULL. */
 	if (inc != NULL && (inc->inc_flags & INC_ISIPV6))
 		n = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
 	else
 		n = sizeof(struct ip) + sizeof(struct tcphdr);
 
 	/* Advance while the next table entry still fits. */
 	for (i = 0; i < NMTUS - 1 && mtus[i + 1] <= mss + n; i++)
 		continue;
 
 	return (i);
 }
 
 /*
  * Determine the receive window size for a socket: the space available in its
  * receive buffer, raised to MIN_RCV_WND if smaller, and limited to
  * MAX_RCV_WND.  The receive sockbuf must be locked by the caller.
  */
 u_long
 select_rcv_wnd(struct socket *so)
 {
 	unsigned long space;
 
 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 
 	space = sbspace(&so->so_rcv);
 	if (space < MIN_RCV_WND)
 		space = MIN_RCV_WND;
 
 	return (min(space, MAX_RCV_WND));
 }
 
 /*
  * Pick the receive window scale: the smallest shift (at most
  * TCP_MAX_WINSHIFT) for which TCP_MAXWIN scaled by it covers sb_max, with
  * sb_max capped at MAX_RCV_WND.
  */
 int
 select_rcv_wscale(void)
 {
 	unsigned long space = sb_max;
 	int wscale;
 
 	if (space > MAX_RCV_WND)
 		space = MAX_RCV_WND;
 
 	for (wscale = 0; wscale < TCP_MAX_WINSHIFT; wscale++) {
 		if ((TCP_MAXWIN << wscale) >= space)
 			break;
 	}
 
 	return (wscale);
 }
 
-extern int always_keepalive;
-
 /*
  * socket so could be a listening socket too.
  *
  * Build the opt0 word from the window scale, MTU index, ULP mode and receive
  * buffer credits.  so, e and vi are each optional: when non-NULL they supply
  * the Nagle/keepalive settings, the L2T index, and the source MAC selector
  * and tx channel respectively.  The result is returned in big-endian order.
  */
 uint64_t
 calc_opt0(struct socket *so, struct vi_info *vi, struct l2t_entry *e,
     int mtu_idx, int rscale, int rx_credits, int ulp_mode)
 {
 	uint64_t opt0;
 
 	KASSERT(rx_credits <= M_RCV_BUFSIZ,
 	    ("%s: rcv_bufsiz too high", __func__));
 
 	opt0 = F_TCAM_BYPASS | V_WND_SCALE(rscale) | V_MSS_IDX(mtu_idx) |
 	    V_ULP_MODE(ulp_mode) | V_RCV_BUFSIZ(rx_credits);
 
 	if (so != NULL) {
 		struct inpcb *inp = sotoinpcb(so);
 		struct tcpcb *tp = intotcpcb(inp);
 		/* Keepalive is on if forced globally or set on the socket. */
-		int keepalive = always_keepalive ||
+		int keepalive = tcp_always_keepalive ||
 		    so_options_get(so) & SO_KEEPALIVE;
 
 		/* Nagle is enabled unless TF_NODELAY is set. */
 		opt0 |= V_NAGLE((tp->t_flags & TF_NODELAY) == 0);
 		opt0 |= V_KEEP_ALIVE(keepalive != 0);
 	}
 
 	if (e != NULL)
 		opt0 |= V_L2T_IDX(e->idx);
 
 	if (vi != NULL) {
 		opt0 |= V_SMAC_SEL(vi->smt_idx);
 		opt0 |= V_TX_CHAN(vi->pi->tx_chan);
 	}
 
 	return htobe64(opt0);
 }
 
 /*
  * Build the filter tuple for a connection that uses the given VI and L2T
  * entry.  Only fields with a non-negative shift in the adapter's tp
  * parameters are filled in.  On T4 the tuple is truncated to 32 bits and
  * returned via htobe32(); on other chips it goes through V_FILTER_TUPLE() and
  * htobe64().
  */
 uint64_t
 select_ntuple(struct vi_info *vi, struct l2t_entry *e)
 {
 	struct adapter *sc = vi->pi->adapter;
 	struct tp_params *tpp = &sc->params.tp;
 	uint64_t ft = 0;
 
 	/* VLAN, only when the L2T entry actually carries one. */
 	if (tpp->vlan_shift >= 0 && e->vlan != CPL_L2T_VLAN_NONE)
 		ft |= (uint64_t)(F_FT_VLAN_VLD | e->vlan) << tpp->vlan_shift;
 
 	/* Port */
 	if (tpp->port_shift >= 0)
 		ft |= (uint64_t)e->lport << tpp->port_shift;
 
 	/* Protocol is always TCP here. */
 	if (tpp->protocol_shift >= 0)
 		ft |= (uint64_t)IPPROTO_TCP << tpp->protocol_shift;
 
 	/* VNIC id, taken apart from the VI's viid. */
 	if (tpp->vnic_shift >= 0) {
 		uint16_t viid = vi->viid;
 		uint32_t vin = G_FW_VIID_VIN(viid);
 		uint32_t pfn = G_FW_VIID_PFN(viid);
 		uint32_t vivld = G_FW_VIID_VIVLD(viid);
 
 		ft |= (uint64_t)(V_FT_VNID_ID_VF(vin) | V_FT_VNID_ID_PF(pfn) |
 		    V_FT_VNID_ID_VLD(vivld)) << tpp->vnic_shift;
 	}
 
 	if (is_t4(sc))
 		return (htobe32((uint32_t)ft));
 
 	return (htobe64(V_FILTER_TUPLE(ft)));
 }
 
 /*
  * Put the toepcb in TCP DDP ULP mode and reset its DDP flags to DDP_OK.
  */
 void
 set_tcpddp_ulp_mode(struct toepcb *toep)
 {
 	toep->ddp_flags = DDP_OK;
 	toep->ulp_mode = ULP_MODE_TCPDDP;
 }
 
 /*
  * Returns non-zero if status is one of the negative advice codes
  * (retransmit, persist or keepalive), zero otherwise.
  */
 int
 negative_advice(int status)
 {
 	if (status == CPL_ERR_RTX_NEG_ADVICE)
 		return (1);
 	if (status == CPL_ERR_PERSIST_NEG_ADVICE)
 		return (1);
 	if (status == CPL_ERR_KEEPALV_NEG_ADVICE)
 		return (1);
 
 	return (0);
 }
 
 /*
  * Allocate and initialize the tid, atid and stid tables.  All three live in a
  * single zeroed allocation, laid out in that order.  Returns 0 on success or
  * ENOMEM if the allocation fails.
  */
 static int
 alloc_tid_tabs(struct tid_info *t)
 {
 	size_t size;
 	unsigned int i;
 
 	size = t->ntids * sizeof(*t->tid_tab) +
 	    t->natids * sizeof(*t->atid_tab) +
 	    t->nstids * sizeof(*t->stid_tab);
 
 	t->tid_tab = malloc(size, M_CXGBE, M_ZERO | M_NOWAIT);
 	if (t->tid_tab == NULL)
 		return (ENOMEM);
 
 	mtx_init(&t->atid_lock, "atid lock", NULL, MTX_DEF);
 	/* The atid table starts right after the last tid entry. */
 	t->atid_tab = (union aopen_entry *)&t->tid_tab[t->ntids];
 	/* Chain all atid entries into the free list, headed by afree. */
 	t->afree = t->atid_tab;
 	t->atids_in_use = 0;
 	for (i = 1; i < t->natids; i++)
 		t->atid_tab[i - 1].next = &t->atid_tab[i];
 	/* NOTE(review): this indexes entry -1 if natids is 0 -- confirm it can't be. */
 	t->atid_tab[t->natids - 1].next = NULL;
 
 	mtx_init(&t->stid_lock, "stid lock", NULL, MTX_DEF);
 	/* The stid table starts right after the last atid entry. */
 	t->stid_tab = (struct listen_ctx **)&t->atid_tab[t->natids];
 	t->stids_in_use = 0;
 	TAILQ_INIT(&t->stids);
 	t->nstids_free_head = t->nstids;
 
 	atomic_store_rel_int(&t->tids_in_use, 0);
 
 	return (0);
 }
 
 /*
  * Free the table allocation made by alloc_tid_tabs() and destroy the atid and
  * stid locks if they were initialized.  No tid, atid or stid may be in use.
  */
 static void
 free_tid_tabs(struct tid_info *t)
 {
 	KASSERT(t->tids_in_use == 0,
 	    ("%s: %d tids still in use.", __func__, t->tids_in_use));
 	KASSERT(t->atids_in_use == 0,
 	    ("%s: %d atids still in use.", __func__, t->atids_in_use));
 	KASSERT(t->stids_in_use == 0,
 	    ("%s: %d tids still in use.", __func__, t->stids_in_use));
 
 	/* The locks live in the tid_info itself, not in the table memory. */
 	if (mtx_initialized(&t->atid_lock))
 		mtx_destroy(&t->atid_lock);
 	if (mtx_initialized(&t->stid_lock))
 		mtx_destroy(&t->stid_lock);
 
 	/* tid_tab is the base of the single allocation for all three tables. */
 	free(t->tid_tab, M_CXGBE);
 	t->tid_tab = NULL;
 }
 
 static int
 add_lip(struct adapter *sc, struct in6_addr *lip)
 {
         struct fw_clip_cmd c;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 	/* mtx_assert(&td->clip_table_lock, MA_OWNED); */
 
         memset(&c, 0, sizeof(c));
 	c.op_to_write = htonl(V_FW_CMD_OP(FW_CLIP_CMD) | F_FW_CMD_REQUEST |
 	    F_FW_CMD_WRITE);
         c.alloc_to_len16 = htonl(F_FW_CLIP_CMD_ALLOC | FW_LEN16(c));
         c.ip_hi = *(uint64_t *)&lip->s6_addr[0];
         c.ip_lo = *(uint64_t *)&lip->s6_addr[8];
 
 	return (-t4_wr_mbox_ns(sc, sc->mbox, &c, sizeof(c), &c));
 }
 
 static int
 delete_lip(struct adapter *sc, struct in6_addr *lip)
 {
 	struct fw_clip_cmd c;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 	/* mtx_assert(&td->clip_table_lock, MA_OWNED); */
 
 	memset(&c, 0, sizeof(c));
 	c.op_to_write = htonl(V_FW_CMD_OP(FW_CLIP_CMD) | F_FW_CMD_REQUEST |
 	    F_FW_CMD_READ);
         c.alloc_to_len16 = htonl(F_FW_CLIP_CMD_FREE | FW_LEN16(c));
         c.ip_hi = *(uint64_t *)&lip->s6_addr[0];
         c.ip_lo = *(uint64_t *)&lip->s6_addr[8];
 
 	return (-t4_wr_mbox_ns(sc, sc->mbox, &c, sizeof(c), &c));
 }
 
 /*
  * Find the CLIP table entry for the given address.  The CLIP table lock must
  * be held.  Returns the entry, or NULL if the address is not in the table.
  */
 static struct clip_entry *
 search_lip(struct tom_data *td, struct in6_addr *lip)
 {
 	struct clip_entry *ce;
 
 	mtx_assert(&td->clip_table_lock, MA_OWNED);
 
 	/* ce is left NULL if the walk runs off the end of the list. */
 	TAILQ_FOREACH(ce, &td->clip_table, link) {
 		if (IN6_ARE_ADDR_EQUAL(&ce->lip, lip))
 			break;
 	}
 
 	return (ce);
 }
 
 /*
  * Take a reference on a CLIP entry.  If ce is NULL the entry is looked up by
  * address first.  Returns the entry that was referenced, or NULL if there
  * was none to reference.
  */
 struct clip_entry *
 hold_lip(struct tom_data *td, struct in6_addr *lip, struct clip_entry *ce)
 {
 	struct clip_entry *entry = ce;
 
 	mtx_lock(&td->clip_table_lock);
 	if (entry == NULL)
 		entry = search_lip(td, lip);
 	if (entry != NULL)
 		entry->refcount++;
 	mtx_unlock(&td->clip_table_lock);
 
 	return (entry);
 }
 
 /*
  * Drop a reference on a CLIP entry.  The entry must be in the CLIP table and
  * have a non-zero reference count.  It is not removed from the table here.
  */
 void
 release_lip(struct tom_data *td, struct clip_entry *ce)
 {
 	mtx_lock(&td->clip_table_lock);
 
 	KASSERT(search_lip(td, &ce->lip) == ce,
 	    ("%s: CLIP entry %p p not in CLIP table.", __func__, ce));
 	KASSERT(ce->refcount > 0,
 	    ("%s: CLIP entry %p has refcount 0", __func__, ce));
 	ce->refcount--;
 
 	mtx_unlock(&td->clip_table_lock);
 }
 
 /*
  * Set up an empty CLIP table for the tom_data and populate it.  Must be
  * called within a synchronized operation.
  */
 static void
 init_clip_table(struct adapter *sc, struct tom_data *td)
 {
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	mtx_init(&td->clip_table_lock, "CLIP table lock", NULL, MTX_DEF);
 	/* A generation of -1 is the table's initial value, before any sync. */
 	td->clip_gen = -1;
 	TAILQ_INIT(&td->clip_table);
 
 	update_clip_table(sc, td);
 }
 
 /*
  * Per-adapter callback (see t4_clip_task): refresh the adapter's CLIP table
  * if the TOM ULD is active on it.  Does nothing if the synchronized
  * operation cannot be started.
  */
 static void
 update_clip(struct adapter *sc, void *arg __unused)
 {
 	if (begin_synchronized_op(sc, NULL, HOLD_LOCK, "t4tomuc") != 0)
 		return;
 
 	if (uld_active(sc, ULD_TOM)) {
 		update_clip_table(sc, sc->tom_softc);
 	}
 
 	end_synchronized_op(sc, LOCK_HELD);
 }
 
 /*
  * Task handler: runs update_clip() via t4_iterate().  Both arguments are
  * unused.
  */
 static void
 t4_clip_task(void *arg, int count)
 {
 
 	t4_iterate(update_clip, NULL);
 }
 
 /*
  * Bring the CLIP table in sync with the system's IPv6 addresses.
  *
  * Nothing is done if the table's generation already matches in6_ifaddr_gen.
  * Otherwise all current entries are moved to a "stale" list and the IPv6
  * address lists of the vnets of the adapter's VIs are walked: entries that
  * are still configured move back to the table and new addresses are added
  * (both to the firmware and to the table).  Stale entries that are no longer
  * referenced are deleted; the rest are put back in the table.
  *
  * Must be called within a synchronized operation.  Takes the in6_ifaddr read
  * lock and the CLIP table lock.
  */
 static void
 update_clip_table(struct adapter *sc, struct tom_data *td)
 {
 	struct rm_priotracker in6_ifa_tracker;
 	struct in6_ifaddr *ia;
 	struct in6_addr *lip, tlip;
 	struct clip_head stale;
 	struct clip_entry *ce, *ce_temp;
 	struct vi_info *vi;
 	int rc, gen, i, j;
 	uintptr_t last_vnet;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	IN6_IFADDR_RLOCK(&in6_ifa_tracker);
 	mtx_lock(&td->clip_table_lock);
 
 	gen = atomic_load_acq_int(&in6_ifaddr_gen);
 	if (gen == td->clip_gen)
 		goto done;
 
 	/* Everything is presumed stale until it is seen again below. */
 	TAILQ_INIT(&stale);
 	TAILQ_CONCAT(&stale, &td->clip_table, link);
 
 	/*
 	 * last_vnet optimizes the common cases where all if_vnet = NULL (no
 	 * VIMAGE) or all if_vnet = vnet0.
 	 */
 	last_vnet = (uintptr_t)(-1);
 	for_each_port(sc, i)
 	for_each_vi(sc->port[i], j, vi) {
 		if (last_vnet == (uintptr_t)vi->ifp->if_vnet)
 			continue;
 
 		/* XXX: races with if_vmove */
 		CURVNET_SET(vi->ifp->if_vnet);
 		TAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) {
 			lip = &ia->ia_addr.sin6_addr;
 
 			KASSERT(!IN6_IS_ADDR_MULTICAST(lip),
 			    ("%s: mcast address in in6_ifaddr list", __func__));
 
 			if (IN6_IS_ADDR_LOOPBACK(lip))
 				continue;
 			if (IN6_IS_SCOPE_EMBED(lip)) {
 				/* Remove the embedded scope */
 				tlip = *lip;
 				lip = &tlip;
 				in6_clearscope(lip);
 			}
 			/*
 			 * XXX: how to weed out the link local address for the
 			 * loopback interface?  It's fe80::1 usually (always?).
 			 */
 
 			/*
 			 * If it's in the main list then we already know it's
 			 * not stale.
 			 */
 			TAILQ_FOREACH(ce, &td->clip_table, link) {
 				if (IN6_ARE_ADDR_EQUAL(&ce->lip, lip))
 					goto next;
 			}
 
 			/*
 			 * If it's in the stale list we should move it to the
 			 * main list.
 			 */
 			TAILQ_FOREACH(ce, &stale, link) {
 				if (IN6_ARE_ADDR_EQUAL(&ce->lip, lip)) {
 					TAILQ_REMOVE(&stale, ce, link);
 					TAILQ_INSERT_TAIL(&td->clip_table, ce,
 					    link);
 					goto next;
 				}
 			}
 
 			/* A new IP6 address; add it to the CLIP table */
 			ce = malloc(sizeof(*ce), M_CXGBE, M_NOWAIT);
 			if (ce == NULL) {
 				/*
 				 * M_NOWAIT allocations can fail.  Skip this
 				 * address rather than dereference NULL.
 				 */
 				log(LOG_ERR,
 				    "%s: could not allocate a CLIP entry\n",
 				    __func__);
 				goto next;
 			}
 			memcpy(&ce->lip, lip, sizeof(ce->lip));
 			ce->refcount = 0;
 			rc = add_lip(sc, lip);
 			if (rc == 0)
 				TAILQ_INSERT_TAIL(&td->clip_table, ce, link);
 			else {
 				char ip[INET6_ADDRSTRLEN];
 
 				inet_ntop(AF_INET6, &ce->lip, &ip[0],
 				    sizeof(ip));
 				log(LOG_ERR, "%s: could not add %s (%d)\n",
 				    __func__, ip, rc);
 				free(ce, M_CXGBE);
 			}
 next:
 			continue;
 		}
 		CURVNET_RESTORE();
 		last_vnet = (uintptr_t)vi->ifp->if_vnet;
 	}
 
 	/*
 	 * Remove stale addresses (those no longer in V_in6_ifaddrhead) that are
 	 * no longer referenced by the driver.
 	 */
 	TAILQ_FOREACH_SAFE(ce, &stale, link, ce_temp) {
 		if (ce->refcount == 0) {
 			rc = delete_lip(sc, &ce->lip);
 			if (rc == 0) {
 				TAILQ_REMOVE(&stale, ce, link);
 				free(ce, M_CXGBE);
 			} else {
 				char ip[INET6_ADDRSTRLEN];
 
 				inet_ntop(AF_INET6, &ce->lip, &ip[0],
 				    sizeof(ip));
 				log(LOG_ERR, "%s: could not delete %s (%d)\n",
 				    __func__, ip, rc);
 			}
 		}
 	}
 	/* The ones that are still referenced need to stay in the CLIP table */
 	TAILQ_CONCAT(&td->clip_table, &stale, link);
 
 	td->clip_gen = gen;
 done:
 	mtx_unlock(&td->clip_table_lock);
 	IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
 }
 
 static void
 destroy_clip_table(struct adapter *sc, struct tom_data *td)
 {
 	struct clip_entry *ce, *ce_temp;
 
 	if (mtx_initialized(&td->clip_table_lock)) {
 		mtx_lock(&td->clip_table_lock);
 		TAILQ_FOREACH_SAFE(ce, &td->clip_table, link, ce_temp) {
 			KASSERT(ce->refcount == 0,
 			    ("%s: CLIP entry %p still in use (%d)", __func__,
 			    ce, ce->refcount));
 			TAILQ_REMOVE(&td->clip_table, ce, link);
 			delete_lip(sc, &ce->lip);
 			free(ce, M_CXGBE);
 		}
 		mtx_unlock(&td->clip_table_lock);
 		mtx_destroy(&td->clip_table_lock);
 	}
 }
 
 static void
 free_tom_data(struct adapter *sc, struct tom_data *td)
 {
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	KASSERT(TAILQ_EMPTY(&td->toep_list),
 	    ("%s: TOE PCB list is not empty.", __func__));
 	KASSERT(td->lctx_count == 0,
 	    ("%s: lctx hash table is not empty.", __func__));
 
 	t4_free_ppod_region(&td->pr);
 	destroy_clip_table(sc, td);
 
 	if (td->listen_mask != 0)
 		hashdestroy(td->listen_hash, M_CXGBE, td->listen_mask);
 
 	if (mtx_initialized(&td->unsent_wr_lock))
 		mtx_destroy(&td->unsent_wr_lock);
 	if (mtx_initialized(&td->lctx_hash_lock))
 		mtx_destroy(&td->lctx_hash_lock);
 	if (mtx_initialized(&td->toep_list_lock))
 		mtx_destroy(&td->toep_list_lock);
 
 	free_tid_tabs(&sc->tids);
 	free(td, M_CXGBE);
 }
 
 static void
 reclaim_wr_resources(void *arg, int count)
 {
 	struct tom_data *td = arg;
 	STAILQ_HEAD(, wrqe) twr_list = STAILQ_HEAD_INITIALIZER(twr_list);
 	struct cpl_act_open_req *cpl;
 	u_int opcode, atid;
 	struct wrqe *wr;
 	struct adapter *sc;
 
 	mtx_lock(&td->unsent_wr_lock);
 	STAILQ_SWAP(&td->unsent_wr_list, &twr_list, wrqe);
 	mtx_unlock(&td->unsent_wr_lock);
 
 	while ((wr = STAILQ_FIRST(&twr_list)) != NULL) {
 		STAILQ_REMOVE_HEAD(&twr_list, link);
 
 		cpl = wrtod(wr);
 		opcode = GET_OPCODE(cpl);
 
 		switch (opcode) {
 		case CPL_ACT_OPEN_REQ:
 		case CPL_ACT_OPEN_REQ6:
 			atid = G_TID_TID(be32toh(OPCODE_TID(cpl)));
 			sc = td_adapter(td);
 
 			CTR2(KTR_CXGBE, "%s: atid %u ", __func__, atid);
 			act_open_failure_cleanup(sc, atid, EHOSTUNREACH);
 			free(wr, M_CXGBE);
 			break;
 		default:
 			log(LOG_ERR, "%s: leaked work request %p, wr_len %d, "
 			    "opcode %x\n", __func__, wr, wr->wr_len, opcode);
 			/* WR not freed here; go look at it with a debugger.  */
 		}
 	}
 }
 
 /*
  * Ground control to Major TOM
  * Commencing countdown, engines on
  */
 static int
 t4_tom_activate(struct adapter *sc)
 {
 	struct tom_data *td;
 	struct toedev *tod;
 	struct vi_info *vi;
 	struct sge_ofld_rxq *ofld_rxq;
 	int i, j, rc, v;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	/* per-adapter softc for TOM */
 	td = malloc(sizeof(*td), M_CXGBE, M_ZERO | M_NOWAIT);
 	if (td == NULL)
 		return (ENOMEM);
 
 	/* List of TOE PCBs and associated lock */
 	mtx_init(&td->toep_list_lock, "PCB list lock", NULL, MTX_DEF);
 	TAILQ_INIT(&td->toep_list);
 
 	/* Listen context */
 	mtx_init(&td->lctx_hash_lock, "lctx hash lock", NULL, MTX_DEF);
 	td->listen_hash = hashinit_flags(LISTEN_HASH_SIZE, M_CXGBE,
 	    &td->listen_mask, HASH_NOWAIT);
 
 	/* List of WRs for which L2 resolution failed */
 	mtx_init(&td->unsent_wr_lock, "Unsent WR list lock", NULL, MTX_DEF);
 	STAILQ_INIT(&td->unsent_wr_list);
 	TASK_INIT(&td->reclaim_wr_resources, 0, reclaim_wr_resources, td);
 
 	/* TID tables */
 	rc = alloc_tid_tabs(&sc->tids);
 	if (rc != 0)
 		goto done;
 
 	rc = t4_init_ppod_region(&td->pr, &sc->vres.ddp,
 	    t4_read_reg(sc, A_ULP_RX_TDDP_PSZ), "TDDP page pods");
 	if (rc != 0)
 		goto done;
 	t4_set_reg_field(sc, A_ULP_RX_TDDP_TAGMASK,
 	    V_TDDPTAGMASK(M_TDDPTAGMASK), td->pr.pr_tag_mask);
 
 	/* CLIP table for IPv6 offload */
 	init_clip_table(sc, td);
 
 	/* toedev ops */
 	tod = &td->tod;
 	init_toedev(tod);
 	tod->tod_softc = sc;
 	tod->tod_connect = t4_connect;
 	tod->tod_listen_start = t4_listen_start;
 	tod->tod_listen_stop = t4_listen_stop;
 	tod->tod_rcvd = t4_rcvd;
 	tod->tod_output = t4_tod_output;
 	tod->tod_send_rst = t4_send_rst;
 	tod->tod_send_fin = t4_send_fin;
 	tod->tod_pcb_detach = t4_pcb_detach;
 	tod->tod_l2_update = t4_l2_update;
 	tod->tod_syncache_added = t4_syncache_added;
 	tod->tod_syncache_removed = t4_syncache_removed;
 	tod->tod_syncache_respond = t4_syncache_respond;
 	tod->tod_offload_socket = t4_offload_socket;
 	tod->tod_ctloutput = t4_ctloutput;
 
 	for_each_port(sc, i) {
 		for_each_vi(sc->port[i], v, vi) {
 			TOEDEV(vi->ifp) = &td->tod;
 			for_each_ofld_rxq(vi, j, ofld_rxq) {
 				ofld_rxq->iq.set_tcb_rpl = do_set_tcb_rpl;
 				ofld_rxq->iq.l2t_write_rpl = do_l2t_write_rpl2;
 			}
 		}
 	}
 
 	sc->tom_softc = td;
 	register_toedev(sc->tom_softc);
 
 done:
 	if (rc != 0)
 		free_tom_data(sc, td);
 	return (rc);
 }
 
 /*
  * Undo t4_tom_activate() if the TOE is completely idle.  Returns 0 if there
  * was nothing to do or the TOM state was released, EBUSY if any port still
  * has offload enabled, iWARP/iSCSI is active, or any TOE PCB, listen context
  * or unsent work request is still around.
  */
 static int
 t4_tom_deactivate(struct adapter *sc)
 {
 	struct tom_data *td = sc->tom_softc;
 	int error = 0;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	if (td == NULL)
 		return (0);	/* XXX. KASSERT? */
 
 	/* At least one port has IFCAP_TOE enabled. */
 	if (sc->offload_map != 0)
 		return (EBUSY);
 
 	/* Both iWARP and iSCSI rely on the TOE. */
 	if (uld_active(sc, ULD_IWARP) || uld_active(sc, ULD_ISCSI))
 		return (EBUSY);
 
 	/* Offloaded connections still present? */
 	mtx_lock(&td->toep_list_lock);
 	if (!TAILQ_EMPTY(&td->toep_list))
 		error = EBUSY;
 	mtx_unlock(&td->toep_list_lock);
 
 	/* Listeners still present? */
 	mtx_lock(&td->lctx_hash_lock);
 	if (td->lctx_count > 0)
 		error = EBUSY;
 	mtx_unlock(&td->lctx_hash_lock);
 
 	/* Let any queued reclaim task finish before looking at its list. */
 	taskqueue_drain(taskqueue_thread, &td->reclaim_wr_resources);
 	mtx_lock(&td->unsent_wr_lock);
 	if (!STAILQ_EMPTY(&td->unsent_wr_list))
 		error = EBUSY;
 	mtx_unlock(&td->unsent_wr_lock);
 
 	if (error != 0)
 		return (error);
 
 	unregister_toedev(sc->tom_softc);
 	free_tom_data(sc, td);
 	sc->tom_softc = NULL;
 
 	return (0);
 }
 
 /*
  * ifaddr_event handler.  Bumps the global in6_ifaddr_gen generation counter
  * and schedules clip_task on taskqueue_thread.  Neither argument is used.
  */
 static void
 t4_tom_ifaddr_event(void *arg __unused, struct ifnet *ifp)
 {
 
 	atomic_add_rel_int(&in6_ifaddr_gen, 1);
 	/*
 	 * NOTE(review): the tick count is negative (-hz / 4) on purpose or by
 	 * accident?  Confirm against taskqueue(9) how negative values are
 	 * treated by taskqueue_enqueue_timeout().
 	 */
 	taskqueue_enqueue_timeout(taskqueue_thread, &clip_task, -hz / 4);
 }
 
 static int
 t4_aio_queue_tom(struct socket *so, struct kaiocb *job)
 {
 	struct tcpcb *tp = so_sototcpcb(so);
 	struct toepcb *toep = tp->t_toe;
 	int error;
 
 	if (toep->ulp_mode == ULP_MODE_TCPDDP) {
 		error = t4_aio_queue_ddp(so, job);
 		if (error != EOPNOTSUPP)
 			return (error);
 	}
 
 	return (t4_aio_queue_aiotx(so, job));
 }
 
 /*
  * Module load: register the CPL handlers, load the DDP support, build the
  * TOE copies of the TCP/TCP6 protosw and usrreqs (with pru_aio_queue
  * overridden), hook the ifaddr event and register the TOM ULD.
  *
  * Returns 0 on success or an errno value.  Every failure path undoes what
  * was set up before it so that nothing is left registered when the load
  * fails.
  */
 static int
 t4_tom_mod_load(void)
 {
 	int rc;
 	struct protosw *tcp_protosw, *tcp6_protosw;
 
 	/* CPL handlers */
 	t4_init_connect_cpl_handlers();
 	t4_init_listen_cpl_handlers();
 	t4_init_cpl_io_handlers();
 
 	rc = t4_ddp_mod_load();
 	if (rc != 0)
 		goto fail_cpl;
 
 	tcp_protosw = pffindproto(PF_INET, IPPROTO_TCP, SOCK_STREAM);
 	if (tcp_protosw == NULL) {
 		rc = ENOPROTOOPT;
 		goto fail_ddp;
 	}
 	bcopy(tcp_protosw, &toe_protosw, sizeof(toe_protosw));
 	bcopy(tcp_protosw->pr_usrreqs, &toe_usrreqs, sizeof(toe_usrreqs));
 	toe_usrreqs.pru_aio_queue = t4_aio_queue_tom;
 	toe_protosw.pr_usrreqs = &toe_usrreqs;
 
 	tcp6_protosw = pffindproto(PF_INET6, IPPROTO_TCP, SOCK_STREAM);
 	if (tcp6_protosw == NULL) {
 		rc = ENOPROTOOPT;
 		goto fail_ddp;
 	}
 	bcopy(tcp6_protosw, &toe6_protosw, sizeof(toe6_protosw));
 	bcopy(tcp6_protosw->pr_usrreqs, &toe6_usrreqs, sizeof(toe6_usrreqs));
 	toe6_usrreqs.pru_aio_queue = t4_aio_queue_tom;
 	toe6_protosw.pr_usrreqs = &toe6_usrreqs;
 
 	TIMEOUT_TASK_INIT(taskqueue_thread, &clip_task, 0, t4_clip_task, NULL);
 	ifaddr_evhandler = EVENTHANDLER_REGISTER(ifaddr_event,
 	    t4_tom_ifaddr_event, NULL, EVENTHANDLER_PRI_ANY);
 
 	rc = t4_register_uld(&tom_uld_info);
 	if (rc != 0)
 		t4_tom_mod_unload();
 
 	return (rc);
 
 	/*
 	 * Early failures: the event handler and the ULD are not registered
 	 * yet, so only the DDP state and the CPL handlers need undoing.
 	 */
 fail_ddp:
 	t4_ddp_mod_unload();
 fail_cpl:
 	t4_uninit_connect_cpl_handlers();
 	t4_uninit_listen_cpl_handlers();
 	t4_uninit_cpl_io_handlers();
 
 	return (rc);
 }
 
 /*
  * t4_iterate() callback: deactivate the TOM ULD on one adapter if it is
  * currently active there.  Silently gives up if the synchronized operation
  * cannot be started.
  */
 static void
 tom_uninit(struct adapter *sc, void *arg __unused)
 {
 	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4tomun") != 0)
 		return;
 
 	/* Try to free resources (works only if no port has IFCAP_TOE) */
 	if (uld_active(sc, ULD_TOM)) {
 		t4_deactivate_uld(sc, ULD_TOM);
 	}
 
 	end_synchronized_op(sc, 0);
 }
 
 /*
  * Module unload.  Returns EBUSY (leaving everything else in place) if the
  * ULD cannot be unregistered, 0 once the event handler, DDP support and CPL
  * handlers have all been torn down.
  */
 static int
 t4_tom_mod_unload(void)
 {
 	/* Give every adapter a chance to deactivate the TOM first. */
 	t4_iterate(tom_uninit, NULL);
 
 	if (t4_unregister_uld(&tom_uld_info) == EBUSY)
 		return (EBUSY);
 
 	/* The CLIP task is only ever queued by the ifaddr event handler. */
 	if (ifaddr_evhandler != NULL) {
 		EVENTHANDLER_DEREGISTER(ifaddr_event, ifaddr_evhandler);
 		taskqueue_cancel_timeout(taskqueue_thread, &clip_task, NULL);
 	}
 
 	t4_ddp_mod_unload();
 
 	/* Same order as they are listed in t4_tom_mod_load(). */
 	t4_uninit_connect_cpl_handlers();
 	t4_uninit_listen_cpl_handlers();
 	t4_uninit_cpl_io_handlers();
 
 	return (0);
 }
 #endif	/* TCP_OFFLOAD */
 
 /*
  * Module event handler.  With TCP_OFFLOAD compiled in, MOD_LOAD and
  * MOD_UNLOAD are dispatched to their handlers and every other event gets
  * EINVAL.  Without it every event is rejected with EOPNOTSUPP.
  */
 static int
 t4_tom_modevent(module_t mod, int cmd, void *arg)
 {
 #ifdef TCP_OFFLOAD
 	if (cmd == MOD_LOAD)
 		return (t4_tom_mod_load());
 
 	if (cmd == MOD_UNLOAD)
 		return (t4_tom_mod_unload());
 
 	return (EINVAL);
 #else
 	printf("t4_tom: compiled without TCP_OFFLOAD support.\n");
 	return (EOPNOTSUPP);
 #endif
 }
 
 /* Module description: name, event handler, and no extra argument. */
 static moduledata_t t4_tom_moddata= {
 	"t4_tom",
 	t4_tom_modevent,
 	0
 };
 
 /* t4_tom is version 1 and needs version 1 of both toecore and t4nex. */
 MODULE_VERSION(t4_tom, 1);
 MODULE_DEPEND(t4_tom, toecore, 1, 1, 1);
 MODULE_DEPEND(t4_tom, t4nex, 1, 1, 1);
 DECLARE_MODULE(t4_tom, t4_tom_moddata, SI_SUB_EXEC, SI_ORDER_ANY);
Index: stable/11/sys/netinet/tcp_timer.c
===================================================================
--- stable/11/sys/netinet/tcp_timer.c	(revision 330302)
+++ stable/11/sys/netinet/tcp_timer.c	(revision 330303)
@@ -1,1006 +1,1008 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)tcp_timer.c	8.2 (Berkeley) 5/24/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_tcpdebug.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/protosw.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
 #include <net/if.h>
 #include <net/route.h>
 #include <net/rss_config.h>
 #include <net/vnet.h>
 #include <net/netisr.h>
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_rss.h>
 #include <netinet/in_systm.h>
 #ifdef INET6
 #include <netinet6/in6_pcb.h>
 #endif
 #include <netinet/ip_var.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/cc/cc.h>
 #ifdef INET6
 #include <netinet6/tcp6_var.h>
 #endif
 #include <netinet/tcpip.h>
 #ifdef TCPDEBUG
 #include <netinet/tcp_debug.h>
 #endif
 
 /*
  * Global TCP timer tunables.  Each one is published under net.inet.tcp as
  * a read/write integer through the sysctl_msec_to_ticks handler.
  * NOTE(review): the handler name suggests userland sees milliseconds while
  * the variables hold ticks -- confirm against the handler's definition.
  */
 int    tcp_persmin;
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin, CTLTYPE_INT|CTLFLAG_RW,
     &tcp_persmin, 0, sysctl_msec_to_ticks, "I", "minimum persistence interval");
 
 int    tcp_persmax;
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax, CTLTYPE_INT|CTLFLAG_RW,
     &tcp_persmax, 0, sysctl_msec_to_ticks, "I", "maximum persistence interval");
 
 int	tcp_keepinit;
 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW,
     &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection");
 
 int	tcp_keepidle;
 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW,
     &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "time before keepalive probes begin");
 
 int	tcp_keepintvl;
 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW,
     &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "time between keepalive probes");
 
 int	tcp_delacktime;
 SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, CTLTYPE_INT|CTLFLAG_RW,
     &tcp_delacktime, 0, sysctl_msec_to_ticks, "I",
     "Time before a delayed ACK is sent");
 
 int	tcp_msl;
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW,
     &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime");
 
 int	tcp_rexmit_min;
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW,
     &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I",
     "Minimum Retransmission Timeout");
 
 int	tcp_rexmit_slop;
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW,
     &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I",
     "Retransmission Timer Slop");
 
 /*
  * When non-zero the keepalive timer probes every connection, not only the
  * ones whose socket has SO_KEEPALIVE set (see tcp_timer_keep()).
  */
-static int	always_keepalive = 1;
+int	tcp_always_keepalive = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW,
-    &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections");
+    &tcp_always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections");
+__strong_reference(tcp_always_keepalive, always_keepalive);
 
 /* When non-zero, tcp_timer_2msl() closes eligible FIN_WAIT_2 connections. */
 int    tcp_fast_finwait2_recycle = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW, 
     &tcp_fast_finwait2_recycle, 0,
     "Recycle closed FIN_WAIT_2 connections faster");
 
 int    tcp_finwait2_timeout;
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW,
     &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout");
 
 int	tcp_keepcnt = TCPTV_KEEPCNT;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0,
     "Number of keepalive probes to send");
 
 	/* max idle probes */
 int	tcp_maxpersistidle;
 
 static int	tcp_rexmit_drop_options = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW,
     &tcp_rexmit_drop_options, 0,
     "Drop TCP options from 3rd and later retransmitted SYN");
 
 /*
  * Path MTU black hole detection state, all of it per-vnet (VNET_DEFINE /
  * CTLFLAG_VNET).  The enable knob and the two lowered-MSS values are
  * read/write; the activation and failure counters are read-only.
  */
 static VNET_DEFINE(int, tcp_pmtud_blackhole_detect);
 #define	V_tcp_pmtud_blackhole_detect	VNET(tcp_pmtud_blackhole_detect)
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection,
     CTLFLAG_RW|CTLFLAG_VNET,
     &VNET_NAME(tcp_pmtud_blackhole_detect), 0,
     "Path MTU Discovery Black Hole Detection Enabled");
 
 static VNET_DEFINE(int, tcp_pmtud_blackhole_activated);
 #define	V_tcp_pmtud_blackhole_activated \
     VNET(tcp_pmtud_blackhole_activated)
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated,
     CTLFLAG_RD|CTLFLAG_VNET,
     &VNET_NAME(tcp_pmtud_blackhole_activated), 0,
     "Path MTU Discovery Black Hole Detection, Activation Count");
 
 static VNET_DEFINE(int, tcp_pmtud_blackhole_activated_min_mss);
 #define	V_tcp_pmtud_blackhole_activated_min_mss \
     VNET(tcp_pmtud_blackhole_activated_min_mss)
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated_min_mss,
     CTLFLAG_RD|CTLFLAG_VNET,
     &VNET_NAME(tcp_pmtud_blackhole_activated_min_mss), 0,
     "Path MTU Discovery Black Hole Detection, Activation Count at min MSS");
 
 static VNET_DEFINE(int, tcp_pmtud_blackhole_failed);
 #define	V_tcp_pmtud_blackhole_failed	VNET(tcp_pmtud_blackhole_failed)
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_failed,
     CTLFLAG_RD|CTLFLAG_VNET,
     &VNET_NAME(tcp_pmtud_blackhole_failed), 0,
     "Path MTU Discovery Black Hole Detection, Failure Count");
 
 /* MSS to fall back to on IPv4 when a black hole is suspected. */
 #ifdef INET
 static VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200;
 #define	V_tcp_pmtud_blackhole_mss	VNET(tcp_pmtud_blackhole_mss)
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss,
     CTLFLAG_RW|CTLFLAG_VNET,
     &VNET_NAME(tcp_pmtud_blackhole_mss), 0,
     "Path MTU Discovery Black Hole Detection lowered MSS");
 #endif
 
 /* MSS to fall back to on IPv6 when a black hole is suspected. */
 #ifdef INET6
 static VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220;
 #define	V_tcp_v6pmtud_blackhole_mss	VNET(tcp_v6pmtud_blackhole_mss)
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss,
     CTLFLAG_RW|CTLFLAG_VNET,
     &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0,
     "Path MTU Discovery IPv6 Black Hole Detection lowered MSS");
 #endif
 
 /*
  * Whether inp_to_cpuid() spreads connections over CPUs (non-zero) or always
  * answers CPU 0 (zero).  Defaults to on only when the kernel is built with
  * RSS; adjustable at run time via net.inet.tcp.per_cpu_timers.
  */
 #ifdef	RSS
 static int	per_cpu_timers = 1;
 #else
 static int	per_cpu_timers = 0;
 #endif
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW,
     &per_cpu_timers , 0, "run tcp timers on all cpus");
 
 /* Disabled older form of the mapping now done by inp_to_cpuid(). */
 #if 0
 #define	INP_CPU(inp)	(per_cpu_timers ? (!CPU_ABSENT(((inp)->inp_flowid % (mp_maxid+1))) ? \
 		((inp)->inp_flowid % (mp_maxid+1)) : curcpu) : 0)
 #endif
 
 /*
  * Map the given inp to a CPU id.
  *
  * With per_cpu_timers disabled the answer is always CPU 0.  Otherwise an
  * RSS kernel asks rss_hash2cpuid() about the inp's flowid/flowtype, and a
  * non-RSS kernel uses inp_flowid modulo (mp_maxid + 1); in both cases the
  * current CPU is the fallback when no usable CPU id comes out of that.
  */
 static inline int
 inp_to_cpuid(struct inpcb *inp)
 {
 	u_int cpuid;
 
 #ifdef	RSS
 	if (per_cpu_timers) {
 		cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
 		if (cpuid == NETISR_CPUID_NONE)
 			return (curcpu);	/* XXX */
 		else
 			return (cpuid);
 	}
 #else
 	/* Legacy, pre-RSS behaviour */
 	if (per_cpu_timers) {
 		/*
 		 * We don't have a flowid -> cpuid mapping, so cheat and
 		 * just map unknown cpuids to curcpu.  Not the best, but
 		 * apparently better than defaulting to swi 0.
 		 */
 		cpuid = inp->inp_flowid % (mp_maxid + 1);
 		if (! CPU_ABSENT(cpuid))
 			return (cpuid);
 		return (curcpu);
 	}
 #endif
 	/*
 	 * Default for RSS and non-RSS - cpuid 0.  This else pairs with
 	 * whichever "if (per_cpu_timers)" survived the preprocessor above.
 	 */
 	else {
 		return (0);
 	}
 }
 
 /*
  * Tcp protocol slow timeout routine.
  * Walks every vnet under the vnet list read lock and, with that vnet
  * made current, runs tcp_tw_2msl_scan(0); the scan's result is ignored.
  * NOTE(review): the historical comment here said this runs every 500 ms;
  * the interval is set by the caller and is not visible in this file chunk.
  */
 void
 tcp_slowtimo(void)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		(void) tcp_tw_2msl_scan(0);
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 }
 
 /*
  * Retransmit backoff multipliers, indexed by tp->t_rxtshift in
  * tcp_timer_rexmt().  tcp_syn_backoff[] is used in SYN_SENT and
  * SYN_RECEIVED, tcp_backoff[] in every other state.
  */
 int	tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
     { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };
 
 int	tcp_backoff[TCP_MAXRXTSHIFT + 1] =
     { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 };
 
 /* Must be kept in step with the tcp_backoff[] initializer above. */
 static int tcp_totbackoff = 2559;	/* sum of tcp_backoff[] */
 
 /*
  * TCP timer processing.
  */
 
 /*
  * Callout handler for the delayed-ACK timer (tt_delack); xtp is the tcpcb.
  * Unless the callout is pending again or no longer active, or the inpcb has
  * been dropped, flags the connection TF_ACKNOW, counts tcps_delack and runs
  * the stack's output routine.  Takes and releases the inpcb write lock.
  */
 void
 tcp_timer_delack(void *xtp)
 {
 	struct tcpcb *tp = xtp;
 	struct inpcb *inp;
 	CURVNET_SET(tp->t_vnet);
 
 	inp = tp->t_inpcb;
 	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
 	INP_WLOCK(inp);
 	/* Pending again or not active: this invocation has nothing to do. */
 	if (callout_pending(&tp->t_timers->tt_delack) ||
 	    !callout_active(&tp->t_timers->tt_delack)) {
 		INP_WUNLOCK(inp);
 		CURVNET_RESTORE();
 		return;
 	}
 	callout_deactivate(&tp->t_timers->tt_delack);
 	/* Connection already dropped; do not touch it further. */
 	if ((inp->inp_flags & INP_DROPPED) != 0) {
 		INP_WUNLOCK(inp);
 		CURVNET_RESTORE();
 		return;
 	}
 	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
 		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
 	KASSERT((tp->t_timers->tt_flags & TT_DELACK) != 0,
 		("%s: tp %p delack callout should be running", __func__, tp));
 
 	/* Request an immediate ACK and let the output routine send it. */
 	tp->t_flags |= TF_ACKNOW;
 	TCPSTAT_INC(tcps_delack);
 	(void) tp->t_fb->tfb_tcp_output(tp);
 	INP_WUNLOCK(inp);
 	CURVNET_RESTORE();
 }
 
 /*
  * Callout handler for the 2MSL timer (tt_2msl); xtp is the tcpcb.
  * Runs with the tcbinfo read lock and the inpcb write lock held.  Depending
  * on state the connection is closed, the timer is re-armed for another
  * TP_KEEPINTVL, or (TIME_WAIT, dropped, stale callout) nothing is done.
  * After tcp_close() returns NULL the inpcb lock is not released here.
  */
 void
 tcp_timer_2msl(void *xtp)
 {
 	struct tcpcb *tp = xtp;
 	struct inpcb *inp;
 	CURVNET_SET(tp->t_vnet);
 #ifdef TCPDEBUG
 	int ostate;
 
 	ostate = tp->t_state;
 #endif
 	INP_INFO_RLOCK(&V_tcbinfo);
 	inp = tp->t_inpcb;
 	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
 	INP_WLOCK(inp);
 	/* Note: SACK holes are freed even if the callout turns out stale. */
 	tcp_free_sackholes(tp);
 	/* Pending again or not active: this invocation has nothing to do. */
 	if (callout_pending(&tp->t_timers->tt_2msl) ||
 	    !callout_active(&tp->t_timers->tt_2msl)) {
 		INP_WUNLOCK(tp->t_inpcb);
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 		return;
 	}
 	callout_deactivate(&tp->t_timers->tt_2msl);
 	if ((inp->inp_flags & INP_DROPPED) != 0) {
 		INP_WUNLOCK(inp);
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 		return;
 	}
 	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
 		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
 	KASSERT((tp->t_timers->tt_flags & TT_2MSL) != 0,
 		("%s: tp %p 2msl callout should be running", __func__, tp));
 	/*
 	 * 2 MSL timeout in shutdown went off.  If we're closed but
 	 * still waiting for peer to close and connection has been idle
 	 * too long delete connection control block.  Otherwise, check
 	 * again in a bit.
 	 *
 	 * If in TIME_WAIT state just ignore as this timeout is handled in
 	 * tcp_tw_2msl_scan().
 	 *
 	 * If fastrecycle of FIN_WAIT_2, in FIN_WAIT_2 and receiver has closed, 
 	 * there's no point in hanging onto FIN_WAIT_2 socket. Just close it. 
 	 * Ignore fact that there were recent incoming segments.
 	 */
 	if ((inp->inp_flags & INP_TIMEWAIT) != 0) {
 		INP_WUNLOCK(inp);
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 		return;
 	}
 	if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 &&
 	    tp->t_inpcb && tp->t_inpcb->inp_socket && 
 	    (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) {
 		TCPSTAT_INC(tcps_finwait2_drops);
 		tp = tcp_close(tp);             
 	} else {
 		/* Still within the idle limit: look again one interval on. */
 		if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) {
 			if (!callout_reset(&tp->t_timers->tt_2msl,
 			   TP_KEEPINTVL(tp), tcp_timer_2msl, tp)) {
 				tp->t_timers->tt_flags &= ~TT_2MSL_RST;
 			}
 		} else
 		       tp = tcp_close(tp);
        }
 
 #ifdef TCPDEBUG
 	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
 		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
 			  PRU_SLOWTIMO);
 #endif
 	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
 
 	/* tp is NULL here if tcp_close() above returned NULL. */
 	if (tp != NULL)
 		INP_WUNLOCK(inp);
 	INP_INFO_RUNLOCK(&V_tcbinfo);
 	CURVNET_RESTORE();
 }
 
 /*
  * Callout handler for the keepalive timer (tt_keep); xtp is the tcpcb.
  * Runs with the tcbinfo read lock and the inpcb write lock held.  A
  * connection that is not yet established, or that has been silent for
  * TP_KEEPIDLE + TP_MAXIDLE while keepalives apply, is dropped with
  * ETIMEDOUT.  Otherwise a probe is sent (when keepalives apply and the state
  * is at most CLOSING) and the timer is re-armed.
  */
 void
 tcp_timer_keep(void *xtp)
 {
 	struct tcpcb *tp = xtp;
 	struct tcptemp *t_template;
 	struct inpcb *inp;
 	CURVNET_SET(tp->t_vnet);
 #ifdef TCPDEBUG
 	int ostate;
 
 	ostate = tp->t_state;
 #endif
 	INP_INFO_RLOCK(&V_tcbinfo);
 	inp = tp->t_inpcb;
 	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
 	INP_WLOCK(inp);
 	/* Pending again or not active: this invocation has nothing to do. */
 	if (callout_pending(&tp->t_timers->tt_keep) ||
 	    !callout_active(&tp->t_timers->tt_keep)) {
 		INP_WUNLOCK(inp);
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 		return;
 	}
 	callout_deactivate(&tp->t_timers->tt_keep);
 	if ((inp->inp_flags & INP_DROPPED) != 0) {
 		INP_WUNLOCK(inp);
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 		return;
 	}
 	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
 		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
 	KASSERT((tp->t_timers->tt_flags & TT_KEEP) != 0,
 		("%s: tp %p keep callout should be running", __func__, tp));
 	/*
 	 * Keep-alive timer went off; send something
 	 * or drop connection if idle for too long.
 	 */
 	TCPSTAT_INC(tcps_keeptimeo);
 	/* Connection never got established in time. */
 	if (tp->t_state < TCPS_ESTABLISHED)
 		goto dropit;
-	if ((always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
+	if ((tcp_always_keepalive ||
+	    inp->inp_socket->so_options & SO_KEEPALIVE) &&
 	    tp->t_state <= TCPS_CLOSING) {
 		if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
 			goto dropit;
 		/*
 		 * Send a packet designed to force a response
 		 * if the peer is up and reachable:
 		 * either an ACK if the connection is still alive,
 		 * or an RST if the peer has closed the connection
 		 * due to timeout or reboot.
 		 * Using sequence number tp->snd_una-1
 		 * causes the transmitted zero-length segment
 		 * to lie outside the receive window;
 		 * by the protocol spec, this requires the
 		 * correspondent TCP to respond.
 		 */
 		TCPSTAT_INC(tcps_keepprobe);
 		t_template = tcpip_maketemplate(inp);
 		/* No template means no probe this round; still re-arm below. */
 		if (t_template) {
 			tcp_respond(tp, t_template->tt_ipgen,
 				    &t_template->tt_t, (struct mbuf *)NULL,
 				    tp->rcv_nxt, tp->snd_una - 1, 0);
 			free(t_template, M_TEMP);
 		}
 		if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp),
 		    tcp_timer_keep, tp)) {
 			tp->t_timers->tt_flags &= ~TT_KEEP_RST;
 		}
 	} else if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp),
 		    tcp_timer_keep, tp)) {
 			tp->t_timers->tt_flags &= ~TT_KEEP_RST;
 		}
 
 #ifdef TCPDEBUG
 	if (inp->inp_socket->so_options & SO_DEBUG)
 		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
 			  PRU_SLOWTIMO);
 #endif
 	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
 	INP_WUNLOCK(inp);
 	INP_INFO_RUNLOCK(&V_tcbinfo);
 	CURVNET_RESTORE();
 	return;
 
 dropit:
 	TCPSTAT_INC(tcps_keepdrops);
 	tp = tcp_drop(tp, ETIMEDOUT);
 
 #ifdef TCPDEBUG
 	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
 		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
 			  PRU_SLOWTIMO);
 #endif
 	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
 	/* The inpcb lock is only released here if tcp_drop() returned a tp. */
 	if (tp != NULL)
 		INP_WUNLOCK(tp->t_inpcb);
 	INP_INFO_RUNLOCK(&V_tcbinfo);
 	CURVNET_RESTORE();
 }
 
 /*
  * Callout handler for the persist timer (tt_persist); xtp is the tcpcb.
  * Runs with the tcbinfo read lock and the inpcb write lock held.  Either
  * drops the connection with ETIMEDOUT (idle too long at full backoff, or
  * idle for TCPTV_PERSMAX once past CLOSE_WAIT) or recomputes the persist
  * timer and forces output with TF_FORCEDATA set.
  */
 void
 tcp_timer_persist(void *xtp)
 {
 	struct tcpcb *tp = xtp;
 	struct inpcb *inp;
 	CURVNET_SET(tp->t_vnet);
 #ifdef TCPDEBUG
 	int ostate;
 
 	ostate = tp->t_state;
 #endif
 	INP_INFO_RLOCK(&V_tcbinfo);
 	inp = tp->t_inpcb;
 	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
 	INP_WLOCK(inp);
 	/* Pending again or not active: this invocation has nothing to do. */
 	if (callout_pending(&tp->t_timers->tt_persist) ||
 	    !callout_active(&tp->t_timers->tt_persist)) {
 		INP_WUNLOCK(inp);
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 		return;
 	}
 	callout_deactivate(&tp->t_timers->tt_persist);
 	if ((inp->inp_flags & INP_DROPPED) != 0) {
 		INP_WUNLOCK(inp);
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 		return;
 	}
 	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
 		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
 	KASSERT((tp->t_timers->tt_flags & TT_PERSIST) != 0,
 		("%s: tp %p persist callout should be running", __func__, tp));
 	/*
 	 * Persistence timer into zero window.
 	 * Force a byte to be output, if possible.
 	 */
 	TCPSTAT_INC(tcps_persisttimeo);
 	/*
 	 * Hack: if the peer is dead/unreachable, we do not
 	 * time out if the window is closed.  After a full
 	 * backoff, drop the connection if the idle time
 	 * (no responses to probes) reaches the maximum
 	 * backoff that we would use if retransmitting.
 	 */
 	if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
 	    (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
 	     ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
 		TCPSTAT_INC(tcps_persistdrop);
 		tp = tcp_drop(tp, ETIMEDOUT);
 		goto out;
 	}
 	/*
 	 * If the user has closed the socket then drop a persisting
 	 * connection after a much reduced timeout.
 	 */
 	if (tp->t_state > TCPS_CLOSE_WAIT &&
 	    (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
 		TCPSTAT_INC(tcps_persistdrop);
 		tp = tcp_drop(tp, ETIMEDOUT);
 		goto out;
 	}
 	tcp_setpersist(tp);
 	/* TF_FORCEDATA is set only for the duration of this output call. */
 	tp->t_flags |= TF_FORCEDATA;
 	(void) tp->t_fb->tfb_tcp_output(tp);
 	tp->t_flags &= ~TF_FORCEDATA;
 
 out:
 #ifdef TCPDEBUG
 	if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
 		tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO);
 #endif
 	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
 	/* tp is NULL here if tcp_drop() above returned NULL. */
 	if (tp != NULL)
 		INP_WUNLOCK(inp);
 	INP_INFO_RUNLOCK(&V_tcbinfo);
 	CURVNET_RESTORE();
 }
 
 void
 tcp_timer_rexmt(void * xtp)
 {
 	struct tcpcb *tp = xtp;
 	CURVNET_SET(tp->t_vnet);
 	int rexmt;
 	int headlocked;
 	struct inpcb *inp;
 #ifdef TCPDEBUG
 	int ostate;
 
 	ostate = tp->t_state;
 #endif
 
 	INP_INFO_RLOCK(&V_tcbinfo);
 	inp = tp->t_inpcb;
 	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
 	INP_WLOCK(inp);
 	if (callout_pending(&tp->t_timers->tt_rexmt) ||
 	    !callout_active(&tp->t_timers->tt_rexmt)) {
 		INP_WUNLOCK(inp);
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 		return;
 	}
 	callout_deactivate(&tp->t_timers->tt_rexmt);
 	if ((inp->inp_flags & INP_DROPPED) != 0) {
 		INP_WUNLOCK(inp);
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 		return;
 	}
 	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
 		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
 	KASSERT((tp->t_timers->tt_flags & TT_REXMT) != 0,
 		("%s: tp %p rexmt callout should be running", __func__, tp));
 	tcp_free_sackholes(tp);
 	if (tp->t_fb->tfb_tcp_rexmit_tmr) {
 		/* The stack has a timer action too. */
 		(*tp->t_fb->tfb_tcp_rexmit_tmr)(tp);
 	}
 	/*
 	 * Retransmission timer went off.  Message has not
 	 * been acked within retransmit interval.  Back off
 	 * to a longer retransmit interval and retransmit one segment.
 	 */
 	if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
 		tp->t_rxtshift = TCP_MAXRXTSHIFT;
 		TCPSTAT_INC(tcps_timeoutdrop);
 
 		tp = tcp_drop(tp, tp->t_softerror ?
 			      tp->t_softerror : ETIMEDOUT);
 		headlocked = 1;
 		goto out;
 	}
 	INP_INFO_RUNLOCK(&V_tcbinfo);
 	headlocked = 0;
 	if (tp->t_state == TCPS_SYN_SENT) {
 		/*
 		 * If the SYN was retransmitted, indicate CWND to be
 		 * limited to 1 segment in cc_conn_init().
 		 */
 		tp->snd_cwnd = 1;
 	} else if (tp->t_rxtshift == 1) {
 		/*
 		 * first retransmit; record ssthresh and cwnd so they can
 		 * be recovered if this turns out to be a "bad" retransmit.
 		 * A retransmit is considered "bad" if an ACK for this
 		 * segment is received within RTT/2 interval; the assumption
 		 * here is that the ACK was already in flight.  See
 		 * "On Estimating End-to-End Network Path Properties" by
 		 * Allman and Paxson for more details.
 		 */
 		tp->snd_cwnd_prev = tp->snd_cwnd;
 		tp->snd_ssthresh_prev = tp->snd_ssthresh;
 		tp->snd_recover_prev = tp->snd_recover;
 		if (IN_FASTRECOVERY(tp->t_flags))
 			tp->t_flags |= TF_WASFRECOVERY;
 		else
 			tp->t_flags &= ~TF_WASFRECOVERY;
 		if (IN_CONGRECOVERY(tp->t_flags))
 			tp->t_flags |= TF_WASCRECOVERY;
 		else
 			tp->t_flags &= ~TF_WASCRECOVERY;
 		tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
 		tp->t_flags |= TF_PREVVALID;
 	} else
 		tp->t_flags &= ~TF_PREVVALID;
 	TCPSTAT_INC(tcps_rexmttimeo);
 	if ((tp->t_state == TCPS_SYN_SENT) ||
 	    (tp->t_state == TCPS_SYN_RECEIVED))
 		rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift];
 	else
 		rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
 	TCPT_RANGESET(tp->t_rxtcur, rexmt,
 		      tp->t_rttmin, TCPTV_REXMTMAX);
 
 	/*
 	 * We enter the path for PLMTUD if connection is established or, if
 	 * connection is FIN_WAIT_1 status, reason for the last is that if
 	 * amount of data we send is very small, we could send it in couple of
 	 * packets and process straight to FIN. In that case we won't catch
 	 * ESTABLISHED state.
 	 */
 	if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED))
 	    || (tp->t_state == TCPS_FIN_WAIT_1))) {
 #ifdef INET6
 		int isipv6;
 #endif
 
 		/*
 		 * Idea here is that at each stage of mtu probe (usually, 1448
 		 * -> 1188 -> 524) should be given 2 chances to recover before
 		 *  further clamping down. 'tp->t_rxtshift % 2 == 0' should
 		 *  take care of that.
 		 */
 		if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) ==
 		    (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) &&
 		    (tp->t_rxtshift >= 2 && tp->t_rxtshift % 2 == 0)) {
 			/*
 			 * Enter Path MTU Black-hole Detection mechanism:
 			 * - Disable Path MTU Discovery (IP "DF" bit).
 			 * - Reduce MTU to lower value than what we
 			 *   negotiated with peer.
 			 */
 			/* Record that we may have found a black hole. */
 			tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
 
 			/* Keep track of previous MSS. */
 			tp->t_pmtud_saved_maxseg = tp->t_maxseg;
 
 			/* 
 			 * Reduce the MSS to blackhole value or to the default
 			 * in an attempt to retransmit.
 			 */
 #ifdef INET6
 			isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0;
 			if (isipv6 &&
 			    tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) {
 				/* Use the sysctl tuneable blackhole MSS. */
 				tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
 				V_tcp_pmtud_blackhole_activated++;
 			} else if (isipv6) {
 				/* Use the default MSS. */
 				tp->t_maxseg = V_tcp_v6mssdflt;
 				/*
 				 * Disable Path MTU Discovery when we switch to
 				 * minmss.
 				 */
 				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
 				V_tcp_pmtud_blackhole_activated_min_mss++;
 			}
 #endif
 #if defined(INET6) && defined(INET)
 			else
 #endif
 #ifdef INET
 			if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) {
 				/* Use the sysctl tuneable blackhole MSS. */
 				tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
 				V_tcp_pmtud_blackhole_activated++;
 			} else {
 				/* Use the default MSS. */
 				tp->t_maxseg = V_tcp_mssdflt;
 				/*
 				 * Disable Path MTU Discovery when we switch to
 				 * minmss.
 				 */
 				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
 				V_tcp_pmtud_blackhole_activated_min_mss++;
 			}
 #endif
 			/*
 			 * Reset the slow-start flight size
 			 * as it may depend on the new MSS.
 			 */
 			if (CC_ALGO(tp)->conn_init != NULL)
 				CC_ALGO(tp)->conn_init(tp->ccv);
 		} else {
 			/*
 			 * If further retransmissions are still unsuccessful
 			 * with a lowered MTU, maybe this isn't a blackhole and
 			 * we restore the previous MSS and blackhole detection
 			 * flags.
 			 * The limit '6' is determined by giving each probe
 			 * stage (1448, 1188, 524) 2 chances to recover.
 			 */
 			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
 			    (tp->t_rxtshift > 6)) {
 				tp->t_flags2 |= TF2_PLPMTU_PMTUD;
 				tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
 				tp->t_maxseg = tp->t_pmtud_saved_maxseg;
 				V_tcp_pmtud_blackhole_failed++;
 				/*
 				 * Reset the slow-start flight size as it
 				 * may depend on the new MSS.
 				 */
 				if (CC_ALGO(tp)->conn_init != NULL)
 					CC_ALGO(tp)->conn_init(tp->ccv);
 			}
 		}
 	}
 
 	/*
 	 * Disable RFC1323 and SACK if we haven't got any response to
 	 * our third SYN to work-around some broken terminal servers
 	 * (most of which have hopefully been retired) that have bad VJ
 	 * header compression code which trashes TCP segments containing
 	 * unknown-to-them TCP options.
 	 */
 	if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
 	    (tp->t_rxtshift == 3))
 		tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT);
 	/*
 	 * If we backed off this far, our srtt estimate is probably bogus.
 	 * Clobber it so we'll take the next rtt measurement as our srtt;
 	 * move the current srtt into rttvar to keep the current
 	 * retransmit times until then.
 	 */
 	if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
 #ifdef INET6
 		if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
 			in6_losing(tp->t_inpcb);
 		else
 #endif
 			in_losing(tp->t_inpcb);
 		tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
 		tp->t_srtt = 0;
 	}
 	tp->snd_nxt = tp->snd_una;
 	tp->snd_recover = tp->snd_max;
 	/*
 	 * Force a segment to be sent.
 	 */
 	tp->t_flags |= TF_ACKNOW;
 	/*
 	 * If timing a segment in this window, stop the timer.
 	 */
 	tp->t_rtttime = 0;
 
 	cc_cong_signal(tp, NULL, CC_RTO);
 
 	(void) tp->t_fb->tfb_tcp_output(tp);
 
 out:
 #ifdef TCPDEBUG
 	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
 		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
 			  PRU_SLOWTIMO);
 #endif
 	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
 	if (tp != NULL)
 		INP_WUNLOCK(inp);
 	if (headlocked)
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 	CURVNET_RESTORE();
 }
 
 /*
  * Arm, re-arm or cancel one of a connection's TCP timers.
  *
  * tp:         connection whose timer is being changed.
  * timer_type: one of the TT_* timer identifiers.  Any other value is
  *             forwarded to the stack's tfb_tcp_timer_activate hook when
  *             one is installed; without a hook the function panics.
  * delta:      timeout handed to the callout (presumably in ticks --
  *             confirm against callers); zero requests a stop.
  *
  * Nothing is done for offloaded (TF_TOE) connections, nor once
  * TT_STOPPED has been set in the connection's timer flags.
  */
 void
 tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta)
 {
 	struct inpcb *inp = tp->t_inpcb;
 	int cpu = inp_to_cpuid(inp);
 	struct callout *co;
 	timeout_t *handler;
 	uint32_t rst_flag;
 
 #ifdef TCP_OFFLOAD
 	if (tp->t_flags & TF_TOE)
 		return;
 #endif
 
 	if (tp->t_timers->tt_flags & TT_STOPPED)
 		return;
 
 	/* Map the timer identifier to its callout, handler and reset bit. */
 	switch (timer_type) {
 	case TT_DELACK:
 		co = &tp->t_timers->tt_delack;
 		handler = tcp_timer_delack;
 		rst_flag = TT_DELACK_RST;
 		break;
 	case TT_REXMT:
 		co = &tp->t_timers->tt_rexmt;
 		handler = tcp_timer_rexmt;
 		rst_flag = TT_REXMT_RST;
 		break;
 	case TT_PERSIST:
 		co = &tp->t_timers->tt_persist;
 		handler = tcp_timer_persist;
 		rst_flag = TT_PERSIST_RST;
 		break;
 	case TT_KEEP:
 		co = &tp->t_timers->tt_keep;
 		handler = tcp_timer_keep;
 		rst_flag = TT_KEEP_RST;
 		break;
 	case TT_2MSL:
 		co = &tp->t_timers->tt_2msl;
 		handler = tcp_timer_2msl;
 		rst_flag = TT_2MSL_RST;
 		break;
 	default:
 		if (tp->t_fb->tfb_tcp_timer_activate) {
 			tp->t_fb->tfb_tcp_timer_activate(tp, timer_type, delta);
 			return;
 		}
 		panic("tp %p bad timer_type %#x", tp, timer_type);
 	}
 
 	if (delta == 0) {
 		/*
 		 * Stop request.  The callout is only touched when the timer
 		 * is flagged as armed, and the bookkeeping bits are only
 		 * cleared when callout_stop() returned a positive value and
 		 * the reset bit is still set.
 		 */
 		if ((tp->t_timers->tt_flags & timer_type) == 0)
 			return;
 		if (callout_stop(co) <= 0)
 			return;
 		if (tp->t_timers->tt_flags & rst_flag)
 			tp->t_timers->tt_flags &= ~(timer_type | rst_flag);
 		return;
 	}
 
 	if (tp->t_timers->tt_flags & timer_type) {
 		/* Reset already running callout on the same CPU. */
 		if (callout_reset(co, delta, handler, tp) == 0) {
 			/*
 			 * Callout not cancelled, consider it as not
 			 * properly restarted.
 			 */
 			tp->t_timers->tt_flags &= ~rst_flag;
 		}
 		return;
 	}
 
 	/* First arming: flag the timer and schedule it on the inp's CPU. */
 	tp->t_timers->tt_flags |= (timer_type | rst_flag);
 	callout_reset_on(co, delta, handler, tp, cpu);
 }
 
 /*
  * Report whether one of a connection's TCP timers is active.
  *
  * For the five TT_* timers the result is callout_active() on the matching
  * callout.  Any other timer_type is delegated to the stack's
  * tfb_tcp_timer_active hook when one is installed; without a hook the
  * function panics.
  */
 int
 tcp_timer_active(struct tcpcb *tp, uint32_t timer_type)
 {
 
 	switch (timer_type) {
 	case TT_DELACK:
 		return (callout_active(&tp->t_timers->tt_delack));
 	case TT_REXMT:
 		return (callout_active(&tp->t_timers->tt_rexmt));
 	case TT_PERSIST:
 		return (callout_active(&tp->t_timers->tt_persist));
 	case TT_KEEP:
 		return (callout_active(&tp->t_timers->tt_keep));
 	case TT_2MSL:
 		return (callout_active(&tp->t_timers->tt_2msl));
 	default:
 		break;
 	}
 
 	/* Not one of the built-in timers: ask the stack, or give up. */
 	if (tp->t_fb->tfb_tcp_timer_active)
 		return (tp->t_fb->tfb_tcp_timer_active(tp, timer_type));
 	panic("tp %p bad timer_type %#x", tp, timer_type);
 }
 
 /*
  * Stop one of a connection's TCP timers for good.
  *
  * TT_STOPPED is set in the timer flags first, for every timer_type
  * (including those forwarded to the stack hook), which makes later
  * tcp_timer_activate() calls on this connection return early.
  *
  * tp:         connection whose timer is being stopped.
  * timer_type: one of the TT_* timer identifiers.  Any other value is
  *             forwarded to the stack's tfb_tcp_timer_stop hook when one
  *             is installed; without a hook the function panics.
  */
 void
 tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type)
 {
 	struct callout *t_callout;
 
 	tp->t_timers->tt_flags |= TT_STOPPED;
 
 	switch (timer_type) {
 	case TT_DELACK:
 		t_callout = &tp->t_timers->tt_delack;
 		break;
 	case TT_REXMT:
 		t_callout = &tp->t_timers->tt_rexmt;
 		break;
 	case TT_PERSIST:
 		t_callout = &tp->t_timers->tt_persist;
 		break;
 	case TT_KEEP:
 		t_callout = &tp->t_timers->tt_keep;
 		break;
 	case TT_2MSL:
 		t_callout = &tp->t_timers->tt_2msl;
 		break;
 	default:
 		if (tp->t_fb->tfb_tcp_timer_stop) {
 			/*
 			 * XXXrrs we need to look at this with the
 			 * stop case below (flags).
 			 */
 			tp->t_fb->tfb_tcp_timer_stop(tp, timer_type);
 			return;
 		}
 		panic("tp %p bad timer_type %#x", tp, timer_type);
 	}
 
 	/* Only timers flagged as armed have a callout worth draining. */
 	if (tp->t_timers->tt_flags & timer_type) {
 		if (callout_async_drain(t_callout, tcp_timer_discard) == 0) {
 			/*
 			 * Can't stop the callout, defer tcpcb actual deletion
 			 * to the last one.  We do this using the async drain
 			 * function (tcp_timer_discard is registered above)
 			 * and incrementing the count of outstanding drains
 			 * in tt_draincnt.
 			 */
 			tp->t_timers->tt_draincnt++;
 		}
 	}
 }
 
 /*
  * Convert a tick count to milliseconds using the tick rate hz.
  * NOTE(review): the multiplication by 1000 is done first, in the type of
  * (t), so a large tick count can overflow or wrap before the division --
  * confirm the range callers pass.
  */
 #define	ticks_to_msecs(t)	(1000*(t) / hz)
 
 /*
  * Fill *xtimer with a snapshot of a connection's timers.
  *
  * xtimer is zeroed first; when timer is NULL it is left all-zero.  For
  * each callout that callout_active() reports as active, the matching
  * xtimer field receives (c_time - current uptime) / SBT_1MS, presumably
  * the time left until expiry in milliseconds.  t_rcvtime is set to the
  * number of ticks since tp->t_rcvtime, converted with ticks_to_msecs().
  */
 #define	XTIMER_EXPORT(field) do {					\
 	if (callout_active(&timer->field))				\
 		xtimer->field = (timer->field.c_time - uptime) / SBT_1MS; \
 } while (0)
 
 void
 tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer,
     struct xtcp_timer *xtimer)
 {
 	sbintime_t uptime;
 
 	bzero(xtimer, sizeof(*xtimer));
 	if (timer == NULL)
 		return;
 
 	uptime = getsbinuptime();
 	XTIMER_EXPORT(tt_delack);
 	XTIMER_EXPORT(tt_rexmt);
 	XTIMER_EXPORT(tt_persist);
 	XTIMER_EXPORT(tt_keep);
 	XTIMER_EXPORT(tt_2msl);
 	xtimer->t_rcvtime = ticks_to_msecs(ticks - tp->t_rcvtime);
 }
 
 #undef XTIMER_EXPORT
Index: stable/11/sys/netinet/tcp_timer.h
===================================================================
--- stable/11/sys/netinet/tcp_timer.h	(revision 330302)
+++ stable/11/sys/netinet/tcp_timer.h	(revision 330303)
@@ -1,208 +1,209 @@
 /*-
  * Copyright (c) 1982, 1986, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)tcp_timer.h	8.1 (Berkeley) 6/10/93
  * $FreeBSD$
  */
 
 #ifndef _NETINET_TCP_TIMER_H_
 #define _NETINET_TCP_TIMER_H_
 
 /*
  * The TCPT_REXMT timer is used to force retransmissions.
  * The TCP has the TCPT_REXMT timer set whenever segments
  * have been sent for which ACKs are expected but not yet
  * received.  If an ACK is received which advances tp->snd_una,
  * then the retransmit timer is cleared (if there are no more
  * outstanding segments) or reset to the base value (if there
  * are more ACKs expected).  Whenever the retransmit timer goes off,
  * we retransmit one unacknowledged segment, and do a backoff
  * on the retransmit timer.
  *
  * The TCPT_PERSIST timer is used to keep window size information
  * flowing even if the window goes shut.  If all previous transmissions
  * have been acknowledged (so that there are no retransmissions in progress),
  * and the window is too small to bother sending anything, then we start
  * the TCPT_PERSIST timer.  When it expires, if the window is nonzero,
  * we go to transmit state.  Otherwise, at intervals send a single byte
  * into the peer's window to force him to update our window information.
  * We do this at most as often as TCPT_PERSMIN time intervals,
  * but no more frequently than the current estimate of round-trip
  * packet time.  The TCPT_PERSIST timer is cleared whenever we receive
  * a window update from the peer.
  *
  * The TCPT_KEEP timer is used to keep connections alive.  If a
  * connection is idle (no segments received) for TCPTV_KEEP_INIT amount of time,
  * but not yet established, then we drop the connection.  Once the connection
  * is established, if the connection is idle for TCPTV_KEEP_IDLE time
  * (and keepalives have been enabled on the socket), we begin to probe
  * the connection.  We force the peer to send us a segment by sending:
  *	<SEQ=SND.UNA-1><ACK=RCV.NXT><CTL=ACK>
  * This segment is (deliberately) outside the window, and should elicit
  * an ack segment in response from the peer.  If, despite the TCPT_KEEP
  * initiated segments we cannot elicit a response from a peer in TCPT_MAXIDLE
  * amount of time probing, then we drop the connection.
  */
 
 /*
  * Time constants.
  *
  * Values written as a multiple of hz are expressed in ticks (hz ticks
  * per second); TCPTV_KEEPCNT is a plain probe count, not a time.
  */
 #define	TCPTV_MSL	( 30*hz)		/* max seg lifetime (hah!) */
 #define	TCPTV_SRTTBASE	0			/* base roundtrip time;
 						   if 0, no idea yet */
 #define	TCPTV_RTOBASE	(  3*hz)		/* assumed RTO if no info */
 
 #define	TCPTV_PERSMIN	(  5*hz)		/* minimum persist interval */
 #define	TCPTV_PERSMAX	( 60*hz)		/* maximum persist interval */
 
 #define	TCPTV_KEEP_INIT	( 75*hz)		/* initial connect keepalive */
 #define	TCPTV_KEEP_IDLE	(120*60*hz)		/* dflt time before probing */
 #define	TCPTV_KEEPINTVL	( 75*hz)		/* default probe interval */
 #define	TCPTV_KEEPCNT	8			/* max probes before drop */
 
 #define TCPTV_FINWAIT2_TIMEOUT (60*hz)         /* FIN_WAIT_2 timeout if no receiver */
 
 /*
  * Minimum retransmit timer is 3 ticks, for algorithmic stability.
  * TCPT_RANGESET() will add another TCPTV_CPU_VAR to deal with
  * the expected worst-case processing variances by the kernels
  * representing the end points.  Such variances do not always show
  * up in the srtt because the timestamp is often calculated at
  * the interface rather than at the TCP layer.  This value is
  * typically 50ms.  However, it is also possible that delayed
  * acks (typically 100ms) could create issues so we set the slop
  * to 200ms to try to cover it.  Note that, properly speaking,
  * delayed-acks should not create a major issue for interactive
  * environments which 'P'ush the last segment, at least as
  * long as implementations do the required 'at least one ack
  * for every two packets' for the non-interactive streaming case.
  * (maybe the RTO calculation should use 2*RTT instead of RTT
  * to handle the ack-every-other-packet case).
  *
  * The prior minimum of 1*hz (1 second) badly breaks throughput on any
  * networks faster than a modem that has minor (e.g. 1%) packet loss.
  *
  * NOTE(review): TCPT_RANGESET() as defined in this header adds the
  * variable tcp_rexmit_slop, not the constant TCPTV_CPU_VAR; presumably
  * tcp_rexmit_slop defaults to TCPTV_CPU_VAR -- confirm.  Likewise
  * TCPTV_MIN is hz/33 ticks, which equals "3 ticks" only for some values
  * of hz.
  */
 #define	TCPTV_MIN	( hz/33 )		/* minimum allowable value */
 #define TCPTV_CPU_VAR	( hz/5 )		/* cpu variance allowed (200ms) */
 #define	TCPTV_REXMTMAX	( 64*hz)		/* max allowable REXMT value */
 
 #define TCPTV_TWTRUNC	8			/* RTO factor to truncate TW */
 
 #define	TCP_LINGERTIME	120			/* linger at most 2 minutes */
 
 #define	TCP_MAXRXTSHIFT	12			/* maximum retransmits */
 
 #define	TCPTV_DELACK	( hz/10 )		/* 100ms timeout */
 
 /*
  * Printable timer names, compiled only when TCPTIMERS is defined.
  * NOTE(review): the order here (REXMT first, DELACK last) does not follow
  * the TT_* flag bit order defined later in this header; presumably the
  * table is indexed by a separate timer numbering -- confirm before use.
  */
 #ifdef	TCPTIMERS
 static const char *tcptimers[] =
     { "REXMT", "PERSIST", "KEEP", "2MSL", "DELACK" };
 #endif
 
 /*
  * Force a time value to be in a certain range.
  *
  * Stores (value) + tcp_rexmit_slop in (tv), then clamps (tv) to
  * [tvmin, tvmax].  Both comparisons are made after casting to u_long.
  * (tv) must be an lvalue and is evaluated several times, so it should
  * have no side effects.
  */
 #define	TCPT_RANGESET(tv, value, tvmin, tvmax) do { \
 	(tv) = (value) + tcp_rexmit_slop; \
 	if ((u_long)(tv) < (u_long)(tvmin)) \
 		(tv) = (tvmin); \
 	if ((u_long)(tv) > (u_long)(tvmax)) \
 		(tv) = (tvmax); \
 } while(0)
 
 #ifdef _KERNEL
 
 struct xtcp_timer;
 
 /*
  * One callout per TCP timer plus the shared bookkeeping used by
  * tcp_timer_activate() and tcp_timer_stop() (presumably the object that
  * tp->t_timers points to -- confirm in the tcpcb definition).
  */
 struct tcp_timer {
 	struct	callout tt_rexmt;	/* retransmit timer */
 	struct	callout tt_persist;	/* retransmit persistence */
 	struct	callout tt_keep;	/* keepalive */
 	struct	callout tt_2msl;	/* 2*msl TIME_WAIT timer */
 	struct	callout tt_delack;	/* delayed ACK timer */
 	uint32_t	tt_flags;	/* Timers flags */
 	uint32_t	tt_draincnt;	/* Count being drained */
 };
 
 /*
  * Flags for the tt_flags field.
  *
  * TT_<timer>      Identifies one timer.  tcp_timer_activate() sets the bit
  *                 when it first arms the callout and clears it again when
  *                 a stop request succeeds.
  * TT_MASK         All five timer bits.
  * TT_<timer>_RST  Companion of each timer bit (same bit shifted left by
  *                 8).  Set together with the timer bit on arming; cleared
  *                 by tcp_timer_activate() when re-arming an already
  *                 flagged timer and callout_reset() returns 0.
  * TT_STOPPED      Set by tcp_timer_stop(); once set,
  *                 tcp_timer_activate() returns without touching a timer.
  */
 #define TT_DELACK	0x0001
 #define TT_REXMT	0x0002
 #define TT_PERSIST	0x0004
 #define TT_KEEP		0x0008
 #define TT_2MSL		0x0010
 #define TT_MASK		(TT_DELACK|TT_REXMT|TT_PERSIST|TT_KEEP|TT_2MSL)
 
 #define TT_DELACK_RST	0x0100
 #define TT_REXMT_RST	0x0200
 #define TT_PERSIST_RST	0x0400
 #define TT_KEEP_RST	0x0800
 #define TT_2MSL_RST	0x1000
 
 #define TT_STOPPED	0x00010000
 
 /*
  * Per-connection keepalive parameters.  Each macro yields the value
  * stored in the tcpcb when it is non-zero and falls back to the
  * corresponding global tcp_keep* variable otherwise.  TP_MAXIDLE is the
  * product of the effective probe count and probe interval.  (tp) is
  * evaluated more than once.
  */
 #define	TP_KEEPINIT(tp)	((tp)->t_keepinit ? (tp)->t_keepinit : tcp_keepinit)
 #define	TP_KEEPIDLE(tp)	((tp)->t_keepidle ? (tp)->t_keepidle : tcp_keepidle)
 #define	TP_KEEPINTVL(tp) ((tp)->t_keepintvl ? (tp)->t_keepintvl : tcp_keepintvl)
 #define	TP_KEEPCNT(tp)	((tp)->t_keepcnt ? (tp)->t_keepcnt : tcp_keepcnt)
 #define	TP_MAXIDLE(tp)	(TP_KEEPCNT(tp) * TP_KEEPINTVL(tp))
 
 extern int tcp_persmin;			/* minimum persist interval */
 extern int tcp_persmax;			/* maximum persist interval */
 extern int tcp_keepinit;		/* time to establish connection */
 extern int tcp_keepidle;		/* time before keepalive probes begin */
 extern int tcp_keepintvl;		/* time between keepalive probes */
 extern int tcp_keepcnt;			/* number of keepalives */
 extern int tcp_delacktime;		/* time before sending a delayed ACK */
 extern int tcp_maxpersistidle;
 extern int tcp_rexmit_min;
 extern int tcp_rexmit_slop;
 extern int tcp_msl;
 extern int tcp_ttl;			/* time to live for TCP segs */
 extern int tcp_backoff[];
 extern int tcp_syn_backoff[];
 
+extern int tcp_always_keepalive;
 extern int tcp_finwait2_timeout;
 extern int tcp_fast_finwait2_recycle;
 
 void	tcp_timer_init(void);
 void	tcp_timer_2msl(void *xtp);
 void	tcp_timer_discard(void *);
 struct tcptw *
 	tcp_tw_2msl_scan(int reuse);	/* XXX temporary? */
 void	tcp_timer_keep(void *xtp);
 void	tcp_timer_persist(void *xtp);
 void	tcp_timer_rexmt(void *xtp);
 void	tcp_timer_delack(void *xtp);
 void	tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer,
 	struct xtcp_timer *xtimer);
 
 #endif /* _KERNEL */
 
 #endif /* !_NETINET_TCP_TIMER_H_ */
Index: stable/11
===================================================================
--- stable/11	(revision 330302)
+++ stable/11	(revision 330303)

Property changes on: stable/11
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head:r328608